From b9975d0cd1525443524c219f3d5385d21aee14d4 Mon Sep 17 00:00:00 2001
From: Vladimir Sokolovsky
Date: Fri, 8 Jun 2018 16:37:22 -0500
Subject: [PATCH] Initial commit. Based on v4.17

Signed-off-by: Vladimir Sokolovsky
---
 .gitignore | 134 + block/blk-mq-rdma.c | 52 + drivers/infiniband/Kconfig | 111 + drivers/infiniband/Makefile | 4 + drivers/infiniband/core/Makefile | 39 + drivers/infiniband/core/addr.c | 843 + drivers/infiniband/core/agent.c | 221 + drivers/infiniband/core/agent.h | 51 + drivers/infiniband/core/cache.c | 1311 ++ drivers/infiniband/core/cgroup.c | 62 + drivers/infiniband/core/cm.c | 4507 +++++ drivers/infiniband/core/cm_msgs.h | 836 + drivers/infiniband/core/cma.c | 4612 +++++ drivers/infiniband/core/cma_configfs.c | 350 + drivers/infiniband/core/cma_priv.h | 97 + drivers/infiniband/core/core_priv.h | 347 + drivers/infiniband/core/cq.c | 227 + drivers/infiniband/core/device.c | 1268 ++ drivers/infiniband/core/fmr_pool.c | 507 + drivers/infiniband/core/iwcm.c | 1203 ++ drivers/infiniband/core/iwcm.h | 62 + drivers/infiniband/core/iwpm_msg.c | 750 + drivers/infiniband/core/iwpm_util.c | 754 + drivers/infiniband/core/iwpm_util.h | 269 + drivers/infiniband/core/mad.c | 3350 ++++ drivers/infiniband/core/mad_priv.h | 225 + drivers/infiniband/core/mad_rmpp.c | 968 + drivers/infiniband/core/mad_rmpp.h | 58 + drivers/infiniband/core/mr_pool.c | 86 + drivers/infiniband/core/multicast.c | 898 + drivers/infiniband/core/netlink.c | 311 + drivers/infiniband/core/nldev.c | 1029 + drivers/infiniband/core/opa_smi.h | 78 + drivers/infiniband/core/packer.c | 201 + drivers/infiniband/core/rdma_core.c | 819 + drivers/infiniband/core/rdma_core.h | 120 + drivers/infiniband/core/restrack.c | 222 + drivers/infiniband/core/roce_gid_mgmt.c | 783 + drivers/infiniband/core/rw.c | 745 + drivers/infiniband/core/sa.h | 66 + drivers/infiniband/core/sa_query.c | 2530 +++ drivers/infiniband/core/security.c | 755 + drivers/infiniband/core/smi.c | 338 + drivers/infiniband/core/smi.h | 90 + drivers/infiniband/core/sysfs.c | 1380 ++ drivers/infiniband/core/ucm.c | 1359 ++ drivers/infiniband/core/ucma.c | 1832 ++ drivers/infiniband/core/ud_header.c | 547 + drivers/infiniband/core/umem.c | 360 + drivers/infiniband/core/umem_odp.c | 828 + drivers/infiniband/core/user_mad.c | 1405 ++ drivers/infiniband/core/uverbs.h | 352 + drivers/infiniband/core/uverbs_cmd.c | 4066 ++++ drivers/infiniband/core/uverbs_ioctl.c | 422 + drivers/infiniband/core/uverbs_ioctl_merge.c | 665 + drivers/infiniband/core/uverbs_main.c | 1282 ++ drivers/infiniband/core/uverbs_marshall.c | 217 + drivers/infiniband/core/uverbs_std_types.c | 311 + drivers/infiniband/core/uverbs_std_types_cq.c | 210 + drivers/infiniband/core/uverbs_std_types_dm.c | 108 + .../core/uverbs_std_types_flow_action.c | 435 + drivers/infiniband/core/uverbs_std_types_mr.c | 147 + drivers/infiniband/core/verbs.c | 2344 +++ drivers/infiniband/hw/Makefile | 16 + drivers/infiniband/hw/bnxt_re/Kconfig | 10 + drivers/infiniband/hw/bnxt_re/Makefile | 7 + drivers/infiniband/hw/bnxt_re/bnxt_re.h | 189 + drivers/infiniband/hw/bnxt_re/hw_counters.c | 239 + drivers/infiniband/hw/bnxt_re/hw_counters.h | 101 + drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3803 ++++ drivers/infiniband/hw/bnxt_re/ib_verbs.h | 226 + drivers/infiniband/hw/bnxt_re/main.c | 1659 ++ drivers/infiniband/hw/bnxt_re/qplib_fp.c | 2842 +++ drivers/infiniband/hw/bnxt_re/qplib_fp.h | 529 + drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 767 + drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 222 + 
drivers/infiniband/hw/bnxt_re/qplib_res.c | 830 + drivers/infiniband/hw/bnxt_re/qplib_res.h | 229 + drivers/infiniband/hw/bnxt_re/qplib_sp.c | 851 + drivers/infiniband/hw/bnxt_re/qplib_sp.h | 252 + drivers/infiniband/hw/bnxt_re/roce_hsi.h | 2969 +++ drivers/infiniband/hw/cxgb3/Kconfig | 18 + drivers/infiniband/hw/cxgb3/Makefile | 7 + drivers/infiniband/hw/cxgb3/cxio_hal.c | 1332 ++ drivers/infiniband/hw/cxgb3/cxio_hal.h | 205 + drivers/infiniband/hw/cxgb3/cxio_resource.c | 344 + drivers/infiniband/hw/cxgb3/cxio_resource.h | 69 + drivers/infiniband/hw/cxgb3/cxio_wr.h | 802 + drivers/infiniband/hw/cxgb3/iwch.c | 290 + drivers/infiniband/hw/cxgb3/iwch.h | 180 + drivers/infiniband/hw/cxgb3/iwch_cm.c | 2258 +++ drivers/infiniband/hw/cxgb3/iwch_cm.h | 233 + drivers/infiniband/hw/cxgb3/iwch_cq.c | 222 + drivers/infiniband/hw/cxgb3/iwch_ev.c | 232 + drivers/infiniband/hw/cxgb3/iwch_mem.c | 101 + drivers/infiniband/hw/cxgb3/iwch_provider.c | 1475 ++ drivers/infiniband/hw/cxgb3/iwch_provider.h | 347 + drivers/infiniband/hw/cxgb3/iwch_qp.c | 1082 ++ drivers/infiniband/hw/cxgb3/tcb.h | 632 + drivers/infiniband/hw/cxgb4/Kconfig | 19 + drivers/infiniband/hw/cxgb4/Makefile | 6 + drivers/infiniband/hw/cxgb4/cm.c | 4263 +++++ drivers/infiniband/hw/cxgb4/cq.c | 1041 ++ drivers/infiniband/hw/cxgb4/device.c | 1578 ++ drivers/infiniband/hw/cxgb4/ev.c | 243 + drivers/infiniband/hw/cxgb4/id_table.c | 111 + drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 1085 ++ drivers/infiniband/hw/cxgb4/mem.c | 835 + drivers/infiniband/hw/cxgb4/provider.c | 685 + drivers/infiniband/hw/cxgb4/qp.c | 2109 +++ drivers/infiniband/hw/cxgb4/resource.c | 471 + drivers/infiniband/hw/cxgb4/t4.h | 719 + drivers/infiniband/hw/cxgb4/t4fw_ri_api.h | 768 + drivers/infiniband/hw/hfi1/Kconfig | 22 + drivers/infiniband/hw/hfi1/Makefile | 22 + drivers/infiniband/hw/hfi1/affinity.c | 777 + drivers/infiniband/hw/hfi1/affinity.h | 122 + drivers/infiniband/hw/hfi1/aspm.h | 322 + drivers/infiniband/hw/hfi1/chip.c | 15451 ++++++++++++++++ drivers/infiniband/hw/hfi1/chip.h | 1393 ++ drivers/infiniband/hw/hfi1/chip_registers.h | 1316 ++ drivers/infiniband/hw/hfi1/common.h | 396 + drivers/infiniband/hw/hfi1/debugfs.c | 1585 ++ drivers/infiniband/hw/hfi1/debugfs.h | 131 + drivers/infiniband/hw/hfi1/device.c | 183 + drivers/infiniband/hw/hfi1/device.h | 60 + drivers/infiniband/hw/hfi1/driver.c | 1749 ++ drivers/infiniband/hw/hfi1/efivar.c | 182 + drivers/infiniband/hw/hfi1/efivar.h | 57 + drivers/infiniband/hw/hfi1/eprom.c | 491 + drivers/infiniband/hw/hfi1/eprom.h | 52 + drivers/infiniband/hw/hfi1/exp_rcv.c | 114 + drivers/infiniband/hw/hfi1/exp_rcv.h | 190 + drivers/infiniband/hw/hfi1/file_ops.c | 1703 ++ drivers/infiniband/hw/hfi1/firmware.c | 2305 +++ drivers/infiniband/hw/hfi1/hfi.h | 2448 +++ drivers/infiniband/hw/hfi1/init.c | 2092 +++ drivers/infiniband/hw/hfi1/intr.c | 263 + drivers/infiniband/hw/hfi1/iowait.h | 383 + drivers/infiniband/hw/hfi1/mad.c | 4949 +++++ drivers/infiniband/hw/hfi1/mad.h | 478 + drivers/infiniband/hw/hfi1/mmu_rb.c | 354 + drivers/infiniband/hw/hfi1/mmu_rb.h | 88 + drivers/infiniband/hw/hfi1/opa_compat.h | 128 + drivers/infiniband/hw/hfi1/pcie.c | 1479 ++ drivers/infiniband/hw/hfi1/pio.c | 2092 +++ drivers/infiniband/hw/hfi1/pio.h | 330 + drivers/infiniband/hw/hfi1/pio_copy.c | 757 + drivers/infiniband/hw/hfi1/platform.c | 1077 ++ drivers/infiniband/hw/hfi1/platform.h | 412 + drivers/infiniband/hw/hfi1/qp.c | 858 + drivers/infiniband/hw/hfi1/qp.h | 133 + drivers/infiniband/hw/hfi1/qsfp.c | 858 + drivers/infiniband/hw/hfi1/qsfp.h | 
246 + drivers/infiniband/hw/hfi1/rc.c | 2519 +++ drivers/infiniband/hw/hfi1/ruc.c | 1110 ++ drivers/infiniband/hw/hfi1/sdma.c | 3426 ++++ drivers/infiniband/hw/hfi1/sdma.h | 1101 ++ drivers/infiniband/hw/hfi1/sdma_txreq.h | 135 + drivers/infiniband/hw/hfi1/sysfs.c | 887 + drivers/infiniband/hw/hfi1/trace.c | 390 + drivers/infiniband/hw/hfi1/trace.h | 64 + drivers/infiniband/hw/hfi1/trace_ctxts.h | 147 + drivers/infiniband/hw/hfi1/trace_dbg.h | 155 + drivers/infiniband/hw/hfi1/trace_ibhdrs.h | 446 + drivers/infiniband/hw/hfi1/trace_misc.h | 149 + drivers/infiniband/hw/hfi1/trace_mmu.h | 95 + drivers/infiniband/hw/hfi1/trace_rc.h | 118 + drivers/infiniband/hw/hfi1/trace_rx.h | 269 + drivers/infiniband/hw/hfi1/trace_tx.h | 853 + drivers/infiniband/hw/hfi1/uc.c | 588 + drivers/infiniband/hw/hfi1/ud.c | 1056 ++ drivers/infiniband/hw/hfi1/user_exp_rcv.c | 977 + drivers/infiniband/hw/hfi1/user_exp_rcv.h | 98 + drivers/infiniband/hw/hfi1/user_pages.c | 137 + drivers/infiniband/hw/hfi1/user_sdma.c | 1533 ++ drivers/infiniband/hw/hfi1/user_sdma.h | 264 + drivers/infiniband/hw/hfi1/verbs.c | 2042 ++ drivers/infiniband/hw/hfi1/verbs.h | 449 + drivers/infiniband/hw/hfi1/verbs_txreq.c | 141 + drivers/infiniband/hw/hfi1/verbs_txreq.h | 127 + drivers/infiniband/hw/hfi1/vnic.h | 169 + drivers/infiniband/hw/hfi1/vnic_main.c | 859 + drivers/infiniband/hw/hfi1/vnic_sdma.c | 324 + drivers/infiniband/hw/hns/Kconfig | 31 + drivers/infiniband/hw/hns/Makefile | 14 + drivers/infiniband/hw/hns/hns_roce_ah.c | 125 + drivers/infiniband/hw/hns/hns_roce_alloc.c | 249 + drivers/infiniband/hw/hns/hns_roce_cmd.c | 286 + drivers/infiniband/hw/hns/hns_roce_cmd.h | 134 + drivers/infiniband/hw/hns/hns_roce_common.h | 399 + drivers/infiniband/hw/hns/hns_roce_cq.c | 543 + drivers/infiniband/hw/hns/hns_roce_db.c | 180 + drivers/infiniband/hw/hns/hns_roce_device.h | 997 + drivers/infiniband/hw/hns/hns_roce_hem.c | 1052 ++ drivers/infiniband/hw/hns/hns_roce_hem.h | 165 + drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 4989 +++++ drivers/infiniband/hw/hns/hns_roce_hw_v1.h | 1111 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4836 +++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 1456 ++ drivers/infiniband/hw/hns/hns_roce_main.c | 852 + drivers/infiniband/hw/hns/hns_roce_mr.c | 1209 ++ drivers/infiniband/hw/hns/hns_roce_pd.c | 150 + drivers/infiniband/hw/hns/hns_roce_qp.c | 1081 ++ drivers/infiniband/hw/i40iw/Kconfig | 7 + drivers/infiniband/hw/i40iw/Makefile | 10 + drivers/infiniband/hw/i40iw/i40iw.h | 602 + drivers/infiniband/hw/i40iw/i40iw_cm.c | 4403 +++++ drivers/infiniband/hw/i40iw/i40iw_cm.h | 460 + drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 5198 ++++++ drivers/infiniband/hw/i40iw/i40iw_d.h | 1737 ++ drivers/infiniband/hw/i40iw/i40iw_hmc.c | 821 + drivers/infiniband/hw/i40iw/i40iw_hmc.h | 241 + drivers/infiniband/hw/i40iw/i40iw_hw.c | 805 + drivers/infiniband/hw/i40iw/i40iw_main.c | 2063 +++ drivers/infiniband/hw/i40iw/i40iw_osdep.h | 217 + drivers/infiniband/hw/i40iw/i40iw_p.h | 128 + drivers/infiniband/hw/i40iw/i40iw_pble.c | 612 + drivers/infiniband/hw/i40iw/i40iw_pble.h | 131 + drivers/infiniband/hw/i40iw/i40iw_puda.c | 1493 ++ drivers/infiniband/hw/i40iw/i40iw_puda.h | 188 + drivers/infiniband/hw/i40iw/i40iw_register.h | 1030 + drivers/infiniband/hw/i40iw/i40iw_status.h | 101 + drivers/infiniband/hw/i40iw/i40iw_type.h | 1363 ++ drivers/infiniband/hw/i40iw/i40iw_uk.c | 1232 ++ drivers/infiniband/hw/i40iw/i40iw_user.h | 430 + drivers/infiniband/hw/i40iw/i40iw_utils.c | 1544 ++ drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2961 
+++ drivers/infiniband/hw/i40iw/i40iw_verbs.h | 180 + drivers/infiniband/hw/i40iw/i40iw_vf.c | 85 + drivers/infiniband/hw/i40iw/i40iw_vf.h | 62 + drivers/infiniband/hw/i40iw/i40iw_virtchnl.c | 756 + drivers/infiniband/hw/i40iw/i40iw_virtchnl.h | 124 + drivers/infiniband/hw/mlx4/Kconfig | 11 + drivers/infiniband/hw/mlx4/Makefile | 3 + drivers/infiniband/hw/mlx4/ah.c | 219 + drivers/infiniband/hw/mlx4/alias_GUID.c | 905 + drivers/infiniband/hw/mlx4/cm.c | 480 + drivers/infiniband/hw/mlx4/cq.c | 980 + drivers/infiniband/hw/mlx4/doorbell.c | 96 + drivers/infiniband/hw/mlx4/mad.c | 2384 +++ drivers/infiniband/hw/mlx4/main.c | 3472 ++++ drivers/infiniband/hw/mlx4/mcg.c | 1257 ++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 925 + drivers/infiniband/hw/mlx4/mr.c | 789 + drivers/infiniband/hw/mlx4/qp.c | 4467 +++++ drivers/infiniband/hw/mlx4/srq.c | 378 + drivers/infiniband/hw/mlx4/sysfs.c | 890 + drivers/infiniband/hw/mlx5/Kconfig | 9 + drivers/infiniband/hw/mlx5/Makefile | 5 + drivers/infiniband/hw/mlx5/ah.c | 145 + drivers/infiniband/hw/mlx5/cmd.c | 172 + drivers/infiniband/hw/mlx5/cmd.h | 48 + drivers/infiniband/hw/mlx5/cong.c | 456 + drivers/infiniband/hw/mlx5/cq.c | 1464 ++ drivers/infiniband/hw/mlx5/doorbell.c | 98 + drivers/infiniband/hw/mlx5/gsi.c | 541 + drivers/infiniband/hw/mlx5/ib_rep.c | 192 + drivers/infiniband/hw/mlx5/ib_rep.h | 72 + drivers/infiniband/hw/mlx5/ib_virt.c | 203 + drivers/infiniband/hw/mlx5/mad.c | 609 + drivers/infiniband/hw/mlx5/main.c | 5821 ++++++ drivers/infiniband/hw/mlx5/mem.c | 234 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 1285 ++ drivers/infiniband/hw/mlx5/mr.c | 2000 ++ drivers/infiniband/hw/mlx5/odp.c | 1228 ++ drivers/infiniband/hw/mlx5/qp.c | 5670 ++++++ drivers/infiniband/hw/mlx5/srq.c | 510 + drivers/infiniband/hw/mthca/Kconfig | 17 + drivers/infiniband/hw/mthca/Makefile | 8 + drivers/infiniband/hw/mthca/mthca_allocator.c | 301 + drivers/infiniband/hw/mthca/mthca_av.c | 379 + drivers/infiniband/hw/mthca/mthca_catas.c | 198 + drivers/infiniband/hw/mthca/mthca_cmd.c | 1977 ++ drivers/infiniband/hw/mthca/mthca_cmd.h | 325 + .../infiniband/hw/mthca/mthca_config_reg.h | 48 + drivers/infiniband/hw/mthca/mthca_cq.c | 981 + drivers/infiniband/hw/mthca/mthca_dev.h | 597 + drivers/infiniband/hw/mthca/mthca_doorbell.h | 109 + drivers/infiniband/hw/mthca/mthca_eq.c | 905 + drivers/infiniband/hw/mthca/mthca_mad.c | 351 + drivers/infiniband/hw/mthca/mthca_main.c | 1272 ++ drivers/infiniband/hw/mthca/mthca_mcg.c | 335 + drivers/infiniband/hw/mthca/mthca_memfree.c | 758 + drivers/infiniband/hw/mthca/mthca_memfree.h | 179 + drivers/infiniband/hw/mthca/mthca_mr.c | 965 + drivers/infiniband/hw/mthca/mthca_pd.c | 81 + drivers/infiniband/hw/mthca/mthca_profile.c | 281 + drivers/infiniband/hw/mthca/mthca_profile.h | 59 + drivers/infiniband/hw/mthca/mthca_provider.c | 1321 ++ drivers/infiniband/hw/mthca/mthca_provider.h | 344 + drivers/infiniband/hw/mthca/mthca_qp.c | 2323 +++ drivers/infiniband/hw/mthca/mthca_reset.c | 282 + drivers/infiniband/hw/mthca/mthca_srq.c | 696 + drivers/infiniband/hw/mthca/mthca_uar.c | 78 + drivers/infiniband/hw/mthca/mthca_wqe.h | 131 + drivers/infiniband/hw/nes/Kconfig | 15 + drivers/infiniband/hw/nes/Makefile | 3 + drivers/infiniband/hw/nes/nes.c | 1208 ++ drivers/infiniband/hw/nes/nes.h | 583 + drivers/infiniband/hw/nes/nes_cm.c | 3989 ++++ drivers/infiniband/hw/nes/nes_cm.h | 470 + drivers/infiniband/hw/nes/nes_context.h | 193 + drivers/infiniband/hw/nes/nes_hw.c | 3889 ++++ drivers/infiniband/hw/nes/nes_hw.h | 1380 ++ 
drivers/infiniband/hw/nes/nes_mgt.c | 1156 ++ drivers/infiniband/hw/nes/nes_mgt.h | 97 + drivers/infiniband/hw/nes/nes_nic.c | 1872 ++ drivers/infiniband/hw/nes/nes_utils.c | 916 + drivers/infiniband/hw/nes/nes_verbs.c | 3905 ++++ drivers/infiniband/hw/nes/nes_verbs.h | 199 + drivers/infiniband/hw/ocrdma/Kconfig | 8 + drivers/infiniband/hw/ocrdma/Makefile | 5 + drivers/infiniband/hw/ocrdma/ocrdma.h | 609 + drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 302 + drivers/infiniband/hw/ocrdma/ocrdma_ah.h | 68 + drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 3254 ++++ drivers/infiniband/hw/ocrdma/ocrdma_hw.h | 159 + drivers/infiniband/hw/ocrdma/ocrdma_main.c | 478 + drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 2240 +++ drivers/infiniband/hw/ocrdma/ocrdma_stats.c | 862 + drivers/infiniband/hw/ocrdma/ocrdma_stats.h | 75 + drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 3052 +++ drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 116 + drivers/infiniband/hw/qedr/Kconfig | 10 + drivers/infiniband/hw/qedr/Makefile | 3 + drivers/infiniband/hw/qedr/main.c | 991 + drivers/infiniband/hw/qedr/qedr.h | 541 + drivers/infiniband/hw/qedr/qedr_hsi_rdma.h | 769 + drivers/infiniband/hw/qedr/qedr_iw_cm.c | 752 + drivers/infiniband/hw/qedr/qedr_iw_cm.h | 49 + drivers/infiniband/hw/qedr/qedr_roce_cm.c | 741 + drivers/infiniband/hw/qedr/qedr_roce_cm.h | 60 + drivers/infiniband/hw/qedr/verbs.c | 3792 ++++ drivers/infiniband/hw/qedr/verbs.h | 98 + drivers/infiniband/hw/qib/Kconfig | 16 + drivers/infiniband/hw/qib/Makefile | 17 + drivers/infiniband/hw/qib/qib.h | 1525 ++ drivers/infiniband/hw/qib/qib_6120_regs.h | 977 + drivers/infiniband/hw/qib/qib_7220.h | 149 + drivers/infiniband/hw/qib/qib_7220_regs.h | 1496 ++ drivers/infiniband/hw/qib/qib_7322_regs.h | 3163 ++++ drivers/infiniband/hw/qib/qib_common.h | 805 + drivers/infiniband/hw/qib/qib_debugfs.c | 285 + drivers/infiniband/hw/qib/qib_debugfs.h | 45 + drivers/infiniband/hw/qib/qib_diag.c | 906 + drivers/infiniband/hw/qib/qib_driver.c | 807 + drivers/infiniband/hw/qib/qib_eeprom.c | 271 + drivers/infiniband/hw/qib/qib_file_ops.c | 2401 +++ drivers/infiniband/hw/qib/qib_fs.c | 605 + drivers/infiniband/hw/qib/qib_iba6120.c | 3541 ++++ drivers/infiniband/hw/qib/qib_iba7220.c | 4600 +++++ drivers/infiniband/hw/qib/qib_iba7322.c | 8506 +++++++++ drivers/infiniband/hw/qib/qib_init.c | 1817 ++ drivers/infiniband/hw/qib/qib_intr.c | 240 + drivers/infiniband/hw/qib/qib_mad.c | 2498 +++ drivers/infiniband/hw/qib/qib_mad.h | 300 + drivers/infiniband/hw/qib/qib_pcie.c | 614 + drivers/infiniband/hw/qib/qib_pio_copy.c | 64 + drivers/infiniband/hw/qib/qib_qp.c | 457 + drivers/infiniband/hw/qib/qib_qsfp.c | 549 + drivers/infiniband/hw/qib/qib_qsfp.h | 188 + drivers/infiniband/hw/qib/qib_rc.c | 2156 +++ drivers/infiniband/hw/qib/qib_ruc.c | 804 + drivers/infiniband/hw/qib/qib_sd7220.c | 1450 ++ drivers/infiniband/hw/qib/qib_sdma.c | 1020 + drivers/infiniband/hw/qib/qib_sysfs.c | 871 + drivers/infiniband/hw/qib/qib_twsi.c | 501 + drivers/infiniband/hw/qib/qib_tx.c | 568 + drivers/infiniband/hw/qib/qib_uc.c | 521 + drivers/infiniband/hw/qib/qib_ud.c | 587 + drivers/infiniband/hw/qib/qib_user_pages.c | 158 + drivers/infiniband/hw/qib/qib_user_sdma.c | 1465 ++ drivers/infiniband/hw/qib/qib_user_sdma.h | 52 + drivers/infiniband/hw/qib/qib_verbs.c | 1743 ++ drivers/infiniband/hw/qib/qib_verbs.h | 414 + drivers/infiniband/hw/qib/qib_wc_ppc64.c | 62 + drivers/infiniband/hw/qib/qib_wc_x86_64.c | 150 + drivers/infiniband/hw/usnic/Kconfig | 10 + drivers/infiniband/hw/usnic/Makefile | 16 + 
drivers/infiniband/hw/usnic/usnic.h | 44 + drivers/infiniband/hw/usnic/usnic_abi.h | 88 + .../hw/usnic/usnic_common_pkt_hdr.h | 41 + .../infiniband/hw/usnic/usnic_common_util.h | 49 + drivers/infiniband/hw/usnic/usnic_debugfs.c | 170 + drivers/infiniband/hw/usnic/usnic_debugfs.h | 44 + drivers/infiniband/hw/usnic/usnic_fwd.c | 357 + drivers/infiniband/hw/usnic/usnic_fwd.h | 129 + drivers/infiniband/hw/usnic/usnic_ib.h | 133 + drivers/infiniband/hw/usnic/usnic_ib_main.c | 715 + drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c | 766 + drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h | 109 + drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 340 + drivers/infiniband/hw/usnic/usnic_ib_sysfs.h | 44 + drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 810 + drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 92 + drivers/infiniband/hw/usnic/usnic_log.h | 73 + drivers/infiniband/hw/usnic/usnic_transport.c | 214 + drivers/infiniband/hw/usnic/usnic_transport.h | 66 + drivers/infiniband/hw/usnic/usnic_uiom.c | 602 + drivers/infiniband/hw/usnic/usnic_uiom.h | 95 + .../hw/usnic/usnic_uiom_interval_tree.c | 270 + .../hw/usnic/usnic_uiom_interval_tree.h | 88 + drivers/infiniband/hw/usnic/usnic_vnic.c | 476 + drivers/infiniband/hw/usnic/usnic_vnic.h | 118 + drivers/infiniband/hw/vmw_pvrdma/Kconfig | 7 + drivers/infiniband/hw/vmw_pvrdma/Makefile | 3 + drivers/infiniband/hw/vmw_pvrdma/pvrdma.h | 523 + drivers/infiniband/hw/vmw_pvrdma/pvrdma_cmd.c | 119 + drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 443 + .../infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h | 667 + .../hw/vmw_pvrdma/pvrdma_doorbell.c | 127 + .../infiniband/hw/vmw_pvrdma/pvrdma_main.c | 1156 ++ .../infiniband/hw/vmw_pvrdma/pvrdma_misc.c | 312 + drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 327 + drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 1003 + .../infiniband/hw/vmw_pvrdma/pvrdma_ring.h | 114 + drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c | 322 + .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 593 + .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 454 + drivers/infiniband/sw/Makefile | 2 + drivers/infiniband/sw/rdmavt/Kconfig | 7 + drivers/infiniband/sw/rdmavt/Makefile | 13 + drivers/infiniband/sw/rdmavt/ah.c | 189 + drivers/infiniband/sw/rdmavt/ah.h | 59 + drivers/infiniband/sw/rdmavt/cq.c | 550 + drivers/infiniband/sw/rdmavt/cq.h | 64 + drivers/infiniband/sw/rdmavt/mad.c | 171 + drivers/infiniband/sw/rdmavt/mad.h | 60 + drivers/infiniband/sw/rdmavt/mcast.c | 443 + drivers/infiniband/sw/rdmavt/mcast.h | 58 + drivers/infiniband/sw/rdmavt/mmap.c | 208 + drivers/infiniband/sw/rdmavt/mmap.h | 63 + drivers/infiniband/sw/rdmavt/mr.c | 1114 ++ drivers/infiniband/sw/rdmavt/mr.h | 94 + drivers/infiniband/sw/rdmavt/pd.c | 119 + drivers/infiniband/sw/rdmavt/pd.h | 58 + drivers/infiniband/sw/rdmavt/qp.c | 2316 +++ drivers/infiniband/sw/rdmavt/qp.h | 69 + drivers/infiniband/sw/rdmavt/rc.c | 189 + drivers/infiniband/sw/rdmavt/srq.c | 355 + drivers/infiniband/sw/rdmavt/srq.h | 62 + drivers/infiniband/sw/rdmavt/trace.c | 49 + drivers/infiniband/sw/rdmavt/trace.h | 56 + drivers/infiniband/sw/rdmavt/trace_cq.h | 127 + drivers/infiniband/sw/rdmavt/trace_mr.h | 174 + drivers/infiniband/sw/rdmavt/trace_qp.h | 138 + drivers/infiniband/sw/rdmavt/trace_rc.h | 109 + drivers/infiniband/sw/rdmavt/trace_rvt.h | 81 + drivers/infiniband/sw/rdmavt/trace_tx.h | 163 + drivers/infiniband/sw/rdmavt/vt.c | 899 + drivers/infiniband/sw/rdmavt/vt.h | 102 + drivers/infiniband/sw/rxe/Kconfig | 26 + drivers/infiniband/sw/rxe/Makefile | 25 + drivers/infiniband/sw/rxe/rxe.c | 378 + 
drivers/infiniband/sw/rxe/rxe.h | 113 + drivers/infiniband/sw/rxe/rxe_av.c | 92 + drivers/infiniband/sw/rxe/rxe_comp.c | 771 + drivers/infiniband/sw/rxe/rxe_cq.c | 185 + drivers/infiniband/sw/rxe/rxe_hdr.h | 960 + drivers/infiniband/sw/rxe/rxe_hw_counters.c | 78 + drivers/infiniband/sw/rxe/rxe_hw_counters.h | 61 + drivers/infiniband/sw/rxe/rxe_icrc.c | 96 + drivers/infiniband/sw/rxe/rxe_loc.h | 298 + drivers/infiniband/sw/rxe/rxe_mcast.c | 190 + drivers/infiniband/sw/rxe/rxe_mmap.c | 173 + drivers/infiniband/sw/rxe/rxe_mr.c | 648 + drivers/infiniband/sw/rxe/rxe_net.c | 770 + drivers/infiniband/sw/rxe/rxe_net.h | 54 + drivers/infiniband/sw/rxe/rxe_opcode.c | 961 + drivers/infiniband/sw/rxe/rxe_opcode.h | 129 + drivers/infiniband/sw/rxe/rxe_param.h | 171 + drivers/infiniband/sw/rxe/rxe_pool.c | 499 + drivers/infiniband/sw/rxe/rxe_pool.h | 165 + drivers/infiniband/sw/rxe/rxe_qp.c | 859 + drivers/infiniband/sw/rxe/rxe_queue.c | 212 + drivers/infiniband/sw/rxe/rxe_queue.h | 179 + drivers/infiniband/sw/rxe/rxe_recv.c | 414 + drivers/infiniband/sw/rxe/rxe_req.c | 751 + drivers/infiniband/sw/rxe/rxe_resp.c | 1405 ++ drivers/infiniband/sw/rxe/rxe_srq.c | 179 + drivers/infiniband/sw/rxe/rxe_sysfs.c | 157 + drivers/infiniband/sw/rxe/rxe_task.c | 173 + drivers/infiniband/sw/rxe/rxe_task.h | 96 + drivers/infiniband/sw/rxe/rxe_verbs.c | 1328 ++ drivers/infiniband/sw/rxe/rxe_verbs.h | 470 + drivers/infiniband/ulp/Makefile | 7 + drivers/infiniband/ulp/ipoib/Kconfig | 49 + drivers/infiniband/ulp/ipoib/Makefile | 13 + drivers/infiniband/ulp/ipoib/ipoib.h | 838 + drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1663 ++ drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 230 + drivers/infiniband/ulp/ipoib/ipoib_fs.c | 298 + drivers/infiniband/ulp/ipoib/ipoib_ib.c | 1317 ++ drivers/infiniband/ulp/ipoib/ipoib_main.c | 2524 +++ .../infiniband/ulp/ipoib/ipoib_multicast.c | 1067 ++ drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 185 + drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 295 + drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 247 + drivers/infiniband/ulp/iser/Kconfig | 12 + drivers/infiniband/ulp/iser/Makefile | 4 + drivers/infiniband/ulp/iser/iscsi_iser.c | 1133 ++ drivers/infiniband/ulp/iser/iscsi_iser.h | 718 + drivers/infiniband/ulp/iser/iser_initiator.c | 775 + drivers/infiniband/ulp/iser/iser_memory.c | 578 + drivers/infiniband/ulp/iser/iser_verbs.c | 1160 ++ drivers/infiniband/ulp/isert/Kconfig | 5 + drivers/infiniband/ulp/isert/Makefile | 2 + drivers/infiniband/ulp/isert/ib_isert.c | 2725 +++ drivers/infiniband/ulp/isert/ib_isert.h | 201 + drivers/infiniband/ulp/opa_vnic/Kconfig | 8 + drivers/infiniband/ulp/opa_vnic/Makefile | 7 + .../infiniband/ulp/opa_vnic/opa_vnic_encap.c | 513 + .../infiniband/ulp/opa_vnic/opa_vnic_encap.h | 501 + .../ulp/opa_vnic/opa_vnic_ethtool.c | 187 + .../ulp/opa_vnic/opa_vnic_internal.h | 330 + .../infiniband/ulp/opa_vnic/opa_vnic_netdev.c | 403 + .../infiniband/ulp/opa_vnic/opa_vnic_vema.c | 1076 ++ .../ulp/opa_vnic/opa_vnic_vema_iface.c | 390 + drivers/infiniband/ulp/srp/Kbuild | 1 + drivers/infiniband/ulp/srp/Kconfig | 12 + drivers/infiniband/ulp/srp/ib_srp.c | 4263 +++++ drivers/infiniband/ulp/srp/ib_srp.h | 337 + drivers/infiniband/ulp/srpt/Kconfig | 12 + drivers/infiniband/ulp/srpt/Makefile | 2 + drivers/infiniband/ulp/srpt/ib_dm_mad.h | 139 + drivers/infiniband/ulp/srpt/ib_srpt.c | 3762 ++++ drivers/infiniband/ulp/srpt/ib_srpt.h | 418 + drivers/net/ethernet/broadcom/Kconfig | 235 + drivers/net/ethernet/broadcom/Makefile | 18 + drivers/net/ethernet/broadcom/bnxt/Makefile | 4 + 
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9092 +++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1482 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 633 + drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h | 44 + .../net/ethernet/broadcom/bnxt/bnxt_devlink.c | 65 + .../net/ethernet/broadcom/bnxt/bnxt_devlink.h | 39 + drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c | 32 + .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2730 +++ .../net/ethernet/broadcom/bnxt/bnxt_ethtool.h | 48 + .../net/ethernet/broadcom/bnxt/bnxt_fw_hdr.h | 119 + drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 6406 +++++++ .../ethernet/broadcom/bnxt/bnxt_nvm_defs.h | 72 + .../net/ethernet/broadcom/bnxt/bnxt_sriov.c | 1131 ++ .../net/ethernet/broadcom/bnxt/bnxt_sriov.h | 26 + drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 1677 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_tc.h | 230 + drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 487 + drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 103 + drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c | 564 + drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h | 64 + drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 231 + drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 21 + drivers/net/ethernet/chelsio/Kconfig | 135 + drivers/net/ethernet/chelsio/cxgb3/Makefile | 8 + drivers/net/ethernet/chelsio/cxgb3/adapter.h | 333 + drivers/net/ethernet/chelsio/cxgb3/ael1002.c | 941 + drivers/net/ethernet/chelsio/cxgb3/aq100x.c | 354 + drivers/net/ethernet/chelsio/cxgb3/common.h | 773 + .../ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h | 189 + .../net/ethernet/chelsio/cxgb3/cxgb3_defs.h | 111 + .../net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h | 177 + .../net/ethernet/chelsio/cxgb3/cxgb3_main.c | 3451 ++++ .../ethernet/chelsio/cxgb3/cxgb3_offload.c | 1403 ++ .../ethernet/chelsio/cxgb3/cxgb3_offload.h | 209 + .../ethernet/chelsio/cxgb3/firmware_exports.h | 177 + drivers/net/ethernet/chelsio/cxgb3/l2t.c | 464 + drivers/net/ethernet/chelsio/cxgb3/l2t.h | 148 + drivers/net/ethernet/chelsio/cxgb3/mc5.c | 422 + drivers/net/ethernet/chelsio/cxgb3/regs.h | 2564 +++ drivers/net/ethernet/chelsio/cxgb3/sge.c | 3371 ++++ drivers/net/ethernet/chelsio/cxgb3/sge_defs.h | 256 + drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h | 1495 ++ drivers/net/ethernet/chelsio/cxgb3/t3_hw.c | 3811 ++++ drivers/net/ethernet/chelsio/cxgb3/t3cdev.h | 70 + drivers/net/ethernet/chelsio/cxgb3/version.h | 44 + drivers/net/ethernet/chelsio/cxgb3/vsc8211.c | 416 + drivers/net/ethernet/chelsio/cxgb3/xgmac.c | 657 + drivers/net/ethernet/chelsio/cxgb4/Makefile | 14 + drivers/net/ethernet/chelsio/cxgb4/clip_tbl.c | 332 + drivers/net/ethernet/chelsio/cxgb4/clip_tbl.h | 45 + .../net/ethernet/chelsio/cxgb4/cudbg_common.c | 68 + .../net/ethernet/chelsio/cxgb4/cudbg_entity.h | 452 + drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h | 100 + .../net/ethernet/chelsio/cxgb4/cudbg_lib.c | 2774 +++ .../net/ethernet/chelsio/cxgb4/cudbg_lib.h | 185 + .../ethernet/chelsio/cxgb4/cudbg_lib_common.h | 89 + .../net/ethernet/chelsio/cxgb4/cudbg_zlib.c | 82 + .../net/ethernet/chelsio/cxgb4/cudbg_zlib.h | 43 + drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1790 ++ .../net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 490 + .../net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h | 48 + .../net/ethernet/chelsio/cxgb4/cxgb4_dcb.c | 1266 ++ .../net/ethernet/chelsio/cxgb4/cxgb4_dcb.h | 163 + .../ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 3061 +++ .../ethernet/chelsio/cxgb4/cxgb4_debugfs.h | 83 + .../ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 1547 ++ .../net/ethernet/chelsio/cxgb4/cxgb4_fcoe.c | 122 + 
.../net/ethernet/chelsio/cxgb4/cxgb4_fcoe.h | 57 + .../net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 1757 ++ .../net/ethernet/chelsio/cxgb4/cxgb4_filter.h | 56 + .../net/ethernet/chelsio/cxgb4/cxgb4_main.c | 6003 ++++++ .../net/ethernet/chelsio/cxgb4/cxgb4_ptp.c | 476 + .../net/ethernet/chelsio/cxgb4/cxgb4_ptp.h | 74 + .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 876 + .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.h | 120 + .../net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c | 480 + .../net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h | 52 + .../chelsio/cxgb4/cxgb4_tc_u32_parse.h | 294 + .../net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 812 + .../net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 424 + drivers/net/ethernet/chelsio/cxgb4/l2t.c | 759 + drivers/net/ethernet/chelsio/cxgb4/l2t.h | 127 + drivers/net/ethernet/chelsio/cxgb4/sched.c | 553 + drivers/net/ethernet/chelsio/cxgb4/sched.h | 110 + drivers/net/ethernet/chelsio/cxgb4/sge.c | 3715 ++++ drivers/net/ethernet/chelsio/cxgb4/smt.c | 247 + drivers/net/ethernet/chelsio/cxgb4/smt.h | 76 + drivers/net/ethernet/chelsio/cxgb4/srq.c | 138 + drivers/net/ethernet/chelsio/cxgb4/srq.h | 65 + .../net/ethernet/chelsio/cxgb4/t4_chip_type.h | 85 + drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 9998 ++++++++++ drivers/net/ethernet/chelsio/cxgb4/t4_hw.h | 300 + drivers/net/ethernet/chelsio/cxgb4/t4_msg.h | 2304 +++ .../ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 213 + drivers/net/ethernet/chelsio/cxgb4/t4_regs.h | 3315 ++++ drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h | 69 + .../net/ethernet/chelsio/cxgb4/t4_values.h | 156 + drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 3999 ++++ .../net/ethernet/chelsio/cxgb4/t4fw_version.h | 64 + drivers/net/ethernet/emulex/Kconfig | 21 + drivers/net/ethernet/emulex/Makefile | 5 + drivers/net/ethernet/emulex/benet/Kconfig | 15 + drivers/net/ethernet/emulex/benet/Makefile | 7 + drivers/net/ethernet/emulex/benet/be.h | 1002 + drivers/net/ethernet/emulex/benet/be_cmds.c | 5085 +++++ drivers/net/ethernet/emulex/benet/be_cmds.h | 2515 +++ .../net/ethernet/emulex/benet/be_ethtool.c | 1433 ++ drivers/net/ethernet/emulex/benet/be_hw.h | 375 + drivers/net/ethernet/emulex/benet/be_main.c | 6211 +++++++ drivers/net/ethernet/emulex/benet/be_roce.c | 161 + drivers/net/ethernet/emulex/benet/be_roce.h | 77 + drivers/net/ethernet/intel/Kconfig | 283 + drivers/net/ethernet/intel/i40e/Makefile | 51 + drivers/net/ethernet/intel/i40e/i40e.h | 1120 ++ drivers/net/ethernet/intel/i40e/i40e_adminq.c | 1059 ++ drivers/net/ethernet/intel/i40e/i40e_adminq.h | 160 + .../net/ethernet/intel/i40e/i40e_adminq_cmd.h | 2897 +++ drivers/net/ethernet/intel/i40e/i40e_alloc.h | 59 + drivers/net/ethernet/intel/i40e/i40e_client.c | 839 + drivers/net/ethernet/intel/i40e/i40e_client.h | 227 + drivers/net/ethernet/intel/i40e/i40e_common.c | 5717 ++++++ drivers/net/ethernet/intel/i40e/i40e_dcb.c | 975 + drivers/net/ethernet/intel/i40e/i40e_dcb.h | 152 + drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c | 321 + .../net/ethernet/intel/i40e/i40e_debugfs.c | 1838 ++ drivers/net/ethernet/intel/i40e/i40e_devids.h | 56 + drivers/net/ethernet/intel/i40e/i40e_diag.c | 154 + drivers/net/ethernet/intel/i40e/i40e_diag.h | 52 + .../net/ethernet/intel/i40e/i40e_ethtool.c | 4717 +++++ drivers/net/ethernet/intel/i40e/i40e_hmc.c | 358 + drivers/net/ethernet/intel/i40e/i40e_hmc.h | 239 + .../net/ethernet/intel/i40e/i40e_lan_hmc.c | 1143 ++ .../net/ethernet/intel/i40e/i40e_lan_hmc.h | 182 + drivers/net/ethernet/intel/i40e/i40e_main.c | 14484 +++++++++++++++ drivers/net/ethernet/intel/i40e/i40e_nvm.c | 1590 ++ 
drivers/net/ethernet/intel/i40e/i40e_osdep.h | 82 + .../net/ethernet/intel/i40e/i40e_prototype.h | 460 + drivers/net/ethernet/intel/i40e/i40e_ptp.c | 819 + .../net/ethernet/intel/i40e/i40e_register.h | 5354 ++++++ drivers/net/ethernet/intel/i40e/i40e_status.h | 102 + drivers/net/ethernet/intel/i40e/i40e_trace.h | 230 + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3715 ++++ drivers/net/ethernet/intel/i40e/i40e_txrx.h | 601 + drivers/net/ethernet/intel/i40e/i40e_type.h | 1595 ++ .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 4284 +++++ .../ethernet/intel/i40e/i40e_virtchnl_pf.h | 164 + drivers/net/ethernet/mellanox/Kconfig | 25 + drivers/net/ethernet/mellanox/Makefile | 8 + drivers/net/ethernet/mellanox/mlx4/Kconfig | 48 + drivers/net/ethernet/mellanox/mlx4/Makefile | 12 + drivers/net/ethernet/mellanox/mlx4/alloc.c | 819 + drivers/net/ethernet/mellanox/mlx4/catas.c | 332 + drivers/net/ethernet/mellanox/mlx4/cmd.c | 3431 ++++ drivers/net/ethernet/mellanox/mlx4/cq.c | 421 + drivers/net/ethernet/mellanox/mlx4/en_clock.c | 303 + drivers/net/ethernet/mellanox/mlx4/en_cq.c | 218 + .../net/ethernet/mellanox/mlx4/en_dcb_nl.c | 753 + .../net/ethernet/mellanox/mlx4/en_ethtool.c | 2160 +++ drivers/net/ethernet/mellanox/mlx4/en_main.c | 394 + .../net/ethernet/mellanox/mlx4/en_netdev.c | 3677 ++++ drivers/net/ethernet/mellanox/mlx4/en_port.c | 432 + drivers/net/ethernet/mellanox/mlx4/en_port.h | 586 + .../net/ethernet/mellanox/mlx4/en_resources.c | 115 + drivers/net/ethernet/mellanox/mlx4/en_rx.c | 1264 ++ .../net/ethernet/mellanox/mlx4/en_selftest.c | 204 + drivers/net/ethernet/mellanox/mlx4/en_tx.c | 1184 ++ drivers/net/ethernet/mellanox/mlx4/eq.c | 1565 ++ drivers/net/ethernet/mellanox/mlx4/fw.c | 3073 +++ drivers/net/ethernet/mellanox/mlx4/fw.h | 255 + drivers/net/ethernet/mellanox/mlx4/fw_qos.c | 289 + drivers/net/ethernet/mellanox/mlx4/fw_qos.h | 145 + drivers/net/ethernet/mellanox/mlx4/icm.c | 468 + drivers/net/ethernet/mellanox/mlx4/icm.h | 128 + drivers/net/ethernet/mellanox/mlx4/intf.c | 275 + drivers/net/ethernet/mellanox/mlx4/main.c | 4204 +++++ drivers/net/ethernet/mellanox/mlx4/mcg.c | 1647 ++ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 1482 ++ drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 847 + .../net/ethernet/mellanox/mlx4/mlx4_stats.h | 132 + drivers/net/ethernet/mellanox/mlx4/mr.c | 1156 ++ drivers/net/ethernet/mellanox/mlx4/pd.c | 295 + drivers/net/ethernet/mellanox/mlx4/port.c | 2135 +++ drivers/net/ethernet/mellanox/mlx4/profile.c | 269 + drivers/net/ethernet/mellanox/mlx4/qp.c | 965 + drivers/net/ethernet/mellanox/mlx4/reset.c | 184 + .../ethernet/mellanox/mlx4/resource_tracker.c | 5414 ++++++ drivers/net/ethernet/mellanox/mlx4/sense.c | 143 + drivers/net/ethernet/mellanox/mlx4/srq.c | 309 + .../net/ethernet/mellanox/mlx5/core/Kconfig | 87 + .../net/ethernet/mellanox/mlx5/core/Makefile | 31 + .../mellanox/mlx5/core/accel/Makefile | 1 + .../ethernet/mellanox/mlx5/core/accel/ipsec.c | 109 + .../ethernet/mellanox/mlx5/core/accel/ipsec.h | 88 + .../net/ethernet/mellanox/mlx5/core/alloc.c | 311 + drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 1887 ++ drivers/net/ethernet/mellanox/mlx5/core/cq.c | 221 + .../net/ethernet/mellanox/mlx5/core/debugfs.c | 611 + drivers/net/ethernet/mellanox/mlx5/core/dev.c | 471 + .../ethernet/mellanox/mlx5/core/diag/Makefile | 1 + .../mellanox/mlx5/core/diag/fs_tracepoint.c | 264 + .../mellanox/mlx5/core/diag/fs_tracepoint.h | 284 + drivers/net/ethernet/mellanox/mlx5/core/en.h | 1110 ++ .../mellanox/mlx5/core/en_accel/Makefile | 1 + 
.../mellanox/mlx5/core/en_accel/ipsec.c | 550 + .../mellanox/mlx5/core/en_accel/ipsec.h | 164 + .../mellanox/mlx5/core/en_accel/ipsec_rxtx.c | 411 + .../mellanox/mlx5/core/en_accel/ipsec_rxtx.h | 60 + .../mellanox/mlx5/core/en_accel/ipsec_stats.c | 133 + .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 738 + .../ethernet/mellanox/mlx5/core/en_common.c | 185 + .../ethernet/mellanox/mlx5/core/en_dcbnl.c | 1095 ++ .../net/ethernet/mellanox/mlx5/core/en_dim.c | 48 + .../ethernet/mellanox/mlx5/core/en_ethtool.c | 1712 ++ .../net/ethernet/mellanox/mlx5/core/en_fs.c | 1568 ++ .../mellanox/mlx5/core/en_fs_ethtool.c | 589 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 4771 +++++ .../net/ethernet/mellanox/mlx5/core/en_rep.c | 1239 ++ .../net/ethernet/mellanox/mlx5/core/en_rep.h | 168 + .../net/ethernet/mellanox/mlx5/core/en_rx.c | 1413 ++ .../ethernet/mellanox/mlx5/core/en_selftest.c | 370 + .../ethernet/mellanox/mlx5/core/en_stats.c | 1277 ++ .../ethernet/mellanox/mlx5/core/en_stats.h | 236 + .../net/ethernet/mellanox/mlx5/core/en_tc.c | 2727 +++ .../net/ethernet/mellanox/mlx5/core/en_tc.h | 72 + .../net/ethernet/mellanox/mlx5/core/en_tx.c | 664 + .../net/ethernet/mellanox/mlx5/core/en_txrx.c | 112 + drivers/net/ethernet/mellanox/mlx5/core/eq.c | 953 + .../net/ethernet/mellanox/mlx5/core/eswitch.c | 2226 +++ .../net/ethernet/mellanox/mlx5/core/eswitch.h | 285 + .../mellanox/mlx5/core/eswitch_offloads.c | 1266 ++ .../ethernet/mellanox/mlx5/core/fpga/Makefile | 1 + .../ethernet/mellanox/mlx5/core/fpga/cmd.c | 238 + .../ethernet/mellanox/mlx5/core/fpga/cmd.h | 84 + .../ethernet/mellanox/mlx5/core/fpga/conn.c | 1043 ++ .../ethernet/mellanox/mlx5/core/fpga/conn.h | 96 + .../ethernet/mellanox/mlx5/core/fpga/core.c | 282 + .../ethernet/mellanox/mlx5/core/fpga/core.h | 113 + .../ethernet/mellanox/mlx5/core/fpga/ipsec.c | 1530 ++ .../ethernet/mellanox/mlx5/core/fpga/ipsec.h | 142 + .../ethernet/mellanox/mlx5/core/fpga/sdk.c | 170 + .../ethernet/mellanox/mlx5/core/fpga/sdk.h | 204 + .../net/ethernet/mellanox/mlx5/core/fs_cmd.c | 747 + .../net/ethernet/mellanox/mlx5/core/fs_cmd.h | 101 + .../net/ethernet/mellanox/mlx5/core/fs_core.c | 2753 +++ .../net/ethernet/mellanox/mlx5/core/fs_core.h | 287 + .../ethernet/mellanox/mlx5/core/fs_counters.c | 352 + drivers/net/ethernet/mellanox/mlx5/core/fw.c | 526 + .../net/ethernet/mellanox/mlx5/core/health.c | 393 + .../mellanox/mlx5/core/ipoib/Makefile | 1 + .../mellanox/mlx5/core/ipoib/ethtool.c | 258 + .../ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 685 + .../ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 100 + .../mellanox/mlx5/core/ipoib/ipoib_vlan.c | 357 + drivers/net/ethernet/mellanox/mlx5/core/lag.c | 691 + .../ethernet/mellanox/mlx5/core/lib/Makefile | 1 + .../ethernet/mellanox/mlx5/core/lib/clock.c | 585 + .../ethernet/mellanox/mlx5/core/lib/clock.h | 51 + .../net/ethernet/mellanox/mlx5/core/lib/gid.c | 158 + .../ethernet/mellanox/mlx5/core/lib/mlx5.h | 43 + .../ethernet/mellanox/mlx5/core/lib/mpfs.c | 201 + .../ethernet/mellanox/mlx5/core/lib/mpfs.h | 95 + drivers/net/ethernet/mellanox/mlx5/core/mad.c | 75 + .../net/ethernet/mellanox/mlx5/core/main.c | 1702 ++ drivers/net/ethernet/mellanox/mlx5/core/mcg.c | 66 + .../ethernet/mellanox/mlx5/core/mlx5_core.h | 212 + drivers/net/ethernet/mellanox/mlx5/core/mr.c | 210 + .../ethernet/mellanox/mlx5/core/pagealloc.c | 579 + drivers/net/ethernet/mellanox/mlx5/core/pd.c | 62 + .../net/ethernet/mellanox/mlx5/core/port.c | 1120 ++ drivers/net/ethernet/mellanox/mlx5/core/qp.c | 635 + 
drivers/net/ethernet/mellanox/mlx5/core/rl.c | 288 + .../net/ethernet/mellanox/mlx5/core/sriov.c | 277 + drivers/net/ethernet/mellanox/mlx5/core/srq.c | 692 + .../ethernet/mellanox/mlx5/core/transobj.c | 618 + drivers/net/ethernet/mellanox/mlx5/core/uar.c | 329 + .../net/ethernet/mellanox/mlx5/core/vport.c | 1217 ++ .../net/ethernet/mellanox/mlx5/core/vxlan.c | 190 + .../net/ethernet/mellanox/mlx5/core/vxlan.h | 63 + drivers/net/ethernet/mellanox/mlx5/core/wq.c | 238 + drivers/net/ethernet/mellanox/mlx5/core/wq.h | 203 + drivers/net/ethernet/mellanox/mlxfw/Kconfig | 13 + drivers/net/ethernet/mellanox/mlxfw/Makefile | 2 + drivers/net/ethernet/mellanox/mlxfw/mlxfw.h | 111 + .../net/ethernet/mellanox/mlxfw/mlxfw_fsm.c | 273 + .../net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c | 619 + .../net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h | 66 + .../ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h | 60 + .../mellanox/mlxfw/mlxfw_mfa2_format.h | 103 + .../ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h | 98 + .../mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c | 126 + .../mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h | 71 + drivers/net/ethernet/mellanox/mlxsw/Kconfig | 108 + drivers/net/ethernet/mellanox/mlxsw/Makefile | 27 + drivers/net/ethernet/mellanox/mlxsw/cmd.h | 1183 ++ drivers/net/ethernet/mellanox/mlxsw/core.c | 1890 ++ drivers/net/ethernet/mellanox/mlxsw/core.h | 416 + .../mellanox/mlxsw/core_acl_flex_actions.c | 1202 ++ .../mellanox/mlxsw/core_acl_flex_actions.h | 92 + .../mellanox/mlxsw/core_acl_flex_keys.c | 475 + .../mellanox/mlxsw/core_acl_flex_keys.h | 250 + .../net/ethernet/mellanox/mlxsw/core_hwmon.c | 372 + .../ethernet/mellanox/mlxsw/core_thermal.c | 442 + drivers/net/ethernet/mellanox/mlxsw/emad.h | 127 + drivers/net/ethernet/mellanox/mlxsw/i2c.c | 583 + drivers/net/ethernet/mellanox/mlxsw/i2c.h | 60 + drivers/net/ethernet/mellanox/mlxsw/ib.h | 39 + drivers/net/ethernet/mellanox/mlxsw/item.h | 541 + drivers/net/ethernet/mellanox/mlxsw/minimal.c | 97 + drivers/net/ethernet/mellanox/mlxsw/pci.c | 1807 ++ drivers/net/ethernet/mellanox/mlxsw/pci.h | 65 + drivers/net/ethernet/mellanox/mlxsw/pci_hw.h | 230 + drivers/net/ethernet/mellanox/mlxsw/port.h | 74 + drivers/net/ethernet/mellanox/mlxsw/reg.h | 7962 ++++++++ .../net/ethernet/mellanox/mlxsw/resources.h | 155 + .../net/ethernet/mellanox/mlxsw/spectrum.c | 4742 +++++ .../net/ethernet/mellanox/mlxsw/spectrum.h | 633 + .../ethernet/mellanox/mlxsw/spectrum_acl.c | 896 + .../mlxsw/spectrum_acl_flex_actions.c | 179 + .../mlxsw/spectrum_acl_flex_actions.h | 44 + .../mellanox/mlxsw/spectrum_acl_flex_keys.h | 124 + .../mellanox/mlxsw/spectrum_acl_tcam.c | 1140 ++ .../mellanox/mlxsw/spectrum_buffers.c | 1055 ++ .../ethernet/mellanox/mlxsw/spectrum_cnt.c | 207 + .../ethernet/mellanox/mlxsw/spectrum_cnt.h | 54 + .../ethernet/mellanox/mlxsw/spectrum_dcb.c | 486 + .../ethernet/mellanox/mlxsw/spectrum_dpipe.c | 1337 ++ .../ethernet/mellanox/mlxsw/spectrum_dpipe.h | 61 + .../ethernet/mellanox/mlxsw/spectrum_fid.c | 992 + .../ethernet/mellanox/mlxsw/spectrum_flower.c | 495 + .../ethernet/mellanox/mlxsw/spectrum_ipip.c | 369 + .../ethernet/mellanox/mlxsw/spectrum_ipip.h | 97 + .../ethernet/mellanox/mlxsw/spectrum_kvdl.c | 454 + .../net/ethernet/mellanox/mlxsw/spectrum_mr.c | 1080 ++ .../net/ethernet/mellanox/mlxsw/spectrum_mr.h | 135 + .../mellanox/mlxsw/spectrum_mr_tcam.c | 879 + .../mellanox/mlxsw/spectrum_mr_tcam.h | 43 + .../ethernet/mellanox/mlxsw/spectrum_qdisc.c | 764 + .../ethernet/mellanox/mlxsw/spectrum_router.c | 7305 ++++++++ .../ethernet/mellanox/mlxsw/spectrum_router.h | 149 + 
.../ethernet/mellanox/mlxsw/spectrum_span.c | 824 + .../ethernet/mellanox/mlxsw/spectrum_span.h | 107 + .../mellanox/mlxsw/spectrum_switchdev.c | 2360 +++ .../net/ethernet/mellanox/mlxsw/switchib.c | 604 + .../net/ethernet/mellanox/mlxsw/switchx2.c | 1764 ++ drivers/net/ethernet/mellanox/mlxsw/trap.h | 108 + .../net/ethernet/mellanox/mlxsw/txheader.h | 81 + drivers/net/ethernet/qlogic/Kconfig | 124 + drivers/net/ethernet/qlogic/Makefile | 11 + drivers/net/ethernet/qlogic/netxen/Makefile | 27 + .../net/ethernet/qlogic/netxen/netxen_nic.h | 1889 ++ .../ethernet/qlogic/netxen/netxen_nic_ctx.c | 938 + .../qlogic/netxen/netxen_nic_ethtool.c | 964 + .../ethernet/qlogic/netxen/netxen_nic_hdr.h | 1079 ++ .../ethernet/qlogic/netxen/netxen_nic_hw.c | 2570 +++ .../ethernet/qlogic/netxen/netxen_nic_hw.h | 285 + .../ethernet/qlogic/netxen/netxen_nic_init.c | 1928 ++ .../ethernet/qlogic/netxen/netxen_nic_main.c | 3532 ++++ drivers/net/ethernet/qlogic/qed/Makefile | 12 + drivers/net/ethernet/qlogic/qed/qed.h | 857 + drivers/net/ethernet/qlogic/qed/qed_cxt.c | 2553 +++ drivers/net/ethernet/qlogic/qed/qed_cxt.h | 244 + drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 2408 +++ drivers/net/ethernet/qlogic/qed/qed_dcbx.h | 126 + drivers/net/ethernet/qlogic/qed/qed_debug.c | 8099 ++++++++ drivers/net/ethernet/qlogic/qed/qed_debug.h | 57 + drivers/net/ethernet/qlogic/qed/qed_dev.c | 4262 +++++ drivers/net/ethernet/qlogic/qed/qed_dev_api.h | 477 + drivers/net/ethernet/qlogic/qed/qed_fcoe.c | 1051 ++ drivers/net/ethernet/qlogic/qed/qed_fcoe.h | 79 + drivers/net/ethernet/qlogic/qed/qed_hsi.h | 12814 +++++++++++++ drivers/net/ethernet/qlogic/qed/qed_hw.c | 877 + drivers/net/ethernet/qlogic/qed/qed_hw.h | 306 + .../ethernet/qlogic/qed/qed_init_fw_funcs.c | 1511 ++ .../net/ethernet/qlogic/qed/qed_init_ops.c | 590 + .../net/ethernet/qlogic/qed/qed_init_ops.h | 134 + drivers/net/ethernet/qlogic/qed/qed_int.c | 2234 +++ drivers/net/ethernet/qlogic/qed/qed_int.h | 424 + drivers/net/ethernet/qlogic/qed/qed_iscsi.c | 1460 ++ drivers/net/ethernet/qlogic/qed/qed_iscsi.h | 89 + drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 3186 ++++ drivers/net/ethernet/qlogic/qed/qed_iwarp.h | 232 + drivers/net/ethernet/qlogic/qed/qed_l2.c | 2900 +++ drivers/net/ethernet/qlogic/qed/qed_l2.h | 450 + drivers/net/ethernet/qlogic/qed/qed_ll2.c | 2513 +++ drivers/net/ethernet/qlogic/qed/qed_ll2.h | 268 + drivers/net/ethernet/qlogic/qed/qed_main.c | 2097 +++ drivers/net/ethernet/qlogic/qed/qed_mcp.c | 3055 +++ drivers/net/ethernet/qlogic/qed/qed_mcp.h | 1002 + drivers/net/ethernet/qlogic/qed/qed_ooo.c | 497 + drivers/net/ethernet/qlogic/qed/qed_ooo.h | 198 + drivers/net/ethernet/qlogic/qed/qed_ptp.c | 452 + drivers/net/ethernet/qlogic/qed/qed_rdma.c | 1797 ++ drivers/net/ethernet/qlogic/qed/qed_rdma.h | 206 + .../net/ethernet/qlogic/qed/qed_reg_addr.h | 1631 ++ drivers/net/ethernet/qlogic/qed/qed_roce.c | 1171 ++ drivers/net/ethernet/qlogic/qed/qed_roce.h | 59 + .../net/ethernet/qlogic/qed/qed_selftest.c | 211 + .../net/ethernet/qlogic/qed/qed_selftest.h | 51 + drivers/net/ethernet/qlogic/qed/qed_sp.h | 483 + .../net/ethernet/qlogic/qed/qed_sp_commands.c | 543 + drivers/net/ethernet/qlogic/qed/qed_spq.c | 991 + drivers/net/ethernet/qlogic/qed/qed_sriov.c | 5026 +++++ drivers/net/ethernet/qlogic/qed/qed_sriov.h | 476 + drivers/net/ethernet/qlogic/qed/qed_vf.c | 1685 ++ drivers/net/ethernet/qlogic/qed/qed_vf.h | 1239 ++ drivers/net/ethernet/qlogic/qede/Makefile | 5 + drivers/net/ethernet/qlogic/qede/qede.h | 546 + 
drivers/net/ethernet/qlogic/qede/qede_dcbnl.c | 352 + .../net/ethernet/qlogic/qede/qede_ethtool.c | 1854 ++ .../net/ethernet/qlogic/qede/qede_filter.c | 1693 ++ drivers/net/ethernet/qlogic/qede/qede_fp.c | 1717 ++ drivers/net/ethernet/qlogic/qede/qede_main.c | 2179 +++ drivers/net/ethernet/qlogic/qede/qede_ptp.c | 553 + drivers/net/ethernet/qlogic/qede/qede_ptp.h | 63 + drivers/net/ethernet/qlogic/qede/qede_rdma.c | 314 + drivers/net/ethernet/qlogic/qla3xxx.c | 3949 ++++ drivers/net/ethernet/qlogic/qla3xxx.h | 1189 ++ drivers/net/ethernet/qlogic/qlcnic/Makefile | 16 + drivers/net/ethernet/qlogic/qlcnic/qlcnic.h | 2426 +++ .../ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 4237 +++++ .../ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h | 666 + .../ethernet/qlogic/qlcnic/qlcnic_83xx_init.c | 2612 +++ .../ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c | 285 + .../net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 1430 ++ .../net/ethernet/qlogic/qlcnic/qlcnic_dcb.c | 1147 ++ .../net/ethernet/qlogic/qlcnic/qlcnic_dcb.h | 121 + .../ethernet/qlogic/qlcnic/qlcnic_ethtool.c | 1892 ++ .../net/ethernet/qlogic/qlcnic/qlcnic_hdr.h | 948 + .../net/ethernet/qlogic/qlcnic/qlcnic_hw.c | 1704 ++ .../net/ethernet/qlogic/qlcnic/qlcnic_hw.h | 226 + .../net/ethernet/qlogic/qlcnic/qlcnic_init.c | 1306 ++ .../net/ethernet/qlogic/qlcnic/qlcnic_io.c | 2230 +++ .../net/ethernet/qlogic/qlcnic/qlcnic_main.c | 4362 +++++ .../ethernet/qlogic/qlcnic/qlcnic_minidump.c | 1451 ++ .../net/ethernet/qlogic/qlcnic/qlcnic_sriov.h | 278 + .../qlogic/qlcnic/qlcnic_sriov_common.c | 2232 +++ .../ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c | 2051 ++ .../net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c | 1430 ++ drivers/net/ethernet/qlogic/qlge/Makefile | 7 + drivers/net/ethernet/qlogic/qlge/qlge.h | 2354 +++ drivers/net/ethernet/qlogic/qlge/qlge_dbg.c | 2024 ++ .../net/ethernet/qlogic/qlge/qlge_ethtool.c | 735 + drivers/net/ethernet/qlogic/qlge/qlge_main.c | 5029 +++++ drivers/net/ethernet/qlogic/qlge/qlge_mpi.c | 1284 ++ drivers/nvme/Kconfig | 6 + drivers/nvme/Makefile | 3 + drivers/nvme/host/Kconfig | 59 + drivers/nvme/host/Makefile | 23 + drivers/nvme/host/core.c | 3651 ++++ drivers/nvme/host/fabrics.c | 1158 ++ drivers/nvme/host/fabrics.h | 163 + drivers/nvme/host/fault_inject.c | 79 + drivers/nvme/host/fc.c | 3365 ++++ drivers/nvme/host/lightnvm.c | 1309 ++ drivers/nvme/host/multipath.c | 248 + drivers/nvme/host/nvme.h | 547 + drivers/nvme/host/pci.c | 2767 +++ drivers/nvme/host/rdma.c | 2083 +++ drivers/nvme/host/trace.c | 130 + drivers/nvme/host/trace.h | 165 + drivers/nvme/target/Kconfig | 62 + drivers/nvme/target/Makefile | 14 + drivers/nvme/target/admin-cmd.c | 611 + drivers/nvme/target/configfs.c | 1017 + drivers/nvme/target/core.c | 1062 ++ drivers/nvme/target/discovery.c | 244 + drivers/nvme/target/fabrics-cmd.c | 264 + drivers/nvme/target/fc.c | 2550 +++ drivers/nvme/target/fcloop.c | 1387 ++ drivers/nvme/target/io-cmd.c | 230 + drivers/nvme/target/loop.c | 721 + drivers/nvme/target/nvmet.h | 341 + drivers/nvme/target/rdma.c | 1542 ++ drivers/scsi/Kconfig | 1507 ++ drivers/scsi/Makefile | 205 + drivers/scsi/cxgbi/Kconfig | 2 + drivers/scsi/cxgbi/Makefile | 4 + drivers/scsi/cxgbi/cxgb3i/Kbuild | 4 + drivers/scsi/cxgbi/cxgb3i/Kconfig | 11 + drivers/scsi/cxgbi/cxgb3i/cxgb3i.c | 1405 ++ drivers/scsi/cxgbi/cxgb3i/cxgb3i.h | 62 + drivers/scsi/cxgbi/cxgb4i/Kbuild | 4 + drivers/scsi/cxgbi/cxgb4i/Kconfig | 11 + drivers/scsi/cxgbi/cxgb4i/cxgb4i.c | 2171 +++ drivers/scsi/cxgbi/cxgb4i/cxgb4i.h | 28 + drivers/scsi/cxgbi/libcxgbi.c | 2783 +++ 
drivers/scsi/cxgbi/libcxgbi.h | 620 + drivers/scsi/qedf/Kconfig | 11 + drivers/scsi/qedf/Makefile | 5 + drivers/scsi/qedf/drv_fcoe_fw_funcs.c | 199 + drivers/scsi/qedf/drv_fcoe_fw_funcs.h | 93 + drivers/scsi/qedf/drv_scsi_fw_funcs.c | 44 + drivers/scsi/qedf/drv_scsi_fw_funcs.h | 85 + drivers/scsi/qedf/qedf.h | 545 + drivers/scsi/qedf/qedf_attr.c | 184 + drivers/scsi/qedf/qedf_dbg.c | 195 + drivers/scsi/qedf/qedf_dbg.h | 160 + drivers/scsi/qedf/qedf_debugfs.c | 460 + drivers/scsi/qedf/qedf_els.c | 966 + drivers/scsi/qedf/qedf_fip.c | 246 + drivers/scsi/qedf/qedf_hsi.h | 354 + drivers/scsi/qedf/qedf_io.c | 2098 +++ drivers/scsi/qedf/qedf_main.c | 3498 ++++ drivers/scsi/qedf/qedf_version.h | 15 + drivers/scsi/qedi/Kconfig | 12 + drivers/scsi/qedi/Makefile | 5 + drivers/scsi/qedi/qedi.h | 381 + drivers/scsi/qedi/qedi_dbg.c | 146 + drivers/scsi/qedi/qedi_dbg.h | 142 + drivers/scsi/qedi/qedi_debugfs.c | 244 + drivers/scsi/qedi/qedi_fw.c | 2233 +++ drivers/scsi/qedi/qedi_fw_api.c | 803 + drivers/scsi/qedi/qedi_fw_iscsi.h | 117 + drivers/scsi/qedi/qedi_fw_scsi.h | 55 + drivers/scsi/qedi/qedi_gbl.h | 80 + drivers/scsi/qedi/qedi_hsi.h | 52 + drivers/scsi/qedi/qedi_iscsi.c | 1622 ++ drivers/scsi/qedi/qedi_iscsi.h | 232 + drivers/scsi/qedi/qedi_main.c | 2506 +++ drivers/scsi/qedi/qedi_nvm_iscsi_cfg.h | 210 + drivers/scsi/qedi/qedi_sysfs.c | 52 + drivers/scsi/qedi/qedi_version.h | 14 + drivers/scsi/scsi_priv.h | 197 + drivers/scsi/scsi_transport_srp.c | 909 + include/linux/blk-mq-rdma.h | 10 + include/linux/cgroup_rdma.h | 53 + include/linux/mlx4/cmd.h | 334 + include/linux/mlx4/cq.h | 182 + include/linux/mlx4/device.h | 1595 ++ include/linux/mlx4/doorbell.h | 86 + include/linux/mlx4/driver.h | 117 + include/linux/mlx4/qp.h | 506 + include/linux/mlx4/srq.h | 44 + include/linux/mlx5/accel.h | 144 + include/linux/mlx5/cmd.h | 51 + include/linux/mlx5/cq.h | 205 + include/linux/mlx5/device.h | 1229 ++ include/linux/mlx5/doorbell.h | 81 + include/linux/mlx5/driver.h | 1292 ++ include/linux/mlx5/eswitch.h | 58 + include/linux/mlx5/fs.h | 188 + include/linux/mlx5/fs_helpers.h | 142 + include/linux/mlx5/mlx5_ifc.h | 9044 +++++++++ include/linux/mlx5/mlx5_ifc_fpga.h | 522 + include/linux/mlx5/port.h | 193 + include/linux/mlx5/qp.h | 643 + include/linux/mlx5/srq.h | 71 + include/linux/mlx5/transobj.h | 101 + include/linux/mlx5/vport.h | 126 + include/linux/nvme-rdma.h | 95 + include/linux/nvme.h | 1196 ++ include/linux/qed/common_hsi.h | 1391 ++ include/linux/qed/eth_common.h | 481 + include/linux/qed/fcoe_common.h | 744 + include/linux/qed/iscsi_common.h | 1572 ++ include/linux/qed/iwarp_common.h | 56 + include/linux/qed/qed_chain.h | 692 + include/linux/qed/qed_eth_if.h | 360 + include/linux/qed/qed_fcoe_if.h | 151 + include/linux/qed/qed_if.h | 1034 ++ include/linux/qed/qed_iov_if.h | 60 + include/linux/qed/qed_iscsi_if.h | 260 + include/linux/qed/qed_ll2_if.h | 299 + include/linux/qed/qed_rdma_if.h | 696 + include/linux/qed/qede_rdma.h | 94 + include/linux/qed/rdma_common.h | 73 + include/linux/qed/roce_common.h | 68 + include/linux/qed/storage_common.h | 182 + include/linux/qed/tcp_common.h | 281 + include/linux/sunrpc/addr.h | 184 + include/linux/sunrpc/auth.h | 232 + include/linux/sunrpc/auth_gss.h | 94 + include/linux/sunrpc/bc_xprt.h | 72 + include/linux/sunrpc/cache.h | 302 + include/linux/sunrpc/clnt.h | 228 + include/linux/sunrpc/debug.h | 130 + include/linux/sunrpc/gss_api.h | 165 + include/linux/sunrpc/gss_asn1.h | 81 + include/linux/sunrpc/gss_err.h | 167 + include/linux/sunrpc/gss_krb5.h | 331 + 
include/linux/sunrpc/gss_krb5_enctypes.h | 4 + include/linux/sunrpc/metrics.h | 103 + include/linux/sunrpc/msg_prot.h | 221 + include/linux/sunrpc/rpc_pipe_fs.h | 138 + include/linux/sunrpc/rpc_rdma.h | 125 + include/linux/sunrpc/sched.h | 302 + include/linux/sunrpc/stats.h | 85 + include/linux/sunrpc/svc.h | 517 + include/linux/sunrpc/svc_rdma.h | 203 + include/linux/sunrpc/svc_xprt.h | 221 + include/linux/sunrpc/svcauth.h | 186 + include/linux/sunrpc/svcauth_gss.h | 28 + include/linux/sunrpc/svcsock.h | 75 + include/linux/sunrpc/timer.h | 50 + include/linux/sunrpc/types.h | 24 + include/linux/sunrpc/xdr.h | 531 + include/linux/sunrpc/xprt.h | 488 + include/linux/sunrpc/xprtmultipath.h | 72 + include/linux/sunrpc/xprtrdma.h | 72 + include/linux/sunrpc/xprtsock.h | 91 + include/rdma/ib.h | 106 + include/rdma/ib_addr.h | 315 + include/rdma/ib_cache.h | 152 + include/rdma/ib_cm.h | 612 + include/rdma/ib_fmr_pool.h | 93 + include/rdma/ib_hdrs.h | 341 + include/rdma/ib_mad.h | 891 + include/rdma/ib_marshall.h | 55 + include/rdma/ib_pack.h | 311 + include/rdma/ib_pma.h | 157 + include/rdma/ib_sa.h | 711 + include/rdma/ib_smi.h | 175 + include/rdma/ib_umem.h | 108 + include/rdma/ib_umem_odp.h | 174 + include/rdma/ib_verbs.h | 4015 ++++ include/rdma/iw_cm.h | 262 + include/rdma/iw_portmap.h | 224 + include/rdma/mr_pool.h | 25 + include/rdma/opa_addr.h | 133 + include/rdma/opa_port_info.h | 418 + include/rdma/opa_smi.h | 151 + include/rdma/opa_vnic.h | 138 + include/rdma/rdma_cm.h | 423 + include/rdma/rdma_cm_ib.h | 54 + include/rdma/rdma_netlink.h | 102 + include/rdma/rdma_vt.h | 557 + include/rdma/rdmavt_cq.h | 99 + include/rdma/rdmavt_mr.h | 197 + include/rdma/rdmavt_qp.h | 706 + include/rdma/restrack.h | 177 + include/rdma/rw.h | 90 + include/rdma/uverbs_ioctl.h | 558 + include/rdma/uverbs_named_ioctl.h | 90 + include/rdma/uverbs_std_types.h | 114 + include/rdma/uverbs_types.h | 181 + include/scsi/iser.h | 78 + include/scsi/scsi_transport_srp.h | 145 + include/scsi/srp.h | 298 + include/uapi/linux/nvme_ioctl.h | 67 + include/uapi/linux/rds.h | 338 + include/uapi/rdma/bnxt_re-abi.h | 106 + include/uapi/rdma/cxgb3-abi.h | 82 + include/uapi/rdma/cxgb4-abi.h | 87 + include/uapi/rdma/hfi/hfi1_ioctl.h | 174 + include/uapi/rdma/hfi/hfi1_user.h | 267 + include/uapi/rdma/hns-abi.h | 71 + include/uapi/rdma/i40iw-abi.h | 107 + include/uapi/rdma/ib_user_cm.h | 326 + include/uapi/rdma/ib_user_ioctl_cmds.h | 134 + include/uapi/rdma/ib_user_ioctl_verbs.h | 102 + include/uapi/rdma/ib_user_mad.h | 234 + include/uapi/rdma/ib_user_sa.h | 77 + include/uapi/rdma/ib_user_verbs.h | 1210 ++ include/uapi/rdma/mlx4-abi.h | 191 + include/uapi/rdma/mlx5-abi.h | 444 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 48 + include/uapi/rdma/mlx5_user_ioctl_verbs.h | 43 + include/uapi/rdma/mthca-abi.h | 112 + include/uapi/rdma/nes-abi.h | 115 + include/uapi/rdma/ocrdma-abi.h | 152 + include/uapi/rdma/qedr-abi.h | 114 + include/uapi/rdma/rdma_netlink.h | 406 + include/uapi/rdma/rdma_user_cm.h | 324 + include/uapi/rdma/rdma_user_ioctl.h | 85 + include/uapi/rdma/rdma_user_ioctl_cmds.h | 99 + include/uapi/rdma/rdma_user_rxe.h | 179 + include/uapi/rdma/vmw_pvrdma-abi.h | 297 + net/rds/Kconfig | 27 + net/rds/Makefile | 18 + net/rds/af_rds.c | 707 + net/rds/bind.c | 201 + net/rds/cong.c | 425 + net/rds/connection.c | 751 + net/rds/ib.c | 478 + net/rds/ib.h | 434 + net/rds/ib_cm.c | 1026 + net/rds/ib_fmr.c | 258 + net/rds/ib_frmr.c | 378 + net/rds/ib_mr.h | 143 + net/rds/ib_rdma.c | 635 + net/rds/ib_recv.c | 1071 ++ net/rds/ib_ring.c | 
168 + net/rds/ib_send.c | 995 + net/rds/ib_stats.c | 107 + net/rds/ib_sysctl.c | 121 + net/rds/info.c | 243 + net/rds/info.h | 31 + net/rds/loop.c | 196 + net/rds/loop.h | 10 + net/rds/message.c | 517 + net/rds/page.c | 167 + net/rds/rdma.c | 884 + net/rds/rdma_transport.c | 232 + net/rds/rdma_transport.h | 20 + net/rds/rds.h | 956 + net/rds/rds_single_path.h | 31 + net/rds/recv.c | 772 + net/rds/send.c | 1367 ++ net/rds/stats.c | 152 + net/rds/sysctl.c | 110 + net/rds/tcp.c | 636 + net/rds/tcp.h | 99 + net/rds/tcp_connect.c | 189 + net/rds/tcp_listen.c | 317 + net/rds/tcp_recv.c | 348 + net/rds/tcp_send.c | 230 + net/rds/tcp_stats.c | 74 + net/rds/threads.c | 261 + net/rds/transport.c | 155 + net/sunrpc/Kconfig | 64 + net/sunrpc/xprtrdma/Makefile | 9 + net/sunrpc/xprtrdma/backchannel.c | 370 + net/sunrpc/xprtrdma/fmr_ops.c | 324 + net/sunrpc/xprtrdma/frwr_ops.c | 589 + net/sunrpc/xprtrdma/module.c | 48 + net/sunrpc/xprtrdma/rpc_rdma.c | 1414 ++ net/sunrpc/xprtrdma/svc_rdma.c | 267 + net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 362 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 587 + net/sunrpc/xprtrdma/svc_rdma_rw.c | 868 + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 699 + net/sunrpc/xprtrdma/svc_rdma_transport.c | 1048 ++ net/sunrpc/xprtrdma/transport.c | 874 + net/sunrpc/xprtrdma/verbs.c | 1640 ++ net/sunrpc/xprtrdma/xprt_rdma.h | 679 + 1256 files changed, 1001276 insertions(+) create mode 100644 .gitignore create mode 100644 block/blk-mq-rdma.c create mode 100644 drivers/infiniband/Kconfig create mode 100644 drivers/infiniband/Makefile create mode 100644 drivers/infiniband/core/Makefile create mode 100644 drivers/infiniband/core/addr.c create mode 100644 drivers/infiniband/core/agent.c create mode 100644 drivers/infiniband/core/agent.h create mode 100644 drivers/infiniband/core/cache.c create mode 100644 drivers/infiniband/core/cgroup.c create mode 100644 drivers/infiniband/core/cm.c create mode 100644 drivers/infiniband/core/cm_msgs.h create mode 100644 drivers/infiniband/core/cma.c create mode 100644 drivers/infiniband/core/cma_configfs.c create mode 100644 drivers/infiniband/core/cma_priv.h create mode 100644 drivers/infiniband/core/core_priv.h create mode 100644 drivers/infiniband/core/cq.c create mode 100644 drivers/infiniband/core/device.c create mode 100644 drivers/infiniband/core/fmr_pool.c create mode 100644 drivers/infiniband/core/iwcm.c create mode 100644 drivers/infiniband/core/iwcm.h create mode 100644 drivers/infiniband/core/iwpm_msg.c create mode 100644 drivers/infiniband/core/iwpm_util.c create mode 100644 drivers/infiniband/core/iwpm_util.h create mode 100644 drivers/infiniband/core/mad.c create mode 100644 drivers/infiniband/core/mad_priv.h create mode 100644 drivers/infiniband/core/mad_rmpp.c create mode 100644 drivers/infiniband/core/mad_rmpp.h create mode 100644 drivers/infiniband/core/mr_pool.c create mode 100644 drivers/infiniband/core/multicast.c create mode 100644 drivers/infiniband/core/netlink.c create mode 100644 drivers/infiniband/core/nldev.c create mode 100644 drivers/infiniband/core/opa_smi.h create mode 100644 drivers/infiniband/core/packer.c create mode 100644 drivers/infiniband/core/rdma_core.c create mode 100644 drivers/infiniband/core/rdma_core.h create mode 100644 drivers/infiniband/core/restrack.c create mode 100644 drivers/infiniband/core/roce_gid_mgmt.c create mode 100644 drivers/infiniband/core/rw.c create mode 100644 drivers/infiniband/core/sa.h create mode 100644 drivers/infiniband/core/sa_query.c create mode 100644 
drivers/infiniband/core/security.c create mode 100644 drivers/infiniband/core/smi.c create mode 100644 drivers/infiniband/core/smi.h create mode 100644 drivers/infiniband/core/sysfs.c create mode 100644 drivers/infiniband/core/ucm.c create mode 100644 drivers/infiniband/core/ucma.c create mode 100644 drivers/infiniband/core/ud_header.c create mode 100644 drivers/infiniband/core/umem.c create mode 100644 drivers/infiniband/core/umem_odp.c create mode 100644 drivers/infiniband/core/user_mad.c create mode 100644 drivers/infiniband/core/uverbs.h create mode 100644 drivers/infiniband/core/uverbs_cmd.c create mode 100644 drivers/infiniband/core/uverbs_ioctl.c create mode 100644 drivers/infiniband/core/uverbs_ioctl_merge.c create mode 100644 drivers/infiniband/core/uverbs_main.c create mode 100644 drivers/infiniband/core/uverbs_marshall.c create mode 100644 drivers/infiniband/core/uverbs_std_types.c create mode 100644 drivers/infiniband/core/uverbs_std_types_cq.c create mode 100644 drivers/infiniband/core/uverbs_std_types_dm.c create mode 100644 drivers/infiniband/core/uverbs_std_types_flow_action.c create mode 100644 drivers/infiniband/core/uverbs_std_types_mr.c create mode 100644 drivers/infiniband/core/verbs.c create mode 100644 drivers/infiniband/hw/Makefile create mode 100644 drivers/infiniband/hw/bnxt_re/Kconfig create mode 100644 drivers/infiniband/hw/bnxt_re/Makefile create mode 100644 drivers/infiniband/hw/bnxt_re/bnxt_re.h create mode 100644 drivers/infiniband/hw/bnxt_re/hw_counters.c create mode 100644 drivers/infiniband/hw/bnxt_re/hw_counters.h create mode 100644 drivers/infiniband/hw/bnxt_re/ib_verbs.c create mode 100644 drivers/infiniband/hw/bnxt_re/ib_verbs.h create mode 100644 drivers/infiniband/hw/bnxt_re/main.c create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_fp.c create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_fp.h create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_rcfw.c create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_rcfw.h create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_res.c create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_res.h create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_sp.c create mode 100644 drivers/infiniband/hw/bnxt_re/qplib_sp.h create mode 100644 drivers/infiniband/hw/bnxt_re/roce_hsi.h create mode 100644 drivers/infiniband/hw/cxgb3/Kconfig create mode 100644 drivers/infiniband/hw/cxgb3/Makefile create mode 100644 drivers/infiniband/hw/cxgb3/cxio_hal.c create mode 100644 drivers/infiniband/hw/cxgb3/cxio_hal.h create mode 100644 drivers/infiniband/hw/cxgb3/cxio_resource.c create mode 100644 drivers/infiniband/hw/cxgb3/cxio_resource.h create mode 100644 drivers/infiniband/hw/cxgb3/cxio_wr.h create mode 100644 drivers/infiniband/hw/cxgb3/iwch.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch.h create mode 100644 drivers/infiniband/hw/cxgb3/iwch_cm.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_cm.h create mode 100644 drivers/infiniband/hw/cxgb3/iwch_cq.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_ev.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_mem.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_provider.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_provider.h create mode 100644 drivers/infiniband/hw/cxgb3/iwch_qp.c create mode 100644 drivers/infiniband/hw/cxgb3/tcb.h create mode 100644 drivers/infiniband/hw/cxgb4/Kconfig create mode 100644 drivers/infiniband/hw/cxgb4/Makefile create mode 100644 drivers/infiniband/hw/cxgb4/cm.c create mode 100644 drivers/infiniband/hw/cxgb4/cq.c 
create mode 100644 drivers/infiniband/hw/cxgb4/device.c create mode 100644 drivers/infiniband/hw/cxgb4/ev.c create mode 100644 drivers/infiniband/hw/cxgb4/id_table.c create mode 100644 drivers/infiniband/hw/cxgb4/iw_cxgb4.h create mode 100644 drivers/infiniband/hw/cxgb4/mem.c create mode 100644 drivers/infiniband/hw/cxgb4/provider.c create mode 100644 drivers/infiniband/hw/cxgb4/qp.c create mode 100644 drivers/infiniband/hw/cxgb4/resource.c create mode 100644 drivers/infiniband/hw/cxgb4/t4.h create mode 100644 drivers/infiniband/hw/cxgb4/t4fw_ri_api.h create mode 100644 drivers/infiniband/hw/hfi1/Kconfig create mode 100644 drivers/infiniband/hw/hfi1/Makefile create mode 100644 drivers/infiniband/hw/hfi1/affinity.c create mode 100644 drivers/infiniband/hw/hfi1/affinity.h create mode 100644 drivers/infiniband/hw/hfi1/aspm.h create mode 100644 drivers/infiniband/hw/hfi1/chip.c create mode 100644 drivers/infiniband/hw/hfi1/chip.h create mode 100644 drivers/infiniband/hw/hfi1/chip_registers.h create mode 100644 drivers/infiniband/hw/hfi1/common.h create mode 100644 drivers/infiniband/hw/hfi1/debugfs.c create mode 100644 drivers/infiniband/hw/hfi1/debugfs.h create mode 100644 drivers/infiniband/hw/hfi1/device.c create mode 100644 drivers/infiniband/hw/hfi1/device.h create mode 100644 drivers/infiniband/hw/hfi1/driver.c create mode 100644 drivers/infiniband/hw/hfi1/efivar.c create mode 100644 drivers/infiniband/hw/hfi1/efivar.h create mode 100644 drivers/infiniband/hw/hfi1/eprom.c create mode 100644 drivers/infiniband/hw/hfi1/eprom.h create mode 100644 drivers/infiniband/hw/hfi1/exp_rcv.c create mode 100644 drivers/infiniband/hw/hfi1/exp_rcv.h create mode 100644 drivers/infiniband/hw/hfi1/file_ops.c create mode 100644 drivers/infiniband/hw/hfi1/firmware.c create mode 100644 drivers/infiniband/hw/hfi1/hfi.h create mode 100644 drivers/infiniband/hw/hfi1/init.c create mode 100644 drivers/infiniband/hw/hfi1/intr.c create mode 100644 drivers/infiniband/hw/hfi1/iowait.h create mode 100644 drivers/infiniband/hw/hfi1/mad.c create mode 100644 drivers/infiniband/hw/hfi1/mad.h create mode 100644 drivers/infiniband/hw/hfi1/mmu_rb.c create mode 100644 drivers/infiniband/hw/hfi1/mmu_rb.h create mode 100644 drivers/infiniband/hw/hfi1/opa_compat.h create mode 100644 drivers/infiniband/hw/hfi1/pcie.c create mode 100644 drivers/infiniband/hw/hfi1/pio.c create mode 100644 drivers/infiniband/hw/hfi1/pio.h create mode 100644 drivers/infiniband/hw/hfi1/pio_copy.c create mode 100644 drivers/infiniband/hw/hfi1/platform.c create mode 100644 drivers/infiniband/hw/hfi1/platform.h create mode 100644 drivers/infiniband/hw/hfi1/qp.c create mode 100644 drivers/infiniband/hw/hfi1/qp.h create mode 100644 drivers/infiniband/hw/hfi1/qsfp.c create mode 100644 drivers/infiniband/hw/hfi1/qsfp.h create mode 100644 drivers/infiniband/hw/hfi1/rc.c create mode 100644 drivers/infiniband/hw/hfi1/ruc.c create mode 100644 drivers/infiniband/hw/hfi1/sdma.c create mode 100644 drivers/infiniband/hw/hfi1/sdma.h create mode 100644 drivers/infiniband/hw/hfi1/sdma_txreq.h create mode 100644 drivers/infiniband/hw/hfi1/sysfs.c create mode 100644 drivers/infiniband/hw/hfi1/trace.c create mode 100644 drivers/infiniband/hw/hfi1/trace.h create mode 100644 drivers/infiniband/hw/hfi1/trace_ctxts.h create mode 100644 drivers/infiniband/hw/hfi1/trace_dbg.h create mode 100644 drivers/infiniband/hw/hfi1/trace_ibhdrs.h create mode 100644 drivers/infiniband/hw/hfi1/trace_misc.h create mode 100644 drivers/infiniband/hw/hfi1/trace_mmu.h create mode 100644 
drivers/infiniband/hw/hfi1/trace_rc.h create mode 100644 drivers/infiniband/hw/hfi1/trace_rx.h create mode 100644 drivers/infiniband/hw/hfi1/trace_tx.h create mode 100644 drivers/infiniband/hw/hfi1/uc.c create mode 100644 drivers/infiniband/hw/hfi1/ud.c create mode 100644 drivers/infiniband/hw/hfi1/user_exp_rcv.c create mode 100644 drivers/infiniband/hw/hfi1/user_exp_rcv.h create mode 100644 drivers/infiniband/hw/hfi1/user_pages.c create mode 100644 drivers/infiniband/hw/hfi1/user_sdma.c create mode 100644 drivers/infiniband/hw/hfi1/user_sdma.h create mode 100644 drivers/infiniband/hw/hfi1/verbs.c create mode 100644 drivers/infiniband/hw/hfi1/verbs.h create mode 100644 drivers/infiniband/hw/hfi1/verbs_txreq.c create mode 100644 drivers/infiniband/hw/hfi1/verbs_txreq.h create mode 100644 drivers/infiniband/hw/hfi1/vnic.h create mode 100644 drivers/infiniband/hw/hfi1/vnic_main.c create mode 100644 drivers/infiniband/hw/hfi1/vnic_sdma.c create mode 100644 drivers/infiniband/hw/hns/Kconfig create mode 100644 drivers/infiniband/hw/hns/Makefile create mode 100644 drivers/infiniband/hw/hns/hns_roce_ah.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_alloc.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_cmd.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_cmd.h create mode 100644 drivers/infiniband/hw/hns/hns_roce_common.h create mode 100644 drivers/infiniband/hw/hns/hns_roce_cq.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_db.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_device.h create mode 100644 drivers/infiniband/hw/hns/hns_roce_hem.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_hem.h create mode 100644 drivers/infiniband/hw/hns/hns_roce_hw_v1.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_hw_v1.h create mode 100644 drivers/infiniband/hw/hns/hns_roce_hw_v2.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_hw_v2.h create mode 100644 drivers/infiniband/hw/hns/hns_roce_main.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_mr.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_pd.c create mode 100644 drivers/infiniband/hw/hns/hns_roce_qp.c create mode 100644 drivers/infiniband/hw/i40iw/Kconfig create mode 100644 drivers/infiniband/hw/i40iw/Makefile create mode 100644 drivers/infiniband/hw/i40iw/i40iw.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_cm.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_cm.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_ctrl.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_d.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_hmc.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_hmc.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_hw.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_main.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_osdep.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_p.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_pble.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_pble.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_puda.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_puda.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_register.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_status.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_type.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_uk.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_user.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_utils.c create mode 100644 
drivers/infiniband/hw/i40iw/i40iw_verbs.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_verbs.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_vf.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_vf.h create mode 100644 drivers/infiniband/hw/i40iw/i40iw_virtchnl.c create mode 100644 drivers/infiniband/hw/i40iw/i40iw_virtchnl.h create mode 100644 drivers/infiniband/hw/mlx4/Kconfig create mode 100644 drivers/infiniband/hw/mlx4/Makefile create mode 100644 drivers/infiniband/hw/mlx4/ah.c create mode 100644 drivers/infiniband/hw/mlx4/alias_GUID.c create mode 100644 drivers/infiniband/hw/mlx4/cm.c create mode 100644 drivers/infiniband/hw/mlx4/cq.c create mode 100644 drivers/infiniband/hw/mlx4/doorbell.c create mode 100644 drivers/infiniband/hw/mlx4/mad.c create mode 100644 drivers/infiniband/hw/mlx4/main.c create mode 100644 drivers/infiniband/hw/mlx4/mcg.c create mode 100644 drivers/infiniband/hw/mlx4/mlx4_ib.h create mode 100644 drivers/infiniband/hw/mlx4/mr.c create mode 100644 drivers/infiniband/hw/mlx4/qp.c create mode 100644 drivers/infiniband/hw/mlx4/srq.c create mode 100644 drivers/infiniband/hw/mlx4/sysfs.c create mode 100644 drivers/infiniband/hw/mlx5/Kconfig create mode 100644 drivers/infiniband/hw/mlx5/Makefile create mode 100644 drivers/infiniband/hw/mlx5/ah.c create mode 100644 drivers/infiniband/hw/mlx5/cmd.c create mode 100644 drivers/infiniband/hw/mlx5/cmd.h create mode 100644 drivers/infiniband/hw/mlx5/cong.c create mode 100644 drivers/infiniband/hw/mlx5/cq.c create mode 100644 drivers/infiniband/hw/mlx5/doorbell.c create mode 100644 drivers/infiniband/hw/mlx5/gsi.c create mode 100644 drivers/infiniband/hw/mlx5/ib_rep.c create mode 100644 drivers/infiniband/hw/mlx5/ib_rep.h create mode 100644 drivers/infiniband/hw/mlx5/ib_virt.c create mode 100644 drivers/infiniband/hw/mlx5/mad.c create mode 100644 drivers/infiniband/hw/mlx5/main.c create mode 100644 drivers/infiniband/hw/mlx5/mem.c create mode 100644 drivers/infiniband/hw/mlx5/mlx5_ib.h create mode 100644 drivers/infiniband/hw/mlx5/mr.c create mode 100644 drivers/infiniband/hw/mlx5/odp.c create mode 100644 drivers/infiniband/hw/mlx5/qp.c create mode 100644 drivers/infiniband/hw/mlx5/srq.c create mode 100644 drivers/infiniband/hw/mthca/Kconfig create mode 100644 drivers/infiniband/hw/mthca/Makefile create mode 100644 drivers/infiniband/hw/mthca/mthca_allocator.c create mode 100644 drivers/infiniband/hw/mthca/mthca_av.c create mode 100644 drivers/infiniband/hw/mthca/mthca_catas.c create mode 100644 drivers/infiniband/hw/mthca/mthca_cmd.c create mode 100644 drivers/infiniband/hw/mthca/mthca_cmd.h create mode 100644 drivers/infiniband/hw/mthca/mthca_config_reg.h create mode 100644 drivers/infiniband/hw/mthca/mthca_cq.c create mode 100644 drivers/infiniband/hw/mthca/mthca_dev.h create mode 100644 drivers/infiniband/hw/mthca/mthca_doorbell.h create mode 100644 drivers/infiniband/hw/mthca/mthca_eq.c create mode 100644 drivers/infiniband/hw/mthca/mthca_mad.c create mode 100644 drivers/infiniband/hw/mthca/mthca_main.c create mode 100644 drivers/infiniband/hw/mthca/mthca_mcg.c create mode 100644 drivers/infiniband/hw/mthca/mthca_memfree.c create mode 100644 drivers/infiniband/hw/mthca/mthca_memfree.h create mode 100644 drivers/infiniband/hw/mthca/mthca_mr.c create mode 100644 drivers/infiniband/hw/mthca/mthca_pd.c create mode 100644 drivers/infiniband/hw/mthca/mthca_profile.c create mode 100644 drivers/infiniband/hw/mthca/mthca_profile.h create mode 100644 drivers/infiniband/hw/mthca/mthca_provider.c create mode 100644 
drivers/infiniband/hw/mthca/mthca_provider.h create mode 100644 drivers/infiniband/hw/mthca/mthca_qp.c create mode 100644 drivers/infiniband/hw/mthca/mthca_reset.c create mode 100644 drivers/infiniband/hw/mthca/mthca_srq.c create mode 100644 drivers/infiniband/hw/mthca/mthca_uar.c create mode 100644 drivers/infiniband/hw/mthca/mthca_wqe.h create mode 100644 drivers/infiniband/hw/nes/Kconfig create mode 100644 drivers/infiniband/hw/nes/Makefile create mode 100644 drivers/infiniband/hw/nes/nes.c create mode 100644 drivers/infiniband/hw/nes/nes.h create mode 100644 drivers/infiniband/hw/nes/nes_cm.c create mode 100644 drivers/infiniband/hw/nes/nes_cm.h create mode 100644 drivers/infiniband/hw/nes/nes_context.h create mode 100644 drivers/infiniband/hw/nes/nes_hw.c create mode 100644 drivers/infiniband/hw/nes/nes_hw.h create mode 100644 drivers/infiniband/hw/nes/nes_mgt.c create mode 100644 drivers/infiniband/hw/nes/nes_mgt.h create mode 100644 drivers/infiniband/hw/nes/nes_nic.c create mode 100644 drivers/infiniband/hw/nes/nes_utils.c create mode 100644 drivers/infiniband/hw/nes/nes_verbs.c create mode 100644 drivers/infiniband/hw/nes/nes_verbs.h create mode 100644 drivers/infiniband/hw/ocrdma/Kconfig create mode 100644 drivers/infiniband/hw/ocrdma/Makefile create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma.h create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma_ah.c create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma_ah.h create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma_hw.c create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma_hw.h create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma_main.c create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma_sli.h create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma_stats.c create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma_stats.h create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c create mode 100644 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h create mode 100644 drivers/infiniband/hw/qedr/Kconfig create mode 100644 drivers/infiniband/hw/qedr/Makefile create mode 100644 drivers/infiniband/hw/qedr/main.c create mode 100644 drivers/infiniband/hw/qedr/qedr.h create mode 100644 drivers/infiniband/hw/qedr/qedr_hsi_rdma.h create mode 100644 drivers/infiniband/hw/qedr/qedr_iw_cm.c create mode 100644 drivers/infiniband/hw/qedr/qedr_iw_cm.h create mode 100644 drivers/infiniband/hw/qedr/qedr_roce_cm.c create mode 100644 drivers/infiniband/hw/qedr/qedr_roce_cm.h create mode 100644 drivers/infiniband/hw/qedr/verbs.c create mode 100644 drivers/infiniband/hw/qedr/verbs.h create mode 100644 drivers/infiniband/hw/qib/Kconfig create mode 100644 drivers/infiniband/hw/qib/Makefile create mode 100644 drivers/infiniband/hw/qib/qib.h create mode 100644 drivers/infiniband/hw/qib/qib_6120_regs.h create mode 100644 drivers/infiniband/hw/qib/qib_7220.h create mode 100644 drivers/infiniband/hw/qib/qib_7220_regs.h create mode 100644 drivers/infiniband/hw/qib/qib_7322_regs.h create mode 100644 drivers/infiniband/hw/qib/qib_common.h create mode 100644 drivers/infiniband/hw/qib/qib_debugfs.c create mode 100644 drivers/infiniband/hw/qib/qib_debugfs.h create mode 100644 drivers/infiniband/hw/qib/qib_diag.c create mode 100644 drivers/infiniband/hw/qib/qib_driver.c create mode 100644 drivers/infiniband/hw/qib/qib_eeprom.c create mode 100644 drivers/infiniband/hw/qib/qib_file_ops.c create mode 100644 drivers/infiniband/hw/qib/qib_fs.c create mode 100644 drivers/infiniband/hw/qib/qib_iba6120.c create mode 100644 drivers/infiniband/hw/qib/qib_iba7220.c create 
mode 100644 drivers/infiniband/hw/qib/qib_iba7322.c create mode 100644 drivers/infiniband/hw/qib/qib_init.c create mode 100644 drivers/infiniband/hw/qib/qib_intr.c create mode 100644 drivers/infiniband/hw/qib/qib_mad.c create mode 100644 drivers/infiniband/hw/qib/qib_mad.h create mode 100644 drivers/infiniband/hw/qib/qib_pcie.c create mode 100644 drivers/infiniband/hw/qib/qib_pio_copy.c create mode 100644 drivers/infiniband/hw/qib/qib_qp.c create mode 100644 drivers/infiniband/hw/qib/qib_qsfp.c create mode 100644 drivers/infiniband/hw/qib/qib_qsfp.h create mode 100644 drivers/infiniband/hw/qib/qib_rc.c create mode 100644 drivers/infiniband/hw/qib/qib_ruc.c create mode 100644 drivers/infiniband/hw/qib/qib_sd7220.c create mode 100644 drivers/infiniband/hw/qib/qib_sdma.c create mode 100644 drivers/infiniband/hw/qib/qib_sysfs.c create mode 100644 drivers/infiniband/hw/qib/qib_twsi.c create mode 100644 drivers/infiniband/hw/qib/qib_tx.c create mode 100644 drivers/infiniband/hw/qib/qib_uc.c create mode 100644 drivers/infiniband/hw/qib/qib_ud.c create mode 100644 drivers/infiniband/hw/qib/qib_user_pages.c create mode 100644 drivers/infiniband/hw/qib/qib_user_sdma.c create mode 100644 drivers/infiniband/hw/qib/qib_user_sdma.h create mode 100644 drivers/infiniband/hw/qib/qib_verbs.c create mode 100644 drivers/infiniband/hw/qib/qib_verbs.h create mode 100644 drivers/infiniband/hw/qib/qib_wc_ppc64.c create mode 100644 drivers/infiniband/hw/qib/qib_wc_x86_64.c create mode 100644 drivers/infiniband/hw/usnic/Kconfig create mode 100644 drivers/infiniband/hw/usnic/Makefile create mode 100644 drivers/infiniband/hw/usnic/usnic.h create mode 100644 drivers/infiniband/hw/usnic/usnic_abi.h create mode 100644 drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h create mode 100644 drivers/infiniband/hw/usnic/usnic_common_util.h create mode 100644 drivers/infiniband/hw/usnic/usnic_debugfs.c create mode 100644 drivers/infiniband/hw/usnic/usnic_debugfs.h create mode 100644 drivers/infiniband/hw/usnic/usnic_fwd.c create mode 100644 drivers/infiniband/hw/usnic/usnic_fwd.h create mode 100644 drivers/infiniband/hw/usnic/usnic_ib.h create mode 100644 drivers/infiniband/hw/usnic/usnic_ib_main.c create mode 100644 drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c create mode 100644 drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h create mode 100644 drivers/infiniband/hw/usnic/usnic_ib_sysfs.c create mode 100644 drivers/infiniband/hw/usnic/usnic_ib_sysfs.h create mode 100644 drivers/infiniband/hw/usnic/usnic_ib_verbs.c create mode 100644 drivers/infiniband/hw/usnic/usnic_ib_verbs.h create mode 100644 drivers/infiniband/hw/usnic/usnic_log.h create mode 100644 drivers/infiniband/hw/usnic/usnic_transport.c create mode 100644 drivers/infiniband/hw/usnic/usnic_transport.h create mode 100644 drivers/infiniband/hw/usnic/usnic_uiom.c create mode 100644 drivers/infiniband/hw/usnic/usnic_uiom.h create mode 100644 drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c create mode 100644 drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h create mode 100644 drivers/infiniband/hw/usnic/usnic_vnic.c create mode 100644 drivers/infiniband/hw/usnic/usnic_vnic.h create mode 100644 drivers/infiniband/hw/vmw_pvrdma/Kconfig create mode 100644 drivers/infiniband/hw/vmw_pvrdma/Makefile create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma.h create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_cmd.c create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h create mode 
100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_doorbell.c create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c create mode 100644 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h create mode 100644 drivers/infiniband/sw/Makefile create mode 100644 drivers/infiniband/sw/rdmavt/Kconfig create mode 100644 drivers/infiniband/sw/rdmavt/Makefile create mode 100644 drivers/infiniband/sw/rdmavt/ah.c create mode 100644 drivers/infiniband/sw/rdmavt/ah.h create mode 100644 drivers/infiniband/sw/rdmavt/cq.c create mode 100644 drivers/infiniband/sw/rdmavt/cq.h create mode 100644 drivers/infiniband/sw/rdmavt/mad.c create mode 100644 drivers/infiniband/sw/rdmavt/mad.h create mode 100644 drivers/infiniband/sw/rdmavt/mcast.c create mode 100644 drivers/infiniband/sw/rdmavt/mcast.h create mode 100644 drivers/infiniband/sw/rdmavt/mmap.c create mode 100644 drivers/infiniband/sw/rdmavt/mmap.h create mode 100644 drivers/infiniband/sw/rdmavt/mr.c create mode 100644 drivers/infiniband/sw/rdmavt/mr.h create mode 100644 drivers/infiniband/sw/rdmavt/pd.c create mode 100644 drivers/infiniband/sw/rdmavt/pd.h create mode 100644 drivers/infiniband/sw/rdmavt/qp.c create mode 100644 drivers/infiniband/sw/rdmavt/qp.h create mode 100644 drivers/infiniband/sw/rdmavt/rc.c create mode 100644 drivers/infiniband/sw/rdmavt/srq.c create mode 100644 drivers/infiniband/sw/rdmavt/srq.h create mode 100644 drivers/infiniband/sw/rdmavt/trace.c create mode 100644 drivers/infiniband/sw/rdmavt/trace.h create mode 100644 drivers/infiniband/sw/rdmavt/trace_cq.h create mode 100644 drivers/infiniband/sw/rdmavt/trace_mr.h create mode 100644 drivers/infiniband/sw/rdmavt/trace_qp.h create mode 100644 drivers/infiniband/sw/rdmavt/trace_rc.h create mode 100644 drivers/infiniband/sw/rdmavt/trace_rvt.h create mode 100644 drivers/infiniband/sw/rdmavt/trace_tx.h create mode 100644 drivers/infiniband/sw/rdmavt/vt.c create mode 100644 drivers/infiniband/sw/rdmavt/vt.h create mode 100644 drivers/infiniband/sw/rxe/Kconfig create mode 100644 drivers/infiniband/sw/rxe/Makefile create mode 100644 drivers/infiniband/sw/rxe/rxe.c create mode 100644 drivers/infiniband/sw/rxe/rxe.h create mode 100644 drivers/infiniband/sw/rxe/rxe_av.c create mode 100644 drivers/infiniband/sw/rxe/rxe_comp.c create mode 100644 drivers/infiniband/sw/rxe/rxe_cq.c create mode 100644 drivers/infiniband/sw/rxe/rxe_hdr.h create mode 100644 drivers/infiniband/sw/rxe/rxe_hw_counters.c create mode 100644 drivers/infiniband/sw/rxe/rxe_hw_counters.h create mode 100644 drivers/infiniband/sw/rxe/rxe_icrc.c create mode 100644 drivers/infiniband/sw/rxe/rxe_loc.h create mode 100644 drivers/infiniband/sw/rxe/rxe_mcast.c create mode 100644 drivers/infiniband/sw/rxe/rxe_mmap.c create mode 100644 drivers/infiniband/sw/rxe/rxe_mr.c create mode 100644 drivers/infiniband/sw/rxe/rxe_net.c create mode 100644 drivers/infiniband/sw/rxe/rxe_net.h create mode 100644 drivers/infiniband/sw/rxe/rxe_opcode.c create mode 100644 drivers/infiniband/sw/rxe/rxe_opcode.h create mode 100644 drivers/infiniband/sw/rxe/rxe_param.h create mode 100644 drivers/infiniband/sw/rxe/rxe_pool.c create mode 100644 
drivers/infiniband/sw/rxe/rxe_pool.h create mode 100644 drivers/infiniband/sw/rxe/rxe_qp.c create mode 100644 drivers/infiniband/sw/rxe/rxe_queue.c create mode 100644 drivers/infiniband/sw/rxe/rxe_queue.h create mode 100644 drivers/infiniband/sw/rxe/rxe_recv.c create mode 100644 drivers/infiniband/sw/rxe/rxe_req.c create mode 100644 drivers/infiniband/sw/rxe/rxe_resp.c create mode 100644 drivers/infiniband/sw/rxe/rxe_srq.c create mode 100644 drivers/infiniband/sw/rxe/rxe_sysfs.c create mode 100644 drivers/infiniband/sw/rxe/rxe_task.c create mode 100644 drivers/infiniband/sw/rxe/rxe_task.h create mode 100644 drivers/infiniband/sw/rxe/rxe_verbs.c create mode 100644 drivers/infiniband/sw/rxe/rxe_verbs.h create mode 100644 drivers/infiniband/ulp/Makefile create mode 100644 drivers/infiniband/ulp/ipoib/Kconfig create mode 100644 drivers/infiniband/ulp/ipoib/Makefile create mode 100644 drivers/infiniband/ulp/ipoib/ipoib.h create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_cm.c create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_ethtool.c create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_fs.c create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_ib.c create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_main.c create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_multicast.c create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_netlink.c create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_verbs.c create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_vlan.c create mode 100644 drivers/infiniband/ulp/iser/Kconfig create mode 100644 drivers/infiniband/ulp/iser/Makefile create mode 100644 drivers/infiniband/ulp/iser/iscsi_iser.c create mode 100644 drivers/infiniband/ulp/iser/iscsi_iser.h create mode 100644 drivers/infiniband/ulp/iser/iser_initiator.c create mode 100644 drivers/infiniband/ulp/iser/iser_memory.c create mode 100644 drivers/infiniband/ulp/iser/iser_verbs.c create mode 100644 drivers/infiniband/ulp/isert/Kconfig create mode 100644 drivers/infiniband/ulp/isert/Makefile create mode 100644 drivers/infiniband/ulp/isert/ib_isert.c create mode 100644 drivers/infiniband/ulp/isert/ib_isert.h create mode 100644 drivers/infiniband/ulp/opa_vnic/Kconfig create mode 100644 drivers/infiniband/ulp/opa_vnic/Makefile create mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c create mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h create mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c create mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h create mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c create mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c create mode 100644 drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c create mode 100644 drivers/infiniband/ulp/srp/Kbuild create mode 100644 drivers/infiniband/ulp/srp/Kconfig create mode 100644 drivers/infiniband/ulp/srp/ib_srp.c create mode 100644 drivers/infiniband/ulp/srp/ib_srp.h create mode 100644 drivers/infiniband/ulp/srpt/Kconfig create mode 100644 drivers/infiniband/ulp/srpt/Makefile create mode 100644 drivers/infiniband/ulp/srpt/ib_dm_mad.h create mode 100644 drivers/infiniband/ulp/srpt/ib_srpt.c create mode 100644 drivers/infiniband/ulp/srpt/ib_srpt.h create mode 100644 drivers/net/ethernet/broadcom/Kconfig create mode 100644 drivers/net/ethernet/broadcom/Makefile create mode 100644 drivers/net/ethernet/broadcom/bnxt/Makefile create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt.h create mode 
100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_fw_hdr.h create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_nvm_defs.h create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.h create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h create mode 100644 drivers/net/ethernet/chelsio/Kconfig create mode 100644 drivers/net/ethernet/chelsio/cxgb3/Makefile create mode 100644 drivers/net/ethernet/chelsio/cxgb3/adapter.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/ael1002.c create mode 100644 drivers/net/ethernet/chelsio/cxgb3/aq100x.c create mode 100644 drivers/net/ethernet/chelsio/cxgb3/common.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c create mode 100644 drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c create mode 100644 drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/firmware_exports.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/l2t.c create mode 100644 drivers/net/ethernet/chelsio/cxgb3/l2t.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/mc5.c create mode 100644 drivers/net/ethernet/chelsio/cxgb3/regs.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/sge.c create mode 100644 drivers/net/ethernet/chelsio/cxgb3/sge_defs.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/t3_hw.c create mode 100644 drivers/net/ethernet/chelsio/cxgb3/t3cdev.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/version.h create mode 100644 drivers/net/ethernet/chelsio/cxgb3/vsc8211.c create mode 100644 drivers/net/ethernet/chelsio/cxgb3/xgmac.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/Makefile create mode 100644 drivers/net/ethernet/chelsio/cxgb4/clip_tbl.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/clip_tbl.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cudbg_common.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cudbg_lib_common.h create mode 100644 
drivers/net/ethernet/chelsio/cxgb4/cudbg_zlib.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cudbg_zlib.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_fcoe.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_fcoe.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/l2t.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/l2t.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/sched.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/sched.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/sge.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/smt.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/smt.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/srq.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/srq.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/t4_hw.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/t4_values.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h create mode 100644 drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h create mode 100644 drivers/net/ethernet/emulex/Kconfig create mode 100644 drivers/net/ethernet/emulex/Makefile create mode 100644 drivers/net/ethernet/emulex/benet/Kconfig create mode 100644 drivers/net/ethernet/emulex/benet/Makefile create mode 100644 drivers/net/ethernet/emulex/benet/be.h create mode 100644 drivers/net/ethernet/emulex/benet/be_cmds.c create mode 100644 drivers/net/ethernet/emulex/benet/be_cmds.h create mode 100644 drivers/net/ethernet/emulex/benet/be_ethtool.c create mode 100644 drivers/net/ethernet/emulex/benet/be_hw.h create mode 100644 drivers/net/ethernet/emulex/benet/be_main.c create mode 100644 drivers/net/ethernet/emulex/benet/be_roce.c create mode 100644 drivers/net/ethernet/emulex/benet/be_roce.h create 
mode 100644 drivers/net/ethernet/intel/Kconfig create mode 100644 drivers/net/ethernet/intel/i40e/Makefile create mode 100644 drivers/net/ethernet/intel/i40e/i40e.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_adminq.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_adminq.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_alloc.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_client.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_client.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_common.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_dcb.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_dcb.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_debugfs.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_devids.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_diag.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_diag.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_ethtool.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_hmc.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_hmc.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_lan_hmc.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_lan_hmc.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_main.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_nvm.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_osdep.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_prototype.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_ptp.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_register.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_status.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_trace.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_txrx.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_txrx.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_type.h create mode 100644 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h create mode 100644 drivers/net/ethernet/mellanox/Kconfig create mode 100644 drivers/net/ethernet/mellanox/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlx4/Kconfig create mode 100644 drivers/net/ethernet/mellanox/mlx4/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlx4/alloc.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/catas.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/cmd.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/cq.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_clock.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_cq.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_main.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_netdev.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_port.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_port.h create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_resources.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_rx.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_selftest.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/en_tx.c create mode 100644 
drivers/net/ethernet/mellanox/mlx4/eq.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/fw.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/fw.h create mode 100644 drivers/net/ethernet/mellanox/mlx4/fw_qos.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/fw_qos.h create mode 100644 drivers/net/ethernet/mellanox/mlx4/icm.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/icm.h create mode 100644 drivers/net/ethernet/mellanox/mlx4/intf.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/main.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/mcg.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/mlx4.h create mode 100644 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h create mode 100644 drivers/net/ethernet/mellanox/mlx4/mlx4_stats.h create mode 100644 drivers/net/ethernet/mellanox/mlx4/mr.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/pd.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/port.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/profile.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/qp.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/reset.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/resource_tracker.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/sense.c create mode 100644 drivers/net/ethernet/mellanox/mlx4/srq.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/Kconfig create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/alloc.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/cmd.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/cq.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/debugfs.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/dev.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_common.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_main.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h create mode 100644 
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/eq.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fw.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/health.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lag.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/mad.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/main.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/mcg.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/mr.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/pd.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/port.c create mode 100644 
drivers/net/ethernet/mellanox/mlx5/core/qp.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/rl.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sriov.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/srq.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/transobj.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/uar.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vport.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vxlan.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/wq.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/wq.h create mode 100644 drivers/net/ethernet/mellanox/mlxfw/Kconfig create mode 100644 drivers/net/ethernet/mellanox/mlxfw/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlxfw/mlxfw.h create mode 100644 drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c create mode 100644 drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c create mode 100644 drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h create mode 100644 drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h create mode 100644 drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h create mode 100644 drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h create mode 100644 drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c create mode 100644 drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/Kconfig create mode 100644 drivers/net/ethernet/mellanox/mlxsw/Makefile create mode 100644 drivers/net/ethernet/mellanox/mlxsw/cmd.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/emad.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/i2c.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/i2c.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/ib.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/item.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/minimal.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/pci.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/pci.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/pci_hw.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/port.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/reg.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/resources.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c create mode 100644 
drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_fid.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/switchib.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/switchx2.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/trap.h create mode 100644 drivers/net/ethernet/mellanox/mlxsw/txheader.h create mode 100644 drivers/net/ethernet/qlogic/Kconfig create mode 100644 drivers/net/ethernet/qlogic/Makefile create mode 100644 drivers/net/ethernet/qlogic/netxen/Makefile create mode 100644 drivers/net/ethernet/qlogic/netxen/netxen_nic.h create mode 100644 drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c create mode 100644 drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c create mode 100644 drivers/net/ethernet/qlogic/netxen/netxen_nic_hdr.h create mode 100644 drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c create mode 100644 drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.h create mode 100644 drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c create mode 100644 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c create mode 100644 drivers/net/ethernet/qlogic/qed/Makefile create mode 100644 drivers/net/ethernet/qlogic/qed/qed.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_cxt.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_cxt.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_dcbx.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_dcbx.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_debug.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_debug.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_dev.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_dev_api.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_fcoe.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_fcoe.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_hsi.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_hw.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_hw.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_init_ops.c create 
mode 100644 drivers/net/ethernet/qlogic/qed/qed_init_ops.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_int.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_int.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_iscsi.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_iscsi.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_iwarp.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_iwarp.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_l2.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_l2.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ll2.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ll2.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_main.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_mcp.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_mcp.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ooo.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ooo.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_ptp.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_rdma.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_rdma.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_roce.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_roce.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_selftest.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_selftest.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_sp.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_spq.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_sriov.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_sriov.h create mode 100644 drivers/net/ethernet/qlogic/qed/qed_vf.c create mode 100644 drivers/net/ethernet/qlogic/qed/qed_vf.h create mode 100644 drivers/net/ethernet/qlogic/qede/Makefile create mode 100644 drivers/net/ethernet/qlogic/qede/qede.h create mode 100644 drivers/net/ethernet/qlogic/qede/qede_dcbnl.c create mode 100644 drivers/net/ethernet/qlogic/qede/qede_ethtool.c create mode 100644 drivers/net/ethernet/qlogic/qede/qede_filter.c create mode 100644 drivers/net/ethernet/qlogic/qede/qede_fp.c create mode 100644 drivers/net/ethernet/qlogic/qede/qede_main.c create mode 100644 drivers/net/ethernet/qlogic/qede/qede_ptp.c create mode 100644 drivers/net/ethernet/qlogic/qede/qede_ptp.h create mode 100644 drivers/net/ethernet/qlogic/qede/qede_rdma.c create mode 100644 drivers/net/ethernet/qlogic/qla3xxx.c create mode 100644 drivers/net/ethernet/qlogic/qla3xxx.h create mode 100644 drivers/net/ethernet/qlogic/qlcnic/Makefile create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic.h create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_hdr.h create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c create mode 100644 
drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c create mode 100644 drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c create mode 100644 drivers/net/ethernet/qlogic/qlge/Makefile create mode 100644 drivers/net/ethernet/qlogic/qlge/qlge.h create mode 100644 drivers/net/ethernet/qlogic/qlge/qlge_dbg.c create mode 100644 drivers/net/ethernet/qlogic/qlge/qlge_ethtool.c create mode 100644 drivers/net/ethernet/qlogic/qlge/qlge_main.c create mode 100644 drivers/net/ethernet/qlogic/qlge/qlge_mpi.c create mode 100644 drivers/nvme/Kconfig create mode 100644 drivers/nvme/Makefile create mode 100644 drivers/nvme/host/Kconfig create mode 100644 drivers/nvme/host/Makefile create mode 100644 drivers/nvme/host/core.c create mode 100644 drivers/nvme/host/fabrics.c create mode 100644 drivers/nvme/host/fabrics.h create mode 100644 drivers/nvme/host/fault_inject.c create mode 100644 drivers/nvme/host/fc.c create mode 100644 drivers/nvme/host/lightnvm.c create mode 100644 drivers/nvme/host/multipath.c create mode 100644 drivers/nvme/host/nvme.h create mode 100644 drivers/nvme/host/pci.c create mode 100644 drivers/nvme/host/rdma.c create mode 100644 drivers/nvme/host/trace.c create mode 100644 drivers/nvme/host/trace.h create mode 100644 drivers/nvme/target/Kconfig create mode 100644 drivers/nvme/target/Makefile create mode 100644 drivers/nvme/target/admin-cmd.c create mode 100644 drivers/nvme/target/configfs.c create mode 100644 drivers/nvme/target/core.c create mode 100644 drivers/nvme/target/discovery.c create mode 100644 drivers/nvme/target/fabrics-cmd.c create mode 100644 drivers/nvme/target/fc.c create mode 100644 drivers/nvme/target/fcloop.c create mode 100644 drivers/nvme/target/io-cmd.c create mode 100644 drivers/nvme/target/loop.c create mode 100644 drivers/nvme/target/nvmet.h create mode 100644 drivers/nvme/target/rdma.c create mode 100644 drivers/scsi/Kconfig create mode 100644 drivers/scsi/Makefile create mode 100644 drivers/scsi/cxgbi/Kconfig create mode 100644 drivers/scsi/cxgbi/Makefile create mode 100644 drivers/scsi/cxgbi/cxgb3i/Kbuild create mode 100644 drivers/scsi/cxgbi/cxgb3i/Kconfig create mode 100644 drivers/scsi/cxgbi/cxgb3i/cxgb3i.c create mode 100644 drivers/scsi/cxgbi/cxgb3i/cxgb3i.h create mode 100644 drivers/scsi/cxgbi/cxgb4i/Kbuild create mode 100644 drivers/scsi/cxgbi/cxgb4i/Kconfig create mode 100644 drivers/scsi/cxgbi/cxgb4i/cxgb4i.c create mode 100644 drivers/scsi/cxgbi/cxgb4i/cxgb4i.h create mode 100644 drivers/scsi/cxgbi/libcxgbi.c create mode 100644 drivers/scsi/cxgbi/libcxgbi.h create mode 100644 drivers/scsi/qedf/Kconfig create mode 100644 drivers/scsi/qedf/Makefile create mode 100644 drivers/scsi/qedf/drv_fcoe_fw_funcs.c create mode 100644 drivers/scsi/qedf/drv_fcoe_fw_funcs.h create mode 100644 drivers/scsi/qedf/drv_scsi_fw_funcs.c create mode 100644 drivers/scsi/qedf/drv_scsi_fw_funcs.h create mode 100644 drivers/scsi/qedf/qedf.h create mode 100644 drivers/scsi/qedf/qedf_attr.c create mode 100644 drivers/scsi/qedf/qedf_dbg.c create mode 100644 drivers/scsi/qedf/qedf_dbg.h create mode 
100644 drivers/scsi/qedf/qedf_debugfs.c create mode 100644 drivers/scsi/qedf/qedf_els.c create mode 100644 drivers/scsi/qedf/qedf_fip.c create mode 100644 drivers/scsi/qedf/qedf_hsi.h create mode 100644 drivers/scsi/qedf/qedf_io.c create mode 100644 drivers/scsi/qedf/qedf_main.c create mode 100644 drivers/scsi/qedf/qedf_version.h create mode 100644 drivers/scsi/qedi/Kconfig create mode 100644 drivers/scsi/qedi/Makefile create mode 100644 drivers/scsi/qedi/qedi.h create mode 100644 drivers/scsi/qedi/qedi_dbg.c create mode 100644 drivers/scsi/qedi/qedi_dbg.h create mode 100644 drivers/scsi/qedi/qedi_debugfs.c create mode 100644 drivers/scsi/qedi/qedi_fw.c create mode 100644 drivers/scsi/qedi/qedi_fw_api.c create mode 100644 drivers/scsi/qedi/qedi_fw_iscsi.h create mode 100644 drivers/scsi/qedi/qedi_fw_scsi.h create mode 100644 drivers/scsi/qedi/qedi_gbl.h create mode 100644 drivers/scsi/qedi/qedi_hsi.h create mode 100644 drivers/scsi/qedi/qedi_iscsi.c create mode 100644 drivers/scsi/qedi/qedi_iscsi.h create mode 100644 drivers/scsi/qedi/qedi_main.c create mode 100644 drivers/scsi/qedi/qedi_nvm_iscsi_cfg.h create mode 100644 drivers/scsi/qedi/qedi_sysfs.c create mode 100644 drivers/scsi/qedi/qedi_version.h create mode 100644 drivers/scsi/scsi_priv.h create mode 100644 drivers/scsi/scsi_transport_srp.c create mode 100644 include/linux/blk-mq-rdma.h create mode 100644 include/linux/cgroup_rdma.h create mode 100644 include/linux/mlx4/cmd.h create mode 100644 include/linux/mlx4/cq.h create mode 100644 include/linux/mlx4/device.h create mode 100644 include/linux/mlx4/doorbell.h create mode 100644 include/linux/mlx4/driver.h create mode 100644 include/linux/mlx4/qp.h create mode 100644 include/linux/mlx4/srq.h create mode 100644 include/linux/mlx5/accel.h create mode 100644 include/linux/mlx5/cmd.h create mode 100644 include/linux/mlx5/cq.h create mode 100644 include/linux/mlx5/device.h create mode 100644 include/linux/mlx5/doorbell.h create mode 100644 include/linux/mlx5/driver.h create mode 100644 include/linux/mlx5/eswitch.h create mode 100644 include/linux/mlx5/fs.h create mode 100644 include/linux/mlx5/fs_helpers.h create mode 100644 include/linux/mlx5/mlx5_ifc.h create mode 100644 include/linux/mlx5/mlx5_ifc_fpga.h create mode 100644 include/linux/mlx5/port.h create mode 100644 include/linux/mlx5/qp.h create mode 100644 include/linux/mlx5/srq.h create mode 100644 include/linux/mlx5/transobj.h create mode 100644 include/linux/mlx5/vport.h create mode 100644 include/linux/nvme-rdma.h create mode 100644 include/linux/nvme.h create mode 100644 include/linux/qed/common_hsi.h create mode 100644 include/linux/qed/eth_common.h create mode 100644 include/linux/qed/fcoe_common.h create mode 100644 include/linux/qed/iscsi_common.h create mode 100644 include/linux/qed/iwarp_common.h create mode 100644 include/linux/qed/qed_chain.h create mode 100644 include/linux/qed/qed_eth_if.h create mode 100644 include/linux/qed/qed_fcoe_if.h create mode 100644 include/linux/qed/qed_if.h create mode 100644 include/linux/qed/qed_iov_if.h create mode 100644 include/linux/qed/qed_iscsi_if.h create mode 100644 include/linux/qed/qed_ll2_if.h create mode 100644 include/linux/qed/qed_rdma_if.h create mode 100644 include/linux/qed/qede_rdma.h create mode 100644 include/linux/qed/rdma_common.h create mode 100644 include/linux/qed/roce_common.h create mode 100644 include/linux/qed/storage_common.h create mode 100644 include/linux/qed/tcp_common.h create mode 100644 include/linux/sunrpc/addr.h create mode 100644 
include/linux/sunrpc/auth.h create mode 100644 include/linux/sunrpc/auth_gss.h create mode 100644 include/linux/sunrpc/bc_xprt.h create mode 100644 include/linux/sunrpc/cache.h create mode 100644 include/linux/sunrpc/clnt.h create mode 100644 include/linux/sunrpc/debug.h create mode 100644 include/linux/sunrpc/gss_api.h create mode 100644 include/linux/sunrpc/gss_asn1.h create mode 100644 include/linux/sunrpc/gss_err.h create mode 100644 include/linux/sunrpc/gss_krb5.h create mode 100644 include/linux/sunrpc/gss_krb5_enctypes.h create mode 100644 include/linux/sunrpc/metrics.h create mode 100644 include/linux/sunrpc/msg_prot.h create mode 100644 include/linux/sunrpc/rpc_pipe_fs.h create mode 100644 include/linux/sunrpc/rpc_rdma.h create mode 100644 include/linux/sunrpc/sched.h create mode 100644 include/linux/sunrpc/stats.h create mode 100644 include/linux/sunrpc/svc.h create mode 100644 include/linux/sunrpc/svc_rdma.h create mode 100644 include/linux/sunrpc/svc_xprt.h create mode 100644 include/linux/sunrpc/svcauth.h create mode 100644 include/linux/sunrpc/svcauth_gss.h create mode 100644 include/linux/sunrpc/svcsock.h create mode 100644 include/linux/sunrpc/timer.h create mode 100644 include/linux/sunrpc/types.h create mode 100644 include/linux/sunrpc/xdr.h create mode 100644 include/linux/sunrpc/xprt.h create mode 100644 include/linux/sunrpc/xprtmultipath.h create mode 100644 include/linux/sunrpc/xprtrdma.h create mode 100644 include/linux/sunrpc/xprtsock.h create mode 100644 include/rdma/ib.h create mode 100644 include/rdma/ib_addr.h create mode 100644 include/rdma/ib_cache.h create mode 100644 include/rdma/ib_cm.h create mode 100644 include/rdma/ib_fmr_pool.h create mode 100644 include/rdma/ib_hdrs.h create mode 100644 include/rdma/ib_mad.h create mode 100644 include/rdma/ib_marshall.h create mode 100644 include/rdma/ib_pack.h create mode 100644 include/rdma/ib_pma.h create mode 100644 include/rdma/ib_sa.h create mode 100644 include/rdma/ib_smi.h create mode 100644 include/rdma/ib_umem.h create mode 100644 include/rdma/ib_umem_odp.h create mode 100644 include/rdma/ib_verbs.h create mode 100644 include/rdma/iw_cm.h create mode 100644 include/rdma/iw_portmap.h create mode 100644 include/rdma/mr_pool.h create mode 100644 include/rdma/opa_addr.h create mode 100644 include/rdma/opa_port_info.h create mode 100644 include/rdma/opa_smi.h create mode 100644 include/rdma/opa_vnic.h create mode 100644 include/rdma/rdma_cm.h create mode 100644 include/rdma/rdma_cm_ib.h create mode 100644 include/rdma/rdma_netlink.h create mode 100644 include/rdma/rdma_vt.h create mode 100644 include/rdma/rdmavt_cq.h create mode 100644 include/rdma/rdmavt_mr.h create mode 100644 include/rdma/rdmavt_qp.h create mode 100644 include/rdma/restrack.h create mode 100644 include/rdma/rw.h create mode 100644 include/rdma/uverbs_ioctl.h create mode 100644 include/rdma/uverbs_named_ioctl.h create mode 100644 include/rdma/uverbs_std_types.h create mode 100644 include/rdma/uverbs_types.h create mode 100644 include/scsi/iser.h create mode 100644 include/scsi/scsi_transport_srp.h create mode 100644 include/scsi/srp.h create mode 100644 include/uapi/linux/nvme_ioctl.h create mode 100644 include/uapi/linux/rds.h create mode 100644 include/uapi/rdma/bnxt_re-abi.h create mode 100644 include/uapi/rdma/cxgb3-abi.h create mode 100644 include/uapi/rdma/cxgb4-abi.h create mode 100644 include/uapi/rdma/hfi/hfi1_ioctl.h create mode 100644 include/uapi/rdma/hfi/hfi1_user.h create mode 100644 include/uapi/rdma/hns-abi.h create mode 100644 
include/uapi/rdma/i40iw-abi.h create mode 100644 include/uapi/rdma/ib_user_cm.h create mode 100644 include/uapi/rdma/ib_user_ioctl_cmds.h create mode 100644 include/uapi/rdma/ib_user_ioctl_verbs.h create mode 100644 include/uapi/rdma/ib_user_mad.h create mode 100644 include/uapi/rdma/ib_user_sa.h create mode 100644 include/uapi/rdma/ib_user_verbs.h create mode 100644 include/uapi/rdma/mlx4-abi.h create mode 100644 include/uapi/rdma/mlx5-abi.h create mode 100644 include/uapi/rdma/mlx5_user_ioctl_cmds.h create mode 100644 include/uapi/rdma/mlx5_user_ioctl_verbs.h create mode 100644 include/uapi/rdma/mthca-abi.h create mode 100644 include/uapi/rdma/nes-abi.h create mode 100644 include/uapi/rdma/ocrdma-abi.h create mode 100644 include/uapi/rdma/qedr-abi.h create mode 100644 include/uapi/rdma/rdma_netlink.h create mode 100644 include/uapi/rdma/rdma_user_cm.h create mode 100644 include/uapi/rdma/rdma_user_ioctl.h create mode 100644 include/uapi/rdma/rdma_user_ioctl_cmds.h create mode 100644 include/uapi/rdma/rdma_user_rxe.h create mode 100644 include/uapi/rdma/vmw_pvrdma-abi.h create mode 100644 net/rds/Kconfig create mode 100644 net/rds/Makefile create mode 100644 net/rds/af_rds.c create mode 100644 net/rds/bind.c create mode 100644 net/rds/cong.c create mode 100644 net/rds/connection.c create mode 100644 net/rds/ib.c create mode 100644 net/rds/ib.h create mode 100644 net/rds/ib_cm.c create mode 100644 net/rds/ib_fmr.c create mode 100644 net/rds/ib_frmr.c create mode 100644 net/rds/ib_mr.h create mode 100644 net/rds/ib_rdma.c create mode 100644 net/rds/ib_recv.c create mode 100644 net/rds/ib_ring.c create mode 100644 net/rds/ib_send.c create mode 100644 net/rds/ib_stats.c create mode 100644 net/rds/ib_sysctl.c create mode 100644 net/rds/info.c create mode 100644 net/rds/info.h create mode 100644 net/rds/loop.c create mode 100644 net/rds/loop.h create mode 100644 net/rds/message.c create mode 100644 net/rds/page.c create mode 100644 net/rds/rdma.c create mode 100644 net/rds/rdma_transport.c create mode 100644 net/rds/rdma_transport.h create mode 100644 net/rds/rds.h create mode 100644 net/rds/rds_single_path.h create mode 100644 net/rds/recv.c create mode 100644 net/rds/send.c create mode 100644 net/rds/stats.c create mode 100644 net/rds/sysctl.c create mode 100644 net/rds/tcp.c create mode 100644 net/rds/tcp.h create mode 100644 net/rds/tcp_connect.c create mode 100644 net/rds/tcp_listen.c create mode 100644 net/rds/tcp_recv.c create mode 100644 net/rds/tcp_send.c create mode 100644 net/rds/tcp_stats.c create mode 100644 net/rds/threads.c create mode 100644 net/rds/transport.c create mode 100644 net/sunrpc/Kconfig create mode 100644 net/sunrpc/xprtrdma/Makefile create mode 100644 net/sunrpc/xprtrdma/backchannel.c create mode 100644 net/sunrpc/xprtrdma/fmr_ops.c create mode 100644 net/sunrpc/xprtrdma/frwr_ops.c create mode 100644 net/sunrpc/xprtrdma/module.c create mode 100644 net/sunrpc/xprtrdma/rpc_rdma.c create mode 100644 net/sunrpc/xprtrdma/svc_rdma.c create mode 100644 net/sunrpc/xprtrdma/svc_rdma_backchannel.c create mode 100644 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c create mode 100644 net/sunrpc/xprtrdma/svc_rdma_rw.c create mode 100644 net/sunrpc/xprtrdma/svc_rdma_sendto.c create mode 100644 net/sunrpc/xprtrdma/svc_rdma_transport.c create mode 100644 net/sunrpc/xprtrdma/transport.c create mode 100644 net/sunrpc/xprtrdma/verbs.c create mode 100644 net/sunrpc/xprtrdma/xprt_rdma.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..97ba6b7 --- /dev/null +++ 
b/.gitignore @@ -0,0 +1,134 @@ +# +# NOTE! Don't add files that are generated in specific +# subdirectories here. Add them in the ".gitignore" file +# in that subdirectory instead. +# +# NOTE! Please use 'git ls-files -i --exclude-standard' +# command after changing this file, to see if there are +# any tracked files which get ignored after the change. +# +# Normal rules (sorted alphabetically) +# +.* +*.a +*.asn1.[ch] +*.bin +*.bz2 +*.c.[012]*.* +*.dtb +*.dtb.S +*.dwo +*.elf +*.gcno +*.gz +*.i +*.ko +*.lex.c +*.ll +*.lst +*.lz4 +*.lzma +*.lzo +*.mod.c +*.o +*.o.* +*.order +*.patch +*.s +*.so +*.so.dbg +*.su +*.symtypes +*.tab.[ch] +*.tar +*.xz +Module.symvers +modules.builtin + +# +# Top-level generic files +# +/tags +/TAGS +/linux +/vmlinux +/vmlinux.32 +/vmlinux-gdb.py +/vmlinuz +/System.map +/Module.markers + +# +# RPM spec file (make rpm-pkg) +# +/*.spec + +# +# Debian directory (make deb-pkg) +# +/debian/ + +# +# Snap directory (make snap-pkg) +# +/snap/ + +# +# tar directory (make tar*-pkg) +# +/tar-install/ + +# +# git files that we don't want to ignore even if they are dot-files +# +!.gitignore +!.mailmap +!.cocciconfig +!.clang-format + +# +# Generated include files +# +include/config +include/generated +include/ksym +arch/*/include/generated + +# stgit generated dirs +patches-* + +# quilt's files +patches +series + +# cscope files +cscope.* +ncscope.* + +# gnu global files +GPATH +GRTAGS +GSYMS +GTAGS + +# id-utils files +ID + +*.orig +*~ +\#*# + +# +# Leavings from module signing +# +extra_certificates +signing_key.pem +signing_key.priv +signing_key.x509 +x509.genkey + +# Kconfig presets +all.config + +# Kdevelop4 +*.kdev4 diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c new file mode 100644 index 0000000..996167f --- /dev/null +++ b/block/blk-mq-rdma.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 Sagi Grimberg. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include +#include +#include + +/** + * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device + * @set: tagset to provide the mapping for + * @dev: rdma device associated with @set. + * @first_vec: first interrupt vectors to use for queues (usually 0) + * + * This function assumes the rdma device @dev has at least as many available + * interrupt vetors as @set has queues. It will then query it's affinity mask + * and built queue mapping that maps a queue to the CPUs that have irq affinity + * for the corresponding vector. + * + * In case either the driver passed a @dev with less vectors than + * @set->nr_hw_queues, or @dev does not provide an affinity mask for a + * vector, we fallback to the naive mapping. 
+ */ +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = ib_get_vector_affinity(dev, first_vec + queue); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; + +fallback: + return blk_mq_map_queues(set); +} +EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig new file mode 100644 index 0000000..2a972ed --- /dev/null +++ b/drivers/infiniband/Kconfig @@ -0,0 +1,111 @@ +menuconfig INFINIBAND + tristate "InfiniBand support" + depends on HAS_IOMEM && HAS_DMA + depends on NET + depends on INET + depends on m || IPV6 != m + depends on !ALPHA + select IRQ_POLL + ---help--- + Core support for InfiniBand (IB). Make sure to also select + any protocols you wish to use as well as drivers for your + InfiniBand hardware. + +if INFINIBAND + +config INFINIBAND_USER_MAD + tristate "InfiniBand userspace MAD support" + depends on INFINIBAND + ---help--- + Userspace InfiniBand Management Datagram (MAD) support. This + is the kernel side of the userspace MAD support, which allows + userspace processes to send and receive MADs. You will also + need libibumad from rdma-core + . + +config INFINIBAND_USER_ACCESS + tristate "InfiniBand userspace access (verbs and CM)" + select ANON_INODES + ---help--- + Userspace InfiniBand access support. This enables the + kernel side of userspace verbs and the userspace + communication manager (CM). This allows userspace processes + to set up connections and directly access InfiniBand + hardware for fast-path operations. You will also need + libibverbs, libibcm and a hardware driver library from + rdma-core . + +config INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI + bool "Allow experimental legacy verbs in new ioctl uAPI (EXPERIMENTAL)" + depends on INFINIBAND_USER_ACCESS + ---help--- + IOCTL based uAPI support for Infiniband is enabled by default for + new verbs only. This allows userspace to invoke the IOCTL based uAPI + for current legacy verbs too. + +config INFINIBAND_USER_MEM + bool + depends on INFINIBAND_USER_ACCESS != n + depends on MMU + default y + +config INFINIBAND_ON_DEMAND_PAGING + bool "InfiniBand on-demand paging support" + depends on INFINIBAND_USER_MEM + select MMU_NOTIFIER + default y + ---help--- + On demand paging support for the InfiniBand subsystem. + Together with driver support this allows registration of + memory regions without pinning their pages, fetching the + pages on demand instead. + +config INFINIBAND_ADDR_TRANS + bool "RDMA/CM" + depends on INFINIBAND + default y + ---help--- + Support for RDMA communication manager (CM). + This allows for a generic connection abstraction over RDMA. + +config INFINIBAND_ADDR_TRANS_CONFIGFS + bool + depends on INFINIBAND_ADDR_TRANS && CONFIGFS_FS && !(INFINIBAND=y && CONFIGFS_FS=m) + default y + ---help--- + ConfigFS support for RDMA communication manager (CM). + This allows the user to config the default GID type that the CM + uses for each device, when initiaing new connections. 
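[Editor's note] The blk_mq_rdma_map_queues() helper added in block/blk-mq-rdma.c above is normally wired into a block driver's .map_queues callback so that hardware queues follow the RDMA device's completion-vector affinity. The sketch below shows that wiring for a hypothetical driver; the example_ctrl structure, its ibdev field and example_mq_ops are illustrative assumptions, not part of this patch (drivers/nvme/host/rdma.c in this tree uses the same pattern).

#include <linux/blk-mq.h>
#include <linux/blk-mq-rdma.h>
#include <rdma/ib_verbs.h>

/* Hypothetical controller; only the fields this sketch needs are shown. */
struct example_ctrl {
	struct ib_device *ibdev;	/* RDMA device backing the queues */
	struct blk_mq_tag_set tag_set;
};

static int example_map_queues(struct blk_mq_tag_set *set)
{
	struct example_ctrl *ctrl = set->driver_data;

	/*
	 * Map each hw queue to the CPUs that have IRQ affinity for the
	 * matching completion vector; blk_mq_rdma_map_queues() falls back
	 * to blk_mq_map_queues() when no affinity mask is available.
	 */
	return blk_mq_rdma_map_queues(set, ctrl->ibdev, 0 /* first_vec */);
}

static const struct blk_mq_ops example_mq_ops = {
	.map_queues	= example_map_queues,
	/* .queue_rq and the remaining ops are omitted from this sketch. */
};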
+ +source "drivers/infiniband/hw/mthca/Kconfig" +source "drivers/infiniband/hw/qib/Kconfig" +source "drivers/infiniband/hw/cxgb3/Kconfig" +source "drivers/infiniband/hw/cxgb4/Kconfig" +source "drivers/infiniband/hw/i40iw/Kconfig" +source "drivers/infiniband/hw/mlx4/Kconfig" +source "drivers/infiniband/hw/mlx5/Kconfig" +source "drivers/infiniband/hw/nes/Kconfig" +source "drivers/infiniband/hw/ocrdma/Kconfig" +source "drivers/infiniband/hw/vmw_pvrdma/Kconfig" +source "drivers/infiniband/hw/usnic/Kconfig" +source "drivers/infiniband/hw/hns/Kconfig" + +source "drivers/infiniband/ulp/ipoib/Kconfig" + +source "drivers/infiniband/ulp/srp/Kconfig" +source "drivers/infiniband/ulp/srpt/Kconfig" + +source "drivers/infiniband/ulp/iser/Kconfig" +source "drivers/infiniband/ulp/isert/Kconfig" + +source "drivers/infiniband/ulp/opa_vnic/Kconfig" +source "drivers/infiniband/sw/rdmavt/Kconfig" +source "drivers/infiniband/sw/rxe/Kconfig" + +source "drivers/infiniband/hw/hfi1/Kconfig" + +source "drivers/infiniband/hw/qedr/Kconfig" + +source "drivers/infiniband/hw/bnxt_re/Kconfig" + +endif # INFINIBAND diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile new file mode 100644 index 0000000..fad0b44 --- /dev/null +++ b/drivers/infiniband/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_INFINIBAND) += core/ +obj-$(CONFIG_INFINIBAND) += hw/ +obj-$(CONFIG_INFINIBAND) += ulp/ +obj-$(CONFIG_INFINIBAND) += sw/ diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile new file mode 100644 index 0000000..dda9e85 --- /dev/null +++ b/drivers/infiniband/core/Makefile @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: GPL-2.0 +infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o +user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o + +obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \ + $(infiniband-y) +obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o +obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ + $(user_access-y) + +ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ + device.o fmr_pool.o cache.o netlink.o \ + roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ + multicast.o mad.o smi.o agent.o mad_rmpp.o \ + security.o nldev.o restrack.o + +ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o +ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o +ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o + +ib_cm-y := cm.o + +iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o + +rdma_cm-y := cma.o + +rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o + +rdma_ucm-y := ucma.o + +ib_umad-y := user_mad.o + +ib_ucm-y := ucm.o + +ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ + rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ + uverbs_ioctl_merge.o uverbs_std_types_cq.o \ + uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ + uverbs_std_types_mr.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c new file mode 100644 index 0000000..88a7542 --- /dev/null +++ b/drivers/infiniband/core/addr.c @@ -0,0 +1,843 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core_priv.h" + +struct addr_req { + struct list_head list; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; + struct rdma_dev_addr *addr; + struct rdma_addr_client *client; + void *context; + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context); + unsigned long timeout; + struct delayed_work work; + int status; + u32 seq; +}; + +static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0); + +static void process_req(struct work_struct *work); + +static DEFINE_MUTEX(lock); +static LIST_HEAD(req_list); +static DECLARE_DELAYED_WORK(work, process_req); +static struct workqueue_struct *addr_wq; + +static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = { + [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid)}, +}; + +static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) +{ + struct nlattr *tb[LS_NLA_TYPE_MAX] = {}; + int ret; + + if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) + return false; + + ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_addr_policy, NULL); + if (ret) + return false; + + return true; +} + +static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) +{ + const struct nlattr *head, *curr; + union ib_gid gid; + struct addr_req *req; + int len, rem; + int found = 0; + + head = (const struct nlattr *)nlmsg_data(nlh); + len = nlmsg_len(nlh); + + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type == LS_NLA_TYPE_DGID) + memcpy(&gid, nla_data(curr), nla_len(curr)); + } + + mutex_lock(&lock); + list_for_each_entry(req, &req_list, list) { + if (nlh->nlmsg_seq != req->seq) + continue; + /* We set the DGID part, the rest was set earlier */ + rdma_addr_set_dgid(req->addr, &gid); + req->status = 0; + found = 1; + break; + } + mutex_unlock(&lock); + + if (!found) + pr_info("Couldn't find request waiting for DGID: %pI6\n", + &gid); +} + +int ib_nl_handle_ip_res_resp(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk)) + return -EPERM; + + 
if (ib_nl_is_good_ip_resp(nlh)) + ib_nl_process_good_ip_rsep(nlh); + + return skb->len; +} + +static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, + const void *daddr, + u32 seq, u16 family) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + struct rdma_ls_ip_resolve_header *header; + void *data; + size_t size; + int attrtype; + int len; + + if (family == AF_INET) { + size = sizeof(struct in_addr); + attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4; + } else { + size = sizeof(struct in6_addr); + attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6; + } + + len = nla_total_size(sizeof(size)); + len += NLMSG_ALIGN(sizeof(*header)); + + skb = nlmsg_new(len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS, + RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST); + if (!data) { + nlmsg_free(skb); + return -ENODATA; + } + + /* Construct the family header first */ + header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + header->ifindex = dev_addr->bound_dev_if; + nla_put(skb, attrtype, size, daddr); + + /* Repair the nlmsg header length */ + nlmsg_end(skb, nlh); + rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL); + + /* Make the request retry, so when we get the response from userspace + * we will have something. + */ + return -ENODATA; +} + +int rdma_addr_size(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + case AF_INET6: + return sizeof(struct sockaddr_in6); + case AF_IB: + return sizeof(struct sockaddr_ib); + default: + return 0; + } +} +EXPORT_SYMBOL(rdma_addr_size); + +int rdma_addr_size_in6(struct sockaddr_in6 *addr) +{ + int ret = rdma_addr_size((struct sockaddr *) addr); + + return ret <= sizeof(*addr) ? ret : 0; +} +EXPORT_SYMBOL(rdma_addr_size_in6); + +int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) +{ + int ret = rdma_addr_size((struct sockaddr *) addr); + + return ret <= sizeof(*addr) ? 
ret : 0; +} +EXPORT_SYMBOL(rdma_addr_size_kss); + +static struct rdma_addr_client self; + +void rdma_addr_register_client(struct rdma_addr_client *client) +{ + atomic_set(&client->refcount, 1); + init_completion(&client->comp); +} +EXPORT_SYMBOL(rdma_addr_register_client); + +static inline void put_client(struct rdma_addr_client *client) +{ + if (atomic_dec_and_test(&client->refcount)) + complete(&client->comp); +} + +void rdma_addr_unregister_client(struct rdma_addr_client *client) +{ + put_client(client); + wait_for_completion(&client->comp); +} +EXPORT_SYMBOL(rdma_addr_unregister_client); + +void rdma_copy_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev, + const unsigned char *dst_dev_addr) +{ + dev_addr->dev_type = dev->type; + memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); + if (dst_dev_addr) + memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); + dev_addr->bound_dev_if = dev->ifindex; +} +EXPORT_SYMBOL(rdma_copy_addr); + +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr) +{ + struct net_device *dev; + + if (dev_addr->bound_dev_if) { + dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + if (!dev) + return -ENODEV; + rdma_copy_addr(dev_addr, dev, NULL); + dev_put(dev); + return 0; + } + + switch (addr->sa_family) { + case AF_INET: + dev = ip_dev_find(dev_addr->net, + ((const struct sockaddr_in *)addr)->sin_addr.s_addr); + + if (!dev) + return -EADDRNOTAVAIL; + + rdma_copy_addr(dev_addr, dev, NULL); + dev_put(dev); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + rcu_read_lock(); + for_each_netdev_rcu(dev_addr->net, dev) { + if (ipv6_chk_addr(dev_addr->net, + &((const struct sockaddr_in6 *)addr)->sin6_addr, + dev, 1)) { + rdma_copy_addr(dev_addr, dev, NULL); + break; + } + } + rcu_read_unlock(); + break; +#endif + } + return 0; +} +EXPORT_SYMBOL(rdma_translate_ip); + +static void set_timeout(struct delayed_work *delayed_work, unsigned long time) +{ + unsigned long delay; + + delay = time - jiffies; + if ((long)delay < 0) + delay = 0; + + mod_delayed_work(addr_wq, delayed_work, delay); +} + +static void queue_req(struct addr_req *req) +{ + struct addr_req *temp_req; + + mutex_lock(&lock); + list_for_each_entry_reverse(temp_req, &req_list, list) { + if (time_after_eq(req->timeout, temp_req->timeout)) + break; + } + + list_add(&req->list, &temp_req->list); + + set_timeout(&req->work, req->timeout); + mutex_unlock(&lock); +} + +static int ib_nl_fetch_ha(const struct dst_entry *dst, + struct rdma_dev_addr *dev_addr, + const void *daddr, u32 seq, u16 family) +{ + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) + return -EADDRNOTAVAIL; + + /* We fill in what we can, the response will fill the rest */ + rdma_copy_addr(dev_addr, dst->dev, NULL); + return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); +} + +static int dst_fetch_ha(const struct dst_entry *dst, + struct rdma_dev_addr *dev_addr, + const void *daddr) +{ + struct neighbour *n; + int ret = 0; + + n = dst_neigh_lookup(dst, daddr); + + rcu_read_lock(); + if (!n || !(n->nud_state & NUD_VALID)) { + if (n) + neigh_event_send(n, NULL); + ret = -ENODATA; + } else { + rdma_copy_addr(dev_addr, dst->dev, n->ha); + } + rcu_read_unlock(); + + if (n) + neigh_release(n); + + return ret; +} + +static bool has_gateway(const struct dst_entry *dst, sa_family_t family) +{ + struct rtable *rt; + struct rt6_info *rt6; + + if (family == AF_INET) { + rt = container_of(dst, struct rtable, dst); + return 
rt->rt_uses_gateway; + } + + rt6 = container_of(dst, struct rt6_info, dst); + return rt6->rt6i_flags & RTF_GATEWAY; +} + +static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const struct sockaddr *dst_in, u32 seq) +{ + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + const void *daddr = (dst_in->sa_family == AF_INET) ? + (const void *)&dst_in4->sin_addr.s_addr : + (const void *)&dst_in6->sin6_addr; + sa_family_t family = dst_in->sa_family; + + /* Gateway + ARPHRD_INFINIBAND -> IB router */ + if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND) + return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family); + else + return dst_fetch_ha(dst, dev_addr, daddr); +} + +static int addr4_resolve(struct sockaddr_in *src_in, + const struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr, + struct rtable **prt) +{ + __be32 src_ip = src_in->sin_addr.s_addr; + __be32 dst_ip = dst_in->sin_addr.s_addr; + struct rtable *rt; + struct flowi4 fl4; + int ret; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = dst_ip; + fl4.saddr = src_ip; + fl4.flowi4_oif = addr->bound_dev_if; + rt = ip_route_output_key(addr->net, &fl4); + ret = PTR_ERR_OR_ZERO(rt); + if (ret) + return ret; + + src_in->sin_family = AF_INET; + src_in->sin_addr.s_addr = fl4.saddr; + + /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're + * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network + * type accordingly. + */ + if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND) + addr->network = RDMA_NETWORK_IPV4; + + addr->hoplimit = ip4_dst_hoplimit(&rt->dst); + + *prt = rt; + return 0; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int addr6_resolve(struct sockaddr_in6 *src_in, + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) +{ + struct flowi6 fl6; + struct dst_entry *dst; + struct rt6_info *rt; + int ret; + + memset(&fl6, 0, sizeof fl6); + fl6.daddr = dst_in->sin6_addr; + fl6.saddr = src_in->sin6_addr; + fl6.flowi6_oif = addr->bound_dev_if; + + ret = ipv6_stub->ipv6_dst_lookup(addr->net, NULL, &dst, &fl6); + if (ret < 0) + return ret; + + rt = (struct rt6_info *)dst; + if (ipv6_addr_any(&src_in->sin6_addr)) { + src_in->sin6_family = AF_INET6; + src_in->sin6_addr = fl6.saddr; + } + + /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're + * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network + * type accordingly. 
+ */ + if (rt->rt6i_flags & RTF_GATEWAY && + ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND) + addr->network = RDMA_NETWORK_IPV6; + + addr->hoplimit = ip6_dst_hoplimit(dst); + + *pdst = dst; + return 0; +} +#else +static int addr6_resolve(struct sockaddr_in6 *src_in, + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) +{ + return -EADDRNOTAVAIL; +} +#endif + +static int addr_resolve_neigh(const struct dst_entry *dst, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr, + u32 seq) +{ + if (dst->dev->flags & IFF_LOOPBACK) { + int ret; + + ret = rdma_translate_ip(dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, + MAX_ADDR_LEN); + + return ret; + } + + /* If the device doesn't do ARP internally */ + if (!(dst->dev->flags & IFF_NOARP)) + return fetch_ha(dst, addr, dst_in, seq); + + rdma_copy_addr(addr, dst->dev, NULL); + + return 0; +} + +static int addr_resolve(struct sockaddr *src_in, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr, + bool resolve_neigh, + u32 seq) +{ + struct net_device *ndev; + struct dst_entry *dst; + int ret; + + if (!addr->net) { + pr_warn_ratelimited("%s: missing namespace\n", __func__); + return -EINVAL; + } + + if (src_in->sa_family == AF_INET) { + struct rtable *rt = NULL; + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + + ret = addr4_resolve((struct sockaddr_in *)src_in, + dst_in4, addr, &rt); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); + + if (addr->bound_dev_if) { + ndev = dev_get_by_index(addr->net, addr->bound_dev_if); + } else { + ndev = rt->dst.dev; + dev_hold(ndev); + } + + ip_rt_put(rt); + } else { + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + + ret = addr6_resolve((struct sockaddr_in6 *)src_in, + dst_in6, addr, + &dst); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(dst, dst_in, addr, seq); + + if (addr->bound_dev_if) { + ndev = dev_get_by_index(addr->net, addr->bound_dev_if); + } else { + ndev = dst->dev; + dev_hold(ndev); + } + + dst_release(dst); + } + + if (ndev) { + if (ndev->flags & IFF_LOOPBACK) + ret = rdma_translate_ip(dst_in, addr); + else + addr->bound_dev_if = ndev->ifindex; + dev_put(ndev); + } + + return ret; +} + +static void process_one_req(struct work_struct *_work) +{ + struct addr_req *req; + struct sockaddr *src_in, *dst_in; + + mutex_lock(&lock); + req = container_of(_work, struct addr_req, work.work); + + if (req->status == -ENODATA) { + src_in = (struct sockaddr *)&req->src_addr; + dst_in = (struct sockaddr *)&req->dst_addr; + req->status = addr_resolve(src_in, dst_in, req->addr, + true, req->seq); + if (req->status && time_after_eq(jiffies, req->timeout)) { + req->status = -ETIMEDOUT; + } else if (req->status == -ENODATA) { + /* requeue the work for retrying again */ + set_timeout(&req->work, req->timeout); + mutex_unlock(&lock); + return; + } + } + list_del(&req->list); + mutex_unlock(&lock); + + /* + * Although the work will normally have been canceled by the + * workqueue, it can still be requeued as long as it is on the + * req_list, so it could have been requeued before we grabbed &lock. + * We need to cancel it after it is removed from req_list to really be + * sure it is safe to free. 
+ */ + cancel_delayed_work(&req->work); + + req->callback(req->status, (struct sockaddr *)&req->src_addr, + req->addr, req->context); + put_client(req->client); + kfree(req); +} + +static void process_req(struct work_struct *work) +{ + struct addr_req *req, *temp_req; + struct sockaddr *src_in, *dst_in; + struct list_head done_list; + + INIT_LIST_HEAD(&done_list); + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->status == -ENODATA) { + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + req->status = addr_resolve(src_in, dst_in, req->addr, + true, req->seq); + if (req->status && time_after_eq(jiffies, req->timeout)) + req->status = -ETIMEDOUT; + else if (req->status == -ENODATA) { + set_timeout(&req->work, req->timeout); + continue; + } + } + list_move_tail(&req->list, &done_list); + } + + mutex_unlock(&lock); + + list_for_each_entry_safe(req, temp_req, &done_list, list) { + list_del(&req->list); + /* It is safe to cancel other work items from this work item + * because at a time there can be only one work item running + * with this single threaded work queue. + */ + cancel_delayed_work(&req->work); + req->callback(req->status, (struct sockaddr *) &req->src_addr, + req->addr, req->context); + put_client(req->client); + kfree(req); + } +} + +int rdma_resolve_ip(struct rdma_addr_client *client, + struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context) +{ + struct sockaddr *src_in, *dst_in; + struct addr_req *req; + int ret = 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) { + ret = -EINVAL; + goto err; + } + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr)); + req->addr = addr; + req->callback = callback; + req->context = context; + req->client = client; + atomic_inc(&client->refcount); + INIT_DELAYED_WORK(&req->work, process_one_req); + req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); + + req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); + switch (req->status) { + case 0: + req->timeout = jiffies; + queue_req(req); + break; + case -ENODATA: + req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; + queue_req(req); + break; + default: + ret = req->status; + atomic_dec(&client->refcount); + goto err; + } + return ret; +err: + kfree(req); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ip); + +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr) +{ + struct sockaddr_storage ssrc_addr = {}; + struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) + return -EINVAL; + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + return addr_resolve(src_in, dst_addr, addr, false, 0); +} + +void rdma_addr_cancel(struct rdma_dev_addr *addr) +{ + struct addr_req *req, *temp_req; + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->addr == addr) { + req->status = -ECANCELED; + req->timeout = 
jiffies; + list_move(&req->list, &req_list); + set_timeout(&req->work, req->timeout); + break; + } + } + mutex_unlock(&lock); +} +EXPORT_SYMBOL(rdma_addr_cancel); + +struct resolve_cb_context { + struct completion comp; + int status; +}; + +static void resolve_cb(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context) +{ + ((struct resolve_cb_context *)context)->status = status; + complete(&((struct resolve_cb_context *)context)->comp); +} + +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, const struct net_device *ndev, + int *hoplimit) +{ + struct rdma_dev_addr dev_addr; + struct resolve_cb_context ctx; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + int ret; + + rdma_gid2ip(&sgid_addr._sockaddr, sgid); + rdma_gid2ip(&dgid_addr._sockaddr, dgid); + + memset(&dev_addr, 0, sizeof(dev_addr)); + dev_addr.bound_dev_if = ndev->ifindex; + dev_addr.net = &init_net; + + init_completion(&ctx.comp); + ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, + &dev_addr, 1000, resolve_cb, &ctx); + if (ret) + return ret; + + wait_for_completion(&ctx.comp); + + ret = ctx.status; + if (ret) + return ret; + + memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); + *hoplimit = dev_addr.hoplimit; + return 0; +} + +static int netevent_callback(struct notifier_block *self, unsigned long event, + void *ctx) +{ + if (event == NETEVENT_NEIGH_UPDATE) { + struct neighbour *neigh = ctx; + + if (neigh->nud_state & NUD_VALID) + set_timeout(&work, jiffies); + } + return 0; +} + +static struct notifier_block nb = { + .notifier_call = netevent_callback +}; + +int addr_init(void) +{ + addr_wq = alloc_ordered_workqueue("ib_addr", 0); + if (!addr_wq) + return -ENOMEM; + + register_netevent_notifier(&nb); + rdma_addr_register_client(&self); + + return 0; +} + +void addr_cleanup(void) +{ + rdma_addr_unregister_client(&self); + unregister_netevent_notifier(&nb); + destroy_workqueue(addr_wq); +} diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c new file mode 100644 index 0000000..324ef85 --- /dev/null +++ b/drivers/infiniband/core/agent.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
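[Editor's note] The rdma_resolve_ip() interface added in drivers/infiniband/core/addr.c above is asynchronous: the caller registers an rdma_addr_client and supplies a callback that runs from the ib_addr workqueue. rdma_addr_find_l2_eth_by_grh() in the same file already shows the synchronous completion-based pattern; the sketch below shows the plain callback pattern for a hypothetical caller (the example_* names and the 1000 ms timeout are illustrative assumptions).

#include <rdma/ib_addr.h>

static struct rdma_addr_client example_client;

/* Invoked from the ib_addr workqueue once resolution completes or times out. */
static void example_resolved(int status, struct sockaddr *src_addr,
			     struct rdma_dev_addr *addr, void *context)
{
	if (status)
		pr_warn("address resolution failed: %d\n", status);
	else
		pr_info("resolved, bound to ifindex %d\n", addr->bound_dev_if);
}

static int example_resolve(struct sockaddr *dst, struct rdma_dev_addr *addr)
{
	/* The caller must have set addr->net (e.g. &init_net) beforehand. */
	rdma_addr_register_client(&example_client);
	return rdma_resolve_ip(&example_client, NULL /* pick src */, dst, addr,
			       1000 /* ms */, example_resolved, NULL);
}

/* On teardown: rdma_addr_unregister_client(&example_client); */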
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "agent.h" +#include "smi.h" +#include "mad_priv.h" + +#define SPFX "ib_agent: " + +struct ib_agent_port_private { + struct list_head port_list; + struct ib_mad_agent *agent[2]; +}; + +static DEFINE_SPINLOCK(ib_agent_port_list_lock); +static LIST_HEAD(ib_agent_port_list); + +static struct ib_agent_port_private * +__ib_get_agent_port(const struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *entry; + + list_for_each_entry(entry, &ib_agent_port_list, port_list) { + if (entry->agent[1]->device == device && + entry->agent[1]->port_num == port_num) + return entry; + } + return NULL; +} + +static struct ib_agent_port_private * +ib_get_agent_port(const struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *entry; + unsigned long flags; + + spin_lock_irqsave(&ib_agent_port_list_lock, flags); + entry = __ib_get_agent_port(device, port_num); + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + return entry; +} + +void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, + const struct ib_wc *wc, const struct ib_device *device, + int port_num, int qpn, size_t resp_mad_len, bool opa) +{ + struct ib_agent_port_private *port_priv; + struct ib_mad_agent *agent; + struct ib_mad_send_buf *send_buf; + struct ib_ah *ah; + struct ib_mad_send_wr_private *mad_send_wr; + + if (rdma_cap_ib_switch(device)) + port_priv = ib_get_agent_port(device, 0); + else + port_priv = ib_get_agent_port(device, port_num); + + if (!port_priv) { + dev_err(&device->dev, "Unable to find port agent\n"); + return; + } + + agent = port_priv->agent[qpn]; + ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); + if (IS_ERR(ah)) { + dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", + PTR_ERR(ah)); + return; + } + + if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION) + resp_mad_len = IB_MGMT_MAD_SIZE; + + send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0, + IB_MGMT_MAD_HDR, + resp_mad_len - IB_MGMT_MAD_HDR, + GFP_KERNEL, + mad_hdr->base_version); + if (IS_ERR(send_buf)) { + dev_err(&device->dev, "ib_create_send_mad error\n"); + goto err1; + } + + memcpy(send_buf->mad, mad_hdr, resp_mad_len); + send_buf->ah = ah; + + if (rdma_cap_ib_switch(device)) { + mad_send_wr = container_of(send_buf, + struct ib_mad_send_wr_private, + send_buf); + mad_send_wr->send_wr.port_num = port_num; + } + + if (ib_post_send_mad(send_buf, NULL)) { + dev_err(&device->dev, "ib_post_send_mad error\n"); + goto err2; + } + return; +err2: + ib_free_send_mad(send_buf); +err1: + rdma_destroy_ah(ah); +} + +static void agent_send_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_send_wc) +{ + rdma_destroy_ah(mad_send_wc->send_buf->ah); + ib_free_send_mad(mad_send_wc->send_buf); +} + +int ib_agent_port_open(struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *port_priv; + unsigned long flags; + int ret; + + /* Create new device info */ + port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); + if 
(!port_priv) { + ret = -ENOMEM; + goto error1; + } + + if (rdma_cap_ib_smi(device, port_num)) { + /* Obtain send only MAD agent for SMI QP */ + port_priv->agent[0] = ib_register_mad_agent(device, port_num, + IB_QPT_SMI, NULL, 0, + &agent_send_handler, + NULL, NULL, 0); + if (IS_ERR(port_priv->agent[0])) { + ret = PTR_ERR(port_priv->agent[0]); + goto error2; + } + } + + /* Obtain send only MAD agent for GSI QP */ + port_priv->agent[1] = ib_register_mad_agent(device, port_num, + IB_QPT_GSI, NULL, 0, + &agent_send_handler, + NULL, NULL, 0); + if (IS_ERR(port_priv->agent[1])) { + ret = PTR_ERR(port_priv->agent[1]); + goto error3; + } + + spin_lock_irqsave(&ib_agent_port_list_lock, flags); + list_add_tail(&port_priv->port_list, &ib_agent_port_list); + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + + return 0; + +error3: + if (port_priv->agent[0]) + ib_unregister_mad_agent(port_priv->agent[0]); +error2: + kfree(port_priv); +error1: + return ret; +} + +int ib_agent_port_close(struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *port_priv; + unsigned long flags; + + spin_lock_irqsave(&ib_agent_port_list_lock, flags); + port_priv = __ib_get_agent_port(device, port_num); + if (port_priv == NULL) { + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + dev_err(&device->dev, "Port %d not found\n", port_num); + return -ENODEV; + } + list_del(&port_priv->port_list); + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + + ib_unregister_mad_agent(port_priv->agent[1]); + if (port_priv->agent[0]) + ib_unregister_mad_agent(port_priv->agent[0]); + + kfree(port_priv); + return 0; +} diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h new file mode 100644 index 0000000..65f92be --- /dev/null +++ b/drivers/infiniband/core/agent.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
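[Editor's note] ib_agent_port_open() and ib_agent_port_close() above create and tear down the send-only SMI/GSI agents for one port; in this patch they are driven per port by the MAD layer during device registration. The loop below is a hedged sketch of that open-all/unwind-on-error pattern for a hypothetical caller, not a verbatim excerpt of mad.c.

#include <rdma/ib_verbs.h>

#include "agent.h"

/* Hypothetical: open agents on every port of @device, unwinding on failure. */
static int example_open_all_ports(struct ib_device *device)
{
	int port, ret;

	for (port = rdma_start_port(device); port <= rdma_end_port(device);
	     port++) {
		ret = ib_agent_port_open(device, port);
		if (ret)
			goto unwind;
	}
	return 0;

unwind:
	while (--port >= rdma_start_port(device))
		ib_agent_port_close(device, port);
	return ret;
}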
+ */ + +#ifndef __AGENT_H_ +#define __AGENT_H_ + +#include +#include + +extern int ib_agent_port_open(struct ib_device *device, int port_num); + +extern int ib_agent_port_close(struct ib_device *device, int port_num); + +extern void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, + const struct ib_wc *wc, const struct ib_device *device, + int port_num, int qpn, size_t resp_mad_len, bool opa); + +#endif /* __AGENT_H_ */ diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c new file mode 100644 index 0000000..ecc55e9 --- /dev/null +++ b/drivers/infiniband/core/cache.c @@ -0,0 +1,1311 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "core_priv.h" + +struct ib_pkey_cache { + int table_len; + u16 table[0]; +}; + +struct ib_update_work { + struct work_struct work; + struct ib_device *device; + u8 port_num; + bool enforce_security; +}; + +union ib_gid zgid; +EXPORT_SYMBOL(zgid); + +enum gid_attr_find_mask { + GID_ATTR_FIND_MASK_GID = 1UL << 0, + GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, + GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, + GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, +}; + +enum gid_table_entry_props { + GID_TABLE_ENTRY_INVALID = 1UL << 0, + GID_TABLE_ENTRY_DEFAULT = 1UL << 1, +}; + +struct ib_gid_table_entry { + unsigned long props; + union ib_gid gid; + struct ib_gid_attr attr; + void *context; +}; + +struct ib_gid_table { + int sz; + /* In RoCE, adding a GID to the table requires: + * (a) Find if this GID is already exists. + * (b) Find a free space. + * (c) Write the new GID + * + * Delete requires different set of operations: + * (a) Find the GID + * (b) Delete it. + * + **/ + /* Any writer to data_vec must hold this lock and the write side of + * rwlock. readers must hold only rwlock. All writers must be in a + * sleepable context. + */ + struct mutex lock; + /* rwlock protects data_vec[ix]->props. 
*/ + rwlock_t rwlock; + struct ib_gid_table_entry *data_vec; +}; + +static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) +{ + struct ib_event event; + + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; + + ib_dispatch_event(&event); +} + +static const char * const gid_type_str[] = { + [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", +}; + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) +{ + if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) + return gid_type_str[gid_type]; + + return "Invalid GID type"; +} +EXPORT_SYMBOL(ib_cache_gid_type_str); + +int ib_cache_gid_parse_type_str(const char *buf) +{ + unsigned int i; + size_t len; + int err = -EINVAL; + + len = strlen(buf); + if (len == 0) + return -EINVAL; + + if (buf[len - 1] == '\n') + len--; + + for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) + if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && + len == strlen(gid_type_str[i])) { + err = i; + break; + } + + return err; +} +EXPORT_SYMBOL(ib_cache_gid_parse_type_str); + +static void del_roce_gid(struct ib_device *device, u8 port_num, + struct ib_gid_table *table, int ix) +{ + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, + device->name, port_num, ix, + table->data_vec[ix].gid.raw); + + if (rdma_cap_roce_gid_table(device, port_num)) + device->del_gid(&table->data_vec[ix].attr, + &table->data_vec[ix].context); + dev_put(table->data_vec[ix].attr.ndev); +} + +static int add_roce_gid(struct ib_gid_table *table, + const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry; + int ix = attr->index; + int ret = 0; + + if (!attr->ndev) { + pr_err("%s NULL netdev device=%s port=%d index=%d\n", + __func__, attr->device->name, attr->port_num, + attr->index); + return -EINVAL; + } + + entry = &table->data_vec[ix]; + if ((entry->props & GID_TABLE_ENTRY_INVALID) == 0) { + WARN(1, "GID table corruption device=%s port=%d index=%d\n", + attr->device->name, attr->port_num, + attr->index); + return -EINVAL; + } + + if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { + ret = attr->device->add_gid(gid, attr, &entry->context); + if (ret) { + pr_err("%s GID add failed device=%s port=%d index=%d\n", + __func__, attr->device->name, attr->port_num, + attr->index); + goto add_err; + } + } + dev_hold(attr->ndev); + +add_err: + if (!ret) + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, + attr->device->name, attr->port_num, ix, gid->raw); + return ret; +} + +/** + * add_modify_gid - Add or modify GID table entry + * + * @table: GID table in which GID to be added or modified + * @gid: GID content + * @attr: Attributes of the GID + * + * Returns 0 on success or appropriate error code. It accepts zero + * GID addition for non RoCE ports for HCA's who report them as valid + * GID. However such zero GIDs are not added to the cache. + */ +static int add_modify_gid(struct ib_gid_table *table, + const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + int ret; + + if (rdma_protocol_roce(attr->device, attr->port_num)) { + ret = add_roce_gid(table, gid, attr); + if (ret) + return ret; + } else { + /* + * Some HCA's report multiple GID entries with only one + * valid GID, but remaining as zero GID. + * So ignore such behavior for IB link layer and don't + * fail the call, but don't add such entry to GID cache. 
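[Editor's note] ib_cache_gid_parse_type_str() above is the inverse of ib_cache_gid_type_str(): it maps a user-supplied string (with an optional trailing newline) back to an enum ib_gid_type and returns -EINVAL for anything it does not recognize. Below is a minimal sketch of a caller, along the lines of the CM configfs support added elsewhere in this patch; the example_ function name and the comment's sample strings are illustrative assumptions.

#include <rdma/ib_cache.h>

/* Hypothetical: translate a string written by the user into a GID type. */
static int example_parse_gid_type(const char *buf)
{
	int gid_type = ib_cache_gid_parse_type_str(buf);

	if (gid_type < 0)
		return gid_type;	/* -EINVAL: unknown GID type string */

	/* "IB/RoCE v1" -> IB_GID_TYPE_IB, "RoCE v2" -> IB_GID_TYPE_ROCE_UDP_ENCAP */
	return gid_type;
}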
+ */ + if (!memcmp(gid, &zgid, sizeof(*gid))) + return 0; + } + + lockdep_assert_held(&table->lock); + memcpy(&table->data_vec[attr->index].gid, gid, sizeof(*gid)); + memcpy(&table->data_vec[attr->index].attr, attr, sizeof(*attr)); + + write_lock_irq(&table->rwlock); + table->data_vec[attr->index].props &= ~GID_TABLE_ENTRY_INVALID; + write_unlock_irq(&table->rwlock); + return 0; +} + +/** + * del_gid - Delete GID table entry + * + * @ib_dev: IB device whose GID entry to be deleted + * @port: Port number of the IB device + * @table: GID table of the IB device for a port + * @ix: GID entry index to delete + * + */ +static void del_gid(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table, int ix) +{ + lockdep_assert_held(&table->lock); + write_lock_irq(&table->rwlock); + table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; + write_unlock_irq(&table->rwlock); + + if (rdma_protocol_roce(ib_dev, port)) + del_roce_gid(ib_dev, port, table, ix); + memcpy(&table->data_vec[ix].gid, &zgid, sizeof(zgid)); + memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr)); + table->data_vec[ix].context = NULL; +} + +/* rwlock should be read locked, or lock should be held */ +static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, + const struct ib_gid_attr *val, bool default_gid, + unsigned long mask, int *pempty) +{ + int i = 0; + int found = -1; + int empty = pempty ? -1 : 0; + + while (i < table->sz && (found < 0 || empty < 0)) { + struct ib_gid_table_entry *data = &table->data_vec[i]; + struct ib_gid_attr *attr = &data->attr; + int curr_index = i; + + i++; + + /* find_gid() is used during GID addition where it is expected + * to return a free entry slot which is not duplicate. + * Free entry slot is requested and returned if pempty is set, + * so lookup free slot only if requested. + */ + if (pempty && empty < 0) { + if (data->props & GID_TABLE_ENTRY_INVALID && + (default_gid == + !!(data->props & GID_TABLE_ENTRY_DEFAULT))) { + /* + * Found an invalid (free) entry; allocate it. + * If default GID is requested, then our + * found slot must be one of the DEFAULT + * reserved slots or we fail. + * This ensures that only DEFAULT reserved + * slots are used for default property GIDs. + */ + empty = curr_index; + } + } + + /* + * Additionally find_gid() is used to find valid entry during + * lookup operation, where validity needs to be checked. So + * find the empty entry first to continue to search for a free + * slot and ignore its INVALID flag. 
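[Editor's note] add_modify_gid() and del_gid() above follow the locking rules documented on struct ib_gid_table: a writer runs in sleepable context holding table->lock and takes the write side of rwlock only around the props update, while readers take just the rwlock and may run in atomic context. The function below is a schematic illustration of those two paths, an assumption-laden sketch rather than a verbatim excerpt of this file.

/* Schematic only: the expected writer and reader locking for a GID table. */
static void example_gid_table_locking(struct ib_gid_table *table, int ix)
{
	unsigned long flags;

	/* Writer: sleepable context, mutex first, rwlock only around props. */
	mutex_lock(&table->lock);
	write_lock_irq(&table->rwlock);
	table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
	write_unlock_irq(&table->rwlock);
	mutex_unlock(&table->lock);

	/* Reader: the rwlock alone is enough, even where sleeping is not allowed. */
	read_lock_irqsave(&table->rwlock, flags);
	if (table->data_vec[ix].props & GID_TABLE_ENTRY_INVALID)
		pr_debug("GID entry %d is free\n", ix);
	read_unlock_irqrestore(&table->rwlock, flags);
}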
+ */ + if (data->props & GID_TABLE_ENTRY_INVALID) + continue; + + if (found >= 0) + continue; + + if (mask & GID_ATTR_FIND_MASK_GID_TYPE && + attr->gid_type != val->gid_type) + continue; + + if (mask & GID_ATTR_FIND_MASK_GID && + memcmp(gid, &data->gid, sizeof(*gid))) + continue; + + if (mask & GID_ATTR_FIND_MASK_NETDEV && + attr->ndev != val->ndev) + continue; + + if (mask & GID_ATTR_FIND_MASK_DEFAULT && + !!(data->props & GID_TABLE_ENTRY_DEFAULT) != + default_gid) + continue; + + found = curr_index; + } + + if (pempty) + *pempty = empty; + + return found; +} + +static void make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + addrconf_ifid_eui48(&gid->raw[8], dev); +} + +static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr, + unsigned long mask, bool default_gid) +{ + struct ib_gid_table *table; + int ret = 0; + int empty; + int ix; + + /* Do not allow adding zero GID in support of + * IB spec version 1.3 section 4.1.1 point (6) and + * section 12.7.10 and section 12.7.20 + */ + if (!memcmp(gid, &zgid, sizeof(*gid))) + return -EINVAL; + + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + + mutex_lock(&table->lock); + + ix = find_gid(table, gid, attr, default_gid, mask, &empty); + if (ix >= 0) + goto out_unlock; + + if (empty < 0) { + ret = -ENOSPC; + goto out_unlock; + } + attr->device = ib_dev; + attr->index = empty; + attr->port_num = port; + ret = add_modify_gid(table, gid, attr); + if (!ret) + dispatch_gid_change_event(ib_dev, port); + +out_unlock: + mutex_unlock(&table->lock); + if (ret) + pr_warn("%s: unable to add gid %pI6 error=%d\n", + __func__, gid->raw, ret); + return ret; +} + +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct net_device *idev; + unsigned long mask; + int ret; + + if (ib_dev->get_netdev) { + idev = ib_dev->get_netdev(ib_dev, port); + if (idev && attr->ndev != idev) { + union ib_gid default_gid; + + /* Adding default GIDs in not permitted */ + make_default_gid(idev, &default_gid); + if (!memcmp(gid, &default_gid, sizeof(*gid))) { + dev_put(idev); + return -EPERM; + } + } + if (idev) + dev_put(idev); + } + + mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV; + + ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); + return ret; +} + +static int +_ib_cache_gid_del(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr, + unsigned long mask, bool default_gid) +{ + struct ib_gid_table *table; + int ret = 0; + int ix; + + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + + mutex_lock(&table->lock); + + ix = find_gid(table, gid, attr, default_gid, mask, NULL); + if (ix < 0) { + ret = -EINVAL; + goto out_unlock; + } + + del_gid(ib_dev, port, table, ix); + dispatch_gid_change_event(ib_dev, port); + +out_unlock: + mutex_unlock(&table->lock); + if (ret) + pr_debug("%s: can't delete gid %pI6 error=%d\n", + __func__, gid->raw, ret); + return ret; +} + +int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT | + GID_ATTR_FIND_MASK_NETDEV; + + return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false); +} + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *ndev) +{ 
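+ /*
+ * Editorial note (descriptive, not part of the original patch): going by
+ * the name and the loop below, this is the bulk-removal path used when a
+ * net_device goes away; every entry whose attr.ndev matches is deleted,
+ * and a single GID_CHANGE event is dispatched at the end if anything was
+ * removed.
+ */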
+ struct ib_gid_table *table; + int ix; + bool deleted = false; + + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + + mutex_lock(&table->lock); + + for (ix = 0; ix < table->sz; ix++) { + if (table->data_vec[ix].attr.ndev == ndev) { + del_gid(ib_dev, port, table, ix); + deleted = true; + } + } + + mutex_unlock(&table->lock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); + + return 0; +} + +static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct ib_gid_table *table; + + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + + if (index < 0 || index >= table->sz) + return -EINVAL; + + if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) + return -EINVAL; + + memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); + if (attr) { + memcpy(attr, &table->data_vec[index].attr, sizeof(*attr)); + if (attr->ndev) + dev_hold(attr->ndev); + } + + return 0; +} + +static int _ib_cache_gid_table_find(struct ib_device *ib_dev, + const union ib_gid *gid, + const struct ib_gid_attr *val, + unsigned long mask, + u8 *port, u16 *index) +{ + struct ib_gid_table *table; + u8 p; + int local_index; + unsigned long flags; + + for (p = 0; p < ib_dev->phys_port_cnt; p++) { + table = ib_dev->cache.ports[p].gid; + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, val, false, mask, NULL); + if (local_index >= 0) { + if (index) + *index = local_index; + if (port) + *port = p + rdma_start_port(ib_dev); + read_unlock_irqrestore(&table->rwlock, flags); + return 0; + } + read_unlock_irqrestore(&table->rwlock, flags); + } + + return -ENOENT; +} + +static int ib_cache_gid_find(struct ib_device *ib_dev, + const union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev, u8 *port, + u16 *index) +{ + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, + mask, port, index); +} + +/** + * ib_find_cached_gid_by_port - Returns the GID table index where a specified + * GID value occurs. It searches for the specified GID value in the local + * software cache. + * @device: The device to query. + * @gid: The GID value to search for. + * @gid_type: The GID type to search for. + * @port_num: The port number of the device where the GID value should be + * searched. + * @ndev: In RoCE, the net device of the device. Null means ignore. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. 
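+ *
+ * Example (illustrative sketch only, not part of the original patch; "dev"
+ * is an assumed valid struct ib_device pointer, "gid" an already populated
+ * union ib_gid, and port 1 is assumed to exist):
+ *
+ *     u16 gid_index;
+ *
+ *     if (!ib_find_cached_gid_by_port(dev, &gid, IB_GID_TYPE_IB, 1,
+ *                                     NULL, &gid_index))
+ *             pr_debug("GID cached at index %u on port 1\n", gid_index);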
+ */ +int ib_find_cached_gid_by_port(struct ib_device *ib_dev, + const union ib_gid *gid, + enum ib_gid_type gid_type, + u8 port, struct net_device *ndev, + u16 *index) +{ + int local_index; + struct ib_gid_table *table; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; + unsigned long flags; + + if (!rdma_is_port_valid(ib_dev, port)) + return -ENOENT; + + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, &val, false, mask, NULL); + if (local_index >= 0) { + if (index) + *index = local_index; + read_unlock_irqrestore(&table->rwlock, flags); + return 0; + } + + read_unlock_irqrestore(&table->rwlock, flags); + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_cached_gid_by_port); + +/** + * ib_cache_gid_find_by_filter - Returns the GID table index where a specified + * GID value occurs + * @device: The device to query. + * @gid: The GID value to search for. + * @port_num: The port number of the device where the GID value could be + * searched. + * @filter: The filter function is executed on any matching GID in the table. + * If the filter function returns true, the corresponding index is returned, + * otherwise, we continue searching the GID table. It's guaranteed that + * while filter is executed, ndev field is valid and the structure won't + * change. filter is executed in an atomic context. filter must not be NULL. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + * + * ib_cache_gid_find_by_filter() searches for the specified GID value + * of which the filter function returns true in the port's GID table. + * This function is only supported on RoCE ports. + * + */ +static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, + const union ib_gid *gid, + u8 port, + bool (*filter)(const union ib_gid *, + const struct ib_gid_attr *, + void *), + void *context, + u16 *index) +{ + struct ib_gid_table *table; + unsigned int i; + unsigned long flags; + bool found = false; + + + if (!rdma_is_port_valid(ib_dev, port) || + !rdma_protocol_roce(ib_dev, port)) + return -EPROTONOSUPPORT; + + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + + read_lock_irqsave(&table->rwlock, flags); + for (i = 0; i < table->sz; i++) { + struct ib_gid_attr attr; + + if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) + continue; + + if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) + continue; + + memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); + + if (filter(gid, &attr, context)) { + found = true; + if (index) + *index = i; + break; + } + } + read_unlock_irqrestore(&table->rwlock, flags); + + if (!found) + return -ENOENT; + return 0; +} + +static struct ib_gid_table *alloc_gid_table(int sz) +{ + struct ib_gid_table *table = + kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); + int i; + + if (!table) + return NULL; + + table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); + if (!table->data_vec) + goto err_free_table; + + mutex_init(&table->lock); + + table->sz = sz; + rwlock_init(&table->rwlock); + + /* Mark all entries as invalid so that allocator can allocate + * one of the invalid (free) entry. 
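+ * (Editorial note: GID_TABLE_ENTRY_INVALID doubles as the "free slot"
+ * marker that find_gid() looks for when a caller passes pempty; the bit is
+ * cleared again only in add_modify_gid(), under the table's write lock.)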
+ */ + for (i = 0; i < sz; i++) + table->data_vec[i].props |= GID_TABLE_ENTRY_INVALID; + return table; + +err_free_table: + kfree(table); + return NULL; +} + +static void release_gid_table(struct ib_gid_table *table) +{ + if (table) { + kfree(table->data_vec); + kfree(table); + } +} + +static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table) +{ + int i; + bool deleted = false; + + if (!table) + return; + + mutex_lock(&table->lock); + for (i = 0; i < table->sz; ++i) { + if (memcmp(&table->data_vec[i].gid, &zgid, + sizeof(table->data_vec[i].gid))) { + del_gid(ib_dev, port, table, i); + deleted = true; + } + } + mutex_unlock(&table->lock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); +} + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, + struct net_device *ndev, + unsigned long gid_type_mask, + enum ib_cache_gid_default_mode mode) +{ + union ib_gid gid = { }; + struct ib_gid_attr gid_attr; + struct ib_gid_table *table; + unsigned int gid_type; + unsigned long mask; + + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + + mask = GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT | + GID_ATTR_FIND_MASK_NETDEV; + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + + for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { + if (1UL << gid_type & ~gid_type_mask) + continue; + + gid_attr.gid_type = gid_type; + + if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { + make_default_gid(ndev, &gid); + __ib_cache_gid_add(ib_dev, port, &gid, + &gid_attr, mask, true); + } else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) { + _ib_cache_gid_del(ib_dev, port, &gid, + &gid_attr, mask, true); + } + } +} + +static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table) +{ + unsigned int i; + unsigned long roce_gid_type_mask; + unsigned int num_default_gids; + unsigned int current_gid = 0; + + roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + num_default_gids = hweight_long(roce_gid_type_mask); + for (i = 0; i < num_default_gids && i < table->sz; i++) { + struct ib_gid_table_entry *entry = + &table->data_vec[i]; + + entry->props |= GID_TABLE_ENTRY_DEFAULT; + current_gid = find_next_bit(&roce_gid_type_mask, + BITS_PER_LONG, + current_gid); + entry->attr.gid_type = current_gid++; + } + + return 0; +} + +static int _gid_table_setup_one(struct ib_device *ib_dev) +{ + u8 port; + struct ib_gid_table *table; + int err = 0; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + u8 rdma_port = port + rdma_start_port(ib_dev); + + table = + alloc_gid_table( + ib_dev->port_immutable[rdma_port].gid_tbl_len); + if (!table) { + err = -ENOMEM; + goto rollback_table_setup; + } + + err = gid_table_reserve_default(ib_dev, + port + rdma_start_port(ib_dev), + table); + if (err) + goto rollback_table_setup; + ib_dev->cache.ports[port].gid = table; + } + + return 0; + +rollback_table_setup: + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + table = ib_dev->cache.ports[port].gid; + + cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), + table); + release_gid_table(table); + } + + return err; +} + +static void gid_table_release_one(struct ib_device *ib_dev) +{ + struct ib_gid_table *table; + u8 port; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + table = ib_dev->cache.ports[port].gid; + release_gid_table(table); + ib_dev->cache.ports[port].gid = NULL; + } +} + +static void gid_table_cleanup_one(struct ib_device *ib_dev) +{ + 
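+ /*
+ * Editorial note (descriptive only): unlike gid_table_release_one() above,
+ * which frees the tables, this pass merely empties them, so provider
+ * del_gid() callbacks run (where supported) and the netdev references
+ * taken in add_roce_gid() are dropped; the memory itself is freed
+ * separately by gid_table_release_one().
+ */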
struct ib_gid_table *table; + u8 port; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + table = ib_dev->cache.ports[port].gid; + cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), + table); + } +} + +static int gid_table_setup_one(struct ib_device *ib_dev) +{ + int err; + + err = _gid_table_setup_one(ib_dev); + + if (err) + return err; + + rdma_roce_rescan_device(ib_dev); + + return err; +} + +int ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid, + struct ib_gid_attr *gid_attr) +{ + int res; + unsigned long flags; + struct ib_gid_table *table; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + table = device->cache.ports[port_num - rdma_start_port(device)].gid; + read_lock_irqsave(&table->rwlock, flags); + res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_unlock_irqrestore(&table->rwlock, flags); + + return res; +} +EXPORT_SYMBOL(ib_get_cached_gid); + +/** + * ib_find_cached_gid - Returns the port number and GID table index where + * a specified GID value occurs. + * @device: The device to query. + * @gid: The GID value to search for. + * @gid_type: The GID type to search for. + * @ndev: In RoCE, the net device of the device. NULL means ignore. + * @port_num: The port number of the device where the GID value was found. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + * + * ib_find_cached_gid() searches for the specified GID value in + * the local software cache. + */ +int ib_find_cached_gid(struct ib_device *device, + const union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev, + u8 *port_num, + u16 *index) +{ + return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index); +} +EXPORT_SYMBOL(ib_find_cached_gid); + +int ib_find_gid_by_filter(struct ib_device *device, + const union ib_gid *gid, + u8 port_num, + bool (*filter)(const union ib_gid *gid, + const struct ib_gid_attr *, + void *), + void *context, u16 *index) +{ + /* Only RoCE GID table supports filter function */ + if (!rdma_protocol_roce(device, port_num) && filter) + return -EPROTONOSUPPORT; + + return ib_cache_gid_find_by_filter(device, gid, + port_num, filter, + context, index); +} + +int ib_get_cached_pkey(struct ib_device *device, + u8 port_num, + int index, + u16 *pkey) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int ret = 0; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + + if (index < 0 || index >= cache->table_len) + ret = -EINVAL; + else + *pkey = cache->table[index]; + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_pkey); + +int ib_get_cached_subnet_prefix(struct ib_device *device, + u8 port_num, + u64 *sn_pfx) +{ + unsigned long flags; + int p; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + p = port_num - rdma_start_port(device); + read_lock_irqsave(&device->cache.lock, flags); + *sn_pfx = device->cache.ports[p].subnet_prefix; + read_unlock_irqrestore(&device->cache.lock, flags); + + return 0; +} +EXPORT_SYMBOL(ib_get_cached_subnet_prefix); + +int ib_find_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + int partial_ix = -1; + + if (!rdma_is_port_valid(device, port_num)) + return 
-EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { + if (cache->table[i] & 0x8000) { + *index = i; + ret = 0; + break; + } else + partial_ix = i; + } + + if (ret && partial_ix >= 0) { + *index = partial_ix; + ret = 0; + } + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_cached_pkey); + +int ib_find_exact_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if (cache->table[i] == pkey) { + *index = i; + ret = 0; + break; + } + + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_exact_cached_pkey); + +int ib_get_cached_lmc(struct ib_device *device, + u8 port_num, + u8 *lmc) +{ + unsigned long flags; + int ret = 0; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + *lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc; + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_lmc); + +int ib_get_cached_port_state(struct ib_device *device, + u8 port_num, + enum ib_port_state *port_state) +{ + unsigned long flags; + int ret = 0; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + *port_state = device->cache.ports[port_num + - rdma_start_port(device)].port_state; + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_port_state); + +static int config_non_roce_gid_cache(struct ib_device *device, + u8 port, int gid_tbl_len) +{ + struct ib_gid_attr gid_attr = {}; + struct ib_gid_table *table; + union ib_gid gid; + int ret = 0; + int i; + + gid_attr.device = device; + gid_attr.port_num = port; + table = device->cache.ports[port - rdma_start_port(device)].gid; + + mutex_lock(&table->lock); + for (i = 0; i < gid_tbl_len; ++i) { + if (!device->query_gid) + continue; + ret = device->query_gid(device, port, i, &gid); + if (ret) { + pr_warn("query_gid failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + gid_attr.index = i; + add_modify_gid(table, &gid, &gid_attr); + } +err: + mutex_unlock(&table->lock); + return ret; +} + +static void ib_cache_update(struct ib_device *device, + u8 port, + bool enforce_security) +{ + struct ib_port_attr *tprops = NULL; + struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; + int i; + int ret; + struct ib_gid_table *table; + + if (!rdma_is_port_valid(device, port)) + return; + + table = device->cache.ports[port - rdma_start_port(device)].gid; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + return; + + ret = ib_query_port(device, port, tprops); + if (ret) { + pr_warn("ib_query_port failed (%d) for %s\n", + ret, device->name); + goto err; + } + + if (!rdma_protocol_roce(device, port)) { + ret = config_non_roce_gid_cache(device, port, + tprops->gid_tbl_len); + if (ret) + goto err; + } + + pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len * + 
sizeof *pkey_cache->table, GFP_KERNEL); + if (!pkey_cache) + goto err; + + pkey_cache->table_len = tprops->pkey_tbl_len; + + for (i = 0; i < pkey_cache->table_len; ++i) { + ret = ib_query_pkey(device, port, i, pkey_cache->table + i); + if (ret) { + pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + } + + write_lock_irq(&device->cache.lock); + + old_pkey_cache = device->cache.ports[port - + rdma_start_port(device)].pkey; + + device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache; + device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc; + device->cache.ports[port - rdma_start_port(device)].port_state = + tprops->state; + + device->cache.ports[port - rdma_start_port(device)].subnet_prefix = + tprops->subnet_prefix; + write_unlock_irq(&device->cache.lock); + + if (enforce_security) + ib_security_cache_change(device, + port, + tprops->subnet_prefix); + + kfree(old_pkey_cache); + kfree(tprops); + return; + +err: + kfree(pkey_cache); + kfree(tprops); +} + +static void ib_cache_task(struct work_struct *_work) +{ + struct ib_update_work *work = + container_of(_work, struct ib_update_work, work); + + ib_cache_update(work->device, + work->port_num, + work->enforce_security); + kfree(work); +} + +static void ib_cache_event(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct ib_update_work *work; + + if (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_GID_CHANGE) { + work = kmalloc(sizeof *work, GFP_ATOMIC); + if (work) { + INIT_WORK(&work->work, ib_cache_task); + work->device = event->device; + work->port_num = event->element.port_num; + if (event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_GID_CHANGE) + work->enforce_security = true; + else + work->enforce_security = false; + + queue_work(ib_wq, &work->work); + } + } +} + +int ib_cache_setup_one(struct ib_device *device) +{ + int p; + int err; + + rwlock_init(&device->cache.lock); + + device->cache.ports = + kzalloc(sizeof(*device->cache.ports) * + (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); + if (!device->cache.ports) + return -ENOMEM; + + err = gid_table_setup_one(device); + if (err) { + kfree(device->cache.ports); + device->cache.ports = NULL; + return err; + } + + for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) + ib_cache_update(device, p + rdma_start_port(device), true); + + INIT_IB_EVENT_HANDLER(&device->cache.event_handler, + device, ib_cache_event); + ib_register_event_handler(&device->cache.event_handler); + return 0; +} + +void ib_cache_release_one(struct ib_device *device) +{ + int p; + + /* + * The release function frees all the cache elements. + * This function should be called as part of freeing + * all the device's resources when the cache could no + * longer be accessed. + */ + for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) + kfree(device->cache.ports[p].pkey); + + gid_table_release_one(device); + kfree(device->cache.ports); +} + +void ib_cache_cleanup_one(struct ib_device *device) +{ + /* The cleanup function unregisters the event handler, + * waits for all in-progress workqueue elements and cleans + * up the GID cache. 
This function should be called after + * the device was removed from the devices list and all + * clients were removed, so the cache exists but is + * non-functional and shouldn't be updated anymore. + */ + ib_unregister_event_handler(&device->cache.event_handler); + flush_workqueue(ib_wq); + gid_table_cleanup_one(device); +} + +void __init ib_cache_setup(void) +{ + roce_gid_mgmt_init(); +} + +void __exit ib_cache_cleanup(void) +{ + roce_gid_mgmt_cleanup(); +} diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c new file mode 100644 index 0000000..126ac5f --- /dev/null +++ b/drivers/infiniband/core/cgroup.c @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2016 Parav Pandit + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include "core_priv.h" + +/** + * ib_device_register_rdmacg - register with rdma cgroup. + * @device: device to register to participate in resource + * accounting by rdma cgroup. + * + * Register with the rdma cgroup. Should be called before + * exposing rdma device to user space applications to avoid + * resource accounting leak. + * Returns 0 on success or otherwise failure code. + */ +int ib_device_register_rdmacg(struct ib_device *device) +{ + device->cg_device.name = device->name; + return rdmacg_register_device(&device->cg_device); +} + +/** + * ib_device_unregister_rdmacg - unregister with rdma cgroup. + * @device: device to unregister. + * + * Unregister with the rdma cgroup. Should be called after + * all the resources are deallocated, and after a stage when any + * other resource allocation by user application cannot be done + * for this device to avoid any leak in accounting. + */ +void ib_device_unregister_rdmacg(struct ib_device *device) +{ + rdmacg_unregister_device(&device->cg_device); +} + +int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ + return rdmacg_try_charge(&cg_obj->cg, &device->cg_device, + resource_index); +} +EXPORT_SYMBOL(ib_rdmacg_try_charge); + +void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ + rdmacg_uncharge(cg_obj->cg, &device->cg_device, + resource_index); +} +EXPORT_SYMBOL(ib_rdmacg_uncharge); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c new file mode 100644 index 0000000..a92e1a5 --- /dev/null +++ b/drivers/infiniband/core/cm.c @@ -0,0 +1,4507 @@ +/* + * Copyright (c) 2004-2007 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "cm_msgs.h" + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("InfiniBand CM"); +MODULE_LICENSE("Dual BSD/GPL"); + +static const char * const ibcm_rej_reason_strs[] = { + [IB_CM_REJ_NO_QP] = "no QP", + [IB_CM_REJ_NO_EEC] = "no EEC", + [IB_CM_REJ_NO_RESOURCES] = "no resources", + [IB_CM_REJ_TIMEOUT] = "timeout", + [IB_CM_REJ_UNSUPPORTED] = "unsupported", + [IB_CM_REJ_INVALID_COMM_ID] = "invalid comm ID", + [IB_CM_REJ_INVALID_COMM_INSTANCE] = "invalid comm instance", + [IB_CM_REJ_INVALID_SERVICE_ID] = "invalid service ID", + [IB_CM_REJ_INVALID_TRANSPORT_TYPE] = "invalid transport type", + [IB_CM_REJ_STALE_CONN] = "stale conn", + [IB_CM_REJ_RDC_NOT_EXIST] = "RDC not exist", + [IB_CM_REJ_INVALID_GID] = "invalid GID", + [IB_CM_REJ_INVALID_LID] = "invalid LID", + [IB_CM_REJ_INVALID_SL] = "invalid SL", + [IB_CM_REJ_INVALID_TRAFFIC_CLASS] = "invalid traffic class", + [IB_CM_REJ_INVALID_HOP_LIMIT] = "invalid hop limit", + [IB_CM_REJ_INVALID_PACKET_RATE] = "invalid packet rate", + [IB_CM_REJ_INVALID_ALT_GID] = "invalid alt GID", + [IB_CM_REJ_INVALID_ALT_LID] = "invalid alt LID", + [IB_CM_REJ_INVALID_ALT_SL] = "invalid alt SL", + [IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS] = "invalid alt traffic class", + [IB_CM_REJ_INVALID_ALT_HOP_LIMIT] = "invalid alt hop limit", + [IB_CM_REJ_INVALID_ALT_PACKET_RATE] = "invalid alt packet rate", + [IB_CM_REJ_PORT_CM_REDIRECT] = "port CM redirect", + [IB_CM_REJ_PORT_REDIRECT] = "port redirect", + [IB_CM_REJ_INVALID_MTU] = "invalid MTU", + [IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES] = "insufficient resp resources", + [IB_CM_REJ_CONSUMER_DEFINED] = "consumer defined", + [IB_CM_REJ_INVALID_RNR_RETRY] = "invalid RNR retry", + [IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID] = "duplicate local comm ID", + [IB_CM_REJ_INVALID_CLASS_VERSION] = "invalid class version", + [IB_CM_REJ_INVALID_FLOW_LABEL] = "invalid flow label", + [IB_CM_REJ_INVALID_ALT_FLOW_LABEL] = "invalid alt flow label", +}; + +const char *__attribute_const__ ibcm_reject_msg(int reason) +{ + size_t index = reason; + + if (index < ARRAY_SIZE(ibcm_rej_reason_strs) && + ibcm_rej_reason_strs[index]) + return 
ibcm_rej_reason_strs[index]; + else + return "unrecognized reason"; +} +EXPORT_SYMBOL(ibcm_reject_msg); + +static void cm_add_one(struct ib_device *device); +static void cm_remove_one(struct ib_device *device, void *client_data); + +static struct ib_client cm_client = { + .name = "cm", + .add = cm_add_one, + .remove = cm_remove_one +}; + +static struct ib_cm { + spinlock_t lock; + struct list_head device_list; + rwlock_t device_lock; + struct rb_root listen_service_table; + u64 listen_service_id; + /* struct rb_root peer_service_table; todo: fix peer to peer */ + struct rb_root remote_qp_table; + struct rb_root remote_id_table; + struct rb_root remote_sidr_table; + struct idr local_id_table; + __be32 random_id_operand; + struct list_head timewait_list; + struct workqueue_struct *wq; + /* Sync on cm change port state */ + spinlock_t state_lock; +} cm; + +/* Counter indexes ordered by attribute ID */ +enum { + CM_REQ_COUNTER, + CM_MRA_COUNTER, + CM_REJ_COUNTER, + CM_REP_COUNTER, + CM_RTU_COUNTER, + CM_DREQ_COUNTER, + CM_DREP_COUNTER, + CM_SIDR_REQ_COUNTER, + CM_SIDR_REP_COUNTER, + CM_LAP_COUNTER, + CM_APR_COUNTER, + CM_ATTR_COUNT, + CM_ATTR_ID_OFFSET = 0x0010, +}; + +enum { + CM_XMIT, + CM_XMIT_RETRIES, + CM_RECV, + CM_RECV_DUPLICATES, + CM_COUNTER_GROUPS +}; + +static char const counter_group_names[CM_COUNTER_GROUPS] + [sizeof("cm_rx_duplicates")] = { + "cm_tx_msgs", "cm_tx_retries", + "cm_rx_msgs", "cm_rx_duplicates" +}; + +struct cm_counter_group { + struct kobject obj; + atomic_long_t counter[CM_ATTR_COUNT]; +}; + +struct cm_counter_attribute { + struct attribute attr; + int index; +}; + +#define CM_COUNTER_ATTR(_name, _index) \ +struct cm_counter_attribute cm_##_name##_counter_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .index = _index \ +} + +static CM_COUNTER_ATTR(req, CM_REQ_COUNTER); +static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER); +static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER); +static CM_COUNTER_ATTR(rep, CM_REP_COUNTER); +static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER); +static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER); +static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER); +static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER); +static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER); +static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER); +static CM_COUNTER_ATTR(apr, CM_APR_COUNTER); + +static struct attribute *cm_counter_default_attrs[] = { + &cm_req_counter_attr.attr, + &cm_mra_counter_attr.attr, + &cm_rej_counter_attr.attr, + &cm_rep_counter_attr.attr, + &cm_rtu_counter_attr.attr, + &cm_dreq_counter_attr.attr, + &cm_drep_counter_attr.attr, + &cm_sidr_req_counter_attr.attr, + &cm_sidr_rep_counter_attr.attr, + &cm_lap_counter_attr.attr, + &cm_apr_counter_attr.attr, + NULL +}; + +struct cm_port { + struct cm_device *cm_dev; + struct ib_mad_agent *mad_agent; + struct kobject port_obj; + u8 port_num; + struct list_head cm_priv_prim_list; + struct list_head cm_priv_altr_list; + struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; +}; + +struct cm_device { + struct list_head list; + struct ib_device *ib_device; + struct device *device; + u8 ack_delay; + int going_down; + struct cm_port *port[0]; +}; + +struct cm_av { + struct cm_port *port; + union ib_gid dgid; + struct rdma_ah_attr ah_attr; + u16 pkey_index; + u8 timeout; +}; + +struct cm_work { + struct delayed_work work; + struct list_head list; + struct cm_port *port; + struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */ + __be32 local_id; /* Established / timewait */ + __be32 remote_id; + struct ib_cm_event cm_event; + 
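+ /*
+ * Editorial note (assumption, based on the zero-length-array idiom):
+ * allocation sites are expected to append however many sa_path_rec
+ * entries the work item needs directly after the struct.
+ */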
struct sa_path_rec path[0]; +}; + +struct cm_timewait_info { + struct cm_work work; /* Must be first. */ + struct list_head list; + struct rb_node remote_qp_node; + struct rb_node remote_id_node; + __be64 remote_ca_guid; + __be32 remote_qpn; + u8 inserted_remote_qp; + u8 inserted_remote_id; +}; + +struct cm_id_private { + struct ib_cm_id id; + + struct rb_node service_node; + struct rb_node sidr_id_node; + spinlock_t lock; /* Do not acquire inside cm.lock */ + struct completion comp; + atomic_t refcount; + /* Number of clients sharing this ib_cm_id. Only valid for listeners. + * Protected by the cm.lock spinlock. */ + int listen_sharecount; + + struct ib_mad_send_buf *msg; + struct cm_timewait_info *timewait_info; + /* todo: use alternate port on send failure */ + struct cm_av av; + struct cm_av alt_av; + + void *private_data; + __be64 tid; + __be32 local_qpn; + __be32 remote_qpn; + enum ib_qp_type qp_type; + __be32 sq_psn; + __be32 rq_psn; + int timeout_ms; + enum ib_mtu path_mtu; + __be16 pkey; + u8 private_data_len; + u8 max_cm_retries; + u8 peer_to_peer; + u8 responder_resources; + u8 initiator_depth; + u8 retry_count; + u8 rnr_retry_count; + u8 service_timeout; + u8 target_ack_delay; + + struct list_head prim_list; + struct list_head altr_list; + /* Indicates that the send port mad is registered and av is set */ + int prim_send_port_not_ready; + int altr_send_port_not_ready; + + struct list_head work_list; + atomic_t work_count; +}; + +static void cm_work_handler(struct work_struct *work); + +static inline void cm_deref_id(struct cm_id_private *cm_id_priv) +{ + if (atomic_dec_and_test(&cm_id_priv->refcount)) + complete(&cm_id_priv->comp); +} + +static int cm_alloc_msg(struct cm_id_private *cm_id_priv, + struct ib_mad_send_buf **msg) +{ + struct ib_mad_agent *mad_agent; + struct ib_mad_send_buf *m; + struct ib_ah *ah; + struct cm_av *av; + unsigned long flags, flags2; + int ret = 0; + + /* don't let the port to be released till the agent is down */ + spin_lock_irqsave(&cm.state_lock, flags2); + spin_lock_irqsave(&cm.lock, flags); + if (!cm_id_priv->prim_send_port_not_ready) + av = &cm_id_priv->av; + else if (!cm_id_priv->altr_send_port_not_ready && + (cm_id_priv->alt_av.port)) + av = &cm_id_priv->alt_av; + else { + pr_info("%s: not valid CM id\n", __func__); + ret = -ENODEV; + spin_unlock_irqrestore(&cm.lock, flags); + goto out; + } + spin_unlock_irqrestore(&cm.lock, flags); + /* Make sure the port haven't released the mad yet */ + mad_agent = cm_id_priv->av.port->mad_agent; + if (!mad_agent) { + pr_info("%s: not a valid MAD agent\n", __func__); + ret = -ENODEV; + goto out; + } + ah = rdma_create_ah(mad_agent->qp->pd, &av->ah_attr); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto out; + } + + m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, + av->pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); + if (IS_ERR(m)) { + rdma_destroy_ah(ah); + ret = PTR_ERR(m); + goto out; + } + + /* Timeout set by caller if response is expected. 
*/ + m->ah = ah; + m->retries = cm_id_priv->max_cm_retries; + + atomic_inc(&cm_id_priv->refcount); + m->context[0] = cm_id_priv; + *msg = m; + +out: + spin_unlock_irqrestore(&cm.state_lock, flags2); + return ret; +} + +static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc) +{ + return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); +} + +static int cm_create_response_msg_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf *msg) +{ + struct ib_ah *ah; + + ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, port->port_num); + if (IS_ERR(ah)) + return PTR_ERR(ah); + + msg->ah = ah; + return 0; +} + +static void cm_free_msg(struct ib_mad_send_buf *msg) +{ + if (msg->ah) + rdma_destroy_ah(msg->ah); + if (msg->context[0]) + cm_deref_id(msg->context[0]); + ib_free_send_mad(msg); +} + +static int cm_alloc_response_msg(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf **msg) +{ + struct ib_mad_send_buf *m; + int ret; + + m = cm_alloc_response_msg_no_ah(port, mad_recv_wc); + if (IS_ERR(m)) + return PTR_ERR(m); + + ret = cm_create_response_msg_ah(port, mad_recv_wc, m); + if (ret) { + cm_free_msg(m); + return ret; + } + + *msg = m; + return 0; +} + +static void * cm_copy_private_data(const void *private_data, + u8 private_data_len) +{ + void *data; + + if (!private_data || !private_data_len) + return NULL; + + data = kmemdup(private_data, private_data_len, GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + return data; +} + +static void cm_set_private_data(struct cm_id_private *cm_id_priv, + void *private_data, u8 private_data_len) +{ + if (cm_id_priv->private_data && cm_id_priv->private_data_len) + kfree(cm_id_priv->private_data); + + cm_id_priv->private_data = private_data; + cm_id_priv->private_data_len = private_data_len; +} + +static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, + struct ib_grh *grh, struct cm_av *av) +{ + av->port = port; + av->pkey_index = wc->pkey_index; + return ib_init_ah_attr_from_wc(port->cm_dev->ib_device, + port->port_num, wc, + grh, &av->ah_attr); +} + +static int add_cm_id_to_port_list(struct cm_id_private *cm_id_priv, + struct cm_av *av, + struct cm_port *port) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&cm.lock, flags); + + if (&cm_id_priv->av == av) + list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); + else if (&cm_id_priv->alt_av == av) + list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); + else + ret = -EINVAL; + + spin_unlock_irqrestore(&cm.lock, flags); + return ret; +} + +static struct cm_port *get_cm_port_from_path(struct sa_path_rec *path) +{ + struct cm_device *cm_dev; + struct cm_port *port = NULL; + unsigned long flags; + u8 p; + struct net_device *ndev = ib_get_ndev_from_path(path); + + read_lock_irqsave(&cm.device_lock, flags); + list_for_each_entry(cm_dev, &cm.device_list, list) { + if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, + sa_conv_pathrec_to_gid_type(path), + ndev, &p, NULL)) { + port = cm_dev->port[p - 1]; + break; + } + } + read_unlock_irqrestore(&cm.device_lock, flags); + + if (ndev) + dev_put(ndev); + return port; +} + +static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, + struct cm_id_private *cm_id_priv) +{ + struct cm_device *cm_dev; + struct cm_port *port; + 
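+ /*
+ * Editorial note (descriptive only): the sequence below resolves the
+ * sending port from the path's SGID via get_cm_port_from_path(), looks up
+ * the pkey index in the cached pkey table, initialises ah_attr from the
+ * path record, derives the CM timeout from packet_life_time, and finally
+ * links the cm_id onto the chosen port's primary/alternate list.
+ */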
int ret; + + port = get_cm_port_from_path(path); + if (!port) + return -EINVAL; + cm_dev = port->cm_dev; + + ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num, + be16_to_cpu(path->pkey), &av->pkey_index); + if (ret) + return ret; + + av->port = port; + ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path, + &av->ah_attr); + if (ret) + return ret; + + av->timeout = path->packet_life_time + 1; + + ret = add_cm_id_to_port_list(cm_id_priv, av, port); + return ret; +} + +static int cm_alloc_id(struct cm_id_private *cm_id_priv) +{ + unsigned long flags; + int id; + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&cm.lock, flags); + + id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); + + spin_unlock_irqrestore(&cm.lock, flags); + idr_preload_end(); + + cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; + return id < 0 ? id : 0; +} + +static void cm_free_id(__be32 local_id) +{ + spin_lock_irq(&cm.lock); + idr_remove(&cm.local_id_table, + (__force int) (local_id ^ cm.random_id_operand)); + spin_unlock_irq(&cm.lock); +} + +static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id) +{ + struct cm_id_private *cm_id_priv; + + cm_id_priv = idr_find(&cm.local_id_table, + (__force int) (local_id ^ cm.random_id_operand)); + if (cm_id_priv) { + if (cm_id_priv->id.remote_id == remote_id) + atomic_inc(&cm_id_priv->refcount); + else + cm_id_priv = NULL; + } + + return cm_id_priv; +} + +static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id) +{ + struct cm_id_private *cm_id_priv; + + spin_lock_irq(&cm.lock); + cm_id_priv = cm_get_id(local_id, remote_id); + spin_unlock_irq(&cm.lock); + + return cm_id_priv; +} + +/* + * Trivial helpers to strip endian annotation and compare; the + * endianness doesn't actually matter since we just need a stable + * order for the RB tree. 
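+ * For example, on a little-endian host be32_lt(cpu_to_be32(1),
+ * cpu_to_be32(256)) is false even though 1 < 256; every comparison strips
+ * the annotation the same way, though, so the resulting order is still
+ * total and stable, which is all the tree needs. (Editorial example, not
+ * part of the original patch.)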
+ */ +static int be32_lt(__be32 a, __be32 b) +{ + return (__force u32) a < (__force u32) b; +} + +static int be32_gt(__be32 a, __be32 b) +{ + return (__force u32) a > (__force u32) b; +} + +static int be64_lt(__be64 a, __be64 b) +{ + return (__force u64) a < (__force u64) b; +} + +static int be64_gt(__be64 a, __be64 b) +{ + return (__force u64) a > (__force u64) b; +} + +static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) +{ + struct rb_node **link = &cm.listen_service_table.rb_node; + struct rb_node *parent = NULL; + struct cm_id_private *cur_cm_id_priv; + __be64 service_id = cm_id_priv->id.service_id; + __be64 service_mask = cm_id_priv->id.service_mask; + + while (*link) { + parent = *link; + cur_cm_id_priv = rb_entry(parent, struct cm_id_private, + service_node); + if ((cur_cm_id_priv->id.service_mask & service_id) == + (service_mask & cur_cm_id_priv->id.service_id) && + (cm_id_priv->id.device == cur_cm_id_priv->id.device)) + return cur_cm_id_priv; + + if (cm_id_priv->id.device < cur_cm_id_priv->id.device) + link = &(*link)->rb_left; + else if (cm_id_priv->id.device > cur_cm_id_priv->id.device) + link = &(*link)->rb_right; + else if (be64_lt(service_id, cur_cm_id_priv->id.service_id)) + link = &(*link)->rb_left; + else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) + link = &(*link)->rb_right; + else + link = &(*link)->rb_right; + } + rb_link_node(&cm_id_priv->service_node, parent, link); + rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table); + return NULL; +} + +static struct cm_id_private * cm_find_listen(struct ib_device *device, + __be64 service_id) +{ + struct rb_node *node = cm.listen_service_table.rb_node; + struct cm_id_private *cm_id_priv; + + while (node) { + cm_id_priv = rb_entry(node, struct cm_id_private, service_node); + if ((cm_id_priv->id.service_mask & service_id) == + cm_id_priv->id.service_id && + (cm_id_priv->id.device == device)) + return cm_id_priv; + + if (device < cm_id_priv->id.device) + node = node->rb_left; + else if (device > cm_id_priv->id.device) + node = node->rb_right; + else if (be64_lt(service_id, cm_id_priv->id.service_id)) + node = node->rb_left; + else if (be64_gt(service_id, cm_id_priv->id.service_id)) + node = node->rb_right; + else + node = node->rb_right; + } + return NULL; +} + +static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info + *timewait_info) +{ + struct rb_node **link = &cm.remote_id_table.rb_node; + struct rb_node *parent = NULL; + struct cm_timewait_info *cur_timewait_info; + __be64 remote_ca_guid = timewait_info->remote_ca_guid; + __be32 remote_id = timewait_info->work.remote_id; + + while (*link) { + parent = *link; + cur_timewait_info = rb_entry(parent, struct cm_timewait_info, + remote_id_node); + if (be32_lt(remote_id, cur_timewait_info->work.remote_id)) + link = &(*link)->rb_left; + else if (be32_gt(remote_id, cur_timewait_info->work.remote_id)) + link = &(*link)->rb_right; + else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_left; + else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_right; + else + return cur_timewait_info; + } + timewait_info->inserted_remote_id = 1; + rb_link_node(&timewait_info->remote_id_node, parent, link); + rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table); + return NULL; +} + +static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, + __be32 remote_id) +{ + struct rb_node *node = cm.remote_id_table.rb_node; + 
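+ /*
+ * Editorial note (descriptive only): the walk below uses the same key
+ * order as cm_insert_remote_id() above, remote_id first with
+ * remote_ca_guid as the tie-breaker, so lookups and insertions agree.
+ */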
struct cm_timewait_info *timewait_info; + + while (node) { + timewait_info = rb_entry(node, struct cm_timewait_info, + remote_id_node); + if (be32_lt(remote_id, timewait_info->work.remote_id)) + node = node->rb_left; + else if (be32_gt(remote_id, timewait_info->work.remote_id)) + node = node->rb_right; + else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid)) + node = node->rb_left; + else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid)) + node = node->rb_right; + else + return timewait_info; + } + return NULL; +} + +static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info + *timewait_info) +{ + struct rb_node **link = &cm.remote_qp_table.rb_node; + struct rb_node *parent = NULL; + struct cm_timewait_info *cur_timewait_info; + __be64 remote_ca_guid = timewait_info->remote_ca_guid; + __be32 remote_qpn = timewait_info->remote_qpn; + + while (*link) { + parent = *link; + cur_timewait_info = rb_entry(parent, struct cm_timewait_info, + remote_qp_node); + if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn)) + link = &(*link)->rb_left; + else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn)) + link = &(*link)->rb_right; + else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_left; + else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_right; + else + return cur_timewait_info; + } + timewait_info->inserted_remote_qp = 1; + rb_link_node(&timewait_info->remote_qp_node, parent, link); + rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table); + return NULL; +} + +static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private + *cm_id_priv) +{ + struct rb_node **link = &cm.remote_sidr_table.rb_node; + struct rb_node *parent = NULL; + struct cm_id_private *cur_cm_id_priv; + union ib_gid *port_gid = &cm_id_priv->av.dgid; + __be32 remote_id = cm_id_priv->id.remote_id; + + while (*link) { + parent = *link; + cur_cm_id_priv = rb_entry(parent, struct cm_id_private, + sidr_id_node); + if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id)) + link = &(*link)->rb_left; + else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id)) + link = &(*link)->rb_right; + else { + int cmp; + cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid, + sizeof *port_gid); + if (cmp < 0) + link = &(*link)->rb_left; + else if (cmp > 0) + link = &(*link)->rb_right; + else + return cur_cm_id_priv; + } + } + rb_link_node(&cm_id_priv->sidr_id_node, parent, link); + rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + return NULL; +} + +static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv, + enum ib_cm_sidr_status status) +{ + struct ib_cm_sidr_rep_param param; + + memset(¶m, 0, sizeof param); + param.status = status; + ib_send_cm_sidr_rep(&cm_id_priv->id, ¶m); +} + +struct ib_cm_id *ib_create_cm_id(struct ib_device *device, + ib_cm_handler cm_handler, + void *context) +{ + struct cm_id_private *cm_id_priv; + int ret; + + cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL); + if (!cm_id_priv) + return ERR_PTR(-ENOMEM); + + cm_id_priv->id.state = IB_CM_IDLE; + cm_id_priv->id.device = device; + cm_id_priv->id.cm_handler = cm_handler; + cm_id_priv->id.context = context; + cm_id_priv->id.remote_cm_qpn = 1; + ret = cm_alloc_id(cm_id_priv); + if (ret) + goto error; + + spin_lock_init(&cm_id_priv->lock); + init_completion(&cm_id_priv->comp); + INIT_LIST_HEAD(&cm_id_priv->work_list); + INIT_LIST_HEAD(&cm_id_priv->prim_list); + INIT_LIST_HEAD(&cm_id_priv->altr_list); + 
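+ /*
+ * Editorial note (descriptive only): refcount starts at 1 for the
+ * reference handed back to the caller; cm_destroy_id() drops it and then
+ * waits on ->comp, which cm_deref_id() completes once the count reaches
+ * zero.
+ */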
atomic_set(&cm_id_priv->work_count, -1); + atomic_set(&cm_id_priv->refcount, 1); + return &cm_id_priv->id; + +error: + kfree(cm_id_priv); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(ib_create_cm_id); + +static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv) +{ + struct cm_work *work; + + if (list_empty(&cm_id_priv->work_list)) + return NULL; + + work = list_entry(cm_id_priv->work_list.next, struct cm_work, list); + list_del(&work->list); + return work; +} + +static void cm_free_work(struct cm_work *work) +{ + if (work->mad_recv_wc) + ib_free_recv_mad(work->mad_recv_wc); + kfree(work); +} + +static inline int cm_convert_to_ms(int iba_time) +{ + /* approximate conversion to ms from 4.096us x 2^iba_time */ + return 1 << max(iba_time - 8, 0); +} + +/* + * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time + * Because of how ack_timeout is stored, adding one doubles the timeout. + * To avoid large timeouts, select the max(ack_delay, life_time + 1), and + * increment it (round up) only if the other is within 50%. + */ +static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time) +{ + int ack_timeout = packet_life_time + 1; + + if (ack_timeout >= ca_ack_delay) + ack_timeout += (ca_ack_delay >= (ack_timeout - 1)); + else + ack_timeout = ca_ack_delay + + (ack_timeout >= (ca_ack_delay - 1)); + + return min(31, ack_timeout); +} + +static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) +{ + if (timewait_info->inserted_remote_id) { + rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table); + timewait_info->inserted_remote_id = 0; + } + + if (timewait_info->inserted_remote_qp) { + rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table); + timewait_info->inserted_remote_qp = 0; + } +} + +static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id) +{ + struct cm_timewait_info *timewait_info; + + timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL); + if (!timewait_info) + return ERR_PTR(-ENOMEM); + + timewait_info->work.local_id = local_id; + INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler); + timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT; + return timewait_info; +} + +static void cm_enter_timewait(struct cm_id_private *cm_id_priv) +{ + int wait_time; + unsigned long flags; + struct cm_device *cm_dev; + + cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); + if (!cm_dev) + return; + + spin_lock_irqsave(&cm.lock, flags); + cm_cleanup_timewait(cm_id_priv->timewait_info); + list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list); + spin_unlock_irqrestore(&cm.lock, flags); + + /* + * The cm_id could be destroyed by the user before we exit timewait. + * To protect against this, we search for the cm_id after exiting + * timewait before notifying the user that we've exited timewait. 
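+ * (Editorial note, assumption: this is also why the delayed work item
+ * carries only the local/remote IDs, the timewait handler is expected to
+ * re-acquire the cm_id via cm_acquire_id() rather than keep a raw
+ * pointer.)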
+ */ + cm_id_priv->id.state = IB_CM_TIMEWAIT; + wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) + queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, + msecs_to_jiffies(wait_time)); + spin_unlock_irqrestore(&cm.lock, flags); + + cm_id_priv->timewait_info = NULL; +} + +static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) +{ + unsigned long flags; + + cm_id_priv->id.state = IB_CM_IDLE; + if (cm_id_priv->timewait_info) { + spin_lock_irqsave(&cm.lock, flags); + cm_cleanup_timewait(cm_id_priv->timewait_info); + spin_unlock_irqrestore(&cm.lock, flags); + kfree(cm_id_priv->timewait_info); + cm_id_priv->timewait_info = NULL; + } +} + +static void cm_destroy_id(struct ib_cm_id *cm_id, int err) +{ + struct cm_id_private *cm_id_priv; + struct cm_work *work; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); +retest: + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id->state) { + case IB_CM_LISTEN: + spin_unlock_irq(&cm_id_priv->lock); + + spin_lock_irq(&cm.lock); + if (--cm_id_priv->listen_sharecount > 0) { + /* The id is still shared. */ + cm_deref_id(cm_id_priv); + spin_unlock_irq(&cm.lock); + return; + } + rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); + spin_unlock_irq(&cm.lock); + break; + case IB_CM_SIDR_REQ_SENT: + cm_id->state = IB_CM_IDLE; + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + spin_unlock_irq(&cm_id_priv->lock); + break; + case IB_CM_SIDR_REQ_RCVD: + spin_unlock_irq(&cm_id_priv->lock); + cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); + spin_lock_irq(&cm.lock); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) + rb_erase(&cm_id_priv->sidr_id_node, + &cm.remote_sidr_table); + spin_unlock_irq(&cm.lock); + break; + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + spin_unlock_irq(&cm_id_priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT, + &cm_id_priv->id.device->node_guid, + sizeof cm_id_priv->id.device->node_guid, + NULL, 0); + break; + case IB_CM_REQ_RCVD: + if (err == -ENOMEM) { + /* Do not reject to allow future retries. 
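+ * (Editorial note: silently dropping the REQ here presumably leaves the
+ * remote CM free to retransmit once memory pressure eases, whereas the
+ * explicit REJ in the else branch below would end the connection
+ * attempt.)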
*/ + cm_reset_to_idle(cm_id_priv); + spin_unlock_irq(&cm_id_priv->lock); + } else { + spin_unlock_irq(&cm_id_priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + } + break; + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + /* Fall through */ + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + spin_unlock_irq(&cm_id_priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + break; + case IB_CM_ESTABLISHED: + spin_unlock_irq(&cm_id_priv->lock); + if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) + break; + ib_send_cm_dreq(cm_id, NULL, 0); + goto retest; + case IB_CM_DREQ_SENT: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + cm_enter_timewait(cm_id_priv); + spin_unlock_irq(&cm_id_priv->lock); + break; + case IB_CM_DREQ_RCVD: + spin_unlock_irq(&cm_id_priv->lock); + ib_send_cm_drep(cm_id, NULL, 0); + break; + default: + spin_unlock_irq(&cm_id_priv->lock); + break; + } + + spin_lock_irq(&cm.lock); + if (!list_empty(&cm_id_priv->altr_list) && + (!cm_id_priv->altr_send_port_not_ready)) + list_del(&cm_id_priv->altr_list); + if (!list_empty(&cm_id_priv->prim_list) && + (!cm_id_priv->prim_send_port_not_ready)) + list_del(&cm_id_priv->prim_list); + spin_unlock_irq(&cm.lock); + + cm_free_id(cm_id->local_id); + cm_deref_id(cm_id_priv); + wait_for_completion(&cm_id_priv->comp); + while ((work = cm_dequeue_work(cm_id_priv)) != NULL) + cm_free_work(work); + kfree(cm_id_priv->private_data); + kfree(cm_id_priv); +} + +void ib_destroy_cm_id(struct ib_cm_id *cm_id) +{ + cm_destroy_id(cm_id, 0); +} +EXPORT_SYMBOL(ib_destroy_cm_id); + +/** + * __ib_cm_listen - Initiates listening on the specified service ID for + * connection and service ID resolution requests. + * @cm_id: Connection identifier associated with the listen request. + * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * @service_mask: Mask applied to service ID used to listen across a + * range of service IDs. If set to 0, the service ID is matched + * exactly. This parameter is ignored if %service_id is set to + * IB_CM_ASSIGN_SERVICE_ID. + */ +static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, + __be64 service_mask) +{ + struct cm_id_private *cm_id_priv, *cur_cm_id_priv; + int ret = 0; + + service_mask = service_mask ? 
service_mask : ~cpu_to_be64(0); + service_id &= service_mask; + if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID && + (service_id != IB_CM_ASSIGN_SERVICE_ID)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + if (cm_id->state != IB_CM_IDLE) + return -EINVAL; + + cm_id->state = IB_CM_LISTEN; + ++cm_id_priv->listen_sharecount; + + if (service_id == IB_CM_ASSIGN_SERVICE_ID) { + cm_id->service_id = cpu_to_be64(cm.listen_service_id++); + cm_id->service_mask = ~cpu_to_be64(0); + } else { + cm_id->service_id = service_id; + cm_id->service_mask = service_mask; + } + cur_cm_id_priv = cm_insert_listen(cm_id_priv); + + if (cur_cm_id_priv) { + cm_id->state = IB_CM_IDLE; + --cm_id_priv->listen_sharecount; + ret = -EBUSY; + } + return ret; +} + +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm.lock, flags); + ret = __ib_cm_listen(cm_id, service_id, service_mask); + spin_unlock_irqrestore(&cm.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_cm_listen); + +/** + * Create a new listening ib_cm_id and listen on the given service ID. + * + * If there's an existing ID listening on that same device and service ID, + * return it. + * + * @device: Device associated with the cm_id. All related communication will + * be associated with the specified device. + * @cm_handler: Callback invoked to notify the user of CM events. + * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * + * Callers should call ib_destroy_cm_id when done with the listener ID. 
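+ *
+ * A minimal usage sketch; my_cm_handler and the service ID value are
+ * placeholders picked for illustration:
+ *
+ *	listen_id = ib_cm_insert_listen(device, my_cm_handler,
+ *					cpu_to_be64(0x1001));
+ *	if (IS_ERR(listen_id))
+ *		return PTR_ERR(listen_id);
+ *	...
+ *	ib_destroy_cm_id(listen_id);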
+ */ +struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, + ib_cm_handler cm_handler, + __be64 service_id) +{ + struct cm_id_private *cm_id_priv; + struct ib_cm_id *cm_id; + unsigned long flags; + int err = 0; + + /* Create an ID in advance, since the creation may sleep */ + cm_id = ib_create_cm_id(device, cm_handler, NULL); + if (IS_ERR(cm_id)) + return cm_id; + + spin_lock_irqsave(&cm.lock, flags); + + if (service_id == IB_CM_ASSIGN_SERVICE_ID) + goto new_id; + + /* Find an existing ID */ + cm_id_priv = cm_find_listen(device, service_id); + if (cm_id_priv) { + if (cm_id->cm_handler != cm_handler || cm_id->context) { + /* Sharing an ib_cm_id with different handlers is not + * supported */ + spin_unlock_irqrestore(&cm.lock, flags); + return ERR_PTR(-EINVAL); + } + atomic_inc(&cm_id_priv->refcount); + ++cm_id_priv->listen_sharecount; + spin_unlock_irqrestore(&cm.lock, flags); + + ib_destroy_cm_id(cm_id); + cm_id = &cm_id_priv->id; + return cm_id; + } + +new_id: + /* Use newly created ID */ + err = __ib_cm_listen(cm_id, service_id, 0); + + spin_unlock_irqrestore(&cm.lock, flags); + + if (err) { + ib_destroy_cm_id(cm_id); + return ERR_PTR(err); + } + return cm_id; +} +EXPORT_SYMBOL(ib_cm_insert_listen); + +static __be64 cm_form_tid(struct cm_id_private *cm_id_priv, + enum cm_msg_sequence msg_seq) +{ + u64 hi_tid, low_tid; + + hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32; + low_tid = (u64) ((__force u32)cm_id_priv->id.local_id | + (msg_seq << 30)); + return cpu_to_be64(hi_tid | low_tid); +} + +static void cm_format_mad_hdr(struct ib_mad_hdr *hdr, + __be16 attr_id, __be64 tid) +{ + hdr->base_version = IB_MGMT_BASE_VERSION; + hdr->mgmt_class = IB_MGMT_CLASS_CM; + hdr->class_version = IB_CM_CLASS_VERSION; + hdr->method = IB_MGMT_METHOD_SEND; + hdr->attr_id = attr_id; + hdr->tid = tid; +} + +static void cm_format_req(struct cm_req_msg *req_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_req_param *param) +{ + struct sa_path_rec *pri_path = param->primary_path; + struct sa_path_rec *alt_path = param->alternate_path; + bool pri_ext = false; + + if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) + pri_ext = opa_is_extended_lid(pri_path->opa.dlid, + pri_path->opa.slid); + + cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, + cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); + + req_msg->local_comm_id = cm_id_priv->id.local_id; + req_msg->service_id = param->service_id; + req_msg->local_ca_guid = cm_id_priv->id.device->node_guid; + cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num)); + cm_req_set_init_depth(req_msg, param->initiator_depth); + cm_req_set_remote_resp_timeout(req_msg, + param->remote_cm_response_timeout); + cm_req_set_qp_type(req_msg, param->qp_type); + cm_req_set_flow_ctrl(req_msg, param->flow_control); + cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn)); + cm_req_set_local_resp_timeout(req_msg, + param->local_cm_response_timeout); + req_msg->pkey = param->primary_path->pkey; + cm_req_set_path_mtu(req_msg, param->primary_path->mtu); + cm_req_set_max_cm_retries(req_msg, param->max_cm_retries); + + if (param->qp_type != IB_QPT_XRC_INI) { + cm_req_set_resp_res(req_msg, param->responder_resources); + cm_req_set_retry_count(req_msg, param->retry_count); + cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); + cm_req_set_srq(req_msg, param->srq); + } + + req_msg->primary_local_gid = pri_path->sgid; + req_msg->primary_remote_gid = pri_path->dgid; + if (pri_ext) { + req_msg->primary_local_gid.global.interface_id + = 
OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); + req_msg->primary_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); + } + if (pri_path->hop_limit <= 1) { + req_msg->primary_local_lid = pri_ext ? 0 : + htons(ntohl(sa_path_get_slid(pri_path))); + req_msg->primary_remote_lid = pri_ext ? 0 : + htons(ntohl(sa_path_get_dlid(pri_path))); + } else { + /* Work-around until there's a way to obtain remote LID info */ + req_msg->primary_local_lid = IB_LID_PERMISSIVE; + req_msg->primary_remote_lid = IB_LID_PERMISSIVE; + } + cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); + cm_req_set_primary_packet_rate(req_msg, pri_path->rate); + req_msg->primary_traffic_class = pri_path->traffic_class; + req_msg->primary_hop_limit = pri_path->hop_limit; + cm_req_set_primary_sl(req_msg, pri_path->sl); + cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1)); + cm_req_set_primary_local_ack_timeout(req_msg, + cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, + pri_path->packet_life_time)); + + if (alt_path) { + bool alt_ext = false; + + if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alt_path->opa.dlid, + alt_path->opa.slid); + + req_msg->alt_local_gid = alt_path->sgid; + req_msg->alt_remote_gid = alt_path->dgid; + if (alt_ext) { + req_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); + req_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); + } + if (alt_path->hop_limit <= 1) { + req_msg->alt_local_lid = alt_ext ? 0 : + htons(ntohl(sa_path_get_slid(alt_path))); + req_msg->alt_remote_lid = alt_ext ? 0 : + htons(ntohl(sa_path_get_dlid(alt_path))); + } else { + req_msg->alt_local_lid = IB_LID_PERMISSIVE; + req_msg->alt_remote_lid = IB_LID_PERMISSIVE; + } + cm_req_set_alt_flow_label(req_msg, + alt_path->flow_label); + cm_req_set_alt_packet_rate(req_msg, alt_path->rate); + req_msg->alt_traffic_class = alt_path->traffic_class; + req_msg->alt_hop_limit = alt_path->hop_limit; + cm_req_set_alt_sl(req_msg, alt_path->sl); + cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1)); + cm_req_set_alt_local_ack_timeout(req_msg, + cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, + alt_path->packet_life_time)); + } + + if (param->private_data && param->private_data_len) + memcpy(req_msg->private_data, param->private_data, + param->private_data_len); +} + +static int cm_validate_req_param(struct ib_cm_req_param *param) +{ + /* peer-to-peer not supported */ + if (param->peer_to_peer) + return -EINVAL; + + if (!param->primary_path) + return -EINVAL; + + if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC && + param->qp_type != IB_QPT_XRC_INI) + return -EINVAL; + + if (param->private_data && + param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE) + return -EINVAL; + + if (param->alternate_path && + (param->alternate_path->pkey != param->primary_path->pkey || + param->alternate_path->mtu != param->primary_path->mtu)) + return -EINVAL; + + return 0; +} + +int ib_send_cm_req(struct ib_cm_id *cm_id, + struct ib_cm_req_param *param) +{ + struct cm_id_private *cm_id_priv; + struct cm_req_msg *req_msg; + unsigned long flags; + int ret; + + ret = cm_validate_req_param(param); + if (ret) + return ret; + + /* Verify that we're not in timewait. 
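Sending a REQ is only legal from IB_CM_IDLE; an id still sitting in TIMEWAIT (or any other state) is refused with -EINVAL before any timewait info or address handles are allocated.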
*/ + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_IDLE) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + ret = -EINVAL; + goto out; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> + id.local_id); + if (IS_ERR(cm_id_priv->timewait_info)) { + ret = PTR_ERR(cm_id_priv->timewait_info); + goto out; + } + + ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av, + cm_id_priv); + if (ret) + goto error1; + if (param->alternate_path) { + ret = cm_init_av_by_path(param->alternate_path, + &cm_id_priv->alt_av, cm_id_priv); + if (ret) + goto error1; + } + cm_id->service_id = param->service_id; + cm_id->service_mask = ~cpu_to_be64(0); + cm_id_priv->timeout_ms = cm_convert_to_ms( + param->primary_path->packet_life_time) * 2 + + cm_convert_to_ms( + param->remote_cm_response_timeout); + cm_id_priv->max_cm_retries = param->max_cm_retries; + cm_id_priv->initiator_depth = param->initiator_depth; + cm_id_priv->responder_resources = param->responder_resources; + cm_id_priv->retry_count = param->retry_count; + cm_id_priv->path_mtu = param->primary_path->mtu; + cm_id_priv->pkey = param->primary_path->pkey; + cm_id_priv->qp_type = param->qp_type; + + ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg); + if (ret) + goto error1; + + req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad; + cm_format_req(req_msg, cm_id_priv, param); + cm_id_priv->tid = req_msg->hdr.tid; + cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms; + cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT; + + cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg); + cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = ib_post_send_mad(cm_id_priv->msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + goto error2; + } + BUG_ON(cm_id->state != IB_CM_IDLE); + cm_id->state = IB_CM_REQ_SENT; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; + +error2: cm_free_msg(cm_id_priv->msg); +error1: kfree(cm_id_priv->timewait_info); +out: return ret; +} +EXPORT_SYMBOL(ib_send_cm_req); + +static int cm_issue_rej(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + enum ib_cm_rej_reason reason, + enum cm_msg_response msg_rejected, + void *ari, u8 ari_length) +{ + struct ib_mad_send_buf *msg = NULL; + struct cm_rej_msg *rej_msg, *rcv_msg; + int ret; + + ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + if (ret) + return ret; + + /* We just need common CM header information. Cast to any message. 
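Only the TID and the leading comm_id words are read through this cast, and those sit at the same offsets in every message this helper is used to reject.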
*/ + rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad; + rej_msg = (struct cm_rej_msg *) msg->mad; + + cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid); + rej_msg->remote_comm_id = rcv_msg->local_comm_id; + rej_msg->local_comm_id = rcv_msg->remote_comm_id; + cm_rej_set_msg_rejected(rej_msg, msg_rejected); + rej_msg->reason = cpu_to_be16(reason); + + if (ari && ari_length) { + cm_rej_set_reject_info_len(rej_msg, ari_length); + memcpy(rej_msg->ari, ari, ari_length); + } + + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_msg(msg); + + return ret; +} + +static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, + __be32 local_qpn, __be32 remote_qpn) +{ + return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) || + ((local_ca_guid == remote_ca_guid) && + (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn)))); +} + +static bool cm_req_has_alt_path(struct cm_req_msg *req_msg) +{ + return ((req_msg->alt_local_lid) || + (ib_is_opa_gid(&req_msg->alt_local_gid))); +} + +static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num, + struct sa_path_rec *path, union ib_gid *gid) +{ + if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num)) + path->rec_type = SA_PATH_REC_TYPE_OPA; + else + path->rec_type = SA_PATH_REC_TYPE_IB; +} + +static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, + struct sa_path_rec *primary_path, + struct sa_path_rec *alt_path) +{ + u32 lid; + + if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(primary_path, + ntohs(req_msg->primary_local_lid)); + sa_path_set_slid(primary_path, + ntohs(req_msg->primary_remote_lid)); + } else { + lid = opa_get_lid_from_gid(&req_msg->primary_local_gid); + sa_path_set_dlid(primary_path, lid); + + lid = opa_get_lid_from_gid(&req_msg->primary_remote_gid); + sa_path_set_slid(primary_path, lid); + } + + if (!cm_req_has_alt_path(req_msg)) + return; + + if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(alt_path, ntohs(req_msg->alt_local_lid)); + sa_path_set_slid(alt_path, ntohs(req_msg->alt_remote_lid)); + } else { + lid = opa_get_lid_from_gid(&req_msg->alt_local_gid); + sa_path_set_dlid(alt_path, lid); + + lid = opa_get_lid_from_gid(&req_msg->alt_remote_gid); + sa_path_set_slid(alt_path, lid); + } +} + +static void cm_format_paths_from_req(struct cm_req_msg *req_msg, + struct sa_path_rec *primary_path, + struct sa_path_rec *alt_path) +{ + primary_path->dgid = req_msg->primary_local_gid; + primary_path->sgid = req_msg->primary_remote_gid; + primary_path->flow_label = cm_req_get_primary_flow_label(req_msg); + primary_path->hop_limit = req_msg->primary_hop_limit; + primary_path->traffic_class = req_msg->primary_traffic_class; + primary_path->reversible = 1; + primary_path->pkey = req_msg->pkey; + primary_path->sl = cm_req_get_primary_sl(req_msg); + primary_path->mtu_selector = IB_SA_EQ; + primary_path->mtu = cm_req_get_path_mtu(req_msg); + primary_path->rate_selector = IB_SA_EQ; + primary_path->rate = cm_req_get_primary_packet_rate(req_msg); + primary_path->packet_life_time_selector = IB_SA_EQ; + primary_path->packet_life_time = + cm_req_get_primary_local_ack_timeout(req_msg); + primary_path->packet_life_time -= (primary_path->packet_life_time > 0); + primary_path->service_id = req_msg->service_id; + if (sa_path_is_roce(primary_path)) + primary_path->roce.route_resolved = false; + + if (cm_req_has_alt_path(req_msg)) { + alt_path->dgid = req_msg->alt_local_gid; + alt_path->sgid = req_msg->alt_remote_gid; + 
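/* As with the primary path above, dgid/sgid come from the REQ's local/remote GIDs in swapped order because the record describes the path from the responder's point of view. */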
alt_path->flow_label = cm_req_get_alt_flow_label(req_msg); + alt_path->hop_limit = req_msg->alt_hop_limit; + alt_path->traffic_class = req_msg->alt_traffic_class; + alt_path->reversible = 1; + alt_path->pkey = req_msg->pkey; + alt_path->sl = cm_req_get_alt_sl(req_msg); + alt_path->mtu_selector = IB_SA_EQ; + alt_path->mtu = cm_req_get_path_mtu(req_msg); + alt_path->rate_selector = IB_SA_EQ; + alt_path->rate = cm_req_get_alt_packet_rate(req_msg); + alt_path->packet_life_time_selector = IB_SA_EQ; + alt_path->packet_life_time = + cm_req_get_alt_local_ack_timeout(req_msg); + alt_path->packet_life_time -= (alt_path->packet_life_time > 0); + alt_path->service_id = req_msg->service_id; + + if (sa_path_is_roce(alt_path)) + alt_path->roce.route_resolved = false; + } + cm_format_path_lid_from_req(req_msg, primary_path, alt_path); +} + +static u16 cm_get_bth_pkey(struct cm_work *work) +{ + struct ib_device *ib_dev = work->port->cm_dev->ib_device; + u8 port_num = work->port->port_num; + u16 pkey_index = work->mad_recv_wc->wc->pkey_index; + u16 pkey; + int ret; + + ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey); + if (ret) { + dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n", + port_num, pkey_index, ret); + return 0; + } + + return pkey; +} + +/** + * Convert OPA SGID to IB SGID + * ULPs (such as IPoIB) do not understand OPA GIDs and will + * reject them as the local_gid will not match the sgid. Therefore, + * change the pathrec's SGID to an IB SGID. + * + * @work: Work completion + * @path: Path record + */ +static void cm_opa_to_ib_sgid(struct cm_work *work, + struct sa_path_rec *path) +{ + struct ib_device *dev = work->port->cm_dev->ib_device; + u8 port_num = work->port->port_num; + + if (rdma_cap_opa_ah(dev, port_num) && + (ib_is_opa_gid(&path->sgid))) { + union ib_gid sgid; + + if (ib_get_cached_gid(dev, port_num, 0, &sgid, NULL)) { + dev_warn(&dev->dev, + "Error updating sgid in CM request\n"); + return; + } + + path->sgid = sgid; + } +} + +static void cm_format_req_event(struct cm_work *work, + struct cm_id_private *cm_id_priv, + struct ib_cm_id *listen_id) +{ + struct cm_req_msg *req_msg; + struct ib_cm_req_event_param *param; + + req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.req_rcvd; + param->listen_id = listen_id; + param->bth_pkey = cm_get_bth_pkey(work); + param->port = cm_id_priv->av.port->port_num; + param->primary_path = &work->path[0]; + cm_opa_to_ib_sgid(work, param->primary_path); + if (cm_req_has_alt_path(req_msg)) { + param->alternate_path = &work->path[1]; + cm_opa_to_ib_sgid(work, param->alternate_path); + } else { + param->alternate_path = NULL; + } + param->remote_ca_guid = req_msg->local_ca_guid; + param->remote_qkey = be32_to_cpu(req_msg->local_qkey); + param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg)); + param->qp_type = cm_req_get_qp_type(req_msg); + param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg)); + param->responder_resources = cm_req_get_init_depth(req_msg); + param->initiator_depth = cm_req_get_resp_res(req_msg); + param->local_cm_response_timeout = + cm_req_get_remote_resp_timeout(req_msg); + param->flow_control = cm_req_get_flow_ctrl(req_msg); + param->remote_cm_response_timeout = + cm_req_get_local_resp_timeout(req_msg); + param->retry_count = cm_req_get_retry_count(req_msg); + param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); + param->srq = cm_req_get_srq(req_msg); + 
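/* The event describes the connection from the passive side: responder_resources is taken from the peer's initiator depth, initiator_depth from the peer's responder resources, and the two CM response timeouts are swapped the same way. */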
work->cm_event.private_data = &req_msg->private_data; +} + +static void cm_process_work(struct cm_id_private *cm_id_priv, + struct cm_work *work) +{ + int ret; + + /* We will typically only have the current event to report. */ + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); + cm_free_work(work); + + while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) { + spin_lock_irq(&cm_id_priv->lock); + work = cm_dequeue_work(cm_id_priv); + spin_unlock_irq(&cm_id_priv->lock); + BUG_ON(!work); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, + &work->cm_event); + cm_free_work(work); + } + cm_deref_id(cm_id_priv); + if (ret) + cm_destroy_id(&cm_id_priv->id, ret); +} + +static void cm_format_mra(struct cm_mra_msg *mra_msg, + struct cm_id_private *cm_id_priv, + enum cm_msg_response msg_mraed, u8 service_timeout, + const void *private_data, u8 private_data_len) +{ + cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid); + cm_mra_set_msg_mraed(mra_msg, msg_mraed); + mra_msg->local_comm_id = cm_id_priv->id.local_id; + mra_msg->remote_comm_id = cm_id_priv->id.remote_id; + cm_mra_set_service_timeout(mra_msg, service_timeout); + + if (private_data && private_data_len) + memcpy(mra_msg->private_data, private_data, private_data_len); +} + +static void cm_format_rej(struct cm_rej_msg *rej_msg, + struct cm_id_private *cm_id_priv, + enum ib_cm_rej_reason reason, + void *ari, + u8 ari_length, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid); + rej_msg->remote_comm_id = cm_id_priv->id.remote_id; + + switch(cm_id_priv->id.state) { + case IB_CM_REQ_RCVD: + rej_msg->local_comm_id = 0; + cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ); + break; + case IB_CM_MRA_REQ_SENT: + rej_msg->local_comm_id = cm_id_priv->id.local_id; + cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ); + break; + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + rej_msg->local_comm_id = cm_id_priv->id.local_id; + cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REP); + break; + default: + rej_msg->local_comm_id = cm_id_priv->id.local_id; + cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_OTHER); + break; + } + + rej_msg->reason = cpu_to_be16(reason); + if (ari && ari_length) { + cm_rej_set_reject_info_len(rej_msg, ari_length); + memcpy(rej_msg->ari, ari, ari_length); + } + + if (private_data && private_data_len) + memcpy(rej_msg->private_data, private_data, private_data_len); +} + +static void cm_dup_req_handler(struct cm_work *work, + struct cm_id_private *cm_id_priv) +{ + struct ib_mad_send_buf *msg = NULL; + int ret; + + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_REQ_COUNTER]); + + /* Quick state check to discard duplicate REQs. 
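While the original REQ is still marked IB_CM_REQ_RCVD no response has been sent yet, so there is nothing to repeat; the switch below resends an MRA for IB_CM_MRA_REQ_SENT and answers a REQ that arrives in TIMEWAIT with a stale-connection REJ.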
*/ + if (cm_id_priv->id.state == IB_CM_REQ_RCVD) + return; + + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + if (ret) + return; + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_MRA_REQ_SENT: + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + break; + case IB_CM_TIMEWAIT: + cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv, + IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0); + break; + default: + goto unlock; + } + spin_unlock_irq(&cm_id_priv->lock); + + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto free; + return; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +free: cm_free_msg(msg); +} + +static struct cm_id_private * cm_match_req(struct cm_work *work, + struct cm_id_private *cm_id_priv) +{ + struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv; + struct cm_timewait_info *timewait_info; + struct cm_req_msg *req_msg; + struct ib_cm_id *cm_id; + + req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; + + /* Check for possible duplicate REQ. */ + spin_lock_irq(&cm.lock); + timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info); + if (timewait_info) { + cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + spin_unlock_irq(&cm.lock); + if (cur_cm_id_priv) { + cm_dup_req_handler(work, cur_cm_id_priv); + cm_deref_id(cur_cm_id_priv); + } + return NULL; + } + + /* Check for stale connections. */ + timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); + if (timewait_info) { + cm_cleanup_timewait(cm_id_priv->timewait_info); + cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + + spin_unlock_irq(&cm.lock); + cm_issue_rej(work->port, work->mad_recv_wc, + IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, + NULL, 0); + if (cur_cm_id_priv) { + cm_id = &cur_cm_id_priv->id; + ib_send_cm_dreq(cm_id, NULL, 0); + cm_deref_id(cur_cm_id_priv); + } + return NULL; + } + + /* Find matching listen request. */ + listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, + req_msg->service_id); + if (!listen_cm_id_priv) { + cm_cleanup_timewait(cm_id_priv->timewait_info); + spin_unlock_irq(&cm.lock); + cm_issue_rej(work->port, work->mad_recv_wc, + IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ, + NULL, 0); + goto out; + } + atomic_inc(&listen_cm_id_priv->refcount); + atomic_inc(&cm_id_priv->refcount); + cm_id_priv->id.state = IB_CM_REQ_RCVD; + atomic_inc(&cm_id_priv->work_count); + spin_unlock_irq(&cm.lock); +out: + return listen_cm_id_priv; +} + +/* + * Work-around for inter-subnet connections. If the LIDs are permissive, + * we need to override the LID/SL data in the REQ with the LID information + * in the work completion. 
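+ * The work completion's SLID and SL describe the hop that actually delivered the REQ on this subnet, so they are substituted for the permissive values below.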
+ */ +static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) +{ + if (!cm_req_get_primary_subnet_local(req_msg)) { + if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { + req_msg->primary_local_lid = ib_lid_be16(wc->slid); + cm_req_set_primary_sl(req_msg, wc->sl); + } + + if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE) + req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits); + } + + if (!cm_req_get_alt_subnet_local(req_msg)) { + if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { + req_msg->alt_local_lid = ib_lid_be16(wc->slid); + cm_req_set_alt_sl(req_msg, wc->sl); + } + + if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE) + req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits); + } +} + +static int cm_req_handler(struct cm_work *work) +{ + struct ib_cm_id *cm_id; + struct cm_id_private *cm_id_priv, *listen_cm_id_priv; + struct cm_req_msg *req_msg; + union ib_gid gid; + struct ib_gid_attr gid_attr; + const struct ib_global_route *grh; + int ret; + + req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; + + cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + cm_id_priv->id.remote_id = req_msg->local_comm_id; + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto destroy; + cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> + id.local_id); + if (IS_ERR(cm_id_priv->timewait_info)) { + ret = PTR_ERR(cm_id_priv->timewait_info); + goto destroy; + } + cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id; + cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid; + cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg); + + listen_cm_id_priv = cm_match_req(work, cm_id_priv); + if (!listen_cm_id_priv) { + pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__, + be32_to_cpu(cm_id->local_id)); + ret = -EINVAL; + goto free_timeinfo; + } + + cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; + cm_id_priv->id.context = listen_cm_id_priv->id.context; + cm_id_priv->id.service_id = req_msg->service_id; + cm_id_priv->id.service_mask = ~cpu_to_be64(0); + + cm_process_routed_req(req_msg, work->mad_recv_wc->wc); + + memset(&work->path[0], 0, sizeof(work->path[0])); + if (cm_req_has_alt_path(req_msg)) + memset(&work->path[1], 0, sizeof(work->path[1])); + grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr); + ret = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, + grh->sgid_index, + &gid, &gid_attr); + if (ret) { + ib_send_cm_rej(cm_id, IB_CM_REJ_UNSUPPORTED, NULL, 0, NULL, 0); + goto rejected; + } + + if (gid_attr.ndev) { + work->path[0].rec_type = + sa_conv_gid_to_pathrec_type(gid_attr.gid_type); + sa_path_set_ifindex(&work->path[0], + gid_attr.ndev->ifindex); + sa_path_set_ndev(&work->path[0], + dev_net(gid_attr.ndev)); + dev_put(gid_attr.ndev); + } else { + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); + } + if (cm_req_has_alt_path(req_msg)) + work->path[1].rec_type = work->path[0].rec_type; + cm_format_paths_from_req(req_msg, &work->path[0], + &work->path[1]); + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) + sa_path_set_dmac(&work->path[0], + cm_id_priv->av.ah_attr.roce.dmac); + work->path[0].hop_limit = grh->hop_limit; + ret = 
cm_init_av_by_path(&work->path[0], &cm_id_priv->av, + cm_id_priv); + if (ret) { + int err; + + err = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid, + NULL); + if (err) + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + NULL, 0, NULL, 0); + else + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + &work->path[0].sgid, + sizeof(work->path[0].sgid), + NULL, 0); + goto rejected; + } + if (cm_req_has_alt_path(req_msg)) { + ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, + cm_id_priv); + if (ret) { + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, + &work->path[0].sgid, + sizeof(work->path[0].sgid), NULL, 0); + goto rejected; + } + } + cm_id_priv->tid = req_msg->hdr.tid; + cm_id_priv->timeout_ms = cm_convert_to_ms( + cm_req_get_local_resp_timeout(req_msg)); + cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg); + cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg); + cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg); + cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg); + cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg); + cm_id_priv->pkey = req_msg->pkey; + cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg); + cm_id_priv->retry_count = cm_req_get_retry_count(req_msg); + cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); + cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); + + cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id); + cm_process_work(cm_id_priv, work); + cm_deref_id(listen_cm_id_priv); + return 0; + +rejected: + atomic_dec(&cm_id_priv->refcount); + cm_deref_id(listen_cm_id_priv); +free_timeinfo: + kfree(cm_id_priv->timewait_info); +destroy: + ib_destroy_cm_id(cm_id); + return ret; +} + +static void cm_format_rep(struct cm_rep_msg *rep_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_rep_param *param) +{ + cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid); + rep_msg->local_comm_id = cm_id_priv->id.local_id; + rep_msg->remote_comm_id = cm_id_priv->id.remote_id; + cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn)); + rep_msg->resp_resources = param->responder_resources; + cm_rep_set_target_ack_delay(rep_msg, + cm_id_priv->av.port->cm_dev->ack_delay); + cm_rep_set_failover(rep_msg, param->failover_accepted); + cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count); + rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid; + + if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) { + rep_msg->initiator_depth = param->initiator_depth; + cm_rep_set_flow_ctrl(rep_msg, param->flow_control); + cm_rep_set_srq(rep_msg, param->srq); + cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num)); + } else { + cm_rep_set_srq(rep_msg, 1); + cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num)); + } + + if (param->private_data && param->private_data_len) + memcpy(rep_msg->private_data, param->private_data, + param->private_data_len); +} + +int ib_send_cm_rep(struct ib_cm_id *cm_id, + struct ib_cm_rep_param *param) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + struct cm_rep_msg *rep_msg; + unsigned long flags; + int ret; + + if (param->private_data && + param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_REQ_RCVD && + cm_id->state != IB_CM_MRA_REQ_SENT) { + pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__, + 
be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); + ret = -EINVAL; + goto out; + } + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + rep_msg = (struct cm_rep_msg *) msg->mad; + cm_format_rep(rep_msg, cm_id_priv, param); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + + cm_id->state = IB_CM_REP_SENT; + cm_id_priv->msg = msg; + cm_id_priv->initiator_depth = param->initiator_depth; + cm_id_priv->responder_resources = param->responder_resources; + cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg); + cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF); + +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_rep); + +static void cm_format_rtu(struct cm_rtu_msg *rtu_msg, + struct cm_id_private *cm_id_priv, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid); + rtu_msg->local_comm_id = cm_id_priv->id.local_id; + rtu_msg->remote_comm_id = cm_id_priv->id.remote_id; + + if (private_data && private_data_len) + memcpy(rtu_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_rtu(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + void *data; + int ret; + + if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE) + return -EINVAL; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_REP_RCVD && + cm_id->state != IB_CM_MRA_REP_SENT) { + pr_debug("%s: local_id %d, cm_id->state %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); + ret = -EINVAL; + goto error; + } + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto error; + + cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + kfree(data); + return ret; + } + + cm_id->state = IB_CM_ESTABLISHED; + cm_set_private_data(cm_id_priv, data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; + +error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_rtu); + +static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) +{ + struct cm_rep_msg *rep_msg; + struct ib_cm_rep_event_param *param; + + rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.rep_rcvd; + param->remote_ca_guid = rep_msg->local_ca_guid; + param->remote_qkey = be32_to_cpu(rep_msg->local_qkey); + param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type)); + param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg)); + param->responder_resources = rep_msg->initiator_depth; + param->initiator_depth = rep_msg->resp_resources; + param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg); + param->failover_accepted = cm_rep_get_failover(rep_msg); + param->flow_control = cm_rep_get_flow_ctrl(rep_msg); + param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg); + 
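/* As with the REQ event, resources are reported from the receiver's perspective: responder_resources mirrors the peer's initiator depth and initiator_depth mirrors its responder resources. */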
param->srq = cm_rep_get_srq(rep_msg); + work->cm_event.private_data = &rep_msg->private_data; +} + +static void cm_dup_rep_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rep_msg *rep_msg; + struct ib_mad_send_buf *msg = NULL; + int ret; + + rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, + rep_msg->local_comm_id); + if (!cm_id_priv) + return; + + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_REP_COUNTER]); + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + if (ret) + goto deref; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state == IB_CM_ESTABLISHED) + cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT) + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + else + goto unlock; + spin_unlock_irq(&cm_id_priv->lock); + + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto free; + goto deref; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +free: cm_free_msg(msg); +deref: cm_deref_id(cm_id_priv); +} + +static int cm_rep_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rep_msg *rep_msg; + int ret; + struct cm_id_private *cur_cm_id_priv; + struct ib_cm_id *cm_id; + struct cm_timewait_info *timewait_info; + + rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); + if (!cm_id_priv) { + cm_dup_rep_handler(work); + pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, + be32_to_cpu(rep_msg->remote_comm_id)); + return -EINVAL; + } + + cm_format_rep_event(work, cm_id_priv->qp_type); + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + break; + default: + spin_unlock_irq(&cm_id_priv->lock); + ret = -EINVAL; + pr_debug("%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n", + __func__, cm_id_priv->id.state, + be32_to_cpu(rep_msg->local_comm_id), + be32_to_cpu(rep_msg->remote_comm_id)); + goto error; + } + + cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id; + cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid; + cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); + + spin_lock(&cm.lock); + /* Check for duplicate REP. */ + if (cm_insert_remote_id(cm_id_priv->timewait_info)) { + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); + ret = -EINVAL; + pr_debug("%s: Failed to insert remote id %d\n", __func__, + be32_to_cpu(rep_msg->remote_comm_id)); + goto error; + } + /* Check for a stale connection. */ + timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); + if (timewait_info) { + rb_erase(&cm_id_priv->timewait_info->remote_id_node, + &cm.remote_id_table); + cm_id_priv->timewait_info->inserted_remote_id = 0; + cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); + cm_issue_rej(work->port, work->mad_recv_wc, + IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, + NULL, 0); + ret = -EINVAL; + pr_debug("%s: Stale connection. 
local_comm_id %d, remote_comm_id %d\n", + __func__, be32_to_cpu(rep_msg->local_comm_id), + be32_to_cpu(rep_msg->remote_comm_id)); + + if (cur_cm_id_priv) { + cm_id = &cur_cm_id_priv->id; + ib_send_cm_dreq(cm_id, NULL, 0); + cm_deref_id(cur_cm_id_priv); + } + + goto error; + } + spin_unlock(&cm.lock); + + cm_id_priv->id.state = IB_CM_REP_RCVD; + cm_id_priv->id.remote_id = rep_msg->local_comm_id; + cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); + cm_id_priv->initiator_depth = rep_msg->resp_resources; + cm_id_priv->responder_resources = rep_msg->initiator_depth; + cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg); + cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg); + cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg); + cm_id_priv->av.timeout = + cm_ack_timeout(cm_id_priv->target_ack_delay, + cm_id_priv->av.timeout - 1); + cm_id_priv->alt_av.timeout = + cm_ack_timeout(cm_id_priv->target_ack_delay, + cm_id_priv->alt_av.timeout - 1); + + /* todo: handle peer_to_peer */ + + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; + +error: + cm_deref_id(cm_id_priv); + return ret; +} + +static int cm_establish_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + int ret; + + /* See comment in cm_establish about lookup. */ + cm_id_priv = cm_acquire_id(work->local_id, work->remote_id); + if (!cm_id_priv) + return -EINVAL; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_rtu_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rtu_msg *rtu_msg; + int ret; + + rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id, + rtu_msg->local_comm_id); + if (!cm_id_priv) + return -EINVAL; + + work->cm_event.private_data = &rtu_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_REP_SENT && + cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) { + spin_unlock_irq(&cm_id_priv->lock); + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
+ counter[CM_RTU_COUNTER]); + goto out; + } + cm_id_priv->id.state = IB_CM_ESTABLISHED; + + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, + struct cm_id_private *cm_id_priv, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID, + cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ)); + dreq_msg->local_comm_id = cm_id_priv->id.local_id; + dreq_msg->remote_comm_id = cm_id_priv->id.remote_id; + cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn); + + if (private_data && private_data_len) + memcpy(dreq_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_dreq(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_ESTABLISHED) { + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); + ret = -EINVAL; + goto out; + } + + if (cm_id->lap_state == IB_CM_LAP_SENT || + cm_id->lap_state == IB_CM_MRA_LAP_RCVD) + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) { + cm_enter_timewait(cm_id_priv); + goto out; + } + + cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + cm_enter_timewait(cm_id_priv); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + + cm_id->state = IB_CM_DREQ_SENT; + cm_id_priv->msg = msg; +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_dreq); + +static void cm_format_drep(struct cm_drep_msg *drep_msg, + struct cm_id_private *cm_id_priv, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid); + drep_msg->local_comm_id = cm_id_priv->id.local_id; + drep_msg->remote_comm_id = cm_id_priv->id.remote_id; + + if (private_data && private_data_len) + memcpy(drep_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_drep(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + void *data; + int ret; + + if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE) + return -EINVAL; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_DREQ_RCVD) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); + pr_debug("%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n", + __func__, 
be32_to_cpu(cm_id->local_id), cm_id->state); + return -EINVAL; + } + + cm_set_private_data(cm_id_priv, data, private_data_len); + cm_enter_timewait(cm_id_priv); + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_drep); + +static int cm_issue_drep(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_send_buf *msg = NULL; + struct cm_dreq_msg *dreq_msg; + struct cm_drep_msg *drep_msg; + int ret; + + ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + if (ret) + return ret; + + dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad; + drep_msg = (struct cm_drep_msg *) msg->mad; + + cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid); + drep_msg->remote_comm_id = dreq_msg->local_comm_id; + drep_msg->local_comm_id = dreq_msg->remote_comm_id; + + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_msg(msg); + + return ret; +} + +static int cm_dreq_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_dreq_msg *dreq_msg; + struct ib_mad_send_buf *msg = NULL; + int ret; + + dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id, + dreq_msg->local_comm_id); + if (!cm_id_priv) { + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_DREQ_COUNTER]); + cm_issue_drep(work->port, work->mad_recv_wc); + pr_debug("%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n", + __func__, be32_to_cpu(dreq_msg->local_comm_id), + be32_to_cpu(dreq_msg->remote_comm_id)); + return -EINVAL; + } + + work->cm_event.private_data = &dreq_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg)) + goto unlock; + + switch (cm_id_priv->id.state) { + case IB_CM_REP_SENT: + case IB_CM_DREQ_SENT: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + break; + case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || + cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + break; + case IB_CM_MRA_REP_RCVD: + break; + case IB_CM_TIMEWAIT: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_DREQ_COUNTER]); + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + if (IS_ERR(msg)) + goto unlock; + + cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + spin_unlock_irq(&cm_id_priv->lock); + + if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || + ib_post_send_mad(msg, NULL)) + cm_free_msg(msg); + goto deref; + case IB_CM_DREQ_RCVD: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
+ counter[CM_DREQ_COUNTER]); + goto unlock; + default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); + goto unlock; + } + cm_id_priv->id.state = IB_CM_DREQ_RCVD; + cm_id_priv->tid = dreq_msg->hdr.tid; + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +deref: cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_drep_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_drep_msg *drep_msg; + int ret; + + drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id, + drep_msg->local_comm_id); + if (!cm_id_priv) + return -EINVAL; + + work->cm_event.private_data = &drep_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_DREQ_SENT && + cm_id_priv->id.state != IB_CM_DREQ_RCVD) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_enter_timewait(cm_id_priv); + + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +int ib_send_cm_rej(struct ib_cm_id *cm_id, + enum ib_cm_rej_reason reason, + void *ari, + u8 ari_length, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) || + (ari && ari_length > IB_CM_REJ_ARI_LENGTH)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id->state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + ret = cm_alloc_msg(cm_id_priv, &msg); + if (!ret) + cm_format_rej((struct cm_rej_msg *) msg->mad, + cm_id_priv, reason, ari, ari_length, + private_data, private_data_len); + + cm_reset_to_idle(cm_id_priv); + break; + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + ret = cm_alloc_msg(cm_id_priv, &msg); + if (!ret) + cm_format_rej((struct cm_rej_msg *) msg->mad, + cm_id_priv, reason, ari, ari_length, + private_data, private_data_len); + + cm_enter_timewait(cm_id_priv); + break; + default: + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); + ret = -EINVAL; + goto out; + } + + if (ret) + goto out; + + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_msg(msg); + +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_rej); + +static void cm_format_rej_event(struct cm_work *work) +{ + struct cm_rej_msg *rej_msg; + struct ib_cm_rej_event_param *param; + + rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.rej_rcvd; + param->ari = rej_msg->ari; + param->ari_length = cm_rej_get_reject_info_len(rej_msg); + param->reason = __be16_to_cpu(rej_msg->reason); + 
work->cm_event.private_data = &rej_msg->private_data; +} + +static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) +{ + struct cm_timewait_info *timewait_info; + struct cm_id_private *cm_id_priv; + __be32 remote_id; + + remote_id = rej_msg->local_comm_id; + + if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) { + spin_lock_irq(&cm.lock); + timewait_info = cm_find_remote_id( *((__be64 *) rej_msg->ari), + remote_id); + if (!timewait_info) { + spin_unlock_irq(&cm.lock); + return NULL; + } + cm_id_priv = idr_find(&cm.local_id_table, (__force int) + (timewait_info->work.local_id ^ + cm.random_id_operand)); + if (cm_id_priv) { + if (cm_id_priv->id.remote_id == remote_id) + atomic_inc(&cm_id_priv->refcount); + else + cm_id_priv = NULL; + } + spin_unlock_irq(&cm.lock); + } else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ) + cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0); + else + cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id); + + return cm_id_priv; +} + +static int cm_rej_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rej_msg *rej_msg; + int ret; + + rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_rejected_id(rej_msg); + if (!cm_id_priv) + return -EINVAL; + + cm_format_rej_event(work); + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + /* fall through */ + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN) + cm_enter_timewait(cm_id_priv); + else + cm_reset_to_idle(cm_id_priv); + break; + case IB_CM_DREQ_SENT: + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + /* fall through */ + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + cm_enter_timewait(cm_id_priv); + break; + case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT || + cm_id_priv->id.lap_state == IB_CM_LAP_SENT) { + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) + ib_cancel_mad(cm_id_priv->av.port->mad_agent, + cm_id_priv->msg); + cm_enter_timewait(cm_id_priv); + break; + } + /* fall through */ + default: + spin_unlock_irq(&cm_id_priv->lock); + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); + ret = -EINVAL; + goto out; + } + + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +int ib_send_cm_mra(struct ib_cm_id *cm_id, + u8 service_timeout, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + enum ib_cm_state cm_state; + enum ib_cm_lap_state lap_state; + enum cm_msg_response msg_response; + void *data; + unsigned long flags; + int ret; + + if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE) + return -EINVAL; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch(cm_id_priv->id.state) { + case IB_CM_REQ_RCVD: + cm_state 
= IB_CM_MRA_REQ_SENT; + lap_state = cm_id->lap_state; + msg_response = CM_MSG_RESPONSE_REQ; + break; + case IB_CM_REP_RCVD: + cm_state = IB_CM_MRA_REP_SENT; + lap_state = cm_id->lap_state; + msg_response = CM_MSG_RESPONSE_REP; + break; + case IB_CM_ESTABLISHED: + if (cm_id->lap_state == IB_CM_LAP_RCVD) { + cm_state = cm_id->state; + lap_state = IB_CM_MRA_LAP_SENT; + msg_response = CM_MSG_RESPONSE_OTHER; + break; + } + /* fall through */ + default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); + ret = -EINVAL; + goto error1; + } + + if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto error1; + + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + msg_response, service_timeout, + private_data, private_data_len); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto error2; + } + + cm_id->state = cm_state; + cm_id->lap_state = lap_state; + cm_id_priv->service_timeout = service_timeout; + cm_set_private_data(cm_id_priv, data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; + +error1: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); + return ret; + +error2: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); + cm_free_msg(msg); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_mra); + +static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) +{ + switch (cm_mra_get_msg_mraed(mra_msg)) { + case CM_MSG_RESPONSE_REQ: + return cm_acquire_id(mra_msg->remote_comm_id, 0); + case CM_MSG_RESPONSE_REP: + case CM_MSG_RESPONSE_OTHER: + return cm_acquire_id(mra_msg->remote_comm_id, + mra_msg->local_comm_id); + default: + return NULL; + } +} + +static int cm_mra_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_mra_msg *mra_msg; + int timeout, ret; + + mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_mraed_id(mra_msg); + if (!cm_id_priv) + return -EINVAL; + + work->cm_event.private_data = &mra_msg->private_data; + work->cm_event.param.mra_rcvd.service_timeout = + cm_mra_get_service_timeout(mra_msg); + timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) + + cm_convert_to_ms(cm_id_priv->av.timeout); + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ || + ib_modify_mad(cm_id_priv->av.port->mad_agent, + cm_id_priv->msg, timeout)) + goto out; + cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD; + break; + case IB_CM_REP_SENT: + if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP || + ib_modify_mad(cm_id_priv->av.port->mad_agent, + cm_id_priv->msg, timeout)) + goto out; + cm_id_priv->id.state = IB_CM_MRA_REP_RCVD; + break; + case IB_CM_ESTABLISHED: + if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER || + cm_id_priv->id.lap_state != IB_CM_LAP_SENT || + ib_modify_mad(cm_id_priv->av.port->mad_agent, + cm_id_priv->msg, timeout)) { + if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + atomic_long_inc(&work->port-> + counter_group[CM_RECV_DUPLICATES]. + counter[CM_MRA_COUNTER]); + goto out; + } + cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD; + break; + case IB_CM_MRA_REQ_RCVD: + case IB_CM_MRA_REP_RCVD: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
+ counter[CM_MRA_COUNTER]); + /* fall through */ + default: + pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); + goto out; + } + + cm_id_priv->msg->context[1] = (void *) (unsigned long) + cm_id_priv->id.state; + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + spin_unlock_irq(&cm_id_priv->lock); + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_lap(struct cm_lap_msg *lap_msg, + struct cm_id_private *cm_id_priv, + struct sa_path_rec *alternate_path, + const void *private_data, + u8 private_data_len) +{ + bool alt_ext = false; + + if (alternate_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alternate_path->opa.dlid, + alternate_path->opa.slid); + cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, + cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); + lap_msg->local_comm_id = cm_id_priv->id.local_id; + lap_msg->remote_comm_id = cm_id_priv->id.remote_id; + cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn); + /* todo: need remote CM response timeout */ + cm_lap_set_remote_resp_timeout(lap_msg, 0x1F); + lap_msg->alt_local_lid = + htons(ntohl(sa_path_get_slid(alternate_path))); + lap_msg->alt_remote_lid = + htons(ntohl(sa_path_get_dlid(alternate_path))); + lap_msg->alt_local_gid = alternate_path->sgid; + lap_msg->alt_remote_gid = alternate_path->dgid; + if (alt_ext) { + lap_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.slid)); + lap_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.dlid)); + } + cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); + cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); + lap_msg->alt_hop_limit = alternate_path->hop_limit; + cm_lap_set_packet_rate(lap_msg, alternate_path->rate); + cm_lap_set_sl(lap_msg, alternate_path->sl); + cm_lap_set_subnet_local(lap_msg, 1); /* local only... 
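the CM currently always advertises the alternate path as subnet-local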
*/ + cm_lap_set_local_ack_timeout(lap_msg, + cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, + alternate_path->packet_life_time)); + + if (private_data && private_data_len) + memcpy(lap_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_lap(struct ib_cm_id *cm_id, + struct sa_path_rec *alternate_path, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_ESTABLISHED || + (cm_id->lap_state != IB_CM_LAP_UNINIT && + cm_id->lap_state != IB_CM_LAP_IDLE)) { + ret = -EINVAL; + goto out; + } + + ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av, + cm_id_priv); + if (ret) + goto out; + cm_id_priv->alt_av.timeout = + cm_ack_timeout(cm_id_priv->target_ack_delay, + cm_id_priv->alt_av.timeout - 1); + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv, + alternate_path, private_data, private_data_len); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED; + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + + cm_id->lap_state = IB_CM_LAP_SENT; + cm_id_priv->msg = msg; + +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_lap); + +static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg, + struct sa_path_rec *path) +{ + u32 lid; + + if (path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(path, ntohs(lap_msg->alt_local_lid)); + sa_path_set_slid(path, ntohs(lap_msg->alt_remote_lid)); + } else { + lid = opa_get_lid_from_gid(&lap_msg->alt_local_gid); + sa_path_set_dlid(path, lid); + + lid = opa_get_lid_from_gid(&lap_msg->alt_remote_gid); + sa_path_set_slid(path, lid); + } +} + +static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, + struct sa_path_rec *path, + struct cm_lap_msg *lap_msg) +{ + path->dgid = lap_msg->alt_local_gid; + path->sgid = lap_msg->alt_remote_gid; + path->flow_label = cm_lap_get_flow_label(lap_msg); + path->hop_limit = lap_msg->alt_hop_limit; + path->traffic_class = cm_lap_get_traffic_class(lap_msg); + path->reversible = 1; + path->pkey = cm_id_priv->pkey; + path->sl = cm_lap_get_sl(lap_msg); + path->mtu_selector = IB_SA_EQ; + path->mtu = cm_id_priv->path_mtu; + path->rate_selector = IB_SA_EQ; + path->rate = cm_lap_get_packet_rate(lap_msg); + path->packet_life_time_selector = IB_SA_EQ; + path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg); + path->packet_life_time -= (path->packet_life_time > 0); + cm_format_path_lid_from_lap(lap_msg, path); +} + +static int cm_lap_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_lap_msg *lap_msg; + struct ib_cm_lap_event_param *param; + struct ib_mad_send_buf *msg = NULL; + int ret; + + /* Currently Alternate path messages are not supported for + * RoCE link layer. + */ + if (rdma_protocol_roce(work->port->cm_dev->ib_device, + work->port->port_num)) + return -EINVAL; + + /* todo: verify LAP request and send reject APR if invalid. 
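Until that is implemented, a LAP that does not match an existing connection is simply dropped below when cm_acquire_id() fails.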
*/ + lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id, + lap_msg->local_comm_id); + if (!cm_id_priv) + return -EINVAL; + + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto deref; + + param = &work->cm_event.param.lap_rcvd; + memset(&work->path[0], 0, sizeof(work->path[1])); + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &lap_msg->alt_local_gid); + param->alternate_path = &work->path[0]; + cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); + work->cm_event.private_data = &lap_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) + goto unlock; + + switch (cm_id_priv->id.lap_state) { + case IB_CM_LAP_UNINIT: + case IB_CM_LAP_IDLE: + break; + case IB_CM_MRA_LAP_SENT: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_LAP_COUNTER]); + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + if (IS_ERR(msg)) + goto unlock; + + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + CM_MSG_RESPONSE_OTHER, + cm_id_priv->service_timeout, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + spin_unlock_irq(&cm_id_priv->lock); + + if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || + ib_post_send_mad(msg, NULL)) + cm_free_msg(msg); + goto deref; + case IB_CM_LAP_RCVD: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_LAP_COUNTER]); + goto unlock; + default: + goto unlock; + } + + cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; + cm_id_priv->tid = lap_msg->hdr.tid; + cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, + cm_id_priv); + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +deref: cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_apr(struct cm_apr_msg *apr_msg, + struct cm_id_private *cm_id_priv, + enum ib_cm_apr_status status, + void *info, + u8 info_length, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid); + apr_msg->local_comm_id = cm_id_priv->id.local_id; + apr_msg->remote_comm_id = cm_id_priv->id.remote_id; + apr_msg->ap_status = (u8) status; + + if (info && info_length) { + apr_msg->info_length = info_length; + memcpy(apr_msg->info, info, info_length); + } + + if (private_data && private_data_len) + memcpy(apr_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_apr(struct ib_cm_id *cm_id, + enum ib_cm_apr_status status, + void *info, + u8 info_length, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) || + (info && info_length > IB_CM_APR_INFO_LENGTH)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_ESTABLISHED || + (cm_id->lap_state != IB_CM_LAP_RCVD && + cm_id->lap_state != IB_CM_MRA_LAP_SENT)) { + ret = -EINVAL; + goto out; + } + + ret = cm_alloc_msg(cm_id_priv, 
&msg); + if (ret) + goto out; + + cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status, + info, info_length, private_data, private_data_len); + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + + cm_id->lap_state = IB_CM_LAP_IDLE; +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_apr); + +static int cm_apr_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_apr_msg *apr_msg; + int ret; + + /* Currently Alternate path messages are not supported for + * RoCE link layer. + */ + if (rdma_protocol_roce(work->port->cm_dev->ib_device, + work->port->port_num)) + return -EINVAL; + + apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id, + apr_msg->local_comm_id); + if (!cm_id_priv) + return -EINVAL; /* Unmatched reply. */ + + work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status; + work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info; + work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length; + work->cm_event.private_data = &apr_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED || + (cm_id_priv->id.lap_state != IB_CM_LAP_SENT && + cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + cm_id_priv->msg = NULL; + + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_timewait_handler(struct cm_work *work) +{ + struct cm_timewait_info *timewait_info; + struct cm_id_private *cm_id_priv; + int ret; + + timewait_info = (struct cm_timewait_info *)work; + spin_lock_irq(&cm.lock); + list_del(&timewait_info->list); + spin_unlock_irq(&cm.lock); + + cm_id_priv = cm_acquire_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + if (!cm_id_priv) + return -EINVAL; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_TIMEWAIT || + cm_id_priv->remote_qpn != timewait_info->remote_qpn) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_id_priv->id.state = IB_CM_IDLE; + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_req_param *param) +{ + cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID, + cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR)); + sidr_req_msg->request_id = cm_id_priv->id.local_id; + sidr_req_msg->pkey = param->path->pkey; + sidr_req_msg->service_id = param->service_id; + + if (param->private_data && param->private_data_len) + memcpy(sidr_req_msg->private_data, param->private_data, + param->private_data_len); +} + +int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, + struct ib_cm_sidr_req_param *param) +{ + struct cm_id_private *cm_id_priv; + struct 
ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if (!param->path || (param->private_data && + param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + ret = cm_init_av_by_path(param->path, &cm_id_priv->av, cm_id_priv); + if (ret) + goto out; + + cm_id->service_id = param->service_id; + cm_id->service_mask = ~cpu_to_be64(0); + cm_id_priv->timeout_ms = param->timeout_ms; + cm_id_priv->max_cm_retries = param->max_cm_retries; + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv, + param); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state == IB_CM_IDLE) + ret = ib_post_send_mad(msg, NULL); + else + ret = -EINVAL; + + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + goto out; + } + cm_id->state = IB_CM_SIDR_REQ_SENT; + cm_id_priv->msg = msg; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); +out: + return ret; +} +EXPORT_SYMBOL(ib_send_cm_sidr_req); + +static void cm_format_sidr_req_event(struct cm_work *work, + struct ib_cm_id *listen_id) +{ + struct cm_sidr_req_msg *sidr_req_msg; + struct ib_cm_sidr_req_event_param *param; + + sidr_req_msg = (struct cm_sidr_req_msg *) + work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.sidr_req_rcvd; + param->pkey = __be16_to_cpu(sidr_req_msg->pkey); + param->listen_id = listen_id; + param->service_id = sidr_req_msg->service_id; + param->bth_pkey = cm_get_bth_pkey(work); + param->port = work->port->port_num; + work->cm_event.private_data = &sidr_req_msg->private_data; +} + +static int cm_sidr_req_handler(struct cm_work *work) +{ + struct ib_cm_id *cm_id; + struct cm_id_private *cm_id_priv, *cur_cm_id_priv; + struct cm_sidr_req_msg *sidr_req_msg; + struct ib_wc *wc; + int ret; + + cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + + /* Record SGID/SLID and request ID for lookup. */ + sidr_req_msg = (struct cm_sidr_req_msg *) + work->mad_recv_wc->recv_buf.mad; + wc = work->mad_recv_wc->wc; + cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid); + cm_id_priv->av.dgid.global.interface_id = 0; + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto out; + + cm_id_priv->id.remote_id = sidr_req_msg->request_id; + cm_id_priv->tid = sidr_req_msg->hdr.tid; + atomic_inc(&cm_id_priv->work_count); + + spin_lock_irq(&cm.lock); + cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); + if (cur_cm_id_priv) { + spin_unlock_irq(&cm.lock); + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_SIDR_REQ_COUNTER]); + goto out; /* Duplicate message. */ + } + cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; + cur_cm_id_priv = cm_find_listen(cm_id->device, + sidr_req_msg->service_id); + if (!cur_cm_id_priv) { + spin_unlock_irq(&cm.lock); + cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED); + goto out; /* No match. 
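No listener is registered for this service ID; the request was rejected with IB_SIDR_UNSUPPORTED above and the newly created cm_id is destroyed at the out label.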
*/ + } + atomic_inc(&cur_cm_id_priv->refcount); + atomic_inc(&cm_id_priv->refcount); + spin_unlock_irq(&cm.lock); + + cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler; + cm_id_priv->id.context = cur_cm_id_priv->id.context; + cm_id_priv->id.service_id = sidr_req_msg->service_id; + cm_id_priv->id.service_mask = ~cpu_to_be64(0); + + cm_format_sidr_req_event(work, &cur_cm_id_priv->id); + cm_process_work(cm_id_priv, work); + cm_deref_id(cur_cm_id_priv); + return 0; +out: + ib_destroy_cm_id(&cm_id_priv->id); + return -EINVAL; +} + +static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_rep_param *param) +{ + cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, + cm_id_priv->tid); + sidr_rep_msg->request_id = cm_id_priv->id.remote_id; + sidr_rep_msg->status = param->status; + cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num)); + sidr_rep_msg->service_id = cm_id_priv->id.service_id; + sidr_rep_msg->qkey = cpu_to_be32(param->qkey); + + if (param->info && param->info_length) + memcpy(sidr_rep_msg->info, param->info, param->info_length); + + if (param->private_data && param->private_data_len) + memcpy(sidr_rep_msg->private_data, param->private_data, + param->private_data_len); +} + +int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, + struct ib_cm_sidr_rep_param *param) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) || + (param->private_data && + param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_SIDR_REQ_RCVD) { + ret = -EINVAL; + goto error; + } + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto error; + + cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv, + param); + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + cm_id->state = IB_CM_IDLE; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + spin_lock_irqsave(&cm.lock, flags); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); + } + spin_unlock_irqrestore(&cm.lock, flags); + return 0; + +error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_sidr_rep); + +static void cm_format_sidr_rep_event(struct cm_work *work) +{ + struct cm_sidr_rep_msg *sidr_rep_msg; + struct ib_cm_sidr_rep_event_param *param; + + sidr_rep_msg = (struct cm_sidr_rep_msg *) + work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.sidr_rep_rcvd; + param->status = sidr_rep_msg->status; + param->qkey = be32_to_cpu(sidr_rep_msg->qkey); + param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg)); + param->info = &sidr_rep_msg->info; + param->info_len = sidr_rep_msg->info_length; + work->cm_event.private_data = &sidr_rep_msg->private_data; +} + +static int cm_sidr_rep_handler(struct cm_work *work) +{ + struct cm_sidr_rep_msg *sidr_rep_msg; + struct cm_id_private *cm_id_priv; + + sidr_rep_msg = (struct cm_sidr_rep_msg *) + work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0); + if (!cm_id_priv) + return -EINVAL; /* Unmatched reply. 
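No SIDR REQ with this request ID is outstanding, so the reply is discarded.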
*/ + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_id_priv->id.state = IB_CM_IDLE; + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + spin_unlock_irq(&cm_id_priv->lock); + + cm_format_sidr_rep_event(work); + cm_process_work(cm_id_priv, work); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_process_send_error(struct ib_mad_send_buf *msg, + enum ib_wc_status wc_status) +{ + struct cm_id_private *cm_id_priv; + struct ib_cm_event cm_event; + enum ib_cm_state state; + int ret; + + memset(&cm_event, 0, sizeof cm_event); + cm_id_priv = msg->context[0]; + + /* Discard old sends or ones without a response. */ + spin_lock_irq(&cm_id_priv->lock); + state = (enum ib_cm_state) (unsigned long) msg->context[1]; + if (msg != cm_id_priv->msg || state != cm_id_priv->id.state) + goto discard; + + pr_debug_ratelimited("CM: failed sending MAD in state %d. (%s)\n", + state, ib_wc_status_msg(wc_status)); + switch (state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + cm_reset_to_idle(cm_id_priv); + cm_event.event = IB_CM_REQ_ERROR; + break; + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + cm_reset_to_idle(cm_id_priv); + cm_event.event = IB_CM_REP_ERROR; + break; + case IB_CM_DREQ_SENT: + cm_enter_timewait(cm_id_priv); + cm_event.event = IB_CM_DREQ_ERROR; + break; + case IB_CM_SIDR_REQ_SENT: + cm_id_priv->id.state = IB_CM_IDLE; + cm_event.event = IB_CM_SIDR_REQ_ERROR; + break; + default: + goto discard; + } + spin_unlock_irq(&cm_id_priv->lock); + cm_event.param.send_status = wc_status; + + /* No other events can occur on the cm_id at this point. */ + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event); + cm_free_msg(msg); + if (ret) + ib_destroy_cm_id(&cm_id_priv->id); + return; +discard: + spin_unlock_irq(&cm_id_priv->lock); + cm_free_msg(msg); +} + +static void cm_send_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_mad_send_buf *msg = mad_send_wc->send_buf; + struct cm_port *port; + u16 attr_index; + + port = mad_agent->context; + attr_index = be16_to_cpu(((struct ib_mad_hdr *) + msg->mad)->attr_id) - CM_ATTR_ID_OFFSET; + + /* + * If the send was in response to a received message (context[0] is not + * set to a cm_id), and is not a REJ, then it is a send that was + * manually retried. + */ + if (!msg->context[0] && (attr_index != CM_REJ_COUNTER)) + msg->retries = 1; + + atomic_long_add(1 + msg->retries, + &port->counter_group[CM_XMIT].counter[attr_index]); + if (msg->retries) + atomic_long_add(msg->retries, + &port->counter_group[CM_XMIT_RETRIES]. 
+ counter[attr_index]); + + switch (mad_send_wc->status) { + case IB_WC_SUCCESS: + case IB_WC_WR_FLUSH_ERR: + cm_free_msg(msg); + break; + default: + if (msg->context[0] && msg->context[1]) + cm_process_send_error(msg, mad_send_wc->status); + else + cm_free_msg(msg); + break; + } +} + +static void cm_work_handler(struct work_struct *_work) +{ + struct cm_work *work = container_of(_work, struct cm_work, work.work); + int ret; + + switch (work->cm_event.event) { + case IB_CM_REQ_RECEIVED: + ret = cm_req_handler(work); + break; + case IB_CM_MRA_RECEIVED: + ret = cm_mra_handler(work); + break; + case IB_CM_REJ_RECEIVED: + ret = cm_rej_handler(work); + break; + case IB_CM_REP_RECEIVED: + ret = cm_rep_handler(work); + break; + case IB_CM_RTU_RECEIVED: + ret = cm_rtu_handler(work); + break; + case IB_CM_USER_ESTABLISHED: + ret = cm_establish_handler(work); + break; + case IB_CM_DREQ_RECEIVED: + ret = cm_dreq_handler(work); + break; + case IB_CM_DREP_RECEIVED: + ret = cm_drep_handler(work); + break; + case IB_CM_SIDR_REQ_RECEIVED: + ret = cm_sidr_req_handler(work); + break; + case IB_CM_SIDR_REP_RECEIVED: + ret = cm_sidr_rep_handler(work); + break; + case IB_CM_LAP_RECEIVED: + ret = cm_lap_handler(work); + break; + case IB_CM_APR_RECEIVED: + ret = cm_apr_handler(work); + break; + case IB_CM_TIMEWAIT_EXIT: + ret = cm_timewait_handler(work); + break; + default: + pr_debug("cm_event.event: 0x%x\n", work->cm_event.event); + ret = -EINVAL; + break; + } + if (ret) + cm_free_work(work); +} + +static int cm_establish(struct ib_cm_id *cm_id) +{ + struct cm_id_private *cm_id_priv; + struct cm_work *work; + unsigned long flags; + int ret = 0; + struct cm_device *cm_dev; + + cm_dev = ib_get_client_data(cm_id->device, &cm_client); + if (!cm_dev) + return -ENODEV; + + work = kmalloc(sizeof *work, GFP_ATOMIC); + if (!work) + return -ENOMEM; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id->state) + { + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + cm_id->state = IB_CM_ESTABLISHED; + break; + case IB_CM_ESTABLISHED: + ret = -EISCONN; + break; + default: + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (ret) { + kfree(work); + goto out; + } + + /* + * The CM worker thread may try to destroy the cm_id before it + * can execute this work item. To prevent potential deadlock, + * we need to find the cm_id once we're in the context of the + * worker thread, rather than holding a reference on it. 
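+ * The work item therefore records only the local and remote IDs, and the handler looks the cm_id up again once it runs on the workqueue.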
+ */ + INIT_DELAYED_WORK(&work->work, cm_work_handler); + work->local_id = cm_id->local_id; + work->remote_id = cm_id->remote_id; + work->mad_recv_wc = NULL; + work->cm_event.event = IB_CM_USER_ESTABLISHED; + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) { + queue_delayed_work(cm.wq, &work->work, 0); + } else { + kfree(work); + ret = -ENODEV; + } + spin_unlock_irqrestore(&cm.lock, flags); + +out: + return ret; +} + +static int cm_migrate(struct ib_cm_id *cm_id) +{ + struct cm_id_private *cm_id_priv; + struct cm_av tmp_av; + unsigned long flags; + int tmp_send_port_not_ready; + int ret = 0; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state == IB_CM_ESTABLISHED && + (cm_id->lap_state == IB_CM_LAP_UNINIT || + cm_id->lap_state == IB_CM_LAP_IDLE)) { + cm_id->lap_state = IB_CM_LAP_IDLE; + /* Swap address vector */ + tmp_av = cm_id_priv->av; + cm_id_priv->av = cm_id_priv->alt_av; + cm_id_priv->alt_av = tmp_av; + /* Swap port send ready state */ + tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready; + cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready; + cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready; + } else + ret = -EINVAL; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + return ret; +} + +int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event) +{ + int ret; + + switch (event) { + case IB_EVENT_COMM_EST: + ret = cm_establish(cm_id); + break; + case IB_EVENT_PATH_MIG: + ret = cm_migrate(cm_id); + break; + default: + ret = -EINVAL; + } + return ret; +} +EXPORT_SYMBOL(ib_cm_notify); + +static void cm_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct cm_port *port = mad_agent->context; + struct cm_work *work; + enum ib_cm_event_type event; + bool alt_path = false; + u16 attr_id; + int paths = 0; + int going_down = 0; + + switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { + case CM_REQ_ATTR_ID: + alt_path = cm_req_has_alt_path((struct cm_req_msg *) + mad_recv_wc->recv_buf.mad); + paths = 1 + (alt_path != 0); + event = IB_CM_REQ_RECEIVED; + break; + case CM_MRA_ATTR_ID: + event = IB_CM_MRA_RECEIVED; + break; + case CM_REJ_ATTR_ID: + event = IB_CM_REJ_RECEIVED; + break; + case CM_REP_ATTR_ID: + event = IB_CM_REP_RECEIVED; + break; + case CM_RTU_ATTR_ID: + event = IB_CM_RTU_RECEIVED; + break; + case CM_DREQ_ATTR_ID: + event = IB_CM_DREQ_RECEIVED; + break; + case CM_DREP_ATTR_ID: + event = IB_CM_DREP_RECEIVED; + break; + case CM_SIDR_REQ_ATTR_ID: + event = IB_CM_SIDR_REQ_RECEIVED; + break; + case CM_SIDR_REP_ATTR_ID: + event = IB_CM_SIDR_REP_RECEIVED; + break; + case CM_LAP_ATTR_ID: + paths = 1; + event = IB_CM_LAP_RECEIVED; + break; + case CM_APR_ATTR_ID: + event = IB_CM_APR_RECEIVED; + break; + default: + ib_free_recv_mad(mad_recv_wc); + return; + } + + attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id); + atomic_long_inc(&port->counter_group[CM_RECV]. 
+ counter[attr_id - CM_ATTR_ID_OFFSET]); + + work = kmalloc(sizeof(*work) + sizeof(struct sa_path_rec) * paths, + GFP_KERNEL); + if (!work) { + ib_free_recv_mad(mad_recv_wc); + return; + } + + INIT_DELAYED_WORK(&work->work, cm_work_handler); + work->cm_event.event = event; + work->mad_recv_wc = mad_recv_wc; + work->port = port; + + /* Check if the device started its remove_one */ + spin_lock_irq(&cm.lock); + if (!port->cm_dev->going_down) + queue_delayed_work(cm.wq, &work->work, 0); + else + going_down = 1; + spin_unlock_irq(&cm.lock); + + if (going_down) { + kfree(work); + ib_free_recv_mad(mad_recv_wc); + } +} + +static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + case IB_CM_ESTABLISHED: + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | IB_QP_PORT; + qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; + if (cm_id_priv->responder_resources) + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_ATOMIC; + qp_attr->pkey_index = cm_id_priv->av.pkey_index; + qp_attr->port_num = cm_id_priv->av.port->port_num; + ret = 0; + break; + default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + case IB_CM_ESTABLISHED: + *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | + IB_QP_DEST_QPN | IB_QP_RQ_PSN; + qp_attr->ah_attr = cm_id_priv->av.ah_attr; + qp_attr->path_mtu = cm_id_priv->path_mtu; + qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); + qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); + if (cm_id_priv->qp_type == IB_QPT_RC || + cm_id_priv->qp_type == IB_QPT_XRC_TGT) { + *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER; + qp_attr->max_dest_rd_atomic = + cm_id_priv->responder_resources; + qp_attr->min_rnr_timer = 0; + } + if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) { + *qp_attr_mask |= IB_QP_ALT_PATH; + qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; + qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; + qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; + qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; + } + ret = 0; + break; + default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->id.state) { + /* Allow transition to RTS before sending REP */ + 
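/* All of the states below share the SQ PSN and timeout setup; connections whose lap_state is no longer IB_CM_LAP_UNINIT take the else branch and re-arm path migration instead. */ +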
case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) { + *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN; + qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn); + switch (cm_id_priv->qp_type) { + case IB_QPT_RC: + case IB_QPT_XRC_INI: + *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC; + qp_attr->retry_cnt = cm_id_priv->retry_count; + qp_attr->rnr_retry = cm_id_priv->rnr_retry_count; + qp_attr->max_rd_atomic = cm_id_priv->initiator_depth; + /* fall through */ + case IB_QPT_XRC_TGT: + *qp_attr_mask |= IB_QP_TIMEOUT; + qp_attr->timeout = cm_id_priv->av.timeout; + break; + default: + break; + } + if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) { + *qp_attr_mask |= IB_QP_PATH_MIG_STATE; + qp_attr->path_mig_state = IB_MIG_REARM; + } + } else { + *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE; + qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; + qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; + qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; + qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; + qp_attr->path_mig_state = IB_MIG_REARM; + } + ret = 0; + break; + default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct cm_id_private *cm_id_priv; + int ret; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + switch (qp_attr->qp_state) { + case IB_QPS_INIT: + ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); + break; + case IB_QPS_RTR: + ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask); + break; + case IB_QPS_RTS: + ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} +EXPORT_SYMBOL(ib_cm_init_qp_attr); + +static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, + char *buf) +{ + struct cm_counter_group *group; + struct cm_counter_attribute *cm_attr; + + group = container_of(obj, struct cm_counter_group, obj); + cm_attr = container_of(attr, struct cm_counter_attribute, attr); + + return sprintf(buf, "%ld\n", + atomic_long_read(&group->counter[cm_attr->index])); +} + +static const struct sysfs_ops cm_counter_ops = { + .show = cm_show_counter +}; + +static struct kobj_type cm_counter_obj_type = { + .sysfs_ops = &cm_counter_ops, + .default_attrs = cm_counter_default_attrs +}; + +static void cm_release_port_obj(struct kobject *obj) +{ + struct cm_port *cm_port; + + cm_port = container_of(obj, struct cm_port, port_obj); + kfree(cm_port); +} + +static struct kobj_type cm_port_obj_type = { + .release = cm_release_port_obj +}; + +static char *cm_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +struct class cm_class = { + .owner = THIS_MODULE, + .name = "infiniband_cm", + .devnode = cm_devnode, +}; +EXPORT_SYMBOL(cm_class); + +static int cm_create_port_fs(struct cm_port *port) +{ + int i, ret; + + ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type, + &port->cm_dev->device->kobj, + "%d", port->port_num); + if (ret) { + kfree(port); + return ret; + } + + for 
(i = 0; i < CM_COUNTER_GROUPS; i++) { + ret = kobject_init_and_add(&port->counter_group[i].obj, + &cm_counter_obj_type, + &port->port_obj, + "%s", counter_group_names[i]); + if (ret) + goto error; + } + + return 0; + +error: + while (i--) + kobject_put(&port->counter_group[i].obj); + kobject_put(&port->port_obj); + return ret; + +} + +static void cm_remove_port_fs(struct cm_port *port) +{ + int i; + + for (i = 0; i < CM_COUNTER_GROUPS; i++) + kobject_put(&port->counter_group[i].obj); + + kobject_put(&port->port_obj); +} + +static void cm_add_one(struct ib_device *ib_device) +{ + struct cm_device *cm_dev; + struct cm_port *port; + struct ib_mad_reg_req reg_req = { + .mgmt_class = IB_MGMT_CLASS_CM, + .mgmt_class_version = IB_CM_CLASS_VERSION, + }; + struct ib_port_modify port_modify = { + .set_port_cap_mask = IB_PORT_CM_SUP + }; + unsigned long flags; + int ret; + int count = 0; + u8 i; + + cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) * + ib_device->phys_port_cnt, GFP_KERNEL); + if (!cm_dev) + return; + + cm_dev->ib_device = ib_device; + cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; + cm_dev->going_down = 0; + cm_dev->device = device_create(&cm_class, &ib_device->dev, + MKDEV(0, 0), NULL, + "%s", ib_device->name); + if (IS_ERR(cm_dev->device)) { + kfree(cm_dev); + return; + } + + set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); + for (i = 1; i <= ib_device->phys_port_cnt; i++) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + + port = kzalloc(sizeof *port, GFP_KERNEL); + if (!port) + goto error1; + + cm_dev->port[i-1] = port; + port->cm_dev = cm_dev; + port->port_num = i; + + INIT_LIST_HEAD(&port->cm_priv_prim_list); + INIT_LIST_HEAD(&port->cm_priv_altr_list); + + ret = cm_create_port_fs(port); + if (ret) + goto error1; + + port->mad_agent = ib_register_mad_agent(ib_device, i, + IB_QPT_GSI, + &reg_req, + 0, + cm_send_handler, + cm_recv_handler, + port, + 0); + if (IS_ERR(port->mad_agent)) + goto error2; + + ret = ib_modify_port(ib_device, i, 0, &port_modify); + if (ret) + goto error3; + + count++; + } + + if (!count) + goto free; + + ib_set_client_data(ib_device, &cm_client, cm_dev); + + write_lock_irqsave(&cm.device_lock, flags); + list_add_tail(&cm_dev->list, &cm.device_list); + write_unlock_irqrestore(&cm.device_lock, flags); + return; + +error3: + ib_unregister_mad_agent(port->mad_agent); +error2: + cm_remove_port_fs(port); +error1: + port_modify.set_port_cap_mask = 0; + port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; + while (--i) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + + port = cm_dev->port[i-1]; + ib_modify_port(ib_device, port->port_num, 0, &port_modify); + ib_unregister_mad_agent(port->mad_agent); + cm_remove_port_fs(port); + } +free: + device_unregister(cm_dev->device); + kfree(cm_dev); +} + +static void cm_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct cm_device *cm_dev = client_data; + struct cm_port *port; + struct cm_id_private *cm_id_priv; + struct ib_mad_agent *cur_mad_agent; + struct ib_port_modify port_modify = { + .clr_port_cap_mask = IB_PORT_CM_SUP + }; + unsigned long flags; + int i; + + if (!cm_dev) + return; + + write_lock_irqsave(&cm.device_lock, flags); + list_del(&cm_dev->list); + write_unlock_irqrestore(&cm.device_lock, flags); + + spin_lock_irq(&cm.lock); + cm_dev->going_down = 1; + spin_unlock_irq(&cm.lock); + + for (i = 1; i <= ib_device->phys_port_cnt; i++) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + + port = cm_dev->port[i-1]; + ib_modify_port(ib_device, port->port_num, 0, &port_modify); + /* Mark 
all the cm_id's as not valid */ + spin_lock_irq(&cm.lock); + list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list) + cm_id_priv->altr_send_port_not_ready = 1; + list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list) + cm_id_priv->prim_send_port_not_ready = 1; + spin_unlock_irq(&cm.lock); + /* + * We flush the queue here after the going_down set, this + * verify that no new works will be queued in the recv handler, + * after that we can call the unregister_mad_agent + */ + flush_workqueue(cm.wq); + spin_lock_irq(&cm.state_lock); + cur_mad_agent = port->mad_agent; + port->mad_agent = NULL; + spin_unlock_irq(&cm.state_lock); + ib_unregister_mad_agent(cur_mad_agent); + cm_remove_port_fs(port); + } + + device_unregister(cm_dev->device); + kfree(cm_dev); +} + +static int __init ib_cm_init(void) +{ + int ret; + + memset(&cm, 0, sizeof cm); + INIT_LIST_HEAD(&cm.device_list); + rwlock_init(&cm.device_lock); + spin_lock_init(&cm.lock); + spin_lock_init(&cm.state_lock); + cm.listen_service_table = RB_ROOT; + cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); + cm.remote_id_table = RB_ROOT; + cm.remote_qp_table = RB_ROOT; + cm.remote_sidr_table = RB_ROOT; + idr_init(&cm.local_id_table); + get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); + INIT_LIST_HEAD(&cm.timewait_list); + + ret = class_register(&cm_class); + if (ret) { + ret = -ENOMEM; + goto error1; + } + + cm.wq = alloc_workqueue("ib_cm", 0, 1); + if (!cm.wq) { + ret = -ENOMEM; + goto error2; + } + + ret = ib_register_client(&cm_client); + if (ret) + goto error3; + + return 0; +error3: + destroy_workqueue(cm.wq); +error2: + class_unregister(&cm_class); +error1: + idr_destroy(&cm.local_id_table); + return ret; +} + +static void __exit ib_cm_cleanup(void) +{ + struct cm_timewait_info *timewait_info, *tmp; + + spin_lock_irq(&cm.lock); + list_for_each_entry(timewait_info, &cm.timewait_list, list) + cancel_delayed_work(&timewait_info->work.work); + spin_unlock_irq(&cm.lock); + + ib_unregister_client(&cm_client); + destroy_workqueue(cm.wq); + + list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) { + list_del(&timewait_info->list); + kfree(timewait_info); + } + + class_unregister(&cm_class); + idr_destroy(&cm.local_id_table); +} + +module_init(ib_cm_init); +module_exit(ib_cm_cleanup); + diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h new file mode 100644 index 0000000..8b76f0e --- /dev/null +++ b/drivers/infiniband/core/cm_msgs.h @@ -0,0 +1,836 @@ +/* + * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING the madirectory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use source and binary forms, with or + * withmodification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retathe above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHWARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS THE + * SOFTWARE. + */ +#if !defined(CM_MSGS_H) +#define CM_MSGS_H + +#include +#include + +/* + * Parameters to routines below should be in network-byte order, and values + * are returned in network-byte order. + */ + +#define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */ + +enum cm_msg_sequence { + CM_MSG_SEQUENCE_REQ, + CM_MSG_SEQUENCE_LAP, + CM_MSG_SEQUENCE_DREQ, + CM_MSG_SEQUENCE_SIDR +}; + +struct cm_req_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 rsvd4; + __be64 service_id; + __be64 local_ca_guid; + __be32 rsvd24; + __be32 local_qkey; + /* local QPN:24, responder resources:8 */ + __be32 offset32; + /* local EECN:24, initiator depth:8 */ + __be32 offset36; + /* + * remote EECN:24, remote CM response timeout:5, + * transport service type:2, end-to-end flow control:1 + */ + __be32 offset40; + /* starting PSN:24, local CM response timeout:5, retry count:3 */ + __be32 offset44; + __be16 pkey; + /* path MTU:4, RDC exists:1, RNR retry count:3. */ + u8 offset50; + /* max CM Retries:4, SRQ:1, extended transport type:3 */ + u8 offset51; + + __be16 primary_local_lid; + __be16 primary_remote_lid; + union ib_gid primary_local_gid; + union ib_gid primary_remote_gid; + /* flow label:20, rsvd:6, packet rate:6 */ + __be32 primary_offset88; + u8 primary_traffic_class; + u8 primary_hop_limit; + /* SL:4, subnet local:1, rsvd:3 */ + u8 primary_offset94; + /* local ACK timeout:5, rsvd:3 */ + u8 primary_offset95; + + __be16 alt_local_lid; + __be16 alt_remote_lid; + union ib_gid alt_local_gid; + union ib_gid alt_remote_gid; + /* flow label:20, rsvd:6, packet rate:6 */ + __be32 alt_offset132; + u8 alt_traffic_class; + u8 alt_hop_limit; + /* SL:4, subnet local:1, rsvd:3 */ + u8 alt_offset138; + /* local ACK timeout:5, rsvd:3 */ + u8 alt_offset139; + + u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; + +} __attribute__ ((packed)); + +static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg) +{ + return cpu_to_be32(be32_to_cpu(req_msg->offset32) >> 8); +} + +static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn) +{ + req_msg->offset32 = cpu_to_be32((be32_to_cpu(qpn) << 8) | + (be32_to_cpu(req_msg->offset32) & + 0x000000FF)); +} + +static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg) +{ + return (u8) be32_to_cpu(req_msg->offset32); +} + +static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res) +{ + req_msg->offset32 = cpu_to_be32(resp_res | + (be32_to_cpu(req_msg->offset32) & + 0xFFFFFF00)); +} + +static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg) +{ + return (u8) be32_to_cpu(req_msg->offset36); +} + +static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg, + u8 init_depth) +{ + req_msg->offset36 = cpu_to_be32(init_depth | + (be32_to_cpu(req_msg->offset36) & + 0xFFFFFF00)); +} + +static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg) +{ + return (u8) ((be32_to_cpu(req_msg->offset40) & 0xF8) >> 3); +} + +static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg *req_msg, + u8 resp_timeout) +{ + 
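/* The 5-bit remote CM response timeout lives in bits 3..7 of the low byte of offset40 (see the field comment above), hence the shift by 3 and the 0xFFFFFF07 preserve mask. */ +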
req_msg->offset40 = cpu_to_be32((resp_timeout << 3) | + (be32_to_cpu(req_msg->offset40) & + 0xFFFFFF07)); +} + +static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg) +{ + u8 transport_type = (u8) (be32_to_cpu(req_msg->offset40) & 0x06) >> 1; + switch(transport_type) { + case 0: return IB_QPT_RC; + case 1: return IB_QPT_UC; + case 3: + switch (req_msg->offset51 & 0x7) { + case 1: return IB_QPT_XRC_TGT; + default: return 0; + } + default: return 0; + } +} + +static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg, + enum ib_qp_type qp_type) +{ + switch(qp_type) { + case IB_QPT_UC: + req_msg->offset40 = cpu_to_be32((be32_to_cpu( + req_msg->offset40) & + 0xFFFFFFF9) | 0x2); + break; + case IB_QPT_XRC_INI: + req_msg->offset40 = cpu_to_be32((be32_to_cpu( + req_msg->offset40) & + 0xFFFFFFF9) | 0x6); + req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1; + break; + default: + req_msg->offset40 = cpu_to_be32(be32_to_cpu( + req_msg->offset40) & + 0xFFFFFFF9); + } +} + +static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg) +{ + return be32_to_cpu(req_msg->offset40) & 0x1; +} + +static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg, + u8 flow_ctrl) +{ + req_msg->offset40 = cpu_to_be32((flow_ctrl & 0x1) | + (be32_to_cpu(req_msg->offset40) & + 0xFFFFFFFE)); +} + +static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg) +{ + return cpu_to_be32(be32_to_cpu(req_msg->offset44) >> 8); +} + +static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg, + __be32 starting_psn) +{ + req_msg->offset44 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) | + (be32_to_cpu(req_msg->offset44) & 0x000000FF)); +} + +static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg) +{ + return (u8) ((be32_to_cpu(req_msg->offset44) & 0xF8) >> 3); +} + +static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg, + u8 resp_timeout) +{ + req_msg->offset44 = cpu_to_be32((resp_timeout << 3) | + (be32_to_cpu(req_msg->offset44) & 0xFFFFFF07)); +} + +static inline u8 cm_req_get_retry_count(struct cm_req_msg *req_msg) +{ + return (u8) (be32_to_cpu(req_msg->offset44) & 0x7); +} + +static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg, + u8 retry_count) +{ + req_msg->offset44 = cpu_to_be32((retry_count & 0x7) | + (be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8)); +} + +static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg) +{ + return req_msg->offset50 >> 4; +} + +static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu) +{ + req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF) | (path_mtu << 4)); +} + +static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg) +{ + return req_msg->offset50 & 0x7; +} + +static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg, + u8 rnr_retry_count) +{ + req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF8) | + (rnr_retry_count & 0x7)); +} + +static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg) +{ + return req_msg->offset51 >> 4; +} + +static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg, + u8 retries) +{ + req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF) | (retries << 4)); +} + +static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg) +{ + return (req_msg->offset51 & 0x8) >> 3; +} + +static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq) +{ + req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF7) | + ((srq & 0x1) << 3)); +} + +static inline __be32 
cm_req_get_primary_flow_label(struct cm_req_msg *req_msg) +{ + return cpu_to_be32(be32_to_cpu(req_msg->primary_offset88) >> 12); +} + +static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg, + __be32 flow_label) +{ + req_msg->primary_offset88 = cpu_to_be32( + (be32_to_cpu(req_msg->primary_offset88) & + 0x00000FFF) | + (be32_to_cpu(flow_label) << 12)); +} + +static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg) +{ + return (u8) (be32_to_cpu(req_msg->primary_offset88) & 0x3F); +} + +static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg, + u8 rate) +{ + req_msg->primary_offset88 = cpu_to_be32( + (be32_to_cpu(req_msg->primary_offset88) & + 0xFFFFFFC0) | (rate & 0x3F)); +} + +static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg) +{ + return (u8) (req_msg->primary_offset94 >> 4); +} + +static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl) +{ + req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0x0F) | + (sl << 4)); +} + +static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg) +{ + return (u8) ((req_msg->primary_offset94 & 0x08) >> 3); +} + +static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg, + u8 subnet_local) +{ + req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0xF7) | + ((subnet_local & 0x1) << 3)); +} + +static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg) +{ + return (u8) (req_msg->primary_offset95 >> 3); +} + +static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg, + u8 local_ack_timeout) +{ + req_msg->primary_offset95 = (u8) ((req_msg->primary_offset95 & 0x07) | + (local_ack_timeout << 3)); +} + +static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg) +{ + return cpu_to_be32(be32_to_cpu(req_msg->alt_offset132) >> 12); +} + +static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg, + __be32 flow_label) +{ + req_msg->alt_offset132 = cpu_to_be32( + (be32_to_cpu(req_msg->alt_offset132) & + 0x00000FFF) | + (be32_to_cpu(flow_label) << 12)); +} + +static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg) +{ + return (u8) (be32_to_cpu(req_msg->alt_offset132) & 0x3F); +} + +static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg, + u8 rate) +{ + req_msg->alt_offset132 = cpu_to_be32( + (be32_to_cpu(req_msg->alt_offset132) & + 0xFFFFFFC0) | (rate & 0x3F)); +} + +static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg) +{ + return (u8) (req_msg->alt_offset138 >> 4); +} + +static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl) +{ + req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0x0F) | + (sl << 4)); +} + +static inline u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg) +{ + return (u8) ((req_msg->alt_offset138 & 0x08) >> 3); +} + +static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg, + u8 subnet_local) +{ + req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0xF7) | + ((subnet_local & 0x1) << 3)); +} + +static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg) +{ + return (u8) (req_msg->alt_offset139 >> 3); +} + +static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg, + u8 local_ack_timeout) +{ + req_msg->alt_offset139 = (u8) ((req_msg->alt_offset139 & 0x07) | + (local_ack_timeout << 3)); +} + +/* Message REJected or MRAed */ +enum cm_msg_response { + CM_MSG_RESPONSE_REQ = 0x0, + CM_MSG_RESPONSE_REP = 
0x1, + CM_MSG_RESPONSE_OTHER = 0x2 +}; + + struct cm_mra_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + /* message MRAed:2, rsvd:6 */ + u8 offset8; + /* service timeout:5, rsvd:3 */ + u8 offset9; + + u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg) +{ + return (u8) (mra_msg->offset8 >> 6); +} + +static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg) +{ + mra_msg->offset8 = (u8) ((mra_msg->offset8 & 0x3F) | (msg << 6)); +} + +static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg) +{ + return (u8) (mra_msg->offset9 >> 3); +} + +static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg, + u8 service_timeout) +{ + mra_msg->offset9 = (u8) ((mra_msg->offset9 & 0x07) | + (service_timeout << 3)); +} + +struct cm_rej_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + /* message REJected:2, rsvd:6 */ + u8 offset8; + /* reject info length:7, rsvd:1. */ + u8 offset9; + __be16 reason; + u8 ari[IB_CM_REJ_ARI_LENGTH]; + + u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg) +{ + return (u8) (rej_msg->offset8 >> 6); +} + +static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg) +{ + rej_msg->offset8 = (u8) ((rej_msg->offset8 & 0x3F) | (msg << 6)); +} + +static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg) +{ + return (u8) (rej_msg->offset9 >> 1); +} + +static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg, + u8 len) +{ + rej_msg->offset9 = (u8) ((rej_msg->offset9 & 0x1) | (len << 1)); +} + +struct cm_rep_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + __be32 local_qkey; + /* local QPN:24, rsvd:8 */ + __be32 offset12; + /* local EECN:24, rsvd:8 */ + __be32 offset16; + /* starting PSN:24 rsvd:8 */ + __be32 offset20; + u8 resp_resources; + u8 initiator_depth; + /* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */ + u8 offset26; + /* RNR retry count:3, SRQ:1, rsvd:5 */ + u8 offset27; + __be64 local_ca_guid; + + u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg) +{ + return cpu_to_be32(be32_to_cpu(rep_msg->offset12) >> 8); +} + +static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn) +{ + rep_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) | + (be32_to_cpu(rep_msg->offset12) & 0x000000FF)); +} + +static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg) +{ + return cpu_to_be32(be32_to_cpu(rep_msg->offset16) >> 8); +} + +static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn) +{ + rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) | + (be32_to_cpu(rep_msg->offset16) & 0x000000FF)); +} + +static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type) +{ + return (qp_type == IB_QPT_XRC_INI) ? 
+ cm_rep_get_local_eecn(rep_msg) : cm_rep_get_local_qpn(rep_msg); +} + +static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg) +{ + return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8); +} + +static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg, + __be32 starting_psn) +{ + rep_msg->offset20 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) | + (be32_to_cpu(rep_msg->offset20) & 0x000000FF)); +} + +static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg) +{ + return (u8) (rep_msg->offset26 >> 3); +} + +static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg, + u8 target_ack_delay) +{ + rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0x07) | + (target_ack_delay << 3)); +} + +static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg) +{ + return (u8) ((rep_msg->offset26 & 0x06) >> 1); +} + +static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover) +{ + rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xF9) | + ((failover & 0x3) << 1)); +} + +static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg) +{ + return (u8) (rep_msg->offset26 & 0x01); +} + +static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg, + u8 flow_ctrl) +{ + rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xFE) | + (flow_ctrl & 0x1)); +} + +static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg) +{ + return (u8) (rep_msg->offset27 >> 5); +} + +static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg, + u8 rnr_retry_count) +{ + rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0x1F) | + (rnr_retry_count << 5)); +} + +static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg) +{ + return (u8) ((rep_msg->offset27 >> 4) & 0x1); +} + +static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq) +{ + rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0xEF) | + ((srq & 0x1) << 4)); +} + +struct cm_rtu_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + + u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +struct cm_dreq_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + /* remote QPN/EECN:24, rsvd:8 */ + __be32 offset8; + + u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg) +{ + return cpu_to_be32(be32_to_cpu(dreq_msg->offset8) >> 8); +} + +static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn) +{ + dreq_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) | + (be32_to_cpu(dreq_msg->offset8) & 0x000000FF)); +} + +struct cm_drep_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + + u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE]; + +} __attribute__ ((packed)); + +struct cm_lap_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + + __be32 rsvd8; + /* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */ + __be32 offset12; + __be32 rsvd16; + + __be16 alt_local_lid; + __be16 alt_remote_lid; + union ib_gid alt_local_gid; + union ib_gid alt_remote_gid; + /* flow label:20, rsvd:4, traffic class:8 */ + __be32 offset56; + u8 alt_hop_limit; + /* rsvd:2, packet rate:6 */ + u8 offset61; + /* SL:4, subnet local:1, rsvd:3 */ + u8 offset62; + /* local ACK timeout:5, rsvd:3 */ + u8 offset63; + + u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE]; +} __attribute__ ((packed)); + +static inline __be32 
cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
+{
+	return cpu_to_be32(be32_to_cpu(lap_msg->offset12) >> 8);
+}
+
+static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn)
+{
+	lap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+					(be32_to_cpu(lap_msg->offset12) &
+					 0x000000FF));
+}
+
+static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg)
+{
+	return (u8) ((be32_to_cpu(lap_msg->offset12) & 0xF8) >> 3);
+}
+
+static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg,
+						   u8 resp_timeout)
+{
+	lap_msg->offset12 = cpu_to_be32((resp_timeout << 3) |
+					(be32_to_cpu(lap_msg->offset12) &
+					 0xFFFFFF07));
+}
+
+static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg *lap_msg)
+{
+	return cpu_to_be32(be32_to_cpu(lap_msg->offset56) >> 12);
+}
+
+static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg,
+					 __be32 flow_label)
+{
+	lap_msg->offset56 = cpu_to_be32(
+		(be32_to_cpu(lap_msg->offset56) & 0x00000FFF) |
+		(be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg)
+{
+	return (u8) be32_to_cpu(lap_msg->offset56);
+}
+
+static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg,
+					    u8 traffic_class)
+{
+	lap_msg->offset56 = cpu_to_be32(traffic_class |
+					(be32_to_cpu(lap_msg->offset56) &
+					 0xFFFFFF00));
+}
+
+static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg)
+{
+	return lap_msg->offset61 & 0x3F;
+}
+
+static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg,
+					  u8 packet_rate)
+{
+	lap_msg->offset61 = (packet_rate & 0x3F) | (lap_msg->offset61 & 0xC0);
+}
+
+static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg)
+{
+	return lap_msg->offset62 >> 4;
+}
+
+static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl)
+{
+	lap_msg->offset62 = (sl << 4) | (lap_msg->offset62 & 0x0F);
+}
+
+static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg)
+{
+	return (lap_msg->offset62 >> 3) & 0x1;
+}
+
+static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg,
+					   u8 subnet_local)
+{
+	/* Preserve the remaining bits of offset62 (not offset61, which holds
+	 * the packet rate) when updating the subnet local bit.
+	 */
+	lap_msg->offset62 = ((subnet_local & 0x1) << 3) |
+			    (lap_msg->offset62 & 0xF7);
+}
+
+static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg)
+{
+	return lap_msg->offset63 >> 3;
+}
+
+static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg,
+						u8 local_ack_timeout)
+{
+	lap_msg->offset63 = (local_ack_timeout << 3) |
+			    (lap_msg->offset63 & 0x07);
+}
+
+struct cm_apr_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	u8 info_length;
+	u8 ap_status;
+	__be16 rsvd;
+	u8 info[IB_CM_APR_INFO_LENGTH];
+
+	u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+struct cm_sidr_req_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 request_id;
+	__be16 pkey;
+	__be16 rsvd;
+	__be64 service_id;
+
+	u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
+} __attribute__ ((packed));
+
+struct cm_sidr_rep_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 request_id;
+	u8 status;
+	u8 info_length;
+	__be16 rsvd;
+	/* QPN:24, rsvd:8 */
+	__be32 offset8;
+	__be64 service_id;
+	__be32 qkey;
+	u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
+
+	u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
+{
+	return cpu_to_be32(be32_to_cpu(sidr_rep_msg->offset8) >> 8);
+}
+
+static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg,
+				       __be32 qpn)
+{
+
sidr_rep_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) | + (be32_to_cpu(sidr_rep_msg->offset8) & + 0x000000FF)); +} + +#endif /* CM_MSGS_H */ diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c new file mode 100644 index 0000000..a693fcd --- /dev/null +++ b/drivers/infiniband/core/cma.c @@ -0,0 +1,4612 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <linux/completion.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/igmp.h>
+#include <linux/idr.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/route.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/ib.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sa.h>
+#include <rdma/iw_cm.h>
+
+#include "core_priv.h"
+#include "cma_priv.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("Generic RDMA CM Agent");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000
+#define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define CMA_IBOE_PACKET_LIFETIME 18
+#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP
+
+static const char * const cma_events[] = {
+	[RDMA_CM_EVENT_ADDR_RESOLVED]	 = "address resolved",
+	[RDMA_CM_EVENT_ADDR_ERROR]	 = "address error",
+	[RDMA_CM_EVENT_ROUTE_RESOLVED]	 = "route resolved",
+	[RDMA_CM_EVENT_ROUTE_ERROR]	 = "route error",
+	[RDMA_CM_EVENT_CONNECT_REQUEST]	 = "connect request",
+	[RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response",
+	[RDMA_CM_EVENT_CONNECT_ERROR]	 = "connect error",
+	[RDMA_CM_EVENT_UNREACHABLE]	 = "unreachable",
+	[RDMA_CM_EVENT_REJECTED]	 = "rejected",
+	[RDMA_CM_EVENT_ESTABLISHED]	 = "established",
+	[RDMA_CM_EVENT_DISCONNECTED]	 = "disconnected",
+	[RDMA_CM_EVENT_DEVICE_REMOVAL]	 = "device removal",
+	[RDMA_CM_EVENT_MULTICAST_JOIN]	 = "multicast join",
+	[RDMA_CM_EVENT_MULTICAST_ERROR]	 = "multicast error",
+	[RDMA_CM_EVENT_ADDR_CHANGE]	 = "address change",
+	[RDMA_CM_EVENT_TIMEWAIT_EXIT]	 = "timewait exit",
+};
+
+const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
+{
+	size_t index = event;
+
+	return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ?
+			cma_events[index] : "unrecognized event";
+}
+EXPORT_SYMBOL(rdma_event_msg);
+
+const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id,
+						int reason)
+{
+	if (rdma_ib_or_roce(id->device, id->port_num))
+		return ibcm_reject_msg(reason);
+
+	if (rdma_protocol_iwarp(id->device, id->port_num))
+		return iwcm_reject_msg(reason);
+
+	WARN_ON_ONCE(1);
+	return "unrecognized transport";
+}
+EXPORT_SYMBOL(rdma_reject_msg);
+
+bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason)
+{
+	if (rdma_ib_or_roce(id->device, id->port_num))
+		return reason == IB_CM_REJ_CONSUMER_DEFINED;
+
+	if (rdma_protocol_iwarp(id->device, id->port_num))
+		return reason == -ECONNREFUSED;
+
+	WARN_ON_ONCE(1);
+	return false;
+}
+EXPORT_SYMBOL(rdma_is_consumer_reject);
+
+const void *rdma_consumer_reject_data(struct rdma_cm_id *id,
+				      struct rdma_cm_event *ev, u8 *data_len)
+{
+	const void *p;
+
+	if (rdma_is_consumer_reject(id, ev->status)) {
+		*data_len = ev->param.conn.private_data_len;
+		p = ev->param.conn.private_data;
+	} else {
+		*data_len = 0;
+		p = NULL;
+	}
+	return p;
+}
+EXPORT_SYMBOL(rdma_consumer_reject_data);
+
+static void cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device, void *client_data);
+
+static struct ib_client cma_client = {
+	.name   = "cma",
+	.add    = cma_add_one,
+	.remove = cma_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static struct rdma_addr_client addr_client;
+static LIST_HEAD(dev_list);
+static LIST_HEAD(listen_any_list);
+static DEFINE_MUTEX(lock);
+static struct workqueue_struct *cma_wq;
+static unsigned int cma_pernet_id;
+
+struct cma_pernet {
+	struct idr tcp_ps;
+	struct idr udp_ps;
+	struct idr ipoib_ps;
+	struct idr ib_ps;
+};
+
+static struct cma_pernet *cma_pernet(struct net *net)
+{
+	return
net_generic(net, cma_pernet_id); +} + +static struct idr *cma_pernet_idr(struct net *net, enum rdma_ucm_port_space ps) +{ + struct cma_pernet *pernet = cma_pernet(net); + + switch (ps) { + case RDMA_PS_TCP: + return &pernet->tcp_ps; + case RDMA_PS_UDP: + return &pernet->udp_ps; + case RDMA_PS_IPOIB: + return &pernet->ipoib_ps; + case RDMA_PS_IB: + return &pernet->ib_ps; + default: + return NULL; + } +} + +struct cma_device { + struct list_head list; + struct ib_device *device; + struct completion comp; + atomic_t refcount; + struct list_head id_list; + enum ib_gid_type *default_gid_type; + u8 *default_roce_tos; +}; + +struct rdma_bind_list { + enum rdma_ucm_port_space ps; + struct hlist_head owners; + unsigned short port; +}; + +struct class_port_info_context { + struct ib_class_port_info *class_port_info; + struct ib_device *device; + struct completion done; + struct ib_sa_query *sa_query; + u8 port_num; +}; + +static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, + struct rdma_bind_list *bind_list, int snum) +{ + struct idr *idr = cma_pernet_idr(net, ps); + + return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL); +} + +static struct rdma_bind_list *cma_ps_find(struct net *net, + enum rdma_ucm_port_space ps, int snum) +{ + struct idr *idr = cma_pernet_idr(net, ps); + + return idr_find(idr, snum); +} + +static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, + int snum) +{ + struct idr *idr = cma_pernet_idr(net, ps); + + idr_remove(idr, snum); +} + +enum { + CMA_OPTION_AFONLY, +}; + +void cma_ref_dev(struct cma_device *cma_dev) +{ + atomic_inc(&cma_dev->refcount); +} + +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie) +{ + struct cma_device *cma_dev; + struct cma_device *found_cma_dev = NULL; + + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) + if (filter(cma_dev->device, cookie)) { + found_cma_dev = cma_dev; + break; + } + + if (found_cma_dev) + cma_ref_dev(found_cma_dev); + mutex_unlock(&lock); + return found_cma_dev; +} + +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port) +{ + if (!rdma_is_port_valid(cma_dev->device, port)) + return -EINVAL; + + return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type) +{ + unsigned long supported_gids; + + if (!rdma_is_port_valid(cma_dev->device, port)) + return -EINVAL; + + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); + + if (!(supported_gids & 1 << default_gid_type)) + return -EINVAL; + + cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = + default_gid_type; + + return 0; +} + +int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port) +{ + if (!rdma_is_port_valid(cma_dev->device, port)) + return -EINVAL; + + return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_roce_tos(struct cma_device *cma_dev, unsigned int port, + u8 default_roce_tos) +{ + if (!rdma_is_port_valid(cma_dev->device, port)) + return -EINVAL; + + cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] = + default_roce_tos; + + return 0; +} +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) +{ + return cma_dev->device; +} + +/* + * Device removal can occur at anytime, so we need extra handling to + * serialize notifying the user of device removal with other callbacks. 
+ * We do this by disabling removal notification while a callback is in process, + * and reporting it after the callback completes. + */ + +struct cma_multicast { + struct rdma_id_private *id_priv; + union { + struct ib_sa_multicast *ib; + } multicast; + struct list_head list; + void *context; + struct sockaddr_storage addr; + struct kref mcref; + bool igmp_joined; + u8 join_state; +}; + +struct cma_work { + struct work_struct work; + struct rdma_id_private *id; + enum rdma_cm_state old_state; + enum rdma_cm_state new_state; + struct rdma_cm_event event; +}; + +struct cma_ndev_work { + struct work_struct work; + struct rdma_id_private *id; + struct rdma_cm_event event; +}; + +struct iboe_mcast_work { + struct work_struct work; + struct rdma_id_private *id; + struct cma_multicast *mc; +}; + +union cma_ip_addr { + struct in6_addr ip6; + struct { + __be32 pad[3]; + __be32 addr; + } ip4; +}; + +struct cma_hdr { + u8 cma_version; + u8 ip_version; /* IP version: 7:4 */ + __be16 port; + union cma_ip_addr src_addr; + union cma_ip_addr dst_addr; +}; + +#define CMA_VERSION 0x00 + +struct cma_req_info { + struct sockaddr_storage listen_addr_storage; + struct sockaddr_storage src_addr_storage; + struct ib_device *device; + int port; + union ib_gid local_gid; + __be64 service_id; + u16 pkey; + bool has_gid:1; +}; + +static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&id_priv->lock, flags); + ret = (id_priv->state == comp); + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} + +static int cma_comp_exch(struct rdma_id_private *id_priv, + enum rdma_cm_state comp, enum rdma_cm_state exch) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&id_priv->lock, flags); + if ((ret = (id_priv->state == comp))) + id_priv->state = exch; + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} + +static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, + enum rdma_cm_state exch) +{ + unsigned long flags; + enum rdma_cm_state old; + + spin_lock_irqsave(&id_priv->lock, flags); + old = id_priv->state; + id_priv->state = exch; + spin_unlock_irqrestore(&id_priv->lock, flags); + return old; +} + +static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) +{ + return hdr->ip_version >> 4; +} + +static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +{ + hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); +} + +static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) +{ + struct in_device *in_dev = NULL; + + if (ndev) { + rtnl_lock(); + in_dev = __in_dev_get_rtnl(ndev); + if (in_dev) { + if (join) + ip_mc_inc_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + else + ip_mc_dec_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + } + rtnl_unlock(); + } + return (in_dev) ? 
0 : -ENODEV; +} + +static void _cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + cma_ref_dev(cma_dev); + id_priv->cma_dev = cma_dev; + id_priv->gid_type = 0; + id_priv->id.device = cma_dev->device; + id_priv->id.route.addr.dev_addr.transport = + rdma_node_get_transport(cma_dev->device->node_type); + list_add_tail(&id_priv->list, &cma_dev->id_list); + id_priv->res.type = RDMA_RESTRACK_CM_ID; + rdma_restrack_add(&id_priv->res); +} + +static void cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + _cma_attach_to_dev(id_priv, cma_dev); + id_priv->gid_type = + cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(cma_dev->device)]; +} + +void cma_deref_dev(struct cma_device *cma_dev) +{ + if (atomic_dec_and_test(&cma_dev->refcount)) + complete(&cma_dev->comp); +} + +static inline void release_mc(struct kref *kref) +{ + struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); + + kfree(mc->multicast.ib); + kfree(mc); +} + +static void cma_release_dev(struct rdma_id_private *id_priv) +{ + mutex_lock(&lock); + list_del(&id_priv->list); + cma_deref_dev(id_priv->cma_dev); + id_priv->cma_dev = NULL; + mutex_unlock(&lock); +} + +static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.src_addr; +} + +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; +} + +static inline unsigned short cma_family(struct rdma_id_private *id_priv) +{ + return id_priv->id.route.addr.src_addr.ss_family; +} + +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +{ + struct ib_sa_mcmember_rec rec; + int ret = 0; + + if (id_priv->qkey) { + if (qkey && id_priv->qkey != qkey) + return -EINVAL; + return 0; + } + + if (qkey) { + id_priv->qkey = qkey; + return 0; + } + + switch (id_priv->id.ps) { + case RDMA_PS_UDP: + case RDMA_PS_IB: + id_priv->qkey = RDMA_UDP_QKEY; + break; + case RDMA_PS_IPOIB: + ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, + id_priv->id.port_num, &rec.mgid, + &rec); + if (!ret) + id_priv->qkey = be32_to_cpu(rec.qkey); + break; + default: + break; + } + return ret; +} + +static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) +{ + dev_addr->dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); + ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); +} + +static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +{ + int ret; + + if (addr->sa_family != AF_IB) { + ret = rdma_translate_ip(addr, dev_addr); + } else { + cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); + ret = 0; + } + + return ret; +} + +static inline int cma_validate_port(struct ib_device *device, u8 port, + enum ib_gid_type gid_type, + union ib_gid *gid, + struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int bound_if_index = dev_addr->bound_dev_if; + int dev_type = dev_addr->dev_type; + struct net_device *ndev = NULL; + int ret = -ENODEV; + + if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) + return ret; + + if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) + return ret; + + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); 
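+		/*
+		 * dev_get_by_index() takes a reference on ndev; it is
+		 * released via dev_put() once the GID table lookup below
+		 * has completed.
+		 */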
+ if (!ndev) + return ret; + } else { + gid_type = IB_GID_TYPE_IB; + } + + ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, + ndev, NULL); + + if (ndev) + dev_put(ndev); + + return ret; +} + +static int cma_acquire_dev(struct rdma_id_private *id_priv, + struct rdma_id_private *listen_id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct cma_device *cma_dev; + union ib_gid gid, iboe_gid, *gidp; + int ret = -ENODEV; + u8 port; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + mutex_lock(&lock); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &iboe_gid); + + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof gid); + + if (listen_id_priv) { + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; + + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? + IB_GID_TYPE_IB : + listen_id_priv->gid_type, gidp, + id_priv); + if (!ret) { + id_priv->id.port_num = port; + goto out; + } + } + + list_for_each_entry(cma_dev, &dev_list, list) { + for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { + if (listen_id_priv && + listen_id_priv->cma_dev == cma_dev && + listen_id_priv->id.port_num == port) + continue; + + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; + + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? + IB_GID_TYPE_IB : + cma_dev->default_gid_type[port - 1], + gidp, id_priv); + if (!ret) { + id_priv->id.port_num = port; + goto out; + } + } + } + +out: + if (!ret) + cma_attach_to_dev(id_priv, cma_dev); + + mutex_unlock(&lock); + return ret; +} + +/* + * Select the source IB device and address to reach the destination IB address. 
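+ *
+ * An exact match on the destination GID is preferred; otherwise the first
+ * ACTIVE port found on the same subnet prefix is used. In either case the
+ * port's P_Key table must contain the P_Key carried in the destination
+ * address.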
+ */ +static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev, *cur_dev; + struct sockaddr_ib *addr; + union ib_gid gid, sgid, *dgid; + u16 pkey, index; + u8 p; + enum ib_port_state port_state; + int i; + + cma_dev = NULL; + addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); + dgid = (union ib_gid *) &addr->sib_addr; + pkey = ntohs(addr->sib_pkey); + + list_for_each_entry(cur_dev, &dev_list, list) { + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (!rdma_cap_af_ib(cur_dev->device, p)) + continue; + + if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) + continue; + + if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) + continue; + for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, + &gid, NULL); + i++) { + if (!memcmp(&gid, dgid, sizeof(gid))) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + goto found; + } + + if (!cma_dev && (gid.global.subnet_prefix == + dgid->global.subnet_prefix) && + port_state == IB_PORT_ACTIVE) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + } + } + } + } + + if (!cma_dev) + return -ENODEV; + +found: + cma_attach_to_dev(id_priv, cma_dev); + addr = (struct sockaddr_ib *) cma_src_addr(id_priv); + memcpy(&addr->sib_addr, &sgid, sizeof sgid); + cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); + return 0; +} + +static void cma_deref_id(struct rdma_id_private *id_priv) +{ + if (atomic_dec_and_test(&id_priv->refcount)) + complete(&id_priv->comp); +} + +struct rdma_cm_id *__rdma_create_id(struct net *net, + rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const char *caller) +{ + struct rdma_id_private *id_priv; + + id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); + if (!id_priv) + return ERR_PTR(-ENOMEM); + + if (caller) + id_priv->res.kern_name = caller; + else + rdma_restrack_set_task(&id_priv->res, current); + id_priv->state = RDMA_CM_IDLE; + id_priv->id.context = context; + id_priv->id.event_handler = event_handler; + id_priv->id.ps = ps; + id_priv->id.qp_type = qp_type; + id_priv->tos_set = false; + spin_lock_init(&id_priv->lock); + mutex_init(&id_priv->qp_mutex); + init_completion(&id_priv->comp); + atomic_set(&id_priv->refcount, 1); + mutex_init(&id_priv->handler_mutex); + INIT_LIST_HEAD(&id_priv->listen_list); + INIT_LIST_HEAD(&id_priv->mc_list); + get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); + id_priv->id.route.addr.dev_addr.net = get_net(net); + id_priv->seq_num &= 0x00ffffff; + + return &id_priv->id; +} +EXPORT_SYMBOL(__rdma_create_id); + +static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); + + return ret; +} + +static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return ib_modify_qp(qp, &qp_attr, qp_attr_mask); +} + +int rdma_create_qp(struct 
rdma_cm_id *id, struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + struct rdma_id_private *id_priv; + struct ib_qp *qp; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id->device != pd->device) + return -EINVAL; + + qp_init_attr->port_num = id->port_num; + qp = ib_create_qp(pd, qp_init_attr); + if (IS_ERR(qp)) + return PTR_ERR(qp); + + if (id->qp_type == IB_QPT_UD) + ret = cma_init_ud_qp(id_priv, qp); + else + ret = cma_init_conn_qp(id_priv, qp); + if (ret) + goto err; + + id->qp = qp; + id_priv->qp_num = qp->qp_num; + id_priv->srq = (qp->srq != NULL); + return 0; +err: + ib_destroy_qp(qp); + return ret; +} +EXPORT_SYMBOL(rdma_create_qp); + +void rdma_destroy_qp(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); + ib_destroy_qp(id_priv->id.qp); + id_priv->id.qp = NULL; + mutex_unlock(&id_priv->qp_mutex); +} +EXPORT_SYMBOL(rdma_destroy_qp); + +static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } + + /* Need to update QP attributes from default values. */ + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + goto out; + + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); + if (ret) + goto out; + + qp_attr.qp_state = IB_QPS_RTR; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + goto out; + + BUG_ON(id_priv->cma_dev->device != id_priv->id.device); + + if (conn_param) + qp_attr.max_dest_rd_atomic = conn_param->responder_resources; + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; +} + +static int cma_modify_qp_rts(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + goto out; + + if (conn_param) + qp_attr.max_rd_atomic = conn_param->initiator_depth; + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; +} + +static int cma_modify_qp_err(struct rdma_id_private *id_priv) +{ + struct ib_qp_attr qp_attr; + int ret; + + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } + + qp_attr.qp_state = IB_QPS_ERR; + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; +} + +static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int ret; + u16 pkey; + + if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) + pkey = 0xffff; + else + pkey = ib_addr_get_pkey(dev_addr); + + ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, + pkey, &qp_attr->pkey_index); + if (ret) + return ret; + + qp_attr->port_num = id_priv->id.port_num; + *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; + + if (id_priv->id.qp_type == IB_QPT_UD) { + ret = cma_set_qkey(id_priv, 0); + if (ret) + return ret; + + qp_attr->qkey = id_priv->qkey; + *qp_attr_mask 
|= IB_QP_QKEY; + } else { + qp_attr->qp_access_flags = 0; + *qp_attr_mask |= IB_QP_ACCESS_FLAGS; + } + return 0; +} + +int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct rdma_id_private *id_priv; + int ret = 0; + + id_priv = container_of(id, struct rdma_id_private, id); + if (rdma_cap_ib_cm(id->device, id->port_num)) { + if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) + ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); + else + ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, + qp_attr_mask); + + if (qp_attr->qp_state == IB_QPS_RTR) + qp_attr->rq_psn = id_priv->seq_num; + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { + if (!id_priv->cm_id.iw) { + qp_attr->qp_access_flags = 0; + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; + } else + ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, + qp_attr_mask); + qp_attr->port_num = id_priv->id.port_num; + *qp_attr_mask |= IB_QP_PORT; + } else + ret = -ENOSYS; + + return ret; +} +EXPORT_SYMBOL(rdma_init_qp_attr); + +static inline int cma_zero_addr(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; + } +} + +static inline int cma_loopback_addr(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; + } +} + +static inline int cma_any_addr(struct sockaddr *addr) +{ + return cma_zero_addr(addr) || cma_loopback_addr(addr); +} + +static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) +{ + if (src->sa_family != dst->sa_family) + return -1; + + switch (src->sa_family) { + case AF_INET: + return ((struct sockaddr_in *) src)->sin_addr.s_addr != + ((struct sockaddr_in *) dst)->sin_addr.s_addr; + case AF_INET6: + return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, + &((struct sockaddr_in6 *) dst)->sin6_addr); + default: + return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, + &((struct sockaddr_ib *) dst)->sib_addr); + } +} + +static __be16 cma_port(struct sockaddr *addr) +{ + struct sockaddr_ib *sib; + + switch (addr->sa_family) { + case AF_INET: + return ((struct sockaddr_in *) addr)->sin_port; + case AF_INET6: + return ((struct sockaddr_in6 *) addr)->sin6_port; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + return htons((u16) (be64_to_cpu(sib->sib_sid) & + be64_to_cpu(sib->sib_sid_mask))); + default: + return 0; + } +} + +static inline int cma_any_port(struct sockaddr *addr) +{ + return !cma_port(addr); +} + +static void cma_save_ib_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_cm_id *listen_id, + struct sa_path_rec *path) +{ + struct sockaddr_ib *listen_ib, *ib; + + listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; + if (src_addr) { + ib = (struct sockaddr_ib *)src_addr; + ib->sib_family = AF_IB; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->sgid, 16); + ib->sib_sid = path->service_id; + ib->sib_scope_id = 0; + } else { + ib->sib_pkey = listen_ib->sib_pkey; + 
ib->sib_flowinfo = listen_ib->sib_flowinfo; + ib->sib_addr = listen_ib->sib_addr; + ib->sib_sid = listen_ib->sib_sid; + ib->sib_scope_id = listen_ib->sib_scope_id; + } + ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); + } + if (dst_addr) { + ib = (struct sockaddr_ib *)dst_addr; + ib->sib_family = AF_IB; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->dgid, 16); + } + } +} + +static void cma_save_ip4_info(struct sockaddr_in *src_addr, + struct sockaddr_in *dst_addr, + struct cma_hdr *hdr, + __be16 local_port) +{ + if (src_addr) { + *src_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->dst_addr.ip4.addr, + .sin_port = local_port, + }; + } + + if (dst_addr) { + *dst_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->src_addr.ip4.addr, + .sin_port = hdr->port, + }; + } +} + +static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, + struct sockaddr_in6 *dst_addr, + struct cma_hdr *hdr, + __be16 local_port) +{ + if (src_addr) { + *src_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->dst_addr.ip6, + .sin6_port = local_port, + }; + } + + if (dst_addr) { + *dst_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->src_addr.ip6, + .sin6_port = hdr->port, + }; + } +} + +static u16 cma_port_from_service_id(__be64 service_id) +{ + return (u16)be64_to_cpu(service_id); +} + +static int cma_save_ip_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct ib_cm_event *ib_event, + __be64 service_id) +{ + struct cma_hdr *hdr; + __be16 port; + + hdr = ib_event->private_data; + if (hdr->cma_version != CMA_VERSION) + return -EINVAL; + + port = htons(cma_port_from_service_id(service_id)); + + switch (cma_get_ip_ver(hdr)) { + case 4: + cma_save_ip4_info((struct sockaddr_in *)src_addr, + (struct sockaddr_in *)dst_addr, hdr, port); + break; + case 6: + cma_save_ip6_info((struct sockaddr_in6 *)src_addr, + (struct sockaddr_in6 *)dst_addr, hdr, port); + break; + default: + return -EAFNOSUPPORT; + } + + return 0; +} + +static int cma_save_net_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event, + sa_family_t sa_family, __be64 service_id) +{ + if (sa_family == AF_IB) { + if (ib_event->event == IB_CM_REQ_RECEIVED) + cma_save_ib_info(src_addr, dst_addr, listen_id, + ib_event->param.req_rcvd.primary_path); + else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) + cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); + return 0; + } + + return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); +} + +static int cma_save_req_info(const struct ib_cm_event *ib_event, + struct cma_req_info *req) +{ + const struct ib_cm_req_event_param *req_param = + &ib_event->param.req_rcvd; + const struct ib_cm_sidr_req_event_param *sidr_param = + &ib_event->param.sidr_req_rcvd; + + switch (ib_event->event) { + case IB_CM_REQ_RECEIVED: + req->device = req_param->listen_id->device; + req->port = req_param->port; + memcpy(&req->local_gid, &req_param->primary_path->sgid, + sizeof(req->local_gid)); + req->has_gid = true; + req->service_id = req_param->primary_path->service_id; + req->pkey = be16_to_cpu(req_param->primary_path->pkey); + if (req->pkey != req_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + req_param->bth_pkey, 
req->pkey); + break; + case IB_CM_SIDR_REQ_RECEIVED: + req->device = sidr_param->listen_id->device; + req->port = sidr_param->port; + req->has_gid = false; + req->service_id = sidr_param->service_id; + req->pkey = sidr_param->pkey; + if (req->pkey != sidr_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + sidr_param->bth_pkey, req->pkey); + break; + default: + return -EINVAL; + } + + return 0; +} + +static bool validate_ipv4_net_dev(struct net_device *net_dev, + const struct sockaddr_in *dst_addr, + const struct sockaddr_in *src_addr) +{ + __be32 daddr = dst_addr->sin_addr.s_addr, + saddr = src_addr->sin_addr.s_addr; + struct fib_result res; + struct flowi4 fl4; + int err; + bool ret; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || + ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || + ipv4_is_loopback(saddr)) + return false; + + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_iif = net_dev->ifindex; + fl4.daddr = daddr; + fl4.saddr = saddr; + + rcu_read_lock(); + err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); + ret = err == 0 && FIB_RES_DEV(res) == net_dev; + rcu_read_unlock(); + + return ret; +} + +static bool validate_ipv6_net_dev(struct net_device *net_dev, + const struct sockaddr_in6 *dst_addr, + const struct sockaddr_in6 *src_addr) +{ +#if IS_ENABLED(CONFIG_IPV6) + const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & + IPV6_ADDR_LINKLOCAL; + struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, + &src_addr->sin6_addr, net_dev->ifindex, + NULL, strict); + bool ret; + + if (!rt) + return false; + + ret = rt->rt6i_idev->dev == net_dev; + ip6_rt_put(rt); + + return ret; +#else + return false; +#endif +} + +static bool validate_net_dev(struct net_device *net_dev, + const struct sockaddr *daddr, + const struct sockaddr *saddr) +{ + const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; + const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; + const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; + + switch (daddr->sa_family) { + case AF_INET: + return saddr->sa_family == AF_INET && + validate_ipv4_net_dev(net_dev, daddr4, saddr4); + + case AF_INET6: + return saddr->sa_family == AF_INET6 && + validate_ipv6_net_dev(net_dev, daddr6, saddr6); + + default: + return false; + } +} + +static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, + struct cma_req_info *req) +{ + struct sockaddr *listen_addr = + (struct sockaddr *)&req->listen_addr_storage; + struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage; + struct net_device *net_dev; + const union ib_gid *gid = req->has_gid ? 
&req->local_gid : NULL; + int err; + + err = cma_save_ip_info(listen_addr, src_addr, ib_event, + req->service_id); + if (err) + return ERR_PTR(err); + + net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, + gid, listen_addr); + if (!net_dev) + return ERR_PTR(-ENODEV); + + return net_dev; +} + +static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) +{ + return (be64_to_cpu(service_id) >> 16) & 0xffff; +} + +static bool cma_match_private_data(struct rdma_id_private *id_priv, + const struct cma_hdr *hdr) +{ + struct sockaddr *addr = cma_src_addr(id_priv); + __be32 ip4_addr; + struct in6_addr ip6_addr; + + if (cma_any_addr(addr) && !id_priv->afonly) + return true; + + switch (addr->sa_family) { + case AF_INET: + ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; + if (cma_get_ip_ver(hdr) != 4) + return false; + if (!cma_any_addr(addr) && + hdr->dst_addr.ip4.addr != ip4_addr) + return false; + break; + case AF_INET6: + ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; + if (cma_get_ip_ver(hdr) != 6) + return false; + if (!cma_any_addr(addr) && + memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) + return false; + break; + case AF_IB: + return true; + default: + return false; + } + + return true; +} + +static bool cma_protocol_roce(const struct rdma_cm_id *id) +{ + struct ib_device *device = id->device; + const int port_num = id->port_num ?: rdma_start_port(device); + + return rdma_protocol_roce(device, port_num); +} + +static bool cma_match_net_dev(const struct rdma_cm_id *id, + const struct net_device *net_dev, + u8 port_num) +{ + const struct rdma_addr *addr = &id->route.addr; + + if (!net_dev) + /* This request is an AF_IB request or a RoCE request */ + return (!id->port_num || id->port_num == port_num) && + (addr->src_addr.ss_family == AF_IB || + rdma_protocol_roce(id->device, port_num)); + + return !addr->dev_addr.bound_dev_if || + (net_eq(dev_net(net_dev), addr->dev_addr.net) && + addr->dev_addr.bound_dev_if == net_dev->ifindex); +} + +static struct rdma_id_private *cma_find_listener( + const struct rdma_bind_list *bind_list, + const struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event, + const struct cma_req_info *req, + const struct net_device *net_dev) +{ + struct rdma_id_private *id_priv, *id_priv_dev; + + if (!bind_list) + return ERR_PTR(-EINVAL); + + hlist_for_each_entry(id_priv, &bind_list->owners, node) { + if (cma_match_private_data(id_priv, ib_event->private_data)) { + if (id_priv->id.device == cm_id->device && + cma_match_net_dev(&id_priv->id, net_dev, req->port)) + return id_priv; + list_for_each_entry(id_priv_dev, + &id_priv->listen_list, + listen_list) { + if (id_priv_dev->id.device == cm_id->device && + cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) + return id_priv_dev; + } + } + } + + return ERR_PTR(-EINVAL); +} + +static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id, + struct ib_cm_event *ib_event, + struct net_device **net_dev) +{ + struct cma_req_info req; + struct rdma_bind_list *bind_list; + struct rdma_id_private *id_priv; + int err; + + err = cma_save_req_info(ib_event, &req); + if (err) + return ERR_PTR(err); + + *net_dev = cma_get_net_dev(ib_event, &req); + if (IS_ERR(*net_dev)) { + if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { + /* Assuming the protocol is AF_IB */ + *net_dev = NULL; + } else if (rdma_protocol_roce(req.device, req.port)) { + /* TODO find the net dev matching the request parameters + * through the RoCE GID table */ + *net_dev = NULL; + } else { + return 
ERR_CAST(*net_dev); + } + } + + /* + * Net namespace might be getting deleted while route lookup, + * cm_id lookup is in progress. Therefore, perform netdevice + * validation, cm_id lookup under rcu lock. + * RCU lock along with netdevice state check, synchronizes with + * netdevice migrating to different net namespace and also avoids + * case where net namespace doesn't get deleted while lookup is in + * progress. + * If the device state is not IFF_UP, its properties such as ifindex + * and nd_net cannot be trusted to remain valid without rcu lock. + * net/core/dev.c change_net_namespace() ensures to synchronize with + * ongoing operations on net device after device is closed using + * synchronize_net(). + */ + rcu_read_lock(); + if (*net_dev) { + /* + * If netdevice is down, it is likely that it is administratively + * down or it might be migrating to different namespace. + * In that case avoid further processing, as the net namespace + * or ifindex may change. + */ + if (((*net_dev)->flags & IFF_UP) == 0) { + id_priv = ERR_PTR(-EHOSTUNREACH); + goto err; + } + + if (!validate_net_dev(*net_dev, + (struct sockaddr *)&req.listen_addr_storage, + (struct sockaddr *)&req.src_addr_storage)) { + id_priv = ERR_PTR(-EHOSTUNREACH); + goto err; + } + } + + bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, + rdma_ps_from_service_id(req.service_id), + cma_port_from_service_id(req.service_id)); + id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); +err: + rcu_read_unlock(); + if (IS_ERR(id_priv) && *net_dev) { + dev_put(*net_dev); + *net_dev = NULL; + } + return id_priv; +} + +static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv) +{ + return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); +} + +static void cma_cancel_route(struct rdma_id_private *id_priv) +{ + if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { + if (id_priv->query) + ib_sa_cancel_query(id_priv->query_id, id_priv->query); + } +} + +static void cma_cancel_listens(struct rdma_id_private *id_priv) +{ + struct rdma_id_private *dev_id_priv; + + /* + * Remove from listen_any_list to prevent added devices from spawning + * additional listen requests. 
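+	 *
+	 * Each per-device listener is destroyed with the global lock
+	 * dropped, because rdma_destroy_id() ends up in cma_release_dev(),
+	 * which takes the same lock.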
+ */ + mutex_lock(&lock); + list_del(&id_priv->list); + + while (!list_empty(&id_priv->listen_list)) { + dev_id_priv = list_entry(id_priv->listen_list.next, + struct rdma_id_private, listen_list); + /* sync with device removal to avoid duplicate destruction */ + list_del_init(&dev_id_priv->list); + list_del(&dev_id_priv->listen_list); + mutex_unlock(&lock); + + rdma_destroy_id(&dev_id_priv->id); + mutex_lock(&lock); + } + mutex_unlock(&lock); +} + +static void cma_cancel_operation(struct rdma_id_private *id_priv, + enum rdma_cm_state state) +{ + switch (state) { + case RDMA_CM_ADDR_QUERY: + rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); + break; + case RDMA_CM_ROUTE_QUERY: + cma_cancel_route(id_priv); + break; + case RDMA_CM_LISTEN: + if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) + cma_cancel_listens(id_priv); + break; + default: + break; + } +} + +static void cma_release_port(struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list = id_priv->bind_list; + struct net *net = id_priv->id.route.addr.dev_addr.net; + + if (!bind_list) + return; + + mutex_lock(&lock); + hlist_del(&id_priv->node); + if (hlist_empty(&bind_list->owners)) { + cma_ps_remove(net, bind_list->ps, bind_list->port); + kfree(bind_list); + } + mutex_unlock(&lock); +} + +static void cma_leave_mc_groups(struct rdma_id_private *id_priv) +{ + struct cma_multicast *mc; + + while (!list_empty(&id_priv->mc_list)) { + mc = container_of(id_priv->mc_list.next, + struct cma_multicast, list); + list_del(&mc->list); + if (rdma_cap_ib_mcast(id_priv->cma_dev->device, + id_priv->id.port_num)) { + ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + } else { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + } + kref_put(&mc->mcref, release_mc); + } + } +} + +void rdma_destroy_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + enum rdma_cm_state state; + + id_priv = container_of(id, struct rdma_id_private, id); + state = cma_exch(id_priv, RDMA_CM_DESTROYING); + cma_cancel_operation(id_priv, state); + + /* + * Wait for any active callback to finish. New callbacks will find + * the id_priv state set to destroying and abort. 
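+	 *
+	 * Acquiring and immediately releasing handler_mutex below acts as
+	 * a barrier: once the mutex is obtained, any handler that saw the
+	 * old state has already finished running.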
+ */ + mutex_lock(&id_priv->handler_mutex); + mutex_unlock(&id_priv->handler_mutex); + + if (id_priv->cma_dev) { + rdma_restrack_del(&id_priv->res); + if (rdma_cap_ib_cm(id_priv->id.device, 1)) { + if (id_priv->cm_id.ib) + ib_destroy_cm_id(id_priv->cm_id.ib); + } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { + if (id_priv->cm_id.iw) + iw_destroy_cm_id(id_priv->cm_id.iw); + } + cma_leave_mc_groups(id_priv); + cma_release_dev(id_priv); + } + + cma_release_port(id_priv); + cma_deref_id(id_priv); + wait_for_completion(&id_priv->comp); + + if (id_priv->internal_id) + cma_deref_id(id_priv->id.context); + + kfree(id_priv->id.route.path_rec); + put_net(id_priv->id.route.addr.dev_addr.net); + kfree(id_priv); +} +EXPORT_SYMBOL(rdma_destroy_id); + +static int cma_rep_recv(struct rdma_id_private *id_priv) +{ + int ret; + + ret = cma_modify_qp_rtr(id_priv, NULL); + if (ret) + goto reject; + + ret = cma_modify_qp_rts(id_priv, NULL); + if (ret) + goto reject; + + ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); + if (ret) + goto reject; + + return 0; +reject: + pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret); + cma_modify_qp_err(id_priv); + ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + return ret; +} + +static void cma_set_rep_event_data(struct rdma_cm_event *event, + struct ib_cm_rep_event_param *rep_data, + void *private_data) +{ + event->param.conn.private_data = private_data; + event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; + event->param.conn.responder_resources = rep_data->responder_resources; + event->param.conn.initiator_depth = rep_data->initiator_depth; + event->param.conn.flow_control = rep_data->flow_control; + event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; + event->param.conn.srq = rep_data->srq; + event->param.conn.qp_num = rep_data->remote_qpn; +} + +static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv = cm_id->context; + struct rdma_cm_event event; + int ret = 0; + + mutex_lock(&id_priv->handler_mutex); + if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && + id_priv->state != RDMA_CM_CONNECT) || + (ib_event->event == IB_CM_TIMEWAIT_EXIT && + id_priv->state != RDMA_CM_DISCONNECT)) + goto out; + + memset(&event, 0, sizeof event); + switch (ib_event->event) { + case IB_CM_REQ_ERROR: + case IB_CM_REP_ERROR: + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + break; + case IB_CM_REP_RECEIVED: + if (cma_comp(id_priv, RDMA_CM_CONNECT) && + (id_priv->id.qp_type != IB_QPT_UD)) + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + if (id_priv->id.qp) { + event.status = cma_rep_recv(id_priv); + event.event = event.status ? 
RDMA_CM_EVENT_CONNECT_ERROR : + RDMA_CM_EVENT_ESTABLISHED; + } else { + event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; + } + cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, + ib_event->private_data); + break; + case IB_CM_RTU_RECEIVED: + case IB_CM_USER_ESTABLISHED: + event.event = RDMA_CM_EVENT_ESTABLISHED; + break; + case IB_CM_DREQ_ERROR: + event.status = -ETIMEDOUT; /* fall through */ + case IB_CM_DREQ_RECEIVED: + case IB_CM_DREP_RECEIVED: + if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, + RDMA_CM_DISCONNECT)) + goto out; + event.event = RDMA_CM_EVENT_DISCONNECTED; + break; + case IB_CM_TIMEWAIT_EXIT: + event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; + break; + case IB_CM_MRA_RECEIVED: + /* ignore event */ + goto out; + case IB_CM_REJ_RECEIVED: + pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id, + ib_event->param.rej_rcvd.reason)); + cma_modify_qp_err(id_priv); + event.status = ib_event->param.rej_rcvd.reason; + event.event = RDMA_CM_EVENT_REJECTED; + event.param.conn.private_data = ib_event->private_data; + event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; + break; + default: + pr_err("RDMA CMA: unexpected IB CM event: %d\n", + ib_event->event); + goto out; + } + + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + id_priv->cm_id.ib = NULL; + cma_exch(id_priv, RDMA_CM_DESTROYING); + mutex_unlock(&id_priv->handler_mutex); + rdma_destroy_id(&id_priv->id); + return ret; + } +out: + mutex_unlock(&id_priv->handler_mutex); + return ret; +} + +static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event, + struct net_device *net_dev) +{ + struct rdma_id_private *listen_id_priv; + struct rdma_id_private *id_priv; + struct rdma_cm_id *id; + struct rdma_route *rt; + const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path; + const __be64 service_id = + ib_event->param.req_rcvd.primary_path->service_id; + int ret; + + listen_id_priv = container_of(listen_id, struct rdma_id_private, id); + id = __rdma_create_id(listen_id->route.addr.dev_addr.net, + listen_id->event_handler, listen_id->context, + listen_id->ps, ib_event->param.req_rcvd.qp_type, + listen_id_priv->res.kern_name); + if (IS_ERR(id)) + return NULL; + + id_priv = container_of(id, struct rdma_id_private, id); + if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, + (struct sockaddr *)&id->route.addr.dst_addr, + listen_id, ib_event, ss_family, service_id)) + goto err; + + rt = &id->route; + rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 
2 : 1; + rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, + GFP_KERNEL); + if (!rt->path_rec) + goto err; + + rt->path_rec[0] = *path; + if (rt->num_paths == 2) + rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; + + if (net_dev) { + rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); + } else { + if (!cma_protocol_roce(listen_id) && + cma_any_addr(cma_src_addr(id_priv))) { + rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); + ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); + } else if (!cma_any_addr(cma_src_addr(id_priv))) { + ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); + if (ret) + goto err; + } + } + rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); + + id_priv->state = RDMA_CM_CONNECT; + return id_priv; + +err: + rdma_destroy_id(id); + return NULL; +} + +static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event, + struct net_device *net_dev) +{ + struct rdma_id_private *listen_id_priv; + struct rdma_id_private *id_priv; + struct rdma_cm_id *id; + const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + struct net *net = listen_id->route.addr.dev_addr.net; + int ret; + + listen_id_priv = container_of(listen_id, struct rdma_id_private, id); + id = __rdma_create_id(net, listen_id->event_handler, listen_id->context, + listen_id->ps, IB_QPT_UD, + listen_id_priv->res.kern_name); + if (IS_ERR(id)) + return NULL; + + id_priv = container_of(id, struct rdma_id_private, id); + if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, + (struct sockaddr *)&id->route.addr.dst_addr, + listen_id, ib_event, ss_family, + ib_event->param.sidr_req_rcvd.service_id)) + goto err; + + if (net_dev) { + rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); + } else { + if (!cma_any_addr(cma_src_addr(id_priv))) { + ret = cma_translate_addr(cma_src_addr(id_priv), + &id->route.addr.dev_addr); + if (ret) + goto err; + } + } + + id_priv->state = RDMA_CM_CONNECT; + return id_priv; +err: + rdma_destroy_id(id); + return NULL; +} + +static void cma_set_req_event_data(struct rdma_cm_event *event, + struct ib_cm_req_event_param *req_data, + void *private_data, int offset) +{ + event->param.conn.private_data = private_data + offset; + event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; + event->param.conn.responder_resources = req_data->responder_resources; + event->param.conn.initiator_depth = req_data->initiator_depth; + event->param.conn.flow_control = req_data->flow_control; + event->param.conn.retry_count = req_data->retry_count; + event->param.conn.rnr_retry_count = req_data->rnr_retry_count; + event->param.conn.srq = req_data->srq; + event->param.conn.qp_num = req_data->remote_qpn; +} + +static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event) +{ + return (((ib_event->event == IB_CM_REQ_RECEIVED) && + (ib_event->param.req_rcvd.qp_type == id->qp_type)) || + ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && + (id->qp_type == IB_QPT_UD)) || + (!id->qp_type)); +} + +static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +{ + struct rdma_id_private *listen_id, *conn_id = NULL; + struct rdma_cm_event event; + struct net_device *net_dev; + u8 offset; + int ret; + + listen_id = cma_id_from_event(cm_id, ib_event, &net_dev); + if (IS_ERR(listen_id)) + return PTR_ERR(listen_id); + + if (!cma_check_req_qp_type(&listen_id->id, 
ib_event)) { + ret = -EINVAL; + goto net_dev_put; + } + + mutex_lock(&listen_id->handler_mutex); + if (listen_id->state != RDMA_CM_LISTEN) { + ret = -ECONNABORTED; + goto err1; + } + + memset(&event, 0, sizeof event); + offset = cma_user_data_offset(listen_id); + event.event = RDMA_CM_EVENT_CONNECT_REQUEST; + if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { + conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev); + event.param.ud.private_data = ib_event->private_data + offset; + event.param.ud.private_data_len = + IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; + } else { + conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev); + cma_set_req_event_data(&event, &ib_event->param.req_rcvd, + ib_event->private_data, offset); + } + if (!conn_id) { + ret = -ENOMEM; + goto err1; + } + + mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); + ret = cma_acquire_dev(conn_id, listen_id); + if (ret) + goto err2; + + conn_id->cm_id.ib = cm_id; + cm_id->context = conn_id; + cm_id->cm_handler = cma_ib_handler; + + /* + * Protect against the user destroying conn_id from another thread + * until we're done accessing it. + */ + atomic_inc(&conn_id->refcount); + ret = conn_id->id.event_handler(&conn_id->id, &event); + if (ret) + goto err3; + /* + * Acquire mutex to prevent user executing rdma_destroy_id() + * while we're accessing the cm_id. + */ + mutex_lock(&lock); + if (cma_comp(conn_id, RDMA_CM_CONNECT) && + (conn_id->id.qp_type != IB_QPT_UD)) + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + mutex_unlock(&lock); + mutex_unlock(&conn_id->handler_mutex); + mutex_unlock(&listen_id->handler_mutex); + cma_deref_id(conn_id); + if (net_dev) + dev_put(net_dev); + return 0; + +err3: + cma_deref_id(conn_id); + /* Destroy the CM ID by returning a non-zero value. 
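+	 * Clearing conn_id->cm_id.ib keeps rdma_destroy_id() from calling
+	 * ib_destroy_cm_id() on a cm_id that the IB CM layer will free
+	 * itself once this handler returns the error.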
*/ + conn_id->cm_id.ib = NULL; +err2: + cma_exch(conn_id, RDMA_CM_DESTROYING); + mutex_unlock(&conn_id->handler_mutex); +err1: + mutex_unlock(&listen_id->handler_mutex); + if (conn_id) + rdma_destroy_id(&conn_id->id); + +net_dev_put: + if (net_dev) + dev_put(net_dev); + + return ret; +} + +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) +{ + if (addr->sa_family == AF_IB) + return ((struct sockaddr_ib *) addr)->sib_sid; + + return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); +} +EXPORT_SYMBOL(rdma_get_service_id); + +void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, + union ib_gid *dgid) +{ + struct rdma_addr *addr = &cm_id->route.addr; + + if (!cm_id->device) { + if (sgid) + memset(sgid, 0, sizeof(*sgid)); + if (dgid) + memset(dgid, 0, sizeof(*dgid)); + return; + } + + if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { + if (sgid) + rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); + if (dgid) + rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); + } else { + if (sgid) + rdma_addr_get_sgid(&addr->dev_addr, sgid); + if (dgid) + rdma_addr_get_dgid(&addr->dev_addr, dgid); + } +} +EXPORT_SYMBOL(rdma_read_gids); + +static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) +{ + struct rdma_id_private *id_priv = iw_id->context; + struct rdma_cm_event event; + int ret = 0; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; + + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_CONNECT) + goto out; + + memset(&event, 0, sizeof event); + switch (iw_event->event) { + case IW_CM_EVENT_CLOSE: + event.event = RDMA_CM_EVENT_DISCONNECTED; + break; + case IW_CM_EVENT_CONNECT_REPLY: + memcpy(cma_src_addr(id_priv), laddr, + rdma_addr_size(laddr)); + memcpy(cma_dst_addr(id_priv), raddr, + rdma_addr_size(raddr)); + switch (iw_event->status) { + case 0: + event.event = RDMA_CM_EVENT_ESTABLISHED; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + break; + case -ECONNRESET: + case -ECONNREFUSED: + event.event = RDMA_CM_EVENT_REJECTED; + break; + case -ETIMEDOUT: + event.event = RDMA_CM_EVENT_UNREACHABLE; + break; + default: + event.event = RDMA_CM_EVENT_CONNECT_ERROR; + break; + } + break; + case IW_CM_EVENT_ESTABLISHED: + event.event = RDMA_CM_EVENT_ESTABLISHED; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + break; + default: + BUG_ON(1); + } + + event.status = iw_event->status; + event.param.conn.private_data = iw_event->private_data; + event.param.conn.private_data_len = iw_event->private_data_len; + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. 
*/ + id_priv->cm_id.iw = NULL; + cma_exch(id_priv, RDMA_CM_DESTROYING); + mutex_unlock(&id_priv->handler_mutex); + rdma_destroy_id(&id_priv->id); + return ret; + } + +out: + mutex_unlock(&id_priv->handler_mutex); + return ret; +} + +static int iw_conn_req_handler(struct iw_cm_id *cm_id, + struct iw_cm_event *iw_event) +{ + struct rdma_cm_id *new_cm_id; + struct rdma_id_private *listen_id, *conn_id; + struct rdma_cm_event event; + int ret = -ECONNABORTED; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; + + listen_id = cm_id->context; + + mutex_lock(&listen_id->handler_mutex); + if (listen_id->state != RDMA_CM_LISTEN) + goto out; + + /* Create a new RDMA id for the new IW CM ID */ + new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, + listen_id->id.event_handler, + listen_id->id.context, + RDMA_PS_TCP, IB_QPT_RC, + listen_id->res.kern_name); + if (IS_ERR(new_cm_id)) { + ret = -ENOMEM; + goto out; + } + conn_id = container_of(new_cm_id, struct rdma_id_private, id); + mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); + conn_id->state = RDMA_CM_CONNECT; + + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); + if (ret) { + mutex_unlock(&conn_id->handler_mutex); + rdma_destroy_id(new_cm_id); + goto out; + } + + ret = cma_acquire_dev(conn_id, listen_id); + if (ret) { + mutex_unlock(&conn_id->handler_mutex); + rdma_destroy_id(new_cm_id); + goto out; + } + + conn_id->cm_id.iw = cm_id; + cm_id->context = conn_id; + cm_id->cm_handler = cma_iw_handler; + + memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); + memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_CONNECT_REQUEST; + event.param.conn.private_data = iw_event->private_data; + event.param.conn.private_data_len = iw_event->private_data_len; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + + /* + * Protect against the user destroying conn_id from another thread + * until we're done accessing it. 
+ */ + atomic_inc(&conn_id->refcount); + ret = conn_id->id.event_handler(&conn_id->id, &event); + if (ret) { + /* User wants to destroy the CM ID */ + conn_id->cm_id.iw = NULL; + cma_exch(conn_id, RDMA_CM_DESTROYING); + mutex_unlock(&conn_id->handler_mutex); + cma_deref_id(conn_id); + rdma_destroy_id(&conn_id->id); + goto out; + } + + mutex_unlock(&conn_id->handler_mutex); + cma_deref_id(conn_id); + +out: + mutex_unlock(&listen_id->handler_mutex); + return ret; +} + +static int cma_ib_listen(struct rdma_id_private *id_priv) +{ + struct sockaddr *addr; + struct ib_cm_id *id; + __be64 svc_id; + + addr = cma_src_addr(id_priv); + svc_id = rdma_get_service_id(&id_priv->id, addr); + id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id); + if (IS_ERR(id)) + return PTR_ERR(id); + id_priv->cm_id.ib = id; + + return 0; +} + +static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) +{ + int ret; + struct iw_cm_id *id; + + id = iw_create_cm_id(id_priv->id.device, + iw_conn_req_handler, + id_priv); + if (IS_ERR(id)) + return PTR_ERR(id); + + id->tos = id_priv->tos; + id_priv->cm_id.iw = id; + + memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + + ret = iw_cm_listen(id_priv->cm_id.iw, backlog); + + if (ret) { + iw_destroy_cm_id(id_priv->cm_id.iw); + id_priv->cm_id.iw = NULL; + } + + return ret; +} + +static int cma_listen_handler(struct rdma_cm_id *id, + struct rdma_cm_event *event) +{ + struct rdma_id_private *id_priv = id->context; + + id->context = id_priv->id.context; + id->event_handler = id_priv->id.event_handler; + return id_priv->id.event_handler(id, event); +} + +static void cma_listen_on_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + struct rdma_id_private *dev_id_priv; + struct rdma_cm_id *id; + struct net *net = id_priv->id.route.addr.dev_addr.net; + int ret; + + if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) + return; + + id = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, + id_priv->id.qp_type, id_priv->res.kern_name); + if (IS_ERR(id)) + return; + + dev_id_priv = container_of(id, struct rdma_id_private, id); + + dev_id_priv->state = RDMA_CM_ADDR_BOUND; + memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + + _cma_attach_to_dev(dev_id_priv, cma_dev); + list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); + atomic_inc(&id_priv->refcount); + dev_id_priv->internal_id = 1; + dev_id_priv->afonly = id_priv->afonly; + + ret = rdma_listen(id, id_priv->backlog); + if (ret) + pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", + ret, cma_dev->device->name); +} + +static void cma_listen_on_all(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev; + + mutex_lock(&lock); + list_add_tail(&id_priv->list, &listen_any_list); + list_for_each_entry(cma_dev, &dev_list, list) + cma_listen_on_dev(id_priv, cma_dev); + mutex_unlock(&lock); +} + +void rdma_set_service_type(struct rdma_cm_id *id, int tos) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->tos = (u8) tos; + id_priv->tos_set = true; +} +EXPORT_SYMBOL(rdma_set_service_type); + +static void cma_query_handler(int status, struct sa_path_rec *path_rec, + void *context) +{ + struct cma_work *work = context; + struct rdma_route *route; + + route = &work->id->id.route; + + if (!status) { + route->num_paths = 1; + *route->path_rec = *path_rec; + } else { + 
work->old_state = RDMA_CM_ROUTE_QUERY;
+		work->new_state = RDMA_CM_ADDR_RESOLVED;
+		work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+		work->event.status = status;
+		pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
+				     status);
+	}
+
+	queue_work(cma_wq, &work->work);
+}
+
+static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
+			      struct cma_work *work)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct sa_path_rec path_rec;
+	ib_sa_comp_mask comp_mask;
+	struct sockaddr_in6 *sin6;
+	struct sockaddr_ib *sib;
+
+	memset(&path_rec, 0, sizeof path_rec);
+
+	if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num))
+		path_rec.rec_type = SA_PATH_REC_TYPE_OPA;
+	else
+		path_rec.rec_type = SA_PATH_REC_TYPE_IB;
+	rdma_addr_get_sgid(dev_addr, &path_rec.sgid);
+	rdma_addr_get_dgid(dev_addr, &path_rec.dgid);
+	path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+	path_rec.numb_path = 1;
+	path_rec.reversible = 1;
+	path_rec.service_id = rdma_get_service_id(&id_priv->id,
+						  cma_dst_addr(id_priv));
+
+	comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+		    IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+		    IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+	switch (cma_family(id_priv)) {
+	case AF_INET:
+		path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+		comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
+		break;
+	case AF_INET6:
+		sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+		path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+		comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+		break;
+	case AF_IB:
+		sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+		path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20);
+		comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+		break;
+	}
+
+	id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
+					       id_priv->id.port_num, &path_rec,
+					       comp_mask, timeout_ms,
+					       GFP_KERNEL, cma_query_handler,
+					       work, &id_priv->query);
+
+	return (id_priv->query_id < 0) ?
id_priv->query_id : 0; +} + +static void cma_work_handler(struct work_struct *_work) +{ + struct cma_work *work = container_of(_work, struct cma_work, work); + struct rdma_id_private *id_priv = work->id; + int destroy = 0; + + mutex_lock(&id_priv->handler_mutex); + if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) + goto out; + + if (id_priv->id.event_handler(&id_priv->id, &work->event)) { + cma_exch(id_priv, RDMA_CM_DESTROYING); + destroy = 1; + } +out: + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); + if (destroy) + rdma_destroy_id(&id_priv->id); + kfree(work); +} + +static void cma_ndev_work_handler(struct work_struct *_work) +{ + struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); + struct rdma_id_private *id_priv = work->id; + int destroy = 0; + + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state == RDMA_CM_DESTROYING || + id_priv->state == RDMA_CM_DEVICE_REMOVAL) + goto out; + + if (id_priv->id.event_handler(&id_priv->id, &work->event)) { + cma_exch(id_priv, RDMA_CM_DESTROYING); + destroy = 1; + } + +out: + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); + if (destroy) + rdma_destroy_id(&id_priv->id); + kfree(work); +} + +static void cma_init_resolve_route_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; +} + +static void cma_init_resolve_addr_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; +} + +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +{ + struct rdma_route *route = &id_priv->id.route; + struct cma_work *work; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + cma_init_resolve_route_work(work, id_priv); + + route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) { + ret = -ENOMEM; + goto err1; + } + + ret = cma_query_ib_route(id_priv, timeout_ms, work); + if (ret) + goto err2; + + return 0; +err2: + kfree(route->path_rec); + route->path_rec = NULL; +err1: + kfree(work); + return ret; +} + +static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, + unsigned long supported_gids, + enum ib_gid_type default_gid) +{ + if ((network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) && + test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + return default_gid; +} + +/* + * cma_iboe_set_path_rec_l2_fields() is helper function which sets + * path record type based on GID type. + * It also sets up other L2 fields which includes destination mac address + * netdev ifindex, of the path record. + * It returns the netdev of the bound interface for this path record entry. 
+ */ +static struct net_device * +cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; + struct rdma_addr *addr = &route->addr; + unsigned long supported_gids; + struct net_device *ndev; + + if (!addr->dev_addr.bound_dev_if) + return NULL; + + ndev = dev_get_by_index(addr->dev_addr.net, + addr->dev_addr.bound_dev_if); + if (!ndev) + return NULL; + + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + id_priv->id.port_num); + gid_type = cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); + /* Use the hint from IP Stack to select GID Type */ + if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + gid_type = ib_network_to_gid_type(addr->dev_addr.network); + route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); + + route->path_rec->roce.route_resolved = true; + sa_path_set_ndev(route->path_rec, addr->dev_addr.net); + sa_path_set_ifindex(route->path_rec, ndev->ifindex); + sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); + return ndev; +} + +int rdma_set_ib_path(struct rdma_cm_id *id, + struct sa_path_rec *path_rec) +{ + struct rdma_id_private *id_priv; + struct net_device *ndev; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_RESOLVED)) + return -EINVAL; + + id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), + GFP_KERNEL); + if (!id->route.path_rec) { + ret = -ENOMEM; + goto err; + } + + if (rdma_protocol_roce(id->device, id->port_num)) { + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); + if (!ndev) { + ret = -ENODEV; + goto err_free; + } + dev_put(ndev); + } + + id->route.num_paths = 1; + return 0; + +err_free: + kfree(id->route.path_rec); + id->route.path_rec = NULL; +err: + cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); + return ret; +} +EXPORT_SYMBOL(rdma_set_ib_path); + +static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) +{ + struct cma_work *work; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + cma_init_resolve_route_work(work, id_priv); + queue_work(cma_wq, &work->work); + return 0; +} + +static int iboe_tos_to_sl(struct net_device *ndev, int tos) +{ + int prio; + struct net_device *dev; + + prio = rt_tos2priority(tos); + dev = is_vlan_dev(ndev) ? vlan_dev_real_dev(ndev) : ndev; + if (dev->num_tc) + return netdev_get_prio_tc_map(dev, prio); + +#if IS_ENABLED(CONFIG_VLAN_8021Q) + if (is_vlan_dev(ndev)) + return (vlan_dev_get_egress_qos_mask(ndev, prio) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +#endif + return 0; +} + +static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + struct rdma_addr *addr = &route->addr; + struct cma_work *work; + int ret; + struct net_device *ndev; + + u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + u8 tos = id_priv->tos_set ? 
id_priv->tos : default_roce_tos; + + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) { + ret = -ENOMEM; + goto err1; + } + + route->num_paths = 1; + + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); + if (!ndev) { + ret = -ENODEV; + goto err2; + } + + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &route->path_rec->sgid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, + &route->path_rec->dgid); + + if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) + /* TODO: get the hoplimit from the inet/inet6 device */ + route->path_rec->hop_limit = addr->dev_addr.hoplimit; + else + route->path_rec->hop_limit = 1; + route->path_rec->reversible = 1; + route->path_rec->pkey = cpu_to_be16(0xffff); + route->path_rec->mtu_selector = IB_SA_EQ; + route->path_rec->sl = iboe_tos_to_sl(ndev, tos); + route->path_rec->traffic_class = tos; + route->path_rec->mtu = iboe_get_mtu(ndev->mtu); + route->path_rec->rate_selector = IB_SA_EQ; + route->path_rec->rate = iboe_get_rate(ndev); + dev_put(ndev); + route->path_rec->packet_life_time_selector = IB_SA_EQ; + route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + if (!route->path_rec->mtu) { + ret = -EINVAL; + goto err2; + } + + cma_init_resolve_route_work(work, id_priv); + queue_work(cma_wq, &work->work); + + return 0; + +err2: + kfree(route->path_rec); + route->path_rec = NULL; +err1: + kfree(work); + return ret; +} + +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) + return -EINVAL; + + atomic_inc(&id_priv->refcount); + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_route(id_priv, timeout_ms); + else if (rdma_protocol_roce(id->device, id->port_num)) + ret = cma_resolve_iboe_route(id_priv); + else if (rdma_protocol_iwarp(id->device, id->port_num)) + ret = cma_resolve_iw_route(id_priv, timeout_ms); + else + ret = -ENOSYS; + + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); + cma_deref_id(id_priv); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_route); + +static void cma_set_loopback(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, + 0, 0, 0, htonl(1)); + break; + default: + ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, + 0, 0, 0, htonl(1)); + break; + } +} + +static int cma_bind_loopback(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev, *cur_dev; + union ib_gid gid; + enum ib_port_state port_state; + u16 pkey; + int ret; + u8 p; + + cma_dev = NULL; + mutex_lock(&lock); + list_for_each_entry(cur_dev, &dev_list, list) { + if (cma_family(id_priv) == AF_IB && + !rdma_cap_ib_cm(cur_dev->device, 1)) + continue; + + if (!cma_dev) + cma_dev = cur_dev; + + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && + port_state == IB_PORT_ACTIVE) { + cma_dev = cur_dev; + goto port_found; + } + } + } + + if (!cma_dev) { + ret = -ENODEV; + goto out; + } + + p = 1; + +port_found: + ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL); + if 
(ret) + goto out; + + ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); + if (ret) + goto out; + + id_priv->id.route.addr.dev_addr.dev_type = + (rdma_protocol_ib(cma_dev->device, p)) ? + ARPHRD_INFINIBAND : ARPHRD_ETHER; + + rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); + id_priv->id.port_num = p; + cma_attach_to_dev(id_priv, cma_dev); + cma_set_loopback(cma_src_addr(id_priv)); +out: + mutex_unlock(&lock); + return ret; +} + +static void addr_handler(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *dev_addr, void *context) +{ + struct rdma_id_private *id_priv = context; + struct rdma_cm_event event; + + memset(&event, 0, sizeof event); + mutex_lock(&id_priv->handler_mutex); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, + RDMA_CM_ADDR_RESOLVED)) + goto out; + + memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); + if (!status && !id_priv->cma_dev) { + status = cma_acquire_dev(id_priv, NULL); + if (status) + pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", + status); + } else { + pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); + } + + if (status) { + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ADDR_BOUND)) + goto out; + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = status; + } else + event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + + if (id_priv->id.event_handler(&id_priv->id, &event)) { + cma_exch(id_priv, RDMA_CM_DESTROYING); + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); + rdma_destroy_id(&id_priv->id); + return; + } +out: + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); +} + +static int cma_resolve_loopback(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + union ib_gid gid; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + if (!id_priv->cma_dev) { + ret = cma_bind_loopback(id_priv); + if (ret) + goto err; + } + + rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); + + cma_init_resolve_addr_work(work, id_priv); + queue_work(cma_wq, &work->work); + return 0; +err: + kfree(work); + return ret; +} + +static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + if (!id_priv->cma_dev) { + ret = cma_resolve_ib_dev(id_priv); + if (ret) + goto err; + } + + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) + &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); + + cma_init_resolve_addr_work(work, id_priv); + queue_work(cma_wq, &work->work); + return 0; +err: + kfree(work); + return ret; +} + +static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr) +{ + if (!src_addr || !src_addr->sa_family) { + src_addr = (struct sockaddr *) &id->route.addr.src_addr; + src_addr->sa_family = dst_addr->sa_family; + if (IS_ENABLED(CONFIG_IPV6) && + dst_addr->sa_family == AF_INET6) { + struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr; + struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr; + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) 
{ + ((struct sockaddr_ib *) src_addr)->sib_pkey = + ((struct sockaddr_ib *) dst_addr)->sib_pkey; + } + } + return rdma_bind_addr(id, src_addr); +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); + if (id_priv->state == RDMA_CM_IDLE) { + ret = cma_bind_addr(id, src_addr, dst_addr); + if (ret) { + memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); + return ret; + } + } + + if (cma_family(id_priv) != dst_addr->sa_family) { + memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); + return -EINVAL; + } + + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { + memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); + return -EINVAL; + } + + atomic_inc(&id_priv->refcount); + if (cma_any_addr(dst_addr)) { + ret = cma_resolve_loopback(id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), + dst_addr, &id->route.addr.dev_addr, + timeout_ms, addr_handler, id_priv); + } + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + cma_deref_id(id_priv); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_addr); + +int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) +{ + struct rdma_id_private *id_priv; + unsigned long flags; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irqsave(&id_priv->lock, flags); + if (reuse || id_priv->state == RDMA_CM_IDLE) { + id_priv->reuseaddr = reuse; + ret = 0; + } else { + ret = -EINVAL; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_set_reuseaddr); + +int rdma_set_afonly(struct rdma_cm_id *id, int afonly) +{ + struct rdma_id_private *id_priv; + unsigned long flags; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irqsave(&id_priv->lock, flags); + if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { + id_priv->options |= (1 << CMA_OPTION_AFONLY); + id_priv->afonly = afonly; + ret = 0; + } else { + ret = -EINVAL; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_set_afonly); + +static void cma_bind_port(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv) +{ + struct sockaddr *addr; + struct sockaddr_ib *sib; + u64 sid, mask; + __be16 port; + + addr = cma_src_addr(id_priv); + port = htons(bind_list->port); + + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_port = port; + break; + case AF_INET6: + ((struct sockaddr_in6 *) addr)->sin6_port = port; + break; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + sid = be64_to_cpu(sib->sib_sid); + mask = be64_to_cpu(sib->sib_sid_mask); + sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); + sib->sib_sid_mask = cpu_to_be64(~0ULL); + break; + } + id_priv->bind_list = bind_list; + hlist_add_head(&id_priv->node, &bind_list->owners); +} + +static int cma_alloc_port(enum rdma_ucm_port_space ps, + struct rdma_id_private *id_priv, unsigned short snum) +{ + struct rdma_bind_list *bind_list; + int ret; + + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); + if (!bind_list) + return -ENOMEM; + + ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, 
bind_list, + snum); + if (ret < 0) + goto err; + + bind_list->ps = ps; + bind_list->port = (unsigned short)ret; + cma_bind_port(bind_list, id_priv); + return 0; +err: + kfree(bind_list); + return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; +} + +static int cma_port_is_unique(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv) +{ + struct rdma_id_private *cur_id; + struct sockaddr *daddr = cma_dst_addr(id_priv); + struct sockaddr *saddr = cma_src_addr(id_priv); + __be16 dport = cma_port(daddr); + + hlist_for_each_entry(cur_id, &bind_list->owners, node) { + struct sockaddr *cur_daddr = cma_dst_addr(cur_id); + struct sockaddr *cur_saddr = cma_src_addr(cur_id); + __be16 cur_dport = cma_port(cur_daddr); + + if (id_priv == cur_id) + continue; + + /* different dest port -> unique */ + if (!cma_any_port(daddr) && + !cma_any_port(cur_daddr) && + (dport != cur_dport)) + continue; + + /* different src address -> unique */ + if (!cma_any_addr(saddr) && + !cma_any_addr(cur_saddr) && + cma_addr_cmp(saddr, cur_saddr)) + continue; + + /* different dst address -> unique */ + if (!cma_any_addr(daddr) && + !cma_any_addr(cur_daddr) && + cma_addr_cmp(daddr, cur_daddr)) + continue; + + return -EADDRNOTAVAIL; + } + return 0; +} + +static int cma_alloc_any_port(enum rdma_ucm_port_space ps, + struct rdma_id_private *id_priv) +{ + static unsigned int last_used_port; + int low, high, remaining; + unsigned int rover; + struct net *net = id_priv->id.route.addr.dev_addr.net; + + inet_get_local_port_range(net, &low, &high); + remaining = (high - low) + 1; + rover = prandom_u32() % remaining + low; +retry: + if (last_used_port != rover) { + struct rdma_bind_list *bind_list; + int ret; + + bind_list = cma_ps_find(net, ps, (unsigned short)rover); + + if (!bind_list) { + ret = cma_alloc_port(ps, id_priv, rover); + } else { + ret = cma_port_is_unique(bind_list, id_priv); + if (!ret) + cma_bind_port(bind_list, id_priv); + } + /* + * Remember previously used port number in order to avoid + * re-using same port immediately after it is closed. + */ + if (!ret) + last_used_port = rover; + if (ret != -EADDRNOTAVAIL) + return ret; + } + if (--remaining) { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + goto retry; + } + return -EADDRNOTAVAIL; +} + +/* + * Check that the requested port is available. This is called when trying to + * bind to a specific port, or when trying to listen on a bound port. In + * the latter case, the provided id_priv may already be on the bind_list, but + * we still need to check that it's okay to start listening. 
+ */ +static int cma_check_port(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv, uint8_t reuseaddr) +{ + struct rdma_id_private *cur_id; + struct sockaddr *addr, *cur_addr; + + addr = cma_src_addr(id_priv); + hlist_for_each_entry(cur_id, &bind_list->owners, node) { + if (id_priv == cur_id) + continue; + + if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && + cur_id->reuseaddr) + continue; + + cur_addr = cma_src_addr(cur_id); + if (id_priv->afonly && cur_id->afonly && + (addr->sa_family != cur_addr->sa_family)) + continue; + + if (cma_any_addr(addr) || cma_any_addr(cur_addr)) + return -EADDRNOTAVAIL; + + if (!cma_addr_cmp(addr, cur_addr)) + return -EADDRINUSE; + } + return 0; +} + +static int cma_use_port(enum rdma_ucm_port_space ps, + struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list; + unsigned short snum; + int ret; + + snum = ntohs(cma_port(cma_src_addr(id_priv))); + if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); + if (!bind_list) { + ret = cma_alloc_port(ps, id_priv, snum); + } else { + ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); + if (!ret) + cma_bind_port(bind_list, id_priv); + } + return ret; +} + +static int cma_bind_listen(struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list = id_priv->bind_list; + int ret = 0; + + mutex_lock(&lock); + if (bind_list->owners.first->next) + ret = cma_check_port(bind_list, id_priv, 0); + mutex_unlock(&lock); + return ret; +} + +static enum rdma_ucm_port_space +cma_select_inet_ps(struct rdma_id_private *id_priv) +{ + switch (id_priv->id.ps) { + case RDMA_PS_TCP: + case RDMA_PS_UDP: + case RDMA_PS_IPOIB: + case RDMA_PS_IB: + return id_priv->id.ps; + default: + + return 0; + } +} + +static enum rdma_ucm_port_space +cma_select_ib_ps(struct rdma_id_private *id_priv) +{ + enum rdma_ucm_port_space ps = 0; + struct sockaddr_ib *sib; + u64 sid_ps, mask, sid; + + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; + sid = be64_to_cpu(sib->sib_sid) & mask; + + if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { + sid_ps = RDMA_IB_IP_PS_IB; + ps = RDMA_PS_IB; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && + (sid == (RDMA_IB_IP_PS_TCP & mask))) { + sid_ps = RDMA_IB_IP_PS_TCP; + ps = RDMA_PS_TCP; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && + (sid == (RDMA_IB_IP_PS_UDP & mask))) { + sid_ps = RDMA_IB_IP_PS_UDP; + ps = RDMA_PS_UDP; + } + + if (ps) { + sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); + sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | + be64_to_cpu(sib->sib_sid_mask)); + } + return ps; +} + +static int cma_get_port(struct rdma_id_private *id_priv) +{ + enum rdma_ucm_port_space ps; + int ret; + + if (cma_family(id_priv) != AF_IB) + ps = cma_select_inet_ps(id_priv); + else + ps = cma_select_ib_ps(id_priv); + if (!ps) + return -EPROTONOSUPPORT; + + mutex_lock(&lock); + if (cma_any_port(cma_src_addr(id_priv))) + ret = cma_alloc_any_port(ps, id_priv); + else + ret = cma_use_port(ps, id_priv); + mutex_unlock(&lock); + + return ret; +} + +static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, + struct sockaddr *addr) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 *sin6; + + if (addr->sa_family != AF_INET6) + return 0; + + sin6 = (struct sockaddr_in6 *) addr; + + if 
(!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; + + if (!sin6->sin6_scope_id) + return -EINVAL; + + dev_addr->bound_dev_if = sin6->sin6_scope_id; +#endif + return 0; +} + +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id_priv->state == RDMA_CM_IDLE) { + id->route.addr.src_addr.ss_family = AF_INET; + ret = rdma_bind_addr(id, cma_src_addr(id_priv)); + if (ret) + return ret; + } + + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) + return -EINVAL; + + if (id_priv->reuseaddr) { + ret = cma_bind_listen(id_priv); + if (ret) + goto err; + } + + id_priv->backlog = backlog; + if (id->device) { + if (rdma_cap_ib_cm(id->device, 1)) { + ret = cma_ib_listen(id_priv); + if (ret) + goto err; + } else if (rdma_cap_iw_cm(id->device, 1)) { + ret = cma_iw_listen(id_priv, backlog); + if (ret) + goto err; + } else { + ret = -ENOSYS; + goto err; + } + } else + cma_listen_on_all(id_priv); + + return 0; +err: + id_priv->backlog = 0; + cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_listen); + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv; + int ret; + struct sockaddr *daddr; + + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && + addr->sa_family != AF_IB) + return -EAFNOSUPPORT; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) + return -EINVAL; + + ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); + if (ret) + goto err1; + + memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); + if (!cma_any_addr(addr)) { + ret = cma_translate_addr(addr, &id->route.addr.dev_addr); + if (ret) + goto err1; + + ret = cma_acquire_dev(id_priv, NULL); + if (ret) + goto err1; + } + + if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { + if (addr->sa_family == AF_INET) + id_priv->afonly = 1; +#if IS_ENABLED(CONFIG_IPV6) + else if (addr->sa_family == AF_INET6) { + struct net *net = id_priv->id.route.addr.dev_addr.net; + + id_priv->afonly = net->ipv6.sysctl.bindv6only; + } +#endif + } + daddr = cma_dst_addr(id_priv); + daddr->sa_family = addr->sa_family; + + ret = cma_get_port(id_priv); + if (ret) + goto err2; + + return 0; +err2: + if (id_priv->cma_dev) { + rdma_restrack_del(&id_priv->res); + cma_release_dev(id_priv); + } +err1: + cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); + return ret; +} +EXPORT_SYMBOL(rdma_bind_addr); + +static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) +{ + struct cma_hdr *cma_hdr; + + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + if (cma_family(id_priv) == AF_INET) { + struct sockaddr_in *src4, *dst4; + + src4 = (struct sockaddr_in *) cma_src_addr(id_priv); + dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); + + cma_set_ip_ver(cma_hdr, 4); + cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + cma_hdr->port = src4->sin_port; + } else if (cma_family(id_priv) == AF_INET6) { + struct sockaddr_in6 *src6, *dst6; + + src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); + dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); + + cma_set_ip_ver(cma_hdr, 6); + cma_hdr->src_addr.ip6 = src6->sin6_addr; + cma_hdr->dst_addr.ip6 = dst6->sin6_addr; + cma_hdr->port = src6->sin6_port; + } + return 0; +} + +static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, + 
struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv = cm_id->context; + struct rdma_cm_event event; + struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; + int ret = 0; + + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_CONNECT) + goto out; + + memset(&event, 0, sizeof event); + switch (ib_event->event) { + case IB_CM_SIDR_REQ_ERROR: + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + break; + case IB_CM_SIDR_REP_RECEIVED: + event.param.ud.private_data = ib_event->private_data; + event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; + if (rep->status != IB_SIDR_SUCCESS) { + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = ib_event->param.sidr_rep_rcvd.status; + pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n", + event.status); + break; + } + ret = cma_set_qkey(id_priv, rep->qkey); + if (ret) { + pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret); + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = ret; + break; + } + ib_init_ah_attr_from_path(id_priv->id.device, + id_priv->id.port_num, + id_priv->id.route.path_rec, + &event.param.ud.ah_attr); + event.param.ud.qp_num = rep->qpn; + event.param.ud.qkey = rep->qkey; + event.event = RDMA_CM_EVENT_ESTABLISHED; + event.status = 0; + break; + default: + pr_err("RDMA CMA: unexpected IB CM event: %d\n", + ib_event->event); + goto out; + } + + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + id_priv->cm_id.ib = NULL; + cma_exch(id_priv, RDMA_CM_DESTROYING); + mutex_unlock(&id_priv->handler_mutex); + rdma_destroy_id(&id_priv->id); + return ret; + } +out: + mutex_unlock(&id_priv->handler_mutex); + return ret; +} + +static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_sidr_req_param req; + struct ib_cm_id *id; + void *private_data; + u8 offset; + int ret; + + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv); + req.private_data_len = offset + conn_param->private_data_len; + if (req.private_data_len < conn_param->private_data_len) + return -EINVAL; + + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } + + if (conn_param->private_data && conn_param->private_data_len) + memcpy(private_data + offset, conn_param->private_data, + conn_param->private_data_len); + + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } + + id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, + id_priv); + if (IS_ERR(id)) { + ret = PTR_ERR(id); + goto out; + } + id_priv->cm_id.ib = id; + + req.path = id_priv->id.route.path_rec; + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); + req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); + req.max_cm_retries = CMA_MAX_CM_RETRIES; + + ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); + if (ret) { + ib_destroy_cm_id(id_priv->cm_id.ib); + id_priv->cm_id.ib = NULL; + } +out: + kfree(private_data); + return ret; +} + +static int cma_connect_ib(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_req_param req; + struct rdma_route *route; + void *private_data; + struct ib_cm_id *id; + u8 offset; + int ret; + + memset(&req, 0, sizeof req); + offset = 
cma_user_data_offset(id_priv); + req.private_data_len = offset + conn_param->private_data_len; + if (req.private_data_len < conn_param->private_data_len) + return -EINVAL; + + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } + + if (conn_param->private_data && conn_param->private_data_len) + memcpy(private_data + offset, conn_param->private_data, + conn_param->private_data_len); + + id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); + if (IS_ERR(id)) { + ret = PTR_ERR(id); + goto out; + } + id_priv->cm_id.ib = id; + + route = &id_priv->id.route; + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } + + req.primary_path = &route->path_rec[0]; + if (route->num_paths == 2) + req.alternate_path = &route->path_rec[1]; + + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); + req.qp_num = id_priv->qp_num; + req.qp_type = id_priv->id.qp_type; + req.starting_psn = id_priv->seq_num; + req.responder_resources = conn_param->responder_resources; + req.initiator_depth = conn_param->initiator_depth; + req.flow_control = conn_param->flow_control; + req.retry_count = min_t(u8, 7, conn_param->retry_count); + req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); + req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.max_cm_retries = CMA_MAX_CM_RETRIES; + req.srq = id_priv->srq ? 1 : 0; + + ret = ib_send_cm_req(id_priv->cm_id.ib, &req); +out: + if (ret && !IS_ERR(id)) { + ib_destroy_cm_id(id); + id_priv->cm_id.ib = NULL; + } + + kfree(private_data); + return ret; +} + +static int cma_connect_iw(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct iw_cm_id *cm_id; + int ret; + struct iw_cm_conn_param iw_param; + + cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + cm_id->tos = id_priv->tos; + id_priv->cm_id.iw = cm_id; + + memcpy(&cm_id->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), + rdma_addr_size(cma_dst_addr(id_priv))); + + ret = cma_modify_qp_rtr(id_priv, conn_param); + if (ret) + goto out; + + if (conn_param) { + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; + iw_param.qpn = id_priv->id.qp ? 
id_priv->qp_num : conn_param->qp_num;
+	} else {
+		memset(&iw_param, 0, sizeof iw_param);
+		iw_param.qpn = id_priv->qp_num;
+	}
+	ret = iw_cm_connect(cm_id, &iw_param);
+out:
+	if (ret) {
+		iw_destroy_cm_id(cm_id);
+		id_priv->cm_id.iw = NULL;
+	}
+	return ret;
+}
+
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
+		return -EINVAL;
+
+	if (!id->qp) {
+		id_priv->qp_num = conn_param->qp_num;
+		id_priv->srq = conn_param->srq;
+	}
+
+	if (rdma_cap_ib_cm(id->device, id->port_num)) {
+		if (id->qp_type == IB_QPT_UD)
+			ret = cma_resolve_ib_udp(id_priv, conn_param);
+		else
+			ret = cma_connect_ib(id_priv, conn_param);
+	} else if (rdma_cap_iw_cm(id->device, id->port_num))
+		ret = cma_connect_iw(id_priv, conn_param);
+	else
+		ret = -ENOSYS;
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_connect);
+
+static int cma_accept_ib(struct rdma_id_private *id_priv,
+			 struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_rep_param rep;
+	int ret;
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		goto out;
+
+	ret = cma_modify_qp_rts(id_priv, conn_param);
+	if (ret)
+		goto out;
+
+	memset(&rep, 0, sizeof rep);
+	rep.qp_num = id_priv->qp_num;
+	rep.starting_psn = id_priv->seq_num;
+	rep.private_data = conn_param->private_data;
+	rep.private_data_len = conn_param->private_data_len;
+	rep.responder_resources = conn_param->responder_resources;
+	rep.initiator_depth = conn_param->initiator_depth;
+	rep.failover_accepted = 0;
+	rep.flow_control = conn_param->flow_control;
+	rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+	rep.srq = id_priv->srq ?
1 : 0; + + ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); +out: + return ret; +} + +static int cma_accept_iw(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct iw_cm_conn_param iw_param; + int ret; + + if (!conn_param) + return -EINVAL; + + ret = cma_modify_qp_rtr(id_priv, conn_param); + if (ret) + return ret; + + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; + if (id_priv->id.qp) { + iw_param.qpn = id_priv->qp_num; + } else + iw_param.qpn = conn_param->qp_num; + + return iw_cm_accept(id_priv->cm_id.iw, &iw_param); +} + +static int cma_send_sidr_rep(struct rdma_id_private *id_priv, + enum ib_cm_sidr_status status, u32 qkey, + const void *private_data, int private_data_len) +{ + struct ib_cm_sidr_rep_param rep; + int ret; + + memset(&rep, 0, sizeof rep); + rep.status = status; + if (status == IB_SIDR_SUCCESS) { + ret = cma_set_qkey(id_priv, qkey); + if (ret) + return ret; + rep.qp_num = id_priv->qp_num; + rep.qkey = id_priv->qkey; + } + rep.private_data = private_data; + rep.private_data_len = private_data_len; + + return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); +} + +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + const char *caller) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + + if (caller) + id_priv->res.kern_name = caller; + else + rdma_restrack_set_task(&id_priv->res, current); + + if (!cma_comp(id_priv, RDMA_CM_CONNECT)) + return -EINVAL; + + if (!id->qp && conn_param) { + id_priv->qp_num = conn_param->qp_num; + id_priv->srq = conn_param->srq; + } + + if (rdma_cap_ib_cm(id->device, id->port_num)) { + if (id->qp_type == IB_QPT_UD) { + if (conn_param) + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + conn_param->qkey, + conn_param->private_data, + conn_param->private_data_len); + else + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + 0, NULL, 0); + } else { + if (conn_param) + ret = cma_accept_ib(id_priv, conn_param); + else + ret = cma_rep_recv(id_priv); + } + } else if (rdma_cap_iw_cm(id->device, id->port_num)) + ret = cma_accept_iw(id_priv, conn_param); + else + ret = -ENOSYS; + + if (ret) + goto reject; + + return 0; +reject: + cma_modify_qp_err(id_priv); + rdma_reject(id, NULL, 0); + return ret; +} +EXPORT_SYMBOL(__rdma_accept); + +int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cm_id.ib) + return -EINVAL; + + switch (id->device->node_type) { + case RDMA_NODE_IB_CA: + ret = ib_cm_notify(id_priv->cm_id.ib, event); + break; + default: + ret = 0; + break; + } + return ret; +} +EXPORT_SYMBOL(rdma_notify); + +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + u8 private_data_len) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cm_id.ib) + return -EINVAL; + + if (rdma_cap_ib_cm(id->device, id->port_num)) { + if (id->qp_type == IB_QPT_UD) + ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, + private_data, private_data_len); + else + ret = ib_send_cm_rej(id_priv->cm_id.ib, + IB_CM_REJ_CONSUMER_DEFINED, NULL, + 0, private_data, private_data_len); + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { + ret = iw_cm_reject(id_priv->cm_id.iw, + private_data, 
private_data_len); + } else + ret = -ENOSYS; + + return ret; +} +EXPORT_SYMBOL(rdma_reject); + +int rdma_disconnect(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cm_id.ib) + return -EINVAL; + + if (rdma_cap_ib_cm(id->device, id->port_num)) { + ret = cma_modify_qp_err(id_priv); + if (ret) + goto out; + /* Initiate or respond to a disconnect. */ + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) + ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { + ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); + } else + ret = -EINVAL; + +out: + return ret; +} +EXPORT_SYMBOL(rdma_disconnect); + +static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) +{ + struct rdma_id_private *id_priv; + struct cma_multicast *mc = multicast->context; + struct rdma_cm_event event; + int ret = 0; + + id_priv = mc->id_priv; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_ADDR_BOUND && + id_priv->state != RDMA_CM_ADDR_RESOLVED) + goto out; + + if (!status) + status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); + else + pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", + status); + mutex_lock(&id_priv->qp_mutex); + if (!status && id_priv->id.qp) { + status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, + be16_to_cpu(multicast->rec.mlid)); + if (status) + pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to attach QP. status %d\n", + status); + } + mutex_unlock(&id_priv->qp_mutex); + + memset(&event, 0, sizeof event); + event.status = status; + event.param.ud.private_data = mc->context; + if (!status) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = + dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + enum ib_gid_type gid_type = + id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + + event.event = RDMA_CM_EVENT_MULTICAST_JOIN; + ret = ib_init_ah_from_mcmember(id_priv->id.device, + id_priv->id.port_num, + &multicast->rec, + ndev, gid_type, + &event.param.ud.ah_attr); + if (ret) + event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + + event.param.ud.qp_num = 0xFFFFFF; + event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); + if (ndev) + dev_put(ndev); + } else + event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + cma_exch(id_priv, RDMA_CM_DESTROYING); + mutex_unlock(&id_priv->handler_mutex); + rdma_destroy_id(&id_priv->id); + return 0; + } + +out: + mutex_unlock(&id_priv->handler_mutex); + return 0; +} + +static void cma_set_mgid(struct rdma_id_private *id_priv, + struct sockaddr *addr, union ib_gid *mgid) +{ + unsigned char mc_map[MAX_ADDR_LEN]; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct sockaddr_in *sin = (struct sockaddr_in *) addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; + + if (cma_any_addr(addr)) { + memset(mgid, 0, sizeof *mgid); + } else if ((addr->sa_family == AF_INET6) && + ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == + 0xFF10A01B)) { + /* IPv6 address is an SA assigned MGID. 
*/ + memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else if (addr->sa_family == AF_IB) { + memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); + } else if ((addr->sa_family == AF_INET6)) { + ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); + if (id_priv->id.ps == RDMA_PS_UDP) + mc_map[7] = 0x01; /* Use RDMA CM signature */ + *mgid = *(union ib_gid *) (mc_map + 4); + } else { + ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); + if (id_priv->id.ps == RDMA_PS_UDP) + mc_map[7] = 0x01; /* Use RDMA CM signature */ + *mgid = *(union ib_gid *) (mc_map + 4); + } +} + +static int cma_join_ib_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + struct ib_sa_mcmember_rec rec; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + ib_sa_comp_mask comp_mask; + int ret; + + ib_addr_get_mgid(dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, + &rec.mgid, &rec); + if (ret) + return ret; + + ret = cma_set_qkey(id_priv, 0); + if (ret) + return ret; + + cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); + rec.qkey = cpu_to_be32(id_priv->qkey); + rdma_addr_get_sgid(dev_addr, &rec.port_gid); + rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); + rec.join_state = mc->join_state; + + if ((rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) && + (!ib_sa_sendonly_fullmem_support(&sa_client, + id_priv->id.device, + id_priv->id.port_num))) { + pr_warn("RDMA CM: %s port %u Unable to multicast join\n" + "RDMA CM: SM doesn't support Send Only Full Member option\n", + id_priv->id.device->name, id_priv->id.port_num); + return -EOPNOTSUPP; + } + + comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | + IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; + + if (id_priv->id.ps == RDMA_PS_IPOIB) + comp_mask |= IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_HOP_LIMIT; + + mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, + id_priv->id.port_num, &rec, + comp_mask, GFP_KERNEL, + cma_ib_mc_handler, mc); + return PTR_ERR_OR_ZERO(mc->multicast.ib); +} + +static void iboe_mcast_work_handler(struct work_struct *work) +{ + struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); + struct cma_multicast *mc = mw->mc; + struct ib_sa_multicast *m = mc->multicast.ib; + + mc->multicast.ib->context = mc; + cma_ib_mc_handler(0, m); + kref_put(&mc->mcref, release_mc); + kfree(mw); +} + +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; + + if (cma_any_addr(addr)) { + memset(mgid, 0, sizeof *mgid); + } else if (addr->sa_family == AF_INET6) { + memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else { + mgid->raw[0] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; + mgid->raw[1] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 
0 : 0x0e; + mgid->raw[2] = 0; + mgid->raw[3] = 0; + mgid->raw[4] = 0; + mgid->raw[5] = 0; + mgid->raw[6] = 0; + mgid->raw[7] = 0; + mgid->raw[8] = 0; + mgid->raw[9] = 0; + mgid->raw[10] = 0xff; + mgid->raw[11] = 0xff; + *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; + } +} + +static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + struct iboe_mcast_work *work; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int err = 0; + struct sockaddr *addr = (struct sockaddr *)&mc->addr; + struct net_device *ndev = NULL; + enum ib_gid_type gid_type; + bool send_only; + + send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); + + if (cma_zero_addr((struct sockaddr *)&mc->addr)) + return -EINVAL; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); + if (!mc->multicast.ib) { + err = -ENOMEM; + goto out1; + } + + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type); + + mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); + if (id_priv->id.ps == RDMA_PS_UDP) + mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + if (!ndev) { + err = -ENODEV; + goto out2; + } + mc->multicast.ib->rec.rate = iboe_get_rate(ndev); + mc->multicast.ib->rec.hop_limit = 1; + mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + + if (addr->sa_family == AF_INET) { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + if (!send_only) { + err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, + true); + if (!err) + mc->igmp_joined = true; + } + } + } else { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = -ENOTSUPP; + } + dev_put(ndev); + if (err || !mc->multicast.ib->rec.mtu) { + if (!err) + err = -EINVAL; + goto out2; + } + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &mc->multicast.ib->rec.port_gid); + work->id = id_priv; + work->mc = mc; + INIT_WORK(&work->work, iboe_mcast_work_handler); + kref_get(&mc->mcref); + queue_work(cma_wq, &work->work); + + return 0; + +out2: + kfree(mc->multicast.ib); +out1: + kfree(work); + return err; +} + +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + u8 join_state, void *context) +{ + struct rdma_id_private *id_priv; + struct cma_multicast *mc; + int ret; + + if (!id->device) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && + !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) + return -EINVAL; + + mc = kmalloc(sizeof *mc, GFP_KERNEL); + if (!mc) + return -ENOMEM; + + memcpy(&mc->addr, addr, rdma_addr_size(addr)); + mc->context = context; + mc->id_priv = id_priv; + mc->igmp_joined = false; + mc->join_state = join_state; + spin_lock(&id_priv->lock); + list_add(&mc->list, &id_priv->mc_list); + spin_unlock(&id_priv->lock); + + if (rdma_protocol_roce(id->device, id->port_num)) { + kref_init(&mc->mcref); + ret = cma_iboe_join_multicast(id_priv, mc); + } else if (rdma_cap_ib_mcast(id->device, id->port_num)) + ret = cma_join_ib_multicast(id_priv, mc); + else + ret = -ENOSYS; + + if (ret) { + spin_lock_irq(&id_priv->lock); + list_del(&mc->list); + spin_unlock_irq(&id_priv->lock); + kfree(mc); + } + return ret; +} 
+EXPORT_SYMBOL(rdma_join_multicast); + +void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv; + struct cma_multicast *mc; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irq(&id_priv->lock); + list_for_each_entry(mc, &id_priv->mc_list, list) { + if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { + list_del(&mc->list); + spin_unlock_irq(&id_priv->lock); + + if (id->qp) + ib_detach_mcast(id->qp, + &mc->multicast.ib->rec.mgid, + be16_to_cpu(mc->multicast.ib->rec.mlid)); + + BUG_ON(id_priv->cma_dev->device != id->device); + + if (rdma_cap_ib_mcast(id->device, id->port_num)) { + ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + } else if (rdma_protocol_roce(id->device, id->port_num)) { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id->route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + mc->igmp_joined = false; + } + kref_put(&mc->mcref, release_mc); + } + return; + } + } + spin_unlock_irq(&id_priv->lock); +} +EXPORT_SYMBOL(rdma_leave_multicast); + +static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr; + struct cma_ndev_work *work; + + dev_addr = &id_priv->id.route.addr.dev_addr; + + if ((dev_addr->bound_dev_if == ndev->ifindex) && + (net_eq(dev_net(ndev), dev_addr->net)) && + memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { + pr_info("RDMA CM addr change for ndev %s used by id %p\n", + ndev->name, &id_priv->id); + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + INIT_WORK(&work->work, cma_ndev_work_handler); + work->id = id_priv; + work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; + atomic_inc(&id_priv->refcount); + queue_work(cma_wq, &work->work); + } + + return 0; +} + +static int cma_netdev_callback(struct notifier_block *self, unsigned long event, + void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct cma_device *cma_dev; + struct rdma_id_private *id_priv; + int ret = NOTIFY_DONE; + + if (event != NETDEV_BONDING_FAILOVER) + return NOTIFY_DONE; + + if (!netif_is_bond_master(ndev)) + return NOTIFY_DONE; + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) + list_for_each_entry(id_priv, &cma_dev->id_list, list) { + ret = cma_netdev_change(ndev, id_priv); + if (ret) + goto out; + } + +out: + mutex_unlock(&lock); + return ret; +} + +static struct notifier_block cma_nb = { + .notifier_call = cma_netdev_callback +}; + +static void cma_add_one(struct ib_device *device) +{ + struct cma_device *cma_dev; + struct rdma_id_private *id_priv; + unsigned int i; + unsigned long supported_gids = 0; + + cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); + if (!cma_dev) + return; + + cma_dev->device = device; + cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, + sizeof(*cma_dev->default_gid_type), + GFP_KERNEL); + if (!cma_dev->default_gid_type) + goto free_cma_dev; + + cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, + sizeof(*cma_dev->default_roce_tos), + GFP_KERNEL); + if (!cma_dev->default_roce_tos) + goto free_gid_type; + + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + supported_gids = roce_gid_type_mask_support(device, i); + WARN_ON(!supported_gids); + if (supported_gids & (1 << 
CMA_PREFERRED_ROCE_GID_TYPE)) + cma_dev->default_gid_type[i - rdma_start_port(device)] = + CMA_PREFERRED_ROCE_GID_TYPE; + else + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); + cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; + } + + init_completion(&cma_dev->comp); + atomic_set(&cma_dev->refcount, 1); + INIT_LIST_HEAD(&cma_dev->id_list); + ib_set_client_data(device, &cma_client, cma_dev); + + mutex_lock(&lock); + list_add_tail(&cma_dev->list, &dev_list); + list_for_each_entry(id_priv, &listen_any_list, list) + cma_listen_on_dev(id_priv, cma_dev); + mutex_unlock(&lock); + + return; + +free_gid_type: + kfree(cma_dev->default_gid_type); + +free_cma_dev: + kfree(cma_dev); + + return; +} + +static int cma_remove_id_dev(struct rdma_id_private *id_priv) +{ + struct rdma_cm_event event; + enum rdma_cm_state state; + int ret = 0; + + /* Record that we want to remove the device */ + state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); + if (state == RDMA_CM_DESTROYING) + return 0; + + cma_cancel_operation(id_priv, state); + mutex_lock(&id_priv->handler_mutex); + + /* Check for destruction from another callback. */ + if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) + goto out; + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; + ret = id_priv->id.event_handler(&id_priv->id, &event); +out: + mutex_unlock(&id_priv->handler_mutex); + return ret; +} + +static void cma_process_remove(struct cma_device *cma_dev) +{ + struct rdma_id_private *id_priv; + int ret; + + mutex_lock(&lock); + while (!list_empty(&cma_dev->id_list)) { + id_priv = list_entry(cma_dev->id_list.next, + struct rdma_id_private, list); + + list_del(&id_priv->listen_list); + list_del_init(&id_priv->list); + atomic_inc(&id_priv->refcount); + mutex_unlock(&lock); + + ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv); + cma_deref_id(id_priv); + if (ret) + rdma_destroy_id(&id_priv->id); + + mutex_lock(&lock); + } + mutex_unlock(&lock); + + cma_deref_dev(cma_dev); + wait_for_completion(&cma_dev->comp); +} + +static void cma_remove_one(struct ib_device *device, void *client_data) +{ + struct cma_device *cma_dev = client_data; + + if (!cma_dev) + return; + + mutex_lock(&lock); + list_del(&cma_dev->list); + mutex_unlock(&lock); + + cma_process_remove(cma_dev); + kfree(cma_dev->default_roce_tos); + kfree(cma_dev->default_gid_type); + kfree(cma_dev); +} + +static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlmsghdr *nlh; + struct rdma_cm_id_stats *id_stats; + struct rdma_id_private *id_priv; + struct rdma_cm_id *id = NULL; + struct cma_device *cma_dev; + int i_dev = 0, i_id = 0; + + /* + * We export all of the IDs as a sequence of messages. Each + * ID gets its own netlink message. 
+ */ + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) { + if (i_dev < cb->args[0]) { + i_dev++; + continue; + } + + i_id = 0; + list_for_each_entry(id_priv, &cma_dev->id_list, list) { + if (i_id < cb->args[1]) { + i_id++; + continue; + } + + id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, + sizeof *id_stats, RDMA_NL_RDMA_CM, + RDMA_NL_RDMA_CM_ID_STATS, + NLM_F_MULTI); + if (!id_stats) + goto out; + + memset(id_stats, 0, sizeof *id_stats); + id = &id_priv->id; + id_stats->node_type = id->route.addr.dev_addr.dev_type; + id_stats->port_num = id->port_num; + id_stats->bound_dev_if = + id->route.addr.dev_addr.bound_dev_if; + + if (ibnl_put_attr(skb, nlh, + rdma_addr_size(cma_src_addr(id_priv)), + cma_src_addr(id_priv), + RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) + goto out; + if (ibnl_put_attr(skb, nlh, + rdma_addr_size(cma_dst_addr(id_priv)), + cma_dst_addr(id_priv), + RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) + goto out; + + id_stats->pid = task_pid_vnr(id_priv->res.task); + id_stats->port_space = id->ps; + id_stats->cm_state = id_priv->state; + id_stats->qp_num = id_priv->qp_num; + id_stats->qp_type = id->qp_type; + + i_id++; + nlmsg_end(skb, nlh); + } + + cb->args[1] = 0; + i_dev++; + } + +out: + mutex_unlock(&lock); + cb->args[0] = i_dev; + cb->args[1] = i_id; + + return skb->len; +} + +static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = { + [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, +}; + +static int cma_init_net(struct net *net) +{ + struct cma_pernet *pernet = cma_pernet(net); + + idr_init(&pernet->tcp_ps); + idr_init(&pernet->udp_ps); + idr_init(&pernet->ipoib_ps); + idr_init(&pernet->ib_ps); + + return 0; +} + +static void cma_exit_net(struct net *net) +{ + struct cma_pernet *pernet = cma_pernet(net); + + idr_destroy(&pernet->tcp_ps); + idr_destroy(&pernet->udp_ps); + idr_destroy(&pernet->ipoib_ps); + idr_destroy(&pernet->ib_ps); +} + +static struct pernet_operations cma_pernet_operations = { + .init = cma_init_net, + .exit = cma_exit_net, + .id = &cma_pernet_id, + .size = sizeof(struct cma_pernet), +}; + +static int __init cma_init(void) +{ + int ret; + + cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); + if (!cma_wq) + return -ENOMEM; + + ret = register_pernet_subsys(&cma_pernet_operations); + if (ret) + goto err_wq; + + ib_sa_register_client(&sa_client); + rdma_addr_register_client(&addr_client); + register_netdevice_notifier(&cma_nb); + + ret = ib_register_client(&cma_client); + if (ret) + goto err; + + rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); + cma_configfs_init(); + + return 0; + +err: + unregister_netdevice_notifier(&cma_nb); + rdma_addr_unregister_client(&addr_client); + ib_sa_unregister_client(&sa_client); +err_wq: + destroy_workqueue(cma_wq); + return ret; +} + +static void __exit cma_cleanup(void) +{ + cma_configfs_exit(); + rdma_nl_unregister(RDMA_NL_RDMA_CM); + ib_unregister_client(&cma_client); + unregister_netdevice_notifier(&cma_nb); + rdma_addr_unregister_client(&addr_client); + ib_sa_unregister_client(&sa_client); + unregister_pernet_subsys(&cma_pernet_operations); + destroy_workqueue(cma_wq); +} + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); + +module_init(cma_init); +module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c new file mode 100644 index 0000000..eee38b4 --- /dev/null +++ b/drivers/infiniband/core/cma_configfs.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "core_priv.h" + +struct cma_device; + +struct cma_dev_group; + +struct cma_dev_port_group { + unsigned int port_num; + struct cma_dev_group *cma_dev_group; + struct config_group group; +}; + +struct cma_dev_group { + char name[IB_DEVICE_NAME_MAX]; + struct config_group device_group; + struct config_group ports_group; + struct cma_dev_port_group *ports; +}; + +static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) +{ + struct config_group *group; + + if (!item) + return NULL; + + group = container_of(item, struct config_group, cg_item); + return container_of(group, struct cma_dev_port_group, group); +} + +static bool filter_by_name(struct ib_device *ib_dev, void *cookie) +{ + return !strcmp(ib_dev->name, cookie); +} + +static int cma_configfs_params_get(struct config_item *item, + struct cma_device **pcma_dev, + struct cma_dev_port_group **pgroup) +{ + struct cma_dev_port_group *group = to_dev_port_group(item); + struct cma_device *cma_dev; + + if (!group) + return -ENODEV; + + cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + group->cma_dev_group->name); + if (!cma_dev) + return -ENODEV; + + *pcma_dev = cma_dev; + *pgroup = group; + + return 0; +} + +static void cma_configfs_params_put(struct cma_device *cma_dev) +{ + cma_deref_dev(cma_dev); +} + +static ssize_t default_roce_mode_show(struct config_item *item, + char *buf) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type; + ssize_t ret; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + gid_type = cma_get_default_gid_type(cma_dev, group->port_num); + cma_configfs_params_put(cma_dev); + + if (gid_type < 0) + return gid_type; + + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_type)); +} + +static ssize_t default_roce_mode_store(struct config_item *item, + const char *buf, size_t count) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type = ib_cache_gid_parse_type_str(buf); + ssize_t ret; + + if (gid_type < 0) + return -EINVAL; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + ret = cma_set_default_gid_type(cma_dev, 
group->port_num, gid_type); + + cma_configfs_params_put(cma_dev); + + return !ret ? strnlen(buf, count) : ret; +} + +CONFIGFS_ATTR(, default_roce_mode); + +static ssize_t default_roce_tos_show(struct config_item *item, char *buf) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + ssize_t ret; + u8 tos; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + tos = cma_get_default_roce_tos(cma_dev, group->port_num); + cma_configfs_params_put(cma_dev); + + return sprintf(buf, "%u\n", tos); +} + +static ssize_t default_roce_tos_store(struct config_item *item, + const char *buf, size_t count) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + ssize_t ret; + u8 tos; + + ret = kstrtou8(buf, 0, &tos); + if (ret) + return ret; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + ret = cma_set_default_roce_tos(cma_dev, group->port_num, tos); + cma_configfs_params_put(cma_dev); + + return ret ? ret : strnlen(buf, count); +} + +CONFIGFS_ATTR(, default_roce_tos); + +static struct configfs_attribute *cma_configfs_attributes[] = { + &attr_default_roce_mode, + &attr_default_roce_tos, + NULL, +}; + +static const struct config_item_type cma_port_group_type = { + .ct_attrs = cma_configfs_attributes, + .ct_owner = THIS_MODULE +}; + +static int make_cma_ports(struct cma_dev_group *cma_dev_group, + struct cma_device *cma_dev) +{ + struct ib_device *ibdev; + unsigned int i; + unsigned int ports_num; + struct cma_dev_port_group *ports; + int err; + + ibdev = cma_get_ib_dev(cma_dev); + + if (!ibdev) + return -ENODEV; + + ports_num = ibdev->phys_port_cnt; + ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports), + GFP_KERNEL); + + if (!ports) { + err = -ENOMEM; + goto free; + } + + for (i = 0; i < ports_num; i++) { + char port_str[10]; + + ports[i].port_num = i + 1; + snprintf(port_str, sizeof(port_str), "%u", i + 1); + ports[i].cma_dev_group = cma_dev_group; + config_group_init_type_name(&ports[i].group, + port_str, + &cma_port_group_type); + configfs_add_default_group(&ports[i].group, + &cma_dev_group->ports_group); + + } + cma_dev_group->ports = ports; + + return 0; +free: + kfree(ports); + cma_dev_group->ports = NULL; + return err; +} + +static void release_cma_dev(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + device_group); + + kfree(cma_dev_group); +}; + +static void release_cma_ports_group(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + ports_group); + + kfree(cma_dev_group->ports); + cma_dev_group->ports = NULL; +}; + +static struct configfs_item_operations cma_ports_item_ops = { + .release = release_cma_ports_group +}; + +static const struct config_item_type cma_ports_group_type = { + .ct_item_ops = &cma_ports_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct configfs_item_operations cma_device_item_ops = { + .release = release_cma_dev +}; + +static const struct config_item_type cma_device_group_type = { + .ct_item_ops = &cma_device_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct config_group *make_cma_dev(struct config_group *group, + const char *name) +{ + int err = -ENODEV; + struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + (void *)name); + struct cma_dev_group 
*cma_dev_group = NULL; + + if (!cma_dev) + goto fail; + + cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL); + + if (!cma_dev_group) { + err = -ENOMEM; + goto fail; + } + + strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + + config_group_init_type_name(&cma_dev_group->ports_group, "ports", + &cma_ports_group_type); + + err = make_cma_ports(cma_dev_group, cma_dev); + if (err) + goto fail; + + config_group_init_type_name(&cma_dev_group->device_group, name, + &cma_device_group_type); + configfs_add_default_group(&cma_dev_group->ports_group, + &cma_dev_group->device_group); + + cma_deref_dev(cma_dev); + return &cma_dev_group->device_group; + +fail: + if (cma_dev) + cma_deref_dev(cma_dev); + kfree(cma_dev_group); + return ERR_PTR(err); +} + +static struct configfs_group_operations cma_subsys_group_ops = { + .make_group = make_cma_dev, +}; + +static const struct config_item_type cma_subsys_type = { + .ct_group_ops = &cma_subsys_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem cma_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "rdma_cm", + .ci_type = &cma_subsys_type, + }, + }, +}; + +int __init cma_configfs_init(void) +{ + config_group_init(&cma_subsys.su_group); + mutex_init(&cma_subsys.su_mutex); + return configfs_register_subsystem(&cma_subsys); +} + +void __exit cma_configfs_exit(void) +{ + configfs_unregister_subsystem(&cma_subsys); +} diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h new file mode 100644 index 0000000..194cfe7 --- /dev/null +++ b/drivers/infiniband/core/cma_priv.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _CMA_PRIV_H +#define _CMA_PRIV_H + +enum rdma_cm_state { + RDMA_CM_IDLE, + RDMA_CM_ADDR_QUERY, + RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_QUERY, + RDMA_CM_ROUTE_RESOLVED, + RDMA_CM_CONNECT, + RDMA_CM_DISCONNECT, + RDMA_CM_ADDR_BOUND, + RDMA_CM_LISTEN, + RDMA_CM_DEVICE_REMOVAL, + RDMA_CM_DESTROYING +}; + +struct rdma_id_private { + struct rdma_cm_id id; + + struct rdma_bind_list *bind_list; + struct hlist_node node; + struct list_head list; /* listen_any_list or cma_device.list */ + struct list_head listen_list; /* per device listens */ + struct cma_device *cma_dev; + struct list_head mc_list; + + int internal_id; + enum rdma_cm_state state; + spinlock_t lock; + struct mutex qp_mutex; + + struct completion comp; + atomic_t refcount; + struct mutex handler_mutex; + + int backlog; + int timeout_ms; + struct ib_sa_query *query; + int query_id; + union { + struct ib_cm_id *ib; + struct iw_cm_id *iw; + } cm_id; + + u32 seq_num; + u32 qkey; + u32 qp_num; + u32 options; + u8 srq; + u8 tos; + bool tos_set; + u8 reuseaddr; + u8 afonly; + enum ib_gid_type gid_type; + + /* + * Internal to RDMA/core, don't use in the drivers + */ + struct rdma_restrack_entry res; +}; +#endif /* _CMA_PRIV_H */ diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h new file mode 100644 index 0000000..54163a6 --- /dev/null +++ b/drivers/infiniband/core/core_priv.h @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _CORE_PRIV_H +#define _CORE_PRIV_H + +#include +#include +#include + +#include +#include +#include +#include +#include "mad_priv.h" + +/* Total number of ports combined across all struct ib_devices's */ +#define RDMA_MAX_PORTS 1024 + +struct pkey_index_qp_list { + struct list_head pkey_index_list; + u16 pkey_index; + /* Lock to hold while iterating the qp_list. 
*/ + spinlock_t qp_list_lock; + struct list_head qp_list; +}; + +#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) +int cma_configfs_init(void); +void cma_configfs_exit(void); +#else +static inline int cma_configfs_init(void) +{ + return 0; +} + +static inline void cma_configfs_exit(void) +{ +} +#endif +struct cma_device; +void cma_ref_dev(struct cma_device *cma_dev); +void cma_deref_dev(struct cma_device *cma_dev); +typedef bool (*cma_device_filter)(struct ib_device *, void *); +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie); +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port); +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type); +int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port); +int cma_set_default_roce_tos(struct cma_device *a_dev, unsigned int port, + u8 default_roce_tos); +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev); + +int ib_device_register_sysfs(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)); +void ib_device_unregister_sysfs(struct ib_device *device); + +void ib_cache_setup(void); +void ib_cache_cleanup(void); + +typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); + +typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); + +void ib_enum_roce_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie); +void ib_enum_all_roce_netdevs(roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie); + +typedef int (*nldev_callback)(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx); + +int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, + struct netlink_callback *cb); + +enum ib_cache_gid_default_mode { + IB_CACHE_GID_DEFAULT_MODE_SET, + IB_CACHE_GID_DEFAULT_MODE_DELETE +}; + +int ib_cache_gid_parse_type_str(const char *buf); + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, + struct net_device *ndev, + unsigned long gid_type_mask, + enum ib_cache_gid_default_mode mode); + +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *ndev); + +int roce_gid_mgmt_init(void); +void roce_gid_mgmt_cleanup(void); + +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); + +int ib_cache_setup_one(struct ib_device *device); +void ib_cache_cleanup_one(struct ib_device *device); +void ib_cache_release_one(struct ib_device *device); + +#ifdef CONFIG_CGROUP_RDMA +int ib_device_register_rdmacg(struct ib_device *device); +void ib_device_unregister_rdmacg(struct ib_device *device); + +int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index); + +void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index); +#else +static inline int ib_device_register_rdmacg(struct ib_device *device) +{ return 0; } + +static inline void 
ib_device_unregister_rdmacg(struct ib_device *device) +{ } + +static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ return 0; } + +static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ } +#endif + +static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, + struct net_device *upper) +{ + return netdev_has_upper_dev_all_rcu(dev, upper); +} + +int addr_init(void); +void addr_cleanup(void); + +int ib_mad_init(void); +void ib_mad_cleanup(void); + +int ib_sa_init(void); +void ib_sa_cleanup(void); + +int rdma_nl_init(void); +void rdma_nl_exit(void); + +int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); +int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); +int ib_nl_handle_ip_res_resp(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); + +int ib_get_cached_subnet_prefix(struct ib_device *device, + u8 port_num, + u64 *sn_pfx); + +#ifdef CONFIG_SECURITY_INFINIBAND +void ib_security_destroy_port_pkey_list(struct ib_device *device); + +void ib_security_cache_change(struct ib_device *device, + u8 port_num, + u64 subnet_prefix); + +int ib_security_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata); + +int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev); +void ib_destroy_qp_security_begin(struct ib_qp_security *sec); +void ib_destroy_qp_security_abort(struct ib_qp_security *sec); +void ib_destroy_qp_security_end(struct ib_qp_security *sec); +int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev); +void ib_close_shared_qp_security(struct ib_qp_security *sec); +int ib_mad_agent_security_setup(struct ib_mad_agent *agent, + enum ib_qp_type qp_type); +void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); +int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); +#else +static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) +{ +} + +static inline void ib_security_cache_change(struct ib_device *device, + u8 port_num, + u64 subnet_prefix) +{ +} + +static inline int ib_security_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata) +{ + return qp->device->modify_qp(qp->real_qp, + qp_attr, + qp_attr_mask, + udata); +} + +static inline int ib_create_qp_security(struct ib_qp *qp, + struct ib_device *dev) +{ + return 0; +} + +static inline void ib_destroy_qp_security_begin(struct ib_qp_security *sec) +{ +} + +static inline void ib_destroy_qp_security_abort(struct ib_qp_security *sec) +{ +} + +static inline void ib_destroy_qp_security_end(struct ib_qp_security *sec) +{ +} + +static inline int ib_open_shared_qp_security(struct ib_qp *qp, + struct ib_device *dev) +{ + return 0; +} + +static inline void ib_close_shared_qp_security(struct ib_qp_security *sec) +{ +} + +static inline int ib_mad_agent_security_setup(struct ib_mad_agent *agent, + enum ib_qp_type qp_type) +{ + return 0; +} + +static inline void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) +{ +} + +static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, + u16 pkey_index) +{ + return 0; +} +#endif + +struct ib_device *ib_device_get_by_index(u32 ifindex); +/* RDMA device netlink */ +void nldev_init(void); +void 
nldev_exit(void); + +static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, + struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata, + struct ib_uobject *uobj) +{ + struct ib_qp *qp; + + if (!dev->create_qp) + return ERR_PTR(-EOPNOTSUPP); + + qp = dev->create_qp(pd, attr, udata); + if (IS_ERR(qp)) + return qp; + + qp->device = dev; + qp->pd = pd; + qp->uobject = uobj; + /* + * We don't track XRC QPs for now, because they don't have PD + * and more importantly they are created internaly by driver, + * see mlx5 create_dev_resources() as an example. + */ + if (attr->qp_type < IB_QPT_XRC_INI) { + qp->res.type = RDMA_RESTRACK_QP; + rdma_restrack_add(&qp->res); + } else + qp->res.valid = false; + + return qp; +} + +struct rdma_dev_addr; +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr); + +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, const struct net_device *ndev, + int *hoplimit); + +#endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c new file mode 100644 index 0000000..af5ad6a --- /dev/null +++ b/drivers/infiniband/core/cq.c @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2015 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include +#include +#include +#include + +/* # of WCs to poll for with a single call to ib_poll_cq */ +#define IB_POLL_BATCH 16 +#define IB_POLL_BATCH_DIRECT 8 + +/* # of WCs to iterate over before yielding */ +#define IB_POLL_BUDGET_IRQ 256 +#define IB_POLL_BUDGET_WORKQUEUE 65536 + +#define IB_POLL_FLAGS \ + (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) + +static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, + int batch) +{ + int i, n, completed = 0; + + /* + * budget might be (-1) if the caller does not + * want to bound this call, thus we need unsigned + * minimum here. + */ + while ((n = ib_poll_cq(cq, min_t(u32, batch, + budget - completed), wcs)) > 0) { + for (i = 0; i < n; i++) { + struct ib_wc *wc = &wcs[i]; + + if (wc->wr_cqe) + wc->wr_cqe->done(cq, wc); + else + WARN_ON_ONCE(wc->status == IB_WC_SUCCESS); + } + + completed += n; + + if (n != batch || (budget != -1 && completed >= budget)) + break; + } + + return completed; +} + +/** + * ib_process_direct_cq - process a CQ in caller context + * @cq: CQ to process + * @budget: number of CQEs to poll for + * + * This function is used to process all outstanding CQ entries. + * It does not offload CQ processing to a different context and does + * not ask for completion interrupts from the HCA. + * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger + * concurrent processing. + * + * Note: do not pass -1 as %budget unless it is guaranteed that the number + * of completions that will be processed is small. 
+ */ +int ib_process_cq_direct(struct ib_cq *cq, int budget) +{ + struct ib_wc wcs[IB_POLL_BATCH_DIRECT]; + + return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT); +} +EXPORT_SYMBOL(ib_process_cq_direct); + +static void ib_cq_completion_direct(struct ib_cq *cq, void *private) +{ + WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq); +} + +static int ib_poll_handler(struct irq_poll *iop, int budget) +{ + struct ib_cq *cq = container_of(iop, struct ib_cq, iop); + int completed; + + completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); + if (completed < budget) { + irq_poll_complete(&cq->iop); + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) + irq_poll_sched(&cq->iop); + } + + return completed; +} + +static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) +{ + irq_poll_sched(&cq->iop); +} + +static void ib_cq_poll_work(struct work_struct *work) +{ + struct ib_cq *cq = container_of(work, struct ib_cq, work); + int completed; + + completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc, + IB_POLL_BATCH); + if (completed >= IB_POLL_BUDGET_WORKQUEUE || + ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) + queue_work(ib_comp_wq, &cq->work); +} + +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) +{ + queue_work(ib_comp_wq, &cq->work); +} + +/** + * __ib_alloc_cq - allocate a completion queue + * @dev: device to allocate the CQ for + * @private: driver private data, accessible from cq->cq_context + * @nr_cqe: number of CQEs to allocate + * @comp_vector: HCA completion vectors for this CQ + * @poll_ctx: context to poll the CQ from. + * @caller: module owner name. + * + * This is the proper interface to allocate a CQ for in-kernel users. A + * CQ allocated with this interface will automatically be polled from the + * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id + * to use this CQ abstraction. + */ +struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, + enum ib_poll_context poll_ctx, const char *caller) +{ + struct ib_cq_init_attr cq_attr = { + .cqe = nr_cqe, + .comp_vector = comp_vector, + }; + struct ib_cq *cq; + int ret = -ENOMEM; + + cq = dev->create_cq(dev, &cq_attr, NULL, NULL); + if (IS_ERR(cq)) + return cq; + + cq->device = dev; + cq->uobject = NULL; + cq->event_handler = NULL; + cq->cq_context = private; + cq->poll_ctx = poll_ctx; + atomic_set(&cq->usecnt, 0); + + cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); + if (!cq->wc) + goto out_destroy_cq; + + cq->res.type = RDMA_RESTRACK_CQ; + cq->res.kern_name = caller; + rdma_restrack_add(&cq->res); + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + cq->comp_handler = ib_cq_completion_direct; + break; + case IB_POLL_SOFTIRQ: + cq->comp_handler = ib_cq_completion_softirq; + + irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + case IB_POLL_WORKQUEUE: + cq->comp_handler = ib_cq_completion_workqueue; + INIT_WORK(&cq->work, ib_cq_poll_work); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + default: + ret = -EINVAL; + goto out_free_wc; + } + + return cq; + +out_free_wc: + kfree(cq->wc); + rdma_restrack_del(&cq->res); +out_destroy_cq: + cq->device->destroy_cq(cq); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(__ib_alloc_cq); + +/** + * ib_free_cq - free a completion queue + * @cq: completion queue to free. 
+ */ +void ib_free_cq(struct ib_cq *cq) +{ + int ret; + + if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) + return; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + break; + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + cancel_work_sync(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + + kfree(cq->wc); + rdma_restrack_del(&cq->res); + ret = cq->device->destroy_cq(cq); + WARN_ON_ONCE(ret); +} +EXPORT_SYMBOL(ib_free_cq); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c new file mode 100644 index 0000000..ea9fbcf --- /dev/null +++ b/drivers/infiniband/core/device.c @@ -0,0 +1,1268 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core_priv.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("core kernel InfiniBand API"); +MODULE_LICENSE("Dual BSD/GPL"); + +struct ib_client_data { + struct list_head list; + struct ib_client *client; + void * data; + /* The device or client is going down. Do not call client or device + * callbacks other than remove(). */ + bool going_down; +}; + +struct workqueue_struct *ib_comp_wq; +struct workqueue_struct *ib_wq; +EXPORT_SYMBOL_GPL(ib_wq); + +/* The device_list and client_list contain devices and clients after their + * registration has completed, and the devices and clients are removed + * during unregistration. */ +static LIST_HEAD(device_list); +static LIST_HEAD(client_list); + +/* + * device_mutex and lists_rwsem protect access to both device_list and + * client_list. device_mutex protects writer access by device and client + * registration / de-registration. lists_rwsem protects reader access to + * these lists. Iterators of these lists must lock it for read, while updates + * to the lists must be done with a write lock. A special case is when the + * device_mutex is locked. In this case locking the lists for read access is + * not necessary as the device_mutex implies it. 
+ * + * lists_rwsem also protects access to the client data list. + */ +static DEFINE_MUTEX(device_mutex); +static DECLARE_RWSEM(lists_rwsem); + +static int ib_security_change(struct notifier_block *nb, unsigned long event, + void *lsm_data); +static void ib_policy_change_task(struct work_struct *work); +static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); + +static struct notifier_block ibdev_lsm_nb = { + .notifier_call = ib_security_change, +}; + +static int ib_device_check_mandatory(struct ib_device *device) +{ +#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x } + static const struct { + size_t offset; + char *name; + } mandatory_table[] = { + IB_MANDATORY_FUNC(query_device), + IB_MANDATORY_FUNC(query_port), + IB_MANDATORY_FUNC(query_pkey), + IB_MANDATORY_FUNC(alloc_pd), + IB_MANDATORY_FUNC(dealloc_pd), + IB_MANDATORY_FUNC(create_ah), + IB_MANDATORY_FUNC(destroy_ah), + IB_MANDATORY_FUNC(create_qp), + IB_MANDATORY_FUNC(modify_qp), + IB_MANDATORY_FUNC(destroy_qp), + IB_MANDATORY_FUNC(post_send), + IB_MANDATORY_FUNC(post_recv), + IB_MANDATORY_FUNC(create_cq), + IB_MANDATORY_FUNC(destroy_cq), + IB_MANDATORY_FUNC(poll_cq), + IB_MANDATORY_FUNC(req_notify_cq), + IB_MANDATORY_FUNC(get_dma_mr), + IB_MANDATORY_FUNC(dereg_mr), + IB_MANDATORY_FUNC(get_port_immutable) + }; + int i; + + for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { + if (!*(void **) ((void *) device + mandatory_table[i].offset)) { + pr_warn("Device %s is missing mandatory function %s\n", + device->name, mandatory_table[i].name); + return -EINVAL; + } + } + + return 0; +} + +static struct ib_device *__ib_device_get_by_index(u32 index) +{ + struct ib_device *device; + + list_for_each_entry(device, &device_list, core_list) + if (device->index == index) + return device; + + return NULL; +} + +/* + * Caller is responsible to return refrerence count by calling put_device() + */ +struct ib_device *ib_device_get_by_index(u32 index) +{ + struct ib_device *device; + + down_read(&lists_rwsem); + device = __ib_device_get_by_index(index); + if (device) + get_device(&device->dev); + + up_read(&lists_rwsem); + return device; +} + +static struct ib_device *__ib_device_get_by_name(const char *name) +{ + struct ib_device *device; + + list_for_each_entry(device, &device_list, core_list) + if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) + return device; + + return NULL; +} + +static int alloc_name(char *name) +{ + unsigned long *inuse; + char buf[IB_DEVICE_NAME_MAX]; + struct ib_device *device; + int i; + + inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); + if (!inuse) + return -ENOMEM; + + list_for_each_entry(device, &device_list, core_list) { + if (!sscanf(device->name, name, &i)) + continue; + if (i < 0 || i >= PAGE_SIZE * 8) + continue; + snprintf(buf, sizeof buf, name, i); + if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) + set_bit(i, inuse); + } + + i = find_first_zero_bit(inuse, PAGE_SIZE * 8); + free_page((unsigned long) inuse); + snprintf(buf, sizeof buf, name, i); + + if (__ib_device_get_by_name(buf)) + return -ENFILE; + + strlcpy(name, buf, IB_DEVICE_NAME_MAX); + return 0; +} + +static void ib_device_release(struct device *device) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + WARN_ON(dev->reg_state == IB_DEV_REGISTERED); + if (dev->reg_state == IB_DEV_UNREGISTERED) { + /* + * In IB_DEV_UNINITIALIZED state, cache or port table + * is not even created. Free cache and port table only when + * device reaches UNREGISTERED state. 
+ */ + ib_cache_release_one(dev); + kfree(dev->port_immutable); + } + kfree(dev); +} + +static int ib_device_uevent(struct device *device, + struct kobj_uevent_env *env) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + if (add_uevent_var(env, "NAME=%s", dev->name)) + return -ENOMEM; + + /* + * It would be nice to pass the node GUID with the event... + */ + + return 0; +} + +static struct class ib_class = { + .name = "infiniband", + .dev_release = ib_device_release, + .dev_uevent = ib_device_uevent, +}; + +/** + * ib_alloc_device - allocate an IB device struct + * @size:size of structure to allocate + * + * Low-level drivers should use ib_alloc_device() to allocate &struct + * ib_device. @size is the size of the structure to be allocated, + * including any private data used by the low-level driver. + * ib_dealloc_device() must be used to free structures allocated with + * ib_alloc_device(). + */ +struct ib_device *ib_alloc_device(size_t size) +{ + struct ib_device *device; + + if (WARN_ON(size < sizeof(struct ib_device))) + return NULL; + + device = kzalloc(size, GFP_KERNEL); + if (!device) + return NULL; + + rdma_restrack_init(&device->res); + + device->dev.class = &ib_class; + device_initialize(&device->dev); + + dev_set_drvdata(&device->dev, device); + + INIT_LIST_HEAD(&device->event_handler_list); + spin_lock_init(&device->event_handler_lock); + spin_lock_init(&device->client_data_lock); + INIT_LIST_HEAD(&device->client_data_list); + INIT_LIST_HEAD(&device->port_list); + + return device; +} +EXPORT_SYMBOL(ib_alloc_device); + +/** + * ib_dealloc_device - free an IB device struct + * @device:structure to free + * + * Free a structure allocated with ib_alloc_device(). + */ +void ib_dealloc_device(struct ib_device *device) +{ + WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && + device->reg_state != IB_DEV_UNINITIALIZED); + rdma_restrack_clean(&device->res); + put_device(&device->dev); +} +EXPORT_SYMBOL(ib_dealloc_device); + +static int add_client_context(struct ib_device *device, struct ib_client *client) +{ + struct ib_client_data *context; + unsigned long flags; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) + return -ENOMEM; + + context->client = client; + context->data = NULL; + context->going_down = false; + + down_write(&lists_rwsem); + spin_lock_irqsave(&device->client_data_lock, flags); + list_add(&context->list, &device->client_data_list); + spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); + + return 0; +} + +static int verify_immutable(const struct ib_device *dev, u8 port) +{ + return WARN_ON(!rdma_cap_ib_mad(dev, port) && + rdma_max_mad_size(dev, port) != 0); +} + +static int read_port_immutable(struct ib_device *device) +{ + int ret; + u8 start_port = rdma_start_port(device); + u8 end_port = rdma_end_port(device); + u8 port; + + /** + * device->port_immutable is indexed directly by the port number to make + * access to this data as efficient as possible. + * + * Therefore port_immutable is declared as a 1 based array with + * potential empty slots at the beginning. 
+ */ + device->port_immutable = kzalloc(sizeof(*device->port_immutable) + * (end_port + 1), + GFP_KERNEL); + if (!device->port_immutable) + return -ENOMEM; + + for (port = start_port; port <= end_port; ++port) { + ret = device->get_port_immutable(device, port, + &device->port_immutable[port]); + if (ret) + return ret; + + if (verify_immutable(device, port)) + return -EINVAL; + } + return 0; +} + +void ib_get_device_fw_str(struct ib_device *dev, char *str) +{ + if (dev->get_dev_fw_str) + dev->get_dev_fw_str(dev, str); + else + str[0] = '\0'; +} +EXPORT_SYMBOL(ib_get_device_fw_str); + +static int setup_port_pkey_list(struct ib_device *device) +{ + int i; + + /** + * device->port_pkey_list is indexed directly by the port number, + * Therefore it is declared as a 1 based array with potential empty + * slots at the beginning. + */ + device->port_pkey_list = kcalloc(rdma_end_port(device) + 1, + sizeof(*device->port_pkey_list), + GFP_KERNEL); + + if (!device->port_pkey_list) + return -ENOMEM; + + for (i = 0; i < (rdma_end_port(device) + 1); i++) { + spin_lock_init(&device->port_pkey_list[i].list_lock); + INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list); + } + + return 0; +} + +static void ib_policy_change_task(struct work_struct *work) +{ + struct ib_device *dev; + + down_read(&lists_rwsem); + list_for_each_entry(dev, &device_list, core_list) { + int i; + + for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) { + u64 sp; + int ret = ib_get_cached_subnet_prefix(dev, + i, + &sp); + + WARN_ONCE(ret, + "ib_get_cached_subnet_prefix err: %d, this should never happen here\n", + ret); + if (!ret) + ib_security_cache_change(dev, i, sp); + } + } + up_read(&lists_rwsem); +} + +static int ib_security_change(struct notifier_block *nb, unsigned long event, + void *lsm_data) +{ + if (event != LSM_POLICY_CHANGE) + return NOTIFY_DONE; + + schedule_work(&ib_policy_change_work); + + return NOTIFY_OK; +} + +/** + * __dev_new_index - allocate an device index + * + * Returns a suitable unique value for a new device interface + * number. It assumes that there are less than 2^32-1 ib devices + * will be present in the system. + */ +static u32 __dev_new_index(void) +{ + /* + * The device index to allow stable naming. + * Similar to struct net -> ifindex. + */ + static u32 index; + + for (;;) { + if (!(++index)) + index = 1; + + if (!__ib_device_get_by_index(index)) + return index; + } +} + +/** + * ib_register_device - Register an IB device with IB core + * @device:Device to register + * + * Low-level drivers use ib_register_device() to register their + * devices with the IB core. All registered clients will receive a + * callback for each device that is added. @device must be allocated + * with ib_alloc_device(). + */ +int ib_register_device(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) +{ + int ret; + struct ib_client *client; + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; + struct device *parent = device->dev.parent; + + WARN_ON_ONCE(device->dma_device); + if (device->dev.dma_ops) { + /* + * The caller provided custom DMA operations. Copy the + * DMA-related fields that are used by e.g. dma_alloc_coherent() + * into device->dev. 
+ */ + device->dma_device = &device->dev; + if (!device->dev.dma_mask) { + if (parent) + device->dev.dma_mask = parent->dma_mask; + else + WARN_ON_ONCE(true); + } + if (!device->dev.coherent_dma_mask) { + if (parent) + device->dev.coherent_dma_mask = + parent->coherent_dma_mask; + else + WARN_ON_ONCE(true); + } + } else { + /* + * The caller did not provide custom DMA operations. Use the + * DMA mapping operations of the parent device. + */ + WARN_ON_ONCE(!parent); + device->dma_device = parent; + } + + mutex_lock(&device_mutex); + + if (strchr(device->name, '%')) { + ret = alloc_name(device->name); + if (ret) + goto out; + } + + if (ib_device_check_mandatory(device)) { + ret = -EINVAL; + goto out; + } + + ret = read_port_immutable(device); + if (ret) { + pr_warn("Couldn't create per port immutable data %s\n", + device->name); + goto out; + } + + ret = setup_port_pkey_list(device); + if (ret) { + pr_warn("Couldn't create per port_pkey_list\n"); + goto out; + } + + ret = ib_cache_setup_one(device); + if (ret) { + pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); + goto port_cleanup; + } + + ret = ib_device_register_rdmacg(device); + if (ret) { + pr_warn("Couldn't register device with rdma cgroup\n"); + goto cache_cleanup; + } + + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); + if (ret) { + pr_warn("Couldn't query the device attributes\n"); + goto cg_cleanup; + } + + ret = ib_device_register_sysfs(device, port_callback); + if (ret) { + pr_warn("Couldn't register device %s with driver model\n", + device->name); + goto cg_cleanup; + } + + device->reg_state = IB_DEV_REGISTERED; + + list_for_each_entry(client, &client_list, list) + if (!add_client_context(device, client) && client->add) + client->add(device); + + device->index = __dev_new_index(); + down_write(&lists_rwsem); + list_add_tail(&device->core_list, &device_list); + up_write(&lists_rwsem); + mutex_unlock(&device_mutex); + return 0; + +cg_cleanup: + ib_device_unregister_rdmacg(device); +cache_cleanup: + ib_cache_cleanup_one(device); + ib_cache_release_one(device); +port_cleanup: + kfree(device->port_immutable); +out: + mutex_unlock(&device_mutex); + return ret; +} +EXPORT_SYMBOL(ib_register_device); + +/** + * ib_unregister_device - Unregister an IB device + * @device:Device to unregister + * + * Unregister an IB device. All clients will receive a remove callback. 
+ */ +void ib_unregister_device(struct ib_device *device) +{ + struct ib_client_data *context, *tmp; + unsigned long flags; + + mutex_lock(&device_mutex); + + down_write(&lists_rwsem); + list_del(&device->core_list); + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + context->going_down = true; + spin_unlock_irqrestore(&device->client_data_lock, flags); + downgrade_write(&lists_rwsem); + + list_for_each_entry_safe(context, tmp, &device->client_data_list, + list) { + if (context->client->remove) + context->client->remove(device, context->data); + } + up_read(&lists_rwsem); + + ib_device_unregister_rdmacg(device); + ib_device_unregister_sysfs(device); + + mutex_unlock(&device_mutex); + + ib_cache_cleanup_one(device); + + ib_security_destroy_port_pkey_list(device); + kfree(device->port_pkey_list); + + down_write(&lists_rwsem); + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + kfree(context); + spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); + + device->reg_state = IB_DEV_UNREGISTERED; +} +EXPORT_SYMBOL(ib_unregister_device); + +/** + * ib_register_client - Register an IB client + * @client:Client to register + * + * Upper level users of the IB drivers can use ib_register_client() to + * register callbacks for IB device addition and removal. When an IB + * device is added, each registered client's add method will be called + * (in the order the clients were registered), and when a device is + * removed, each client's remove method will be called (in the reverse + * order that clients were registered). In addition, when + * ib_register_client() is called, the client will receive an add + * callback for all devices already registered. + */ +int ib_register_client(struct ib_client *client) +{ + struct ib_device *device; + + mutex_lock(&device_mutex); + + list_for_each_entry(device, &device_list, core_list) + if (!add_client_context(device, client) && client->add) + client->add(device); + + down_write(&lists_rwsem); + list_add_tail(&client->list, &client_list); + up_write(&lists_rwsem); + + mutex_unlock(&device_mutex); + + return 0; +} +EXPORT_SYMBOL(ib_register_client); + +/** + * ib_unregister_client - Unregister an IB client + * @client:Client to unregister + * + * Upper level users use ib_unregister_client() to remove their client + * registration. When ib_unregister_client() is called, the client + * will receive a remove callback for each IB device still registered. + */ +void ib_unregister_client(struct ib_client *client) +{ + struct ib_client_data *context, *tmp; + struct ib_device *device; + unsigned long flags; + + mutex_lock(&device_mutex); + + down_write(&lists_rwsem); + list_del(&client->list); + up_write(&lists_rwsem); + + list_for_each_entry(device, &device_list, core_list) { + struct ib_client_data *found_context = NULL; + + down_write(&lists_rwsem); + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + if (context->client == client) { + context->going_down = true; + found_context = context; + break; + } + spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); + + if (client->remove) + client->remove(device, found_context ? 
+ found_context->data : NULL); + + if (!found_context) { + pr_warn("No client context found for %s/%s\n", + device->name, client->name); + continue; + } + + down_write(&lists_rwsem); + spin_lock_irqsave(&device->client_data_lock, flags); + list_del(&found_context->list); + kfree(found_context); + spin_unlock_irqrestore(&device->client_data_lock, flags); + up_write(&lists_rwsem); + } + + mutex_unlock(&device_mutex); +} +EXPORT_SYMBOL(ib_unregister_client); + +/** + * ib_get_client_data - Get IB client context + * @device:Device to get context for + * @client:Client to get context for + * + * ib_get_client_data() returns client context set with + * ib_set_client_data(). + */ +void *ib_get_client_data(struct ib_device *device, struct ib_client *client) +{ + struct ib_client_data *context; + void *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry(context, &device->client_data_list, list) + if (context->client == client) { + ret = context->data; + break; + } + spin_unlock_irqrestore(&device->client_data_lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_client_data); + +/** + * ib_set_client_data - Set IB client context + * @device:Device to set context for + * @client:Client to set context for + * @data:Context to set + * + * ib_set_client_data() sets client context that can be retrieved with + * ib_get_client_data(). + */ +void ib_set_client_data(struct ib_device *device, struct ib_client *client, + void *data) +{ + struct ib_client_data *context; + unsigned long flags; + + spin_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry(context, &device->client_data_list, list) + if (context->client == client) { + context->data = data; + goto out; + } + + pr_warn("No client context found for %s/%s\n", + device->name, client->name); + +out: + spin_unlock_irqrestore(&device->client_data_lock, flags); +} +EXPORT_SYMBOL(ib_set_client_data); + +/** + * ib_register_event_handler - Register an IB event handler + * @event_handler:Handler to register + * + * ib_register_event_handler() registers an event handler that will be + * called back when asynchronous IB events occur (as defined in + * chapter 11 of the InfiniBand Architecture Specification). This + * callback may occur in interrupt context. + */ +void ib_register_event_handler(struct ib_event_handler *event_handler) +{ + unsigned long flags; + + spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); + list_add_tail(&event_handler->list, + &event_handler->device->event_handler_list); + spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); +} +EXPORT_SYMBOL(ib_register_event_handler); + +/** + * ib_unregister_event_handler - Unregister an event handler + * @event_handler:Handler to unregister + * + * Unregister an event handler registered with + * ib_register_event_handler(). + */ +void ib_unregister_event_handler(struct ib_event_handler *event_handler) +{ + unsigned long flags; + + spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); + list_del(&event_handler->list); + spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); +} +EXPORT_SYMBOL(ib_unregister_event_handler); + +/** + * ib_dispatch_event - Dispatch an asynchronous event + * @event:Event to dispatch + * + * Low-level drivers must call ib_dispatch_event() to dispatch the + * event to all registered event handlers when an asynchronous event + * occurs. 
+ */ +void ib_dispatch_event(struct ib_event *event) +{ + unsigned long flags; + struct ib_event_handler *handler; + + spin_lock_irqsave(&event->device->event_handler_lock, flags); + + list_for_each_entry(handler, &event->device->event_handler_list, list) + handler->handler(handler, event); + + spin_unlock_irqrestore(&event->device->event_handler_lock, flags); +} +EXPORT_SYMBOL(ib_dispatch_event); + +/** + * ib_query_port - Query IB port attributes + * @device:Device to query + * @port_num:Port number to query + * @port_attr:Port attributes + * + * ib_query_port() returns the attributes of a port through the + * @port_attr pointer. + */ +int ib_query_port(struct ib_device *device, + u8 port_num, + struct ib_port_attr *port_attr) +{ + union ib_gid gid; + int err; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + memset(port_attr, 0, sizeof(*port_attr)); + err = device->query_port(device, port_num, port_attr); + if (err || port_attr->subnet_prefix) + return err; + + if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) + return 0; + + err = device->query_gid(device, port_num, 0, &gid); + if (err) + return err; + + port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); + return 0; +} +EXPORT_SYMBOL(ib_query_port); + +/** + * ib_query_gid - Get GID table entry + * @device:Device to query + * @port_num:Port number to query + * @index:GID table index to query + * @gid:Returned GID + * @attr: Returned GID attributes related to this GID index (only in RoCE). + * NULL means ignore. + * + * ib_query_gid() fetches the specified GID table entry from the cache. + */ +int ib_query_gid(struct ib_device *device, + u8 port_num, int index, union ib_gid *gid, + struct ib_gid_attr *attr) +{ + return ib_get_cached_gid(device, port_num, index, gid, attr); +} +EXPORT_SYMBOL(ib_query_gid); + +/** + * ib_enum_roce_netdev - enumerate all RoCE ports + * @ib_dev : IB device we want to query + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all of the physical RoCE ports of ib_dev + * which are related to netdevice and calls callback() on each + * device for which filter() function returns non zero. + */ +void ib_enum_roce_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie) +{ + u8 port; + + for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); + port++) + if (rdma_protocol_roce(ib_dev, port)) { + struct net_device *idev = NULL; + + if (ib_dev->get_netdev) + idev = ib_dev->get_netdev(ib_dev, port); + + if (idev && + idev->reg_state >= NETREG_UNREGISTERED) { + dev_put(idev); + idev = NULL; + } + + if (filter(ib_dev, port, idev, filter_cookie)) + cb(ib_dev, port, idev, cookie); + + if (idev) + dev_put(idev); + } +} + +/** + * ib_enum_all_roce_netdevs - enumerate all RoCE devices + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all RoCE devices' physical ports which are related + * to netdevices and calls callback() on each device for which + * filter() function returns non zero. 
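+ *
+ * For illustration, a filter/callback pair is expected to look like the
+ * following sketch (hypothetical names; the in-tree callers live in
+ * roce_gid_mgmt.c):
+ *
+ *    static int match_ndev_filter(struct ib_device *ib_dev, u8 port,
+ *                                 struct net_device *idev, void *cookie)
+ *    {
+ *        return idev == (struct net_device *)cookie;
+ *    }
+ *
+ *    static void update_gids_cb(struct ib_device *ib_dev, u8 port,
+ *                               struct net_device *idev, void *cookie)
+ *    {
+ *        /* refresh the GID table for this (device, port, netdev) */
+ *    }
+ *
+ *    ib_enum_all_roce_netdevs(match_ndev_filter, ndev, update_gids_cb, NULL);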
+ */
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ struct ib_device *dev;
+
+ down_read(&lists_rwsem);
+ list_for_each_entry(dev, &device_list, core_list)
+ ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
+ up_read(&lists_rwsem);
+}
+
+/**
+ * ib_enum_all_devs - enumerate all ib_devices
+ * @cb: Callback to call for each found ib_device
+ *
+ * Enumerates all ib_devices and calls callback() on each device.
+ */
+int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct ib_device *dev;
+ unsigned int idx = 0;
+ int ret = 0;
+
+ down_read(&lists_rwsem);
+ list_for_each_entry(dev, &device_list, core_list) {
+ ret = nldev_cb(dev, skb, cb, idx);
+ if (ret)
+ break;
+ idx++;
+ }
+
+ up_read(&lists_rwsem);
+ return ret;
+}
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey)
+{
+ return device->query_pkey(device, port_num, index, pkey);
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ if (!device->modify_device)
+ return -ENOSYS;
+
+ return device->modify_device(device, device_modify_mask,
+ device_modify);
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ * to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify)
+{
+ int rc;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ if (device->modify_port)
+ rc = device->modify_port(device, port_num, port_modify_mask,
+ port_modify);
+ else
+ rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
+ return rc;
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ * a specified GID value occurs. It searches only the IB link layer.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found. This
+ * parameter may be NULL.
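+ *
+ * Usage sketch (illustrative; "device" and "sgid" are caller-provided):
+ *
+ *    u8 port;
+ *    u16 index;
+ *
+ *    if (!ib_find_gid(device, &sgid, &port, &index))
+ *        pr_debug("GID found at port %u index %u\n", port, index);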
+ */ +int ib_find_gid(struct ib_device *device, union ib_gid *gid, + u8 *port_num, u16 *index) +{ + union ib_gid tmp_gid; + int ret, port, i; + + for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { + if (!rdma_protocol_ib(device, port)) + continue; + + for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { + ret = ib_query_gid(device, port, i, &tmp_gid, NULL); + if (ret) + return ret; + if (!memcmp(&tmp_gid, gid, sizeof *gid)) { + *port_num = port; + if (index) + *index = i; + return 0; + } + } + } + + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_gid); + +/** + * ib_find_pkey - Returns the PKey table index where a specified + * PKey value occurs. + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the PKey table where the PKey was found. + */ +int ib_find_pkey(struct ib_device *device, + u8 port_num, u16 pkey, u16 *index) +{ + int ret, i; + u16 tmp_pkey; + int partial_ix = -1; + + for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) { + ret = ib_query_pkey(device, port_num, i, &tmp_pkey); + if (ret) + return ret; + if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { + /* if there is full-member pkey take it.*/ + if (tmp_pkey & 0x8000) { + *index = i; + return 0; + } + if (partial_ix < 0) + partial_ix = i; + } + } + + /*no full-member, if exists take the limited*/ + if (partial_ix >= 0) { + *index = partial_ix; + return 0; + } + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_pkey); + +/** + * ib_get_net_dev_by_params() - Return the appropriate net_dev + * for a received CM request + * @dev: An RDMA device on which the request has been received. + * @port: Port number on the RDMA device. + * @pkey: The Pkey the request came on. + * @gid: A GID that the net_dev uses to communicate. + * @addr: Contains the IP address that the request specified as its + * destination. 
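+ *
+ * A rough sketch of the expected calling pattern (hypothetical variables;
+ * clients that implement get_net_dev_by_params return a held reference,
+ * so the caller drops it with dev_put() when done):
+ *
+ *    struct net_device *ndev;
+ *
+ *    ndev = ib_get_net_dev_by_params(dev, port, pkey, &gid,
+ *                                    (struct sockaddr *)&dst_addr);
+ *    if (ndev) {
+ *        /* ... match the CM request against ndev ... */
+ *        dev_put(ndev);
+ *    }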
+ */
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ const union ib_gid *gid,
+ const struct sockaddr *addr)
+{
+ struct net_device *net_dev = NULL;
+ struct ib_client_data *context;
+
+ if (!rdma_protocol_ib(dev, port))
+ return NULL;
+
+ down_read(&lists_rwsem);
+
+ list_for_each_entry(context, &dev->client_data_list, list) {
+ struct ib_client *client = context->client;
+
+ if (context->going_down)
+ continue;
+
+ if (client->get_net_dev_by_params) {
+ net_dev = client->get_net_dev_by_params(dev, port, pkey,
+ gid, addr,
+ context->data);
+ if (net_dev)
+ break;
+ }
+ }
+
+ up_read(&lists_rwsem);
+
+ return net_dev;
+}
+EXPORT_SYMBOL(ib_get_net_dev_by_params);
+
+static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
+ [RDMA_NL_LS_OP_RESOLVE] = {
+ .doit = ib_nl_handle_resolve_resp,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NL_LS_OP_SET_TIMEOUT] = {
+ .doit = ib_nl_handle_set_timeout,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NL_LS_OP_IP_RESOLVE] = {
+ .doit = ib_nl_handle_ip_res_resp,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+};
+
+static int __init ib_core_init(void)
+{
+ int ret;
+
+ ib_wq = alloc_workqueue("infiniband", 0, 0);
+ if (!ib_wq)
+ return -ENOMEM;
+
+ ib_comp_wq = alloc_workqueue("ib-comp-wq",
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ if (!ib_comp_wq) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = class_register(&ib_class);
+ if (ret) {
+ pr_warn("Couldn't create InfiniBand device class\n");
+ goto err_comp;
+ }
+
+ ret = rdma_nl_init();
+ if (ret) {
+ pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
+ goto err_sysfs;
+ }
+
+ ret = addr_init();
+ if (ret) {
+ pr_warn("Couldn't init IB address resolution\n");
+ goto err_ibnl;
+ }
+
+ ret = ib_mad_init();
+ if (ret) {
+ pr_warn("Couldn't init IB MAD\n");
+ goto err_addr;
+ }
+
+ ret = ib_sa_init();
+ if (ret) {
+ pr_warn("Couldn't init SA\n");
+ goto err_mad;
+ }
+
+ ret = register_lsm_notifier(&ibdev_lsm_nb);
+ if (ret) {
+ pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
+ goto err_sa;
+ }
+
+ nldev_init();
+ rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
+ ib_cache_setup();
+
+ return 0;
+
+err_sa:
+ ib_sa_cleanup();
+err_mad:
+ ib_mad_cleanup();
+err_addr:
+ addr_cleanup();
+err_ibnl:
+ rdma_nl_exit();
+err_sysfs:
+ class_unregister(&ib_class);
+err_comp:
+ destroy_workqueue(ib_comp_wq);
+err:
+ destroy_workqueue(ib_wq);
+ return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+ ib_cache_cleanup();
+ nldev_exit();
+ rdma_nl_unregister(RDMA_NL_LS);
+ unregister_lsm_notifier(&ibdev_lsm_nb);
+ ib_sa_cleanup();
+ ib_mad_cleanup();
+ addr_cleanup();
+ rdma_nl_exit();
+ class_unregister(&ib_class);
+ destroy_workqueue(ib_comp_wq);
+ /* Make sure that any pending umem accounting work is done. */
+ destroy_workqueue(ib_wq);
+}
+
+MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
+
+subsys_initcall(ib_core_init);
+module_exit(ib_core_cleanup);
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
new file mode 100644
index 0000000..a0a9ed7
--- /dev/null
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "core_priv.h" + +#define PFX "fmr_pool: " + +enum { + IB_FMR_MAX_REMAPS = 32, + + IB_FMR_HASH_BITS = 8, + IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS, + IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1 +}; + +/* + * If an FMR is not in use, then the list member will point to either + * its pool's free_list (if the FMR can be mapped again; that is, + * remap_count < pool->max_remaps) or its pool's dirty_list (if the + * FMR needs to be unmapped before being remapped). In either of + * these cases it is a bug if the ref_count is not 0. In other words, + * if ref_count is > 0, then the list member must not be linked into + * either free_list or dirty_list. + * + * The cache_node member is used to link the FMR into a cache bucket + * (if caching is enabled). This is independent of the reference + * count of the FMR. When a valid FMR is released, its ref_count is + * decremented, and if ref_count reaches 0, the FMR is placed in + * either free_list or dirty_list as appropriate. However, it is not + * removed from the cache and may be "revived" if a call to + * ib_fmr_register_physical() occurs before the FMR is remapped. In + * this case we just increment the ref_count and remove the FMR from + * free_list/dirty_list. + * + * Before we remap an FMR from free_list, we remove it from the cache + * (to prevent another user from obtaining a stale FMR). When an FMR + * is released, we add it to the tail of the free list, so that our + * cache eviction policy is "least recently used." + * + * All manipulation of ref_count, list and cache_node is protected by + * pool_lock to maintain consistency. 
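+ *
+ * Informally, the invariant can be read as (illustration only):
+ *
+ *    if (fmr->ref_count == 0)
+ *        /* fmr sits on free_list or dirty_list, possibly still cached */
+ *    else
+ *        /* fmr is on neither list; only cache_node may still link it */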
+ */ + +struct ib_fmr_pool { + spinlock_t pool_lock; + + int pool_size; + int max_pages; + int max_remaps; + int dirty_watermark; + int dirty_len; + struct list_head free_list; + struct list_head dirty_list; + struct hlist_head *cache_bucket; + + void (*flush_function)(struct ib_fmr_pool *pool, + void * arg); + void *flush_arg; + + struct kthread_worker *worker; + struct kthread_work work; + + atomic_t req_ser; + atomic_t flush_ser; + + wait_queue_head_t force_wait; +}; + +static inline u32 ib_fmr_hash(u64 first_page) +{ + return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) & + (IB_FMR_HASH_SIZE - 1); +} + +/* Caller must hold pool_lock */ +static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool, + u64 *page_list, + int page_list_len, + u64 io_virtual_address) +{ + struct hlist_head *bucket; + struct ib_pool_fmr *fmr; + + if (!pool->cache_bucket) + return NULL; + + bucket = pool->cache_bucket + ib_fmr_hash(*page_list); + + hlist_for_each_entry(fmr, bucket, cache_node) + if (io_virtual_address == fmr->io_virtual_address && + page_list_len == fmr->page_list_len && + !memcmp(page_list, fmr->page_list, + page_list_len * sizeof *page_list)) + return fmr; + + return NULL; +} + +static void ib_fmr_batch_release(struct ib_fmr_pool *pool) +{ + int ret; + struct ib_pool_fmr *fmr; + LIST_HEAD(unmap_list); + LIST_HEAD(fmr_list); + + spin_lock_irq(&pool->pool_lock); + + list_for_each_entry(fmr, &pool->dirty_list, list) { + hlist_del_init(&fmr->cache_node); + fmr->remap_count = 0; + list_add_tail(&fmr->fmr->list, &fmr_list); + +#ifdef DEBUG + if (fmr->ref_count !=0) { + pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n", + fmr, fmr->ref_count); + } +#endif + } + + list_splice_init(&pool->dirty_list, &unmap_list); + pool->dirty_len = 0; + + spin_unlock_irq(&pool->pool_lock); + + if (list_empty(&unmap_list)) { + return; + } + + ret = ib_unmap_fmr(&fmr_list); + if (ret) + pr_warn(PFX "ib_unmap_fmr returned %d\n", ret); + + spin_lock_irq(&pool->pool_lock); + list_splice(&unmap_list, &pool->free_list); + spin_unlock_irq(&pool->pool_lock); +} + +static void ib_fmr_cleanup_func(struct kthread_work *work) +{ + struct ib_fmr_pool *pool = container_of(work, struct ib_fmr_pool, work); + + ib_fmr_batch_release(pool); + atomic_inc(&pool->flush_ser); + wake_up_interruptible(&pool->force_wait); + + if (pool->flush_function) + pool->flush_function(pool, pool->flush_arg); + + if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) + kthread_queue_work(pool->worker, &pool->work); +} + +/** + * ib_create_fmr_pool - Create an FMR pool + * @pd:Protection domain for FMRs + * @params:FMR pool parameters + * + * Create a pool of FMRs. Return value is pointer to new pool or + * error code if creation failed. 
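+ *
+ * A minimal creation sketch (illustrative values; "pd" is an existing
+ * protection domain allocated by the caller):
+ *
+ *    struct ib_fmr_pool_param params = {
+ *        .max_pages_per_fmr = 64,
+ *        .page_shift        = PAGE_SHIFT,
+ *        .access            = IB_ACCESS_LOCAL_WRITE |
+ *                             IB_ACCESS_REMOTE_WRITE |
+ *                             IB_ACCESS_REMOTE_READ,
+ *        .pool_size         = 1024,
+ *        .dirty_watermark   = 32,
+ *        .cache             = 1,
+ *    };
+ *    struct ib_fmr_pool *pool = ib_create_fmr_pool(pd, &params);
+ *
+ *    if (IS_ERR(pool))
+ *        return PTR_ERR(pool);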
+ */ +struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, + struct ib_fmr_pool_param *params) +{ + struct ib_device *device; + struct ib_fmr_pool *pool; + int i; + int ret; + int max_remaps; + + if (!params) + return ERR_PTR(-EINVAL); + + device = pd->device; + if (!device->alloc_fmr || !device->dealloc_fmr || + !device->map_phys_fmr || !device->unmap_fmr) { + pr_info(PFX "Device %s does not support FMRs\n", device->name); + return ERR_PTR(-ENOSYS); + } + + if (!device->attrs.max_map_per_fmr) + max_remaps = IB_FMR_MAX_REMAPS; + else + max_remaps = device->attrs.max_map_per_fmr; + + pool = kmalloc(sizeof *pool, GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->cache_bucket = NULL; + pool->flush_function = params->flush_function; + pool->flush_arg = params->flush_arg; + + INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->dirty_list); + + if (params->cache) { + pool->cache_bucket = + kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket, + GFP_KERNEL); + if (!pool->cache_bucket) { + ret = -ENOMEM; + goto out_free_pool; + } + + for (i = 0; i < IB_FMR_HASH_SIZE; ++i) + INIT_HLIST_HEAD(pool->cache_bucket + i); + } + + pool->pool_size = 0; + pool->max_pages = params->max_pages_per_fmr; + pool->max_remaps = max_remaps; + pool->dirty_watermark = params->dirty_watermark; + pool->dirty_len = 0; + spin_lock_init(&pool->pool_lock); + atomic_set(&pool->req_ser, 0); + atomic_set(&pool->flush_ser, 0); + init_waitqueue_head(&pool->force_wait); + + pool->worker = kthread_create_worker(0, "ib_fmr(%s)", device->name); + if (IS_ERR(pool->worker)) { + pr_warn(PFX "couldn't start cleanup kthread worker\n"); + ret = PTR_ERR(pool->worker); + goto out_free_pool; + } + kthread_init_work(&pool->work, ib_fmr_cleanup_func); + + { + struct ib_pool_fmr *fmr; + struct ib_fmr_attr fmr_attr = { + .max_pages = params->max_pages_per_fmr, + .max_maps = pool->max_remaps, + .page_shift = params->page_shift + }; + int bytes_per_fmr = sizeof *fmr; + + if (pool->cache_bucket) + bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64); + + for (i = 0; i < params->pool_size; ++i) { + fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); + if (!fmr) + goto out_fail; + + fmr->pool = pool; + fmr->remap_count = 0; + fmr->ref_count = 0; + INIT_HLIST_NODE(&fmr->cache_node); + + fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); + if (IS_ERR(fmr->fmr)) { + pr_warn(PFX "fmr_create failed for FMR %d\n", + i); + kfree(fmr); + goto out_fail; + } + + list_add_tail(&fmr->list, &pool->free_list); + ++pool->pool_size; + } + } + + return pool; + + out_free_pool: + kfree(pool->cache_bucket); + kfree(pool); + + return ERR_PTR(ret); + + out_fail: + ib_destroy_fmr_pool(pool); + + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(ib_create_fmr_pool); + +/** + * ib_destroy_fmr_pool - Free FMR pool + * @pool:FMR pool to free + * + * Destroy an FMR pool and free all associated resources. 
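+ *
+ * For context, a consumer typically maps and releases FMRs from the pool
+ * and destroys the pool only when no mappings are outstanding (sketch
+ * with made-up variables):
+ *
+ *    struct ib_pool_fmr *fmr;
+ *
+ *    fmr = ib_fmr_pool_map_phys(pool, page_list, npages, iova);
+ *    if (!IS_ERR(fmr)) {
+ *        /* ... post work requests using fmr->fmr->lkey/rkey ... */
+ *        ib_fmr_pool_unmap(fmr);
+ *    }
+ *    ...
+ *    ib_flush_fmr_pool(pool);    /* optionally force unmapping first */
+ *    ib_destroy_fmr_pool(pool);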
+ */ +void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) +{ + struct ib_pool_fmr *fmr; + struct ib_pool_fmr *tmp; + LIST_HEAD(fmr_list); + int i; + + kthread_destroy_worker(pool->worker); + ib_fmr_batch_release(pool); + + i = 0; + list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) { + if (fmr->remap_count) { + INIT_LIST_HEAD(&fmr_list); + list_add_tail(&fmr->fmr->list, &fmr_list); + ib_unmap_fmr(&fmr_list); + } + ib_dealloc_fmr(fmr->fmr); + list_del(&fmr->list); + kfree(fmr); + ++i; + } + + if (i < pool->pool_size) + pr_warn(PFX "pool still has %d regions registered\n", + pool->pool_size - i); + + kfree(pool->cache_bucket); + kfree(pool); +} +EXPORT_SYMBOL(ib_destroy_fmr_pool); + +/** + * ib_flush_fmr_pool - Invalidate all unmapped FMRs + * @pool:FMR pool to flush + * + * Ensure that all unmapped FMRs are fully invalidated. + */ +int ib_flush_fmr_pool(struct ib_fmr_pool *pool) +{ + int serial; + struct ib_pool_fmr *fmr, *next; + + /* + * The free_list holds FMRs that may have been used + * but have not been remapped enough times to be dirty. + * Put them on the dirty list now so that the cleanup + * thread will reap them too. + */ + spin_lock_irq(&pool->pool_lock); + list_for_each_entry_safe(fmr, next, &pool->free_list, list) { + if (fmr->remap_count > 0) + list_move(&fmr->list, &pool->dirty_list); + } + spin_unlock_irq(&pool->pool_lock); + + serial = atomic_inc_return(&pool->req_ser); + kthread_queue_work(pool->worker, &pool->work); + + if (wait_event_interruptible(pool->force_wait, + atomic_read(&pool->flush_ser) - serial >= 0)) + return -EINTR; + + return 0; +} +EXPORT_SYMBOL(ib_flush_fmr_pool); + +/** + * ib_fmr_pool_map_phys - Map an FMR from an FMR pool. + * @pool_handle: FMR pool to allocate FMR from + * @page_list: List of pages to map + * @list_len: Number of pages in @page_list + * @io_virtual_address: I/O virtual address for new FMR + */ +struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, + u64 *page_list, + int list_len, + u64 io_virtual_address) +{ + struct ib_fmr_pool *pool = pool_handle; + struct ib_pool_fmr *fmr; + unsigned long flags; + int result; + + if (list_len < 1 || list_len > pool->max_pages) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&pool->pool_lock, flags); + fmr = ib_fmr_cache_lookup(pool, + page_list, + list_len, + io_virtual_address); + if (fmr) { + /* found in cache */ + ++fmr->ref_count; + if (fmr->ref_count == 1) { + list_del(&fmr->list); + } + + spin_unlock_irqrestore(&pool->pool_lock, flags); + + return fmr; + } + + if (list_empty(&pool->free_list)) { + spin_unlock_irqrestore(&pool->pool_lock, flags); + return ERR_PTR(-EAGAIN); + } + + fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list); + list_del(&fmr->list); + hlist_del_init(&fmr->cache_node); + spin_unlock_irqrestore(&pool->pool_lock, flags); + + result = ib_map_phys_fmr(fmr->fmr, page_list, list_len, + io_virtual_address); + + if (result) { + spin_lock_irqsave(&pool->pool_lock, flags); + list_add(&fmr->list, &pool->free_list); + spin_unlock_irqrestore(&pool->pool_lock, flags); + + pr_warn(PFX "fmr_map returns %d\n", result); + + return ERR_PTR(result); + } + + ++fmr->remap_count; + fmr->ref_count = 1; + + if (pool->cache_bucket) { + fmr->io_virtual_address = io_virtual_address; + fmr->page_list_len = list_len; + memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list)); + + spin_lock_irqsave(&pool->pool_lock, flags); + hlist_add_head(&fmr->cache_node, + pool->cache_bucket + ib_fmr_hash(fmr->page_list[0])); + 
spin_unlock_irqrestore(&pool->pool_lock, flags); + } + + return fmr; +} +EXPORT_SYMBOL(ib_fmr_pool_map_phys); + +/** + * ib_fmr_pool_unmap - Unmap FMR + * @fmr:FMR to unmap + * + * Unmap an FMR. The FMR mapping may remain valid until the FMR is + * reused (or until ib_flush_fmr_pool() is called). + */ +int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) +{ + struct ib_fmr_pool *pool; + unsigned long flags; + + pool = fmr->pool; + + spin_lock_irqsave(&pool->pool_lock, flags); + + --fmr->ref_count; + if (!fmr->ref_count) { + if (fmr->remap_count < pool->max_remaps) { + list_add_tail(&fmr->list, &pool->free_list); + } else { + list_add_tail(&fmr->list, &pool->dirty_list); + if (++pool->dirty_len >= pool->dirty_watermark) { + atomic_inc(&pool->req_ser); + kthread_queue_work(pool->worker, &pool->work); + } + } + } + +#ifdef DEBUG + if (fmr->ref_count < 0) + pr_warn(PFX "FMR %p has ref count %d < 0\n", + fmr, fmr->ref_count); +#endif + + spin_unlock_irqrestore(&pool->pool_lock, flags); + + return 0; +} +EXPORT_SYMBOL(ib_fmr_pool_unmap); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c new file mode 100644 index 0000000..5d676cf --- /dev/null +++ b/drivers/infiniband/core/iwcm.c @@ -0,0 +1,1203 @@ +/* + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "iwcm.h"
+
+MODULE_AUTHOR("Tom Tucker");
+MODULE_DESCRIPTION("iWARP CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static const char * const iwcm_rej_reason_strs[] = {
+ [ECONNRESET] = "reset by remote host",
+ [ECONNREFUSED] = "refused by remote application",
+ [ETIMEDOUT] = "setup timeout",
+};
+
+const char *__attribute_const__ iwcm_reject_msg(int reason)
+{
+ size_t index;
+
+ /* iWARP uses negative errnos */
+ index = -reason;
+
+ if (index < ARRAY_SIZE(iwcm_rej_reason_strs) &&
+ iwcm_rej_reason_strs[index])
+ return iwcm_rej_reason_strs[index];
+ else
+ return "unrecognized reason";
+}
+EXPORT_SYMBOL(iwcm_reject_msg);
+
+static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = {
+ [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
+ [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
+ [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb},
+ [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
+ [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
+ [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
+ [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+};
+
+static struct workqueue_struct *iwcm_wq;
+struct iwcm_work {
+ struct work_struct work;
+ struct iwcm_id_private *cm_id;
+ struct list_head list;
+ struct iw_cm_event event;
+ struct list_head free_list;
+};
+
+static unsigned int default_backlog = 256;
+
+static struct ctl_table_header *iwcm_ctl_table_hdr;
+static struct ctl_table iwcm_ctl_table[] = {
+ {
+ .procname = "default_backlog",
+ .data = &default_backlog,
+ .maxlen = sizeof(default_backlog),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+/*
+ * The following services provide a mechanism for pre-allocating iwcm_work
+ * elements. The design pre-allocates them based on the cm_id type:
+ * LISTENING IDS: Get enough elements preallocated to handle the
+ * listen backlog.
+ * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
+ * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE
+ *
+ * Allocating them in connect and listen avoids having to deal
+ * with allocation failures on the event upcall from the provider (which
+ * is called in the interrupt context).
+ *
+ * One exception is when creating the cm_id for incoming connection requests.
+ * There are two cases:
+ * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If
+ * the backlog is exceeded, then no more connection request events will
+ * be processed. cm_event_handler() returns -ENOMEM in this case. It's up
+ * to the provider to reject the connection request.
+ * 2) in the connection request workqueue handler, cm_conn_req_handler().
+ * If work elements cannot be allocated for the new connect request cm_id,
+ * then IWCM will call the provider reject method. This is ok since
+ * cm_conn_req_handler() runs in the workqueue thread context.
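+ *
+ * Concretely (see alloc_work_entries() below):
+ *
+ *    iw_cm_listen():        alloc_work_entries(cm_id_priv, backlog);
+ *    iw_cm_connect():       alloc_work_entries(cm_id_priv, 4);
+ *    cm_conn_req_handler(): alloc_work_entries(cm_id_priv, 3);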
+ */ + +static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) +{ + struct iwcm_work *work; + + if (list_empty(&cm_id_priv->work_free_list)) + return NULL; + work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work, + free_list); + list_del_init(&work->free_list); + return work; +} + +static void put_work(struct iwcm_work *work) +{ + list_add(&work->free_list, &work->cm_id->work_free_list); +} + +static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv) +{ + struct list_head *e, *tmp; + + list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) + kfree(list_entry(e, struct iwcm_work, free_list)); +} + +static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) +{ + struct iwcm_work *work; + + BUG_ON(!list_empty(&cm_id_priv->work_free_list)); + while (count--) { + work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL); + if (!work) { + dealloc_work_entries(cm_id_priv); + return -ENOMEM; + } + work->cm_id = cm_id_priv; + INIT_LIST_HEAD(&work->list); + put_work(work); + } + return 0; +} + +/* + * Save private data from incoming connection requests to + * iw_cm_event, so the low level driver doesn't have to. Adjust + * the event ptr to point to the local copy. + */ +static int copy_private_data(struct iw_cm_event *event) +{ + void *p; + + p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC); + if (!p) + return -ENOMEM; + event->private_data = p; + return 0; +} + +static void free_cm_id(struct iwcm_id_private *cm_id_priv) +{ + dealloc_work_entries(cm_id_priv); + kfree(cm_id_priv); +} + +/* + * Release a reference on cm_id. If the last reference is being + * released, free the cm_id and return 1. + */ +static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) +{ + BUG_ON(atomic_read(&cm_id_priv->refcount)==0); + if (atomic_dec_and_test(&cm_id_priv->refcount)) { + BUG_ON(!list_empty(&cm_id_priv->work_list)); + free_cm_id(cm_id_priv); + return 1; + } + + return 0; +} + +static void add_ref(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + atomic_inc(&cm_id_priv->refcount); +} + +static void rem_ref(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + (void)iwcm_deref_id(cm_id_priv); +} + +static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); + +struct iw_cm_id *iw_create_cm_id(struct ib_device *device, + iw_cm_handler cm_handler, + void *context) +{ + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL); + if (!cm_id_priv) + return ERR_PTR(-ENOMEM); + + cm_id_priv->state = IW_CM_STATE_IDLE; + cm_id_priv->id.device = device; + cm_id_priv->id.cm_handler = cm_handler; + cm_id_priv->id.context = context; + cm_id_priv->id.event_handler = cm_event_handler; + cm_id_priv->id.add_ref = add_ref; + cm_id_priv->id.rem_ref = rem_ref; + spin_lock_init(&cm_id_priv->lock); + atomic_set(&cm_id_priv->refcount, 1); + init_waitqueue_head(&cm_id_priv->connect_wait); + init_completion(&cm_id_priv->destroy_comp); + INIT_LIST_HEAD(&cm_id_priv->work_list); + INIT_LIST_HEAD(&cm_id_priv->work_free_list); + + return &cm_id_priv->id; +} +EXPORT_SYMBOL(iw_create_cm_id); + + +static int iwcm_modify_qp_err(struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + + if (!qp) + return -EINVAL; + + qp_attr.qp_state = IB_QPS_ERR; + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); +} + +/* + * This is really the RDMAC CLOSING state. 
It is most similar to the + * IB SQD QP state. + */ +static int iwcm_modify_qp_sqd(struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + + BUG_ON(qp == NULL); + qp_attr.qp_state = IB_QPS_SQD; + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); +} + +/* + * CM_ID <-- CLOSING + * + * Block if a passive or active connection is currently being processed. Then + * process the event as follows: + * - If we are ESTABLISHED, move to CLOSING and modify the QP state + * based on the abrupt flag + * - If the connection is already in the CLOSING or IDLE state, the peer is + * disconnecting concurrently with us and we've already seen the + * DISCONNECT event -- ignore the request and return 0 + * - Disconnect on a listening endpoint returns -EINVAL + */ +int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret = 0; + struct ib_qp *qp = NULL; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + /* Wait if we're currently in a connect or accept downcall */ + wait_event(cm_id_priv->connect_wait, + !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_ESTABLISHED: + cm_id_priv->state = IW_CM_STATE_CLOSING; + + /* QP could be for user-mode client */ + if (cm_id_priv->qp) + qp = cm_id_priv->qp; + else + ret = -EINVAL; + break; + case IW_CM_STATE_LISTEN: + ret = -EINVAL; + break; + case IW_CM_STATE_CLOSING: + /* remote peer closed first */ + case IW_CM_STATE_IDLE: + /* accept or connect returned !0 */ + break; + case IW_CM_STATE_CONN_RECV: + /* + * App called disconnect before/without calling accept after + * connect_request event delivered. + */ + break; + case IW_CM_STATE_CONN_SENT: + /* Can only get here if wait above fails */ + default: + BUG(); + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (qp) { + if (abrupt) + ret = iwcm_modify_qp_err(qp); + else + ret = iwcm_modify_qp_sqd(qp); + + /* + * If both sides are disconnecting the QP could + * already be in ERR or SQD states + */ + ret = 0; + } + + return ret; +} +EXPORT_SYMBOL(iw_cm_disconnect); + +/* + * CM_ID <-- DESTROYING + * + * Clean up all resources associated with the connection and release + * the initial reference taken by iw_create_cm_id. + */ +static void destroy_cm_id(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + /* + * Wait if we're currently in a connect or accept downcall. A + * listening endpoint should never block here. + */ + wait_event(cm_id_priv->connect_wait, + !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); + + /* + * Since we're deleting the cm_id, drop any events that + * might arrive before the last dereference. 
+ */ + set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_LISTEN: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + /* destroy the listening endpoint */ + cm_id->device->iwcm->destroy_listen(cm_id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + case IW_CM_STATE_ESTABLISHED: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + /* Abrupt close of the connection */ + (void)iwcm_modify_qp_err(cm_id_priv->qp); + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CLOSING: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + break; + case IW_CM_STATE_CONN_RECV: + /* + * App called destroy before/without calling accept after + * receiving connection request event notification or + * returned non zero from the event callback function. + * In either case, must tell the provider to reject. + */ + cm_id_priv->state = IW_CM_STATE_DESTROYING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_id->device->iwcm->reject(cm_id, NULL, 0); + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_DESTROYING: + default: + BUG(); + break; + } + if (cm_id_priv->qp) { + cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + cm_id_priv->qp = NULL; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (cm_id->mapped) { + iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr); + iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM); + } + + (void)iwcm_deref_id(cm_id_priv); +} + +/* + * This function is only called by the application thread and cannot + * be called by the event thread. The function will wait for all + * references to be released on the cm_id and then kfree the cm_id + * object. + */ +void iw_destroy_cm_id(struct iw_cm_id *cm_id) +{ + destroy_cm_id(cm_id); +} +EXPORT_SYMBOL(iw_destroy_cm_id); + +/** + * iw_cm_check_wildcard - If IP address is 0 then use original + * @pm_addr: sockaddr containing the ip to check for wildcard + * @cm_addr: sockaddr containing the actual IP address + * @cm_outaddr: sockaddr to set IP addr which leaving port + * + * Checks the pm_addr for wildcard and then sets cm_outaddr's + * IP to the actual (cm_addr). + */ +static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr, + struct sockaddr_storage *cm_addr, + struct sockaddr_storage *cm_outaddr) +{ + if (pm_addr->ss_family == AF_INET) { + struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr; + + if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) { + struct sockaddr_in *cm4_addr = + (struct sockaddr_in *)cm_addr; + struct sockaddr_in *cm4_outaddr = + (struct sockaddr_in *)cm_outaddr; + + cm4_outaddr->sin_addr = cm4_addr->sin_addr; + } + } else { + struct sockaddr_in6 *pm6_addr = (struct sockaddr_in6 *)pm_addr; + + if (ipv6_addr_type(&pm6_addr->sin6_addr) == IPV6_ADDR_ANY) { + struct sockaddr_in6 *cm6_addr = + (struct sockaddr_in6 *)cm_addr; + struct sockaddr_in6 *cm6_outaddr = + (struct sockaddr_in6 *)cm_outaddr; + + cm6_outaddr->sin6_addr = cm6_addr->sin6_addr; + } + } +} + +/** + * iw_cm_map - Use portmapper to map the ports + * @cm_id: connection manager pointer + * @active: Indicates the active side when true + * returns nonzero for error only if iwpm_create_mapinfo() fails + * + * Tries to add a mapping for a port using the Portmapper. 
If + * successful in mapping the IP/Port it will check the remote + * mapped IP address for a wildcard IP address and replace the + * zero IP address with the remote_addr. + */ +static int iw_cm_map(struct iw_cm_id *cm_id, bool active) +{ + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int status; + + cm_id->m_local_addr = cm_id->local_addr; + cm_id->m_remote_addr = cm_id->remote_addr; + + memcpy(pm_reg_msg.dev_name, cm_id->device->name, + sizeof(pm_reg_msg.dev_name)); + memcpy(pm_reg_msg.if_name, cm_id->device->iwcm->ifname, + sizeof(pm_reg_msg.if_name)); + + if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) || + !iwpm_valid_pid()) + return 0; + + cm_id->mapped = true; + pm_msg.loc_addr = cm_id->local_addr; + pm_msg.rem_addr = cm_id->remote_addr; + if (active) + status = iwpm_add_and_query_mapping(&pm_msg, + RDMA_NL_IWCM); + else + status = iwpm_add_mapping(&pm_msg, RDMA_NL_IWCM); + + if (!status) { + cm_id->m_local_addr = pm_msg.mapped_loc_addr; + if (active) { + cm_id->m_remote_addr = pm_msg.mapped_rem_addr; + iw_cm_check_wildcard(&pm_msg.mapped_rem_addr, + &cm_id->remote_addr, + &cm_id->m_remote_addr); + } + } + + return iwpm_create_mapinfo(&cm_id->local_addr, + &cm_id->m_local_addr, + RDMA_NL_IWCM); +} + +/* + * CM_ID <-- LISTEN + * + * Start listening for connect requests. Generates one CONNECT_REQUEST + * event for each inbound connect request. + */ +int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + if (!backlog) + backlog = default_backlog; + + ret = alloc_work_entries(cm_id_priv, backlog); + if (ret) + return ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + cm_id_priv->state = IW_CM_STATE_LISTEN; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + ret = iw_cm_map(cm_id, false); + if (!ret) + ret = cm_id->device->iwcm->create_listen(cm_id, backlog); + if (ret) + cm_id_priv->state = IW_CM_STATE_IDLE; + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + default: + ret = -EINVAL; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + return ret; +} +EXPORT_SYMBOL(iw_cm_listen); + +/* + * CM_ID <-- IDLE + * + * Rejects an inbound connection request. No events are generated. + */ +int iw_cm_reject(struct iw_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + cm_id_priv->state = IW_CM_STATE_IDLE; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + ret = cm_id->device->iwcm->reject(cm_id, private_data, + private_data_len); + + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + + return ret; +} +EXPORT_SYMBOL(iw_cm_reject); + +/* + * CM_ID <-- ESTABLISHED + * + * Accepts an inbound connection request and generates an ESTABLISHED + * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block + * until the ESTABLISHED event is received from the provider. 
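+ *
+ * Sketch of an accept issued from a CONNECT_REQUEST handler (hypothetical
+ * values; "qp" is the QP the application created for this connection and
+ * "resp" is its private response data):
+ *
+ *    struct iw_cm_conn_param param = {
+ *        .private_data     = resp,
+ *        .private_data_len = sizeof(*resp),
+ *        .ord              = 16,
+ *        .ird              = 16,
+ *        .qpn              = qp->qp_num,
+ *    };
+ *
+ *    ret = iw_cm_accept(cm_id, &param);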
+ */ +int iw_cm_accept(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *iw_param) +{ + struct iwcm_id_private *cm_id_priv; + struct ib_qp *qp; + unsigned long flags; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + /* Get the ib_qp given the QPN */ + qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); + if (!qp) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + cm_id->device->iwcm->add_ref(qp); + cm_id_priv->qp = qp; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + ret = cm_id->device->iwcm->accept(cm_id, iw_param); + if (ret) { + /* An error on accept precludes provider events */ + BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); + cm_id_priv->state = IW_CM_STATE_IDLE; + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->qp) { + cm_id->device->iwcm->rem_ref(qp); + cm_id_priv->qp = NULL; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + } + + return ret; +} +EXPORT_SYMBOL(iw_cm_accept); + +/* + * Active Side: CM_ID <-- CONN_SENT + * + * If successful, results in the generation of a CONNECT_REPLY + * event. iw_cm_disconnect and iw_cm_destroy will block until the + * CONNECT_REPLY event is received from the provider. + */ +int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + unsigned long flags; + struct ib_qp *qp; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + ret = alloc_work_entries(cm_id_priv, 4); + if (ret) + return ret; + + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + spin_lock_irqsave(&cm_id_priv->lock, flags); + + if (cm_id_priv->state != IW_CM_STATE_IDLE) { + ret = -EINVAL; + goto err; + } + + /* Get the ib_qp given the QPN */ + qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); + if (!qp) { + ret = -EINVAL; + goto err; + } + cm_id->device->iwcm->add_ref(qp); + cm_id_priv->qp = qp; + cm_id_priv->state = IW_CM_STATE_CONN_SENT; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + ret = iw_cm_map(cm_id, true); + if (!ret) + ret = cm_id->device->iwcm->connect(cm_id, iw_param); + if (!ret) + return 0; /* success */ + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->qp) { + cm_id->device->iwcm->rem_ref(qp); + cm_id_priv->qp = NULL; + } + cm_id_priv->state = IW_CM_STATE_IDLE; +err: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return ret; +} +EXPORT_SYMBOL(iw_cm_connect); + +/* + * Passive Side: new CM_ID <-- CONN_RECV + * + * Handles an inbound connect request. The function creates a new + * iw_cm_id to represent the new connection and inherits the client + * callback function and other attributes from the listening parent. + * + * The work item contains a pointer to the listen_cm_id and the event. The + * listen_cm_id contains the client cm_handler, context and + * device. These are copied when the device is cloned. 
The event
+ * contains the new four tuple.
+ *
+ * An error on the child should not affect the parent, so this
+ * function does not return a value.
+ */
+static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ struct iw_cm_id *cm_id;
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ /*
+ * The provider should never generate a connection request
+ * event with a bad status.
+ */
+ BUG_ON(iw_event->status);
+
+ cm_id = iw_create_cm_id(listen_id_priv->id.device,
+ listen_id_priv->id.cm_handler,
+ listen_id_priv->id.context);
+ /* If the cm_id could not be created, ignore the request */
+ if (IS_ERR(cm_id))
+ goto out;
+
+ cm_id->provider_data = iw_event->provider_data;
+ cm_id->m_local_addr = iw_event->local_addr;
+ cm_id->m_remote_addr = iw_event->remote_addr;
+ cm_id->local_addr = listen_id_priv->id.local_addr;
+
+ ret = iwpm_get_remote_info(&listen_id_priv->id.m_local_addr,
+ &iw_event->remote_addr,
+ &cm_id->remote_addr,
+ RDMA_NL_IWCM);
+ if (ret) {
+ cm_id->remote_addr = iw_event->remote_addr;
+ } else {
+ iw_cm_check_wildcard(&listen_id_priv->id.m_local_addr,
+ &iw_event->local_addr,
+ &cm_id->local_addr);
+ iw_event->local_addr = cm_id->local_addr;
+ iw_event->remote_addr = cm_id->remote_addr;
+ }
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+
+ /*
+ * We could be destroying the listening id. If so, ignore this
+ * upcall.
+ */
+ spin_lock_irqsave(&listen_id_priv->lock, flags);
+ if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+
+ ret = alloc_work_entries(cm_id_priv, 3);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+
+ /* Call the client CM handler */
+ ret = cm_id->cm_handler(cm_id, iw_event);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ }
+
+out:
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+}
+
+/*
+ * Passive Side: CM_ID <-- ESTABLISHED
+ *
+ * The provider generated an ESTABLISHED event which means that
+ * the MPA negotiation has completed successfully and we are now in MPA
+ * FPDU mode.
+ *
+ * This event can only be received in the CONN_RECV state. If the
+ * remote peer closed, the ESTABLISHED event would be received followed
+ * by the CLOSE event. If the app closes, it will block until we wake
+ * it up after processing this event.
+ */
+static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ /*
+ * We clear the CONNECT_WAIT bit here to allow the callback
+ * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
+ * from a callback handler is not allowed.
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * Active Side: CM_ID <-- ESTABLISHED
+ *
+ * The app has called connect and is waiting for the established event to
+ * post its requests to the server. This event will wake up anyone
+ * blocked in iw_cm_disconnect or iw_destroy_id.
+ */
+static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ /*
+ * Clear the connect wait bit so a callback function calling
+ * iw_cm_disconnect will not wait and deadlock this thread
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+ if (iw_event->status == 0) {
+ cm_id_priv->id.m_local_addr = iw_event->local_addr;
+ cm_id_priv->id.m_remote_addr = iw_event->remote_addr;
+ iw_event->local_addr = cm_id_priv->id.local_addr;
+ iw_event->remote_addr = cm_id_priv->id.remote_addr;
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ } else {
+ /* REJECTED or RESET */
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+
+ /* Wake up waiters on connect complete */
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * If in the ESTABLISHED state, move to CLOSING.
+ */
+static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED)
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * If in the ESTABLISHED or CLOSING states, the QP will have been
+ * moved by the provider to the ERR state. Disassociate the CM_ID from
+ * the QP, move to IDLE, and remove the 'connected' reference.
+ *
+ * If in some other state, the cm_id was destroyed asynchronously.
+ * This is the last reference that will result in waking up
+ * the app thread blocked in iw_destroy_cm_id.
+ */
+static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret = 0;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_DESTROYING:
+ break;
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+
+static int process_event(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ int ret = 0;
+
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CONNECT_REQUEST:
+ cm_conn_req_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ ret = cm_conn_rep_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ ret = cm_conn_est_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_DISCONNECT:
+ cm_disconnect_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CLOSE:
+ ret = cm_close_handler(cm_id_priv, iw_event);
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
+
+/*
+ * Process events on the work_list for the cm_id.
If the callback + * function requests that the cm_id be deleted, a flag is set in the + * cm_id flags to indicate that when the last reference is + * removed, the cm_id is to be destroyed. This is necessary to + * distinguish between an object that will be destroyed by the app + * thread asleep on the destroy_comp list vs. an object destroyed + * here synchronously when the last reference is removed. + */ +static void cm_work_handler(struct work_struct *_work) +{ + struct iwcm_work *work = container_of(_work, struct iwcm_work, work); + struct iw_cm_event levent; + struct iwcm_id_private *cm_id_priv = work->cm_id; + unsigned long flags; + int empty; + int ret = 0; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + empty = list_empty(&cm_id_priv->work_list); + while (!empty) { + work = list_entry(cm_id_priv->work_list.next, + struct iwcm_work, list); + list_del_init(&work->list); + empty = list_empty(&cm_id_priv->work_list); + levent = work->event; + put_work(work); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { + ret = process_event(cm_id_priv, &levent); + if (ret) + destroy_cm_id(&cm_id_priv->id); + } else + pr_debug("dropping event %d\n", levent.event); + if (iwcm_deref_id(cm_id_priv)) + return; + if (empty) + return; + spin_lock_irqsave(&cm_id_priv->lock, flags); + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); +} + +/* + * This function is called on interrupt context. Schedule events on + * the iwcm_wq thread to allow callback functions to downcall into + * the CM and/or block. Events are queued to a per-CM_ID + * work_list. If this is the first event on the work_list, the work + * element is also queued on the iwcm_wq thread. + * + * Each event holds a reference on the cm_id. Until the last posted + * event has been delivered and processed, the cm_id cannot be + * deleted. + * + * Returns: + * 0 - the event was handled. + * -ENOMEM - the event was not handled due to lack of resources. 
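+ *
+ * For illustration, a provider delivers events through the cm_id's
+ * event_handler pointer, which is wired to this function by
+ * iw_create_cm_id() (sketch, not taken from any in-tree driver):
+ *
+ *    struct iw_cm_event event = {
+ *        .event  = IW_CM_EVENT_ESTABLISHED,
+ *        .status = 0,
+ *    };
+ *
+ *    cm_id->event_handler(cm_id, &event);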
+ */ +static int cm_event_handler(struct iw_cm_id *cm_id, + struct iw_cm_event *iw_event) +{ + struct iwcm_work *work; + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret = 0; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + work = get_work(cm_id_priv); + if (!work) { + ret = -ENOMEM; + goto out; + } + + INIT_WORK(&work->work, cm_work_handler); + work->cm_id = cm_id_priv; + work->event = *iw_event; + + if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST || + work->event.event == IW_CM_EVENT_CONNECT_REPLY) && + work->event.private_data_len) { + ret = copy_private_data(&work->event); + if (ret) { + put_work(work); + goto out; + } + } + + atomic_inc(&cm_id_priv->refcount); + if (list_empty(&cm_id_priv->work_list)) { + list_add_tail(&work->list, &cm_id_priv->work_list); + queue_work(iwcm_wq, &work->work); + } else + list_add_tail(&work->list, &cm_id_priv->work_list); +out: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_CONN_RECV: + case IW_CM_STATE_ESTABLISHED: + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; + qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE| + IB_ACCESS_REMOTE_READ; + ret = 0; + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_CONN_RECV: + case IW_CM_STATE_ESTABLISHED: + *qp_attr_mask = 0; + ret = 0; + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + switch (qp_attr->qp_state) { + case IB_QPS_INIT: + case IB_QPS_RTR: + ret = iwcm_init_qp_init_attr(cm_id_priv, + qp_attr, qp_attr_mask); + break; + case IB_QPS_RTS: + ret = iwcm_init_qp_rts_attr(cm_id_priv, + qp_attr, qp_attr_mask); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} +EXPORT_SYMBOL(iw_cm_init_qp_attr); + +static int __init iw_cm_init(void) +{ + int ret; + + ret = iwpm_init(RDMA_NL_IWCM); + if (ret) + pr_err("iw_cm: couldn't init iwpm\n"); + else + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); + if (!iwcm_wq) + return -ENOMEM; + + iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", + iwcm_ctl_table); + if (!iwcm_ctl_table_hdr) { + pr_err("iw_cm: couldn't register sysctl paths\n"); + destroy_workqueue(iwcm_wq); + return -ENOMEM; + } + + return 0; +} + +static void __exit iw_cm_cleanup(void) +{ + unregister_net_sysctl_table(iwcm_ctl_table_hdr); + destroy_workqueue(iwcm_wq); + rdma_nl_unregister(RDMA_NL_IWCM); + iwpm_exit(RDMA_NL_IWCM); +} + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2); + +module_init(iw_cm_init); +module_exit(iw_cm_cleanup); diff --git 
a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h new file mode 100644 index 0000000..82c2cd1 --- /dev/null +++ b/drivers/infiniband/core/iwcm.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef IWCM_H +#define IWCM_H + +enum iw_cm_state { + IW_CM_STATE_IDLE, /* unbound, inactive */ + IW_CM_STATE_LISTEN, /* listen waiting for connect */ + IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */ + IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */ + IW_CM_STATE_ESTABLISHED, /* established */ + IW_CM_STATE_CLOSING, /* disconnect */ + IW_CM_STATE_DESTROYING /* object being deleted */ +}; + +struct iwcm_id_private { + struct iw_cm_id id; + enum iw_cm_state state; + unsigned long flags; + struct ib_qp *qp; + struct completion destroy_comp; + wait_queue_head_t connect_wait; + struct list_head work_list; + spinlock_t lock; + atomic_t refcount; + struct list_head work_free_list; +}; + +#define IWCM_F_DROP_EVENTS 1 +#define IWCM_F_CONNECT_WAIT 2 + +#endif /* IWCM_H */ diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c new file mode 100644 index 0000000..8861c05 --- /dev/null +++ b/drivers/infiniband/core/iwpm_msg.c @@ -0,0 +1,750 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser"; +static int iwpm_ulib_version = 3; +static int iwpm_user_pid = IWPM_PID_UNDEFINED; +static atomic_t echo_nlmsg_seq; + +int iwpm_valid_pid(void) +{ + return iwpm_user_pid > 0; +} + +/* + * iwpm_register_pid - Send a netlink query to user space + * for the iwarp port mapper pid + * + * nlmsg attributes: + * [IWPM_NLA_REG_PID_SEQ] + * [IWPM_NLA_REG_IF_NAME] + * [IWPM_NLA_REG_IBDEV_NAME] + * [IWPM_NLA_REG_ULIB_NAME] + */ +int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto pid_query_error; + } + if (iwpm_check_registration(nl_client, IWPM_REG_VALID) || + iwpm_user_pid == IWPM_PID_UNAVAILABLE) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto pid_query_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto pid_query_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the pid request message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IFNAMSIZ, + pm_msg->if_name, IWPM_NLA_REG_IF_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE, + pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE, + (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME); + if (ret) + goto pid_query_error; + + nlmsg_end(skb, nlh); + + pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", + __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); + + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNAVAILABLE; + err_str = "Unable to send a nlmsg"; + goto pid_query_error; + } + nlmsg_request->req_buffer = pm_msg; + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +pid_query_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} + +/* + * iwpm_add_mapping - Send a netlink add mapping message + * to the port mapper + * nlmsg 
attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto add_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { + err_str = "Unregistered port mapper client"; + goto add_mapping_error; + } + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto add_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto add_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + /* fill in the add mapping message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto add_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto add_mapping_error; + + nlmsg_end(skb, nlh); + nlmsg_request->req_buffer = pm_msg; + + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto add_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +add_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} + +/* + * iwpm_add_and_query_mapping - Send a netlink add and query + * mapping message to the port mapper + * nlmsg attributes: + * [IWPM_NLA_QUERY_MAPPING_SEQ] + * [IWPM_NLA_QUERY_LOCAL_ADDR] + * [IWPM_NLA_QUERY_REMOTE_ADDR] + */ +int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto query_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { + err_str = "Unregistered port mapper client"; + goto query_mapping_error; + } + ret = -ENOMEM; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto query_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, + nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto query_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the query message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_QUERY_MAPPING_SEQ); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR); + if (ret) + goto query_mapping_error; + ret = 
ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR); + if (ret) + goto query_mapping_error; + + nlmsg_end(skb, nlh); + nlmsg_request->req_buffer = pm_msg; + + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + err_str = "Unable to send a nlmsg"; + goto query_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +query_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} + +/* + * iwpm_remove_mapping - Send a netlink remove mapping message + * to the port mapper + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + err_str = "Invalid port mapper client"; + goto remove_mapping_error; + } + if (!iwpm_valid_pid()) + return 0; + if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) { + err_str = "Unregistered port mapper client"; + goto remove_mapping_error; + } + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to create a nlmsg"; + goto remove_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto remove_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + local_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto remove_mapping_error; + + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto remove_mapping_error; + } + iwpm_print_sockaddr(local_addr, + "remove_mapping: Local sockaddr:"); + return 0; +remove_mapping_error: + pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb_any(skb); + return ret; +} + +/* netlink attribute policy for the received response to register pid request */ +static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { + [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING, + .len = IWPM_DEVNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 }, + [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_register_pid_cb - Process a port mapper response to + * iwpm_register_pid() + */ +int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX]; + struct iwpm_dev_data *pm_msg; + char *dev_name, *iwpm_name; + u32 msg_seq; + u8 nl_client; + u16 iwpm_version; + const char *msg_type = "Register Pid response"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX, + resp_reg_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]); + nlmsg_request = 
iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + nl_client = nlmsg_request->nl_client; + dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]); + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]); + + /* check device name, ulib name and version */ + if (strcmp(pm_msg->dev_name, dev_name) || + strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version != iwpm_ulib_version) { + + pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", + __func__, dev_name, iwpm_name, iwpm_version); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto register_pid_response_exit; + } + iwpm_user_pid = cb->nlh->nlmsg_pid; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_user_pid); + if (iwpm_valid_client(nl_client)) + iwpm_set_registration(nl_client, IWPM_REG_VALID); +register_pid_response_exit: + nlmsg_request->request_done = 1; + /* always for found nlmsg_request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + up(&nlmsg_request->sem); + return 0; +} + +/* netlink attribute policy for the received response to add mapping request */ +static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { + [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_add_mapping_cb - Process a port mapper response to + * iwpm_add_mapping() + */ +int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr; + struct sockaddr_storage *mapped_sockaddr; + const char *msg_type; + u32 msg_seq; + + msg_type = "Add Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX, + resp_add_policy, nltb, msg_type)) + return -EINVAL; + + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + mapped_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]); + + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr, + sizeof(*mapped_sockaddr)); + iwpm_print_sockaddr(&pm_msg->loc_addr, + "add_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "add_mapping: Mapped local sockaddr:"); + +add_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always 
for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + up(&nlmsg_request->sem); + return 0; +} + +/* netlink attribute policy for the response to add and query mapping request + * and response with remote address info */ +static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { + [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/* + * iwpm_add_and_query_mapping_cb - Process a port mapper response to + * iwpm_add_and_query_mapping() + */ +int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr, *remote_sockaddr; + struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr; + const char *msg_type; + u32 msg_seq; + u16 err_code; + + msg_type = "Query Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX, + resp_query_policy, nltb, msg_type)) + return -EINVAL; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + remote_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + mapped_loc_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); + mapped_rem_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]); + + err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]); + if (err_code == IWPM_REMOTE_QUERY_REJECT) { + pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n", + __func__, cb->nlh->nlmsg_pid, msg_seq); + nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT; + } + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) || + iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) { + pr_info("%s: Incorrect local sockaddr\n", __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family || + mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr, + sizeof(*mapped_loc_sockaddr)); + memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr, + sizeof(*mapped_rem_sockaddr)); + + iwpm_print_sockaddr(&pm_msg->loc_addr, + "query_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "query_mapping: Mapped local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->rem_addr, + "query_mapping: Remote sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_rem_addr, + "query_mapping: Mapped remote sockaddr:"); 
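Every request/response pair in iwpm_msg.c above follows the same rendezvous: the sender allocates an iwpm_nlmsg_request keyed by the netlink sequence number and sleeps in iwpm_wait_complete_req(), while the netlink callback looks the pending request up by the echoed sequence number, records the result and wakes the sleeper. Below is a minimal user-space model of that pattern, not part of the patch: POSIX semaphores and a plain array stand in for the kernel's struct semaphore, kref and request list, and all names are hypothetical.

/* Illustrative sketch only -- models the sequence-number rendezvous used by
 * iwpm_wait_complete_req() and the *_cb() handlers above.  Build with
 * "cc -pthread"; everything here is a made-up stand-in. */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <time.h>

struct fake_request {
        unsigned int seq;       /* echoed back by the responder */
        int err_code;           /* filled in by the responder   */
        sem_t sem;              /* requester sleeps here        */
};

static struct fake_request req_table[4];

static struct fake_request *find_request(unsigned int seq)
{
        for (size_t i = 0; i < 4; i++)
                if (req_table[i].seq == seq)
                        return &req_table[i];
        return NULL;
}

/* Plays the role of a callback such as iwpm_add_mapping_cb(). */
static void *responder(void *arg)
{
        struct fake_request *req = find_request(*(unsigned int *)arg);

        if (req) {
                req->err_code = 0;      /* "request_done" with no error */
                sem_post(&req->sem);    /* kernel side: up(&req->sem)   */
        }
        return NULL;
}

int main(void)
{
        unsigned int seq = 7;
        struct fake_request *req = &req_table[0];
        struct timespec ts;
        pthread_t thr;

        req->seq = seq;
        sem_init(&req->sem, 0, 0);      /* starts held, like sema_init() + down() */

        pthread_create(&thr, NULL, responder, &seq);

        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += 10;                /* IWPM_NL_TIMEOUT is 10*HZ */
        if (sem_timedwait(&req->sem, &ts))      /* down_timeout() analogue */
                printf("request %u timed out\n", seq);
        else
                printf("request %u completed, err = %d\n", seq, req->err_code);

        pthread_join(thr, NULL);
        return 0;
}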
+query_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + up(&nlmsg_request->sem); + return 0; +} + +/* + * iwpm_remote_info_cb - Process a port mapper message, containing + * the remote connecting peer address info + */ +int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr, *remote_sockaddr; + struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr; + struct iwpm_remote_info *rem_info; + const char *msg_type; + u8 nl_client; + int ret = -EINVAL; + + msg_type = "Remote Mapping info"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX, + resp_query_policy, nltb, msg_type)) + return ret; + + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid port mapper client = %d\n", + __func__, nl_client); + return ret; + } + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + remote_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + mapped_loc_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); + mapped_rem_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]); + + if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family || + mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + return ret; + } + rem_info = kzalloc(sizeof(struct iwpm_remote_info), GFP_ATOMIC); + if (!rem_info) { + ret = -ENOMEM; + return ret; + } + memcpy(&rem_info->mapped_loc_sockaddr, mapped_loc_sockaddr, + sizeof(struct sockaddr_storage)); + memcpy(&rem_info->remote_sockaddr, remote_sockaddr, + sizeof(struct sockaddr_storage)); + memcpy(&rem_info->mapped_rem_sockaddr, mapped_rem_sockaddr, + sizeof(struct sockaddr_storage)); + rem_info->nl_client = nl_client; + + iwpm_add_remote_info(rem_info); + + iwpm_print_sockaddr(local_sockaddr, + "remote_info: Local sockaddr:"); + iwpm_print_sockaddr(mapped_loc_sockaddr, + "remote_info: Mapped local sockaddr:"); + iwpm_print_sockaddr(remote_sockaddr, + "remote_info: Remote sockaddr:"); + iwpm_print_sockaddr(mapped_rem_sockaddr, + "remote_info: Mapped remote sockaddr:"); + return ret; +} + +/* netlink attribute policy for the received request for mapping info */ +static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { + [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } +}; + +/* + * iwpm_mapping_info_cb - Process a port mapper request for mapping info + */ +int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX]; + const char *msg_type = "Mapping Info response"; + u8 nl_client; + char *iwpm_name; + u16 iwpm_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX, + resp_mapinfo_policy, nltb, msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); + if (strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version != iwpm_ulib_version) { + 
pr_info("%s: Invalid port mapper name = %s version = %d\n", + __func__, iwpm_name, iwpm_version); + return ret; + } + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid port mapper client = %d\n", + __func__, nl_client); + return ret; + } + iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + iwpm_user_pid = cb->nlh->nlmsg_pid; + if (!iwpm_mapinfo_available()) + return 0; + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_user_pid); + ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid); + return ret; +} + +/* netlink attribute policy for the received mapping info ack */ +static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { + [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } +}; + +/* + * iwpm_ack_mapping_info_cb - Process a port mapper ack for + * the provided mapping info records + */ +int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX]; + u32 mapinfo_send, mapinfo_ack; + const char *msg_type = "Mapping Info Ack"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX, + ack_mapinfo_policy, nltb, msg_type)) + return -EINVAL; + mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]); + mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]); + if (mapinfo_ack != mapinfo_send) + pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n", + __func__, mapinfo_send, mapinfo_ack); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + return 0; +} + +/* netlink attribute policy for the received port mapper error message */ +static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { + [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, +}; + +/* + * iwpm_mapping_error_cb - Process a port mapper error message + */ +int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + struct nlattr *nltb[IWPM_NLA_ERR_MAX]; + u32 msg_seq; + u16 err_code; + const char *msg_type = "Mapping Error Msg"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX, + map_error_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]); + err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]); + pr_info("%s: Received msg seq = %u err code = %u client = %d\n", + __func__, msg_seq, err_code, nl_client); + /* look for nlmsg_request */ + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + /* not all errors have associated requests */ + pr_debug("Could not find matching req (seq = %u)\n", msg_seq); + return 0; + } + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + nlmsg_request->err_code = err_code; + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + up(&nlmsg_request->sem); + return 0; +} diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c new file mode 100644 index 0000000..da12da1 --- /dev/null +++ b/drivers/infiniband/core/iwpm_util.c @@ -0,0 +1,754 @@ +/* + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +#define IWPM_MAPINFO_HASH_SIZE 512 +#define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1) +#define IWPM_REMINFO_HASH_SIZE 64 +#define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1) +#define IWPM_MSG_SIZE 512 + +static LIST_HEAD(iwpm_nlmsg_req_list); +static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); + +static struct hlist_head *iwpm_hash_bucket; +static DEFINE_SPINLOCK(iwpm_mapinfo_lock); + +static struct hlist_head *iwpm_reminfo_bucket; +static DEFINE_SPINLOCK(iwpm_reminfo_lock); + +static DEFINE_MUTEX(iwpm_admin_lock); +static struct iwpm_admin_data iwpm_admin; + +int iwpm_init(u8 nl_client) +{ + int ret = 0; + mutex_lock(&iwpm_admin_lock); + if (atomic_read(&iwpm_admin.refcount) == 0) { + iwpm_hash_bucket = kzalloc(IWPM_MAPINFO_HASH_SIZE * + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_hash_bucket) { + ret = -ENOMEM; + goto init_exit; + } + iwpm_reminfo_bucket = kzalloc(IWPM_REMINFO_HASH_SIZE * + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_reminfo_bucket) { + kfree(iwpm_hash_bucket); + ret = -ENOMEM; + goto init_exit; + } + } + atomic_inc(&iwpm_admin.refcount); +init_exit: + mutex_unlock(&iwpm_admin_lock); + if (!ret) { + iwpm_set_valid(nl_client, 1); + iwpm_set_registration(nl_client, IWPM_REG_UNDEF); + pr_debug("%s: Mapinfo and reminfo tables are created\n", + __func__); + } + return ret; +} + +static void free_hash_bucket(void); +static void free_reminfo_bucket(void); + +int iwpm_exit(u8 nl_client) +{ + + if (!iwpm_valid_client(nl_client)) + return -EINVAL; + mutex_lock(&iwpm_admin_lock); + if (atomic_read(&iwpm_admin.refcount) == 0) { + mutex_unlock(&iwpm_admin_lock); + pr_err("%s Incorrect usage - negative refcount\n", __func__); + return -EINVAL; + } + if (atomic_dec_and_test(&iwpm_admin.refcount)) { + free_hash_bucket(); + free_reminfo_bucket(); + pr_debug("%s: Resources are destroyed\n", __func__); + } + mutex_unlock(&iwpm_admin_lock); + iwpm_set_valid(nl_client, 0); + iwpm_set_registration(nl_client, IWPM_REG_UNDEF); + return 0; +} + +static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, + struct sockaddr_storage *); + +int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, + struct 
sockaddr_storage *mapped_sockaddr, + u8 nl_client) +{ + struct hlist_head *hash_bucket_head = NULL; + struct iwpm_mapping_info *map_info; + unsigned long flags; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) + return ret; + map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); + if (!map_info) + return -ENOMEM; + + memcpy(&map_info->local_sockaddr, local_sockaddr, + sizeof(struct sockaddr_storage)); + memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, + sizeof(struct sockaddr_storage)); + map_info->nl_client = nl_client; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_mapinfo_hash_bucket( + &map_info->local_sockaddr, + &map_info->mapped_sockaddr); + if (hash_bucket_head) { + hlist_add_head(&map_info->hlist_node, hash_bucket_head); + ret = 0; + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + + if (!hash_bucket_head) + kfree(map_info); + return ret; +} + +int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_local_addr) +{ + struct hlist_node *tmp_hlist_node; + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info = NULL; + unsigned long flags; + int ret = -EINVAL; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_mapinfo_hash_bucket( + local_sockaddr, + mapped_local_addr); + if (!hash_bucket_head) + goto remove_mapinfo_exit; + + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + hash_bucket_head, hlist_node) { + + if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, + mapped_local_addr)) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + ret = 0; + break; + } + } + } +remove_mapinfo_exit: + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return ret; +} + +static void free_hash_bucket(void) +{ + struct hlist_node *tmp_hlist_node; + struct iwpm_mapping_info *map_info; + unsigned long flags; + int i; + + /* remove all the mapinfo data from the list */ + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + &iwpm_hash_bucket[i], hlist_node) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + } + } + /* free the hash list */ + kfree(iwpm_hash_bucket); + iwpm_hash_bucket = NULL; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +} + +static void free_reminfo_bucket(void) +{ + struct hlist_node *tmp_hlist_node; + struct iwpm_remote_info *rem_info; + unsigned long flags; + int i; + + /* remove all the remote info from the list */ + spin_lock_irqsave(&iwpm_reminfo_lock, flags); + for (i = 0; i < IWPM_REMINFO_HASH_SIZE; i++) { + hlist_for_each_entry_safe(rem_info, tmp_hlist_node, + &iwpm_reminfo_bucket[i], hlist_node) { + + hlist_del_init(&rem_info->hlist_node); + kfree(rem_info); + } + } + /* free the hash list */ + kfree(iwpm_reminfo_bucket); + iwpm_reminfo_bucket = NULL; + spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); +} + +static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *, + struct sockaddr_storage *); + +void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) +{ + struct hlist_head *hash_bucket_head; + unsigned long flags; + + spin_lock_irqsave(&iwpm_reminfo_lock, flags); + if (iwpm_reminfo_bucket) { + hash_bucket_head = get_reminfo_hash_bucket( + &rem_info->mapped_loc_sockaddr, + &rem_info->mapped_rem_sockaddr); + if (hash_bucket_head) + hlist_add_head(&rem_info->hlist_node, hash_bucket_head); + } + 
spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); +} + +int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, + struct sockaddr_storage *mapped_rem_addr, + struct sockaddr_storage *remote_addr, + u8 nl_client) +{ + struct hlist_node *tmp_hlist_node; + struct hlist_head *hash_bucket_head; + struct iwpm_remote_info *rem_info = NULL; + unsigned long flags; + int ret = -EINVAL; + + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid client = %d\n", __func__, nl_client); + return ret; + } + spin_lock_irqsave(&iwpm_reminfo_lock, flags); + if (iwpm_reminfo_bucket) { + hash_bucket_head = get_reminfo_hash_bucket( + mapped_loc_addr, + mapped_rem_addr); + if (!hash_bucket_head) + goto get_remote_info_exit; + hlist_for_each_entry_safe(rem_info, tmp_hlist_node, + hash_bucket_head, hlist_node) { + + if (!iwpm_compare_sockaddr(&rem_info->mapped_loc_sockaddr, + mapped_loc_addr) && + !iwpm_compare_sockaddr(&rem_info->mapped_rem_sockaddr, + mapped_rem_addr)) { + + memcpy(remote_addr, &rem_info->remote_sockaddr, + sizeof(struct sockaddr_storage)); + iwpm_print_sockaddr(remote_addr, + "get_remote_info: Remote sockaddr:"); + + hlist_del_init(&rem_info->hlist_node); + kfree(rem_info); + ret = 0; + break; + } + } + } +get_remote_info_exit: + spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); + return ret; +} + +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + unsigned long flags; + + nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); + if (!nlmsg_request) + return NULL; + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + kref_init(&nlmsg_request->kref); + kref_get(&nlmsg_request->kref); + nlmsg_request->nlmsg_seq = nlmsg_seq; + nlmsg_request->nl_client = nl_client; + nlmsg_request->request_done = 0; + nlmsg_request->err_code = 0; + sema_init(&nlmsg_request->sem, 1); + down(&nlmsg_request->sem); + return nlmsg_request; +} + +void iwpm_free_nlmsg_request(struct kref *kref) +{ + struct iwpm_nlmsg_request *nlmsg_request; + unsigned long flags; + + nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref); + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_del_init(&nlmsg_request->inprocess_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + if (!nlmsg_request->request_done) + pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n", + __func__, nlmsg_request->nlmsg_seq); + kfree(nlmsg_request); +} + +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) +{ + struct iwpm_nlmsg_request *nlmsg_request; + struct iwpm_nlmsg_request *found_request = NULL; + unsigned long flags; + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list, + inprocess_list) { + if (nlmsg_request->nlmsg_seq == echo_seq) { + found_request = nlmsg_request; + kref_get(&nlmsg_request->kref); + break; + } + } + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + return found_request; +} + +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) +{ + int ret; + + ret = down_timeout(&nlmsg_request->sem, IWPM_NL_TIMEOUT); + if (ret) { + ret = -EINVAL; + pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", + __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); + } else { + ret = nlmsg_request->err_code; + } + kref_put(&nlmsg_request->kref, 
iwpm_free_nlmsg_request); + return ret; +} + +int iwpm_get_nlmsg_seq(void) +{ + return atomic_inc_return(&iwpm_admin.nlmsg_seq); +} + +int iwpm_valid_client(u8 nl_client) +{ + return iwpm_admin.client_list[nl_client]; +} + +void iwpm_set_valid(u8 nl_client, int valid) +{ + iwpm_admin.client_list[nl_client] = valid; +} + +/* valid client */ +u32 iwpm_get_registration(u8 nl_client) +{ + return iwpm_admin.reg_list[nl_client]; +} + +/* valid client */ +void iwpm_set_registration(u8 nl_client, u32 reg) +{ + iwpm_admin.reg_list[nl_client] = reg; +} + +/* valid client */ +u32 iwpm_check_registration(u8 nl_client, u32 reg) +{ + return (iwpm_get_registration(nl_client) & reg); +} + +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr) +{ + if (a_sockaddr->ss_family != b_sockaddr->ss_family) + return 1; + if (a_sockaddr->ss_family == AF_INET) { + struct sockaddr_in *a4_sockaddr = + (struct sockaddr_in *)a_sockaddr; + struct sockaddr_in *b4_sockaddr = + (struct sockaddr_in *)b_sockaddr; + if (!memcmp(&a4_sockaddr->sin_addr, + &b4_sockaddr->sin_addr, sizeof(struct in_addr)) + && a4_sockaddr->sin_port == b4_sockaddr->sin_port) + return 0; + + } else if (a_sockaddr->ss_family == AF_INET6) { + struct sockaddr_in6 *a6_sockaddr = + (struct sockaddr_in6 *)a_sockaddr; + struct sockaddr_in6 *b6_sockaddr = + (struct sockaddr_in6 *)b_sockaddr; + if (!memcmp(&a6_sockaddr->sin6_addr, + &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) + && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) + return 0; + + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + } + return 1; +} + +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client) +{ + struct sk_buff *skb = NULL; + + skb = dev_alloc_skb(IWPM_MSG_SIZE); + if (!skb) + goto create_nlmsg_exit; + + if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, + NLM_F_REQUEST))) { + pr_warn("%s: Unable to put the nlmsg header\n", __func__); + dev_kfree_skb(skb); + skb = NULL; + } +create_nlmsg_exit: + return skb; +} + +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type) +{ + int nlh_len = 0; + int ret; + const char *err_str = ""; + + ret = nlmsg_validate(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy, + NULL); + if (ret) { + err_str = "Invalid attribute"; + goto parse_nlmsg_error; + } + ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max - 1, + nlmsg_policy, NULL); + if (ret) { + err_str = "Unable to parse the nlmsg"; + goto parse_nlmsg_error; + } + ret = iwpm_validate_nlmsg_attr(nltb, policy_max); + if (ret) { + err_str = "Invalid NULL attribute"; + goto parse_nlmsg_error; + } + return 0; +parse_nlmsg_error: + pr_warn("%s: %s (msg type %s ret = %d)\n", + __func__, err_str, msg_type, ret); + return ret; +} + +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) +{ + struct sockaddr_in6 *sockaddr_v6; + struct sockaddr_in *sockaddr_v4; + + switch (sockaddr->ss_family) { + case AF_INET: + sockaddr_v4 = (struct sockaddr_in *)sockaddr; + pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", + msg, &sockaddr_v4->sin_addr, + ntohs(sockaddr_v4->sin_port), + ntohs(sockaddr_v4->sin_port)); + break; + case AF_INET6: + sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; + pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", + msg, &sockaddr_v6->sin6_addr, + ntohs(sockaddr_v6->sin6_port), + ntohs(sockaddr_v6->sin6_port)); + break; + default: + break; + } +} + +static u32 iwpm_ipv6_jhash(struct sockaddr_in6 
*ipv6_sockaddr) +{ + u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0); + u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0); + return hash; +} + +static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr) +{ + u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0); + u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0); + return hash; +} + +static int get_hash_bucket(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr, u32 *hash) +{ + u32 a_hash, b_hash; + + if (a_sockaddr->ss_family == AF_INET) { + a_hash = iwpm_ipv4_jhash((struct sockaddr_in *) a_sockaddr); + b_hash = iwpm_ipv4_jhash((struct sockaddr_in *) b_sockaddr); + + } else if (a_sockaddr->ss_family == AF_INET6) { + a_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) a_sockaddr); + b_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) b_sockaddr); + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + return -EINVAL; + } + + if (a_hash == b_hash) /* if port mapper isn't available */ + *hash = a_hash; + else + *hash = jhash_2words(a_hash, b_hash, 0); + return 0; +} + +static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage + *local_sockaddr, struct sockaddr_storage + *mapped_sockaddr) +{ + u32 hash; + int ret; + + ret = get_hash_bucket(local_sockaddr, mapped_sockaddr, &hash); + if (ret) + return NULL; + return &iwpm_hash_bucket[hash & IWPM_MAPINFO_HASH_MASK]; +} + +static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage + *mapped_loc_sockaddr, struct sockaddr_storage + *mapped_rem_sockaddr) +{ + u32 hash; + int ret; + + ret = get_hash_bucket(mapped_loc_sockaddr, mapped_rem_sockaddr, &hash); + if (ret) + return NULL; + return &iwpm_reminfo_bucket[hash & IWPM_REMINFO_HASH_MASK]; +} + +static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto mapinfo_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + msg_seq = 0; + err_str = "Unable to put attribute of mapinfo number nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ); + if (ret) + goto mapinfo_num_error; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); + if (ret) + goto mapinfo_num_error; + + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast(skb, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto mapinfo_num_error; + } + pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num); + return 0; +mapinfo_num_error: + pr_info("%s: %s\n", __func__, err_str); + if (skb) + dev_kfree_skb(skb); + return ret; +} + +static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) +{ + struct nlmsghdr *nlh = NULL; + int ret = 0; + + if (!skb) + return ret; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + pr_warn("%s Unable to put NLMSG_DONE\n", __func__); + dev_kfree_skb(skb); + return -ENOMEM; + } + nlh->nlmsg_type = NLMSG_DONE; + ret = rdma_nl_unicast(skb, iwpm_pid); + if (ret) + pr_warn("%s Unable to send a nlmsg\n", __func__); + return ret; +} + +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) +{ + struct iwpm_mapping_info *map_info; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + int skb_num 
= 0, mapping_num = 0; + int i = 0, nlmsg_bytes = 0; + unsigned long flags; + const char *err_str = ""; + int ret; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + skb_num++; + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + ret = -EINVAL; + for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { + hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], + hlist_node) { + if (map_info->nl_client != nl_client) + continue; + nlh = NULL; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + ret = -ENOMEM; + err_str = "Unable to put the nlmsg header"; + goto send_mapping_info_unlock; + } + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->local_sockaddr, + IWPM_NLA_MAPINFO_LOCAL_ADDR); + if (ret) + goto send_mapping_info_unlock; + + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->mapped_sockaddr, + IWPM_NLA_MAPINFO_MAPPED_ADDR); + if (ret) + goto send_mapping_info_unlock; + + nlmsg_end(skb, nlh); + + iwpm_print_sockaddr(&map_info->local_sockaddr, + "send_mapping_info: Local sockaddr:"); + iwpm_print_sockaddr(&map_info->mapped_sockaddr, + "send_mapping_info: Mapped local sockaddr:"); + mapping_num++; + nlmsg_bytes += nlh->nlmsg_len; + + /* check if all mappings can fit in one skb */ + if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) { + /* and leave room for NLMSG_DONE */ + nlmsg_bytes = 0; + skb_num++; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, + flags); + /* send the skb */ + ret = send_nlmsg_done(skb, nl_client, iwpm_pid); + skb = NULL; + if (ret) { + err_str = "Unable to send map info"; + goto send_mapping_info_exit; + } + if (skb_num == IWPM_MAPINFO_SKB_COUNT) { + ret = -ENOMEM; + err_str = "Insufficient skbs for map info"; + goto send_mapping_info_exit; + } + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + } + } + } +send_mapping_info_unlock: + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +send_mapping_info_exit: + if (ret) { + pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); + if (skb) + dev_kfree_skb(skb); + return ret; + } + send_nlmsg_done(skb, nl_client, iwpm_pid); + return send_mapinfo_num(mapping_num, nl_client, iwpm_pid); +} + +int iwpm_mapinfo_available(void) +{ + unsigned long flags; + int full_bucket = 0, i = 0; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { + if (!hlist_empty(&iwpm_hash_bucket[i])) { + full_bucket = 1; + break; + } + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return full_bucket; +} diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h new file mode 100644 index 0000000..af1fc14 --- /dev/null +++ b/drivers/infiniband/core/iwpm_util.h @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
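The batching loop in iwpm_send_mapinfo() above packs mapping records into one NLMSG_GOODSIZE skb and flushes it as soon as less than two records' worth of room remains, which guarantees the NLMSG_DONE trailer still fits. A tiny stand-alone model of that flush rule follows; it is not part of the patch, and the buffer and record sizes are made-up stand-ins for NLMSG_GOODSIZE and a mapinfo nlmsg.

/* Illustrative sketch only -- models the "flush when fewer than two records
 * fit" rule of iwpm_send_mapinfo(). */
#include <stdio.h>

#define BUF_GOODSIZE    4096    /* stand-in for NLMSG_GOODSIZE    */
#define RECORD_LEN      200     /* stand-in for one mapinfo nlmsg */
#define NUM_RECORDS     100

int main(void)
{
        int used = 0;           /* mirrors nlmsg_bytes */
        int buffers = 1;        /* mirrors skb_num     */
        int i;

        for (i = 0; i < NUM_RECORDS; i++) {
                used += RECORD_LEN;     /* record appended to the buffer */

                /* Not enough room for two more records?  Flush now so a
                 * NLMSG_DONE trailer is guaranteed to fit. */
                if (BUF_GOODSIZE - used < RECORD_LEN * 2) {
                        printf("flushing buffer %d after %d bytes\n",
                               buffers, used);
                        used = 0;
                        buffers++;
                }
        }
        printf("sent %d records in %d buffers\n", NUM_RECORDS, buffers);
        return 0;
}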
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IWPM_UTIL_H +#define _IWPM_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define IWPM_NL_RETRANS 3 +#define IWPM_NL_TIMEOUT (10*HZ) +#define IWPM_MAPINFO_SKB_COUNT 20 + +#define IWPM_PID_UNDEFINED -1 +#define IWPM_PID_UNAVAILABLE -2 + +#define IWPM_REG_UNDEF 0x01 +#define IWPM_REG_VALID 0x02 +#define IWPM_REG_INCOMPL 0x04 + +struct iwpm_nlmsg_request { + struct list_head inprocess_list; + __u32 nlmsg_seq; + void *req_buffer; + u8 nl_client; + u8 request_done; + u16 err_code; + struct semaphore sem; + struct kref kref; +}; + +struct iwpm_mapping_info { + struct hlist_node hlist_node; + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + u8 nl_client; +}; + +struct iwpm_remote_info { + struct hlist_node hlist_node; + struct sockaddr_storage remote_sockaddr; + struct sockaddr_storage mapped_loc_sockaddr; + struct sockaddr_storage mapped_rem_sockaddr; + u8 nl_client; +}; + +struct iwpm_admin_data { + atomic_t refcount; + atomic_t nlmsg_seq; + int client_list[RDMA_NL_NUM_CLIENTS]; + u32 reg_list[RDMA_NL_NUM_CLIENTS]; +}; + +/** + * iwpm_get_nlmsg_request - Allocate and initialize netlink message request + * @nlmsg_seq: Sequence number of the netlink message + * @nl_client: The index of the netlink client + * @gfp: Indicates how the memory for the request should be allocated + * + * Returns the newly allocated netlink request object if successful, + * otherwise returns NULL + */ +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp); + +/** + * iwpm_free_nlmsg_request - Deallocate netlink message request + * @kref: Holds reference of netlink message request + */ +void iwpm_free_nlmsg_request(struct kref *kref); + +/** + * iwpm_find_nlmsg_request - Find netlink message request in the request list + * @echo_seq: Sequence number of the netlink request to find + * + * Returns the found netlink message request, + * if not found, returns NULL + */ +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq); + +/** + * iwpm_wait_complete_req - Block while servicing the netlink request + * @nlmsg_request: Netlink message request to service + * + * Wakes up, after 
the request is completed or expired + * Returns 0 if the request is complete without error + */ +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request); + +/** + * iwpm_get_nlmsg_seq - Get the sequence number for a netlink + * message to send to the port mapper + * + * Returns the sequence number for the netlink message. + */ +int iwpm_get_nlmsg_seq(void); + +/** + * iwpm_add_reminfo - Add remote address info of the connecting peer + * to the remote info hash table + * @reminfo: The remote info to be added + */ +void iwpm_add_remote_info(struct iwpm_remote_info *reminfo); + +/** + * iwpm_valid_client - Check if the port mapper client is valid + * @nl_client: The index of the netlink client + * + * Valid clients need to call iwpm_init() before using + * the port mapper + */ +int iwpm_valid_client(u8 nl_client); + +/** + * iwpm_set_valid - Set the port mapper client to valid or not + * @nl_client: The index of the netlink client + * @valid: 1 if valid or 0 if invalid + */ +void iwpm_set_valid(u8 nl_client, int valid); + +/** + * iwpm_check_registration - Check if the client registration + * matches the given one + * @nl_client: The index of the netlink client + * @reg: The given registration type to compare with + * + * Call iwpm_register_pid() to register a client + * Returns true if the client registration matches reg, + * otherwise returns false + */ +u32 iwpm_check_registration(u8 nl_client, u32 reg); + +/** + * iwpm_set_registration - Set the client registration + * @nl_client: The index of the netlink client + * @reg: Registration type to set + */ +void iwpm_set_registration(u8 nl_client, u32 reg); + +/** + * iwpm_get_registration + * @nl_client: The index of the netlink client + * + * Returns the client registration type + */ +u32 iwpm_get_registration(u8 nl_client); + +/** + * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of + * a client to the user space port mapper + * @nl_client: The index of the netlink client + * @iwpm_pid: The pid of the user space port mapper + * + * If successful, returns the number of sent mapping info records + */ +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid); + +/** + * iwpm_mapinfo_available - Check if any mapping info records is available + * in the hash table + * + * Returns 1 if mapping information is available, otherwise returns 0 + */ +int iwpm_mapinfo_available(void); + +/** + * iwpm_compare_sockaddr - Compare two sockaddr storage structs + * + * Returns 0 if they are holding the same ip/tcp address info, + * otherwise returns 1 + */ +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr); + +/** + * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes + * @nltb: Holds address of each netlink message attributes + * @nla_count: Number of netlink message attributes + * + * Returns error if any of the nla_count attributes is NULL + */ +static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[], + int nla_count) +{ + int i; + for (i = 1; i < nla_count; i++) { + if (!nltb[i]) + return -EINVAL; + } + return 0; +} + +/** + * iwpm_create_nlmsg - Allocate skb and form a netlink message + * @nl_op: Netlink message opcode + * @nlh: Holds address of the netlink message header in skb + * @nl_client: The index of the netlink client + * + * Returns the newly allcated skb, or NULL if the tailroom of the skb + * is insufficient to store the message header and payload + */ +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client); + +/** + * 
iwpm_parse_nlmsg - Validate and parse the received netlink message + * @cb: Netlink callback structure + * @policy_max: Maximum attribute type to be expected + * @nlmsg_policy: Validation policy + * @nltb: Array to store policy_max parsed elements + * @msg_type: Type of netlink message + * + * Returns 0 on success or a negative error code + */ +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type); + +/** + * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port + * @sockaddr: Socket address to print + * @msg: Message to print + */ +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); +#endif diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c new file mode 100644 index 0000000..b28452a --- /dev/null +++ b/drivers/infiniband/core/mad.c @@ -0,0 +1,3350 @@ +/* + * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
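The kernel-doc block in iwpm_util.h above describes the port-mapper helpers piece by piece; seen end to end, a client uses them as a register/map/record/remove lifecycle. The following kernel-style sketch shows that lifecycle in one place. It is not part of the patch: the function and device names are made up, error unwinding is trimmed, init/exit normally live in the client's module init/exit paths, and it assumes the iwpm_* declarations that mainline exports via rdma/iw_portmap.h.

/* Illustrative sketch only -- a hypothetical consumer of the iwpm_* API
 * documented above; the iwcm core follows essentially this sequence when
 * mapping a connection. */
#include <linux/string.h>
#include <rdma/iw_portmap.h>
#include <rdma/rdma_netlink.h>

static int example_map_connection(struct sockaddr_storage *laddr,
                                  struct sockaddr_storage *raddr,
                                  struct sockaddr_storage *mapped_laddr)
{
        struct iwpm_dev_data pm_reg = { };
        struct iwpm_sa_data pm_msg = { };
        int ret;

        /* module init path: create the mapinfo/reminfo tables */
        ret = iwpm_init(RDMA_NL_IWCM);
        if (ret)
                return ret;

        /* ask user space for the port mapper pid (blocks until answered) */
        strscpy(pm_reg.dev_name, "hypothetical0", sizeof(pm_reg.dev_name));
        strscpy(pm_reg.if_name, "eth0", sizeof(pm_reg.if_name));
        iwpm_register_pid(&pm_reg, RDMA_NL_IWCM);

        /* connection setup: map the local address, learn the peer mapping */
        pm_msg.loc_addr = *laddr;
        pm_msg.rem_addr = *raddr;
        ret = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_IWCM);
        if (!ret) {
                *mapped_laddr = pm_msg.mapped_loc_addr;
                /* record the pair so it can be resent if the daemon restarts */
                iwpm_create_mapinfo(laddr, mapped_laddr, RDMA_NL_IWCM);
        }

        /* disconnect / module exit path */
        iwpm_remove_mapping(laddr, RDMA_NL_IWCM);
        iwpm_remove_mapinfo(laddr, mapped_laddr);
        iwpm_exit(RDMA_NL_IWCM);
        return ret;
}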
+ * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/dma-mapping.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/security.h> +#include <rdma/ib_cache.h> + +#include "mad_priv.h" +#include "core_priv.h" +#include "mad_rmpp.h" +#include "smi.h" +#include "opa_smi.h" +#include "agent.h" + +static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; +static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; + +module_param_named(send_queue_size, mad_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests"); +module_param_named(recv_queue_size, mad_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); + +static struct list_head ib_mad_port_list; +static atomic_t ib_mad_client_id = ATOMIC_INIT(0); + +/* Port list lock */ +static DEFINE_SPINLOCK(ib_mad_port_list_lock); + +/* Forward declarations */ +static int method_in_use(struct ib_mad_mgmt_method_table **method, + struct ib_mad_reg_req *mad_reg_req); +static void remove_mad_reg_req(struct ib_mad_agent_private *priv); +static struct ib_mad_agent_private *find_mad_agent( + struct ib_mad_port_private *port_priv, + const struct ib_mad_hdr *mad); +static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, + struct ib_mad_private *mad); +static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv); +static void timeout_sends(struct work_struct *work); +static void local_completions(struct work_struct *work); +static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv, + u8 mgmt_class); +static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv); +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc); +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); + +/* + * Returns an ib_mad_port_private structure or NULL for a device/port + * Assumes ib_mad_port_list_lock is being held + */ +static inline struct ib_mad_port_private * +__ib_get_mad_port(struct ib_device *device, int port_num) +{ + struct ib_mad_port_private *entry; + + list_for_each_entry(entry, &ib_mad_port_list, port_list) { + if (entry->device == device && entry->port_num == port_num) + return entry; + } + return NULL; +} + +/* + * Wrapper function to return an ib_mad_port_private structure or NULL + * for a device/port + */ +static inline struct ib_mad_port_private * +ib_get_mad_port(struct ib_device *device, int port_num) +{ + struct ib_mad_port_private *entry; + unsigned long flags; + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + entry = __ib_get_mad_port(device, port_num); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + return entry; +} + +static inline u8 convert_mgmt_class(u8 mgmt_class) +{ + /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */ + return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ?
+ 0 : mgmt_class; +} + +static int get_spl_qp_index(enum ib_qp_type qp_type) +{ + switch (qp_type) + { + case IB_QPT_SMI: + return 0; + case IB_QPT_GSI: + return 1; + default: + return -1; + } +} + +static int vendor_class_index(u8 mgmt_class) +{ + return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START; +} + +static int is_vendor_class(u8 mgmt_class) +{ + if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) || + (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END)) + return 0; + return 1; +} + +static int is_vendor_oui(char *oui) +{ + if (oui[0] || oui[1] || oui[2]) + return 1; + return 0; +} + +static int is_vendor_method_in_use( + struct ib_mad_mgmt_vendor_class *vendor_class, + struct ib_mad_reg_req *mad_reg_req) +{ + struct ib_mad_mgmt_method_table *method; + int i; + + for (i = 0; i < MAX_MGMT_OUI; i++) { + if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) { + method = vendor_class->method_table[i]; + if (method) { + if (method_in_use(&method, mad_reg_req)) + return 1; + else + break; + } + } + } + return 0; +} + +int ib_response_mad(const struct ib_mad_hdr *hdr) +{ + return ((hdr->method & IB_MGMT_METHOD_RESP) || + (hdr->method == IB_MGMT_METHOD_TRAP_REPRESS) || + ((hdr->mgmt_class == IB_MGMT_CLASS_BM) && + (hdr->attr_mod & IB_BM_ATTR_MOD_RESP))); +} +EXPORT_SYMBOL(ib_response_mad); + +/* + * ib_register_mad_agent - Register to send/receive MADs + */ +struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, + u8 port_num, + enum ib_qp_type qp_type, + struct ib_mad_reg_req *mad_reg_req, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context, + u32 registration_flags) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_agent *ret = ERR_PTR(-EINVAL); + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_reg_req *reg_req = NULL; + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_vendor_class_table *vendor; + struct ib_mad_mgmt_vendor_class *vendor_class; + struct ib_mad_mgmt_method_table *method; + int ret2, qpn; + unsigned long flags; + u8 mgmt_class, vclass; + + /* Validate parameters */ + qpn = get_spl_qp_index(qp_type); + if (qpn == -1) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid QP Type %d\n", + qp_type); + goto error1; + } + + if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid RMPP Version %u\n", + rmpp_version); + goto error1; + } + + /* Validate MAD registration request if supplied */ + if (mad_reg_req) { + if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid Class Version %u\n", + mad_reg_req->mgmt_class_version); + goto error1; + } + if (!recv_handler) { + dev_notice(&device->dev, + "ib_register_mad_agent: no recv_handler\n"); + goto error1; + } + if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { + /* + * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only + * one in this range currently allowed + */ + if (mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n", + mad_reg_req->mgmt_class); + goto error1; + } + } else if (mad_reg_req->mgmt_class == 0) { + /* + * Class 0 is reserved in IBA and is used for + * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + */ + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0\n"); + goto error1; + } else if (is_vendor_class(mad_reg_req->mgmt_class)) { + /* + * If class is in "new" vendor range, + * 
ensure supplied OUI is not zero + */ + if (!is_vendor_oui(mad_reg_req->oui)) { + dev_notice(&device->dev, + "ib_register_mad_agent: No OUI specified for class 0x%x\n", + mad_reg_req->mgmt_class); + goto error1; + } + } + /* Make sure class supplied is consistent with RMPP */ + if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { + if (rmpp_version) { + dev_notice(&device->dev, + "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n", + mad_reg_req->mgmt_class); + goto error1; + } + } + + /* Make sure class supplied is consistent with QP type */ + if (qp_type == IB_QPT_SMI) { + if ((mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_LID_ROUTED) && + (mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n", + mad_reg_req->mgmt_class); + goto error1; + } + } else { + if ((mad_reg_req->mgmt_class == + IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad_reg_req->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n", + mad_reg_req->mgmt_class); + goto error1; + } + } + } else { + /* No registration request supplied */ + if (!send_handler) + goto error1; + if (registration_flags & IB_MAD_USER_RMPP) + goto error1; + } + + /* Validate device and port */ + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid port %d\n", + port_num); + ret = ERR_PTR(-ENODEV); + goto error1; + } + + /* Verify the QP requested is supported. For example, Ethernet devices + * will not have QP0 */ + if (!port_priv->qp_info[qpn].qp) { + dev_notice(&device->dev, + "ib_register_mad_agent: QP %d not supported\n", qpn); + ret = ERR_PTR(-EPROTONOSUPPORT); + goto error1; + } + + /* Allocate structures */ + mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); + if (!mad_agent_priv) { + ret = ERR_PTR(-ENOMEM); + goto error1; + } + + if (mad_reg_req) { + reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL); + if (!reg_req) { + ret = ERR_PTR(-ENOMEM); + goto error3; + } + } + + /* Now, fill in the various structures */ + mad_agent_priv->qp_info = &port_priv->qp_info[qpn]; + mad_agent_priv->reg_req = reg_req; + mad_agent_priv->agent.rmpp_version = rmpp_version; + mad_agent_priv->agent.device = device; + mad_agent_priv->agent.recv_handler = recv_handler; + mad_agent_priv->agent.send_handler = send_handler; + mad_agent_priv->agent.context = context; + mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp; + mad_agent_priv->agent.port_num = port_num; + mad_agent_priv->agent.flags = registration_flags; + spin_lock_init(&mad_agent_priv->lock); + INIT_LIST_HEAD(&mad_agent_priv->send_list); + INIT_LIST_HEAD(&mad_agent_priv->wait_list); + INIT_LIST_HEAD(&mad_agent_priv->done_list); + INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); + INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); + INIT_LIST_HEAD(&mad_agent_priv->local_list); + INIT_WORK(&mad_agent_priv->local_work, local_completions); + atomic_set(&mad_agent_priv->refcount, 1); + init_completion(&mad_agent_priv->comp); + + ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type); + if (ret2) { + ret = ERR_PTR(ret2); + goto error4; + } + + spin_lock_irqsave(&port_priv->reg_lock, flags); + mad_agent_priv->agent.hi_tid = atomic_inc_return(&ib_mad_client_id); + + /* + * Make sure MAD registration (if supplied) + * is non overlapping with any existing ones + */ + if (mad_reg_req) { + mgmt_class = 
convert_mgmt_class(mad_reg_req->mgmt_class); + if (!is_vendor_class(mgmt_class)) { + class = port_priv->version[mad_reg_req-> + mgmt_class_version].class; + if (class) { + method = class->method_table[mgmt_class]; + if (method) { + if (method_in_use(&method, + mad_reg_req)) + goto error5; + } + } + ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv, + mgmt_class); + } else { + /* "New" vendor class range */ + vendor = port_priv->version[mad_reg_req-> + mgmt_class_version].vendor; + if (vendor) { + vclass = vendor_class_index(mgmt_class); + vendor_class = vendor->vendor_class[vclass]; + if (vendor_class) { + if (is_vendor_method_in_use( + vendor_class, + mad_reg_req)) + goto error5; + } + } + ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv); + } + if (ret2) { + ret = ERR_PTR(ret2); + goto error5; + } + } + + /* Add mad agent into port's agent list */ + list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list); + spin_unlock_irqrestore(&port_priv->reg_lock, flags); + + return &mad_agent_priv->agent; +error5: + spin_unlock_irqrestore(&port_priv->reg_lock, flags); + ib_mad_agent_security_cleanup(&mad_agent_priv->agent); +error4: + kfree(reg_req); +error3: + kfree(mad_agent_priv); +error1: + return ret; +} +EXPORT_SYMBOL(ib_register_mad_agent); + +static inline int is_snooping_sends(int mad_snoop_flags) +{ + return (mad_snoop_flags & + (/*IB_MAD_SNOOP_POSTED_SENDS | + IB_MAD_SNOOP_RMPP_SENDS |*/ + IB_MAD_SNOOP_SEND_COMPLETIONS /*| + IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/)); +} + +static inline int is_snooping_recvs(int mad_snoop_flags) +{ + return (mad_snoop_flags & + (IB_MAD_SNOOP_RECVS /*| + IB_MAD_SNOOP_RMPP_RECVS*/)); +} + +static int register_snoop_agent(struct ib_mad_qp_info *qp_info, + struct ib_mad_snoop_private *mad_snoop_priv) +{ + struct ib_mad_snoop_private **new_snoop_table; + unsigned long flags; + int i; + + spin_lock_irqsave(&qp_info->snoop_lock, flags); + /* Check for empty slot in array. */ + for (i = 0; i < qp_info->snoop_table_size; i++) + if (!qp_info->snoop_table[i]) + break; + + if (i == qp_info->snoop_table_size) { + /* Grow table. 
*/ + new_snoop_table = krealloc(qp_info->snoop_table, + sizeof mad_snoop_priv * + (qp_info->snoop_table_size + 1), + GFP_ATOMIC); + if (!new_snoop_table) { + i = -ENOMEM; + goto out; + } + + qp_info->snoop_table = new_snoop_table; + qp_info->snoop_table_size++; + } + qp_info->snoop_table[i] = mad_snoop_priv; + atomic_inc(&qp_info->snoop_count); +out: + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); + return i; +} + +struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, + u8 port_num, + enum ib_qp_type qp_type, + int mad_snoop_flags, + ib_mad_snoop_handler snoop_handler, + ib_mad_recv_handler recv_handler, + void *context) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_agent *ret; + struct ib_mad_snoop_private *mad_snoop_priv; + int qpn; + int err; + + /* Validate parameters */ + if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) || + (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) { + ret = ERR_PTR(-EINVAL); + goto error1; + } + qpn = get_spl_qp_index(qp_type); + if (qpn == -1) { + ret = ERR_PTR(-EINVAL); + goto error1; + } + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + ret = ERR_PTR(-ENODEV); + goto error1; + } + /* Allocate structures */ + mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL); + if (!mad_snoop_priv) { + ret = ERR_PTR(-ENOMEM); + goto error1; + } + + /* Now, fill in the various structures */ + mad_snoop_priv->qp_info = &port_priv->qp_info[qpn]; + mad_snoop_priv->agent.device = device; + mad_snoop_priv->agent.recv_handler = recv_handler; + mad_snoop_priv->agent.snoop_handler = snoop_handler; + mad_snoop_priv->agent.context = context; + mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp; + mad_snoop_priv->agent.port_num = port_num; + mad_snoop_priv->mad_snoop_flags = mad_snoop_flags; + init_completion(&mad_snoop_priv->comp); + + err = ib_mad_agent_security_setup(&mad_snoop_priv->agent, qp_type); + if (err) { + ret = ERR_PTR(err); + goto error2; + } + + mad_snoop_priv->snoop_index = register_snoop_agent( + &port_priv->qp_info[qpn], + mad_snoop_priv); + if (mad_snoop_priv->snoop_index < 0) { + ret = ERR_PTR(mad_snoop_priv->snoop_index); + goto error3; + } + + atomic_set(&mad_snoop_priv->refcount, 1); + return &mad_snoop_priv->agent; +error3: + ib_mad_agent_security_cleanup(&mad_snoop_priv->agent); +error2: + kfree(mad_snoop_priv); +error1: + return ret; +} +EXPORT_SYMBOL(ib_register_mad_snoop); + +static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv) +{ + if (atomic_dec_and_test(&mad_agent_priv->refcount)) + complete(&mad_agent_priv->comp); +} + +static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv) +{ + if (atomic_dec_and_test(&mad_snoop_priv->refcount)) + complete(&mad_snoop_priv->comp); +} + +static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_port_private *port_priv; + unsigned long flags; + + /* Note that we could still be handling received MADs */ + + /* + * Canceling all sends results in dropping received response + * MADs, preventing us from queuing additional work + */ + cancel_mads(mad_agent_priv); + port_priv = mad_agent_priv->qp_info->port_priv; + cancel_delayed_work(&mad_agent_priv->timed_work); + + spin_lock_irqsave(&port_priv->reg_lock, flags); + remove_mad_reg_req(mad_agent_priv); + list_del(&mad_agent_priv->agent_list); + spin_unlock_irqrestore(&port_priv->reg_lock, flags); + + flush_workqueue(port_priv->wq); + ib_cancel_rmpp_recvs(mad_agent_priv); + + 
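/*
 * The two calls that follow drop the reference taken when the agent was
 * registered and then wait until the receive path, RMPP handling and any
 * queued local completions have dropped theirs, so the structure can be
 * freed safely.
 *
 * For context, a minimal illustrative sketch (not part of this file) of a
 * kernel client using the registration API above could look like this;
 * "device", "port", "my_send_handler" and "my_recv_handler" are
 * hypothetical placeholders and error handling is trimmed:
 *
 *	struct ib_mad_reg_req req = {};
 *	struct ib_mad_agent *agent;
 *
 *	req.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
 *	req.mgmt_class_version = 1;
 *	set_bit(IB_MGMT_METHOD_GET, req.method_mask);
 *
 *	agent = ib_register_mad_agent(device, port, IB_QPT_GSI, &req, 0,
 *				      my_send_handler, my_recv_handler,
 *				      NULL, 0);
 *	if (IS_ERR(agent))
 *		return PTR_ERR(agent);
 *	...
 *	ib_unregister_mad_agent(agent);
 */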
deref_mad_agent(mad_agent_priv); + wait_for_completion(&mad_agent_priv->comp); + + ib_mad_agent_security_cleanup(&mad_agent_priv->agent); + + kfree(mad_agent_priv->reg_req); + kfree(mad_agent_priv); +} + +static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) +{ + struct ib_mad_qp_info *qp_info; + unsigned long flags; + + qp_info = mad_snoop_priv->qp_info; + spin_lock_irqsave(&qp_info->snoop_lock, flags); + qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL; + atomic_dec(&qp_info->snoop_count); + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); + + deref_snoop_agent(mad_snoop_priv); + wait_for_completion(&mad_snoop_priv->comp); + + ib_mad_agent_security_cleanup(&mad_snoop_priv->agent); + + kfree(mad_snoop_priv); +} + +/* + * ib_unregister_mad_agent - Unregisters a client from using MAD services + */ +void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_snoop_private *mad_snoop_priv; + + /* If the TID is zero, the agent can only snoop. */ + if (mad_agent->hi_tid) { + mad_agent_priv = container_of(mad_agent, + struct ib_mad_agent_private, + agent); + unregister_mad_agent(mad_agent_priv); + } else { + mad_snoop_priv = container_of(mad_agent, + struct ib_mad_snoop_private, + agent); + unregister_mad_snoop(mad_snoop_priv); + } +} +EXPORT_SYMBOL(ib_unregister_mad_agent); + +static void dequeue_mad(struct ib_mad_list_head *mad_list) +{ + struct ib_mad_queue *mad_queue; + unsigned long flags; + + BUG_ON(!mad_list->mad_queue); + mad_queue = mad_list->mad_queue; + spin_lock_irqsave(&mad_queue->lock, flags); + list_del(&mad_list->list); + mad_queue->count--; + spin_unlock_irqrestore(&mad_queue->lock, flags); +} + +static void snoop_send(struct ib_mad_qp_info *qp_info, + struct ib_mad_send_buf *send_buf, + struct ib_mad_send_wc *mad_send_wc, + int mad_snoop_flags) +{ + struct ib_mad_snoop_private *mad_snoop_priv; + unsigned long flags; + int i; + + spin_lock_irqsave(&qp_info->snoop_lock, flags); + for (i = 0; i < qp_info->snoop_table_size; i++) { + mad_snoop_priv = qp_info->snoop_table[i]; + if (!mad_snoop_priv || + !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) + continue; + + atomic_inc(&mad_snoop_priv->refcount); + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); + mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent, + send_buf, mad_send_wc); + deref_snoop_agent(mad_snoop_priv); + spin_lock_irqsave(&qp_info->snoop_lock, flags); + } + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); +} + +static void snoop_recv(struct ib_mad_qp_info *qp_info, + struct ib_mad_recv_wc *mad_recv_wc, + int mad_snoop_flags) +{ + struct ib_mad_snoop_private *mad_snoop_priv; + unsigned long flags; + int i; + + spin_lock_irqsave(&qp_info->snoop_lock, flags); + for (i = 0; i < qp_info->snoop_table_size; i++) { + mad_snoop_priv = qp_info->snoop_table[i]; + if (!mad_snoop_priv || + !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) + continue; + + atomic_inc(&mad_snoop_priv->refcount); + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); + mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL, + mad_recv_wc); + deref_snoop_agent(mad_snoop_priv); + spin_lock_irqsave(&qp_info->snoop_lock, flags); + } + spin_unlock_irqrestore(&qp_info->snoop_lock, flags); +} + +static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, + u16 pkey_index, u8 port_num, struct ib_wc *wc) +{ + memset(wc, 0, sizeof *wc); + wc->wr_cqe = cqe; + wc->status = IB_WC_SUCCESS; + wc->opcode = IB_WC_RECV; 
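/*
 * The remaining fields below complete the synthetic completion so that a
 * locally handled directed-route SMP looks like a MAD that really arrived
 * on QP0 from the peer identified by slid/port_num.  The byte_len value
 * assumes a fixed-size IB MAD plus GRH; the OPA path in
 * handle_outgoing_dr_smp() overrides it after this call.
 */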
+ wc->pkey_index = pkey_index; + wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh); + wc->src_qp = IB_QP0; + wc->qp = qp; + wc->slid = slid; + wc->sl = 0; + wc->dlid_path_bits = 0; + wc->port_num = port_num; +} + +static size_t mad_priv_size(const struct ib_mad_private *mp) +{ + return sizeof(struct ib_mad_private) + mp->mad_size; +} + +static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags) +{ + size_t size = sizeof(struct ib_mad_private) + mad_size; + struct ib_mad_private *ret = kzalloc(size, flags); + + if (ret) + ret->mad_size = mad_size; + + return ret; +} + +static size_t port_mad_size(const struct ib_mad_port_private *port_priv) +{ + return rdma_max_mad_size(port_priv->device, port_priv->port_num); +} + +static size_t mad_priv_dma_size(const struct ib_mad_private *mp) +{ + return sizeof(struct ib_grh) + mp->mad_size; +} + +/* + * Return 0 if SMP is to be sent + * Return 1 if SMP was consumed locally (whether or not solicited) + * Return < 0 if error + */ +static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_wr_private *mad_send_wr) +{ + int ret = 0; + struct ib_smp *smp = mad_send_wr->send_buf.mad; + struct opa_smp *opa_smp = (struct opa_smp *)smp; + unsigned long flags; + struct ib_mad_local_private *local; + struct ib_mad_private *mad_priv; + struct ib_mad_port_private *port_priv; + struct ib_mad_agent_private *recv_mad_agent = NULL; + struct ib_device *device = mad_agent_priv->agent.device; + u8 port_num; + struct ib_wc mad_wc; + struct ib_ud_wr *send_wr = &mad_send_wr->send_wr; + size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv); + u16 out_mad_pkey_index = 0; + u16 drslid; + bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, + mad_agent_priv->qp_info->port_priv->port_num); + + if (rdma_cap_ib_switch(device) && + smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + port_num = send_wr->port_num; + else + port_num = mad_agent_priv->agent.port_num; + + /* + * Directed route handling starts if the initial LID routed part of + * a request or the ending LID routed part of a response is empty. + * If we are at the start of the LID routed part, don't update the + * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec. + */ + if (opa && smp->class_version == OPA_SM_CLASS_VERSION) { + u32 opa_drslid; + + if ((opa_get_smp_direction(opa_smp) + ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) == + OPA_LID_PERMISSIVE && + opa_smi_handle_dr_smp_send(opa_smp, + rdma_cap_ib_switch(device), + port_num) == IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "OPA Invalid directed route\n"); + goto out; + } + opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid); + if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) && + opa_drslid & 0xffff0000) { + ret = -EINVAL; + dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n", + opa_drslid); + goto out; + } + drslid = (u16)(opa_drslid & 0x0000ffff); + + /* Check to post send on QP or process locally */ + if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD && + opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD) + goto out; + } else { + if ((ib_get_smp_direction(smp) ? 
smp->dr_dlid : smp->dr_slid) == + IB_LID_PERMISSIVE && + smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) == + IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "Invalid directed route\n"); + goto out; + } + drslid = be16_to_cpu(smp->dr_slid); + + /* Check to post send on QP or process locally */ + if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD && + smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD) + goto out; + } + + local = kmalloc(sizeof *local, GFP_ATOMIC); + if (!local) { + ret = -ENOMEM; + goto out; + } + local->mad_priv = NULL; + local->recv_mad_agent = NULL; + mad_priv = alloc_mad_private(mad_size, GFP_ATOMIC); + if (!mad_priv) { + ret = -ENOMEM; + kfree(local); + goto out; + } + + build_smp_wc(mad_agent_priv->agent.qp, + send_wr->wr.wr_cqe, drslid, + send_wr->pkey_index, + send_wr->port_num, &mad_wc); + + if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) { + mad_wc.byte_len = mad_send_wr->send_buf.hdr_len + + mad_send_wr->send_buf.data_len + + sizeof(struct ib_grh); + } + + /* No GRH for DR SMP */ + ret = device->process_mad(device, 0, port_num, &mad_wc, NULL, + (const struct ib_mad_hdr *)smp, mad_size, + (struct ib_mad_hdr *)mad_priv->mad, + &mad_size, &out_mad_pkey_index); + switch (ret) + { + case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: + if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) && + mad_agent_priv->agent.recv_handler) { + local->mad_priv = mad_priv; + local->recv_mad_agent = mad_agent_priv; + /* + * Reference MAD agent until receive + * side of local completion handled + */ + atomic_inc(&mad_agent_priv->refcount); + } else + kfree(mad_priv); + break; + case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED: + kfree(mad_priv); + break; + case IB_MAD_RESULT_SUCCESS: + /* Treat like an incoming receive MAD */ + port_priv = ib_get_mad_port(mad_agent_priv->agent.device, + mad_agent_priv->agent.port_num); + if (port_priv) { + memcpy(mad_priv->mad, smp, mad_priv->mad_size); + recv_mad_agent = find_mad_agent(port_priv, + (const struct ib_mad_hdr *)mad_priv->mad); + } + if (!port_priv || !recv_mad_agent) { + /* + * No receiving agent so drop packet and + * generate send completion. + */ + kfree(mad_priv); + break; + } + local->mad_priv = mad_priv; + local->recv_mad_agent = recv_mad_agent; + break; + default: + kfree(mad_priv); + kfree(local); + ret = -EINVAL; + goto out; + } + + local->mad_send_wr = mad_send_wr; + if (opa) { + local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index; + local->return_wc_byte_len = mad_size; + } + /* Reference MAD agent until send side of local completion handled */ + atomic_inc(&mad_agent_priv->refcount); + /* Queue local completion to local list */ + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_add_tail(&local->completion_list, &mad_agent_priv->local_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + queue_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->local_work); + ret = 1; +out: + return ret; +} + +static int get_pad_size(int hdr_len, int data_len, size_t mad_size) +{ + int seg_size, pad; + + seg_size = mad_size - hdr_len; + if (data_len && seg_size) { + pad = seg_size - data_len % seg_size; + return pad == seg_size ? 
0 : pad; + } else + return seg_size; +} + +static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_segment *s, *t; + + list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) { + list_del(&s->list); + kfree(s); + } +} + +static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, + size_t mad_size, gfp_t gfp_mask) +{ + struct ib_mad_send_buf *send_buf = &send_wr->send_buf; + struct ib_rmpp_mad *rmpp_mad = send_buf->mad; + struct ib_rmpp_segment *seg = NULL; + int left, seg_size, pad; + + send_buf->seg_size = mad_size - send_buf->hdr_len; + send_buf->seg_rmpp_size = mad_size - IB_MGMT_RMPP_HDR; + seg_size = send_buf->seg_size; + pad = send_wr->pad; + + /* Allocate data segments. */ + for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { + seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); + if (!seg) { + free_send_rmpp_list(send_wr); + return -ENOMEM; + } + seg->num = ++send_buf->seg_count; + list_add_tail(&seg->list, &send_wr->rmpp_list); + } + + /* Zero any padding */ + if (pad) + memset(seg->data + seg_size - pad, 0, pad); + + rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv-> + agent.rmpp_version; + rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + + send_wr->cur_seg = container_of(send_wr->rmpp_list.next, + struct ib_rmpp_segment, list); + send_wr->last_ack_seg = send_wr->cur_seg; + return 0; +} + +int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent) +{ + return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP); +} +EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent); + +struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, + u32 remote_qpn, u16 pkey_index, + int rmpp_active, + int hdr_len, int data_len, + gfp_t gfp_mask, + u8 base_version) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + int pad, message_size, ret, size; + void *buf; + size_t mad_size; + bool opa; + + mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, + agent); + + opa = rdma_cap_opa_mad(mad_agent->device, mad_agent->port_num); + + if (opa && base_version == OPA_MGMT_BASE_VERSION) + mad_size = sizeof(struct opa_mad); + else + mad_size = sizeof(struct ib_mad); + + pad = get_pad_size(hdr_len, data_len, mad_size); + message_size = hdr_len + data_len + pad; + + if (ib_mad_kernel_rmpp_agent(mad_agent)) { + if (!rmpp_active && message_size > mad_size) + return ERR_PTR(-EINVAL); + } else + if (rmpp_active || message_size > mad_size) + return ERR_PTR(-EINVAL); + + size = rmpp_active ? 
hdr_len : mad_size; + buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); + if (!buf) + return ERR_PTR(-ENOMEM); + + mad_send_wr = buf + size; + INIT_LIST_HEAD(&mad_send_wr->rmpp_list); + mad_send_wr->send_buf.mad = buf; + mad_send_wr->send_buf.hdr_len = hdr_len; + mad_send_wr->send_buf.data_len = data_len; + mad_send_wr->pad = pad; + + mad_send_wr->mad_agent_priv = mad_agent_priv; + mad_send_wr->sg_list[0].length = hdr_len; + mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey; + + /* OPA MADs don't have to be the full 2048 bytes */ + if (opa && base_version == OPA_MGMT_BASE_VERSION && + data_len < mad_size - hdr_len) + mad_send_wr->sg_list[1].length = data_len; + else + mad_send_wr->sg_list[1].length = mad_size - hdr_len; + + mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey; + + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; + mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list; + mad_send_wr->send_wr.wr.num_sge = 2; + mad_send_wr->send_wr.wr.opcode = IB_WR_SEND; + mad_send_wr->send_wr.wr.send_flags = IB_SEND_SIGNALED; + mad_send_wr->send_wr.remote_qpn = remote_qpn; + mad_send_wr->send_wr.remote_qkey = IB_QP_SET_QKEY; + mad_send_wr->send_wr.pkey_index = pkey_index; + + if (rmpp_active) { + ret = alloc_send_rmpp_list(mad_send_wr, mad_size, gfp_mask); + if (ret) { + kfree(buf); + return ERR_PTR(ret); + } + } + + mad_send_wr->send_buf.mad_agent = mad_agent; + atomic_inc(&mad_agent_priv->refcount); + return &mad_send_wr->send_buf; +} +EXPORT_SYMBOL(ib_create_send_mad); + +int ib_get_mad_data_offset(u8 mgmt_class) +{ + if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM) + return IB_MGMT_SA_HDR; + else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || + (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || + (mgmt_class == IB_MGMT_CLASS_BIS)) + return IB_MGMT_DEVICE_HDR; + else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && + (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)) + return IB_MGMT_VENDOR_HDR; + else + return IB_MGMT_MAD_HDR; +} +EXPORT_SYMBOL(ib_get_mad_data_offset); + +int ib_is_mad_class_rmpp(u8 mgmt_class) +{ + if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) || + (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || + (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || + (mgmt_class == IB_MGMT_CLASS_BIS) || + ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && + (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))) + return 1; + return 0; +} +EXPORT_SYMBOL(ib_is_mad_class_rmpp); + +void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct list_head *list; + + mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, + send_buf); + list = &mad_send_wr->cur_seg->list; + + if (mad_send_wr->cur_seg->num < seg_num) { + list_for_each_entry(mad_send_wr->cur_seg, list, list) + if (mad_send_wr->cur_seg->num == seg_num) + break; + } else if (mad_send_wr->cur_seg->num > seg_num) { + list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list) + if (mad_send_wr->cur_seg->num == seg_num) + break; + } + return mad_send_wr->cur_seg->data; +} +EXPORT_SYMBOL(ib_get_rmpp_segment); + +static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr) +{ + if (mad_send_wr->send_buf.seg_count) + return ib_get_rmpp_segment(&mad_send_wr->send_buf, + mad_send_wr->seg_num); + else + return mad_send_wr->send_buf.mad + + mad_send_wr->send_buf.hdr_len; +} + +void ib_free_send_mad(struct ib_mad_send_buf *send_buf) +{ + struct ib_mad_agent_private 
*mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + + mad_agent_priv = container_of(send_buf->mad_agent, + struct ib_mad_agent_private, agent); + mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, + send_buf); + + free_send_rmpp_list(mad_send_wr); + kfree(send_buf->mad); + deref_mad_agent(mad_agent_priv); +} +EXPORT_SYMBOL(ib_free_send_mad); + +int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_qp_info *qp_info; + struct list_head *list; + struct ib_send_wr *bad_send_wr; + struct ib_mad_agent *mad_agent; + struct ib_sge *sge; + unsigned long flags; + int ret; + + /* Set WR ID to find mad_send_wr upon completion */ + qp_info = mad_send_wr->mad_agent_priv->qp_info; + mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; + + mad_agent = mad_send_wr->send_buf.mad_agent; + sge = mad_send_wr->sg_list; + sge[0].addr = ib_dma_map_single(mad_agent->device, + mad_send_wr->send_buf.mad, + sge[0].length, + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) + return -ENOMEM; + + mad_send_wr->header_mapping = sge[0].addr; + + sge[1].addr = ib_dma_map_single(mad_agent->device, + ib_get_payload(mad_send_wr), + sge[1].length, + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + return -ENOMEM; + } + mad_send_wr->payload_mapping = sge[1].addr; + + spin_lock_irqsave(&qp_info->send_queue.lock, flags); + if (qp_info->send_queue.count < qp_info->send_queue.max_active) { + ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr, + &bad_send_wr); + list = &qp_info->send_queue.list; + } else { + ret = 0; + list = &qp_info->overflow_list; + } + + if (!ret) { + qp_info->send_queue.count++; + list_add_tail(&mad_send_wr->mad_list.list, list); + } + spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); + if (ret) { + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->payload_mapping, + sge[1].length, DMA_TO_DEVICE); + } + return ret; +} + +/* + * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated + * with the registered client + */ +int ib_post_send_mad(struct ib_mad_send_buf *send_buf, + struct ib_mad_send_buf **bad_send_buf) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_buf *next_send_buf; + struct ib_mad_send_wr_private *mad_send_wr; + unsigned long flags; + int ret = -EINVAL; + + /* Walk list of send WRs and post each on send list */ + for (; send_buf; send_buf = next_send_buf) { + mad_send_wr = container_of(send_buf, + struct ib_mad_send_wr_private, + send_buf); + mad_agent_priv = mad_send_wr->mad_agent_priv; + + ret = ib_mad_enforce_security(mad_agent_priv, + mad_send_wr->send_wr.pkey_index); + if (ret) + goto error; + + if (!send_buf->mad_agent->send_handler || + (send_buf->timeout_ms && + !send_buf->mad_agent->recv_handler)) { + ret = -EINVAL; + goto error; + } + + if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) { + if (mad_agent_priv->agent.rmpp_version) { + ret = -EINVAL; + goto error; + } + } + + /* + * Save pointer to next work request to post in case the + * current one completes, and the user modifies the work + * request associated with the completion + */ + 
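/*
 * Illustrative note (names are placeholders, not from this file): callers
 * may chain several MAD buffers through send_buf->next and post the whole
 * list with a single call, e.g.:
 *
 *	first->next = second;
 *	second->next = NULL;
 *	ret = ib_post_send_mad(first, &bad_buf);
 *
 * which is why the ->next pointer is sampled here, before the current
 * buffer is handed to the send path and its completion can run.
 */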
next_send_buf = send_buf->next; + mad_send_wr->send_wr.ah = send_buf->ah; + + if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + ret = handle_outgoing_dr_smp(mad_agent_priv, + mad_send_wr); + if (ret < 0) /* error */ + goto error; + else if (ret == 1) /* locally consumed */ + continue; + } + + mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid; + /* Timeout will be updated after send completes */ + mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms); + mad_send_wr->max_retries = send_buf->retries; + mad_send_wr->retries_left = send_buf->retries; + send_buf->retries = 0; + /* Reference for work request to QP + response */ + mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); + mad_send_wr->status = IB_WC_SUCCESS; + + /* Reference MAD agent until send completes */ + atomic_inc(&mad_agent_priv->refcount); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { + ret = ib_send_rmpp_mad(mad_send_wr); + if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) + ret = ib_send_mad(mad_send_wr); + } else + ret = ib_send_mad(mad_send_wr); + if (ret < 0) { + /* Fail send request */ + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_del(&mad_send_wr->agent_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + atomic_dec(&mad_agent_priv->refcount); + goto error; + } + } + return 0; +error: + if (bad_send_buf) + *bad_send_buf = send_buf; + return ret; +} +EXPORT_SYMBOL(ib_post_send_mad); + +/* + * ib_free_recv_mad - Returns data buffers used to receive + * a MAD to the access layer + */ +void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf; + struct ib_mad_private_header *mad_priv_hdr; + struct ib_mad_private *priv; + struct list_head free_list; + + INIT_LIST_HEAD(&free_list); + list_splice_init(&mad_recv_wc->rmpp_list, &free_list); + + list_for_each_entry_safe(mad_recv_buf, temp_recv_buf, + &free_list, list) { + mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc, + recv_buf); + mad_priv_hdr = container_of(mad_recv_wc, + struct ib_mad_private_header, + recv_wc); + priv = container_of(mad_priv_hdr, struct ib_mad_private, + header); + kfree(priv); + } +} +EXPORT_SYMBOL(ib_free_recv_mad); + +struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context) +{ + return ERR_PTR(-EINVAL); /* XXX: for now */ +} +EXPORT_SYMBOL(ib_redirect_mad_qp); + +int ib_process_mad_wc(struct ib_mad_agent *mad_agent, + struct ib_wc *wc) +{ + dev_err(&mad_agent->device->dev, + "ib_process_mad_wc() not implemented yet\n"); + return 0; +} +EXPORT_SYMBOL(ib_process_mad_wc); + +static int method_in_use(struct ib_mad_mgmt_method_table **method, + struct ib_mad_reg_req *mad_reg_req) +{ + int i; + + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { + if ((*method)->agent[i]) { + pr_err("Method %d already in use\n", i); + return -EINVAL; + } + } + return 0; +} + +static int allocate_method_table(struct ib_mad_mgmt_method_table **method) +{ + /* Allocate management method table */ + *method = kzalloc(sizeof **method, GFP_ATOMIC); + return (*method) ? 
0 : (-ENOMEM); +} + +/* + * Check to see if there are any methods still in use + */ +static int check_method_table(struct ib_mad_mgmt_method_table *method) +{ + int i; + + for (i = 0; i < IB_MGMT_MAX_METHODS; i++) + if (method->agent[i]) + return 1; + return 0; +} + +/* + * Check to see if there are any method tables for this class still in use + */ +static int check_class_table(struct ib_mad_mgmt_class_table *class) +{ + int i; + + for (i = 0; i < MAX_MGMT_CLASS; i++) + if (class->method_table[i]) + return 1; + return 0; +} + +static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class) +{ + int i; + + for (i = 0; i < MAX_MGMT_OUI; i++) + if (vendor_class->method_table[i]) + return 1; + return 0; +} + +static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class, + const char *oui) +{ + int i; + + for (i = 0; i < MAX_MGMT_OUI; i++) + /* Is there matching OUI for this vendor class ? */ + if (!memcmp(vendor_class->oui[i], oui, 3)) + return i; + + return -1; +} + +static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor) +{ + int i; + + for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++) + if (vendor->vendor_class[i]) + return 1; + + return 0; +} + +static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method, + struct ib_mad_agent_private *agent) +{ + int i; + + /* Remove any methods for this mad agent */ + for (i = 0; i < IB_MGMT_MAX_METHODS; i++) { + if (method->agent[i] == agent) { + method->agent[i] = NULL; + } + } +} + +static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv, + u8 mgmt_class) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_mgmt_class_table **class; + struct ib_mad_mgmt_method_table **method; + int i, ret; + + port_priv = agent_priv->qp_info->port_priv; + class = &port_priv->version[mad_reg_req->mgmt_class_version].class; + if (!*class) { + /* Allocate management class table for "new" class version */ + *class = kzalloc(sizeof **class, GFP_ATOMIC); + if (!*class) { + ret = -ENOMEM; + goto error1; + } + + /* Allocate method table for this management class */ + method = &(*class)->method_table[mgmt_class]; + if ((ret = allocate_method_table(method))) + goto error2; + } else { + method = &(*class)->method_table[mgmt_class]; + if (!*method) { + /* Allocate method table for this management class */ + if ((ret = allocate_method_table(method))) + goto error1; + } + } + + /* Now, make sure methods are not already in use */ + if (method_in_use(method, mad_reg_req)) + goto error3; + + /* Finally, add in methods being registered */ + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) + (*method)->agent[i] = agent_priv; + + return 0; + +error3: + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(*method, agent_priv); + /* Now, check to see if there are any methods in use */ + if (!check_method_table(*method)) { + /* If not, release management method table */ + kfree(*method); + *method = NULL; + } + ret = -EINVAL; + goto error1; +error2: + kfree(*class); + *class = NULL; +error1: + return ret; +} + +static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_mgmt_vendor_class_table **vendor_table; + struct ib_mad_mgmt_vendor_class_table *vendor = NULL; + struct ib_mad_mgmt_vendor_class *vendor_class = NULL; + struct ib_mad_mgmt_method_table **method; + int i, ret = -ENOMEM; + u8 vclass; + + /* "New" vendor (with OUI) class 
*/ + vclass = vendor_class_index(mad_reg_req->mgmt_class); + port_priv = agent_priv->qp_info->port_priv; + vendor_table = &port_priv->version[ + mad_reg_req->mgmt_class_version].vendor; + if (!*vendor_table) { + /* Allocate mgmt vendor class table for "new" class version */ + vendor = kzalloc(sizeof *vendor, GFP_ATOMIC); + if (!vendor) + goto error1; + + *vendor_table = vendor; + } + if (!(*vendor_table)->vendor_class[vclass]) { + /* Allocate table for this management vendor class */ + vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC); + if (!vendor_class) + goto error2; + + (*vendor_table)->vendor_class[vclass] = vendor_class; + } + for (i = 0; i < MAX_MGMT_OUI; i++) { + /* Is there matching OUI for this vendor class ? */ + if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i], + mad_reg_req->oui, 3)) { + method = &(*vendor_table)->vendor_class[ + vclass]->method_table[i]; + BUG_ON(!*method); + goto check_in_use; + } + } + for (i = 0; i < MAX_MGMT_OUI; i++) { + /* OUI slot available ? */ + if (!is_vendor_oui((*vendor_table)->vendor_class[ + vclass]->oui[i])) { + method = &(*vendor_table)->vendor_class[ + vclass]->method_table[i]; + BUG_ON(*method); + /* Allocate method table for this OUI */ + if ((ret = allocate_method_table(method))) + goto error3; + memcpy((*vendor_table)->vendor_class[vclass]->oui[i], + mad_reg_req->oui, 3); + goto check_in_use; + } + } + dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n"); + goto error3; + +check_in_use: + /* Now, make sure methods are not already in use */ + if (method_in_use(method, mad_reg_req)) + goto error4; + + /* Finally, add in methods being registered */ + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) + (*method)->agent[i] = agent_priv; + + return 0; + +error4: + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(*method, agent_priv); + /* Now, check to see if there are any methods in use */ + if (!check_method_table(*method)) { + /* If not, release management method table */ + kfree(*method); + *method = NULL; + } + ret = -EINVAL; +error3: + if (vendor_class) { + (*vendor_table)->vendor_class[vclass] = NULL; + kfree(vendor_class); + } +error2: + if (vendor) { + *vendor_table = NULL; + kfree(vendor); + } +error1: + return ret; +} + +static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_method_table *method; + struct ib_mad_mgmt_vendor_class_table *vendor; + struct ib_mad_mgmt_vendor_class *vendor_class; + int index; + u8 mgmt_class; + + /* + * Was MAD registration request supplied + * with original registration ? + */ + if (!agent_priv->reg_req) { + goto out; + } + + port_priv = agent_priv->qp_info->port_priv; + mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class); + class = port_priv->version[ + agent_priv->reg_req->mgmt_class_version].class; + if (!class) + goto vendor_check; + + method = class->method_table[mgmt_class]; + if (method) { + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(method, agent_priv); + /* Now, check to see if there are any methods still in use */ + if (!check_method_table(method)) { + /* If not, release management method table */ + kfree(method); + class->method_table[mgmt_class] = NULL; + /* Any management classes left ? 
*/ + if (!check_class_table(class)) { + /* If not, release management class table */ + kfree(class); + port_priv->version[ + agent_priv->reg_req-> + mgmt_class_version].class = NULL; + } + } + } + +vendor_check: + if (!is_vendor_class(mgmt_class)) + goto out; + + /* normalize mgmt_class to vendor range 2 */ + mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class); + vendor = port_priv->version[ + agent_priv->reg_req->mgmt_class_version].vendor; + + if (!vendor) + goto out; + + vendor_class = vendor->vendor_class[mgmt_class]; + if (vendor_class) { + index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui); + if (index < 0) + goto out; + method = vendor_class->method_table[index]; + if (method) { + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(method, agent_priv); + /* + * Now, check to see if there are + * any methods still in use + */ + if (!check_method_table(method)) { + /* If not, release management method table */ + kfree(method); + vendor_class->method_table[index] = NULL; + memset(vendor_class->oui[index], 0, 3); + /* Any OUIs left ? */ + if (!check_vendor_class(vendor_class)) { + /* If not, release vendor class table */ + kfree(vendor_class); + vendor->vendor_class[mgmt_class] = NULL; + /* Any other vendor classes left ? */ + if (!check_vendor_table(vendor)) { + kfree(vendor); + port_priv->version[ + agent_priv->reg_req-> + mgmt_class_version]. + vendor = NULL; + } + } + } + } + } + +out: + return; +} + +static struct ib_mad_agent_private * +find_mad_agent(struct ib_mad_port_private *port_priv, + const struct ib_mad_hdr *mad_hdr) +{ + struct ib_mad_agent_private *mad_agent = NULL; + unsigned long flags; + + spin_lock_irqsave(&port_priv->reg_lock, flags); + if (ib_response_mad(mad_hdr)) { + u32 hi_tid; + struct ib_mad_agent_private *entry; + + /* + * Routing is based on high 32 bits of transaction ID + * of MAD. 
+ */ + hi_tid = be64_to_cpu(mad_hdr->tid) >> 32; + list_for_each_entry(entry, &port_priv->agent_list, agent_list) { + if (entry->agent.hi_tid == hi_tid) { + mad_agent = entry; + break; + } + } + } else { + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_method_table *method; + struct ib_mad_mgmt_vendor_class_table *vendor; + struct ib_mad_mgmt_vendor_class *vendor_class; + const struct ib_vendor_mad *vendor_mad; + int index; + + /* + * Routing is based on version, class, and method + * For "newer" vendor MADs, also based on OUI + */ + if (mad_hdr->class_version >= MAX_MGMT_VERSION) + goto out; + if (!is_vendor_class(mad_hdr->mgmt_class)) { + class = port_priv->version[ + mad_hdr->class_version].class; + if (!class) + goto out; + if (convert_mgmt_class(mad_hdr->mgmt_class) >= + ARRAY_SIZE(class->method_table)) + goto out; + method = class->method_table[convert_mgmt_class( + mad_hdr->mgmt_class)]; + if (method) + mad_agent = method->agent[mad_hdr->method & + ~IB_MGMT_METHOD_RESP]; + } else { + vendor = port_priv->version[ + mad_hdr->class_version].vendor; + if (!vendor) + goto out; + vendor_class = vendor->vendor_class[vendor_class_index( + mad_hdr->mgmt_class)]; + if (!vendor_class) + goto out; + /* Find matching OUI */ + vendor_mad = (const struct ib_vendor_mad *)mad_hdr; + index = find_vendor_oui(vendor_class, vendor_mad->oui); + if (index == -1) + goto out; + method = vendor_class->method_table[index]; + if (method) { + mad_agent = method->agent[mad_hdr->method & + ~IB_MGMT_METHOD_RESP]; + } + } + } + + if (mad_agent) { + if (mad_agent->agent.recv_handler) + atomic_inc(&mad_agent->refcount); + else { + dev_notice(&port_priv->device->dev, + "No receive handler for client %p on port %d\n", + &mad_agent->agent, port_priv->port_num); + mad_agent = NULL; + } + } +out: + spin_unlock_irqrestore(&port_priv->reg_lock, flags); + + return mad_agent; +} + +static int validate_mad(const struct ib_mad_hdr *mad_hdr, + const struct ib_mad_qp_info *qp_info, + bool opa) +{ + int valid = 0; + u32 qp_num = qp_info->qp->qp_num; + + /* Make sure MAD base version is understood */ + if (mad_hdr->base_version != IB_MGMT_BASE_VERSION && + (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) { + pr_err("MAD received with unsupported base version %d %s\n", + mad_hdr->base_version, opa ? 
"(opa)" : ""); + goto out; + } + + /* Filter SMI packets sent to other than QP0 */ + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + if (qp_num == 0) + valid = 1; + } else { + /* CM attributes other than ClassPortInfo only use Send method */ + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) && + (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) && + (mad_hdr->method != IB_MGMT_METHOD_SEND)) + goto out; + /* Filter GSI packets sent to QP0 */ + if (qp_num != 0) + valid = 1; + } + +out: + return valid; +} + +static int is_rmpp_data_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_hdr *mad_hdr) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_hdr; + return !mad_agent_priv->agent.rmpp_version || + !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) || + !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE) || + (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA); +} + +static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc) +{ + return ((struct ib_mad_hdr *)(wr->send_buf.mad))->mgmt_class == + rwc->recv_buf.mad->mad_hdr.mgmt_class; +} + +static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc ) +{ + struct rdma_ah_attr attr; + u8 send_resp, rcv_resp; + union ib_gid sgid; + struct ib_device *device = mad_agent_priv->agent.device; + u8 port_num = mad_agent_priv->agent.port_num; + u8 lmc; + bool has_grh; + + send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad); + rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr); + + if (send_resp == rcv_resp) + /* both requests, or both responses. GIDs different */ + return 0; + + if (rdma_query_ah(wr->send_buf.ah, &attr)) + /* Assume not equal, to avoid false positives. */ + return 0; + + has_grh = !!(rdma_ah_get_ah_flags(&attr) & IB_AH_GRH); + if (has_grh != !!(rwc->wc->wc_flags & IB_WC_GRH)) + /* one has GID, other does not. Assume different */ + return 0; + + if (!send_resp && rcv_resp) { + /* is request/response. */ + if (!has_grh) { + if (ib_get_cached_lmc(device, port_num, &lmc)) + return 0; + return (!lmc || !((rdma_ah_get_path_bits(&attr) ^ + rwc->wc->dlid_path_bits) & + ((1 << lmc) - 1))); + } else { + const struct ib_global_route *grh = + rdma_ah_read_grh(&attr); + + if (ib_get_cached_gid(device, port_num, + grh->sgid_index, &sgid, NULL)) + return 0; + return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, + 16); + } + } + + if (!has_grh) + return rdma_ah_get_dlid(&attr) == rwc->wc->slid; + else + return !memcmp(rdma_ah_read_grh(&attr)->dgid.raw, + rwc->recv_buf.grh->sgid.raw, + 16); +} + +static inline int is_direct(u8 class) +{ + return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE); +} + +struct ib_mad_send_wr_private* +ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_recv_wc *wc) +{ + struct ib_mad_send_wr_private *wr; + const struct ib_mad_hdr *mad_hdr; + + mad_hdr = &wc->recv_buf.mad->mad_hdr; + + list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) { + if ((wr->tid == mad_hdr->tid) && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(mad_hdr->mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) + return (wr->status == IB_WC_SUCCESS) ? 
wr : NULL; + } + + /* + * It's possible to receive the response before we've + * been notified that the send has completed + */ + list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) { + if (is_rmpp_data_mad(mad_agent_priv, wr->send_buf.mad) && + wr->tid == mad_hdr->tid && + wr->timeout && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(mad_hdr->mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) + /* Verify request has not been canceled */ + return (wr->status == IB_WC_SUCCESS) ? wr : NULL; + } + return NULL; +} + +void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) +{ + mad_send_wr->timeout = 0; + if (mad_send_wr->refcount == 1) + list_move_tail(&mad_send_wr->agent_list, + &mad_send_wr->mad_agent_priv->done_list); +} + +static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc mad_send_wc; + unsigned long flags; + int ret; + + INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); + ret = ib_mad_enforce_security(mad_agent_priv, + mad_recv_wc->wc->pkey_index); + if (ret) { + ib_free_recv_mad(mad_recv_wc); + deref_mad_agent(mad_agent_priv); + return; + } + + list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list); + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { + mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv, + mad_recv_wc); + if (!mad_recv_wc) { + deref_mad_agent(mad_agent_priv); + return; + } + } + + /* Complete corresponding request */ + if (ib_response_mad(&mad_recv_wc->recv_buf.mad->mad_hdr)) { + spin_lock_irqsave(&mad_agent_priv->lock, flags); + mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); + if (!mad_send_wr) { + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) + && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr) + & IB_MGMT_RMPP_FLAG_ACTIVE)) { + /* user rmpp is in effect + * and this is an active RMPP MAD + */ + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, NULL, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); + } else { + /* not user rmpp, revert to normal behavior and + * drop the mad */ + ib_free_recv_mad(mad_recv_wc); + deref_mad_agent(mad_agent_priv); + return; + } + } else { + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + /* Defined behavior is to complete response before request */ + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, + &mad_send_wr->send_buf, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); + + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + } + } else { + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, + mad_recv_wc); + deref_mad_agent(mad_agent_priv); + } + + return; +} + +static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv, + const struct ib_mad_qp_info *qp_info, + const struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response) +{ + enum smi_forward_action retsmi; + struct ib_smp *smp = (struct ib_smp *)recv->mad; + + if (smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, + 
port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + retsmi = smi_check_forward_dr_smp(smp); + if (retsmi == IB_SMI_LOCAL) + return IB_SMI_HANDLE; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if (smi_handle_dr_smp_send(smp, + rdma_cap_ib_switch(port_priv->device), + port_num) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + } else if (rdma_cap_ib_switch(port_priv->device)) { + /* forward case for switches */ + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + + agent_send_response((const struct ib_mad_hdr *)response->mad, + &response->grh, wc, + port_priv->device, + smi_get_fwd_port(smp), + qp_info->qp->qp_num, + response->mad_size, + false); + + return IB_SMI_DISCARD; + } + return IB_SMI_HANDLE; +} + +static bool generate_unmatched_resp(const struct ib_mad_private *recv, + struct ib_mad_private *response, + size_t *resp_len, bool opa) +{ + const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad; + struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad; + + if (recv_hdr->method == IB_MGMT_METHOD_GET || + recv_hdr->method == IB_MGMT_METHOD_SET) { + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + resp_hdr->method = IB_MGMT_METHOD_GET_RESP; + resp_hdr->status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); + if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + resp_hdr->status |= IB_SMP_DIRECTION; + + if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) { + if (recv_hdr->mgmt_class == + IB_MGMT_CLASS_SUBN_LID_ROUTED || + recv_hdr->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + *resp_len = opa_get_smp_header_size( + (struct opa_smp *)recv->mad); + else + *resp_len = sizeof(struct ib_mad_hdr); + } + + return true; + } else { + return false; + } +} + +static enum smi_action +handle_opa_smi(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info, + struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response) +{ + enum smi_forward_action retsmi; + struct opa_smp *smp = (struct opa_smp *)recv->mad; + + if (opa_smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, + port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + retsmi = opa_smi_check_forward_dr_smp(smp); + if (retsmi == IB_SMI_LOCAL) + return IB_SMI_HANDLE; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if (opa_smi_handle_dr_smp_send(smp, + rdma_cap_ib_switch(port_priv->device), + port_num) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + if (opa_smi_check_local_smp(smp, port_priv->device) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + } else if (rdma_cap_ib_switch(port_priv->device)) { + /* forward case for switches */ + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.opa_mad = + (struct opa_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + + agent_send_response((const struct ib_mad_hdr *)response->mad, + &response->grh, wc, + 
port_priv->device, + opa_smi_get_fwd_port(smp), + qp_info->qp->qp_num, + recv->header.wc.byte_len, + true); + + return IB_SMI_DISCARD; + } + + return IB_SMI_HANDLE; +} + +static enum smi_action +handle_smi(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info, + struct ib_wc *wc, + int port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response, + bool opa) +{ + struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad; + + if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION && + mad_hdr->class_version == OPA_SM_CLASS_VERSION) + return handle_opa_smi(port_priv, qp_info, wc, port_num, recv, + response); + + return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response); +} + +static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_qp_info *qp_info; + struct ib_mad_private_header *mad_priv_hdr; + struct ib_mad_private *recv, *response = NULL; + struct ib_mad_agent_private *mad_agent; + int port_num; + int ret = IB_MAD_RESULT_SUCCESS; + size_t mad_size; + u16 resp_mad_pkey_index = 0; + bool opa; + + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + /* + * Receive errors indicate that the QP has entered the error + * state - error handling/shutdown code will cleanup + */ + return; + } + + qp_info = mad_list->mad_queue->qp_info; + dequeue_mad(mad_list); + + opa = rdma_cap_opa_mad(qp_info->port_priv->device, + qp_info->port_priv->port_num); + + mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, + mad_list); + recv = container_of(mad_priv_hdr, struct ib_mad_private, header); + ib_dma_unmap_single(port_priv->device, + recv->header.mapping, + mad_priv_dma_size(recv), + DMA_FROM_DEVICE); + + /* Setup MAD receive work completion from "normal" work completion */ + recv->header.wc = *wc; + recv->header.recv_wc.wc = &recv->header.wc; + + if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) { + recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh); + recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); + } else { + recv->header.recv_wc.mad_len = sizeof(struct ib_mad); + recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); + } + + recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad; + recv->header.recv_wc.recv_buf.grh = &recv->grh; + + if (atomic_read(&qp_info->snoop_count)) + snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS); + + /* Validate MAD */ + if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) + goto out; + + mad_size = recv->mad_size; + response = alloc_mad_private(mad_size, GFP_KERNEL); + if (!response) + goto out; + + if (rdma_cap_ib_switch(port_priv->device)) + port_num = wc->port_num; + else + port_num = port_priv->port_num; + + if (((struct ib_mad_hdr *)recv->mad)->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (handle_smi(port_priv, qp_info, wc, port_num, recv, + response, opa) + == IB_SMI_DISCARD) + goto out; + } + + /* Give driver "right of first refusal" on incoming MAD */ + if (port_priv->device->process_mad) { + ret = port_priv->device->process_mad(port_priv->device, 0, + port_priv->port_num, + wc, &recv->grh, + (const struct ib_mad_hdr *)recv->mad, + recv->mad_size, + (struct ib_mad_hdr *)response->mad, + &mad_size, &resp_mad_pkey_index); + + if (opa) + wc->pkey_index = 
resp_mad_pkey_index; + + if (ret & IB_MAD_RESULT_SUCCESS) { + if (ret & IB_MAD_RESULT_CONSUMED) + goto out; + if (ret & IB_MAD_RESULT_REPLY) { + agent_send_response((const struct ib_mad_hdr *)response->mad, + &recv->grh, wc, + port_priv->device, + port_num, + qp_info->qp->qp_num, + mad_size, opa); + goto out; + } + } + } + + mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad); + if (mad_agent) { + ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); + /* + * recv is freed up in error cases in ib_mad_complete_recv + * or via recv_handler in ib_mad_complete_recv() + */ + recv = NULL; + } else if ((ret & IB_MAD_RESULT_SUCCESS) && + generate_unmatched_resp(recv, response, &mad_size, opa)) { + agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc, + port_priv->device, port_num, + qp_info->qp->qp_num, mad_size, opa); + } + +out: + /* Post another receive request for this QP */ + if (response) { + ib_mad_post_receive_mads(qp_info, response); + kfree(recv); + } else + ib_mad_post_receive_mads(qp_info, recv); +} + +static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *mad_send_wr; + unsigned long delay; + + if (list_empty(&mad_agent_priv->wait_list)) { + cancel_delayed_work(&mad_agent_priv->timed_work); + } else { + mad_send_wr = list_entry(mad_agent_priv->wait_list.next, + struct ib_mad_send_wr_private, + agent_list); + + if (time_after(mad_agent_priv->timeout, + mad_send_wr->timeout)) { + mad_agent_priv->timeout = mad_send_wr->timeout; + delay = mad_send_wr->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->timed_work, delay); + } + } +} + +static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *temp_mad_send_wr; + struct list_head *list_item; + unsigned long delay; + + mad_agent_priv = mad_send_wr->mad_agent_priv; + list_del(&mad_send_wr->agent_list); + + delay = mad_send_wr->timeout; + mad_send_wr->timeout += jiffies; + + if (delay) { + list_for_each_prev(list_item, &mad_agent_priv->wait_list) { + temp_mad_send_wr = list_entry(list_item, + struct ib_mad_send_wr_private, + agent_list); + if (time_after(mad_send_wr->timeout, + temp_mad_send_wr->timeout)) + break; + } + } + else + list_item = &mad_agent_priv->wait_list; + list_add(&mad_send_wr->agent_list, list_item); + + /* Reschedule a work item if we have a shorter timeout */ + if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) + mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->timed_work, delay); +} + +void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, + int timeout_ms) +{ + mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); + wait_for_response(mad_send_wr); +} + +/* + * Process a send work completion + */ +void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_mad_agent_private *mad_agent_priv; + unsigned long flags; + int ret; + + mad_agent_priv = mad_send_wr->mad_agent_priv; + spin_lock_irqsave(&mad_agent_priv->lock, flags); + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { + ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc); + if (ret == IB_RMPP_RESULT_CONSUMED) + goto done; + } else + ret = IB_RMPP_RESULT_UNHANDLED; + + if (mad_send_wc->status != IB_WC_SUCCESS && + mad_send_wr->status == IB_WC_SUCCESS) { + 
mad_send_wr->status = mad_send_wc->status; + mad_send_wr->refcount -= (mad_send_wr->timeout > 0); + } + + if (--mad_send_wr->refcount > 0) { + if (mad_send_wr->refcount == 1 && mad_send_wr->timeout && + mad_send_wr->status == IB_WC_SUCCESS) { + wait_for_response(mad_send_wr); + } + goto done; + } + + /* Remove send from MAD agent and notify client of completion */ + list_del(&mad_send_wr->agent_list); + adjust_timeout(mad_agent_priv); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + if (mad_send_wr->status != IB_WC_SUCCESS ) + mad_send_wc->status = mad_send_wr->status; + if (ret == IB_RMPP_RESULT_INTERNAL) + ib_rmpp_send_handler(mad_send_wc); + else + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + mad_send_wc); + + /* Release reference on agent taken when sending */ + deref_mad_agent(mad_agent_priv); + return; +done: + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; + struct ib_mad_qp_info *qp_info; + struct ib_mad_queue *send_queue; + struct ib_send_wr *bad_send_wr; + struct ib_mad_send_wc mad_send_wc; + unsigned long flags; + int ret; + + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + if (!ib_mad_send_error(port_priv, wc)) + return; + } + + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, + mad_list); + send_queue = mad_list->mad_queue; + qp_info = send_queue->qp_info; + +retry: + ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, + mad_send_wr->header_mapping, + mad_send_wr->sg_list[0].length, DMA_TO_DEVICE); + ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, + mad_send_wr->payload_mapping, + mad_send_wr->sg_list[1].length, DMA_TO_DEVICE); + queued_send_wr = NULL; + spin_lock_irqsave(&send_queue->lock, flags); + list_del(&mad_list->list); + + /* Move queued send to the send queue */ + if (send_queue->count-- > send_queue->max_active) { + mad_list = container_of(qp_info->overflow_list.next, + struct ib_mad_list_head, list); + queued_send_wr = container_of(mad_list, + struct ib_mad_send_wr_private, + mad_list); + list_move_tail(&mad_list->list, &send_queue->list); + } + spin_unlock_irqrestore(&send_queue->lock, flags); + + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_send_wc.status = wc->status; + mad_send_wc.vendor_err = wc->vendor_err; + if (atomic_read(&qp_info->snoop_count)) + snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc, + IB_MAD_SNOOP_SEND_COMPLETIONS); + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + + if (queued_send_wr) { + ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr, + &bad_send_wr); + if (ret) { + dev_err(&port_priv->device->dev, + "ib_post_send failed: %d\n", ret); + mad_send_wr = queued_send_wr; + wc->status = IB_WC_LOC_QP_OP_ERR; + goto retry; + } + } +} + +static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_list_head *mad_list; + unsigned long flags; + + spin_lock_irqsave(&qp_info->send_queue.lock, flags); + list_for_each_entry(mad_list, &qp_info->send_queue.list, list) { + mad_send_wr = container_of(mad_list, + struct ib_mad_send_wr_private, + mad_list); + mad_send_wr->retry = 1; + } + spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); +} + 
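/*
 * Editorial sketch (not part of the patch): a minimal, hypothetical consumer
 * of the send/completion/timeout machinery above. It assumes a MAD agent that
 * was already registered with ib_register_mad_agent() and a valid address
 * handle; the function name and error handling are illustrative only.
 * ib_post_send_mad() feeds the buffer into ib_send_mad()/ib_mad_send_done()
 * above, while timeout_ms and retries drive wait_for_response() and the
 * delayed-work retry path.
 */
static int example_send_gsi_mad(struct ib_mad_agent *agent,
				struct ib_ah *ah, u16 pkey_index)
{
	struct ib_mad_send_buf *msg;
	int ret;

	/* Remote QPN 1 (GSI), no RMPP, standard IB MAD header/data sizes */
	msg = ib_create_send_mad(agent, 1, pkey_index, 0,
				 IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
				 GFP_KERNEL, IB_MGMT_BASE_VERSION);
	if (IS_ERR(msg))
		return PTR_ERR(msg);

	/* Caller formats the MAD header and payload in msg->mad here. */

	msg->ah = ah;
	msg->timeout_ms = 2000;	/* response wait handled by wait_for_response() */
	msg->retries = 3;	/* consumed by the timeout/retry work above */

	ret = ib_post_send_mad(msg, NULL);
	if (ret)
		ib_free_send_mad(msg);
	return ret;
}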
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) +{ + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info; + struct ib_mad_send_wr_private *mad_send_wr; + int ret; + + /* + * Send errors will transition the QP to SQE - move + * QP to RTS and repost flushed work requests + */ + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, + mad_list); + if (wc->status == IB_WC_WR_FLUSH_ERR) { + if (mad_send_wr->retry) { + /* Repost send */ + struct ib_send_wr *bad_send_wr; + + mad_send_wr->retry = 0; + ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, + &bad_send_wr); + if (!ret) + return false; + } + } else { + struct ib_qp_attr *attr; + + /* Transition QP to RTS and fail offending send */ + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (attr) { + attr->qp_state = IB_QPS_RTS; + attr->cur_qp_state = IB_QPS_SQE; + ret = ib_modify_qp(qp_info->qp, attr, + IB_QP_STATE | IB_QP_CUR_STATE); + kfree(attr); + if (ret) + dev_err(&port_priv->device->dev, + "%s - ib_modify_qp to RTS: %d\n", + __func__, ret); + else + mark_sends_for_retry(qp_info); + } + } + + return true; +} + +static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) +{ + unsigned long flags; + struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr; + struct ib_mad_send_wc mad_send_wc; + struct list_head cancel_list; + + INIT_LIST_HEAD(&cancel_list); + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, + &mad_agent_priv->send_list, agent_list) { + if (mad_send_wr->status == IB_WC_SUCCESS) { + mad_send_wr->status = IB_WC_WR_FLUSH_ERR; + mad_send_wr->refcount -= (mad_send_wr->timeout > 0); + } + } + + /* Empty wait list to prevent receives from finding a request */ + list_splice_init(&mad_agent_priv->wait_list, &cancel_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + /* Report all cancelled requests */ + mad_send_wc.status = IB_WC_WR_FLUSH_ERR; + mad_send_wc.vendor_err = 0; + + list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, + &cancel_list, agent_list) { + mad_send_wc.send_buf = &mad_send_wr->send_buf; + list_del(&mad_send_wr->agent_list); + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + atomic_dec(&mad_agent_priv->refcount); + } +} + +static struct ib_mad_send_wr_private* +find_send_wr(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_buf *send_buf) +{ + struct ib_mad_send_wr_private *mad_send_wr; + + list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list, + agent_list) { + if (&mad_send_wr->send_buf == send_buf) + return mad_send_wr; + } + + list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list, + agent_list) { + if (is_rmpp_data_mad(mad_agent_priv, + mad_send_wr->send_buf.mad) && + &mad_send_wr->send_buf == send_buf) + return mad_send_wr; + } + return NULL; +} + +int ib_modify_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, u32 timeout_ms) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + unsigned long flags; + int active; + + mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, + agent); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + mad_send_wr = find_send_wr(mad_agent_priv, send_buf); + if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) { + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + return -EINVAL; + } 
+ + active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1); + if (!timeout_ms) { + mad_send_wr->status = IB_WC_WR_FLUSH_ERR; + mad_send_wr->refcount -= (mad_send_wr->timeout > 0); + } + + mad_send_wr->send_buf.timeout_ms = timeout_ms; + if (active) + mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); + else + ib_reset_mad_timeout(mad_send_wr, timeout_ms); + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + return 0; +} +EXPORT_SYMBOL(ib_modify_mad); + +void ib_cancel_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf) +{ + ib_modify_mad(mad_agent, send_buf, 0); +} +EXPORT_SYMBOL(ib_cancel_mad); + +static void local_completions(struct work_struct *work) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_local_private *local; + struct ib_mad_agent_private *recv_mad_agent; + unsigned long flags; + int free_mad; + struct ib_wc wc; + struct ib_mad_send_wc mad_send_wc; + bool opa; + + mad_agent_priv = + container_of(work, struct ib_mad_agent_private, local_work); + + opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, + mad_agent_priv->qp_info->port_priv->port_num); + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + while (!list_empty(&mad_agent_priv->local_list)) { + local = list_entry(mad_agent_priv->local_list.next, + struct ib_mad_local_private, + completion_list); + list_del(&local->completion_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + free_mad = 0; + if (local->mad_priv) { + u8 base_version; + recv_mad_agent = local->recv_mad_agent; + if (!recv_mad_agent) { + dev_err(&mad_agent_priv->agent.device->dev, + "No receive MAD agent for local completion\n"); + free_mad = 1; + goto local_send_completion; + } + + /* + * Defined behavior is to complete response + * before request + */ + build_smp_wc(recv_mad_agent->agent.qp, + local->mad_send_wr->send_wr.wr.wr_cqe, + be16_to_cpu(IB_LID_PERMISSIVE), + local->mad_send_wr->send_wr.pkey_index, + recv_mad_agent->agent.port_num, &wc); + + local->mad_priv->header.recv_wc.wc = &wc; + + base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version; + if (opa && base_version == OPA_MGMT_BASE_VERSION) { + local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len; + local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); + } else { + local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad); + local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); + } + + INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list); + list_add(&local->mad_priv->header.recv_wc.recv_buf.list, + &local->mad_priv->header.recv_wc.rmpp_list); + local->mad_priv->header.recv_wc.recv_buf.grh = NULL; + local->mad_priv->header.recv_wc.recv_buf.mad = + (struct ib_mad *)local->mad_priv->mad; + if (atomic_read(&recv_mad_agent->qp_info->snoop_count)) + snoop_recv(recv_mad_agent->qp_info, + &local->mad_priv->header.recv_wc, + IB_MAD_SNOOP_RECVS); + recv_mad_agent->agent.recv_handler( + &recv_mad_agent->agent, + &local->mad_send_wr->send_buf, + &local->mad_priv->header.recv_wc); + spin_lock_irqsave(&recv_mad_agent->lock, flags); + atomic_dec(&recv_mad_agent->refcount); + spin_unlock_irqrestore(&recv_mad_agent->lock, flags); + } + +local_send_completion: + /* Complete send */ + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &local->mad_send_wr->send_buf; + if (atomic_read(&mad_agent_priv->qp_info->snoop_count)) + snoop_send(mad_agent_priv->qp_info, + &local->mad_send_wr->send_buf, + &mad_send_wc, 
IB_MAD_SNOOP_SEND_COMPLETIONS); + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + atomic_dec(&mad_agent_priv->refcount); + if (free_mad) + kfree(local->mad_priv); + kfree(local); + } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + +static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) +{ + int ret; + + if (!mad_send_wr->retries_left) + return -ETIMEDOUT; + + mad_send_wr->retries_left--; + mad_send_wr->send_buf.retries++; + + mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); + + if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { + ret = ib_retry_rmpp(mad_send_wr); + switch (ret) { + case IB_RMPP_RESULT_UNHANDLED: + ret = ib_send_mad(mad_send_wr); + break; + case IB_RMPP_RESULT_CONSUMED: + ret = 0; + break; + default: + ret = -ECOMM; + break; + } + } else + ret = ib_send_mad(mad_send_wr); + + if (!ret) { + mad_send_wr->refcount++; + list_add_tail(&mad_send_wr->agent_list, + &mad_send_wr->mad_agent_priv->send_list); + } + return ret; +} + +static void timeout_sends(struct work_struct *work) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc mad_send_wc; + unsigned long flags, delay; + + mad_agent_priv = container_of(work, struct ib_mad_agent_private, + timed_work.work); + mad_send_wc.vendor_err = 0; + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + while (!list_empty(&mad_agent_priv->wait_list)) { + mad_send_wr = list_entry(mad_agent_priv->wait_list.next, + struct ib_mad_send_wr_private, + agent_list); + + if (time_after(mad_send_wr->timeout, jiffies)) { + delay = mad_send_wr->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + queue_delayed_work(mad_agent_priv->qp_info-> + port_priv->wq, + &mad_agent_priv->timed_work, delay); + break; + } + + list_del(&mad_send_wr->agent_list); + if (mad_send_wr->status == IB_WC_SUCCESS && + !retry_send(mad_send_wr)) + continue; + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + if (mad_send_wr->status == IB_WC_SUCCESS) + mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; + else + mad_send_wc.status = mad_send_wr->status; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + + atomic_dec(&mad_agent_priv->refcount); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + +/* + * Allocate receive MADs and post receive WRs for them + */ +static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, + struct ib_mad_private *mad) +{ + unsigned long flags; + int post, ret; + struct ib_mad_private *mad_priv; + struct ib_sge sg_list; + struct ib_recv_wr recv_wr, *bad_recv_wr; + struct ib_mad_queue *recv_queue = &qp_info->recv_queue; + + /* Initialize common scatter list fields */ + sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey; + + /* Initialize common receive WR fields */ + recv_wr.next = NULL; + recv_wr.sg_list = &sg_list; + recv_wr.num_sge = 1; + + do { + /* Allocate and map receive buffer */ + if (mad) { + mad_priv = mad; + mad = NULL; + } else { + mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv), + GFP_ATOMIC); + if (!mad_priv) { + ret = -ENOMEM; + break; + } + } + sg_list.length = mad_priv_dma_size(mad_priv); + sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, + &mad_priv->grh, + mad_priv_dma_size(mad_priv), + DMA_FROM_DEVICE); + if 
(unlikely(ib_dma_mapping_error(qp_info->port_priv->device, + sg_list.addr))) { + ret = -ENOMEM; + break; + } + mad_priv->header.mapping = sg_list.addr; + mad_priv->header.mad_list.mad_queue = recv_queue; + mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; + recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; + + /* Post receive WR */ + spin_lock_irqsave(&recv_queue->lock, flags); + post = (++recv_queue->count < recv_queue->max_active); + list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list); + spin_unlock_irqrestore(&recv_queue->lock, flags); + ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr); + if (ret) { + spin_lock_irqsave(&recv_queue->lock, flags); + list_del(&mad_priv->header.mad_list.list); + recv_queue->count--; + spin_unlock_irqrestore(&recv_queue->lock, flags); + ib_dma_unmap_single(qp_info->port_priv->device, + mad_priv->header.mapping, + mad_priv_dma_size(mad_priv), + DMA_FROM_DEVICE); + kfree(mad_priv); + dev_err(&qp_info->port_priv->device->dev, + "ib_post_recv failed: %d\n", ret); + break; + } + } while (post); + + return ret; +} + +/* + * Return all the posted receive MADs + */ +static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) +{ + struct ib_mad_private_header *mad_priv_hdr; + struct ib_mad_private *recv; + struct ib_mad_list_head *mad_list; + + if (!qp_info->qp) + return; + + while (!list_empty(&qp_info->recv_queue.list)) { + + mad_list = list_entry(qp_info->recv_queue.list.next, + struct ib_mad_list_head, list); + mad_priv_hdr = container_of(mad_list, + struct ib_mad_private_header, + mad_list); + recv = container_of(mad_priv_hdr, struct ib_mad_private, + header); + + /* Remove from posted receive MAD list */ + list_del(&mad_list->list); + + ib_dma_unmap_single(qp_info->port_priv->device, + recv->header.mapping, + mad_priv_dma_size(recv), + DMA_FROM_DEVICE); + kfree(recv); + } + + qp_info->recv_queue.count = 0; +} + +/* + * Start the port + */ +static int ib_mad_port_start(struct ib_mad_port_private *port_priv) +{ + int ret, i; + struct ib_qp_attr *attr; + struct ib_qp *qp; + u16 pkey_index; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = ib_find_pkey(port_priv->device, port_priv->port_num, + IB_DEFAULT_PKEY_FULL, &pkey_index); + if (ret) + pkey_index = 0; + + for (i = 0; i < IB_MAD_QPS_CORE; i++) { + qp = port_priv->qp_info[i].qp; + if (!qp) + continue; + + /* + * PKey index for QP1 is irrelevant but + * one is needed for the Reset to Init transition + */ + attr->qp_state = IB_QPS_INIT; + attr->pkey_index = pkey_index; + attr->qkey = (qp->qp_num == 0) ? 
0 : IB_QP1_QKEY; + ret = ib_modify_qp(qp, attr, IB_QP_STATE | + IB_QP_PKEY_INDEX | IB_QP_QKEY); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to INIT: %d\n", + i, ret); + goto out; + } + + attr->qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, attr, IB_QP_STATE); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTR: %d\n", + i, ret); + goto out; + } + + attr->qp_state = IB_QPS_RTS; + attr->sq_psn = IB_MAD_SEND_Q_PSN; + ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTS: %d\n", + i, ret); + goto out; + } + } + + ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); + if (ret) { + dev_err(&port_priv->device->dev, + "Failed to request completion notification: %d\n", + ret); + goto out; + } + + for (i = 0; i < IB_MAD_QPS_CORE; i++) { + if (!port_priv->qp_info[i].qp) + continue; + + ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't post receive WRs\n"); + goto out; + } + } +out: + kfree(attr); + return ret; +} + +static void qp_event_handler(struct ib_event *event, void *qp_context) +{ + struct ib_mad_qp_info *qp_info = qp_context; + + /* It's worse than that! He's dead, Jim! */ + dev_err(&qp_info->port_priv->device->dev, + "Fatal error (%d) on MAD QP (%d)\n", + event->event, qp_info->qp->qp_num); +} + +static void init_mad_queue(struct ib_mad_qp_info *qp_info, + struct ib_mad_queue *mad_queue) +{ + mad_queue->qp_info = qp_info; + mad_queue->count = 0; + spin_lock_init(&mad_queue->lock); + INIT_LIST_HEAD(&mad_queue->list); +} + +static void init_mad_qp(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info) +{ + qp_info->port_priv = port_priv; + init_mad_queue(qp_info, &qp_info->send_queue); + init_mad_queue(qp_info, &qp_info->recv_queue); + INIT_LIST_HEAD(&qp_info->overflow_list); + spin_lock_init(&qp_info->snoop_lock); + qp_info->snoop_table = NULL; + qp_info->snoop_table_size = 0; + atomic_set(&qp_info->snoop_count, 0); +} + +static int create_mad_qp(struct ib_mad_qp_info *qp_info, + enum ib_qp_type qp_type) +{ + struct ib_qp_init_attr qp_init_attr; + int ret; + + memset(&qp_init_attr, 0, sizeof qp_init_attr); + qp_init_attr.send_cq = qp_info->port_priv->cq; + qp_init_attr.recv_cq = qp_info->port_priv->cq; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.cap.max_send_wr = mad_sendq_size; + qp_init_attr.cap.max_recv_wr = mad_recvq_size; + qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG; + qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG; + qp_init_attr.qp_type = qp_type; + qp_init_attr.port_num = qp_info->port_priv->port_num; + qp_init_attr.qp_context = qp_info; + qp_init_attr.event_handler = qp_event_handler; + qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr); + if (IS_ERR(qp_info->qp)) { + dev_err(&qp_info->port_priv->device->dev, + "Couldn't create ib_mad QP%d\n", + get_spl_qp_index(qp_type)); + ret = PTR_ERR(qp_info->qp); + goto error; + } + /* Use minimum queue sizes unless the CQ is resized */ + qp_info->send_queue.max_active = mad_sendq_size; + qp_info->recv_queue.max_active = mad_recvq_size; + return 0; + +error: + return ret; +} + +static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) +{ + if (!qp_info->qp) + return; + + ib_destroy_qp(qp_info->qp); + kfree(qp_info->snoop_table); +} + +/* + * Open the port + * Create the QP, PD, MR, and CQ if needed + */ +static int ib_mad_port_open(struct 
ib_device *device, + int port_num) +{ + int ret, cq_size; + struct ib_mad_port_private *port_priv; + unsigned long flags; + char name[sizeof "ib_mad123"]; + int has_smi; + + if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) + return -EFAULT; + + if (WARN_ON(rdma_cap_opa_mad(device, port_num) && + rdma_max_mad_size(device, port_num) < OPA_MGMT_MAD_SIZE)) + return -EFAULT; + + /* Create new device info */ + port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); + if (!port_priv) + return -ENOMEM; + + port_priv->device = device; + port_priv->port_num = port_num; + spin_lock_init(&port_priv->reg_lock); + INIT_LIST_HEAD(&port_priv->agent_list); + init_mad_qp(port_priv, &port_priv->qp_info[0]); + init_mad_qp(port_priv, &port_priv->qp_info[1]); + + cq_size = mad_sendq_size + mad_recvq_size; + has_smi = rdma_cap_ib_smi(device, port_num); + if (has_smi) + cq_size *= 2; + + port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, + IB_POLL_WORKQUEUE); + if (IS_ERR(port_priv->cq)) { + dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); + ret = PTR_ERR(port_priv->cq); + goto error3; + } + + port_priv->pd = ib_alloc_pd(device, 0); + if (IS_ERR(port_priv->pd)) { + dev_err(&device->dev, "Couldn't create ib_mad PD\n"); + ret = PTR_ERR(port_priv->pd); + goto error4; + } + + if (has_smi) { + ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); + if (ret) + goto error6; + } + ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); + if (ret) + goto error7; + + snprintf(name, sizeof name, "ib_mad%d", port_num); + port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (!port_priv->wq) { + ret = -ENOMEM; + goto error8; + } + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + list_add_tail(&port_priv->port_list, &ib_mad_port_list); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + ret = ib_mad_port_start(port_priv); + if (ret) { + dev_err(&device->dev, "Couldn't start port\n"); + goto error9; + } + + return 0; + +error9: + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + list_del_init(&port_priv->port_list); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + destroy_workqueue(port_priv->wq); +error8: + destroy_mad_qp(&port_priv->qp_info[1]); +error7: + destroy_mad_qp(&port_priv->qp_info[0]); +error6: + ib_dealloc_pd(port_priv->pd); +error4: + ib_free_cq(port_priv->cq); + cleanup_recv_queue(&port_priv->qp_info[1]); + cleanup_recv_queue(&port_priv->qp_info[0]); +error3: + kfree(port_priv); + + return ret; +} + +/* + * Close the port + * If there are no classes using the port, free the port + * resources (CQ, MR, PD, QP) and remove the port's info structure + */ +static int ib_mad_port_close(struct ib_device *device, int port_num) +{ + struct ib_mad_port_private *port_priv; + unsigned long flags; + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + port_priv = __ib_get_mad_port(device, port_num); + if (port_priv == NULL) { + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + dev_err(&device->dev, "Port %d not found\n", port_num); + return -ENODEV; + } + list_del_init(&port_priv->port_list); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + destroy_workqueue(port_priv->wq); + destroy_mad_qp(&port_priv->qp_info[1]); + destroy_mad_qp(&port_priv->qp_info[0]); + ib_dealloc_pd(port_priv->pd); + ib_free_cq(port_priv->cq); + cleanup_recv_queue(&port_priv->qp_info[1]); + cleanup_recv_queue(&port_priv->qp_info[0]); + /* XXX: Handle deallocation of MAD registration tables */ + + kfree(port_priv); + + return 0; +} + +static void 
ib_mad_init_device(struct ib_device *device) +{ + int start, i; + + start = rdma_start_port(device); + + for (i = start; i <= rdma_end_port(device); i++) { + if (!rdma_cap_ib_mad(device, i)) + continue; + + if (ib_mad_port_open(device, i)) { + dev_err(&device->dev, "Couldn't open port %d\n", i); + goto error; + } + if (ib_agent_port_open(device, i)) { + dev_err(&device->dev, + "Couldn't open port %d for agents\n", i); + goto error_agent; + } + } + return; + +error_agent: + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %d\n", i); + +error: + while (--i >= start) { + if (!rdma_cap_ib_mad(device, i)) + continue; + + if (ib_agent_port_close(device, i)) + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %d\n", i); + } +} + +static void ib_mad_remove_device(struct ib_device *device, void *client_data) +{ + int i; + + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + if (!rdma_cap_ib_mad(device, i)) + continue; + + if (ib_agent_port_close(device, i)) + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %d\n", i); + } +} + +static struct ib_client mad_client = { + .name = "mad", + .add = ib_mad_init_device, + .remove = ib_mad_remove_device +}; + +int ib_mad_init(void) +{ + mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); + mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); + + mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); + mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); + + INIT_LIST_HEAD(&ib_mad_port_list); + + if (ib_register_client(&mad_client)) { + pr_err("Couldn't register ib_mad client\n"); + return -EINVAL; + } + + return 0; +} + +void ib_mad_cleanup(void) +{ + ib_unregister_client(&mad_client); +} diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h new file mode 100644 index 0000000..28669f6 --- /dev/null +++ b/drivers/infiniband/core/mad_priv.h @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __IB_MAD_PRIV_H__ +#define __IB_MAD_PRIV_H__ + +#include +#include +#include +#include +#include +#include + +#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */ + +/* QP and CQ parameters */ +#define IB_MAD_QP_SEND_SIZE 128 +#define IB_MAD_QP_RECV_SIZE 512 +#define IB_MAD_QP_MIN_SIZE 64 +#define IB_MAD_QP_MAX_SIZE 8192 +#define IB_MAD_SEND_REQ_MAX_SG 2 +#define IB_MAD_RECV_REQ_MAX_SG 1 + +#define IB_MAD_SEND_Q_PSN 0 + +/* Registration table sizes */ +#define MAX_MGMT_CLASS 80 +#define MAX_MGMT_VERSION 0x83 +#define MAX_MGMT_OUI 8 +#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \ + IB_MGMT_CLASS_VENDOR_RANGE2_START + 1) + +struct ib_mad_list_head { + struct list_head list; + struct ib_cqe cqe; + struct ib_mad_queue *mad_queue; +}; + +struct ib_mad_private_header { + struct ib_mad_list_head mad_list; + struct ib_mad_recv_wc recv_wc; + struct ib_wc wc; + u64 mapping; +} __attribute__ ((packed)); + +struct ib_mad_private { + struct ib_mad_private_header header; + size_t mad_size; + struct ib_grh grh; + u8 mad[0]; +} __attribute__ ((packed)); + +struct ib_rmpp_segment { + struct list_head list; + u32 num; + u8 data[0]; +}; + +struct ib_mad_agent_private { + struct list_head agent_list; + struct ib_mad_agent agent; + struct ib_mad_reg_req *reg_req; + struct ib_mad_qp_info *qp_info; + + spinlock_t lock; + struct list_head send_list; + struct list_head wait_list; + struct list_head done_list; + struct delayed_work timed_work; + unsigned long timeout; + struct list_head local_list; + struct work_struct local_work; + struct list_head rmpp_list; + + atomic_t refcount; + struct completion comp; +}; + +struct ib_mad_snoop_private { + struct ib_mad_agent agent; + struct ib_mad_qp_info *qp_info; + int snoop_index; + int mad_snoop_flags; + atomic_t refcount; + struct completion comp; +}; + +struct ib_mad_send_wr_private { + struct ib_mad_list_head mad_list; + struct list_head agent_list; + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_buf send_buf; + u64 header_mapping; + u64 payload_mapping; + struct ib_ud_wr send_wr; + struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG]; + __be64 tid; + unsigned long timeout; + int max_retries; + int retries_left; + int retry; + int refcount; + enum ib_wc_status status; + + /* RMPP control */ + struct list_head rmpp_list; + struct ib_rmpp_segment *last_ack_seg; + struct ib_rmpp_segment *cur_seg; + int last_ack; + int seg_num; + int newwin; + int pad; +}; + +struct ib_mad_local_private { + struct list_head completion_list; + struct ib_mad_private *mad_priv; + struct ib_mad_agent_private *recv_mad_agent; + struct ib_mad_send_wr_private *mad_send_wr; + size_t return_wc_byte_len; +}; + +struct ib_mad_mgmt_method_table { + struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS]; +}; + +struct ib_mad_mgmt_class_table { + struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS]; +}; + +struct ib_mad_mgmt_vendor_class { + u8 oui[MAX_MGMT_OUI][3]; + struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI]; +}; + +struct ib_mad_mgmt_vendor_class_table { + struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2]; +}; + +struct ib_mad_mgmt_version_table { + struct ib_mad_mgmt_class_table *class; + struct 
ib_mad_mgmt_vendor_class_table *vendor; +}; + +struct ib_mad_queue { + spinlock_t lock; + struct list_head list; + int count; + int max_active; + struct ib_mad_qp_info *qp_info; +}; + +struct ib_mad_qp_info { + struct ib_mad_port_private *port_priv; + struct ib_qp *qp; + struct ib_mad_queue send_queue; + struct ib_mad_queue recv_queue; + struct list_head overflow_list; + spinlock_t snoop_lock; + struct ib_mad_snoop_private **snoop_table; + int snoop_table_size; + atomic_t snoop_count; +}; + +struct ib_mad_port_private { + struct list_head port_list; + struct ib_device *device; + int port_num; + struct ib_cq *cq; + struct ib_pd *pd; + + spinlock_t reg_lock; + struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; + struct list_head agent_list; + struct workqueue_struct *wq; + struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; +}; + +int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr); + +struct ib_mad_send_wr_private * +ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_recv_wc *mad_recv_wc); + +void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc); + +void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); + +void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, + int timeout_ms); + +#endif /* __IB_MAD_PRIV_H__ */ diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c new file mode 100644 index 0000000..e5cf09c --- /dev/null +++ b/drivers/infiniband/core/mad_rmpp.c @@ -0,0 +1,968 @@ +/* + * Copyright (c) 2005 Intel Inc. All rights reserved. + * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "mad_priv.h" +#include "mad_rmpp.h" + +enum rmpp_state { + RMPP_STATE_ACTIVE, + RMPP_STATE_TIMEOUT, + RMPP_STATE_COMPLETE, + RMPP_STATE_CANCELING +}; + +struct mad_rmpp_recv { + struct ib_mad_agent_private *agent; + struct list_head list; + struct delayed_work timeout_work; + struct delayed_work cleanup_work; + struct completion comp; + enum rmpp_state state; + spinlock_t lock; + atomic_t refcount; + + struct ib_ah *ah; + struct ib_mad_recv_wc *rmpp_wc; + struct ib_mad_recv_buf *cur_seg_buf; + int last_ack; + int seg_num; + int newwin; + int repwin; + + __be64 tid; + u32 src_qp; + u32 slid; + u8 mgmt_class; + u8 class_version; + u8 method; + u8 base_version; +}; + +static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) +{ + if (atomic_dec_and_test(&rmpp_recv->refcount)) + complete(&rmpp_recv->comp); +} + +static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) +{ + deref_rmpp_recv(rmpp_recv); + wait_for_completion(&rmpp_recv->comp); + rdma_destroy_ah(rmpp_recv->ah); + kfree(rmpp_recv); +} + +void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent) +{ + struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv; + unsigned long flags; + + spin_lock_irqsave(&agent->lock, flags); + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + if (rmpp_recv->state != RMPP_STATE_COMPLETE) + ib_free_recv_mad(rmpp_recv->rmpp_wc); + rmpp_recv->state = RMPP_STATE_CANCELING; + } + spin_unlock_irqrestore(&agent->lock, flags); + + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + cancel_delayed_work(&rmpp_recv->timeout_work); + cancel_delayed_work(&rmpp_recv->cleanup_work); + } + + flush_workqueue(agent->qp_info->port_priv->wq); + + list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv, + &agent->rmpp_list, list) { + list_del(&rmpp_recv->list); + destroy_rmpp_recv(rmpp_recv); + } +} + +static void format_ack(struct ib_mad_send_buf *msg, + struct ib_rmpp_mad *data, + struct mad_rmpp_recv *rmpp_recv) +{ + struct ib_rmpp_mad *ack = msg->mad; + unsigned long flags; + + memcpy(ack, &data->mad_hdr, msg->hdr_len); + + ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP; + ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK; + ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + + spin_lock_irqsave(&rmpp_recv->lock, flags); + rmpp_recv->last_ack = rmpp_recv->seg_num; + ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num); + ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin); + spin_unlock_irqrestore(&rmpp_recv->lock, flags); +} + +static void ack_recv(struct mad_rmpp_recv *rmpp_recv, + struct ib_mad_recv_wc *recv_wc) +{ + struct ib_mad_send_buf *msg; + int ret, hdr_len; + + hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); + msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp, + recv_wc->wc->pkey_index, 1, hdr_len, + 0, GFP_KERNEL, + IB_MGMT_BASE_VERSION); + if (IS_ERR(msg)) + return; + + format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv); + msg->ah = rmpp_recv->ah; + ret = ib_post_send_mad(msg, NULL); + if (ret) + ib_free_send_mad(msg); +} + +static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, + struct ib_mad_recv_wc *recv_wc) +{ + struct ib_mad_send_buf *msg; + struct ib_ah *ah; + int hdr_len; + + ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc, + recv_wc->recv_buf.grh, agent->port_num); + if (IS_ERR(ah)) + return (void *) ah; + + hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); + msg = ib_create_send_mad(agent, 
recv_wc->wc->src_qp, + recv_wc->wc->pkey_index, 1, + hdr_len, 0, GFP_KERNEL, + IB_MGMT_BASE_VERSION); + if (IS_ERR(msg)) + rdma_destroy_ah(ah); + else { + msg->ah = ah; + msg->context[0] = ah; + } + + return msg; +} + +static void ack_ds_ack(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *recv_wc) +{ + struct ib_mad_send_buf *msg; + struct ib_rmpp_mad *rmpp_mad; + int ret; + + msg = alloc_response_msg(&agent->agent, recv_wc); + if (IS_ERR(msg)) + return; + + rmpp_mad = msg->mad; + memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); + + rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + rmpp_mad->rmpp_hdr.seg_num = 0; + rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1); + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + rdma_destroy_ah(msg->ah); + ib_free_send_mad(msg); + } +} + +void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc) +{ + if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah) + rdma_destroy_ah(mad_send_wc->send_buf->ah); + ib_free_send_mad(mad_send_wc->send_buf); +} + +static void nack_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *recv_wc, u8 rmpp_status) +{ + struct ib_mad_send_buf *msg; + struct ib_rmpp_mad *rmpp_mad; + int ret; + + msg = alloc_response_msg(&agent->agent, recv_wc); + if (IS_ERR(msg)) + return; + + rmpp_mad = msg->mad; + memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); + + rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; + rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION; + rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status; + rmpp_mad->rmpp_hdr.seg_num = 0; + rmpp_mad->rmpp_hdr.paylen_newwin = 0; + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + rdma_destroy_ah(msg->ah); + ib_free_send_mad(msg); + } +} + +static void recv_timeout_handler(struct work_struct *work) +{ + struct mad_rmpp_recv *rmpp_recv = + container_of(work, struct mad_rmpp_recv, timeout_work.work); + struct ib_mad_recv_wc *rmpp_wc; + unsigned long flags; + + spin_lock_irqsave(&rmpp_recv->agent->lock, flags); + if (rmpp_recv->state != RMPP_STATE_ACTIVE) { + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + return; + } + rmpp_recv->state = RMPP_STATE_TIMEOUT; + list_del(&rmpp_recv->list); + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + + rmpp_wc = rmpp_recv->rmpp_wc; + nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L); + destroy_rmpp_recv(rmpp_recv); + ib_free_recv_mad(rmpp_wc); +} + +static void recv_cleanup_handler(struct work_struct *work) +{ + struct mad_rmpp_recv *rmpp_recv = + container_of(work, struct mad_rmpp_recv, cleanup_work.work); + unsigned long flags; + + spin_lock_irqsave(&rmpp_recv->agent->lock, flags); + if (rmpp_recv->state == RMPP_STATE_CANCELING) { + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + return; + } + list_del(&rmpp_recv->list); + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + destroy_rmpp_recv(rmpp_recv); +} + +static struct mad_rmpp_recv * +create_rmpp_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + struct ib_mad_hdr *mad_hdr; + + rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL); + if (!rmpp_recv) + return NULL; + + rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd, + mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, + agent->agent.port_num); + if (IS_ERR(rmpp_recv->ah)) + 
goto error; + + rmpp_recv->agent = agent; + init_completion(&rmpp_recv->comp); + INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler); + INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler); + spin_lock_init(&rmpp_recv->lock); + rmpp_recv->state = RMPP_STATE_ACTIVE; + atomic_set(&rmpp_recv->refcount, 1); + + rmpp_recv->rmpp_wc = mad_recv_wc; + rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf; + rmpp_recv->newwin = 1; + rmpp_recv->seg_num = 1; + rmpp_recv->last_ack = 0; + rmpp_recv->repwin = 1; + + mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; + rmpp_recv->tid = mad_hdr->tid; + rmpp_recv->src_qp = mad_recv_wc->wc->src_qp; + rmpp_recv->slid = mad_recv_wc->wc->slid; + rmpp_recv->mgmt_class = mad_hdr->mgmt_class; + rmpp_recv->class_version = mad_hdr->class_version; + rmpp_recv->method = mad_hdr->method; + rmpp_recv->base_version = mad_hdr->base_version; + return rmpp_recv; + +error: kfree(rmpp_recv); + return NULL; +} + +static struct mad_rmpp_recv * +find_rmpp_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; + + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + if (rmpp_recv->tid == mad_hdr->tid && + rmpp_recv->src_qp == mad_recv_wc->wc->src_qp && + rmpp_recv->slid == mad_recv_wc->wc->slid && + rmpp_recv->mgmt_class == mad_hdr->mgmt_class && + rmpp_recv->class_version == mad_hdr->class_version && + rmpp_recv->method == mad_hdr->method) + return rmpp_recv; + } + return NULL; +} + +static struct mad_rmpp_recv * +acquire_rmpp_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + unsigned long flags; + + spin_lock_irqsave(&agent->lock, flags); + rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); + if (rmpp_recv) + atomic_inc(&rmpp_recv->refcount); + spin_unlock_irqrestore(&agent->lock, flags); + return rmpp_recv; +} + +static struct mad_rmpp_recv * +insert_rmpp_recv(struct ib_mad_agent_private *agent, + struct mad_rmpp_recv *rmpp_recv) +{ + struct mad_rmpp_recv *cur_rmpp_recv; + + cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc); + if (!cur_rmpp_recv) + list_add_tail(&rmpp_recv->list, &agent->rmpp_list); + + return cur_rmpp_recv; +} + +static inline int get_last_flag(struct ib_mad_recv_buf *seg) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *) seg->mad; + return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST; +} + +static inline int get_seg_num(struct ib_mad_recv_buf *seg) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *) seg->mad; + return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); +} + +static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list, + struct ib_mad_recv_buf *seg) +{ + if (seg->list.next == rmpp_list) + return NULL; + + return container_of(seg->list.next, struct ib_mad_recv_buf, list); +} + +static inline int window_size(struct ib_mad_agent_private *agent) +{ + return max(agent->qp_info->recv_queue.max_active >> 3, 1); +} + +static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list, + int seg_num) +{ + struct ib_mad_recv_buf *seg_buf; + int cur_seg_num; + + list_for_each_entry_reverse(seg_buf, rmpp_list, list) { + cur_seg_num = get_seg_num(seg_buf); + if (seg_num > cur_seg_num) + return seg_buf; + if (seg_num == cur_seg_num) + break; + } + return NULL; +} + +static void update_seg_num(struct mad_rmpp_recv *rmpp_recv, + struct ib_mad_recv_buf 
*new_buf) +{ + struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list; + + while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) { + rmpp_recv->cur_seg_buf = new_buf; + rmpp_recv->seg_num++; + new_buf = get_next_seg(rmpp_list, new_buf); + } +} + +static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv) +{ + struct ib_rmpp_mad *rmpp_mad; + int hdr_size, data_size, pad; + bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device, + rmpp_recv->agent->qp_info->port_priv->port_num); + + rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad; + + hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); + if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) { + data_size = sizeof(struct opa_rmpp_mad) - hdr_size; + pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > OPA_MGMT_RMPP_DATA || pad < 0) + pad = 0; + } else { + data_size = sizeof(struct ib_rmpp_mad) - hdr_size; + pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > IB_MGMT_RMPP_DATA || pad < 0) + pad = 0; + } + + return hdr_size + rmpp_recv->seg_num * data_size - pad; +} + +static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv) +{ + struct ib_mad_recv_wc *rmpp_wc; + + ack_recv(rmpp_recv, rmpp_recv->rmpp_wc); + if (rmpp_recv->seg_num > 1) + cancel_delayed_work(&rmpp_recv->timeout_work); + + rmpp_wc = rmpp_recv->rmpp_wc; + rmpp_wc->mad_len = get_mad_len(rmpp_recv); + /* 10 seconds until we can find the packet lifetime */ + queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq, + &rmpp_recv->cleanup_work, msecs_to_jiffies(10000)); + return rmpp_wc; +} + +static struct ib_mad_recv_wc * +continue_rmpp(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + struct ib_mad_recv_buf *prev_buf; + struct ib_mad_recv_wc *done_wc; + int seg_num; + unsigned long flags; + + rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc); + if (!rmpp_recv) + goto drop1; + + seg_num = get_seg_num(&mad_recv_wc->recv_buf); + + spin_lock_irqsave(&rmpp_recv->lock, flags); + if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) || + (seg_num > rmpp_recv->newwin)) + goto drop3; + + if ((seg_num <= rmpp_recv->last_ack) || + (rmpp_recv->state == RMPP_STATE_COMPLETE)) { + spin_unlock_irqrestore(&rmpp_recv->lock, flags); + ack_recv(rmpp_recv, mad_recv_wc); + goto drop2; + } + + prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num); + if (!prev_buf) + goto drop3; + + done_wc = NULL; + list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list); + if (rmpp_recv->cur_seg_buf == prev_buf) { + update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf); + if (get_last_flag(rmpp_recv->cur_seg_buf)) { + rmpp_recv->state = RMPP_STATE_COMPLETE; + spin_unlock_irqrestore(&rmpp_recv->lock, flags); + done_wc = complete_rmpp(rmpp_recv); + goto out; + } else if (rmpp_recv->seg_num == rmpp_recv->newwin) { + rmpp_recv->newwin += window_size(agent); + spin_unlock_irqrestore(&rmpp_recv->lock, flags); + ack_recv(rmpp_recv, mad_recv_wc); + goto out; + } + } + spin_unlock_irqrestore(&rmpp_recv->lock, flags); +out: + deref_rmpp_recv(rmpp_recv); + return done_wc; + +drop3: spin_unlock_irqrestore(&rmpp_recv->lock, flags); +drop2: deref_rmpp_recv(rmpp_recv); +drop1: ib_free_recv_mad(mad_recv_wc); + return NULL; +} + +static struct ib_mad_recv_wc * +start_rmpp(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + unsigned long flags; + + 
rmpp_recv = create_rmpp_recv(agent, mad_recv_wc); + if (!rmpp_recv) { + ib_free_recv_mad(mad_recv_wc); + return NULL; + } + + spin_lock_irqsave(&agent->lock, flags); + if (insert_rmpp_recv(agent, rmpp_recv)) { + spin_unlock_irqrestore(&agent->lock, flags); + /* duplicate first MAD */ + destroy_rmpp_recv(rmpp_recv); + return continue_rmpp(agent, mad_recv_wc); + } + atomic_inc(&rmpp_recv->refcount); + + if (get_last_flag(&mad_recv_wc->recv_buf)) { + rmpp_recv->state = RMPP_STATE_COMPLETE; + spin_unlock_irqrestore(&agent->lock, flags); + complete_rmpp(rmpp_recv); + } else { + spin_unlock_irqrestore(&agent->lock, flags); + /* 40 seconds until we can find the packet lifetimes */ + queue_delayed_work(agent->qp_info->port_priv->wq, + &rmpp_recv->timeout_work, + msecs_to_jiffies(40000)); + rmpp_recv->newwin += window_size(agent); + ack_recv(rmpp_recv, mad_recv_wc); + mad_recv_wc = NULL; + } + deref_rmpp_recv(rmpp_recv); + return mad_recv_wc; +} + +static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + int timeout; + u32 paylen = 0; + + rmpp_mad = mad_send_wr->send_buf.mad; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num); + + if (mad_send_wr->seg_num == 1) { + rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST; + paylen = (mad_send_wr->send_buf.seg_count * + mad_send_wr->send_buf.seg_rmpp_size) - + mad_send_wr->pad; + } + + if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) { + rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST; + paylen = mad_send_wr->send_buf.seg_rmpp_size - mad_send_wr->pad; + } + rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); + + /* 2 seconds for an ACK until we can find the packet lifetime */ + timeout = mad_send_wr->send_buf.timeout_ms; + if (!timeout || timeout > 2000) + mad_send_wr->timeout = msecs_to_jiffies(2000); + + return ib_send_mad(mad_send_wr); +} + +static void abort_send(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc wc; + unsigned long flags; + + spin_lock_irqsave(&agent->lock, flags); + mad_send_wr = ib_find_send_mad(agent, mad_recv_wc); + if (!mad_send_wr) + goto out; /* Unmatched send */ + + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || + (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + goto out; /* Send is already done */ + + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&agent->lock, flags); + + wc.status = IB_WC_REM_ABORT_ERR; + wc.vendor_err = rmpp_status; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; +out: + spin_unlock_irqrestore(&agent->lock, flags); +} + +static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr, + int seg_num) +{ + struct list_head *list; + + wr->last_ack = seg_num; + list = &wr->last_ack_seg->list; + list_for_each_entry(wr->last_ack_seg, list, list) + if (wr->last_ack_seg->num == seg_num) + break; +} + +static void process_ds_ack(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc, int newwin) +{ + struct mad_rmpp_recv *rmpp_recv; + + rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); + if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE) + rmpp_recv->repwin = newwin; +} + +static void process_rmpp_ack(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_send_wr_private 
*mad_send_wr; + struct ib_rmpp_mad *rmpp_mad; + unsigned long flags; + int seg_num, newwin, ret; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + if (rmpp_mad->rmpp_hdr.rmpp_status) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + return; + } + + seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); + newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (newwin < seg_num) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S); + return; + } + + spin_lock_irqsave(&agent->lock, flags); + mad_send_wr = ib_find_send_mad(agent, mad_recv_wc); + if (!mad_send_wr) { + if (!seg_num) + process_ds_ack(agent, mad_recv_wc, newwin); + goto out; /* Unmatched or DS RMPP ACK */ + } + + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) && + (mad_send_wr->timeout)) { + spin_unlock_irqrestore(&agent->lock, flags); + ack_ds_ack(agent, mad_recv_wc); + return; /* Repeated ACK for DS RMPP transaction */ + } + + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || + (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + goto out; /* Send is already done */ + + if (seg_num > mad_send_wr->send_buf.seg_count || + seg_num > mad_send_wr->newwin) { + spin_unlock_irqrestore(&agent->lock, flags); + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B); + return; + } + + if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack) + goto out; /* Old ACK */ + + if (seg_num > mad_send_wr->last_ack) { + adjust_last_ack(mad_send_wr, seg_num); + mad_send_wr->retries_left = mad_send_wr->max_retries; + } + mad_send_wr->newwin = newwin; + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { + /* If no response is expected, the ACK completes the send */ + if (!mad_send_wr->send_buf.timeout_ms) { + struct ib_mad_send_wc wc; + + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&agent->lock, flags); + + wc.status = IB_WC_SUCCESS; + wc.vendor_err = 0; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; + } + if (mad_send_wr->refcount == 1) + ib_reset_mad_timeout(mad_send_wr, + mad_send_wr->send_buf.timeout_ms); + spin_unlock_irqrestore(&agent->lock, flags); + ack_ds_ack(agent, mad_recv_wc); + return; + } else if (mad_send_wr->refcount == 1 && + mad_send_wr->seg_num < mad_send_wr->newwin && + mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) { + /* Send failure will just result in a timeout/retry */ + ret = send_next_seg(mad_send_wr); + if (ret) + goto out; + + mad_send_wr->refcount++; + list_move_tail(&mad_send_wr->agent_list, + &mad_send_wr->mad_agent_priv->send_list); + } +out: + spin_unlock_irqrestore(&agent->lock, flags); +} + +static struct ib_mad_recv_wc * +process_rmpp_data(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_hdr *rmpp_hdr; + u8 rmpp_status; + + rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr; + + if (rmpp_hdr->rmpp_status) { + rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS; + goto bad; + } + + if (rmpp_hdr->seg_num == cpu_to_be32(1)) { + if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) { + rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG; + goto bad; + } + return start_rmpp(agent, mad_recv_wc); + } else { + if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) { + rmpp_status = 
IB_MGMT_RMPP_STATUS_BAD_SEG; + goto bad; + } + return continue_rmpp(agent, mad_recv_wc); + } +bad: + nack_recv(agent, mad_recv_wc, rmpp_status); + ib_free_recv_mad(mad_recv_wc); + return NULL; +} + +static void process_rmpp_stop(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + + if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + } else + abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status); +} + +static void process_rmpp_abort(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + + if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN || + rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + } else + abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status); +} + +struct ib_mad_recv_wc * +ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE)) + return mad_recv_wc; + + if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV); + goto out; + } + + switch (rmpp_mad->rmpp_hdr.rmpp_type) { + case IB_MGMT_RMPP_TYPE_DATA: + return process_rmpp_data(agent, mad_recv_wc); + case IB_MGMT_RMPP_TYPE_ACK: + process_rmpp_ack(agent, mad_recv_wc); + break; + case IB_MGMT_RMPP_TYPE_STOP: + process_rmpp_stop(agent, mad_recv_wc); + break; + case IB_MGMT_RMPP_TYPE_ABORT: + process_rmpp_abort(agent, mad_recv_wc); + break; + default: + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT); + break; + } +out: + ib_free_recv_mad(mad_recv_wc); + return NULL; +} + +static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv; + struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad; + struct mad_rmpp_recv *rmpp_recv; + struct rdma_ah_attr ah_attr; + unsigned long flags; + int newwin = 1; + + if (!(mad_hdr->method & IB_MGMT_METHOD_RESP)) + goto out; + + spin_lock_irqsave(&agent->lock, flags); + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + if (rmpp_recv->tid != mad_hdr->tid || + rmpp_recv->mgmt_class != mad_hdr->mgmt_class || + rmpp_recv->class_version != mad_hdr->class_version || + (rmpp_recv->method & IB_MGMT_METHOD_RESP)) + continue; + + if (rdma_query_ah(mad_send_wr->send_buf.ah, &ah_attr)) + continue; + + if (rmpp_recv->slid == rdma_ah_get_dlid(&ah_attr)) { + newwin = rmpp_recv->repwin; + break; + } + } + spin_unlock_irqrestore(&agent->lock, flags); +out: + return newwin; +} + +int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + int ret; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE)) + return IB_RMPP_RESULT_UNHANDLED; + + if (rmpp_mad->rmpp_hdr.rmpp_type 
!= IB_MGMT_RMPP_TYPE_DATA) { + mad_send_wr->seg_num = 1; + return IB_RMPP_RESULT_INTERNAL; + } + + mad_send_wr->newwin = init_newwin(mad_send_wr); + + /* We need to wait for the final ACK even if there isn't a response */ + mad_send_wr->refcount += (mad_send_wr->timeout == 0); + ret = send_next_seg(mad_send_wr); + if (!ret) + return IB_RMPP_RESULT_CONSUMED; + return ret; +} + +int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + int ret; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE)) + return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ + + if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) + return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */ + + if (mad_send_wc->status != IB_WC_SUCCESS || + mad_send_wr->status != IB_WC_SUCCESS) + return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */ + + if (!mad_send_wr->timeout) + return IB_RMPP_RESULT_PROCESSED; /* Response received */ + + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { + mad_send_wr->timeout = + msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); + return IB_RMPP_RESULT_PROCESSED; /* Send done */ + } + + if (mad_send_wr->seg_num == mad_send_wr->newwin || + mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) + return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */ + + ret = send_next_seg(mad_send_wr); + if (ret) { + mad_send_wc->status = IB_WC_GENERAL_ERR; + return IB_RMPP_RESULT_PROCESSED; + } + return IB_RMPP_RESULT_CONSUMED; +} + +int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + int ret; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE)) + return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ + + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) + return IB_RMPP_RESULT_PROCESSED; + + mad_send_wr->seg_num = mad_send_wr->last_ack; + mad_send_wr->cur_seg = mad_send_wr->last_ack_seg; + + ret = send_next_seg(mad_send_wr); + if (ret) + return IB_RMPP_RESULT_PROCESSED; + + return IB_RMPP_RESULT_CONSUMED; +} diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h new file mode 100644 index 0000000..3d336bf --- /dev/null +++ b/drivers/infiniband/core/mad_rmpp.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2005 Intel Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MAD_RMPP_H__ +#define __MAD_RMPP_H__ + +enum { + IB_RMPP_RESULT_PROCESSED, + IB_RMPP_RESULT_CONSUMED, + IB_RMPP_RESULT_INTERNAL, + IB_RMPP_RESULT_UNHANDLED +}; + +int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr); + +struct ib_mad_recv_wc * +ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc); + +int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc); + +void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc); + +void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent); + +int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr); + +#endif /* __MAD_RMPP_H__ */ diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c new file mode 100644 index 0000000..49d478b --- /dev/null +++ b/drivers/infiniband/core/mr_pool.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include +#include + +struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list) +{ + struct ib_mr *mr; + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + mr = list_first_entry_or_null(list, struct ib_mr, qp_entry); + if (mr) { + list_del(&mr->qp_entry); + qp->mrs_used++; + } + spin_unlock_irqrestore(&qp->mr_lock, flags); + + return mr; +} +EXPORT_SYMBOL(ib_mr_pool_get); + +void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + list_add(&mr->qp_entry, list); + qp->mrs_used--; + spin_unlock_irqrestore(&qp->mr_lock, flags); +} +EXPORT_SYMBOL(ib_mr_pool_put); + +int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, + enum ib_mr_type type, u32 max_num_sg) +{ + struct ib_mr *mr; + unsigned long flags; + int ret, i; + + for (i = 0; i < nr; i++) { + mr = ib_alloc_mr(qp->pd, type, max_num_sg); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto out; + } + + spin_lock_irqsave(&qp->mr_lock, flags); + list_add_tail(&mr->qp_entry, list); + spin_unlock_irqrestore(&qp->mr_lock, flags); + } + + return 0; +out: + ib_mr_pool_destroy(qp, list); + return ret; +} +EXPORT_SYMBOL(ib_mr_pool_init); + +void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list) +{ + struct ib_mr *mr; + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + while (!list_empty(list)) { + mr = list_first_entry(list, struct ib_mr, qp_entry); + list_del(&mr->qp_entry); + + spin_unlock_irqrestore(&qp->mr_lock, flags); + ib_dereg_mr(mr); + spin_lock_irqsave(&qp->mr_lock, flags); + } + spin_unlock_irqrestore(&qp->mr_lock, flags); +} +EXPORT_SYMBOL(ib_mr_pool_destroy); diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c new file mode 100644 index 0000000..4eb72ff --- 
/dev/null +++ b/drivers/infiniband/core/multicast.c @@ -0,0 +1,898 @@ +/* + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sa.h" + +static void mcast_add_one(struct ib_device *device); +static void mcast_remove_one(struct ib_device *device, void *client_data); + +static struct ib_client mcast_client = { + .name = "ib_multicast", + .add = mcast_add_one, + .remove = mcast_remove_one +}; + +static struct ib_sa_client sa_client; +static struct workqueue_struct *mcast_wq; +static union ib_gid mgid0; + +struct mcast_device; + +struct mcast_port { + struct mcast_device *dev; + spinlock_t lock; + struct rb_root table; + atomic_t refcount; + struct completion comp; + u8 port_num; +}; + +struct mcast_device { + struct ib_device *device; + struct ib_event_handler event_handler; + int start_port; + int end_port; + struct mcast_port port[0]; +}; + +enum mcast_state { + MCAST_JOINING, + MCAST_MEMBER, + MCAST_ERROR, +}; + +enum mcast_group_state { + MCAST_IDLE, + MCAST_BUSY, + MCAST_GROUP_ERROR, + MCAST_PKEY_EVENT +}; + +enum { + MCAST_INVALID_PKEY_INDEX = 0xFFFF +}; + +struct mcast_member; + +struct mcast_group { + struct ib_sa_mcmember_rec rec; + struct rb_node node; + struct mcast_port *port; + spinlock_t lock; + struct work_struct work; + struct list_head pending_list; + struct list_head active_list; + struct mcast_member *last_join; + int members[NUM_JOIN_MEMBERSHIP_TYPES]; + atomic_t refcount; + enum mcast_group_state state; + struct ib_sa_query *query; + u16 pkey_index; + u8 leave_state; + int retries; +}; + +struct mcast_member { + struct ib_sa_multicast multicast; + struct ib_sa_client *client; + struct mcast_group *group; + struct list_head list; + enum mcast_state state; + atomic_t refcount; + struct completion comp; +}; + +static void join_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context); +static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context); + +static struct mcast_group *mcast_find(struct mcast_port *port, + union ib_gid *mgid) +{ + struct rb_node *node = port->table.rb_node; + struct mcast_group *group; + int 
ret; + + while (node) { + group = rb_entry(node, struct mcast_group, node); + ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); + if (!ret) + return group; + + if (ret < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct mcast_group *mcast_insert(struct mcast_port *port, + struct mcast_group *group, + int allow_duplicates) +{ + struct rb_node **link = &port->table.rb_node; + struct rb_node *parent = NULL; + struct mcast_group *cur_group; + int ret; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct mcast_group, node); + + ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, + sizeof group->rec.mgid); + if (ret < 0) + link = &(*link)->rb_left; + else if (ret > 0) + link = &(*link)->rb_right; + else if (allow_duplicates) + link = &(*link)->rb_left; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &port->table); + return NULL; +} + +static void deref_port(struct mcast_port *port) +{ + if (atomic_dec_and_test(&port->refcount)) + complete(&port->comp); +} + +static void release_group(struct mcast_group *group) +{ + struct mcast_port *port = group->port; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + if (atomic_dec_and_test(&group->refcount)) { + rb_erase(&group->node, &port->table); + spin_unlock_irqrestore(&port->lock, flags); + kfree(group); + deref_port(port); + } else + spin_unlock_irqrestore(&port->lock, flags); +} + +static void deref_member(struct mcast_member *member) +{ + if (atomic_dec_and_test(&member->refcount)) + complete(&member->comp); +} + +static void queue_join(struct mcast_member *member) +{ + struct mcast_group *group = member->group; + unsigned long flags; + + spin_lock_irqsave(&group->lock, flags); + list_add_tail(&member->list, &group->pending_list); + if (group->state == MCAST_IDLE) { + group->state = MCAST_BUSY; + atomic_inc(&group->refcount); + queue_work(mcast_wq, &group->work); + } + spin_unlock_irqrestore(&group->lock, flags); +} + +/* + * A multicast group has four types of members: full member, non member, + * sendonly non member and sendonly full member. + * We need to keep track of the number of members of each + * type based on their join state. Adjust the number of members the belong to + * the specified join states. + */ +static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) +{ + int i; + + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1) + if (join_state & 0x1) + group->members[i] += inc; +} + +/* + * If a multicast group has zero members left for a particular join state, but + * the group is still a member with the SA, we need to leave that join state. + * Determine which join states we still belong to, but that do not have any + * active members. 
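+ * For example, if rec.join_state is 0x3 (full member and non member) and
+ * the last non member has dropped (members[1] == 0) while full members
+ * remain, the value returned is 0x2 and only that join state is left.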
+ */ +static u8 get_leave_state(struct mcast_group *group) +{ + u8 leave_state = 0; + int i; + + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++) + if (!group->members[i]) + leave_state |= (0x1 << i); + + return leave_state & group->rec.join_state; +} + +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 selector, u8 src_value, u8 dst_value) +{ + int err; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +static int cmp_rec(struct ib_sa_mcmember_rec *src, + struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask) +{ + /* MGID must already match */ + + if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID && + memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid)) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) + return -EINVAL; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, + src->mtu, dst->mtu)) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && + src->traffic_class != dst->traffic_class) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) + return -EINVAL; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, + src->rate, dst->rate)) + return -EINVAL; + if (check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + dst->packet_life_time_selector, + src->packet_life_time, dst->packet_life_time)) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && + src->flow_label != dst->flow_label) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && + src->hop_limit != dst->hop_limit) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope) + return -EINVAL; + + /* join_state checked separately, proxy_join ignored */ + + return 0; +} + +static int send_join(struct mcast_group *group, struct mcast_member *member) +{ + struct mcast_port *port = group->port; + int ret; + + group->last_join = member; + ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, + port->port_num, IB_MGMT_METHOD_SET, + &member->multicast.rec, + member->multicast.comp_mask, + 3000, GFP_KERNEL, join_handler, group, + &group->query); + return (ret > 0) ? 0 : ret; +} + +static int send_leave(struct mcast_group *group, u8 leave_state) +{ + struct mcast_port *port = group->port; + struct ib_sa_mcmember_rec rec; + int ret; + + rec = group->rec; + rec.join_state = leave_state; + group->leave_state = leave_state; + + ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, + port->port_num, IB_SA_METHOD_DELETE, &rec, + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_JOIN_STATE, + 3000, GFP_KERNEL, leave_handler, + group, &group->query); + return (ret > 0) ? 
0 : ret; +} + +static void join_group(struct mcast_group *group, struct mcast_member *member, + u8 join_state) +{ + member->state = MCAST_MEMBER; + adjust_membership(group, join_state, 1); + group->rec.join_state |= join_state; + member->multicast.rec = group->rec; + member->multicast.rec.join_state = join_state; + list_move(&member->list, &group->active_list); +} + +static int fail_join(struct mcast_group *group, struct mcast_member *member, + int status) +{ + spin_lock_irq(&group->lock); + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + return member->multicast.callback(status, &member->multicast); +} + +static void process_group_error(struct mcast_group *group) +{ + struct mcast_member *member; + int ret = 0; + u16 pkey_index; + + if (group->state == MCAST_PKEY_EVENT) + ret = ib_find_pkey(group->port->dev->device, + group->port->port_num, + be16_to_cpu(group->rec.pkey), &pkey_index); + + spin_lock_irq(&group->lock); + if (group->state == MCAST_PKEY_EVENT && !ret && + group->pkey_index == pkey_index) + goto out; + + while (!list_empty(&group->active_list)) { + member = list_entry(group->active_list.next, + struct mcast_member, list); + atomic_inc(&member->refcount); + list_del_init(&member->list); + adjust_membership(group, member->multicast.rec.join_state, -1); + member->state = MCAST_ERROR; + spin_unlock_irq(&group->lock); + + ret = member->multicast.callback(-ENETRESET, + &member->multicast); + deref_member(member); + if (ret) + ib_sa_free_multicast(&member->multicast); + spin_lock_irq(&group->lock); + } + + group->rec.join_state = 0; +out: + group->state = MCAST_BUSY; + spin_unlock_irq(&group->lock); +} + +static void mcast_work_handler(struct work_struct *work) +{ + struct mcast_group *group; + struct mcast_member *member; + struct ib_sa_multicast *multicast; + int status, ret; + u8 join_state; + + group = container_of(work, typeof(*group), work); +retest: + spin_lock_irq(&group->lock); + while (!list_empty(&group->pending_list) || + (group->state != MCAST_BUSY)) { + + if (group->state != MCAST_BUSY) { + spin_unlock_irq(&group->lock); + process_group_error(group); + goto retest; + } + + member = list_entry(group->pending_list.next, + struct mcast_member, list); + multicast = &member->multicast; + join_state = multicast->rec.join_state; + atomic_inc(&member->refcount); + + if (join_state == (group->rec.join_state & join_state)) { + status = cmp_rec(&group->rec, &multicast->rec, + multicast->comp_mask); + if (!status) + join_group(group, member, join_state); + else + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + ret = multicast->callback(status, multicast); + } else { + spin_unlock_irq(&group->lock); + status = send_join(group, member); + if (!status) { + deref_member(member); + return; + } + ret = fail_join(group, member, status); + } + + deref_member(member); + if (ret) + ib_sa_free_multicast(&member->multicast); + spin_lock_irq(&group->lock); + } + + join_state = get_leave_state(group); + if (join_state) { + group->rec.join_state &= ~join_state; + spin_unlock_irq(&group->lock); + if (send_leave(group, join_state)) + goto retest; + } else { + group->state = MCAST_IDLE; + spin_unlock_irq(&group->lock); + release_group(group); + } +} + +/* + * Fail a join request if it is still active - at the head of the pending queue. 
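+ * Only the request recorded in group->last_join gets the error callback;
+ * any other pending entry is left for mcast_work_handler() to reprocess.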
+ */ +static void process_join_error(struct mcast_group *group, int status) +{ + struct mcast_member *member; + int ret; + + spin_lock_irq(&group->lock); + member = list_entry(group->pending_list.next, + struct mcast_member, list); + if (group->last_join == member) { + atomic_inc(&member->refcount); + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + ret = member->multicast.callback(status, &member->multicast); + deref_member(member); + if (ret) + ib_sa_free_multicast(&member->multicast); + } else + spin_unlock_irq(&group->lock); +} + +static void join_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context) +{ + struct mcast_group *group = context; + u16 pkey_index = MCAST_INVALID_PKEY_INDEX; + + if (status) + process_join_error(group, status); + else { + int mgids_changed, is_mgid0; + + if (ib_find_pkey(group->port->dev->device, + group->port->port_num, be16_to_cpu(rec->pkey), + &pkey_index)) + pkey_index = MCAST_INVALID_PKEY_INDEX; + + spin_lock_irq(&group->port->lock); + if (group->state == MCAST_BUSY && + group->pkey_index == MCAST_INVALID_PKEY_INDEX) + group->pkey_index = pkey_index; + mgids_changed = memcmp(&rec->mgid, &group->rec.mgid, + sizeof(group->rec.mgid)); + group->rec = *rec; + if (mgids_changed) { + rb_erase(&group->node, &group->port->table); + is_mgid0 = !memcmp(&mgid0, &group->rec.mgid, + sizeof(mgid0)); + mcast_insert(group->port, group, is_mgid0); + } + spin_unlock_irq(&group->port->lock); + } + mcast_work_handler(&group->work); +} + +static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context) +{ + struct mcast_group *group = context; + + if (status && group->retries > 0 && + !send_leave(group, group->leave_state)) + group->retries--; + else + mcast_work_handler(&group->work); +} + +static struct mcast_group *acquire_group(struct mcast_port *port, + union ib_gid *mgid, gfp_t gfp_mask) +{ + struct mcast_group *group, *cur_group; + unsigned long flags; + int is_mgid0; + + is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); + if (!is_mgid0) { + spin_lock_irqsave(&port->lock, flags); + group = mcast_find(port, mgid); + if (group) + goto found; + spin_unlock_irqrestore(&port->lock, flags); + } + + group = kzalloc(sizeof *group, gfp_mask); + if (!group) + return NULL; + + group->retries = 3; + group->port = port; + group->rec.mgid = *mgid; + group->pkey_index = MCAST_INVALID_PKEY_INDEX; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->active_list); + INIT_WORK(&group->work, mcast_work_handler); + spin_lock_init(&group->lock); + + spin_lock_irqsave(&port->lock, flags); + cur_group = mcast_insert(port, group, is_mgid0); + if (cur_group) { + kfree(group); + group = cur_group; + } else + atomic_inc(&port->refcount); +found: + atomic_inc(&group->refcount); + spin_unlock_irqrestore(&port->lock, flags); + return group; +} + +/* + * We serialize all join requests to a single group to make our lives much + * easier. Otherwise, two users could try to join the same group + * simultaneously, with different configurations, one could leave while the + * join is in progress, etc., which makes locking around error recovery + * difficult. 
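+ * The single work item per group provides that serialization: queue_join()
+ * only queues it while the group is MCAST_IDLE, and mcast_work_handler()
+ * drains the pending list one request at a time.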
+ */ +struct ib_sa_multicast * +ib_sa_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast *multicast), + void *context) +{ + struct mcast_device *dev; + struct mcast_member *member; + struct ib_sa_multicast *multicast; + int ret; + + dev = ib_get_client_data(device, &mcast_client); + if (!dev) + return ERR_PTR(-ENODEV); + + member = kmalloc(sizeof *member, gfp_mask); + if (!member) + return ERR_PTR(-ENOMEM); + + ib_sa_client_get(client); + member->client = client; + member->multicast.rec = *rec; + member->multicast.comp_mask = comp_mask; + member->multicast.callback = callback; + member->multicast.context = context; + init_completion(&member->comp); + atomic_set(&member->refcount, 1); + member->state = MCAST_JOINING; + + member->group = acquire_group(&dev->port[port_num - dev->start_port], + &rec->mgid, gfp_mask); + if (!member->group) { + ret = -ENOMEM; + goto err; + } + + /* + * The user will get the multicast structure in their callback. They + * could then free the multicast structure before we can return from + * this routine. So we save the pointer to return before queuing + * any callback. + */ + multicast = &member->multicast; + queue_join(member); + return multicast; + +err: + ib_sa_client_put(client); + kfree(member); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_sa_join_multicast); + +void ib_sa_free_multicast(struct ib_sa_multicast *multicast) +{ + struct mcast_member *member; + struct mcast_group *group; + + member = container_of(multicast, struct mcast_member, multicast); + group = member->group; + + spin_lock_irq(&group->lock); + if (member->state == MCAST_MEMBER) + adjust_membership(group, multicast->rec.join_state, -1); + + list_del_init(&member->list); + + if (group->state == MCAST_IDLE) { + group->state = MCAST_BUSY; + spin_unlock_irq(&group->lock); + /* Continue to hold reference on group until callback */ + queue_work(mcast_wq, &group->work); + } else { + spin_unlock_irq(&group->lock); + release_group(group); + } + + deref_member(member); + wait_for_completion(&member->comp); + ib_sa_client_put(member->client); + kfree(member); +} +EXPORT_SYMBOL(ib_sa_free_multicast); + +int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, + union ib_gid *mgid, struct ib_sa_mcmember_rec *rec) +{ + struct mcast_device *dev; + struct mcast_port *port; + struct mcast_group *group; + unsigned long flags; + int ret = 0; + + dev = ib_get_client_data(device, &mcast_client); + if (!dev) + return -ENODEV; + + port = &dev->port[port_num - dev->start_port]; + spin_lock_irqsave(&port->lock, flags); + group = mcast_find(port, mgid); + if (group) + *rec = group->rec; + else + ret = -EADDRNOTAVAIL; + spin_unlock_irqrestore(&port->lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_sa_get_mcmember_rec); + +int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, + struct rdma_ah_attr *ah_attr) +{ + int ret; + u16 gid_index; + + /* GID table is not based on the netdevice for IB link layer, + * so ignore ndev during search. 
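+ * For RoCE the cached GID entries are associated with a netdevice, so ndev
+ * and gid_type are needed to select the correct gid_index.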
+ */ + if (rdma_protocol_ib(device, port_num)) + ndev = NULL; + else if (!rdma_protocol_roce(device, port_num)) + return -EINVAL; + + ret = ib_find_cached_gid_by_port(device, &rec->port_gid, + gid_type, port_num, + ndev, + &gid_index); + if (ret) + return ret; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->type = rdma_ah_find_type(device, port_num); + + rdma_ah_set_dlid(ah_attr, be16_to_cpu(rec->mlid)); + rdma_ah_set_sl(ah_attr, rec->sl); + rdma_ah_set_port_num(ah_attr, port_num); + rdma_ah_set_static_rate(ah_attr, rec->rate); + + rdma_ah_set_grh(ah_attr, &rec->mgid, + be32_to_cpu(rec->flow_label), + (u8)gid_index, + rec->hop_limit, + rec->traffic_class); + return 0; +} +EXPORT_SYMBOL(ib_init_ah_from_mcmember); + +static void mcast_groups_event(struct mcast_port *port, + enum mcast_group_state state) +{ + struct mcast_group *group; + struct rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + for (node = rb_first(&port->table); node; node = rb_next(node)) { + group = rb_entry(node, struct mcast_group, node); + spin_lock(&group->lock); + if (group->state == MCAST_IDLE) { + atomic_inc(&group->refcount); + queue_work(mcast_wq, &group->work); + } + if (group->state != MCAST_GROUP_ERROR) + group->state = state; + spin_unlock(&group->lock); + } + spin_unlock_irqrestore(&port->lock, flags); +} + +static void mcast_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct mcast_device *dev; + int index; + + dev = container_of(handler, struct mcast_device, event_handler); + if (!rdma_cap_ib_mcast(dev->device, event->element.port_num)) + return; + + index = event->element.port_num - dev->start_port; + + switch (event->event) { + case IB_EVENT_PORT_ERR: + case IB_EVENT_LID_CHANGE: + case IB_EVENT_SM_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); + break; + case IB_EVENT_PKEY_CHANGE: + mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT); + break; + default: + break; + } +} + +static void mcast_add_one(struct ib_device *device) +{ + struct mcast_device *dev; + struct mcast_port *port; + int i; + int count = 0; + + dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, + GFP_KERNEL); + if (!dev) + return; + + dev->start_port = rdma_start_port(device); + dev->end_port = rdma_end_port(device); + + for (i = 0; i <= dev->end_port - dev->start_port; i++) { + if (!rdma_cap_ib_mcast(device, dev->start_port + i)) + continue; + port = &dev->port[i]; + port->dev = dev; + port->port_num = dev->start_port + i; + spin_lock_init(&port->lock); + port->table = RB_ROOT; + init_completion(&port->comp); + atomic_set(&port->refcount, 1); + ++count; + } + + if (!count) { + kfree(dev); + return; + } + + dev->device = device; + ib_set_client_data(device, &mcast_client, dev); + + INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler); + ib_register_event_handler(&dev->event_handler); +} + +static void mcast_remove_one(struct ib_device *device, void *client_data) +{ + struct mcast_device *dev = client_data; + struct mcast_port *port; + int i; + + if (!dev) + return; + + ib_unregister_event_handler(&dev->event_handler); + flush_workqueue(mcast_wq); + + for (i = 0; i <= dev->end_port - dev->start_port; i++) { + if (rdma_cap_ib_mcast(device, dev->start_port + i)) { + port = &dev->port[i]; + deref_port(port); + wait_for_completion(&port->comp); + } + } + + kfree(dev); +} + +int mcast_init(void) +{ + int ret; + + mcast_wq = alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM); + if 
(!mcast_wq) + return -ENOMEM; + + ib_sa_register_client(&sa_client); + + ret = ib_register_client(&mcast_client); + if (ret) + goto err; + return 0; + +err: + ib_sa_unregister_client(&sa_client); + destroy_workqueue(mcast_wq); + return ret; +} + +void mcast_cleanup(void) +{ + ib_unregister_client(&mcast_client); + ib_sa_unregister_client(&sa_client); + destroy_workqueue(mcast_wq); +} diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c new file mode 100644 index 0000000..3ccaae1 --- /dev/null +++ b/drivers/infiniband/core/netlink.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2017 Mellanox Technologies Inc. All rights reserved. + * Copyright (c) 2010 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include "core_priv.h" + +static DEFINE_MUTEX(rdma_nl_mutex); +static struct sock *nls; +static struct { + const struct rdma_nl_cbs *cb_table; +} rdma_nl_types[RDMA_NL_NUM_CLIENTS]; + +int rdma_nl_chk_listeners(unsigned int group) +{ + return (netlink_has_listeners(nls, group)) ? 0 : -1; +} +EXPORT_SYMBOL(rdma_nl_chk_listeners); + +static bool is_nl_msg_valid(unsigned int type, unsigned int op) +{ + static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = { + [RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS, + [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS, + [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS, + [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS, + }; + + /* + * This BUILD_BUG_ON is intended to catch addition of new + * RDMA netlink protocol without updating the array above. + */ + BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6); + + if (type >= RDMA_NL_NUM_CLIENTS) + return false; + + return (op < max_num_ops[type]) ? 
true : false; +} + +static bool is_nl_valid(unsigned int type, unsigned int op) +{ + const struct rdma_nl_cbs *cb_table; + + if (!is_nl_msg_valid(type, op)) + return false; + + if (!rdma_nl_types[type].cb_table) { + mutex_unlock(&rdma_nl_mutex); + request_module("rdma-netlink-subsys-%d", type); + mutex_lock(&rdma_nl_mutex); + } + + cb_table = rdma_nl_types[type].cb_table; + + if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) + return false; + return true; +} + +void rdma_nl_register(unsigned int index, + const struct rdma_nl_cbs cb_table[]) +{ + mutex_lock(&rdma_nl_mutex); + if (!is_nl_msg_valid(index, 0)) { + /* + * All clients are not interesting in success/failure of + * this call. They want to see the print to error log and + * continue their initialization. Print warning for them, + * because it is programmer's error to be here. + */ + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The not-valid %u index was supplied to RDMA netlink\n", + index); + return; + } + + if (rdma_nl_types[index].cb_table) { + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The %u index is already registered in RDMA netlink\n", + index); + return; + } + + rdma_nl_types[index].cb_table = cb_table; + mutex_unlock(&rdma_nl_mutex); +} +EXPORT_SYMBOL(rdma_nl_register); + +void rdma_nl_unregister(unsigned int index) +{ + mutex_lock(&rdma_nl_mutex); + rdma_nl_types[index].cb_table = NULL; + mutex_unlock(&rdma_nl_mutex); +} +EXPORT_SYMBOL(rdma_nl_unregister); + +void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, + int len, int client, int op, int flags) +{ + *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags); + if (!*nlh) + return NULL; + return nlmsg_data(*nlh); +} +EXPORT_SYMBOL(ibnl_put_msg); + +int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, + int len, void *data, int type) +{ + if (nla_put(skb, type, len, data)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + return 0; +} +EXPORT_SYMBOL(ibnl_put_attr); + +static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + int type = nlh->nlmsg_type; + unsigned int index = RDMA_NL_GET_CLIENT(type); + unsigned int op = RDMA_NL_GET_OP(type); + const struct rdma_nl_cbs *cb_table; + + if (!is_nl_valid(index, op)) + return -EINVAL; + + cb_table = rdma_nl_types[index].cb_table; + + if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + /* + * LS responses overload the 0x100 (NLM_F_ROOT) flag. Don't + * mistakenly call the .dump() function. + */ + if (index == RDMA_NL_LS) { + if (cb_table[op].doit) + return cb_table[op].doit(skb, nlh, extack); + return -EINVAL; + } + /* FIXME: Convert IWCM to properly handle doit callbacks */ + if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM || + index == RDMA_NL_IWCM) { + struct netlink_dump_control c = { + .dump = cb_table[op].dump, + }; + if (c.dump) + return netlink_dump_start(nls, skb, nlh, &c); + return -EINVAL; + } + + if (cb_table[op].doit) + return cb_table[op].doit(skb, nlh, extack); + + return 0; +} + +/* + * This function is similar to netlink_rcv_skb with one exception: + * It calls to the callback for the netlink messages without NLM_F_REQUEST + * flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed + * for that consumer only. 
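+ * (The kernel itself initiates the RDMA_NL_LS queries and user space
+ * answers them, so those answers legitimately arrive without NLM_F_REQUEST.)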
+ */ +static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + struct nlmsghdr *, + struct netlink_ext_ack *)) +{ + struct netlink_ext_ack extack = {}; + struct nlmsghdr *nlh; + int err; + + while (skb->len >= nlmsg_total_size(0)) { + int msglen; + + nlh = nlmsg_hdr(skb); + err = 0; + + if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) + return 0; + + /* + * Generally speaking, the only requests are handled + * by the kernel, but RDMA_NL_LS is different, because it + * runs backward netlink scheme. Kernel initiates messages + * and waits for reply with data to keep pathrecord cache + * in sync. + */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) && + (RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS)) + goto ack; + + /* Skip control messages */ + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ack; + + err = cb(skb, nlh, &extack); + if (err == -EINTR) + goto skip; + +ack: + if (nlh->nlmsg_flags & NLM_F_ACK || err) + netlink_ack(skb, nlh, err, &extack); + +skip: + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + skb_pull(skb, msglen); + } + + return 0; +} + +static void rdma_nl_rcv(struct sk_buff *skb) +{ + mutex_lock(&rdma_nl_mutex); + rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg); + mutex_unlock(&rdma_nl_mutex); +} + +int rdma_nl_unicast(struct sk_buff *skb, u32 pid) +{ + int err; + + err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); + return (err < 0) ? err : 0; +} +EXPORT_SYMBOL(rdma_nl_unicast); + +int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) +{ + int err; + + err = netlink_unicast(nls, skb, pid, 0); + return (err < 0) ? err : 0; +} +EXPORT_SYMBOL(rdma_nl_unicast_wait); + +int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags) +{ + return nlmsg_multicast(nls, skb, 0, group, flags); +} +EXPORT_SYMBOL(rdma_nl_multicast); + +int __init rdma_nl_init(void) +{ + struct netlink_kernel_cfg cfg = { + .input = rdma_nl_rcv, + }; + + nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); + if (!nls) + return -ENOMEM; + + nls->sk_sndtimeo = 10 * HZ; + return 0; +} + +void rdma_nl_exit(void) +{ + int idx; + + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + rdma_nl_unregister(idx); + + netlink_kernel_release(nls); +} + +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c new file mode 100644 index 0000000..eb56776 --- /dev/null +++ b/drivers/infiniband/core/nldev.c @@ -0,0 +1,1029 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "core_priv.h" +#include "cma_priv.h" + +static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { + [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, + .len = IB_FW_VERSION_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING, + .len = 16 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, + .len = TASK_COMM_LEN }, + [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, + [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, + [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, + 
[RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ }, +}; + +static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) +{ + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) + return -EMSGSIZE; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + return -EMSGSIZE; + + return 0; +} + +static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +{ + char fw[IB_FW_VERSION_NAME_MAX]; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) + return -EMSGSIZE; + + BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + device->attrs.device_cap_flags, 0)) + return -EMSGSIZE; + + ib_get_device_fw_str(device, fw); + /* Device without FW has strlen(fw) = 0 */ + if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, + be64_to_cpu(device->node_guid), 0)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, + be64_to_cpu(device->attrs.sys_image_guid), 0)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) + return -EMSGSIZE; + return 0; +} + +static int fill_port_info(struct sk_buff *msg, + struct ib_device *device, u32 port, + const struct net *net) +{ + struct net_device *netdev = NULL; + struct ib_port_attr attr; + int ret; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) + return -EMSGSIZE; + + ret = ib_query_port(device, port, &attr); + if (ret) + return ret; + + BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + (u64)attr.port_cap_flags, 0)) + return -EMSGSIZE; + if (rdma_protocol_ib(device, port) && + nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, + attr.subnet_prefix, 0)) + return -EMSGSIZE; + if (rdma_protocol_ib(device, port)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) + return -EMSGSIZE; + } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) + return -EMSGSIZE; + + if (device->get_netdev) + netdev = device->get_netdev(device, port); + + if (netdev && net_eq(dev_net(netdev), net)) { + ret = nla_put_u32(msg, + RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); + if (ret) + goto out; + ret = nla_put_string(msg, + RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); + } + +out: + if (netdev) + dev_put(netdev); + return ret; +} + +static int fill_res_info_entry(struct sk_buff *msg, + const char *name, u64 curr) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name)) + goto err; + if (nla_put_u64_64bit(msg, + 
RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, 0)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_res_info(struct sk_buff *msg, struct ib_device *device) +{ + static const char * const names[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_PD] = "pd", + [RDMA_RESTRACK_CQ] = "cq", + [RDMA_RESTRACK_QP] = "qp", + [RDMA_RESTRACK_CM_ID] = "cm_id", + [RDMA_RESTRACK_MR] = "mr", + }; + + struct rdma_restrack_root *res = &device->res; + struct nlattr *table_attr; + int ret, i, curr; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); + if (!table_attr) + return -EMSGSIZE; + + for (i = 0; i < RDMA_RESTRACK_MAX; i++) { + if (!names[i]) + continue; + curr = rdma_restrack_count(res, i, task_active_pid_ns(current)); + ret = fill_res_info_entry(msg, names[i], curr); + if (ret) + goto err; + } + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return ret; +} + +static int fill_res_name_pid(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + /* + * For user resources, user is should read /proc/PID/comm to get the + * name of the task file. + */ + if (rdma_is_kernel_res(res)) { + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, + res->kern_name)) + return -EMSGSIZE; + } else { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, + task_pid_vnr(res->task))) + return -EMSGSIZE; + } + return 0; +} + +static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_qp_init_attr qp_init_attr; + struct nlattr *entry_attr; + struct ib_qp_attr qp_attr; + int ret; + + ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr); + if (ret) + return ret; + + if (port && port != qp_attr.port_num) + return 0; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + goto out; + + /* In create_qp() port is not set yet */ + if (qp_attr.port_num && + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num)) + goto err; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num)) + goto err; + if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, + qp_attr.dest_qp_num)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN, + qp_attr.rq_psn)) + goto err; + } + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn)) + goto err; + + if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT) { + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, + qp_attr.path_mig_state)) + goto err; + } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type)) + goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) + goto err; + + if (fill_res_name_pid(msg, res)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + +static int fill_res_cm_id_entry(struct sk_buff *msg, + struct netlink_callback *cb, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct rdma_id_private *id_priv = + container_of(res, struct rdma_id_private, res); + struct rdma_cm_id *cm_id = &id_priv->id; + struct nlattr *entry_attr; + + if (port && port != cm_id->port_num) + return 0; + + entry_attr = nla_nest_start(msg, 
RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY); + if (!entry_attr) + goto out; + + if (cm_id->port_num && + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) + goto err; + + if (id_priv->qp_num) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv->qp_num)) + goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type)) + goto err; + } + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps)) + goto err; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state)) + goto err; + + if (cm_id->route.addr.src_addr.ss_family && + nla_put(msg, RDMA_NLDEV_ATTR_RES_SRC_ADDR, + sizeof(cm_id->route.addr.src_addr), + &cm_id->route.addr.src_addr)) + goto err; + if (cm_id->route.addr.dst_addr.ss_family && + nla_put(msg, RDMA_NLDEV_ATTR_RES_DST_ADDR, + sizeof(cm_id->route.addr.dst_addr), + &cm_id->route.addr.dst_addr)) + goto err; + + if (fill_res_name_pid(msg, res)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + +static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_cq *cq = container_of(res, struct ib_cq, res); + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY); + if (!entry_attr) + goto out; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, + atomic_read(&cq->usecnt), 0)) + goto err; + + /* Poll context is only valid for kernel CQs */ + if (rdma_is_kernel_res(res) && + nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) + goto err; + + if (fill_res_name_pid(msg, res)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + +static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY); + if (!entry_attr) + goto out; + + if (netlink_capable(cb->skb, CAP_NET_ADMIN)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_IOVA, + mr->iova, 0)) + goto err; + } + + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, 0)) + goto err; + + if (fill_res_name_pid(msg, res)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + +static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_pd *pd = container_of(res, struct ib_pd, res); + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY); + if (!entry_attr) + goto out; + + if (netlink_capable(cb->skb, CAP_NET_ADMIN)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, + pd->local_dma_lkey)) + goto err; + if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, + pd->unsafe_global_rkey)) + goto err; + } + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, + atomic_read(&pd->usecnt), 0)) + goto err; + if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, + 
pd->unsafe_global_rkey)) + goto err; + + if (fill_res_name_pid(msg, res)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + +static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, 0); + + err = fill_dev_info(msg, device); + if (err) + goto err_free; + + nlmsg_end(msg, nlh); + + put_device(&device->dev); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + put_device(&device->dev); + return err; +} + +static int _nldev_get_dumpit(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx) +{ + int start = cb->args[0]; + struct nlmsghdr *nlh; + + if (idx < start) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, NLM_F_MULTI); + + if (fill_dev_info(skb, device)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + nlmsg_end(skb, nlh); + + idx++; + +out: cb->args[0] = idx; + return skb->len; +} + +static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + /* + * There is no need to take lock, because + * we are relying on ib_core's lists_rwsem + */ + return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); +} + +static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + u32 port; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + err = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, 0); + + err = fill_port_info(msg, device, port, sock_net(skb->sk)); + if (err) + goto err_free; + + nlmsg_end(msg, nlh); + put_device(&device->dev); + + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + put_device(&device->dev); + return err; +} + +static int nldev_port_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + int start = cb->args[0]; + struct nlmsghdr *nlh; + u32 idx = 0; + u32 ifindex; + int err; + u32 p; + + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + if (err || 
!tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(ifindex); + if (!device) + return -EINVAL; + + for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + /* + * The dumpit function returns all information from specific + * index. This specific index is taken from the netlink + * messages request sent by user and it is available + * in cb->args[0]. + * + * Usually, the user doesn't fill this field and it causes + * to return everything. + * + */ + if (idx < start) { + idx++; + continue; + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_PORT_GET), + 0, NLM_F_MULTI); + + if (fill_port_info(skb, device, p, sock_net(skb->sk))) { + nlmsg_cancel(skb, nlh); + goto out; + } + idx++; + nlmsg_end(skb, nlh); + } + +out: + put_device(&device->dev); + cb->args[0] = idx; + return skb->len; +} + +static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), + 0, 0); + + ret = fill_res_info(msg, device); + if (ret) + goto err_free; + + nlmsg_end(msg, nlh); + put_device(&device->dev); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + put_device(&device->dev); + return ret; +} + +static int _nldev_res_get_dumpit(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx) +{ + int start = cb->args[0]; + struct nlmsghdr *nlh; + + if (idx < start) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), + 0, NLM_F_MULTI); + + if (fill_res_info(skb, device)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + nlmsg_end(skb, nlh); + + idx++; + +out: + cb->args[0] = idx; + return skb->len; +} + +static int nldev_res_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb); +} + +struct nldev_fill_res_entry { + int (*fill_res_func)(struct sk_buff *msg, struct netlink_callback *cb, + struct rdma_restrack_entry *res, u32 port); + enum rdma_nldev_attr nldev_attr; + enum rdma_nldev_command nldev_cmd; +}; + +static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_QP] = { + .fill_res_func = fill_res_qp_entry, + .nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET, + .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, + }, + [RDMA_RESTRACK_CM_ID] = { + .fill_res_func = fill_res_cm_id_entry, + .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, + .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, + }, + [RDMA_RESTRACK_CQ] = { + .fill_res_func = fill_res_cq_entry, + .nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET, + .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, + }, + [RDMA_RESTRACK_MR] = { + .fill_res_func = fill_res_mr_entry, + .nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET, + .nldev_attr 
= RDMA_NLDEV_ATTR_RES_MR, + }, + [RDMA_RESTRACK_PD] = { + .fill_res_func = fill_res_pd_entry, + .nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET, + .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, + }, +}; + +static int res_get_common_dumpit(struct sk_buff *skb, + struct netlink_callback *cb, + enum rdma_restrack_type res_type) +{ + const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct rdma_restrack_entry *res; + int err, ret = 0, idx = 0; + struct nlattr *table_attr; + struct ib_device *device; + int start = cb->args[0]; + struct nlmsghdr *nlh; + u32 index, port = 0; + bool filled = false; + + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + /* + * Right now, we are expecting the device index to get res information, + * but it is possible to extend this code to return all devices in + * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. + * If it doesn't exist, we will iterate over all devices. + * + * But it is not needed for now. + */ + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + /* + * If no PORT_INDEX is supplied, we will return all resources from that device + */ + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err_index; + } + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), + 0, NLM_F_MULTI); + + if (fill_nldev_handle(skb, device)) { + ret = -EMSGSIZE; + goto err; + } + + table_attr = nla_nest_start(skb, fe->nldev_attr); + if (!table_attr) { + ret = -EMSGSIZE; + goto err; + } + + down_read(&device->res.rwsem); + hash_for_each_possible(device->res.hash, res, node, res_type) { + if (idx < start) + goto next; + + if ((rdma_is_kernel_res(res) && + task_active_pid_ns(current) != &init_pid_ns) || + (!rdma_is_kernel_res(res) && task_active_pid_ns(current) != + task_active_pid_ns(res->task))) + /* + * 1. Kernel resources should be visible in the init + * namespace only + * 2. Present only resources visible in the current + * namespace + */ + goto next; + + if (!rdma_restrack_get(res)) + /* + * The resource is under release now, but we are not + * releasing the lock now, so it will be released in + * our next pass, once we get the ->next pointer. + */ + goto next; + + filled = true; + + up_read(&device->res.rwsem); + ret = fe->fill_res_func(skb, cb, res, port); + down_read(&device->res.rwsem); + /* + * Put the resource back, but it won't be released until + * &device->res.rwsem is released for write. + */ + rdma_restrack_put(res); + + if (ret == -EMSGSIZE) + /* + * There is a chance to optimize here. + * It can be done by using list_prepare_entry + * and list_for_each_entry_continue afterwards. + */ + break; + if (ret) + goto res_err; +next: idx++; + } + up_read(&device->res.rwsem); + + nla_nest_end(skb, table_attr); + nlmsg_end(skb, nlh); + cb->args[0] = idx; + + /* + * No more entries to fill, cancel the message and + * return 0 to mark end of dumpit.
+ */ + if (!filled) + goto err; + + put_device(&device->dev); + return skb->len; + +res_err: + nla_nest_cancel(skb, table_attr); + up_read(&device->res.rwsem); + +err: + nlmsg_cancel(skb, nlh); + +err_index: + put_device(&device->dev); + return ret; +} + +static int nldev_res_get_qp_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_QP); +} + +static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CM_ID); +} + +static int nldev_res_get_cq_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CQ); +} + +static int nldev_res_get_mr_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR); +} + +static int nldev_res_get_pd_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD); +} + +static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { + [RDMA_NLDEV_CMD_GET] = { + .doit = nldev_get_doit, + .dump = nldev_get_dumpit, + }, + [RDMA_NLDEV_CMD_PORT_GET] = { + .doit = nldev_port_get_doit, + .dump = nldev_port_get_dumpit, + }, + [RDMA_NLDEV_CMD_RES_GET] = { + .doit = nldev_res_get_doit, + .dump = nldev_res_get_dumpit, + }, + [RDMA_NLDEV_CMD_RES_QP_GET] = { + .dump = nldev_res_get_qp_dumpit, + /* + * .doit is not implemented yet for two reasons: + * 1. It is not needed yet. + * 2. There is a need to provide identifier, while it is easy + * for the QPs (device index + port index + LQPN), it is not + * the case for the rest of resources (PD and CQ). Because it + * is better to provide similar interface for all resources, + * let's wait till we will have other resources implemented + * too. + */ + }, + [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { + .dump = nldev_res_get_cm_id_dumpit, + }, + [RDMA_NLDEV_CMD_RES_CQ_GET] = { + .dump = nldev_res_get_cq_dumpit, + }, + [RDMA_NLDEV_CMD_RES_MR_GET] = { + .dump = nldev_res_get_mr_dumpit, + }, + [RDMA_NLDEV_CMD_RES_PD_GET] = { + .dump = nldev_res_get_pd_dumpit, + }, +}; + +void __init nldev_init(void) +{ + rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); +} + +void __exit nldev_exit(void) +{ + rdma_nl_unregister(RDMA_NL_NLDEV); +} + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5); diff --git a/drivers/infiniband/core/opa_smi.h b/drivers/infiniband/core/opa_smi.h new file mode 100644 index 0000000..3bfab35 --- /dev/null +++ b/drivers/infiniband/core/opa_smi.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
 + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __OPA_SMI_H_ +#define __OPA_SMI_H_ + +#include <rdma/ib_smi.h> +#include <rdma/opa_smi.h> + +#include "smi.h" + +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, + int port_num, int phys_port_cnt); +int opa_smi_get_fwd_port(struct opa_smp *smp); +extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp); +extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, + bool is_switch, int port_num); + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action opa_smi_check_local_smp(struct opa_smp *smp, + struct ib_device *device) +{ + /* C14-9:3 -- We're at the end of the DR segment of path */ + /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */ + return (device->process_mad && + !opa_get_smp_direction(smp) && + (smp->hop_ptr == smp->hop_cnt + 1)) ? + IB_SMI_HANDLE : IB_SMI_DISCARD; +} + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action opa_smi_check_local_returning_smp(struct opa_smp *smp, + struct ib_device *device) +{ + /* C14-13:3 -- We're at the end of the DR segment of path */ + /* C14-13:4 -- Hop Pointer == 0 -> give to SM */ + return (device->process_mad && + opa_get_smp_direction(smp) && + !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD; +} + +#endif /* __OPA_SMI_H_ */ diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c new file mode 100644 index 0000000..19b1ee3 --- /dev/null +++ b/drivers/infiniband/core/packer.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/export.h> +#include <linux/string.h> + +#include <rdma/ib_pack.h> + +static u64 value_read(int offset, int size, void *structure) +{ + switch (size) { + case 1: return *(u8 *) (structure + offset); + case 2: return be16_to_cpup((__be16 *) (structure + offset)); + case 4: return be32_to_cpup((__be32 *) (structure + offset)); + case 8: return be64_to_cpup((__be64 *) (structure + offset)); + default: + pr_warn("Field size %d bits not handled\n", size * 8); + return 0; + } +} + +/** + * ib_pack - Pack a structure into a buffer + * @desc:Array of structure field descriptions + * @desc_len:Number of entries in @desc + * @structure:Structure to pack from + * @buf:Buffer to pack into + * + * ib_pack() packs a list of structure fields into a buffer, + * controlled by the array of fields in @desc. + */ +void ib_pack(const struct ib_field *desc, + int desc_len, + void *structure, + void *buf) +{ + int i; + + for (i = 0; i < desc_len; ++i) { + if (desc[i].size_bits <= 32) { + int shift; + u32 val; + __be32 mask; + __be32 *addr; + + shift = 32 - desc[i].offset_bits - desc[i].size_bits; + if (desc[i].struct_size_bytes) + val = value_read(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + structure) << shift; + else + val = 0; + + mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift); + addr = (__be32 *) buf + desc[i].offset_words; + *addr = (*addr & ~mask) | (cpu_to_be32(val) & mask); + } else if (desc[i].size_bits <= 64) { + int shift; + u64 val; + __be64 mask; + __be64 *addr; + + shift = 64 - desc[i].offset_bits - desc[i].size_bits; + if (desc[i].struct_size_bytes) + val = value_read(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + structure) << shift; + else + val = 0; + + mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift); + addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words); + *addr = (*addr & ~mask) | (cpu_to_be64(val) & mask); + } else { + if (desc[i].offset_bits % 8 || + desc[i].size_bits % 8) { + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); + } + + if (desc[i].struct_size_bytes) + memcpy(buf + desc[i].offset_words * 4 + + desc[i].offset_bits / 8, + structure + desc[i].struct_offset_bytes, + desc[i].size_bits / 8); + else + memset(buf + desc[i].offset_words * 4 + + desc[i].offset_bits / 8, + 0, + desc[i].size_bits / 8); + } + } +} +EXPORT_SYMBOL(ib_pack); + +static void value_write(int offset, int size, u64 val, void *structure) +{ + switch (size * 8) { + case 8: *( u8 *) (structure + offset) = val; break; + case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break; + case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break; + case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break; + default: + pr_warn("Field size %d bits not handled\n", size * 8); + } +} + +/** + * ib_unpack - Unpack a buffer into a structure + * @desc:Array of structure field descriptions + * @desc_len:Number of entries in @desc + * @buf:Buffer to unpack from + * @structure:Structure to unpack into + * + * ib_unpack() unpacks a list of structure fields from a buffer, + * controlled by the array of fields in @desc.
+ */ +void ib_unpack(const struct ib_field *desc, + int desc_len, + void *buf, + void *structure) +{ + int i; + + for (i = 0; i < desc_len; ++i) { + if (!desc[i].struct_size_bytes) + continue; + + if (desc[i].size_bits <= 32) { + int shift; + u32 val; + u32 mask; + __be32 *addr; + + shift = 32 - desc[i].offset_bits - desc[i].size_bits; + mask = ((1ull << desc[i].size_bits) - 1) << shift; + addr = (__be32 *) buf + desc[i].offset_words; + val = (be32_to_cpup(addr) & mask) >> shift; + value_write(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + val, + structure); + } else if (desc[i].size_bits <= 64) { + int shift; + u64 val; + u64 mask; + __be64 *addr; + + shift = 64 - desc[i].offset_bits - desc[i].size_bits; + mask = (~0ull >> (64 - desc[i].size_bits)) << shift; + addr = (__be64 *) buf + desc[i].offset_words; + val = (be64_to_cpup(addr) & mask) >> shift; + value_write(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + val, + structure); + } else { + if (desc[i].offset_bits % 8 || + desc[i].size_bits % 8) { + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); + } + + memcpy(structure + desc[i].struct_offset_bytes, + buf + desc[i].offset_words * 4 + + desc[i].offset_bits / 8, + desc[i].size_bits / 8); + } + } +} +EXPORT_SYMBOL(ib_unpack); diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c new file mode 100644 index 0000000..a6e9049 --- /dev/null +++ b/drivers/infiniband/core/rdma_core.c @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2016, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "uverbs.h" +#include "core_priv.h" +#include "rdma_core.h" + +int uverbs_ns_idx(u16 *id, unsigned int ns_count) +{ + int ret = (*id & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT; + + if (ret >= ns_count) + return -EINVAL; + + *id &= ~UVERBS_ID_NS_MASK; + return ret; +} + +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object) +{ + const struct uverbs_root_spec *object_hash = ibdev->specs_root; + const struct uverbs_object_spec_hash *objects; + int ret = uverbs_ns_idx(&object, object_hash->num_buckets); + + if (ret < 0) + return NULL; + + objects = object_hash->object_buckets[ret]; + + if (object >= objects->num_objects) + return NULL; + + return objects->objects[object]; +} + +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method) +{ + const struct uverbs_method_spec_hash *methods; + int ret = uverbs_ns_idx(&method, object->num_buckets); + + if (ret < 0) + return NULL; + + methods = object->method_buckets[ret]; + if (method >= methods->num_methods) + return NULL; + + return methods->methods[method]; +} + +void uverbs_uobject_get(struct ib_uobject *uobject) +{ + kref_get(&uobject->ref); +} + +static void uverbs_uobject_free(struct kref *ref) +{ + struct ib_uobject *uobj = + container_of(ref, struct ib_uobject, ref); + + if (uobj->type->type_class->needs_kfree_rcu) + kfree_rcu(uobj, rcu); + else + kfree(uobj); +} + +void uverbs_uobject_put(struct ib_uobject *uobject) +{ + kref_put(&uobject->ref, uverbs_uobject_free); +} + +static int uverbs_try_lock_object(struct ib_uobject *uobj, bool exclusive) +{ + /* + * When a shared access is required, we use a positive counter. Each + * shared access request checks that the value != -1 and increment it. + * Exclusive access is required for operations like write or destroy. + * In exclusive access mode, we check that the counter is zero (nobody + * claimed this object) and we set it to -1. Releasing a shared access + * lock is done simply by decreasing the counter. As for exclusive + * access locks, since only a single one of them is is allowed + * concurrently, setting the counter to zero is enough for releasing + * this lock. + */ + if (!exclusive) + return __atomic_add_unless(&uobj->usecnt, 1, -1) == -1 ? + -EBUSY : 0; + + /* lock is either WRITE or DESTROY - should be exclusive */ + return atomic_cmpxchg(&uobj->usecnt, 0, -1) == 0 ? 0 : -EBUSY; +} + +static struct ib_uobject *alloc_uobj(struct ib_ucontext *context, + const struct uverbs_obj_type *type) +{ + struct ib_uobject *uobj = kzalloc(type->obj_size, GFP_KERNEL); + + if (!uobj) + return ERR_PTR(-ENOMEM); + /* + * user_handle should be filled by the handler, + * The object is added to the list in the commit stage. + */ + uobj->context = context; + uobj->type = type; + /* + * Allocated objects start out as write locked to deny any other + * syscalls from accessing them until they are committed. See + * rdma_alloc_commit_uobject + */ + atomic_set(&uobj->usecnt, -1); + kref_init(&uobj->ref); + + return uobj; +} + +static int idr_add_uobj(struct ib_uobject *uobj) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock(&uobj->context->ufile->idr_lock); + + /* + * We start with allocating an idr pointing to NULL. This represents an + * object which isn't initialized yet. We'll replace it later on with + * the real object once we commit. 
+ */ + ret = idr_alloc(&uobj->context->ufile->idr, NULL, 0, + min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT); + if (ret >= 0) + uobj->id = ret; + + spin_unlock(&uobj->context->ufile->idr_lock); + idr_preload_end(); + + return ret < 0 ? ret : 0; +} + +/* + * It only removes the uobject from the idr, uverbs_uobject_put() is still + * required. + */ +static void uverbs_idr_remove_uobj(struct ib_uobject *uobj) +{ + spin_lock(&uobj->context->ufile->idr_lock); + idr_remove(&uobj->context->ufile->idr, uobj->id); + spin_unlock(&uobj->context->ufile->idr_lock); +} + +/* Returns the ib_uobject or an error. The caller should check for IS_ERR. */ +static struct ib_uobject *lookup_get_idr_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext, + int id, bool exclusive) +{ + struct ib_uobject *uobj; + + rcu_read_lock(); + /* object won't be released as we're protected in rcu */ + uobj = idr_find(&ucontext->ufile->idr, id); + if (!uobj) { + uobj = ERR_PTR(-ENOENT); + goto free; + } + + /* + * The idr_find is guaranteed to return a pointer to something that + * isn't freed yet, or NULL, as the free after idr_remove goes through + * kfree_rcu(). However the object may still have been released and + * kfree() could be called at any time. + */ + if (!kref_get_unless_zero(&uobj->ref)) + uobj = ERR_PTR(-ENOENT); + +free: + rcu_read_unlock(); + return uobj; +} + +static struct ib_uobject *lookup_get_fd_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext, + int id, bool exclusive) +{ + struct file *f; + struct ib_uobject *uobject; + const struct uverbs_obj_fd_type *fd_type = + container_of(type, struct uverbs_obj_fd_type, type); + + if (exclusive) + return ERR_PTR(-EOPNOTSUPP); + + f = fget(id); + if (!f) + return ERR_PTR(-EBADF); + + uobject = f->private_data; + /* + * fget(id) ensures we are not currently running uverbs_close_fd, + * and the caller is expected to ensure that uverbs_close_fd is never + * done while a call to lookup is possible.
+ */ + if (f->f_op != fd_type->fops) { + fput(f); + return ERR_PTR(-EBADF); + } + + uverbs_uobject_get(uobject); + return uobject; +} + +struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext, + int id, bool exclusive) +{ + struct ib_uobject *uobj; + int ret; + + uobj = type->type_class->lookup_get(type, ucontext, id, exclusive); + if (IS_ERR(uobj)) + return uobj; + + if (uobj->type != type) { + ret = -EINVAL; + goto free; + } + + ret = uverbs_try_lock_object(uobj, exclusive); + if (ret) { + WARN(ucontext->cleanup_reason, + "ib_uverbs: Trying to lookup_get while cleanup context\n"); + goto free; + } + + return uobj; +free: + uobj->type->type_class->lookup_put(uobj, exclusive); + uverbs_uobject_put(uobj); + return ERR_PTR(ret); +} + +static struct ib_uobject *alloc_begin_idr_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext) +{ + int ret; + struct ib_uobject *uobj; + + uobj = alloc_uobj(ucontext, type); + if (IS_ERR(uobj)) + return uobj; + + ret = idr_add_uobj(uobj); + if (ret) + goto uobj_put; + + ret = ib_rdmacg_try_charge(&uobj->cg_obj, ucontext->device, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) + goto idr_remove; + + return uobj; + +idr_remove: + uverbs_idr_remove_uobj(uobj); +uobj_put: + uverbs_uobject_put(uobj); + return ERR_PTR(ret); +} + +static struct ib_uobject *alloc_begin_fd_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext) +{ + const struct uverbs_obj_fd_type *fd_type = + container_of(type, struct uverbs_obj_fd_type, type); + int new_fd; + struct ib_uobject *uobj; + struct ib_uobject_file *uobj_file; + struct file *filp; + + new_fd = get_unused_fd_flags(O_CLOEXEC); + if (new_fd < 0) + return ERR_PTR(new_fd); + + uobj = alloc_uobj(ucontext, type); + if (IS_ERR(uobj)) { + put_unused_fd(new_fd); + return uobj; + } + + uobj_file = container_of(uobj, struct ib_uobject_file, uobj); + filp = anon_inode_getfile(fd_type->name, + fd_type->fops, + uobj_file, + fd_type->flags); + if (IS_ERR(filp)) { + put_unused_fd(new_fd); + uverbs_uobject_put(uobj); + return (void *)filp; + } + + uobj_file->uobj.id = new_fd; + uobj_file->uobj.object = filp; + uobj_file->ufile = ucontext->ufile; + INIT_LIST_HEAD(&uobj->list); + kref_get(&uobj_file->ufile->ref); + + return uobj; +} + +struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext) +{ + return type->type_class->alloc_begin(type, ucontext); +} + +static int __must_check remove_commit_idr_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + const struct uverbs_obj_idr_type *idr_type = + container_of(uobj->type, struct uverbs_obj_idr_type, + type); + int ret = idr_type->destroy_object(uobj, why); + + /* + * We can only fail gracefully if the user requested to destroy the + * object. In the rest of the cases, just remove whatever you can. 
+ */ + if (why == RDMA_REMOVE_DESTROY && ret) + return ret; + + ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); + uverbs_idr_remove_uobj(uobj); + + return ret; +} + +static void alloc_abort_fd_uobject(struct ib_uobject *uobj) +{ + struct ib_uobject_file *uobj_file = + container_of(uobj, struct ib_uobject_file, uobj); + struct file *filp = uobj->object; + int id = uobj_file->uobj.id; + + /* Unsuccessful NEW */ + fput(filp); + put_unused_fd(id); +} + +static int __must_check remove_commit_fd_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + const struct uverbs_obj_fd_type *fd_type = + container_of(uobj->type, struct uverbs_obj_fd_type, type); + struct ib_uobject_file *uobj_file = + container_of(uobj, struct ib_uobject_file, uobj); + int ret = fd_type->context_closed(uobj_file, why); + + if (why == RDMA_REMOVE_DESTROY && ret) + return ret; + + if (why == RDMA_REMOVE_DURING_CLEANUP) { + alloc_abort_fd_uobject(uobj); + return ret; + } + + uobj_file->uobj.context = NULL; + return ret; +} + +static void assert_uverbs_usecnt(struct ib_uobject *uobj, bool exclusive) +{ +#ifdef CONFIG_LOCKDEP + if (exclusive) + WARN_ON(atomic_read(&uobj->usecnt) != -1); + else + WARN_ON(atomic_read(&uobj->usecnt) <= 0); +#endif +} + +static int __must_check _rdma_remove_commit_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + int ret; + struct ib_ucontext *ucontext = uobj->context; + + ret = uobj->type->type_class->remove_commit(uobj, why); + if (ret && why == RDMA_REMOVE_DESTROY) { + /* We couldn't remove the object, so just unlock the uobject */ + atomic_set(&uobj->usecnt, 0); + uobj->type->type_class->lookup_put(uobj, true); + } else { + mutex_lock(&ucontext->uobjects_lock); + list_del(&uobj->list); + mutex_unlock(&ucontext->uobjects_lock); + /* put the ref we took when we created the object */ + uverbs_uobject_put(uobj); + } + + return ret; +} + +/* This is called only for user requested DESTROY reasons */ +int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj) +{ + int ret; + struct ib_ucontext *ucontext = uobj->context; + + /* put the ref count we took at lookup_get */ + uverbs_uobject_put(uobj); + /* Cleanup is running. Calling this should have been impossible */ + if (!down_read_trylock(&ucontext->cleanup_rwsem)) { + WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n"); + return 0; + } + assert_uverbs_usecnt(uobj, true); + ret = _rdma_remove_commit_uobject(uobj, RDMA_REMOVE_DESTROY); + + up_read(&ucontext->cleanup_rwsem); + return ret; +} + +static int null_obj_type_class_remove_commit(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + return 0; +} + +static const struct uverbs_obj_type null_obj_type = { + .type_class = &((const struct uverbs_obj_type_class){ + .remove_commit = null_obj_type_class_remove_commit, + /* be cautious */ + .needs_kfree_rcu = true}), +}; + +int rdma_explicit_destroy(struct ib_uobject *uobject) +{ + int ret; + struct ib_ucontext *ucontext = uobject->context; + + /* Cleanup is running. 
Calling this should have been impossible */ + if (!down_read_trylock(&ucontext->cleanup_rwsem)) { + WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n"); + return 0; + } + assert_uverbs_usecnt(uobject, true); + ret = uobject->type->type_class->remove_commit(uobject, + RDMA_REMOVE_DESTROY); + if (ret) + goto out; + + uobject->type = &null_obj_type; + +out: + up_read(&ucontext->cleanup_rwsem); + return ret; +} + +static void alloc_commit_idr_uobject(struct ib_uobject *uobj) +{ + spin_lock(&uobj->context->ufile->idr_lock); + /* + * We already allocated this IDR with a NULL object, so + * this shouldn't fail. + */ + WARN_ON(idr_replace(&uobj->context->ufile->idr, + uobj, uobj->id)); + spin_unlock(&uobj->context->ufile->idr_lock); +} + +static void alloc_commit_fd_uobject(struct ib_uobject *uobj) +{ + struct ib_uobject_file *uobj_file = + container_of(uobj, struct ib_uobject_file, uobj); + + fd_install(uobj_file->uobj.id, uobj->object); + /* This shouldn't be used anymore. Use the file object instead */ + uobj_file->uobj.id = 0; + /* Get another reference as we export this to the fops */ + uverbs_uobject_get(&uobj_file->uobj); +} + +int rdma_alloc_commit_uobject(struct ib_uobject *uobj) +{ + /* Cleanup is running. Calling this should have been impossible */ + if (!down_read_trylock(&uobj->context->cleanup_rwsem)) { + int ret; + + WARN(true, "ib_uverbs: Cleanup is running while allocating an uobject\n"); + ret = uobj->type->type_class->remove_commit(uobj, + RDMA_REMOVE_DURING_CLEANUP); + if (ret) + pr_warn("ib_uverbs: cleanup of idr object %d failed\n", + uobj->id); + return ret; + } + + /* matches atomic_set(-1) in alloc_uobj */ + assert_uverbs_usecnt(uobj, true); + atomic_set(&uobj->usecnt, 0); + + mutex_lock(&uobj->context->uobjects_lock); + list_add(&uobj->list, &uobj->context->uobjects); + mutex_unlock(&uobj->context->uobjects_lock); + + uobj->type->type_class->alloc_commit(uobj); + up_read(&uobj->context->cleanup_rwsem); + + return 0; +} + +static void alloc_abort_idr_uobject(struct ib_uobject *uobj) +{ + uverbs_idr_remove_uobj(uobj); + ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); + uverbs_uobject_put(uobj); +} + +void rdma_alloc_abort_uobject(struct ib_uobject *uobj) +{ + uobj->type->type_class->alloc_abort(uobj); +} + +static void lookup_put_idr_uobject(struct ib_uobject *uobj, bool exclusive) +{ +} + +static void lookup_put_fd_uobject(struct ib_uobject *uobj, bool exclusive) +{ + struct file *filp = uobj->object; + + WARN_ON(exclusive); + /* This indirectly calls uverbs_close_fd and free the object */ + fput(filp); +} + +void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool exclusive) +{ + assert_uverbs_usecnt(uobj, exclusive); + uobj->type->type_class->lookup_put(uobj, exclusive); + /* + * In order to unlock an object, either decrease its usecnt for + * read access or zero it in case of exclusive access. See + * uverbs_try_lock_object for locking schema information. + */ + if (!exclusive) + atomic_dec(&uobj->usecnt); + else + atomic_set(&uobj->usecnt, 0); + + uverbs_uobject_put(uobj); +} + +const struct uverbs_obj_type_class uverbs_idr_class = { + .alloc_begin = alloc_begin_idr_uobject, + .lookup_get = lookup_get_idr_uobject, + .alloc_commit = alloc_commit_idr_uobject, + .alloc_abort = alloc_abort_idr_uobject, + .lookup_put = lookup_put_idr_uobject, + .remove_commit = remove_commit_idr_uobject, + /* + * When we destroy an object, we first just lock it for WRITE and + * actually DESTROY it in the finalize stage. 
So, the problematic + * scenario is when we just started the finalize stage of the + * destruction (nothing was executed yet). Now, the other thread + * fetched the object for READ access, but it didn't lock it yet. + * The DESTROY thread continues and starts destroying the object. + * When the other thread continue - without the RCU, it would + * access freed memory. However, the rcu_read_lock delays the free + * until the rcu_read_lock of the READ operation quits. Since the + * exclusive lock of the object is still taken by the DESTROY flow, the + * READ operation will get -EBUSY and it'll just bail out. + */ + .needs_kfree_rcu = true, +}; + +static void _uverbs_close_fd(struct ib_uobject_file *uobj_file) +{ + struct ib_ucontext *ucontext; + struct ib_uverbs_file *ufile = uobj_file->ufile; + int ret; + + mutex_lock(&uobj_file->ufile->cleanup_mutex); + + /* uobject was either already cleaned up or is cleaned up right now anyway */ + if (!uobj_file->uobj.context || + !down_read_trylock(&uobj_file->uobj.context->cleanup_rwsem)) + goto unlock; + + ucontext = uobj_file->uobj.context; + ret = _rdma_remove_commit_uobject(&uobj_file->uobj, RDMA_REMOVE_CLOSE); + up_read(&ucontext->cleanup_rwsem); + if (ret) + pr_warn("uverbs: unable to clean up uobject file in uverbs_close_fd.\n"); +unlock: + mutex_unlock(&ufile->cleanup_mutex); +} + +void uverbs_close_fd(struct file *f) +{ + struct ib_uobject_file *uobj_file = f->private_data; + struct kref *uverbs_file_ref = &uobj_file->ufile->ref; + + _uverbs_close_fd(uobj_file); + uverbs_uobject_put(&uobj_file->uobj); + kref_put(uverbs_file_ref, ib_uverbs_release_file); +} + +void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed) +{ + enum rdma_remove_reason reason = device_removed ? + RDMA_REMOVE_DRIVER_REMOVE : RDMA_REMOVE_CLOSE; + unsigned int cur_order = 0; + + ucontext->cleanup_reason = reason; + /* + * Waits for all remove_commit and alloc_commit to finish. Logically, We + * want to hold this forever as the context is going to be destroyed, + * but we'll release it since it causes a "held lock freed" BUG message. + */ + down_write(&ucontext->cleanup_rwsem); + + while (!list_empty(&ucontext->uobjects)) { + struct ib_uobject *obj, *next_obj; + unsigned int next_order = UINT_MAX; + + /* + * This shouldn't run while executing other commands on this + * context. Thus, the only thing we should take care of is + * releasing a FD while traversing this list. The FD could be + * closed and released from the _release fop of this FD. + * In order to mitigate this, we add a lock. + * We take and release the lock per order traversal in order + * to let other threads (which might still use the FDs) chance + * to run. + */ + mutex_lock(&ucontext->uobjects_lock); + list_for_each_entry_safe(obj, next_obj, &ucontext->uobjects, + list) { + if (obj->type->destroy_order == cur_order) { + int ret; + + /* + * if we hit this WARN_ON, that means we are + * racing with a lookup_get. 
+ */ + WARN_ON(uverbs_try_lock_object(obj, true)); + ret = obj->type->type_class->remove_commit(obj, + reason); + list_del(&obj->list); + if (ret) + pr_warn("ib_uverbs: failed to remove uobject id %d order %u\n", + obj->id, cur_order); + /* put the ref we took when we created the object */ + uverbs_uobject_put(obj); + } else { + next_order = min(next_order, + obj->type->destroy_order); + } + } + mutex_unlock(&ucontext->uobjects_lock); + cur_order = next_order; + } + up_write(&ucontext->cleanup_rwsem); +} + +void uverbs_initialize_ucontext(struct ib_ucontext *ucontext) +{ + ucontext->cleanup_reason = 0; + mutex_init(&ucontext->uobjects_lock); + INIT_LIST_HEAD(&ucontext->uobjects); + init_rwsem(&ucontext->cleanup_rwsem); +} + +const struct uverbs_obj_type_class uverbs_fd_class = { + .alloc_begin = alloc_begin_fd_uobject, + .lookup_get = lookup_get_fd_uobject, + .alloc_commit = alloc_commit_fd_uobject, + .alloc_abort = alloc_abort_fd_uobject, + .lookup_put = lookup_put_fd_uobject, + .remove_commit = remove_commit_fd_uobject, + .needs_kfree_rcu = false, +}; + +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id) +{ + switch (access) { + case UVERBS_ACCESS_READ: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, false); + case UVERBS_ACCESS_DESTROY: + case UVERBS_ACCESS_WRITE: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, true); + case UVERBS_ACCESS_NEW: + return rdma_alloc_begin_uobject(type_attrs, ucontext); + default: + WARN_ON(true); + return ERR_PTR(-EOPNOTSUPP); + } +} + +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit) +{ + int ret = 0; + + /* + * refcounts should be handled at the object level and not at the + * uobject level. Refcounts of the objects themselves are done in + * handlers. 
+ */ + + switch (access) { + case UVERBS_ACCESS_READ: + rdma_lookup_put_uobject(uobj, false); + break; + case UVERBS_ACCESS_WRITE: + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_DESTROY: + if (commit) + ret = rdma_remove_commit_uobject(uobj); + else + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_NEW: + if (commit) + ret = rdma_alloc_commit_uobject(uobj); + else + rdma_alloc_abort_uobject(uobj); + break; + default: + WARN_ON(true); + ret = -EOPNOTSUPP; + } + + return ret; +} + +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < num; i++) { + struct uverbs_attr_bundle_hash *curr_bundle = + &attrs_bundle->hash[i]; + const struct uverbs_attr_spec_hash *curr_spec_bucket = + spec_hash[i]; + unsigned int j; + + for (j = 0; j < curr_bundle->num_attrs; j++) { + struct uverbs_attr *attr; + const struct uverbs_attr_spec *spec; + + if (!uverbs_attr_is_valid_in_hash(curr_bundle, j)) + continue; + + attr = &curr_bundle->attrs[j]; + spec = &curr_spec_bucket->attrs[j]; + + if (spec->type == UVERBS_ATTR_TYPE_IDR || + spec->type == UVERBS_ATTR_TYPE_FD) { + int current_ret; + + current_ret = uverbs_finalize_object(attr->obj_attr.uobject, + spec->obj.access, + commit); + if (!ret) + ret = current_ret; + } + } + } + return ret; +} diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h new file mode 100644 index 0000000..1efcf93 --- /dev/null +++ b/drivers/infiniband/core/rdma_core.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005-2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef RDMA_CORE_H +#define RDMA_CORE_H + +#include <linux/idr.h> +#include <rdma/uverbs_types.h> +#include <rdma/uverbs_ioctl.h> +#include <rdma/ib_verbs.h> +#include <linux/mutex.h> + +int uverbs_ns_idx(u16 *id, unsigned int ns_count); +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object); +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method); +/* + * These functions initialize the context and clean up its uobjects. + * The context has a list of objects which is protected by a mutex + * on the context. initialize_ucontext should be called when we create + * a context. + * cleanup_ucontext removes all uobjects from the context and puts them. + */ +void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed); +void uverbs_initialize_ucontext(struct ib_ucontext *ucontext); + +/* + * uverbs_uobject_get is called in order to increase the reference count on + * an uobject. This is useful when a handler wants to keep the uobject's memory + * alive, regardless of whether this uobject is still alive in the context's + * objects repository. Objects are put via uverbs_uobject_put. + */ +void uverbs_uobject_get(struct ib_uobject *uobject); + +/* + * In order to indicate we no longer need this uobject, uverbs_uobject_put + * is called. When the reference count is decreased, the uobject is freed. + * For example, this is used when attaching a completion channel to a CQ. + */ +void uverbs_uobject_put(struct ib_uobject *uobject); + +/* Indicate this fd is no longer used by this consumer, but its memory isn't + * necessarily released yet. When the last reference is put, we release the + * memory. After this call is executed, calling uverbs_uobject_get isn't + * allowed. + * This must be called from the release file_operations of the file! + */ +void uverbs_close_fd(struct file *f); + +/* + * Get an ib_uobject that corresponds to the given id from ucontext, assuming + * the object is from the given type. Lock it to the required access when + * applicable. + * This function could create (access == NEW), destroy (access == DESTROY) + * or unlock (access == READ || access == WRITE) objects if required. + * The action will be finalized only when uverbs_finalize_object or + * uverbs_finalize_objects are called. + */ +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id); +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit); +/* + * Note that certain finalize stages could return a status: + * (a) alloc_commit could return a failure if the object is committed at the + * same time when the context is destroyed. + * (b) remove_commit could fail if the object wasn't destroyed successfully. + * Since multiple objects could be finalized in one transaction, it is strongly + * not recommended to have several finalize actions which have side effects. + * For example, it's not recommended to have an action which has both + * a commit action and a destroy action, or two destroy actions, in the same + * transaction. The rule of thumb is to have one destroy or commit action with + * multiple lookups. + * The first non-zero return value of finalize_object is returned from this + * function. For example, this could happen when we couldn't destroy an + * object.
+ */ +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit); + +#endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c new file mode 100644 index 0000000..efddd13 --- /dev/null +++ b/drivers/infiniband/core/restrack.c @@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +#include "cma_priv.h" + +void rdma_restrack_init(struct rdma_restrack_root *res) +{ + init_rwsem(&res->rwsem); +} + +static const char *type2str(enum rdma_restrack_type type) +{ + static const char * const names[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_PD] = "PD", + [RDMA_RESTRACK_CQ] = "CQ", + [RDMA_RESTRACK_QP] = "QP", + [RDMA_RESTRACK_CM_ID] = "CM_ID", + [RDMA_RESTRACK_MR] = "MR", + }; + + return names[type]; +}; + +void rdma_restrack_clean(struct rdma_restrack_root *res) +{ + struct rdma_restrack_entry *e; + char buf[TASK_COMM_LEN]; + struct ib_device *dev; + const char *owner; + int bkt; + + if (hash_empty(res->hash)) + return; + + dev = container_of(res, struct ib_device, res); + pr_err("restrack: %s", CUT_HERE); + pr_err("restrack: BUG: RESTRACK detected leak of resources on %s\n", + dev->name); + hash_for_each(res->hash, bkt, e, node) { + if (rdma_is_kernel_res(e)) { + owner = e->kern_name; + } else { + /* + * There is no need to call get_task_struct here, + * because we can be here only if there are more + * get_task_struct() call than put_task_struct(). + */ + get_task_comm(buf, e->task); + owner = buf; + } + + pr_err("restrack: %s %s object allocated by %s is not freed\n", + rdma_is_kernel_res(e) ? 
"Kernel" : "User", + type2str(e->type), owner); + } + pr_err("restrack: %s", CUT_HERE); +} + +int rdma_restrack_count(struct rdma_restrack_root *res, + enum rdma_restrack_type type, + struct pid_namespace *ns) +{ + struct rdma_restrack_entry *e; + u32 cnt = 0; + + down_read(&res->rwsem); + hash_for_each_possible(res->hash, e, node, type) { + if (ns == &init_pid_ns || + (!rdma_is_kernel_res(e) && + ns == task_active_pid_ns(e->task))) + cnt++; + } + up_read(&res->rwsem); + return cnt; +} +EXPORT_SYMBOL(rdma_restrack_count); + +static void set_kern_name(struct rdma_restrack_entry *res) +{ + struct ib_pd *pd; + + switch (res->type) { + case RDMA_RESTRACK_QP: + pd = container_of(res, struct ib_qp, res)->pd; + if (!pd) { + WARN_ONCE(true, "XRC QPs are not supported\n"); + /* Survive, despite the programmer's error */ + res->kern_name = " "; + } + break; + case RDMA_RESTRACK_MR: + pd = container_of(res, struct ib_mr, res)->pd; + break; + default: + /* Other types set kern_name directly */ + pd = NULL; + break; + } + + if (pd) + res->kern_name = pd->res.kern_name; +} + +static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) +{ + switch (res->type) { + case RDMA_RESTRACK_PD: + return container_of(res, struct ib_pd, res)->device; + case RDMA_RESTRACK_CQ: + return container_of(res, struct ib_cq, res)->device; + case RDMA_RESTRACK_QP: + return container_of(res, struct ib_qp, res)->device; + case RDMA_RESTRACK_CM_ID: + return container_of(res, struct rdma_id_private, + res)->id.device; + case RDMA_RESTRACK_MR: + return container_of(res, struct ib_mr, res)->device; + default: + WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); + return NULL; + } +} + +static bool res_is_user(struct rdma_restrack_entry *res) +{ + switch (res->type) { + case RDMA_RESTRACK_PD: + return container_of(res, struct ib_pd, res)->uobject; + case RDMA_RESTRACK_CQ: + return container_of(res, struct ib_cq, res)->uobject; + case RDMA_RESTRACK_QP: + return container_of(res, struct ib_qp, res)->uobject; + case RDMA_RESTRACK_CM_ID: + return !res->kern_name; + case RDMA_RESTRACK_MR: + return container_of(res, struct ib_mr, res)->pd->uobject; + default: + WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); + return false; + } +} + +void rdma_restrack_add(struct rdma_restrack_entry *res) +{ + struct ib_device *dev = res_to_dev(res); + + if (!dev) + return; + + if (res->type != RDMA_RESTRACK_CM_ID || !res_is_user(res)) + res->task = NULL; + + if (res_is_user(res)) { + if (!res->task) + rdma_restrack_set_task(res, current); + res->kern_name = NULL; + } else { + set_kern_name(res); + } + + kref_init(&res->kref); + init_completion(&res->comp); + res->valid = true; + + down_write(&dev->res.rwsem); + hash_add(dev->res.hash, &res->node, res->type); + up_write(&dev->res.rwsem); +} +EXPORT_SYMBOL(rdma_restrack_add); + +int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) +{ + return kref_get_unless_zero(&res->kref); +} +EXPORT_SYMBOL(rdma_restrack_get); + +static void restrack_release(struct kref *kref) +{ + struct rdma_restrack_entry *res; + + res = container_of(kref, struct rdma_restrack_entry, kref); + complete(&res->comp); +} + +int rdma_restrack_put(struct rdma_restrack_entry *res) +{ + return kref_put(&res->kref, restrack_release); +} +EXPORT_SYMBOL(rdma_restrack_put); + +void rdma_restrack_del(struct rdma_restrack_entry *res) +{ + struct ib_device *dev; + + if (!res->valid) + return; + + dev = res_to_dev(res); + if (!dev) + return; + + rdma_restrack_put(res); + + 
wait_for_completion(&res->comp); + + down_write(&dev->res.rwsem); + hash_del(&res->node); + res->valid = false; + if (res->task) + put_task_struct(res->task); + up_write(&dev->res.rwsem); +} +EXPORT_SYMBOL(rdma_restrack_del); diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c new file mode 100644 index 0000000..c0e4fd5 --- /dev/null +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -0,0 +1,783 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "core_priv.h" + +#include +#include + +/* For in6_dev_get/in6_dev_put */ +#include +#include + +#include +#include + +static struct workqueue_struct *gid_cache_wq; + +static struct workqueue_struct *gid_cache_wq; + +enum gid_op_type { + GID_DEL = 0, + GID_ADD +}; + +struct update_gid_event_work { + struct work_struct work; + union ib_gid gid; + struct ib_gid_attr gid_attr; + enum gid_op_type gid_op; +}; + +#define ROCE_NETDEV_CALLBACK_SZ 3 +struct netdev_event_work_cmd { + roce_netdev_callback cb; + roce_netdev_filter filter; + struct net_device *ndev; + struct net_device *filter_ndev; +}; + +struct netdev_event_work { + struct work_struct work; + struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; +}; + +static const struct { + bool (*is_supported)(const struct ib_device *device, u8 port_num); + enum ib_gid_type gid_type; +} PORT_CAP_TO_GID_TYPE[] = { + {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, + {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, +}; + +#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) + +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) +{ + int i; + unsigned int ret_flags = 0; + + if (!rdma_protocol_roce(ib_dev, port)) + return 1UL << IB_GID_TYPE_IB; + + for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) + if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) + ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; + + return ret_flags; +} +EXPORT_SYMBOL(roce_gid_type_mask_support); + +static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, + u8 port, union ib_gid *gid, + struct ib_gid_attr *gid_attr) +{ + int i; + unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + for (i = 0; i < IB_GID_TYPE_SIZE; i++) { + if ((1UL << i) & gid_type_mask) { + gid_attr->gid_type = i; + switch (gid_op) { + case GID_ADD: + ib_cache_gid_add(ib_dev, port, + gid, gid_attr); + break; + case GID_DEL: + ib_cache_gid_del(ib_dev, port, + gid, gid_attr); + break; + } + } + } +} + +enum bonding_slave_state { + BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, + BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, + /* No primary slave or the device isn't a slave in bonding */ + BONDING_SLAVE_STATE_NA = 1UL << 2, +}; + +static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, + struct net_device *upper) +{ + if (upper && netif_is_bond_master(upper)) { + struct net_device *pdev = + bond_option_active_slave_get_rcu(netdev_priv(upper)); + + if (pdev) + return dev == pdev ? 
BONDING_SLAVE_STATE_ACTIVE : + BONDING_SLAVE_STATE_INACTIVE; + } + + return BONDING_SLAVE_STATE_NA; +} + +#define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ + BONDING_SLAVE_STATE_NA) +static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *real_dev; + int res; + + if (!rdma_ndev) + return 0; + + rcu_read_lock(); + real_dev = rdma_vlan_dev_real_dev(cookie); + if (!real_dev) + real_dev = cookie; + + res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && + (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & + REQUIRED_BOND_STATES)) || + real_dev == rdma_ndev); + + rcu_read_unlock(); + return res; +} + +static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *master_dev; + int res; + + if (!rdma_ndev) + return 0; + + rcu_read_lock(); + master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); + res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == + BONDING_SLAVE_STATE_INACTIVE; + rcu_read_unlock(); + + return res; +} + +static int pass_all_filter(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + return 1; +} + +static int upper_device_filter(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + int res; + + if (!rdma_ndev) + return 0; + + if (rdma_ndev == cookie) + return 1; + + rcu_read_lock(); + res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); + rcu_read_unlock(); + + return res; +} + +static void update_gid_ip(enum gid_op_type gid_op, + struct ib_device *ib_dev, + u8 port, struct net_device *ndev, + struct sockaddr *addr) +{ + union ib_gid gid; + struct ib_gid_attr gid_attr; + + rdma_ip2gid(addr, &gid); + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + + update_gid(gid_op, ib_dev, port, &gid, &gid_attr); +} + +static void enum_netdev_default_gids(struct ib_device *ib_dev, + u8 port, struct net_device *event_ndev, + struct net_device *rdma_ndev) +{ + unsigned long gid_type_mask; + + rcu_read_lock(); + if (!rdma_ndev || + ((rdma_ndev != event_ndev && + !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || + is_eth_active_slave_of_bonding_rcu(rdma_ndev, + netdev_master_upper_dev_get_rcu(rdma_ndev)) == + BONDING_SLAVE_STATE_INACTIVE)) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, + IB_CACHE_GID_DEFAULT_MODE_SET); +} + +static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, + u8 port, + struct net_device *event_ndev, + struct net_device *rdma_ndev) +{ + struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); + unsigned long gid_type_mask; + + if (!rdma_ndev) + return; + + if (!real_dev) + real_dev = event_ndev; + + rcu_read_lock(); + + if (((rdma_ndev != event_ndev && + !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || + is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) + == + BONDING_SLAVE_STATE_INACTIVE)) { + rcu_read_unlock(); + return; + } + + rcu_read_unlock(); + + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + gid_type_mask, + IB_CACHE_GID_DEFAULT_MODE_DELETE); +} + +static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, + u8 port, struct net_device *ndev) +{ + struct in_device *in_dev; + struct sin_list { + struct list_head list; + struct sockaddr_in ip; + }; + struct 
sin_list *sin_iter; + struct sin_list *sin_temp; + + LIST_HEAD(sin_list); + if (ndev->reg_state >= NETREG_UNREGISTERING) + return; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(ndev); + if (!in_dev) { + rcu_read_unlock(); + return; + } + + for_ifa(in_dev) { + struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + + if (!entry) + continue; + + entry->ip.sin_family = AF_INET; + entry->ip.sin_addr.s_addr = ifa->ifa_address; + list_add_tail(&entry->list, &sin_list); + } + endfor_ifa(in_dev); + rcu_read_unlock(); + + list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { + update_gid_ip(GID_ADD, ib_dev, port, ndev, + (struct sockaddr *)&sin_iter->ip); + list_del(&sin_iter->list); + kfree(sin_iter); + } +} + +static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, + u8 port, struct net_device *ndev) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *in6_dev; + struct sin6_list { + struct list_head list; + struct sockaddr_in6 sin6; + }; + struct sin6_list *sin6_iter; + struct sin6_list *sin6_temp; + struct ib_gid_attr gid_attr = {.ndev = ndev}; + LIST_HEAD(sin6_list); + + if (ndev->reg_state >= NETREG_UNREGISTERING) + return; + + in6_dev = in6_dev_get(ndev); + if (!in6_dev) + return; + + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + + if (!entry) + continue; + + entry->sin6.sin6_family = AF_INET6; + entry->sin6.sin6_addr = ifp->addr; + list_add_tail(&entry->list, &sin6_list); + } + read_unlock_bh(&in6_dev->lock); + + in6_dev_put(in6_dev); + + list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { + union ib_gid gid; + + rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); + update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); + list_del(&sin6_iter->list); + kfree(sin6_iter); + } +} + +static void _add_netdev_ips(struct ib_device *ib_dev, u8 port, + struct net_device *ndev) +{ + enum_netdev_ipv4_ips(ib_dev, port, ndev); + if (IS_ENABLED(CONFIG_IPV6)) + enum_netdev_ipv6_ips(ib_dev, port, ndev); +} + +static void add_netdev_ips(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + enum_netdev_default_gids(ib_dev, port, cookie, rdma_ndev); + _add_netdev_ips(ib_dev, port, cookie); +} + +static void del_netdev_ips(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); +} + +static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, + u8 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct net *net; + struct net_device *ndev; + + /* Lock the rtnl to make sure the netdevs does not move under + * our feet + */ + rtnl_lock(); + down_read(&net_rwsem); + for_each_net(net) + for_each_netdev(net, ndev) + if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev)) + add_netdev_ips(ib_dev, port, rdma_ndev, ndev); + up_read(&net_rwsem); + rtnl_unlock(); +} + +/** + * rdma_roce_rescan_device - Rescan all of the network devices in the system + * and add their gids, as needed, to the relevant RoCE devices. 
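+ *
+ * Editorial illustration, not part of this patch: a RoCE provider would
+ * typically invoke this once its netdev association is known, e.g.
+ * "rdma_roce_rescan_device(&my_dev->ib_dev);" where "my_dev" is a
+ * hypothetical driver structure.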
+ * + * @device: the rdma device + */ +void rdma_roce_rescan_device(struct ib_device *ib_dev) +{ + ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, + enum_all_gids_of_dev_cb, NULL); +} +EXPORT_SYMBOL(rdma_roce_rescan_device); + +static void callback_for_addr_gid_device_scan(struct ib_device *device, + u8 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct update_gid_event_work *parsed = cookie; + + return update_gid(parsed->gid_op, device, + port, &parsed->gid, + &parsed->gid_attr); +} + +struct upper_list { + struct list_head list; + struct net_device *upper; +}; + +static int netdev_upper_walk(struct net_device *upper, void *data) +{ + struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + struct list_head *upper_list = data; + + if (!entry) + return 0; + + list_add_tail(&entry->list, upper_list); + dev_hold(upper); + entry->upper = upper; + + return 0; +} + +static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, + void *cookie, + void (*handle_netdev)(struct ib_device *ib_dev, + u8 port, + struct net_device *ndev)) +{ + struct net_device *ndev = cookie; + struct upper_list *upper_iter; + struct upper_list *upper_temp; + LIST_HEAD(upper_list); + + rcu_read_lock(); + netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &upper_list); + rcu_read_unlock(); + + handle_netdev(ib_dev, port, ndev); + list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, + list) { + handle_netdev(ib_dev, port, upper_iter->upper); + dev_put(upper_iter->upper); + list_del(&upper_iter->list); + kfree(upper_iter); + } +} + +static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *event_ndev) +{ + ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); +} + +static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); +} + +static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); +} + +static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct net_device *master_ndev; + + rcu_read_lock(); + master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); + if (master_ndev) + dev_hold(master_ndev); + rcu_read_unlock(); + + if (master_ndev) { + bond_delete_netdev_default_gids(ib_dev, port, master_ndev, + rdma_ndev); + dev_put(master_ndev); + } +} + +static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + bond_delete_netdev_default_gids(ib_dev, port, cookie, rdma_ndev); +} + +/* The following functions operate on all IB devices. netdevice_event and + * addr_event execute ib_enum_all_roce_netdevs through a work. + * ib_enum_all_roce_netdevs iterates through all IB devices. 
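+ *
+ * Editorial sketch, not part of this patch, of the deferral pattern used
+ * below: a notifier fills an array of commands and queues them, and the work
+ * handler then walks all RoCE devices for each command:
+ *
+ *	notifier -> netdevice_queue_work(cmds, ndev)
+ *		 -> netdevice_event_work_handler()
+ *		 -> ib_enum_all_roce_netdevs(filter, filter_ndev, cb, ndev)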
+ */ + +static void netdevice_event_work_handler(struct work_struct *_work) +{ + struct netdev_event_work *work = + container_of(_work, struct netdev_event_work, work); + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { + ib_enum_all_roce_netdevs(work->cmds[i].filter, + work->cmds[i].filter_ndev, + work->cmds[i].cb, + work->cmds[i].ndev); + dev_put(work->cmds[i].ndev); + dev_put(work->cmds[i].filter_ndev); + } + + kfree(work); +} + +static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, + struct net_device *ndev) +{ + unsigned int i; + struct netdev_event_work *ndev_work = + kmalloc(sizeof(*ndev_work), GFP_KERNEL); + + if (!ndev_work) + return NOTIFY_DONE; + + memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); + for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { + if (!ndev_work->cmds[i].ndev) + ndev_work->cmds[i].ndev = ndev; + if (!ndev_work->cmds[i].filter_ndev) + ndev_work->cmds[i].filter_ndev = ndev; + dev_hold(ndev_work->cmds[i].ndev); + dev_hold(ndev_work->cmds[i].filter_ndev); + } + INIT_WORK(&ndev_work->work, netdevice_event_work_handler); + + queue_work(gid_cache_wq, &ndev_work->work); + + return NOTIFY_DONE; +} + +static const struct netdev_event_work_cmd add_cmd = { + .cb = add_netdev_ips, .filter = is_eth_port_of_netdev}; +static const struct netdev_event_work_cmd add_cmd_upper_ips = { + .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev}; + +static void netdevice_event_changeupper(struct netdev_notifier_changeupper_info *changeupper_info, + struct netdev_event_work_cmd *cmds) +{ + static const struct netdev_event_work_cmd upper_ips_del_cmd = { + .cb = del_netdev_upper_ips, .filter = upper_device_filter}; + static const struct netdev_event_work_cmd bonding_default_del_cmd = { + .cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave}; + + if (changeupper_info->linking == false) { + cmds[0] = upper_ips_del_cmd; + cmds[0].ndev = changeupper_info->upper_dev; + cmds[1] = add_cmd; + } else { + cmds[0] = bonding_default_del_cmd; + cmds[0].ndev = changeupper_info->upper_dev; + cmds[1] = add_cmd_upper_ips; + cmds[1].ndev = changeupper_info->upper_dev; + cmds[1].filter_ndev = changeupper_info->upper_dev; + } +} + +static int netdevice_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + static const struct netdev_event_work_cmd del_cmd = { + .cb = del_netdev_ips, .filter = pass_all_filter}; + static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { + .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave}; + static const struct netdev_event_work_cmd default_del_cmd = { + .cb = del_netdev_default_ips, .filter = pass_all_filter}; + static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { + .cb = del_netdev_upper_ips, .filter = upper_device_filter}; + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; + + if (ndev->type != ARPHRD_ETHER) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_UP: + cmds[0] = bonding_default_del_cmd_join; + cmds[1] = add_cmd; + break; + + case NETDEV_UNREGISTER: + if (ndev->reg_state < NETREG_UNREGISTERED) + cmds[0] = del_cmd; + else + return NOTIFY_DONE; + break; + + case NETDEV_CHANGEADDR: + cmds[0] = default_del_cmd; + cmds[1] = add_cmd; + break; + + case NETDEV_CHANGEUPPER: + netdevice_event_changeupper( + container_of(ptr, struct netdev_notifier_changeupper_info, info), + 
cmds); + break; + + case NETDEV_BONDING_FAILOVER: + cmds[0] = bonding_event_ips_del_cmd; + cmds[1] = bonding_default_del_cmd_join; + cmds[2] = add_cmd_upper_ips; + break; + + default: + return NOTIFY_DONE; + } + + return netdevice_queue_work(cmds, ndev); +} + +static void update_gid_event_work_handler(struct work_struct *_work) +{ + struct update_gid_event_work *work = + container_of(_work, struct update_gid_event_work, work); + + ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev, + callback_for_addr_gid_device_scan, work); + + dev_put(work->gid_attr.ndev); + kfree(work); +} + +static int addr_event(struct notifier_block *this, unsigned long event, + struct sockaddr *sa, struct net_device *ndev) +{ + struct update_gid_event_work *work; + enum gid_op_type gid_op; + + if (ndev->type != ARPHRD_ETHER) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + gid_op = GID_ADD; + break; + + case NETDEV_DOWN: + gid_op = GID_DEL; + break; + + default: + return NOTIFY_DONE; + } + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return NOTIFY_DONE; + + INIT_WORK(&work->work, update_gid_event_work_handler); + + rdma_ip2gid(sa, &work->gid); + work->gid_op = gid_op; + + memset(&work->gid_attr, 0, sizeof(work->gid_attr)); + dev_hold(ndev); + work->gid_attr.ndev = ndev; + + queue_work(gid_cache_wq, &work->work); + + return NOTIFY_DONE; +} + +static int inetaddr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct sockaddr_in in; + struct net_device *ndev; + struct in_ifaddr *ifa = ptr; + + in.sin_family = AF_INET; + in.sin_addr.s_addr = ifa->ifa_address; + ndev = ifa->ifa_dev->dev; + + return addr_event(this, event, (struct sockaddr *)&in, ndev); +} + +static int inet6addr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct sockaddr_in6 in6; + struct net_device *ndev; + struct inet6_ifaddr *ifa6 = ptr; + + in6.sin6_family = AF_INET6; + in6.sin6_addr = ifa6->addr; + ndev = ifa6->idev->dev; + + return addr_event(this, event, (struct sockaddr *)&in6, ndev); +} + +static struct notifier_block nb_netdevice = { + .notifier_call = netdevice_event +}; + +static struct notifier_block nb_inetaddr = { + .notifier_call = inetaddr_event +}; + +static struct notifier_block nb_inet6addr = { + .notifier_call = inet6addr_event +}; + +int __init roce_gid_mgmt_init(void) +{ + gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); + if (!gid_cache_wq) + return -ENOMEM; + + register_inetaddr_notifier(&nb_inetaddr); + if (IS_ENABLED(CONFIG_IPV6)) + register_inet6addr_notifier(&nb_inet6addr); + /* We relay on the netdevice notifier to enumerate all + * existing devices in the system. Register to this notifier + * last to make sure we will not miss any IP add/del + * callbacks. + */ + register_netdevice_notifier(&nb_netdevice); + + return 0; +} + +void __exit roce_gid_mgmt_cleanup(void) +{ + if (IS_ENABLED(CONFIG_IPV6)) + unregister_inet6addr_notifier(&nb_inet6addr); + unregister_inetaddr_notifier(&nb_inetaddr); + unregister_netdevice_notifier(&nb_netdevice); + /* Ensure all gid deletion tasks complete before we go down, + * to avoid any reference to free'd memory. By the time + * ib-core is removed, all physical devices have been removed, + * so no issue with remaining hardware contexts. 
+ */ + destroy_workqueue(gid_cache_wq); +} diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c new file mode 100644 index 0000000..c8963e9 --- /dev/null +++ b/drivers/infiniband/core/rw.c @@ -0,0 +1,745 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include +#include +#include +#include + +enum { + RDMA_RW_SINGLE_WR, + RDMA_RW_MULTI_WR, + RDMA_RW_MR, + RDMA_RW_SIG_MR, +}; + +static bool rdma_rw_force_mr; +module_param_named(force_mr, rdma_rw_force_mr, bool, 0); +MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations"); + +/* + * Check if the device might use memory registration. This is currently only + * true for iWarp devices. In the future we can hopefully fine tune this based + * on HCA driver input. + */ +static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num) +{ + if (rdma_protocol_iwarp(dev, port_num)) + return true; + if (unlikely(rdma_rw_force_mr)) + return true; + return false; +} + +/* + * Check if the device will use memory registration for this RW operation. + * We currently always use memory registrations for iWarp RDMA READs, and + * have a debug option to force usage of MRs. + * + * XXX: In the future we can hopefully fine tune this based on HCA driver + * input. + */ +static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, + enum dma_data_direction dir, int dma_nents) +{ + if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE) + return true; + if (unlikely(rdma_rw_force_mr)) + return true; + return false; +} + +static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) +{ + /* arbitrary limit to avoid allocating gigantic resources */ + return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); +} + +/* Caller must have zero-initialized *reg. 
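+ * (Editorial note, not in the original: the callers below satisfy this by
+ * kcalloc()ing the containing rdma_rw_ctx arrays; an on-stack user would need
+ * e.g. "struct rdma_rw_reg_ctx reg = {};".)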
*/ +static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, + struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, + u32 sg_cnt, u32 offset) +{ + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 nents = min(sg_cnt, pages_per_mr); + int count = 0, ret; + + reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs); + if (!reg->mr) + return -EAGAIN; + + if (reg->mr->need_inval) { + reg->inv_wr.opcode = IB_WR_LOCAL_INV; + reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; + reg->inv_wr.next = ®->reg_wr.wr; + count++; + } else { + reg->inv_wr.next = NULL; + } + + ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE); + if (ret < nents) { + ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr); + return -EINVAL; + } + + reg->reg_wr.wr.opcode = IB_WR_REG_MR; + reg->reg_wr.mr = reg->mr; + reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(qp->device, port_num)) + reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; + count++; + + reg->sge.addr = reg->mr->iova; + reg->sge.length = reg->mr->length; + return count; +} + +static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct rdma_rw_reg_ctx *prev = NULL; + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + int i, j, ret = 0, count = 0; + + ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr; + ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < ctx->nr_ops; i++) { + struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; + u32 nents = min(sg_cnt, pages_per_mr); + + ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt, + offset); + if (ret < 0) + goto out_free; + count += ret; + + if (prev) { + if (reg->mr->need_inval) + prev->wr.wr.next = ®->inv_wr; + else + prev->wr.wr.next = ®->reg_wr.wr; + } + + reg->reg_wr.wr.next = ®->wr.wr; + + reg->wr.wr.sg_list = ®->sge; + reg->wr.wr.num_sge = 1; + reg->wr.remote_addr = remote_addr; + reg->wr.rkey = rkey; + if (dir == DMA_TO_DEVICE) { + reg->wr.wr.opcode = IB_WR_RDMA_WRITE; + } else if (!rdma_cap_read_inv(qp->device, port_num)) { + reg->wr.wr.opcode = IB_WR_RDMA_READ; + } else { + reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; + reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; + } + count++; + + remote_addr += reg->sge.length; + sg_cnt -= nents; + for (j = 0; j < nents; j++) + sg = sg_next(sg); + prev = reg; + offset = 0; + } + + if (prev) + prev->wr.wr.next = NULL; + + ctx->type = RDMA_RW_MR; + return count; + +out_free: + while (--i >= 0) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + kfree(ctx->reg); +out: + return ret; +} + +static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + struct scatterlist *sg, u32 sg_cnt, u32 offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 max_sge = dir == DMA_TO_DEVICE ? 
qp->max_write_sge : + qp->max_read_sge; + struct ib_sge *sge; + u32 total_len = 0, i, j; + + ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge); + + ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL); + if (!ctx->map.sges) + goto out; + + ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL); + if (!ctx->map.wrs) + goto out_free_sges; + + for (i = 0; i < ctx->nr_ops; i++) { + struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i]; + u32 nr_sge = min(sg_cnt, max_sge); + + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->remote_addr = remote_addr + total_len; + rdma_wr->rkey = rkey; + rdma_wr->wr.num_sge = nr_sge; + rdma_wr->wr.sg_list = sge; + + for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { + sge->addr = ib_sg_dma_address(dev, sg) + offset; + sge->length = ib_sg_dma_len(dev, sg) - offset; + sge->lkey = qp->pd->local_dma_lkey; + + total_len += sge->length; + sge++; + sg_cnt--; + offset = 0; + } + + rdma_wr->wr.next = i + 1 < ctx->nr_ops ? + &ctx->map.wrs[i + 1].wr : NULL; + } + + ctx->type = RDMA_RW_MULTI_WR; + return ctx->nr_ops; + +out_free_sges: + kfree(ctx->map.sges); +out: + return -ENOMEM; +} + +static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct ib_rdma_wr *rdma_wr = &ctx->single.wr; + + ctx->nr_ops = 1; + + ctx->single.sge.lkey = qp->pd->local_dma_lkey; + ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset; + ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset; + + memset(rdma_wr, 0, sizeof(*rdma_wr)); + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->wr.sg_list = &ctx->single.sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + + ctx->type = RDMA_RW_SINGLE_WR; + return 1; +} + +/** + * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist to READ/WRITE from/to + * @sg_cnt: number of entries in @sg + * @sg_offset: current byte offset into @sg + * @remote_addr:remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code. 
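+ *
+ * Editorial usage sketch, not part of this patch (names are illustrative,
+ * error handling and the completion handler are omitted):
+ *
+ *	ret = rdma_rw_ctx_init(&ctx, qp, port_num, sgl, sg_cnt, 0,
+ *			       remote_addr, rkey, DMA_FROM_DEVICE);
+ *	if (ret < 0)
+ *		return ret;
+ *	ret = rdma_rw_ctx_post(&ctx, qp, port_num, &done_cqe, NULL);
+ *
+ * Once the completion for "done_cqe" has run, the context is released with
+ * rdma_rw_ctx_destroy().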
+ */ +int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + int ret; + + ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); + if (!ret) + return -ENOMEM; + sg_cnt = ret; + + /* + * Skip to the S/G entry that sg_offset falls into: + */ + for (;;) { + u32 len = ib_sg_dma_len(dev, sg); + + if (sg_offset < len) + break; + + sg = sg_next(sg); + sg_offset -= len; + sg_cnt--; + } + + ret = -EIO; + if (WARN_ON_ONCE(sg_cnt == 0)) + goto out_unmap_sg; + + if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) { + ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt, + sg_offset, remote_addr, rkey, dir); + } else if (sg_cnt > 1) { + ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset, + remote_addr, rkey, dir); + } else { + ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset, + remote_addr, rkey, dir); + } + + if (ret < 0) + goto out_unmap_sg; + return ret; + +out_unmap_sg: + ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + return ret; +} +EXPORT_SYMBOL(rdma_rw_ctx_init); + +/** + * rdma_rw_ctx_signature init - initialize a RW context with signature offload + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist to READ/WRITE from/to + * @sg_cnt: number of entries in @sg + * @prot_sg: scatterlist to READ/WRITE protection information from/to + * @prot_sg_cnt: number of entries in @prot_sg + * @sig_attrs: signature offloading algorithms + * @remote_addr:remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code. 
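+ *
+ * (Editorial note, not in the original: contexts set up here must be torn
+ * down with rdma_rw_ctx_destroy_signature() rather than rdma_rw_ctx_destroy(),
+ * so that the signature MR and the protection SG list are released as well.)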
+ */ +int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + struct ib_sig_attrs *sig_attrs, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + struct ib_rdma_wr *rdma_wr; + struct ib_send_wr *prev_wr = NULL; + int count = 0, ret; + + if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { + pr_err("SG count too large\n"); + return -EINVAL; + } + + ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); + if (!ret) + return -ENOMEM; + sg_cnt = ret; + + ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir); + if (!ret) { + ret = -ENOMEM; + goto out_unmap_sg; + } + prot_sg_cnt = ret; + + ctx->type = RDMA_RW_SIG_MR; + ctx->nr_ops = 1; + ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL); + if (!ctx->sig) { + ret = -ENOMEM; + goto out_unmap_prot_sg; + } + + ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0); + if (ret < 0) + goto out_free_ctx; + count += ret; + prev_wr = &ctx->sig->data.reg_wr.wr; + + ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot, + prot_sg, prot_sg_cnt, 0); + if (ret < 0) + goto out_destroy_data_mr; + count += ret; + + if (ctx->sig->prot.inv_wr.next) + prev_wr->next = &ctx->sig->prot.inv_wr; + else + prev_wr->next = &ctx->sig->prot.reg_wr.wr; + prev_wr = &ctx->sig->prot.reg_wr.wr; + + ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs); + if (!ctx->sig->sig_mr) { + ret = -EAGAIN; + goto out_destroy_prot_mr; + } + + if (ctx->sig->sig_mr->need_inval) { + memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr)); + + ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV; + ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey; + + prev_wr->next = &ctx->sig->sig_inv_wr; + prev_wr = &ctx->sig->sig_inv_wr; + } + + ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR; + ctx->sig->sig_wr.wr.wr_cqe = NULL; + ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge; + ctx->sig->sig_wr.wr.num_sge = 1; + ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE; + ctx->sig->sig_wr.sig_attrs = sig_attrs; + ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr; + if (prot_sg_cnt) + ctx->sig->sig_wr.prot = &ctx->sig->prot.sge; + prev_wr->next = &ctx->sig->sig_wr.wr; + prev_wr = &ctx->sig->sig_wr.wr; + count++; + + ctx->sig->sig_sge.addr = 0; + ctx->sig->sig_sge.length = ctx->sig->data.sge.length; + if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE) + ctx->sig->sig_sge.length += ctx->sig->prot.sge.length; + + rdma_wr = &ctx->sig->data.wr; + rdma_wr->wr.sg_list = &ctx->sig->sig_sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + prev_wr->next = &rdma_wr->wr; + prev_wr = &rdma_wr->wr; + count++; + + return count; + +out_destroy_prot_mr: + if (prot_sg_cnt) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); +out_destroy_data_mr: + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); +out_free_ctx: + kfree(ctx->sig); +out_unmap_prot_sg: + ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); +out_unmap_sg: + ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + return ret; +} +EXPORT_SYMBOL(rdma_rw_ctx_signature_init); + +/* + * Now that we are going to post the WRs we can update the lkey and need_inval + * state on the MRs. 
If we were doing this at init time, we would get double + * or missing invalidations if a context was initialized but not actually + * posted. + */ +static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval) +{ + reg->mr->need_inval = need_inval; + ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey)); + reg->reg_wr.key = reg->mr->lkey; + reg->sge.lkey = reg->mr->lkey; +} + +/** + * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation + * @ctx: context to operate on + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @cqe: completion queue entry for the last WR + * @chain_wr: WR to append to the posted chain + * + * Return the WR chain for the set of RDMA READ/WRITE operations described by + * @ctx, as well as any memory registration operations needed. If @chain_wr + * is non-NULL the WR it points to will be appended to the chain of WRs posted. + * If @chain_wr is not set @cqe must be set so that the caller gets a + * completion notification. + */ +struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct ib_send_wr *first_wr, *last_wr; + int i; + + switch (ctx->type) { + case RDMA_RW_SIG_MR: + rdma_rw_update_lkey(&ctx->sig->data, true); + if (ctx->sig->prot.mr) + rdma_rw_update_lkey(&ctx->sig->prot, true); + + ctx->sig->sig_mr->need_inval = true; + ib_update_fast_reg_key(ctx->sig->sig_mr, + ib_inc_rkey(ctx->sig->sig_mr->lkey)); + ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey; + + if (ctx->sig->data.inv_wr.next) + first_wr = &ctx->sig->data.inv_wr; + else + first_wr = &ctx->sig->data.reg_wr.wr; + last_wr = &ctx->sig->data.wr.wr; + break; + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) { + rdma_rw_update_lkey(&ctx->reg[i], + ctx->reg[i].wr.wr.opcode != + IB_WR_RDMA_READ_WITH_INV); + } + + if (ctx->reg[0].inv_wr.next) + first_wr = &ctx->reg[0].inv_wr; + else + first_wr = &ctx->reg[0].reg_wr.wr; + last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr; + break; + case RDMA_RW_MULTI_WR: + first_wr = &ctx->map.wrs[0].wr; + last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr; + break; + case RDMA_RW_SINGLE_WR: + first_wr = &ctx->single.wr.wr; + last_wr = &ctx->single.wr.wr; + break; + default: + BUG(); + } + + if (chain_wr) { + last_wr->next = chain_wr; + } else { + last_wr->wr_cqe = cqe; + last_wr->send_flags |= IB_SEND_SIGNALED; + } + + return first_wr; +} +EXPORT_SYMBOL(rdma_rw_ctx_wrs); + +/** + * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation + * @ctx: context to operate on + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @cqe: completion queue entry for the last WR + * @chain_wr: WR to append to the posted chain + * + * Post the set of RDMA READ/WRITE operations described by @ctx, as well as + * any memory registration operations needed. If @chain_wr is non-NULL the + * WR it points to will be appended to the chain of WRs posted. If @chain_wr + * is not set @cqe must be set so that the caller gets a completion + * notification. 
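+ *
+ * Editorial usage sketch, not part of this patch: a target-side ULP that
+ * wants a response SEND to follow the RDMA operations on the same QP can
+ * chain its own WR instead of passing a @cqe, e.g.:
+ *
+ *	ret = rdma_rw_ctx_post(&ctx, qp, port_num, NULL, &rsp_send_wr);
+ *
+ * where "rsp_send_wr" is a hypothetical, fully initialized ib_send_wr whose
+ * completion handler later calls rdma_rw_ctx_destroy().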
+ */ +int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct ib_send_wr *first_wr, *bad_wr; + + first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr); + return ib_post_send(qp, first_wr, &bad_wr); +} +EXPORT_SYMBOL(rdma_rw_ctx_post); + +/** + * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist that was used for the READ/WRITE + * @sg_cnt: number of entries in @sg + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + */ +void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir) +{ + int i; + + switch (ctx->type) { + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + kfree(ctx->reg); + break; + case RDMA_RW_MULTI_WR: + kfree(ctx->map.wrs); + kfree(ctx->map.sges); + break; + case RDMA_RW_SINGLE_WR: + break; + default: + BUG(); + break; + } + + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy); + +/** + * rdma_rw_ctx_destroy_signature - release all resources allocated by + * rdma_rw_ctx_init_signature + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist that was used for the READ/WRITE + * @sg_cnt: number of entries in @sg + * @prot_sg: scatterlist that was used for the READ/WRITE of the PI + * @prot_sg_cnt: number of entries in @prot_sg + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + */ +void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + enum dma_data_direction dir) +{ + if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR)) + return; + + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); + + if (ctx->sig->prot.mr) { + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); + ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); + } + + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr); + kfree(ctx->sig); +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); + +/** + * rdma_rw_mr_factor - return number of MRs required for a payload + * @device: device handling the connection + * @port_num: port num to which the connection is bound + * @maxpages: maximum payload pages per rdma_rw_ctx + * + * Returns the number of MRs the device requires to move @maxpayload + * bytes. The returned value is used during transport creation to + * compute max_rdma_ctxts and the size of the transport's Send and + * Send Completion Queues. + */ +unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num, + unsigned int maxpages) +{ + unsigned int mr_pages; + + if (rdma_rw_can_use_mr(device, port_num)) + mr_pages = rdma_rw_fr_page_list_len(device); + else + mr_pages = device->attrs.max_sge_rd; + return DIV_ROUND_UP(maxpages, mr_pages); +} +EXPORT_SYMBOL(rdma_rw_mr_factor); + +void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) +{ + u32 factor; + + WARN_ON_ONCE(attr->port_num == 0); + + /* + * Each context needs at least one RDMA READ or WRITE WR. 
+ * + * For some hardware we might need more, eventually we should ask the + * HCA driver for a multiplier here. + */ + factor = 1; + + /* + * If the devices needs MRs to perform RDMA READ or WRITE operations, + * we'll need two additional MRs for the registrations and the + * invalidation. + */ + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + factor += 6; /* (inv + reg) * (data + prot + sig) */ + else if (rdma_rw_can_use_mr(dev, attr->port_num)) + factor += 2; /* inv + reg */ + + attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; + + /* + * But maybe we were just too high in the sky and the device doesn't + * even support all we need, and we'll have to live with what we get.. + */ + attr->cap.max_send_wr = + min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr); +} + +int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) +{ + struct ib_device *dev = qp->pd->device; + u32 nr_mrs = 0, nr_sig_mrs = 0; + int ret = 0; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) { + nr_sig_mrs = attr->cap.max_rdma_ctxs; + nr_mrs = attr->cap.max_rdma_ctxs * 2; + } else if (rdma_rw_can_use_mr(dev, attr->port_num)) { + nr_mrs = attr->cap.max_rdma_ctxs; + } + + if (nr_mrs) { + ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs, + IB_MR_TYPE_MEM_REG, + rdma_rw_fr_page_list_len(dev)); + if (ret) { + pr_err("%s: failed to allocated %d MRs\n", + __func__, nr_mrs); + return ret; + } + } + + if (nr_sig_mrs) { + ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, + IB_MR_TYPE_SIGNATURE, 2); + if (ret) { + pr_err("%s: failed to allocated %d SIG MRs\n", + __func__, nr_mrs); + goto out_free_rdma_mrs; + } + } + + return 0; + +out_free_rdma_mrs: + ib_mr_pool_destroy(qp, &qp->rdma_mrs); + return ret; +} + +void rdma_rw_cleanup_mrs(struct ib_qp *qp) +{ + ib_mr_pool_destroy(qp, &qp->sig_mrs); + ib_mr_pool_destroy(qp, &qp->rdma_mrs); +} diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h new file mode 100644 index 0000000..b1d4bbf --- /dev/null +++ b/drivers/infiniband/core/sa.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SA_H +#define SA_H + +#include + +static inline void ib_sa_client_get(struct ib_sa_client *client) +{ + atomic_inc(&client->users); +} + +static inline void ib_sa_client_put(struct ib_sa_client *client) +{ + if (atomic_dec_and_test(&client->users)) + complete(&client->comp); +} + +int ib_sa_mcmember_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + u8 method, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_mcmember_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +int mcast_init(void); +void mcast_cleanup(void); + +#endif /* SA_H */ diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c new file mode 100644 index 0000000..a61ec7e --- /dev/null +++ b/drivers/infiniband/core/sa_query.c @@ -0,0 +1,2530 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sa.h" +#include "core_priv.h" + +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 +#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 +#define IB_SA_CPI_MAX_RETRY_CNT 3 +#define IB_SA_CPI_RETRY_WAIT 1000 /*msecs */ +static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT; + +struct ib_sa_sm_ah { + struct ib_ah *ah; + struct kref ref; + u16 pkey_index; + u8 src_path_mask; +}; + +enum rdma_class_port_info_type { + RDMA_CLASS_PORT_INFO_IB, + RDMA_CLASS_PORT_INFO_OPA +}; + +struct rdma_class_port_info { + enum rdma_class_port_info_type type; + union { + struct ib_class_port_info ib; + struct opa_class_port_info opa; + }; +}; + +struct ib_sa_classport_cache { + bool valid; + int retry_cnt; + struct rdma_class_port_info data; +}; + +struct ib_sa_port { + struct ib_mad_agent *agent; + struct ib_sa_sm_ah *sm_ah; + struct work_struct update_task; + struct ib_sa_classport_cache classport_info; + struct delayed_work ib_cpi_work; + spinlock_t classport_lock; /* protects class port info set */ + spinlock_t ah_lock; + u8 port_num; +}; + +struct ib_sa_device { + int start_port, end_port; + struct ib_event_handler event_handler; + struct ib_sa_port port[0]; +}; + +struct ib_sa_query { + void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *); + void (*release)(struct ib_sa_query *); + struct ib_sa_client *client; + struct ib_sa_port *port; + struct ib_mad_send_buf *mad_buf; + struct ib_sa_sm_ah *sm_ah; + int id; + u32 flags; + struct list_head list; /* Local svc request list */ + u32 seq; /* Local svc request sequence number */ + unsigned long timeout; /* Local svc timeout */ + u8 path_use; /* How will the pathrecord be used */ +}; + +#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 +#define IB_SA_CANCEL 0x00000002 +#define IB_SA_QUERY_OPA 0x00000004 + +struct ib_sa_service_query { + void (*callback)(int, struct ib_sa_service_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; + +struct ib_sa_path_query { + void (*callback)(int, struct sa_path_rec *, void *); + void *context; + struct ib_sa_query sa_query; + struct sa_path_rec *conv_pr; +}; + +struct ib_sa_guidinfo_query { + void (*callback)(int, struct ib_sa_guidinfo_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; + +struct ib_sa_classport_info_query { + void (*callback)(void *); + void *context; + struct ib_sa_query sa_query; +}; + +struct ib_sa_mcmember_query { + void (*callback)(int, struct ib_sa_mcmember_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; + +static LIST_HEAD(ib_nl_request_list); +static DEFINE_SPINLOCK(ib_nl_request_lock); +static atomic_t ib_nl_sa_request_seq; +static struct workqueue_struct *ib_nl_wq; +static struct delayed_work ib_nl_timed_work; +static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = { + [LS_NLA_TYPE_PATH_RECORD] = {.type = NLA_BINARY, + .len = sizeof(struct ib_path_rec_data)}, + [LS_NLA_TYPE_TIMEOUT] = {.type = NLA_U32}, + [LS_NLA_TYPE_SERVICE_ID] = {.type = NLA_U64}, + [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid)}, + [LS_NLA_TYPE_SGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid)}, + [LS_NLA_TYPE_TCLASS] = {.type = NLA_U8}, + [LS_NLA_TYPE_PKEY] = {.type = NLA_U16}, + [LS_NLA_TYPE_QOS_CLASS] = {.type = NLA_U16}, +}; + + +static void 
ib_sa_add_one(struct ib_device *device); +static void ib_sa_remove_one(struct ib_device *device, void *client_data); + +static struct ib_client sa_client = { + .name = "sa", + .add = ib_sa_add_one, + .remove = ib_sa_remove_one +}; + +static DEFINE_SPINLOCK(idr_lock); +static DEFINE_IDR(query_idr); + +static DEFINE_SPINLOCK(tid_lock); +static u32 tid; + +#define PATH_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct sa_path_rec, field), \ + .struct_size_bytes = sizeof((struct sa_path_rec *)0)->field, \ + .field_name = "sa_path_rec:" #field + +static const struct ib_field path_rec_table[] = { + { PATH_REC_FIELD(service_id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { PATH_REC_FIELD(dgid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { PATH_REC_FIELD(sgid), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 128 }, + { PATH_REC_FIELD(ib.dlid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 16 }, + { PATH_REC_FIELD(ib.slid), + .offset_words = 10, + .offset_bits = 16, + .size_bits = 16 }, + { PATH_REC_FIELD(ib.raw_traffic), + .offset_words = 11, + .offset_bits = 0, + .size_bits = 1 }, + { RESERVED, + .offset_words = 11, + .offset_bits = 1, + .size_bits = 3 }, + { PATH_REC_FIELD(flow_label), + .offset_words = 11, + .offset_bits = 4, + .size_bits = 20 }, + { PATH_REC_FIELD(hop_limit), + .offset_words = 11, + .offset_bits = 24, + .size_bits = 8 }, + { PATH_REC_FIELD(traffic_class), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 8 }, + { PATH_REC_FIELD(reversible), + .offset_words = 12, + .offset_bits = 8, + .size_bits = 1 }, + { PATH_REC_FIELD(numb_path), + .offset_words = 12, + .offset_bits = 9, + .size_bits = 7 }, + { PATH_REC_FIELD(pkey), + .offset_words = 12, + .offset_bits = 16, + .size_bits = 16 }, + { PATH_REC_FIELD(qos_class), + .offset_words = 13, + .offset_bits = 0, + .size_bits = 12 }, + { PATH_REC_FIELD(sl), + .offset_words = 13, + .offset_bits = 12, + .size_bits = 4 }, + { PATH_REC_FIELD(mtu_selector), + .offset_words = 13, + .offset_bits = 16, + .size_bits = 2 }, + { PATH_REC_FIELD(mtu), + .offset_words = 13, + .offset_bits = 18, + .size_bits = 6 }, + { PATH_REC_FIELD(rate_selector), + .offset_words = 13, + .offset_bits = 24, + .size_bits = 2 }, + { PATH_REC_FIELD(rate), + .offset_words = 13, + .offset_bits = 26, + .size_bits = 6 }, + { PATH_REC_FIELD(packet_life_time_selector), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 2 }, + { PATH_REC_FIELD(packet_life_time), + .offset_words = 14, + .offset_bits = 2, + .size_bits = 6 }, + { PATH_REC_FIELD(preference), + .offset_words = 14, + .offset_bits = 8, + .size_bits = 8 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 16, + .size_bits = 48 }, +}; + +#define OPA_PATH_REC_FIELD(field) \ + .struct_offset_bytes = \ + offsetof(struct sa_path_rec, field), \ + .struct_size_bytes = \ + sizeof((struct sa_path_rec *)0)->field, \ + .field_name = "sa_path_rec:" #field + +static const struct ib_field opa_path_rec_table[] = { + { OPA_PATH_REC_FIELD(service_id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { OPA_PATH_REC_FIELD(dgid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_PATH_REC_FIELD(sgid), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_PATH_REC_FIELD(opa.dlid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_PATH_REC_FIELD(opa.slid), + .offset_words = 11, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_PATH_REC_FIELD(opa.raw_traffic), + .offset_words = 12, + 
.offset_bits = 0, + .size_bits = 1 }, + { RESERVED, + .offset_words = 12, + .offset_bits = 1, + .size_bits = 3 }, + { OPA_PATH_REC_FIELD(flow_label), + .offset_words = 12, + .offset_bits = 4, + .size_bits = 20 }, + { OPA_PATH_REC_FIELD(hop_limit), + .offset_words = 12, + .offset_bits = 24, + .size_bits = 8 }, + { OPA_PATH_REC_FIELD(traffic_class), + .offset_words = 13, + .offset_bits = 0, + .size_bits = 8 }, + { OPA_PATH_REC_FIELD(reversible), + .offset_words = 13, + .offset_bits = 8, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(numb_path), + .offset_words = 13, + .offset_bits = 9, + .size_bits = 7 }, + { OPA_PATH_REC_FIELD(pkey), + .offset_words = 13, + .offset_bits = 16, + .size_bits = 16 }, + { OPA_PATH_REC_FIELD(opa.l2_8B), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(opa.l2_10B), + .offset_words = 14, + .offset_bits = 1, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(opa.l2_9B), + .offset_words = 14, + .offset_bits = 2, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(opa.l2_16B), + .offset_words = 14, + .offset_bits = 3, + .size_bits = 1 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 4, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(opa.qos_type), + .offset_words = 14, + .offset_bits = 6, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(opa.qos_priority), + .offset_words = 14, + .offset_bits = 8, + .size_bits = 8 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 16, + .size_bits = 3 }, + { OPA_PATH_REC_FIELD(sl), + .offset_words = 14, + .offset_bits = 19, + .size_bits = 5 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 24, + .size_bits = 8 }, + { OPA_PATH_REC_FIELD(mtu_selector), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(mtu), + .offset_words = 15, + .offset_bits = 2, + .size_bits = 6 }, + { OPA_PATH_REC_FIELD(rate_selector), + .offset_words = 15, + .offset_bits = 8, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(rate), + .offset_words = 15, + .offset_bits = 10, + .size_bits = 6 }, + { OPA_PATH_REC_FIELD(packet_life_time_selector), + .offset_words = 15, + .offset_bits = 16, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(packet_life_time), + .offset_words = 15, + .offset_bits = 18, + .size_bits = 6 }, + { OPA_PATH_REC_FIELD(preference), + .offset_words = 15, + .offset_bits = 24, + .size_bits = 8 }, +}; + +#define MCMEMBER_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ + .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \ + .field_name = "sa_mcmember_rec:" #field + +static const struct ib_field mcmember_rec_table[] = { + { MCMEMBER_REC_FIELD(mgid), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 128 }, + { MCMEMBER_REC_FIELD(port_gid), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 128 }, + { MCMEMBER_REC_FIELD(qkey), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { MCMEMBER_REC_FIELD(mlid), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 16 }, + { MCMEMBER_REC_FIELD(mtu_selector), + .offset_words = 9, + .offset_bits = 16, + .size_bits = 2 }, + { MCMEMBER_REC_FIELD(mtu), + .offset_words = 9, + .offset_bits = 18, + .size_bits = 6 }, + { MCMEMBER_REC_FIELD(traffic_class), + .offset_words = 9, + .offset_bits = 24, + .size_bits = 8 }, + { MCMEMBER_REC_FIELD(pkey), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 16 }, + { MCMEMBER_REC_FIELD(rate_selector), + .offset_words = 10, + .offset_bits = 16, + .size_bits = 2 }, + { MCMEMBER_REC_FIELD(rate), + .offset_words = 10, + .offset_bits = 18, + .size_bits = 
6 }, + { MCMEMBER_REC_FIELD(packet_life_time_selector), + .offset_words = 10, + .offset_bits = 24, + .size_bits = 2 }, + { MCMEMBER_REC_FIELD(packet_life_time), + .offset_words = 10, + .offset_bits = 26, + .size_bits = 6 }, + { MCMEMBER_REC_FIELD(sl), + .offset_words = 11, + .offset_bits = 0, + .size_bits = 4 }, + { MCMEMBER_REC_FIELD(flow_label), + .offset_words = 11, + .offset_bits = 4, + .size_bits = 20 }, + { MCMEMBER_REC_FIELD(hop_limit), + .offset_words = 11, + .offset_bits = 24, + .size_bits = 8 }, + { MCMEMBER_REC_FIELD(scope), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 4 }, + { MCMEMBER_REC_FIELD(join_state), + .offset_words = 12, + .offset_bits = 4, + .size_bits = 4 }, + { MCMEMBER_REC_FIELD(proxy_join), + .offset_words = 12, + .offset_bits = 8, + .size_bits = 1 }, + { RESERVED, + .offset_words = 12, + .offset_bits = 9, + .size_bits = 23 }, +}; + +#define SERVICE_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \ + .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \ + .field_name = "sa_service_rec:" #field + +static const struct ib_field service_rec_table[] = { + { SERVICE_REC_FIELD(id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { SERVICE_REC_FIELD(gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(pkey), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 16 }, + { SERVICE_REC_FIELD(lease), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 32 }, + { SERVICE_REC_FIELD(key), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(name), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 64*8 }, + { SERVICE_REC_FIELD(data8), + .offset_words = 28, + .offset_bits = 0, + .size_bits = 16*8 }, + { SERVICE_REC_FIELD(data16), + .offset_words = 32, + .offset_bits = 0, + .size_bits = 8*16 }, + { SERVICE_REC_FIELD(data32), + .offset_words = 36, + .offset_bits = 0, + .size_bits = 4*32 }, + { SERVICE_REC_FIELD(data64), + .offset_words = 40, + .offset_bits = 0, + .size_bits = 2*64 }, +}; + +#define CLASSPORTINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ + .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ + .field_name = "ib_class_port_info:" #field + +static const struct ib_field ib_classport_info_rec_table[] = { + { CLASSPORTINFO_REC_FIELD(base_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(class_version), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(capability_mask), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(redirect_tcslfl), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_lid), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(redirect_pkey), + .offset_words = 7, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(redirect_qp), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_qkey), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_gid), + .offset_words = 10, + .offset_bits = 0, + 
.size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(trap_tcslfl), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_lid), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(trap_pkey), + .offset_words = 15, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(trap_hlqp), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(trap_qkey), + .offset_words = 17, + .offset_bits = 0, + .size_bits = 32 }, +}; + +#define OPA_CLASSPORTINFO_REC_FIELD(field) \ + .struct_offset_bytes =\ + offsetof(struct opa_class_port_info, field), \ + .struct_size_bytes = \ + sizeof((struct opa_class_port_info *)0)->field, \ + .field_name = "opa_class_port_info:" #field + +static const struct ib_field opa_classport_info_rec_table[] = { + { OPA_CLASSPORTINFO_REC_FIELD(base_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { OPA_CLASSPORTINFO_REC_FIELD(class_version), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { OPA_CLASSPORTINFO_REC_FIELD(cap_mask), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { OPA_CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_tc_fl), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_lid), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_sl_qp), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_qkey), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_gid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_tc_fl), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_lid), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_hl_qp), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_qkey), + .offset_words = 17, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_pkey), + .offset_words = 18, + .offset_bits = 0, + .size_bits = 16 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_pkey), + .offset_words = 18, + .offset_bits = 16, + .size_bits = 16 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_sl_rsvd), + .offset_words = 19, + .offset_bits = 0, + .size_bits = 8 }, + { RESERVED, + .offset_words = 19, + .offset_bits = 8, + .size_bits = 24 }, +}; + +#define GUIDINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ + .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ + .field_name = "sa_guidinfo_rec:" #field + +static const struct ib_field guidinfo_rec_table[] = { + { GUIDINFO_REC_FIELD(lid), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { GUIDINFO_REC_FIELD(block_num), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 8 }, + { GUIDINFO_REC_FIELD(res1), + .offset_words = 0, + .offset_bits = 24, + .size_bits = 8 }, + { GUIDINFO_REC_FIELD(res2), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { GUIDINFO_REC_FIELD(guid_info_list), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 512 }, +}; 
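The ib_field tables above drive the generic ib_pack()/ib_unpack() marshalling helpers: each entry maps a host-order structure member (struct_offset_bytes/struct_size_bytes) to its big-endian word/bit position in the on-wire SA attribute, and entries marked RESERVED have no corresponding struct member. A minimal usage sketch, not part of the patch itself (the function name example_path_rec_roundtrip is hypothetical), showing how a consumer could round-trip a path record through the wrappers exported later in this file:

#include <rdma/ib_mad.h>
#include <rdma/ib_sa.h>

/* Illustrative sketch only; relies on the ib_sa_pack_path()/ib_sa_unpack_path()
 * wrappers defined and exported further down in sa_query.c.
 */
static void example_path_rec_roundtrip(struct sa_path_rec *rec)
{
	u8 wire[IB_MGMT_SA_DATA];	/* SA MAD attribute payload (200 bytes) */
	struct sa_path_rec copy = {};

	/* Host-order struct -> big-endian wire layout per path_rec_table. */
	ib_sa_pack_path(rec, wire);

	/* Big-endian wire layout -> host-order struct. */
	ib_sa_unpack_path(wire, &copy);
}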
+ +static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) +{ + query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE; +} + +static inline int ib_sa_query_cancelled(struct ib_sa_query *query) +{ + return (query->flags & IB_SA_CANCEL); +} + +static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, + struct ib_sa_query *query) +{ + struct sa_path_rec *sa_rec = query->mad_buf->context[1]; + struct ib_sa_mad *mad = query->mad_buf->mad; + ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask; + u16 val16; + u64 val64; + struct rdma_ls_resolve_header *header; + + query->mad_buf->context[1] = NULL; + + /* Construct the family header first */ + header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + memcpy(header->device_name, query->port->agent->device->name, + LS_DEVICE_NAME_MAX); + header->port_num = query->port->port_num; + + if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && + sa_rec->reversible != 0) + query->path_use = LS_RESOLVE_PATH_USE_GMP; + else + query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL; + header->path_use = query->path_use; + + /* Now build the attributes */ + if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) { + val64 = be64_to_cpu(sa_rec->service_id); + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID, + sizeof(val64), &val64); + } + if (comp_mask & IB_SA_PATH_REC_DGID) + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID, + sizeof(sa_rec->dgid), &sa_rec->dgid); + if (comp_mask & IB_SA_PATH_REC_SGID) + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID, + sizeof(sa_rec->sgid), &sa_rec->sgid); + if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS, + sizeof(sa_rec->traffic_class), &sa_rec->traffic_class); + + if (comp_mask & IB_SA_PATH_REC_PKEY) { + val16 = be16_to_cpu(sa_rec->pkey); + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY, + sizeof(val16), &val16); + } + if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) { + val16 = be16_to_cpu(sa_rec->qos_class); + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS, + sizeof(val16), &val16); + } +} + +static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask) +{ + int len = 0; + + if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) + len += nla_total_size(sizeof(u64)); + if (comp_mask & IB_SA_PATH_REC_DGID) + len += nla_total_size(sizeof(struct rdma_nla_ls_gid)); + if (comp_mask & IB_SA_PATH_REC_SGID) + len += nla_total_size(sizeof(struct rdma_nla_ls_gid)); + if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) + len += nla_total_size(sizeof(u8)); + if (comp_mask & IB_SA_PATH_REC_PKEY) + len += nla_total_size(sizeof(u16)); + if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) + len += nla_total_size(sizeof(u16)); + + /* + * Make sure that at least some of the required comp_mask bits are + * set. 
+ */ + if (WARN_ON(len == 0)) + return len; + + /* Add the family header */ + len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header)); + + return len; +} + +static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + void *data; + int ret = 0; + struct ib_sa_mad *mad; + int len; + + mad = query->mad_buf->mad; + len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask); + if (len <= 0) + return -EMSGSIZE; + + skb = nlmsg_new(len, gfp_mask); + if (!skb) + return -ENOMEM; + + /* Put nlmsg header only for now */ + data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS, + RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST); + if (!data) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + /* Add attributes */ + ib_nl_set_path_rec_attrs(skb, query); + + /* Repair the nlmsg header length */ + nlmsg_end(skb, nlh); + + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); + if (!ret) + ret = len; + else + ret = 0; + + return ret; +} + +static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) +{ + unsigned long flags; + unsigned long delay; + int ret; + + INIT_LIST_HEAD(&query->list); + query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); + + /* Put the request on the list first.*/ + spin_lock_irqsave(&ib_nl_request_lock, flags); + delay = msecs_to_jiffies(sa_local_svc_timeout_ms); + query->timeout = delay + jiffies; + list_add_tail(&query->list, &ib_nl_request_list); + /* Start the timeout if this is the only request */ + if (ib_nl_request_list.next == &query->list) + queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + + ret = ib_nl_send_msg(query, gfp_mask); + if (ret <= 0) { + ret = -EIO; + /* Remove the request */ + spin_lock_irqsave(&ib_nl_request_lock, flags); + list_del(&query->list); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + } else { + ret = 0; + } + + return ret; +} + +static int ib_nl_cancel_request(struct ib_sa_query *query) +{ + unsigned long flags; + struct ib_sa_query *wait_query; + int found = 0; + + spin_lock_irqsave(&ib_nl_request_lock, flags); + list_for_each_entry(wait_query, &ib_nl_request_list, list) { + /* Let the timeout to take care of the callback */ + if (query == wait_query) { + query->flags |= IB_SA_CANCEL; + query->timeout = jiffies; + list_move(&query->list, &ib_nl_request_list); + found = 1; + mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1); + break; + } + } + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + + return found; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc); + +static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query, + const struct nlmsghdr *nlh) +{ + struct ib_mad_send_wc mad_send_wc; + struct ib_sa_mad *mad = NULL; + const struct nlattr *head, *curr; + struct ib_path_rec_data *rec; + int len, rem; + u32 mask = 0; + int status = -EIO; + + if (query->callback) { + head = (const struct nlattr *) nlmsg_data(nlh); + len = nlmsg_len(nlh); + switch (query->path_use) { + case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: + mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; + break; + + case LS_RESOLVE_PATH_USE_ALL: + case LS_RESOLVE_PATH_USE_GMP: + default: + mask = IB_PATH_PRIMARY | IB_PATH_GMP | + IB_PATH_BIDIRECTIONAL; + break; + } + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) { + rec = nla_data(curr); + /* + * Get the first one. In the future, we may + * need to get up to 6 pathrecords. 
+ */ + if ((rec->flags & mask) == mask) { + mad = query->mad_buf->mad; + mad->mad_hdr.method |= + IB_MGMT_METHOD_RESP; + memcpy(mad->data, rec->path_rec, + sizeof(rec->path_rec)); + status = 0; + break; + } + } + } + query->callback(query, status, mad); + } + + mad_send_wc.send_buf = query->mad_buf; + mad_send_wc.status = IB_WC_SUCCESS; + send_handler(query->mad_buf->mad_agent, &mad_send_wc); +} + +static void ib_nl_request_timeout(struct work_struct *work) +{ + unsigned long flags; + struct ib_sa_query *query; + unsigned long delay; + struct ib_mad_send_wc mad_send_wc; + int ret; + + spin_lock_irqsave(&ib_nl_request_lock, flags); + while (!list_empty(&ib_nl_request_list)) { + query = list_entry(ib_nl_request_list.next, + struct ib_sa_query, list); + + if (time_after(query->timeout, jiffies)) { + delay = query->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); + break; + } + + list_del(&query->list); + ib_sa_disable_local_svc(query); + /* Hold the lock to protect against query cancellation */ + if (ib_sa_query_cancelled(query)) + ret = -1; + else + ret = ib_post_send_mad(query->mad_buf, NULL); + if (ret) { + mad_send_wc.send_buf = query->mad_buf; + mad_send_wc.status = IB_WC_WR_FLUSH_ERR; + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + send_handler(query->port->agent, &mad_send_wc); + spin_lock_irqsave(&ib_nl_request_lock, flags); + } + } + spin_unlock_irqrestore(&ib_nl_request_lock, flags); +} + +int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + int timeout, delta, abs_delta; + const struct nlattr *attr; + unsigned long flags; + struct ib_sa_query *query; + long delay = 0; + struct nlattr *tb[LS_NLA_TYPE_MAX]; + int ret; + + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk)) + return -EPERM; + + ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_policy, NULL); + attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT]; + if (ret || !attr) + goto settimeout_out; + + timeout = *(int *) nla_data(attr); + if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN) + timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN; + if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX) + timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX; + + delta = timeout - sa_local_svc_timeout_ms; + if (delta < 0) + abs_delta = -delta; + else + abs_delta = delta; + + if (delta != 0) { + spin_lock_irqsave(&ib_nl_request_lock, flags); + sa_local_svc_timeout_ms = timeout; + list_for_each_entry(query, &ib_nl_request_list, list) { + if (delta < 0 && abs_delta > query->timeout) + query->timeout = 0; + else + query->timeout += delta; + + /* Get the new delay from the first entry */ + if (!delay) { + delay = query->timeout - jiffies; + if (delay <= 0) + delay = 1; + } + } + if (delay) + mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, + (unsigned long)delay); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + } + +settimeout_out: + return skb->len; +} + +static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) +{ + struct nlattr *tb[LS_NLA_TYPE_MAX]; + int ret; + + if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) + return 0; + + ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_policy, NULL); + if (ret) + return 0; + + return 1; +} + +int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + unsigned long flags; + struct ib_sa_query *query; + struct ib_mad_send_buf *send_buf; + struct ib_mad_send_wc 
mad_send_wc; + int found = 0; + int ret; + + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk)) + return -EPERM; + + spin_lock_irqsave(&ib_nl_request_lock, flags); + list_for_each_entry(query, &ib_nl_request_list, list) { + /* + * If the query is cancelled, let the timeout routine + * take care of it. + */ + if (nlh->nlmsg_seq == query->seq) { + found = !ib_sa_query_cancelled(query); + if (found) + list_del(&query->list); + break; + } + } + + if (!found) { + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + goto resp_out; + } + + send_buf = query->mad_buf; + + if (!ib_nl_is_good_resolve_resp(nlh)) { + /* if the result is a failure, send out the packet via IB */ + ib_sa_disable_local_svc(query); + ret = ib_post_send_mad(query->mad_buf, NULL); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + if (ret) { + mad_send_wc.send_buf = send_buf; + mad_send_wc.status = IB_WC_GENERAL_ERR; + send_handler(query->port->agent, &mad_send_wc); + } + } else { + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + ib_nl_process_good_resolve_rsp(query, nlh); + } + +resp_out: + return skb->len; +} + +static void free_sm_ah(struct kref *kref) +{ + struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); + + rdma_destroy_ah(sm_ah->ah); + kfree(sm_ah); +} + +void ib_sa_register_client(struct ib_sa_client *client) +{ + atomic_set(&client->users, 1); + init_completion(&client->comp); +} +EXPORT_SYMBOL(ib_sa_register_client); + +void ib_sa_unregister_client(struct ib_sa_client *client) +{ + ib_sa_client_put(client); + wait_for_completion(&client->comp); +} +EXPORT_SYMBOL(ib_sa_unregister_client); + +/** + * ib_sa_cancel_query - try to cancel an SA query + * @id:ID of query to cancel + * @query:query pointer to cancel + * + * Try to cancel an SA query. If the id and query don't match up or + * the query has already completed, nothing is done. Otherwise the + * query is canceled and will complete with a status of -EINTR. + */ +void ib_sa_cancel_query(int id, struct ib_sa_query *query) +{ + unsigned long flags; + struct ib_mad_agent *agent; + struct ib_mad_send_buf *mad_buf; + + spin_lock_irqsave(&idr_lock, flags); + if (idr_find(&query_idr, id) != query) { + spin_unlock_irqrestore(&idr_lock, flags); + return; + } + agent = query->port->agent; + mad_buf = query->mad_buf; + spin_unlock_irqrestore(&idr_lock, flags); + + /* + * If the query is still on the netlink request list, schedule + * it to be cancelled by the timeout routine. Otherwise, it has been + * sent to the MAD layer and has to be cancelled from there. + */ + if (!ib_nl_cancel_request(query)) + ib_cancel_mad(agent, mad_buf); +} +EXPORT_SYMBOL(ib_sa_cancel_query); + +static u8 get_src_path_mask(struct ib_device *device, u8 port_num) +{ + struct ib_sa_device *sa_dev; + struct ib_sa_port *port; + unsigned long flags; + u8 src_path_mask; + + sa_dev = ib_get_client_data(device, &sa_client); + if (!sa_dev) + return 0x7f; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + spin_lock_irqsave(&port->ah_lock, flags); + src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f; + spin_unlock_irqrestore(&port->ah_lock, flags); + + return src_path_mask; +} + +static int +roce_resolve_route_from_path(struct ib_device *device, u8 port_num, + struct sa_path_rec *rec) +{ + struct net_device *resolved_dev; + struct net_device *ndev; + struct net_device *idev; + struct rdma_dev_addr dev_addr = { + .bound_dev_if = ((sa_path_get_ifindex(rec) >= 0) ? + sa_path_get_ifindex(rec) : 0), + .net = sa_path_get_ndev(rec) ? 
+ sa_path_get_ndev(rec) : + &init_net + }; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + int ret; + + if (rec->roce.route_resolved) + return 0; + + if (!device->get_netdev) + return -EOPNOTSUPP; + + rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); + + /* validate the route */ + ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, + &dgid_addr._sockaddr, &dev_addr); + if (ret) + return ret; + + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) + return -EINVAL; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + resolved_dev = dev_get_by_index(dev_addr.net, + dev_addr.bound_dev_if); + if (!resolved_dev) { + ret = -ENODEV; + goto done; + } + ndev = ib_get_ndev_from_path(rec); + rcu_read_lock(); + if ((ndev && ndev != resolved_dev) || + (resolved_dev != idev && + !rdma_is_upper_dev_rcu(idev, resolved_dev))) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(resolved_dev); + if (ndev) + dev_put(ndev); +done: + dev_put(idev); + if (!ret) + rec->roce.route_resolved = true; + return ret; +} + +static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr) +{ + enum ib_gid_type type = sa_conv_pathrec_to_gid_type(rec); + struct net_device *ndev; + u16 gid_index; + int ret; + + ndev = ib_get_ndev_from_path(rec); + ret = ib_find_cached_gid_by_port(device, &rec->sgid, type, + port_num, ndev, &gid_index); + if (ndev) + dev_put(ndev); + if (ret) + return ret; + + rdma_ah_set_grh(ah_attr, &rec->dgid, + be32_to_cpu(rec->flow_label), + gid_index, rec->hop_limit, + rec->traffic_class); + return 0; +} + +int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr) +{ + int ret = 0; + + memset(ah_attr, 0, sizeof(*ah_attr)); + ah_attr->type = rdma_ah_find_type(device, port_num); + rdma_ah_set_sl(ah_attr, rec->sl); + rdma_ah_set_port_num(ah_attr, port_num); + rdma_ah_set_static_rate(ah_attr, rec->rate); + + if (sa_path_is_roce(rec)) { + ret = roce_resolve_route_from_path(device, port_num, rec); + if (ret) + return ret; + + memcpy(ah_attr->roce.dmac, sa_path_get_dmac(rec), ETH_ALEN); + } else { + rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); + if (sa_path_is_opa(rec) && + rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE)) + rdma_ah_set_make_grd(ah_attr, true); + + rdma_ah_set_path_bits(ah_attr, + be32_to_cpu(sa_path_get_slid(rec)) & + get_src_path_mask(device, port_num)); + } + + if (rec->hop_limit > 0 || sa_path_is_roce(rec)) + ret = init_ah_attr_grh_fields(device, port_num, rec, ah_attr); + return ret; +} +EXPORT_SYMBOL(ib_init_ah_attr_from_path); + +static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) +{ + struct rdma_ah_attr ah_attr; + unsigned long flags; + + spin_lock_irqsave(&query->port->ah_lock, flags); + if (!query->port->sm_ah) { + spin_unlock_irqrestore(&query->port->ah_lock, flags); + return -EAGAIN; + } + kref_get(&query->port->sm_ah->ref); + query->sm_ah = query->port->sm_ah; + spin_unlock_irqrestore(&query->port->ah_lock, flags); + + /* + * Always check if sm_ah has valid dlid assigned, + * before querying for class port info + */ + if ((rdma_query_ah(query->sm_ah->ah, &ah_attr) < 0) || + !rdma_is_valid_unicast_lid(&ah_attr)) { + kref_put(&query->sm_ah->ref, free_sm_ah); + 
return -EAGAIN; + } + query->mad_buf = ib_create_send_mad(query->port->agent, 1, + query->sm_ah->pkey_index, + 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, + gfp_mask, + ((query->flags & IB_SA_QUERY_OPA) ? + OPA_MGMT_BASE_VERSION : + IB_MGMT_BASE_VERSION)); + if (IS_ERR(query->mad_buf)) { + kref_put(&query->sm_ah->ref, free_sm_ah); + return -ENOMEM; + } + + query->mad_buf->ah = query->sm_ah->ah; + + return 0; +} + +static void free_mad(struct ib_sa_query *query) +{ + ib_free_send_mad(query->mad_buf); + kref_put(&query->sm_ah->ref, free_sm_ah); +} + +static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) +{ + struct ib_sa_mad *mad = query->mad_buf->mad; + unsigned long flags; + + memset(mad, 0, sizeof *mad); + + if (query->flags & IB_SA_QUERY_OPA) { + mad->mad_hdr.base_version = OPA_MGMT_BASE_VERSION; + mad->mad_hdr.class_version = OPA_SA_CLASS_VERSION; + } else { + mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION; + mad->mad_hdr.class_version = IB_SA_CLASS_VERSION; + } + mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + spin_lock_irqsave(&tid_lock, flags); + mad->mad_hdr.tid = + cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++); + spin_unlock_irqrestore(&tid_lock, flags); +} + +static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) +{ + bool preload = gfpflags_allow_blocking(gfp_mask); + unsigned long flags; + int ret, id; + + if (preload) + idr_preload(gfp_mask); + spin_lock_irqsave(&idr_lock, flags); + + id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT); + + spin_unlock_irqrestore(&idr_lock, flags); + if (preload) + idr_preload_end(); + if (id < 0) + return id; + + query->mad_buf->timeout_ms = timeout_ms; + query->mad_buf->context[0] = query; + query->id = id; + + if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && + (!(query->flags & IB_SA_QUERY_OPA))) { + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (!ib_nl_make_request(query, gfp_mask)) + return id; + } + ib_sa_disable_local_svc(query); + } + + ret = ib_post_send_mad(query->mad_buf, NULL); + if (ret) { + spin_lock_irqsave(&idr_lock, flags); + idr_remove(&query_idr, id); + spin_unlock_irqrestore(&idr_lock, flags); + } + + /* + * It's not safe to dereference query any more, because the + * send may already have completed and freed the query in + * another context. + */ + return ret ? ret : id; +} + +void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec) +{ + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec); +} +EXPORT_SYMBOL(ib_sa_unpack_path); + +void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute) +{ + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute); +} +EXPORT_SYMBOL(ib_sa_pack_path); + +static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client, + struct ib_device *device, + u8 port_num) +{ + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + unsigned long flags; + bool ret = false; + + if (!sa_dev) + return ret; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + spin_lock_irqsave(&port->classport_lock, flags); + if (!port->classport_info.valid) + goto ret; + + if (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_OPA) + ret = opa_get_cpi_capmask2(&port->classport_info.data.opa) & + OPA_CLASS_PORT_INFO_PR_SUPPORT; +ret: + spin_unlock_irqrestore(&port->classport_lock, flags); + return ret; +} + +enum opa_pr_supported { + PR_NOT_SUPPORTED, + PR_OPA_SUPPORTED, + PR_IB_SUPPORTED +}; + +/** + * Check if current PR query can be an OPA query. 
+ * Retuns PR_NOT_SUPPORTED if a path record query is not + * possible, PR_OPA_SUPPORTED if an OPA path record query + * is possible and PR_IB_SUPPORTED if an IB path record + * query is possible. + */ +static int opa_pr_query_possible(struct ib_sa_client *client, + struct ib_device *device, + u8 port_num, + struct sa_path_rec *rec) +{ + struct ib_port_attr port_attr; + + if (ib_query_port(device, port_num, &port_attr)) + return PR_NOT_SUPPORTED; + + if (ib_sa_opa_pathrecord_support(client, device, port_num)) + return PR_OPA_SUPPORTED; + + if (port_attr.lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) + return PR_NOT_SUPPORTED; + else + return PR_IB_SUPPORTED; +} + +static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_path_query *query = + container_of(sa_query, struct ib_sa_path_query, sa_query); + + if (mad) { + struct sa_path_rec rec; + + if (sa_query->flags & IB_SA_QUERY_OPA) { + ib_unpack(opa_path_rec_table, + ARRAY_SIZE(opa_path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_OPA; + query->callback(status, &rec, query->context); + } else { + ib_unpack(path_rec_table, + ARRAY_SIZE(path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_ndev(&rec, NULL); + sa_path_set_ifindex(&rec, 0); + sa_path_set_dmac_zero(&rec); + + if (query->conv_pr) { + struct sa_path_rec opa; + + memset(&opa, 0, sizeof(struct sa_path_rec)); + sa_convert_path_ib_to_opa(&opa, &rec); + query->callback(status, &opa, query->context); + } else { + query->callback(status, &rec, query->context); + } + } + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) +{ + struct ib_sa_path_query *query = + container_of(sa_query, struct ib_sa_path_query, sa_query); + + kfree(query->conv_pr); + kfree(query); +} + +/** + * ib_sa_path_rec_get - Start a Path get query + * @client:SA client + * @device:device to send query on + * @port_num: port number to send query on + * @rec:Path Record to send in query + * @comp_mask:component mask to send in query + * @timeout_ms:time to wait for response + * @gfp_mask:GFP mask to use for internal allocations + * @callback:function called when query completes, times out or is + * canceled + * @context:opaque user context passed to callback + * @sa_query:query context, used to cancel query + * + * Send a Path Record Get query to the SA to look up a path. The + * callback function will be called when the query completes (or + * fails); status is 0 for a successful response, -EINTR if the query + * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error + * occurred sending the query. The resp parameter of the callback is + * only valid if status is 0. + * + * If the return value of ib_sa_path_rec_get() is negative, it is an + * error code. Otherwise it is a query ID that can be used to cancel + * the query. 
+ */ +int ib_sa_path_rec_get(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_path_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + enum opa_pr_supported status; + int ret; + + if (!sa_dev) + return -ENODEV; + + if ((rec->rec_type != SA_PATH_REC_TYPE_IB) && + (rec->rec_type != SA_PATH_REC_TYPE_OPA)) + return -EINVAL; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { + status = opa_pr_query_possible(client, device, port_num, rec); + if (status == PR_NOT_SUPPORTED) { + ret = -EINVAL; + goto err1; + } else if (status == PR_OPA_SUPPORTED) { + query->sa_query.flags |= IB_SA_QUERY_OPA; + } else { + query->conv_pr = + kmalloc(sizeof(*query->conv_pr), gfp_mask); + if (!query->conv_pr) { + ret = -ENOMEM; + goto err1; + } + } + } + + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err2; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL; + query->sa_query.release = ib_sa_path_rec_release; + mad->mad_hdr.method = IB_MGMT_METHOD_GET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC); + mad->sa_hdr.comp_mask = comp_mask; + + if (query->sa_query.flags & IB_SA_QUERY_OPA) { + ib_pack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), + rec, mad->data); + } else if (query->conv_pr) { + sa_convert_path_opa_to_ib(query->conv_pr, rec); + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), + query->conv_pr, mad->data); + } else { + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), + rec, mad->data); + } + + *sa_query = &query->sa_query; + + query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE; + query->sa_query.mad_buf->context[1] = (query->conv_pr) ? 
+ query->conv_pr : rec; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err3; + + return ret; + +err3: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); +err2: + kfree(query->conv_pr); +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_path_rec_get); + +static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_service_query *query = + container_of(sa_query, struct ib_sa_service_query, sa_query); + + if (mad) { + struct ib_sa_service_rec rec; + + ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_service_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_service_query, sa_query)); +} + +/** + * ib_sa_service_rec_query - Start Service Record operation + * @client:SA client + * @device:device to send request on + * @port_num: port number to send request on + * @method:SA method - should be get, set, or delete + * @rec:Service Record to send in request + * @comp_mask:component mask to send in request + * @timeout_ms:time to wait for response + * @gfp_mask:GFP mask to use for internal allocations + * @callback:function called when request completes, times out or is + * canceled + * @context:opaque user context passed to callback + * @sa_query:request context, used to cancel request + * + * Send a Service Record set/get/delete to the SA to register, + * unregister or query a service record. + * The callback function will be called when the request completes (or + * fails); status is 0 for a successful response, -EINTR if the query + * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error + * occurred sending the query. The resp parameter of the callback is + * only valid if status is 0. + * + * If the return value of ib_sa_service_rec_query() is negative, it is an + * error code. Otherwise it is a request ID that can be used to cancel + * the query. + */ +int ib_sa_service_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, u8 method, + struct ib_sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_service_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_service_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + if (method != IB_MGMT_METHOD_GET && + method != IB_MGMT_METHOD_SET && + method != IB_SA_METHOD_DELETE) + return -EINVAL; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.callback = callback ? 
ib_sa_service_rec_callback : NULL; + query->sa_query.release = ib_sa_service_rec_release; + mad->mad_hdr.method = method; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), + rec, mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_service_rec_query); + +static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_mcmember_query *query = + container_of(sa_query, struct ib_sa_mcmember_query, sa_query); + + if (mad) { + struct ib_sa_mcmember_rec rec; + + ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query)); +} + +int ib_sa_mcmember_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + u8 method, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_mcmember_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_mcmember_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.callback = callback ? 
ib_sa_mcmember_rec_callback : NULL; + query->sa_query.release = ib_sa_mcmember_rec_release; + mad->mad_hdr.method = method; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), + rec, mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} + +/* Support GuidInfoRecord */ +static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_guidinfo_query *query = + container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); + + if (mad) { + struct ib_sa_guidinfo_rec rec; + + ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query)); +} + +int ib_sa_guid_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_guidinfo_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + if (method != IB_MGMT_METHOD_GET && + method != IB_MGMT_METHOD_SET && + method != IB_SA_METHOD_DELETE) { + return -EINVAL; + } + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.callback = callback ? 
ib_sa_guidinfo_rec_callback : NULL; + query->sa_query.release = ib_sa_guidinfo_rec_release; + + mad->mad_hdr.method = method; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec, + mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_guid_info_rec_query); + +bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, + struct ib_device *device, + u8 port_num) +{ + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + bool ret = false; + unsigned long flags; + + if (!sa_dev) + return ret; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + + spin_lock_irqsave(&port->classport_lock, flags); + if ((port->classport_info.valid) && + (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_IB)) + ret = ib_get_cpi_capmask2(&port->classport_info.data.ib) + & IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT; + spin_unlock_irqrestore(&port->classport_lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_sa_sendonly_fullmem_support); + +struct ib_classport_info_context { + struct completion done; + struct ib_sa_query *sa_query; +}; + +static void ib_classportinfo_cb(void *context) +{ + struct ib_classport_info_context *cb_ctx = context; + + complete(&cb_ctx->done); +} + +static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + unsigned long flags; + struct ib_sa_classport_info_query *query = + container_of(sa_query, struct ib_sa_classport_info_query, sa_query); + struct ib_sa_classport_cache *info = &sa_query->port->classport_info; + + if (mad) { + if (sa_query->flags & IB_SA_QUERY_OPA) { + struct opa_class_port_info rec; + + ib_unpack(opa_classport_info_rec_table, + ARRAY_SIZE(opa_classport_info_rec_table), + mad->data, &rec); + + spin_lock_irqsave(&sa_query->port->classport_lock, + flags); + if (!status && !info->valid) { + memcpy(&info->data.opa, &rec, + sizeof(info->data.opa)); + + info->valid = true; + info->data.type = RDMA_CLASS_PORT_INFO_OPA; + } + spin_unlock_irqrestore(&sa_query->port->classport_lock, + flags); + + } else { + struct ib_class_port_info rec; + + ib_unpack(ib_classport_info_rec_table, + ARRAY_SIZE(ib_classport_info_rec_table), + mad->data, &rec); + + spin_lock_irqsave(&sa_query->port->classport_lock, + flags); + if (!status && !info->valid) { + memcpy(&info->data.ib, &rec, + sizeof(info->data.ib)); + + info->valid = true; + info->data.type = RDMA_CLASS_PORT_INFO_IB; + } + spin_unlock_irqrestore(&sa_query->port->classport_lock, + flags); + } + } + query->callback(query->context); +} + +static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_classport_info_query, + sa_query)); +} + +static int ib_sa_classport_info_rec_query(struct ib_sa_port *port, + int timeout_ms, + void (*callback)(void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_mad_agent *agent; + struct ib_sa_classport_info_query *query; + struct ib_sa_mad *mad; + gfp_t gfp_mask = GFP_KERNEL; + int ret; + + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + 
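/* Issue the query in OPA SA format when this port's address handles are OPA-capable. */ +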
query->sa_query.flags |= rdma_cap_opa_ah(port->agent->device, + port->port_num) ? + IB_SA_QUERY_OPA : 0; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err_free; + + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.callback = ib_sa_classport_info_rec_callback; + query->sa_query.release = ib_sa_classport_info_rec_release; + mad->mad_hdr.method = IB_MGMT_METHOD_GET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO); + mad->sa_hdr.comp_mask = 0; + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err_free_mad; + + return ret; + +err_free_mad: + *sa_query = NULL; + free_mad(&query->sa_query); + +err_free: + kfree(query); + return ret; +} + +static void update_ib_cpi(struct work_struct *work) +{ + struct ib_sa_port *port = + container_of(work, struct ib_sa_port, ib_cpi_work.work); + struct ib_classport_info_context *cb_context; + unsigned long flags; + int ret; + + /* If the classport info is valid, nothing + * to do here. + */ + spin_lock_irqsave(&port->classport_lock, flags); + if (port->classport_info.valid) { + spin_unlock_irqrestore(&port->classport_lock, flags); + return; + } + spin_unlock_irqrestore(&port->classport_lock, flags); + + cb_context = kmalloc(sizeof(*cb_context), GFP_KERNEL); + if (!cb_context) + goto err_nomem; + + init_completion(&cb_context->done); + + ret = ib_sa_classport_info_rec_query(port, 3000, + ib_classportinfo_cb, cb_context, + &cb_context->sa_query); + if (ret < 0) + goto free_cb_err; + wait_for_completion(&cb_context->done); +free_cb_err: + kfree(cb_context); + spin_lock_irqsave(&port->classport_lock, flags); + + /* If the classport info is still not valid, the query should have + * failed for some reason. Retry issuing the query + */ + if (!port->classport_info.valid) { + port->classport_info.retry_cnt++; + if (port->classport_info.retry_cnt <= + IB_SA_CPI_MAX_RETRY_CNT) { + unsigned long delay = + msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT); + + queue_delayed_work(ib_wq, &port->ib_cpi_work, delay); + } + } + spin_unlock_irqrestore(&port->classport_lock, flags); + +err_nomem: + return; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_sa_query *query = mad_send_wc->send_buf->context[0]; + unsigned long flags; + + if (query->callback) + switch (mad_send_wc->status) { + case IB_WC_SUCCESS: + /* No callback -- already got recv */ + break; + case IB_WC_RESP_TIMEOUT_ERR: + query->callback(query, -ETIMEDOUT, NULL); + break; + case IB_WC_WR_FLUSH_ERR: + query->callback(query, -EINTR, NULL); + break; + default: + query->callback(query, -EIO, NULL); + break; + } + + spin_lock_irqsave(&idr_lock, flags); + idr_remove(&query_idr, query->id); + spin_unlock_irqrestore(&idr_lock, flags); + + free_mad(query); + if (query->client) + ib_sa_client_put(query->client); + query->release(query); +} + +static void recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_sa_query *query; + + if (!send_buf) + return; + + query = send_buf->context[0]; + if (query->callback) { + if (mad_recv_wc->wc->status == IB_WC_SUCCESS) + query->callback(query, + mad_recv_wc->recv_buf.mad->mad_hdr.status ? 
+ -EINVAL : 0, + (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); + else + query->callback(query, -EIO, NULL); + } + + ib_free_recv_mad(mad_recv_wc); +} + +static void update_sm_ah(struct work_struct *work) +{ + struct ib_sa_port *port = + container_of(work, struct ib_sa_port, update_task); + struct ib_sa_sm_ah *new_ah; + struct ib_port_attr port_attr; + struct rdma_ah_attr ah_attr; + + if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { + pr_warn("Couldn't query port\n"); + return; + } + + new_ah = kmalloc(sizeof(*new_ah), GFP_KERNEL); + if (!new_ah) + return; + + kref_init(&new_ah->ref); + new_ah->src_path_mask = (1 << port_attr.lmc) - 1; + + new_ah->pkey_index = 0; + if (ib_find_pkey(port->agent->device, port->port_num, + IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) + pr_err("Couldn't find index for default PKey\n"); + + memset(&ah_attr, 0, sizeof(ah_attr)); + ah_attr.type = rdma_ah_find_type(port->agent->device, + port->port_num); + rdma_ah_set_dlid(&ah_attr, port_attr.sm_lid); + rdma_ah_set_sl(&ah_attr, port_attr.sm_sl); + rdma_ah_set_port_num(&ah_attr, port->port_num); + if (port_attr.grh_required) { + if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA) { + rdma_ah_set_make_grd(&ah_attr, true); + } else { + rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); + rdma_ah_set_subnet_prefix(&ah_attr, + cpu_to_be64(port_attr.subnet_prefix)); + rdma_ah_set_interface_id(&ah_attr, + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + } + } + + new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr); + if (IS_ERR(new_ah->ah)) { + pr_warn("Couldn't create new SM AH\n"); + kfree(new_ah); + return; + } + + spin_lock_irq(&port->ah_lock); + if (port->sm_ah) + kref_put(&port->sm_ah->ref, free_sm_ah); + port->sm_ah = new_ah; + spin_unlock_irq(&port->ah_lock); +} + +static void ib_sa_event(struct ib_event_handler *handler, + struct ib_event *event) +{ + if (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER) { + unsigned long flags; + struct ib_sa_device *sa_dev = + container_of(handler, typeof(*sa_dev), event_handler); + u8 port_num = event->element.port_num - sa_dev->start_port; + struct ib_sa_port *port = &sa_dev->port[port_num]; + + if (!rdma_cap_ib_sa(handler->device, port->port_num)) + return; + + spin_lock_irqsave(&port->ah_lock, flags); + if (port->sm_ah) + kref_put(&port->sm_ah->ref, free_sm_ah); + port->sm_ah = NULL; + spin_unlock_irqrestore(&port->ah_lock, flags); + + if (event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PORT_ACTIVE) { + unsigned long delay = + msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT); + + spin_lock_irqsave(&port->classport_lock, flags); + port->classport_info.valid = false; + port->classport_info.retry_cnt = 0; + spin_unlock_irqrestore(&port->classport_lock, flags); + queue_delayed_work(ib_wq, + &port->ib_cpi_work, delay); + } + queue_work(ib_wq, &sa_dev->port[port_num].update_task); + } +} + +static void ib_sa_add_one(struct ib_device *device) +{ + struct ib_sa_device *sa_dev; + int s, e, i; + int count = 0; + + s = rdma_start_port(device); + e = rdma_end_port(device); + + sa_dev = kzalloc(sizeof *sa_dev + + (e - s + 1) * sizeof (struct ib_sa_port), + GFP_KERNEL); + if (!sa_dev) + return; + + sa_dev->start_port = s; + sa_dev->end_port = e; + + for (i = 0; i <= e - s; ++i) { + 
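/* Per-port setup: init locks, register a GSI MAD agent, and prepare the deferred SM AH and classport-info update work; ports without IB SA support are skipped. */ +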
spin_lock_init(&sa_dev->port[i].ah_lock); + if (!rdma_cap_ib_sa(device, i + 1)) + continue; + + sa_dev->port[i].sm_ah = NULL; + sa_dev->port[i].port_num = i + s; + + spin_lock_init(&sa_dev->port[i].classport_lock); + sa_dev->port[i].classport_info.valid = false; + + sa_dev->port[i].agent = + ib_register_mad_agent(device, i + s, IB_QPT_GSI, + NULL, 0, send_handler, + recv_handler, sa_dev, 0); + if (IS_ERR(sa_dev->port[i].agent)) + goto err; + + INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); + INIT_DELAYED_WORK(&sa_dev->port[i].ib_cpi_work, + update_ib_cpi); + + count++; + } + + if (!count) + goto free; + + ib_set_client_data(device, &sa_client, sa_dev); + + /* + * We register our event handler after everything is set up, + * and then update our cached info after the event handler is + * registered to avoid any problems if a port changes state + * during our initialization. + */ + + INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); + ib_register_event_handler(&sa_dev->event_handler); + + for (i = 0; i <= e - s; ++i) { + if (rdma_cap_ib_sa(device, i + 1)) + update_sm_ah(&sa_dev->port[i].update_task); + } + + return; + +err: + while (--i >= 0) { + if (rdma_cap_ib_sa(device, i + 1)) + ib_unregister_mad_agent(sa_dev->port[i].agent); + } +free: + kfree(sa_dev); + return; +} + +static void ib_sa_remove_one(struct ib_device *device, void *client_data) +{ + struct ib_sa_device *sa_dev = client_data; + int i; + + if (!sa_dev) + return; + + ib_unregister_event_handler(&sa_dev->event_handler); + flush_workqueue(ib_wq); + + for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { + if (rdma_cap_ib_sa(device, i + 1)) { + cancel_delayed_work_sync(&sa_dev->port[i].ib_cpi_work); + ib_unregister_mad_agent(sa_dev->port[i].agent); + if (sa_dev->port[i].sm_ah) + kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); + } + + } + + kfree(sa_dev); +} + +int ib_sa_init(void) +{ + int ret; + + get_random_bytes(&tid, sizeof tid); + + atomic_set(&ib_nl_sa_request_seq, 0); + + ret = ib_register_client(&sa_client); + if (ret) { + pr_err("Couldn't register ib_sa client\n"); + goto err1; + } + + ret = mcast_init(); + if (ret) { + pr_err("Couldn't initialize multicast handling\n"); + goto err2; + } + + ib_nl_wq = alloc_ordered_workqueue("ib_nl_sa_wq", WQ_MEM_RECLAIM); + if (!ib_nl_wq) { + ret = -ENOMEM; + goto err3; + } + + INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout); + + return 0; + +err3: + mcast_cleanup(); +err2: + ib_unregister_client(&sa_client); +err1: + return ret; +} + +void ib_sa_cleanup(void) +{ + cancel_delayed_work(&ib_nl_timed_work); + flush_workqueue(ib_nl_wq); + destroy_workqueue(ib_nl_wq); + mcast_cleanup(); + ib_unregister_client(&sa_client); + idr_destroy(&query_idr); +} diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c new file mode 100644 index 0000000..b61dda6 --- /dev/null +++ b/drivers/infiniband/core/security.c @@ -0,0 +1,755 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef CONFIG_SECURITY_INFINIBAND + +#include +#include +#include + +#include +#include +#include "core_priv.h" +#include "mad_priv.h" + +static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp) +{ + struct pkey_index_qp_list *pkey = NULL; + struct pkey_index_qp_list *tmp_pkey; + struct ib_device *dev = pp->sec->dev; + + spin_lock(&dev->port_pkey_list[pp->port_num].list_lock); + list_for_each_entry(tmp_pkey, + &dev->port_pkey_list[pp->port_num].pkey_list, + pkey_index_list) { + if (tmp_pkey->pkey_index == pp->pkey_index) { + pkey = tmp_pkey; + break; + } + } + spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock); + return pkey; +} + +static int get_pkey_and_subnet_prefix(struct ib_port_pkey *pp, + u16 *pkey, + u64 *subnet_prefix) +{ + struct ib_device *dev = pp->sec->dev; + int ret; + + ret = ib_get_cached_pkey(dev, pp->port_num, pp->pkey_index, pkey); + if (ret) + return ret; + + ret = ib_get_cached_subnet_prefix(dev, pp->port_num, subnet_prefix); + + return ret; +} + +static int enforce_qp_pkey_security(u16 pkey, + u64 subnet_prefix, + struct ib_qp_security *qp_sec) +{ + struct ib_qp_security *shared_qp_sec; + int ret; + + ret = security_ib_pkey_access(qp_sec->security, subnet_prefix, pkey); + if (ret) + return ret; + + list_for_each_entry(shared_qp_sec, + &qp_sec->shared_qp_list, + shared_qp_list) { + ret = security_ib_pkey_access(shared_qp_sec->security, + subnet_prefix, + pkey); + if (ret) + return ret; + } + return 0; +} + +/* The caller of this function must hold the QP security + * mutex of the QP of the security structure in *pps. + * + * It takes separate ports_pkeys and security structure + * because in some cases the pps will be for a new settings + * or the pps will be for the real QP and security structure + * will be for a shared QP. 
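+ *
+ * As an illustration, ib_security_modify_qp() below does, in
+ * simplified form (this is a sketch of the existing caller, not
+ * additional code):
+ *
+ *	mutex_lock(&real_qp->qp_sec->mutex);
+ *	new_pps = get_new_pps(real_qp, qp_attr, qp_attr_mask);
+ *	ret = check_qp_port_pkey_settings(new_pps, real_qp->qp_sec);
+ *	mutex_unlock(&real_qp->qp_sec->mutex);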
+ */ +static int check_qp_port_pkey_settings(struct ib_ports_pkeys *pps, + struct ib_qp_security *sec) +{ + u64 subnet_prefix; + u16 pkey; + int ret = 0; + + if (!pps) + return 0; + + if (pps->main.state != IB_PORT_PKEY_NOT_VALID) { + ret = get_pkey_and_subnet_prefix(&pps->main, + &pkey, + &subnet_prefix); + if (ret) + return ret; + + ret = enforce_qp_pkey_security(pkey, + subnet_prefix, + sec); + if (ret) + return ret; + } + + if (pps->alt.state != IB_PORT_PKEY_NOT_VALID) { + ret = get_pkey_and_subnet_prefix(&pps->alt, + &pkey, + &subnet_prefix); + if (ret) + return ret; + + ret = enforce_qp_pkey_security(pkey, + subnet_prefix, + sec); + } + + return ret; +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static void qp_to_error(struct ib_qp_security *sec) +{ + struct ib_qp_security *shared_qp_sec; + struct ib_qp_attr attr = { + .qp_state = IB_QPS_ERR + }; + struct ib_event event = { + .event = IB_EVENT_QP_FATAL + }; + + /* If the QP is in the process of being destroyed + * the qp pointer in the security structure is + * undefined. It cannot be modified now. + */ + if (sec->destroying) + return; + + ib_modify_qp(sec->qp, + &attr, + IB_QP_STATE); + + if (sec->qp->event_handler && sec->qp->qp_context) { + event.element.qp = sec->qp; + sec->qp->event_handler(&event, + sec->qp->qp_context); + } + + list_for_each_entry(shared_qp_sec, + &sec->shared_qp_list, + shared_qp_list) { + struct ib_qp *qp = shared_qp_sec->qp; + + if (qp->event_handler && qp->qp_context) { + event.element.qp = qp; + event.device = qp->device; + qp->event_handler(&event, + qp->qp_context); + } + } +} + +static inline void check_pkey_qps(struct pkey_index_qp_list *pkey, + struct ib_device *device, + u8 port_num, + u64 subnet_prefix) +{ + struct ib_port_pkey *pp, *tmp_pp; + bool comp; + LIST_HEAD(to_error_list); + u16 pkey_val; + + if (!ib_get_cached_pkey(device, + port_num, + pkey->pkey_index, + &pkey_val)) { + spin_lock(&pkey->qp_list_lock); + list_for_each_entry(pp, &pkey->qp_list, qp_list) { + if (atomic_read(&pp->sec->error_list_count)) + continue; + + if (enforce_qp_pkey_security(pkey_val, + subnet_prefix, + pp->sec)) { + atomic_inc(&pp->sec->error_list_count); + list_add(&pp->to_error_list, + &to_error_list); + } + } + spin_unlock(&pkey->qp_list_lock); + } + + list_for_each_entry_safe(pp, + tmp_pp, + &to_error_list, + to_error_list) { + mutex_lock(&pp->sec->mutex); + qp_to_error(pp->sec); + list_del(&pp->to_error_list); + atomic_dec(&pp->sec->error_list_count); + comp = pp->sec->destroying; + mutex_unlock(&pp->sec->mutex); + + if (comp) + complete(&pp->sec->error_complete); + } +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static int port_pkey_list_insert(struct ib_port_pkey *pp) +{ + struct pkey_index_qp_list *tmp_pkey; + struct pkey_index_qp_list *pkey; + struct ib_device *dev; + u8 port_num = pp->port_num; + int ret = 0; + + if (pp->state != IB_PORT_PKEY_VALID) + return 0; + + dev = pp->sec->dev; + + pkey = get_pkey_idx_qp_list(pp); + + if (!pkey) { + bool found = false; + + pkey = kzalloc(sizeof(*pkey), GFP_KERNEL); + if (!pkey) + return -ENOMEM; + + spin_lock(&dev->port_pkey_list[port_num].list_lock); + /* Check for the PKey again. A racing process may + * have created it. 
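+ * The new entry is allocated before taking list_lock and the list is
+ * re-walked under the lock; if another thread won the race, the fresh
+ * allocation is freed and the existing entry is used instead.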
+ */ + list_for_each_entry(tmp_pkey, + &dev->port_pkey_list[port_num].pkey_list, + pkey_index_list) { + if (tmp_pkey->pkey_index == pp->pkey_index) { + kfree(pkey); + pkey = tmp_pkey; + found = true; + break; + } + } + + if (!found) { + pkey->pkey_index = pp->pkey_index; + spin_lock_init(&pkey->qp_list_lock); + INIT_LIST_HEAD(&pkey->qp_list); + list_add(&pkey->pkey_index_list, + &dev->port_pkey_list[port_num].pkey_list); + } + spin_unlock(&dev->port_pkey_list[port_num].list_lock); + } + + spin_lock(&pkey->qp_list_lock); + list_add(&pp->qp_list, &pkey->qp_list); + spin_unlock(&pkey->qp_list_lock); + + pp->state = IB_PORT_PKEY_LISTED; + + return ret; +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static void port_pkey_list_remove(struct ib_port_pkey *pp) +{ + struct pkey_index_qp_list *pkey; + + if (pp->state != IB_PORT_PKEY_LISTED) + return; + + pkey = get_pkey_idx_qp_list(pp); + + spin_lock(&pkey->qp_list_lock); + list_del(&pp->qp_list); + spin_unlock(&pkey->qp_list_lock); + + /* The setting may still be valid, i.e. after + * a destroy has failed for example. + */ + pp->state = IB_PORT_PKEY_VALID; +} + +static void destroy_qp_security(struct ib_qp_security *sec) +{ + security_ib_free_security(sec->security); + kfree(sec->ports_pkeys); + kfree(sec); +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp, + const struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + struct ib_ports_pkeys *new_pps; + struct ib_ports_pkeys *qp_pps = qp->qp_sec->ports_pkeys; + + new_pps = kzalloc(sizeof(*new_pps), GFP_KERNEL); + if (!new_pps) + return NULL; + + if (qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) { + if (!qp_pps) { + new_pps->main.port_num = qp_attr->port_num; + new_pps->main.pkey_index = qp_attr->pkey_index; + } else { + new_pps->main.port_num = (qp_attr_mask & IB_QP_PORT) ? + qp_attr->port_num : + qp_pps->main.port_num; + + new_pps->main.pkey_index = + (qp_attr_mask & IB_QP_PKEY_INDEX) ? 
+ qp_attr->pkey_index : + qp_pps->main.pkey_index; + } + new_pps->main.state = IB_PORT_PKEY_VALID; + } else if (qp_pps) { + new_pps->main.port_num = qp_pps->main.port_num; + new_pps->main.pkey_index = qp_pps->main.pkey_index; + if (qp_pps->main.state != IB_PORT_PKEY_NOT_VALID) + new_pps->main.state = IB_PORT_PKEY_VALID; + } + + if (qp_attr_mask & IB_QP_ALT_PATH) { + new_pps->alt.port_num = qp_attr->alt_port_num; + new_pps->alt.pkey_index = qp_attr->alt_pkey_index; + new_pps->alt.state = IB_PORT_PKEY_VALID; + } else if (qp_pps) { + new_pps->alt.port_num = qp_pps->alt.port_num; + new_pps->alt.pkey_index = qp_pps->alt.pkey_index; + if (qp_pps->alt.state != IB_PORT_PKEY_NOT_VALID) + new_pps->alt.state = IB_PORT_PKEY_VALID; + } + + new_pps->main.sec = qp->qp_sec; + new_pps->alt.sec = qp->qp_sec; + return new_pps; +} + +int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev) +{ + struct ib_qp *real_qp = qp->real_qp; + int ret; + + ret = ib_create_qp_security(qp, dev); + + if (ret) + return ret; + + if (!qp->qp_sec) + return 0; + + mutex_lock(&real_qp->qp_sec->mutex); + ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys, + qp->qp_sec); + + if (ret) + goto ret; + + if (qp != real_qp) + list_add(&qp->qp_sec->shared_qp_list, + &real_qp->qp_sec->shared_qp_list); +ret: + mutex_unlock(&real_qp->qp_sec->mutex); + if (ret) + destroy_qp_security(qp->qp_sec); + + return ret; +} + +void ib_close_shared_qp_security(struct ib_qp_security *sec) +{ + struct ib_qp *real_qp = sec->qp->real_qp; + + mutex_lock(&real_qp->qp_sec->mutex); + list_del(&sec->shared_qp_list); + mutex_unlock(&real_qp->qp_sec->mutex); + + destroy_qp_security(sec); +} + +int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) +{ + u8 i = rdma_start_port(dev); + bool is_ib = false; + int ret; + + while (i <= rdma_end_port(dev) && !is_ib) + is_ib = rdma_protocol_ib(dev, i++); + + /* If this isn't an IB device don't create the security context */ + if (!is_ib) + return 0; + + qp->qp_sec = kzalloc(sizeof(*qp->qp_sec), GFP_KERNEL); + if (!qp->qp_sec) + return -ENOMEM; + + qp->qp_sec->qp = qp; + qp->qp_sec->dev = dev; + mutex_init(&qp->qp_sec->mutex); + INIT_LIST_HEAD(&qp->qp_sec->shared_qp_list); + atomic_set(&qp->qp_sec->error_list_count, 0); + init_completion(&qp->qp_sec->error_complete); + ret = security_ib_alloc_security(&qp->qp_sec->security); + if (ret) { + kfree(qp->qp_sec); + qp->qp_sec = NULL; + } + + return ret; +} +EXPORT_SYMBOL(ib_create_qp_security); + +void ib_destroy_qp_security_begin(struct ib_qp_security *sec) +{ + /* Return if not IB */ + if (!sec) + return; + + mutex_lock(&sec->mutex); + + /* Remove the QP from the lists so it won't get added to + * a to_error_list during the destroy process. + */ + if (sec->ports_pkeys) { + port_pkey_list_remove(&sec->ports_pkeys->main); + port_pkey_list_remove(&sec->ports_pkeys->alt); + } + + /* If the QP is already in one or more of those lists + * the destroying flag will ensure the to error flow + * doesn't operate on an undefined QP. + */ + sec->destroying = true; + + /* Record the error list count to know how many completions + * to wait for. + */ + sec->error_comps_pending = atomic_read(&sec->error_list_count); + + mutex_unlock(&sec->mutex); +} + +void ib_destroy_qp_security_abort(struct ib_qp_security *sec) +{ + int ret; + int i; + + /* Return if not IB */ + if (!sec) + return; + + /* If a concurrent cache update is in progress this + * QP security could be marked for an error state + * transition. Wait for this to complete. 
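+ * error_comps_pending was snapshotted from error_list_count in
+ * ib_destroy_qp_security_begin(), so one completion is awaited per
+ * port/pkey entry that was already queued for the error transition.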
+ */ + for (i = 0; i < sec->error_comps_pending; i++) + wait_for_completion(&sec->error_complete); + + mutex_lock(&sec->mutex); + sec->destroying = false; + + /* Restore the position in the lists and verify + * access is still allowed in case a cache update + * occurred while attempting to destroy. + * + * Because these setting were listed already + * and removed during ib_destroy_qp_security_begin + * we know the pkey_index_qp_list for the PKey + * already exists so port_pkey_list_insert won't fail. + */ + if (sec->ports_pkeys) { + port_pkey_list_insert(&sec->ports_pkeys->main); + port_pkey_list_insert(&sec->ports_pkeys->alt); + } + + ret = check_qp_port_pkey_settings(sec->ports_pkeys, sec); + if (ret) + qp_to_error(sec); + + mutex_unlock(&sec->mutex); +} + +void ib_destroy_qp_security_end(struct ib_qp_security *sec) +{ + int i; + + /* Return if not IB */ + if (!sec) + return; + + /* If a concurrent cache update is occurring we must + * wait until this QP security structure is processed + * in the QP to error flow before destroying it because + * the to_error_list is in use. + */ + for (i = 0; i < sec->error_comps_pending; i++) + wait_for_completion(&sec->error_complete); + + destroy_qp_security(sec); +} + +void ib_security_cache_change(struct ib_device *device, + u8 port_num, + u64 subnet_prefix) +{ + struct pkey_index_qp_list *pkey; + + list_for_each_entry(pkey, + &device->port_pkey_list[port_num].pkey_list, + pkey_index_list) { + check_pkey_qps(pkey, + device, + port_num, + subnet_prefix); + } +} + +void ib_security_destroy_port_pkey_list(struct ib_device *device) +{ + struct pkey_index_qp_list *pkey, *tmp_pkey; + int i; + + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + spin_lock(&device->port_pkey_list[i].list_lock); + list_for_each_entry_safe(pkey, + tmp_pkey, + &device->port_pkey_list[i].pkey_list, + pkey_index_list) { + list_del(&pkey->pkey_index_list); + kfree(pkey); + } + spin_unlock(&device->port_pkey_list[i].list_lock); + } +} + +int ib_security_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata) +{ + int ret = 0; + struct ib_ports_pkeys *tmp_pps; + struct ib_ports_pkeys *new_pps = NULL; + struct ib_qp *real_qp = qp->real_qp; + bool special_qp = (real_qp->qp_type == IB_QPT_SMI || + real_qp->qp_type == IB_QPT_GSI || + real_qp->qp_type >= IB_QPT_RESERVED1); + bool pps_change = ((qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) || + (qp_attr_mask & IB_QP_ALT_PATH)); + + WARN_ONCE((qp_attr_mask & IB_QP_PORT && + rdma_protocol_ib(real_qp->device, qp_attr->port_num) && + !real_qp->qp_sec), + "%s: QP security is not initialized for IB QP: %d\n", + __func__, real_qp->qp_num); + + /* The port/pkey settings are maintained only for the real QP. Open + * handles on the real QP will be in the shared_qp_list. When + * enforcing security on the real QP all the shared QPs will be + * checked as well. + */ + + if (pps_change && !special_qp && real_qp->qp_sec) { + mutex_lock(&real_qp->qp_sec->mutex); + new_pps = get_new_pps(real_qp, + qp_attr, + qp_attr_mask); + if (!new_pps) { + mutex_unlock(&real_qp->qp_sec->mutex); + return -ENOMEM; + } + /* Add this QP to the lists for the new port + * and pkey settings before checking for permission + * in case there is a concurrent cache update + * occurring. Walking the list for a cache change + * doesn't acquire the security mutex unless it's + * sending the QP to error. 
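+ * If the inserts or the permission check fail, the new entries are
+ * unwound below via port_pkey_list_remove() before new_pps is freed.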
+ */ + ret = port_pkey_list_insert(&new_pps->main); + + if (!ret) + ret = port_pkey_list_insert(&new_pps->alt); + + if (!ret) + ret = check_qp_port_pkey_settings(new_pps, + real_qp->qp_sec); + } + + if (!ret) + ret = real_qp->device->modify_qp(real_qp, + qp_attr, + qp_attr_mask, + udata); + + if (new_pps) { + /* Clean up the lists and free the appropriate + * ports_pkeys structure. + */ + if (ret) { + tmp_pps = new_pps; + } else { + tmp_pps = real_qp->qp_sec->ports_pkeys; + real_qp->qp_sec->ports_pkeys = new_pps; + } + + if (tmp_pps) { + port_pkey_list_remove(&tmp_pps->main); + port_pkey_list_remove(&tmp_pps->alt); + } + kfree(tmp_pps); + mutex_unlock(&real_qp->qp_sec->mutex); + } + return ret; +} + +static int ib_security_pkey_access(struct ib_device *dev, + u8 port_num, + u16 pkey_index, + void *sec) +{ + u64 subnet_prefix; + u16 pkey; + int ret; + + if (!rdma_protocol_ib(dev, port_num)) + return 0; + + ret = ib_get_cached_pkey(dev, port_num, pkey_index, &pkey); + if (ret) + return ret; + + ret = ib_get_cached_subnet_prefix(dev, port_num, &subnet_prefix); + + if (ret) + return ret; + + return security_ib_pkey_access(sec, subnet_prefix, pkey); +} + +static int ib_mad_agent_security_change(struct notifier_block *nb, + unsigned long event, + void *data) +{ + struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb); + + if (event != LSM_POLICY_CHANGE) + return NOTIFY_DONE; + + ag->smp_allowed = !security_ib_endport_manage_subnet(ag->security, + ag->device->name, + ag->port_num); + + return NOTIFY_OK; +} + +int ib_mad_agent_security_setup(struct ib_mad_agent *agent, + enum ib_qp_type qp_type) +{ + int ret; + + if (!rdma_protocol_ib(agent->device, agent->port_num)) + return 0; + + ret = security_ib_alloc_security(&agent->security); + if (ret) + return ret; + + if (qp_type != IB_QPT_SMI) + return 0; + + ret = security_ib_endport_manage_subnet(agent->security, + agent->device->name, + agent->port_num); + if (ret) + return ret; + + agent->lsm_nb.notifier_call = ib_mad_agent_security_change; + ret = register_lsm_notifier(&agent->lsm_nb); + if (ret) + return ret; + + agent->smp_allowed = true; + agent->lsm_nb_reg = true; + return 0; +} + +void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) +{ + if (!rdma_protocol_ib(agent->device, agent->port_num)) + return; + + security_ib_free_security(agent->security); + if (agent->lsm_nb_reg) + unregister_lsm_notifier(&agent->lsm_nb); +} + +int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) +{ + if (!rdma_protocol_ib(map->agent.device, map->agent.port_num)) + return 0; + + if (map->agent.qp->qp_type == IB_QPT_SMI) { + if (!map->agent.smp_allowed) + return -EACCES; + return 0; + } + + return ib_security_pkey_access(map->agent.device, + map->agent.port_num, + pkey_index, + map->agent.security); +} + +#endif /* CONFIG_SECURITY_INFINIBAND */ diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c new file mode 100644 index 0000000..f19b238 --- /dev/null +++ b/drivers/infiniband/core/smi.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include "smi.h" +#include "opa_smi.h" + +static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num, + u8 *hop_ptr, u8 hop_cnt, + const u8 *initial_path, + const u8 *return_path, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + + if (!direction) { + /* C14-9:1 */ + if (hop_cnt && *hop_ptr == 0) { + (*hop_ptr)++; + return (initial_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:2 */ + if (*hop_ptr && *hop_ptr < hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + /* return_path set when received */ + (*hop_ptr)++; + return (initial_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == hop_cnt) { + /* return_path set when received */ + (*hop_ptr)++; + return (is_switch || + dr_dlid_is_permissive ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + /* C14-9:5 -- Fail unreasonable hop pointer */ + return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + + } else { + /* C14-13:1 */ + if (hop_cnt && *hop_ptr == hop_cnt + 1) { + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:2 */ + if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:3 -- at the end of the DR segment of path */ + if (*hop_ptr == 1) { + (*hop_ptr)--; + /* C14-13:3 -- SMPs destined for SM shouldn't be here */ + return (is_switch || + dr_slid_is_permissive ? 
+ IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */ + if (*hop_ptr == 0) + return IB_SMI_HANDLE; + + /* C14-13:5 -- Check for unreasonable hop pointer */ + return IB_SMI_DISCARD; + } +} + +/* + * Fixup a directed route SMP for sending + * Return IB_SMI_DISCARD if the SMP should be discarded + */ +enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, + bool is_switch, int port_num) +{ + return __smi_handle_dr_smp_send(is_switch, port_num, + &smp->hop_ptr, smp->hop_cnt, + smp->initial_path, + smp->return_path, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, + bool is_switch, int port_num) +{ + return __smi_handle_dr_smp_send(is_switch, port_num, + &smp->hop_ptr, smp->hop_cnt, + smp->route.dr.initial_path, + smp->route.dr.return_path, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} + +static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num, + int phys_port_cnt, + u8 *hop_ptr, u8 hop_cnt, + const u8 *initial_path, + u8 *return_path, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + + if (!direction) { + /* C14-9:1 -- sender should have incremented hop_ptr */ + if (hop_cnt && *hop_ptr == 0) + return IB_SMI_DISCARD; + + /* C14-9:2 -- intermediate hop */ + if (*hop_ptr && *hop_ptr < hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + return_path[*hop_ptr] = port_num; + /* hop_ptr updated when sending */ + return (initial_path[*hop_ptr+1] <= phys_port_cnt ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == hop_cnt) { + if (hop_cnt) + return_path[*hop_ptr] = port_num; + /* hop_ptr updated when sending */ + + return (is_switch || + dr_dlid_is_permissive ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + /* C14-9:5 -- fail unreasonable hop pointer */ + return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + + } else { + + /* C14-13:1 */ + if (hop_cnt && *hop_ptr == hop_cnt + 1) { + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:2 */ + if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + /* hop_ptr updated when sending */ + return (return_path[*hop_ptr-1] <= phys_port_cnt ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == 1) { + if (dr_slid_is_permissive) { + /* giving SMP to SM - update hop_ptr */ + (*hop_ptr)--; + return IB_SMI_HANDLE; + } + /* hop_ptr updated when sending */ + return (is_switch ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:4 -- hop_ptr = 0 -> give to SM */ + /* C14-13:5 -- Check for unreasonable hop pointer */ + return (*hop_ptr == 0 ? 
IB_SMI_HANDLE : IB_SMI_DISCARD); + } +} + +/* + * Adjust information for a received SMP + * Return IB_SMI_DISCARD if the SMP should be dropped + */ +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, + int port_num, int phys_port_cnt) +{ + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, + &smp->hop_ptr, smp->hop_cnt, + smp->initial_path, + smp->return_path, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +/* + * Adjust information for a received SMP + * Return IB_SMI_DISCARD if the SMP should be dropped + */ +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, + int port_num, int phys_port_cnt) +{ + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, + &smp->hop_ptr, smp->hop_cnt, + smp->route.dr.initial_path, + smp->route.dr.return_path, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} + +static enum smi_forward_action __smi_check_forward_dr_smp(u8 hop_ptr, u8 hop_cnt, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + if (!direction) { + /* C14-9:2 -- intermediate hop */ + if (hop_ptr && hop_ptr < hop_cnt) + return IB_SMI_FORWARD; + + /* C14-9:3 -- at the end of the DR segment of path */ + if (hop_ptr == hop_cnt) + return (dr_dlid_is_permissive ? + IB_SMI_SEND : IB_SMI_LOCAL); + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + if (hop_ptr == hop_cnt + 1) + return IB_SMI_SEND; + } else { + /* C14-13:2 -- intermediate hop */ + if (2 <= hop_ptr && hop_ptr <= hop_cnt) + return IB_SMI_FORWARD; + + /* C14-13:3 -- at the end of the DR segment of path */ + if (hop_ptr == 1) + return (!dr_slid_is_permissive ? + IB_SMI_SEND : IB_SMI_LOCAL); + } + return IB_SMI_LOCAL; + +} + +enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp) +{ + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp) +{ + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} + +/* + * Return the forwarding port number from initial_path for outgoing SMP and + * from return_path for returning SMP + */ +int smi_get_fwd_port(struct ib_smp *smp) +{ + return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] : + smp->return_path[smp->hop_ptr-1]); +} + +/* + * Return the forwarding port number from initial_path for outgoing SMP and + * from return_path for returning SMP + */ +int opa_smi_get_fwd_port(struct opa_smp *smp) +{ + return !opa_get_smp_direction(smp) ? smp->route.dr.initial_path[smp->hop_ptr+1] : + smp->route.dr.return_path[smp->hop_ptr-1]; +} diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h new file mode 100644 index 0000000..33c91c8 --- /dev/null +++ b/drivers/infiniband/core/smi.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __SMI_H_ +#define __SMI_H_ + +#include + +enum smi_action { + IB_SMI_DISCARD, + IB_SMI_HANDLE +}; + +enum smi_forward_action { + IB_SMI_LOCAL, /* SMP should be completed up the stack */ + IB_SMI_SEND, /* received DR SMP should be forwarded to the send queue */ + IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */ +}; + +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, + int port_num, int phys_port_cnt); +int smi_get_fwd_port(struct ib_smp *smp); +extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp); +extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, + bool is_switch, int port_num); + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action smi_check_local_smp(struct ib_smp *smp, + struct ib_device *device) +{ + /* C14-9:3 -- We're at the end of the DR segment of path */ + /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */ + return ((device->process_mad && + !ib_get_smp_direction(smp) && + (smp->hop_ptr == smp->hop_cnt + 1)) ? + IB_SMI_HANDLE : IB_SMI_DISCARD); +} + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp, + struct ib_device *device) +{ + /* C14-13:3 -- We're at the end of the DR segment of path */ + /* C14-13:4 -- Hop Pointer == 0 -> give to SM */ + return ((device->process_mad && + ib_get_smp_direction(smp) && + !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD); +} + +#endif /* __SMI_H_ */ diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c new file mode 100644 index 0000000..31c7efa --- /dev/null +++ b/drivers/infiniband/core/sysfs.c @@ -0,0 +1,1380 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "core_priv.h" + +#include +#include +#include +#include +#include + +#include +#include + +struct ib_port; + +struct gid_attr_group { + struct ib_port *port; + struct kobject kobj; + struct attribute_group ndev; + struct attribute_group type; +}; +struct ib_port { + struct kobject kobj; + struct ib_device *ibdev; + struct gid_attr_group *gid_attr_group; + struct attribute_group gid_group; + struct attribute_group pkey_group; + struct attribute_group *pma_table; + struct attribute_group *hw_stats_ag; + struct rdma_hw_stats *hw_stats; + u8 port_num; +}; + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct ib_port *, struct port_attribute *, + const char *buf, size_t count); +}; + +#define PORT_ATTR(_name, _mode, _show, _store) \ +struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store) + +#define PORT_ATTR_RO(_name) \ +struct port_attribute port_attr_##_name = __ATTR_RO(_name) + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; + __be16 attr_id; +}; + +struct hw_stats_attribute { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t count); + int index; + u8 port_num; +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static ssize_t port_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + + if (!port_attr->store) + return -EIO; + return port_attr->store(p, port_attr, buf, count); +} + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show, + .store = port_attr_store +}; + +static ssize_t gid_attr_show(struct kobject *kobj, + struct attribute *attr, char 
*buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct gid_attr_group, + kobj)->port; + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops gid_attr_sysfs_ops = { + .show = gid_attr_show +}; + +static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + static const char *state_name[] = { + [IB_PORT_NOP] = "NOP", + [IB_PORT_DOWN] = "DOWN", + [IB_PORT_INIT] = "INIT", + [IB_PORT_ARMED] = "ARMED", + [IB_PORT_ACTIVE] = "ACTIVE", + [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" + }; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d: %s\n", attr.state, + attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? + state_name[attr.state] : "UNKNOWN"); +} + +static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%x\n", attr.lid); +} + +static ssize_t lid_mask_count_show(struct ib_port *p, + struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d\n", attr.lmc); +} + +static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%x\n", attr.sm_lid); +} + +static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d\n", attr.sm_sl); +} + +static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "0x%08x\n", attr.port_cap_flags); +} + +static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + char *speed = ""; + int rate; /* in deci-Gb/sec */ + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + switch (attr.active_speed) { + case IB_SPEED_DDR: + speed = " DDR"; + rate = 50; + break; + case IB_SPEED_QDR: + speed = " QDR"; + rate = 100; + break; + case IB_SPEED_FDR10: + speed = " FDR10"; + rate = 100; + break; + case IB_SPEED_FDR: + speed = " FDR"; + rate = 140; + break; + case IB_SPEED_EDR: + speed = " EDR"; + rate = 250; + break; + case IB_SPEED_HDR: + speed = " HDR"; + rate = 500; + break; + case IB_SPEED_SDR: + default: /* default to SDR for invalid rates */ + speed = " SDR"; + rate = 25; + break; + } + + rate *= ib_width_enum_to_int(attr.active_width); + if (rate < 0) + return -EINVAL; + + return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", + rate / 10, rate % 10 ? 
".5" : "", + ib_width_enum_to_int(attr.active_width), speed); +} + +static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + struct ib_port_attr attr; + + ssize_t ret; + + ret = ib_query_port(p->ibdev, p->port_num, &attr); + if (ret) + return ret; + + switch (attr.phys_state) { + case 1: return sprintf(buf, "1: Sleep\n"); + case 2: return sprintf(buf, "2: Polling\n"); + case 3: return sprintf(buf, "3: Disabled\n"); + case 4: return sprintf(buf, "4: PortConfigurationTraining\n"); + case 5: return sprintf(buf, "5: LinkUp\n"); + case 6: return sprintf(buf, "6: LinkErrorRecovery\n"); + case 7: return sprintf(buf, "7: Phy Test\n"); + default: return sprintf(buf, "%d: \n", attr.phys_state); + } +} + +static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, + char *buf) +{ + switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + return sprintf(buf, "%s\n", "InfiniBand"); + case IB_LINK_LAYER_ETHERNET: + return sprintf(buf, "%s\n", "Ethernet"); + default: + return sprintf(buf, "%s\n", "Unknown"); + } +} + +static PORT_ATTR_RO(state); +static PORT_ATTR_RO(lid); +static PORT_ATTR_RO(lid_mask_count); +static PORT_ATTR_RO(sm_lid); +static PORT_ATTR_RO(sm_sl); +static PORT_ATTR_RO(cap_mask); +static PORT_ATTR_RO(rate); +static PORT_ATTR_RO(phys_state); +static PORT_ATTR_RO(link_layer); + +static struct attribute *port_default_attrs[] = { + &port_attr_state.attr, + &port_attr_lid.attr, + &port_attr_lid_mask_count.attr, + &port_attr_sm_lid.attr, + &port_attr_sm_sl.attr, + &port_attr_cap_mask.attr, + &port_attr_rate.attr, + &port_attr_phys_state.attr, + &port_attr_link_layer.attr, + NULL +}; + +static size_t print_ndev(struct ib_gid_attr *gid_attr, char *buf) +{ + if (!gid_attr->ndev) + return -EINVAL; + + return sprintf(buf, "%s\n", gid_attr->ndev->name); +} + +static size_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf) +{ + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type)); +} + +static ssize_t _show_port_gid_attr(struct ib_port *p, + struct port_attribute *attr, + char *buf, + size_t (*print)(struct ib_gid_attr *gid_attr, + char *buf)) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + struct ib_gid_attr gid_attr = {}; + ssize_t ret; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, + &gid_attr); + if (ret) + goto err; + + ret = print(&gid_attr, buf); + +err: + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + return ret; +} + +static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid *pgid; + union ib_gid gid; + ssize_t ret; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, NULL); + + /* If reading GID fails, it is likely due to GID entry being empty + * (invalid) or reserved GID in the table. + * User space expects to read GID table entries as long as it given + * index is within GID table size. + * Administrative/debugging tool fails to query rest of the GID entries + * if it hits error while querying a GID of the given index. + * To avoid user space throwing such error on fail to read gid, return + * zero GID as before. This maintains backward compatibility. 
+ */ + if (ret) + pgid = &zgid; + else + pgid = &gid; + return sprintf(buf, "%pI6\n", pgid->raw); +} + +static ssize_t show_port_gid_attr_ndev(struct ib_port *p, + struct port_attribute *attr, char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_ndev); +} + +static ssize_t show_port_gid_attr_gid_type(struct ib_port *p, + struct port_attribute *attr, + char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_gid_type); +} + +static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + u16 pkey; + ssize_t ret; + + ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ + .attr_id = IB_PMA_PORT_COUNTERS , \ +} + +#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16), \ + .attr_id = IB_PMA_PORT_COUNTERS_EXT , \ +} + +/* + * Get a Perfmgmt MAD block of data. + * Returns error code or the number of bytes retrieved. + */ +static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, + void *data, int offset, size_t size) +{ + struct ib_mad *in_mad; + struct ib_mad *out_mad; + size_t mad_size = sizeof(*out_mad); + u16 out_mad_pkey_index = 0; + ssize_t ret; + + if (!dev->process_mad) + return -ENOSYS; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + ret = -ENOMEM; + goto out; + } + + in_mad->mad_hdr.base_version = 1; + in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; + in_mad->mad_hdr.class_version = 1; + in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; + in_mad->mad_hdr.attr_id = attr; + + if (attr != IB_PMA_CLASS_PORT_INFO) + in_mad->data[41] = port_num; /* PortSelect field */ + + if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY, + port_num, NULL, NULL, + (const struct ib_mad_hdr *)in_mad, mad_size, + (struct ib_mad_hdr *)out_mad, &mad_size, + &out_mad_pkey_index) & + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { + ret = -EINVAL; + goto out; + } + memcpy(data, out_mad->data + offset, size); + ret = size; +out: + kfree(in_mad); + kfree(out_mad); + return ret; +} + +static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + ssize_t ret; + u8 data[8]; + + ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, + 40 + offset / 8, sizeof(data)); + if (ret < 0) + return sprintf(buf, "N/A (no PMA)\n"); + + switch (width) { + case 4: + ret = sprintf(buf, "%u\n", (*data >> + (4 - (offset % 8))) & 0xf); + break; + case 8: + ret = sprintf(buf, "%u\n", *data); + break; + case 16: + ret = sprintf(buf, "%u\n", + be16_to_cpup((__be16 *)data)); + break; + case 32: + ret = sprintf(buf, "%u\n", + be32_to_cpup((__be32 *)data)); + break; + case 64: + ret = sprintf(buf, "%llu\n", + be64_to_cpup((__be64 *)data)); + 
break; + + default: + ret = 0; + } + + return ret; +} + +static PORT_PMA_ATTR(symbol_error , 0, 16, 32); +static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); +static PORT_PMA_ATTR(link_downed , 2, 8, 56); +static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); +static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80); +static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96); +static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); +static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128); +static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); +static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); +static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); +static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); +static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); +static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); +static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); +static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320); + +/* + * Counters added by extended set + */ +static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256); +static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320); +static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384); +static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448); +static PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512); + +static struct attribute *pma_attrs[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_port_xmit_data.attr.attr, + &port_pma_attr_port_rcv_data.attr.attr, + &port_pma_attr_port_xmit_packets.attr.attr, + &port_pma_attr_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + NULL +}; + +static struct attribute *pma_attrs_ext[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_multicast_rcv_packets.attr.attr, + &port_pma_attr_ext_multicast_xmit_packets.attr.attr, + NULL +}; + +static struct attribute 
*pma_attrs_noietf[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + NULL +}; + +static struct attribute_group pma_group = { + .name = "counters", + .attrs = pma_attrs +}; + +static struct attribute_group pma_group_ext = { + .name = "counters", + .attrs = pma_attrs_ext +}; + +static struct attribute_group pma_group_noietf = { + .name = "counters", + .attrs = pma_attrs_noietf +}; + +static void ib_port_release(struct kobject *kobj) +{ + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + struct attribute *a; + int i; + + if (p->gid_group.attrs) { + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + + kfree(p->gid_group.attrs); + } + + if (p->pkey_group.attrs) { + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + + kfree(p->pkey_group.attrs); + } + + kfree(p); +} + +static void ib_port_gid_attr_release(struct kobject *kobj) +{ + struct gid_attr_group *g = container_of(kobj, struct gid_attr_group, + kobj); + struct attribute *a; + int i; + + if (g->ndev.attrs) { + for (i = 0; (a = g->ndev.attrs[i]); ++i) + kfree(a); + + kfree(g->ndev.attrs); + } + + if (g->type.attrs) { + for (i = 0; (a = g->type.attrs[i]); ++i) + kfree(a); + + kfree(g->type.attrs); + } + + kfree(g); +} + +static struct kobj_type port_type = { + .release = ib_port_release, + .sysfs_ops = &port_sysfs_ops, + .default_attrs = port_default_attrs +}; + +static struct kobj_type gid_attr_type = { + .sysfs_ops = &gid_attr_sysfs_ops, + .release = ib_port_gid_attr_release +}; + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct ib_port *, + struct port_attribute *, char *buf), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof(struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + + if (snprintf(element->name, sizeof(element->name), + "%d", i) >= sizeof(element->name)) { + kfree(element); + goto err; + } + + element->attr.attr.name = element->name; + element->attr.attr.mode = S_IRUGO; + element->attr.show = show; + element->index = i; + sysfs_attr_init(&element->attr.attr); + + tab_attr[i] = &element->attr.attr; + } + + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +/* + * Figure out which counter table to use depending on + * the device capabilities. 
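+ * The extended (64-bit) table is used when ClassPortInfo advertises
+ * IB_PMA_CLASS_CAP_EXT_WIDTH, the no-IETF variant when only
+ * IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF is set, and the default 32-bit
+ * table otherwise (including when the ClassPortInfo query fails).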
+ */ +static struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) +{ + struct ib_class_port_info cpi; + + if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO, + &cpi, 40, sizeof(cpi)) >= 0) { + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH) + /* We have extended counters */ + return &pma_group_ext; + + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF) + /* But not the IETF ones */ + return &pma_group_noietf; + } + + /* Fall back to normal counters */ + return &pma_group; +} + +static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + int ret; + + if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan)) + return 0; + ret = dev->get_hw_stats(dev, stats, port_num, index); + if (ret < 0) + return ret; + if (ret == stats->num_counters) + stats->timestamp = jiffies; + + return 0; +} + +static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf) +{ + return sprintf(buf, "%llu\n", stats->value[index]); +} + +static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ib_device *dev; + struct ib_port *port; + struct hw_stats_attribute *hsa; + struct rdma_hw_stats *stats; + int ret; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + dev = container_of((struct device *)kobj, + struct ib_device, dev); + stats = dev->hw_stats; + } else { + port = container_of(kobj, struct ib_port, kobj); + dev = port->ibdev; + stats = port->hw_stats; + } + mutex_lock(&stats->lock); + ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); + if (ret) + goto unlock; + ret = print_hw_stat(stats, hsa->index, buf); +unlock: + mutex_unlock(&stats->lock); + + return ret; +} + +static ssize_t show_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct hw_stats_attribute *hsa; + struct rdma_hw_stats *stats; + int msecs; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + + stats = dev->hw_stats; + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + + stats = p->hw_stats; + } + + mutex_lock(&stats->lock); + msecs = jiffies_to_msecs(stats->lifespan); + mutex_unlock(&stats->lock); + + return sprintf(buf, "%d\n", msecs); +} + +static ssize_t set_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_attribute *hsa; + struct rdma_hw_stats *stats; + int msecs; + int jiffies; + int ret; + + ret = kstrtoint(buf, 10, &msecs); + if (ret) + return ret; + if (msecs < 0 || msecs > 10000) + return -EINVAL; + jiffies = msecs_to_jiffies(msecs); + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + + stats = dev->hw_stats; + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + + stats = p->hw_stats; + } + + mutex_lock(&stats->lock); + stats->lifespan = jiffies; + mutex_unlock(&stats->lock); + + return count; +} + +static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group) +{ + struct attribute **attr; + + sysfs_remove_group(kobj, attr_group); + + for (attr = attr_group->attrs; *attr; attr++) + kfree(*attr); + kfree(attr_group); +} + +static struct attribute *alloc_hsa(int index, u8 port_num, const char *name) +{ + struct 
hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = (char *)name; + hsa->attr.mode = S_IRUGO; + hsa->show = show_hw_stats; + hsa->store = NULL; + hsa->index = index; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = name; + hsa->attr.mode = S_IWUSR | S_IRUGO; + hsa->show = show_stats_lifespan; + hsa->store = set_stats_lifespan; + hsa->index = 0; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static void setup_hw_stats(struct ib_device *device, struct ib_port *port, + u8 port_num) +{ + struct attribute_group *hsag; + struct rdma_hw_stats *stats; + int i, ret; + + stats = device->alloc_hw_stats(device, port_num); + + if (!stats) + return; + + if (!stats->names || stats->num_counters <= 0) + goto err_free_stats; + + /* + * Two extra attribue elements here, one for the lifespan entry and + * one to NULL terminate the list for the sysfs core code + */ + hsag = kzalloc(sizeof(*hsag) + + sizeof(void *) * (stats->num_counters + 2), + GFP_KERNEL); + if (!hsag) + goto err_free_stats; + + ret = device->get_hw_stats(device, stats, port_num, + stats->num_counters); + if (ret != stats->num_counters) + goto err_free_hsag; + + stats->timestamp = jiffies; + + hsag->name = "hw_counters"; + hsag->attrs = (void *)hsag + sizeof(*hsag); + + for (i = 0; i < stats->num_counters; i++) { + hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]); + if (!hsag->attrs[i]) + goto err; + sysfs_attr_init(hsag->attrs[i]); + } + + mutex_init(&stats->lock); + /* treat an error here as non-fatal */ + hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num); + if (hsag->attrs[i]) + sysfs_attr_init(hsag->attrs[i]); + + if (port) { + struct kobject *kobj = &port->kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + port->hw_stats_ag = hsag; + port->hw_stats = stats; + } else { + struct kobject *kobj = &device->dev.kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + device->hw_stats_ag = hsag; + device->hw_stats = stats; + } + + return; + +err: + for (; i >= 0; i--) + kfree(hsag->attrs[i]); +err_free_hsag: + kfree(hsag); +err_free_stats: + kfree(stats); + return; +} + +static int add_port(struct ib_device *device, int port_num, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) +{ + struct ib_port *p; + struct ib_port_attr attr; + int i; + int ret; + + ret = ib_query_port(device, port_num, &attr); + if (ret) + return ret; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->ibdev = device; + p->port_num = port_num; + + ret = kobject_init_and_add(&p->kobj, &port_type, + device->ports_parent, + "%d", port_num); + if (ret) { + kfree(p); + return ret; + } + + p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); + if (!p->gid_attr_group) { + ret = -ENOMEM; + goto err_put; + } + + p->gid_attr_group->port = p; + ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, + &p->kobj, "gid_attrs"); + if (ret) { + kfree(p->gid_attr_group); + goto err_put; + } + + p->pma_table = get_counter_table(device, port_num); + ret = sysfs_create_group(&p->kobj, p->pma_table); + if (ret) + goto err_put_gid_attrs; + + p->gid_group.name = "gids"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); + if (!p->gid_group.attrs) { + ret = -ENOMEM; + goto 
err_remove_pma; + } + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + p->gid_attr_group->ndev.name = "ndevs"; + p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev, + attr.gid_tbl_len); + if (!p->gid_attr_group->ndev.attrs) { + ret = -ENOMEM; + goto err_remove_gid; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + if (ret) + goto err_free_gid_ndev; + + p->gid_attr_group->type.name = "types"; + p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type, + attr.gid_tbl_len); + if (!p->gid_attr_group->type.attrs) { + ret = -ENOMEM; + goto err_remove_gid_ndev; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + if (ret) + goto err_free_gid_type; + + p->pkey_group.name = "pkeys"; + p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, + attr.pkey_tbl_len); + if (!p->pkey_group.attrs) { + ret = -ENOMEM; + goto err_remove_gid_type; + } + + ret = sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + if (port_callback) { + ret = port_callback(device, port_num, &p->kobj); + if (ret) + goto err_remove_pkey; + } + + /* + * If port == 0, it means we have only one port and the parent + * device, not this port device, should be the holder of the + * hw_counters + */ + if (device->alloc_hw_stats && port_num) + setup_hw_stats(device, p, port_num); + + list_add_tail(&p->kobj.entry, &device->port_list); + + kobject_uevent(&p->kobj, KOBJ_ADD); + return 0; + +err_remove_pkey: + sysfs_remove_group(&p->kobj, &p->pkey_group); + +err_free_pkey: + for (i = 0; i < attr.pkey_tbl_len; ++i) + kfree(p->pkey_group.attrs[i]); + + kfree(p->pkey_group.attrs); + p->pkey_group.attrs = NULL; + +err_remove_gid_type: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + +err_free_gid_type: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->type.attrs[i]); + + kfree(p->gid_attr_group->type.attrs); + p->gid_attr_group->type.attrs = NULL; + +err_remove_gid_ndev: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + +err_free_gid_ndev: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->ndev.attrs[i]); + + kfree(p->gid_attr_group->ndev.attrs); + p->gid_attr_group->ndev.attrs = NULL; + +err_remove_gid: + sysfs_remove_group(&p->kobj, &p->gid_group); + +err_free_gid: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_group.attrs[i]); + + kfree(p->gid_group.attrs); + p->gid_group.attrs = NULL; + +err_remove_pma: + sysfs_remove_group(&p->kobj, p->pma_table); + +err_put_gid_attrs: + kobject_put(&p->gid_attr_group->kobj); + +err_put: + kobject_put(&p->kobj); + return ret; +} + +static ssize_t show_node_type(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + switch (dev->node_type) { + case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); + case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); + case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); + case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); + case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); + case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); + default: return sprintf(buf, "%d: \n", dev->node_type); + } +} + +static ssize_t show_sys_image_guid(struct device *device, + struct 
device_attribute *dev_attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); +} + +static ssize_t show_node_guid(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) &dev->node_guid)[0]), + be16_to_cpu(((__be16 *) &dev->node_guid)[1]), + be16_to_cpu(((__be16 *) &dev->node_guid)[2]), + be16_to_cpu(((__be16 *) &dev->node_guid)[3])); +} + +static ssize_t show_node_desc(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + return sprintf(buf, "%.64s\n", dev->node_desc); +} + +static ssize_t set_node_desc(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device_modify desc = {}; + int ret; + + if (!dev->modify_device) + return -EIO; + + memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX)); + ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); + if (ret) + return ret; + + return count; +} + +static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + ib_get_device_fw_str(dev, buf); + strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); + return strlen(buf); +} + +static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); +static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); +static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); +static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); + +static struct device_attribute *ib_class_attributes[] = { + &dev_attr_node_type, + &dev_attr_sys_image_guid, + &dev_attr_node_guid, + &dev_attr_node_desc, + &dev_attr_fw_ver, +}; + +static void free_port_list_attributes(struct ib_device *device) +{ + struct kobject *p, *t; + + list_for_each_entry_safe(p, t, &device->port_list, entry) { + struct ib_port *port = container_of(p, struct ib_port, kobj); + list_del(&p->entry); + if (port->hw_stats) { + kfree(port->hw_stats); + free_hsag(&port->kobj, port->hw_stats_ag); + } + sysfs_remove_group(p, port->pma_table); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->ndev); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->type); + kobject_put(&port->gid_attr_group->kobj); + kobject_put(p); + } + + kobject_put(device->ports_parent); +} + +int ib_device_register_sysfs(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) +{ + struct device *class_dev = &device->dev; + int ret; + int i; + + ret = dev_set_name(class_dev, "%s", device->name); + if (ret) + return ret; + + ret = device_add(class_dev); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { + ret = device_create_file(class_dev, ib_class_attributes[i]); + if (ret) + goto 
err_unregister; + } + + device->ports_parent = kobject_create_and_add("ports", + &class_dev->kobj); + if (!device->ports_parent) { + ret = -ENOMEM; + goto err_put; + } + + if (rdma_cap_ib_switch(device)) { + ret = add_port(device, 0, port_callback); + if (ret) + goto err_put; + } else { + for (i = 1; i <= device->phys_port_cnt; ++i) { + ret = add_port(device, i, port_callback); + if (ret) + goto err_put; + } + } + + if (device->alloc_hw_stats) + setup_hw_stats(device, NULL, 0); + + return 0; + +err_put: + free_port_list_attributes(device); + +err_unregister: + device_del(class_dev); + +err: + return ret; +} + +void ib_device_unregister_sysfs(struct ib_device *device) +{ + int i; + + /* Hold kobject until ib_dealloc_device() */ + kobject_get(&device->dev.kobj); + + free_port_list_attributes(device); + + if (device->hw_stats) { + kfree(device->hw_stats); + free_hsag(&device->dev.kobj, device->hw_stats_ag); + } + + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) + device_remove_file(&device->dev, ib_class_attributes[i]); + + device_unregister(&device->dev); +} diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c new file mode 100644 index 0000000..9eef96d --- /dev/null +++ b/drivers/infiniband/core/ucm.c @@ -0,0 +1,1359 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "core_priv.h" + +MODULE_AUTHOR("Libor Michalek"); +MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); +MODULE_LICENSE("Dual BSD/GPL"); + +struct ib_ucm_device { + int devnum; + struct cdev cdev; + struct device dev; + struct ib_device *ib_dev; +}; + +struct ib_ucm_file { + struct mutex file_mutex; + struct file *filp; + struct ib_ucm_device *device; + + struct list_head ctxs; + struct list_head events; + wait_queue_head_t poll_wait; +}; + +struct ib_ucm_context { + int id; + struct completion comp; + atomic_t ref; + int events_reported; + + struct ib_ucm_file *file; + struct ib_cm_id *cm_id; + __u64 uid; + + struct list_head events; /* list of pending events. */ + struct list_head file_list; /* member in file ctx list */ +}; + +struct ib_ucm_event { + struct ib_ucm_context *ctx; + struct list_head file_list; /* member in file event list */ + struct list_head ctx_list; /* member in ctx event list */ + + struct ib_cm_id *cm_id; + struct ib_ucm_event_resp resp; + void *data; + void *info; + int data_len; + int info_len; +}; + +enum { + IB_UCM_MAJOR = 231, + IB_UCM_BASE_MINOR = 224, + IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS, + IB_UCM_NUM_FIXED_MINOR = 32, + IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR, +}; + +#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) +static dev_t dynamic_ucm_dev; + +static void ib_ucm_add_one(struct ib_device *device); +static void ib_ucm_remove_one(struct ib_device *device, void *client_data); + +static struct ib_client ucm_client = { + .name = "ucm", + .add = ib_ucm_add_one, + .remove = ib_ucm_remove_one +}; + +static DEFINE_MUTEX(ctx_id_mutex); +static DEFINE_IDR(ctx_id_table); +static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES); + +static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id) +{ + struct ib_ucm_context *ctx; + + mutex_lock(&ctx_id_mutex); + ctx = idr_find(&ctx_id_table, id); + if (!ctx) + ctx = ERR_PTR(-ENOENT); + else if (ctx->file != file) + ctx = ERR_PTR(-EINVAL); + else + atomic_inc(&ctx->ref); + mutex_unlock(&ctx_id_mutex); + + return ctx; +} + +static void ib_ucm_ctx_put(struct ib_ucm_context *ctx) +{ + if (atomic_dec_and_test(&ctx->ref)) + complete(&ctx->comp); +} + +static inline int ib_ucm_new_cm_id(int event) +{ + return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED; +} + +static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx) +{ + struct ib_ucm_event *uevent; + + mutex_lock(&ctx->file->file_mutex); + list_del(&ctx->file_list); + while (!list_empty(&ctx->events)) { + + uevent = list_entry(ctx->events.next, + struct ib_ucm_event, ctx_list); + list_del(&uevent->file_list); + list_del(&uevent->ctx_list); + mutex_unlock(&ctx->file->file_mutex); + + /* clear incoming connections. 
*/ + if (ib_ucm_new_cm_id(uevent->resp.event)) + ib_destroy_cm_id(uevent->cm_id); + + kfree(uevent); + mutex_lock(&ctx->file->file_mutex); + } + mutex_unlock(&ctx->file->file_mutex); +} + +static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) +{ + struct ib_ucm_context *ctx; + + ctx = kzalloc(sizeof *ctx, GFP_KERNEL); + if (!ctx) + return NULL; + + atomic_set(&ctx->ref, 1); + init_completion(&ctx->comp); + ctx->file = file; + INIT_LIST_HEAD(&ctx->events); + + mutex_lock(&ctx_id_mutex); + ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL); + mutex_unlock(&ctx_id_mutex); + if (ctx->id < 0) + goto error; + + list_add_tail(&ctx->file_list, &file->ctxs); + return ctx; + +error: + kfree(ctx); + return NULL; +} + +static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq, + struct ib_cm_req_event_param *kreq) +{ + ureq->remote_ca_guid = kreq->remote_ca_guid; + ureq->remote_qkey = kreq->remote_qkey; + ureq->remote_qpn = kreq->remote_qpn; + ureq->qp_type = kreq->qp_type; + ureq->starting_psn = kreq->starting_psn; + ureq->responder_resources = kreq->responder_resources; + ureq->initiator_depth = kreq->initiator_depth; + ureq->local_cm_response_timeout = kreq->local_cm_response_timeout; + ureq->flow_control = kreq->flow_control; + ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout; + ureq->retry_count = kreq->retry_count; + ureq->rnr_retry_count = kreq->rnr_retry_count; + ureq->srq = kreq->srq; + ureq->port = kreq->port; + + ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path); + if (kreq->alternate_path) + ib_copy_path_rec_to_user(&ureq->alternate_path, + kreq->alternate_path); +} + +static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep, + struct ib_cm_rep_event_param *krep) +{ + urep->remote_ca_guid = krep->remote_ca_guid; + urep->remote_qkey = krep->remote_qkey; + urep->remote_qpn = krep->remote_qpn; + urep->starting_psn = krep->starting_psn; + urep->responder_resources = krep->responder_resources; + urep->initiator_depth = krep->initiator_depth; + urep->target_ack_delay = krep->target_ack_delay; + urep->failover_accepted = krep->failover_accepted; + urep->flow_control = krep->flow_control; + urep->rnr_retry_count = krep->rnr_retry_count; + urep->srq = krep->srq; +} + +static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep, + struct ib_cm_sidr_rep_event_param *krep) +{ + urep->status = krep->status; + urep->qkey = krep->qkey; + urep->qpn = krep->qpn; +}; + +static int ib_ucm_event_process(struct ib_cm_event *evt, + struct ib_ucm_event *uvt) +{ + void *info = NULL; + + switch (evt->event) { + case IB_CM_REQ_RECEIVED: + ib_ucm_event_req_get(&uvt->resp.u.req_resp, + &evt->param.req_rcvd); + uvt->data_len = IB_CM_REQ_PRIVATE_DATA_SIZE; + uvt->resp.present = IB_UCM_PRES_PRIMARY; + uvt->resp.present |= (evt->param.req_rcvd.alternate_path ? 
+ IB_UCM_PRES_ALTERNATE : 0); + break; + case IB_CM_REP_RECEIVED: + ib_ucm_event_rep_get(&uvt->resp.u.rep_resp, + &evt->param.rep_rcvd); + uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE; + break; + case IB_CM_RTU_RECEIVED: + uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE; + uvt->resp.u.send_status = evt->param.send_status; + break; + case IB_CM_DREQ_RECEIVED: + uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE; + uvt->resp.u.send_status = evt->param.send_status; + break; + case IB_CM_DREP_RECEIVED: + uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE; + uvt->resp.u.send_status = evt->param.send_status; + break; + case IB_CM_MRA_RECEIVED: + uvt->resp.u.mra_resp.timeout = + evt->param.mra_rcvd.service_timeout; + uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE; + break; + case IB_CM_REJ_RECEIVED: + uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason; + uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; + uvt->info_len = evt->param.rej_rcvd.ari_length; + info = evt->param.rej_rcvd.ari; + break; + case IB_CM_LAP_RECEIVED: + ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path, + evt->param.lap_rcvd.alternate_path); + uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE; + uvt->resp.present = IB_UCM_PRES_ALTERNATE; + break; + case IB_CM_APR_RECEIVED: + uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status; + uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE; + uvt->info_len = evt->param.apr_rcvd.info_len; + info = evt->param.apr_rcvd.apr_info; + break; + case IB_CM_SIDR_REQ_RECEIVED: + uvt->resp.u.sidr_req_resp.pkey = + evt->param.sidr_req_rcvd.pkey; + uvt->resp.u.sidr_req_resp.port = + evt->param.sidr_req_rcvd.port; + uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE; + break; + case IB_CM_SIDR_REP_RECEIVED: + ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp, + &evt->param.sidr_rep_rcvd); + uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; + uvt->info_len = evt->param.sidr_rep_rcvd.info_len; + info = evt->param.sidr_rep_rcvd.info; + break; + default: + uvt->resp.u.send_status = evt->param.send_status; + break; + } + + if (uvt->data_len) { + uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL); + if (!uvt->data) + goto err1; + + uvt->resp.present |= IB_UCM_PRES_DATA; + } + + if (uvt->info_len) { + uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL); + if (!uvt->info) + goto err2; + + uvt->resp.present |= IB_UCM_PRES_INFO; + } + return 0; + +err2: + kfree(uvt->data); +err1: + return -ENOMEM; +} + +static int ib_ucm_event_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ib_ucm_event *uevent; + struct ib_ucm_context *ctx; + int result = 0; + + ctx = cm_id->context; + + uevent = kzalloc(sizeof *uevent, GFP_KERNEL); + if (!uevent) + goto err1; + + uevent->ctx = ctx; + uevent->cm_id = cm_id; + uevent->resp.uid = ctx->uid; + uevent->resp.id = ctx->id; + uevent->resp.event = event->event; + + result = ib_ucm_event_process(event, uevent); + if (result) + goto err2; + + mutex_lock(&ctx->file->file_mutex); + list_add_tail(&uevent->file_list, &ctx->file->events); + list_add_tail(&uevent->ctx_list, &ctx->events); + wake_up_interruptible(&ctx->file->poll_wait); + mutex_unlock(&ctx->file->file_mutex); + return 0; + +err2: + kfree(uevent); +err1: + /* Destroy new cm_id's */ + return ib_ucm_new_cm_id(event->event); +} + +static ssize_t ib_ucm_event(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_context *ctx; + struct ib_ucm_event_get cmd; + struct ib_ucm_event *uevent; + int result = 0; + + if (out_len < sizeof(struct ib_ucm_event_resp)) + return 
-ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&file->file_mutex); + while (list_empty(&file->events)) { + mutex_unlock(&file->file_mutex); + + if (file->filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(file->poll_wait, + !list_empty(&file->events))) + return -ERESTARTSYS; + + mutex_lock(&file->file_mutex); + } + + uevent = list_entry(file->events.next, struct ib_ucm_event, file_list); + + if (ib_ucm_new_cm_id(uevent->resp.event)) { + ctx = ib_ucm_ctx_alloc(file); + if (!ctx) { + result = -ENOMEM; + goto done; + } + + ctx->cm_id = uevent->cm_id; + ctx->cm_id->context = ctx; + uevent->resp.id = ctx->id; + } + + if (copy_to_user(u64_to_user_ptr(cmd.response), + &uevent->resp, sizeof(uevent->resp))) { + result = -EFAULT; + goto done; + } + + if (uevent->data) { + if (cmd.data_len < uevent->data_len) { + result = -ENOMEM; + goto done; + } + if (copy_to_user(u64_to_user_ptr(cmd.data), + uevent->data, uevent->data_len)) { + result = -EFAULT; + goto done; + } + } + + if (uevent->info) { + if (cmd.info_len < uevent->info_len) { + result = -ENOMEM; + goto done; + } + if (copy_to_user(u64_to_user_ptr(cmd.info), + uevent->info, uevent->info_len)) { + result = -EFAULT; + goto done; + } + } + + list_del(&uevent->file_list); + list_del(&uevent->ctx_list); + uevent->ctx->events_reported++; + + kfree(uevent->data); + kfree(uevent->info); + kfree(uevent); +done: + mutex_unlock(&file->file_mutex); + return result; +} + +static ssize_t ib_ucm_create_id(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_create_id cmd; + struct ib_ucm_create_id_resp resp; + struct ib_ucm_context *ctx; + int result; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&file->file_mutex); + ctx = ib_ucm_ctx_alloc(file); + mutex_unlock(&file->file_mutex); + if (!ctx) + return -ENOMEM; + + ctx->uid = cmd.uid; + ctx->cm_id = ib_create_cm_id(file->device->ib_dev, + ib_ucm_event_handler, ctx); + if (IS_ERR(ctx->cm_id)) { + result = PTR_ERR(ctx->cm_id); + goto err1; + } + + resp.id = ctx->id; + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) { + result = -EFAULT; + goto err2; + } + return 0; + +err2: + ib_destroy_cm_id(ctx->cm_id); +err1: + mutex_lock(&ctx_id_mutex); + idr_remove(&ctx_id_table, ctx->id); + mutex_unlock(&ctx_id_mutex); + kfree(ctx); + return result; +} + +static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_destroy_id cmd; + struct ib_ucm_destroy_id_resp resp; + struct ib_ucm_context *ctx; + int result = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&ctx_id_mutex); + ctx = idr_find(&ctx_id_table, cmd.id); + if (!ctx) + ctx = ERR_PTR(-ENOENT); + else if (ctx->file != file) + ctx = ERR_PTR(-EINVAL); + else + idr_remove(&ctx_id_table, ctx->id); + mutex_unlock(&ctx_id_mutex); + + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ib_ucm_ctx_put(ctx); + wait_for_completion(&ctx->comp); + + /* No new events will be generated after destroying the cm_id. */ + ib_destroy_cm_id(ctx->cm_id); + /* Cleanup events not yet reported to the user. 
*/ + ib_ucm_cleanup_events(ctx); + + resp.events_reported = ctx->events_reported; + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + result = -EFAULT; + + kfree(ctx); + return result; +} + +static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_attr_id_resp resp; + struct ib_ucm_attr_id cmd; + struct ib_ucm_context *ctx; + int result = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + resp.service_id = ctx->cm_id->service_id; + resp.service_mask = ctx->cm_id->service_mask; + resp.local_id = ctx->cm_id->local_id; + resp.remote_id = ctx->cm_id->remote_id; + + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + result = -EFAULT; + + ib_ucm_ctx_put(ctx); + return result; +} + +static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_uverbs_qp_attr resp; + struct ib_ucm_init_qp_attr cmd; + struct ib_ucm_context *ctx; + struct ib_qp_attr qp_attr; + int result = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + resp.qp_attr_mask = 0; + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_state = cmd.qp_state; + result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); + if (result) + goto out; + + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); + + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + result = -EFAULT; + +out: + ib_ucm_ctx_put(ctx); + return result; +} + +static int ucm_validate_listen(__be64 service_id, __be64 service_mask) +{ + service_id &= service_mask; + + if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) || + ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID)) + return -EINVAL; + + return 0; +} + +static ssize_t ib_ucm_listen(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_listen cmd; + struct ib_ucm_context *ctx; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + result = ucm_validate_listen(cmd.service_id, cmd.service_mask); + if (result) + goto out; + + result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask); +out: + ib_ucm_ctx_put(ctx); + return result; +} + +static ssize_t ib_ucm_notify(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_notify cmd; + struct ib_ucm_context *ctx; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event); + ib_ucm_ctx_put(ctx); + return result; +} + +static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len) +{ + void *data; + + *dest = NULL; + + if (!len) + return 0; + + data = memdup_user(u64_to_user_ptr(src), len); + if (IS_ERR(data)) + return PTR_ERR(data); + + *dest = data; + return 0; +} + +static int ib_ucm_path_get(struct sa_path_rec **path, u64 src) +{ + struct ib_user_path_rec upath; + struct sa_path_rec *sa_path; + + *path = NULL; + + if (!src) + return 
0; + + sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL); + if (!sa_path) + return -ENOMEM; + + if (copy_from_user(&upath, u64_to_user_ptr(src), + sizeof(upath))) { + + kfree(sa_path); + return -EFAULT; + } + + ib_copy_path_rec_from_user(sa_path, &upath); + *path = sa_path; + return 0; +} + +static ssize_t ib_ucm_send_req(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_cm_req_param param; + struct ib_ucm_context *ctx; + struct ib_ucm_req cmd; + int result; + + param.private_data = NULL; + param.primary_path = NULL; + param.alternate_path = NULL; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); + if (result) + goto done; + + result = ib_ucm_path_get(¶m.primary_path, cmd.primary_path); + if (result) + goto done; + + result = ib_ucm_path_get(¶m.alternate_path, cmd.alternate_path); + if (result) + goto done; + + param.private_data_len = cmd.len; + param.service_id = cmd.sid; + param.qp_num = cmd.qpn; + param.qp_type = cmd.qp_type; + param.starting_psn = cmd.psn; + param.peer_to_peer = cmd.peer_to_peer; + param.responder_resources = cmd.responder_resources; + param.initiator_depth = cmd.initiator_depth; + param.remote_cm_response_timeout = cmd.remote_cm_response_timeout; + param.flow_control = cmd.flow_control; + param.local_cm_response_timeout = cmd.local_cm_response_timeout; + param.retry_count = cmd.retry_count; + param.rnr_retry_count = cmd.rnr_retry_count; + param.max_cm_retries = cmd.max_cm_retries; + param.srq = cmd.srq; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = ib_send_cm_req(ctx->cm_id, ¶m); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + +done: + kfree(param.private_data); + kfree(param.primary_path); + kfree(param.alternate_path); + return result; +} + +static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_cm_rep_param param; + struct ib_ucm_context *ctx; + struct ib_ucm_rep cmd; + int result; + + param.private_data = NULL; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); + if (result) + return result; + + param.qp_num = cmd.qpn; + param.starting_psn = cmd.psn; + param.private_data_len = cmd.len; + param.responder_resources = cmd.responder_resources; + param.initiator_depth = cmd.initiator_depth; + param.failover_accepted = cmd.failover_accepted; + param.flow_control = cmd.flow_control; + param.rnr_retry_count = cmd.rnr_retry_count; + param.srq = cmd.srq; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + ctx->uid = cmd.uid; + result = ib_send_cm_rep(ctx->cm_id, ¶m); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + + kfree(param.private_data); + return result; +} + +static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file, + const char __user *inbuf, int in_len, + int (*func)(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len)) +{ + struct ib_ucm_private_data cmd; + struct ib_ucm_context *ctx; + const void *private_data = NULL; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len); + if (result) + return result; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = func(ctx->cm_id, private_data, cmd.len); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + + kfree(private_data); + 
return result; +} + +static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu); +} + +static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq); +} + +static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep); +} + +static ssize_t ib_ucm_send_info(struct ib_ucm_file *file, + const char __user *inbuf, int in_len, + int (*func)(struct ib_cm_id *cm_id, + int status, + const void *info, + u8 info_len, + const void *data, + u8 data_len)) +{ + struct ib_ucm_context *ctx; + struct ib_ucm_info cmd; + const void *data = NULL; + const void *info = NULL; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len); + if (result) + goto done; + + result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len); + if (result) + goto done; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = func(ctx->cm_id, cmd.status, info, cmd.info_len, + data, cmd.data_len); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + +done: + kfree(data); + kfree(info); + return result; +} + +static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej); +} + +static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr); +} + +static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_context *ctx; + struct ib_ucm_mra cmd; + const void *data = NULL; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); + if (result) + return result; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + + kfree(data); + return result; +} + +static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_ucm_context *ctx; + struct sa_path_rec *path = NULL; + struct ib_ucm_lap cmd; + const void *data = NULL; + int result; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); + if (result) + goto done; + + result = ib_ucm_path_get(&path, cmd.path); + if (result) + goto done; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + +done: + kfree(data); + kfree(path); + return result; +} + +static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_cm_sidr_req_param param; + struct ib_ucm_context *ctx; + struct ib_ucm_sidr_req cmd; + int result; + + param.private_data = NULL; + param.path = NULL; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = 
ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); + if (result) + goto done; + + result = ib_ucm_path_get(¶m.path, cmd.path); + if (result) + goto done; + + param.private_data_len = cmd.len; + param.service_id = cmd.sid; + param.timeout_ms = cmd.timeout; + param.max_cm_retries = cmd.max_cm_retries; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = ib_send_cm_sidr_req(ctx->cm_id, ¶m); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + +done: + kfree(param.private_data); + kfree(param.path); + return result; +} + +static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct ib_cm_sidr_rep_param param; + struct ib_ucm_sidr_rep cmd; + struct ib_ucm_context *ctx; + int result; + + param.info = NULL; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + result = ib_ucm_alloc_data(¶m.private_data, + cmd.data, cmd.data_len); + if (result) + goto done; + + result = ib_ucm_alloc_data(¶m.info, cmd.info, cmd.info_len); + if (result) + goto done; + + param.qp_num = cmd.qpn; + param.qkey = cmd.qkey; + param.status = cmd.status; + param.info_length = cmd.info_len; + param.private_data_len = cmd.data_len; + + ctx = ib_ucm_ctx_get(file, cmd.id); + if (!IS_ERR(ctx)) { + result = ib_send_cm_sidr_rep(ctx->cm_id, ¶m); + ib_ucm_ctx_put(ctx); + } else + result = PTR_ERR(ctx); + +done: + kfree(param.private_data); + kfree(param.info); + return result; +} + +static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file, + const char __user *inbuf, + int in_len, int out_len) = { + [IB_USER_CM_CMD_CREATE_ID] = ib_ucm_create_id, + [IB_USER_CM_CMD_DESTROY_ID] = ib_ucm_destroy_id, + [IB_USER_CM_CMD_ATTR_ID] = ib_ucm_attr_id, + [IB_USER_CM_CMD_LISTEN] = ib_ucm_listen, + [IB_USER_CM_CMD_NOTIFY] = ib_ucm_notify, + [IB_USER_CM_CMD_SEND_REQ] = ib_ucm_send_req, + [IB_USER_CM_CMD_SEND_REP] = ib_ucm_send_rep, + [IB_USER_CM_CMD_SEND_RTU] = ib_ucm_send_rtu, + [IB_USER_CM_CMD_SEND_DREQ] = ib_ucm_send_dreq, + [IB_USER_CM_CMD_SEND_DREP] = ib_ucm_send_drep, + [IB_USER_CM_CMD_SEND_REJ] = ib_ucm_send_rej, + [IB_USER_CM_CMD_SEND_MRA] = ib_ucm_send_mra, + [IB_USER_CM_CMD_SEND_LAP] = ib_ucm_send_lap, + [IB_USER_CM_CMD_SEND_APR] = ib_ucm_send_apr, + [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req, + [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep, + [IB_USER_CM_CMD_EVENT] = ib_ucm_event, + [IB_USER_CM_CMD_INIT_QP_ATTR] = ib_ucm_init_qp_attr, +}; + +static ssize_t ib_ucm_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct ib_ucm_file *file = filp->private_data; + struct ib_ucm_cmd_hdr hdr; + ssize_t result; + + if (!ib_safe_file_access(filp)) { + pr_err_once("ucm_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", + task_tgid_vnr(current), current->comm); + return -EACCES; + } + + if (len < sizeof(hdr)) + return -EINVAL; + + if (copy_from_user(&hdr, buf, sizeof(hdr))) + return -EFAULT; + + if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) + return -EINVAL; + + if (hdr.in + sizeof(hdr) > len) + return -EINVAL; + + result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr), + hdr.in, hdr.out); + if (!result) + result = len; + + return result; +} + +static __poll_t ib_ucm_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct ib_ucm_file *file = filp->private_data; + __poll_t mask = 0; + + poll_wait(filp, &file->poll_wait, wait); + + if (!list_empty(&file->events)) + mask = EPOLLIN | EPOLLRDNORM; + + return mask; +} + +/* 
+ * ib_ucm_open() does not need the BKL: + * + * - no global state is referred to; + * - there is no ioctl method to race against; + * - no further module initialization is required for open to work + * after the device is registered. + */ +static int ib_ucm_open(struct inode *inode, struct file *filp) +{ + struct ib_ucm_file *file; + + file = kmalloc(sizeof(*file), GFP_KERNEL); + if (!file) + return -ENOMEM; + + INIT_LIST_HEAD(&file->events); + INIT_LIST_HEAD(&file->ctxs); + init_waitqueue_head(&file->poll_wait); + + mutex_init(&file->file_mutex); + + filp->private_data = file; + file->filp = filp; + file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev); + + return nonseekable_open(inode, filp); +} + +static int ib_ucm_close(struct inode *inode, struct file *filp) +{ + struct ib_ucm_file *file = filp->private_data; + struct ib_ucm_context *ctx; + + mutex_lock(&file->file_mutex); + while (!list_empty(&file->ctxs)) { + ctx = list_entry(file->ctxs.next, + struct ib_ucm_context, file_list); + mutex_unlock(&file->file_mutex); + + mutex_lock(&ctx_id_mutex); + idr_remove(&ctx_id_table, ctx->id); + mutex_unlock(&ctx_id_mutex); + + ib_destroy_cm_id(ctx->cm_id); + ib_ucm_cleanup_events(ctx); + kfree(ctx); + + mutex_lock(&file->file_mutex); + } + mutex_unlock(&file->file_mutex); + kfree(file); + return 0; +} + +static void ib_ucm_release_dev(struct device *dev) +{ + struct ib_ucm_device *ucm_dev; + + ucm_dev = container_of(dev, struct ib_ucm_device, dev); + kfree(ucm_dev); +} + +static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev) +{ + clear_bit(ucm_dev->devnum, dev_map); +} + +static const struct file_operations ucm_fops = { + .owner = THIS_MODULE, + .open = ib_ucm_open, + .release = ib_ucm_close, + .write = ib_ucm_write, + .poll = ib_ucm_poll, + .llseek = no_llseek, +}; + +static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ib_ucm_device *ucm_dev; + + ucm_dev = container_of(dev, struct ib_ucm_device, dev); + return sprintf(buf, "%s\n", ucm_dev->ib_dev->name); +} +static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); + +static void ib_ucm_add_one(struct ib_device *device) +{ + int devnum; + dev_t base; + struct ib_ucm_device *ucm_dev; + + if (!device->alloc_ucontext || !rdma_cap_ib_cm(device, 1)) + return; + + ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); + if (!ucm_dev) + return; + + device_initialize(&ucm_dev->dev); + ucm_dev->ib_dev = device; + ucm_dev->dev.release = ib_ucm_release_dev; + + devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); + if (devnum >= IB_UCM_MAX_DEVICES) + goto err; + ucm_dev->devnum = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UCM_NUM_FIXED_MINOR) + base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR; + else + base = IB_UCM_BASE_DEV + devnum; + + cdev_init(&ucm_dev->cdev, &ucm_fops); + ucm_dev->cdev.owner = THIS_MODULE; + kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum); + + ucm_dev->dev.class = &cm_class; + ucm_dev->dev.parent = device->dev.parent; + ucm_dev->dev.devt = base; + + dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum); + if (cdev_device_add(&ucm_dev->cdev, &ucm_dev->dev)) + goto err_devnum; + + if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev)) + goto err_dev; + + ib_set_client_data(device, &ucm_client, ucm_dev); + return; + +err_dev: + cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev); +err_devnum: + ib_ucm_free_dev(ucm_dev); +err: + put_device(&ucm_dev->dev); + return; +} + +static void ib_ucm_remove_one(struct ib_device *device, void 
*client_data) +{ + struct ib_ucm_device *ucm_dev = client_data; + + if (!ucm_dev) + return; + + cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev); + ib_ucm_free_dev(ucm_dev); + put_device(&ucm_dev->dev); +} + +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_CM_ABI_VERSION)); + +static int __init ib_ucm_init(void) +{ + int ret; + + ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR, + "infiniband_cm"); + if (ret) { + pr_err("ucm: couldn't register device number\n"); + goto error1; + } + + ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR, + "infiniband_cm"); + if (ret) { + pr_err("ucm: couldn't register dynamic device number\n"); + goto err_alloc; + } + + ret = class_create_file(&cm_class, &class_attr_abi_version.attr); + if (ret) { + pr_err("ucm: couldn't create abi_version attribute\n"); + goto error2; + } + + ret = ib_register_client(&ucm_client); + if (ret) { + pr_err("ucm: couldn't register client\n"); + goto error3; + } + return 0; + +error3: + class_remove_file(&cm_class, &class_attr_abi_version.attr); +error2: + unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); +err_alloc: + unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); +error1: + return ret; +} + +static void __exit ib_ucm_cleanup(void) +{ + ib_unregister_client(&ucm_client); + class_remove_file(&cm_class, &class_attr_abi_version.attr); + unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); + unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); + idr_destroy(&ctx_id_table); +} + +module_init(ib_ucm_init); +module_exit(ib_ucm_cleanup); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c new file mode 100644 index 0000000..eab43b1 --- /dev/null +++ b/drivers/infiniband/core/ucma.c @@ -0,0 +1,1832 @@ +/* + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); +MODULE_LICENSE("Dual BSD/GPL"); + +static unsigned int max_backlog = 1024; + +static struct ctl_table_header *ucma_ctl_table_hdr; +static struct ctl_table ucma_ctl_table[] = { + { + .procname = "max_backlog", + .data = &max_backlog, + .maxlen = sizeof max_backlog, + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +struct ucma_file { + struct mutex mut; + struct file *filp; + struct list_head ctx_list; + struct list_head event_list; + wait_queue_head_t poll_wait; + struct workqueue_struct *close_wq; +}; + +struct ucma_context { + int id; + struct completion comp; + atomic_t ref; + int events_reported; + int backlog; + + struct ucma_file *file; + struct rdma_cm_id *cm_id; + u64 uid; + + struct list_head list; + struct list_head mc_list; + /* mark that device is in process of destroying the internal HW + * resources, protected by the global mut + */ + int closing; + /* sync between removal event and id destroy, protected by file mut */ + int destroying; + struct work_struct close_work; +}; + +struct ucma_multicast { + struct ucma_context *ctx; + int id; + int events_reported; + + u64 uid; + u8 join_state; + struct list_head list; + struct sockaddr_storage addr; +}; + +struct ucma_event { + struct ucma_context *ctx; + struct ucma_multicast *mc; + struct list_head list; + struct rdma_cm_id *cm_id; + struct rdma_ucm_event_resp resp; + struct work_struct close_work; +}; + +static DEFINE_MUTEX(mut); +static DEFINE_IDR(ctx_idr); +static DEFINE_IDR(multicast_idr); + +static inline struct ucma_context *_ucma_find_context(int id, + struct ucma_file *file) +{ + struct ucma_context *ctx; + + ctx = idr_find(&ctx_idr, id); + if (!ctx) + ctx = ERR_PTR(-ENOENT); + else if (ctx->file != file || !ctx->cm_id) + ctx = ERR_PTR(-EINVAL); + return ctx; +} + +static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) +{ + struct ucma_context *ctx; + + mutex_lock(&mut); + ctx = _ucma_find_context(id, file); + if (!IS_ERR(ctx)) { + if (ctx->closing) + ctx = ERR_PTR(-EIO); + else + atomic_inc(&ctx->ref); + } + mutex_unlock(&mut); + return ctx; +} + +static void ucma_put_ctx(struct ucma_context *ctx) +{ + if (atomic_dec_and_test(&ctx->ref)) + complete(&ctx->comp); +} + +/* + * Same as ucm_get_ctx but requires that ->cm_id->device is valid, eg that the + * CM_ID is bound. + */ +static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id) +{ + struct ucma_context *ctx = ucma_get_ctx(file, id); + + if (IS_ERR(ctx)) + return ctx; + if (!ctx->cm_id->device) { + ucma_put_ctx(ctx); + return ERR_PTR(-EINVAL); + } + return ctx; +} + +static void ucma_close_event_id(struct work_struct *work) +{ + struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work); + + rdma_destroy_id(uevent_close->cm_id); + kfree(uevent_close); +} + +static void ucma_close_id(struct work_struct *work) +{ + struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); + + /* once all inflight tasks are finished, we close all underlying + * resources. The context is still alive till its explicit destryoing + * by its creator. + */ + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + /* No new events will be generated after destroying the id. 
*/ + rdma_destroy_id(ctx->cm_id); +} + +static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) +{ + struct ucma_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + INIT_WORK(&ctx->close_work, ucma_close_id); + atomic_set(&ctx->ref, 1); + init_completion(&ctx->comp); + INIT_LIST_HEAD(&ctx->mc_list); + ctx->file = file; + + mutex_lock(&mut); + ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL); + mutex_unlock(&mut); + if (ctx->id < 0) + goto error; + + list_add_tail(&ctx->list, &file->ctx_list); + return ctx; + +error: + kfree(ctx); + return NULL; +} + +static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) +{ + struct ucma_multicast *mc; + + mc = kzalloc(sizeof(*mc), GFP_KERNEL); + if (!mc) + return NULL; + + mutex_lock(&mut); + mc->id = idr_alloc(&multicast_idr, mc, 0, 0, GFP_KERNEL); + mutex_unlock(&mut); + if (mc->id < 0) + goto error; + + mc->ctx = ctx; + list_add_tail(&mc->list, &ctx->mc_list); + return mc; + +error: + kfree(mc); + return NULL; +} + +static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, + struct rdma_conn_param *src) +{ + if (src->private_data_len) + memcpy(dst->private_data, src->private_data, + src->private_data_len); + dst->private_data_len = src->private_data_len; + dst->responder_resources =src->responder_resources; + dst->initiator_depth = src->initiator_depth; + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + dst->srq = src->srq; + dst->qp_num = src->qp_num; +} + +static void ucma_copy_ud_event(struct ib_device *device, + struct rdma_ucm_ud_param *dst, + struct rdma_ud_param *src) +{ + if (src->private_data_len) + memcpy(dst->private_data, src->private_data, + src->private_data_len); + dst->private_data_len = src->private_data_len; + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); + dst->qp_num = src->qp_num; + dst->qkey = src->qkey; +} + +static void ucma_set_event_context(struct ucma_context *ctx, + struct rdma_cm_event *event, + struct ucma_event *uevent) +{ + uevent->ctx = ctx; + switch (event->event) { + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + uevent->mc = (struct ucma_multicast *) + event->param.ud.private_data; + uevent->resp.uid = uevent->mc->uid; + uevent->resp.id = uevent->mc->id; + break; + default: + uevent->resp.uid = ctx->uid; + uevent->resp.id = ctx->id; + break; + } +} + +/* Called with file->mut locked for the relevant context. */ +static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) +{ + struct ucma_context *ctx = cm_id->context; + struct ucma_event *con_req_eve; + int event_found = 0; + + if (ctx->destroying) + return; + + /* only if context is pointing to cm_id that it owns it and can be + * queued to be closed, otherwise that cm_id is an inflight one that + * is part of that context event list pending to be detached and + * reattached to its new context as part of ucma_get_event, + * handled separately below. 
+ */ + if (ctx->cm_id == cm_id) { + mutex_lock(&mut); + ctx->closing = 1; + mutex_unlock(&mut); + queue_work(ctx->file->close_wq, &ctx->close_work); + return; + } + + list_for_each_entry(con_req_eve, &ctx->file->event_list, list) { + if (con_req_eve->cm_id == cm_id && + con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { + list_del(&con_req_eve->list); + INIT_WORK(&con_req_eve->close_work, ucma_close_event_id); + queue_work(ctx->file->close_wq, &con_req_eve->close_work); + event_found = 1; + break; + } + } + if (!event_found) + pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n"); +} + +static int ucma_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct ucma_event *uevent; + struct ucma_context *ctx = cm_id->context; + int ret = 0; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) + return event->event == RDMA_CM_EVENT_CONNECT_REQUEST; + + mutex_lock(&ctx->file->mut); + uevent->cm_id = cm_id; + ucma_set_event_context(ctx, event, uevent); + uevent->resp.event = event->event; + uevent->resp.status = event->status; + if (cm_id->qp_type == IB_QPT_UD) + ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, + &event->param.ud); + else + ucma_copy_conn_event(&uevent->resp.param.conn, + &event->param.conn); + + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { + if (!ctx->backlog) { + ret = -ENOMEM; + kfree(uevent); + goto out; + } + ctx->backlog--; + } else if (!ctx->uid || ctx->cm_id != cm_id) { + /* + * We ignore events for new connections until userspace has set + * their context. This can only happen if an error occurs on a + * new connection before the user accepts it. This is okay, + * since the accept will just fail later. However, we do need + * to release the underlying HW resources in case of a device + * removal event. + */ + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ucma_removal_event_handler(cm_id); + + kfree(uevent); + goto out; + } + + list_add_tail(&uevent->list, &ctx->file->event_list); + wake_up_interruptible(&ctx->file->poll_wait); + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ucma_removal_event_handler(cm_id); +out: + mutex_unlock(&ctx->file->mut); + return ret; +} + +static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct ucma_context *ctx; + struct rdma_ucm_get_event cmd; + struct ucma_event *uevent; + int ret = 0; + + /* + * Old 32 bit user space does not send the 4 byte padding in the + * reserved field. We don't care, allow it to keep working. 
+ */ + if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&file->mut); + while (list_empty(&file->event_list)) { + mutex_unlock(&file->mut); + + if (file->filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(file->poll_wait, + !list_empty(&file->event_list))) + return -ERESTARTSYS; + + mutex_lock(&file->mut); + } + + uevent = list_entry(file->event_list.next, struct ucma_event, list); + + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { + ctx = ucma_alloc_ctx(file); + if (!ctx) { + ret = -ENOMEM; + goto done; + } + uevent->ctx->backlog++; + ctx->cm_id = uevent->cm_id; + ctx->cm_id->context = ctx; + uevent->resp.id = ctx->id; + } + + if (copy_to_user(u64_to_user_ptr(cmd.response), + &uevent->resp, + min_t(size_t, out_len, sizeof(uevent->resp)))) { + ret = -EFAULT; + goto done; + } + + list_del(&uevent->list); + uevent->ctx->events_reported++; + if (uevent->mc) + uevent->mc->events_reported++; + kfree(uevent); +done: + mutex_unlock(&file->mut); + return ret; +} + +static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) +{ + switch (cmd->ps) { + case RDMA_PS_TCP: + *qp_type = IB_QPT_RC; + return 0; + case RDMA_PS_UDP: + case RDMA_PS_IPOIB: + *qp_type = IB_QPT_UD; + return 0; + case RDMA_PS_IB: + *qp_type = cmd->qp_type; + return 0; + default: + return -EINVAL; + } +} + +static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_create_id cmd; + struct rdma_ucm_create_id_resp resp; + struct ucma_context *ctx; + struct rdma_cm_id *cm_id; + enum ib_qp_type qp_type; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ret = ucma_get_qp_type(&cmd, &qp_type); + if (ret) + return ret; + + mutex_lock(&file->mut); + ctx = ucma_alloc_ctx(file); + mutex_unlock(&file->mut); + if (!ctx) + return -ENOMEM; + + ctx->uid = cmd.uid; + cm_id = __rdma_create_id(current->nsproxy->net_ns, + ucma_event_handler, ctx, cmd.ps, qp_type, NULL); + if (IS_ERR(cm_id)) { + ret = PTR_ERR(cm_id); + goto err1; + } + + resp.id = ctx->id; + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) { + ret = -EFAULT; + goto err2; + } + + ctx->cm_id = cm_id; + return 0; + +err2: + rdma_destroy_id(cm_id); +err1: + mutex_lock(&mut); + idr_remove(&ctx_idr, ctx->id); + mutex_unlock(&mut); + mutex_lock(&file->mut); + list_del(&ctx->list); + mutex_unlock(&file->mut); + kfree(ctx); + return ret; +} + +static void ucma_cleanup_multicast(struct ucma_context *ctx) +{ + struct ucma_multicast *mc, *tmp; + + mutex_lock(&mut); + list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { + list_del(&mc->list); + idr_remove(&multicast_idr, mc->id); + kfree(mc); + } + mutex_unlock(&mut); +} + +static void ucma_cleanup_mc_events(struct ucma_multicast *mc) +{ + struct ucma_event *uevent, *tmp; + + list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { + if (uevent->mc != mc) + continue; + + list_del(&uevent->list); + kfree(uevent); + } +} + +/* + * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At + * this point, no new events will be reported from the hardware. However, we + * still need to cleanup the UCMA context for this ID. Specifically, there + * might be events that have not yet been consumed by the user space software. 
+ * These might include pending connect requests which we have not completed + * processing. We cannot call rdma_destroy_id while holding the lock of the + * context (file->mut), as it might cause a deadlock. We therefore extract all + * relevant events from the context pending events list while holding the + * mutex. After that we release them as needed. + */ +static int ucma_free_ctx(struct ucma_context *ctx) +{ + int events_reported; + struct ucma_event *uevent, *tmp; + LIST_HEAD(list); + + + ucma_cleanup_multicast(ctx); + + /* Cleanup events not yet reported to the user. */ + mutex_lock(&ctx->file->mut); + list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &list); + } + list_del(&ctx->list); + mutex_unlock(&ctx->file->mut); + + list_for_each_entry_safe(uevent, tmp, &list, list) { + list_del(&uevent->list); + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) + rdma_destroy_id(uevent->cm_id); + kfree(uevent); + } + + events_reported = ctx->events_reported; + kfree(ctx); + return events_reported; +} + +static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_destroy_id cmd; + struct rdma_ucm_destroy_id_resp resp; + struct ucma_context *ctx; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&mut); + ctx = _ucma_find_context(cmd.id, file); + if (!IS_ERR(ctx)) + idr_remove(&ctx_idr, ctx->id); + mutex_unlock(&mut); + + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->file->mut); + ctx->destroying = 1; + mutex_unlock(&ctx->file->mut); + + flush_workqueue(ctx->file->close_wq); + /* At this point it's guaranteed that there is no inflight + * closing task */ + mutex_lock(&mut); + if (!ctx->closing) { + mutex_unlock(&mut); + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + rdma_destroy_id(ctx->cm_id); + } else { + mutex_unlock(&mut); + } + + resp.events_reported = ucma_free_ctx(ctx); + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_bind_ip cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (!rdma_addr_size_in6(&cmd.addr)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_bind cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (cmd.reserved || !cmd.addr_size || + cmd.addr_size != rdma_addr_size_kss(&cmd.addr)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_ip(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_ip cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if ((cmd.src_addr.sin6_family && !rdma_addr_size_in6(&cmd.src_addr)) 
|| + !rdma_addr_size_in6(&cmd.dst_addr)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_addr(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_addr cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (cmd.reserved || + (cmd.src_size && (cmd.src_size != rdma_addr_size_kss(&cmd.src_addr))) || + !cmd.dst_size || (cmd.dst_size != rdma_addr_size_kss(&cmd.dst_addr))) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_route(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_route cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms); + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + struct rdma_dev_addr *dev_addr; + + resp->num_paths = route->num_paths; + switch (route->num_paths) { + case 0: + dev_addr = &route->addr.dev_addr; + rdma_addr_get_dgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].dgid); + rdma_addr_get_sgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].sgid); + resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); + break; + case 2: + ib_copy_path_rec_to_user(&resp->ib_route[1], + &route->path_rec[1]); + /* fall through */ + case 1: + ib_copy_path_rec_to_user(&resp->ib_route[0], + &route->path_rec[0]); + break; + default: + break; + } +} + +static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + + resp->num_paths = route->num_paths; + switch (route->num_paths) { + case 0: + rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, + (union ib_gid *)&resp->ib_route[0].dgid); + rdma_ip2gid((struct sockaddr *)&route->addr.src_addr, + (union ib_gid *)&resp->ib_route[0].sgid); + resp->ib_route[0].pkey = cpu_to_be16(0xffff); + break; + case 2: + ib_copy_path_rec_to_user(&resp->ib_route[1], + &route->path_rec[1]); + /* fall through */ + case 1: + ib_copy_path_rec_to_user(&resp->ib_route[0], + &route->path_rec[0]); + break; + default: + break; + } +} + +static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + struct rdma_dev_addr *dev_addr; + + dev_addr = &route->addr.dev_addr; + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); +} + +static ssize_t ucma_query_route(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_query cmd; + struct rdma_ucm_query_route_resp resp; + struct ucma_context *ctx; + struct sockaddr *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + 
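/*
 * Illustrative sketch, not part of this patch: the ucma_resolve_ip(),
 * ucma_resolve_addr() and ucma_resolve_route() handlers above are what
 * librdmacm's address/route resolution calls reach through
 * /dev/infiniband/rdma_cm. A minimal userspace flow, assuming librdmacm
 * (<rdma/rdma_cma.h>) is available and with error paths trimmed:
 */
#include <rdma/rdma_cma.h>
#include <netdb.h>

static int resolve_peer(const char *host, const char *port,
			struct rdma_cm_id **out_id)
{
	struct rdma_event_channel *ch = rdma_create_event_channel();
	struct rdma_cm_id *id;
	struct rdma_cm_event *ev;
	struct addrinfo *ai;

	if (!ch || rdma_create_id(ch, &id, NULL, RDMA_PS_TCP))
		return -1;
	if (getaddrinfo(host, port, NULL, &ai))
		return -1;

	/* Serviced by the resolve-address handlers above. */
	if (rdma_resolve_addr(id, NULL, ai->ai_addr, 2000))
		return -1;
	rdma_get_cm_event(ch, &ev);	/* RDMA_CM_EVENT_ADDR_RESOLVED */
	rdma_ack_cm_event(ev);

	/* Serviced by ucma_resolve_route() above. */
	if (rdma_resolve_route(id, 2000))
		return -1;
	rdma_get_cm_event(ch, &ev);	/* RDMA_CM_EVENT_ROUTE_RESOLVED */
	rdma_ack_cm_event(ev);

	freeaddrinfo(ai);
	*out_id = id;
	return 0;
}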
if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + memset(&resp, 0, sizeof resp); + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; + memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; + memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + if (!ctx->cm_id->device) + goto out; + + resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; + resp.port_num = ctx->cm_id->port_num; + + if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_ib_route(&resp, &ctx->cm_id->route); + else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_iboe_route(&resp, &ctx->cm_id->route); + else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_iw_route(&resp, &ctx->cm_id->route); + +out: + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + ret = -EFAULT; + + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_query_device_addr(struct rdma_cm_id *cm_id, + struct rdma_ucm_query_addr_resp *resp) +{ + if (!cm_id->device) + return; + + resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->port_num = cm_id->port_num; + resp->pkey = (__force __u16) cpu_to_be16( + ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); +} + +static ssize_t ucma_query_addr(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; + resp.src_size = rdma_addr_size(addr); + memcpy(&resp.src_addr, addr, resp.src_size); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; + resp.dst_size = rdma_addr_size(addr); + memcpy(&resp.dst_addr, addr, resp.dst_size); + + ucma_query_device_addr(ctx->cm_id, &resp); + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query_path(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_path_resp *resp; + int i, ret = 0; + + if (out_len < sizeof(*resp)) + return -ENOSPC; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_paths = ctx->cm_id->route.num_paths; + for (i = 0, out_len -= sizeof(*resp); + i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); + i++, out_len -= sizeof(struct ib_path_rec_data)) { + struct sa_path_rec *rec = &ctx->cm_id->route.path_rec[i]; + + resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL; + if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { + struct sa_path_rec ib; + + sa_convert_path_opa_to_ib(&ib, rec); + ib_sa_pack_path(&ib, &resp->path_data[i].path_rec); + + } else { + ib_sa_pack_path(rec, &resp->path_data[i].path_rec); + } + } + + if (copy_to_user(response, resp, + sizeof(*resp) + (i * sizeof(struct ib_path_rec_data)))) + ret = -EFAULT; + + kfree(resp); + return ret; +} + +static ssize_t ucma_query_gid(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr_ib *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + ucma_query_device_addr(ctx->cm_id, &resp); + + addr = (struct sockaddr_ib *) &resp.src_addr; + resp.src_size = 
sizeof(*addr); + if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr, + NULL); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.src_addr); + } + + addr = (struct sockaddr_ib *) &resp.dst_addr; + resp.dst_size = sizeof(*addr); + if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_read_gids(ctx->cm_id, NULL, + (union ib_gid *)&addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.dst_addr); + } + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_query cmd; + struct ucma_context *ctx; + void __user *response; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + response = u64_to_user_ptr(cmd.response); + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + switch (cmd.option) { + case RDMA_USER_CM_QUERY_ADDR: + ret = ucma_query_addr(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_PATH: + ret = ucma_query_path(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_GID: + ret = ucma_query_gid(ctx, response, out_len); + break; + default: + ret = -ENOSYS; + break; + } + + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_copy_conn_param(struct rdma_cm_id *id, + struct rdma_conn_param *dst, + struct rdma_ucm_conn_param *src) +{ + dst->private_data = src->private_data; + dst->private_data_len = src->private_data_len; + dst->responder_resources =src->responder_resources; + dst->initiator_depth = src->initiator_depth; + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + dst->srq = src->srq; + dst->qp_num = src->qp_num; + dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; +} + +static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_connect cmd; + struct rdma_conn_param conn_param; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (!cmd.conn_param.valid) + return -EINVAL; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); + ret = rdma_connect(ctx->cm_id, &conn_param); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_listen cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ? 
+ cmd.backlog : max_backlog; + ret = rdma_listen(ctx->cm_id, ctx->backlog); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_accept cmd; + struct rdma_conn_param conn_param; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (cmd.conn_param.valid) { + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); + mutex_lock(&file->mut); + ret = __rdma_accept(ctx->cm_id, &conn_param, NULL); + if (!ret) + ctx->uid = cmd.uid; + mutex_unlock(&file->mut); + } else + ret = __rdma_accept(ctx->cm_id, NULL, NULL); + + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_reject cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_disconnect cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_disconnect(ctx->cm_id); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_init_qp_attr(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_init_qp_attr cmd; + struct ib_uverbs_qp_attr resp; + struct ucma_context *ctx; + struct ib_qp_attr qp_attr; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (cmd.qp_state > IB_QPS_ERR) + return -EINVAL; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + resp.qp_attr_mask = 0; + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_state = cmd.qp_state; + ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); + if (ret) + goto out; + + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + ret = -EFAULT; + +out: + ucma_put_ctx(ctx); + return ret; +} + +static int ucma_set_option_id(struct ucma_context *ctx, int optname, + void *optval, size_t optlen) +{ + int ret = 0; + + switch (optname) { + case RDMA_OPTION_ID_TOS: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); + break; + case RDMA_OPTION_ID_REUSEADDR: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0); + break; + case RDMA_OPTION_ID_AFONLY: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 
1 : 0); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static int ucma_set_ib_path(struct ucma_context *ctx, + struct ib_path_rec_data *path_data, size_t optlen) +{ + struct sa_path_rec sa_path; + struct rdma_cm_event event; + int ret; + + if (optlen % sizeof(*path_data)) + return -EINVAL; + + for (; optlen; optlen -= sizeof(*path_data), path_data++) { + if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL)) + break; + } + + if (!optlen) + return -EINVAL; + + if (!ctx->cm_id->device) + return -EINVAL; + + memset(&sa_path, 0, sizeof(sa_path)); + + sa_path.rec_type = SA_PATH_REC_TYPE_IB; + ib_sa_unpack_path(path_data->path_rec, &sa_path); + + if (rdma_cap_opa_ah(ctx->cm_id->device, ctx->cm_id->port_num)) { + struct sa_path_rec opa; + + sa_convert_path_ib_to_opa(&opa, &sa_path); + ret = rdma_set_ib_path(ctx->cm_id, &opa); + } else { + ret = rdma_set_ib_path(ctx->cm_id, &sa_path); + } + if (ret) + return ret; + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + return ucma_event_handler(ctx->cm_id, &event); +} + +static int ucma_set_option_ib(struct ucma_context *ctx, int optname, + void *optval, size_t optlen) +{ + int ret; + + switch (optname) { + case RDMA_OPTION_IB_PATH: + ret = ucma_set_ib_path(ctx, optval, optlen); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static int ucma_set_option_level(struct ucma_context *ctx, int level, + int optname, void *optval, size_t optlen) +{ + int ret; + + switch (level) { + case RDMA_OPTION_ID: + ret = ucma_set_option_id(ctx, optname, optval, optlen); + break; + case RDMA_OPTION_IB: + ret = ucma_set_option_ib(ctx, optname, optval, optlen); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_set_option cmd; + struct ucma_context *ctx; + void *optval; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (unlikely(cmd.optlen > KMALLOC_MAX_SIZE)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + optval = memdup_user(u64_to_user_ptr(cmd.optval), + cmd.optlen); + if (IS_ERR(optval)) { + ret = PTR_ERR(optval); + goto out; + } + + ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, + cmd.optlen); + kfree(optval); + +out: + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_notify cmd; + struct ucma_context *ctx; + int ret = -EINVAL; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (ctx->cm_id->device) + ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event); + + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_process_join(struct ucma_file *file, + struct rdma_ucm_join_mcast *cmd, int out_len) +{ + struct rdma_ucm_create_id_resp resp; + struct ucma_context *ctx; + struct ucma_multicast *mc; + struct sockaddr *addr; + int ret; + u8 join_state; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + addr = (struct sockaddr *) &cmd->addr; + if (cmd->addr_size != rdma_addr_size(addr)) + return -EINVAL; + + if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER) + join_state = BIT(FULLMEMBER_JOIN); + else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER) + join_state = BIT(SENDONLY_FULLMEMBER_JOIN); + 
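/*
 * Illustrative sketch, not part of this patch: ucma_copy_conn_param()
 * and the connect/accept/reject handlers above back librdmacm's
 * connection setup. A hedged outline of both sides, assuming each id
 * already has a QP (rdma_create_qp()) and, on the active side, a
 * resolved route; error handling and event loops are omitted:
 */
#include <rdma/rdma_cma.h>

static int active_connect(struct rdma_cm_id *id)
{
	struct rdma_conn_param param = {
		.responder_resources = 1,
		.initiator_depth     = 1,
		.retry_count         = 7,
		.rnr_retry_count     = 7,
	};

	/* write()s RDMA_USER_CM_CMD_CONNECT, handled by ucma_connect(). */
	return rdma_connect(id, &param);
}

static int passive_accept(struct rdma_cm_event *connect_req)
{
	/* connect_req->event == RDMA_CM_EVENT_CONNECT_REQUEST */
	struct rdma_cm_id *conn = connect_req->id;

	/*
	 * Reaches ucma_accept() above; rdma_reject(conn, NULL, 0) would
	 * instead reach ucma_reject().
	 */
	return rdma_accept(conn, NULL);
}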
else + return -EINVAL; + + ctx = ucma_get_ctx_dev(file, cmd->id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&file->mut); + mc = ucma_alloc_multicast(ctx); + if (!mc) { + ret = -ENOMEM; + goto err1; + } + mc->join_state = join_state; + mc->uid = cmd->uid; + memcpy(&mc->addr, addr, cmd->addr_size); + ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, + join_state, mc); + if (ret) + goto err2; + + resp.id = mc->id; + if (copy_to_user(u64_to_user_ptr(cmd->response), + &resp, sizeof(resp))) { + ret = -EFAULT; + goto err3; + } + + mutex_unlock(&file->mut); + ucma_put_ctx(ctx); + return 0; + +err3: + rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); + ucma_cleanup_mc_events(mc); +err2: + mutex_lock(&mut); + idr_remove(&multicast_idr, mc->id); + mutex_unlock(&mut); + list_del(&mc->list); + kfree(mc); +err1: + mutex_unlock(&file->mut); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_join_ip_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_ip_mcast cmd; + struct rdma_ucm_join_mcast join_cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + join_cmd.response = cmd.response; + join_cmd.uid = cmd.uid; + join_cmd.id = cmd.id; + join_cmd.addr_size = rdma_addr_size_in6(&cmd.addr); + if (!join_cmd.addr_size) + return -EINVAL; + + join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER; + memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); + + return ucma_process_join(file, &join_cmd, out_len); +} + +static ssize_t ucma_join_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_mcast cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (!rdma_addr_size_kss(&cmd.addr)) + return -EINVAL; + + return ucma_process_join(file, &cmd, out_len); +} + +static ssize_t ucma_leave_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_destroy_id cmd; + struct rdma_ucm_destroy_id_resp resp; + struct ucma_multicast *mc; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&mut); + mc = idr_find(&multicast_idr, cmd.id); + if (!mc) + mc = ERR_PTR(-ENOENT); + else if (mc->ctx->file != file) + mc = ERR_PTR(-EINVAL); + else if (!atomic_inc_not_zero(&mc->ctx->ref)) + mc = ERR_PTR(-ENXIO); + else + idr_remove(&multicast_idr, mc->id); + mutex_unlock(&mut); + + if (IS_ERR(mc)) { + ret = PTR_ERR(mc); + goto out; + } + + rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); + mutex_lock(&mc->ctx->file->mut); + ucma_cleanup_mc_events(mc); + list_del(&mc->list); + mutex_unlock(&mc->ctx->file->mut); + + ucma_put_ctx(mc->ctx); + resp.events_reported = mc->events_reported; + kfree(mc); + + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + ret = -EFAULT; +out: + return ret; +} + +static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2) +{ + /* Acquire mutex's based on pointer comparison to prevent deadlock. 
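/*
 * Illustrative sketch, not part of this patch: ucma_process_join() and
 * the join/leave handlers above are the kernel side of librdmacm's
 * multicast API. A minimal sequence, assuming 'id' is already bound or
 * address-resolved so that a device is attached:
 */
#include <rdma/rdma_cma.h>

static int join_then_leave(struct rdma_cm_id *id, struct sockaddr *mgid_addr)
{
	/* Full-member join (ucma_join_multicast()/ucma_join_ip_multicast()). */
	if (rdma_join_multicast(id, mgid_addr, NULL))
		return -1;

	/* ... wait for RDMA_CM_EVENT_MULTICAST_JOIN, use the group ... */

	/* ucma_leave_multicast() releases the kernel-side ucma_multicast. */
	return rdma_leave_multicast(id, mgid_addr);
}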
*/ + if (file1 < file2) { + mutex_lock(&file1->mut); + mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&file2->mut); + mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING); + } +} + +static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2) +{ + if (file1 < file2) { + mutex_unlock(&file2->mut); + mutex_unlock(&file1->mut); + } else { + mutex_unlock(&file1->mut); + mutex_unlock(&file2->mut); + } +} + +static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file) +{ + struct ucma_event *uevent, *tmp; + + list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &file->event_list); +} + +static ssize_t ucma_migrate_id(struct ucma_file *new_file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_migrate_id cmd; + struct rdma_ucm_migrate_resp resp; + struct ucma_context *ctx; + struct fd f; + struct ucma_file *cur_file; + int ret = 0; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + /* Get current fd to protect against it being closed */ + f = fdget(cmd.fd); + if (!f.file) + return -ENOENT; + + /* Validate current fd and prevent destruction of id. */ + ctx = ucma_get_ctx(f.file->private_data, cmd.id); + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); + goto file_put; + } + + cur_file = ctx->file; + if (cur_file == new_file) { + resp.events_reported = ctx->events_reported; + goto response; + } + + /* + * Migrate events between fd's, maintaining order, and avoiding new + * events being added before existing events. + */ + ucma_lock_files(cur_file, new_file); + mutex_lock(&mut); + + list_move_tail(&ctx->list, &new_file->ctx_list); + ucma_move_events(ctx, new_file); + ctx->file = new_file; + resp.events_reported = ctx->events_reported; + + mutex_unlock(&mut); + ucma_unlock_files(cur_file, new_file); + +response: + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + ret = -EFAULT; + + ucma_put_ctx(ctx); +file_put: + fdput(f); + return ret; +} + +static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) = { + [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, + [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, + [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip, + [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip, + [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route, + [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, + [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, + [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, + [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, + [RDMA_USER_CM_CMD_REJECT] = ucma_reject, + [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, + [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, + [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, + [RDMA_USER_CM_CMD_GET_OPTION] = NULL, + [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, + [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, + [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast, + [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, + [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id, + [RDMA_USER_CM_CMD_QUERY] = ucma_query, + [RDMA_USER_CM_CMD_BIND] = ucma_bind, + [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast +}; + +static ssize_t ucma_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct ucma_file *file = filp->private_data; + struct rdma_ucm_cmd_hdr hdr; + ssize_t ret; + + if (!ib_safe_file_access(filp)) { + 
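/*
 * Illustrative sketch, not part of this patch: each ucma_cmd_table[]
 * entry above is invoked by writing a struct rdma_ucm_cmd_hdr followed
 * immediately by that command's payload to /dev/infiniband/rdma_cm
 * (see ucma_write() below). Issuing RDMA_USER_CM_CMD_DESTROY_ID for an
 * already-created context id, using the uapi structs from
 * <rdma/rdma_user_cm.h>, would look roughly like this:
 */
#include <rdma/rdma_user_cm.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

static int destroy_ucma_id(int cm_fd, __u32 ctx_id)
{
	struct rdma_ucm_cmd_hdr hdr;
	struct rdma_ucm_destroy_id cmd;
	struct rdma_ucm_destroy_id_resp resp;
	char buf[sizeof(hdr) + sizeof(cmd)];

	memset(&cmd, 0, sizeof(cmd));
	cmd.id = ctx_id;
	cmd.response = (__u64)(uintptr_t)&resp;	/* kernel copy_to_user() target */

	memset(&hdr, 0, sizeof(hdr));
	hdr.cmd = RDMA_USER_CM_CMD_DESTROY_ID;
	hdr.in  = sizeof(cmd);	/* must fit within the write length */
	hdr.out = sizeof(resp);	/* must be >= sizeof(resp), or -ENOSPC */

	memcpy(buf, &hdr, sizeof(hdr));
	memcpy(buf + sizeof(hdr), &cmd, sizeof(cmd));

	/* ucma_write() returns the full length on success. */
	return write(cm_fd, buf, sizeof(buf)) == sizeof(buf) ? 0 : -1;
}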
pr_err_once("ucma_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", + task_tgid_vnr(current), current->comm); + return -EACCES; + } + + if (len < sizeof(hdr)) + return -EINVAL; + + if (copy_from_user(&hdr, buf, sizeof(hdr))) + return -EFAULT; + + if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) + return -EINVAL; + + if (hdr.in + sizeof(hdr) > len) + return -EINVAL; + + if (!ucma_cmd_table[hdr.cmd]) + return -ENOSYS; + + ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out); + if (!ret) + ret = len; + + return ret; +} + +static __poll_t ucma_poll(struct file *filp, struct poll_table_struct *wait) +{ + struct ucma_file *file = filp->private_data; + __poll_t mask = 0; + + poll_wait(filp, &file->poll_wait, wait); + + if (!list_empty(&file->event_list)) + mask = EPOLLIN | EPOLLRDNORM; + + return mask; +} + +/* + * ucma_open() does not need the BKL: + * + * - no global state is referred to; + * - there is no ioctl method to race against; + * - no further module initialization is required for open to work + * after the device is registered. + */ +static int ucma_open(struct inode *inode, struct file *filp) +{ + struct ucma_file *file; + + file = kmalloc(sizeof *file, GFP_KERNEL); + if (!file) + return -ENOMEM; + + file->close_wq = alloc_ordered_workqueue("ucma_close_id", + WQ_MEM_RECLAIM); + if (!file->close_wq) { + kfree(file); + return -ENOMEM; + } + + INIT_LIST_HEAD(&file->event_list); + INIT_LIST_HEAD(&file->ctx_list); + init_waitqueue_head(&file->poll_wait); + mutex_init(&file->mut); + + filp->private_data = file; + file->filp = filp; + + return nonseekable_open(inode, filp); +} + +static int ucma_close(struct inode *inode, struct file *filp) +{ + struct ucma_file *file = filp->private_data; + struct ucma_context *ctx, *tmp; + + mutex_lock(&file->mut); + list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { + ctx->destroying = 1; + mutex_unlock(&file->mut); + + mutex_lock(&mut); + idr_remove(&ctx_idr, ctx->id); + mutex_unlock(&mut); + + flush_workqueue(file->close_wq); + /* At that step once ctx was marked as destroying and workqueue + * was flushed we are safe from any inflights handlers that + * might put other closing task. + */ + mutex_lock(&mut); + if (!ctx->closing) { + mutex_unlock(&mut); + /* rdma_destroy_id ensures that no event handlers are + * inflight for that id before releasing it. 
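/*
 * Illustrative sketch, not part of this patch: ucma_poll() above is what
 * lets the rdma_cm file descriptor be multiplexed with poll()/epoll().
 * librdmacm exposes that same descriptor as rdma_event_channel::fd, so
 * an event wait with a timeout can be built as:
 */
#include <rdma/rdma_cma.h>
#include <poll.h>

static int wait_cm_event(struct rdma_event_channel *ch, int timeout_ms)
{
	struct pollfd pfd = { .fd = ch->fd, .events = POLLIN };
	struct rdma_cm_event *ev;
	int ret = poll(&pfd, 1, timeout_ms);

	if (ret <= 0)
		return ret;	/* 0 on timeout, -1 on error */

	/* POLLIN corresponds to a non-empty file->event_list above. */
	if (rdma_get_cm_event(ch, &ev))
		return -1;
	ret = ev->event;
	rdma_ack_cm_event(ev);
	return ret;
}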
+ */ + rdma_destroy_id(ctx->cm_id); + } else { + mutex_unlock(&mut); + } + + ucma_free_ctx(ctx); + mutex_lock(&file->mut); + } + mutex_unlock(&file->mut); + destroy_workqueue(file->close_wq); + kfree(file); + return 0; +} + +static const struct file_operations ucma_fops = { + .owner = THIS_MODULE, + .open = ucma_open, + .release = ucma_close, + .write = ucma_write, + .poll = ucma_poll, + .llseek = no_llseek, +}; + +static struct miscdevice ucma_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "rdma_cm", + .nodename = "infiniband/rdma_cm", + .mode = 0666, + .fops = &ucma_fops, +}; + +static ssize_t show_abi_version(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); +} +static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); + +static int __init ucma_init(void) +{ + int ret; + + ret = misc_register(&ucma_misc); + if (ret) + return ret; + + ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); + if (ret) { + pr_err("rdma_ucm: couldn't create abi_version attr\n"); + goto err1; + } + + ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table); + if (!ucma_ctl_table_hdr) { + pr_err("rdma_ucm: couldn't register sysctl paths\n"); + ret = -ENOMEM; + goto err2; + } + return 0; +err2: + device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); +err1: + misc_deregister(&ucma_misc); + return ret; +} + +static void __exit ucma_cleanup(void) +{ + unregister_net_sysctl_table(ucma_ctl_table_hdr); + device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); + misc_deregister(&ucma_misc); + idr_destroy(&ctx_idr); + idr_destroy(&multicast_idr); +} + +module_init(ucma_init); +module_exit(ucma_cleanup); diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c new file mode 100644 index 0000000..29a45d2 --- /dev/null +++ b/drivers/infiniband/core/ud_header.c @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include + +#define STRUCT_FIELD(header, field) \ + .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ + .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \ + .field_name = #header ":" #field + +static const struct ib_field lrh_table[] = { + { STRUCT_FIELD(lrh, virtual_lane), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(lrh, link_version), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 4 }, + { STRUCT_FIELD(lrh, service_level), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 4 }, + { RESERVED, + .offset_words = 0, + .offset_bits = 12, + .size_bits = 2 }, + { STRUCT_FIELD(lrh, link_next_header), + .offset_words = 0, + .offset_bits = 14, + .size_bits = 2 }, + { STRUCT_FIELD(lrh, destination_lid), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { RESERVED, + .offset_words = 1, + .offset_bits = 0, + .size_bits = 5 }, + { STRUCT_FIELD(lrh, packet_length), + .offset_words = 1, + .offset_bits = 5, + .size_bits = 11 }, + { STRUCT_FIELD(lrh, source_lid), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + +static const struct ib_field eth_table[] = { + { STRUCT_FIELD(eth, dmac_h), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(eth, dmac_l), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(eth, smac_h), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(eth, smac_l), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(eth, type), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 16 } +}; + +static const struct ib_field vlan_table[] = { + { STRUCT_FIELD(vlan, tag), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(vlan, type), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 } +}; + +static const struct ib_field ip4_table[] = { + { STRUCT_FIELD(ip4, ver), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, hdr_len), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, tos), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, tot_len), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, id), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, frag_off), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, ttl), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, protocol), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, check), + .offset_words = 2, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, saddr), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(ip4, daddr), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 32 } +}; + +static const struct ib_field udp_table[] = { + { STRUCT_FIELD(udp, sport), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, dport), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(udp, length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, csum), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + +static const struct ib_field grh_table[] = { + { STRUCT_FIELD(grh, ip_version), + .offset_words = 0, + .offset_bits = 
0, + .size_bits = 4 }, + { STRUCT_FIELD(grh, traffic_class), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 8 }, + { STRUCT_FIELD(grh, flow_label), + .offset_words = 0, + .offset_bits = 12, + .size_bits = 20 }, + { STRUCT_FIELD(grh, payload_length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(grh, next_header), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 8 }, + { STRUCT_FIELD(grh, hop_limit), + .offset_words = 1, + .offset_bits = 24, + .size_bits = 8 }, + { STRUCT_FIELD(grh, source_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { STRUCT_FIELD(grh, destination_gid), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 128 } +}; + +static const struct ib_field bth_table[] = { + { STRUCT_FIELD(bth, opcode), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(bth, solicited_event), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 1 }, + { STRUCT_FIELD(bth, mig_req), + .offset_words = 0, + .offset_bits = 9, + .size_bits = 1 }, + { STRUCT_FIELD(bth, pad_count), + .offset_words = 0, + .offset_bits = 10, + .size_bits = 2 }, + { STRUCT_FIELD(bth, transport_header_version), + .offset_words = 0, + .offset_bits = 12, + .size_bits = 4 }, + { STRUCT_FIELD(bth, pkey), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { RESERVED, + .offset_words = 1, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(bth, destination_qpn), + .offset_words = 1, + .offset_bits = 8, + .size_bits = 24 }, + { STRUCT_FIELD(bth, ack_req), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 1 }, + { RESERVED, + .offset_words = 2, + .offset_bits = 1, + .size_bits = 7 }, + { STRUCT_FIELD(bth, psn), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 24 } +}; + +static const struct ib_field deth_table[] = { + { STRUCT_FIELD(deth, qkey), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 32 }, + { RESERVED, + .offset_words = 1, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(deth, source_qpn), + .offset_words = 1, + .offset_bits = 8, + .size_bits = 24 } +}; + +__sum16 ib_ud_ip4_csum(struct ib_ud_header *header) +{ + struct iphdr iph; + + iph.ihl = 5; + iph.version = 4; + iph.tos = header->ip4.tos; + iph.tot_len = header->ip4.tot_len; + iph.id = header->ip4.id; + iph.frag_off = header->ip4.frag_off; + iph.ttl = header->ip4.ttl; + iph.protocol = header->ip4.protocol; + iph.check = 0; + iph.saddr = header->ip4.saddr; + iph.daddr = header->ip4.daddr; + + return ip_fast_csum((u8 *)&iph, iph.ihl); +} +EXPORT_SYMBOL(ib_ud_ip4_csum); + +/** + * ib_ud_header_init - Initialize UD header structure + * @payload_bytes:Length of packet payload + * @lrh_present: specify if LRH is present + * @eth_present: specify if Eth header is present + * @vlan_present: packet is tagged vlan + * @grh_present: GRH flag (if non-zero, GRH will be included) + * @ip_version: if non-zero, IP header, V4 or V6, will be included + * @udp_present :if non-zero, UDP header will be included + * @immediate_present: specify if immediate data is present + * @header:Structure to initialize + */ +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header) +{ + size_t udp_bytes = udp_present ? 
IB_UDP_BYTES : 0; + + grh_present = grh_present && !ip_version; + memset(header, 0, sizeof *header); + + /* + * UDP header without IP header doesn't make sense + */ + if (udp_present && ip_version != 4 && ip_version != 6) + return -EINVAL; + + if (lrh_present) { + u16 packet_length; + + header->lrh.link_version = 0; + header->lrh.link_next_header = + grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL; + packet_length = (IB_LRH_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + (grh_present ? IB_GRH_BYTES : 0) + + payload_bytes + + 4 + /* ICRC */ + 3) / 4; /* round up */ + header->lrh.packet_length = cpu_to_be16(packet_length); + } + + if (vlan_present) + header->eth.type = cpu_to_be16(ETH_P_8021Q); + + if (ip_version == 6 || grh_present) { + header->grh.ip_version = 6; + header->grh.payload_length = + cpu_to_be16((udp_bytes + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4 + /* ICRC */ + 3) & ~3); /* round up */ + header->grh.next_header = udp_present ? IPPROTO_UDP : 0x1b; + } + + if (ip_version == 4) { + header->ip4.ver = 4; /* version 4 */ + header->ip4.hdr_len = 5; /* 5 words */ + header->ip4.tot_len = + cpu_to_be16(IB_IP4_BYTES + + udp_bytes + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ + header->ip4.protocol = IPPROTO_UDP; + } + if (udp_present && ip_version) + header->udp.length = + cpu_to_be16(IB_UDP_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ + + if (immediate_present) + header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + else + header->bth.opcode = IB_OPCODE_UD_SEND_ONLY; + header->bth.pad_count = (4 - payload_bytes) & 3; + header->bth.transport_header_version = 0; + + header->lrh_present = lrh_present; + header->eth_present = eth_present; + header->vlan_present = vlan_present; + header->grh_present = grh_present || (ip_version == 6); + header->ipv4_present = ip_version == 4; + header->udp_present = udp_present; + header->immediate_present = immediate_present; + return 0; +} +EXPORT_SYMBOL(ib_ud_header_init); + +/** + * ib_ud_header_pack - Pack UD header struct into wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() packs the UD header structure @header into wire + * format in the buffer @buf. 
+ */ +int ib_ud_header_pack(struct ib_ud_header *header, + void *buf) +{ + int len = 0; + + if (header->lrh_present) { + ib_pack(lrh_table, ARRAY_SIZE(lrh_table), + &header->lrh, buf + len); + len += IB_LRH_BYTES; + } + if (header->eth_present) { + ib_pack(eth_table, ARRAY_SIZE(eth_table), + &header->eth, buf + len); + len += IB_ETH_BYTES; + } + if (header->vlan_present) { + ib_pack(vlan_table, ARRAY_SIZE(vlan_table), + &header->vlan, buf + len); + len += IB_VLAN_BYTES; + } + if (header->grh_present) { + ib_pack(grh_table, ARRAY_SIZE(grh_table), + &header->grh, buf + len); + len += IB_GRH_BYTES; + } + if (header->ipv4_present) { + ib_pack(ip4_table, ARRAY_SIZE(ip4_table), + &header->ip4, buf + len); + len += IB_IP4_BYTES; + } + if (header->udp_present) { + ib_pack(udp_table, ARRAY_SIZE(udp_table), + &header->udp, buf + len); + len += IB_UDP_BYTES; + } + + ib_pack(bth_table, ARRAY_SIZE(bth_table), + &header->bth, buf + len); + len += IB_BTH_BYTES; + + ib_pack(deth_table, ARRAY_SIZE(deth_table), + &header->deth, buf + len); + len += IB_DETH_BYTES; + + if (header->immediate_present) { + memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data); + len += sizeof header->immediate_data; + } + + return len; +} +EXPORT_SYMBOL(ib_ud_header_pack); + +/** + * ib_ud_header_unpack - Unpack UD header struct from wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() unpacks the UD header structure @header from wire + * format in the buffer @buf. + */ +int ib_ud_header_unpack(void *buf, + struct ib_ud_header *header) +{ + ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), + buf, &header->lrh); + buf += IB_LRH_BYTES; + + if (header->lrh.link_version != 0) { + pr_warn("Invalid LRH.link_version %d\n", + header->lrh.link_version); + return -EINVAL; + } + + switch (header->lrh.link_next_header) { + case IB_LNH_IBA_LOCAL: + header->grh_present = 0; + break; + + case IB_LNH_IBA_GLOBAL: + header->grh_present = 1; + ib_unpack(grh_table, ARRAY_SIZE(grh_table), + buf, &header->grh); + buf += IB_GRH_BYTES; + + if (header->grh.ip_version != 6) { + pr_warn("Invalid GRH.ip_version %d\n", + header->grh.ip_version); + return -EINVAL; + } + if (header->grh.next_header != 0x1b) { + pr_warn("Invalid GRH.next_header 0x%02x\n", + header->grh.next_header); + return -EINVAL; + } + break; + + default: + pr_warn("Invalid LRH.link_next_header %d\n", + header->lrh.link_next_header); + return -EINVAL; + } + + ib_unpack(bth_table, ARRAY_SIZE(bth_table), + buf, &header->bth); + buf += IB_BTH_BYTES; + + switch (header->bth.opcode) { + case IB_OPCODE_UD_SEND_ONLY: + header->immediate_present = 0; + break; + case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE: + header->immediate_present = 1; + break; + default: + pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode); + return -EINVAL; + } + + if (header->bth.transport_header_version != 0) { + pr_warn("Invalid BTH.transport_header_version %d\n", + header->bth.transport_header_version); + return -EINVAL; + } + + ib_unpack(deth_table, ARRAY_SIZE(deth_table), + buf, &header->deth); + buf += IB_DETH_BYTES; + + if (header->immediate_present) + memcpy(&header->immediate_data, buf, sizeof header->immediate_data); + + return 0; +} +EXPORT_SYMBOL(ib_ud_header_unpack); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c new file mode 100644 index 0000000..2b6c9b5 --- /dev/null +++ b/drivers/infiniband/core/umem.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. 
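/*
 * Illustrative sketch, not part of this patch: a typical in-kernel user
 * of the ib_ud_header helpers above (e.g. an HCA driver building a UD
 * send) sizes the header with ib_ud_header_init(), fills the unpacked
 * fields, then flattens them with ib_ud_header_pack(). The field values
 * below are made-up examples:
 */
#include <rdma/ib_pack.h>

static int build_example_ud_header(void *wire_buf, int payload_bytes,
				   int grh_present)
{
	struct ib_ud_header hdr;
	int ret;

	/* LRH present; no Ethernet, VLAN, IP or UDP; no immediate data. */
	ret = ib_ud_header_init(payload_bytes, 1, 0, 0, grh_present,
				0, 0, 0, &hdr);
	if (ret)
		return ret;

	hdr.lrh.service_level   = 0;
	hdr.lrh.destination_lid = cpu_to_be16(0x1234);		/* example DLID */
	hdr.lrh.source_lid      = cpu_to_be16(0x0001);
	hdr.bth.pkey            = cpu_to_be16(0xffff);		/* default pkey */
	hdr.bth.destination_qpn = cpu_to_be32(0x000024);	/* example QPN */
	hdr.deth.qkey           = cpu_to_be32(0x11111111);
	hdr.deth.source_qpn     = cpu_to_be32(0x000025);

	/* Returns the number of header bytes written into wire_buf. */
	return ib_ud_header_pack(&hdr, wire_buf);
}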
+ * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "uverbs.h" + + +static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) +{ + struct scatterlist *sg; + struct page *page; + int i; + + if (umem->nmap > 0) + ib_dma_unmap_sg(dev, umem->sg_head.sgl, + umem->npages, + DMA_BIDIRECTIONAL); + + for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { + + page = sg_page(sg); + if (!PageDirty(page) && umem->writable && dirty) + set_page_dirty_lock(page); + put_page(page); + } + + sg_free_table(&umem->sg_head); + return; + +} + +/** + * ib_umem_get - Pin and DMA map userspace memory. + * + * If access flags indicate ODP memory, avoid pinning. Instead, stores + * the mm for future page fault handling in conjunction with MMU notifiers. + * + * @context: userspace context to pin memory for + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * @dmasync: flush in-flight DMA when the memory region is written + */ +struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, + size_t size, int access, int dmasync) +{ + struct ib_umem *umem; + struct page **page_list; + struct vm_area_struct **vma_list; + unsigned long locked; + unsigned long lock_limit; + unsigned long cur_base; + unsigned long npages; + int ret; + int i; + unsigned long dma_attrs = 0; + struct scatterlist *sg, *sg_list_start; + int need_release = 0; + unsigned int gup_flags = FOLL_WRITE; + + if (dmasync) + dma_attrs |= DMA_ATTR_WRITE_BARRIER; + + /* + * If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. 
+ */ + if (((addr + size) < addr) || + PAGE_ALIGN(addr + size) < (addr + size)) + return ERR_PTR(-EINVAL); + + if (!can_do_mlock()) + return ERR_PTR(-EPERM); + + umem = kzalloc(sizeof *umem, GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + umem->context = context; + umem->length = size; + umem->address = addr; + umem->page_shift = PAGE_SHIFT; + /* + * We ask for writable memory if any of the following + * access flags are set. "Local write" and "remote write" + * obviously require write access. "Remote atomic" can do + * things like fetch and add, which will modify memory, and + * "MW bind" can change permissions by binding a window. + */ + umem->writable = !!(access & + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); + + if (access & IB_ACCESS_ON_DEMAND) { + ret = ib_umem_odp_get(context, umem, access); + if (ret) { + kfree(umem); + return ERR_PTR(ret); + } + return umem; + } + + umem->odp_data = NULL; + + /* We assume the memory is from hugetlb until proved otherwise */ + umem->hugetlb = 1; + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) { + kfree(umem); + return ERR_PTR(-ENOMEM); + } + + /* + * if we can't alloc the vma_list, it's not so bad; + * just assume the memory is not hugetlb memory + */ + vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); + if (!vma_list) + umem->hugetlb = 0; + + npages = ib_umem_num_pages(umem); + + down_write(¤t->mm->mmap_sem); + + locked = npages + current->mm->pinned_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto out; + } + + cur_base = addr & PAGE_MASK; + + if (npages == 0 || npages > UINT_MAX) { + ret = -EINVAL; + goto out; + } + + ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); + if (ret) + goto out; + + if (!umem->writable) + gup_flags |= FOLL_FORCE; + + need_release = 1; + sg_list_start = umem->sg_head.sgl; + + while (npages) { + ret = get_user_pages_longterm(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof (struct page *)), + gup_flags, page_list, vma_list); + + if (ret < 0) + goto out; + + umem->npages += ret; + cur_base += ret * PAGE_SIZE; + npages -= ret; + + for_each_sg(sg_list_start, sg, ret, i) { + if (vma_list && !is_vm_hugetlb_page(vma_list[i])) + umem->hugetlb = 0; + + sg_set_page(sg, page_list[i], PAGE_SIZE, 0); + } + + /* preparing for next loop */ + sg_list_start = sg; + } + + umem->nmap = ib_dma_map_sg_attrs(context->device, + umem->sg_head.sgl, + umem->npages, + DMA_BIDIRECTIONAL, + dma_attrs); + + if (umem->nmap <= 0) { + ret = -ENOMEM; + goto out; + } + + ret = 0; + +out: + if (ret < 0) { + if (need_release) + __ib_umem_release(context->device, umem, 0); + kfree(umem); + } else + current->mm->pinned_vm = locked; + + up_write(¤t->mm->mmap_sem); + if (vma_list) + free_page((unsigned long) vma_list); + free_page((unsigned long) page_list); + + return ret < 0 ? 
ERR_PTR(ret) : umem; +} +EXPORT_SYMBOL(ib_umem_get); + +static void ib_umem_account(struct work_struct *work) +{ + struct ib_umem *umem = container_of(work, struct ib_umem, work); + + down_write(&umem->mm->mmap_sem); + umem->mm->pinned_vm -= umem->diff; + up_write(&umem->mm->mmap_sem); + mmput(umem->mm); + kfree(umem); +} + +/** + * ib_umem_release - release memory pinned with ib_umem_get + * @umem: umem struct to release + */ +void ib_umem_release(struct ib_umem *umem) +{ + struct ib_ucontext *context = umem->context; + struct mm_struct *mm; + struct task_struct *task; + unsigned long diff; + + if (umem->odp_data) { + ib_umem_odp_release(umem); + return; + } + + __ib_umem_release(umem->context->device, umem, 1); + + task = get_pid_task(umem->context->tgid, PIDTYPE_PID); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + diff = ib_umem_num_pages(umem); + + /* + * We may be called with the mm's mmap_sem already held. This + * can happen when a userspace munmap() is the call that drops + * the last reference to our file and calls our release + * method. If there are memory regions to destroy, we'll end + * up here and not be able to take the mmap_sem. In that case + * we defer the vm_locked accounting to the system workqueue. + */ + if (context->closing) { + if (!down_write_trylock(&mm->mmap_sem)) { + INIT_WORK(&umem->work, ib_umem_account); + umem->mm = mm; + umem->diff = diff; + + queue_work(ib_wq, &umem->work); + return; + } + } else + down_write(&mm->mmap_sem); + + mm->pinned_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); +out: + kfree(umem); +} +EXPORT_SYMBOL(ib_umem_release); + +int ib_umem_page_count(struct ib_umem *umem) +{ + int i; + int n; + struct scatterlist *sg; + + if (umem->odp_data) + return ib_umem_num_pages(umem); + + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) + n += sg_dma_len(sg) >> umem->page_shift; + + return n; +} +EXPORT_SYMBOL(ib_umem_page_count); + +/* + * Copy from the given ib_umem's pages to the given buffer. + * + * umem - the umem to copy from + * offset - offset to start copying from + * dst - destination buffer + * length - buffer length + * + * Returns 0 on success, or an error code. + */ +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) +{ + size_t end = offset + length; + int ret; + + if (offset > umem->length || length > umem->length - offset) { + pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n", + offset, umem->length, end); + return -EINVAL; + } + + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length, + offset + ib_umem_offset(umem)); + + if (ret < 0) + return ret; + else if (ret != length) + return -EINVAL; + else + return 0; +} +EXPORT_SYMBOL(ib_umem_copy_from); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c new file mode 100644 index 0000000..2aadf58 --- /dev/null +++ b/drivers/infiniband/core/umem_odp.c @@ -0,0 +1,828 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
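/*
 * Illustrative sketch, not part of this patch: hardware drivers normally
 * call ib_umem_get() from their reg_user_mr path and then walk the
 * DMA-mapped scatterlist to program the device's address translation;
 * the "program the device" step below is only a placeholder:
 */
static int example_pin_user_memory(struct ib_ucontext *ucontext,
				   unsigned long addr, size_t length,
				   int access)
{
	struct ib_umem *umem;
	struct scatterlist *sg;
	int i;

	umem = ib_umem_get(ucontext, addr, length, access, 0 /* dmasync */);
	if (IS_ERR(umem))
		return PTR_ERR(umem);

	pr_debug("pinned %d DMA pages\n", ib_umem_page_count(umem));

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
		dma_addr_t dma_addr = sg_dma_address(sg);
		unsigned int dma_len = sg_dma_len(sg);

		/* ... write dma_addr/dma_len into the HW translation table ... */
		(void)dma_addr;
		(void)dma_len;
	}

	ib_umem_release(umem);	/* unmaps, unpins and adjusts pinned_vm */
	return 0;
}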
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * The ib_umem list keeps track of memory regions for which the HW + * device request to receive notification when the related memory + * mapping is changed. + * + * ib_umem_lock protects the list. + */ + +static u64 node_start(struct umem_odp_node *n) +{ + struct ib_umem_odp *umem_odp = + container_of(n, struct ib_umem_odp, interval_tree); + + return ib_umem_start(umem_odp->umem); +} + +/* Note that the representation of the intervals in the interval tree + * considers the ending point as contained in the interval, while the + * function ib_umem_end returns the first address which is not contained + * in the umem. + */ +static u64 node_last(struct umem_odp_node *n) +{ + struct ib_umem_odp *umem_odp = + container_of(n, struct ib_umem_odp, interval_tree); + + return ib_umem_end(umem_odp->umem) - 1; +} + +INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, + node_start, node_last, static, rbt_ib_umem) + +static void ib_umem_notifier_start_account(struct ib_umem *item) +{ + mutex_lock(&item->odp_data->umem_mutex); + + /* Only update private counters for this umem if it has them. + * Otherwise skip it. All page faults will be delayed for this umem. */ + if (item->odp_data->mn_counters_active) { + int notifiers_count = item->odp_data->notifiers_count++; + + if (notifiers_count == 0) + /* Initialize the completion object for waiting on + * notifiers. Since notifier_count is zero, no one + * should be waiting right now. */ + reinit_completion(&item->odp_data->notifier_completion); + } + mutex_unlock(&item->odp_data->umem_mutex); +} + +static void ib_umem_notifier_end_account(struct ib_umem *item) +{ + mutex_lock(&item->odp_data->umem_mutex); + + /* Only update private counters for this umem if it has them. + * Otherwise skip it. All page faults will be delayed for this umem. */ + if (item->odp_data->mn_counters_active) { + /* + * This sequence increase will notify the QP page fault that + * the page that is going to be mapped in the spte could have + * been freed. 
+ */ + ++item->odp_data->notifiers_seq; + if (--item->odp_data->notifiers_count == 0) + complete_all(&item->odp_data->notifier_completion); + } + mutex_unlock(&item->odp_data->umem_mutex); +} + +/* Account for a new mmu notifier in an ib_ucontext. */ +static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) +{ + atomic_inc(&context->notifier_count); +} + +/* Account for a terminating mmu notifier in an ib_ucontext. + * + * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since + * the function takes the semaphore itself. */ +static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) +{ + int zero_notifiers = atomic_dec_and_test(&context->notifier_count); + + if (zero_notifiers && + !list_empty(&context->no_private_counters)) { + /* No currently running mmu notifiers. Now is the chance to + * add private accounting to all previously added umems. */ + struct ib_umem_odp *odp_data, *next; + + /* Prevent concurrent mmu notifiers from working on the + * no_private_counters list. */ + down_write(&context->umem_rwsem); + + /* Read the notifier_count again, with the umem_rwsem + * semaphore taken for write. */ + if (!atomic_read(&context->notifier_count)) { + list_for_each_entry_safe(odp_data, next, + &context->no_private_counters, + no_private_counters) { + mutex_lock(&odp_data->umem_mutex); + odp_data->mn_counters_active = true; + list_del(&odp_data->no_private_counters); + complete_all(&odp_data->notifier_completion); + mutex_unlock(&odp_data->umem_mutex); + } + } + + up_write(&context->umem_rwsem); + } +} + +static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) { + /* + * Increase the number of notifiers running, to + * prevent any further fault handling on this MR. + */ + ib_umem_notifier_start_account(item); + item->odp_data->dying = 1; + /* Make sure that the fact the umem is dying is out before we release + * all pending page faults. 
*/ + smp_wmb(); + complete_all(&item->odp_data->notifier_completion); + item->context->invalidate_range(item, ib_umem_start(item), + ib_umem_end(item)); + return 0; +} + +static void ib_umem_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, + ULLONG_MAX, + ib_umem_notifier_release_trampoline, + NULL); + up_read(&context->umem_rwsem); +} + +static int invalidate_page_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_start_account(item); + item->context->invalidate_range(item, start, start + PAGE_SIZE); + ib_umem_notifier_end_account(item); + return 0; +} + +static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_start_account(item); + item->context->invalidate_range(item, start, end); + return 0; +} + +static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + end, + invalidate_range_start_trampoline, NULL); + up_read(&context->umem_rwsem); +} + +static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_end_account(item); + return 0; +} + +static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + end, + invalidate_range_end_trampoline, NULL); + up_read(&context->umem_rwsem); + ib_ucontext_notifier_end_account(context); +} + +static const struct mmu_notifier_ops ib_umem_notifiers = { + .release = ib_umem_notifier_release, + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, +}; + +struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, + unsigned long addr, + size_t size) +{ + struct ib_umem *umem; + struct ib_umem_odp *odp_data; + int pages = size >> PAGE_SHIFT; + int ret; + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + umem->context = context; + umem->length = size; + umem->address = addr; + umem->page_shift = PAGE_SHIFT; + umem->writable = 1; + + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); + if (!odp_data) { + ret = -ENOMEM; + goto out_umem; + } + odp_data->umem = umem; + + mutex_init(&odp_data->umem_mutex); + init_completion(&odp_data->notifier_completion); + + odp_data->page_list = vzalloc(pages * sizeof(*odp_data->page_list)); + if (!odp_data->page_list) { + ret = -ENOMEM; + goto out_odp_data; + } + + odp_data->dma_list = vzalloc(pages * sizeof(*odp_data->dma_list)); + if (!odp_data->dma_list) { + ret = -ENOMEM; + goto out_page_list; + } + + down_write(&context->umem_rwsem); + context->odp_mrs_count++; + rbt_ib_umem_insert(&odp_data->interval_tree, 
&context->umem_tree); + if (likely(!atomic_read(&context->notifier_count))) + odp_data->mn_counters_active = true; + else + list_add(&odp_data->no_private_counters, + &context->no_private_counters); + up_write(&context->umem_rwsem); + + umem->odp_data = odp_data; + + return umem; + +out_page_list: + vfree(odp_data->page_list); +out_odp_data: + kfree(odp_data); +out_umem: + kfree(umem); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_alloc_odp_umem); + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, + int access) +{ + int ret_val; + struct pid *our_pid; + struct mm_struct *mm = get_task_mm(current); + + if (!mm) + return -EINVAL; + + if (access & IB_ACCESS_HUGETLB) { + struct vm_area_struct *vma; + struct hstate *h; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ib_umem_start(umem)); + if (!vma || !is_vm_hugetlb_page(vma)) { + up_read(&mm->mmap_sem); + return -EINVAL; + } + h = hstate_vma(vma); + umem->page_shift = huge_page_shift(h); + up_read(&mm->mmap_sem); + umem->hugetlb = 1; + } else { + umem->hugetlb = 0; + } + + /* Prevent creating ODP MRs in child processes */ + rcu_read_lock(); + our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); + put_pid(our_pid); + if (context->tgid != our_pid) { + ret_val = -EINVAL; + goto out_mm; + } + + umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); + if (!umem->odp_data) { + ret_val = -ENOMEM; + goto out_mm; + } + umem->odp_data->umem = umem; + + mutex_init(&umem->odp_data->umem_mutex); + + init_completion(&umem->odp_data->notifier_completion); + + if (ib_umem_num_pages(umem)) { + umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * + sizeof(*umem->odp_data->page_list)); + if (!umem->odp_data->page_list) { + ret_val = -ENOMEM; + goto out_odp_data; + } + + umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) * + sizeof(*umem->odp_data->dma_list)); + if (!umem->odp_data->dma_list) { + ret_val = -ENOMEM; + goto out_page_list; + } + } + + /* + * When using MMU notifiers, we will get a + * notification before the "current" task (and MM) is + * destroyed. We use the umem_rwsem semaphore to synchronize. + */ + down_write(&context->umem_rwsem); + context->odp_mrs_count++; + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_insert(&umem->odp_data->interval_tree, + &context->umem_tree); + if (likely(!atomic_read(&context->notifier_count)) || + context->odp_mrs_count == 1) + umem->odp_data->mn_counters_active = true; + else + list_add(&umem->odp_data->no_private_counters, + &context->no_private_counters); + downgrade_write(&context->umem_rwsem); + + if (context->odp_mrs_count == 1) { + /* + * Note that at this point, no MMU notifier is running + * for this context! + */ + atomic_set(&context->notifier_count, 0); + INIT_HLIST_NODE(&context->mn.hlist); + context->mn.ops = &ib_umem_notifiers; + /* + * Lock-dep detects a false positive for mmap_sem vs. + * umem_rwsem, due to not grasping downgrade_write correctly. + */ + lockdep_off(); + ret_val = mmu_notifier_register(&context->mn, mm); + lockdep_on(); + if (ret_val) { + pr_err("Failed to register mmu_notifier %d\n", ret_val); + ret_val = -EBUSY; + goto out_mutex; + } + } + + up_read(&context->umem_rwsem); + + /* + * Note that doing an mmput can cause a notifier for the relevant mm. + * If the notifier is called while we hold the umem_rwsem, this will + * cause a deadlock. Therefore, we release the reference only after we + * released the semaphore. 
+ */ + mmput(mm); + return 0; + +out_mutex: + up_read(&context->umem_rwsem); + vfree(umem->odp_data->dma_list); +out_page_list: + vfree(umem->odp_data->page_list); +out_odp_data: + kfree(umem->odp_data); +out_mm: + mmput(mm); + return ret_val; +} + +void ib_umem_odp_release(struct ib_umem *umem) +{ + struct ib_ucontext *context = umem->context; + + /* + * Ensure that no more pages are mapped in the umem. + * + * It is the driver's responsibility to ensure, before calling us, + * that the hardware will not attempt to access the MR any more. + */ + ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), + ib_umem_end(umem)); + + down_write(&context->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_remove(&umem->odp_data->interval_tree, + &context->umem_tree); + context->odp_mrs_count--; + if (!umem->odp_data->mn_counters_active) { + list_del(&umem->odp_data->no_private_counters); + complete_all(&umem->odp_data->notifier_completion); + } + + /* + * Downgrade the lock to a read lock. This ensures that the notifiers + * (who lock the mutex for reading) will be able to finish, and we + * will be able to enventually obtain the mmu notifiers SRCU. Note + * that since we are doing it atomically, no other user could register + * and unregister while we do the check. + */ + downgrade_write(&context->umem_rwsem); + if (!context->odp_mrs_count) { + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(context->tgid, + PIDTYPE_PID); + if (owning_process == NULL) + /* + * The process is already dead, notifier were removed + * already. + */ + goto out; + + owning_mm = get_task_mm(owning_process); + if (owning_mm == NULL) + /* + * The process' mm is already dead, notifier were + * removed already. + */ + goto out_put_task; + mmu_notifier_unregister(&context->mn, owning_mm); + + mmput(owning_mm); + +out_put_task: + put_task_struct(owning_process); + } +out: + up_read(&context->umem_rwsem); + + vfree(umem->odp_data->dma_list); + vfree(umem->odp_data->page_list); + kfree(umem->odp_data); + kfree(umem); +} + +/* + * Map for DMA and insert a single page into the on-demand paging page tables. + * + * @umem: the umem to insert the page to. + * @page_index: index in the umem to add the page to. + * @page: the page struct to map and add. + * @access_mask: access permissions needed for this page. + * @current_seq: sequence number for synchronization with invalidations. + * the sequence number is taken from + * umem->odp_data->notifiers_seq. + * + * The function returns -EFAULT if the DMA mapping operation fails. It returns + * -EAGAIN if a concurrent invalidation prevents us from updating the page. + * + * The page is released via put_page even if the operation failed. For + * on-demand pinning, the page is released whenever it isn't stored in the + * umem. + */ +static int ib_umem_odp_map_dma_single_page( + struct ib_umem *umem, + int page_index, + struct page *page, + u64 access_mask, + unsigned long current_seq) +{ + struct ib_device *dev = umem->context->device; + dma_addr_t dma_addr; + int stored_page = 0; + int remove_existing_mapping = 0; + int ret = 0; + + /* + * Note: we avoid writing if seq is different from the initial seq, to + * handle case of a racing notifier. This check also allows us to bail + * early if we have a notifier running in parallel with us. 
+ */ + if (ib_umem_mmu_notifier_retry(umem, current_seq)) { + ret = -EAGAIN; + goto out; + } + if (!(umem->odp_data->dma_list[page_index])) { + dma_addr = ib_dma_map_page(dev, + page, + 0, BIT(umem->page_shift), + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(dev, dma_addr)) { + ret = -EFAULT; + goto out; + } + umem->odp_data->dma_list[page_index] = dma_addr | access_mask; + umem->odp_data->page_list[page_index] = page; + umem->npages++; + stored_page = 1; + } else if (umem->odp_data->page_list[page_index] == page) { + umem->odp_data->dma_list[page_index] |= access_mask; + } else { + pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", + umem->odp_data->page_list[page_index], page); + /* Better remove the mapping now, to prevent any further + * damage. */ + remove_existing_mapping = 1; + } + +out: + /* On Demand Paging - avoid pinning the page */ + if (umem->context->invalidate_range || !stored_page) + put_page(page); + + if (remove_existing_mapping && umem->context->invalidate_range) { + invalidate_page_trampoline( + umem, + ib_umem_start(umem) + (page_index >> umem->page_shift), + ib_umem_start(umem) + ((page_index + 1) >> + umem->page_shift), + NULL); + ret = -EAGAIN; + } + + return ret; +} + +/** + * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. + * + * Pins the range of pages passed in the argument, and maps them to + * DMA addresses. The DMA addresses of the mapped pages is updated in + * umem->odp_data->dma_list. + * + * Returns the number of pages mapped in success, negative error code + * for failure. + * An -EAGAIN error code is returned when a concurrent mmu notifier prevents + * the function from completing its task. + * An -ENOENT error code indicates that userspace process is being terminated + * and mm was already destroyed. + * @umem: the umem to map and pin + * @user_virt: the address from which we need to map. + * @bcnt: the minimal number of bytes to pin and map. The mapping might be + * bigger due to alignment, and may also be smaller in case of an error + * pinning or mapping a page. The actual pages mapped is returned in + * the return value. + * @access_mask: bit mask of the requested access permissions for the given + * range. + * @current_seq: the MMU notifiers sequance value for synchronization with + * invalidations. the sequance number is read from + * umem->odp_data->notifiers_seq before calling this function + */ +int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, + u64 access_mask, unsigned long current_seq) +{ + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + struct page **local_page_list = NULL; + u64 page_mask, off; + int j, k, ret = 0, start_idx, npages = 0, page_shift; + unsigned int flags = 0; + phys_addr_t p = 0; + + if (access_mask == 0) + return -EINVAL; + + if (user_virt < ib_umem_start(umem) || + user_virt + bcnt > ib_umem_end(umem)) + return -EFAULT; + + local_page_list = (struct page **)__get_free_page(GFP_KERNEL); + if (!local_page_list) + return -ENOMEM; + + page_shift = umem->page_shift; + page_mask = ~(BIT(page_shift) - 1); + off = user_virt & (~page_mask); + user_virt = user_virt & page_mask; + bcnt += off; /* Charge for the first page offset as well. 
*/ + + owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); + if (owning_process == NULL) { + ret = -EINVAL; + goto out_no_task; + } + + owning_mm = get_task_mm(owning_process); + if (owning_mm == NULL) { + ret = -ENOENT; + goto out_put_task; + } + + if (access_mask & ODP_WRITE_ALLOWED_BIT) + flags |= FOLL_WRITE; + + start_idx = (user_virt - ib_umem_start(umem)) >> page_shift; + k = start_idx; + + while (bcnt > 0) { + const size_t gup_num_pages = min_t(size_t, + (bcnt + BIT(page_shift) - 1) >> page_shift, + PAGE_SIZE / sizeof(struct page *)); + + down_read(&owning_mm->mmap_sem); + /* + * Note: this might result in redundent page getting. We can + * avoid this by checking dma_list to be 0 before calling + * get_user_pages. However, this make the code much more + * complex (and doesn't gain us much performance in most use + * cases). + */ + npages = get_user_pages_remote(owning_process, owning_mm, + user_virt, gup_num_pages, + flags, local_page_list, NULL, NULL); + up_read(&owning_mm->mmap_sem); + + if (npages < 0) + break; + + bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); + mutex_lock(&umem->odp_data->umem_mutex); + for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { + if (user_virt & ~page_mask) { + p += PAGE_SIZE; + if (page_to_phys(local_page_list[j]) != p) { + ret = -EFAULT; + break; + } + put_page(local_page_list[j]); + continue; + } + + ret = ib_umem_odp_map_dma_single_page( + umem, k, local_page_list[j], + access_mask, current_seq); + if (ret < 0) + break; + + p = page_to_phys(local_page_list[j]); + k++; + } + mutex_unlock(&umem->odp_data->umem_mutex); + + if (ret < 0) { + /* Release left over pages when handling errors. */ + for (++j; j < npages; ++j) + put_page(local_page_list[j]); + break; + } + } + + if (ret >= 0) { + if (npages < 0 && k == start_idx) + ret = npages; + else + ret = k - start_idx; + } + + mmput(owning_mm); +out_put_task: + put_task_struct(owning_process); +out_no_task: + free_page((unsigned long)local_page_list); + return ret; +} +EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, + u64 bound) +{ + int idx; + u64 addr; + struct ib_device *dev = umem->context->device; + + virt = max_t(u64, virt, ib_umem_start(umem)); + bound = min_t(u64, bound, ib_umem_end(umem)); + /* Note that during the run of this function, the + * notifiers_count of the MR is > 0, preventing any racing + * faults from completion. We might be racing with other + * invalidations, so we must make sure we free each page only + * once. */ + mutex_lock(&umem->odp_data->umem_mutex); + for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) { + idx = (addr - ib_umem_start(umem)) >> umem->page_shift; + if (umem->odp_data->page_list[idx]) { + struct page *page = umem->odp_data->page_list[idx]; + dma_addr_t dma = umem->odp_data->dma_list[idx]; + dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; + + WARN_ON(!dma_addr); + + ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma & ODP_WRITE_ALLOWED_BIT) { + struct page *head_page = compound_head(page); + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. 
+ */ + set_page_dirty(head_page); + } + /* on demand pinning support */ + if (!umem->context->invalidate_range) + put_page(page); + umem->odp_data->page_list[idx] = NULL; + umem->odp_data->dma_list[idx] = 0; + umem->npages--; + } + } + mutex_unlock(&umem->odp_data->umem_mutex); +} +EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); + +/* @last is not a part of the interval. See comment for function + * node_last. + */ +int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, + u64 start, u64 last, + umem_call_back cb, + void *cookie) +{ + int ret_val = 0; + struct umem_odp_node *node, *next; + struct ib_umem_odp *umem; + + if (unlikely(start == last)) + return ret_val; + + for (node = rbt_ib_umem_iter_first(root, start, last - 1); + node; node = next) { + next = rbt_ib_umem_iter_next(node, start, last - 1); + umem = container_of(node, struct ib_umem_odp, interval_tree); + ret_val = cb(umem->umem, start, last, cookie) || ret_val; + } + + return ret_val; +} +EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); + +struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, + u64 addr, u64 length) +{ + struct umem_odp_node *node; + + node = rbt_ib_umem_iter_first(root, addr, addr + length - 1); + if (node) + return container_of(node, struct ib_umem_odp, interval_tree); + return NULL; + +} +EXPORT_SYMBOL(rbt_ib_umem_lookup); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c new file mode 100644 index 0000000..bb98c9e --- /dev/null +++ b/drivers/infiniband/core/user_mad.c @@ -0,0 +1,1405 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2008 Cisco. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "user_mad: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "core_priv.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); +MODULE_LICENSE("Dual BSD/GPL"); + +enum { + IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS, + IB_UMAD_MAX_AGENTS = 32, + + IB_UMAD_MAJOR = 231, + IB_UMAD_MINOR_BASE = 0, + IB_UMAD_NUM_FIXED_MINOR = 64, + IB_UMAD_NUM_DYNAMIC_MINOR = IB_UMAD_MAX_PORTS - IB_UMAD_NUM_FIXED_MINOR, + IB_ISSM_MINOR_BASE = IB_UMAD_NUM_FIXED_MINOR, +}; + +/* + * Our lifetime rules for these structs are the following: + * device special file is opened, we take a reference on the + * ib_umad_port's struct ib_umad_device. We drop these + * references in the corresponding close(). + * + * In addition to references coming from open character devices, there + * is one more reference to each ib_umad_device representing the + * module's reference taken when allocating the ib_umad_device in + * ib_umad_add_one(). + * + * When destroying an ib_umad_device, we drop the module's reference. + */ + +struct ib_umad_port { + struct cdev cdev; + struct device *dev; + + struct cdev sm_cdev; + struct device *sm_dev; + struct semaphore sm_sem; + + struct mutex file_mutex; + struct list_head file_list; + + struct ib_device *ib_dev; + struct ib_umad_device *umad_dev; + int dev_num; + u8 port_num; +}; + +struct ib_umad_device { + struct kobject kobj; + struct ib_umad_port port[0]; +}; + +struct ib_umad_file { + struct mutex mutex; + struct ib_umad_port *port; + struct list_head recv_list; + struct list_head send_list; + struct list_head port_list; + spinlock_t send_lock; + wait_queue_head_t recv_wait; + struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS]; + int agents_dead; + u8 use_pkey_index; + u8 already_used; +}; + +struct ib_umad_packet { + struct ib_mad_send_buf *msg; + struct ib_mad_recv_wc *recv_wc; + struct list_head list; + int length; + struct ib_user_mad mad; +}; + +static struct class *umad_class; + +static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); +static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + + IB_UMAD_NUM_FIXED_MINOR; +static dev_t dynamic_umad_dev; +static dev_t dynamic_issm_dev; + +static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); + +static void ib_umad_add_one(struct ib_device *device); +static void ib_umad_remove_one(struct ib_device *device, void *client_data); + +static void ib_umad_release_dev(struct kobject *kobj) +{ + struct ib_umad_device *dev = + container_of(kobj, struct ib_umad_device, kobj); + + kfree(dev); +} + +static struct kobj_type ib_umad_dev_ktype = { + .release = ib_umad_release_dev, +}; + +static int hdr_size(struct ib_umad_file *file) +{ + return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) : + sizeof (struct ib_user_mad_hdr_old); +} + +/* caller must hold file->mutex */ +static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id) +{ + return file->agents_dead ? 
NULL : file->agent[id]; +} + +static int queue_packet(struct ib_umad_file *file, + struct ib_mad_agent *agent, + struct ib_umad_packet *packet) +{ + int ret = 1; + + mutex_lock(&file->mutex); + + for (packet->mad.hdr.id = 0; + packet->mad.hdr.id < IB_UMAD_MAX_AGENTS; + packet->mad.hdr.id++) + if (agent == __get_agent(file, packet->mad.hdr.id)) { + list_add_tail(&packet->list, &file->recv_list); + wake_up_interruptible(&file->recv_wait); + ret = 0; + break; + } + + mutex_unlock(&file->mutex); + + return ret; +} + +static void dequeue_send(struct ib_umad_file *file, + struct ib_umad_packet *packet) +{ + spin_lock_irq(&file->send_lock); + list_del(&packet->list); + spin_unlock_irq(&file->send_lock); +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *send_wc) +{ + struct ib_umad_file *file = agent->context; + struct ib_umad_packet *packet = send_wc->send_buf->context[0]; + + dequeue_send(file, packet); + rdma_destroy_ah(packet->msg->ah); + ib_free_send_mad(packet->msg); + + if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) { + packet->length = IB_MGMT_MAD_HDR; + packet->mad.hdr.status = ETIMEDOUT; + if (!queue_packet(file, agent, packet)) + return; + } + kfree(packet); +} + +static void recv_handler(struct ib_mad_agent *agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_umad_file *file = agent->context; + struct ib_umad_packet *packet; + + if (mad_recv_wc->wc->status != IB_WC_SUCCESS) + goto err1; + + packet = kzalloc(sizeof *packet, GFP_KERNEL); + if (!packet) + goto err1; + + packet->length = mad_recv_wc->mad_len; + packet->recv_wc = mad_recv_wc; + + packet->mad.hdr.status = 0; + packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; + packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); + /* + * On OPA devices it is okay to lose the upper 16 bits of LID as this + * information is obtained elsewhere. Mask off the upper 16 bits. + */ + if (rdma_cap_opa_mad(agent->device, agent->port_num)) + packet->mad.hdr.lid = ib_lid_be16(0xFFFF & + mad_recv_wc->wc->slid); + else + packet->mad.hdr.lid = ib_lid_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.sl = mad_recv_wc->wc->sl; + packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; + packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; + packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH); + if (packet->mad.hdr.grh_present) { + struct rdma_ah_attr ah_attr; + const struct ib_global_route *grh; + int ret; + + ret = ib_init_ah_attr_from_wc(agent->device, agent->port_num, + mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, + &ah_attr); + if (ret) + goto err2; + + grh = rdma_ah_read_grh(&ah_attr); + packet->mad.hdr.gid_index = grh->sgid_index; + packet->mad.hdr.hop_limit = grh->hop_limit; + packet->mad.hdr.traffic_class = grh->traffic_class; + memcpy(packet->mad.hdr.gid, &grh->dgid, 16); + packet->mad.hdr.flow_label = cpu_to_be32(grh->flow_label); + } + + if (queue_packet(file, agent, packet)) + goto err2; + return; + +err2: + kfree(packet); +err1: + ib_free_recv_mad(mad_recv_wc); +} + +static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf, + struct ib_umad_packet *packet, size_t count) +{ + struct ib_mad_recv_buf *recv_buf; + int left, seg_payload, offset, max_seg_payload; + size_t seg_size; + + recv_buf = &packet->recv_wc->recv_buf; + seg_size = packet->recv_wc->mad_seg_size; + + /* We need enough room to copy the first (or only) MAD segment. 
*/ + if ((packet->length <= seg_size && + count < hdr_size(file) + packet->length) || + (packet->length > seg_size && + count < hdr_size(file) + seg_size)) + return -EINVAL; + + if (copy_to_user(buf, &packet->mad, hdr_size(file))) + return -EFAULT; + + buf += hdr_size(file); + seg_payload = min_t(int, packet->length, seg_size); + if (copy_to_user(buf, recv_buf->mad, seg_payload)) + return -EFAULT; + + if (seg_payload < packet->length) { + /* + * Multipacket RMPP MAD message. Copy remainder of message. + * Note that last segment may have a shorter payload. + */ + if (count < hdr_size(file) + packet->length) { + /* + * The buffer is too small, return the first RMPP segment, + * which includes the RMPP message length. + */ + return -ENOSPC; + } + offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class); + max_seg_payload = seg_size - offset; + + for (left = packet->length - seg_payload, buf += seg_payload; + left; left -= seg_payload, buf += seg_payload) { + recv_buf = container_of(recv_buf->list.next, + struct ib_mad_recv_buf, list); + seg_payload = min(left, max_seg_payload); + if (copy_to_user(buf, ((void *) recv_buf->mad) + offset, + seg_payload)) + return -EFAULT; + } + } + return hdr_size(file) + packet->length; +} + +static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf, + struct ib_umad_packet *packet, size_t count) +{ + ssize_t size = hdr_size(file) + packet->length; + + if (count < size) + return -EINVAL; + + if (copy_to_user(buf, &packet->mad, hdr_size(file))) + return -EFAULT; + + buf += hdr_size(file); + + if (copy_to_user(buf, packet->mad.data, packet->length)) + return -EFAULT; + + return size; +} + +static ssize_t ib_umad_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_umad_file *file = filp->private_data; + struct ib_umad_packet *packet; + ssize_t ret; + + if (count < hdr_size(file)) + return -EINVAL; + + mutex_lock(&file->mutex); + + while (list_empty(&file->recv_list)) { + mutex_unlock(&file->mutex); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(file->recv_wait, + !list_empty(&file->recv_list))) + return -ERESTARTSYS; + + mutex_lock(&file->mutex); + } + + packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); + list_del(&packet->list); + + mutex_unlock(&file->mutex); + + if (packet->recv_wc) + ret = copy_recv_mad(file, buf, packet, count); + else + ret = copy_send_mad(file, buf, packet, count); + + if (ret < 0) { + /* Requeue packet */ + mutex_lock(&file->mutex); + list_add(&packet->list, &file->recv_list); + mutex_unlock(&file->mutex); + } else { + if (packet->recv_wc) + ib_free_recv_mad(packet->recv_wc); + kfree(packet); + } + return ret; +} + +static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf) +{ + int left, seg; + + /* Copy class specific header */ + if ((msg->hdr_len > IB_MGMT_RMPP_HDR) && + copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR, + msg->hdr_len - IB_MGMT_RMPP_HDR)) + return -EFAULT; + + /* All headers are in place. Copy data segments. 
*/ + for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0; + seg++, left -= msg->seg_size, buf += msg->seg_size) { + if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf, + min(left, msg->seg_size))) + return -EFAULT; + } + return 0; +} + +static int same_destination(struct ib_user_mad_hdr *hdr1, + struct ib_user_mad_hdr *hdr2) +{ + if (!hdr1->grh_present && !hdr2->grh_present) + return (hdr1->lid == hdr2->lid); + + if (hdr1->grh_present && hdr2->grh_present) + return !memcmp(hdr1->gid, hdr2->gid, 16); + + return 0; +} + +static int is_duplicate(struct ib_umad_file *file, + struct ib_umad_packet *packet) +{ + struct ib_umad_packet *sent_packet; + struct ib_mad_hdr *sent_hdr, *hdr; + + hdr = (struct ib_mad_hdr *) packet->mad.data; + list_for_each_entry(sent_packet, &file->send_list, list) { + sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data; + + if ((hdr->tid != sent_hdr->tid) || + (hdr->mgmt_class != sent_hdr->mgmt_class)) + continue; + + /* + * No need to be overly clever here. If two new operations have + * the same TID, reject the second as a duplicate. This is more + * restrictive than required by the spec. + */ + if (!ib_response_mad(hdr)) { + if (!ib_response_mad(sent_hdr)) + return 1; + continue; + } else if (!ib_response_mad(sent_hdr)) + continue; + + if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr)) + return 1; + } + + return 0; +} + +static ssize_t ib_umad_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_umad_file *file = filp->private_data; + struct ib_umad_packet *packet; + struct ib_mad_agent *agent; + struct rdma_ah_attr ah_attr; + struct ib_ah *ah; + struct ib_rmpp_mad *rmpp_mad; + __be64 *tid; + int ret, data_len, hdr_len, copy_offset, rmpp_active; + u8 base_version; + + if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) + return -EINVAL; + + packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL); + if (!packet) + return -ENOMEM; + + if (copy_from_user(&packet->mad, buf, hdr_size(file))) { + ret = -EFAULT; + goto err; + } + + if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { + ret = -EINVAL; + goto err; + } + + buf += hdr_size(file); + + if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) { + ret = -EFAULT; + goto err; + } + + mutex_lock(&file->mutex); + + agent = __get_agent(file, packet->mad.hdr.id); + if (!agent) { + ret = -EINVAL; + goto err_up; + } + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.type = rdma_ah_find_type(agent->device, + file->port->port_num); + rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid)); + rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl); + rdma_ah_set_path_bits(&ah_attr, packet->mad.hdr.path_bits); + rdma_ah_set_port_num(&ah_attr, file->port->port_num); + if (packet->mad.hdr.grh_present) { + rdma_ah_set_grh(&ah_attr, NULL, + be32_to_cpu(packet->mad.hdr.flow_label), + packet->mad.hdr.gid_index, + packet->mad.hdr.hop_limit, + packet->mad.hdr.traffic_class); + rdma_ah_set_dgid_raw(&ah_attr, packet->mad.hdr.gid); + } + + ah = rdma_create_user_ah(agent->qp->pd, &ah_attr, NULL); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto err_up; + } + + rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data; + hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); + + if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && ib_mad_kernel_rmpp_agent(agent)) { + copy_offset = IB_MGMT_RMPP_HDR; + rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE; + } else { + copy_offset = IB_MGMT_MAD_HDR; + rmpp_active = 0; + } + + base_version 
= ((struct ib_mad_hdr *)&packet->mad.data)->base_version; + data_len = count - hdr_size(file) - hdr_len; + packet->msg = ib_create_send_mad(agent, + be32_to_cpu(packet->mad.hdr.qpn), + packet->mad.hdr.pkey_index, rmpp_active, + hdr_len, data_len, GFP_KERNEL, + base_version); + if (IS_ERR(packet->msg)) { + ret = PTR_ERR(packet->msg); + goto err_ah; + } + + packet->msg->ah = ah; + packet->msg->timeout_ms = packet->mad.hdr.timeout_ms; + packet->msg->retries = packet->mad.hdr.retries; + packet->msg->context[0] = packet; + + /* Copy MAD header. Any RMPP header is already in place. */ + memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR); + + if (!rmpp_active) { + if (copy_from_user(packet->msg->mad + copy_offset, + buf + copy_offset, + hdr_len + data_len - copy_offset)) { + ret = -EFAULT; + goto err_msg; + } + } else { + ret = copy_rmpp_mad(packet->msg, buf); + if (ret) + goto err_msg; + } + + /* + * Set the high-order part of the transaction ID to make MADs from + * different agents unique, and allow routing responses back to the + * original requestor. + */ + if (!ib_response_mad(packet->msg->mad)) { + tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid; + *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | + (be64_to_cpup(tid) & 0xffffffff)); + rmpp_mad->mad_hdr.tid = *tid; + } + + if (!ib_mad_kernel_rmpp_agent(agent) + && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { + spin_lock_irq(&file->send_lock); + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + } else { + spin_lock_irq(&file->send_lock); + ret = is_duplicate(file, packet); + if (!ret) + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + if (ret) { + ret = -EINVAL; + goto err_msg; + } + } + + ret = ib_post_send_mad(packet->msg, NULL); + if (ret) + goto err_send; + + mutex_unlock(&file->mutex); + return count; + +err_send: + dequeue_send(file, packet); +err_msg: + ib_free_send_mad(packet->msg); +err_ah: + rdma_destroy_ah(ah); +err_up: + mutex_unlock(&file->mutex); +err: + kfree(packet); + return ret; +} + +static __poll_t ib_umad_poll(struct file *filp, struct poll_table_struct *wait) +{ + struct ib_umad_file *file = filp->private_data; + + /* we will always be able to post a MAD send */ + __poll_t mask = EPOLLOUT | EPOLLWRNORM; + + poll_wait(filp, &file->recv_wait, wait); + + if (!list_empty(&file->recv_list)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, + int compat_method_mask) +{ + struct ib_user_mad_reg_req ureq; + struct ib_mad_reg_req req; + struct ib_mad_agent *agent = NULL; + int agent_id; + int ret; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid device\n"); + ret = -EPIPE; + goto out; + } + + if (copy_from_user(&ureq, arg, sizeof ureq)) { + ret = -EFAULT; + goto out; + } + + if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid QPN %d specified\n", + ureq.qpn); + ret = -EINVAL; + goto out; + } + + for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) + if (!__get_agent(file, agent_id)) + goto found; + + dev_notice(file->port->dev, + "ib_umad_reg_agent: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; + goto out; + +found: + if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); + req.mgmt_class = 
ureq.mgmt_class; + req.mgmt_class_version = ureq.mgmt_class_version; + memcpy(req.oui, ureq.oui, sizeof req.oui); + + if (compat_method_mask) { + u32 *umm = (u32 *) ureq.method_mask; + int i; + + for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i) + req.method_mask[i] = + umm[i * 2] | ((u64) umm[i * 2 + 1] << 32); + } else + memcpy(req.method_mask, ureq.method_mask, + sizeof req.method_mask); + } + + agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, + ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, + ureq.mgmt_class ? &req : NULL, + ureq.rmpp_version, + send_handler, recv_handler, file, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + agent = NULL; + goto out; + } + + if (put_user(agent_id, + (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) { + ret = -EFAULT; + goto out; + } + + if (!file->already_used) { + file->already_used = 1; + if (!file->use_pkey_index) { + dev_warn(file->port->dev, + "process %s did not enable P_Key index support.\n", + current->comm); + dev_warn(file->port->dev, + " Documentation/infiniband/user_mad.txt has info on the new ABI.\n"); + } + } + + file->agent[agent_id] = agent; + ret = 0; + +out: + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + +static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) +{ + struct ib_user_mad_reg_req2 ureq; + struct ib_mad_reg_req req; + struct ib_mad_agent *agent = NULL; + int agent_id; + int ret; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid device\n"); + ret = -EPIPE; + goto out; + } + + if (copy_from_user(&ureq, arg, sizeof(ureq))) { + ret = -EFAULT; + goto out; + } + + if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid QPN %d specified\n", + ureq.qpn); + ret = -EINVAL; + goto out; + } + + if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n", + ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); + ret = -EINVAL; + + if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP, + (u32 __user *) (arg + offsetof(struct + ib_user_mad_reg_req2, flags)))) + ret = -EFAULT; + + goto out; + } + + for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) + if (!__get_agent(file, agent_id)) + goto found; + + dev_notice(file->port->dev, + "ib_umad_reg_agent2: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; + goto out; + +found: + if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); + req.mgmt_class = ureq.mgmt_class; + req.mgmt_class_version = ureq.mgmt_class_version; + if (ureq.oui & 0xff000000) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n", + ureq.oui); + ret = -EINVAL; + goto out; + } + req.oui[2] = ureq.oui & 0x0000ff; + req.oui[1] = (ureq.oui & 0x00ff00) >> 8; + req.oui[0] = (ureq.oui & 0xff0000) >> 16; + memcpy(req.method_mask, ureq.method_mask, + sizeof(req.method_mask)); + } + + agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, + ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, + ureq.mgmt_class ? 
&req : NULL, + ureq.rmpp_version, + send_handler, recv_handler, file, + ureq.flags); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + agent = NULL; + goto out; + } + + if (put_user(agent_id, + (u32 __user *)(arg + + offsetof(struct ib_user_mad_reg_req2, id)))) { + ret = -EFAULT; + goto out; + } + + if (!file->already_used) { + file->already_used = 1; + file->use_pkey_index = 1; + } + + file->agent[agent_id] = agent; + ret = 0; + +out: + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + + +static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) +{ + struct ib_mad_agent *agent = NULL; + u32 id; + int ret = 0; + + if (get_user(id, arg)) + return -EFAULT; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { + ret = -EINVAL; + goto out; + } + + agent = file->agent[id]; + file->agent[id] = NULL; + +out: + mutex_unlock(&file->mutex); + + if (agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + +static long ib_umad_enable_pkey(struct ib_umad_file *file) +{ + int ret = 0; + + mutex_lock(&file->mutex); + if (file->already_used) + ret = -EINVAL; + else + file->use_pkey_index = 1; + mutex_unlock(&file->mutex); + + return ret; +} + +static long ib_umad_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case IB_USER_MAD_REGISTER_AGENT: + return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0); + case IB_USER_MAD_UNREGISTER_AGENT: + return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg); + case IB_USER_MAD_ENABLE_PKEY: + return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, (void __user *) arg); + default: + return -ENOIOCTLCMD; + } +} + +#ifdef CONFIG_COMPAT +static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case IB_USER_MAD_REGISTER_AGENT: + return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1); + case IB_USER_MAD_UNREGISTER_AGENT: + return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg)); + case IB_USER_MAD_ENABLE_PKEY: + return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg)); + default: + return -ENOIOCTLCMD; + } +} +#endif + +/* + * ib_umad_open() does not need the BKL: + * + * - the ib_umad_port structures are properly reference counted, and + * everything else is purely local to the file being created, so + * races against other open calls are not a problem; + * - the ioctl method does not affect any global state outside of the + * file structure being operated on; + */ +static int ib_umad_open(struct inode *inode, struct file *filp) +{ + struct ib_umad_port *port; + struct ib_umad_file *file; + int ret = -ENXIO; + + port = container_of(inode->i_cdev, struct ib_umad_port, cdev); + + mutex_lock(&port->file_mutex); + + if (!port->ib_dev) + goto out; + + ret = -ENOMEM; + file = kzalloc(sizeof *file, GFP_KERNEL); + if (!file) + goto out; + + mutex_init(&file->mutex); + spin_lock_init(&file->send_lock); + INIT_LIST_HEAD(&file->recv_list); + INIT_LIST_HEAD(&file->send_list); + init_waitqueue_head(&file->recv_wait); + + file->port = port; + filp->private_data = file; + + list_add_tail(&file->port_list, &port->file_list); + + ret = 
nonseekable_open(inode, filp); + if (ret) { + list_del(&file->port_list); + kfree(file); + goto out; + } + + kobject_get(&port->umad_dev->kobj); + +out: + mutex_unlock(&port->file_mutex); + return ret; +} + +static int ib_umad_close(struct inode *inode, struct file *filp) +{ + struct ib_umad_file *file = filp->private_data; + struct ib_umad_device *dev = file->port->umad_dev; + struct ib_umad_packet *packet, *tmp; + int already_dead; + int i; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + already_dead = file->agents_dead; + file->agents_dead = 1; + + list_for_each_entry_safe(packet, tmp, &file->recv_list, list) { + if (packet->recv_wc) + ib_free_recv_mad(packet->recv_wc); + kfree(packet); + } + + list_del(&file->port_list); + + mutex_unlock(&file->mutex); + + if (!already_dead) + for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i) + if (file->agent[i]) + ib_unregister_mad_agent(file->agent[i]); + + mutex_unlock(&file->port->file_mutex); + + kfree(file); + kobject_put(&dev->kobj); + + return 0; +} + +static const struct file_operations umad_fops = { + .owner = THIS_MODULE, + .read = ib_umad_read, + .write = ib_umad_write, + .poll = ib_umad_poll, + .unlocked_ioctl = ib_umad_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ib_umad_compat_ioctl, +#endif + .open = ib_umad_open, + .release = ib_umad_close, + .llseek = no_llseek, +}; + +static int ib_umad_sm_open(struct inode *inode, struct file *filp) +{ + struct ib_umad_port *port; + struct ib_port_modify props = { + .set_port_cap_mask = IB_PORT_SM + }; + int ret; + + port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev); + + if (filp->f_flags & O_NONBLOCK) { + if (down_trylock(&port->sm_sem)) { + ret = -EAGAIN; + goto fail; + } + } else { + if (down_interruptible(&port->sm_sem)) { + ret = -ERESTARTSYS; + goto fail; + } + } + + ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); + if (ret) + goto err_up_sem; + + filp->private_data = port; + + ret = nonseekable_open(inode, filp); + if (ret) + goto err_clr_sm_cap; + + kobject_get(&port->umad_dev->kobj); + + return 0; + +err_clr_sm_cap: + swap(props.set_port_cap_mask, props.clr_port_cap_mask); + ib_modify_port(port->ib_dev, port->port_num, 0, &props); + +err_up_sem: + up(&port->sm_sem); + +fail: + return ret; +} + +static int ib_umad_sm_close(struct inode *inode, struct file *filp) +{ + struct ib_umad_port *port = filp->private_data; + struct ib_port_modify props = { + .clr_port_cap_mask = IB_PORT_SM + }; + int ret = 0; + + mutex_lock(&port->file_mutex); + if (port->ib_dev) + ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); + mutex_unlock(&port->file_mutex); + + up(&port->sm_sem); + + kobject_put(&port->umad_dev->kobj); + + return ret; +} + +static const struct file_operations umad_sm_fops = { + .owner = THIS_MODULE, + .open = ib_umad_sm_open, + .release = ib_umad_sm_close, + .llseek = no_llseek, +}; + +static struct ib_client umad_client = { + .name = "umad", + .add = ib_umad_add_one, + .remove = ib_umad_remove_one +}; + +static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ib_umad_port *port = dev_get_drvdata(dev); + + if (!port) + return -ENODEV; + + return sprintf(buf, "%s\n", port->ib_dev->name); +} +static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); + +static ssize_t show_port(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ib_umad_port *port = dev_get_drvdata(dev); + + if (!port) + return -ENODEV; + + return sprintf(buf, "%d\n", port->port_num); +} +static 
DEVICE_ATTR(port, S_IRUGO, show_port, NULL); + +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_MAD_ABI_VERSION)); + +static int ib_umad_init_port(struct ib_device *device, int port_num, + struct ib_umad_device *umad_dev, + struct ib_umad_port *port) +{ + int devnum; + dev_t base_umad; + dev_t base_issm; + + devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); + if (devnum >= IB_UMAD_MAX_PORTS) + return -1; + port->dev_num = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UMAD_NUM_FIXED_MINOR) { + base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; + base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; + } else { + base_umad = devnum + base_umad_dev; + base_issm = devnum + base_issm_dev; + } + + port->ib_dev = device; + port->port_num = port_num; + sema_init(&port->sm_sem, 1); + mutex_init(&port->file_mutex); + INIT_LIST_HEAD(&port->file_list); + + cdev_init(&port->cdev, &umad_fops); + port->cdev.owner = THIS_MODULE; + cdev_set_parent(&port->cdev, &umad_dev->kobj); + kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); + if (cdev_add(&port->cdev, base_umad, 1)) + goto err_cdev; + + port->dev = device_create(umad_class, device->dev.parent, + port->cdev.dev, port, + "umad%d", port->dev_num); + if (IS_ERR(port->dev)) + goto err_cdev; + + if (device_create_file(port->dev, &dev_attr_ibdev)) + goto err_dev; + if (device_create_file(port->dev, &dev_attr_port)) + goto err_dev; + + cdev_init(&port->sm_cdev, &umad_sm_fops); + port->sm_cdev.owner = THIS_MODULE; + cdev_set_parent(&port->sm_cdev, &umad_dev->kobj); + kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); + if (cdev_add(&port->sm_cdev, base_issm, 1)) + goto err_sm_cdev; + + port->sm_dev = device_create(umad_class, device->dev.parent, + port->sm_cdev.dev, port, + "issm%d", port->dev_num); + if (IS_ERR(port->sm_dev)) + goto err_sm_cdev; + + if (device_create_file(port->sm_dev, &dev_attr_ibdev)) + goto err_sm_dev; + if (device_create_file(port->sm_dev, &dev_attr_port)) + goto err_sm_dev; + + return 0; + +err_sm_dev: + device_destroy(umad_class, port->sm_cdev.dev); + +err_sm_cdev: + cdev_del(&port->sm_cdev); + +err_dev: + device_destroy(umad_class, port->cdev.dev); + +err_cdev: + cdev_del(&port->cdev); + clear_bit(devnum, dev_map); + + return -1; +} + +static void ib_umad_kill_port(struct ib_umad_port *port) +{ + struct ib_umad_file *file; + int id; + + dev_set_drvdata(port->dev, NULL); + dev_set_drvdata(port->sm_dev, NULL); + + device_destroy(umad_class, port->cdev.dev); + device_destroy(umad_class, port->sm_cdev.dev); + + cdev_del(&port->cdev); + cdev_del(&port->sm_cdev); + + mutex_lock(&port->file_mutex); + + port->ib_dev = NULL; + + list_for_each_entry(file, &port->file_list, port_list) { + mutex_lock(&file->mutex); + file->agents_dead = 1; + mutex_unlock(&file->mutex); + + for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id) + if (file->agent[id]) + ib_unregister_mad_agent(file->agent[id]); + } + + mutex_unlock(&port->file_mutex); + clear_bit(port->dev_num, dev_map); +} + +static void ib_umad_add_one(struct ib_device *device) +{ + struct ib_umad_device *umad_dev; + int s, e, i; + int count = 0; + + s = rdma_start_port(device); + e = rdma_end_port(device); + + umad_dev = kzalloc(sizeof *umad_dev + + (e - s + 1) * sizeof (struct ib_umad_port), + GFP_KERNEL); + if (!umad_dev) + return; + + kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype); + + for (i = s; i <= e; ++i) { + if (!rdma_cap_ib_mad(device, i)) + continue; + + umad_dev->port[i - s].umad_dev = umad_dev; + + if 
(ib_umad_init_port(device, i, umad_dev, + &umad_dev->port[i - s])) + goto err; + + count++; + } + + if (!count) + goto free; + + ib_set_client_data(device, &umad_client, umad_dev); + + return; + +err: + while (--i >= s) { + if (!rdma_cap_ib_mad(device, i)) + continue; + + ib_umad_kill_port(&umad_dev->port[i - s]); + } +free: + kobject_put(&umad_dev->kobj); +} + +static void ib_umad_remove_one(struct ib_device *device, void *client_data) +{ + struct ib_umad_device *umad_dev = client_data; + int i; + + if (!umad_dev) + return; + + for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) { + if (rdma_cap_ib_mad(device, i + rdma_start_port(device))) + ib_umad_kill_port(&umad_dev->port[i]); + } + + kobject_put(&umad_dev->kobj); +} + +static char *umad_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static int __init ib_umad_init(void) +{ + int ret; + + ret = register_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2, + "infiniband_mad"); + if (ret) { + pr_err("couldn't register device number\n"); + goto out; + } + + ret = alloc_chrdev_region(&dynamic_umad_dev, 0, + IB_UMAD_NUM_DYNAMIC_MINOR * 2, + "infiniband_mad"); + if (ret) { + pr_err("couldn't register dynamic device number\n"); + goto out_alloc; + } + dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR; + + umad_class = class_create(THIS_MODULE, "infiniband_mad"); + if (IS_ERR(umad_class)) { + ret = PTR_ERR(umad_class); + pr_err("couldn't create class infiniband_mad\n"); + goto out_chrdev; + } + + umad_class->devnode = umad_devnode; + + ret = class_create_file(umad_class, &class_attr_abi_version.attr); + if (ret) { + pr_err("couldn't create abi_version attribute\n"); + goto out_class; + } + + ret = ib_register_client(&umad_client); + if (ret) { + pr_err("couldn't register ib_umad client\n"); + goto out_class; + } + + return 0; + +out_class: + class_destroy(umad_class); + +out_chrdev: + unregister_chrdev_region(dynamic_umad_dev, + IB_UMAD_NUM_DYNAMIC_MINOR * 2); + +out_alloc: + unregister_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2); + +out: + return ret; +} + +static void __exit ib_umad_cleanup(void) +{ + ib_unregister_client(&umad_client); + class_destroy(umad_class); + unregister_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2); + unregister_chrdev_region(dynamic_umad_dev, + IB_UMAD_NUM_DYNAMIC_MINOR * 2); +} + +module_init(ib_umad_init); +module_exit(ib_umad_cleanup); diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h new file mode 100644 index 0000000..cfb5161 --- /dev/null +++ b/drivers/infiniband/core/uverbs.h @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef UVERBS_H +#define UVERBS_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define UVERBS_MODULE_NAME ib_uverbs +#include + +static inline void +ib_uverbs_init_udata(struct ib_udata *udata, + const void __user *ibuf, + void __user *obuf, + size_t ilen, size_t olen) +{ + udata->inbuf = ibuf; + udata->outbuf = obuf; + udata->inlen = ilen; + udata->outlen = olen; +} + +static inline void +ib_uverbs_init_udata_buf_or_null(struct ib_udata *udata, + const void __user *ibuf, + void __user *obuf, + size_t ilen, size_t olen) +{ + ib_uverbs_init_udata(udata, + ilen ? ibuf : NULL, olen ? obuf : NULL, + ilen, olen); +} + +/* + * Our lifetime rules for these structs are the following: + * + * struct ib_uverbs_device: One reference is held by the module and + * released in ib_uverbs_remove_one(). Another reference is taken by + * ib_uverbs_open() each time the character special file is opened, + * and released in ib_uverbs_release_file() when the file is released. + * + * struct ib_uverbs_file: One reference is held by the VFS and + * released when the file is closed. Another reference is taken when + * an asynchronous event queue file is created and released when the + * event file is closed. + * + * struct ib_uverbs_event_queue: Base structure for + * struct ib_uverbs_async_event_file and struct ib_uverbs_completion_event_file. + * One reference is held by the VFS and released when the file is closed. + * For asynchronous event files, another reference is held by the corresponding + * main context file and released when that file is closed. For completion + * event files, a reference is taken when a CQ is created that uses the file, + * and released when the CQ is destroyed. 
+ */ + +struct ib_uverbs_device { + atomic_t refcount; + int num_comp_vectors; + struct completion comp; + struct device *dev; + struct ib_device __rcu *ib_dev; + int devnum; + struct cdev cdev; + struct rb_root xrcd_tree; + struct mutex xrcd_tree_mutex; + struct kobject kobj; + struct srcu_struct disassociate_srcu; + struct mutex lists_mutex; /* protect lists */ + struct list_head uverbs_file_list; + struct list_head uverbs_events_file_list; + struct uverbs_root_spec *specs_root; +}; + +struct ib_uverbs_event_queue { + spinlock_t lock; + int is_closed; + wait_queue_head_t poll_wait; + struct fasync_struct *async_queue; + struct list_head event_list; +}; + +struct ib_uverbs_async_event_file { + struct ib_uverbs_event_queue ev_queue; + struct ib_uverbs_file *uverbs_file; + struct kref ref; + struct list_head list; +}; + +struct ib_uverbs_completion_event_file { + struct ib_uobject_file uobj_file; + struct ib_uverbs_event_queue ev_queue; +}; + +struct ib_uverbs_file { + struct kref ref; + struct mutex mutex; + struct mutex cleanup_mutex; /* protect cleanup */ + struct ib_uverbs_device *device; + struct ib_ucontext *ucontext; + struct ib_event_handler event_handler; + struct ib_uverbs_async_event_file *async_file; + struct list_head list; + int is_closed; + + struct idr idr; + /* spinlock protects write access to idr */ + spinlock_t idr_lock; +}; + +struct ib_uverbs_event { + union { + struct ib_uverbs_async_event_desc async; + struct ib_uverbs_comp_event_desc comp; + } desc; + struct list_head list; + struct list_head obj_list; + u32 *counter; +}; + +struct ib_uverbs_mcast_entry { + struct list_head list; + union ib_gid gid; + u16 lid; +}; + +struct ib_uevent_object { + struct ib_uobject uobject; + struct list_head event_list; + u32 events_reported; +}; + +struct ib_uxrcd_object { + struct ib_uobject uobject; + atomic_t refcnt; +}; + +struct ib_usrq_object { + struct ib_uevent_object uevent; + struct ib_uxrcd_object *uxrcd; +}; + +struct ib_uqp_object { + struct ib_uevent_object uevent; + /* lock for mcast list */ + struct mutex mcast_lock; + struct list_head mcast_list; + struct ib_uxrcd_object *uxrcd; +}; + +struct ib_uwq_object { + struct ib_uevent_object uevent; +}; + +struct ib_ucq_object { + struct ib_uobject uobject; + struct ib_uverbs_file *uverbs_file; + struct list_head comp_list; + struct list_head async_list; + u32 comp_events_reported; + u32 async_events_reported; +}; + +struct ib_uflow_resources; +struct ib_uflow_object { + struct ib_uobject uobject; + struct ib_uflow_resources *resources; +}; + +extern const struct file_operations uverbs_event_fops; +void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); +struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, + struct ib_device *ib_dev); +void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file); +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res); + +void ib_uverbs_release_ucq(struct ib_uverbs_file *file, + struct ib_uverbs_completion_event_file *ev_file, + struct ib_ucq_object *uobj); +void ib_uverbs_release_uevent(struct ib_uverbs_file *file, + struct ib_uevent_object *uobj); +void ib_uverbs_release_file(struct kref *ref); + +void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context); +void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr); +void 
ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_event_handler(struct ib_event_handler *handler, + struct ib_event *event); +int ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd, + enum rdma_remove_reason why); + +int uverbs_dealloc_mw(struct ib_mw *mw); +void ib_uverbs_detach_umcast(struct ib_qp *qp, + struct ib_uqp_object *uobj); + +void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata); +extern const struct uverbs_attr_def uverbs_uhw_compat_in; +extern const struct uverbs_attr_def uverbs_uhw_compat_out; +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +int uverbs_destroy_def_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs); + +struct ib_uverbs_flow_spec { + union { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_esp esp; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + struct ib_uverbs_flow_spec_ipv6 ipv6; + struct ib_uverbs_flow_spec_action_tag flow_tag; + struct ib_uverbs_flow_spec_action_drop drop; + struct ib_uverbs_flow_spec_action_handle action; + }; +}; + +int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, + const void *kern_spec_mask, + const void *kern_spec_val, + size_t kern_filter_sz, + union ib_flow_spec *ib_spec); + +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DEVICE); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_PD); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MR); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_CQ); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_QP); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_AH); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MW); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_SRQ); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_WQ); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_XRCD); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DM); + +#define IB_UVERBS_DECLARE_CMD(name) \ + ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ + struct ib_device *ib_dev, \ + const char __user *buf, int in_len, \ + int out_len) + +IB_UVERBS_DECLARE_CMD(get_context); +IB_UVERBS_DECLARE_CMD(query_device); +IB_UVERBS_DECLARE_CMD(query_port); +IB_UVERBS_DECLARE_CMD(alloc_pd); +IB_UVERBS_DECLARE_CMD(dealloc_pd); +IB_UVERBS_DECLARE_CMD(reg_mr); +IB_UVERBS_DECLARE_CMD(rereg_mr); +IB_UVERBS_DECLARE_CMD(dereg_mr); +IB_UVERBS_DECLARE_CMD(alloc_mw); +IB_UVERBS_DECLARE_CMD(dealloc_mw); +IB_UVERBS_DECLARE_CMD(create_comp_channel); +IB_UVERBS_DECLARE_CMD(create_cq); +IB_UVERBS_DECLARE_CMD(resize_cq); +IB_UVERBS_DECLARE_CMD(poll_cq); +IB_UVERBS_DECLARE_CMD(req_notify_cq); +IB_UVERBS_DECLARE_CMD(destroy_cq); +IB_UVERBS_DECLARE_CMD(create_qp); +IB_UVERBS_DECLARE_CMD(open_qp); +IB_UVERBS_DECLARE_CMD(query_qp); +IB_UVERBS_DECLARE_CMD(modify_qp); 
+IB_UVERBS_DECLARE_CMD(destroy_qp); +IB_UVERBS_DECLARE_CMD(post_send); +IB_UVERBS_DECLARE_CMD(post_recv); +IB_UVERBS_DECLARE_CMD(post_srq_recv); +IB_UVERBS_DECLARE_CMD(create_ah); +IB_UVERBS_DECLARE_CMD(destroy_ah); +IB_UVERBS_DECLARE_CMD(attach_mcast); +IB_UVERBS_DECLARE_CMD(detach_mcast); +IB_UVERBS_DECLARE_CMD(create_srq); +IB_UVERBS_DECLARE_CMD(modify_srq); +IB_UVERBS_DECLARE_CMD(query_srq); +IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(create_xsrq); +IB_UVERBS_DECLARE_CMD(open_xrcd); +IB_UVERBS_DECLARE_CMD(close_xrcd); + +#define IB_UVERBS_DECLARE_EX_CMD(name) \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_device *ib_dev, \ + struct ib_udata *ucore, \ + struct ib_udata *uhw) + +IB_UVERBS_DECLARE_EX_CMD(create_flow); +IB_UVERBS_DECLARE_EX_CMD(destroy_flow); +IB_UVERBS_DECLARE_EX_CMD(query_device); +IB_UVERBS_DECLARE_EX_CMD(create_cq); +IB_UVERBS_DECLARE_EX_CMD(create_qp); +IB_UVERBS_DECLARE_EX_CMD(create_wq); +IB_UVERBS_DECLARE_EX_CMD(modify_wq); +IB_UVERBS_DECLARE_EX_CMD(destroy_wq); +IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table); +IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table); +IB_UVERBS_DECLARE_EX_CMD(modify_qp); +IB_UVERBS_DECLARE_EX_CMD(modify_cq); + +#endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c new file mode 100644 index 0000000..21a887c --- /dev/null +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -0,0 +1,4066 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include + +#include + +#include +#include +#include "rdma_core.h" + +#include "uverbs.h" +#include "core_priv.h" + +static struct ib_uverbs_completion_event_file * +ib_uverbs_lookup_comp_file(int fd, struct ib_ucontext *context) +{ + struct ib_uobject *uobj = uobj_get_read(UVERBS_OBJECT_COMP_CHANNEL, + fd, context); + struct ib_uobject_file *uobj_file; + + if (IS_ERR(uobj)) + return (void *)uobj; + + uverbs_uobject_get(uobj); + uobj_put_read(uobj); + + uobj_file = container_of(uobj, struct ib_uobject_file, uobj); + return container_of(uobj_file, struct ib_uverbs_completion_event_file, + uobj_file); +} + +ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_get_context cmd; + struct ib_uverbs_get_context_resp resp; + struct ib_udata udata; + struct ib_ucontext *ucontext; + struct file *filp; + struct ib_rdmacg_object cg_obj; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + mutex_lock(&file->mutex); + + if (file->ucontext) { + ret = -EINVAL; + goto err; + } + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); + if (ret) + goto err; + + ucontext = ib_dev->alloc_ucontext(ib_dev, &udata); + if (IS_ERR(ucontext)) { + ret = PTR_ERR(ucontext); + goto err_alloc; + } + + ucontext->device = ib_dev; + ucontext->cg_obj = cg_obj; + /* ufile is required when some objects are released */ + ucontext->ufile = file; + uverbs_initialize_ucontext(ucontext); + + rcu_read_lock(); + ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); + ucontext->closing = 0; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + ucontext->umem_tree = RB_ROOT_CACHED; + init_rwsem(&ucontext->umem_rwsem); + ucontext->odp_mrs_count = 0; + INIT_LIST_HEAD(&ucontext->no_private_counters); + + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + ucontext->invalidate_range = NULL; + +#endif + + resp.num_comp_vectors = file->device->num_comp_vectors; + + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) + goto err_free; + resp.async_fd = ret; + + filp = ib_uverbs_alloc_async_event_file(file, ib_dev); + if (IS_ERR(filp)) { + ret = PTR_ERR(filp); + goto err_fd; + } + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) { + ret = -EFAULT; + goto err_file; + } + + file->ucontext = ucontext; + + fd_install(resp.async_fd, filp); + + mutex_unlock(&file->mutex); + + return in_len; + +err_file: + ib_uverbs_free_async_event_file(file); + fput(filp); + +err_fd: + put_unused_fd(resp.async_fd); + +err_free: + put_pid(ucontext->tgid); + ib_dev->dealloc_ucontext(ucontext); + +err_alloc: + ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); + +err: + mutex_unlock(&file->mutex); + return ret; +} + +static void copy_query_dev_fields(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_uverbs_query_device_resp *resp, + struct ib_device_attr *attr) +{ + resp->fw_ver = attr->fw_ver; + resp->node_guid = ib_dev->node_guid; + resp->sys_image_guid = attr->sys_image_guid; + resp->max_mr_size = attr->max_mr_size; + resp->page_size_cap = attr->page_size_cap; + resp->vendor_id = attr->vendor_id; + resp->vendor_part_id = attr->vendor_part_id; + 
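For context, the handlers in this file back the libibverbs entry points: opening a device issues GET_CONTEXT, which lands in ib_uverbs_get_context() above, and device attributes come from the query_device handler that follows. A small userspace sketch exercising both paths (illustrative only, not part of the patch):

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	int num = 0;
	struct ibv_device **list = ibv_get_device_list(&num);
	struct ibv_context *ctx;
	struct ibv_device_attr attr;

	if (!list)
		return 1;
	if (num == 0) {
		ibv_free_device_list(list);
		return 1;
	}

	/* ibv_open_device() issues GET_CONTEXT on /dev/infiniband/uverbsN */
	ctx = ibv_open_device(list[0]);
	if (!ctx) {
		ibv_free_device_list(list);
		return 1;
	}

	/* served by the QUERY_DEVICE handler */
	if (ibv_query_device(ctx, &attr) == 0)
		printf("%s: max_qp=%d max_cqe=%d\n",
		       ibv_get_device_name(list[0]), attr.max_qp, attr.max_cqe);

	ibv_close_device(ctx);
	ibv_free_device_list(list);
	return 0;
}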
resp->hw_ver = attr->hw_ver; + resp->max_qp = attr->max_qp; + resp->max_qp_wr = attr->max_qp_wr; + resp->device_cap_flags = lower_32_bits(attr->device_cap_flags); + resp->max_sge = attr->max_sge; + resp->max_sge_rd = attr->max_sge_rd; + resp->max_cq = attr->max_cq; + resp->max_cqe = attr->max_cqe; + resp->max_mr = attr->max_mr; + resp->max_pd = attr->max_pd; + resp->max_qp_rd_atom = attr->max_qp_rd_atom; + resp->max_ee_rd_atom = attr->max_ee_rd_atom; + resp->max_res_rd_atom = attr->max_res_rd_atom; + resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; + resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; + resp->atomic_cap = attr->atomic_cap; + resp->max_ee = attr->max_ee; + resp->max_rdd = attr->max_rdd; + resp->max_mw = attr->max_mw; + resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; + resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; + resp->max_mcast_grp = attr->max_mcast_grp; + resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; + resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; + resp->max_ah = attr->max_ah; + resp->max_fmr = attr->max_fmr; + resp->max_map_per_fmr = attr->max_map_per_fmr; + resp->max_srq = attr->max_srq; + resp->max_srq_wr = attr->max_srq_wr; + resp->max_srq_sge = attr->max_srq_sge; + resp->max_pkeys = attr->max_pkeys; + resp->local_ca_ack_delay = attr->local_ca_ack_delay; + resp->phys_port_cnt = ib_dev->phys_port_cnt; +} + +ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_query_device cmd; + struct ib_uverbs_query_device_resp resp; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + memset(&resp, 0, sizeof resp); + copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs); + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) + return -EFAULT; + + return in_len; +} + +ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_query_port cmd; + struct ib_uverbs_query_port_resp resp; + struct ib_port_attr attr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ret = ib_query_port(ib_dev, cmd.port_num, &attr); + if (ret) + return ret; + + memset(&resp, 0, sizeof resp); + + resp.state = attr.state; + resp.max_mtu = attr.max_mtu; + resp.active_mtu = attr.active_mtu; + resp.gid_tbl_len = attr.gid_tbl_len; + resp.port_cap_flags = attr.port_cap_flags; + resp.max_msg_sz = attr.max_msg_sz; + resp.bad_pkey_cntr = attr.bad_pkey_cntr; + resp.qkey_viol_cntr = attr.qkey_viol_cntr; + resp.pkey_tbl_len = attr.pkey_tbl_len; + + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) { + resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); + resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid); + } else { + resp.lid = ib_lid_cpu16(attr.lid); + resp.sm_lid = ib_lid_cpu16(attr.sm_lid); + } + resp.lmc = attr.lmc; + resp.max_vl_num = attr.max_vl_num; + resp.sm_sl = attr.sm_sl; + resp.subnet_timeout = attr.subnet_timeout; + resp.init_type_reply = attr.init_type_reply; + resp.active_width = attr.active_width; + resp.active_speed = attr.active_speed; + resp.phys_state = attr.phys_state; + resp.link_layer = rdma_port_get_link_layer(ib_dev, + cmd.port_num); + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) + return -EFAULT; + + return in_len; +} + +ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, + 
struct ib_device *ib_dev, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_alloc_pd cmd; + struct ib_uverbs_alloc_pd_resp resp; + struct ib_udata udata; + struct ib_uobject *uobj; + struct ib_pd *pd; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + uobj = uobj_alloc(UVERBS_OBJECT_PD, file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata); + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); + goto err; + } + + pd->device = ib_dev; + pd->uobject = uobj; + pd->__internal_mr = NULL; + atomic_set(&pd->usecnt, 0); + + uobj->object = pd; + memset(&resp, 0, sizeof resp); + resp.pd_handle = uobj->id; + pd->res.type = RDMA_RESTRACK_PD; + rdma_restrack_add(&pd->res); + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + uobj_alloc_commit(uobj); + + return in_len; + +err_copy: + ib_dealloc_pd(pd); + +err: + uobj_alloc_abort(uobj); + return ret; +} + +ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_dealloc_pd cmd; + struct ib_uobject *uobj; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = uobj_get_write(UVERBS_OBJECT_PD, cmd.pd_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + ret = uobj_remove_commit(uobj); + + return ret ?: in_len; +} + +struct xrcd_table_entry { + struct rb_node node; + struct ib_xrcd *xrcd; + struct inode *inode; +}; + +static int xrcd_table_insert(struct ib_uverbs_device *dev, + struct inode *inode, + struct ib_xrcd *xrcd) +{ + struct xrcd_table_entry *entry, *scan; + struct rb_node **p = &dev->xrcd_tree.rb_node; + struct rb_node *parent = NULL; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->xrcd = xrcd; + entry->inode = inode; + + while (*p) { + parent = *p; + scan = rb_entry(parent, struct xrcd_table_entry, node); + + if (inode < scan->inode) { + p = &(*p)->rb_left; + } else if (inode > scan->inode) { + p = &(*p)->rb_right; + } else { + kfree(entry); + return -EEXIST; + } + } + + rb_link_node(&entry->node, parent, p); + rb_insert_color(&entry->node, &dev->xrcd_tree); + igrab(inode); + return 0; +} + +static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev, + struct inode *inode) +{ + struct xrcd_table_entry *entry; + struct rb_node *p = dev->xrcd_tree.rb_node; + + while (p) { + entry = rb_entry(p, struct xrcd_table_entry, node); + + if (inode < entry->inode) + p = p->rb_left; + else if (inode > entry->inode) + p = p->rb_right; + else + return entry; + } + + return NULL; +} + +static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode) +{ + struct xrcd_table_entry *entry; + + entry = xrcd_table_search(dev, inode); + if (!entry) + return NULL; + + return entry->xrcd; +} + +static void xrcd_table_delete(struct ib_uverbs_device *dev, + struct inode *inode) +{ + struct xrcd_table_entry *entry; + + entry = xrcd_table_search(dev, inode); + if (entry) { + iput(inode); + rb_erase(&entry->node, &dev->xrcd_tree); + kfree(entry); + } +} + +ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + 
const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_open_xrcd cmd; + struct ib_uverbs_open_xrcd_resp resp; + struct ib_udata udata; + struct ib_uxrcd_object *obj; + struct ib_xrcd *xrcd = NULL; + struct fd f = {NULL, 0}; + struct inode *inode = NULL; + int ret = 0; + int new_xrcd = 0; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + mutex_lock(&file->device->xrcd_tree_mutex); + + if (cmd.fd != -1) { + /* search for file descriptor */ + f = fdget(cmd.fd); + if (!f.file) { + ret = -EBADF; + goto err_tree_mutex_unlock; + } + + inode = file_inode(f.file); + xrcd = find_xrcd(file->device, inode); + if (!xrcd && !(cmd.oflags & O_CREAT)) { + /* no file descriptor. Need CREATE flag */ + ret = -EAGAIN; + goto err_tree_mutex_unlock; + } + + if (xrcd && cmd.oflags & O_EXCL) { + ret = -EINVAL; + goto err_tree_mutex_unlock; + } + } + + obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, + file->ucontext); + if (IS_ERR(obj)) { + ret = PTR_ERR(obj); + goto err_tree_mutex_unlock; + } + + if (!xrcd) { + xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata); + if (IS_ERR(xrcd)) { + ret = PTR_ERR(xrcd); + goto err; + } + + xrcd->inode = inode; + xrcd->device = ib_dev; + atomic_set(&xrcd->usecnt, 0); + mutex_init(&xrcd->tgt_qp_mutex); + INIT_LIST_HEAD(&xrcd->tgt_qp_list); + new_xrcd = 1; + } + + atomic_set(&obj->refcnt, 0); + obj->uobject.object = xrcd; + memset(&resp, 0, sizeof resp); + resp.xrcd_handle = obj->uobject.id; + + if (inode) { + if (new_xrcd) { + /* create new inode/xrcd table entry */ + ret = xrcd_table_insert(file->device, inode, xrcd); + if (ret) + goto err_dealloc_xrcd; + } + atomic_inc(&xrcd->usecnt); + } + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + if (f.file) + fdput(f); + + mutex_unlock(&file->device->xrcd_tree_mutex); + + uobj_alloc_commit(&obj->uobject); + + return in_len; + +err_copy: + if (inode) { + if (new_xrcd) + xrcd_table_delete(file->device, inode); + atomic_dec(&xrcd->usecnt); + } + +err_dealloc_xrcd: + ib_dealloc_xrcd(xrcd); + +err: + uobj_alloc_abort(&obj->uobject); + +err_tree_mutex_unlock: + if (f.file) + fdput(f); + + mutex_unlock(&file->device->xrcd_tree_mutex); + + return ret; +} + +ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_close_xrcd cmd; + struct ib_uobject *uobj; + int ret = 0; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = uobj_get_write(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + ret = uobj_remove_commit(uobj); + return ret ?: in_len; +} + +int ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, + struct ib_xrcd *xrcd, + enum rdma_remove_reason why) +{ + struct inode *inode; + int ret; + + inode = xrcd->inode; + if (inode && !atomic_dec_and_test(&xrcd->usecnt)) + return 0; + + ret = ib_dealloc_xrcd(xrcd); + + if (why == RDMA_REMOVE_DESTROY && ret) + atomic_inc(&xrcd->usecnt); + else if (inode) + xrcd_table_delete(dev, inode); + + return ret; +} + +ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct 
ib_uverbs_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + struct ib_udata udata; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_mr *mr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) + return -EINVAL; + + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + return ret; + + uobj = uobj_alloc(UVERBS_OBJECT_MR, file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_free; + } + + if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { + if (!(pd->device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { + pr_debug("ODP support not available\n"); + ret = -EINVAL; + goto err_put; + } + } + + mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, + cmd.access_flags, &udata); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto err_put; + } + + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + mr->res.type = RDMA_RESTRACK_MR; + rdma_restrack_add(&mr->res); + + uobj->object = mr; + + memset(&resp, 0, sizeof resp); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + resp.mr_handle = uobj->id; + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + uobj_put_obj_read(pd); + + uobj_alloc_commit(uobj); + + return in_len; + +err_copy: + ib_dereg_mr(mr); + +err_put: + uobj_put_obj_read(pd); + +err_free: + uobj_alloc_abort(uobj); + return ret; +} + +ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + struct ib_udata udata; + struct ib_pd *pd = NULL; + struct ib_mr *mr; + struct ib_pd *old_pd; + int ret; + struct ib_uobject *uobj; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) + return -EINVAL; + + if ((cmd.flags & IB_MR_REREG_TRANS) && + (!cmd.start || !cmd.hca_va || 0 >= cmd.length || + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) + return -EINVAL; + + uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + mr = uobj->object; + + if (mr->dm) { + ret = -EINVAL; + goto put_uobjs; + } + + if (cmd.flags & IB_MR_REREG_ACCESS) { + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + goto put_uobjs; + } + + if (cmd.flags & IB_MR_REREG_PD) { + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto put_uobjs; + } + } + + old_pd = mr->pd; + ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, + cmd.length, cmd.hca_va, + cmd.access_flags, pd, &udata); + if (!ret) { + if (cmd.flags & IB_MR_REREG_PD) { + atomic_inc(&pd->usecnt); + mr->pd = pd; + atomic_dec(&old_pd->usecnt); + } + } else { + goto put_uobj_pd; + } + + 
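A userspace view of the registration path handled by ib_uverbs_reg_mr() above: the lkey and rkey that libibverbs hands back are the values copied into struct ib_uverbs_reg_mr_resp. A sketch only (register_buffer is a hypothetical helper):

#include <stdlib.h>
#include <infiniband/verbs.h>

struct ibv_mr *register_buffer(struct ibv_pd *pd, size_t len)
{
	void *buf = malloc(len);
	struct ibv_mr *mr;

	if (!buf)
		return NULL;

	/* IBV_ACCESS_REMOTE_WRITE requires LOCAL_WRITE as well */
	mr = ibv_reg_mr(pd, buf, len,
			IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
	if (!mr)
		free(buf);	/* registration failed; release the buffer */
	return mr;
}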
memset(&resp, 0, sizeof(resp)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) + ret = -EFAULT; + else + ret = in_len; + +put_uobj_pd: + if (cmd.flags & IB_MR_REREG_PD) + uobj_put_obj_read(pd); + +put_uobjs: + uobj_put_write(uobj); + + return ret; +} + +ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_dereg_mr cmd; + struct ib_uobject *uobj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + ret = uobj_remove_commit(uobj); + + return ret ?: in_len; +} + +ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_alloc_mw cmd; + struct ib_uverbs_alloc_mw_resp resp; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_mw *mw; + struct ib_udata udata; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + uobj = uobj_alloc(UVERBS_OBJECT_MW, file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_free; + } + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata); + if (IS_ERR(mw)) { + ret = PTR_ERR(mw); + goto err_put; + } + + mw->device = pd->device; + mw->pd = pd; + mw->uobject = uobj; + atomic_inc(&pd->usecnt); + + uobj->object = mw; + + memset(&resp, 0, sizeof(resp)); + resp.rkey = mw->rkey; + resp.mw_handle = uobj->id; + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { + ret = -EFAULT; + goto err_copy; + } + + uobj_put_obj_read(pd); + uobj_alloc_commit(uobj); + + return in_len; + +err_copy: + uverbs_dealloc_mw(mw); +err_put: + uobj_put_obj_read(pd); +err_free: + uobj_alloc_abort(uobj); + return ret; +} + +ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_dealloc_mw cmd; + struct ib_uobject *uobj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + uobj = uobj_get_write(UVERBS_OBJECT_MW, cmd.mw_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + ret = uobj_remove_commit(uobj); + return ret ?: in_len; +} + +ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_comp_channel cmd; + struct ib_uverbs_create_comp_channel_resp resp; + struct ib_uobject *uobj; + struct ib_uverbs_completion_event_file *ev_file; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + resp.fd = uobj->id; + + ev_file = container_of(uobj, struct ib_uverbs_completion_event_file, + uobj_file.uobj); + ib_uverbs_init_event_queue(&ev_file->ev_queue); + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof 
resp)) { + uobj_alloc_abort(uobj); + return -EFAULT; + } + + uobj_alloc_commit(uobj); + return in_len; +} + +static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw, + struct ib_uverbs_ex_create_cq *cmd, + size_t cmd_sz, + int (*cb)(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *udata, + void *context), + void *context) +{ + struct ib_ucq_object *obj; + struct ib_uverbs_completion_event_file *ev_file = NULL; + struct ib_cq *cq; + int ret; + struct ib_uverbs_ex_create_cq_resp resp; + struct ib_cq_init_attr attr = {}; + + if (!ib_dev->create_cq) + return ERR_PTR(-EOPNOTSUPP); + + if (cmd->comp_vector >= file->device->num_comp_vectors) + return ERR_PTR(-EINVAL); + + obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, + file->ucontext); + if (IS_ERR(obj)) + return obj; + + if (cmd->comp_channel >= 0) { + ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, + file->ucontext); + if (IS_ERR(ev_file)) { + ret = PTR_ERR(ev_file); + goto err; + } + } + + obj->uobject.user_handle = cmd->user_handle; + obj->uverbs_file = file; + obj->comp_events_reported = 0; + obj->async_events_reported = 0; + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->async_list); + + attr.cqe = cmd->cqe; + attr.comp_vector = cmd->comp_vector; + + if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) + attr.flags = cmd->flags; + + cq = ib_dev->create_cq(ib_dev, &attr, file->ucontext, uhw); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto err_file; + } + + cq->device = ib_dev; + cq->uobject = &obj->uobject; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = ev_file ? 
&ev_file->ev_queue : NULL; + atomic_set(&cq->usecnt, 0); + + obj->uobject.object = cq; + memset(&resp, 0, sizeof resp); + resp.base.cq_handle = obj->uobject.id; + resp.base.cqe = cq->cqe; + + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); + + ret = cb(file, obj, &resp, ucore, context); + if (ret) + goto err_cb; + + uobj_alloc_commit(&obj->uobject); + return obj; + +err_cb: + ib_destroy_cq(cq); + +err_file: + if (ev_file) + ib_uverbs_release_ucq(file, ev_file, obj); + +err: + uobj_alloc_abort(&obj->uobject); + + return ERR_PTR(ret); +} + +static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *ucore, void *context) +{ + if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base))) + return -EFAULT; + + return 0; +} + +ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_cq cmd; + struct ib_uverbs_ex_create_cq cmd_ex; + struct ib_uverbs_create_cq_resp resp; + struct ib_udata ucore; + struct ib_udata uhw; + struct ib_ucq_object *obj; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + ib_uverbs_init_udata(&ucore, buf, u64_to_user_ptr(cmd.response), + sizeof(cmd), sizeof(resp)); + + ib_uverbs_init_udata(&uhw, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + cmd_ex.user_handle = cmd.user_handle; + cmd_ex.cqe = cmd.cqe; + cmd_ex.comp_vector = cmd.comp_vector; + cmd_ex.comp_channel = cmd.comp_channel; + + obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex, + offsetof(typeof(cmd_ex), comp_channel) + + sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb, + NULL); + + if (IS_ERR(obj)) + return PTR_ERR(obj); + + return in_len; +} + +static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file, + struct ib_ucq_object *obj, + struct ib_uverbs_ex_create_cq_resp *resp, + struct ib_udata *ucore, void *context) +{ + if (ib_copy_to_udata(ucore, resp, resp->response_length)) + return -EFAULT; + + return 0; +} + +int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_cq_resp resp; + struct ib_uverbs_ex_create_cq cmd; + struct ib_ucq_object *obj; + int err; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + if (ucore->outlen < (offsetof(typeof(resp), response_length) + + sizeof(resp.response_length))) + return -ENOSPC; + + obj = create_cq(file, ib_dev, ucore, uhw, &cmd, + min(ucore->inlen, sizeof(cmd)), + ib_uverbs_ex_create_cq_cb, NULL); + + return PTR_ERR_OR_ZERO(obj); +} + +ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_resize_cq cmd; + struct ib_uverbs_resize_cq_resp resp = {}; + struct ib_udata udata; + struct ib_cq *cq; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + 
u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); + if (!cq) + return -EINVAL; + + ret = cq->device->resize_cq(cq, cmd.cqe, &udata); + if (ret) + goto out; + + resp.cqe = cq->cqe; + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp.cqe)) + ret = -EFAULT; + +out: + uobj_put_obj_read(cq); + + return ret ? ret : in_len; +} + +static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, + struct ib_wc *wc) +{ + struct ib_uverbs_wc tmp; + + tmp.wr_id = wc->wr_id; + tmp.status = wc->status; + tmp.opcode = wc->opcode; + tmp.vendor_err = wc->vendor_err; + tmp.byte_len = wc->byte_len; + tmp.ex.imm_data = wc->ex.imm_data; + tmp.qp_num = wc->qp->qp_num; + tmp.src_qp = wc->src_qp; + tmp.wc_flags = wc->wc_flags; + tmp.pkey_index = wc->pkey_index; + if (rdma_cap_opa_ah(ib_dev, wc->port_num)) + tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); + else + tmp.slid = ib_lid_cpu16(wc->slid); + tmp.sl = wc->sl; + tmp.dlid_path_bits = wc->dlid_path_bits; + tmp.port_num = wc->port_num; + tmp.reserved = 0; + + if (copy_to_user(dest, &tmp, sizeof tmp)) + return -EFAULT; + + return 0; +} + +ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_poll_cq cmd; + struct ib_uverbs_poll_cq_resp resp; + u8 __user *header_ptr; + u8 __user *data_ptr; + struct ib_cq *cq; + struct ib_wc wc; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); + if (!cq) + return -EINVAL; + + /* we copy a struct ib_uverbs_poll_cq_resp to user space */ + header_ptr = u64_to_user_ptr(cmd.response); + data_ptr = header_ptr + sizeof resp; + + memset(&resp, 0, sizeof resp); + while (resp.count < cmd.ne) { + ret = ib_poll_cq(cq, 1, &wc); + if (ret < 0) + goto out_put; + if (!ret) + break; + + ret = copy_wc_to_user(ib_dev, data_ptr, &wc); + if (ret) + goto out_put; + + data_ptr += sizeof(struct ib_uverbs_wc); + ++resp.count; + } + + if (copy_to_user(header_ptr, &resp, sizeof resp)) { + ret = -EFAULT; + goto out_put; + } + + ret = in_len; + +out_put: + uobj_put_obj_read(cq); + return ret; +} + +ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_req_notify_cq cmd; + struct ib_cq *cq; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); + if (!cq) + return -EINVAL; + + ib_req_notify_cq(cq, cmd.solicited_only ? + IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); + + uobj_put_obj_read(cq); + + return in_len; +} + +ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_destroy_cq cmd; + struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj; + struct ib_cq *cq; + struct ib_ucq_object *obj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = uobj_get_write(UVERBS_OBJECT_CQ, cmd.cq_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + /* + * Make sure we don't free the memory in remove_commit as we still + * needs the uobject memory to create the response. 
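ib_uverbs_poll_cq() above copies one struct ib_uverbs_wc back to user space per completion; a typical userspace consumer drains the CQ in a loop along these lines (drain_cq is a hypothetical helper, sketch only):

#include <infiniband/verbs.h>

int drain_cq(struct ibv_cq *cq, int budget)
{
	struct ibv_wc wc;
	int n, done = 0;

	while (done < budget) {
		n = ibv_poll_cq(cq, 1, &wc);
		if (n < 0)
			return n;	/* poll error */
		if (n == 0)
			break;		/* CQ empty */
		if (wc.status != IBV_WC_SUCCESS)
			return -1;	/* completion with error */
		done++;
	}
	return done;
}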
+ */ + uverbs_uobject_get(uobj); + cq = uobj->object; + obj = container_of(cq->uobject, struct ib_ucq_object, uobject); + + memset(&resp, 0, sizeof(resp)); + + ret = uobj_remove_commit(uobj); + if (ret) { + uverbs_uobject_put(uobj); + return ret; + } + + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->async_events_reported; + + uverbs_uobject_put(uobj); + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) + return -EFAULT; + + return in_len; +} + +static int create_qp(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw, + struct ib_uverbs_ex_create_qp *cmd, + size_t cmd_sz, + int (*cb)(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *udata), + void *context) +{ + struct ib_uqp_object *obj; + struct ib_device *device; + struct ib_pd *pd = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *xrcd_uobj = ERR_PTR(-ENOENT); + struct ib_cq *scq = NULL, *rcq = NULL; + struct ib_srq *srq = NULL; + struct ib_qp *qp; + char *buf; + struct ib_qp_init_attr attr = {}; + struct ib_uverbs_ex_create_qp_resp resp; + int ret; + struct ib_rwq_ind_table *ind_tbl = NULL; + bool has_sq = true; + + if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + return -EPERM; + + obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, + file->ucontext); + if (IS_ERR(obj)) + return PTR_ERR(obj); + obj->uxrcd = NULL; + obj->uevent.uobject.user_handle = cmd->user_handle; + mutex_init(&obj->mcast_lock); + + if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) + + sizeof(cmd->rwq_ind_tbl_handle) && + (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) { + ind_tbl = uobj_get_obj_read(rwq_ind_table, UVERBS_OBJECT_RWQ_IND_TBL, + cmd->rwq_ind_tbl_handle, + file->ucontext); + if (!ind_tbl) { + ret = -EINVAL; + goto err_put; + } + + attr.rwq_ind_tbl = ind_tbl; + } + + if (cmd_sz > sizeof(*cmd) && + !ib_is_udata_cleared(ucore, sizeof(*cmd), + cmd_sz - sizeof(*cmd))) { + ret = -EOPNOTSUPP; + goto err_put; + } + + if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) { + ret = -EINVAL; + goto err_put; + } + + if (ind_tbl && !cmd->max_send_wr) + has_sq = false; + + if (cmd->qp_type == IB_QPT_XRC_TGT) { + xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->pd_handle, + file->ucontext); + + if (IS_ERR(xrcd_uobj)) { + ret = -EINVAL; + goto err_put; + } + + xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!xrcd) { + ret = -EINVAL; + goto err_put; + } + device = xrcd->device; + } else { + if (cmd->qp_type == IB_QPT_XRC_INI) { + cmd->max_recv_wr = 0; + cmd->max_recv_sge = 0; + } else { + if (cmd->is_srq) { + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd->srq_handle, + file->ucontext); + if (!srq || srq->srq_type == IB_SRQT_XRC) { + ret = -EINVAL; + goto err_put; + } + } + + if (!ind_tbl) { + if (cmd->recv_cq_handle != cmd->send_cq_handle) { + rcq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->recv_cq_handle, + file->ucontext); + if (!rcq) { + ret = -EINVAL; + goto err_put; + } + } + } + } + + if (has_sq) + scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->send_cq_handle, + file->ucontext); + if (!ind_tbl) + rcq = rcq ?: scq; + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, file->ucontext); + if (!pd || (!scq && has_sq)) { + ret = -EINVAL; + goto err_put; + } + + device = pd->device; + } + + attr.event_handler = ib_uverbs_qp_event_handler; + attr.qp_context = file; + attr.send_cq = scq; + attr.recv_cq = rcq; + attr.srq = srq; + attr.xrcd = xrcd; + 
attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : + IB_SIGNAL_REQ_WR; + attr.qp_type = cmd->qp_type; + attr.create_flags = 0; + + attr.cap.max_send_wr = cmd->max_send_wr; + attr.cap.max_recv_wr = cmd->max_recv_wr; + attr.cap.max_send_sge = cmd->max_send_sge; + attr.cap.max_recv_sge = cmd->max_recv_sge; + attr.cap.max_inline_data = cmd->max_inline_data; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + + if (cmd_sz >= offsetof(typeof(*cmd), create_flags) + + sizeof(cmd->create_flags)) + attr.create_flags = cmd->create_flags; + + if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV | + IB_QP_CREATE_SCATTER_FCS | + IB_QP_CREATE_CVLAN_STRIPPING | + IB_QP_CREATE_SOURCE_QPN | + IB_QP_CREATE_PCI_WRITE_END_PADDING)) { + ret = -EINVAL; + goto err_put; + } + + if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) { + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; + goto err_put; + } + + attr.source_qpn = cmd->source_qpn; + } + + buf = (void *)cmd + sizeof(*cmd); + if (cmd_sz > sizeof(*cmd)) + if (!(buf[0] == 0 && !memcmp(buf, buf + 1, + cmd_sz - sizeof(*cmd) - 1))) { + ret = -EINVAL; + goto err_put; + } + + if (cmd->qp_type == IB_QPT_XRC_TGT) + qp = ib_create_qp(pd, &attr); + else + qp = _ib_create_qp(device, pd, &attr, uhw, + &obj->uevent.uobject); + + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + + if (cmd->qp_type != IB_QPT_XRC_TGT) { + ret = ib_create_qp_security(qp, device); + if (ret) + goto err_cb; + + qp->real_qp = qp; + qp->pd = pd; + qp->send_cq = attr.send_cq; + qp->recv_cq = attr.recv_cq; + qp->srq = attr.srq; + qp->rwq_ind_tbl = ind_tbl; + qp->event_handler = attr.event_handler; + qp->qp_context = attr.qp_context; + qp->qp_type = attr.qp_type; + atomic_set(&qp->usecnt, 0); + atomic_inc(&pd->usecnt); + qp->port = 0; + if (attr.send_cq) + atomic_inc(&attr.send_cq->usecnt); + if (attr.recv_cq) + atomic_inc(&attr.recv_cq->usecnt); + if (attr.srq) + atomic_inc(&attr.srq->usecnt); + if (ind_tbl) + atomic_inc(&ind_tbl->usecnt); + } else { + /* It is done in _ib_create_qp for other QP types */ + qp->uobject = &obj->uevent.uobject; + } + + obj->uevent.uobject.object = qp; + + memset(&resp, 0, sizeof resp); + resp.base.qpn = qp->qp_num; + resp.base.qp_handle = obj->uevent.uobject.id; + resp.base.max_recv_sge = attr.cap.max_recv_sge; + resp.base.max_send_sge = attr.cap.max_send_sge; + resp.base.max_recv_wr = attr.cap.max_recv_wr; + resp.base.max_send_wr = attr.cap.max_send_wr; + resp.base.max_inline_data = attr.cap.max_inline_data; + + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + + ret = cb(file, &resp, ucore); + if (ret) + goto err_cb; + + if (xrcd) { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + uobj_put_read(xrcd_uobj); + } + + if (pd) + uobj_put_obj_read(pd); + if (scq) + uobj_put_obj_read(scq); + if (rcq && rcq != scq) + uobj_put_obj_read(rcq); + if (srq) + uobj_put_obj_read(srq); + if (ind_tbl) + uobj_put_obj_read(ind_tbl); + + uobj_alloc_commit(&obj->uevent.uobject); + + return 0; +err_cb: + ib_destroy_qp(qp); + +err_put: + if (!IS_ERR(xrcd_uobj)) + uobj_put_read(xrcd_uobj); + if (pd) + uobj_put_obj_read(pd); + if (scq) + uobj_put_obj_read(scq); + if (rcq && rcq != scq) + uobj_put_obj_read(rcq); + if (srq) + uobj_put_obj_read(srq); + if (ind_tbl) + uobj_put_obj_read(ind_tbl); + + 
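The attribute unpacking in create_qp() above mirrors what userspace fills into struct ibv_qp_init_attr before calling ibv_create_qp(). A minimal sketch for an RC QP (make_rc_qp is a hypothetical helper; the capability values are arbitrary):

#include <infiniband/verbs.h>

struct ibv_qp *make_rc_qp(struct ibv_pd *pd, struct ibv_cq *cq)
{
	struct ibv_qp_init_attr attr = {
		.send_cq = cq,
		.recv_cq = cq,		/* recv_cq may equal send_cq */
		.qp_type = IBV_QPT_RC,
		.cap = {
			.max_send_wr  = 16,
			.max_recv_wr  = 16,
			.max_send_sge = 1,
			.max_recv_sge = 1,
		},
	};

	return ibv_create_qp(pd, &attr);
}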
uobj_alloc_abort(&obj->uevent.uobject); + return ret; +} + +static int ib_uverbs_create_qp_cb(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *ucore) +{ + if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base))) + return -EFAULT; + + return 0; +} + +ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_qp cmd; + struct ib_uverbs_ex_create_qp cmd_ex; + struct ib_udata ucore; + struct ib_udata uhw; + ssize_t resp_size = sizeof(struct ib_uverbs_create_qp_resp); + int err; + + if (out_len < resp_size) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + ib_uverbs_init_udata(&ucore, buf, u64_to_user_ptr(cmd.response), + sizeof(cmd), resp_size); + ib_uverbs_init_udata(&uhw, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + resp_size, + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - resp_size); + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + cmd_ex.user_handle = cmd.user_handle; + cmd_ex.pd_handle = cmd.pd_handle; + cmd_ex.send_cq_handle = cmd.send_cq_handle; + cmd_ex.recv_cq_handle = cmd.recv_cq_handle; + cmd_ex.srq_handle = cmd.srq_handle; + cmd_ex.max_send_wr = cmd.max_send_wr; + cmd_ex.max_recv_wr = cmd.max_recv_wr; + cmd_ex.max_send_sge = cmd.max_send_sge; + cmd_ex.max_recv_sge = cmd.max_recv_sge; + cmd_ex.max_inline_data = cmd.max_inline_data; + cmd_ex.sq_sig_all = cmd.sq_sig_all; + cmd_ex.qp_type = cmd.qp_type; + cmd_ex.is_srq = cmd.is_srq; + + err = create_qp(file, &ucore, &uhw, &cmd_ex, + offsetof(typeof(cmd_ex), is_srq) + + sizeof(cmd.is_srq), ib_uverbs_create_qp_cb, + NULL); + + if (err) + return err; + + return in_len; +} + +static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file, + struct ib_uverbs_ex_create_qp_resp *resp, + struct ib_udata *ucore) +{ + if (ib_copy_to_udata(ucore, resp, resp->response_length)) + return -EFAULT; + + return 0; +} + +int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_qp_resp resp; + struct ib_uverbs_ex_create_qp cmd = {0}; + int err; + + if (ucore->inlen < (offsetof(typeof(cmd), comp_mask) + + sizeof(cmd.comp_mask))) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (err) + return err; + + if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + if (ucore->outlen < (offsetof(typeof(resp), response_length) + + sizeof(resp.response_length))) + return -ENOSPC; + + err = create_qp(file, ucore, uhw, &cmd, + min(ucore->inlen, sizeof(cmd)), + ib_uverbs_ex_create_qp_cb, NULL); + + if (err) + return err; + + return 0; +} + +ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_open_qp cmd; + struct ib_uverbs_create_qp_resp resp; + struct ib_udata udata; + struct ib_uqp_object *obj; + struct ib_xrcd *xrcd; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_qp *qp; + struct ib_qp_open_attr attr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + obj = 
(struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, + file->ucontext); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, + file->ucontext); + if (IS_ERR(xrcd_uobj)) { + ret = -EINVAL; + goto err_put; + } + + xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!xrcd) { + ret = -EINVAL; + goto err_xrcd; + } + + attr.event_handler = ib_uverbs_qp_event_handler; + attr.qp_context = file; + attr.qp_num = cmd.qpn; + attr.qp_type = cmd.qp_type; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + + qp = ib_open_qp(xrcd, &attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_xrcd; + } + + obj->uevent.uobject.object = qp; + obj->uevent.uobject.user_handle = cmd.user_handle; + + memset(&resp, 0, sizeof resp); + resp.qpn = qp->qp_num; + resp.qp_handle = obj->uevent.uobject.id; + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) { + ret = -EFAULT; + goto err_destroy; + } + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + qp->uobject = &obj->uevent.uobject; + uobj_put_read(xrcd_uobj); + + + uobj_alloc_commit(&obj->uevent.uobject); + + return in_len; + +err_destroy: + ib_destroy_qp(qp); +err_xrcd: + uobj_put_read(xrcd_uobj); +err_put: + uobj_alloc_abort(&obj->uevent.uobject); + return ret; +} + +static void copy_ah_attr_to_uverbs(struct ib_uverbs_qp_dest *uverb_attr, + struct rdma_ah_attr *rdma_attr) +{ + const struct ib_global_route *grh; + + uverb_attr->dlid = rdma_ah_get_dlid(rdma_attr); + uverb_attr->sl = rdma_ah_get_sl(rdma_attr); + uverb_attr->src_path_bits = rdma_ah_get_path_bits(rdma_attr); + uverb_attr->static_rate = rdma_ah_get_static_rate(rdma_attr); + uverb_attr->is_global = !!(rdma_ah_get_ah_flags(rdma_attr) & + IB_AH_GRH); + if (uverb_attr->is_global) { + grh = rdma_ah_read_grh(rdma_attr); + memcpy(uverb_attr->dgid, grh->dgid.raw, 16); + uverb_attr->flow_label = grh->flow_label; + uverb_attr->sgid_index = grh->sgid_index; + uverb_attr->hop_limit = grh->hop_limit; + uverb_attr->traffic_class = grh->traffic_class; + } + uverb_attr->port_num = rdma_ah_get_port_num(rdma_attr); +} + +ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_query_qp cmd; + struct ib_uverbs_query_qp_resp resp; + struct ib_qp *qp; + struct ib_qp_attr *attr; + struct ib_qp_init_attr *init_attr; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto out; + } + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + if (!qp) { + ret = -EINVAL; + goto out; + } + + ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr); + + uobj_put_obj_read(qp); + + if (ret) + goto out; + + memset(&resp, 0, sizeof resp); + + resp.qp_state = attr->qp_state; + resp.cur_qp_state = attr->cur_qp_state; + resp.path_mtu = attr->path_mtu; + resp.path_mig_state = attr->path_mig_state; + resp.qkey = attr->qkey; + resp.rq_psn = attr->rq_psn; + resp.sq_psn = attr->sq_psn; + resp.dest_qp_num = attr->dest_qp_num; + resp.qp_access_flags = attr->qp_access_flags; + resp.pkey_index = attr->pkey_index; + resp.alt_pkey_index = attr->alt_pkey_index; + resp.sq_draining = attr->sq_draining; + resp.max_rd_atomic = attr->max_rd_atomic; + resp.max_dest_rd_atomic 
= attr->max_dest_rd_atomic; + resp.min_rnr_timer = attr->min_rnr_timer; + resp.port_num = attr->port_num; + resp.timeout = attr->timeout; + resp.retry_cnt = attr->retry_cnt; + resp.rnr_retry = attr->rnr_retry; + resp.alt_port_num = attr->alt_port_num; + resp.alt_timeout = attr->alt_timeout; + + copy_ah_attr_to_uverbs(&resp.dest, &attr->ah_attr); + copy_ah_attr_to_uverbs(&resp.alt_dest, &attr->alt_ah_attr); + + resp.max_send_wr = init_attr->cap.max_send_wr; + resp.max_recv_wr = init_attr->cap.max_recv_wr; + resp.max_send_sge = init_attr->cap.max_send_sge; + resp.max_recv_sge = init_attr->cap.max_recv_sge; + resp.max_inline_data = init_attr->cap.max_inline_data; + resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR; + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) + ret = -EFAULT; + +out: + kfree(attr); + kfree(init_attr); + + return ret ? ret : in_len; +} + +/* Remove ignored fields set in the attribute mask */ +static int modify_qp_mask(enum ib_qp_type qp_type, int mask) +{ + switch (qp_type) { + case IB_QPT_XRC_INI: + return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER); + case IB_QPT_XRC_TGT: + return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY); + default: + return mask; + } +} + +static void copy_ah_attr_from_uverbs(struct ib_device *dev, + struct rdma_ah_attr *rdma_attr, + struct ib_uverbs_qp_dest *uverb_attr) +{ + rdma_attr->type = rdma_ah_find_type(dev, uverb_attr->port_num); + if (uverb_attr->is_global) { + rdma_ah_set_grh(rdma_attr, NULL, + uverb_attr->flow_label, + uverb_attr->sgid_index, + uverb_attr->hop_limit, + uverb_attr->traffic_class); + rdma_ah_set_dgid_raw(rdma_attr, uverb_attr->dgid); + } else { + rdma_ah_set_ah_flags(rdma_attr, 0); + } + rdma_ah_set_dlid(rdma_attr, uverb_attr->dlid); + rdma_ah_set_sl(rdma_attr, uverb_attr->sl); + rdma_ah_set_path_bits(rdma_attr, uverb_attr->src_path_bits); + rdma_ah_set_static_rate(rdma_attr, uverb_attr->static_rate); + rdma_ah_set_port_num(rdma_attr, uverb_attr->port_num); + rdma_ah_set_make_grd(rdma_attr, false); +} + +static int modify_qp(struct ib_uverbs_file *file, + struct ib_uverbs_ex_modify_qp *cmd, struct ib_udata *udata) +{ + struct ib_qp_attr *attr; + struct ib_qp *qp; + int ret; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, file->ucontext); + if (!qp) { + ret = -EINVAL; + goto out; + } + + if ((cmd->base.attr_mask & IB_QP_PORT) && + !rdma_is_port_valid(qp->device, cmd->base.port_num)) { + ret = -EINVAL; + goto release_qp; + } + + if ((cmd->base.attr_mask & IB_QP_AV) && + !rdma_is_port_valid(qp->device, cmd->base.dest.port_num)) { + ret = -EINVAL; + goto release_qp; + } + + if ((cmd->base.attr_mask & IB_QP_ALT_PATH) && + (!rdma_is_port_valid(qp->device, cmd->base.alt_port_num) || + !rdma_is_port_valid(qp->device, cmd->base.alt_dest.port_num))) { + ret = -EINVAL; + goto release_qp; + } + + if ((cmd->base.attr_mask & IB_QP_CUR_STATE && + cmd->base.cur_qp_state > IB_QPS_ERR) || + cmd->base.qp_state > IB_QPS_ERR) { + ret = -EINVAL; + goto release_qp; + } + + attr->qp_state = cmd->base.qp_state; + attr->cur_qp_state = cmd->base.cur_qp_state; + attr->path_mtu = cmd->base.path_mtu; + attr->path_mig_state = cmd->base.path_mig_state; + attr->qkey = cmd->base.qkey; + attr->rq_psn = cmd->base.rq_psn; + attr->sq_psn = cmd->base.sq_psn; + attr->dest_qp_num = cmd->base.dest_qp_num; + attr->qp_access_flags = cmd->base.qp_access_flags; + attr->pkey_index = 
cmd->base.pkey_index; + attr->alt_pkey_index = cmd->base.alt_pkey_index; + attr->en_sqd_async_notify = cmd->base.en_sqd_async_notify; + attr->max_rd_atomic = cmd->base.max_rd_atomic; + attr->max_dest_rd_atomic = cmd->base.max_dest_rd_atomic; + attr->min_rnr_timer = cmd->base.min_rnr_timer; + attr->port_num = cmd->base.port_num; + attr->timeout = cmd->base.timeout; + attr->retry_cnt = cmd->base.retry_cnt; + attr->rnr_retry = cmd->base.rnr_retry; + attr->alt_port_num = cmd->base.alt_port_num; + attr->alt_timeout = cmd->base.alt_timeout; + attr->rate_limit = cmd->rate_limit; + + if (cmd->base.attr_mask & IB_QP_AV) + copy_ah_attr_from_uverbs(qp->device, &attr->ah_attr, + &cmd->base.dest); + + if (cmd->base.attr_mask & IB_QP_ALT_PATH) + copy_ah_attr_from_uverbs(qp->device, &attr->alt_ah_attr, + &cmd->base.alt_dest); + + ret = ib_modify_qp_with_udata(qp, attr, + modify_qp_mask(qp->qp_type, + cmd->base.attr_mask), + udata); + +release_qp: + uobj_put_obj_read(qp); +out: + kfree(attr); + + return ret; +} + +ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_ex_modify_qp cmd = {}; + struct ib_udata udata; + int ret; + + if (copy_from_user(&cmd.base, buf, sizeof(cmd.base))) + return -EFAULT; + + if (cmd.base.attr_mask & + ~((IB_USER_LEGACY_LAST_QP_ATTR_MASK << 1) - 1)) + return -EOPNOTSUPP; + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd.base), NULL, + in_len - sizeof(cmd.base) - sizeof(struct ib_uverbs_cmd_hdr), + out_len); + + ret = modify_qp(file, &cmd, &udata); + if (ret) + return ret; + + return in_len; +} + +int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_modify_qp cmd = {}; + int ret; + + /* + * Last bit is reserved for extending the attr_mask by + * using another field. + */ + BUILD_BUG_ON(IB_USER_LAST_QP_ATTR_MASK == (1 << 31)); + + if (ucore->inlen < sizeof(cmd.base)) + return -EINVAL; + + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (ret) + return ret; + + if (cmd.base.attr_mask & + ~((IB_USER_LAST_QP_ATTR_MASK << 1) - 1)) + return -EOPNOTSUPP; + + if (ucore->inlen > sizeof(cmd)) { + if (!ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + } + + ret = modify_qp(file, &cmd, uhw); + + return ret; +} + +ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_destroy_qp cmd; + struct ib_uverbs_destroy_qp_resp resp; + struct ib_uobject *uobj; + struct ib_uqp_object *obj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + memset(&resp, 0, sizeof resp); + + uobj = uobj_get_write(UVERBS_OBJECT_QP, cmd.qp_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + obj = container_of(uobj, struct ib_uqp_object, uevent.uobject); + /* + * Make sure we don't free the memory in remove_commit as we still + * needs the uobject memory to create the response. 
+ */ + uverbs_uobject_get(uobj); + + ret = uobj_remove_commit(uobj); + if (ret) { + uverbs_uobject_put(uobj); + return ret; + } + + resp.events_reported = obj->uevent.events_reported; + uverbs_uobject_put(uobj); + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) + return -EFAULT; + + return in_len; +} + +static void *alloc_wr(size_t wr_size, __u32 num_sge) +{ + if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof (struct ib_sge))) / + sizeof (struct ib_sge)) + return NULL; + + return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) + + num_sge * sizeof (struct ib_sge), GFP_KERNEL); +} + +ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_post_send cmd; + struct ib_uverbs_post_send_resp resp; + struct ib_uverbs_send_wr *user_wr; + struct ib_send_wr *wr = NULL, *last, *next, *bad_wr; + struct ib_qp *qp; + int i, sg_ind; + int is_ud; + ssize_t ret = -EINVAL; + size_t next_size; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count + + cmd.sge_count * sizeof (struct ib_uverbs_sge)) + return -EINVAL; + + if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr)) + return -EINVAL; + + user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL); + if (!user_wr) + return -ENOMEM; + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + if (!qp) + goto out; + + is_ud = qp->qp_type == IB_QPT_UD; + sg_ind = 0; + last = NULL; + for (i = 0; i < cmd.wr_count; ++i) { + if (copy_from_user(user_wr, + buf + sizeof cmd + i * cmd.wqe_size, + cmd.wqe_size)) { + ret = -EFAULT; + goto out_put; + } + + if (user_wr->num_sge + sg_ind > cmd.sge_count) { + ret = -EINVAL; + goto out_put; + } + + if (is_ud) { + struct ib_ud_wr *ud; + + if (user_wr->opcode != IB_WR_SEND && + user_wr->opcode != IB_WR_SEND_WITH_IMM) { + ret = -EINVAL; + goto out_put; + } + + next_size = sizeof(*ud); + ud = alloc_wr(next_size, user_wr->num_sge); + if (!ud) { + ret = -ENOMEM; + goto out_put; + } + + ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH, user_wr->wr.ud.ah, + file->ucontext); + if (!ud->ah) { + kfree(ud); + ret = -EINVAL; + goto out_put; + } + ud->remote_qpn = user_wr->wr.ud.remote_qpn; + ud->remote_qkey = user_wr->wr.ud.remote_qkey; + + next = &ud->wr; + } else if (user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || + user_wr->opcode == IB_WR_RDMA_WRITE || + user_wr->opcode == IB_WR_RDMA_READ) { + struct ib_rdma_wr *rdma; + + next_size = sizeof(*rdma); + rdma = alloc_wr(next_size, user_wr->num_sge); + if (!rdma) { + ret = -ENOMEM; + goto out_put; + } + + rdma->remote_addr = user_wr->wr.rdma.remote_addr; + rdma->rkey = user_wr->wr.rdma.rkey; + + next = &rdma->wr; + } else if (user_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || + user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + struct ib_atomic_wr *atomic; + + next_size = sizeof(*atomic); + atomic = alloc_wr(next_size, user_wr->num_sge); + if (!atomic) { + ret = -ENOMEM; + goto out_put; + } + + atomic->remote_addr = user_wr->wr.atomic.remote_addr; + atomic->compare_add = user_wr->wr.atomic.compare_add; + atomic->swap = user_wr->wr.atomic.swap; + atomic->rkey = user_wr->wr.atomic.rkey; + + next = &atomic->wr; + } else if (user_wr->opcode == IB_WR_SEND || + user_wr->opcode == IB_WR_SEND_WITH_IMM || + user_wr->opcode == IB_WR_SEND_WITH_INV) { + next_size = sizeof(*next); + next = alloc_wr(next_size, user_wr->num_sge); + if (!next) { + ret = -ENOMEM; + goto out_put; + } + } else { + ret = -EINVAL; + 
goto out_put; + } + + if (user_wr->opcode == IB_WR_SEND_WITH_IMM || + user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; + } else if (user_wr->opcode == IB_WR_SEND_WITH_INV) { + next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey; + } + + if (!last) + wr = next; + else + last->next = next; + last = next; + + next->next = NULL; + next->wr_id = user_wr->wr_id; + next->num_sge = user_wr->num_sge; + next->opcode = user_wr->opcode; + next->send_flags = user_wr->send_flags; + + if (next->num_sge) { + next->sg_list = (void *) next + + ALIGN(next_size, sizeof(struct ib_sge)); + if (copy_from_user(next->sg_list, + buf + sizeof cmd + + cmd.wr_count * cmd.wqe_size + + sg_ind * sizeof (struct ib_sge), + next->num_sge * sizeof (struct ib_sge))) { + ret = -EFAULT; + goto out_put; + } + sg_ind += next->num_sge; + } else + next->sg_list = NULL; + } + + resp.bad_wr = 0; + ret = qp->device->post_send(qp->real_qp, wr, &bad_wr); + if (ret) + for (next = wr; next; next = next->next) { + ++resp.bad_wr; + if (next == bad_wr) + break; + } + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) + ret = -EFAULT; + +out_put: + uobj_put_obj_read(qp); + + while (wr) { + if (is_ud && ud_wr(wr)->ah) + uobj_put_obj_read(ud_wr(wr)->ah); + next = wr->next; + kfree(wr); + wr = next; + } + +out: + kfree(user_wr); + + return ret ? ret : in_len; +} + +static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf, + int in_len, + u32 wr_count, + u32 sge_count, + u32 wqe_size) +{ + struct ib_uverbs_recv_wr *user_wr; + struct ib_recv_wr *wr = NULL, *last, *next; + int sg_ind; + int i; + int ret; + + if (in_len < wqe_size * wr_count + + sge_count * sizeof (struct ib_uverbs_sge)) + return ERR_PTR(-EINVAL); + + if (wqe_size < sizeof (struct ib_uverbs_recv_wr)) + return ERR_PTR(-EINVAL); + + user_wr = kmalloc(wqe_size, GFP_KERNEL); + if (!user_wr) + return ERR_PTR(-ENOMEM); + + sg_ind = 0; + last = NULL; + for (i = 0; i < wr_count; ++i) { + if (copy_from_user(user_wr, buf + i * wqe_size, + wqe_size)) { + ret = -EFAULT; + goto err; + } + + if (user_wr->num_sge + sg_ind > sge_count) { + ret = -EINVAL; + goto err; + } + + if (user_wr->num_sge >= + (U32_MAX - ALIGN(sizeof *next, sizeof (struct ib_sge))) / + sizeof (struct ib_sge)) { + ret = -EINVAL; + goto err; + } + + next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + + user_wr->num_sge * sizeof (struct ib_sge), + GFP_KERNEL); + if (!next) { + ret = -ENOMEM; + goto err; + } + + if (!last) + wr = next; + else + last->next = next; + last = next; + + next->next = NULL; + next->wr_id = user_wr->wr_id; + next->num_sge = user_wr->num_sge; + + if (next->num_sge) { + next->sg_list = (void *) next + + ALIGN(sizeof *next, sizeof (struct ib_sge)); + if (copy_from_user(next->sg_list, + buf + wr_count * wqe_size + + sg_ind * sizeof (struct ib_sge), + next->num_sge * sizeof (struct ib_sge))) { + ret = -EFAULT; + goto err; + } + sg_ind += next->num_sge; + } else + next->sg_list = NULL; + } + + kfree(user_wr); + return wr; + +err: + kfree(user_wr); + + while (wr) { + next = wr->next; + kfree(wr); + wr = next; + } + + return ERR_PTR(ret); +} + +ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_post_recv cmd; + struct ib_uverbs_post_recv_resp resp; + struct ib_recv_wr *wr, *next, *bad_wr; + struct ib_qp *qp; + ssize_t ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return 
-EFAULT; + + wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd, + in_len - sizeof cmd, cmd.wr_count, + cmd.sge_count, cmd.wqe_size); + if (IS_ERR(wr)) + return PTR_ERR(wr); + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + if (!qp) + goto out; + + resp.bad_wr = 0; + ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr); + + uobj_put_obj_read(qp); + if (ret) { + for (next = wr; next; next = next->next) { + ++resp.bad_wr; + if (next == bad_wr) + break; + } + } + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) + ret = -EFAULT; + +out: + while (wr) { + next = wr->next; + kfree(wr); + wr = next; + } + + return ret ? ret : in_len; +} + +ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_post_srq_recv cmd; + struct ib_uverbs_post_srq_recv_resp resp; + struct ib_recv_wr *wr, *next, *bad_wr; + struct ib_srq *srq; + ssize_t ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd, + in_len - sizeof cmd, cmd.wr_count, + cmd.sge_count, cmd.wqe_size); + if (IS_ERR(wr)) + return PTR_ERR(wr); + + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file->ucontext); + if (!srq) + goto out; + + resp.bad_wr = 0; + ret = srq->device->post_srq_recv(srq, wr, &bad_wr); + + uobj_put_obj_read(srq); + + if (ret) + for (next = wr; next; next = next->next) { + ++resp.bad_wr; + if (next == bad_wr) + break; + } + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) + ret = -EFAULT; + +out: + while (wr) { + next = wr->next; + kfree(wr); + wr = next; + } + + return ret ? ret : in_len; +} + +ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_ah cmd; + struct ib_uverbs_create_ah_resp resp; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_ah *ah; + struct rdma_ah_attr attr; + int ret; + struct ib_udata udata; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + if (!rdma_is_port_valid(ib_dev, cmd.attr.port_num)) + return -EINVAL; + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + uobj = uobj_alloc(UVERBS_OBJECT_AH, file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err; + } + + attr.type = rdma_ah_find_type(ib_dev, cmd.attr.port_num); + rdma_ah_set_make_grd(&attr, false); + rdma_ah_set_dlid(&attr, cmd.attr.dlid); + rdma_ah_set_sl(&attr, cmd.attr.sl); + rdma_ah_set_path_bits(&attr, cmd.attr.src_path_bits); + rdma_ah_set_static_rate(&attr, cmd.attr.static_rate); + rdma_ah_set_port_num(&attr, cmd.attr.port_num); + + if (cmd.attr.is_global) { + rdma_ah_set_grh(&attr, NULL, cmd.attr.grh.flow_label, + cmd.attr.grh.sgid_index, + cmd.attr.grh.hop_limit, + cmd.attr.grh.traffic_class); + rdma_ah_set_dgid_raw(&attr, cmd.attr.grh.dgid); + } else { + rdma_ah_set_ah_flags(&attr, 0); + } + + ah = rdma_create_user_ah(pd, &attr, &udata); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto err_put; + } + + ah->uobject = uobj; + uobj->user_handle = cmd.user_handle; + uobj->object = ah; + + resp.ah_handle = uobj->id; + + if 
(copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + uobj_put_obj_read(pd); + uobj_alloc_commit(uobj); + + return in_len; + +err_copy: + rdma_destroy_ah(ah); + +err_put: + uobj_put_obj_read(pd); + +err: + uobj_alloc_abort(uobj); + return ret; +} + +ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_destroy_ah cmd; + struct ib_uobject *uobj; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = uobj_get_write(UVERBS_OBJECT_AH, cmd.ah_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + ret = uobj_remove_commit(uobj); + return ret ?: in_len; +} + +ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_attach_mcast cmd; + struct ib_qp *qp; + struct ib_uqp_object *obj; + struct ib_uverbs_mcast_entry *mcast; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + if (!qp) + return -EINVAL; + + obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); + + mutex_lock(&obj->mcast_lock); + list_for_each_entry(mcast, &obj->mcast_list, list) + if (cmd.mlid == mcast->lid && + !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { + ret = 0; + goto out_put; + } + + mcast = kmalloc(sizeof *mcast, GFP_KERNEL); + if (!mcast) { + ret = -ENOMEM; + goto out_put; + } + + mcast->lid = cmd.mlid; + memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw); + + ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid); + if (!ret) + list_add_tail(&mcast->list, &obj->mcast_list); + else + kfree(mcast); + +out_put: + mutex_unlock(&obj->mcast_lock); + uobj_put_obj_read(qp); + + return ret ? ret : in_len; +} + +ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_detach_mcast cmd; + struct ib_uqp_object *obj; + struct ib_qp *qp; + struct ib_uverbs_mcast_entry *mcast; + int ret = -EINVAL; + bool found = false; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + if (!qp) + return -EINVAL; + + obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); + mutex_lock(&obj->mcast_lock); + + list_for_each_entry(mcast, &obj->mcast_list, list) + if (cmd.mlid == mcast->lid && + !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { + list_del(&mcast->list); + kfree(mcast); + found = true; + break; + } + + if (!found) { + ret = -EINVAL; + goto out_put; + } + + ret = ib_detach_mcast(qp, (union ib_gid *)cmd.gid, cmd.mlid); + +out_put: + mutex_unlock(&obj->mcast_lock); + uobj_put_obj_read(qp); + return ret ? 
ret : in_len; +} + +struct ib_uflow_resources { + size_t max; + size_t num; + struct ib_flow_action *collection[0]; +}; + +static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) +{ + struct ib_uflow_resources *resources; + + resources = + kmalloc(sizeof(*resources) + + num_specs * sizeof(*resources->collection), GFP_KERNEL); + + if (!resources) + return NULL; + + resources->num = 0; + resources->max = num_specs; + + return resources; +} + +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) +{ + unsigned int i; + + for (i = 0; i < uflow_res->num; i++) + atomic_dec(&uflow_res->collection[i]->usecnt); + + kfree(uflow_res); +} + +static void flow_resources_add(struct ib_uflow_resources *uflow_res, + struct ib_flow_action *action) +{ + WARN_ON(uflow_res->num >= uflow_res->max); + + atomic_inc(&action->usecnt); + uflow_res->collection[uflow_res->num++] = action; +} + +static int kern_spec_to_ib_spec_action(struct ib_ucontext *ucontext, + struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec, + struct ib_uflow_resources *uflow_res) +{ + ib_spec->type = kern_spec->type; + switch (ib_spec->type) { + case IB_FLOW_SPEC_ACTION_TAG: + if (kern_spec->flow_tag.size != + sizeof(struct ib_uverbs_flow_spec_action_tag)) + return -EINVAL; + + ib_spec->flow_tag.size = sizeof(struct ib_flow_spec_action_tag); + ib_spec->flow_tag.tag_id = kern_spec->flow_tag.tag_id; + break; + case IB_FLOW_SPEC_ACTION_DROP: + if (kern_spec->drop.size != + sizeof(struct ib_uverbs_flow_spec_action_drop)) + return -EINVAL; + + ib_spec->drop.size = sizeof(struct ib_flow_spec_action_drop); + break; + case IB_FLOW_SPEC_ACTION_HANDLE: + if (kern_spec->action.size != + sizeof(struct ib_uverbs_flow_spec_action_handle)) + return -EOPNOTSUPP; + ib_spec->action.act = uobj_get_obj_read(flow_action, + UVERBS_OBJECT_FLOW_ACTION, + kern_spec->action.handle, + ucontext); + if (!ib_spec->action.act) + return -EINVAL; + ib_spec->action.size = + sizeof(struct ib_flow_spec_action_handle); + flow_resources_add(uflow_res, ib_spec->action.act); + uobj_put_obj_read(ib_spec->action.act); + break; + default: + return -EINVAL; + } + return 0; +} + +static size_t kern_spec_filter_sz(const struct ib_uverbs_flow_spec_hdr *spec) +{ + /* Returns user space filter size, includes padding */ + return (spec->size - sizeof(struct ib_uverbs_flow_spec_hdr)) / 2; +} + +static ssize_t spec_filter_size(const void *kern_spec_filter, u16 kern_filter_size, + u16 ib_real_filter_sz) +{ + /* + * User space filter structures must be 64 bit aligned, otherwise this + * may pass, but we won't handle additional new attributes. 
+ */ + + if (kern_filter_size > ib_real_filter_sz) { + if (memchr_inv(kern_spec_filter + + ib_real_filter_sz, 0, + kern_filter_size - ib_real_filter_sz)) + return -EINVAL; + return ib_real_filter_sz; + } + return kern_filter_size; +} + +int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, + const void *kern_spec_mask, + const void *kern_spec_val, + size_t kern_filter_sz, + union ib_flow_spec *ib_spec) +{ + ssize_t actual_filter_sz; + ssize_t ib_filter_sz; + + /* User flow spec size must be aligned to 4 bytes */ + if (kern_filter_sz != ALIGN(kern_filter_sz, 4)) + return -EINVAL; + + ib_spec->type = type; + + if (ib_spec->type == (IB_FLOW_SPEC_INNER | IB_FLOW_SPEC_VXLAN_TUNNEL)) + return -EINVAL; + + switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) { + case IB_FLOW_SPEC_ETH: + ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_eth); + memcpy(&ib_spec->eth.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_IPV4: + ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_ipv4); + memcpy(&ib_spec->ipv4.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_IPV6: + ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_ipv6); + memcpy(&ib_spec->ipv6.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->ipv6.mask, kern_spec_mask, actual_filter_sz); + + if ((ntohl(ib_spec->ipv6.mask.flow_label)) >= BIT(20) || + (ntohl(ib_spec->ipv6.val.flow_label)) >= BIT(20)) + return -EINVAL; + break; + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_tcp_udp); + memcpy(&ib_spec->tcp_udp.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_VXLAN_TUNNEL: + ib_filter_sz = offsetof(struct ib_flow_tunnel_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->tunnel.size = sizeof(struct ib_flow_spec_tunnel); + memcpy(&ib_spec->tunnel.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->tunnel.mask, kern_spec_mask, actual_filter_sz); + + if ((ntohl(ib_spec->tunnel.mask.tunnel_id)) >= BIT(24) || + (ntohl(ib_spec->tunnel.val.tunnel_id)) >= BIT(24)) + return -EINVAL; + break; + case IB_FLOW_SPEC_ESP: + ib_filter_sz = offsetof(struct ib_flow_esp_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->esp.size = sizeof(struct ib_flow_spec_esp); + memcpy(&ib_spec->esp.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->esp.mask, kern_spec_mask, 
actual_filter_sz); + break; + default: + return -EINVAL; + } + return 0; +} + +static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + ssize_t kern_filter_sz; + void *kern_spec_mask; + void *kern_spec_val; + + if (kern_spec->reserved) + return -EINVAL; + + kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr); + + kern_spec_val = (void *)kern_spec + + sizeof(struct ib_uverbs_flow_spec_hdr); + kern_spec_mask = kern_spec_val + kern_filter_sz; + + return ib_uverbs_kern_spec_to_ib_spec_filter(kern_spec->type, + kern_spec_mask, + kern_spec_val, + kern_filter_sz, ib_spec); +} + +static int kern_spec_to_ib_spec(struct ib_ucontext *ucontext, + struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec, + struct ib_uflow_resources *uflow_res) +{ + if (kern_spec->reserved) + return -EINVAL; + + if (kern_spec->type >= IB_FLOW_SPEC_ACTION_TAG) + return kern_spec_to_ib_spec_action(ucontext, kern_spec, ib_spec, + uflow_res); + else + return kern_spec_to_ib_spec_filter(kern_spec, ib_spec); +} + +int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_wq cmd = {}; + struct ib_uverbs_ex_create_wq_resp resp = {}; + struct ib_uwq_object *obj; + int err = 0; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_wq *wq; + struct ib_wq_init_attr wq_init_attr = {}; + size_t required_cmd_sz; + size_t required_resp_len; + + required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge); + required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn); + + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + if (ucore->outlen < required_resp_len) + return -ENOSPC; + + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (err) + return err; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, + file->ucontext); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); + if (!pd) { + err = -EINVAL; + goto err_uobj; + } + + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); + if (!cq) { + err = -EINVAL; + goto err_put_pd; + } + + wq_init_attr.cq = cq; + wq_init_attr.max_sge = cmd.max_sge; + wq_init_attr.max_wr = cmd.max_wr; + wq_init_attr.wq_context = file; + wq_init_attr.wq_type = cmd.wq_type; + wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + if (ucore->inlen >= (offsetof(typeof(cmd), create_flags) + + sizeof(cmd.create_flags))) + wq_init_attr.create_flags = cmd.create_flags; + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + + if (!pd->device->create_wq) { + err = -EOPNOTSUPP; + goto err_put_cq; + } + wq = pd->device->create_wq(pd, &wq_init_attr, uhw); + if (IS_ERR(wq)) { + err = PTR_ERR(wq); + goto err_put_cq; + } + + wq->uobject = &obj->uevent.uobject; + obj->uevent.uobject.object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->device = pd->device; + wq->wq_context = wq_init_attr.wq_context; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + wq->uobject = &obj->uevent.uobject; + obj->uevent.uobject.object = wq; + + memset(&resp, 0, sizeof(resp)); + resp.wq_handle = obj->uevent.uobject.id; + resp.max_sge = 
wq_init_attr.max_sge; + resp.max_wr = wq_init_attr.max_wr; + resp.wqn = wq->wq_num; + resp.response_length = required_resp_len; + err = ib_copy_to_udata(ucore, + &resp, resp.response_length); + if (err) + goto err_copy; + + uobj_put_obj_read(pd); + uobj_put_obj_read(cq); + uobj_alloc_commit(&obj->uevent.uobject); + return 0; + +err_copy: + ib_destroy_wq(wq); +err_put_cq: + uobj_put_obj_read(cq); +err_put_pd: + uobj_put_obj_read(pd); +err_uobj: + uobj_alloc_abort(&obj->uevent.uobject); + + return err; +} + +int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_destroy_wq cmd = {}; + struct ib_uverbs_ex_destroy_wq_resp resp = {}; + struct ib_uobject *uobj; + struct ib_uwq_object *obj; + size_t required_cmd_sz; + size_t required_resp_len; + int ret; + + required_cmd_sz = offsetof(typeof(cmd), wq_handle) + sizeof(cmd.wq_handle); + required_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + if (ucore->outlen < required_resp_len) + return -ENOSPC; + + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + resp.response_length = required_resp_len; + uobj = uobj_get_write(UVERBS_OBJECT_WQ, cmd.wq_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + obj = container_of(uobj, struct ib_uwq_object, uevent.uobject); + /* + * Make sure we don't free the memory in remove_commit as we still + * needs the uobject memory to create the response. 
+ */ + uverbs_uobject_get(uobj); + + ret = uobj_remove_commit(uobj); + resp.events_reported = obj->uevent.events_reported; + uverbs_uobject_put(uobj); + if (ret) + return ret; + + return ib_copy_to_udata(ucore, &resp, resp.response_length); +} + +int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_modify_wq cmd = {}; + struct ib_wq *wq; + struct ib_wq_attr wq_attr = {}; + size_t required_cmd_sz; + int ret; + + required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state); + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (ret) + return ret; + + if (!cmd.attr_mask) + return -EINVAL; + + if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE | IB_WQ_FLAGS)) + return -EINVAL; + + wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, file->ucontext); + if (!wq) + return -EINVAL; + + wq_attr.curr_wq_state = cmd.curr_wq_state; + wq_attr.wq_state = cmd.wq_state; + if (cmd.attr_mask & IB_WQ_FLAGS) { + wq_attr.flags = cmd.flags; + wq_attr.flags_mask = cmd.flags_mask; + } + if (!wq->device->modify_wq) { + ret = -EOPNOTSUPP; + goto out; + } + ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw); +out: + uobj_put_obj_read(wq); + return ret; +} + +int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_create_rwq_ind_table cmd = {}; + struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; + struct ib_uobject *uobj; + int err = 0; + struct ib_rwq_ind_table_init_attr init_attr = {}; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_wq **wqs = NULL; + u32 *wqs_handles = NULL; + struct ib_wq *wq = NULL; + int i, j, num_read_wqs; + u32 num_wq_handles; + u32 expected_in_size; + size_t required_cmd_sz_header; + size_t required_resp_len; + + required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size); + required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num); + + if (ucore->inlen < required_cmd_sz_header) + return -EINVAL; + + if (ucore->outlen < required_resp_len) + return -ENOSPC; + + err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header); + if (err) + return err; + + ucore->inbuf += required_cmd_sz_header; + ucore->inlen -= required_cmd_sz_header; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE) + return -EINVAL; + + num_wq_handles = 1 << cmd.log_ind_tbl_size; + expected_in_size = num_wq_handles * sizeof(__u32); + if (num_wq_handles == 1) + /* input size for wq handles is u64 aligned */ + expected_in_size += sizeof(__u32); + + if (ucore->inlen < expected_in_size) + return -EINVAL; + + if (ucore->inlen > expected_in_size && + !ib_is_udata_cleared(ucore, expected_in_size, + ucore->inlen - expected_in_size)) + return -EOPNOTSUPP; + + wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles), + GFP_KERNEL); + if (!wqs_handles) + return -ENOMEM; + + err = ib_copy_from_udata(wqs_handles, ucore, + num_wq_handles * sizeof(__u32)); + if (err) + goto err_free; + + wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL); + if (!wqs) { + err = -ENOMEM; + goto err_free; + } + + for (num_read_wqs = 0; num_read_wqs < 
num_wq_handles; + num_read_wqs++) { + wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, wqs_handles[num_read_wqs], + file->ucontext); + if (!wq) { + err = -EINVAL; + goto put_wqs; + } + + wqs[num_read_wqs] = wq; + } + + uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, file->ucontext); + if (IS_ERR(uobj)) { + err = PTR_ERR(uobj); + goto put_wqs; + } + + init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; + init_attr.ind_tbl = wqs; + + if (!ib_dev->create_rwq_ind_table) { + err = -EOPNOTSUPP; + goto err_uobj; + } + rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw); + + if (IS_ERR(rwq_ind_tbl)) { + err = PTR_ERR(rwq_ind_tbl); + goto err_uobj; + } + + rwq_ind_tbl->ind_tbl = wqs; + rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; + rwq_ind_tbl->uobject = uobj; + uobj->object = rwq_ind_tbl; + rwq_ind_tbl->device = ib_dev; + atomic_set(&rwq_ind_tbl->usecnt, 0); + + for (i = 0; i < num_wq_handles; i++) + atomic_inc(&wqs[i]->usecnt); + + resp.ind_tbl_handle = uobj->id; + resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; + resp.response_length = required_resp_len; + + err = ib_copy_to_udata(ucore, + &resp, resp.response_length); + if (err) + goto err_copy; + + kfree(wqs_handles); + + for (j = 0; j < num_read_wqs; j++) + uobj_put_obj_read(wqs[j]); + + uobj_alloc_commit(uobj); + return 0; + +err_copy: + ib_destroy_rwq_ind_table(rwq_ind_tbl); +err_uobj: + uobj_alloc_abort(uobj); +put_wqs: + for (j = 0; j < num_read_wqs; j++) + uobj_put_obj_read(wqs[j]); +err_free: + kfree(wqs_handles); + kfree(wqs); + return err; +} + +int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {}; + struct ib_uobject *uobj; + int ret; + size_t required_cmd_sz; + + required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle); + + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + uobj = uobj_get_write(UVERBS_OBJECT_RWQ_IND_TBL, cmd.ind_tbl_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + return uobj_remove_commit(uobj); +} + +int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_create_flow cmd; + struct ib_uverbs_create_flow_resp resp; + struct ib_uobject *uobj; + struct ib_uflow_object *uflow; + struct ib_flow *flow_id; + struct ib_uverbs_flow_attr *kern_flow_attr; + struct ib_flow_attr *flow_attr; + struct ib_qp *qp; + struct ib_uflow_resources *uflow_res; + int err = 0; + void *kern_spec; + void *ib_spec; + int i; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + if (ucore->outlen < sizeof(resp)) + return -ENOSPC; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + ucore->inbuf += sizeof(cmd); + ucore->inlen -= sizeof(cmd); + + if (cmd.comp_mask) + return -EINVAL; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) + return -EINVAL; + + if ((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) || + (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT))) + return -EINVAL; + + if 
(cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + return -EINVAL; + + if (cmd.flow_attr.size > ucore->inlen || + cmd.flow_attr.size > + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) + return -EINVAL; + + if (cmd.flow_attr.reserved[0] || + cmd.flow_attr.reserved[1]) + return -EINVAL; + + if (cmd.flow_attr.num_of_specs) { + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); + if (!kern_flow_attr) + return -ENOMEM; + + memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); + err = ib_copy_from_udata(kern_flow_attr + 1, ucore, + cmd.flow_attr.size); + if (err) + goto err_free_attr; + } else { + kern_flow_attr = &cmd.flow_attr; + } + + uobj = uobj_alloc(UVERBS_OBJECT_FLOW, file->ucontext); + if (IS_ERR(uobj)) { + err = PTR_ERR(uobj); + goto err_free_attr; + } + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + if (!qp) { + err = -EINVAL; + goto err_uobj; + } + + flow_attr = kzalloc(sizeof(*flow_attr) + cmd.flow_attr.num_of_specs * + sizeof(union ib_flow_spec), GFP_KERNEL); + if (!flow_attr) { + err = -ENOMEM; + goto err_put; + } + uflow_res = flow_resources_alloc(cmd.flow_attr.num_of_specs); + if (!uflow_res) { + err = -ENOMEM; + goto err_free_flow_attr; + } + + flow_attr->type = kern_flow_attr->type; + flow_attr->priority = kern_flow_attr->priority; + flow_attr->num_of_specs = kern_flow_attr->num_of_specs; + flow_attr->port = kern_flow_attr->port; + flow_attr->flags = kern_flow_attr->flags; + flow_attr->size = sizeof(*flow_attr); + + kern_spec = kern_flow_attr + 1; + ib_spec = flow_attr + 1; + for (i = 0; i < flow_attr->num_of_specs && + cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && + cmd.flow_attr.size >= + ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { + err = kern_spec_to_ib_spec(file->ucontext, kern_spec, ib_spec, + uflow_res); + if (err) + goto err_free; + flow_attr->size += + ((union ib_flow_spec *) ib_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size; + ib_spec += ((union ib_flow_spec *) ib_spec)->size; + } + if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { + pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + i, cmd.flow_attr.size); + err = -EINVAL; + goto err_free; + } + flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); + if (IS_ERR(flow_id)) { + err = PTR_ERR(flow_id); + goto err_free; + } + flow_id->uobject = uobj; + uobj->object = flow_id; + uflow = container_of(uobj, typeof(*uflow), uobject); + uflow->resources = uflow_res; + + memset(&resp, 0, sizeof(resp)); + resp.flow_handle = uobj->id; + + err = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); + if (err) + goto err_copy; + + uobj_put_obj_read(qp); + uobj_alloc_commit(uobj); + kfree(flow_attr); + if (cmd.flow_attr.num_of_specs) + kfree(kern_flow_attr); + return 0; +err_copy: + ib_destroy_flow(flow_id); +err_free: + ib_uverbs_flow_resources_free(uflow_res); +err_free_flow_attr: + kfree(flow_attr); +err_put: + uobj_put_obj_read(qp); +err_uobj: + uobj_alloc_abort(uobj); +err_free_attr: + if (cmd.flow_attr.num_of_specs) + kfree(kern_flow_attr); + return err; +} + +int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_destroy_flow cmd; + struct ib_uobject *uobj; + int ret; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + ret = 
ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EINVAL; + + uobj = uobj_get_write(UVERBS_OBJECT_FLOW, cmd.flow_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + ret = uobj_remove_commit(uobj); + return ret; +} + +static int __uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_uverbs_create_xsrq *cmd, + struct ib_udata *udata) +{ + struct ib_uverbs_create_srq_resp resp; + struct ib_usrq_object *obj; + struct ib_pd *pd; + struct ib_srq *srq; + struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_srq_init_attr attr; + int ret; + + obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, + file->ucontext); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + if (cmd->srq_type == IB_SRQT_TM) + attr.ext.tag_matching.max_num_tags = cmd->max_num_tags; + + if (cmd->srq_type == IB_SRQT_XRC) { + xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->xrcd_handle, + file->ucontext); + if (IS_ERR(xrcd_uobj)) { + ret = -EINVAL; + goto err; + } + + attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!attr.ext.xrc.xrcd) { + ret = -EINVAL; + goto err_put_xrcd; + } + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + } + + if (ib_srq_has_cq(cmd->srq_type)) { + attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->cq_handle, + file->ucontext); + if (!attr.ext.cq) { + ret = -EINVAL; + goto err_put_xrcd; + } + } + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_put_cq; + } + + attr.event_handler = ib_uverbs_srq_event_handler; + attr.srq_context = file; + attr.srq_type = cmd->srq_type; + attr.attr.max_wr = cmd->max_wr; + attr.attr.max_sge = cmd->max_sge; + attr.attr.srq_limit = cmd->srq_limit; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + + srq = pd->device->create_srq(pd, &attr, udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err_put; + } + + srq->device = pd->device; + srq->pd = pd; + srq->srq_type = cmd->srq_type; + srq->uobject = &obj->uevent.uobject; + srq->event_handler = attr.event_handler; + srq->srq_context = attr.srq_context; + + if (ib_srq_has_cq(cmd->srq_type)) { + srq->ext.cq = attr.ext.cq; + atomic_inc(&attr.ext.cq->usecnt); + } + + if (cmd->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; + atomic_inc(&attr.ext.xrc.xrcd->usecnt); + } + + atomic_inc(&pd->usecnt); + atomic_set(&srq->usecnt, 0); + + obj->uevent.uobject.object = srq; + obj->uevent.uobject.user_handle = cmd->user_handle; + + memset(&resp, 0, sizeof resp); + resp.srq_handle = obj->uevent.uobject.id; + resp.max_wr = attr.attr.max_wr; + resp.max_sge = attr.attr.max_sge; + if (cmd->srq_type == IB_SRQT_XRC) + resp.srqn = srq->ext.xrc.srq_num; + + if (copy_to_user(u64_to_user_ptr(cmd->response), + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + if (cmd->srq_type == IB_SRQT_XRC) + uobj_put_read(xrcd_uobj); + + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); + + uobj_put_obj_read(pd); + uobj_alloc_commit(&obj->uevent.uobject); + + return 0; + +err_copy: + ib_destroy_srq(srq); + +err_put: + uobj_put_obj_read(pd); + +err_put_cq: + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); + +err_put_xrcd: + if (cmd->srq_type == IB_SRQT_XRC) { + atomic_dec(&obj->uxrcd->refcnt); + uobj_put_read(xrcd_uobj); + } + +err: + uobj_alloc_abort(&obj->uevent.uobject); + return ret; +} 
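/*
 * Illustrative user-space sketch (editor's addition, not part of the patch):
 * roughly how the SRQ-creation path handled by __uverbs_create_xsrq() above
 * is normally reached from user space.  libibverbs' ibv_create_srq() marshals
 * the attributes into an IB_USER_VERBS_CMD_CREATE_SRQ write() on the uverbs
 * character device, which ib_uverbs_create_srq() below repackages as the xcmd
 * passed to __uverbs_create_xsrq().  A minimal sketch assuming libibverbs and
 * at least one RDMA device; the attribute values are arbitrary examples.
 */
#include <stdio.h>
#include <stdlib.h>
#include <infiniband/verbs.h>

int main(void)
{
	struct ibv_device **dev_list;
	struct ibv_context *ctx;
	struct ibv_pd *pd;
	struct ibv_srq_init_attr srq_attr = {
		.attr = {
			.max_wr	   = 128,	/* becomes cmd.max_wr */
			.max_sge   = 1,		/* becomes cmd.max_sge */
			.srq_limit = 0,		/* becomes cmd.srq_limit */
		},
	};
	struct ibv_srq *srq;

	dev_list = ibv_get_device_list(NULL);
	if (!dev_list || !dev_list[0]) {
		fprintf(stderr, "no RDMA devices found\n");
		return EXIT_FAILURE;
	}

	ctx = ibv_open_device(dev_list[0]);
	if (!ctx)
		return EXIT_FAILURE;

	pd = ibv_alloc_pd(ctx);		/* referenced via cmd.pd_handle in the kernel */
	if (!pd)
		return EXIT_FAILURE;

	/* basic SRQ, i.e. the IB_SRQT_BASIC branch of __uverbs_create_xsrq() */
	srq = ibv_create_srq(pd, &srq_attr);
	if (!srq) {
		perror("ibv_create_srq");
		return EXIT_FAILURE;
	}

	/* the provider may overwrite max_wr/max_sge with the values it granted */
	printf("SRQ created: max_wr=%u max_sge=%u\n",
	       srq_attr.attr.max_wr, srq_attr.attr.max_sge);

	ibv_destroy_srq(srq);
	ibv_dealloc_pd(pd);
	ibv_close_device(ctx);
	ibv_free_device_list(dev_list);
	return EXIT_SUCCESS;
}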
+ +ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_srq cmd; + struct ib_uverbs_create_xsrq xcmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + memset(&xcmd, 0, sizeof(xcmd)); + xcmd.response = cmd.response; + xcmd.user_handle = cmd.user_handle; + xcmd.srq_type = IB_SRQT_BASIC; + xcmd.pd_handle = cmd.pd_handle; + xcmd.max_wr = cmd.max_wr; + xcmd.max_sge = cmd.max_sge; + xcmd.srq_limit = cmd.srq_limit; + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); + if (ret) + return ret; + + return in_len; +} + +ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, int out_len) +{ + struct ib_uverbs_create_xsrq cmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ib_uverbs_init_udata(&udata, buf + sizeof(cmd), + u64_to_user_ptr(cmd.response) + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); + if (ret) + return ret; + + return in_len; +} + +ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_modify_srq cmd; + struct ib_udata udata; + struct ib_srq *srq; + struct ib_srq_attr attr; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ib_uverbs_init_udata(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, + out_len); + + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file->ucontext); + if (!srq) + return -EINVAL; + + attr.max_wr = cmd.max_wr; + attr.srq_limit = cmd.srq_limit; + + ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata); + + uobj_put_obj_read(srq); + + return ret ? 
ret : in_len; +} + +ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_query_srq cmd; + struct ib_uverbs_query_srq_resp resp; + struct ib_srq_attr attr; + struct ib_srq *srq; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file->ucontext); + if (!srq) + return -EINVAL; + + ret = ib_query_srq(srq, &attr); + + uobj_put_obj_read(srq); + + if (ret) + return ret; + + memset(&resp, 0, sizeof resp); + + resp.max_wr = attr.max_wr; + resp.max_sge = attr.max_sge; + resp.srq_limit = attr.srq_limit; + + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) + return -EFAULT; + + return in_len; +} + +ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_destroy_srq cmd; + struct ib_uverbs_destroy_srq_resp resp; + struct ib_uobject *uobj; + struct ib_uevent_object *obj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = uobj_get_write(UVERBS_OBJECT_SRQ, cmd.srq_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + obj = container_of(uobj, struct ib_uevent_object, uobject); + /* + * Make sure we don't free the memory in remove_commit as we still + * needs the uobject memory to create the response. + */ + uverbs_uobject_get(uobj); + + memset(&resp, 0, sizeof(resp)); + + ret = uobj_remove_commit(uobj); + if (ret) { + uverbs_uobject_put(uobj); + return ret; + } + resp.events_reported = obj->events_reported; + uverbs_uobject_put(uobj); + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) + return -EFAULT; + + return in_len; +} + +int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_query_device_resp resp = { {0} }; + struct ib_uverbs_ex_query_device cmd; + struct ib_device_attr attr = {0}; + int err; + + if (!ib_dev->query_device) + return -EOPNOTSUPP; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + resp.response_length = offsetof(typeof(resp), odp_caps); + + if (ucore->outlen < resp.response_length) + return -ENOSPC; + + err = ib_dev->query_device(ib_dev, &attr, uhw); + if (err) + return err; + + copy_query_dev_fields(file, ib_dev, &resp.base, &attr); + + if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps)) + goto end; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + resp.odp_caps.general_caps = attr.odp_caps.general_caps; + resp.odp_caps.per_transport_caps.rc_odp_caps = + attr.odp_caps.per_transport_caps.rc_odp_caps; + resp.odp_caps.per_transport_caps.uc_odp_caps = + attr.odp_caps.per_transport_caps.uc_odp_caps; + resp.odp_caps.per_transport_caps.ud_odp_caps = + attr.odp_caps.per_transport_caps.ud_odp_caps; +#endif + resp.response_length += sizeof(resp.odp_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask)) + goto end; + + resp.timestamp_mask = attr.timestamp_mask; + resp.response_length += sizeof(resp.timestamp_mask); + + if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock)) + goto end; + + resp.hca_core_clock = 
attr.hca_core_clock; + resp.response_length += sizeof(resp.hca_core_clock); + + if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex)) + goto end; + + resp.device_cap_flags_ex = attr.device_cap_flags; + resp.response_length += sizeof(resp.device_cap_flags_ex); + + if (ucore->outlen < resp.response_length + sizeof(resp.rss_caps)) + goto end; + + resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts; + resp.rss_caps.max_rwq_indirection_tables = + attr.rss_caps.max_rwq_indirection_tables; + resp.rss_caps.max_rwq_indirection_table_size = + attr.rss_caps.max_rwq_indirection_table_size; + + resp.response_length += sizeof(resp.rss_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.max_wq_type_rq)) + goto end; + + resp.max_wq_type_rq = attr.max_wq_type_rq; + resp.response_length += sizeof(resp.max_wq_type_rq); + + if (ucore->outlen < resp.response_length + sizeof(resp.raw_packet_caps)) + goto end; + + resp.raw_packet_caps = attr.raw_packet_caps; + resp.response_length += sizeof(resp.raw_packet_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.tm_caps)) + goto end; + + resp.tm_caps.max_rndv_hdr_size = attr.tm_caps.max_rndv_hdr_size; + resp.tm_caps.max_num_tags = attr.tm_caps.max_num_tags; + resp.tm_caps.max_ops = attr.tm_caps.max_ops; + resp.tm_caps.max_sge = attr.tm_caps.max_sge; + resp.tm_caps.flags = attr.tm_caps.flags; + resp.response_length += sizeof(resp.tm_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.cq_moderation_caps)) + goto end; + + resp.cq_moderation_caps.max_cq_moderation_count = + attr.cq_caps.max_cq_moderation_count; + resp.cq_moderation_caps.max_cq_moderation_period = + attr.cq_caps.max_cq_moderation_period; + resp.response_length += sizeof(resp.cq_moderation_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.max_dm_size)) + goto end; + + resp.max_dm_size = attr.max_dm_size; + resp.response_length += sizeof(resp.max_dm_size); +end: + err = ib_copy_to_udata(ucore, &resp, resp.response_length); + return err; +} + +int ib_uverbs_ex_modify_cq(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_modify_cq cmd = {}; + struct ib_cq *cq; + size_t required_cmd_sz; + int ret; + + required_cmd_sz = offsetof(typeof(cmd), reserved) + + sizeof(cmd.reserved); + if (ucore->inlen < required_cmd_sz) + return -EINVAL; + + /* sanity checks */ + if (ucore->inlen > sizeof(cmd) && + !ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) + return -EOPNOTSUPP; + + ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + if (ret) + return ret; + + if (!cmd.attr_mask || cmd.reserved) + return -EINVAL; + + if (cmd.attr_mask > IB_CQ_MODERATE) + return -EOPNOTSUPP; + + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); + if (!cq) + return -EINVAL; + + ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period); + + uobj_put_obj_read(cq); + + return ret; +} diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c new file mode 100644 index 0000000..8d32c4a --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "rdma_core.h" +#include "uverbs.h" + +static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, + u16 len) +{ + if (uattr->len > sizeof(((struct ib_uverbs_attr *)0)->data)) + return ib_is_buffer_cleared(u64_to_user_ptr(uattr->data) + len, + uattr->len - len); + + return !memchr_inv((const void *)&uattr->data + len, + 0, uattr->len - len); +} + +static int uverbs_process_attr(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattr, + u16 attr_id, + const struct uverbs_attr_spec_hash *attr_spec_bucket, + struct uverbs_attr_bundle_hash *attr_bundle_h, + struct ib_uverbs_attr __user *uattr_ptr) +{ + const struct uverbs_attr_spec *spec; + const struct uverbs_attr_spec *val_spec; + struct uverbs_attr *e; + const struct uverbs_object_spec *object; + struct uverbs_obj_attr *o_attr; + struct uverbs_attr *elements = attr_bundle_h->attrs; + + if (attr_id >= attr_spec_bucket->num_attrs) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) + return -EINVAL; + else + return 0; + } + + if (test_bit(attr_id, attr_bundle_h->valid_bitmap)) + return -EINVAL; + + spec = &attr_spec_bucket->attrs[attr_id]; + val_spec = spec; + e = &elements[attr_id]; + e->uattr = uattr_ptr; + + switch (spec->type) { + case UVERBS_ATTR_TYPE_ENUM_IN: + if (uattr->attr_data.enum_data.elem_id >= spec->enum_def.num_elems) + return -EOPNOTSUPP; + + if (uattr->attr_data.enum_data.reserved) + return -EINVAL; + + val_spec = &spec->enum_def.ids[uattr->attr_data.enum_data.elem_id]; + + /* Currently we only support PTR_IN based enums */ + if (val_spec->type != UVERBS_ATTR_TYPE_PTR_IN) + return -EOPNOTSUPP; + + e->ptr_attr.enum_id = uattr->attr_data.enum_data.elem_id; + /* fall through */ + case UVERBS_ATTR_TYPE_PTR_IN: + /* Ensure that any data provided by userspace beyond the known + * struct is zero. Userspace that knows how to use some future + * longer struct will fail here if used with an old kernel and + * non-zero content, making ABI compat/discovery simpler. 
+ */ + if (uattr->len > val_spec->ptr.len && + val_spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO && + !uverbs_is_attr_cleared(uattr, val_spec->ptr.len)) + return -EOPNOTSUPP; + + /* fall through */ + case UVERBS_ATTR_TYPE_PTR_OUT: + if (uattr->len < val_spec->ptr.min_len || + (!(val_spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO) && + uattr->len > val_spec->ptr.len)) + return -EINVAL; + + if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN && + uattr->attr_data.reserved) + return -EINVAL; + + e->ptr_attr.data = uattr->data; + e->ptr_attr.len = uattr->len; + e->ptr_attr.flags = uattr->flags; + break; + + case UVERBS_ATTR_TYPE_IDR: + if (uattr->data >> 32) + return -EINVAL; + /* fall through */ + case UVERBS_ATTR_TYPE_FD: + if (uattr->attr_data.reserved) + return -EINVAL; + + if (uattr->len != 0 || !ucontext || uattr->data > INT_MAX) + return -EINVAL; + + o_attr = &e->obj_attr; + object = uverbs_get_object(ibdev, spec->obj.obj_type); + if (!object) + return -EINVAL; + o_attr->type = object->type_attrs; + + o_attr->id = (int)uattr->data; + o_attr->uobject = uverbs_get_uobject_from_context( + o_attr->type, + ucontext, + spec->obj.access, + o_attr->id); + + if (IS_ERR(o_attr->uobject)) + return PTR_ERR(o_attr->uobject); + + if (spec->obj.access == UVERBS_ACCESS_NEW) { + u64 id = o_attr->uobject->id; + + /* Copy the allocated id to the user-space */ + if (put_user(id, &e->uattr->data)) { + uverbs_finalize_object(o_attr->uobject, + UVERBS_ACCESS_NEW, + false); + return -EFAULT; + } + } + + break; + default: + return -EOPNOTSUPP; + } + + set_bit(attr_id, attr_bundle_h->valid_bitmap); + return 0; +} + +static int uverbs_uattrs_process(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + const struct uverbs_method_spec *method, + struct uverbs_attr_bundle *attr_bundle, + struct ib_uverbs_attr __user *uattr_ptr) +{ + size_t i; + int ret = 0; + int num_given_buckets = 0; + + for (i = 0; i < num_uattrs; i++) { + const struct ib_uverbs_attr *uattr = &uattrs[i]; + u16 attr_id = uattr->attr_id; + struct uverbs_attr_spec_hash *attr_spec_bucket; + + ret = uverbs_ns_idx(&attr_id, method->num_buckets); + if (ret < 0) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + continue; + } + + /* + * ret is the found ns, so increase num_given_buckets if + * necessary. 
+ */ + if (ret >= num_given_buckets) + num_given_buckets = ret + 1; + + attr_spec_bucket = method->attr_buckets[ret]; + ret = uverbs_process_attr(ibdev, ucontext, uattr, attr_id, + attr_spec_bucket, &attr_bundle->hash[ret], + uattr_ptr++); + if (ret) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + } + + return num_given_buckets; +} + +static int uverbs_validate_kernel_mandatory(const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + unsigned int i; + + for (i = 0; i < attr_bundle->num_buckets; i++) { + struct uverbs_attr_spec_hash *attr_spec_bucket = + method_spec->attr_buckets[i]; + + if (!bitmap_subset(attr_spec_bucket->mandatory_attrs_bitmask, + attr_bundle->hash[i].valid_bitmap, + attr_spec_bucket->num_attrs)) + return -EINVAL; + } + + for (; i < method_spec->num_buckets; i++) { + struct uverbs_attr_spec_hash *attr_spec_bucket = + method_spec->attr_buckets[i]; + + if (!bitmap_empty(attr_spec_bucket->mandatory_attrs_bitmask, + attr_spec_bucket->num_attrs)) + return -EINVAL; + } + + return 0; +} + +static int uverbs_handle_method(struct ib_uverbs_attr __user *uattr_ptr, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + struct ib_device *ibdev, + struct ib_uverbs_file *ufile, + const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + int ret; + int finalize_ret; + int num_given_buckets; + + num_given_buckets = uverbs_uattrs_process(ibdev, ufile->ucontext, uattrs, + num_uattrs, method_spec, + attr_bundle, uattr_ptr); + if (num_given_buckets <= 0) + return -EINVAL; + + attr_bundle->num_buckets = num_given_buckets; + ret = uverbs_validate_kernel_mandatory(method_spec, attr_bundle); + if (ret) + goto cleanup; + + ret = method_spec->handler(ibdev, ufile, attr_bundle); +cleanup: + finalize_ret = uverbs_finalize_objects(attr_bundle, + method_spec->attr_buckets, + attr_bundle->num_buckets, + !ret); + + return ret ? 
ret : finalize_ret; +} + +#define UVERBS_OPTIMIZE_USING_STACK_SZ 256 +static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct ib_uverbs_ioctl_hdr *hdr, + void __user *buf) +{ + const struct uverbs_object_spec *object_spec; + const struct uverbs_method_spec *method_spec; + long err = 0; + unsigned int i; + struct { + struct ib_uverbs_attr *uattrs; + struct uverbs_attr_bundle *uverbs_attr_bundle; + } *ctx = NULL; + struct uverbs_attr *curr_attr; + unsigned long *curr_bitmap; + size_t ctx_size; + uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; + + if (hdr->driver_id != ib_dev->driver_id) + return -EINVAL; + + object_spec = uverbs_get_object(ib_dev, hdr->object_id); + if (!object_spec) + return -EPROTONOSUPPORT; + + method_spec = uverbs_get_method(object_spec, hdr->method_id); + if (!method_spec) + return -EPROTONOSUPPORT; + + if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext) + return -EINVAL; + + ctx_size = sizeof(*ctx) + + sizeof(struct uverbs_attr_bundle) + + sizeof(struct uverbs_attr_bundle_hash) * method_spec->num_buckets + + sizeof(*ctx->uattrs) * hdr->num_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].attrs) * + method_spec->num_child_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].valid_bitmap) * + (method_spec->num_child_attrs / BITS_PER_LONG + + method_spec->num_buckets); + + if (ctx_size <= UVERBS_OPTIMIZE_USING_STACK_SZ) + ctx = (void *)data; + if (!ctx) + ctx = kmalloc(ctx_size, GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->uverbs_attr_bundle = (void *)ctx + sizeof(*ctx); + ctx->uattrs = (void *)(ctx->uverbs_attr_bundle + 1) + + (sizeof(ctx->uverbs_attr_bundle->hash[0]) * + method_spec->num_buckets); + curr_attr = (void *)(ctx->uattrs + hdr->num_attrs); + curr_bitmap = (void *)(curr_attr + method_spec->num_child_attrs); + + /* + * We just fill the pointers and num_attrs here. The data itself will be + * filled at a later stage (uverbs_process_attr) + */ + for (i = 0; i < method_spec->num_buckets; i++) { + unsigned int curr_num_attrs = method_spec->attr_buckets[i]->num_attrs; + + ctx->uverbs_attr_bundle->hash[i].attrs = curr_attr; + curr_attr += curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].num_attrs = curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].valid_bitmap = curr_bitmap; + bitmap_zero(curr_bitmap, curr_num_attrs); + curr_bitmap += BITS_TO_LONGS(curr_num_attrs); + } + + err = copy_from_user(ctx->uattrs, buf, + sizeof(*ctx->uattrs) * hdr->num_attrs); + if (err) { + err = -EFAULT; + goto out; + } + + err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev, + file, method_spec, ctx->uverbs_attr_bundle); + + /* + * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can + * not invoke the method because the request is not supported. No + * other cases should return this code. 
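+ * A handler that leaked -EPROTONOSUPPORT would be indistinguishable from
+ * "object or method not supported", so such a leak is warned about and
+ * rewritten to -EINVAL below.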
+ */ + if (unlikely(err == -EPROTONOSUPPORT)) { + WARN_ON_ONCE(err == -EPROTONOSUPPORT); + err = -EINVAL; + } +out: + if (ctx != (void *)data) + kfree(ctx); + return err; +} + +#define IB_UVERBS_MAX_CMD_SZ 4096 + +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_ioctl_hdr __user *user_hdr = + (struct ib_uverbs_ioctl_hdr __user *)arg; + struct ib_uverbs_ioctl_hdr hdr; + struct ib_device *ib_dev; + int srcu_key; + long err; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + err = -EIO; + goto out; + } + + if (cmd == RDMA_VERBS_IOCTL) { + err = copy_from_user(&hdr, user_hdr, sizeof(hdr)); + + if (err || hdr.length > IB_UVERBS_MAX_CMD_SZ || + hdr.length != sizeof(hdr) + hdr.num_attrs * sizeof(struct ib_uverbs_attr)) { + err = -EINVAL; + goto out; + } + + if (hdr.reserved1 || hdr.reserved2) { + err = -EPROTONOSUPPORT; + goto out; + } + + err = ib_uverbs_cmd_verbs(ib_dev, file, &hdr, + (__user void *)arg + sizeof(hdr)); + } else { + err = -ENOIOCTLCMD; + } +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + + return err; +} diff --git a/drivers/infiniband/core/uverbs_ioctl_merge.c b/drivers/infiniband/core/uverbs_ioctl_merge.c new file mode 100644 index 0000000..0f88a19 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl_merge.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "uverbs.h" + +#define UVERBS_NUM_NS (UVERBS_ID_NS_MASK >> UVERBS_ID_NS_SHIFT) +#define GET_NS_ID(idx) (((idx) & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT) +#define GET_ID(idx) ((idx) & ~UVERBS_ID_NS_MASK) + +#define _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset, \ + buckets_offset) \ + for (tmpj = 0, \ + elem = (*(const void ***)((hashes)[tmpi] + \ + (buckets_offset)))[0]; \ + tmpj < *(size_t *)((hashes)[tmpi] + (num_buckets_offset)); \ + tmpj++) \ + if ((elem = ((*(const void ***)(hashes[tmpi] + \ + (buckets_offset)))[tmpj]))) + +/* + * Iterate all elements of a few @hashes. The number of given hashes is + * indicated by @num_hashes. The offset of the number of buckets in the hash is + * represented by @num_buckets_offset, while the offset of the buckets array in + * the hash structure is represented by @buckets_offset. tmpi and tmpj are two + * short (or int) based indices that are given by the user. tmpi iterates over + * the different hashes. @elem points the current element in the hashes[tmpi] + * bucket we are looping on. To be honest, @hashes representation isn't exactly + * a hash, but more a collection of elements. These elements' ids are treated + * in a hash like manner, where the first upper bits are the bucket number. + * These elements are later mapped into a perfect-hash. + */ +#define for_each_element(elem, tmpi, tmpj, hashes, num_hashes, \ + num_buckets_offset, buckets_offset) \ + for (tmpi = 0; tmpi < (num_hashes); tmpi++) \ + _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset,\ + buckets_offset) + +#define get_elements_iterators_entry_above(iters, num_elements, elements, \ + num_objects_fld, objects_fld, bucket,\ + min_id) \ + get_elements_above_id((const void **)iters, num_elements, \ + (const void **)(elements), \ + offsetof(typeof(**elements), \ + num_objects_fld), \ + offsetof(typeof(**elements), objects_fld),\ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket, min_id) + +#define get_objects_above_id(iters, num_trees, trees, bucket, min_id) \ + get_elements_iterators_entry_above(iters, num_trees, trees, \ + num_objects, objects, bucket, min_id) + +#define get_methods_above_id(method_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(method_iters, num_iters, iters, \ + num_methods, methods, bucket, min_id) + +#define get_attrs_above_id(attrs_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(attrs_iters, num_iters, iters, \ + num_attrs, attrs, bucket, min_id) + +/* + * get_elements_above_id get a few hashes represented by @elements and + * @num_elements. The hashes fields are described by @num_offset, @data_offset + * and @id_offset in the same way as required by for_each_element. The function + * returns an array of @iters, represents an array of elements in the hashes + * buckets, which their ids are the smallest ids in all hashes but are all + * larger than the id given by min_id. Elements are only added to the iters + * array if their id belongs to the bucket @bucket. The number of elements in + * the returned array is returned by the function. @min_id is also updated to + * reflect the new min_id of all elements in iters. 
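+ * For example, given two hashes whose elements in bucket 0 carry the ids
+ * {0x1, 0x3} and {0x3, 0x5}, and *min_id == 0x2 on entry, the function
+ * stores iterators for both 0x3 elements, updates *min_id to 0x3 and
+ * returns 2.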
+ */ +static size_t get_elements_above_id(const void **iters, + unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket, + short *min_id) +{ + size_t num_iters = 0; + short min = SHRT_MAX; + const void *elem; + int i, j, last_stored = -1; + unsigned int equal_min = 0; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) != bucket) + continue; + + if (GET_ID(id) < *min_id || + (min != SHRT_MAX && GET_ID(id) > min)) + continue; + + /* + * We first iterate all hashes represented by @elements. When + * we do, we try to find an element @elem in the bucket @bucket + * which its id is min. Since we can't ensure the user sorted + * the elements in increasing order, we override this hash's + * minimal id element we found, if a new element with a smaller + * id was just found. + */ + iters[last_stored == i ? num_iters - 1 : num_iters++] = elem; + last_stored = i; + if (min == GET_ID(id)) + equal_min++; + else + equal_min = 1; + min = GET_ID(id); + } + + /* + * We only insert to our iters array an element, if its id is smaller + * than all previous ids. Therefore, the final iters array is sorted so + * that smaller ids are in the end of the array. + * Therefore, we need to clean the beginning of the array to make sure + * all ids of final elements are equal to min. + */ + memmove(iters, iters + num_iters - equal_min, sizeof(*iters) * equal_min); + + *min_id = min; + return equal_min; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +static short find_max_element_ns_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset) +{ + short max_ns = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) > max_ns) + max_ns = GET_NS_ID(id); + } + + return max_ns; +} + +static short find_max_element_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket) +{ + short max_id = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) == bucket && + GET_ID(id) > max_id) + max_id = GET_ID(id); + } + return max_id; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +#define find_max_element_ns_entry_id(num_elements, elements, \ + num_objects_fld, objects_fld) \ + find_max_element_ns_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld),\ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id)) + +/* + * find_max_xxxx_ns_id gets a few elements. 
Each element is described by an id + * which its upper bits represents a namespace. It finds the max namespace. This + * could be used in order to know how many buckets do we need to allocate. If no + * elements exist, SHRT_MIN is returned. Namespace represents here different + * buckets. The common example is "common bucket" and "driver bucket". + * + * find_max_xxxx_id gets a few elements and a bucket. Each element is described + * by an id which its upper bits represent a namespace. It returns the max id + * which is contained in the same namespace defined in @bucket. This could be + * used in order to know how many elements do we need to allocate in the bucket. + * If no elements exist, SHRT_MIN is returned. + */ + +#define find_max_object_id(num_trees, trees, bucket) \ + find_max_element_entry_id(num_trees, trees, num_objects,\ + objects, bucket) +#define find_max_object_ns_id(num_trees, trees) \ + find_max_element_ns_entry_id(num_trees, trees, \ + num_objects, objects) + +#define find_max_method_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_methods,\ + methods, bucket) +#define find_max_method_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_methods, methods) + +#define find_max_attr_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_attrs, \ + attrs, bucket) +#define find_max_attr_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_attrs, attrs) + +static void free_method(struct uverbs_method_spec *method) +{ + unsigned int i; + + if (!method) + return; + + for (i = 0; i < method->num_buckets; i++) + kfree(method->attr_buckets[i]); + + kfree(method); +} + +#define IS_ATTR_OBJECT(attr) ((attr)->type == UVERBS_ATTR_TYPE_IDR || \ + (attr)->type == UVERBS_ATTR_TYPE_FD) + +/* + * This function gets array of size @num_method_defs which contains pointers to + * method definitions @method_defs. The function allocates an + * uverbs_method_spec structure and initializes its number of buckets and the + * elements in buckets to the correct attributes. While doing that, it + * validates that there aren't conflicts between attributes of different + * method_defs. 
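+ * For example, when both the common spec tree and a driver tree define the
+ * same method, their attribute definitions are merged here into a single
+ * set of attribute buckets; two definitions of the same attribute id are
+ * rejected with -EEXIST further down.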
+ */ +static struct uverbs_method_spec *build_method_with_attrs(const struct uverbs_method_def **method_defs, + size_t num_method_defs) +{ + int bucket_idx; + int max_attr_buckets = 0; + size_t num_attr_buckets = 0; + int res = 0; + struct uverbs_method_spec *method = NULL; + const struct uverbs_attr_def **attr_defs; + unsigned int num_of_singularities = 0; + + max_attr_buckets = find_max_attr_ns_id(num_method_defs, method_defs); + if (max_attr_buckets >= 0) + num_attr_buckets = max_attr_buckets + 1; + + method = kzalloc(sizeof(*method) + + num_attr_buckets * sizeof(*method->attr_buckets), + GFP_KERNEL); + if (!method) + return ERR_PTR(-ENOMEM); + + method->num_buckets = num_attr_buckets; + attr_defs = kcalloc(num_method_defs, sizeof(*attr_defs), GFP_KERNEL); + if (!attr_defs) { + res = -ENOMEM; + goto free_method; + } + for (bucket_idx = 0; bucket_idx < method->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int attr_max_bucket = 0; + struct uverbs_attr_spec_hash *hash = NULL; + + attr_max_bucket = find_max_attr_id(num_method_defs, method_defs, + bucket_idx); + if (attr_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + ALIGN(sizeof(*hash->attrs) * (attr_max_bucket + 1), + sizeof(long)) + + BITS_TO_LONGS(attr_max_bucket + 1) * sizeof(long), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_attrs = attr_max_bucket + 1; + method->num_child_attrs += hash->num_attrs; + hash->mandatory_attrs_bitmask = (void *)(hash + 1) + + ALIGN(sizeof(*hash->attrs) * + (attr_max_bucket + 1), + sizeof(long)); + + method->attr_buckets[bucket_idx] = hash; + + do { + size_t num_attr_defs; + struct uverbs_attr_spec *attr; + bool attr_obj_with_special_access; + + num_attr_defs = + get_attrs_above_id(attr_defs, + num_method_defs, + method_defs, + bucket_idx, + &min_id); + /* Last attr in bucket */ + if (!num_attr_defs) + break; + + if (num_attr_defs > 1) { + /* + * We don't allow two attribute definitions for + * the same attribute. This is usually a + * programmer error. If required, it's better to + * just add a new attribute to capture the new + * semantics. 
+ */ + res = -EEXIST; + goto free; + } + + attr = &hash->attrs[min_id]; + memcpy(attr, &attr_defs[0]->attr, sizeof(*attr)); + + attr_obj_with_special_access = IS_ATTR_OBJECT(attr) && + (attr->obj.access == UVERBS_ACCESS_NEW || + attr->obj.access == UVERBS_ACCESS_DESTROY); + num_of_singularities += !!attr_obj_with_special_access; + if (WARN(num_of_singularities > 1, + "ib_uverbs: Method contains more than one object attr (%d) with new/destroy access\n", + min_id) || + WARN(attr_obj_with_special_access && + !(attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY), + "ib_uverbs: Tried to merge attr (%d) but it's an object with new/destroy access but isn't mandatory\n", + min_id) || + WARN(IS_ATTR_OBJECT(attr) && + attr->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO, + "ib_uverbs: Tried to merge attr (%d) but it's an object with min_sz flag\n", + min_id)) { + res = -EINVAL; + goto free; + } + + if (attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY) + set_bit(min_id, hash->mandatory_attrs_bitmask); + min_id++; + + } while (1); + } + kfree(attr_defs); + return method; + +free: + kfree(attr_defs); +free_method: + free_method(method); + return ERR_PTR(res); +} + +static void free_object(struct uverbs_object_spec *object) +{ + unsigned int i, j; + + if (!object) + return; + + for (i = 0; i < object->num_buckets; i++) { + struct uverbs_method_spec_hash *method_buckets = + object->method_buckets[i]; + + if (!method_buckets) + continue; + + for (j = 0; j < method_buckets->num_methods; j++) + free_method(method_buckets->methods[j]); + + kfree(method_buckets); + } + + kfree(object); +} + +/* + * This function gets array of size @num_object_defs which contains pointers to + * object definitions @object_defs. The function allocated an + * uverbs_object_spec structure and initialize its number of buckets and the + * elements in buckets to the correct methods. While doing that, it + * sorts out the correct relationship between conflicts in the same method. 
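+ * When the same method id appears in more than one tree, the merged method
+ * takes its handler and flags from the last tree (in merge order) that
+ * supplies a non-NULL handler; the merge fails if no tree does.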
+ */ +static struct uverbs_object_spec *build_object_with_methods(const struct uverbs_object_def **object_defs, + size_t num_object_defs) +{ + u16 bucket_idx; + int max_method_buckets = 0; + u16 num_method_buckets = 0; + int res = 0; + struct uverbs_object_spec *object = NULL; + const struct uverbs_method_def **method_defs; + + max_method_buckets = find_max_method_ns_id(num_object_defs, object_defs); + if (max_method_buckets >= 0) + num_method_buckets = max_method_buckets + 1; + + object = kzalloc(sizeof(*object) + + num_method_buckets * + sizeof(*object->method_buckets), GFP_KERNEL); + if (!object) + return ERR_PTR(-ENOMEM); + + object->num_buckets = num_method_buckets; + method_defs = kcalloc(num_object_defs, sizeof(*method_defs), GFP_KERNEL); + if (!method_defs) { + res = -ENOMEM; + goto free_object; + } + + for (bucket_idx = 0; bucket_idx < object->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int methods_max_bucket = 0; + struct uverbs_method_spec_hash *hash = NULL; + + methods_max_bucket = find_max_method_id(num_object_defs, object_defs, + bucket_idx); + if (methods_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->methods) * (methods_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + + hash->num_methods = methods_max_bucket + 1; + object->method_buckets[bucket_idx] = hash; + + do { + size_t num_method_defs; + struct uverbs_method_spec *method; + int i; + + num_method_defs = + get_methods_above_id(method_defs, + num_object_defs, + object_defs, + bucket_idx, + &min_id); + /* Last method in bucket */ + if (!num_method_defs) + break; + + method = build_method_with_attrs(method_defs, + num_method_defs); + if (IS_ERR(method)) { + res = PTR_ERR(method); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous method handler. + * Therefore, we iterate backwards and search for the + * first handler which != NULL. This also defines the + * set of flags used for this handler. + */ + for (i = num_method_defs - 1; + i >= 0 && !method_defs[i]->handler; i--) + ; + hash->methods[min_id++] = method; + /* NULL handler isn't allowed */ + if (WARN(i < 0, + "ib_uverbs: tried to merge function id %d, but all handlers are NULL\n", + min_id)) { + res = -EINVAL; + goto free; + } + method->handler = method_defs[i]->handler; + method->flags = method_defs[i]->flags; + + } while (1); + } + kfree(method_defs); + return object; + +free: + kfree(method_defs); +free_object: + free_object(object); + return ERR_PTR(res); +} + +void uverbs_free_spec_tree(struct uverbs_root_spec *root) +{ + unsigned int i, j; + + if (!root) + return; + + for (i = 0; i < root->num_buckets; i++) { + struct uverbs_object_spec_hash *object_hash = + root->object_buckets[i]; + + if (!object_hash) + continue; + + for (j = 0; j < object_hash->num_objects; j++) + free_object(object_hash->objects[j]); + + kfree(object_hash); + } + + kfree(root); +} +EXPORT_SYMBOL(uverbs_free_spec_tree); + +struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees) +{ + u16 bucket_idx; + short max_object_buckets = 0; + size_t num_objects_buckets = 0; + struct uverbs_root_spec *root_spec = NULL; + const struct uverbs_object_def **object_defs; + int i; + int res = 0; + + max_object_buckets = find_max_object_ns_id(num_trees, trees); + /* + * Devices which don't want to support ib_uverbs, should just allocate + * an empty parsing tree. 
Every user-space command won't hit any valid + * entry in the parsing tree and thus will fail. + */ + if (max_object_buckets >= 0) + num_objects_buckets = max_object_buckets + 1; + + root_spec = kzalloc(sizeof(*root_spec) + + num_objects_buckets * sizeof(*root_spec->object_buckets), + GFP_KERNEL); + if (!root_spec) + return ERR_PTR(-ENOMEM); + root_spec->num_buckets = num_objects_buckets; + + object_defs = kcalloc(num_trees, sizeof(*object_defs), + GFP_KERNEL); + if (!object_defs) { + res = -ENOMEM; + goto free_root; + } + + for (bucket_idx = 0; bucket_idx < root_spec->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + short objects_max_bucket; + struct uverbs_object_spec_hash *hash = NULL; + + objects_max_bucket = find_max_object_id(num_trees, trees, + bucket_idx); + if (objects_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->objects) * (objects_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_objects = objects_max_bucket + 1; + root_spec->object_buckets[bucket_idx] = hash; + + do { + size_t num_object_defs; + struct uverbs_object_spec *object; + + num_object_defs = get_objects_above_id(object_defs, + num_trees, + trees, + bucket_idx, + &min_id); + /* Last object in bucket */ + if (!num_object_defs) + break; + + object = build_object_with_methods(object_defs, + num_object_defs); + if (IS_ERR(object)) { + res = PTR_ERR(object); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous object's type_attrs. + * Therefore, we iterate backwards and search for the + * first type_attrs which != NULL. + */ + for (i = num_object_defs - 1; + i >= 0 && !object_defs[i]->type_attrs; i--) + ; + /* + * NULL is a valid type_attrs. It means an object we + * can't instantiate (like DEVICE). + */ + object->type_attrs = i < 0 ? NULL : + object_defs[i]->type_attrs; + + hash->objects[min_id++] = object; + } while (1); + } + + kfree(object_defs); + return root_spec; + +free: + kfree(object_defs); +free_root: + uverbs_free_spec_tree(root_spec); + return ERR_PTR(res); +} +EXPORT_SYMBOL(uverbs_alloc_spec_tree); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c new file mode 100644 index 0000000..4445d8e --- /dev/null +++ b/drivers/infiniband/core/uverbs_main.c @@ -0,0 +1,1282 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "uverbs.h" +#include "core_priv.h" +#include "rdma_core.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("InfiniBand userspace verbs access"); +MODULE_LICENSE("Dual BSD/GPL"); + +enum { + IB_UVERBS_MAJOR = 231, + IB_UVERBS_BASE_MINOR = 192, + IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS, + IB_UVERBS_NUM_FIXED_MINOR = 32, + IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR, +}; + +#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) + +static dev_t dynamic_uverbs_dev; +static struct class *uverbs_class; + +static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); + +static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + const char __user *buf, int in_len, + int out_len) = { + [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, + [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device, + [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port, + [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, + [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, + [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, + [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr, + [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, + [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw, + [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw, + [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel, + [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, + [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, + [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq, + [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq, + [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq, + [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp, + [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp, + [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp, + [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp, + [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send, + [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv, + [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv, + [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah, + [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah, + [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast, + [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast, + [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq, + [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, + [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, + [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd, + [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, + [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, + [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, +}; + +static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_device *ib_dev, + struct ib_udata *ucore, + struct 
ib_udata *uhw) = { + [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, + [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device, + [IB_USER_VERBS_EX_CMD_CREATE_CQ] = ib_uverbs_ex_create_cq, + [IB_USER_VERBS_EX_CMD_CREATE_QP] = ib_uverbs_ex_create_qp, + [IB_USER_VERBS_EX_CMD_CREATE_WQ] = ib_uverbs_ex_create_wq, + [IB_USER_VERBS_EX_CMD_MODIFY_WQ] = ib_uverbs_ex_modify_wq, + [IB_USER_VERBS_EX_CMD_DESTROY_WQ] = ib_uverbs_ex_destroy_wq, + [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table, + [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table, + [IB_USER_VERBS_EX_CMD_MODIFY_QP] = ib_uverbs_ex_modify_qp, + [IB_USER_VERBS_EX_CMD_MODIFY_CQ] = ib_uverbs_ex_modify_cq, +}; + +static void ib_uverbs_add_one(struct ib_device *device); +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); + +int uverbs_dealloc_mw(struct ib_mw *mw) +{ + struct ib_pd *pd = mw->pd; + int ret; + + ret = mw->device->dealloc_mw(mw); + if (!ret) + atomic_dec(&pd->usecnt); + return ret; +} + +static void ib_uverbs_release_dev(struct kobject *kobj) +{ + struct ib_uverbs_device *dev = + container_of(kobj, struct ib_uverbs_device, kobj); + + cleanup_srcu_struct(&dev->disassociate_srcu); + kfree(dev); +} + +static struct kobj_type ib_uverbs_dev_ktype = { + .release = ib_uverbs_release_dev, +}; + +static void ib_uverbs_release_async_event_file(struct kref *ref) +{ + struct ib_uverbs_async_event_file *file = + container_of(ref, struct ib_uverbs_async_event_file, ref); + + kfree(file); +} + +void ib_uverbs_release_ucq(struct ib_uverbs_file *file, + struct ib_uverbs_completion_event_file *ev_file, + struct ib_ucq_object *uobj) +{ + struct ib_uverbs_event *evt, *tmp; + + if (ev_file) { + spin_lock_irq(&ev_file->ev_queue.lock); + list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) { + list_del(&evt->list); + kfree(evt); + } + spin_unlock_irq(&ev_file->ev_queue.lock); + + uverbs_uobject_put(&ev_file->uobj_file.uobj); + } + + spin_lock_irq(&file->async_file->ev_queue.lock); + list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) { + list_del(&evt->list); + kfree(evt); + } + spin_unlock_irq(&file->async_file->ev_queue.lock); +} + +void ib_uverbs_release_uevent(struct ib_uverbs_file *file, + struct ib_uevent_object *uobj) +{ + struct ib_uverbs_event *evt, *tmp; + + spin_lock_irq(&file->async_file->ev_queue.lock); + list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) { + list_del(&evt->list); + kfree(evt); + } + spin_unlock_irq(&file->async_file->ev_queue.lock); +} + +void ib_uverbs_detach_umcast(struct ib_qp *qp, + struct ib_uqp_object *uobj) +{ + struct ib_uverbs_mcast_entry *mcast, *tmp; + + list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) { + ib_detach_mcast(qp, &mcast->gid, mcast->lid); + list_del(&mcast->list); + kfree(mcast); + } +} + +static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, + struct ib_ucontext *context, + bool device_removed) +{ + context->closing = 1; + uverbs_cleanup_ucontext(context, device_removed); + put_pid(context->tgid); + + ib_rdmacg_uncharge(&context->cg_obj, context->device, + RDMACG_RESOURCE_HCA_HANDLE); + + return context->device->dealloc_ucontext(context); +} + +static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) +{ + complete(&dev->comp); +} + +void ib_uverbs_release_file(struct kref *ref) +{ + struct ib_uverbs_file *file = + container_of(ref, struct ib_uverbs_file, ref); 
+ struct ib_device *ib_dev; + int srcu_key; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (ib_dev && !ib_dev->disassociate_ucontext) + module_put(ib_dev->owner); + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + + if (atomic_dec_and_test(&file->device->refcount)) + ib_uverbs_comp_dev(file->device); + + kobject_put(&file->device->kobj); + kfree(file); +} + +static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, + struct ib_uverbs_file *uverbs_file, + struct file *filp, char __user *buf, + size_t count, loff_t *pos, + size_t eventsz) +{ + struct ib_uverbs_event *event; + int ret = 0; + + spin_lock_irq(&ev_queue->lock); + + while (list_empty(&ev_queue->event_list)) { + spin_unlock_irq(&ev_queue->lock); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(ev_queue->poll_wait, + (!list_empty(&ev_queue->event_list) || + /* The barriers built into wait_event_interruptible() + * and wake_up() guarentee this will see the null set + * without using RCU + */ + !uverbs_file->device->ib_dev))) + return -ERESTARTSYS; + + /* If device was disassociated and no event exists set an error */ + if (list_empty(&ev_queue->event_list) && + !uverbs_file->device->ib_dev) + return -EIO; + + spin_lock_irq(&ev_queue->lock); + } + + event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list); + + if (eventsz > count) { + ret = -EINVAL; + event = NULL; + } else { + list_del(ev_queue->event_list.next); + if (event->counter) { + ++(*event->counter); + list_del(&event->obj_list); + } + } + + spin_unlock_irq(&ev_queue->lock); + + if (event) { + if (copy_to_user(buf, event, eventsz)) + ret = -EFAULT; + else + ret = eventsz; + } + + kfree(event); + + return ret; +} + +static ssize_t ib_uverbs_async_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_uverbs_async_event_file *file = filp->private_data; + + return ib_uverbs_event_read(&file->ev_queue, file->uverbs_file, filp, + buf, count, pos, + sizeof(struct ib_uverbs_async_event_desc)); +} + +static ssize_t ib_uverbs_comp_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_uverbs_completion_event_file *comp_ev_file = + filp->private_data; + + return ib_uverbs_event_read(&comp_ev_file->ev_queue, + comp_ev_file->uobj_file.ufile, filp, + buf, count, pos, + sizeof(struct ib_uverbs_comp_event_desc)); +} + +static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue, + struct file *filp, + struct poll_table_struct *wait) +{ + __poll_t pollflags = 0; + + poll_wait(filp, &ev_queue->poll_wait, wait); + + spin_lock_irq(&ev_queue->lock); + if (!list_empty(&ev_queue->event_list)) + pollflags = EPOLLIN | EPOLLRDNORM; + spin_unlock_irq(&ev_queue->lock); + + return pollflags; +} + +static __poll_t ib_uverbs_async_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + return ib_uverbs_event_poll(filp->private_data, filp, wait); +} + +static __poll_t ib_uverbs_comp_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct ib_uverbs_completion_event_file *comp_ev_file = + filp->private_data; + + return ib_uverbs_event_poll(&comp_ev_file->ev_queue, filp, wait); +} + +static int ib_uverbs_async_event_fasync(int fd, struct file *filp, int on) +{ + struct ib_uverbs_event_queue *ev_queue = filp->private_data; + + return fasync_helper(fd, filp, on, &ev_queue->async_queue); +} + 
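+/*
+ * Note: a typical consumer polls the completion-channel fd obtained via
+ * IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL and then reads one
+ * struct ib_uverbs_comp_event_desc per completion event. An illustrative
+ * userspace sketch, with channel_fd being a hypothetical descriptor (this
+ * is not part of this file):
+ *
+ *	struct pollfd pfd = { .fd = channel_fd, .events = POLLIN };
+ *	struct ib_uverbs_comp_event_desc desc;
+ *
+ *	poll(&pfd, 1, -1);
+ *	read(channel_fd, &desc, sizeof(desc));
+ *
+ * where desc.cq_handle identifies the CQ that generated the event.
+ * ib_uverbs_comp_event_read() above rejects buffers smaller than
+ * sizeof(desc) and returns -EAGAIN to O_NONBLOCK readers when no event is
+ * queued.
+ */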
+static int ib_uverbs_comp_event_fasync(int fd, struct file *filp, int on) +{ + struct ib_uverbs_completion_event_file *comp_ev_file = + filp->private_data; + + return fasync_helper(fd, filp, on, &comp_ev_file->ev_queue.async_queue); +} + +static int ib_uverbs_async_event_close(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_async_event_file *file = filp->private_data; + struct ib_uverbs_file *uverbs_file = file->uverbs_file; + struct ib_uverbs_event *entry, *tmp; + int closed_already = 0; + + mutex_lock(&uverbs_file->device->lists_mutex); + spin_lock_irq(&file->ev_queue.lock); + closed_already = file->ev_queue.is_closed; + file->ev_queue.is_closed = 1; + list_for_each_entry_safe(entry, tmp, &file->ev_queue.event_list, list) { + if (entry->counter) + list_del(&entry->obj_list); + kfree(entry); + } + spin_unlock_irq(&file->ev_queue.lock); + if (!closed_already) { + list_del(&file->list); + ib_unregister_event_handler(&uverbs_file->event_handler); + } + mutex_unlock(&uverbs_file->device->lists_mutex); + + kref_put(&uverbs_file->ref, ib_uverbs_release_file); + kref_put(&file->ref, ib_uverbs_release_async_event_file); + + return 0; +} + +static int ib_uverbs_comp_event_close(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_completion_event_file *file = filp->private_data; + struct ib_uverbs_event *entry, *tmp; + + spin_lock_irq(&file->ev_queue.lock); + list_for_each_entry_safe(entry, tmp, &file->ev_queue.event_list, list) { + if (entry->counter) + list_del(&entry->obj_list); + kfree(entry); + } + spin_unlock_irq(&file->ev_queue.lock); + + uverbs_close_fd(filp); + + return 0; +} + +const struct file_operations uverbs_event_fops = { + .owner = THIS_MODULE, + .read = ib_uverbs_comp_event_read, + .poll = ib_uverbs_comp_event_poll, + .release = ib_uverbs_comp_event_close, + .fasync = ib_uverbs_comp_event_fasync, + .llseek = no_llseek, +}; + +static const struct file_operations uverbs_async_event_fops = { + .owner = THIS_MODULE, + .read = ib_uverbs_async_event_read, + .poll = ib_uverbs_async_event_poll, + .release = ib_uverbs_async_event_close, + .fasync = ib_uverbs_async_event_fasync, + .llseek = no_llseek, +}; + +void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct ib_uverbs_event_queue *ev_queue = cq_context; + struct ib_ucq_object *uobj; + struct ib_uverbs_event *entry; + unsigned long flags; + + if (!ev_queue) + return; + + spin_lock_irqsave(&ev_queue->lock, flags); + if (ev_queue->is_closed) { + spin_unlock_irqrestore(&ev_queue->lock, flags); + return; + } + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) { + spin_unlock_irqrestore(&ev_queue->lock, flags); + return; + } + + uobj = container_of(cq->uobject, struct ib_ucq_object, uobject); + + entry->desc.comp.cq_handle = cq->uobject->user_handle; + entry->counter = &uobj->comp_events_reported; + + list_add_tail(&entry->list, &ev_queue->event_list); + list_add_tail(&entry->obj_list, &uobj->comp_list); + spin_unlock_irqrestore(&ev_queue->lock, flags); + + wake_up_interruptible(&ev_queue->poll_wait); + kill_fasync(&ev_queue->async_queue, SIGIO, POLL_IN); +} + +static void ib_uverbs_async_handler(struct ib_uverbs_file *file, + __u64 element, __u64 event, + struct list_head *obj_list, + u32 *counter) +{ + struct ib_uverbs_event *entry; + unsigned long flags; + + spin_lock_irqsave(&file->async_file->ev_queue.lock, flags); + if (file->async_file->ev_queue.is_closed) { + spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags); + return; + } + + entry = kmalloc(sizeof(*entry), 
GFP_ATOMIC); + if (!entry) { + spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags); + return; + } + + entry->desc.async.element = element; + entry->desc.async.event_type = event; + entry->desc.async.reserved = 0; + entry->counter = counter; + + list_add_tail(&entry->list, &file->async_file->ev_queue.event_list); + if (obj_list) + list_add_tail(&entry->obj_list, obj_list); + spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags); + + wake_up_interruptible(&file->async_file->ev_queue.poll_wait); + kill_fasync(&file->async_file->ev_queue.async_queue, SIGIO, POLL_IN); +} + +void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr) +{ + struct ib_ucq_object *uobj = container_of(event->element.cq->uobject, + struct ib_ucq_object, uobject); + + ib_uverbs_async_handler(uobj->uverbs_file, uobj->uobject.user_handle, + event->event, &uobj->async_list, + &uobj->async_events_reported); +} + +void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr) +{ + struct ib_uevent_object *uobj; + + /* for XRC target qp's, check that qp is live */ + if (!event->element.qp->uobject) + return; + + uobj = container_of(event->element.qp->uobject, + struct ib_uevent_object, uobject); + + ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, + event->event, &uobj->event_list, + &uobj->events_reported); +} + +void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr) +{ + struct ib_uevent_object *uobj = container_of(event->element.wq->uobject, + struct ib_uevent_object, uobject); + + ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, + event->event, &uobj->event_list, + &uobj->events_reported); +} + +void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr) +{ + struct ib_uevent_object *uobj; + + uobj = container_of(event->element.srq->uobject, + struct ib_uevent_object, uobject); + + ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, + event->event, &uobj->event_list, + &uobj->events_reported); +} + +void ib_uverbs_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct ib_uverbs_file *file = + container_of(handler, struct ib_uverbs_file, event_handler); + + ib_uverbs_async_handler(file, event->element.port_num, event->event, + NULL, NULL); +} + +void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file) +{ + kref_put(&file->async_file->ref, ib_uverbs_release_async_event_file); + file->async_file = NULL; +} + +void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue) +{ + spin_lock_init(&ev_queue->lock); + INIT_LIST_HEAD(&ev_queue->event_list); + init_waitqueue_head(&ev_queue->poll_wait); + ev_queue->is_closed = 0; + ev_queue->async_queue = NULL; +} + +struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, + struct ib_device *ib_dev) +{ + struct ib_uverbs_async_event_file *ev_file; + struct file *filp; + + ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL); + if (!ev_file) + return ERR_PTR(-ENOMEM); + + ib_uverbs_init_event_queue(&ev_file->ev_queue); + ev_file->uverbs_file = uverbs_file; + kref_get(&ev_file->uverbs_file->ref); + kref_init(&ev_file->ref); + filp = anon_inode_getfile("[infinibandevent]", &uverbs_async_event_fops, + ev_file, O_RDONLY); + if (IS_ERR(filp)) + goto err_put_refs; + + mutex_lock(&uverbs_file->device->lists_mutex); + list_add_tail(&ev_file->list, + &uverbs_file->device->uverbs_events_file_list); + mutex_unlock(&uverbs_file->device->lists_mutex); + + WARN_ON(uverbs_file->async_file); + 
uverbs_file->async_file = ev_file; + kref_get(&uverbs_file->async_file->ref); + INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, + ib_dev, + ib_uverbs_event_handler); + ib_register_event_handler(&uverbs_file->event_handler); + /* At that point async file stuff was fully set */ + + return filp; + +err_put_refs: + kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); + kref_put(&ev_file->ref, ib_uverbs_release_async_event_file); + return filp; +} + +static bool verify_command_mask(struct ib_device *ib_dev, + u32 command, bool extended) +{ + if (!extended) + return ib_dev->uverbs_cmd_mask & BIT_ULL(command); + + return ib_dev->uverbs_ex_cmd_mask & BIT_ULL(command); +} + +static bool verify_command_idx(u32 command, bool extended) +{ + if (extended) + return command < ARRAY_SIZE(uverbs_ex_cmd_table) && + uverbs_ex_cmd_table[command]; + + return command < ARRAY_SIZE(uverbs_cmd_table) && + uverbs_cmd_table[command]; +} + +static ssize_t process_hdr(struct ib_uverbs_cmd_hdr *hdr, + u32 *command, bool *extended) +{ + if (hdr->command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + *command = hdr->command & IB_USER_VERBS_CMD_COMMAND_MASK; + *extended = hdr->command & IB_USER_VERBS_CMD_FLAG_EXTENDED; + + if (!verify_command_idx(*command, *extended)) + return -EOPNOTSUPP; + + return 0; +} + +static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr, + struct ib_uverbs_ex_cmd_hdr *ex_hdr, + size_t count, bool extended) +{ + if (extended) { + count -= sizeof(*hdr) + sizeof(*ex_hdr); + + if ((hdr->in_words + ex_hdr->provider_in_words) * 8 != count) + return -EINVAL; + + if (ex_hdr->cmd_hdr_reserved) + return -EINVAL; + + if (ex_hdr->response) { + if (!hdr->out_words && !ex_hdr->provider_out_words) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, + u64_to_user_ptr(ex_hdr->response), + (hdr->out_words + ex_hdr->provider_out_words) * 8)) + return -EFAULT; + } else { + if (hdr->out_words || ex_hdr->provider_out_words) + return -EINVAL; + } + + return 0; + } + + /* not extended command */ + if (hdr->in_words * 4 != count) + return -EINVAL; + + return 0; +} + +static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_ex_cmd_hdr ex_hdr; + struct ib_device *ib_dev; + struct ib_uverbs_cmd_hdr hdr; + bool extended; + int srcu_key; + u32 command; + ssize_t ret; + + if (!ib_safe_file_access(filp)) { + pr_err_once("uverbs_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", + task_tgid_vnr(current), current->comm); + return -EACCES; + } + + if (count < sizeof(hdr)) + return -EINVAL; + + if (copy_from_user(&hdr, buf, sizeof(hdr))) + return -EFAULT; + + ret = process_hdr(&hdr, &command, &extended); + if (ret) + return ret; + + if (!file->ucontext && + (command != IB_USER_VERBS_CMD_GET_CONTEXT || extended)) + return -EINVAL; + + if (extended) { + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + } + + ret = verify_hdr(&hdr, &ex_hdr, count, extended); + if (ret) + return ret; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto out; + } + + if (!verify_command_mask(ib_dev, command, extended)) { + ret = -EOPNOTSUPP; + goto out; + } + + buf += sizeof(hdr); + + if 
(!extended) { + ret = uverbs_cmd_table[command](file, ib_dev, buf, + hdr.in_words * 4, + hdr.out_words * 4); + } else { + struct ib_udata ucore; + struct ib_udata uhw; + + buf += sizeof(ex_hdr); + + ib_uverbs_init_udata_buf_or_null(&ucore, buf, + u64_to_user_ptr(ex_hdr.response), + hdr.in_words * 8, hdr.out_words * 8); + + ib_uverbs_init_udata_buf_or_null(&uhw, + buf + ucore.inlen, + u64_to_user_ptr(ex_hdr.response) + ucore.outlen, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + ret = uverbs_ex_cmd_table[command](file, ib_dev, &ucore, &uhw); + ret = (ret) ? : count; + } + +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return ret; +} + +static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_device *ib_dev; + int ret = 0; + int srcu_key; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto out; + } + + if (!file->ucontext) + ret = -ENODEV; + else + ret = ib_dev->mmap(file->ucontext, vma); +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return ret; +} + +/* + * ib_uverbs_open() does not need the BKL: + * + * - the ib_uverbs_device structures are properly reference counted and + * everything else is purely local to the file being created, so + * races against other open calls are not a problem; + * - there is no ioctl method to race against; + * - the open method will either immediately run -ENXIO, or all + * required initialization will be done. + */ +static int ib_uverbs_open(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_device *dev; + struct ib_uverbs_file *file; + struct ib_device *ib_dev; + int ret; + int module_dependent; + int srcu_key; + + dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); + if (!atomic_inc_not_zero(&dev->refcount)) + return -ENXIO; + + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + mutex_lock(&dev->lists_mutex); + ib_dev = srcu_dereference(dev->ib_dev, + &dev->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto err; + } + + /* In case IB device supports disassociate ucontext, there is no hard + * dependency between uverbs device and its low level device. 
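+ * A module reference on ib_dev->owner is therefore taken below only when
+ * the driver does not provide disassociate_ucontext.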
+ */ + module_dependent = !(ib_dev->disassociate_ucontext); + + if (module_dependent) { + if (!try_module_get(ib_dev->owner)) { + ret = -ENODEV; + goto err; + } + } + + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) { + ret = -ENOMEM; + if (module_dependent) + goto err_module; + + goto err; + } + + file->device = dev; + spin_lock_init(&file->idr_lock); + idr_init(&file->idr); + file->ucontext = NULL; + file->async_file = NULL; + kref_init(&file->ref); + mutex_init(&file->mutex); + mutex_init(&file->cleanup_mutex); + + filp->private_data = file; + kobject_get(&dev->kobj); + list_add_tail(&file->list, &dev->uverbs_file_list); + mutex_unlock(&dev->lists_mutex); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + + return nonseekable_open(inode, filp); + +err_module: + module_put(ib_dev->owner); + +err: + mutex_unlock(&dev->lists_mutex); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + if (atomic_dec_and_test(&dev->refcount)) + ib_uverbs_comp_dev(dev); + + return ret; +} + +static int ib_uverbs_close(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_file *file = filp->private_data; + + mutex_lock(&file->cleanup_mutex); + if (file->ucontext) { + ib_uverbs_cleanup_ucontext(file, file->ucontext, false); + file->ucontext = NULL; + } + mutex_unlock(&file->cleanup_mutex); + idr_destroy(&file->idr); + + mutex_lock(&file->device->lists_mutex); + if (!file->is_closed) { + list_del(&file->list); + file->is_closed = 1; + } + mutex_unlock(&file->device->lists_mutex); + + if (file->async_file) + kref_put(&file->async_file->ref, + ib_uverbs_release_async_event_file); + + kref_put(&file->ref, ib_uverbs_release_file); + + return 0; +} + +static const struct file_operations uverbs_fops = { + .owner = THIS_MODULE, + .write = ib_uverbs_write, + .open = ib_uverbs_open, + .release = ib_uverbs_close, + .llseek = no_llseek, + .unlocked_ioctl = ib_uverbs_ioctl, + .compat_ioctl = ib_uverbs_ioctl, +}; + +static const struct file_operations uverbs_mmap_fops = { + .owner = THIS_MODULE, + .write = ib_uverbs_write, + .mmap = ib_uverbs_mmap, + .open = ib_uverbs_open, + .release = ib_uverbs_close, + .llseek = no_llseek, + .unlocked_ioctl = ib_uverbs_ioctl, + .compat_ioctl = ib_uverbs_ioctl, +}; + +static struct ib_client uverbs_client = { + .name = "uverbs", + .add = ib_uverbs_add_one, + .remove = ib_uverbs_remove_one +}; + +static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, + char *buf) +{ + int ret = -ENODEV; + int srcu_key; + struct ib_uverbs_device *dev = dev_get_drvdata(device); + struct ib_device *ib_dev; + + if (!dev) + return -ENODEV; + + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); + if (ib_dev) + ret = sprintf(buf, "%s\n", ib_dev->name); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + + return ret; +} +static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); + +static ssize_t show_dev_abi_version(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_uverbs_device *dev = dev_get_drvdata(device); + int ret = -ENODEV; + int srcu_key; + struct ib_device *ib_dev; + + if (!dev) + return -ENODEV; + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); + if (ib_dev) + ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + + return ret; +} +static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); + +static 
CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_VERBS_ABI_VERSION)); + +static void ib_uverbs_add_one(struct ib_device *device) +{ + int devnum; + dev_t base; + struct ib_uverbs_device *uverbs_dev; + int ret; + + if (!device->alloc_ucontext) + return; + + uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL); + if (!uverbs_dev) + return; + + ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); + if (ret) { + kfree(uverbs_dev); + return; + } + + atomic_set(&uverbs_dev->refcount, 1); + init_completion(&uverbs_dev->comp); + uverbs_dev->xrcd_tree = RB_ROOT; + mutex_init(&uverbs_dev->xrcd_tree_mutex); + kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); + mutex_init(&uverbs_dev->lists_mutex); + INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); + INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); + + devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); + if (devnum >= IB_UVERBS_MAX_DEVICES) + goto err; + uverbs_dev->devnum = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) + base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; + else + base = IB_UVERBS_BASE_DEV + devnum; + + rcu_assign_pointer(uverbs_dev->ib_dev, device); + uverbs_dev->num_comp_vectors = device->num_comp_vectors; + + cdev_init(&uverbs_dev->cdev, NULL); + uverbs_dev->cdev.owner = THIS_MODULE; + uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; + cdev_set_parent(&uverbs_dev->cdev, &uverbs_dev->kobj); + kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); + if (cdev_add(&uverbs_dev->cdev, base, 1)) + goto err_cdev; + + uverbs_dev->dev = device_create(uverbs_class, device->dev.parent, + uverbs_dev->cdev.dev, uverbs_dev, + "uverbs%d", uverbs_dev->devnum); + if (IS_ERR(uverbs_dev->dev)) + goto err_cdev; + + if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev)) + goto err_class; + if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) + goto err_class; + + if (!device->specs_root) { + const struct uverbs_object_tree_def *default_root[] = { + uverbs_default_get_objects()}; + + uverbs_dev->specs_root = uverbs_alloc_spec_tree(1, + default_root); + if (IS_ERR(uverbs_dev->specs_root)) + goto err_class; + + device->specs_root = uverbs_dev->specs_root; + } + + ib_set_client_data(device, &uverbs_client, uverbs_dev); + + return; + +err_class: + device_destroy(uverbs_class, uverbs_dev->cdev.dev); + +err_cdev: + cdev_del(&uverbs_dev->cdev); + clear_bit(devnum, dev_map); + +err: + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); + wait_for_completion(&uverbs_dev->comp); + kobject_put(&uverbs_dev->kobj); + return; +} + +static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, + struct ib_device *ib_dev) +{ + struct ib_uverbs_file *file; + struct ib_uverbs_async_event_file *event_file; + struct ib_event event; + + /* Pending running commands to terminate */ + synchronize_srcu(&uverbs_dev->disassociate_srcu); + event.event = IB_EVENT_DEVICE_FATAL; + event.element.port_num = 0; + event.device = ib_dev; + + mutex_lock(&uverbs_dev->lists_mutex); + while (!list_empty(&uverbs_dev->uverbs_file_list)) { + struct ib_ucontext *ucontext; + file = list_first_entry(&uverbs_dev->uverbs_file_list, + struct ib_uverbs_file, list); + file->is_closed = 1; + list_del(&file->list); + kref_get(&file->ref); + mutex_unlock(&uverbs_dev->lists_mutex); + + + mutex_lock(&file->cleanup_mutex); + ucontext = file->ucontext; + file->ucontext = NULL; + mutex_unlock(&file->cleanup_mutex); + + /* At this 
point ib_uverbs_close cannot be running + * ib_uverbs_cleanup_ucontext + */ + if (ucontext) { + /* We must release the mutex before going ahead and + * calling disassociate_ucontext. disassociate_ucontext + * might end up indirectly calling uverbs_close, + * for example due to freeing the resources + * (e.g mmput). + */ + ib_uverbs_event_handler(&file->event_handler, &event); + ib_dev->disassociate_ucontext(ucontext); + mutex_lock(&file->cleanup_mutex); + ib_uverbs_cleanup_ucontext(file, ucontext, true); + mutex_unlock(&file->cleanup_mutex); + } + + mutex_lock(&uverbs_dev->lists_mutex); + kref_put(&file->ref, ib_uverbs_release_file); + } + + while (!list_empty(&uverbs_dev->uverbs_events_file_list)) { + event_file = list_first_entry(&uverbs_dev-> + uverbs_events_file_list, + struct ib_uverbs_async_event_file, + list); + spin_lock_irq(&event_file->ev_queue.lock); + event_file->ev_queue.is_closed = 1; + spin_unlock_irq(&event_file->ev_queue.lock); + + list_del(&event_file->list); + ib_unregister_event_handler( + &event_file->uverbs_file->event_handler); + event_file->uverbs_file->event_handler.device = + NULL; + + wake_up_interruptible(&event_file->ev_queue.poll_wait); + kill_fasync(&event_file->ev_queue.async_queue, SIGIO, POLL_IN); + } + mutex_unlock(&uverbs_dev->lists_mutex); +} + +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) +{ + struct ib_uverbs_device *uverbs_dev = client_data; + int wait_clients = 1; + + if (!uverbs_dev) + return; + + dev_set_drvdata(uverbs_dev->dev, NULL); + device_destroy(uverbs_class, uverbs_dev->cdev.dev); + cdev_del(&uverbs_dev->cdev); + clear_bit(uverbs_dev->devnum, dev_map); + + if (device->disassociate_ucontext) { + /* We disassociate HW resources and immediately return. + * Userspace will see a EIO errno for all future access. + * Upon returning, ib_device may be freed internally and is not + * valid any more. + * uverbs_device is still available until all clients close + * their files, then the uverbs device ref count will be zero + * and its resources will be freed. + * Note: At this point no more files can be opened since the + * cdev was deleted, however active clients can still issue + * commands and close their open files. 
+ */ + rcu_assign_pointer(uverbs_dev->ib_dev, NULL); + ib_uverbs_free_hw_resources(uverbs_dev, device); + wait_clients = 0; + } + + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); + if (wait_clients) + wait_for_completion(&uverbs_dev->comp); + if (uverbs_dev->specs_root) { + uverbs_free_spec_tree(uverbs_dev->specs_root); + device->specs_root = NULL; + } + + kobject_put(&uverbs_dev->kobj); +} + +static char *uverbs_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static int __init ib_uverbs_init(void) +{ + int ret; + + ret = register_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR, + "infiniband_verbs"); + if (ret) { + pr_err("user_verbs: couldn't register device number\n"); + goto out; + } + + ret = alloc_chrdev_region(&dynamic_uverbs_dev, 0, + IB_UVERBS_NUM_DYNAMIC_MINOR, + "infiniband_verbs"); + if (ret) { + pr_err("couldn't register dynamic device number\n"); + goto out_alloc; + } + + uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); + if (IS_ERR(uverbs_class)) { + ret = PTR_ERR(uverbs_class); + pr_err("user_verbs: couldn't create class infiniband_verbs\n"); + goto out_chrdev; + } + + uverbs_class->devnode = uverbs_devnode; + + ret = class_create_file(uverbs_class, &class_attr_abi_version.attr); + if (ret) { + pr_err("user_verbs: couldn't create abi_version attribute\n"); + goto out_class; + } + + ret = ib_register_client(&uverbs_client); + if (ret) { + pr_err("user_verbs: couldn't register client\n"); + goto out_class; + } + + return 0; + +out_class: + class_destroy(uverbs_class); + +out_chrdev: + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); + +out_alloc: + unregister_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR); + +out: + return ret; +} + +static void __exit ib_uverbs_cleanup(void) +{ + ib_unregister_client(&uverbs_client); + class_destroy(uverbs_class); + unregister_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR); + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); +} + +module_init(ib_uverbs_init); +module_exit(ib_uverbs_cleanup); diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c new file mode 100644 index 0000000..bb372b4 --- /dev/null +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#define OPA_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +static int rdma_ah_conv_opa_to_ib(struct ib_device *dev, + struct rdma_ah_attr *ib, + struct rdma_ah_attr *opa) +{ + struct ib_port_attr port_attr; + int ret = 0; + + /* Do structure copy and the over-write fields */ + *ib = *opa; + + ib->type = RDMA_AH_ATTR_TYPE_IB; + rdma_ah_set_grh(ib, NULL, 0, 0, 1, 0); + + if (ib_query_port(dev, opa->port_num, &port_attr)) { + /* Set to default subnet to indicate error */ + rdma_ah_set_subnet_prefix(ib, OPA_DEFAULT_GID_PREFIX); + ret = -EINVAL; + } else { + rdma_ah_set_subnet_prefix(ib, + cpu_to_be64(port_attr.subnet_prefix)); + } + rdma_ah_set_interface_id(ib, OPA_MAKE_ID(rdma_ah_get_dlid(opa))); + return ret; +} + +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, + struct rdma_ah_attr *ah_attr) +{ + struct rdma_ah_attr *src = ah_attr; + struct rdma_ah_attr conv_ah; + + memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) > be16_to_cpu(IB_LID_PERMISSIVE)) && + (!rdma_ah_conv_opa_to_ib(device, &conv_ah, ah_attr))) + src = &conv_ah; + + dst->dlid = rdma_ah_get_dlid(src); + dst->sl = rdma_ah_get_sl(src); + dst->src_path_bits = rdma_ah_get_path_bits(src); + dst->static_rate = rdma_ah_get_static_rate(src); + dst->is_global = rdma_ah_get_ah_flags(src) & + IB_AH_GRH ? 
1 : 0; + if (dst->is_global) { + const struct ib_global_route *grh = rdma_ah_read_grh(src); + + memcpy(dst->grh.dgid, grh->dgid.raw, sizeof(grh->dgid)); + dst->grh.flow_label = grh->flow_label; + dst->grh.sgid_index = grh->sgid_index; + dst->grh.hop_limit = grh->hop_limit; + dst->grh.traffic_class = grh->traffic_class; + } + dst->port_num = rdma_ah_get_port_num(src); + dst->reserved = 0; +} +EXPORT_SYMBOL(ib_copy_ah_attr_to_user); + +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, + struct ib_qp_attr *src) +{ + dst->qp_state = src->qp_state; + dst->cur_qp_state = src->cur_qp_state; + dst->path_mtu = src->path_mtu; + dst->path_mig_state = src->path_mig_state; + dst->qkey = src->qkey; + dst->rq_psn = src->rq_psn; + dst->sq_psn = src->sq_psn; + dst->dest_qp_num = src->dest_qp_num; + dst->qp_access_flags = src->qp_access_flags; + + dst->max_send_wr = src->cap.max_send_wr; + dst->max_recv_wr = src->cap.max_recv_wr; + dst->max_send_sge = src->cap.max_send_sge; + dst->max_recv_sge = src->cap.max_recv_sge; + dst->max_inline_data = src->cap.max_inline_data; + + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->alt_ah_attr, &src->alt_ah_attr); + + dst->pkey_index = src->pkey_index; + dst->alt_pkey_index = src->alt_pkey_index; + dst->en_sqd_async_notify = src->en_sqd_async_notify; + dst->sq_draining = src->sq_draining; + dst->max_rd_atomic = src->max_rd_atomic; + dst->max_dest_rd_atomic = src->max_dest_rd_atomic; + dst->min_rnr_timer = src->min_rnr_timer; + dst->port_num = src->port_num; + dst->timeout = src->timeout; + dst->retry_cnt = src->retry_cnt; + dst->rnr_retry = src->rnr_retry; + dst->alt_port_num = src->alt_port_num; + dst->alt_timeout = src->alt_timeout; + memset(dst->reserved, 0, sizeof(dst->reserved)); +} +EXPORT_SYMBOL(ib_copy_qp_attr_to_user); + +static void __ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct sa_path_rec *src) +{ + memcpy(dst->dgid, src->dgid.raw, sizeof(src->dgid)); + memcpy(dst->sgid, src->sgid.raw, sizeof(src->sgid)); + + dst->dlid = htons(ntohl(sa_path_get_dlid(src))); + dst->slid = htons(ntohl(sa_path_get_slid(src))); + dst->raw_traffic = sa_path_get_raw_traffic(src); + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; +} + +void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct sa_path_rec *src) +{ + struct sa_path_rec rec; + + if (src->rec_type == SA_PATH_REC_TYPE_OPA) { + sa_convert_path_opa_to_ib(&rec, src); + __ib_copy_path_rec_to_user(dst, &rec); + return; + } + __ib_copy_path_rec_to_user(dst, src); +} +EXPORT_SYMBOL(ib_copy_path_rec_to_user); + +void ib_copy_path_rec_from_user(struct sa_path_rec *dst, + struct ib_user_path_rec *src) +{ + u32 slid, dlid; + + memset(dst, 0, sizeof(*dst)); + if ((ib_is_opa_gid((union ib_gid *)src->sgid)) || + (ib_is_opa_gid((union ib_gid *)src->dgid))) { + dst->rec_type = SA_PATH_REC_TYPE_OPA; + slid = opa_get_lid_from_gid((union ib_gid *)src->sgid); + dlid = opa_get_lid_from_gid((union ib_gid *)src->dgid); + } else { + dst->rec_type = 
SA_PATH_REC_TYPE_IB; + slid = ntohs(src->slid); + dlid = ntohs(src->dlid); + } + memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid); + memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid); + + sa_path_set_dlid(dst, dlid); + sa_path_set_slid(dst, slid); + sa_path_set_raw_traffic(dst, src->raw_traffic); + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; + + /* TODO: No need to set this */ + sa_path_set_dmac_zero(dst); + sa_path_set_ndev(dst, NULL); + sa_path_set_ifindex(dst, 0); +} +EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c new file mode 100644 index 0000000..569f48b --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_ah(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + return rdma_destroy_ah((struct ib_ah *)uobject->object); +} + +static int uverbs_free_flow(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + int ret; + struct ib_flow *flow = (struct ib_flow *)uobject->object; + struct ib_uflow_object *uflow = + container_of(uobject, struct ib_uflow_object, uobject); + + ret = ib_destroy_flow(flow); + if (!ret) + ib_uverbs_flow_resources_free(uflow->resources); + + return ret; +} + +static int uverbs_free_mw(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + return uverbs_dealloc_mw((struct ib_mw *)uobject->object); +} + +static int uverbs_free_qp(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_qp *qp = uobject->object; + struct ib_uqp_object *uqp = + container_of(uobject, struct ib_uqp_object, uevent.uobject); + int ret; + + if (why == RDMA_REMOVE_DESTROY) { + if (!list_empty(&uqp->mcast_list)) + return -EBUSY; + } else if (qp == qp->real_qp) { + ib_uverbs_detach_umcast(qp, uqp); + } + + ret = ib_destroy_qp(qp); + if (ret && why == RDMA_REMOVE_DESTROY) + return ret; + + if (uqp->uxrcd) + atomic_dec(&uqp->uxrcd->refcnt); + + ib_uverbs_release_uevent(uobject->context->ufile, &uqp->uevent); + return ret; +} + +static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object; + struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; + int ret; + + ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); + if (!ret || why != RDMA_REMOVE_DESTROY) + kfree(ind_tbl); + return ret; +} + +static int uverbs_free_wq(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_wq *wq = uobject->object; + struct ib_uwq_object *uwq = + container_of(uobject, struct ib_uwq_object, uevent.uobject); + int ret; + + ret = ib_destroy_wq(wq); + if (!ret || why != RDMA_REMOVE_DESTROY) + ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent); + return ret; +} + +static int uverbs_free_srq(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_srq *srq = uobject->object; + struct ib_uevent_object *uevent = + container_of(uobject, struct ib_uevent_object, uobject); + enum ib_srq_type srq_type = srq->srq_type; + int ret; + + ret = ib_destroy_srq(srq); + + if (ret && why == RDMA_REMOVE_DESTROY) + return ret; + + if (srq_type == IB_SRQT_XRC) { + struct ib_usrq_object *us = + container_of(uevent, struct ib_usrq_object, uevent); + + atomic_dec(&us->uxrcd->refcnt); + } + + ib_uverbs_release_uevent(uobject->context->ufile, uevent); + return ret; +} + +static int uverbs_free_xrcd(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_xrcd *xrcd = uobject->object; + struct ib_uxrcd_object *uxrcd = + container_of(uobject, struct ib_uxrcd_object, uobject); + int ret; + + mutex_lock(&uobject->context->ufile->device->xrcd_tree_mutex); + if (why == RDMA_REMOVE_DESTROY && atomic_read(&uxrcd->refcnt)) + ret = -EBUSY; + else + ret = ib_uverbs_dealloc_xrcd(uobject->context->ufile->device, + xrcd, why); + mutex_unlock(&uobject->context->ufile->device->xrcd_tree_mutex); + + return ret; +} + +static int uverbs_free_pd(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_pd *pd = uobject->object; + + if (why == RDMA_REMOVE_DESTROY && atomic_read(&pd->usecnt)) + return -EBUSY; + + ib_dealloc_pd((struct 
ib_pd *)uobject->object); + return 0; +} + +static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_file, + enum rdma_remove_reason why) +{ + struct ib_uverbs_completion_event_file *comp_event_file = + container_of(uobj_file, struct ib_uverbs_completion_event_file, + uobj_file); + struct ib_uverbs_event_queue *event_queue = &comp_event_file->ev_queue; + + spin_lock_irq(&event_queue->lock); + event_queue->is_closed = 1; + spin_unlock_irq(&event_queue->lock); + + if (why == RDMA_REMOVE_DRIVER_REMOVE) { + wake_up_interruptible(&event_queue->poll_wait); + kill_fasync(&event_queue->async_queue, SIGIO, POLL_IN); + } + return 0; +}; + +int uverbs_destroy_def_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + return 0; +} + +/* + * This spec is used in order to pass information to the hardware driver in a + * legacy way. Every verb that could get driver specific data should get this + * spec. + */ +const struct uverbs_attr_def uverbs_uhw_compat_in = + UVERBS_ATTR_PTR_IN_SZ(UVERBS_ATTR_UHW_IN, UVERBS_ATTR_SIZE(0, USHRT_MAX), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)); +const struct uverbs_attr_def uverbs_uhw_compat_out = + UVERBS_ATTR_PTR_OUT_SZ(UVERBS_ATTR_UHW_OUT, UVERBS_ATTR_SIZE(0, USHRT_MAX), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)); + +void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata) +{ + /* + * This is for ease of conversion. The purpose is to convert all drivers + * to use uverbs_attr_bundle instead of ib_udata. + * Assume attr == 0 is input and attr == 1 is output. + */ + const struct uverbs_attr *uhw_in = + uverbs_attr_get(ctx, UVERBS_ATTR_UHW_IN); + const struct uverbs_attr *uhw_out = + uverbs_attr_get(ctx, UVERBS_ATTR_UHW_OUT); + + if (!IS_ERR(uhw_in)) { + udata->inlen = uhw_in->ptr_attr.len; + if (uverbs_attr_ptr_is_inline(uhw_in)) + udata->inbuf = &uhw_in->uattr->data; + else + udata->inbuf = u64_to_user_ptr(uhw_in->ptr_attr.data); + } else { + udata->inbuf = NULL; + udata->inlen = 0; + } + + if (!IS_ERR(uhw_out)) { + udata->outbuf = u64_to_user_ptr(uhw_out->ptr_attr.data); + udata->outlen = uhw_out->ptr_attr.len; + } else { + udata->outbuf = NULL; + udata->outlen = 0; + } +} + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COMP_CHANNEL, + &UVERBS_TYPE_ALLOC_FD(0, + sizeof(struct ib_uverbs_completion_event_file), + uverbs_hot_unplug_completion_event_file, + &uverbs_event_fops, + "[infinibandevent]", O_RDONLY)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_QP, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, + uverbs_free_qp)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_SRQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0, + uverbs_free_srq)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FLOW, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object), + 0, uverbs_free_flow)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_WQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0, + uverbs_free_wq)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_rwq_ind_tbl)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_XRCD, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0, + uverbs_free_xrcd)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD, + /* 2 is used in order to free the PD after MRs */ + 
&UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DEVICE, NULL); + +static DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, + &UVERBS_OBJECT(UVERBS_OBJECT_DEVICE), + &UVERBS_OBJECT(UVERBS_OBJECT_PD), + &UVERBS_OBJECT(UVERBS_OBJECT_MR), + &UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL), + &UVERBS_OBJECT(UVERBS_OBJECT_CQ), + &UVERBS_OBJECT(UVERBS_OBJECT_QP), + &UVERBS_OBJECT(UVERBS_OBJECT_AH), + &UVERBS_OBJECT(UVERBS_OBJECT_MW), + &UVERBS_OBJECT(UVERBS_OBJECT_SRQ), + &UVERBS_OBJECT(UVERBS_OBJECT_FLOW), + &UVERBS_OBJECT(UVERBS_OBJECT_WQ), + &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL), + &UVERBS_OBJECT(UVERBS_OBJECT_XRCD), + &UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION), + &UVERBS_OBJECT(UVERBS_OBJECT_DM)); + +const struct uverbs_object_tree_def *uverbs_default_get_objects(void) +{ + return &uverbs_default_objects; +} +EXPORT_SYMBOL_GPL(uverbs_default_get_objects); diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c new file mode 100644 index 0000000..b0dbae9 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_cq(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_cq *cq = uobject->object; + struct ib_uverbs_event_queue *ev_queue = cq->cq_context; + struct ib_ucq_object *ucq = + container_of(uobject, struct ib_ucq_object, uobject); + int ret; + + ret = ib_destroy_cq(cq); + if (!ret || why != RDMA_REMOVE_DESTROY) + ib_uverbs_release_ucq(uobject->context->ufile, ev_queue ? 
+ container_of(ev_queue, + struct ib_uverbs_completion_event_file, + ev_queue) : NULL, + ucq); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = file->ucontext; + struct ib_ucq_object *obj; + struct ib_udata uhw; + int ret; + u64 user_handle; + struct ib_cq_init_attr attr = {}; + struct ib_cq *cq; + struct ib_uverbs_completion_event_file *ev_file = NULL; + const struct uverbs_attr *ev_file_attr; + struct ib_uobject *ev_file_uobj; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.comp_vector, attrs, + UVERBS_ATTR_CREATE_CQ_COMP_VECTOR); + if (!ret) + ret = uverbs_copy_from(&attr.cqe, attrs, + UVERBS_ATTR_CREATE_CQ_CQE); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_CQ_USER_HANDLE); + if (ret) + return ret; + + /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ + if (IS_UVERBS_COPY_ERR(uverbs_copy_from(&attr.flags, attrs, + UVERBS_ATTR_CREATE_CQ_FLAGS))) + return -EFAULT; + + ev_file_attr = uverbs_attr_get(attrs, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL); + if (!IS_ERR(ev_file_attr)) { + ev_file_uobj = ev_file_attr->obj_attr.uobject; + + ev_file = container_of(ev_file_uobj, + struct ib_uverbs_completion_event_file, + uobj_file.uobj); + uverbs_uobject_get(ev_file_uobj); + } + + if (attr.comp_vector >= ucontext->ufile->device->num_comp_vectors) { + ret = -EINVAL; + goto err_event_file; + } + + obj = container_of(uverbs_attr_get(attrs, + UVERBS_ATTR_CREATE_CQ_HANDLE)->obj_attr.uobject, + typeof(*obj), uobject); + obj->uverbs_file = ucontext->ufile; + obj->comp_events_reported = 0; + obj->async_events_reported = 0; + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->async_list); + + /* Temporary, only until drivers get the new uverbs_attr_bundle */ + create_udata(attrs, &uhw); + + cq = ib_dev->create_cq(ib_dev, &attr, ucontext, &uhw); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto err_event_file; + } + + cq->device = ib_dev; + cq->uobject = &obj->uobject; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = ev_file ? 
&ev_file->ev_queue : NULL; + obj->uobject.object = cq; + obj->uobject.user_handle = user_handle; + atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, + sizeof(cq->cqe)); + if (ret) + goto err_cq; + + return 0; +err_cq: + ib_destroy_cq(cq); + +err_event_file: + if (ev_file) + uverbs_uobject_put(ev_file_uobj); + return ret; +}; + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_CREATE, + &UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, + UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE, + UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_ACCESS_READ), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_FLAGS, UVERBS_ATTR_TYPE(u32)), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); + +static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj = + uverbs_attr_get(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE)->obj_attr.uobject; + struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object, + uobject); + int ret; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_DESTROY_CQ)) + return -EOPNOTSUPP; + + ret = rdma_explicit_destroy(uobj); + if (ret) + return ret; + + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->async_events_reported; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_CQ_RESP, &resp, + sizeof(resp)); +} + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_DESTROY, + &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_CQ_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_cq_resp), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_CQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, + uverbs_free_cq), +#if IS_ENABLED(CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI) + &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY) +#endif + ); + diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c new file mode 100644 index 0000000..8b68157 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_dm.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "uverbs.h" +#include + +static int uverbs_free_dm(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_dm *dm = uobject->object; + + if (why == RDMA_REMOVE_DESTROY && atomic_read(&dm->usecnt)) + return -EBUSY; + + return dm->device->dealloc_dm(dm); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = file->ucontext; + struct ib_dm_alloc_attr attr = {}; + struct ib_uobject *uobj; + struct ib_dm *dm; + int ret; + + if (!ib_dev->alloc_dm) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.length, attrs, + UVERBS_ATTR_ALLOC_DM_LENGTH); + if (ret) + return ret; + + ret = uverbs_copy_from(&attr.alignment, attrs, + UVERBS_ATTR_ALLOC_DM_ALIGNMENT); + if (ret) + return ret; + + uobj = uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE)->obj_attr.uobject; + + dm = ib_dev->alloc_dm(ib_dev, ucontext, &attr, attrs); + if (IS_ERR(dm)) + return PTR_ERR(dm); + + dm->device = ib_dev; + dm->length = attr.length; + dm->uobject = uobj; + atomic_set(&dm->usecnt, 0); + + uobj->object = dm; + + return 0; +} + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_DM_ALLOC, + &UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DM_HANDLE, UVERBS_OBJECT_DM, + UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_ALIGNMENT, + UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_DM_FREE, + uverbs_destroy_def_handler, + &UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DM_HANDLE, + UVERBS_OBJECT_DM, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DM, + /* 1 is used in order to free the DM after MRs */ + &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_dm), + &UVERBS_METHOD(UVERBS_METHOD_DM_ALLOC), + &UVERBS_METHOD(UVERBS_METHOD_DM_FREE)); diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c new file mode 100644 index 0000000..b4f016d --- /dev/null +++ 
b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "uverbs.h" +#include + +static int uverbs_free_flow_action(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_flow_action *action = uobject->object; + + if (why == RDMA_REMOVE_DESTROY && + atomic_read(&action->usecnt)) + return -EBUSY; + + return action->device->destroy_flow_action(action); +} + +static u64 esp_flags_uverbs_to_verbs(struct uverbs_attr_bundle *attrs, + u32 flags, bool is_modify) +{ + u64 verbs_flags = flags; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ESN)) + verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED; + + if (is_modify && uverbs_attr_is_valid(attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) + verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS; + + return verbs_flags; +}; + +static int validate_flow_action_esp_keymat_aes_gcm(struct ib_flow_action_attrs_esp_keymats *keymat) +{ + struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm = + &keymat->keymat.aes_gcm; + + if (aes_gcm->iv_algo > IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ) + return -EOPNOTSUPP; + + if (aes_gcm->key_len != 32 && + aes_gcm->key_len != 24 && + aes_gcm->key_len != 16) + return -EINVAL; + + if (aes_gcm->icv_len != 16 && + aes_gcm->icv_len != 8 && + aes_gcm->icv_len != 12) + return -EINVAL; + + return 0; +} + +static int (* const flow_action_esp_keymat_validate[])(struct ib_flow_action_attrs_esp_keymats *keymat) = { + [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = validate_flow_action_esp_keymat_aes_gcm, +}; + +static int flow_action_esp_replay_none(struct ib_flow_action_attrs_esp_replays *replay, + bool is_modify) +{ + /* This is used in order to modify an esp flow action with an enabled + * replay protection to a disabled one. This is only supported via + * modify, as in create verb we can simply drop the REPLAY attribute and + * achieve the same thing. + */ + return is_modify ? 0 : -EINVAL; +} + +static int flow_action_esp_replay_def_ok(struct ib_flow_action_attrs_esp_replays *replay, + bool is_modify) +{ + /* Some replay protections could always be enabled without validating + * anything. 
+ */ + return 0; +} + +static int (* const flow_action_esp_replay_validate[])(struct ib_flow_action_attrs_esp_replays *replay, + bool is_modify) = { + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = flow_action_esp_replay_none, + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = flow_action_esp_replay_def_ok, +}; + +static int parse_esp_ip(enum ib_flow_spec_type proto, + const void __user *val_ptr, + size_t len, union ib_flow_spec *out) +{ + int ret; + const struct ib_uverbs_flow_ipv4_filter ipv4 = { + .src_ip = cpu_to_be32(0xffffffffUL), + .dst_ip = cpu_to_be32(0xffffffffUL), + .proto = 0xff, + .tos = 0xff, + .ttl = 0xff, + .flags = 0xff, + }; + const struct ib_uverbs_flow_ipv6_filter ipv6 = { + .src_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + .dst_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + .flow_label = cpu_to_be32(0xffffffffUL), + .next_hdr = 0xff, + .traffic_class = 0xff, + .hop_limit = 0xff, + }; + union { + struct ib_uverbs_flow_ipv4_filter ipv4; + struct ib_uverbs_flow_ipv6_filter ipv6; + } user_val = {}; + const void *user_pmask; + size_t val_len; + + /* If the flow IPv4/IPv6 flow specifications are extended, the mask + * should be changed as well. + */ + BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv4_filter, flags) + + sizeof(ipv4.flags) != sizeof(ipv4)); + BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv6_filter, reserved) + + sizeof(ipv6.reserved) != sizeof(ipv6)); + + switch (proto) { + case IB_FLOW_SPEC_IPV4: + if (len > sizeof(user_val.ipv4) && + !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv4), + len - sizeof(user_val.ipv4))) + return -EOPNOTSUPP; + + val_len = min_t(size_t, len, sizeof(user_val.ipv4)); + ret = copy_from_user(&user_val.ipv4, val_ptr, + val_len); + if (ret) + return -EFAULT; + + user_pmask = &ipv4; + break; + case IB_FLOW_SPEC_IPV6: + if (len > sizeof(user_val.ipv6) && + !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv6), + len - sizeof(user_val.ipv6))) + return -EOPNOTSUPP; + + val_len = min_t(size_t, len, sizeof(user_val.ipv6)); + ret = copy_from_user(&user_val.ipv6, val_ptr, + val_len); + if (ret) + return -EFAULT; + + user_pmask = &ipv6; + break; + default: + return -EOPNOTSUPP; + } + + return ib_uverbs_kern_spec_to_ib_spec_filter(proto, user_pmask, + &user_val, + val_len, out); +} + +static int flow_action_esp_get_encap(struct ib_flow_spec_list *out, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_flow_action_esp_encap uverbs_encap; + int ret; + + ret = uverbs_copy_from(&uverbs_encap, attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP); + if (ret) + return ret; + + /* We currently support only one encap */ + if (uverbs_encap.next_ptr) + return -EOPNOTSUPP; + + if (uverbs_encap.type != IB_FLOW_SPEC_IPV4 && + uverbs_encap.type != IB_FLOW_SPEC_IPV6) + return -EOPNOTSUPP; + + return parse_esp_ip(uverbs_encap.type, + u64_to_user_ptr(uverbs_encap.val_ptr), + uverbs_encap.len, + &out->spec); +} + +struct ib_flow_action_esp_attr { + struct ib_flow_action_attrs_esp hdr; + struct ib_flow_action_attrs_esp_keymats keymat; + struct ib_flow_action_attrs_esp_replays replay; + /* We currently support only one spec */ + struct ib_flow_spec_list encap; +}; + +#define ESP_LAST_SUPPORTED_FLAG IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW +static int parse_flow_action_esp(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs, + struct ib_flow_action_esp_attr *esp_attr, + bool is_modify) +{ + struct 
ib_uverbs_flow_action_esp uverbs_esp = {}; + int ret; + + /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ + ret = uverbs_copy_from(&esp_attr->hdr.esn, attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ESN); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + /* This can be called from FLOW_ACTION_ESP_MODIFY where + * UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS is optional + */ + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) { + ret = uverbs_copy_from_or_zero(&uverbs_esp, attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS); + if (ret) + return ret; + + if (uverbs_esp.flags & ~((ESP_LAST_SUPPORTED_FLAG << 1) - 1)) + return -EOPNOTSUPP; + + esp_attr->hdr.spi = uverbs_esp.spi; + esp_attr->hdr.seq = uverbs_esp.seq; + esp_attr->hdr.tfc_pad = uverbs_esp.tfc_pad; + esp_attr->hdr.hard_limit_pkts = uverbs_esp.hard_limit_pkts; + } + esp_attr->hdr.flags = esp_flags_uverbs_to_verbs(attrs, uverbs_esp.flags, + is_modify); + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT)) { + esp_attr->keymat.protocol = + uverbs_attr_get_enum_id(attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT); + ret = uverbs_copy_from_or_zero(&esp_attr->keymat.keymat, + attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT); + if (ret) + return ret; + + ret = flow_action_esp_keymat_validate[esp_attr->keymat.protocol](&esp_attr->keymat); + if (ret) + return ret; + + esp_attr->hdr.keymat = &esp_attr->keymat; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY)) { + esp_attr->replay.protocol = + uverbs_attr_get_enum_id(attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY); + + ret = uverbs_copy_from_or_zero(&esp_attr->replay.replay, + attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY); + if (ret) + return ret; + + ret = flow_action_esp_replay_validate[esp_attr->replay.protocol](&esp_attr->replay, + is_modify); + if (ret) + return ret; + + esp_attr->hdr.replay = &esp_attr->replay; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP)) { + ret = flow_action_esp_get_encap(&esp_attr->encap, attrs); + if (ret) + return ret; + + esp_attr->hdr.encap = &esp_attr->encap; + } + + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + int ret; + struct ib_uobject *uobj; + struct ib_flow_action *action; + struct ib_flow_action_esp_attr esp_attr = {}; + + if (!ib_dev->create_flow_action_esp) + return -EOPNOTSUPP; + + ret = parse_flow_action_esp(ib_dev, file, attrs, &esp_attr, false); + if (ret) + return ret; + + /* No need to check as this attribute is marked as MANDATORY */ + uobj = uverbs_attr_get(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject; + action = ib_dev->create_flow_action_esp(ib_dev, &esp_attr.hdr, attrs); + if (IS_ERR(action)) + return PTR_ERR(action); + + atomic_set(&action->usecnt, 0); + action->device = ib_dev; + action->type = IB_FLOW_ACTION_ESP; + action->uobject = uobj; + uobj->object = action; + + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + int ret; + struct ib_uobject *uobj; + struct ib_flow_action *action; + struct ib_flow_action_esp_attr esp_attr = {}; + + if (!ib_dev->modify_flow_action_esp) + return -EOPNOTSUPP; + + ret = parse_flow_action_esp(ib_dev, file, attrs, &esp_attr, true); + if (ret) + return ret; + + uobj = uverbs_attr_get(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject; + action = 
uobj->object; + + if (action->type != IB_FLOW_ACTION_ESP) + return -EINVAL; + + return ib_dev->modify_flow_action_esp(action, + &esp_attr.hdr, + attrs); +} + +static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = { + [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = { + { .ptr = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_keymat_aes_gcm), + .flags = UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO, + } }, + }, +}; + +static const struct uverbs_attr_spec uverbs_flow_action_esp_replay[] = { + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = { + { .ptr = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + /* No need to specify any data */ + .len = 0, + } } + }, + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = { + { .ptr = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_replay_bmp, size), + .flags = UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO, + } } + }, +}; + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + &UVERBS_ATTR_IDR(UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, hard_limit_pkts), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY | + UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, UVERBS_ATTR_TYPE(__u32)), + &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + uverbs_flow_action_esp_keymat, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + uverbs_flow_action_esp_replay), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_encap, type))); + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, + &UVERBS_ATTR_IDR(UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_WRITE, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, hard_limit_pkts), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, UVERBS_ATTR_TYPE(__u32)), + &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + uverbs_flow_action_esp_keymat), + &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + uverbs_flow_action_esp_replay), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_encap, type))); + +static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_FLOW_ACTION_DESTROY, + uverbs_destroy_def_handler, + &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FLOW_ACTION, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow_action), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)); + diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c new file mode 100644 index 0000000..68f7cad --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "uverbs.h" +#include + +static int uverbs_free_mr(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + return ib_dereg_mr((struct ib_mr *)uobject->object); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_dm_mr_attr attr = {}; + struct ib_uobject *uobj; + struct ib_dm *dm; + struct ib_pd *pd; + struct ib_mr *mr; + int ret; + + if (!ib_dev->reg_dm_mr) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.offset, attrs, UVERBS_ATTR_REG_DM_MR_OFFSET); + if (ret) + return ret; + + ret = uverbs_copy_from(&attr.length, attrs, + UVERBS_ATTR_REG_DM_MR_LENGTH); + if (ret) + return ret; + + ret = uverbs_copy_from(&attr.access_flags, attrs, + UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS); + if (ret) + return ret; + + if (!(attr.access_flags & IB_ZERO_BASED)) + return -EINVAL; + + ret = ib_check_mr_access(attr.access_flags); + if (ret) + return ret; + + pd = uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_PD_HANDLE); + + dm = uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_DM_HANDLE); + + uobj = uverbs_attr_get(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE)->obj_attr.uobject; + + if (attr.offset > dm->length || attr.length > dm->length || + attr.length > dm->length - attr.offset) + return -EINVAL; + + mr = pd->device->reg_dm_mr(pd, dm, &attr, attrs); + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->device = pd->device; + mr->pd = pd; + mr->dm = dm; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + atomic_inc(&dm->usecnt); + + uobj->object = mr; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_LKEY, &mr->lkey, + sizeof(mr->lkey)); + if (ret) + goto err_dereg; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + if (ret) + goto err_dereg; + + return 0; + +err_dereg: + ib_dereg_mr(mr); + + return ret; +} + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_DM_MR_REG, + &UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE, UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + 
UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_PD_HANDLE, UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, + UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_DM_HANDLE, UVERBS_OBJECT_DM, + UVERBS_ACCESS_READ, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MR, + /* 1 is used in order to free the MR after all the MWs */ + &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr), + &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG)); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c new file mode 100644 index 0000000..6ddfb1f --- /dev/null +++ b/drivers/infiniband/core/verbs.c @@ -0,0 +1,2344 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "core_priv.h" + +static int ib_resolve_eth_dmac(struct ib_device *device, + struct rdma_ah_attr *ah_attr); + +static const char * const ib_events[] = { + [IB_EVENT_CQ_ERR] = "CQ error", + [IB_EVENT_QP_FATAL] = "QP fatal error", + [IB_EVENT_QP_REQ_ERR] = "QP request error", + [IB_EVENT_QP_ACCESS_ERR] = "QP access error", + [IB_EVENT_COMM_EST] = "communication established", + [IB_EVENT_SQ_DRAINED] = "send queue drained", + [IB_EVENT_PATH_MIG] = "path migration successful", + [IB_EVENT_PATH_MIG_ERR] = "path migration error", + [IB_EVENT_DEVICE_FATAL] = "device fatal error", + [IB_EVENT_PORT_ACTIVE] = "port active", + [IB_EVENT_PORT_ERR] = "port error", + [IB_EVENT_LID_CHANGE] = "LID change", + [IB_EVENT_PKEY_CHANGE] = "P_key change", + [IB_EVENT_SM_CHANGE] = "SM change", + [IB_EVENT_SRQ_ERR] = "SRQ error", + [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", + [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", + [IB_EVENT_CLIENT_REREGISTER] = "client reregister", + [IB_EVENT_GID_CHANGE] = "GID changed", +}; + +const char *__attribute_const__ ib_event_msg(enum ib_event_type event) +{ + size_t index = event; + + return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ? + ib_events[index] : "unrecognized event"; +} +EXPORT_SYMBOL(ib_event_msg); + +static const char * const wc_statuses[] = { + [IB_WC_SUCCESS] = "success", + [IB_WC_LOC_LEN_ERR] = "local length error", + [IB_WC_LOC_QP_OP_ERR] = "local QP operation error", + [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error", + [IB_WC_LOC_PROT_ERR] = "local protection error", + [IB_WC_WR_FLUSH_ERR] = "WR flushed", + [IB_WC_MW_BIND_ERR] = "memory management operation error", + [IB_WC_BAD_RESP_ERR] = "bad response error", + [IB_WC_LOC_ACCESS_ERR] = "local access error", + [IB_WC_REM_INV_REQ_ERR] = "invalid request error", + [IB_WC_REM_ACCESS_ERR] = "remote access error", + [IB_WC_REM_OP_ERR] = "remote operation error", + [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", + [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded", + [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error", + [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request", + [IB_WC_REM_ABORT_ERR] = "operation aborted", + [IB_WC_INV_EECN_ERR] = "invalid EE context number", + [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state", + [IB_WC_FATAL_ERR] = "fatal error", + [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error", + [IB_WC_GENERAL_ERR] = "general error", +}; + +const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status) +{ + size_t index = status; + + return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ? 
+ wc_statuses[index] : "unrecognized status"; +} +EXPORT_SYMBOL(ib_wc_status_msg); + +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 1; + case IB_RATE_5_GBPS: return 2; + case IB_RATE_10_GBPS: return 4; + case IB_RATE_20_GBPS: return 8; + case IB_RATE_30_GBPS: return 12; + case IB_RATE_40_GBPS: return 16; + case IB_RATE_60_GBPS: return 24; + case IB_RATE_80_GBPS: return 32; + case IB_RATE_120_GBPS: return 48; + case IB_RATE_14_GBPS: return 6; + case IB_RATE_56_GBPS: return 22; + case IB_RATE_112_GBPS: return 45; + case IB_RATE_168_GBPS: return 67; + case IB_RATE_25_GBPS: return 10; + case IB_RATE_100_GBPS: return 40; + case IB_RATE_200_GBPS: return 80; + case IB_RATE_300_GBPS: return 120; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mult); + +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult) +{ + switch (mult) { + case 1: return IB_RATE_2_5_GBPS; + case 2: return IB_RATE_5_GBPS; + case 4: return IB_RATE_10_GBPS; + case 8: return IB_RATE_20_GBPS; + case 12: return IB_RATE_30_GBPS; + case 16: return IB_RATE_40_GBPS; + case 24: return IB_RATE_60_GBPS; + case 32: return IB_RATE_80_GBPS; + case 48: return IB_RATE_120_GBPS; + case 6: return IB_RATE_14_GBPS; + case 22: return IB_RATE_56_GBPS; + case 45: return IB_RATE_112_GBPS; + case 67: return IB_RATE_168_GBPS; + case 10: return IB_RATE_25_GBPS; + case 40: return IB_RATE_100_GBPS; + case 80: return IB_RATE_200_GBPS; + case 120: return IB_RATE_300_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} +EXPORT_SYMBOL(mult_to_ib_rate); + +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 2500; + case IB_RATE_5_GBPS: return 5000; + case IB_RATE_10_GBPS: return 10000; + case IB_RATE_20_GBPS: return 20000; + case IB_RATE_30_GBPS: return 30000; + case IB_RATE_40_GBPS: return 40000; + case IB_RATE_60_GBPS: return 60000; + case IB_RATE_80_GBPS: return 80000; + case IB_RATE_120_GBPS: return 120000; + case IB_RATE_14_GBPS: return 14062; + case IB_RATE_56_GBPS: return 56250; + case IB_RATE_112_GBPS: return 112500; + case IB_RATE_168_GBPS: return 168750; + case IB_RATE_25_GBPS: return 25781; + case IB_RATE_100_GBPS: return 103125; + case IB_RATE_200_GBPS: return 206250; + case IB_RATE_300_GBPS: return 309375; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mbps); + +__attribute_const__ enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type) +{ + + if (node_type == RDMA_NODE_USNIC) + return RDMA_TRANSPORT_USNIC; + if (node_type == RDMA_NODE_USNIC_UDP) + return RDMA_TRANSPORT_USNIC_UDP; + if (node_type == RDMA_NODE_RNIC) + return RDMA_TRANSPORT_IWARP; + + return RDMA_TRANSPORT_IB; +} +EXPORT_SYMBOL(rdma_node_get_transport); + +enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) +{ + enum rdma_transport_type lt; + if (device->get_link_layer) + return device->get_link_layer(device, port_num); + + lt = rdma_node_get_transport(device->node_type); + if (lt == RDMA_TRANSPORT_IB) + return IB_LINK_LAYER_INFINIBAND; + + return IB_LINK_LAYER_ETHERNET; +} +EXPORT_SYMBOL(rdma_port_get_link_layer); + +/* Protection domains */ + +/** + * ib_alloc_pd - Allocates an unused protection domain. + * @device: The device on which to allocate the protection domain. + * + * A protection domain object provides an association between QPs, shared + * receive queues, address handles, memory regions, and memory windows. 
+ * + * Every PD has a local_dma_lkey which can be used as the lkey value for local + * memory operations. + */ +struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, + const char *caller) +{ + struct ib_pd *pd; + int mr_access_flags = 0; + + pd = device->alloc_pd(device, NULL, NULL); + if (IS_ERR(pd)) + return pd; + + pd->device = device; + pd->uobject = NULL; + pd->__internal_mr = NULL; + atomic_set(&pd->usecnt, 0); + pd->flags = flags; + + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + pd->local_dma_lkey = device->local_dma_lkey; + else + mr_access_flags |= IB_ACCESS_LOCAL_WRITE; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + pr_warn("%s: enabling unsafe global rkey\n", caller); + mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; + } + + pd->res.type = RDMA_RESTRACK_PD; + pd->res.kern_name = caller; + rdma_restrack_add(&pd->res); + + if (mr_access_flags) { + struct ib_mr *mr; + + mr = pd->device->get_dma_mr(pd, mr_access_flags); + if (IS_ERR(mr)) { + ib_dealloc_pd(pd); + return ERR_CAST(mr); + } + + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + mr->need_inval = false; + + pd->__internal_mr = mr; + + if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) + pd->local_dma_lkey = pd->__internal_mr->lkey; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) + pd->unsafe_global_rkey = pd->__internal_mr->rkey; + } + + return pd; +} +EXPORT_SYMBOL(__ib_alloc_pd); + +/** + * ib_dealloc_pd - Deallocates a protection domain. + * @pd: The protection domain to deallocate. + * + * It is an error to call this function while any resources in the pd still + * exist. The caller is responsible to synchronously destroy them and + * guarantee no new allocations will happen. + */ +void ib_dealloc_pd(struct ib_pd *pd) +{ + int ret; + + if (pd->__internal_mr) { + ret = pd->device->dereg_mr(pd->__internal_mr); + WARN_ON(ret); + pd->__internal_mr = NULL; + } + + /* uverbs manipulates usecnt with proper locking, while the kabi + requires the caller to guarantee we can't race here. */ + WARN_ON(atomic_read(&pd->usecnt)); + + rdma_restrack_del(&pd->res); + /* Making delalloc_pd a void return is a WIP, no driver should return + an error here. */ + ret = pd->device->dealloc_pd(pd); + WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); +} +EXPORT_SYMBOL(ib_dealloc_pd); + +/* Address handles */ + +static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) +{ + struct ib_ah *ah; + + ah = pd->device->create_ah(pd, ah_attr, udata); + + if (!IS_ERR(ah)) { + ah->device = pd->device; + ah->pd = pd; + ah->uobject = NULL; + ah->type = ah_attr->type; + atomic_inc(&pd->usecnt); + } + + return ah; +} + +struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr) +{ + return _rdma_create_ah(pd, ah_attr, NULL); +} +EXPORT_SYMBOL(rdma_create_ah); + +/** + * rdma_create_user_ah - Creates an address handle for the + * given address vector. + * It resolves destination mac address for ah attribute of RoCE type. + * @pd: The protection domain associated with the address handle. + * @ah_attr: The attributes of the address vector. + * @udata: pointer to user's input output buffer information need by + * provider driver. + * + * It returns 0 on success and returns appropriate error code on error. + * The address handle is used to reference a local or global destination + * in all UD QP post sends. 
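+ *
+ * A minimal call sequence sketch (assuming @pd, @ah_attr and @udata were
+ * already set up by the uverbs layer):
+ *
+ *   ah = rdma_create_user_ah(pd, &ah_attr, udata);
+ *   if (IS_ERR(ah))
+ *       return PTR_ERR(ah);
+ *   ...
+ *   rdma_destroy_ah(ah);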
+ */ +struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) +{ + int err; + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + err = ib_resolve_eth_dmac(pd->device, ah_attr); + if (err) + return ERR_PTR(err); + } + + return _rdma_create_ah(pd, ah_attr, udata); +} +EXPORT_SYMBOL(rdma_create_user_ah); + +int ib_get_rdma_header_version(const union rdma_network_hdr *hdr) +{ + const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh; + struct iphdr ip4h_checked; + const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh; + + /* If it's IPv6, the version must be 6, otherwise, the first + * 20 bytes (before the IPv4 header) are garbled. + */ + if (ip6h->version != 6) + return (ip4h->version == 4) ? 4 : 0; + /* version may be 6 or 4 because the first 20 bytes could be garbled */ + + /* RoCE v2 requires no options, thus header length + * must be 5 words + */ + if (ip4h->ihl != 5) + return 6; + + /* Verify checksum. + * We can't write on scattered buffers so we need to copy to + * temp buffer. + */ + memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); + ip4h_checked.check = 0; + ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5); + /* if IPv4 header checksum is OK, believe it */ + if (ip4h->check == ip4h_checked.check) + return 4; + return 6; +} +EXPORT_SYMBOL(ib_get_rdma_header_version); + +static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, + u8 port_num, + const struct ib_grh *grh) +{ + int grh_version; + + if (rdma_protocol_ib(device, port_num)) + return RDMA_NETWORK_IB; + + grh_version = ib_get_rdma_header_version((union rdma_network_hdr *)grh); + + if (grh_version == 4) + return RDMA_NETWORK_IPV4; + + if (grh->next_hdr == IPPROTO_UDP) + return RDMA_NETWORK_IPV6; + + return RDMA_NETWORK_ROCE_V1; +} + +struct find_gid_index_context { + u16 vlan_id; + enum ib_gid_type gid_type; +}; + +static bool find_gid_index(const union ib_gid *gid, + const struct ib_gid_attr *gid_attr, + void *context) +{ + struct find_gid_index_context *ctx = context; + + if (ctx->gid_type != gid_attr->gid_type) + return false; + + if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) || + (is_vlan_dev(gid_attr->ndev) && + vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id)) + return false; + + return true; +} + +static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num, + u16 vlan_id, const union ib_gid *sgid, + enum ib_gid_type gid_type, + u16 *gid_index) +{ + struct find_gid_index_context context = {.vlan_id = vlan_id, + .gid_type = gid_type}; + + return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index, + &context, gid_index); +} + +int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, + enum rdma_network_type net_type, + union ib_gid *sgid, union ib_gid *dgid) +{ + struct sockaddr_in src_in; + struct sockaddr_in dst_in; + __be32 src_saddr, dst_saddr; + + if (!sgid || !dgid) + return -EINVAL; + + if (net_type == RDMA_NETWORK_IPV4) { + memcpy(&src_in.sin_addr.s_addr, + &hdr->roce4grh.saddr, 4); + memcpy(&dst_in.sin_addr.s_addr, + &hdr->roce4grh.daddr, 4); + src_saddr = src_in.sin_addr.s_addr; + dst_saddr = dst_in.sin_addr.s_addr; + ipv6_addr_set_v4mapped(src_saddr, + (struct in6_addr *)sgid); + ipv6_addr_set_v4mapped(dst_saddr, + (struct in6_addr *)dgid); + return 0; + } else if (net_type == RDMA_NETWORK_IPV6 || + net_type == RDMA_NETWORK_IB) { + *dgid = hdr->ibgrh.dgid; + *sgid = hdr->ibgrh.sgid; + return 0; + } else { + return -EINVAL; + } +} +EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); 
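+
+/*
+ * A minimal sketch of how a consumer might use the helper above, assuming it
+ * already holds the GRH of a received packet and knows the network type:
+ *
+ *   union ib_gid sgid, dgid;
+ *   int ret;
+ *
+ *   ret = ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh,
+ *                                   RDMA_NETWORK_IB, &sgid, &dgid);
+ *   if (!ret)
+ *       ... sgid/dgid now hold the sender's and the receiver's GIDs ...
+ *
+ * In practice most consumers go through ib_create_ah_from_wc() below, which
+ * performs this step internally via ib_init_ah_attr_from_wc().
+ */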
+ +/* Resolve destination mac address and hop limit for unicast destination + * GID entry, considering the source GID entry as well. + * ah_attribute must have have valid port_num, sgid_index. + */ +static int ib_resolve_unicast_gid_dmac(struct ib_device *device, + struct rdma_ah_attr *ah_attr) +{ + struct ib_gid_attr sgid_attr; + struct ib_global_route *grh; + int hop_limit = 0xff; + union ib_gid sgid; + int ret; + + grh = rdma_ah_retrieve_grh(ah_attr); + + ret = ib_query_gid(device, + rdma_ah_get_port_num(ah_attr), + grh->sgid_index, + &sgid, &sgid_attr); + if (ret || !sgid_attr.ndev) { + if (!ret) + ret = -ENXIO; + return ret; + } + + /* If destination is link local and source GID is RoCEv1, + * IP stack is not used. + */ + if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) && + sgid_attr.gid_type == IB_GID_TYPE_ROCE) { + rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, + ah_attr->roce.dmac); + goto done; + } + + ret = rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid, + ah_attr->roce.dmac, + sgid_attr.ndev, &hop_limit); +done: + dev_put(sgid_attr.ndev); + + grh->hop_limit = hop_limit; + return ret; +} + +/* + * This function initializes address handle attributes from the incoming packet. + * Incoming packet has dgid of the receiver node on which this code is + * getting executed and, sgid contains the GID of the sender. + * + * When resolving mac address of destination, the arrived dgid is used + * as sgid and, sgid is used as dgid because sgid contains destinations + * GID whom to respond to. + * + */ +int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct rdma_ah_attr *ah_attr) +{ + u32 flow_class; + u16 gid_index; + int ret; + enum rdma_network_type net_type = RDMA_NETWORK_IB; + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + int hoplimit = 0xff; + union ib_gid dgid; + union ib_gid sgid; + + might_sleep(); + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->type = rdma_ah_find_type(device, port_num); + if (rdma_cap_eth_ah(device, port_num)) { + if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) + net_type = wc->network_hdr_type; + else + net_type = ib_get_net_type_by_grh(device, port_num, grh); + gid_type = ib_network_to_gid_type(net_type); + } + ret = ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type, + &sgid, &dgid); + if (ret) + return ret; + + rdma_ah_set_sl(ah_attr, wc->sl); + rdma_ah_set_port_num(ah_attr, port_num); + + if (rdma_protocol_roce(device, port_num)) { + u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? 
+ wc->vlan_id : 0xffff; + + if (!(wc->wc_flags & IB_WC_GRH)) + return -EPROTOTYPE; + + ret = get_sgid_index_from_eth(device, port_num, + vlan_id, &dgid, + gid_type, &gid_index); + if (ret) + return ret; + + flow_class = be32_to_cpu(grh->version_tclass_flow); + rdma_ah_set_grh(ah_attr, &sgid, + flow_class & 0xFFFFF, + (u8)gid_index, hoplimit, + (flow_class >> 20) & 0xFF); + return ib_resolve_unicast_gid_dmac(device, ah_attr); + } else { + rdma_ah_set_dlid(ah_attr, wc->slid); + rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); + + if (wc->wc_flags & IB_WC_GRH) { + if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { + ret = ib_find_cached_gid_by_port(device, &dgid, + IB_GID_TYPE_IB, + port_num, NULL, + &gid_index); + if (ret) + return ret; + } else { + gid_index = 0; + } + + flow_class = be32_to_cpu(grh->version_tclass_flow); + rdma_ah_set_grh(ah_attr, &sgid, + flow_class & 0xFFFFF, + (u8)gid_index, hoplimit, + (flow_class >> 20) & 0xFF); + } + return 0; + } +} +EXPORT_SYMBOL(ib_init_ah_attr_from_wc); + +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, + const struct ib_grh *grh, u8 port_num) +{ + struct rdma_ah_attr ah_attr; + int ret; + + ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr); + if (ret) + return ERR_PTR(ret); + + return rdma_create_ah(pd, &ah_attr); +} +EXPORT_SYMBOL(ib_create_ah_from_wc); + +int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) +{ + if (ah->type != ah_attr->type) + return -EINVAL; + + return ah->device->modify_ah ? + ah->device->modify_ah(ah, ah_attr) : + -EOPNOTSUPP; +} +EXPORT_SYMBOL(rdma_modify_ah); + +int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) +{ + return ah->device->query_ah ? + ah->device->query_ah(ah, ah_attr) : + -EOPNOTSUPP; +} +EXPORT_SYMBOL(rdma_query_ah); + +int rdma_destroy_ah(struct ib_ah *ah) +{ + struct ib_pd *pd; + int ret; + + pd = ah->pd; + ret = ah->device->destroy_ah(ah); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(rdma_destroy_ah); + +/* Shared receive queues */ + +struct ib_srq *ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr) +{ + struct ib_srq *srq; + + if (!pd->device->create_srq) + return ERR_PTR(-EOPNOTSUPP); + + srq = pd->device->create_srq(pd, srq_init_attr, NULL); + + if (!IS_ERR(srq)) { + srq->device = pd->device; + srq->pd = pd; + srq->uobject = NULL; + srq->event_handler = srq_init_attr->event_handler; + srq->srq_context = srq_init_attr->srq_context; + srq->srq_type = srq_init_attr->srq_type; + if (ib_srq_has_cq(srq->srq_type)) { + srq->ext.cq = srq_init_attr->ext.cq; + atomic_inc(&srq->ext.cq->usecnt); + } + if (srq->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; + atomic_inc(&srq->ext.xrc.xrcd->usecnt); + } + atomic_inc(&pd->usecnt); + atomic_set(&srq->usecnt, 0); + } + + return srq; +} +EXPORT_SYMBOL(ib_create_srq); + +int ib_modify_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask) +{ + return srq->device->modify_srq ? + srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) : + -EOPNOTSUPP; +} +EXPORT_SYMBOL(ib_modify_srq); + +int ib_query_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr) +{ + return srq->device->query_srq ? 
+ srq->device->query_srq(srq, srq_attr) : -EOPNOTSUPP; +} +EXPORT_SYMBOL(ib_query_srq); + +int ib_destroy_srq(struct ib_srq *srq) +{ + struct ib_pd *pd; + enum ib_srq_type srq_type; + struct ib_xrcd *uninitialized_var(xrcd); + struct ib_cq *uninitialized_var(cq); + int ret; + + if (atomic_read(&srq->usecnt)) + return -EBUSY; + + pd = srq->pd; + srq_type = srq->srq_type; + if (ib_srq_has_cq(srq_type)) + cq = srq->ext.cq; + if (srq_type == IB_SRQT_XRC) + xrcd = srq->ext.xrc.xrcd; + + ret = srq->device->destroy_srq(srq); + if (!ret) { + atomic_dec(&pd->usecnt); + if (srq_type == IB_SRQT_XRC) + atomic_dec(&xrcd->usecnt); + if (ib_srq_has_cq(srq_type)) + atomic_dec(&cq->usecnt); + } + + return ret; +} +EXPORT_SYMBOL(ib_destroy_srq); + +/* Queue pairs */ + +static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) +{ + struct ib_qp *qp = context; + unsigned long flags; + + spin_lock_irqsave(&qp->device->event_handler_lock, flags); + list_for_each_entry(event->element.qp, &qp->open_list, open_list) + if (event->element.qp->event_handler) + event->element.qp->event_handler(event, event->element.qp->qp_context); + spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); +} + +static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) +{ + mutex_lock(&xrcd->tgt_qp_mutex); + list_add(&qp->xrcd_list, &xrcd->tgt_qp_list); + mutex_unlock(&xrcd->tgt_qp_mutex); +} + +static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, + void (*event_handler)(struct ib_event *, void *), + void *qp_context) +{ + struct ib_qp *qp; + unsigned long flags; + int err; + + qp = kzalloc(sizeof *qp, GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->real_qp = real_qp; + err = ib_open_shared_qp_security(qp, real_qp->device); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->real_qp = real_qp; + atomic_inc(&real_qp->usecnt); + qp->device = real_qp->device; + qp->event_handler = event_handler; + qp->qp_context = qp_context; + qp->qp_num = real_qp->qp_num; + qp->qp_type = real_qp->qp_type; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_add(&qp->open_list, &real_qp->open_list); + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + return qp; +} + +struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, + struct ib_qp_open_attr *qp_open_attr) +{ + struct ib_qp *qp, *real_qp; + + if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) + return ERR_PTR(-EINVAL); + + qp = ERR_PTR(-EINVAL); + mutex_lock(&xrcd->tgt_qp_mutex); + list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) { + if (real_qp->qp_num == qp_open_attr->qp_num) { + qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, + qp_open_attr->qp_context); + break; + } + } + mutex_unlock(&xrcd->tgt_qp_mutex); + return qp; +} +EXPORT_SYMBOL(ib_open_qp); + +static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_qp *real_qp = qp; + + qp->event_handler = __ib_shared_qp_event_handler; + qp->qp_context = qp; + qp->pd = NULL; + qp->send_cq = qp->recv_cq = NULL; + qp->srq = NULL; + qp->xrcd = qp_init_attr->xrcd; + atomic_inc(&qp_init_attr->xrcd->usecnt); + INIT_LIST_HEAD(&qp->open_list); + + qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, + qp_init_attr->qp_context); + if (!IS_ERR(qp)) + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); + else + real_qp->device->destroy_qp(real_qp); + return qp; +} + +struct ib_qp *ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_device *device = pd ? 
pd->device : qp_init_attr->xrcd->device; + struct ib_qp *qp; + int ret; + + if (qp_init_attr->rwq_ind_tbl && + (qp_init_attr->recv_cq || + qp_init_attr->srq || qp_init_attr->cap.max_recv_wr || + qp_init_attr->cap.max_recv_sge)) + return ERR_PTR(-EINVAL); + + /* + * If the callers is using the RDMA API calculate the resources + * needed for the RDMA READ/WRITE operations. + * + * Note that these callers need to pass in a port number. + */ + if (qp_init_attr->cap.max_rdma_ctxs) + rdma_rw_init_qp(device, qp_init_attr); + + qp = _ib_create_qp(device, pd, qp_init_attr, NULL, NULL); + if (IS_ERR(qp)) + return qp; + + ret = ib_create_qp_security(qp, device); + if (ret) { + ib_destroy_qp(qp); + return ERR_PTR(ret); + } + + qp->real_qp = qp; + qp->qp_type = qp_init_attr->qp_type; + qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl; + + atomic_set(&qp->usecnt, 0); + qp->mrs_used = 0; + spin_lock_init(&qp->mr_lock); + INIT_LIST_HEAD(&qp->rdma_mrs); + INIT_LIST_HEAD(&qp->sig_mrs); + qp->port = 0; + + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) + return ib_create_xrc_qp(qp, qp_init_attr); + + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; + if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { + qp->recv_cq = NULL; + qp->srq = NULL; + } else { + qp->recv_cq = qp_init_attr->recv_cq; + if (qp_init_attr->recv_cq) + atomic_inc(&qp_init_attr->recv_cq->usecnt); + qp->srq = qp_init_attr->srq; + if (qp->srq) + atomic_inc(&qp_init_attr->srq->usecnt); + } + + qp->send_cq = qp_init_attr->send_cq; + qp->xrcd = NULL; + + atomic_inc(&pd->usecnt); + if (qp_init_attr->send_cq) + atomic_inc(&qp_init_attr->send_cq->usecnt); + if (qp_init_attr->rwq_ind_tbl) + atomic_inc(&qp->rwq_ind_tbl->usecnt); + + if (qp_init_attr->cap.max_rdma_ctxs) { + ret = rdma_rw_init_mrs(qp, qp_init_attr); + if (ret) { + pr_err("failed to init MR pool ret= %d\n", ret); + ib_destroy_qp(qp); + return ERR_PTR(ret); + } + } + + /* + * Note: all hw drivers guarantee that max_send_sge is lower than + * the device RDMA WRITE SGE limit but not all hw drivers ensure that + * max_send_sge <= max_sge_rd. 
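+ * For that reason max_write_sge below simply mirrors the requested send SGE
+ * count, while max_read_sge is additionally clamped to the device's
+ * reported max_sge_rd.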
+ */ + qp->max_write_sge = qp_init_attr->cap.max_send_sge; + qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, + device->attrs.max_sge_rd); + + return qp; +} +EXPORT_SYMBOL(ib_create_qp); + +static const struct { + int valid; + enum ib_qp_attr_mask req_param[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; +} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_RAW_PACKET] = IB_QP_PORT, + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + [IB_QPS_RTR] = { + .valid = 1, + .req_param = { + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_RC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_XRC_TGT] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_RC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + }, + }, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = IB_QP_SQ_PSN, + [IB_QPT_UC] = IB_QP_SQ_PSN, + [IB_QPT_RC] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | + IB_QP_SQ_PSN), + [IB_QPT_SMI] = IB_QP_SQ_PSN, + [IB_QPT_GSI] = IB_QP_SQ_PSN, + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + 
IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT, + } + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT, + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */ + [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY + } + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + } + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = 
(IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 } + } +}; + +bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (mask & IB_QP_CUR_STATE && + cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) + return false; + + if (!qp_state_table[cur_state][next_state].valid) + return false; + + req_param = qp_state_table[cur_state][next_state].req_param[type]; + opt_param = qp_state_table[cur_state][next_state].opt_param[type]; + + if ((mask & req_param) != req_param) + return false; + + if (mask & ~(req_param | opt_param | IB_QP_STATE)) + return false; + + return true; +} +EXPORT_SYMBOL(ib_modify_qp_is_ok); + +static int ib_resolve_eth_dmac(struct ib_device *device, + struct rdma_ah_attr *ah_attr) +{ + int ret = 0; + struct ib_global_route *grh; + + if (!rdma_is_port_valid(device, rdma_ah_get_port_num(ah_attr))) + return -EINVAL; + + grh = rdma_ah_retrieve_grh(ah_attr); + + if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { + if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { + __be32 addr = 0; + + memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4); + ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac); + } else { + ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw, + (char *)ah_attr->roce.dmac); + } + } else { + ret = ib_resolve_unicast_gid_dmac(device, ah_attr); + } + return ret; +} + +/** + * IB core internal function to perform QP attributes modification. + */ +static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + int ret; + + if (rdma_ib_or_roce(qp->device, port)) { + if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { + pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n", + __func__, qp->device->name); + attr->rq_psn &= 0xffffff; + } + + if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { + pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n", + __func__, qp->device->name); + attr->sq_psn &= 0xffffff; + } + } + + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); + if (!ret && (attr_mask & IB_QP_PORT)) + qp->port = attr->port_num; + + return ret; +} + +static bool is_qp_type_connected(const struct ib_qp *qp) +{ + return (qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_RC || + qp->qp_type == IB_QPT_XRC_INI || + qp->qp_type == IB_QPT_XRC_TGT); +} + +/** + * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. + * @ib_qp: The QP to modify. + * @attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + * @udata: pointer to user's input output buffer information + * are being modified. + * It returns 0 on success and returns appropriate error code on error. 
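+ *
+ * A minimal RESET-to-INIT sketch for an RC QP (mask bits chosen to match the
+ * qp_state_table above; the port number and access flags are illustrative):
+ *
+ *   struct ib_qp_attr attr = {
+ *       .qp_state        = IB_QPS_INIT,
+ *       .pkey_index      = 0,
+ *       .port_num        = 1,
+ *       .qp_access_flags = IB_ACCESS_REMOTE_WRITE,
+ *   };
+ *
+ *   ret = ib_modify_qp_with_udata(ib_qp, &attr,
+ *                                 IB_QP_STATE | IB_QP_PKEY_INDEX |
+ *                                 IB_QP_PORT | IB_QP_ACCESS_FLAGS, udata);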
+ */ +int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ib_qp *qp = ib_qp->real_qp; + int ret; + + if (attr_mask & IB_QP_AV && + attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && + is_qp_type_connected(qp)) { + ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); + if (ret) + return ret; + } + return _ib_modify_qp(qp, attr, attr_mask, udata); +} +EXPORT_SYMBOL(ib_modify_qp_with_udata); + +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) +{ + int rc; + u32 netdev_speed; + struct net_device *netdev; + struct ethtool_link_ksettings lksettings; + + if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + if (!dev->get_netdev) + return -EOPNOTSUPP; + + netdev = dev->get_netdev(dev, port_num); + if (!netdev) + return -ENODEV; + + rtnl_lock(); + rc = __ethtool_get_link_ksettings(netdev, &lksettings); + rtnl_unlock(); + + dev_put(netdev); + + if (!rc) { + netdev_speed = lksettings.base.speed; + } else { + netdev_speed = SPEED_1000; + pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name, + netdev_speed); + } + + if (netdev_speed <= SPEED_1000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_SDR; + } else if (netdev_speed <= SPEED_10000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_FDR10; + } else if (netdev_speed <= SPEED_20000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_DDR; + } else if (netdev_speed <= SPEED_25000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } else if (netdev_speed <= SPEED_40000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } + + return 0; +} +EXPORT_SYMBOL(ib_get_eth_speed); + +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); +} +EXPORT_SYMBOL(ib_modify_qp); + +int ib_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + return qp->device->query_qp ? 
+ qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : + -EOPNOTSUPP; +} +EXPORT_SYMBOL(ib_query_qp); + +int ib_close_qp(struct ib_qp *qp) +{ + struct ib_qp *real_qp; + unsigned long flags; + + real_qp = qp->real_qp; + if (real_qp == qp) + return -EINVAL; + + spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + list_del(&qp->open_list); + spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + + atomic_dec(&real_qp->usecnt); + if (qp->qp_sec) + ib_close_shared_qp_security(qp->qp_sec); + kfree(qp); + + return 0; +} +EXPORT_SYMBOL(ib_close_qp); + +static int __ib_destroy_shared_qp(struct ib_qp *qp) +{ + struct ib_xrcd *xrcd; + struct ib_qp *real_qp; + int ret; + + real_qp = qp->real_qp; + xrcd = real_qp->xrcd; + + mutex_lock(&xrcd->tgt_qp_mutex); + ib_close_qp(qp); + if (atomic_read(&real_qp->usecnt) == 0) + list_del(&real_qp->xrcd_list); + else + real_qp = NULL; + mutex_unlock(&xrcd->tgt_qp_mutex); + + if (real_qp) { + ret = ib_destroy_qp(real_qp); + if (!ret) + atomic_dec(&xrcd->usecnt); + else + __ib_insert_xrcd_qp(xrcd, real_qp); + } + + return 0; +} + +int ib_destroy_qp(struct ib_qp *qp) +{ + struct ib_pd *pd; + struct ib_cq *scq, *rcq; + struct ib_srq *srq; + struct ib_rwq_ind_table *ind_tbl; + struct ib_qp_security *sec; + int ret; + + WARN_ON_ONCE(qp->mrs_used > 0); + + if (atomic_read(&qp->usecnt)) + return -EBUSY; + + if (qp->real_qp != qp) + return __ib_destroy_shared_qp(qp); + + pd = qp->pd; + scq = qp->send_cq; + rcq = qp->recv_cq; + srq = qp->srq; + ind_tbl = qp->rwq_ind_tbl; + sec = qp->qp_sec; + if (sec) + ib_destroy_qp_security_begin(sec); + + if (!qp->uobject) + rdma_rw_cleanup_mrs(qp); + + rdma_restrack_del(&qp->res); + ret = qp->device->destroy_qp(qp); + if (!ret) { + if (pd) + atomic_dec(&pd->usecnt); + if (scq) + atomic_dec(&scq->usecnt); + if (rcq) + atomic_dec(&rcq->usecnt); + if (srq) + atomic_dec(&srq->usecnt); + if (ind_tbl) + atomic_dec(&ind_tbl->usecnt); + if (sec) + ib_destroy_qp_security_end(sec); + } else { + if (sec) + ib_destroy_qp_security_abort(sec); + } + + return ret; +} +EXPORT_SYMBOL(ib_destroy_qp); + +/* Completion queues */ + +struct ib_cq *ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, + const struct ib_cq_init_attr *cq_attr) +{ + struct ib_cq *cq; + + cq = device->create_cq(device, cq_attr, NULL, NULL); + + if (!IS_ERR(cq)) { + cq->device = device; + cq->uobject = NULL; + cq->comp_handler = comp_handler; + cq->event_handler = event_handler; + cq->cq_context = cq_context; + atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); + } + + return cq; +} +EXPORT_SYMBOL(ib_create_cq); + +int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + return cq->device->modify_cq ? + cq->device->modify_cq(cq, cq_count, cq_period) : -EOPNOTSUPP; +} +EXPORT_SYMBOL(rdma_set_cq_moderation); + +int ib_destroy_cq(struct ib_cq *cq) +{ + if (atomic_read(&cq->usecnt)) + return -EBUSY; + + rdma_restrack_del(&cq->res); + return cq->device->destroy_cq(cq); +} +EXPORT_SYMBOL(ib_destroy_cq); + +int ib_resize_cq(struct ib_cq *cq, int cqe) +{ + return cq->device->resize_cq ? 
+ cq->device->resize_cq(cq, cqe, NULL) : -EOPNOTSUPP; +} +EXPORT_SYMBOL(ib_resize_cq); + +/* Memory regions */ + +int ib_dereg_mr(struct ib_mr *mr) +{ + struct ib_pd *pd = mr->pd; + struct ib_dm *dm = mr->dm; + int ret; + + rdma_restrack_del(&mr->res); + ret = mr->device->dereg_mr(mr); + if (!ret) { + atomic_dec(&pd->usecnt); + if (dm) + atomic_dec(&dm->usecnt); + } + + return ret; +} +EXPORT_SYMBOL(ib_dereg_mr); + +/** + * ib_alloc_mr() - Allocates a memory region + * @pd: protection domain associated with the region + * @mr_type: memory region type + * @max_num_sg: maximum sg entries available for registration. + * + * Notes: + * Memory registeration page/sg lists must not exceed max_num_sg. + * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed + * max_num_sg * used_page_size. + * + */ +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct ib_mr *mr; + + if (!pd->device->alloc_mr) + return ERR_PTR(-EOPNOTSUPP); + + mr = pd->device->alloc_mr(pd, mr_type, max_num_sg); + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + mr->res.type = RDMA_RESTRACK_MR; + rdma_restrack_add(&mr->res); + } + + return mr; +} +EXPORT_SYMBOL(ib_alloc_mr); + +/* "Fast" memory regions */ + +struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ib_fmr *fmr; + + if (!pd->device->alloc_fmr) + return ERR_PTR(-EOPNOTSUPP); + + fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr); + if (!IS_ERR(fmr)) { + fmr->device = pd->device; + fmr->pd = pd; + atomic_inc(&pd->usecnt); + } + + return fmr; +} +EXPORT_SYMBOL(ib_alloc_fmr); + +int ib_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *fmr; + + if (list_empty(fmr_list)) + return 0; + + fmr = list_entry(fmr_list->next, struct ib_fmr, list); + return fmr->device->unmap_fmr(fmr_list); +} +EXPORT_SYMBOL(ib_unmap_fmr); + +int ib_dealloc_fmr(struct ib_fmr *fmr) +{ + struct ib_pd *pd; + int ret; + + pd = fmr->pd; + ret = fmr->device->dealloc_fmr(fmr); + if (!ret) + atomic_dec(&pd->usecnt); + + return ret; +} +EXPORT_SYMBOL(ib_dealloc_fmr); + +/* Multicast groups */ + +static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) +{ + struct ib_qp_init_attr init_attr = {}; + struct ib_qp_attr attr = {}; + int num_eth_ports = 0; + int port; + + /* If QP state >= init, it is assigned to a port and we can check this + * port only. + */ + if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) { + if (attr.qp_state >= IB_QPS_INIT) { + if (rdma_port_get_link_layer(qp->device, attr.port_num) != + IB_LINK_LAYER_INFINIBAND) + return true; + goto lid_check; + } + } + + /* Can't get a quick answer, iterate over all ports */ + for (port = 0; port < qp->device->phys_port_cnt; port++) + if (rdma_port_get_link_layer(qp->device, port) != + IB_LINK_LAYER_INFINIBAND) + num_eth_ports++; + + /* If we have at lease one Ethernet port, RoCE annex declares that + * multicast LID should be ignored. We can't tell at this step if the + * QP belongs to an IB or Ethernet port. + */ + if (num_eth_ports) + return true; + + /* If all the ports are IB, we can check according to IB spec. 
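+ * A multicast LID is valid only if it is at or above IB_MULTICAST_LID_BASE
+ * and is not the permissive LID, which is what the range check below
+ * enforces.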
*/ +lid_check: + return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || + lid == be16_to_cpu(IB_LID_PERMISSIVE)); +} + +int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) +{ + int ret; + + if (!qp->device->attach_mcast) + return -EOPNOTSUPP; + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) + return -EINVAL; + + ret = qp->device->attach_mcast(qp, gid, lid); + if (!ret) + atomic_inc(&qp->usecnt); + return ret; +} +EXPORT_SYMBOL(ib_attach_mcast); + +int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) +{ + int ret; + + if (!qp->device->detach_mcast) + return -EOPNOTSUPP; + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) + return -EINVAL; + + ret = qp->device->detach_mcast(qp, gid, lid); + if (!ret) + atomic_dec(&qp->usecnt); + return ret; +} +EXPORT_SYMBOL(ib_detach_mcast); + +struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller) +{ + struct ib_xrcd *xrcd; + + if (!device->alloc_xrcd) + return ERR_PTR(-EOPNOTSUPP); + + xrcd = device->alloc_xrcd(device, NULL, NULL); + if (!IS_ERR(xrcd)) { + xrcd->device = device; + xrcd->inode = NULL; + atomic_set(&xrcd->usecnt, 0); + mutex_init(&xrcd->tgt_qp_mutex); + INIT_LIST_HEAD(&xrcd->tgt_qp_list); + } + + return xrcd; +} +EXPORT_SYMBOL(__ib_alloc_xrcd); + +int ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct ib_qp *qp; + int ret; + + if (atomic_read(&xrcd->usecnt)) + return -EBUSY; + + while (!list_empty(&xrcd->tgt_qp_list)) { + qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list); + ret = ib_destroy_qp(qp); + if (ret) + return ret; + } + + return xrcd->device->dealloc_xrcd(xrcd); +} +EXPORT_SYMBOL(ib_dealloc_xrcd); + +/** + * ib_create_wq - Creates a WQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the WQ. + * @wq_attr: A list of initial attributes required to create the + * WQ. If WQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created WQ. + * + * wq_attr->max_wr and wq_attr->max_sge determine + * the requested size of the WQ, and set to the actual values allocated + * on return. + * If ib_create_wq() succeeds, then max_wr and max_sge will always be + * at least as large as the requested values. + */ +struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *wq_attr) +{ + struct ib_wq *wq; + + if (!pd->device->create_wq) + return ERR_PTR(-EOPNOTSUPP); + + wq = pd->device->create_wq(pd, wq_attr, NULL); + if (!IS_ERR(wq)) { + wq->event_handler = wq_attr->event_handler; + wq->wq_context = wq_attr->wq_context; + wq->wq_type = wq_attr->wq_type; + wq->cq = wq_attr->cq; + wq->device = pd->device; + wq->pd = pd; + wq->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_inc(&wq_attr->cq->usecnt); + atomic_set(&wq->usecnt, 0); + } + return wq; +} +EXPORT_SYMBOL(ib_create_wq); + +/** + * ib_destroy_wq - Destroys the specified WQ. + * @wq: The WQ to destroy. + */ +int ib_destroy_wq(struct ib_wq *wq) +{ + int err; + struct ib_cq *cq = wq->cq; + struct ib_pd *pd = wq->pd; + + if (atomic_read(&wq->usecnt)) + return -EBUSY; + + err = wq->device->destroy_wq(wq); + if (!err) { + atomic_dec(&pd->usecnt); + atomic_dec(&cq->usecnt); + } + return err; +} +EXPORT_SYMBOL(ib_destroy_wq); + +/** + * ib_modify_wq - Modifies the specified WQ. + * @wq: The WQ to modify. + * @wq_attr: On input, specifies the WQ attributes to modify. 
+ * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ + * are being modified. + * On output, the current values of selected WQ attributes are returned. + */ +int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask) +{ + int err; + + if (!wq->device->modify_wq) + return -EOPNOTSUPP; + + err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL); + return err; +} +EXPORT_SYMBOL(ib_modify_wq); + +/* + * ib_create_rwq_ind_table - Creates a RQ Indirection Table. + * @device: The device on which to create the rwq indirection table. + * @ib_rwq_ind_table_init_attr: A list of initial attributes required to + * create the Indirection Table. + * + * Note: The life time of ib_rwq_ind_table_init_attr->ind_tbl is not less + * than the created ib_rwq_ind_table object and the caller is responsible + * for its memory allocation/free. + */ +struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr) +{ + struct ib_rwq_ind_table *rwq_ind_table; + int i; + u32 table_size; + + if (!device->create_rwq_ind_table) + return ERR_PTR(-EOPNOTSUPP); + + table_size = (1 << init_attr->log_ind_tbl_size); + rwq_ind_table = device->create_rwq_ind_table(device, + init_attr, NULL); + if (IS_ERR(rwq_ind_table)) + return rwq_ind_table; + + rwq_ind_table->ind_tbl = init_attr->ind_tbl; + rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size; + rwq_ind_table->device = device; + rwq_ind_table->uobject = NULL; + atomic_set(&rwq_ind_table->usecnt, 0); + + for (i = 0; i < table_size; i++) + atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt); + + return rwq_ind_table; +} +EXPORT_SYMBOL(ib_create_rwq_ind_table); + +/* + * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table. + * @wq_ind_table: The Indirection Table to destroy. +*/ +int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table) +{ + int err, i; + u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size); + struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl; + + if (atomic_read(&rwq_ind_table->usecnt)) + return -EBUSY; + + err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table); + if (!err) { + for (i = 0; i < table_size; i++) + atomic_dec(&ind_tbl[i]->usecnt); + } + + return err; +} +EXPORT_SYMBOL(ib_destroy_rwq_ind_table); + +struct ib_flow *ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + struct ib_flow *flow_id; + if (!qp->device->create_flow) + return ERR_PTR(-EOPNOTSUPP); + + flow_id = qp->device->create_flow(qp, flow_attr, domain); + if (!IS_ERR(flow_id)) { + atomic_inc(&qp->usecnt); + flow_id->qp = qp; + } + return flow_id; +} +EXPORT_SYMBOL(ib_create_flow); + +int ib_destroy_flow(struct ib_flow *flow_id) +{ + int err; + struct ib_qp *qp = flow_id->qp; + + err = qp->device->destroy_flow(flow_id); + if (!err) + atomic_dec(&qp->usecnt); + return err; +} +EXPORT_SYMBOL(ib_destroy_flow); + +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + return mr->device->check_mr_status ? 
+ mr->device->check_mr_status(mr, check_mask, mr_status) : -EOPNOTSUPP; +} +EXPORT_SYMBOL(ib_check_mr_status); + +int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, + int state) +{ + if (!device->set_vf_link_state) + return -EOPNOTSUPP; + + return device->set_vf_link_state(device, vf, port, state); +} +EXPORT_SYMBOL(ib_set_vf_link_state); + +int ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info) +{ + if (!device->get_vf_config) + return -EOPNOTSUPP; + + return device->get_vf_config(device, vf, port, info); +} +EXPORT_SYMBOL(ib_get_vf_config); + +int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats) +{ + if (!device->get_vf_stats) + return -EOPNOTSUPP; + + return device->get_vf_stats(device, vf, port, stats); +} +EXPORT_SYMBOL(ib_get_vf_stats); + +int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, + int type) +{ + if (!device->set_vf_guid) + return -EOPNOTSUPP; + + return device->set_vf_guid(device, vf, port, guid, type); +} +EXPORT_SYMBOL(ib_set_vf_guid); + +/** + * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list + * and set it the memory region. + * @mr: memory region + * @sg: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg + * @page_size: page vector desired page size + * + * Constraints: + * - The first sg element is allowed to have an offset. + * - Each sg element must either be aligned to page_size or virtually + * contiguous to the previous element. In case an sg element has a + * non-contiguous offset, the mapping prefix will not include it. + * - The last sg element is allowed to have length less than page_size. + * - If sg_nents total byte length exceeds the mr max_num_sge * page_size + * then only max_num_sg entries will be mapped. + * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these + * constraints holds and the page_size argument is ignored. + * + * Returns the number of sg elements that were mapped to the memory region. + * + * After this completes successfully, the memory region + * is ready for registration. + */ +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) +{ + if (unlikely(!mr->device->map_mr_sg)) + return -EOPNOTSUPP; + + mr->page_size = page_size; + + return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset); +} +EXPORT_SYMBOL(ib_map_mr_sg); + +/** + * ib_sg_to_pages() - Convert the largest prefix of a sg list + * to a page vector + * @mr: memory region + * @sgl: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset_p: IN: start offset in bytes into sg + * OUT: offset in bytes for element n of the sg of the first + * byte that has not been processed where n is the return + * value of this function. + * @set_page: driver page assignment function pointer + * + * Core service helper for drivers to convert the largest + * prefix of given sg list to a page vector. The sg list + * prefix converted is the prefix that meet the requirements + * of ib_map_mr_sg. + * + * Returns the number of sg elements that were assigned to + * a page vector. + */ +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64)) +{ + struct scatterlist *sg; + u64 last_end_dma_addr = 0; + unsigned int sg_offset = sg_offset_p ? 
*sg_offset_p : 0; + unsigned int last_page_off = 0; + u64 page_mask = ~((u64)mr->page_size - 1); + int i, ret; + + if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0]))) + return -EINVAL; + + mr->iova = sg_dma_address(&sgl[0]) + sg_offset; + mr->length = 0; + + for_each_sg(sgl, sg, sg_nents, i) { + u64 dma_addr = sg_dma_address(sg) + sg_offset; + u64 prev_addr = dma_addr; + unsigned int dma_len = sg_dma_len(sg) - sg_offset; + u64 end_dma_addr = dma_addr + dma_len; + u64 page_addr = dma_addr & page_mask; + + /* + * For the second and later elements, check whether either the + * end of element i-1 or the start of element i is not aligned + * on a page boundary. + */ + if (i && (last_page_off != 0 || page_addr != dma_addr)) { + /* Stop mapping if there is a gap. */ + if (last_end_dma_addr != dma_addr) + break; + + /* + * Coalesce this element with the last. If it is small + * enough just update mr->length. Otherwise start + * mapping from the next page. + */ + goto next_page; + } + + do { + ret = set_page(mr, page_addr); + if (unlikely(ret < 0)) { + sg_offset = prev_addr - sg_dma_address(sg); + mr->length += prev_addr - dma_addr; + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i || sg_offset ? i : ret; + } + prev_addr = page_addr; +next_page: + page_addr += mr->page_size; + } while (page_addr < end_dma_addr); + + mr->length += dma_len; + last_end_dma_addr = end_dma_addr; + last_page_off = end_dma_addr & ~page_mask; + + sg_offset = 0; + } + + if (sg_offset_p) + *sg_offset_p = 0; + return i; +} +EXPORT_SYMBOL(ib_sg_to_pages); + +struct ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* + * Post a WR and block until its completion is reaped for the SQ. + */ +static void __ib_drain_sq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->send_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe sdrain; + struct ib_send_wr *bad_swr; + struct ib_rdma_wr swr = { + .wr = { + .next = NULL, + { .wr_cqe = &sdrain.cqe, }, + .opcode = IB_WR_RDMA_WRITE, + }, + }; + int ret; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + sdrain.cqe.done = ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = ib_post_send(qp, &swr.wr, &bad_swr); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + if (cq->poll_ctx == IB_POLL_DIRECT) + while (wait_for_completion_timeout(&sdrain.done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + else + wait_for_completion(&sdrain.done); +} + +/* + * Post a WR and block until its completion is reaped for the RQ. 
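+ * Moving the QP to the error state flushes all outstanding receive WRs, so
+ * reaping the completion of this marker WR guarantees that every previously
+ * posted RQ WR has already completed.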
+ */ +static void __ib_drain_rq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->recv_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}, *bad_rwr; + int ret; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = ib_post_recv(qp, &rwr, &bad_rwr); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + if (cq->poll_ctx == IB_POLL_DIRECT) + while (wait_for_completion_timeout(&rdrain.done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + else + wait_for_completion(&rdrain.done); +} + +/** + * ib_drain_sq() - Block until all SQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_sq(). + * + * The caller must: + * + * ensure there is room in the CQ and SQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq(). + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_sq(struct ib_qp *qp) +{ + if (qp->device->drain_sq) + qp->device->drain_sq(qp); + else + __ib_drain_sq(qp); +} +EXPORT_SYMBOL(ib_drain_sq); + +/** + * ib_drain_rq() - Block until all RQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_rq(). + * + * The caller must: + * + * ensure there is room in the CQ and RQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq(). + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_rq(struct ib_qp *qp) +{ + if (qp->device->drain_rq) + qp->device->drain_rq(qp); + else + __ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_rq); + +/** + * ib_drain_qp() - Block until all CQEs have been consumed by the + * application on both the RQ and SQ. + * @qp: queue pair to drain + * + * The caller must: + * + * ensure there is room in the CQ(s), SQ, and RQ for drain work requests + * and completions. + * + * allocate the CQs using ib_alloc_cq(). + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. 
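+ *
+ * A typical teardown sketch (assuming the CQs came from ib_alloc_cq() and no
+ * further WRs will be posted to the QP):
+ *
+ *   ib_drain_qp(qp);
+ *   ib_destroy_qp(qp);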
+ */ +void ib_drain_qp(struct ib_qp *qp) +{ + ib_drain_sq(qp); + if (!qp->srq) + ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_qp); diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile new file mode 100644 index 0000000..e4f31c1 --- /dev/null +++ b/drivers/infiniband/hw/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_MTHCA) += mthca/ +obj-$(CONFIG_INFINIBAND_QIB) += qib/ +obj-$(CONFIG_INFINIBAND_CXGB3) += cxgb3/ +obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ +obj-$(CONFIG_INFINIBAND_I40IW) += i40iw/ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ +obj-$(CONFIG_INFINIBAND_NES) += nes/ +obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ +obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma/ +obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ +obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ +obj-$(CONFIG_INFINIBAND_HNS) += hns/ +obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ +obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ diff --git a/drivers/infiniband/hw/bnxt_re/Kconfig b/drivers/infiniband/hw/bnxt_re/Kconfig new file mode 100644 index 0000000..18f5ed0 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/Kconfig @@ -0,0 +1,10 @@ +config INFINIBAND_BNXT_RE + tristate "Broadcom Netxtreme HCA support" + depends on ETHERNET && NETDEVICES && PCI && INET && DCB + depends on MAY_USE_DEVLINK + select NET_VENDOR_BROADCOM + select BNXT + ---help--- + This driver supports Broadcom NetXtreme-E 10/25/40/50 gigabit + RoCE HCAs. To compile this driver as a module, choose M here: + the module will be called bnxt_re. diff --git a/drivers/infiniband/hw/bnxt_re/Makefile b/drivers/infiniband/hw/bnxt_re/Makefile new file mode 100644 index 0000000..6e3bc25 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y := -Idrivers/net/ethernet/broadcom/bnxt +obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re.o +bnxt_re-y := main.o ib_verbs.o \ + qplib_res.o qplib_rcfw.o \ + qplib_sp.o qplib_fp.o hw_counters.o diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h new file mode 100644 index 0000000..96f7689 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -0,0 +1,189 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Slow Path Operators (header) + * + */ + +#ifndef __BNXT_RE_H__ +#define __BNXT_RE_H__ +#define ROCE_DRV_MODULE_NAME "bnxt_re" +#define ROCE_DRV_MODULE_VERSION "1.0.0" + +#define BNXT_RE_DESC "Broadcom NetXtreme-C/E RoCE Driver" +#define BNXT_RE_PAGE_SHIFT_4K (12) +#define BNXT_RE_PAGE_SHIFT_8K (13) +#define BNXT_RE_PAGE_SHIFT_64K (16) +#define BNXT_RE_PAGE_SHIFT_2M (21) +#define BNXT_RE_PAGE_SHIFT_8M (23) +#define BNXT_RE_PAGE_SHIFT_1G (30) + +#define BNXT_RE_PAGE_SIZE_4K BIT(BNXT_RE_PAGE_SHIFT_4K) +#define BNXT_RE_PAGE_SIZE_8K BIT(BNXT_RE_PAGE_SHIFT_8K) +#define BNXT_RE_PAGE_SIZE_64K BIT(BNXT_RE_PAGE_SHIFT_64K) +#define BNXT_RE_PAGE_SIZE_2M BIT(BNXT_RE_PAGE_SHIFT_2M) +#define BNXT_RE_PAGE_SIZE_8M BIT(BNXT_RE_PAGE_SHIFT_8M) +#define BNXT_RE_PAGE_SIZE_1G BIT(BNXT_RE_PAGE_SHIFT_1G) + +#define BNXT_RE_MAX_MR_SIZE_LOW BIT_ULL(BNXT_RE_PAGE_SHIFT_1G) +#define BNXT_RE_MAX_MR_SIZE_HIGH BIT_ULL(39) +#define BNXT_RE_MAX_MR_SIZE BNXT_RE_MAX_MR_SIZE_HIGH + +#define BNXT_RE_MAX_QPC_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT (64 * 1024) +#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) +#define BNXT_RE_MAX_CQ_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) + +/* Number of MRs to reserve for PF, leaving remainder for VFs */ +#define BNXT_RE_RESVD_MR_FOR_PF (32 * 1024) +#define BNXT_RE_MAX_GID_PER_VF 128 + +/* + * Percentage of resources of each type reserved for PF. + * Remaining resources are divided equally among VFs. + * [0, 100] + */ +#define BNXT_RE_PCT_RSVD_FOR_PF 50 + +#define BNXT_RE_UD_QP_HW_STALL 0x400000 + +#define BNXT_RE_RQ_WQE_THRESHOLD 32 + +/* + * Setting the default ack delay value to 16, which means + * the default timeout is approx. 
260ms(4 usec * 2 ^(timeout)) + */ + +#define BNXT_RE_DEFAULT_ACK_DELAY 16 + +struct bnxt_re_work { + struct work_struct work; + unsigned long event; + struct bnxt_re_dev *rdev; + struct net_device *vlan_dev; +}; + +struct bnxt_re_sqp_entries { + struct bnxt_qplib_sge sge; + u64 wrid; + /* For storing the actual qp1 cqe */ + struct bnxt_qplib_cqe cqe; + struct bnxt_re_qp *qp1_qp; +}; + +#define BNXT_RE_MIN_MSIX 2 +#define BNXT_RE_MAX_MSIX 9 +#define BNXT_RE_AEQ_IDX 0 +#define BNXT_RE_NQ_IDX 1 + +struct bnxt_re_dev { + struct ib_device ibdev; + struct list_head list; + unsigned long flags; +#define BNXT_RE_FLAG_NETDEV_REGISTERED 0 +#define BNXT_RE_FLAG_IBDEV_REGISTERED 1 +#define BNXT_RE_FLAG_GOT_MSIX 2 +#define BNXT_RE_FLAG_HAVE_L2_REF 3 +#define BNXT_RE_FLAG_RCFW_CHANNEL_EN 4 +#define BNXT_RE_FLAG_QOS_WORK_REG 5 +#define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 + struct net_device *netdev; + unsigned int version, major, minor; + struct bnxt_en_dev *en_dev; + struct bnxt_msix_entry msix_entries[BNXT_RE_MAX_MSIX]; + int num_msix; + + int id; + + struct delayed_work worker; + u8 cur_prio_map; + u8 active_speed; + u8 active_width; + + /* FP Notification Queue (CQ & SRQ) */ + struct tasklet_struct nq_task; + + /* RCFW Channel */ + struct bnxt_qplib_rcfw rcfw; + + /* NQ */ + struct bnxt_qplib_nq nq[BNXT_RE_MAX_MSIX]; + + /* Device Resources */ + struct bnxt_qplib_dev_attr dev_attr; + struct bnxt_qplib_ctx qplib_ctx; + struct bnxt_qplib_res qplib_res; + struct bnxt_qplib_dpi dpi_privileged; + + atomic_t qp_count; + struct mutex qp_lock; /* protect qp list */ + struct list_head qp_list; + + atomic_t cq_count; + atomic_t srq_count; + atomic_t mr_count; + atomic_t mw_count; + atomic_t sched_count; + /* Max of 2 lossless traffic class supported per port */ + u16 cosq[2]; + + /* QP for for handling QP1 packets */ + u32 sqp_id; + struct bnxt_re_qp *qp1_sqp; + struct bnxt_re_ah *sqp_ah; + struct bnxt_re_sqp_entries sqp_tbl[1024]; + atomic_t nq_alloc_cnt; + u32 is_virtfn; + u32 num_vfs; + struct bnxt_qplib_roce_stats stats; +}; + +#define to_bnxt_re_dev(ptr, member) \ + container_of((ptr), struct bnxt_re_dev, member) + +#define BNXT_RE_ROCE_V1_PACKET 0 +#define BNXT_RE_ROCEV2_IPV4_PACKET 2 +#define BNXT_RE_ROCEV2_IPV6_PACKET 3 + +static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) +{ + if (rdev) + return &rdev->ibdev.dev; + return NULL; +} + +#endif diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c new file mode 100644 index 0000000..77416bc --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -0,0 +1,239 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Statistics + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "bnxt_ulp.h" +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_sp.h" +#include "qplib_fp.h" +#include "qplib_rcfw.h" +#include "bnxt_re.h" +#include "hw_counters.h" + +static const char * const bnxt_re_stat_name[] = { + [BNXT_RE_ACTIVE_QP] = "active_qps", + [BNXT_RE_ACTIVE_SRQ] = "active_srqs", + [BNXT_RE_ACTIVE_CQ] = "active_cqs", + [BNXT_RE_ACTIVE_MR] = "active_mrs", + [BNXT_RE_ACTIVE_MW] = "active_mws", + [BNXT_RE_RX_PKTS] = "rx_pkts", + [BNXT_RE_RX_BYTES] = "rx_bytes", + [BNXT_RE_TX_PKTS] = "tx_pkts", + [BNXT_RE_TX_BYTES] = "tx_bytes", + [BNXT_RE_RECOVERABLE_ERRORS] = "recoverable_errors", + [BNXT_RE_TO_RETRANSMITS] = "to_retransmits", + [BNXT_RE_SEQ_ERR_NAKS_RCVD] = "seq_err_naks_rcvd", + [BNXT_RE_MAX_RETRY_EXCEEDED] = "max_retry_exceeded", + [BNXT_RE_RNR_NAKS_RCVD] = "rnr_naks_rcvd", + [BNXT_RE_MISSING_RESP] = "missin_resp", + [BNXT_RE_UNRECOVERABLE_ERR] = "unrecoverable_err", + [BNXT_RE_BAD_RESP_ERR] = "bad_resp_err", + [BNXT_RE_LOCAL_QP_OP_ERR] = "local_qp_op_err", + [BNXT_RE_LOCAL_PROTECTION_ERR] = "local_protection_err", + [BNXT_RE_MEM_MGMT_OP_ERR] = "mem_mgmt_op_err", + [BNXT_RE_REMOTE_INVALID_REQ_ERR] = "remote_invalid_req_err", + [BNXT_RE_REMOTE_ACCESS_ERR] = "remote_access_err", + [BNXT_RE_REMOTE_OP_ERR] = "remote_op_err", + [BNXT_RE_DUP_REQ] = "dup_req", + [BNXT_RE_RES_EXCEED_MAX] = "res_exceed_max", + [BNXT_RE_RES_LENGTH_MISMATCH] = "res_length_mismatch", + [BNXT_RE_RES_EXCEEDS_WQE] = "res_exceeds_wqe", + [BNXT_RE_RES_OPCODE_ERR] = "res_opcode_err", + [BNXT_RE_RES_RX_INVALID_RKEY] = "res_rx_invalid_rkey", + [BNXT_RE_RES_RX_DOMAIN_ERR] = "res_rx_domain_err", + [BNXT_RE_RES_RX_NO_PERM] = "res_rx_no_perm", + [BNXT_RE_RES_RX_RANGE_ERR] = "res_rx_range_err", + [BNXT_RE_RES_TX_INVALID_RKEY] = "res_tx_invalid_rkey", + [BNXT_RE_RES_TX_DOMAIN_ERR] = "res_tx_domain_err", + [BNXT_RE_RES_TX_NO_PERM] = "res_tx_no_perm", + [BNXT_RE_RES_TX_RANGE_ERR] = "res_tx_range_err", + [BNXT_RE_RES_IRRQ_OFLOW] = "res_irrq_oflow", + [BNXT_RE_RES_UNSUP_OPCODE] = "res_unsup_opcode", + [BNXT_RE_RES_UNALIGNED_ATOMIC] = "res_unaligned_atomic", + [BNXT_RE_RES_REM_INV_ERR] = "res_rem_inv_err", + [BNXT_RE_RES_MEM_ERROR] = "res_mem_err", + [BNXT_RE_RES_SRQ_ERR] = "res_srq_err", + [BNXT_RE_RES_CMP_ERR] = "res_cmp_err", + [BNXT_RE_RES_INVALID_DUP_RKEY] = "res_invalid_dup_rkey", + [BNXT_RE_RES_WQE_FORMAT_ERR] = "res_wqe_format_err", + [BNXT_RE_RES_CQ_LOAD_ERR] = 
"res_cq_load_err", + [BNXT_RE_RES_SRQ_LOAD_ERR] = "res_srq_load_err", + [BNXT_RE_RES_TX_PCI_ERR] = "res_tx_pci_err", + [BNXT_RE_RES_RX_PCI_ERR] = "res_rx_pci_err" +}; + +int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct ctx_hw_stats *bnxt_re_stats = rdev->qplib_ctx.stats.dma; + int rc = 0; + + if (!port || !stats) + return -EINVAL; + + stats->value[BNXT_RE_ACTIVE_QP] = atomic_read(&rdev->qp_count); + stats->value[BNXT_RE_ACTIVE_SRQ] = atomic_read(&rdev->srq_count); + stats->value[BNXT_RE_ACTIVE_CQ] = atomic_read(&rdev->cq_count); + stats->value[BNXT_RE_ACTIVE_MR] = atomic_read(&rdev->mr_count); + stats->value[BNXT_RE_ACTIVE_MW] = atomic_read(&rdev->mw_count); + if (bnxt_re_stats) { + stats->value[BNXT_RE_RECOVERABLE_ERRORS] = + le64_to_cpu(bnxt_re_stats->tx_bcast_pkts); + stats->value[BNXT_RE_RX_PKTS] = + le64_to_cpu(bnxt_re_stats->rx_ucast_pkts); + stats->value[BNXT_RE_RX_BYTES] = + le64_to_cpu(bnxt_re_stats->rx_ucast_bytes); + stats->value[BNXT_RE_TX_PKTS] = + le64_to_cpu(bnxt_re_stats->tx_ucast_pkts); + stats->value[BNXT_RE_TX_BYTES] = + le64_to_cpu(bnxt_re_stats->tx_ucast_bytes); + } + if (test_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags)) { + rc = bnxt_qplib_get_roce_stats(&rdev->rcfw, &rdev->stats); + if (rc) + clear_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, + &rdev->flags); + stats->value[BNXT_RE_TO_RETRANSMITS] = + rdev->stats.to_retransmits; + stats->value[BNXT_RE_SEQ_ERR_NAKS_RCVD] = + rdev->stats.seq_err_naks_rcvd; + stats->value[BNXT_RE_MAX_RETRY_EXCEEDED] = + rdev->stats.max_retry_exceeded; + stats->value[BNXT_RE_RNR_NAKS_RCVD] = + rdev->stats.rnr_naks_rcvd; + stats->value[BNXT_RE_MISSING_RESP] = + rdev->stats.missing_resp; + stats->value[BNXT_RE_UNRECOVERABLE_ERR] = + rdev->stats.unrecoverable_err; + stats->value[BNXT_RE_BAD_RESP_ERR] = + rdev->stats.bad_resp_err; + stats->value[BNXT_RE_LOCAL_QP_OP_ERR] = + rdev->stats.local_qp_op_err; + stats->value[BNXT_RE_LOCAL_PROTECTION_ERR] = + rdev->stats.local_protection_err; + stats->value[BNXT_RE_MEM_MGMT_OP_ERR] = + rdev->stats.mem_mgmt_op_err; + stats->value[BNXT_RE_REMOTE_INVALID_REQ_ERR] = + rdev->stats.remote_invalid_req_err; + stats->value[BNXT_RE_REMOTE_ACCESS_ERR] = + rdev->stats.remote_access_err; + stats->value[BNXT_RE_REMOTE_OP_ERR] = + rdev->stats.remote_op_err; + stats->value[BNXT_RE_DUP_REQ] = + rdev->stats.dup_req; + stats->value[BNXT_RE_RES_EXCEED_MAX] = + rdev->stats.res_exceed_max; + stats->value[BNXT_RE_RES_LENGTH_MISMATCH] = + rdev->stats.res_length_mismatch; + stats->value[BNXT_RE_RES_EXCEEDS_WQE] = + rdev->stats.res_exceeds_wqe; + stats->value[BNXT_RE_RES_OPCODE_ERR] = + rdev->stats.res_opcode_err; + stats->value[BNXT_RE_RES_RX_INVALID_RKEY] = + rdev->stats.res_rx_invalid_rkey; + stats->value[BNXT_RE_RES_RX_DOMAIN_ERR] = + rdev->stats.res_rx_domain_err; + stats->value[BNXT_RE_RES_RX_NO_PERM] = + rdev->stats.res_rx_no_perm; + stats->value[BNXT_RE_RES_RX_RANGE_ERR] = + rdev->stats.res_rx_range_err; + stats->value[BNXT_RE_RES_TX_INVALID_RKEY] = + rdev->stats.res_tx_invalid_rkey; + stats->value[BNXT_RE_RES_TX_DOMAIN_ERR] = + rdev->stats.res_tx_domain_err; + stats->value[BNXT_RE_RES_TX_NO_PERM] = + rdev->stats.res_tx_no_perm; + stats->value[BNXT_RE_RES_TX_RANGE_ERR] = + rdev->stats.res_tx_range_err; + stats->value[BNXT_RE_RES_IRRQ_OFLOW] = + rdev->stats.res_irrq_oflow; + stats->value[BNXT_RE_RES_UNSUP_OPCODE] = + rdev->stats.res_unsup_opcode; + stats->value[BNXT_RE_RES_UNALIGNED_ATOMIC] = + 
rdev->stats.res_unaligned_atomic; + stats->value[BNXT_RE_RES_REM_INV_ERR] = + rdev->stats.res_rem_inv_err; + stats->value[BNXT_RE_RES_MEM_ERROR] = + rdev->stats.res_mem_error; + stats->value[BNXT_RE_RES_SRQ_ERR] = + rdev->stats.res_srq_err; + stats->value[BNXT_RE_RES_CMP_ERR] = + rdev->stats.res_cmp_err; + stats->value[BNXT_RE_RES_INVALID_DUP_RKEY] = + rdev->stats.res_invalid_dup_rkey; + stats->value[BNXT_RE_RES_WQE_FORMAT_ERR] = + rdev->stats.res_wqe_format_err; + stats->value[BNXT_RE_RES_CQ_LOAD_ERR] = + rdev->stats.res_cq_load_err; + stats->value[BNXT_RE_RES_SRQ_LOAD_ERR] = + rdev->stats.res_srq_load_err; + stats->value[BNXT_RE_RES_TX_PCI_ERR] = + rdev->stats.res_tx_pci_err; + stats->value[BNXT_RE_RES_RX_PCI_ERR] = + rdev->stats.res_rx_pci_err; + } + + return ARRAY_SIZE(bnxt_re_stat_name); +} + +struct rdma_hw_stats *bnxt_re_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(bnxt_re_stat_name) != BNXT_RE_NUM_COUNTERS); + /* We support only per port stats */ + if (!port_num) + return NULL; + + return rdma_alloc_hw_stats_struct(bnxt_re_stat_name, + ARRAY_SIZE(bnxt_re_stat_name), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.h b/drivers/infiniband/hw/bnxt_re/hw_counters.h new file mode 100644 index 0000000..a01a922 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.h @@ -0,0 +1,101 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
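The two callbacks above follow the rdma_hw_stats provider contract: bnxt_re_ib_alloc_hw_stats() hands the core a descriptor table (counter names plus a cache lifespan) once per port, and bnxt_re_ib_get_hw_stats() refreshes the values and returns how many entries it filled. A minimal sketch of how such callbacks are typically hooked into the ib_device before registration follows; the helper name and the exact registration site are assumptions, since main.c is outside this hunk.

/*
 * Sketch only: wiring the counter callbacks into the ib_device.
 * In the 4.17-era verbs API these are plain function pointers on
 * struct ib_device; the enclosing helper is hypothetical.
 */
static void bnxt_re_wire_hw_stats(struct ib_device *ibdev)
{
	/* Called once per port so the core can create the sysfs entries. */
	ibdev->alloc_hw_stats = bnxt_re_ib_alloc_hw_stats;
	/* Called on reads once the cached values have aged past the lifespan. */
	ibdev->get_hw_stats = bnxt_re_ib_get_hw_stats;
}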
+ * + * Description: Statistics (header) + * + */ + +#ifndef __BNXT_RE_HW_STATS_H__ +#define __BNXT_RE_HW_STATS_H__ + +enum bnxt_re_hw_stats { + BNXT_RE_ACTIVE_QP, + BNXT_RE_ACTIVE_SRQ, + BNXT_RE_ACTIVE_CQ, + BNXT_RE_ACTIVE_MR, + BNXT_RE_ACTIVE_MW, + BNXT_RE_RX_PKTS, + BNXT_RE_RX_BYTES, + BNXT_RE_TX_PKTS, + BNXT_RE_TX_BYTES, + BNXT_RE_RECOVERABLE_ERRORS, + BNXT_RE_TO_RETRANSMITS, + BNXT_RE_SEQ_ERR_NAKS_RCVD, + BNXT_RE_MAX_RETRY_EXCEEDED, + BNXT_RE_RNR_NAKS_RCVD, + BNXT_RE_MISSING_RESP, + BNXT_RE_UNRECOVERABLE_ERR, + BNXT_RE_BAD_RESP_ERR, + BNXT_RE_LOCAL_QP_OP_ERR, + BNXT_RE_LOCAL_PROTECTION_ERR, + BNXT_RE_MEM_MGMT_OP_ERR, + BNXT_RE_REMOTE_INVALID_REQ_ERR, + BNXT_RE_REMOTE_ACCESS_ERR, + BNXT_RE_REMOTE_OP_ERR, + BNXT_RE_DUP_REQ, + BNXT_RE_RES_EXCEED_MAX, + BNXT_RE_RES_LENGTH_MISMATCH, + BNXT_RE_RES_EXCEEDS_WQE, + BNXT_RE_RES_OPCODE_ERR, + BNXT_RE_RES_RX_INVALID_RKEY, + BNXT_RE_RES_RX_DOMAIN_ERR, + BNXT_RE_RES_RX_NO_PERM, + BNXT_RE_RES_RX_RANGE_ERR, + BNXT_RE_RES_TX_INVALID_RKEY, + BNXT_RE_RES_TX_DOMAIN_ERR, + BNXT_RE_RES_TX_NO_PERM, + BNXT_RE_RES_TX_RANGE_ERR, + BNXT_RE_RES_IRRQ_OFLOW, + BNXT_RE_RES_UNSUP_OPCODE, + BNXT_RE_RES_UNALIGNED_ATOMIC, + BNXT_RE_RES_REM_INV_ERR, + BNXT_RE_RES_MEM_ERROR, + BNXT_RE_RES_SRQ_ERR, + BNXT_RE_RES_CMP_ERR, + BNXT_RE_RES_INVALID_DUP_RKEY, + BNXT_RE_RES_WQE_FORMAT_ERR, + BNXT_RE_RES_CQ_LOAD_ERR, + BNXT_RE_RES_SRQ_LOAD_ERR, + BNXT_RE_RES_TX_PCI_ERR, + BNXT_RE_RES_RX_PCI_ERR, + BNXT_RE_NUM_COUNTERS +}; + +struct rdma_hw_stats *bnxt_re_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num); +int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index); +#endif /* __BNXT_RE_HW_STATS_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c new file mode 100644 index 0000000..a76e206 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -0,0 +1,3803 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
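Once the callbacks are registered, the counters enumerated in hw_counters.h above surface as per-port sysfs attributes. A small user-space reader is sketched below; the device name (bnxt_re0), port number, and counter file are illustrative assumptions.

/* Sketch: read one bnxt_re hw counter from sysfs (user space). */
#include <stdio.h>

int main(void)
{
	/* Layout: /sys/class/infiniband/<dev>/ports/<port>/hw_counters/<name> */
	const char *path =
		"/sys/class/infiniband/bnxt_re0/ports/1/hw_counters/active_qps";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("active_qps = %s", buf);
	fclose(f);
	return 0;
}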
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: IB Verbs interpreter + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "bnxt_ulp.h" + +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_sp.h" +#include "qplib_fp.h" +#include "qplib_rcfw.h" + +#include "bnxt_re.h" +#include "ib_verbs.h" +#include + +static int __from_ib_access_flags(int iflags) +{ + int qflags = 0; + + if (iflags & IB_ACCESS_LOCAL_WRITE) + qflags |= BNXT_QPLIB_ACCESS_LOCAL_WRITE; + if (iflags & IB_ACCESS_REMOTE_READ) + qflags |= BNXT_QPLIB_ACCESS_REMOTE_READ; + if (iflags & IB_ACCESS_REMOTE_WRITE) + qflags |= BNXT_QPLIB_ACCESS_REMOTE_WRITE; + if (iflags & IB_ACCESS_REMOTE_ATOMIC) + qflags |= BNXT_QPLIB_ACCESS_REMOTE_ATOMIC; + if (iflags & IB_ACCESS_MW_BIND) + qflags |= BNXT_QPLIB_ACCESS_MW_BIND; + if (iflags & IB_ZERO_BASED) + qflags |= BNXT_QPLIB_ACCESS_ZERO_BASED; + if (iflags & IB_ACCESS_ON_DEMAND) + qflags |= BNXT_QPLIB_ACCESS_ON_DEMAND; + return qflags; +}; + +static enum ib_access_flags __to_ib_access_flags(int qflags) +{ + enum ib_access_flags iflags = 0; + + if (qflags & BNXT_QPLIB_ACCESS_LOCAL_WRITE) + iflags |= IB_ACCESS_LOCAL_WRITE; + if (qflags & BNXT_QPLIB_ACCESS_REMOTE_WRITE) + iflags |= IB_ACCESS_REMOTE_WRITE; + if (qflags & BNXT_QPLIB_ACCESS_REMOTE_READ) + iflags |= IB_ACCESS_REMOTE_READ; + if (qflags & BNXT_QPLIB_ACCESS_REMOTE_ATOMIC) + iflags |= IB_ACCESS_REMOTE_ATOMIC; + if (qflags & BNXT_QPLIB_ACCESS_MW_BIND) + iflags |= IB_ACCESS_MW_BIND; + if (qflags & BNXT_QPLIB_ACCESS_ZERO_BASED) + iflags |= IB_ZERO_BASED; + if (qflags & BNXT_QPLIB_ACCESS_ON_DEMAND) + iflags |= IB_ACCESS_ON_DEMAND; + return iflags; +}; + +static int bnxt_re_build_sgl(struct ib_sge *ib_sg_list, + struct bnxt_qplib_sge *sg_list, int num) +{ + int i, total = 0; + + for (i = 0; i < num; i++) { + sg_list[i].addr = ib_sg_list[i].addr; + sg_list[i].lkey = ib_sg_list[i].lkey; + sg_list[i].size = ib_sg_list[i].length; + total += sg_list[i].size; + } + return total; +} + +/* Device */ +struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct net_device *netdev = NULL; + + rcu_read_lock(); + if (rdev) + netdev = rdev->netdev; + if (netdev) + dev_hold(netdev); + + rcu_read_unlock(); + return netdev; +} + +int bnxt_re_query_device(struct ib_device *ibdev, + struct ib_device_attr *ib_attr, + struct ib_udata *udata) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + + memset(ib_attr, 0, sizeof(*ib_attr)); + memcpy(&ib_attr->fw_ver, dev_attr->fw_ver, + min(sizeof(dev_attr->fw_ver), + sizeof(ib_attr->fw_ver))); + bnxt_qplib_get_guid(rdev->netdev->dev_addr, + (u8 *)&ib_attr->sys_image_guid); + ib_attr->max_mr_size = BNXT_RE_MAX_MR_SIZE; + ib_attr->page_size_cap = BNXT_RE_PAGE_SIZE_4K | BNXT_RE_PAGE_SIZE_2M; + + ib_attr->vendor_id = rdev->en_dev->pdev->vendor; + ib_attr->vendor_part_id = 
rdev->en_dev->pdev->device; + ib_attr->hw_ver = rdev->en_dev->pdev->subsystem_device; + ib_attr->max_qp = dev_attr->max_qp; + ib_attr->max_qp_wr = dev_attr->max_qp_wqes; + ib_attr->device_cap_flags = + IB_DEVICE_CURR_QP_STATE_MOD + | IB_DEVICE_RC_RNR_NAK_GEN + | IB_DEVICE_SHUTDOWN_PORT + | IB_DEVICE_SYS_IMAGE_GUID + | IB_DEVICE_LOCAL_DMA_LKEY + | IB_DEVICE_RESIZE_MAX_WR + | IB_DEVICE_PORT_ACTIVE_EVENT + | IB_DEVICE_N_NOTIFY_CQ + | IB_DEVICE_MEM_WINDOW + | IB_DEVICE_MEM_WINDOW_TYPE_2B + | IB_DEVICE_MEM_MGT_EXTENSIONS; + ib_attr->max_sge = dev_attr->max_qp_sges; + ib_attr->max_sge_rd = dev_attr->max_qp_sges; + ib_attr->max_cq = dev_attr->max_cq; + ib_attr->max_cqe = dev_attr->max_cq_wqes; + ib_attr->max_mr = dev_attr->max_mr; + ib_attr->max_pd = dev_attr->max_pd; + ib_attr->max_qp_rd_atom = dev_attr->max_qp_rd_atom; + ib_attr->max_qp_init_rd_atom = dev_attr->max_qp_init_rd_atom; + ib_attr->atomic_cap = IB_ATOMIC_NONE; + ib_attr->masked_atomic_cap = IB_ATOMIC_NONE; + + ib_attr->max_ee_rd_atom = 0; + ib_attr->max_res_rd_atom = 0; + ib_attr->max_ee_init_rd_atom = 0; + ib_attr->max_ee = 0; + ib_attr->max_rdd = 0; + ib_attr->max_mw = dev_attr->max_mw; + ib_attr->max_raw_ipv6_qp = 0; + ib_attr->max_raw_ethy_qp = dev_attr->max_raw_ethy_qp; + ib_attr->max_mcast_grp = 0; + ib_attr->max_mcast_qp_attach = 0; + ib_attr->max_total_mcast_qp_attach = 0; + ib_attr->max_ah = dev_attr->max_ah; + + ib_attr->max_fmr = 0; + ib_attr->max_map_per_fmr = 0; + + ib_attr->max_srq = dev_attr->max_srq; + ib_attr->max_srq_wr = dev_attr->max_srq_wqes; + ib_attr->max_srq_sge = dev_attr->max_srq_sges; + + ib_attr->max_fast_reg_page_list_len = MAX_PBL_LVL_1_PGS; + + ib_attr->max_pkeys = 1; + ib_attr->local_ca_ack_delay = BNXT_RE_DEFAULT_ACK_DELAY; + return 0; +} + +int bnxt_re_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + switch (device_modify_mask) { + case IB_DEVICE_MODIFY_SYS_IMAGE_GUID: + /* Modify the GUID requires the modification of the GID table */ + /* GUID should be made as READ-ONLY */ + break; + case IB_DEVICE_MODIFY_NODE_DESC: + /* Node Desc should be made as READ-ONLY */ + break; + default: + break; + } + return 0; +} + +/* Port */ +int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, + struct ib_port_attr *port_attr) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + + memset(port_attr, 0, sizeof(*port_attr)); + + if (netif_running(rdev->netdev) && netif_carrier_ok(rdev->netdev)) { + port_attr->state = IB_PORT_ACTIVE; + port_attr->phys_state = 5; + } else { + port_attr->state = IB_PORT_DOWN; + port_attr->phys_state = 3; + } + port_attr->max_mtu = IB_MTU_4096; + port_attr->active_mtu = iboe_get_mtu(rdev->netdev->mtu); + port_attr->gid_tbl_len = dev_attr->max_sgid; + port_attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | + IB_PORT_IP_BASED_GIDS; + + port_attr->max_msg_sz = (u32)BNXT_RE_MAX_MR_SIZE_LOW; + port_attr->bad_pkey_cntr = 0; + port_attr->qkey_viol_cntr = 0; + port_attr->pkey_tbl_len = dev_attr->max_pkey; + port_attr->lid = 0; + port_attr->sm_lid = 0; + port_attr->lmc = 0; + port_attr->max_vl_num = 4; + port_attr->sm_sl = 0; + port_attr->subnet_timeout = 0; + port_attr->init_type_reply = 0; + port_attr->active_speed = rdev->active_speed; + port_attr->active_width = rdev->active_width; + + return 0; +} + +int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num, + struct 
ib_port_immutable *immutable) +{ + struct ib_port_attr port_attr; + + if (bnxt_re_query_port(ibdev, port_num, &port_attr)) + return -EINVAL; + + immutable->pkey_tbl_len = port_attr.pkey_tbl_len; + immutable->gid_tbl_len = port_attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + immutable->core_cap_flags |= RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + return 0; +} + +void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d", + rdev->dev_attr.fw_ver[0], rdev->dev_attr.fw_ver[1], + rdev->dev_attr.fw_ver[2], rdev->dev_attr.fw_ver[3]); +} + +int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, + u16 index, u16 *pkey) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + + /* Ignore port_num */ + + memset(pkey, 0, sizeof(*pkey)); + return bnxt_qplib_get_pkey(&rdev->qplib_res, + &rdev->qplib_res.pkey_tbl, index, pkey); +} + +int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num, + int index, union ib_gid *gid) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + int rc = 0; + + /* Ignore port_num */ + memset(gid, 0, sizeof(*gid)); + rc = bnxt_qplib_get_sgid(&rdev->qplib_res, + &rdev->qplib_res.sgid_tbl, index, + (struct bnxt_qplib_gid *)gid); + return rc; +} + +int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context) +{ + int rc = 0; + struct bnxt_re_gid_ctx *ctx, **ctx_tbl; + struct bnxt_re_dev *rdev = to_bnxt_re_dev(attr->device, ibdev); + struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; + struct bnxt_qplib_gid *gid_to_del; + + /* Delete the entry from the hardware */ + ctx = *context; + if (!ctx) + return -EINVAL; + + if (sgid_tbl && sgid_tbl->active) { + if (ctx->idx >= sgid_tbl->max) + return -EINVAL; + gid_to_del = &sgid_tbl->tbl[ctx->idx]; + /* DEL_GID is called in WQ context(netdevice_event_work_handler) + * or via the ib_unregister_device path. In the former case QP1 + * may not be destroyed yet, in which case just return as FW + * needs that entry to be present and will fail it's deletion. 
+ * We could get invoked again after QP1 is destroyed OR get an + * ADD_GID call with a different GID value for the same index + * where we issue MODIFY_GID cmd to update the GID entry -- TBD + */ + if (ctx->idx == 0 && + rdma_link_local_addr((struct in6_addr *)gid_to_del) && + ctx->refcnt == 1 && rdev->qp1_sqp) { + dev_dbg(rdev_to_dev(rdev), + "Trying to delete GID0 while QP1 is alive\n"); + return -EFAULT; + } + ctx->refcnt--; + if (!ctx->refcnt) { + rc = bnxt_qplib_del_sgid(sgid_tbl, gid_to_del, true); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to remove GID: %#x", rc); + } else { + ctx_tbl = sgid_tbl->ctx; + ctx_tbl[ctx->idx] = NULL; + kfree(ctx); + } + } + } else { + return -EINVAL; + } + return rc; +} + +int bnxt_re_add_gid(const union ib_gid *gid, + const struct ib_gid_attr *attr, void **context) +{ + int rc; + u32 tbl_idx = 0; + u16 vlan_id = 0xFFFF; + struct bnxt_re_gid_ctx *ctx, **ctx_tbl; + struct bnxt_re_dev *rdev = to_bnxt_re_dev(attr->device, ibdev); + struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; + + if ((attr->ndev) && is_vlan_dev(attr->ndev)) + vlan_id = vlan_dev_vlan_id(attr->ndev); + + rc = bnxt_qplib_add_sgid(sgid_tbl, (struct bnxt_qplib_gid *)gid, + rdev->qplib_res.netdev->dev_addr, + vlan_id, true, &tbl_idx); + if (rc == -EALREADY) { + ctx_tbl = sgid_tbl->ctx; + ctx_tbl[tbl_idx]->refcnt++; + *context = ctx_tbl[tbl_idx]; + return 0; + } + + if (rc < 0) { + dev_err(rdev_to_dev(rdev), "Failed to add GID: %#x", rc); + return rc; + } + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx_tbl = sgid_tbl->ctx; + ctx->idx = tbl_idx; + ctx->refcnt = 1; + ctx_tbl[tbl_idx] = ctx; + *context = ctx; + + return rc; +} + +enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +#define BNXT_RE_FENCE_PBL_SIZE DIV_ROUND_UP(BNXT_RE_FENCE_BYTES, PAGE_SIZE) + +static void bnxt_re_create_fence_wqe(struct bnxt_re_pd *pd) +{ + struct bnxt_re_fence_data *fence = &pd->fence; + struct ib_mr *ib_mr = &fence->mr->ib_mr; + struct bnxt_qplib_swqe *wqe = &fence->bind_wqe; + + memset(wqe, 0, sizeof(*wqe)); + wqe->type = BNXT_QPLIB_SWQE_TYPE_BIND_MW; + wqe->wr_id = BNXT_QPLIB_FENCE_WRID; + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + wqe->bind.zero_based = false; + wqe->bind.parent_l_key = ib_mr->lkey; + wqe->bind.va = (u64)(unsigned long)fence->va; + wqe->bind.length = fence->size; + wqe->bind.access_cntl = __from_ib_access_flags(IB_ACCESS_REMOTE_READ); + wqe->bind.mw_type = SQ_BIND_MW_TYPE_TYPE1; + + /* Save the initial rkey in fence structure for now; + * wqe->bind.r_key will be set at (re)bind time. 
+ */ + fence->bind_rkey = ib_inc_rkey(fence->mw->rkey); +} + +static int bnxt_re_bind_fence_mw(struct bnxt_qplib_qp *qplib_qp) +{ + struct bnxt_re_qp *qp = container_of(qplib_qp, struct bnxt_re_qp, + qplib_qp); + struct ib_pd *ib_pd = qp->ib_qp.pd; + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_fence_data *fence = &pd->fence; + struct bnxt_qplib_swqe *fence_wqe = &fence->bind_wqe; + struct bnxt_qplib_swqe wqe; + int rc; + + memcpy(&wqe, fence_wqe, sizeof(wqe)); + wqe.bind.r_key = fence->bind_rkey; + fence->bind_rkey = ib_inc_rkey(fence->bind_rkey); + + dev_dbg(rdev_to_dev(qp->rdev), + "Posting bind fence-WQE: rkey: %#x QP: %d PD: %p\n", + wqe.bind.r_key, qp->qplib_qp.id, pd); + rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe); + if (rc) { + dev_err(rdev_to_dev(qp->rdev), "Failed to bind fence-WQE\n"); + return rc; + } + bnxt_qplib_post_send_db(&qp->qplib_qp); + + return rc; +} + +static void bnxt_re_destroy_fence_mr(struct bnxt_re_pd *pd) +{ + struct bnxt_re_fence_data *fence = &pd->fence; + struct bnxt_re_dev *rdev = pd->rdev; + struct device *dev = &rdev->en_dev->pdev->dev; + struct bnxt_re_mr *mr = fence->mr; + + if (fence->mw) { + bnxt_re_dealloc_mw(fence->mw); + fence->mw = NULL; + } + if (mr) { + if (mr->ib_mr.rkey) + bnxt_qplib_dereg_mrw(&rdev->qplib_res, &mr->qplib_mr, + true); + if (mr->ib_mr.lkey) + bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); + kfree(mr); + fence->mr = NULL; + } + if (fence->dma_addr) { + dma_unmap_single(dev, fence->dma_addr, BNXT_RE_FENCE_BYTES, + DMA_BIDIRECTIONAL); + fence->dma_addr = 0; + } +} + +static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) +{ + int mr_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_MW_BIND; + struct bnxt_re_fence_data *fence = &pd->fence; + struct bnxt_re_dev *rdev = pd->rdev; + struct device *dev = &rdev->en_dev->pdev->dev; + struct bnxt_re_mr *mr = NULL; + dma_addr_t dma_addr = 0; + struct ib_mw *mw; + u64 pbl_tbl; + int rc; + + dma_addr = dma_map_single(dev, fence->va, BNXT_RE_FENCE_BYTES, + DMA_BIDIRECTIONAL); + rc = dma_mapping_error(dev, dma_addr); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to dma-map fence-MR-mem\n"); + rc = -EIO; + fence->dma_addr = 0; + goto fail; + } + fence->dma_addr = dma_addr; + + /* Allocate a MR */ + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rc = -ENOMEM; + goto fail; + } + fence->mr = mr; + mr->rdev = rdev; + mr->qplib_mr.pd = &pd->qplib_pd; + mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; + mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to alloc fence-HW-MR\n"); + goto fail; + } + + /* Register MR */ + mr->ib_mr.lkey = mr->qplib_mr.lkey; + mr->qplib_mr.va = (u64)(unsigned long)fence->va; + mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES; + pbl_tbl = dma_addr; + rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl_tbl, + BNXT_RE_FENCE_PBL_SIZE, false, PAGE_SIZE); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to register fence-MR\n"); + goto fail; + } + mr->ib_mr.rkey = mr->qplib_mr.rkey; + + /* Create a fence MW only for kernel consumers */ + mw = bnxt_re_alloc_mw(&pd->ib_pd, IB_MW_TYPE_1, NULL); + if (IS_ERR(mw)) { + dev_err(rdev_to_dev(rdev), + "Failed to create fence-MW for PD: %p\n", pd); + rc = PTR_ERR(mw); + goto fail; + } + fence->mw = mw; + + bnxt_re_create_fence_wqe(pd); + return 0; + +fail: + bnxt_re_destroy_fence_mr(pd); + return rc; +} + +/* Protection Domains */ +int 
bnxt_re_dealloc_pd(struct ib_pd *ib_pd) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + int rc; + + bnxt_re_destroy_fence_mr(pd); + + if (pd->qplib_pd.id) { + rc = bnxt_qplib_dealloc_pd(&rdev->qplib_res, + &rdev->qplib_res.pd_tbl, + &pd->qplib_pd); + if (rc) + dev_err(rdev_to_dev(rdev), "Failed to deallocate HW PD"); + } + + kfree(pd); + return 0; +} + +struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + struct ib_udata *udata) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_re_ucontext *ucntx = container_of(ucontext, + struct bnxt_re_ucontext, + ib_uctx); + struct bnxt_re_pd *pd; + int rc; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + pd->rdev = rdev; + if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) { + dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD"); + rc = -ENOMEM; + goto fail; + } + + if (udata) { + struct bnxt_re_pd_resp resp; + + if (!ucntx->dpi.dbr) { + /* Allocate DPI in alloc_pd to avoid failing of + * ibv_devinfo and family of application when DPIs + * are depleted. + */ + if (bnxt_qplib_alloc_dpi(&rdev->qplib_res.dpi_tbl, + &ucntx->dpi, ucntx)) { + rc = -ENOMEM; + goto dbfail; + } + } + + resp.pdid = pd->qplib_pd.id; + /* Still allow mapping this DBR to the new user PD. */ + resp.dpi = ucntx->dpi.dpi; + resp.dbr = (u64)ucntx->dpi.umdbr; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to copy user response\n"); + goto dbfail; + } + } + + if (!udata) + if (bnxt_re_create_fence_mr(pd)) + dev_warn(rdev_to_dev(rdev), + "Failed to create Fence-MR\n"); + return &pd->ib_pd; +dbfail: + (void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, + &pd->qplib_pd); +fail: + kfree(pd); + return ERR_PTR(rc); +} + +/* Address Handles */ +int bnxt_re_destroy_ah(struct ib_ah *ib_ah) +{ + struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah); + struct bnxt_re_dev *rdev = ah->rdev; + int rc; + + rc = bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to destroy HW AH"); + return rc; + } + kfree(ah); + return 0; +} + +struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_ah *ah; + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + int rc; + u8 nw_type; + + struct ib_gid_attr sgid_attr; + + if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) { + dev_err(rdev_to_dev(rdev), "Failed to alloc AH: GRH not set"); + return ERR_PTR(-EINVAL); + } + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + ah->rdev = rdev; + ah->qplib_ah.pd = &pd->qplib_pd; + + /* Supply the configuration for the HW */ + memcpy(ah->qplib_ah.dgid.data, grh->dgid.raw, + sizeof(union ib_gid)); + /* + * If RoCE V2 is enabled, stack will have two entries for + * each GID entry. Avoiding this duplicte entry in HW. 
Dividing + * the GID index by 2 for RoCE V2 + */ + ah->qplib_ah.sgid_index = grh->sgid_index / 2; + ah->qplib_ah.host_sgid_index = grh->sgid_index; + ah->qplib_ah.traffic_class = grh->traffic_class; + ah->qplib_ah.flow_label = grh->flow_label; + ah->qplib_ah.hop_limit = grh->hop_limit; + ah->qplib_ah.sl = rdma_ah_get_sl(ah_attr); + if (ib_pd->uobject && + !rdma_is_multicast_addr((struct in6_addr *) + grh->dgid.raw) && + !rdma_link_local_addr((struct in6_addr *) + grh->dgid.raw)) { + union ib_gid sgid; + + rc = ib_get_cached_gid(&rdev->ibdev, 1, + grh->sgid_index, &sgid, + &sgid_attr); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to query gid at index %d", + grh->sgid_index); + goto fail; + } + dev_put(sgid_attr.ndev); + /* Get network header type for this GID */ + nw_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + switch (nw_type) { + case RDMA_NETWORK_IPV4: + ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4; + break; + case RDMA_NETWORK_IPV6: + ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV6; + break; + default: + ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V1; + break; + } + } + + memcpy(ah->qplib_ah.dmac, ah_attr->roce.dmac, ETH_ALEN); + rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to allocate HW AH"); + goto fail; + } + + /* Write AVID to shared page. */ + if (ib_pd->uobject) { + struct ib_ucontext *ib_uctx = ib_pd->uobject->context; + struct bnxt_re_ucontext *uctx; + unsigned long flag; + u32 *wrptr; + + uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); + spin_lock_irqsave(&uctx->sh_lock, flag); + wrptr = (u32 *)(uctx->shpg + BNXT_RE_AVID_OFFT); + *wrptr = ah->qplib_ah.id; + wmb(); /* make sure cache is updated. */ + spin_unlock_irqrestore(&uctx->sh_lock, flag); + } + + return &ah->ib_ah; + +fail: + kfree(ah); + return ERR_PTR(rc); +} + +int bnxt_re_modify_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr) +{ + return 0; +} + +int bnxt_re_query_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr) +{ + struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah); + + ah_attr->type = ib_ah->type; + rdma_ah_set_sl(ah_attr, ah->qplib_ah.sl); + memcpy(ah_attr->roce.dmac, ah->qplib_ah.dmac, ETH_ALEN); + rdma_ah_set_grh(ah_attr, NULL, 0, + ah->qplib_ah.host_sgid_index, + 0, ah->qplib_ah.traffic_class); + rdma_ah_set_dgid_raw(ah_attr, ah->qplib_ah.dgid.data); + rdma_ah_set_port_num(ah_attr, 1); + rdma_ah_set_static_rate(ah_attr, 0); + return 0; +} + +unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp) + __acquires(&qp->scq->cq_lock) __acquires(&qp->rcq->cq_lock) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->scq->cq_lock, flags); + if (qp->rcq != qp->scq) + spin_lock(&qp->rcq->cq_lock); + else + __acquire(&qp->rcq->cq_lock); + + return flags; +} + +void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp, + unsigned long flags) + __releases(&qp->scq->cq_lock) __releases(&qp->rcq->cq_lock) +{ + if (qp->rcq != qp->scq) + spin_unlock(&qp->rcq->cq_lock); + else + __release(&qp->rcq->cq_lock); + spin_unlock_irqrestore(&qp->scq->cq_lock, flags); +} + +/* Queue Pairs */ +int bnxt_re_destroy_qp(struct ib_qp *ib_qp) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_re_dev *rdev = qp->rdev; + int rc; + unsigned int flags; + + bnxt_qplib_flush_cqn_wq(&qp->qplib_qp); + rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to destroy HW QP"); + return rc; + } + + flags = bnxt_re_lock_cqs(qp); + 
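	/*
	 * bnxt_re_lock_cqs() above takes scq->cq_lock with IRQs saved and
	 * nests rcq->cq_lock only when the send and receive CQs are
	 * distinct objects; the __acquire()/__release() stubs keep sparse's
	 * lock-context annotations balanced when both point at the same CQ.
	 * Holding both locks here keeps CQ polling from racing with the
	 * CQE cleanup done by bnxt_qplib_clean_qp() just below.
	 */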
bnxt_qplib_clean_qp(&qp->qplib_qp); + bnxt_re_unlock_cqs(qp, flags); + bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp); + + if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) { + rc = bnxt_qplib_destroy_ah(&rdev->qplib_res, + &rdev->sqp_ah->qplib_ah); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to destroy HW AH for shadow QP"); + return rc; + } + + bnxt_qplib_clean_qp(&qp->qplib_qp); + rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, + &rdev->qp1_sqp->qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to destroy Shadow QP"); + return rc; + } + mutex_lock(&rdev->qp_lock); + list_del(&rdev->qp1_sqp->list); + atomic_dec(&rdev->qp_count); + mutex_unlock(&rdev->qp_lock); + + kfree(rdev->sqp_ah); + kfree(rdev->qp1_sqp); + rdev->qp1_sqp = NULL; + rdev->sqp_ah = NULL; + } + + if (!IS_ERR_OR_NULL(qp->rumem)) + ib_umem_release(qp->rumem); + if (!IS_ERR_OR_NULL(qp->sumem)) + ib_umem_release(qp->sumem); + + mutex_lock(&rdev->qp_lock); + list_del(&qp->list); + atomic_dec(&rdev->qp_count); + mutex_unlock(&rdev->qp_lock); + kfree(qp); + return 0; +} + +static u8 __from_ib_qp_type(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_GSI: + return CMDQ_CREATE_QP1_TYPE_GSI; + case IB_QPT_RC: + return CMDQ_CREATE_QP_TYPE_RC; + case IB_QPT_UD: + return CMDQ_CREATE_QP_TYPE_UD; + default: + return IB_QPT_MAX; + } +} + +static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, + struct bnxt_re_qp *qp, struct ib_udata *udata) +{ + struct bnxt_re_qp_req ureq; + struct bnxt_qplib_qp *qplib_qp = &qp->qplib_qp; + struct ib_umem *umem; + int bytes = 0; + struct ib_ucontext *context = pd->ib_pd.uobject->context; + struct bnxt_re_ucontext *cntx = container_of(context, + struct bnxt_re_ucontext, + ib_uctx); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return -EFAULT; + + bytes = (qplib_qp->sq.max_wqe * BNXT_QPLIB_MAX_SQE_ENTRY_SIZE); + /* Consider mapping PSN search memory only for RC QPs. 
*/ + if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) + bytes += (qplib_qp->sq.max_wqe * sizeof(struct sq_psn_search)); + bytes = PAGE_ALIGN(bytes); + umem = ib_umem_get(context, ureq.qpsva, bytes, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) + return PTR_ERR(umem); + + qp->sumem = umem; + qplib_qp->sq.sglist = umem->sg_head.sgl; + qplib_qp->sq.nmap = umem->nmap; + qplib_qp->qp_handle = ureq.qp_handle; + + if (!qp->qplib_qp.srq) { + bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + bytes = PAGE_ALIGN(bytes); + umem = ib_umem_get(context, ureq.qprva, bytes, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) + goto rqfail; + qp->rumem = umem; + qplib_qp->rq.sglist = umem->sg_head.sgl; + qplib_qp->rq.nmap = umem->nmap; + } + + qplib_qp->dpi = &cntx->dpi; + return 0; +rqfail: + ib_umem_release(qp->sumem); + qp->sumem = NULL; + qplib_qp->sq.sglist = NULL; + qplib_qp->sq.nmap = 0; + + return PTR_ERR(umem); +} + +static struct bnxt_re_ah *bnxt_re_create_shadow_qp_ah + (struct bnxt_re_pd *pd, + struct bnxt_qplib_res *qp1_res, + struct bnxt_qplib_qp *qp1_qp) +{ + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_ah *ah; + union ib_gid sgid; + int rc; + + ah = kzalloc(sizeof(*ah), GFP_KERNEL); + if (!ah) + return NULL; + + ah->rdev = rdev; + ah->qplib_ah.pd = &pd->qplib_pd; + + rc = bnxt_re_query_gid(&rdev->ibdev, 1, 0, &sgid); + if (rc) + goto fail; + + /* supply the dgid data same as sgid */ + memcpy(ah->qplib_ah.dgid.data, &sgid.raw, + sizeof(union ib_gid)); + ah->qplib_ah.sgid_index = 0; + + ah->qplib_ah.traffic_class = 0; + ah->qplib_ah.flow_label = 0; + ah->qplib_ah.hop_limit = 1; + ah->qplib_ah.sl = 0; + /* Have DMAC same as SMAC */ + ether_addr_copy(ah->qplib_ah.dmac, rdev->netdev->dev_addr); + + rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to allocate HW AH for Shadow QP"); + goto fail; + } + + return ah; + +fail: + kfree(ah); + return NULL; +} + +static struct bnxt_re_qp *bnxt_re_create_shadow_qp + (struct bnxt_re_pd *pd, + struct bnxt_qplib_res *qp1_res, + struct bnxt_qplib_qp *qp1_qp) +{ + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_qp *qp; + int rc; + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return NULL; + + qp->rdev = rdev; + + /* Initialize the shadow QP structure from the QP1 values */ + ether_addr_copy(qp->qplib_qp.smac, rdev->netdev->dev_addr); + + qp->qplib_qp.pd = &pd->qplib_pd; + qp->qplib_qp.qp_handle = (u64)(unsigned long)(&qp->qplib_qp); + qp->qplib_qp.type = IB_QPT_UD; + + qp->qplib_qp.max_inline_data = 0; + qp->qplib_qp.sig_type = true; + + /* Shadow QP SQ depth should be same as QP1 RQ depth */ + qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.sq.max_sge = 2; + /* Q full delta can be 1 since it is internal QP */ + qp->qplib_qp.sq.q_full_delta = 1; + + qp->qplib_qp.scq = qp1_qp->scq; + qp->qplib_qp.rcq = qp1_qp->rcq; + + qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge; + /* Q full delta can be 1 since it is internal QP */ + qp->qplib_qp.rq.q_full_delta = 1; + + qp->qplib_qp.mtu = qp1_qp->mtu; + + qp->qplib_qp.sq_hdr_buf_size = 0; + qp->qplib_qp.rq_hdr_buf_size = BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV6; + qp->qplib_qp.dpi = &rdev->dpi_privileged; + + rc = bnxt_qplib_create_qp(qp1_res, &qp->qplib_qp); + if (rc) + goto fail; + + rdev->sqp_id = qp->qplib_qp.id; + + spin_lock_init(&qp->sq_lock); + INIT_LIST_HEAD(&qp->list); + mutex_lock(&rdev->qp_lock); + list_add_tail(&qp->list, &rdev->qp_list); + 
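	/*
	 * The shadow QP is tracked in rdev->qp_list and qp_count like any
	 * other QP; bnxt_re_destroy_qp() drops it from the list under the
	 * same qp_lock when the real QP1 is torn down.
	 */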
atomic_inc(&rdev->qp_count); + mutex_unlock(&rdev->qp_lock); + return qp; +fail: + kfree(qp); + return NULL; +} + +struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_re_qp *qp; + struct bnxt_re_cq *cq; + struct bnxt_re_srq *srq; + int rc, entries; + + if ((qp_init_attr->cap.max_send_wr > dev_attr->max_qp_wqes) || + (qp_init_attr->cap.max_recv_wr > dev_attr->max_qp_wqes) || + (qp_init_attr->cap.max_send_sge > dev_attr->max_qp_sges) || + (qp_init_attr->cap.max_recv_sge > dev_attr->max_qp_sges) || + (qp_init_attr->cap.max_inline_data > dev_attr->max_inline_data)) + return ERR_PTR(-EINVAL); + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->rdev = rdev; + ether_addr_copy(qp->qplib_qp.smac, rdev->netdev->dev_addr); + qp->qplib_qp.pd = &pd->qplib_pd; + qp->qplib_qp.qp_handle = (u64)(unsigned long)(&qp->qplib_qp); + qp->qplib_qp.type = __from_ib_qp_type(qp_init_attr->qp_type); + if (qp->qplib_qp.type == IB_QPT_MAX) { + dev_err(rdev_to_dev(rdev), "QP type 0x%x not supported", + qp->qplib_qp.type); + rc = -EINVAL; + goto fail; + } + qp->qplib_qp.max_inline_data = qp_init_attr->cap.max_inline_data; + qp->qplib_qp.sig_type = ((qp_init_attr->sq_sig_type == + IB_SIGNAL_ALL_WR) ? true : false); + + qp->qplib_qp.sq.max_sge = qp_init_attr->cap.max_send_sge; + if (qp->qplib_qp.sq.max_sge > dev_attr->max_qp_sges) + qp->qplib_qp.sq.max_sge = dev_attr->max_qp_sges; + + if (qp_init_attr->send_cq) { + cq = container_of(qp_init_attr->send_cq, struct bnxt_re_cq, + ib_cq); + if (!cq) { + dev_err(rdev_to_dev(rdev), "Send CQ not found"); + rc = -EINVAL; + goto fail; + } + qp->qplib_qp.scq = &cq->qplib_cq; + qp->scq = cq; + } + + if (qp_init_attr->recv_cq) { + cq = container_of(qp_init_attr->recv_cq, struct bnxt_re_cq, + ib_cq); + if (!cq) { + dev_err(rdev_to_dev(rdev), "Receive CQ not found"); + rc = -EINVAL; + goto fail; + } + qp->qplib_qp.rcq = &cq->qplib_cq; + qp->rcq = cq; + } + + if (qp_init_attr->srq) { + srq = container_of(qp_init_attr->srq, struct bnxt_re_srq, + ib_srq); + if (!srq) { + dev_err(rdev_to_dev(rdev), "SRQ not found"); + rc = -EINVAL; + goto fail; + } + qp->qplib_qp.srq = &srq->qplib_srq; + qp->qplib_qp.rq.max_wqe = 0; + } else { + /* Allocate 1 more than what's provided so posting max doesn't + * mean empty + */ + entries = roundup_pow_of_two(qp_init_attr->cap.max_recv_wr + 1); + qp->qplib_qp.rq.max_wqe = min_t(u32, entries, + dev_attr->max_qp_wqes + 1); + + qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe - + qp_init_attr->cap.max_recv_wr; + + qp->qplib_qp.rq.max_sge = qp_init_attr->cap.max_recv_sge; + if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges) + qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges; + } + + qp->qplib_qp.mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); + + if (qp_init_attr->qp_type == IB_QPT_GSI) { + /* Allocate 1 more than what's provided */ + entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1); + qp->qplib_qp.sq.max_wqe = min_t(u32, entries, + dev_attr->max_qp_wqes + 1); + qp->qplib_qp.sq.q_full_delta = qp->qplib_qp.sq.max_wqe - + qp_init_attr->cap.max_send_wr; + qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges; + if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges) + qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges; + qp->qplib_qp.sq.max_sge++; + if 
(qp->qplib_qp.sq.max_sge > dev_attr->max_qp_sges) + qp->qplib_qp.sq.max_sge = dev_attr->max_qp_sges; + + qp->qplib_qp.rq_hdr_buf_size = + BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2; + + qp->qplib_qp.sq_hdr_buf_size = + BNXT_QPLIB_MAX_QP1_SQ_HDR_SIZE_V2; + qp->qplib_qp.dpi = &rdev->dpi_privileged; + rc = bnxt_qplib_create_qp1(&rdev->qplib_res, &qp->qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to create HW QP1"); + goto fail; + } + /* Create a shadow QP to handle the QP1 traffic */ + rdev->qp1_sqp = bnxt_re_create_shadow_qp(pd, &rdev->qplib_res, + &qp->qplib_qp); + if (!rdev->qp1_sqp) { + rc = -EINVAL; + dev_err(rdev_to_dev(rdev), + "Failed to create Shadow QP for QP1"); + goto qp_destroy; + } + rdev->sqp_ah = bnxt_re_create_shadow_qp_ah(pd, &rdev->qplib_res, + &qp->qplib_qp); + if (!rdev->sqp_ah) { + bnxt_qplib_destroy_qp(&rdev->qplib_res, + &rdev->qp1_sqp->qplib_qp); + rc = -EINVAL; + dev_err(rdev_to_dev(rdev), + "Failed to create AH entry for ShadowQP"); + goto qp_destroy; + } + + } else { + /* Allocate 128 + 1 more than what's provided */ + entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + + BNXT_QPLIB_RESERVED_QP_WRS + 1); + qp->qplib_qp.sq.max_wqe = min_t(u32, entries, + dev_attr->max_qp_wqes + + BNXT_QPLIB_RESERVED_QP_WRS + 1); + qp->qplib_qp.sq.q_full_delta = BNXT_QPLIB_RESERVED_QP_WRS + 1; + + /* + * Reserving one slot for Phantom WQE. Application can + * post one extra entry in this case. But allowing this to avoid + * unexpected Queue full condition + */ + + qp->qplib_qp.sq.q_full_delta -= 1; + + qp->qplib_qp.max_rd_atomic = dev_attr->max_qp_rd_atom; + qp->qplib_qp.max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom; + if (udata) { + rc = bnxt_re_init_user_qp(rdev, pd, qp, udata); + if (rc) + goto fail; + } else { + qp->qplib_qp.dpi = &rdev->dpi_privileged; + } + + rc = bnxt_qplib_create_qp(&rdev->qplib_res, &qp->qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to create HW QP"); + goto free_umem; + } + } + + qp->ib_qp.qp_num = qp->qplib_qp.id; + spin_lock_init(&qp->sq_lock); + spin_lock_init(&qp->rq_lock); + + if (udata) { + struct bnxt_re_qp_resp resp; + + resp.qpid = qp->ib_qp.qp_num; + resp.rsvd = 0; + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to copy QP udata"); + goto qp_destroy; + } + } + INIT_LIST_HEAD(&qp->list); + mutex_lock(&rdev->qp_lock); + list_add_tail(&qp->list, &rdev->qp_list); + atomic_inc(&rdev->qp_count); + mutex_unlock(&rdev->qp_lock); + + return &qp->ib_qp; +qp_destroy: + bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); +free_umem: + if (udata) { + if (qp->rumem) + ib_umem_release(qp->rumem); + if (qp->sumem) + ib_umem_release(qp->sumem); + } +fail: + kfree(qp); + return ERR_PTR(rc); +} + +static u8 __from_ib_qp_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return CMDQ_MODIFY_QP_NEW_STATE_RESET; + case IB_QPS_INIT: + return CMDQ_MODIFY_QP_NEW_STATE_INIT; + case IB_QPS_RTR: + return CMDQ_MODIFY_QP_NEW_STATE_RTR; + case IB_QPS_RTS: + return CMDQ_MODIFY_QP_NEW_STATE_RTS; + case IB_QPS_SQD: + return CMDQ_MODIFY_QP_NEW_STATE_SQD; + case IB_QPS_SQE: + return CMDQ_MODIFY_QP_NEW_STATE_SQE; + case IB_QPS_ERR: + default: + return CMDQ_MODIFY_QP_NEW_STATE_ERR; + } +} + +static enum ib_qp_state __to_ib_qp_state(u8 state) +{ + switch (state) { + case CMDQ_MODIFY_QP_NEW_STATE_RESET: + return IB_QPS_RESET; + case CMDQ_MODIFY_QP_NEW_STATE_INIT: + return IB_QPS_INIT; + case CMDQ_MODIFY_QP_NEW_STATE_RTR: + return IB_QPS_RTR; + case 
CMDQ_MODIFY_QP_NEW_STATE_RTS: + return IB_QPS_RTS; + case CMDQ_MODIFY_QP_NEW_STATE_SQD: + return IB_QPS_SQD; + case CMDQ_MODIFY_QP_NEW_STATE_SQE: + return IB_QPS_SQE; + case CMDQ_MODIFY_QP_NEW_STATE_ERR: + default: + return IB_QPS_ERR; + } +} + +static u32 __from_ib_mtu(enum ib_mtu mtu) +{ + switch (mtu) { + case IB_MTU_256: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_256; + case IB_MTU_512: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_512; + case IB_MTU_1024: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_1024; + case IB_MTU_2048: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_2048; + case IB_MTU_4096: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_4096; + default: + return CMDQ_MODIFY_QP_PATH_MTU_MTU_2048; + } +} + +static enum ib_mtu __to_ib_mtu(u32 mtu) +{ + switch (mtu & CREQ_QUERY_QP_RESP_SB_PATH_MTU_MASK) { + case CMDQ_MODIFY_QP_PATH_MTU_MTU_256: + return IB_MTU_256; + case CMDQ_MODIFY_QP_PATH_MTU_MTU_512: + return IB_MTU_512; + case CMDQ_MODIFY_QP_PATH_MTU_MTU_1024: + return IB_MTU_1024; + case CMDQ_MODIFY_QP_PATH_MTU_MTU_2048: + return IB_MTU_2048; + case CMDQ_MODIFY_QP_PATH_MTU_MTU_4096: + return IB_MTU_4096; + default: + return IB_MTU_2048; + } +} + +/* Shared Receive Queues */ +int bnxt_re_destroy_srq(struct ib_srq *ib_srq) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_re_dev *rdev = srq->rdev; + struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq; + struct bnxt_qplib_nq *nq = NULL; + int rc; + + if (qplib_srq->cq) + nq = qplib_srq->cq->nq; + rc = bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Destroy HW SRQ failed!"); + return rc; + } + + if (srq->umem) + ib_umem_release(srq->umem); + kfree(srq); + atomic_dec(&rdev->srq_count); + if (nq) + nq->budget--; + return 0; +} + +static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev, + struct bnxt_re_pd *pd, + struct bnxt_re_srq *srq, + struct ib_udata *udata) +{ + struct bnxt_re_srq_req ureq; + struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq; + struct ib_umem *umem; + int bytes = 0; + struct ib_ucontext *context = pd->ib_pd.uobject->context; + struct bnxt_re_ucontext *cntx = container_of(context, + struct bnxt_re_ucontext, + ib_uctx); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return -EFAULT; + + bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + bytes = PAGE_ALIGN(bytes); + umem = ib_umem_get(context, ureq.srqva, bytes, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) + return PTR_ERR(umem); + + srq->umem = umem; + qplib_srq->nmap = umem->nmap; + qplib_srq->sglist = umem->sg_head.sgl; + qplib_srq->srq_handle = ureq.srq_handle; + qplib_srq->dpi = &cntx->dpi; + + return 0; +} + +struct ib_srq *bnxt_re_create_srq(struct ib_pd *ib_pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_re_srq *srq; + struct bnxt_qplib_nq *nq = NULL; + int rc, entries; + + if (srq_init_attr->attr.max_wr >= dev_attr->max_srq_wqes) { + dev_err(rdev_to_dev(rdev), "Create CQ failed - max exceeded"); + rc = -EINVAL; + goto exit; + } + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + rc = -ENOTSUPP; + goto exit; + } + + srq = kzalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + rc = -ENOMEM; + goto exit; + } + srq->rdev = rdev; + srq->qplib_srq.pd = &pd->qplib_pd; + srq->qplib_srq.dpi = &rdev->dpi_privileged; + /* Allocate 1 more than what's provided so posting max doesn't + * 
mean empty + */ + entries = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1); + if (entries > dev_attr->max_srq_wqes + 1) + entries = dev_attr->max_srq_wqes + 1; + + srq->qplib_srq.max_wqe = entries; + srq->qplib_srq.max_sge = srq_init_attr->attr.max_sge; + srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit; + srq->srq_limit = srq_init_attr->attr.srq_limit; + srq->qplib_srq.eventq_hw_ring_id = rdev->nq[0].ring_id; + nq = &rdev->nq[0]; + + if (udata) { + rc = bnxt_re_init_user_srq(rdev, pd, srq, udata); + if (rc) + goto fail; + } + + rc = bnxt_qplib_create_srq(&rdev->qplib_res, &srq->qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Create HW SRQ failed!"); + goto fail; + } + + if (udata) { + struct bnxt_re_srq_resp resp; + + resp.srqid = srq->qplib_srq.id; + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), "SRQ copy to udata failed!"); + bnxt_qplib_destroy_srq(&rdev->qplib_res, + &srq->qplib_srq); + goto exit; + } + } + if (nq) + nq->budget++; + atomic_inc(&rdev->srq_count); + + return &srq->ib_srq; + +fail: + if (srq->umem) + ib_umem_release(srq->umem); + kfree(srq); +exit: + return ERR_PTR(rc); +} + +int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_re_dev *rdev = srq->rdev; + int rc; + + switch (srq_attr_mask) { + case IB_SRQ_MAX_WR: + /* SRQ resize is not supported */ + break; + case IB_SRQ_LIMIT: + /* Change the SRQ threshold */ + if (srq_attr->srq_limit > srq->qplib_srq.max_wqe) + return -EINVAL; + + srq->qplib_srq.threshold = srq_attr->srq_limit; + rc = bnxt_qplib_modify_srq(&rdev->qplib_res, &srq->qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Modify HW SRQ failed!"); + return rc; + } + /* On success, update the shadow */ + srq->srq_limit = srq_attr->srq_limit; + /* No need to Build and send response back to udata */ + break; + default: + dev_err(rdev_to_dev(rdev), + "Unsupported srq_attr_mask 0x%x", srq_attr_mask); + return -EINVAL; + } + return 0; +} + +int bnxt_re_query_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_re_srq tsrq; + struct bnxt_re_dev *rdev = srq->rdev; + int rc; + + /* Get live SRQ attr */ + tsrq.qplib_srq.id = srq->qplib_srq.id; + rc = bnxt_qplib_query_srq(&rdev->qplib_res, &tsrq.qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Query HW SRQ failed!"); + return rc; + } + srq_attr->max_wr = srq->qplib_srq.max_wqe; + srq_attr->max_sge = srq->qplib_srq.max_sge; + srq_attr->srq_limit = tsrq.qplib_srq.threshold; + + return 0; +} + +int bnxt_re_post_srq_recv(struct ib_srq *ib_srq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_qplib_swqe wqe; + unsigned long flags; + int rc = 0; + + spin_lock_irqsave(&srq->lock, flags); + while (wr) { + /* Transcribe each ib_recv_wr to qplib_swqe */ + wqe.num_sge = wr->num_sge; + bnxt_re_build_sgl(wr->sg_list, wqe.sg_list, wr->num_sge); + wqe.wr_id = wr->wr_id; + wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV; + + rc = bnxt_qplib_post_srq_recv(&srq->qplib_srq, &wqe); + if (rc) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + spin_unlock_irqrestore(&srq->lock, flags); + + return rc; +} +static int bnxt_re_modify_shadow_qp(struct bnxt_re_dev *rdev, + struct bnxt_re_qp *qp1_qp, + int 
qp_attr_mask) +{ + struct bnxt_re_qp *qp = rdev->qp1_sqp; + int rc = 0; + + if (qp_attr_mask & IB_QP_STATE) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_STATE; + qp->qplib_qp.state = qp1_qp->qplib_qp.state; + } + if (qp_attr_mask & IB_QP_PKEY_INDEX) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; + qp->qplib_qp.pkey_index = qp1_qp->qplib_qp.pkey_index; + } + + if (qp_attr_mask & IB_QP_QKEY) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_QKEY; + /* Using a Random QKEY */ + qp->qplib_qp.qkey = 0x81818181; + } + if (qp_attr_mask & IB_QP_SQ_PSN) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_SQ_PSN; + qp->qplib_qp.sq.psn = qp1_qp->qplib_qp.sq.psn; + } + + rc = bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp); + if (rc) + dev_err(rdev_to_dev(rdev), + "Failed to modify Shadow QP for QP1"); + return rc; +} + +int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_re_dev *rdev = qp->rdev; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + enum ib_qp_state curr_qp_state, new_qp_state; + int rc, entries; + int status; + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + unsigned int flags; + u8 nw_type; + + qp->qplib_qp.modify_flags = 0; + if (qp_attr_mask & IB_QP_STATE) { + curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state); + new_qp_state = qp_attr->qp_state; + if (!ib_modify_qp_is_ok(curr_qp_state, new_qp_state, + ib_qp->qp_type, qp_attr_mask, + IB_LINK_LAYER_ETHERNET)) { + dev_err(rdev_to_dev(rdev), + "Invalid attribute mask: %#x specified ", + qp_attr_mask); + dev_err(rdev_to_dev(rdev), + "for qpn: %#x type: %#x", + ib_qp->qp_num, ib_qp->qp_type); + dev_err(rdev_to_dev(rdev), + "curr_qp_state=0x%x, new_qp_state=0x%x\n", + curr_qp_state, new_qp_state); + return -EINVAL; + } + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_STATE; + qp->qplib_qp.state = __from_ib_qp_state(qp_attr->qp_state); + + if (!qp->sumem && + qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { + dev_dbg(rdev_to_dev(rdev), + "Move QP = %p to flush list\n", + qp); + flags = bnxt_re_lock_cqs(qp); + bnxt_qplib_add_flush_qp(&qp->qplib_qp); + bnxt_re_unlock_cqs(qp, flags); + } + if (!qp->sumem && + qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_RESET) { + dev_dbg(rdev_to_dev(rdev), + "Move QP = %p out of flush list\n", + qp); + flags = bnxt_re_lock_cqs(qp); + bnxt_qplib_clean_qp(&qp->qplib_qp); + bnxt_re_unlock_cqs(qp, flags); + } + } + if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_EN_SQD_ASYNC_NOTIFY; + qp->qplib_qp.en_sqd_async_notify = true; + } + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS; + qp->qplib_qp.access = + __from_ib_access_flags(qp_attr->qp_access_flags); + /* LOCAL_WRITE access must be set to allow RC receive */ + qp->qplib_qp.access |= BNXT_QPLIB_ACCESS_LOCAL_WRITE; + } + if (qp_attr_mask & IB_QP_PKEY_INDEX) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; + qp->qplib_qp.pkey_index = qp_attr->pkey_index; + } + if (qp_attr_mask & IB_QP_QKEY) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_QKEY; + qp->qplib_qp.qkey = qp_attr->qkey; + } + if (qp_attr_mask & IB_QP_AV) { + const struct ib_global_route *grh = + rdma_ah_read_grh(&qp_attr->ah_attr); + + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_DGID | + 
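bnxt_re_modify_qp and the shadow-QP helper above translate each bit the caller sets in qp_attr_mask into a firmware modify-mask bit plus a copied value, so the CMDQ request only touches attributes that were actually supplied. The standalone mirror below shows that pattern with illustrative flag values; the ATTR_*/MOD_* constants are placeholders, not the real IB_QP_* or CMDQ_MODIFY_QP_* encodings.

#include <stdint.h>
#include <stdio.h>

#define ATTR_STATE  (1u << 0)
#define ATTR_QKEY   (1u << 1)
#define ATTR_SQ_PSN (1u << 2)

#define MOD_STATE_BIT  (1u << 0)        /* placeholder for ..._MASK_STATE */
#define MOD_QKEY_BIT   (1u << 1)        /* placeholder for ..._MASK_QKEY */
#define MOD_SQ_PSN_BIT (1u << 2)        /* placeholder for ..._MASK_SQ_PSN */

struct fake_qp {
        uint32_t modify_flags;
        uint32_t state, qkey, sq_psn;
};

static void apply_attrs(struct fake_qp *qp, uint32_t attr_mask,
                        uint32_t state, uint32_t qkey, uint32_t sq_psn)
{
        qp->modify_flags = 0;
        if (attr_mask & ATTR_STATE) {
                qp->modify_flags |= MOD_STATE_BIT;
                qp->state = state;
        }
        if (attr_mask & ATTR_QKEY) {
                qp->modify_flags |= MOD_QKEY_BIT;
                qp->qkey = qkey;
        }
        if (attr_mask & ATTR_SQ_PSN) {
                qp->modify_flags |= MOD_SQ_PSN_BIT;
                qp->sq_psn = sq_psn;
        }
}

int main(void)
{
        struct fake_qp qp = { 0 };

        apply_attrs(&qp, ATTR_STATE | ATTR_SQ_PSN, 3, 0, 0x100);
        printf("modify_flags=0x%x\n", qp.modify_flags); /* prints 0x5 */
        return 0;
}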
CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL | + CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX | + CMDQ_MODIFY_QP_MODIFY_MASK_HOP_LIMIT | + CMDQ_MODIFY_QP_MODIFY_MASK_TRAFFIC_CLASS | + CMDQ_MODIFY_QP_MODIFY_MASK_DEST_MAC | + CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID; + memcpy(qp->qplib_qp.ah.dgid.data, grh->dgid.raw, + sizeof(qp->qplib_qp.ah.dgid.data)); + qp->qplib_qp.ah.flow_label = grh->flow_label; + /* If RoCE V2 is enabled, stack will have two entries for + * each GID entry. Avoiding this duplicte entry in HW. Dividing + * the GID index by 2 for RoCE V2 + */ + qp->qplib_qp.ah.sgid_index = grh->sgid_index / 2; + qp->qplib_qp.ah.host_sgid_index = grh->sgid_index; + qp->qplib_qp.ah.hop_limit = grh->hop_limit; + qp->qplib_qp.ah.traffic_class = grh->traffic_class; + qp->qplib_qp.ah.sl = rdma_ah_get_sl(&qp_attr->ah_attr); + ether_addr_copy(qp->qplib_qp.ah.dmac, + qp_attr->ah_attr.roce.dmac); + + status = ib_get_cached_gid(&rdev->ibdev, 1, + grh->sgid_index, + &sgid, &sgid_attr); + if (!status) { + memcpy(qp->qplib_qp.smac, sgid_attr.ndev->dev_addr, + ETH_ALEN); + dev_put(sgid_attr.ndev); + nw_type = ib_gid_to_network_type(sgid_attr.gid_type, + &sgid); + switch (nw_type) { + case RDMA_NETWORK_IPV4: + qp->qplib_qp.nw_type = + CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV4; + break; + case RDMA_NETWORK_IPV6: + qp->qplib_qp.nw_type = + CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV6; + break; + default: + qp->qplib_qp.nw_type = + CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV1; + break; + } + } + } + + if (qp_attr_mask & IB_QP_PATH_MTU) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->qplib_qp.path_mtu = __from_ib_mtu(qp_attr->path_mtu); + qp->qplib_qp.mtu = ib_mtu_enum_to_int(qp_attr->path_mtu); + } else if (qp_attr->qp_state == IB_QPS_RTR) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->qplib_qp.path_mtu = + __from_ib_mtu(iboe_get_mtu(rdev->netdev->mtu)); + qp->qplib_qp.mtu = + ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); + } + + if (qp_attr_mask & IB_QP_TIMEOUT) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_TIMEOUT; + qp->qplib_qp.timeout = qp_attr->timeout; + } + if (qp_attr_mask & IB_QP_RETRY_CNT) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_RETRY_CNT; + qp->qplib_qp.retry_cnt = qp_attr->retry_cnt; + } + if (qp_attr_mask & IB_QP_RNR_RETRY) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_RNR_RETRY; + qp->qplib_qp.rnr_retry = qp_attr->rnr_retry; + } + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER; + qp->qplib_qp.min_rnr_timer = qp_attr->min_rnr_timer; + } + if (qp_attr_mask & IB_QP_RQ_PSN) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_RQ_PSN; + qp->qplib_qp.rq.psn = qp_attr->rq_psn; + } + if (qp_attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_MAX_RD_ATOMIC; + /* Cap the max_rd_atomic to device max */ + qp->qplib_qp.max_rd_atomic = min_t(u32, qp_attr->max_rd_atomic, + dev_attr->max_qp_rd_atom); + } + if (qp_attr_mask & IB_QP_SQ_PSN) { + qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_SQ_PSN; + qp->qplib_qp.sq.psn = qp_attr->sq_psn; + } + if (qp_attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (qp_attr->max_dest_rd_atomic > + dev_attr->max_qp_init_rd_atom) { + dev_err(rdev_to_dev(rdev), + "max_dest_rd_atomic requested%d is > dev_max%d", + qp_attr->max_dest_rd_atomic, + dev_attr->max_qp_init_rd_atom); + return -EINVAL; + } + + qp->qplib_qp.modify_flags |= + 
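As the comment above notes, with RoCE v2 enabled the core GID table carries two entries (a v1 and a v2 flavour) per address, so the driver halves grh->sgid_index before programming the hardware while keeping the original index for later queries. A tiny illustration of that index mapping:

#include <stdio.h>

/* Each address occupies a (RoCE v1, RoCE v2) pair in the stack's GID
 * table, so the hardware table has half as many entries.
 */
static unsigned int hw_sgid_index(unsigned int stack_sgid_index)
{
        return stack_sgid_index / 2;
}

int main(void)
{
        /* stack indices 4 and 5 (two flavours of one GID) -> HW index 2 */
        printf("%u %u\n", hw_sgid_index(4), hw_sgid_index(5));
        return 0;
}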
CMDQ_MODIFY_QP_MODIFY_MASK_MAX_DEST_RD_ATOMIC; + qp->qplib_qp.max_dest_rd_atomic = qp_attr->max_dest_rd_atomic; + } + if (qp_attr_mask & IB_QP_CAP) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_SQ_SIZE | + CMDQ_MODIFY_QP_MODIFY_MASK_RQ_SIZE | + CMDQ_MODIFY_QP_MODIFY_MASK_SQ_SGE | + CMDQ_MODIFY_QP_MODIFY_MASK_RQ_SGE | + CMDQ_MODIFY_QP_MODIFY_MASK_MAX_INLINE_DATA; + if ((qp_attr->cap.max_send_wr >= dev_attr->max_qp_wqes) || + (qp_attr->cap.max_recv_wr >= dev_attr->max_qp_wqes) || + (qp_attr->cap.max_send_sge >= dev_attr->max_qp_sges) || + (qp_attr->cap.max_recv_sge >= dev_attr->max_qp_sges) || + (qp_attr->cap.max_inline_data >= + dev_attr->max_inline_data)) { + dev_err(rdev_to_dev(rdev), + "Create QP failed - max exceeded"); + return -EINVAL; + } + entries = roundup_pow_of_two(qp_attr->cap.max_send_wr); + qp->qplib_qp.sq.max_wqe = min_t(u32, entries, + dev_attr->max_qp_wqes + 1); + qp->qplib_qp.sq.q_full_delta = qp->qplib_qp.sq.max_wqe - + qp_attr->cap.max_send_wr; + /* + * Reserving one slot for Phantom WQE. Some application can + * post one extra entry in this case. Allowing this to avoid + * unexpected Queue full condition + */ + qp->qplib_qp.sq.q_full_delta -= 1; + qp->qplib_qp.sq.max_sge = qp_attr->cap.max_send_sge; + if (qp->qplib_qp.rq.max_wqe) { + entries = roundup_pow_of_two(qp_attr->cap.max_recv_wr); + qp->qplib_qp.rq.max_wqe = + min_t(u32, entries, dev_attr->max_qp_wqes + 1); + qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe - + qp_attr->cap.max_recv_wr; + qp->qplib_qp.rq.max_sge = qp_attr->cap.max_recv_sge; + } else { + /* SRQ was used prior, just ignore the RQ caps */ + } + } + if (qp_attr_mask & IB_QP_DEST_QPN) { + qp->qplib_qp.modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID; + qp->qplib_qp.dest_qpn = qp_attr->dest_qp_num; + } + rc = bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to modify HW QP"); + return rc; + } + if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) + rc = bnxt_re_modify_shadow_qp(rdev, qp, qp_attr_mask); + return rc; +} + +int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_re_dev *rdev = qp->rdev; + struct bnxt_qplib_qp *qplib_qp; + int rc; + + qplib_qp = kzalloc(sizeof(*qplib_qp), GFP_KERNEL); + if (!qplib_qp) + return -ENOMEM; + + qplib_qp->id = qp->qplib_qp.id; + qplib_qp->ah.host_sgid_index = qp->qplib_qp.ah.host_sgid_index; + + rc = bnxt_qplib_query_qp(&rdev->qplib_res, qplib_qp); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to query HW QP"); + goto out; + } + qp_attr->qp_state = __to_ib_qp_state(qplib_qp->state); + qp_attr->en_sqd_async_notify = qplib_qp->en_sqd_async_notify ? 
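The IB_QP_CAP branch above resizes the send queue to the next power of two (capped at the device maximum), keeps the gap between the ring size and the requested depth as q_full_delta, and then gives one slot back for the phantom WQE so an application that posts one extra entry does not see a spurious queue-full. A standalone sketch of that arithmetic, with an assumed device limit:

#include <stdio.h>
#include <stdint.h>

static uint32_t roundup_pow_of_two_u32(uint32_t n)
{
        uint32_t r = 1;

        while (r < n)
                r <<= 1;
        return r;
}

int main(void)
{
        uint32_t max_send_wr = 500;
        uint32_t dev_max_qp_wqes = 8192;        /* placeholder device limit */
        uint32_t max_wqe, q_full_delta;

        max_wqe = roundup_pow_of_two_u32(max_send_wr);
        if (max_wqe > dev_max_qp_wqes + 1)
                max_wqe = dev_max_qp_wqes + 1;

        q_full_delta = max_wqe - max_send_wr;
        q_full_delta -= 1;      /* reserve one slot for the phantom WQE */

        printf("ring=%u delta=%u\n", max_wqe, q_full_delta); /* 512, 11 */
        return 0;
}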
1 : 0; + qp_attr->qp_access_flags = __to_ib_access_flags(qplib_qp->access); + qp_attr->pkey_index = qplib_qp->pkey_index; + qp_attr->qkey = qplib_qp->qkey; + qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; + rdma_ah_set_grh(&qp_attr->ah_attr, NULL, qplib_qp->ah.flow_label, + qplib_qp->ah.host_sgid_index, + qplib_qp->ah.hop_limit, + qplib_qp->ah.traffic_class); + rdma_ah_set_dgid_raw(&qp_attr->ah_attr, qplib_qp->ah.dgid.data); + rdma_ah_set_sl(&qp_attr->ah_attr, qplib_qp->ah.sl); + ether_addr_copy(qp_attr->ah_attr.roce.dmac, qplib_qp->ah.dmac); + qp_attr->path_mtu = __to_ib_mtu(qplib_qp->path_mtu); + qp_attr->timeout = qplib_qp->timeout; + qp_attr->retry_cnt = qplib_qp->retry_cnt; + qp_attr->rnr_retry = qplib_qp->rnr_retry; + qp_attr->min_rnr_timer = qplib_qp->min_rnr_timer; + qp_attr->rq_psn = qplib_qp->rq.psn; + qp_attr->max_rd_atomic = qplib_qp->max_rd_atomic; + qp_attr->sq_psn = qplib_qp->sq.psn; + qp_attr->max_dest_rd_atomic = qplib_qp->max_dest_rd_atomic; + qp_init_attr->sq_sig_type = qplib_qp->sig_type ? IB_SIGNAL_ALL_WR : + IB_SIGNAL_REQ_WR; + qp_attr->dest_qp_num = qplib_qp->dest_qpn; + + qp_attr->cap.max_send_wr = qp->qplib_qp.sq.max_wqe; + qp_attr->cap.max_send_sge = qp->qplib_qp.sq.max_sge; + qp_attr->cap.max_recv_wr = qp->qplib_qp.rq.max_wqe; + qp_attr->cap.max_recv_sge = qp->qplib_qp.rq.max_sge; + qp_attr->cap.max_inline_data = qp->qplib_qp.max_inline_data; + qp_init_attr->cap = qp_attr->cap; + +out: + kfree(qplib_qp); + return rc; +} + +/* Routine for sending QP1 packets for RoCE V1 an V2 + */ +static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp, + struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe, + int payload_size) +{ + struct ib_device *ibdev = &qp->rdev->ibdev; + struct bnxt_re_ah *ah = container_of(ud_wr(wr)->ah, struct bnxt_re_ah, + ib_ah); + struct bnxt_qplib_ah *qplib_ah = &ah->qplib_ah; + struct bnxt_qplib_sge sge; + union ib_gid sgid; + u8 nw_type; + u16 ether_type; + struct ib_gid_attr sgid_attr; + union ib_gid dgid; + bool is_eth = false; + bool is_vlan = false; + bool is_grh = false; + bool is_udp = false; + u8 ip_version = 0; + u16 vlan_id = 0xFFFF; + void *buf; + int i, rc = 0; + + memset(&qp->qp1_hdr, 0, sizeof(qp->qp1_hdr)); + + rc = ib_get_cached_gid(ibdev, 1, + qplib_ah->host_sgid_index, &sgid, + &sgid_attr); + if (rc) { + dev_err(rdev_to_dev(qp->rdev), + "Failed to query gid at index %d", + qplib_ah->host_sgid_index); + return rc; + } + if (sgid_attr.ndev) { + if (is_vlan_dev(sgid_attr.ndev)) + vlan_id = vlan_dev_vlan_id(sgid_attr.ndev); + dev_put(sgid_attr.ndev); + } + /* Get network header type for this GID */ + nw_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + switch (nw_type) { + case RDMA_NETWORK_IPV4: + nw_type = BNXT_RE_ROCEV2_IPV4_PACKET; + break; + case RDMA_NETWORK_IPV6: + nw_type = BNXT_RE_ROCEV2_IPV6_PACKET; + break; + default: + nw_type = BNXT_RE_ROCE_V1_PACKET; + break; + } + memcpy(&dgid.raw, &qplib_ah->dgid, 16); + is_udp = sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP; + if (is_udp) { + if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) { + ip_version = 4; + ether_type = ETH_P_IP; + } else { + ip_version = 6; + ether_type = ETH_P_IPV6; + } + is_grh = false; + } else { + ether_type = ETH_P_IBOE; + is_grh = true; + } + + is_eth = true; + is_vlan = (vlan_id && (vlan_id < 0x1000)) ? 
true : false; + + ib_ud_header_init(payload_size, !is_eth, is_eth, is_vlan, is_grh, + ip_version, is_udp, 0, &qp->qp1_hdr); + + /* ETH */ + ether_addr_copy(qp->qp1_hdr.eth.dmac_h, ah->qplib_ah.dmac); + ether_addr_copy(qp->qp1_hdr.eth.smac_h, qp->qplib_qp.smac); + + /* For vlan, check the sgid for vlan existence */ + + if (!is_vlan) { + qp->qp1_hdr.eth.type = cpu_to_be16(ether_type); + } else { + qp->qp1_hdr.vlan.type = cpu_to_be16(ether_type); + qp->qp1_hdr.vlan.tag = cpu_to_be16(vlan_id); + } + + if (is_grh || (ip_version == 6)) { + memcpy(qp->qp1_hdr.grh.source_gid.raw, sgid.raw, sizeof(sgid)); + memcpy(qp->qp1_hdr.grh.destination_gid.raw, qplib_ah->dgid.data, + sizeof(sgid)); + qp->qp1_hdr.grh.hop_limit = qplib_ah->hop_limit; + } + + if (ip_version == 4) { + qp->qp1_hdr.ip4.tos = 0; + qp->qp1_hdr.ip4.id = 0; + qp->qp1_hdr.ip4.frag_off = htons(IP_DF); + qp->qp1_hdr.ip4.ttl = qplib_ah->hop_limit; + + memcpy(&qp->qp1_hdr.ip4.saddr, sgid.raw + 12, 4); + memcpy(&qp->qp1_hdr.ip4.daddr, qplib_ah->dgid.data + 12, 4); + qp->qp1_hdr.ip4.check = ib_ud_ip4_csum(&qp->qp1_hdr); + } + + if (is_udp) { + qp->qp1_hdr.udp.dport = htons(ROCE_V2_UDP_DPORT); + qp->qp1_hdr.udp.sport = htons(0x8CD1); + qp->qp1_hdr.udp.csum = 0; + } + + /* BTH */ + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + qp->qp1_hdr.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + qp->qp1_hdr.immediate_present = 1; + } else { + qp->qp1_hdr.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + } + if (wr->send_flags & IB_SEND_SOLICITED) + qp->qp1_hdr.bth.solicited_event = 1; + /* pad_count */ + qp->qp1_hdr.bth.pad_count = (4 - payload_size) & 3; + + /* P_key for QP1 is for all members */ + qp->qp1_hdr.bth.pkey = cpu_to_be16(0xFFFF); + qp->qp1_hdr.bth.destination_qpn = IB_QP1; + qp->qp1_hdr.bth.ack_req = 0; + qp->send_psn++; + qp->send_psn &= BTH_PSN_MASK; + qp->qp1_hdr.bth.psn = cpu_to_be32(qp->send_psn); + /* DETH */ + /* Use the priviledged Q_Key for QP1 */ + qp->qp1_hdr.deth.qkey = cpu_to_be32(IB_QP1_QKEY); + qp->qp1_hdr.deth.source_qpn = IB_QP1; + + /* Pack the QP1 to the transmit buffer */ + buf = bnxt_qplib_get_qp1_sq_buf(&qp->qplib_qp, &sge); + if (buf) { + ib_ud_header_pack(&qp->qp1_hdr, buf); + for (i = wqe->num_sge; i; i--) { + wqe->sg_list[i].addr = wqe->sg_list[i - 1].addr; + wqe->sg_list[i].lkey = wqe->sg_list[i - 1].lkey; + wqe->sg_list[i].size = wqe->sg_list[i - 1].size; + } + + /* + * Max Header buf size for IPV6 RoCE V2 is 86, + * which is same as the QP1 SQ header buffer. + * Header buf size for IPV4 RoCE V2 can be 66. + * ETH(14) + VLAN(4)+ IP(20) + UDP (8) + BTH(20). + * Subtract 20 bytes from QP1 SQ header buf size + */ + if (is_udp && ip_version == 4) + sge.size -= 20; + /* + * Max Header buf size for RoCE V1 is 78. + * ETH(14) + VLAN(4) + GRH(40) + BTH(20). + * Subtract 8 bytes from QP1 SQ header buf size + */ + if (!is_udp) + sge.size -= 8; + + /* Subtract 4 bytes for non vlan packets */ + if (!is_vlan) + sge.size -= 4; + + wqe->sg_list[0].addr = sge.addr; + wqe->sg_list[0].lkey = sge.lkey; + wqe->sg_list[0].size = sge.size; + wqe->num_sge++; + + } else { + dev_err(rdev_to_dev(qp->rdev), "QP1 buffer is empty!"); + rc = -ENOMEM; + } + return rc; +} + +/* For the MAD layer, it only provides the recv SGE the size of + * ib_grh + MAD datagram. No Ethernet headers, Ethertype, BTH, DETH, + * nor RoCE iCRC. The Cu+ solution must provide buffer for the entire + * receive packet (334 bytes) with no VLAN and then copy the GRH + * and the MAD datagram out to the provided SGE. 
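The header-buffer trimming above follows directly from the layer sizes spelled out in the driver comments (86 bytes for RoCE v2 over IPv6, 66 for RoCE v2 over IPv4, 78 for RoCE v1, each 4 bytes shorter when the frame is untagged), and the BTH pad count rounds the payload up to a 4-byte multiple. The small program below re-derives those numbers.

#include <stdio.h>

/* Layer sizes quoted in the driver comments above (bytes). */
#define ETH_HDR   14
#define VLAN_TAG   4
#define IPV4_HDR  20
#define IPV6_HDR  40    /* same size as the GRH used by RoCE v1 */
#define UDP_HDR    8
#define BTH_HDR   20

static int bth_pad_count(int payload_size)
{
        return (4 - payload_size) & 3;  /* same formula as above */
}

int main(void)
{
        int v2_ipv6 = ETH_HDR + VLAN_TAG + IPV6_HDR + UDP_HDR + BTH_HDR;
        int v2_ipv4 = ETH_HDR + VLAN_TAG + IPV4_HDR + UDP_HDR + BTH_HDR;
        int v1      = ETH_HDR + VLAN_TAG + IPV6_HDR + BTH_HDR;

        printf("v2/ipv6=%d v2/ipv4=%d v1=%d\n", v2_ipv6, v2_ipv4, v1);
        /* prints 86 66 78, matching the comments; subtract 4 if untagged */
        printf("pad for 3-byte payload = %d\n", bth_pad_count(3)); /* 1 */
        return 0;
}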
+ */ +static int bnxt_re_build_qp1_shadow_qp_recv(struct bnxt_re_qp *qp, + struct ib_recv_wr *wr, + struct bnxt_qplib_swqe *wqe, + int payload_size) +{ + struct bnxt_qplib_sge ref, sge; + u32 rq_prod_index; + struct bnxt_re_sqp_entries *sqp_entry; + + rq_prod_index = bnxt_qplib_get_rq_prod_index(&qp->qplib_qp); + + if (!bnxt_qplib_get_qp1_rq_buf(&qp->qplib_qp, &sge)) + return -ENOMEM; + + /* Create 1 SGE to receive the entire + * ethernet packet + */ + /* Save the reference from ULP */ + ref.addr = wqe->sg_list[0].addr; + ref.lkey = wqe->sg_list[0].lkey; + ref.size = wqe->sg_list[0].size; + + sqp_entry = &qp->rdev->sqp_tbl[rq_prod_index]; + + /* SGE 1 */ + wqe->sg_list[0].addr = sge.addr; + wqe->sg_list[0].lkey = sge.lkey; + wqe->sg_list[0].size = BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2; + sge.size -= wqe->sg_list[0].size; + + sqp_entry->sge.addr = ref.addr; + sqp_entry->sge.lkey = ref.lkey; + sqp_entry->sge.size = ref.size; + /* Store the wrid for reporting completion */ + sqp_entry->wrid = wqe->wr_id; + /* change the wqe->wrid to table index */ + wqe->wr_id = rq_prod_index; + return 0; +} + +static int is_ud_qp(struct bnxt_re_qp *qp) +{ + return qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD; +} + +static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp, + struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_re_ah *ah = NULL; + + if (is_ud_qp(qp)) { + ah = container_of(ud_wr(wr)->ah, struct bnxt_re_ah, ib_ah); + wqe->send.q_key = ud_wr(wr)->remote_qkey; + wqe->send.dst_qp = ud_wr(wr)->remote_qpn; + wqe->send.avid = ah->qplib_ah.id; + } + switch (wr->opcode) { + case IB_WR_SEND: + wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND; + break; + case IB_WR_SEND_WITH_IMM: + wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM; + wqe->send.imm_data = wr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + wqe->type = BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV; + wqe->send.inv_key = wr->ex.invalidate_rkey; + break; + default: + return -EINVAL; + } + if (wr->send_flags & IB_SEND_SIGNALED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + if (wr->send_flags & IB_SEND_FENCE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT; + if (wr->send_flags & IB_SEND_INLINE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_INLINE; + + return 0; +} + +static int bnxt_re_build_rdma_wqe(struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE; + break; + case IB_WR_RDMA_WRITE_WITH_IMM: + wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM; + wqe->rdma.imm_data = wr->ex.imm_data; + break; + case IB_WR_RDMA_READ: + wqe->type = BNXT_QPLIB_SWQE_TYPE_RDMA_READ; + wqe->rdma.inv_key = wr->ex.invalidate_rkey; + break; + default: + return -EINVAL; + } + wqe->rdma.remote_va = rdma_wr(wr)->remote_addr; + wqe->rdma.r_key = rdma_wr(wr)->rkey; + if (wr->send_flags & IB_SEND_SIGNALED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + if (wr->send_flags & IB_SEND_FENCE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT; + if (wr->send_flags & IB_SEND_INLINE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_INLINE; + + return 0; +} + +static int bnxt_re_build_atomic_wqe(struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + wqe->type = BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP; + wqe->atomic.cmp_data = atomic_wr(wr)->compare_add; 
+ wqe->atomic.swap_data = atomic_wr(wr)->swap; + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + wqe->type = BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD; + wqe->atomic.cmp_data = atomic_wr(wr)->compare_add; + break; + default: + return -EINVAL; + } + wqe->atomic.remote_va = atomic_wr(wr)->remote_addr; + wqe->atomic.r_key = atomic_wr(wr)->rkey; + if (wr->send_flags & IB_SEND_SIGNALED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + if (wr->send_flags & IB_SEND_FENCE) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT; + return 0; +} + +static int bnxt_re_build_inv_wqe(struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + wqe->type = BNXT_QPLIB_SWQE_TYPE_LOCAL_INV; + wqe->local_inv.inv_l_key = wr->ex.invalidate_rkey; + + /* Need unconditional fence for local invalidate + * opcode to work as expected. + */ + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + + if (wr->send_flags & IB_SEND_SIGNALED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT; + + return 0; +} + +static int bnxt_re_build_reg_wqe(struct ib_reg_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_re_mr *mr = container_of(wr->mr, struct bnxt_re_mr, ib_mr); + struct bnxt_qplib_frpl *qplib_frpl = &mr->qplib_frpl; + int access = wr->access; + + wqe->frmr.pbl_ptr = (__le64 *)qplib_frpl->hwq.pbl_ptr[0]; + wqe->frmr.pbl_dma_ptr = qplib_frpl->hwq.pbl_dma_ptr[0]; + wqe->frmr.page_list = mr->pages; + wqe->frmr.page_list_len = mr->npages; + wqe->frmr.levels = qplib_frpl->hwq.level + 1; + wqe->type = BNXT_QPLIB_SWQE_TYPE_REG_MR; + + /* Need unconditional fence for reg_mr + * opcode to function as expected. + */ + + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_UC_FENCE; + + if (wr->wr.send_flags & IB_SEND_SIGNALED) + wqe->flags |= BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP; + + if (access & IB_ACCESS_LOCAL_WRITE) + wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_LOCAL_WRITE; + if (access & IB_ACCESS_REMOTE_READ) + wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_REMOTE_READ; + if (access & IB_ACCESS_REMOTE_WRITE) + wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_REMOTE_WRITE; + if (access & IB_ACCESS_REMOTE_ATOMIC) + wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_REMOTE_ATOMIC; + if (access & IB_ACCESS_MW_BIND) + wqe->frmr.access_cntl |= SQ_FR_PMR_ACCESS_CNTL_WINDOW_BIND; + + wqe->frmr.l_key = wr->key; + wqe->frmr.length = wr->mr->length; + wqe->frmr.pbl_pg_sz_log = (wr->mr->page_size >> PAGE_SHIFT_4K) - 1; + wqe->frmr.va = wr->mr->iova; + return 0; +} + +static int bnxt_re_copy_inline_data(struct bnxt_re_dev *rdev, + struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + /* Copy the inline data to the data field */ + u8 *in_data; + u32 i, sge_len; + void *sge_addr; + + in_data = wqe->inline_data; + for (i = 0; i < wr->num_sge; i++) { + sge_addr = (void *)(unsigned long) + wr->sg_list[i].addr; + sge_len = wr->sg_list[i].length; + + if ((sge_len + wqe->inline_len) > + BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) { + dev_err(rdev_to_dev(rdev), + "Inline data size requested > supported value"); + return -EINVAL; + } + sge_len = wr->sg_list[i].length; + + memcpy(in_data, sge_addr, sge_len); + in_data += wr->sg_list[i].length; + wqe->inline_len += wr->sg_list[i].length; + } + return wqe->inline_len; +} + +static int bnxt_re_copy_wr_payload(struct bnxt_re_dev *rdev, + struct ib_send_wr *wr, + struct bnxt_qplib_swqe *wqe) +{ + int payload_sz = 0; + + if (wr->send_flags & 
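bnxt_re_copy_inline_data above walks the SGE list, appending each buffer into the WQE's inline area and rejecting the post once the accumulated length would exceed the device's inline limit. A standalone mirror of that bounds check; the 96-byte limit here is only a placeholder for BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH, whose real value lives elsewhere in the driver.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MAX_INLINE 96   /* placeholder, not the real driver constant */

struct sge { const void *addr; uint32_t length; };

/* Returns total inline bytes copied, or -1 if the limit would be hit. */
static int copy_inline(uint8_t *dst, const struct sge *sgl, int num_sge)
{
        uint32_t inline_len = 0;
        int i;

        for (i = 0; i < num_sge; i++) {
                if (sgl[i].length + inline_len > MAX_INLINE)
                        return -1;
                memcpy(dst + inline_len, sgl[i].addr, sgl[i].length);
                inline_len += sgl[i].length;
        }
        return (int)inline_len;
}

int main(void)
{
        uint8_t wqe_inline[MAX_INLINE];
        struct sge sgl[2] = { { "hello", 5 }, { "world", 5 } };

        printf("%d\n", copy_inline(wqe_inline, sgl, 2)); /* prints 10 */
        return 0;
}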
IB_SEND_INLINE) + payload_sz = bnxt_re_copy_inline_data(rdev, wr, wqe); + else + payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe->sg_list, + wqe->num_sge); + + return payload_sz; +} + +static void bnxt_ud_qp_hw_stall_workaround(struct bnxt_re_qp *qp) +{ + if ((qp->ib_qp.qp_type == IB_QPT_UD || + qp->ib_qp.qp_type == IB_QPT_GSI || + qp->ib_qp.qp_type == IB_QPT_RAW_ETHERTYPE) && + qp->qplib_qp.wqe_cnt == BNXT_RE_UD_QP_HW_STALL) { + int qp_attr_mask; + struct ib_qp_attr qp_attr; + + qp_attr_mask = IB_QP_STATE; + qp_attr.qp_state = IB_QPS_RTS; + bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, qp_attr_mask, NULL); + qp->qplib_qp.wqe_cnt = 0; + } +} + +static int bnxt_re_post_send_shadow_qp(struct bnxt_re_dev *rdev, + struct bnxt_re_qp *qp, + struct ib_send_wr *wr) +{ + struct bnxt_qplib_swqe wqe; + int rc = 0, payload_sz = 0; + unsigned long flags; + + spin_lock_irqsave(&qp->sq_lock, flags); + memset(&wqe, 0, sizeof(wqe)); + while (wr) { + /* House keeping */ + memset(&wqe, 0, sizeof(wqe)); + + /* Common */ + wqe.num_sge = wr->num_sge; + if (wr->num_sge > qp->qplib_qp.sq.max_sge) { + dev_err(rdev_to_dev(rdev), + "Limit exceeded for Send SGEs"); + rc = -EINVAL; + goto bad; + } + + payload_sz = bnxt_re_copy_wr_payload(qp->rdev, wr, &wqe); + if (payload_sz < 0) { + rc = -EINVAL; + goto bad; + } + wqe.wr_id = wr->wr_id; + + wqe.type = BNXT_QPLIB_SWQE_TYPE_SEND; + + rc = bnxt_re_build_send_wqe(qp, wr, &wqe); + if (!rc) + rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe); +bad: + if (rc) { + dev_err(rdev_to_dev(rdev), + "Post send failed opcode = %#x rc = %d", + wr->opcode, rc); + break; + } + wr = wr->next; + } + bnxt_qplib_post_send_db(&qp->qplib_qp); + bnxt_ud_qp_hw_stall_workaround(qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + return rc; +} + +int bnxt_re_post_send(struct ib_qp *ib_qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_qplib_swqe wqe; + int rc = 0, payload_sz = 0; + unsigned long flags; + + spin_lock_irqsave(&qp->sq_lock, flags); + while (wr) { + /* House keeping */ + memset(&wqe, 0, sizeof(wqe)); + + /* Common */ + wqe.num_sge = wr->num_sge; + if (wr->num_sge > qp->qplib_qp.sq.max_sge) { + dev_err(rdev_to_dev(qp->rdev), + "Limit exceeded for Send SGEs"); + rc = -EINVAL; + goto bad; + } + + payload_sz = bnxt_re_copy_wr_payload(qp->rdev, wr, &wqe); + if (payload_sz < 0) { + rc = -EINVAL; + goto bad; + } + wqe.wr_id = wr->wr_id; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (ib_qp->qp_type == IB_QPT_GSI) { + rc = bnxt_re_build_qp1_send_v2(qp, wr, &wqe, + payload_sz); + if (rc) + goto bad; + wqe.rawqp1.lflags |= + SQ_SEND_RAWETH_QP1_LFLAGS_ROCE_CRC; + } + switch (wr->send_flags) { + case IB_SEND_IP_CSUM: + wqe.rawqp1.lflags |= + SQ_SEND_RAWETH_QP1_LFLAGS_IP_CHKSUM; + break; + default: + break; + } + /* Fall thru to build the wqe */ + case IB_WR_SEND_WITH_INV: + rc = bnxt_re_build_send_wqe(qp, wr, &wqe); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + case IB_WR_RDMA_READ: + rc = bnxt_re_build_rdma_wqe(wr, &wqe); + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + rc = bnxt_re_build_atomic_wqe(wr, &wqe); + break; + case IB_WR_RDMA_READ_WITH_INV: + dev_err(rdev_to_dev(qp->rdev), + "RDMA Read with Invalidate is not supported"); + rc = -EINVAL; + goto bad; + case IB_WR_LOCAL_INV: + rc = bnxt_re_build_inv_wqe(wr, &wqe); + break; + case IB_WR_REG_MR: + rc = bnxt_re_build_reg_wqe(reg_wr(wr), &wqe); + break; + default: + /* 
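Both post paths above follow the standard verbs contract: work requests form a singly linked list, the loop consumes them in order, and on the first failure the offending request is recorded in *bad_wr and the walk stops, so the caller knows exactly which WRs were accepted. A minimal standalone mirror of that contract, with a made-up SGE limit standing in for the queue's max_sge:

#include <stdio.h>
#include <stddef.h>

struct wr { int id; int num_sge; struct wr *next; };

#define MAX_SGE 6       /* placeholder queue limit */

static int post_list(struct wr *wr, struct wr **bad_wr)
{
        int rc = 0;

        while (wr) {
                if (wr->num_sge > MAX_SGE) {    /* same check as the driver */
                        rc = -22;               /* -EINVAL */
                        *bad_wr = wr;           /* first unconsumed WR */
                        break;
                }
                /* ...build and ring the real WQE here... */
                wr = wr->next;
        }
        return rc;
}

int main(void)
{
        struct wr c = { 3, 2, NULL }, b = { 2, 9, &c }, a = { 1, 1, &b };
        struct wr *bad = NULL;

        printf("rc=%d bad=%d\n", post_list(&a, &bad), bad ? bad->id : 0);
        /* rc=-22 bad=2: WR 1 was posted, WR 2 and everything after was not */
        return 0;
}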
Unsupported WRs */ + dev_err(rdev_to_dev(qp->rdev), + "WR (%#x) is not supported", wr->opcode); + rc = -EINVAL; + goto bad; + } + if (!rc) + rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe); +bad: + if (rc) { + dev_err(rdev_to_dev(qp->rdev), + "post_send failed op:%#x qps = %#x rc = %d\n", + wr->opcode, qp->qplib_qp.state, rc); + *bad_wr = wr; + break; + } + wr = wr->next; + } + bnxt_qplib_post_send_db(&qp->qplib_qp); + bnxt_ud_qp_hw_stall_workaround(qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + + return rc; +} + +static int bnxt_re_post_recv_shadow_qp(struct bnxt_re_dev *rdev, + struct bnxt_re_qp *qp, + struct ib_recv_wr *wr) +{ + struct bnxt_qplib_swqe wqe; + int rc = 0; + + memset(&wqe, 0, sizeof(wqe)); + while (wr) { + /* House keeping */ + memset(&wqe, 0, sizeof(wqe)); + + /* Common */ + wqe.num_sge = wr->num_sge; + if (wr->num_sge > qp->qplib_qp.rq.max_sge) { + dev_err(rdev_to_dev(rdev), + "Limit exceeded for Receive SGEs"); + rc = -EINVAL; + break; + } + bnxt_re_build_sgl(wr->sg_list, wqe.sg_list, wr->num_sge); + wqe.wr_id = wr->wr_id; + wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV; + + rc = bnxt_qplib_post_recv(&qp->qplib_qp, &wqe); + if (rc) + break; + + wr = wr->next; + } + if (!rc) + bnxt_qplib_post_recv_db(&qp->qplib_qp); + return rc; +} + +int bnxt_re_post_recv(struct ib_qp *ib_qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_qplib_swqe wqe; + int rc = 0, payload_sz = 0; + unsigned long flags; + u32 count = 0; + + spin_lock_irqsave(&qp->rq_lock, flags); + while (wr) { + /* House keeping */ + memset(&wqe, 0, sizeof(wqe)); + + /* Common */ + wqe.num_sge = wr->num_sge; + if (wr->num_sge > qp->qplib_qp.rq.max_sge) { + dev_err(rdev_to_dev(qp->rdev), + "Limit exceeded for Receive SGEs"); + rc = -EINVAL; + *bad_wr = wr; + break; + } + + payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe.sg_list, + wr->num_sge); + wqe.wr_id = wr->wr_id; + wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV; + + if (ib_qp->qp_type == IB_QPT_GSI) + rc = bnxt_re_build_qp1_shadow_qp_recv(qp, wr, &wqe, + payload_sz); + if (!rc) + rc = bnxt_qplib_post_recv(&qp->qplib_qp, &wqe); + if (rc) { + *bad_wr = wr; + break; + } + + /* Ring DB if the RQEs posted reaches a threshold value */ + if (++count >= BNXT_RE_RQ_WQE_THRESHOLD) { + bnxt_qplib_post_recv_db(&qp->qplib_qp); + count = 0; + } + + wr = wr->next; + } + + if (count) + bnxt_qplib_post_recv_db(&qp->qplib_qp); + + spin_unlock_irqrestore(&qp->rq_lock, flags); + + return rc; +} + +/* Completion Queues */ +int bnxt_re_destroy_cq(struct ib_cq *ib_cq) +{ + int rc; + struct bnxt_re_cq *cq; + struct bnxt_qplib_nq *nq; + struct bnxt_re_dev *rdev; + + cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + rdev = cq->rdev; + nq = cq->qplib_cq.nq; + + rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to destroy HW CQ"); + return rc; + } + if (!IS_ERR_OR_NULL(cq->umem)) + ib_umem_release(cq->umem); + + atomic_dec(&rdev->cq_count); + nq->budget--; + kfree(cq->cql); + kfree(cq); + + return 0; +} + +struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_re_cq *cq = NULL; + int rc, entries; + int cqe = attr->cqe; + struct bnxt_qplib_nq *nq = NULL; + unsigned int nq_alloc_cnt; + + /* Validate CQ fields */ + if 
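bnxt_re_post_recv above batches doorbell writes: it rings the RQ doorbell only every BNXT_RE_RQ_WQE_THRESHOLD posted entries and once more at the end if anything is still pending, which keeps MMIO writes off the fast path for long WR chains. A standalone sketch of that batching; the threshold value below is illustrative, not the driver's constant.

#include <stdio.h>

#define RQ_WQE_THRESHOLD 32     /* illustrative threshold */

static int doorbells;

static void ring_db(void)
{
        doorbells++;    /* stands in for the MMIO doorbell write */
}

static void post_recv_batch(int nr_wrs)
{
        int count = 0, i;

        for (i = 0; i < nr_wrs; i++) {
                /* ...queue one RQE... */
                if (++count >= RQ_WQE_THRESHOLD) {
                        ring_db();
                        count = 0;
                }
        }
        if (count)
                ring_db();      /* flush the tail of the batch */
}

int main(void)
{
        post_recv_batch(100);
        printf("doorbells=%d\n", doorbells); /* 4: batches of 32+32+32+4 */
        return 0;
}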
(cqe < 1 || cqe > dev_attr->max_cq_wqes) { + dev_err(rdev_to_dev(rdev), "Failed to create CQ -max exceeded"); + return ERR_PTR(-EINVAL); + } + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->rdev = rdev; + cq->qplib_cq.cq_handle = (u64)(unsigned long)(&cq->qplib_cq); + + entries = roundup_pow_of_two(cqe + 1); + if (entries > dev_attr->max_cq_wqes + 1) + entries = dev_attr->max_cq_wqes + 1; + + if (context) { + struct bnxt_re_cq_req req; + struct bnxt_re_ucontext *uctx = container_of + (context, + struct bnxt_re_ucontext, + ib_uctx); + if (ib_copy_from_udata(&req, udata, sizeof(req))) { + rc = -EFAULT; + goto fail; + } + + cq->umem = ib_umem_get(context, req.cq_va, + entries * sizeof(struct cq_base), + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->umem)) { + rc = PTR_ERR(cq->umem); + goto fail; + } + cq->qplib_cq.sghead = cq->umem->sg_head.sgl; + cq->qplib_cq.nmap = cq->umem->nmap; + cq->qplib_cq.dpi = &uctx->dpi; + } else { + cq->max_cql = min_t(u32, entries, MAX_CQL_PER_POLL); + cq->cql = kcalloc(cq->max_cql, sizeof(struct bnxt_qplib_cqe), + GFP_KERNEL); + if (!cq->cql) { + rc = -ENOMEM; + goto fail; + } + + cq->qplib_cq.dpi = &rdev->dpi_privileged; + cq->qplib_cq.sghead = NULL; + cq->qplib_cq.nmap = 0; + } + /* + * Allocating the NQ in a round robin fashion. nq_alloc_cnt is a + * used for getting the NQ index. + */ + nq_alloc_cnt = atomic_inc_return(&rdev->nq_alloc_cnt); + nq = &rdev->nq[nq_alloc_cnt % (rdev->num_msix - 1)]; + cq->qplib_cq.max_wqe = entries; + cq->qplib_cq.cnq_hw_ring_id = nq->ring_id; + cq->qplib_cq.nq = nq; + + rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to create HW CQ"); + goto fail; + } + + cq->ib_cq.cqe = entries; + cq->cq_period = cq->qplib_cq.period; + nq->budget++; + + atomic_inc(&rdev->cq_count); + + if (context) { + struct bnxt_re_cq_resp resp; + + resp.cqid = cq->qplib_cq.id; + resp.tail = cq->qplib_cq.hwq.cons; + resp.phase = cq->qplib_cq.period; + resp.rsvd = 0; + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to copy CQ udata"); + bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); + goto c2fail; + } + } + + return &cq->ib_cq; + +c2fail: + if (context) + ib_umem_release(cq->umem); +fail: + kfree(cq->cql); + kfree(cq); + return ERR_PTR(rc); +} + +static u8 __req_to_ib_wc_status(u8 qstatus) +{ + switch (qstatus) { + case CQ_REQ_STATUS_OK: + return IB_WC_SUCCESS; + case CQ_REQ_STATUS_BAD_RESPONSE_ERR: + return IB_WC_BAD_RESP_ERR; + case CQ_REQ_STATUS_LOCAL_LENGTH_ERR: + return IB_WC_LOC_LEN_ERR; + case CQ_REQ_STATUS_LOCAL_QP_OPERATION_ERR: + return IB_WC_LOC_QP_OP_ERR; + case CQ_REQ_STATUS_LOCAL_PROTECTION_ERR: + return IB_WC_LOC_PROT_ERR; + case CQ_REQ_STATUS_MEMORY_MGT_OPERATION_ERR: + return IB_WC_GENERAL_ERR; + case CQ_REQ_STATUS_REMOTE_INVALID_REQUEST_ERR: + return IB_WC_REM_INV_REQ_ERR; + case CQ_REQ_STATUS_REMOTE_ACCESS_ERR: + return IB_WC_REM_ACCESS_ERR; + case CQ_REQ_STATUS_REMOTE_OPERATION_ERR: + return IB_WC_REM_OP_ERR; + case CQ_REQ_STATUS_RNR_NAK_RETRY_CNT_ERR: + return IB_WC_RNR_RETRY_EXC_ERR; + case CQ_REQ_STATUS_TRANSPORT_RETRY_CNT_ERR: + return IB_WC_RETRY_EXC_ERR; + case CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR: + return IB_WC_WR_FLUSH_ERR; + default: + return IB_WC_GENERAL_ERR; + } + return 0; +} + +static u8 __rawqp1_to_ib_wc_status(u8 qstatus) +{ + switch (qstatus) { + case CQ_RES_RAWETH_QP1_STATUS_OK: + return IB_WC_SUCCESS; + case CQ_RES_RAWETH_QP1_STATUS_LOCAL_ACCESS_ERROR: + return 
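CQ creation above spreads completion queues across the driver's notification queues by taking an ever-incrementing allocation counter modulo (num_msix - 1), the number of NQs the driver set up. A standalone sketch of that round-robin spread, with an example vector count:

#include <stdio.h>

int main(void)
{
        unsigned int num_msix = 5;      /* example: leaves 4 NQs */
        unsigned int nq_alloc_cnt = 0;
        int i;

        for (i = 0; i < 8; i++) {
                unsigned int nq_idx = ++nq_alloc_cnt % (num_msix - 1);

                printf("CQ %d -> NQ %u\n", i, nq_idx);
        }
        /* CQs land on NQs 1,2,3,0,1,2,3,0: an even spread */
        return 0;
}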
IB_WC_LOC_ACCESS_ERR; + case CQ_RES_RAWETH_QP1_STATUS_HW_LOCAL_LENGTH_ERR: + return IB_WC_LOC_LEN_ERR; + case CQ_RES_RAWETH_QP1_STATUS_LOCAL_PROTECTION_ERR: + return IB_WC_LOC_PROT_ERR; + case CQ_RES_RAWETH_QP1_STATUS_LOCAL_QP_OPERATION_ERR: + return IB_WC_LOC_QP_OP_ERR; + case CQ_RES_RAWETH_QP1_STATUS_MEMORY_MGT_OPERATION_ERR: + return IB_WC_GENERAL_ERR; + case CQ_RES_RAWETH_QP1_STATUS_WORK_REQUEST_FLUSHED_ERR: + return IB_WC_WR_FLUSH_ERR; + case CQ_RES_RAWETH_QP1_STATUS_HW_FLUSH_ERR: + return IB_WC_WR_FLUSH_ERR; + default: + return IB_WC_GENERAL_ERR; + } +} + +static u8 __rc_to_ib_wc_status(u8 qstatus) +{ + switch (qstatus) { + case CQ_RES_RC_STATUS_OK: + return IB_WC_SUCCESS; + case CQ_RES_RC_STATUS_LOCAL_ACCESS_ERROR: + return IB_WC_LOC_ACCESS_ERR; + case CQ_RES_RC_STATUS_LOCAL_LENGTH_ERR: + return IB_WC_LOC_LEN_ERR; + case CQ_RES_RC_STATUS_LOCAL_PROTECTION_ERR: + return IB_WC_LOC_PROT_ERR; + case CQ_RES_RC_STATUS_LOCAL_QP_OPERATION_ERR: + return IB_WC_LOC_QP_OP_ERR; + case CQ_RES_RC_STATUS_MEMORY_MGT_OPERATION_ERR: + return IB_WC_GENERAL_ERR; + case CQ_RES_RC_STATUS_REMOTE_INVALID_REQUEST_ERR: + return IB_WC_REM_INV_REQ_ERR; + case CQ_RES_RC_STATUS_WORK_REQUEST_FLUSHED_ERR: + return IB_WC_WR_FLUSH_ERR; + case CQ_RES_RC_STATUS_HW_FLUSH_ERR: + return IB_WC_WR_FLUSH_ERR; + default: + return IB_WC_GENERAL_ERR; + } +} + +static void bnxt_re_process_req_wc(struct ib_wc *wc, struct bnxt_qplib_cqe *cqe) +{ + switch (cqe->type) { + case BNXT_QPLIB_SWQE_TYPE_SEND: + wc->opcode = IB_WC_SEND; + break; + case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM: + wc->opcode = IB_WC_SEND; + wc->wc_flags |= IB_WC_WITH_IMM; + break; + case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV: + wc->opcode = IB_WC_SEND; + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + break; + case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM: + wc->opcode = IB_WC_RDMA_WRITE; + wc->wc_flags |= IB_WC_WITH_IMM; + break; + case BNXT_QPLIB_SWQE_TYPE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + break; + case BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP: + wc->opcode = IB_WC_COMP_SWAP; + break; + case BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD: + wc->opcode = IB_WC_FETCH_ADD; + break; + case BNXT_QPLIB_SWQE_TYPE_LOCAL_INV: + wc->opcode = IB_WC_LOCAL_INV; + break; + case BNXT_QPLIB_SWQE_TYPE_REG_MR: + wc->opcode = IB_WC_REG_MR; + break; + default: + wc->opcode = IB_WC_SEND; + break; + } + + wc->status = __req_to_ib_wc_status(cqe->status); +} + +static int bnxt_re_check_packet_type(u16 raweth_qp1_flags, + u16 raweth_qp1_flags2) +{ + bool is_ipv6 = false, is_ipv4 = false; + + /* raweth_qp1_flags Bit 9-6 indicates itype */ + if ((raweth_qp1_flags & CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_ROCE) + != CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_ROCE) + return -1; + + if (raweth_qp1_flags2 & + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_IP_CS_CALC && + raweth_qp1_flags2 & + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_L4_CS_CALC) { + /* raweth_qp1_flags2 Bit 8 indicates ip_type. 0-v4 1 - v6 */ + (raweth_qp1_flags2 & + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_IP_TYPE) ? + (is_ipv6 = true) : (is_ipv4 = true); + return ((is_ipv6) ? 
+ BNXT_RE_ROCEV2_IPV6_PACKET : + BNXT_RE_ROCEV2_IPV4_PACKET); + } else { + return BNXT_RE_ROCE_V1_PACKET; + } +} + +static int bnxt_re_to_ib_nw_type(int nw_type) +{ + u8 nw_hdr_type = 0xFF; + + switch (nw_type) { + case BNXT_RE_ROCE_V1_PACKET: + nw_hdr_type = RDMA_NETWORK_ROCE_V1; + break; + case BNXT_RE_ROCEV2_IPV4_PACKET: + nw_hdr_type = RDMA_NETWORK_IPV4; + break; + case BNXT_RE_ROCEV2_IPV6_PACKET: + nw_hdr_type = RDMA_NETWORK_IPV6; + break; + } + return nw_hdr_type; +} + +static bool bnxt_re_is_loopback_packet(struct bnxt_re_dev *rdev, + void *rq_hdr_buf) +{ + u8 *tmp_buf = NULL; + struct ethhdr *eth_hdr; + u16 eth_type; + bool rc = false; + + tmp_buf = (u8 *)rq_hdr_buf; + /* + * If dest mac is not same as I/F mac, this could be a + * loopback address or multicast address, check whether + * it is a loopback packet + */ + if (!ether_addr_equal(tmp_buf, rdev->netdev->dev_addr)) { + tmp_buf += 4; + /* Check the ether type */ + eth_hdr = (struct ethhdr *)tmp_buf; + eth_type = ntohs(eth_hdr->h_proto); + switch (eth_type) { + case ETH_P_IBOE: + rc = true; + break; + case ETH_P_IP: + case ETH_P_IPV6: { + u32 len; + struct udphdr *udp_hdr; + + len = (eth_type == ETH_P_IP ? sizeof(struct iphdr) : + sizeof(struct ipv6hdr)); + tmp_buf += sizeof(struct ethhdr) + len; + udp_hdr = (struct udphdr *)tmp_buf; + if (ntohs(udp_hdr->dest) == + ROCE_V2_UDP_DPORT) + rc = true; + break; + } + default: + break; + } + } + + return rc; +} + +static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *qp1_qp, + struct bnxt_qplib_cqe *cqe) +{ + struct bnxt_re_dev *rdev = qp1_qp->rdev; + struct bnxt_re_sqp_entries *sqp_entry = NULL; + struct bnxt_re_qp *qp = rdev->qp1_sqp; + struct ib_send_wr *swr; + struct ib_ud_wr udwr; + struct ib_recv_wr rwr; + int pkt_type = 0; + u32 tbl_idx; + void *rq_hdr_buf; + dma_addr_t rq_hdr_buf_map; + dma_addr_t shrq_hdr_buf_map; + u32 offset = 0; + u32 skip_bytes = 0; + struct ib_sge s_sge[2]; + struct ib_sge r_sge[2]; + int rc; + + memset(&udwr, 0, sizeof(udwr)); + memset(&rwr, 0, sizeof(rwr)); + memset(&s_sge, 0, sizeof(s_sge)); + memset(&r_sge, 0, sizeof(r_sge)); + + swr = &udwr.wr; + tbl_idx = cqe->wr_id; + + rq_hdr_buf = qp1_qp->qplib_qp.rq_hdr_buf + + (tbl_idx * qp1_qp->qplib_qp.rq_hdr_buf_size); + rq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&qp1_qp->qplib_qp, + tbl_idx); + + /* Shadow QP header buffer */ + shrq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&qp->qplib_qp, + tbl_idx); + sqp_entry = &rdev->sqp_tbl[tbl_idx]; + + /* Store this cqe */ + memcpy(&sqp_entry->cqe, cqe, sizeof(struct bnxt_qplib_cqe)); + sqp_entry->qp1_qp = qp1_qp; + + /* Find packet type from the cqe */ + + pkt_type = bnxt_re_check_packet_type(cqe->raweth_qp1_flags, + cqe->raweth_qp1_flags2); + if (pkt_type < 0) { + dev_err(rdev_to_dev(rdev), "Invalid packet\n"); + return -EINVAL; + } + + /* Adjust the offset for the user buffer and post in the rq */ + + if (pkt_type == BNXT_RE_ROCEV2_IPV4_PACKET) + offset = 20; + + /* + * QP1 loopback packet has 4 bytes of internal header before + * ether header. Skip these four bytes. + */ + if (bnxt_re_is_loopback_packet(rdev, rq_hdr_buf)) + skip_bytes = 4; + + /* First send SGE . Skip the ether header*/ + s_sge[0].addr = rq_hdr_buf_map + BNXT_QPLIB_MAX_QP1_RQ_ETH_HDR_SIZE + + skip_bytes; + s_sge[0].lkey = 0xFFFFFFFF; + s_sge[0].length = offset ? 
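bnxt_re_is_loopback_packet above decides whether a received QP1 buffer is locally generated RoCE traffic by looking past the destination MAC at the ethertype (0x8915 for RoCE v1) or, for IP frames, at the UDP destination port 4791 used by RoCE v2. The simplified standalone classifier below covers only the untagged IPv4 case with fixed header lengths and skips the driver's 4-byte internal-header handling.

#include <stdio.h>
#include <stdint.h>

#define ETHERTYPE_ROCE_V1 0x8915        /* ETH_P_IBOE */
#define ROCE_V2_UDP_DPORT 4791

static uint16_t be16(const uint8_t *p)
{
        return (uint16_t)(p[0] << 8 | p[1]);
}

/* frame: 14-byte Ethernet header, then (for 0x0800) a 20-byte IPv4
 * header with no options, then UDP. Returns 1 for RoCE traffic.
 */
static int is_roce_frame(const uint8_t *frame)
{
        uint16_t ethertype = be16(frame + 12);

        if (ethertype == ETHERTYPE_ROCE_V1)
                return 1;
        if (ethertype == 0x0800)        /* IPv4 */
                return be16(frame + 14 + 20 + 2) == ROCE_V2_UDP_DPORT;
        return 0;
}

int main(void)
{
        uint8_t frame[64] = { 0 };

        frame[12] = 0x08; frame[13] = 0x00;             /* IPv4 */
        frame[36] = 4791 >> 8; frame[37] = 4791 & 0xff; /* UDP dest port */
        printf("%d\n", is_roce_frame(frame));           /* prints 1 */
        return 0;
}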
BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV4 : + BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV6; + + /* Second Send SGE */ + s_sge[1].addr = s_sge[0].addr + s_sge[0].length + + BNXT_QPLIB_MAX_QP1_RQ_BDETH_HDR_SIZE; + if (pkt_type != BNXT_RE_ROCE_V1_PACKET) + s_sge[1].addr += 8; + s_sge[1].lkey = 0xFFFFFFFF; + s_sge[1].length = 256; + + /* First recv SGE */ + + r_sge[0].addr = shrq_hdr_buf_map; + r_sge[0].lkey = 0xFFFFFFFF; + r_sge[0].length = 40; + + r_sge[1].addr = sqp_entry->sge.addr + offset; + r_sge[1].lkey = sqp_entry->sge.lkey; + r_sge[1].length = BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV6 + 256 - offset; + + /* Create receive work request */ + rwr.num_sge = 2; + rwr.sg_list = r_sge; + rwr.wr_id = tbl_idx; + rwr.next = NULL; + + rc = bnxt_re_post_recv_shadow_qp(rdev, qp, &rwr); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to post Rx buffers to shadow QP"); + return -ENOMEM; + } + + swr->num_sge = 2; + swr->sg_list = s_sge; + swr->wr_id = tbl_idx; + swr->opcode = IB_WR_SEND; + swr->next = NULL; + + udwr.ah = &rdev->sqp_ah->ib_ah; + udwr.remote_qpn = rdev->qp1_sqp->qplib_qp.id; + udwr.remote_qkey = rdev->qp1_sqp->qplib_qp.qkey; + + /* post data received in the send queue */ + rc = bnxt_re_post_send_shadow_qp(rdev, qp, swr); + + return 0; +} + +static void bnxt_re_process_res_rawqp1_wc(struct ib_wc *wc, + struct bnxt_qplib_cqe *cqe) +{ + wc->opcode = IB_WC_RECV; + wc->status = __rawqp1_to_ib_wc_status(cqe->status); + wc->wc_flags |= IB_WC_GRH; +} + +static bool bnxt_re_is_vlan_pkt(struct bnxt_qplib_cqe *orig_cqe, + u16 *vid, u8 *sl) +{ + bool ret = false; + u32 metadata; + u16 tpid; + + metadata = orig_cqe->raweth_qp1_metadata; + if (orig_cqe->raweth_qp1_flags2 & + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_VLAN) { + tpid = ((metadata & + CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_TPID_MASK) >> + CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_TPID_SFT); + if (tpid == ETH_P_8021Q) { + *vid = metadata & + CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_VID_MASK; + *sl = (metadata & + CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_PRI_MASK) >> + CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_PRI_SFT; + ret = true; + } + } + + return ret; +} + +static void bnxt_re_process_res_rc_wc(struct ib_wc *wc, + struct bnxt_qplib_cqe *cqe) +{ + wc->opcode = IB_WC_RECV; + wc->status = __rc_to_ib_wc_status(cqe->status); + + if (cqe->flags & CQ_RES_RC_FLAGS_IMM) + wc->wc_flags |= IB_WC_WITH_IMM; + if (cqe->flags & CQ_RES_RC_FLAGS_INV) + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + if ((cqe->flags & (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) == + (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; +} + +static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *qp, + struct ib_wc *wc, + struct bnxt_qplib_cqe *cqe) +{ + struct bnxt_re_dev *rdev = qp->rdev; + struct bnxt_re_qp *qp1_qp = NULL; + struct bnxt_qplib_cqe *orig_cqe = NULL; + struct bnxt_re_sqp_entries *sqp_entry = NULL; + int nw_type; + u32 tbl_idx; + u16 vlan_id; + u8 sl; + + tbl_idx = cqe->wr_id; + + sqp_entry = &rdev->sqp_tbl[tbl_idx]; + qp1_qp = sqp_entry->qp1_qp; + orig_cqe = &sqp_entry->cqe; + + wc->wr_id = sqp_entry->wrid; + wc->byte_len = orig_cqe->length; + wc->qp = &qp1_qp->ib_qp; + + wc->ex.imm_data = orig_cqe->immdata; + wc->src_qp = orig_cqe->src_qp; + memcpy(wc->smac, orig_cqe->smac, ETH_ALEN); + if (bnxt_re_is_vlan_pkt(orig_cqe, &vlan_id, &sl)) { + wc->vlan_id = vlan_id; + wc->sl = sl; + wc->wc_flags |= IB_WC_WITH_VLAN; + } + wc->port_num = 1; + wc->vendor_err = orig_cqe->status; + + wc->opcode = IB_WC_RECV; + wc->status = 
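bnxt_re_is_vlan_pkt above unpacks the VLAN metadata the hardware attaches to a raw QP1 completion: it checks that the TPID is 802.1Q and then extracts the 12-bit VLAN ID and the 3-bit priority that become wc->vlan_id and wc->sl. The field positions inside the device's metadata word are hardware specific, but the tag itself follows the standard 802.1Q layout, which the sketch below decodes from a plain TCI value.

#include <stdio.h>
#include <stdint.h>

#define ETH_P_8021Q 0x8100

/* Standard 802.1Q TCI: bits 15-13 PCP, bit 12 DEI, bits 11-0 VID. */
static int parse_vlan(uint16_t tpid, uint16_t tci, uint16_t *vid, uint8_t *pcp)
{
        if (tpid != ETH_P_8021Q)
                return 0;
        *vid = tci & 0x0fff;
        *pcp = tci >> 13;
        return 1;
}

int main(void)
{
        uint16_t vid;
        uint8_t pcp;

        if (parse_vlan(ETH_P_8021Q, (3u << 13) | 100, &vid, &pcp))
                printf("vid=%u pcp=%u\n", vid, pcp); /* vid=100 pcp=3 */
        return 0;
}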
__rawqp1_to_ib_wc_status(orig_cqe->status); + wc->wc_flags |= IB_WC_GRH; + + nw_type = bnxt_re_check_packet_type(orig_cqe->raweth_qp1_flags, + orig_cqe->raweth_qp1_flags2); + if (nw_type >= 0) { + wc->network_hdr_type = bnxt_re_to_ib_nw_type(nw_type); + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; + } +} + +static void bnxt_re_process_res_ud_wc(struct ib_wc *wc, + struct bnxt_qplib_cqe *cqe) +{ + wc->opcode = IB_WC_RECV; + wc->status = __rc_to_ib_wc_status(cqe->status); + + if (cqe->flags & CQ_RES_RC_FLAGS_IMM) + wc->wc_flags |= IB_WC_WITH_IMM; + if (cqe->flags & CQ_RES_RC_FLAGS_INV) + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + if ((cqe->flags & (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) == + (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; +} + +static int send_phantom_wqe(struct bnxt_re_qp *qp) +{ + struct bnxt_qplib_qp *lib_qp = &qp->qplib_qp; + unsigned long flags; + int rc = 0; + + spin_lock_irqsave(&qp->sq_lock, flags); + + rc = bnxt_re_bind_fence_mw(lib_qp); + if (!rc) { + lib_qp->sq.phantom_wqe_cnt++; + dev_dbg(&lib_qp->sq.hwq.pdev->dev, + "qp %#x sq->prod %#x sw_prod %#x phantom_wqe_cnt %d\n", + lib_qp->id, lib_qp->sq.hwq.prod, + HWQ_CMP(lib_qp->sq.hwq.prod, &lib_qp->sq.hwq), + lib_qp->sq.phantom_wqe_cnt); + } + + spin_unlock_irqrestore(&qp->sq_lock, flags); + return rc; +} + +int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) +{ + struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + struct bnxt_re_qp *qp; + struct bnxt_qplib_cqe *cqe; + int i, ncqe, budget; + struct bnxt_qplib_q *sq; + struct bnxt_qplib_qp *lib_qp; + u32 tbl_idx; + struct bnxt_re_sqp_entries *sqp_entry = NULL; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + budget = min_t(u32, num_entries, cq->max_cql); + num_entries = budget; + if (!cq->cql) { + dev_err(rdev_to_dev(cq->rdev), "POLL CQ : no CQL to use"); + goto exit; + } + cqe = &cq->cql[0]; + while (budget) { + lib_qp = NULL; + ncqe = bnxt_qplib_poll_cq(&cq->qplib_cq, cqe, budget, &lib_qp); + if (lib_qp) { + sq = &lib_qp->sq; + if (sq->send_phantom) { + qp = container_of(lib_qp, + struct bnxt_re_qp, qplib_qp); + if (send_phantom_wqe(qp) == -ENOMEM) + dev_err(rdev_to_dev(cq->rdev), + "Phantom failed! Scheduled to send again\n"); + else + sq->send_phantom = false; + } + } + if (ncqe < budget) + ncqe += bnxt_qplib_process_flush_list(&cq->qplib_cq, + cqe + ncqe, + budget - ncqe); + + if (!ncqe) + break; + + for (i = 0; i < ncqe; i++, cqe++) { + /* Transcribe each qplib_wqe back to ib_wc */ + memset(wc, 0, sizeof(*wc)); + + wc->wr_id = cqe->wr_id; + wc->byte_len = cqe->length; + qp = container_of + ((struct bnxt_qplib_qp *) + (unsigned long)(cqe->qp_handle), + struct bnxt_re_qp, qplib_qp); + if (!qp) { + dev_err(rdev_to_dev(cq->rdev), + "POLL CQ : bad QP handle"); + continue; + } + wc->qp = &qp->ib_qp; + wc->ex.imm_data = cqe->immdata; + wc->src_qp = cqe->src_qp; + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->port_num = 1; + wc->vendor_err = cqe->status; + + switch (cqe->opcode) { + case CQ_BASE_CQE_TYPE_REQ: + if (qp->qplib_qp.id == + qp->rdev->qp1_sqp->qplib_qp.id) { + /* Handle this completion with + * the stored completion + */ + memset(wc, 0, sizeof(*wc)); + continue; + } + bnxt_re_process_req_wc(wc, cqe); + break; + case CQ_BASE_CQE_TYPE_RES_RAWETH_QP1: + if (!cqe->status) { + int rc = 0; + + rc = bnxt_re_process_raw_qp_pkt_rx + (qp, cqe); + if (!rc) { + memset(wc, 0, sizeof(*wc)); + continue; + } + cqe->status = -1; + } + /* Errors need not be looped back. 
+ * But change the wr_id to the one + * stored in the table + */ + tbl_idx = cqe->wr_id; + sqp_entry = &cq->rdev->sqp_tbl[tbl_idx]; + wc->wr_id = sqp_entry->wrid; + bnxt_re_process_res_rawqp1_wc(wc, cqe); + break; + case CQ_BASE_CQE_TYPE_RES_RC: + bnxt_re_process_res_rc_wc(wc, cqe); + break; + case CQ_BASE_CQE_TYPE_RES_UD: + if (qp->qplib_qp.id == + qp->rdev->qp1_sqp->qplib_qp.id) { + /* Handle this completion with + * the stored completion + */ + if (cqe->status) { + continue; + } else { + bnxt_re_process_res_shadow_qp_wc + (qp, wc, cqe); + break; + } + } + bnxt_re_process_res_ud_wc(wc, cqe); + break; + default: + dev_err(rdev_to_dev(cq->rdev), + "POLL CQ : type 0x%x not handled", + cqe->opcode); + continue; + } + wc++; + budget--; + } + } +exit: + spin_unlock_irqrestore(&cq->cq_lock, flags); + return num_entries - budget; +} + +int bnxt_re_req_notify_cq(struct ib_cq *ib_cq, + enum ib_cq_notify_flags ib_cqn_flags) +{ + struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + int type = 0, rc = 0; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + /* Trigger on the very next completion */ + if (ib_cqn_flags & IB_CQ_NEXT_COMP) + type = DBR_DBR_TYPE_CQ_ARMALL; + /* Trigger on the next solicited completion */ + else if (ib_cqn_flags & IB_CQ_SOLICITED) + type = DBR_DBR_TYPE_CQ_ARMSE; + + /* Poll to see if there are missed events */ + if ((ib_cqn_flags & IB_CQ_REPORT_MISSED_EVENTS) && + !(bnxt_qplib_is_cq_empty(&cq->qplib_cq))) { + rc = 1; + goto exit; + } + bnxt_qplib_req_notify_cq(&cq->qplib_cq, type); + +exit: + spin_unlock_irqrestore(&cq->cq_lock, flags); + return rc; +} + +/* Memory Regions */ +struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_mr *mr; + u64 pbl = 0; + int rc; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->rdev = rdev; + mr->qplib_mr.pd = &pd->qplib_pd; + mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; + + /* Allocate and register 0 as the address */ + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) + goto fail; + + mr->qplib_mr.hwq.level = PBL_LVL_MAX; + mr->qplib_mr.total_size = -1; /* Infinte length */ + rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl, 0, false, + PAGE_SIZE); + if (rc) + goto fail_mr; + + mr->ib_mr.lkey = mr->qplib_mr.lkey; + if (mr_access_flags & (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_ATOMIC)) + mr->ib_mr.rkey = mr->ib_mr.lkey; + atomic_inc(&rdev->mr_count); + + return &mr->ib_mr; + +fail_mr: + bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); +fail: + kfree(mr); + return ERR_PTR(rc); +} + +int bnxt_re_dereg_mr(struct ib_mr *ib_mr) +{ + struct bnxt_re_mr *mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + struct bnxt_re_dev *rdev = mr->rdev; + int rc; + + rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) + dev_err(rdev_to_dev(rdev), "Dereg MR failed: %#x\n", rc); + + if (mr->pages) { + rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res, + &mr->qplib_frpl); + kfree(mr->pages); + mr->npages = 0; + mr->pages = NULL; + } + if (!IS_ERR_OR_NULL(mr->ib_umem)) + ib_umem_release(mr->ib_umem); + + kfree(mr); + atomic_dec(&rdev->mr_count); + return rc; +} + +static int bnxt_re_set_page(struct ib_mr *ib_mr, u64 addr) +{ + struct bnxt_re_mr *mr = container_of(ib_mr, struct 
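bnxt_re_req_notify_cq above implements the verbs re-arm contract: arm for all or solicited-only completions, and when IB_CQ_REPORT_MISSED_EVENTS is requested return 1 if completions slipped in before the arm so the consumer polls again instead of sleeping. The fragment below is a rough sketch of how a kernel ULP typically drives that contract; the handler name is hypothetical and real consumers usually defer the drain to a workqueue or NAPI-style context rather than polling inline.

#include <rdma/ib_verbs.h>

/* Hypothetical completion handler: drain, re-arm, and drain again while
 * the provider reports missed events (the return value of 1 seen above).
 */
static void my_cq_event_handler(struct ib_cq *cq, void *ctx)
{
        struct ib_wc wc;

        do {
                while (ib_poll_cq(cq, 1, &wc) > 0) {
                        /* ...consume wc.wr_id / wc.status here... */
                }
        } while (ib_req_notify_cq(cq,
                                  IB_CQ_NEXT_COMP |
                                  IB_CQ_REPORT_MISSED_EVENTS) > 0);
}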
bnxt_re_mr, ib_mr); + + if (unlikely(mr->npages == mr->qplib_frpl.max_pg_ptrs)) + return -ENOMEM; + + mr->pages[mr->npages++] = addr; + return 0; +} + +int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct bnxt_re_mr *mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr); + + mr->npages = 0; + return ib_sg_to_pages(ib_mr, sg, sg_nents, sg_offset, bnxt_re_set_page); +} + +struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type, + u32 max_num_sg) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_mr *mr = NULL; + int rc; + + if (type != IB_MR_TYPE_MEM_REG) { + dev_dbg(rdev_to_dev(rdev), "MR type 0x%x not supported", type); + return ERR_PTR(-EINVAL); + } + if (max_num_sg > MAX_PBL_LVL_1_PGS) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->rdev = rdev; + mr->qplib_mr.pd = &pd->qplib_pd; + mr->qplib_mr.flags = BNXT_QPLIB_FR_PMR; + mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; + + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) + goto bail; + + mr->ib_mr.lkey = mr->qplib_mr.lkey; + mr->ib_mr.rkey = mr->ib_mr.lkey; + + mr->pages = kcalloc(max_num_sg, sizeof(u64), GFP_KERNEL); + if (!mr->pages) { + rc = -ENOMEM; + goto fail; + } + rc = bnxt_qplib_alloc_fast_reg_page_list(&rdev->qplib_res, + &mr->qplib_frpl, max_num_sg); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to allocate HW FR page list"); + goto fail_mr; + } + + atomic_inc(&rdev->mr_count); + return &mr->ib_mr; + +fail_mr: + kfree(mr->pages); +fail: + bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); +bail: + kfree(mr); + return ERR_PTR(rc); +} + +struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_mw *mw; + int rc; + + mw = kzalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + return ERR_PTR(-ENOMEM); + mw->rdev = rdev; + mw->qplib_mw.pd = &pd->qplib_pd; + + mw->qplib_mw.type = (type == IB_MW_TYPE_1 ? 
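bnxt_re_alloc_mr, bnxt_re_map_mr_sg and the REG_MR WQE builder earlier in this file together back the kernel fast-registration API. The fragment below sketches the ULP side of that flow; the function name is hypothetical, error handling is trimmed (the MR is even leaked on the map failure path), and only the core verbs calls are real.

#include <rdma/ib_verbs.h>

/* Outline: fast-register 'nents' entries of an sg table through the
 * alloc_mr/map_mr_sg/REG_MR path implemented above. 'qp', 'pd' and
 * 'sgt' are assumed to come from the caller.
 */
static int my_fast_reg(struct ib_qp *qp, struct ib_pd *pd,
                       struct sg_table *sgt, int nents)
{
        struct ib_reg_wr reg_wr = {};
        struct ib_send_wr *bad_wr;
        struct ib_mr *mr;
        int n;

        mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
        if (IS_ERR(mr))
                return PTR_ERR(mr);

        n = ib_map_mr_sg(mr, sgt->sgl, nents, NULL, PAGE_SIZE);
        if (n < nents)
                return -EINVAL; /* trimmed sketch: mr not freed here */

        reg_wr.wr.opcode = IB_WR_REG_MR;
        reg_wr.wr.send_flags = IB_SEND_SIGNALED;
        reg_wr.mr = mr;
        reg_wr.key = mr->rkey;
        reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;

        return ib_post_send(qp, &reg_wr.wr, &bad_wr);
}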
+ CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1 : + CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B); + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mw->qplib_mw); + if (rc) { + dev_err(rdev_to_dev(rdev), "Allocate MW failed!"); + goto fail; + } + mw->ib_mw.rkey = mw->qplib_mw.rkey; + + atomic_inc(&rdev->mw_count); + return &mw->ib_mw; + +fail: + kfree(mw); + return ERR_PTR(rc); +} + +int bnxt_re_dealloc_mw(struct ib_mw *ib_mw) +{ + struct bnxt_re_mw *mw = container_of(ib_mw, struct bnxt_re_mw, ib_mw); + struct bnxt_re_dev *rdev = mw->rdev; + int rc; + + rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mw->qplib_mw); + if (rc) { + dev_err(rdev_to_dev(rdev), "Free MW failed: %#x\n", rc); + return rc; + } + + kfree(mw); + atomic_dec(&rdev->mw_count); + return rc; +} + +static int bnxt_re_page_size_ok(int page_shift) +{ + switch (page_shift) { + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_8K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_64K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_2M: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_256K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1M: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4M: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G: + return 1; + default: + return 0; + } +} + +static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig, + int page_shift) +{ + u64 *pbl_tbl = pbl_tbl_orig; + u64 paddr; + u64 page_mask = (1ULL << page_shift) - 1; + int i, pages; + struct scatterlist *sg; + int entry; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> PAGE_SHIFT; + for (i = 0; i < pages; i++) { + paddr = sg_dma_address(sg) + (i << PAGE_SHIFT); + if (pbl_tbl == pbl_tbl_orig) + *pbl_tbl++ = paddr & ~page_mask; + else if ((paddr & page_mask) == 0) + *pbl_tbl++ = paddr; + } + } + return pbl_tbl - pbl_tbl_orig; +} + +/* uverbs */ +struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_re_mr *mr; + struct ib_umem *umem; + u64 *pbl_tbl = NULL; + int umem_pgs, page_shift, rc; + + if (length > BNXT_RE_MAX_MR_SIZE) { + dev_err(rdev_to_dev(rdev), "MR Size: %lld > Max supported:%lld\n", + length, BNXT_RE_MAX_MR_SIZE); + return ERR_PTR(-ENOMEM); + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->rdev = rdev; + mr->qplib_mr.pd = &pd->qplib_pd; + mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR; + + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to allocate MR"); + goto free_mr; + } + /* The fixed portion of the rkey is the same as the lkey */ + mr->ib_mr.rkey = mr->qplib_mr.rkey; + + umem = ib_umem_get(ib_pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) { + dev_err(rdev_to_dev(rdev), "Failed to get umem"); + rc = -EFAULT; + goto free_mrw; + } + mr->ib_umem = umem; + + mr->qplib_mr.va = virt_addr; + umem_pgs = ib_umem_page_count(umem); + if (!umem_pgs) { + dev_err(rdev_to_dev(rdev), "umem is invalid!"); + rc = -EINVAL; + goto free_umem; + } + mr->qplib_mr.total_size = length; + + pbl_tbl = kcalloc(umem_pgs, sizeof(u64 *), GFP_KERNEL); + if (!pbl_tbl) { + rc = -ENOMEM; + goto free_umem; + } + + page_shift = umem->page_shift; + + if (!bnxt_re_page_size_ok(page_shift)) { + 
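fill_umem_pbl_tbl above walks the umem scatterlist in CPU-page steps but emits only one PBL entry per device page: the first address is rounded down to the device page boundary and later addresses are recorded only when they start a new device page. The standalone sketch below reproduces that with a flat array of 4 KiB page addresses feeding a 64 KiB device page size, which is a simplification of the scatterlist walk in the driver.

#include <stdio.h>
#include <stdint.h>

#define CPU_PAGE_SHIFT 12       /* 4 KiB pages */

static int fill_pbl(const uint64_t *paddrs, int npages,
                    uint64_t *pbl, int dev_page_shift)
{
        uint64_t page_mask = (1ULL << dev_page_shift) - 1;
        int i, n = 0;

        for (i = 0; i < npages; i++) {
                if (n == 0)
                        pbl[n++] = paddrs[i] & ~page_mask;
                else if ((paddrs[i] & page_mask) == 0)
                        pbl[n++] = paddrs[i];
        }
        return n;
}

int main(void)
{
        uint64_t paddrs[32], pbl[32];
        int i, n;

        /* 32 contiguous 4 KiB pages starting at 0x11000 (unaligned) */
        for (i = 0; i < 32; i++)
                paddrs[i] = 0x11000 + ((uint64_t)i << CPU_PAGE_SHIFT);

        n = fill_pbl(paddrs, 32, pbl, 16);      /* 64 KiB device pages */
        printf("%d entries, first 0x%llx\n", n,
               (unsigned long long)pbl[0]);     /* 3 entries, first 0x10000 */
        return 0;
}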
dev_err(rdev_to_dev(rdev), "umem page size unsupported!"); + rc = -EFAULT; + goto fail; + } + + if (!umem->hugetlb && length > BNXT_RE_MAX_MR_SIZE_LOW) { + dev_err(rdev_to_dev(rdev), "Requested MR Sz:%llu Max sup:%llu", + length, (u64)BNXT_RE_MAX_MR_SIZE_LOW); + rc = -EINVAL; + goto fail; + } + if (umem->hugetlb && length > BNXT_RE_PAGE_SIZE_2M) { + page_shift = BNXT_RE_PAGE_SHIFT_2M; + dev_warn(rdev_to_dev(rdev), "umem hugetlb set page_size %x", + 1 << page_shift); + } + + /* Map umem buf ptrs to the PBL */ + umem_pgs = fill_umem_pbl_tbl(umem, pbl_tbl, page_shift); + rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl, + umem_pgs, false, 1 << page_shift); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to register user MR"); + goto fail; + } + + kfree(pbl_tbl); + + mr->ib_mr.lkey = mr->qplib_mr.lkey; + mr->ib_mr.rkey = mr->qplib_mr.lkey; + atomic_inc(&rdev->mr_count); + + return &mr->ib_mr; +fail: + kfree(pbl_tbl); +free_umem: + ib_umem_release(umem); +free_mrw: + bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); +free_mr: + kfree(mr); + return ERR_PTR(rc); +} + +struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_re_uctx_resp resp; + struct bnxt_re_ucontext *uctx; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + int rc; + + dev_dbg(rdev_to_dev(rdev), "ABI version requested %d", + ibdev->uverbs_abi_ver); + + if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) { + dev_dbg(rdev_to_dev(rdev), " is different from the device %d ", + BNXT_RE_ABI_VERSION); + return ERR_PTR(-EPERM); + } + + uctx = kzalloc(sizeof(*uctx), GFP_KERNEL); + if (!uctx) + return ERR_PTR(-ENOMEM); + + uctx->rdev = rdev; + + uctx->shpg = (void *)__get_free_page(GFP_KERNEL); + if (!uctx->shpg) { + rc = -ENOMEM; + goto fail; + } + spin_lock_init(&uctx->sh_lock); + + resp.dev_id = rdev->en_dev->pdev->devfn; /*Temp, Use idr_alloc instead*/ + resp.max_qp = rdev->qplib_ctx.qpc_count; + resp.pg_size = PAGE_SIZE; + resp.cqe_sz = sizeof(struct cq_base); + resp.max_cqd = dev_attr->max_cq_wqes; + resp.rsvd = 0; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to copy user context"); + rc = -EFAULT; + goto cfail; + } + + return &uctx->ib_uctx; +cfail: + free_page((unsigned long)uctx->shpg); + uctx->shpg = NULL; +fail: + kfree(uctx); + return ERR_PTR(rc); +} + +int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) +{ + struct bnxt_re_ucontext *uctx = container_of(ib_uctx, + struct bnxt_re_ucontext, + ib_uctx); + + struct bnxt_re_dev *rdev = uctx->rdev; + int rc = 0; + + if (uctx->shpg) + free_page((unsigned long)uctx->shpg); + + if (uctx->dpi.dbr) { + /* Free DPI only if this is the first PD allocated by the + * application and mark the context dpi as NULL + */ + rc = bnxt_qplib_dealloc_dpi(&rdev->qplib_res, + &rdev->qplib_res.dpi_tbl, + &uctx->dpi); + if (rc) + dev_err(rdev_to_dev(rdev), "Deallocate HW DPI failed!"); + /* Don't fail, continue*/ + uctx->dpi.dbr = NULL; + } + + kfree(uctx); + return 0; +} + +/* Helper function to mmap the virtual memory from user app */ +int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) +{ + struct bnxt_re_ucontext *uctx = container_of(ib_uctx, + struct bnxt_re_ucontext, + ib_uctx); + struct bnxt_re_dev *rdev = uctx->rdev; + u64 pfn; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_pgoff) { + vma->vm_page_prot = 
pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + PAGE_SIZE, vma->vm_page_prot)) { + dev_err(rdev_to_dev(rdev), "Failed to map DPI"); + return -EAGAIN; + } + } else { + pfn = virt_to_phys(uctx->shpg) >> PAGE_SHIFT; + if (remap_pfn_range(vma, vma->vm_start, + pfn, PAGE_SIZE, vma->vm_page_prot)) { + dev_err(rdev_to_dev(rdev), + "Failed to map shared page"); + return -EAGAIN; + } + } + + return 0; +} diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h new file mode 100644 index 0000000..5c6414c --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -0,0 +1,226 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Description: IB Verbs interpreter (header) + */ + +#ifndef __BNXT_RE_IB_VERBS_H__ +#define __BNXT_RE_IB_VERBS_H__ + +struct bnxt_re_gid_ctx { + u32 idx; + u32 refcnt; +}; + +#define BNXT_RE_FENCE_BYTES 64 +struct bnxt_re_fence_data { + u32 size; + u8 va[BNXT_RE_FENCE_BYTES]; + dma_addr_t dma_addr; + struct bnxt_re_mr *mr; + struct ib_mw *mw; + struct bnxt_qplib_swqe bind_wqe; + u32 bind_rkey; +}; + +struct bnxt_re_pd { + struct bnxt_re_dev *rdev; + struct ib_pd ib_pd; + struct bnxt_qplib_pd qplib_pd; + struct bnxt_re_fence_data fence; +}; + +struct bnxt_re_ah { + struct bnxt_re_dev *rdev; + struct ib_ah ib_ah; + struct bnxt_qplib_ah qplib_ah; +}; + +struct bnxt_re_srq { + struct bnxt_re_dev *rdev; + u32 srq_limit; + struct ib_srq ib_srq; + struct bnxt_qplib_srq qplib_srq; + struct ib_umem *umem; + spinlock_t lock; /* protect srq */ +}; + +struct bnxt_re_qp { + struct list_head list; + struct bnxt_re_dev *rdev; + struct ib_qp ib_qp; + spinlock_t sq_lock; /* protect sq */ + spinlock_t rq_lock; /* protect rq */ + struct bnxt_qplib_qp qplib_qp; + struct ib_umem *sumem; + struct ib_umem *rumem; + /* QP1 */ + u32 send_psn; + struct ib_ud_header qp1_hdr; + struct bnxt_re_cq *scq; + struct bnxt_re_cq *rcq; +}; + +struct bnxt_re_cq { + struct bnxt_re_dev *rdev; + spinlock_t cq_lock; /* protect cq */ + u16 cq_count; + u16 cq_period; + struct ib_cq ib_cq; + struct bnxt_qplib_cq qplib_cq; + struct bnxt_qplib_cqe *cql; +#define MAX_CQL_PER_POLL 1024 + u32 max_cql; + struct ib_umem *umem; +}; + +struct bnxt_re_mr { + struct bnxt_re_dev *rdev; + struct ib_mr ib_mr; + struct ib_umem *ib_umem; + struct bnxt_qplib_mrw qplib_mr; + u32 npages; + u64 *pages; + struct bnxt_qplib_frpl qplib_frpl; +}; + +struct bnxt_re_frpl { + struct bnxt_re_dev *rdev; + struct bnxt_qplib_frpl qplib_frpl; + u64 *page_list; +}; + +struct bnxt_re_fmr { + struct bnxt_re_dev *rdev; + struct ib_fmr ib_fmr; + struct bnxt_qplib_mrw qplib_fmr; +}; + +struct bnxt_re_mw { + struct bnxt_re_dev *rdev; + struct ib_mw ib_mw; + struct bnxt_qplib_mrw qplib_mw; +}; + +struct bnxt_re_ucontext { + struct bnxt_re_dev *rdev; + struct ib_ucontext ib_uctx; + struct bnxt_qplib_dpi dpi; + void *shpg; + spinlock_t sh_lock; /* protect shpg */ +}; + +struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num); + +int bnxt_re_query_device(struct ib_device *ibdev, + struct ib_device_attr *ib_attr, + struct ib_udata *udata); +int bnxt_re_modify_device(struct ib_device *ibdev, + int device_modify_mask, + struct ib_device_modify *device_modify); +int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, + struct ib_port_attr *port_attr); +int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable); +void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str); +int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, + u16 index, u16 *pkey); +int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context); +int bnxt_re_add_gid(const union ib_gid *gid, + const struct ib_gid_attr *attr, void **context); +int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num, + int index, union ib_gid *gid); +enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev, + u8 port_num); +struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int bnxt_re_dealloc_pd(struct ib_pd *pd); +struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +int bnxt_re_modify_ah(struct ib_ah 
*ah, struct rdma_ah_attr *ah_attr); +int bnxt_re_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); +int bnxt_re_destroy_ah(struct ib_ah *ah); +struct ib_srq *bnxt_re_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); +int bnxt_re_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata); +int bnxt_re_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int bnxt_re_destroy_srq(struct ib_srq *srq); +int bnxt_re_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); +struct ib_qp *bnxt_re_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); +int bnxt_re_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_udata *udata); +int bnxt_re_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +int bnxt_re_destroy_qp(struct ib_qp *qp); +int bnxt_re_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr); +int bnxt_re_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); +struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int bnxt_re_destroy_cq(struct ib_cq *cq); +int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); +int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags); + +int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type mr_type, + u32 max_num_sg); +int bnxt_re_dereg_mr(struct ib_mr *mr); +struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type, + struct ib_udata *udata); +int bnxt_re_dealloc_mw(struct ib_mw *mw); +struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); +struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +int bnxt_re_dealloc_ucontext(struct ib_ucontext *context); +int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp); +void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp, unsigned long flags); +#endif /* __BNXT_RE_IB_VERBS_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c new file mode 100644 index 0000000..20b9f31 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -0,0 +1,1659 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Main component of the bnxt_re driver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "bnxt_ulp.h" +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_sp.h" +#include "qplib_fp.h" +#include "qplib_rcfw.h" +#include "bnxt_re.h" +#include "ib_verbs.h" +#include +#include "bnxt.h" +#include "hw_counters.h" + +static char version[] = + BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n"; + +MODULE_AUTHOR("Eddie Wai "); +MODULE_DESCRIPTION(BNXT_RE_DESC " Driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* globals */ +static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list); +/* Mutex to protect the list of bnxt_re devices added */ +static DEFINE_MUTEX(bnxt_re_dev_lock); +static struct workqueue_struct *bnxt_re_wq; +static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait); + +/* SR-IOV helper functions */ + +static void bnxt_re_get_sriov_func_type(struct bnxt_re_dev *rdev) +{ + struct bnxt *bp; + + bp = netdev_priv(rdev->en_dev->net); + if (BNXT_VF(bp)) + rdev->is_virtfn = 1; +} + +/* Set the maximum number of each resource that the driver actually wants + * to allocate. This may be up to the maximum number the firmware has + * reserved for the function. The driver may choose to allocate fewer + * resources than the firmware maximum. + */ +static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) +{ + u32 vf_qps = 0, vf_srqs = 0, vf_cqs = 0, vf_mrws = 0, vf_gids = 0; + u32 i; + u32 vf_pct; + u32 num_vfs; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + + rdev->qplib_ctx.qpc_count = min_t(u32, BNXT_RE_MAX_QPC_COUNT, + dev_attr->max_qp); + + rdev->qplib_ctx.mrw_count = BNXT_RE_MAX_MRW_COUNT_256K; + /* Use max_mr from fw since max_mrw does not get set */ + rdev->qplib_ctx.mrw_count = min_t(u32, rdev->qplib_ctx.mrw_count, + dev_attr->max_mr); + rdev->qplib_ctx.srqc_count = min_t(u32, BNXT_RE_MAX_SRQC_COUNT, + dev_attr->max_srq); + rdev->qplib_ctx.cq_count = min_t(u32, BNXT_RE_MAX_CQ_COUNT, + dev_attr->max_cq); + + for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) + rdev->qplib_ctx.tqm_count[i] = + rdev->dev_attr.tqm_alloc_reqs[i]; + + if (rdev->num_vfs) { + /* + * Reserve a set of resources for the PF. 
Divide the remaining + * resources among the VFs + */ + vf_pct = 100 - BNXT_RE_PCT_RSVD_FOR_PF; + num_vfs = 100 * rdev->num_vfs; + vf_qps = (rdev->qplib_ctx.qpc_count * vf_pct) / num_vfs; + vf_srqs = (rdev->qplib_ctx.srqc_count * vf_pct) / num_vfs; + vf_cqs = (rdev->qplib_ctx.cq_count * vf_pct) / num_vfs; + /* + * The driver allows many more MRs than other resources. If the + * firmware does also, then reserve a fixed amount for the PF + * and divide the rest among VFs. VFs may use many MRs for NFS + * mounts, ISER, NVME applications, etc. If the firmware + * severely restricts the number of MRs, then let PF have + * half and divide the rest among VFs, as for the other + * resource types. + */ + if (rdev->qplib_ctx.mrw_count < BNXT_RE_MAX_MRW_COUNT_64K) + vf_mrws = rdev->qplib_ctx.mrw_count * vf_pct / num_vfs; + else + vf_mrws = (rdev->qplib_ctx.mrw_count - + BNXT_RE_RESVD_MR_FOR_PF) / rdev->num_vfs; + vf_gids = BNXT_RE_MAX_GID_PER_VF; + } + rdev->qplib_ctx.vf_res.max_mrw_per_vf = vf_mrws; + rdev->qplib_ctx.vf_res.max_gid_per_vf = vf_gids; + rdev->qplib_ctx.vf_res.max_qp_per_vf = vf_qps; + rdev->qplib_ctx.vf_res.max_srq_per_vf = vf_srqs; + rdev->qplib_ctx.vf_res.max_cq_per_vf = vf_cqs; +} + +/* for handling bnxt_en callbacks later */ +static void bnxt_re_stop(void *p) +{ +} + +static void bnxt_re_start(void *p) +{ +} + +static void bnxt_re_sriov_config(void *p, int num_vfs) +{ + struct bnxt_re_dev *rdev = p; + + if (!rdev) + return; + + rdev->num_vfs = num_vfs; + bnxt_re_set_resource_limits(rdev); + bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw, + &rdev->qplib_ctx); +} + +static void bnxt_re_shutdown(void *p) +{ + struct bnxt_re_dev *rdev = p; + + if (!rdev) + return; + + bnxt_re_ib_unreg(rdev, false); +} + +static void bnxt_re_stop_irq(void *handle) +{ + struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; + struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; + struct bnxt_qplib_nq *nq; + int indx; + + for (indx = BNXT_RE_NQ_IDX; indx < rdev->num_msix; indx++) { + nq = &rdev->nq[indx - 1]; + bnxt_qplib_nq_stop_irq(nq, false); + } + + bnxt_qplib_rcfw_stop_irq(rcfw, false); +} + +static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) +{ + struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; + struct bnxt_msix_entry *msix_ent = rdev->msix_entries; + struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; + struct bnxt_qplib_nq *nq; + int indx, rc; + + if (!ent) { + /* Not setting the f/w timeout bit in rcfw. + * During the driver unload the first command + * to f/w will timeout and that will set the + * timeout bit. + */ + dev_err(rdev_to_dev(rdev), "Failed to re-start IRQs\n"); + return; + } + + /* Vectors may change after restart, so update with new vectors + * in device structure.
+ */ + for (indx = 0; indx < rdev->num_msix; indx++) + rdev->msix_entries[indx].vector = ent[indx].vector; + + bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector, + false); + for (indx = BNXT_RE_NQ_IDX ; indx < rdev->num_msix; indx++) { + nq = &rdev->nq[indx - 1]; + rc = bnxt_qplib_nq_start_irq(nq, indx - 1, + msix_ent[indx].vector, false); + if (rc) + dev_warn(rdev_to_dev(rdev), + "Failed to reinit NQ index %d\n", indx - 1); + } +} + +static struct bnxt_ulp_ops bnxt_re_ulp_ops = { + .ulp_async_notifier = NULL, + .ulp_stop = bnxt_re_stop, + .ulp_start = bnxt_re_start, + .ulp_sriov_config = bnxt_re_sriov_config, + .ulp_shutdown = bnxt_re_shutdown, + .ulp_irq_stop = bnxt_re_stop_irq, + .ulp_irq_restart = bnxt_re_start_irq +}; + +/* RoCE -> Net driver */ + +/* Driver registration routines used to let the networking driver (bnxt_en) + * know that the RoCE driver is now installed + */ +static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev, bool lock_wait) +{ + struct bnxt_en_dev *en_dev; + int rc; + + if (!rdev) + return -EINVAL; + + en_dev = rdev->en_dev; + /* Acquire rtnl lock if it is not invoked from netdev event */ + if (lock_wait) + rtnl_lock(); + + rc = en_dev->en_ops->bnxt_unregister_device(rdev->en_dev, + BNXT_ROCE_ULP); + if (lock_wait) + rtnl_unlock(); + return rc; +} + +static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev; + int rc = 0; + + if (!rdev) + return -EINVAL; + + en_dev = rdev->en_dev; + + rtnl_lock(); + rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP, + &bnxt_re_ulp_ops, rdev); + rtnl_unlock(); + return rc; +} + +static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait) +{ + struct bnxt_en_dev *en_dev; + int rc; + + if (!rdev) + return -EINVAL; + + en_dev = rdev->en_dev; + + if (lock_wait) + rtnl_lock(); + + rc = en_dev->en_ops->bnxt_free_msix(rdev->en_dev, BNXT_ROCE_ULP); + + if (lock_wait) + rtnl_unlock(); + return rc; +} + +static int bnxt_re_request_msix(struct bnxt_re_dev *rdev) +{ + int rc = 0, num_msix_want = BNXT_RE_MAX_MSIX, num_msix_got; + struct bnxt_en_dev *en_dev; + + if (!rdev) + return -EINVAL; + + en_dev = rdev->en_dev; + + num_msix_want = min_t(u32, BNXT_RE_MAX_MSIX, num_online_cpus()); + + rtnl_lock(); + num_msix_got = en_dev->en_ops->bnxt_request_msix(en_dev, BNXT_ROCE_ULP, + rdev->msix_entries, + num_msix_want); + if (num_msix_got < BNXT_RE_MIN_MSIX) { + rc = -EINVAL; + goto done; + } + if (num_msix_got != num_msix_want) { + dev_warn(rdev_to_dev(rdev), + "Requested %d MSI-X vectors, got %d\n", + num_msix_want, num_msix_got); + } + rdev->num_msix = num_msix_got; +done: + rtnl_unlock(); + return rc; +} + +static void bnxt_re_init_hwrm_hdr(struct bnxt_re_dev *rdev, struct input *hdr, + u16 opcd, u16 crid, u16 trid) +{ + hdr->req_type = cpu_to_le16(opcd); + hdr->cmpl_ring = cpu_to_le16(crid); + hdr->target_id = cpu_to_le16(trid); +} + +static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, + int msg_len, void *resp, int resp_max_len, + int timeout) +{ + fw_msg->msg = msg; + fw_msg->msg_len = msg_len; + fw_msg->resp = resp; + fw_msg->resp_max_len = resp_max_len; + fw_msg->timeout = timeout; +} + +static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, + bool lock_wait) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_ring_free_input req = {0}; + struct hwrm_ring_free_output resp; + struct bnxt_fw_msg fw_msg; + bool do_unlock = false; + int rc = -EINVAL; + + if (!en_dev) + return rc; + + memset(&fw_msg, 0,
sizeof(fw_msg)); + if (lock_wait) { + rtnl_lock(); + do_unlock = true; + } + + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1); + req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL; + req.ring_id = cpu_to_le16(fw_ring_id); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + if (rc) + dev_err(rdev_to_dev(rdev), + "Failed to free HW ring:%d :%#x", req.ring_id, rc); + if (do_unlock) + rtnl_unlock(); + return rc; +} + +static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr, + int pages, int type, u32 ring_mask, + u32 map_index, u16 *fw_ring_id) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_ring_alloc_input req = {0}; + struct hwrm_ring_alloc_output resp; + struct bnxt_fw_msg fw_msg; + int rc = -EINVAL; + + if (!en_dev) + return rc; + + memset(&fw_msg, 0, sizeof(fw_msg)); + rtnl_lock(); + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_ALLOC, -1, -1); + req.enables = 0; + req.page_tbl_addr = cpu_to_le64(dma_arr[0]); + if (pages > 1) { + /* Page size is in log2 units */ + req.page_size = BNXT_PAGE_SHIFT; + req.page_tbl_depth = 1; + } + req.fbo = 0; + /* Association of ring index with doorbell index and MSIX number */ + req.logical_id = cpu_to_le16(map_index); + req.length = cpu_to_le32(ring_mask + 1); + req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL; + req.int_mode = RING_ALLOC_REQ_INT_MODE_MSIX; + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + if (!rc) + *fw_ring_id = le16_to_cpu(resp.ring_id); + + rtnl_unlock(); + return rc; +} + +static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, + u32 fw_stats_ctx_id, bool lock_wait) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_stat_ctx_free_input req = {0}; + struct bnxt_fw_msg fw_msg; + bool do_unlock = false; + int rc = -EINVAL; + + if (!en_dev) + return rc; + + memset(&fw_msg, 0, sizeof(fw_msg)); + if (lock_wait) { + rtnl_lock(); + do_unlock = true; + } + + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_FREE, -1, -1); + req.stat_ctx_id = cpu_to_le32(fw_stats_ctx_id); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&req, + sizeof(req), DFLT_HWRM_CMD_TIMEOUT); + rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + if (rc) + dev_err(rdev_to_dev(rdev), + "Failed to free HW stats context %#x", rc); + + if (do_unlock) + rtnl_unlock(); + return rc; +} + +static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, + dma_addr_t dma_map, + u32 *fw_stats_ctx_id) +{ + struct hwrm_stat_ctx_alloc_output resp = {0}; + struct hwrm_stat_ctx_alloc_input req = {0}; + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct bnxt_fw_msg fw_msg; + int rc = -EINVAL; + + *fw_stats_ctx_id = INVALID_STATS_CTX_ID; + + if (!en_dev) + return rc; + + memset(&fw_msg, 0, sizeof(fw_msg)); + rtnl_lock(); + + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_ALLOC, -1, -1); + req.update_period_ms = cpu_to_le32(1000); + req.stats_dma_addr = cpu_to_le64(dma_map); + req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE; + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + if (!rc) + *fw_stats_ctx_id = le32_to_cpu(resp.stat_ctx_id); + + 
rtnl_unlock(); + return rc; +} + +/* Device */ + +static bool is_bnxt_re_dev(struct net_device *netdev) +{ + struct ethtool_drvinfo drvinfo; + + if (netdev->ethtool_ops && netdev->ethtool_ops->get_drvinfo) { + memset(&drvinfo, 0, sizeof(drvinfo)); + netdev->ethtool_ops->get_drvinfo(netdev, &drvinfo); + + if (strcmp(drvinfo.driver, "bnxt_en")) + return false; + return true; + } + return false; +} + +static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev) +{ + struct bnxt_re_dev *rdev; + + rcu_read_lock(); + list_for_each_entry_rcu(rdev, &bnxt_re_dev_list, list) { + if (rdev->netdev == netdev) { + rcu_read_unlock(); + return rdev; + } + } + rcu_read_unlock(); + return NULL; +} + +static void bnxt_re_dev_unprobe(struct net_device *netdev, + struct bnxt_en_dev *en_dev) +{ + dev_put(netdev); + module_put(en_dev->pdev->driver->driver.owner); +} + +static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev) +{ + struct bnxt *bp = netdev_priv(netdev); + struct bnxt_en_dev *en_dev; + struct pci_dev *pdev; + + /* Call bnxt_en's RoCE probe via indirect API */ + if (!bp->ulp_probe) + return ERR_PTR(-EINVAL); + + en_dev = bp->ulp_probe(netdev); + if (IS_ERR(en_dev)) + return en_dev; + + pdev = en_dev->pdev; + if (!pdev) + return ERR_PTR(-EINVAL); + + if (!(en_dev->flags & BNXT_EN_FLAG_ROCE_CAP)) { + dev_info(&pdev->dev, + "%s: probe error: RoCE is not supported on this device", + ROCE_DRV_MODULE_NAME); + return ERR_PTR(-ENODEV); + } + + /* Bump net device reference count */ + if (!try_module_get(pdev->driver->driver.owner)) + return ERR_PTR(-ENODEV); + + dev_hold(netdev); + + return en_dev; +} + +static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev) +{ + ib_unregister_device(&rdev->ibdev); +} + +static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) +{ + struct ib_device *ibdev = &rdev->ibdev; + + /* ib device init */ + ibdev->owner = THIS_MODULE; + ibdev->node_type = RDMA_NODE_IB_CA; + strlcpy(ibdev->name, "bnxt_re%d", IB_DEVICE_NAME_MAX); + strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA", + strlen(BNXT_RE_DESC) + 5); + ibdev->phys_port_cnt = 1; + + bnxt_qplib_get_guid(rdev->netdev->dev_addr, (u8 *)&ibdev->node_guid); + + ibdev->num_comp_vectors = 1; + ibdev->dev.parent = &rdev->en_dev->pdev->dev; + ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY; + + /* User space */ + ibdev->uverbs_abi_ver = BNXT_RE_ABI_VERSION; + ibdev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH); + /* POLL_CQ and REQ_NOTIFY_CQ is directly handled in libbnxt_re */ + + /* Kernel verbs */ + ibdev->query_device = 
bnxt_re_query_device; + ibdev->modify_device = bnxt_re_modify_device; + + ibdev->query_port = bnxt_re_query_port; + ibdev->get_port_immutable = bnxt_re_get_port_immutable; + ibdev->get_dev_fw_str = bnxt_re_query_fw_str; + ibdev->query_pkey = bnxt_re_query_pkey; + ibdev->get_netdev = bnxt_re_get_netdev; + ibdev->add_gid = bnxt_re_add_gid; + ibdev->del_gid = bnxt_re_del_gid; + ibdev->get_link_layer = bnxt_re_get_link_layer; + + ibdev->alloc_pd = bnxt_re_alloc_pd; + ibdev->dealloc_pd = bnxt_re_dealloc_pd; + + ibdev->create_ah = bnxt_re_create_ah; + ibdev->modify_ah = bnxt_re_modify_ah; + ibdev->query_ah = bnxt_re_query_ah; + ibdev->destroy_ah = bnxt_re_destroy_ah; + + ibdev->create_srq = bnxt_re_create_srq; + ibdev->modify_srq = bnxt_re_modify_srq; + ibdev->query_srq = bnxt_re_query_srq; + ibdev->destroy_srq = bnxt_re_destroy_srq; + ibdev->post_srq_recv = bnxt_re_post_srq_recv; + + ibdev->create_qp = bnxt_re_create_qp; + ibdev->modify_qp = bnxt_re_modify_qp; + ibdev->query_qp = bnxt_re_query_qp; + ibdev->destroy_qp = bnxt_re_destroy_qp; + + ibdev->post_send = bnxt_re_post_send; + ibdev->post_recv = bnxt_re_post_recv; + + ibdev->create_cq = bnxt_re_create_cq; + ibdev->destroy_cq = bnxt_re_destroy_cq; + ibdev->poll_cq = bnxt_re_poll_cq; + ibdev->req_notify_cq = bnxt_re_req_notify_cq; + + ibdev->get_dma_mr = bnxt_re_get_dma_mr; + ibdev->dereg_mr = bnxt_re_dereg_mr; + ibdev->alloc_mr = bnxt_re_alloc_mr; + ibdev->map_mr_sg = bnxt_re_map_mr_sg; + + ibdev->reg_user_mr = bnxt_re_reg_user_mr; + ibdev->alloc_ucontext = bnxt_re_alloc_ucontext; + ibdev->dealloc_ucontext = bnxt_re_dealloc_ucontext; + ibdev->mmap = bnxt_re_mmap; + ibdev->get_hw_stats = bnxt_re_ib_get_hw_stats; + ibdev->alloc_hw_stats = bnxt_re_ib_alloc_hw_stats; + + ibdev->driver_id = RDMA_DRIVER_BNXT_RE; + return ib_register_device(ibdev, NULL); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc); +} + +static DEVICE_ATTR(hw_rev, 0444, show_rev, NULL); +static DEVICE_ATTR(hca_type, 0444, show_hca, NULL); + +static struct device_attribute *bnxt_re_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type +}; + +static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev) +{ + dev_put(rdev->netdev); + rdev->netdev = NULL; + + mutex_lock(&bnxt_re_dev_lock); + list_del_rcu(&rdev->list); + mutex_unlock(&bnxt_re_dev_lock); + + synchronize_rcu(); + + ib_dealloc_device(&rdev->ibdev); + /* rdev is gone */ +} + +static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev, + struct bnxt_en_dev *en_dev) +{ + struct bnxt_re_dev *rdev; + + /* Allocate bnxt_re_dev instance here */ + rdev = (struct bnxt_re_dev *)ib_alloc_device(sizeof(*rdev)); + if (!rdev) { + dev_err(NULL, "%s: bnxt_re_dev allocation failure!", + ROCE_DRV_MODULE_NAME); + return NULL; + } + /* Default values */ + rdev->netdev = netdev; + dev_hold(rdev->netdev); + rdev->en_dev = en_dev; + rdev->id = rdev->en_dev->pdev->devfn; + INIT_LIST_HEAD(&rdev->qp_list); + mutex_init(&rdev->qp_lock); + atomic_set(&rdev->qp_count, 0); + atomic_set(&rdev->cq_count, 0); + atomic_set(&rdev->srq_count, 0); + atomic_set(&rdev->mr_count, 0); + atomic_set(&rdev->mw_count, 0); + rdev->cosq[0] 
= 0xFFFF; + rdev->cosq[1] = 0xFFFF; + + mutex_lock(&bnxt_re_dev_lock); + list_add_tail_rcu(&rdev->list, &bnxt_re_dev_list); + mutex_unlock(&bnxt_re_dev_lock); + return rdev; +} + +static int bnxt_re_handle_unaffi_async_event(struct creq_func_event + *unaffi_async) +{ + switch (unaffi_async->event) { + case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CQ_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TQM_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCS_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCC_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCM_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TIM_ERROR: + break; + default: + return -EINVAL; + } + return 0; +} + +static int bnxt_re_handle_qp_async_event(struct creq_qp_event *qp_event, + struct bnxt_re_qp *qp) +{ + struct ib_event event; + unsigned int flags; + + if (qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { + flags = bnxt_re_lock_cqs(qp); + bnxt_qplib_add_flush_qp(&qp->qplib_qp); + bnxt_re_unlock_cqs(qp, flags); + } + + memset(&event, 0, sizeof(event)); + if (qp->qplib_qp.srq) { + event.device = &qp->rdev->ibdev; + event.element.qp = &qp->ib_qp; + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + } + + if (event.device && qp->ib_qp.event_handler) + qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context); + + return 0; +} + +static int bnxt_re_handle_affi_async_event(struct creq_qp_event *affi_async, + void *obj) +{ + int rc = 0; + u8 event; + + if (!obj) + return rc; /* QP was already dead, still return success */ + + event = affi_async->event; + if (event == CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION) { + struct bnxt_qplib_qp *lib_qp = obj; + struct bnxt_re_qp *qp = container_of(lib_qp, struct bnxt_re_qp, + qplib_qp); + rc = bnxt_re_handle_qp_async_event(affi_async, qp); + } + return rc; +} + +static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw, + void *aeqe, void *obj) +{ + struct creq_qp_event *affi_async; + struct creq_func_event *unaffi_async; + u8 type; + int rc; + + type = ((struct creq_base *)aeqe)->type; + if (type == CREQ_BASE_TYPE_FUNC_EVENT) { + unaffi_async = aeqe; + rc = bnxt_re_handle_unaffi_async_event(unaffi_async); + } else { + affi_async = aeqe; + rc = bnxt_re_handle_affi_async_event(affi_async, obj); + } + + return rc; +} + +static int bnxt_re_srqn_handler(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_srq *handle, u8 event) +{ + struct bnxt_re_srq *srq = container_of(handle, struct bnxt_re_srq, + qplib_srq); + struct ib_event ib_event; + int rc = 0; + + if (!srq) { + dev_err(NULL, "%s: SRQ is NULL, SRQN not handled", + ROCE_DRV_MODULE_NAME); + rc = -EINVAL; + goto done; + } + ib_event.device = &srq->rdev->ibdev; + ib_event.element.srq = &srq->ib_srq; + if (event == NQ_SRQ_EVENT_EVENT_SRQ_THRESHOLD_EVENT) + ib_event.event = IB_EVENT_SRQ_LIMIT_REACHED; + else + ib_event.event = IB_EVENT_SRQ_ERR; + + if (srq->ib_srq.event_handler) { + /* Lock event_handler? 
*/ + (*srq->ib_srq.event_handler)(&ib_event, + srq->ib_srq.srq_context); + } +done: + return rc; +} + +static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_cq *handle) +{ + struct bnxt_re_cq *cq = container_of(handle, struct bnxt_re_cq, + qplib_cq); + + if (!cq) { + dev_err(NULL, "%s: CQ is NULL, CQN not handled", + ROCE_DRV_MODULE_NAME); + return -EINVAL; + } + if (cq->ib_cq.comp_handler) { + /* Lock comp_handler? */ + (*cq->ib_cq.comp_handler)(&cq->ib_cq, cq->ib_cq.cq_context); + } + + return 0; +} + +static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) +{ + int i; + + if (rdev->nq[0].hwq.max_elements) { + for (i = 1; i < rdev->num_msix; i++) + bnxt_qplib_disable_nq(&rdev->nq[i - 1]); + } + + if (rdev->qplib_res.rcfw) + bnxt_qplib_cleanup_res(&rdev->qplib_res); +} + +static int bnxt_re_init_res(struct bnxt_re_dev *rdev) +{ + int rc = 0, i; + + bnxt_qplib_init_res(&rdev->qplib_res); + + for (i = 1; i < rdev->num_msix ; i++) { + rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1], + i - 1, rdev->msix_entries[i].vector, + rdev->msix_entries[i].db_offset, + &bnxt_re_cqn_handler, + &bnxt_re_srqn_handler); + + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to enable NQ with rc = 0x%x", rc); + goto fail; + } + } + return 0; +fail: + return rc; +} + +static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev, bool lock_wait) +{ + int i; + + for (i = 0; i < rdev->num_msix - 1; i++) { + bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, lock_wait); + bnxt_qplib_free_nq(&rdev->nq[i]); + } +} + +static void bnxt_re_free_res(struct bnxt_re_dev *rdev, bool lock_wait) +{ + bnxt_re_free_nq_res(rdev, lock_wait); + + if (rdev->qplib_res.dpi_tbl.max) { + bnxt_qplib_dealloc_dpi(&rdev->qplib_res, + &rdev->qplib_res.dpi_tbl, + &rdev->dpi_privileged); + } + if (rdev->qplib_res.rcfw) { + bnxt_qplib_free_res(&rdev->qplib_res); + rdev->qplib_res.rcfw = NULL; + } +} + +static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) +{ + int rc = 0, i; + + /* Configure and allocate resources for qplib */ + rdev->qplib_res.rcfw = &rdev->rcfw; + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr, + rdev->is_virtfn); + if (rc) + goto fail; + + rc = bnxt_qplib_alloc_res(&rdev->qplib_res, rdev->en_dev->pdev, + rdev->netdev, &rdev->dev_attr); + if (rc) + goto fail; + + rc = bnxt_qplib_alloc_dpi(&rdev->qplib_res.dpi_tbl, + &rdev->dpi_privileged, + rdev); + if (rc) + goto dealloc_res; + + for (i = 0; i < rdev->num_msix - 1; i++) { + rdev->nq[i].hwq.max_elements = BNXT_RE_MAX_CQ_COUNT + + BNXT_RE_MAX_SRQC_COUNT + 2; + rc = bnxt_qplib_alloc_nq(rdev->en_dev->pdev, &rdev->nq[i]); + if (rc) { + dev_err(rdev_to_dev(rdev), "Alloc Failed NQ%d rc:%#x", + i, rc); + goto dealloc_dpi; + } + rc = bnxt_re_net_ring_alloc + (rdev, rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr, + rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count, + HWRM_RING_ALLOC_CMPL, + BNXT_QPLIB_NQE_MAX_CNT - 1, + rdev->msix_entries[i + 1].ring_idx, + &rdev->nq[i].ring_id); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to allocate NQ fw id with rc = 0x%x", + rc); + goto free_nq; + } + } + return 0; +free_nq: + for (i = 0; i < rdev->num_msix - 1; i++) + bnxt_qplib_free_nq(&rdev->nq[i]); +dealloc_dpi: + bnxt_qplib_dealloc_dpi(&rdev->qplib_res, + &rdev->qplib_res.dpi_tbl, + &rdev->dpi_privileged); +dealloc_res: + bnxt_qplib_free_res(&rdev->qplib_res); + +fail: + rdev->qplib_res.rcfw = NULL; + return rc; +} + +static void bnxt_re_dispatch_event(struct ib_device *ibdev, struct ib_qp *qp, + u8 port_num, enum ib_event_type event) +{ + 
struct ib_event ib_event; + + ib_event.device = ibdev; + if (qp) + ib_event.element.qp = qp; + else + ib_event.element.port_num = port_num; + ib_event.event = event; + ib_dispatch_event(&ib_event); +} + +#define HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN 0x02 +static int bnxt_re_query_hwrm_pri2cos(struct bnxt_re_dev *rdev, u8 dir, + u64 *cid_map) +{ + struct hwrm_queue_pri2cos_qcfg_input req = {0}; + struct bnxt *bp = netdev_priv(rdev->netdev); + struct hwrm_queue_pri2cos_qcfg_output resp; + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct bnxt_fw_msg fw_msg; + u32 flags = 0; + u8 *qcfgmap, *tmp_map; + int rc = 0, i; + + if (!cid_map) + return -EINVAL; + + memset(&fw_msg, 0, sizeof(fw_msg)); + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, + HWRM_QUEUE_PRI2COS_QCFG, -1, -1); + flags |= (dir & 0x01); + flags |= HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN; + req.flags = cpu_to_le32(flags); + req.port_id = bp->pf.port_id; + + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + if (rc) + return rc; + + if (resp.queue_cfg_info) { + dev_warn(rdev_to_dev(rdev), + "Asymmetric cos queue configuration detected"); + dev_warn(rdev_to_dev(rdev), + " on device, QoS may not be fully functional\n"); + } + qcfgmap = &resp.pri0_cos_queue_id; + tmp_map = (u8 *)cid_map; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + tmp_map[i] = qcfgmap[i]; + + return rc; +} + +static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev, + struct bnxt_re_qp *qp) +{ + return (qp->ib_qp.qp_type == IB_QPT_GSI) || (qp == rdev->qp1_sqp); +} + +static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) +{ + int mask = IB_QP_STATE; + struct ib_qp_attr qp_attr; + struct bnxt_re_qp *qp; + + qp_attr.qp_state = IB_QPS_ERR; + mutex_lock(&rdev->qp_lock); + list_for_each_entry(qp, &rdev->qp_list, list) { + /* Modify the state of all QPs except QP1/Shadow QP */ + if (!bnxt_re_is_qp1_or_shadow_qp(rdev, qp)) { + if (qp->qplib_qp.state != + CMDQ_MODIFY_QP_NEW_STATE_RESET && + qp->qplib_qp.state != + CMDQ_MODIFY_QP_NEW_STATE_ERR) { + bnxt_re_dispatch_event(&rdev->ibdev, &qp->ib_qp, + 1, IB_EVENT_QP_FATAL); + bnxt_re_modify_qp(&qp->ib_qp, &qp_attr, mask, + NULL); + } + } + } + mutex_unlock(&rdev->qp_lock); +} + +static int bnxt_re_update_gid(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; + struct bnxt_qplib_gid gid; + u16 gid_idx, index; + int rc = 0; + + if (!test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) + return 0; + + if (!sgid_tbl) { + dev_err(rdev_to_dev(rdev), "QPLIB: SGID table not allocated"); + return -EINVAL; + } + + for (index = 0; index < sgid_tbl->active; index++) { + gid_idx = sgid_tbl->hw_id[index]; + + if (!memcmp(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero, + sizeof(bnxt_qplib_gid_zero))) + continue; + /* need to modify the VLAN enable setting of non VLAN GID only + * as setting is done for VLAN GID while adding GID + */ + if (sgid_tbl->vlan[index]) + continue; + + memcpy(&gid, &sgid_tbl->tbl[index], sizeof(gid)); + + rc = bnxt_qplib_update_sgid(sgid_tbl, &gid, gid_idx, + rdev->qplib_res.netdev->dev_addr); + } + + return rc; +} + +static u32 bnxt_re_get_priority_mask(struct bnxt_re_dev *rdev) +{ + u32 prio_map = 0, tmp_map = 0; + struct net_device *netdev; + struct dcb_app app; + + netdev = rdev->netdev; + + memset(&app, 0, sizeof(app)); + app.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE; + app.protocol = ETH_P_IBOE; + tmp_map = 
dcb_ieee_getapp_mask(netdev, &app); + prio_map = tmp_map; + + app.selector = IEEE_8021QAZ_APP_SEL_DGRAM; + app.protocol = ROCE_V2_UDP_DPORT; + tmp_map = dcb_ieee_getapp_mask(netdev, &app); + prio_map |= tmp_map; + + return prio_map; +} + +static void bnxt_re_parse_cid_map(u8 prio_map, u8 *cid_map, u16 *cosq) +{ + u16 prio; + u8 id; + + for (prio = 0, id = 0; prio < 8; prio++) { + if (prio_map & (1 << prio)) { + cosq[id] = cid_map[prio]; + id++; + if (id == 2) /* Max 2 tcs supported */ + break; + } + } +} + +static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) +{ + u8 prio_map = 0; + u64 cid_map; + int rc; + + /* Get priority for roce */ + prio_map = bnxt_re_get_priority_mask(rdev); + + if (prio_map == rdev->cur_prio_map) + return 0; + rdev->cur_prio_map = prio_map; + /* Get cosq id for this priority */ + rc = bnxt_re_query_hwrm_pri2cos(rdev, 0, &cid_map); + if (rc) { + dev_warn(rdev_to_dev(rdev), "no cos for p_mask %x\n", prio_map); + return rc; + } + /* Parse CoS IDs for app priority */ + bnxt_re_parse_cid_map(prio_map, (u8 *)&cid_map, rdev->cosq); + + /* Config BONO. */ + rc = bnxt_qplib_map_tc2cos(&rdev->qplib_res, rdev->cosq); + if (rc) { + dev_warn(rdev_to_dev(rdev), "no tc for cos{%x, %x}\n", + rdev->cosq[0], rdev->cosq[1]); + return rc; + } + + /* Actual priorities are not programmed as they are already + * done by L2 driver; just enable or disable priority vlan tagging + */ + if ((prio_map == 0 && rdev->qplib_res.prio) || + (prio_map != 0 && !rdev->qplib_res.prio)) { + rdev->qplib_res.prio = prio_map ? true : false; + + bnxt_re_update_gid(rdev); + } + + return 0; +} + +static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait) +{ + int i, rc; + + if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) { + for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) + device_remove_file(&rdev->ibdev.dev, + bnxt_re_attributes[i]); + /* Cleanup ib dev */ + bnxt_re_unregister_ib(rdev); + } + if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) + cancel_delayed_work(&rdev->worker); + + bnxt_re_cleanup_res(rdev); + bnxt_re_free_res(rdev, lock_wait); + + if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) { + rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw); + if (rc) + dev_warn(rdev_to_dev(rdev), + "Failed to deinitialize RCFW: %#x", rc); + bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id, + lock_wait); + bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx); + bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); + bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, lock_wait); + bnxt_qplib_free_rcfw_channel(&rdev->rcfw); + } + if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) { + rc = bnxt_re_free_msix(rdev, lock_wait); + if (rc) + dev_warn(rdev_to_dev(rdev), + "Failed to free MSI-X vectors: %#x", rc); + } + if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) { + rc = bnxt_re_unregister_netdev(rdev, lock_wait); + if (rc) + dev_warn(rdev_to_dev(rdev), + "Failed to unregister with netdev: %#x", rc); + } +} + +/* worker thread for polling periodic events. 
Now used for QoS programming*/ +static void bnxt_re_worker(struct work_struct *work) +{ + struct bnxt_re_dev *rdev = container_of(work, struct bnxt_re_dev, + worker.work); + + bnxt_re_setup_qos(rdev); + schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); +} + +static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) +{ + int i, j, rc; + + /* Register a new RoCE device instance with netdev */ + rc = bnxt_re_register_netdev(rdev); + if (rc) { + pr_err("Failed to register with netdev: %#x\n", rc); + return -EINVAL; + } + set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + + /* Check whether VF or PF */ + bnxt_re_get_sriov_func_type(rdev); + + rc = bnxt_re_request_msix(rdev); + if (rc) { + pr_err("Failed to get MSI-X vectors: %#x\n", rc); + rc = -EINVAL; + goto fail; + } + set_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags); + + /* Establish RCFW Communication Channel to initialize the context + * memory for the function and all child VFs + */ + rc = bnxt_qplib_alloc_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw, + BNXT_RE_MAX_QPC_COUNT); + if (rc) { + pr_err("Failed to allocate RCFW Channel: %#x\n", rc); + goto fail; + } + rc = bnxt_re_net_ring_alloc + (rdev, rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr, + rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count, + HWRM_RING_ALLOC_CMPL, BNXT_QPLIB_CREQE_MAX_CNT - 1, + rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx, + &rdev->rcfw.creq_ring_id); + if (rc) { + pr_err("Failed to allocate CREQ: %#x\n", rc); + goto free_rcfw; + } + rc = bnxt_qplib_enable_rcfw_channel + (rdev->en_dev->pdev, &rdev->rcfw, + rdev->msix_entries[BNXT_RE_AEQ_IDX].vector, + rdev->msix_entries[BNXT_RE_AEQ_IDX].db_offset, + rdev->is_virtfn, &bnxt_re_aeq_handler); + if (rc) { + pr_err("Failed to enable RCFW channel: %#x\n", rc); + goto free_ring; + } + + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr, + rdev->is_virtfn); + if (rc) + goto disable_rcfw; + if (!rdev->is_virtfn) + bnxt_re_set_resource_limits(rdev); + + rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0); + if (rc) { + pr_err("Failed to allocate QPLIB context: %#x\n", rc); + goto disable_rcfw; + } + rc = bnxt_re_net_stats_ctx_alloc(rdev, + rdev->qplib_ctx.stats.dma_map, + &rdev->qplib_ctx.stats.fw_id); + if (rc) { + pr_err("Failed to allocate stats context: %#x\n", rc); + goto free_ctx; + } + + rc = bnxt_qplib_init_rcfw(&rdev->rcfw, &rdev->qplib_ctx, + rdev->is_virtfn); + if (rc) { + pr_err("Failed to initialize RCFW: %#x\n", rc); + goto free_sctx; + } + set_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags); + + /* Resources based on the 'new' device caps */ + rc = bnxt_re_alloc_res(rdev); + if (rc) { + pr_err("Failed to allocate resources: %#x\n", rc); + goto fail; + } + rc = bnxt_re_init_res(rdev); + if (rc) { + pr_err("Failed to initialize resources: %#x\n", rc); + goto fail; + } + + if (!rdev->is_virtfn) { + rc = bnxt_re_setup_qos(rdev); + if (rc) + pr_info("RoCE priority not yet configured\n"); + + INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); + set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); + schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); + } + + /* Register ib dev */ + rc = bnxt_re_register_ib(rdev); + if (rc) { + pr_err("Failed to register with IB: %#x\n", rc); + goto fail; + } + dev_info(rdev_to_dev(rdev), "Device registered successfully"); + for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) { + rc = device_create_file(&rdev->ibdev.dev, + bnxt_re_attributes[i]); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to create IB sysfs: %#x", rc); + /* Must clean up
all created device files */ + for (j = 0; j < i; j++) + device_remove_file(&rdev->ibdev.dev, + bnxt_re_attributes[j]); + bnxt_re_unregister_ib(rdev); + goto fail; + } + } + set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); + ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, + &rdev->active_width); + set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE); + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_GID_CHANGE); + + return 0; +free_sctx: + bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id, true); +free_ctx: + bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx); +disable_rcfw: + bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); +free_ring: + bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, true); +free_rcfw: + bnxt_qplib_free_rcfw_channel(&rdev->rcfw); +fail: + bnxt_re_ib_unreg(rdev, true); + return rc; +} + +static void bnxt_re_dev_unreg(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct net_device *netdev = rdev->netdev; + + bnxt_re_dev_remove(rdev); + + if (netdev) + bnxt_re_dev_unprobe(netdev, en_dev); +} + +static int bnxt_re_dev_reg(struct bnxt_re_dev **rdev, struct net_device *netdev) +{ + struct bnxt_en_dev *en_dev; + int rc = 0; + + if (!is_bnxt_re_dev(netdev)) + return -ENODEV; + + en_dev = bnxt_re_dev_probe(netdev); + if (IS_ERR(en_dev)) { + if (en_dev != ERR_PTR(-ENODEV)) + pr_err("%s: Failed to probe\n", ROCE_DRV_MODULE_NAME); + rc = PTR_ERR(en_dev); + goto exit; + } + *rdev = bnxt_re_dev_add(netdev, en_dev); + if (!*rdev) { + rc = -ENOMEM; + bnxt_re_dev_unprobe(netdev, en_dev); + goto exit; + } +exit: + return rc; +} + +static void bnxt_re_remove_one(struct bnxt_re_dev *rdev) +{ + pci_dev_put(rdev->en_dev->pdev); +} + +/* Handle all deferred netevents tasks */ +static void bnxt_re_task(struct work_struct *work) +{ + struct bnxt_re_work *re_work; + struct bnxt_re_dev *rdev; + int rc = 0; + + re_work = container_of(work, struct bnxt_re_work, work); + rdev = re_work->rdev; + + if (re_work->event != NETDEV_REGISTER && + !test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) + return; + + switch (re_work->event) { + case NETDEV_REGISTER: + rc = bnxt_re_ib_reg(rdev); + if (rc) { + dev_err(rdev_to_dev(rdev), + "Failed to register with IB: %#x", rc); + bnxt_re_remove_one(rdev); + bnxt_re_dev_unreg(rdev); + } + break; + case NETDEV_UP: + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, + IB_EVENT_PORT_ACTIVE); + break; + case NETDEV_DOWN: + bnxt_re_dev_stop(rdev); + break; + case NETDEV_CHANGE: + if (!netif_carrier_ok(rdev->netdev)) + bnxt_re_dev_stop(rdev); + else if (netif_carrier_ok(rdev->netdev)) + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, + IB_EVENT_PORT_ACTIVE); + ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, + &rdev->active_width); + break; + default: + break; + } + smp_mb__before_atomic(); + atomic_dec(&rdev->sched_count); + kfree(re_work); +} + +static void bnxt_re_init_one(struct bnxt_re_dev *rdev) +{ + pci_dev_get(rdev->en_dev->pdev); +} + +/* + * "Notifier chain callback can be invoked for the same chain from + * different CPUs at the same time". + * + * For cases when the netdev is already present, our call to the + * register_netdevice_notifier() will actually get the rtnl_lock() + * before sending NETDEV_REGISTER and (if up) NETDEV_UP + * events. + * + * But for cases when the netdev is not already present, the notifier + * chain is subjected to be invoked from different CPUs simultaneously. 
+ * + * This is protected by the netdev_mutex. + */ +static int bnxt_re_netdev_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct net_device *real_dev, *netdev = netdev_notifier_info_to_dev(ptr); + struct bnxt_re_work *re_work; + struct bnxt_re_dev *rdev; + int rc = 0; + bool sch_work = false; + + real_dev = rdma_vlan_dev_real_dev(netdev); + if (!real_dev) + real_dev = netdev; + + rdev = bnxt_re_from_netdev(real_dev); + if (!rdev && event != NETDEV_REGISTER) + goto exit; + if (real_dev != netdev) + goto exit; + + switch (event) { + case NETDEV_REGISTER: + if (rdev) + break; + rc = bnxt_re_dev_reg(&rdev, real_dev); + if (rc == -ENODEV) + break; + if (rc) { + pr_err("Failed to register with the device %s: %#x\n", + real_dev->name, rc); + break; + } + bnxt_re_init_one(rdev); + sch_work = true; + break; + + case NETDEV_UNREGISTER: + /* netdev notifier will call NETDEV_UNREGISTER again later since + * we are still holding the reference to the netdev + */ + if (atomic_read(&rdev->sched_count) > 0) + goto exit; + bnxt_re_ib_unreg(rdev, false); + bnxt_re_remove_one(rdev); + bnxt_re_dev_unreg(rdev); + break; + + default: + sch_work = true; + break; + } + if (sch_work) { + /* Allocate for the deferred task */ + re_work = kzalloc(sizeof(*re_work), GFP_ATOMIC); + if (re_work) { + re_work->rdev = rdev; + re_work->event = event; + re_work->vlan_dev = (real_dev == netdev ? + NULL : netdev); + INIT_WORK(&re_work->work, bnxt_re_task); + atomic_inc(&rdev->sched_count); + queue_work(bnxt_re_wq, &re_work->work); + } + } + +exit: + return NOTIFY_DONE; +} + +static struct notifier_block bnxt_re_netdev_notifier = { + .notifier_call = bnxt_re_netdev_event +}; + +static int __init bnxt_re_mod_init(void) +{ + int rc = 0; + + pr_info("%s: %s", ROCE_DRV_MODULE_NAME, version); + + bnxt_re_wq = create_singlethread_workqueue("bnxt_re"); + if (!bnxt_re_wq) + return -ENOMEM; + + INIT_LIST_HEAD(&bnxt_re_dev_list); + + rc = register_netdevice_notifier(&bnxt_re_netdev_notifier); + if (rc) { + pr_err("%s: Cannot register to netdevice_notifier", + ROCE_DRV_MODULE_NAME); + goto err_netdev; + } + return 0; + +err_netdev: + destroy_workqueue(bnxt_re_wq); + + return rc; +} + +static void __exit bnxt_re_mod_exit(void) +{ + struct bnxt_re_dev *rdev, *next; + LIST_HEAD(to_be_deleted); + + mutex_lock(&bnxt_re_dev_lock); + /* Free all adapter allocated resources */ + if (!list_empty(&bnxt_re_dev_list)) + list_splice_init(&bnxt_re_dev_list, &to_be_deleted); + mutex_unlock(&bnxt_re_dev_lock); + /* + * Cleanup the devices in reverse order so that the VF device + * cleanup is done before PF cleanup + */ + list_for_each_entry_safe_reverse(rdev, next, &to_be_deleted, list) { + dev_info(rdev_to_dev(rdev), "Unregistering Device"); + /* + * Flush out any scheduled tasks before destroying the + * resources + */ + flush_workqueue(bnxt_re_wq); + bnxt_re_dev_stop(rdev); + bnxt_re_ib_unreg(rdev, true); + bnxt_re_remove_one(rdev); + bnxt_re_dev_unreg(rdev); + } + unregister_netdevice_notifier(&bnxt_re_netdev_notifier); + if (bnxt_re_wq) + destroy_workqueue(bnxt_re_wq); +} + +module_init(bnxt_re_mod_init); +module_exit(bnxt_re_mod_exit); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c new file mode 100644 index 0000000..50d8f1f --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -0,0 +1,2842 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. 
The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Fast Path Operators + */ + +#include +#include +#include +#include +#include +#include + +#include "roce_hsi.h" + +#include "qplib_res.h" +#include "qplib_rcfw.h" +#include "qplib_sp.h" +#include "qplib_fp.h" + +static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq); +static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp); +static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type); + +static void bnxt_qplib_cancel_phantom_processing(struct bnxt_qplib_qp *qp) +{ + qp->sq.condition = false; + qp->sq.send_phantom = false; + qp->sq.single = false; +} + +/* Flush list */ +static void __bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_cq *scq, *rcq; + + scq = qp->scq; + rcq = qp->rcq; + + if (!qp->sq.flushed) { + dev_dbg(&scq->hwq.pdev->dev, + "QPLIB: FP: Adding to SQ Flush list = %p", + qp); + bnxt_qplib_cancel_phantom_processing(qp); + list_add_tail(&qp->sq_flush, &scq->sqf_head); + qp->sq.flushed = true; + } + if (!qp->srq) { + if (!qp->rq.flushed) { + dev_dbg(&rcq->hwq.pdev->dev, + "QPLIB: FP: Adding to RQ Flush list = %p", + qp); + list_add_tail(&qp->rq_flush, &rcq->rqf_head); + qp->rq.flushed = true; + } + } +} + +static void bnxt_qplib_acquire_cq_flush_locks(struct bnxt_qplib_qp *qp, + unsigned long *flags) + __acquires(&qp->scq->flush_lock) __acquires(&qp->rcq->flush_lock) +{ + spin_lock_irqsave(&qp->scq->flush_lock, *flags); + if (qp->scq == qp->rcq) + __acquire(&qp->rcq->flush_lock); + else + spin_lock(&qp->rcq->flush_lock); +} + +static void bnxt_qplib_release_cq_flush_locks(struct bnxt_qplib_qp *qp, + unsigned long *flags) + __releases(&qp->scq->flush_lock) __releases(&qp->rcq->flush_lock) +{ + if (qp->scq == qp->rcq) + __release(&qp->rcq->flush_lock); + else + spin_unlock(&qp->rcq->flush_lock); + spin_unlock_irqrestore(&qp->scq->flush_lock, *flags); +} + +void 
bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp)
+{
+        unsigned long flags;
+
+        bnxt_qplib_acquire_cq_flush_locks(qp, &flags);
+        __bnxt_qplib_add_flush_qp(qp);
+        bnxt_qplib_release_cq_flush_locks(qp, &flags);
+}
+
+static void __bnxt_qplib_del_flush_qp(struct bnxt_qplib_qp *qp)
+{
+        if (qp->sq.flushed) {
+                qp->sq.flushed = false;
+                list_del(&qp->sq_flush);
+        }
+        if (!qp->srq) {
+                if (qp->rq.flushed) {
+                        qp->rq.flushed = false;
+                        list_del(&qp->rq_flush);
+                }
+        }
+}
+
+void bnxt_qplib_clean_qp(struct bnxt_qplib_qp *qp)
+{
+        unsigned long flags;
+
+        bnxt_qplib_acquire_cq_flush_locks(qp, &flags);
+        __clean_cq(qp->scq, (u64)(unsigned long)qp);
+        qp->sq.hwq.prod = 0;
+        qp->sq.hwq.cons = 0;
+        __clean_cq(qp->rcq, (u64)(unsigned long)qp);
+        qp->rq.hwq.prod = 0;
+        qp->rq.hwq.cons = 0;
+
+        __bnxt_qplib_del_flush_qp(qp);
+        bnxt_qplib_release_cq_flush_locks(qp, &flags);
+}
+
+static void bnxt_qpn_cqn_sched_task(struct work_struct *work)
+{
+        struct bnxt_qplib_nq_work *nq_work =
+                        container_of(work, struct bnxt_qplib_nq_work, work);
+
+        struct bnxt_qplib_cq *cq = nq_work->cq;
+        struct bnxt_qplib_nq *nq = nq_work->nq;
+
+        if (cq && nq) {
+                spin_lock_bh(&cq->compl_lock);
+                if (atomic_read(&cq->arm_state) && nq->cqn_handler) {
+                        dev_dbg(&nq->pdev->dev,
+                                "%s:Trigger cq = %p event nq = %p\n",
+                                __func__, cq, nq);
+                        nq->cqn_handler(nq, cq);
+                }
+                spin_unlock_bh(&cq->compl_lock);
+        }
+        kfree(nq_work);
+}
+
+static void bnxt_qplib_free_qp_hdr_buf(struct bnxt_qplib_res *res,
+                                       struct bnxt_qplib_qp *qp)
+{
+        struct bnxt_qplib_q *rq = &qp->rq;
+        struct bnxt_qplib_q *sq = &qp->sq;
+
+        if (qp->rq_hdr_buf)
+                dma_free_coherent(&res->pdev->dev,
+                                  rq->hwq.max_elements * qp->rq_hdr_buf_size,
+                                  qp->rq_hdr_buf, qp->rq_hdr_buf_map);
+        if (qp->sq_hdr_buf)
+                dma_free_coherent(&res->pdev->dev,
+                                  sq->hwq.max_elements * qp->sq_hdr_buf_size,
+                                  qp->sq_hdr_buf, qp->sq_hdr_buf_map);
+        qp->rq_hdr_buf = NULL;
+        qp->sq_hdr_buf = NULL;
+        qp->rq_hdr_buf_map = 0;
+        qp->sq_hdr_buf_map = 0;
+        qp->sq_hdr_buf_size = 0;
+        qp->rq_hdr_buf_size = 0;
+}
+
+static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res,
+                                       struct bnxt_qplib_qp *qp)
+{
+        struct bnxt_qplib_q *rq = &qp->rq;
+        struct bnxt_qplib_q *sq = &qp->sq;
+        int rc = 0;
+
+        if (qp->sq_hdr_buf_size && sq->hwq.max_elements) {
+                qp->sq_hdr_buf = dma_alloc_coherent(&res->pdev->dev,
+                                        sq->hwq.max_elements *
+                                        qp->sq_hdr_buf_size,
+                                        &qp->sq_hdr_buf_map, GFP_KERNEL);
+                if (!qp->sq_hdr_buf) {
+                        rc = -ENOMEM;
+                        dev_err(&res->pdev->dev,
+                                "QPLIB: Failed to create sq_hdr_buf");
+                        goto fail;
+                }
+        }
+
+        if (qp->rq_hdr_buf_size && rq->hwq.max_elements) {
+                qp->rq_hdr_buf = dma_alloc_coherent(&res->pdev->dev,
+                                        rq->hwq.max_elements *
+                                        qp->rq_hdr_buf_size,
+                                        &qp->rq_hdr_buf_map,
+                                        GFP_KERNEL);
+                if (!qp->rq_hdr_buf) {
+                        rc = -ENOMEM;
+                        dev_err(&res->pdev->dev,
+                                "QPLIB: Failed to create rq_hdr_buf");
+                        goto fail;
+                }
+        }
+        return 0;
+
+fail:
+        bnxt_qplib_free_qp_hdr_buf(res, qp);
+        return rc;
+}
+
+static void bnxt_qplib_service_nq(unsigned long data)
+{
+        struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data;
+        struct bnxt_qplib_hwq *hwq = &nq->hwq;
+        struct nq_base *nqe, **nq_ptr;
+        struct bnxt_qplib_cq *cq;
+        int num_cqne_processed = 0;
+        int num_srqne_processed = 0;
+        u32 sw_cons, raw_cons;
+        u16 type;
+        int budget = nq->budget;
+        uintptr_t q_handle;
+
+        /* Service the NQ until empty */
+        raw_cons = hwq->cons;
+        while (budget--) {
+                sw_cons = HWQ_CMP(raw_cons, hwq);
+                nq_ptr = (struct nq_base **)hwq->pbl_ptr;
+                nqe = &nq_ptr[NQE_PG(sw_cons)][NQE_IDX(sw_cons)];
+                if (!NQE_CMP_VALID(nqe,
raw_cons, hwq->max_elements)) + break; + + /* + * The valid test of the entry must be done first before + * reading any further. + */ + dma_rmb(); + + type = le16_to_cpu(nqe->info10_type) & NQ_BASE_TYPE_MASK; + switch (type) { + case NQ_BASE_TYPE_CQ_NOTIFICATION: + { + struct nq_cn *nqcne = (struct nq_cn *)nqe; + + q_handle = le32_to_cpu(nqcne->cq_handle_low); + q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high) + << 32; + cq = (struct bnxt_qplib_cq *)(unsigned long)q_handle; + bnxt_qplib_arm_cq_enable(cq); + spin_lock_bh(&cq->compl_lock); + atomic_set(&cq->arm_state, 0); + if (!nq->cqn_handler(nq, (cq))) + num_cqne_processed++; + else + dev_warn(&nq->pdev->dev, + "QPLIB: cqn - type 0x%x not handled", + type); + spin_unlock_bh(&cq->compl_lock); + break; + } + case NQ_BASE_TYPE_SRQ_EVENT: + { + struct nq_srq_event *nqsrqe = + (struct nq_srq_event *)nqe; + + q_handle = le32_to_cpu(nqsrqe->srq_handle_low); + q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high) + << 32; + bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle, + DBR_DBR_TYPE_SRQ_ARMENA); + if (!nq->srqn_handler(nq, + (struct bnxt_qplib_srq *)q_handle, + nqsrqe->event)) + num_srqne_processed++; + else + dev_warn(&nq->pdev->dev, + "QPLIB: SRQ event 0x%x not handled", + nqsrqe->event); + break; + } + case NQ_BASE_TYPE_DBQ_EVENT: + break; + default: + dev_warn(&nq->pdev->dev, + "QPLIB: nqe with type = 0x%x not handled", + type); + break; + } + raw_cons++; + } + if (hwq->cons != raw_cons) { + hwq->cons = raw_cons; + NQ_DB_REARM(nq->bar_reg_iomem, hwq->cons, hwq->max_elements); + } +} + +static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance) +{ + struct bnxt_qplib_nq *nq = dev_instance; + struct bnxt_qplib_hwq *hwq = &nq->hwq; + struct nq_base **nq_ptr; + u32 sw_cons; + + /* Prefetch the NQ element */ + sw_cons = HWQ_CMP(hwq->cons, hwq); + nq_ptr = (struct nq_base **)nq->hwq.pbl_ptr; + prefetch(&nq_ptr[NQE_PG(sw_cons)][NQE_IDX(sw_cons)]); + + /* Fan out to CPU affinitized kthreads? */ + tasklet_schedule(&nq->worker); + + return IRQ_HANDLED; +} + +void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill) +{ + tasklet_disable(&nq->worker); + /* Mask h/w interrupt */ + NQ_DB(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements); + /* Sync with last running IRQ handler */ + synchronize_irq(nq->vector); + if (kill) + tasklet_kill(&nq->worker); + if (nq->requested) { + irq_set_affinity_hint(nq->vector, NULL); + free_irq(nq->vector, nq); + nq->requested = false; + } +} + +void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq) +{ + if (nq->cqn_wq) { + destroy_workqueue(nq->cqn_wq); + nq->cqn_wq = NULL; + } + + /* Make sure the HW is stopped! 
*/ + bnxt_qplib_nq_stop_irq(nq, true); + + if (nq->bar_reg_iomem) + iounmap(nq->bar_reg_iomem); + nq->bar_reg_iomem = NULL; + + nq->cqn_handler = NULL; + nq->srqn_handler = NULL; + nq->vector = 0; +} + +int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, + int msix_vector, bool need_init) +{ + int rc; + + if (nq->requested) + return -EFAULT; + + nq->vector = msix_vector; + if (need_init) + tasklet_init(&nq->worker, bnxt_qplib_service_nq, + (unsigned long)nq); + else + tasklet_enable(&nq->worker); + + snprintf(nq->name, sizeof(nq->name), "bnxt_qplib_nq-%d", nq_indx); + rc = request_irq(nq->vector, bnxt_qplib_nq_irq, 0, nq->name, nq); + if (rc) + return rc; + + cpumask_clear(&nq->mask); + cpumask_set_cpu(nq_indx, &nq->mask); + rc = irq_set_affinity_hint(nq->vector, &nq->mask); + if (rc) { + dev_warn(&nq->pdev->dev, + "QPLIB: set affinity failed; vector: %d nq_idx: %d\n", + nq->vector, nq_indx); + } + nq->requested = true; + NQ_DB_REARM(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements); + + return rc; +} + +int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, + int nq_idx, int msix_vector, int bar_reg_offset, + int (*cqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_cq *), + int (*srqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_srq *, + u8 event)) +{ + resource_size_t nq_base; + int rc = -1; + + if (cqn_handler) + nq->cqn_handler = cqn_handler; + + if (srqn_handler) + nq->srqn_handler = srqn_handler; + + /* Have a task to schedule CQ notifiers in post send case */ + nq->cqn_wq = create_singlethread_workqueue("bnxt_qplib_nq"); + if (!nq->cqn_wq) + return -ENOMEM; + + nq->bar_reg = NQ_CONS_PCI_BAR_REGION; + nq->bar_reg_off = bar_reg_offset; + nq_base = pci_resource_start(pdev, nq->bar_reg); + if (!nq_base) { + rc = -ENOMEM; + goto fail; + } + nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 4); + if (!nq->bar_reg_iomem) { + rc = -ENOMEM; + goto fail; + } + + rc = bnxt_qplib_nq_start_irq(nq, nq_idx, msix_vector, true); + if (rc) { + dev_err(&nq->pdev->dev, + "QPLIB: Failed to request irq for nq-idx %d", nq_idx); + goto fail; + } + + return 0; +fail: + bnxt_qplib_disable_nq(nq); + return rc; +} + +void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq) +{ + if (nq->hwq.max_elements) { + bnxt_qplib_free_hwq(nq->pdev, &nq->hwq); + nq->hwq.max_elements = 0; + } +} + +int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq) +{ + nq->pdev = pdev; + if (!nq->hwq.max_elements || + nq->hwq.max_elements > BNXT_QPLIB_NQE_MAX_CNT) + nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT; + + if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL, 0, + &nq->hwq.max_elements, + BNXT_QPLIB_MAX_NQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_L2_CMPL)) + return -ENOMEM; + + nq->budget = 8; + return 0; +} + +/* SRQ */ +static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type) +{ + struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; + struct dbr_dbr db_msg = { 0 }; + void __iomem *db; + u32 sw_prod = 0; + + /* Ring DB */ + sw_prod = (arm_type == DBR_DBR_TYPE_SRQ_ARM) ? srq->threshold : + HWQ_CMP(srq_hwq->prod, srq_hwq); + db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) & + DBR_DBR_INDEX_MASK); + db_msg.type_xid = cpu_to_le32(((srq->id << DBR_DBR_XID_SFT) & + DBR_DBR_XID_MASK) | arm_type); + db = (arm_type == DBR_DBR_TYPE_SRQ_ARMENA) ? 
+ srq->dbr_base : srq->dpi->dbr; + wmb(); /* barrier before db ring */ + __iowrite64_copy(db, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_destroy_srq req; + struct creq_destroy_srq_resp resp; + u16 cmd_flags = 0; + int rc; + + RCFW_CMD_PREP(req, DESTROY_SRQ, cmd_flags); + + /* Configure the request */ + req.srq_cid = cpu_to_le32(srq->id); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + return rc; + + bnxt_qplib_free_hwq(res->pdev, &srq->hwq); + kfree(srq->swq); + return 0; +} + +int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_create_srq req; + struct creq_create_srq_resp resp; + struct bnxt_qplib_pbl *pbl; + u16 cmd_flags = 0; + int rc, idx; + + srq->hwq.max_elements = srq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &srq->hwq, srq->sglist, + srq->nmap, &srq->hwq.max_elements, + BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto exit; + + srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), + GFP_KERNEL); + if (!srq->swq) { + rc = -ENOMEM; + goto fail; + } + + RCFW_CMD_PREP(req, CREATE_SRQ, cmd_flags); + + /* Configure the request */ + req.dpi = cpu_to_le32(srq->dpi->dpi); + req.srq_handle = cpu_to_le64((uintptr_t)srq); + + req.srq_size = cpu_to_le16((u16)srq->hwq.max_elements); + pbl = &srq->hwq.pbl[PBL_LVL_0]; + req.pg_size_lvl = cpu_to_le16((((u16)srq->hwq.level & + CMDQ_CREATE_SRQ_LVL_MASK) << + CMDQ_CREATE_SRQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_1G : + CMDQ_CREATE_SRQ_PG_SIZE_PG_4K)); + req.pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.pd_id = cpu_to_le32(srq->pd->id); + req.eventq_id = cpu_to_le16(srq->eventq_hw_ring_id); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + goto fail; + + spin_lock_init(&srq->lock); + srq->start_idx = 0; + srq->last_idx = srq->hwq.max_elements - 1; + for (idx = 0; idx < srq->hwq.max_elements; idx++) + srq->swq[idx].next_idx = idx + 1; + srq->swq[srq->last_idx].next_idx = -1; + + srq->id = le32_to_cpu(resp.xid); + srq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem; + if (srq->threshold) + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARMENA); + srq->arm_req = false; + + return 0; +fail: + bnxt_qplib_free_hwq(res->pdev, &srq->hwq); + kfree(srq->swq); +exit: + return rc; +} + +int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; + u32 sw_prod, sw_cons, count = 0; + + sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq); + sw_cons = HWQ_CMP(srq_hwq->cons, srq_hwq); + + count = sw_prod > sw_cons ? 
sw_prod - sw_cons : + srq_hwq->max_elements - sw_cons + sw_prod; + if (count > srq->threshold) { + srq->arm_req = false; + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM); + } else { + /* Deferred arming */ + srq->arm_req = true; + } + + return 0; +} + +int bnxt_qplib_query_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_query_srq req; + struct creq_query_srq_resp resp; + struct bnxt_qplib_rcfw_sbuf *sbuf; + struct creq_query_srq_resp_sb *sb; + u16 cmd_flags = 0; + int rc = 0; + + RCFW_CMD_PREP(req, QUERY_SRQ, cmd_flags); + req.srq_cid = cpu_to_le32(srq->id); + + /* Configure the request */ + sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); + if (!sbuf) + return -ENOMEM; + sb = sbuf->sb; + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + (void *)sbuf, 0); + srq->threshold = le16_to_cpu(sb->srq_limit); + bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); + + return rc; +} + +int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; + struct rq_wqe *srqe, **srqe_ptr; + struct sq_sge *hw_sge; + u32 sw_prod, sw_cons, count = 0; + int i, rc = 0, next; + + spin_lock(&srq_hwq->lock); + if (srq->start_idx == srq->last_idx) { + dev_err(&srq_hwq->pdev->dev, "QPLIB: FP: SRQ (0x%x) is full!", + srq->id); + rc = -EINVAL; + spin_unlock(&srq_hwq->lock); + goto done; + } + next = srq->start_idx; + srq->start_idx = srq->swq[next].next_idx; + spin_unlock(&srq_hwq->lock); + + sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq); + srqe_ptr = (struct rq_wqe **)srq_hwq->pbl_ptr; + srqe = &srqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)]; + memset(srqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + /* Calculate wqe_size16 and data_len */ + for (i = 0, hw_sge = (struct sq_sge *)srqe->data; + i < wqe->num_sge; i++, hw_sge++) { + hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr); + hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey); + hw_sge->size = cpu_to_le32(wqe->sg_list[i].size); + } + srqe->wqe_type = wqe->type; + srqe->flags = wqe->flags; + srqe->wqe_size = wqe->num_sge + + ((offsetof(typeof(*srqe), data) + 15) >> 4); + srqe->wr_id[0] = cpu_to_le32((u32)next); + srq->swq[next].wr_id = wqe->wr_id; + + srq_hwq->prod++; + + spin_lock(&srq_hwq->lock); + sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq); + /* retaining srq_hwq->cons for this logic + * actually the lock is only required to + * read srq_hwq->cons. + */ + sw_cons = HWQ_CMP(srq_hwq->cons, srq_hwq); + count = sw_prod > sw_cons ? 
sw_prod - sw_cons : + srq_hwq->max_elements - sw_cons + sw_prod; + spin_unlock(&srq_hwq->lock); + /* Ring DB */ + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ); + if (srq->arm_req == true && count > srq->threshold) { + srq->arm_req = false; + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM); + } +done: + return rc; +} + +/* QP */ +int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_create_qp1 req; + struct creq_create_qp1_resp resp; + struct bnxt_qplib_pbl *pbl; + struct bnxt_qplib_q *sq = &qp->sq; + struct bnxt_qplib_q *rq = &qp->rq; + int rc; + u16 cmd_flags = 0; + u32 qp_flags = 0; + + RCFW_CMD_PREP(req, CREATE_QP1, cmd_flags); + + /* General */ + req.type = qp->type; + req.dpi = cpu_to_le32(qp->dpi->dpi); + req.qp_handle = cpu_to_le64(qp->qp_handle); + + /* SQ */ + sq->hwq.max_elements = sq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, NULL, 0, + &sq->hwq.max_elements, + BNXT_QPLIB_MAX_SQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto exit; + + sq->swq = kcalloc(sq->hwq.max_elements, sizeof(*sq->swq), GFP_KERNEL); + if (!sq->swq) { + rc = -ENOMEM; + goto fail_sq; + } + pbl = &sq->hwq.pbl[PBL_LVL_0]; + req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.sq_pg_size_sq_lvl = + ((sq->hwq.level & CMDQ_CREATE_QP1_SQ_LVL_MASK) + << CMDQ_CREATE_QP1_SQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_1G : + CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_4K); + + if (qp->scq) + req.scq_cid = cpu_to_le32(qp->scq->id); + + qp_flags |= CMDQ_CREATE_QP1_QP_FLAGS_RESERVED_LKEY_ENABLE; + + /* RQ */ + if (rq->max_wqe) { + rq->hwq.max_elements = qp->rq.max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, NULL, 0, + &rq->hwq.max_elements, + BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto fail_sq; + + rq->swq = kcalloc(rq->hwq.max_elements, sizeof(*rq->swq), + GFP_KERNEL); + if (!rq->swq) { + rc = -ENOMEM; + goto fail_rq; + } + pbl = &rq->hwq.pbl[PBL_LVL_0]; + req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.rq_pg_size_rq_lvl = + ((rq->hwq.level & CMDQ_CREATE_QP1_RQ_LVL_MASK) << + CMDQ_CREATE_QP1_RQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? 
+ CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_1G : + CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_4K); + if (qp->rcq) + req.rcq_cid = cpu_to_le32(qp->rcq->id); + } + + /* Header buffer - allow hdr_buf pass in */ + rc = bnxt_qplib_alloc_qp_hdr_buf(res, qp); + if (rc) { + rc = -ENOMEM; + goto fail; + } + req.qp_flags = cpu_to_le32(qp_flags); + req.sq_size = cpu_to_le32(sq->hwq.max_elements); + req.rq_size = cpu_to_le32(rq->hwq.max_elements); + + req.sq_fwo_sq_sge = + cpu_to_le16((sq->max_sge & CMDQ_CREATE_QP1_SQ_SGE_MASK) << + CMDQ_CREATE_QP1_SQ_SGE_SFT); + req.rq_fwo_rq_sge = + cpu_to_le16((rq->max_sge & CMDQ_CREATE_QP1_RQ_SGE_MASK) << + CMDQ_CREATE_QP1_RQ_SGE_SFT); + + req.pd_id = cpu_to_le32(qp->pd->id); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + goto fail; + + qp->id = le32_to_cpu(resp.xid); + qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET; + rcfw->qp_tbl[qp->id].qp_id = qp->id; + rcfw->qp_tbl[qp->id].qp_handle = (void *)qp; + + return 0; + +fail: + bnxt_qplib_free_qp_hdr_buf(res, qp); +fail_rq: + bnxt_qplib_free_hwq(res->pdev, &rq->hwq); + kfree(rq->swq); +fail_sq: + bnxt_qplib_free_hwq(res->pdev, &sq->hwq); + kfree(sq->swq); +exit: + return rc; +} + +int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr; + struct cmdq_create_qp req; + struct creq_create_qp_resp resp; + struct bnxt_qplib_pbl *pbl; + struct sq_psn_search **psn_search_ptr; + unsigned long int psn_search, poff = 0; + struct bnxt_qplib_q *sq = &qp->sq; + struct bnxt_qplib_q *rq = &qp->rq; + struct bnxt_qplib_hwq *xrrq; + int i, rc, req_size, psn_sz; + u16 cmd_flags = 0, max_ssge; + u32 sw_prod, qp_flags = 0; + + RCFW_CMD_PREP(req, CREATE_QP, cmd_flags); + + /* General */ + req.type = qp->type; + req.dpi = cpu_to_le32(qp->dpi->dpi); + req.qp_handle = cpu_to_le64(qp->qp_handle); + + /* SQ */ + psn_sz = (qp->type == CMDQ_CREATE_QP_TYPE_RC) ? + sizeof(struct sq_psn_search) : 0; + sq->hwq.max_elements = sq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, sq->sglist, + sq->nmap, &sq->hwq.max_elements, + BNXT_QPLIB_MAX_SQE_ENTRY_SIZE, + psn_sz, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto exit; + + sq->swq = kcalloc(sq->hwq.max_elements, sizeof(*sq->swq), GFP_KERNEL); + if (!sq->swq) { + rc = -ENOMEM; + goto fail_sq; + } + hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr; + if (psn_sz) { + psn_search_ptr = (struct sq_psn_search **) + &hw_sq_send_ptr[get_sqe_pg + (sq->hwq.max_elements)]; + psn_search = (unsigned long int) + &hw_sq_send_ptr[get_sqe_pg(sq->hwq.max_elements)] + [get_sqe_idx(sq->hwq.max_elements)]; + if (psn_search & ~PAGE_MASK) { + /* If the psn_search does not start on a page boundary, + * then calculate the offset + */ + poff = (psn_search & ~PAGE_MASK) / + BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE; + } + for (i = 0; i < sq->hwq.max_elements; i++) + sq->swq[i].psn_search = + &psn_search_ptr[get_psne_pg(i + poff)] + [get_psne_idx(i + poff)]; + } + pbl = &sq->hwq.pbl[PBL_LVL_0]; + req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.sq_pg_size_sq_lvl = + ((sq->hwq.level & CMDQ_CREATE_QP_SQ_LVL_MASK) + << CMDQ_CREATE_QP_SQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? 
+ CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_1G : + CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K); + + /* initialize all SQ WQEs to LOCAL_INVALID (sq prep for hw fetch) */ + hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr; + for (sw_prod = 0; sw_prod < sq->hwq.max_elements; sw_prod++) { + hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)] + [get_sqe_idx(sw_prod)]; + hw_sq_send_hdr->wqe_type = SQ_BASE_WQE_TYPE_LOCAL_INVALID; + } + + if (qp->scq) + req.scq_cid = cpu_to_le32(qp->scq->id); + + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_RESERVED_LKEY_ENABLE; + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FR_PMR_ENABLED; + if (qp->sig_type) + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION; + + /* RQ */ + if (rq->max_wqe) { + rq->hwq.max_elements = rq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, rq->sglist, + rq->nmap, &rq->hwq.max_elements, + BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto fail_sq; + + rq->swq = kcalloc(rq->hwq.max_elements, sizeof(*rq->swq), + GFP_KERNEL); + if (!rq->swq) { + rc = -ENOMEM; + goto fail_rq; + } + pbl = &rq->hwq.pbl[PBL_LVL_0]; + req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.rq_pg_size_rq_lvl = + ((rq->hwq.level & CMDQ_CREATE_QP_RQ_LVL_MASK) << + CMDQ_CREATE_QP_RQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_1G : + CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K); + } else { + /* SRQ */ + if (qp->srq) { + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_SRQ_USED; + req.srq_cid = cpu_to_le32(qp->srq->id); + } + } + + if (qp->rcq) + req.rcq_cid = cpu_to_le32(qp->rcq->id); + req.qp_flags = cpu_to_le32(qp_flags); + req.sq_size = cpu_to_le32(sq->hwq.max_elements); + req.rq_size = cpu_to_le32(rq->hwq.max_elements); + qp->sq_hdr_buf = NULL; + qp->rq_hdr_buf = NULL; + + rc = bnxt_qplib_alloc_qp_hdr_buf(res, qp); + if (rc) + goto fail_rq; + + /* CTRL-22434: Irrespective of the requested SGE count on the SQ + * always create the QP with max send sges possible if the requested + * inline size is greater than 0. + */ + max_ssge = qp->max_inline_data ? 
6 : sq->max_sge; + req.sq_fwo_sq_sge = cpu_to_le16( + ((max_ssge & CMDQ_CREATE_QP_SQ_SGE_MASK) + << CMDQ_CREATE_QP_SQ_SGE_SFT) | 0); + req.rq_fwo_rq_sge = cpu_to_le16( + ((rq->max_sge & CMDQ_CREATE_QP_RQ_SGE_MASK) + << CMDQ_CREATE_QP_RQ_SGE_SFT) | 0); + /* ORRQ and IRRQ */ + if (psn_sz) { + xrrq = &qp->orrq; + xrrq->max_elements = + ORD_LIMIT_TO_ORRQ_SLOTS(qp->max_rd_atomic); + req_size = xrrq->max_elements * + BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE + PAGE_SIZE - 1; + req_size &= ~(PAGE_SIZE - 1); + rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL, 0, + &xrrq->max_elements, + BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE, + 0, req_size, HWQ_TYPE_CTX); + if (rc) + goto fail_buf_free; + pbl = &xrrq->pbl[PBL_LVL_0]; + req.orrq_addr = cpu_to_le64(pbl->pg_map_arr[0]); + + xrrq = &qp->irrq; + xrrq->max_elements = IRD_LIMIT_TO_IRRQ_SLOTS( + qp->max_dest_rd_atomic); + req_size = xrrq->max_elements * + BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE + PAGE_SIZE - 1; + req_size &= ~(PAGE_SIZE - 1); + + rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL, 0, + &xrrq->max_elements, + BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE, + 0, req_size, HWQ_TYPE_CTX); + if (rc) + goto fail_orrq; + + pbl = &xrrq->pbl[PBL_LVL_0]; + req.irrq_addr = cpu_to_le64(pbl->pg_map_arr[0]); + } + req.pd_id = cpu_to_le32(qp->pd->id); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + goto fail; + + qp->id = le32_to_cpu(resp.xid); + qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET; + INIT_LIST_HEAD(&qp->sq_flush); + INIT_LIST_HEAD(&qp->rq_flush); + rcfw->qp_tbl[qp->id].qp_id = qp->id; + rcfw->qp_tbl[qp->id].qp_handle = (void *)qp; + + return 0; + +fail: + if (qp->irrq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &qp->irrq); +fail_orrq: + if (qp->orrq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &qp->orrq); +fail_buf_free: + bnxt_qplib_free_qp_hdr_buf(res, qp); +fail_rq: + bnxt_qplib_free_hwq(res->pdev, &rq->hwq); + kfree(rq->swq); +fail_sq: + bnxt_qplib_free_hwq(res->pdev, &sq->hwq); + kfree(sq->swq); +exit: + return rc; +} + +static void __modify_flags_from_init_state(struct bnxt_qplib_qp *qp) +{ + switch (qp->state) { + case CMDQ_MODIFY_QP_NEW_STATE_RTR: + /* INIT->RTR, configure the path_mtu to the default + * 2048 if not being requested + */ + if (!(qp->modify_flags & + CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU)) { + qp->modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; + qp->path_mtu = + CMDQ_MODIFY_QP_PATH_MTU_MTU_2048; + } + qp->modify_flags &= + ~CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID; + /* Bono FW require the max_dest_rd_atomic to be >= 1 */ + if (qp->max_dest_rd_atomic < 1) + qp->max_dest_rd_atomic = 1; + qp->modify_flags &= ~CMDQ_MODIFY_QP_MODIFY_MASK_SRC_MAC; + /* Bono FW 20.6.5 requires SGID_INDEX configuration */ + if (!(qp->modify_flags & + CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX)) { + qp->modify_flags |= + CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX; + qp->ah.sgid_index = 0; + } + break; + default: + break; + } +} + +static void __modify_flags_from_rtr_state(struct bnxt_qplib_qp *qp) +{ + switch (qp->state) { + case CMDQ_MODIFY_QP_NEW_STATE_RTS: + /* Bono FW requires the max_rd_atomic to be >= 1 */ + if (qp->max_rd_atomic < 1) + qp->max_rd_atomic = 1; + /* Bono FW does not allow PKEY_INDEX, + * DGID, FLOW_LABEL, SGID_INDEX, HOP_LIMIT, + * TRAFFIC_CLASS, DEST_MAC, PATH_MTU, RQ_PSN, + * MIN_RNR_TIMER, MAX_DEST_RD_ATOMIC, DEST_QP_ID + * modification + */ + qp->modify_flags &= + ~(CMDQ_MODIFY_QP_MODIFY_MASK_PKEY | + CMDQ_MODIFY_QP_MODIFY_MASK_DGID | + CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL | + 
CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX | + CMDQ_MODIFY_QP_MODIFY_MASK_HOP_LIMIT | + CMDQ_MODIFY_QP_MODIFY_MASK_TRAFFIC_CLASS | + CMDQ_MODIFY_QP_MODIFY_MASK_DEST_MAC | + CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU | + CMDQ_MODIFY_QP_MODIFY_MASK_RQ_PSN | + CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER | + CMDQ_MODIFY_QP_MODIFY_MASK_MAX_DEST_RD_ATOMIC | + CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID); + break; + default: + break; + } +} + +static void __filter_modify_flags(struct bnxt_qplib_qp *qp) +{ + switch (qp->cur_qp_state) { + case CMDQ_MODIFY_QP_NEW_STATE_RESET: + break; + case CMDQ_MODIFY_QP_NEW_STATE_INIT: + __modify_flags_from_init_state(qp); + break; + case CMDQ_MODIFY_QP_NEW_STATE_RTR: + __modify_flags_from_rtr_state(qp); + break; + case CMDQ_MODIFY_QP_NEW_STATE_RTS: + break; + case CMDQ_MODIFY_QP_NEW_STATE_SQD: + break; + case CMDQ_MODIFY_QP_NEW_STATE_SQE: + break; + case CMDQ_MODIFY_QP_NEW_STATE_ERR: + break; + default: + break; + } +} + +int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_modify_qp req; + struct creq_modify_qp_resp resp; + u16 cmd_flags = 0, pkey; + u32 temp32[4]; + u32 bmask; + int rc; + + RCFW_CMD_PREP(req, MODIFY_QP, cmd_flags); + + /* Filter out the qp_attr_mask based on the state->new transition */ + __filter_modify_flags(qp); + bmask = qp->modify_flags; + req.modify_mask = cpu_to_le32(qp->modify_flags); + req.qp_cid = cpu_to_le32(qp->id); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_STATE) { + req.network_type_en_sqd_async_notify_new_state = + (qp->state & CMDQ_MODIFY_QP_NEW_STATE_MASK) | + (qp->en_sqd_async_notify ? + CMDQ_MODIFY_QP_EN_SQD_ASYNC_NOTIFY : 0); + } + req.network_type_en_sqd_async_notify_new_state |= qp->nw_type; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS) + req.access = qp->access; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_PKEY) { + if (!bnxt_qplib_get_pkey(res, &res->pkey_tbl, + qp->pkey_index, &pkey)) + req.pkey = cpu_to_le16(pkey); + } + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_QKEY) + req.qkey = cpu_to_le32(qp->qkey); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_DGID) { + memcpy(temp32, qp->ah.dgid.data, sizeof(struct bnxt_qplib_gid)); + req.dgid[0] = cpu_to_le32(temp32[0]); + req.dgid[1] = cpu_to_le32(temp32[1]); + req.dgid[2] = cpu_to_le32(temp32[2]); + req.dgid[3] = cpu_to_le32(temp32[3]); + } + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL) + req.flow_label = cpu_to_le32(qp->ah.flow_label); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX) + req.sgid_index = cpu_to_le16(res->sgid_tbl.hw_id + [qp->ah.sgid_index]); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_HOP_LIMIT) + req.hop_limit = qp->ah.hop_limit; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_TRAFFIC_CLASS) + req.traffic_class = qp->ah.traffic_class; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_DEST_MAC) + memcpy(req.dest_mac, qp->ah.dmac, 6); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU) + req.path_mtu = qp->path_mtu; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_TIMEOUT) + req.timeout = qp->timeout; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_RETRY_CNT) + req.retry_cnt = qp->retry_cnt; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_RNR_RETRY) + req.rnr_retry = qp->rnr_retry; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER) + req.min_rnr_timer = qp->min_rnr_timer; + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_RQ_PSN) + req.rq_psn = cpu_to_le32(qp->rq.psn); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_SQ_PSN) + req.sq_psn = cpu_to_le32(qp->sq.psn); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_MAX_RD_ATOMIC) 
+ req.max_rd_atomic = + ORD_LIMIT_TO_ORRQ_SLOTS(qp->max_rd_atomic); + + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_MAX_DEST_RD_ATOMIC) + req.max_dest_rd_atomic = + IRD_LIMIT_TO_IRRQ_SLOTS(qp->max_dest_rd_atomic); + + req.sq_size = cpu_to_le32(qp->sq.hwq.max_elements); + req.rq_size = cpu_to_le32(qp->rq.hwq.max_elements); + req.sq_sge = cpu_to_le16(qp->sq.max_sge); + req.rq_sge = cpu_to_le16(qp->rq.max_sge); + req.max_inline_data = cpu_to_le32(qp->max_inline_data); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID) + req.dest_qp_id = cpu_to_le32(qp->dest_qpn); + + req.vlan_pcp_vlan_dei_vlan_id = cpu_to_le16(qp->vlan_id); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + return rc; + qp->cur_qp_state = qp->state; + return 0; +} + +int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_query_qp req; + struct creq_query_qp_resp resp; + struct bnxt_qplib_rcfw_sbuf *sbuf; + struct creq_query_qp_resp_sb *sb; + u16 cmd_flags = 0; + u32 temp32[4]; + int i, rc = 0; + + RCFW_CMD_PREP(req, QUERY_QP, cmd_flags); + + sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); + if (!sbuf) + return -ENOMEM; + sb = sbuf->sb; + + req.qp_cid = cpu_to_le32(qp->id); + req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS; + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + (void *)sbuf, 0); + if (rc) + goto bail; + /* Extract the context from the side buffer */ + qp->state = sb->en_sqd_async_notify_state & + CREQ_QUERY_QP_RESP_SB_STATE_MASK; + qp->en_sqd_async_notify = sb->en_sqd_async_notify_state & + CREQ_QUERY_QP_RESP_SB_EN_SQD_ASYNC_NOTIFY ? + true : false; + qp->access = sb->access; + qp->pkey_index = le16_to_cpu(sb->pkey); + qp->qkey = le32_to_cpu(sb->qkey); + + temp32[0] = le32_to_cpu(sb->dgid[0]); + temp32[1] = le32_to_cpu(sb->dgid[1]); + temp32[2] = le32_to_cpu(sb->dgid[2]); + temp32[3] = le32_to_cpu(sb->dgid[3]); + memcpy(qp->ah.dgid.data, temp32, sizeof(qp->ah.dgid.data)); + + qp->ah.flow_label = le32_to_cpu(sb->flow_label); + + qp->ah.sgid_index = 0; + for (i = 0; i < res->sgid_tbl.max; i++) { + if (res->sgid_tbl.hw_id[i] == le16_to_cpu(sb->sgid_index)) { + qp->ah.sgid_index = i; + break; + } + } + if (i == res->sgid_tbl.max) + dev_warn(&res->pdev->dev, "QPLIB: SGID not found??"); + + qp->ah.hop_limit = sb->hop_limit; + qp->ah.traffic_class = sb->traffic_class; + memcpy(qp->ah.dmac, sb->dest_mac, 6); + qp->ah.vlan_id = (le16_to_cpu(sb->path_mtu_dest_vlan_id) & + CREQ_QUERY_QP_RESP_SB_VLAN_ID_MASK) >> + CREQ_QUERY_QP_RESP_SB_VLAN_ID_SFT; + qp->path_mtu = (le16_to_cpu(sb->path_mtu_dest_vlan_id) & + CREQ_QUERY_QP_RESP_SB_PATH_MTU_MASK) >> + CREQ_QUERY_QP_RESP_SB_PATH_MTU_SFT; + qp->timeout = sb->timeout; + qp->retry_cnt = sb->retry_cnt; + qp->rnr_retry = sb->rnr_retry; + qp->min_rnr_timer = sb->min_rnr_timer; + qp->rq.psn = le32_to_cpu(sb->rq_psn); + qp->max_rd_atomic = ORRQ_SLOTS_TO_ORD_LIMIT(sb->max_rd_atomic); + qp->sq.psn = le32_to_cpu(sb->sq_psn); + qp->max_dest_rd_atomic = + IRRQ_SLOTS_TO_IRD_LIMIT(sb->max_dest_rd_atomic); + qp->sq.max_wqe = qp->sq.hwq.max_elements; + qp->rq.max_wqe = qp->rq.hwq.max_elements; + qp->sq.max_sge = le16_to_cpu(sb->sq_sge); + qp->rq.max_sge = le16_to_cpu(sb->rq_sge); + qp->max_inline_data = le32_to_cpu(sb->max_inline_data); + qp->dest_qpn = le32_to_cpu(sb->dest_qp_id); + memcpy(qp->smac, sb->src_mac, 6); + qp->vlan_id = le16_to_cpu(sb->vlan_pcp_vlan_dei_vlan_id); +bail: + bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); + return rc; +} 
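/*
 * Illustrative aside, not part of the driver code above: bnxt_qplib_modify_qp()
 * and bnxt_qplib_query_qp() pack and unpack multi-bit firmware fields with
 * paired *_MASK / *_SFT constants (for example
 * CREQ_QUERY_QP_RESP_SB_PATH_MTU_MASK and its _SFT companion). The small
 * stand-alone sketch below shows only that mask/shift idiom; the DEMO_* names,
 * field widths and values are made up for illustration and do not reflect the
 * real CMDQ_/CREQ_ layouts.
 */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical 16-bit word: bits 15..12 carry a path-MTU code, bits 11..0 a VLAN id. */
#define DEMO_PATH_MTU_MASK	0xf000
#define DEMO_PATH_MTU_SFT	12
#define DEMO_VLAN_ID_MASK	0x0fff
#define DEMO_VLAN_ID_SFT	0

/* Pack both fields into one word, masking after the shift as the modify path does. */
static uint16_t demo_pack(unsigned int mtu_code, unsigned int vlan_id)
{
	return (uint16_t)(((mtu_code << DEMO_PATH_MTU_SFT) & DEMO_PATH_MTU_MASK) |
			  ((vlan_id << DEMO_VLAN_ID_SFT) & DEMO_VLAN_ID_MASK));
}

/* Recover one field: mask first, then shift down, mirroring the query path. */
static unsigned int demo_unpack(uint16_t word, uint16_t mask, unsigned int sft)
{
	return (word & mask) >> sft;
}

int main(void)
{
	uint16_t word = demo_pack(5, 100);	/* e.g. MTU code 5, VLAN id 100 */

	printf("packed=0x%04x mtu=%u vlan=%u\n", (unsigned int)word,
	       demo_unpack(word, DEMO_PATH_MTU_MASK, DEMO_PATH_MTU_SFT),
	       demo_unpack(word, DEMO_VLAN_ID_MASK, DEMO_VLAN_ID_SFT));
	return 0;
}
/* End of aside. */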
+
+static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp)
+{
+        struct bnxt_qplib_hwq *cq_hwq = &cq->hwq;
+        struct cq_base *hw_cqe, **hw_cqe_ptr;
+        int i;
+
+        for (i = 0; i < cq_hwq->max_elements; i++) {
+                hw_cqe_ptr = (struct cq_base **)cq_hwq->pbl_ptr;
+                hw_cqe = &hw_cqe_ptr[CQE_PG(i)][CQE_IDX(i)];
+                if (!CQE_CMP_VALID(hw_cqe, i, cq_hwq->max_elements))
+                        continue;
+                /*
+                 * The valid test of the entry must be done first before
+                 * reading any further.
+                 */
+                dma_rmb();
+                switch (hw_cqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK) {
+                case CQ_BASE_CQE_TYPE_REQ:
+                case CQ_BASE_CQE_TYPE_TERMINAL:
+                {
+                        struct cq_req *cqe = (struct cq_req *)hw_cqe;
+
+                        if (qp == le64_to_cpu(cqe->qp_handle))
+                                cqe->qp_handle = 0;
+                        break;
+                }
+                case CQ_BASE_CQE_TYPE_RES_RC:
+                case CQ_BASE_CQE_TYPE_RES_UD:
+                case CQ_BASE_CQE_TYPE_RES_RAWETH_QP1:
+                {
+                        struct cq_res_rc *cqe = (struct cq_res_rc *)hw_cqe;
+
+                        if (qp == le64_to_cpu(cqe->qp_handle))
+                                cqe->qp_handle = 0;
+                        break;
+                }
+                default:
+                        break;
+                }
+        }
+}
+
+int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res,
+                          struct bnxt_qplib_qp *qp)
+{
+        struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+        struct cmdq_destroy_qp req;
+        struct creq_destroy_qp_resp resp;
+        u16 cmd_flags = 0;
+        int rc;
+
+        rcfw->qp_tbl[qp->id].qp_id = BNXT_QPLIB_QP_ID_INVALID;
+        rcfw->qp_tbl[qp->id].qp_handle = NULL;
+
+        RCFW_CMD_PREP(req, DESTROY_QP, cmd_flags);
+
+        req.qp_cid = cpu_to_le32(qp->id);
+        rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+                                          (void *)&resp, NULL, 0);
+        if (rc) {
+                rcfw->qp_tbl[qp->id].qp_id = qp->id;
+                rcfw->qp_tbl[qp->id].qp_handle = qp;
+                return rc;
+        }
+
+        return 0;
+}
+
+void bnxt_qplib_free_qp_res(struct bnxt_qplib_res *res,
+                            struct bnxt_qplib_qp *qp)
+{
+        bnxt_qplib_free_qp_hdr_buf(res, qp);
+        bnxt_qplib_free_hwq(res->pdev, &qp->sq.hwq);
+        kfree(qp->sq.swq);
+
+        bnxt_qplib_free_hwq(res->pdev, &qp->rq.hwq);
+        kfree(qp->rq.swq);
+
+        if (qp->irrq.max_elements)
+                bnxt_qplib_free_hwq(res->pdev, &qp->irrq);
+        if (qp->orrq.max_elements)
+                bnxt_qplib_free_hwq(res->pdev, &qp->orrq);
+
+}
+
+void *bnxt_qplib_get_qp1_sq_buf(struct bnxt_qplib_qp *qp,
+                                struct bnxt_qplib_sge *sge)
+{
+        struct bnxt_qplib_q *sq = &qp->sq;
+        u32 sw_prod;
+
+        memset(sge, 0, sizeof(*sge));
+
+        if (qp->sq_hdr_buf) {
+                sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
+                sge->addr = (dma_addr_t)(qp->sq_hdr_buf_map +
+                                         sw_prod * qp->sq_hdr_buf_size);
+                sge->lkey = 0xFFFFFFFF;
+                sge->size = qp->sq_hdr_buf_size;
+                return qp->sq_hdr_buf + sw_prod * sge->size;
+        }
+        return NULL;
+}
+
+u32 bnxt_qplib_get_rq_prod_index(struct bnxt_qplib_qp *qp)
+{
+        struct bnxt_qplib_q *rq = &qp->rq;
+
+        return HWQ_CMP(rq->hwq.prod, &rq->hwq);
+}
+
+dma_addr_t bnxt_qplib_get_qp_buf_from_index(struct bnxt_qplib_qp *qp, u32 index)
+{
+        return (qp->rq_hdr_buf_map + index * qp->rq_hdr_buf_size);
+}
+
+void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp,
+                                struct bnxt_qplib_sge *sge)
+{
+        struct bnxt_qplib_q *rq = &qp->rq;
+        u32 sw_prod;
+
+        memset(sge, 0, sizeof(*sge));
+
+        if (qp->rq_hdr_buf) {
+                sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
+                sge->addr = (dma_addr_t)(qp->rq_hdr_buf_map +
+                                         sw_prod * qp->rq_hdr_buf_size);
+                sge->lkey = 0xFFFFFFFF;
+                sge->size = qp->rq_hdr_buf_size;
+                return qp->rq_hdr_buf + sw_prod * sge->size;
+        }
+        return NULL;
+}
+
+void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp)
+{
+        struct bnxt_qplib_q *sq = &qp->sq;
+        struct dbr_dbr db_msg = { 0 };
+        u32 sw_prod;
+
+        sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
+
+        db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
+                                   DBR_DBR_INDEX_MASK);
+
db_msg.type_xid = + cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | + DBR_DBR_TYPE_SQ); + /* Flush all the WQE writes to HW */ + wmb(); + __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_qplib_q *sq = &qp->sq; + struct bnxt_qplib_swq *swq; + struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr; + struct sq_sge *hw_sge; + struct bnxt_qplib_nq_work *nq_work = NULL; + bool sch_handler = false; + u32 sw_prod; + u8 wqe_size16; + int i, rc = 0, data_len = 0, pkt_num = 0; + __le32 temp32; + + if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS) { + if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { + sch_handler = true; + dev_dbg(&sq->hwq.pdev->dev, + "%s Error QP. Scheduling for poll_cq\n", + __func__); + goto queue_err; + } + } + + if (bnxt_qplib_queue_full(sq)) { + dev_err(&sq->hwq.pdev->dev, + "QPLIB: prod = %#x cons = %#x qdepth = %#x delta = %#x", + sq->hwq.prod, sq->hwq.cons, sq->hwq.max_elements, + sq->q_full_delta); + rc = -ENOMEM; + goto done; + } + sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq); + swq = &sq->swq[sw_prod]; + swq->wr_id = wqe->wr_id; + swq->type = wqe->type; + swq->flags = wqe->flags; + if (qp->sig_type) + swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP; + swq->start_psn = sq->psn & BTH_PSN_MASK; + + hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr; + hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)] + [get_sqe_idx(sw_prod)]; + + memset(hw_sq_send_hdr, 0, BNXT_QPLIB_MAX_SQE_ENTRY_SIZE); + + if (wqe->flags & BNXT_QPLIB_SWQE_FLAGS_INLINE) { + /* Copy the inline data */ + if (wqe->inline_len > BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) { + dev_warn(&sq->hwq.pdev->dev, + "QPLIB: Inline data length > 96 detected"); + data_len = BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH; + } else { + data_len = wqe->inline_len; + } + memcpy(hw_sq_send_hdr->data, wqe->inline_data, data_len); + wqe_size16 = (data_len + 15) >> 4; + } else { + for (i = 0, hw_sge = (struct sq_sge *)hw_sq_send_hdr->data; + i < wqe->num_sge; i++, hw_sge++) { + hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr); + hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey); + hw_sge->size = cpu_to_le32(wqe->sg_list[i].size); + data_len += wqe->sg_list[i].size; + } + /* Each SGE entry = 1 WQE size16 */ + wqe_size16 = wqe->num_sge; + /* HW requires wqe size has room for atleast one SGE even if + * none was supplied by ULP + */ + if (!wqe->num_sge) + wqe_size16++; + } + + /* Specifics */ + switch (wqe->type) { + case BNXT_QPLIB_SWQE_TYPE_SEND: + if (qp->type == CMDQ_CREATE_QP1_TYPE_GSI) { + /* Assemble info for Raw Ethertype QPs */ + struct sq_send_raweth_qp1 *sqe = + (struct sq_send_raweth_qp1 *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->wqe_size = wqe_size16 + + ((offsetof(typeof(*sqe), data) + 15) >> 4); + sqe->cfa_action = cpu_to_le16(wqe->rawqp1.cfa_action); + sqe->lflags = cpu_to_le16(wqe->rawqp1.lflags); + sqe->length = cpu_to_le32(data_len); + sqe->cfa_meta = cpu_to_le32((wqe->rawqp1.cfa_meta & + SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_MASK) << + SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_SFT); + + break; + } + /* fall thru */ + case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM: + case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV: + { + struct sq_send *sqe = (struct sq_send *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->wqe_size = wqe_size16 + + ((offsetof(typeof(*sqe), data) + 15) >> 4); + sqe->inv_key_or_imm_data = cpu_to_le32( + wqe->send.inv_key); + if (qp->type == 
CMDQ_CREATE_QP_TYPE_UD) { + sqe->q_key = cpu_to_le32(wqe->send.q_key); + sqe->dst_qp = cpu_to_le32( + wqe->send.dst_qp & SQ_SEND_DST_QP_MASK); + sqe->length = cpu_to_le32(data_len); + sqe->avid = cpu_to_le32(wqe->send.avid & + SQ_SEND_AVID_MASK); + sq->psn = (sq->psn + 1) & BTH_PSN_MASK; + } else { + sqe->length = cpu_to_le32(data_len); + sqe->dst_qp = 0; + sqe->avid = 0; + if (qp->mtu) + pkt_num = (data_len + qp->mtu - 1) / qp->mtu; + if (!pkt_num) + pkt_num = 1; + sq->psn = (sq->psn + pkt_num) & BTH_PSN_MASK; + } + break; + } + case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE: + case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM: + case BNXT_QPLIB_SWQE_TYPE_RDMA_READ: + { + struct sq_rdma *sqe = (struct sq_rdma *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->wqe_size = wqe_size16 + + ((offsetof(typeof(*sqe), data) + 15) >> 4); + sqe->imm_data = cpu_to_le32(wqe->rdma.inv_key); + sqe->length = cpu_to_le32((u32)data_len); + sqe->remote_va = cpu_to_le64(wqe->rdma.remote_va); + sqe->remote_key = cpu_to_le32(wqe->rdma.r_key); + if (qp->mtu) + pkt_num = (data_len + qp->mtu - 1) / qp->mtu; + if (!pkt_num) + pkt_num = 1; + sq->psn = (sq->psn + pkt_num) & BTH_PSN_MASK; + break; + } + case BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP: + case BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD: + { + struct sq_atomic *sqe = (struct sq_atomic *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->remote_key = cpu_to_le32(wqe->atomic.r_key); + sqe->remote_va = cpu_to_le64(wqe->atomic.remote_va); + sqe->swap_data = cpu_to_le64(wqe->atomic.swap_data); + sqe->cmp_data = cpu_to_le64(wqe->atomic.cmp_data); + if (qp->mtu) + pkt_num = (data_len + qp->mtu - 1) / qp->mtu; + if (!pkt_num) + pkt_num = 1; + sq->psn = (sq->psn + pkt_num) & BTH_PSN_MASK; + break; + } + case BNXT_QPLIB_SWQE_TYPE_LOCAL_INV: + { + struct sq_localinvalidate *sqe = + (struct sq_localinvalidate *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->inv_l_key = cpu_to_le32(wqe->local_inv.inv_l_key); + + break; + } + case BNXT_QPLIB_SWQE_TYPE_FAST_REG_MR: + { + struct sq_fr_pmr *sqe = (struct sq_fr_pmr *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->access_cntl = wqe->frmr.access_cntl | + SQ_FR_PMR_ACCESS_CNTL_LOCAL_WRITE; + sqe->zero_based_page_size_log = + (wqe->frmr.pg_sz_log & SQ_FR_PMR_PAGE_SIZE_LOG_MASK) << + SQ_FR_PMR_PAGE_SIZE_LOG_SFT | + (wqe->frmr.zero_based ? SQ_FR_PMR_ZERO_BASED : 0); + sqe->l_key = cpu_to_le32(wqe->frmr.l_key); + temp32 = cpu_to_le32(wqe->frmr.length); + memcpy(sqe->length, &temp32, sizeof(wqe->frmr.length)); + sqe->numlevels_pbl_page_size_log = + ((wqe->frmr.pbl_pg_sz_log << + SQ_FR_PMR_PBL_PAGE_SIZE_LOG_SFT) & + SQ_FR_PMR_PBL_PAGE_SIZE_LOG_MASK) | + ((wqe->frmr.levels << SQ_FR_PMR_NUMLEVELS_SFT) & + SQ_FR_PMR_NUMLEVELS_MASK); + + for (i = 0; i < wqe->frmr.page_list_len; i++) + wqe->frmr.pbl_ptr[i] = cpu_to_le64( + wqe->frmr.page_list[i] | + PTU_PTE_VALID); + sqe->pblptr = cpu_to_le64(wqe->frmr.pbl_dma_ptr); + sqe->va = cpu_to_le64(wqe->frmr.va); + + break; + } + case BNXT_QPLIB_SWQE_TYPE_BIND_MW: + { + struct sq_bind *sqe = (struct sq_bind *)hw_sq_send_hdr; + + sqe->wqe_type = wqe->type; + sqe->flags = wqe->flags; + sqe->access_cntl = wqe->bind.access_cntl; + sqe->mw_type_zero_based = wqe->bind.mw_type | + (wqe->bind.zero_based ? 
SQ_BIND_ZERO_BASED : 0); + sqe->parent_l_key = cpu_to_le32(wqe->bind.parent_l_key); + sqe->l_key = cpu_to_le32(wqe->bind.r_key); + sqe->va = cpu_to_le64(wqe->bind.va); + temp32 = cpu_to_le32(wqe->bind.length); + memcpy(&sqe->length, &temp32, sizeof(wqe->bind.length)); + break; + } + default: + /* Bad wqe, return error */ + rc = -EINVAL; + goto done; + } + swq->next_psn = sq->psn & BTH_PSN_MASK; + if (swq->psn_search) { + swq->psn_search->opcode_start_psn = cpu_to_le32( + ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) & + SQ_PSN_SEARCH_START_PSN_MASK) | + ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) & + SQ_PSN_SEARCH_OPCODE_MASK)); + swq->psn_search->flags_next_psn = cpu_to_le32( + ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) & + SQ_PSN_SEARCH_NEXT_PSN_MASK)); + } +queue_err: + if (sch_handler) { + /* Store the ULP info in the software structures */ + sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq); + swq = &sq->swq[sw_prod]; + swq->wr_id = wqe->wr_id; + swq->type = wqe->type; + swq->flags = wqe->flags; + if (qp->sig_type) + swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP; + swq->start_psn = sq->psn & BTH_PSN_MASK; + } + sq->hwq.prod++; + qp->wqe_cnt++; + +done: + if (sch_handler) { + nq_work = kzalloc(sizeof(*nq_work), GFP_ATOMIC); + if (nq_work) { + nq_work->cq = qp->scq; + nq_work->nq = qp->scq->nq; + INIT_WORK(&nq_work->work, bnxt_qpn_cqn_sched_task); + queue_work(qp->scq->nq->cqn_wq, &nq_work->work); + } else { + dev_err(&sq->hwq.pdev->dev, + "QPLIB: FP: Failed to allocate SQ nq_work!"); + rc = -ENOMEM; + } + } + return rc; +} + +void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp) +{ + struct bnxt_qplib_q *rq = &qp->rq; + struct dbr_dbr db_msg = { 0 }; + u32 sw_prod; + + sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); + db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) & + DBR_DBR_INDEX_MASK); + db_msg.type_xid = + cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | + DBR_DBR_TYPE_RQ); + + /* Flush the writes to HW Rx WQE before the ringing Rx DB */ + wmb(); + __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_qplib_q *rq = &qp->rq; + struct rq_wqe *rqe, **rqe_ptr; + struct sq_sge *hw_sge; + struct bnxt_qplib_nq_work *nq_work = NULL; + bool sch_handler = false; + u32 sw_prod; + int i, rc = 0; + + if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { + sch_handler = true; + dev_dbg(&rq->hwq.pdev->dev, + "%s Error QP. 
Scheduling for poll_cq\n", + __func__); + goto queue_err; + } + if (bnxt_qplib_queue_full(rq)) { + dev_err(&rq->hwq.pdev->dev, + "QPLIB: FP: QP (0x%x) RQ is full!", qp->id); + rc = -EINVAL; + goto done; + } + sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); + rq->swq[sw_prod].wr_id = wqe->wr_id; + + rqe_ptr = (struct rq_wqe **)rq->hwq.pbl_ptr; + rqe = &rqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)]; + + memset(rqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + + /* Calculate wqe_size16 and data_len */ + for (i = 0, hw_sge = (struct sq_sge *)rqe->data; + i < wqe->num_sge; i++, hw_sge++) { + hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr); + hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey); + hw_sge->size = cpu_to_le32(wqe->sg_list[i].size); + } + rqe->wqe_type = wqe->type; + rqe->flags = wqe->flags; + rqe->wqe_size = wqe->num_sge + + ((offsetof(typeof(*rqe), data) + 15) >> 4); + /* HW requires wqe size has room for atleast one SGE even if none + * was supplied by ULP + */ + if (!wqe->num_sge) + rqe->wqe_size++; + + /* Supply the rqe->wr_id index to the wr_id_tbl for now */ + rqe->wr_id[0] = cpu_to_le32(sw_prod); + +queue_err: + if (sch_handler) { + /* Store the ULP info in the software structures */ + sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); + rq->swq[sw_prod].wr_id = wqe->wr_id; + } + + rq->hwq.prod++; + if (sch_handler) { + nq_work = kzalloc(sizeof(*nq_work), GFP_ATOMIC); + if (nq_work) { + nq_work->cq = qp->rcq; + nq_work->nq = qp->rcq->nq; + INIT_WORK(&nq_work->work, bnxt_qpn_cqn_sched_task); + queue_work(qp->rcq->nq->cqn_wq, &nq_work->work); + } else { + dev_err(&rq->hwq.pdev->dev, + "QPLIB: FP: Failed to allocate RQ nq_work!"); + rc = -ENOMEM; + } + } +done: + return rc; +} + +/* CQ */ + +/* Spinlock must be held */ +static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq) +{ + struct dbr_dbr db_msg = { 0 }; + + db_msg.type_xid = + cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | + DBR_DBR_TYPE_CQ_ARMENA); + /* Flush memory writes before enabling the CQ */ + wmb(); + __iowrite64_copy(cq->dbr_base, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +static void bnxt_qplib_arm_cq(struct bnxt_qplib_cq *cq, u32 arm_type) +{ + struct bnxt_qplib_hwq *cq_hwq = &cq->hwq; + struct dbr_dbr db_msg = { 0 }; + u32 sw_cons; + + /* Ring DB */ + sw_cons = HWQ_CMP(cq_hwq->cons, cq_hwq); + db_msg.index = cpu_to_le32((sw_cons << DBR_DBR_INDEX_SFT) & + DBR_DBR_INDEX_MASK); + db_msg.type_xid = + cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | + arm_type); + /* flush memory writes before arming the CQ */ + wmb(); + __iowrite64_copy(cq->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_create_cq req; + struct creq_create_cq_resp resp; + struct bnxt_qplib_pbl *pbl; + u16 cmd_flags = 0; + int rc; + + cq->hwq.max_elements = cq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &cq->hwq, cq->sghead, + cq->nmap, &cq->hwq.max_elements, + BNXT_QPLIB_MAX_CQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto exit; + + RCFW_CMD_PREP(req, CREATE_CQ, cmd_flags); + + if (!cq->dpi) { + dev_err(&rcfw->pdev->dev, + "QPLIB: FP: CREATE_CQ failed due to NULL DPI"); + return -EINVAL; + } + req.dpi = cpu_to_le32(cq->dpi->dpi); + req.cq_handle = cpu_to_le64(cq->cq_handle); + + req.cq_size = cpu_to_le32(cq->hwq.max_elements); + pbl = &cq->hwq.pbl[PBL_LVL_0]; + req.pg_size_lvl = cpu_to_le32( + ((cq->hwq.level & CMDQ_CREATE_CQ_LVL_MASK) << + 
CMDQ_CREATE_CQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? CMDQ_CREATE_CQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? CMDQ_CREATE_CQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? CMDQ_CREATE_CQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? CMDQ_CREATE_CQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? CMDQ_CREATE_CQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? CMDQ_CREATE_CQ_PG_SIZE_PG_1G : + CMDQ_CREATE_CQ_PG_SIZE_PG_4K)); + + req.pbl = cpu_to_le64(pbl->pg_map_arr[0]); + + req.cq_fco_cnq_id = cpu_to_le32( + (cq->cnq_hw_ring_id & CMDQ_CREATE_CQ_CNQ_ID_MASK) << + CMDQ_CREATE_CQ_CNQ_ID_SFT); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + goto fail; + + cq->id = le32_to_cpu(resp.xid); + cq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem; + cq->period = BNXT_QPLIB_QUEUE_START_PERIOD; + init_waitqueue_head(&cq->waitq); + INIT_LIST_HEAD(&cq->sqf_head); + INIT_LIST_HEAD(&cq->rqf_head); + spin_lock_init(&cq->compl_lock); + + bnxt_qplib_arm_cq_enable(cq); + return 0; + +fail: + bnxt_qplib_free_hwq(res->pdev, &cq->hwq); +exit: + return rc; +} + +int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_destroy_cq req; + struct creq_destroy_cq_resp resp; + u16 cmd_flags = 0; + int rc; + + RCFW_CMD_PREP(req, DESTROY_CQ, cmd_flags); + + req.cq_cid = cpu_to_le32(cq->id); + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + return rc; + bnxt_qplib_free_hwq(res->pdev, &cq->hwq); + return 0; +} + +static int __flush_sq(struct bnxt_qplib_q *sq, struct bnxt_qplib_qp *qp, + struct bnxt_qplib_cqe **pcqe, int *budget) +{ + u32 sw_prod, sw_cons; + struct bnxt_qplib_cqe *cqe; + int rc = 0; + + /* Now complete all outstanding SQEs with FLUSHED_ERR */ + sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq); + cqe = *pcqe; + while (*budget) { + sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq); + if (sw_cons == sw_prod) { + break; + } + /* Skip the FENCE WQE completions */ + if (sq->swq[sw_cons].wr_id == BNXT_QPLIB_FENCE_WRID) { + bnxt_qplib_cancel_phantom_processing(qp); + goto skip_compl; + } + memset(cqe, 0, sizeof(*cqe)); + cqe->status = CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR; + cqe->opcode = CQ_BASE_CQE_TYPE_REQ; + cqe->qp_handle = (u64)(unsigned long)qp; + cqe->wr_id = sq->swq[sw_cons].wr_id; + cqe->src_qp = qp->id; + cqe->type = sq->swq[sw_cons].type; + cqe++; + (*budget)--; +skip_compl: + sq->hwq.cons++; + } + *pcqe = cqe; + if (!(*budget) && HWQ_CMP(sq->hwq.cons, &sq->hwq) != sw_prod) + /* Out of budget */ + rc = -EAGAIN; + + return rc; +} + +static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp, + struct bnxt_qplib_cqe **pcqe, int *budget) +{ + struct bnxt_qplib_cqe *cqe; + u32 sw_prod, sw_cons; + int rc = 0; + int opcode = 0; + + switch (qp->type) { + case CMDQ_CREATE_QP1_TYPE_GSI: + opcode = CQ_BASE_CQE_TYPE_RES_RAWETH_QP1; + break; + case CMDQ_CREATE_QP_TYPE_RC: + opcode = CQ_BASE_CQE_TYPE_RES_RC; + break; + case CMDQ_CREATE_QP_TYPE_UD: + opcode = CQ_BASE_CQE_TYPE_RES_UD; + break; + } + + /* Flush the rest of the RQ */ + sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); + cqe = *pcqe; + while (*budget) { + sw_cons = HWQ_CMP(rq->hwq.cons, &rq->hwq); + if (sw_cons == sw_prod) + break; + memset(cqe, 0, sizeof(*cqe)); + cqe->status = + CQ_RES_RC_STATUS_WORK_REQUEST_FLUSHED_ERR; + cqe->opcode = opcode; + cqe->qp_handle = (unsigned long)qp; + cqe->wr_id = rq->swq[sw_cons].wr_id; + cqe++; + (*budget)--; + 
rq->hwq.cons++; + } + *pcqe = cqe; + if (!*budget && HWQ_CMP(rq->hwq.cons, &rq->hwq) != sw_prod) + /* Out of budget */ + rc = -EAGAIN; + + return rc; +} + +void bnxt_qplib_mark_qp_error(void *qp_handle) +{ + struct bnxt_qplib_qp *qp = qp_handle; + + if (!qp) + return; + + /* Must block new posting of SQ and RQ */ + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + bnxt_qplib_cancel_phantom_processing(qp); +} + +/* Note: SQE is valid from sw_sq_cons up to cqe_sq_cons (exclusive) + * CQE is track from sw_cq_cons to max_element but valid only if VALID=1 + */ +static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq, + u32 cq_cons, u32 sw_sq_cons, u32 cqe_sq_cons) +{ + struct bnxt_qplib_q *sq = &qp->sq; + struct bnxt_qplib_swq *swq; + u32 peek_sw_cq_cons, peek_raw_cq_cons, peek_sq_cons_idx; + struct cq_base *peek_hwcqe, **peek_hw_cqe_ptr; + struct cq_req *peek_req_hwcqe; + struct bnxt_qplib_qp *peek_qp; + struct bnxt_qplib_q *peek_sq; + int i, rc = 0; + + /* Normal mode */ + /* Check for the psn_search marking before completing */ + swq = &sq->swq[sw_sq_cons]; + if (swq->psn_search && + le32_to_cpu(swq->psn_search->flags_next_psn) & 0x80000000) { + /* Unmark */ + swq->psn_search->flags_next_psn = cpu_to_le32 + (le32_to_cpu(swq->psn_search->flags_next_psn) + & ~0x80000000); + dev_dbg(&cq->hwq.pdev->dev, + "FP: Process Req cq_cons=0x%x qp=0x%x sq cons sw=0x%x cqe=0x%x marked!\n", + cq_cons, qp->id, sw_sq_cons, cqe_sq_cons); + sq->condition = true; + sq->send_phantom = true; + + /* TODO: Only ARM if the previous SQE is ARMALL */ + bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ_ARMALL); + + rc = -EAGAIN; + goto out; + } + if (sq->condition) { + /* Peek at the completions */ + peek_raw_cq_cons = cq->hwq.cons; + peek_sw_cq_cons = cq_cons; + i = cq->hwq.max_elements; + while (i--) { + peek_sw_cq_cons = HWQ_CMP((peek_sw_cq_cons), &cq->hwq); + peek_hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr; + peek_hwcqe = &peek_hw_cqe_ptr[CQE_PG(peek_sw_cq_cons)] + [CQE_IDX(peek_sw_cq_cons)]; + /* If the next hwcqe is VALID */ + if (CQE_CMP_VALID(peek_hwcqe, peek_raw_cq_cons, + cq->hwq.max_elements)) { + /* + * The valid test of the entry must be done first before + * reading any further. + */ + dma_rmb(); + /* If the next hwcqe is a REQ */ + if ((peek_hwcqe->cqe_type_toggle & + CQ_BASE_CQE_TYPE_MASK) == + CQ_BASE_CQE_TYPE_REQ) { + peek_req_hwcqe = (struct cq_req *) + peek_hwcqe; + peek_qp = (struct bnxt_qplib_qp *) + ((unsigned long) + le64_to_cpu + (peek_req_hwcqe->qp_handle)); + peek_sq = &peek_qp->sq; + peek_sq_cons_idx = HWQ_CMP(le16_to_cpu( + peek_req_hwcqe->sq_cons_idx) - 1 + , &sq->hwq); + /* If the hwcqe's sq's wr_id matches */ + if (peek_sq == sq && + sq->swq[peek_sq_cons_idx].wr_id == + BNXT_QPLIB_FENCE_WRID) { + /* + * Unbreak only if the phantom + * comes back + */ + dev_dbg(&cq->hwq.pdev->dev, + "FP:Got Phantom CQE"); + sq->condition = false; + sq->single = true; + rc = 0; + goto out; + } + } + /* Valid but not the phantom, so keep looping */ + } else { + /* Not valid yet, just exit and wait */ + rc = -EINVAL; + goto out; + } + peek_sw_cq_cons++; + peek_raw_cq_cons++; + } + dev_err(&cq->hwq.pdev->dev, + "Should not have come here! 
cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x", + cq_cons, qp->id, sw_sq_cons, cqe_sq_cons); + rc = -EINVAL; + } +out: + return rc; +} + +static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, + struct cq_req *hwcqe, + struct bnxt_qplib_cqe **pcqe, int *budget, + u32 cq_cons, struct bnxt_qplib_qp **lib_qp) +{ + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *sq; + struct bnxt_qplib_cqe *cqe; + u32 sw_sq_cons, cqe_sq_cons; + struct bnxt_qplib_swq *swq; + int rc = 0; + + qp = (struct bnxt_qplib_qp *)((unsigned long) + le64_to_cpu(hwcqe->qp_handle)); + if (!qp) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: Process Req qp is NULL"); + return -EINVAL; + } + sq = &qp->sq; + + cqe_sq_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq); + if (cqe_sq_cons > sq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process req reported "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x", + cqe_sq_cons, sq->hwq.max_elements); + return -EINVAL; + } + + if (qp->sq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + goto done; + } + /* Require to walk the sq's swq to fabricate CQEs for all previously + * signaled SWQEs due to CQE aggregation from the current sq cons + * to the cqe_sq_cons + */ + cqe = *pcqe; + while (*budget) { + sw_sq_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq); + if (sw_sq_cons == cqe_sq_cons) + /* Done */ + break; + + swq = &sq->swq[sw_sq_cons]; + memset(cqe, 0, sizeof(*cqe)); + cqe->opcode = CQ_BASE_CQE_TYPE_REQ; + cqe->qp_handle = (u64)(unsigned long)qp; + cqe->src_qp = qp->id; + cqe->wr_id = swq->wr_id; + if (cqe->wr_id == BNXT_QPLIB_FENCE_WRID) + goto skip; + cqe->type = swq->type; + + /* For the last CQE, check for status. For errors, regardless + * of the request being signaled or not, it must complete with + * the hwcqe error status + */ + if (HWQ_CMP((sw_sq_cons + 1), &sq->hwq) == cqe_sq_cons && + hwcqe->status != CQ_REQ_STATUS_OK) { + cqe->status = hwcqe->status; + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Processed Req "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id[%d] = 0x%llx with status 0x%x", + sw_sq_cons, cqe->wr_id, cqe->status); + cqe++; + (*budget)--; + bnxt_qplib_mark_qp_error(qp); + /* Add qp to flush list of the CQ */ + bnxt_qplib_add_flush_qp(qp); + } else { + if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) { + /* Before we complete, do WA 9060 */ + if (do_wa9060(qp, cq, cq_cons, sw_sq_cons, + cqe_sq_cons)) { + *lib_qp = qp; + goto out; + } + cqe->status = CQ_REQ_STATUS_OK; + cqe++; + (*budget)--; + } + } +skip: + sq->hwq.cons++; + if (sq->single) + break; + } +out: + *pcqe = cqe; + if (HWQ_CMP(sq->hwq.cons, &sq->hwq) != cqe_sq_cons) { + /* Out of budget */ + rc = -EAGAIN; + goto done; + } + /* + * Back to normal completion mode only after it has completed all of + * the WC for this CQE + */ + sq->single = false; +done: + return rc; +} + +static void bnxt_qplib_release_srqe(struct bnxt_qplib_srq *srq, u32 tag) +{ + spin_lock(&srq->hwq.lock); + srq->swq[srq->last_idx].next_idx = (int)tag; + srq->last_idx = (int)tag; + srq->swq[srq->last_idx].next_idx = -1; + srq->hwq.cons++; /* Support for SRQE counter */ + spin_unlock(&srq->hwq.lock); +} + +static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, + struct cq_res_rc *hwcqe, + struct bnxt_qplib_cqe **pcqe, + int *budget) +{ + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *rq; + struct bnxt_qplib_srq *srq; + struct bnxt_qplib_cqe *cqe; + u32 wr_id_idx; + int rc = 0; + + qp = (struct bnxt_qplib_qp 
*)((unsigned long) + le64_to_cpu(hwcqe->qp_handle)); + if (!qp) { + dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq RC qp is NULL"); + return -EINVAL; + } + if (qp->rq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + goto done; + } + + cqe = *pcqe; + cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; + cqe->length = le32_to_cpu(hwcqe->length); + cqe->invrkey = le32_to_cpu(hwcqe->imm_data_or_inv_r_key); + cqe->mr_handle = le64_to_cpu(hwcqe->mr_handle); + cqe->flags = le16_to_cpu(hwcqe->flags); + cqe->status = hwcqe->status; + cqe->qp_handle = (u64)(unsigned long)qp; + + wr_id_idx = le32_to_cpu(hwcqe->srq_or_rq_wr_id) & + CQ_RES_RC_SRQ_OR_RQ_WR_ID_MASK; + if (cqe->flags & CQ_RES_RC_FLAGS_SRQ_SRQ) { + srq = qp->srq; + if (!srq) + return -EINVAL; + if (wr_id_idx > srq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process RC "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + wr_id_idx, srq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = srq->swq[wr_id_idx].wr_id; + bnxt_qplib_release_srqe(srq, wr_id_idx); + cqe++; + (*budget)--; + *pcqe = cqe; + } else { + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process RC "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; + + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + /* Add qp to flush list of the CQ */ + bnxt_qplib_add_flush_qp(qp); + } + } + +done: + return rc; +} + +static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, + struct cq_res_ud *hwcqe, + struct bnxt_qplib_cqe **pcqe, + int *budget) +{ + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *rq; + struct bnxt_qplib_srq *srq; + struct bnxt_qplib_cqe *cqe; + u32 wr_id_idx; + int rc = 0; + + qp = (struct bnxt_qplib_qp *)((unsigned long) + le64_to_cpu(hwcqe->qp_handle)); + if (!qp) { + dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq UD qp is NULL"); + return -EINVAL; + } + if (qp->rq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + goto done; + } + cqe = *pcqe; + cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; + cqe->length = le32_to_cpu(hwcqe->length); + cqe->invrkey = le32_to_cpu(hwcqe->imm_data); + cqe->flags = le16_to_cpu(hwcqe->flags); + cqe->status = hwcqe->status; + cqe->qp_handle = (u64)(unsigned long)qp; + memcpy(cqe->smac, hwcqe->src_mac, 6); + wr_id_idx = le32_to_cpu(hwcqe->src_qp_high_srq_or_rq_wr_id) + & CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK; + cqe->src_qp = le16_to_cpu(hwcqe->src_qp_low) | + ((le32_to_cpu( + hwcqe->src_qp_high_srq_or_rq_wr_id) & + CQ_RES_UD_SRC_QP_HIGH_MASK) >> 8); + + if (cqe->flags & CQ_RES_RC_FLAGS_SRQ_SRQ) { + srq = qp->srq; + if (!srq) + return -EINVAL; + + if (wr_id_idx > srq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process UD "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + wr_id_idx, srq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = srq->swq[wr_id_idx].wr_id; + bnxt_qplib_release_srqe(srq, wr_id_idx); + cqe++; + (*budget)--; + *pcqe = cqe; + } else { + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process UD "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x 
exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } + + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; + + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + /* Add qp to flush list of the CQ */ + bnxt_qplib_add_flush_qp(qp); + } + } +done: + return rc; +} + +bool bnxt_qplib_is_cq_empty(struct bnxt_qplib_cq *cq) +{ + struct cq_base *hw_cqe, **hw_cqe_ptr; + u32 sw_cons, raw_cons; + bool rc = true; + + raw_cons = cq->hwq.cons; + sw_cons = HWQ_CMP(raw_cons, &cq->hwq); + hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr; + hw_cqe = &hw_cqe_ptr[CQE_PG(sw_cons)][CQE_IDX(sw_cons)]; + + /* Check for Valid bit. If the CQE is valid, return false */ + rc = !CQE_CMP_VALID(hw_cqe, raw_cons, cq->hwq.max_elements); + return rc; +} + +static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, + struct cq_res_raweth_qp1 *hwcqe, + struct bnxt_qplib_cqe **pcqe, + int *budget) +{ + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *rq; + struct bnxt_qplib_srq *srq; + struct bnxt_qplib_cqe *cqe; + u32 wr_id_idx; + int rc = 0; + + qp = (struct bnxt_qplib_qp *)((unsigned long) + le64_to_cpu(hwcqe->qp_handle)); + if (!qp) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: process_cq Raw/QP1 qp is NULL"); + return -EINVAL; + } + if (qp->rq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + goto done; + } + cqe = *pcqe; + cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; + cqe->flags = le16_to_cpu(hwcqe->flags); + cqe->qp_handle = (u64)(unsigned long)qp; + + wr_id_idx = + le32_to_cpu(hwcqe->raweth_qp1_payload_offset_srq_or_rq_wr_id) + & CQ_RES_RAWETH_QP1_SRQ_OR_RQ_WR_ID_MASK; + cqe->src_qp = qp->id; + if (qp->id == 1 && !cqe->length) { + /* Add workaround for the length misdetection */ + cqe->length = 296; + } else { + cqe->length = le16_to_cpu(hwcqe->length); + } + cqe->pkey_index = qp->pkey_index; + memcpy(cqe->smac, qp->smac, 6); + + cqe->raweth_qp1_flags = le16_to_cpu(hwcqe->raweth_qp1_flags); + cqe->raweth_qp1_flags2 = le32_to_cpu(hwcqe->raweth_qp1_flags2); + cqe->raweth_qp1_metadata = le32_to_cpu(hwcqe->raweth_qp1_metadata); + + if (cqe->flags & CQ_RES_RAWETH_QP1_FLAGS_SRQ_SRQ) { + srq = qp->srq; + if (!srq) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: SRQ used but not defined??"); + return -EINVAL; + } + if (wr_id_idx > srq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process Raw/QP1 "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + wr_id_idx, srq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = srq->swq[wr_id_idx].wr_id; + bnxt_qplib_release_srqe(srq, wr_id_idx); + cqe++; + (*budget)--; + *pcqe = cqe; + } else { + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process Raw/QP1 RQ wr_id "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: ix 0x%x exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; + + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + /* Add qp to flush list of the CQ */ + bnxt_qplib_add_flush_qp(qp); + } + } + +done: + return rc; +} + +static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, + struct cq_terminal *hwcqe, + struct bnxt_qplib_cqe **pcqe, + int *budget) +{ + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *sq, *rq; + 
struct bnxt_qplib_cqe *cqe; + u32 sw_cons = 0, cqe_cons; + int rc = 0; + + /* Check the Status */ + if (hwcqe->status != CQ_TERMINAL_STATUS_OK) + dev_warn(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process Terminal Error status = 0x%x", + hwcqe->status); + + qp = (struct bnxt_qplib_qp *)((unsigned long) + le64_to_cpu(hwcqe->qp_handle)); + if (!qp) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process terminal qp is NULL"); + return -EINVAL; + } + + /* Must block new posting of SQ and RQ */ + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + + sq = &qp->sq; + rq = &qp->rq; + + cqe_cons = le16_to_cpu(hwcqe->sq_cons_idx); + if (cqe_cons == 0xFFFF) + goto do_rq; + + if (cqe_cons > sq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process terminal reported "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x", + cqe_cons, sq->hwq.max_elements); + goto do_rq; + } + + if (qp->sq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + goto sq_done; + } + + /* Terminal CQE can also include aggregated successful CQEs prior. + * So we must complete all CQEs from the current sq's cons to the + * cq_cons with status OK + */ + cqe = *pcqe; + while (*budget) { + sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq); + if (sw_cons == cqe_cons) + break; + if (sq->swq[sw_cons].flags & SQ_SEND_FLAGS_SIGNAL_COMP) { + memset(cqe, 0, sizeof(*cqe)); + cqe->status = CQ_REQ_STATUS_OK; + cqe->opcode = CQ_BASE_CQE_TYPE_REQ; + cqe->qp_handle = (u64)(unsigned long)qp; + cqe->src_qp = qp->id; + cqe->wr_id = sq->swq[sw_cons].wr_id; + cqe->type = sq->swq[sw_cons].type; + cqe++; + (*budget)--; + } + sq->hwq.cons++; + } + *pcqe = cqe; + if (!(*budget) && sw_cons != cqe_cons) { + /* Out of budget */ + rc = -EAGAIN; + goto sq_done; + } +sq_done: + if (rc) + return rc; +do_rq: + cqe_cons = le16_to_cpu(hwcqe->rq_cons_idx); + if (cqe_cons == 0xFFFF) { + goto done; + } else if (cqe_cons > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Processed terminal "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: reported rq_cons_idx 0x%x exceeds max 0x%x", + cqe_cons, rq->hwq.max_elements); + goto done; + } + + if (qp->rq.flushed) { + dev_dbg(&cq->hwq.pdev->dev, + "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + rc = 0; + goto done; + } + + /* Terminal CQE requires all posted RQEs to complete with FLUSHED_ERR + * from the current rq->cons to the rq->prod regardless what the + * rq->cons the terminal CQE indicates + */ + + /* Add qp to flush list of the CQ */ + bnxt_qplib_add_flush_qp(qp); +done: + return rc; +} + +static int bnxt_qplib_cq_process_cutoff(struct bnxt_qplib_cq *cq, + struct cq_cutoff *hwcqe) +{ + /* Check the Status */ + if (hwcqe->status != CQ_CUTOFF_STATUS_OK) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process Cutoff Error status = 0x%x", + hwcqe->status); + return -EINVAL; + } + clear_bit(CQ_FLAGS_RESIZE_IN_PROG, &cq->flags); + wake_up_interruptible(&cq->waitq); + + return 0; +} + +int bnxt_qplib_process_flush_list(struct bnxt_qplib_cq *cq, + struct bnxt_qplib_cqe *cqe, + int num_cqes) +{ + struct bnxt_qplib_qp *qp = NULL; + u32 budget = num_cqes; + unsigned long flags; + + spin_lock_irqsave(&cq->flush_lock, flags); + list_for_each_entry(qp, &cq->sqf_head, sq_flush) { + dev_dbg(&cq->hwq.pdev->dev, + "QPLIB: FP: Flushing SQ QP= %p", + qp); + __flush_sq(&qp->sq, qp, &cqe, &budget); + } + + list_for_each_entry(qp, &cq->rqf_head, rq_flush) { + dev_dbg(&cq->hwq.pdev->dev, + "QPLIB: FP: Flushing RQ QP= %p", + qp); + 
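+		/* Fabricate FLUSHED_ERR completions for this QP's outstanding RQEs */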
__flush_rq(&qp->rq, qp, &cqe, &budget); + } + spin_unlock_irqrestore(&cq->flush_lock, flags); + + return num_cqes - budget; +} + +int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, + int num_cqes, struct bnxt_qplib_qp **lib_qp) +{ + struct cq_base *hw_cqe, **hw_cqe_ptr; + u32 sw_cons, raw_cons; + int budget, rc = 0; + + raw_cons = cq->hwq.cons; + budget = num_cqes; + + while (budget) { + sw_cons = HWQ_CMP(raw_cons, &cq->hwq); + hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr; + hw_cqe = &hw_cqe_ptr[CQE_PG(sw_cons)][CQE_IDX(sw_cons)]; + + /* Check for Valid bit */ + if (!CQE_CMP_VALID(hw_cqe, raw_cons, cq->hwq.max_elements)) + break; + + /* + * The valid test of the entry must be done first before + * reading any further. + */ + dma_rmb(); + /* From the device's respective CQE format to qplib_wc*/ + switch (hw_cqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK) { + case CQ_BASE_CQE_TYPE_REQ: + rc = bnxt_qplib_cq_process_req(cq, + (struct cq_req *)hw_cqe, + &cqe, &budget, + sw_cons, lib_qp); + break; + case CQ_BASE_CQE_TYPE_RES_RC: + rc = bnxt_qplib_cq_process_res_rc(cq, + (struct cq_res_rc *) + hw_cqe, &cqe, + &budget); + break; + case CQ_BASE_CQE_TYPE_RES_UD: + rc = bnxt_qplib_cq_process_res_ud + (cq, (struct cq_res_ud *)hw_cqe, &cqe, + &budget); + break; + case CQ_BASE_CQE_TYPE_RES_RAWETH_QP1: + rc = bnxt_qplib_cq_process_res_raweth_qp1 + (cq, (struct cq_res_raweth_qp1 *) + hw_cqe, &cqe, &budget); + break; + case CQ_BASE_CQE_TYPE_TERMINAL: + rc = bnxt_qplib_cq_process_terminal + (cq, (struct cq_terminal *)hw_cqe, + &cqe, &budget); + break; + case CQ_BASE_CQE_TYPE_CUT_OFF: + bnxt_qplib_cq_process_cutoff + (cq, (struct cq_cutoff *)hw_cqe); + /* Done processing this CQ */ + goto exit; + default: + dev_err(&cq->hwq.pdev->dev, + "QPLIB: process_cq unknown type 0x%lx", + hw_cqe->cqe_type_toggle & + CQ_BASE_CQE_TYPE_MASK); + rc = -EINVAL; + break; + } + if (rc < 0) { + if (rc == -EAGAIN) + break; + /* Error while processing the CQE, just skip to the + * next one + */ + dev_err(&cq->hwq.pdev->dev, + "QPLIB: process_cqe error rc = 0x%x", rc); + } + raw_cons++; + } + if (cq->hwq.cons != raw_cons) { + cq->hwq.cons = raw_cons; + bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ); + } +exit: + return num_cqes - budget; +} + +void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type) +{ + if (arm_type) + bnxt_qplib_arm_cq(cq, arm_type); + /* Using cq->arm_state variable to track whether to issue cq handler */ + atomic_set(&cq->arm_state, 1); +} + +void bnxt_qplib_flush_cqn_wq(struct bnxt_qplib_qp *qp) +{ + flush_workqueue(qp->scq->nq->cqn_wq); + if (qp->scq != qp->rcq) + flush_workqueue(qp->rcq->nq->cqn_wq); +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h new file mode 100644 index 0000000..72352ca --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -0,0 +1,529 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Fast Path Operators (header) + */ + +#ifndef __BNXT_QPLIB_FP_H__ +#define __BNXT_QPLIB_FP_H__ + +struct bnxt_qplib_srq { + struct bnxt_qplib_pd *pd; + struct bnxt_qplib_dpi *dpi; + void __iomem *dbr_base; + u64 srq_handle; + u32 id; + u32 max_wqe; + u32 max_sge; + u32 threshold; + bool arm_req; + struct bnxt_qplib_cq *cq; + struct bnxt_qplib_hwq hwq; + struct bnxt_qplib_swq *swq; + struct scatterlist *sglist; + int start_idx; + int last_idx; + u32 nmap; + u16 eventq_hw_ring_id; + spinlock_t lock; /* protect SRQE link list */ +}; + +struct bnxt_qplib_sge { + u64 addr; + u32 lkey; + u32 size; +}; + +#define BNXT_QPLIB_MAX_SQE_ENTRY_SIZE sizeof(struct sq_send) + +#define SQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_SQE_ENTRY_SIZE) +#define SQE_MAX_IDX_PER_PG (SQE_CNT_PER_PG - 1) + +static inline u32 get_sqe_pg(u32 val) +{ + return ((val & ~SQE_MAX_IDX_PER_PG) / SQE_CNT_PER_PG); +} + +static inline u32 get_sqe_idx(u32 val) +{ + return (val & SQE_MAX_IDX_PER_PG); +} + +#define BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE sizeof(struct sq_psn_search) + +#define PSNE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE) +#define PSNE_MAX_IDX_PER_PG (PSNE_CNT_PER_PG - 1) + +static inline u32 get_psne_pg(u32 val) +{ + return ((val & ~PSNE_MAX_IDX_PER_PG) / PSNE_CNT_PER_PG); +} + +static inline u32 get_psne_idx(u32 val) +{ + return (val & PSNE_MAX_IDX_PER_PG); +} + +#define BNXT_QPLIB_QP_MAX_SGL 6 + +struct bnxt_qplib_swq { + u64 wr_id; + int next_idx; + u8 type; + u8 flags; + u32 start_psn; + u32 next_psn; + struct sq_psn_search *psn_search; +}; + +struct bnxt_qplib_swqe { + /* General */ +#define BNXT_QPLIB_FENCE_WRID 0x46454E43 /* "FENC" */ + u64 wr_id; + u8 reqs_type; + u8 type; +#define BNXT_QPLIB_SWQE_TYPE_SEND 0 +#define BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM 1 +#define BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV 2 +#define BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE 4 +#define BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM 5 +#define BNXT_QPLIB_SWQE_TYPE_RDMA_READ 6 +#define BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP 8 +#define BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD 11 +#define BNXT_QPLIB_SWQE_TYPE_LOCAL_INV 12 +#define BNXT_QPLIB_SWQE_TYPE_FAST_REG_MR 13 +#define BNXT_QPLIB_SWQE_TYPE_REG_MR 13 +#define BNXT_QPLIB_SWQE_TYPE_BIND_MW 14 +#define BNXT_QPLIB_SWQE_TYPE_RECV 128 +#define BNXT_QPLIB_SWQE_TYPE_RECV_RDMA_IMM 129 + u8 flags; +#define BNXT_QPLIB_SWQE_FLAGS_SIGNAL_COMP BIT(0) +#define 
BNXT_QPLIB_SWQE_FLAGS_RD_ATOMIC_FENCE BIT(1) +#define BNXT_QPLIB_SWQE_FLAGS_UC_FENCE BIT(2) +#define BNXT_QPLIB_SWQE_FLAGS_SOLICIT_EVENT BIT(3) +#define BNXT_QPLIB_SWQE_FLAGS_INLINE BIT(4) + struct bnxt_qplib_sge sg_list[BNXT_QPLIB_QP_MAX_SGL]; + int num_sge; + /* Max inline data is 96 bytes */ + u32 inline_len; +#define BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH 96 + u8 inline_data[BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH]; + + union { + /* Send, with imm, inval key */ + struct { + union { + __be32 imm_data; + u32 inv_key; + }; + u32 q_key; + u32 dst_qp; + u16 avid; + } send; + + /* Send Raw Ethernet and QP1 */ + struct { + u16 lflags; + u16 cfa_action; + u32 cfa_meta; + } rawqp1; + + /* RDMA write, with imm, read */ + struct { + union { + __be32 imm_data; + u32 inv_key; + }; + u64 remote_va; + u32 r_key; + } rdma; + + /* Atomic cmp/swap, fetch/add */ + struct { + u64 remote_va; + u32 r_key; + u64 swap_data; + u64 cmp_data; + } atomic; + + /* Local Invalidate */ + struct { + u32 inv_l_key; + } local_inv; + + /* FR-PMR */ + struct { + u8 access_cntl; + u8 pg_sz_log; + bool zero_based; + u32 l_key; + u32 length; + u8 pbl_pg_sz_log; +#define BNXT_QPLIB_SWQE_PAGE_SIZE_4K 0 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_8K 1 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_64K 4 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_256K 6 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_1M 8 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_2M 9 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_4M 10 +#define BNXT_QPLIB_SWQE_PAGE_SIZE_1G 18 + u8 levels; +#define PAGE_SHIFT_4K 12 + __le64 *pbl_ptr; + dma_addr_t pbl_dma_ptr; + u64 *page_list; + u16 page_list_len; + u64 va; + } frmr; + + /* Bind */ + struct { + u8 access_cntl; +#define BNXT_QPLIB_BIND_SWQE_ACCESS_LOCAL_WRITE BIT(0) +#define BNXT_QPLIB_BIND_SWQE_ACCESS_REMOTE_READ BIT(1) +#define BNXT_QPLIB_BIND_SWQE_ACCESS_REMOTE_WRITE BIT(2) +#define BNXT_QPLIB_BIND_SWQE_ACCESS_REMOTE_ATOMIC BIT(3) +#define BNXT_QPLIB_BIND_SWQE_ACCESS_WINDOW_BIND BIT(4) + bool zero_based; + u8 mw_type; + u32 parent_l_key; + u32 r_key; + u64 va; + u32 length; + } bind; + }; +}; + +#define BNXT_QPLIB_MAX_RQE_ENTRY_SIZE sizeof(struct rq_wqe) + +#define RQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_RQE_ENTRY_SIZE) +#define RQE_MAX_IDX_PER_PG (RQE_CNT_PER_PG - 1) +#define RQE_PG(x) (((x) & ~RQE_MAX_IDX_PER_PG) / RQE_CNT_PER_PG) +#define RQE_IDX(x) ((x) & RQE_MAX_IDX_PER_PG) + +struct bnxt_qplib_q { + struct bnxt_qplib_hwq hwq; + struct bnxt_qplib_swq *swq; + struct scatterlist *sglist; + u32 nmap; + u32 max_wqe; + u16 q_full_delta; + u16 max_sge; + u32 psn; + bool condition; + bool single; + bool send_phantom; + u32 phantom_wqe_cnt; + u32 phantom_cqe_cnt; + u32 next_cq_cons; + bool flushed; +}; + +struct bnxt_qplib_qp { + struct bnxt_qplib_pd *pd; + struct bnxt_qplib_dpi *dpi; + u64 qp_handle; +#define BNXT_QPLIB_QP_ID_INVALID 0xFFFFFFFF + u32 id; + u8 type; + u8 sig_type; + u32 modify_flags; + u8 state; + u8 cur_qp_state; + u32 max_inline_data; + u32 mtu; + u8 path_mtu; + bool en_sqd_async_notify; + u16 pkey_index; + u32 qkey; + u32 dest_qp_id; + u8 access; + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u64 wqe_cnt; + u32 min_rnr_timer; + u32 max_rd_atomic; + u32 max_dest_rd_atomic; + u32 dest_qpn; + u8 smac[6]; + u16 vlan_id; + u8 nw_type; + struct bnxt_qplib_ah ah; + +#define BTH_PSN_MASK ((1 << 24) - 1) + /* SQ */ + struct bnxt_qplib_q sq; + /* RQ */ + struct bnxt_qplib_q rq; + /* SRQ */ + struct bnxt_qplib_srq *srq; + /* CQ */ + struct bnxt_qplib_cq *scq; + struct bnxt_qplib_cq *rcq; + /* IRRQ and ORRQ */ + struct bnxt_qplib_hwq irrq; + struct bnxt_qplib_hwq orrq; + 
/* Header buffer for QP1 */ + int sq_hdr_buf_size; + int rq_hdr_buf_size; +/* + * Buffer space for ETH(14), IP or GRH(40), UDP header(8) + * and ib_bth + ib_deth (20). + * Max required is 82 when RoCE V2 is enabled + */ +#define BNXT_QPLIB_MAX_QP1_SQ_HDR_SIZE_V2 86 + /* Ethernet header = 14 */ + /* ib_grh = 40 (provided by MAD) */ + /* ib_bth + ib_deth = 20 */ + /* MAD = 256 (provided by MAD) */ + /* iCRC = 4 */ +#define BNXT_QPLIB_MAX_QP1_RQ_ETH_HDR_SIZE 14 +#define BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2 512 +#define BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV4 20 +#define BNXT_QPLIB_MAX_GRH_HDR_SIZE_IPV6 40 +#define BNXT_QPLIB_MAX_QP1_RQ_BDETH_HDR_SIZE 20 + void *sq_hdr_buf; + dma_addr_t sq_hdr_buf_map; + void *rq_hdr_buf; + dma_addr_t rq_hdr_buf_map; + struct list_head sq_flush; + struct list_head rq_flush; +}; + +#define BNXT_QPLIB_MAX_CQE_ENTRY_SIZE sizeof(struct cq_base) + +#define CQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_CQE_ENTRY_SIZE) +#define CQE_MAX_IDX_PER_PG (CQE_CNT_PER_PG - 1) +#define CQE_PG(x) (((x) & ~CQE_MAX_IDX_PER_PG) / CQE_CNT_PER_PG) +#define CQE_IDX(x) ((x) & CQE_MAX_IDX_PER_PG) + +#define ROCE_CQE_CMP_V 0 +#define CQE_CMP_VALID(hdr, raw_cons, cp_bit) \ + (!!((hdr)->cqe_type_toggle & CQ_BASE_TOGGLE) == \ + !((raw_cons) & (cp_bit))) + +static inline bool bnxt_qplib_queue_full(struct bnxt_qplib_q *qplib_q) +{ + return HWQ_CMP((qplib_q->hwq.prod + qplib_q->q_full_delta), + &qplib_q->hwq) == HWQ_CMP(qplib_q->hwq.cons, + &qplib_q->hwq); +} + +struct bnxt_qplib_cqe { + u8 status; + u8 type; + u8 opcode; + u32 length; + u64 wr_id; + union { + __be32 immdata; + u32 invrkey; + }; + u64 qp_handle; + u64 mr_handle; + u16 flags; + u8 smac[6]; + u32 src_qp; + u16 raweth_qp1_flags; + u16 raweth_qp1_errors; + u16 raweth_qp1_cfa_code; + u32 raweth_qp1_flags2; + u32 raweth_qp1_metadata; + u8 raweth_qp1_payload_offset; + u16 pkey_index; +}; + +#define BNXT_QPLIB_QUEUE_START_PERIOD 0x01 +struct bnxt_qplib_cq { + struct bnxt_qplib_dpi *dpi; + void __iomem *dbr_base; + u32 max_wqe; + u32 id; + u16 count; + u16 period; + struct bnxt_qplib_hwq hwq; + u32 cnq_hw_ring_id; + struct bnxt_qplib_nq *nq; + bool resize_in_progress; + struct scatterlist *sghead; + u32 nmap; + u64 cq_handle; + +#define CQ_RESIZE_WAIT_TIME_MS 500 + unsigned long flags; +#define CQ_FLAGS_RESIZE_IN_PROG 1 + wait_queue_head_t waitq; + struct list_head sqf_head, rqf_head; + atomic_t arm_state; + spinlock_t compl_lock; /* synch CQ handlers */ +/* Locking Notes: + * QP can move to error state from modify_qp, async error event or error + * CQE as part of poll_cq. When QP is moved to error state, it gets added + * to two flush lists, one each for SQ and RQ. + * Each flush list is protected by qplib_cq->flush_lock. Both scq and rcq + * flush_locks should be acquired when QP is moved to error. The control path + * operations(modify_qp and async error events) are synchronized with poll_cq + * using upper level CQ locks (bnxt_re_cq->cq_lock) of both SCQ and RCQ. + * The qplib_cq->flush_lock is required to synchronize two instances of poll_cq + * of the same QP while manipulating the flush list. 
+ */ + spinlock_t flush_lock; /* QP flush management */ +}; + +#define BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE sizeof(struct xrrq_irrq) +#define BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE sizeof(struct xrrq_orrq) +#define IRD_LIMIT_TO_IRRQ_SLOTS(x) (2 * (x) + 2) +#define IRRQ_SLOTS_TO_IRD_LIMIT(s) (((s) >> 1) - 1) +#define ORD_LIMIT_TO_ORRQ_SLOTS(x) ((x) + 1) +#define ORRQ_SLOTS_TO_ORD_LIMIT(s) ((s) - 1) + +#define BNXT_QPLIB_MAX_NQE_ENTRY_SIZE sizeof(struct nq_base) + +#define NQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_NQE_ENTRY_SIZE) +#define NQE_MAX_IDX_PER_PG (NQE_CNT_PER_PG - 1) +#define NQE_PG(x) (((x) & ~NQE_MAX_IDX_PER_PG) / NQE_CNT_PER_PG) +#define NQE_IDX(x) ((x) & NQE_MAX_IDX_PER_PG) + +#define NQE_CMP_VALID(hdr, raw_cons, cp_bit) \ + (!!(le32_to_cpu((hdr)->info63_v[0]) & NQ_BASE_V) == \ + !((raw_cons) & (cp_bit))) + +#define BNXT_QPLIB_NQE_MAX_CNT (128 * 1024) + +#define NQ_CONS_PCI_BAR_REGION 2 +#define NQ_DB_KEY_CP (0x2 << CMPL_DOORBELL_KEY_SFT) +#define NQ_DB_IDX_VALID CMPL_DOORBELL_IDX_VALID +#define NQ_DB_IRQ_DIS CMPL_DOORBELL_MASK +#define NQ_DB_CP_FLAGS_REARM (NQ_DB_KEY_CP | \ + NQ_DB_IDX_VALID) +#define NQ_DB_CP_FLAGS (NQ_DB_KEY_CP | \ + NQ_DB_IDX_VALID | \ + NQ_DB_IRQ_DIS) +#define NQ_DB_REARM(db, raw_cons, cp_bit) \ + writel(NQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db) +#define NQ_DB(db, raw_cons, cp_bit) \ + writel(NQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db) + +struct bnxt_qplib_nq { + struct pci_dev *pdev; + + int vector; + cpumask_t mask; + int budget; + bool requested; + struct tasklet_struct worker; + struct bnxt_qplib_hwq hwq; + + u16 bar_reg; + u16 bar_reg_off; + u16 ring_id; + void __iomem *bar_reg_iomem; + + int (*cqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_cq *cq); + int (*srqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_srq *srq, + u8 event); + struct workqueue_struct *cqn_wq; + char name[32]; +}; + +struct bnxt_qplib_nq_work { + struct work_struct work; + struct bnxt_qplib_nq *nq; + struct bnxt_qplib_cq *cq; +}; + +void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill); +void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq); +int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, + int msix_vector, bool need_init); +int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, + int nq_idx, int msix_vector, int bar_reg_offset, + int (*cqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_cq *cq), + int (*srqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_srq *srq, + u8 event)); +int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_query_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, + struct bnxt_qplib_swqe *wqe); +int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); +int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); +int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); +int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); +int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); +void bnxt_qplib_clean_qp(struct bnxt_qplib_qp *qp); +void bnxt_qplib_free_qp_res(struct bnxt_qplib_res *res, + struct bnxt_qplib_qp *qp); +void *bnxt_qplib_get_qp1_sq_buf(struct 
bnxt_qplib_qp *qp, + struct bnxt_qplib_sge *sge); +void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_sge *sge); +u32 bnxt_qplib_get_rq_prod_index(struct bnxt_qplib_qp *qp); +dma_addr_t bnxt_qplib_get_qp_buf_from_index(struct bnxt_qplib_qp *qp, + u32 index); +void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp); +int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_swqe *wqe); +void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp); +int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_swqe *wqe); +int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq); +int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq); +int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, + int num, struct bnxt_qplib_qp **qp); +bool bnxt_qplib_is_cq_empty(struct bnxt_qplib_cq *cq); +void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type); +void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq); +int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq); +void bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp); +void bnxt_qplib_acquire_cq_locks(struct bnxt_qplib_qp *qp, + unsigned long *flags); +void bnxt_qplib_release_cq_locks(struct bnxt_qplib_qp *qp, + unsigned long *flags); +int bnxt_qplib_process_flush_list(struct bnxt_qplib_cq *cq, + struct bnxt_qplib_cqe *cqe, + int num_cqes); +void bnxt_qplib_flush_cqn_wq(struct bnxt_qplib_qp *qp); +#endif /* __BNXT_QPLIB_FP_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c new file mode 100644 index 0000000..2852d35 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -0,0 +1,767 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Description: RDMA Controller HW interface + */ +#include +#include +#include +#include +#include + +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_rcfw.h" +#include "qplib_sp.h" +#include "qplib_fp.h" + +static void bnxt_qplib_service_creq(unsigned long data); + +/* Hardware communication channel */ +static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) +{ + u16 cbit; + int rc; + + cbit = cookie % RCFW_MAX_OUTSTANDING_CMD; + rc = wait_event_timeout(rcfw->waitq, + !test_bit(cbit, rcfw->cmdq_bitmap), + msecs_to_jiffies(RCFW_CMD_WAIT_TIME_MS)); + return rc ? 0 : -ETIMEDOUT; +}; + +static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) +{ + u32 count = RCFW_BLOCKED_CMD_WAIT_COUNT; + u16 cbit; + + cbit = cookie % RCFW_MAX_OUTSTANDING_CMD; + if (!test_bit(cbit, rcfw->cmdq_bitmap)) + goto done; + do { + mdelay(1); /* 1m sec */ + bnxt_qplib_service_creq((unsigned long)rcfw); + } while (test_bit(cbit, rcfw->cmdq_bitmap) && --count); +done: + return count ? 0 : -ETIMEDOUT; +}; + +static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, + struct creq_base *resp, void *sb, u8 is_block) +{ + struct bnxt_qplib_cmdqe *cmdqe, **cmdq_ptr; + struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq; + struct bnxt_qplib_crsq *crsqe; + u32 sw_prod, cmdq_prod; + unsigned long flags; + u32 size, opcode; + u16 cookie, cbit; + u8 *preq; + + opcode = req->opcode; + if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) && + (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC && + opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW && + opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: RCFW not initialized, reject opcode 0x%x", + opcode); + return -EINVAL; + } + + if (test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) && + opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { + dev_err(&rcfw->pdev->dev, "QPLIB: RCFW already initialized!"); + return -EINVAL; + } + + if (test_bit(FIRMWARE_TIMED_OUT, &rcfw->flags)) + return -ETIMEDOUT; + + /* Cmdq are in 16-byte units, each request can consume 1 or more + * cmdqe + */ + spin_lock_irqsave(&cmdq->lock, flags); + if (req->cmd_size >= HWQ_FREE_SLOTS(cmdq)) { + dev_err(&rcfw->pdev->dev, "QPLIB: RCFW: CMDQ is full!"); + spin_unlock_irqrestore(&cmdq->lock, flags); + return -EAGAIN; + } + + + cookie = rcfw->seq_num & RCFW_MAX_COOKIE_VALUE; + cbit = cookie % RCFW_MAX_OUTSTANDING_CMD; + if (is_block) + cookie |= RCFW_CMD_IS_BLOCKING; + + set_bit(cbit, rcfw->cmdq_bitmap); + req->cookie = cpu_to_le16(cookie); + crsqe = &rcfw->crsqe_tbl[cbit]; + if (crsqe->resp) { + spin_unlock_irqrestore(&cmdq->lock, flags); + return -EBUSY; + } + memset(resp, 0, sizeof(*resp)); + crsqe->resp = (struct creq_qp_event *)resp; + crsqe->resp->cookie = req->cookie; + crsqe->req_size = req->cmd_size; + if (req->resp_size && sb) { + struct bnxt_qplib_rcfw_sbuf *sbuf = sb; + + req->resp_addr = cpu_to_le64(sbuf->dma_addr); + req->resp_size = (sbuf->size + BNXT_QPLIB_CMDQE_UNITS - 1) / + BNXT_QPLIB_CMDQE_UNITS; + } + + cmdq_ptr = (struct bnxt_qplib_cmdqe **)cmdq->pbl_ptr; + preq = (u8 *)req; + size = req->cmd_size * BNXT_QPLIB_CMDQE_UNITS; + do { + /* Locate the next cmdq slot */ + sw_prod = HWQ_CMP(cmdq->prod, cmdq); + cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod)][get_cmdq_idx(sw_prod)]; + if (!cmdqe) { + dev_err(&rcfw->pdev->dev, + "QPLIB: RCFW request failed with no cmdqe!"); + goto done; + } + /* Copy a segment of the req cmd to the cmdq */ + memset(cmdqe, 0, sizeof(*cmdqe)); + memcpy(cmdqe, preq, min_t(u32, size, sizeof(*cmdqe))); + preq += 
min_t(u32, size, sizeof(*cmdqe)); + size -= min_t(u32, size, sizeof(*cmdqe)); + cmdq->prod++; + rcfw->seq_num++; + } while (size > 0); + + rcfw->seq_num++; + + cmdq_prod = cmdq->prod; + if (test_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags)) { + /* The very first doorbell write + * is required to set this flag + * which prompts the FW to reset + * its internal pointers + */ + cmdq_prod |= BIT(FIRMWARE_FIRST_FLAG); + clear_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags); + } + + /* ring CMDQ DB */ + wmb(); + writel(cmdq_prod, rcfw->cmdq_bar_reg_iomem + + rcfw->cmdq_bar_reg_prod_off); + writel(RCFW_CMDQ_TRIG_VAL, rcfw->cmdq_bar_reg_iomem + + rcfw->cmdq_bar_reg_trig_off); +done: + spin_unlock_irqrestore(&cmdq->lock, flags); + /* Return the CREQ response pointer */ + return 0; +} + +int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, + struct cmdq_base *req, + struct creq_base *resp, + void *sb, u8 is_block) +{ + struct creq_qp_event *evnt = (struct creq_qp_event *)resp; + u16 cookie; + u8 opcode, retry_cnt = 0xFF; + int rc = 0; + + do { + opcode = req->opcode; + rc = __send_message(rcfw, req, resp, sb, is_block); + cookie = le16_to_cpu(req->cookie) & RCFW_MAX_COOKIE_VALUE; + if (!rc) + break; + + if (!retry_cnt || (rc != -EAGAIN && rc != -EBUSY)) { + /* send failed */ + dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x send failed", + cookie, opcode); + return rc; + } + is_block ? mdelay(1) : usleep_range(500, 1000); + + } while (retry_cnt--); + + if (is_block) + rc = __block_for_resp(rcfw, cookie); + else + rc = __wait_for_resp(rcfw, cookie); + if (rc) { + /* timed out */ + dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x timedout (%d)msec", + cookie, opcode, RCFW_CMD_WAIT_TIME_MS); + set_bit(FIRMWARE_TIMED_OUT, &rcfw->flags); + return rc; + } + + if (evnt->status) { + /* failed with status */ + dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x status %#x", + cookie, opcode, evnt->status); + rc = -EFAULT; + } + + return rc; +} +/* Completions */ +static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw, + struct creq_func_event *func_event) +{ + switch (func_event->event) { + case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CQ_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TQM_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCS_ERROR: + /* SRQ ctx error, call srq_handler?? + * But there's no SRQ handle! 
+ */ + break; + case CREQ_FUNC_EVENT_EVENT_CFCC_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_CFCM_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_TIM_ERROR: + break; + case CREQ_FUNC_EVENT_EVENT_VF_COMM_REQUEST: + break; + case CREQ_FUNC_EVENT_EVENT_RESOURCE_EXHAUSTED: + break; + default: + return -EINVAL; + } + return 0; +} + +static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, + struct creq_qp_event *qp_event) +{ + struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq; + struct creq_qp_error_notification *err_event; + struct bnxt_qplib_crsq *crsqe; + unsigned long flags; + struct bnxt_qplib_qp *qp; + u16 cbit, blocked = 0; + u16 cookie; + __le16 mcookie; + u32 qp_id; + + switch (qp_event->event) { + case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION: + err_event = (struct creq_qp_error_notification *)qp_event; + qp_id = le32_to_cpu(err_event->xid); + qp = rcfw->qp_tbl[qp_id].qp_handle; + dev_dbg(&rcfw->pdev->dev, + "QPLIB: Received QP error notification"); + dev_dbg(&rcfw->pdev->dev, + "QPLIB: qpid 0x%x, req_err=0x%x, resp_err=0x%x\n", + qp_id, err_event->req_err_state_reason, + err_event->res_err_state_reason); + if (!qp) + break; + bnxt_qplib_mark_qp_error(qp); + rcfw->aeq_handler(rcfw, qp_event, qp); + break; + default: + /* Command Response */ + spin_lock_irqsave(&cmdq->lock, flags); + cookie = le16_to_cpu(qp_event->cookie); + mcookie = qp_event->cookie; + blocked = cookie & RCFW_CMD_IS_BLOCKING; + cookie &= RCFW_MAX_COOKIE_VALUE; + cbit = cookie % RCFW_MAX_OUTSTANDING_CMD; + crsqe = &rcfw->crsqe_tbl[cbit]; + if (crsqe->resp && + crsqe->resp->cookie == mcookie) { + memcpy(crsqe->resp, qp_event, sizeof(*qp_event)); + crsqe->resp = NULL; + } else { + dev_err(&rcfw->pdev->dev, + "QPLIB: CMD %s resp->cookie = %#x, evnt->cookie = %#x", + crsqe->resp ? "mismatch" : "collision", + crsqe->resp ? crsqe->resp->cookie : 0, mcookie); + } + if (!test_and_clear_bit(cbit, rcfw->cmdq_bitmap)) + dev_warn(&rcfw->pdev->dev, + "QPLIB: CMD bit %d was not requested", cbit); + cmdq->cons += crsqe->req_size; + crsqe->req_size = 0; + + if (!blocked) + wake_up(&rcfw->waitq); + spin_unlock_irqrestore(&cmdq->lock, flags); + } + return 0; +} + +/* SP - CREQ Completion handlers */ +static void bnxt_qplib_service_creq(unsigned long data) +{ + struct bnxt_qplib_rcfw *rcfw = (struct bnxt_qplib_rcfw *)data; + struct bnxt_qplib_hwq *creq = &rcfw->creq; + struct creq_base *creqe, **creq_ptr; + u32 sw_cons, raw_cons; + unsigned long flags; + u32 type, budget = CREQ_ENTRY_POLL_BUDGET; + + /* Service the CREQ until budget is over */ + spin_lock_irqsave(&creq->lock, flags); + raw_cons = creq->cons; + while (budget > 0) { + sw_cons = HWQ_CMP(raw_cons, creq); + creq_ptr = (struct creq_base **)creq->pbl_ptr; + creqe = &creq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)]; + if (!CREQ_CMP_VALID(creqe, raw_cons, creq->max_elements)) + break; + /* The valid test of the entry must be done first before + * reading any further. 
+ */ + dma_rmb(); + + type = creqe->type & CREQ_BASE_TYPE_MASK; + switch (type) { + case CREQ_BASE_TYPE_QP_EVENT: + bnxt_qplib_process_qp_event + (rcfw, (struct creq_qp_event *)creqe); + rcfw->creq_qp_event_processed++; + break; + case CREQ_BASE_TYPE_FUNC_EVENT: + if (!bnxt_qplib_process_func_event + (rcfw, (struct creq_func_event *)creqe)) + rcfw->creq_func_event_processed++; + else + dev_warn + (&rcfw->pdev->dev, "QPLIB:aeqe:%#x Not handled", + type); + break; + default: + dev_warn(&rcfw->pdev->dev, "QPLIB: creqe with "); + dev_warn(&rcfw->pdev->dev, + "QPLIB: op_event = 0x%x not handled", type); + break; + } + raw_cons++; + budget--; + } + + if (creq->cons != raw_cons) { + creq->cons = raw_cons; + CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, raw_cons, + creq->max_elements); + } + spin_unlock_irqrestore(&creq->lock, flags); +} + +static irqreturn_t bnxt_qplib_creq_irq(int irq, void *dev_instance) +{ + struct bnxt_qplib_rcfw *rcfw = dev_instance; + struct bnxt_qplib_hwq *creq = &rcfw->creq; + struct creq_base **creq_ptr; + u32 sw_cons; + + /* Prefetch the CREQ element */ + sw_cons = HWQ_CMP(creq->cons, creq); + creq_ptr = (struct creq_base **)rcfw->creq.pbl_ptr; + prefetch(&creq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)]); + + tasklet_schedule(&rcfw->worker); + + return IRQ_HANDLED; +} + +/* RCFW */ +int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw) +{ + struct cmdq_deinitialize_fw req; + struct creq_deinitialize_fw_resp resp; + u16 cmd_flags = 0; + int rc; + + RCFW_CMD_PREP(req, DEINITIALIZE_FW, cmd_flags); + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + NULL, 0); + if (rc) + return rc; + + clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags); + return 0; +} + +static int __get_pbl_pg_idx(struct bnxt_qplib_pbl *pbl) +{ + return (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? + CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_1G : + CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_4K); +} + +int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_ctx *ctx, int is_virtfn) +{ + struct cmdq_initialize_fw req; + struct creq_initialize_fw_resp resp; + u16 cmd_flags = 0, level; + int rc; + + RCFW_CMD_PREP(req, INITIALIZE_FW, cmd_flags); + /* Supply (log-base-2-of-host-page-size - base-page-shift) + * to bono to adjust the doorbell page sizes. + */ + req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT - + RCFW_DBR_BASE_PAGE_SHIFT); + /* + * VFs need not setup the HW context area, PF + * shall setup this area for VF. 
Skipping the + * HW programming + */ + if (is_virtfn) + goto skip_ctx_setup; + + level = ctx->qpc_tbl.level; + req.qpc_pg_size_qpc_lvl = (level << CMDQ_INITIALIZE_FW_QPC_LVL_SFT) | + __get_pbl_pg_idx(&ctx->qpc_tbl.pbl[level]); + level = ctx->mrw_tbl.level; + req.mrw_pg_size_mrw_lvl = (level << CMDQ_INITIALIZE_FW_MRW_LVL_SFT) | + __get_pbl_pg_idx(&ctx->mrw_tbl.pbl[level]); + level = ctx->srqc_tbl.level; + req.srq_pg_size_srq_lvl = (level << CMDQ_INITIALIZE_FW_SRQ_LVL_SFT) | + __get_pbl_pg_idx(&ctx->srqc_tbl.pbl[level]); + level = ctx->cq_tbl.level; + req.cq_pg_size_cq_lvl = (level << CMDQ_INITIALIZE_FW_CQ_LVL_SFT) | + __get_pbl_pg_idx(&ctx->cq_tbl.pbl[level]); + level = ctx->srqc_tbl.level; + req.srq_pg_size_srq_lvl = (level << CMDQ_INITIALIZE_FW_SRQ_LVL_SFT) | + __get_pbl_pg_idx(&ctx->srqc_tbl.pbl[level]); + level = ctx->cq_tbl.level; + req.cq_pg_size_cq_lvl = (level << CMDQ_INITIALIZE_FW_CQ_LVL_SFT) | + __get_pbl_pg_idx(&ctx->cq_tbl.pbl[level]); + level = ctx->tim_tbl.level; + req.tim_pg_size_tim_lvl = (level << CMDQ_INITIALIZE_FW_TIM_LVL_SFT) | + __get_pbl_pg_idx(&ctx->tim_tbl.pbl[level]); + level = ctx->tqm_pde_level; + req.tqm_pg_size_tqm_lvl = (level << CMDQ_INITIALIZE_FW_TQM_LVL_SFT) | + __get_pbl_pg_idx(&ctx->tqm_pde.pbl[level]); + + req.qpc_page_dir = + cpu_to_le64(ctx->qpc_tbl.pbl[PBL_LVL_0].pg_map_arr[0]); + req.mrw_page_dir = + cpu_to_le64(ctx->mrw_tbl.pbl[PBL_LVL_0].pg_map_arr[0]); + req.srq_page_dir = + cpu_to_le64(ctx->srqc_tbl.pbl[PBL_LVL_0].pg_map_arr[0]); + req.cq_page_dir = + cpu_to_le64(ctx->cq_tbl.pbl[PBL_LVL_0].pg_map_arr[0]); + req.tim_page_dir = + cpu_to_le64(ctx->tim_tbl.pbl[PBL_LVL_0].pg_map_arr[0]); + req.tqm_page_dir = + cpu_to_le64(ctx->tqm_pde.pbl[PBL_LVL_0].pg_map_arr[0]); + + req.number_of_qp = cpu_to_le32(ctx->qpc_tbl.max_elements); + req.number_of_mrw = cpu_to_le32(ctx->mrw_tbl.max_elements); + req.number_of_srq = cpu_to_le32(ctx->srqc_tbl.max_elements); + req.number_of_cq = cpu_to_le32(ctx->cq_tbl.max_elements); + + req.max_qp_per_vf = cpu_to_le32(ctx->vf_res.max_qp_per_vf); + req.max_mrw_per_vf = cpu_to_le32(ctx->vf_res.max_mrw_per_vf); + req.max_srq_per_vf = cpu_to_le32(ctx->vf_res.max_srq_per_vf); + req.max_cq_per_vf = cpu_to_le32(ctx->vf_res.max_cq_per_vf); + req.max_gid_per_vf = cpu_to_le32(ctx->vf_res.max_gid_per_vf); + +skip_ctx_setup: + req.stat_ctx_id = cpu_to_le32(ctx->stats.fw_id); + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + NULL, 0); + if (rc) + return rc; + set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags); + return 0; +} + +void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) +{ + kfree(rcfw->qp_tbl); + kfree(rcfw->crsqe_tbl); + bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->cmdq); + bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->creq); + rcfw->pdev = NULL; +} + +int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, + struct bnxt_qplib_rcfw *rcfw, + int qp_tbl_sz) +{ + rcfw->pdev = pdev; + rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT; + if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL, 0, + &rcfw->creq.max_elements, + BNXT_QPLIB_CREQE_UNITS, 0, PAGE_SIZE, + HWQ_TYPE_L2_CMPL)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: HW channel CREQ allocation failed"); + goto fail; + } + rcfw->cmdq.max_elements = BNXT_QPLIB_CMDQE_MAX_CNT; + if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->cmdq, NULL, 0, + &rcfw->cmdq.max_elements, + BNXT_QPLIB_CMDQE_UNITS, 0, PAGE_SIZE, + HWQ_TYPE_CTX)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: HW channel CMDQ allocation failed"); + goto fail; + } + + rcfw->crsqe_tbl = 
kcalloc(rcfw->cmdq.max_elements, + sizeof(*rcfw->crsqe_tbl), GFP_KERNEL); + if (!rcfw->crsqe_tbl) + goto fail; + + rcfw->qp_tbl_size = qp_tbl_sz; + rcfw->qp_tbl = kcalloc(qp_tbl_sz, sizeof(struct bnxt_qplib_qp_node), + GFP_KERNEL); + if (!rcfw->qp_tbl) + goto fail; + + return 0; + +fail: + bnxt_qplib_free_rcfw_channel(rcfw); + return -ENOMEM; +} + +void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill) +{ + tasklet_disable(&rcfw->worker); + /* Mask h/w interrupts */ + CREQ_DB(rcfw->creq_bar_reg_iomem, rcfw->creq.cons, + rcfw->creq.max_elements); + /* Sync with last running IRQ-handler */ + synchronize_irq(rcfw->vector); + if (kill) + tasklet_kill(&rcfw->worker); + + if (rcfw->requested) { + free_irq(rcfw->vector, rcfw); + rcfw->requested = false; + } +} + +void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) +{ + unsigned long indx; + + bnxt_qplib_rcfw_stop_irq(rcfw, true); + + if (rcfw->cmdq_bar_reg_iomem) + iounmap(rcfw->cmdq_bar_reg_iomem); + rcfw->cmdq_bar_reg_iomem = NULL; + + if (rcfw->creq_bar_reg_iomem) + iounmap(rcfw->creq_bar_reg_iomem); + rcfw->creq_bar_reg_iomem = NULL; + + indx = find_first_bit(rcfw->cmdq_bitmap, rcfw->bmap_size); + if (indx != rcfw->bmap_size) + dev_err(&rcfw->pdev->dev, + "QPLIB: disabling RCFW with pending cmd-bit %lx", indx); + kfree(rcfw->cmdq_bitmap); + rcfw->bmap_size = 0; + + rcfw->aeq_handler = NULL; + rcfw->vector = 0; +} + +int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, + bool need_init) +{ + int rc; + + if (rcfw->requested) + return -EFAULT; + + rcfw->vector = msix_vector; + if (need_init) + tasklet_init(&rcfw->worker, + bnxt_qplib_service_creq, (unsigned long)rcfw); + else + tasklet_enable(&rcfw->worker); + rc = request_irq(rcfw->vector, bnxt_qplib_creq_irq, 0, + "bnxt_qplib_creq", rcfw); + if (rc) + return rc; + rcfw->requested = true; + CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, rcfw->creq.cons, + rcfw->creq.max_elements); + + return 0; +} + +int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, + struct bnxt_qplib_rcfw *rcfw, + int msix_vector, + int cp_bar_reg_off, int virt_fn, + int (*aeq_handler)(struct bnxt_qplib_rcfw *, + void *, void *)) +{ + resource_size_t res_base; + struct cmdq_init init; + u16 bmap_size; + int rc; + + /* General */ + rcfw->seq_num = 0; + set_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags); + bmap_size = BITS_TO_LONGS(RCFW_MAX_OUTSTANDING_CMD * + sizeof(unsigned long)); + rcfw->cmdq_bitmap = kzalloc(bmap_size, GFP_KERNEL); + if (!rcfw->cmdq_bitmap) + return -ENOMEM; + rcfw->bmap_size = bmap_size; + + /* CMDQ */ + rcfw->cmdq_bar_reg = RCFW_COMM_PCI_BAR_REGION; + res_base = pci_resource_start(pdev, rcfw->cmdq_bar_reg); + if (!res_base) + return -ENOMEM; + + rcfw->cmdq_bar_reg_iomem = ioremap_nocache(res_base + + RCFW_COMM_BASE_OFFSET, + RCFW_COMM_SIZE); + if (!rcfw->cmdq_bar_reg_iomem) { + dev_err(&rcfw->pdev->dev, + "QPLIB: CMDQ BAR region %d mapping failed", + rcfw->cmdq_bar_reg); + return -ENOMEM; + } + + rcfw->cmdq_bar_reg_prod_off = virt_fn ? 
RCFW_VF_COMM_PROD_OFFSET : + RCFW_PF_COMM_PROD_OFFSET; + + rcfw->cmdq_bar_reg_trig_off = RCFW_COMM_TRIG_OFFSET; + + /* CREQ */ + rcfw->creq_bar_reg = RCFW_COMM_CONS_PCI_BAR_REGION; + res_base = pci_resource_start(pdev, rcfw->creq_bar_reg); + if (!res_base) + dev_err(&rcfw->pdev->dev, + "QPLIB: CREQ BAR region %d resc start is 0!", + rcfw->creq_bar_reg); + rcfw->creq_bar_reg_iomem = ioremap_nocache(res_base + cp_bar_reg_off, + 4); + if (!rcfw->creq_bar_reg_iomem) { + dev_err(&rcfw->pdev->dev, + "QPLIB: CREQ BAR region %d mapping failed", + rcfw->creq_bar_reg); + return -ENOMEM; + } + rcfw->creq_qp_event_processed = 0; + rcfw->creq_func_event_processed = 0; + + if (aeq_handler) + rcfw->aeq_handler = aeq_handler; + init_waitqueue_head(&rcfw->waitq); + + rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_vector, true); + if (rc) { + dev_err(&rcfw->pdev->dev, + "QPLIB: Failed to request IRQ for CREQ rc = 0x%x", rc); + bnxt_qplib_disable_rcfw_channel(rcfw); + return rc; + } + + init.cmdq_pbl = cpu_to_le64(rcfw->cmdq.pbl[PBL_LVL_0].pg_map_arr[0]); + init.cmdq_size_cmdq_lvl = cpu_to_le16( + ((BNXT_QPLIB_CMDQE_MAX_CNT << CMDQ_INIT_CMDQ_SIZE_SFT) & + CMDQ_INIT_CMDQ_SIZE_MASK) | + ((rcfw->cmdq.level << CMDQ_INIT_CMDQ_LVL_SFT) & + CMDQ_INIT_CMDQ_LVL_MASK)); + init.creq_ring_id = cpu_to_le16(rcfw->creq_ring_id); + + /* Write to the Bono mailbox register */ + __iowrite32_copy(rcfw->cmdq_bar_reg_iomem, &init, sizeof(init) / 4); + return 0; +} + +struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf( + struct bnxt_qplib_rcfw *rcfw, + u32 size) +{ + struct bnxt_qplib_rcfw_sbuf *sbuf; + + sbuf = kzalloc(sizeof(*sbuf), GFP_ATOMIC); + if (!sbuf) + return NULL; + + sbuf->size = size; + sbuf->sb = dma_zalloc_coherent(&rcfw->pdev->dev, sbuf->size, + &sbuf->dma_addr, GFP_ATOMIC); + if (!sbuf->sb) + goto bail; + + return sbuf; +bail: + kfree(sbuf); + return NULL; +} + +void bnxt_qplib_rcfw_free_sbuf(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_rcfw_sbuf *sbuf) +{ + if (sbuf->sb) + dma_free_coherent(&rcfw->pdev->dev, sbuf->size, + sbuf->sb, sbuf->dma_addr); + kfree(sbuf); +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h new file mode 100644 index 0000000..46416df --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -0,0 +1,222 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: RDMA Controller HW interface (header) + */ + +#ifndef __BNXT_QPLIB_RCFW_H__ +#define __BNXT_QPLIB_RCFW_H__ + +#define RCFW_CMDQ_TRIG_VAL 1 +#define RCFW_COMM_PCI_BAR_REGION 0 +#define RCFW_COMM_CONS_PCI_BAR_REGION 2 +#define RCFW_COMM_BASE_OFFSET 0x600 +#define RCFW_PF_COMM_PROD_OFFSET 0xc +#define RCFW_VF_COMM_PROD_OFFSET 0xc +#define RCFW_COMM_TRIG_OFFSET 0x100 +#define RCFW_COMM_SIZE 0x104 + +#define RCFW_DBR_PCI_BAR_REGION 2 +#define RCFW_DBR_BASE_PAGE_SHIFT 12 + +#define RCFW_CMD_PREP(req, CMD, cmd_flags) \ + do { \ + memset(&(req), 0, sizeof((req))); \ + (req).opcode = CMDQ_BASE_OPCODE_##CMD; \ + (req).cmd_size = (sizeof((req)) + \ + BNXT_QPLIB_CMDQE_UNITS - 1) / \ + BNXT_QPLIB_CMDQE_UNITS; \ + (req).flags = cpu_to_le16(cmd_flags); \ + } while (0) + +#define RCFW_CMD_WAIT_TIME_MS 20000 /* 20 Seconds timeout */ + +/* CMDQ elements */ +#define BNXT_QPLIB_CMDQE_MAX_CNT 256 +#define BNXT_QPLIB_CMDQE_UNITS sizeof(struct bnxt_qplib_cmdqe) +#define BNXT_QPLIB_CMDQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_CMDQE_UNITS) + +#define MAX_CMDQ_IDX (BNXT_QPLIB_CMDQE_MAX_CNT - 1) +#define MAX_CMDQ_IDX_PER_PG (BNXT_QPLIB_CMDQE_CNT_PER_PG - 1) + +#define RCFW_MAX_OUTSTANDING_CMD BNXT_QPLIB_CMDQE_MAX_CNT +#define RCFW_MAX_COOKIE_VALUE 0x7FFF +#define RCFW_CMD_IS_BLOCKING 0x8000 +#define RCFW_BLOCKED_CMD_WAIT_COUNT 0x4E20 + +/* Cmdq contains a fix number of a 16-Byte slots */ +struct bnxt_qplib_cmdqe { + u8 data[16]; +}; + +static inline u32 get_cmdq_pg(u32 val) +{ + return (val & ~MAX_CMDQ_IDX_PER_PG) / BNXT_QPLIB_CMDQE_CNT_PER_PG; +} + +static inline u32 get_cmdq_idx(u32 val) +{ + return val & MAX_CMDQ_IDX_PER_PG; +} + +/* Crsq buf is 1024-Byte */ +struct bnxt_qplib_crsbe { + u8 data[1024]; +}; + +/* CREQ */ +/* Allocate 1 per QP for async error notification for now */ +#define BNXT_QPLIB_CREQE_MAX_CNT (64 * 1024) +#define BNXT_QPLIB_CREQE_UNITS 16 /* 16-Bytes per prod unit */ +#define BNXT_QPLIB_CREQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_CREQE_UNITS) + +#define MAX_CREQ_IDX (BNXT_QPLIB_CREQE_MAX_CNT - 1) +#define MAX_CREQ_IDX_PER_PG (BNXT_QPLIB_CREQE_CNT_PER_PG - 1) + +static inline u32 get_creq_pg(u32 val) +{ + return (val & ~MAX_CREQ_IDX_PER_PG) / BNXT_QPLIB_CREQE_CNT_PER_PG; +} + +static inline u32 get_creq_idx(u32 val) +{ + return val & MAX_CREQ_IDX_PER_PG; +} + +#define BNXT_QPLIB_CREQE_PER_PG (PAGE_SIZE / sizeof(struct creq_base)) + +#define CREQ_CMP_VALID(hdr, raw_cons, cp_bit) \ + (!!((hdr)->v & CREQ_BASE_V) == \ + !((raw_cons) & (cp_bit))) + +#define CREQ_DB_KEY_CP (0x2 << CMPL_DOORBELL_KEY_SFT) +#define CREQ_DB_IDX_VALID CMPL_DOORBELL_IDX_VALID +#define CREQ_DB_IRQ_DIS CMPL_DOORBELL_MASK +#define CREQ_DB_CP_FLAGS_REARM (CREQ_DB_KEY_CP | \ + CREQ_DB_IDX_VALID) +#define CREQ_DB_CP_FLAGS (CREQ_DB_KEY_CP | \ + 
CREQ_DB_IDX_VALID | \ + CREQ_DB_IRQ_DIS) +#define CREQ_DB_REARM(db, raw_cons, cp_bit) \ + writel(CREQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db) +#define CREQ_DB(db, raw_cons, cp_bit) \ + writel(CREQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db) + +#define CREQ_ENTRY_POLL_BUDGET 0x100 + +/* HWQ */ + +struct bnxt_qplib_crsq { + struct creq_qp_event *resp; + u32 req_size; +}; + +struct bnxt_qplib_rcfw_sbuf { + void *sb; + dma_addr_t dma_addr; + u32 size; +}; + +struct bnxt_qplib_qp_node { + u32 qp_id; /* QP id */ + void *qp_handle; /* ptr to qplib_qp */ +}; + +/* RCFW Communication Channels */ +struct bnxt_qplib_rcfw { + struct pci_dev *pdev; + int vector; + struct tasklet_struct worker; + bool requested; + unsigned long *cmdq_bitmap; + u32 bmap_size; + unsigned long flags; +#define FIRMWARE_INITIALIZED_FLAG 0 +#define FIRMWARE_FIRST_FLAG 31 +#define FIRMWARE_TIMED_OUT 3 + wait_queue_head_t waitq; + int (*aeq_handler)(struct bnxt_qplib_rcfw *, + void *, void *); + u32 seq_num; + + /* Bar region info */ + void __iomem *cmdq_bar_reg_iomem; + u16 cmdq_bar_reg; + u16 cmdq_bar_reg_prod_off; + u16 cmdq_bar_reg_trig_off; + u16 creq_ring_id; + u16 creq_bar_reg; + void __iomem *creq_bar_reg_iomem; + + /* Cmd-Resp and Async Event notification queue */ + struct bnxt_qplib_hwq creq; + u64 creq_qp_event_processed; + u64 creq_func_event_processed; + + /* Actual Cmd and Resp Queues */ + struct bnxt_qplib_hwq cmdq; + struct bnxt_qplib_crsq *crsqe_tbl; + int qp_tbl_size; + struct bnxt_qplib_qp_node *qp_tbl; +}; + +void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); +int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, + struct bnxt_qplib_rcfw *rcfw, int qp_tbl_sz); +void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill); +void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); +int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, + bool need_init); +int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, + struct bnxt_qplib_rcfw *rcfw, + int msix_vector, + int cp_bar_reg_off, int virt_fn, + int (*aeq_handler)(struct bnxt_qplib_rcfw *, + void *aeqe, void *obj)); + +struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf( + struct bnxt_qplib_rcfw *rcfw, + u32 size); +void bnxt_qplib_rcfw_free_sbuf(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_rcfw_sbuf *sbuf); +int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, + struct cmdq_base *req, struct creq_base *resp, + void *sbuf, u8 is_block); + +int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw); +int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_ctx *ctx, int is_virtfn); +void bnxt_qplib_mark_qp_error(void *qp_handle); +#endif /* __BNXT_QPLIB_RCFW_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c new file mode 100644 index 0000000..539a5d4 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -0,0 +1,830 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: QPLib resource manager + */ + +#include +#include +#include +#include +#include +#include +#include "roce_hsi.h" +#include "qplib_res.h" +#include "qplib_sp.h" +#include "qplib_rcfw.h" + +static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_stats *stats); +static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_stats *stats); + +/* PBL */ +static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, + bool is_umem) +{ + int i; + + if (!is_umem) { + for (i = 0; i < pbl->pg_count; i++) { + if (pbl->pg_arr[i]) + dma_free_coherent(&pdev->dev, pbl->pg_size, + (void *)((unsigned long) + pbl->pg_arr[i] & + PAGE_MASK), + pbl->pg_map_arr[i]); + else + dev_warn(&pdev->dev, + "QPLIB: PBL free pg_arr[%d] empty?!", + i); + pbl->pg_arr[i] = NULL; + } + } + kfree(pbl->pg_arr); + pbl->pg_arr = NULL; + kfree(pbl->pg_map_arr); + pbl->pg_map_arr = NULL; + pbl->pg_count = 0; + pbl->pg_size = 0; +} + +static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, + struct scatterlist *sghead, u32 pages, u32 pg_size) +{ + struct scatterlist *sg; + bool is_umem = false; + int i; + + /* page ptr arrays */ + pbl->pg_arr = kcalloc(pages, sizeof(void *), GFP_KERNEL); + if (!pbl->pg_arr) + return -ENOMEM; + + pbl->pg_map_arr = kcalloc(pages, sizeof(dma_addr_t), GFP_KERNEL); + if (!pbl->pg_map_arr) { + kfree(pbl->pg_arr); + pbl->pg_arr = NULL; + return -ENOMEM; + } + pbl->pg_count = 0; + pbl->pg_size = pg_size; + + if (!sghead) { + for (i = 0; i < pages; i++) { + pbl->pg_arr[i] = dma_zalloc_coherent(&pdev->dev, + pbl->pg_size, + &pbl->pg_map_arr[i], + GFP_KERNEL); + if (!pbl->pg_arr[i]) + goto fail; + pbl->pg_count++; + } + } else { + i = 0; + is_umem = true; + for_each_sg(sghead, sg, pages, i) { + pbl->pg_map_arr[i] = sg_dma_address(sg); + pbl->pg_arr[i] = sg_virt(sg); + if (!pbl->pg_arr[i]) + goto fail; + + pbl->pg_count++; + } + } + + return 0; + +fail: + __free_pbl(pdev, pbl, is_umem); + return -ENOMEM; +} + +/* HWQ */ +void 
bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq) +{ + int i; + + if (!hwq->max_elements) + return; + if (hwq->level >= PBL_LVL_MAX) + return; + + for (i = 0; i < hwq->level + 1; i++) { + if (i == hwq->level) + __free_pbl(pdev, &hwq->pbl[i], hwq->is_user); + else + __free_pbl(pdev, &hwq->pbl[i], false); + } + + hwq->level = PBL_LVL_MAX; + hwq->max_elements = 0; + hwq->element_size = 0; + hwq->prod = 0; + hwq->cons = 0; + hwq->cp_bit = 0; +} + +/* All HWQs are power of 2 in size */ +int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq, + struct scatterlist *sghead, int nmap, + u32 *elements, u32 element_size, u32 aux, + u32 pg_size, enum bnxt_qplib_hwq_type hwq_type) +{ + u32 pages, slots, size, aux_pages = 0, aux_size = 0; + dma_addr_t *src_phys_ptr, **dst_virt_ptr; + int i, rc; + + hwq->level = PBL_LVL_MAX; + + slots = roundup_pow_of_two(*elements); + if (aux) { + aux_size = roundup_pow_of_two(aux); + aux_pages = (slots * aux_size) / pg_size; + if ((slots * aux_size) % pg_size) + aux_pages++; + } + size = roundup_pow_of_two(element_size); + + if (!sghead) { + hwq->is_user = false; + pages = (slots * size) / pg_size + aux_pages; + if ((slots * size) % pg_size) + pages++; + if (!pages) + return -EINVAL; + } else { + hwq->is_user = true; + pages = nmap; + } + + /* Alloc the 1st memory block; can be a PDL/PTL/PBL */ + if (sghead && (pages == MAX_PBL_LVL_0_PGS)) + rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], sghead, + pages, pg_size); + else + rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], NULL, 1, pg_size); + if (rc) + goto fail; + + hwq->level = PBL_LVL_0; + + if (pages > MAX_PBL_LVL_0_PGS) { + if (pages > MAX_PBL_LVL_1_PGS) { + /* 2 levels of indirection */ + rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], NULL, + MAX_PBL_LVL_1_PGS_FOR_LVL_2, pg_size); + if (rc) + goto fail; + /* Fill in lvl0 PBL */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr; + src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr; + for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++) + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | PTU_PDE_VALID; + hwq->level = PBL_LVL_1; + + rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_2], sghead, + pages, pg_size); + if (rc) + goto fail; + + /* Fill in lvl1 PBL */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[PBL_LVL_1].pg_arr; + src_phys_ptr = hwq->pbl[PBL_LVL_2].pg_map_arr; + for (i = 0; i < hwq->pbl[PBL_LVL_2].pg_count; i++) { + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | PTU_PTE_VALID; + } + if (hwq_type == HWQ_TYPE_QUEUE) { + /* Find the last pg of the size */ + i = hwq->pbl[PBL_LVL_2].pg_count; + dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |= + PTU_PTE_LAST; + if (i > 1) + dst_virt_ptr[PTR_PG(i - 2)] + [PTR_IDX(i - 2)] |= + PTU_PTE_NEXT_TO_LAST; + } + hwq->level = PBL_LVL_2; + } else { + u32 flag = hwq_type == HWQ_TYPE_L2_CMPL ? 
0 : + PTU_PTE_VALID; + + /* 1 level of indirection */ + rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], sghead, + pages, pg_size); + if (rc) + goto fail; + /* Fill in lvl0 PBL */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr; + src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr; + for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++) { + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | flag; + } + if (hwq_type == HWQ_TYPE_QUEUE) { + /* Find the last pg of the size */ + i = hwq->pbl[PBL_LVL_1].pg_count; + dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |= + PTU_PTE_LAST; + if (i > 1) + dst_virt_ptr[PTR_PG(i - 2)] + [PTR_IDX(i - 2)] |= + PTU_PTE_NEXT_TO_LAST; + } + hwq->level = PBL_LVL_1; + } + } + hwq->pdev = pdev; + spin_lock_init(&hwq->lock); + hwq->prod = 0; + hwq->cons = 0; + *elements = hwq->max_elements = slots; + hwq->element_size = size; + + /* For direct access to the elements */ + hwq->pbl_ptr = hwq->pbl[hwq->level].pg_arr; + hwq->pbl_dma_ptr = hwq->pbl[hwq->level].pg_map_arr; + + return 0; + +fail: + bnxt_qplib_free_hwq(pdev, hwq); + return -ENOMEM; +} + +/* Context Tables */ +void bnxt_qplib_free_ctx(struct pci_dev *pdev, + struct bnxt_qplib_ctx *ctx) +{ + int i; + + bnxt_qplib_free_hwq(pdev, &ctx->qpc_tbl); + bnxt_qplib_free_hwq(pdev, &ctx->mrw_tbl); + bnxt_qplib_free_hwq(pdev, &ctx->srqc_tbl); + bnxt_qplib_free_hwq(pdev, &ctx->cq_tbl); + bnxt_qplib_free_hwq(pdev, &ctx->tim_tbl); + for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) + bnxt_qplib_free_hwq(pdev, &ctx->tqm_tbl[i]); + bnxt_qplib_free_hwq(pdev, &ctx->tqm_pde); + bnxt_qplib_free_stats_ctx(pdev, &ctx->stats); +} + +/* + * Routine: bnxt_qplib_alloc_ctx + * Description: + * Context tables are memories which are used by the chip fw. + * The 6 tables defined are: + * QPC ctx - holds QP states + * MRW ctx - holds memory region and window + * SRQ ctx - holds shared RQ states + * CQ ctx - holds completion queue states + * TQM ctx - holds Tx Queue Manager context + * TIM ctx - holds timer context + * Depending on the size of the tbl requested, either a 1 Page Buffer List + * or a 1-to-2-stage indirection Page Directory List + 1 PBL is used + * instead. 
+ * Table might be employed as follows: + * For 0 < ctx size <= 1 PAGE, 0 level of ind is used + * For 1 PAGE < ctx size <= 512 entries size, 1 level of ind is used + * For 512 < ctx size <= MAX, 2 levels of ind is used + * Returns: + * 0 if success, else -ERRORS + */ +int bnxt_qplib_alloc_ctx(struct pci_dev *pdev, + struct bnxt_qplib_ctx *ctx, + bool virt_fn) +{ + int i, j, k, rc = 0; + int fnz_idx = -1; + __le64 **pbl_ptr; + + if (virt_fn) + goto stats_alloc; + + /* QPC Tables */ + ctx->qpc_tbl.max_elements = ctx->qpc_count; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->qpc_tbl, NULL, 0, + &ctx->qpc_tbl.max_elements, + BNXT_QPLIB_MAX_QP_CTX_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + + /* MRW Tables */ + ctx->mrw_tbl.max_elements = ctx->mrw_count; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->mrw_tbl, NULL, 0, + &ctx->mrw_tbl.max_elements, + BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + + /* SRQ Tables */ + ctx->srqc_tbl.max_elements = ctx->srqc_count; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->srqc_tbl, NULL, 0, + &ctx->srqc_tbl.max_elements, + BNXT_QPLIB_MAX_SRQ_CTX_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + + /* CQ Tables */ + ctx->cq_tbl.max_elements = ctx->cq_count; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->cq_tbl, NULL, 0, + &ctx->cq_tbl.max_elements, + BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + + /* TQM Buffer */ + ctx->tqm_pde.max_elements = 512; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_pde, NULL, 0, + &ctx->tqm_pde.max_elements, sizeof(u64), + 0, PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + + for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) { + if (!ctx->tqm_count[i]) + continue; + ctx->tqm_tbl[i].max_elements = ctx->qpc_count * + ctx->tqm_count[i]; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_tbl[i], NULL, 0, + &ctx->tqm_tbl[i].max_elements, 1, + 0, PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + } + pbl_ptr = (__le64 **)ctx->tqm_pde.pbl_ptr; + for (i = 0, j = 0; i < MAX_TQM_ALLOC_REQ; + i++, j += MAX_TQM_ALLOC_BLK_SIZE) { + if (!ctx->tqm_tbl[i].max_elements) + continue; + if (fnz_idx == -1) + fnz_idx = i; + switch (ctx->tqm_tbl[i].level) { + case PBL_LVL_2: + for (k = 0; k < ctx->tqm_tbl[i].pbl[PBL_LVL_1].pg_count; + k++) + pbl_ptr[PTR_PG(j + k)][PTR_IDX(j + k)] = + cpu_to_le64( + ctx->tqm_tbl[i].pbl[PBL_LVL_1].pg_map_arr[k] + | PTU_PTE_VALID); + break; + case PBL_LVL_1: + case PBL_LVL_0: + default: + pbl_ptr[PTR_PG(j)][PTR_IDX(j)] = cpu_to_le64( + ctx->tqm_tbl[i].pbl[PBL_LVL_0].pg_map_arr[0] | + PTU_PTE_VALID); + break; + } + } + if (fnz_idx == -1) + fnz_idx = 0; + ctx->tqm_pde_level = ctx->tqm_tbl[fnz_idx].level == PBL_LVL_2 ? 
+ PBL_LVL_2 : ctx->tqm_tbl[fnz_idx].level + 1; + + /* TIM Buffer */ + ctx->tim_tbl.max_elements = ctx->qpc_count * 16; + rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tim_tbl, NULL, 0, + &ctx->tim_tbl.max_elements, 1, + 0, PAGE_SIZE, HWQ_TYPE_CTX); + if (rc) + goto fail; + +stats_alloc: + /* Stats */ + rc = bnxt_qplib_alloc_stats_ctx(pdev, &ctx->stats); + if (rc) + goto fail; + + return 0; + +fail: + bnxt_qplib_free_ctx(pdev, ctx); + return rc; +} + +/* GUID */ +void bnxt_qplib_get_guid(u8 *dev_addr, u8 *guid) +{ + u8 mac[ETH_ALEN]; + + /* MAC-48 to EUI-64 mapping */ + memcpy(mac, dev_addr, ETH_ALEN); + guid[0] = mac[0] ^ 2; + guid[1] = mac[1]; + guid[2] = mac[2]; + guid[3] = 0xff; + guid[4] = 0xfe; + guid[5] = mac[3]; + guid[6] = mac[4]; + guid[7] = mac[5]; +} + +static void bnxt_qplib_free_sgid_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_sgid_tbl *sgid_tbl) +{ + kfree(sgid_tbl->tbl); + kfree(sgid_tbl->hw_id); + kfree(sgid_tbl->ctx); + kfree(sgid_tbl->vlan); + sgid_tbl->tbl = NULL; + sgid_tbl->hw_id = NULL; + sgid_tbl->ctx = NULL; + sgid_tbl->vlan = NULL; + sgid_tbl->max = 0; + sgid_tbl->active = 0; +} + +static int bnxt_qplib_alloc_sgid_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_sgid_tbl *sgid_tbl, + u16 max) +{ + sgid_tbl->tbl = kcalloc(max, sizeof(struct bnxt_qplib_gid), GFP_KERNEL); + if (!sgid_tbl->tbl) + return -ENOMEM; + + sgid_tbl->hw_id = kcalloc(max, sizeof(u16), GFP_KERNEL); + if (!sgid_tbl->hw_id) + goto out_free1; + + sgid_tbl->ctx = kcalloc(max, sizeof(void *), GFP_KERNEL); + if (!sgid_tbl->ctx) + goto out_free2; + + sgid_tbl->vlan = kcalloc(max, sizeof(u8), GFP_KERNEL); + if (!sgid_tbl->vlan) + goto out_free3; + + sgid_tbl->max = max; + return 0; +out_free3: + kfree(sgid_tbl->ctx); + sgid_tbl->ctx = NULL; +out_free2: + kfree(sgid_tbl->hw_id); + sgid_tbl->hw_id = NULL; +out_free1: + kfree(sgid_tbl->tbl); + sgid_tbl->tbl = NULL; + return -ENOMEM; +}; + +static void bnxt_qplib_cleanup_sgid_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_sgid_tbl *sgid_tbl) +{ + int i; + + for (i = 0; i < sgid_tbl->max; i++) { + if (memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero, + sizeof(bnxt_qplib_gid_zero))) + bnxt_qplib_del_sgid(sgid_tbl, &sgid_tbl->tbl[i], true); + } + memset(sgid_tbl->tbl, 0, sizeof(struct bnxt_qplib_gid) * sgid_tbl->max); + memset(sgid_tbl->hw_id, -1, sizeof(u16) * sgid_tbl->max); + memset(sgid_tbl->vlan, 0, sizeof(u8) * sgid_tbl->max); + sgid_tbl->active = 0; +} + +static void bnxt_qplib_init_sgid_tbl(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct net_device *netdev) +{ + memset(sgid_tbl->tbl, 0, sizeof(struct bnxt_qplib_gid) * sgid_tbl->max); + memset(sgid_tbl->hw_id, -1, sizeof(u16) * sgid_tbl->max); +} + +static void bnxt_qplib_free_pkey_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl) +{ + if (!pkey_tbl->tbl) + dev_dbg(&res->pdev->dev, "QPLIB: PKEY tbl not present"); + else + kfree(pkey_tbl->tbl); + + pkey_tbl->tbl = NULL; + pkey_tbl->max = 0; + pkey_tbl->active = 0; +} + +static int bnxt_qplib_alloc_pkey_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, + u16 max) +{ + pkey_tbl->tbl = kcalloc(max, sizeof(u16), GFP_KERNEL); + if (!pkey_tbl->tbl) + return -ENOMEM; + + pkey_tbl->max = max; + return 0; +}; + +/* PDs */ +int bnxt_qplib_alloc_pd(struct bnxt_qplib_pd_tbl *pdt, struct bnxt_qplib_pd *pd) +{ + u32 bit_num; + + bit_num = find_first_bit(pdt->tbl, pdt->max); + if (bit_num == pdt->max) + return -ENOMEM; + + /* Found unused PD */ + clear_bit(bit_num, pdt->tbl); + pd->id = bit_num; + return 0; +} + 
+int bnxt_qplib_dealloc_pd(struct bnxt_qplib_res *res, + struct bnxt_qplib_pd_tbl *pdt, + struct bnxt_qplib_pd *pd) +{ + if (test_and_set_bit(pd->id, pdt->tbl)) { + dev_warn(&res->pdev->dev, "Freeing an unused PD? pdn = %d", + pd->id); + return -EINVAL; + } + pd->id = 0; + return 0; +} + +static void bnxt_qplib_free_pd_tbl(struct bnxt_qplib_pd_tbl *pdt) +{ + kfree(pdt->tbl); + pdt->tbl = NULL; + pdt->max = 0; +} + +static int bnxt_qplib_alloc_pd_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_pd_tbl *pdt, + u32 max) +{ + u32 bytes; + + bytes = max >> 3; + if (!bytes) + bytes = 1; + pdt->tbl = kmalloc(bytes, GFP_KERNEL); + if (!pdt->tbl) + return -ENOMEM; + + pdt->max = max; + memset((u8 *)pdt->tbl, 0xFF, bytes); + + return 0; +} + +/* DPIs */ +int bnxt_qplib_alloc_dpi(struct bnxt_qplib_dpi_tbl *dpit, + struct bnxt_qplib_dpi *dpi, + void *app) +{ + u32 bit_num; + + bit_num = find_first_bit(dpit->tbl, dpit->max); + if (bit_num == dpit->max) + return -ENOMEM; + + /* Found unused DPI */ + clear_bit(bit_num, dpit->tbl); + dpit->app_tbl[bit_num] = app; + + dpi->dpi = bit_num; + dpi->dbr = dpit->dbr_bar_reg_iomem + (bit_num * PAGE_SIZE); + dpi->umdbr = dpit->unmapped_dbr + (bit_num * PAGE_SIZE); + + return 0; +} + +int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi_tbl *dpit, + struct bnxt_qplib_dpi *dpi) +{ + if (dpi->dpi >= dpit->max) { + dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d", dpi->dpi); + return -EINVAL; + } + if (test_and_set_bit(dpi->dpi, dpit->tbl)) { + dev_warn(&res->pdev->dev, "Freeing an unused DPI? dpi = %d", + dpi->dpi); + return -EINVAL; + } + if (dpit->app_tbl) + dpit->app_tbl[dpi->dpi] = NULL; + memset(dpi, 0, sizeof(*dpi)); + + return 0; +} + +static void bnxt_qplib_free_dpi_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi_tbl *dpit) +{ + kfree(dpit->tbl); + kfree(dpit->app_tbl); + if (dpit->dbr_bar_reg_iomem) + pci_iounmap(res->pdev, dpit->dbr_bar_reg_iomem); + memset(dpit, 0, sizeof(*dpit)); +} + +static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi_tbl *dpit, + u32 dbr_offset) +{ + u32 dbr_bar_reg = RCFW_DBR_PCI_BAR_REGION; + resource_size_t bar_reg_base; + u32 dbr_len, bytes; + + if (dpit->dbr_bar_reg_iomem) { + dev_err(&res->pdev->dev, + "QPLIB: DBR BAR region %d already mapped", dbr_bar_reg); + return -EALREADY; + } + + bar_reg_base = pci_resource_start(res->pdev, dbr_bar_reg); + if (!bar_reg_base) { + dev_err(&res->pdev->dev, + "QPLIB: BAR region %d resc start failed", dbr_bar_reg); + return -ENOMEM; + } + + dbr_len = pci_resource_len(res->pdev, dbr_bar_reg) - dbr_offset; + if (!dbr_len || ((dbr_len & (PAGE_SIZE - 1)) != 0)) { + dev_err(&res->pdev->dev, "QPLIB: Invalid DBR length %d", + dbr_len); + return -ENOMEM; + } + + dpit->dbr_bar_reg_iomem = ioremap_nocache(bar_reg_base + dbr_offset, + dbr_len); + if (!dpit->dbr_bar_reg_iomem) { + dev_err(&res->pdev->dev, + "QPLIB: FP: DBR BAR region %d mapping failed", + dbr_bar_reg); + return -ENOMEM; + } + + dpit->unmapped_dbr = bar_reg_base + dbr_offset; + dpit->max = dbr_len / PAGE_SIZE; + + dpit->app_tbl = kcalloc(dpit->max, sizeof(void *), GFP_KERNEL); + if (!dpit->app_tbl) + goto unmap_io; + + bytes = dpit->max >> 3; + if (!bytes) + bytes = 1; + + dpit->tbl = kmalloc(bytes, GFP_KERNEL); + if (!dpit->tbl) { + kfree(dpit->app_tbl); + dpit->app_tbl = NULL; + goto unmap_io; + } + + memset((u8 *)dpit->tbl, 0xFF, bytes); + + return 0; + +unmap_io: + pci_iounmap(res->pdev, dpit->dbr_bar_reg_iomem); + return -ENOMEM; +} + +/* PKEYs */ +static void 
bnxt_qplib_cleanup_pkey_tbl(struct bnxt_qplib_pkey_tbl *pkey_tbl) +{ + memset(pkey_tbl->tbl, 0, sizeof(u16) * pkey_tbl->max); + pkey_tbl->active = 0; +} + +static void bnxt_qplib_init_pkey_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl) +{ + u16 pkey = 0xFFFF; + + memset(pkey_tbl->tbl, 0, sizeof(u16) * pkey_tbl->max); + + /* pkey default = 0xFFFF */ + bnxt_qplib_add_pkey(res, pkey_tbl, &pkey, false); +} + +/* Stats */ +static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_stats *stats) +{ + if (stats->dma) { + dma_free_coherent(&pdev->dev, stats->size, + stats->dma, stats->dma_map); + } + memset(stats, 0, sizeof(*stats)); + stats->fw_id = -1; +} + +static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->fw_id = -1; + stats->size = sizeof(struct ctx_hw_stats); + stats->dma = dma_alloc_coherent(&pdev->dev, stats->size, + &stats->dma_map, GFP_KERNEL); + if (!stats->dma) { + dev_err(&pdev->dev, "QPLIB: Stats DMA allocation failed"); + return -ENOMEM; + } + return 0; +} + +void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res) +{ + bnxt_qplib_cleanup_pkey_tbl(&res->pkey_tbl); + bnxt_qplib_cleanup_sgid_tbl(res, &res->sgid_tbl); +} + +int bnxt_qplib_init_res(struct bnxt_qplib_res *res) +{ + bnxt_qplib_init_sgid_tbl(&res->sgid_tbl, res->netdev); + bnxt_qplib_init_pkey_tbl(res, &res->pkey_tbl); + + return 0; +} + +void bnxt_qplib_free_res(struct bnxt_qplib_res *res) +{ + bnxt_qplib_free_pkey_tbl(res, &res->pkey_tbl); + bnxt_qplib_free_sgid_tbl(res, &res->sgid_tbl); + bnxt_qplib_free_pd_tbl(&res->pd_tbl); + bnxt_qplib_free_dpi_tbl(res, &res->dpi_tbl); + + res->netdev = NULL; + res->pdev = NULL; +} + +int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, + struct net_device *netdev, + struct bnxt_qplib_dev_attr *dev_attr) +{ + int rc = 0; + + res->pdev = pdev; + res->netdev = netdev; + + rc = bnxt_qplib_alloc_sgid_tbl(res, &res->sgid_tbl, dev_attr->max_sgid); + if (rc) + goto fail; + + rc = bnxt_qplib_alloc_pkey_tbl(res, &res->pkey_tbl, dev_attr->max_pkey); + if (rc) + goto fail; + + rc = bnxt_qplib_alloc_pd_tbl(res, &res->pd_tbl, dev_attr->max_pd); + if (rc) + goto fail; + + rc = bnxt_qplib_alloc_dpi_tbl(res, &res->dpi_tbl, dev_attr->l2_db_size); + if (rc) + goto fail; + + return 0; +fail: + bnxt_qplib_free_res(res); + return rc; +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h new file mode 100644 index 0000000..2e5c052 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -0,0 +1,229 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: QPLib resource manager (header) + */ + +#ifndef __BNXT_QPLIB_RES_H__ +#define __BNXT_QPLIB_RES_H__ + +extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; + +#define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *)) +#define PTR_MAX_IDX_PER_PG (PTR_CNT_PER_PG - 1) +#define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG) +#define PTR_IDX(x) ((x) & PTR_MAX_IDX_PER_PG) + +#define HWQ_CMP(idx, hwq) ((idx) & ((hwq)->max_elements - 1)) + +#define HWQ_FREE_SLOTS(hwq) (hwq->max_elements - \ + ((HWQ_CMP(hwq->prod, hwq)\ + - HWQ_CMP(hwq->cons, hwq))\ + & (hwq->max_elements - 1))) +enum bnxt_qplib_hwq_type { + HWQ_TYPE_CTX, + HWQ_TYPE_QUEUE, + HWQ_TYPE_L2_CMPL +}; + +#define MAX_PBL_LVL_0_PGS 1 +#define MAX_PBL_LVL_1_PGS 512 +#define MAX_PBL_LVL_1_PGS_SHIFT 9 +#define MAX_PBL_LVL_1_PGS_FOR_LVL_2 256 +#define MAX_PBL_LVL_2_PGS (256 * 512) + +enum bnxt_qplib_pbl_lvl { + PBL_LVL_0, + PBL_LVL_1, + PBL_LVL_2, + PBL_LVL_MAX +}; + +#define ROCE_PG_SIZE_4K (4 * 1024) +#define ROCE_PG_SIZE_8K (8 * 1024) +#define ROCE_PG_SIZE_64K (64 * 1024) +#define ROCE_PG_SIZE_2M (2 * 1024 * 1024) +#define ROCE_PG_SIZE_8M (8 * 1024 * 1024) +#define ROCE_PG_SIZE_1G (1024 * 1024 * 1024) + +struct bnxt_qplib_pbl { + u32 pg_count; + u32 pg_size; + void **pg_arr; + dma_addr_t *pg_map_arr; +}; + +struct bnxt_qplib_hwq { + struct pci_dev *pdev; + /* lock to protect qplib_hwq */ + spinlock_t lock; + struct bnxt_qplib_pbl pbl[PBL_LVL_MAX]; + enum bnxt_qplib_pbl_lvl level; /* 0, 1, or 2 */ + /* ptr for easy access to the PBL entries */ + void **pbl_ptr; + /* ptr for easy access to the dma_addr */ + dma_addr_t *pbl_dma_ptr; + u32 max_elements; + u16 element_size; /* Size of each entry */ + + u32 prod; /* raw */ + u32 cons; /* raw */ + u8 cp_bit; + u8 is_user; +}; + +/* Tables */ +struct bnxt_qplib_pd_tbl { + unsigned long *tbl; + u32 max; +}; + +struct bnxt_qplib_sgid_tbl { + struct bnxt_qplib_gid *tbl; + u16 *hw_id; + u16 max; + u16 active; + void *ctx; + u8 *vlan; +}; + +struct bnxt_qplib_pkey_tbl { + u16 *tbl; + u16 max; + u16 active; +}; + +struct bnxt_qplib_dpi { + u32 dpi; + void __iomem *dbr; + u64 umdbr; +}; + +struct bnxt_qplib_dpi_tbl { + void **app_tbl; + unsigned long *tbl; + u16 max; + void __iomem *dbr_bar_reg_iomem; + u64 unmapped_dbr; +}; + +struct bnxt_qplib_stats { + dma_addr_t dma_map; + void *dma; + u32 size; + u32 fw_id; +}; + +struct bnxt_qplib_vf_res { + u32 max_qp_per_vf; + u32 max_mrw_per_vf; + u32 max_srq_per_vf; + u32 max_cq_per_vf; + u32 max_gid_per_vf; +}; + +#define BNXT_QPLIB_MAX_QP_CTX_ENTRY_SIZE 
448 +#define BNXT_QPLIB_MAX_SRQ_CTX_ENTRY_SIZE 64 +#define BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE 64 +#define BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE 128 + +struct bnxt_qplib_ctx { + u32 qpc_count; + struct bnxt_qplib_hwq qpc_tbl; + u32 mrw_count; + struct bnxt_qplib_hwq mrw_tbl; + u32 srqc_count; + struct bnxt_qplib_hwq srqc_tbl; + u32 cq_count; + struct bnxt_qplib_hwq cq_tbl; + struct bnxt_qplib_hwq tim_tbl; +#define MAX_TQM_ALLOC_REQ 48 +#define MAX_TQM_ALLOC_BLK_SIZE 8 + u8 tqm_count[MAX_TQM_ALLOC_REQ]; + struct bnxt_qplib_hwq tqm_pde; + u32 tqm_pde_level; + struct bnxt_qplib_hwq tqm_tbl[MAX_TQM_ALLOC_REQ]; + struct bnxt_qplib_stats stats; + struct bnxt_qplib_vf_res vf_res; +}; + +struct bnxt_qplib_res { + struct pci_dev *pdev; + struct net_device *netdev; + + struct bnxt_qplib_rcfw *rcfw; + + struct bnxt_qplib_pd_tbl pd_tbl; + struct bnxt_qplib_sgid_tbl sgid_tbl; + struct bnxt_qplib_pkey_tbl pkey_tbl; + struct bnxt_qplib_dpi_tbl dpi_tbl; + bool prio; +}; + +#define to_bnxt_qplib(ptr, type, member) \ + container_of(ptr, type, member) + +struct bnxt_qplib_pd; +struct bnxt_qplib_dev_attr; + +void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq); +int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq, + struct scatterlist *sl, int nmap, u32 *elements, + u32 elements_per_page, u32 aux, u32 pg_size, + enum bnxt_qplib_hwq_type hwq_type); +void bnxt_qplib_get_guid(u8 *dev_addr, u8 *guid); +int bnxt_qplib_alloc_pd(struct bnxt_qplib_pd_tbl *pd_tbl, + struct bnxt_qplib_pd *pd); +int bnxt_qplib_dealloc_pd(struct bnxt_qplib_res *res, + struct bnxt_qplib_pd_tbl *pd_tbl, + struct bnxt_qplib_pd *pd); +int bnxt_qplib_alloc_dpi(struct bnxt_qplib_dpi_tbl *dpit, + struct bnxt_qplib_dpi *dpi, + void *app); +int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi_tbl *dpi_tbl, + struct bnxt_qplib_dpi *dpi); +void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res); +int bnxt_qplib_init_res(struct bnxt_qplib_res *res); +void bnxt_qplib_free_res(struct bnxt_qplib_res *res); +int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, + struct net_device *netdev, + struct bnxt_qplib_dev_attr *dev_attr); +void bnxt_qplib_free_ctx(struct pci_dev *pdev, + struct bnxt_qplib_ctx *ctx); +int bnxt_qplib_alloc_ctx(struct pci_dev *pdev, + struct bnxt_qplib_ctx *ctx, + bool virt_fn); +#endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c new file mode 100644 index 0000000..2f3f32e --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -0,0 +1,851 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Slow Path Operators + */ + +#include +#include +#include +#include + +#include "roce_hsi.h" + +#include "qplib_res.h" +#include "qplib_rcfw.h" +#include "qplib_sp.h" + +const struct bnxt_qplib_gid bnxt_qplib_gid_zero = {{ 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 } }; + +/* Device */ + +static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw, + char *fw_ver) +{ + struct cmdq_query_version req; + struct creq_query_version_resp resp; + u16 cmd_flags = 0; + int rc = 0; + + RCFW_CMD_PREP(req, QUERY_VERSION, cmd_flags); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + return; + fw_ver[0] = resp.fw_maj; + fw_ver[1] = resp.fw_minor; + fw_ver[2] = resp.fw_bld; + fw_ver[3] = resp.fw_rsvd; +} + +int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_dev_attr *attr, bool vf) +{ + struct cmdq_query_func req; + struct creq_query_func_resp resp; + struct bnxt_qplib_rcfw_sbuf *sbuf; + struct creq_query_func_resp_sb *sb; + u16 cmd_flags = 0; + u32 temp; + u8 *tqm_alloc; + int i, rc = 0; + + RCFW_CMD_PREP(req, QUERY_FUNC, cmd_flags); + + sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); + if (!sbuf) { + dev_err(&rcfw->pdev->dev, + "QPLIB: SP: QUERY_FUNC alloc side buffer failed"); + return -ENOMEM; + } + + sb = sbuf->sb; + req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS; + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + (void *)sbuf, 0); + if (rc) + goto bail; + + /* Extract the context from the side buffer */ + attr->max_qp = le32_to_cpu(sb->max_qp); + /* max_qp value reported by FW for PF doesn't include the QP1 for PF */ + if (!vf) + attr->max_qp += 1; + attr->max_qp_rd_atom = + sb->max_qp_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? + BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom; + attr->max_qp_init_rd_atom = + sb->max_qp_init_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? + BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; + attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr); + /* + * 128 WQEs needs to be reserved for the HW (8916). 
Prevent + * reporting the max number + */ + attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS; + attr->max_qp_sges = sb->max_sge; + attr->max_cq = le32_to_cpu(sb->max_cq); + attr->max_cq_wqes = le32_to_cpu(sb->max_cqe); + attr->max_cq_sges = attr->max_qp_sges; + attr->max_mr = le32_to_cpu(sb->max_mr); + attr->max_mw = le32_to_cpu(sb->max_mw); + + attr->max_mr_size = le64_to_cpu(sb->max_mr_size); + attr->max_pd = 64 * 1024; + attr->max_raw_ethy_qp = le32_to_cpu(sb->max_raw_eth_qp); + attr->max_ah = le32_to_cpu(sb->max_ah); + + attr->max_fmr = le32_to_cpu(sb->max_fmr); + attr->max_map_per_fmr = sb->max_map_per_fmr; + + attr->max_srq = le16_to_cpu(sb->max_srq); + attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1; + attr->max_srq_sges = sb->max_srq_sge; + /* Bono only reports 1 PKEY for now, but it can support > 1 */ + attr->max_pkey = le32_to_cpu(sb->max_pkeys); + + attr->max_inline_data = le32_to_cpu(sb->max_inline_data); + attr->l2_db_size = (sb->l2_db_space_size + 1) * + (0x01 << RCFW_DBR_BASE_PAGE_SHIFT); + attr->max_sgid = le32_to_cpu(sb->max_gid); + + bnxt_qplib_query_version(rcfw, attr->fw_ver); + + for (i = 0; i < MAX_TQM_ALLOC_REQ / 4; i++) { + temp = le32_to_cpu(sb->tqm_alloc_reqs[i]); + tqm_alloc = (u8 *)&temp; + attr->tqm_alloc_reqs[i * 4] = *tqm_alloc; + attr->tqm_alloc_reqs[i * 4 + 1] = *(++tqm_alloc); + attr->tqm_alloc_reqs[i * 4 + 2] = *(++tqm_alloc); + attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc); + } + + attr->is_atomic = false; +bail: + bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); + return rc; +} + +int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, + struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_ctx *ctx) +{ + struct cmdq_set_func_resources req; + struct creq_set_func_resources_resp resp; + u16 cmd_flags = 0; + int rc = 0; + + RCFW_CMD_PREP(req, SET_FUNC_RESOURCES, cmd_flags); + + req.number_of_qp = cpu_to_le32(ctx->qpc_count); + req.number_of_mrw = cpu_to_le32(ctx->mrw_count); + req.number_of_srq = cpu_to_le32(ctx->srqc_count); + req.number_of_cq = cpu_to_le32(ctx->cq_count); + + req.max_qp_per_vf = cpu_to_le32(ctx->vf_res.max_qp_per_vf); + req.max_mrw_per_vf = cpu_to_le32(ctx->vf_res.max_mrw_per_vf); + req.max_srq_per_vf = cpu_to_le32(ctx->vf_res.max_srq_per_vf); + req.max_cq_per_vf = cpu_to_le32(ctx->vf_res.max_cq_per_vf); + req.max_gid_per_vf = cpu_to_le32(ctx->vf_res.max_gid_per_vf); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, + NULL, 0); + if (rc) { + dev_err(&res->pdev->dev, + "QPLIB: Failed to set function resources"); + } + return rc; +} + +/* SGID */ +int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res, + struct bnxt_qplib_sgid_tbl *sgid_tbl, int index, + struct bnxt_qplib_gid *gid) +{ + if (index > sgid_tbl->max) { + dev_err(&res->pdev->dev, + "QPLIB: Index %d exceeded SGID table max (%d)", + index, sgid_tbl->max); + return -EINVAL; + } + memcpy(gid, &sgid_tbl->tbl[index], sizeof(*gid)); + return 0; +} + +int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, bool update) +{ + struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl, + struct bnxt_qplib_res, + sgid_tbl); + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + int index; + + if (!sgid_tbl) { + dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated"); + return -EINVAL; + } + /* Do we need a sgid_lock here? 
*/ + if (!sgid_tbl->active) { + dev_err(&res->pdev->dev, + "QPLIB: SGID table has no active entries"); + return -ENOMEM; + } + for (index = 0; index < sgid_tbl->max; index++) { + if (!memcmp(&sgid_tbl->tbl[index], gid, sizeof(*gid))) + break; + } + if (index == sgid_tbl->max) { + dev_warn(&res->pdev->dev, "GID not found in the SGID table"); + return 0; + } + /* Remove GID from the SGID table */ + if (update) { + struct cmdq_delete_gid req; + struct creq_delete_gid_resp resp; + u16 cmd_flags = 0; + int rc; + + RCFW_CMD_PREP(req, DELETE_GID, cmd_flags); + if (sgid_tbl->hw_id[index] == 0xFFFF) { + dev_err(&res->pdev->dev, + "QPLIB: GID entry contains an invalid HW id"); + return -EINVAL; + } + req.gid_index = cpu_to_le16(sgid_tbl->hw_id[index]); + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + return rc; + } + memcpy(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero, + sizeof(bnxt_qplib_gid_zero)); + sgid_tbl->vlan[index] = 0; + sgid_tbl->active--; + dev_dbg(&res->pdev->dev, + "QPLIB: SGID deleted hw_id[0x%x] = 0x%x active = 0x%x", + index, sgid_tbl->hw_id[index], sgid_tbl->active); + sgid_tbl->hw_id[index] = (u16)-1; + + /* unlock */ + return 0; +} + +int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, u8 *smac, u16 vlan_id, + bool update, u32 *index) +{ + struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl, + struct bnxt_qplib_res, + sgid_tbl); + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + int i, free_idx; + + if (!sgid_tbl) { + dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated"); + return -EINVAL; + } + /* Do we need a sgid_lock here? */ + if (sgid_tbl->active == sgid_tbl->max) { + dev_err(&res->pdev->dev, "QPLIB: SGID table is full"); + return -ENOMEM; + } + free_idx = sgid_tbl->max; + for (i = 0; i < sgid_tbl->max; i++) { + if (!memcmp(&sgid_tbl->tbl[i], gid, sizeof(*gid))) { + dev_dbg(&res->pdev->dev, + "QPLIB: SGID entry already exist in entry %d!", + i); + *index = i; + return -EALREADY; + } else if (!memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero, + sizeof(bnxt_qplib_gid_zero)) && + free_idx == sgid_tbl->max) { + free_idx = i; + } + } + if (free_idx == sgid_tbl->max) { + dev_err(&res->pdev->dev, + "QPLIB: SGID table is FULL but count is not MAX??"); + return -ENOMEM; + } + if (update) { + struct cmdq_add_gid req; + struct creq_add_gid_resp resp; + u16 cmd_flags = 0; + int rc; + + RCFW_CMD_PREP(req, ADD_GID, cmd_flags); + + req.gid[0] = cpu_to_be32(((u32 *)gid->data)[3]); + req.gid[1] = cpu_to_be32(((u32 *)gid->data)[2]); + req.gid[2] = cpu_to_be32(((u32 *)gid->data)[1]); + req.gid[3] = cpu_to_be32(((u32 *)gid->data)[0]); + /* + * driver should ensure that all RoCE traffic is always VLAN + * tagged if RoCE traffic is running on non-zero VLAN ID or + * RoCE traffic is running on non-zero Priority. 
+ */ + if ((vlan_id != 0xFFFF) || res->prio) { + if (vlan_id != 0xFFFF) + req.vlan = cpu_to_le16 + (vlan_id & CMDQ_ADD_GID_VLAN_VLAN_ID_MASK); + req.vlan |= cpu_to_le16 + (CMDQ_ADD_GID_VLAN_TPID_TPID_8100 | + CMDQ_ADD_GID_VLAN_VLAN_EN); + } + + /* MAC in network format */ + req.src_mac[0] = cpu_to_be16(((u16 *)smac)[0]); + req.src_mac[1] = cpu_to_be16(((u16 *)smac)[1]); + req.src_mac[2] = cpu_to_be16(((u16 *)smac)[2]); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + return rc; + sgid_tbl->hw_id[free_idx] = le32_to_cpu(resp.xid); + } + /* Add GID to the sgid_tbl */ + memcpy(&sgid_tbl->tbl[free_idx], gid, sizeof(*gid)); + sgid_tbl->active++; + if (vlan_id != 0xFFFF) + sgid_tbl->vlan[free_idx] = 1; + + dev_dbg(&res->pdev->dev, + "QPLIB: SGID added hw_id[0x%x] = 0x%x active = 0x%x", + free_idx, sgid_tbl->hw_id[free_idx], sgid_tbl->active); + + *index = free_idx; + /* unlock */ + return 0; +} + +int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, u16 gid_idx, + u8 *smac) +{ + struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl, + struct bnxt_qplib_res, + sgid_tbl); + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct creq_modify_gid_resp resp; + struct cmdq_modify_gid req; + int rc; + u16 cmd_flags = 0; + + RCFW_CMD_PREP(req, MODIFY_GID, cmd_flags); + + req.gid[0] = cpu_to_be32(((u32 *)gid->data)[3]); + req.gid[1] = cpu_to_be32(((u32 *)gid->data)[2]); + req.gid[2] = cpu_to_be32(((u32 *)gid->data)[1]); + req.gid[3] = cpu_to_be32(((u32 *)gid->data)[0]); + if (res->prio) { + req.vlan |= cpu_to_le16 + (CMDQ_ADD_GID_VLAN_TPID_TPID_8100 | + CMDQ_ADD_GID_VLAN_VLAN_EN); + } + + /* MAC in network format */ + req.src_mac[0] = cpu_to_be16(((u16 *)smac)[0]); + req.src_mac[1] = cpu_to_be16(((u16 *)smac)[1]); + req.src_mac[2] = cpu_to_be16(((u16 *)smac)[2]); + + req.gid_index = cpu_to_le16(gid_idx); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + return rc; +} + +/* pkeys */ +int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index, + u16 *pkey) +{ + if (index == 0xFFFF) { + *pkey = 0xFFFF; + return 0; + } + if (index > pkey_tbl->max) { + dev_err(&res->pdev->dev, + "QPLIB: Index %d exceeded PKEY table max (%d)", + index, pkey_tbl->max); + return -EINVAL; + } + memcpy(pkey, &pkey_tbl->tbl[index], sizeof(*pkey)); + return 0; +} + +int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey, + bool update) +{ + int i, rc = 0; + + if (!pkey_tbl) { + dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated"); + return -EINVAL; + } + + /* Do we need a pkey_lock here? */ + if (!pkey_tbl->active) { + dev_err(&res->pdev->dev, + "QPLIB: PKEY table has no active entries"); + return -ENOMEM; + } + for (i = 0; i < pkey_tbl->max; i++) { + if (!memcmp(&pkey_tbl->tbl[i], pkey, sizeof(*pkey))) + break; + } + if (i == pkey_tbl->max) { + dev_err(&res->pdev->dev, + "QPLIB: PKEY 0x%04x not found in the pkey table", + *pkey); + return -ENOMEM; + } + memset(&pkey_tbl->tbl[i], 0, sizeof(*pkey)); + pkey_tbl->active--; + + /* unlock */ + return rc; +} + +int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey, + bool update) +{ + int i, free_idx, rc = 0; + + if (!pkey_tbl) { + dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated"); + return -EINVAL; + } + + /* Do we need a pkey_lock here? 
*/ + if (pkey_tbl->active == pkey_tbl->max) { + dev_err(&res->pdev->dev, "QPLIB: PKEY table is full"); + return -ENOMEM; + } + free_idx = pkey_tbl->max; + for (i = 0; i < pkey_tbl->max; i++) { + if (!memcmp(&pkey_tbl->tbl[i], pkey, sizeof(*pkey))) + return -EALREADY; + else if (!pkey_tbl->tbl[i] && free_idx == pkey_tbl->max) + free_idx = i; + } + if (free_idx == pkey_tbl->max) { + dev_err(&res->pdev->dev, + "QPLIB: PKEY table is FULL but count is not MAX??"); + return -ENOMEM; + } + /* Add PKEY to the pkey_tbl */ + memcpy(&pkey_tbl->tbl[free_idx], pkey, sizeof(*pkey)); + pkey_tbl->active++; + + /* unlock */ + return rc; +} + +/* AH */ +int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_create_ah req; + struct creq_create_ah_resp resp; + u16 cmd_flags = 0; + u32 temp32[4]; + u16 temp16[3]; + int rc; + + RCFW_CMD_PREP(req, CREATE_AH, cmd_flags); + + memcpy(temp32, ah->dgid.data, sizeof(struct bnxt_qplib_gid)); + req.dgid[0] = cpu_to_le32(temp32[0]); + req.dgid[1] = cpu_to_le32(temp32[1]); + req.dgid[2] = cpu_to_le32(temp32[2]); + req.dgid[3] = cpu_to_le32(temp32[3]); + + req.type = ah->nw_type; + req.hop_limit = ah->hop_limit; + req.sgid_index = cpu_to_le16(res->sgid_tbl.hw_id[ah->sgid_index]); + req.dest_vlan_id_flow_label = cpu_to_le32((ah->flow_label & + CMDQ_CREATE_AH_FLOW_LABEL_MASK) | + CMDQ_CREATE_AH_DEST_VLAN_ID_MASK); + req.pd_id = cpu_to_le32(ah->pd->id); + req.traffic_class = ah->traffic_class; + + /* MAC in network format */ + memcpy(temp16, ah->dmac, 6); + req.dest_mac[0] = cpu_to_le16(temp16[0]); + req.dest_mac[1] = cpu_to_le16(temp16[1]); + req.dest_mac[2] = cpu_to_le16(temp16[2]); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + NULL, 1); + if (rc) + return rc; + + ah->id = le32_to_cpu(resp.xid); + return 0; +} + +int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_destroy_ah req; + struct creq_destroy_ah_resp resp; + u16 cmd_flags = 0; + int rc; + + /* Clean up the AH table in the device */ + RCFW_CMD_PREP(req, DESTROY_AH, cmd_flags); + + req.ah_cid = cpu_to_le32(ah->id); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + NULL, 1); + if (rc) + return rc; + return 0; +} + +/* MRW */ +int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_deallocate_key req; + struct creq_deallocate_key_resp resp; + u16 cmd_flags = 0; + int rc; + + if (mrw->lkey == 0xFFFFFFFF) { + dev_info(&res->pdev->dev, + "QPLIB: SP: Free a reserved lkey MRW"); + return 0; + } + + RCFW_CMD_PREP(req, DEALLOCATE_KEY, cmd_flags); + + req.mrw_flags = mrw->type; + + if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1) || + (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A) || + (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B)) + req.key = cpu_to_le32(mrw->rkey); + else + req.key = cpu_to_le32(mrw->lkey); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + NULL, 0); + if (rc) + return rc; + + /* Free the qplib's MRW memory */ + if (mrw->hwq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &mrw->hwq); + + return 0; +} + +int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_allocate_mrw req; + struct creq_allocate_mrw_resp resp; + u16 cmd_flags = 0; + unsigned long tmp; + int rc; + + 
RCFW_CMD_PREP(req, ALLOCATE_MRW, cmd_flags); + + req.pd_id = cpu_to_le32(mrw->pd->id); + req.mrw_flags = mrw->type; + if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR && + mrw->flags & BNXT_QPLIB_FR_PMR) || + mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A || + mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B) + req.access = CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY; + tmp = (unsigned long)mrw; + req.mrw_handle = cpu_to_le64(tmp); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + return rc; + + if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1) || + (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A) || + (mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B)) + mrw->rkey = le32_to_cpu(resp.xid); + else + mrw->lkey = le32_to_cpu(resp.xid); + return 0; +} + +int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, + bool block) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_deregister_mr req; + struct creq_deregister_mr_resp resp; + u16 cmd_flags = 0; + int rc; + + RCFW_CMD_PREP(req, DEREGISTER_MR, cmd_flags); + + req.lkey = cpu_to_le32(mrw->lkey); + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, block); + if (rc) + return rc; + + /* Free the qplib's MR memory */ + if (mrw->hwq.max_elements) { + mrw->va = 0; + mrw->total_size = 0; + bnxt_qplib_free_hwq(res->pdev, &mrw->hwq); + } + + return 0; +} + +int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, + u64 *pbl_tbl, int num_pbls, bool block, u32 buf_pg_size) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_register_mr req; + struct creq_register_mr_resp resp; + u16 cmd_flags = 0, level; + int pg_ptrs, pages, i, rc; + dma_addr_t **pbl_ptr; + u32 pg_size; + + if (num_pbls) { + /* Allocate memory for the non-leaf pages to store buf ptrs. + * Non-leaf pages always uses system PAGE_SIZE + */ + pg_ptrs = roundup_pow_of_two(num_pbls); + pages = pg_ptrs >> MAX_PBL_LVL_1_PGS_SHIFT; + if (!pages) + pages++; + + if (pages > MAX_PBL_LVL_1_PGS) { + dev_err(&res->pdev->dev, "QPLIB: SP: Reg MR pages "); + dev_err(&res->pdev->dev, + "requested (0x%x) exceeded max (0x%x)", + pages, MAX_PBL_LVL_1_PGS); + return -ENOMEM; + } + /* Free the hwq if it already exist, must be a rereg */ + if (mr->hwq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &mr->hwq); + + mr->hwq.max_elements = pages; + /* Use system PAGE_SIZE */ + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL, 0, + &mr->hwq.max_elements, + PAGE_SIZE, 0, PAGE_SIZE, + HWQ_TYPE_CTX); + if (rc) { + dev_err(&res->pdev->dev, + "SP: Reg MR memory allocation failed"); + return -ENOMEM; + } + /* Write to the hwq */ + pbl_ptr = (dma_addr_t **)mr->hwq.pbl_ptr; + for (i = 0; i < num_pbls; i++) + pbl_ptr[PTR_PG(i)][PTR_IDX(i)] = + (pbl_tbl[i] & PAGE_MASK) | PTU_PTE_VALID; + } + + RCFW_CMD_PREP(req, REGISTER_MR, cmd_flags); + + /* Configure the request */ + if (mr->hwq.level == PBL_LVL_MAX) { + /* No PBL provided, just use system PAGE_SIZE */ + level = 0; + req.pbl = 0; + pg_size = PAGE_SIZE; + } else { + level = mr->hwq.level + 1; + req.pbl = cpu_to_le64(mr->hwq.pbl[PBL_LVL_0].pg_map_arr[0]); + } + pg_size = buf_pg_size ? 
buf_pg_size : PAGE_SIZE; + req.log2_pg_size_lvl = (level << CMDQ_REGISTER_MR_LVL_SFT) | + ((ilog2(pg_size) << + CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT) & + CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK); + req.log2_pbl_pg_size = cpu_to_le16(((ilog2(PAGE_SIZE) << + CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT) & + CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK)); + req.access = (mr->flags & 0xFFFF); + req.va = cpu_to_le64(mr->va); + req.key = cpu_to_le32(mr->lkey); + req.mr_size = cpu_to_le64(mr->total_size); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, block); + if (rc) + goto fail; + + return 0; + +fail: + if (mr->hwq.max_elements) + bnxt_qplib_free_hwq(res->pdev, &mr->hwq); + return rc; +} + +int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res, + struct bnxt_qplib_frpl *frpl, + int max_pg_ptrs) +{ + int pg_ptrs, pages, rc; + + /* Re-calculate the max to fit the HWQ allocation model */ + pg_ptrs = roundup_pow_of_two(max_pg_ptrs); + pages = pg_ptrs >> MAX_PBL_LVL_1_PGS_SHIFT; + if (!pages) + pages++; + + if (pages > MAX_PBL_LVL_1_PGS) + return -ENOMEM; + + frpl->hwq.max_elements = pages; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &frpl->hwq, NULL, 0, + &frpl->hwq.max_elements, PAGE_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_CTX); + if (!rc) + frpl->max_pg_ptrs = pg_ptrs; + + return rc; +} + +int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res, + struct bnxt_qplib_frpl *frpl) +{ + bnxt_qplib_free_hwq(res->pdev, &frpl->hwq); + return 0; +} + +int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_map_tc_to_cos req; + struct creq_map_tc_to_cos_resp resp; + u16 cmd_flags = 0; + + RCFW_CMD_PREP(req, MAP_TC_TO_COS, cmd_flags); + req.cos0 = cpu_to_le16(cids[0]); + req.cos1 = cpu_to_le16(cids[1]); + + bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, NULL, + 0); + return 0; +} + +int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_roce_stats *stats) +{ + struct cmdq_query_roce_stats req; + struct creq_query_roce_stats_resp resp; + struct bnxt_qplib_rcfw_sbuf *sbuf; + struct creq_query_roce_stats_resp_sb *sb; + u16 cmd_flags = 0; + int rc = 0; + + RCFW_CMD_PREP(req, QUERY_ROCE_STATS, cmd_flags); + + sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); + if (!sbuf) { + dev_err(&rcfw->pdev->dev, + "QPLIB: SP: QUERY_ROCE_STATS alloc side buffer failed"); + return -ENOMEM; + } + + sb = sbuf->sb; + req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS; + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + (void *)sbuf, 0); + if (rc) + goto bail; + /* Extract the context from the side buffer */ + stats->to_retransmits = le64_to_cpu(sb->to_retransmits); + stats->seq_err_naks_rcvd = le64_to_cpu(sb->seq_err_naks_rcvd); + stats->max_retry_exceeded = le64_to_cpu(sb->max_retry_exceeded); + stats->rnr_naks_rcvd = le64_to_cpu(sb->rnr_naks_rcvd); + stats->missing_resp = le64_to_cpu(sb->missing_resp); + stats->unrecoverable_err = le64_to_cpu(sb->unrecoverable_err); + stats->bad_resp_err = le64_to_cpu(sb->bad_resp_err); + stats->local_qp_op_err = le64_to_cpu(sb->local_qp_op_err); + stats->local_protection_err = le64_to_cpu(sb->local_protection_err); + stats->mem_mgmt_op_err = le64_to_cpu(sb->mem_mgmt_op_err); + stats->remote_invalid_req_err = le64_to_cpu(sb->remote_invalid_req_err); + stats->remote_access_err = le64_to_cpu(sb->remote_access_err); + stats->remote_op_err = le64_to_cpu(sb->remote_op_err); + stats->dup_req = le64_to_cpu(sb->dup_req); + 
stats->res_exceed_max = le64_to_cpu(sb->res_exceed_max); + stats->res_length_mismatch = le64_to_cpu(sb->res_length_mismatch); + stats->res_exceeds_wqe = le64_to_cpu(sb->res_exceeds_wqe); + stats->res_opcode_err = le64_to_cpu(sb->res_opcode_err); + stats->res_rx_invalid_rkey = le64_to_cpu(sb->res_rx_invalid_rkey); + stats->res_rx_domain_err = le64_to_cpu(sb->res_rx_domain_err); + stats->res_rx_no_perm = le64_to_cpu(sb->res_rx_no_perm); + stats->res_rx_range_err = le64_to_cpu(sb->res_rx_range_err); + stats->res_tx_invalid_rkey = le64_to_cpu(sb->res_tx_invalid_rkey); + stats->res_tx_domain_err = le64_to_cpu(sb->res_tx_domain_err); + stats->res_tx_no_perm = le64_to_cpu(sb->res_tx_no_perm); + stats->res_tx_range_err = le64_to_cpu(sb->res_tx_range_err); + stats->res_irrq_oflow = le64_to_cpu(sb->res_irrq_oflow); + stats->res_unsup_opcode = le64_to_cpu(sb->res_unsup_opcode); + stats->res_unaligned_atomic = le64_to_cpu(sb->res_unaligned_atomic); + stats->res_rem_inv_err = le64_to_cpu(sb->res_rem_inv_err); + stats->res_mem_error = le64_to_cpu(sb->res_mem_error); + stats->res_srq_err = le64_to_cpu(sb->res_srq_err); + stats->res_cmp_err = le64_to_cpu(sb->res_cmp_err); + stats->res_invalid_dup_rkey = le64_to_cpu(sb->res_invalid_dup_rkey); + stats->res_wqe_format_err = le64_to_cpu(sb->res_wqe_format_err); + stats->res_cq_load_err = le64_to_cpu(sb->res_cq_load_err); + stats->res_srq_load_err = le64_to_cpu(sb->res_srq_load_err); + stats->res_tx_pci_err = le64_to_cpu(sb->res_tx_pci_err); + stats->res_rx_pci_err = le64_to_cpu(sb->res_rx_pci_err); +bail: + bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); + return rc; +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h new file mode 100644 index 0000000..9d3e8b9 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -0,0 +1,252 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Slow Path Operators (header) + * + */ + +#ifndef __BNXT_QPLIB_SP_H__ +#define __BNXT_QPLIB_SP_H__ + +#define BNXT_QPLIB_RESERVED_QP_WRS 128 + +#define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040 + +struct bnxt_qplib_dev_attr { +#define FW_VER_ARR_LEN 4 + u8 fw_ver[FW_VER_ARR_LEN]; + u16 max_sgid; + u16 max_mrw; + u32 max_qp; +#define BNXT_QPLIB_MAX_OUT_RD_ATOM 126 + u32 max_qp_rd_atom; + u32 max_qp_init_rd_atom; + u32 max_qp_wqes; + u32 max_qp_sges; + u32 max_cq; + u32 max_cq_wqes; + u32 max_cq_sges; + u32 max_mr; + u64 max_mr_size; + u32 max_pd; + u32 max_mw; + u32 max_raw_ethy_qp; + u32 max_ah; + u32 max_fmr; + u32 max_map_per_fmr; + u32 max_srq; + u32 max_srq_wqes; + u32 max_srq_sges; + u32 max_pkey; + u32 max_inline_data; + u32 l2_db_size; + u8 tqm_alloc_reqs[MAX_TQM_ALLOC_REQ]; + bool is_atomic; +}; + +struct bnxt_qplib_pd { + u32 id; +}; + +struct bnxt_qplib_gid { + u8 data[16]; +}; + +struct bnxt_qplib_ah { + struct bnxt_qplib_gid dgid; + struct bnxt_qplib_pd *pd; + u32 id; + u8 sgid_index; + /* For Query AH if the hw table and sw table are different */ + u8 host_sgid_index; + u8 traffic_class; + u32 flow_label; + u8 hop_limit; + u8 sl; + u8 dmac[6]; + u16 vlan_id; + u8 nw_type; +}; + +struct bnxt_qplib_mrw { + struct bnxt_qplib_pd *pd; + int type; + u32 flags; +#define BNXT_QPLIB_FR_PMR 0x80000000 + u32 lkey; + u32 rkey; +#define BNXT_QPLIB_RSVD_LKEY 0xFFFFFFFF + u64 va; + u64 total_size; + u32 npages; + u64 mr_handle; + struct bnxt_qplib_hwq hwq; +}; + +struct bnxt_qplib_frpl { + int max_pg_ptrs; + struct bnxt_qplib_hwq hwq; +}; + +#define BNXT_QPLIB_ACCESS_LOCAL_WRITE BIT(0) +#define BNXT_QPLIB_ACCESS_REMOTE_READ BIT(1) +#define BNXT_QPLIB_ACCESS_REMOTE_WRITE BIT(2) +#define BNXT_QPLIB_ACCESS_REMOTE_ATOMIC BIT(3) +#define BNXT_QPLIB_ACCESS_MW_BIND BIT(4) +#define BNXT_QPLIB_ACCESS_ZERO_BASED BIT(5) +#define BNXT_QPLIB_ACCESS_ON_DEMAND BIT(6) + +struct bnxt_qplib_roce_stats { + u64 to_retransmits; + u64 seq_err_naks_rcvd; + /* seq_err_naks_rcvd is 64 b */ + u64 max_retry_exceeded; + /* max_retry_exceeded is 64 b */ + u64 rnr_naks_rcvd; + /* rnr_naks_rcvd is 64 b */ + u64 missing_resp; + u64 unrecoverable_err; + /* unrecoverable_err is 64 b */ + u64 bad_resp_err; + /* bad_resp_err is 64 b */ + u64 local_qp_op_err; + /* local_qp_op_err is 64 b */ + u64 local_protection_err; + /* local_protection_err is 64 b */ + u64 mem_mgmt_op_err; + /* mem_mgmt_op_err is 64 b */ + u64 remote_invalid_req_err; + /* remote_invalid_req_err is 64 b */ + u64 remote_access_err; + /* remote_access_err is 64 b */ + u64 remote_op_err; + /* remote_op_err is 64 b */ + u64 dup_req; + /* dup_req is 64 b */ + u64 res_exceed_max; + /* res_exceed_max is 64 b */ + u64 res_length_mismatch; + /* res_length_mismatch is 64 b */ + u64 res_exceeds_wqe; + /* res_exceeds_wqe is 64 b */ + u64 res_opcode_err; + /* res_opcode_err is 64 b */ + u64 res_rx_invalid_rkey; + /* res_rx_invalid_rkey is 64 b */ + u64 res_rx_domain_err; + /* res_rx_domain_err is 64 b */ + u64 res_rx_no_perm; + /*
res_rx_no_perm is 64 b */ + u64 res_rx_range_err; + /* res_rx_range_err is 64 b */ + u64 res_tx_invalid_rkey; + /* res_tx_invalid_rkey is 64 b */ + u64 res_tx_domain_err; + /* res_tx_domain_err is 64 b */ + u64 res_tx_no_perm; + /* res_tx_no_perm is 64 b */ + u64 res_tx_range_err; + /* res_tx_range_err is 64 b */ + u64 res_irrq_oflow; + /* res_irrq_oflow is 64 b */ + u64 res_unsup_opcode; + /* res_unsup_opcode is 64 b */ + u64 res_unaligned_atomic; + /* res_unaligned_atomic is 64 b */ + u64 res_rem_inv_err; + /* res_rem_inv_err is 64 b */ + u64 res_mem_error; + /* res_mem_error is 64 b */ + u64 res_srq_err; + /* res_srq_err is 64 b */ + u64 res_cmp_err; + /* res_cmp_err is 64 b */ + u64 res_invalid_dup_rkey; + /* res_invalid_dup_rkey is 64 b */ + u64 res_wqe_format_err; + /* res_wqe_format_err is 64 b */ + u64 res_cq_load_err; + /* res_cq_load_err is 64 b */ + u64 res_srq_load_err; + /* res_srq_load_err is 64 b */ + u64 res_tx_pci_err; + /* res_tx_pci_err is 64 b */ + u64 res_rx_pci_err; + /* res_rx_pci_err is 64 b */ +}; + +int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res, + struct bnxt_qplib_sgid_tbl *sgid_tbl, int index, + struct bnxt_qplib_gid *gid); +int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, bool update); +int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, u8 *mac, u16 vlan_id, + bool update, u32 *index); +int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, + struct bnxt_qplib_gid *gid, u16 gid_idx, u8 *smac); +int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index, + u16 *pkey); +int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey, + bool update); +int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res, + struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey, + bool update); +int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_dev_attr *attr, bool vf); +int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, + struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_ctx *ctx); +int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah); +int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah); +int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, + struct bnxt_qplib_mrw *mrw); +int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, + bool block); +int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, + u64 *pbl_tbl, int num_pbls, bool block, u32 buf_pg_size); +int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr); +int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res, + struct bnxt_qplib_mrw *mr, int max); +int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res, + struct bnxt_qplib_frpl *frpl, int max); +int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res, + struct bnxt_qplib_frpl *frpl); +int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids); +int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_roce_stats *stats); +#endif /* __BNXT_QPLIB_SP_H__*/ diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h new file mode 100644 index 0000000..3e5a4f7 --- /dev/null +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -0,0 +1,2969 @@ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. 
The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: RoCE HSI File - Autogenerated + */ + +#ifndef __BNXT_RE_HSI_H__ +#define __BNXT_RE_HSI_H__ + +/* include bnxt_hsi.h from bnxt_en driver */ +#include "bnxt_hsi.h" + +/* CMP Door Bell Format (4 bytes) */ +struct cmpl_doorbell { + __le32 key_mask_valid_idx; + #define CMPL_DOORBELL_IDX_MASK 0xffffffUL + #define CMPL_DOORBELL_IDX_SFT 0 + #define CMPL_DOORBELL_RESERVED_MASK 0x3000000UL + #define CMPL_DOORBELL_RESERVED_SFT 24 + #define CMPL_DOORBELL_IDX_VALID 0x4000000UL + #define CMPL_DOORBELL_MASK 0x8000000UL + #define CMPL_DOORBELL_KEY_MASK 0xf0000000UL + #define CMPL_DOORBELL_KEY_SFT 28 + #define CMPL_DOORBELL_KEY_CMPL (0x2UL << 28) +}; + +/* Status Door Bell Format (4 bytes) */ +struct status_doorbell { + __le32 key_idx; + #define STATUS_DOORBELL_IDX_MASK 0xffffffUL + #define STATUS_DOORBELL_IDX_SFT 0 + #define STATUS_DOORBELL_RESERVED_MASK 0xf000000UL + #define STATUS_DOORBELL_RESERVED_SFT 24 + #define STATUS_DOORBELL_KEY_MASK 0xf0000000UL + #define STATUS_DOORBELL_KEY_SFT 28 + #define STATUS_DOORBELL_KEY_STAT (0x3UL << 28) +}; + +/* RoCE Host Structures */ + +/* Doorbell Structures */ +/* 64b Doorbell Format (8 bytes) */ +struct dbr_dbr { + __le32 index; + #define DBR_DBR_INDEX_MASK 0xfffffUL + #define DBR_DBR_INDEX_SFT 0 + #define DBR_DBR_RESERVED12_MASK 0xfff00000UL + #define DBR_DBR_RESERVED12_SFT 20 + __le32 type_xid; + #define DBR_DBR_XID_MASK 0xfffffUL + #define DBR_DBR_XID_SFT 0 + #define DBR_DBR_RESERVED8_MASK 0xff00000UL + #define DBR_DBR_RESERVED8_SFT 20 + #define DBR_DBR_TYPE_MASK 0xf0000000UL + #define DBR_DBR_TYPE_SFT 28 + #define DBR_DBR_TYPE_SQ (0x0UL << 28) + #define DBR_DBR_TYPE_RQ (0x1UL << 28) + #define DBR_DBR_TYPE_SRQ (0x2UL << 28) + #define DBR_DBR_TYPE_SRQ_ARM (0x3UL << 28) + #define DBR_DBR_TYPE_CQ (0x4UL << 28) + #define DBR_DBR_TYPE_CQ_ARMSE (0x5UL << 28) + #define DBR_DBR_TYPE_CQ_ARMALL (0x6UL << 28) 
+ #define DBR_DBR_TYPE_CQ_ARMENA (0x7UL << 28) + #define DBR_DBR_TYPE_SRQ_ARMENA (0x8UL << 28) + #define DBR_DBR_TYPE_CQ_CUTOFF_ACK (0x9UL << 28) + #define DBR_DBR_TYPE_NULL (0xfUL << 28) +}; + +/* 32b Doorbell Format (4 bytes) */ +struct dbr_dbr32 { + __le32 type_abs_incr_xid; + #define DBR_DBR32_XID_MASK 0xfffffUL + #define DBR_DBR32_XID_SFT 0 + #define DBR_DBR32_RESERVED4_MASK 0xf00000UL + #define DBR_DBR32_RESERVED4_SFT 20 + #define DBR_DBR32_INCR_MASK 0xf000000UL + #define DBR_DBR32_INCR_SFT 24 + #define DBR_DBR32_ABS 0x10000000UL + #define DBR_DBR32_TYPE_MASK 0xe0000000UL + #define DBR_DBR32_TYPE_SFT 29 + #define DBR_DBR32_TYPE_SQ (0x0UL << 29) +}; + +/* SQ WQE Structures */ +/* Base SQ WQE (8 bytes) */ +struct sq_base { + u8 wqe_type; + #define SQ_BASE_WQE_TYPE_SEND 0x0UL + #define SQ_BASE_WQE_TYPE_SEND_W_IMMEAD 0x1UL + #define SQ_BASE_WQE_TYPE_SEND_W_INVALID 0x2UL + #define SQ_BASE_WQE_TYPE_WRITE_WQE 0x4UL + #define SQ_BASE_WQE_TYPE_WRITE_W_IMMEAD 0x5UL + #define SQ_BASE_WQE_TYPE_READ_WQE 0x6UL + #define SQ_BASE_WQE_TYPE_ATOMIC_CS 0x8UL + #define SQ_BASE_WQE_TYPE_ATOMIC_FA 0xbUL + #define SQ_BASE_WQE_TYPE_LOCAL_INVALID 0xcUL + #define SQ_BASE_WQE_TYPE_FR_PMR 0xdUL + #define SQ_BASE_WQE_TYPE_BIND 0xeUL + u8 unused_0[7]; +}; + +/* WQE SGE (16 bytes) */ +struct sq_sge { + __le64 va_or_pa; + __le32 l_key; + __le32 size; +}; + +/* PSN Search Structure (8 bytes) */ +struct sq_psn_search { + __le32 opcode_start_psn; + #define SQ_PSN_SEARCH_START_PSN_MASK 0xffffffUL + #define SQ_PSN_SEARCH_START_PSN_SFT 0 + #define SQ_PSN_SEARCH_OPCODE_MASK 0xff000000UL + #define SQ_PSN_SEARCH_OPCODE_SFT 24 + __le32 flags_next_psn; + #define SQ_PSN_SEARCH_NEXT_PSN_MASK 0xffffffUL + #define SQ_PSN_SEARCH_NEXT_PSN_SFT 0 + #define SQ_PSN_SEARCH_FLAGS_MASK 0xff000000UL + #define SQ_PSN_SEARCH_FLAGS_SFT 24 +}; + +/* Send SQ WQE (40 bytes) */ +struct sq_send { + u8 wqe_type; + #define SQ_SEND_WQE_TYPE_SEND 0x0UL + #define SQ_SEND_WQE_TYPE_SEND_W_IMMEAD 0x1UL + #define SQ_SEND_WQE_TYPE_SEND_W_INVALID 0x2UL + u8 flags; + #define SQ_SEND_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_SEND_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_SEND_FLAGS_UC_FENCE 0x4UL + #define SQ_SEND_FLAGS_SE 0x8UL + #define SQ_SEND_FLAGS_INLINE 0x10UL + u8 wqe_size; + u8 reserved8_1; + __le32 inv_key_or_imm_data; + __le32 length; + __le32 q_key; + __le32 dst_qp; + #define SQ_SEND_DST_QP_MASK 0xffffffUL + #define SQ_SEND_DST_QP_SFT 0 + #define SQ_SEND_RESERVED8_2_MASK 0xff000000UL + #define SQ_SEND_RESERVED8_2_SFT 24 + __le32 avid; + #define SQ_SEND_AVID_MASK 0xfffffUL + #define SQ_SEND_AVID_SFT 0 + #define SQ_SEND_RESERVED_AVID_MASK 0xfff00000UL + #define SQ_SEND_RESERVED_AVID_SFT 20 + __le64 reserved64; + __le32 data[24]; +}; + +/* Send Raw Ethernet and QP1 SQ WQE (40 bytes) */ +struct sq_send_raweth_qp1 { + u8 wqe_type; + #define SQ_SEND_RAWETH_QP1_WQE_TYPE_SEND 0x0UL + u8 flags; + #define SQ_SEND_RAWETH_QP1_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_SEND_RAWETH_QP1_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_SEND_RAWETH_QP1_FLAGS_UC_FENCE 0x4UL + #define SQ_SEND_RAWETH_QP1_FLAGS_SE 0x8UL + #define SQ_SEND_RAWETH_QP1_FLAGS_INLINE 0x10UL + u8 wqe_size; + u8 reserved8; + __le16 lflags; + #define SQ_SEND_RAWETH_QP1_LFLAGS_TCP_UDP_CHKSUM 0x1UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_IP_CHKSUM 0x2UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_NOCRC 0x4UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_STAMP 0x8UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_T_IP_CHKSUM 0x10UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_RESERVED1_1 0x20UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_RESERVED1_2 0x40UL + 
#define SQ_SEND_RAWETH_QP1_LFLAGS_RESERVED1_3 0x80UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_ROCE_CRC 0x100UL + #define SQ_SEND_RAWETH_QP1_LFLAGS_FCOE_CRC 0x200UL + __le16 cfa_action; + __le32 length; + __le32 reserved32_1; + __le32 cfa_meta; + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_MASK 0xfffUL + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_SFT 0 + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_DE 0x1000UL + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_PRI_MASK 0xe000UL + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_PRI_SFT 13 + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_MASK 0x70000UL + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_SFT 16 + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID88A8 (0x0UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID8100 (0x1UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID9100 (0x2UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID9200 (0x3UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPID9300 (0x4UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPIDCFG (0x5UL << 16) + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_LAST \ + SQ_SEND_RAWETH_QP1_CFA_META_VLAN_TPID_TPIDCFG + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_RESERVED_MASK 0xff80000UL + #define SQ_SEND_RAWETH_QP1_CFA_META_VLAN_RESERVED_SFT 19 + #define SQ_SEND_RAWETH_QP1_CFA_META_KEY_MASK 0xf0000000UL + #define SQ_SEND_RAWETH_QP1_CFA_META_KEY_SFT 28 + #define SQ_SEND_RAWETH_QP1_CFA_META_KEY_NONE (0x0UL << 28) + #define SQ_SEND_RAWETH_QP1_CFA_META_KEY_VLAN_TAG (0x1UL << 28) + #define SQ_SEND_RAWETH_QP1_CFA_META_KEY_LAST \ + SQ_SEND_RAWETH_QP1_CFA_META_KEY_VLAN_TAG + __le32 reserved32_2; + __le64 reserved64; + __le32 data[24]; +}; + +/* RDMA SQ WQE (40 bytes) */ +struct sq_rdma { + u8 wqe_type; + #define SQ_RDMA_WQE_TYPE_WRITE_WQE 0x4UL + #define SQ_RDMA_WQE_TYPE_WRITE_W_IMMEAD 0x5UL + #define SQ_RDMA_WQE_TYPE_READ_WQE 0x6UL + u8 flags; + #define SQ_RDMA_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_RDMA_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_RDMA_FLAGS_UC_FENCE 0x4UL + #define SQ_RDMA_FLAGS_SE 0x8UL + #define SQ_RDMA_FLAGS_INLINE 0x10UL + u8 wqe_size; + u8 reserved8; + __le32 imm_data; + __le32 length; + __le32 reserved32_1; + __le64 remote_va; + __le32 remote_key; + __le32 reserved32_2; + __le32 data[24]; +}; + +/* Atomic SQ WQE (40 bytes) */ +struct sq_atomic { + u8 wqe_type; + #define SQ_ATOMIC_WQE_TYPE_ATOMIC_CS 0x8UL + #define SQ_ATOMIC_WQE_TYPE_ATOMIC_FA 0xbUL + u8 flags; + #define SQ_ATOMIC_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_ATOMIC_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_ATOMIC_FLAGS_UC_FENCE 0x4UL + #define SQ_ATOMIC_FLAGS_SE 0x8UL + #define SQ_ATOMIC_FLAGS_INLINE 0x10UL + __le16 reserved16; + __le32 remote_key; + __le64 remote_va; + __le64 swap_data; + __le64 cmp_data; + __le32 data[24]; +}; + +/* Local Invalidate SQ WQE (40 bytes) */ +struct sq_localinvalidate { + u8 wqe_type; + #define SQ_LOCALINVALIDATE_WQE_TYPE_LOCAL_INVALID 0xcUL + u8 flags; + #define SQ_LOCALINVALIDATE_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_LOCALINVALIDATE_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_LOCALINVALIDATE_FLAGS_UC_FENCE 0x4UL + #define SQ_LOCALINVALIDATE_FLAGS_SE 0x8UL + #define SQ_LOCALINVALIDATE_FLAGS_INLINE 0x10UL + __le16 reserved16; + __le32 inv_l_key; + __le64 reserved64; + __le32 reserved128[4]; + __le32 data[24]; +}; + +/* FR-PMR SQ WQE (40 bytes) */ +struct sq_fr_pmr { + u8 wqe_type; + #define SQ_FR_PMR_WQE_TYPE_FR_PMR 0xdUL + u8 flags; + #define SQ_FR_PMR_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_FR_PMR_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define 
SQ_FR_PMR_FLAGS_UC_FENCE 0x4UL + #define SQ_FR_PMR_FLAGS_SE 0x8UL + #define SQ_FR_PMR_FLAGS_INLINE 0x10UL + u8 access_cntl; + #define SQ_FR_PMR_ACCESS_CNTL_LOCAL_WRITE 0x1UL + #define SQ_FR_PMR_ACCESS_CNTL_REMOTE_READ 0x2UL + #define SQ_FR_PMR_ACCESS_CNTL_REMOTE_WRITE 0x4UL + #define SQ_FR_PMR_ACCESS_CNTL_REMOTE_ATOMIC 0x8UL + #define SQ_FR_PMR_ACCESS_CNTL_WINDOW_BIND 0x10UL + u8 zero_based_page_size_log; + #define SQ_FR_PMR_PAGE_SIZE_LOG_MASK 0x1fUL + #define SQ_FR_PMR_PAGE_SIZE_LOG_SFT 0 + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_4K 0x0UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_8K 0x1UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_64K 0x4UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_256K 0x6UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_1M 0x8UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_2M 0x9UL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_4M 0xaUL + #define SQ_FR_PMR_PAGE_SIZE_LOG_PGSZ_1G 0x12UL + #define SQ_FR_PMR_ZERO_BASED 0x20UL + #define SQ_FR_PMR_RESERVED2_MASK 0xc0UL + #define SQ_FR_PMR_RESERVED2_SFT 6 + __le32 l_key; + u8 length[5]; + u8 reserved8_1; + u8 reserved8_2; + u8 numlevels_pbl_page_size_log; + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_MASK 0x1fUL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_SFT 0 + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_4K 0x0UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_8K 0x1UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_64K 0x4UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_256K 0x6UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_1M 0x8UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_2M 0x9UL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_4M 0xaUL + #define SQ_FR_PMR_PBL_PAGE_SIZE_LOG_PGSZ_1G 0x12UL + #define SQ_FR_PMR_RESERVED1 0x20UL + #define SQ_FR_PMR_NUMLEVELS_MASK 0xc0UL + #define SQ_FR_PMR_NUMLEVELS_SFT 6 + #define SQ_FR_PMR_NUMLEVELS_PHYSICAL (0x0UL << 6) + #define SQ_FR_PMR_NUMLEVELS_LAYER1 (0x1UL << 6) + #define SQ_FR_PMR_NUMLEVELS_LAYER2 (0x2UL << 6) + __le64 pblptr; + __le64 va; + __le32 data[24]; +}; + +/* Bind SQ WQE (40 bytes) */ +struct sq_bind { + u8 wqe_type; + #define SQ_BIND_WQE_TYPE_BIND 0xeUL + u8 flags; + #define SQ_BIND_FLAGS_SIGNAL_COMP 0x1UL + #define SQ_BIND_FLAGS_RD_OR_ATOMIC_FENCE 0x2UL + #define SQ_BIND_FLAGS_UC_FENCE 0x4UL + #define SQ_BIND_FLAGS_SE 0x8UL + #define SQ_BIND_FLAGS_INLINE 0x10UL + u8 access_cntl; + #define SQ_BIND_ACCESS_CNTL_LOCAL_WRITE 0x1UL + #define SQ_BIND_ACCESS_CNTL_REMOTE_READ 0x2UL + #define SQ_BIND_ACCESS_CNTL_REMOTE_WRITE 0x4UL + #define SQ_BIND_ACCESS_CNTL_REMOTE_ATOMIC 0x8UL + #define SQ_BIND_ACCESS_CNTL_WINDOW_BIND 0x10UL + u8 reserved8_1; + u8 mw_type_zero_based; + #define SQ_BIND_ZERO_BASED 0x1UL + #define SQ_BIND_MW_TYPE 0x2UL + #define SQ_BIND_MW_TYPE_TYPE1 (0x0UL << 1) + #define SQ_BIND_MW_TYPE_TYPE2 (0x1UL << 1) + #define SQ_BIND_RESERVED6_MASK 0xfcUL + #define SQ_BIND_RESERVED6_SFT 2 + u8 reserved8_2; + __le16 reserved16; + __le32 parent_l_key; + __le32 l_key; + __le64 va; + u8 length[5]; + u8 data_reserved24[99]; + #define SQ_BIND_RESERVED24_MASK 0xffffff00UL + #define SQ_BIND_RESERVED24_SFT 8 + #define SQ_BIND_DATA_MASK 0xffffffffUL + #define SQ_BIND_DATA_SFT 0 +}; + +/* RQ/SRQ WQE Structures */ +/* RQ/SRQ WQE (40 bytes) */ +struct rq_wqe { + u8 wqe_type; + #define RQ_WQE_WQE_TYPE_RCV 0x80UL + u8 flags; + u8 wqe_size; + u8 reserved8; + __le32 reserved32; + __le32 wr_id[2]; + #define RQ_WQE_WR_ID_MASK 0xfffffUL + #define RQ_WQE_WR_ID_SFT 0 + #define RQ_WQE_RESERVED44_MASK 0xfff00000UL + #define RQ_WQE_RESERVED44_SFT 20 + __le32 reserved128[4]; + __le32 data[24]; +}; + +/* CQ CQE Structures */ +/* Base CQE (32 bytes) */ +struct cq_base 
{ + __le64 reserved64_1; + __le64 reserved64_2; + __le64 reserved64_3; + u8 cqe_type_toggle; + #define CQ_BASE_TOGGLE 0x1UL + #define CQ_BASE_CQE_TYPE_MASK 0x1eUL + #define CQ_BASE_CQE_TYPE_SFT 1 + #define CQ_BASE_CQE_TYPE_REQ (0x0UL << 1) + #define CQ_BASE_CQE_TYPE_RES_RC (0x1UL << 1) + #define CQ_BASE_CQE_TYPE_RES_UD (0x2UL << 1) + #define CQ_BASE_CQE_TYPE_RES_RAWETH_QP1 (0x3UL << 1) + #define CQ_BASE_CQE_TYPE_TERMINAL (0xeUL << 1) + #define CQ_BASE_CQE_TYPE_CUT_OFF (0xfUL << 1) + #define CQ_BASE_RESERVED3_MASK 0xe0UL + #define CQ_BASE_RESERVED3_SFT 5 + u8 status; + __le16 reserved16; + __le32 reserved32; +}; + +/* Requester CQ CQE (32 bytes) */ +struct cq_req { + __le64 qp_handle; + __le16 sq_cons_idx; + __le16 reserved16_1; + __le32 reserved32_2; + __le64 reserved64; + u8 cqe_type_toggle; + #define CQ_REQ_TOGGLE 0x1UL + #define CQ_REQ_CQE_TYPE_MASK 0x1eUL + #define CQ_REQ_CQE_TYPE_SFT 1 + #define CQ_REQ_CQE_TYPE_REQ (0x0UL << 1) + #define CQ_REQ_RESERVED3_MASK 0xe0UL + #define CQ_REQ_RESERVED3_SFT 5 + u8 status; + #define CQ_REQ_STATUS_OK 0x0UL + #define CQ_REQ_STATUS_BAD_RESPONSE_ERR 0x1UL + #define CQ_REQ_STATUS_LOCAL_LENGTH_ERR 0x2UL + #define CQ_REQ_STATUS_LOCAL_QP_OPERATION_ERR 0x3UL + #define CQ_REQ_STATUS_LOCAL_PROTECTION_ERR 0x4UL + #define CQ_REQ_STATUS_MEMORY_MGT_OPERATION_ERR 0x5UL + #define CQ_REQ_STATUS_REMOTE_INVALID_REQUEST_ERR 0x6UL + #define CQ_REQ_STATUS_REMOTE_ACCESS_ERR 0x7UL + #define CQ_REQ_STATUS_REMOTE_OPERATION_ERR 0x8UL + #define CQ_REQ_STATUS_RNR_NAK_RETRY_CNT_ERR 0x9UL + #define CQ_REQ_STATUS_TRANSPORT_RETRY_CNT_ERR 0xaUL + #define CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR 0xbUL + __le16 reserved16_2; + __le32 reserved32_1; +}; + +/* Responder RC CQE (32 bytes) */ +struct cq_res_rc { + __le32 length; + __le32 imm_data_or_inv_r_key; + __le64 qp_handle; + __le64 mr_handle; + u8 cqe_type_toggle; + #define CQ_RES_RC_TOGGLE 0x1UL + #define CQ_RES_RC_CQE_TYPE_MASK 0x1eUL + #define CQ_RES_RC_CQE_TYPE_SFT 1 + #define CQ_RES_RC_CQE_TYPE_RES_RC (0x1UL << 1) + #define CQ_RES_RC_RESERVED3_MASK 0xe0UL + #define CQ_RES_RC_RESERVED3_SFT 5 + u8 status; + #define CQ_RES_RC_STATUS_OK 0x0UL + #define CQ_RES_RC_STATUS_LOCAL_ACCESS_ERROR 0x1UL + #define CQ_RES_RC_STATUS_LOCAL_LENGTH_ERR 0x2UL + #define CQ_RES_RC_STATUS_LOCAL_PROTECTION_ERR 0x3UL + #define CQ_RES_RC_STATUS_LOCAL_QP_OPERATION_ERR 0x4UL + #define CQ_RES_RC_STATUS_MEMORY_MGT_OPERATION_ERR 0x5UL + #define CQ_RES_RC_STATUS_REMOTE_INVALID_REQUEST_ERR 0x6UL + #define CQ_RES_RC_STATUS_WORK_REQUEST_FLUSHED_ERR 0x7UL + #define CQ_RES_RC_STATUS_HW_FLUSH_ERR 0x8UL + __le16 flags; + #define CQ_RES_RC_FLAGS_SRQ 0x1UL + #define CQ_RES_RC_FLAGS_SRQ_RQ (0x0UL << 0) + #define CQ_RES_RC_FLAGS_SRQ_SRQ (0x1UL << 0) + #define CQ_RES_RC_FLAGS_SRQ_LAST CQ_RES_RC_FLAGS_SRQ_SRQ + #define CQ_RES_RC_FLAGS_IMM 0x2UL + #define CQ_RES_RC_FLAGS_INV 0x4UL + #define CQ_RES_RC_FLAGS_RDMA 0x8UL + #define CQ_RES_RC_FLAGS_RDMA_SEND (0x0UL << 3) + #define CQ_RES_RC_FLAGS_RDMA_RDMA_WRITE (0x1UL << 3) + #define CQ_RES_RC_FLAGS_RDMA_LAST CQ_RES_RC_FLAGS_RDMA_RDMA_WRITE + __le32 srq_or_rq_wr_id; + #define CQ_RES_RC_SRQ_OR_RQ_WR_ID_MASK 0xfffffUL + #define CQ_RES_RC_SRQ_OR_RQ_WR_ID_SFT 0 + #define CQ_RES_RC_RESERVED12_MASK 0xfff00000UL + #define CQ_RES_RC_RESERVED12_SFT 20 +}; + +/* Responder UD CQE (32 bytes) */ +struct cq_res_ud { + __le32 length; + #define CQ_RES_UD_LENGTH_MASK 0x3fffUL + #define CQ_RES_UD_LENGTH_SFT 0 + #define CQ_RES_UD_RESERVED18_MASK 0xffffc000UL + #define CQ_RES_UD_RESERVED18_SFT 14 + __le32 imm_data; + __le64 qp_handle; + __le16 
src_mac[3]; + __le16 src_qp_low; + u8 cqe_type_toggle; + #define CQ_RES_UD_TOGGLE 0x1UL + #define CQ_RES_UD_CQE_TYPE_MASK 0x1eUL + #define CQ_RES_UD_CQE_TYPE_SFT 1 + #define CQ_RES_UD_CQE_TYPE_RES_UD (0x2UL << 1) + #define CQ_RES_UD_RESERVED3_MASK 0xe0UL + #define CQ_RES_UD_RESERVED3_SFT 5 + u8 status; + #define CQ_RES_UD_STATUS_OK 0x0UL + #define CQ_RES_UD_STATUS_LOCAL_ACCESS_ERROR 0x1UL + #define CQ_RES_UD_STATUS_HW_LOCAL_LENGTH_ERR 0x2UL + #define CQ_RES_UD_STATUS_LOCAL_PROTECTION_ERR 0x3UL + #define CQ_RES_UD_STATUS_LOCAL_QP_OPERATION_ERR 0x4UL + #define CQ_RES_UD_STATUS_MEMORY_MGT_OPERATION_ERR 0x5UL + #define CQ_RES_UD_STATUS_WORK_REQUEST_FLUSHED_ERR 0x7UL + #define CQ_RES_UD_STATUS_HW_FLUSH_ERR 0x8UL + __le16 flags; + #define CQ_RES_UD_FLAGS_SRQ 0x1UL + #define CQ_RES_UD_FLAGS_SRQ_RQ (0x0UL << 0) + #define CQ_RES_UD_FLAGS_SRQ_SRQ (0x1UL << 0) + #define CQ_RES_UD_FLAGS_SRQ_LAST CQ_RES_UD_FLAGS_SRQ_SRQ + #define CQ_RES_UD_FLAGS_IMM 0x2UL + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK 0xcUL + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT 2 + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V1 (0x0UL << 2) + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV4 (0x2UL << 2) + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 (0x3UL << 2) + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_LAST \ + CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 + __le32 src_qp_high_srq_or_rq_wr_id; + #define CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK 0xfffffUL + #define CQ_RES_UD_SRQ_OR_RQ_WR_ID_SFT 0 + #define CQ_RES_UD_RESERVED4_MASK 0xf00000UL + #define CQ_RES_UD_RESERVED4_SFT 20 + #define CQ_RES_UD_SRC_QP_HIGH_MASK 0xff000000UL + #define CQ_RES_UD_SRC_QP_HIGH_SFT 24 +}; + +/* Responder RawEth and QP1 CQE (32 bytes) */ +struct cq_res_raweth_qp1 { + __le16 length; + #define CQ_RES_RAWETH_QP1_LENGTH_MASK 0x3fffUL + #define CQ_RES_RAWETH_QP1_LENGTH_SFT 0 + #define CQ_RES_RAWETH_QP1_RESERVED2_MASK 0xc000UL + #define CQ_RES_RAWETH_QP1_RESERVED2_SFT 14 + __le16 raweth_qp1_flags; + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ERROR 0x1UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_RESERVED5_1_MASK 0x3eUL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_RESERVED5_1_SFT 1 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_MASK 0x3c0UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_SFT 6 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_NOT_KNOWN (0x0UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_IP (0x1UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_TCP (0x2UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_UDP (0x3UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_FCOE (0x4UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_ROCE (0x5UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_ICMP (0x7UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_PTP_WO_TIMESTAMP \ + (0x8UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_PTP_W_TIMESTAMP \ + (0x9UL << 6) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_LAST \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_ITYPE_PTP_W_TIMESTAMP + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_MASK 0x3ffUL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS_SFT 0 + #define CQ_RES_RAWETH_QP1_RESERVED6_MASK 0xfc00UL + #define CQ_RES_RAWETH_QP1_RESERVED6_SFT 10 + __le16 raweth_qp1_errors; + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_RESERVED4_MASK 0xfUL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_RESERVED4_SFT 0 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_IP_CS_ERROR 0x10UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_L4_CS_ERROR 0x20UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_IP_CS_ERROR 
0x40UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_L4_CS_ERROR 0x80UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_CRC_ERROR 0x100UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_MASK 0xe00UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_SFT 9 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_NO_ERROR \ + (0x0UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_L3_BAD_VERSION \ + (0x1UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_L3_BAD_HDR_LEN \ + (0x2UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_TUNNEL_TOTAL_ERROR \ + (0x3UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_IP_TOTAL_ERROR \ + (0x4UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_UDP_TOTAL_ERROR \ + (0x5UL << 9) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_L3_BAD_TTL \ + (0x6UL << 9) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_LAST \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_T_PKT_ERROR_T_L3_BAD_TTL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_MASK 0xf000UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_SFT 12 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_NO_ERROR \ + (0x0UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L3_BAD_VERSION \ + (0x1UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L3_BAD_HDR_LEN \ + (0x2UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L3_BAD_TTL \ + (0x3UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_IP_TOTAL_ERROR \ + (0x4UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_UDP_TOTAL_ERROR \ + (0x5UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L4_BAD_HDR_LEN \ + (0x6UL << 12) + #define \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L4_BAD_HDR_LEN_TOO_SMALL\ + (0x7UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L4_BAD_OPT_LEN \ + (0x8UL << 12) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_LAST \ + CQ_RES_RAWETH_QP1_RAWETH_QP1_ERRORS_PKT_ERROR_L4_BAD_OPT_LEN + __le16 raweth_qp1_cfa_code; + __le64 qp_handle; + __le32 raweth_qp1_flags2; + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_IP_CS_CALC 0x1UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_L4_CS_CALC 0x2UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_T_IP_CS_CALC 0x4UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_T_L4_CS_CALC 0x8UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_MASK 0xf0UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_SFT 4 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_NONE \ + (0x0UL << 4) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_VLAN \ + (0x1UL << 4) + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_LAST\ + CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_META_FORMAT_VLAN + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_FLAGS2_IP_TYPE 0x100UL + __le32 raweth_qp1_metadata; + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_VID_MASK 0xfffUL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_VID_SFT 0 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_DE 0x1000UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_PRI_MASK 0xe000UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_PRI_SFT 13 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_TPID_MASK 0xffff0000UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_METADATA_TPID_SFT 16 + u8 cqe_type_toggle; + #define CQ_RES_RAWETH_QP1_TOGGLE 0x1UL + #define CQ_RES_RAWETH_QP1_CQE_TYPE_MASK 0x1eUL + #define CQ_RES_RAWETH_QP1_CQE_TYPE_SFT 
1 + #define CQ_RES_RAWETH_QP1_CQE_TYPE_RES_RAWETH_QP1 (0x3UL << 1) + #define CQ_RES_RAWETH_QP1_RESERVED3_MASK 0xe0UL + #define CQ_RES_RAWETH_QP1_RESERVED3_SFT 5 + u8 status; + #define CQ_RES_RAWETH_QP1_STATUS_OK 0x0UL + #define CQ_RES_RAWETH_QP1_STATUS_LOCAL_ACCESS_ERROR 0x1UL + #define CQ_RES_RAWETH_QP1_STATUS_HW_LOCAL_LENGTH_ERR 0x2UL + #define CQ_RES_RAWETH_QP1_STATUS_LOCAL_PROTECTION_ERR 0x3UL + #define CQ_RES_RAWETH_QP1_STATUS_LOCAL_QP_OPERATION_ERR 0x4UL + #define CQ_RES_RAWETH_QP1_STATUS_MEMORY_MGT_OPERATION_ERR 0x5UL + #define CQ_RES_RAWETH_QP1_STATUS_WORK_REQUEST_FLUSHED_ERR 0x7UL + #define CQ_RES_RAWETH_QP1_STATUS_HW_FLUSH_ERR 0x8UL + __le16 flags; + #define CQ_RES_RAWETH_QP1_FLAGS_SRQ 0x1UL + #define CQ_RES_RAWETH_QP1_FLAGS_SRQ_RQ 0x0UL + #define CQ_RES_RAWETH_QP1_FLAGS_SRQ_SRQ 0x1UL + #define CQ_RES_RAWETH_QP1_FLAGS_SRQ_LAST \ + CQ_RES_RAWETH_QP1_FLAGS_SRQ_SRQ + __le32 raweth_qp1_payload_offset_srq_or_rq_wr_id; + #define CQ_RES_RAWETH_QP1_SRQ_OR_RQ_WR_ID_MASK 0xfffffUL + #define CQ_RES_RAWETH_QP1_SRQ_OR_RQ_WR_ID_SFT 0 + #define CQ_RES_RAWETH_QP1_RESERVED4_MASK 0xf00000UL + #define CQ_RES_RAWETH_QP1_RESERVED4_SFT 20 + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_PAYLOAD_OFFSET_MASK 0xff000000UL + #define CQ_RES_RAWETH_QP1_RAWETH_QP1_PAYLOAD_OFFSET_SFT 24 +}; + +/* Terminal CQE (32 bytes) */ +struct cq_terminal { + __le64 qp_handle; + __le16 sq_cons_idx; + __le16 rq_cons_idx; + __le32 reserved32_1; + __le64 reserved64_3; + u8 cqe_type_toggle; + #define CQ_TERMINAL_TOGGLE 0x1UL + #define CQ_TERMINAL_CQE_TYPE_MASK 0x1eUL + #define CQ_TERMINAL_CQE_TYPE_SFT 1 + #define CQ_TERMINAL_CQE_TYPE_TERMINAL (0xeUL << 1) + #define CQ_TERMINAL_RESERVED3_MASK 0xe0UL + #define CQ_TERMINAL_RESERVED3_SFT 5 + u8 status; + #define CQ_TERMINAL_STATUS_OK 0x0UL + __le16 reserved16; + __le32 reserved32_2; +}; + +/* Cutoff CQE (32 bytes) */ +struct cq_cutoff { + __le64 reserved64_1; + __le64 reserved64_2; + __le64 reserved64_3; + u8 cqe_type_toggle; + #define CQ_CUTOFF_TOGGLE 0x1UL + #define CQ_CUTOFF_CQE_TYPE_MASK 0x1eUL + #define CQ_CUTOFF_CQE_TYPE_SFT 1 + #define CQ_CUTOFF_CQE_TYPE_CUT_OFF (0xfUL << 1) + #define CQ_CUTOFF_RESERVED3_MASK 0xe0UL + #define CQ_CUTOFF_RESERVED3_SFT 5 + u8 status; + #define CQ_CUTOFF_STATUS_OK 0x0UL + __le16 reserved16; + __le32 reserved32; +}; + +/* Notification Queue (NQ) Structures */ +/* Base NQ Record (16 bytes) */ +struct nq_base { + __le16 info10_type; + #define NQ_BASE_TYPE_MASK 0x3fUL + #define NQ_BASE_TYPE_SFT 0 + #define NQ_BASE_TYPE_CQ_NOTIFICATION 0x30UL + #define NQ_BASE_TYPE_SRQ_EVENT 0x32UL + #define NQ_BASE_TYPE_DBQ_EVENT 0x34UL + #define NQ_BASE_TYPE_QP_EVENT 0x38UL + #define NQ_BASE_TYPE_FUNC_EVENT 0x3aUL + #define NQ_BASE_INFO10_MASK 0xffc0UL + #define NQ_BASE_INFO10_SFT 6 + __le16 info16; + __le32 info32; + __le32 info63_v[2]; + #define NQ_BASE_V 0x1UL + #define NQ_BASE_INFO63_MASK 0xfffffffeUL + #define NQ_BASE_INFO63_SFT 1 +}; + +/* Completion Queue Notification (16 bytes) */ +struct nq_cn { + __le16 type; + #define NQ_CN_TYPE_MASK 0x3fUL + #define NQ_CN_TYPE_SFT 0 + #define NQ_CN_TYPE_CQ_NOTIFICATION 0x30UL + #define NQ_CN_RESERVED9_MASK 0xffc0UL + #define NQ_CN_RESERVED9_SFT 6 + __le16 reserved16; + __le32 cq_handle_low; + __le32 v; + #define NQ_CN_V 0x1UL + #define NQ_CN_RESERVED31_MASK 0xfffffffeUL + #define NQ_CN_RESERVED31_SFT 1 + __le32 cq_handle_high; +}; + +/* SRQ Event Notification (16 bytes) */ +struct nq_srq_event { + u8 type; + #define NQ_SRQ_EVENT_TYPE_MASK 0x3fUL + #define NQ_SRQ_EVENT_TYPE_SFT 0 + #define NQ_SRQ_EVENT_TYPE_SRQ_EVENT 0x32UL + 
#define NQ_SRQ_EVENT_RESERVED1_MASK 0xc0UL + #define NQ_SRQ_EVENT_RESERVED1_SFT 6 + u8 event; + #define NQ_SRQ_EVENT_EVENT_SRQ_THRESHOLD_EVENT 0x1UL + __le16 reserved16; + __le32 srq_handle_low; + __le32 v; + #define NQ_SRQ_EVENT_V 0x1UL + #define NQ_SRQ_EVENT_RESERVED31_MASK 0xfffffffeUL + #define NQ_SRQ_EVENT_RESERVED31_SFT 1 + __le32 srq_handle_high; +}; + +/* DBQ Async Event Notification (16 bytes) */ +struct nq_dbq_event { + u8 type; + #define NQ_DBQ_EVENT_TYPE_MASK 0x3fUL + #define NQ_DBQ_EVENT_TYPE_SFT 0 + #define NQ_DBQ_EVENT_TYPE_DBQ_EVENT 0x34UL + #define NQ_DBQ_EVENT_RESERVED1_MASK 0xc0UL + #define NQ_DBQ_EVENT_RESERVED1_SFT 6 + u8 event; + #define NQ_DBQ_EVENT_EVENT_DBQ_THRESHOLD_EVENT 0x1UL + __le16 db_pfid; + #define NQ_DBQ_EVENT_DB_PFID_MASK 0xfUL + #define NQ_DBQ_EVENT_DB_PFID_SFT 0 + #define NQ_DBQ_EVENT_RESERVED12_MASK 0xfff0UL + #define NQ_DBQ_EVENT_RESERVED12_SFT 4 + __le32 db_dpi; + #define NQ_DBQ_EVENT_DB_DPI_MASK 0xfffffUL + #define NQ_DBQ_EVENT_DB_DPI_SFT 0 + #define NQ_DBQ_EVENT_RESERVED12_2_MASK 0xfff00000UL + #define NQ_DBQ_EVENT_RESERVED12_2_SFT 20 + __le32 v; + #define NQ_DBQ_EVENT_V 0x1UL + #define NQ_DBQ_EVENT_RESERVED32_MASK 0xfffffffeUL + #define NQ_DBQ_EVENT_RESERVED32_SFT 1 + __le32 db_type_db_xid; + #define NQ_DBQ_EVENT_DB_XID_MASK 0xfffffUL + #define NQ_DBQ_EVENT_DB_XID_SFT 0 + #define NQ_DBQ_EVENT_RESERVED8_MASK 0xff00000UL + #define NQ_DBQ_EVENT_RESERVED8_SFT 20 + #define NQ_DBQ_EVENT_DB_TYPE_MASK 0xf0000000UL + #define NQ_DBQ_EVENT_DB_TYPE_SFT 28 +}; + +/* Read Request/Response Queue Structures */ +/* Input Read Request Queue (IRRQ) Message (32 bytes) */ +struct xrrq_irrq { + __le16 credits_type; + #define XRRQ_IRRQ_TYPE 0x1UL + #define XRRQ_IRRQ_TYPE_READ_REQ 0x0UL + #define XRRQ_IRRQ_TYPE_ATOMIC_REQ 0x1UL + #define XRRQ_IRRQ_RESERVED10_MASK 0x7feUL + #define XRRQ_IRRQ_RESERVED10_SFT 1 + #define XRRQ_IRRQ_CREDITS_MASK 0xf800UL + #define XRRQ_IRRQ_CREDITS_SFT 11 + __le16 reserved16; + __le32 reserved32; + __le32 psn; + #define XRRQ_IRRQ_PSN_MASK 0xffffffUL + #define XRRQ_IRRQ_PSN_SFT 0 + #define XRRQ_IRRQ_RESERVED8_1_MASK 0xff000000UL + #define XRRQ_IRRQ_RESERVED8_1_SFT 24 + __le32 msn; + #define XRRQ_IRRQ_MSN_MASK 0xffffffUL + #define XRRQ_IRRQ_MSN_SFT 0 + #define XRRQ_IRRQ_RESERVED8_2_MASK 0xff000000UL + #define XRRQ_IRRQ_RESERVED8_2_SFT 24 + __le64 va_or_atomic_result; + __le32 rdma_r_key; + __le32 length; +}; + +/* Output Read Request Queue (ORRQ) Message (32 bytes) */ +struct xrrq_orrq { + __le16 num_sges_type; + #define XRRQ_ORRQ_TYPE 0x1UL + #define XRRQ_ORRQ_TYPE_READ_REQ 0x0UL + #define XRRQ_ORRQ_TYPE_ATOMIC_REQ 0x1UL + #define XRRQ_ORRQ_RESERVED10_MASK 0x7feUL + #define XRRQ_ORRQ_RESERVED10_SFT 1 + #define XRRQ_ORRQ_NUM_SGES_MASK 0xf800UL + #define XRRQ_ORRQ_NUM_SGES_SFT 11 + __le16 reserved16; + __le32 length; + __le32 psn; + #define XRRQ_ORRQ_PSN_MASK 0xffffffUL + #define XRRQ_ORRQ_PSN_SFT 0 + #define XRRQ_ORRQ_RESERVED8_1_MASK 0xff000000UL + #define XRRQ_ORRQ_RESERVED8_1_SFT 24 + __le32 end_psn; + #define XRRQ_ORRQ_END_PSN_MASK 0xffffffUL + #define XRRQ_ORRQ_END_PSN_SFT 0 + #define XRRQ_ORRQ_RESERVED8_2_MASK 0xff000000UL + #define XRRQ_ORRQ_RESERVED8_2_SFT 24 + __le64 first_sge_phy_or_sing_sge_va; + __le32 single_sge_l_key; + __le32 single_sge_size; +}; + +/* Page Buffer List Memory Structures (PBL) */ +/* Page Table Entry (PTE) (8 bytes) */ +struct ptu_pte { + __le32 page_next_to_last_last_valid[2]; + #define PTU_PTE_VALID 0x1UL + #define PTU_PTE_LAST 0x2UL + #define PTU_PTE_NEXT_TO_LAST 0x4UL + #define PTU_PTE_PAGE_MASK 0xfffff000UL + 
#define PTU_PTE_PAGE_SFT 12 +}; + +/* Page Directory Entry (PDE) (8 bytes) */ +struct ptu_pde { + __le32 page_valid[2]; + #define PTU_PDE_VALID 0x1UL + #define PTU_PDE_PAGE_MASK 0xfffff000UL + #define PTU_PDE_PAGE_SFT 12 +}; + +/* RoCE Fastpath Host Structures */ +/* Command Queue (CMDQ) Interface */ +/* Init CMDQ (16 bytes) */ +struct cmdq_init { + __le64 cmdq_pbl; + __le16 cmdq_size_cmdq_lvl; + #define CMDQ_INIT_CMDQ_LVL_MASK 0x3UL + #define CMDQ_INIT_CMDQ_LVL_SFT 0 + #define CMDQ_INIT_CMDQ_SIZE_MASK 0xfffcUL + #define CMDQ_INIT_CMDQ_SIZE_SFT 2 + __le16 creq_ring_id; + __le32 prod_idx; +}; + +/* Update CMDQ producer index (16 bytes) */ +struct cmdq_update { + __le64 reserved64; + __le32 reserved32; + __le32 prod_idx; +}; + +/* CMDQ common header structure (16 bytes) */ +struct cmdq_base { + u8 opcode; + #define CMDQ_BASE_OPCODE_CREATE_QP 0x1UL + #define CMDQ_BASE_OPCODE_DESTROY_QP 0x2UL + #define CMDQ_BASE_OPCODE_MODIFY_QP 0x3UL + #define CMDQ_BASE_OPCODE_QUERY_QP 0x4UL + #define CMDQ_BASE_OPCODE_CREATE_SRQ 0x5UL + #define CMDQ_BASE_OPCODE_DESTROY_SRQ 0x6UL + #define CMDQ_BASE_OPCODE_QUERY_SRQ 0x8UL + #define CMDQ_BASE_OPCODE_CREATE_CQ 0x9UL + #define CMDQ_BASE_OPCODE_DESTROY_CQ 0xaUL + #define CMDQ_BASE_OPCODE_RESIZE_CQ 0xcUL + #define CMDQ_BASE_OPCODE_ALLOCATE_MRW 0xdUL + #define CMDQ_BASE_OPCODE_DEALLOCATE_KEY 0xeUL + #define CMDQ_BASE_OPCODE_REGISTER_MR 0xfUL + #define CMDQ_BASE_OPCODE_DEREGISTER_MR 0x10UL + #define CMDQ_BASE_OPCODE_ADD_GID 0x11UL + #define CMDQ_BASE_OPCODE_DELETE_GID 0x12UL + #define CMDQ_BASE_OPCODE_MODIFY_GID 0x17UL + #define CMDQ_BASE_OPCODE_QUERY_GID 0x18UL + #define CMDQ_BASE_OPCODE_CREATE_QP1 0x13UL + #define CMDQ_BASE_OPCODE_DESTROY_QP1 0x14UL + #define CMDQ_BASE_OPCODE_CREATE_AH 0x15UL + #define CMDQ_BASE_OPCODE_DESTROY_AH 0x16UL + #define CMDQ_BASE_OPCODE_INITIALIZE_FW 0x80UL + #define CMDQ_BASE_OPCODE_DEINITIALIZE_FW 0x81UL + #define CMDQ_BASE_OPCODE_STOP_FUNC 0x82UL + #define CMDQ_BASE_OPCODE_QUERY_FUNC 0x83UL + #define CMDQ_BASE_OPCODE_SET_FUNC_RESOURCES 0x84UL + #define CMDQ_BASE_OPCODE_READ_CONTEXT 0x85UL + #define CMDQ_BASE_OPCODE_VF_BACKCHANNEL_REQUEST 0x86UL + #define CMDQ_BASE_OPCODE_READ_VF_MEMORY 0x87UL + #define CMDQ_BASE_OPCODE_COMPLETE_VF_REQUEST 0x88UL + #define CMDQ_BASE_OPCODE_EXTEND_CONTEXT_ARRRAY 0x89UL + #define CMDQ_BASE_OPCODE_MAP_TC_TO_COS 0x8aUL + #define CMDQ_BASE_OPCODE_QUERY_VERSION 0x8bUL + #define CMDQ_BASE_OPCODE_MODIFY_CC 0x8cUL + #define CMDQ_BASE_OPCODE_QUERY_CC 0x8dUL + #define CMDQ_BASE_OPCODE_QUERY_ROCE_STATS 0x8eUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Create QP command (96 bytes) */ +struct cmdq_create_qp { + u8 opcode; + #define CMDQ_CREATE_QP_OPCODE_CREATE_QP 0x1UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 qp_handle; + __le32 qp_flags; + #define CMDQ_CREATE_QP_QP_FLAGS_SRQ_USED 0x1UL + #define CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION 0x2UL + #define CMDQ_CREATE_QP_QP_FLAGS_RESERVED_LKEY_ENABLE 0x4UL + #define CMDQ_CREATE_QP_QP_FLAGS_FR_PMR_ENABLED 0x8UL + u8 type; + #define CMDQ_CREATE_QP_TYPE_RC 0x2UL + #define CMDQ_CREATE_QP_TYPE_UD 0x4UL + #define CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE 0x6UL + u8 sq_pg_size_sq_lvl; + #define CMDQ_CREATE_QP_SQ_LVL_MASK 0xfUL + #define CMDQ_CREATE_QP_SQ_LVL_SFT 0 + #define CMDQ_CREATE_QP_SQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_QP_SQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_QP_SQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_QP_SQ_PG_SIZE_MASK 0xf0UL + #define 
CMDQ_CREATE_QP_SQ_PG_SIZE_SFT 4 + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_CREATE_QP_SQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 rq_pg_size_rq_lvl; + #define CMDQ_CREATE_QP_RQ_LVL_MASK 0xfUL + #define CMDQ_CREATE_QP_RQ_LVL_SFT 0 + #define CMDQ_CREATE_QP_RQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_QP_RQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_QP_RQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_QP_RQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_CREATE_QP_RQ_PG_SIZE_SFT 4 + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_CREATE_QP_RQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 unused_0; + __le32 dpi; + __le32 sq_size; + __le32 rq_size; + __le16 sq_fwo_sq_sge; + #define CMDQ_CREATE_QP_SQ_SGE_MASK 0xfUL + #define CMDQ_CREATE_QP_SQ_SGE_SFT 0 + #define CMDQ_CREATE_QP_SQ_FWO_MASK 0xfff0UL + #define CMDQ_CREATE_QP_SQ_FWO_SFT 4 + __le16 rq_fwo_rq_sge; + #define CMDQ_CREATE_QP_RQ_SGE_MASK 0xfUL + #define CMDQ_CREATE_QP_RQ_SGE_SFT 0 + #define CMDQ_CREATE_QP_RQ_FWO_MASK 0xfff0UL + #define CMDQ_CREATE_QP_RQ_FWO_SFT 4 + __le32 scq_cid; + __le32 rcq_cid; + __le32 srq_cid; + __le32 pd_id; + __le64 sq_pbl; + __le64 rq_pbl; + __le64 irrq_addr; + __le64 orrq_addr; +}; + +/* Destroy QP command (24 bytes) */ +struct cmdq_destroy_qp { + u8 opcode; + #define CMDQ_DESTROY_QP_OPCODE_DESTROY_QP 0x2UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 qp_cid; + __le32 unused_0; +}; + +/* Modify QP command (112 bytes) */ +struct cmdq_modify_qp { + u8 opcode; + #define CMDQ_MODIFY_QP_OPCODE_MODIFY_QP 0x3UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 modify_mask; + #define CMDQ_MODIFY_QP_MODIFY_MASK_STATE 0x1UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_EN_SQD_ASYNC_NOTIFY 0x2UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_ACCESS 0x4UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_PKEY 0x8UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_QKEY 0x10UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_DGID 0x20UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL 0x40UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX 0x80UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_HOP_LIMIT 0x100UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_TRAFFIC_CLASS 0x200UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_DEST_MAC 0x400UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU 0x1000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_TIMEOUT 0x2000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_RETRY_CNT 0x4000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_RNR_RETRY 0x8000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_RQ_PSN 0x10000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_MAX_RD_ATOMIC 0x20000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_MIN_RNR_TIMER 0x40000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_SQ_PSN 0x80000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_MAX_DEST_RD_ATOMIC 0x100000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_SQ_SIZE 0x200000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_RQ_SIZE 0x400000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_SQ_SGE 0x800000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_RQ_SGE 0x1000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_MAX_INLINE_DATA 0x2000000UL + #define 
CMDQ_MODIFY_QP_MODIFY_MASK_DEST_QP_ID 0x4000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_SRC_MAC 0x8000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID 0x10000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_ENABLE_CC 0x20000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_TOS_ECN 0x40000000UL + #define CMDQ_MODIFY_QP_MODIFY_MASK_TOS_DSCP 0x80000000UL + __le32 qp_cid; + u8 network_type_en_sqd_async_notify_new_state; + #define CMDQ_MODIFY_QP_NEW_STATE_MASK 0xfUL + #define CMDQ_MODIFY_QP_NEW_STATE_SFT 0 + #define CMDQ_MODIFY_QP_NEW_STATE_RESET 0x0UL + #define CMDQ_MODIFY_QP_NEW_STATE_INIT 0x1UL + #define CMDQ_MODIFY_QP_NEW_STATE_RTR 0x2UL + #define CMDQ_MODIFY_QP_NEW_STATE_RTS 0x3UL + #define CMDQ_MODIFY_QP_NEW_STATE_SQD 0x4UL + #define CMDQ_MODIFY_QP_NEW_STATE_SQE 0x5UL + #define CMDQ_MODIFY_QP_NEW_STATE_ERR 0x6UL + #define CMDQ_MODIFY_QP_EN_SQD_ASYNC_NOTIFY 0x10UL + #define CMDQ_MODIFY_QP_NETWORK_TYPE_MASK 0xc0UL + #define CMDQ_MODIFY_QP_NETWORK_TYPE_SFT 6 + #define CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV1 (0x0UL << 6) + #define CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV4 (0x2UL << 6) + #define CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV6 (0x3UL << 6) + u8 access; + #define CMDQ_MODIFY_QP_ACCESS_LOCAL_WRITE 0x1UL + #define CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE 0x2UL + #define CMDQ_MODIFY_QP_ACCESS_REMOTE_READ 0x4UL + #define CMDQ_MODIFY_QP_ACCESS_REMOTE_ATOMIC 0x8UL + __le16 pkey; + __le32 qkey; + __le32 dgid[4]; + __le32 flow_label; + __le16 sgid_index; + u8 hop_limit; + u8 traffic_class; + __le16 dest_mac[3]; + u8 tos_dscp_tos_ecn; + #define CMDQ_MODIFY_QP_TOS_ECN_MASK 0x3UL + #define CMDQ_MODIFY_QP_TOS_ECN_SFT 0 + #define CMDQ_MODIFY_QP_TOS_DSCP_MASK 0xfcUL + #define CMDQ_MODIFY_QP_TOS_DSCP_SFT 2 + u8 path_mtu; + #define CMDQ_MODIFY_QP_PATH_MTU_MASK 0xf0UL + #define CMDQ_MODIFY_QP_PATH_MTU_SFT 4 + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_256 (0x0UL << 4) + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_512 (0x1UL << 4) + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_1024 (0x2UL << 4) + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_2048 (0x3UL << 4) + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_4096 (0x4UL << 4) + #define CMDQ_MODIFY_QP_PATH_MTU_MTU_8192 (0x5UL << 4) + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u8 min_rnr_timer; + __le32 rq_psn; + __le32 sq_psn; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + __le16 enable_cc; + #define CMDQ_MODIFY_QP_ENABLE_CC 0x1UL + __le32 sq_size; + __le32 rq_size; + __le16 sq_sge; + __le16 rq_sge; + __le32 max_inline_data; + __le32 dest_qp_id; + __le32 unused_3; + __le16 src_mac[3]; + __le16 vlan_pcp_vlan_dei_vlan_id; + #define CMDQ_MODIFY_QP_VLAN_ID_MASK 0xfffUL + #define CMDQ_MODIFY_QP_VLAN_ID_SFT 0 + #define CMDQ_MODIFY_QP_VLAN_DEI 0x1000UL + #define CMDQ_MODIFY_QP_VLAN_PCP_MASK 0xe000UL + #define CMDQ_MODIFY_QP_VLAN_PCP_SFT 13 +}; + +/* Query QP command (24 bytes) */ +struct cmdq_query_qp { + u8 opcode; + #define CMDQ_QUERY_QP_OPCODE_QUERY_QP 0x4UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 qp_cid; + __le32 unused_0; +}; + +/* Create SRQ command (48 bytes) */ +struct cmdq_create_srq { + u8 opcode; + #define CMDQ_CREATE_SRQ_OPCODE_CREATE_SRQ 0x5UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 srq_handle; + __le16 pg_size_lvl; + #define CMDQ_CREATE_SRQ_LVL_MASK 0x3UL + #define CMDQ_CREATE_SRQ_LVL_SFT 0 + #define CMDQ_CREATE_SRQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_SRQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_SRQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_SRQ_PG_SIZE_MASK 0x1cUL + #define 
CMDQ_CREATE_SRQ_PG_SIZE_SFT 2 + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_4K (0x0UL << 2) + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_8K (0x1UL << 2) + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_64K (0x2UL << 2) + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_2M (0x3UL << 2) + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_8M (0x4UL << 2) + #define CMDQ_CREATE_SRQ_PG_SIZE_PG_1G (0x5UL << 2) + __le16 eventq_id; + #define CMDQ_CREATE_SRQ_EVENTQ_ID_MASK 0xfffUL + #define CMDQ_CREATE_SRQ_EVENTQ_ID_SFT 0 + __le16 srq_size; + __le16 srq_fwo; + __le32 dpi; + __le32 pd_id; + __le64 pbl; +}; + +/* Destroy SRQ command (24 bytes) */ +struct cmdq_destroy_srq { + u8 opcode; + #define CMDQ_DESTROY_SRQ_OPCODE_DESTROY_SRQ 0x6UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 srq_cid; + __le32 unused_0; +}; + +/* Query SRQ command (24 bytes) */ +struct cmdq_query_srq { + u8 opcode; + #define CMDQ_QUERY_SRQ_OPCODE_QUERY_SRQ 0x8UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 srq_cid; + __le32 unused_0; +}; + +/* Create CQ command (48 bytes) */ +struct cmdq_create_cq { + u8 opcode; + #define CMDQ_CREATE_CQ_OPCODE_CREATE_CQ 0x9UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 cq_handle; + __le32 pg_size_lvl; + #define CMDQ_CREATE_CQ_LVL_MASK 0x3UL + #define CMDQ_CREATE_CQ_LVL_SFT 0 + #define CMDQ_CREATE_CQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_CQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_CQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_CQ_PG_SIZE_MASK 0x1cUL + #define CMDQ_CREATE_CQ_PG_SIZE_SFT 2 + #define CMDQ_CREATE_CQ_PG_SIZE_PG_4K (0x0UL << 2) + #define CMDQ_CREATE_CQ_PG_SIZE_PG_8K (0x1UL << 2) + #define CMDQ_CREATE_CQ_PG_SIZE_PG_64K (0x2UL << 2) + #define CMDQ_CREATE_CQ_PG_SIZE_PG_2M (0x3UL << 2) + #define CMDQ_CREATE_CQ_PG_SIZE_PG_8M (0x4UL << 2) + #define CMDQ_CREATE_CQ_PG_SIZE_PG_1G (0x5UL << 2) + __le32 cq_fco_cnq_id; + #define CMDQ_CREATE_CQ_CNQ_ID_MASK 0xfffUL + #define CMDQ_CREATE_CQ_CNQ_ID_SFT 0 + #define CMDQ_CREATE_CQ_CQ_FCO_MASK 0xfffff000UL + #define CMDQ_CREATE_CQ_CQ_FCO_SFT 12 + __le32 dpi; + __le32 cq_size; + __le64 pbl; +}; + +/* Destroy CQ command (24 bytes) */ +struct cmdq_destroy_cq { + u8 opcode; + #define CMDQ_DESTROY_CQ_OPCODE_DESTROY_CQ 0xaUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 cq_cid; + __le32 unused_0; +}; + +/* Resize CQ command (40 bytes) */ +struct cmdq_resize_cq { + u8 opcode; + #define CMDQ_RESIZE_CQ_OPCODE_RESIZE_CQ 0xcUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 cq_cid; + __le32 new_cq_size_pg_size_lvl; + #define CMDQ_RESIZE_CQ_LVL_MASK 0x3UL + #define CMDQ_RESIZE_CQ_LVL_SFT 0 + #define CMDQ_RESIZE_CQ_LVL_LVL_0 0x0UL + #define CMDQ_RESIZE_CQ_LVL_LVL_1 0x1UL + #define CMDQ_RESIZE_CQ_LVL_LVL_2 0x2UL + #define CMDQ_RESIZE_CQ_PG_SIZE_MASK 0x1cUL + #define CMDQ_RESIZE_CQ_PG_SIZE_SFT 2 + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_4K (0x0UL << 2) + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_8K (0x1UL << 2) + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_64K (0x2UL << 2) + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_2M (0x3UL << 2) + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_8M (0x4UL << 2) + #define CMDQ_RESIZE_CQ_PG_SIZE_PG_1G (0x5UL << 2) + #define CMDQ_RESIZE_CQ_NEW_CQ_SIZE_MASK 0x1fffe0UL + #define CMDQ_RESIZE_CQ_NEW_CQ_SIZE_SFT 5 + __le64 new_pbl; + __le32 new_cq_fco; + __le32 unused_2; +}; + +/* Allocate MRW command (32 bytes) */ +struct cmdq_allocate_mrw { 
+ u8 opcode; + #define CMDQ_ALLOCATE_MRW_OPCODE_ALLOCATE_MRW 0xdUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 mrw_handle; + u8 mrw_flags; + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MASK 0xfUL + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_SFT 0 + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR 0x0UL + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR 0x1UL + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE1 0x2UL + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A 0x3UL + #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B 0x4UL + u8 access; + #define CMDQ_ALLOCATE_MRW_ACCESS_RESERVED_MASK 0x1fUL + #define CMDQ_ALLOCATE_MRW_ACCESS_RESERVED_SFT 0 + #define CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY 0x20UL + __le16 unused_1; + __le32 pd_id; +}; + +/* De-allocate key command (24 bytes) */ +struct cmdq_deallocate_key { + u8 opcode; + #define CMDQ_DEALLOCATE_KEY_OPCODE_DEALLOCATE_KEY 0xeUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + u8 mrw_flags; + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_MASK 0xfUL + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_SFT 0 + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_MR 0x0UL + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_PMR 0x1UL + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_MW_TYPE1 0x2UL + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_MW_TYPE2A 0x3UL + #define CMDQ_DEALLOCATE_KEY_MRW_FLAGS_MW_TYPE2B 0x4UL + u8 unused_1[3]; + __le32 key; +}; + +/* Register MR command (48 bytes) */ +struct cmdq_register_mr { + u8 opcode; + #define CMDQ_REGISTER_MR_OPCODE_REGISTER_MR 0xfUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + u8 log2_pg_size_lvl; + #define CMDQ_REGISTER_MR_LVL_MASK 0x3UL + #define CMDQ_REGISTER_MR_LVL_SFT 0 + #define CMDQ_REGISTER_MR_LVL_LVL_0 0x0UL + #define CMDQ_REGISTER_MR_LVL_LVL_1 0x1UL + #define CMDQ_REGISTER_MR_LVL_LVL_2 0x2UL + #define CMDQ_REGISTER_MR_LVL_LAST CMDQ_REGISTER_MR_LVL_LVL_2 + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK 0x7cUL + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT 2 + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_4K (0xcUL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_8K (0xdUL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_64K (0x10UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_256K (0x12UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1M (0x14UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_2M (0x15UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_4M (0x16UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1G (0x1eUL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_LAST \ + CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1G + #define CMDQ_REGISTER_MR_UNUSED1 0x80UL + u8 access; + #define CMDQ_REGISTER_MR_ACCESS_LOCAL_WRITE 0x1UL + #define CMDQ_REGISTER_MR_ACCESS_REMOTE_READ 0x2UL + #define CMDQ_REGISTER_MR_ACCESS_REMOTE_WRITE 0x4UL + #define CMDQ_REGISTER_MR_ACCESS_REMOTE_ATOMIC 0x8UL + #define CMDQ_REGISTER_MR_ACCESS_MW_BIND 0x10UL + #define CMDQ_REGISTER_MR_ACCESS_ZERO_BASED 0x20UL + __le16 log2_pbl_pg_size; + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK 0x1fUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT 0 + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4K 0xcUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_8K 0xdUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_64K 0x10UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_256K 0x12UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1M 0x14UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_2M 0x15UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4M 0x16UL + #define 
CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G 0x1eUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_LAST \ + CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G + #define CMDQ_REGISTER_MR_UNUSED11_MASK 0xffe0UL + #define CMDQ_REGISTER_MR_UNUSED11_SFT 5 + __le32 key; + __le64 pbl; + __le64 va; + __le64 mr_size; +}; + +/* Deregister MR command (24 bytes) */ +struct cmdq_deregister_mr { + u8 opcode; + #define CMDQ_DEREGISTER_MR_OPCODE_DEREGISTER_MR 0x10UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 lkey; + __le32 unused_0; +}; + +/* Add GID command (48 bytes) */ +struct cmdq_add_gid { + u8 opcode; + #define CMDQ_ADD_GID_OPCODE_ADD_GID 0x11UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __be32 gid[4]; + __be16 src_mac[3]; + __le16 vlan; + #define CMDQ_ADD_GID_VLAN_VLAN_ID_MASK 0xfffUL + #define CMDQ_ADD_GID_VLAN_VLAN_ID_SFT 0 + #define CMDQ_ADD_GID_VLAN_TPID_MASK 0x7000UL + #define CMDQ_ADD_GID_VLAN_TPID_SFT 12 + #define CMDQ_ADD_GID_VLAN_TPID_TPID_88A8 (0x0UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_8100 (0x1UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_9100 (0x2UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_9200 (0x3UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_9300 (0x4UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_CFG1 (0x5UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_CFG2 (0x6UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_TPID_CFG3 (0x7UL << 12) + #define CMDQ_ADD_GID_VLAN_TPID_LAST CMDQ_ADD_GID_VLAN_TPID_TPID_CFG3 + #define CMDQ_ADD_GID_VLAN_VLAN_EN 0x8000UL + __le16 ipid; + __le16 stats_ctx; + #define CMDQ_ADD_GID_STATS_CTX_STATS_CTX_ID_MASK 0x7fffUL + #define CMDQ_ADD_GID_STATS_CTX_STATS_CTX_ID_SFT 0 + #define CMDQ_ADD_GID_STATS_CTX_STATS_CTX_VALID 0x8000UL + __le32 unused_0; +}; + +/* Delete GID command (24 bytes) */ +struct cmdq_delete_gid { + u8 opcode; + #define CMDQ_DELETE_GID_OPCODE_DELETE_GID 0x12UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le16 gid_index; + __le16 unused_0; + __le32 unused_1; +}; + +/* Modify GID command (48 bytes) */ +struct cmdq_modify_gid { + u8 opcode; + #define CMDQ_MODIFY_GID_OPCODE_MODIFY_GID 0x17UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __be32 gid[4]; + __be16 src_mac[3]; + __le16 vlan; + #define CMDQ_MODIFY_GID_VLAN_VLAN_ID_MASK 0xfffUL + #define CMDQ_MODIFY_GID_VLAN_VLAN_ID_SFT 0 + #define CMDQ_MODIFY_GID_VLAN_TPID_MASK 0x7000UL + #define CMDQ_MODIFY_GID_VLAN_TPID_SFT 12 + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_88A8 (0x0UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_8100 (0x1UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_9100 (0x2UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_9200 (0x3UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_9300 (0x4UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_CFG1 (0x5UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_CFG2 (0x6UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_TPID_CFG3 (0x7UL << 12) + #define CMDQ_MODIFY_GID_VLAN_TPID_LAST \ + CMDQ_MODIFY_GID_VLAN_TPID_TPID_CFG3 + #define CMDQ_MODIFY_GID_VLAN_VLAN_EN 0x8000UL + __le16 ipid; + __le16 gid_index; + __le16 stats_ctx; + #define CMDQ_MODIFY_GID_STATS_CTX_STATS_CTX_ID_MASK 0x7fffUL + #define CMDQ_MODIFY_GID_STATS_CTX_STATS_CTX_ID_SFT 0 + #define CMDQ_MODIFY_GID_STATS_CTX_STATS_CTX_VALID 0x8000UL + __le16 unused_0; +}; + +/* Query GID command (24 bytes) */ +struct cmdq_query_gid { + u8 opcode; + #define 
CMDQ_QUERY_GID_OPCODE_QUERY_GID 0x18UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le16 gid_index; + __le16 unused_0; + __le32 unused_1; +}; + +/* Create QP1 command (80 bytes) */ +struct cmdq_create_qp1 { + u8 opcode; + #define CMDQ_CREATE_QP1_OPCODE_CREATE_QP1 0x13UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 qp_handle; + __le32 qp_flags; + #define CMDQ_CREATE_QP1_QP_FLAGS_SRQ_USED 0x1UL + #define CMDQ_CREATE_QP1_QP_FLAGS_FORCE_COMPLETION 0x2UL + #define CMDQ_CREATE_QP1_QP_FLAGS_RESERVED_LKEY_ENABLE 0x4UL + u8 type; + #define CMDQ_CREATE_QP1_TYPE_GSI 0x1UL + u8 sq_pg_size_sq_lvl; + #define CMDQ_CREATE_QP1_SQ_LVL_MASK 0xfUL + #define CMDQ_CREATE_QP1_SQ_LVL_SFT 0 + #define CMDQ_CREATE_QP1_SQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_QP1_SQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_QP1_SQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_SFT 4 + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 rq_pg_size_rq_lvl; + #define CMDQ_CREATE_QP1_RQ_LVL_MASK 0xfUL + #define CMDQ_CREATE_QP1_RQ_LVL_SFT 0 + #define CMDQ_CREATE_QP1_RQ_LVL_LVL_0 0x0UL + #define CMDQ_CREATE_QP1_RQ_LVL_LVL_1 0x1UL + #define CMDQ_CREATE_QP1_RQ_LVL_LVL_2 0x2UL + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_SFT 4 + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 unused_0; + __le32 dpi; + __le32 sq_size; + __le32 rq_size; + __le16 sq_fwo_sq_sge; + #define CMDQ_CREATE_QP1_SQ_SGE_MASK 0xfUL + #define CMDQ_CREATE_QP1_SQ_SGE_SFT 0 + #define CMDQ_CREATE_QP1_SQ_FWO_MASK 0xfff0UL + #define CMDQ_CREATE_QP1_SQ_FWO_SFT 4 + __le16 rq_fwo_rq_sge; + #define CMDQ_CREATE_QP1_RQ_SGE_MASK 0xfUL + #define CMDQ_CREATE_QP1_RQ_SGE_SFT 0 + #define CMDQ_CREATE_QP1_RQ_FWO_MASK 0xfff0UL + #define CMDQ_CREATE_QP1_RQ_FWO_SFT 4 + __le32 scq_cid; + __le32 rcq_cid; + __le32 srq_cid; + __le32 pd_id; + __le64 sq_pbl; + __le64 rq_pbl; +}; + +/* Destroy QP1 command (24 bytes) */ +struct cmdq_destroy_qp1 { + u8 opcode; + #define CMDQ_DESTROY_QP1_OPCODE_DESTROY_QP1 0x14UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 qp1_cid; + __le32 unused_0; +}; + +/* Create AH command (64 bytes) */ +struct cmdq_create_ah { + u8 opcode; + #define CMDQ_CREATE_AH_OPCODE_CREATE_AH 0x15UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le64 ah_handle; + __le32 dgid[4]; + u8 type; + #define CMDQ_CREATE_AH_TYPE_V1 0x0UL + #define CMDQ_CREATE_AH_TYPE_V2IPV4 0x2UL + #define CMDQ_CREATE_AH_TYPE_V2IPV6 0x3UL + u8 hop_limit; + __le16 sgid_index; + __le32 dest_vlan_id_flow_label; + #define CMDQ_CREATE_AH_FLOW_LABEL_MASK 0xfffffUL + #define CMDQ_CREATE_AH_FLOW_LABEL_SFT 0 + #define CMDQ_CREATE_AH_DEST_VLAN_ID_MASK 0xfff00000UL + #define CMDQ_CREATE_AH_DEST_VLAN_ID_SFT 20 + __le32 pd_id; + __le32 
unused_0; + __le16 dest_mac[3]; + u8 traffic_class; + u8 unused_1; +}; + +/* Destroy AH command (24 bytes) */ +struct cmdq_destroy_ah { + u8 opcode; + #define CMDQ_DESTROY_AH_OPCODE_DESTROY_AH 0x16UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 ah_cid; + __le32 unused_0; +}; + +/* Initialize Firmware command (112 bytes) */ +struct cmdq_initialize_fw { + u8 opcode; + #define CMDQ_INITIALIZE_FW_OPCODE_INITIALIZE_FW 0x80UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + u8 qpc_pg_size_qpc_lvl; + #define CMDQ_INITIALIZE_FW_QPC_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_QPC_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_QPC_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_QPC_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_QPC_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_1G (0x5UL << 4) + u8 mrw_pg_size_mrw_lvl; + #define CMDQ_INITIALIZE_FW_MRW_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_MRW_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_MRW_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_MRW_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_MRW_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_MRW_PG_SIZE_PG_1G (0x5UL << 4) + u8 srq_pg_size_srq_lvl; + #define CMDQ_INITIALIZE_FW_SRQ_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_SRQ_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_SRQ_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_SRQ_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_SRQ_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_SRQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 cq_pg_size_cq_lvl; + #define CMDQ_INITIALIZE_FW_CQ_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_CQ_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_CQ_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_CQ_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_CQ_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_CQ_PG_SIZE_PG_1G (0x5UL << 4) + u8 tqm_pg_size_tqm_lvl; + #define CMDQ_INITIALIZE_FW_TQM_LVL_MASK 0xfUL + 
#define CMDQ_INITIALIZE_FW_TQM_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_TQM_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_TQM_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_TQM_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_TQM_PG_SIZE_PG_1G (0x5UL << 4) + u8 tim_pg_size_tim_lvl; + #define CMDQ_INITIALIZE_FW_TIM_LVL_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_TIM_LVL_SFT 0 + #define CMDQ_INITIALIZE_FW_TIM_LVL_LVL_0 0x0UL + #define CMDQ_INITIALIZE_FW_TIM_LVL_LVL_1 0x1UL + #define CMDQ_INITIALIZE_FW_TIM_LVL_LVL_2 0x2UL + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_MASK 0xf0UL + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_SFT 4 + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_4K (0x0UL << 4) + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_8K (0x1UL << 4) + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_64K (0x2UL << 4) + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_2M (0x3UL << 4) + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_8M (0x4UL << 4) + #define CMDQ_INITIALIZE_FW_TIM_PG_SIZE_PG_1G (0x5UL << 4) + /* This value is (log-base-2-of-DBR-page-size - 12). + * 0 for 4KB. HW supported values are enumerated below. + */ + __le16 log2_dbr_pg_size; + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_MASK 0xfUL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_SFT 0 + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_4K 0x0UL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_8K 0x1UL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_16K 0x2UL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_32K 0x3UL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_64K 0x4UL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_128K 0x5UL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_256K 0x6UL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_512K 0x7UL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_1M 0x8UL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_2M 0x9UL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_4M 0xaUL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_8M 0xbUL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_16M 0xcUL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_32M 0xdUL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_64M 0xeUL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_128M 0xfUL + #define CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_LAST \ + CMDQ_INITIALIZE_FW_LOG2_DBR_PG_SIZE_PG_128M + __le64 qpc_page_dir; + __le64 mrw_page_dir; + __le64 srq_page_dir; + __le64 cq_page_dir; + __le64 tqm_page_dir; + __le64 tim_page_dir; + __le32 number_of_qp; + __le32 number_of_mrw; + __le32 number_of_srq; + __le32 number_of_cq; + __le32 max_qp_per_vf; + __le32 max_mrw_per_vf; + __le32 max_srq_per_vf; + __le32 max_cq_per_vf; + __le32 max_gid_per_vf; + __le32 stat_ctx_id; +}; + +/* De-initialize Firmware command (16 bytes) */ +struct cmdq_deinitialize_fw { + u8 opcode; + #define CMDQ_DEINITIALIZE_FW_OPCODE_DEINITIALIZE_FW 0x81UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Stop function command (16 bytes) */ +struct cmdq_stop_func { + u8 opcode; + #define CMDQ_STOP_FUNC_OPCODE_STOP_FUNC 0x82UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Query 
function command (16 bytes) */ +struct cmdq_query_func { + u8 opcode; + #define CMDQ_QUERY_FUNC_OPCODE_QUERY_FUNC 0x83UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Set function resources command (16 bytes) */ +struct cmdq_set_func_resources { + u8 opcode; + #define CMDQ_SET_FUNC_RESOURCES_OPCODE_SET_FUNC_RESOURCES 0x84UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 number_of_qp; + __le32 number_of_mrw; + __le32 number_of_srq; + __le32 number_of_cq; + __le32 max_qp_per_vf; + __le32 max_mrw_per_vf; + __le32 max_srq_per_vf; + __le32 max_cq_per_vf; + __le32 max_gid_per_vf; + __le32 stat_ctx_id; +}; + +/* Read hardware resource context command (24 bytes) */ +struct cmdq_read_context { + u8 opcode; + #define CMDQ_READ_CONTEXT_OPCODE_READ_CONTEXT 0x85UL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le32 type_xid; + #define CMDQ_READ_CONTEXT_XID_MASK 0xffffffUL + #define CMDQ_READ_CONTEXT_XID_SFT 0 + #define CMDQ_READ_CONTEXT_TYPE_MASK 0xff000000UL + #define CMDQ_READ_CONTEXT_TYPE_SFT 24 + #define CMDQ_READ_CONTEXT_TYPE_QPC (0x0UL << 24) + #define CMDQ_READ_CONTEXT_TYPE_CQ (0x1UL << 24) + #define CMDQ_READ_CONTEXT_TYPE_MRW (0x2UL << 24) + #define CMDQ_READ_CONTEXT_TYPE_SRQ (0x3UL << 24) + __le32 unused_0; +}; + +/* Map TC to COS. Can only be issued from a PF (24 bytes) */ +struct cmdq_map_tc_to_cos { + u8 opcode; + #define CMDQ_MAP_TC_TO_COS_OPCODE_MAP_TC_TO_COS 0x8aUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + __le16 cos0; + #define CMDQ_MAP_TC_TO_COS_COS0_NO_CHANGE 0xffffUL + __le16 cos1; + #define CMDQ_MAP_TC_TO_COS_COS1_DISABLE 0x8000UL + #define CMDQ_MAP_TC_TO_COS_COS1_NO_CHANGE 0xffffUL + __le32 unused_0; +}; + +/* Query version command (16 bytes) */ +struct cmdq_query_version { + u8 opcode; + #define CMDQ_QUERY_VERSION_OPCODE_QUERY_VERSION 0x8bUL + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Command-Response Event Queue (CREQ) Structures */ +/* Base CREQ Record (16 bytes) */ +struct creq_base { + u8 type; + #define CREQ_BASE_TYPE_MASK 0x3fUL + #define CREQ_BASE_TYPE_SFT 0 + #define CREQ_BASE_TYPE_QP_EVENT 0x38UL + #define CREQ_BASE_TYPE_FUNC_EVENT 0x3aUL + #define CREQ_BASE_RESERVED2_MASK 0xc0UL + #define CREQ_BASE_RESERVED2_SFT 6 + u8 reserved56[7]; + u8 v; + #define CREQ_BASE_V 0x1UL + #define CREQ_BASE_RESERVED7_MASK 0xfeUL + #define CREQ_BASE_RESERVED7_SFT 1 + u8 event; + __le16 reserved48[3]; +}; + +/* RoCE Function Async Event Notification (16 bytes) */ +struct creq_func_event { + u8 type; + #define CREQ_FUNC_EVENT_TYPE_MASK 0x3fUL + #define CREQ_FUNC_EVENT_TYPE_SFT 0 + #define CREQ_FUNC_EVENT_TYPE_FUNC_EVENT 0x3aUL + #define CREQ_FUNC_EVENT_RESERVED2_MASK 0xc0UL + #define CREQ_FUNC_EVENT_RESERVED2_SFT 6 + u8 reserved56[7]; + u8 v; + #define CREQ_FUNC_EVENT_V 0x1UL + #define CREQ_FUNC_EVENT_RESERVED7_MASK 0xfeUL + #define CREQ_FUNC_EVENT_RESERVED7_SFT 1 + u8 event; + #define CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR 0x1UL + #define CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR 0x2UL + #define CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR 0x3UL + #define CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR 0x4UL + #define CREQ_FUNC_EVENT_EVENT_CQ_ERROR 0x5UL + #define CREQ_FUNC_EVENT_EVENT_TQM_ERROR 0x6UL + #define CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR 0x7UL + #define CREQ_FUNC_EVENT_EVENT_CFCS_ERROR 0x8UL + #define 
CREQ_FUNC_EVENT_EVENT_CFCC_ERROR 0x9UL + #define CREQ_FUNC_EVENT_EVENT_CFCM_ERROR 0xaUL + #define CREQ_FUNC_EVENT_EVENT_TIM_ERROR 0xbUL + #define CREQ_FUNC_EVENT_EVENT_VF_COMM_REQUEST 0x80UL + #define CREQ_FUNC_EVENT_EVENT_RESOURCE_EXHAUSTED 0x81UL + __le16 reserved48[3]; +}; + +/* RoCE Slowpath Command Completion (16 bytes) */ +struct creq_qp_event { + u8 type; + #define CREQ_QP_EVENT_TYPE_MASK 0x3fUL + #define CREQ_QP_EVENT_TYPE_SFT 0 + #define CREQ_QP_EVENT_TYPE_QP_EVENT 0x38UL + #define CREQ_QP_EVENT_RESERVED2_MASK 0xc0UL + #define CREQ_QP_EVENT_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_QP_EVENT_V 0x1UL + #define CREQ_QP_EVENT_RESERVED7_MASK 0xfeUL + #define CREQ_QP_EVENT_RESERVED7_SFT 1 + u8 event; + #define CREQ_QP_EVENT_EVENT_CREATE_QP 0x1UL + #define CREQ_QP_EVENT_EVENT_DESTROY_QP 0x2UL + #define CREQ_QP_EVENT_EVENT_MODIFY_QP 0x3UL + #define CREQ_QP_EVENT_EVENT_QUERY_QP 0x4UL + #define CREQ_QP_EVENT_EVENT_CREATE_SRQ 0x5UL + #define CREQ_QP_EVENT_EVENT_DESTROY_SRQ 0x6UL + #define CREQ_QP_EVENT_EVENT_QUERY_SRQ 0x8UL + #define CREQ_QP_EVENT_EVENT_CREATE_CQ 0x9UL + #define CREQ_QP_EVENT_EVENT_DESTROY_CQ 0xaUL + #define CREQ_QP_EVENT_EVENT_RESIZE_CQ 0xcUL + #define CREQ_QP_EVENT_EVENT_ALLOCATE_MRW 0xdUL + #define CREQ_QP_EVENT_EVENT_DEALLOCATE_KEY 0xeUL + #define CREQ_QP_EVENT_EVENT_REGISTER_MR 0xfUL + #define CREQ_QP_EVENT_EVENT_DEREGISTER_MR 0x10UL + #define CREQ_QP_EVENT_EVENT_ADD_GID 0x11UL + #define CREQ_QP_EVENT_EVENT_DELETE_GID 0x12UL + #define CREQ_QP_EVENT_EVENT_MODIFY_GID 0x17UL + #define CREQ_QP_EVENT_EVENT_QUERY_GID 0x18UL + #define CREQ_QP_EVENT_EVENT_CREATE_QP1 0x13UL + #define CREQ_QP_EVENT_EVENT_DESTROY_QP1 0x14UL + #define CREQ_QP_EVENT_EVENT_CREATE_AH 0x15UL + #define CREQ_QP_EVENT_EVENT_DESTROY_AH 0x16UL + #define CREQ_QP_EVENT_EVENT_INITIALIZE_FW 0x80UL + #define CREQ_QP_EVENT_EVENT_DEINITIALIZE_FW 0x81UL + #define CREQ_QP_EVENT_EVENT_STOP_FUNC 0x82UL + #define CREQ_QP_EVENT_EVENT_QUERY_FUNC 0x83UL + #define CREQ_QP_EVENT_EVENT_SET_FUNC_RESOURCES 0x84UL + #define CREQ_QP_EVENT_EVENT_MAP_TC_TO_COS 0x8aUL + #define CREQ_QP_EVENT_EVENT_QUERY_VERSION 0x8bUL + #define CREQ_QP_EVENT_EVENT_MODIFY_CC 0x8cUL + #define CREQ_QP_EVENT_EVENT_QUERY_CC 0x8dUL + #define CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION 0xc0UL + __le16 reserved48[3]; +}; + +/* Create QP command response (16 bytes) */ +struct creq_create_qp_resp { + u8 type; + #define CREQ_CREATE_QP_RESP_TYPE_MASK 0x3fUL + #define CREQ_CREATE_QP_RESP_TYPE_SFT 0 + #define CREQ_CREATE_QP_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_CREATE_QP_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_CREATE_QP_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_CREATE_QP_RESP_V 0x1UL + #define CREQ_CREATE_QP_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_CREATE_QP_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_CREATE_QP_RESP_EVENT_CREATE_QP 0x1UL + __le16 reserved48[3]; +}; + +/* Destroy QP command response (16 bytes) */ +struct creq_destroy_qp_resp { + u8 type; + #define CREQ_DESTROY_QP_RESP_TYPE_MASK 0x3fUL + #define CREQ_DESTROY_QP_RESP_TYPE_SFT 0 + #define CREQ_DESTROY_QP_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DESTROY_QP_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DESTROY_QP_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DESTROY_QP_RESP_V 0x1UL + #define CREQ_DESTROY_QP_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DESTROY_QP_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DESTROY_QP_RESP_EVENT_DESTROY_QP 0x2UL + __le16 
reserved48[3]; +}; + +/* Modify QP command response (16 bytes) */ +struct creq_modify_qp_resp { + u8 type; + #define CREQ_MODIFY_QP_RESP_TYPE_MASK 0x3fUL + #define CREQ_MODIFY_QP_RESP_TYPE_SFT 0 + #define CREQ_MODIFY_QP_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_MODIFY_QP_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_MODIFY_QP_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_MODIFY_QP_RESP_V 0x1UL + #define CREQ_MODIFY_QP_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_MODIFY_QP_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_MODIFY_QP_RESP_EVENT_MODIFY_QP 0x3UL + __le16 reserved48[3]; +}; + +/* cmdq_query_roce_stats (size:128b/16B) */ +struct cmdq_query_roce_stats { + u8 opcode; + #define CMDQ_QUERY_ROCE_STATS_OPCODE_QUERY_ROCE_STATS 0x8eUL + #define CMDQ_QUERY_ROCE_STATS_OPCODE_LAST \ + CMDQ_QUERY_ROCE_STATS_OPCODE_QUERY_ROCE_STATS + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + +/* Query QP command response (16 bytes) */ +struct creq_query_qp_resp { + u8 type; + #define CREQ_QUERY_QP_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_QP_RESP_TYPE_SFT 0 + #define CREQ_QUERY_QP_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_QP_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_QP_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_QP_RESP_V 0x1UL + #define CREQ_QUERY_QP_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_QP_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_QP_RESP_EVENT_QUERY_QP 0x4UL + __le16 reserved48[3]; +}; + +/* Query QP command response side buffer structure (104 bytes) */ +struct creq_query_qp_resp_sb { + u8 opcode; + #define CREQ_QUERY_QP_RESP_SB_OPCODE_QUERY_QP 0x4UL + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 reserved8; + __le32 xid; + u8 en_sqd_async_notify_state; + #define CREQ_QUERY_QP_RESP_SB_STATE_MASK 0xfUL + #define CREQ_QUERY_QP_RESP_SB_STATE_SFT 0 + #define CREQ_QUERY_QP_RESP_SB_STATE_RESET 0x0UL + #define CREQ_QUERY_QP_RESP_SB_STATE_INIT 0x1UL + #define CREQ_QUERY_QP_RESP_SB_STATE_RTR 0x2UL + #define CREQ_QUERY_QP_RESP_SB_STATE_RTS 0x3UL + #define CREQ_QUERY_QP_RESP_SB_STATE_SQD 0x4UL + #define CREQ_QUERY_QP_RESP_SB_STATE_SQE 0x5UL + #define CREQ_QUERY_QP_RESP_SB_STATE_ERR 0x6UL + #define CREQ_QUERY_QP_RESP_SB_EN_SQD_ASYNC_NOTIFY 0x10UL + u8 access; + #define CREQ_QUERY_QP_RESP_SB_ACCESS_LOCAL_WRITE 0x1UL + #define CREQ_QUERY_QP_RESP_SB_ACCESS_REMOTE_WRITE 0x2UL + #define CREQ_QUERY_QP_RESP_SB_ACCESS_REMOTE_READ 0x4UL + #define CREQ_QUERY_QP_RESP_SB_ACCESS_REMOTE_ATOMIC 0x8UL + __le16 pkey; + __le32 qkey; + __le32 reserved32; + __le32 dgid[4]; + __le32 flow_label; + __le16 sgid_index; + u8 hop_limit; + u8 traffic_class; + __le16 dest_mac[3]; + __le16 path_mtu_dest_vlan_id; + #define CREQ_QUERY_QP_RESP_SB_DEST_VLAN_ID_MASK 0xfffUL + #define CREQ_QUERY_QP_RESP_SB_DEST_VLAN_ID_SFT 0 + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MASK 0xf000UL + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_SFT 12 + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_256 (0x0UL << 12) + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_512 (0x1UL << 12) + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_1024 (0x2UL << 12) + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_2048 (0x3UL << 12) + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_4096 (0x4UL << 12) + #define CREQ_QUERY_QP_RESP_SB_PATH_MTU_MTU_8192 (0x5UL << 12) + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u8 min_rnr_timer; + __le32 rq_psn; + __le32 sq_psn; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + u8 
tos_dscp_tos_ecn; + #define CREQ_QUERY_QP_RESP_SB_TOS_ECN_MASK 0x3UL + #define CREQ_QUERY_QP_RESP_SB_TOS_ECN_SFT 0 + #define CREQ_QUERY_QP_RESP_SB_TOS_DSCP_MASK 0xfcUL + #define CREQ_QUERY_QP_RESP_SB_TOS_DSCP_SFT 2 + u8 enable_cc; + #define CREQ_QUERY_QP_RESP_SB_ENABLE_CC 0x1UL + #define CREQ_QUERY_QP_RESP_SB_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_QP_RESP_SB_RESERVED7_SFT 1 + __le32 sq_size; + __le32 rq_size; + __le16 sq_sge; + __le16 rq_sge; + __le32 max_inline_data; + __le32 dest_qp_id; + __le32 unused_1; + __le16 src_mac[3]; + __le16 vlan_pcp_vlan_dei_vlan_id; + #define CREQ_QUERY_QP_RESP_SB_VLAN_ID_MASK 0xfffUL + #define CREQ_QUERY_QP_RESP_SB_VLAN_ID_SFT 0 + #define CREQ_QUERY_QP_RESP_SB_VLAN_DEI 0x1000UL + #define CREQ_QUERY_QP_RESP_SB_VLAN_PCP_MASK 0xe000UL + #define CREQ_QUERY_QP_RESP_SB_VLAN_PCP_SFT 13 +}; + +/* Create SRQ command response (16 bytes) */ +struct creq_create_srq_resp { + u8 type; + #define CREQ_CREATE_SRQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_CREATE_SRQ_RESP_TYPE_SFT 0 + #define CREQ_CREATE_SRQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_CREATE_SRQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_CREATE_SRQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_CREATE_SRQ_RESP_V 0x1UL + #define CREQ_CREATE_SRQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_CREATE_SRQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_CREATE_SRQ_RESP_EVENT_CREATE_SRQ 0x5UL + __le16 reserved48[3]; +}; + +/* Destroy SRQ command response (16 bytes) */ +struct creq_destroy_srq_resp { + u8 type; + #define CREQ_DESTROY_SRQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_DESTROY_SRQ_RESP_TYPE_SFT 0 + #define CREQ_DESTROY_SRQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DESTROY_SRQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DESTROY_SRQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DESTROY_SRQ_RESP_V 0x1UL + #define CREQ_DESTROY_SRQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DESTROY_SRQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DESTROY_SRQ_RESP_EVENT_DESTROY_SRQ 0x6UL + __le16 enable_for_arm[3]; + #define CREQ_DESTROY_SRQ_RESP_ENABLE_FOR_ARM_MASK 0x30000UL + #define CREQ_DESTROY_SRQ_RESP_ENABLE_FOR_ARM_SFT 16 + #define CREQ_DESTROY_SRQ_RESP_RESERVED46_MASK 0xfffc0000UL + #define CREQ_DESTROY_SRQ_RESP_RESERVED46_SFT 18 +}; + +/* Query SRQ command response (16 bytes) */ +struct creq_query_srq_resp { + u8 type; + #define CREQ_QUERY_SRQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_SRQ_RESP_TYPE_SFT 0 + #define CREQ_QUERY_SRQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_SRQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_SRQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_SRQ_RESP_V 0x1UL + #define CREQ_QUERY_SRQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_SRQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_SRQ_RESP_EVENT_QUERY_SRQ 0x8UL + __le16 reserved48[3]; +}; + +/* Query SRQ command response side buffer structure (24 bytes) */ +struct creq_query_srq_resp_sb { + u8 opcode; + #define CREQ_QUERY_SRQ_RESP_SB_OPCODE_QUERY_SRQ 0x8UL + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 reserved8; + __le32 xid; + __le16 srq_limit; + __le16 reserved16; + __le32 data[4]; +}; + +/* Create CQ command Response (16 bytes) */ +struct creq_create_cq_resp { + u8 type; + #define CREQ_CREATE_CQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_CREATE_CQ_RESP_TYPE_SFT 0 + #define CREQ_CREATE_CQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_CREATE_CQ_RESP_RESERVED2_MASK 0xc0UL + #define 
CREQ_CREATE_CQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_CREATE_CQ_RESP_V 0x1UL + #define CREQ_CREATE_CQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_CREATE_CQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_CREATE_CQ_RESP_EVENT_CREATE_CQ 0x9UL + __le16 reserved48[3]; +}; + +/* Destroy CQ command response (16 bytes) */ +struct creq_destroy_cq_resp { + u8 type; + #define CREQ_DESTROY_CQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_DESTROY_CQ_RESP_TYPE_SFT 0 + #define CREQ_DESTROY_CQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DESTROY_CQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DESTROY_CQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DESTROY_CQ_RESP_V 0x1UL + #define CREQ_DESTROY_CQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DESTROY_CQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DESTROY_CQ_RESP_EVENT_DESTROY_CQ 0xaUL + __le16 cq_arm_lvl; + #define CREQ_DESTROY_CQ_RESP_CQ_ARM_LVL_MASK 0x3UL + #define CREQ_DESTROY_CQ_RESP_CQ_ARM_LVL_SFT 0 + #define CREQ_DESTROY_CQ_RESP_RESERVED14_MASK 0xfffcUL + #define CREQ_DESTROY_CQ_RESP_RESERVED14_SFT 2 + __le16 total_cnq_events; + __le16 reserved16; +}; + +/* Resize CQ command response (16 bytes) */ +struct creq_resize_cq_resp { + u8 type; + #define CREQ_RESIZE_CQ_RESP_TYPE_MASK 0x3fUL + #define CREQ_RESIZE_CQ_RESP_TYPE_SFT 0 + #define CREQ_RESIZE_CQ_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_RESIZE_CQ_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_RESIZE_CQ_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_RESIZE_CQ_RESP_V 0x1UL + #define CREQ_RESIZE_CQ_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_RESIZE_CQ_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_RESIZE_CQ_RESP_EVENT_RESIZE_CQ 0xcUL + __le16 reserved48[3]; +}; + +/* Allocate MRW command response (16 bytes) */ +struct creq_allocate_mrw_resp { + u8 type; + #define CREQ_ALLOCATE_MRW_RESP_TYPE_MASK 0x3fUL + #define CREQ_ALLOCATE_MRW_RESP_TYPE_SFT 0 + #define CREQ_ALLOCATE_MRW_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_ALLOCATE_MRW_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_ALLOCATE_MRW_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_ALLOCATE_MRW_RESP_V 0x1UL + #define CREQ_ALLOCATE_MRW_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_ALLOCATE_MRW_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_ALLOCATE_MRW_RESP_EVENT_ALLOCATE_MRW 0xdUL + __le16 reserved48[3]; +}; + +/* De-allocate key command response (16 bytes) */ +struct creq_deallocate_key_resp { + u8 type; + #define CREQ_DEALLOCATE_KEY_RESP_TYPE_MASK 0x3fUL + #define CREQ_DEALLOCATE_KEY_RESP_TYPE_SFT 0 + #define CREQ_DEALLOCATE_KEY_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DEALLOCATE_KEY_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DEALLOCATE_KEY_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DEALLOCATE_KEY_RESP_V 0x1UL + #define CREQ_DEALLOCATE_KEY_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DEALLOCATE_KEY_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DEALLOCATE_KEY_RESP_EVENT_DEALLOCATE_KEY 0xeUL + __le16 reserved16; + __le32 bound_window_info; +}; + +/* Register MR command response (16 bytes) */ +struct creq_register_mr_resp { + u8 type; + #define CREQ_REGISTER_MR_RESP_TYPE_MASK 0x3fUL + #define CREQ_REGISTER_MR_RESP_TYPE_SFT 0 + #define CREQ_REGISTER_MR_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_REGISTER_MR_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_REGISTER_MR_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_REGISTER_MR_RESP_V 0x1UL + 
#define CREQ_REGISTER_MR_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_REGISTER_MR_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_REGISTER_MR_RESP_EVENT_REGISTER_MR 0xfUL + __le16 reserved48[3]; +}; + +/* Deregister MR command response (16 bytes) */ +struct creq_deregister_mr_resp { + u8 type; + #define CREQ_DEREGISTER_MR_RESP_TYPE_MASK 0x3fUL + #define CREQ_DEREGISTER_MR_RESP_TYPE_SFT 0 + #define CREQ_DEREGISTER_MR_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DEREGISTER_MR_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DEREGISTER_MR_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DEREGISTER_MR_RESP_V 0x1UL + #define CREQ_DEREGISTER_MR_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DEREGISTER_MR_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DEREGISTER_MR_RESP_EVENT_DEREGISTER_MR 0x10UL + __le16 reserved16; + __le32 bound_windows; +}; + +/* Add GID command response (16 bytes) */ +struct creq_add_gid_resp { + u8 type; + #define CREQ_ADD_GID_RESP_TYPE_MASK 0x3fUL + #define CREQ_ADD_GID_RESP_TYPE_SFT 0 + #define CREQ_ADD_GID_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_ADD_GID_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_ADD_GID_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_ADD_GID_RESP_V 0x1UL + #define CREQ_ADD_GID_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_ADD_GID_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_ADD_GID_RESP_EVENT_ADD_GID 0x11UL + __le16 reserved48[3]; +}; + +/* Delete GID command response (16 bytes) */ +struct creq_delete_gid_resp { + u8 type; + #define CREQ_DELETE_GID_RESP_TYPE_MASK 0x3fUL + #define CREQ_DELETE_GID_RESP_TYPE_SFT 0 + #define CREQ_DELETE_GID_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DELETE_GID_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DELETE_GID_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DELETE_GID_RESP_V 0x1UL + #define CREQ_DELETE_GID_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DELETE_GID_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DELETE_GID_RESP_EVENT_DELETE_GID 0x12UL + __le16 reserved48[3]; +}; + +/* Modify GID command response (16 bytes) */ +struct creq_modify_gid_resp { + u8 type; + #define CREQ_MODIFY_GID_RESP_TYPE_MASK 0x3fUL + #define CREQ_MODIFY_GID_RESP_TYPE_SFT 0 + #define CREQ_MODIFY_GID_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_MODIFY_GID_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_MODIFY_GID_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_MODIFY_GID_RESP_V 0x1UL + #define CREQ_MODIFY_GID_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_MODIFY_GID_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_MODIFY_GID_RESP_EVENT_ADD_GID 0x11UL + __le16 reserved48[3]; +}; + +/* Query GID command response (16 bytes) */ +struct creq_query_gid_resp { + u8 type; + #define CREQ_QUERY_GID_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_GID_RESP_TYPE_SFT 0 + #define CREQ_QUERY_GID_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_GID_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_GID_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_GID_RESP_V 0x1UL + #define CREQ_QUERY_GID_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_GID_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_GID_RESP_EVENT_QUERY_GID 0x18UL + __le16 reserved48[3]; +}; + +/* Query GID command response side buffer structure (40 bytes) */ +struct creq_query_gid_resp_sb { + u8 opcode; + #define CREQ_QUERY_GID_RESP_SB_OPCODE_QUERY_GID 0x18UL + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 reserved8; + __le32 
gid[4]; + __le16 src_mac[3]; + __le16 vlan; + #define CREQ_QUERY_GID_RESP_SB_VLAN_VLAN_ID_MASK 0xfffUL + #define CREQ_QUERY_GID_RESP_SB_VLAN_VLAN_ID_SFT 0 + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_MASK 0x7000UL + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_SFT 12 + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_88A8 (0x0UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_8100 (0x1UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_9100 (0x2UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_9200 (0x3UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_9300 (0x4UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_CFG1 (0x5UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_CFG2 (0x6UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_CFG3 (0x7UL << 12) + #define CREQ_QUERY_GID_RESP_SB_VLAN_TPID_LAST \ + CREQ_QUERY_GID_RESP_SB_VLAN_TPID_TPID_CFG3 + #define CREQ_QUERY_GID_RESP_SB_VLAN_VLAN_EN 0x8000UL + __le16 ipid; + __le16 gid_index; + __le32 unused_0; +}; + +/* Create QP1 command response (16 bytes) */ +struct creq_create_qp1_resp { + u8 type; + #define CREQ_CREATE_QP1_RESP_TYPE_MASK 0x3fUL + #define CREQ_CREATE_QP1_RESP_TYPE_SFT 0 + #define CREQ_CREATE_QP1_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_CREATE_QP1_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_CREATE_QP1_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_CREATE_QP1_RESP_V 0x1UL + #define CREQ_CREATE_QP1_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_CREATE_QP1_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_CREATE_QP1_RESP_EVENT_CREATE_QP1 0x13UL + __le16 reserved48[3]; +}; + +/* Destroy QP1 command response (16 bytes) */ +struct creq_destroy_qp1_resp { + u8 type; + #define CREQ_DESTROY_QP1_RESP_TYPE_MASK 0x3fUL + #define CREQ_DESTROY_QP1_RESP_TYPE_SFT 0 + #define CREQ_DESTROY_QP1_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DESTROY_QP1_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DESTROY_QP1_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DESTROY_QP1_RESP_V 0x1UL + #define CREQ_DESTROY_QP1_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DESTROY_QP1_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DESTROY_QP1_RESP_EVENT_DESTROY_QP1 0x14UL + __le16 reserved48[3]; +}; + +/* Create AH command response (16 bytes) */ +struct creq_create_ah_resp { + u8 type; + #define CREQ_CREATE_AH_RESP_TYPE_MASK 0x3fUL + #define CREQ_CREATE_AH_RESP_TYPE_SFT 0 + #define CREQ_CREATE_AH_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_CREATE_AH_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_CREATE_AH_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_CREATE_AH_RESP_V 0x1UL + #define CREQ_CREATE_AH_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_CREATE_AH_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_CREATE_AH_RESP_EVENT_CREATE_AH 0x15UL + __le16 reserved48[3]; +}; + +/* Destroy AH command response (16 bytes) */ +struct creq_destroy_ah_resp { + u8 type; + #define CREQ_DESTROY_AH_RESP_TYPE_MASK 0x3fUL + #define CREQ_DESTROY_AH_RESP_TYPE_SFT 0 + #define CREQ_DESTROY_AH_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DESTROY_AH_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DESTROY_AH_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 xid; + u8 v; + #define CREQ_DESTROY_AH_RESP_V 0x1UL + #define CREQ_DESTROY_AH_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DESTROY_AH_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DESTROY_AH_RESP_EVENT_DESTROY_AH 0x16UL + __le16 reserved48[3]; +}; + +/* Initialize Firmware command response (16 bytes) */ +struct 
creq_initialize_fw_resp { + u8 type; + #define CREQ_INITIALIZE_FW_RESP_TYPE_MASK 0x3fUL + #define CREQ_INITIALIZE_FW_RESP_TYPE_SFT 0 + #define CREQ_INITIALIZE_FW_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_INITIALIZE_FW_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_INITIALIZE_FW_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_INITIALIZE_FW_RESP_V 0x1UL + #define CREQ_INITIALIZE_FW_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_INITIALIZE_FW_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_INITIALIZE_FW_RESP_EVENT_INITIALIZE_FW 0x80UL + __le16 reserved48[3]; +}; + +/* De-initialize Firmware command response (16 bytes) */ +struct creq_deinitialize_fw_resp { + u8 type; + #define CREQ_DEINITIALIZE_FW_RESP_TYPE_MASK 0x3fUL + #define CREQ_DEINITIALIZE_FW_RESP_TYPE_SFT 0 + #define CREQ_DEINITIALIZE_FW_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_DEINITIALIZE_FW_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_DEINITIALIZE_FW_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_DEINITIALIZE_FW_RESP_V 0x1UL + #define CREQ_DEINITIALIZE_FW_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_DEINITIALIZE_FW_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_DEINITIALIZE_FW_RESP_EVENT_DEINITIALIZE_FW 0x81UL + __le16 reserved48[3]; +}; + +/* Stop function command response (16 bytes) */ +struct creq_stop_func_resp { + u8 type; + #define CREQ_STOP_FUNC_RESP_TYPE_MASK 0x3fUL + #define CREQ_STOP_FUNC_RESP_TYPE_SFT 0 + #define CREQ_STOP_FUNC_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_STOP_FUNC_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_STOP_FUNC_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_STOP_FUNC_RESP_V 0x1UL + #define CREQ_STOP_FUNC_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_STOP_FUNC_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_STOP_FUNC_RESP_EVENT_STOP_FUNC 0x82UL + __le16 reserved48[3]; +}; + +/* Query function command response (16 bytes) */ +struct creq_query_func_resp { + u8 type; + #define CREQ_QUERY_FUNC_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_FUNC_RESP_TYPE_SFT 0 + #define CREQ_QUERY_FUNC_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_FUNC_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_FUNC_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_FUNC_RESP_V 0x1UL + #define CREQ_QUERY_FUNC_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_FUNC_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_FUNC_RESP_EVENT_QUERY_FUNC 0x83UL + __le16 reserved48[3]; +}; + +/* Query function command response side buffer structure (88 bytes) */ +struct creq_query_func_resp_sb { + u8 opcode; + #define CREQ_QUERY_FUNC_RESP_SB_OPCODE_QUERY_FUNC 0x83UL + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 reserved8; + __le64 max_mr_size; + __le32 max_qp; + __le16 max_qp_wr; + __le16 dev_cap_flags; + #define CREQ_QUERY_FUNC_RESP_SB_DEV_CAP_FLAGS_RESIZE_QP 0x1UL + __le32 max_cq; + __le32 max_cqe; + __le32 max_pd; + u8 max_sge; + u8 max_srq_sge; + u8 max_qp_rd_atom; + u8 max_qp_init_rd_atom; + __le32 max_mr; + __le32 max_mw; + __le32 max_raw_eth_qp; + __le32 max_ah; + __le32 max_fmr; + __le32 max_srq_wr; + __le32 max_pkeys; + __le32 max_inline_data; + u8 max_map_per_fmr; + u8 l2_db_space_size; + __le16 max_srq; + __le32 max_gid; + __le32 tqm_alloc_reqs[12]; +}; + +/* Set resources command response (16 bytes) */ +struct creq_set_func_resources_resp { + u8 type; + #define CREQ_SET_FUNC_RESOURCES_RESP_TYPE_MASK 0x3fUL + #define CREQ_SET_FUNC_RESOURCES_RESP_TYPE_SFT 0 + 
#define CREQ_SET_FUNC_RESOURCES_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_SET_FUNC_RESOURCES_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_SET_FUNC_RESOURCES_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_SET_FUNC_RESOURCES_RESP_V 0x1UL + #define CREQ_SET_FUNC_RESOURCES_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_SET_FUNC_RESOURCES_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_SET_FUNC_RESOURCES_RESP_EVENT_SET_FUNC_RESOURCES 0x84UL + __le16 reserved48[3]; +}; + +/* Map TC to COS response (16 bytes) */ +struct creq_map_tc_to_cos_resp { + u8 type; + #define CREQ_MAP_TC_TO_COS_RESP_TYPE_MASK 0x3fUL + #define CREQ_MAP_TC_TO_COS_RESP_TYPE_SFT 0 + #define CREQ_MAP_TC_TO_COS_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_MAP_TC_TO_COS_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_MAP_TC_TO_COS_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_MAP_TC_TO_COS_RESP_V 0x1UL + #define CREQ_MAP_TC_TO_COS_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_MAP_TC_TO_COS_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_MAP_TC_TO_COS_RESP_EVENT_MAP_TC_TO_COS 0x8aUL + __le16 reserved48[3]; +}; + +/* Query version response (16 bytes) */ +struct creq_query_version_resp { + u8 type; + #define CREQ_QUERY_VERSION_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_VERSION_RESP_TYPE_SFT 0 + #define CREQ_QUERY_VERSION_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_VERSION_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_VERSION_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + u8 fw_maj; + u8 fw_minor; + u8 fw_bld; + u8 fw_rsvd; + u8 v; + #define CREQ_QUERY_VERSION_RESP_V 0x1UL + #define CREQ_QUERY_VERSION_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_VERSION_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_VERSION_RESP_EVENT_QUERY_VERSION 0x8bUL + __le16 reserved16; + u8 intf_maj; + u8 intf_minor; + u8 intf_bld; + u8 intf_rsvd; +}; + +/* Modify congestion control command response (16 bytes) */ +struct creq_modify_cc_resp { + u8 type; + #define CREQ_MODIFY_CC_RESP_TYPE_MASK 0x3fUL + #define CREQ_MODIFY_CC_RESP_TYPE_SFT 0 + #define CREQ_MODIFY_CC_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_MODIFY_CC_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_MODIFY_CC_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_MODIFY_CC_RESP_V 0x1UL + #define CREQ_MODIFY_CC_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_MODIFY_CC_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_MODIFY_CC_RESP_EVENT_MODIFY_CC 0x8cUL + __le16 reserved48[3]; +}; + +/* Query congestion control command response (16 bytes) */ +struct creq_query_cc_resp { + u8 type; + #define CREQ_QUERY_CC_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_CC_RESP_TYPE_SFT 0 + #define CREQ_QUERY_CC_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_CC_RESP_RESERVED2_MASK 0xc0UL + #define CREQ_QUERY_CC_RESP_RESERVED2_SFT 6 + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_CC_RESP_V 0x1UL + #define CREQ_QUERY_CC_RESP_RESERVED7_MASK 0xfeUL + #define CREQ_QUERY_CC_RESP_RESERVED7_SFT 1 + u8 event; + #define CREQ_QUERY_CC_RESP_EVENT_QUERY_CC 0x8dUL + __le16 reserved48[3]; +}; + +/* Query congestion control command response side buffer structure (32 bytes) */ +struct creq_query_cc_resp_sb { + u8 opcode; + #define CREQ_QUERY_CC_RESP_SB_OPCODE_QUERY_CC 0x8dUL + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 reserved8; + u8 enable_cc; + #define CREQ_QUERY_CC_RESP_SB_ENABLE_CC 0x1UL + u8 g; + #define CREQ_QUERY_CC_RESP_SB_G_MASK 0x7UL + #define 
CREQ_QUERY_CC_RESP_SB_G_SFT 0 + u8 num_phases_per_state; + __le16 init_cr; + u8 unused_2; + __le16 unused_3; + u8 unused_4; + __le16 init_tr; + u8 tos_dscp_tos_ecn; + #define CREQ_QUERY_CC_RESP_SB_TOS_ECN_MASK 0x3UL + #define CREQ_QUERY_CC_RESP_SB_TOS_ECN_SFT 0 + #define CREQ_QUERY_CC_RESP_SB_TOS_DSCP_MASK 0xfcUL + #define CREQ_QUERY_CC_RESP_SB_TOS_DSCP_SFT 2 + __le64 reserved64; + __le64 reserved64_1; +}; + +/* creq_query_roce_stats_resp (size:128b/16B) */ +struct creq_query_roce_stats_resp { + u8 type; + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_SFT 0 + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_LAST \ + CREQ_QUERY_ROCE_STATS_RESP_TYPE_QP_EVENT + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_ROCE_STATS_RESP_V 0x1UL + u8 event; + #define CREQ_QUERY_ROCE_STATS_RESP_EVENT_QUERY_ROCE_STATS 0x8eUL + #define CREQ_QUERY_ROCE_STATS_RESP_EVENT_LAST \ + CREQ_QUERY_ROCE_STATS_RESP_EVENT_QUERY_ROCE_STATS + u8 reserved48[6]; +}; + +/* creq_query_roce_stats_resp_sb (size:2624b/328B) */ +struct creq_query_roce_stats_resp_sb { + u8 opcode; + #define CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_QUERY_ROCE_STATS 0x8eUL + #define CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_LAST \ + CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_QUERY_ROCE_STATS + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 rsvd; + __le32 num_counters; + __le32 rsvd1; + __le64 to_retransmits; + __le64 seq_err_naks_rcvd; + __le64 max_retry_exceeded; + __le64 rnr_naks_rcvd; + __le64 missing_resp; + __le64 unrecoverable_err; + __le64 bad_resp_err; + __le64 local_qp_op_err; + __le64 local_protection_err; + __le64 mem_mgmt_op_err; + __le64 remote_invalid_req_err; + __le64 remote_access_err; + __le64 remote_op_err; + __le64 dup_req; + __le64 res_exceed_max; + __le64 res_length_mismatch; + __le64 res_exceeds_wqe; + __le64 res_opcode_err; + __le64 res_rx_invalid_rkey; + __le64 res_rx_domain_err; + __le64 res_rx_no_perm; + __le64 res_rx_range_err; + __le64 res_tx_invalid_rkey; + __le64 res_tx_domain_err; + __le64 res_tx_no_perm; + __le64 res_tx_range_err; + __le64 res_irrq_oflow; + __le64 res_unsup_opcode; + __le64 res_unaligned_atomic; + __le64 res_rem_inv_err; + __le64 res_mem_error; + __le64 res_srq_err; + __le64 res_cmp_err; + __le64 res_invalid_dup_rkey; + __le64 res_wqe_format_err; + __le64 res_cq_load_err; + __le64 res_srq_load_err; + __le64 res_tx_pci_err; + __le64 res_rx_pci_err; +}; + +/* QP error notification event (16 bytes) */ +struct creq_qp_error_notification { + u8 type; + #define CREQ_QP_ERROR_NOTIFICATION_TYPE_MASK 0x3fUL + #define CREQ_QP_ERROR_NOTIFICATION_TYPE_SFT 0 + #define CREQ_QP_ERROR_NOTIFICATION_TYPE_QP_EVENT 0x38UL + #define CREQ_QP_ERROR_NOTIFICATION_RESERVED2_MASK 0xc0UL + #define CREQ_QP_ERROR_NOTIFICATION_RESERVED2_SFT 6 + u8 status; + u8 req_slow_path_state; + u8 req_err_state_reason; + __le32 xid; + u8 v; + #define CREQ_QP_ERROR_NOTIFICATION_V 0x1UL + #define CREQ_QP_ERROR_NOTIFICATION_RESERVED7_MASK 0xfeUL + #define CREQ_QP_ERROR_NOTIFICATION_RESERVED7_SFT 1 + u8 event; + #define CREQ_QP_ERROR_NOTIFICATION_EVENT_QP_ERROR_NOTIFICATION 0xc0UL + u8 res_slow_path_state; + u8 res_err_state_reason; + __le16 sq_cons_idx; + __le16 rq_cons_idx; +}; + +/* RoCE Slowpath HSI Specification 1.6.0 */ +#define ROCE_SP_HSI_VERSION_MAJOR 1 +#define ROCE_SP_HSI_VERSION_MINOR 6 +#define ROCE_SP_HSI_VERSION_UPDATE 0 + +#define ROCE_SP_HSI_VERSION_STR "1.6.0" +/* + * Following is the signature for ROCE_SP_HSI 
message field that indicates not
+ * applicable (All F's). Need to cast it the size of the field if needed.
+ */
+#define ROCE_SP_HSI_NA_SIGNATURE ((__le32)(-1))
+#endif /* __BNXT_RE_HSI_H__ */
diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig
new file mode 100644
index 0000000..a7b77cb
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/Kconfig
@@ -0,0 +1,18 @@
+config INFINIBAND_CXGB3
+	tristate "Chelsio RDMA Driver"
+	depends on CHELSIO_T3
+	select GENERIC_ALLOCATOR
+	---help---
+	  This is an iWARP/RDMA driver for the Chelsio T3 1GbE and
+	  10GbE adapters.
+
+	  For general information about Chelsio and our products, visit
+	  our website at .
+
+	  For customer support, please visit our customer support page at
+	  .
+
+	  Please send feedback to .
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called iw_cxgb3.
diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile
new file mode 100644
index 0000000..66fe091
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb3
+
+obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o
+
+iw_cxgb3-y := iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \
+	iwch_provider.o iwch.o cxio_hal.o cxio_resource.o
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
new file mode 100644
index 0000000..3328acc
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -0,0 +1,1332 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cxio_resource.h"
+#include "cxio_hal.h"
+#include "cxgb3_offload.h"
+#include "sge_defs.h"
+
+static LIST_HEAD(rdev_list);
+static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL;
+
+static struct cxio_rdev *cxio_hal_find_rdev_by_name(char *dev_name)
+{
+	struct cxio_rdev *rdev;
+
+	list_for_each_entry(rdev, &rdev_list, entry)
+		if (!strcmp(rdev->dev_name, dev_name))
+			return rdev;
+	return NULL;
+}
+
+static struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev)
+{
+	struct cxio_rdev *rdev;
+
+	list_for_each_entry(rdev, &rdev_list, entry)
+		if (rdev->t3cdev_p == tdev)
+			return rdev;
+	return NULL;
+}
+
+int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq,
+		   enum t3_cq_opcode op, u32 credit)
+{
+	int ret;
+	struct t3_cqe *cqe;
+	u32 rptr;
+
+	struct rdma_cq_op setup;
+	setup.id = cq->cqid;
+	setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0;
+	setup.op = op;
+	ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup);
+
+	if ((ret < 0) || (op == CQ_CREDIT_UPDATE))
+		return ret;
+
+	/*
+	 * If the rearm returned an index other than our current index,
+	 * then there might be CQE's in flight (being DMA'd). We must wait
+	 * here for them to complete or the consumer can miss a notification.
+	 */
+	if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) {
+		int i=0;
+
+		rptr = cq->rptr;
+
+		/*
+		 * Keep the generation correct by bumping rptr until it
+		 * matches the index returned by the rearm - 1.
+		 */
+		while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret)
+			rptr++;
+
+		/*
+		 * Now rptr is the index for the (last) cqe that was
+		 * in-flight at the time the HW rearmed the CQ. We
+		 * spin until that CQE is valid.
+		 */
+		cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2);
+		while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) {
+			udelay(1);
+			if (i++ > 1000000) {
+				pr_err("%s: stalled rnic\n", rdev_p->dev_name);
+				return -EIO;
+			}
+		}
+
+		return 1;
+	}
+
+	return 0;
+}
+
+static int cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid)
+{
+	struct rdma_cq_setup setup;
+	setup.id = cqid;
+	setup.base_addr = 0; /* NULL address */
+	setup.size = 0; /* disable the CQ */
+	setup.credits = 0;
+	setup.credit_thres = 0;
+	setup.ovfl_mode = 0;
+	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+static int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid)
+{
+	u64 sge_cmd;
+	struct t3_modify_qp_wr *wqe;
+	struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
+	if (!skb) {
+		pr_debug("%s alloc_skb failed\n", __func__);
+		return -ENOMEM;
+	}
+	wqe = skb_put_zero(skb, sizeof(*wqe));
+	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD,
+		       T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 0, qpid, 7,
+		       T3_SOPEOP);
+	wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
+	sge_cmd = qpid << 8 | 3;
+	wqe->sge_cmd = cpu_to_be64(sge_cmd);
+	skb->priority = CPL_PRIORITY_CONTROL;
+	return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
+}
+
+int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel)
+{
+	struct rdma_cq_setup setup;
+	int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe);
+
+	size += 1; /* one extra page for storing cq-in-err state */
+	cq->cqid = cxio_hal_get_cqid(rdev_p->rscp);
+	if (!cq->cqid)
+		return -ENOMEM;
+	if (kernel) {
+		cq->sw_queue = kzalloc(size, GFP_KERNEL);
+		if (!cq->sw_queue)
+			return -ENOMEM;
+	}
+	cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), size,
+				       &(cq->dma_addr), GFP_KERNEL);
+	if (!cq->queue) {
+
kfree(cq->sw_queue); + return -ENOMEM; + } + dma_unmap_addr_set(cq, mapping, cq->dma_addr); + memset(cq->queue, 0, size); + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = 65535; + setup.credit_thres = 1; + if (rdev_p->t3cdev_p->type != T3A) + setup.ovfl_mode = 0; + else + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +#ifdef notyet +int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = setup.size; + setup.credit_thres = setup.size; /* TBD: overflow recovery */ + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} +#endif + +static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct cxio_qpid_list *entry; + u32 qpid; + int i; + + mutex_lock(&uctx->lock); + if (!list_empty(&uctx->qpids)) { + entry = list_entry(uctx->qpids.next, struct cxio_qpid_list, + entry); + list_del(&entry->entry); + qpid = entry->qpid; + kfree(entry); + } else { + qpid = cxio_hal_get_qpid(rdev_p->rscp); + if (!qpid) + goto out; + for (i = qpid+1; i & rdev_p->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + break; + entry->qpid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + } +out: + mutex_unlock(&uctx->lock); + pr_debug("%s qpid 0x%x\n", __func__, qpid); + return qpid; +} + +static void put_qpid(struct cxio_rdev *rdev_p, u32 qpid, + struct cxio_ucontext *uctx) +{ + struct cxio_qpid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + pr_debug("%s qpid 0x%x\n", __func__, qpid); + entry->qpid = qpid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->qpids); + mutex_unlock(&uctx->lock); +} + +void cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct list_head *pos, *nxt; + struct cxio_qpid_list *entry; + + mutex_lock(&uctx->lock); + list_for_each_safe(pos, nxt, &uctx->qpids) { + entry = list_entry(pos, struct cxio_qpid_list, entry); + list_del_init(&entry->entry); + if (!(entry->qpid & rdev_p->qpmask)) + cxio_hal_put_qpid(rdev_p->rscp, entry->qpid); + kfree(entry); + } + mutex_unlock(&uctx->lock); +} + +void cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + INIT_LIST_HEAD(&uctx->qpids); + mutex_init(&uctx->lock); +} + +int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain, + struct t3_wq *wq, struct cxio_ucontext *uctx) +{ + int depth = 1UL << wq->size_log2; + int rqsize = 1UL << wq->rq_size_log2; + + wq->qpid = get_qpid(rdev_p, uctx); + if (!wq->qpid) + return -ENOMEM; + + wq->rq = kzalloc(depth * sizeof(struct t3_swrq), GFP_KERNEL); + if (!wq->rq) + goto err1; + + wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize); + if (!wq->rq_addr) + goto err2; + + wq->sq = kzalloc(depth * sizeof(struct t3_swsq), GFP_KERNEL); + if (!wq->sq) + goto err3; + + wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), + depth * sizeof(union t3_wr), + &(wq->dma_addr), GFP_KERNEL); + if (!wq->queue) + goto err4; + + memset(wq->queue, 0, depth * sizeof(union t3_wr)); + dma_unmap_addr_set(wq, mapping, wq->dma_addr); + wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; + if (!kernel_domain) + wq->udb = (u64)rdev_p->rnic_info.udbell_physbase + + (wq->qpid << rdev_p->qpshift); + wq->rdev = rdev_p; + pr_debug("%s qpid 
0x%x doorbell 0x%p udb 0x%llx\n", + __func__, wq->qpid, wq->doorbell, (unsigned long long)wq->udb); + return 0; +err4: + kfree(wq->sq); +err3: + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize); +err2: + kfree(wq->rq); +err1: + put_qpid(rdev_p, wq->qpid, uctx); + return -ENOMEM; +} + +int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + int err; + err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); + kfree(cq->sw_queue); + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (cq->size_log2)) + * sizeof(struct t3_cqe) + 1, cq->queue, + dma_unmap_addr(cq, mapping)); + cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); + return err; +} + +int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq, + struct cxio_ucontext *uctx) +{ + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (wq->size_log2)) + * sizeof(union t3_wr), wq->queue, + dma_unmap_addr(wq, mapping)); + kfree(wq->sq); + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2)); + kfree(wq->rq); + put_qpid(rdev_p, wq->qpid, uctx); + return 0; +} + +static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_cqe cqe; + + pr_debug("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(T3_SEND) | + V_CQE_TYPE(0) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + u32 ptr; + int flushed = 0; + + pr_debug("%s wq %p cq %p\n", __func__, wq, cq); + + /* flush RQ */ + pr_debug("%s rq_rptr %u rq_wptr %u skip count %u\n", __func__, + wq->rq_rptr, wq->rq_wptr, count); + ptr = wq->rq_rptr + count; + while (ptr++ != wq->rq_wptr) { + insert_recv_cqe(wq, cq); + flushed++; + } + return flushed; +} + +static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, + struct t3_swsq *sqp) +{ + struct t3_cqe cqe; + + pr_debug("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(sqp->opcode) | + V_CQE_TYPE(1) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + cqe.u.scqe.wrid_hi = sqp->sq_wptr; + + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + __u32 ptr = wq->sq_rptr + count; + int flushed = 0; + struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + + while (ptr != wq->sq_wptr) { + sqp->signaled = 0; + insert_sq_cqe(wq, cq, sqp); + ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + flushed++; + } + return flushed; +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. 
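+ * Each entry copied here gets the SWCQE bit set in its header so that
+ * cxio_poll_cq() treats it as a software CQE, preserving completions
+ * that were still pending in the hardware CQ (e.g. while a QP is being
+ * flushed).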
+ */ +void cxio_flush_hw_cq(struct t3_cq *cq) +{ + struct t3_cqe *cqe, *swcqe; + + pr_debug("%s cq %p cqid 0x%x\n", __func__, cq, cq->cqid); + cqe = cxio_next_hw_cqe(cq); + while (cqe) { + pr_debug("%s flushing hwcq rptr 0x%x to swcq wptr 0x%x\n", + __func__, cq->rptr, cq->sw_wptr); + swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2); + *swcqe = *cqe; + swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1)); + cq->sw_wptr++; + cq->rptr++; + cqe = cxio_next_hw_cqe(cq); + } +} + +static int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq) +{ + if (CQE_OPCODE(*cqe) == T3_TERMINATE) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe)) + return 0; + + if (CQE_SEND_OPCODE(*cqe) && RQ_TYPE(*cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) + return 0; + + return 1; +} + +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if ((SQ_TYPE(*cqe) || + ((CQE_OPCODE(*cqe) == T3_READ_RESP) && wq->oldest_read)) && + (CQE_QPID(*cqe) == wq->qpid)) + (*count)++; + ptr++; + } + pr_debug("%s cq %p count %d\n", __func__, cq, *count); +} + +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + pr_debug("%s count zero %d\n", __func__, *count); + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) && + (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq)) + (*count)++; + ptr++; + } + pr_debug("%s cq %p count %d\n", __func__, cq, *count); +} + +static int cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p) +{ + struct rdma_cq_setup setup; + setup.id = 0; + setup.base_addr = 0; /* NULL address */ + setup.size = 1; /* enable the CQ */ + setup.credits = 0; + + /* force SGE to redirect to RspQ and interrupt */ + setup.credit_thres = 0; + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) +{ + int err; + u64 sge_cmd, ctx0, ctx1; + u64 base_addr; + struct t3_modify_qp_wr *wqe; + struct sk_buff *skb; + + skb = alloc_skb(sizeof(*wqe), GFP_KERNEL); + if (!skb) { + pr_debug("%s alloc_skb failed\n", __func__); + return -ENOMEM; + } + err = cxio_hal_init_ctrl_cq(rdev_p); + if (err) { + pr_debug("%s err %d initializing ctrl_cq\n", __func__, err); + goto err; + } + rdev_p->ctrl_qp.workq = dma_alloc_coherent( + &(rdev_p->rnic_info.pdev->dev), + (1 << T3_CTRL_QP_SIZE_LOG2) * + sizeof(union t3_wr), + &(rdev_p->ctrl_qp.dma_addr), + GFP_KERNEL); + if (!rdev_p->ctrl_qp.workq) { + pr_debug("%s dma_alloc_coherent failed\n", __func__); + err = -ENOMEM; + goto err; + } + dma_unmap_addr_set(&rdev_p->ctrl_qp, mapping, + rdev_p->ctrl_qp.dma_addr); + rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; + memset(rdev_p->ctrl_qp.workq, 0, + (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr)); + + mutex_init(&rdev_p->ctrl_qp.lock); + init_waitqueue_head(&rdev_p->ctrl_qp.waitq); + + /* update HW Ctrl QP context */ + base_addr = rdev_p->ctrl_qp.dma_addr; + base_addr >>= 12; + ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) | + V_EC_BASE_LO((u32) base_addr & 0xffff)); + ctx0 <<= 32; + ctx0 |= V_EC_CREDITS(FW_WR_NUM); + base_addr >>= 16; + ctx1 = (u32) 
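+	/*
+	 * The workq base address is programmed right-shifted by 12: bits
+	 * 15:0 of the shifted value sit in EC_BASE_LO of ctx0, bits 47:16
+	 * in the low word of ctx1, and bits 51:48 in EC_BASE_HI of ctx1.
+	 */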
base_addr; + base_addr >>= 32; + ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) | + V_EC_TYPE(0) | V_EC_GEN(1) | + V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32; + wqe = skb_put_zero(skb, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 0, + T3_CTL_QP_TID, 7, T3_SOPEOP); + wqe->flags = cpu_to_be32(MODQP_WRITE_EC); + sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3; + wqe->sge_cmd = cpu_to_be64(sge_cmd); + wqe->ctx1 = cpu_to_be64(ctx1); + wqe->ctx0 = cpu_to_be64(ctx0); + pr_debug("CtrlQP dma_addr 0x%llx workq %p size %d\n", + (unsigned long long)rdev_p->ctrl_qp.dma_addr, + rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2); + skb->priority = CPL_PRIORITY_CONTROL; + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); +err: + kfree_skb(skb); + return err; +} + +static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) +{ + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << T3_CTRL_QP_SIZE_LOG2) + * sizeof(union t3_wr), rdev_p->ctrl_qp.workq, + dma_unmap_addr(&rdev_p->ctrl_qp, mapping)); + return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID); +} + +/* write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + * caller acquires the ctrl_qp lock before the call + */ +static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, + u32 len, void *data) +{ + u32 i, nr_wqe, copy_len; + u8 *copy_data; + u8 wr_len, utx_len; /* length in 8 byte flit */ + enum t3_wr_flags flag; + __be64 *wqe; + u64 utx_cmd; + addr &= 0x7FFFFFF; + nr_wqe = len % 96 ? len / 96 + 1 : len / 96; /* 96B max per WQE */ + pr_debug("%s wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x\n", + __func__, rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len, + nr_wqe, data, addr); + utx_len = 3; /* in 32B unit */ + for (i = 0; i < nr_wqe; i++) { + if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2)) { + pr_debug("%s ctrl_qp full wtpr 0x%0x rptr 0x%0x, wait for more space i %d\n", + __func__, + rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i); + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + !Q_FULL(rdev_p->ctrl_qp.rptr, + rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2))) { + pr_debug("%s ctrl_qp workq interrupted\n", + __func__); + return -ERESTARTSYS; + } + pr_debug("%s ctrl_qp wakeup, continue posting work request i %d\n", + __func__, i); + } + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + flag = 0; + if (i == (nr_wqe - 1)) { + /* last WQE */ + flag = T3_COMPLETION_FLAG; + if (len % 32) + utx_len = len / 32 + 1; + else + utx_len = len / 32; + } + + /* + * Force a CQE to return the credit to the workq in case + * we posted more than half the max QP size of WRs + */ + if ((i != 0) && + (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) { + flag = T3_COMPLETION_FLAG; + pr_debug("%s force completion at i %d\n", __func__, i); + } + + /* build the utx mem command */ + wqe += (sizeof(struct t3_bypass_wr) >> 3); + utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3); + utx_cmd <<= 32; + utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1); + *wqe = cpu_to_be64(utx_cmd); + wqe++; + copy_data = (u8 *) data + i * 96; + copy_len = len > 96 ? 
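+		/*
+		 * len holds the bytes still to be written; it is decremented
+		 * by 96 at the bottom of the loop, so each WR carries at most
+		 * 96B of payload (three 32B UTX units).
+		 */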
96 : len; + + /* clear memory content if data is NULL */ + if (data) + memcpy(wqe, copy_data, copy_len); + else + memset(wqe, 0, copy_len); + if (copy_len % 32) + memset(((u8 *) wqe) + copy_len, 0, + 32 - (copy_len % 32)); + wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 + + (utx_len << 2); + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + + /* wptr in the WRID[31:0] */ + ((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr; + + /* + * This must be the last write with a memory barrier + * for the genbit + */ + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag, + Q_GENBIT(rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID, + wr_len, T3_SOPEOP); + if (flag == T3_COMPLETION_FLAG) + ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID); + len -= 96; + rdev_p->ctrl_qp.wptr++; + } + return 0; +} + +/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl_size and pbl_addr + * OUT: stag index + * TBD: shared memory region support + */ +static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, + u32 *stag, u8 stag_state, u32 pdid, + enum tpt_mem_type type, enum tpt_mem_perm perm, + u32 zbva, u64 to, u32 len, u8 page_size, + u32 pbl_size, u32 pbl_addr) +{ + int err; + struct tpt_entry tpt; + u32 stag_idx; + u32 wptr; + + if (cxio_fatal_error(rdev_p)) + return -EIO; + + stag_state = stag_state > 0; + stag_idx = (*stag) >> 8; + + if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) { + stag_idx = cxio_hal_get_stag(rdev_p->rscp); + if (!stag_idx) + return -ENOMEM; + *stag = (stag_idx << 8) | ((*stag) & 0xFF); + } + pr_debug("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", + __func__, stag_state, type, pdid, stag_idx); + + mutex_lock(&rdev_p->ctrl_qp.lock); + + /* write TPT entry */ + if (reset_tpt_entry) + memset(&tpt, 0, sizeof(tpt)); + else { + tpt.valid_stag_pdid = cpu_to_be32(F_TPT_VALID | + V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) | + V_TPT_STAG_STATE(stag_state) | + V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid)); + BUG_ON(page_size >= 28); + tpt.flags_pagesize_qpid = cpu_to_be32(V_TPT_PERM(perm) | + ((perm & TPT_MW_BIND) ? F_TPT_MW_BIND_ENABLE : 0) | + V_TPT_ADDR_TYPE((zbva ? 
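+			/*
+			 * zbva selects zero-based tagged offsets (TPT_ZBTO)
+			 * rather than VA-based offsets (TPT_VATO) for this
+			 * memory region.
+			 */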
TPT_ZBTO : TPT_VATO)) | + V_TPT_PAGE_SIZE(page_size)); + tpt.rsvd_pbl_addr = cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); + tpt.len = cpu_to_be32(len); + tpt.va_hi = cpu_to_be32((u32) (to >> 32)); + tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); + tpt.rsvd_bind_cnt_or_pstag = 0; + tpt.rsvd_pbl_size = cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); + } + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + stag_idx + + (rdev_p->rnic_info.tpt_base >> 5), + sizeof(tpt), &tpt); + + /* release the stag index to free pool */ + if (reset_tpt_entry) + cxio_hal_put_stag(rdev_p->rscp, stag_idx); + + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (!err) + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + return err; +} + +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size) +{ + u32 wptr; + int err; + + pr_debug("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + __func__, pbl_addr, rdev_p->rnic_info.pbl_base, + pbl_size); + + mutex_lock(&rdev_p->ctrl_qp.lock); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3, + pbl); + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (err) + return err; + + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + + return 0; +} + +int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl_size, pbl_addr); +} + +int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl_size, pbl_addr); +} + +int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, + u32 pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + pbl_size, pbl_addr); +} + +int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, + 0, 0); +} + +int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + 0, 0); +} + +int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR, + 0, 0, 0ULL, 0, 0, pbl_size, pbl_addr); +} + +int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) +{ + struct t3_rdma_init_wr *wqe; + struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + pr_debug("%s rdev_p %p\n", __func__, rdev_p); + wqe = __skb_put(skb, sizeof(*wqe)); + wqe->wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_INIT)); + wqe->wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(attr->tid) | + V_FW_RIWR_LEN(sizeof(*wqe) >> 3)); + wqe->wrid.id1 = 0; + wqe->qpid = cpu_to_be32(attr->qpid); + wqe->pdid = cpu_to_be32(attr->pdid); + wqe->scqid = cpu_to_be32(attr->scqid); + wqe->rcqid = cpu_to_be32(attr->rcqid); + wqe->rq_addr = cpu_to_be32(attr->rq_addr - rdev_p->rnic_info.rqt_base); + wqe->rq_size = 
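+	/*
+	 * rq_addr above is programmed relative to the adapter's RQT base:
+	 * rnic_info.rqt_base is subtracted from the absolute RQT address
+	 * (typically obtained from cxio_hal_rqtpool_alloc()).
+	 */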
cpu_to_be32(attr->rq_size); + wqe->mpaattrs = attr->mpaattrs; + wqe->qpcaps = attr->qpcaps; + wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss); + wqe->rqe_count = cpu_to_be16(attr->rqe_count); + wqe->flags_rtr_type = cpu_to_be16(attr->flags | + V_RTR_TYPE(attr->rtr_type) | + V_CHAN(attr->chan)); + wqe->ord = cpu_to_be32(attr->ord); + wqe->ird = cpu_to_be32(attr->ird); + wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr); + wqe->qp_dma_size = cpu_to_be32(attr->qp_dma_size); + wqe->irs = cpu_to_be32(attr->irs); + skb->priority = 0; /* 0=>ToeQ; 1=>CtrlQ */ + return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb); +} + +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = ev_cb; +} + +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = NULL; +} + +static int cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct sk_buff *skb) +{ + static int cnt; + struct cxio_rdev *rdev_p = NULL; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data; + pr_debug("%d: %s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x se %0x notify %0x cqbranch %0x creditth %0x\n", + cnt, __func__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg), + RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg), + RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg), + RSPQ_CREDIT_THRESH(rsp_msg)); + pr_debug("CQE: QPID 0x%0x genbit %0x type 0x%0x status 0x%0x opcode %d len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + CQE_QPID(rsp_msg->cqe), CQE_GENBIT(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_LEN(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + rdev_p = (struct cxio_rdev *)t3cdev_p->ulp; + if (!rdev_p) { + pr_debug("%s called by t3cdev %p with null ulp\n", __func__, + t3cdev_p); + return 0; + } + if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) { + rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1; + wake_up_interruptible(&rdev_p->ctrl_qp.waitq); + dev_kfree_skb_irq(skb); + } else if (CQE_QPID(rsp_msg->cqe) == 0xfff8) + dev_kfree_skb_irq(skb); + else if (cxio_ev_cb) + (*cxio_ev_cb) (rdev_p, skb); + else + dev_kfree_skb_irq(skb); + cnt++; + return 0; +} + +/* Caller takes care of locking if needed */ +int cxio_rdev_open(struct cxio_rdev *rdev_p) +{ + struct net_device *netdev_p = NULL; + int err = 0; + if (strlen(rdev_p->dev_name)) { + if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) { + return -EBUSY; + } + netdev_p = dev_get_by_name(&init_net, rdev_p->dev_name); + if (!netdev_p) { + return -EINVAL; + } + dev_put(netdev_p); + } else if (rdev_p->t3cdev_p) { + if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) { + return -EBUSY; + } + netdev_p = rdev_p->t3cdev_p->lldev; + strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name, + T3_MAX_DEV_NAME_LEN); + } else { + pr_debug("%s t3cdev_p or dev_name must be set\n", __func__); + return -EINVAL; + } + + list_add_tail(&rdev_p->entry, &rdev_list); + + pr_debug("%s opening rnic dev %s\n", __func__, rdev_p->dev_name); + memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp)); + if (!rdev_p->t3cdev_p) + rdev_p->t3cdev_p = dev2t3cdev(netdev_p); + rdev_p->t3cdev_p->ulp = (void *) rdev_p; + + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_EMBEDDED_INFO, + &(rdev_p->fw_info)); + if (err) { + pr_err("%s t3cdev_p(%p)->ctl returned error %d\n", + __func__, rdev_p->t3cdev_p, err); + goto err1; + } + if (G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers) != CXIO_FW_MAJ) { + pr_err("fatal firmware version mismatch: need version %u but adapter has version %u\n", + 
CXIO_FW_MAJ, + G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers)); + err = -EINVAL; + goto err1; + } + + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS, + &(rdev_p->rnic_info)); + if (err) { + pr_err("%s t3cdev_p(%p)->ctl returned error %d\n", + __func__, rdev_p->t3cdev_p, err); + goto err1; + } + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS, + &(rdev_p->port_info)); + if (err) { + pr_err("%s t3cdev_p(%p)->ctl returned error %d\n", + __func__, rdev_p->t3cdev_p, err); + goto err1; + } + + /* + * qpshift is the number of bits to shift the qpid left in order + * to get the correct address of the doorbell for that qp. + */ + cxio_init_ucontext(rdev_p, &rdev_p->uctx); + rdev_p->qpshift = PAGE_SHIFT - + ilog2(65536 >> + ilog2(rdev_p->rnic_info.udbell_len >> + PAGE_SHIFT)); + rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT; + rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1; + pr_debug("%s rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x\n", + __func__, rdev_p->dev_name, rdev_p->rnic_info.tpt_base, + rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p), + rdev_p->rnic_info.pbl_base, + rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base, + rdev_p->rnic_info.rqt_top); + pr_debug("udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu qpnr %d qpmask 0x%x\n", + rdev_p->rnic_info.udbell_len, + rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr, + rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask); + + err = cxio_hal_init_ctrl_qp(rdev_p); + if (err) { + pr_err("%s error %d initializing ctrl_qp\n", __func__, err); + goto err1; + } + err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0, + 0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ, + T3_MAX_NUM_PD); + if (err) { + pr_err("%s error %d initializing hal resources\n", + __func__, err); + goto err2; + } + err = cxio_hal_pblpool_create(rdev_p); + if (err) { + pr_err("%s error %d initializing pbl mem pool\n", + __func__, err); + goto err3; + } + err = cxio_hal_rqtpool_create(rdev_p); + if (err) { + pr_err("%s error %d initializing rqt mem pool\n", + __func__, err); + goto err4; + } + return 0; +err4: + cxio_hal_pblpool_destroy(rdev_p); +err3: + cxio_hal_destroy_resource(rdev_p->rscp); +err2: + cxio_hal_destroy_ctrl_qp(rdev_p); +err1: + rdev_p->t3cdev_p->ulp = NULL; + list_del(&rdev_p->entry); + return err; +} + +void cxio_rdev_close(struct cxio_rdev *rdev_p) +{ + if (rdev_p) { + cxio_hal_pblpool_destroy(rdev_p); + cxio_hal_rqtpool_destroy(rdev_p); + list_del(&rdev_p->entry); + cxio_hal_destroy_ctrl_qp(rdev_p); + cxio_hal_destroy_resource(rdev_p->rscp); + rdev_p->t3cdev_p->ulp = NULL; + } +} + +int __init cxio_hal_init(void) +{ + if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI)) + return -ENOMEM; + t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler); + return 0; +} + +void __exit cxio_hal_exit(void) +{ + struct cxio_rdev *rdev, *tmp; + + t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL); + list_for_each_entry_safe(rdev, tmp, &rdev_list, entry) + cxio_rdev_close(rdev); + cxio_hal_destroy_rhdl_resource(); +} + +static void flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_swsq *sqp; + __u32 ptr = wq->sq_rptr; + int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr); + + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + while (count--) + if (!sqp->signaled) { + ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + } else if (sqp->complete) { + + /* + * Insert this completed cqe into the swcq. 
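+			 * Unsignaled WRs ahead of it are simply skipped; the
+			 * scan stops at the first signaled WR and moves its
+			 * CQE into the SW CQ only if that WR has already
+			 * completed out of order.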
+ */ + pr_debug("%s moving cqe into swcq sq idx %ld cq idx %ld\n", + __func__, Q_PTR2IDX(ptr, wq->sq_size_log2), + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)); + sqp->cqe.header |= htonl(V_CQE_SWCQE(1)); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) + = sqp->cqe; + cq->sw_wptr++; + sqp->signaled = 0; + break; + } else + break; +} + +static void create_read_req_cqe(struct t3_wq *wq, struct t3_cqe *hw_cqe, + struct t3_cqe *read_cqe) +{ + read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr; + read_cqe->len = wq->oldest_read->read_len; + read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) | + V_CQE_SWCQE(SW_CQE(*hw_cqe)) | + V_CQE_OPCODE(T3_READ_REQ) | + V_CQE_TYPE(1)); +} + +/* + * Return a ptr to the next read wr in the SWSQ or NULL. + */ +static void advance_oldest_read(struct t3_wq *wq) +{ + + u32 rptr = wq->oldest_read - wq->sq + 1; + u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2); + + while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) { + wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2); + + if (wq->oldest_read->opcode == T3_READ_REQ) + return; + rptr++; + } + wq->oldest_read = NULL; +} + +/* + * cxio_poll_cq + * + * Caller must: + * check the validity of the first CQE, + * supply the wq assicated with the qpid. + * + * credit: cq credit to return to sge. + * cqe_flushed: 1 iff the CQE is flushed. + * cqe: copy of the polled CQE. + * + * return value: + * 0 CQE returned, + * -1 CQE skipped, try again. + */ +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit) +{ + int ret = 0; + struct t3_cqe *hw_cqe, read_cqe; + + *cqe_flushed = 0; + *credit = 0; + hw_cqe = cxio_next_cqe(cq); + + pr_debug("%s CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + __func__, CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe), + CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe), + CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe), + CQE_WRID_LOW(*hw_cqe)); + + /* + * skip cqe's not affiliated with a QP. + */ + if (wq == NULL) { + ret = -1; + goto skip_cqe; + } + + /* + * Gotta tweak READ completions: + * 1) the cqe doesn't contain the sq_wptr from the wr. + * 2) opcode not reflected from the wr. + * 3) read_len not reflected from the wr. + * 4) cq_type is RQ_TYPE not SQ_TYPE. + */ + if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) { + + /* + * If this is an unsolicited read response, then the read + * was generated by the kernel driver as part of peer-2-peer + * connection setup. So ignore the completion. + */ + if (!wq->oldest_read) { + if (CQE_STATUS(*hw_cqe)) + wq->error = 1; + ret = -1; + goto skip_cqe; + } + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(wq); + } + + /* + * T3A: Discard TERMINATE CQEs. + */ + if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) { + ret = -1; + wq->error = 1; + goto skip_cqe; + } + + if (CQE_STATUS(*hw_cqe) || wq->error) { + *cqe_flushed = wq->error; + wq->error = 1; + + /* + * T3A inserts errors into the CQE. We cannot return + * these as work completions. 
+ */ + /* incoming write failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE) + && RQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + /* incoming read request failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + + /* incoming SEND with no receive posted failures */ + if (CQE_SEND_OPCODE(*hw_cqe) && RQ_TYPE(*hw_cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + ret = -1; + goto skip_cqe; + } + BUG_ON((*cqe_flushed == 0) && !SW_CQE(*hw_cqe)); + goto proc_cqe; + } + + /* + * RECV completion. + */ + if (RQ_TYPE(*hw_cqe)) { + + /* + * HW only validates 4 bits of MSN. So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with TPT_ERR_MSN and mark the wq in + * error. + */ + + if (Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + wq->error = 1; + ret = -1; + goto skip_cqe; + } + + if (unlikely((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) { + wq->error = 1; + hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN)); + goto proc_cqe; + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) { + struct t3_swsq *sqp; + + pr_debug("%s out of order completion going in swsq at idx %ld\n", + __func__, + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), + wq->sq_size_log2)); + sqp = wq->sq + + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2); + sqp->cqe = *hw_cqe; + sqp->complete = 1; + ret = -1; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(*hw_cqe)) { + wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe); + pr_debug("%s completing sq idx %ld\n", __func__, + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)); + *cookie = wq->sq[Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)].wr_id; + wq->sq_rptr++; + } else { + pr_debug("%s completing rq idx %ld\n", __func__, + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + *cookie = wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].wr_id; + if (wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].pbl_addr) + cxio_hal_pblpool_free(wq->rdev, + wq->rq[Q_PTR2IDX(wq->rq_rptr, + wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE); + BUG_ON(Q_EMPTY(wq->rq_rptr, wq->rq_wptr)); + wq->rq_rptr++; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. + */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(*hw_cqe)) { + pr_debug("%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x\n", + __func__, cq, cq->cqid, cq->sw_rptr); + ++cq->sw_rptr; + } else { + pr_debug("%s cq %p cqid 0x%x skip hw cqe rptr 0x%x\n", + __func__, cq, cq->cqid, cq->rptr); + ++cq->rptr; + + /* + * T3A: compute credits. + */ + if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1))) + || ((cq->rptr - cq->wptr) >= 128)) { + *credit = cq->rptr - cq->wptr; + cq->wptr = cq->rptr; + } + } + return ret; +} diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h new file mode 100644 index 0000000..c64e50b --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_HAL_H__ +#define __CXIO_HAL_H__ + +#include +#include +#include + +#include "t3_cpl.h" +#include "t3cdev.h" +#include "cxgb3_ctl_defs.h" +#include "cxio_wr.h" + +#define T3_CTRL_QP_ID FW_RI_SGEEC_START +#define T3_CTL_QP_TID FW_RI_TID_START +#define T3_CTRL_QP_SIZE_LOG2 8 +#define T3_CTRL_CQ_ID 0 + +#define T3_MAX_NUM_RI (1<<15) +#define T3_MAX_NUM_QP (1<<15) +#define T3_MAX_NUM_CQ (1<<15) +#define T3_MAX_NUM_PD (1<<15) +#define T3_MAX_PBL_SIZE 256 +#define T3_MAX_RQ_SIZE 1024 +#define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1) +#define T3_MAX_CQ_DEPTH 65536 +#define T3_MAX_NUM_STAG (1<<15) +#define T3_MAX_MR_SIZE 0x100000000ULL +#define T3_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */ + +#define T3_STAG_UNSET 0xffffffff + +#define T3_MAX_DEV_NAME_LEN 32 + +#define CXIO_FW_MAJ 7 + +struct cxio_hal_ctrl_qp { + u32 wptr; + u32 rptr; + struct mutex lock; /* for the wtpr, can sleep */ + wait_queue_head_t waitq;/* wait for RspQ/CQE msg */ + union t3_wr *workq; /* the work request queue */ + dma_addr_t dma_addr; /* pci bus address of the workq */ + DEFINE_DMA_UNMAP_ADDR(mapping); + void __iomem *doorbell; +}; + +struct cxio_hal_resource { + struct kfifo tpt_fifo; + spinlock_t tpt_fifo_lock; + struct kfifo qpid_fifo; + spinlock_t qpid_fifo_lock; + struct kfifo cqid_fifo; + spinlock_t cqid_fifo_lock; + struct kfifo pdid_fifo; + spinlock_t pdid_fifo_lock; +}; + +struct cxio_qpid_list { + struct list_head entry; + u32 qpid; +}; + +struct cxio_ucontext { + struct list_head qpids; + struct mutex lock; +}; + +struct cxio_rdev { + char dev_name[T3_MAX_DEV_NAME_LEN]; + struct t3cdev *t3cdev_p; + struct rdma_info rnic_info; + struct adap_ports port_info; + struct cxio_hal_resource *rscp; + struct cxio_hal_ctrl_qp ctrl_qp; + void *ulp; + unsigned long qpshift; + u32 qpnr; + u32 qpmask; + struct cxio_ucontext uctx; + struct gen_pool *pbl_pool; + struct gen_pool *rqt_pool; + struct list_head entry; + struct ch_embedded_info fw_info; + u32 flags; +#define CXIO_ERROR_FATAL 1 +}; + +static inline int cxio_fatal_error(struct cxio_rdev *rdev_p) +{ + return rdev_p->flags & CXIO_ERROR_FATAL; +} + +static inline int cxio_num_stags(struct cxio_rdev *rdev_p) +{ + return min((int)T3_MAX_NUM_STAG, 
(int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5)); +} + +typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p, + struct sk_buff * skb); + +#define RSPQ_CQID(rsp) (be32_to_cpu(rsp->cq_ptrid) & 0xffff) +#define RSPQ_CQPTR(rsp) ((be32_to_cpu(rsp->cq_ptrid) >> 16) & 0xffff) +#define RSPQ_GENBIT(rsp) ((be32_to_cpu(rsp->flags) >> 16) & 1) +#define RSPQ_OVERFLOW(rsp) ((be32_to_cpu(rsp->flags) >> 17) & 1) +#define RSPQ_AN(rsp) ((be32_to_cpu(rsp->flags) >> 18) & 1) +#define RSPQ_SE(rsp) ((be32_to_cpu(rsp->flags) >> 19) & 1) +#define RSPQ_NOTIFY(rsp) ((be32_to_cpu(rsp->flags) >> 20) & 1) +#define RSPQ_CQBRANCH(rsp) ((be32_to_cpu(rsp->flags) >> 21) & 1) +#define RSPQ_CREDIT_THRESH(rsp) ((be32_to_cpu(rsp->flags) >> 22) & 1) + +struct respQ_msg_t { + __be32 flags; /* flit 0 */ + __be32 cq_ptrid; + __be64 rsvd; /* flit 1 */ + struct t3_cqe cqe; /* flits 2-3 */ +}; + +enum t3_cq_opcode { + CQ_ARM_AN = 0x2, + CQ_ARM_SE = 0x6, + CQ_FORCE_AN = 0x3, + CQ_CREDIT_UPDATE = 0x7 +}; + +int cxio_rdev_open(struct cxio_rdev *rdev); +void cxio_rdev_close(struct cxio_rdev *rdev); +int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit); +int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel); +int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size); +int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr); +int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, u32 pbl_size, u32 pbl_addr); +int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, + u32 pbl_addr); +int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); +int cxio_allocate_stag(struct cxio_rdev *rdev, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr); +int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag); +int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr); +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp); +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid); +int __init cxio_hal_init(void); +void __exit cxio_hal_exit(void); +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_flush_hw_cq(struct t3_cq *cq); +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit); +int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb); + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define 
pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c new file mode 100644 index 0000000..c6e7bc4 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* Crude resource management */ +#include +#include +#include +#include +#include +#include +#include "cxio_resource.h" +#include "cxio_hal.h" + +static struct kfifo rhdl_fifo; +static spinlock_t rhdl_fifo_lock; + +#define RANDOM_SIZE 16 + +static int __cxio_init_resource_fifo(struct kfifo *fifo, + spinlock_t *fifo_lock, + u32 nr, u32 skip_low, + u32 skip_high, + int random) +{ + u32 i, j, entry = 0, idx; + u32 random_bytes; + u32 rarray[16]; + spin_lock_init(fifo_lock); + + if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL)) + return -ENOMEM; + + for (i = 0; i < skip_low + skip_high; i++) + kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32)); + if (random) { + j = 0; + random_bytes = prandom_u32(); + for (i = 0; i < RANDOM_SIZE; i++) + rarray[i] = i + skip_low; + for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { + if (j >= RANDOM_SIZE) { + j = 0; + random_bytes = prandom_u32(); + } + idx = (random_bytes >> (j * 2)) & 0xF; + kfifo_in(fifo, + (unsigned char *) &rarray[idx], + sizeof(u32)); + rarray[idx] = i; + j++; + } + for (i = 0; i < RANDOM_SIZE; i++) + kfifo_in(fifo, + (unsigned char *) &rarray[i], + sizeof(u32)); + } else + for (i = skip_low; i < nr - skip_high; i++) + kfifo_in(fifo, (unsigned char *) &i, sizeof(u32)); + + for (i = 0; i < skip_low + skip_high; i++) + if (kfifo_out_locked(fifo, (unsigned char *) &entry, + sizeof(u32), fifo_lock) != sizeof(u32)) + break; + return 0; +} + +static int cxio_init_resource_fifo(struct kfifo *fifo, spinlock_t * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 0)); +} + +static int cxio_init_resource_fifo_random(struct kfifo *fifo, + spinlock_t * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 1)); +} + +static int 
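+/*
+ * Only QPIDs whose low bits (rdev_p->qpmask) are clear are queued here;
+ * get_qpid() later hands the intervening IDs out to the same user
+ * context, so QPIDs are consumed in aligned blocks of (qpmask + 1).
+ */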
cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) +{ + u32 i; + + spin_lock_init(&rdev_p->rscp->qpid_fifo_lock); + + if (kfifo_alloc(&rdev_p->rscp->qpid_fifo, T3_MAX_NUM_QP * sizeof(u32), + GFP_KERNEL)) + return -ENOMEM; + + for (i = 16; i < T3_MAX_NUM_QP; i++) + if (!(i & rdev_p->qpmask)) + kfifo_in(&rdev_p->rscp->qpid_fifo, + (unsigned char *) &i, sizeof(u32)); + return 0; +} + +int cxio_hal_init_rhdl_resource(u32 nr_rhdl) +{ + return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1, + 0); +} + +void cxio_hal_destroy_rhdl_resource(void) +{ + kfifo_free(&rhdl_fifo); +} + +/* nr_* must be power of 2 */ +int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid) +{ + int err = 0; + struct cxio_hal_resource *rscp; + + rscp = kmalloc(sizeof(*rscp), GFP_KERNEL); + if (!rscp) + return -ENOMEM; + rdev_p->rscp = rscp; + err = cxio_init_resource_fifo_random(&rscp->tpt_fifo, + &rscp->tpt_fifo_lock, + nr_tpt, 1, 0); + if (err) + goto tpt_err; + err = cxio_init_qpid_fifo(rdev_p); + if (err) + goto qpid_err; + err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, + nr_cqid, 1, 0); + if (err) + goto cqid_err; + err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, + nr_pdid, 1, 0); + if (err) + goto pdid_err; + return 0; +pdid_err: + kfifo_free(&rscp->cqid_fifo); +cqid_err: + kfifo_free(&rscp->qpid_fifo); +qpid_err: + kfifo_free(&rscp->tpt_fifo); +tpt_err: + return -ENOMEM; +} + +/* + * returns 0 if no resource available + */ +static u32 cxio_hal_get_resource(struct kfifo *fifo, spinlock_t * lock) +{ + u32 entry; + if (kfifo_out_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock)) + return entry; + else + return 0; /* fifo emptry */ +} + +static void cxio_hal_put_resource(struct kfifo *fifo, spinlock_t * lock, + u32 entry) +{ + BUG_ON( + kfifo_in_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock) + == 0); +} + +u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock); +} + +void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag) +{ + cxio_hal_put_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock, stag); +} + +u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) +{ + u32 qpid = cxio_hal_get_resource(&rscp->qpid_fifo, + &rscp->qpid_fifo_lock); + pr_debug("%s qpid 0x%x\n", __func__, qpid); + return qpid; +} + +void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid) +{ + pr_debug("%s qpid 0x%x\n", __func__, qpid); + cxio_hal_put_resource(&rscp->qpid_fifo, &rscp->qpid_fifo_lock, qpid); +} + +u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock); +} + +void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid) +{ + cxio_hal_put_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, cqid); +} + +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock); +} + +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid) +{ + cxio_hal_put_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, pdid); +} + +void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) +{ + kfifo_free(&rscp->tpt_fifo); + kfifo_free(&rscp->cqid_fifo); + kfifo_free(&rscp->qpid_fifo); + kfifo_free(&rscp->pdid_fifo); + kfree(rscp); +} + +/* + * PBL Memory Manager. Uses Linux generic allocator. 
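+ * The adapter's PBL region (rnic_info.pbl_base .. pbl_top) is handed to a
+ * gen_pool, halving the chunk size whenever gen_pool_add() fails; the
+ * minimum allocation granule is 256B (32 PBL entries).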
+ */ + +#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ + +u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size); + pr_debug("%s addr 0x%x size %d\n", __func__, (u32)addr, size); + return (u32)addr; +} + +void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + pr_debug("%s addr 0x%x size %d\n", __func__, addr, size); + gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size); +} + +int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p) +{ + unsigned pbl_start, pbl_chunk; + + rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); + if (!rdev_p->pbl_pool) + return -ENOMEM; + + pbl_start = rdev_p->rnic_info.pbl_base; + pbl_chunk = rdev_p->rnic_info.pbl_top - pbl_start + 1; + + while (pbl_start < rdev_p->rnic_info.pbl_top) { + pbl_chunk = min(rdev_p->rnic_info.pbl_top - pbl_start + 1, + pbl_chunk); + if (gen_pool_add(rdev_p->pbl_pool, pbl_start, pbl_chunk, -1)) { + pr_debug("%s failed to add PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) { + pr_warn("%s: Failed to add all PBL chunks (%x/%x)\n", + __func__, pbl_start, + rdev_p->rnic_info.pbl_top - pbl_start); + return 0; + } + pbl_chunk >>= 1; + } else { + pr_debug("%s added PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + pbl_start += pbl_chunk; + } + } + + return 0; +} + +void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->pbl_pool); +} + +/* + * RQT Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_RQT_SHIFT 10 /* 1KB == mini RQT size (16 entries) */ +#define RQT_CHUNK 2*1024*1024 + +u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6); + pr_debug("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6); + return (u32)addr; +} + +void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + pr_debug("%s addr 0x%x size %d\n", __func__, addr, size << 6); + gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6); +} + +int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p) +{ + unsigned long i; + rdev_p->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1); + if (rdev_p->rqt_pool) + for (i = rdev_p->rnic_info.rqt_base; + i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1; + i += RQT_CHUNK) + gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1); + return rdev_p->rqt_pool ? 0 : -ENOMEM; +} + +void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->rqt_pool); +} diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.h b/drivers/infiniband/hw/cxgb3/cxio_resource.h new file mode 100644 index 0000000..a2703a3 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_RESOURCE_H__ +#define __CXIO_RESOURCE_H__ + +#include +#include +#include +#include +#include +#include +#include +#include "cxio_hal.h" + +extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl); +extern void cxio_hal_destroy_rhdl_resource(void); +extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, + u32 nr_pdid); +extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag); +extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid); +extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid); +extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp); + +#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base ) +extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); + +#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base ) +extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h new file mode 100644 index 0000000..83d2e19 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_WR_H__ +#define __CXIO_WR_H__ + +#include +#include +#include +#include "firmware_exports.h" + +#define T3_MAX_SGE 4 +#define T3_MAX_INLINE 64 +#define T3_STAG0_PBL_SIZE (2 * T3_MAX_SGE << 3) +#define T3_STAG0_MAX_PBE_LEN (128 * 1024 * 1024) +#define T3_STAG0_PAGE_SHIFT 15 + +#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr)) +#define Q_FULL(rptr,wptr,size_log2) ( (((wptr)-(rptr))>>(size_log2)) && \ + ((rptr)!=(wptr)) ) +#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>size_log2)&0x1)) +#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<> S_FW_RIWR_OP)) & M_FW_RIWR_OP) + +#define S_FW_RIWR_SOPEOP 22 +#define M_FW_RIWR_SOPEOP 0x3 +#define V_FW_RIWR_SOPEOP(x) ((x) << S_FW_RIWR_SOPEOP) + +#define S_FW_RIWR_FLAGS 8 +#define M_FW_RIWR_FLAGS 0x3fffff +#define V_FW_RIWR_FLAGS(x) ((x) << S_FW_RIWR_FLAGS) +#define G_FW_RIWR_FLAGS(x) ((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS) + +#define S_FW_RIWR_TID 8 +#define V_FW_RIWR_TID(x) ((x) << S_FW_RIWR_TID) + +#define S_FW_RIWR_LEN 0 +#define V_FW_RIWR_LEN(x) ((x) << S_FW_RIWR_LEN) + +#define S_FW_RIWR_GEN 31 +#define V_FW_RIWR_GEN(x) ((x) << S_FW_RIWR_GEN) + +struct t3_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +/* If num_sgle is zero, flit 5+ contains immediate data.*/ +struct t3_send_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be32 plen; /* 3 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 4+ */ +}; + +#define T3_MAX_FASTREG_DEPTH 10 +#define T3_MAX_FASTREG_FRAG 10 + +struct t3_fastreg_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 stag; /* 2 */ + __be32 len; + __be32 va_base_hi; /* 3 */ + __be32 va_base_lo_fbo; + __be32 page_type_perms; /* 4 */ + __be32 reserved1; + __be64 pbl_addrs[0]; /* 5+ */ +}; + +/* + * If a fastreg wr spans multiple wqes, then the 2nd fragment look like this. 
+ */ +struct t3_pbl_frag { + struct fw_riwrh wrh; /* 0 */ + __be64 pbl_addrs[14]; /* 1..14 */ +}; + +#define S_FR_PAGE_COUNT 24 +#define M_FR_PAGE_COUNT 0xff +#define V_FR_PAGE_COUNT(x) ((x) << S_FR_PAGE_COUNT) +#define G_FR_PAGE_COUNT(x) ((((x) >> S_FR_PAGE_COUNT)) & M_FR_PAGE_COUNT) + +#define S_FR_PAGE_SIZE 16 +#define M_FR_PAGE_SIZE 0x1f +#define V_FR_PAGE_SIZE(x) ((x) << S_FR_PAGE_SIZE) +#define G_FR_PAGE_SIZE(x) ((((x) >> S_FR_PAGE_SIZE)) & M_FR_PAGE_SIZE) + +#define S_FR_TYPE 8 +#define M_FR_TYPE 0x1 +#define V_FR_TYPE(x) ((x) << S_FR_TYPE) +#define G_FR_TYPE(x) ((((x) >> S_FR_TYPE)) & M_FR_TYPE) + +#define S_FR_PERMS 0 +#define M_FR_PERMS 0xff +#define V_FR_PERMS(x) ((x) << S_FR_PERMS) +#define G_FR_PERMS(x) ((((x) >> S_FR_PERMS)) & M_FR_PERMS) + +struct t3_local_inv_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 stag; /* 2 */ + __be32 reserved; +}; + +struct t3_rdma_write_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 stag_sink; + __be64 to_sink; /* 3 */ + __be32 plen; /* 4 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 5+ */ +}; + +struct t3_rdma_read_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 local_inv; + u8 reserved[2]; + __be32 rem_stag; + __be64 rem_to; /* 3 */ + __be32 local_stag; /* 4 */ + __be32 local_len; + __be64 local_to; /* 5 */ +}; + +struct t3_bind_mw_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u16 reserved; /* 2 */ + u8 type; + u8 perms; + __be32 mr_stag; + __be32 mw_stag; /* 3 */ + __be32 mw_len; + __be64 mw_va; /* 4 */ + __be32 mr_pbl_addr; /* 5 */ + u8 reserved2[3]; + u8 mr_pagesz; +}; + +struct t3_receive_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 pagesz[T3_MAX_SGE]; + __be32 num_sgle; /* 2 */ + struct t3_sge sgl[T3_MAX_SGE]; /* 3+ */ + __be32 pbl_addr[T3_MAX_SGE]; +}; + +struct t3_bypass_wr { + struct fw_riwrh wrh; + union t3_wrid wrid; /* 1 */ +}; + +struct t3_modify_qp_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 flags; /* 2 */ + __be32 quiesce; /* 2 */ + __be32 max_ird; /* 3 */ + __be32 max_ord; /* 3 */ + __be64 sge_cmd; /* 4 */ + __be64 ctx1; /* 5 */ + __be64 ctx0; /* 6 */ +}; + +enum t3_modify_qp_flags { + MODQP_QUIESCE = 0x01, + MODQP_MAX_IRD = 0x02, + MODQP_MAX_ORD = 0x04, + MODQP_WRITE_EC = 0x08, + MODQP_READ_EC = 0x10, +}; + + +enum t3_mpa_attrs { + uP_RI_MPA_RX_MARKER_ENABLE = 0x1, + uP_RI_MPA_TX_MARKER_ENABLE = 0x2, + uP_RI_MPA_CRC_ENABLE = 0x4, + uP_RI_MPA_IETF_ENABLE = 0x8 +} __attribute__ ((packed)); + +enum t3_qp_caps { + uP_RI_QP_RDMA_READ_ENABLE = 0x01, + uP_RI_QP_RDMA_WRITE_ENABLE = 0x02, + uP_RI_QP_BIND_ENABLE = 0x04, + uP_RI_QP_FAST_REGISTER_ENABLE = 0x08, + uP_RI_QP_STAG0_ENABLE = 0x10 +} __attribute__ ((packed)); + +enum rdma_init_rtr_types { + RTR_READ = 1, + RTR_WRITE = 2, + RTR_SEND = 3, +}; + +#define S_RTR_TYPE 2 +#define M_RTR_TYPE 0x3 +#define V_RTR_TYPE(x) ((x) << S_RTR_TYPE) +#define G_RTR_TYPE(x) ((((x) >> S_RTR_TYPE)) & M_RTR_TYPE) + +#define S_CHAN 4 +#define M_CHAN 0x3 +#define V_CHAN(x) ((x) << S_CHAN) +#define G_CHAN(x) ((((x) >> S_CHAN)) & M_CHAN) + +struct t3_rdma_init_attr { + u32 tid; + u32 qpid; + u32 pdid; + u32 scqid; + u32 rcqid; + u32 rq_addr; + u32 rq_size; + enum t3_mpa_attrs mpaattrs; + enum t3_qp_caps qpcaps; + u16 tcp_emss; + u32 ord; + u32 ird; + u64 qp_dma_addr; + u32 qp_dma_size; + enum rdma_init_rtr_types rtr_type; + u16 flags; + u16 rqe_count; + u32 irs; + u32 chan; +}; + 
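+/*
+ * t3_rdma_init_attr (above) carries host-order parameters which
+ * cxio_rdma_init() converts into the big-endian wire format of
+ * t3_rdma_init_wr (below).  Rough caller sketch -- the values on the
+ * right-hand side are hypothetical, for illustration only:
+ *
+ *	struct t3_rdma_init_attr init = {
+ *		.tid	 = hwtid,		// offloaded connection tid
+ *		.qpid	 = wq->qpid,
+ *		.scqid	 = send_cq->cqid,
+ *		.rcqid	 = recv_cq->cqid,
+ *		.rq_addr = wq->rq_addr,		// from cxio_hal_rqtpool_alloc()
+ *		.rq_size = 1 << wq->rq_size_log2,
+ *	};
+ *	err = cxio_rdma_init(rdev_p, &init);
+ */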
+struct t3_rdma_init_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 qpid; /* 2 */ + __be32 pdid; + __be32 scqid; /* 3 */ + __be32 rcqid; + __be32 rq_addr; /* 4 */ + __be32 rq_size; + u8 mpaattrs; /* 5 */ + u8 qpcaps; + __be16 ulpdu_size; + __be16 flags_rtr_type; + __be16 rqe_count; + __be32 ord; /* 6 */ + __be32 ird; + __be64 qp_dma_addr; /* 7 */ + __be32 qp_dma_size; /* 8 */ + __be32 irs; +}; + +struct t3_genbit { + u64 flit[15]; + __be64 genbit; +}; + +struct t3_wq_in_err { + u64 flit[13]; + u64 err; +}; + +enum rdma_init_wr_flags { + MPA_INITIATOR = (1<<0), + PRIV_QP = (1<<1), +}; + +union t3_wr { + struct t3_send_wr send; + struct t3_rdma_write_wr write; + struct t3_rdma_read_wr read; + struct t3_receive_wr recv; + struct t3_fastreg_wr fastreg; + struct t3_pbl_frag pbl_frag; + struct t3_local_inv_wr local_inv; + struct t3_bind_mw_wr bind; + struct t3_bypass_wr bypass; + struct t3_rdma_init_wr init; + struct t3_modify_qp_wr qp_mod; + struct t3_genbit genbit; + struct t3_wq_in_err wq_in_err; + __be64 flit[16]; +}; + +#define T3_SQ_CQE_FLIT 13 +#define T3_SQ_COOKIE_FLIT 14 + +#define T3_RQ_COOKIE_FLIT 13 +#define T3_RQ_CQE_FLIT 14 + +static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe) +{ + return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags)); +} + +enum t3_wr_hdr_bits { + T3_EOP = 1, + T3_SOP = 2, + T3_SOPEOP = T3_EOP|T3_SOP, +}; + +static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op, + enum t3_wr_flags flags, u8 genbit, u32 tid, + u8 len, u8 sopeop) +{ + wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) | + V_FW_RIWR_SOPEOP(sopeop) | + V_FW_RIWR_FLAGS(flags)); + wmb(); + wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) | + V_FW_RIWR_TID(tid) | + V_FW_RIWR_LEN(len)); + /* 2nd gen bit... 
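+	 * The wmb() above makes the earlier flits of the WR visible before
+	 * the generation bit carried in gen_tid_len; the same bit is
+	 * duplicated below in the final (16th) flit of the WR.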
*/ + ((union t3_wr *)wqe)->genbit.genbit = cpu_to_be64(genbit); +} + +/* + * T3 ULP2_TX commands + */ +enum t3_utx_mem_op { + T3_UTX_MEM_READ = 2, + T3_UTX_MEM_WRITE = 3 +}; + +/* T3 MC7 RDMA TPT entry format */ + +enum tpt_mem_type { + TPT_NON_SHARED_MR = 0x0, + TPT_SHARED_MR = 0x1, + TPT_MW = 0x2, + TPT_MW_RELAXED_PROTECTION = 0x3 +}; + +enum tpt_addr_type { + TPT_ZBTO = 0, + TPT_VATO = 1 +}; + +enum tpt_mem_perm { + TPT_MW_BIND = 0x10, + TPT_LOCAL_READ = 0x8, + TPT_LOCAL_WRITE = 0x4, + TPT_REMOTE_READ = 0x2, + TPT_REMOTE_WRITE = 0x1 +}; + +struct tpt_entry { + __be32 valid_stag_pdid; + __be32 flags_pagesize_qpid; + + __be32 rsvd_pbl_addr; + __be32 len; + __be32 va_hi; + __be32 va_low_or_fbo; + + __be32 rsvd_bind_cnt_or_pstag; + __be32 rsvd_pbl_size; +}; + +#define S_TPT_VALID 31 +#define V_TPT_VALID(x) ((x) << S_TPT_VALID) +#define F_TPT_VALID V_TPT_VALID(1U) + +#define S_TPT_STAG_KEY 23 +#define M_TPT_STAG_KEY 0xFF +#define V_TPT_STAG_KEY(x) ((x) << S_TPT_STAG_KEY) +#define G_TPT_STAG_KEY(x) (((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY) + +#define S_TPT_STAG_STATE 22 +#define V_TPT_STAG_STATE(x) ((x) << S_TPT_STAG_STATE) +#define F_TPT_STAG_STATE V_TPT_STAG_STATE(1U) + +#define S_TPT_STAG_TYPE 20 +#define M_TPT_STAG_TYPE 0x3 +#define V_TPT_STAG_TYPE(x) ((x) << S_TPT_STAG_TYPE) +#define G_TPT_STAG_TYPE(x) (((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE) + +#define S_TPT_PDID 0 +#define M_TPT_PDID 0xFFFFF +#define V_TPT_PDID(x) ((x) << S_TPT_PDID) +#define G_TPT_PDID(x) (((x) >> S_TPT_PDID) & M_TPT_PDID) + +#define S_TPT_PERM 28 +#define M_TPT_PERM 0xF +#define V_TPT_PERM(x) ((x) << S_TPT_PERM) +#define G_TPT_PERM(x) (((x) >> S_TPT_PERM) & M_TPT_PERM) + +#define S_TPT_REM_INV_DIS 27 +#define V_TPT_REM_INV_DIS(x) ((x) << S_TPT_REM_INV_DIS) +#define F_TPT_REM_INV_DIS V_TPT_REM_INV_DIS(1U) + +#define S_TPT_ADDR_TYPE 26 +#define V_TPT_ADDR_TYPE(x) ((x) << S_TPT_ADDR_TYPE) +#define F_TPT_ADDR_TYPE V_TPT_ADDR_TYPE(1U) + +#define S_TPT_MW_BIND_ENABLE 25 +#define V_TPT_MW_BIND_ENABLE(x) ((x) << S_TPT_MW_BIND_ENABLE) +#define F_TPT_MW_BIND_ENABLE V_TPT_MW_BIND_ENABLE(1U) + +#define S_TPT_PAGE_SIZE 20 +#define M_TPT_PAGE_SIZE 0x1F +#define V_TPT_PAGE_SIZE(x) ((x) << S_TPT_PAGE_SIZE) +#define G_TPT_PAGE_SIZE(x) (((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE) + +#define S_TPT_PBL_ADDR 0 +#define M_TPT_PBL_ADDR 0x1FFFFFFF +#define V_TPT_PBL_ADDR(x) ((x) << S_TPT_PBL_ADDR) +#define G_TPT_PBL_ADDR(x) (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR) + +#define S_TPT_QPID 0 +#define M_TPT_QPID 0xFFFFF +#define V_TPT_QPID(x) ((x) << S_TPT_QPID) +#define G_TPT_QPID(x) (((x) >> S_TPT_QPID) & M_TPT_QPID) + +#define S_TPT_PSTAG 0 +#define M_TPT_PSTAG 0xFFFFFF +#define V_TPT_PSTAG(x) ((x) << S_TPT_PSTAG) +#define G_TPT_PSTAG(x) (((x) >> S_TPT_PSTAG) & M_TPT_PSTAG) + +#define S_TPT_PBL_SIZE 0 +#define M_TPT_PBL_SIZE 0xFFFFF +#define V_TPT_PBL_SIZE(x) ((x) << S_TPT_PBL_SIZE) +#define G_TPT_PBL_SIZE(x) (((x) >> S_TPT_PBL_SIZE) & M_TPT_PBL_SIZE) + +/* + * CQE defs + */ +struct t3_cqe { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + u32 wrid_hi; + u32 wrid_low; + } scqe; + } u; +}; + +#define S_CQE_OOO 31 +#define M_CQE_OOO 0x1 +#define G_CQE_OOO(x) ((((x) >> S_CQE_OOO)) & M_CQE_OOO) +#define V_CEQ_OOO(x) ((x)<> S_CQE_QPID)) & M_CQE_QPID) +#define V_CQE_QPID(x) ((x)<> S_CQE_SWCQE)) & M_CQE_SWCQE) +#define V_CQE_SWCQE(x) ((x)<> S_CQE_GENBIT) & M_CQE_GENBIT) +#define V_CQE_GENBIT(x) ((x)<> S_CQE_STATUS)) & M_CQE_STATUS) +#define V_CQE_STATUS(x) ((x)<> S_CQE_TYPE)) & 
M_CQE_TYPE) +#define V_CQE_TYPE(x) ((x)<> S_CQE_OPCODE)) & M_CQE_OPCODE) +#define V_CQE_OPCODE(x) ((x)<queue[1 << cq->size_log2])->cq_err; +} + +static inline void cxio_set_cq_in_error(struct t3_cq *cq) +{ + ((struct t3_cq_status_page *) + &cq->queue[1 << cq->size_log2])->cq_err = 1; +} + +static inline void cxio_set_wq_in_error(struct t3_wq *wq) +{ + wq->queue->wq_in_err.err |= 1; +} + +static inline void cxio_disable_wq_db(struct t3_wq *wq) +{ + wq->queue->wq_in_err.err |= 2; +} + +static inline void cxio_enable_wq_db(struct t3_wq *wq) +{ + wq->queue->wq_in_err.err &= ~2; +} + +static inline int cxio_wq_db_enabled(struct t3_wq *wq) +{ + return !(wq->queue->wq_in_err.err & 2); +} + +static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + return NULL; +} + +static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c new file mode 100644 index 0000000..591de31 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include + +#include + +#include "cxgb3_offload.h" +#include "iwch_provider.h" +#include +#include "iwch.h" +#include "iwch_cm.h" + +#define DRV_VERSION "1.1" + +MODULE_AUTHOR("Boyd Faulkner, Steve Wise"); +MODULE_DESCRIPTION("Chelsio T3 RDMA Driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +static void open_rnic_dev(struct t3cdev *); +static void close_rnic_dev(struct t3cdev *); +static void iwch_event_handler(struct t3cdev *, u32, u32); + +struct cxgb3_client t3c_client = { + .name = "iw_cxgb3", + .add = open_rnic_dev, + .remove = close_rnic_dev, + .handlers = t3c_handlers, + .redirect = iwch_ep_redirect, + .event_handler = iwch_event_handler +}; + +static LIST_HEAD(dev_list); +static DEFINE_MUTEX(dev_mutex); + +static int disable_qp_db(int id, void *p, void *data) +{ + struct iwch_qp *qhp = p; + + cxio_disable_wq_db(&qhp->wq); + return 0; +} + +static int enable_qp_db(int id, void *p, void *data) +{ + struct iwch_qp *qhp = p; + + if (data) + ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell, qhp->wq.qpid); + cxio_enable_wq_db(&qhp->wq); + return 0; +} + +static void disable_dbs(struct iwch_dev *rnicp) +{ + spin_lock_irq(&rnicp->lock); + idr_for_each(&rnicp->qpidr, disable_qp_db, NULL); + spin_unlock_irq(&rnicp->lock); +} + +static void enable_dbs(struct iwch_dev *rnicp, int ring_db) +{ + spin_lock_irq(&rnicp->lock); + idr_for_each(&rnicp->qpidr, enable_qp_db, + (void *)(unsigned long)ring_db); + spin_unlock_irq(&rnicp->lock); +} + +static void iwch_db_drop_task(struct work_struct *work) +{ + struct iwch_dev *rnicp = container_of(work, struct iwch_dev, + db_drop_task.work); + enable_dbs(rnicp, 1); +} + +static void rnic_init(struct iwch_dev *rnicp) +{ + pr_debug("%s iwch_dev %p\n", __func__, rnicp); + idr_init(&rnicp->cqidr); + idr_init(&rnicp->qpidr); + idr_init(&rnicp->mmidr); + spin_lock_init(&rnicp->lock); + INIT_DELAYED_WORK(&rnicp->db_drop_task, iwch_db_drop_task); + + rnicp->attr.max_qps = T3_MAX_NUM_QP - 32; + rnicp->attr.max_wrs = T3_MAX_QP_DEPTH; + rnicp->attr.max_sge_per_wr = T3_MAX_SGE; + rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE; + rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1; + rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH; + rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev); + rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE; + rnicp->attr.max_pds = T3_MAX_NUM_PD - 1; + rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK; + rnicp->attr.max_mr_size = T3_MAX_MR_SIZE; + rnicp->attr.can_resize_wq = 0; + rnicp->attr.max_rdma_reads_per_qp = 8; + rnicp->attr.max_rdma_read_resources = + rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps; + rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */ + rnicp->attr.max_rdma_read_depth = + rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps; + rnicp->attr.rq_overflow_handled = 0; + rnicp->attr.can_modify_ird = 0; + rnicp->attr.can_modify_ord = 0; + rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1; + rnicp->attr.stag0_value = 1; + rnicp->attr.zbva_support = 1; + rnicp->attr.local_invalidate_fence = 1; + rnicp->attr.cq_overflow_detection = 1; + return; +} + +static void open_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *rnicp; + + pr_debug("%s t3cdev %p\n", __func__, tdev); + pr_info_once("Chelsio T3 RDMA Driver - version %s\n", DRV_VERSION); + rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); + if (!rnicp) { + pr_err("Cannot allocate ib device\n"); + return; + } + rnicp->rdev.ulp = rnicp; + rnicp->rdev.t3cdev_p = tdev; + + mutex_lock(&dev_mutex); + + if (cxio_rdev_open(&rnicp->rdev)) { + 
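+		/*
+		 * cxio_rdev_open() failed: drop dev_mutex and free the
+		 * ib_device allocated above before bailing out.
+		 */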
mutex_unlock(&dev_mutex); + pr_err("Unable to open CXIO rdev\n"); + ib_dealloc_device(&rnicp->ibdev); + return; + } + + rnic_init(rnicp); + + list_add_tail(&rnicp->entry, &dev_list); + mutex_unlock(&dev_mutex); + + if (iwch_register_device(rnicp)) { + pr_err("Unable to register device\n"); + close_rnic_dev(tdev); + } + pr_info("Initialized device %s\n", + pci_name(rnicp->rdev.rnic_info.pdev)); + return; +} + +static void close_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *dev, *tmp; + pr_debug("%s t3cdev %p\n", __func__, tdev); + mutex_lock(&dev_mutex); + list_for_each_entry_safe(dev, tmp, &dev_list, entry) { + if (dev->rdev.t3cdev_p == tdev) { + dev->rdev.flags = CXIO_ERROR_FATAL; + synchronize_net(); + cancel_delayed_work_sync(&dev->db_drop_task); + list_del(&dev->entry); + iwch_unregister_device(dev); + cxio_rdev_close(&dev->rdev); + idr_destroy(&dev->cqidr); + idr_destroy(&dev->qpidr); + idr_destroy(&dev->mmidr); + ib_dealloc_device(&dev->ibdev); + break; + } + } + mutex_unlock(&dev_mutex); +} + +static void iwch_event_handler(struct t3cdev *tdev, u32 evt, u32 port_id) +{ + struct cxio_rdev *rdev = tdev->ulp; + struct iwch_dev *rnicp; + struct ib_event event; + u32 portnum = port_id + 1; + int dispatch = 0; + + if (!rdev) + return; + rnicp = rdev_to_iwch_dev(rdev); + switch (evt) { + case OFFLOAD_STATUS_DOWN: { + rdev->flags = CXIO_ERROR_FATAL; + synchronize_net(); + event.event = IB_EVENT_DEVICE_FATAL; + dispatch = 1; + break; + } + case OFFLOAD_PORT_DOWN: { + event.event = IB_EVENT_PORT_ERR; + dispatch = 1; + break; + } + case OFFLOAD_PORT_UP: { + event.event = IB_EVENT_PORT_ACTIVE; + dispatch = 1; + break; + } + case OFFLOAD_DB_FULL: { + disable_dbs(rnicp); + break; + } + case OFFLOAD_DB_EMPTY: { + enable_dbs(rnicp, 1); + break; + } + case OFFLOAD_DB_DROP: { + unsigned long delay = 1000; + unsigned short r; + + disable_dbs(rnicp); + get_random_bytes(&r, 2); + delay += r & 1023; + + /* + * delay is between 1000-2023 usecs. + */ + schedule_delayed_work(&rnicp->db_drop_task, + usecs_to_jiffies(delay)); + break; + } + } + + if (dispatch) { + event.device = &rnicp->ibdev; + event.element.port_num = portnum; + ib_dispatch_event(&event); + } + + return; +} + +static int __init iwch_init_module(void) +{ + int err; + + err = cxio_hal_init(); + if (err) + return err; + err = iwch_cm_init(); + if (err) + return err; + cxio_register_ev_cb(iwch_ev_dispatch); + cxgb3_register_client(&t3c_client); + return 0; +} + +static void __exit iwch_exit_module(void) +{ + cxgb3_unregister_client(&t3c_client); + cxio_unregister_ev_cb(iwch_ev_dispatch); + iwch_cm_term(); + cxio_hal_exit(); +} + +module_init(iwch_init_module); +module_exit(iwch_exit_module); diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h new file mode 100644 index 0000000..8378622 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_H__ +#define __IWCH_H__ + +#include +#include +#include +#include +#include + +#include + +#include "cxio_hal.h" +#include "cxgb3_offload.h" + +struct iwch_pd; +struct iwch_cq; +struct iwch_qp; +struct iwch_mr; + +struct iwch_rnic_attributes { + u32 max_qps; + u32 max_wrs; /* Max for any SQ/RQ */ + u32 max_sge_per_wr; + u32 max_sge_per_rdma_write_wr; /* for RDMA Write WR */ + u32 max_cqs; + u32 max_cqes_per_cq; + u32 max_mem_regs; + u32 max_phys_buf_entries; /* for phys buf list */ + u32 max_pds; + + /* + * The memory page sizes supported by this RNIC. + * Bit position i in bitmap indicates page of + * size (4k)^i. Phys block list mode unsupported. + */ + u32 mem_pgsizes_bitmask; + u64 max_mr_size; + u8 can_resize_wq; + + /* + * The maximum number of RDMA Reads that can be outstanding + * per QP with this RNIC as the target. + */ + u32 max_rdma_reads_per_qp; + + /* + * The maximum number of resources used for RDMA Reads + * by this RNIC with this RNIC as the target. + */ + u32 max_rdma_read_resources; + + /* + * The max depth per QP for initiation of RDMA Read + * by this RNIC. 
+ */ + u32 max_rdma_read_qp_depth; + + /* + * The maximum depth for initiation of RDMA Read + * operations by this RNIC on all QPs + */ + u32 max_rdma_read_depth; + u8 rq_overflow_handled; + u32 can_modify_ird; + u32 can_modify_ord; + u32 max_mem_windows; + u32 stag0_value; + u8 zbva_support; + u8 local_invalidate_fence; + u32 cq_overflow_detection; +}; + +struct iwch_dev { + struct ib_device ibdev; + struct cxio_rdev rdev; + u32 device_cap_flags; + struct iwch_rnic_attributes attr; + struct idr cqidr; + struct idr qpidr; + struct idr mmidr; + spinlock_t lock; + struct list_head entry; + struct delayed_work db_drop_task; +}; + +static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct iwch_dev, ibdev); +} + +static inline struct iwch_dev *rdev_to_iwch_dev(struct cxio_rdev *rdev) +{ + return container_of(rdev, struct iwch_dev, rdev); +} + +static inline int t3b_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3B; +} + +static inline int t3a_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3A; +} + +static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid) +{ + return idr_find(&rhp->cqidr, cqid); +} + +static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid) +{ + return idr_find(&rhp->qpidr, qpid); +} + +static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid) +{ + return idr_find(&rhp->mmidr, mmid); +} + +static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&rhp->lock); + + ret = idr_alloc(idr, handle, id, id + 1, GFP_NOWAIT); + + spin_unlock_irq(&rhp->lock); + idr_preload_end(); + + BUG_ON(ret == -ENOSPC); + return ret < 0 ? ret : 0; +} + +static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id) +{ + spin_lock_irq(&rhp->lock); + idr_remove(idr, id); + spin_unlock_irq(&rhp->lock); +} + +extern struct cxgb3_client t3c_client; +extern cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; +extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb); + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c new file mode 100644 index 0000000..1c90c86 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -0,0 +1,2258 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "tcb.h" +#include "cxgb3_offload.h" +#include "iwch.h" +#include "iwch_provider.h" +#include "iwch_cm.h" + +static char *states[] = { + "idle", + "listen", + "connecting", + "mpa_wait_req", + "mpa_req_sent", + "mpa_req_rcvd", + "mpa_rep_sent", + "fpdu_mode", + "aborting", + "closing", + "moribund", + "dead", + NULL, +}; + +int peer2peer = 0; +module_param(peer2peer, int, 0644); +MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)"); + +static int ep_timeout_secs = 60; +module_param(ep_timeout_secs, int, 0644); +MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " + "in seconds (default=60)"); + +static int mpa_rev = 1; +module_param(mpa_rev, int, 0644); +MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " + "1 is spec compliant. (default=1)"); + +static int markers_enabled = 0; +module_param(markers_enabled, int, 0644); +MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)"); + +static int crc_enabled = 1; +module_param(crc_enabled, int, 0644); +MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)"); + +static int rcv_win = 256 * 1024; +module_param(rcv_win, int, 0644); +MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256)"); + +static int snd_win = 32 * 1024; +module_param(snd_win, int, 0644); +MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)"); + +static unsigned int nocong = 0; +module_param(nocong, uint, 0644); +MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)"); + +static unsigned int cong_flavor = 1; +module_param(cong_flavor, uint, 0644); +MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)"); + +static struct workqueue_struct *workq; + +static struct sk_buff_head rxq; + +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp); +static void ep_timeout(struct timer_list *t); +static void connect_reply_upcall(struct iwch_ep *ep, int status); + +static void start_ep_timer(struct iwch_ep *ep) +{ + pr_debug("%s ep %p\n", __func__, ep); + if (timer_pending(&ep->timer)) { + pr_debug("%s stopped / restarted timer ep %p\n", __func__, ep); + del_timer_sync(&ep->timer); + } else + get_ep(&ep->com); + ep->timer.expires = jiffies + ep_timeout_secs * HZ; + add_timer(&ep->timer); +} + +static void stop_ep_timer(struct iwch_ep *ep) +{ + pr_debug("%s ep %p\n", __func__, ep); + if (!timer_pending(&ep->timer)) { + WARN(1, "%s timer stopped when its not running! ep %p state %u\n", + __func__, ep, ep->com.state); + return; + } + del_timer_sync(&ep->timer); + put_ep(&ep->com); +} + +static int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_entry *l2e) +{ + int error = 0; + struct cxio_rdev *rdev; + + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + kfree_skb(skb); + return -EIO; + } + error = l2t_send(tdev, skb, l2e); + if (error < 0) + kfree_skb(skb); + return error < 0 ? 
error : 0; +} + +int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb) +{ + int error = 0; + struct cxio_rdev *rdev; + + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + kfree_skb(skb); + return -EIO; + } + error = cxgb3_ofld_send(tdev, skb); + if (error < 0) + kfree_skb(skb); + return error < 0 ? error : 0; +} + +static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb) +{ + struct cpl_tid_release *req; + + skb = get_skb(skb, sizeof *req, GFP_KERNEL); + if (!skb) + return; + req = skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid)); + skb->priority = CPL_PRIORITY_SETUP; + iwch_cxgb3_ofld_send(tdev, skb); + return; +} + +int iwch_quiesce_tid(struct iwch_ep *ep) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + req = skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE); + + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +int iwch_resume_tid(struct iwch_ep *ep) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + req = skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = 0; + + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +static void set_emss(struct iwch_ep *ep, u16 opt) +{ + pr_debug("%s ep %p opt %u\n", __func__, ep, opt); + ep->emss = T3C_DATA(ep->com.tdev)->mtus[G_TCPOPT_MSS(opt)] - 40; + if (G_TCPOPT_TSTAMP(opt)) + ep->emss -= 12; + if (ep->emss < 128) + ep->emss = 128; + pr_debug("emss=%d\n", ep->emss); +} + +static enum iwch_ep_state state_read(struct iwch_ep_common *epc) +{ + unsigned long flags; + enum iwch_ep_state state; + + spin_lock_irqsave(&epc->lock, flags); + state = epc->state; + spin_unlock_irqrestore(&epc->lock, flags); + return state; +} + +static void __state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + epc->state = new; +} + +static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + unsigned long flags; + + spin_lock_irqsave(&epc->lock, flags); + pr_debug("%s - %s -> %s\n", __func__, states[epc->state], states[new]); + __state_set(epc, new); + spin_unlock_irqrestore(&epc->lock, flags); + return; +} + +static void *alloc_ep(int size, gfp_t gfp) +{ + struct iwch_ep_common *epc; + + epc = kzalloc(size, gfp); + if (epc) { + kref_init(&epc->kref); + spin_lock_init(&epc->lock); + init_waitqueue_head(&epc->waitq); + } + pr_debug("%s alloc ep %p\n", __func__, epc); + return epc; +} + +void __free_ep(struct kref *kref) +{ + struct iwch_ep *ep; + ep = container_of(container_of(kref, struct iwch_ep_common, kref), + struct iwch_ep, com); + pr_debug("%s ep %p state %s\n", + __func__, ep, states[state_read(&ep->com)]); + if 
(test_bit(RELEASE_RESOURCES, &ep->com.flags)) { + cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid); + dst_release(ep->dst); + l2t_release(ep->com.tdev, ep->l2t); + } + kfree(ep); +} + +static void release_ep_resources(struct iwch_ep *ep) +{ + pr_debug("%s ep %p tid %d\n", __func__, ep, ep->hwtid); + set_bit(RELEASE_RESOURCES, &ep->com.flags); + put_ep(&ep->com); +} + +static int status2errno(int status) +{ + switch (status) { + case CPL_ERR_NONE: + return 0; + case CPL_ERR_CONN_RESET: + return -ECONNRESET; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +/* + * Try and reuse skbs already allocated... + */ +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) +{ + if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) { + skb_trim(skb, 0); + skb_get(skb); + } else { + skb = alloc_skb(len, gfp); + } + return skb; +} + +static struct rtable *find_route(struct t3cdev *dev, __be32 local_ip, + __be32 peer_ip, __be16 local_port, + __be16 peer_port, u8 tos) +{ + struct rtable *rt; + struct flowi4 fl4; + + rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip, + peer_port, local_port, IPPROTO_TCP, + tos, 0); + if (IS_ERR(rt)) + return NULL; + return rt; +} + +static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu) +{ + int i = 0; + + while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) + ++i; + return i; +} + +static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb) +{ + pr_debug("%s t3cdev %p\n", __func__, dev); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for an active open. + */ +static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + pr_err("ARP failure during connect\n"); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. 
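+ * Since the L2 entry was never resolved, an RST could not reach the
+ * peer anyway; the abort is still sent to the adapter so the
+ * connection state gets torn down.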
+ */ +static void abort_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req *req = cplhdr(skb); + + pr_debug("%s t3cdev %p\n", __func__, dev); + req->cmd = CPL_ABORT_NO_RST; + iwch_cxgb3_ofld_send(dev, skb); +} + +static int send_halfclose(struct iwch_ep *ep, gfp_t gfp) +{ + struct cpl_close_con_req *req; + struct sk_buff *skb; + + pr_debug("%s ep %p\n", __func__, ep); + skb = get_skb(NULL, sizeof(*req), gfp); + if (!skb) { + pr_err("%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, arp_failure_discard); + req = skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, ep->hwtid)); + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + struct cpl_abort_req *req; + + pr_debug("%s ep %p\n", __func__, ep); + skb = get_skb(skb, sizeof(*req), gfp); + if (!skb) { + pr_err("%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, abort_arp_failure); + req = skb_put_zero(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); + req->cmd = CPL_ABORT_SEND_RST; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int send_connect(struct iwch_ep *ep) +{ + struct cpl_act_open_req *req; + struct sk_buff *skb; + u32 opt0h, opt0l, opt2; + unsigned int mtu_idx; + int wscale; + + pr_debug("%s ep %p\n", __func__, ep); + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + pr_err("%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst)); + wscale = compute_wscale(rcv_win); + opt0h = V_NAGLE(0) | + V_NO_CONG(nocong) | + V_KEEP_ALIVE(1) | + F_TCAM_BYPASS | + V_WND_SCALE(wscale) | + V_MSS_IDX(mtu_idx) | + V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx); + opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10); + opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) | + V_CONG_CONTROL_FLAVOR(cong_flavor); + skb->priority = CPL_PRIORITY_SETUP; + set_arp_failure_handler(skb, act_open_req_arp_failure); + + req = skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ep->atid)); + req->local_port = ep->com.local_addr.sin_port; + req->peer_port = ep->com.remote_addr.sin_port; + req->local_ip = ep->com.local_addr.sin_addr.s_addr; + req->peer_ip = ep->com.remote_addr.sin_addr.s_addr; + req->opt0h = htonl(opt0h); + req->opt0l = htonl(opt0l); + req->params = 0; + req->opt2 = htonl(opt2); + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + int len; + + pr_debug("%s ep %p pd_len %d\n", __func__, ep, ep->plen); + + BUG_ON(skb_cloned(skb)); + + mpalen = sizeof(*mpa) + ep->plen; + if (skb->data + mpalen + sizeof(*req) > skb_end_pointer(skb)) { + kfree_skb(skb); + skb=alloc_skb(mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + connect_reply_upcall(ep, -ENOMEM); + return; + } + } + skb_trim(skb, 0); + skb_reserve(skb, sizeof(*req)); + 
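+	/*
+	 * Leave headroom for the tx_data_wr header that is pushed below,
+	 * then append the MPA start request (plus any private data).
+	 */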
skb_put(skb, mpalen); + skb->priority = CPL_PRIORITY_DATA; + mpa = (struct mpa_message *) skb->data; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); + mpa->flags = (crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->private_data_size = htons(ep->plen); + mpa->revision = mpa_rev; + + if (ep->plen) + memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + set_arp_failure_handler(skb, arp_failure_discard); + skb_reset_transport_header(skb); + len = skb->len; + req = skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(len); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + iwch_l2t_send(ep->com.tdev, skb, ep->l2t); + start_ep_timer(ep); + state_set(&ep->com, MPA_REQ_SENT); + return; +} + +static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + + pr_debug("%s ep %p plen %d\n", __func__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + pr_err("%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + skb_reserve(skb, sizeof(*req)); + mpa = skb_put(skb, mpalen); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = MPA_REJECT; + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb again. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, arp_failure_discard); + skb_reset_transport_header(skb); + req = skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(mpalen); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + int len; + struct sk_buff *skb; + + pr_debug("%s ep %p plen %d\n", __func__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + pr_err("%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + skb_reserve(skb, sizeof(*req)); + mpa = skb_put(skb, mpalen); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. 
+ * Function tx_ack() will deref it. + */ + skb_get(skb); + set_arp_failure_handler(skb, arp_failure_discard); + skb_reset_transport_header(skb); + len = skb->len; + req = skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(len); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + ep->mpa_skb = skb; + state_set(&ep->com, MPA_REP_SENT); + return iwch_l2t_send(ep->com.tdev, skb, ep->l2t); +} + +static int act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + + pr_debug("%s ep %p tid %d\n", __func__, ep, tid); + + dst_confirm(ep->dst); + + /* setup the hwtid for this connection */ + ep->hwtid = tid; + cxgb3_insert_tid(ep->com.tdev, &t3c_client, ep, tid); + + ep->snd_seq = ntohl(req->snd_isn); + ep->rcv_seq = ntohl(req->rcv_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + /* dealloc the atid */ + cxgb3_free_atid(ep->com.tdev, ep->atid); + + /* start MPA negotiation */ + send_mpa_req(ep, skb); + + return 0; +} + +static void abort_connection(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + pr_debug("%s ep %p\n", __FILE__, ep); + state_set(&ep->com, ABORTING); + send_abort(ep, skb, gfp); +} + +static void close_complete_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + pr_debug("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + if (ep->com.cm_id) { + pr_debug("close complete delivered ep %p cm_id %p tid %d\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void peer_close_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + pr_debug("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_DISCONNECT; + if (ep->com.cm_id) { + pr_debug("peer close delivered ep %p cm_id %p tid %d\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static void peer_abort_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + pr_debug("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = -ECONNRESET; + if (ep->com.cm_id) { + pr_debug("abort delivered ep %p cm_id %p tid %d\n", ep, + ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void connect_reply_upcall(struct iwch_ep *ep, int status) +{ + struct iw_cm_event event; + + pr_debug("%s ep %p status %d\n", __func__, ep, status); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REPLY; + event.status = status; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); + + if ((status == 0) || (status == -ECONNREFUSED)) { + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + } + if (ep->com.cm_id) { + pr_debug("%s ep %p tid %d status %d\n", __func__, ep, + ep->hwtid, status); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } + if (status < 0) { + 
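+		/*
+		 * The connect attempt failed: drop the cm_id reference and
+		 * detach the QP from this endpoint.
+		 */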
ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void connect_request_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + pr_debug("%s ep %p tid %d\n", __func__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.local_addr)); + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + event.provider_data = ep; + /* + * Until ird/ord negotiation via MPAv2 support is added, send max + * supported values + */ + event.ird = event.ord = 8; + if (state_read(&ep->parent_ep->com) != DEAD) { + get_ep(&ep->com); + ep->parent_ep->com.cm_id->event_handler( + ep->parent_ep->com.cm_id, + &event); + } + put_ep(&ep->parent_ep->com); + ep->parent_ep = NULL; +} + +static void established_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + pr_debug("%s ep %p\n", __func__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_ESTABLISHED; + /* + * Until ird/ord negotiation via MPAv2 support is added, send max + * supported values + */ + event.ird = event.ord = 8; + if (ep->com.cm_id) { + pr_debug("%s ep %p tid %d\n", __func__, ep, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static int update_rx_credits(struct iwch_ep *ep, u32 credits) +{ + struct cpl_rx_data_ack *req; + struct sk_buff *skb; + + pr_debug("%s ep %p credits %u\n", __func__, ep, credits); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + pr_err("update_rx_credits - cannot alloc skb!\n"); + return 0; + } + + req = skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, ep->hwtid)); + req->credit_dack = htonl(V_RX_CREDITS(credits) | V_RX_FORCE_ACK(1)); + skb->priority = CPL_PRIORITY_ACK; + iwch_cxgb3_ofld_send(ep->com.tdev, skb); + return credits; +} + +static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + u16 plen; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + int err; + + pr_debug("%s ep %p\n", __func__, ep); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_SENT) + return; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + err = -EINVAL; + goto err; + } + + /* + * copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * if we don't even have the mpa message, then bail. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* Validate MPA header. */ + if (mpa->revision != mpa_rev) { + err = -EPROTO; + goto err; + } + if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { + err = -EPROTO; + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. 
+ */ + if (plen > MPA_MAX_PRIVATE_DATA) { + err = -EPROTO; + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + err = -EPROTO; + goto err; + } + + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + if (mpa->flags & MPA_REJECT) { + err = -ECONNREFUSED; + goto err; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. And + * the MPA header is valid. + */ + state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.initiator = 1; + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + pr_debug("%s - crc_enabled=%d, recv_marker_enabled=%d, xmit_marker_enabled=%d, version=%d\n", + __func__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; + + /* bind QP and TID with INIT_WR */ + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err; + + if (peer2peer && iwch_rqes_posted(ep->com.qp) == 0) { + iwch_post_zb_read(ep); + } + + goto out; +err: + abort_connection(ep, skb, GFP_KERNEL); +out: + connect_reply_upcall(ep, err); + return; +} + +static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + u16 plen; + + pr_debug("%s ep %p\n", __func__, ep); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_WAIT) + return; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + pr_debug("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); + + /* + * Copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * If we don't even have the mpa message, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + pr_debug("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* + * Validate MPA Header. + */ + if (mpa->revision != mpa_rev) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. 
+ */ + if (plen > MPA_MAX_PRIVATE_DATA) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. + */ + ep->mpa_attr.initiator = 0; + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + pr_debug("%s - crc_enabled=%d, recv_marker_enabled=%d, xmit_marker_enabled=%d, version=%d\n", + __func__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + connect_request_upcall(ep); + return; +} + +static int rx_data(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_rx_data *hdr = cplhdr(skb); + unsigned int dlen = ntohs(hdr->len); + + pr_debug("%s ep %p dlen %u\n", __func__, ep, dlen); + + skb_pull(skb, sizeof(*hdr)); + skb_trim(skb, dlen); + + ep->rcv_seq += dlen; + BUG_ON(ep->rcv_seq != (ntohl(hdr->seq) + dlen)); + + switch (state_read(&ep->com)) { + case MPA_REQ_SENT: + process_mpa_reply(ep, skb); + break; + case MPA_REQ_WAIT: + process_mpa_request(ep, skb); + break; + case MPA_REP_SENT: + break; + default: + pr_err("%s Unexpected streaming data. ep %p state %d tid %d\n", + __func__, ep, state_read(&ep->com), ep->hwtid); + + /* + * The ep will timeout and inform the ULP of the failure. + * See ep_timeout(). + */ + break; + } + + /* update RX credits */ + update_rx_credits(ep, dlen); + + return CPL_RET_BUF_DONE; +} + +/* + * Upcall from the adapter indicating data has been transmitted. + * For us its just the single MPA request or reply. We can now free + * the skb holding the mpa message. 
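+ * A wr_ack that arrives with no MPA skb outstanding is the completion
+ * of the RDMA-init WR: the responder side wakes up the waiter on
+ * com.waitq, and the initiator may post the peer2peer zero-byte read.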
+ */ +static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_wr_ack *hdr = cplhdr(skb); + unsigned int credits = ntohs(hdr->credits); + unsigned long flags; + int post_zb = 0; + + pr_debug("%s ep %p credits %u\n", __func__, ep, credits); + + if (credits == 0) { + pr_debug("%s 0 credit ack ep %p state %u\n", + __func__, ep, state_read(&ep->com)); + return CPL_RET_BUF_DONE; + } + + spin_lock_irqsave(&ep->com.lock, flags); + BUG_ON(credits != 1); + dst_confirm(ep->dst); + if (!ep->mpa_skb) { + pr_debug("%s rdma_init wr_ack ep %p state %u\n", + __func__, ep, ep->com.state); + if (ep->mpa_attr.initiator) { + pr_debug("%s initiator ep %p state %u\n", + __func__, ep, ep->com.state); + if (peer2peer && ep->com.state == FPDU_MODE) + post_zb = 1; + } else { + pr_debug("%s responder ep %p state %u\n", + __func__, ep, ep->com.state); + if (ep->com.state == MPA_REQ_RCVD) { + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + } + } + } else { + pr_debug("%s lsm ack ep %p state %u freeing skb\n", + __func__, ep, ep->com.state); + kfree_skb(ep->mpa_skb); + ep->mpa_skb = NULL; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (post_zb) + iwch_post_zb_read(ep); + return CPL_RET_BUF_DONE; +} + +static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + unsigned long flags; + int release = 0; + + pr_debug("%s ep %p\n", __func__, ep); + BUG_ON(!ep); + + /* + * We get 2 abort replies from the HW. The first one must + * be ignored except for scribbling that we need one more. + */ + if (!test_and_set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags)) { + return CPL_RET_BUF_DONE; + } + + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case ABORTING: + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + default: + pr_err("%s ep %p state %d\n", __func__, ep, ep->com.state); + break; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * Return whether a failed active open has allocated a TID + */ +static inline int act_open_has_tid(int status) +{ + return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS; +} + +static int act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + + pr_debug("%s ep %p status %u errno %d\n", __func__, ep, rpl->status, + status2errno(rpl->status)); + connect_reply_upcall(ep, status2errno(rpl->status)); + state_set(&ep->com, DEAD); + if (ep->com.tdev->type != T3A && act_open_has_tid(rpl->status)) + release_tid(ep->com.tdev, GET_TID(rpl), NULL); + cxgb3_free_atid(ep->com.tdev, ep->atid); + dst_release(ep->dst); + l2t_release(ep->com.tdev, ep->l2t); + put_ep(&ep->com); + return CPL_RET_BUF_DONE; +} + +static int listen_start(struct iwch_listen_ep *ep) +{ + struct sk_buff *skb; + struct cpl_pass_open_req *req; + + pr_debug("%s ep %p\n", __func__, ep); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + pr_err("t3c_listen_start failed to alloc skb!\n"); + return -ENOMEM; + } + + req = skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid)); + req->local_port = ep->com.local_addr.sin_port; + req->local_ip = ep->com.local_addr.sin_addr.s_addr; + req->peer_port = 0; + req->peer_ip = 0; + req->peer_netmask = 0; + 
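+	/*
+	 * The peer address, port and netmask are wildcarded so this
+	 * server TID matches any incoming SYN for the local address/port.
+	 */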
req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS); + req->opt0l = htonl(V_RCV_BUFSIZ(rcv_win>>10)); + req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK)); + + skb->priority = 1; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_listen_ep *ep = ctx; + struct cpl_pass_open_rpl *rpl = cplhdr(skb); + + pr_debug("%s ep %p status %d error %d\n", __func__, ep, + rpl->status, status2errno(rpl->status)); + ep->com.rpl_err = status2errno(rpl->status); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + + return CPL_RET_BUF_DONE; +} + +static int listen_stop(struct iwch_listen_ep *ep) +{ + struct sk_buff *skb; + struct cpl_close_listserv_req *req; + + pr_debug("%s ep %p\n", __func__, ep); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + pr_err("%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + req = skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->cpu_idx = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid)); + skb->priority = 1; + return iwch_cxgb3_ofld_send(ep->com.tdev, skb); +} + +static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb, + void *ctx) +{ + struct iwch_listen_ep *ep = ctx; + struct cpl_close_listserv_rpl *rpl = cplhdr(skb); + + pr_debug("%s ep %p\n", __func__, ep); + ep->com.rpl_err = status2errno(rpl->status); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + return CPL_RET_BUF_DONE; +} + +static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb) +{ + struct cpl_pass_accept_rpl *rpl; + unsigned int mtu_idx; + u32 opt0h, opt0l, opt2; + int wscale; + + pr_debug("%s ep %p\n", __func__, ep); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(*rpl)); + skb_get(skb); + mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst)); + wscale = compute_wscale(rcv_win); + opt0h = V_NAGLE(0) | + V_NO_CONG(nocong) | + V_KEEP_ALIVE(1) | + F_TCAM_BYPASS | + V_WND_SCALE(wscale) | + V_MSS_IDX(mtu_idx) | + V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx); + opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10); + opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) | + V_CONG_CONTROL_FLAVOR(cong_flavor); + + rpl = cplhdr(skb); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, ep->hwtid)); + rpl->peer_ip = peer_ip; + rpl->opt0h = htonl(opt0h); + rpl->opt0l_status = htonl(opt0l | CPL_PASS_OPEN_ACCEPT); + rpl->opt2 = htonl(opt2); + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ + skb->priority = CPL_PRIORITY_SETUP; + iwch_l2t_send(ep->com.tdev, skb, ep->l2t); + + return; +} + +static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip, + struct sk_buff *skb) +{ + pr_debug("%s t3cdev %p tid %u peer_ip %x\n", __func__, tdev, hwtid, + peer_ip); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(struct cpl_tid_release)); + skb_get(skb); + + if (tdev->type != T3A) + release_tid(tdev, hwtid, skb); + else { + struct cpl_pass_accept_rpl *rpl; + + rpl = cplhdr(skb); + skb->priority = CPL_PRIORITY_SETUP; + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + hwtid)); + rpl->peer_ip = peer_ip; + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + rpl->opt2 = 0; + rpl->rsvd = rpl->opt2; + iwch_cxgb3_ofld_send(tdev, skb); + } +} + +static int pass_accept_req(struct t3cdev 
*tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *child_ep, *parent_ep = ctx; + struct cpl_pass_accept_req *req = cplhdr(skb); + unsigned int hwtid = GET_TID(req); + struct dst_entry *dst; + struct l2t_entry *l2t; + struct rtable *rt; + struct iff_mac tim; + + pr_debug("%s parent ep %p tid %u\n", __func__, parent_ep, hwtid); + + if (state_read(&parent_ep->com) != LISTEN) { + pr_err("%s - listening ep not in LISTEN\n", __func__); + goto reject; + } + + /* + * Find the netdev for this connection request. + */ + tim.mac_addr = req->dst_mac; + tim.vlan_tag = ntohs(req->vlan_tag); + if (tdev->ctl(tdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { + pr_err("%s bad dst mac %pM\n", __func__, req->dst_mac); + goto reject; + } + + /* Find output route */ + rt = find_route(tdev, + req->local_ip, + req->peer_ip, + req->local_port, + req->peer_port, G_PASS_OPEN_TOS(ntohl(req->tos_tid))); + if (!rt) { + pr_err("%s - failed to find dst entry!\n", __func__); + goto reject; + } + dst = &rt->dst; + l2t = t3_l2t_get(tdev, dst, NULL, &req->peer_ip); + if (!l2t) { + pr_err("%s - failed to allocate l2t entry!\n", __func__); + dst_release(dst); + goto reject; + } + child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL); + if (!child_ep) { + pr_err("%s - failed to allocate ep entry!\n", __func__); + l2t_release(tdev, l2t); + dst_release(dst); + goto reject; + } + state_set(&child_ep->com, CONNECTING); + child_ep->com.tdev = tdev; + child_ep->com.cm_id = NULL; + child_ep->com.local_addr.sin_family = AF_INET; + child_ep->com.local_addr.sin_port = req->local_port; + child_ep->com.local_addr.sin_addr.s_addr = req->local_ip; + child_ep->com.remote_addr.sin_family = AF_INET; + child_ep->com.remote_addr.sin_port = req->peer_port; + child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip; + get_ep(&parent_ep->com); + child_ep->parent_ep = parent_ep; + child_ep->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid)); + child_ep->l2t = l2t; + child_ep->dst = dst; + child_ep->hwtid = hwtid; + timer_setup(&child_ep->timer, ep_timeout, 0); + cxgb3_insert_tid(tdev, &t3c_client, child_ep, hwtid); + accept_cr(child_ep, req->peer_ip, skb); + goto out; +reject: + reject_cr(tdev, hwtid, req->peer_ip, skb); +out: + return CPL_RET_BUF_DONE; +} + +static int pass_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_pass_establish *req = cplhdr(skb); + + pr_debug("%s ep %p\n", __func__, ep); + ep->snd_seq = ntohl(req->snd_isn); + ep->rcv_seq = ntohl(req->rcv_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + dst_confirm(ep->dst); + state_set(&ep->com, MPA_REQ_WAIT); + start_ep_timer(ep); + + return CPL_RET_BUF_DONE; +} + +static int peer_close(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct iwch_qp_attributes attrs; + unsigned long flags; + int disconnect = 1; + int release = 0; + + pr_debug("%s ep %p\n", __func__, ep); + dst_confirm(ep->dst); + + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case MPA_REQ_WAIT: + __state_set(&ep->com, CLOSING); + break; + case MPA_REQ_SENT: + __state_set(&ep->com, CLOSING); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. Also wake up anyone waiting + * in rdma connection migration (see iwch_accept_cr()). 
+ */ + __state_set(&ep->com, CLOSING); + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + pr_debug("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MPA_REP_SENT: + __state_set(&ep->com, CLOSING); + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + pr_debug("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case FPDU_MODE: + start_ep_timer(ep); + __state_set(&ep->com, CLOSING); + attrs.next_state = IWCH_QP_STATE_CLOSING; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + peer_close_upcall(ep); + break; + case ABORTING: + disconnect = 0; + break; + case CLOSING: + __state_set(&ep->com, MORIBUND); + disconnect = 0; + break; + case MORIBUND: + stop_ep_timer(ep); + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + } + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + disconnect = 0; + break; + case DEAD: + disconnect = 0; + break; + default: + BUG_ON(1); + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (disconnect) + iwch_ep_disconnect(ep, 0, GFP_KERNEL); + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * Returns whether an ABORT_REQ_RSS message is a negative advice. + */ +static int is_neg_adv_abort(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE; +} + +static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct iwch_ep *ep = ctx; + struct cpl_abort_rpl *rpl; + struct sk_buff *rpl_skb; + struct iwch_qp_attributes attrs; + int ret; + int release = 0; + unsigned long flags; + + if (is_neg_adv_abort(req->status)) { + pr_debug("%s neg_adv_abort ep %p tid %d\n", __func__, ep, + ep->hwtid); + t3_l2t_send_event(ep->com.tdev, ep->l2t); + return CPL_RET_BUF_DONE; + } + + /* + * We get 2 peer aborts from the HW. The first one must + * be ignored except for scribbling that we need one more. + */ + if (!test_and_set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags)) { + return CPL_RET_BUF_DONE; + } + + spin_lock_irqsave(&ep->com.lock, flags); + pr_debug("%s ep %p state %u\n", __func__, ep, ep->com.state); + switch (ep->com.state) { + case CONNECTING: + break; + case MPA_REQ_WAIT: + stop_ep_timer(ep); + break; + case MPA_REQ_SENT: + stop_ep_timer(ep); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REP_SENT: + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + pr_debug("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. Also wake up anyone waiting + * in rdma connection migration (see iwch_accept_cr()). 
+ */ + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + pr_debug("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MORIBUND: + case CLOSING: + stop_ep_timer(ep); + /*FALLTHROUGH*/ + case FPDU_MODE: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + ret = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + pr_err("%s - qp <- error failed!\n", __func__); + } + peer_abort_upcall(ep); + break; + case ABORTING: + break; + case DEAD: + pr_debug("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__); + spin_unlock_irqrestore(&ep->com.lock, flags); + return CPL_RET_BUF_DONE; + default: + BUG_ON(1); + break; + } + dst_confirm(ep->dst); + if (ep->com.state != ABORTING) { + __state_set(&ep->com, DEAD); + release = 1; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + + rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); + if (!rpl_skb) { + pr_err("%s - cannot allocate skb!\n", __func__); + release = 1; + goto out; + } + rpl_skb->priority = CPL_PRIORITY_DATA; + rpl = skb_put(rpl_skb, sizeof(*rpl)); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid)); + rpl->cmd = CPL_ABORT_NO_RST; + iwch_cxgb3_ofld_send(ep->com.tdev, rpl_skb); +out: + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct iwch_qp_attributes attrs; + unsigned long flags; + int release = 0; + + pr_debug("%s ep %p\n", __func__, ep); + BUG_ON(!ep); + + /* The cm_id may be null if we failed to connect */ + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case CLOSING: + __state_set(&ep->com, MORIBUND); + break; + case MORIBUND: + stop_ep_timer(ep); + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + case ABORTING: + case DEAD: + break; + default: + BUG_ON(1); + break; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * T3A does 3 things when a TERM is received: + * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet + * 2) generate an async event on the QP with the TERMINATE opcode + * 3) post a TERMINATE opcode cqe into the associated CQ. + * + * For (1), we save the message in the qp for later consumer consumption. + * For (2), we move the QP into TERMINATE, post a QP event and disconnect. + * For (3), we toss the CQE in cxio_poll_cq(). + * + * terminate() handles case (1)... 
+ */ +static int terminate(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + + if (state_read(&ep->com) != FPDU_MODE) + return CPL_RET_BUF_DONE; + + pr_debug("%s ep %p\n", __func__, ep); + skb_pull(skb, sizeof(struct cpl_rdma_terminate)); + pr_debug("%s saving %d bytes of term msg\n", __func__, skb->len); + skb_copy_from_linear_data(skb, ep->com.qp->attr.terminate_buffer, + skb->len); + ep->com.qp->attr.terminate_msg_len = skb->len; + ep->com.qp->attr.is_terminate_local = 0; + return CPL_RET_BUF_DONE; +} + +static int ec_status(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_rdma_ec_status *rep = cplhdr(skb); + struct iwch_ep *ep = ctx; + + pr_debug("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid, + rep->status); + if (rep->status) { + struct iwch_qp_attributes attrs; + + pr_err("%s BAD CLOSE - Aborting tid %u\n", + __func__, ep->hwtid); + stop_ep_timer(ep); + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + abort_connection(ep, NULL, GFP_KERNEL); + } + return CPL_RET_BUF_DONE; +} + +static void ep_timeout(struct timer_list *t) +{ + struct iwch_ep *ep = from_timer(ep, t, timer); + struct iwch_qp_attributes attrs; + unsigned long flags; + int abort = 1; + + spin_lock_irqsave(&ep->com.lock, flags); + pr_debug("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid, + ep->com.state); + switch (ep->com.state) { + case MPA_REQ_SENT: + __state_set(&ep->com, ABORTING); + connect_reply_upcall(ep, -ETIMEDOUT); + break; + case MPA_REQ_WAIT: + __state_set(&ep->com, ABORTING); + break; + case CLOSING: + case MORIBUND: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + __state_set(&ep->com, ABORTING); + break; + default: + WARN(1, "%s unexpected state ep %p state %u\n", + __func__, ep, ep->com.state); + abort = 0; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (abort) + abort_connection(ep, NULL, GFP_ATOMIC); + put_ep(&ep->com); +} + +int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + struct iwch_ep *ep = to_ep(cm_id); + + pr_debug("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + + if (state_read(&ep->com) == DEAD) { + put_ep(&ep->com); + return -ECONNRESET; + } + BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + if (mpa_rev == 0) + abort_connection(ep, NULL, GFP_KERNEL); + else { + send_mpa_reject(ep, pdata, pdata_len); + iwch_ep_disconnect(ep, 0, GFP_KERNEL); + } + put_ep(&ep->com); + return 0; +} + +int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + struct iwch_ep *ep = to_ep(cm_id); + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_qp *qp = get_qhp(h, conn_param->qpn); + + pr_debug("%s ep %p tid %u\n", __func__, ep, ep->hwtid); + if (state_read(&ep->com) == DEAD) { + err = -ECONNRESET; + goto err; + } + + BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + BUG_ON(!qp); + + if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || + (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { + abort_connection(ep, NULL, GFP_KERNEL); + err = -EINVAL; + goto err; + } + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = qp; + + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (peer2peer && ep->ird == 0) + ep->ird = 1; + + pr_debug("%s %d ird %d ord %d\n", 
__func__, __LINE__, ep->ird, ep->ord); + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_MAX_ORD; + + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err1; + + /* if needed, wait for wr_ack */ + if (iwch_rqes_posted(qp)) { + wait_event(ep->com.waitq, ep->com.rpl_done); + err = ep->com.rpl_err; + if (err) + goto err1; + } + + err = send_mpa_reply(ep, conn_param->private_data, + conn_param->private_data_len); + if (err) + goto err1; + + + state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + put_ep(&ep->com); + return 0; +err1: + ep->com.cm_id = NULL; + ep->com.qp = NULL; + cm_id->rem_ref(cm_id); +err: + put_ep(&ep->com); + return err; +} + +static int is_loopback_dst(struct iw_cm_id *cm_id) +{ + struct net_device *dev; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; + + dev = ip_dev_find(&init_net, raddr->sin_addr.s_addr); + if (!dev) + return 0; + dev_put(dev); + return 1; +} + +int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_ep *ep; + struct rtable *rt; + int err = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; + + if (cm_id->m_remote_addr.ss_family != PF_INET) { + err = -ENOSYS; + goto out; + } + + if (is_loopback_dst(cm_id)) { + err = -ENOSYS; + goto out; + } + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + pr_err("%s - cannot alloc ep\n", __func__); + err = -ENOMEM; + goto out; + } + timer_setup(&ep->timer, ep_timeout, 0); + ep->plen = conn_param->private_data_len; + if (ep->plen) + memcpy(ep->mpa_pkt + sizeof(struct mpa_message), + conn_param->private_data, ep->plen); + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (peer2peer && ep->ord == 0) + ep->ord = 1; + + ep->com.tdev = h->rdev.t3cdev_p; + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = get_qhp(h, conn_param->qpn); + BUG_ON(!ep->com.qp); + pr_debug("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn, + ep->com.qp, cm_id); + + /* + * Allocate an active TID to initiate a TCP connection. 
+ */ + ep->atid = cxgb3_alloc_atid(h->rdev.t3cdev_p, &t3c_client, ep); + if (ep->atid == -1) { + pr_err("%s - cannot alloc atid\n", __func__); + err = -ENOMEM; + goto fail2; + } + + /* find a route */ + rt = find_route(h->rdev.t3cdev_p, laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, laddr->sin_port, + raddr->sin_port, IPTOS_LOWDELAY); + if (!rt) { + pr_err("%s - cannot find route\n", __func__); + err = -EHOSTUNREACH; + goto fail3; + } + ep->dst = &rt->dst; + ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst, NULL, + &raddr->sin_addr.s_addr); + if (!ep->l2t) { + pr_err("%s - cannot alloc l2e\n", __func__); + err = -ENOMEM; + goto fail4; + } + + state_set(&ep->com, CONNECTING); + ep->tos = IPTOS_LOWDELAY; + memcpy(&ep->com.local_addr, &cm_id->m_local_addr, + sizeof(ep->com.local_addr)); + memcpy(&ep->com.remote_addr, &cm_id->m_remote_addr, + sizeof(ep->com.remote_addr)); + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + l2t_release(h->rdev.t3cdev_p, ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + cxgb3_free_atid(ep->com.tdev, ep->atid); +fail2: + cm_id->rem_ref(cm_id); + put_ep(&ep->com); +out: + return err; +} + +int iwch_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int err = 0; + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_listen_ep *ep; + + + might_sleep(); + + if (cm_id->m_local_addr.ss_family != PF_INET) { + err = -ENOSYS; + goto fail1; + } + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + pr_err("%s - cannot alloc ep\n", __func__); + err = -ENOMEM; + goto fail1; + } + pr_debug("%s ep %p\n", __func__, ep); + ep->com.tdev = h->rdev.t3cdev_p; + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->backlog = backlog; + memcpy(&ep->com.local_addr, &cm_id->m_local_addr, + sizeof(ep->com.local_addr)); + + /* + * Allocate a server TID. 
+ */ + ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep); + if (ep->stid == -1) { + pr_err("%s - cannot alloc atid\n", __func__); + err = -ENOMEM; + goto fail2; + } + + state_set(&ep->com, LISTEN); + err = listen_start(ep); + if (err) + goto fail3; + + /* wait for pass_open_rpl */ + wait_event(ep->com.waitq, ep->com.rpl_done); + err = ep->com.rpl_err; + if (!err) { + cm_id->provider_data = ep; + goto out; + } +fail3: + cxgb3_free_stid(ep->com.tdev, ep->stid); +fail2: + cm_id->rem_ref(cm_id); + put_ep(&ep->com); +fail1: +out: + return err; +} + +int iwch_destroy_listen(struct iw_cm_id *cm_id) +{ + int err; + struct iwch_listen_ep *ep = to_listen_ep(cm_id); + + pr_debug("%s ep %p\n", __func__, ep); + + might_sleep(); + state_set(&ep->com, DEAD); + ep->com.rpl_done = 0; + ep->com.rpl_err = 0; + err = listen_stop(ep); + if (err) + goto done; + wait_event(ep->com.waitq, ep->com.rpl_done); + cxgb3_free_stid(ep->com.tdev, ep->stid); +done: + err = ep->com.rpl_err; + cm_id->rem_ref(cm_id); + put_ep(&ep->com); + return err; +} + +int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp) +{ + int ret=0; + unsigned long flags; + int close = 0; + int fatal = 0; + struct t3cdev *tdev; + struct cxio_rdev *rdev; + + spin_lock_irqsave(&ep->com.lock, flags); + + pr_debug("%s ep %p state %s, abrupt %d\n", __func__, ep, + states[ep->com.state], abrupt); + + tdev = (struct t3cdev *)ep->com.tdev; + rdev = (struct cxio_rdev *)tdev->ulp; + if (cxio_fatal_error(rdev)) { + fatal = 1; + close_complete_upcall(ep); + ep->com.state = DEAD; + } + switch (ep->com.state) { + case MPA_REQ_WAIT: + case MPA_REQ_SENT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + close = 1; + if (abrupt) + ep->com.state = ABORTING; + else { + ep->com.state = CLOSING; + start_ep_timer(ep); + } + set_bit(CLOSE_SENT, &ep->com.flags); + break; + case CLOSING: + if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { + close = 1; + if (abrupt) { + stop_ep_timer(ep); + ep->com.state = ABORTING; + } else + ep->com.state = MORIBUND; + } + break; + case MORIBUND: + case ABORTING: + case DEAD: + pr_debug("%s ignoring disconnect ep %p state %u\n", + __func__, ep, ep->com.state); + break; + default: + BUG(); + break; + } + + spin_unlock_irqrestore(&ep->com.lock, flags); + if (close) { + if (abrupt) + ret = send_abort(ep, NULL, gfp); + else + ret = send_halfclose(ep, gfp); + if (ret) + fatal = 1; + } + if (fatal) + release_ep_resources(ep); + return ret; +} + +int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, + struct l2t_entry *l2t) +{ + struct iwch_ep *ep = ctx; + + if (ep->dst != old) + return 0; + + pr_debug("%s ep %p redirect to dst %p l2t %p\n", __func__, ep, new, + l2t); + dst_hold(new); + l2t_release(ep->com.tdev, ep->l2t); + ep->l2t = l2t; + dst_release(old); + ep->dst = new; + return 1; +} + +/* + * All the CM events are handled on a work queue to have a safe context. + * These are the real handlers that are called from the work queue. 
+ */ +static const cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = act_establish, + [CPL_ACT_OPEN_RPL] = act_open_rpl, + [CPL_RX_DATA] = rx_data, + [CPL_TX_DMA_ACK] = tx_ack, + [CPL_ABORT_RPL_RSS] = abort_rpl, + [CPL_ABORT_RPL] = abort_rpl, + [CPL_PASS_OPEN_RPL] = pass_open_rpl, + [CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl, + [CPL_PASS_ACCEPT_REQ] = pass_accept_req, + [CPL_PASS_ESTABLISH] = pass_establish, + [CPL_PEER_CLOSE] = peer_close, + [CPL_ABORT_REQ_RSS] = peer_abort, + [CPL_CLOSE_CON_RPL] = close_con_rpl, + [CPL_RDMA_TERMINATE] = terminate, + [CPL_RDMA_EC_STATUS] = ec_status, +}; + +static void process_work(struct work_struct *work) +{ + struct sk_buff *skb = NULL; + void *ep; + struct t3cdev *tdev; + int ret; + + while ((skb = skb_dequeue(&rxq))) { + ep = *((void **) (skb->cb)); + tdev = *((struct t3cdev **) (skb->cb + sizeof(void *))); + ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep); + if (ret & CPL_RET_BUF_DONE) + kfree_skb(skb); + + /* + * ep was referenced in sched(), and is freed here. + */ + put_ep((struct iwch_ep_common *)ep); + } +} + +static DECLARE_WORK(skb_work, process_work); + +static int sched(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep_common *epc = ctx; + + get_ep(epc); + + /* + * Save ctx and tdev in the skb->cb area. + */ + *((void **) skb->cb) = ctx; + *((struct t3cdev **) (skb->cb + sizeof(void *))) = tdev; + + /* + * Queue the skb and schedule the worker thread. + */ + skb_queue_tail(&rxq, skb); + queue_work(workq, &skb_work); + return 0; +} + +static int set_tcb_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_set_tcb_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) { + pr_err("Unexpected SET_TCB_RPL status %u for tid %u\n", + rpl->status, GET_TID(rpl)); + } + return CPL_RET_BUF_DONE; +} + +/* + * All upcalls from the T3 Core go to sched() to schedule the + * processing on a work queue. + */ +cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = sched, + [CPL_ACT_OPEN_RPL] = sched, + [CPL_RX_DATA] = sched, + [CPL_TX_DMA_ACK] = sched, + [CPL_ABORT_RPL_RSS] = sched, + [CPL_ABORT_RPL] = sched, + [CPL_PASS_OPEN_RPL] = sched, + [CPL_CLOSE_LISTSRV_RPL] = sched, + [CPL_PASS_ACCEPT_REQ] = sched, + [CPL_PASS_ESTABLISH] = sched, + [CPL_PEER_CLOSE] = sched, + [CPL_CLOSE_CON_RPL] = sched, + [CPL_ABORT_REQ_RSS] = sched, + [CPL_RDMA_TERMINATE] = sched, + [CPL_RDMA_EC_STATUS] = sched, + [CPL_SET_TCB_RPL] = set_tcb_rpl, +}; + +int __init iwch_cm_init(void) +{ + skb_queue_head_init(&rxq); + + workq = alloc_ordered_workqueue("iw_cxgb3", WQ_MEM_RECLAIM); + if (!workq) + return -ENOMEM; + + return 0; +} + +void __exit iwch_cm_term(void) +{ + flush_workqueue(workq); + destroy_workqueue(workq); +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h new file mode 100644 index 0000000..cc7fe64 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IWCH_CM_H_ +#define _IWCH_CM_H_ + +#include +#include +#include +#include + +#include +#include + +#include "cxgb3_offload.h" +#include "iwch_provider.h" + +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" + +#define MPA_MAX_PRIVATE_DATA 256 +#define MPA_REV 0 /* XXX - amso1100 uses rev 0 ! */ +#define MPA_REJECT 0x20 +#define MPA_CRC 0x40 +#define MPA_MARKERS 0x80 +#define MPA_FLAGS_MASK 0xE0 + +#define put_ep(ep) { \ + pr_debug("put_ep (via %s:%u) ep %p refcnt %d\n", \ + __func__, __LINE__, ep, kref_read(&((ep)->kref))); \ + WARN_ON(kref_read(&((ep)->kref)) < 1); \ + kref_put(&((ep)->kref), __free_ep); \ +} + +#define get_ep(ep) { \ + pr_debug("get_ep (via %s:%u) ep %p, refcnt %d\n", \ + __func__, __LINE__, ep, kref_read(&((ep)->kref))); \ + kref_get(&((ep)->kref)); \ +} + +struct mpa_message { + u8 key[16]; + u8 flags; + u8 revision; + __be16 private_data_size; + u8 private_data[0]; +}; + +struct terminate_message { + u8 layer_etype; + u8 ecode; + __be16 hdrct_rsvd; + u8 len_hdrs[0]; +}; + +#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) + +enum iwch_layers_types { + LAYER_RDMAP = 0x00, + LAYER_DDP = 0x10, + LAYER_MPA = 0x20, + RDMAP_LOCAL_CATA = 0x00, + RDMAP_REMOTE_PROT = 0x01, + RDMAP_REMOTE_OP = 0x02, + DDP_LOCAL_CATA = 0x00, + DDP_TAGGED_ERR = 0x01, + DDP_UNTAGGED_ERR = 0x02, + DDP_LLP = 0x03 +}; + +enum iwch_rdma_ecodes { + RDMAP_INV_STAG = 0x00, + RDMAP_BASE_BOUNDS = 0x01, + RDMAP_ACC_VIOL = 0x02, + RDMAP_STAG_NOT_ASSOC = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_VERS = 0x05, + RDMAP_INV_OPCODE = 0x06, + RDMAP_STREAM_CATA = 0x07, + RDMAP_GLOBAL_CATA = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum iwch_ddp_ecodes { + DDPT_INV_STAG = 0x00, + DDPT_BASE_BOUNDS = 0x01, + DDPT_STAG_NOT_ASSOC = 0x02, + DDPT_TO_WRAP = 0x03, + DDPT_INV_VERS = 0x04, + DDPU_INV_QN = 0x01, + DDPU_INV_MSN_NOBUF = 0x02, + DDPU_INV_MSN_RANGE = 0x03, + DDPU_INV_MO = 0x04, + DDPU_MSG_TOOBIG = 0x05, + DDPU_INV_VERS = 0x06 +}; + +enum iwch_mpa_ecodes { + MPA_CRC_ERR = 0x02, + MPA_MARKER_ERR = 0x03 +}; + +enum iwch_ep_state { + IDLE = 0, + LISTEN, + CONNECTING, + MPA_REQ_WAIT, + MPA_REQ_SENT, + MPA_REQ_RCVD, + MPA_REP_SENT, + FPDU_MODE, + ABORTING, + CLOSING, + MORIBUND, + DEAD, +}; + +enum 
iwch_ep_flags { + PEER_ABORT_IN_PROGRESS = 0, + ABORT_REQ_IN_PROGRESS = 1, + RELEASE_RESOURCES = 2, + CLOSE_SENT = 3, +}; + +struct iwch_ep_common { + struct iw_cm_id *cm_id; + struct iwch_qp *qp; + struct t3cdev *tdev; + enum iwch_ep_state state; + struct kref kref; + spinlock_t lock; + struct sockaddr_in local_addr; + struct sockaddr_in remote_addr; + wait_queue_head_t waitq; + int rpl_done; + int rpl_err; + unsigned long flags; +}; + +struct iwch_listen_ep { + struct iwch_ep_common com; + unsigned int stid; + int backlog; +}; + +struct iwch_ep { + struct iwch_ep_common com; + struct iwch_ep *parent_ep; + struct timer_list timer; + unsigned int atid; + u32 hwtid; + u32 snd_seq; + u32 rcv_seq; + struct l2t_entry *l2t; + struct dst_entry *dst; + struct sk_buff *mpa_skb; + struct iwch_mpa_attributes mpa_attr; + unsigned int mpa_pkt_len; + u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; + u8 tos; + u16 emss; + u16 plen; + u32 ird; + u32 ord; +}; + +static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline int compute_wscale(int win) +{ + int wscale = 0; + + while (wscale < 14 && (65535<<wscale) < win) + wscale++; + return wscale; +}
[remainder of iwch_cm.h and the head of drivers/infiniband/hw/cxgb3/iwch_cq.c (diff header, license block, includes) missing from this capture]
+static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, + struct ib_wc *wc) +{ + struct iwch_qp *qhp = NULL; + struct t3_cqe cqe, *rd_cqe; + struct t3_wq *wq; + u32 credit = 0; + u8 cqe_flushed; + u64 cookie; + int ret = 1; + + rd_cqe = cxio_next_cqe(&chp->cq); + + if (!rd_cqe) + return 0; + + qhp = get_qhp(rhp, CQE_QPID(*rd_cqe)); + if (!qhp) + wq = NULL; + else { + spin_lock(&qhp->lock); + wq = &(qhp->wq); + } + ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, + &credit); + if (t3a_device(chp->rhp) && credit) { + pr_debug("%s updating %d cq credits on id %d\n", __func__, + credit, chp->cq.cqid); + cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit); + } + + if (ret) { + ret = -EAGAIN; + goto out; + } + ret = 1; + + wc->wr_id = cookie; + wc->qp = &qhp->ibqp; + wc->vendor_err = CQE_STATUS(cqe); + wc->wc_flags = 0; + + pr_debug("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x lo 0x%x cookie 0x%llx\n", + __func__, + CQE_QPID(cqe), CQE_TYPE(cqe), + CQE_OPCODE(cqe), CQE_STATUS(cqe), CQE_WRID_HI(cqe), + CQE_WRID_LOW(cqe), (unsigned long long)cookie); + + if (CQE_TYPE(cqe) == 0) { + if (!CQE_STATUS(cqe)) + wc->byte_len = CQE_LEN(cqe); + else + wc->byte_len = 0; + wc->opcode = IB_WC_RECV; + if (CQE_OPCODE(cqe) == T3_SEND_WITH_INV || + CQE_OPCODE(cqe) == T3_SEND_WITH_SE_INV) { + wc->ex.invalidate_rkey = CQE_WRID_STAG(cqe); + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + } else { + switch (CQE_OPCODE(cqe)) { + case T3_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case T3_READ_REQ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = CQE_LEN(cqe); + break; + case T3_SEND: + case T3_SEND_WITH_SE: + case T3_SEND_WITH_INV: + case T3_SEND_WITH_SE_INV: + wc->opcode = IB_WC_SEND; + break; + case T3_LOCAL_INV: + wc->opcode = IB_WC_LOCAL_INV; + break; + case T3_FAST_REGISTER: + wc->opcode = IB_WC_REG_MR; + break; + default: + pr_err("Unexpected opcode %d in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IB_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(cqe)) { + case TPT_ERR_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case TPT_ERR_STAG: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_PDID: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_WRAP: + wc->status = IB_WC_GENERAL_ERR; + break; + case TPT_ERR_BOUND: + wc->status = IB_WC_LOC_LEN_ERR; +
break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IB_WC_MW_BIND_ERR; + break; + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + case TPT_ERR_OPCODE: + wc->status = IB_WC_FATAL_ERR; + break; + case TPT_ERR_SWFLUSH: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + default: + pr_err("Unexpected cqe_status 0x%x for QPID=0x%0x\n", + CQE_STATUS(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + } + } +out: + if (wq) + spin_unlock(&qhp->lock); + return ret; +} + +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + unsigned long flags; + int npolled; + int err = 0; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + + spin_lock_irqsave(&chp->lock, flags); + for (npolled = 0; npolled < num_entries; ++npolled) { + + /* + * Because T3 can post CQEs that are _not_ associated + * with a WR, we might have to poll again after removing + * one of these. + */ + do { + err = iwch_poll_cq_one(rhp, chp, wc + npolled); + } while (err == -EAGAIN); + if (err <= 0) + break; + } + spin_unlock_irqrestore(&chp->lock, flags); + + if (err < 0) + return err; + else { + return npolled; + } +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c new file mode 100644 index 0000000..4a0c82a --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_ev.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include "iwch_provider.h" +#include "iwch.h" +#include "iwch_cm.h" +#include "cxio_hal.h" +#include "cxio_wr.h" + +static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp, + struct respQ_msg_t *rsp_msg, + enum ib_event_type ib_event, + int send_term) +{ + struct ib_event event; + struct iwch_qp_attributes attrs; + struct iwch_qp *qhp; + unsigned long flag; + + spin_lock(&rnicp->lock); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + + if (!qhp) { + pr_err("%s unaffiliated error 0x%x qpid 0x%x\n", + __func__, CQE_STATUS(rsp_msg->cqe), + CQE_QPID(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + return; + } + + if ((qhp->attr.state == IWCH_QP_STATE_ERROR) || + (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) { + pr_debug("%s AE received after RTS - qp state %d qpid 0x%x status 0x%x\n", + __func__, + qhp->attr.state, qhp->wq.qpid, + CQE_STATUS(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + return; + } + + pr_err("%s - AE qpid 0x%x opcode %d status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n", + __func__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + + atomic_inc(&qhp->refcnt); + spin_unlock(&rnicp->lock); + + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_TERMINATE; + iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (send_term) + iwch_post_terminate(qhp, rsp_msg); + } + + event.event = ib_event; + event.device = chp->ibcq.device; + if (ib_event == IB_EVENT_CQ_ERR) + event.element.cq = &chp->ibcq; + else + event.element.qp = &qhp->ibqp; + + if (qhp->ibqp.event_handler) + (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + + if (atomic_dec_and_test(&qhp->refcnt)) + wake_up(&qhp->wait); +} + +void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb) +{ + struct iwch_dev *rnicp; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data; + struct iwch_cq *chp; + struct iwch_qp *qhp; + u32 cqid = RSPQ_CQID(rsp_msg); + unsigned long flag; + + rnicp = (struct iwch_dev *) rdev_p->ulp; + spin_lock(&rnicp->lock); + chp = get_chp(rnicp, cqid); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + if (!chp || !qhp) { + pr_err("BAD AE cqid 0x%x qpid 0x%x opcode %d status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n", + cqid, CQE_QPID(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), + CQE_WRID_LOW(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + goto out; + } + iwch_qp_add_ref(&qhp->ibqp); + atomic_inc(&chp->refcnt); + spin_unlock(&rnicp->lock); + + /* + * 1) completion of our sending a TERMINATE. + * 2) incoming TERMINATE message. 
+ */ + if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) && + (CQE_STATUS(rsp_msg->cqe) == 0)) { + if (SQ_TYPE(rsp_msg->cqe)) { + pr_debug("%s QPID 0x%x ep %p disconnecting\n", + __func__, qhp->wq.qpid, qhp->ep); + iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC); + } else { + pr_debug("%s post REQ_ERR AE QPID 0x%x\n", __func__, + qhp->wq.qpid); + post_qp_event(rnicp, chp, rsp_msg, + IB_EVENT_QP_REQ_ERR, 0); + iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC); + } + goto done; + } + + /* Bad incoming Read request */ + if (SQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) { + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + /* Bad incoming write */ + if (RQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) { + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + switch (CQE_STATUS(rsp_msg->cqe)) { + + /* Completion Events */ + case TPT_ERR_SUCCESS: + + /* + * Confirm the destination entry if this is a RECV completion. + */ + if (qhp->ep && SQ_TYPE(rsp_msg->cqe)) + dst_confirm(qhp->ep->dst); + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + break; + + case TPT_ERR_STAG: + case TPT_ERR_PDID: + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + case TPT_ERR_WRAP: + case TPT_ERR_BOUND: + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1); + break; + + /* Device Fatal Errors */ + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1); + break; + + /* QP Fatal Errors */ + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_PBL_ADDR_BOUND: + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_OPCODE: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_RQE_ADDR_BOUND: + case TPT_ERR_IRD_OVERFLOW: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + + default: + pr_err("Unknown T3 status 0x%x QPID 0x%x\n", + CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid); + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + } +done: + if (atomic_dec_and_test(&chp->refcnt)) + wake_up(&chp->wait); + iwch_qp_rem_ref(&qhp->ibqp); +out: + dev_kfree_skb_irq(skb); +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c new file mode 100644 index 0000000..12886b1 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include + +#include +#include + +#include "cxio_hal.h" +#include "cxio_resource.h" +#include "iwch.h" +#include "iwch_provider.h" + +static int iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag) +{ + u32 mmid; + + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + pr_debug("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); +} + +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, int shift) +{ + u32 stag; + int ret; + + if (cxio_register_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) + return -ENOMEM; + + ret = iwch_finish_mem_reg(mhp, stag); + if (ret) + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + return ret; +} + +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages) +{ + mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev, + npages << 3); + + if (!mhp->attr.pbl_addr) + return -ENOMEM; + + mhp->attr.pbl_size = npages; + + return 0; +} + +void iwch_free_pbl(struct iwch_mr *mhp) +{ + cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +} + +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset) +{ + return cxio_write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (offset << 3), npages); +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c new file mode 100644 index 0000000..be097c6 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -0,0 +1,1475 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "cxio_hal.h" +#include "iwch.h" +#include "iwch_provider.h" +#include "iwch_cm.h" +#include +#include "common.h" + +static struct ib_ah *iwch_ah_create(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) +{ + return ERR_PTR(-ENOSYS); +} + +static int iwch_ah_destroy(struct ib_ah *ah) +{ + return -ENOSYS; +} + +static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int iwch_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, + size_t in_mad_size, + struct ib_mad_hdr *out_mad, + size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + return -ENOSYS; +} + +static int iwch_dealloc_ucontext(struct ib_ucontext *context) +{ + struct iwch_dev *rhp = to_iwch_dev(context->device); + struct iwch_ucontext *ucontext = to_iwch_ucontext(context); + struct iwch_mm_entry *mm, *tmp; + + pr_debug("%s context %p\n", __func__, context); + list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) + kfree(mm); + cxio_release_ucontext(&rhp->rdev, &ucontext->uctx); + kfree(ucontext); + return 0; +} + +static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct iwch_ucontext *context; + struct iwch_dev *rhp = to_iwch_dev(ibdev); + + pr_debug("%s ibdev %p\n", __func__, ibdev); + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + cxio_init_ucontext(&rhp->rdev, &context->uctx); + INIT_LIST_HEAD(&context->mmaps); + spin_lock_init(&context->mmap_lock); + return &context->ibucontext; +} + +static int iwch_destroy_cq(struct ib_cq *ib_cq) +{ + struct iwch_cq *chp; + + pr_debug("%s ib_cq %p\n", __func__, ib_cq); + chp = to_iwch_cq(ib_cq); + + remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); + atomic_dec(&chp->refcnt); + wait_event(chp->wait, !atomic_read(&chp->refcnt)); + + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + kfree(chp); + return 0; +} + +static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ib_context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + struct iwch_dev *rhp; + struct iwch_cq *chp; + struct iwch_create_cq_resp uresp; + struct iwch_create_cq_req ureq; + struct iwch_ucontext *ucontext = NULL; + static int warned; + size_t resplen; + + pr_debug("%s ib_dev %p entries %d\n", __func__, ibdev, entries); + if (attr->flags) + return ERR_PTR(-EINVAL); + + rhp = to_iwch_dev(ibdev); + chp = kzalloc(sizeof(*chp), GFP_KERNEL); + if (!chp) + return ERR_PTR(-ENOMEM); + + if (ib_context) { + ucontext = 
to_iwch_ucontext(ib_context); + if (!t3a_device(rhp)) { + if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) { + kfree(chp); + return ERR_PTR(-EFAULT); + } + chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr; + } + } + + if (t3a_device(rhp)) { + + /* + * T3A: Add some fluff to handle extra CQEs inserted + * for various errors. + * Additional CQE possibilities: + * TERMINATE, + * incoming RDMA WRITE Failures + * incoming RDMA READ REQUEST FAILUREs + * NOTE: We cannot ensure the CQ won't overflow. + */ + entries += 16; + } + entries = roundup_pow_of_two(entries); + chp->cq.size_log2 = ilog2(entries); + + if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) { + kfree(chp); + return ERR_PTR(-ENOMEM); + } + chp->rhp = rhp; + chp->ibcq.cqe = 1 << chp->cq.size_log2; + spin_lock_init(&chp->lock); + spin_lock_init(&chp->comp_handler_lock); + atomic_set(&chp->refcnt, 1); + init_waitqueue_head(&chp->wait); + if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) { + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + kfree(chp); + return ERR_PTR(-ENOMEM); + } + + if (ucontext) { + struct iwch_mm_entry *mm; + + mm = kmalloc(sizeof *mm, GFP_KERNEL); + if (!mm) { + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-ENOMEM); + } + uresp.cqid = chp->cq.cqid; + uresp.size_log2 = chp->cq.size_log2; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + mm->key = uresp.key; + mm->addr = virt_to_phys(chp->cq.queue); + if (udata->outlen < sizeof uresp) { + if (!warned++) + pr_warn("Warning - downlevel libcxgb3 (non-fatal)\n"); + mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * + sizeof(struct t3_cqe)); + resplen = sizeof(struct iwch_create_cq_resp_v0); + } else { + mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) * + sizeof(struct t3_cqe)); + uresp.memsize = mm->len; + uresp.reserved = 0; + resplen = sizeof uresp; + } + if (ib_copy_to_udata(udata, &uresp, resplen)) { + kfree(mm); + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-EFAULT); + } + insert_mmap(ucontext, mm); + } + pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n", + chp->cq.cqid, chp, (1 << chp->cq.size_log2), + (unsigned long long)chp->cq.dma_addr); + return &chp->ibcq; +} + +static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ +#ifdef notyet + struct iwch_cq *chp = to_iwch_cq(cq); + struct t3_cq oldcq, newcq; + int ret; + + pr_debug("%s ib_cq %p cqe %d\n", __func__, cq, cqe); + + /* We don't downsize... 
*/ + if (cqe <= cq->cqe) + return 0; + + /* create new t3_cq with new size */ + cqe = roundup_pow_of_two(cqe+1); + newcq.size_log2 = ilog2(cqe); + + /* Dont allow resize to less than the current wce count */ + if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) { + return -ENOMEM; + } + + /* Quiesce all QPs using this CQ */ + ret = iwch_quiesce_qps(chp); + if (ret) { + return ret; + } + + ret = cxio_create_cq(&chp->rhp->rdev, &newcq); + if (ret) { + return ret; + } + + /* copy CQEs */ + memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) * + sizeof(struct t3_cqe)); + + /* old iwch_qp gets new t3_cq but keeps old cqid */ + oldcq = chp->cq; + chp->cq = newcq; + chp->cq.cqid = oldcq.cqid; + + /* resize new t3_cq to update the HW context */ + ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq); + if (ret) { + chp->cq = oldcq; + return ret; + } + chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1; + + /* destroy old t3_cq */ + oldcq.cqid = newcq.cqid; + ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq); + if (ret) { + pr_err("%s - cxio_destroy_cq failed %d\n", __func__, ret); + } + + /* add user hooks here */ + + /* resume qps */ + ret = iwch_resume_qps(chp); + return ret; +#else + return -ENOSYS; +#endif +} + +static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + enum t3_cq_opcode cq_op; + int err; + unsigned long flag; + u32 rptr; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + cq_op = CQ_ARM_SE; + else + cq_op = CQ_ARM_AN; + if (chp->user_rptr_addr) { + if (get_user(rptr, chp->user_rptr_addr)) + return -EFAULT; + spin_lock_irqsave(&chp->lock, flag); + chp->cq.rptr = rptr; + } else + spin_lock_irqsave(&chp->lock, flag); + pr_debug("%s rptr 0x%x\n", __func__, chp->cq.rptr); + err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0); + spin_unlock_irqrestore(&chp->lock, flag); + if (err < 0) + pr_err("Error %d rearming CQID 0x%x\n", err, chp->cq.cqid); + if (err > 0 && !(flags & IB_CQ_REPORT_MISSED_EVENTS)) + err = 0; + return err; +} + +static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + int len = vma->vm_end - vma->vm_start; + u32 key = vma->vm_pgoff << PAGE_SHIFT; + struct cxio_rdev *rdev_p; + int ret = 0; + struct iwch_mm_entry *mm; + struct iwch_ucontext *ucontext; + u64 addr; + + pr_debug("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff, + key, len); + + if (vma->vm_start & (PAGE_SIZE-1)) { + return -EINVAL; + } + + rdev_p = &(to_iwch_dev(context->device)->rdev); + ucontext = to_iwch_ucontext(context); + + mm = remove_mmap(ucontext, key, len); + if (!mm) + return -EINVAL; + addr = mm->addr; + kfree(mm); + + if ((addr >= rdev_p->rnic_info.udbell_physbase) && + (addr < (rdev_p->rnic_info.udbell_physbase + + rdev_p->rnic_info.udbell_len))) { + + /* + * Map T3 DB register. + */ + if (vma->vm_flags & VM_READ) { + return -EPERM; + } + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_flags &= ~VM_MAYREAD; + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else { + + /* + * Map WQ or CQ contig dma memory... 
+ */ + ret = remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + + return ret; +} + +static int iwch_deallocate_pd(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + + php = to_iwch_pd(pd); + rhp = php->rhp; + pr_debug("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid); + cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); + kfree(php); + return 0; +} + +static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct iwch_pd *php; + u32 pdid; + struct iwch_dev *rhp; + + pr_debug("%s ibdev %p\n", __func__, ibdev); + rhp = (struct iwch_dev *) ibdev; + pdid = cxio_hal_get_pdid(rhp->rdev.rscp); + if (!pdid) + return ERR_PTR(-EINVAL); + php = kzalloc(sizeof(*php), GFP_KERNEL); + if (!php) { + cxio_hal_put_pdid(rhp->rdev.rscp, pdid); + return ERR_PTR(-ENOMEM); + } + php->pdid = pdid; + php->rhp = rhp; + if (context) { + struct iwch_alloc_pd_resp resp = {.pdid = php->pdid}; + + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + iwch_deallocate_pd(&php->ibpd); + return ERR_PTR(-EFAULT); + } + } + pr_debug("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php); + return &php->ibpd; +} + +static int iwch_dereg_mr(struct ib_mr *ib_mr) +{ + struct iwch_dev *rhp; + struct iwch_mr *mhp; + u32 mmid; + + pr_debug("%s ib_mr %p\n", __func__, ib_mr); + + mhp = to_iwch_mr(ib_mr); + kfree(mhp->pages); + rhp = mhp->rhp; + mmid = mhp->attr.stag >> 8; + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + iwch_free_pbl(mhp); + remove_handle(rhp, &rhp->mmidr, mmid); + if (mhp->kva) + kfree((void *) (unsigned long) mhp->kva); + if (mhp->umem) + ib_umem_release(mhp->umem); + pr_debug("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp); + kfree(mhp); + return 0; +} + +static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) +{ + const u64 total_size = 0xffffffff; + const u64 mask = (total_size + PAGE_SIZE - 1) & PAGE_MASK; + struct iwch_pd *php = to_iwch_pd(pd); + struct iwch_dev *rhp = php->rhp; + struct iwch_mr *mhp; + __be64 *page_list; + int shift = 26, npages, ret, i; + + pr_debug("%s ib_pd %p\n", __func__, pd); + + /* + * T3 only supports 32 bits of size. 
+ */ + if (sizeof(phys_addr_t) > 4) { + pr_warn_once("Cannot support dma_mrs on this platform\n"); + return ERR_PTR(-ENOTSUPP); + } + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + + npages = (total_size + (1ULL << shift) - 1) >> shift; + if (!npages) { + ret = -EINVAL; + goto err; + } + + page_list = kmalloc_array(npages, sizeof(u64), GFP_KERNEL); + if (!page_list) { + ret = -ENOMEM; + goto err; + } + + for (i = 0; i < npages; i++) + page_list[i] = cpu_to_be64((u64)i << shift); + + pr_debug("%s mask 0x%llx shift %d len %lld pbl_size %d\n", + __func__, mask, shift, total_size, npages); + + ret = iwch_alloc_pbl(mhp, npages); + if (ret) { + kfree(page_list); + goto err_pbl; + } + + ret = iwch_write_pbl(mhp, page_list, npages, 0); + kfree(page_list); + if (ret) + goto err_pbl; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + mhp->attr.va_fbo = 0; + mhp->attr.page_size = shift - 12; + + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + ret = iwch_register_mem(rhp, php, mhp, shift); + if (ret) + goto err_pbl; + + return &mhp->ibmr; + +err_pbl: + iwch_free_pbl(mhp); + +err: + kfree(mhp); + return ERR_PTR(ret); +} + +static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + __be64 *pages; + int shift, n, len; + int i, k, entry; + int err = 0; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + struct iwch_reg_user_mr_resp uresp; + struct scatterlist *sg; + pr_debug("%s ib_pd %p\n", __func__, pd); + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->rhp = rhp; + + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(mhp->umem)) { + err = PTR_ERR(mhp->umem); + kfree(mhp); + return ERR_PTR(err); + } + + shift = mhp->umem->page_shift; + + n = mhp->umem->nmap; + + err = iwch_alloc_pbl(mhp, n); + if (err) + goto err; + + pages = (__be64 *) __get_free_page(GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err_pbl; + } + + i = n = 0; + + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address(sg) + + (k << shift)); + if (i == PAGE_SIZE / sizeof *pages) { + err = iwch_write_pbl(mhp, pages, i, n); + if (err) + goto pbl_done; + n += i; + i = 0; + } + } + } + + if (i) + err = iwch_write_pbl(mhp, pages, i, n); + +pbl_done: + free_page((unsigned long) pages); + if (err) + goto err_pbl; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + mhp->attr.va_fbo = virt; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) length; + + err = iwch_register_mem(rhp, php, mhp, shift); + if (err) + goto err_pbl; + + if (udata && !t3a_device(rhp)) { + uresp.pbl_addr = (mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3; + pr_debug("%s user resp pbl_addr 0x%x\n", __func__, + uresp.pbl_addr); + + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + iwch_dereg_mr(&mhp->ibmr); + err = -EFAULT; + goto err; + } + } + + return &mhp->ibmr; + +err_pbl: + iwch_free_pbl(mhp); + +err: + ib_umem_release(mhp->umem); + kfree(mhp); + return ERR_PTR(err); +} + +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mw 
*mhp; + u32 mmid; + u32 stag = 0; + int ret; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid); + if (ret) { + kfree(mhp); + return ERR_PTR(ret); + } + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.type = TPT_MW; + mhp->attr.stag = stag; + mmid = (stag) >> 8; + mhp->ibmw.rkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + kfree(mhp); + return ERR_PTR(-ENOMEM); + } + pr_debug("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); + return &(mhp->ibmw); +} + +static int iwch_dealloc_mw(struct ib_mw *mw) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + u32 mmid; + + mhp = to_iwch_mw(mw); + rhp = mhp->rhp; + mmid = (mw->rkey) >> 8; + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + remove_handle(rhp, &rhp->mmidr, mmid); + pr_debug("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp); + kfree(mhp); + return 0; +} + +static struct ib_mr *iwch_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + u32 mmid; + u32 stag = 0; + int ret = -ENOMEM; + + if (mr_type != IB_MR_TYPE_MEM_REG || + max_num_sg > T3_MAX_FASTREG_DEPTH) + return ERR_PTR(-EINVAL); + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + goto err; + + mhp->pages = kcalloc(max_num_sg, sizeof(u64), GFP_KERNEL); + if (!mhp->pages) + goto pl_err; + + mhp->rhp = rhp; + ret = iwch_alloc_pbl(mhp, max_num_sg); + if (ret) + goto err1; + mhp->attr.pbl_size = max_num_sg; + ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid, + mhp->attr.pbl_size, mhp->attr.pbl_addr); + if (ret) + goto err2; + mhp->attr.pdid = php->pdid; + mhp->attr.type = TPT_NON_SHARED_MR; + mhp->attr.stag = stag; + mhp->attr.state = 1; + mmid = (stag) >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + ret = insert_handle(rhp, &rhp->mmidr, mhp, mmid); + if (ret) + goto err3; + + pr_debug("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag); + return &(mhp->ibmr); +err3: + cxio_dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); +err2: + iwch_free_pbl(mhp); +err1: + kfree(mhp->pages); +pl_err: + kfree(mhp); +err: + return ERR_PTR(ret); +} + +static int iwch_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct iwch_mr *mhp = to_iwch_mr(ibmr); + + if (unlikely(mhp->npages == mhp->attr.pbl_size)) + return -ENOMEM; + + mhp->pages[mhp->npages++] = addr; + + return 0; +} + +static int iwch_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct iwch_mr *mhp = to_iwch_mr(ibmr); + + mhp->npages = 0; + + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, iwch_set_page); +} + +static int iwch_destroy_qp(struct ib_qp *ib_qp) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_qp_attributes attrs; + struct iwch_ucontext *ucontext; + + qhp = to_iwch_qp(ib_qp); + rhp = qhp->rhp; + + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0); + wait_event(qhp->wait, !qhp->ep); + + remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid); + + atomic_dec(&qhp->refcnt); + wait_event(qhp->wait, !atomic_read(&qhp->refcnt)); + + ucontext = ib_qp->uobject ? 
to_iwch_ucontext(ib_qp->uobject->context) + : NULL; + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + + pr_debug("%s ib_qp %p qpid 0x%0x qhp %p\n", __func__, + ib_qp, qhp->wq.qpid, qhp); + kfree(qhp); + return 0; +} + +static struct ib_qp *iwch_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_pd *php; + struct iwch_cq *schp; + struct iwch_cq *rchp; + struct iwch_create_qp_resp uresp; + int wqsize, sqsize, rqsize; + struct iwch_ucontext *ucontext; + + pr_debug("%s ib_pd %p\n", __func__, pd); + if (attrs->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + php = to_iwch_pd(pd); + rhp = php->rhp; + schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid); + rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid); + if (!schp || !rchp) + return ERR_PTR(-EINVAL); + + /* The RQT size must be # of entries + 1 rounded up to a power of two */ + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr); + if (rqsize == attrs->cap.max_recv_wr) + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1); + + /* T3 doesn't support RQT depth < 16 */ + if (rqsize < 16) + rqsize = 16; + + if (rqsize > T3_MAX_RQ_SIZE) + return ERR_PTR(-EINVAL); + + if (attrs->cap.max_inline_data > T3_MAX_INLINE) + return ERR_PTR(-EINVAL); + + /* + * NOTE: The SQ and total WQ sizes don't need to be + * a power of two. However, all the code assumes + * they are. EG: Q_FREECNT() and friends. + */ + sqsize = roundup_pow_of_two(attrs->cap.max_send_wr); + wqsize = roundup_pow_of_two(rqsize + sqsize); + + /* + * Kernel users need more wq space for fastreg WRs which can take + * 2 WR fragments. + */ + ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL; + if (!ucontext && wqsize < (rqsize + (2 * sqsize))) + wqsize = roundup_pow_of_two(rqsize + + roundup_pow_of_two(attrs->cap.max_send_wr * 2)); + pr_debug("%s wqsize %d sqsize %d rqsize %d\n", __func__, + wqsize, sqsize, rqsize); + qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); + if (!qhp) + return ERR_PTR(-ENOMEM); + qhp->wq.size_log2 = ilog2(wqsize); + qhp->wq.rq_size_log2 = ilog2(rqsize); + qhp->wq.sq_size_log2 = ilog2(sqsize); + if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) { + kfree(qhp); + return ERR_PTR(-ENOMEM); + } + + attrs->cap.max_recv_wr = rqsize - 1; + attrs->cap.max_send_wr = sqsize; + attrs->cap.max_inline_data = T3_MAX_INLINE; + + qhp->rhp = rhp; + qhp->attr.pd = php->pdid; + qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid; + qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid; + qhp->attr.sq_num_entries = attrs->cap.max_send_wr; + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.sq_max_sges = attrs->cap.max_send_sge; + qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.next_state = IWCH_QP_STATE_IDLE; + + /* + * XXX - These don't get passed in from the openib user + * at create time. The CM sets them via a QP modify. + * Need to fix... 
I think the CM should + */ + qhp->attr.enable_rdma_read = 1; + qhp->attr.enable_rdma_write = 1; + qhp->attr.enable_bind = 1; + qhp->attr.max_ord = 1; + qhp->attr.max_ird = 1; + + spin_lock_init(&qhp->lock); + init_waitqueue_head(&qhp->wait); + atomic_set(&qhp->refcnt, 1); + + if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) { + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + kfree(qhp); + return ERR_PTR(-ENOMEM); + } + + if (udata) { + + struct iwch_mm_entry *mm1, *mm2; + + mm1 = kmalloc(sizeof *mm1, GFP_KERNEL); + if (!mm1) { + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + if (!mm2) { + kfree(mm1); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + uresp.qpid = qhp->wq.qpid; + uresp.size_log2 = qhp->wq.size_log2; + uresp.sq_size_log2 = qhp->wq.sq_size_log2; + uresp.rq_size_log2 = qhp->wq.rq_size_log2; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.db_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + kfree(mm1); + kfree(mm2); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-EFAULT); + } + mm1->key = uresp.key; + mm1->addr = virt_to_phys(qhp->wq.queue); + mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr)); + insert_mmap(ucontext, mm1); + mm2->key = uresp.db_key; + mm2->addr = qhp->wq.udb & PAGE_MASK; + mm2->len = PAGE_SIZE; + insert_mmap(ucontext, mm2); + } + qhp->ibqp.qp_num = qhp->wq.qpid; + pr_debug("%s sq_num_entries %d, rq_num_entries %d qpid 0x%0x qhp %p dma_addr 0x%llx size %d rq_addr 0x%x\n", + __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, + qhp->wq.qpid, qhp, (unsigned long long)qhp->wq.dma_addr, + 1 << qhp->wq.size_log2, qhp->wq.rq_addr); + return &qhp->ibqp; +} + +static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + enum iwch_qp_attr_mask mask = 0; + struct iwch_qp_attributes attrs; + + pr_debug("%s ib_qp %p\n", __func__, ibqp); + + /* iwarp does not support the RTR state */ + if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR)) + attr_mask &= ~IB_QP_STATE; + + /* Make sure we still have something left to do */ + if (!attr_mask) + return 0; + + memset(&attrs, 0, sizeof attrs); + qhp = to_iwch_qp(ibqp); + rhp = qhp->rhp; + + attrs.next_state = iwch_convert_state(attr->qp_state); + attrs.enable_rdma_read = (attr->qp_access_flags & + IB_ACCESS_REMOTE_READ) ? 1 : 0; + attrs.enable_rdma_write = (attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; + + + mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0; + mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? 
+ (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0; + + return iwch_modify_qp(rhp, qhp, mask, &attrs, 0); +} + +void iwch_qp_add_ref(struct ib_qp *qp) +{ + pr_debug("%s ib_qp %p\n", __func__, qp); + atomic_inc(&(to_iwch_qp(qp)->refcnt)); +} + +void iwch_qp_rem_ref(struct ib_qp *qp) +{ + pr_debug("%s ib_qp %p\n", __func__, qp); + if (atomic_dec_and_test(&(to_iwch_qp(qp)->refcnt))) + wake_up(&(to_iwch_qp(qp)->wait)); +} + +static struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn) +{ + pr_debug("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn); + return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn); +} + + +static int iwch_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 * pkey) +{ + pr_debug("%s ibdev %p\n", __func__, ibdev); + *pkey = 0; + return 0; +} + +static int iwch_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct iwch_dev *dev; + + pr_debug("%s ibdev %p, port %d, index %d, gid %p\n", + __func__, ibdev, port, index, gid); + dev = to_iwch_dev(ibdev); + BUG_ON(port == 0 || port > 2); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), dev->rdev.port_info.lldevs[port-1]->dev_addr, 6); + return 0; +} + +static u64 fw_vers_string_to_u64(struct iwch_dev *iwch_dev) +{ + struct ethtool_drvinfo info; + struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; + char *cp, *next; + unsigned fw_maj, fw_min, fw_mic; + + lldev->ethtool_ops->get_drvinfo(lldev, &info); + + next = info.fw_version + 1; + cp = strsep(&next, "."); + sscanf(cp, "%i", &fw_maj); + cp = strsep(&next, "."); + sscanf(cp, "%i", &fw_min); + cp = strsep(&next, "."); + sscanf(cp, "%i", &fw_mic); + + return (((u64)fw_maj & 0xffff) << 32) | ((fw_min & 0xffff) << 16) | + (fw_mic & 0xffff); +} + +static int iwch_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + + struct iwch_dev *dev; + + pr_debug("%s ibdev %p\n", __func__, ibdev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + dev = to_iwch_dev(ibdev); + memset(props, 0, sizeof *props); + memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); + props->hw_ver = dev->rdev.t3cdev_p->type; + props->fw_ver = fw_vers_string_to_u64(dev); + props->device_cap_flags = dev->device_cap_flags; + props->page_size_cap = dev->attr.mem_pgsizes_bitmask; + props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor; + props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device; + props->max_mr_size = dev->attr.max_mr_size; + props->max_qp = dev->attr.max_qps; + props->max_qp_wr = dev->attr.max_wrs; + props->max_sge = dev->attr.max_sge_per_wr; + props->max_sge_rd = 1; + props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_qp_init_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_cq = dev->attr.max_cqs; + props->max_cqe = dev->attr.max_cqes_per_cq; + props->max_mr = dev->attr.max_mem_regs; + props->max_pd = dev->attr.max_pds; + props->local_ca_ack_delay = 0; + props->max_fast_reg_page_list_len = T3_MAX_FASTREG_DEPTH; + + return 0; +} + +static int iwch_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + struct iwch_dev *dev; + struct net_device *netdev; + struct in_device *inetdev; + + pr_debug("%s ibdev %p\n", __func__, ibdev); + + dev = to_iwch_dev(ibdev); + netdev = dev->rdev.port_info.lldevs[port-1]; + + /* props being zeroed by the caller, avoid zeroing it here */ + props->max_mtu = IB_MTU_4096; + props->active_mtu = 
ib_mtu_int_to_enum(netdev->mtu); + + if (!netif_carrier_ok(netdev)) + props->state = IB_PORT_DOWN; + else { + inetdev = in_dev_get(netdev); + if (inetdev) { + if (inetdev->ifa_list) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_INIT; + in_dev_put(inetdev); + } else + props->state = IB_PORT_INIT; + } + + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_width = 2; + props->active_speed = IB_SPEED_DDR; + props->max_msg_sz = -1; + + return 0; +} + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + pr_debug("%s dev 0x%p\n", __func__, dev); + return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type); +} + +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + struct ethtool_drvinfo info; + struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; + + pr_debug("%s dev 0x%p\n", __func__, dev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.driver); +} + +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, + ibdev.dev); + pr_debug("%s dev 0x%p\n", __func__, dev); + return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor, + iwch_dev->rdev.rnic_info.pdev->device); +} + +enum counters { + IPINRECEIVES, + IPINHDRERRORS, + IPINADDRERRORS, + IPINUNKNOWNPROTOS, + IPINDISCARDS, + IPINDELIVERS, + IPOUTREQUESTS, + IPOUTDISCARDS, + IPOUTNOROUTES, + IPREASMTIMEOUT, + IPREASMREQDS, + IPREASMOKS, + IPREASMFAILS, + TCPACTIVEOPENS, + TCPPASSIVEOPENS, + TCPATTEMPTFAILS, + TCPESTABRESETS, + TCPCURRESTAB, + TCPINSEGS, + TCPOUTSEGS, + TCPRETRANSSEGS, + TCPINERRS, + TCPOUTRSTS, + TCPRTOMIN, + TCPRTOMAX, + NR_COUNTERS +}; + +static const char * const names[] = { + [IPINRECEIVES] = "ipInReceives", + [IPINHDRERRORS] = "ipInHdrErrors", + [IPINADDRERRORS] = "ipInAddrErrors", + [IPINUNKNOWNPROTOS] = "ipInUnknownProtos", + [IPINDISCARDS] = "ipInDiscards", + [IPINDELIVERS] = "ipInDelivers", + [IPOUTREQUESTS] = "ipOutRequests", + [IPOUTDISCARDS] = "ipOutDiscards", + [IPOUTNOROUTES] = "ipOutNoRoutes", + [IPREASMTIMEOUT] = "ipReasmTimeout", + [IPREASMREQDS] = "ipReasmReqds", + [IPREASMOKS] = "ipReasmOKs", + [IPREASMFAILS] = "ipReasmFails", + [TCPACTIVEOPENS] = "tcpActiveOpens", + [TCPPASSIVEOPENS] = "tcpPassiveOpens", + [TCPATTEMPTFAILS] = "tcpAttemptFails", + [TCPESTABRESETS] = "tcpEstabResets", + [TCPCURRESTAB] = "tcpCurrEstab", + [TCPINSEGS] = "tcpInSegs", + [TCPOUTSEGS] = "tcpOutSegs", + [TCPRETRANSSEGS] = "tcpRetransSegs", + [TCPINERRS] = "tcpInErrs", + [TCPOUTRSTS] = "tcpOutRsts", + [TCPRTOMIN] = "tcpRtoMin", + [TCPRTOMAX] = "tcpRtoMax", +}; + +static struct rdma_hw_stats *iwch_alloc_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS); + + /* Our driver only supports device level stats */ + if (port_num != 0) + return NULL; + + return rdma_alloc_hw_stats_struct(names, NR_COUNTERS, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u8 port, int index) +{ + struct iwch_dev *dev; + struct tp_mib_stats m; + int 
ret; + + if (port != 0 || !stats) + return -ENOSYS; + + pr_debug("%s ibdev %p\n", __func__, ibdev); + dev = to_iwch_dev(ibdev); + ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m); + if (ret) + return -ENOSYS; + + stats->value[IPINRECEIVES] = ((u64)m.ipInReceive_hi << 32) + m.ipInReceive_lo; + stats->value[IPINHDRERRORS] = ((u64)m.ipInHdrErrors_hi << 32) + m.ipInHdrErrors_lo; + stats->value[IPINADDRERRORS] = ((u64)m.ipInAddrErrors_hi << 32) + m.ipInAddrErrors_lo; + stats->value[IPINUNKNOWNPROTOS] = ((u64)m.ipInUnknownProtos_hi << 32) + m.ipInUnknownProtos_lo; + stats->value[IPINDISCARDS] = ((u64)m.ipInDiscards_hi << 32) + m.ipInDiscards_lo; + stats->value[IPINDELIVERS] = ((u64)m.ipInDelivers_hi << 32) + m.ipInDelivers_lo; + stats->value[IPOUTREQUESTS] = ((u64)m.ipOutRequests_hi << 32) + m.ipOutRequests_lo; + stats->value[IPOUTDISCARDS] = ((u64)m.ipOutDiscards_hi << 32) + m.ipOutDiscards_lo; + stats->value[IPOUTNOROUTES] = ((u64)m.ipOutNoRoutes_hi << 32) + m.ipOutNoRoutes_lo; + stats->value[IPREASMTIMEOUT] = m.ipReasmTimeout; + stats->value[IPREASMREQDS] = m.ipReasmReqds; + stats->value[IPREASMOKS] = m.ipReasmOKs; + stats->value[IPREASMFAILS] = m.ipReasmFails; + stats->value[TCPACTIVEOPENS] = m.tcpActiveOpens; + stats->value[TCPPASSIVEOPENS] = m.tcpPassiveOpens; + stats->value[TCPATTEMPTFAILS] = m.tcpAttemptFails; + stats->value[TCPESTABRESETS] = m.tcpEstabResets; + stats->value[TCPCURRESTAB] = m.tcpOutRsts; + stats->value[TCPINSEGS] = m.tcpCurrEstab; + stats->value[TCPOUTSEGS] = ((u64)m.tcpInSegs_hi << 32) + m.tcpInSegs_lo; + stats->value[TCPRETRANSSEGS] = ((u64)m.tcpOutSegs_hi << 32) + m.tcpOutSegs_lo; + stats->value[TCPINERRS] = ((u64)m.tcpRetransSeg_hi << 32) + m.tcpRetransSeg_lo, + stats->value[TCPOUTRSTS] = ((u64)m.tcpInErrs_hi << 32) + m.tcpInErrs_lo; + stats->value[TCPRTOMIN] = m.tcpRtoMin; + stats->value[TCPRTOMAX] = m.tcpRtoMax; + + return stats->num_counters; +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *iwch_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, +}; + +static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) +{ + struct iwch_dev *iwch_dev = to_iwch_dev(ibdev); + struct ethtool_drvinfo info; + struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; + + pr_debug("%s dev 0x%p\n", __func__, iwch_dev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version); +} + +int iwch_register_device(struct iwch_dev *dev) +{ + int ret; + int i; + + pr_debug("%s iwch_dev %p\n", __func__, dev); + strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); + dev->ibdev.owner = THIS_MODULE; + dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS; + + /* cxgb3 supports STag 0. 
*/ + dev->ibdev.local_dma_lkey = 0; + + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + dev->ibdev.node_type = RDMA_NODE_RNIC; + BUILD_BUG_ON(sizeof(IWCH_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); + memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC)); + dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; + dev->ibdev.num_comp_vectors = 1; + dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev; + dev->ibdev.query_device = iwch_query_device; + dev->ibdev.query_port = iwch_query_port; + dev->ibdev.query_pkey = iwch_query_pkey; + dev->ibdev.query_gid = iwch_query_gid; + dev->ibdev.alloc_ucontext = iwch_alloc_ucontext; + dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext; + dev->ibdev.mmap = iwch_mmap; + dev->ibdev.alloc_pd = iwch_allocate_pd; + dev->ibdev.dealloc_pd = iwch_deallocate_pd; + dev->ibdev.create_ah = iwch_ah_create; + dev->ibdev.destroy_ah = iwch_ah_destroy; + dev->ibdev.create_qp = iwch_create_qp; + dev->ibdev.modify_qp = iwch_ib_modify_qp; + dev->ibdev.destroy_qp = iwch_destroy_qp; + dev->ibdev.create_cq = iwch_create_cq; + dev->ibdev.destroy_cq = iwch_destroy_cq; + dev->ibdev.resize_cq = iwch_resize_cq; + dev->ibdev.poll_cq = iwch_poll_cq; + dev->ibdev.get_dma_mr = iwch_get_dma_mr; + dev->ibdev.reg_user_mr = iwch_reg_user_mr; + dev->ibdev.dereg_mr = iwch_dereg_mr; + dev->ibdev.alloc_mw = iwch_alloc_mw; + dev->ibdev.dealloc_mw = iwch_dealloc_mw; + dev->ibdev.alloc_mr = iwch_alloc_mr; + dev->ibdev.map_mr_sg = iwch_map_mr_sg; + dev->ibdev.attach_mcast = iwch_multicast_attach; + dev->ibdev.detach_mcast = iwch_multicast_detach; + dev->ibdev.process_mad = iwch_process_mad; + dev->ibdev.req_notify_cq = iwch_arm_cq; + dev->ibdev.post_send = iwch_post_send; + dev->ibdev.post_recv = iwch_post_receive; + dev->ibdev.alloc_hw_stats = iwch_alloc_stats; + dev->ibdev.get_hw_stats = iwch_get_mib; + dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; + dev->ibdev.get_port_immutable = iwch_port_immutable; + dev->ibdev.get_dev_fw_str = get_dev_fw_ver_str; + + dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); + if (!dev->ibdev.iwcm) + return -ENOMEM; + + dev->ibdev.iwcm->connect = iwch_connect; + dev->ibdev.iwcm->accept = iwch_accept_cr; + dev->ibdev.iwcm->reject = iwch_reject_cr; + dev->ibdev.iwcm->create_listen = iwch_create_listen; + dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen; + dev->ibdev.iwcm->add_ref = iwch_qp_add_ref; + dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref; + dev->ibdev.iwcm->get_qp = iwch_get_qp; + memcpy(dev->ibdev.iwcm->ifname, dev->rdev.t3cdev_p->lldev->name, + sizeof(dev->ibdev.iwcm->ifname)); + + dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; + ret = ib_register_device(&dev->ibdev, NULL); + if (ret) + goto bail1; + + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) { + ret = device_create_file(&dev->ibdev.dev, + 
iwch_class_attributes[i]); + if (ret) { + goto bail2; + } + } + return 0; +bail2: + ib_unregister_device(&dev->ibdev); +bail1: + kfree(dev->ibdev.iwcm); + return ret; +} + +void iwch_unregister_device(struct iwch_dev *dev) +{ + int i; + + pr_debug("%s iwch_dev %p\n", __func__, dev); + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) + device_remove_file(&dev->ibdev.dev, + iwch_class_attributes[i]); + ib_unregister_device(&dev->ibdev); + kfree(dev->ibdev.iwcm); + return; +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h new file mode 100644 index 0000000..2e38dde --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __IWCH_PROVIDER_H__ +#define __IWCH_PROVIDER_H__ + +#include +#include +#include +#include +#include "t3cdev.h" +#include "iwch.h" +#include "cxio_wr.h" +#include "cxio_hal.h" + +struct iwch_pd { + struct ib_pd ibpd; + u32 pdid; + struct iwch_dev *rhp; +}; + +static inline struct iwch_pd *to_iwch_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct iwch_pd, ibpd); +} + +struct tpt_attributes { + u32 stag; + u32 state:1; + u32 type:2; + u32 rsvd:1; + enum tpt_mem_perm perms; + u32 remote_invaliate_disable:1; + u32 zbva:1; + u32 mw_bind_enable:1; + u32 page_size:5; + + u32 pdid; + u32 qpid; + u32 pbl_addr; + u32 len; + u64 va_fbo; + u32 pbl_size; +}; + +struct iwch_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; + u64 *pages; + u32 npages; +}; + +typedef struct iwch_mw iwch_mw_handle; + +static inline struct iwch_mr *to_iwch_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct iwch_mr, ibmr); +} + +struct iwch_mw { + struct ib_mw ibmw; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static inline struct iwch_mw *to_iwch_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct iwch_mw, ibmw); +} + +struct iwch_cq { + struct ib_cq ibcq; + struct iwch_dev *rhp; + struct t3_cq cq; + spinlock_t lock; + spinlock_t comp_handler_lock; + atomic_t refcnt; + wait_queue_head_t wait; + u32 __user *user_rptr_addr; +}; + +static inline struct iwch_cq *to_iwch_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct iwch_cq, ibcq); +} + +enum IWCH_QP_FLAGS { + QP_QUIESCED = 0x01 +}; + +struct iwch_mpa_attributes { + u8 initiator; + u8 recv_marker_enabled; + u8 xmit_marker_enabled; /* iWARP: enable inbound Read Resp. */ + u8 crc_enabled; + u8 version; /* 0 or 1 */ +}; + +struct iwch_qp_attributes { + u32 scq; + u32 rcq; + u32 sq_num_entries; + u32 rq_num_entries; + u32 sq_max_sges; + u32 sq_max_sges_rdma_write; + u32 rq_max_sges; + u32 state; + u8 enable_rdma_read; + u8 enable_rdma_write; /* enable inbound Read Resp. */ + u8 enable_bind; + u8 enable_mmid0_fastreg; /* Enable STAG0 + Fast-register */ + /* + * Next QP state. If specify the current state, only the + * QP attributes will be modified. + */ + u32 max_ord; + u32 max_ird; + u32 pd; /* IN */ + u32 next_state; + char terminate_buffer[52]; + u32 terminate_msg_len; + u8 is_terminate_local; + struct iwch_mpa_attributes mpa_attr; /* IN-OUT */ + struct iwch_ep *llp_stream_handle; + char *stream_msg_buf; /* Last stream msg. 
before Idle -> RTS */ + u32 stream_msg_buf_len; /* Only on Idle -> RTS */ +}; + +struct iwch_qp { + struct ib_qp ibqp; + struct iwch_dev *rhp; + struct iwch_ep *ep; + struct iwch_qp_attributes attr; + struct t3_wq wq; + spinlock_t lock; + atomic_t refcnt; + wait_queue_head_t wait; + enum IWCH_QP_FLAGS flags; +}; + +static inline int qp_quiesced(struct iwch_qp *qhp) +{ + return qhp->flags & QP_QUIESCED; +} + +static inline struct iwch_qp *to_iwch_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct iwch_qp, ibqp); +} + +void iwch_qp_add_ref(struct ib_qp *qp); +void iwch_qp_rem_ref(struct ib_qp *qp); + +struct iwch_ucontext { + struct ib_ucontext ibucontext; + struct cxio_ucontext uctx; + u32 key; + spinlock_t mmap_lock; + struct list_head mmaps; +}; + +static inline struct iwch_ucontext *to_iwch_ucontext(struct ib_ucontext *c) +{ + return container_of(c, struct iwch_ucontext, ibucontext); +} + +struct iwch_mm_entry { + struct list_head entry; + u64 addr; + u32 key; + unsigned len; +}; + +static inline struct iwch_mm_entry *remove_mmap(struct iwch_ucontext *ucontext, + u32 key, unsigned len) +{ + struct list_head *pos, *nxt; + struct iwch_mm_entry *mm; + + spin_lock(&ucontext->mmap_lock); + list_for_each_safe(pos, nxt, &ucontext->mmaps) { + + mm = list_entry(pos, struct iwch_mm_entry, entry); + if (mm->key == key && mm->len == len) { + list_del_init(&mm->entry); + spin_unlock(&ucontext->mmap_lock); + pr_debug("%s key 0x%x addr 0x%llx len %d\n", + __func__, key, + (unsigned long long)mm->addr, mm->len); + return mm; + } + } + spin_unlock(&ucontext->mmap_lock); + return NULL; +} + +static inline void insert_mmap(struct iwch_ucontext *ucontext, + struct iwch_mm_entry *mm) +{ + spin_lock(&ucontext->mmap_lock); + pr_debug("%s key 0x%x addr 0x%llx len %d\n", + __func__, mm->key, (unsigned long long)mm->addr, mm->len); + list_add_tail(&mm->entry, &ucontext->mmaps); + spin_unlock(&ucontext->mmap_lock); +} + +enum iwch_qp_attr_mask { + IWCH_QP_ATTR_NEXT_STATE = 1 << 0, + IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, + IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, + IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, + IWCH_QP_ATTR_MAX_ORD = 1 << 11, + IWCH_QP_ATTR_MAX_IRD = 1 << 12, + IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, + IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, + IWCH_QP_ATTR_MPA_ATTR = 1 << 24, + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, + IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_MAX_ORD | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_STREAM_MSG_BUFFER | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE) +}; + +int iwch_modify_qp(struct iwch_dev *rhp, + struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal); + +enum iwch_qp_state { + IWCH_QP_STATE_IDLE, + IWCH_QP_STATE_RTS, + IWCH_QP_STATE_ERROR, + IWCH_QP_STATE_TERMINATE, + IWCH_QP_STATE_CLOSING, + IWCH_QP_STATE_TOT +}; + +static inline int iwch_convert_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + return IWCH_QP_STATE_IDLE; + case IB_QPS_RTS: + return IWCH_QP_STATE_RTS; + case IB_QPS_SQD: + return IWCH_QP_STATE_CLOSING; + case IB_QPS_SQE: + return IWCH_QP_STATE_TERMINATE; + case IB_QPS_ERR: + return IWCH_QP_STATE_ERROR; + default: + return -1; + } +} + +static inline u32 iwch_ib_to_tpt_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? 
TPT_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? TPT_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? TPT_MW_BIND : 0) | + TPT_LOCAL_READ; +} + +static inline u32 iwch_ib_to_tpt_bind_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? TPT_REMOTE_READ : 0); +} + +enum iwch_mmid_state { + IWCH_STAG_STATE_VALID, + IWCH_STAG_STATE_INVALID +}; + +enum iwch_qp_query_flags { + IWCH_QP_QUERY_CONTEXT_NONE = 0x0, /* No ctx; Only attrs */ + IWCH_QP_QUERY_CONTEXT_GET = 0x1, /* Get ctx + attrs */ + IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2, /* Not Supported */ + + /* + * Quiesce QP context; Consumer + * will NOT replay outstanding WR + */ + IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4, + IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8, + IWCH_QP_QUERY_TEST_USERWRITE = 0x32 /* Test special */ +}; + +u16 iwch_rqes_posted(struct iwch_qp *qhp); +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); +int iwch_post_zb_read(struct iwch_ep *ep); +int iwch_register_device(struct iwch_dev *dev); +void iwch_unregister_device(struct iwch_dev *dev); +void stop_read_rep_timer(struct iwch_qp *qhp); +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, int shift); +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages); +void iwch_free_pbl(struct iwch_mr *mhp); +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset); + +#define IWCH_NODE_DESC "cxgb3 Chelsio Communications" + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c new file mode 100644 index 0000000..3871e1f --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -0,0 +1,1082 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include "iwch_provider.h" +#include "iwch.h" +#include "iwch_cm.h" +#include "cxio_hal.h" +#include "cxio_resource.h" + +#define NO_SUPPORT -1 + +static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, + u8 * flit_cnt) +{ + int i; + u32 plen; + + switch (wr->opcode) { + case IB_WR_SEND: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE; + else + wqe->send.rdmaop = T3_SEND; + wqe->send.rem_stag = 0; + break; + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE_INV; + else + wqe->send.rdmaop = T3_SEND_WITH_INV; + wqe->send.rem_stag = cpu_to_be32(wr->ex.invalidate_rkey); + break; + default: + return -EINVAL; + } + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + wqe->send.reserved[0] = 0; + wqe->send.reserved[1] = 0; + wqe->send.reserved[2] = 0; + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) + return -EMSGSIZE; + + plen += wr->sg_list[i].length; + wqe->send.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey); + wqe->send.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr); + } + wqe->send.num_sgle = cpu_to_be32(wr->num_sge); + *flit_cnt = 4 + ((wr->num_sge) << 1); + wqe->send.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + int i; + u32 plen; + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + wqe->write.rdmaop = T3_RDMA_WRITE; + wqe->write.reserved[0] = 0; + wqe->write.reserved[1] = 0; + wqe->write.reserved[2] = 0; + wqe->write.stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); + wqe->write.to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); + + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + plen = 4; + wqe->write.sgl[0].stag = wr->ex.imm_data; + wqe->write.sgl[0].len = cpu_to_be32(0); + wqe->write.num_sgle = cpu_to_be32(0); + *flit_cnt = 6; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return -EMSGSIZE; + } + plen += wr->sg_list[i].length; + wqe->write.sgl[i].stag = + cpu_to_be32(wr->sg_list[i].lkey); + wqe->write.sgl[i].len = + cpu_to_be32(wr->sg_list[i].length); + wqe->write.sgl[i].to = + cpu_to_be64(wr->sg_list[i].addr); + } + wqe->write.num_sgle = cpu_to_be32(wr->num_sge); + *flit_cnt = 5 + ((wr->num_sge) << 1); + } + wqe->write.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + if (wr->num_sge > 1) + return -EINVAL; + wqe->read.rdmaop = T3_READ_REQ; + if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) + wqe->read.local_inv = 1; + else + wqe->read.local_inv = 0; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.rem_stag = cpu_to_be32(rdma_wr(wr)->rkey); + wqe->read.rem_to = cpu_to_be64(rdma_wr(wr)->remote_addr); + wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey); + wqe->read.local_len = cpu_to_be32(wr->sg_list[0].length); + wqe->read.local_to = cpu_to_be64(wr->sg_list[0].addr); + *flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + return 0; +} + +static int build_memreg(union t3_wr *wqe, struct ib_reg_wr *wr, + u8 *flit_cnt, int *wr_cnt, struct t3_wq *wq) +{ + struct iwch_mr *mhp = to_iwch_mr(wr->mr); + int i; + __be64 *p; + + if (mhp->npages > T3_MAX_FASTREG_DEPTH) + return -EINVAL; + *wr_cnt = 1; + wqe->fastreg.stag = cpu_to_be32(wr->key); + wqe->fastreg.len = cpu_to_be32(mhp->ibmr.length); + wqe->fastreg.va_base_hi = 
cpu_to_be32(mhp->ibmr.iova >> 32); + wqe->fastreg.va_base_lo_fbo = + cpu_to_be32(mhp->ibmr.iova & 0xffffffff); + wqe->fastreg.page_type_perms = cpu_to_be32( + V_FR_PAGE_COUNT(mhp->npages) | + V_FR_PAGE_SIZE(ilog2(wr->mr->page_size) - 12) | + V_FR_TYPE(TPT_VATO) | + V_FR_PERMS(iwch_ib_to_tpt_access(wr->access))); + p = &wqe->fastreg.pbl_addrs[0]; + for (i = 0; i < mhp->npages; i++, p++) { + + /* If we need a 2nd WR, then set it up */ + if (i == T3_MAX_FASTREG_FRAG) { + *wr_cnt = 2; + wqe = (union t3_wr *)(wq->queue + + Q_PTR2IDX((wq->wptr+1), wq->size_log2)); + build_fw_riwrh((void *)wqe, T3_WR_FASTREG, 0, + Q_GENBIT(wq->wptr + 1, wq->size_log2), + 0, 1 + mhp->npages - T3_MAX_FASTREG_FRAG, + T3_EOP); + + p = &wqe->pbl_frag.pbl_addrs[0]; + } + *p = cpu_to_be64((u64)mhp->pages[i]); + } + *flit_cnt = 5 + mhp->npages; + if (*flit_cnt > 15) + *flit_cnt = 15; + return 0; +} + +static int build_inv_stag(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + wqe->local_inv.stag = cpu_to_be32(wr->ex.invalidate_rkey); + wqe->local_inv.reserved = 0; + *flit_cnt = sizeof(struct t3_local_inv_wr) >> 3; + return 0; +} + +static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list, + u32 num_sgle, u32 * pbl_addr, u8 * page_size) +{ + int i; + struct iwch_mr *mhp; + u64 offset; + for (i = 0; i < num_sgle; i++) { + + mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8); + if (!mhp) { + pr_debug("%s %d\n", __func__, __LINE__); + return -EIO; + } + if (!mhp->attr.state) { + pr_debug("%s %d\n", __func__, __LINE__); + return -EIO; + } + if (mhp->attr.zbva) { + pr_debug("%s %d\n", __func__, __LINE__); + return -EIO; + } + + if (sg_list[i].addr < mhp->attr.va_fbo) { + pr_debug("%s %d\n", __func__, __LINE__); + return -EINVAL; + } + if (sg_list[i].addr + ((u64) sg_list[i].length) < + sg_list[i].addr) { + pr_debug("%s %d\n", __func__, __LINE__); + return -EINVAL; + } + if (sg_list[i].addr + ((u64) sg_list[i].length) > + mhp->attr.va_fbo + ((u64) mhp->attr.len)) { + pr_debug("%s %d\n", __func__, __LINE__); + return -EINVAL; + } + offset = sg_list[i].addr - mhp->attr.va_fbo; + offset += mhp->attr.va_fbo & + ((1UL << (12 + mhp->attr.page_size)) - 1); + pbl_addr[i] = ((mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3) + + (offset >> (12 + mhp->attr.page_size)); + page_size[i] = mhp->attr.page_size; + } + return 0; +} + +static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i, err = 0; + u32 pbl_addr[T3_MAX_SGE]; + u8 page_size[T3_MAX_SGE]; + + err = iwch_sgl2pbl_map(qhp->rhp, wr->sg_list, wr->num_sge, pbl_addr, + page_size); + if (err) + return err; + wqe->recv.pagesz[0] = page_size[0]; + wqe->recv.pagesz[1] = page_size[1]; + wqe->recv.pagesz[2] = page_size[2]; + wqe->recv.pagesz[3] = page_size[3]; + wqe->recv.num_sgle = cpu_to_be32(wr->num_sge); + for (i = 0; i < wr->num_sge; i++) { + wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey); + wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + + /* to in the WQE == the offset into the page */ + wqe->recv.sgl[i].to = cpu_to_be64(((u32)wr->sg_list[i].addr) & + ((1UL << (12 + page_size[i])) - 1)); + + /* pbl_addr is the adapters address in the PBL */ + wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]); + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + wqe->recv.pbl_addr[i] = 0; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].wr_id = wr->wr_id; + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + 
qhp->wq.rq_size_log2)].pbl_addr = 0; + return 0; +} + +static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i; + u32 pbl_addr; + u32 pbl_offset; + + + /* + * The T3 HW requires the PBL in the HW recv descriptor to reference + * a PBL entry. So we allocate the max needed PBL memory here and pass + * it to the uP in the recv WR. The uP will build the PBL and setup + * the HW recv descriptor. + */ + pbl_addr = cxio_hal_pblpool_alloc(&qhp->rhp->rdev, T3_STAG0_PBL_SIZE); + if (!pbl_addr) + return -ENOMEM; + + /* + * Compute the 8B aligned offset. + */ + pbl_offset = (pbl_addr - qhp->rhp->rdev.rnic_info.pbl_base) >> 3; + + wqe->recv.num_sgle = cpu_to_be32(wr->num_sge); + + for (i = 0; i < wr->num_sge; i++) { + + /* + * Use a 128MB page size. This and an imposed 128MB + * sge length limit allows us to require only a 2-entry HW + * PBL for each SGE. This restriction is acceptable since + * since it is not possible to allocate 128MB of contiguous + * DMA coherent memory! + */ + if (wr->sg_list[i].length > T3_STAG0_MAX_PBE_LEN) + return -EINVAL; + wqe->recv.pagesz[i] = T3_STAG0_PAGE_SHIFT; + + /* + * T3 restricts a recv to all zero-stag or all non-zero-stag. + */ + if (wr->sg_list[i].lkey != 0) + return -EINVAL; + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + wqe->recv.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr); + wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_offset); + pbl_offset += 2; + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.pagesz[i] = 0; + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + wqe->recv.pbl_addr[i] = 0; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].wr_id = wr->wr_id; + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].pbl_addr = pbl_addr; + return 0; +} + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + u8 uninitialized_var(t3_wr_flit_cnt); + enum t3_wr_opcode t3_wr_opcode = 0; + enum t3_wr_flags t3_wr_flags; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + unsigned long flag; + struct t3_swsq *sqp; + int wr_cnt = 1; + + qhp = to_iwch_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -EINVAL; + goto out; + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -ENOMEM; + goto out; + } + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + t3_wr_flags = 0; + if (wr->send_flags & IB_SEND_SOLICITED) + t3_wr_flags |= T3_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IB_SEND_SIGNALED) + t3_wr_flags |= T3_COMPLETION_FLAG; + sqp = qhp->wq.sq + + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_FENCE) + t3_wr_flags |= T3_READ_FENCE_FLAG; + t3_wr_opcode = T3_WR_SEND; + err = build_rdma_send(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + t3_wr_opcode = T3_WR_WRITE; + err = build_rdma_write(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + t3_wr_opcode = T3_WR_READ; + t3_wr_flags = 0; /* T3 reads are always signaled */ + err = build_rdma_read(wqe, 
wr, &t3_wr_flit_cnt); + if (err) + break; + sqp->read_len = wqe->read.local_len; + if (!qhp->wq.oldest_read) + qhp->wq.oldest_read = sqp; + break; + case IB_WR_REG_MR: + t3_wr_opcode = T3_WR_FASTREG; + err = build_memreg(wqe, reg_wr(wr), &t3_wr_flit_cnt, + &wr_cnt, &qhp->wq); + break; + case IB_WR_LOCAL_INV: + if (wr->send_flags & IB_SEND_FENCE) + t3_wr_flags |= T3_LOCAL_FENCE_FLAG; + t3_wr_opcode = T3_WR_INV_STAG; + err = build_inv_stag(wqe, wr, &t3_wr_flit_cnt); + break; + default: + pr_debug("%s post of type=%d TBD!\n", __func__, + wr->opcode); + err = -EINVAL; + } + if (err) + break; + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp->wr_id = wr->wr_id; + sqp->opcode = wr2opcode(t3_wr_opcode); + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED); + + build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, t3_wr_flit_cnt, + (wr_cnt == 1) ? T3_SOPEOP : T3_SOP); + pr_debug("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n", + __func__, (unsigned long long)wr->wr_id, idx, + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2), + sqp->opcode); + wr = wr->next; + num_wrs--; + qhp->wq.wptr += wr_cnt; + ++(qhp->wq.sq_wptr); + } + spin_unlock_irqrestore(&qhp->lock, flag); + if (cxio_wq_db_enabled(&qhp->wq)) + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + +out: + if (err) + *bad_wr = wr; + return err; +} + +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + unsigned long flag; + + qhp = to_iwch_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -EINVAL; + goto out; + } + num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr, + qhp->wq.rq_size_log2) - 1; + if (!wr) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = -ENOMEM; + goto out; + } + while (wr) { + if (wr->num_sge > T3_MAX_SGE) { + err = -EINVAL; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + if (num_wrs) + if (wr->sg_list[0].lkey) + err = build_rdma_recv(qhp, wqe, wr); + else + err = build_zero_stag_recv(qhp, wqe, wr); + else + err = -ENOMEM; + + if (err) + break; + + build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, sizeof(struct t3_receive_wr) >> 3, T3_SOPEOP); + pr_debug("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x wqe %p\n", + __func__, (unsigned long long)wr->wr_id, + idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe); + ++(qhp->wq.rq_wptr); + ++(qhp->wq.wptr); + wr = wr->next; + num_wrs--; + } + spin_unlock_irqrestore(&qhp->lock, flag); + if (cxio_wq_db_enabled(&qhp->wq)) + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + +out: + if (err) + *bad_wr = wr; + return err; +} + +static inline void build_term_codes(struct respQ_msg_t *rsp_msg, + u8 *layer_type, u8 *ecode) +{ + int status = TPT_ERR_INTERNAL_ERR; + int tagged = 0; + int opcode = -1; + int rqtype = 0; + int send_inv = 0; + + if (rsp_msg) { + status = CQE_STATUS(rsp_msg->cqe); + opcode = CQE_OPCODE(rsp_msg->cqe); + rqtype = RQ_TYPE(rsp_msg->cqe); + send_inv = (opcode == T3_SEND_WITH_INV) || + (opcode == T3_SEND_WITH_SE_INV); + tagged = (opcode == T3_RDMA_WRITE) || + (rqtype && (opcode == T3_READ_RESP)); + } + + switch (status) { + case TPT_ERR_STAG: + if (send_inv) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + 
*ecode = RDMAP_CANT_INV_STAG; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_INV_STAG; + } + break; + case TPT_ERR_PDID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + if ((opcode == T3_SEND_WITH_INV) || + (opcode == T3_SEND_WITH_SE_INV)) + *ecode = RDMAP_CANT_INV_STAG; + else + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case TPT_ERR_QPID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case TPT_ERR_ACCESS: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_ACC_VIOL; + break; + case TPT_ERR_WRAP: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_TO_WRAP; + break; + case TPT_ERR_BOUND: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_BASE_BOUNDS; + } + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + break; + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_OUT_OF_RQE: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_NOBUF; + break; + case TPT_ERR_PBL_ADDR_BOUND: + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + break; + case TPT_ERR_CRC: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_CRC_ERR; + break; + case TPT_ERR_MARKER: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_MARKER_ERR; + break; + case TPT_ERR_PDU_LEN_ERR: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + break; + case TPT_ERR_DDP_VERSION: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_VERS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_VERS; + } + break; + case TPT_ERR_RDMA_VERSION: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_VERS; + break; + case TPT_ERR_OPCODE: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_OPCODE; + break; + case TPT_ERR_DDP_QUEUE_NUM: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_QN; + break; + case TPT_ERR_MSN: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_RANGE; + break; + case TPT_ERR_TBIT: + *layer_type = LAYER_DDP|DDP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_MO: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MO; + break; + default: + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + break; + } +} + +int iwch_post_zb_read(struct iwch_ep *ep) +{ + union t3_wr *wqe; + struct sk_buff *skb; + u8 flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + + pr_debug("%s enter\n", __func__); + skb = alloc_skb(40, GFP_KERNEL); + if (!skb) { + pr_err("%s cannot send zb_read!!\n", __func__); + return -ENOMEM; + } + wqe = skb_put_zero(skb, sizeof(struct t3_rdma_read_wr)); + wqe->read.rdmaop = T3_READ_REQ; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.rem_stag = cpu_to_be32(1); + wqe->read.rem_to = cpu_to_be64(1); + wqe->read.local_stag = cpu_to_be32(1); + wqe->read.local_len = cpu_to_be32(0); + wqe->read.local_to = cpu_to_be64(1); + wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_READ)); + wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(ep->hwtid)| + V_FW_RIWR_LEN(flit_cnt)); + skb->priority = CPL_PRIORITY_DATA; + return 
iwch_cxgb3_ofld_send(ep->com.qp->rhp->rdev.t3cdev_p, skb); +} + +/* + * This posts a TERMINATE with layer=RDMA, type=catastrophic. + */ +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) +{ + union t3_wr *wqe; + struct terminate_message *term; + struct sk_buff *skb; + + pr_debug("%s %d\n", __func__, __LINE__); + skb = alloc_skb(40, GFP_ATOMIC); + if (!skb) { + pr_err("%s cannot send TERMINATE!\n", __func__); + return -ENOMEM; + } + wqe = skb_put_zero(skb, 40); + wqe->send.rdmaop = T3_TERMINATE; + + /* immediate data length */ + wqe->send.plen = htonl(4); + + /* immediate data starts here. */ + term = (struct terminate_message *)wqe->send.sgl; + build_term_codes(rsp_msg, &term->layer_etype, &term->ecode); + wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_SEND) | + V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG)); + wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid)); + skb->priority = CPL_PRIORITY_DATA; + return iwch_cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); +} + +/* + * Assumes qhp lock is held. + */ +static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp, + struct iwch_cq *schp) + __releases(&qhp->lock) + __acquires(&qhp->lock) +{ + int count; + int flushed; + + lockdep_assert_held(&qhp->lock); + + pr_debug("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); + /* take a ref on the qhp since we must release the lock */ + atomic_inc(&qhp->refcnt); + spin_unlock(&qhp->lock); + + /* locking hierarchy: cq lock first, then qp lock. */ + spin_lock(&rchp->lock); + spin_lock(&qhp->lock); + cxio_flush_hw_cq(&rchp->cq); + cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); + flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock(&rchp->lock); + if (flushed) { + spin_lock(&rchp->comp_handler_lock); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock(&rchp->comp_handler_lock); + } + + /* locking hierarchy: cq lock first, then qp lock. 
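+ * The QP lock was dropped above, so for the SQ flush take the send CQ lock and then re-take the QP lock in that order.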
*/ + spin_lock(&schp->lock); + spin_lock(&qhp->lock); + cxio_flush_hw_cq(&schp->cq); + cxio_count_scqes(&schp->cq, &qhp->wq, &count); + flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock(&schp->lock); + if (flushed) { + spin_lock(&schp->comp_handler_lock); + (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + spin_unlock(&schp->comp_handler_lock); + } + + /* deref */ + if (atomic_dec_and_test(&qhp->refcnt)) + wake_up(&qhp->wait); + + spin_lock(&qhp->lock); +} + +static void flush_qp(struct iwch_qp *qhp) +{ + struct iwch_cq *rchp, *schp; + + rchp = get_chp(qhp->rhp, qhp->attr.rcq); + schp = get_chp(qhp->rhp, qhp->attr.scq); + + if (qhp->ibqp.uobject) { + cxio_set_wq_in_error(&qhp->wq); + cxio_set_cq_in_error(&rchp->cq); + spin_lock(&rchp->comp_handler_lock); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock(&rchp->comp_handler_lock); + if (schp != rchp) { + cxio_set_cq_in_error(&schp->cq); + spin_lock(&schp->comp_handler_lock); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock(&schp->comp_handler_lock); + } + return; + } + __flush_qp(qhp, rchp, schp); +} + + +/* + * Return count of RECV WRs posted + */ +u16 iwch_rqes_posted(struct iwch_qp *qhp) +{ + union t3_wr *wqe = qhp->wq.queue; + u16 count = 0; + + while (count < USHRT_MAX && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) { + count++; + wqe++; + } + pr_debug("%s qhp %p count %u\n", __func__, qhp, count); + return count; +} + +static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs) +{ + struct t3_rdma_init_attr init_attr; + int ret; + + init_attr.tid = qhp->ep->hwtid; + init_attr.qpid = qhp->wq.qpid; + init_attr.pdid = qhp->attr.pd; + init_attr.scqid = qhp->attr.scq; + init_attr.rcqid = qhp->attr.rcq; + init_attr.rq_addr = qhp->wq.rq_addr; + init_attr.rq_size = 1 << qhp->wq.rq_size_log2; + init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE | + qhp->attr.mpa_attr.recv_marker_enabled | + (qhp->attr.mpa_attr.xmit_marker_enabled << 1) | + (qhp->attr.mpa_attr.crc_enabled << 2); + + init_attr.qpcaps = uP_RI_QP_RDMA_READ_ENABLE | + uP_RI_QP_RDMA_WRITE_ENABLE | + uP_RI_QP_BIND_ENABLE; + if (!qhp->ibqp.uobject) + init_attr.qpcaps |= uP_RI_QP_STAG0_ENABLE | + uP_RI_QP_FAST_REGISTER_ENABLE; + + init_attr.tcp_emss = qhp->ep->emss; + init_attr.ord = qhp->attr.max_ord; + init_attr.ird = qhp->attr.max_ird; + init_attr.qp_dma_addr = qhp->wq.dma_addr; + init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); + init_attr.rqe_count = iwch_rqes_posted(qhp); + init_attr.flags = qhp->attr.mpa_attr.initiator ? 
MPA_INITIATOR : 0; + init_attr.chan = qhp->ep->l2t->smt_idx; + if (peer2peer) { + init_attr.rtr_type = RTR_READ; + if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator) + init_attr.ord = 1; + if (init_attr.ird == 0 && !qhp->attr.mpa_attr.initiator) + init_attr.ird = 1; + } else + init_attr.rtr_type = 0; + init_attr.irs = qhp->ep->rcv_seq; + pr_debug("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d flags 0x%x qpcaps 0x%x\n", + __func__, + init_attr.rq_addr, init_attr.rq_size, + init_attr.flags, init_attr.qpcaps); + ret = cxio_rdma_init(&rhp->rdev, &init_attr); + pr_debug("%s ret %d\n", __func__, ret); + return ret; +} + +int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal) +{ + int ret = 0; + struct iwch_qp_attributes newattr = qhp->attr; + unsigned long flag; + int disconnect = 0; + int terminate = 0; + int abort = 0; + int free = 0; + struct iwch_ep *ep = NULL; + + pr_debug("%s qhp %p qpid 0x%x ep %p state %d -> %d\n", __func__, + qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state, + (mask & IWCH_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1); + + spin_lock_irqsave(&qhp->lock, flag); + + /* Process attr changes if in IDLE */ + if (mask & IWCH_QP_ATTR_VALID_MODIFY) { + if (qhp->attr.state != IWCH_QP_STATE_IDLE) { + ret = -EIO; + goto out; + } + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ) + newattr.enable_rdma_read = attrs->enable_rdma_read; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE) + newattr.enable_rdma_write = attrs->enable_rdma_write; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND) + newattr.enable_bind = attrs->enable_bind; + if (mask & IWCH_QP_ATTR_MAX_ORD) { + if (attrs->max_ord > + rhp->attr.max_rdma_read_qp_depth) { + ret = -EINVAL; + goto out; + } + newattr.max_ord = attrs->max_ord; + } + if (mask & IWCH_QP_ATTR_MAX_IRD) { + if (attrs->max_ird > + rhp->attr.max_rdma_reads_per_qp) { + ret = -EINVAL; + goto out; + } + newattr.max_ird = attrs->max_ird; + } + qhp->attr = newattr; + } + + if (!(mask & IWCH_QP_ATTR_NEXT_STATE)) + goto out; + if (qhp->attr.state == attrs->next_state) + goto out; + + switch (qhp->attr.state) { + case IWCH_QP_STATE_IDLE: + switch (attrs->next_state) { + case IWCH_QP_STATE_RTS: + if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) { + ret = -EINVAL; + goto out; + } + if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) { + ret = -EINVAL; + goto out; + } + qhp->attr.mpa_attr = attrs->mpa_attr; + qhp->attr.llp_stream_handle = attrs->llp_stream_handle; + qhp->ep = qhp->attr.llp_stream_handle; + qhp->attr.state = IWCH_QP_STATE_RTS; + + /* + * Ref the endpoint here and deref when we + * disassociate the endpoint from the QP. This + * happens in CLOSING->IDLE transition or *->ERROR + * transition. 
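+ * The matching put_ep() happens in the CLOSING->IDLE transition below and in the common error path once the QP is moved to ERROR.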
+ */ + get_ep(&qhp->ep->com); + spin_unlock_irqrestore(&qhp->lock, flag); + ret = rdma_init(rhp, qhp, mask, attrs); + spin_lock_irqsave(&qhp->lock, flag); + if (ret) + goto err; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + flush_qp(qhp); + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_RTS: + switch (attrs->next_state) { + case IWCH_QP_STATE_CLOSING: + BUG_ON(kref_read(&qhp->ep->com.kref) < 2); + qhp->attr.state = IWCH_QP_STATE_CLOSING; + if (!internal) { + abort=0; + disconnect = 1; + ep = qhp->ep; + get_ep(&ep->com); + } + break; + case IWCH_QP_STATE_TERMINATE: + qhp->attr.state = IWCH_QP_STATE_TERMINATE; + if (qhp->ibqp.uobject) + cxio_set_wq_in_error(&qhp->wq); + if (!internal) + terminate = 1; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + if (!internal) { + abort=1; + disconnect = 1; + ep = qhp->ep; + get_ep(&ep->com); + } + goto err; + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_CLOSING: + if (!internal) { + ret = -EINVAL; + goto out; + } + switch (attrs->next_state) { + case IWCH_QP_STATE_IDLE: + flush_qp(qhp); + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.llp_stream_handle = NULL; + put_ep(&qhp->ep->com); + qhp->ep = NULL; + wake_up(&qhp->wait); + break; + case IWCH_QP_STATE_ERROR: + goto err; + default: + ret = -EINVAL; + goto err; + } + break; + case IWCH_QP_STATE_ERROR: + if (attrs->next_state != IWCH_QP_STATE_IDLE) { + ret = -EINVAL; + goto out; + } + + if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) || + !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) { + ret = -EINVAL; + goto out; + } + qhp->attr.state = IWCH_QP_STATE_IDLE; + break; + case IWCH_QP_STATE_TERMINATE: + if (!internal) { + ret = -EINVAL; + goto out; + } + goto err; + break; + default: + pr_err("%s in a bad state %d\n", __func__, qhp->attr.state); + ret = -EINVAL; + goto err; + break; + } + goto out; +err: + pr_debug("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep, + qhp->wq.qpid); + + /* disassociate the LLP connection */ + qhp->attr.llp_stream_handle = NULL; + ep = qhp->ep; + qhp->ep = NULL; + qhp->attr.state = IWCH_QP_STATE_ERROR; + free=1; + wake_up(&qhp->wait); + BUG_ON(!ep); + flush_qp(qhp); +out: + spin_unlock_irqrestore(&qhp->lock, flag); + + if (terminate) + iwch_post_terminate(qhp, NULL); + + /* + * If disconnect is 1, then we need to initiate a disconnect + * on the EP. This can be a normal close (RTS->CLOSING) or + * an abnormal close (RTS/CLOSING->ERROR). + */ + if (disconnect) { + iwch_ep_disconnect(ep, abort, GFP_KERNEL); + put_ep(&ep->com); + } + + /* + * If free is 1, then we've disassociated the EP from the QP + * and we need to dereference the EP. + */ + if (free) + put_ep(&ep->com); + + pr_debug("%s exit state %d\n", __func__, qhp->attr.state); + return ret; +} diff --git a/drivers/infiniband/hw/cxgb3/tcb.h b/drivers/infiniband/hw/cxgb3/tcb.h new file mode 100644 index 0000000..c702dc1 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/tcb.h @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2007 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _TCB_DEFS_H +#define _TCB_DEFS_H + +#define W_TCB_T_STATE 0 +#define S_TCB_T_STATE 0 +#define M_TCB_T_STATE 0xfULL +#define V_TCB_T_STATE(x) ((x) << S_TCB_T_STATE) + +#define W_TCB_TIMER 0 +#define S_TCB_TIMER 4 +#define M_TCB_TIMER 0x1ULL +#define V_TCB_TIMER(x) ((x) << S_TCB_TIMER) + +#define W_TCB_DACK_TIMER 0 +#define S_TCB_DACK_TIMER 5 +#define M_TCB_DACK_TIMER 0x1ULL +#define V_TCB_DACK_TIMER(x) ((x) << S_TCB_DACK_TIMER) + +#define W_TCB_DEL_FLAG 0 +#define S_TCB_DEL_FLAG 6 +#define M_TCB_DEL_FLAG 0x1ULL +#define V_TCB_DEL_FLAG(x) ((x) << S_TCB_DEL_FLAG) + +#define W_TCB_L2T_IX 0 +#define S_TCB_L2T_IX 7 +#define M_TCB_L2T_IX 0x7ffULL +#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX) + +#define W_TCB_SMAC_SEL 0 +#define S_TCB_SMAC_SEL 18 +#define M_TCB_SMAC_SEL 0x3ULL +#define V_TCB_SMAC_SEL(x) ((x) << S_TCB_SMAC_SEL) + +#define W_TCB_TOS 0 +#define S_TCB_TOS 20 +#define M_TCB_TOS 0x3fULL +#define V_TCB_TOS(x) ((x) << S_TCB_TOS) + +#define W_TCB_MAX_RT 0 +#define S_TCB_MAX_RT 26 +#define M_TCB_MAX_RT 0xfULL +#define V_TCB_MAX_RT(x) ((x) << S_TCB_MAX_RT) + +#define W_TCB_T_RXTSHIFT 0 +#define S_TCB_T_RXTSHIFT 30 +#define M_TCB_T_RXTSHIFT 0xfULL +#define V_TCB_T_RXTSHIFT(x) ((x) << S_TCB_T_RXTSHIFT) + +#define W_TCB_T_DUPACKS 1 +#define S_TCB_T_DUPACKS 2 +#define M_TCB_T_DUPACKS 0xfULL +#define V_TCB_T_DUPACKS(x) ((x) << S_TCB_T_DUPACKS) + +#define W_TCB_T_MAXSEG 1 +#define S_TCB_T_MAXSEG 6 +#define M_TCB_T_MAXSEG 0xfULL +#define V_TCB_T_MAXSEG(x) ((x) << S_TCB_T_MAXSEG) + +#define W_TCB_T_FLAGS1 1 +#define S_TCB_T_FLAGS1 10 +#define M_TCB_T_FLAGS1 0xffffffffULL +#define V_TCB_T_FLAGS1(x) ((x) << S_TCB_T_FLAGS1) + +#define W_TCB_T_MIGRATION 1 +#define S_TCB_T_MIGRATION 20 +#define M_TCB_T_MIGRATION 0x1ULL +#define V_TCB_T_MIGRATION(x) ((x) << S_TCB_T_MIGRATION) + +#define W_TCB_T_FLAGS2 2 +#define S_TCB_T_FLAGS2 10 +#define M_TCB_T_FLAGS2 0x7fULL +#define V_TCB_T_FLAGS2(x) ((x) << S_TCB_T_FLAGS2) + +#define W_TCB_SND_SCALE 2 +#define S_TCB_SND_SCALE 17 +#define M_TCB_SND_SCALE 0xfULL +#define V_TCB_SND_SCALE(x) ((x) << S_TCB_SND_SCALE) + +#define W_TCB_RCV_SCALE 2 +#define S_TCB_RCV_SCALE 21 +#define M_TCB_RCV_SCALE 0xfULL +#define V_TCB_RCV_SCALE(x) ((x) << S_TCB_RCV_SCALE) + +#define W_TCB_SND_UNA_RAW 2 +#define S_TCB_SND_UNA_RAW 25 +#define 
M_TCB_SND_UNA_RAW 0x7ffffffULL +#define V_TCB_SND_UNA_RAW(x) ((x) << S_TCB_SND_UNA_RAW) + +#define W_TCB_SND_NXT_RAW 3 +#define S_TCB_SND_NXT_RAW 20 +#define M_TCB_SND_NXT_RAW 0x7ffffffULL +#define V_TCB_SND_NXT_RAW(x) ((x) << S_TCB_SND_NXT_RAW) + +#define W_TCB_RCV_NXT 4 +#define S_TCB_RCV_NXT 15 +#define M_TCB_RCV_NXT 0xffffffffULL +#define V_TCB_RCV_NXT(x) ((x) << S_TCB_RCV_NXT) + +#define W_TCB_RCV_ADV 5 +#define S_TCB_RCV_ADV 15 +#define M_TCB_RCV_ADV 0xffffULL +#define V_TCB_RCV_ADV(x) ((x) << S_TCB_RCV_ADV) + +#define W_TCB_SND_MAX_RAW 5 +#define S_TCB_SND_MAX_RAW 31 +#define M_TCB_SND_MAX_RAW 0x7ffffffULL +#define V_TCB_SND_MAX_RAW(x) ((x) << S_TCB_SND_MAX_RAW) + +#define W_TCB_SND_CWND 6 +#define S_TCB_SND_CWND 26 +#define M_TCB_SND_CWND 0x7ffffffULL +#define V_TCB_SND_CWND(x) ((x) << S_TCB_SND_CWND) + +#define W_TCB_SND_SSTHRESH 7 +#define S_TCB_SND_SSTHRESH 21 +#define M_TCB_SND_SSTHRESH 0x7ffffffULL +#define V_TCB_SND_SSTHRESH(x) ((x) << S_TCB_SND_SSTHRESH) + +#define W_TCB_T_RTT_TS_RECENT_AGE 8 +#define S_TCB_T_RTT_TS_RECENT_AGE 16 +#define M_TCB_T_RTT_TS_RECENT_AGE 0xffffffffULL +#define V_TCB_T_RTT_TS_RECENT_AGE(x) ((x) << S_TCB_T_RTT_TS_RECENT_AGE) + +#define W_TCB_T_RTSEQ_RECENT 9 +#define S_TCB_T_RTSEQ_RECENT 16 +#define M_TCB_T_RTSEQ_RECENT 0xffffffffULL +#define V_TCB_T_RTSEQ_RECENT(x) ((x) << S_TCB_T_RTSEQ_RECENT) + +#define W_TCB_T_SRTT 10 +#define S_TCB_T_SRTT 16 +#define M_TCB_T_SRTT 0xffffULL +#define V_TCB_T_SRTT(x) ((x) << S_TCB_T_SRTT) + +#define W_TCB_T_RTTVAR 11 +#define S_TCB_T_RTTVAR 0 +#define M_TCB_T_RTTVAR 0xffffULL +#define V_TCB_T_RTTVAR(x) ((x) << S_TCB_T_RTTVAR) + +#define W_TCB_TS_LAST_ACK_SENT_RAW 11 +#define S_TCB_TS_LAST_ACK_SENT_RAW 16 +#define M_TCB_TS_LAST_ACK_SENT_RAW 0x7ffffffULL +#define V_TCB_TS_LAST_ACK_SENT_RAW(x) ((x) << S_TCB_TS_LAST_ACK_SENT_RAW) + +#define W_TCB_DIP 12 +#define S_TCB_DIP 11 +#define M_TCB_DIP 0xffffffffULL +#define V_TCB_DIP(x) ((x) << S_TCB_DIP) + +#define W_TCB_SIP 13 +#define S_TCB_SIP 11 +#define M_TCB_SIP 0xffffffffULL +#define V_TCB_SIP(x) ((x) << S_TCB_SIP) + +#define W_TCB_DP 14 +#define S_TCB_DP 11 +#define M_TCB_DP 0xffffULL +#define V_TCB_DP(x) ((x) << S_TCB_DP) + +#define W_TCB_SP 14 +#define S_TCB_SP 27 +#define M_TCB_SP 0xffffULL +#define V_TCB_SP(x) ((x) << S_TCB_SP) + +#define W_TCB_TIMESTAMP 15 +#define S_TCB_TIMESTAMP 11 +#define M_TCB_TIMESTAMP 0xffffffffULL +#define V_TCB_TIMESTAMP(x) ((x) << S_TCB_TIMESTAMP) + +#define W_TCB_TIMESTAMP_OFFSET 16 +#define S_TCB_TIMESTAMP_OFFSET 11 +#define M_TCB_TIMESTAMP_OFFSET 0xfULL +#define V_TCB_TIMESTAMP_OFFSET(x) ((x) << S_TCB_TIMESTAMP_OFFSET) + +#define W_TCB_TX_MAX 16 +#define S_TCB_TX_MAX 15 +#define M_TCB_TX_MAX 0xffffffffULL +#define V_TCB_TX_MAX(x) ((x) << S_TCB_TX_MAX) + +#define W_TCB_TX_HDR_PTR_RAW 17 +#define S_TCB_TX_HDR_PTR_RAW 15 +#define M_TCB_TX_HDR_PTR_RAW 0x1ffffULL +#define V_TCB_TX_HDR_PTR_RAW(x) ((x) << S_TCB_TX_HDR_PTR_RAW) + +#define W_TCB_TX_LAST_PTR_RAW 18 +#define S_TCB_TX_LAST_PTR_RAW 0 +#define M_TCB_TX_LAST_PTR_RAW 0x1ffffULL +#define V_TCB_TX_LAST_PTR_RAW(x) ((x) << S_TCB_TX_LAST_PTR_RAW) + +#define W_TCB_TX_COMPACT 18 +#define S_TCB_TX_COMPACT 17 +#define M_TCB_TX_COMPACT 0x1ULL +#define V_TCB_TX_COMPACT(x) ((x) << S_TCB_TX_COMPACT) + +#define W_TCB_RX_COMPACT 18 +#define S_TCB_RX_COMPACT 18 +#define M_TCB_RX_COMPACT 0x1ULL +#define V_TCB_RX_COMPACT(x) ((x) << S_TCB_RX_COMPACT) + +#define W_TCB_RCV_WND 18 +#define S_TCB_RCV_WND 19 +#define M_TCB_RCV_WND 0x7ffffffULL +#define V_TCB_RCV_WND(x) ((x) << S_TCB_RCV_WND) + +#define 
W_TCB_RX_HDR_OFFSET 19 +#define S_TCB_RX_HDR_OFFSET 14 +#define M_TCB_RX_HDR_OFFSET 0x7ffffffULL +#define V_TCB_RX_HDR_OFFSET(x) ((x) << S_TCB_RX_HDR_OFFSET) + +#define W_TCB_RX_FRAG0_START_IDX_RAW 20 +#define S_TCB_RX_FRAG0_START_IDX_RAW 9 +#define M_TCB_RX_FRAG0_START_IDX_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG0_START_IDX_RAW(x) ((x) << S_TCB_RX_FRAG0_START_IDX_RAW) + +#define W_TCB_RX_FRAG1_START_IDX_OFFSET 21 +#define S_TCB_RX_FRAG1_START_IDX_OFFSET 4 +#define M_TCB_RX_FRAG1_START_IDX_OFFSET 0x7ffffffULL +#define V_TCB_RX_FRAG1_START_IDX_OFFSET(x) ((x) << S_TCB_RX_FRAG1_START_IDX_OFFSET) + +#define W_TCB_RX_FRAG0_LEN 21 +#define S_TCB_RX_FRAG0_LEN 31 +#define M_TCB_RX_FRAG0_LEN 0x7ffffffULL +#define V_TCB_RX_FRAG0_LEN(x) ((x) << S_TCB_RX_FRAG0_LEN) + +#define W_TCB_RX_FRAG1_LEN 22 +#define S_TCB_RX_FRAG1_LEN 26 +#define M_TCB_RX_FRAG1_LEN 0x7ffffffULL +#define V_TCB_RX_FRAG1_LEN(x) ((x) << S_TCB_RX_FRAG1_LEN) + +#define W_TCB_NEWRENO_RECOVER 23 +#define S_TCB_NEWRENO_RECOVER 21 +#define M_TCB_NEWRENO_RECOVER 0x7ffffffULL +#define V_TCB_NEWRENO_RECOVER(x) ((x) << S_TCB_NEWRENO_RECOVER) + +#define W_TCB_PDU_HAVE_LEN 24 +#define S_TCB_PDU_HAVE_LEN 16 +#define M_TCB_PDU_HAVE_LEN 0x1ULL +#define V_TCB_PDU_HAVE_LEN(x) ((x) << S_TCB_PDU_HAVE_LEN) + +#define W_TCB_PDU_LEN 24 +#define S_TCB_PDU_LEN 17 +#define M_TCB_PDU_LEN 0xffffULL +#define V_TCB_PDU_LEN(x) ((x) << S_TCB_PDU_LEN) + +#define W_TCB_RX_QUIESCE 25 +#define S_TCB_RX_QUIESCE 1 +#define M_TCB_RX_QUIESCE 0x1ULL +#define V_TCB_RX_QUIESCE(x) ((x) << S_TCB_RX_QUIESCE) + +#define W_TCB_RX_PTR_RAW 25 +#define S_TCB_RX_PTR_RAW 2 +#define M_TCB_RX_PTR_RAW 0x1ffffULL +#define V_TCB_RX_PTR_RAW(x) ((x) << S_TCB_RX_PTR_RAW) + +#define W_TCB_CPU_NO 25 +#define S_TCB_CPU_NO 19 +#define M_TCB_CPU_NO 0x7fULL +#define V_TCB_CPU_NO(x) ((x) << S_TCB_CPU_NO) + +#define W_TCB_ULP_TYPE 25 +#define S_TCB_ULP_TYPE 26 +#define M_TCB_ULP_TYPE 0xfULL +#define V_TCB_ULP_TYPE(x) ((x) << S_TCB_ULP_TYPE) + +#define W_TCB_RX_FRAG1_PTR_RAW 25 +#define S_TCB_RX_FRAG1_PTR_RAW 30 +#define M_TCB_RX_FRAG1_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG1_PTR_RAW(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW) + +#define W_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 26 +#define S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 15 +#define M_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG2_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW) + +#define W_TCB_RX_FRAG2_PTR_RAW 27 +#define S_TCB_RX_FRAG2_PTR_RAW 10 +#define M_TCB_RX_FRAG2_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG2_PTR_RAW(x) ((x) << S_TCB_RX_FRAG2_PTR_RAW) + +#define W_TCB_RX_FRAG2_LEN_RAW 27 +#define S_TCB_RX_FRAG2_LEN_RAW 27 +#define M_TCB_RX_FRAG2_LEN_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG2_LEN_RAW(x) ((x) << S_TCB_RX_FRAG2_LEN_RAW) + +#define W_TCB_RX_FRAG3_PTR_RAW 28 +#define S_TCB_RX_FRAG3_PTR_RAW 22 +#define M_TCB_RX_FRAG3_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG3_PTR_RAW(x) ((x) << S_TCB_RX_FRAG3_PTR_RAW) + +#define W_TCB_RX_FRAG3_LEN_RAW 29 +#define S_TCB_RX_FRAG3_LEN_RAW 7 +#define M_TCB_RX_FRAG3_LEN_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG3_LEN_RAW(x) ((x) << S_TCB_RX_FRAG3_LEN_RAW) + +#define W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 30 +#define S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 2 +#define M_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG3_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW) + +#define W_TCB_PDU_HDR_LEN 30 +#define S_TCB_PDU_HDR_LEN 29 +#define M_TCB_PDU_HDR_LEN 0xffULL +#define V_TCB_PDU_HDR_LEN(x) ((x) << S_TCB_PDU_HDR_LEN) + +#define W_TCB_SLUSH1 
31 +#define S_TCB_SLUSH1 5 +#define M_TCB_SLUSH1 0x7ffffULL +#define V_TCB_SLUSH1(x) ((x) << S_TCB_SLUSH1) + +#define W_TCB_ULP_RAW 31 +#define S_TCB_ULP_RAW 24 +#define M_TCB_ULP_RAW 0xffULL +#define V_TCB_ULP_RAW(x) ((x) << S_TCB_ULP_RAW) + +#define W_TCB_DDP_RDMAP_VERSION 25 +#define S_TCB_DDP_RDMAP_VERSION 30 +#define M_TCB_DDP_RDMAP_VERSION 0x1ULL +#define V_TCB_DDP_RDMAP_VERSION(x) ((x) << S_TCB_DDP_RDMAP_VERSION) + +#define W_TCB_MARKER_ENABLE_RX 25 +#define S_TCB_MARKER_ENABLE_RX 31 +#define M_TCB_MARKER_ENABLE_RX 0x1ULL +#define V_TCB_MARKER_ENABLE_RX(x) ((x) << S_TCB_MARKER_ENABLE_RX) + +#define W_TCB_MARKER_ENABLE_TX 26 +#define S_TCB_MARKER_ENABLE_TX 0 +#define M_TCB_MARKER_ENABLE_TX 0x1ULL +#define V_TCB_MARKER_ENABLE_TX(x) ((x) << S_TCB_MARKER_ENABLE_TX) + +#define W_TCB_CRC_ENABLE 26 +#define S_TCB_CRC_ENABLE 1 +#define M_TCB_CRC_ENABLE 0x1ULL +#define V_TCB_CRC_ENABLE(x) ((x) << S_TCB_CRC_ENABLE) + +#define W_TCB_IRS_ULP 26 +#define S_TCB_IRS_ULP 2 +#define M_TCB_IRS_ULP 0x1ffULL +#define V_TCB_IRS_ULP(x) ((x) << S_TCB_IRS_ULP) + +#define W_TCB_ISS_ULP 26 +#define S_TCB_ISS_ULP 11 +#define M_TCB_ISS_ULP 0x1ffULL +#define V_TCB_ISS_ULP(x) ((x) << S_TCB_ISS_ULP) + +#define W_TCB_TX_PDU_LEN 26 +#define S_TCB_TX_PDU_LEN 20 +#define M_TCB_TX_PDU_LEN 0x3fffULL +#define V_TCB_TX_PDU_LEN(x) ((x) << S_TCB_TX_PDU_LEN) + +#define W_TCB_TX_PDU_OUT 27 +#define S_TCB_TX_PDU_OUT 2 +#define M_TCB_TX_PDU_OUT 0x1ULL +#define V_TCB_TX_PDU_OUT(x) ((x) << S_TCB_TX_PDU_OUT) + +#define W_TCB_CQ_IDX_SQ 27 +#define S_TCB_CQ_IDX_SQ 3 +#define M_TCB_CQ_IDX_SQ 0xffffULL +#define V_TCB_CQ_IDX_SQ(x) ((x) << S_TCB_CQ_IDX_SQ) + +#define W_TCB_CQ_IDX_RQ 27 +#define S_TCB_CQ_IDX_RQ 19 +#define M_TCB_CQ_IDX_RQ 0xffffULL +#define V_TCB_CQ_IDX_RQ(x) ((x) << S_TCB_CQ_IDX_RQ) + +#define W_TCB_QP_ID 28 +#define S_TCB_QP_ID 3 +#define M_TCB_QP_ID 0xffffULL +#define V_TCB_QP_ID(x) ((x) << S_TCB_QP_ID) + +#define W_TCB_PD_ID 28 +#define S_TCB_PD_ID 19 +#define M_TCB_PD_ID 0xffffULL +#define V_TCB_PD_ID(x) ((x) << S_TCB_PD_ID) + +#define W_TCB_STAG 29 +#define S_TCB_STAG 3 +#define M_TCB_STAG 0xffffffffULL +#define V_TCB_STAG(x) ((x) << S_TCB_STAG) + +#define W_TCB_RQ_START 30 +#define S_TCB_RQ_START 3 +#define M_TCB_RQ_START 0x3ffffffULL +#define V_TCB_RQ_START(x) ((x) << S_TCB_RQ_START) + +#define W_TCB_RQ_MSN 30 +#define S_TCB_RQ_MSN 29 +#define M_TCB_RQ_MSN 0x3ffULL +#define V_TCB_RQ_MSN(x) ((x) << S_TCB_RQ_MSN) + +#define W_TCB_RQ_MAX_OFFSET 31 +#define S_TCB_RQ_MAX_OFFSET 7 +#define M_TCB_RQ_MAX_OFFSET 0xfULL +#define V_TCB_RQ_MAX_OFFSET(x) ((x) << S_TCB_RQ_MAX_OFFSET) + +#define W_TCB_RQ_WRITE_PTR 31 +#define S_TCB_RQ_WRITE_PTR 11 +#define M_TCB_RQ_WRITE_PTR 0x3ffULL +#define V_TCB_RQ_WRITE_PTR(x) ((x) << S_TCB_RQ_WRITE_PTR) + +#define W_TCB_INB_WRITE_PERM 31 +#define S_TCB_INB_WRITE_PERM 21 +#define M_TCB_INB_WRITE_PERM 0x1ULL +#define V_TCB_INB_WRITE_PERM(x) ((x) << S_TCB_INB_WRITE_PERM) + +#define W_TCB_INB_READ_PERM 31 +#define S_TCB_INB_READ_PERM 22 +#define M_TCB_INB_READ_PERM 0x1ULL +#define V_TCB_INB_READ_PERM(x) ((x) << S_TCB_INB_READ_PERM) + +#define W_TCB_ORD_L_BIT_VLD 31 +#define S_TCB_ORD_L_BIT_VLD 23 +#define M_TCB_ORD_L_BIT_VLD 0x1ULL +#define V_TCB_ORD_L_BIT_VLD(x) ((x) << S_TCB_ORD_L_BIT_VLD) + +#define W_TCB_RDMAP_OPCODE 31 +#define S_TCB_RDMAP_OPCODE 24 +#define M_TCB_RDMAP_OPCODE 0xfULL +#define V_TCB_RDMAP_OPCODE(x) ((x) << S_TCB_RDMAP_OPCODE) + +#define W_TCB_TX_FLUSH 31 +#define S_TCB_TX_FLUSH 28 +#define M_TCB_TX_FLUSH 0x1ULL +#define V_TCB_TX_FLUSH(x) ((x) << S_TCB_TX_FLUSH) + +#define 
W_TCB_TX_OOS_RXMT 31 +#define S_TCB_TX_OOS_RXMT 29 +#define M_TCB_TX_OOS_RXMT 0x1ULL +#define V_TCB_TX_OOS_RXMT(x) ((x) << S_TCB_TX_OOS_RXMT) + +#define W_TCB_TX_OOS_TXMT 31 +#define S_TCB_TX_OOS_TXMT 30 +#define M_TCB_TX_OOS_TXMT 0x1ULL +#define V_TCB_TX_OOS_TXMT(x) ((x) << S_TCB_TX_OOS_TXMT) + +#define W_TCB_SLUSH_AUX2 31 +#define S_TCB_SLUSH_AUX2 31 +#define M_TCB_SLUSH_AUX2 0x1ULL +#define V_TCB_SLUSH_AUX2(x) ((x) << S_TCB_SLUSH_AUX2) + +#define W_TCB_RX_FRAG1_PTR_RAW2 25 +#define S_TCB_RX_FRAG1_PTR_RAW2 30 +#define M_TCB_RX_FRAG1_PTR_RAW2 0x1ffffULL +#define V_TCB_RX_FRAG1_PTR_RAW2(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW2) + +#define W_TCB_RX_DDP_FLAGS 26 +#define S_TCB_RX_DDP_FLAGS 15 +#define M_TCB_RX_DDP_FLAGS 0x3ffULL +#define V_TCB_RX_DDP_FLAGS(x) ((x) << S_TCB_RX_DDP_FLAGS) + +#define W_TCB_SLUSH_AUX3 26 +#define S_TCB_SLUSH_AUX3 31 +#define M_TCB_SLUSH_AUX3 0x1ffULL +#define V_TCB_SLUSH_AUX3(x) ((x) << S_TCB_SLUSH_AUX3) + +#define W_TCB_RX_DDP_BUF0_OFFSET 27 +#define S_TCB_RX_DDP_BUF0_OFFSET 8 +#define M_TCB_RX_DDP_BUF0_OFFSET 0x3fffffULL +#define V_TCB_RX_DDP_BUF0_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF0_OFFSET) + +#define W_TCB_RX_DDP_BUF0_LEN 27 +#define S_TCB_RX_DDP_BUF0_LEN 30 +#define M_TCB_RX_DDP_BUF0_LEN 0x3fffffULL +#define V_TCB_RX_DDP_BUF0_LEN(x) ((x) << S_TCB_RX_DDP_BUF0_LEN) + +#define W_TCB_RX_DDP_BUF1_OFFSET 28 +#define S_TCB_RX_DDP_BUF1_OFFSET 20 +#define M_TCB_RX_DDP_BUF1_OFFSET 0x3fffffULL +#define V_TCB_RX_DDP_BUF1_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF1_OFFSET) + +#define W_TCB_RX_DDP_BUF1_LEN 29 +#define S_TCB_RX_DDP_BUF1_LEN 10 +#define M_TCB_RX_DDP_BUF1_LEN 0x3fffffULL +#define V_TCB_RX_DDP_BUF1_LEN(x) ((x) << S_TCB_RX_DDP_BUF1_LEN) + +#define W_TCB_RX_DDP_BUF0_TAG 30 +#define S_TCB_RX_DDP_BUF0_TAG 0 +#define M_TCB_RX_DDP_BUF0_TAG 0xffffffffULL +#define V_TCB_RX_DDP_BUF0_TAG(x) ((x) << S_TCB_RX_DDP_BUF0_TAG) + +#define W_TCB_RX_DDP_BUF1_TAG 31 +#define S_TCB_RX_DDP_BUF1_TAG 0 +#define M_TCB_RX_DDP_BUF1_TAG 0xffffffffULL +#define V_TCB_RX_DDP_BUF1_TAG(x) ((x) << S_TCB_RX_DDP_BUF1_TAG) + +#define S_TF_DACK 10 +#define V_TF_DACK(x) ((x) << S_TF_DACK) + +#define S_TF_NAGLE 11 +#define V_TF_NAGLE(x) ((x) << S_TF_NAGLE) + +#define S_TF_RECV_SCALE 12 +#define V_TF_RECV_SCALE(x) ((x) << S_TF_RECV_SCALE) + +#define S_TF_RECV_TSTMP 13 +#define V_TF_RECV_TSTMP(x) ((x) << S_TF_RECV_TSTMP) + +#define S_TF_RECV_SACK 14 +#define V_TF_RECV_SACK(x) ((x) << S_TF_RECV_SACK) + +#define S_TF_TURBO 15 +#define V_TF_TURBO(x) ((x) << S_TF_TURBO) + +#define S_TF_KEEPALIVE 16 +#define V_TF_KEEPALIVE(x) ((x) << S_TF_KEEPALIVE) + +#define S_TF_TCAM_BYPASS 17 +#define V_TF_TCAM_BYPASS(x) ((x) << S_TF_TCAM_BYPASS) + +#define S_TF_CORE_FIN 18 +#define V_TF_CORE_FIN(x) ((x) << S_TF_CORE_FIN) + +#define S_TF_CORE_MORE 19 +#define V_TF_CORE_MORE(x) ((x) << S_TF_CORE_MORE) + +#define S_TF_MIGRATING 20 +#define V_TF_MIGRATING(x) ((x) << S_TF_MIGRATING) + +#define S_TF_ACTIVE_OPEN 21 +#define V_TF_ACTIVE_OPEN(x) ((x) << S_TF_ACTIVE_OPEN) + +#define S_TF_ASK_MODE 22 +#define V_TF_ASK_MODE(x) ((x) << S_TF_ASK_MODE) + +#define S_TF_NON_OFFLOAD 23 +#define V_TF_NON_OFFLOAD(x) ((x) << S_TF_NON_OFFLOAD) + +#define S_TF_MOD_SCHD 24 +#define V_TF_MOD_SCHD(x) ((x) << S_TF_MOD_SCHD) + +#define S_TF_MOD_SCHD_REASON0 25 +#define V_TF_MOD_SCHD_REASON0(x) ((x) << S_TF_MOD_SCHD_REASON0) + +#define S_TF_MOD_SCHD_REASON1 26 +#define V_TF_MOD_SCHD_REASON1(x) ((x) << S_TF_MOD_SCHD_REASON1) + +#define S_TF_MOD_SCHD_RX 27 +#define V_TF_MOD_SCHD_RX(x) ((x) << S_TF_MOD_SCHD_RX) + +#define S_TF_CORE_PUSH 28 +#define 
V_TF_CORE_PUSH(x) ((x) << S_TF_CORE_PUSH) + +#define S_TF_RCV_COALESCE_ENABLE 29 +#define V_TF_RCV_COALESCE_ENABLE(x) ((x) << S_TF_RCV_COALESCE_ENABLE) + +#define S_TF_RCV_COALESCE_PUSH 30 +#define V_TF_RCV_COALESCE_PUSH(x) ((x) << S_TF_RCV_COALESCE_PUSH) + +#define S_TF_RCV_COALESCE_LAST_PSH 31 +#define V_TF_RCV_COALESCE_LAST_PSH(x) ((x) << S_TF_RCV_COALESCE_LAST_PSH) + +#define S_TF_RCV_COALESCE_HEARTBEAT 32 +#define V_TF_RCV_COALESCE_HEARTBEAT(x) ((x) << S_TF_RCV_COALESCE_HEARTBEAT) + +#define S_TF_HALF_CLOSE 33 +#define V_TF_HALF_CLOSE(x) ((x) << S_TF_HALF_CLOSE) + +#define S_TF_DACK_MSS 34 +#define V_TF_DACK_MSS(x) ((x) << S_TF_DACK_MSS) + +#define S_TF_CCTRL_SEL0 35 +#define V_TF_CCTRL_SEL0(x) ((x) << S_TF_CCTRL_SEL0) + +#define S_TF_CCTRL_SEL1 36 +#define V_TF_CCTRL_SEL1(x) ((x) << S_TF_CCTRL_SEL1) + +#define S_TF_TCP_NEWRENO_FAST_RECOVERY 37 +#define V_TF_TCP_NEWRENO_FAST_RECOVERY(x) ((x) << S_TF_TCP_NEWRENO_FAST_RECOVERY) + +#define S_TF_TX_PACE_AUTO 38 +#define V_TF_TX_PACE_AUTO(x) ((x) << S_TF_TX_PACE_AUTO) + +#define S_TF_PEER_FIN_HELD 39 +#define V_TF_PEER_FIN_HELD(x) ((x) << S_TF_PEER_FIN_HELD) + +#define S_TF_CORE_URG 40 +#define V_TF_CORE_URG(x) ((x) << S_TF_CORE_URG) + +#define S_TF_RDMA_ERROR 41 +#define V_TF_RDMA_ERROR(x) ((x) << S_TF_RDMA_ERROR) + +#define S_TF_SSWS_DISABLED 42 +#define V_TF_SSWS_DISABLED(x) ((x) << S_TF_SSWS_DISABLED) + +#define S_TF_DUPACK_COUNT_ODD 43 +#define V_TF_DUPACK_COUNT_ODD(x) ((x) << S_TF_DUPACK_COUNT_ODD) + +#define S_TF_TX_CHANNEL 44 +#define V_TF_TX_CHANNEL(x) ((x) << S_TF_TX_CHANNEL) + +#define S_TF_RX_CHANNEL 45 +#define V_TF_RX_CHANNEL(x) ((x) << S_TF_RX_CHANNEL) + +#define S_TF_TX_PACE_FIXED 46 +#define V_TF_TX_PACE_FIXED(x) ((x) << S_TF_TX_PACE_FIXED) + +#define S_TF_RDMA_FLM_ERROR 47 +#define V_TF_RDMA_FLM_ERROR(x) ((x) << S_TF_RDMA_FLM_ERROR) + +#define S_TF_RX_FLOW_CONTROL_DISABLE 48 +#define V_TF_RX_FLOW_CONTROL_DISABLE(x) ((x) << S_TF_RX_FLOW_CONTROL_DISABLE) + +#endif /* _TCB_DEFS_H */ diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig new file mode 100644 index 0000000..0a671a6 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/Kconfig @@ -0,0 +1,19 @@ +config INFINIBAND_CXGB4 + tristate "Chelsio T4/T5 RDMA Driver" + depends on CHELSIO_T4 && INET + select CHELSIO_LIB + select GENERIC_ALLOCATOR + ---help--- + This is an iWARP/RDMA driver for the Chelsio T4 and T5 + 1GbE, 10GbE adapters and T5 40GbE adapter. + + For general information about Chelsio and our products, visit + our website at . + + For customer support, please visit our customer support page at + . + + Please send feedback to . + + To compile this driver as a module, choose M here: the module + will be called iw_cxgb4. diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile new file mode 100644 index 0000000..fa40b68 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/Makefile @@ -0,0 +1,6 @@ +ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4 +ccflags-y += -Idrivers/net/ethernet/chelsio/libcxgb + +obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o + +iw_cxgb4-y := device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o id_table.o diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c new file mode 100644 index 0000000..4cf17c6 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -0,0 +1,4263 @@ +/* + * Copyright (c) 2009-2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "iw_cxgb4.h" +#include "clip_tbl.h" + +static char *states[] = { + "idle", + "listen", + "connecting", + "mpa_wait_req", + "mpa_req_sent", + "mpa_req_rcvd", + "mpa_rep_sent", + "fpdu_mode", + "aborting", + "closing", + "moribund", + "dead", + NULL, +}; + +static int nocong; +module_param(nocong, int, 0644); +MODULE_PARM_DESC(nocong, "Turn of congestion control (default=0)"); + +static int enable_ecn; +module_param(enable_ecn, int, 0644); +MODULE_PARM_DESC(enable_ecn, "Enable ECN (default=0/disabled)"); + +static int dack_mode = 1; +module_param(dack_mode, int, 0644); +MODULE_PARM_DESC(dack_mode, "Delayed ack mode (default=1)"); + +uint c4iw_max_read_depth = 32; +module_param(c4iw_max_read_depth, int, 0644); +MODULE_PARM_DESC(c4iw_max_read_depth, + "Per-connection max ORD/IRD (default=32)"); + +static int enable_tcp_timestamps; +module_param(enable_tcp_timestamps, int, 0644); +MODULE_PARM_DESC(enable_tcp_timestamps, "Enable tcp timestamps (default=0)"); + +static int enable_tcp_sack; +module_param(enable_tcp_sack, int, 0644); +MODULE_PARM_DESC(enable_tcp_sack, "Enable tcp SACK (default=0)"); + +static int enable_tcp_window_scaling = 1; +module_param(enable_tcp_window_scaling, int, 0644); +MODULE_PARM_DESC(enable_tcp_window_scaling, + "Enable tcp window scaling (default=1)"); + +static int peer2peer = 1; +module_param(peer2peer, int, 0644); +MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=1)"); + +static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ; +module_param(p2p_type, int, 0644); +MODULE_PARM_DESC(p2p_type, "RDMAP opcode to use for the RTR message: " + "1=RDMA_READ 0=RDMA_WRITE (default 1)"); + +static int ep_timeout_secs = 60; +module_param(ep_timeout_secs, int, 0644); +MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " + "in seconds (default=60)"); + +static int mpa_rev = 2; +module_param(mpa_rev, int, 0644); +MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " + "1 is RFC5044 spec compliant, 2 is IETF MPA Peer Connect Draft" + " compliant (default=2)"); + +static int markers_enabled; +module_param(markers_enabled, int, 0644); 
+MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)"); + +static int crc_enabled = 1; +module_param(crc_enabled, int, 0644); +MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)"); + +static int rcv_win = 256 * 1024; +module_param(rcv_win, int, 0644); +MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)"); + +static int snd_win = 128 * 1024; +module_param(snd_win, int, 0644); +MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=128KB)"); + +static struct workqueue_struct *workq; + +static struct sk_buff_head rxq; + +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp); +static void ep_timeout(struct timer_list *t); +static void connect_reply_upcall(struct c4iw_ep *ep, int status); +static int sched(struct c4iw_dev *dev, struct sk_buff *skb); + +static LIST_HEAD(timeout_list); +static spinlock_t timeout_lock; + +static void deref_cm_id(struct c4iw_ep_common *epc) +{ + epc->cm_id->rem_ref(epc->cm_id); + epc->cm_id = NULL; + set_bit(CM_ID_DEREFED, &epc->history); +} + +static void ref_cm_id(struct c4iw_ep_common *epc) +{ + set_bit(CM_ID_REFED, &epc->history); + epc->cm_id->add_ref(epc->cm_id); +} + +static void deref_qp(struct c4iw_ep *ep) +{ + c4iw_qp_rem_ref(&ep->com.qp->ibqp); + clear_bit(QP_REFERENCED, &ep->com.flags); + set_bit(QP_DEREFED, &ep->com.history); +} + +static void ref_qp(struct c4iw_ep *ep) +{ + set_bit(QP_REFERENCED, &ep->com.flags); + set_bit(QP_REFED, &ep->com.history); + c4iw_qp_add_ref(&ep->com.qp->ibqp); +} + +static void start_ep_timer(struct c4iw_ep *ep) +{ + pr_debug("ep %p\n", ep); + if (timer_pending(&ep->timer)) { + pr_err("%s timer already started! ep %p\n", + __func__, ep); + return; + } + clear_bit(TIMEOUT, &ep->com.flags); + c4iw_get_ep(&ep->com); + ep->timer.expires = jiffies + ep_timeout_secs * HZ; + add_timer(&ep->timer); +} + +static int stop_ep_timer(struct c4iw_ep *ep) +{ + pr_debug("ep %p stopping\n", ep); + del_timer_sync(&ep->timer); + if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { + c4iw_put_ep(&ep->com); + return 0; + } + return 1; +} + +static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb, + struct l2t_entry *l2e) +{ + int error = 0; + + if (c4iw_fatal_error(rdev)) { + kfree_skb(skb); + pr_err("%s - device in error state - dropping\n", __func__); + return -EIO; + } + error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e); + if (error < 0) + kfree_skb(skb); + else if (error == NET_XMIT_DROP) + return -ENOMEM; + return error < 0 ? error : 0; +} + +int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb) +{ + int error = 0; + + if (c4iw_fatal_error(rdev)) { + kfree_skb(skb); + pr_err("%s - device in error state - dropping\n", __func__); + return -EIO; + } + error = cxgb4_ofld_send(rdev->lldi.ports[0], skb); + if (error < 0) + kfree_skb(skb); + return error < 0 ? error : 0; +} + +static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb) +{ + u32 len = roundup(sizeof(struct cpl_tid_release), 16); + + skb = get_skb(skb, len, GFP_KERNEL); + if (!skb) + return; + + cxgb_mk_tid_release(skb, len, hwtid, 0); + c4iw_ofld_send(rdev, skb); + return; +} + +static void set_emss(struct c4iw_ep *ep, u16 opt) +{ + ep->emss = ep->com.dev->rdev.lldi.mtus[TCPOPT_MSS_G(opt)] - + ((AF_INET == ep->com.remote_addr.ss_family) ? 
+ sizeof(struct iphdr) : sizeof(struct ipv6hdr)) - + sizeof(struct tcphdr); + ep->mss = ep->emss; + if (TCPOPT_TSTAMP_G(opt)) + ep->emss -= round_up(TCPOLEN_TIMESTAMP, 4); + if (ep->emss < 128) + ep->emss = 128; + if (ep->emss & 7) + pr_debug("Warning: misaligned mtu idx %u mss %u emss=%u\n", + TCPOPT_MSS_G(opt), ep->mss, ep->emss); + pr_debug("mss_idx %u mss %u emss=%u\n", TCPOPT_MSS_G(opt), ep->mss, + ep->emss); +} + +static enum c4iw_ep_state state_read(struct c4iw_ep_common *epc) +{ + enum c4iw_ep_state state; + + mutex_lock(&epc->mutex); + state = epc->state; + mutex_unlock(&epc->mutex); + return state; +} + +static void __state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new) +{ + epc->state = new; +} + +static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new) +{ + mutex_lock(&epc->mutex); + pr_debug("%s -> %s\n", states[epc->state], states[new]); + __state_set(epc, new); + mutex_unlock(&epc->mutex); + return; +} + +static int alloc_ep_skb_list(struct sk_buff_head *ep_skb_list, int size) +{ + struct sk_buff *skb; + unsigned int i; + size_t len; + + len = roundup(sizeof(union cpl_wr_size), 16); + for (i = 0; i < size; i++) { + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + goto fail; + skb_queue_tail(ep_skb_list, skb); + } + return 0; +fail: + skb_queue_purge(ep_skb_list); + return -ENOMEM; +} + +static void *alloc_ep(int size, gfp_t gfp) +{ + struct c4iw_ep_common *epc; + + epc = kzalloc(size, gfp); + if (epc) { + epc->wr_waitp = c4iw_alloc_wr_wait(gfp); + if (!epc->wr_waitp) { + kfree(epc); + epc = NULL; + goto out; + } + kref_init(&epc->kref); + mutex_init(&epc->mutex); + c4iw_init_wr_wait(epc->wr_waitp); + } + pr_debug("alloc ep %p\n", epc); +out: + return epc; +} + +static void remove_ep_tid(struct c4iw_ep *ep) +{ + unsigned long flags; + + spin_lock_irqsave(&ep->com.dev->lock, flags); + _remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid, 0); + if (idr_is_empty(&ep->com.dev->hwtid_idr)) + wake_up(&ep->com.dev->wait); + spin_unlock_irqrestore(&ep->com.dev->lock, flags); +} + +static void insert_ep_tid(struct c4iw_ep *ep) +{ + unsigned long flags; + + spin_lock_irqsave(&ep->com.dev->lock, flags); + _insert_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep, ep->hwtid, 0); + spin_unlock_irqrestore(&ep->com.dev->lock, flags); +} + +/* + * Atomically lookup the ep ptr given the tid and grab a reference on the ep. + */ +static struct c4iw_ep *get_ep_from_tid(struct c4iw_dev *dev, unsigned int tid) +{ + struct c4iw_ep *ep; + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + ep = idr_find(&dev->hwtid_idr, tid); + if (ep) + c4iw_get_ep(&ep->com); + spin_unlock_irqrestore(&dev->lock, flags); + return ep; +} + +/* + * Atomically lookup the ep ptr given the stid and grab a reference on the ep. 
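For what it's worth, the tid lookup above (and its stid twin below) follows a simple contract: a successful lookup returns the ep with an extra reference that the caller must drop. A hypothetical handler, with an illustrative name and locking detail, would use it like this:

	static int example_handle_tid_event(struct c4iw_dev *dev, unsigned int tid)
	{
		struct c4iw_ep *ep = get_ep_from_tid(dev, tid);

		if (!ep)
			return 0;		/* endpoint already released */

		mutex_lock(&ep->com.mutex);
		/* ... act on the endpoint state here ... */
		mutex_unlock(&ep->com.mutex);

		c4iw_put_ep(&ep->com);		/* drop the lookup reference */
		return 0;
	}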
+ */ +static struct c4iw_listen_ep *get_ep_from_stid(struct c4iw_dev *dev, + unsigned int stid) +{ + struct c4iw_listen_ep *ep; + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + ep = idr_find(&dev->stid_idr, stid); + if (ep) + c4iw_get_ep(&ep->com); + spin_unlock_irqrestore(&dev->lock, flags); + return ep; +} + +void _c4iw_free_ep(struct kref *kref) +{ + struct c4iw_ep *ep; + + ep = container_of(kref, struct c4iw_ep, com.kref); + pr_debug("ep %p state %s\n", ep, states[ep->com.state]); + if (test_bit(QP_REFERENCED, &ep->com.flags)) + deref_qp(ep); + if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) { + if (ep->com.remote_addr.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *) + &ep->com.local_addr; + + cxgb4_clip_release( + ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, + 1); + } + cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid, + ep->com.local_addr.ss_family); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + if (ep->mpa_skb) + kfree_skb(ep->mpa_skb); + } + if (!skb_queue_empty(&ep->com.ep_skb_list)) + skb_queue_purge(&ep->com.ep_skb_list); + c4iw_put_wr_wait(ep->com.wr_waitp); + kfree(ep); +} + +static void release_ep_resources(struct c4iw_ep *ep) +{ + set_bit(RELEASE_RESOURCES, &ep->com.flags); + + /* + * If we have a hwtid, then remove it from the idr table + * so lookups will no longer find this endpoint. Otherwise + * we have a race where one thread finds the ep ptr just + * before the other thread is freeing the ep memory. + */ + if (ep->hwtid != -1) + remove_ep_tid(ep); + c4iw_put_ep(&ep->com); +} + +static int status2errno(int status) +{ + switch (status) { + case CPL_ERR_NONE: + return 0; + case CPL_ERR_CONN_RESET: + return -ECONNRESET; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +/* + * Try and reuse skbs already allocated... + */ +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) +{ + if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) { + skb_trim(skb, 0); + skb_get(skb); + skb_reset_transport_header(skb); + } else { + skb = alloc_skb(len, gfp); + } + t4_set_arp_err_handler(skb, NULL, NULL); + return skb; +} + +static struct net_device *get_real_dev(struct net_device *egress_dev) +{ + return rdma_vlan_dev_real_dev(egress_dev) ? : egress_dev; +} + +static void arp_failure_discard(void *handle, struct sk_buff *skb) +{ + pr_err("ARP failure\n"); + kfree_skb(skb); +} + +static void mpa_start_arp_failure(void *handle, struct sk_buff *skb) +{ + pr_err("ARP failure during MPA Negotiation - Closing Connection\n"); +} + +enum { + NUM_FAKE_CPLS = 2, + FAKE_CPL_PUT_EP_SAFE = NUM_CPL_CMDS + 0, + FAKE_CPL_PASS_PUT_EP_SAFE = NUM_CPL_CMDS + 1, +}; + +static int _put_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + + ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))); + release_ep_resources(ep); + kfree_skb(skb); + return 0; +} + +static int _put_pass_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + + ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))); + c4iw_put_ep(&ep->parent_ep->com); + release_ep_resources(ep); + kfree_skb(skb); + return 0; +} + +/* + * Fake up a special CPL opcode and call sched() so process_work() will call + * _put_ep_safe() in a safe context to free the ep resources. 
This is needed + * because ARP error handlers are called in an ATOMIC context, and + * _c4iw_free_ep() needs to block. + */ +static void queue_arp_failure_cpl(struct c4iw_ep *ep, struct sk_buff *skb, + int cpl) +{ + struct cpl_act_establish *rpl = cplhdr(skb); + + /* Set our special ARP_FAILURE opcode */ + rpl->ot.opcode = cpl; + + /* + * Save ep in the skb->cb area, after where sched() will save the dev + * ptr. + */ + *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))) = ep; + sched(ep->com.dev, skb); +} + +/* Handle an ARP failure for an accept */ +static void pass_accept_rpl_arp_failure(void *handle, struct sk_buff *skb) +{ + struct c4iw_ep *ep = handle; + + pr_err("ARP failure during accept - tid %u - dropping connection\n", + ep->hwtid); + + __state_set(&ep->com, DEAD); + queue_arp_failure_cpl(ep, skb, FAKE_CPL_PASS_PUT_EP_SAFE); +} + +/* + * Handle an ARP failure for an active open. + */ +static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) +{ + struct c4iw_ep *ep = handle; + + pr_err("ARP failure during connect\n"); + connect_reply_upcall(ep, -EHOSTUNREACH); + __state_set(&ep->com, DEAD); + if (ep->com.remote_addr.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *)&ep->com.local_addr; + cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); + } + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); + queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE); +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. + */ +static void abort_arp_failure(void *handle, struct sk_buff *skb) +{ + int ret; + struct c4iw_ep *ep = handle; + struct c4iw_rdev *rdev = &ep->com.dev->rdev; + struct cpl_abort_req *req = cplhdr(skb); + + pr_debug("rdev %p\n", rdev); + req->cmd = CPL_ABORT_NO_RST; + skb_get(skb); + ret = c4iw_ofld_send(rdev, skb); + if (ret) { + __state_set(&ep->com, DEAD); + queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE); + } else + kfree_skb(skb); +} + +static int send_flowc(struct c4iw_ep *ep) +{ + struct fw_flowc_wr *flowc; + struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list); + int i; + u16 vlan = ep->l2t->vlan; + int nparams; + + if (WARN_ON(!skb)) + return -ENOMEM; + + if (vlan == CPL_L2T_VLAN_NONE) + nparams = 8; + else + nparams = 9; + + flowc = __skb_put(skb, FLOWC_LEN); + + flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) | + FW_FLOWC_WR_NPARAMS_V(nparams)); + flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(FLOWC_LEN, + 16)) | FW_WR_FLOWID_V(ep->hwtid)); + + flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; + flowc->mnemval[0].val = cpu_to_be32(FW_PFVF_CMD_PFN_V + (ep->com.dev->rdev.lldi.pf)); + flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; + flowc->mnemval[1].val = cpu_to_be32(ep->tx_chan); + flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; + flowc->mnemval[2].val = cpu_to_be32(ep->tx_chan); + flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; + flowc->mnemval[3].val = cpu_to_be32(ep->rss_qid); + flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT; + flowc->mnemval[4].val = cpu_to_be32(ep->snd_seq); + flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT; + flowc->mnemval[5].val = cpu_to_be32(ep->rcv_seq); + flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF; + flowc->mnemval[6].val = cpu_to_be32(ep->snd_win); + flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; + flowc->mnemval[7].val = cpu_to_be32(ep->emss); + if (nparams == 9) { + u16 pri; + 
+ pri = (vlan & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + flowc->mnemval[8].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; + flowc->mnemval[8].val = cpu_to_be32(pri); + } else { + /* Pad WR to 16 byte boundary */ + flowc->mnemval[8].mnemonic = 0; + flowc->mnemval[8].val = 0; + } + for (i = 0; i < 9; i++) { + flowc->mnemval[i].r4[0] = 0; + flowc->mnemval[i].r4[1] = 0; + flowc->mnemval[i].r4[2] = 0; + } + + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + return c4iw_ofld_send(&ep->com.dev->rdev, skb); +} + +static int send_halfclose(struct c4iw_ep *ep) +{ + struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list); + u32 wrlen = roundup(sizeof(struct cpl_close_con_req), 16); + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + if (WARN_ON(!skb)) + return -ENOMEM; + + cxgb_mk_close_con_req(skb, wrlen, ep->hwtid, ep->txq_idx, + NULL, arp_failure_discard); + + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static int send_abort(struct c4iw_ep *ep) +{ + u32 wrlen = roundup(sizeof(struct cpl_abort_req), 16); + struct sk_buff *req_skb = skb_dequeue(&ep->com.ep_skb_list); + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + if (WARN_ON(!req_skb)) + return -ENOMEM; + + cxgb_mk_abort_req(req_skb, wrlen, ep->hwtid, ep->txq_idx, + ep, abort_arp_failure); + + return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t); +} + +static int send_connect(struct c4iw_ep *ep) +{ + struct cpl_act_open_req *req = NULL; + struct cpl_t5_act_open_req *t5req = NULL; + struct cpl_t6_act_open_req *t6req = NULL; + struct cpl_act_open_req6 *req6 = NULL; + struct cpl_t5_act_open_req6 *t5req6 = NULL; + struct cpl_t6_act_open_req6 *t6req6 = NULL; + struct sk_buff *skb; + u64 opt0; + u32 opt2; + unsigned int mtu_idx; + u32 wscale; + int win, sizev4, sizev6, wrlen; + struct sockaddr_in *la = (struct sockaddr_in *) + &ep->com.local_addr; + struct sockaddr_in *ra = (struct sockaddr_in *) + &ep->com.remote_addr; + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *) + &ep->com.local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *) + &ep->com.remote_addr; + int ret; + enum chip_type adapter_type = ep->com.dev->rdev.lldi.adapter_type; + u32 isn = (prandom_u32() & ~7UL) - 1; + struct net_device *netdev; + u64 params; + + netdev = ep->com.dev->rdev.lldi.ports[0]; + + switch (CHELSIO_CHIP_VERSION(adapter_type)) { + case CHELSIO_T4: + sizev4 = sizeof(struct cpl_act_open_req); + sizev6 = sizeof(struct cpl_act_open_req6); + break; + case CHELSIO_T5: + sizev4 = sizeof(struct cpl_t5_act_open_req); + sizev6 = sizeof(struct cpl_t5_act_open_req6); + break; + case CHELSIO_T6: + sizev4 = sizeof(struct cpl_t6_act_open_req); + sizev6 = sizeof(struct cpl_t6_act_open_req6); + break; + default: + pr_err("T%d Chip is not supported\n", + CHELSIO_CHIP_VERSION(adapter_type)); + return -EINVAL; + } + + wrlen = (ep->com.remote_addr.ss_family == AF_INET) ? + roundup(sizev4, 16) : + roundup(sizev6, 16); + + pr_debug("ep %p atid %u\n", ep, ep->atid); + + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + pr_err("%s - failed to alloc skb\n", __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); + + cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps, + (ep->com.remote_addr.ss_family == AF_INET) ? 0 : 1); + wscale = cxgb_compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_M) + win = RCV_BUFSIZ_M; + + opt0 = (nocong ? 
NO_CONG_F : 0) | + KEEP_ALIVE_F | + DELACK_F | + WND_SCALE_V(wscale) | + MSS_IDX_V(mtu_idx) | + L2T_IDX_V(ep->l2t->idx) | + TX_CHAN_V(ep->tx_chan) | + SMAC_SEL_V(ep->smac_idx) | + DSCP_V(ep->tos >> 2) | + ULP_MODE_V(ULP_MODE_TCPDDP) | + RCV_BUFSIZ_V(win); + opt2 = RX_CHANNEL_V(0) | + CCTRL_ECN_V(enable_ecn) | + RSS_QUEUE_VALID_F | RSS_QUEUE_V(ep->rss_qid); + if (enable_tcp_timestamps) + opt2 |= TSTAMPS_EN_F; + if (enable_tcp_sack) + opt2 |= SACK_EN_F; + if (wscale && enable_tcp_window_scaling) + opt2 |= WND_SCALE_EN_F; + if (CHELSIO_CHIP_VERSION(adapter_type) > CHELSIO_T4) { + if (peer2peer) + isn += 4; + + opt2 |= T5_OPT_2_VALID_F; + opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE); + opt2 |= T5_ISS_F; + } + + params = cxgb4_select_ntuple(netdev, ep->l2t); + + if (ep->com.remote_addr.ss_family == AF_INET6) + cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&la6->sin6_addr.s6_addr, 1); + + t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure); + + if (ep->com.remote_addr.ss_family == AF_INET) { + switch (CHELSIO_CHIP_VERSION(adapter_type)) { + case CHELSIO_T4: + req = skb_put(skb, wrlen); + INIT_TP_WR(req, 0); + break; + case CHELSIO_T5: + t5req = skb_put(skb, wrlen); + INIT_TP_WR(t5req, 0); + req = (struct cpl_act_open_req *)t5req; + break; + case CHELSIO_T6: + t6req = skb_put(skb, wrlen); + INIT_TP_WR(t6req, 0); + req = (struct cpl_act_open_req *)t6req; + t5req = (struct cpl_t5_act_open_req *)t6req; + break; + default: + pr_err("T%d Chip is not supported\n", + CHELSIO_CHIP_VERSION(adapter_type)); + ret = -EINVAL; + goto clip_release; + } + + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + ((ep->rss_qid<<14) | ep->atid))); + req->local_port = la->sin_port; + req->peer_port = ra->sin_port; + req->local_ip = la->sin_addr.s_addr; + req->peer_ip = ra->sin_addr.s_addr; + req->opt0 = cpu_to_be64(opt0); + + if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) { + req->params = cpu_to_be32(params); + req->opt2 = cpu_to_be32(opt2); + } else { + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + t5req->params = + cpu_to_be64(FILTER_TUPLE_V(params)); + t5req->rsvd = cpu_to_be32(isn); + pr_debug("snd_isn %u\n", t5req->rsvd); + t5req->opt2 = cpu_to_be32(opt2); + } else { + t6req->params = + cpu_to_be64(FILTER_TUPLE_V(params)); + t6req->rsvd = cpu_to_be32(isn); + pr_debug("snd_isn %u\n", t6req->rsvd); + t6req->opt2 = cpu_to_be32(opt2); + } + } + } else { + switch (CHELSIO_CHIP_VERSION(adapter_type)) { + case CHELSIO_T4: + req6 = skb_put(skb, wrlen); + INIT_TP_WR(req6, 0); + break; + case CHELSIO_T5: + t5req6 = skb_put(skb, wrlen); + INIT_TP_WR(t5req6, 0); + req6 = (struct cpl_act_open_req6 *)t5req6; + break; + case CHELSIO_T6: + t6req6 = skb_put(skb, wrlen); + INIT_TP_WR(t6req6, 0); + req6 = (struct cpl_act_open_req6 *)t6req6; + t5req6 = (struct cpl_t5_act_open_req6 *)t6req6; + break; + default: + pr_err("T%d Chip is not supported\n", + CHELSIO_CHIP_VERSION(adapter_type)); + ret = -EINVAL; + goto clip_release; + } + + OPCODE_TID(req6) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + ((ep->rss_qid<<14)|ep->atid))); + req6->local_port = la6->sin6_port; + req6->peer_port = ra6->sin6_port; + req6->local_ip_hi = *((__be64 *)(la6->sin6_addr.s6_addr)); + req6->local_ip_lo = *((__be64 *)(la6->sin6_addr.s6_addr + 8)); + req6->peer_ip_hi = *((__be64 *)(ra6->sin6_addr.s6_addr)); + req6->peer_ip_lo = *((__be64 *)(ra6->sin6_addr.s6_addr + 8)); + req6->opt0 = cpu_to_be64(opt0); + + if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) { + req6->params = cpu_to_be32(cxgb4_select_ntuple(netdev, + 
ep->l2t)); + req6->opt2 = cpu_to_be32(opt2); + } else { + if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) { + t5req6->params = + cpu_to_be64(FILTER_TUPLE_V(params)); + t5req6->rsvd = cpu_to_be32(isn); + pr_debug("snd_isn %u\n", t5req6->rsvd); + t5req6->opt2 = cpu_to_be32(opt2); + } else { + t6req6->params = + cpu_to_be64(FILTER_TUPLE_V(params)); + t6req6->rsvd = cpu_to_be32(isn); + pr_debug("snd_isn %u\n", t6req6->rsvd); + t6req6->opt2 = cpu_to_be32(opt2); + } + + } + } + + set_bit(ACT_OPEN_REQ, &ep->com.history); + ret = c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +clip_release: + if (ret && ep->com.remote_addr.ss_family == AF_INET6) + cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&la6->sin6_addr.s6_addr, 1); + return ret; +} + +static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, + u8 mpa_rev_to_use) +{ + int mpalen, wrlen, ret; + struct fw_ofld_tx_data_wr *req; + struct mpa_message *mpa; + struct mpa_v2_conn_params mpa_v2_params; + + pr_debug("ep %p tid %u pd_len %d\n", + ep, ep->hwtid, ep->plen); + + mpalen = sizeof(*mpa) + ep->plen; + if (mpa_rev_to_use == 2) + mpalen += sizeof(struct mpa_v2_conn_params); + wrlen = roundup(mpalen + sizeof *req, 16); + skb = get_skb(skb, wrlen, GFP_KERNEL); + if (!skb) { + connect_reply_upcall(ep, -ENOMEM); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + req = skb_put_zero(skb, wrlen); + req->op_to_immdlen = cpu_to_be32( + FW_WR_OP_V(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL_F | + FW_WR_IMMDLEN_V(mpalen)); + req->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(ep->hwtid) | + FW_WR_LEN16_V(wrlen >> 4)); + req->plen = cpu_to_be32(mpalen); + req->tunnel_to_proxy = cpu_to_be32( + FW_OFLD_TX_DATA_WR_FLUSH_F | + FW_OFLD_TX_DATA_WR_SHOVE_F); + + mpa = (struct mpa_message *)(req + 1); + memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); + + mpa->flags = 0; + if (crc_enabled) + mpa->flags |= MPA_CRC; + if (markers_enabled) { + mpa->flags |= MPA_MARKERS; + ep->mpa_attr.recv_marker_enabled = 1; + } else { + ep->mpa_attr.recv_marker_enabled = 0; + } + if (mpa_rev_to_use == 2) + mpa->flags |= MPA_ENHANCED_RDMA_CONN; + + mpa->private_data_size = htons(ep->plen); + mpa->revision = mpa_rev_to_use; + if (mpa_rev_to_use == 1) { + ep->tried_with_mpa_v1 = 1; + ep->retry_with_mpa_v1 = 0; + } + + if (mpa_rev_to_use == 2) { + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); + pr_debug("initiator ird %u ord %u\n", ep->ird, + ep->ord); + mpa_v2_params.ird = htons((u16)ep->ird); + mpa_v2_params.ord = htons((u16)ep->ord); + + if (peer2peer) { + mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); + if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_WRITE_RTR); + else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_READ_RTR); + } + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), + ep->mpa_pkt + sizeof(*mpa), ep->plen); + } else + if (ep->plen) + memcpy(mpa->private_data, + ep->mpa_pkt + sizeof(*mpa), ep->plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function fw4_ack() will deref it. 
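Aside: the mpa_v2_conn_params packing a few lines above folds the peer-to-peer model and the requested RTR opcode into the high bits of the ird/ord words. A receive-side decode, sketched here as a hypothetical helper (only the flag and mask names are taken from this file), looks roughly like:

	static void example_decode_mpa_v2(const struct mpa_v2_conn_params *p,
					  u16 *ird, u16 *ord, bool *want_read_rtr)
	{
		u16 raw_ird = ntohs(p->ird);
		u16 raw_ord = ntohs(p->ord);

		*ird = raw_ird & MPA_V2_IRD_ORD_MASK;	/* low bits: read depths */
		*ord = raw_ord & MPA_V2_IRD_ORD_MASK;
		/* high bits: peer-to-peer model and the RTR message choice */
		*want_read_rtr = (raw_ird & MPA_V2_PEER2PEER_MODEL) &&
				 (raw_ord & MPA_V2_RDMA_READ_RTR);
	}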
+ */ + skb_get(skb); + t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + ep->mpa_skb = skb; + ret = c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + if (ret) + return ret; + start_ep_timer(ep); + __state_set(&ep->com, MPA_REQ_SENT); + ep->mpa_attr.initiator = 1; + ep->snd_seq += mpalen; + return ret; +} + +static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) +{ + int mpalen, wrlen; + struct fw_ofld_tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + struct mpa_v2_conn_params mpa_v2_params; + + pr_debug("ep %p tid %u pd_len %d\n", + ep, ep->hwtid, ep->plen); + + mpalen = sizeof(*mpa) + plen; + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) + mpalen += sizeof(struct mpa_v2_conn_params); + wrlen = roundup(mpalen + sizeof *req, 16); + + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + pr_err("%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + req = skb_put_zero(skb, wrlen); + req->op_to_immdlen = cpu_to_be32( + FW_WR_OP_V(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL_F | + FW_WR_IMMDLEN_V(mpalen)); + req->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(ep->hwtid) | + FW_WR_LEN16_V(wrlen >> 4)); + req->plen = cpu_to_be32(mpalen); + req->tunnel_to_proxy = cpu_to_be32( + FW_OFLD_TX_DATA_WR_FLUSH_F | + FW_OFLD_TX_DATA_WR_SHOVE_F); + + mpa = (struct mpa_message *)(req + 1); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = MPA_REJECT; + mpa->revision = ep->mpa_attr.version; + mpa->private_data_size = htons(plen); + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + mpa->flags |= MPA_ENHANCED_RDMA_CONN; + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); + mpa_v2_params.ird = htons(((u16)ep->ird) | + (peer2peer ? MPA_V2_PEER2PEER_MODEL : + 0)); + mpa_v2_params.ord = htons(((u16)ep->ord) | (peer2peer ? + (p2p_type == + FW_RI_INIT_P2PTYPE_RDMA_WRITE ? + MPA_V2_RDMA_WRITE_RTR : p2p_type == + FW_RI_INIT_P2PTYPE_READ_REQ ? + MPA_V2_RDMA_READ_RTR : 0) : 0)); + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), pdata, plen); + } else + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb again. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function fw4_ack() will deref it. 
+ */ + skb_get(skb); + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + t4_set_arp_err_handler(skb, NULL, mpa_start_arp_failure); + ep->mpa_skb = skb; + ep->snd_seq += mpalen; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) +{ + int mpalen, wrlen; + struct fw_ofld_tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + struct mpa_v2_conn_params mpa_v2_params; + + pr_debug("ep %p tid %u pd_len %d\n", + ep, ep->hwtid, ep->plen); + + mpalen = sizeof(*mpa) + plen; + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) + mpalen += sizeof(struct mpa_v2_conn_params); + wrlen = roundup(mpalen + sizeof *req, 16); + + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + pr_err("%s - cannot alloc skb!\n", __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + req = skb_put_zero(skb, wrlen); + req->op_to_immdlen = cpu_to_be32( + FW_WR_OP_V(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL_F | + FW_WR_IMMDLEN_V(mpalen)); + req->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(ep->hwtid) | + FW_WR_LEN16_V(wrlen >> 4)); + req->plen = cpu_to_be32(mpalen); + req->tunnel_to_proxy = cpu_to_be32( + FW_OFLD_TX_DATA_WR_FLUSH_F | + FW_OFLD_TX_DATA_WR_SHOVE_F); + + mpa = (struct mpa_message *)(req + 1); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = 0; + if (ep->mpa_attr.crc_enabled) + mpa->flags |= MPA_CRC; + if (ep->mpa_attr.recv_marker_enabled) + mpa->flags |= MPA_MARKERS; + mpa->revision = ep->mpa_attr.version; + mpa->private_data_size = htons(plen); + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + mpa->flags |= MPA_ENHANCED_RDMA_CONN; + mpa->private_data_size = htons(ntohs(mpa->private_data_size) + + sizeof (struct mpa_v2_conn_params)); + mpa_v2_params.ird = htons((u16)ep->ird); + mpa_v2_params.ord = htons((u16)ep->ord); + if (peer2peer && (ep->mpa_attr.p2p_type != + FW_RI_INIT_P2PTYPE_DISABLED)) { + mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL); + + if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_WRITE_RTR); + else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) + mpa_v2_params.ord |= + htons(MPA_V2_RDMA_READ_RTR); + } + + memcpy(mpa->private_data, &mpa_v2_params, + sizeof(struct mpa_v2_conn_params)); + + if (ep->plen) + memcpy(mpa->private_data + + sizeof(struct mpa_v2_conn_params), pdata, plen); + } else + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function fw4_ack() will deref it. 
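Put differently, the skb_get() just below and the transmit-ack path form a pair: the extra reference keeps the MPA payload resident until the hardware reports the send complete, at which point the ack handler (fw4_ack() per the comment; its body is outside this hunk, so this is a sketch of the contract rather than its implementation) drops it:

	/* ack path, once the hardware has consumed the MPA frame */
	if (ep->mpa_skb) {
		kfree_skb(ep->mpa_skb);		/* balances the skb_get() below */
		ep->mpa_skb = NULL;
	}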
+ */ + skb_get(skb); + t4_set_arp_err_handler(skb, NULL, mpa_start_arp_failure); + ep->mpa_skb = skb; + __state_set(&ep->com, MPA_REP_SENT); + ep->snd_seq += mpalen; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + unsigned int atid = TID_TID_G(ntohl(req->tos_atid)); + struct tid_info *t = dev->rdev.lldi.tids; + int ret; + + ep = lookup_atid(t, atid); + + pr_debug("ep %p tid %u snd_isn %u rcv_isn %u\n", ep, tid, + be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn)); + + mutex_lock(&ep->com.mutex); + dst_confirm(ep->dst); + + /* setup the hwtid for this connection */ + ep->hwtid = tid; + cxgb4_insert_tid(t, ep, tid, ep->com.local_addr.ss_family); + insert_ep_tid(ep); + + ep->snd_seq = be32_to_cpu(req->snd_isn); + ep->rcv_seq = be32_to_cpu(req->rcv_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + /* dealloc the atid */ + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid); + cxgb4_free_atid(t, atid); + set_bit(ACT_ESTAB, &ep->com.history); + + /* start MPA negotiation */ + ret = send_flowc(ep); + if (ret) + goto err; + if (ep->retry_with_mpa_v1) + ret = send_mpa_req(ep, skb, 1); + else + ret = send_mpa_req(ep, skb, mpa_rev); + if (ret) + goto err; + mutex_unlock(&ep->com.mutex); + return 0; +err: + mutex_unlock(&ep->com.mutex); + connect_reply_upcall(ep, -ENOMEM); + c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + return 0; +} + +static void close_complete_upcall(struct c4iw_ep *ep, int status) +{ + struct iw_cm_event event; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = status; + if (ep->com.cm_id) { + pr_debug("close complete delivered ep %p cm_id %p tid %u\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + deref_cm_id(&ep->com); + set_bit(CLOSE_UPCALL, &ep->com.history); + } +} + +static void peer_close_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_DISCONNECT; + if (ep->com.cm_id) { + pr_debug("peer close delivered ep %p cm_id %p tid %u\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + set_bit(DISCONN_UPCALL, &ep->com.history); + } +} + +static void peer_abort_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = -ECONNRESET; + if (ep->com.cm_id) { + pr_debug("abort delivered ep %p cm_id %p tid %u\n", ep, + ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + deref_cm_id(&ep->com); + set_bit(ABORT_UPCALL, &ep->com.history); + } +} + +static void connect_reply_upcall(struct c4iw_ep *ep, int status) +{ + struct iw_cm_event event; + + pr_debug("ep %p tid %u status %d\n", + ep, ep->hwtid, status); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REPLY; + event.status = status; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); + + if ((status == 0) || (status == -ECONNREFUSED)) { + if (!ep->tried_with_mpa_v1) { + /* this means MPA_v2 is used */ + event.ord = ep->ird; + event.ird = ep->ord; + event.private_data_len 
= ep->plen - + sizeof(struct mpa_v2_conn_params); + event.private_data = ep->mpa_pkt + + sizeof(struct mpa_message) + + sizeof(struct mpa_v2_conn_params); + } else { + /* this means MPA_v1 is used */ + event.ord = cur_max_read_depth(ep->com.dev); + event.ird = cur_max_read_depth(ep->com.dev); + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + + sizeof(struct mpa_message); + } + } + + pr_debug("ep %p tid %u status %d\n", ep, + ep->hwtid, status); + set_bit(CONN_RPL_UPCALL, &ep->com.history); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + + if (status < 0) + deref_cm_id(&ep->com); +} + +static int connect_request_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + int ret; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + memcpy(&event.local_addr, &ep->com.local_addr, + sizeof(ep->com.local_addr)); + memcpy(&event.remote_addr, &ep->com.remote_addr, + sizeof(ep->com.remote_addr)); + event.provider_data = ep; + if (!ep->tried_with_mpa_v1) { + /* this means MPA_v2 is used */ + event.ord = ep->ord; + event.ird = ep->ird; + event.private_data_len = ep->plen - + sizeof(struct mpa_v2_conn_params); + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) + + sizeof(struct mpa_v2_conn_params); + } else { + /* this means MPA_v1 is used. Send max supported */ + event.ord = cur_max_read_depth(ep->com.dev); + event.ird = cur_max_read_depth(ep->com.dev); + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + } + c4iw_get_ep(&ep->com); + ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id, + &event); + if (ret) + c4iw_put_ep(&ep->com); + set_bit(CONNREQ_UPCALL, &ep->com.history); + c4iw_put_ep(&ep->parent_ep->com); + return ret; +} + +static void established_upcall(struct c4iw_ep *ep) +{ + struct iw_cm_event event; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_ESTABLISHED; + event.ird = ep->ord; + event.ord = ep->ird; + if (ep->com.cm_id) { + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + set_bit(ESTAB_UPCALL, &ep->com.history); + } +} + +static int update_rx_credits(struct c4iw_ep *ep, u32 credits) +{ + struct sk_buff *skb; + u32 wrlen = roundup(sizeof(struct cpl_rx_data_ack), 16); + u32 credit_dack; + + pr_debug("ep %p tid %u credits %u\n", + ep, ep->hwtid, credits); + skb = get_skb(NULL, wrlen, GFP_KERNEL); + if (!skb) { + pr_err("update_rx_credits - cannot alloc skb!\n"); + return 0; + } + + /* + * If we couldn't specify the entire rcv window at connection setup + * due to the limit in the number of bits in the RCV_BUFSIZ field, + * then add the overage in to the credits returned. + */ + if (ep->rcv_win > RCV_BUFSIZ_M * 1024) + credits += ep->rcv_win - RCV_BUFSIZ_M * 1024; + + credit_dack = credits | RX_FORCE_ACK_F | RX_DACK_CHANGE_F | + RX_DACK_MODE_V(dack_mode); + + cxgb_mk_rx_data_ack(skb, wrlen, ep->hwtid, ep->ctrlq_idx, + credit_dack); + + c4iw_ofld_send(&ep->com.dev->rdev, skb); + return credits; +} + +#define RELAXED_IRD_NEGOTIATION 1 + +/* + * process_mpa_reply - process streaming mode MPA reply + * + * Returns: + * + * 0 upon success indicating a connect request was delivered to the ULP + * or the mpa request is incomplete but valid so far. + * + * 1 if a failure requires the caller to close the connection. + * + * 2 if a failure requires the caller to abort the connection. 
+ */ +static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + struct mpa_v2_conn_params *mpa_v2_params; + u16 plen; + u16 resp_ird, resp_ord; + u8 rtr_mismatch = 0, insuff_ird = 0; + struct c4iw_qp_attributes attrs; + enum c4iw_qp_attr_mask mask; + int err; + int disconnect = 0; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + err = -EINVAL; + goto err_stop_timer; + } + + /* + * copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * if we don't even have the mpa message, then bail. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return 0; + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* Validate MPA header. */ + if (mpa->revision > mpa_rev) { + pr_err("%s MPA version mismatch. Local = %d, Received = %d\n", + __func__, mpa_rev, mpa->revision); + err = -EPROTO; + goto err_stop_timer; + } + if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { + err = -EPROTO; + goto err_stop_timer; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + err = -EPROTO; + goto err_stop_timer; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + err = -EPROTO; + goto err_stop_timer; + } + + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return 0; + + if (mpa->flags & MPA_REJECT) { + err = -ECONNREFUSED; + goto err_stop_timer; + } + + /* + * Stop mpa timer. If it expired, then + * we ignore the MPA reply. process_timeout() + * will abort the connection. + */ + if (stop_ep_timer(ep)) + return 0; + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. And + * the MPA header is valid. + */ + __state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa->revision; + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + + if (mpa->revision == 2) { + ep->mpa_attr.enhanced_rdma_conn = + mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0; + if (ep->mpa_attr.enhanced_rdma_conn) { + mpa_v2_params = (struct mpa_v2_conn_params *) + (ep->mpa_pkt + sizeof(*mpa)); + resp_ird = ntohs(mpa_v2_params->ird) & + MPA_V2_IRD_ORD_MASK; + resp_ord = ntohs(mpa_v2_params->ord) & + MPA_V2_IRD_ORD_MASK; + pr_debug("responder ird %u ord %u ep ird %u ord %u\n", + resp_ird, resp_ord, ep->ird, ep->ord); + + /* + * This is a double-check. 
Ideally, below checks are + * not required since ird/ord stuff has been taken + * care of in c4iw_accept_cr + */ + if (ep->ird < resp_ord) { + if (RELAXED_IRD_NEGOTIATION && resp_ord <= + ep->com.dev->rdev.lldi.max_ordird_qp) + ep->ird = resp_ord; + else + insuff_ird = 1; + } else if (ep->ird > resp_ord) { + ep->ird = resp_ord; + } + if (ep->ord > resp_ird) { + if (RELAXED_IRD_NEGOTIATION) + ep->ord = resp_ird; + else + insuff_ird = 1; + } + if (insuff_ird) { + err = -ENOMEM; + ep->ird = resp_ord; + ep->ord = resp_ird; + } + + if (ntohs(mpa_v2_params->ird) & + MPA_V2_PEER2PEER_MODEL) { + if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_WRITE_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_RDMA_WRITE; + else if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_READ_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_READ_REQ; + } + } + } else if (mpa->revision == 1) + if (peer2peer) + ep->mpa_attr.p2p_type = p2p_type; + + pr_debug("crc_enabled=%d, recv_marker_enabled=%d, xmit_marker_enabled=%d, version=%d p2p_type=%d local-p2p_type = %d\n", + ep->mpa_attr.crc_enabled, + ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, + ep->mpa_attr.p2p_type, p2p_type); + + /* + * If responder's RTR does not match with that of initiator, assign + * FW_RI_INIT_P2PTYPE_DISABLED in mpa attributes so that RTR is not + * generated when moving QP to RTS state. + * A TERM message will be sent after QP has moved to RTS state + */ + if ((ep->mpa_attr.version == 2) && peer2peer && + (ep->mpa_attr.p2p_type != p2p_type)) { + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + rtr_mismatch = 1; + } + + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = C4IW_QP_STATE_RTS; + + mask = C4IW_QP_ATTR_NEXT_STATE | + C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR | + C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD; + + /* bind QP and TID with INIT_WR */ + err = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err; + + /* + * If responder's RTR requirement did not match with what initiator + * supports, generate TERM message + */ + if (rtr_mismatch) { + pr_err("%s: RTR mismatch, sending TERM\n", __func__); + attrs.layer_etype = LAYER_MPA | DDP_LLP; + attrs.ecode = MPA_NOMATCH_RTR; + attrs.next_state = C4IW_QP_STATE_TERMINATE; + attrs.send_term = 1; + err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + err = -ENOMEM; + disconnect = 1; + goto out; + } + + /* + * Generate TERM if initiator IRD is not sufficient for responder + * provided ORD. Currently, we do the same behaviour even when + * responder provided IRD is also not sufficient as regards to + * initiator ORD. + */ + if (insuff_ird) { + pr_err("%s: Insufficient IRD, sending TERM\n", __func__); + attrs.layer_etype = LAYER_MPA | DDP_LLP; + attrs.ecode = MPA_INSUFF_IRD; + attrs.next_state = C4IW_QP_STATE_TERMINATE; + attrs.send_term = 1; + err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + err = -ENOMEM; + disconnect = 1; + goto out; + } + goto out; +err_stop_timer: + stop_ep_timer(ep); +err: + disconnect = 2; +out: + connect_reply_upcall(ep, err); + return disconnect; +} + +/* + * process_mpa_request - process streaming mode MPA request + * + * Returns: + * + * 0 upon success indicating a connect request was delivered to the ULP + * or the mpa request is incomplete but valid so far. 
+ * + * 1 if a failure requires the caller to close the connection. + * + * 2 if a failure requires the caller to abort the connection. + */ +static int process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + struct mpa_v2_conn_params *mpa_v2_params; + u16 plen; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) + goto err_stop_timer; + + pr_debug("enter (%s line %u)\n", __FILE__, __LINE__); + + /* + * Copy the new data into our accumulation buffer. + */ + skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]), + skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * If we don't even have the mpa message, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return 0; + + pr_debug("enter (%s line %u)\n", __FILE__, __LINE__); + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* + * Validate MPA Header. + */ + if (mpa->revision > mpa_rev) { + pr_err("%s MPA version mismatch. Local = %d, Received = %d\n", + __func__, mpa_rev, mpa->revision); + goto err_stop_timer; + } + + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) + goto err_stop_timer; + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) + goto err_stop_timer; + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) + goto err_stop_timer; + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return 0; + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. + */ + ep->mpa_attr.initiator = 0; + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa->revision; + if (mpa->revision == 1) + ep->tried_with_mpa_v1 = 1; + ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED; + + if (mpa->revision == 2) { + ep->mpa_attr.enhanced_rdma_conn = + mpa->flags & MPA_ENHANCED_RDMA_CONN ? 
1 : 0; + if (ep->mpa_attr.enhanced_rdma_conn) { + mpa_v2_params = (struct mpa_v2_conn_params *) + (ep->mpa_pkt + sizeof(*mpa)); + ep->ird = ntohs(mpa_v2_params->ird) & + MPA_V2_IRD_ORD_MASK; + ep->ird = min_t(u32, ep->ird, + cur_max_read_depth(ep->com.dev)); + ep->ord = ntohs(mpa_v2_params->ord) & + MPA_V2_IRD_ORD_MASK; + ep->ord = min_t(u32, ep->ord, + cur_max_read_depth(ep->com.dev)); + pr_debug("initiator ird %u ord %u\n", + ep->ird, ep->ord); + if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL) + if (peer2peer) { + if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_WRITE_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_RDMA_WRITE; + else if (ntohs(mpa_v2_params->ord) & + MPA_V2_RDMA_READ_RTR) + ep->mpa_attr.p2p_type = + FW_RI_INIT_P2PTYPE_READ_REQ; + } + } + } else if (mpa->revision == 1) + if (peer2peer) + ep->mpa_attr.p2p_type = p2p_type; + + pr_debug("crc_enabled=%d, recv_marker_enabled=%d, xmit_marker_enabled=%d, version=%d p2p_type=%d\n", + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, + ep->mpa_attr.p2p_type); + + __state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + mutex_lock_nested(&ep->parent_ep->com.mutex, SINGLE_DEPTH_NESTING); + if (ep->parent_ep->com.state != DEAD) { + if (connect_request_upcall(ep)) + goto err_unlock_parent; + } else { + goto err_unlock_parent; + } + mutex_unlock(&ep->parent_ep->com.mutex); + return 0; + +err_unlock_parent: + mutex_unlock(&ep->parent_ep->com.mutex); + goto err_out; +err_stop_timer: + (void)stop_ep_timer(ep); +err_out: + return 2; +} + +static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_rx_data *hdr = cplhdr(skb); + unsigned int dlen = ntohs(hdr->len); + unsigned int tid = GET_TID(hdr); + __u8 status = hdr->status; + int disconnect = 0; + + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + pr_debug("ep %p tid %u dlen %u\n", ep, ep->hwtid, dlen); + skb_pull(skb, sizeof(*hdr)); + skb_trim(skb, dlen); + mutex_lock(&ep->com.mutex); + + switch (ep->com.state) { + case MPA_REQ_SENT: + update_rx_credits(ep, dlen); + ep->rcv_seq += dlen; + disconnect = process_mpa_reply(ep, skb); + break; + case MPA_REQ_WAIT: + update_rx_credits(ep, dlen); + ep->rcv_seq += dlen; + disconnect = process_mpa_request(ep, skb); + break; + case FPDU_MODE: { + struct c4iw_qp_attributes attrs; + + update_rx_credits(ep, dlen); + if (status) + pr_err("%s Unexpected streaming data." 
\ + " qpid %u ep %p state %d tid %u status %d\n", + __func__, ep->com.qp->wq.sq.qid, ep, + ep->com.state, ep->hwtid, status); + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + disconnect = 1; + break; + } + default: + break; + } + mutex_unlock(&ep->com.mutex); + if (disconnect) + c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL); + c4iw_put_ep(&ep->com); + return 0; +} + +static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_abort_rpl_rss *rpl = cplhdr(skb); + int release = 0; + unsigned int tid = GET_TID(rpl); + + ep = get_ep_from_tid(dev, tid); + if (!ep) { + pr_warn("Abort rpl to freed endpoint\n"); + return 0; + } + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case ABORTING: + c4iw_wake_up_noref(ep->com.wr_waitp, -ECONNRESET); + __state_set(&ep->com, DEAD); + release = 1; + break; + default: + pr_err("%s ep %p state %d\n", __func__, ep, ep->com.state); + break; + } + mutex_unlock(&ep->com.mutex); + + if (release) + release_ep_resources(ep); + c4iw_put_ep(&ep->com); + return 0; +} + +static int send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) +{ + struct sk_buff *skb; + struct fw_ofld_connection_wr *req; + unsigned int mtu_idx; + u32 wscale; + struct sockaddr_in *sin; + int win; + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + req = __skb_put_zero(skb, sizeof(*req)); + req->op_compl = htonl(WR_OP_V(FW_OFLD_CONNECTION_WR)); + req->len16_pkd = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*req), 16))); + req->le.filter = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); + sin = (struct sockaddr_in *)&ep->com.local_addr; + req->le.lport = sin->sin_port; + req->le.u.ipv4.lip = sin->sin_addr.s_addr; + sin = (struct sockaddr_in *)&ep->com.remote_addr; + req->le.pport = sin->sin_port; + req->le.u.ipv4.pip = sin->sin_addr.s_addr; + req->tcb.t_state_to_astid = + htonl(FW_OFLD_CONNECTION_WR_T_STATE_V(TCP_SYN_SENT) | + FW_OFLD_CONNECTION_WR_ASTID_V(atid)); + req->tcb.cplrxdataack_cplpassacceptrpl = + htons(FW_OFLD_CONNECTION_WR_CPLRXDATAACK_F); + req->tcb.tx_max = (__force __be32) jiffies; + req->tcb.rcv_adv = htons(1); + cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps, + (ep->com.remote_addr.ss_family == AF_INET) ? 0 : 1); + wscale = cxgb_compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_M) + win = RCV_BUFSIZ_M; + + req->tcb.opt0 = (__force __be64) (TCAM_BYPASS_F | + (nocong ? 
NO_CONG_F : 0) | + KEEP_ALIVE_F | + DELACK_F | + WND_SCALE_V(wscale) | + MSS_IDX_V(mtu_idx) | + L2T_IDX_V(ep->l2t->idx) | + TX_CHAN_V(ep->tx_chan) | + SMAC_SEL_V(ep->smac_idx) | + DSCP_V(ep->tos >> 2) | + ULP_MODE_V(ULP_MODE_TCPDDP) | + RCV_BUFSIZ_V(win)); + req->tcb.opt2 = (__force __be32) (PACE_V(1) | + TX_QUEUE_V(ep->com.dev->rdev.lldi.tx_modq[ep->tx_chan]) | + RX_CHANNEL_V(0) | + CCTRL_ECN_V(enable_ecn) | + RSS_QUEUE_VALID_F | RSS_QUEUE_V(ep->rss_qid)); + if (enable_tcp_timestamps) + req->tcb.opt2 |= (__force __be32)TSTAMPS_EN_F; + if (enable_tcp_sack) + req->tcb.opt2 |= (__force __be32)SACK_EN_F; + if (wscale && enable_tcp_window_scaling) + req->tcb.opt2 |= (__force __be32)WND_SCALE_EN_F; + req->tcb.opt0 = cpu_to_be64((__force u64)req->tcb.opt0); + req->tcb.opt2 = cpu_to_be32((__force u32)req->tcb.opt2); + set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx); + set_bit(ACT_OFLD_CONN, &ep->com.history); + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +/* + * Some of the error codes above implicitly indicate that there is no TID + * allocated with the result of an ACT_OPEN. We use this predicate to make + * that explicit. + */ +static inline int act_open_has_tid(int status) +{ + return (status != CPL_ERR_TCAM_PARITY && + status != CPL_ERR_TCAM_MISS && + status != CPL_ERR_TCAM_FULL && + status != CPL_ERR_CONN_EXIST_SYNRECV && + status != CPL_ERR_CONN_EXIST); +} + +static char *neg_adv_str(unsigned int status) +{ + switch (status) { + case CPL_ERR_RTX_NEG_ADVICE: + return "Retransmit timeout"; + case CPL_ERR_PERSIST_NEG_ADVICE: + return "Persist timeout"; + case CPL_ERR_KEEPALV_NEG_ADVICE: + return "Keepalive timeout"; + default: + return "Unknown"; + } +} + +static void set_tcp_window(struct c4iw_ep *ep, struct port_info *pi) +{ + ep->snd_win = snd_win; + ep->rcv_win = rcv_win; + pr_debug("snd_win %d rcv_win %d\n", + ep->snd_win, ep->rcv_win); +} + +#define ACT_OPEN_RETRY_COUNT 2 + +static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, + struct dst_entry *dst, struct c4iw_dev *cdev, + bool clear_mpa_v1, enum chip_type adapter_type, u8 tos) +{ + struct neighbour *n; + int err, step; + struct net_device *pdev; + + n = dst_neigh_lookup(dst, peer_ip); + if (!n) + return -ENODEV; + + rcu_read_lock(); + err = -ENOMEM; + if (n->dev->flags & IFF_LOOPBACK) { + if (iptype == 4) + pdev = ip_dev_find(&init_net, *(__be32 *)peer_ip); + else if (IS_ENABLED(CONFIG_IPV6)) + for_each_netdev(&init_net, pdev) { + if (ipv6_chk_addr(&init_net, + (struct in6_addr *)peer_ip, + pdev, 1)) + break; + } + else + pdev = NULL; + + if (!pdev) { + err = -ENODEV; + goto out; + } + ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, + n, pdev, rt_tos2priority(tos)); + if (!ep->l2t) { + dev_put(pdev); + goto out; + } + ep->mtu = pdev->mtu; + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = cxgb4_tp_smt_idx(adapter_type, + cxgb4_port_viid(pdev)); + step = cdev->rdev.lldi.ntxq / + cdev->rdev.lldi.nchan; + ep->txq_idx = cxgb4_port_idx(pdev) * step; + step = cdev->rdev.lldi.nrxq / + cdev->rdev.lldi.nchan; + ep->ctrlq_idx = cxgb4_port_idx(pdev); + ep->rss_qid = cdev->rdev.lldi.rxq_ids[ + cxgb4_port_idx(pdev) * step]; + set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); + dev_put(pdev); + } else { + pdev = get_real_dev(n->dev); + ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, + n, pdev, 0); + if (!ep->l2t) + goto out; + ep->mtu = dst_mtu(dst); + ep->tx_chan = cxgb4_port_chan(pdev); + ep->smac_idx = cxgb4_tp_smt_idx(adapter_type, + cxgb4_port_viid(pdev)); + step = cdev->rdev.lldi.ntxq / + 
cdev->rdev.lldi.nchan; + ep->txq_idx = cxgb4_port_idx(pdev) * step; + ep->ctrlq_idx = cxgb4_port_idx(pdev); + step = cdev->rdev.lldi.nrxq / + cdev->rdev.lldi.nchan; + ep->rss_qid = cdev->rdev.lldi.rxq_ids[ + cxgb4_port_idx(pdev) * step]; + set_tcp_window(ep, (struct port_info *)netdev_priv(pdev)); + + if (clear_mpa_v1) { + ep->retry_with_mpa_v1 = 0; + ep->tried_with_mpa_v1 = 0; + } + } + err = 0; +out: + rcu_read_unlock(); + + neigh_release(n); + + return err; +} + +static int c4iw_reconnect(struct c4iw_ep *ep) +{ + int err = 0; + int size = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *) + &ep->com.cm_id->m_local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *) + &ep->com.cm_id->m_remote_addr; + struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *) + &ep->com.cm_id->m_local_addr; + struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *) + &ep->com.cm_id->m_remote_addr; + int iptype; + __u8 *ra; + + pr_debug("qp %p cm_id %p\n", ep->com.qp, ep->com.cm_id); + c4iw_init_wr_wait(ep->com.wr_waitp); + + /* When MPA revision is different on nodes, the node with MPA_rev=2 + * tries to reconnect with MPA_rev 1 for the same EP through + * c4iw_reconnect(), where the same EP is assigned with new tid for + * further connection establishment. As we are using the same EP pointer + * for reconnect, few skbs are used during the previous c4iw_connect(), + * which leaves the EP with inadequate skbs for further + * c4iw_reconnect(), Further causing a crash due to an empty + * skb_list() during peer_abort(). Allocate skbs which is already used. + */ + size = (CN_MAX_CON_BUF - skb_queue_len(&ep->com.ep_skb_list)); + if (alloc_ep_skb_list(&ep->com.ep_skb_list, size)) { + err = -ENOMEM; + goto fail1; + } + + /* + * Allocate an active TID to initiate a TCP connection. + */ + ep->atid = cxgb4_alloc_atid(ep->com.dev->rdev.lldi.tids, ep); + if (ep->atid == -1) { + pr_err("%s - cannot alloc atid\n", __func__); + err = -ENOMEM; + goto fail2; + } + insert_handle(ep->com.dev, &ep->com.dev->atid_idr, ep, ep->atid); + + /* find a route */ + if (ep->com.cm_id->m_local_addr.ss_family == AF_INET) { + ep->dst = cxgb_find_route(&ep->com.dev->rdev.lldi, get_real_dev, + laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, + laddr->sin_port, + raddr->sin_port, ep->com.cm_id->tos); + iptype = 4; + ra = (__u8 *)&raddr->sin_addr; + } else { + ep->dst = cxgb_find_route6(&ep->com.dev->rdev.lldi, + get_real_dev, + laddr6->sin6_addr.s6_addr, + raddr6->sin6_addr.s6_addr, + laddr6->sin6_port, + raddr6->sin6_port, 0, + raddr6->sin6_scope_id); + iptype = 6; + ra = (__u8 *)&raddr6->sin6_addr; + } + if (!ep->dst) { + pr_err("%s - cannot find route\n", __func__); + err = -EHOSTUNREACH; + goto fail3; + } + err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, false, + ep->com.dev->rdev.lldi.adapter_type, + ep->com.cm_id->tos); + if (err) { + pr_err("%s - cannot alloc l2e\n", __func__); + goto fail4; + } + + pr_debug("txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", + ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid, + ep->l2t->idx); + + state_set(&ep->com, CONNECTING); + ep->tos = ep->com.cm_id->tos; + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + cxgb4_l2t_release(ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); +fail2: + /* + * remember to send notification to upper layer. 
+ * We are in here so the upper layer is not aware that this is + * re-connect attempt and so, upper layer is still waiting for + * response of 1st connect request. + */ + connect_reply_upcall(ep, -ECONNRESET); +fail1: + c4iw_put_ep(&ep->com); +out: + return err; +} + +static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + unsigned int atid = TID_TID_G(AOPEN_ATID_G( + ntohl(rpl->atid_status))); + struct tid_info *t = dev->rdev.lldi.tids; + int status = AOPEN_STATUS_G(ntohl(rpl->atid_status)); + struct sockaddr_in *la; + struct sockaddr_in *ra; + struct sockaddr_in6 *la6; + struct sockaddr_in6 *ra6; + int ret = 0; + + ep = lookup_atid(t, atid); + la = (struct sockaddr_in *)&ep->com.local_addr; + ra = (struct sockaddr_in *)&ep->com.remote_addr; + la6 = (struct sockaddr_in6 *)&ep->com.local_addr; + ra6 = (struct sockaddr_in6 *)&ep->com.remote_addr; + + pr_debug("ep %p atid %u status %u errno %d\n", ep, atid, + status, status2errno(status)); + + if (cxgb_is_neg_adv(status)) { + pr_debug("Connection problems for atid %u status %u (%s)\n", + atid, status, neg_adv_str(status)); + ep->stats.connect_neg_adv++; + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.neg_adv++; + mutex_unlock(&dev->rdev.stats.lock); + return 0; + } + + set_bit(ACT_OPEN_RPL, &ep->com.history); + + /* + * Log interesting failures. + */ + switch (status) { + case CPL_ERR_CONN_RESET: + case CPL_ERR_CONN_TIMEDOUT: + break; + case CPL_ERR_TCAM_FULL: + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.tcam_full++; + mutex_unlock(&dev->rdev.stats.lock); + if (ep->com.local_addr.ss_family == AF_INET && + dev->rdev.lldi.enable_fw_ofld_conn) { + ret = send_fw_act_open_req(ep, TID_TID_G(AOPEN_ATID_G( + ntohl(rpl->atid_status)))); + if (ret) + goto fail; + return 0; + } + break; + case CPL_ERR_CONN_EXIST: + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + set_bit(ACT_RETRY_INUSE, &ep->com.history); + if (ep->com.remote_addr.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *) + &ep->com.local_addr; + cxgb4_clip_release( + ep->com.dev->rdev.lldi.ports[0], + (const u32 *) + &sin6->sin6_addr.s6_addr, 1); + } + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, + atid); + cxgb4_free_atid(t, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_reconnect(ep); + return 0; + } + break; + default: + if (ep->com.local_addr.ss_family == AF_INET) { + pr_info("Active open failure - atid %u status %u errno %d %pI4:%u->%pI4:%u\n", + atid, status, status2errno(status), + &la->sin_addr.s_addr, ntohs(la->sin_port), + &ra->sin_addr.s_addr, ntohs(ra->sin_port)); + } else { + pr_info("Active open failure - atid %u status %u errno %d %pI6:%u->%pI6:%u\n", + atid, status, status2errno(status), + la6->sin6_addr.s6_addr, ntohs(la6->sin6_port), + ra6->sin6_addr.s6_addr, ntohs(ra6->sin6_port)); + } + break; + } + +fail: + connect_reply_upcall(ep, status2errno(status)); + state_set(&ep->com, DEAD); + + if (ep->com.remote_addr.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *)&ep->com.local_addr; + cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); + } + if (status && act_open_has_tid(status)) + cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl), + ep->com.local_addr.ss_family); + + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid); + cxgb4_free_atid(t, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); 
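+	/*
+	 * Everything taken for this active open has now been torn down:
+	 * the IPv6 CLIP entry (if any), the hardware TID when the failure
+	 * status implies one was allocated (see act_open_has_tid()), the
+	 * atid and its idr handle, the cached route, the L2T entry and the
+	 * endpoint reference, so the ep is freed once the last reference
+	 * is dropped.
+	 */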
+ + return 0; +} + +static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_pass_open_rpl *rpl = cplhdr(skb); + unsigned int stid = GET_TID(rpl); + struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid); + + if (!ep) { + pr_warn("%s stid %d lookup failure!\n", __func__, stid); + goto out; + } + pr_debug("ep %p status %d error %d\n", ep, + rpl->status, status2errno(rpl->status)); + c4iw_wake_up_noref(ep->com.wr_waitp, status2errno(rpl->status)); + c4iw_put_ep(&ep->com); +out: + return 0; +} + +static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_close_listsvr_rpl *rpl = cplhdr(skb); + unsigned int stid = GET_TID(rpl); + struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid); + + if (!ep) { + pr_warn("%s stid %d lookup failure!\n", __func__, stid); + goto out; + } + pr_debug("ep %p\n", ep); + c4iw_wake_up_noref(ep->com.wr_waitp, status2errno(rpl->status)); + c4iw_put_ep(&ep->com); +out: + return 0; +} + +static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, + struct cpl_pass_accept_req *req) +{ + struct cpl_pass_accept_rpl *rpl; + unsigned int mtu_idx; + u64 opt0; + u32 opt2; + u32 wscale; + struct cpl_t5_pass_accept_rpl *rpl5 = NULL; + int win; + enum chip_type adapter_type = ep->com.dev->rdev.lldi.adapter_type; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + + skb_get(skb); + rpl = cplhdr(skb); + if (!is_t4(adapter_type)) { + skb_trim(skb, roundup(sizeof(*rpl5), 16)); + rpl5 = (void *)rpl; + INIT_TP_WR(rpl5, ep->hwtid); + } else { + skb_trim(skb, sizeof(*rpl)); + INIT_TP_WR(rpl, ep->hwtid); + } + OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + ep->hwtid)); + + cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps && req->tcpopt.tstamp, + (ep->com.remote_addr.ss_family == AF_INET) ? 0 : 1); + wscale = cxgb_compute_wscale(rcv_win); + + /* + * Specify the largest window that will fit in opt0. The + * remainder will be specified in the rx_data_ack. + */ + win = ep->rcv_win >> 10; + if (win > RCV_BUFSIZ_M) + win = RCV_BUFSIZ_M; + opt0 = (nocong ? 
NO_CONG_F : 0) | + KEEP_ALIVE_F | + DELACK_F | + WND_SCALE_V(wscale) | + MSS_IDX_V(mtu_idx) | + L2T_IDX_V(ep->l2t->idx) | + TX_CHAN_V(ep->tx_chan) | + SMAC_SEL_V(ep->smac_idx) | + DSCP_V(ep->tos >> 2) | + ULP_MODE_V(ULP_MODE_TCPDDP) | + RCV_BUFSIZ_V(win); + opt2 = RX_CHANNEL_V(0) | + RSS_QUEUE_VALID_F | RSS_QUEUE_V(ep->rss_qid); + + if (enable_tcp_timestamps && req->tcpopt.tstamp) + opt2 |= TSTAMPS_EN_F; + if (enable_tcp_sack && req->tcpopt.sack) + opt2 |= SACK_EN_F; + if (wscale && enable_tcp_window_scaling) + opt2 |= WND_SCALE_EN_F; + if (enable_ecn) { + const struct tcphdr *tcph; + u32 hlen = ntohl(req->hdr_len); + + if (CHELSIO_CHIP_VERSION(adapter_type) <= CHELSIO_T5) + tcph = (const void *)(req + 1) + ETH_HDR_LEN_G(hlen) + + IP_HDR_LEN_G(hlen); + else + tcph = (const void *)(req + 1) + + T6_ETH_HDR_LEN_G(hlen) + T6_IP_HDR_LEN_G(hlen); + if (tcph->ece && tcph->cwr) + opt2 |= CCTRL_ECN_V(1); + } + if (CHELSIO_CHIP_VERSION(adapter_type) > CHELSIO_T4) { + u32 isn = (prandom_u32() & ~7UL) - 1; + opt2 |= T5_OPT_2_VALID_F; + opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE); + opt2 |= T5_ISS_F; + rpl5 = (void *)rpl; + memset(&rpl5->iss, 0, roundup(sizeof(*rpl5)-sizeof(*rpl), 16)); + if (peer2peer) + isn += 4; + rpl5->iss = cpu_to_be32(isn); + pr_debug("iss %u\n", be32_to_cpu(rpl5->iss)); + } + + rpl->opt0 = cpu_to_be64(opt0); + rpl->opt2 = cpu_to_be32(opt2); + set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); + t4_set_arp_err_handler(skb, ep, pass_accept_rpl_arp_failure); + + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); +} + +static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb) +{ + pr_debug("c4iw_dev %p tid %u\n", dev, hwtid); + skb_trim(skb, sizeof(struct cpl_tid_release)); + release_tid(&dev->rdev, hwtid, skb); + return; +} + +static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *child_ep = NULL, *parent_ep; + struct cpl_pass_accept_req *req = cplhdr(skb); + unsigned int stid = PASS_OPEN_TID_G(ntohl(req->tos_stid)); + struct tid_info *t = dev->rdev.lldi.tids; + unsigned int hwtid = GET_TID(req); + struct dst_entry *dst; + __u8 local_ip[16], peer_ip[16]; + __be16 local_port, peer_port; + struct sockaddr_in6 *sin6; + int err; + u16 peer_mss = ntohs(req->tcpopt.mss); + int iptype; + unsigned short hdrs; + u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); + + parent_ep = (struct c4iw_ep *)get_ep_from_stid(dev, stid); + if (!parent_ep) { + pr_err("%s connect request on invalid stid %d\n", + __func__, stid); + goto reject; + } + + if (state_read(&parent_ep->com) != LISTEN) { + pr_err("%s - listening ep not in LISTEN\n", __func__); + goto reject; + } + + cxgb_get_4tuple(req, parent_ep->com.dev->rdev.lldi.adapter_type, + &iptype, local_ip, peer_ip, &local_port, &peer_port); + + /* Find output route */ + if (iptype == 4) { + pr_debug("parent ep %p hwtid %u laddr %pI4 raddr %pI4 lport %d rport %d peer_mss %d\n" + , parent_ep, hwtid, + local_ip, peer_ip, ntohs(local_port), + ntohs(peer_port), peer_mss); + dst = cxgb_find_route(&dev->rdev.lldi, get_real_dev, + *(__be32 *)local_ip, *(__be32 *)peer_ip, + local_port, peer_port, tos); + } else { + pr_debug("parent ep %p hwtid %u laddr %pI6 raddr %pI6 lport %d rport %d peer_mss %d\n" + , parent_ep, hwtid, + local_ip, peer_ip, ntohs(local_port), + ntohs(peer_port), peer_mss); + dst = cxgb_find_route6(&dev->rdev.lldi, get_real_dev, + local_ip, peer_ip, local_port, peer_port, + PASS_OPEN_TOS_G(ntohl(req->tos_stid)), + ((struct sockaddr_in6 *) + &parent_ep->com.local_addr)->sin6_scope_id); + } + if 
(!dst) { + pr_err("%s - failed to find dst entry!\n", __func__); + goto reject; + } + + child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL); + if (!child_ep) { + pr_err("%s - failed to allocate ep entry!\n", __func__); + dst_release(dst); + goto reject; + } + + err = import_ep(child_ep, iptype, peer_ip, dst, dev, false, + parent_ep->com.dev->rdev.lldi.adapter_type, tos); + if (err) { + pr_err("%s - failed to allocate l2t entry!\n", __func__); + dst_release(dst); + kfree(child_ep); + goto reject; + } + + hdrs = ((iptype == 4) ? sizeof(struct iphdr) : sizeof(struct ipv6hdr)) + + sizeof(struct tcphdr) + + ((enable_tcp_timestamps && req->tcpopt.tstamp) ? 12 : 0); + if (peer_mss && child_ep->mtu > (peer_mss + hdrs)) + child_ep->mtu = peer_mss + hdrs; + + skb_queue_head_init(&child_ep->com.ep_skb_list); + if (alloc_ep_skb_list(&child_ep->com.ep_skb_list, CN_MAX_CON_BUF)) + goto fail; + + state_set(&child_ep->com, CONNECTING); + child_ep->com.dev = dev; + child_ep->com.cm_id = NULL; + + if (iptype == 4) { + struct sockaddr_in *sin = (struct sockaddr_in *) + &child_ep->com.local_addr; + + sin->sin_family = AF_INET; + sin->sin_port = local_port; + sin->sin_addr.s_addr = *(__be32 *)local_ip; + + sin = (struct sockaddr_in *)&child_ep->com.local_addr; + sin->sin_family = AF_INET; + sin->sin_port = ((struct sockaddr_in *) + &parent_ep->com.local_addr)->sin_port; + sin->sin_addr.s_addr = *(__be32 *)local_ip; + + sin = (struct sockaddr_in *)&child_ep->com.remote_addr; + sin->sin_family = AF_INET; + sin->sin_port = peer_port; + sin->sin_addr.s_addr = *(__be32 *)peer_ip; + } else { + sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr; + sin6->sin6_family = PF_INET6; + sin6->sin6_port = local_port; + memcpy(sin6->sin6_addr.s6_addr, local_ip, 16); + + sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr; + sin6->sin6_family = PF_INET6; + sin6->sin6_port = ((struct sockaddr_in6 *) + &parent_ep->com.local_addr)->sin6_port; + memcpy(sin6->sin6_addr.s6_addr, local_ip, 16); + + sin6 = (struct sockaddr_in6 *)&child_ep->com.remote_addr; + sin6->sin6_family = PF_INET6; + sin6->sin6_port = peer_port; + memcpy(sin6->sin6_addr.s6_addr, peer_ip, 16); + } + + c4iw_get_ep(&parent_ep->com); + child_ep->parent_ep = parent_ep; + child_ep->tos = tos; + child_ep->dst = dst; + child_ep->hwtid = hwtid; + + pr_debug("tx_chan %u smac_idx %u rss_qid %u\n", + child_ep->tx_chan, child_ep->smac_idx, child_ep->rss_qid); + + timer_setup(&child_ep->timer, ep_timeout, 0); + cxgb4_insert_tid(t, child_ep, hwtid, + child_ep->com.local_addr.ss_family); + insert_ep_tid(child_ep); + if (accept_cr(child_ep, skb, req)) { + c4iw_put_ep(&parent_ep->com); + release_ep_resources(child_ep); + } else { + set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); + } + if (iptype == 6) { + sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr; + cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); + } + goto out; +fail: + c4iw_put_ep(&child_ep->com); +reject: + reject_cr(dev, hwtid, skb); +out: + if (parent_ep) + c4iw_put_ep(&parent_ep->com); + return 0; +} + +static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_pass_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + int ret; + + ep = get_ep_from_tid(dev, tid); + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + ep->snd_seq = be32_to_cpu(req->snd_isn); + ep->rcv_seq = be32_to_cpu(req->rcv_isn); + + pr_debug("ep %p hwtid %u tcp_opt 0x%02x\n", ep, tid, + ntohs(req->tcp_opt)); + + set_emss(ep, 
ntohs(req->tcp_opt)); + + dst_confirm(ep->dst); + mutex_lock(&ep->com.mutex); + ep->com.state = MPA_REQ_WAIT; + start_ep_timer(ep); + set_bit(PASS_ESTAB, &ep->com.history); + ret = send_flowc(ep); + mutex_unlock(&ep->com.mutex); + if (ret) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); + c4iw_put_ep(&ep->com); + + return 0; +} + +static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_peer_close *hdr = cplhdr(skb); + struct c4iw_ep *ep; + struct c4iw_qp_attributes attrs; + int disconnect = 1; + int release = 0; + unsigned int tid = GET_TID(hdr); + int ret; + + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + dst_confirm(ep->dst); + + set_bit(PEER_CLOSE, &ep->com.history); + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case MPA_REQ_WAIT: + __state_set(&ep->com, CLOSING); + break; + case MPA_REQ_SENT: + __state_set(&ep->com, CLOSING); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. Also wake up anyone waiting + * in rdma connection migration (see c4iw_accept_cr()). + */ + __state_set(&ep->com, CLOSING); + pr_debug("waking up ep %p tid %u\n", ep, ep->hwtid); + c4iw_wake_up_noref(ep->com.wr_waitp, -ECONNRESET); + break; + case MPA_REP_SENT: + __state_set(&ep->com, CLOSING); + pr_debug("waking up ep %p tid %u\n", ep, ep->hwtid); + c4iw_wake_up_noref(ep->com.wr_waitp, -ECONNRESET); + break; + case FPDU_MODE: + start_ep_timer(ep); + __state_set(&ep->com, CLOSING); + attrs.next_state = C4IW_QP_STATE_CLOSING; + ret = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + if (ret != -ECONNRESET) { + peer_close_upcall(ep); + disconnect = 1; + } + break; + case ABORTING: + disconnect = 0; + break; + case CLOSING: + __state_set(&ep->com, MORIBUND); + disconnect = 0; + break; + case MORIBUND: + (void)stop_ep_timer(ep); + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = C4IW_QP_STATE_IDLE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + } + close_complete_upcall(ep, 0); + __state_set(&ep->com, DEAD); + release = 1; + disconnect = 0; + break; + case DEAD: + disconnect = 0; + break; + default: + WARN_ONCE(1, "Bad endpoint state %u\n", ep->com.state); + } + mutex_unlock(&ep->com.mutex); + if (disconnect) + c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + if (release) + release_ep_resources(ep); + c4iw_put_ep(&ep->com); + return 0; +} + +static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct c4iw_ep *ep; + struct sk_buff *rpl_skb; + struct c4iw_qp_attributes attrs; + int ret; + int release = 0; + unsigned int tid = GET_TID(req); + u32 len = roundup(sizeof(struct cpl_abort_rpl), 16); + + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + + if (cxgb_is_neg_adv(req->status)) { + pr_debug("Negative advice on abort- tid %u status %d (%s)\n", + ep->hwtid, req->status, neg_adv_str(req->status)); + ep->stats.abort_neg_adv++; + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.neg_adv++; + mutex_unlock(&dev->rdev.stats.lock); + goto deref_ep; + } + pr_debug("ep %p tid %u state %u\n", ep, ep->hwtid, + ep->com.state); + set_bit(PEER_ABORT, &ep->com.history); + + /* + * Wake up any threads in rdma_init() or rdma_fini(). 
+ * However, this is not needed if com state is just + * MPA_REQ_SENT + */ + if (ep->com.state != MPA_REQ_SENT) + c4iw_wake_up_noref(ep->com.wr_waitp, -ECONNRESET); + + mutex_lock(&ep->com.mutex); + switch (ep->com.state) { + case CONNECTING: + c4iw_put_ep(&ep->parent_ep->com); + break; + case MPA_REQ_WAIT: + (void)stop_ep_timer(ep); + break; + case MPA_REQ_SENT: + (void)stop_ep_timer(ep); + if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1)) + connect_reply_upcall(ep, -ECONNRESET); + else { + /* + * we just don't send notification upwards because we + * want to retry with mpa_v1 without upper layers even + * knowing it. + * + * do some housekeeping so as to re-initiate the + * connection + */ + pr_info("%s: mpa_rev=%d. Retrying with mpav1\n", + __func__, mpa_rev); + ep->retry_with_mpa_v1 = 1; + } + break; + case MPA_REP_SENT: + break; + case MPA_REQ_RCVD: + break; + case MORIBUND: + case CLOSING: + stop_ep_timer(ep); + /*FALLTHROUGH*/ + case FPDU_MODE: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = C4IW_QP_STATE_ERROR; + ret = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + pr_err("%s - qp <- error failed!\n", __func__); + } + peer_abort_upcall(ep); + break; + case ABORTING: + break; + case DEAD: + pr_warn("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__); + mutex_unlock(&ep->com.mutex); + goto deref_ep; + default: + WARN_ONCE(1, "Bad endpoint state %u\n", ep->com.state); + break; + } + dst_confirm(ep->dst); + if (ep->com.state != ABORTING) { + __state_set(&ep->com, DEAD); + /* we don't release if we want to retry with mpa_v1 */ + if (!ep->retry_with_mpa_v1) + release = 1; + } + mutex_unlock(&ep->com.mutex); + + rpl_skb = skb_dequeue(&ep->com.ep_skb_list); + if (WARN_ON(!rpl_skb)) { + release = 1; + goto out; + } + + cxgb_mk_abort_rpl(rpl_skb, len, ep->hwtid, ep->txq_idx); + + c4iw_ofld_send(&ep->com.dev->rdev, rpl_skb); +out: + if (release) + release_ep_resources(ep); + else if (ep->retry_with_mpa_v1) { + if (ep->com.remote_addr.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *) + &ep->com.local_addr; + cxgb4_clip_release( + ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, + 1); + } + remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid); + cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid, + ep->com.local_addr.ss_family); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_reconnect(ep); + } + +deref_ep: + c4iw_put_ep(&ep->com); + /* Dereferencing ep, referenced in peer_abort_intr() */ + c4iw_put_ep(&ep->com); + return 0; +} + +static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct c4iw_qp_attributes attrs; + struct cpl_close_con_rpl *rpl = cplhdr(skb); + int release = 0; + unsigned int tid = GET_TID(rpl); + + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + + /* The cm_id may be null if we failed to connect */ + mutex_lock(&ep->com.mutex); + set_bit(CLOSE_CON_RPL, &ep->com.history); + switch (ep->com.state) { + case CLOSING: + __state_set(&ep->com, MORIBUND); + break; + case MORIBUND: + (void)stop_ep_timer(ep); + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = C4IW_QP_STATE_IDLE; + c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_complete_upcall(ep, 0); + __state_set(&ep->com, DEAD); + release = 1; + break; + case ABORTING: + case DEAD: + break; + default: + WARN_ONCE(1, "Bad 
endpoint state %u\n", ep->com.state); + break; + } + mutex_unlock(&ep->com.mutex); + if (release) + release_ep_resources(ep); + c4iw_put_ep(&ep->com); + return 0; +} + +static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_rdma_terminate *rpl = cplhdr(skb); + unsigned int tid = GET_TID(rpl); + struct c4iw_ep *ep; + struct c4iw_qp_attributes attrs; + + ep = get_ep_from_tid(dev, tid); + + if (ep && ep->com.qp) { + pr_warn("TERM received tid %u qpid %u\n", + tid, ep->com.qp->wq.sq.qid); + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + } else + pr_warn("TERM received tid %u no ep/qp\n", tid); + c4iw_put_ep(&ep->com); + + return 0; +} + +/* + * Upcall from the adapter indicating data has been transmitted. + * For us its just the single MPA request or reply. We can now free + * the skb holding the mpa message. + */ +static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + struct cpl_fw4_ack *hdr = cplhdr(skb); + u8 credits = hdr->credits; + unsigned int tid = GET_TID(hdr); + + + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + pr_debug("ep %p tid %u credits %u\n", + ep, ep->hwtid, credits); + if (credits == 0) { + pr_debug("0 credit ack ep %p tid %u state %u\n", + ep, ep->hwtid, state_read(&ep->com)); + goto out; + } + + dst_confirm(ep->dst); + if (ep->mpa_skb) { + pr_debug("last streaming msg ack ep %p tid %u state %u initiator %u freeing skb\n", + ep, ep->hwtid, state_read(&ep->com), + ep->mpa_attr.initiator ? 1 : 0); + mutex_lock(&ep->com.mutex); + kfree_skb(ep->mpa_skb); + ep->mpa_skb = NULL; + if (test_bit(STOP_MPA_TIMER, &ep->com.flags)) + stop_ep_timer(ep); + mutex_unlock(&ep->com.mutex); + } +out: + c4iw_put_ep(&ep->com); + return 0; +} + +int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int abort; + struct c4iw_ep *ep = to_ep(cm_id); + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + + mutex_lock(&ep->com.mutex); + if (ep->com.state != MPA_REQ_RCVD) { + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); + return -ECONNRESET; + } + set_bit(ULP_REJECT, &ep->com.history); + if (mpa_rev == 0) + abort = 1; + else + abort = send_mpa_reject(ep, pdata, pdata_len); + mutex_unlock(&ep->com.mutex); + + stop_ep_timer(ep); + c4iw_ep_disconnect(ep, abort != 0, GFP_KERNEL); + c4iw_put_ep(&ep->com); + return 0; +} + +int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err; + struct c4iw_qp_attributes attrs; + enum c4iw_qp_attr_mask mask; + struct c4iw_ep *ep = to_ep(cm_id); + struct c4iw_dev *h = to_c4iw_dev(cm_id->device); + struct c4iw_qp *qp = get_qhp(h, conn_param->qpn); + int abort = 0; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); + + mutex_lock(&ep->com.mutex); + if (ep->com.state != MPA_REQ_RCVD) { + err = -ECONNRESET; + goto err_out; + } + + if (!qp) { + err = -EINVAL; + goto err_out; + } + + set_bit(ULP_ACCEPT, &ep->com.history); + if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) || + (conn_param->ird > cur_max_read_depth(ep->com.dev))) { + err = -EINVAL; + goto err_abort; + } + + if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { + if (conn_param->ord > ep->ird) { + if (RELAXED_IRD_NEGOTIATION) { + conn_param->ord = ep->ird; + } else { + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + send_mpa_reject(ep, conn_param->private_data, + conn_param->private_data_len); + err = -ENOMEM; + goto err_abort; + } + } + if (conn_param->ird < ep->ord) { + 
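+			/*
+			 * Worked example of the negotiation below: if the
+			 * peer's MPA request asked for ord = 16 outstanding
+			 * reads but the ULP only offered ird = 8, relaxed
+			 * negotiation quietly raises the accepted ird to 16
+			 * as long as the adapter supports it
+			 * (ep->ord <= max_ordird_qp); with strict
+			 * negotiation the accept fails with -ENOMEM and the
+			 * connection is aborted.
+			 */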
if (RELAXED_IRD_NEGOTIATION && + ep->ord <= h->rdev.lldi.max_ordird_qp) { + conn_param->ird = ep->ord; + } else { + err = -ENOMEM; + goto err_abort; + } + } + } + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (ep->mpa_attr.version == 1) { + if (peer2peer && ep->ird == 0) + ep->ird = 1; + } else { + if (peer2peer && + (ep->mpa_attr.p2p_type != FW_RI_INIT_P2PTYPE_DISABLED) && + (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) && ep->ird == 0) + ep->ird = 1; + } + + pr_debug("ird %d ord %d\n", ep->ird, ep->ord); + + ep->com.cm_id = cm_id; + ref_cm_id(&ep->com); + ep->com.qp = qp; + ref_qp(ep); + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = C4IW_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = C4IW_QP_ATTR_NEXT_STATE | + C4IW_QP_ATTR_LLP_STREAM_HANDLE | + C4IW_QP_ATTR_MPA_ATTR | + C4IW_QP_ATTR_MAX_IRD | + C4IW_QP_ATTR_MAX_ORD; + + err = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (err) + goto err_deref_cm_id; + + set_bit(STOP_MPA_TIMER, &ep->com.flags); + err = send_mpa_reply(ep, conn_param->private_data, + conn_param->private_data_len); + if (err) + goto err_deref_cm_id; + + __state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); + return 0; +err_deref_cm_id: + deref_cm_id(&ep->com); +err_abort: + abort = 1; +err_out: + mutex_unlock(&ep->com.mutex); + if (abort) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); + c4iw_put_ep(&ep->com); + return err; +} + +static int pick_local_ipaddrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) +{ + struct in_device *ind; + int found = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; + + ind = in_dev_get(dev->rdev.lldi.ports[0]); + if (!ind) + return -EADDRNOTAVAIL; + for_primary_ifa(ind) { + laddr->sin_addr.s_addr = ifa->ifa_address; + raddr->sin_addr.s_addr = ifa->ifa_address; + found = 1; + break; + } + endfor_ifa(ind); + in_dev_put(ind); + return found ? 
0 : -EADDRNOTAVAIL; +} + +static int get_lladdr(struct net_device *dev, struct in6_addr *addr, + unsigned char banned_flags) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev != NULL) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & banned_flags)) { + memcpy(addr, &ifp->addr, 16); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return err; +} + +static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) +{ + struct in6_addr uninitialized_var(addr); + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->m_local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->m_remote_addr; + + if (!get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) { + memcpy(la6->sin6_addr.s6_addr, &addr, 16); + memcpy(ra6->sin6_addr.s6_addr, &addr, 16); + return 0; + } + return -EADDRNOTAVAIL; +} + +int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); + struct c4iw_ep *ep; + int err = 0; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in6 *laddr6; + struct sockaddr_in6 *raddr6; + __u8 *ra; + int iptype; + + if ((conn_param->ord > cur_max_read_depth(dev)) || + (conn_param->ird > cur_max_read_depth(dev))) { + err = -EINVAL; + goto out; + } + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + pr_err("%s - cannot alloc ep\n", __func__); + err = -ENOMEM; + goto out; + } + + skb_queue_head_init(&ep->com.ep_skb_list); + if (alloc_ep_skb_list(&ep->com.ep_skb_list, CN_MAX_CON_BUF)) { + err = -ENOMEM; + goto fail1; + } + + timer_setup(&ep->timer, ep_timeout, 0); + ep->plen = conn_param->private_data_len; + if (ep->plen) + memcpy(ep->mpa_pkt + sizeof(struct mpa_message), + conn_param->private_data, ep->plen); + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + if (peer2peer && ep->ord == 0) + ep->ord = 1; + + ep->com.cm_id = cm_id; + ref_cm_id(&ep->com); + ep->com.dev = dev; + ep->com.qp = get_qhp(dev, conn_param->qpn); + if (!ep->com.qp) { + pr_warn("%s qpn 0x%x not found!\n", __func__, conn_param->qpn); + err = -EINVAL; + goto fail2; + } + ref_qp(ep); + pr_debug("qpn 0x%x qp %p cm_id %p\n", conn_param->qpn, + ep->com.qp, cm_id); + + /* + * Allocate an active TID to initiate a TCP connection. + */ + ep->atid = cxgb4_alloc_atid(dev->rdev.lldi.tids, ep); + if (ep->atid == -1) { + pr_err("%s - cannot alloc atid\n", __func__); + err = -ENOMEM; + goto fail2; + } + insert_handle(dev, &dev->atid_idr, ep, ep->atid); + + memcpy(&ep->com.local_addr, &cm_id->m_local_addr, + sizeof(ep->com.local_addr)); + memcpy(&ep->com.remote_addr, &cm_id->m_remote_addr, + sizeof(ep->com.remote_addr)); + + laddr = (struct sockaddr_in *)&ep->com.local_addr; + raddr = (struct sockaddr_in *)&ep->com.remote_addr; + laddr6 = (struct sockaddr_in6 *)&ep->com.local_addr; + raddr6 = (struct sockaddr_in6 *) &ep->com.remote_addr; + + if (cm_id->m_remote_addr.ss_family == AF_INET) { + iptype = 4; + ra = (__u8 *)&raddr->sin_addr; + + /* + * Handle loopback requests to INADDR_ANY. 
+ */ + if (raddr->sin_addr.s_addr == htonl(INADDR_ANY)) { + err = pick_local_ipaddrs(dev, cm_id); + if (err) + goto fail2; + } + + /* find a route */ + pr_debug("saddr %pI4 sport 0x%x raddr %pI4 rport 0x%x\n", + &laddr->sin_addr, ntohs(laddr->sin_port), + ra, ntohs(raddr->sin_port)); + ep->dst = cxgb_find_route(&dev->rdev.lldi, get_real_dev, + laddr->sin_addr.s_addr, + raddr->sin_addr.s_addr, + laddr->sin_port, + raddr->sin_port, cm_id->tos); + } else { + iptype = 6; + ra = (__u8 *)&raddr6->sin6_addr; + + /* + * Handle loopback requests to INADDR_ANY. + */ + if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) { + err = pick_local_ip6addrs(dev, cm_id); + if (err) + goto fail2; + } + + /* find a route */ + pr_debug("saddr %pI6 sport 0x%x raddr %pI6 rport 0x%x\n", + laddr6->sin6_addr.s6_addr, + ntohs(laddr6->sin6_port), + raddr6->sin6_addr.s6_addr, ntohs(raddr6->sin6_port)); + ep->dst = cxgb_find_route6(&dev->rdev.lldi, get_real_dev, + laddr6->sin6_addr.s6_addr, + raddr6->sin6_addr.s6_addr, + laddr6->sin6_port, + raddr6->sin6_port, 0, + raddr6->sin6_scope_id); + } + if (!ep->dst) { + pr_err("%s - cannot find route\n", __func__); + err = -EHOSTUNREACH; + goto fail3; + } + + err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true, + ep->com.dev->rdev.lldi.adapter_type, cm_id->tos); + if (err) { + pr_err("%s - cannot alloc l2e\n", __func__); + goto fail4; + } + + pr_debug("txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n", + ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid, + ep->l2t->idx); + + state_set(&ep->com, CONNECTING); + ep->tos = cm_id->tos; + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + cxgb4_l2t_release(ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); + cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); +fail2: + skb_queue_purge(&ep->com.ep_skb_list); + deref_cm_id(&ep->com); +fail1: + c4iw_put_ep(&ep->com); +out: + return err; +} + +static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) +{ + int err; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) + &ep->com.local_addr; + + if (ipv6_addr_type(&sin6->sin6_addr) != IPV6_ADDR_ANY) { + err = cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); + if (err) + return err; + } + c4iw_init_wr_wait(ep->com.wr_waitp); + err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0], + ep->stid, &sin6->sin6_addr, + sin6->sin6_port, + ep->com.dev->rdev.lldi.rxq_ids[0]); + if (!err) + err = c4iw_wait_for_reply(&ep->com.dev->rdev, + ep->com.wr_waitp, + 0, 0, __func__); + else if (err > 0) + err = net_xmit_errno(err); + if (err) { + cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); + pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", + err, ep->stid, + sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port)); + } + return err; +} + +static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) +{ + int err; + struct sockaddr_in *sin = (struct sockaddr_in *) + &ep->com.local_addr; + + if (dev->rdev.lldi.enable_fw_ofld_conn) { + do { + err = cxgb4_create_server_filter( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + sin->sin_addr.s_addr, sin->sin_port, 0, + ep->com.dev->rdev.lldi.rxq_ids[0], 0, 0); + if (err == -EBUSY) { + if (c4iw_fatal_error(&ep->com.dev->rdev)) { + err = -EIO; + break; + } + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(100)); 
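+				/*
+				 * Sleep for at least one jiffy
+				 * (usecs_to_jiffies() rounds up) before the
+				 * next attempt.  The surrounding loop keeps
+				 * retrying only while the filter allocation
+				 * returns -EBUSY, and bails out with -EIO as
+				 * soon as the adapter hits a fatal error.
+				 */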
+ } + } while (err == -EBUSY); + } else { + c4iw_init_wr_wait(ep->com.wr_waitp); + err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0], + ep->stid, sin->sin_addr.s_addr, sin->sin_port, + 0, ep->com.dev->rdev.lldi.rxq_ids[0]); + if (!err) + err = c4iw_wait_for_reply(&ep->com.dev->rdev, + ep->com.wr_waitp, + 0, 0, __func__); + else if (err > 0) + err = net_xmit_errno(err); + } + if (err) + pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n" + , err, ep->stid, + &sin->sin_addr, ntohs(sin->sin_port)); + return err; +} + +int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int err = 0; + struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); + struct c4iw_listen_ep *ep; + + might_sleep(); + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + pr_err("%s - cannot alloc ep\n", __func__); + err = -ENOMEM; + goto fail1; + } + skb_queue_head_init(&ep->com.ep_skb_list); + pr_debug("ep %p\n", ep); + ep->com.cm_id = cm_id; + ref_cm_id(&ep->com); + ep->com.dev = dev; + ep->backlog = backlog; + memcpy(&ep->com.local_addr, &cm_id->m_local_addr, + sizeof(ep->com.local_addr)); + + /* + * Allocate a server TID. + */ + if (dev->rdev.lldi.enable_fw_ofld_conn && + ep->com.local_addr.ss_family == AF_INET) + ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids, + cm_id->m_local_addr.ss_family, ep); + else + ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids, + cm_id->m_local_addr.ss_family, ep); + + if (ep->stid == -1) { + pr_err("%s - cannot alloc stid\n", __func__); + err = -ENOMEM; + goto fail2; + } + insert_handle(dev, &dev->stid_idr, ep, ep->stid); + + memcpy(&ep->com.local_addr, &cm_id->m_local_addr, + sizeof(ep->com.local_addr)); + + state_set(&ep->com, LISTEN); + if (ep->com.local_addr.ss_family == AF_INET) + err = create_server4(dev, ep); + else + err = create_server6(dev, ep); + if (!err) { + cm_id->provider_data = ep; + goto out; + } + remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid); + cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, + ep->com.local_addr.ss_family); +fail2: + deref_cm_id(&ep->com); + c4iw_put_ep(&ep->com); +fail1: +out: + return err; +} + +int c4iw_destroy_listen(struct iw_cm_id *cm_id) +{ + int err; + struct c4iw_listen_ep *ep = to_listen_ep(cm_id); + + pr_debug("ep %p\n", ep); + + might_sleep(); + state_set(&ep->com, DEAD); + if (ep->com.dev->rdev.lldi.enable_fw_ofld_conn && + ep->com.local_addr.ss_family == AF_INET) { + err = cxgb4_remove_server_filter( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + ep->com.dev->rdev.lldi.rxq_ids[0], 0); + } else { + struct sockaddr_in6 *sin6; + c4iw_init_wr_wait(ep->com.wr_waitp); + err = cxgb4_remove_server( + ep->com.dev->rdev.lldi.ports[0], ep->stid, + ep->com.dev->rdev.lldi.rxq_ids[0], 0); + if (err) + goto done; + err = c4iw_wait_for_reply(&ep->com.dev->rdev, ep->com.wr_waitp, + 0, 0, __func__); + sin6 = (struct sockaddr_in6 *)&ep->com.local_addr; + cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); + } + remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid); + cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, + ep->com.local_addr.ss_family); +done: + deref_cm_id(&ep->com); + c4iw_put_ep(&ep->com); + return err; +} + +int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) +{ + int ret = 0; + int close = 0; + int fatal = 0; + struct c4iw_rdev *rdev; + + mutex_lock(&ep->com.mutex); + + pr_debug("ep %p state %s, abrupt %d\n", ep, + states[ep->com.state], abrupt); + + /* + * Ref the ep here in case we have fatal errors 
causing the + * ep to be released and freed. + */ + c4iw_get_ep(&ep->com); + + rdev = &ep->com.dev->rdev; + if (c4iw_fatal_error(rdev)) { + fatal = 1; + close_complete_upcall(ep, -EIO); + ep->com.state = DEAD; + } + switch (ep->com.state) { + case MPA_REQ_WAIT: + case MPA_REQ_SENT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + case CONNECTING: + close = 1; + if (abrupt) + ep->com.state = ABORTING; + else { + ep->com.state = CLOSING; + + /* + * if we close before we see the fw4_ack() then we fix + * up the timer state since we're reusing it. + */ + if (ep->mpa_skb && + test_bit(STOP_MPA_TIMER, &ep->com.flags)) { + clear_bit(STOP_MPA_TIMER, &ep->com.flags); + stop_ep_timer(ep); + } + start_ep_timer(ep); + } + set_bit(CLOSE_SENT, &ep->com.flags); + break; + case CLOSING: + if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) { + close = 1; + if (abrupt) { + (void)stop_ep_timer(ep); + ep->com.state = ABORTING; + } else + ep->com.state = MORIBUND; + } + break; + case MORIBUND: + case ABORTING: + case DEAD: + pr_debug("ignoring disconnect ep %p state %u\n", + ep, ep->com.state); + break; + default: + WARN_ONCE(1, "Bad endpoint state %u\n", ep->com.state); + break; + } + + if (close) { + if (abrupt) { + set_bit(EP_DISC_ABORT, &ep->com.history); + close_complete_upcall(ep, -ECONNRESET); + ret = send_abort(ep); + } else { + set_bit(EP_DISC_CLOSE, &ep->com.history); + ret = send_halfclose(ep); + } + if (ret) { + set_bit(EP_DISC_FAIL, &ep->com.history); + if (!abrupt) { + stop_ep_timer(ep); + close_complete_upcall(ep, -EIO); + } + if (ep->com.qp) { + struct c4iw_qp_attributes attrs; + + attrs.next_state = C4IW_QP_STATE_ERROR; + ret = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + pr_err("%s - qp <- error failed!\n", + __func__); + } + fatal = 1; + } + } + mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); + if (fatal) + release_ep_resources(ep); + return ret; +} + +static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, + struct cpl_fw6_msg_ofld_connection_wr_rpl *req) +{ + struct c4iw_ep *ep; + int atid = be32_to_cpu(req->tid); + + ep = (struct c4iw_ep *)lookup_atid(dev->rdev.lldi.tids, + (__force u32) req->tid); + if (!ep) + return; + + switch (req->retval) { + case FW_ENOMEM: + set_bit(ACT_RETRY_NOMEM, &ep->com.history); + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + send_fw_act_open_req(ep, atid); + return; + } + /* fall through */ + case FW_EADDRINUSE: + set_bit(ACT_RETRY_INUSE, &ep->com.history); + if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) { + send_fw_act_open_req(ep, atid); + return; + } + break; + default: + pr_info("%s unexpected ofld conn wr retval %d\n", + __func__, req->retval); + break; + } + pr_err("active ofld_connect_wr failure %d atid %d\n", + req->retval, atid); + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.act_ofld_conn_fails++; + mutex_unlock(&dev->rdev.stats.lock); + connect_reply_upcall(ep, status2errno(req->retval)); + state_set(&ep->com, DEAD); + if (ep->com.remote_addr.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *)&ep->com.local_addr; + cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); + } + remove_handle(dev, &dev->atid_idr, atid); + cxgb4_free_atid(dev->rdev.lldi.tids, atid); + dst_release(ep->dst); + cxgb4_l2t_release(ep->l2t); + c4iw_put_ep(&ep->com); +} + +static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, + struct cpl_fw6_msg_ofld_connection_wr_rpl *req) 
+{ + struct sk_buff *rpl_skb; + struct cpl_pass_accept_req *cpl; + int ret; + + rpl_skb = (struct sk_buff *)(unsigned long)req->cookie; + if (req->retval) { + pr_err("%s passive open failure %d\n", __func__, req->retval); + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.pas_ofld_conn_fails++; + mutex_unlock(&dev->rdev.stats.lock); + kfree_skb(rpl_skb); + } else { + cpl = (struct cpl_pass_accept_req *)cplhdr(rpl_skb); + OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, + (__force u32) htonl( + (__force u32) req->tid))); + ret = pass_accept_req(dev, rpl_skb); + if (!ret) + kfree_skb(rpl_skb); + } + return; +} + +static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_fw6_msg *rpl = cplhdr(skb); + struct cpl_fw6_msg_ofld_connection_wr_rpl *req; + + switch (rpl->type) { + case FW6_TYPE_CQE: + c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]); + break; + case FW6_TYPE_OFLD_CONNECTION_WR_RPL: + req = (struct cpl_fw6_msg_ofld_connection_wr_rpl *)rpl->data; + switch (req->t_state) { + case TCP_SYN_SENT: + active_ofld_conn_reply(dev, skb, req); + break; + case TCP_SYN_RECV: + passive_ofld_conn_reply(dev, skb, req); + break; + default: + pr_err("%s unexpected ofld conn wr state %d\n", + __func__, req->t_state); + break; + } + break; + } + return 0; +} + +static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos) +{ + __be32 l2info; + __be16 hdr_len, vlantag, len; + u16 eth_hdr_len; + int tcp_hdr_len, ip_hdr_len; + u8 intf; + struct cpl_rx_pkt *cpl = cplhdr(skb); + struct cpl_pass_accept_req *req; + struct tcp_options_received tmp_opt; + struct c4iw_dev *dev; + enum chip_type type; + + dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); + /* Store values from cpl_rx_pkt in temporary location. */ + vlantag = cpl->vlan; + len = cpl->len; + l2info = cpl->l2info; + hdr_len = cpl->hdr_len; + intf = cpl->iff; + + __skb_pull(skb, sizeof(*req) + sizeof(struct rss_header)); + + /* + * We need to parse the TCP options from SYN packet. + * to generate cpl_pass_accept_req. + */ + memset(&tmp_opt, 0, sizeof(tmp_opt)); + tcp_clear_options(&tmp_opt); + tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL); + + req = __skb_push(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + req->l2info = cpu_to_be16(SYN_INTF_V(intf) | + SYN_MAC_IDX_V(RX_MACIDX_G( + be32_to_cpu(l2info))) | + SYN_XACT_MATCH_F); + type = dev->rdev.lldi.adapter_type; + tcp_hdr_len = RX_TCPHDR_LEN_G(be16_to_cpu(hdr_len)); + ip_hdr_len = RX_IPHDR_LEN_G(be16_to_cpu(hdr_len)); + req->hdr_len = + cpu_to_be32(SYN_RX_CHAN_V(RX_CHAN_G(be32_to_cpu(l2info)))); + if (CHELSIO_CHIP_VERSION(type) <= CHELSIO_T5) { + eth_hdr_len = is_t4(type) ? 
+ RX_ETHHDR_LEN_G(be32_to_cpu(l2info)) : + RX_T5_ETHHDR_LEN_G(be32_to_cpu(l2info)); + req->hdr_len |= cpu_to_be32(TCP_HDR_LEN_V(tcp_hdr_len) | + IP_HDR_LEN_V(ip_hdr_len) | + ETH_HDR_LEN_V(eth_hdr_len)); + } else { /* T6 and later */ + eth_hdr_len = RX_T6_ETHHDR_LEN_G(be32_to_cpu(l2info)); + req->hdr_len |= cpu_to_be32(T6_TCP_HDR_LEN_V(tcp_hdr_len) | + T6_IP_HDR_LEN_V(ip_hdr_len) | + T6_ETH_HDR_LEN_V(eth_hdr_len)); + } + req->vlan = vlantag; + req->len = len; + req->tos_stid = cpu_to_be32(PASS_OPEN_TID_V(stid) | + PASS_OPEN_TOS_V(tos)); + req->tcpopt.mss = htons(tmp_opt.mss_clamp); + if (tmp_opt.wscale_ok) + req->tcpopt.wsf = tmp_opt.snd_wscale; + req->tcpopt.tstamp = tmp_opt.saw_tstamp; + if (tmp_opt.sack_ok) + req->tcpopt.sack = 1; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, 0)); + return; +} + +static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb, + __be32 laddr, __be16 lport, + __be32 raddr, __be16 rport, + u32 rcv_isn, u32 filter, u16 window, + u32 rss_qid, u8 port_id) +{ + struct sk_buff *req_skb; + struct fw_ofld_connection_wr *req; + struct cpl_pass_accept_req *cpl = cplhdr(skb); + int ret; + + req_skb = alloc_skb(sizeof(struct fw_ofld_connection_wr), GFP_KERNEL); + if (!req_skb) + return; + req = __skb_put_zero(req_skb, sizeof(*req)); + req->op_compl = htonl(WR_OP_V(FW_OFLD_CONNECTION_WR) | FW_WR_COMPL_F); + req->len16_pkd = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*req), 16))); + req->le.version_cpl = htonl(FW_OFLD_CONNECTION_WR_CPL_F); + req->le.filter = (__force __be32) filter; + req->le.lport = lport; + req->le.pport = rport; + req->le.u.ipv4.lip = laddr; + req->le.u.ipv4.pip = raddr; + req->tcb.rcv_nxt = htonl(rcv_isn + 1); + req->tcb.rcv_adv = htons(window); + req->tcb.t_state_to_astid = + htonl(FW_OFLD_CONNECTION_WR_T_STATE_V(TCP_SYN_RECV) | + FW_OFLD_CONNECTION_WR_RCV_SCALE_V(cpl->tcpopt.wsf) | + FW_OFLD_CONNECTION_WR_ASTID_V( + PASS_OPEN_TID_G(ntohl(cpl->tos_stid)))); + + /* + * We store the qid in opt2 which will be used by the firmware + * to send us the wr response. + */ + req->tcb.opt2 = htonl(RSS_QUEUE_V(rss_qid)); + + /* + * We initialize the MSS index in TCB to 0xF. + * So that when driver sends cpl_pass_accept_rpl + * TCB picks up the correct value. If this was 0 + * TP will ignore any value > 0 for MSS index. + */ + req->tcb.opt0 = cpu_to_be64(MSS_IDX_V(0xF)); + req->cookie = (uintptr_t)skb; + + set_wr_txq(req_skb, CPL_PRIORITY_CONTROL, port_id); + ret = cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb); + if (ret < 0) { + pr_err("%s - cxgb4_ofld_send error %d - dropping\n", __func__, + ret); + kfree_skb(skb); + kfree_skb(req_skb); + } +} + +/* + * Handler for CPL_RX_PKT message. Need to handle cpl_rx_pkt + * messages when a filter is being used instead of server to + * redirect a syn packet. When packets hit filter they are redirected + * to the offload queue and driver tries to establish the connection + * using firmware work request. 
+ */
+static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	int stid;
+	unsigned int filter;
+	struct ethhdr *eh = NULL;
+	struct vlan_ethhdr *vlan_eh = NULL;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct rss_header *rss = (void *)skb->data;
+	struct cpl_rx_pkt *cpl = (void *)skb->data;
+	struct cpl_pass_accept_req *req = (void *)(rss + 1);
+	struct l2t_entry *e;
+	struct dst_entry *dst;
+	struct c4iw_ep *lep = NULL;
+	u16 window;
+	struct port_info *pi;
+	struct net_device *pdev;
+	u16 rss_qid, eth_hdr_len;
+	int step;
+	struct neighbour *neigh;
+
+	/* Drop all non-SYN packets */
+	if (!(cpl->l2info & cpu_to_be32(RXF_SYN_F)))
+		goto reject;
+
+	/*
+	 * Drop all packets which did not hit the filter.
+	 * Unlikely to happen.
+	 */
+	if (!(rss->filter_hit && rss->filter_tid))
+		goto reject;
+
+	/*
+	 * Calculate the server tid from filter hit index from cpl_rx_pkt.
+	 */
+	stid = (__force int) cpu_to_be32((__force u32) rss->hash_val);
+
+	lep = (struct c4iw_ep *)get_ep_from_stid(dev, stid);
+	if (!lep) {
+		pr_warn("%s connect request on invalid stid %d\n",
+			__func__, stid);
+		goto reject;
+	}
+
+	switch (CHELSIO_CHIP_VERSION(dev->rdev.lldi.adapter_type)) {
+	case CHELSIO_T4:
+		eth_hdr_len = RX_ETHHDR_LEN_G(be32_to_cpu(cpl->l2info));
+		break;
+	case CHELSIO_T5:
+		eth_hdr_len = RX_T5_ETHHDR_LEN_G(be32_to_cpu(cpl->l2info));
+		break;
+	case CHELSIO_T6:
+		eth_hdr_len = RX_T6_ETHHDR_LEN_G(be32_to_cpu(cpl->l2info));
+		break;
+	default:
+		pr_err("T%d Chip is not supported\n",
+		       CHELSIO_CHIP_VERSION(dev->rdev.lldi.adapter_type));
+		goto reject;
+	}
+
+	if (eth_hdr_len == ETH_HLEN) {
+		eh = (struct ethhdr *)(req + 1);
+		iph = (struct iphdr *)(eh + 1);
+	} else {
+		vlan_eh = (struct vlan_ethhdr *)(req + 1);
+		iph = (struct iphdr *)(vlan_eh + 1);
+		skb->vlan_tci = ntohs(cpl->vlan);
+	}
+
+	if (iph->version != 0x4)
+		goto reject;
+
+	tcph = (struct tcphdr *)(iph + 1);
+	skb_set_network_header(skb, (void *)iph - (void *)rss);
+	skb_set_transport_header(skb, (void *)tcph - (void *)rss);
+	skb_get(skb);
+
+	pr_debug("lip 0x%x lport %u pip 0x%x pport %u tos %d\n",
+		 ntohl(iph->daddr), ntohs(tcph->dest), ntohl(iph->saddr),
+		 ntohs(tcph->source), iph->tos);
+
+	dst = cxgb_find_route(&dev->rdev.lldi, get_real_dev,
+			      iph->daddr, iph->saddr, tcph->dest,
+			      tcph->source, iph->tos);
+	if (!dst) {
+		pr_err("%s - failed to find dst entry!\n", __func__);
+		goto reject;
+	}
+	neigh = dst_neigh_lookup_skb(dst, skb);
+
+	if (!neigh) {
+		pr_err("%s - failed to allocate neigh!\n", __func__);
+		goto free_dst;
+	}
+
+	if (neigh->dev->flags & IFF_LOOPBACK) {
+		pdev = ip_dev_find(&init_net, iph->daddr);
+		e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh,
+				  pdev, 0);
+		pi = (struct port_info *)netdev_priv(pdev);
+		dev_put(pdev);
+	} else {
+		pdev = get_real_dev(neigh->dev);
+		e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh,
+				  pdev, 0);
+		pi = (struct port_info *)netdev_priv(pdev);
+	}
+	neigh_release(neigh);
+	if (!e) {
+		pr_err("%s - failed to allocate l2t entry!\n",
+		       __func__);
+		goto free_dst;
+	}
+
+	step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan;
+	rss_qid = dev->rdev.lldi.rxq_ids[pi->port_id * step];
+	window = (__force u16) htons((__force u16)tcph->window);
+
+	/* Calculate filter portion for LE region. */
+	filter = (__force unsigned int) cpu_to_be32(cxgb4_select_ntuple(
+						    dev->rdev.lldi.ports[0],
+						    e));
+
+	/*
+	 * Synthesize the cpl_pass_accept_req. We have everything except the
+	 * TID.
Once firmware sends a reply with TID we update the TID field + * in cpl and pass it through the regular cpl_pass_accept_req path. + */ + build_cpl_pass_accept_req(skb, stid, iph->tos); + send_fw_pass_open_req(dev, skb, iph->daddr, tcph->dest, iph->saddr, + tcph->source, ntohl(tcph->seq), filter, window, + rss_qid, pi->port_id); + cxgb4_l2t_release(e); +free_dst: + dst_release(dst); +reject: + if (lep) + c4iw_put_ep(&lep->com); + return 0; +} + +/* + * These are the real handlers that are called from a + * work queue. + */ +static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = { + [CPL_ACT_ESTABLISH] = act_establish, + [CPL_ACT_OPEN_RPL] = act_open_rpl, + [CPL_RX_DATA] = rx_data, + [CPL_ABORT_RPL_RSS] = abort_rpl, + [CPL_ABORT_RPL] = abort_rpl, + [CPL_PASS_OPEN_RPL] = pass_open_rpl, + [CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl, + [CPL_PASS_ACCEPT_REQ] = pass_accept_req, + [CPL_PASS_ESTABLISH] = pass_establish, + [CPL_PEER_CLOSE] = peer_close, + [CPL_ABORT_REQ_RSS] = peer_abort, + [CPL_CLOSE_CON_RPL] = close_con_rpl, + [CPL_RDMA_TERMINATE] = terminate, + [CPL_FW4_ACK] = fw4_ack, + [CPL_FW6_MSG] = deferred_fw6_msg, + [CPL_RX_PKT] = rx_pkt, + [FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe, + [FAKE_CPL_PASS_PUT_EP_SAFE] = _put_pass_ep_safe +}; + +static void process_timeout(struct c4iw_ep *ep) +{ + struct c4iw_qp_attributes attrs; + int abort = 1; + + mutex_lock(&ep->com.mutex); + pr_debug("ep %p tid %u state %d\n", ep, ep->hwtid, ep->com.state); + set_bit(TIMEDOUT, &ep->com.history); + switch (ep->com.state) { + case MPA_REQ_SENT: + connect_reply_upcall(ep, -ETIMEDOUT); + break; + case MPA_REQ_WAIT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + break; + case CLOSING: + case MORIBUND: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = C4IW_QP_STATE_ERROR; + c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_complete_upcall(ep, -ETIMEDOUT); + break; + case ABORTING: + case DEAD: + + /* + * These states are expected if the ep timed out at the same + * time as another thread was calling stop_ep_timer(). + * So we silently do nothing for these states. 
+ */ + abort = 0; + break; + default: + WARN(1, "%s unexpected state ep %p tid %u state %u\n", + __func__, ep, ep->hwtid, ep->com.state); + abort = 0; + } + mutex_unlock(&ep->com.mutex); + if (abort) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); + c4iw_put_ep(&ep->com); +} + +static void process_timedout_eps(void) +{ + struct c4iw_ep *ep; + + spin_lock_irq(&timeout_lock); + while (!list_empty(&timeout_list)) { + struct list_head *tmp; + + tmp = timeout_list.next; + list_del(tmp); + tmp->next = NULL; + tmp->prev = NULL; + spin_unlock_irq(&timeout_lock); + ep = list_entry(tmp, struct c4iw_ep, entry); + process_timeout(ep); + spin_lock_irq(&timeout_lock); + } + spin_unlock_irq(&timeout_lock); +} + +static void process_work(struct work_struct *work) +{ + struct sk_buff *skb = NULL; + struct c4iw_dev *dev; + struct cpl_act_establish *rpl; + unsigned int opcode; + int ret; + + process_timedout_eps(); + while ((skb = skb_dequeue(&rxq))) { + rpl = cplhdr(skb); + dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); + opcode = rpl->ot.opcode; + + if (opcode >= ARRAY_SIZE(work_handlers) || + !work_handlers[opcode]) { + pr_err("No handler for opcode 0x%x.\n", opcode); + kfree_skb(skb); + } else { + ret = work_handlers[opcode](dev, skb); + if (!ret) + kfree_skb(skb); + } + process_timedout_eps(); + } +} + +static DECLARE_WORK(skb_work, process_work); + +static void ep_timeout(struct timer_list *t) +{ + struct c4iw_ep *ep = from_timer(ep, t, timer); + int kickit = 0; + + spin_lock(&timeout_lock); + if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) { + /* + * Only insert if it is not already on the list. + */ + if (!ep->entry.next) { + list_add_tail(&ep->entry, &timeout_list); + kickit = 1; + } + } + spin_unlock(&timeout_lock); + if (kickit) + queue_work(workq, &skb_work); +} + +/* + * All the CM events are handled on a work queue to have a safe context. + */ +static int sched(struct c4iw_dev *dev, struct sk_buff *skb) +{ + + /* + * Save dev in the skb->cb area. + */ + *((struct c4iw_dev **) (skb->cb + sizeof(void *))) = dev; + + /* + * Queue the skb and schedule the worker thread. + */ + skb_queue_tail(&rxq, skb); + queue_work(workq, &skb_work); + return 0; +} + +static int set_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_set_tcb_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) { + pr_err("Unexpected SET_TCB_RPL status %u for tid %u\n", + rpl->status, GET_TID(rpl)); + } + kfree_skb(skb); + return 0; +} + +static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_fw6_msg *rpl = cplhdr(skb); + struct c4iw_wr_wait *wr_waitp; + int ret; + + pr_debug("type %u\n", rpl->type); + + switch (rpl->type) { + case FW6_TYPE_WR_RPL: + ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff); + wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1]; + pr_debug("wr_waitp %p ret %u\n", wr_waitp, ret); + if (wr_waitp) + c4iw_wake_up_deref(wr_waitp, ret ? 
-ret : 0); + kfree_skb(skb); + break; + case FW6_TYPE_CQE: + case FW6_TYPE_OFLD_CONNECTION_WR_RPL: + sched(dev, skb); + break; + default: + pr_err("%s unexpected fw6 msg type %u\n", + __func__, rpl->type); + kfree_skb(skb); + break; + } + return 0; +} + +static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct c4iw_ep *ep; + unsigned int tid = GET_TID(req); + + ep = get_ep_from_tid(dev, tid); + /* This EP will be dereferenced in peer_abort() */ + if (!ep) { + pr_warn("Abort on non-existent endpoint, tid %d\n", tid); + kfree_skb(skb); + return 0; + } + if (cxgb_is_neg_adv(req->status)) { + pr_debug("Negative advice on abort- tid %u status %d (%s)\n", + ep->hwtid, req->status, + neg_adv_str(req->status)); + goto out; + } + pr_debug("ep %p tid %u state %u\n", ep, ep->hwtid, ep->com.state); + + c4iw_wake_up_noref(ep->com.wr_waitp, -ECONNRESET); +out: + sched(dev, skb); + return 0; +} + +/* + * Most upcalls from the T4 Core go to sched() to + * schedule the processing on a work queue. + */ +c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = sched, + [CPL_ACT_OPEN_RPL] = sched, + [CPL_RX_DATA] = sched, + [CPL_ABORT_RPL_RSS] = sched, + [CPL_ABORT_RPL] = sched, + [CPL_PASS_OPEN_RPL] = sched, + [CPL_CLOSE_LISTSRV_RPL] = sched, + [CPL_PASS_ACCEPT_REQ] = sched, + [CPL_PASS_ESTABLISH] = sched, + [CPL_PEER_CLOSE] = sched, + [CPL_CLOSE_CON_RPL] = sched, + [CPL_ABORT_REQ_RSS] = peer_abort_intr, + [CPL_RDMA_TERMINATE] = sched, + [CPL_FW4_ACK] = sched, + [CPL_SET_TCB_RPL] = set_tcb_rpl, + [CPL_FW6_MSG] = fw6_msg, + [CPL_RX_PKT] = sched +}; + +int __init c4iw_cm_init(void) +{ + spin_lock_init(&timeout_lock); + skb_queue_head_init(&rxq); + + workq = alloc_ordered_workqueue("iw_cxgb4", WQ_MEM_RECLAIM); + if (!workq) + return -ENOMEM; + + return 0; +} + +void c4iw_cm_term(void) +{ + WARN_ON(!list_empty(&timeout_list)); + flush_workqueue(workq); + destroy_workqueue(workq); +} diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c new file mode 100644 index 0000000..2be2e1a --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -0,0 +1,1041 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iw_cxgb4.h" + +static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, + struct c4iw_dev_ucontext *uctx, struct sk_buff *skb, + struct c4iw_wr_wait *wr_waitp) +{ + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + int wr_len; + int ret; + + wr_len = sizeof *res_wr + sizeof *res; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = __skb_put_zero(skb, wr_len); + res_wr->op_nres = cpu_to_be32( + FW_WR_OP_V(FW_RI_RES_WR) | + FW_RI_RES_WR_NRES_V(1) | + FW_WR_COMPL_F); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (uintptr_t)wr_waitp; + res = res_wr->res; + res->u.cq.restype = FW_RI_RES_TYPE_CQ; + res->u.cq.op = FW_RI_RES_OP_RESET; + res->u.cq.iqid = cpu_to_be32(cq->cqid); + + c4iw_init_wr_wait(wr_waitp); + ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__); + + kfree(cq->sw_queue); + dma_free_coherent(&(rdev->lldi.pdev->dev), + cq->memsize, cq->queue, + dma_unmap_addr(cq, mapping)); + c4iw_put_cqid(rdev, cq->cqid, uctx); + return ret; +} + +static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, + struct c4iw_dev_ucontext *uctx, + struct c4iw_wr_wait *wr_waitp) +{ + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + int wr_len; + int user = (uctx != &rdev->uctx); + int ret; + struct sk_buff *skb; + + cq->cqid = c4iw_get_cqid(rdev, uctx); + if (!cq->cqid) { + ret = -ENOMEM; + goto err1; + } + + if (!user) { + cq->sw_queue = kzalloc(cq->memsize, GFP_KERNEL); + if (!cq->sw_queue) { + ret = -ENOMEM; + goto err2; + } + } + cq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, cq->memsize, + &cq->dma_addr, GFP_KERNEL); + if (!cq->queue) { + ret = -ENOMEM; + goto err3; + } + dma_unmap_addr_set(cq, mapping, cq->dma_addr); + memset(cq->queue, 0, cq->memsize); + + /* build fw_ri_res_wr */ + wr_len = sizeof *res_wr + sizeof *res; + + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto err4; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = __skb_put_zero(skb, wr_len); + res_wr->op_nres = cpu_to_be32( + FW_WR_OP_V(FW_RI_RES_WR) | + FW_RI_RES_WR_NRES_V(1) | + FW_WR_COMPL_F); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (uintptr_t)wr_waitp; + res = res_wr->res; + res->u.cq.restype = FW_RI_RES_TYPE_CQ; + res->u.cq.op = FW_RI_RES_OP_WRITE; + res->u.cq.iqid = cpu_to_be32(cq->cqid); + res->u.cq.iqandst_to_iqandstindex = cpu_to_be32( + FW_RI_RES_WR_IQANUS_V(0) | + FW_RI_RES_WR_IQANUD_V(1) | + FW_RI_RES_WR_IQANDST_F | + FW_RI_RES_WR_IQANDSTINDEX_V( + rdev->lldi.ciq_ids[cq->vector])); + res->u.cq.iqdroprss_to_iqesize = cpu_to_be16( + FW_RI_RES_WR_IQDROPRSS_F | + FW_RI_RES_WR_IQPCIECH_V(2) | + FW_RI_RES_WR_IQINTCNTTHRESH_V(0) | + FW_RI_RES_WR_IQO_F | + FW_RI_RES_WR_IQESIZE_V(1)); + res->u.cq.iqsize = cpu_to_be16(cq->size); + res->u.cq.iqaddr = cpu_to_be64(cq->dma_addr); + + c4iw_init_wr_wait(wr_waitp); + ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__); + if (ret) + goto err4; + + cq->gen = 1; + cq->gts = rdev->lldi.gts_reg; + cq->rdev = rdev; + + cq->bar2_va = c4iw_bar2_addrs(rdev, cq->cqid, T4_BAR2_QTYPE_INGRESS, + &cq->bar2_qid, + user ? 
&cq->bar2_pa : NULL); + if (user && !cq->bar2_pa) { + pr_warn("%s: cqid %u not in BAR2 range\n", + pci_name(rdev->lldi.pdev), cq->cqid); + ret = -EINVAL; + goto err4; + } + return 0; +err4: + dma_free_coherent(&rdev->lldi.pdev->dev, cq->memsize, cq->queue, + dma_unmap_addr(cq, mapping)); +err3: + kfree(cq->sw_queue); +err2: + c4iw_put_cqid(rdev, cq->cqid, uctx); +err1: + return ret; +} + +static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq) +{ + struct t4_cqe cqe; + + pr_debug("wq %p cq %p sw_cidx %u sw_pidx %u\n", + wq, cq, cq->sw_cidx, cq->sw_pidx); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) | + CQE_OPCODE_V(FW_RI_SEND) | + CQE_TYPE_V(0) | + CQE_SWCQE_V(1) | + CQE_QPID_V(wq->sq.qid)); + cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen)); + cq->sw_queue[cq->sw_pidx] = cqe; + t4_swcq_produce(cq); +} + +int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count) +{ + int flushed = 0; + int in_use = wq->rq.in_use - count; + + pr_debug("wq %p cq %p rq.in_use %u skip count %u\n", + wq, cq, wq->rq.in_use, count); + while (in_use--) { + insert_recv_cqe(wq, cq); + flushed++; + } + return flushed; +} + +static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq, + struct t4_swsqe *swcqe) +{ + struct t4_cqe cqe; + + pr_debug("wq %p cq %p sw_cidx %u sw_pidx %u\n", + wq, cq, cq->sw_cidx, cq->sw_pidx); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) | + CQE_OPCODE_V(swcqe->opcode) | + CQE_TYPE_V(1) | + CQE_SWCQE_V(1) | + CQE_QPID_V(wq->sq.qid)); + CQE_WRID_SQ_IDX(&cqe) = swcqe->idx; + cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen)); + cq->sw_queue[cq->sw_pidx] = cqe; + t4_swcq_produce(cq); +} + +static void advance_oldest_read(struct t4_wq *wq); + +int c4iw_flush_sq(struct c4iw_qp *qhp) +{ + int flushed = 0; + struct t4_wq *wq = &qhp->wq; + struct c4iw_cq *chp = to_c4iw_cq(qhp->ibqp.send_cq); + struct t4_cq *cq = &chp->cq; + int idx; + struct t4_swsqe *swsqe; + + if (wq->sq.flush_cidx == -1) + wq->sq.flush_cidx = wq->sq.cidx; + idx = wq->sq.flush_cidx; + while (idx != wq->sq.pidx) { + swsqe = &wq->sq.sw_sq[idx]; + swsqe->flushed = 1; + insert_sq_cqe(wq, cq, swsqe); + if (wq->sq.oldest_read == swsqe) { + advance_oldest_read(wq); + } + flushed++; + if (++idx == wq->sq.size) + idx = 0; + } + wq->sq.flush_cidx += flushed; + if (wq->sq.flush_cidx >= wq->sq.size) + wq->sq.flush_cidx -= wq->sq.size; + return flushed; +} + +static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq) +{ + struct t4_swsqe *swsqe; + int cidx; + + if (wq->sq.flush_cidx == -1) + wq->sq.flush_cidx = wq->sq.cidx; + cidx = wq->sq.flush_cidx; + + while (cidx != wq->sq.pidx) { + swsqe = &wq->sq.sw_sq[cidx]; + if (!swsqe->signaled) { + if (++cidx == wq->sq.size) + cidx = 0; + } else if (swsqe->complete) { + + /* + * Insert this completed cqe into the swcq. 
+ */ + pr_debug("moving cqe into swcq sq idx %u cq idx %u\n", + cidx, cq->sw_pidx); + swsqe->cqe.header |= htonl(CQE_SWCQE_V(1)); + cq->sw_queue[cq->sw_pidx] = swsqe->cqe; + t4_swcq_produce(cq); + swsqe->flushed = 1; + if (++cidx == wq->sq.size) + cidx = 0; + wq->sq.flush_cidx = cidx; + } else + break; + } +} + +static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe, + struct t4_cqe *read_cqe) +{ + read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx; + read_cqe->len = htonl(wq->sq.oldest_read->read_len); + read_cqe->header = htonl(CQE_QPID_V(CQE_QPID(hw_cqe)) | + CQE_SWCQE_V(SW_CQE(hw_cqe)) | + CQE_OPCODE_V(FW_RI_READ_REQ) | + CQE_TYPE_V(1)); + read_cqe->bits_type_ts = hw_cqe->bits_type_ts; +} + +static void advance_oldest_read(struct t4_wq *wq) +{ + + u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1; + + if (rptr == wq->sq.size) + rptr = 0; + while (rptr != wq->sq.pidx) { + wq->sq.oldest_read = &wq->sq.sw_sq[rptr]; + + if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ) + return; + if (++rptr == wq->sq.size) + rptr = 0; + } + wq->sq.oldest_read = NULL; +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. + * Deal with out-of-order and/or completions that complete + * prior unsignalled WRs. + */ +void c4iw_flush_hw_cq(struct c4iw_cq *chp, struct c4iw_qp *flush_qhp) +{ + struct t4_cqe *hw_cqe, *swcqe, read_cqe; + struct c4iw_qp *qhp; + struct t4_swsqe *swsqe; + int ret; + + pr_debug("cqid 0x%x\n", chp->cq.cqid); + ret = t4_next_hw_cqe(&chp->cq, &hw_cqe); + + /* + * This logic is similar to poll_cq(), but not quite the same + * unfortunately. Need to move pertinent HW CQEs to the SW CQ but + * also do any translation magic that poll_cq() normally does. + */ + while (!ret) { + qhp = get_qhp(chp->rhp, CQE_QPID(hw_cqe)); + + /* + * drop CQEs with no associated QP + */ + if (qhp == NULL) + goto next_cqe; + + if (flush_qhp != qhp) { + spin_lock(&qhp->lock); + + if (qhp->wq.flushed == 1) + goto next_cqe; + } + + if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) + goto next_cqe; + + if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) { + + /* If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(hw_cqe) == 1) + goto next_cqe; + + /* drop peer2peer RTR reads. + */ + if (CQE_WRID_STAG(hw_cqe) == 1) + goto next_cqe; + + /* + * Eat completions for unsignaled read WRs. + */ + if (!qhp->wq.sq.oldest_read->signaled) { + advance_oldest_read(&qhp->wq); + goto next_cqe; + } + + /* + * Don't write to the HWCQ, create a new read req CQE + * in local memory and move it into the swcq. + */ + create_read_req_cqe(&qhp->wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(&qhp->wq); + } + + /* if its a SQ completion, then do the magic to move all the + * unsignaled and now in-order completions into the swcq. 
+ */
+		if (SQ_TYPE(hw_cqe)) {
+			swsqe = &qhp->wq.sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)];
+			swsqe->cqe = *hw_cqe;
+			swsqe->complete = 1;
+			flush_completed_wrs(&qhp->wq, &chp->cq);
+		} else {
+			swcqe = &chp->cq.sw_queue[chp->cq.sw_pidx];
+			*swcqe = *hw_cqe;
+			swcqe->header |= cpu_to_be32(CQE_SWCQE_V(1));
+			t4_swcq_produce(&chp->cq);
+		}
+next_cqe:
+		t4_hwcq_consume(&chp->cq);
+		ret = t4_next_hw_cqe(&chp->cq, &hw_cqe);
+		if (qhp && flush_qhp != qhp)
+			spin_unlock(&qhp->lock);
+	}
+}
+
+static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq)
+{
+	if (DRAIN_CQE(cqe)) {
+		WARN_ONCE(1, "Unexpected DRAIN CQE qp id %u!\n", wq->sq.qid);
+		return 0;
+	}
+
+	if (CQE_OPCODE(cqe) == FW_RI_TERMINATE)
+		return 0;
+
+	if ((CQE_OPCODE(cqe) == FW_RI_RDMA_WRITE) && RQ_TYPE(cqe))
+		return 0;
+
+	if ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) && SQ_TYPE(cqe))
+		return 0;
+
+	if (CQE_SEND_OPCODE(cqe) && RQ_TYPE(cqe) && t4_rq_empty(wq))
+		return 0;
+	return 1;
+}
+
+void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count)
+{
+	struct t4_cqe *cqe;
+	u32 ptr;
+
+	*count = 0;
+	pr_debug("count zero %d\n", *count);
+	ptr = cq->sw_cidx;
+	while (ptr != cq->sw_pidx) {
+		cqe = &cq->sw_queue[ptr];
+		if (RQ_TYPE(cqe) && (CQE_OPCODE(cqe) != FW_RI_READ_RESP) &&
+		    (CQE_QPID(cqe) == wq->sq.qid) && cqe_completes_wr(cqe, wq))
+			(*count)++;
+		if (++ptr == cq->size)
+			ptr = 0;
+	}
+	pr_debug("cq %p count %d\n", cq, *count);
+}
+
+/*
+ * poll_cq
+ *
+ * Caller must:
+ *     check the validity of the first CQE,
+ *     supply the wq associated with the qpid.
+ *
+ * credit: cq credit to return to sge.
+ * cqe_flushed: 1 iff the CQE is flushed.
+ * cqe: copy of the polled CQE.
+ *
+ * return value:
+ *     0             CQE returned ok.
+ *     -EAGAIN       CQE skipped, try again.
+ *     -EOVERFLOW    CQ overflow detected.
+ */
+static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
+		   u8 *cqe_flushed, u64 *cookie, u32 *credit)
+{
+	int ret = 0;
+	struct t4_cqe *hw_cqe, read_cqe;
+
+	*cqe_flushed = 0;
+	*credit = 0;
+	ret = t4_next_cqe(cq, &hw_cqe);
+	if (ret)
+		return ret;
+
+	pr_debug("CQE OVF %u qpid 0x%0x genbit %u type %u status 0x%0x opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
+		 CQE_OVFBIT(hw_cqe), CQE_QPID(hw_cqe),
+		 CQE_GENBIT(hw_cqe), CQE_TYPE(hw_cqe), CQE_STATUS(hw_cqe),
+		 CQE_OPCODE(hw_cqe), CQE_LEN(hw_cqe), CQE_WRID_HI(hw_cqe),
+		 CQE_WRID_LOW(hw_cqe));
+
+	/*
+	 * skip cqe's not affiliated with a QP.
+	 */
+	if (wq == NULL) {
+		ret = -EAGAIN;
+		goto skip_cqe;
+	}
+
+	/*
+	 * skip hw cqe's if the wq is flushed.
+	 */
+	if (wq->flushed && !SW_CQE(hw_cqe)) {
+		ret = -EAGAIN;
+		goto skip_cqe;
+	}
+
+	/*
+	 * skip TERMINATE cqes...
+	 */
+	if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) {
+		ret = -EAGAIN;
+		goto skip_cqe;
+	}
+
+	/*
+	 * Special cqe for drain WR completions...
+	 */
+	if (DRAIN_CQE(hw_cqe)) {
+		*cookie = CQE_DRAIN_COOKIE(hw_cqe);
+		*cqe = *hw_cqe;
+		goto skip_cqe;
+	}
+
+	/*
+	 * Gotta tweak READ completions:
+	 *	1) the cqe doesn't contain the sq_wptr from the wr.
+	 *	2) opcode not reflected from the wr.
+	 *	3) read_len not reflected from the wr.
+	 *	4) cq_type is RQ_TYPE not SQ_TYPE.
+ */ + if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) { + + /* If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(hw_cqe) == 1) { + if (CQE_STATUS(hw_cqe)) + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* If this is an unsolicited read response, then the read + * was generated by the kernel driver as part of peer-2-peer + * connection setup. So ignore the completion. + */ + if (CQE_WRID_STAG(hw_cqe) == 1) { + if (CQE_STATUS(hw_cqe)) + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Eat completions for unsignaled read WRs. + */ + if (!wq->sq.oldest_read->signaled) { + advance_oldest_read(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(wq); + } + + if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) { + *cqe_flushed = (CQE_STATUS(hw_cqe) == T4_ERR_SWFLUSH); + t4_set_wq_in_error(wq); + } + + /* + * RECV completion. + */ + if (RQ_TYPE(hw_cqe)) { + + /* + * HW only validates 4 bits of MSN. So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with T4_ERR_MSN and mark the wq in + * error. + */ + + if (t4_rq_empty(wq)) { + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + if (unlikely(!CQE_STATUS(hw_cqe) && + CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) { + t4_set_wq_in_error(wq); + hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN)); + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(hw_cqe) && (CQE_WRID_SQ_IDX(hw_cqe) != wq->sq.cidx)) { + struct t4_swsqe *swsqe; + + pr_debug("out of order completion going in sw_sq at idx %u\n", + CQE_WRID_SQ_IDX(hw_cqe)); + swsqe = &wq->sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)]; + swsqe->cqe = *hw_cqe; + swsqe->complete = 1; + ret = -EAGAIN; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(hw_cqe)) { + int idx = CQE_WRID_SQ_IDX(hw_cqe); + + /* + * Account for any unsignaled completions completed by + * this signaled completion. In this case, cidx points + * to the first unsignaled one, and idx points to the + * signaled one. So adjust in_use based on this delta. + * if this is not completing any unsigned wrs, then the + * delta will be 0. Handle wrapping also! + */ + if (idx < wq->sq.cidx) + wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx; + else + wq->sq.in_use -= idx - wq->sq.cidx; + + wq->sq.cidx = (uint16_t)idx; + pr_debug("completing sq idx %u\n", wq->sq.cidx); + *cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id; + if (c4iw_wr_log) + c4iw_log_wr_stats(wq, hw_cqe); + t4_sq_consume(wq); + } else { + pr_debug("completing rq idx %u\n", wq->rq.cidx); + *cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id; + if (c4iw_wr_log) + c4iw_log_wr_stats(wq, hw_cqe); + t4_rq_consume(wq); + goto skip_cqe; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. 
+ */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(hw_cqe)) { + pr_debug("cq %p cqid 0x%x skip sw cqe cidx %u\n", + cq, cq->cqid, cq->sw_cidx); + t4_swcq_consume(cq); + } else { + pr_debug("cq %p cqid 0x%x skip hw cqe cidx %u\n", + cq, cq->cqid, cq->cidx); + t4_hwcq_consume(cq); + } + return ret; +} + +/* + * Get one cq entry from c4iw and map it to openib. + * + * Returns: + * 0 cqe returned + * -ENODATA EMPTY; + * -EAGAIN caller must try again + * any other -errno fatal error + */ +static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) +{ + struct c4iw_qp *qhp = NULL; + struct t4_cqe uninitialized_var(cqe), *rd_cqe; + struct t4_wq *wq; + u32 credit = 0; + u8 cqe_flushed; + u64 cookie = 0; + int ret; + + ret = t4_next_cqe(&chp->cq, &rd_cqe); + + if (ret) + return ret; + + qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe)); + if (!qhp) + wq = NULL; + else { + spin_lock(&qhp->lock); + wq = &(qhp->wq); + } + ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit); + if (ret) + goto out; + + wc->wr_id = cookie; + wc->qp = &qhp->ibqp; + wc->vendor_err = CQE_STATUS(&cqe); + wc->wc_flags = 0; + + pr_debug("qpid 0x%x type %d opcode %d status 0x%x len %u wrid hi 0x%x lo 0x%x cookie 0x%llx\n", + CQE_QPID(&cqe), + CQE_TYPE(&cqe), CQE_OPCODE(&cqe), + CQE_STATUS(&cqe), CQE_LEN(&cqe), + CQE_WRID_HI(&cqe), CQE_WRID_LOW(&cqe), + (unsigned long long)cookie); + + if (CQE_TYPE(&cqe) == 0) { + if (!CQE_STATUS(&cqe)) + wc->byte_len = CQE_LEN(&cqe); + else + wc->byte_len = 0; + wc->opcode = IB_WC_RECV; + if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV || + CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { + wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + c4iw_invalidate_mr(qhp->rhp, wc->ex.invalidate_rkey); + } + } else { + switch (CQE_OPCODE(&cqe)) { + case FW_RI_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case FW_RI_READ_REQ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = CQE_LEN(&cqe); + break; + case FW_RI_SEND_WITH_INV: + case FW_RI_SEND_WITH_SE_INV: + wc->opcode = IB_WC_SEND; + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + break; + case FW_RI_SEND: + case FW_RI_SEND_WITH_SE: + wc->opcode = IB_WC_SEND; + break; + + case FW_RI_LOCAL_INV: + wc->opcode = IB_WC_LOCAL_INV; + break; + case FW_RI_FAST_REGISTER: + wc->opcode = IB_WC_REG_MR; + + /* Invalidate the MR if the fastreg failed */ + if (CQE_STATUS(&cqe) != T4_ERR_SUCCESS) + c4iw_invalidate_mr(qhp->rhp, + CQE_WRID_FR_STAG(&cqe)); + break; + default: + pr_err("Unexpected opcode %d in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(&cqe), CQE_QPID(&cqe)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IB_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(&cqe)) { + case T4_ERR_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case T4_ERR_STAG: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case T4_ERR_PDID: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case T4_ERR_QPID: + case T4_ERR_ACCESS: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case T4_ERR_WRAP: + wc->status = IB_WC_GENERAL_ERR; + break; + case T4_ERR_BOUND: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case T4_ERR_INVALIDATE_SHARED_MR: + case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IB_WC_MW_BIND_ERR; + break; + case T4_ERR_CRC: + case T4_ERR_MARKER: + case T4_ERR_PDU_LEN_ERR: + case T4_ERR_OUT_OF_RQE: + case T4_ERR_DDP_VERSION: + case T4_ERR_RDMA_VERSION: + case T4_ERR_DDP_QUEUE_NUM: + case T4_ERR_MSN: + case T4_ERR_TBIT: + case T4_ERR_MO: + case T4_ERR_MSN_RANGE: + case T4_ERR_IRD_OVERFLOW: + 
case T4_ERR_OPCODE: + case T4_ERR_INTERNAL_ERR: + wc->status = IB_WC_FATAL_ERR; + break; + case T4_ERR_SWFLUSH: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + default: + pr_err("Unexpected cqe_status 0x%x for QPID=0x%0x\n", + CQE_STATUS(&cqe), CQE_QPID(&cqe)); + wc->status = IB_WC_FATAL_ERR; + } + } +out: + if (wq) + spin_unlock(&qhp->lock); + return ret; +} + +int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct c4iw_cq *chp; + unsigned long flags; + int npolled; + int err = 0; + + chp = to_c4iw_cq(ibcq); + + spin_lock_irqsave(&chp->lock, flags); + for (npolled = 0; npolled < num_entries; ++npolled) { + do { + err = c4iw_poll_cq_one(chp, wc + npolled); + } while (err == -EAGAIN); + if (err) + break; + } + spin_unlock_irqrestore(&chp->lock, flags); + return !err || err == -ENODATA ? npolled : err; +} + +int c4iw_destroy_cq(struct ib_cq *ib_cq) +{ + struct c4iw_cq *chp; + struct c4iw_ucontext *ucontext; + + pr_debug("ib_cq %p\n", ib_cq); + chp = to_c4iw_cq(ib_cq); + + remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); + atomic_dec(&chp->refcnt); + wait_event(chp->wait, !atomic_read(&chp->refcnt)); + + ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context) + : NULL; + destroy_cq(&chp->rhp->rdev, &chp->cq, + ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx, + chp->destroy_skb, chp->wr_waitp); + c4iw_put_wr_wait(chp->wr_waitp); + kfree(chp); + return 0; +} + +struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ib_context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + int vector = attr->comp_vector; + struct c4iw_dev *rhp; + struct c4iw_cq *chp; + struct c4iw_create_cq_resp uresp; + struct c4iw_ucontext *ucontext = NULL; + int ret, wr_len; + size_t memsize, hwentries; + struct c4iw_mm_entry *mm, *mm2; + + pr_debug("ib_dev %p entries %d\n", ibdev, entries); + if (attr->flags) + return ERR_PTR(-EINVAL); + + rhp = to_c4iw_dev(ibdev); + + if (vector >= rhp->rdev.lldi.nciq) + return ERR_PTR(-EINVAL); + + chp = kzalloc(sizeof(*chp), GFP_KERNEL); + if (!chp) + return ERR_PTR(-ENOMEM); + chp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL); + if (!chp->wr_waitp) { + ret = -ENOMEM; + goto err_free_chp; + } + c4iw_init_wr_wait(chp->wr_waitp); + + wr_len = sizeof(struct fw_ri_res_wr) + sizeof(struct fw_ri_res); + chp->destroy_skb = alloc_skb(wr_len, GFP_KERNEL); + if (!chp->destroy_skb) { + ret = -ENOMEM; + goto err_free_wr_wait; + } + + if (ib_context) + ucontext = to_c4iw_ucontext(ib_context); + + /* account for the status page. */ + entries++; + + /* IQ needs one extra entry to differentiate full vs empty. */ + entries++; + + /* + * entries must be multiple of 16 for HW. + */ + entries = roundup(entries, 16); + + /* + * Make actual HW queue 2x to avoid cdix_inc overflows. + */ + hwentries = min(entries * 2, rhp->rdev.hw_queue.t4_max_iq_size); + + /* + * Make HW queue at least 64 entries so GTS updates aren't too + * frequent. + */ + if (hwentries < 64) + hwentries = 64; + + memsize = hwentries * sizeof *chp->cq.queue; + + /* + * memsize must be a multiple of the page size if its a user cq. + */ + if (ucontext) + memsize = roundup(memsize, PAGE_SIZE); + chp->cq.size = hwentries; + chp->cq.memsize = memsize; + chp->cq.vector = vector; + + ret = create_cq(&rhp->rdev, &chp->cq, + ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx, + chp->wr_waitp); + if (ret) + goto err_free_skb; + + chp->rhp = rhp; + chp->cq.size--; /* status page */ + chp->ibcq.cqe = entries - 2; + spin_lock_init(&chp->lock); + spin_lock_init(&chp->comp_handler_lock); + atomic_set(&chp->refcnt, 1); + init_waitqueue_head(&chp->wait); + ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); + if (ret) + goto err_destroy_cq; + + if (ucontext) { + ret = -ENOMEM; + mm = kmalloc(sizeof *mm, GFP_KERNEL); + if (!mm) + goto err_remove_handle; + mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + if (!mm2) + goto err_free_mm; + + uresp.qid_mask = rhp->rdev.cqmask; + uresp.cqid = chp->cq.cqid; + uresp.size = chp->cq.size; + uresp.memsize = chp->cq.memsize; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); + if (ret) + goto err_free_mm2; + + mm->key = uresp.key; + mm->addr = virt_to_phys(chp->cq.queue); + mm->len = chp->cq.memsize; + insert_mmap(ucontext, mm); + + mm2->key = uresp.gts_key; + mm2->addr = chp->cq.bar2_pa; + mm2->len = PAGE_SIZE; + insert_mmap(ucontext, mm2); + } + pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx\n", + chp->cq.cqid, chp, chp->cq.size, + chp->cq.memsize, (unsigned long long)chp->cq.dma_addr); + return &chp->ibcq; +err_free_mm2: + kfree(mm2); +err_free_mm: + kfree(mm); +err_remove_handle: + remove_handle(rhp, &rhp->cqidr, chp->cq.cqid); +err_destroy_cq: + destroy_cq(&chp->rhp->rdev, &chp->cq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx, + chp->destroy_skb, chp->wr_waitp); +err_free_skb: + kfree_skb(chp->destroy_skb); +err_free_wr_wait: + c4iw_put_wr_wait(chp->wr_waitp); +err_free_chp: + kfree(chp); + return ERR_PTR(ret); +} + +int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ + return -ENOSYS; +} + +int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct c4iw_cq *chp; + int ret = 0; + unsigned long flag; + + chp = to_c4iw_cq(ibcq); + spin_lock_irqsave(&chp->lock, flag); + t4_arm_cq(&chp->cq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED); + if (flags & IB_CQ_REPORT_MISSED_EVENTS) + ret = t4_cq_notempty(&chp->cq); + spin_unlock_irqrestore(&chp->lock, flag); + return ret; +} diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c new file mode 100644 index 0000000..44161ca --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -0,0 +1,1578 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <linux/vmalloc.h>
+#include <linux/math64.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "iw_cxgb4.h"
+
+#define DRV_VERSION "0.1"
+
+MODULE_AUTHOR("Steve Wise");
+MODULE_DESCRIPTION("Chelsio T4/T5 RDMA Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static int allow_db_fc_on_t5;
+module_param(allow_db_fc_on_t5, int, 0644);
+MODULE_PARM_DESC(allow_db_fc_on_t5,
+		 "Allow DB Flow Control on T5 (default = 0)");
+
+static int allow_db_coalescing_on_t5;
+module_param(allow_db_coalescing_on_t5, int, 0644);
+MODULE_PARM_DESC(allow_db_coalescing_on_t5,
+		 "Allow DB Coalescing on T5 (default = 0)");
+
+int c4iw_wr_log = 0;
+module_param(c4iw_wr_log, int, 0444);
+MODULE_PARM_DESC(c4iw_wr_log, "Enables logging of work request timing data.");
+
+static int c4iw_wr_log_size_order = 12;
+module_param(c4iw_wr_log_size_order, int, 0444);
+MODULE_PARM_DESC(c4iw_wr_log_size_order,
+		 "Number of entries (log2) in the work request timing log.");
+
+static LIST_HEAD(uld_ctx_list);
+static DEFINE_MUTEX(dev_mutex);
+static struct workqueue_struct *reg_workq;
+
+#define DB_FC_RESUME_SIZE 64
+#define DB_FC_RESUME_DELAY 1
+#define DB_FC_DRAIN_THRESH 0
+
+static struct dentry *c4iw_debugfs_root;
+
+struct c4iw_debugfs_data {
+	struct c4iw_dev *devp;
+	char *buf;
+	int bufsize;
+	int pos;
+};
+
+static int count_idrs(int id, void *p, void *data)
+{
+	int *countp = data;
+
+	*countp = *countp + 1;
+	return 0;
+}
+
+static ssize_t debugfs_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct c4iw_debugfs_data *d = file->private_data;
+
+	return simple_read_from_buffer(buf, count, ppos, d->buf, d->pos);
+}
+
+void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe)
+{
+	struct wr_log_entry le;
+	int idx;
+
+	if (!wq->rdev->wr_log)
+		return;
+
+	idx = (atomic_inc_return(&wq->rdev->wr_log_idx) - 1) &
+		(wq->rdev->wr_log_size - 1);
+	le.poll_sge_ts = cxgb4_read_sge_timestamp(wq->rdev->lldi.ports[0]);
+	le.poll_host_time = ktime_get();
+	le.valid = 1;
+	le.cqe_sge_ts = CQE_TS(cqe);
+	if (SQ_TYPE(cqe)) {
+		le.qid = wq->sq.qid;
+		le.opcode = CQE_OPCODE(cqe);
+		le.post_host_time = wq->sq.sw_sq[wq->sq.cidx].host_time;
+		le.post_sge_ts = wq->sq.sw_sq[wq->sq.cidx].sge_ts;
+		le.wr_id = CQE_WRID_SQ_IDX(cqe);
+	} else {
+		le.qid = wq->rq.qid;
+		le.opcode = FW_RI_RECEIVE;
+		le.post_host_time = wq->rq.sw_rq[wq->rq.cidx].host_time;
+		le.post_sge_ts = wq->rq.sw_rq[wq->rq.cidx].sge_ts;
+		le.wr_id = CQE_WRID_MSN(cqe);
+	}
+	wq->rdev->wr_log[idx] = le;
+}
+
+static int wr_log_show(struct seq_file *seq, void *v)
+{
+	struct c4iw_dev *dev = seq->private;
+	ktime_t prev_time;
+	struct wr_log_entry *lep;
+	int prev_time_set = 0;
+	int idx, end;
+
+#define ts2ns(ts) div64_u64((ts) * dev->rdev.lldi.cclk_ps, 1000)
+
+	idx = atomic_read(&dev->rdev.wr_log_idx) &
+		(dev->rdev.wr_log_size - 1);
+	end = idx - 1;
+	if (end < 0)
+		end = dev->rdev.wr_log_size - 1;
+	lep = &dev->rdev.wr_log[idx];
+	while (idx != end) {
+		if (lep->valid) {
+			if (!prev_time_set) {
+				prev_time_set = 1;
+				prev_time = lep->poll_host_time;
+			}
+			seq_printf(seq, "%04u: 
nsec %llu qid %u opcode " + "%u %s 0x%x host_wr_delta nsec %llu " + "post_sge_ts 0x%llx cqe_sge_ts 0x%llx " + "poll_sge_ts 0x%llx post_poll_delta_ns %llu " + "cqe_poll_delta_ns %llu\n", + idx, + ktime_to_ns(ktime_sub(lep->poll_host_time, + prev_time)), + lep->qid, lep->opcode, + lep->opcode == FW_RI_RECEIVE ? + "msn" : "wrid", + lep->wr_id, + ktime_to_ns(ktime_sub(lep->poll_host_time, + lep->post_host_time)), + lep->post_sge_ts, lep->cqe_sge_ts, + lep->poll_sge_ts, + ts2ns(lep->poll_sge_ts - lep->post_sge_ts), + ts2ns(lep->poll_sge_ts - lep->cqe_sge_ts)); + prev_time = lep->poll_host_time; + } + idx++; + if (idx > (dev->rdev.wr_log_size - 1)) + idx = 0; + lep = &dev->rdev.wr_log[idx]; + } +#undef ts2ns + return 0; +} + +static int wr_log_open(struct inode *inode, struct file *file) +{ + return single_open(file, wr_log_show, inode->i_private); +} + +static ssize_t wr_log_clear(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct c4iw_dev *dev = ((struct seq_file *)file->private_data)->private; + int i; + + if (dev->rdev.wr_log) + for (i = 0; i < dev->rdev.wr_log_size; i++) + dev->rdev.wr_log[i].valid = 0; + return count; +} + +static const struct file_operations wr_log_debugfs_fops = { + .owner = THIS_MODULE, + .open = wr_log_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek, + .write = wr_log_clear, +}; + +static struct sockaddr_in zero_sin = { + .sin_family = AF_INET, +}; + +static struct sockaddr_in6 zero_sin6 = { + .sin6_family = AF_INET6, +}; + +static void set_ep_sin_addrs(struct c4iw_ep *ep, + struct sockaddr_in **lsin, + struct sockaddr_in **rsin, + struct sockaddr_in **m_lsin, + struct sockaddr_in **m_rsin) +{ + struct iw_cm_id *id = ep->com.cm_id; + + *m_lsin = (struct sockaddr_in *)&ep->com.local_addr; + *m_rsin = (struct sockaddr_in *)&ep->com.remote_addr; + if (id) { + *lsin = (struct sockaddr_in *)&id->local_addr; + *rsin = (struct sockaddr_in *)&id->remote_addr; + } else { + *lsin = &zero_sin; + *rsin = &zero_sin; + } +} + +static void set_ep_sin6_addrs(struct c4iw_ep *ep, + struct sockaddr_in6 **lsin6, + struct sockaddr_in6 **rsin6, + struct sockaddr_in6 **m_lsin6, + struct sockaddr_in6 **m_rsin6) +{ + struct iw_cm_id *id = ep->com.cm_id; + + *m_lsin6 = (struct sockaddr_in6 *)&ep->com.local_addr; + *m_rsin6 = (struct sockaddr_in6 *)&ep->com.remote_addr; + if (id) { + *lsin6 = (struct sockaddr_in6 *)&id->local_addr; + *rsin6 = (struct sockaddr_in6 *)&id->remote_addr; + } else { + *lsin6 = &zero_sin6; + *rsin6 = &zero_sin6; + } +} + +static int dump_qp(int id, void *p, void *data) +{ + struct c4iw_qp *qp = p; + struct c4iw_debugfs_data *qpd = data; + int space; + int cc; + + if (id != qp->wq.sq.qid) + return 0; + + space = qpd->bufsize - qpd->pos - 1; + if (space == 0) + return 1; + + if (qp->ep) { + struct c4iw_ep *ep = qp->ep; + + if (ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin; + struct sockaddr_in *rsin; + struct sockaddr_in *m_lsin; + struct sockaddr_in *m_rsin; + + set_ep_sin_addrs(ep, &lsin, &rsin, &m_lsin, &m_rsin); + cc = snprintf(qpd->buf + qpd->pos, space, + "rc qp sq id %u rq id %u state %u " + "onchip %u ep tid %u state %u " + "%pI4:%u/%u->%pI4:%u/%u\n", + qp->wq.sq.qid, qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP, + ep->hwtid, (int)ep->com.state, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(m_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(m_rsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6; + struct sockaddr_in6 *rsin6; + 
struct sockaddr_in6 *m_lsin6; + struct sockaddr_in6 *m_rsin6; + + set_ep_sin6_addrs(ep, &lsin6, &rsin6, &m_lsin6, + &m_rsin6); + cc = snprintf(qpd->buf + qpd->pos, space, + "rc qp sq id %u rq id %u state %u " + "onchip %u ep tid %u state %u " + "%pI6:%u/%u->%pI6:%u/%u\n", + qp->wq.sq.qid, qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP, + ep->hwtid, (int)ep->com.state, + &lsin6->sin6_addr, + ntohs(lsin6->sin6_port), + ntohs(m_lsin6->sin6_port), + &rsin6->sin6_addr, + ntohs(rsin6->sin6_port), + ntohs(m_rsin6->sin6_port)); + } + } else + cc = snprintf(qpd->buf + qpd->pos, space, + "qp sq id %u rq id %u state %u onchip %u\n", + qp->wq.sq.qid, qp->wq.rq.qid, + (int)qp->attr.state, + qp->wq.sq.flags & T4_SQ_ONCHIP); + if (cc < space) + qpd->pos += cc; + return 0; +} + +static int qp_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *qpd = file->private_data; + if (!qpd) { + pr_info("%s null qpd?\n", __func__); + return 0; + } + vfree(qpd->buf); + kfree(qpd); + return 0; +} + +static int qp_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *qpd; + int count = 1; + + qpd = kmalloc(sizeof *qpd, GFP_KERNEL); + if (!qpd) + return -ENOMEM; + + qpd->devp = inode->i_private; + qpd->pos = 0; + + spin_lock_irq(&qpd->devp->lock); + idr_for_each(&qpd->devp->qpidr, count_idrs, &count); + spin_unlock_irq(&qpd->devp->lock); + + qpd->bufsize = count * 180; + qpd->buf = vmalloc(qpd->bufsize); + if (!qpd->buf) { + kfree(qpd); + return -ENOMEM; + } + + spin_lock_irq(&qpd->devp->lock); + idr_for_each(&qpd->devp->qpidr, dump_qp, qpd); + spin_unlock_irq(&qpd->devp->lock); + + qpd->buf[qpd->pos++] = 0; + file->private_data = qpd; + return 0; +} + +static const struct file_operations qp_debugfs_fops = { + .owner = THIS_MODULE, + .open = qp_open, + .release = qp_release, + .read = debugfs_read, + .llseek = default_llseek, +}; + +static int dump_stag(int id, void *p, void *data) +{ + struct c4iw_debugfs_data *stagd = data; + int space; + int cc; + struct fw_ri_tpte tpte; + int ret; + + space = stagd->bufsize - stagd->pos - 1; + if (space == 0) + return 1; + + ret = cxgb4_read_tpte(stagd->devp->rdev.lldi.ports[0], (u32)id<<8, + (__be32 *)&tpte); + if (ret) { + dev_err(&stagd->devp->rdev.lldi.pdev->dev, + "%s cxgb4_read_tpte err %d\n", __func__, ret); + return ret; + } + cc = snprintf(stagd->buf + stagd->pos, space, + "stag: idx 0x%x valid %d key 0x%x state %d pdid %d " + "perm 0x%x ps %d len 0x%llx va 0x%llx\n", + (u32)id<<8, + FW_RI_TPTE_VALID_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_STAGKEY_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_STAGSTATE_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_PDID_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_PERM_G(ntohl(tpte.locread_to_qpid)), + FW_RI_TPTE_PS_G(ntohl(tpte.locread_to_qpid)), + ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo), + ((u64)ntohl(tpte.va_hi) << 32) | ntohl(tpte.va_lo_fbo)); + if (cc < space) + stagd->pos += cc; + return 0; +} + +static int stag_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *stagd = file->private_data; + if (!stagd) { + pr_info("%s null stagd?\n", __func__); + return 0; + } + vfree(stagd->buf); + kfree(stagd); + return 0; +} + +static int stag_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *stagd; + int ret = 0; + int count = 1; + + stagd = kmalloc(sizeof *stagd, GFP_KERNEL); + if (!stagd) { + ret = -ENOMEM; + goto out; + } + stagd->devp = inode->i_private; + stagd->pos = 0; + + spin_lock_irq(&stagd->devp->lock); 
+ idr_for_each(&stagd->devp->mmidr, count_idrs, &count); + spin_unlock_irq(&stagd->devp->lock); + + stagd->bufsize = count * 256; + stagd->buf = vmalloc(stagd->bufsize); + if (!stagd->buf) { + ret = -ENOMEM; + goto err1; + } + + spin_lock_irq(&stagd->devp->lock); + idr_for_each(&stagd->devp->mmidr, dump_stag, stagd); + spin_unlock_irq(&stagd->devp->lock); + + stagd->buf[stagd->pos++] = 0; + file->private_data = stagd; + goto out; +err1: + kfree(stagd); +out: + return ret; +} + +static const struct file_operations stag_debugfs_fops = { + .owner = THIS_MODULE, + .open = stag_open, + .release = stag_release, + .read = debugfs_read, + .llseek = default_llseek, +}; + +static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY", "STOPPED"}; + +static int stats_show(struct seq_file *seq, void *v) +{ + struct c4iw_dev *dev = seq->private; + + seq_printf(seq, " Object: %10s %10s %10s %10s\n", "Total", "Current", + "Max", "Fail"); + seq_printf(seq, " PDID: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.pd.total, dev->rdev.stats.pd.cur, + dev->rdev.stats.pd.max, dev->rdev.stats.pd.fail); + seq_printf(seq, " QID: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.qid.total, dev->rdev.stats.qid.cur, + dev->rdev.stats.qid.max, dev->rdev.stats.qid.fail); + seq_printf(seq, " TPTMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.stag.total, dev->rdev.stats.stag.cur, + dev->rdev.stats.stag.max, dev->rdev.stats.stag.fail); + seq_printf(seq, " PBLMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.pbl.total, dev->rdev.stats.pbl.cur, + dev->rdev.stats.pbl.max, dev->rdev.stats.pbl.fail); + seq_printf(seq, " RQTMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.rqt.total, dev->rdev.stats.rqt.cur, + dev->rdev.stats.rqt.max, dev->rdev.stats.rqt.fail); + seq_printf(seq, " OCQPMEM: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.ocqp.total, dev->rdev.stats.ocqp.cur, + dev->rdev.stats.ocqp.max, dev->rdev.stats.ocqp.fail); + seq_printf(seq, " DB FULL: %10llu\n", dev->rdev.stats.db_full); + seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty); + seq_printf(seq, " DB DROP: %10llu\n", dev->rdev.stats.db_drop); + seq_printf(seq, " DB State: %s Transitions %llu FC Interruptions %llu\n", + db_state_str[dev->db_state], + dev->rdev.stats.db_state_transitions, + dev->rdev.stats.db_fc_interruptions); + seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full); + seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n", + dev->rdev.stats.act_ofld_conn_fails); + seq_printf(seq, "PAS_OFLD_CONN_FAILS: %10llu\n", + dev->rdev.stats.pas_ofld_conn_fails); + seq_printf(seq, "NEG_ADV_RCVD: %10llu\n", dev->rdev.stats.neg_adv); + seq_printf(seq, "AVAILABLE IRD: %10u\n", dev->avail_ird); + return 0; +} + +static int stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, stats_show, inode->i_private); +} + +static ssize_t stats_clear(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct c4iw_dev *dev = ((struct seq_file *)file->private_data)->private; + + mutex_lock(&dev->rdev.stats.lock); + dev->rdev.stats.pd.max = 0; + dev->rdev.stats.pd.fail = 0; + dev->rdev.stats.qid.max = 0; + dev->rdev.stats.qid.fail = 0; + dev->rdev.stats.stag.max = 0; + dev->rdev.stats.stag.fail = 0; + dev->rdev.stats.pbl.max = 0; + dev->rdev.stats.pbl.fail = 0; + dev->rdev.stats.rqt.max = 0; + dev->rdev.stats.rqt.fail = 0; + dev->rdev.stats.ocqp.max = 0; + dev->rdev.stats.ocqp.fail = 0; + dev->rdev.stats.db_full = 0; + dev->rdev.stats.db_empty = 0; + dev->rdev.stats.db_drop = 
0; + dev->rdev.stats.db_state_transitions = 0; + dev->rdev.stats.tcam_full = 0; + dev->rdev.stats.act_ofld_conn_fails = 0; + dev->rdev.stats.pas_ofld_conn_fails = 0; + mutex_unlock(&dev->rdev.stats.lock); + return count; +} + +static const struct file_operations stats_debugfs_fops = { + .owner = THIS_MODULE, + .open = stats_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek, + .write = stats_clear, +}; + +static int dump_ep(int id, void *p, void *data) +{ + struct c4iw_ep *ep = p; + struct c4iw_debugfs_data *epd = data; + int space; + int cc; + + space = epd->bufsize - epd->pos - 1; + if (space == 0) + return 1; + + if (ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin; + struct sockaddr_in *rsin; + struct sockaddr_in *m_lsin; + struct sockaddr_in *m_rsin; + + set_ep_sin_addrs(ep, &lsin, &rsin, &m_lsin, &m_rsin); + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p qp %p state %d flags 0x%lx " + "history 0x%lx hwtid %d atid %d " + "conn_na %u abort_na %u " + "%pI4:%d/%d <-> %pI4:%d/%d\n", + ep, ep->com.cm_id, ep->com.qp, + (int)ep->com.state, ep->com.flags, + ep->com.history, ep->hwtid, ep->atid, + ep->stats.connect_neg_adv, + ep->stats.abort_neg_adv, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(m_lsin->sin_port), + &rsin->sin_addr, ntohs(rsin->sin_port), + ntohs(m_rsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6; + struct sockaddr_in6 *rsin6; + struct sockaddr_in6 *m_lsin6; + struct sockaddr_in6 *m_rsin6; + + set_ep_sin6_addrs(ep, &lsin6, &rsin6, &m_lsin6, &m_rsin6); + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p qp %p state %d flags 0x%lx " + "history 0x%lx hwtid %d atid %d " + "conn_na %u abort_na %u " + "%pI6:%d/%d <-> %pI6:%d/%d\n", + ep, ep->com.cm_id, ep->com.qp, + (int)ep->com.state, ep->com.flags, + ep->com.history, ep->hwtid, ep->atid, + ep->stats.connect_neg_adv, + ep->stats.abort_neg_adv, + &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(m_lsin6->sin6_port), + &rsin6->sin6_addr, ntohs(rsin6->sin6_port), + ntohs(m_rsin6->sin6_port)); + } + if (cc < space) + epd->pos += cc; + return 0; +} + +static int dump_listen_ep(int id, void *p, void *data) +{ + struct c4iw_listen_ep *ep = p; + struct c4iw_debugfs_data *epd = data; + int space; + int cc; + + space = epd->bufsize - epd->pos - 1; + if (space == 0) + return 1; + + if (ep->com.local_addr.ss_family == AF_INET) { + struct sockaddr_in *lsin = (struct sockaddr_in *) + &ep->com.cm_id->local_addr; + struct sockaddr_in *m_lsin = (struct sockaddr_in *) + &ep->com.cm_id->m_local_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p state %d flags 0x%lx stid %d " + "backlog %d %pI4:%d/%d\n", + ep, ep->com.cm_id, (int)ep->com.state, + ep->com.flags, ep->stid, ep->backlog, + &lsin->sin_addr, ntohs(lsin->sin_port), + ntohs(m_lsin->sin_port)); + } else { + struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) + &ep->com.cm_id->local_addr; + struct sockaddr_in6 *m_lsin6 = (struct sockaddr_in6 *) + &ep->com.cm_id->m_local_addr; + + cc = snprintf(epd->buf + epd->pos, space, + "ep %p cm_id %p state %d flags 0x%lx stid %d " + "backlog %d %pI6:%d/%d\n", + ep, ep->com.cm_id, (int)ep->com.state, + ep->com.flags, ep->stid, ep->backlog, + &lsin6->sin6_addr, ntohs(lsin6->sin6_port), + ntohs(m_lsin6->sin6_port)); + } + if (cc < space) + epd->pos += cc; + return 0; +} + +static int ep_release(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *epd = file->private_data; + if (!epd) { + pr_info("%s null qpd?\n", __func__); + return 0; + 
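
/*
 * User-space sketch of the endpoint formatting done by dump_ep() above:
 * the kernel relies on the %pI4/%pI6 printk extensions plus ntohs() on
 * the network-order ports, while outside the kernel the same line can be
 * built with inet_ntop().  format_ep() and its sample addresses are
 * illustrative; the IPv6 case is analogous with AF_INET6 and
 * INET6_ADDRSTRLEN.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

static void format_ep(const struct sockaddr_in *lsin,
		      const struct sockaddr_in *rsin)
{
	char laddr[INET_ADDRSTRLEN], raddr[INET_ADDRSTRLEN];

	inet_ntop(AF_INET, &lsin->sin_addr, laddr, sizeof(laddr));
	inet_ntop(AF_INET, &rsin->sin_addr, raddr, sizeof(raddr));
	printf("%s:%d <-> %s:%d\n",
	       laddr, ntohs(lsin->sin_port),
	       raddr, ntohs(rsin->sin_port));
}

int main(void)
{
	struct sockaddr_in l = { .sin_family = AF_INET,
				 .sin_port = htons(4791) };
	struct sockaddr_in r = { .sin_family = AF_INET,
				 .sin_port = htons(1024) };

	inet_pton(AF_INET, "192.168.0.1", &l.sin_addr);
	inet_pton(AF_INET, "192.168.0.2", &r.sin_addr);
	format_ep(&l, &r);
	return 0;
}
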
} + vfree(epd->buf); + kfree(epd); + return 0; +} + +static int ep_open(struct inode *inode, struct file *file) +{ + struct c4iw_debugfs_data *epd; + int ret = 0; + int count = 1; + + epd = kmalloc(sizeof(*epd), GFP_KERNEL); + if (!epd) { + ret = -ENOMEM; + goto out; + } + epd->devp = inode->i_private; + epd->pos = 0; + + spin_lock_irq(&epd->devp->lock); + idr_for_each(&epd->devp->hwtid_idr, count_idrs, &count); + idr_for_each(&epd->devp->atid_idr, count_idrs, &count); + idr_for_each(&epd->devp->stid_idr, count_idrs, &count); + spin_unlock_irq(&epd->devp->lock); + + epd->bufsize = count * 240; + epd->buf = vmalloc(epd->bufsize); + if (!epd->buf) { + ret = -ENOMEM; + goto err1; + } + + spin_lock_irq(&epd->devp->lock); + idr_for_each(&epd->devp->hwtid_idr, dump_ep, epd); + idr_for_each(&epd->devp->atid_idr, dump_ep, epd); + idr_for_each(&epd->devp->stid_idr, dump_listen_ep, epd); + spin_unlock_irq(&epd->devp->lock); + + file->private_data = epd; + goto out; +err1: + kfree(epd); +out: + return ret; +} + +static const struct file_operations ep_debugfs_fops = { + .owner = THIS_MODULE, + .open = ep_open, + .release = ep_release, + .read = debugfs_read, +}; + +static int setup_debugfs(struct c4iw_dev *devp) +{ + if (!devp->debugfs_root) + return -1; + + debugfs_create_file_size("qps", S_IWUSR, devp->debugfs_root, + (void *)devp, &qp_debugfs_fops, 4096); + + debugfs_create_file_size("stags", S_IWUSR, devp->debugfs_root, + (void *)devp, &stag_debugfs_fops, 4096); + + debugfs_create_file_size("stats", S_IWUSR, devp->debugfs_root, + (void *)devp, &stats_debugfs_fops, 4096); + + debugfs_create_file_size("eps", S_IWUSR, devp->debugfs_root, + (void *)devp, &ep_debugfs_fops, 4096); + + if (c4iw_wr_log) + debugfs_create_file_size("wr_log", S_IWUSR, devp->debugfs_root, + (void *)devp, &wr_log_debugfs_fops, 4096); + return 0; +} + +void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx) +{ + struct list_head *pos, *nxt; + struct c4iw_qid_list *entry; + + mutex_lock(&uctx->lock); + list_for_each_safe(pos, nxt, &uctx->qpids) { + entry = list_entry(pos, struct c4iw_qid_list, entry); + list_del_init(&entry->entry); + if (!(entry->qid & rdev->qpmask)) { + c4iw_put_resource(&rdev->resource.qid_table, + entry->qid); + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.cur -= rdev->qpmask + 1; + mutex_unlock(&rdev->stats.lock); + } + kfree(entry); + } + + list_for_each_safe(pos, nxt, &uctx->cqids) { + entry = list_entry(pos, struct c4iw_qid_list, entry); + list_del_init(&entry->entry); + kfree(entry); + } + mutex_unlock(&uctx->lock); +} + +void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx) +{ + INIT_LIST_HEAD(&uctx->qpids); + INIT_LIST_HEAD(&uctx->cqids); + mutex_init(&uctx->lock); +} + +/* Caller takes care of locking if needed */ +static int c4iw_rdev_open(struct c4iw_rdev *rdev) +{ + int err; + + c4iw_init_dev_ucontext(rdev, &rdev->uctx); + + /* + * This implementation assumes udb_density == ucq_density! Eventually + * we might need to support this but for now fail the open. Also the + * cqid and qpid range must match for now. 
+ */ + if (rdev->lldi.udb_density != rdev->lldi.ucq_density) { + pr_err("%s: unsupported udb/ucq densities %u/%u\n", + pci_name(rdev->lldi.pdev), rdev->lldi.udb_density, + rdev->lldi.ucq_density); + return -EINVAL; + } + if (rdev->lldi.vr->qp.start != rdev->lldi.vr->cq.start || + rdev->lldi.vr->qp.size != rdev->lldi.vr->cq.size) { + pr_err("%s: unsupported qp and cq id ranges qp start %u size %u cq start %u size %u\n", + pci_name(rdev->lldi.pdev), rdev->lldi.vr->qp.start, + rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.size, + rdev->lldi.vr->cq.size); + return -EINVAL; + } + + rdev->qpmask = rdev->lldi.udb_density - 1; + rdev->cqmask = rdev->lldi.ucq_density - 1; + pr_debug("dev %s stag start 0x%0x size 0x%0x num stags %d pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x qp qid start %u size %u cq qid start %u size %u\n", + pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start, + rdev->lldi.vr->stag.size, c4iw_num_stags(rdev), + rdev->lldi.vr->pbl.start, + rdev->lldi.vr->pbl.size, rdev->lldi.vr->rq.start, + rdev->lldi.vr->rq.size, + rdev->lldi.vr->qp.start, + rdev->lldi.vr->qp.size, + rdev->lldi.vr->cq.start, + rdev->lldi.vr->cq.size); + pr_debug("udb %pR db_reg %p gts_reg %p qpmask 0x%x cqmask 0x%x\n", + &rdev->lldi.pdev->resource[2], + rdev->lldi.db_reg, rdev->lldi.gts_reg, + rdev->qpmask, rdev->cqmask); + + if (c4iw_num_stags(rdev) == 0) + return -EINVAL; + + rdev->stats.pd.total = T4_MAX_NUM_PD; + rdev->stats.stag.total = rdev->lldi.vr->stag.size; + rdev->stats.pbl.total = rdev->lldi.vr->pbl.size; + rdev->stats.rqt.total = rdev->lldi.vr->rq.size; + rdev->stats.ocqp.total = rdev->lldi.vr->ocq.size; + rdev->stats.qid.total = rdev->lldi.vr->qp.size; + + err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD); + if (err) { + pr_err("error %d initializing resources\n", err); + return err; + } + err = c4iw_pblpool_create(rdev); + if (err) { + pr_err("error %d initializing pbl pool\n", err); + goto destroy_resource; + } + err = c4iw_rqtpool_create(rdev); + if (err) { + pr_err("error %d initializing rqt pool\n", err); + goto destroy_pblpool; + } + err = c4iw_ocqp_pool_create(rdev); + if (err) { + pr_err("error %d initializing ocqp pool\n", err); + goto destroy_rqtpool; + } + rdev->status_page = (struct t4_dev_status_page *) + __get_free_page(GFP_KERNEL); + if (!rdev->status_page) { + err = -ENOMEM; + goto destroy_ocqp_pool; + } + rdev->status_page->qp_start = rdev->lldi.vr->qp.start; + rdev->status_page->qp_size = rdev->lldi.vr->qp.size; + rdev->status_page->cq_start = rdev->lldi.vr->cq.start; + rdev->status_page->cq_size = rdev->lldi.vr->cq.size; + + if (c4iw_wr_log) { + rdev->wr_log = kzalloc((1 << c4iw_wr_log_size_order) * + sizeof(*rdev->wr_log), GFP_KERNEL); + if (rdev->wr_log) { + rdev->wr_log_size = 1 << c4iw_wr_log_size_order; + atomic_set(&rdev->wr_log_idx, 0); + } + } + + rdev->free_workq = create_singlethread_workqueue("iw_cxgb4_free"); + if (!rdev->free_workq) { + err = -ENOMEM; + goto err_free_status_page_and_wr_log; + } + + rdev->status_page->db_off = 0; + + init_completion(&rdev->rqt_compl); + init_completion(&rdev->pbl_compl); + kref_init(&rdev->rqt_kref); + kref_init(&rdev->pbl_kref); + + return 0; +err_free_status_page_and_wr_log: + if (c4iw_wr_log && rdev->wr_log) + kfree(rdev->wr_log); + free_page((unsigned long)rdev->status_page); +destroy_ocqp_pool: + c4iw_ocqp_pool_destroy(rdev); +destroy_rqtpool: + c4iw_rqtpool_destroy(rdev); +destroy_pblpool: + c4iw_pblpool_destroy(rdev); +destroy_resource: + c4iw_destroy_resource(&rdev->resource); + return err; +} + +static 
void c4iw_rdev_close(struct c4iw_rdev *rdev) +{ + kfree(rdev->wr_log); + c4iw_release_dev_ucontext(rdev, &rdev->uctx); + free_page((unsigned long)rdev->status_page); + c4iw_pblpool_destroy(rdev); + c4iw_rqtpool_destroy(rdev); + wait_for_completion(&rdev->pbl_compl); + wait_for_completion(&rdev->rqt_compl); + c4iw_ocqp_pool_destroy(rdev); + destroy_workqueue(rdev->free_workq); + c4iw_destroy_resource(&rdev->resource); +} + +void c4iw_dealloc(struct uld_ctx *ctx) +{ + c4iw_rdev_close(&ctx->dev->rdev); + WARN_ON_ONCE(!idr_is_empty(&ctx->dev->cqidr)); + idr_destroy(&ctx->dev->cqidr); + WARN_ON_ONCE(!idr_is_empty(&ctx->dev->qpidr)); + idr_destroy(&ctx->dev->qpidr); + WARN_ON_ONCE(!idr_is_empty(&ctx->dev->mmidr)); + idr_destroy(&ctx->dev->mmidr); + wait_event(ctx->dev->wait, idr_is_empty(&ctx->dev->hwtid_idr)); + idr_destroy(&ctx->dev->hwtid_idr); + idr_destroy(&ctx->dev->stid_idr); + idr_destroy(&ctx->dev->atid_idr); + if (ctx->dev->rdev.bar2_kva) + iounmap(ctx->dev->rdev.bar2_kva); + if (ctx->dev->rdev.oc_mw_kva) + iounmap(ctx->dev->rdev.oc_mw_kva); + ib_dealloc_device(&ctx->dev->ibdev); + ctx->dev = NULL; +} + +static void c4iw_remove(struct uld_ctx *ctx) +{ + pr_debug("c4iw_dev %p\n", ctx->dev); + c4iw_unregister_device(ctx->dev); + c4iw_dealloc(ctx); +} + +static int rdma_supported(const struct cxgb4_lld_info *infop) +{ + return infop->vr->stag.size > 0 && infop->vr->pbl.size > 0 && + infop->vr->rq.size > 0 && infop->vr->qp.size > 0 && + infop->vr->cq.size > 0; +} + +static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) +{ + struct c4iw_dev *devp; + int ret; + + if (!rdma_supported(infop)) { + pr_info("%s: RDMA not supported on this device\n", + pci_name(infop->pdev)); + return ERR_PTR(-ENOSYS); + } + if (!ocqp_supported(infop)) + pr_info("%s: On-Chip Queues not supported on this device\n", + pci_name(infop->pdev)); + + devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp)); + if (!devp) { + pr_err("Cannot allocate ib device\n"); + return ERR_PTR(-ENOMEM); + } + devp->rdev.lldi = *infop; + + /* init various hw-queue params based on lld info */ + pr_debug("Ing. padding boundary is %d, egrsstatuspagesize = %d\n", + devp->rdev.lldi.sge_ingpadboundary, + devp->rdev.lldi.sge_egrstatuspagesize); + + devp->rdev.hw_queue.t4_eq_status_entries = + devp->rdev.lldi.sge_egrstatuspagesize / 64; + devp->rdev.hw_queue.t4_max_eq_size = 65520; + devp->rdev.hw_queue.t4_max_iq_size = 65520; + devp->rdev.hw_queue.t4_max_rq_size = 8192 - + devp->rdev.hw_queue.t4_eq_status_entries - 1; + devp->rdev.hw_queue.t4_max_sq_size = + devp->rdev.hw_queue.t4_max_eq_size - + devp->rdev.hw_queue.t4_eq_status_entries - 1; + devp->rdev.hw_queue.t4_max_qp_depth = + devp->rdev.hw_queue.t4_max_rq_size; + devp->rdev.hw_queue.t4_max_cq_depth = + devp->rdev.hw_queue.t4_max_iq_size - 2; + devp->rdev.hw_queue.t4_stat_len = + devp->rdev.lldi.sge_egrstatuspagesize; + + /* + * For T5/T6 devices, we map all of BAR2 with WC. + * For T4 devices with onchip qp mem, we map only that part + * of BAR2 with WC. 
+ */ + devp->rdev.bar2_pa = pci_resource_start(devp->rdev.lldi.pdev, 2); + if (!is_t4(devp->rdev.lldi.adapter_type)) { + devp->rdev.bar2_kva = ioremap_wc(devp->rdev.bar2_pa, + pci_resource_len(devp->rdev.lldi.pdev, 2)); + if (!devp->rdev.bar2_kva) { + pr_err("Unable to ioremap BAR2\n"); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(-EINVAL); + } + } else if (ocqp_supported(infop)) { + devp->rdev.oc_mw_pa = + pci_resource_start(devp->rdev.lldi.pdev, 2) + + pci_resource_len(devp->rdev.lldi.pdev, 2) - + roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size); + devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, + devp->rdev.lldi.vr->ocq.size); + if (!devp->rdev.oc_mw_kva) { + pr_err("Unable to ioremap onchip mem\n"); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(-EINVAL); + } + } + + pr_debug("ocq memory: hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n", + devp->rdev.lldi.vr->ocq.start, devp->rdev.lldi.vr->ocq.size, + devp->rdev.oc_mw_pa, devp->rdev.oc_mw_kva); + + ret = c4iw_rdev_open(&devp->rdev); + if (ret) { + pr_err("Unable to open CXIO rdev err %d\n", ret); + ib_dealloc_device(&devp->ibdev); + return ERR_PTR(ret); + } + + idr_init(&devp->cqidr); + idr_init(&devp->qpidr); + idr_init(&devp->mmidr); + idr_init(&devp->hwtid_idr); + idr_init(&devp->stid_idr); + idr_init(&devp->atid_idr); + spin_lock_init(&devp->lock); + mutex_init(&devp->rdev.stats.lock); + mutex_init(&devp->db_mutex); + INIT_LIST_HEAD(&devp->db_fc_list); + init_waitqueue_head(&devp->wait); + devp->avail_ird = devp->rdev.lldi.max_ird_adapter; + + if (c4iw_debugfs_root) { + devp->debugfs_root = debugfs_create_dir( + pci_name(devp->rdev.lldi.pdev), + c4iw_debugfs_root); + setup_debugfs(devp); + } + + + return devp; +} + +static void *c4iw_uld_add(const struct cxgb4_lld_info *infop) +{ + struct uld_ctx *ctx; + static int vers_printed; + int i; + + if (!vers_printed++) + pr_info("Chelsio T4/T5 RDMA Driver - version %s\n", + DRV_VERSION); + + ctx = kzalloc(sizeof *ctx, GFP_KERNEL); + if (!ctx) { + ctx = ERR_PTR(-ENOMEM); + goto out; + } + ctx->lldi = *infop; + + pr_debug("found device %s nchan %u nrxq %u ntxq %u nports %u\n", + pci_name(ctx->lldi.pdev), + ctx->lldi.nchan, ctx->lldi.nrxq, + ctx->lldi.ntxq, ctx->lldi.nports); + + mutex_lock(&dev_mutex); + list_add_tail(&ctx->entry, &uld_ctx_list); + mutex_unlock(&dev_mutex); + + for (i = 0; i < ctx->lldi.nrxq; i++) + pr_debug("rxqid[%u] %u\n", i, ctx->lldi.rxq_ids[i]); +out: + return ctx; +} + +static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, + const __be64 *rsp, + u32 pktshift) +{ + struct sk_buff *skb; + + /* + * Allocate space for cpl_pass_accept_req which will be synthesized by + * driver. Once the driver synthesizes the request the skb will go + * through the regular cpl_pass_accept_req processing. + * The math here assumes sizeof cpl_pass_accept_req >= sizeof + * cpl_rx_pkt. 
+ */ + skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header) - pktshift, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + __skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header) - pktshift); + + /* + * This skb will contain: + * rss_header from the rspq descriptor (1 flit) + * cpl_rx_pkt struct from the rspq descriptor (2 flits) + * space for the difference between the size of an + * rx_pkt and pass_accept_req cpl (1 flit) + * the packet data from the gl + */ + skb_copy_to_linear_data(skb, rsp, sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header)); + skb_copy_to_linear_data_offset(skb, sizeof(struct rss_header) + + sizeof(struct cpl_pass_accept_req), + gl->va + pktshift, + gl->tot_len - pktshift); + return skb; +} + +static inline int recv_rx_pkt(struct c4iw_dev *dev, const struct pkt_gl *gl, + const __be64 *rsp) +{ + unsigned int opcode = *(u8 *)rsp; + struct sk_buff *skb; + + if (opcode != CPL_RX_PKT) + goto out; + + skb = copy_gl_to_skb_pkt(gl , rsp, dev->rdev.lldi.sge_pktshift); + if (skb == NULL) + goto out; + + if (c4iw_handlers[opcode] == NULL) { + pr_info("%s no handler opcode 0x%x...\n", __func__, opcode); + kfree_skb(skb); + goto out; + } + c4iw_handlers[opcode](dev, skb); + return 1; +out: + return 0; +} + +static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp, + const struct pkt_gl *gl) +{ + struct uld_ctx *ctx = handle; + struct c4iw_dev *dev = ctx->dev; + struct sk_buff *skb; + u8 opcode; + + if (gl == NULL) { + /* omit RSS and rsp_ctrl at end of descriptor */ + unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8; + + skb = alloc_skb(256, GFP_ATOMIC); + if (!skb) + goto nomem; + __skb_put(skb, len); + skb_copy_to_linear_data(skb, &rsp[1], len); + } else if (gl == CXGB4_MSG_AN) { + const struct rsp_ctrl *rc = (void *)rsp; + + u32 qid = be32_to_cpu(rc->pldbuflen_qid); + c4iw_ev_handler(dev, qid); + return 0; + } else if (unlikely(*(u8 *)rsp != *(u8 *)gl->va)) { + if (recv_rx_pkt(dev, gl, rsp)) + return 0; + + pr_info("%s: unexpected FL contents at %p, RSS %#llx, FL %#llx, len %u\n", + pci_name(ctx->lldi.pdev), gl->va, + be64_to_cpu(*rsp), + be64_to_cpu(*(__force __be64 *)gl->va), + gl->tot_len); + + return 0; + } else { + skb = cxgb4_pktgl_to_skb(gl, 128, 128); + if (unlikely(!skb)) + goto nomem; + } + + opcode = *(u8 *)rsp; + if (c4iw_handlers[opcode]) { + c4iw_handlers[opcode](dev, skb); + } else { + pr_info("%s no handler opcode 0x%x...\n", __func__, opcode); + kfree_skb(skb); + } + + return 0; +nomem: + return -1; +} + +static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state) +{ + struct uld_ctx *ctx = handle; + + pr_debug("new_state %u\n", new_state); + switch (new_state) { + case CXGB4_STATE_UP: + pr_info("%s: Up\n", pci_name(ctx->lldi.pdev)); + if (!ctx->dev) { + ctx->dev = c4iw_alloc(&ctx->lldi); + if (IS_ERR(ctx->dev)) { + pr_err("%s: initialization failed: %ld\n", + pci_name(ctx->lldi.pdev), + PTR_ERR(ctx->dev)); + ctx->dev = NULL; + break; + } + + INIT_WORK(&ctx->reg_work, c4iw_register_device); + queue_work(reg_workq, &ctx->reg_work); + } + break; + case CXGB4_STATE_DOWN: + pr_info("%s: Down\n", pci_name(ctx->lldi.pdev)); + if (ctx->dev) + c4iw_remove(ctx); + break; + case CXGB4_STATE_FATAL_ERROR: + case CXGB4_STATE_START_RECOVERY: + pr_info("%s: Fatal Error\n", pci_name(ctx->lldi.pdev)); + if (ctx->dev) { + struct ib_event event; + + ctx->dev->rdev.flags |= T4_FATAL_ERROR; + memset(&event, 0, sizeof event); + event.event = 
IB_EVENT_DEVICE_FATAL; + event.device = &ctx->dev->ibdev; + ib_dispatch_event(&event); + c4iw_remove(ctx); + } + break; + case CXGB4_STATE_DETACH: + pr_info("%s: Detach\n", pci_name(ctx->lldi.pdev)); + if (ctx->dev) + c4iw_remove(ctx); + break; + } + return 0; +} + +static int disable_qp_db(int id, void *p, void *data) +{ + struct c4iw_qp *qp = p; + + t4_disable_wq_db(&qp->wq); + return 0; +} + +static void stop_queues(struct uld_ctx *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->dev->lock, flags); + ctx->dev->rdev.stats.db_state_transitions++; + ctx->dev->db_state = STOPPED; + if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) + idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL); + else + ctx->dev->rdev.status_page->db_off = 1; + spin_unlock_irqrestore(&ctx->dev->lock, flags); +} + +static int enable_qp_db(int id, void *p, void *data) +{ + struct c4iw_qp *qp = p; + + t4_enable_wq_db(&qp->wq); + return 0; +} + +static void resume_rc_qp(struct c4iw_qp *qp) +{ + spin_lock(&qp->lock); + t4_ring_sq_db(&qp->wq, qp->wq.sq.wq_pidx_inc, NULL); + qp->wq.sq.wq_pidx_inc = 0; + t4_ring_rq_db(&qp->wq, qp->wq.rq.wq_pidx_inc, NULL); + qp->wq.rq.wq_pidx_inc = 0; + spin_unlock(&qp->lock); +} + +static void resume_a_chunk(struct uld_ctx *ctx) +{ + int i; + struct c4iw_qp *qp; + + for (i = 0; i < DB_FC_RESUME_SIZE; i++) { + qp = list_first_entry(&ctx->dev->db_fc_list, struct c4iw_qp, + db_fc_entry); + list_del_init(&qp->db_fc_entry); + resume_rc_qp(qp); + if (list_empty(&ctx->dev->db_fc_list)) + break; + } +} + +static void resume_queues(struct uld_ctx *ctx) +{ + spin_lock_irq(&ctx->dev->lock); + if (ctx->dev->db_state != STOPPED) + goto out; + ctx->dev->db_state = FLOW_CONTROL; + while (1) { + if (list_empty(&ctx->dev->db_fc_list)) { + WARN_ON(ctx->dev->db_state != FLOW_CONTROL); + ctx->dev->db_state = NORMAL; + ctx->dev->rdev.stats.db_state_transitions++; + if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) { + idr_for_each(&ctx->dev->qpidr, enable_qp_db, + NULL); + } else { + ctx->dev->rdev.status_page->db_off = 0; + } + break; + } else { + if (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) + < (ctx->dev->rdev.lldi.dbfifo_int_thresh << + DB_FC_DRAIN_THRESH)) { + resume_a_chunk(ctx); + } + if (!list_empty(&ctx->dev->db_fc_list)) { + spin_unlock_irq(&ctx->dev->lock); + if (DB_FC_RESUME_DELAY) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(DB_FC_RESUME_DELAY); + } + spin_lock_irq(&ctx->dev->lock); + if (ctx->dev->db_state != FLOW_CONTROL) + break; + } + } + } +out: + if (ctx->dev->db_state != NORMAL) + ctx->dev->rdev.stats.db_fc_interruptions++; + spin_unlock_irq(&ctx->dev->lock); +} + +struct qp_list { + unsigned idx; + struct c4iw_qp **qps; +}; + +static int add_and_ref_qp(int id, void *p, void *data) +{ + struct qp_list *qp_listp = data; + struct c4iw_qp *qp = p; + + c4iw_qp_add_ref(&qp->ibqp); + qp_listp->qps[qp_listp->idx++] = qp; + return 0; +} + +static int count_qps(int id, void *p, void *data) +{ + unsigned *countp = data; + (*countp)++; + return 0; +} + +static void deref_qps(struct qp_list *qp_list) +{ + int idx; + + for (idx = 0; idx < qp_list->idx; idx++) + c4iw_qp_rem_ref(&qp_list->qps[idx]->ibqp); +} + +static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list) +{ + int idx; + int ret; + + for (idx = 0; idx < qp_list->idx; idx++) { + struct c4iw_qp *qp = qp_list->qps[idx]; + + spin_lock_irq(&qp->rhp->lock); + spin_lock(&qp->lock); + ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0], + qp->wq.sq.qid, + t4_sq_host_wq_pidx(&qp->wq), + 
t4_sq_wq_size(&qp->wq)); + if (ret) { + pr_err("%s: Fatal error - DB overflow recovery failed - error syncing SQ qid %u\n", + pci_name(ctx->lldi.pdev), qp->wq.sq.qid); + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); + return; + } + qp->wq.sq.wq_pidx_inc = 0; + + ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0], + qp->wq.rq.qid, + t4_rq_host_wq_pidx(&qp->wq), + t4_rq_wq_size(&qp->wq)); + + if (ret) { + pr_err("%s: Fatal error - DB overflow recovery failed - error syncing RQ qid %u\n", + pci_name(ctx->lldi.pdev), qp->wq.rq.qid); + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); + return; + } + qp->wq.rq.wq_pidx_inc = 0; + spin_unlock(&qp->lock); + spin_unlock_irq(&qp->rhp->lock); + + /* Wait for the dbfifo to drain */ + while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(10)); + } + } +} + +static void recover_queues(struct uld_ctx *ctx) +{ + int count = 0; + struct qp_list qp_list; + int ret; + + /* slow everybody down */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(1000)); + + /* flush the SGE contexts */ + ret = cxgb4_flush_eq_cache(ctx->dev->rdev.lldi.ports[0]); + if (ret) { + pr_err("%s: Fatal error - DB overflow recovery failed\n", + pci_name(ctx->lldi.pdev)); + return; + } + + /* Count active queues so we can build a list of queues to recover */ + spin_lock_irq(&ctx->dev->lock); + WARN_ON(ctx->dev->db_state != STOPPED); + ctx->dev->db_state = RECOVERY; + idr_for_each(&ctx->dev->qpidr, count_qps, &count); + + qp_list.qps = kzalloc(count * sizeof *qp_list.qps, GFP_ATOMIC); + if (!qp_list.qps) { + spin_unlock_irq(&ctx->dev->lock); + return; + } + qp_list.idx = 0; + + /* add and ref each qp so it doesn't get freed */ + idr_for_each(&ctx->dev->qpidr, add_and_ref_qp, &qp_list); + + spin_unlock_irq(&ctx->dev->lock); + + /* now traverse the list in a safe context to recover the db state*/ + recover_lost_dbs(ctx, &qp_list); + + /* we're almost done! deref the qps and clean up */ + deref_qps(&qp_list); + kfree(qp_list.qps); + + spin_lock_irq(&ctx->dev->lock); + WARN_ON(ctx->dev->db_state != RECOVERY); + ctx->dev->db_state = STOPPED; + spin_unlock_irq(&ctx->dev->lock); +} + +static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...) 
+{ + struct uld_ctx *ctx = handle; + + switch (control) { + case CXGB4_CONTROL_DB_FULL: + stop_queues(ctx); + ctx->dev->rdev.stats.db_full++; + break; + case CXGB4_CONTROL_DB_EMPTY: + resume_queues(ctx); + mutex_lock(&ctx->dev->rdev.stats.lock); + ctx->dev->rdev.stats.db_empty++; + mutex_unlock(&ctx->dev->rdev.stats.lock); + break; + case CXGB4_CONTROL_DB_DROP: + recover_queues(ctx); + mutex_lock(&ctx->dev->rdev.stats.lock); + ctx->dev->rdev.stats.db_drop++; + mutex_unlock(&ctx->dev->rdev.stats.lock); + break; + default: + pr_warn("%s: unknown control cmd %u\n", + pci_name(ctx->lldi.pdev), control); + break; + } + return 0; +} + +static struct cxgb4_uld_info c4iw_uld_info = { + .name = DRV_NAME, + .nrxq = MAX_ULD_QSETS, + .ntxq = MAX_ULD_QSETS, + .rxq_size = 511, + .ciq = true, + .lro = false, + .add = c4iw_uld_add, + .rx_handler = c4iw_uld_rx_handler, + .state_change = c4iw_uld_state_change, + .control = c4iw_uld_control, +}; + +void _c4iw_free_wr_wait(struct kref *kref) +{ + struct c4iw_wr_wait *wr_waitp; + + wr_waitp = container_of(kref, struct c4iw_wr_wait, kref); + pr_debug("Free wr_wait %p\n", wr_waitp); + kfree(wr_waitp); +} + +struct c4iw_wr_wait *c4iw_alloc_wr_wait(gfp_t gfp) +{ + struct c4iw_wr_wait *wr_waitp; + + wr_waitp = kzalloc(sizeof(*wr_waitp), gfp); + if (wr_waitp) { + kref_init(&wr_waitp->kref); + pr_debug("wr_wait %p\n", wr_waitp); + } + return wr_waitp; +} + +static int __init c4iw_init_module(void) +{ + int err; + + err = c4iw_cm_init(); + if (err) + return err; + + c4iw_debugfs_root = debugfs_create_dir(DRV_NAME, NULL); + if (!c4iw_debugfs_root) + pr_warn("could not create debugfs entry, continuing\n"); + + reg_workq = create_singlethread_workqueue("Register_iWARP_device"); + if (!reg_workq) { + pr_err("Failed creating workqueue to register iwarp device\n"); + return -ENOMEM; + } + + cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info); + + return 0; +} + +static void __exit c4iw_exit_module(void) +{ + struct uld_ctx *ctx, *tmp; + + mutex_lock(&dev_mutex); + list_for_each_entry_safe(ctx, tmp, &uld_ctx_list, entry) { + if (ctx->dev) + c4iw_remove(ctx); + kfree(ctx); + } + mutex_unlock(&dev_mutex); + flush_workqueue(reg_workq); + destroy_workqueue(reg_workq); + cxgb4_unregister_uld(CXGB4_ULD_RDMA); + c4iw_cm_term(); + debugfs_remove_recursive(c4iw_debugfs_root); +} + +module_init(c4iw_init_module); +module_exit(c4iw_exit_module); diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c new file mode 100644 index 0000000..3e9d8b2 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/ev.c @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include + +#include "iw_cxgb4.h" + +static void print_tpte(struct c4iw_dev *dev, u32 stag) +{ + int ret; + struct fw_ri_tpte tpte; + + ret = cxgb4_read_tpte(dev->rdev.lldi.ports[0], stag, + (__be32 *)&tpte); + if (ret) { + dev_err(&dev->rdev.lldi.pdev->dev, + "%s cxgb4_read_tpte err %d\n", __func__, ret); + return; + } + pr_debug("stag idx 0x%x valid %d key 0x%x state %d pdid %d perm 0x%x ps %d len 0x%llx va 0x%llx\n", + stag & 0xffffff00, + FW_RI_TPTE_VALID_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_STAGKEY_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_STAGSTATE_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_PDID_G(ntohl(tpte.valid_to_pdid)), + FW_RI_TPTE_PERM_G(ntohl(tpte.locread_to_qpid)), + FW_RI_TPTE_PS_G(ntohl(tpte.locread_to_qpid)), + ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo), + ((u64)ntohl(tpte.va_hi) << 32) | ntohl(tpte.va_lo_fbo)); +} + +static void dump_err_cqe(struct c4iw_dev *dev, struct t4_cqe *err_cqe) +{ + __be64 *p = (void *)err_cqe; + + dev_err(&dev->rdev.lldi.pdev->dev, + "AE qpid %d opcode %d status 0x%x " + "type %d len 0x%x wrid.hi 0x%x wrid.lo 0x%x\n", + CQE_QPID(err_cqe), CQE_OPCODE(err_cqe), + CQE_STATUS(err_cqe), CQE_TYPE(err_cqe), ntohl(err_cqe->len), + CQE_WRID_HI(err_cqe), CQE_WRID_LOW(err_cqe)); + + pr_debug("%016llx %016llx %016llx %016llx\n", + be64_to_cpu(p[0]), be64_to_cpu(p[1]), be64_to_cpu(p[2]), + be64_to_cpu(p[3])); + + /* + * Ingress WRITE and READ_RESP errors provide + * the offending stag, so parse and log it. 
+ */ + if (RQ_TYPE(err_cqe) && (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE || + CQE_OPCODE(err_cqe) == FW_RI_READ_RESP)) + print_tpte(dev, CQE_WRID_STAG(err_cqe)); +} + +static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp, + struct c4iw_qp *qhp, + struct t4_cqe *err_cqe, + enum ib_event_type ib_event) +{ + struct ib_event event; + struct c4iw_qp_attributes attrs; + unsigned long flag; + + dump_err_cqe(dev, err_cqe); + + if (qhp->attr.state == C4IW_QP_STATE_RTS) { + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(qhp->rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, + &attrs, 0); + } + + event.event = ib_event; + event.device = chp->ibcq.device; + if (ib_event == IB_EVENT_CQ_ERR) + event.element.cq = &chp->ibcq; + else + event.element.qp = &qhp->ibqp; + if (qhp->ibqp.event_handler) + (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + + if (t4_clear_cq_armed(&chp->cq)) { + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + } +} + +void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe) +{ + struct c4iw_cq *chp; + struct c4iw_qp *qhp; + u32 cqid; + + spin_lock_irq(&dev->lock); + qhp = get_qhp(dev, CQE_QPID(err_cqe)); + if (!qhp) { + pr_err("BAD AE qpid 0x%x opcode %d status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n", + CQE_QPID(err_cqe), + CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe), + CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe), + CQE_WRID_LOW(err_cqe)); + spin_unlock_irq(&dev->lock); + goto out; + } + + if (SQ_TYPE(err_cqe)) + cqid = qhp->attr.scq; + else + cqid = qhp->attr.rcq; + chp = get_chp(dev, cqid); + if (!chp) { + pr_err("BAD AE cqid 0x%x qpid 0x%x opcode %d status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n", + cqid, CQE_QPID(err_cqe), + CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe), + CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe), + CQE_WRID_LOW(err_cqe)); + spin_unlock_irq(&dev->lock); + goto out; + } + + c4iw_qp_add_ref(&qhp->ibqp); + atomic_inc(&chp->refcnt); + spin_unlock_irq(&dev->lock); + + /* Bad incoming write */ + if (RQ_TYPE(err_cqe) && + (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE)) { + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_REQ_ERR); + goto done; + } + + switch (CQE_STATUS(err_cqe)) { + + /* Completion Events */ + case T4_ERR_SUCCESS: + pr_err("AE with status 0!\n"); + break; + + case T4_ERR_STAG: + case T4_ERR_PDID: + case T4_ERR_QPID: + case T4_ERR_ACCESS: + case T4_ERR_WRAP: + case T4_ERR_BOUND: + case T4_ERR_INVALIDATE_SHARED_MR: + case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_ACCESS_ERR); + break; + + /* Device Fatal Errors */ + case T4_ERR_ECC: + case T4_ERR_ECC_PSTAG: + case T4_ERR_INTERNAL_ERR: + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_DEVICE_FATAL); + break; + + /* QP Fatal Errors */ + case T4_ERR_OUT_OF_RQE: + case T4_ERR_PBL_ADDR_BOUND: + case T4_ERR_CRC: + case T4_ERR_MARKER: + case T4_ERR_PDU_LEN_ERR: + case T4_ERR_DDP_VERSION: + case T4_ERR_RDMA_VERSION: + case T4_ERR_OPCODE: + case T4_ERR_DDP_QUEUE_NUM: + case T4_ERR_MSN: + case T4_ERR_TBIT: + case T4_ERR_MO: + case T4_ERR_MSN_GAP: + case T4_ERR_MSN_RANGE: + case T4_ERR_RQE_ADDR_BOUND: + case T4_ERR_IRD_OVERFLOW: + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL); + break; + + default: + pr_err("Unknown T4 status 0x%x QPID 0x%x\n", + CQE_STATUS(err_cqe), qhp->wq.sq.qid); + post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL); + break; + } +done: + if (atomic_dec_and_test(&chp->refcnt)) + 
wake_up(&chp->wait); + c4iw_qp_rem_ref(&qhp->ibqp); +out: + return; +} + +int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid) +{ + struct c4iw_cq *chp; + unsigned long flag; + + spin_lock_irqsave(&dev->lock, flag); + chp = get_chp(dev, qid); + if (chp) { + atomic_inc(&chp->refcnt); + spin_unlock_irqrestore(&dev->lock, flag); + t4_clear_cq_armed(&chp->cq); + spin_lock_irqsave(&chp->comp_handler_lock, flag); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + spin_unlock_irqrestore(&chp->comp_handler_lock, flag); + if (atomic_dec_and_test(&chp->refcnt)) + wake_up(&chp->wait); + } else { + pr_debug("unknown cqid 0x%x\n", qid); + spin_unlock_irqrestore(&dev->lock, flag); + } + return 0; +} diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c new file mode 100644 index 0000000..5c2cfde --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/id_table.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2011 Chelsio Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include "iw_cxgb4.h" + +#define RANDOM_SKIP 16 + +/* + * Trivial bitmap-based allocator. If the random flag is set, the + * allocator is designed to: + * - pseudo-randomize the id returned such that it is not trivially predictable. 
+ * - avoid reuse of recently used id (at the expense of predictability) + */ +u32 c4iw_id_alloc(struct c4iw_id_table *alloc) +{ + unsigned long flags; + u32 obj; + + spin_lock_irqsave(&alloc->lock, flags); + + obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last); + if (obj >= alloc->max) + obj = find_first_zero_bit(alloc->table, alloc->max); + + if (obj < alloc->max) { + if (alloc->flags & C4IW_ID_TABLE_F_RANDOM) + alloc->last += prandom_u32() % RANDOM_SKIP; + else + alloc->last = obj + 1; + if (alloc->last >= alloc->max) + alloc->last = 0; + set_bit(obj, alloc->table); + obj += alloc->start; + } else + obj = -1; + + spin_unlock_irqrestore(&alloc->lock, flags); + return obj; +} + +void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj) +{ + unsigned long flags; + + obj -= alloc->start; + + spin_lock_irqsave(&alloc->lock, flags); + clear_bit(obj, alloc->table); + spin_unlock_irqrestore(&alloc->lock, flags); +} + +int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, + u32 reserved, u32 flags) +{ + int i; + + alloc->start = start; + alloc->flags = flags; + if (flags & C4IW_ID_TABLE_F_RANDOM) + alloc->last = prandom_u32() % RANDOM_SKIP; + else + alloc->last = 0; + alloc->max = num; + spin_lock_init(&alloc->lock); + alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof(long), + GFP_KERNEL); + if (!alloc->table) + return -ENOMEM; + + bitmap_zero(alloc->table, num); + if (!(alloc->flags & C4IW_ID_TABLE_F_EMPTY)) + for (i = 0; i < reserved; ++i) + set_bit(i, alloc->table); + + return 0; +} + +void c4iw_id_table_free(struct c4iw_id_table *alloc) +{ + kfree(alloc->table); +} diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h new file mode 100644 index 0000000..8310277 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -0,0 +1,1085 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __IW_CXGB4_H__ +#define __IW_CXGB4_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +#include "cxgb4.h" +#include "cxgb4_uld.h" +#include "l2t.h" +#include + +#define DRV_NAME "iw_cxgb4" +#define MOD DRV_NAME ":" + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "t4.h" + +#define PBL_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->pbl.start) +#define RQT_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->rq.start) + +static inline void *cplhdr(struct sk_buff *skb) +{ + return skb->data; +} + +#define C4IW_ID_TABLE_F_RANDOM 1 /* Pseudo-randomize the id's returned */ +#define C4IW_ID_TABLE_F_EMPTY 2 /* Table is initially empty */ + +struct c4iw_id_table { + u32 flags; + u32 start; /* logical minimal id */ + u32 last; /* hint for find */ + u32 max; + spinlock_t lock; + unsigned long *table; +}; + +struct c4iw_resource { + struct c4iw_id_table tpt_table; + struct c4iw_id_table qid_table; + struct c4iw_id_table pdid_table; +}; + +struct c4iw_qid_list { + struct list_head entry; + u32 qid; +}; + +struct c4iw_dev_ucontext { + struct list_head qpids; + struct list_head cqids; + struct mutex lock; + struct kref kref; +}; + +enum c4iw_rdev_flags { + T4_FATAL_ERROR = (1<<0), + T4_STATUS_PAGE_DISABLED = (1<<1), +}; + +struct c4iw_stat { + u64 total; + u64 cur; + u64 max; + u64 fail; +}; + +struct c4iw_stats { + struct mutex lock; + struct c4iw_stat qid; + struct c4iw_stat pd; + struct c4iw_stat stag; + struct c4iw_stat pbl; + struct c4iw_stat rqt; + struct c4iw_stat ocqp; + u64 db_full; + u64 db_empty; + u64 db_drop; + u64 db_state_transitions; + u64 db_fc_interruptions; + u64 tcam_full; + u64 act_ofld_conn_fails; + u64 pas_ofld_conn_fails; + u64 neg_adv; +}; + +struct c4iw_hw_queue { + int t4_eq_status_entries; + int t4_max_eq_size; + int t4_max_iq_size; + int t4_max_rq_size; + int t4_max_sq_size; + int t4_max_qp_depth; + int t4_max_cq_depth; + int t4_stat_len; +}; + +struct wr_log_entry { + ktime_t post_host_time; + ktime_t poll_host_time; + u64 post_sge_ts; + u64 cqe_sge_ts; + u64 poll_sge_ts; + u16 qid; + u16 wr_id; + u8 opcode; + u8 valid; +}; + +struct c4iw_rdev { + struct c4iw_resource resource; + u32 qpmask; + u32 cqmask; + struct c4iw_dev_ucontext uctx; + struct gen_pool *pbl_pool; + struct gen_pool *rqt_pool; + struct gen_pool *ocqp_pool; + u32 flags; + struct cxgb4_lld_info lldi; + unsigned long bar2_pa; + void __iomem *bar2_kva; + unsigned long oc_mw_pa; + void __iomem *oc_mw_kva; + struct c4iw_stats stats; + struct c4iw_hw_queue hw_queue; + struct t4_dev_status_page *status_page; + atomic_t wr_log_idx; + struct wr_log_entry *wr_log; + int wr_log_size; + struct workqueue_struct *free_workq; + struct completion rqt_compl; + struct completion pbl_compl; + struct kref rqt_kref; + struct kref pbl_kref; +}; + +static inline int c4iw_fatal_error(struct c4iw_rdev *rdev) +{ + return rdev->flags & T4_FATAL_ERROR; +} + +static inline int c4iw_num_stags(struct c4iw_rdev *rdev) +{ + return (int)(rdev->lldi.vr->stag.size >> 5); +} + +#define C4IW_WR_TO (60*HZ) + +struct c4iw_wr_wait { + struct completion completion; + int ret; + struct kref kref; +}; + +void _c4iw_free_wr_wait(struct kref *kref); + +static inline void c4iw_put_wr_wait(struct c4iw_wr_wait *wr_waitp) +{ + pr_debug("wr_wait %p ref before put %u\n", wr_waitp, + kref_read(&wr_waitp->kref)); + 
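
/*
 * The c4iw_wr_wait object defined below couples a completion with a kref
 * so the thread issuing a work request and the (possibly late) firmware
 * reply path can each hold a reference safely: a reference is taken
 * before posting, dropped on send failure, and the reply path drops its
 * own reference when it wakes the waiter.  Below is a minimal user-space
 * model of that get/put discipline using C11 atomics; wr_wait_alloc(),
 * wr_wait_get() and wr_wait_put() are illustrative names, not driver
 * symbols.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct wr_wait {
	atomic_int ref;		/* stands in for struct kref */
	int done;		/* stands in for the completion */
	int ret;
};

static struct wr_wait *wr_wait_alloc(void)
{
	struct wr_wait *w = calloc(1, sizeof(*w));

	if (w)
		atomic_init(&w->ref, 1);	/* issuer's reference */
	return w;
}

static void wr_wait_get(struct wr_wait *w)
{
	atomic_fetch_add(&w->ref, 1);		/* reply path's reference */
}

static void wr_wait_put(struct wr_wait *w)
{
	if (atomic_fetch_sub(&w->ref, 1) == 1) {
		printf("last reference dropped, freeing %p\n", (void *)w);
		free(w);
	}
}

/* Reply path: record the status, mark completion, drop its reference. */
static void wr_wait_complete(struct wr_wait *w, int ret)
{
	w->ret = ret;
	w->done = 1;
	wr_wait_put(w);
}

int main(void)
{
	struct wr_wait *w = wr_wait_alloc();

	if (!w)
		return 1;
	wr_wait_get(w);		/* taken before posting the request */
	wr_wait_complete(w, 0);	/* reply arrives */
	if (w->done)
		printf("request completed, ret %d\n", w->ret);
	wr_wait_put(w);		/* issuer drops its reference */
	return 0;
}
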
WARN_ON(kref_read(&wr_waitp->kref) == 0); + kref_put(&wr_waitp->kref, _c4iw_free_wr_wait); +} + +static inline void c4iw_get_wr_wait(struct c4iw_wr_wait *wr_waitp) +{ + pr_debug("wr_wait %p ref before get %u\n", wr_waitp, + kref_read(&wr_waitp->kref)); + WARN_ON(kref_read(&wr_waitp->kref) == 0); + kref_get(&wr_waitp->kref); +} + +static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp) +{ + wr_waitp->ret = 0; + init_completion(&wr_waitp->completion); +} + +static inline void _c4iw_wake_up(struct c4iw_wr_wait *wr_waitp, int ret, + bool deref) +{ + wr_waitp->ret = ret; + complete(&wr_waitp->completion); + if (deref) + c4iw_put_wr_wait(wr_waitp); +} + +static inline void c4iw_wake_up_noref(struct c4iw_wr_wait *wr_waitp, int ret) +{ + _c4iw_wake_up(wr_waitp, ret, false); +} + +static inline void c4iw_wake_up_deref(struct c4iw_wr_wait *wr_waitp, int ret) +{ + _c4iw_wake_up(wr_waitp, ret, true); +} + +static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, + struct c4iw_wr_wait *wr_waitp, + u32 hwtid, u32 qpid, + const char *func) +{ + int ret; + + if (c4iw_fatal_error(rdev)) { + wr_waitp->ret = -EIO; + goto out; + } + + ret = wait_for_completion_timeout(&wr_waitp->completion, C4IW_WR_TO); + if (!ret) { + pr_err("%s - Device %s not responding (disabling device) - tid %u qpid %u\n", + func, pci_name(rdev->lldi.pdev), hwtid, qpid); + rdev->flags |= T4_FATAL_ERROR; + wr_waitp->ret = -EIO; + goto out; + } + if (wr_waitp->ret) + pr_debug("%s: FW reply %d tid %u qpid %u\n", + pci_name(rdev->lldi.pdev), wr_waitp->ret, hwtid, qpid); +out: + return wr_waitp->ret; +} + +int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb); + +static inline int c4iw_ref_send_wait(struct c4iw_rdev *rdev, + struct sk_buff *skb, + struct c4iw_wr_wait *wr_waitp, + u32 hwtid, u32 qpid, + const char *func) +{ + int ret; + + pr_debug("%s wr_wait %p hwtid %u qpid %u\n", func, wr_waitp, hwtid, + qpid); + c4iw_get_wr_wait(wr_waitp); + ret = c4iw_ofld_send(rdev, skb); + if (ret) { + c4iw_put_wr_wait(wr_waitp); + return ret; + } + return c4iw_wait_for_reply(rdev, wr_waitp, hwtid, qpid, func); +} + +enum db_state { + NORMAL = 0, + FLOW_CONTROL = 1, + RECOVERY = 2, + STOPPED = 3 +}; + +struct c4iw_dev { + struct ib_device ibdev; + struct c4iw_rdev rdev; + u32 device_cap_flags; + struct idr cqidr; + struct idr qpidr; + struct idr mmidr; + spinlock_t lock; + struct mutex db_mutex; + struct dentry *debugfs_root; + enum db_state db_state; + struct idr hwtid_idr; + struct idr atid_idr; + struct idr stid_idr; + struct list_head db_fc_list; + u32 avail_ird; + wait_queue_head_t wait; +}; + +struct uld_ctx { + struct list_head entry; + struct cxgb4_lld_info lldi; + struct c4iw_dev *dev; + struct work_struct reg_work; +}; + +static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct c4iw_dev, ibdev); +} + +static inline struct c4iw_dev *rdev_to_c4iw_dev(struct c4iw_rdev *rdev) +{ + return container_of(rdev, struct c4iw_dev, rdev); +} + +static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 cqid) +{ + return idr_find(&rhp->cqidr, cqid); +} + +static inline struct c4iw_qp *get_qhp(struct c4iw_dev *rhp, u32 qpid) +{ + return idr_find(&rhp->qpidr, qpid); +} + +static inline struct c4iw_mr *get_mhp(struct c4iw_dev *rhp, u32 mmid) +{ + return idr_find(&rhp->mmidr, mmid); +} + +static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr, + void *handle, u32 id, int lock) +{ + int ret; + + if (lock) { + idr_preload(GFP_KERNEL); + spin_lock_irq(&rhp->lock); + 
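
/*
 * _insert_handle() here pins each object in an IDR under its hardware id
 * by requesting the one-entry range [id, id + 1), which is what lets
 * get_qhp()/get_chp() translate the QPID or CQID carried in a CQE
 * straight back to the driver object.  A small user-space model of that
 * fixed-key map follows; handle_table, insert_handle() and find_handle()
 * are illustrative stand-ins for the IDR calls, not driver or kernel
 * APIs.
 */
#include <stdio.h>

#define TABLE_SIZE 64

struct handle_table {
	unsigned int ids[TABLE_SIZE];
	void *objs[TABLE_SIZE];
	int used[TABLE_SIZE];
};

/* Fails with -1 if the id is already present or the table is full. */
static int insert_handle(struct handle_table *t, unsigned int id, void *obj)
{
	int i, free_slot = -1;

	for (i = 0; i < TABLE_SIZE; i++) {
		if (t->used[i] && t->ids[i] == id)
			return -1;
		if (!t->used[i] && free_slot < 0)
			free_slot = i;
	}
	if (free_slot < 0)
		return -1;
	t->ids[free_slot] = id;
	t->objs[free_slot] = obj;
	t->used[free_slot] = 1;
	return 0;
}

static void *find_handle(struct handle_table *t, unsigned int id)
{
	int i;

	for (i = 0; i < TABLE_SIZE; i++)
		if (t->used[i] && t->ids[i] == id)
			return t->objs[i];
	return NULL;
}

int main(void)
{
	static struct handle_table table;
	int qp_object = 42;	/* stands in for a struct c4iw_qp */

	if (insert_handle(&table, 1026, &qp_object))
		return 1;
	printf("qpid 1026 -> %p\n", find_handle(&table, 1026));
	printf("qpid 9999 -> %p\n", find_handle(&table, 9999));
	return 0;
}
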
} + + ret = idr_alloc(idr, handle, id, id + 1, GFP_ATOMIC); + + if (lock) { + spin_unlock_irq(&rhp->lock); + idr_preload_end(); + } + + return ret < 0 ? ret : 0; +} + +static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + return _insert_handle(rhp, idr, handle, id, 1); +} + +static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + return _insert_handle(rhp, idr, handle, id, 0); +} + +static inline void _remove_handle(struct c4iw_dev *rhp, struct idr *idr, + u32 id, int lock) +{ + if (lock) + spin_lock_irq(&rhp->lock); + idr_remove(idr, id); + if (lock) + spin_unlock_irq(&rhp->lock); +} + +static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id) +{ + _remove_handle(rhp, idr, id, 1); +} + +static inline void remove_handle_nolock(struct c4iw_dev *rhp, + struct idr *idr, u32 id) +{ + _remove_handle(rhp, idr, id, 0); +} + +extern uint c4iw_max_read_depth; + +static inline int cur_max_read_depth(struct c4iw_dev *dev) +{ + return min(dev->rdev.lldi.max_ordird_qp, c4iw_max_read_depth); +} + +struct c4iw_pd { + struct ib_pd ibpd; + u32 pdid; + struct c4iw_dev *rhp; +}; + +static inline struct c4iw_pd *to_c4iw_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct c4iw_pd, ibpd); +} + +struct tpt_attributes { + u64 len; + u64 va_fbo; + enum fw_ri_mem_perms perms; + u32 stag; + u32 pdid; + u32 qpid; + u32 pbl_addr; + u32 pbl_size; + u32 state:1; + u32 type:2; + u32 rsvd:1; + u32 remote_invaliate_disable:1; + u32 zbva:1; + u32 mw_bind_enable:1; + u32 page_size:5; +}; + +struct c4iw_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct c4iw_dev *rhp; + struct sk_buff *dereg_skb; + u64 kva; + struct tpt_attributes attr; + u64 *mpl; + dma_addr_t mpl_addr; + u32 max_mpl_len; + u32 mpl_len; + struct c4iw_wr_wait *wr_waitp; +}; + +static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct c4iw_mr, ibmr); +} + +struct c4iw_mw { + struct ib_mw ibmw; + struct c4iw_dev *rhp; + struct sk_buff *dereg_skb; + u64 kva; + struct tpt_attributes attr; + struct c4iw_wr_wait *wr_waitp; +}; + +static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct c4iw_mw, ibmw); +} + +struct c4iw_cq { + struct ib_cq ibcq; + struct c4iw_dev *rhp; + struct sk_buff *destroy_skb; + struct t4_cq cq; + spinlock_t lock; + spinlock_t comp_handler_lock; + atomic_t refcnt; + wait_queue_head_t wait; + struct c4iw_wr_wait *wr_waitp; +}; + +static inline struct c4iw_cq *to_c4iw_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct c4iw_cq, ibcq); +} + +struct c4iw_mpa_attributes { + u8 initiator; + u8 recv_marker_enabled; + u8 xmit_marker_enabled; + u8 crc_enabled; + u8 enhanced_rdma_conn; + u8 version; + u8 p2p_type; +}; + +struct c4iw_qp_attributes { + u32 scq; + u32 rcq; + u32 sq_num_entries; + u32 rq_num_entries; + u32 sq_max_sges; + u32 sq_max_sges_rdma_write; + u32 rq_max_sges; + u32 state; + u8 enable_rdma_read; + u8 enable_rdma_write; + u8 enable_bind; + u8 enable_mmid0_fastreg; + u32 max_ord; + u32 max_ird; + u32 pd; + u32 next_state; + char terminate_buffer[52]; + u32 terminate_msg_len; + u8 is_terminate_local; + struct c4iw_mpa_attributes mpa_attr; + struct c4iw_ep *llp_stream_handle; + u8 layer_etype; + u8 ecode; + u16 sq_db_inc; + u16 rq_db_inc; + u8 send_term; +}; + +struct c4iw_qp { + struct ib_qp ibqp; + struct list_head db_fc_entry; + struct c4iw_dev *rhp; + struct c4iw_ep *ep; + struct c4iw_qp_attributes attr; + 
struct t4_wq wq; + spinlock_t lock; + struct mutex mutex; + struct kref kref; + wait_queue_head_t wait; + int sq_sig_all; + struct work_struct free_work; + struct c4iw_ucontext *ucontext; + struct c4iw_wr_wait *wr_waitp; +}; + +static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct c4iw_qp, ibqp); +} + +struct c4iw_ucontext { + struct ib_ucontext ibucontext; + struct c4iw_dev_ucontext uctx; + u32 key; + spinlock_t mmap_lock; + struct list_head mmaps; + struct kref kref; +}; + +static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) +{ + return container_of(c, struct c4iw_ucontext, ibucontext); +} + +void _c4iw_free_ucontext(struct kref *kref); + +static inline void c4iw_put_ucontext(struct c4iw_ucontext *ucontext) +{ + kref_put(&ucontext->kref, _c4iw_free_ucontext); +} + +static inline void c4iw_get_ucontext(struct c4iw_ucontext *ucontext) +{ + kref_get(&ucontext->kref); +} + +struct c4iw_mm_entry { + struct list_head entry; + u64 addr; + u32 key; + unsigned len; +}; + +static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, + u32 key, unsigned len) +{ + struct list_head *pos, *nxt; + struct c4iw_mm_entry *mm; + + spin_lock(&ucontext->mmap_lock); + list_for_each_safe(pos, nxt, &ucontext->mmaps) { + + mm = list_entry(pos, struct c4iw_mm_entry, entry); + if (mm->key == key && mm->len == len) { + list_del_init(&mm->entry); + spin_unlock(&ucontext->mmap_lock); + pr_debug("key 0x%x addr 0x%llx len %d\n", key, + (unsigned long long)mm->addr, mm->len); + return mm; + } + } + spin_unlock(&ucontext->mmap_lock); + return NULL; +} + +static inline void insert_mmap(struct c4iw_ucontext *ucontext, + struct c4iw_mm_entry *mm) +{ + spin_lock(&ucontext->mmap_lock); + pr_debug("key 0x%x addr 0x%llx len %d\n", + mm->key, (unsigned long long)mm->addr, mm->len); + list_add_tail(&mm->entry, &ucontext->mmaps); + spin_unlock(&ucontext->mmap_lock); +} + +enum c4iw_qp_attr_mask { + C4IW_QP_ATTR_NEXT_STATE = 1 << 0, + C4IW_QP_ATTR_SQ_DB = 1<<1, + C4IW_QP_ATTR_RQ_DB = 1<<2, + C4IW_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, + C4IW_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, + C4IW_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, + C4IW_QP_ATTR_MAX_ORD = 1 << 11, + C4IW_QP_ATTR_MAX_IRD = 1 << 12, + C4IW_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, + C4IW_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, + C4IW_QP_ATTR_MPA_ATTR = 1 << 24, + C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, + C4IW_QP_ATTR_VALID_MODIFY = (C4IW_QP_ATTR_ENABLE_RDMA_READ | + C4IW_QP_ATTR_ENABLE_RDMA_WRITE | + C4IW_QP_ATTR_MAX_ORD | + C4IW_QP_ATTR_MAX_IRD | + C4IW_QP_ATTR_LLP_STREAM_HANDLE | + C4IW_QP_ATTR_STREAM_MSG_BUFFER | + C4IW_QP_ATTR_MPA_ATTR | + C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE) +}; + +int c4iw_modify_qp(struct c4iw_dev *rhp, + struct c4iw_qp *qhp, + enum c4iw_qp_attr_mask mask, + struct c4iw_qp_attributes *attrs, + int internal); + +enum c4iw_qp_state { + C4IW_QP_STATE_IDLE, + C4IW_QP_STATE_RTS, + C4IW_QP_STATE_ERROR, + C4IW_QP_STATE_TERMINATE, + C4IW_QP_STATE_CLOSING, + C4IW_QP_STATE_TOT +}; + +static inline int c4iw_convert_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + return C4IW_QP_STATE_IDLE; + case IB_QPS_RTS: + return C4IW_QP_STATE_RTS; + case IB_QPS_SQD: + return C4IW_QP_STATE_CLOSING; + case IB_QPS_SQE: + return C4IW_QP_STATE_TERMINATE; + case IB_QPS_ERR: + return C4IW_QP_STATE_ERROR; + default: + return -1; + } +} + +static inline int to_ib_qp_state(int c4iw_qp_state) +{ + switch (c4iw_qp_state) { + case C4IW_QP_STATE_IDLE: + return 
IB_QPS_INIT; + case C4IW_QP_STATE_RTS: + return IB_QPS_RTS; + case C4IW_QP_STATE_CLOSING: + return IB_QPS_SQD; + case C4IW_QP_STATE_TERMINATE: + return IB_QPS_SQE; + case C4IW_QP_STATE_ERROR: + return IB_QPS_ERR; + } + return IB_QPS_ERR; +} + +static inline u32 c4iw_ib_to_tpt_access(int a) +{ + return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) | + (a & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0) | + (a & IB_ACCESS_LOCAL_WRITE ? FW_RI_MEM_ACCESS_LOCAL_WRITE : 0) | + FW_RI_MEM_ACCESS_LOCAL_READ; +} + +static inline u32 c4iw_ib_to_tpt_bind_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0); +} + +enum c4iw_mmid_state { + C4IW_STAG_STATE_VALID, + C4IW_STAG_STATE_INVALID +}; + +#define C4IW_NODE_DESC "cxgb4 Chelsio Communications" + +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" + +#define MPA_MAX_PRIVATE_DATA 256 +#define MPA_ENHANCED_RDMA_CONN 0x10 +#define MPA_REJECT 0x20 +#define MPA_CRC 0x40 +#define MPA_MARKERS 0x80 +#define MPA_FLAGS_MASK 0xE0 + +#define MPA_V2_PEER2PEER_MODEL 0x8000 +#define MPA_V2_ZERO_LEN_FPDU_RTR 0x4000 +#define MPA_V2_RDMA_WRITE_RTR 0x8000 +#define MPA_V2_RDMA_READ_RTR 0x4000 +#define MPA_V2_IRD_ORD_MASK 0x3FFF + +#define c4iw_put_ep(ep) { \ + pr_debug("put_ep ep %p refcnt %d\n", \ + ep, kref_read(&((ep)->kref))); \ + WARN_ON(kref_read(&((ep)->kref)) < 1); \ + kref_put(&((ep)->kref), _c4iw_free_ep); \ +} + +#define c4iw_get_ep(ep) { \ + pr_debug("get_ep ep %p, refcnt %d\n", \ + ep, kref_read(&((ep)->kref))); \ + kref_get(&((ep)->kref)); \ +} +void _c4iw_free_ep(struct kref *kref); + +struct mpa_message { + u8 key[16]; + u8 flags; + u8 revision; + __be16 private_data_size; + u8 private_data[0]; +}; + +struct mpa_v2_conn_params { + __be16 ird; + __be16 ord; +}; + +struct terminate_message { + u8 layer_etype; + u8 ecode; + __be16 hdrct_rsvd; + u8 len_hdrs[0]; +}; + +#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) + +enum c4iw_layers_types { + LAYER_RDMAP = 0x00, + LAYER_DDP = 0x10, + LAYER_MPA = 0x20, + RDMAP_LOCAL_CATA = 0x00, + RDMAP_REMOTE_PROT = 0x01, + RDMAP_REMOTE_OP = 0x02, + DDP_LOCAL_CATA = 0x00, + DDP_TAGGED_ERR = 0x01, + DDP_UNTAGGED_ERR = 0x02, + DDP_LLP = 0x03 +}; + +enum c4iw_rdma_ecodes { + RDMAP_INV_STAG = 0x00, + RDMAP_BASE_BOUNDS = 0x01, + RDMAP_ACC_VIOL = 0x02, + RDMAP_STAG_NOT_ASSOC = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_VERS = 0x05, + RDMAP_INV_OPCODE = 0x06, + RDMAP_STREAM_CATA = 0x07, + RDMAP_GLOBAL_CATA = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum c4iw_ddp_ecodes { + DDPT_INV_STAG = 0x00, + DDPT_BASE_BOUNDS = 0x01, + DDPT_STAG_NOT_ASSOC = 0x02, + DDPT_TO_WRAP = 0x03, + DDPT_INV_VERS = 0x04, + DDPU_INV_QN = 0x01, + DDPU_INV_MSN_NOBUF = 0x02, + DDPU_INV_MSN_RANGE = 0x03, + DDPU_INV_MO = 0x04, + DDPU_MSG_TOOBIG = 0x05, + DDPU_INV_VERS = 0x06 +}; + +enum c4iw_mpa_ecodes { + MPA_CRC_ERR = 0x02, + MPA_MARKER_ERR = 0x03, + MPA_LOCAL_CATA = 0x05, + MPA_INSUFF_IRD = 0x06, + MPA_NOMATCH_RTR = 0x07, +}; + +enum c4iw_ep_state { + IDLE = 0, + LISTEN, + CONNECTING, + MPA_REQ_WAIT, + MPA_REQ_SENT, + MPA_REQ_RCVD, + MPA_REP_SENT, + FPDU_MODE, + ABORTING, + CLOSING, + MORIBUND, + DEAD, +}; + +enum c4iw_ep_flags { + PEER_ABORT_IN_PROGRESS = 0, + ABORT_REQ_IN_PROGRESS = 1, + RELEASE_RESOURCES = 2, + CLOSE_SENT = 3, + TIMEOUT = 4, + QP_REFERENCED = 5, + STOP_MPA_TIMER = 7, +}; + +enum c4iw_ep_history { + ACT_OPEN_REQ = 0, + ACT_OFLD_CONN 
= 1, + ACT_OPEN_RPL = 2, + ACT_ESTAB = 3, + PASS_ACCEPT_REQ = 4, + PASS_ESTAB = 5, + ABORT_UPCALL = 6, + ESTAB_UPCALL = 7, + CLOSE_UPCALL = 8, + ULP_ACCEPT = 9, + ULP_REJECT = 10, + TIMEDOUT = 11, + PEER_ABORT = 12, + PEER_CLOSE = 13, + CONNREQ_UPCALL = 14, + ABORT_CONN = 15, + DISCONN_UPCALL = 16, + EP_DISC_CLOSE = 17, + EP_DISC_ABORT = 18, + CONN_RPL_UPCALL = 19, + ACT_RETRY_NOMEM = 20, + ACT_RETRY_INUSE = 21, + CLOSE_CON_RPL = 22, + EP_DISC_FAIL = 24, + QP_REFED = 25, + QP_DEREFED = 26, + CM_ID_REFED = 27, + CM_ID_DEREFED = 28, +}; + +enum conn_pre_alloc_buffers { + CN_ABORT_REQ_BUF, + CN_ABORT_RPL_BUF, + CN_CLOSE_CON_REQ_BUF, + CN_DESTROY_BUF, + CN_FLOWC_BUF, + CN_MAX_CON_BUF +}; + +#define FLOWC_LEN 80 +union cpl_wr_size { + struct cpl_abort_req abrt_req; + struct cpl_abort_rpl abrt_rpl; + struct fw_ri_wr ri_req; + struct cpl_close_con_req close_req; + char flowc_buf[FLOWC_LEN]; +}; + +struct c4iw_ep_common { + struct iw_cm_id *cm_id; + struct c4iw_qp *qp; + struct c4iw_dev *dev; + struct sk_buff_head ep_skb_list; + enum c4iw_ep_state state; + struct kref kref; + struct mutex mutex; + struct sockaddr_storage local_addr; + struct sockaddr_storage remote_addr; + struct c4iw_wr_wait *wr_waitp; + unsigned long flags; + unsigned long history; +}; + +struct c4iw_listen_ep { + struct c4iw_ep_common com; + unsigned int stid; + int backlog; +}; + +struct c4iw_ep_stats { + unsigned connect_neg_adv; + unsigned abort_neg_adv; +}; + +struct c4iw_ep { + struct c4iw_ep_common com; + struct c4iw_ep *parent_ep; + struct timer_list timer; + struct list_head entry; + unsigned int atid; + u32 hwtid; + u32 snd_seq; + u32 rcv_seq; + struct l2t_entry *l2t; + struct dst_entry *dst; + struct sk_buff *mpa_skb; + struct c4iw_mpa_attributes mpa_attr; + u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; + unsigned int mpa_pkt_len; + u32 ird; + u32 ord; + u32 smac_idx; + u32 tx_chan; + u32 mtu; + u16 mss; + u16 emss; + u16 plen; + u16 rss_qid; + u16 txq_idx; + u16 ctrlq_idx; + u8 tos; + u8 retry_with_mpa_v1; + u8 tried_with_mpa_v1; + unsigned int retry_count; + int snd_win; + int rcv_win; + struct c4iw_ep_stats stats; +}; + +static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline struct c4iw_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline int ocqp_supported(const struct cxgb4_lld_info *infop) +{ +#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64) + return infop->vr->ocq.size > 0; +#else + return 0; +#endif +} + +u32 c4iw_id_alloc(struct c4iw_id_table *alloc); +void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj); +int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num, + u32 reserved, u32 flags); +void c4iw_id_table_free(struct c4iw_id_table *alloc); + +typedef int (*c4iw_handler_func)(struct c4iw_dev *dev, struct sk_buff *skb); + +int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, + struct l2t_entry *l2t); +void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qpid, + struct c4iw_dev_ucontext *uctx); +u32 c4iw_get_resource(struct c4iw_id_table *id_table); +void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry); +int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid); +int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev); +int c4iw_pblpool_create(struct c4iw_rdev *rdev); +int c4iw_rqtpool_create(struct c4iw_rdev *rdev); +int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev); +void c4iw_pblpool_destroy(struct c4iw_rdev 
*rdev); +void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev); +void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev); +void c4iw_destroy_resource(struct c4iw_resource *rscp); +int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev); +void c4iw_register_device(struct work_struct *work); +void c4iw_unregister_device(struct c4iw_dev *dev); +int __init c4iw_cm_init(void); +void c4iw_cm_term(void); +void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx); +void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, + struct c4iw_dev_ucontext *uctx); +int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog); +int c4iw_destroy_listen(struct iw_cm_id *cm_id); +int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); +void c4iw_qp_add_ref(struct ib_qp *qp); +void c4iw_qp_rem_ref(struct ib_qp *qp); +struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); +int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +int c4iw_dealloc_mw(struct ib_mw *mw); +void c4iw_dealloc(struct uld_ctx *ctx); +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); +struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, + u64 length, u64 virt, int acc, + struct ib_udata *udata); +struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); +int c4iw_dereg_mr(struct ib_mr *ib_mr); +int c4iw_destroy_cq(struct ib_cq *ib_cq); +struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ib_context, + struct ib_udata *udata); +int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata); +int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int c4iw_destroy_qp(struct ib_qp *ib_qp); +struct ib_qp *c4iw_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata); +int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); +struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn); +u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size); +void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size); +u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size); +void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size); +u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size); +void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size); +void c4iw_flush_hw_cq(struct c4iw_cq *chp, struct c4iw_qp *flush_qhp); +void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count); +int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp); +int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count); +int c4iw_flush_sq(struct c4iw_qp *qhp); +int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid); +u16 c4iw_rqes_posted(struct c4iw_qp *qhp); +int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe); +u32 c4iw_get_cqid(struct c4iw_rdev 
*rdev, struct c4iw_dev_ucontext *uctx); +void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx); +u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); +void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx); +void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe); + +extern struct cxgb4_client t4c_client; +extern c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS]; +void __iomem *c4iw_bar2_addrs(struct c4iw_rdev *rdev, unsigned int qid, + enum cxgb4_bar2_qtype qtype, + unsigned int *pbar2_qid, u64 *pbar2_pa); +extern void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe); +extern int c4iw_wr_log; +extern int db_fc_threshold; +extern int db_coalescing_threshold; +extern int use_dsgl; +void c4iw_invalidate_mr(struct c4iw_dev *rhp, u32 rkey); +struct c4iw_wr_wait *c4iw_alloc_wr_wait(gfp_t gfp); + +#endif diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c new file mode 100644 index 0000000..1445918 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -0,0 +1,835 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <rdma/ib_umem.h> +#include <linux/atomic.h> +#include <rdma/ib_user_verbs.h> + +#include "iw_cxgb4.h" + +int use_dsgl = 1; +module_param(use_dsgl, int, 0644); +MODULE_PARM_DESC(use_dsgl, "Use DSGL for PBL/FastReg (default=1) (DEPRECATED)"); + +#define T4_ULPTX_MIN_IO 32 +#define C4IW_MAX_INLINE_SIZE 96 +#define T4_ULPTX_MAX_DMA 1024 +#define C4IW_INLINE_THRESHOLD 128 + +static int inline_threshold = C4IW_INLINE_THRESHOLD; +module_param(inline_threshold, int, 0644); +MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)"); + +static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length) +{ + return (is_t4(dev->rdev.lldi.adapter_type) || + is_t5(dev->rdev.lldi.adapter_type)) && + length >= 8*1024*1024*1024ULL; +} + +static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, + u32 len, dma_addr_t data, + struct sk_buff *skb, + struct c4iw_wr_wait *wr_waitp) +{ + struct ulp_mem_io *req; + struct ulptx_sgl *sgl; + u8 wr_len; + int ret = 0; + + addr &= 0x7FFFFFF; + + if (wr_waitp) + c4iw_init_wr_wait(wr_waitp); + wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16); + + if (!skb) { + skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL); + if (!skb) + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + req = __skb_put_zero(skb, wr_len); + INIT_ULPTX_WR(req, wr_len, 0, 0); + req->wr.wr_hi = cpu_to_be32(FW_WR_OP_V(FW_ULPTX_WR) | + (wr_waitp ? FW_WR_COMPL_F : 0)); + req->wr.wr_lo = wr_waitp ? (__force __be64)(unsigned long)wr_waitp : 0L; + req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(wr_len, 16))); + req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE) | + T5_ULP_MEMIO_ORDER_V(1) | + T5_ULP_MEMIO_FID_V(rdev->lldi.rxq_ids[0])); + req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V(len>>5)); + req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), 16)); + req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr)); + + sgl = (struct ulptx_sgl *)(req + 1); + sgl->cmd_nsge = cpu_to_be32(ULPTX_CMD_V(ULP_TX_SC_DSGL) | + ULPTX_NSGE_V(1)); + sgl->len0 = cpu_to_be32(len); + sgl->addr0 = cpu_to_be64(data); + + if (wr_waitp) + ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__); + else + ret = c4iw_ofld_send(rdev, skb); + return ret; +} + +static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, + void *data, struct sk_buff *skb, + struct c4iw_wr_wait *wr_waitp) +{ + struct ulp_mem_io *req; + struct ulptx_idata *sc; + u8 wr_len, *to_dp, *from_dp; + int copy_len, num_wqe, i, ret = 0; + __be32 cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE)); + + if (is_t4(rdev->lldi.adapter_type)) + cmd |= cpu_to_be32(ULP_MEMIO_ORDER_F); + else + cmd |= cpu_to_be32(T5_ULP_MEMIO_IMM_F); + + addr &= 0x7FFFFFF; + pr_debug("addr 0x%x len %u\n", addr, len); + num_wqe = DIV_ROUND_UP(len, C4IW_MAX_INLINE_SIZE); + c4iw_init_wr_wait(wr_waitp); + for (i = 0; i < num_wqe; i++) { + + copy_len = len > C4IW_MAX_INLINE_SIZE ?
C4IW_MAX_INLINE_SIZE : + len; + wr_len = roundup(sizeof *req + sizeof *sc + + roundup(copy_len, T4_ULPTX_MIN_IO), 16); + + if (!skb) { + skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL); + if (!skb) + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + req = __skb_put_zero(skb, wr_len); + INIT_ULPTX_WR(req, wr_len, 0, 0); + + if (i == (num_wqe-1)) { + req->wr.wr_hi = cpu_to_be32(FW_WR_OP_V(FW_ULPTX_WR) | + FW_WR_COMPL_F); + req->wr.wr_lo = (__force __be64)(unsigned long)wr_waitp; + } else + req->wr.wr_hi = cpu_to_be32(FW_WR_OP_V(FW_ULPTX_WR)); + req->wr.wr_mid = cpu_to_be32( + FW_WR_LEN16_V(DIV_ROUND_UP(wr_len, 16))); + + req->cmd = cmd; + req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V( + DIV_ROUND_UP(copy_len, T4_ULPTX_MIN_IO))); + req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), + 16)); + req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr + i * 3)); + + sc = (struct ulptx_idata *)(req + 1); + sc->cmd_more = cpu_to_be32(ULPTX_CMD_V(ULP_TX_SC_IMM)); + sc->len = cpu_to_be32(roundup(copy_len, T4_ULPTX_MIN_IO)); + + to_dp = (u8 *)(sc + 1); + from_dp = (u8 *)data + i * C4IW_MAX_INLINE_SIZE; + if (data) + memcpy(to_dp, from_dp, copy_len); + else + memset(to_dp, 0, copy_len); + if (copy_len % T4_ULPTX_MIN_IO) + memset(to_dp + copy_len, 0, T4_ULPTX_MIN_IO - + (copy_len % T4_ULPTX_MIN_IO)); + if (i == (num_wqe-1)) + ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, + __func__); + else + ret = c4iw_ofld_send(rdev, skb); + if (ret) + break; + skb = NULL; + len -= C4IW_MAX_INLINE_SIZE; + } + + return ret; +} + +static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, + void *data, struct sk_buff *skb, + struct c4iw_wr_wait *wr_waitp) +{ + u32 remain = len; + u32 dmalen; + int ret = 0; + dma_addr_t daddr; + dma_addr_t save; + + daddr = dma_map_single(&rdev->lldi.pdev->dev, data, len, DMA_TO_DEVICE); + if (dma_mapping_error(&rdev->lldi.pdev->dev, daddr)) + return -1; + save = daddr; + + while (remain > inline_threshold) { + if (remain < T4_ULPTX_MAX_DMA) { + if (remain & ~T4_ULPTX_MIN_IO) + dmalen = remain & ~(T4_ULPTX_MIN_IO-1); + else + dmalen = remain; + } else + dmalen = T4_ULPTX_MAX_DMA; + remain -= dmalen; + ret = _c4iw_write_mem_dma_aligned(rdev, addr, dmalen, daddr, + skb, remain ? NULL : wr_waitp); + if (ret) + goto out; + addr += dmalen >> 5; + data += dmalen; + daddr += dmalen; + } + if (remain) + ret = _c4iw_write_mem_inline(rdev, addr, remain, data, skb, + wr_waitp); +out: + dma_unmap_single(&rdev->lldi.pdev->dev, save, len, DMA_TO_DEVICE); + return ret; +} + +/* + * write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + */ +static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, + void *data, struct sk_buff *skb, + struct c4iw_wr_wait *wr_waitp) +{ + int ret; + + if (!rdev->lldi.ulptx_memwrite_dsgl || !use_dsgl) { + ret = _c4iw_write_mem_inline(rdev, addr, len, data, skb, + wr_waitp); + goto out; + } + + if (len <= inline_threshold) { + ret = _c4iw_write_mem_inline(rdev, addr, len, data, skb, + wr_waitp); + goto out; + } + + ret = _c4iw_write_mem_dma(rdev, addr, len, data, skb, wr_waitp); + if (ret) { + pr_warn_ratelimited("%s: dma map failure (non fatal)\n", + pci_name(rdev->lldi.pdev)); + ret = _c4iw_write_mem_inline(rdev, addr, len, data, skb, + wr_waitp); + } +out: + return ret; + +} + +/* + * Build and write a TPT entry. 
+ * IN: stag key, pdid, perm, bind_enabled, zbva, to, len, page_size, + * pbl_size and pbl_addr + * OUT: stag index + */ +static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry, + u32 *stag, u8 stag_state, u32 pdid, + enum fw_ri_stag_type type, enum fw_ri_mem_perms perm, + int bind_enabled, u32 zbva, u64 to, + u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr, + struct sk_buff *skb, struct c4iw_wr_wait *wr_waitp) +{ + int err; + struct fw_ri_tpte tpt; + u32 stag_idx; + static atomic_t key; + + if (c4iw_fatal_error(rdev)) + return -EIO; + + stag_state = stag_state > 0; + stag_idx = (*stag) >> 8; + + if ((!reset_tpt_entry) && (*stag == T4_STAG_UNSET)) { + stag_idx = c4iw_get_resource(&rdev->resource.tpt_table); + if (!stag_idx) { + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.fail++; + mutex_unlock(&rdev->stats.lock); + return -ENOMEM; + } + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.cur += 32; + if (rdev->stats.stag.cur > rdev->stats.stag.max) + rdev->stats.stag.max = rdev->stats.stag.cur; + mutex_unlock(&rdev->stats.lock); + *stag = (stag_idx << 8) | (atomic_inc_return(&key) & 0xff); + } + pr_debug("stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", + stag_state, type, pdid, stag_idx); + + /* write TPT entry */ + if (reset_tpt_entry) + memset(&tpt, 0, sizeof(tpt)); + else { + tpt.valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F | + FW_RI_TPTE_STAGKEY_V((*stag & FW_RI_TPTE_STAGKEY_M)) | + FW_RI_TPTE_STAGSTATE_V(stag_state) | + FW_RI_TPTE_STAGTYPE_V(type) | FW_RI_TPTE_PDID_V(pdid)); + tpt.locread_to_qpid = cpu_to_be32(FW_RI_TPTE_PERM_V(perm) | + (bind_enabled ? FW_RI_TPTE_MWBINDEN_F : 0) | + FW_RI_TPTE_ADDRTYPE_V((zbva ? FW_RI_ZERO_BASED_TO : + FW_RI_VA_BASED_TO))| + FW_RI_TPTE_PS_V(page_size)); + tpt.nosnoop_pbladdr = !pbl_size ? 
0 : cpu_to_be32( + FW_RI_TPTE_PBLADDR_V(PBL_OFF(rdev, pbl_addr)>>3)); + tpt.len_lo = cpu_to_be32((u32)(len & 0xffffffffUL)); + tpt.va_hi = cpu_to_be32((u32)(to >> 32)); + tpt.va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL)); + tpt.dca_mwbcnt_pstag = cpu_to_be32(0); + tpt.len_hi = cpu_to_be32((u32)(len >> 32)); + } + err = write_adapter_mem(rdev, stag_idx + + (rdev->lldi.vr->stag.start >> 5), + sizeof(tpt), &tpt, skb, wr_waitp); + + if (reset_tpt_entry) { + c4iw_put_resource(&rdev->resource.tpt_table, stag_idx); + mutex_lock(&rdev->stats.lock); + rdev->stats.stag.cur -= 32; + mutex_unlock(&rdev->stats.lock); + } + return err; +} + +static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl, + u32 pbl_addr, u32 pbl_size, struct c4iw_wr_wait *wr_waitp) +{ + int err; + + pr_debug("*pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + pbl_addr, rdev->lldi.vr->pbl.start, + pbl_size); + + err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl, NULL, + wr_waitp); + return err; +} + +static int dereg_mem(struct c4iw_rdev *rdev, u32 stag, u32 pbl_size, + u32 pbl_addr, struct sk_buff *skb, + struct c4iw_wr_wait *wr_waitp) +{ + return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, + pbl_size, pbl_addr, skb, wr_waitp); +} + +static int allocate_window(struct c4iw_rdev *rdev, u32 *stag, u32 pdid, + struct c4iw_wr_wait *wr_waitp) +{ + *stag = T4_STAG_UNSET; + return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_MW, 0, 0, 0, + 0UL, 0, 0, 0, 0, NULL, wr_waitp); +} + +static int deallocate_window(struct c4iw_rdev *rdev, u32 stag, + struct sk_buff *skb, + struct c4iw_wr_wait *wr_waitp) +{ + return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, 0, + 0, skb, wr_waitp); +} + +static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid, + u32 pbl_size, u32 pbl_addr, + struct c4iw_wr_wait *wr_waitp) +{ + *stag = T4_STAG_UNSET; + return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_NSMR, 0, 0, 0, + 0UL, 0, 0, pbl_size, pbl_addr, NULL, wr_waitp); +} + +static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag) +{ + u32 mmid; + + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + mhp->ibmr.length = mhp->attr.len; + mhp->ibmr.iova = mhp->attr.va_fbo; + mhp->ibmr.page_size = 1U << (mhp->attr.page_size + 12); + pr_debug("mmid 0x%x mhp %p\n", mmid, mhp); + return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); +} + +static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, + struct c4iw_mr *mhp, int shift) +{ + u32 stag = T4_STAG_UNSET; + int ret; + + ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, + FW_RI_STAG_NSMR, mhp->attr.len ? + mhp->attr.perms : 0, + mhp->attr.mw_bind_enable, mhp->attr.zbva, + mhp->attr.va_fbo, mhp->attr.len ? 
+ mhp->attr.len : -1, shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr, NULL, + mhp->wr_waitp); + if (ret) + return ret; + + ret = finish_mem_reg(mhp, stag); + if (ret) { + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr, mhp->dereg_skb, mhp->wr_waitp); + mhp->dereg_skb = NULL; + } + return ret; +} + +static int alloc_pbl(struct c4iw_mr *mhp, int npages) +{ + mhp->attr.pbl_addr = c4iw_pblpool_alloc(&mhp->rhp->rdev, + npages << 3); + + if (!mhp->attr.pbl_addr) + return -ENOMEM; + + mhp->attr.pbl_size = npages; + + return 0; +} + +struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + int ret; + u32 stag = T4_STAG_UNSET; + + pr_debug("ib_pd %p\n", pd); + php = to_c4iw_pd(pd); + rhp = php->rhp; + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + mhp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL); + if (!mhp->wr_waitp) { + ret = -ENOMEM; + goto err_free_mhp; + } + c4iw_init_wr_wait(mhp->wr_waitp); + + mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL); + if (!mhp->dereg_skb) { + ret = -ENOMEM; + goto err_free_wr_wait; + } + + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.perms = c4iw_ib_to_tpt_access(acc); + mhp->attr.mw_bind_enable = (acc&IB_ACCESS_MW_BIND) == IB_ACCESS_MW_BIND; + mhp->attr.zbva = 0; + mhp->attr.va_fbo = 0; + mhp->attr.page_size = 0; + mhp->attr.len = ~0ULL; + mhp->attr.pbl_size = 0; + + ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, php->pdid, + FW_RI_STAG_NSMR, mhp->attr.perms, + mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0, + NULL, mhp->wr_waitp); + if (ret) + goto err_free_skb; + + ret = finish_mem_reg(mhp, stag); + if (ret) + goto err_dereg_mem; + return &mhp->ibmr; +err_dereg_mem: + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr, mhp->dereg_skb, mhp->wr_waitp); +err_free_skb: + kfree_skb(mhp->dereg_skb); +err_free_wr_wait: + c4iw_put_wr_wait(mhp->wr_waitp); +err_free_mhp: + kfree(mhp); + return ERR_PTR(ret); +} + +struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + __be64 *pages; + int shift, n, len; + int i, k, entry; + int err = -ENOMEM; + struct scatterlist *sg; + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + + pr_debug("ib_pd %p\n", pd); + + if (length == ~0ULL) + return ERR_PTR(-EINVAL); + + if ((length + start) < start) + return ERR_PTR(-EINVAL); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + + if (mr_exceeds_hw_limits(rhp, length)) + return ERR_PTR(-EINVAL); + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + mhp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL); + if (!mhp->wr_waitp) + goto err_free_mhp; + + mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL); + if (!mhp->dereg_skb) + goto err_free_wr_wait; + + mhp->rhp = rhp; + + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(mhp->umem)) + goto err_free_skb; + + shift = mhp->umem->page_shift; + + n = mhp->umem->nmap; + err = alloc_pbl(mhp, n); + if (err) + goto err_umem_release; + + pages = (__be64 *) __get_free_page(GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err_pbl_free; + } + + i = n = 0; + + for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address(sg) + + (k << shift)); + if (i == PAGE_SIZE / sizeof *pages) { + err = 
write_pbl(&mhp->rhp->rdev, + pages, + mhp->attr.pbl_addr + (n << 3), i, + mhp->wr_waitp); + if (err) + goto pbl_done; + n += i; + i = 0; + } + } + } + + if (i) + err = write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (n << 3), i, + mhp->wr_waitp); + +pbl_done: + free_page((unsigned long) pages); + if (err) + goto err_pbl_free; + + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + mhp->attr.perms = c4iw_ib_to_tpt_access(acc); + mhp->attr.va_fbo = virt; + mhp->attr.page_size = shift - 12; + mhp->attr.len = length; + + err = register_mem(rhp, php, mhp, shift); + if (err) + goto err_pbl_free; + + return &mhp->ibmr; + +err_pbl_free: + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +err_umem_release: + ib_umem_release(mhp->umem); +err_free_skb: + kfree_skb(mhp->dereg_skb); +err_free_wr_wait: + c4iw_put_wr_wait(mhp->wr_waitp); +err_free_mhp: + kfree(mhp); + return ERR_PTR(err); +} + +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mw *mhp; + u32 mmid; + u32 stag = 0; + int ret; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL); + if (!mhp->wr_waitp) { + ret = -ENOMEM; + goto free_mhp; + } + + mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL); + if (!mhp->dereg_skb) { + ret = -ENOMEM; + goto free_wr_wait; + } + + ret = allocate_window(&rhp->rdev, &stag, php->pdid, mhp->wr_waitp); + if (ret) + goto free_skb; + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.type = FW_RI_STAG_MW; + mhp->attr.stag = stag; + mmid = (stag) >> 8; + mhp->ibmw.rkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + ret = -ENOMEM; + goto dealloc_win; + } + pr_debug("mmid 0x%x mhp %p stag 0x%x\n", mmid, mhp, stag); + return &(mhp->ibmw); + +dealloc_win: + deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb, + mhp->wr_waitp); +free_skb: + kfree_skb(mhp->dereg_skb); +free_wr_wait: + c4iw_put_wr_wait(mhp->wr_waitp); +free_mhp: + kfree(mhp); + return ERR_PTR(ret); +} + +int c4iw_dealloc_mw(struct ib_mw *mw) +{ + struct c4iw_dev *rhp; + struct c4iw_mw *mhp; + u32 mmid; + + mhp = to_c4iw_mw(mw); + rhp = mhp->rhp; + mmid = (mw->rkey) >> 8; + remove_handle(rhp, &rhp->mmidr, mmid); + deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb, + mhp->wr_waitp); + kfree_skb(mhp->dereg_skb); + c4iw_put_wr_wait(mhp->wr_waitp); + kfree(mhp); + pr_debug("ib_mw %p mmid 0x%x ptr %p\n", mw, mmid, mhp); + return 0; +} + +struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + struct c4iw_mr *mhp; + u32 mmid; + u32 stag = 0; + int ret = 0; + int length = roundup(max_num_sg * sizeof(u64), 32); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + + if (mr_type != IB_MR_TYPE_MEM_REG || + max_num_sg > t4_max_fr_depth(rhp->rdev.lldi.ulptx_memwrite_dsgl && + use_dsgl)) + return ERR_PTR(-EINVAL); + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) { + ret = -ENOMEM; + goto err; + } + + mhp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL); + if (!mhp->wr_waitp) { + ret = -ENOMEM; + goto err_free_mhp; + } + c4iw_init_wr_wait(mhp->wr_waitp); + + mhp->mpl = dma_alloc_coherent(&rhp->rdev.lldi.pdev->dev, + length, &mhp->mpl_addr, GFP_KERNEL); + if (!mhp->mpl) { + ret = -ENOMEM; + goto err_free_wr_wait; + } + 
mhp->max_mpl_len = length; + + mhp->rhp = rhp; + ret = alloc_pbl(mhp, max_num_sg); + if (ret) + goto err_free_dma; + mhp->attr.pbl_size = max_num_sg; + ret = allocate_stag(&rhp->rdev, &stag, php->pdid, + mhp->attr.pbl_size, mhp->attr.pbl_addr, + mhp->wr_waitp); + if (ret) + goto err_free_pbl; + mhp->attr.pdid = php->pdid; + mhp->attr.type = FW_RI_STAG_NSMR; + mhp->attr.stag = stag; + mhp->attr.state = 0; + mmid = (stag) >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + ret = -ENOMEM; + goto err_dereg; + } + + pr_debug("mmid 0x%x mhp %p stag 0x%x\n", mmid, mhp, stag); + return &(mhp->ibmr); +err_dereg: + dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr, mhp->dereg_skb, mhp->wr_waitp); +err_free_pbl: + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +err_free_dma: + dma_free_coherent(&mhp->rhp->rdev.lldi.pdev->dev, + mhp->max_mpl_len, mhp->mpl, mhp->mpl_addr); +err_free_wr_wait: + c4iw_put_wr_wait(mhp->wr_waitp); +err_free_mhp: + kfree(mhp); +err: + return ERR_PTR(ret); +} + +static int c4iw_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct c4iw_mr *mhp = to_c4iw_mr(ibmr); + + if (unlikely(mhp->mpl_len == mhp->max_mpl_len)) + return -ENOMEM; + + mhp->mpl[mhp->mpl_len++] = addr; + + return 0; +} + +int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct c4iw_mr *mhp = to_c4iw_mr(ibmr); + + mhp->mpl_len = 0; + + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, c4iw_set_page); +} + +int c4iw_dereg_mr(struct ib_mr *ib_mr) +{ + struct c4iw_dev *rhp; + struct c4iw_mr *mhp; + u32 mmid; + + pr_debug("ib_mr %p\n", ib_mr); + + mhp = to_c4iw_mr(ib_mr); + rhp = mhp->rhp; + mmid = mhp->attr.stag >> 8; + remove_handle(rhp, &rhp->mmidr, mmid); + if (mhp->mpl) + dma_free_coherent(&mhp->rhp->rdev.lldi.pdev->dev, + mhp->max_mpl_len, mhp->mpl, mhp->mpl_addr); + dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr, mhp->dereg_skb, mhp->wr_waitp); + if (mhp->attr.pbl_size) + c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); + if (mhp->kva) + kfree((void *) (unsigned long) mhp->kva); + if (mhp->umem) + ib_umem_release(mhp->umem); + pr_debug("mmid 0x%x ptr %p\n", mmid, mhp); + c4iw_put_wr_wait(mhp->wr_waitp); + kfree(mhp); + return 0; +} + +void c4iw_invalidate_mr(struct c4iw_dev *rhp, u32 rkey) +{ + struct c4iw_mr *mhp; + unsigned long flags; + + spin_lock_irqsave(&rhp->lock, flags); + mhp = get_mhp(rhp, rkey >> 8); + if (mhp) + mhp->attr.state = 0; + spin_unlock_irqrestore(&rhp->lock, flags); +} diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c new file mode 100644 index 0000000..0b9cc73 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "iw_cxgb4.h" + +static int fastreg_support = 1; +module_param(fastreg_support, int, 0644); +MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)"); + +static struct ib_ah *c4iw_ah_create(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) + +{ + return ERR_PTR(-ENOSYS); +} + +static int c4iw_ah_destroy(struct ib_ah *ah) +{ + return -ENOSYS; +} + +static int c4iw_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags, + u8 port_num, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, + size_t in_mad_size, + struct ib_mad_hdr *out_mad, + size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + return -ENOSYS; +} + +void _c4iw_free_ucontext(struct kref *kref) +{ + struct c4iw_ucontext *ucontext; + struct c4iw_dev *rhp; + struct c4iw_mm_entry *mm, *tmp; + + ucontext = container_of(kref, struct c4iw_ucontext, kref); + rhp = to_c4iw_dev(ucontext->ibucontext.device); + + pr_debug("ucontext %p\n", ucontext); + list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) + kfree(mm); + c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx); + kfree(ucontext); +} + +static int c4iw_dealloc_ucontext(struct ib_ucontext *context) +{ + struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context); + + pr_debug("context %p\n", context); + c4iw_put_ucontext(ucontext); + return 0; +} + +static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct c4iw_ucontext *context; + struct c4iw_dev *rhp = to_c4iw_dev(ibdev); + struct c4iw_alloc_ucontext_resp uresp; + int ret = 0; + struct c4iw_mm_entry *mm = NULL; + + pr_debug("ibdev %p\n", ibdev); + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) { + ret = -ENOMEM; + goto err; + } + + c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx); + INIT_LIST_HEAD(&context->mmaps); + spin_lock_init(&context->mmap_lock); + kref_init(&context->kref); + + if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) { + pr_err_once("Warning - downlevel libcxgb4 (non-fatal), device status page disabled\n"); + rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED; + } else { + mm = kmalloc(sizeof(*mm), GFP_KERNEL); + if (!mm) { + ret = -ENOMEM; + goto err_free; + } + + uresp.status_page_size = PAGE_SIZE; + + spin_lock(&context->mmap_lock); + uresp.status_page_key = context->key; + context->key += PAGE_SIZE; + 
spin_unlock(&context->mmap_lock); + + ret = ib_copy_to_udata(udata, &uresp, + sizeof(uresp) - sizeof(uresp.reserved)); + if (ret) + goto err_mm; + + mm->key = uresp.status_page_key; + mm->addr = virt_to_phys(rhp->rdev.status_page); + mm->len = PAGE_SIZE; + insert_mmap(context, mm); + } + return &context->ibucontext; +err_mm: + kfree(mm); +err_free: + kfree(context); +err: + return ERR_PTR(ret); +} + +static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + int len = vma->vm_end - vma->vm_start; + u32 key = vma->vm_pgoff << PAGE_SHIFT; + struct c4iw_rdev *rdev; + int ret = 0; + struct c4iw_mm_entry *mm; + struct c4iw_ucontext *ucontext; + u64 addr; + + pr_debug("pgoff 0x%lx key 0x%x len %d\n", vma->vm_pgoff, + key, len); + + if (vma->vm_start & (PAGE_SIZE-1)) + return -EINVAL; + + rdev = &(to_c4iw_dev(context->device)->rdev); + ucontext = to_c4iw_ucontext(context); + + mm = remove_mmap(ucontext, key, len); + if (!mm) + return -EINVAL; + addr = mm->addr; + kfree(mm); + + if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) && + (addr < (pci_resource_start(rdev->lldi.pdev, 0) + + pci_resource_len(rdev->lldi.pdev, 0)))) { + + /* + * MA_SYNC register... + */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) && + (addr < (pci_resource_start(rdev->lldi.pdev, 2) + + pci_resource_len(rdev->lldi.pdev, 2)))) { + + /* + * Map user DB or OCQP memory... + */ + if (addr >= rdev->oc_mw_pa) + vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); + else { + if (!is_t4(rdev->lldi.adapter_type)) + vma->vm_page_prot = + t4_pgprot_wc(vma->vm_page_prot); + else + vma->vm_page_prot = + pgprot_noncached(vma->vm_page_prot); + } + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else { + + /* + * Map WQ or CQ contig dma memory... 
+ */ + ret = remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + + return ret; +} + +static int c4iw_deallocate_pd(struct ib_pd *pd) +{ + struct c4iw_dev *rhp; + struct c4iw_pd *php; + + php = to_c4iw_pd(pd); + rhp = php->rhp; + pr_debug("ibpd %p pdid 0x%x\n", pd, php->pdid); + c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid); + mutex_lock(&rhp->rdev.stats.lock); + rhp->rdev.stats.pd.cur--; + mutex_unlock(&rhp->rdev.stats.lock); + kfree(php); + return 0; +} + +static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct c4iw_pd *php; + u32 pdid; + struct c4iw_dev *rhp; + + pr_debug("ibdev %p\n", ibdev); + rhp = (struct c4iw_dev *) ibdev; + pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table); + if (!pdid) + return ERR_PTR(-EINVAL); + php = kzalloc(sizeof(*php), GFP_KERNEL); + if (!php) { + c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid); + return ERR_PTR(-ENOMEM); + } + php->pdid = pdid; + php->rhp = rhp; + if (context) { + struct c4iw_alloc_pd_resp uresp = {.pdid = php->pdid}; + + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { + c4iw_deallocate_pd(&php->ibpd); + return ERR_PTR(-EFAULT); + } + } + mutex_lock(&rhp->rdev.stats.lock); + rhp->rdev.stats.pd.cur++; + if (rhp->rdev.stats.pd.cur > rhp->rdev.stats.pd.max) + rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur; + mutex_unlock(&rhp->rdev.stats.lock); + pr_debug("pdid 0x%0x ptr 0x%p\n", pdid, php); + return &php->ibpd; +} + +static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + pr_debug("ibdev %p\n", ibdev); + *pkey = 0; + return 0; +} + +static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct c4iw_dev *dev; + + pr_debug("ibdev %p, port %d, index %d, gid %p\n", + ibdev, port, index, gid); + if (!port) + return -EINVAL; + dev = to_c4iw_dev(ibdev); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), dev->rdev.lldi.ports[port-1]->dev_addr, 6); + return 0; +} + +static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + + struct c4iw_dev *dev; + + pr_debug("ibdev %p\n", ibdev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + dev = to_c4iw_dev(ibdev); + memset(props, 0, sizeof *props); + memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); + props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type); + props->fw_ver = dev->rdev.lldi.fw_vers; + props->device_cap_flags = dev->device_cap_flags; + props->page_size_cap = T4_PAGESIZE_MASK; + props->vendor_id = (u32)dev->rdev.lldi.pdev->vendor; + props->vendor_part_id = (u32)dev->rdev.lldi.pdev->device; + props->max_mr_size = T4_MAX_MR_SIZE; + props->max_qp = dev->rdev.lldi.vr->qp.size / 2; + props->max_qp_wr = dev->rdev.hw_queue.t4_max_qp_depth; + props->max_sge = T4_MAX_RECV_SGE; + props->max_sge_rd = 1; + props->max_res_rd_atom = dev->rdev.lldi.max_ird_adapter; + props->max_qp_rd_atom = min(dev->rdev.lldi.max_ordird_qp, + c4iw_max_read_depth); + props->max_qp_init_rd_atom = props->max_qp_rd_atom; + props->max_cq = dev->rdev.lldi.vr->qp.size; + props->max_cqe = dev->rdev.hw_queue.t4_max_cq_depth; + props->max_mr = c4iw_num_stags(&dev->rdev); + props->max_pd = T4_MAX_NUM_PD; + props->local_ca_ack_delay = 0; + props->max_fast_reg_page_list_len = + t4_max_fr_depth(dev->rdev.lldi.ulptx_memwrite_dsgl && use_dsgl); + + return 0; +} + +static int c4iw_query_port(struct ib_device 
*ibdev, u8 port, + struct ib_port_attr *props) +{ + struct c4iw_dev *dev; + struct net_device *netdev; + struct in_device *inetdev; + + pr_debug("ibdev %p\n", ibdev); + + dev = to_c4iw_dev(ibdev); + netdev = dev->rdev.lldi.ports[port-1]; + /* props being zeroed by the caller, avoid zeroing it here */ + props->max_mtu = IB_MTU_4096; + props->active_mtu = ib_mtu_int_to_enum(netdev->mtu); + + if (!netif_carrier_ok(netdev)) + props->state = IB_PORT_DOWN; + else { + inetdev = in_dev_get(netdev); + if (inetdev) { + if (inetdev->ifa_list) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_INIT; + in_dev_put(inetdev); + } else + props->state = IB_PORT_INIT; + } + + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_width = 2; + props->active_speed = IB_SPEED_DDR; + props->max_msg_sz = -1; + + return 0; +} + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + pr_debug("dev 0x%p\n", dev); + return sprintf(buf, "%d\n", + CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type)); +} + +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + struct ethtool_drvinfo info; + struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0]; + + pr_debug("dev 0x%p\n", dev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.driver); +} + +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev.dev); + pr_debug("dev 0x%p\n", dev); + return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor, + c4iw_dev->rdev.lldi.pdev->device); +} + +enum counters { + IP4INSEGS, + IP4OUTSEGS, + IP4RETRANSSEGS, + IP4OUTRSTS, + IP6INSEGS, + IP6OUTSEGS, + IP6RETRANSSEGS, + IP6OUTRSTS, + NR_COUNTERS +}; + +static const char * const names[] = { + [IP4INSEGS] = "ip4InSegs", + [IP4OUTSEGS] = "ip4OutSegs", + [IP4RETRANSSEGS] = "ip4RetransSegs", + [IP4OUTRSTS] = "ip4OutRsts", + [IP6INSEGS] = "ip6InSegs", + [IP6OUTSEGS] = "ip6OutSegs", + [IP6RETRANSSEGS] = "ip6RetransSegs", + [IP6OUTRSTS] = "ip6OutRsts" +}; + +static struct rdma_hw_stats *c4iw_alloc_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS); + + if (port_num != 0) + return NULL; + + return rdma_alloc_hw_stats_struct(names, NR_COUNTERS, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int c4iw_get_mib(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index) +{ + struct tp_tcp_stats v4, v6; + struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev); + + cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6); + stats->value[IP4INSEGS] = v4.tcp_in_segs; + stats->value[IP4OUTSEGS] = v4.tcp_out_segs; + stats->value[IP4RETRANSSEGS] = v4.tcp_retrans_segs; + stats->value[IP4OUTRSTS] = v4.tcp_out_rsts; + stats->value[IP6INSEGS] = v6.tcp_in_segs; + stats->value[IP6OUTSEGS] = v6.tcp_out_segs; + stats->value[IP6RETRANSSEGS] = v6.tcp_retrans_segs; + stats->value[IP6OUTRSTS] = v6.tcp_out_rsts; + + return stats->num_counters; +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static 
DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *c4iw_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, +}; + +static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static void get_dev_fw_str(struct ib_device *dev, char *str) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, + ibdev); + pr_debug("dev 0x%p\n", dev); + + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u.%u", + FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers), + FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers)); +} + +static struct net_device *get_netdev(struct ib_device *dev, u8 port) +{ + struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev); + struct c4iw_rdev *rdev = &c4iw_dev->rdev; + struct net_device *ndev; + + if (!port || port > rdev->lldi.nports) + return NULL; + + rcu_read_lock(); + ndev = rdev->lldi.ports[port - 1]; + if (ndev) + dev_hold(ndev); + rcu_read_unlock(); + + return ndev; +} + +void c4iw_register_device(struct work_struct *work) +{ + int ret; + int i; + struct uld_ctx *ctx = container_of(work, struct uld_ctx, reg_work); + struct c4iw_dev *dev = ctx->dev; + + pr_debug("c4iw_dev %p\n", dev); + strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX); + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); + dev->ibdev.owner = THIS_MODULE; + dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; + if (fastreg_support) + dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + dev->ibdev.local_dma_lkey = 0; + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + dev->ibdev.node_type = RDMA_NODE_RNIC; + BUILD_BUG_ON(sizeof(C4IW_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); + memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC)); + dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports; + dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq; + dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev; + dev->ibdev.query_device = c4iw_query_device; + dev->ibdev.query_port = c4iw_query_port; + dev->ibdev.query_pkey = c4iw_query_pkey; + dev->ibdev.query_gid = c4iw_query_gid; + dev->ibdev.alloc_ucontext = c4iw_alloc_ucontext; + dev->ibdev.dealloc_ucontext = c4iw_dealloc_ucontext; + dev->ibdev.mmap = c4iw_mmap; + dev->ibdev.alloc_pd = 
c4iw_allocate_pd; + dev->ibdev.dealloc_pd = c4iw_deallocate_pd; + dev->ibdev.create_ah = c4iw_ah_create; + dev->ibdev.destroy_ah = c4iw_ah_destroy; + dev->ibdev.create_qp = c4iw_create_qp; + dev->ibdev.modify_qp = c4iw_ib_modify_qp; + dev->ibdev.query_qp = c4iw_ib_query_qp; + dev->ibdev.destroy_qp = c4iw_destroy_qp; + dev->ibdev.create_cq = c4iw_create_cq; + dev->ibdev.destroy_cq = c4iw_destroy_cq; + dev->ibdev.resize_cq = c4iw_resize_cq; + dev->ibdev.poll_cq = c4iw_poll_cq; + dev->ibdev.get_dma_mr = c4iw_get_dma_mr; + dev->ibdev.reg_user_mr = c4iw_reg_user_mr; + dev->ibdev.dereg_mr = c4iw_dereg_mr; + dev->ibdev.alloc_mw = c4iw_alloc_mw; + dev->ibdev.dealloc_mw = c4iw_dealloc_mw; + dev->ibdev.alloc_mr = c4iw_alloc_mr; + dev->ibdev.map_mr_sg = c4iw_map_mr_sg; + dev->ibdev.attach_mcast = c4iw_multicast_attach; + dev->ibdev.detach_mcast = c4iw_multicast_detach; + dev->ibdev.process_mad = c4iw_process_mad; + dev->ibdev.req_notify_cq = c4iw_arm_cq; + dev->ibdev.post_send = c4iw_post_send; + dev->ibdev.post_recv = c4iw_post_receive; + dev->ibdev.alloc_hw_stats = c4iw_alloc_stats; + dev->ibdev.get_hw_stats = c4iw_get_mib; + dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; + dev->ibdev.get_port_immutable = c4iw_port_immutable; + dev->ibdev.get_dev_fw_str = get_dev_fw_str; + dev->ibdev.get_netdev = get_netdev; + + dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); + if (!dev->ibdev.iwcm) { + ret = -ENOMEM; + goto err_dealloc_ctx; + } + + dev->ibdev.iwcm->connect = c4iw_connect; + dev->ibdev.iwcm->accept = c4iw_accept_cr; + dev->ibdev.iwcm->reject = c4iw_reject_cr; + dev->ibdev.iwcm->create_listen = c4iw_create_listen; + dev->ibdev.iwcm->destroy_listen = c4iw_destroy_listen; + dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref; + dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref; + dev->ibdev.iwcm->get_qp = c4iw_get_qp; + memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name, + sizeof(dev->ibdev.iwcm->ifname)); + + dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; + ret = ib_register_device(&dev->ibdev, NULL); + if (ret) + goto err_kfree_iwcm; + + for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) { + ret = device_create_file(&dev->ibdev.dev, + c4iw_class_attributes[i]); + if (ret) + goto err_unregister_device; + } + return; +err_unregister_device: + ib_unregister_device(&dev->ibdev); +err_kfree_iwcm: + kfree(dev->ibdev.iwcm); +err_dealloc_ctx: + pr_err("%s - Failed registering iwarp device: %d\n", + pci_name(ctx->lldi.pdev), ret); + c4iw_dealloc(ctx); + return; +} + +void c4iw_unregister_device(struct c4iw_dev *dev) +{ + int i; + + pr_debug("c4iw_dev %p\n", dev); + for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) + device_remove_file(&dev->ibdev.dev, + c4iw_class_attributes[i]); + ib_unregister_device(&dev->ibdev); + kfree(dev->ibdev.iwcm); + return; +} diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c new file mode 100644 index 0000000..ae167b6 --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -0,0 +1,2109 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> + +#include "iw_cxgb4.h" + +static int db_delay_usecs = 1; +module_param(db_delay_usecs, int, 0644); +MODULE_PARM_DESC(db_delay_usecs, "Usecs to delay awaiting db fifo to drain"); + +static int ocqp_support = 1; +module_param(ocqp_support, int, 0644); +MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=1)"); + +int db_fc_threshold = 1000; +module_param(db_fc_threshold, int, 0644); +MODULE_PARM_DESC(db_fc_threshold, + "QP count/threshold that triggers" + " automatic db flow control mode (default = 1000)"); + +int db_coalescing_threshold; +module_param(db_coalescing_threshold, int, 0644); +MODULE_PARM_DESC(db_coalescing_threshold, + "QP count/threshold that triggers" + " disabling db coalescing (default = 0)"); + +static int max_fr_immd = T4_MAX_FR_IMMD; +module_param(max_fr_immd, int, 0644); +MODULE_PARM_DESC(max_fr_immd, "fastreg threshold for using DSGL instead of immedate"); + +static int alloc_ird(struct c4iw_dev *dev, u32 ird) +{ + int ret = 0; + + spin_lock_irq(&dev->lock); + if (ird <= dev->avail_ird) + dev->avail_ird -= ird; + else + ret = -ENOMEM; + spin_unlock_irq(&dev->lock); + + if (ret) + dev_warn(&dev->rdev.lldi.pdev->dev, + "device IRD resources exhausted\n"); + + return ret; +} + +static void free_ird(struct c4iw_dev *dev, int ird) +{ + spin_lock_irq(&dev->lock); + dev->avail_ird += ird; + spin_unlock_irq(&dev->lock); +} + +static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state) +{ + unsigned long flag; + spin_lock_irqsave(&qhp->lock, flag); + qhp->attr.state = state; + spin_unlock_irqrestore(&qhp->lock, flag); +} + +static void dealloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + c4iw_ocqp_pool_free(rdev, sq->dma_addr, sq->memsize); +} + +static void dealloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + dma_free_coherent(&(rdev->lldi.pdev->dev), sq->memsize, sq->queue, + pci_unmap_addr(sq, mapping)); +} + +static void dealloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + if (t4_sq_onchip(sq)) + dealloc_oc_sq(rdev, sq); + else + dealloc_host_sq(rdev, sq); +} + +static int alloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + if (!ocqp_support || !ocqp_supported(&rdev->lldi)) + return -ENOSYS; + sq->dma_addr = c4iw_ocqp_pool_alloc(rdev, sq->memsize); + if (!sq->dma_addr) + return -ENOMEM; +
sq->phys_addr = rdev->oc_mw_pa + sq->dma_addr - + rdev->lldi.vr->ocq.start; + sq->queue = (__force union t4_wr *)(rdev->oc_mw_kva + sq->dma_addr - + rdev->lldi.vr->ocq.start); + sq->flags |= T4_SQ_ONCHIP; + return 0; +} + +static int alloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) +{ + sq->queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), sq->memsize, + &(sq->dma_addr), GFP_KERNEL); + if (!sq->queue) + return -ENOMEM; + sq->phys_addr = virt_to_phys(sq->queue); + pci_unmap_addr_set(sq, mapping, sq->dma_addr); + return 0; +} + +static int alloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq, int user) +{ + int ret = -ENOSYS; + if (user) + ret = alloc_oc_sq(rdev, sq); + if (ret) + ret = alloc_host_sq(rdev, sq); + return ret; +} + +static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, + struct c4iw_dev_ucontext *uctx) +{ + /* + * uP clears EQ contexts when the connection exits rdma mode, + * so no need to post a RESET WR for these EQs. + */ + dma_free_coherent(&(rdev->lldi.pdev->dev), + wq->rq.memsize, wq->rq.queue, + dma_unmap_addr(&wq->rq, mapping)); + dealloc_sq(rdev, &wq->sq); + c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); + kfree(wq->rq.sw_rq); + kfree(wq->sq.sw_sq); + c4iw_put_qpid(rdev, wq->rq.qid, uctx); + c4iw_put_qpid(rdev, wq->sq.qid, uctx); + return 0; +} + +/* + * Determine the BAR2 virtual address and qid. If pbar2_pa is not NULL, + * then this is a user mapping so compute the page-aligned physical address + * for mapping. + */ +void __iomem *c4iw_bar2_addrs(struct c4iw_rdev *rdev, unsigned int qid, + enum cxgb4_bar2_qtype qtype, + unsigned int *pbar2_qid, u64 *pbar2_pa) +{ + u64 bar2_qoffset; + int ret; + + ret = cxgb4_bar2_sge_qregs(rdev->lldi.ports[0], qid, qtype, + pbar2_pa ? 1 : 0, + &bar2_qoffset, pbar2_qid); + if (ret) + return NULL; + + if (pbar2_pa) + *pbar2_pa = (rdev->bar2_pa + bar2_qoffset) & PAGE_MASK; + + if (is_t4(rdev->lldi.adapter_type)) + return NULL; + + return rdev->bar2_kva + bar2_qoffset; +} + +static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, + struct t4_cq *rcq, struct t4_cq *scq, + struct c4iw_dev_ucontext *uctx, + struct c4iw_wr_wait *wr_waitp) +{ + int user = (uctx != &rdev->uctx); + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + int wr_len; + struct sk_buff *skb; + int ret = 0; + int eqsize; + + wq->sq.qid = c4iw_get_qpid(rdev, uctx); + if (!wq->sq.qid) + return -ENOMEM; + + wq->rq.qid = c4iw_get_qpid(rdev, uctx); + if (!wq->rq.qid) { + ret = -ENOMEM; + goto free_sq_qid; + } + + if (!user) { + wq->sq.sw_sq = kzalloc(wq->sq.size * sizeof *wq->sq.sw_sq, + GFP_KERNEL); + if (!wq->sq.sw_sq) { + ret = -ENOMEM; + goto free_rq_qid; + } + + wq->rq.sw_rq = kzalloc(wq->rq.size * sizeof *wq->rq.sw_rq, + GFP_KERNEL); + if (!wq->rq.sw_rq) { + ret = -ENOMEM; + goto free_sw_sq; + } + } + + /* + * RQT must be a power of 2 and at least 16 deep. 
+ */ + wq->rq.rqt_size = roundup_pow_of_two(max_t(u16, wq->rq.size, 16)); + wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size); + if (!wq->rq.rqt_hwaddr) { + ret = -ENOMEM; + goto free_sw_rq; + } + + ret = alloc_sq(rdev, &wq->sq, user); + if (ret) + goto free_hwaddr; + memset(wq->sq.queue, 0, wq->sq.memsize); + dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr); + + wq->rq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), + wq->rq.memsize, &(wq->rq.dma_addr), + GFP_KERNEL); + if (!wq->rq.queue) { + ret = -ENOMEM; + goto free_sq; + } + pr_debug("sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx\n", + wq->sq.queue, + (unsigned long long)virt_to_phys(wq->sq.queue), + wq->rq.queue, + (unsigned long long)virt_to_phys(wq->rq.queue)); + memset(wq->rq.queue, 0, wq->rq.memsize); + dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr); + + wq->db = rdev->lldi.db_reg; + + wq->sq.bar2_va = c4iw_bar2_addrs(rdev, wq->sq.qid, T4_BAR2_QTYPE_EGRESS, + &wq->sq.bar2_qid, + user ? &wq->sq.bar2_pa : NULL); + wq->rq.bar2_va = c4iw_bar2_addrs(rdev, wq->rq.qid, T4_BAR2_QTYPE_EGRESS, + &wq->rq.bar2_qid, + user ? &wq->rq.bar2_pa : NULL); + + /* + * User mode must have bar2 access. + */ + if (user && (!wq->sq.bar2_pa || !wq->rq.bar2_pa)) { + pr_warn("%s: sqid %u or rqid %u not in BAR2 range\n", + pci_name(rdev->lldi.pdev), wq->sq.qid, wq->rq.qid); + goto free_dma; + } + + wq->rdev = rdev; + wq->rq.msn = 1; + + /* build fw_ri_res_wr */ + wr_len = sizeof *res_wr + 2 * sizeof *res; + + skb = alloc_skb(wr_len, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto free_dma; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = __skb_put_zero(skb, wr_len); + res_wr->op_nres = cpu_to_be32( + FW_WR_OP_V(FW_RI_RES_WR) | + FW_RI_RES_WR_NRES_V(2) | + FW_WR_COMPL_F); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (uintptr_t)wr_waitp; + res = res_wr->res; + res->u.sqrq.restype = FW_RI_RES_TYPE_SQ; + res->u.sqrq.op = FW_RI_RES_OP_WRITE; + + /* + * eqsize is the number of 64B entries plus the status page size. + */ + eqsize = wq->sq.size * T4_SQ_NUM_SLOTS + + rdev->hw_queue.t4_eq_status_entries; + + res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( + FW_RI_RES_WR_HOSTFCMODE_V(0) | /* no host cidx updates */ + FW_RI_RES_WR_CPRIO_V(0) | /* don't keep in chip cache */ + FW_RI_RES_WR_PCIECHN_V(0) | /* set by uP at ri_init time */ + (t4_sq_onchip(&wq->sq) ? FW_RI_RES_WR_ONCHIP_F : 0) | + FW_RI_RES_WR_IQID_V(scq->cqid)); + res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( + FW_RI_RES_WR_DCAEN_V(0) | + FW_RI_RES_WR_DCACPU_V(0) | + FW_RI_RES_WR_FBMIN_V(2) | + (t4_sq_onchip(&wq->sq) ? FW_RI_RES_WR_FBMAX_V(2) : + FW_RI_RES_WR_FBMAX_V(3)) | + FW_RI_RES_WR_CIDXFTHRESHO_V(0) | + FW_RI_RES_WR_CIDXFTHRESH_V(0) | + FW_RI_RES_WR_EQSIZE_V(eqsize)); + res->u.sqrq.eqid = cpu_to_be32(wq->sq.qid); + res->u.sqrq.eqaddr = cpu_to_be64(wq->sq.dma_addr); + res++; + res->u.sqrq.restype = FW_RI_RES_TYPE_RQ; + res->u.sqrq.op = FW_RI_RES_OP_WRITE; + + /* + * eqsize is the number of 64B entries plus the status page size. 
+ */ + eqsize = wq->rq.size * T4_RQ_NUM_SLOTS + + rdev->hw_queue.t4_eq_status_entries; + res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( + FW_RI_RES_WR_HOSTFCMODE_V(0) | /* no host cidx updates */ + FW_RI_RES_WR_CPRIO_V(0) | /* don't keep in chip cache */ + FW_RI_RES_WR_PCIECHN_V(0) | /* set by uP at ri_init time */ + FW_RI_RES_WR_IQID_V(rcq->cqid)); + res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( + FW_RI_RES_WR_DCAEN_V(0) | + FW_RI_RES_WR_DCACPU_V(0) | + FW_RI_RES_WR_FBMIN_V(2) | + FW_RI_RES_WR_FBMAX_V(3) | + FW_RI_RES_WR_CIDXFTHRESHO_V(0) | + FW_RI_RES_WR_CIDXFTHRESH_V(0) | + FW_RI_RES_WR_EQSIZE_V(eqsize)); + res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid); + res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr); + + c4iw_init_wr_wait(wr_waitp); + ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, wq->sq.qid, __func__); + if (ret) + goto free_dma; + + pr_debug("sqid 0x%x rqid 0x%x kdb 0x%p sq_bar2_addr %p rq_bar2_addr %p\n", + wq->sq.qid, wq->rq.qid, wq->db, + wq->sq.bar2_va, wq->rq.bar2_va); + + return 0; +free_dma: + dma_free_coherent(&(rdev->lldi.pdev->dev), + wq->rq.memsize, wq->rq.queue, + dma_unmap_addr(&wq->rq, mapping)); +free_sq: + dealloc_sq(rdev, &wq->sq); +free_hwaddr: + c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); +free_sw_rq: + kfree(wq->rq.sw_rq); +free_sw_sq: + kfree(wq->sq.sw_sq); +free_rq_qid: + c4iw_put_qpid(rdev, wq->rq.qid, uctx); +free_sq_qid: + c4iw_put_qpid(rdev, wq->sq.qid, uctx); + return ret; +} + +static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp, + struct ib_send_wr *wr, int max, u32 *plenp) +{ + u8 *dstp, *srcp; + u32 plen = 0; + int i; + int rem, len; + + dstp = (u8 *)immdp->data; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) > max) + return -EMSGSIZE; + srcp = (u8 *)(unsigned long)wr->sg_list[i].addr; + plen += wr->sg_list[i].length; + rem = wr->sg_list[i].length; + while (rem) { + if (dstp == (u8 *)&sq->queue[sq->size]) + dstp = (u8 *)sq->queue; + if (rem <= (u8 *)&sq->queue[sq->size] - dstp) + len = rem; + else + len = (u8 *)&sq->queue[sq->size] - dstp; + memcpy(dstp, srcp, len); + dstp += len; + srcp += len; + rem -= len; + } + } + len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp); + if (len) + memset(dstp, 0, len); + immdp->op = FW_RI_DATA_IMMD; + immdp->r1 = 0; + immdp->r2 = 0; + immdp->immdlen = cpu_to_be32(plen); + *plenp = plen; + return 0; +} + +static int build_isgl(__be64 *queue_start, __be64 *queue_end, + struct fw_ri_isgl *isglp, struct ib_sge *sg_list, + int num_sge, u32 *plenp) + +{ + int i; + u32 plen = 0; + __be64 *flitp = (__be64 *)isglp->sge; + + for (i = 0; i < num_sge; i++) { + if ((plen + sg_list[i].length) < plen) + return -EMSGSIZE; + plen += sg_list[i].length; + *flitp = cpu_to_be64(((u64)sg_list[i].lkey << 32) | + sg_list[i].length); + if (++flitp == queue_end) + flitp = queue_start; + *flitp = cpu_to_be64(sg_list[i].addr); + if (++flitp == queue_end) + flitp = queue_start; + } + *flitp = (__force __be64)0; + isglp->op = FW_RI_DATA_ISGL; + isglp->r1 = 0; + isglp->nsge = cpu_to_be16(num_sge); + isglp->r2 = 0; + if (plenp) + *plenp = plen; + return 0; +} + +static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + int ret; + + if (wr->num_sge > T4_MAX_SEND_SGE) + return -EINVAL; + switch (wr->opcode) { + case IB_WR_SEND: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.sendop_pkd = cpu_to_be32( + FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_SE)); + else + wqe->send.sendop_pkd = cpu_to_be32( + 
FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND)); + wqe->send.stag_inv = 0; + break; + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.sendop_pkd = cpu_to_be32( + FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_SE_INV)); + else + wqe->send.sendop_pkd = cpu_to_be32( + FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_INV)); + wqe->send.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); + break; + + default: + return -EINVAL; + } + wqe->send.r3 = 0; + wqe->send.r4 = 0; + + plen = 0; + if (wr->num_sge) { + if (wr->send_flags & IB_SEND_INLINE) { + ret = build_immd(sq, wqe->send.u.immd_src, wr, + T4_MAX_SEND_INLINE, &plen); + if (ret) + return ret; + size = sizeof wqe->send + sizeof(struct fw_ri_immd) + + plen; + } else { + ret = build_isgl((__be64 *)sq->queue, + (__be64 *)&sq->queue[sq->size], + wqe->send.u.isgl_src, + wr->sg_list, wr->num_sge, &plen); + if (ret) + return ret; + size = sizeof wqe->send + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof(struct fw_ri_sge); + } + } else { + wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD; + wqe->send.u.immd_src[0].r1 = 0; + wqe->send.u.immd_src[0].r2 = 0; + wqe->send.u.immd_src[0].immdlen = 0; + size = sizeof wqe->send + sizeof(struct fw_ri_immd); + plen = 0; + } + *len16 = DIV_ROUND_UP(size, 16); + wqe->send.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, + struct ib_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + int ret; + + if (wr->num_sge > T4_MAX_SEND_SGE) + return -EINVAL; + wqe->write.r2 = 0; + wqe->write.stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); + wqe->write.to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); + if (wr->num_sge) { + if (wr->send_flags & IB_SEND_INLINE) { + ret = build_immd(sq, wqe->write.u.immd_src, wr, + T4_MAX_WRITE_INLINE, &plen); + if (ret) + return ret; + size = sizeof wqe->write + sizeof(struct fw_ri_immd) + + plen; + } else { + ret = build_isgl((__be64 *)sq->queue, + (__be64 *)&sq->queue[sq->size], + wqe->write.u.isgl_src, + wr->sg_list, wr->num_sge, &plen); + if (ret) + return ret; + size = sizeof wqe->write + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof(struct fw_ri_sge); + } + } else { + wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD; + wqe->write.u.immd_src[0].r1 = 0; + wqe->write.u.immd_src[0].r2 = 0; + wqe->write.u.immd_src[0].immdlen = 0; + size = sizeof wqe->write + sizeof(struct fw_ri_immd); + plen = 0; + } + *len16 = DIV_ROUND_UP(size, 16); + wqe->write.plen = cpu_to_be32(plen); + return 0; +} + +static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) +{ + if (wr->num_sge > 1) + return -EINVAL; + if (wr->num_sge && wr->sg_list[0].length) { + wqe->read.stag_src = cpu_to_be32(rdma_wr(wr)->rkey); + wqe->read.to_src_hi = cpu_to_be32((u32)(rdma_wr(wr)->remote_addr + >> 32)); + wqe->read.to_src_lo = cpu_to_be32((u32)rdma_wr(wr)->remote_addr); + wqe->read.stag_sink = cpu_to_be32(wr->sg_list[0].lkey); + wqe->read.plen = cpu_to_be32(wr->sg_list[0].length); + wqe->read.to_sink_hi = cpu_to_be32((u32)(wr->sg_list[0].addr + >> 32)); + wqe->read.to_sink_lo = cpu_to_be32((u32)(wr->sg_list[0].addr)); + } else { + wqe->read.stag_src = cpu_to_be32(2); + wqe->read.to_src_hi = 0; + wqe->read.to_src_lo = 0; + wqe->read.stag_sink = cpu_to_be32(2); + wqe->read.plen = 0; + wqe->read.to_sink_hi = 0; + wqe->read.to_sink_lo = 0; + } + wqe->read.r2 = 0; + wqe->read.r5 = 0; + *len16 = DIV_ROUND_UP(sizeof wqe->read, 16); + return 0; +} + +static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, + struct ib_recv_wr *wr, u8 *len16) +{ + 
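/*
+	 * Build an isgl in the RQ queue memory describing wr->sg_list and
+	 * return the WR length in 16-byte units via *len16.
+	 */
+	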
int ret; + + ret = build_isgl((__be64 *)qhp->wq.rq.queue, + (__be64 *)&qhp->wq.rq.queue[qhp->wq.rq.size], + &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL); + if (ret) + return ret; + *len16 = DIV_ROUND_UP(sizeof wqe->recv + + wr->num_sge * sizeof(struct fw_ri_sge), 16); + return 0; +} + +static void build_tpte_memreg(struct fw_ri_fr_nsmr_tpte_wr *fr, + struct ib_reg_wr *wr, struct c4iw_mr *mhp, + u8 *len16) +{ + __be64 *p = (__be64 *)fr->pbl; + + fr->r2 = cpu_to_be32(0); + fr->stag = cpu_to_be32(mhp->ibmr.rkey); + + fr->tpte.valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F | + FW_RI_TPTE_STAGKEY_V((mhp->ibmr.rkey & FW_RI_TPTE_STAGKEY_M)) | + FW_RI_TPTE_STAGSTATE_V(1) | + FW_RI_TPTE_STAGTYPE_V(FW_RI_STAG_NSMR) | + FW_RI_TPTE_PDID_V(mhp->attr.pdid)); + fr->tpte.locread_to_qpid = cpu_to_be32( + FW_RI_TPTE_PERM_V(c4iw_ib_to_tpt_access(wr->access)) | + FW_RI_TPTE_ADDRTYPE_V(FW_RI_VA_BASED_TO) | + FW_RI_TPTE_PS_V(ilog2(wr->mr->page_size) - 12)); + fr->tpte.nosnoop_pbladdr = cpu_to_be32(FW_RI_TPTE_PBLADDR_V( + PBL_OFF(&mhp->rhp->rdev, mhp->attr.pbl_addr)>>3)); + fr->tpte.dca_mwbcnt_pstag = cpu_to_be32(0); + fr->tpte.len_hi = cpu_to_be32(0); + fr->tpte.len_lo = cpu_to_be32(mhp->ibmr.length); + fr->tpte.va_hi = cpu_to_be32(mhp->ibmr.iova >> 32); + fr->tpte.va_lo_fbo = cpu_to_be32(mhp->ibmr.iova & 0xffffffff); + + p[0] = cpu_to_be64((u64)mhp->mpl[0]); + p[1] = cpu_to_be64((u64)mhp->mpl[1]); + + *len16 = DIV_ROUND_UP(sizeof(*fr), 16); +} + +static int build_memreg(struct t4_sq *sq, union t4_wr *wqe, + struct ib_reg_wr *wr, struct c4iw_mr *mhp, u8 *len16, + bool dsgl_supported) +{ + struct fw_ri_immd *imdp; + __be64 *p; + int i; + int pbllen = roundup(mhp->mpl_len * sizeof(u64), 32); + int rem; + + if (mhp->mpl_len > t4_max_fr_depth(dsgl_supported && use_dsgl)) + return -EINVAL; + + wqe->fr.qpbinde_to_dcacpu = 0; + wqe->fr.pgsz_shift = ilog2(wr->mr->page_size) - 12; + wqe->fr.addr_type = FW_RI_VA_BASED_TO; + wqe->fr.mem_perms = c4iw_ib_to_tpt_access(wr->access); + wqe->fr.len_hi = 0; + wqe->fr.len_lo = cpu_to_be32(mhp->ibmr.length); + wqe->fr.stag = cpu_to_be32(wr->key); + wqe->fr.va_hi = cpu_to_be32(mhp->ibmr.iova >> 32); + wqe->fr.va_lo_fbo = cpu_to_be32(mhp->ibmr.iova & + 0xffffffff); + + if (dsgl_supported && use_dsgl && (pbllen > max_fr_immd)) { + struct fw_ri_dsgl *sglp; + + for (i = 0; i < mhp->mpl_len; i++) + mhp->mpl[i] = (__force u64)cpu_to_be64((u64)mhp->mpl[i]); + + sglp = (struct fw_ri_dsgl *)(&wqe->fr + 1); + sglp->op = FW_RI_DATA_DSGL; + sglp->r1 = 0; + sglp->nsge = cpu_to_be16(1); + sglp->addr0 = cpu_to_be64(mhp->mpl_addr); + sglp->len0 = cpu_to_be32(pbllen); + + *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*sglp), 16); + } else { + imdp = (struct fw_ri_immd *)(&wqe->fr + 1); + imdp->op = FW_RI_DATA_IMMD; + imdp->r1 = 0; + imdp->r2 = 0; + imdp->immdlen = cpu_to_be32(pbllen); + p = (__be64 *)(imdp + 1); + rem = pbllen; + for (i = 0; i < mhp->mpl_len; i++) { + *p = cpu_to_be64((u64)mhp->mpl[i]); + rem -= sizeof(*p); + if (++p == (__be64 *)&sq->queue[sq->size]) + p = (__be64 *)sq->queue; + } + while (rem) { + *p = 0; + rem -= sizeof(*p); + if (++p == (__be64 *)&sq->queue[sq->size]) + p = (__be64 *)sq->queue; + } + *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*imdp) + + pbllen, 16); + } + return 0; +} + +static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) +{ + wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); + wqe->inv.r2 = 0; + *len16 = DIV_ROUND_UP(sizeof wqe->inv, 16); + return 0; +} + +static void free_qp_work(struct work_struct *work) +{ + struct 
c4iw_ucontext *ucontext; + struct c4iw_qp *qhp; + struct c4iw_dev *rhp; + + qhp = container_of(work, struct c4iw_qp, free_work); + ucontext = qhp->ucontext; + rhp = qhp->rhp; + + pr_debug("qhp %p ucontext %p\n", qhp, ucontext); + destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + + if (ucontext) + c4iw_put_ucontext(ucontext); + c4iw_put_wr_wait(qhp->wr_waitp); + kfree(qhp); +} + +static void queue_qp_free(struct kref *kref) +{ + struct c4iw_qp *qhp; + + qhp = container_of(kref, struct c4iw_qp, kref); + pr_debug("qhp %p\n", qhp); + queue_work(qhp->rhp->rdev.free_workq, &qhp->free_work); +} + +void c4iw_qp_add_ref(struct ib_qp *qp) +{ + pr_debug("ib_qp %p\n", qp); + kref_get(&to_c4iw_qp(qp)->kref); +} + +void c4iw_qp_rem_ref(struct ib_qp *qp) +{ + pr_debug("ib_qp %p\n", qp); + kref_put(&to_c4iw_qp(qp)->kref, queue_qp_free); +} + +static void add_to_fc_list(struct list_head *head, struct list_head *entry) +{ + if (list_empty(entry)) + list_add_tail(entry, head); +} + +static int ring_kernel_sq_db(struct c4iw_qp *qhp, u16 inc) +{ + unsigned long flags; + + spin_lock_irqsave(&qhp->rhp->lock, flags); + spin_lock(&qhp->lock); + if (qhp->rhp->db_state == NORMAL) + t4_ring_sq_db(&qhp->wq, inc, NULL); + else { + add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); + qhp->wq.sq.wq_pidx_inc += inc; + } + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&qhp->rhp->lock, flags); + return 0; +} + +static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc) +{ + unsigned long flags; + + spin_lock_irqsave(&qhp->rhp->lock, flags); + spin_lock(&qhp->lock); + if (qhp->rhp->db_state == NORMAL) + t4_ring_rq_db(&qhp->wq, inc, NULL); + else { + add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry); + qhp->wq.rq.wq_pidx_inc += inc; + } + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&qhp->rhp->lock, flags); + return 0; +} + +static int ib_to_fw_opcode(int ib_opcode) +{ + int opcode; + + switch (ib_opcode) { + case IB_WR_SEND_WITH_INV: + opcode = FW_RI_SEND_WITH_INV; + break; + case IB_WR_SEND: + opcode = FW_RI_SEND; + break; + case IB_WR_RDMA_WRITE: + opcode = FW_RI_RDMA_WRITE; + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + opcode = FW_RI_READ_REQ; + break; + case IB_WR_REG_MR: + opcode = FW_RI_FAST_REGISTER; + break; + case IB_WR_LOCAL_INV: + opcode = FW_RI_LOCAL_INV; + break; + default: + opcode = -EINVAL; + } + return opcode; +} + +static int complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr) +{ + struct t4_cqe cqe = {}; + struct c4iw_cq *schp; + unsigned long flag; + struct t4_cq *cq; + int opcode; + + schp = to_c4iw_cq(qhp->ibqp.send_cq); + cq = &schp->cq; + + opcode = ib_to_fw_opcode(wr->opcode); + if (opcode < 0) + return opcode; + + cqe.u.drain_cookie = wr->wr_id; + cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) | + CQE_OPCODE_V(opcode) | + CQE_TYPE_V(1) | + CQE_SWCQE_V(1) | + CQE_DRAIN_V(1) | + CQE_QPID_V(qhp->wq.sq.qid)); + + spin_lock_irqsave(&schp->lock, flag); + cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen)); + cq->sw_queue[cq->sw_pidx] = cqe; + t4_swcq_produce(cq); + spin_unlock_irqrestore(&schp->lock, flag); + + if (t4_clear_cq_armed(&schp->cq)) { + spin_lock_irqsave(&schp->comp_handler_lock, flag); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + } + return 0; +} + +static int complete_sq_drain_wrs(struct c4iw_qp *qhp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int ret = 0; + + while (wr) { + ret = 
complete_sq_drain_wr(qhp, wr); + if (ret) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + return ret; +} + +static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct ib_recv_wr *wr) +{ + struct t4_cqe cqe = {}; + struct c4iw_cq *rchp; + unsigned long flag; + struct t4_cq *cq; + + rchp = to_c4iw_cq(qhp->ibqp.recv_cq); + cq = &rchp->cq; + + cqe.u.drain_cookie = wr->wr_id; + cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) | + CQE_OPCODE_V(FW_RI_SEND) | + CQE_TYPE_V(0) | + CQE_SWCQE_V(1) | + CQE_DRAIN_V(1) | + CQE_QPID_V(qhp->wq.sq.qid)); + + spin_lock_irqsave(&rchp->lock, flag); + cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen)); + cq->sw_queue[cq->sw_pidx] = cqe; + t4_swcq_produce(cq); + spin_unlock_irqrestore(&rchp->lock, flag); + + if (t4_clear_cq_armed(&rchp->cq)) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } +} + +static void complete_rq_drain_wrs(struct c4iw_qp *qhp, struct ib_recv_wr *wr) +{ + while (wr) { + complete_rq_drain_wr(qhp, wr); + wr = wr->next; + } +} + +int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + u8 len16 = 0; + enum fw_wr_opcodes fw_opcode = 0; + enum fw_ri_wr_flags fw_flags; + struct c4iw_qp *qhp; + union t4_wr *wqe = NULL; + u32 num_wrs; + struct t4_swsqe *swsqe; + unsigned long flag; + u16 idx = 0; + + qhp = to_c4iw_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + + /* + * If the qp has been flushed, then just insert a special + * drain cqe. + */ + if (qhp->wq.flushed) { + spin_unlock_irqrestore(&qhp->lock, flag); + err = complete_sq_drain_wrs(qhp, wr, bad_wr); + return err; + } + num_wrs = t4_sq_avail(&qhp->wq); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + *bad_wr = wr; + return -ENOMEM; + } + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue + + qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE); + + fw_flags = 0; + if (wr->send_flags & IB_SEND_SOLICITED) + fw_flags |= FW_RI_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all) + fw_flags |= FW_RI_COMPLETION_FLAG; + swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; + switch (wr->opcode) { + case IB_WR_SEND_WITH_INV: + case IB_WR_SEND: + if (wr->send_flags & IB_SEND_FENCE) + fw_flags |= FW_RI_READ_FENCE_FLAG; + fw_opcode = FW_RI_SEND_WR; + if (wr->opcode == IB_WR_SEND) + swsqe->opcode = FW_RI_SEND; + else + swsqe->opcode = FW_RI_SEND_WITH_INV; + err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16); + break; + case IB_WR_RDMA_WRITE: + fw_opcode = FW_RI_RDMA_WRITE_WR; + swsqe->opcode = FW_RI_RDMA_WRITE; + err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16); + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + fw_opcode = FW_RI_RDMA_READ_WR; + swsqe->opcode = FW_RI_READ_REQ; + if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) { + c4iw_invalidate_mr(qhp->rhp, + wr->sg_list[0].lkey); + fw_flags = FW_RI_RDMA_READ_INVALIDATE; + } else { + fw_flags = 0; + } + err = build_rdma_read(wqe, wr, &len16); + if (err) + break; + swsqe->read_len = wr->sg_list[0].length; + if (!qhp->wq.sq.oldest_read) + qhp->wq.sq.oldest_read = swsqe; + break; + case IB_WR_REG_MR: { + struct c4iw_mr *mhp = to_c4iw_mr(reg_wr(wr)->mr); + + swsqe->opcode = FW_RI_FAST_REGISTER; + if (qhp->rhp->rdev.lldi.fr_nsmr_tpte_wr_support && + !mhp->attr.state && mhp->mpl_len <= 2) { + fw_opcode = FW_RI_FR_NSMR_TPTE_WR; + 
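/*
+				 * This WR carries the complete TPT entry and
+				 * the (at most 2 entry) PBL inline; see
+				 * build_tpte_memreg().
+				 */
+				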
build_tpte_memreg(&wqe->fr_tpte, reg_wr(wr), + mhp, &len16); + } else { + fw_opcode = FW_RI_FR_NSMR_WR; + err = build_memreg(&qhp->wq.sq, wqe, reg_wr(wr), + mhp, &len16, + qhp->rhp->rdev.lldi.ulptx_memwrite_dsgl); + if (err) + break; + } + mhp->attr.state = 1; + break; + } + case IB_WR_LOCAL_INV: + if (wr->send_flags & IB_SEND_FENCE) + fw_flags |= FW_RI_LOCAL_FENCE_FLAG; + fw_opcode = FW_RI_INV_LSTAG_WR; + swsqe->opcode = FW_RI_LOCAL_INV; + err = build_inv_stag(wqe, wr, &len16); + c4iw_invalidate_mr(qhp->rhp, wr->ex.invalidate_rkey); + break; + default: + pr_warn("%s post of type=%d TBD!\n", __func__, + wr->opcode); + err = -EINVAL; + } + if (err) { + *bad_wr = wr; + break; + } + swsqe->idx = qhp->wq.sq.pidx; + swsqe->complete = 0; + swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) || + qhp->sq_sig_all; + swsqe->flushed = 0; + swsqe->wr_id = wr->wr_id; + if (c4iw_wr_log) { + swsqe->sge_ts = cxgb4_read_sge_timestamp( + qhp->rhp->rdev.lldi.ports[0]); + swsqe->host_time = ktime_get(); + } + + init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16); + + pr_debug("cookie 0x%llx pidx 0x%x opcode 0x%x read_len %u\n", + (unsigned long long)wr->wr_id, qhp->wq.sq.pidx, + swsqe->opcode, swsqe->read_len); + wr = wr->next; + num_wrs--; + t4_sq_produce(&qhp->wq, len16); + idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + } + if (!qhp->rhp->rdev.status_page->db_off) { + t4_ring_sq_db(&qhp->wq, idx, wqe); + spin_unlock_irqrestore(&qhp->lock, flag); + } else { + spin_unlock_irqrestore(&qhp->lock, flag); + ring_kernel_sq_db(qhp, idx); + } + return err; +} + +int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct c4iw_qp *qhp; + union t4_recv_wr *wqe = NULL; + u32 num_wrs; + u8 len16 = 0; + unsigned long flag; + u16 idx = 0; + + qhp = to_c4iw_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + + /* + * If the qp has been flushed, then just insert a special + * drain cqe. 
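+	 * The drain cqe is placed on the sw cq with T4_ERR_SWFLUSH status.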
+ */ + if (qhp->wq.flushed) { + spin_unlock_irqrestore(&qhp->lock, flag); + complete_rq_drain_wrs(qhp, wr); + return err; + } + num_wrs = t4_rq_avail(&qhp->wq); + if (num_wrs == 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + *bad_wr = wr; + return -ENOMEM; + } + while (wr) { + if (wr->num_sge > T4_MAX_RECV_SGE) { + err = -EINVAL; + *bad_wr = wr; + break; + } + wqe = (union t4_recv_wr *)((u8 *)qhp->wq.rq.queue + + qhp->wq.rq.wq_pidx * + T4_EQ_ENTRY_SIZE); + if (num_wrs) + err = build_rdma_recv(qhp, wqe, wr, &len16); + else + err = -ENOMEM; + if (err) { + *bad_wr = wr; + break; + } + + qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].wr_id = wr->wr_id; + if (c4iw_wr_log) { + qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].sge_ts = + cxgb4_read_sge_timestamp( + qhp->rhp->rdev.lldi.ports[0]); + qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_time = + ktime_get(); + } + + wqe->recv.opcode = FW_RI_RECV_WR; + wqe->recv.r1 = 0; + wqe->recv.wrid = qhp->wq.rq.pidx; + wqe->recv.r2[0] = 0; + wqe->recv.r2[1] = 0; + wqe->recv.r2[2] = 0; + wqe->recv.len16 = len16; + pr_debug("cookie 0x%llx pidx %u\n", + (unsigned long long)wr->wr_id, qhp->wq.rq.pidx); + t4_rq_produce(&qhp->wq, len16); + idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + wr = wr->next; + num_wrs--; + } + if (!qhp->rhp->rdev.status_page->db_off) { + t4_ring_rq_db(&qhp->wq, idx, wqe); + spin_unlock_irqrestore(&qhp->lock, flag); + } else { + spin_unlock_irqrestore(&qhp->lock, flag); + ring_kernel_rq_db(qhp, idx); + } + return err; +} + +static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type, + u8 *ecode) +{ + int status; + int tagged; + int opcode; + int rqtype; + int send_inv; + + if (!err_cqe) { + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + return; + } + + status = CQE_STATUS(err_cqe); + opcode = CQE_OPCODE(err_cqe); + rqtype = RQ_TYPE(err_cqe); + send_inv = (opcode == FW_RI_SEND_WITH_INV) || + (opcode == FW_RI_SEND_WITH_SE_INV); + tagged = (opcode == FW_RI_RDMA_WRITE) || + (rqtype && (opcode == FW_RI_READ_RESP)); + + switch (status) { + case T4_ERR_STAG: + if (send_inv) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_INV_STAG; + } + break; + case T4_ERR_PDID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + if ((opcode == FW_RI_SEND_WITH_INV) || + (opcode == FW_RI_SEND_WITH_SE_INV)) + *ecode = RDMAP_CANT_INV_STAG; + else + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case T4_ERR_QPID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case T4_ERR_ACCESS: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_ACC_VIOL; + break; + case T4_ERR_WRAP: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_TO_WRAP; + break; + case T4_ERR_BOUND: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_BASE_BOUNDS; + } + break; + case T4_ERR_INVALIDATE_SHARED_MR: + case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + break; + case T4_ERR_ECC: + case T4_ERR_ECC_PSTAG: + case T4_ERR_INTERNAL_ERR: + *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; + *ecode = 0; + break; + case T4_ERR_OUT_OF_RQE: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_NOBUF; + break; + case T4_ERR_PBL_ADDR_BOUND: + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + break; + case T4_ERR_CRC: + *layer_type = 
LAYER_MPA|DDP_LLP; + *ecode = MPA_CRC_ERR; + break; + case T4_ERR_MARKER: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_MARKER_ERR; + break; + case T4_ERR_PDU_LEN_ERR: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + break; + case T4_ERR_DDP_VERSION: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_VERS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_VERS; + } + break; + case T4_ERR_RDMA_VERSION: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_VERS; + break; + case T4_ERR_OPCODE: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_OPCODE; + break; + case T4_ERR_DDP_QUEUE_NUM: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_QN; + break; + case T4_ERR_MSN: + case T4_ERR_MSN_GAP: + case T4_ERR_MSN_RANGE: + case T4_ERR_IRD_OVERFLOW: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_RANGE; + break; + case T4_ERR_TBIT: + *layer_type = LAYER_DDP|DDP_LOCAL_CATA; + *ecode = 0; + break; + case T4_ERR_MO: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MO; + break; + default: + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + break; + } +} + +static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, + gfp_t gfp) +{ + struct fw_ri_wr *wqe; + struct sk_buff *skb; + struct terminate_message *term; + + pr_debug("qhp %p qid 0x%x tid %u\n", qhp, qhp->wq.sq.qid, + qhp->ep->hwtid); + + skb = skb_dequeue(&qhp->ep->com.ep_skb_list); + if (WARN_ON(!skb)) + return; + + set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); + + wqe = __skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof *wqe); + wqe->op_compl = cpu_to_be32(FW_WR_OP_V(FW_RI_INIT_WR)); + wqe->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(qhp->ep->hwtid) | + FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16))); + + wqe->u.terminate.type = FW_RI_TYPE_TERMINATE; + wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term); + term = (struct terminate_message *)wqe->u.terminate.termmsg; + if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) { + term->layer_etype = qhp->attr.layer_etype; + term->ecode = qhp->attr.ecode; + } else + build_term_codes(err_cqe, &term->layer_etype, &term->ecode); + c4iw_ofld_send(&qhp->rhp->rdev, skb); +} + +/* + * Assumes qhp lock is held. + */ +static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, + struct c4iw_cq *schp) +{ + int count; + int rq_flushed, sq_flushed; + unsigned long flag; + + pr_debug("qhp %p rchp %p schp %p\n", qhp, rchp, schp); + + /* locking hierarchy: cqs lock first, then qp lock. 
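If the CQs differ, rchp->lock is taken first, then schp->lock, and the qp lock last.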
*/ + spin_lock_irqsave(&rchp->lock, flag); + if (schp != rchp) + spin_lock(&schp->lock); + spin_lock(&qhp->lock); + + if (qhp->wq.flushed) { + spin_unlock(&qhp->lock); + if (schp != rchp) + spin_unlock(&schp->lock); + spin_unlock_irqrestore(&rchp->lock, flag); + return; + } + qhp->wq.flushed = 1; + t4_set_wq_in_error(&qhp->wq); + + c4iw_flush_hw_cq(rchp, qhp); + c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); + rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); + + if (schp != rchp) + c4iw_flush_hw_cq(schp, qhp); + sq_flushed = c4iw_flush_sq(qhp); + + spin_unlock(&qhp->lock); + if (schp != rchp) + spin_unlock(&schp->lock); + spin_unlock_irqrestore(&rchp->lock, flag); + + if (schp == rchp) { + if ((rq_flushed || sq_flushed) && + t4_clear_cq_armed(&rchp->cq)) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } + } else { + if (rq_flushed && t4_clear_cq_armed(&rchp->cq)) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } + if (sq_flushed && t4_clear_cq_armed(&schp->cq)) { + spin_lock_irqsave(&schp->comp_handler_lock, flag); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + } + } +} + +static void flush_qp(struct c4iw_qp *qhp) +{ + struct c4iw_cq *rchp, *schp; + unsigned long flag; + + rchp = to_c4iw_cq(qhp->ibqp.recv_cq); + schp = to_c4iw_cq(qhp->ibqp.send_cq); + + if (qhp->ibqp.uobject) { + t4_set_wq_in_error(&qhp->wq); + t4_set_cq_in_error(&rchp->cq); + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + if (schp != rchp) { + t4_set_cq_in_error(&schp->cq); + spin_lock_irqsave(&schp->comp_handler_lock, flag); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + } + return; + } + __flush_qp(qhp, rchp, schp); +} + +static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, + struct c4iw_ep *ep) +{ + struct fw_ri_wr *wqe; + int ret; + struct sk_buff *skb; + + pr_debug("qhp %p qid 0x%x tid %u\n", qhp, qhp->wq.sq.qid, ep->hwtid); + + skb = skb_dequeue(&ep->com.ep_skb_list); + if (WARN_ON(!skb)) + return -ENOMEM; + + set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); + + wqe = __skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof *wqe); + wqe->op_compl = cpu_to_be32( + FW_WR_OP_V(FW_RI_INIT_WR) | + FW_WR_COMPL_F); + wqe->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(ep->hwtid) | + FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16))); + wqe->cookie = (uintptr_t)ep->com.wr_waitp; + + wqe->u.fini.type = FW_RI_TYPE_FINI; + + ret = c4iw_ref_send_wait(&rhp->rdev, skb, ep->com.wr_waitp, + qhp->ep->hwtid, qhp->wq.sq.qid, __func__); + + pr_debug("ret %d\n", ret); + return ret; +} + +static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init) +{ + pr_debug("p2p_type = %d\n", p2p_type); + memset(&init->u, 0, sizeof init->u); + switch (p2p_type) { + case FW_RI_INIT_P2PTYPE_RDMA_WRITE: + init->u.write.opcode = FW_RI_RDMA_WRITE_WR; + init->u.write.stag_sink = cpu_to_be32(1); + init->u.write.to_sink = cpu_to_be64(1); + init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD; + init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write + + sizeof(struct fw_ri_immd), + 16); + break; + 
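/* The RTR message is a zero-length read using dummy stag/to values of 1. */
+	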
case FW_RI_INIT_P2PTYPE_READ_REQ: + init->u.write.opcode = FW_RI_RDMA_READ_WR; + init->u.read.stag_src = cpu_to_be32(1); + init->u.read.to_src_lo = cpu_to_be32(1); + init->u.read.stag_sink = cpu_to_be32(1); + init->u.read.to_sink_lo = cpu_to_be32(1); + init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16); + break; + } +} + +static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) +{ + struct fw_ri_wr *wqe; + int ret; + struct sk_buff *skb; + + pr_debug("qhp %p qid 0x%x tid %u ird %u ord %u\n", qhp, + qhp->wq.sq.qid, qhp->ep->hwtid, qhp->ep->ird, qhp->ep->ord); + + skb = alloc_skb(sizeof *wqe, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto out; + } + ret = alloc_ird(rhp, qhp->attr.max_ird); + if (ret) { + qhp->attr.max_ird = 0; + kfree_skb(skb); + goto out; + } + set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); + + wqe = __skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof *wqe); + wqe->op_compl = cpu_to_be32( + FW_WR_OP_V(FW_RI_INIT_WR) | + FW_WR_COMPL_F); + wqe->flowid_len16 = cpu_to_be32( + FW_WR_FLOWID_V(qhp->ep->hwtid) | + FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16))); + + wqe->cookie = (uintptr_t)qhp->ep->com.wr_waitp; + + wqe->u.init.type = FW_RI_TYPE_INIT; + wqe->u.init.mpareqbit_p2ptype = + FW_RI_WR_MPAREQBIT_V(qhp->attr.mpa_attr.initiator) | + FW_RI_WR_P2PTYPE_V(qhp->attr.mpa_attr.p2p_type); + wqe->u.init.mpa_attrs = FW_RI_MPA_IETF_ENABLE; + if (qhp->attr.mpa_attr.recv_marker_enabled) + wqe->u.init.mpa_attrs |= FW_RI_MPA_RX_MARKER_ENABLE; + if (qhp->attr.mpa_attr.xmit_marker_enabled) + wqe->u.init.mpa_attrs |= FW_RI_MPA_TX_MARKER_ENABLE; + if (qhp->attr.mpa_attr.crc_enabled) + wqe->u.init.mpa_attrs |= FW_RI_MPA_CRC_ENABLE; + + wqe->u.init.qp_caps = FW_RI_QP_RDMA_READ_ENABLE | + FW_RI_QP_RDMA_WRITE_ENABLE | + FW_RI_QP_BIND_ENABLE; + if (!qhp->ibqp.uobject) + wqe->u.init.qp_caps |= FW_RI_QP_FAST_REGISTER_ENABLE | + FW_RI_QP_STAG0_ENABLE; + wqe->u.init.nrqe = cpu_to_be16(t4_rqes_posted(&qhp->wq)); + wqe->u.init.pdid = cpu_to_be32(qhp->attr.pd); + wqe->u.init.qpid = cpu_to_be32(qhp->wq.sq.qid); + wqe->u.init.sq_eqid = cpu_to_be32(qhp->wq.sq.qid); + wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid); + wqe->u.init.scqid = cpu_to_be32(qhp->attr.scq); + wqe->u.init.rcqid = cpu_to_be32(qhp->attr.rcq); + wqe->u.init.ord_max = cpu_to_be32(qhp->attr.max_ord); + wqe->u.init.ird_max = cpu_to_be32(qhp->attr.max_ird); + wqe->u.init.iss = cpu_to_be32(qhp->ep->snd_seq); + wqe->u.init.irs = cpu_to_be32(qhp->ep->rcv_seq); + wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size); + wqe->u.init.hwrqaddr = cpu_to_be32(qhp->wq.rq.rqt_hwaddr - + rhp->rdev.lldi.vr->rq.start); + if (qhp->attr.mpa_attr.initiator) + build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init); + + ret = c4iw_ref_send_wait(&rhp->rdev, skb, qhp->ep->com.wr_waitp, + qhp->ep->hwtid, qhp->wq.sq.qid, __func__); + if (!ret) + goto out; + + free_ird(rhp, qhp->attr.max_ird); +out: + pr_debug("ret %d\n", ret); + return ret; +} + +int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, + enum c4iw_qp_attr_mask mask, + struct c4iw_qp_attributes *attrs, + int internal) +{ + int ret = 0; + struct c4iw_qp_attributes newattr = qhp->attr; + int disconnect = 0; + int terminate = 0; + int abort = 0; + int free = 0; + struct c4iw_ep *ep = NULL; + + pr_debug("qhp %p sqid 0x%x rqid 0x%x ep %p state %d -> %d\n", + qhp, qhp->wq.sq.qid, qhp->wq.rq.qid, qhp->ep, qhp->attr.state, + (mask & C4IW_QP_ATTR_NEXT_STATE) ? 
attrs->next_state : -1); + + mutex_lock(&qhp->mutex); + + /* Process attr changes if in IDLE */ + if (mask & C4IW_QP_ATTR_VALID_MODIFY) { + if (qhp->attr.state != C4IW_QP_STATE_IDLE) { + ret = -EIO; + goto out; + } + if (mask & C4IW_QP_ATTR_ENABLE_RDMA_READ) + newattr.enable_rdma_read = attrs->enable_rdma_read; + if (mask & C4IW_QP_ATTR_ENABLE_RDMA_WRITE) + newattr.enable_rdma_write = attrs->enable_rdma_write; + if (mask & C4IW_QP_ATTR_ENABLE_RDMA_BIND) + newattr.enable_bind = attrs->enable_bind; + if (mask & C4IW_QP_ATTR_MAX_ORD) { + if (attrs->max_ord > c4iw_max_read_depth) { + ret = -EINVAL; + goto out; + } + newattr.max_ord = attrs->max_ord; + } + if (mask & C4IW_QP_ATTR_MAX_IRD) { + if (attrs->max_ird > cur_max_read_depth(rhp)) { + ret = -EINVAL; + goto out; + } + newattr.max_ird = attrs->max_ird; + } + qhp->attr = newattr; + } + + if (mask & C4IW_QP_ATTR_SQ_DB) { + ret = ring_kernel_sq_db(qhp, attrs->sq_db_inc); + goto out; + } + if (mask & C4IW_QP_ATTR_RQ_DB) { + ret = ring_kernel_rq_db(qhp, attrs->rq_db_inc); + goto out; + } + + if (!(mask & C4IW_QP_ATTR_NEXT_STATE)) + goto out; + if (qhp->attr.state == attrs->next_state) + goto out; + + switch (qhp->attr.state) { + case C4IW_QP_STATE_IDLE: + switch (attrs->next_state) { + case C4IW_QP_STATE_RTS: + if (!(mask & C4IW_QP_ATTR_LLP_STREAM_HANDLE)) { + ret = -EINVAL; + goto out; + } + if (!(mask & C4IW_QP_ATTR_MPA_ATTR)) { + ret = -EINVAL; + goto out; + } + qhp->attr.mpa_attr = attrs->mpa_attr; + qhp->attr.llp_stream_handle = attrs->llp_stream_handle; + qhp->ep = qhp->attr.llp_stream_handle; + set_state(qhp, C4IW_QP_STATE_RTS); + + /* + * Ref the endpoint here and deref when we + * disassociate the endpoint from the QP. This + * happens in CLOSING->IDLE transition or *->ERROR + * transition. + */ + c4iw_get_ep(&qhp->ep->com); + ret = rdma_init(rhp, qhp); + if (ret) + goto err; + break; + case C4IW_QP_STATE_ERROR: + set_state(qhp, C4IW_QP_STATE_ERROR); + flush_qp(qhp); + break; + default: + ret = -EINVAL; + goto out; + } + break; + case C4IW_QP_STATE_RTS: + switch (attrs->next_state) { + case C4IW_QP_STATE_CLOSING: + t4_set_wq_in_error(&qhp->wq); + set_state(qhp, C4IW_QP_STATE_CLOSING); + ep = qhp->ep; + if (!internal) { + abort = 0; + disconnect = 1; + c4iw_get_ep(&qhp->ep->com); + } + ret = rdma_fini(rhp, qhp, ep); + if (ret) + goto err; + break; + case C4IW_QP_STATE_TERMINATE: + t4_set_wq_in_error(&qhp->wq); + set_state(qhp, C4IW_QP_STATE_TERMINATE); + qhp->attr.layer_etype = attrs->layer_etype; + qhp->attr.ecode = attrs->ecode; + ep = qhp->ep; + if (!internal) { + c4iw_get_ep(&qhp->ep->com); + terminate = 1; + disconnect = 1; + } else { + terminate = qhp->attr.send_term; + ret = rdma_fini(rhp, qhp, ep); + if (ret) + goto err; + } + break; + case C4IW_QP_STATE_ERROR: + t4_set_wq_in_error(&qhp->wq); + set_state(qhp, C4IW_QP_STATE_ERROR); + if (!internal) { + abort = 1; + disconnect = 1; + ep = qhp->ep; + c4iw_get_ep(&qhp->ep->com); + } + goto err; + break; + default: + ret = -EINVAL; + goto out; + } + break; + case C4IW_QP_STATE_CLOSING: + + /* + * Allow kernel users to move to ERROR for qp draining. 
+ */ + if (!internal && (qhp->ibqp.uobject || attrs->next_state != + C4IW_QP_STATE_ERROR)) { + ret = -EINVAL; + goto out; + } + switch (attrs->next_state) { + case C4IW_QP_STATE_IDLE: + flush_qp(qhp); + set_state(qhp, C4IW_QP_STATE_IDLE); + qhp->attr.llp_stream_handle = NULL; + c4iw_put_ep(&qhp->ep->com); + qhp->ep = NULL; + wake_up(&qhp->wait); + break; + case C4IW_QP_STATE_ERROR: + goto err; + default: + ret = -EINVAL; + goto err; + } + break; + case C4IW_QP_STATE_ERROR: + if (attrs->next_state != C4IW_QP_STATE_IDLE) { + ret = -EINVAL; + goto out; + } + if (!t4_sq_empty(&qhp->wq) || !t4_rq_empty(&qhp->wq)) { + ret = -EINVAL; + goto out; + } + set_state(qhp, C4IW_QP_STATE_IDLE); + break; + case C4IW_QP_STATE_TERMINATE: + if (!internal) { + ret = -EINVAL; + goto out; + } + goto err; + break; + default: + pr_err("%s in a bad state %d\n", __func__, qhp->attr.state); + ret = -EINVAL; + goto err; + break; + } + goto out; +err: + pr_debug("disassociating ep %p qpid 0x%x\n", qhp->ep, + qhp->wq.sq.qid); + + /* disassociate the LLP connection */ + qhp->attr.llp_stream_handle = NULL; + if (!ep) + ep = qhp->ep; + qhp->ep = NULL; + set_state(qhp, C4IW_QP_STATE_ERROR); + free = 1; + abort = 1; + flush_qp(qhp); + wake_up(&qhp->wait); +out: + mutex_unlock(&qhp->mutex); + + if (terminate) + post_terminate(qhp, NULL, internal ? GFP_ATOMIC : GFP_KERNEL); + + /* + * If disconnect is 1, then we need to initiate a disconnect + * on the EP. This can be a normal close (RTS->CLOSING) or + * an abnormal close (RTS/CLOSING->ERROR). + */ + if (disconnect) { + c4iw_ep_disconnect(ep, abort, internal ? GFP_ATOMIC : + GFP_KERNEL); + c4iw_put_ep(&ep->com); + } + + /* + * If free is 1, then we've disassociated the EP from the QP + * and we need to dereference the EP. + */ + if (free) + c4iw_put_ep(&ep->com); + pr_debug("exit state %d\n", qhp->attr.state); + return ret; +} + +int c4iw_destroy_qp(struct ib_qp *ib_qp) +{ + struct c4iw_dev *rhp; + struct c4iw_qp *qhp; + struct c4iw_qp_attributes attrs; + + qhp = to_c4iw_qp(ib_qp); + rhp = qhp->rhp; + + attrs.next_state = C4IW_QP_STATE_ERROR; + if (qhp->attr.state == C4IW_QP_STATE_TERMINATE) + c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + else + c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0); + wait_event(qhp->wait, !qhp->ep); + + remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); + + spin_lock_irq(&rhp->lock); + if (!list_empty(&qhp->db_fc_entry)) + list_del_init(&qhp->db_fc_entry); + spin_unlock_irq(&rhp->lock); + free_ird(rhp, qhp->attr.max_ird); + + c4iw_qp_rem_ref(ib_qp); + + pr_debug("ib_qp %p qpid 0x%0x\n", ib_qp, qhp->wq.sq.qid); + return 0; +} + +struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct c4iw_dev *rhp; + struct c4iw_qp *qhp; + struct c4iw_pd *php; + struct c4iw_cq *schp; + struct c4iw_cq *rchp; + struct c4iw_create_qp_resp uresp; + unsigned int sqsize, rqsize; + struct c4iw_ucontext *ucontext; + int ret; + struct c4iw_mm_entry *sq_key_mm, *rq_key_mm = NULL, *sq_db_key_mm; + struct c4iw_mm_entry *rq_db_key_mm = NULL, *ma_sync_key_mm = NULL; + + pr_debug("ib_pd %p\n", pd); + + if (attrs->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + schp = get_chp(rhp, ((struct c4iw_cq *)attrs->send_cq)->cq.cqid); + rchp = get_chp(rhp, ((struct c4iw_cq *)attrs->recv_cq)->cq.cqid); + if (!schp || !rchp) + return ERR_PTR(-EINVAL); + + if (attrs->cap.max_inline_data > T4_MAX_SEND_INLINE) + return ERR_PTR(-EINVAL); + + if 
(attrs->cap.max_recv_wr > rhp->rdev.hw_queue.t4_max_rq_size) + return ERR_PTR(-E2BIG); + rqsize = attrs->cap.max_recv_wr + 1; + if (rqsize < 8) + rqsize = 8; + + if (attrs->cap.max_send_wr > rhp->rdev.hw_queue.t4_max_sq_size) + return ERR_PTR(-E2BIG); + sqsize = attrs->cap.max_send_wr + 1; + if (sqsize < 8) + sqsize = 8; + + ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL; + + qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); + if (!qhp) + return ERR_PTR(-ENOMEM); + + qhp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL); + if (!qhp->wr_waitp) { + ret = -ENOMEM; + goto err_free_qhp; + } + + qhp->wq.sq.size = sqsize; + qhp->wq.sq.memsize = + (sqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * + sizeof(*qhp->wq.sq.queue) + 16 * sizeof(__be64); + qhp->wq.sq.flush_cidx = -1; + qhp->wq.rq.size = rqsize; + qhp->wq.rq.memsize = + (rqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * + sizeof(*qhp->wq.rq.queue); + + if (ucontext) { + qhp->wq.sq.memsize = roundup(qhp->wq.sq.memsize, PAGE_SIZE); + qhp->wq.rq.memsize = roundup(qhp->wq.rq.memsize, PAGE_SIZE); + } + + ret = create_qp(&rhp->rdev, &qhp->wq, &schp->cq, &rchp->cq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx, + qhp->wr_waitp); + if (ret) + goto err_free_wr_wait; + + attrs->cap.max_recv_wr = rqsize - 1; + attrs->cap.max_send_wr = sqsize - 1; + attrs->cap.max_inline_data = T4_MAX_SEND_INLINE; + + qhp->rhp = rhp; + qhp->attr.pd = php->pdid; + qhp->attr.scq = ((struct c4iw_cq *) attrs->send_cq)->cq.cqid; + qhp->attr.rcq = ((struct c4iw_cq *) attrs->recv_cq)->cq.cqid; + qhp->attr.sq_num_entries = attrs->cap.max_send_wr; + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.sq_max_sges = attrs->cap.max_send_sge; + qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + qhp->attr.state = C4IW_QP_STATE_IDLE; + qhp->attr.next_state = C4IW_QP_STATE_IDLE; + qhp->attr.enable_rdma_read = 1; + qhp->attr.enable_rdma_write = 1; + qhp->attr.enable_bind = 1; + qhp->attr.max_ord = 0; + qhp->attr.max_ird = 0; + qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR; + spin_lock_init(&qhp->lock); + mutex_init(&qhp->mutex); + init_waitqueue_head(&qhp->wait); + kref_init(&qhp->kref); + INIT_WORK(&qhp->free_work, free_qp_work); + + ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid); + if (ret) + goto err_destroy_qp; + + if (udata && ucontext) { + sq_key_mm = kmalloc(sizeof(*sq_key_mm), GFP_KERNEL); + if (!sq_key_mm) { + ret = -ENOMEM; + goto err_remove_handle; + } + rq_key_mm = kmalloc(sizeof(*rq_key_mm), GFP_KERNEL); + if (!rq_key_mm) { + ret = -ENOMEM; + goto err_free_sq_key; + } + sq_db_key_mm = kmalloc(sizeof(*sq_db_key_mm), GFP_KERNEL); + if (!sq_db_key_mm) { + ret = -ENOMEM; + goto err_free_rq_key; + } + rq_db_key_mm = kmalloc(sizeof(*rq_db_key_mm), GFP_KERNEL); + if (!rq_db_key_mm) { + ret = -ENOMEM; + goto err_free_sq_db_key; + } + if (t4_sq_onchip(&qhp->wq.sq)) { + ma_sync_key_mm = kmalloc(sizeof(*ma_sync_key_mm), + GFP_KERNEL); + if (!ma_sync_key_mm) { + ret = -ENOMEM; + goto err_free_rq_db_key; + } + uresp.flags = C4IW_QPF_ONCHIP; + } else + uresp.flags = 0; + uresp.qid_mask = rhp->rdev.qpmask; + uresp.sqid = qhp->wq.sq.qid; + uresp.sq_size = qhp->wq.sq.size; + uresp.sq_memsize = qhp->wq.sq.memsize; + uresp.rqid = qhp->wq.rq.qid; + uresp.rq_size = qhp->wq.rq.size; + uresp.rq_memsize = qhp->wq.rq.memsize; + spin_lock(&ucontext->mmap_lock); + if (ma_sync_key_mm) { + uresp.ma_sync_key = ucontext->key; + ucontext->key += PAGE_SIZE; + } else { + uresp.ma_sync_key = 0; + } 
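+		/*
+		 * Hand out one mmap key per region exposed to userspace:
+		 * the SQ and RQ queue memory and their db/gts doorbell pages.
+		 */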
+ uresp.sq_key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.rq_key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.sq_db_gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.rq_db_gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); + if (ret) + goto err_free_ma_sync_key; + sq_key_mm->key = uresp.sq_key; + sq_key_mm->addr = qhp->wq.sq.phys_addr; + sq_key_mm->len = PAGE_ALIGN(qhp->wq.sq.memsize); + insert_mmap(ucontext, sq_key_mm); + rq_key_mm->key = uresp.rq_key; + rq_key_mm->addr = virt_to_phys(qhp->wq.rq.queue); + rq_key_mm->len = PAGE_ALIGN(qhp->wq.rq.memsize); + insert_mmap(ucontext, rq_key_mm); + sq_db_key_mm->key = uresp.sq_db_gts_key; + sq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.sq.bar2_pa; + sq_db_key_mm->len = PAGE_SIZE; + insert_mmap(ucontext, sq_db_key_mm); + rq_db_key_mm->key = uresp.rq_db_gts_key; + rq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.rq.bar2_pa; + rq_db_key_mm->len = PAGE_SIZE; + insert_mmap(ucontext, rq_db_key_mm); + if (ma_sync_key_mm) { + ma_sync_key_mm->key = uresp.ma_sync_key; + ma_sync_key_mm->addr = + (pci_resource_start(rhp->rdev.lldi.pdev, 0) + + PCIE_MA_SYNC_A) & PAGE_MASK; + ma_sync_key_mm->len = PAGE_SIZE; + insert_mmap(ucontext, ma_sync_key_mm); + } + + c4iw_get_ucontext(ucontext); + qhp->ucontext = ucontext; + } + qhp->ibqp.qp_num = qhp->wq.sq.qid; + INIT_LIST_HEAD(&qhp->db_fc_entry); + pr_debug("sq id %u size %u memsize %zu num_entries %u rq id %u size %u memsize %zu num_entries %u\n", + qhp->wq.sq.qid, qhp->wq.sq.size, qhp->wq.sq.memsize, + attrs->cap.max_send_wr, qhp->wq.rq.qid, qhp->wq.rq.size, + qhp->wq.rq.memsize, attrs->cap.max_recv_wr); + return &qhp->ibqp; +err_free_ma_sync_key: + kfree(ma_sync_key_mm); +err_free_rq_db_key: + kfree(rq_db_key_mm); +err_free_sq_db_key: + kfree(sq_db_key_mm); +err_free_rq_key: + kfree(rq_key_mm); +err_free_sq_key: + kfree(sq_key_mm); +err_remove_handle: + remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); +err_destroy_qp: + destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); +err_free_wr_wait: + c4iw_put_wr_wait(qhp->wr_waitp); +err_free_qhp: + kfree(qhp); + return ERR_PTR(ret); +} + +int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct c4iw_dev *rhp; + struct c4iw_qp *qhp; + enum c4iw_qp_attr_mask mask = 0; + struct c4iw_qp_attributes attrs; + + pr_debug("ib_qp %p\n", ibqp); + + /* iwarp does not support the RTR state */ + if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR)) + attr_mask &= ~IB_QP_STATE; + + /* Make sure we still have something left to do */ + if (!attr_mask) + return 0; + + memset(&attrs, 0, sizeof attrs); + qhp = to_c4iw_qp(ibqp); + rhp = qhp->rhp; + + attrs.next_state = c4iw_convert_state(attr->qp_state); + attrs.enable_rdma_read = (attr->qp_access_flags & + IB_ACCESS_REMOTE_READ) ? 1 : 0; + attrs.enable_rdma_write = (attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; + + + mask |= (attr_mask & IB_QP_STATE) ? C4IW_QP_ATTR_NEXT_STATE : 0; + mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? + (C4IW_QP_ATTR_ENABLE_RDMA_READ | + C4IW_QP_ATTR_ENABLE_RDMA_WRITE | + C4IW_QP_ATTR_ENABLE_RDMA_BIND) : 0; + + /* + * Use SQ_PSN and RQ_PSN to pass in IDX_INC values for + * ringing the queue db when we're in DB_FULL mode. + * Only allow this on T4 devices. 
+ */ + attrs.sq_db_inc = attr->sq_psn; + attrs.rq_db_inc = attr->rq_psn; + mask |= (attr_mask & IB_QP_SQ_PSN) ? C4IW_QP_ATTR_SQ_DB : 0; + mask |= (attr_mask & IB_QP_RQ_PSN) ? C4IW_QP_ATTR_RQ_DB : 0; + if (!is_t4(to_c4iw_qp(ibqp)->rhp->rdev.lldi.adapter_type) && + (mask & (C4IW_QP_ATTR_SQ_DB|C4IW_QP_ATTR_RQ_DB))) + return -EINVAL; + + return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0); +} + +struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn) +{ + pr_debug("ib_dev %p qpn 0x%x\n", dev, qpn); + return (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn); +} + +int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + + memset(attr, 0, sizeof *attr); + memset(init_attr, 0, sizeof *init_attr); + attr->qp_state = to_ib_qp_state(qhp->attr.state); + init_attr->cap.max_send_wr = qhp->attr.sq_num_entries; + init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries; + init_attr->cap.max_send_sge = qhp->attr.sq_max_sges; + init_attr->cap.max_recv_sge = qhp->attr.sq_max_sges; + init_attr->cap.max_inline_data = T4_MAX_SEND_INLINE; + init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0; + return 0; +} diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c new file mode 100644 index 0000000..0ef25ae --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/resource.c @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+/* Crude resource management */
+#include <linux/spinlock.h>
+#include <linux/genalloc.h>
+#include <linux/ratelimit.h>
+#include "iw_cxgb4.h"
+
+static int c4iw_init_qid_table(struct c4iw_rdev *rdev)
+{
+	u32 i;
+
+	if (c4iw_id_table_alloc(&rdev->resource.qid_table,
+				rdev->lldi.vr->qp.start,
+				rdev->lldi.vr->qp.size,
+				rdev->lldi.vr->qp.size, 0))
+		return -ENOMEM;
+
+	for (i = rdev->lldi.vr->qp.start;
+	     i < rdev->lldi.vr->qp.start + rdev->lldi.vr->qp.size; i++)
+		if (!(i & rdev->qpmask))
+			c4iw_id_free(&rdev->resource.qid_table, i);
+	return 0;
+}
+
+/* nr_* must be power of 2 */
+int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid)
+{
+	int err = 0;
+	err = c4iw_id_table_alloc(&rdev->resource.tpt_table, 0, nr_tpt, 1,
+				  C4IW_ID_TABLE_F_RANDOM);
+	if (err)
+		goto tpt_err;
+	err = c4iw_init_qid_table(rdev);
+	if (err)
+		goto qid_err;
+	err = c4iw_id_table_alloc(&rdev->resource.pdid_table, 0,
+				  nr_pdid, 1, 0);
+	if (err)
+		goto pdid_err;
+	return 0;
+ pdid_err:
+	c4iw_id_table_free(&rdev->resource.qid_table);
+ qid_err:
+	c4iw_id_table_free(&rdev->resource.tpt_table);
+ tpt_err:
+	return -ENOMEM;
+}
+
+/*
+ * returns 0 if no resource available
+ */
+u32 c4iw_get_resource(struct c4iw_id_table *id_table)
+{
+	u32 entry;
+	entry = c4iw_id_alloc(id_table);
+	if (entry == (u32)(-1))
+		return 0;
+	return entry;
+}
+
+void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry)
+{
+	pr_debug("entry 0x%x\n", entry);
+	c4iw_id_free(id_table, entry);
+}
+
+u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
+{
+	struct c4iw_qid_list *entry;
+	u32 qid;
+	int i;
+
+	mutex_lock(&uctx->lock);
+	if (!list_empty(&uctx->cqids)) {
+		entry = list_entry(uctx->cqids.next, struct c4iw_qid_list,
+				   entry);
+		list_del(&entry->entry);
+		qid = entry->qid;
+		kfree(entry);
+	} else {
+		qid = c4iw_get_resource(&rdev->resource.qid_table);
+		if (!qid)
+			goto out;
+		mutex_lock(&rdev->stats.lock);
+		rdev->stats.qid.cur += rdev->qpmask + 1;
+		mutex_unlock(&rdev->stats.lock);
+		for (i = qid+1; i & rdev->qpmask; i++) {
+			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			if (!entry)
+				goto out;
+			entry->qid = i;
+			list_add_tail(&entry->entry, &uctx->cqids);
+		}
+
+		/*
+		 * now put the same ids on the qp list since they all
+		 * map to the same db/gts page.
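+		 * (each db/gts page backs qpmask + 1 consecutive qids)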
+ */ + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = qid; + list_add_tail(&entry->entry, &uctx->qpids); + for (i = qid+1; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + } +out: + mutex_unlock(&uctx->lock); + pr_debug("qid 0x%x\n", qid); + mutex_lock(&rdev->stats.lock); + if (rdev->stats.qid.cur > rdev->stats.qid.max) + rdev->stats.qid.max = rdev->stats.qid.cur; + mutex_unlock(&rdev->stats.lock); + return qid; +} + +void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + pr_debug("qid 0x%x\n", qid); + entry->qid = qid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->cqids); + mutex_unlock(&uctx->lock); +} + +u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + u32 qid; + int i; + + mutex_lock(&uctx->lock); + if (!list_empty(&uctx->qpids)) { + entry = list_entry(uctx->qpids.next, struct c4iw_qid_list, + entry); + list_del(&entry->entry); + qid = entry->qid; + kfree(entry); + } else { + qid = c4iw_get_resource(&rdev->resource.qid_table); + if (!qid) { + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.fail++; + mutex_unlock(&rdev->stats.lock); + goto out; + } + mutex_lock(&rdev->stats.lock); + rdev->stats.qid.cur += rdev->qpmask + 1; + mutex_unlock(&rdev->stats.lock); + for (i = qid+1; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + + /* + * now put the same ids on the cq list since they all + * map to the same db/gts page. + */ + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = qid; + list_add_tail(&entry->entry, &uctx->cqids); + for (i = qid; i & rdev->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + goto out; + entry->qid = i; + list_add_tail(&entry->entry, &uctx->cqids); + } + } +out: + mutex_unlock(&uctx->lock); + pr_debug("qid 0x%x\n", qid); + mutex_lock(&rdev->stats.lock); + if (rdev->stats.qid.cur > rdev->stats.qid.max) + rdev->stats.qid.max = rdev->stats.qid.cur; + mutex_unlock(&rdev->stats.lock); + return qid; +} + +void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid, + struct c4iw_dev_ucontext *uctx) +{ + struct c4iw_qid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + pr_debug("qid 0x%x\n", qid); + entry->qid = qid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->qpids); + mutex_unlock(&uctx->lock); +} + +void c4iw_destroy_resource(struct c4iw_resource *rscp) +{ + c4iw_id_table_free(&rscp->tpt_table); + c4iw_id_table_free(&rscp->qid_table); + c4iw_id_table_free(&rscp->pdid_table); +} + +/* + * PBL Memory Manager. Uses Linux generic allocator. 
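
The queue-ID helpers above hand out IDs in blocks: a single allocation from the global qid_table reserves rdev->qpmask + 1 consecutive IDs that all map to the same doorbell/GTS page, so c4iw_get_cqid() and c4iw_get_qpid() return one of them and park the rest on the per-ucontext cqids/qpids lists, where later requests are satisfied without touching the global table again. The standalone sketch below models that caching idea in plain C; the block size, the next_base counter and the function names are inventions for the example, not the driver's data structures.

/*
 * Standalone model of block-based ID allocation with a per-context
 * cache, loosely following the c4iw_get_cqid()/c4iw_put_cqid() idea.
 * Illustrative only: BLOCK, next_base and the list handling are
 * simplifications, not the driver's structures.
 */
#include <stdio.h>
#include <stdlib.h>

#define BLOCK 4                         /* models rdev->qpmask + 1 */

struct id_entry {
        unsigned int id;
        struct id_entry *next;
};

static struct id_entry *cache;          /* models uctx->cqids */
static unsigned int next_base;          /* models the global qid table */

static void cache_put(unsigned int id)
{
        struct id_entry *e = malloc(sizeof(*e));

        if (!e)
                return;
        e->id = id;
        e->next = cache;
        cache = e;
}

static unsigned int get_id(void)
{
        unsigned int base, i, id;

        if (cache) {                    /* reuse a cached sibling first */
                struct id_entry *e = cache;

                cache = e->next;
                id = e->id;
                free(e);
                return id;
        }
        base = next_base;               /* "allocate" an aligned block */
        next_base += BLOCK;
        for (i = base + 1; i < base + BLOCK; i++)
                cache_put(i);           /* park the siblings for later */
        return base;
}

int main(void)
{
        for (int i = 0; i < 6; i++)
                printf("got id %u\n", get_id());
        return 0;
}
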
+ */ + +#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ + +u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size) +{ + unsigned long addr = gen_pool_alloc(rdev->pbl_pool, size); + pr_debug("addr 0x%x size %d\n", (u32)addr, size); + mutex_lock(&rdev->stats.lock); + if (addr) { + rdev->stats.pbl.cur += roundup(size, 1 << MIN_PBL_SHIFT); + if (rdev->stats.pbl.cur > rdev->stats.pbl.max) + rdev->stats.pbl.max = rdev->stats.pbl.cur; + kref_get(&rdev->pbl_kref); + } else + rdev->stats.pbl.fail++; + mutex_unlock(&rdev->stats.lock); + return (u32)addr; +} + +static void destroy_pblpool(struct kref *kref) +{ + struct c4iw_rdev *rdev; + + rdev = container_of(kref, struct c4iw_rdev, pbl_kref); + gen_pool_destroy(rdev->pbl_pool); + complete(&rdev->pbl_compl); +} + +void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size) +{ + pr_debug("addr 0x%x size %d\n", addr, size); + mutex_lock(&rdev->stats.lock); + rdev->stats.pbl.cur -= roundup(size, 1 << MIN_PBL_SHIFT); + mutex_unlock(&rdev->stats.lock); + gen_pool_free(rdev->pbl_pool, (unsigned long)addr, size); + kref_put(&rdev->pbl_kref, destroy_pblpool); +} + +int c4iw_pblpool_create(struct c4iw_rdev *rdev) +{ + unsigned pbl_start, pbl_chunk, pbl_top; + + rdev->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); + if (!rdev->pbl_pool) + return -ENOMEM; + + pbl_start = rdev->lldi.vr->pbl.start; + pbl_chunk = rdev->lldi.vr->pbl.size; + pbl_top = pbl_start + pbl_chunk; + + while (pbl_start < pbl_top) { + pbl_chunk = min(pbl_top - pbl_start + 1, pbl_chunk); + if (gen_pool_add(rdev->pbl_pool, pbl_start, pbl_chunk, -1)) { + pr_debug("failed to add PBL chunk (%x/%x)\n", + pbl_start, pbl_chunk); + if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) { + pr_warn("Failed to add all PBL chunks (%x/%x)\n", + pbl_start, pbl_top - pbl_start); + return 0; + } + pbl_chunk >>= 1; + } else { + pr_debug("added PBL chunk (%x/%x)\n", + pbl_start, pbl_chunk); + pbl_start += pbl_chunk; + } + } + + return 0; +} + +void c4iw_pblpool_destroy(struct c4iw_rdev *rdev) +{ + kref_put(&rdev->pbl_kref, destroy_pblpool); +} + +/* + * RQT Memory Manager. Uses Linux generic allocator. 
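
c4iw_pblpool_create() above seeds the PBL gen_pool by attempting to add the whole region as one chunk and halving the chunk size whenever gen_pool_add() fails, giving up only when the chunk drops to the minimum. The fragment below reproduces just that halving loop against a stand-in add_region() callback; the artificial failure threshold and the helper names are made up so the behaviour is observable outside the kernel.

/*
 * Standalone model of the "add region, halve the chunk on failure"
 * loop used by c4iw_pblpool_create()/c4iw_rqtpool_create().
 * add_region() is a stand-in for gen_pool_add(); here it simply
 * refuses chunks larger than a fake limit so the halving is visible.
 */
#include <stdio.h>

#define FAKE_LIMIT (64 * 1024)  /* pretend the allocator rejects bigger adds */

static int add_region(unsigned long start, unsigned long size)
{
        (void)start;
        return size > FAKE_LIMIT ? -1 : 0;  /* 0 on success, like gen_pool_add() */
}

static int seed_pool(unsigned long start, unsigned long size,
                     unsigned long min_chunk)
{
        unsigned long top = start + size;
        unsigned long chunk = size;

        while (start < top) {
                if (chunk > top - start)
                        chunk = top - start;    /* last partial chunk */
                if (add_region(start, chunk)) {
                        if (chunk <= min_chunk)
                                return -1;      /* give up, region only partly added */
                        chunk >>= 1;            /* retry with a smaller chunk */
                } else {
                        printf("added 0x%lx + 0x%lx\n", start, chunk);
                        start += chunk;
                }
        }
        return 0;
}

int main(void)
{
        return seed_pool(0x100000, 1 << 20, 256) ? 1 : 0;
}
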
+ */ + +#define MIN_RQT_SHIFT 10 /* 1KB == min RQT size (16 entries) */ + +u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size) +{ + unsigned long addr = gen_pool_alloc(rdev->rqt_pool, size << 6); + pr_debug("addr 0x%x size %d\n", (u32)addr, size << 6); + if (!addr) + pr_warn_ratelimited("%s: Out of RQT memory\n", + pci_name(rdev->lldi.pdev)); + mutex_lock(&rdev->stats.lock); + if (addr) { + rdev->stats.rqt.cur += roundup(size << 6, 1 << MIN_RQT_SHIFT); + if (rdev->stats.rqt.cur > rdev->stats.rqt.max) + rdev->stats.rqt.max = rdev->stats.rqt.cur; + kref_get(&rdev->rqt_kref); + } else + rdev->stats.rqt.fail++; + mutex_unlock(&rdev->stats.lock); + return (u32)addr; +} + +static void destroy_rqtpool(struct kref *kref) +{ + struct c4iw_rdev *rdev; + + rdev = container_of(kref, struct c4iw_rdev, rqt_kref); + gen_pool_destroy(rdev->rqt_pool); + complete(&rdev->rqt_compl); +} + +void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size) +{ + pr_debug("addr 0x%x size %d\n", addr, size << 6); + mutex_lock(&rdev->stats.lock); + rdev->stats.rqt.cur -= roundup(size << 6, 1 << MIN_RQT_SHIFT); + mutex_unlock(&rdev->stats.lock); + gen_pool_free(rdev->rqt_pool, (unsigned long)addr, size << 6); + kref_put(&rdev->rqt_kref, destroy_rqtpool); +} + +int c4iw_rqtpool_create(struct c4iw_rdev *rdev) +{ + unsigned rqt_start, rqt_chunk, rqt_top; + + rdev->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1); + if (!rdev->rqt_pool) + return -ENOMEM; + + rqt_start = rdev->lldi.vr->rq.start; + rqt_chunk = rdev->lldi.vr->rq.size; + rqt_top = rqt_start + rqt_chunk; + + while (rqt_start < rqt_top) { + rqt_chunk = min(rqt_top - rqt_start + 1, rqt_chunk); + if (gen_pool_add(rdev->rqt_pool, rqt_start, rqt_chunk, -1)) { + pr_debug("failed to add RQT chunk (%x/%x)\n", + rqt_start, rqt_chunk); + if (rqt_chunk <= 1024 << MIN_RQT_SHIFT) { + pr_warn("Failed to add all RQT chunks (%x/%x)\n", + rqt_start, rqt_top - rqt_start); + return 0; + } + rqt_chunk >>= 1; + } else { + pr_debug("added RQT chunk (%x/%x)\n", + rqt_start, rqt_chunk); + rqt_start += rqt_chunk; + } + } + return 0; +} + +void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev) +{ + kref_put(&rdev->rqt_kref, destroy_rqtpool); +} + +/* + * On-Chip QP Memory. 
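
The RQT allocator above takes an entry count but works in bytes: each RQT entry is 64 bytes (hence the size << 6, matching the "1KB == min RQT size (16 entries)" comment), and usage statistics are rounded up to the pool granularity of 1 << MIN_RQT_SHIFT so cur/max reflect what the pool actually consumed. Below is a minimal, self-contained version of that accounting; the stats structure and helper names are invented for the illustration.

/*
 * Minimal model of the RQT usage accounting: convert an entry count
 * to bytes (64 bytes per RQT entry), round up to the pool granularity
 * and track cur/max/fail. The struct and helpers are illustrative.
 */
#include <stdio.h>

#define RQT_ENTRY_BYTES 64
#define MIN_RQT_SHIFT   10              /* pool granularity: 1 KB */

struct pool_stats {
        unsigned long cur, max, fail;
};

static unsigned long round_up_pow2(unsigned long v, unsigned long align)
{
        return (v + align - 1) & ~(align - 1);  /* align must be a power of two */
}

static void account_alloc(struct pool_stats *st, unsigned int entries, int ok)
{
        unsigned long bytes = (unsigned long)entries * RQT_ENTRY_BYTES;

        if (!ok) {
                st->fail++;
                return;
        }
        st->cur += round_up_pow2(bytes, 1UL << MIN_RQT_SHIFT);
        if (st->cur > st->max)
                st->max = st->cur;
}

int main(void)
{
        struct pool_stats st = { 0 };

        account_alloc(&st, 16, 1);      /* 16 entries -> 1024 bytes -> 1 KB */
        account_alloc(&st, 17, 1);      /* 1088 bytes rounds up to 2 KB */
        printf("cur=%lu max=%lu fail=%lu\n", st.cur, st.max, st.fail);
        return 0;
}
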
+ */ +#define MIN_OCQP_SHIFT 12 /* 4KB == min ocqp size */ + +u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size) +{ + unsigned long addr = gen_pool_alloc(rdev->ocqp_pool, size); + pr_debug("addr 0x%x size %d\n", (u32)addr, size); + if (addr) { + mutex_lock(&rdev->stats.lock); + rdev->stats.ocqp.cur += roundup(size, 1 << MIN_OCQP_SHIFT); + if (rdev->stats.ocqp.cur > rdev->stats.ocqp.max) + rdev->stats.ocqp.max = rdev->stats.ocqp.cur; + mutex_unlock(&rdev->stats.lock); + } + return (u32)addr; +} + +void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size) +{ + pr_debug("addr 0x%x size %d\n", addr, size); + mutex_lock(&rdev->stats.lock); + rdev->stats.ocqp.cur -= roundup(size, 1 << MIN_OCQP_SHIFT); + mutex_unlock(&rdev->stats.lock); + gen_pool_free(rdev->ocqp_pool, (unsigned long)addr, size); +} + +int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev) +{ + unsigned start, chunk, top; + + rdev->ocqp_pool = gen_pool_create(MIN_OCQP_SHIFT, -1); + if (!rdev->ocqp_pool) + return -ENOMEM; + + start = rdev->lldi.vr->ocq.start; + chunk = rdev->lldi.vr->ocq.size; + top = start + chunk; + + while (start < top) { + chunk = min(top - start + 1, chunk); + if (gen_pool_add(rdev->ocqp_pool, start, chunk, -1)) { + pr_debug("failed to add OCQP chunk (%x/%x)\n", + start, chunk); + if (chunk <= 1024 << MIN_OCQP_SHIFT) { + pr_warn("Failed to add all OCQP chunks (%x/%x)\n", + start, top - start); + return 0; + } + chunk >>= 1; + } else { + pr_debug("added OCQP chunk (%x/%x)\n", + start, chunk); + start += chunk; + } + } + return 0; +} + +void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev) +{ + gen_pool_destroy(rdev->ocqp_pool); +} diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h new file mode 100644 index 0000000..8369c7c --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __T4_H__ +#define __T4_H__ + +#include "t4_hw.h" +#include "t4_regs.h" +#include "t4_values.h" +#include "t4_msg.h" +#include "t4fw_ri_api.h" + +#define T4_MAX_NUM_PD 65536 +#define T4_MAX_MR_SIZE (~0ULL) +#define T4_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */ +#define T4_STAG_UNSET 0xffffffff +#define T4_FW_MAJ 0 +#define PCIE_MA_SYNC_A 0x30b4 + +struct t4_status_page { + __be32 rsvd1; /* flit 0 - hw owns */ + __be16 rsvd2; + __be16 qid; + __be16 cidx; + __be16 pidx; + u8 qp_err; /* flit 1 - sw owns */ + u8 db_off; + u8 pad; + u16 host_wq_pidx; + u16 host_cidx; + u16 host_pidx; +}; + +#define T4_EQ_ENTRY_SIZE 64 + +#define T4_SQ_NUM_SLOTS 5 +#define T4_SQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_SQ_NUM_SLOTS) +#define T4_MAX_SEND_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \ + sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge)) +#define T4_MAX_SEND_INLINE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \ + sizeof(struct fw_ri_immd))) +#define T4_MAX_WRITE_INLINE ((T4_SQ_NUM_BYTES - \ + sizeof(struct fw_ri_rdma_write_wr) - \ + sizeof(struct fw_ri_immd))) +#define T4_MAX_WRITE_SGE ((T4_SQ_NUM_BYTES - \ + sizeof(struct fw_ri_rdma_write_wr) - \ + sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge)) +#define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \ + sizeof(struct fw_ri_immd)) & ~31UL) +#define T4_MAX_FR_IMMD_DEPTH (T4_MAX_FR_IMMD / sizeof(u64)) +#define T4_MAX_FR_DSGL 1024 +#define T4_MAX_FR_DSGL_DEPTH (T4_MAX_FR_DSGL / sizeof(u64)) + +static inline int t4_max_fr_depth(int use_dsgl) +{ + return use_dsgl ? T4_MAX_FR_DSGL_DEPTH : T4_MAX_FR_IMMD_DEPTH; +} + +#define T4_RQ_NUM_SLOTS 2 +#define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS) +#define T4_MAX_RECV_SGE 4 + +union t4_wr { + struct fw_ri_res_wr res; + struct fw_ri_wr ri; + struct fw_ri_rdma_write_wr write; + struct fw_ri_send_wr send; + struct fw_ri_rdma_read_wr read; + struct fw_ri_bind_mw_wr bind; + struct fw_ri_fr_nsmr_wr fr; + struct fw_ri_fr_nsmr_tpte_wr fr_tpte; + struct fw_ri_inv_lstag_wr inv; + struct t4_status_page status; + __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS]; +}; + +union t4_recv_wr { + struct fw_ri_recv_wr recv; + struct t4_status_page status; + __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_RQ_NUM_SLOTS]; +}; + +static inline void init_wr_hdr(union t4_wr *wqe, u16 wrid, + enum fw_wr_opcodes opcode, u8 flags, u8 len16) +{ + wqe->send.opcode = (u8)opcode; + wqe->send.flags = flags; + wqe->send.wrid = wrid; + wqe->send.r1[0] = 0; + wqe->send.r1[1] = 0; + wqe->send.r1[2] = 0; + wqe->send.len16 = len16; +} + +/* CQE/AE status codes */ +#define T4_ERR_SUCCESS 0x0 +#define T4_ERR_STAG 0x1 /* STAG invalid: either the */ + /* STAG is offlimt, being 0, */ + /* or STAG_key mismatch */ +#define T4_ERR_PDID 0x2 /* PDID mismatch */ +#define T4_ERR_QPID 0x3 /* QPID mismatch */ +#define T4_ERR_ACCESS 0x4 /* Invalid access right */ +#define T4_ERR_WRAP 0x5 /* Wrap error */ +#define T4_ERR_BOUND 0x6 /* base and bounds voilation */ +#define T4_ERR_INVALIDATE_SHARED_MR 0x7 /* attempt to invalidate a */ + /* shared memory region */ +#define T4_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8 /* attempt to invalidate a */ + /* shared memory region */ +#define T4_ERR_ECC 0x9 /* ECC error detected */ +#define T4_ERR_ECC_PSTAG 0xA /* ECC error detected when */ + /* reading PSTAG for a MW */ + /* Invalidate */ +#define T4_ERR_PBL_ADDR_BOUND 0xB /* pbl addr out of bounds: */ + /* software error */ +#define T4_ERR_SWFLUSH 0xC /* SW FLUSHED */ +#define T4_ERR_CRC 0x10 /* CRC 
error */ +#define T4_ERR_MARKER 0x11 /* Marker error */ +#define T4_ERR_PDU_LEN_ERR 0x12 /* invalid PDU length */ +#define T4_ERR_OUT_OF_RQE 0x13 /* out of RQE */ +#define T4_ERR_DDP_VERSION 0x14 /* wrong DDP version */ +#define T4_ERR_RDMA_VERSION 0x15 /* wrong RDMA version */ +#define T4_ERR_OPCODE 0x16 /* invalid rdma opcode */ +#define T4_ERR_DDP_QUEUE_NUM 0x17 /* invalid ddp queue number */ +#define T4_ERR_MSN 0x18 /* MSN error */ +#define T4_ERR_TBIT 0x19 /* tag bit not set correctly */ +#define T4_ERR_MO 0x1A /* MO not 0 for TERMINATE */ + /* or READ_REQ */ +#define T4_ERR_MSN_GAP 0x1B +#define T4_ERR_MSN_RANGE 0x1C +#define T4_ERR_IRD_OVERFLOW 0x1D +#define T4_ERR_RQE_ADDR_BOUND 0x1E /* RQE addr out of bounds: */ + /* software error */ +#define T4_ERR_INTERNAL_ERR 0x1F /* internal error (opcode */ + /* mismatch) */ +/* + * CQE defs + */ +struct t4_cqe { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + __be32 stag; + u16 nada2; + u16 cidx; + } scqe; + struct { + __be32 wrid_hi; + __be32 wrid_low; + } gen; + u64 drain_cookie; + } u; + __be64 reserved; + __be64 bits_type_ts; +}; + +/* macros for flit 0 of the cqe */ + +#define CQE_QPID_S 12 +#define CQE_QPID_M 0xFFFFF +#define CQE_QPID_G(x) ((((x) >> CQE_QPID_S)) & CQE_QPID_M) +#define CQE_QPID_V(x) ((x)<> CQE_SWCQE_S)) & CQE_SWCQE_M) +#define CQE_SWCQE_V(x) ((x)<> CQE_DRAIN_S)) & CQE_DRAIN_M) +#define CQE_DRAIN_V(x) ((x)<> CQE_STATUS_S)) & CQE_STATUS_M) +#define CQE_STATUS_V(x) ((x)<> CQE_TYPE_S)) & CQE_TYPE_M) +#define CQE_TYPE_V(x) ((x)<> CQE_OPCODE_S)) & CQE_OPCODE_M) +#define CQE_OPCODE_V(x) ((x)<header))) +#define DRAIN_CQE(x) (CQE_DRAIN_G(be32_to_cpu((x)->header))) +#define CQE_QPID(x) (CQE_QPID_G(be32_to_cpu((x)->header))) +#define CQE_TYPE(x) (CQE_TYPE_G(be32_to_cpu((x)->header))) +#define SQ_TYPE(x) (CQE_TYPE((x))) +#define RQ_TYPE(x) (!CQE_TYPE((x))) +#define CQE_STATUS(x) (CQE_STATUS_G(be32_to_cpu((x)->header))) +#define CQE_OPCODE(x) (CQE_OPCODE_G(be32_to_cpu((x)->header))) + +#define CQE_SEND_OPCODE(x)( \ + (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND) || \ + (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE) || \ + (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_INV) || \ + (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE_INV)) + +#define CQE_LEN(x) (be32_to_cpu((x)->len)) + +/* used for RQ completion processing */ +#define CQE_WRID_STAG(x) (be32_to_cpu((x)->u.rcqe.stag)) +#define CQE_WRID_MSN(x) (be32_to_cpu((x)->u.rcqe.msn)) + +/* used for SQ completion processing */ +#define CQE_WRID_SQ_IDX(x) ((x)->u.scqe.cidx) +#define CQE_WRID_FR_STAG(x) (be32_to_cpu((x)->u.scqe.stag)) + +/* generic accessor macros */ +#define CQE_WRID_HI(x) (be32_to_cpu((x)->u.gen.wrid_hi)) +#define CQE_WRID_LOW(x) (be32_to_cpu((x)->u.gen.wrid_low)) +#define CQE_DRAIN_COOKIE(x) ((x)->u.drain_cookie) + +/* macros for flit 3 of the cqe */ +#define CQE_GENBIT_S 63 +#define CQE_GENBIT_M 0x1 +#define CQE_GENBIT_G(x) (((x) >> CQE_GENBIT_S) & CQE_GENBIT_M) +#define CQE_GENBIT_V(x) ((x)<> CQE_OVFBIT_S)) & CQE_OVFBIT_M) + +#define CQE_IQTYPE_S 60 +#define CQE_IQTYPE_M 0x3 +#define CQE_IQTYPE_G(x) ((((x) >> CQE_IQTYPE_S)) & CQE_IQTYPE_M) + +#define CQE_TS_M 0x0fffffffffffffffULL +#define CQE_TS_G(x) ((x) & CQE_TS_M) + +#define CQE_OVFBIT(x) ((unsigned)CQE_OVFBIT_G(be64_to_cpu((x)->bits_type_ts))) +#define CQE_GENBIT(x) ((unsigned)CQE_GENBIT_G(be64_to_cpu((x)->bits_type_ts))) +#define CQE_TS(x) (CQE_TS_G(be64_to_cpu((x)->bits_type_ts))) + +struct t4_swsqe { + 
u64 wr_id; + struct t4_cqe cqe; + int read_len; + int opcode; + int complete; + int signaled; + u16 idx; + int flushed; + ktime_t host_time; + u64 sge_ts; +}; + +static inline pgprot_t t4_pgprot_wc(pgprot_t prot) +{ +#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64) + return pgprot_writecombine(prot); +#else + return pgprot_noncached(prot); +#endif +} + +enum { + T4_SQ_ONCHIP = (1<<0), +}; + +struct t4_sq { + union t4_wr *queue; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + unsigned long phys_addr; + struct t4_swsqe *sw_sq; + struct t4_swsqe *oldest_read; + void __iomem *bar2_va; + u64 bar2_pa; + size_t memsize; + u32 bar2_qid; + u32 qid; + u16 in_use; + u16 size; + u16 cidx; + u16 pidx; + u16 wq_pidx; + u16 wq_pidx_inc; + u16 flags; + short flush_cidx; +}; + +struct t4_swrqe { + u64 wr_id; + ktime_t host_time; + u64 sge_ts; +}; + +struct t4_rq { + union t4_recv_wr *queue; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + struct t4_swrqe *sw_rq; + void __iomem *bar2_va; + u64 bar2_pa; + size_t memsize; + u32 bar2_qid; + u32 qid; + u32 msn; + u32 rqt_hwaddr; + u16 rqt_size; + u16 in_use; + u16 size; + u16 cidx; + u16 pidx; + u16 wq_pidx; + u16 wq_pidx_inc; +}; + +struct t4_wq { + struct t4_sq sq; + struct t4_rq rq; + void __iomem *db; + struct c4iw_rdev *rdev; + int flushed; +}; + +static inline int t4_rqes_posted(struct t4_wq *wq) +{ + return wq->rq.in_use; +} + +static inline int t4_rq_empty(struct t4_wq *wq) +{ + return wq->rq.in_use == 0; +} + +static inline int t4_rq_full(struct t4_wq *wq) +{ + return wq->rq.in_use == (wq->rq.size - 1); +} + +static inline u32 t4_rq_avail(struct t4_wq *wq) +{ + return wq->rq.size - 1 - wq->rq.in_use; +} + +static inline void t4_rq_produce(struct t4_wq *wq, u8 len16) +{ + wq->rq.in_use++; + if (++wq->rq.pidx == wq->rq.size) + wq->rq.pidx = 0; + wq->rq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + if (wq->rq.wq_pidx >= wq->rq.size * T4_RQ_NUM_SLOTS) + wq->rq.wq_pidx %= wq->rq.size * T4_RQ_NUM_SLOTS; +} + +static inline void t4_rq_consume(struct t4_wq *wq) +{ + wq->rq.in_use--; + wq->rq.msn++; + if (++wq->rq.cidx == wq->rq.size) + wq->rq.cidx = 0; +} + +static inline u16 t4_rq_host_wq_pidx(struct t4_wq *wq) +{ + return wq->rq.queue[wq->rq.size].status.host_wq_pidx; +} + +static inline u16 t4_rq_wq_size(struct t4_wq *wq) +{ + return wq->rq.size * T4_RQ_NUM_SLOTS; +} + +static inline int t4_sq_onchip(struct t4_sq *sq) +{ + return sq->flags & T4_SQ_ONCHIP; +} + +static inline int t4_sq_empty(struct t4_wq *wq) +{ + return wq->sq.in_use == 0; +} + +static inline int t4_sq_full(struct t4_wq *wq) +{ + return wq->sq.in_use == (wq->sq.size - 1); +} + +static inline u32 t4_sq_avail(struct t4_wq *wq) +{ + return wq->sq.size - 1 - wq->sq.in_use; +} + +static inline void t4_sq_produce(struct t4_wq *wq, u8 len16) +{ + wq->sq.in_use++; + if (++wq->sq.pidx == wq->sq.size) + wq->sq.pidx = 0; + wq->sq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + if (wq->sq.wq_pidx >= wq->sq.size * T4_SQ_NUM_SLOTS) + wq->sq.wq_pidx %= wq->sq.size * T4_SQ_NUM_SLOTS; +} + +static inline void t4_sq_consume(struct t4_wq *wq) +{ + if (wq->sq.cidx == wq->sq.flush_cidx) + wq->sq.flush_cidx = -1; + wq->sq.in_use--; + if (++wq->sq.cidx == wq->sq.size) + wq->sq.cidx = 0; +} + +static inline u16 t4_sq_host_wq_pidx(struct t4_wq *wq) +{ + return wq->sq.queue[wq->sq.size].status.host_wq_pidx; +} + +static inline u16 t4_sq_wq_size(struct t4_wq *wq) +{ + return wq->sq.size * T4_SQ_NUM_SLOTS; +} + +/* This function copies 64 byte coalesced work 
request to memory + * mapped BAR2 space. For coalesced WRs, the SGE fetches data + * from the FIFO instead of from Host. + */ +static inline void pio_copy(u64 __iomem *dst, u64 *src) +{ + int count = 8; + + while (count) { + writeq(*src, dst); + src++; + dst++; + count--; + } +} + +static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, union t4_wr *wqe) +{ + + /* Flush host queue memory writes. */ + wmb(); + if (wq->sq.bar2_va) { + if (inc == 1 && wq->sq.bar2_qid == 0 && wqe) { + pr_debug("WC wq->sq.pidx = %d\n", wq->sq.pidx); + pio_copy((u64 __iomem *) + (wq->sq.bar2_va + SGE_UDB_WCDOORBELL), + (u64 *)wqe); + } else { + pr_debug("DB wq->sq.pidx = %d\n", wq->sq.pidx); + writel(PIDX_T5_V(inc) | QID_V(wq->sq.bar2_qid), + wq->sq.bar2_va + SGE_UDB_KDOORBELL); + } + + /* Flush user doorbell area writes. */ + wmb(); + return; + } + writel(QID_V(wq->sq.qid) | PIDX_V(inc), wq->db); +} + +static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, + union t4_recv_wr *wqe) +{ + + /* Flush host queue memory writes. */ + wmb(); + if (wq->rq.bar2_va) { + if (inc == 1 && wq->rq.bar2_qid == 0 && wqe) { + pr_debug("WC wq->rq.pidx = %d\n", wq->rq.pidx); + pio_copy((u64 __iomem *) + (wq->rq.bar2_va + SGE_UDB_WCDOORBELL), + (void *)wqe); + } else { + pr_debug("DB wq->rq.pidx = %d\n", wq->rq.pidx); + writel(PIDX_T5_V(inc) | QID_V(wq->rq.bar2_qid), + wq->rq.bar2_va + SGE_UDB_KDOORBELL); + } + + /* Flush user doorbell area writes. */ + wmb(); + return; + } + writel(QID_V(wq->rq.qid) | PIDX_V(inc), wq->db); +} + +static inline int t4_wq_in_error(struct t4_wq *wq) +{ + return wq->rq.queue[wq->rq.size].status.qp_err; +} + +static inline void t4_set_wq_in_error(struct t4_wq *wq) +{ + wq->rq.queue[wq->rq.size].status.qp_err = 1; +} + +static inline void t4_disable_wq_db(struct t4_wq *wq) +{ + wq->rq.queue[wq->rq.size].status.db_off = 1; +} + +static inline void t4_enable_wq_db(struct t4_wq *wq) +{ + wq->rq.queue[wq->rq.size].status.db_off = 0; +} + +static inline int t4_wq_db_enabled(struct t4_wq *wq) +{ + return !wq->rq.queue[wq->rq.size].status.db_off; +} + +enum t4_cq_flags { + CQ_ARMED = 1, +}; + +struct t4_cq { + struct t4_cqe *queue; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + struct t4_cqe *sw_queue; + void __iomem *gts; + void __iomem *bar2_va; + u64 bar2_pa; + u32 bar2_qid; + struct c4iw_rdev *rdev; + size_t memsize; + __be64 bits_type_ts; + u32 cqid; + u32 qid_mask; + int vector; + u16 size; /* including status page */ + u16 cidx; + u16 sw_pidx; + u16 sw_cidx; + u16 sw_in_use; + u16 cidx_inc; + u8 gen; + u8 error; + unsigned long flags; +}; + +static inline void write_gts(struct t4_cq *cq, u32 val) +{ + if (cq->bar2_va) + writel(val | INGRESSQID_V(cq->bar2_qid), + cq->bar2_va + SGE_UDB_GTS); + else + writel(val | INGRESSQID_V(cq->cqid), cq->gts); +} + +static inline int t4_clear_cq_armed(struct t4_cq *cq) +{ + return test_and_clear_bit(CQ_ARMED, &cq->flags); +} + +static inline int t4_arm_cq(struct t4_cq *cq, int se) +{ + u32 val; + + set_bit(CQ_ARMED, &cq->flags); + while (cq->cidx_inc > CIDXINC_M) { + val = SEINTARM_V(0) | CIDXINC_V(CIDXINC_M) | TIMERREG_V(7); + write_gts(cq, val); + cq->cidx_inc -= CIDXINC_M; + } + val = SEINTARM_V(se) | CIDXINC_V(cq->cidx_inc) | TIMERREG_V(6); + write_gts(cq, val); + cq->cidx_inc = 0; + return 0; +} + +static inline void t4_swcq_produce(struct t4_cq *cq) +{ + cq->sw_in_use++; + if (cq->sw_in_use == cq->size) { + pr_warn("%s cxgb4 sw cq overflow cqid %u\n", + __func__, cq->cqid); + cq->error = 1; + cq->sw_in_use--; + return; + } + if (++cq->sw_pidx 
== cq->size) + cq->sw_pidx = 0; +} + +static inline void t4_swcq_consume(struct t4_cq *cq) +{ + cq->sw_in_use--; + if (++cq->sw_cidx == cq->size) + cq->sw_cidx = 0; +} + +static inline void t4_hwcq_consume(struct t4_cq *cq) +{ + cq->bits_type_ts = cq->queue[cq->cidx].bits_type_ts; + if (++cq->cidx_inc == (cq->size >> 4) || cq->cidx_inc == CIDXINC_M) { + u32 val; + + val = SEINTARM_V(0) | CIDXINC_V(cq->cidx_inc) | TIMERREG_V(7); + write_gts(cq, val); + cq->cidx_inc = 0; + } + if (++cq->cidx == cq->size) { + cq->cidx = 0; + cq->gen ^= 1; + } +} + +static inline int t4_valid_cqe(struct t4_cq *cq, struct t4_cqe *cqe) +{ + return (CQE_GENBIT(cqe) == cq->gen); +} + +static inline int t4_cq_notempty(struct t4_cq *cq) +{ + return cq->sw_in_use || t4_valid_cqe(cq, &cq->queue[cq->cidx]); +} + +static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe) +{ + int ret; + u16 prev_cidx; + + if (cq->cidx == 0) + prev_cidx = cq->size - 1; + else + prev_cidx = cq->cidx - 1; + + if (cq->queue[prev_cidx].bits_type_ts != cq->bits_type_ts) { + ret = -EOVERFLOW; + cq->error = 1; + pr_err("cq overflow cqid %u\n", cq->cqid); + } else if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) { + + /* Ensure CQE is flushed to memory */ + rmb(); + *cqe = &cq->queue[cq->cidx]; + ret = 0; + } else + ret = -ENODATA; + return ret; +} + +static inline struct t4_cqe *t4_next_sw_cqe(struct t4_cq *cq) +{ + if (cq->sw_in_use == cq->size) { + pr_warn("%s cxgb4 sw cq overflow cqid %u\n", + __func__, cq->cqid); + cq->error = 1; + return NULL; + } + if (cq->sw_in_use) + return &cq->sw_queue[cq->sw_cidx]; + return NULL; +} + +static inline int t4_next_cqe(struct t4_cq *cq, struct t4_cqe **cqe) +{ + int ret = 0; + + if (cq->error) + ret = -ENODATA; + else if (cq->sw_in_use) + *cqe = &cq->sw_queue[cq->sw_cidx]; + else + ret = t4_next_hw_cqe(cq, cqe); + return ret; +} + +static inline int t4_cq_in_error(struct t4_cq *cq) +{ + return ((struct t4_status_page *)&cq->queue[cq->size])->qp_err; +} + +static inline void t4_set_cq_in_error(struct t4_cq *cq) +{ + ((struct t4_status_page *)&cq->queue[cq->size])->qp_err = 1; +} +#endif + +struct t4_dev_status_page { + u8 db_off; + u8 pad1; + u16 pad2; + u32 pad3; + u64 qp_start; + u64 qp_size; + u64 cq_start; + u64 cq_size; +}; diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h new file mode 100644 index 0000000..58c531d --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -0,0 +1,768 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _T4FW_RI_API_H_ +#define _T4FW_RI_API_H_ + +#include "t4fw_api.h" + +enum fw_ri_wr_opcode { + FW_RI_RDMA_WRITE = 0x0, /* IETF RDMAP v1.0 ... */ + FW_RI_READ_REQ = 0x1, + FW_RI_READ_RESP = 0x2, + FW_RI_SEND = 0x3, + FW_RI_SEND_WITH_INV = 0x4, + FW_RI_SEND_WITH_SE = 0x5, + FW_RI_SEND_WITH_SE_INV = 0x6, + FW_RI_TERMINATE = 0x7, + FW_RI_RDMA_INIT = 0x8, /* CHELSIO RI specific ... */ + FW_RI_BIND_MW = 0x9, + FW_RI_FAST_REGISTER = 0xa, + FW_RI_LOCAL_INV = 0xb, + FW_RI_QP_MODIFY = 0xc, + FW_RI_BYPASS = 0xd, + FW_RI_RECEIVE = 0xe, + + FW_RI_SGE_EC_CR_RETURN = 0xf +}; + +enum fw_ri_wr_flags { + FW_RI_COMPLETION_FLAG = 0x01, + FW_RI_NOTIFICATION_FLAG = 0x02, + FW_RI_SOLICITED_EVENT_FLAG = 0x04, + FW_RI_READ_FENCE_FLAG = 0x08, + FW_RI_LOCAL_FENCE_FLAG = 0x10, + FW_RI_RDMA_READ_INVALIDATE = 0x20 +}; + +enum fw_ri_mpa_attrs { + FW_RI_MPA_RX_MARKER_ENABLE = 0x01, + FW_RI_MPA_TX_MARKER_ENABLE = 0x02, + FW_RI_MPA_CRC_ENABLE = 0x04, + FW_RI_MPA_IETF_ENABLE = 0x08 +}; + +enum fw_ri_qp_caps { + FW_RI_QP_RDMA_READ_ENABLE = 0x01, + FW_RI_QP_RDMA_WRITE_ENABLE = 0x02, + FW_RI_QP_BIND_ENABLE = 0x04, + FW_RI_QP_FAST_REGISTER_ENABLE = 0x08, + FW_RI_QP_STAG0_ENABLE = 0x10 +}; + +enum fw_ri_addr_type { + FW_RI_ZERO_BASED_TO = 0x00, + FW_RI_VA_BASED_TO = 0x01 +}; + +enum fw_ri_mem_perms { + FW_RI_MEM_ACCESS_REM_WRITE = 0x01, + FW_RI_MEM_ACCESS_REM_READ = 0x02, + FW_RI_MEM_ACCESS_REM = 0x03, + FW_RI_MEM_ACCESS_LOCAL_WRITE = 0x04, + FW_RI_MEM_ACCESS_LOCAL_READ = 0x08, + FW_RI_MEM_ACCESS_LOCAL = 0x0C +}; + +enum fw_ri_stag_type { + FW_RI_STAG_NSMR = 0x00, + FW_RI_STAG_SMR = 0x01, + FW_RI_STAG_MW = 0x02, + FW_RI_STAG_MW_RELAXED = 0x03 +}; + +enum fw_ri_data_op { + FW_RI_DATA_IMMD = 0x81, + FW_RI_DATA_DSGL = 0x82, + FW_RI_DATA_ISGL = 0x83 +}; + +enum fw_ri_sgl_depth { + FW_RI_SGL_DEPTH_MAX_SQ = 16, + FW_RI_SGL_DEPTH_MAX_RQ = 4 +}; + +struct fw_ri_dsge_pair { + __be32 len[2]; + __be64 addr[2]; +}; + +struct fw_ri_dsgl { + __u8 op; + __u8 r1; + __be16 nsge; + __be32 len0; + __be64 addr0; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_dsge_pair sge[0]; +#endif +}; + +struct fw_ri_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +struct fw_ri_isgl { + __u8 op; + __u8 r1; + __be16 nsge; + __be32 r2; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_sge sge[0]; +#endif +}; + +struct fw_ri_immd { + __u8 op; + __u8 r1; + __be16 r2; + __be32 immdlen; +#ifndef C99_NOT_SUPPORTED + __u8 data[0]; +#endif +}; + +struct fw_ri_tpte { + __be32 valid_to_pdid; + __be32 locread_to_qpid; + __be32 nosnoop_pbladdr; + __be32 len_lo; + __be32 va_hi; + __be32 va_lo_fbo; + __be32 dca_mwbcnt_pstag; + __be32 len_hi; +}; + +#define FW_RI_TPTE_VALID_S 31 +#define FW_RI_TPTE_VALID_M 0x1 +#define FW_RI_TPTE_VALID_V(x) ((x) << FW_RI_TPTE_VALID_S) +#define FW_RI_TPTE_VALID_G(x) \ + (((x) >> FW_RI_TPTE_VALID_S) & FW_RI_TPTE_VALID_M) +#define FW_RI_TPTE_VALID_F FW_RI_TPTE_VALID_V(1U) + +#define FW_RI_TPTE_STAGKEY_S 23 +#define FW_RI_TPTE_STAGKEY_M 0xff +#define FW_RI_TPTE_STAGKEY_V(x) ((x) << FW_RI_TPTE_STAGKEY_S) +#define FW_RI_TPTE_STAGKEY_G(x) \ + (((x) >> 
FW_RI_TPTE_STAGKEY_S) & FW_RI_TPTE_STAGKEY_M) + +#define FW_RI_TPTE_STAGSTATE_S 22 +#define FW_RI_TPTE_STAGSTATE_M 0x1 +#define FW_RI_TPTE_STAGSTATE_V(x) ((x) << FW_RI_TPTE_STAGSTATE_S) +#define FW_RI_TPTE_STAGSTATE_G(x) \ + (((x) >> FW_RI_TPTE_STAGSTATE_S) & FW_RI_TPTE_STAGSTATE_M) +#define FW_RI_TPTE_STAGSTATE_F FW_RI_TPTE_STAGSTATE_V(1U) + +#define FW_RI_TPTE_STAGTYPE_S 20 +#define FW_RI_TPTE_STAGTYPE_M 0x3 +#define FW_RI_TPTE_STAGTYPE_V(x) ((x) << FW_RI_TPTE_STAGTYPE_S) +#define FW_RI_TPTE_STAGTYPE_G(x) \ + (((x) >> FW_RI_TPTE_STAGTYPE_S) & FW_RI_TPTE_STAGTYPE_M) + +#define FW_RI_TPTE_PDID_S 0 +#define FW_RI_TPTE_PDID_M 0xfffff +#define FW_RI_TPTE_PDID_V(x) ((x) << FW_RI_TPTE_PDID_S) +#define FW_RI_TPTE_PDID_G(x) \ + (((x) >> FW_RI_TPTE_PDID_S) & FW_RI_TPTE_PDID_M) + +#define FW_RI_TPTE_PERM_S 28 +#define FW_RI_TPTE_PERM_M 0xf +#define FW_RI_TPTE_PERM_V(x) ((x) << FW_RI_TPTE_PERM_S) +#define FW_RI_TPTE_PERM_G(x) \ + (((x) >> FW_RI_TPTE_PERM_S) & FW_RI_TPTE_PERM_M) + +#define FW_RI_TPTE_REMINVDIS_S 27 +#define FW_RI_TPTE_REMINVDIS_M 0x1 +#define FW_RI_TPTE_REMINVDIS_V(x) ((x) << FW_RI_TPTE_REMINVDIS_S) +#define FW_RI_TPTE_REMINVDIS_G(x) \ + (((x) >> FW_RI_TPTE_REMINVDIS_S) & FW_RI_TPTE_REMINVDIS_M) +#define FW_RI_TPTE_REMINVDIS_F FW_RI_TPTE_REMINVDIS_V(1U) + +#define FW_RI_TPTE_ADDRTYPE_S 26 +#define FW_RI_TPTE_ADDRTYPE_M 1 +#define FW_RI_TPTE_ADDRTYPE_V(x) ((x) << FW_RI_TPTE_ADDRTYPE_S) +#define FW_RI_TPTE_ADDRTYPE_G(x) \ + (((x) >> FW_RI_TPTE_ADDRTYPE_S) & FW_RI_TPTE_ADDRTYPE_M) +#define FW_RI_TPTE_ADDRTYPE_F FW_RI_TPTE_ADDRTYPE_V(1U) + +#define FW_RI_TPTE_MWBINDEN_S 25 +#define FW_RI_TPTE_MWBINDEN_M 0x1 +#define FW_RI_TPTE_MWBINDEN_V(x) ((x) << FW_RI_TPTE_MWBINDEN_S) +#define FW_RI_TPTE_MWBINDEN_G(x) \ + (((x) >> FW_RI_TPTE_MWBINDEN_S) & FW_RI_TPTE_MWBINDEN_M) +#define FW_RI_TPTE_MWBINDEN_F FW_RI_TPTE_MWBINDEN_V(1U) + +#define FW_RI_TPTE_PS_S 20 +#define FW_RI_TPTE_PS_M 0x1f +#define FW_RI_TPTE_PS_V(x) ((x) << FW_RI_TPTE_PS_S) +#define FW_RI_TPTE_PS_G(x) \ + (((x) >> FW_RI_TPTE_PS_S) & FW_RI_TPTE_PS_M) + +#define FW_RI_TPTE_QPID_S 0 +#define FW_RI_TPTE_QPID_M 0xfffff +#define FW_RI_TPTE_QPID_V(x) ((x) << FW_RI_TPTE_QPID_S) +#define FW_RI_TPTE_QPID_G(x) \ + (((x) >> FW_RI_TPTE_QPID_S) & FW_RI_TPTE_QPID_M) + +#define FW_RI_TPTE_NOSNOOP_S 30 +#define FW_RI_TPTE_NOSNOOP_M 0x1 +#define FW_RI_TPTE_NOSNOOP_V(x) ((x) << FW_RI_TPTE_NOSNOOP_S) +#define FW_RI_TPTE_NOSNOOP_G(x) \ + (((x) >> FW_RI_TPTE_NOSNOOP_S) & FW_RI_TPTE_NOSNOOP_M) +#define FW_RI_TPTE_NOSNOOP_F FW_RI_TPTE_NOSNOOP_V(1U) + +#define FW_RI_TPTE_PBLADDR_S 0 +#define FW_RI_TPTE_PBLADDR_M 0x1fffffff +#define FW_RI_TPTE_PBLADDR_V(x) ((x) << FW_RI_TPTE_PBLADDR_S) +#define FW_RI_TPTE_PBLADDR_G(x) \ + (((x) >> FW_RI_TPTE_PBLADDR_S) & FW_RI_TPTE_PBLADDR_M) + +#define FW_RI_TPTE_DCA_S 24 +#define FW_RI_TPTE_DCA_M 0x1f +#define FW_RI_TPTE_DCA_V(x) ((x) << FW_RI_TPTE_DCA_S) +#define FW_RI_TPTE_DCA_G(x) \ + (((x) >> FW_RI_TPTE_DCA_S) & FW_RI_TPTE_DCA_M) + +#define FW_RI_TPTE_MWBCNT_PSTAG_S 0 +#define FW_RI_TPTE_MWBCNT_PSTAG_M 0xffffff +#define FW_RI_TPTE_MWBCNT_PSTAT_V(x) \ + ((x) << FW_RI_TPTE_MWBCNT_PSTAG_S) +#define FW_RI_TPTE_MWBCNT_PSTAG_G(x) \ + (((x) >> FW_RI_TPTE_MWBCNT_PSTAG_S) & FW_RI_TPTE_MWBCNT_PSTAG_M) + +enum fw_ri_res_type { + FW_RI_RES_TYPE_SQ, + FW_RI_RES_TYPE_RQ, + FW_RI_RES_TYPE_CQ, +}; + +enum fw_ri_res_op { + FW_RI_RES_OP_WRITE, + FW_RI_RES_OP_RESET, +}; + +struct fw_ri_res { + union fw_ri_restype { + struct fw_ri_res_sqrq { + __u8 restype; + __u8 op; + __be16 r3; + __be32 eqid; + __be32 r4[2]; + __be32 
fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; + } sqrq; + struct fw_ri_res_cq { + __u8 restype; + __u8 op; + __be16 r3; + __be32 iqid; + __be32 r4[2]; + __be32 iqandst_to_iqandstindex; + __be16 iqdroprss_to_iqesize; + __be16 iqsize; + __be64 iqaddr; + __be32 iqns_iqro; + __be32 r6_lo; + __be64 r7; + } cq; + } u; +}; + +struct fw_ri_res_wr { + __be32 op_nres; + __be32 len16_pkd; + __u64 cookie; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_res res[0]; +#endif +}; + +#define FW_RI_RES_WR_NRES_S 0 +#define FW_RI_RES_WR_NRES_M 0xff +#define FW_RI_RES_WR_NRES_V(x) ((x) << FW_RI_RES_WR_NRES_S) +#define FW_RI_RES_WR_NRES_G(x) \ + (((x) >> FW_RI_RES_WR_NRES_S) & FW_RI_RES_WR_NRES_M) + +#define FW_RI_RES_WR_FETCHSZM_S 26 +#define FW_RI_RES_WR_FETCHSZM_M 0x1 +#define FW_RI_RES_WR_FETCHSZM_V(x) ((x) << FW_RI_RES_WR_FETCHSZM_S) +#define FW_RI_RES_WR_FETCHSZM_G(x) \ + (((x) >> FW_RI_RES_WR_FETCHSZM_S) & FW_RI_RES_WR_FETCHSZM_M) +#define FW_RI_RES_WR_FETCHSZM_F FW_RI_RES_WR_FETCHSZM_V(1U) + +#define FW_RI_RES_WR_STATUSPGNS_S 25 +#define FW_RI_RES_WR_STATUSPGNS_M 0x1 +#define FW_RI_RES_WR_STATUSPGNS_V(x) ((x) << FW_RI_RES_WR_STATUSPGNS_S) +#define FW_RI_RES_WR_STATUSPGNS_G(x) \ + (((x) >> FW_RI_RES_WR_STATUSPGNS_S) & FW_RI_RES_WR_STATUSPGNS_M) +#define FW_RI_RES_WR_STATUSPGNS_F FW_RI_RES_WR_STATUSPGNS_V(1U) + +#define FW_RI_RES_WR_STATUSPGRO_S 24 +#define FW_RI_RES_WR_STATUSPGRO_M 0x1 +#define FW_RI_RES_WR_STATUSPGRO_V(x) ((x) << FW_RI_RES_WR_STATUSPGRO_S) +#define FW_RI_RES_WR_STATUSPGRO_G(x) \ + (((x) >> FW_RI_RES_WR_STATUSPGRO_S) & FW_RI_RES_WR_STATUSPGRO_M) +#define FW_RI_RES_WR_STATUSPGRO_F FW_RI_RES_WR_STATUSPGRO_V(1U) + +#define FW_RI_RES_WR_FETCHNS_S 23 +#define FW_RI_RES_WR_FETCHNS_M 0x1 +#define FW_RI_RES_WR_FETCHNS_V(x) ((x) << FW_RI_RES_WR_FETCHNS_S) +#define FW_RI_RES_WR_FETCHNS_G(x) \ + (((x) >> FW_RI_RES_WR_FETCHNS_S) & FW_RI_RES_WR_FETCHNS_M) +#define FW_RI_RES_WR_FETCHNS_F FW_RI_RES_WR_FETCHNS_V(1U) + +#define FW_RI_RES_WR_FETCHRO_S 22 +#define FW_RI_RES_WR_FETCHRO_M 0x1 +#define FW_RI_RES_WR_FETCHRO_V(x) ((x) << FW_RI_RES_WR_FETCHRO_S) +#define FW_RI_RES_WR_FETCHRO_G(x) \ + (((x) >> FW_RI_RES_WR_FETCHRO_S) & FW_RI_RES_WR_FETCHRO_M) +#define FW_RI_RES_WR_FETCHRO_F FW_RI_RES_WR_FETCHRO_V(1U) + +#define FW_RI_RES_WR_HOSTFCMODE_S 20 +#define FW_RI_RES_WR_HOSTFCMODE_M 0x3 +#define FW_RI_RES_WR_HOSTFCMODE_V(x) ((x) << FW_RI_RES_WR_HOSTFCMODE_S) +#define FW_RI_RES_WR_HOSTFCMODE_G(x) \ + (((x) >> FW_RI_RES_WR_HOSTFCMODE_S) & FW_RI_RES_WR_HOSTFCMODE_M) + +#define FW_RI_RES_WR_CPRIO_S 19 +#define FW_RI_RES_WR_CPRIO_M 0x1 +#define FW_RI_RES_WR_CPRIO_V(x) ((x) << FW_RI_RES_WR_CPRIO_S) +#define FW_RI_RES_WR_CPRIO_G(x) \ + (((x) >> FW_RI_RES_WR_CPRIO_S) & FW_RI_RES_WR_CPRIO_M) +#define FW_RI_RES_WR_CPRIO_F FW_RI_RES_WR_CPRIO_V(1U) + +#define FW_RI_RES_WR_ONCHIP_S 18 +#define FW_RI_RES_WR_ONCHIP_M 0x1 +#define FW_RI_RES_WR_ONCHIP_V(x) ((x) << FW_RI_RES_WR_ONCHIP_S) +#define FW_RI_RES_WR_ONCHIP_G(x) \ + (((x) >> FW_RI_RES_WR_ONCHIP_S) & FW_RI_RES_WR_ONCHIP_M) +#define FW_RI_RES_WR_ONCHIP_F FW_RI_RES_WR_ONCHIP_V(1U) + +#define FW_RI_RES_WR_PCIECHN_S 16 +#define FW_RI_RES_WR_PCIECHN_M 0x3 +#define FW_RI_RES_WR_PCIECHN_V(x) ((x) << FW_RI_RES_WR_PCIECHN_S) +#define FW_RI_RES_WR_PCIECHN_G(x) \ + (((x) >> FW_RI_RES_WR_PCIECHN_S) & FW_RI_RES_WR_PCIECHN_M) + +#define FW_RI_RES_WR_IQID_S 0 +#define FW_RI_RES_WR_IQID_M 0xffff +#define FW_RI_RES_WR_IQID_V(x) ((x) << FW_RI_RES_WR_IQID_S) +#define FW_RI_RES_WR_IQID_G(x) \ + (((x) >> FW_RI_RES_WR_IQID_S) & FW_RI_RES_WR_IQID_M) + +#define 
FW_RI_RES_WR_DCAEN_S 31 +#define FW_RI_RES_WR_DCAEN_M 0x1 +#define FW_RI_RES_WR_DCAEN_V(x) ((x) << FW_RI_RES_WR_DCAEN_S) +#define FW_RI_RES_WR_DCAEN_G(x) \ + (((x) >> FW_RI_RES_WR_DCAEN_S) & FW_RI_RES_WR_DCAEN_M) +#define FW_RI_RES_WR_DCAEN_F FW_RI_RES_WR_DCAEN_V(1U) + +#define FW_RI_RES_WR_DCACPU_S 26 +#define FW_RI_RES_WR_DCACPU_M 0x1f +#define FW_RI_RES_WR_DCACPU_V(x) ((x) << FW_RI_RES_WR_DCACPU_S) +#define FW_RI_RES_WR_DCACPU_G(x) \ + (((x) >> FW_RI_RES_WR_DCACPU_S) & FW_RI_RES_WR_DCACPU_M) + +#define FW_RI_RES_WR_FBMIN_S 23 +#define FW_RI_RES_WR_FBMIN_M 0x7 +#define FW_RI_RES_WR_FBMIN_V(x) ((x) << FW_RI_RES_WR_FBMIN_S) +#define FW_RI_RES_WR_FBMIN_G(x) \ + (((x) >> FW_RI_RES_WR_FBMIN_S) & FW_RI_RES_WR_FBMIN_M) + +#define FW_RI_RES_WR_FBMAX_S 20 +#define FW_RI_RES_WR_FBMAX_M 0x7 +#define FW_RI_RES_WR_FBMAX_V(x) ((x) << FW_RI_RES_WR_FBMAX_S) +#define FW_RI_RES_WR_FBMAX_G(x) \ + (((x) >> FW_RI_RES_WR_FBMAX_S) & FW_RI_RES_WR_FBMAX_M) + +#define FW_RI_RES_WR_CIDXFTHRESHO_S 19 +#define FW_RI_RES_WR_CIDXFTHRESHO_M 0x1 +#define FW_RI_RES_WR_CIDXFTHRESHO_V(x) ((x) << FW_RI_RES_WR_CIDXFTHRESHO_S) +#define FW_RI_RES_WR_CIDXFTHRESHO_G(x) \ + (((x) >> FW_RI_RES_WR_CIDXFTHRESHO_S) & FW_RI_RES_WR_CIDXFTHRESHO_M) +#define FW_RI_RES_WR_CIDXFTHRESHO_F FW_RI_RES_WR_CIDXFTHRESHO_V(1U) + +#define FW_RI_RES_WR_CIDXFTHRESH_S 16 +#define FW_RI_RES_WR_CIDXFTHRESH_M 0x7 +#define FW_RI_RES_WR_CIDXFTHRESH_V(x) ((x) << FW_RI_RES_WR_CIDXFTHRESH_S) +#define FW_RI_RES_WR_CIDXFTHRESH_G(x) \ + (((x) >> FW_RI_RES_WR_CIDXFTHRESH_S) & FW_RI_RES_WR_CIDXFTHRESH_M) + +#define FW_RI_RES_WR_EQSIZE_S 0 +#define FW_RI_RES_WR_EQSIZE_M 0xffff +#define FW_RI_RES_WR_EQSIZE_V(x) ((x) << FW_RI_RES_WR_EQSIZE_S) +#define FW_RI_RES_WR_EQSIZE_G(x) \ + (((x) >> FW_RI_RES_WR_EQSIZE_S) & FW_RI_RES_WR_EQSIZE_M) + +#define FW_RI_RES_WR_IQANDST_S 15 +#define FW_RI_RES_WR_IQANDST_M 0x1 +#define FW_RI_RES_WR_IQANDST_V(x) ((x) << FW_RI_RES_WR_IQANDST_S) +#define FW_RI_RES_WR_IQANDST_G(x) \ + (((x) >> FW_RI_RES_WR_IQANDST_S) & FW_RI_RES_WR_IQANDST_M) +#define FW_RI_RES_WR_IQANDST_F FW_RI_RES_WR_IQANDST_V(1U) + +#define FW_RI_RES_WR_IQANUS_S 14 +#define FW_RI_RES_WR_IQANUS_M 0x1 +#define FW_RI_RES_WR_IQANUS_V(x) ((x) << FW_RI_RES_WR_IQANUS_S) +#define FW_RI_RES_WR_IQANUS_G(x) \ + (((x) >> FW_RI_RES_WR_IQANUS_S) & FW_RI_RES_WR_IQANUS_M) +#define FW_RI_RES_WR_IQANUS_F FW_RI_RES_WR_IQANUS_V(1U) + +#define FW_RI_RES_WR_IQANUD_S 12 +#define FW_RI_RES_WR_IQANUD_M 0x3 +#define FW_RI_RES_WR_IQANUD_V(x) ((x) << FW_RI_RES_WR_IQANUD_S) +#define FW_RI_RES_WR_IQANUD_G(x) \ + (((x) >> FW_RI_RES_WR_IQANUD_S) & FW_RI_RES_WR_IQANUD_M) + +#define FW_RI_RES_WR_IQANDSTINDEX_S 0 +#define FW_RI_RES_WR_IQANDSTINDEX_M 0xfff +#define FW_RI_RES_WR_IQANDSTINDEX_V(x) ((x) << FW_RI_RES_WR_IQANDSTINDEX_S) +#define FW_RI_RES_WR_IQANDSTINDEX_G(x) \ + (((x) >> FW_RI_RES_WR_IQANDSTINDEX_S) & FW_RI_RES_WR_IQANDSTINDEX_M) + +#define FW_RI_RES_WR_IQDROPRSS_S 15 +#define FW_RI_RES_WR_IQDROPRSS_M 0x1 +#define FW_RI_RES_WR_IQDROPRSS_V(x) ((x) << FW_RI_RES_WR_IQDROPRSS_S) +#define FW_RI_RES_WR_IQDROPRSS_G(x) \ + (((x) >> FW_RI_RES_WR_IQDROPRSS_S) & FW_RI_RES_WR_IQDROPRSS_M) +#define FW_RI_RES_WR_IQDROPRSS_F FW_RI_RES_WR_IQDROPRSS_V(1U) + +#define FW_RI_RES_WR_IQGTSMODE_S 14 +#define FW_RI_RES_WR_IQGTSMODE_M 0x1 +#define FW_RI_RES_WR_IQGTSMODE_V(x) ((x) << FW_RI_RES_WR_IQGTSMODE_S) +#define FW_RI_RES_WR_IQGTSMODE_G(x) \ + (((x) >> FW_RI_RES_WR_IQGTSMODE_S) & FW_RI_RES_WR_IQGTSMODE_M) +#define FW_RI_RES_WR_IQGTSMODE_F FW_RI_RES_WR_IQGTSMODE_V(1U) + +#define FW_RI_RES_WR_IQPCIECH_S 12 
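
Every field in this header follows the same four-macro convention: _S is the bit shift, _M the right-aligned mask, _V(x) places a value into its position, _G(x) extracts it again, and _F is the pre-packed single-bit flag. The self-contained example below applies the convention to a made-up 32-bit word with a 4-bit TYPE field and a 12-bit INDEX field, just to show how packing and unpacking compose; the DEMO_* names are not part of the firmware interface.

/*
 * Illustration of the _S/_M/_V/_G macro convention used throughout
 * t4fw_ri_api.h, applied to a made-up word with a 4-bit TYPE field
 * at bit 12 and a 12-bit INDEX field at bit 0.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_TYPE_S     12
#define DEMO_TYPE_M     0xf
#define DEMO_TYPE_V(x)  ((x) << DEMO_TYPE_S)
#define DEMO_TYPE_G(x)  (((x) >> DEMO_TYPE_S) & DEMO_TYPE_M)

#define DEMO_INDEX_S    0
#define DEMO_INDEX_M    0xfff
#define DEMO_INDEX_V(x) ((x) << DEMO_INDEX_S)
#define DEMO_INDEX_G(x) (((x) >> DEMO_INDEX_S) & DEMO_INDEX_M)

int main(void)
{
        /* Pack two fields into one word, the way the WR builders do. */
        uint32_t word = DEMO_TYPE_V(0x5U) | DEMO_INDEX_V(0x2abU);

        /* Unpack them again, the way the completion/error paths do. */
        printf("word=0x%08x type=0x%x index=0x%03x\n",
               word, DEMO_TYPE_G(word), DEMO_INDEX_G(word));
        return 0;       /* prints word=0x000052ab type=0x5 index=0x2ab */
}
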
+#define FW_RI_RES_WR_IQPCIECH_M 0x3 +#define FW_RI_RES_WR_IQPCIECH_V(x) ((x) << FW_RI_RES_WR_IQPCIECH_S) +#define FW_RI_RES_WR_IQPCIECH_G(x) \ + (((x) >> FW_RI_RES_WR_IQPCIECH_S) & FW_RI_RES_WR_IQPCIECH_M) + +#define FW_RI_RES_WR_IQDCAEN_S 11 +#define FW_RI_RES_WR_IQDCAEN_M 0x1 +#define FW_RI_RES_WR_IQDCAEN_V(x) ((x) << FW_RI_RES_WR_IQDCAEN_S) +#define FW_RI_RES_WR_IQDCAEN_G(x) \ + (((x) >> FW_RI_RES_WR_IQDCAEN_S) & FW_RI_RES_WR_IQDCAEN_M) +#define FW_RI_RES_WR_IQDCAEN_F FW_RI_RES_WR_IQDCAEN_V(1U) + +#define FW_RI_RES_WR_IQDCACPU_S 6 +#define FW_RI_RES_WR_IQDCACPU_M 0x1f +#define FW_RI_RES_WR_IQDCACPU_V(x) ((x) << FW_RI_RES_WR_IQDCACPU_S) +#define FW_RI_RES_WR_IQDCACPU_G(x) \ + (((x) >> FW_RI_RES_WR_IQDCACPU_S) & FW_RI_RES_WR_IQDCACPU_M) + +#define FW_RI_RES_WR_IQINTCNTTHRESH_S 4 +#define FW_RI_RES_WR_IQINTCNTTHRESH_M 0x3 +#define FW_RI_RES_WR_IQINTCNTTHRESH_V(x) \ + ((x) << FW_RI_RES_WR_IQINTCNTTHRESH_S) +#define FW_RI_RES_WR_IQINTCNTTHRESH_G(x) \ + (((x) >> FW_RI_RES_WR_IQINTCNTTHRESH_S) & FW_RI_RES_WR_IQINTCNTTHRESH_M) + +#define FW_RI_RES_WR_IQO_S 3 +#define FW_RI_RES_WR_IQO_M 0x1 +#define FW_RI_RES_WR_IQO_V(x) ((x) << FW_RI_RES_WR_IQO_S) +#define FW_RI_RES_WR_IQO_G(x) \ + (((x) >> FW_RI_RES_WR_IQO_S) & FW_RI_RES_WR_IQO_M) +#define FW_RI_RES_WR_IQO_F FW_RI_RES_WR_IQO_V(1U) + +#define FW_RI_RES_WR_IQCPRIO_S 2 +#define FW_RI_RES_WR_IQCPRIO_M 0x1 +#define FW_RI_RES_WR_IQCPRIO_V(x) ((x) << FW_RI_RES_WR_IQCPRIO_S) +#define FW_RI_RES_WR_IQCPRIO_G(x) \ + (((x) >> FW_RI_RES_WR_IQCPRIO_S) & FW_RI_RES_WR_IQCPRIO_M) +#define FW_RI_RES_WR_IQCPRIO_F FW_RI_RES_WR_IQCPRIO_V(1U) + +#define FW_RI_RES_WR_IQESIZE_S 0 +#define FW_RI_RES_WR_IQESIZE_M 0x3 +#define FW_RI_RES_WR_IQESIZE_V(x) ((x) << FW_RI_RES_WR_IQESIZE_S) +#define FW_RI_RES_WR_IQESIZE_G(x) \ + (((x) >> FW_RI_RES_WR_IQESIZE_S) & FW_RI_RES_WR_IQESIZE_M) + +#define FW_RI_RES_WR_IQNS_S 31 +#define FW_RI_RES_WR_IQNS_M 0x1 +#define FW_RI_RES_WR_IQNS_V(x) ((x) << FW_RI_RES_WR_IQNS_S) +#define FW_RI_RES_WR_IQNS_G(x) \ + (((x) >> FW_RI_RES_WR_IQNS_S) & FW_RI_RES_WR_IQNS_M) +#define FW_RI_RES_WR_IQNS_F FW_RI_RES_WR_IQNS_V(1U) + +#define FW_RI_RES_WR_IQRO_S 30 +#define FW_RI_RES_WR_IQRO_M 0x1 +#define FW_RI_RES_WR_IQRO_V(x) ((x) << FW_RI_RES_WR_IQRO_S) +#define FW_RI_RES_WR_IQRO_G(x) \ + (((x) >> FW_RI_RES_WR_IQRO_S) & FW_RI_RES_WR_IQRO_M) +#define FW_RI_RES_WR_IQRO_F FW_RI_RES_WR_IQRO_V(1U) + +struct fw_ri_rdma_write_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be64 r2; + __be32 plen; + __be32 stag_sink; + __be64 to_sink; +#ifndef C99_NOT_SUPPORTED + union { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + +struct fw_ri_send_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be32 sendop_pkd; + __be32 stag_inv; + __be32 plen; + __be32 r3; + __be64 r4; +#ifndef C99_NOT_SUPPORTED + union { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + +#define FW_RI_SEND_WR_SENDOP_S 0 +#define FW_RI_SEND_WR_SENDOP_M 0xf +#define FW_RI_SEND_WR_SENDOP_V(x) ((x) << FW_RI_SEND_WR_SENDOP_S) +#define FW_RI_SEND_WR_SENDOP_G(x) \ + (((x) >> FW_RI_SEND_WR_SENDOP_S) & FW_RI_SEND_WR_SENDOP_M) + +struct fw_ri_rdma_read_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be64 r2; + __be32 stag_sink; + __be32 to_sink_hi; + __be32 to_sink_lo; + __be32 plen; + __be32 stag_src; + __be32 to_src_hi; + __be32 to_src_lo; + __be32 r5; +}; + +struct fw_ri_recv_wr { + __u8 opcode; + __u8 r1; + __u16 wrid; + __u8 r2[3]; + __u8 
len16; + struct fw_ri_isgl isgl; +}; + +struct fw_ri_bind_mw_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 qpbinde_to_dcacpu; + __u8 pgsz_shift; + __u8 addr_type; + __u8 mem_perms; + __be32 stag_mr; + __be32 stag_mw; + __be32 r3; + __be64 len_mw; + __be64 va_fbo; + __be64 r4; +}; + +#define FW_RI_BIND_MW_WR_QPBINDE_S 6 +#define FW_RI_BIND_MW_WR_QPBINDE_M 0x1 +#define FW_RI_BIND_MW_WR_QPBINDE_V(x) ((x) << FW_RI_BIND_MW_WR_QPBINDE_S) +#define FW_RI_BIND_MW_WR_QPBINDE_G(x) \ + (((x) >> FW_RI_BIND_MW_WR_QPBINDE_S) & FW_RI_BIND_MW_WR_QPBINDE_M) +#define FW_RI_BIND_MW_WR_QPBINDE_F FW_RI_BIND_MW_WR_QPBINDE_V(1U) + +#define FW_RI_BIND_MW_WR_NS_S 5 +#define FW_RI_BIND_MW_WR_NS_M 0x1 +#define FW_RI_BIND_MW_WR_NS_V(x) ((x) << FW_RI_BIND_MW_WR_NS_S) +#define FW_RI_BIND_MW_WR_NS_G(x) \ + (((x) >> FW_RI_BIND_MW_WR_NS_S) & FW_RI_BIND_MW_WR_NS_M) +#define FW_RI_BIND_MW_WR_NS_F FW_RI_BIND_MW_WR_NS_V(1U) + +#define FW_RI_BIND_MW_WR_DCACPU_S 0 +#define FW_RI_BIND_MW_WR_DCACPU_M 0x1f +#define FW_RI_BIND_MW_WR_DCACPU_V(x) ((x) << FW_RI_BIND_MW_WR_DCACPU_S) +#define FW_RI_BIND_MW_WR_DCACPU_G(x) \ + (((x) >> FW_RI_BIND_MW_WR_DCACPU_S) & FW_RI_BIND_MW_WR_DCACPU_M) + +struct fw_ri_fr_nsmr_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 qpbinde_to_dcacpu; + __u8 pgsz_shift; + __u8 addr_type; + __u8 mem_perms; + __be32 stag; + __be32 len_hi; + __be32 len_lo; + __be32 va_hi; + __be32 va_lo_fbo; +}; + +#define FW_RI_FR_NSMR_WR_QPBINDE_S 6 +#define FW_RI_FR_NSMR_WR_QPBINDE_M 0x1 +#define FW_RI_FR_NSMR_WR_QPBINDE_V(x) ((x) << FW_RI_FR_NSMR_WR_QPBINDE_S) +#define FW_RI_FR_NSMR_WR_QPBINDE_G(x) \ + (((x) >> FW_RI_FR_NSMR_WR_QPBINDE_S) & FW_RI_FR_NSMR_WR_QPBINDE_M) +#define FW_RI_FR_NSMR_WR_QPBINDE_F FW_RI_FR_NSMR_WR_QPBINDE_V(1U) + +#define FW_RI_FR_NSMR_WR_NS_S 5 +#define FW_RI_FR_NSMR_WR_NS_M 0x1 +#define FW_RI_FR_NSMR_WR_NS_V(x) ((x) << FW_RI_FR_NSMR_WR_NS_S) +#define FW_RI_FR_NSMR_WR_NS_G(x) \ + (((x) >> FW_RI_FR_NSMR_WR_NS_S) & FW_RI_FR_NSMR_WR_NS_M) +#define FW_RI_FR_NSMR_WR_NS_F FW_RI_FR_NSMR_WR_NS_V(1U) + +#define FW_RI_FR_NSMR_WR_DCACPU_S 0 +#define FW_RI_FR_NSMR_WR_DCACPU_M 0x1f +#define FW_RI_FR_NSMR_WR_DCACPU_V(x) ((x) << FW_RI_FR_NSMR_WR_DCACPU_S) +#define FW_RI_FR_NSMR_WR_DCACPU_G(x) \ + (((x) >> FW_RI_FR_NSMR_WR_DCACPU_S) & FW_RI_FR_NSMR_WR_DCACPU_M) + +struct fw_ri_fr_nsmr_tpte_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be32 r2; + __be32 stag; + struct fw_ri_tpte tpte; + __u64 pbl[2]; +}; + +struct fw_ri_inv_lstag_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be32 r2; + __be32 stag_inv; +}; + +enum fw_ri_type { + FW_RI_TYPE_INIT, + FW_RI_TYPE_FINI, + FW_RI_TYPE_TERMINATE +}; + +enum fw_ri_init_p2ptype { + FW_RI_INIT_P2PTYPE_RDMA_WRITE = FW_RI_RDMA_WRITE, + FW_RI_INIT_P2PTYPE_READ_REQ = FW_RI_READ_REQ, + FW_RI_INIT_P2PTYPE_SEND = FW_RI_SEND, + FW_RI_INIT_P2PTYPE_SEND_WITH_INV = FW_RI_SEND_WITH_INV, + FW_RI_INIT_P2PTYPE_SEND_WITH_SE = FW_RI_SEND_WITH_SE, + FW_RI_INIT_P2PTYPE_SEND_WITH_SE_INV = FW_RI_SEND_WITH_SE_INV, + FW_RI_INIT_P2PTYPE_DISABLED = 0xf, +}; + +struct fw_ri_wr { + __be32 op_compl; + __be32 flowid_len16; + __u64 cookie; + union fw_ri { + struct fw_ri_init { + __u8 type; + __u8 mpareqbit_p2ptype; + __u8 r4[2]; + __u8 mpa_attrs; + __u8 qp_caps; + __be16 nrqe; + __be32 pdid; + __be32 qpid; + __be32 sq_eqid; + __be32 rq_eqid; + __be32 scqid; + __be32 rcqid; + __be32 ord_max; + __be32 ird_max; + __be32 iss; + __be32 irs; + __be32 hwrqsize; + __be32 hwrqaddr; 
+ __be64 r5; + union fw_ri_init_p2p { + struct fw_ri_rdma_write_wr write; + struct fw_ri_rdma_read_wr read; + struct fw_ri_send_wr send; + } u; + } init; + struct fw_ri_fini { + __u8 type; + __u8 r3[7]; + __be64 r4; + } fini; + struct fw_ri_terminate { + __u8 type; + __u8 r3[3]; + __be32 immdlen; + __u8 termmsg[40]; + } terminate; + } u; +}; + +#define FW_RI_WR_MPAREQBIT_S 7 +#define FW_RI_WR_MPAREQBIT_M 0x1 +#define FW_RI_WR_MPAREQBIT_V(x) ((x) << FW_RI_WR_MPAREQBIT_S) +#define FW_RI_WR_MPAREQBIT_G(x) \ + (((x) >> FW_RI_WR_MPAREQBIT_S) & FW_RI_WR_MPAREQBIT_M) +#define FW_RI_WR_MPAREQBIT_F FW_RI_WR_MPAREQBIT_V(1U) + +#define FW_RI_WR_P2PTYPE_S 0 +#define FW_RI_WR_P2PTYPE_M 0xf +#define FW_RI_WR_P2PTYPE_V(x) ((x) << FW_RI_WR_P2PTYPE_S) +#define FW_RI_WR_P2PTYPE_G(x) \ + (((x) >> FW_RI_WR_P2PTYPE_S) & FW_RI_WR_P2PTYPE_M) + +#endif /* _T4FW_RI_API_H_ */ diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig new file mode 100644 index 0000000..7b146b6 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/Kconfig @@ -0,0 +1,22 @@ +config INFINIBAND_HFI1 + tristate "Intel OPA Gen1 support" + depends on X86_64 && INFINIBAND_RDMAVT && I2C + select MMU_NOTIFIER + select CRC32 + select I2C_ALGOBIT + ---help--- + This is a low-level driver for Intel OPA Gen1 adapter. +config HFI1_DEBUG_SDMA_ORDER + bool "HFI1 SDMA Order debug" + depends on INFINIBAND_HFI1 + default n + ---help--- + This is a debug flag to test for out of order + sdma completions for unit testing +config SDMA_VERBOSITY + bool "Config SDMA Verbosity" + depends on INFINIBAND_HFI1 + default n + ---help--- + This is a configuration flag to enable verbose + SDMA debug diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile new file mode 100644 index 0000000..ce4010b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# HFI driver +# +# +# +# Called from the kernel module build system. +# +obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o + +hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ + eprom.o exp_rcv.o file_ops.o firmware.o \ + init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ + qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \ + uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ + verbs_txreq.o vnic_main.o vnic_sdma.o +hfi1-$(CONFIG_DEBUG_FS) += debugfs.o + +CFLAGS_trace.o = -I$(src) +ifdef MVERSION +CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\" +endif diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c new file mode 100644 index 0000000..b5fab55 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -0,0 +1,777 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include + +#include "hfi.h" +#include "affinity.h" +#include "sdma.h" +#include "trace.h" + +struct hfi1_affinity_node_list node_affinity = { + .list = LIST_HEAD_INIT(node_affinity.list), + .lock = __MUTEX_INITIALIZER(node_affinity.lock) +}; + +/* Name of IRQ types, indexed by enum irq_type */ +static const char * const irq_type_names[] = { + "SDMA", + "RCVCTXT", + "GENERAL", + "OTHER", +}; + +/* Per NUMA node count of HFI devices */ +static unsigned int *hfi1_per_node_cntr; + +static inline void init_cpu_mask_set(struct cpu_mask_set *set) +{ + cpumask_clear(&set->mask); + cpumask_clear(&set->used); + set->gen = 0; +} + +/* Initialize non-HT cpu cores mask */ +void init_real_cpu_mask(void) +{ + int possible, curr_cpu, i, ht; + + cpumask_clear(&node_affinity.real_cpu_mask); + + /* Start with cpu online mask as the real cpu mask */ + cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask); + + /* + * Remove HT cores from the real cpu mask. Do this in two steps below. + */ + possible = cpumask_weight(&node_affinity.real_cpu_mask); + ht = cpumask_weight(topology_sibling_cpumask( + cpumask_first(&node_affinity.real_cpu_mask))); + /* + * Step 1. Skip over the first N HT siblings and use them as the + * "real" cores. Assumes that HT cores are not enumerated in + * succession (except in the single core case). + */ + curr_cpu = cpumask_first(&node_affinity.real_cpu_mask); + for (i = 0; i < possible / ht; i++) + curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask); + /* + * Step 2. Remove the remaining HT siblings. Use cpumask_next() to + * skip any gaps. 
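
init_real_cpu_mask() relies on the usual x86 enumeration in which the first possible/ht online CPUs are one hardware thread per core and the higher-numbered CPUs are their HT siblings: step 1 skips over the per-core threads and step 2 (the loop that follows) clears the siblings. The toy below performs the equivalent selection on a plain bitmask; the fixed enumeration assumption and the helper name are part of the illustration, not a general topology query.

/*
 * Toy version of the "keep one hardware thread per core" selection in
 * init_real_cpu_mask(), using a plain bitmask. Assumes the first
 * ncpus/threads_per_core CPUs are distinct cores and the rest are
 * their HT siblings, as on typical x86 enumerations.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t real_cpu_mask(unsigned int ncpus, unsigned int threads_per_core)
{
        uint64_t mask = 0;
        unsigned int real = ncpus / threads_per_core;

        for (unsigned int cpu = 0; cpu < real; cpu++)
                mask |= 1ULL << cpu;    /* keep the first thread of each core */
        return mask;                    /* siblings (the upper CPUs) are left out */
}

int main(void)
{
        /* 8 online CPUs, 2 threads per core -> CPUs 0-3 are the real cores. */
        printf("real cpu mask = 0x%llx\n",
               (unsigned long long)real_cpu_mask(8, 2));
        return 0;
}
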
+ */ + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask); + } +} + +int node_affinity_init(void) +{ + int node; + struct pci_dev *dev = NULL; + const struct pci_device_id *ids = hfi1_pci_tbl; + + cpumask_clear(&node_affinity.proc.used); + cpumask_copy(&node_affinity.proc.mask, cpu_online_mask); + + node_affinity.proc.gen = 0; + node_affinity.num_core_siblings = + cpumask_weight(topology_sibling_cpumask( + cpumask_first(&node_affinity.proc.mask) + )); + node_affinity.num_possible_nodes = num_possible_nodes(); + node_affinity.num_online_nodes = num_online_nodes(); + node_affinity.num_online_cpus = num_online_cpus(); + + /* + * The real cpu mask is part of the affinity struct but it has to be + * initialized early. It is needed to calculate the number of user + * contexts in set_up_context_variables(). + */ + init_real_cpu_mask(); + + hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes, + sizeof(*hfi1_per_node_cntr), GFP_KERNEL); + if (!hfi1_per_node_cntr) + return -ENOMEM; + + while (ids->vendor) { + dev = NULL; + while ((dev = pci_get_device(ids->vendor, ids->device, dev))) { + node = pcibus_to_node(dev->bus); + if (node < 0) + node = numa_node_id(); + + hfi1_per_node_cntr[node]++; + } + ids++; + } + + return 0; +} + +void node_affinity_destroy(void) +{ + struct list_head *pos, *q; + struct hfi1_affinity_node *entry; + + mutex_lock(&node_affinity.lock); + list_for_each_safe(pos, q, &node_affinity.list) { + entry = list_entry(pos, struct hfi1_affinity_node, + list); + list_del(pos); + kfree(entry); + } + mutex_unlock(&node_affinity.lock); + kfree(hfi1_per_node_cntr); +} + +static struct hfi1_affinity_node *node_affinity_allocate(int node) +{ + struct hfi1_affinity_node *entry; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + entry->node = node; + INIT_LIST_HEAD(&entry->list); + + return entry; +} + +/* + * It appends an entry to the list. + * It *must* be called with node_affinity.lock held. + */ +static void node_affinity_add_tail(struct hfi1_affinity_node *entry) +{ + list_add_tail(&entry->list, &node_affinity.list); +} + +/* It must be called with node_affinity.lock held */ +static struct hfi1_affinity_node *node_affinity_lookup(int node) +{ + struct list_head *pos; + struct hfi1_affinity_node *entry; + + list_for_each(pos, &node_affinity.list) { + entry = list_entry(pos, struct hfi1_affinity_node, list); + if (entry->node == node) + return entry; + } + + return NULL; +} + +/* + * Interrupt affinity. + * + * non-rcv avail gets a default mask that + * starts as possible cpus with threads reset + * and each rcv avail reset. + * + * rcv avail gets node relative 1 wrapping back + * to the node relative 1 as necessary. + * + */ +int hfi1_dev_affinity_init(struct hfi1_devdata *dd) +{ + int node = pcibus_to_node(dd->pcidev->bus); + struct hfi1_affinity_node *entry; + const struct cpumask *local_mask; + int curr_cpu, possible, i; + + if (node < 0) + node = numa_node_id(); + dd->node = node; + + local_mask = cpumask_of_node(dd->node); + if (cpumask_first(local_mask) >= nr_cpu_ids) + local_mask = topology_core_cpumask(0); + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + + /* + * If this is the first time this NUMA node's affinity is used, + * create an entry in the global affinity structure and initialize it. 
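
hfi1_dev_affinity_init() looks up the per-NUMA-node entry under node_affinity.lock and, as the code that follows shows, allocates, initializes and appends a fresh entry the first time a device on that node is seen, all while still holding the lock so two devices on the same node cannot race. The sketch below shows the shape of that lookup-or-create pattern with a pthread mutex and a minimal list whose entries carry only the node id.

/*
 * Shape of the "look up the per-node entry, create it on first use"
 * pattern used for the node affinity list, reduced to a node id and
 * protected by a pthread mutex instead of the kernel mutex.
 */
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

struct node_entry {
        int node;
        struct node_entry *next;
};

static struct node_entry *node_list;
static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;

static struct node_entry *node_lookup(int node) /* call with node_lock held */
{
        for (struct node_entry *e = node_list; e; e = e->next)
                if (e->node == node)
                        return e;
        return NULL;
}

static struct node_entry *node_get_or_create(int node)
{
        struct node_entry *e;

        pthread_mutex_lock(&node_lock);
        e = node_lookup(node);
        if (!e) {
                e = calloc(1, sizeof(*e));      /* first device on this node */
                if (e) {
                        e->node = node;
                        e->next = node_list;    /* publish while still locked */
                        node_list = e;
                }
        }
        pthread_mutex_unlock(&node_lock);
        return e;
}

int main(void)
{
        struct node_entry *a = node_get_or_create(1);
        struct node_entry *b = node_get_or_create(1);

        printf("same entry: %s\n", a == b ? "yes" : "no");
        return 0;
}
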
+ */ + if (!entry) { + entry = node_affinity_allocate(node); + if (!entry) { + dd_dev_err(dd, + "Unable to allocate global affinity node\n"); + mutex_unlock(&node_affinity.lock); + return -ENOMEM; + } + init_cpu_mask_set(&entry->def_intr); + init_cpu_mask_set(&entry->rcv_intr); + cpumask_clear(&entry->general_intr_mask); + /* Use the "real" cpu mask of this node as the default */ + cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, + local_mask); + + /* fill in the receive list */ + possible = cpumask_weight(&entry->def_intr.mask); + curr_cpu = cpumask_first(&entry->def_intr.mask); + + if (possible == 1) { + /* only one CPU, everyone will use it */ + cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask); + cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); + } else { + /* + * The general/control context will be the first CPU in + * the default list, so it is removed from the default + * list and added to the general interrupt list. + */ + cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask); + cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); + curr_cpu = cpumask_next(curr_cpu, + &entry->def_intr.mask); + + /* + * Remove the remaining kernel receive queues from + * the default list and add them to the receive list. + */ + for (i = 0; + i < (dd->n_krcv_queues - 1) * + hfi1_per_node_cntr[dd->node]; + i++) { + cpumask_clear_cpu(curr_cpu, + &entry->def_intr.mask); + cpumask_set_cpu(curr_cpu, + &entry->rcv_intr.mask); + curr_cpu = cpumask_next(curr_cpu, + &entry->def_intr.mask); + if (curr_cpu >= nr_cpu_ids) + break; + } + + /* + * If there ends up being 0 CPU cores leftover for SDMA + * engines, use the same CPU cores as general/control + * context. + */ + if (cpumask_weight(&entry->def_intr.mask) == 0) + cpumask_copy(&entry->def_intr.mask, + &entry->general_intr_mask); + } + + node_affinity_add_tail(entry); + } + mutex_unlock(&node_affinity.lock); + return 0; +} + +/* + * Function updates the irq affinity hint for msix after it has been changed + * by the user using the /proc/irq interface. This function only accepts + * one cpu in the mask. 
+ */ +static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu) +{ + struct sdma_engine *sde = msix->arg; + struct hfi1_devdata *dd = sde->dd; + struct hfi1_affinity_node *entry; + struct cpu_mask_set *set; + int i, old_cpu; + + if (cpu > num_online_cpus() || cpu == sde->cpu) + return; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + if (!entry) + goto unlock; + + old_cpu = sde->cpu; + sde->cpu = cpu; + cpumask_clear(&msix->mask); + cpumask_set_cpu(cpu, &msix->mask); + dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n", + msix->irq, irq_type_names[msix->type], + sde->this_idx, cpu); + irq_set_affinity_hint(msix->irq, &msix->mask); + + /* + * Set the new cpu in the hfi1_affinity_node and clean + * the old cpu if it is not used by any other IRQ + */ + set = &entry->def_intr; + cpumask_set_cpu(cpu, &set->mask); + cpumask_set_cpu(cpu, &set->used); + for (i = 0; i < dd->num_msix_entries; i++) { + struct hfi1_msix_entry *other_msix; + + other_msix = &dd->msix_entries[i]; + if (other_msix->type != IRQ_SDMA || other_msix == msix) + continue; + + if (cpumask_test_cpu(old_cpu, &other_msix->mask)) + goto unlock; + } + cpumask_clear_cpu(old_cpu, &set->mask); + cpumask_clear_cpu(old_cpu, &set->used); +unlock: + mutex_unlock(&node_affinity.lock); +} + +static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + int cpu = cpumask_first(mask); + struct hfi1_msix_entry *msix = container_of(notify, + struct hfi1_msix_entry, + notify); + + /* Only one CPU configuration supported currently */ + hfi1_update_sdma_affinity(msix, cpu); +} + +static void hfi1_irq_notifier_release(struct kref *ref) +{ + /* + * This is required by affinity notifier. We don't have anything to + * free here. + */ +} + +static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix) +{ + struct irq_affinity_notify *notify = &msix->notify; + + notify->irq = msix->irq; + notify->notify = hfi1_irq_notifier_notify; + notify->release = hfi1_irq_notifier_release; + + if (irq_set_affinity_notifier(notify->irq, notify)) + pr_err("Failed to register sdma irq affinity notifier for irq %d\n", + notify->irq); +} + +static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix) +{ + struct irq_affinity_notify *notify = &msix->notify; + + if (irq_set_affinity_notifier(notify->irq, NULL)) + pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n", + notify->irq); +} + +/* + * Function sets the irq affinity for msix. + * It *must* be called with node_affinity.lock held. 
+ */ +static int get_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix) +{ + cpumask_var_t diff; + struct hfi1_affinity_node *entry; + struct cpu_mask_set *set = NULL; + struct sdma_engine *sde = NULL; + struct hfi1_ctxtdata *rcd = NULL; + char extra[64]; + int cpu = -1; + + extra[0] = '\0'; + cpumask_clear(&msix->mask); + + entry = node_affinity_lookup(dd->node); + + switch (msix->type) { + case IRQ_SDMA: + sde = (struct sdma_engine *)msix->arg; + scnprintf(extra, 64, "engine %u", sde->this_idx); + set = &entry->def_intr; + break; + case IRQ_GENERAL: + cpu = cpumask_first(&entry->general_intr_mask); + break; + case IRQ_RCVCTXT: + rcd = (struct hfi1_ctxtdata *)msix->arg; + if (rcd->ctxt == HFI1_CTRL_CTXT) + cpu = cpumask_first(&entry->general_intr_mask); + else + set = &entry->rcv_intr; + scnprintf(extra, 64, "ctxt %u", rcd->ctxt); + break; + default: + dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type); + return -EINVAL; + } + + /* + * The general and control contexts are placed on a particular + * CPU, which is set above. Skip accounting for it. Everything else + * finds its CPU here. + */ + if (cpu == -1 && set) { + if (!zalloc_cpumask_var(&diff, GFP_KERNEL)) + return -ENOMEM; + + if (cpumask_equal(&set->mask, &set->used)) { + /* + * We've used up all the CPUs, bump up the generation + * and reset the 'used' map + */ + set->gen++; + cpumask_clear(&set->used); + } + cpumask_andnot(diff, &set->mask, &set->used); + cpu = cpumask_first(diff); + cpumask_set_cpu(cpu, &set->used); + + free_cpumask_var(diff); + } + + cpumask_set_cpu(cpu, &msix->mask); + dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n", + msix->irq, irq_type_names[msix->type], + extra, cpu); + irq_set_affinity_hint(msix->irq, &msix->mask); + + if (msix->type == IRQ_SDMA) { + sde->cpu = cpu; + hfi1_setup_sdma_notifier(msix); + } + + return 0; +} + +int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) +{ + int ret; + + mutex_lock(&node_affinity.lock); + ret = get_irq_affinity(dd, msix); + mutex_unlock(&node_affinity.lock); + return ret; +} + +void hfi1_put_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix) +{ + struct cpu_mask_set *set = NULL; + struct hfi1_ctxtdata *rcd; + struct hfi1_affinity_node *entry; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + + switch (msix->type) { + case IRQ_SDMA: + set = &entry->def_intr; + hfi1_cleanup_sdma_notifier(msix); + break; + case IRQ_GENERAL: + /* Don't do accounting for general contexts */ + break; + case IRQ_RCVCTXT: + rcd = (struct hfi1_ctxtdata *)msix->arg; + /* Don't do accounting for control contexts */ + if (rcd->ctxt != HFI1_CTRL_CTXT) + set = &entry->rcv_intr; + break; + default: + mutex_unlock(&node_affinity.lock); + return; + } + + if (set) { + cpumask_andnot(&set->used, &set->used, &msix->mask); + if (cpumask_empty(&set->used) && set->gen) { + set->gen--; + cpumask_copy(&set->used, &set->mask); + } + } + + irq_set_affinity_hint(msix->irq, NULL); + cpumask_clear(&msix->mask); + mutex_unlock(&node_affinity.lock); +} + +/* This should be called with node_affinity.lock held */ +static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask, + struct hfi1_affinity_node_list *affinity) +{ + int possible, curr_cpu, i; + uint num_cores_per_socket = node_affinity.num_online_cpus / + affinity->num_core_siblings / + node_affinity.num_online_nodes; + + cpumask_copy(hw_thread_mask, &affinity->proc.mask); + if (affinity->num_core_siblings > 0) { + /* Removing other siblings 
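get_irq_affinity() and hfi1_put_irq_affinity() above rely on the cpu_mask_set bookkeeping: hand out the first CPU of the set that is not yet marked used, and once every CPU has been handed out, bump the generation and clear the used map so the set starts being overloaded from the beginning. A minimal userspace sketch of that pattern, illustration only, with a 64-bit integer standing in for struct cpu_mask_set and its cpumasks:

/* Illustration: the "generation + used bitmap" round-robin pattern. */
#include <stdio.h>
#include <stdint.h>

struct mask_set {
        uint64_t mask;          /* CPUs belonging to the set             */
        uint64_t used;          /* CPUs already handed out this pass     */
        unsigned int gen;       /* how many times the set wrapped around */
};

/* Assumes a non-empty mask. */
static int pick_cpu(struct mask_set *s)
{
        int cpu;

        if ((s->mask & ~s->used) == 0) {        /* all used: wrap around */
                s->gen++;
                s->used = 0;
        }
        for (cpu = 0; cpu < 64; cpu++)
                if ((s->mask & ~s->used) & (1ULL << cpu))
                        break;
        s->used |= 1ULL << cpu;
        return cpu;
}

int main(void)
{
        struct mask_set s = { .mask = 0x6, .used = 0, .gen = 0 }; /* CPUs 1,2 */
        int i;

        for (i = 0; i < 5; i++)
                printf("pick -> cpu %d (gen %u)\n", pick_cpu(&s), s.gen);
        return 0;
}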
not needed for now */ + possible = cpumask_weight(hw_thread_mask); + curr_cpu = cpumask_first(hw_thread_mask); + for (i = 0; + i < num_cores_per_socket * node_affinity.num_online_nodes; + i++) + curr_cpu = cpumask_next(curr_cpu, hw_thread_mask); + + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, hw_thread_mask); + curr_cpu = cpumask_next(curr_cpu, hw_thread_mask); + } + + /* Identifying correct HW threads within physical cores */ + cpumask_shift_left(hw_thread_mask, hw_thread_mask, + num_cores_per_socket * + node_affinity.num_online_nodes * + hw_thread_no); + } +} + +int hfi1_get_proc_affinity(int node) +{ + int cpu = -1, ret, i; + struct hfi1_affinity_node *entry; + cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; + const struct cpumask *node_mask, + *proc_mask = &current->cpus_allowed; + struct hfi1_affinity_node_list *affinity = &node_affinity; + struct cpu_mask_set *set = &affinity->proc; + + /* + * check whether process/context affinity has already + * been set + */ + if (cpumask_weight(proc_mask) == 1) { + hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", + current->pid, current->comm, + cpumask_pr_args(proc_mask)); + /* + * Mark the pre-set CPU as used. This is atomic so we don't + * need the lock + */ + cpu = cpumask_first(proc_mask); + cpumask_set_cpu(cpu, &set->used); + goto done; + } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) { + hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", + current->pid, current->comm, + cpumask_pr_args(proc_mask)); + goto done; + } + + /* + * The process does not have a preset CPU affinity so find one to + * recommend using the following algorithm: + * + * For each user process that is opening a context on HFI Y: + * a) If all cores are filled, reinitialize the bitmask + * b) Fill real cores first, then HT cores (First set of HT + * cores on all physical cores, then second set of HT core, + * and, so on) in the following order: + * + * 1. Same NUMA node as HFI Y and not running an IRQ + * handler + * 2. Same NUMA node as HFI Y and running an IRQ handler + * 3. Different NUMA node to HFI Y and not running an IRQ + * handler + * 4. Different NUMA node to HFI Y and running an IRQ + * handler + * c) Mark core as filled in the bitmask. As user processes are + * done, clear cores from the bitmask. + */ + + ret = zalloc_cpumask_var(&diff, GFP_KERNEL); + if (!ret) + goto done; + ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL); + if (!ret) + goto free_diff; + ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL); + if (!ret) + goto free_hw_thread_mask; + ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL); + if (!ret) + goto free_available_mask; + + mutex_lock(&affinity->lock); + /* + * If we've used all available HW threads, clear the mask and start + * overloading. + */ + if (cpumask_equal(&set->mask, &set->used)) { + set->gen++; + cpumask_clear(&set->used); + } + + /* + * If NUMA node has CPUs used by interrupt handlers, include them in the + * interrupt handler mask. + */ + entry = node_affinity_lookup(node); + if (entry) { + cpumask_copy(intrs_mask, (entry->def_intr.gen ? + &entry->def_intr.mask : + &entry->def_intr.used)); + cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
+ &entry->rcv_intr.mask : + &entry->rcv_intr.used)); + cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask); + } + hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl", + cpumask_pr_args(intrs_mask)); + + cpumask_copy(hw_thread_mask, &set->mask); + + /* + * If HT cores are enabled, identify which HW threads within the + * physical cores should be used. + */ + if (affinity->num_core_siblings > 0) { + for (i = 0; i < affinity->num_core_siblings; i++) { + find_hw_thread_mask(i, hw_thread_mask, affinity); + + /* + * If there's at least one available core for this HW + * thread number, stop looking for a core. + * + * diff will always be not empty at least once in this + * loop as the used mask gets reset when + * (set->mask == set->used) before this loop. + */ + cpumask_andnot(diff, hw_thread_mask, &set->used); + if (!cpumask_empty(diff)) + break; + } + } + hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl", + cpumask_pr_args(hw_thread_mask)); + + node_mask = cpumask_of_node(node); + hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node, + cpumask_pr_args(node_mask)); + + /* Get cpumask of available CPUs on preferred NUMA */ + cpumask_and(available_mask, hw_thread_mask, node_mask); + cpumask_andnot(available_mask, available_mask, &set->used); + hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node, + cpumask_pr_args(available_mask)); + + /* + * At first, we don't want to place processes on the same + * CPUs as interrupt handlers. Then, CPUs running interrupt + * handlers are used. + * + * 1) If diff is not empty, then there are CPUs not running + * non-interrupt handlers available, so diff gets copied + * over to available_mask. + * 2) If diff is empty, then all CPUs not running interrupt + * handlers are taken, so available_mask contains all + * available CPUs running interrupt handlers. + * 3) If available_mask is empty, then all CPUs on the + * preferred NUMA node are taken, so other NUMA nodes are + * used for process assignments using the same method as + * the preferred NUMA node. + */ + cpumask_andnot(diff, available_mask, intrs_mask); + if (!cpumask_empty(diff)) + cpumask_copy(available_mask, diff); + + /* If we don't have CPUs on the preferred node, use other NUMA nodes */ + if (cpumask_empty(available_mask)) { + cpumask_andnot(available_mask, hw_thread_mask, &set->used); + /* Excluding preferred NUMA cores */ + cpumask_andnot(available_mask, available_mask, node_mask); + hfi1_cdbg(PROC, + "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl", + cpumask_pr_args(available_mask)); + + /* + * At first, we don't want to place processes on the same + * CPUs as interrupt handlers. 
+ */ + cpumask_andnot(diff, available_mask, intrs_mask); + if (!cpumask_empty(diff)) + cpumask_copy(available_mask, diff); + } + hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl", + cpumask_pr_args(available_mask)); + + cpu = cpumask_first(available_mask); + if (cpu >= nr_cpu_ids) /* empty */ + cpu = -1; + else + cpumask_set_cpu(cpu, &set->used); + + mutex_unlock(&affinity->lock); + hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu); + + free_cpumask_var(intrs_mask); +free_available_mask: + free_cpumask_var(available_mask); +free_hw_thread_mask: + free_cpumask_var(hw_thread_mask); +free_diff: + free_cpumask_var(diff); +done: + return cpu; +} + +void hfi1_put_proc_affinity(int cpu) +{ + struct hfi1_affinity_node_list *affinity = &node_affinity; + struct cpu_mask_set *set = &affinity->proc; + + if (cpu < 0) + return; + + mutex_lock(&affinity->lock); + cpumask_clear_cpu(cpu, &set->used); + hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu); + if (cpumask_empty(&set->used) && set->gen) { + set->gen--; + cpumask_copy(&set->used, &set->mask); + } + mutex_unlock(&affinity->lock); +} diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h new file mode 100644 index 0000000..2a1e374 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -0,0 +1,122 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
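The placement order spelled out in the hfi1_get_proc_affinity() comments above (same NUMA node without an IRQ handler first, then that node's IRQ CPUs, then other nodes) can be modelled with three plain bitmasks. The sketch below is an illustration of that ordering only, under the stated assumptions, and is not code from this patch.

/* Illustration: the placement preference of hfi1_get_proc_affinity(),
 * with plain bitmasks for "CPUs on the device's NUMA node", "CPUs
 * running IRQ handlers" and "CPUs already used"; highest preference:
 *   1) on-node,  no IRQ handler    3) off-node, no IRQ handler
 *   2) on-node,  IRQ handler       4) off-node, IRQ handler
 */
#include <stdio.h>
#include <stdint.h>

static int first_cpu(uint64_t m)
{
        int cpu;

        for (cpu = 0; cpu < 64; cpu++)
                if (m & (1ULL << cpu))
                        return cpu;
        return -1;
}

static int pick(uint64_t all, uint64_t node, uint64_t irq, uint64_t used)
{
        uint64_t avail = node & ~used;          /* prefer the device's node */

        if (avail & ~irq)                       /* 1) on-node, non-IRQ CPUs */
                avail &= ~irq;
        if (!avail) {                           /* 3)/4) fall back off-node */
                avail = all & ~node & ~used;
                if (avail & ~irq)
                        avail &= ~irq;
        }
        return first_cpu(avail);
}

int main(void)
{
        uint64_t all = 0xff, node = 0x0f, irq = 0x03;

        printf("%d\n", pick(all, node, irq, 0x00));     /* 2: on-node, non-IRQ */
        printf("%d\n", pick(all, node, irq, 0x0c));     /* 0: on-node IRQ CPU  */
        printf("%d\n", pick(all, node, irq, 0x0f));     /* 4: off-node         */
        return 0;
}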
+ * + */ +#ifndef _HFI1_AFFINITY_H +#define _HFI1_AFFINITY_H + +#include "hfi.h" + +enum irq_type { + IRQ_SDMA, + IRQ_RCVCTXT, + IRQ_GENERAL, + IRQ_OTHER +}; + +/* Can be used for both memory and cpu */ +enum affinity_flags { + AFF_AUTO, + AFF_NUMA_LOCAL, + AFF_DEV_LOCAL, + AFF_IRQ_LOCAL +}; + +struct cpu_mask_set { + struct cpumask mask; + struct cpumask used; + uint gen; +}; + +struct hfi1_msix_entry; + +/* Initialize non-HT cpu cores mask */ +void init_real_cpu_mask(void); +/* Initialize driver affinity data */ +int hfi1_dev_affinity_init(struct hfi1_devdata *dd); +/* + * Set IRQ affinity to a CPU. The function will determine the + * CPU and set the affinity to it. + */ +int hfi1_get_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix); +/* + * Remove the IRQ's CPU affinity. This function also updates + * any internal CPU tracking data + */ +void hfi1_put_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix); +/* + * Determine a CPU affinity for a user process, if the process does not + * have an affinity set yet. + */ +int hfi1_get_proc_affinity(int node); +/* Release a CPU used by a user process. */ +void hfi1_put_proc_affinity(int cpu); + +struct hfi1_affinity_node { + int node; + struct cpu_mask_set def_intr; + struct cpu_mask_set rcv_intr; + struct cpumask general_intr_mask; + struct list_head list; +}; + +struct hfi1_affinity_node_list { + struct list_head list; + struct cpumask real_cpu_mask; + struct cpu_mask_set proc; + int num_core_siblings; + int num_possible_nodes; + int num_online_nodes; + int num_online_cpus; + struct mutex lock; /* protects affinity nodes */ +}; + +int node_affinity_init(void); +void node_affinity_destroy(void); +extern struct hfi1_affinity_node_list node_affinity; + +#endif /* _HFI1_AFFINITY_H */ diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h new file mode 100644 index 0000000..e813387 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/aspm.h @@ -0,0 +1,322 @@ +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _ASPM_H +#define _ASPM_H + +#include "hfi.h" + +extern uint aspm_mode; + +enum aspm_mode { + ASPM_MODE_DISABLED = 0, /* ASPM always disabled, performance mode */ + ASPM_MODE_ENABLED = 1, /* ASPM always enabled, power saving mode */ + ASPM_MODE_DYNAMIC = 2, /* ASPM enabled/disabled dynamically */ +}; + +/* Time after which the timer interrupt will re-enable ASPM */ +#define ASPM_TIMER_MS 1000 +/* Time for which interrupts are ignored after a timer has been scheduled */ +#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2) +/* Two interrupts within this time trigger ASPM disable */ +#define ASPM_TRIGGER_MS 1 +#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull) +#define ASPM_L1_SUPPORTED(reg) \ + (((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2) + +static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + u32 up, dn; + + /* + * If the driver does not have access to the upstream component, + * it cannot support ASPM L1 at all. + */ + if (!parent) + return false; + + pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn); + dn = ASPM_L1_SUPPORTED(dn); + + pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up); + up = ASPM_L1_SUPPORTED(up); + + /* ASPM works on A-step but is reported as not supported */ + return (!!dn || is_ax(dd)) && !!up; +} + +/* Set L1 entrance latency for slower entry to L1 */ +static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd) +{ + u32 l1_ent_lat = 0x4u; + u32 reg32; + + pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, &reg32); + reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK; + reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32); +} + +static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + + /* + * If the driver does not have access to the upstream component, + * it cannot support ASPM L1 at all.
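To make the bit arithmetic in ASPM_L1_SUPPORTED() above concrete: PCI_EXP_LNKCAP_ASPMS covers Link Capabilities bits 11:10, so shifting right by 10 leaves the two-bit ASPM support field and masking with 0x2 keeps only the L1 bit. A standalone check of that reading, illustration only, with the mask value spelled out here rather than taken from the PCI headers:

/* Illustration: ASPM_L1_SUPPORTED() keeps bit 11 (L1 support) of LNKCAP. */
#include <assert.h>
#include <stdio.h>

#define LNKCAP_ASPMS            0x00000c00u     /* bits 11:10, as in pci_regs.h */
#define L1_SUPPORTED(reg)       ((((reg) & LNKCAP_ASPMS) >> 10) & 0x2)

int main(void)
{
        assert(L1_SUPPORTED(0x00000c00u) == 0x2);       /* L0s and L1 -> L1 bit set */
        assert(L1_SUPPORTED(0x00000800u) == 0x2);       /* L1 only                  */
        assert(L1_SUPPORTED(0x00000400u) == 0x0);       /* L0s only                 */
        puts("ok");
        return 0;
}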
+ */ + if (!parent) + return; + + /* Enable ASPM L1 first in upstream component and then downstream */ + pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + PCI_EXP_LNKCTL_ASPM_L1); + pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + PCI_EXP_LNKCTL_ASPM_L1); +} + +static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + + /* Disable ASPM L1 first in downstream component and then upstream */ + pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0x0); + if (parent) + pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0x0); +} + +static inline void aspm_enable(struct hfi1_devdata *dd) +{ + if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED || + !dd->aspm_supported) + return; + + aspm_hw_enable_l1(dd); + dd->aspm_enabled = true; +} + +static inline void aspm_disable(struct hfi1_devdata *dd) +{ + if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED) + return; + + aspm_hw_disable_l1(dd); + dd->aspm_enabled = false; +} + +static inline void aspm_disable_inc(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->aspm_lock, flags); + aspm_disable(dd); + atomic_inc(&dd->aspm_disabled_cnt); + spin_unlock_irqrestore(&dd->aspm_lock, flags); +} + +static inline void aspm_enable_dec(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->aspm_lock, flags); + if (atomic_dec_and_test(&dd->aspm_disabled_cnt)) + aspm_enable(dd); + spin_unlock_irqrestore(&dd->aspm_lock, flags); +} + +/* ASPM processing for each receive context interrupt */ +static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd) +{ + bool restart_timer; + bool close_interrupts; + unsigned long flags; + ktime_t now, prev; + + /* Quickest exit for minimum impact */ + if (!rcd->aspm_intr_supported) + return; + + spin_lock_irqsave(&rcd->aspm_lock, flags); + /* PSM contexts are open */ + if (!rcd->aspm_intr_enable) + goto unlock; + + prev = rcd->aspm_ts_last_intr; + now = ktime_get(); + rcd->aspm_ts_last_intr = now; + + /* An interrupt pair close together in time */ + close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS; + + /* Don't push out our timer till this much time has elapsed */ + restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) > + ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC; + restart_timer = restart_timer && close_interrupts; + + /* Disable ASPM and schedule timer */ + if (rcd->aspm_enabled && close_interrupts) { + aspm_disable_inc(rcd->dd); + rcd->aspm_enabled = false; + restart_timer = true; + } + + if (restart_timer) { + mod_timer(&rcd->aspm_timer, + jiffies + msecs_to_jiffies(ASPM_TIMER_MS)); + rcd->aspm_ts_timer_sched = now; + } +unlock: + spin_unlock_irqrestore(&rcd->aspm_lock, flags); +} + +/* Timer function for re-enabling ASPM in the absence of interrupt activity */ +static inline void aspm_ctx_timer_function(struct timer_list *t) +{ + struct hfi1_ctxtdata *rcd = from_timer(rcd, t, aspm_timer); + unsigned long flags; + + spin_lock_irqsave(&rcd->aspm_lock, flags); + aspm_enable_dec(rcd->dd); + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); +} + +/* + * Disable interrupt processing for verbs contexts when PSM or VNIC contexts + * are open. 
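A minimal model of the decision aspm_ctx_disable() above makes on every receive interrupt: two interrupts closer together than ASPM_TRIGGER_NS disable ASPM, and the re-enable timer is only pushed out again once ASPM_RESCHED_TIMER_MS has passed since it was last scheduled. The sketch is an illustration only; ktime and the kernel timer are replaced by plain nanosecond counters.

/* Illustration: the "two close interrupts disable ASPM" check. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TRIGGER_NS      (1ull * 1000 * 1000)    /* ASPM_TRIGGER_NS       */
#define RESCHED_NS      (500ull * 1000 * 1000)  /* ASPM_RESCHED_TIMER_MS */

struct ctx {
        uint64_t last_intr_ns;          /* aspm_ts_last_intr   */
        uint64_t timer_sched_ns;        /* aspm_ts_timer_sched */
        bool aspm_enabled;
};

static void on_interrupt(struct ctx *c, uint64_t now_ns)
{
        bool close_pair = (now_ns - c->last_intr_ns) < TRIGGER_NS;
        bool restart = close_pair &&
                       (now_ns - c->timer_sched_ns) > RESCHED_NS;

        c->last_intr_ns = now_ns;
        if (c->aspm_enabled && close_pair) {
                c->aspm_enabled = false;        /* aspm_disable_inc()        */
                restart = true;
        }
        if (restart)
                c->timer_sched_ns = now_ns;     /* mod_timer() + bookkeeping */
        printf("t=%llu ns: aspm %s\n", (unsigned long long)now_ns,
               c->aspm_enabled ? "on" : "off");
}

int main(void)
{
        struct ctx c = { 0, 0, true };

        on_interrupt(&c, 10ull * 1000 * 1000);          /* isolated -> stays on */
        on_interrupt(&c, 10ull * 1000 * 1000 + 200000); /* close pair -> off    */
        return 0;
}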
+ */ +static inline void aspm_disable_all(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + unsigned long flags; + u16 i; + + for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) { + del_timer_sync(&rcd->aspm_timer); + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = false; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + hfi1_rcd_put(rcd); + } + } + + aspm_disable(dd); + atomic_set(&dd->aspm_disabled_cnt, 0); +} + +/* Re-enable interrupt processing for verbs contexts */ +static inline void aspm_enable_all(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + unsigned long flags; + u16 i; + + aspm_enable(dd); + + if (aspm_mode != ASPM_MODE_DYNAMIC) + return; + + for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) { + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = true; + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + hfi1_rcd_put(rcd); + } + } +} + +static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd) +{ + spin_lock_init(&rcd->aspm_lock); + timer_setup(&rcd->aspm_timer, aspm_ctx_timer_function, 0); + rcd->aspm_intr_supported = rcd->dd->aspm_supported && + aspm_mode == ASPM_MODE_DYNAMIC && + rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt; +} + +static inline void aspm_init(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + u16 i; + + spin_lock_init(&dd->aspm_lock); + dd->aspm_supported = aspm_hw_l1_supported(dd); + + for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + aspm_ctx_init(rcd); + hfi1_rcd_put(rcd); + } + + /* Start with ASPM disabled */ + aspm_hw_set_l1_ent_latency(dd); + dd->aspm_enabled = false; + aspm_hw_disable_l1(dd); + + /* Now turn on ASPM if configured */ + aspm_enable_all(dd); +} + +static inline void aspm_exit(struct hfi1_devdata *dd) +{ + aspm_disable_all(dd); + + /* Turn on ASPM on exit to conserve power */ + aspm_enable(dd); +} + +#endif /* _ASPM_H */ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c new file mode 100644 index 0000000..e6bdd0c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -0,0 +1,15451 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains all of the code that is specific to the HFI chip + */ + +#include +#include +#include +#include + +#include "hfi.h" +#include "trace.h" +#include "mad.h" +#include "pio.h" +#include "sdma.h" +#include "eprom.h" +#include "efivar.h" +#include "platform.h" +#include "aspm.h" +#include "affinity.h" +#include "debugfs.h" + +#define NUM_IB_PORTS 1 + +uint kdeth_qp; +module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO); +MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix"); + +uint num_vls = HFI1_MAX_VLS_SUPPORTED; +module_param(num_vls, uint, S_IRUGO); +MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)"); + +/* + * Default time to aggregate two 10K packets from the idle state + * (timer not running). The timer starts at the end of the first packet, + * so only the time for one 10K packet and header plus a bit extra is needed. 
+ * 10 * 1024 + 64 header byte = 10304 byte + * 10304 byte / 12.5 GB/s = 824.32ns + */ +uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */ +module_param(rcv_intr_timeout, uint, S_IRUGO); +MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns"); + +uint rcv_intr_count = 16; /* same as qib */ +module_param(rcv_intr_count, uint, S_IRUGO); +MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count"); + +ushort link_crc_mask = SUPPORTED_CRCS; +module_param(link_crc_mask, ushort, S_IRUGO); +MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link"); + +uint loopback; +module_param_named(loopback, loopback, uint, S_IRUGO); +MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable"); + +/* Other driver tunables */ +uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/ +static ushort crc_14b_sideband = 1; +static uint use_flr = 1; +uint quick_linkup; /* skip LNI */ + +struct flag_table { + u64 flag; /* the flag */ + char *str; /* description string */ + u16 extra; /* extra information */ + u16 unused0; + u32 unused1; +}; + +/* str must be a string constant */ +#define FLAG_ENTRY(str, extra, flag) {flag, str, extra} +#define FLAG_ENTRY0(str, flag) {flag, str, 0} + +/* Send Error Consequences */ +#define SEC_WRITE_DROPPED 0x1 +#define SEC_PACKET_DROPPED 0x2 +#define SEC_SC_HALTED 0x4 /* per-context only */ +#define SEC_SPC_FREEZE 0x8 /* per-HFI only */ + +#define DEFAULT_KRCVQS 2 +#define MIN_KERNEL_KCTXTS 2 +#define FIRST_KERNEL_KCTXT 1 + +/* + * RSM instance allocation + * 0 - Verbs + * 1 - User Fecn Handling + * 2 - Vnic + */ +#define RSM_INS_VERBS 0 +#define RSM_INS_FECN 1 +#define RSM_INS_VNIC 2 + +/* Bit offset into the GUID which carries HFI id information */ +#define GUID_HFI_INDEX_SHIFT 39 + +/* extract the emulation revision */ +#define emulator_rev(dd) ((dd)->irev >> 8) +/* parallel and serial emulation versions are 3 and 4 respectively */ +#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3) +#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4) + +/* RSM fields for Verbs */ +/* packet type */ +#define IB_PACKET_TYPE 2ull +#define QW_SHIFT 6ull +/* QPN[7..1] */ +#define QPN_WIDTH 7ull + +/* LRH.BTH: QW 0, OFFSET 48 - for match */ +#define LRH_BTH_QW 0ull +#define LRH_BTH_BIT_OFFSET 48ull +#define LRH_BTH_OFFSET(off) ((LRH_BTH_QW << QW_SHIFT) | (off)) +#define LRH_BTH_MATCH_OFFSET LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET) +#define LRH_BTH_SELECT +#define LRH_BTH_MASK 3ull +#define LRH_BTH_VALUE 2ull + +/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */ +#define LRH_SC_QW 0ull +#define LRH_SC_BIT_OFFSET 56ull +#define LRH_SC_OFFSET(off) ((LRH_SC_QW << QW_SHIFT) | (off)) +#define LRH_SC_MATCH_OFFSET LRH_SC_OFFSET(LRH_SC_BIT_OFFSET) +#define LRH_SC_MASK 128ull +#define LRH_SC_VALUE 0ull + +/* SC[n..0] QW 0, OFFSET 60 - for select */ +#define LRH_SC_SELECT_OFFSET ((LRH_SC_QW << QW_SHIFT) | (60ull)) + +/* QPN[m+n:1] QW 1, OFFSET 1 */ +#define QPN_SELECT_OFFSET ((1ull << QW_SHIFT) | (1ull)) + +/* RSM fields for Vnic */ +/* L2_TYPE: QW 0, OFFSET 61 - for match */ +#define L2_TYPE_QW 0ull +#define L2_TYPE_BIT_OFFSET 61ull +#define L2_TYPE_OFFSET(off) ((L2_TYPE_QW << QW_SHIFT) | (off)) +#define L2_TYPE_MATCH_OFFSET L2_TYPE_OFFSET(L2_TYPE_BIT_OFFSET) +#define L2_TYPE_MASK 3ull +#define L2_16B_VALUE 2ull + +/* L4_TYPE QW 1, OFFSET 0 - for match */ +#define L4_TYPE_QW 1ull +#define L4_TYPE_BIT_OFFSET 0ull +#define L4_TYPE_OFFSET(off) ((L4_TYPE_QW << QW_SHIFT) | (off)) +#define L4_TYPE_MATCH_OFFSET 
L4_TYPE_OFFSET(L4_TYPE_BIT_OFFSET) +#define L4_16B_TYPE_MASK 0xFFull +#define L4_16B_ETH_VALUE 0x78ull + +/* 16B VESWID - for select */ +#define L4_16B_HDR_VESWID_OFFSET ((2 << QW_SHIFT) | (16ull)) +/* 16B ENTROPY - for select */ +#define L2_16B_ENTROPY_OFFSET ((1 << QW_SHIFT) | (32ull)) + +/* defines to build power on SC2VL table */ +#define SC2VL_VAL( \ + num, \ + sc0, sc0val, \ + sc1, sc1val, \ + sc2, sc2val, \ + sc3, sc3val, \ + sc4, sc4val, \ + sc5, sc5val, \ + sc6, sc6val, \ + sc7, sc7val) \ +( \ + ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \ + ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \ + ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \ + ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \ + ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \ + ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \ + ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \ + ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT) \ +) + +#define DC_SC_VL_VAL( \ + range, \ + e0, e0val, \ + e1, e1val, \ + e2, e2val, \ + e3, e3val, \ + e4, e4val, \ + e5, e5val, \ + e6, e6val, \ + e7, e7val, \ + e8, e8val, \ + e9, e9val, \ + e10, e10val, \ + e11, e11val, \ + e12, e12val, \ + e13, e13val, \ + e14, e14val, \ + e15, e15val) \ +( \ + ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \ + ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \ + ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \ + ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \ + ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \ + ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \ + ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \ + ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \ + ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \ + ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \ + ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \ + ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \ + ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \ + ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \ + ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \ + ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \ +) + +/* all CceStatus sub-block freeze bits */ +#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \ + | CCE_STATUS_RXE_FROZE_SMASK \ + | CCE_STATUS_TXE_FROZE_SMASK \ + | CCE_STATUS_TXE_PIO_FROZE_SMASK) +/* all CceStatus sub-block TXE pause bits */ +#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \ + | CCE_STATUS_TXE_PAUSED_SMASK \ + | CCE_STATUS_SDMA_PAUSED_SMASK) +/* all CceStatus sub-block RXE pause bits */ +#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK + +#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL +#define CNTR_32BIT_MAX 0x00000000FFFFFFFF + +/* + * CCE Error flags. 
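The RSM match/select OFFSET macros above pack a quad-word index and a bit offset into a single field: value = (QW << QW_SHIFT) | bit_offset, with QW_SHIFT = 6. Working a few of the defines through by hand, as a standalone check and illustration only:

/* Illustration: the (QW << QW_SHIFT) | offset packing used by the RSM
 * match/select offsets above.
 */
#include <assert.h>
#include <stdio.h>

#define QW_SHIFT        6ull
#define OFFSET(qw, bit) (((qw) << QW_SHIFT) | (bit))

int main(void)
{
        assert(OFFSET(0, 48) == 48);    /* LRH_BTH_MATCH_OFFSET     */
        assert(OFFSET(0, 56) == 56);    /* LRH_SC_MATCH_OFFSET      */
        assert(OFFSET(0, 60) == 60);    /* LRH_SC_SELECT_OFFSET     */
        assert(OFFSET(1, 1) == 65);     /* QPN_SELECT_OFFSET        */
        assert(OFFSET(2, 16) == 144);   /* L4_16B_HDR_VESWID_OFFSET */
        puts("ok");
        return 0;
}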
+ */ +static struct flag_table cce_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("CceCsrParityErr", + CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK), +/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr", + CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK), +/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr", + CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK), +/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr", + CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr", + CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK), +/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr", + CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK), +/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr", + CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr", + CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK), +/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr", + CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK), +/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK), +/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK), +/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK), +/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr", + CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK), +/*14*/ FLAG_ENTRY0("PcicRetryMemCorErr", + CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK), +/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK), +/*16*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK), +/*17*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK), +/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr", + CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK), +/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr", + CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK), +/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr", + CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK), +/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr", + CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK), +/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr", + CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK), +/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr", + CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK), +/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr", + CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK), +/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr", + CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK), +/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr", + CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK), +/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr", + CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK), +/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr", + CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK), +/*29*/ FLAG_ENTRY0("PcicReceiveParityErr", + CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK), +/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr", + CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK), +/*31*/ FLAG_ENTRY0("LATriggered", + CCE_ERR_STATUS_LA_TRIGGERED_SMASK), +/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr", + CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK), +/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr", + CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK), +/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr", + CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK), +/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr", + CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK), +/*36*/ 
FLAG_ENTRY0("CceMsixTableCorErr", + CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK), +/*37*/ FLAG_ENTRY0("CceMsixTableUncErr", + CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK), +/*38*/ FLAG_ENTRY0("CceIntMapCorErr", + CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK), +/*39*/ FLAG_ENTRY0("CceIntMapUncErr", + CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK), +/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr", + CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK), +/*41-63 reserved*/ +}; + +/* + * Misc Error flags + */ +#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK +static struct flag_table misc_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)), +/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)), +/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)), +/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)), +/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)), +/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)), +/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)), +/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)), +/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)), +/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)), +/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)), +/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)), +/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL)) +}; + +/* + * TXE PIO Error flags and consequences + */ +static struct flag_table pio_err_status_flags[] = { +/* 0*/ FLAG_ENTRY("PioWriteBadCtxt", + SEC_WRITE_DROPPED, + SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK), +/* 1*/ FLAG_ENTRY("PioWriteAddrParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK), +/* 2*/ FLAG_ENTRY("PioCsrParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK), +/* 3*/ FLAG_ENTRY("PioSbMemFifo0", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK), +/* 4*/ FLAG_ENTRY("PioSbMemFifo1", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK), +/* 5*/ FLAG_ENTRY("PioPccFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK), +/* 6*/ FLAG_ENTRY("PioPecFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK), +/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK), +/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK), +/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK), +/*10*/ FLAG_ENTRY("PioSmPktResetParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK), +/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK), +/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK), +/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor", + 0, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK), +/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor", + 0, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK), +/*15*/ FLAG_ENTRY("PioCreditRetFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK), +/*16*/ FLAG_ENTRY("PioPpmcPblFifo", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK), +/*17*/ FLAG_ENTRY("PioInitSmIn", + 0, + SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK), +/*18*/ 
FLAG_ENTRY("PioPktEvictSmOrArbSm", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK), +/*19*/ FLAG_ENTRY("PioHostAddrMemUnc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK), +/*20*/ FLAG_ENTRY("PioHostAddrMemCor", + 0, + SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK), +/*21*/ FLAG_ENTRY("PioWriteDataParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK), +/*22*/ FLAG_ENTRY("PioStateMachine", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK), +/*23*/ FLAG_ENTRY("PioWriteQwValidParity", + SEC_WRITE_DROPPED | SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK), +/*24*/ FLAG_ENTRY("PioBlockQwCountParity", + SEC_WRITE_DROPPED | SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK), +/*25*/ FLAG_ENTRY("PioVlfVlLenParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK), +/*26*/ FLAG_ENTRY("PioVlfSopParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK), +/*27*/ FLAG_ENTRY("PioVlFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK), +/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK), +/*29*/ FLAG_ENTRY("PioPpmcSopLen", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK), +/*30-31 reserved*/ +/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK), +/*33*/ FLAG_ENTRY("PioLastReturnedCntParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK), +/*34*/ FLAG_ENTRY("PioPccSopHeadParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK), +/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK), +/*36-63 reserved*/ +}; + +/* TXE PIO errors that cause an SPC freeze */ +#define ALL_PIO_FREEZE_ERR \ + (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \ + | 
SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK) + +/* + * TXE SDMA Error flags + */ +static struct flag_table sdma_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", + SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK), +/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr", + SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK), +/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr", + SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK), +/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr", + SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK), +/*04-63 reserved*/ +}; + +/* TXE SDMA errors that cause an SPC freeze */ +#define ALL_SDMA_FREEZE_ERR \ + (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \ + | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \ + | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK) + +/* SendEgressErrInfo bits that correspond to a PortXmitDiscard counter */ +#define PORT_DISCARD_EGRESS_ERRS \ + (SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK \ + | SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK \ + | SEND_EGRESS_ERR_INFO_VL_ERR_SMASK) + +/* + * TXE Egress Error flags + */ +#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK +static struct flag_table egress_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)), +/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)), +/* 2 reserved */ +/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr", + SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)), +/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)), +/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)), +/* 6 reserved */ +/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr", + SEES(TX_PIO_LAUNCH_INTF_PARITY)), +/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr", + SEES(TX_SDMA_LAUNCH_INTF_PARITY)), +/* 9-10 reserved */ +/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr", + SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)), +/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)), +/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)), +/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)), +/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)), +/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr", + SEES(TX_SDMA0_DISALLOWED_PACKET)), +/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr", + SEES(TX_SDMA1_DISALLOWED_PACKET)), +/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr", + SEES(TX_SDMA2_DISALLOWED_PACKET)), +/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr", + SEES(TX_SDMA3_DISALLOWED_PACKET)), +/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr", + SEES(TX_SDMA4_DISALLOWED_PACKET)), +/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr", + SEES(TX_SDMA5_DISALLOWED_PACKET)), +/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr", + SEES(TX_SDMA6_DISALLOWED_PACKET)), +/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr", + SEES(TX_SDMA7_DISALLOWED_PACKET)), +/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr", + SEES(TX_SDMA8_DISALLOWED_PACKET)), +/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr", + SEES(TX_SDMA9_DISALLOWED_PACKET)), +/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr", + SEES(TX_SDMA10_DISALLOWED_PACKET)), +/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr", + SEES(TX_SDMA11_DISALLOWED_PACKET)), +/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr", + SEES(TX_SDMA12_DISALLOWED_PACKET)), +/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr", + 
SEES(TX_SDMA13_DISALLOWED_PACKET)), +/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr", + SEES(TX_SDMA14_DISALLOWED_PACKET)), +/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr", + SEES(TX_SDMA15_DISALLOWED_PACKET)), +/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr", + SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)), +/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr", + SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)), +/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr", + SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)), +/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr", + SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)), +/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr", + SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)), +/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr", + SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)), +/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr", + SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)), +/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr", + SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)), +/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr", + SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)), +/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)), +/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)), +/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)), +/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)), +/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)), +/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)), +/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)), +/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)), +/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)), +/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)), +/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)), +/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)), +/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)), +/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)), +/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)), +/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)), +/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)), +/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)), +/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)), +/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)), +/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)), +/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr", + SEES(TX_READ_SDMA_MEMORY_CSR_UNC)), +/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr", + SEES(TX_READ_PIO_MEMORY_CSR_UNC)), +}; + +/* + * TXE Egress Error Info flags + */ +#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK +static struct flag_table egress_err_info_flags[] = { +/* 0*/ FLAG_ENTRY0("Reserved", 0ull), +/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)), +/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)), +/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)), +/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)), +/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)), +/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)), +/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)), +/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)), +/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)), +/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)), +/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)), +/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)), +/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)), 
+/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)), +/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)), +/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)), +/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)), +/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)), +/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)), +/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)), +/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)), +}; + +/* TXE Egress errors that cause an SPC freeze */ +#define ALL_TXE_EGRESS_FREEZE_ERR \ + (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \ + | SEES(TX_PIO_LAUNCH_INTF_PARITY) \ + | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \ + | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \ + | SEES(TX_LAUNCH_CSR_PARITY) \ + | SEES(TX_SBRD_CTL_CSR_PARITY) \ + | SEES(TX_CONFIG_PARITY) \ + | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \ + | SEES(TX_CREDIT_RETURN_PARITY)) + +/* + * TXE Send error flags + */ +#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK +static struct flag_table send_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)), +/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)), +/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR)) +}; + +/* + * TXE Send Context Error flags and consequences + */ +static struct flag_table sc_err_status_flags[] = { +/* 0*/ FLAG_ENTRY("InconsistentSop", + SEC_PACKET_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK), +/* 1*/ FLAG_ENTRY("DisallowedPacket", + SEC_PACKET_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK), +/* 2*/ FLAG_ENTRY("WriteCrossesBoundary", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK), +/* 3*/ FLAG_ENTRY("WriteOverflow", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK), +/* 4*/ FLAG_ENTRY("WriteOutOfBounds", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK), +/* 5-63 reserved*/ +}; + +/* + * RXE Receive Error flags + */ +#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK +static struct flag_table rxe_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)), +/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)), +/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)), +/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)), +/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)), +/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)), +/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)), +/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)), +/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)), +/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)), +/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)), +/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)), +/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)), +/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)), +/*14*/ 
FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)), +/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)), +/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr", + RXES(RBUF_LOOKUP_DES_REG_UNC_COR)), +/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)), +/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)), +/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr", + RXES(RBUF_BLOCK_LIST_READ_UNC)), +/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr", + RXES(RBUF_BLOCK_LIST_READ_COR)), +/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr", + RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)), +/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr", + RXES(RBUF_CSR_QENT_CNT_PARITY)), +/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr", + RXES(RBUF_CSR_QNEXT_BUF_PARITY)), +/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr", + RXES(RBUF_CSR_QVLD_BIT_PARITY)), +/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)), +/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)), +/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr", + RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)), +/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)), +/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)), +/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)), +/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)), +/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)), +/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)), +/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)), +/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr", + RXES(RBUF_FL_INITDONE_PARITY)), +/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr", + RXES(RBUF_FL_INIT_WR_ADDR_PARITY)), +/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)), +/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)), +/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)), +/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr", + RXES(LOOKUP_DES_PART1_UNC_COR)), +/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr", + RXES(LOOKUP_DES_PART2_PARITY)), +/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)), +/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)), +/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)), +/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)), +/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)), +/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)), +/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)), +/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)), +/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)), +/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)), +/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)), +/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)), +/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)), +/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)), +/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)), +/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)), +/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)), +/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)), +/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)), +/*61*/ 
FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)), +/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)), +/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY)) +}; + +/* RXE errors that will trigger an SPC freeze */ +#define ALL_RXE_FREEZE_ERR \ + (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK) + +#define RXE_FREEZE_ABORT_MASK \ + (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \ + RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \ + RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK) + +/* + * DCC Error Flags + */ +#define DCCE(name) DCC_ERR_FLG_##name##_SMASK +static struct flag_table dcc_err_flags[] = { + FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)), + FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)), + FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)), + FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)), + FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)), + FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)), + FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)), + FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)), + FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)), + FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)), + 
FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)), + FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)), + FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)), + FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)), + FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)), + FLAG_ENTRY0("link_err", DCCE(LINK_ERR)), + FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)), + FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)), + FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)), + FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)), + FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)), + FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)), + FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)), + FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)), + FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)), + FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)), + FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)), + FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)), + FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)), + FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)), + FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)), + FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)), + FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)), + FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)), + FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)), + FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)), + FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)), + FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)), + FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)), + FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)), + FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)), + FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)), + FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)), + FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)), + FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)), + FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)), +}; + +/* + * LCB error flags + */ +#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK +static struct flag_table lcb_err_flags[] = { +/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)), +/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)), +/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)), +/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST", + LCBE(ALL_LNS_FAILED_REINIT_TEST)), +/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)), +/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)), +/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)), +/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)), +/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)), +/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)), +/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)), +/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)), +/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)), +/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER", + LCBE(UNEXPECTED_ROUND_TRIP_MARKER)), +/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)), +/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)), +/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)), +/*17*/ 
FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)), +/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)), +/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE", + LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)), +/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)), +/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)), +/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)), +/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)), +/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)), +/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)), +/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP", + LCBE(RST_FOR_INCOMPLT_RND_TRIP)), +/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)), +/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE", + LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)), +/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR", + LCBE(REDUNDANT_FLIT_PARITY_ERR)) +}; + +/* + * DC8051 Error Flags + */ +#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK +static struct flag_table dc8051_err_flags[] = { + FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)), + FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)), + FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)), + FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)), + FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)), + FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)), + FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)), + FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)), + FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES", + D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)), + FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)), +}; + +/* + * DC8051 Information Error flags + * + * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field. + */ +static struct flag_table dc8051_info_err_flags[] = { + FLAG_ENTRY0("Spico ROM check failed", SPICO_ROM_FAILED), + FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME), + FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET), + FLAG_ENTRY0("Serdes internal loopback failure", + FAILED_SERDES_INTERNAL_LOOPBACK), + FLAG_ENTRY0("Failed SerDes init", FAILED_SERDES_INIT), + FLAG_ENTRY0("Failed LNI(Polling)", FAILED_LNI_POLLING), + FLAG_ENTRY0("Failed LNI(Debounce)", FAILED_LNI_DEBOUNCE), + FLAG_ENTRY0("Failed LNI(EstbComm)", FAILED_LNI_ESTBCOMM), + FLAG_ENTRY0("Failed LNI(OptEq)", FAILED_LNI_OPTEQ), + FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1), + FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2), + FLAG_ENTRY0("Failed LNI(ConfigLT)", FAILED_LNI_CONFIGLT), + FLAG_ENTRY0("Host Handshake Timeout", HOST_HANDSHAKE_TIMEOUT), + FLAG_ENTRY0("External Device Request Timeout", + EXTERNAL_DEVICE_REQ_TIMEOUT), +}; + +/* + * DC8051 Information Host Information flags + * + * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field. 
+ */ +static struct flag_table dc8051_info_host_msg_flags[] = { + FLAG_ENTRY0("Host request done", 0x0001), + FLAG_ENTRY0("BC PWR_MGM message", 0x0002), + FLAG_ENTRY0("BC SMA message", 0x0004), + FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008), + FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010), + FLAG_ENTRY0("External device config request", 0x0020), + FLAG_ENTRY0("VerifyCap all frames received", 0x0040), + FLAG_ENTRY0("LinkUp achieved", 0x0080), + FLAG_ENTRY0("Link going down", 0x0100), + FLAG_ENTRY0("Link width downgraded", 0x0200), +}; + +static u32 encoded_size(u32 size); +static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate); +static int set_physical_link_state(struct hfi1_devdata *dd, u64 state); +static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management, + u8 *continuous); +static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z, + u8 *vcu, u16 *vl15buf, u8 *crc_sizes); +static void read_vc_remote_link_width(struct hfi1_devdata *dd, + u8 *remote_tx_rate, u16 *link_widths); +static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths); +static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, + u8 *device_rev); +static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx); +static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx, + u8 *tx_polarity_inversion, + u8 *rx_polarity_inversion, u8 *max_rate); +static void handle_sdma_eng_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg); +static void handle_dcc_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_lcb_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void set_partition_keys(struct hfi1_pportdata *ppd); +static const char *link_state_name(u32 state); +static const char *link_state_reason_name(struct hfi1_pportdata *ppd, + u32 state); +static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, + u64 *out_data); +static int read_idle_sma(struct hfi1_devdata *dd, u64 *data); +static int thermal_init(struct hfi1_devdata *dd); + +static void update_statusp(struct hfi1_pportdata *ppd, u32 state); +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs); +static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs); +static void log_state_transition(struct hfi1_pportdata *ppd, u32 state); +static void log_physical_state(struct hfi1_pportdata *ppd, u32 state); +static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs); +static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc); +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr); +static void handle_temp_err(struct hfi1_devdata *dd); +static void 
dc_shutdown(struct hfi1_devdata *dd); +static void dc_start(struct hfi1_devdata *dd); +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np); +static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd); +static int wait_link_transfer_active(struct hfi1_devdata *dd, int wait_ms); +static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index); +static void update_xmit_counters(struct hfi1_pportdata *ppd, u16 link_width); + +/* + * Error interrupt table entry. This is used as input to the interrupt + * "clear down" routine used for all second tier error interrupt register. + * Second tier interrupt registers have a single bit representing them + * in the top-level CceIntStatus. + */ +struct err_reg_info { + u32 status; /* status CSR offset */ + u32 clear; /* clear CSR offset */ + u32 mask; /* mask CSR offset */ + void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg); + const char *desc; +}; + +#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START) +#define NUM_DC_ERRS (IS_DC_END - IS_DC_START) +#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START) + +/* + * Helpers for building HFI and DC error interrupt table entries. Different + * helpers are needed because of inconsistent register names. + */ +#define EE(reg, handler, desc) \ + { reg##_STATUS, reg##_CLEAR, reg##_MASK, \ + handler, desc } +#define DC_EE1(reg, handler, desc) \ + { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc } +#define DC_EE2(reg, handler, desc) \ + { reg##_FLG, reg##_CLR, reg##_EN, handler, desc } + +/* + * Table of the "misc" grouping of error interrupts. Each entry refers to + * another register containing more information. + */ +static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = { +/* 0*/ EE(CCE_ERR, handle_cce_err, "CceErr"), +/* 1*/ EE(RCV_ERR, handle_rxe_err, "RxeErr"), +/* 2*/ EE(MISC_ERR, handle_misc_err, "MiscErr"), +/* 3*/ { 0, 0, 0, NULL }, /* reserved */ +/* 4*/ EE(SEND_PIO_ERR, handle_pio_err, "PioErr"), +/* 5*/ EE(SEND_DMA_ERR, handle_sdma_err, "SDmaErr"), +/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"), +/* 7*/ EE(SEND_ERR, handle_txe_err, "TxeErr") + /* the rest are reserved */ +}; + +/* + * Index into the Various section of the interrupt sources + * corresponding to the Critical Temperature interrupt. + */ +#define TCRIT_INT_SOURCE 4 + +/* + * SDMA error interrupt entry - refers to another register containing more + * information. + */ +static const struct err_reg_info sdma_eng_err = + EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr"); + +static const struct err_reg_info various_err[NUM_VARIOUS] = { +/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */ +/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */ +/* 2*/ EE(ASIC_QSFP1, handle_qsfp_int, "QSFP1"), +/* 3*/ EE(ASIC_QSFP2, handle_qsfp_int, "QSFP2"), +/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */ + /* rest are reserved */ +}; + +/* + * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG + * register can not be derived from the MTU value because 10K is not + * a power of 2. Therefore, we need a constant. Everything else can + * be calculated. + */ +#define DCC_CFG_PORT_MTU_CAP_10240 7 + +/* + * Table of the DC grouping of error interrupts. Each entry refers to + * another register containing more information. 
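+ *
+ * Like the tables above, these entries feed the common second-tier
+ * "clear down" handling.  A minimal sketch of that flow (illustrative
+ * only, not the routine added by this patch; "eri" and "source" are
+ * placeholders for a table entry and its interrupt source):
+ *
+ *	u64 reg = read_csr(dd, eri->status);
+ *	if (reg) {
+ *		if (eri->handler)
+ *			eri->handler(dd, source, reg);
+ *		write_csr(dd, eri->clear, reg);
+ *	}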
+ */ +static const struct err_reg_info dc_errs[NUM_DC_ERRS] = { +/* 0*/ DC_EE1(DCC_ERR, handle_dcc_err, "DCC Err"), +/* 1*/ DC_EE2(DC_LCB_ERR, handle_lcb_err, "LCB Err"), +/* 2*/ DC_EE2(DC_DC8051_ERR, handle_8051_interrupt, "DC8051 Interrupt"), +/* 3*/ /* dc_lbm_int - special, see is_dc_int() */ + /* the rest are reserved */ +}; + +struct cntr_entry { + /* + * counter name + */ + char *name; + + /* + * csr to read for name (if applicable) + */ + u64 csr; + + /* + * offset into dd or ppd to store the counter's value + */ + int offset; + + /* + * flags + */ + u8 flags; + + /* + * accessor for stat element, context either dd or ppd + */ + u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl, + int mode, u64 data); +}; + +#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0 +#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159 + +#define CNTR_ELEM(name, csr, offset, flags, accessor) \ +{ \ + name, \ + csr, \ + offset, \ + flags, \ + accessor \ +} + +/* 32bit RXE */ +#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + port_access_u32_csr) + +#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +/* 64bit RXE */ +#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY64), \ + 0, flags, \ + port_access_u64_csr) + +#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY64), \ + 0, flags, \ + dev_access_u64_csr) + +#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx +#define OVR_ELM(ctx) \ +CNTR_ELEM("RcvHdrOvr" #ctx, \ + (RCV_HDR_OVFL_CNT + ctx * 0x100), \ + 0, CNTR_NORMAL, port_access_u64_csr) + +/* 32bit TXE */ +#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + SEND_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + port_access_u32_csr) + +/* 64bit TXE */ +#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + SEND_COUNTER_ARRAY64), \ + 0, flags, \ + port_access_u64_csr) + +# define TX64_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name,\ + counter * 8 + SEND_COUNTER_ARRAY64, \ + 0, \ + flags, \ + dev_access_u64_csr) + +/* CCE */ +#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + CCE_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + CCE_INT_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +/* DC */ +#define DC_PERF_CNTR(name, counter, flags) \ +CNTR_ELEM(#name, \ + counter, \ + 0, \ + flags, \ + dev_access_u64_csr) + +#define DC_PERF_CNTR_LCB(name, counter, flags) \ +CNTR_ELEM(#name, \ + counter, \ + 0, \ + flags, \ + dc_access_lcb_cntr) + +/* ibp counters */ +#define SW_IBP_CNTR(name, cntr) \ +CNTR_ELEM(#name, \ + 0, \ + 0, \ + CNTR_SYNTH, \ + access_ibp_##cntr) + +/** + * hfi_addr_from_offset - return addr for readq/writeq + * @dd - the dd device + * @offset - the offset of the CSR within bar0 + * + * This routine selects the appropriate base address + * based on the indicated offset. 
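+ *
+ * For example, read_csr() below resolves its CSR with
+ *	readq(hfi1_addr_from_offset(dd, offset));
+ * so offsets at or above dd->base2_start land in kregbase2 and all
+ * lower offsets in kregbase1.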
+ */ +static inline void __iomem *hfi1_addr_from_offset( + const struct hfi1_devdata *dd, + u32 offset) +{ + if (offset >= dd->base2_start) + return dd->kregbase2 + (offset - dd->base2_start); + return dd->kregbase1 + offset; +} + +/** + * read_csr - read CSR at the indicated offset + * @dd - the dd device + * @offset - the offset of the CSR within bar0 + * + * Return: the value read or all FF's if there + * is no mapping + */ +u64 read_csr(const struct hfi1_devdata *dd, u32 offset) +{ + if (dd->flags & HFI1_PRESENT) + return readq(hfi1_addr_from_offset(dd, offset)); + return -1; +} + +/** + * write_csr - write CSR at the indicated offset + * @dd - the dd device + * @offset - the offset of the CSR within bar0 + * @value - value to write + */ +void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value) +{ + if (dd->flags & HFI1_PRESENT) { + void __iomem *base = hfi1_addr_from_offset(dd, offset); + + /* avoid write to RcvArray */ + if (WARN_ON(offset >= RCV_ARRAY && offset < dd->base2_start)) + return; + writeq(value, base); + } +} + +/** + * get_csr_addr - return te iomem address for offset + * @dd - the dd device + * @offset - the offset of the CSR within bar0 + * + * Return: The iomem address to use in subsequent + * writeq/readq operations. + */ +void __iomem *get_csr_addr( + const struct hfi1_devdata *dd, + u32 offset) +{ + if (dd->flags & HFI1_PRESENT) + return hfi1_addr_from_offset(dd, offset); + return NULL; +} + +static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr, + int mode, u64 value) +{ + u64 ret; + + if (mode == CNTR_MODE_R) { + ret = read_csr(dd, csr); + } else if (mode == CNTR_MODE_W) { + write_csr(dd, csr, value); + ret = value; + } else { + dd_dev_err(dd, "Invalid cntr register access mode"); + return 0; + } + + hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode); + return ret; +} + +/* Dev Access */ +static u64 dev_access_u32_csr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + u64 csr = entry->csr; + + if (entry->flags & CNTR_SDMA) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 0x100 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + return read_write_csr(dd, csr, mode, data); +} + +static u64 access_sde_err_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].err_cnt; + return 0; +} + +static u64 access_sde_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].sdma_int_cnt; + return 0; +} + +static u64 access_sde_idle_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].idle_int_cnt; + return 0; +} + +static u64 access_sde_progress_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].progress_int_cnt; + return 0; +} + +static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + u64 val = 0; + u64 csr = 
entry->csr; + + if (entry->flags & CNTR_VL) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 8 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + + val = read_write_csr(dd, csr, mode, data); + return val; +} + +static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + u32 csr = entry->csr; + int ret = 0; + + if (vl != CNTR_INVALID_VL) + return 0; + if (mode == CNTR_MODE_R) + ret = read_lcb_csr(dd, csr, &data); + else if (mode == CNTR_MODE_W) + ret = write_lcb_csr(dd, csr, data); + + if (ret) { + dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr); + return 0; + } + + hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode); + return data; +} + +/* Port Access */ +static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_csr(ppd->dd, entry->csr, mode, data); +} + +static u64 port_access_u64_csr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + u64 val; + u64 csr = entry->csr; + + if (entry->flags & CNTR_VL) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 8 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + val = read_write_csr(ppd->dd, csr, mode, data); + return val; +} + +/* Software defined */ +static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode, + u64 data) +{ + u64 ret; + + if (mode == CNTR_MODE_R) { + ret = *cntr; + } else if (mode == CNTR_MODE_W) { + *cntr = data; + ret = data; + } else { + dd_dev_err(dd, "Invalid cntr sw access mode"); + return 0; + } + + hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode); + + return ret; +} + +static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->link_downed, mode, data); +} + +static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->link_up, mode, data); +} + +static u64 access_sw_unknown_frame_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->unknown_frame_count, mode, data); +} + +static u64 access_sw_xmit_discards(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + u64 zero = 0; + u64 *counter; + + if (vl == CNTR_INVALID_VL) + counter = &ppd->port_xmit_discards; + else if (vl >= 0 && vl < C_VL_COUNT) + counter = &ppd->port_xmit_discards_vl[vl]; + else + counter = &zero; + + return read_write_sw(ppd->dd, counter, mode, data); +} + +static u64 access_xmit_constraint_errs(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + + return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors, + mode, data); +} + +static u64 access_rcv_constraint_errs(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + 
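+	/* software-maintained count; per port only, no per-VL copy */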
struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + + return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors, + mode, data); +} + +u64 get_all_cpu_total(u64 __percpu *cntr) +{ + int cpu; + u64 counter = 0; + + for_each_possible_cpu(cpu) + counter += *per_cpu_ptr(cntr, cpu); + return counter; +} + +static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val, + u64 __percpu *cntr, + int vl, int mode, u64 data) +{ + u64 ret = 0; + + if (vl != CNTR_INVALID_VL) + return 0; + + if (mode == CNTR_MODE_R) { + ret = get_all_cpu_total(cntr) - *z_val; + } else if (mode == CNTR_MODE_W) { + /* A write can only zero the counter */ + if (data == 0) + *z_val = get_all_cpu_total(cntr); + else + dd_dev_err(dd, "Per CPU cntrs can only be zeroed"); + } else { + dd_dev_err(dd, "Invalid cntr sw cpu access mode"); + return 0; + } + + return ret; +} + +static u64 access_sw_cpu_intr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl, + mode, data); +} + +static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl, + mode, data); +} + +static u64 access_sw_pio_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_piowait; +} + +static u64 access_sw_pio_drain(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->verbs_dev.n_piodrain; +} + +static u64 access_sw_vtx_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_txwait; +} + +static u64 access_sw_kmem_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_kmem_wait; +} + +static u64 access_sw_send_schedule(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return read_write_cpu(dd, &dd->z_send_schedule, dd->send_schedule, vl, + mode, data); +} + +/* Software counters for the error status bits within MISC_ERR_STATUS */ +static u64 access_misc_pll_lock_fail_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[12]; +} + +static u64 access_misc_mbist_fail_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[11]; +} + +static u64 access_misc_invalid_eep_cmd_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[10]; +} + +static u64 access_misc_efuse_done_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[9]; +} + +static u64 access_misc_efuse_write_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + 
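+	/* software shadow of the MISC_ERR_STATUS EfuseWriteErr bit */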
struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[8]; +} + +static u64 access_misc_efuse_read_bad_addr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[7]; +} + +static u64 access_misc_efuse_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[6]; +} + +static u64 access_misc_fw_auth_failed_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[5]; +} + +static u64 access_misc_key_mismatch_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[4]; +} + +static u64 access_misc_sbus_write_failed_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[3]; +} + +static u64 access_misc_csr_write_bad_addr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[2]; +} + +static u64 access_misc_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[1]; +} + +static u64 access_misc_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[0]; +} + +/* + * Software counter for the aggregate of + * individual CceErrStatus counters + */ +static u64 access_sw_cce_err_status_aggregated_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_cce_err_status_aggregate; +} + +/* + * Software counters corresponding to each of the + * error status bits within CceErrStatus + */ +static u64 access_cce_msix_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[40]; +} + +static u64 access_cce_int_map_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[39]; +} + +static u64 access_cce_int_map_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[38]; +} + +static u64 access_cce_msix_table_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[37]; +} + +static u64 access_cce_msix_table_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return 
dd->cce_err_status_cnt[36]; +} + +static u64 access_cce_rxdma_conv_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[35]; +} + +static u64 access_cce_rcpl_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[34]; +} + +static u64 access_cce_seg_write_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[33]; +} + +static u64 access_cce_seg_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[32]; +} + +static u64 access_la_triggered_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[31]; +} + +static u64 access_cce_trgt_cpl_timeout_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[30]; +} + +static u64 access_pcic_receive_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[29]; +} + +static u64 access_pcic_transmit_back_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[28]; +} + +static u64 access_pcic_transmit_front_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[27]; +} + +static u64 access_pcic_cpl_dat_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[26]; +} + +static u64 access_pcic_cpl_hd_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[25]; +} + +static u64 access_pcic_post_dat_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[24]; +} + +static u64 access_pcic_post_hd_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[23]; +} + +static u64 access_pcic_retry_sot_mem_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[22]; +} + +static u64 access_pcic_retry_mem_unc_err(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[21]; +} + +static 
u64 access_pcic_n_post_dat_q_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[20]; +} + +static u64 access_pcic_n_post_h_q_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[19]; +} + +static u64 access_pcic_cpl_dat_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[18]; +} + +static u64 access_pcic_cpl_hd_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[17]; +} + +static u64 access_pcic_post_dat_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[16]; +} + +static u64 access_pcic_post_hd_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[15]; +} + +static u64 access_pcic_retry_sot_mem_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[14]; +} + +static u64 access_pcic_retry_mem_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[13]; +} + +static u64 access_cce_cli1_async_fifo_dbg_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[12]; +} + +static u64 access_cce_cli1_async_fifo_rxdma_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[11]; +} + +static u64 access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[10]; +} + +static u64 access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[9]; +} + +static u64 access_cce_cli2_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[8]; +} + +static u64 access_cce_csr_cfg_bus_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[7]; +} + +static u64 access_cce_cli0_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return 
dd->cce_err_status_cnt[6]; +} + +static u64 access_cce_rspd_data_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[5]; +} + +static u64 access_cce_trgt_access_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[4]; +} + +static u64 access_cce_trgt_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[3]; +} + +static u64 access_cce_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[2]; +} + +static u64 access_cce_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[1]; +} + +static u64 access_ccs_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within RcvErrStatus + */ +static u64 access_rx_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[63]; +} + +static u64 access_rx_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[62]; +} + +static u64 access_rx_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[61]; +} + +static u64 access_rx_dma_csr_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[60]; +} + +static u64 access_rx_dma_dq_fsm_encoding_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[59]; +} + +static u64 access_rx_dma_eq_fsm_encoding_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[58]; +} + +static u64 access_rx_dma_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[57]; +} + +static u64 access_rx_rbuf_data_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[56]; +} + +static u64 access_rx_rbuf_data_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct 
hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[55]; +} + +static u64 access_rx_dma_data_fifo_rd_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[54]; +} + +static u64 access_rx_dma_data_fifo_rd_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[53]; +} + +static u64 access_rx_dma_hdr_fifo_rd_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[52]; +} + +static u64 access_rx_dma_hdr_fifo_rd_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[51]; +} + +static u64 access_rx_rbuf_desc_part2_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[50]; +} + +static u64 access_rx_rbuf_desc_part2_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[49]; +} + +static u64 access_rx_rbuf_desc_part1_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[48]; +} + +static u64 access_rx_rbuf_desc_part1_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[47]; +} + +static u64 access_rx_hq_intr_fsm_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[46]; +} + +static u64 access_rx_hq_intr_csr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[45]; +} + +static u64 access_rx_lookup_csr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[44]; +} + +static u64 access_rx_lookup_rcv_array_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[43]; +} + +static u64 access_rx_lookup_rcv_array_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[42]; +} + +static u64 access_rx_lookup_des_part2_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[41]; +} + +static u64 access_rx_lookup_des_part1_unc_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct 
hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[40]; +} + +static u64 access_rx_lookup_des_part1_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[39]; +} + +static u64 access_rx_rbuf_next_free_buf_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[38]; +} + +static u64 access_rx_rbuf_next_free_buf_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[37]; +} + +static u64 access_rbuf_fl_init_wr_addr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[36]; +} + +static u64 access_rx_rbuf_fl_initdone_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[35]; +} + +static u64 access_rx_rbuf_fl_write_addr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[34]; +} + +static u64 access_rx_rbuf_fl_rd_addr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[33]; +} + +static u64 access_rx_rbuf_empty_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[32]; +} + +static u64 access_rx_rbuf_full_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[31]; +} + +static u64 access_rbuf_bad_lookup_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[30]; +} + +static u64 access_rbuf_ctx_id_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[29]; +} + +static u64 access_rbuf_csr_qeopdw_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[28]; +} + +static u64 access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[27]; +} + +static u64 access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[26]; +} + +static u64 access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd 
= (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[25]; +} + +static u64 access_rx_rbuf_csr_q_vld_bit_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[24]; +} + +static u64 access_rx_rbuf_csr_q_next_buf_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[23]; +} + +static u64 access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[22]; +} + +static u64 access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[21]; +} + +static u64 access_rx_rbuf_block_list_read_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[20]; +} + +static u64 access_rx_rbuf_block_list_read_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[19]; +} + +static u64 access_rx_rbuf_lookup_des_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[18]; +} + +static u64 access_rx_rbuf_lookup_des_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[17]; +} + +static u64 access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[16]; +} + +static u64 access_rx_rbuf_lookup_des_reg_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[15]; +} + +static u64 access_rx_rbuf_free_list_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[14]; +} + +static u64 access_rx_rbuf_free_list_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[13]; +} + +static u64 access_rx_rcv_fsm_encoding_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[12]; +} + +static u64 access_rx_dma_flag_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[11]; +} + +static u64 access_rx_dma_flag_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ 
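+	/* software shadow of RcvErrStatus bit 10, RxDmaFlagUncErr */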
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[10]; +} + +static u64 access_rx_dc_sop_eop_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[9]; +} + +static u64 access_rx_rcv_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[8]; +} + +static u64 access_rx_rcv_qp_map_table_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[7]; +} + +static u64 access_rx_rcv_qp_map_table_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[6]; +} + +static u64 access_rx_rcv_data_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[5]; +} + +static u64 access_rx_rcv_data_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[4]; +} + +static u64 access_rx_rcv_hdr_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[3]; +} + +static u64 access_rx_rcv_hdr_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[2]; +} + +static u64 access_rx_dc_intf_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[1]; +} + +static u64 access_rx_dma_csr_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendPioErrStatus + */ +static u64 access_pio_pec_sop_head_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[35]; +} + +static u64 access_pio_pcc_sop_head_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[34]; +} + +static u64 access_pio_last_returned_cnt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[33]; +} + +static u64 access_pio_current_free_cnt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[32]; +} + +static u64 access_pio_reserved_31_err_cnt(const struct 
cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[31]; +} + +static u64 access_pio_reserved_30_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[30]; +} + +static u64 access_pio_ppmc_sop_len_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[29]; +} + +static u64 access_pio_ppmc_bqc_mem_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[28]; +} + +static u64 access_pio_vl_fifo_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[27]; +} + +static u64 access_pio_vlf_sop_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[26]; +} + +static u64 access_pio_vlf_v1_len_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[25]; +} + +static u64 access_pio_block_qw_count_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[24]; +} + +static u64 access_pio_write_qw_valid_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[23]; +} + +static u64 access_pio_state_machine_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[22]; +} + +static u64 access_pio_write_data_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[21]; +} + +static u64 access_pio_host_addr_mem_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[20]; +} + +static u64 access_pio_host_addr_mem_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[19]; +} + +static u64 access_pio_pkt_evict_sm_or_arb_sm_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[18]; +} + +static u64 access_pio_init_sm_in_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[17]; +} + +static u64 
access_pio_ppmc_pbl_fifo_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[16]; +} + +static u64 access_pio_credit_ret_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[15]; +} + +static u64 access_pio_v1_len_mem_bank1_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[14]; +} + +static u64 access_pio_v1_len_mem_bank0_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[13]; +} + +static u64 access_pio_v1_len_mem_bank1_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[12]; +} + +static u64 access_pio_v1_len_mem_bank0_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[11]; +} + +static u64 access_pio_sm_pkt_reset_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[10]; +} + +static u64 access_pio_pkt_evict_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[9]; +} + +static u64 access_pio_sbrdctrl_crrel_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[8]; +} + +static u64 access_pio_sbrdctl_crrel_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[7]; +} + +static u64 access_pio_pec_fifo_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[6]; +} + +static u64 access_pio_pcc_fifo_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[5]; +} + +static u64 access_pio_sb_mem_fifo1_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[4]; +} + +static u64 access_pio_sb_mem_fifo0_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[3]; +} + +static u64 access_pio_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + 
return dd->send_pio_err_status_cnt[2]; +} + +static u64 access_pio_write_addr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[1]; +} + +static u64 access_pio_write_bad_ctxt_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendDmaErrStatus + */ +static u64 access_sdma_pcie_req_tracking_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[3]; +} + +static u64 access_sdma_pcie_req_tracking_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[2]; +} + +static u64 access_sdma_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[1]; +} + +static u64 access_sdma_rpy_tag_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendEgressErrStatus + */ +static u64 access_tx_read_pio_memory_csr_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[63]; +} + +static u64 access_tx_read_sdma_memory_csr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[62]; +} + +static u64 access_tx_egress_fifo_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[61]; +} + +static u64 access_tx_read_pio_memory_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[60]; +} + +static u64 access_tx_read_sdma_memory_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[59]; +} + +static u64 access_tx_sb_hdr_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[58]; +} + +static u64 access_tx_credit_overrun_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[57]; +} + +static u64 access_tx_launch_fifo8_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct 
hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[56]; +} + +static u64 access_tx_launch_fifo7_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[55]; +} + +static u64 access_tx_launch_fifo6_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[54]; +} + +static u64 access_tx_launch_fifo5_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[53]; +} + +static u64 access_tx_launch_fifo4_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[52]; +} + +static u64 access_tx_launch_fifo3_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[51]; +} + +static u64 access_tx_launch_fifo2_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[50]; +} + +static u64 access_tx_launch_fifo1_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[49]; +} + +static u64 access_tx_launch_fifo0_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[48]; +} + +static u64 access_tx_credit_return_vl_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[47]; +} + +static u64 access_tx_hcrc_insertion_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[46]; +} + +static u64 access_tx_egress_fifo_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[45]; +} + +static u64 access_tx_read_pio_memory_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[44]; +} + +static u64 access_tx_read_sdma_memory_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[43]; +} + +static u64 access_tx_sb_hdr_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[42]; +} + +static u64 access_tx_credit_return_partiy_err_cnt( + const struct cntr_entry *entry, + void *context, 
int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[41]; +} + +static u64 access_tx_launch_fifo8_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[40]; +} + +static u64 access_tx_launch_fifo7_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[39]; +} + +static u64 access_tx_launch_fifo6_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[38]; +} + +static u64 access_tx_launch_fifo5_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[37]; +} + +static u64 access_tx_launch_fifo4_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[36]; +} + +static u64 access_tx_launch_fifo3_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[35]; +} + +static u64 access_tx_launch_fifo2_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[34]; +} + +static u64 access_tx_launch_fifo1_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[33]; +} + +static u64 access_tx_launch_fifo0_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[32]; +} + +static u64 access_tx_sdma15_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[31]; +} + +static u64 access_tx_sdma14_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[30]; +} + +static u64 access_tx_sdma13_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[29]; +} + +static u64 access_tx_sdma12_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[28]; +} + +static u64 access_tx_sdma11_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + 
struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[27]; +} + +static u64 access_tx_sdma10_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[26]; +} + +static u64 access_tx_sdma9_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[25]; +} + +static u64 access_tx_sdma8_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[24]; +} + +static u64 access_tx_sdma7_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[23]; +} + +static u64 access_tx_sdma6_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[22]; +} + +static u64 access_tx_sdma5_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[21]; +} + +static u64 access_tx_sdma4_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[20]; +} + +static u64 access_tx_sdma3_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[19]; +} + +static u64 access_tx_sdma2_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[18]; +} + +static u64 access_tx_sdma1_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[17]; +} + +static u64 access_tx_sdma0_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[16]; +} + +static u64 access_tx_config_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[15]; +} + +static u64 access_tx_sbrd_ctl_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[14]; +} + +static u64 access_tx_launch_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return 
dd->send_egress_err_status_cnt[13]; +} + +static u64 access_tx_illegal_vl_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[12]; +} + +static u64 access_tx_sbrd_ctl_state_machine_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[11]; +} + +static u64 access_egress_reserved_10_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[10]; +} + +static u64 access_egress_reserved_9_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[9]; +} + +static u64 access_tx_sdma_launch_intf_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[8]; +} + +static u64 access_tx_pio_launch_intf_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[7]; +} + +static u64 access_egress_reserved_6_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[6]; +} + +static u64 access_tx_incorrect_link_state_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[5]; +} + +static u64 access_tx_linkdown_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[4]; +} + +static u64 access_tx_egress_fifi_underrun_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[3]; +} + +static u64 access_egress_reserved_2_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[2]; +} + +static u64 access_tx_pkt_integrity_mem_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[1]; +} + +static u64 access_tx_pkt_integrity_mem_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendErrStatus + */ +static u64 access_send_csr_write_bad_addr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_err_status_cnt[2]; +} + +static 
u64 access_send_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_err_status_cnt[1]; +} + +static u64 access_send_csr_parity_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendCtxtErrStatus + */ +static u64 access_pio_write_out_of_bounds_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[4]; +} + +static u64 access_pio_write_overflow_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[3]; +} + +static u64 access_pio_write_crosses_boundary_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[2]; +} + +static u64 access_pio_disallowed_packet_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[1]; +} + +static u64 access_pio_inconsistent_sop_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendDmaEngErrStatus + */ +static u64 access_sdma_header_request_fifo_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[23]; +} + +static u64 access_sdma_header_storage_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[22]; +} + +static u64 access_sdma_packet_tracking_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[21]; +} + +static u64 access_sdma_assembly_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[20]; +} + +static u64 access_sdma_desc_table_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[19]; +} + +static u64 access_sdma_header_request_fifo_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[18]; +} + +static u64 access_sdma_header_storage_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata 
*)context; + + return dd->sw_send_dma_eng_err_status_cnt[17]; +} + +static u64 access_sdma_packet_tracking_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[16]; +} + +static u64 access_sdma_assembly_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[15]; +} + +static u64 access_sdma_desc_table_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[14]; +} + +static u64 access_sdma_timeout_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[13]; +} + +static u64 access_sdma_header_length_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[12]; +} + +static u64 access_sdma_header_address_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[11]; +} + +static u64 access_sdma_header_select_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[10]; +} + +static u64 access_sdma_reserved_9_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[9]; +} + +static u64 access_sdma_packet_desc_overflow_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[8]; +} + +static u64 access_sdma_length_mismatch_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[7]; +} + +static u64 access_sdma_halt_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[6]; +} + +static u64 access_sdma_mem_read_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[5]; +} + +static u64 access_sdma_first_desc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[4]; +} + +static u64 access_sdma_tail_out_of_bounds_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[3]; +} + +static u64 access_sdma_too_long_err_cnt(const struct cntr_entry *entry, + void 
*context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[2]; +} + +static u64 access_sdma_gen_mismatch_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[1]; +} + +static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[0]; +} + +static u64 access_dc_rcv_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + u64 val = 0; + u64 csr = entry->csr; + + val = read_write_csr(dd, csr, mode, data); + if (mode == CNTR_MODE_R) { + val = val > CNTR_MAX - dd->sw_rcv_bypass_packet_errors ? + CNTR_MAX : val + dd->sw_rcv_bypass_packet_errors; + } else if (mode == CNTR_MODE_W) { + dd->sw_rcv_bypass_packet_errors = 0; + } else { + dd_dev_err(dd, "Invalid cntr register access mode"); + return 0; + } + return val; +} + +#define def_access_sw_cpu(cntr) \ +static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry, \ + void *context, int vl, int mode, u64 data) \ +{ \ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \ + return read_write_cpu(ppd->dd, &ppd->ibport_data.rvp.z_ ##cntr, \ + ppd->ibport_data.rvp.cntr, vl, \ + mode, data); \ +} + +def_access_sw_cpu(rc_acks); +def_access_sw_cpu(rc_qacks); +def_access_sw_cpu(rc_delayed_comp); + +#define def_access_ibp_counter(cntr) \ +static u64 access_ibp_##cntr(const struct cntr_entry *entry, \ + void *context, int vl, int mode, u64 data) \ +{ \ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \ + \ + if (vl != CNTR_INVALID_VL) \ + return 0; \ + \ + return read_write_sw(ppd->dd, &ppd->ibport_data.rvp.n_ ##cntr, \ + mode, data); \ +} + +def_access_ibp_counter(loop_pkts); +def_access_ibp_counter(rc_resends); +def_access_ibp_counter(rnr_naks); +def_access_ibp_counter(other_naks); +def_access_ibp_counter(rc_timeouts); +def_access_ibp_counter(pkt_drops); +def_access_ibp_counter(dmawait); +def_access_ibp_counter(rc_seqnak); +def_access_ibp_counter(rc_dupreq); +def_access_ibp_counter(rdma_seq); +def_access_ibp_counter(unaligned); +def_access_ibp_counter(seq_naks); + +static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = { +[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH), +[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT, + CNTR_NORMAL), +[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT, + CNTR_NORMAL), +[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs, + RCV_TID_FLOW_GEN_MISMATCH_CNT, + CNTR_NORMAL), +[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL, + CNTR_NORMAL), +[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs, + RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL), +[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt, + CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL), +[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT, + CNTR_NORMAL), +[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT, + CNTR_NORMAL), +[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT, + CNTR_NORMAL), +[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT, + CNTR_NORMAL), +[C_CCE_MISC_INT] 
= CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT, + CNTR_NORMAL), +[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT, + CNTR_NORMAL), +[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt, + CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL), +[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt, + CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL), +[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_ERR] = CNTR_ELEM("DcRecvErr", DCC_ERR_PORTRCV_ERR_CNT, 0, CNTR_SYNTH, + access_dc_rcv_err_cnt), +[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT, + CNTR_SYNTH), +[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT, + CNTR_SYNTH), +[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT, + CNTR_SYNTH), +[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts, + DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH), +[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts, + DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT, + CNTR_SYNTH), +[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr, + DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH), +[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT, + CNTR_SYNTH), +[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT, + CNTR_SYNTH), +[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT, + CNTR_SYNTH), +[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT, + CNTR_SYNTH), +[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT, + CNTR_SYNTH), +[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT, + CNTR_SYNTH), +[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH), +[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH), +[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT, + CNTR_SYNTH), +[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_TOTAL_CRC] = + DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR, + CNTR_SYNTH), +[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0, + CNTR_SYNTH), +[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1, + CNTR_SYNTH), +[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2, + CNTR_SYNTH), +[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3, + CNTR_SYNTH), +[C_DC_CRC_MULT_LN] = + DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN, + CNTR_SYNTH), +[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT, + CNTR_SYNTH), +[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT, + CNTR_SYNTH), +[C_DC_SEQ_CRC_CNT] = + DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT, + CNTR_SYNTH), +[C_DC_ESC0_ONLY_CNT] = + 
DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT, + CNTR_SYNTH), +[C_DC_ESC0_PLUS1_CNT] = + DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT, + CNTR_SYNTH), +[C_DC_ESC0_PLUS2_CNT] = + DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT, + CNTR_SYNTH), +[C_DC_REINIT_FROM_PEER_CNT] = + DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT, + CNTR_SYNTH), +[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT, + CNTR_SYNTH), +[C_DC_MISC_FLG_CNT] = + DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT, + CNTR_SYNTH), +[C_DC_PRF_GOOD_LTP_CNT] = + DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH), +[C_DC_PRF_ACCEPTED_LTP_CNT] = + DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT, + CNTR_SYNTH), +[C_DC_PRF_RX_FLIT_CNT] = + DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH), +[C_DC_PRF_TX_FLIT_CNT] = + DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH), +[C_DC_PRF_CLK_CNTR] = + DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH), +[C_DC_PG_DBG_FLIT_CRDTS_CNT] = + DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH), +[C_DC_PG_STS_PAUSE_COMPLETE_CNT] = + DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT, + CNTR_SYNTH), +[C_DC_PG_STS_TX_SBE_CNT] = + DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH), +[C_DC_PG_STS_TX_MBE_CNT] = + DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT, + CNTR_SYNTH), +[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL, + access_sw_cpu_intr), +[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL, + access_sw_cpu_rcv_limit), +[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL, + access_sw_vtx_wait), +[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL, + access_sw_pio_wait), +[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL, + access_sw_pio_drain), +[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL, + access_sw_kmem_wait), +[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL, + access_sw_send_schedule), +[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn", + SEND_DMA_DESC_FETCHED_CNT, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + dev_access_u32_csr), +[C_SDMA_INT_CNT] = CNTR_ELEM("SDMAInt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_int_cnt), +[C_SDMA_ERR_CNT] = CNTR_ELEM("SDMAErrCt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_err_cnt), +[C_SDMA_IDLE_INT_CNT] = CNTR_ELEM("SDMAIdInt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_idle_int_cnt), +[C_SDMA_PROGRESS_INT_CNT] = CNTR_ELEM("SDMAPrIntCn", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_progress_int_cnt), +/* MISC_ERR_STATUS */ +[C_MISC_PLL_LOCK_FAIL_ERR] = CNTR_ELEM("MISC_PLL_LOCK_FAIL_ERR", 0, 0, + CNTR_NORMAL, + access_misc_pll_lock_fail_err_cnt), +[C_MISC_MBIST_FAIL_ERR] = CNTR_ELEM("MISC_MBIST_FAIL_ERR", 0, 0, + CNTR_NORMAL, + access_misc_mbist_fail_err_cnt), +[C_MISC_INVALID_EEP_CMD_ERR] = CNTR_ELEM("MISC_INVALID_EEP_CMD_ERR", 0, 0, + CNTR_NORMAL, + access_misc_invalid_eep_cmd_err_cnt), +[C_MISC_EFUSE_DONE_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_DONE_PARITY_ERR", 0, 0, + CNTR_NORMAL, + access_misc_efuse_done_parity_err_cnt), +[C_MISC_EFUSE_WRITE_ERR] = CNTR_ELEM("MISC_EFUSE_WRITE_ERR", 0, 0, + CNTR_NORMAL, + access_misc_efuse_write_err_cnt), +[C_MISC_EFUSE_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_EFUSE_READ_BAD_ADDR_ERR", 0, + 0, CNTR_NORMAL, + access_misc_efuse_read_bad_addr_err_cnt), +[C_MISC_EFUSE_CSR_PARITY_ERR] = 
CNTR_ELEM("MISC_EFUSE_CSR_PARITY_ERR", 0, 0, + CNTR_NORMAL, + access_misc_efuse_csr_parity_err_cnt), +[C_MISC_FW_AUTH_FAILED_ERR] = CNTR_ELEM("MISC_FW_AUTH_FAILED_ERR", 0, 0, + CNTR_NORMAL, + access_misc_fw_auth_failed_err_cnt), +[C_MISC_KEY_MISMATCH_ERR] = CNTR_ELEM("MISC_KEY_MISMATCH_ERR", 0, 0, + CNTR_NORMAL, + access_misc_key_mismatch_err_cnt), +[C_MISC_SBUS_WRITE_FAILED_ERR] = CNTR_ELEM("MISC_SBUS_WRITE_FAILED_ERR", 0, 0, + CNTR_NORMAL, + access_misc_sbus_write_failed_err_cnt), +[C_MISC_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_WRITE_BAD_ADDR_ERR", 0, 0, + CNTR_NORMAL, + access_misc_csr_write_bad_addr_err_cnt), +[C_MISC_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_READ_BAD_ADDR_ERR", 0, 0, + CNTR_NORMAL, + access_misc_csr_read_bad_addr_err_cnt), +[C_MISC_CSR_PARITY_ERR] = CNTR_ELEM("MISC_CSR_PARITY_ERR", 0, 0, + CNTR_NORMAL, + access_misc_csr_parity_err_cnt), +/* CceErrStatus */ +[C_CCE_ERR_STATUS_AGGREGATED_CNT] = CNTR_ELEM("CceErrStatusAggregatedCnt", 0, 0, + CNTR_NORMAL, + access_sw_cce_err_status_aggregated_cnt), +[C_CCE_MSIX_CSR_PARITY_ERR] = CNTR_ELEM("CceMsixCsrParityErr", 0, 0, + CNTR_NORMAL, + access_cce_msix_csr_parity_err_cnt), +[C_CCE_INT_MAP_UNC_ERR] = CNTR_ELEM("CceIntMapUncErr", 0, 0, + CNTR_NORMAL, + access_cce_int_map_unc_err_cnt), +[C_CCE_INT_MAP_COR_ERR] = CNTR_ELEM("CceIntMapCorErr", 0, 0, + CNTR_NORMAL, + access_cce_int_map_cor_err_cnt), +[C_CCE_MSIX_TABLE_UNC_ERR] = CNTR_ELEM("CceMsixTableUncErr", 0, 0, + CNTR_NORMAL, + access_cce_msix_table_unc_err_cnt), +[C_CCE_MSIX_TABLE_COR_ERR] = CNTR_ELEM("CceMsixTableCorErr", 0, 0, + CNTR_NORMAL, + access_cce_msix_table_cor_err_cnt), +[C_CCE_RXDMA_CONV_FIFO_PARITY_ERR] = CNTR_ELEM("CceRxdmaConvFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_rxdma_conv_fifo_parity_err_cnt), +[C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceRcplAsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_rcpl_async_fifo_parity_err_cnt), +[C_CCE_SEG_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceSegWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_seg_write_bad_addr_err_cnt), +[C_CCE_SEG_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceSegReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_seg_read_bad_addr_err_cnt), +[C_LA_TRIGGERED] = CNTR_ELEM("Cce LATriggered", 0, 0, + CNTR_NORMAL, + access_la_triggered_cnt), +[C_CCE_TRGT_CPL_TIMEOUT_ERR] = CNTR_ELEM("CceTrgtCplTimeoutErr", 0, 0, + CNTR_NORMAL, + access_cce_trgt_cpl_timeout_err_cnt), +[C_PCIC_RECEIVE_PARITY_ERR] = CNTR_ELEM("PcicReceiveParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_receive_parity_err_cnt), +[C_PCIC_TRANSMIT_BACK_PARITY_ERR] = CNTR_ELEM("PcicTransmitBackParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_transmit_back_parity_err_cnt), +[C_PCIC_TRANSMIT_FRONT_PARITY_ERR] = CNTR_ELEM("PcicTransmitFrontParityErr", 0, + 0, CNTR_NORMAL, + access_pcic_transmit_front_parity_err_cnt), +[C_PCIC_CPL_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicCplDatQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_dat_q_unc_err_cnt), +[C_PCIC_CPL_HD_Q_UNC_ERR] = CNTR_ELEM("PcicCplHdQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_hd_q_unc_err_cnt), +[C_PCIC_POST_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicPostDatQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_dat_q_unc_err_cnt), +[C_PCIC_POST_HD_Q_UNC_ERR] = CNTR_ELEM("PcicPostHdQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_hd_q_unc_err_cnt), +[C_PCIC_RETRY_SOT_MEM_UNC_ERR] = CNTR_ELEM("PcicRetrySotMemUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_sot_mem_unc_err_cnt), +[C_PCIC_RETRY_MEM_UNC_ERR] = CNTR_ELEM("PcicRetryMemUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_mem_unc_err), 
+[C_PCIC_N_POST_DAT_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostDatQParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_n_post_dat_q_parity_err_cnt), +[C_PCIC_N_POST_H_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostHQParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_n_post_h_q_parity_err_cnt), +[C_PCIC_CPL_DAT_Q_COR_ERR] = CNTR_ELEM("PcicCplDatQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_dat_q_cor_err_cnt), +[C_PCIC_CPL_HD_Q_COR_ERR] = CNTR_ELEM("PcicCplHdQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_hd_q_cor_err_cnt), +[C_PCIC_POST_DAT_Q_COR_ERR] = CNTR_ELEM("PcicPostDatQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_dat_q_cor_err_cnt), +[C_PCIC_POST_HD_Q_COR_ERR] = CNTR_ELEM("PcicPostHdQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_hd_q_cor_err_cnt), +[C_PCIC_RETRY_SOT_MEM_COR_ERR] = CNTR_ELEM("PcicRetrySotMemCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_sot_mem_cor_err_cnt), +[C_PCIC_RETRY_MEM_COR_ERR] = CNTR_ELEM("PcicRetryMemCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_mem_cor_err_cnt), +[C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoDbgParityError", 0, 0, + CNTR_NORMAL, + access_cce_cli1_async_fifo_dbg_parity_err_cnt), +[C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoRxdmaParityError", 0, 0, + CNTR_NORMAL, + access_cce_cli1_async_fifo_rxdma_parity_err_cnt + ), +[C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoSdmaHdParityErr", 0, 0, + CNTR_NORMAL, + access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt), +[C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoPioCrdtParityErr", 0, 0, + CNTR_NORMAL, + access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt), +[C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceCli2AsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_cli2_async_fifo_parity_err_cnt), +[C_CCE_CSR_CFG_BUS_PARITY_ERR] = CNTR_ELEM("CceCsrCfgBusParityErr", 0, 0, + CNTR_NORMAL, + access_cce_csr_cfg_bus_parity_err_cnt), +[C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR] = CNTR_ELEM("CceCli0AsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_cli0_async_fifo_parity_err_cnt), +[C_CCE_RSPD_DATA_PARITY_ERR] = CNTR_ELEM("CceRspdDataParityErr", 0, 0, + CNTR_NORMAL, + access_cce_rspd_data_parity_err_cnt), +[C_CCE_TRGT_ACCESS_ERR] = CNTR_ELEM("CceTrgtAccessErr", 0, 0, + CNTR_NORMAL, + access_cce_trgt_access_err_cnt), +[C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceTrgtAsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_trgt_async_fifo_parity_err_cnt), +[C_CCE_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_csr_write_bad_addr_err_cnt), +[C_CCE_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_csr_read_bad_addr_err_cnt), +[C_CCE_CSR_PARITY_ERR] = CNTR_ELEM("CceCsrParityErr", 0, 0, + CNTR_NORMAL, + access_ccs_csr_parity_err_cnt), + +/* RcvErrStatus */ +[C_RX_CSR_PARITY_ERR] = CNTR_ELEM("RxCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_csr_parity_err_cnt), +[C_RX_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_rx_csr_write_bad_addr_err_cnt), +[C_RX_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_rx_csr_read_bad_addr_err_cnt), +[C_RX_DMA_CSR_UNC_ERR] = CNTR_ELEM("RxDmaCsrUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_csr_unc_err_cnt), +[C_RX_DMA_DQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaDqFsmEncodingErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_dq_fsm_encoding_err_cnt), +[C_RX_DMA_EQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaEqFsmEncodingErr", 0, 0, + 
CNTR_NORMAL, + access_rx_dma_eq_fsm_encoding_err_cnt), +[C_RX_DMA_CSR_PARITY_ERR] = CNTR_ELEM("RxDmaCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_csr_parity_err_cnt), +[C_RX_RBUF_DATA_COR_ERR] = CNTR_ELEM("RxRbufDataCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_data_cor_err_cnt), +[C_RX_RBUF_DATA_UNC_ERR] = CNTR_ELEM("RxRbufDataUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_data_unc_err_cnt), +[C_RX_DMA_DATA_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaDataFifoRdCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_data_fifo_rd_cor_err_cnt), +[C_RX_DMA_DATA_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaDataFifoRdUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_data_fifo_rd_unc_err_cnt), +[C_RX_DMA_HDR_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaHdrFifoRdCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_hdr_fifo_rd_cor_err_cnt), +[C_RX_DMA_HDR_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaHdrFifoRdUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_hdr_fifo_rd_unc_err_cnt), +[C_RX_RBUF_DESC_PART2_COR_ERR] = CNTR_ELEM("RxRbufDescPart2CorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part2_cor_err_cnt), +[C_RX_RBUF_DESC_PART2_UNC_ERR] = CNTR_ELEM("RxRbufDescPart2UncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part2_unc_err_cnt), +[C_RX_RBUF_DESC_PART1_COR_ERR] = CNTR_ELEM("RxRbufDescPart1CorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part1_cor_err_cnt), +[C_RX_RBUF_DESC_PART1_UNC_ERR] = CNTR_ELEM("RxRbufDescPart1UncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part1_unc_err_cnt), +[C_RX_HQ_INTR_FSM_ERR] = CNTR_ELEM("RxHqIntrFsmErr", 0, 0, + CNTR_NORMAL, + access_rx_hq_intr_fsm_err_cnt), +[C_RX_HQ_INTR_CSR_PARITY_ERR] = CNTR_ELEM("RxHqIntrCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_hq_intr_csr_parity_err_cnt), +[C_RX_LOOKUP_CSR_PARITY_ERR] = CNTR_ELEM("RxLookupCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_csr_parity_err_cnt), +[C_RX_LOOKUP_RCV_ARRAY_COR_ERR] = CNTR_ELEM("RxLookupRcvArrayCorErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_rcv_array_cor_err_cnt), +[C_RX_LOOKUP_RCV_ARRAY_UNC_ERR] = CNTR_ELEM("RxLookupRcvArrayUncErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_rcv_array_unc_err_cnt), +[C_RX_LOOKUP_DES_PART2_PARITY_ERR] = CNTR_ELEM("RxLookupDesPart2ParityErr", 0, + 0, CNTR_NORMAL, + access_rx_lookup_des_part2_parity_err_cnt), +[C_RX_LOOKUP_DES_PART1_UNC_COR_ERR] = CNTR_ELEM("RxLookupDesPart1UncCorErr", 0, + 0, CNTR_NORMAL, + access_rx_lookup_des_part1_unc_cor_err_cnt), +[C_RX_LOOKUP_DES_PART1_UNC_ERR] = CNTR_ELEM("RxLookupDesPart1UncErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_des_part1_unc_err_cnt), +[C_RX_RBUF_NEXT_FREE_BUF_COR_ERR] = CNTR_ELEM("RxRbufNextFreeBufCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_next_free_buf_cor_err_cnt), +[C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR] = CNTR_ELEM("RxRbufNextFreeBufUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_next_free_buf_unc_err_cnt), +[C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR] = CNTR_ELEM( + "RxRbufFlInitWrAddrParityErr", 0, 0, + CNTR_NORMAL, + access_rbuf_fl_init_wr_addr_parity_err_cnt), +[C_RX_RBUF_FL_INITDONE_PARITY_ERR] = CNTR_ELEM("RxRbufFlInitdoneParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_fl_initdone_parity_err_cnt), +[C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlWrAddrParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_fl_write_addr_parity_err_cnt), +[C_RX_RBUF_FL_RD_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlRdAddrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_fl_rd_addr_parity_err_cnt), +[C_RX_RBUF_EMPTY_ERR] = CNTR_ELEM("RxRbufEmptyErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_empty_err_cnt), +[C_RX_RBUF_FULL_ERR] = CNTR_ELEM("RxRbufFullErr", 0, 0, + 
CNTR_NORMAL, + access_rx_rbuf_full_err_cnt), +[C_RX_RBUF_BAD_LOOKUP_ERR] = CNTR_ELEM("RxRBufBadLookupErr", 0, 0, + CNTR_NORMAL, + access_rbuf_bad_lookup_err_cnt), +[C_RX_RBUF_CTX_ID_PARITY_ERR] = CNTR_ELEM("RxRbufCtxIdParityErr", 0, 0, + CNTR_NORMAL, + access_rbuf_ctx_id_parity_err_cnt), +[C_RX_RBUF_CSR_QEOPDW_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEOPDWParityErr", 0, 0, + CNTR_NORMAL, + access_rbuf_csr_qeopdw_parity_err_cnt), +[C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR] = CNTR_ELEM( + "RxRbufCsrQNumOfPktParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt), +[C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR] = CNTR_ELEM( + "RxRbufCsrQTlPtrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt), +[C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQHdPtrParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt), +[C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQVldBitParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_vld_bit_parity_err_cnt), +[C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQNextBufParityErr", + 0, 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_next_buf_parity_err_cnt), +[C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEntCntParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt), +[C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR] = CNTR_ELEM( + "RxRbufCsrQHeadBufNumParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt), +[C_RX_RBUF_BLOCK_LIST_READ_COR_ERR] = CNTR_ELEM("RxRbufBlockListReadCorErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_block_list_read_cor_err_cnt), +[C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR] = CNTR_ELEM("RxRbufBlockListReadUncErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_block_list_read_unc_err_cnt), +[C_RX_RBUF_LOOKUP_DES_COR_ERR] = CNTR_ELEM("RxRbufLookupDesCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_cor_err_cnt), +[C_RX_RBUF_LOOKUP_DES_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_unc_err_cnt), +[C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR] = CNTR_ELEM( + "RxRbufLookupDesRegUncCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt), +[C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesRegUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_reg_unc_err_cnt), +[C_RX_RBUF_FREE_LIST_COR_ERR] = CNTR_ELEM("RxRbufFreeListCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_free_list_cor_err_cnt), +[C_RX_RBUF_FREE_LIST_UNC_ERR] = CNTR_ELEM("RxRbufFreeListUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_free_list_unc_err_cnt), +[C_RX_RCV_FSM_ENCODING_ERR] = CNTR_ELEM("RxRcvFsmEncodingErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_fsm_encoding_err_cnt), +[C_RX_DMA_FLAG_COR_ERR] = CNTR_ELEM("RxDmaFlagCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_flag_cor_err_cnt), +[C_RX_DMA_FLAG_UNC_ERR] = CNTR_ELEM("RxDmaFlagUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_flag_unc_err_cnt), +[C_RX_DC_SOP_EOP_PARITY_ERR] = CNTR_ELEM("RxDcSopEopParityErr", 0, 0, + CNTR_NORMAL, + access_rx_dc_sop_eop_parity_err_cnt), +[C_RX_RCV_CSR_PARITY_ERR] = CNTR_ELEM("RxRcvCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_csr_parity_err_cnt), +[C_RX_RCV_QP_MAP_TABLE_COR_ERR] = CNTR_ELEM("RxRcvQpMapTableCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_qp_map_table_cor_err_cnt), +[C_RX_RCV_QP_MAP_TABLE_UNC_ERR] = CNTR_ELEM("RxRcvQpMapTableUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_qp_map_table_unc_err_cnt), +[C_RX_RCV_DATA_COR_ERR] = CNTR_ELEM("RxRcvDataCorErr", 0, 0, + CNTR_NORMAL, + 
access_rx_rcv_data_cor_err_cnt), +[C_RX_RCV_DATA_UNC_ERR] = CNTR_ELEM("RxRcvDataUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_data_unc_err_cnt), +[C_RX_RCV_HDR_COR_ERR] = CNTR_ELEM("RxRcvHdrCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_hdr_cor_err_cnt), +[C_RX_RCV_HDR_UNC_ERR] = CNTR_ELEM("RxRcvHdrUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_hdr_unc_err_cnt), +[C_RX_DC_INTF_PARITY_ERR] = CNTR_ELEM("RxDcIntfParityErr", 0, 0, + CNTR_NORMAL, + access_rx_dc_intf_parity_err_cnt), +[C_RX_DMA_CSR_COR_ERR] = CNTR_ELEM("RxDmaCsrCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_csr_cor_err_cnt), +/* SendPioErrStatus */ +[C_PIO_PEC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPecSopHeadParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pec_sop_head_parity_err_cnt), +[C_PIO_PCC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPccSopHeadParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pcc_sop_head_parity_err_cnt), +[C_PIO_LAST_RETURNED_CNT_PARITY_ERR] = CNTR_ELEM("PioLastReturnedCntParityErr", + 0, 0, CNTR_NORMAL, + access_pio_last_returned_cnt_parity_err_cnt), +[C_PIO_CURRENT_FREE_CNT_PARITY_ERR] = CNTR_ELEM("PioCurrentFreeCntParityErr", 0, + 0, CNTR_NORMAL, + access_pio_current_free_cnt_parity_err_cnt), +[C_PIO_RSVD_31_ERR] = CNTR_ELEM("Pio Reserved 31", 0, 0, + CNTR_NORMAL, + access_pio_reserved_31_err_cnt), +[C_PIO_RSVD_30_ERR] = CNTR_ELEM("Pio Reserved 30", 0, 0, + CNTR_NORMAL, + access_pio_reserved_30_err_cnt), +[C_PIO_PPMC_SOP_LEN_ERR] = CNTR_ELEM("PioPpmcSopLenErr", 0, 0, + CNTR_NORMAL, + access_pio_ppmc_sop_len_err_cnt), +[C_PIO_PPMC_BQC_MEM_PARITY_ERR] = CNTR_ELEM("PioPpmcBqcMemParityErr", 0, 0, + CNTR_NORMAL, + access_pio_ppmc_bqc_mem_parity_err_cnt), +[C_PIO_VL_FIFO_PARITY_ERR] = CNTR_ELEM("PioVlFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_vl_fifo_parity_err_cnt), +[C_PIO_VLF_SOP_PARITY_ERR] = CNTR_ELEM("PioVlfSopParityErr", 0, 0, + CNTR_NORMAL, + access_pio_vlf_sop_parity_err_cnt), +[C_PIO_VLF_V1_LEN_PARITY_ERR] = CNTR_ELEM("PioVlfVlLenParityErr", 0, 0, + CNTR_NORMAL, + access_pio_vlf_v1_len_parity_err_cnt), +[C_PIO_BLOCK_QW_COUNT_PARITY_ERR] = CNTR_ELEM("PioBlockQwCountParityErr", 0, 0, + CNTR_NORMAL, + access_pio_block_qw_count_parity_err_cnt), +[C_PIO_WRITE_QW_VALID_PARITY_ERR] = CNTR_ELEM("PioWriteQwValidParityErr", 0, 0, + CNTR_NORMAL, + access_pio_write_qw_valid_parity_err_cnt), +[C_PIO_STATE_MACHINE_ERR] = CNTR_ELEM("PioStateMachineErr", 0, 0, + CNTR_NORMAL, + access_pio_state_machine_err_cnt), +[C_PIO_WRITE_DATA_PARITY_ERR] = CNTR_ELEM("PioWriteDataParityErr", 0, 0, + CNTR_NORMAL, + access_pio_write_data_parity_err_cnt), +[C_PIO_HOST_ADDR_MEM_COR_ERR] = CNTR_ELEM("PioHostAddrMemCorErr", 0, 0, + CNTR_NORMAL, + access_pio_host_addr_mem_cor_err_cnt), +[C_PIO_HOST_ADDR_MEM_UNC_ERR] = CNTR_ELEM("PioHostAddrMemUncErr", 0, 0, + CNTR_NORMAL, + access_pio_host_addr_mem_unc_err_cnt), +[C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR] = CNTR_ELEM("PioPktEvictSmOrArbSmErr", 0, 0, + CNTR_NORMAL, + access_pio_pkt_evict_sm_or_arb_sm_err_cnt), +[C_PIO_INIT_SM_IN_ERR] = CNTR_ELEM("PioInitSmInErr", 0, 0, + CNTR_NORMAL, + access_pio_init_sm_in_err_cnt), +[C_PIO_PPMC_PBL_FIFO_ERR] = CNTR_ELEM("PioPpmcPblFifoErr", 0, 0, + CNTR_NORMAL, + access_pio_ppmc_pbl_fifo_err_cnt), +[C_PIO_CREDIT_RET_FIFO_PARITY_ERR] = CNTR_ELEM("PioCreditRetFifoParityErr", 0, + 0, CNTR_NORMAL, + access_pio_credit_ret_fifo_parity_err_cnt), +[C_PIO_V1_LEN_MEM_BANK1_COR_ERR] = CNTR_ELEM("PioVlLenMemBank1CorErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank1_cor_err_cnt), +[C_PIO_V1_LEN_MEM_BANK0_COR_ERR] = CNTR_ELEM("PioVlLenMemBank0CorErr", 0, 0, + CNTR_NORMAL, + 
access_pio_v1_len_mem_bank0_cor_err_cnt), +[C_PIO_V1_LEN_MEM_BANK1_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank1UncErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank1_unc_err_cnt), +[C_PIO_V1_LEN_MEM_BANK0_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank0UncErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank0_unc_err_cnt), +[C_PIO_SM_PKT_RESET_PARITY_ERR] = CNTR_ELEM("PioSmPktResetParityErr", 0, 0, + CNTR_NORMAL, + access_pio_sm_pkt_reset_parity_err_cnt), +[C_PIO_PKT_EVICT_FIFO_PARITY_ERR] = CNTR_ELEM("PioPktEvictFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pkt_evict_fifo_parity_err_cnt), +[C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR] = CNTR_ELEM( + "PioSbrdctrlCrrelFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_sbrdctrl_crrel_fifo_parity_err_cnt), +[C_PIO_SBRDCTL_CRREL_PARITY_ERR] = CNTR_ELEM("PioSbrdctlCrrelParityErr", 0, 0, + CNTR_NORMAL, + access_pio_sbrdctl_crrel_parity_err_cnt), +[C_PIO_PEC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPecFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pec_fifo_parity_err_cnt), +[C_PIO_PCC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPccFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pcc_fifo_parity_err_cnt), +[C_PIO_SB_MEM_FIFO1_ERR] = CNTR_ELEM("PioSbMemFifo1Err", 0, 0, + CNTR_NORMAL, + access_pio_sb_mem_fifo1_err_cnt), +[C_PIO_SB_MEM_FIFO0_ERR] = CNTR_ELEM("PioSbMemFifo0Err", 0, 0, + CNTR_NORMAL, + access_pio_sb_mem_fifo0_err_cnt), +[C_PIO_CSR_PARITY_ERR] = CNTR_ELEM("PioCsrParityErr", 0, 0, + CNTR_NORMAL, + access_pio_csr_parity_err_cnt), +[C_PIO_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("PioWriteAddrParityErr", 0, 0, + CNTR_NORMAL, + access_pio_write_addr_parity_err_cnt), +[C_PIO_WRITE_BAD_CTXT_ERR] = CNTR_ELEM("PioWriteBadCtxtErr", 0, 0, + CNTR_NORMAL, + access_pio_write_bad_ctxt_err_cnt), +/* SendDmaErrStatus */ +[C_SDMA_PCIE_REQ_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPcieReqTrackingCorErr", 0, + 0, CNTR_NORMAL, + access_sdma_pcie_req_tracking_cor_err_cnt), +[C_SDMA_PCIE_REQ_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPcieReqTrackingUncErr", 0, + 0, CNTR_NORMAL, + access_sdma_pcie_req_tracking_unc_err_cnt), +[C_SDMA_CSR_PARITY_ERR] = CNTR_ELEM("SDmaCsrParityErr", 0, 0, + CNTR_NORMAL, + access_sdma_csr_parity_err_cnt), +[C_SDMA_RPY_TAG_ERR] = CNTR_ELEM("SDmaRpyTagErr", 0, 0, + CNTR_NORMAL, + access_sdma_rpy_tag_err_cnt), +/* SendEgressErrStatus */ +[C_TX_READ_PIO_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryCsrUncErr", 0, 0, + CNTR_NORMAL, + access_tx_read_pio_memory_csr_unc_err_cnt), +[C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryCsrUncErr", 0, + 0, CNTR_NORMAL, + access_tx_read_sdma_memory_csr_err_cnt), +[C_TX_EGRESS_FIFO_COR_ERR] = CNTR_ELEM("TxEgressFifoCorErr", 0, 0, + CNTR_NORMAL, + access_tx_egress_fifo_cor_err_cnt), +[C_TX_READ_PIO_MEMORY_COR_ERR] = CNTR_ELEM("TxReadPioMemoryCorErr", 0, 0, + CNTR_NORMAL, + access_tx_read_pio_memory_cor_err_cnt), +[C_TX_READ_SDMA_MEMORY_COR_ERR] = CNTR_ELEM("TxReadSdmaMemoryCorErr", 0, 0, + CNTR_NORMAL, + access_tx_read_sdma_memory_cor_err_cnt), +[C_TX_SB_HDR_COR_ERR] = CNTR_ELEM("TxSbHdrCorErr", 0, 0, + CNTR_NORMAL, + access_tx_sb_hdr_cor_err_cnt), +[C_TX_CREDIT_OVERRUN_ERR] = CNTR_ELEM("TxCreditOverrunErr", 0, 0, + CNTR_NORMAL, + access_tx_credit_overrun_err_cnt), +[C_TX_LAUNCH_FIFO8_COR_ERR] = CNTR_ELEM("TxLaunchFifo8CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo8_cor_err_cnt), +[C_TX_LAUNCH_FIFO7_COR_ERR] = CNTR_ELEM("TxLaunchFifo7CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo7_cor_err_cnt), +[C_TX_LAUNCH_FIFO6_COR_ERR] = CNTR_ELEM("TxLaunchFifo6CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo6_cor_err_cnt), 
+[C_TX_LAUNCH_FIFO5_COR_ERR] = CNTR_ELEM("TxLaunchFifo5CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo5_cor_err_cnt), +[C_TX_LAUNCH_FIFO4_COR_ERR] = CNTR_ELEM("TxLaunchFifo4CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo4_cor_err_cnt), +[C_TX_LAUNCH_FIFO3_COR_ERR] = CNTR_ELEM("TxLaunchFifo3CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo3_cor_err_cnt), +[C_TX_LAUNCH_FIFO2_COR_ERR] = CNTR_ELEM("TxLaunchFifo2CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo2_cor_err_cnt), +[C_TX_LAUNCH_FIFO1_COR_ERR] = CNTR_ELEM("TxLaunchFifo1CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo1_cor_err_cnt), +[C_TX_LAUNCH_FIFO0_COR_ERR] = CNTR_ELEM("TxLaunchFifo0CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo0_cor_err_cnt), +[C_TX_CREDIT_RETURN_VL_ERR] = CNTR_ELEM("TxCreditReturnVLErr", 0, 0, + CNTR_NORMAL, + access_tx_credit_return_vl_err_cnt), +[C_TX_HCRC_INSERTION_ERR] = CNTR_ELEM("TxHcrcInsertionErr", 0, 0, + CNTR_NORMAL, + access_tx_hcrc_insertion_err_cnt), +[C_TX_EGRESS_FIFI_UNC_ERR] = CNTR_ELEM("TxEgressFifoUncErr", 0, 0, + CNTR_NORMAL, + access_tx_egress_fifo_unc_err_cnt), +[C_TX_READ_PIO_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryUncErr", 0, 0, + CNTR_NORMAL, + access_tx_read_pio_memory_unc_err_cnt), +[C_TX_READ_SDMA_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryUncErr", 0, 0, + CNTR_NORMAL, + access_tx_read_sdma_memory_unc_err_cnt), +[C_TX_SB_HDR_UNC_ERR] = CNTR_ELEM("TxSbHdrUncErr", 0, 0, + CNTR_NORMAL, + access_tx_sb_hdr_unc_err_cnt), +[C_TX_CREDIT_RETURN_PARITY_ERR] = CNTR_ELEM("TxCreditReturnParityErr", 0, 0, + CNTR_NORMAL, + access_tx_credit_return_partiy_err_cnt), +[C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo8UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo8_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo7UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo7_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo6UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo6_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo5UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo5_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo4UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo4_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo3UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo3_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo2UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo2_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo1UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo1_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo0UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo0_unc_or_parity_err_cnt), +[C_TX_SDMA15_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma15DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma15_disallowed_packet_err_cnt), +[C_TX_SDMA14_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma14DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma14_disallowed_packet_err_cnt), +[C_TX_SDMA13_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma13DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma13_disallowed_packet_err_cnt), +[C_TX_SDMA12_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma12DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + 
access_tx_sdma12_disallowed_packet_err_cnt), +[C_TX_SDMA11_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma11DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma11_disallowed_packet_err_cnt), +[C_TX_SDMA10_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma10DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma10_disallowed_packet_err_cnt), +[C_TX_SDMA9_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma9DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma9_disallowed_packet_err_cnt), +[C_TX_SDMA8_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma8DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma8_disallowed_packet_err_cnt), +[C_TX_SDMA7_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma7DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma7_disallowed_packet_err_cnt), +[C_TX_SDMA6_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma6DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma6_disallowed_packet_err_cnt), +[C_TX_SDMA5_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma5DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma5_disallowed_packet_err_cnt), +[C_TX_SDMA4_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma4DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma4_disallowed_packet_err_cnt), +[C_TX_SDMA3_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma3DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma3_disallowed_packet_err_cnt), +[C_TX_SDMA2_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma2DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma2_disallowed_packet_err_cnt), +[C_TX_SDMA1_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma1DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma1_disallowed_packet_err_cnt), +[C_TX_SDMA0_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma0DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma0_disallowed_packet_err_cnt), +[C_TX_CONFIG_PARITY_ERR] = CNTR_ELEM("TxConfigParityErr", 0, 0, + CNTR_NORMAL, + access_tx_config_parity_err_cnt), +[C_TX_SBRD_CTL_CSR_PARITY_ERR] = CNTR_ELEM("TxSbrdCtlCsrParityErr", 0, 0, + CNTR_NORMAL, + access_tx_sbrd_ctl_csr_parity_err_cnt), +[C_TX_LAUNCH_CSR_PARITY_ERR] = CNTR_ELEM("TxLaunchCsrParityErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_csr_parity_err_cnt), +[C_TX_ILLEGAL_CL_ERR] = CNTR_ELEM("TxIllegalVLErr", 0, 0, + CNTR_NORMAL, + access_tx_illegal_vl_err_cnt), +[C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR] = CNTR_ELEM( + "TxSbrdCtlStateMachineParityErr", 0, 0, + CNTR_NORMAL, + access_tx_sbrd_ctl_state_machine_parity_err_cnt), +[C_TX_RESERVED_10] = CNTR_ELEM("Tx Egress Reserved 10", 0, 0, + CNTR_NORMAL, + access_egress_reserved_10_err_cnt), +[C_TX_RESERVED_9] = CNTR_ELEM("Tx Egress Reserved 9", 0, 0, + CNTR_NORMAL, + access_egress_reserved_9_err_cnt), +[C_TX_SDMA_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxSdmaLaunchIntfParityErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma_launch_intf_parity_err_cnt), +[C_TX_PIO_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxPioLaunchIntfParityErr", 0, 0, + CNTR_NORMAL, + access_tx_pio_launch_intf_parity_err_cnt), +[C_TX_RESERVED_6] = CNTR_ELEM("Tx Egress Reserved 6", 0, 0, + CNTR_NORMAL, + access_egress_reserved_6_err_cnt), +[C_TX_INCORRECT_LINK_STATE_ERR] = CNTR_ELEM("TxIncorrectLinkStateErr", 0, 0, + CNTR_NORMAL, + access_tx_incorrect_link_state_err_cnt), +[C_TX_LINK_DOWN_ERR] = CNTR_ELEM("TxLinkdownErr", 0, 0, + CNTR_NORMAL, + access_tx_linkdown_err_cnt), +[C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR] = CNTR_ELEM( + "EgressFifoUnderrunOrParityErr", 0, 0, + CNTR_NORMAL, + access_tx_egress_fifi_underrun_or_parity_err_cnt), +[C_TX_RESERVED_2] = CNTR_ELEM("Tx Egress Reserved 2", 0, 0, + CNTR_NORMAL, + 
access_egress_reserved_2_err_cnt), +[C_TX_PKT_INTEGRITY_MEM_UNC_ERR] = CNTR_ELEM("TxPktIntegrityMemUncErr", 0, 0, + CNTR_NORMAL, + access_tx_pkt_integrity_mem_unc_err_cnt), +[C_TX_PKT_INTEGRITY_MEM_COR_ERR] = CNTR_ELEM("TxPktIntegrityMemCorErr", 0, 0, + CNTR_NORMAL, + access_tx_pkt_integrity_mem_cor_err_cnt), +/* SendErrStatus */ +[C_SEND_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("SendCsrWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_send_csr_write_bad_addr_err_cnt), +[C_SEND_CSR_READ_BAD_ADD_ERR] = CNTR_ELEM("SendCsrReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_send_csr_read_bad_addr_err_cnt), +[C_SEND_CSR_PARITY_ERR] = CNTR_ELEM("SendCsrParityErr", 0, 0, + CNTR_NORMAL, + access_send_csr_parity_cnt), +/* SendCtxtErrStatus */ +[C_PIO_WRITE_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("PioWriteOutOfBoundsErr", 0, 0, + CNTR_NORMAL, + access_pio_write_out_of_bounds_err_cnt), +[C_PIO_WRITE_OVERFLOW_ERR] = CNTR_ELEM("PioWriteOverflowErr", 0, 0, + CNTR_NORMAL, + access_pio_write_overflow_err_cnt), +[C_PIO_WRITE_CROSSES_BOUNDARY_ERR] = CNTR_ELEM("PioWriteCrossesBoundaryErr", + 0, 0, CNTR_NORMAL, + access_pio_write_crosses_boundary_err_cnt), +[C_PIO_DISALLOWED_PACKET_ERR] = CNTR_ELEM("PioDisallowedPacketErr", 0, 0, + CNTR_NORMAL, + access_pio_disallowed_packet_err_cnt), +[C_PIO_INCONSISTENT_SOP_ERR] = CNTR_ELEM("PioInconsistentSopErr", 0, 0, + CNTR_NORMAL, + access_pio_inconsistent_sop_err_cnt), +/* SendDmaEngErrStatus */ +[C_SDMA_HEADER_REQUEST_FIFO_COR_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoCorErr", + 0, 0, CNTR_NORMAL, + access_sdma_header_request_fifo_cor_err_cnt), +[C_SDMA_HEADER_STORAGE_COR_ERR] = CNTR_ELEM("SDmaHeaderStorageCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_storage_cor_err_cnt), +[C_SDMA_PACKET_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPacketTrackingCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_packet_tracking_cor_err_cnt), +[C_SDMA_ASSEMBLY_COR_ERR] = CNTR_ELEM("SDmaAssemblyCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_assembly_cor_err_cnt), +[C_SDMA_DESC_TABLE_COR_ERR] = CNTR_ELEM("SDmaDescTableCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_desc_table_cor_err_cnt), +[C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoUncErr", + 0, 0, CNTR_NORMAL, + access_sdma_header_request_fifo_unc_err_cnt), +[C_SDMA_HEADER_STORAGE_UNC_ERR] = CNTR_ELEM("SDmaHeaderStorageUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_storage_unc_err_cnt), +[C_SDMA_PACKET_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPacketTrackingUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_packet_tracking_unc_err_cnt), +[C_SDMA_ASSEMBLY_UNC_ERR] = CNTR_ELEM("SDmaAssemblyUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_assembly_unc_err_cnt), +[C_SDMA_DESC_TABLE_UNC_ERR] = CNTR_ELEM("SDmaDescTableUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_desc_table_unc_err_cnt), +[C_SDMA_TIMEOUT_ERR] = CNTR_ELEM("SDmaTimeoutErr", 0, 0, + CNTR_NORMAL, + access_sdma_timeout_err_cnt), +[C_SDMA_HEADER_LENGTH_ERR] = CNTR_ELEM("SDmaHeaderLengthErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_length_err_cnt), +[C_SDMA_HEADER_ADDRESS_ERR] = CNTR_ELEM("SDmaHeaderAddressErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_address_err_cnt), +[C_SDMA_HEADER_SELECT_ERR] = CNTR_ELEM("SDmaHeaderSelectErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_select_err_cnt), +[C_SMDA_RESERVED_9] = CNTR_ELEM("SDma Reserved 9", 0, 0, + CNTR_NORMAL, + access_sdma_reserved_9_err_cnt), +[C_SDMA_PACKET_DESC_OVERFLOW_ERR] = CNTR_ELEM("SDmaPacketDescOverflowErr", 0, 0, + CNTR_NORMAL, + access_sdma_packet_desc_overflow_err_cnt), +[C_SDMA_LENGTH_MISMATCH_ERR] = CNTR_ELEM("SDmaLengthMismatchErr", 0, 0, + 
CNTR_NORMAL, + access_sdma_length_mismatch_err_cnt), +[C_SDMA_HALT_ERR] = CNTR_ELEM("SDmaHaltErr", 0, 0, + CNTR_NORMAL, + access_sdma_halt_err_cnt), +[C_SDMA_MEM_READ_ERR] = CNTR_ELEM("SDmaMemReadErr", 0, 0, + CNTR_NORMAL, + access_sdma_mem_read_err_cnt), +[C_SDMA_FIRST_DESC_ERR] = CNTR_ELEM("SDmaFirstDescErr", 0, 0, + CNTR_NORMAL, + access_sdma_first_desc_err_cnt), +[C_SDMA_TAIL_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("SDmaTailOutOfBoundsErr", 0, 0, + CNTR_NORMAL, + access_sdma_tail_out_of_bounds_err_cnt), +[C_SDMA_TOO_LONG_ERR] = CNTR_ELEM("SDmaTooLongErr", 0, 0, + CNTR_NORMAL, + access_sdma_too_long_err_cnt), +[C_SDMA_GEN_MISMATCH_ERR] = CNTR_ELEM("SDmaGenMismatchErr", 0, 0, + CNTR_NORMAL, + access_sdma_gen_mismatch_err_cnt), +[C_SDMA_WRONG_DW_ERR] = CNTR_ELEM("SDmaWrongDwErr", 0, 0, + CNTR_NORMAL, + access_sdma_wrong_dw_err_cnt), +}; + +static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = { +[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT, + CNTR_NORMAL), +[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT, + CNTR_NORMAL), +[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT, + CNTR_NORMAL), +[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT, + CNTR_NORMAL), +[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT, + CNTR_NORMAL), +[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT, + CNTR_NORMAL), +[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT, + CNTR_NORMAL), +[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL), +[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL), +[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH), +[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL), +[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL), +[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_link_dn_cnt), +[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_link_up_cnt), +[C_SW_UNKNOWN_FRAME] = CNTR_ELEM("UnknownFrame", 0, 0, CNTR_NORMAL, + access_sw_unknown_frame_cnt), +[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_xmit_discards), +[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0, + CNTR_SYNTH | CNTR_32BIT | CNTR_VL, + access_sw_xmit_discards), +[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH, + access_xmit_constraint_errs), +[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH, + access_rcv_constraint_errs), +[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts), +[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends), +[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks), +[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks), +[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts), +[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops), +[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait), +[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak), +[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq), +[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq), +[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned), +[C_SW_IBP_SEQ_NAK] = 
SW_IBP_CNTR(SeqNak, seq_naks), +[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_acks), +[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_qacks), +[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_delayed_comp), +[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1), +[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3), +[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5), +[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7), +[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9), +[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11), +[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13), +[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15), +[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17), +[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19), +[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21), +[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23), +[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25), +[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27), +[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29), +[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31), +[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33), +[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35), +[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37), +[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39), +[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41), +[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43), +[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45), +[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47), +[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49), +[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51), +[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53), +[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55), +[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57), +[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59), +[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61), +[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63), +[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65), +[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67), +[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69), +[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71), +[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73), +[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75), +[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77), +[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79), +[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81), +[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83), +[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85), +[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87), +[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89), +[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91), +[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93), +[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95), +[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97), +[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99), +[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101), +[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103), +[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105), +[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107), +[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = 
OVR_ELM(109), +[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111), +[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113), +[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115), +[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117), +[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119), +[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121), +[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123), +[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125), +[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127), +[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129), +[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131), +[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133), +[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135), +[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137), +[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139), +[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141), +[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143), +[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145), +[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147), +[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149), +[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151), +[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153), +[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155), +[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157), +[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159), +}; + +/* ======================================================================== */ + +/* return true if this is chip revision revision a */ +int is_ax(struct hfi1_devdata *dd) +{ + u8 chip_rev_minor = + dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT + & CCE_REVISION_CHIP_REV_MINOR_MASK; + return (chip_rev_minor & 0xf0) == 0; +} + +/* return true if this is chip revision revision b */ +int is_bx(struct hfi1_devdata *dd) +{ + u8 chip_rev_minor = + dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT + & CCE_REVISION_CHIP_REV_MINOR_MASK; + return (chip_rev_minor & 0xF0) == 0x10; +} + +/* + * Append string s to buffer buf. Arguments curp and len are the current + * position and remaining length, respectively. + * + * return 0 on success, 1 on out of room + */ +static int append_str(char *buf, char **curp, int *lenp, const char *s) +{ + char *p = *curp; + int len = *lenp; + int result = 0; /* success */ + char c; + + /* add a comma, if first in the buffer */ + if (p != buf) { + if (len == 0) { + result = 1; /* out of room */ + goto done; + } + *p++ = ','; + len--; + } + + /* copy the string */ + while ((c = *s++) != 0) { + if (len == 0) { + result = 1; /* out of room */ + goto done; + } + *p++ = c; + len--; + } + +done: + /* write return values */ + *curp = p; + *lenp = len; + + return result; +} + +/* + * Using the given flag table, print a comma separated string into + * the buffer. End in '*' if the buffer is too short. 
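The dev_cntrs[] and port_cntrs[] tables that close just above follow one pattern: a sparse array indexed by counter enums, filled in with designated initializers, where each entry pairs a printable name with an access callback that knows how to fetch or synthesize its value. The following is a minimal userspace sketch of that pattern only, assuming invented types and names (demo_cntr, demo_access_fn, fake_hw); it is not the driver's real cntr_entry/CNTR_ELEM layout.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t (*demo_access_fn)(void *ctx);

struct demo_cntr {
        const char *name;
        demo_access_fn read;
};

enum { DEMO_RX_ERR, DEMO_TX_ERR, DEMO_NUM_CNTRS };

static uint64_t read_rx_err(void *ctx) { return ((uint64_t *)ctx)[0]; }
static uint64_t read_tx_err(void *ctx) { return ((uint64_t *)ctx)[1]; }

/* designated initializers keep each slot next to its enum index, as above */
static const struct demo_cntr demo_cntrs[DEMO_NUM_CNTRS] = {
        [DEMO_RX_ERR] = { "RxErr", read_rx_err },
        [DEMO_TX_ERR] = { "TxErr", read_tx_err },
};

int main(void)
{
        uint64_t fake_hw[2] = { 3, 7 };  /* stand-in for hardware counters */
        int i;

        for (i = 0; i < DEMO_NUM_CNTRS; i++)
                printf("%s = %llu\n", demo_cntrs[i].name,
                       (unsigned long long)demo_cntrs[i].read(fake_hw));
        return 0;
}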
+ */ +static char *flag_string(char *buf, int buf_len, u64 flags, + struct flag_table *table, int table_size) +{ + char extra[32]; + char *p = buf; + int len = buf_len; + int no_room = 0; + int i; + + /* make sure there is at least 2 so we can form "*" */ + if (len < 2) + return ""; + + len--; /* leave room for a nul */ + for (i = 0; i < table_size; i++) { + if (flags & table[i].flag) { + no_room = append_str(buf, &p, &len, table[i].str); + if (no_room) + break; + flags &= ~table[i].flag; + } + } + + /* any undocumented bits left? */ + if (!no_room && flags) { + snprintf(extra, sizeof(extra), "bits 0x%llx", flags); + no_room = append_str(buf, &p, &len, extra); + } + + /* add * if ran out of room */ + if (no_room) { + /* may need to back up to add space for a '*' */ + if (len == 0) + --p; + *p++ = '*'; + } + + /* add final nul - space already allocated above */ + *p = 0; + return buf; +} + +/* first 8 CCE error interrupt source names */ +static const char * const cce_misc_names[] = { + "CceErrInt", /* 0 */ + "RxeErrInt", /* 1 */ + "MiscErrInt", /* 2 */ + "Reserved3", /* 3 */ + "PioErrInt", /* 4 */ + "SDmaErrInt", /* 5 */ + "EgressErrInt", /* 6 */ + "TxeErrInt" /* 7 */ +}; + +/* + * Return the miscellaneous error interrupt name. + */ +static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source) +{ + if (source < ARRAY_SIZE(cce_misc_names)) + strncpy(buf, cce_misc_names[source], bsize); + else + snprintf(buf, bsize, "Reserved%u", + source + IS_GENERAL_ERR_START); + + return buf; +} + +/* + * Return the SDMA engine error interrupt name. + */ +static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SDmaEngErrInt%u", source); + return buf; +} + +/* + * Return the send context error interrupt name. + */ +static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SendCtxtErrInt%u", source); + return buf; +} + +static const char * const various_names[] = { + "PbcInt", + "GpioAssertInt", + "Qsfp1Int", + "Qsfp2Int", + "TCritInt" +}; + +/* + * Return the various interrupt name. + */ +static char *is_various_name(char *buf, size_t bsize, unsigned int source) +{ + if (source < ARRAY_SIZE(various_names)) + strncpy(buf, various_names[source], bsize); + else + snprintf(buf, bsize, "Reserved%u", source + IS_VARIOUS_START); + return buf; +} + +/* + * Return the DC interrupt name. + */ +static char *is_dc_name(char *buf, size_t bsize, unsigned int source) +{ + static const char * const dc_int_names[] = { + "common", + "lcb", + "8051", + "lbm" /* local block merge */ + }; + + if (source < ARRAY_SIZE(dc_int_names)) + snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]); + else + snprintf(buf, bsize, "DCInt%u", source); + return buf; +} + +static const char * const sdma_int_names[] = { + "SDmaInt", + "SdmaIdleInt", + "SdmaProgressInt", +}; + +/* + * Return the SDMA engine interrupt name. + */ +static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source) +{ + /* what interrupt */ + unsigned int what = source / TXE_NUM_SDMA_ENGINES; + /* which engine */ + unsigned int which = source % TXE_NUM_SDMA_ENGINES; + + if (likely(what < 3)) + snprintf(buf, bsize, "%s%u", sdma_int_names[what], which); + else + snprintf(buf, bsize, "Invalid SDMA interrupt %u", source); + return buf; +} + +/* + * Return the receive available interrupt name. 
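flag_string() walks a flag table, appends the name of every set bit with append_str(), and then reports any leftover undocumented bits; the real routine also marks truncation with a trailing '*'. Below is a self-contained userspace model of the same decode, with example flag names only and the truncation marker omitted for brevity.

#include <stdio.h>
#include <stdint.h>

struct demo_flag { uint64_t flag; const char *str; };

/* example bits only, not a real CSR layout */
static const struct demo_flag demo_flags[] = {
        { 1ULL << 0, "CsrParityErr" },
        { 1ULL << 1, "FifoOverflowErr" },
};

/* build "Name,Name,bits 0x..." into buf, in the spirit of flag_string() */
static const char *demo_flag_string(char *buf, size_t len, uint64_t flags)
{
        size_t used = 0;
        size_t i;

        buf[0] = '\0';
        for (i = 0; i < sizeof(demo_flags) / sizeof(demo_flags[0]); i++) {
                if (!(flags & demo_flags[i].flag))
                        continue;
                used += snprintf(buf + used, len - used, "%s%s",
                                 used ? "," : "", demo_flags[i].str);
                if (used >= len)        /* truncated: stop growing */
                        used = len - 1;
                flags &= ~demo_flags[i].flag;
        }
        if (flags)      /* any undocumented bits left over */
                snprintf(buf + used, len - used, "%sbits 0x%llx",
                         used ? "," : "", (unsigned long long)flags);
        return buf;
}

int main(void)
{
        char buf[96];

        /* prints "FifoOverflowErr,bits 0x20" */
        puts(demo_flag_string(buf, sizeof(buf), (1ULL << 1) | (1ULL << 5)));
        return 0;
}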
+ */ +static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "RcvAvailInt%u", source); + return buf; +} + +/* + * Return the receive urgent interrupt name. + */ +static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "RcvUrgentInt%u", source); + return buf; +} + +/* + * Return the send credit interrupt name. + */ +static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SendCreditInt%u", source); + return buf; +} + +/* + * Return the reserved interrupt name. + */ +static char *is_reserved_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START); + return buf; +} + +static char *cce_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + cce_err_status_flags, + ARRAY_SIZE(cce_err_status_flags)); +} + +static char *rxe_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + rxe_err_status_flags, + ARRAY_SIZE(rxe_err_status_flags)); +} + +static char *misc_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, misc_err_status_flags, + ARRAY_SIZE(misc_err_status_flags)); +} + +static char *pio_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + pio_err_status_flags, + ARRAY_SIZE(pio_err_status_flags)); +} + +static char *sdma_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + sdma_err_status_flags, + ARRAY_SIZE(sdma_err_status_flags)); +} + +static char *egress_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + egress_err_status_flags, + ARRAY_SIZE(egress_err_status_flags)); +} + +static char *egress_err_info_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + egress_err_info_flags, + ARRAY_SIZE(egress_err_info_flags)); +} + +static char *send_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + send_err_status_flags, + ARRAY_SIZE(send_err_status_flags)); +} + +static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + /* + * For most these errors, there is nothing that can be done except + * report or record it. + */ + dd_dev_info(dd, "CCE Error: %s\n", + cce_err_status_string(buf, sizeof(buf), reg)); + + if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) && + is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) { + /* this error requires a manual drop into SPC freeze mode */ + /* then a fix up */ + start_freeze_handling(dd->pport, FREEZE_SELF); + } + + for (i = 0; i < NUM_CCE_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) { + incr_cntr64(&dd->cce_err_status_cnt[i]); + /* maintain a counter over all cce_err_status errors */ + incr_cntr64(&dd->sw_cce_err_status_aggregate); + } + } +} + +/* + * Check counters for receive errors that do not have an interrupt + * associated with them. 
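handle_cce_err() above, like the other handle_*_err() routines that follow, finishes by walking the status value bit by bit and bumping a per-bit software counter for each bit that is set. A tiny standalone model of that bookkeeping (the array name and bit count here are invented for the example):

#include <stdio.h>
#include <stdint.h>

#define DEMO_NUM_STATUS_BITS 8

static uint64_t demo_err_status_cnt[DEMO_NUM_STATUS_BITS];

/* credit every set bit in reg to its own counter, as the handlers do */
static void demo_count_status_bits(uint64_t reg)
{
        int i;

        for (i = 0; i < DEMO_NUM_STATUS_BITS; i++)
                if (reg & (1ULL << i))
                        demo_err_status_cnt[i]++;
}

int main(void)
{
        int i;

        demo_count_status_bits(0x5);    /* bits 0 and 2 set */
        for (i = 0; i < DEMO_NUM_STATUS_BITS; i++)
                printf("bit %d: %llu\n", i,
                       (unsigned long long)demo_err_status_cnt[i]);
        return 0;
}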
+ */ +#define RCVERR_CHECK_TIME 10 +static void update_rcverr_timer(struct timer_list *t) +{ + struct hfi1_devdata *dd = from_timer(dd, t, rcverr_timer); + struct hfi1_pportdata *ppd = dd->pport; + u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL); + + if (dd->rcv_ovfl_cnt < cur_ovfl_cnt && + ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) { + dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__); + set_link_down_reason( + ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0, + OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN); + queue_work(ppd->link_wq, &ppd->link_bounce_work); + } + dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt; + + mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME); +} + +static int init_rcverr(struct hfi1_devdata *dd) +{ + timer_setup(&dd->rcverr_timer, update_rcverr_timer, 0); + /* Assume the hardware counter has been reset */ + dd->rcv_ovfl_cnt = 0; + return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME); +} + +static void free_rcverr(struct hfi1_devdata *dd) +{ + if (dd->rcverr_timer.function) + del_timer_sync(&dd->rcverr_timer); +} + +static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "Receive Error: %s\n", + rxe_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_RXE_FREEZE_ERR) { + int flags = 0; + + /* + * Freeze mode recovery is disabled for the errors + * in RXE_FREEZE_ABORT_MASK + */ + if (is_ax(dd) && (reg & RXE_FREEZE_ABORT_MASK)) + flags = FREEZE_ABORT; + + start_freeze_handling(dd->pport, flags); + } + + for (i = 0; i < NUM_RCV_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->rcv_err_status_cnt[i]); + } +} + +static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "Misc Error: %s", + misc_err_status_string(buf, sizeof(buf), reg)); + for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->misc_err_status_cnt[i]); + } +} + +static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "PIO Error: %s\n", + pio_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_PIO_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); + + for (i = 0; i < NUM_SEND_PIO_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_pio_err_status_cnt[i]); + } +} + +static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "SDMA Error: %s\n", + sdma_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_SDMA_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); + + for (i = 0; i < NUM_SEND_DMA_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_dma_err_status_cnt[i]); + } +} + +static inline void __count_port_discards(struct hfi1_pportdata *ppd) +{ + incr_cntr64(&ppd->port_xmit_discards); +} + +static void count_port_inactive(struct hfi1_devdata *dd) +{ + __count_port_discards(dd->pport); +} + +/* + * We have had a "disallowed packet" error during egress. Determine the + * integrity check which failed, and update relevant error counter, etc. + * + * Note that the SEND_EGRESS_ERR_INFO register has only a single + * bit of state per integrity check, and so we can miss the reason for an + * egress error if more than one packet fails the same integrity check + * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO. 
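update_rcverr_timer() is a poll-and-compare check: each pass it reads the overflow counter and acts only if the value grew since the previous poll and the port is configured to react. The compare-and-latch step alone, modeled in plain C with an invented helper name:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* returns true when the counter advanced since the previous poll */
static bool demo_overflow_seen(uint32_t *prev, uint32_t cur)
{
        bool grew = cur > *prev;

        *prev = cur;            /* remember the latest reading either way */
        return grew;
}

int main(void)
{
        uint32_t prev = 0;

        printf("%d\n", demo_overflow_seen(&prev, 0));   /* 0: no change */
        printf("%d\n", demo_overflow_seen(&prev, 4));   /* 1: overflows happened */
        printf("%d\n", demo_overflow_seen(&prev, 4));   /* 0: quiet again */
        return 0;
}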
+ */ +static void handle_send_egress_err_info(struct hfi1_devdata *dd, + int vl) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */ + u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO); + char buf[96]; + + /* clear down all observed info as quickly as possible after read */ + write_csr(dd, SEND_EGRESS_ERR_INFO, info); + + dd_dev_info(dd, + "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n", + info, egress_err_info_string(buf, sizeof(buf), info), src); + + /* Eventually add other counters for each bit */ + if (info & PORT_DISCARD_EGRESS_ERRS) { + int weight, i; + + /* + * Count all applicable bits as individual errors and + * attribute them to the packet that triggered this handler. + * This may not be completely accurate due to limitations + * on the available hardware error information. There is + * a single information register and any number of error + * packets may have occurred and contributed to it before + * this routine is called. This means that: + * a) If multiple packets with the same error occur before + * this routine is called, earlier packets are missed. + * There is only a single bit for each error type. + * b) Errors may not be attributed to the correct VL. + * The driver is attributing all bits in the info register + * to the packet that triggered this call, but bits + * could be an accumulation of different packets with + * different VLs. + * c) A single error packet may have multiple counts attached + * to it. There is no way for the driver to know if + * multiple bits set in the info register are due to a + * single packet or multiple packets. The driver assumes + * multiple packets. + */ + weight = hweight64(info & PORT_DISCARD_EGRESS_ERRS); + for (i = 0; i < weight; i++) { + __count_port_discards(ppd); + if (vl >= 0 && vl < TXE_NUM_DATA_VL) + incr_cntr64(&ppd->port_xmit_discards_vl[vl]); + else if (vl == 15) + incr_cntr64(&ppd->port_xmit_discards_vl + [C_VL_15]); + } + } +} + +/* + * Input value is a bit position within the SEND_EGRESS_ERR_STATUS + * register. Does it represent a 'port inactive' error? + */ +static inline int port_inactive_err(u64 posn) +{ + return (posn >= SEES(TX_LINKDOWN) && + posn <= SEES(TX_INCORRECT_LINK_STATE)); +} + +/* + * Input value is a bit position within the SEND_EGRESS_ERR_STATUS + * register. Does it represent a 'disallowed packet' error? + */ +static inline int disallowed_pkt_err(int posn) +{ + return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) && + posn <= SEES(TX_SDMA15_DISALLOWED_PACKET)); +} + +/* + * Input value is a bit position of one of the SDMA engine disallowed + * packet errors. Return which engine. Use of this must be guarded by + * disallowed_pkt_err(). + */ +static inline int disallowed_pkt_engine(int posn) +{ + return posn - SEES(TX_SDMA0_DISALLOWED_PACKET); +} + +/* + * Translate an SDMA engine to a VL. Return -1 if the tranlation cannot + * be done. + */ +static int engine_to_vl(struct hfi1_devdata *dd, int engine) +{ + struct sdma_vl_map *m; + int vl; + + /* range check */ + if (engine < 0 || engine >= TXE_NUM_SDMA_ENGINES) + return -1; + + rcu_read_lock(); + m = rcu_dereference(dd->sdma_map); + vl = m->engine_to_vl[engine]; + rcu_read_unlock(); + + return vl; +} + +/* + * Translate the send context (sofware index) into a VL. Return -1 if the + * translation cannot be done. 
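handle_send_egress_err_info() above attributes one discard per set bit of the masked info register (hweight64), and disallowed_pkt_engine() recovers the SDMA engine from a bit's position so it can be mapped to a VL. A compact userspace model of those two steps; the bit layout and constants are placeholders rather than the hardware's real register format, and __builtin_popcountll stands in for the kernel's hweight64().

#include <stdint.h>
#include <stdio.h>

/* placeholder layout: pretend bits 4..19 are the per-engine disallowed-packet bits */
#define DEMO_DISALLOWED_FIRST_BIT 4
#define DEMO_NUM_ENGINES          16

/* one discard per set bit, mirroring the hweight64() loop in the handler */
static unsigned int demo_count_discards(uint64_t info, uint64_t discard_mask)
{
        return (unsigned int)__builtin_popcountll(info & discard_mask);
}

/* bit position -> engine index, in the spirit of disallowed_pkt_engine() */
static int demo_bit_to_engine(int posn)
{
        int engine = posn - DEMO_DISALLOWED_FIRST_BIT;

        return (engine >= 0 && engine < DEMO_NUM_ENGINES) ? engine : -1;
}

int main(void)
{
        uint64_t info = (1ULL << 5) | (1ULL << 9);

        printf("discards: %u\n", demo_count_discards(info, 0xffffull << 4));
        printf("bit 9 -> engine %d\n", demo_bit_to_engine(9));
        return 0;
}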
+ */ +static int sc_to_vl(struct hfi1_devdata *dd, int sw_index) +{ + struct send_context_info *sci; + struct send_context *sc; + int i; + + sci = &dd->send_contexts[sw_index]; + + /* there is no information for user (PSM) and ack contexts */ + if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15)) + return -1; + + sc = sci->sc; + if (!sc) + return -1; + if (dd->vld[15].sc == sc) + return 15; + for (i = 0; i < num_vls; i++) + if (dd->vld[i].sc == sc) + return i; + + return -1; +} + +static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + u64 reg_copy = reg, handled = 0; + char buf[96]; + int i = 0; + + if (reg & ALL_TXE_EGRESS_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); + else if (is_ax(dd) && + (reg & SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) && + (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) + start_freeze_handling(dd->pport, 0); + + while (reg_copy) { + int posn = fls64(reg_copy); + /* fls64() returns a 1-based offset, we want it zero based */ + int shift = posn - 1; + u64 mask = 1ULL << shift; + + if (port_inactive_err(shift)) { + count_port_inactive(dd); + handled |= mask; + } else if (disallowed_pkt_err(shift)) { + int vl = engine_to_vl(dd, disallowed_pkt_engine(shift)); + + handle_send_egress_err_info(dd, vl); + handled |= mask; + } + reg_copy &= ~mask; + } + + reg &= ~handled; + + if (reg) + dd_dev_info(dd, "Egress Error: %s\n", + egress_err_status_string(buf, sizeof(buf), reg)); + + for (i = 0; i < NUM_SEND_EGRESS_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_egress_err_status_cnt[i]); + } +} + +static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "Send Error: %s\n", + send_err_status_string(buf, sizeof(buf), reg)); + + for (i = 0; i < NUM_SEND_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_err_status_cnt[i]); + } +} + +/* + * The maximum number of times the error clear down will loop before + * blocking a repeating error. This value is arbitrary. + */ +#define MAX_CLEAR_COUNT 20 + +/* + * Clear and handle an error register. All error interrupts are funneled + * through here to have a central location to correctly handle single- + * or multi-shot errors. + * + * For non per-context registers, call this routine with a context value + * of 0 so the per-context offset is zero. + * + * If the handler loops too many times, assume that something is wrong + * and can't be fixed, so mask the error bits. + */ +static void interrupt_clear_down(struct hfi1_devdata *dd, + u32 context, + const struct err_reg_info *eri) +{ + u64 reg; + u32 count; + + /* read in a loop until no more errors are seen */ + count = 0; + while (1) { + reg = read_kctxt_csr(dd, context, eri->status); + if (reg == 0) + break; + write_kctxt_csr(dd, context, eri->clear, reg); + if (likely(eri->handler)) + eri->handler(dd, context, reg); + count++; + if (count > MAX_CLEAR_COUNT) { + u64 mask; + + dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n", + eri->desc, reg); + /* + * Read-modify-write so any other masked bits + * remain masked. + */ + mask = read_kctxt_csr(dd, context, eri->mask); + mask &= ~reg; + write_kctxt_csr(dd, context, eri->mask, mask); + break; + } + } +} + +/* + * CCE block "misc" interrupt. Source is < 16. 
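interrupt_clear_down() just above is the central read/clear loop: keep re-reading the status register until it reads back zero, and after MAX_CLEAR_COUNT passes assume the error is stuck and mask it with a read-modify-write of the mask register. The sketch below models that control flow against a fake register pair; the struct and the stuck-bit behaviour are invented for the example.

#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_CLEAR_COUNT 20

/* a fake register block: a status register gated by a mask register */
struct demo_regs {
        uint64_t status;
        uint64_t mask;
};

static uint64_t read_status(struct demo_regs *r) { return r->status & r->mask; }

static void write_clear(struct demo_regs *r, uint64_t bits)
{
        /* pretend bit 0 is a stuck hardware error that re-asserts itself */
        r->status &= ~bits;
        r->status |= bits & 0x1;
}

/* loop until clean, then mask anything that keeps coming back */
static void demo_clear_down(struct demo_regs *r)
{
        uint64_t reg;
        int count = 0;

        while ((reg = read_status(r)) != 0) {
                write_clear(r, reg);
                if (++count > DEMO_MAX_CLEAR_COUNT) {
                        printf("masking stuck bits 0x%llx\n",
                               (unsigned long long)reg);
                        r->mask &= ~reg;   /* read-modify-write of the mask */
                        break;
                }
        }
}

int main(void)
{
        struct demo_regs r = { .status = 0x5, .mask = ~0ULL };

        demo_clear_down(&r);
        return 0;
}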
+ */ +static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &misc_errs[source]; + + if (eri->handler) { + interrupt_clear_down(dd, 0, eri); + } else { + dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n", + source); + } +} + +static char *send_context_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + sc_err_status_flags, + ARRAY_SIZE(sc_err_status_flags)); +} + +/* + * Send context error interrupt. Source (hw_context) is < 160. + * + * All send context errors cause the send context to halt. The normal + * clear-down mechanism cannot be used because we cannot clear the + * error bits until several other long-running items are done first. + * This is OK because with the context halted, nothing else is going + * to happen on it anyway. + */ +static void is_sendctxt_err_int(struct hfi1_devdata *dd, + unsigned int hw_context) +{ + struct send_context_info *sci; + struct send_context *sc; + char flags[96]; + u64 status; + u32 sw_index; + int i = 0; + unsigned long irq_flags; + + sw_index = dd->hw_to_sw[hw_context]; + if (sw_index >= dd->num_send_contexts) { + dd_dev_err(dd, + "out of range sw index %u for send context %u\n", + sw_index, hw_context); + return; + } + sci = &dd->send_contexts[sw_index]; + spin_lock_irqsave(&dd->sc_lock, irq_flags); + sc = sci->sc; + if (!sc) { + dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__, + sw_index, hw_context); + spin_unlock_irqrestore(&dd->sc_lock, irq_flags); + return; + } + + /* tell the software that a halt has begun */ + sc_stop(sc, SCF_HALTED); + + status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS); + + dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context, + send_context_err_status_string(flags, sizeof(flags), + status)); + + if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK) + handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index)); + + /* + * Automatically restart halted kernel contexts out of interrupt + * context. User contexts must ask the driver to restart the context. + */ + if (sc->type != SC_USER) + queue_work(dd->pport->hfi1_wq, &sc->halt_work); + spin_unlock_irqrestore(&dd->sc_lock, irq_flags); + + /* + * Update the counters for the corresponding status bits. + * Note that these particular counters are aggregated over all + * 160 contexts. + */ + for (i = 0; i < NUM_SEND_CTXT_ERR_STATUS_COUNTERS; i++) { + if (status & (1ull << i)) + incr_cntr64(&dd->sw_ctxt_err_status_cnt[i]); + } +} + +static void handle_sdma_eng_err(struct hfi1_devdata *dd, + unsigned int source, u64 status) +{ + struct sdma_engine *sde; + int i = 0; + + sde = &dd->per_sdma[source]; +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n", + sde->this_idx, source, (unsigned long long)status); +#endif + sde->err_cnt++; + sdma_engine_error(sde, status); + + /* + * Update the counters for the corresponding status bits. + * Note that these particular counters are aggregated over + * all 16 DMA engines. + */ + for (i = 0; i < NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS; i++) { + if (status & (1ull << i)) + incr_cntr64(&dd->sw_send_dma_eng_err_status_cnt[i]); + } +} + +/* + * CCE block SDMA error interrupt. Source is < 16. 
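is_sendctxt_err_int() below begins by translating the hardware context number to a software index through a lookup table and returns early if the result is out of range, before taking the lock and halting the context. The translate-and-validate step on its own, with made-up table sizes and names:

#include <stdint.h>
#include <stdio.h>

#define DEMO_NUM_HW_CTXTS 8
#define DEMO_NUM_SW_CTXTS 4
#define DEMO_INVALID      0xffffffffu

/* hw context -> sw index; unassigned slots stay DEMO_INVALID */
static const uint32_t demo_hw_to_sw[DEMO_NUM_HW_CTXTS] = {
        2, 0, DEMO_INVALID, 1, DEMO_INVALID, 3, DEMO_INVALID, DEMO_INVALID,
};

static int demo_lookup_sw_index(unsigned int hw_context)
{
        uint32_t sw_index;

        if (hw_context >= DEMO_NUM_HW_CTXTS)
                return -1;
        sw_index = demo_hw_to_sw[hw_context];
        if (sw_index >= DEMO_NUM_SW_CTXTS)   /* out of range: not our context */
                return -1;
        return (int)sw_index;
}

int main(void)
{
        printf("hw 3 -> sw %d\n", demo_lookup_sw_index(3));
        printf("hw 6 -> sw %d\n", demo_lookup_sw_index(6));
        return 0;
}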
+ */ +static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source) +{ +#ifdef CONFIG_SDMA_VERBOSITY + struct sdma_engine *sde = &dd->per_sdma[source]; + + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx, + source); + sdma_dumpstate(sde); +#endif + interrupt_clear_down(dd, source, &sdma_eng_err); +} + +/* + * CCE block "various" interrupt. Source is < 8. + */ +static void is_various_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &various_err[source]; + + /* + * TCritInt cannot go through interrupt_clear_down() + * because it is not a second tier interrupt. The handler + * should be called directly. + */ + if (source == TCRIT_INT_SOURCE) + handle_temp_err(dd); + else if (eri->handler) + interrupt_clear_down(dd, 0, eri); + else + dd_dev_info(dd, + "%s: Unimplemented/reserved interrupt %d\n", + __func__, source); +} + +static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg) +{ + /* src_ctx is always zero */ + struct hfi1_pportdata *ppd = dd->pport; + unsigned long flags; + u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); + + if (reg & QSFP_HFI0_MODPRST_N) { + if (!qsfp_mod_present(ppd)) { + dd_dev_info(dd, "%s: QSFP module removed\n", + __func__); + + ppd->driver_link_ready = 0; + /* + * Cable removed, reset all our information about the + * cache and cable capabilities + */ + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + /* + * We don't set cache_refresh_required here as we expect + * an interrupt when a cable is inserted + */ + ppd->qsfp_info.cache_valid = 0; + ppd->qsfp_info.reset_needed = 0; + ppd->qsfp_info.limiting_active = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + /* Invert the ModPresent pin now to detect plug-in */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : + ASIC_QSFP1_INVERT, qsfp_int_mgmt); + + if ((ppd->offline_disabled_reason > + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED)) || + (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))) + ppd->offline_disabled_reason = + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED); + + if (ppd->host_link_state == HLS_DN_POLL) { + /* + * The link is still in POLL. This means + * that the normal link down processing + * will not happen. We have to do it here + * before turning the DC off. + */ + queue_work(ppd->link_wq, &ppd->link_down_work); + } + } else { + dd_dev_info(dd, "%s: QSFP module inserted\n", + __func__); + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 0; + ppd->qsfp_info.cache_refresh_required = 1; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + + /* + * Stop inversion of ModPresent pin to detect + * removal of the cable + */ + qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N; + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : + ASIC_QSFP1_INVERT, qsfp_int_mgmt); + + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); + } + } + + if (reg & QSFP_HFI0_INT_N) { + dd_dev_info(dd, "%s: Interrupt received from QSFP module\n", + __func__); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.check_interrupt_flags = 1; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + } + + /* Schedule the QSFP work only if there is a cable attached. 
*/ + if (qsfp_mod_present(ppd)) + queue_work(ppd->link_wq, &ppd->qsfp_info.qsfp_work); +} + +static int request_host_lcb_access(struct hfi1_devdata *dd) +{ + int ret; + + ret = do_8051_command(dd, HCMD_MISC, + (u64)HCMD_MISC_REQUEST_LCB_ACCESS << + LOAD_DATA_FIELD_ID_SHIFT, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "%s: command failed with error %d\n", + __func__, ret); + } + return ret == HCMD_SUCCESS ? 0 : -EBUSY; +} + +static int request_8051_lcb_access(struct hfi1_devdata *dd) +{ + int ret; + + ret = do_8051_command(dd, HCMD_MISC, + (u64)HCMD_MISC_GRANT_LCB_ACCESS << + LOAD_DATA_FIELD_ID_SHIFT, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "%s: command failed with error %d\n", + __func__, ret); + } + return ret == HCMD_SUCCESS ? 0 : -EBUSY; +} + +/* + * Set the LCB selector - allow host access. The DCC selector always + * points to the host. + */ +static inline void set_host_lcb_access(struct hfi1_devdata *dd) +{ + write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, + DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK | + DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK); +} + +/* + * Clear the LCB selector - allow 8051 access. The DCC selector always + * points to the host. + */ +static inline void set_8051_lcb_access(struct hfi1_devdata *dd) +{ + write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, + DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK); +} + +/* + * Acquire LCB access from the 8051. If the host already has access, + * just increment a counter. Otherwise, inform the 8051 that the + * host is taking access. + * + * Returns: + * 0 on success + * -EBUSY if the 8051 has control and cannot be disturbed + * -errno if unable to acquire access from the 8051 + */ +int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok) +{ + struct hfi1_pportdata *ppd = dd->pport; + int ret = 0; + + /* + * Use the host link state lock so the operation of this routine + * { link state check, selector change, count increment } can occur + * as a unit against a link state change. Otherwise there is a + * race between the state change and the count increment. + */ + if (sleep_ok) { + mutex_lock(&ppd->hls_lock); + } else { + while (!mutex_trylock(&ppd->hls_lock)) + udelay(1); + } + + /* this access is valid only when the link is up */ + if (ppd->host_link_state & HLS_DOWN) { + dd_dev_info(dd, "%s: link state %s not up\n", + __func__, link_state_name(ppd->host_link_state)); + ret = -EBUSY; + goto done; + } + + if (dd->lcb_access_count == 0) { + ret = request_host_lcb_access(dd); + if (ret) { + dd_dev_err(dd, + "%s: unable to acquire LCB access, err %d\n", + __func__, ret); + goto done; + } + set_host_lcb_access(dd); + } + dd->lcb_access_count++; +done: + mutex_unlock(&ppd->hls_lock); + return ret; +} + +/* + * Release LCB access by decrementing the use count. If the count is moving + * from 1 to 0, inform 8051 that it has control back. + * + * Returns: + * 0 on success + * -errno if unable to release access to the 8051 + */ +int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok) +{ + int ret = 0; + + /* + * Use the host link state lock because the acquire needed it. + * Here, we only need to keep { selector change, count decrement } + * as a unit. + */ + if (sleep_ok) { + mutex_lock(&dd->pport->hls_lock); + } else { + while (!mutex_trylock(&dd->pport->hls_lock)) + udelay(1); + } + + if (dd->lcb_access_count == 0) { + dd_dev_err(dd, "%s: LCB access count is zero. 
Skipping.\n", + __func__); + goto done; + } + + if (dd->lcb_access_count == 1) { + set_8051_lcb_access(dd); + ret = request_8051_lcb_access(dd); + if (ret) { + dd_dev_err(dd, + "%s: unable to release LCB access, err %d\n", + __func__, ret); + /* restore host access if the grant didn't work */ + set_host_lcb_access(dd); + goto done; + } + } + dd->lcb_access_count--; +done: + mutex_unlock(&dd->pport->hls_lock); + return ret; +} + +/* + * Initialize LCB access variables and state. Called during driver load, + * after most of the initialization is finished. + * + * The DC default is LCB access on for the host. The driver defaults to + * leaving access to the 8051. Assign access now - this constrains the call + * to this routine to be after all LCB set-up is done. In particular, after + * hf1_init_dd() -> set_up_interrupts() -> clear_all_interrupts() + */ +static void init_lcb_access(struct hfi1_devdata *dd) +{ + dd->lcb_access_count = 0; +} + +/* + * Write a response back to a 8051 request. + */ +static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data) +{ + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, + DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK | + (u64)return_code << + DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT | + (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT); +} + +/* + * Handle host requests from the 8051. + */ +static void handle_8051_request(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + u16 data = 0; + u8 type; + + reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1); + if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0) + return; /* no request */ + + /* zero out COMPLETED so the response is seen */ + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0); + + /* extract request details */ + type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT) + & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK; + data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT) + & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK; + + switch (type) { + case HREQ_LOAD_CONFIG: + case HREQ_SAVE_CONFIG: + case HREQ_READ_CONFIG: + case HREQ_SET_TX_EQ_ABS: + case HREQ_SET_TX_EQ_REL: + case HREQ_ENABLE: + dd_dev_info(dd, "8051 request: request 0x%x not supported\n", + type); + hreq_response(dd, HREQ_NOT_SUPPORTED, 0); + break; + case HREQ_CONFIG_DONE: + hreq_response(dd, HREQ_SUCCESS, 0); + break; + + case HREQ_INTERFACE_TEST: + hreq_response(dd, HREQ_SUCCESS, data); + break; + default: + dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type); + hreq_response(dd, HREQ_NOT_SUPPORTED, 0); + break; + } +} + +/* + * Set up allocation unit vaulue. + */ +void set_up_vau(struct hfi1_devdata *dd, u8 vau) +{ + u64 reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + + /* do not modify other values in the register */ + reg &= ~SEND_CM_GLOBAL_CREDIT_AU_SMASK; + reg |= (u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); +} + +/* + * Set up initial VL15 credits of the remote. Assumes the rest of + * the CM credit registers are zero from a previous global or credit reset. + * Shared limit for VL15 will always be 0. + */ +void set_up_vl15(struct hfi1_devdata *dd, u16 vl15buf) +{ + u64 reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + + /* set initial values for total and shared credit limit */ + reg &= ~(SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK | + SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK); + + /* + * Set total limit to be equal to VL15 credits. + * Leave shared limit at 0. 
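set_up_vau() above is the usual read-modify-write of a single field in a wider CSR: clear only that field's mask, then OR in the new value at the field's shift so neighbouring fields are untouched. A standalone model with an invented mask and shift (not the real SEND_CM_GLOBAL_CREDIT layout):

#include <stdint.h>
#include <stdio.h>

/* invented 4-bit field at bits 16..19 */
#define DEMO_AU_SHIFT 16
#define DEMO_AU_MASK  (0xfULL << DEMO_AU_SHIFT)

static uint64_t demo_set_au(uint64_t reg, uint8_t au)
{
        reg &= ~DEMO_AU_MASK;   /* do not modify other fields in the register */
        reg |= ((uint64_t)au << DEMO_AU_SHIFT) & DEMO_AU_MASK;
        return reg;
}

int main(void)
{
        uint64_t reg = 0x00ab00000000c0deULL;

        printf("0x%016llx\n", (unsigned long long)demo_set_au(reg, 0x3));
        return 0;
}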
+ */ + reg |= (u64)vl15buf << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); + + write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf + << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT); +} + +/* + * Zero all credit details from the previous connection and + * reset the CM manager's internal counters. + */ +void reset_link_credits(struct hfi1_devdata *dd) +{ + int i; + + /* remove all previous VL credit limits */ + for (i = 0; i < TXE_NUM_DATA_VL; i++) + write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0); + write_csr(dd, SEND_CM_CREDIT_VL15, 0); + write_csr(dd, SEND_CM_GLOBAL_CREDIT, 0); + /* reset the CM block */ + pio_send_control(dd, PSC_CM_RESET); + /* reset cached value */ + dd->vl15buf_cached = 0; +} + +/* convert a vCU to a CU */ +static u32 vcu_to_cu(u8 vcu) +{ + return 1 << vcu; +} + +/* convert a CU to a vCU */ +static u8 cu_to_vcu(u32 cu) +{ + return ilog2(cu); +} + +/* convert a vAU to an AU */ +static u32 vau_to_au(u8 vau) +{ + return 8 * (1 << vau); +} + +static void set_linkup_defaults(struct hfi1_pportdata *ppd) +{ + ppd->sm_trap_qp = 0x0; + ppd->sa_qp = 0x1; +} + +/* + * Graceful LCB shutdown. This leaves the LCB FIFOs in reset. + */ +static void lcb_shutdown(struct hfi1_devdata *dd, int abort) +{ + u64 reg; + + /* clear lcb run: LCB_CFG_RUN.EN = 0 */ + write_csr(dd, DC_LCB_CFG_RUN, 0); + /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, + 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT); + /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */ + dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN); + reg = read_csr(dd, DCC_CFG_RESET); + write_csr(dd, DCC_CFG_RESET, reg | + (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) | + (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT)); + (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */ + if (!abort) { + udelay(1); /* must hold for the longer of 16cclks or 20ns */ + write_csr(dd, DCC_CFG_RESET, reg); + write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en); + } +} + +/* + * This routine should be called after the link has been transitioned to + * OFFLINE (OFFLINE state has the side effect of putting the SerDes into + * reset). + * + * The expectation is that the caller of this routine would have taken + * care of properly transitioning the link into the correct state. + * NOTE: the caller needs to acquire the dd->dc8051_lock lock + * before calling this function. + */ +static void _dc_shutdown(struct hfi1_devdata *dd) +{ + lockdep_assert_held(&dd->dc8051_lock); + + if (dd->dc_shutdown) + return; + + dd->dc_shutdown = 1; + /* Shutdown the LCB */ + lcb_shutdown(dd, 1); + /* + * Going to OFFLINE would have causes the 8051 to put the + * SerDes into reset already. Just need to shut down the 8051, + * itself. + */ + write_csr(dd, DC_DC8051_CFG_RST, 0x1); +} + +static void dc_shutdown(struct hfi1_devdata *dd) +{ + mutex_lock(&dd->dc8051_lock); + _dc_shutdown(dd); + mutex_unlock(&dd->dc8051_lock); +} + +/* + * Calling this after the DC has been brought out of reset should not + * do any damage. + * NOTE: the caller needs to acquire the dd->dc8051_lock lock + * before calling this function. 
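 *
 * (Illustrative usage sketch, not part of this patch: the 8051 command
 *  timeout recovery in do_8051_command(), later in this file, restarts the
 *  whole DC block exactly this way, with dd->dc8051_lock already held.)
 *
 *      // first 8051 host-command timeout: reset and restart the DC
 *      _dc_shutdown(dd);
 *      _dc_start(dd);
 *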
+ */ +static void _dc_start(struct hfi1_devdata *dd) +{ + lockdep_assert_held(&dd->dc8051_lock); + + if (!dd->dc_shutdown) + return; + + /* Take the 8051 out of reset */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + /* Wait until 8051 is ready */ + if (wait_fm_ready(dd, TIMEOUT_8051_START)) + dd_dev_err(dd, "%s: timeout starting 8051 firmware\n", + __func__); + + /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */ + write_csr(dd, DCC_CFG_RESET, 0x10); + /* lcb_shutdown() with abort=1 does not restore these */ + write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en); + dd->dc_shutdown = 0; +} + +static void dc_start(struct hfi1_devdata *dd) +{ + mutex_lock(&dd->dc8051_lock); + _dc_start(dd); + mutex_unlock(&dd->dc8051_lock); +} + +/* + * These LCB adjustments are for the Aurora SerDes core in the FPGA. + */ +static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd) +{ + u64 rx_radr, tx_radr; + u32 version; + + if (dd->icode != ICODE_FPGA_EMULATION) + return; + + /* + * These LCB defaults on emulator _s are good, nothing to do here: + * LCB_CFG_TX_FIFOS_RADR + * LCB_CFG_RX_FIFOS_RADR + * LCB_CFG_LN_DCLK + * LCB_CFG_IGNORE_LOST_RCLK + */ + if (is_emulator_s(dd)) + return; + /* else this is _p */ + + version = emulator_rev(dd); + if (!is_ax(dd)) + version = 0x2d; /* all B0 use 0x2d or higher settings */ + + if (version <= 0x12) { + /* release 0x12 and below */ + + /* + * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9 + * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9 + * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa + */ + rx_radr = + 0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + /* + * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default) + * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6 + */ + tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version <= 0x18) { + /* release 0x13 up to 0x18 */ + /* LCB_CFG_RX_FIFOS_RADR = 0x988 */ + rx_radr = + 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version == 0x19) { + /* release 0x19 */ + /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */ + rx_radr = + 0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version == 0x1a) { + /* release 0x1a */ + /* LCB_CFG_RX_FIFOS_RADR = 0x988 */ + rx_radr = + 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull); + } else { + /* release 0x1b and higher */ + /* LCB_CFG_RX_FIFOS_RADR = 0x877 */ + rx_radr = + 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } + + write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr); + /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */ + write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, + DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK); + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr); +} + +/* + * Handle a SMA idle message + * + * This is 
a work-queue function outside of the interrupt. + */ +void handle_sma_message(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + sma_message_work); + struct hfi1_devdata *dd = ppd->dd; + u64 msg; + int ret; + + /* + * msg is bytes 1-4 of the 40-bit idle message - the command code + * is stripped off + */ + ret = read_idle_sma(dd, &msg); + if (ret) + return; + dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg); + /* + * React to the SMA message. Byte[1] (0 for us) is the command. + */ + switch (msg & 0xff) { + case SMA_IDLE_ARM: + /* + * See OPAv1 table 9-14 - HFI and External Switch Ports Key + * State Transitions + * + * Only expected in INIT or ARMED, discard otherwise. + */ + if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED)) + ppd->neighbor_normal = 1; + break; + case SMA_IDLE_ACTIVE: + /* + * See OPAv1 table 9-14 - HFI and External Switch Ports Key + * State Transitions + * + * Can activate the node. Discard otherwise. + */ + if (ppd->host_link_state == HLS_UP_ARMED && + ppd->is_active_optimize_enabled) { + ppd->neighbor_normal = 1; + ret = set_link_state(ppd, HLS_UP_ACTIVE); + if (ret) + dd_dev_err( + dd, + "%s: received Active SMA idle message, couldn't set link to Active\n", + __func__); + } + break; + default: + dd_dev_err(dd, + "%s: received unexpected SMA idle message 0x%llx\n", + __func__, msg); + break; + } +} + +static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear) +{ + u64 rcvctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->rcvctrl_lock, flags); + rcvctrl = read_csr(dd, RCV_CTRL); + rcvctrl |= add; + rcvctrl &= ~clear; + write_csr(dd, RCV_CTRL, rcvctrl); + spin_unlock_irqrestore(&dd->rcvctrl_lock, flags); +} + +static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add) +{ + adjust_rcvctrl(dd, add, 0); +} + +static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear) +{ + adjust_rcvctrl(dd, 0, clear); +} + +/* + * Called from all interrupt handlers to start handling an SPC freeze. + */ +void start_freeze_handling(struct hfi1_pportdata *ppd, int flags) +{ + struct hfi1_devdata *dd = ppd->dd; + struct send_context *sc; + int i; + + if (flags & FREEZE_SELF) + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK); + + /* enter frozen mode */ + dd->flags |= HFI1_FROZEN; + + /* notify all SDMA engines that they are going into a freeze */ + sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN)); + + /* do halt pre-handling on all enabled send contexts */ + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (sc && (sc->flags & SCF_ENABLED)) + sc_stop(sc, SCF_FROZEN | SCF_HALTED); + } + + /* Send context are frozen. Notify user space */ + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT); + + if (flags & FREEZE_ABORT) { + dd_dev_err(dd, + "Aborted freeze recovery. Please REBOOT system\n"); + return; + } + /* queue non-interrupt handler */ + queue_work(ppd->hfi1_wq, &ppd->freeze_work); +} + +/* + * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen, + * depending on the "freeze" parameter. + * + * No need to return an error if it times out, our only option + * is to proceed anyway. 
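 *
 * (Illustrative usage sketch, not part of this patch: handle_freeze(),
 *  below, brackets SPC freeze recovery with this routine.)
 *
 *      wait_for_freeze_status(dd, 1);  // wait for all blocks to report frozen
 *      // ... per-block freeze steps: pio_freeze(), sdma_freeze(), rxe_freeze() ...
 *      write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
 *      wait_for_freeze_status(dd, 0);  // wait for the frozen bits to clear
 *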
+ */ +static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze) +{ + unsigned long timeout; + u64 reg; + + timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT); + while (1) { + reg = read_csr(dd, CCE_STATUS); + if (freeze) { + /* waiting until all indicators are set */ + if ((reg & ALL_FROZE) == ALL_FROZE) + return; /* all done */ + } else { + /* waiting until all indicators are clear */ + if ((reg & ALL_FROZE) == 0) + return; /* all done */ + } + + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing", + freeze ? "" : "un", reg & ALL_FROZE, + freeze ? ALL_FROZE : 0ull); + return; + } + usleep_range(80, 120); + } +} + +/* + * Do all freeze handling for the RXE block. + */ +static void rxe_freeze(struct hfi1_devdata *dd) +{ + int i; + struct hfi1_ctxtdata *rcd; + + /* disable port */ + clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + /* disable all receive contexts */ + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, rcd); + hfi1_rcd_put(rcd); + } +} + +/* + * Unfreeze handling for the RXE block - kernel contexts only. + * This will also enable the port. User contexts will do unfreeze + * handling on a per-context basis as they call into the driver. + * + */ +static void rxe_kernel_unfreeze(struct hfi1_devdata *dd) +{ + u32 rcvmask; + u16 i; + struct hfi1_ctxtdata *rcd; + + /* enable all kernel contexts */ + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + + /* Ensure all non-user contexts(including vnic) are enabled */ + if (!rcd || + (i >= dd->first_dyn_alloc_ctxt && !rcd->is_vnic)) { + hfi1_rcd_put(rcd); + continue; + } + rcvmask = HFI1_RCVCTRL_CTXT_ENB; + /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */ + rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? + HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; + hfi1_rcvctrl(dd, rcvmask, rcd); + hfi1_rcd_put(rcd); + } + + /* enable port */ + add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); +} + +/* + * Non-interrupt SPC freeze handling. + * + * This is a work-queue function outside of the triggering interrupt. + */ +void handle_freeze(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + freeze_work); + struct hfi1_devdata *dd = ppd->dd; + + /* wait for freeze indicators on all affected blocks */ + wait_for_freeze_status(dd, 1); + + /* SPC is now frozen */ + + /* do send PIO freeze steps */ + pio_freeze(dd); + + /* do send DMA freeze steps */ + sdma_freeze(dd); + + /* do send egress freeze steps - nothing to do */ + + /* do receive freeze steps */ + rxe_freeze(dd); + + /* + * Unfreeze the hardware - clear the freeze, wait for each + * block's frozen bit to clear, then clear the frozen flag. 
+ */ + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK); + wait_for_freeze_status(dd, 0); + + if (is_ax(dd)) { + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK); + wait_for_freeze_status(dd, 1); + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK); + wait_for_freeze_status(dd, 0); + } + + /* do send PIO unfreeze steps for kernel contexts */ + pio_kernel_unfreeze(dd); + + /* do send DMA unfreeze steps */ + sdma_unfreeze(dd); + + /* do send egress unfreeze steps - nothing to do */ + + /* do receive unfreeze steps for kernel contexts */ + rxe_kernel_unfreeze(dd); + + /* + * The unfreeze procedure touches global device registers when + * it disables and re-enables RXE. Mark the device unfrozen + * after all that is done so other parts of the driver waiting + * for the device to unfreeze don't do things out of order. + * + * The above implies that the meaning of HFI1_FROZEN flag is + * "Device has gone into freeze mode and freeze mode handling + * is still in progress." + * + * The flag will be removed when freeze mode processing has + * completed. + */ + dd->flags &= ~HFI1_FROZEN; + wake_up(&dd->event_queue); + + /* no longer frozen */ +} + +/** + * update_xmit_counters - update PortXmitWait/PortVlXmitWait + * counters. + * @ppd: info of physical Hfi port + * @link_width: new link width after link up or downgrade + * + * Update the PortXmitWait and PortVlXmitWait counters after + * a link up or downgrade event to reflect a link width change. + */ +static void update_xmit_counters(struct hfi1_pportdata *ppd, u16 link_width) +{ + int i; + u16 tx_width; + u16 link_speed; + + tx_width = tx_link_width(link_width); + link_speed = get_link_speed(ppd->link_speed_active); + + /* + * There are C_VL_COUNT number of PortVLXmitWait counters. + * Adding 1 to C_VL_COUNT to include the PortXmitWait counter. + */ + for (i = 0; i < C_VL_COUNT + 1; i++) + get_xmit_wait_counters(ppd, tx_width, link_speed, i); +} + +/* + * Handle a link up interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_up(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_up_work); + struct hfi1_devdata *dd = ppd->dd; + + set_link_state(ppd, HLS_UP_INIT); + + /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */ + read_ltp_rtt(dd); + /* + * OPA specifies that certain counters are cleared on a transition + * to link up, so do that. + */ + clear_linkup_counters(dd); + /* + * And (re)set link up default values. + */ + set_linkup_defaults(ppd); + + /* + * Set VL15 credits. Use cached value from verify cap interrupt. + * In case of quick linkup or simulator, vl15 value will be set by + * handle_linkup_change. VerifyCap interrupt handler will not be + * called in those scenarios. + */ + if (!(quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) + set_up_vl15(dd, dd->vl15buf_cached); + + /* enforce link speed enabled */ + if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) { + /* oops - current speed is not enabled, bounce */ + dd_dev_err(dd, + "Link speed active 0x%x is outside enabled 0x%x, downing link\n", + ppd->link_speed_active, ppd->link_speed_enabled); + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0, + OPA_LINKDOWN_REASON_SPEED_POLICY); + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } +} + +/* + * Several pieces of LNI information were cached for SMA in ppd. 
+ * Reset these on link down + */ +static void reset_neighbor_info(struct hfi1_pportdata *ppd) +{ + ppd->neighbor_guid = 0; + ppd->neighbor_port_number = 0; + ppd->neighbor_type = 0; + ppd->neighbor_fm_security = 0; +} + +static const char * const link_down_reason_strs[] = { + [OPA_LINKDOWN_REASON_NONE] = "None", + [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Receive error 0", + [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length", + [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long", + [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short", + [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID", + [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID", + [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2", + [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC", + [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8", + [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail", + [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10", + [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error", + [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15", + [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker", + [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14", + [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15", + [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance", + [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance", + [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance", + [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack", + [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker", + [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt", + [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit", + [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit", + [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24", + [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25", + [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26", + [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27", + [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28", + [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29", + [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30", + [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] = + "Excessive buffer overrun", + [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown", + [OPA_LINKDOWN_REASON_REBOOT] = "Reboot", + [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown", + [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce", + [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy", + [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy", + [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected", + [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] = + "Local media not installed", + [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed", + [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config", + [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] = + "End to end not installed", + [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy", + [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy", + [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy", + [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management", + [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled", + [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient" +}; + +/* return the neighbor link down reason string */ +static const char *link_down_reason_str(u8 reason) +{ + const char *str = NULL; + + if (reason < ARRAY_SIZE(link_down_reason_strs)) + str = link_down_reason_strs[reason]; + if (!str) + str = "(invalid)"; + + return str; +} + +/* + * Handle a link down interrupt from the 8051. 
+ * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_down(struct work_struct *work) +{ + u8 lcl_reason, neigh_reason = 0; + u8 link_down_reason; + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_down_work); + int was_up; + static const char ldr_str[] = "Link down reason: "; + + if ((ppd->host_link_state & + (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) && + ppd->port_type == PORT_TYPE_FIXED) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED); + + /* Go offline first, then deal with reading/writing through 8051 */ + was_up = !!(ppd->host_link_state & HLS_UP); + set_link_state(ppd, HLS_DN_OFFLINE); + xchg(&ppd->is_link_down_queued, 0); + + if (was_up) { + lcl_reason = 0; + /* link down reason is only valid if the link was up */ + read_link_down_reason(ppd->dd, &link_down_reason); + switch (link_down_reason) { + case LDR_LINK_TRANSFER_ACTIVE_LOW: + /* the link went down, no idle message reason */ + dd_dev_info(ppd->dd, "%sUnexpected link down\n", + ldr_str); + break; + case LDR_RECEIVED_LINKDOWN_IDLE_MSG: + /* + * The neighbor reason is only valid if an idle message + * was received for it. + */ + read_planned_down_reason_code(ppd->dd, &neigh_reason); + dd_dev_info(ppd->dd, + "%sNeighbor link down message %d, %s\n", + ldr_str, neigh_reason, + link_down_reason_str(neigh_reason)); + break; + case LDR_RECEIVED_HOST_OFFLINE_REQ: + dd_dev_info(ppd->dd, + "%sHost requested link to go offline\n", + ldr_str); + break; + default: + dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n", + ldr_str, link_down_reason); + break; + } + + /* + * If no reason, assume peer-initiated but missed + * LinkGoingDown idle flits. + */ + if (neigh_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + } else { + /* went down while polling or going up */ + lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT; + } + + set_link_down_reason(ppd, lcl_reason, neigh_reason, 0); + + /* inform the SMA when the link transitions from up to down */ + if (was_up && ppd->local_link_down_reason.sma == 0 && + ppd->neigh_link_down_reason.sma == 0) { + ppd->local_link_down_reason.sma = + ppd->local_link_down_reason.latest; + ppd->neigh_link_down_reason.sma = + ppd->neigh_link_down_reason.latest; + } + + reset_neighbor_info(ppd); + + /* disable the port */ + clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + /* + * If there is no cable attached, turn the DC off. Otherwise, + * start the link bring up. + */ + if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) + dc_shutdown(ppd->dd); + else + start_link(ppd); +} + +void handle_link_bounce(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_bounce_work); + + /* + * Only do something if the link is currently up. + */ + if (ppd->host_link_state & HLS_UP) { + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } else { + dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n", + __func__, link_state_name(ppd->host_link_state)); + } +} + +/* + * Mask conversion: Capability exchange to Port LTP. The capability + * exchange has an implicit 16b CRC that is mandatory. 
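 *
 * (Worked example, not part of this patch, for the conversion helpers just
 *  below: the mandatory 16b mode is added on the way in and ignored on the
 *  way out.)
 *
 *      cap_to_port_ltp(CAP_CRC_14B | CAP_CRC_48B)
 *              == PORT_LTP_CRC_MODE_16 | PORT_LTP_CRC_MODE_14 | PORT_LTP_CRC_MODE_48;
 *
 *      port_ltp_to_cap(PORT_LTP_CRC_MODE_16 | PORT_LTP_CRC_MODE_14 | PORT_LTP_CRC_MODE_48)
 *              == CAP_CRC_14B | CAP_CRC_48B;
 *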
+ */ +static int cap_to_port_ltp(int cap) +{ + int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */ + + if (cap & CAP_CRC_14B) + port_ltp |= PORT_LTP_CRC_MODE_14; + if (cap & CAP_CRC_48B) + port_ltp |= PORT_LTP_CRC_MODE_48; + if (cap & CAP_CRC_12B_16B_PER_LANE) + port_ltp |= PORT_LTP_CRC_MODE_PER_LANE; + + return port_ltp; +} + +/* + * Convert an OPA Port LTP mask to capability mask + */ +int port_ltp_to_cap(int port_ltp) +{ + int cap_mask = 0; + + if (port_ltp & PORT_LTP_CRC_MODE_14) + cap_mask |= CAP_CRC_14B; + if (port_ltp & PORT_LTP_CRC_MODE_48) + cap_mask |= CAP_CRC_48B; + if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE) + cap_mask |= CAP_CRC_12B_16B_PER_LANE; + + return cap_mask; +} + +/* + * Convert a single DC LCB CRC mode to an OPA Port LTP mask. + */ +static int lcb_to_port_ltp(int lcb_crc) +{ + int port_ltp = 0; + + if (lcb_crc == LCB_CRC_12B_16B_PER_LANE) + port_ltp = PORT_LTP_CRC_MODE_PER_LANE; + else if (lcb_crc == LCB_CRC_48B) + port_ltp = PORT_LTP_CRC_MODE_48; + else if (lcb_crc == LCB_CRC_14B) + port_ltp = PORT_LTP_CRC_MODE_14; + else + port_ltp = PORT_LTP_CRC_MODE_16; + + return port_ltp; +} + +static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd) +{ + if (ppd->pkeys[2] != 0) { + ppd->pkeys[2] = 0; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); + hfi1_event_pkey_change(ppd->dd, ppd->port); + } +} + +/* + * Convert the given link width to the OPA link width bitmask. + */ +static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width) +{ + switch (width) { + case 0: + /* + * Simulator and quick linkup do not set the width. + * Just set it to 4x without complaint. + */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup) + return OPA_LINK_WIDTH_4X; + return 0; /* no lanes up */ + case 1: return OPA_LINK_WIDTH_1X; + case 2: return OPA_LINK_WIDTH_2X; + case 3: return OPA_LINK_WIDTH_3X; + default: + dd_dev_info(dd, "%s: invalid width %d, using 4\n", + __func__, width); + /* fall through */ + case 4: return OPA_LINK_WIDTH_4X; + } +} + +/* + * Do a population count on the bottom nibble. + */ +static const u8 bit_counts[16] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 +}; + +static inline u8 nibble_to_count(u8 nibble) +{ + return bit_counts[nibble & 0xf]; +} + +/* + * Read the active lane information from the 8051 registers and return + * their widths. + * + * Active lane information is found in these 8051 registers: + * enable_lane_tx + * enable_lane_rx + */ +static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width, + u16 *rx_width) +{ + u16 tx, rx; + u8 enable_lane_rx; + u8 enable_lane_tx; + u8 tx_polarity_inversion; + u8 rx_polarity_inversion; + u8 max_rate; + + /* read the active lanes */ + read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion, + &rx_polarity_inversion, &max_rate); + read_local_lni(dd, &enable_lane_rx); + + /* convert to counts */ + tx = nibble_to_count(enable_lane_tx); + rx = nibble_to_count(enable_lane_rx); + + /* + * Set link_speed_active here, overriding what was set in + * handle_verify_cap(). The ASIC 8051 firmware does not correctly + * set the max_rate field in handle_verify_cap until v0.19. 
+ */ + if ((dd->icode == ICODE_RTL_SILICON) && + (dd->dc8051_ver < dc8051_ver(0, 19, 0))) { + /* max_rate: 0 = 12.5G, 1 = 25G */ + switch (max_rate) { + case 0: + dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G; + break; + default: + dd_dev_err(dd, + "%s: unexpected max rate %d, using 25Gb\n", + __func__, (int)max_rate); + /* fall through */ + case 1: + dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G; + break; + } + } + + dd_dev_info(dd, + "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n", + enable_lane_tx, tx, enable_lane_rx, rx); + *tx_width = link_width_to_bits(dd, tx); + *rx_width = link_width_to_bits(dd, rx); +} + +/* + * Read verify_cap_local_fm_link_width[1] to obtain the link widths. + * Valid after the end of VerifyCap and during LinkUp. Does not change + * after link up. I.e. look elsewhere for downgrade information. + * + * Bits are: + * + bits [7:4] contain the number of active transmitters + * + bits [3:0] contain the number of active receivers + * These are numbers 1 through 4 and can be different values if the + * link is asymmetric. + * + * verify_cap_local_fm_link_width[0] retains its original value. + */ +static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width, + u16 *rx_width) +{ + u16 widths, tx, rx; + u8 misc_bits, local_flags; + u16 active_tx, active_rx; + + read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths); + tx = widths >> 12; + rx = (widths >> 8) & 0xf; + + *tx_width = link_width_to_bits(dd, tx); + *rx_width = link_width_to_bits(dd, rx); + + /* print the active widths */ + get_link_widths(dd, &active_tx, &active_rx); +} + +/* + * Set ppd->link_width_active and ppd->link_width_downgrade_active using + * hardware information when the link first comes up. + * + * The link width is not available until after VerifyCap.AllFramesReceived + * (the trigger for handle_verify_cap), so this is outside that routine + * and should be called when the 8051 signals linkup. + */ +void get_linkup_link_widths(struct hfi1_pportdata *ppd) +{ + u16 tx_width, rx_width; + + /* get end-of-LNI link widths */ + get_linkup_widths(ppd->dd, &tx_width, &rx_width); + + /* use tx_width as the link is supposed to be symmetric on link up */ + ppd->link_width_active = tx_width; + /* link width downgrade active (LWD.A) starts out matching LW.A */ + ppd->link_width_downgrade_tx_active = ppd->link_width_active; + ppd->link_width_downgrade_rx_active = ppd->link_width_active; + /* per OPA spec, on link up LWD.E resets to LWD.S */ + ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported; + /* cache the active egress rate (units {10^6 bits/sec]) */ + ppd->current_egress_rate = active_egress_rate(ppd); +} + +/* + * Handle a verify capabilities interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. 
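 *
 * (Worked example, not part of this patch, for the width decode in
 *  get_linkup_widths() above; the value 0x4200 is illustrative only.)
 *
 *      // widths == 0x4200: byte[1] = 0x42 -> 4 active transmitters and
 *      // 2 active receivers; byte[0] is verify_cap_local_fm_link_width[0]
 *      // and is left untouched
 *      tx = widths >> 12;              // 0x4 -> OPA_LINK_WIDTH_4X
 *      rx = (widths >> 8) & 0xf;       // 0x2 -> OPA_LINK_WIDTH_2X
 *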
+ */ +void handle_verify_cap(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_vc_work); + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + u8 power_management; + u8 continuous; + u8 vcu; + u8 vau; + u8 z; + u16 vl15buf; + u16 link_widths; + u16 crc_mask; + u16 crc_val; + u16 device_id; + u16 active_tx, active_rx; + u8 partner_supported_crc; + u8 remote_tx_rate; + u8 device_rev; + + set_link_state(ppd, HLS_VERIFY_CAP); + + lcb_shutdown(dd, 0); + adjust_lcb_for_fpga_serdes(dd); + + read_vc_remote_phy(dd, &power_management, &continuous); + read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf, + &partner_supported_crc); + read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths); + read_remote_device_id(dd, &device_id, &device_rev); + + /* print the active widths */ + get_link_widths(dd, &active_tx, &active_rx); + dd_dev_info(dd, + "Peer PHY: power management 0x%x, continuous updates 0x%x\n", + (int)power_management, (int)continuous); + dd_dev_info(dd, + "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n", + (int)vau, (int)z, (int)vcu, (int)vl15buf, + (int)partner_supported_crc); + dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n", + (u32)remote_tx_rate, (u32)link_widths); + dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n", + (u32)device_id, (u32)device_rev); + /* + * The peer vAU value just read is the peer receiver value. HFI does + * not support a transmit vAU of 0 (AU == 8). We advertised that + * with Z=1 in the fabric capabilities sent to the peer. The peer + * will see our Z=1, and, if it advertised a vAU of 0, will move its + * receive to vAU of 1 (AU == 16). Do the same here. We do not care + * about the peer Z value - our sent vAU is 3 (hardwired) and is not + * subject to the Z value exception. + */ + if (vau == 0) + vau = 1; + set_up_vau(dd, vau); + + /* + * Set VL15 credits to 0 in global credit register. Cache remote VL15 + * credits value and wait for link-up interrupt ot set it. 
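 *
 * (Worked numbers, not part of this patch, for the vAU adjustment above,
 *  using vau_to_au() defined earlier in this file: vAU 0 is AU 8, which
 *  this HFI cannot transmit with; vAU 1 is AU 16, the value both ends fall
 *  back to under the Z=1 rule; vAU 3 is AU 64, the value this HFI
 *  advertises.)
 *
 *      vau_to_au(0) == 8;
 *      vau_to_au(1) == 16;
 *      vau_to_au(3) == 64;
 *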
+ */ + set_up_vl15(dd, 0); + dd->vl15buf_cached = vl15buf; + + /* set up the LCB CRC mode */ + crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc; + + /* order is important: use the lowest bit in common */ + if (crc_mask & CAP_CRC_14B) + crc_val = LCB_CRC_14B; + else if (crc_mask & CAP_CRC_48B) + crc_val = LCB_CRC_48B; + else if (crc_mask & CAP_CRC_12B_16B_PER_LANE) + crc_val = LCB_CRC_12B_16B_PER_LANE; + else + crc_val = LCB_CRC_16B; + + dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val); + write_csr(dd, DC_LCB_CFG_CRC_MODE, + (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT); + + /* set (14b only) or clear sideband credit */ + reg = read_csr(dd, SEND_CM_CTRL); + if (crc_val == LCB_CRC_14B && crc_14b_sideband) { + write_csr(dd, SEND_CM_CTRL, + reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); + } else { + write_csr(dd, SEND_CM_CTRL, + reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); + } + + ppd->link_speed_active = 0; /* invalid value */ + if (dd->dc8051_ver < dc8051_ver(0, 20, 0)) { + /* remote_tx_rate: 0 = 12.5G, 1 = 25G */ + switch (remote_tx_rate) { + case 0: + ppd->link_speed_active = OPA_LINK_SPEED_12_5G; + break; + case 1: + ppd->link_speed_active = OPA_LINK_SPEED_25G; + break; + } + } else { + /* actual rate is highest bit of the ANDed rates */ + u8 rate = remote_tx_rate & ppd->local_tx_rate; + + if (rate & 2) + ppd->link_speed_active = OPA_LINK_SPEED_25G; + else if (rate & 1) + ppd->link_speed_active = OPA_LINK_SPEED_12_5G; + } + if (ppd->link_speed_active == 0) { + dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n", + __func__, (int)remote_tx_rate); + ppd->link_speed_active = OPA_LINK_SPEED_25G; + } + + /* + * Cache the values of the supported, enabled, and active + * LTP CRC modes to return in 'portinfo' queries. But the bit + * flags that are returned in the portinfo query differ from + * what's in the link_crc_mask, crc_sizes, and crc_val + * variables. Convert these here. + */ + ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8; + /* supported crc modes */ + ppd->port_ltp_crc_mode |= + cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4; + /* enabled crc modes */ + ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val); + /* active crc mode */ + + /* set up the remote credit return table */ + assign_remote_cm_au_table(dd, vcu); + + /* + * The LCB is reset on entry to handle_verify_cap(), so this must + * be applied on every link up. + * + * Adjust LCB error kill enable to kill the link if + * these RBUF errors are seen: + * REPLAY_BUF_MBE_SMASK + * FLIT_INPUT_BUF_MBE_SMASK + */ + if (is_ax(dd)) { /* fixed in B0 */ + reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN); + reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK + | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK; + write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg); + } + + /* pull LCB fifos out of reset - all fifo clocks must be stable */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0); + + /* give 8051 access to the LCB CSRs */ + write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */ + set_8051_lcb_access(dd); + + /* tell the 8051 to go to LinkUp */ + set_link_state(ppd, HLS_GOING_UP); +} + +/** + * apply_link_downgrade_policy - Apply the link width downgrade enabled + * policy against the current active link widths. + * @ppd: info of physical Hfi port + * @refresh_widths: True indicates link downgrade event + * @return: True indicates a successful link downgrade. False indicates + * link downgrade event failed and the link will bounce back to + * default link width. 
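 *
 * (Illustrative usage sketch, not part of this patch: this mirrors
 *  handle_link_downgrade() below.)
 *
 *      if (apply_link_downgrade_policy(ppd, true))
 *              update_xmit_counters(ppd, ppd->link_width_downgrade_tx_active);
 *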
+ * + * Called when the enabled policy changes or the active link widths + * change. + * Refresh_widths indicates that a link downgrade occurred. The + * link_downgraded variable is set by refresh_widths and + * determines the success/failure of the policy application. + */ +bool apply_link_downgrade_policy(struct hfi1_pportdata *ppd, + bool refresh_widths) +{ + int do_bounce = 0; + int tries; + u16 lwde; + u16 tx, rx; + bool link_downgraded = refresh_widths; + + /* use the hls lock to avoid a race with actual link up */ + tries = 0; +retry: + mutex_lock(&ppd->hls_lock); + /* only apply if the link is up */ + if (ppd->host_link_state & HLS_DOWN) { + /* still going up..wait and retry */ + if (ppd->host_link_state & HLS_GOING_UP) { + if (++tries < 1000) { + mutex_unlock(&ppd->hls_lock); + usleep_range(100, 120); /* arbitrary */ + goto retry; + } + dd_dev_err(ppd->dd, + "%s: giving up waiting for link state change\n", + __func__); + } + goto done; + } + + lwde = ppd->link_width_downgrade_enabled; + + if (refresh_widths) { + get_link_widths(ppd->dd, &tx, &rx); + ppd->link_width_downgrade_tx_active = tx; + ppd->link_width_downgrade_rx_active = rx; + } + + if (ppd->link_width_downgrade_tx_active == 0 || + ppd->link_width_downgrade_rx_active == 0) { + /* the 8051 reported a dead link as a downgrade */ + dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n"); + link_downgraded = false; + } else if (lwde == 0) { + /* downgrade is disabled */ + + /* bounce if not at starting active width */ + if ((ppd->link_width_active != + ppd->link_width_downgrade_tx_active) || + (ppd->link_width_active != + ppd->link_width_downgrade_rx_active)) { + dd_dev_err(ppd->dd, + "Link downgrade is disabled and link has downgraded, downing link\n"); + dd_dev_err(ppd->dd, + " original 0x%x, tx active 0x%x, rx active 0x%x\n", + ppd->link_width_active, + ppd->link_width_downgrade_tx_active, + ppd->link_width_downgrade_rx_active); + do_bounce = 1; + link_downgraded = false; + } + } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 || + (lwde & ppd->link_width_downgrade_rx_active) == 0) { + /* Tx or Rx is outside the enabled policy */ + dd_dev_err(ppd->dd, + "Link is outside of downgrade allowed, downing link\n"); + dd_dev_err(ppd->dd, + " enabled 0x%x, tx active 0x%x, rx active 0x%x\n", + lwde, ppd->link_width_downgrade_tx_active, + ppd->link_width_downgrade_rx_active); + do_bounce = 1; + link_downgraded = false; + } + +done: + mutex_unlock(&ppd->hls_lock); + + if (do_bounce) { + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0, + OPA_LINKDOWN_REASON_WIDTH_POLICY); + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } + + return link_downgraded; +} + +/* + * Handle a link downgrade interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. 
+ */ +void handle_link_downgrade(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_downgrade_work); + + dd_dev_info(ppd->dd, "8051: Link width downgrade\n"); + if (apply_link_downgrade_policy(ppd, true)) + update_xmit_counters(ppd, ppd->link_width_downgrade_tx_active); +} + +static char *dcc_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dcc_err_flags, + ARRAY_SIZE(dcc_err_flags)); +} + +static char *lcb_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, lcb_err_flags, + ARRAY_SIZE(lcb_err_flags)); +} + +static char *dc8051_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_err_flags, + ARRAY_SIZE(dc8051_err_flags)); +} + +static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_info_err_flags, + ARRAY_SIZE(dc8051_info_err_flags)); +} + +static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags, + ARRAY_SIZE(dc8051_info_host_msg_flags)); +} + +static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 info, err, host_msg; + int queue_link_down = 0; + char buf[96]; + + /* look at the flags */ + if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) { + /* 8051 information set by firmware */ + /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */ + info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051); + err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT) + & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK; + host_msg = (info >> + DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT) + & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK; + + /* + * Handle error flags. + */ + if (err & FAILED_LNI) { + /* + * LNI error indications are cleared by the 8051 + * only when starting polling. Only pay attention + * to them when in the states that occur during + * LNI. + */ + if (ppd->host_link_state + & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { + queue_link_down = 1; + dd_dev_info(dd, "Link error: %s\n", + dc8051_info_err_string(buf, + sizeof(buf), + err & + FAILED_LNI)); + } + err &= ~(u64)FAILED_LNI; + } + /* unknown frames can happen durning LNI, just count */ + if (err & UNKNOWN_FRAME) { + ppd->unknown_frame_count++; + err &= ~(u64)UNKNOWN_FRAME; + } + if (err) { + /* report remaining errors, but do not do anything */ + dd_dev_err(dd, "8051 info error: %s\n", + dc8051_info_err_string(buf, sizeof(buf), + err)); + } + + /* + * Handle host message flags. + */ + if (host_msg & HOST_REQ_DONE) { + /* + * Presently, the driver does a busy wait for + * host requests to complete. This is only an + * informational message. + * NOTE: The 8051 clears the host message + * information *on the next 8051 command*. + * Therefore, when linkup is achieved, + * this flag will still be set. 
+ */ + host_msg &= ~(u64)HOST_REQ_DONE; + } + if (host_msg & BC_SMA_MSG) { + queue_work(ppd->link_wq, &ppd->sma_message_work); + host_msg &= ~(u64)BC_SMA_MSG; + } + if (host_msg & LINKUP_ACHIEVED) { + dd_dev_info(dd, "8051: Link up\n"); + queue_work(ppd->link_wq, &ppd->link_up_work); + host_msg &= ~(u64)LINKUP_ACHIEVED; + } + if (host_msg & EXT_DEVICE_CFG_REQ) { + handle_8051_request(ppd); + host_msg &= ~(u64)EXT_DEVICE_CFG_REQ; + } + if (host_msg & VERIFY_CAP_FRAME) { + queue_work(ppd->link_wq, &ppd->link_vc_work); + host_msg &= ~(u64)VERIFY_CAP_FRAME; + } + if (host_msg & LINK_GOING_DOWN) { + const char *extra = ""; + /* no downgrade action needed if going down */ + if (host_msg & LINK_WIDTH_DOWNGRADED) { + host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED; + extra = " (ignoring downgrade)"; + } + dd_dev_info(dd, "8051: Link down%s\n", extra); + queue_link_down = 1; + host_msg &= ~(u64)LINK_GOING_DOWN; + } + if (host_msg & LINK_WIDTH_DOWNGRADED) { + queue_work(ppd->link_wq, &ppd->link_downgrade_work); + host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED; + } + if (host_msg) { + /* report remaining messages, but do not do anything */ + dd_dev_info(dd, "8051 info host message: %s\n", + dc8051_info_host_msg_string(buf, + sizeof(buf), + host_msg)); + } + + reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK; + } + if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) { + /* + * Lost the 8051 heartbeat. If this happens, we + * receive constant interrupts about it. Disable + * the interrupt after the first. + */ + dd_dev_err(dd, "Lost 8051 heartbeat\n"); + write_csr(dd, DC_DC8051_ERR_EN, + read_csr(dd, DC_DC8051_ERR_EN) & + ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK); + + reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK; + } + if (reg) { + /* report the error, but do not do anything */ + dd_dev_err(dd, "8051 error: %s\n", + dc8051_err_string(buf, sizeof(buf), reg)); + } + + if (queue_link_down) { + /* + * if the link is already going down or disabled, do not + * queue another. If there's a link down entry already + * queued, don't queue another one. + */ + if ((ppd->host_link_state & + (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) || + ppd->link_enabled == 0) { + dd_dev_info(dd, "%s: not queuing link down. 
host_link_state %x, link_enabled %x\n", + __func__, ppd->host_link_state, + ppd->link_enabled); + } else { + if (xchg(&ppd->is_link_down_queued, 1) == 1) + dd_dev_info(dd, + "%s: link down request already queued\n", + __func__); + else + queue_work(ppd->link_wq, &ppd->link_down_work); + } + } +} + +static const char * const fm_config_txt[] = { +[0] = + "BadHeadDist: Distance violation between two head flits", +[1] = + "BadTailDist: Distance violation between two tail flits", +[2] = + "BadCtrlDist: Distance violation between two credit control flits", +[3] = + "BadCrdAck: Credits return for unsupported VL", +[4] = + "UnsupportedVLMarker: Received VL Marker", +[5] = + "BadPreempt: Exceeded the preemption nesting level", +[6] = + "BadControlFlit: Received unsupported control flit", +/* no 7 */ +[8] = + "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL", +}; + +static const char * const port_rcv_txt[] = { +[1] = + "BadPktLen: Illegal PktLen", +[2] = + "PktLenTooLong: Packet longer than PktLen", +[3] = + "PktLenTooShort: Packet shorter than PktLen", +[4] = + "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)", +[5] = + "BadDLID: Illegal DLID (0, doesn't match HFI)", +[6] = + "BadL2: Illegal L2 opcode", +[7] = + "BadSC: Unsupported SC", +[9] = + "BadRC: Illegal RC", +[11] = + "PreemptError: Preempting with same VL", +[12] = + "PreemptVL15: Preempting a VL15 packet", +}; + +#define OPA_LDR_FMCONFIG_OFFSET 16 +#define OPA_LDR_PORTRCV_OFFSET 0 +static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + u64 info, hdr0, hdr1; + const char *extra; + char buf[96]; + struct hfi1_pportdata *ppd = dd->pport; + u8 lcl_reason = 0; + int do_bounce = 0; + + if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) { + if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) { + info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE); + dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK; + } + reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) { + struct hfi1_pportdata *ppd = dd->pport; + /* this counter saturates at (2^32) - 1 */ + if (ppd->link_downed < (u32)UINT_MAX) + ppd->link_downed++; + reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) { + u8 reason_valid = 1; + + info = read_csr(dd, DCC_ERR_INFO_FMCONFIG); + if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) { + dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK; + } + switch (info) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + extra = fm_config_txt[info]; + break; + case 8: + extra = fm_config_txt[info]; + if (ppd->port_error_action & + OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) { + do_bounce = 1; + /* + * lcl_reason cannot be derived from info + * for this error + */ + lcl_reason = + OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER; + } + break; + default: + reason_valid = 0; + snprintf(buf, sizeof(buf), "reserved%lld", info); + extra = buf; + break; + } + + if (reason_valid && !do_bounce) { + do_bounce = ppd->port_error_action & + (1 << (OPA_LDR_FMCONFIG_OFFSET + info)); + lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST; + } + + /* just report this */ + dd_dev_info_ratelimited(dd, "DCC Error: fmconfig error: %s\n", + extra); + reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) { + u8 reason_valid 
= 1; + + info = read_csr(dd, DCC_ERR_INFO_PORTRCV); + hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0); + hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1); + if (!(dd->err_info_rcvport.status_and_code & + OPA_EI_STATUS_SMASK)) { + dd->err_info_rcvport.status_and_code = + info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_rcvport.status_and_code |= + OPA_EI_STATUS_SMASK; + /* + * save first 2 flits in the packet that caused + * the error + */ + dd->err_info_rcvport.packet_flit1 = hdr0; + dd->err_info_rcvport.packet_flit2 = hdr1; + } + switch (info) { + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + case 9: + case 11: + case 12: + extra = port_rcv_txt[info]; + break; + default: + reason_valid = 0; + snprintf(buf, sizeof(buf), "reserved%lld", info); + extra = buf; + break; + } + + if (reason_valid && !do_bounce) { + do_bounce = ppd->port_error_action & + (1 << (OPA_LDR_PORTRCV_OFFSET + info)); + lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0; + } + + /* just report this */ + dd_dev_info_ratelimited(dd, "DCC Error: PortRcv error: %s\n" + " hdr0 0x%llx, hdr1 0x%llx\n", + extra, hdr0, hdr1); + + reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) { + /* informative only */ + dd_dev_info_ratelimited(dd, "8051 access to LCB blocked\n"); + reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK; + } + if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) { + /* informative only */ + dd_dev_info_ratelimited(dd, "host access to LCB blocked\n"); + reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK; + } + + if (unlikely(hfi1_dbg_fault_suppress_err(&dd->verbs_dev))) + reg &= ~DCC_ERR_FLG_LATE_EBP_ERR_SMASK; + + /* report any remaining errors */ + if (reg) + dd_dev_info_ratelimited(dd, "DCC Error: %s\n", + dcc_err_string(buf, sizeof(buf), reg)); + + if (lcl_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN; + + if (do_bounce) { + dd_dev_info_ratelimited(dd, "%s: PortErrorAction bounce\n", + __func__); + set_link_down_reason(ppd, lcl_reason, 0, lcl_reason); + queue_work(ppd->link_wq, &ppd->link_bounce_work); + } +} + +static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + + dd_dev_info(dd, "LCB Error: %s\n", + lcb_err_string(buf, sizeof(buf), reg)); +} + +/* + * CCE block DC interrupt. Source is < 8. + */ +static void is_dc_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &dc_errs[source]; + + if (eri->handler) { + interrupt_clear_down(dd, 0, eri); + } else if (source == 3 /* dc_lbm_int */) { + /* + * This indicates that a parity error has occurred on the + * address/control lines presented to the LBM. The error + * is a single pulse, there is no associated error flag, + * and it is non-maskable. This is because if a parity + * error occurs on the request the request is dropped. + * This should never occur, but it is nice to know if it + * ever does. + */ + dd_dev_err(dd, "Parity error in DC LBM block\n"); + } else { + dd_dev_err(dd, "Invalid DC interrupt %u\n", source); + } +} + +/* + * TX block send credit interrupt. Source is < 160. + */ +static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source) +{ + sc_group_release_update(dd, source); +} + +/* + * TX block SDMA interrupt. Source is < 48. 
+ * + * SDMA interrupts are grouped by type: + * + * 0 - N-1 = SDma + * N - 2N-1 = SDmaProgress + * 2N - 3N-1 = SDmaIdle + */ +static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source) +{ + /* what interrupt */ + unsigned int what = source / TXE_NUM_SDMA_ENGINES; + /* which engine */ + unsigned int which = source % TXE_NUM_SDMA_ENGINES; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which, + slashstrip(__FILE__), __LINE__, __func__); + sdma_dumpstate(&dd->per_sdma[which]); +#endif + + if (likely(what < 3 && which < dd->num_sdma)) { + sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source); + } else { + /* should not happen */ + dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source); + } +} + +/* + * RX block receive available interrupt. Source is < 160. + */ +static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) +{ + struct hfi1_ctxtdata *rcd; + char *err_detail; + + if (likely(source < dd->num_rcv_contexts)) { + rcd = hfi1_rcd_get_by_index(dd, source); + if (rcd) { + /* Check for non-user contexts, including vnic */ + if (source < dd->first_dyn_alloc_ctxt || rcd->is_vnic) + rcd->do_interrupt(rcd, 0); + else + handle_user_interrupt(rcd); + + hfi1_rcd_put(rcd); + return; /* OK */ + } + /* received an interrupt, but no rcd */ + err_detail = "dataless"; + } else { + /* received an interrupt, but are not using that context */ + err_detail = "out of range"; + } + dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n", + err_detail, source); +} + +/* + * RX block receive urgent interrupt. Source is < 160. + */ +static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source) +{ + struct hfi1_ctxtdata *rcd; + char *err_detail; + + if (likely(source < dd->num_rcv_contexts)) { + rcd = hfi1_rcd_get_by_index(dd, source); + if (rcd) { + /* only pay attention to user urgent interrupts */ + if (source >= dd->first_dyn_alloc_ctxt && + !rcd->is_vnic) + handle_user_interrupt(rcd); + + hfi1_rcd_put(rcd); + return; /* OK */ + } + /* received an interrupt, but no rcd */ + err_detail = "dataless"; + } else { + /* received an interrupt, but are not using that context */ + err_detail = "out of range"; + } + dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n", + err_detail, source); +} + +/* + * Reserved range interrupt. Should not be called in normal operation. 
+ */ +static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source) +{ + char name[64]; + + dd_dev_err(dd, "unexpected %s interrupt\n", + is_reserved_name(name, sizeof(name), source)); +} + +static const struct is_table is_table[] = { +/* + * start end + * name func interrupt func + */ +{ IS_GENERAL_ERR_START, IS_GENERAL_ERR_END, + is_misc_err_name, is_misc_err_int }, +{ IS_SDMAENG_ERR_START, IS_SDMAENG_ERR_END, + is_sdma_eng_err_name, is_sdma_eng_err_int }, +{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, + is_sendctxt_err_name, is_sendctxt_err_int }, +{ IS_SDMA_START, IS_SDMA_END, + is_sdma_eng_name, is_sdma_eng_int }, +{ IS_VARIOUS_START, IS_VARIOUS_END, + is_various_name, is_various_int }, +{ IS_DC_START, IS_DC_END, + is_dc_name, is_dc_int }, +{ IS_RCVAVAIL_START, IS_RCVAVAIL_END, + is_rcv_avail_name, is_rcv_avail_int }, +{ IS_RCVURGENT_START, IS_RCVURGENT_END, + is_rcv_urgent_name, is_rcv_urgent_int }, +{ IS_SENDCREDIT_START, IS_SENDCREDIT_END, + is_send_credit_name, is_send_credit_int}, +{ IS_RESERVED_START, IS_RESERVED_END, + is_reserved_name, is_reserved_int}, +}; + +/* + * Interrupt source interrupt - called when the given source has an interrupt. + * Source is a bit index into an array of 64-bit integers. + */ +static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) +{ + const struct is_table *entry; + + /* avoids a double compare by walking the table in-order */ + for (entry = &is_table[0]; entry->is_name; entry++) { + if (source < entry->end) { + trace_hfi1_interrupt(dd, entry, source); + entry->is_int(dd, source - entry->start); + return; + } + } + /* fell off the end */ + dd_dev_err(dd, "invalid interrupt source %u\n", source); +} + +/* + * General interrupt handler. This is able to correctly handle + * all interrupts in case INTx is used. + */ +static irqreturn_t general_interrupt(int irq, void *data) +{ + struct hfi1_devdata *dd = data; + u64 regs[CCE_NUM_INT_CSRS]; + u32 bit; + int i; + irqreturn_t handled = IRQ_NONE; + + this_cpu_inc(*dd->int_counter); + + /* phase 1: scan and clear all handled interrupts */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) { + if (dd->gi_mask[i] == 0) { + regs[i] = 0; /* used later */ + continue; + } + regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) & + dd->gi_mask[i]; + /* only clear if anything is set */ + if (regs[i]) + write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]); + } + + /* phase 2: call the appropriate handler */ + for_each_set_bit(bit, (unsigned long *)®s[0], + CCE_NUM_INT_CSRS * 64) { + is_interrupt(dd, bit); + handled = IRQ_HANDLED; + } + + return handled; +} + +static irqreturn_t sdma_interrupt(int irq, void *data) +{ + struct sdma_engine *sde = data; + struct hfi1_devdata *dd = sde->dd; + u64 status; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + sdma_dumpstate(sde); +#endif + + this_cpu_inc(*dd->int_counter); + + /* This read_csr is really bad in the hot path */ + status = read_csr(dd, + CCE_INT_STATUS + (8 * (IS_SDMA_START / 64))) + & sde->imask; + if (likely(status)) { + /* clear the interrupt(s) */ + write_csr(dd, + CCE_INT_CLEAR + (8 * (IS_SDMA_START / 64)), + status); + + /* handle the interrupt(s) */ + sdma_engine_interrupt(sde, status); + } else { + dd_dev_info_ratelimited(dd, "SDMA engine %u interrupt, but no status bits set\n", + sde->this_idx); + } + return IRQ_HANDLED; +} + +/* + * Clear the receive interrupt. Use a read of the interrupt clear CSR + * to insure that the write completed. 
This does NOT guarantee that + * queued DMA writes to memory from the chip are pushed. + */ +static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 addr = CCE_INT_CLEAR + (8 * rcd->ireg); + + mmiowb(); /* make sure everything before is written */ + write_csr(dd, addr, rcd->imask); + /* force the above write on the chip and get a value back */ + (void)read_csr(dd, addr); +} + +/* force the receive interrupt */ +void force_recv_intr(struct hfi1_ctxtdata *rcd) +{ + write_csr(rcd->dd, CCE_INT_FORCE + (8 * rcd->ireg), rcd->imask); +} + +/* + * Return non-zero if a packet is present. + * + * This routine is called when rechecking for packets after the RcvAvail + * interrupt has been cleared down. First, do a quick check of memory for + * a packet present. If not found, use an expensive CSR read of the context + * tail to determine the actual tail. The CSR read is necessary because there + * is no method to push pending DMAs to memory other than an interrupt and we + * are trying to determine if we need to force an interrupt. + */ +static inline int check_packet_present(struct hfi1_ctxtdata *rcd) +{ + u32 tail; + int present; + + if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) + present = (rcd->seq_cnt == + rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd)))); + else /* is RDMA rtail */ + present = (rcd->head != get_rcvhdrtail(rcd)); + + if (present) + return 1; + + /* fall back to a CSR read, correct indpendent of DMA_RTAIL */ + tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL); + return rcd->head != tail; +} + +/* + * Receive packet IRQ handler. This routine expects to be on its own IRQ. + * This routine will try to handle packets immediately (latency), but if + * it finds too many, it will invoke the thread handler (bandwitdh). The + * chip receive interrupt is *not* cleared down until this or the thread (if + * invoked) is finished. The intent is to avoid extra interrupts while we + * are processing packets anyway. + */ +static irqreturn_t receive_context_interrupt(int irq, void *data) +{ + struct hfi1_ctxtdata *rcd = data; + struct hfi1_devdata *dd = rcd->dd; + int disposition; + int present; + + trace_hfi1_receive_interrupt(dd, rcd); + this_cpu_inc(*dd->int_counter); + aspm_ctx_disable(rcd); + + /* receive interrupt remains blocked while processing packets */ + disposition = rcd->do_interrupt(rcd, 0); + + /* + * Too many packets were seen while processing packets in this + * IRQ handler. Invoke the handler thread. The receive interrupt + * remains blocked. + */ + if (disposition == RCV_PKT_LIMIT) + return IRQ_WAKE_THREAD; + + /* + * The packet processor detected no more packets. Clear the receive + * interrupt and recheck for a packet packet that may have arrived + * after the previous check and interrupt clear. If a packet arrived, + * force another interrupt. + */ + clear_recv_intr(rcd); + present = check_packet_present(rcd); + if (present) + force_recv_intr(rcd); + + return IRQ_HANDLED; +} + +/* + * Receive packet thread handler. This expects to be invoked with the + * receive interrupt still blocked. + */ +static irqreturn_t receive_context_thread(int irq, void *data) +{ + struct hfi1_ctxtdata *rcd = data; + int present; + + /* receive interrupt is still blocked from the IRQ handler */ + (void)rcd->do_interrupt(rcd, 1); + + /* + * The packet processor will only return if it detected no more + * packets. 
Hold IRQs here so we can safely clear the interrupt and + * recheck for a packet that may have arrived after the previous + * check and the interrupt clear. If a packet arrived, force another + * interrupt. + */ + local_irq_disable(); + clear_recv_intr(rcd); + present = check_packet_present(rcd); + if (present) + force_recv_intr(rcd); + local_irq_enable(); + + return IRQ_HANDLED; +} + +/* ========================================================================= */ + +u32 read_physical_state(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = read_csr(dd, DC_DC8051_STS_CUR_STATE); + return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT) + & DC_DC8051_STS_CUR_STATE_PORT_MASK; +} + +u32 read_logical_state(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = read_csr(dd, DCC_CFG_PORT_CONFIG); + return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT) + & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK; +} + +static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate) +{ + u64 reg; + + reg = read_csr(dd, DCC_CFG_PORT_CONFIG); + /* clear current state, set new state */ + reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK; + reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT; + write_csr(dd, DCC_CFG_PORT_CONFIG, reg); +} + +/* + * Use the 8051 to read a LCB CSR. + */ +static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data) +{ + u32 regno; + int ret; + + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + if (acquire_lcb_access(dd, 0) == 0) { + *data = read_csr(dd, addr); + release_lcb_access(dd, 0); + return 0; + } + return -EBUSY; + } + + /* register is an index of LCB registers: (offset - base) / 8 */ + regno = (addr - DC_LCB_CFG_RUN) >> 3; + ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data); + if (ret != HCMD_SUCCESS) + return -EBUSY; + return 0; +} + +/* + * Provide a cache for some of the LCB registers in case the LCB is + * unavailable. + * (The LCB is unavailable in certain link states, for example.) + */ +struct lcb_datum { + u32 off; + u64 val; +}; + +static struct lcb_datum lcb_cache[] = { + { DC_LCB_ERR_INFO_RX_REPLAY_CNT, 0}, + { DC_LCB_ERR_INFO_SEQ_CRC_CNT, 0 }, + { DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT, 0 }, +}; + +static void update_lcb_cache(struct hfi1_devdata *dd) +{ + int i; + int ret; + u64 val; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + ret = read_lcb_csr(dd, lcb_cache[i].off, &val); + + /* Update if we get good data */ + if (likely(ret != -EBUSY)) + lcb_cache[i].val = val; + } +} + +static int read_lcb_cache(u32 off, u64 *val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + if (lcb_cache[i].off == off) { + *val = lcb_cache[i].val; + return 0; + } + } + + pr_warn("%s bad offset 0x%x\n", __func__, off); + return -1; +} + +/* + * Read an LCB CSR. Access may not be in host control, so check. + * Return 0 on success, -EBUSY on failure. + */ +int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data) +{ + struct hfi1_pportdata *ppd = dd->pport; + + /* if up, go through the 8051 for the value */ + if (ppd->host_link_state & HLS_UP) + return read_lcb_via_8051(dd, addr, data); + /* if going up or down, check the cache, otherwise, no access */ + if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE)) { + if (read_lcb_cache(addr, data)) + return -EBUSY; + return 0; + } + + /* otherwise, host has access */ + *data = read_csr(dd, addr); + return 0; +} + +/* + * Use the 8051 to write a LCB CSR. 
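+ * As with the read path, the LCB offset is converted to a register index, + * (addr - DC_LCB_CFG_RUN) / 8, before being handed to the 8051. With + * firmware older than 0.20, or in the functional simulator, the host + * instead takes direct LCB access and writes the CSR itself.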
+ */ +static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data) +{ + u32 regno; + int ret; + + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || + (dd->dc8051_ver < dc8051_ver(0, 20, 0))) { + if (acquire_lcb_access(dd, 0) == 0) { + write_csr(dd, addr, data); + release_lcb_access(dd, 0); + return 0; + } + return -EBUSY; + } + + /* register is an index of LCB registers: (offset - base) / 8 */ + regno = (addr - DC_LCB_CFG_RUN) >> 3; + ret = do_8051_command(dd, HCMD_WRITE_LCB_CSR, regno, &data); + if (ret != HCMD_SUCCESS) + return -EBUSY; + return 0; +} + +/* + * Write an LCB CSR. Access may not be in host control, so check. + * Return 0 on success, -EBUSY on failure. + */ +int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data) +{ + struct hfi1_pportdata *ppd = dd->pport; + + /* if up, go through the 8051 for the value */ + if (ppd->host_link_state & HLS_UP) + return write_lcb_via_8051(dd, addr, data); + /* if going up or down, no access */ + if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE)) + return -EBUSY; + /* otherwise, host has access */ + write_csr(dd, addr, data); + return 0; +} + +/* + * Returns: + * < 0 = Linux error, not able to get access + * > 0 = 8051 command RETURN_CODE + */ +static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, + u64 *out_data) +{ + u64 reg, completed; + int return_code; + unsigned long timeout; + + hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data); + + mutex_lock(&dd->dc8051_lock); + + /* We can't send any commands to the 8051 if it's in reset */ + if (dd->dc_shutdown) { + return_code = -ENODEV; + goto fail; + } + + /* + * If an 8051 host command timed out previously, then the 8051 is + * stuck. + * + * On first timeout, attempt to reset and restart the entire DC + * block (including 8051). (Is this too big of a hammer?) + * + * If the 8051 times out a second time, the reset did not bring it + * back to healthy life. In that case, fail any subsequent commands. + */ + if (dd->dc8051_timed_out) { + if (dd->dc8051_timed_out > 1) { + dd_dev_err(dd, + "Previous 8051 host command timed out, skipping command %u\n", + type); + return_code = -ENXIO; + goto fail; + } + _dc_shutdown(dd); + _dc_start(dd); + } + + /* + * If there is no timeout, then the 8051 command interface is + * waiting for a command. + */ + + /* + * When writing an LCB CSR, out_data contains the full value to + * be written, while in_data contains the relative LCB + * address in 7:0. Do the work here, rather than the caller, + * of distributing the write data to where it needs to go: + * + * Write data + * 39:00 -> in_data[47:8] + * 47:40 -> DC8051_CFG_EXT_DEV_0.RETURN_CODE + * 63:48 -> DC8051_CFG_EXT_DEV_0.RSP_DATA + */ + if (type == HCMD_WRITE_LCB_CSR) { + in_data |= ((*out_data) & 0xffffffffffull) << 8; + /* must preserve COMPLETED - it is tied to hardware */ + reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_0); + reg &= DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK; + reg |= ((((*out_data) >> 40) & 0xff) << + DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT) + | ((((*out_data) >> 48) & 0xffff) << + DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT); + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, reg); + } + + /* + * Do two writes: the first to stabilize the type and req_data, the + * second to activate.
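+ * The second write is the same value with REQ_NEW also set, which tells + * the 8051 that a new command has been posted.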
+ */ + reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK) + << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT + | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK) + << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT; + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg); + reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK; + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg); + + /* wait for completion, alternate: interrupt */ + timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT); + while (1) { + reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1); + completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK; + if (completed) + break; + if (time_after(jiffies, timeout)) { + dd->dc8051_timed_out++; + dd_dev_err(dd, "8051 host command %u timeout\n", type); + if (out_data) + *out_data = 0; + return_code = -ETIMEDOUT; + goto fail; + } + udelay(2); + } + + if (out_data) { + *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT) + & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK; + if (type == HCMD_READ_LCB_CSR) { + /* top 16 bits are in a different register */ + *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1) + & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK) + << (48 + - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT); + } + } + return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT) + & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK; + dd->dc8051_timed_out = 0; + /* + * Clear command for next user. + */ + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0); + +fail: + mutex_unlock(&dd->dc8051_lock); + return return_code; +} + +static int set_physical_link_state(struct hfi1_devdata *dd, u64 state) +{ + return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL); +} + +int load_8051_config(struct hfi1_devdata *dd, u8 field_id, + u8 lane_id, u32 config_data) +{ + u64 data; + int ret; + + data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT + | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT + | (u64)config_data << LOAD_DATA_DATA_SHIFT; + ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "load 8051 config: field id %d, lane %d, err %d\n", + (int)field_id, (int)lane_id, ret); + } + return ret; +} + +/* + * Read the 8051 firmware "registers". Use the RAM directly. Always + * set the result, even on error. 
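+ * The RAM is laid out as NUM_GENERAL_FIELDS 4-byte general fields followed + * by a block of NUM_LANE_FIELDS 4-byte fields per lane, so the byte address + * is derived from lane_id and field_id and the value is read back as part + * of an 8-byte chunk.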
+ * Return 0 on success, -errno on failure + */ +int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id, + u32 *result) +{ + u64 big_data; + u32 addr; + int ret; + + /* address start depends on the lane_id */ + if (lane_id < 4) + addr = (4 * NUM_GENERAL_FIELDS) + + (lane_id * 4 * NUM_LANE_FIELDS); + else + addr = 0; + addr += field_id * 4; + + /* read is in 8-byte chunks, hardware will truncate the address down */ + ret = read_8051_data(dd, addr, 8, &big_data); + + if (ret == 0) { + /* extract the 4 bytes we want */ + if (addr & 0x4) + *result = (u32)(big_data >> 32); + else + *result = (u32)big_data; + } else { + *result = 0; + dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n", + __func__, lane_id, field_id); + } + + return ret; +} + +static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management, + u8 continuous) +{ + u32 frame; + + frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT + | power_management << POWER_MANAGEMENT_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY, + GENERAL_CONFIG, frame); +} + +static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu, + u16 vl15buf, u8 crc_sizes) +{ + u32 frame; + + frame = (u32)vau << VAU_SHIFT + | (u32)z << Z_SHIFT + | (u32)vcu << VCU_SHIFT + | (u32)vl15buf << VL15BUF_SHIFT + | (u32)crc_sizes << CRC_SIZES_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC, + GENERAL_CONFIG, frame); +} + +static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, + &frame); + *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK; + *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK; + *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; +} + +static int write_vc_local_link_width(struct hfi1_devdata *dd, + u8 misc_bits, + u8 flag_bits, + u16 link_widths) +{ + u32 frame; + + frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT + | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT + | (u32)link_widths << LINK_WIDTH_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, + frame); +} + +static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id, + u8 device_rev) +{ + u32 frame; + + frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT) + | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT); + return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame); +} + +static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, + u8 *device_rev) +{ + u32 frame; + + read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame); + *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK; + *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT) + & REMOTE_DEVICE_REV_MASK; +} + +int write_host_interface_version(struct hfi1_devdata *dd, u8 version) +{ + u32 frame; + u32 mask; + + mask = (HOST_INTERFACE_VERSION_MASK << HOST_INTERFACE_VERSION_SHIFT); + read_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, &frame); + /* Clear, then set field */ + frame &= ~mask; + frame |= ((u32)version << HOST_INTERFACE_VERSION_SHIFT); + return load_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, + frame); +} + +void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor, + u8 *ver_patch) +{ + u32 frame; + + read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame); + *ver_major = (frame >> STS_FM_VERSION_MAJOR_SHIFT) & + STS_FM_VERSION_MAJOR_MASK; + *ver_minor = 
(frame >> STS_FM_VERSION_MINOR_SHIFT) & + STS_FM_VERSION_MINOR_MASK; + + read_8051_config(dd, VERSION_PATCH, GENERAL_CONFIG, &frame); + *ver_patch = (frame >> STS_FM_VERSION_PATCH_SHIFT) & + STS_FM_VERSION_PATCH_MASK; +} + +static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management, + u8 *continuous) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame); + *power_management = (frame >> POWER_MANAGEMENT_SHIFT) + & POWER_MANAGEMENT_MASK; + *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT) + & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK; +} + +static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z, + u8 *vcu, u16 *vl15buf, u8 *crc_sizes) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame); + *vau = (frame >> VAU_SHIFT) & VAU_MASK; + *z = (frame >> Z_SHIFT) & Z_MASK; + *vcu = (frame >> VCU_SHIFT) & VCU_MASK; + *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK; + *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK; +} + +static void read_vc_remote_link_width(struct hfi1_devdata *dd, + u8 *remote_tx_rate, + u16 *link_widths) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG, + &frame); + *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT) + & REMOTE_TX_RATE_MASK; + *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; +} + +static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx) +{ + u32 frame; + + read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame); + *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK; +} + +static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls) +{ + read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls); +} + +static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs) +{ + read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs); +} + +void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality) +{ + u32 frame; + int ret; + + *link_quality = 0; + if (dd->pport->host_link_state & HLS_UP) { + ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, + &frame); + if (ret == 0) + *link_quality = (frame >> LINK_QUALITY_SHIFT) + & LINK_QUALITY_MASK; + } +} + +static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc) +{ + u32 frame; + + read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame); + *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK; +} + +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr) +{ + u32 frame; + + read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame); + *ldr = (frame & 0xff); +} + +static int read_tx_settings(struct hfi1_devdata *dd, + u8 *enable_lane_tx, + u8 *tx_polarity_inversion, + u8 *rx_polarity_inversion, + u8 *max_rate) +{ + u32 frame; + int ret; + + ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame); + *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT) + & ENABLE_LANE_TX_MASK; + *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT) + & TX_POLARITY_INVERSION_MASK; + *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT) + & RX_POLARITY_INVERSION_MASK; + *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK; + return ret; +} + +static int write_tx_settings(struct hfi1_devdata *dd, + u8 enable_lane_tx, + u8 tx_polarity_inversion, + u8 rx_polarity_inversion, + u8 max_rate) +{ + u32 frame; + + /* no need to mask, all variable sizes match field widths */ + frame = enable_lane_tx << 
ENABLE_LANE_TX_SHIFT + | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT + | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT + | max_rate << MAX_RATE_SHIFT; + return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame); +} + +/* + * Read an idle LCB message. + * + * Returns 0 on success, -EINVAL on error + */ +static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out) +{ + int ret; + + ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "read idle message: type %d, err %d\n", + (u32)type, ret); + return -EINVAL; + } + dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out); + /* return only the payload as we already know the type */ + *data_out >>= IDLE_PAYLOAD_SHIFT; + return 0; +} + +/* + * Read an idle SMA message. To be done in response to a notification from + * the 8051. + * + * Returns 0 on success, -EINVAL on error + */ +static int read_idle_sma(struct hfi1_devdata *dd, u64 *data) +{ + return read_idle_message(dd, (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, + data); +} + +/* + * Send an idle LCB message. + * + * Returns 0 on success, -EINVAL on error + */ +static int send_idle_message(struct hfi1_devdata *dd, u64 data) +{ + int ret; + + dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data); + ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n", + data, ret); + return -EINVAL; + } + return 0; +} + +/* + * Send an idle SMA message. + * + * Returns 0 on success, -EINVAL on error + */ +int send_idle_sma(struct hfi1_devdata *dd, u64 message) +{ + u64 data; + + data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) | + ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT); + return send_idle_message(dd, data); +} + +/* + * Initialize the LCB then do a quick link up. This may or may not be + * in loopback. + * + * return 0 on success, -errno on error + */ +static int do_quick_linkup(struct hfi1_devdata *dd) +{ + int ret; + + lcb_shutdown(dd, 0); + + if (loopback) { + /* LCB_CFG_LOOPBACK.VAL = 2 */ + /* LCB_CFG_LANE_WIDTH.VAL = 0 */ + write_csr(dd, DC_LCB_CFG_LOOPBACK, + IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT); + write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0); + } + + /* start the LCBs */ + /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0); + + /* simulator only loopback steps */ + if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + /* LCB_CFG_RUN.EN = 1 */ + write_csr(dd, DC_LCB_CFG_RUN, + 1ull << DC_LCB_CFG_RUN_EN_SHIFT); + + ret = wait_link_transfer_active(dd, 10); + if (ret) + return ret; + + write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, + 1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT); + } + + if (!loopback) { + /* + * When doing quick linkup and not in loopback, both + * sides must be done with LCB set-up before either + * starts the quick linkup. Put a delay here so that + * both sides can be started and have a chance to be + * done with LCB set up before resuming. + */ + dd_dev_err(dd, + "Pausing for peer to be finished with LCB set up\n"); + msleep(5000); + dd_dev_err(dd, "Continuing with quick linkup\n"); + } + + write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */ + set_8051_lcb_access(dd); + + /* + * State "quick" LinkUp request sets the physical link state to + * LinkUp without a verify capability sequence. + * This state is in simulator v37 and later. 
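+ * If the request fails, LCB access is handed back to the host and LCB + * error interrupts are re-enabled before the error is returned.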
+ */ + ret = set_physical_link_state(dd, PLS_QUICK_LINKUP); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "%s: set physical link state to quick LinkUp failed with return %d\n", + __func__, ret); + + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + + if (ret >= 0) + ret = -EINVAL; + return ret; + } + + return 0; /* success */ +} + +/* + * Do all special steps to set up loopback. + */ +static int init_loopback(struct hfi1_devdata *dd) +{ + dd_dev_info(dd, "Entering loopback mode\n"); + + /* all loopbacks should disable self GUID check */ + write_csr(dd, DC_DC8051_CFG_MODE, + (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK)); + + /* + * The simulator has only one loopback option - LCB. Switch + * to that option, which includes quick link up. + * + * Accept all valid loopback values. + */ + if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) && + (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB || + loopback == LOOPBACK_CABLE)) { + loopback = LOOPBACK_LCB; + quick_linkup = 1; + return 0; + } + + /* + * SerDes loopback init sequence is handled in set_local_link_attributes + */ + if (loopback == LOOPBACK_SERDES) + return 0; + + /* LCB loopback - handled at poll time */ + if (loopback == LOOPBACK_LCB) { + quick_linkup = 1; /* LCB is always quick linkup */ + + /* not supported in emulation due to emulation RTL changes */ + if (dd->icode == ICODE_FPGA_EMULATION) { + dd_dev_err(dd, + "LCB loopback not supported in emulation\n"); + return -EINVAL; + } + return 0; + } + + /* external cable loopback requires no extra steps */ + if (loopback == LOOPBACK_CABLE) + return 0; + + dd_dev_err(dd, "Invalid loopback mode %d\n", loopback); + return -EINVAL; +} + +/* + * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits + * used in the Verify Capability link width attribute. + */ +static u16 opa_to_vc_link_widths(u16 opa_widths) +{ + int i; + u16 result = 0; + + static const struct link_bits { + u16 from; + u16 to; + } opa_link_xlate[] = { + { OPA_LINK_WIDTH_1X, 1 << (1 - 1) }, + { OPA_LINK_WIDTH_2X, 1 << (2 - 1) }, + { OPA_LINK_WIDTH_3X, 1 << (3 - 1) }, + { OPA_LINK_WIDTH_4X, 1 << (4 - 1) }, + }; + + for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) { + if (opa_widths & opa_link_xlate[i].from) + result |= opa_link_xlate[i].to; + } + return result; +} + +/* + * Set link attributes before moving to polling. 
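+ * + * This programs the local tx rate and lane/polarity settings, the + * VerifyCap PHY, fabric and link width frames, the host interface + * version, and the local device ID that is exchanged with the peer.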
+ */ +static int set_local_link_attributes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u8 enable_lane_tx; + u8 tx_polarity_inversion; + u8 rx_polarity_inversion; + int ret; + u32 misc_bits = 0; + /* reset our fabric serdes to clear any lingering problems */ + fabric_serdes_reset(dd); + + /* set the local tx rate - need to read-modify-write */ + ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion, + &rx_polarity_inversion, &ppd->local_tx_rate); + if (ret) + goto set_local_link_attributes_fail; + + if (dd->dc8051_ver < dc8051_ver(0, 20, 0)) { + /* set the tx rate to the fastest enabled */ + if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G) + ppd->local_tx_rate = 1; + else + ppd->local_tx_rate = 0; + } else { + /* set the tx rate to all enabled */ + ppd->local_tx_rate = 0; + if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G) + ppd->local_tx_rate |= 2; + if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G) + ppd->local_tx_rate |= 1; + } + + enable_lane_tx = 0xF; /* enable all four lanes */ + ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion, + rx_polarity_inversion, ppd->local_tx_rate); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to set host interface version, return 0x%x\n", + ret); + goto set_local_link_attributes_fail; + } + + /* + * DC supports continuous updates. + */ + ret = write_vc_local_phy(dd, + 0 /* no power management */, + 1 /* continuous updates */); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* z=1 in the next call: AU of 0 is not supported by the hardware */ + ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init, + ppd->port_crc_mode_enabled); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* + * SerDes loopback init sequence requires + * setting bit 0 of MISC_CONFIG_BITS + */ + if (loopback == LOOPBACK_SERDES) + misc_bits |= 1 << LOOPBACK_SERDES_CONFIG_BIT_MASK_SHIFT; + + ret = write_vc_local_link_width(dd, misc_bits, 0, + opa_to_vc_link_widths( + ppd->link_width_enabled)); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* let peer know who we are */ + ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev); + if (ret == HCMD_SUCCESS) + return 0; + +set_local_link_attributes_fail: + dd_dev_err(dd, + "Failed to set local link attributes, return 0x%x\n", + ret); + return ret; +} + +/* + * Call this to start the link. + * Do not do anything if the link is disabled. + * Returns 0 if link is disabled, moved to polling, or the driver is not ready. + */ +int start_link(struct hfi1_pportdata *ppd) +{ + /* + * Tune the SerDes to a ballpark setting for optimal signal and bit + * error rate. Needs to be done before starting the link. 
+ */ + tune_serdes(ppd); + + if (!ppd->driver_link_ready) { + dd_dev_info(ppd->dd, + "%s: stopping link start because driver is not ready\n", + __func__); + return 0; + } + + /* + * FULL_MGMT_P_KEY is cleared from the pkey table, so that the + * pkey table can be configured properly if the HFI unit is connected + * to switch port with MgmtAllowed=NO + */ + clear_full_mgmt_pkey(ppd); + + return set_link_state(ppd, HLS_DN_POLL); +} + +static void wait_for_qsfp_init(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask; + unsigned long timeout; + + /* + * Some QSFP cables have a quirk that asserts the IntN line as a side + * effect of power up on plug-in. We ignore this false positive + * interrupt until the module has finished powering up by waiting for + * a minimum timeout of the module inrush initialization time of + * 500 ms (SFF 8679 Table 5-6) to ensure the voltage rails in the + * module have stabilized. + */ + msleep(500); + + /* + * Check for QSFP interrupt for t_init (SFF 8679 Table 8-1) + */ + timeout = jiffies + msecs_to_jiffies(2000); + while (1) { + mask = read_csr(dd, dd->hfi1_id ? + ASIC_QSFP2_IN : ASIC_QSFP1_IN); + if (!(mask & QSFP_HFI0_INT_N)) + break; + if (time_after(jiffies, timeout)) { + dd_dev_info(dd, "%s: No IntN detected, reset complete\n", + __func__); + break; + } + udelay(2); + } +} + +static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask; + + mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK); + if (enable) { + /* + * Clear the status register to avoid an immediate interrupt + * when we re-enable the IntN pin + */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR, + QSFP_HFI0_INT_N); + mask |= (u64)QSFP_HFI0_INT_N; + } else { + mask &= ~(u64)QSFP_HFI0_INT_N; + } + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask); +} + +int reset_qsfp(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask, qsfp_mask; + + /* Disable INT_N from triggering QSFP interrupts */ + set_qsfp_int_n(ppd, 0); + + /* Reset the QSFP */ + mask = (u64)QSFP_HFI0_RESET_N; + + qsfp_mask = read_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT); + qsfp_mask &= ~mask; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask); + + udelay(10); + + qsfp_mask |= mask; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask); + + wait_for_qsfp_init(ppd); + + /* + * Allow INT_N to trigger the QSFP interrupt to watch + * for alarms and warnings + */ + set_qsfp_int_n(ppd, 1); + + /* + * After the reset, AOC transmitters are enabled by default. They need + * to be turned off to complete the QSFP setup before they can be + * enabled again. + */ + return set_qsfp_tx(ppd, 0); +} + +static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, + u8 *qsfp_interrupt_status) +{ + struct hfi1_devdata *dd = ppd->dd; + + if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) || + (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING)) + dd_dev_err(dd, "%s: QSFP cable temperature too high\n", + __func__); + + if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) || + (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING)) + dd_dev_err(dd, "%s: QSFP cable temperature too low\n", + __func__); + + /* + * The remaining alarms/warnings don't matter if the link is down. 
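+ * Temperature alarms above are reported regardless of link state.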
+ */ + if (ppd->host_link_state & HLS_DOWN) + return 0; + + if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) || + (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING)) + dd_dev_err(dd, "%s: QSFP supply voltage too high\n", + __func__); + + if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) || + (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING)) + dd_dev_err(dd, "%s: QSFP supply voltage too low\n", + __func__); + + /* Byte 2 is vendor specific */ + + if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable RX channel 1/2 power too high\n", + __func__); + + if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable RX channel 1/2 power too low\n", + __func__); + + if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable RX channel 3/4 power too high\n", + __func__); + + if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable RX channel 3/4 power too low\n", + __func__); + + if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) || + (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 1/2 bias too high\n", + __func__); + + if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) || + (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 1/2 bias too low\n", + __func__); + + if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) || + (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 3/4 bias too high\n", + __func__); + + if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) || + (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 3/4 bias too low\n", + __func__); + + if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 1/2 power too high\n", + __func__); + + if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 1/2 power too low\n", + __func__); + + if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 3/4 power too high\n", + __func__); + + if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING)) + dd_dev_err(dd, "%s: Cable TX channel 3/4 power too low\n", + __func__); + + /* Bytes 9-10 and 11-12 are reserved */ + /* Bytes 13-15 are vendor specific */ + + return 0; +} + +/* This routine will only be scheduled if the QSFP module present is asserted */ +void qsfp_event(struct work_struct *work) +{ + struct qsfp_data *qd; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + + qd = container_of(work, struct qsfp_data, qsfp_work); + ppd = qd->ppd; + dd = ppd->dd; + + /* Sanity check */ + if (!qsfp_mod_present(ppd)) + return; + + if (ppd->host_link_state == HLS_DN_DISABLE) { + dd_dev_info(ppd->dd, + "%s: stopping link start because link is disabled\n", + __func__); + return; + } + + /* + * Turn DC back on after cable has been re-inserted. Up until + * now, the DC has been in reset to save power. 
+ */ + dc_start(dd); + + if (qd->cache_refresh_required) { + set_qsfp_int_n(ppd, 0); + + wait_for_qsfp_init(ppd); + + /* + * Allow INT_N to trigger the QSFP interrupt to watch + * for alarms and warnings + */ + set_qsfp_int_n(ppd, 1); + + start_link(ppd); + } + + if (qd->check_interrupt_flags) { + u8 qsfp_interrupt_status[16] = {0,}; + + if (one_qsfp_read(ppd, dd->hfi1_id, 6, + &qsfp_interrupt_status[0], 16) != 16) { + dd_dev_info(dd, + "%s: Failed to read status of QSFP module\n", + __func__); + } else { + unsigned long flags; + + handle_qsfp_error_conditions( + ppd, qsfp_interrupt_status); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.check_interrupt_flags = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + } + } +} + +static void init_qsfp_int(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 qsfp_mask, cce_int_mask; + const int qsfp1_int_smask = QSFP1_INT % 64; + const int qsfp2_int_smask = QSFP2_INT % 64; + + /* + * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0 + * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR, + * therefore just one of QSFP1_INT/QSFP2_INT can be used to find + * the index of the appropriate CSR in the CCEIntMask CSR array + */ + cce_int_mask = read_csr(dd, CCE_INT_MASK + + (8 * (QSFP1_INT / 64))); + if (dd->hfi1_id) { + cce_int_mask &= ~((u64)1 << qsfp1_int_smask); + write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)), + cce_int_mask); + } else { + cce_int_mask &= ~((u64)1 << qsfp2_int_smask); + write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)), + cce_int_mask); + } + + qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); + /* Clear current status to avoid spurious interrupts */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR, + qsfp_mask); + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, + qsfp_mask); + + set_qsfp_int_n(ppd, 0); + + /* Handle active low nature of INT_N and MODPRST_N pins */ + if (qsfp_mod_present(ppd)) + qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT, + qsfp_mask); +} + +/* + * Do a one-time initialize of the LCB block. + */ +static void init_lcb(struct hfi1_devdata *dd) +{ + /* simulator does not correctly handle LCB cclk loopback, skip */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + return; + + /* the DC has been reset earlier in the driver load */ + + /* set LCB for cclk loopback on the port */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x01); + write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0x00); + write_csr(dd, DC_LCB_CFG_REINIT_AS_SLAVE, 0x00); + write_csr(dd, DC_LCB_CFG_CNT_FOR_SKIP_STALL, 0x110); + write_csr(dd, DC_LCB_CFG_CLK_CNTR, 0x08); + write_csr(dd, DC_LCB_CFG_LOOPBACK, 0x02); + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x00); +} + +/* + * Perform a test read on the QSFP. Return 0 on success, -ERRNO + * on error. + */ +static int test_qsfp_read(struct hfi1_pportdata *ppd) +{ + int ret; + u8 status; + + /* + * Report success if not a QSFP or, if it is a QSFP, but the cable is + * not present + */ + if (ppd->port_type != PORT_TYPE_QSFP || !qsfp_mod_present(ppd)) + return 0; + + /* read byte 2, the status byte */ + ret = one_qsfp_read(ppd, ppd->dd->hfi1_id, 2, &status, 1); + if (ret < 0) + return ret; + if (ret != 1) + return -EIO; + + return 0; /* success */ +} + +/* + * Values for QSFP retry. + * + * Give up after 10s (20 x 500ms). The overall timeout was empirically + * arrived at from experience on a large cluster. 
+ */ +#define MAX_QSFP_RETRIES 20 +#define QSFP_RETRY_WAIT 500 /* msec */ + +/* + * Try a QSFP read. If it fails, schedule a retry for later. + * Called on first link activation after driver load. + */ +static void try_start_link(struct hfi1_pportdata *ppd) +{ + if (test_qsfp_read(ppd)) { + /* read failed */ + if (ppd->qsfp_retry_count >= MAX_QSFP_RETRIES) { + dd_dev_err(ppd->dd, "QSFP not responding, giving up\n"); + return; + } + dd_dev_info(ppd->dd, + "QSFP not responding, waiting and retrying %d\n", + (int)ppd->qsfp_retry_count); + ppd->qsfp_retry_count++; + queue_delayed_work(ppd->link_wq, &ppd->start_link_work, + msecs_to_jiffies(QSFP_RETRY_WAIT)); + return; + } + ppd->qsfp_retry_count = 0; + + start_link(ppd); +} + +/* + * Workqueue function to start the link after a delay. + */ +void handle_start_link(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + start_link_work.work); + try_start_link(ppd); +} + +int bringup_serdes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 guid; + int ret; + + if (HFI1_CAP_IS_KSET(EXTENDED_PSN)) + add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK); + + guid = ppd->guids[HFI1_PORT_GUID_INDEX]; + if (!guid) { + if (dd->base_guid) + guid = dd->base_guid + ppd->port - 1; + ppd->guids[HFI1_PORT_GUID_INDEX] = guid; + } + + /* Set linkinit_reason on power up per OPA spec */ + ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP; + + /* one-time init of the LCB */ + init_lcb(dd); + + if (loopback) { + ret = init_loopback(dd); + if (ret < 0) + return ret; + } + + get_port_type(ppd); + if (ppd->port_type == PORT_TYPE_QSFP) { + set_qsfp_int_n(ppd, 0); + wait_for_qsfp_init(ppd); + set_qsfp_int_n(ppd, 1); + } + + try_start_link(ppd); + return 0; +} + +void hfi1_quiet_serdes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* + * Shut down the link and keep it down. First turn off that the + * driver wants to allow the link to be up (driver_link_ready). + * Then make sure the link is not automatically restarted + * (link_enabled). Cancel any pending restart. And finally + * go offline. 
+ */ + ppd->driver_link_ready = 0; + ppd->link_enabled = 0; + + ppd->qsfp_retry_count = MAX_QSFP_RETRIES; /* prevent more retries */ + flush_delayed_work(&ppd->start_link_work); + cancel_delayed_work_sync(&ppd->start_link_work); + + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_REBOOT); + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_REBOOT, 0, + OPA_LINKDOWN_REASON_REBOOT); + set_link_state(ppd, HLS_DN_OFFLINE); + + /* disable the port */ + clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); +} + +static inline int init_cpu_counters(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->ibport_data.rvp.rc_acks = NULL; + ppd->ibport_data.rvp.rc_qacks = NULL; + ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64); + if (!ppd->ibport_data.rvp.rc_acks || + !ppd->ibport_data.rvp.rc_delayed_comp || + !ppd->ibport_data.rvp.rc_qacks) + return -ENOMEM; + } + + return 0; +} + +/* + * index is the index into the receive array + */ +void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, + u32 type, unsigned long pa, u16 order) +{ + u64 reg; + + if (!(dd->flags & HFI1_PRESENT)) + goto done; + + if (type == PT_INVALID || type == PT_INVALID_FLUSH) { + pa = 0; + order = 0; + } else if (type > PT_INVALID) { + dd_dev_err(dd, + "unexpected receive array type %u for index %u, not handled\n", + type, index); + goto done; + } + trace_hfi1_put_tid(dd, index, type, pa, order); + +#define RT_ADDR_SHIFT 12 /* 4KB kernel address boundary */ + reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK + | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT + | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK) + << RCV_ARRAY_RT_ADDR_SHIFT; + trace_hfi1_write_rcvarray(dd->rcvarray_wc + (index * 8), reg); + writeq(reg, dd->rcvarray_wc + (index * 8)); + + if (type == PT_EAGER || type == PT_INVALID_FLUSH || (index & 3) == 3) + /* + * Eager entries are written and flushed + * + * Expected entries are flushed every 4 writes + */ + flush_wc(); +done: + return; +} + +void hfi1_clear_tids(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 i; + + /* this could be optimized */ + for (i = rcd->eager_base; i < rcd->eager_base + + rcd->egrbufs.alloced; i++) + hfi1_put_tid(dd, i, PT_INVALID, 0, 0); + + for (i = rcd->expected_base; + i < rcd->expected_base + rcd->expected_count; i++) + hfi1_put_tid(dd, i, PT_INVALID, 0, 0); +} + +static const char * const ib_cfg_name_strings[] = { + "HFI1_IB_CFG_LIDLMC", + "HFI1_IB_CFG_LWID_DG_ENB", + "HFI1_IB_CFG_LWID_ENB", + "HFI1_IB_CFG_LWID", + "HFI1_IB_CFG_SPD_ENB", + "HFI1_IB_CFG_SPD", + "HFI1_IB_CFG_RXPOL_ENB", + "HFI1_IB_CFG_LREV_ENB", + "HFI1_IB_CFG_LINKLATENCY", + "HFI1_IB_CFG_HRTBT", + "HFI1_IB_CFG_OP_VLS", + "HFI1_IB_CFG_VL_HIGH_CAP", + "HFI1_IB_CFG_VL_LOW_CAP", + "HFI1_IB_CFG_OVERRUN_THRESH", + "HFI1_IB_CFG_PHYERR_THRESH", + "HFI1_IB_CFG_LINKDEFAULT", + "HFI1_IB_CFG_PKEYS", + "HFI1_IB_CFG_MTU", + "HFI1_IB_CFG_LSTATE", + "HFI1_IB_CFG_VL_HIGH_LIMIT", + "HFI1_IB_CFG_PMA_TICKS", + "HFI1_IB_CFG_PORT" +}; + +static const char *ib_cfg_name(int which) +{ + if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings)) + return "invalid"; + return ib_cfg_name_strings[which]; +} + +int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which) +{ + struct hfi1_devdata *dd = ppd->dd; + int val = 0; + + switch (which) { + case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */ + val = 
ppd->link_width_enabled; + break; + case HFI1_IB_CFG_LWID: /* currently active Link-width */ + val = ppd->link_width_active; + break; + case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */ + val = ppd->link_speed_enabled; + break; + case HFI1_IB_CFG_SPD: /* current Link speed */ + val = ppd->link_speed_active; + break; + + case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */ + case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */ + case HFI1_IB_CFG_LINKLATENCY: + goto unimplemented; + + case HFI1_IB_CFG_OP_VLS: + val = ppd->actual_vls_operational; + break; + case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */ + val = VL_ARB_HIGH_PRIO_TABLE_SIZE; + break; + case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */ + val = VL_ARB_LOW_PRIO_TABLE_SIZE; + break; + case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + val = ppd->overrun_threshold; + break; + case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + val = ppd->phy_error_threshold; + break; + case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + val = HLS_DEFAULT; + break; + + case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */ + case HFI1_IB_CFG_PMA_TICKS: + default: +unimplemented: + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info( + dd, + "%s: which %s: not implemented\n", + __func__, + ib_cfg_name(which)); + break; + } + + return val; +} + +/* + * The largest MAD packet size. + */ +#define MAX_MAD_PACKET 2048 + +/* + * Return the maximum header bytes that can go on the _wire_ + * for this device. This count includes the ICRC which is + * not part of the packet held in memory but it is appended + * by the HW. + * This is dependent on the device's receive header entry size. + * HFI allows this to be set per-receive context, but the + * driver presently enforces a global value. + */ +u32 lrh_max_header_bytes(struct hfi1_devdata *dd) +{ + /* + * The maximum non-payload (MTU) bytes in LRH.PktLen are + * the Receive Header Entry Size minus the PBC (or RHF) size + * plus one DW for the ICRC appended by HW. + * + * dd->rcd[0].rcvhdrqentsize is in DW. + * We use rcd[0] as all context will have the same value. Also, + * the first kernel context would have been allocated by now so + * we are guaranteed a valid value. + */ + return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2; +} + +/* + * Set Send Length + * @ppd - per port data + * + * Set the MTU by limiting how many DWs may be sent. The SendLenCheck* + * registers compare against LRH.PktLen, so use the max bytes included + * in the LRH. + * + * This routine changes all VL values except VL15, which it maintains at + * the same value. 
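+ * + * VL0 through VL3 limits are packed into SEND_LEN_CHECK0 and VL4 through + * VL7 plus VL15 into SEND_LEN_CHECK1; kernel send context credit return + * thresholds and the DC port MTU cap are then updated from the new MTUs.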
+ */ +static void set_send_length(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 max_hb = lrh_max_header_bytes(dd), dcmtu; + u32 maxvlmtu = dd->vld[15].mtu; + u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2) + & SEND_LEN_CHECK1_LEN_VL15_MASK) << + SEND_LEN_CHECK1_LEN_VL15_SHIFT; + int i, j; + u32 thres; + + for (i = 0; i < ppd->vls_supported; i++) { + if (dd->vld[i].mtu > maxvlmtu) + maxvlmtu = dd->vld[i].mtu; + if (i <= 3) + len1 |= (((dd->vld[i].mtu + max_hb) >> 2) + & SEND_LEN_CHECK0_LEN_VL0_MASK) << + ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT); + else + len2 |= (((dd->vld[i].mtu + max_hb) >> 2) + & SEND_LEN_CHECK1_LEN_VL4_MASK) << + ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT); + } + write_csr(dd, SEND_LEN_CHECK0, len1); + write_csr(dd, SEND_LEN_CHECK1, len2); + /* adjust kernel credit return thresholds based on new MTUs */ + /* all kernel receive contexts have the same hdrqentsize */ + for (i = 0; i < ppd->vls_supported; i++) { + thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50), + sc_mtu_to_threshold(dd->vld[i].sc, + dd->vld[i].mtu, + dd->rcd[0]->rcvhdrqentsize)); + for (j = 0; j < INIT_SC_PER_VL; j++) + sc_set_cr_threshold( + pio_select_send_context_vl(dd, j, i), + thres); + } + thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50), + sc_mtu_to_threshold(dd->vld[15].sc, + dd->vld[15].mtu, + dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[15].sc, thres); + + /* Adjust maximum MTU for the port in DC */ + dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 : + (ilog2(maxvlmtu >> 8) + 1); + len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG); + len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK; + len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) << + DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT; + write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1); +} + +static void set_lidlmc(struct hfi1_pportdata *ppd) +{ + int i; + u64 sreg = 0; + struct hfi1_devdata *dd = ppd->dd; + u32 mask = ~((1U << ppd->lmc) - 1); + u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1); + u32 lid; + + /* + * Program 0 in CSR if port lid is extended. This prevents + * 9B packets being sent out for large lids. + */ + lid = (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) ? 
0 : ppd->lid; + c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK + | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK); + c1 |= ((lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) + << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) | + ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK) + << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT); + write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1); + + /* + * Iterate over all the send contexts and set their SLID check + */ + sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) << + SEND_CTXT_CHECK_SLID_MASK_SHIFT) | + (((lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << + SEND_CTXT_CHECK_SLID_VALUE_SHIFT); + + for (i = 0; i < dd->chip_send_contexts; i++) { + hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x", + i, (u32)sreg); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg); + } + + /* Now we have to do the same thing for the sdma engines */ + sdma_update_lmc(dd, mask, lid); +} + +static const char *state_completed_string(u32 completed) +{ + static const char * const state_completed[] = { + "EstablishComm", + "OptimizeEQ", + "VerifyCap" + }; + + if (completed < ARRAY_SIZE(state_completed)) + return state_completed[completed]; + + return "unknown"; +} + +static const char all_lanes_dead_timeout_expired[] = + "All lanes were inactive – was the interconnect media removed?"; +static const char tx_out_of_policy[] = + "Passing lanes on local port do not meet the local link width policy"; +static const char no_state_complete[] = + "State timeout occurred before link partner completed the state"; +static const char * const state_complete_reasons[] = { + [0x00] = "Reason unknown", + [0x01] = "Link was halted by driver, refer to LinkDownReason", + [0x02] = "Link partner reported failure", + [0x10] = "Unable to achieve frame sync on any lane", + [0x11] = + "Unable to find a common bit rate with the link partner", + [0x12] = + "Unable to achieve frame sync on sufficient lanes to meet the local link width policy", + [0x13] = + "Unable to identify preset equalization on sufficient lanes to meet the local link width policy", + [0x14] = no_state_complete, + [0x15] = + "State timeout occurred before link partner identified equalization presets", + [0x16] = + "Link partner completed the EstablishComm state, but the passing lanes do not meet the local link width policy", + [0x17] = tx_out_of_policy, + [0x20] = all_lanes_dead_timeout_expired, + [0x21] = + "Unable to achieve acceptable BER on sufficient lanes to meet the local link width policy", + [0x22] = no_state_complete, + [0x23] = + "Link partner completed the OptimizeEq state, but the passing lanes do not meet the local link width policy", + [0x24] = tx_out_of_policy, + [0x30] = all_lanes_dead_timeout_expired, + [0x31] = + "State timeout occurred waiting for host to process received frames", + [0x32] = no_state_complete, + [0x33] = + "Link partner completed the VerifyCap state, but the passing lanes do not meet the local link width policy", + [0x34] = tx_out_of_policy, + [0x35] = "Negotiated link width is mutually exclusive", + [0x36] = + "Timed out before receiving verifycap frames in VerifyCap.Exchange", + [0x37] = "Unable to resolve secure data exchange", +}; + +static const char *state_complete_reason_code_string(struct hfi1_pportdata *ppd, + u32 code) +{ + const char *str = NULL; + + if (code < ARRAY_SIZE(state_complete_reasons)) + str = state_complete_reasons[code]; + + if (str) + return str; + return "Reserved"; +} + +/* describe the given last state complete frame */ +static void decode_state_complete(struct hfi1_pportdata *ppd, u32 frame, + 
const char *prefix) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 success; + u32 state; + u32 reason; + u32 lanes; + + /* + * Decode frame: + * [ 0: 0] - success + * [ 3: 1] - state + * [ 7: 4] - next state timeout + * [15: 8] - reason code + * [31:16] - lanes + */ + success = frame & 0x1; + state = (frame >> 1) & 0x7; + reason = (frame >> 8) & 0xff; + lanes = (frame >> 16) & 0xffff; + + dd_dev_err(dd, "Last %s LNI state complete frame 0x%08x:\n", + prefix, frame); + dd_dev_err(dd, " last reported state state: %s (0x%x)\n", + state_completed_string(state), state); + dd_dev_err(dd, " state successfully completed: %s\n", + success ? "yes" : "no"); + dd_dev_err(dd, " fail reason 0x%x: %s\n", + reason, state_complete_reason_code_string(ppd, reason)); + dd_dev_err(dd, " passing lane mask: 0x%x", lanes); +} + +/* + * Read the last state complete frames and explain them. This routine + * expects to be called if the link went down during link negotiation + * and initialization (LNI). That is, anywhere between polling and link up. + */ +static void check_lni_states(struct hfi1_pportdata *ppd) +{ + u32 last_local_state; + u32 last_remote_state; + + read_last_local_state(ppd->dd, &last_local_state); + read_last_remote_state(ppd->dd, &last_remote_state); + + /* + * Don't report anything if there is nothing to report. A value of + * 0 means the link was taken down while polling and there was no + * training in-process. + */ + if (last_local_state == 0 && last_remote_state == 0) + return; + + decode_state_complete(ppd, last_local_state, "transmitted"); + decode_state_complete(ppd, last_remote_state, "received"); +} + +/* wait for wait_ms for LINK_TRANSFER_ACTIVE to go to 1 */ +static int wait_link_transfer_active(struct hfi1_devdata *dd, int wait_ms) +{ + u64 reg; + unsigned long timeout; + + /* watch LCB_STS_LINK_TRANSFER_ACTIVE */ + timeout = jiffies + msecs_to_jiffies(wait_ms); + while (1) { + reg = read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE); + if (reg) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "timeout waiting for LINK_TRANSFER_ACTIVE\n"); + return -ETIMEDOUT; + } + udelay(2); + } + return 0; +} + +/* called when the logical link state is not down as it should be */ +static void force_logical_link_state_down(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* + * Bring link up in LCB loopback + */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 1); + write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, + DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK); + + write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0); + write_csr(dd, DC_LCB_CFG_REINIT_AS_SLAVE, 0); + write_csr(dd, DC_LCB_CFG_CNT_FOR_SKIP_STALL, 0x110); + write_csr(dd, DC_LCB_CFG_LOOPBACK, 0x2); + + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0); + (void)read_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET); + udelay(3); + write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, 1); + write_csr(dd, DC_LCB_CFG_RUN, 1ull << DC_LCB_CFG_RUN_EN_SHIFT); + + wait_link_transfer_active(dd, 100); + + /* + * Bring the link down again. + */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 1); + write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, 0); + write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, 0); + + dd_dev_info(ppd->dd, "logical state forced to LINK_DOWN\n"); +} + +/* + * Helper for set_link_state(). Do not call except from that routine. + * Expects ppd->hls_mutex to be held. + * + * @rem_reason value to be sent to the neighbor + * + * LinkDownReasons only set if transition succeeds. 
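+ * + * The sequence is: request PLS_OFFLINE from the 8051, wait for the + * offline.quiet substate, take over LCB access, force the logical link + * state down if it did not follow, then wait for the 8051 to be ready to + * accept host requests again.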
+ */ +static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 previous_state; + int offline_state_ret; + int ret; + + update_lcb_cache(dd); + + previous_state = ppd->host_link_state; + ppd->host_link_state = HLS_GOING_OFFLINE; + + /* start offline transition */ + ret = set_physical_link_state(dd, (rem_reason << 8) | PLS_OFFLINE); + + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Offline link state, return %d\n", + ret); + return -EINVAL; + } + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); + + offline_state_ret = wait_phys_link_offline_substates(ppd, 10000); + if (offline_state_ret < 0) + return offline_state_ret; + + /* Disabling AOC transmitters */ + if (ppd->port_type == PORT_TYPE_QSFP && + ppd->qsfp_info.limiting_active && + qsfp_mod_present(ppd)) { + int ret; + + ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT); + if (ret == 0) { + set_qsfp_tx(ppd, 0); + release_chip_resource(dd, qsfp_resource(dd)); + } else { + /* not fatal, but should warn */ + dd_dev_err(dd, + "Unable to acquire lock to turn off QSFP TX\n"); + } + } + + /* + * Wait for the offline.Quiet transition if it hasn't happened yet. It + * can take a while for the link to go down. + */ + if (offline_state_ret != PLS_OFFLINE_QUIET) { + ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 30000); + if (ret < 0) + return ret; + } + + /* + * Now in charge of LCB - must be after the physical state is + * offline.quiet and before host_link_state is changed. + */ + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + + /* make sure the logical state is also down */ + ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); + if (ret) + force_logical_link_state_down(ppd); + + ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + update_statusp(ppd, IB_PORT_DOWN); + + /* + * The LNI has a mandatory wait time after the physical state + * moves to Offline.Quiet. The wait time may be different + * depending on how the link went down. The 8051 firmware + * will observe the needed wait time and only move to ready + * when that is completed. The largest of the quiet timeouts + * is 6s, so wait that long and then at least 0.5s more for + * other transitions, and another 0.5s for a buffer. + */ + ret = wait_fm_ready(dd, 7000); + if (ret) { + dd_dev_err(dd, + "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n"); + /* state is really offline, so make it so */ + ppd->host_link_state = HLS_DN_OFFLINE; + return ret; + } + + /* + * The state is now offline and the 8051 is ready to accept host + * requests. 
+ * - change our state + * - notify others if we were previously in a linkup state + */ + ppd->host_link_state = HLS_DN_OFFLINE; + if (previous_state & HLS_UP) { + /* went down while link was up */ + handle_linkup_change(dd, 0); + } else if (previous_state + & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { + /* went down while attempting link up */ + check_lni_states(ppd); + + /* The QSFP doesn't need to be reset on LNI failure */ + ppd->qsfp_info.reset_needed = 0; + } + + /* the active link width (downgrade) is 0 on link down */ + ppd->link_width_active = 0; + ppd->link_width_downgrade_tx_active = 0; + ppd->link_width_downgrade_rx_active = 0; + ppd->current_egress_rate = 0; + return 0; +} + +/* return the link state name */ +static const char *link_state_name(u32 state) +{ + const char *name; + int n = ilog2(state); + static const char * const names[] = { + [__HLS_UP_INIT_BP] = "INIT", + [__HLS_UP_ARMED_BP] = "ARMED", + [__HLS_UP_ACTIVE_BP] = "ACTIVE", + [__HLS_DN_DOWNDEF_BP] = "DOWNDEF", + [__HLS_DN_POLL_BP] = "POLL", + [__HLS_DN_DISABLE_BP] = "DISABLE", + [__HLS_DN_OFFLINE_BP] = "OFFLINE", + [__HLS_VERIFY_CAP_BP] = "VERIFY_CAP", + [__HLS_GOING_UP_BP] = "GOING_UP", + [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE", + [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN" + }; + + name = n < ARRAY_SIZE(names) ? names[n] : NULL; + return name ? name : "unknown"; +} + +/* return the link state reason name */ +static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state) +{ + if (state == HLS_UP_INIT) { + switch (ppd->linkinit_reason) { + case OPA_LINKINIT_REASON_LINKUP: + return "(LINKUP)"; + case OPA_LINKINIT_REASON_FLAPPING: + return "(FLAPPING)"; + case OPA_LINKINIT_OUTSIDE_POLICY: + return "(OUTSIDE_POLICY)"; + case OPA_LINKINIT_QUARANTINED: + return "(QUARANTINED)"; + case OPA_LINKINIT_INSUFIC_CAPABILITY: + return "(INSUFIC_CAPABILITY)"; + default: + break; + } + } + return ""; +} + +/* + * driver_pstate - convert the driver's notion of a port's + * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*). + * Return -1 (converted to a u32) to indicate error. + */ +u32 driver_pstate(struct hfi1_pportdata *ppd) +{ + switch (ppd->host_link_state) { + case HLS_UP_INIT: + case HLS_UP_ARMED: + case HLS_UP_ACTIVE: + return IB_PORTPHYSSTATE_LINKUP; + case HLS_DN_POLL: + return IB_PORTPHYSSTATE_POLLING; + case HLS_DN_DISABLE: + return IB_PORTPHYSSTATE_DISABLED; + case HLS_DN_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_VERIFY_CAP: + return IB_PORTPHYSSTATE_POLLING; + case HLS_GOING_UP: + return IB_PORTPHYSSTATE_POLLING; + case HLS_GOING_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_LINK_COOLDOWN: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_DN_DOWNDEF: + default: + dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n", + ppd->host_link_state); + return -1; + } +} + +/* + * driver_lstate - convert the driver's notion of a port's + * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1 + * (converted to a u32) to indicate error. 
+ */ +u32 driver_lstate(struct hfi1_pportdata *ppd) +{ + if (ppd->host_link_state && (ppd->host_link_state & HLS_DOWN)) + return IB_PORT_DOWN; + + switch (ppd->host_link_state & HLS_UP) { + case HLS_UP_INIT: + return IB_PORT_INIT; + case HLS_UP_ARMED: + return IB_PORT_ARMED; + case HLS_UP_ACTIVE: + return IB_PORT_ACTIVE; + default: + dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n", + ppd->host_link_state); + return -1; + } +} + +void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, + u8 neigh_reason, u8 rem_reason) +{ + if (ppd->local_link_down_reason.latest == 0 && + ppd->neigh_link_down_reason.latest == 0) { + ppd->local_link_down_reason.latest = lcl_reason; + ppd->neigh_link_down_reason.latest = neigh_reason; + ppd->remote_link_down_reason = rem_reason; + } +} + +/* + * Verify if BCT for data VLs is non-zero. + */ +static inline bool data_vls_operational(struct hfi1_pportdata *ppd) +{ + return !!ppd->actual_vls_operational; +} + +/* + * Change the physical and/or logical link state. + * + * Do not call this routine while inside an interrupt. It contains + * calls to routines that can take multiple seconds to finish. + * + * Returns 0 on success, -errno on failure. + */ +int set_link_state(struct hfi1_pportdata *ppd, u32 state) +{ + struct hfi1_devdata *dd = ppd->dd; + struct ib_event event = {.device = NULL}; + int ret1, ret = 0; + int orig_new_state, poll_bounce; + + mutex_lock(&ppd->hls_lock); + + orig_new_state = state; + if (state == HLS_DN_DOWNDEF) + state = HLS_DEFAULT; + + /* interpret poll -> poll as a link bounce */ + poll_bounce = ppd->host_link_state == HLS_DN_POLL && + state == HLS_DN_POLL; + + dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__, + link_state_name(ppd->host_link_state), + link_state_name(orig_new_state), + poll_bounce ? "(bounce) " : "", + link_state_reason_name(ppd, state)); + + /* + * If we're going to a (HLS_*) link state that implies the logical + * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then + * reset is_sm_config_started to 0. + */ + if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE))) + ppd->is_sm_config_started = 0; + + /* + * Do nothing if the states match. Let a poll to poll link bounce + * go through. + */ + if (ppd->host_link_state == state && !poll_bounce) + goto done; + + switch (state) { + case HLS_UP_INIT: + if (ppd->host_link_state == HLS_DN_POLL && + (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) { + /* + * Quick link up jumps from polling to here. + * + * Whether in normal or loopback mode, the + * simulator jumps from polling to link up. + * Accept that here. + */ + /* OK */ + } else if (ppd->host_link_state != HLS_GOING_UP) { + goto unexpected; + } + + /* + * Wait for Link_Up physical state. + * Physical and Logical states should already be + * be transitioned to LinkUp and LinkInit respectively. + */ + ret = wait_physical_linkstate(ppd, PLS_LINKUP, 1000); + if (ret) { + dd_dev_err(dd, + "%s: physical state did not change to LINK-UP\n", + __func__); + break; + } + + ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000); + if (ret) { + dd_dev_err(dd, + "%s: logical state did not change to INIT\n", + __func__); + break; + } + + /* clear old transient LINKINIT_REASON code */ + if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR) + ppd->linkinit_reason = + OPA_LINKINIT_REASON_LINKUP; + + /* enable the port */ + add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + handle_linkup_change(dd, 1); + + /* + * After link up, a new link width will have been set. 
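set_link_state() above is designed to be driven one hop at a time from process context: it takes ppd->hls_lock and sleeps in the wait helpers, so it must not be called from an interrupt. A hedged sketch of a caller walking a port from INIT to ACTIVE, using only states and functions defined in this file:

    /*
     * Sketch only: advance the logical link one state at a time and stop at
     * the first failure (for example -EINVAL when the data VLs are not yet
     * operational for the ARMED step).
     */
    static int bring_port_active(struct hfi1_pportdata *ppd)
    {
        static const u32 steps[] = { HLS_UP_INIT, HLS_UP_ARMED, HLS_UP_ACTIVE };
        int i, ret;

        for (i = 0; i < ARRAY_SIZE(steps); i++) {
            ret = set_link_state(ppd, steps[i]);
            if (ret)
                return ret;
        }
        return 0;
    }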
+ * Update the xmit counters with regards to the new + * link width. + */ + update_xmit_counters(ppd, ppd->link_width_active); + + ppd->host_link_state = HLS_UP_INIT; + update_statusp(ppd, IB_PORT_INIT); + break; + case HLS_UP_ARMED: + if (ppd->host_link_state != HLS_UP_INIT) + goto unexpected; + + if (!data_vls_operational(ppd)) { + dd_dev_err(dd, + "%s: data VLs not operational\n", __func__); + ret = -EINVAL; + break; + } + + set_logical_state(dd, LSTATE_ARMED); + ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000); + if (ret) { + dd_dev_err(dd, + "%s: logical state did not change to ARMED\n", + __func__); + break; + } + ppd->host_link_state = HLS_UP_ARMED; + update_statusp(ppd, IB_PORT_ARMED); + /* + * The simulator does not currently implement SMA messages, + * so neighbor_normal is not set. Set it here when we first + * move to Armed. + */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + ppd->neighbor_normal = 1; + break; + case HLS_UP_ACTIVE: + if (ppd->host_link_state != HLS_UP_ARMED) + goto unexpected; + + set_logical_state(dd, LSTATE_ACTIVE); + ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000); + if (ret) { + dd_dev_err(dd, + "%s: logical state did not change to ACTIVE\n", + __func__); + } else { + /* tell all engines to go running */ + sdma_all_running(dd); + ppd->host_link_state = HLS_UP_ACTIVE; + update_statusp(ppd, IB_PORT_ACTIVE); + + /* Signal the IB layer that the port has went active */ + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = ppd->port; + event.event = IB_EVENT_PORT_ACTIVE; + } + break; + case HLS_DN_POLL: + if ((ppd->host_link_state == HLS_DN_DISABLE || + ppd->host_link_state == HLS_DN_OFFLINE) && + dd->dc_shutdown) + dc_start(dd); + /* Hand LED control to the DC */ + write_csr(dd, DCC_CFG_LED_CNTRL, 0); + + if (ppd->host_link_state != HLS_DN_OFFLINE) { + u8 tmp = ppd->link_enabled; + + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (ret) { + ppd->link_enabled = tmp; + break; + } + ppd->remote_link_down_reason = 0; + + if (ppd->driver_link_ready) + ppd->link_enabled = 1; + } + + set_all_slowpath(ppd->dd); + ret = set_local_link_attributes(ppd); + if (ret) + break; + + ppd->port_error_action = 0; + ppd->host_link_state = HLS_DN_POLL; + + if (quick_linkup) { + /* quick linkup does not go into polling */ + ret = do_quick_linkup(dd); + } else { + ret1 = set_physical_link_state(dd, PLS_POLLING); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Polling link state, return 0x%x\n", + ret1); + ret = -EINVAL; + } + } + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); + /* + * If an error occurred above, go back to offline. The + * caller may reschedule another attempt. 
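The HLS_DN_POLL branch deliberately leaves retry policy to the caller: on a failed transition it drops the port back to offline and returns the error so another attempt can be scheduled. A purely hypothetical retry wrapper (the retry count and back-off are inventions for illustration, not driver policy) might look like:

    /* Hypothetical sketch: retry the poll transition a few times. */
    static int start_polling_with_retry(struct hfi1_pportdata *ppd, int tries)
    {
        int ret = -EINVAL;

        while (tries-- > 0) {
            ret = set_link_state(ppd, HLS_DN_POLL);
            if (!ret)
                break;          /* polling started; training proceeds */
            msleep(100);        /* arbitrary back-off for illustration */
        }
        return ret;
    }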
+ */ + if (ret) + goto_offline(ppd, 0); + else + log_physical_state(ppd, PLS_POLLING); + break; + case HLS_DN_DISABLE: + /* link is disabled */ + ppd->link_enabled = 0; + + /* allow any state to transition to disabled */ + + /* must transition to offline first */ + if (ppd->host_link_state != HLS_DN_OFFLINE) { + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (ret) + break; + ppd->remote_link_down_reason = 0; + } + + if (!dd->dc_shutdown) { + ret1 = set_physical_link_state(dd, PLS_DISABLED); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Disabled link state, return 0x%x\n", + ret1); + ret = -EINVAL; + break; + } + ret = wait_physical_linkstate(ppd, PLS_DISABLED, 10000); + if (ret) { + dd_dev_err(dd, + "%s: physical state did not change to DISABLED\n", + __func__); + break; + } + dc_shutdown(dd); + } + ppd->host_link_state = HLS_DN_DISABLE; + break; + case HLS_DN_OFFLINE: + if (ppd->host_link_state == HLS_DN_DISABLE) + dc_start(dd); + + /* allow any state to transition to offline */ + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (!ret) + ppd->remote_link_down_reason = 0; + break; + case HLS_VERIFY_CAP: + if (ppd->host_link_state != HLS_DN_POLL) + goto unexpected; + ppd->host_link_state = HLS_VERIFY_CAP; + log_physical_state(ppd, PLS_CONFIGPHY_VERIFYCAP); + break; + case HLS_GOING_UP: + if (ppd->host_link_state != HLS_VERIFY_CAP) + goto unexpected; + + ret1 = set_physical_link_state(dd, PLS_LINKUP); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to link up state, return 0x%x\n", + ret1); + ret = -EINVAL; + break; + } + ppd->host_link_state = HLS_GOING_UP; + break; + + case HLS_GOING_OFFLINE: /* transient within goto_offline() */ + case HLS_LINK_COOLDOWN: /* transient within goto_offline() */ + default: + dd_dev_info(dd, "%s: state 0x%x: not supported\n", + __func__, state); + ret = -EINVAL; + break; + } + + goto done; + +unexpected: + dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n", + __func__, link_state_name(ppd->host_link_state), + link_state_name(state)); + ret = -EINVAL; + +done: + mutex_unlock(&ppd->hls_lock); + + if (event.device) + ib_dispatch_event(&event); + + return ret; +} + +int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val) +{ + u64 reg; + int ret = 0; + + switch (which) { + case HFI1_IB_CFG_LIDLMC: + set_lidlmc(ppd); + break; + case HFI1_IB_CFG_VL_HIGH_LIMIT: + /* + * The VL Arbitrator high limit is sent in units of 4k + * bytes, while HFI stores it in units of 64 bytes. + */ + val *= 4096 / 64; + reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK) + << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT; + write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg); + break; + case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* HFI only supports POLL as the default link down state */ + if (val != HLS_DN_POLL) + ret = -EINVAL; + break; + case HFI1_IB_CFG_OP_VLS: + if (ppd->vls_operational != val) { + ppd->vls_operational = val; + if (!ppd->port) + ret = -EINVAL; + } + break; + /* + * For link width, link width downgrade, and speed enable, always AND + * the setting with what is actually supported. This has two benefits. + * First, enabled can't have unsupported values, no matter what the + * SM or FM might want. Second, the ALL_SUPPORTED wildcards that mean + * "fill in with your supported value" have all the bits in the + * field set, so simply ANDing with supported has the desired result. 
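Two conversions in hfi1_set_ib_cfg() are worth pinning down with numbers: the VL arbiter high limit arrives in 4 KB units but is programmed in 64-byte units (so the value is multiplied by 64), and each *_ENB setting is ANDed with the supported mask, which makes an all-ones "ALL_SUPPORTED" wildcard collapse to exactly what the hardware supports. A standalone check of both, with a made-up supported mask:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* FM requests an 8 KB limit: 2 units of 4 KB -> 128 units of 64 B */
        uint32_t val = 2;
        val *= 4096 / 64;
        assert(val == 128);

        /* all-ones wildcard ANDed with an illustrative supported mask */
        uint16_t link_width_supported = 0x3;    /* made-up value */
        uint16_t requested = 0xffff;            /* ALL_SUPPORTED-style */
        assert((requested & link_width_supported) == link_width_supported);

        return 0;
    }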
+ */ + case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */ + ppd->link_width_enabled = val & ppd->link_width_supported; + break; + case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */ + ppd->link_width_downgrade_enabled = + val & ppd->link_width_downgrade_supported; + break; + case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */ + ppd->link_speed_enabled = val & ppd->link_speed_supported; + break; + case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + /* + * HFI does not follow IB specs, save this value + * so we can report it, if asked. + */ + ppd->overrun_threshold = val; + break; + case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + /* + * HFI does not follow IB specs, save this value + * so we can report it, if asked. + */ + ppd->phy_error_threshold = val; + break; + + case HFI1_IB_CFG_MTU: + set_send_length(ppd); + break; + + case HFI1_IB_CFG_PKEYS: + if (HFI1_CAP_IS_KSET(PKEY_CHECK)) + set_partition_keys(ppd); + break; + + default: + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info(ppd->dd, + "%s: which %s, val 0x%x: not implemented\n", + __func__, ib_cfg_name(which), val); + break; + } + return ret; +} + +/* begin functions related to vl arbitration table caching */ +static void init_vl_arb_caches(struct hfi1_pportdata *ppd) +{ + int i; + + BUILD_BUG_ON(VL_ARB_TABLE_SIZE != + VL_ARB_LOW_PRIO_TABLE_SIZE); + BUILD_BUG_ON(VL_ARB_TABLE_SIZE != + VL_ARB_HIGH_PRIO_TABLE_SIZE); + + /* + * Note that we always return values directly from the + * 'vl_arb_cache' (and do no CSR reads) in response to a + * 'Get(VLArbTable)'. This is obviously correct after a + * 'Set(VLArbTable)', since the cache will then be up to + * date. But it's also correct prior to any 'Set(VLArbTable)' + * since then both the cache, and the relevant h/w registers + * will be zeroed. + */ + + for (i = 0; i < MAX_PRIO_TABLE; i++) + spin_lock_init(&ppd->vl_arb_cache[i].lock); +} + +/* + * vl_arb_lock_cache + * + * All other vl_arb_* functions should be called only after locking + * the cache. + */ +static inline struct vl_arb_cache * +vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx) +{ + if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE) + return NULL; + spin_lock(&ppd->vl_arb_cache[idx].lock); + return &ppd->vl_arb_cache[idx]; +} + +static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx) +{ + spin_unlock(&ppd->vl_arb_cache[idx].lock); +} + +static void vl_arb_get_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +static void vl_arb_set_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +static int vl_arb_match_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +/* end functions related to vl arbitration table caching */ + +static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target, + u32 size, struct ib_vl_weight_elem *vl) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + unsigned int i, is_up = 0; + int drain, ret = 0; + + mutex_lock(&ppd->hls_lock); + + if (ppd->host_link_state & HLS_UP) + is_up = 1; + + drain = !is_ax(dd) && is_up; + + if (drain) + /* + * Before adjusting VL arbitration weights, empty per-VL + * FIFOs, otherwise a packet whose VL weight is being + * set to 0 could get stuck in a FIFO with no chance to + * egress. 
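The cache helpers above are always used in the same lock, compare, update, unlock shape; fm_set_table() below is the canonical user. The point is to skip the CSR writes entirely when the new table matches the cached copy, while keeping the cache authoritative for Get(VLArbTable). Condensed, the pattern is:

    /* Condensed restatement of the pattern fm_set_table() uses below. */
    static int update_low_prio_table(struct hfi1_pportdata *ppd,
                                     struct ib_vl_weight_elem *t)
    {
        struct vl_arb_cache *vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);

        if (vl_arb_match_cache(vlc, t)) {       /* unchanged: skip the CSRs */
            vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
            return 0;
        }
        vl_arb_set_cache(vlc, t);               /* cache answers future Gets */
        vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);

        return set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
                              VL_ARB_LOW_PRIO_TABLE_SIZE, t);
    }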
+ */ + ret = stop_drain_data_vls(dd); + + if (ret) { + dd_dev_err( + dd, + "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n", + __func__); + goto err; + } + + for (i = 0; i < size; i++, vl++) { + /* + * NOTE: The low priority shift and mask are used here, but + * they are the same for both the low and high registers. + */ + reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK) + << SEND_LOW_PRIORITY_LIST_VL_SHIFT) + | (((u64)vl->weight + & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK) + << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT); + write_csr(dd, target + (i * 8), reg); + } + pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE); + + if (drain) + open_fill_data_vls(dd); /* reopen all VLs */ + +err: + mutex_unlock(&ppd->hls_lock); + + return ret; +} + +/* + * Read one credit merge VL register. + */ +static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr, + struct vl_limit *vll) +{ + u64 reg = read_csr(dd, csr); + + vll->dedicated = cpu_to_be16( + (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT) + & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK); + vll->shared = cpu_to_be16( + (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT) + & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK); +} + +/* + * Read the current credit merge limits. + */ +static int get_buffer_control(struct hfi1_devdata *dd, + struct buffer_control *bc, u16 *overall_limit) +{ + u64 reg; + int i; + + /* not all entries are filled in */ + memset(bc, 0, sizeof(*bc)); + + /* OPA and HFI have a 1-1 mapping */ + for (i = 0; i < TXE_NUM_DATA_VL; i++) + read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8 * i), &bc->vl[i]); + + /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */ + read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]); + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + bc->overall_shared_limit = cpu_to_be16( + (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) + & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK); + if (overall_limit) + *overall_limit = (reg + >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) + & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK; + return sizeof(struct buffer_control); +} + +static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp) +{ + u64 reg; + int i; + + /* each register contains 16 SC->VLnt mappings, 4 bits each */ + reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0); + for (i = 0; i < sizeof(u64); i++) { + u8 byte = *(((u8 *)®) + i); + + dp->vlnt[2 * i] = byte & 0xf; + dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4; + } + + reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16); + for (i = 0; i < sizeof(u64); i++) { + u8 byte = *(((u8 *)®) + i); + + dp->vlnt[16 + (2 * i)] = byte & 0xf; + dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4; + } + return sizeof(struct sc2vlnt); +} + +static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems, + struct ib_vl_weight_elem *vl) +{ + unsigned int i; + + for (i = 0; i < nelems; i++, vl++) { + vl->vl = 0xf; + vl->weight = 0; + } +} + +static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp) +{ + write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, + DC_SC_VL_VAL(15_0, + 0, dp->vlnt[0] & 0xf, + 1, dp->vlnt[1] & 0xf, + 2, dp->vlnt[2] & 0xf, + 3, dp->vlnt[3] & 0xf, + 4, dp->vlnt[4] & 0xf, + 5, dp->vlnt[5] & 0xf, + 6, dp->vlnt[6] & 0xf, + 7, dp->vlnt[7] & 0xf, + 8, dp->vlnt[8] & 0xf, + 9, dp->vlnt[9] & 0xf, + 10, dp->vlnt[10] & 0xf, + 11, dp->vlnt[11] & 0xf, + 12, dp->vlnt[12] & 0xf, + 13, dp->vlnt[13] & 0xf, + 14, dp->vlnt[14] & 0xf, + 15, dp->vlnt[15] & 0xf)); + write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, + DC_SC_VL_VAL(31_16, + 16, dp->vlnt[16] & 0xf, + 17, 
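get_sc2vlnt() and set_sc2vlnt() treat each 64-bit CSR as sixteen 4-bit SC-to-VLnt entries; the read side walks the register a byte at a time and splits each byte into two nibbles. A self-contained round-trip of that encoding (the entry-at-bits-4*i layout and the little-endian byte walk are the assumptions carried over from the driver code):

    #include <assert.h>
    #include <stdint.h>

    /* pack entries 0..15 (4 bits each) into one 64-bit register image */
    static uint64_t pack16(const uint8_t *vlnt)
    {
        uint64_t reg = 0;
        int i;

        for (i = 0; i < 16; i++)
            reg |= (uint64_t)(vlnt[i] & 0xf) << (4 * i);
        return reg;
    }

    /* unpack the way get_sc2vlnt() walks the register: two entries per byte */
    static void unpack16(uint64_t reg, uint8_t *vlnt)
    {
        int i;

        for (i = 0; i < 8; i++) {
            uint8_t byte = (reg >> (8 * i)) & 0xff;

            vlnt[2 * i] = byte & 0xf;
            vlnt[2 * i + 1] = byte >> 4;
        }
    }

    int main(void)
    {
        uint8_t in[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0 };
        uint8_t out[16];
        int i;

        unpack16(pack16(in), out);
        for (i = 0; i < 16; i++)
            assert(in[i] == out[i]);
        return 0;
    }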
dp->vlnt[17] & 0xf, + 18, dp->vlnt[18] & 0xf, + 19, dp->vlnt[19] & 0xf, + 20, dp->vlnt[20] & 0xf, + 21, dp->vlnt[21] & 0xf, + 22, dp->vlnt[22] & 0xf, + 23, dp->vlnt[23] & 0xf, + 24, dp->vlnt[24] & 0xf, + 25, dp->vlnt[25] & 0xf, + 26, dp->vlnt[26] & 0xf, + 27, dp->vlnt[27] & 0xf, + 28, dp->vlnt[28] & 0xf, + 29, dp->vlnt[29] & 0xf, + 30, dp->vlnt[30] & 0xf, + 31, dp->vlnt[31] & 0xf)); +} + +static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what, + u16 limit) +{ + if (limit != 0) + dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n", + what, (int)limit, idx); +} + +/* change only the shared limit portion of SendCmGLobalCredit */ +static void set_global_shared(struct hfi1_devdata *dd, u16 limit) +{ + u64 reg; + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK; + reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); +} + +/* change only the total credit limit portion of SendCmGLobalCredit */ +static void set_global_limit(struct hfi1_devdata *dd, u16 limit) +{ + u64 reg; + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK; + reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); +} + +/* set the given per-VL shared limit */ +static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit) +{ + u64 reg; + u32 addr; + + if (vl < TXE_NUM_DATA_VL) + addr = SEND_CM_CREDIT_VL + (8 * vl); + else + addr = SEND_CM_CREDIT_VL15; + + reg = read_csr(dd, addr); + reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK; + reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT; + write_csr(dd, addr, reg); +} + +/* set the given per-VL dedicated limit */ +static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit) +{ + u64 reg; + u32 addr; + + if (vl < TXE_NUM_DATA_VL) + addr = SEND_CM_CREDIT_VL + (8 * vl); + else + addr = SEND_CM_CREDIT_VL15; + + reg = read_csr(dd, addr); + reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK; + reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT; + write_csr(dd, addr, reg); +} + +/* spin until the given per-VL status mask bits clear */ +static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask, + const char *which) +{ + unsigned long timeout; + u64 reg; + + timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT); + while (1) { + reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask; + + if (reg == 0) + return; /* success */ + if (time_after(jiffies, timeout)) + break; /* timed out */ + udelay(1); + } + + dd_dev_err(dd, + "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n", + which, VL_STATUS_CLEAR_TIMEOUT, mask, reg); + /* + * If this occurs, it is likely there was a credit loss on the link. + * The only recovery from that is a link bounce. + */ + dd_dev_err(dd, + "Continuing anyway. A credit loss may occur. Suggest a link bounce\n"); +} + +/* + * The number of credits on the VLs may be changed while everything + * is "live", but the following algorithm must be followed due to + * how the hardware is actually implemented. In particular, + * Return_Credit_Status[] is the only correct status check. 
+ * + * if (reducing Global_Shared_Credit_Limit or any shared limit changing) + * set Global_Shared_Credit_Limit = 0 + * use_all_vl = 1 + * mask0 = all VLs that are changing either dedicated or shared limits + * set Shared_Limit[mask0] = 0 + * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0 + * if (changing any dedicated limit) + * mask1 = all VLs that are lowering dedicated limits + * lower Dedicated_Limit[mask1] + * spin until Return_Credit_Status[mask1] == 0 + * raise Dedicated_Limits + * raise Shared_Limits + * raise Global_Shared_Credit_Limit + * + * lower = if the new limit is lower, set the limit to the new value + * raise = if the new limit is higher than the current value (may be changed + * earlier in the algorithm), set the new limit to the new value + */ +int set_buffer_control(struct hfi1_pportdata *ppd, + struct buffer_control *new_bc) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 changing_mask, ld_mask, stat_mask; + int change_count; + int i, use_all_mask; + int this_shared_changing; + int vl_count = 0, ret; + /* + * A0: add the variable any_shared_limit_changing below and in the + * algorithm above. If removing A0 support, it can be removed. + */ + int any_shared_limit_changing; + struct buffer_control cur_bc; + u8 changing[OPA_MAX_VLS]; + u8 lowering_dedicated[OPA_MAX_VLS]; + u16 cur_total; + u32 new_total = 0; + const u64 all_mask = + SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK; + +#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15) +#define NUM_USABLE_VLS 16 /* look at VL15 and less */ + + /* find the new total credits, do sanity check on unused VLs */ + for (i = 0; i < OPA_MAX_VLS; i++) { + if (valid_vl(i)) { + new_total += be16_to_cpu(new_bc->vl[i].dedicated); + continue; + } + nonzero_msg(dd, i, "dedicated", + be16_to_cpu(new_bc->vl[i].dedicated)); + nonzero_msg(dd, i, "shared", + be16_to_cpu(new_bc->vl[i].shared)); + new_bc->vl[i].dedicated = 0; + new_bc->vl[i].shared = 0; + } + new_total += be16_to_cpu(new_bc->overall_shared_limit); + + /* fetch the current values */ + get_buffer_control(dd, &cur_bc, &cur_total); + + /* + * Create the masks we will use. 
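The first pass of set_buffer_control() above only does accounting: the requested total is the sum of every valid VL's dedicated limit plus the overall shared limit, with the unused VLs warned about and forced to zero. A toy standalone version of that sum (valid_vl mirrors the macro above; TXE_NUM_DATA_VL is assumed to be 8 purely for the example):

    #include <stdint.h>
    #include <stdio.h>

    #define TXE_NUM_DATA_VL 8   /* assumed for the example */
    #define OPA_MAX_VLS     32

    static int valid_vl(int idx)
    {
        return idx < TXE_NUM_DATA_VL || idx == 15;
    }

    static uint32_t new_total_credits(const uint16_t dedicated[OPA_MAX_VLS],
                                      uint16_t overall_shared_limit)
    {
        uint32_t total = 0;
        int i;

        for (i = 0; i < OPA_MAX_VLS; i++)
            if (valid_vl(i))
                total += dedicated[i];  /* invalid VLs count as zero */
        return total + overall_shared_limit;
    }

    int main(void)
    {
        uint16_t dedicated[OPA_MAX_VLS] = { [0] = 64, [1] = 64, [15] = 16 };

        printf("%u\n", new_total_credits(dedicated, 128));  /* prints 272 */
        return 0;
    }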
+ */ + memset(changing, 0, sizeof(changing)); + memset(lowering_dedicated, 0, sizeof(lowering_dedicated)); + /* + * NOTE: Assumes that the individual VL bits are adjacent and in + * increasing order + */ + stat_mask = + SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK; + changing_mask = 0; + ld_mask = 0; + change_count = 0; + any_shared_limit_changing = 0; + for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) { + if (!valid_vl(i)) + continue; + this_shared_changing = new_bc->vl[i].shared + != cur_bc.vl[i].shared; + if (this_shared_changing) + any_shared_limit_changing = 1; + if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated || + this_shared_changing) { + changing[i] = 1; + changing_mask |= stat_mask; + change_count++; + } + if (be16_to_cpu(new_bc->vl[i].dedicated) < + be16_to_cpu(cur_bc.vl[i].dedicated)) { + lowering_dedicated[i] = 1; + ld_mask |= stat_mask; + } + } + + /* bracket the credit change with a total adjustment */ + if (new_total > cur_total) + set_global_limit(dd, new_total); + + /* + * Start the credit change algorithm. + */ + use_all_mask = 0; + if ((be16_to_cpu(new_bc->overall_shared_limit) < + be16_to_cpu(cur_bc.overall_shared_limit)) || + (is_ax(dd) && any_shared_limit_changing)) { + set_global_shared(dd, 0); + cur_bc.overall_shared_limit = 0; + use_all_mask = 1; + } + + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (changing[i]) { + set_vl_shared(dd, i, 0); + cur_bc.vl[i].shared = 0; + } + } + + wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask, + "shared"); + + if (change_count > 0) { + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (lowering_dedicated[i]) { + set_vl_dedicated(dd, i, + be16_to_cpu(new_bc-> + vl[i].dedicated)); + cur_bc.vl[i].dedicated = + new_bc->vl[i].dedicated; + } + } + + wait_for_vl_status_clear(dd, ld_mask, "dedicated"); + + /* now raise all dedicated that are going up */ + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (be16_to_cpu(new_bc->vl[i].dedicated) > + be16_to_cpu(cur_bc.vl[i].dedicated)) + set_vl_dedicated(dd, i, + be16_to_cpu(new_bc-> + vl[i].dedicated)); + } + } + + /* next raise all shared that are going up */ + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (be16_to_cpu(new_bc->vl[i].shared) > + be16_to_cpu(cur_bc.vl[i].shared)) + set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared)); + } + + /* finally raise the global shared */ + if (be16_to_cpu(new_bc->overall_shared_limit) > + be16_to_cpu(cur_bc.overall_shared_limit)) + set_global_shared(dd, + be16_to_cpu(new_bc->overall_shared_limit)); + + /* bracket the credit change with a total adjustment */ + if (new_total < cur_total) + set_global_limit(dd, new_total); + + /* + * Determine the actual number of operational VLS using the number of + * dedicated and shared credits for each VL. + */ + if (change_count > 0) { + for (i = 0; i < TXE_NUM_DATA_VL; i++) + if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 || + be16_to_cpu(new_bc->vl[i].shared) > 0) + vl_count++; + ppd->actual_vls_operational = vl_count; + ret = sdma_map_init(dd, ppd->port - 1, vl_count ? + ppd->actual_vls_operational : + ppd->vls_operational, + NULL); + if (ret == 0) + ret = pio_map_init(dd, ppd->port - 1, vl_count ? + ppd->actual_vls_operational : + ppd->vls_operational, NULL); + if (ret) + return ret; + } + return 0; +} + +/* + * Read the given fabric manager table. Return the size of the + * table (in bytes) on success, and a negative error code on + * failure. 
+ */ +int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t) + +{ + int size; + struct vl_arb_cache *vlc; + + switch (which) { + case FM_TBL_VL_HIGH_ARB: + size = 256; + /* + * OPA specifies 128 elements (of 2 bytes each), though + * HFI supports only 16 elements in h/w. + */ + vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE); + vl_arb_get_cache(vlc, t); + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + break; + case FM_TBL_VL_LOW_ARB: + size = 256; + /* + * OPA specifies 128 elements (of 2 bytes each), though + * HFI supports only 16 elements in h/w. + */ + vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE); + vl_arb_get_cache(vlc, t); + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + break; + case FM_TBL_BUFFER_CONTROL: + size = get_buffer_control(ppd->dd, t, NULL); + break; + case FM_TBL_SC2VLNT: + size = get_sc2vlnt(ppd->dd, t); + break; + case FM_TBL_VL_PREEMPT_ELEMS: + size = 256; + /* OPA specifies 128 elements, of 2 bytes each */ + get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t); + break; + case FM_TBL_VL_PREEMPT_MATRIX: + size = 256; + /* + * OPA specifies that this is the same size as the VL + * arbitration tables (i.e., 256 bytes). + */ + break; + default: + return -EINVAL; + } + return size; +} + +/* + * Write the given fabric manager table. + */ +int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t) +{ + int ret = 0; + struct vl_arb_cache *vlc; + + switch (which) { + case FM_TBL_VL_HIGH_ARB: + vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE); + if (vl_arb_match_cache(vlc, t)) { + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + break; + } + vl_arb_set_cache(vlc, t); + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST, + VL_ARB_HIGH_PRIO_TABLE_SIZE, t); + break; + case FM_TBL_VL_LOW_ARB: + vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE); + if (vl_arb_match_cache(vlc, t)) { + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + break; + } + vl_arb_set_cache(vlc, t); + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST, + VL_ARB_LOW_PRIO_TABLE_SIZE, t); + break; + case FM_TBL_BUFFER_CONTROL: + ret = set_buffer_control(ppd, t); + break; + case FM_TBL_SC2VLNT: + set_sc2vlnt(ppd->dd, t); + break; + default: + ret = -EINVAL; + } + return ret; +} + +/* + * Disable all data VLs. + * + * Return 0 if disabled, non-zero if the VLs cannot be disabled. + */ +static int disable_data_vls(struct hfi1_devdata *dd) +{ + if (is_ax(dd)) + return 1; + + pio_send_control(dd, PSC_DATA_VL_DISABLE); + + return 0; +} + +/* + * open_fill_data_vls() - the counterpart to stop_drain_data_vls(). + * Just re-enables all data VLs (the "fill" part happens + * automatically - the name was chosen for symmetry with + * stop_drain_data_vls()). + * + * Return 0 if successful, non-zero if the VLs cannot be enabled. + */ +int open_fill_data_vls(struct hfi1_devdata *dd) +{ + if (is_ax(dd)) + return 1; + + pio_send_control(dd, PSC_DATA_VL_ENABLE); + + return 0; +} + +/* + * drain_data_vls() - assumes that disable_data_vls() has been called, + * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA + * engines to drop to 0. + */ +static void drain_data_vls(struct hfi1_devdata *dd) +{ + sc_wait(dd); + sdma_wait(dd); + pause_for_credit_return(dd); +} + +/* + * stop_drain_data_vls() - disable, then drain all per-VL fifos. + * + * Use open_fill_data_vls() to resume using data VLs. 
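fm_get_table() above returns the table size in bytes (or a negative error code), and fm_set_table() is its write-side counterpart, so a read-modify-write of a fabric manager table bounces through both. A hedged caller-side sketch for the buffer control table (the helper name and the error handling are illustrative):

    /* Sketch: read-modify-write the buffer control table via the FM helpers. */
    static int bump_overall_shared(struct hfi1_pportdata *ppd, u16 extra)
    {
        struct buffer_control bc;
        int size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, &bc);

        if (size < 0)
            return size;

        bc.overall_shared_limit =
            cpu_to_be16(be16_to_cpu(bc.overall_shared_limit) + extra);

        return fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, &bc);
    }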
This pair is + * meant to be used like this: + * + * stop_drain_data_vls(dd); + * // do things with per-VL resources + * open_fill_data_vls(dd); + */ +int stop_drain_data_vls(struct hfi1_devdata *dd) +{ + int ret; + + ret = disable_data_vls(dd); + if (ret == 0) + drain_data_vls(dd); + + return ret; +} + +/* + * Convert a nanosecond time to a cclock count. No matter how slow + * the cclock, a non-zero ns will always have a non-zero result. + */ +u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns) +{ + u32 cclocks; + + if (dd->icode == ICODE_FPGA_EMULATION) + cclocks = (ns * 1000) / FPGA_CCLOCK_PS; + else /* simulation pretends to be ASIC */ + cclocks = (ns * 1000) / ASIC_CCLOCK_PS; + if (ns && !cclocks) /* if ns nonzero, must be at least 1 */ + cclocks = 1; + return cclocks; +} + +/* + * Convert a cclock count to nanoseconds. Not matter how slow + * the cclock, a non-zero cclocks will always have a non-zero result. + */ +u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks) +{ + u32 ns; + + if (dd->icode == ICODE_FPGA_EMULATION) + ns = (cclocks * FPGA_CCLOCK_PS) / 1000; + else /* simulation pretends to be ASIC */ + ns = (cclocks * ASIC_CCLOCK_PS) / 1000; + if (cclocks && !ns) + ns = 1; + return ns; +} + +/* + * Dynamically adjust the receive interrupt timeout for a context based on + * incoming packet rate. + * + * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero. + */ +static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 timeout = rcd->rcvavail_timeout; + + /* + * This algorithm doubles or halves the timeout depending on whether + * the number of packets received in this interrupt were less than or + * greater equal the interrupt count. + * + * The calculations below do not allow a steady state to be achieved. + * Only at the endpoints it is possible to have an unchanging + * timeout. + */ + if (npkts < rcv_intr_count) { + /* + * Not enough packets arrived before the timeout, adjust + * timeout downward. + */ + if (timeout < 2) /* already at minimum? */ + return; + timeout >>= 1; + } else { + /* + * More than enough packets arrived before the timeout, adjust + * timeout upward. + */ + if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */ + return; + timeout = min(timeout << 1, dd->rcv_intr_timeout_csr); + } + + rcd->rcvavail_timeout = timeout; + /* + * timeout cannot be larger than rcv_intr_timeout_csr which has already + * been verified to be in range + */ + write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT, + (u64)timeout << + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); +} + +void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd, + u32 intr_adjust, u32 npkts) +{ + struct hfi1_devdata *dd = rcd->dd; + u64 reg; + u32 ctxt = rcd->ctxt; + + /* + * Need to write timeout register before updating RcvHdrHead to ensure + * that a new value is used when the HW decides to restart counting. 
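adjust_rcv_timeout() above is a simple doubling/halving controller: fewer packets than rcv_intr_count before the timeout halves it (interrupt sooner), otherwise it doubles it, clamped between 1 and rcv_intr_timeout_csr. A standalone model of that behaviour with made-up threshold and cap values:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t adjust(uint32_t timeout, uint32_t npkts,
                           uint32_t intr_count, uint32_t max_timeout)
    {
        if (npkts < intr_count) {
            if (timeout < 2)                /* already at minimum */
                return timeout;
            return timeout >> 1;            /* quiet: interrupt sooner */
        }
        if (timeout >= max_timeout)         /* already at maximum */
            return timeout;
        timeout <<= 1;                      /* busy: interrupt later */
        return timeout > max_timeout ? max_timeout : timeout;
    }

    int main(void)
    {
        uint32_t t = 64;
        int i;

        for (i = 0; i < 6; i++)             /* busy interrupts: grows to cap */
            t = adjust(t, 100, 32, 1024);
        printf("busy:  %u\n", t);           /* 1024 */

        for (i = 0; i < 6; i++)             /* quiet interrupts: shrinks */
            t = adjust(t, 1, 32, 1024);
        printf("quiet: %u\n", t);           /* 16 */
        return 0;
    }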
+ */ + if (intr_adjust) + adjust_rcv_timeout(rcd, npkts); + if (updegr) { + reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK) + << RCV_EGR_INDEX_HEAD_HEAD_SHIFT; + write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg); + } + mmiowb(); + reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) | + (((u64)hd & RCV_HDR_HEAD_HEAD_MASK) + << RCV_HDR_HEAD_HEAD_SHIFT); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg); + mmiowb(); +} + +u32 hdrqempty(struct hfi1_ctxtdata *rcd) +{ + u32 head, tail; + + head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD) + & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT; + + if (rcd->rcvhdrtail_kvaddr) + tail = get_rcvhdrtail(rcd); + else + tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL); + + return head == tail; +} + +/* + * Context Control and Receive Array encoding for buffer size: + * 0x0 invalid + * 0x1 4 KB + * 0x2 8 KB + * 0x3 16 KB + * 0x4 32 KB + * 0x5 64 KB + * 0x6 128 KB + * 0x7 256 KB + * 0x8 512 KB (Receive Array only) + * 0x9 1 MB (Receive Array only) + * 0xa 2 MB (Receive Array only) + * + * 0xB-0xF - reserved (Receive Array only) + * + * + * This routine assumes that the value has already been sanity checked. + */ +static u32 encoded_size(u32 size) +{ + switch (size) { + case 4 * 1024: return 0x1; + case 8 * 1024: return 0x2; + case 16 * 1024: return 0x3; + case 32 * 1024: return 0x4; + case 64 * 1024: return 0x5; + case 128 * 1024: return 0x6; + case 256 * 1024: return 0x7; + case 512 * 1024: return 0x8; + case 1 * 1024 * 1024: return 0x9; + case 2 * 1024 * 1024: return 0xa; + } + return 0x1; /* if invalid, go with the minimum size */ +} + +void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, + struct hfi1_ctxtdata *rcd) +{ + u64 rcvctrl, reg; + int did_enable = 0; + u16 ctxt; + + if (!rcd) + return; + + ctxt = rcd->ctxt; + + hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op); + + rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL); + /* if the context already enabled, don't do the extra steps */ + if ((op & HFI1_RCVCTRL_CTXT_ENB) && + !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) { + /* reset the tail and hdr addresses, and sequence count */ + write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR, + rcd->rcvhdrq_dma); + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + rcd->rcvhdrqtailaddr_dma); + rcd->seq_cnt = 1; + + /* reset the cached receive header queue head value */ + rcd->head = 0; + + /* + * Zero the receive header queue so we don't get false + * positives when checking the sequence number. The + * sequence numbers could land exactly on the same spot. + * E.g. a rcd restart before the receive header wrapped. 
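encoded_size() is exactly the table documented above, which is equivalent to log2(size) - 11 for the valid power-of-two sizes (4 KB -> 0x1 through 2 MB -> 0xa), with anything else falling back to the 4 KB encoding. A quick standalone check of that relationship:

    #include <assert.h>
    #include <stdint.h>

    /* same mapping as the driver's encoded_size(), expressed through log2 */
    static uint32_t enc_size(uint32_t size)
    {
        if (size >= 4096 && size <= 2u * 1024 * 1024 && !(size & (size - 1)))
            return (31 - __builtin_clz(size)) - 11;
        return 0x1;     /* invalid: fall back to the minimum (4 KB) */
    }

    int main(void)
    {
        assert(enc_size(4 * 1024) == 0x1);
        assert(enc_size(64 * 1024) == 0x5);
        assert(enc_size(512 * 1024) == 0x8);
        assert(enc_size(2 * 1024 * 1024) == 0xa);
        assert(enc_size(3000) == 0x1);      /* not a valid size */
        return 0;
    }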
+ */ + memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size); + + /* starting timeout */ + rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr; + + /* enable the context */ + rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK; + + /* clean the egr buffer size first */ + rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK; + rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size) + & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK) + << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT; + + /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */ + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0); + did_enable = 1; + + /* zero RcvEgrIndexHead */ + write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0); + + /* set eager count and base index */ + reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT) + & RCV_EGR_CTRL_EGR_CNT_MASK) + << RCV_EGR_CTRL_EGR_CNT_SHIFT) | + (((rcd->eager_base >> RCV_SHIFT) + & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK) + << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT); + write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg); + + /* + * Set TID (expected) count and base index. + * rcd->expected_count is set to individual RcvArray entries, + * not pairs, and the CSR takes a pair-count in groups of + * four, so divide by 8. + */ + reg = (((rcd->expected_count >> RCV_SHIFT) + & RCV_TID_CTRL_TID_PAIR_CNT_MASK) + << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) | + (((rcd->expected_base >> RCV_SHIFT) + & RCV_TID_CTRL_TID_BASE_INDEX_MASK) + << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT); + write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg); + if (ctxt == HFI1_CTRL_CTXT) + write_csr(dd, RCV_VL15, HFI1_CTRL_CTXT); + } + if (op & HFI1_RCVCTRL_CTXT_DIS) { + write_csr(dd, RCV_VL15, 0); + /* + * When receive context is being disabled turn on tail + * update with a dummy tail address and then disable + * receive context. + */ + if (dd->rcvhdrtail_dummy_dma) { + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + dd->rcvhdrtail_dummy_dma); + /* Enabling RcvCtxtCtrl.TailUpd is intentional. */ + rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; + } + + rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK; + } + if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) + rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_dma) + rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; + if (op & HFI1_RCVCTRL_TAILUPD_DIS) { + /* See comment on RcvCtxtCtrl.TailUpd above */ + if (!(op & HFI1_RCVCTRL_CTXT_DIS)) + rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK; + } + if (op & HFI1_RCVCTRL_TIDFLOW_ENB) + rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; + if (op & HFI1_RCVCTRL_TIDFLOW_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; + if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) { + /* + * In one-packet-per-eager mode, the size comes from + * the RcvArray entry. 
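hfi1_rcvctrl() takes an OR-mask of enable/disable ops and folds them into one RcvCtxtCtrl update, so callers batch the related bits. A hedged sketch of a bring-up and tear-down pairing (the particular combinations are illustrative, not lifted from a specific caller):

    /* Sketch: illustrative op masks for one receive context. */
    static void rcv_ctxt_up(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
    {
        hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB |
                         HFI1_RCVCTRL_INTRAVAIL_ENB |
                         HFI1_RCVCTRL_TAILUPD_ENB, rcd);
    }

    static void rcv_ctxt_down(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
    {
        /* disabling also parks the tail address on the dummy DMA buffer */
        hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
                         HFI1_RCVCTRL_INTRAVAIL_DIS, rcd);
    }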
+ */ + rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK; + rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK; + } + if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK; + if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB) + rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB) + rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + rcd->rcvctrl = rcvctrl; + hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl); + write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl); + + /* work around sticky RcvCtxtStatus.BlockedRHQFull */ + if (did_enable && + (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) { + reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS); + if (reg != 0) { + dd_dev_info(dd, "ctxt %d status %lld (blocked)\n", + ctxt, reg); + read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00); + read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD); + reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS); + dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n", + ctxt, reg, reg == 0 ? "not" : "still"); + } + } + + if (did_enable) { + /* + * The interrupt timeout and count must be set after + * the context is enabled to take effect. + */ + /* set interrupt timeout */ + write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT, + (u64)rcd->rcvavail_timeout << + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); + + /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */ + reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT; + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg); + } + + if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS)) + /* + * If the context has been disabled and the Tail Update has + * been cleared, set the RCV_HDR_TAIL_ADDR CSR to dummy address + * so it doesn't contain an address that is invalid. + */ + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + dd->rcvhdrtail_dummy_dma); +} + +u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp) +{ + int ret; + u64 val = 0; + + if (namep) { + ret = dd->cntrnameslen; + *namep = dd->cntrnames; + } else { + const struct cntr_entry *entry; + int i, j; + + ret = (dd->ndevcntrs) * sizeof(u64); + + /* Get the start of the block of counters */ + *cntrp = dd->cntrs; + + /* + * Now go and fill in each counter in the block. 
+ */ + for (i = 0; i < DEV_CNTR_LAST; i++) { + entry = &dev_cntrs[i]; + hfi1_cdbg(CNTR, "reading %s", entry->name); + if (entry->flags & CNTR_DISABLED) { + /* Nothing */ + hfi1_cdbg(CNTR, "\tDisabled\n"); + } else { + if (entry->flags & CNTR_VL) { + hfi1_cdbg(CNTR, "\tPer VL\n"); + for (j = 0; j < C_VL_COUNT; j++) { + val = entry->rw_cntr(entry, + dd, j, + CNTR_MODE_R, + 0); + hfi1_cdbg( + CNTR, + "\t\tRead 0x%llx for %d\n", + val, j); + dd->cntrs[entry->offset + j] = + val; + } + } else if (entry->flags & CNTR_SDMA) { + hfi1_cdbg(CNTR, + "\t Per SDMA Engine\n"); + for (j = 0; j < dd->chip_sdma_engines; + j++) { + val = + entry->rw_cntr(entry, dd, j, + CNTR_MODE_R, 0); + hfi1_cdbg(CNTR, + "\t\tRead 0x%llx for %d\n", + val, j); + dd->cntrs[entry->offset + j] = + val; + } + } else { + val = entry->rw_cntr(entry, dd, + CNTR_INVALID_VL, + CNTR_MODE_R, 0); + dd->cntrs[entry->offset] = val; + hfi1_cdbg(CNTR, "\tRead 0x%llx", val); + } + } + } + } + return ret; +} + +/* + * Used by sysfs to create files for hfi stats to read + */ +u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp) +{ + int ret; + u64 val = 0; + + if (namep) { + ret = ppd->dd->portcntrnameslen; + *namep = ppd->dd->portcntrnames; + } else { + const struct cntr_entry *entry; + int i, j; + + ret = ppd->dd->nportcntrs * sizeof(u64); + *cntrp = ppd->cntrs; + + for (i = 0; i < PORT_CNTR_LAST; i++) { + entry = &port_cntrs[i]; + hfi1_cdbg(CNTR, "reading %s", entry->name); + if (entry->flags & CNTR_DISABLED) { + /* Nothing */ + hfi1_cdbg(CNTR, "\tDisabled\n"); + continue; + } + + if (entry->flags & CNTR_VL) { + hfi1_cdbg(CNTR, "\tPer VL"); + for (j = 0; j < C_VL_COUNT; j++) { + val = entry->rw_cntr(entry, ppd, j, + CNTR_MODE_R, + 0); + hfi1_cdbg( + CNTR, + "\t\tRead 0x%llx for %d", + val, j); + ppd->cntrs[entry->offset + j] = val; + } + } else { + val = entry->rw_cntr(entry, ppd, + CNTR_INVALID_VL, + CNTR_MODE_R, + 0); + ppd->cntrs[entry->offset] = val; + hfi1_cdbg(CNTR, "\tRead 0x%llx", val); + } + } + } + return ret; +} + +static void free_cntrs(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + if (dd->synth_stats_timer.function) + del_timer_sync(&dd->synth_stats_timer); + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + kfree(ppd->cntrs); + kfree(ppd->scntrs); + free_percpu(ppd->ibport_data.rvp.rc_acks); + free_percpu(ppd->ibport_data.rvp.rc_qacks); + free_percpu(ppd->ibport_data.rvp.rc_delayed_comp); + ppd->cntrs = NULL; + ppd->scntrs = NULL; + ppd->ibport_data.rvp.rc_acks = NULL; + ppd->ibport_data.rvp.rc_qacks = NULL; + ppd->ibport_data.rvp.rc_delayed_comp = NULL; + } + kfree(dd->portcntrnames); + dd->portcntrnames = NULL; + kfree(dd->cntrs); + dd->cntrs = NULL; + kfree(dd->scntrs); + dd->scntrs = NULL; + kfree(dd->cntrnames); + dd->cntrnames = NULL; + if (dd->update_cntr_wq) { + destroy_workqueue(dd->update_cntr_wq); + dd->update_cntr_wq = NULL; + } +} + +static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry, + u64 *psval, void *context, int vl) +{ + u64 val; + u64 sval = *psval; + + if (entry->flags & CNTR_DISABLED) { + dd_dev_err(dd, "Counter %s not enabled", entry->name); + return 0; + } + + hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval); + + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0); + + /* If its a synthetic counter there is more work we need to do */ + if (entry->flags & CNTR_SYNTH) { + if (sval == CNTR_MAX) { + /* No need to read already saturated */ + return CNTR_MAX; + 
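hfi1_read_cntrs() (and hfi1_read_portcntrs() below) is a dual-mode call: pass namep to get the newline-separated name blob and its length, or pass cntrp to get a pointer to the freshly filled value array and its size in bytes. A hedged sketch of how a sysfs-style reader pairs the two calls; both returned buffers remain owned by the driver:

    /* Sketch: enumerate the device counters the way a sysfs read would. */
    static void dump_dev_cntrs(struct hfi1_devdata *dd)
    {
        char *names;
        u64 *vals;
        u32 name_len, nvals, i;

        name_len = hfi1_read_cntrs(dd, &names, NULL);   /* names only */
        nvals = hfi1_read_cntrs(dd, NULL, &vals) / sizeof(u64);

        (void)name_len;         /* names is "cntr0\ncntr1\n...", driver-owned */
        for (i = 0; i < nvals; i++)
            pr_info("cntr[%u] = %llu\n", i, (unsigned long long)vals[i]);
    }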
} + + if (entry->flags & CNTR_32BIT) { + /* 32bit counters can wrap multiple times */ + u64 upper = sval >> 32; + u64 lower = (sval << 32) >> 32; + + if (lower > val) { /* hw wrapped */ + if (upper == CNTR_32BIT_MAX) + val = CNTR_MAX; + else + upper++; + } + + if (val != CNTR_MAX) + val = (upper << 32) | val; + + } else { + /* If we rolled we are saturated */ + if ((val < sval) || (val > CNTR_MAX)) + val = CNTR_MAX; + } + } + + *psval = val; + + hfi1_cdbg(CNTR, "\tNew val=0x%llx", val); + + return val; +} + +static u64 write_dev_port_cntr(struct hfi1_devdata *dd, + struct cntr_entry *entry, + u64 *psval, void *context, int vl, u64 data) +{ + u64 val; + + if (entry->flags & CNTR_DISABLED) { + dd_dev_err(dd, "Counter %s not enabled", entry->name); + return 0; + } + + hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval); + + if (entry->flags & CNTR_SYNTH) { + *psval = data; + if (entry->flags & CNTR_32BIT) { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, + (data << 32) >> 32); + val = data; /* return the full 64bit value */ + } else { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, + data); + } + } else { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data); + } + + *psval = val; + + hfi1_cdbg(CNTR, "\tNew val=0x%llx", val); + + return val; +} + +u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &dev_cntrs[index]; + sval = dd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + return read_dev_port_cntr(dd, entry, sval, dd, vl); +} + +u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &dev_cntrs[index]; + sval = dd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + return write_dev_port_cntr(dd, entry, sval, dd, vl, data); +} + +u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &port_cntrs[index]; + sval = ppd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) && + (index <= C_RCV_HDR_OVF_LAST)) { + /* We do not want to bother for disabled contexts */ + return 0; + } + + return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl); +} + +u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &port_cntrs[index]; + sval = ppd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) && + (index <= C_RCV_HDR_OVF_LAST)) { + /* We do not want to bother for disabled contexts */ + return 0; + } + + return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data); +} + +static void do_update_synth_timer(struct work_struct *work) +{ + u64 cur_tx; + u64 cur_rx; + u64 total_flits; + u8 update = 0; + int i, j, vl; + struct hfi1_pportdata *ppd; + struct cntr_entry *entry; + struct hfi1_devdata *dd = container_of(work, struct hfi1_devdata, + update_cntr_work); + + /* + * Rather than keep beating on the CSRs pick a minimal set that we can + * check to watch for potential roll over. We can do this by looking at + * the number of flits sent/recv. If the total flits exceeds 32bits then + * we have to iterate all the counters and update. 
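The CNTR_32BIT branch above widens a 32-bit hardware reading into the 64-bit software counter by reusing the previously saved value's upper word and bumping it whenever the low 32 bits move backwards (a wrap), saturating at CNTR_MAX. A self-contained version of that widening, with an illustrative saturation cap:

    #include <assert.h>
    #include <stdint.h>

    #define CAP_MAX        UINT64_MAX       /* stands in for CNTR_MAX */
    #define CNTR_32BIT_MAX 0xFFFFFFFFULL

    /* widen a 32-bit hardware value using the previously saved 64-bit value */
    static uint64_t widen32(uint64_t saved, uint32_t hw)
    {
        uint64_t upper = saved >> 32;
        uint64_t val = hw;

        if ((saved & 0xFFFFFFFFULL) > val) {    /* hardware wrapped */
            if (upper == CNTR_32BIT_MAX)
                return CAP_MAX;                 /* saturate */
            upper++;
        }
        return (upper << 32) | val;
    }

    int main(void)
    {
        uint64_t v = 0x00000001FFFFFFF0ULL;     /* upper = 1, lower near wrap */

        v = widen32(v, 0x10);                   /* low word wrapped past zero */
        assert(v == 0x0000000200000010ULL);
        v = widen32(v, 0x20);                   /* no wrap: same upper word */
        assert(v == 0x0000000200000020ULL);
        return 0;
    }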
+ */ + entry = &dev_cntrs[C_DC_RCV_FLITS]; + cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0); + + entry = &dev_cntrs[C_DC_XMIT_FLITS]; + cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0); + + hfi1_cdbg( + CNTR, + "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n", + dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx); + + if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) { + /* + * May not be strictly necessary to update but it won't hurt and + * simplifies the logic here. + */ + update = 1; + hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating", + dd->unit); + } else { + total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx); + hfi1_cdbg(CNTR, + "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit, + total_flits, (u64)CNTR_32BIT_MAX); + if (total_flits >= CNTR_32BIT_MAX) { + hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating", + dd->unit); + update = 1; + } + } + + if (update) { + hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit); + for (i = 0; i < DEV_CNTR_LAST; i++) { + entry = &dev_cntrs[i]; + if (entry->flags & CNTR_VL) { + for (vl = 0; vl < C_VL_COUNT; vl++) + read_dev_cntr(dd, i, vl); + } else { + read_dev_cntr(dd, i, CNTR_INVALID_VL); + } + } + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + for (j = 0; j < PORT_CNTR_LAST; j++) { + entry = &port_cntrs[j]; + if (entry->flags & CNTR_VL) { + for (vl = 0; vl < C_VL_COUNT; vl++) + read_port_cntr(ppd, j, vl); + } else { + read_port_cntr(ppd, j, CNTR_INVALID_VL); + } + } + } + + /* + * We want the value in the register. The goal is to keep track + * of the number of "ticks" not the counter value. In other + * words if the register rolls we want to notice it and go ahead + * and force an update. 
+ */ + entry = &dev_cntrs[C_DC_XMIT_FLITS]; + dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, + CNTR_MODE_R, 0); + + entry = &dev_cntrs[C_DC_RCV_FLITS]; + dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, + CNTR_MODE_R, 0); + + hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx", + dd->unit, dd->last_tx, dd->last_rx); + + } else { + hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit); + } +} + +static void update_synth_timer(struct timer_list *t) +{ + struct hfi1_devdata *dd = from_timer(dd, t, synth_stats_timer); + + queue_work(dd->update_cntr_wq, &dd->update_cntr_work); + mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); +} + +#define C_MAX_NAME 16 /* 15 chars + one for /0 */ +static int init_cntrs(struct hfi1_devdata *dd) +{ + int i, rcv_ctxts, j; + size_t sz; + char *p; + char name[C_MAX_NAME]; + struct hfi1_pportdata *ppd; + const char *bit_type_32 = ",32"; + const int bit_type_32_sz = strlen(bit_type_32); + + /* set up the stats timer; the add_timer is done at the end */ + timer_setup(&dd->synth_stats_timer, update_synth_timer, 0); + + /***********************/ + /* per device counters */ + /***********************/ + + /* size names and determine how many we have*/ + dd->ndevcntrs = 0; + sz = 0; + + for (i = 0; i < DEV_CNTR_LAST; i++) { + if (dev_cntrs[i].flags & CNTR_DISABLED) { + hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name); + continue; + } + + if (dev_cntrs[i].flags & CNTR_VL) { + dev_cntrs[i].offset = dd->ndevcntrs; + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, vl_from_idx(j)); + sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + sz++; + dd->ndevcntrs++; + } + } else if (dev_cntrs[i].flags & CNTR_SDMA) { + dev_cntrs[i].offset = dd->ndevcntrs; + for (j = 0; j < dd->chip_sdma_engines; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, j); + sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + sz++; + dd->ndevcntrs++; + } + } else { + /* +1 for newline. 
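The sizing pass above has to agree byte-for-byte with the fill-in loop that follows: each emitted name contributes strlen(name), 32-bit counters add the 3-byte ",32" suffix, and every entry ends with one newline. A toy standalone version of that accounting for a made-up counter list (per-VL expansion simplified to a plain index instead of vl_from_idx()):

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    struct toy_cntr {
        const char *name;
        int is_32bit;
        int per_vl;             /* expands to name0..nameN-1 */
    };

    #define VL_COPIES 9         /* illustrative; the driver uses C_VL_COUNT */

    static size_t names_blob_size(const struct toy_cntr *c, int n)
    {
        size_t sz = 0;
        char buf[32];
        int i, j;

        for (i = 0; i < n; i++) {
            int copies = c[i].per_vl ? VL_COPIES : 1;

            for (j = 0; j < copies; j++) {
                if (c[i].per_vl)
                    snprintf(buf, sizeof(buf), "%s%d", c[i].name, j);
                else
                    snprintf(buf, sizeof(buf), "%s", c[i].name);
                sz += strlen(buf);
                if (c[i].is_32bit)
                    sz += strlen(",32");    /* ",32" suffix */
                sz++;                       /* trailing newline */
            }
        }
        return sz;
    }

    int main(void)
    {
        struct toy_cntr toy[] = {
            { "TxFlits", 0, 0 },    /* "TxFlits\n"   ->  8 bytes */
            { "RxDrop",  1, 0 },    /* "RxDrop,32\n" -> 10 bytes */
        };

        assert(names_blob_size(toy, 2) == 18);
        return 0;
    }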
*/ + sz += strlen(dev_cntrs[i].name) + 1; + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + dev_cntrs[i].offset = dd->ndevcntrs; + dd->ndevcntrs++; + } + } + + /* allocate space for the counter values */ + dd->cntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL); + if (!dd->cntrs) + goto bail; + + dd->scntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL); + if (!dd->scntrs) + goto bail; + + /* allocate space for the counter names */ + dd->cntrnameslen = sz; + dd->cntrnames = kmalloc(sz, GFP_KERNEL); + if (!dd->cntrnames) + goto bail; + + /* fill in the names */ + for (p = dd->cntrnames, i = 0; i < DEV_CNTR_LAST; i++) { + if (dev_cntrs[i].flags & CNTR_DISABLED) { + /* Nothing */ + } else if (dev_cntrs[i].flags & CNTR_VL) { + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, + vl_from_idx(j)); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } else if (dev_cntrs[i].flags & CNTR_SDMA) { + for (j = 0; j < dd->chip_sdma_engines; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, j); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } else { + memcpy(p, dev_cntrs[i].name, strlen(dev_cntrs[i].name)); + p += strlen(dev_cntrs[i].name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } + + /*********************/ + /* per port counters */ + /*********************/ + + /* + * Go through the counters for the overflows and disable the ones we + * don't need. This varies based on platform so we need to do it + * dynamically here. 
+ */ + rcv_ctxts = dd->num_rcv_contexts; + for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts; + i <= C_RCV_HDR_OVF_LAST; i++) { + port_cntrs[i].flags |= CNTR_DISABLED; + } + + /* size port counter names and determine how many we have*/ + sz = 0; + dd->nportcntrs = 0; + for (i = 0; i < PORT_CNTR_LAST; i++) { + if (port_cntrs[i].flags & CNTR_DISABLED) { + hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name); + continue; + } + + if (port_cntrs[i].flags & CNTR_VL) { + port_cntrs[i].offset = dd->nportcntrs; + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + port_cntrs[i].name, vl_from_idx(j)); + sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (port_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + sz++; + dd->nportcntrs++; + } + } else { + /* +1 for newline */ + sz += strlen(port_cntrs[i].name) + 1; + /* Add ",32" for 32-bit counters */ + if (port_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + port_cntrs[i].offset = dd->nportcntrs; + dd->nportcntrs++; + } + } + + /* allocate space for the counter names */ + dd->portcntrnameslen = sz; + dd->portcntrnames = kmalloc(sz, GFP_KERNEL); + if (!dd->portcntrnames) + goto bail; + + /* fill in port cntr names */ + for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) { + if (port_cntrs[i].flags & CNTR_DISABLED) + continue; + + if (port_cntrs[i].flags & CNTR_VL) { + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + port_cntrs[i].name, vl_from_idx(j)); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (port_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } else { + memcpy(p, port_cntrs[i].name, + strlen(port_cntrs[i].name)); + p += strlen(port_cntrs[i].name); + + /* Counter is 32 bits */ + if (port_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } + + /* allocate per port storage for counter values */ + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL); + if (!ppd->cntrs) + goto bail; + + ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL); + if (!ppd->scntrs) + goto bail; + } + + /* CPU counters need to be allocated and zeroed */ + if (init_cpu_counters(dd)) + goto bail; + + dd->update_cntr_wq = alloc_ordered_workqueue("hfi1_update_cntr_%d", + WQ_MEM_RECLAIM, dd->unit); + if (!dd->update_cntr_wq) + goto bail; + + INIT_WORK(&dd->update_cntr_work, do_update_synth_timer); + + mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); + return 0; +bail: + free_cntrs(dd); + return -ENOMEM; +} + +static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate) +{ + switch (chip_lstate) { + default: + dd_dev_err(dd, + "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n", + chip_lstate); + /* fall through */ + case LSTATE_DOWN: + return IB_PORT_DOWN; + case LSTATE_INIT: + return IB_PORT_INIT; + case LSTATE_ARMED: + return IB_PORT_ARMED; + case LSTATE_ACTIVE: + return IB_PORT_ACTIVE; + } +} + +u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate) +{ + /* look at the HFI meta-states only */ + switch (chip_pstate & 0xf0) { + default: + dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n", + chip_pstate); + /* fall through */ + case PLS_DISABLED: + return IB_PORTPHYSSTATE_DISABLED; + case PLS_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case PLS_POLLING: + return 
IB_PORTPHYSSTATE_POLLING; + case PLS_CONFIGPHY: + return IB_PORTPHYSSTATE_TRAINING; + case PLS_LINKUP: + return IB_PORTPHYSSTATE_LINKUP; + case PLS_PHYTEST: + return IB_PORTPHYSSTATE_PHY_TEST; + } +} + +/* return the OPA port logical state name */ +const char *opa_lstate_name(u32 lstate) +{ + static const char * const port_logical_names[] = { + "PORT_NOP", + "PORT_DOWN", + "PORT_INIT", + "PORT_ARMED", + "PORT_ACTIVE", + "PORT_ACTIVE_DEFER", + }; + if (lstate < ARRAY_SIZE(port_logical_names)) + return port_logical_names[lstate]; + return "unknown"; +} + +/* return the OPA port physical state name */ +const char *opa_pstate_name(u32 pstate) +{ + static const char * const port_physical_names[] = { + "PHYS_NOP", + "reserved1", + "PHYS_POLL", + "PHYS_DISABLED", + "PHYS_TRAINING", + "PHYS_LINKUP", + "PHYS_LINK_ERR_RECOVER", + "PHYS_PHY_TEST", + "reserved8", + "PHYS_OFFLINE", + "PHYS_GANGED", + "PHYS_TEST", + }; + if (pstate < ARRAY_SIZE(port_physical_names)) + return port_physical_names[pstate]; + return "unknown"; +} + +/** + * update_statusp - Update userspace status flag + * @ppd: Port data structure + * @state: port state information + * + * Actual port status is determined by the host_link_state value + * in the ppd. + * + * host_link_state MUST be updated before updating the user space + * statusp. + */ +static void update_statusp(struct hfi1_pportdata *ppd, u32 state) +{ + /* + * Set port status flags in the page mapped into userspace + * memory. Do it here to ensure a reliable state - this is + * the only function called by all state handling code. + * Always set the flags due to the fact that the cache value + * might have been changed explicitly outside of this + * function. + */ + if (ppd->statusp) { + switch (state) { + case IB_PORT_DOWN: + case IB_PORT_INIT: + *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | + HFI1_STATUS_IB_READY); + break; + case IB_PORT_ARMED: + *ppd->statusp |= HFI1_STATUS_IB_CONF; + break; + case IB_PORT_ACTIVE: + *ppd->statusp |= HFI1_STATUS_IB_READY; + break; + } + } + dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n", + opa_lstate_name(state), state); +} + +/** + * wait_logical_linkstate - wait for an IB link state change to occur + * @ppd: port device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for IB link state change to occur. + * For now, take the easy polling route. + * Returns 0 if state reached, otherwise -ETIMEDOUT. + */ +static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs) +{ + unsigned long timeout; + u32 new_state; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + new_state = chip_to_opa_lstate(ppd->dd, + read_logical_state(ppd->dd)); + if (new_state == state) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for link state 0x%x\n", + state); + return -ETIMEDOUT; + } + msleep(20); + } + + return 0; +} + +static void log_state_transition(struct hfi1_pportdata *ppd, u32 state) +{ + u32 ib_pstate = chip_to_opa_pstate(ppd->dd, state); + + dd_dev_info(ppd->dd, + "physical state changed to %s (0x%x), phy 0x%x\n", + opa_pstate_name(ib_pstate), ib_pstate, state); +} + +/* + * Read the physical hardware link state and check if it matches host + * drivers anticipated state. 
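+ * A mismatch is only reported via dd_dev_err(); callers that must block
+ * until a particular state is reached use wait_physical_linkstate() below.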
+ */ +static void log_physical_state(struct hfi1_pportdata *ppd, u32 state) +{ + u32 read_state = read_physical_state(ppd->dd); + + if (read_state == state) { + log_state_transition(ppd, state); + } else { + dd_dev_err(ppd->dd, + "anticipated phy link state 0x%x, read 0x%x\n", + state, read_state); + } +} + +/* + * wait_physical_linkstate - wait for an physical link state change to occur + * @ppd: port device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for physical link state change to occur. + * Returns 0 if state reached, otherwise -ETIMEDOUT. + */ +static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs) +{ + u32 read_state; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + read_state = read_physical_state(ppd->dd); + if (read_state == state) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for phy link state 0x%x\n", + state); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + log_state_transition(ppd, state); + return 0; +} + +/* + * wait_phys_link_offline_quiet_substates - wait for any offline substate + * @ppd: port device + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for any offline physical link + * state change to occur. + * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT. + */ +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs) +{ + u32 read_state; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + read_state = read_physical_state(ppd->dd); + if ((read_state & 0xF0) == PLS_OFFLINE) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for phy link offline.quiet substates. Read state 0x%x, %dms\n", + read_state, msecs); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + log_state_transition(ppd, read_state); + return read_state; +} + +#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ +(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) + +#define SET_STATIC_RATE_CONTROL_SMASK(r) \ +(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) + +void hfi1_init_ctxt(struct send_context *sc) +{ + if (sc) { + struct hfi1_devdata *dd = sc->dd; + u64 reg; + u8 set = (sc->type == SC_USER ? + HFI1_CAP_IS_USET(STATIC_RATE_CTRL) : + HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)); + reg = read_kctxt_csr(dd, sc->hw_context, + SEND_CTXT_CHECK_ENABLE); + if (set) + CLEAR_STATIC_RATE_CONTROL_SMASK(reg); + else + SET_STATIC_RATE_CONTROL_SMASK(reg); + write_kctxt_csr(dd, sc->hw_context, + SEND_CTXT_CHECK_ENABLE, reg); + } +} + +int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp) +{ + int ret = 0; + u64 reg; + + if (dd->icode != ICODE_RTL_SILICON) { + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info(dd, "%s: tempsense not supported by HW\n", + __func__); + return -EINVAL; + } + reg = read_csr(dd, ASIC_STS_THERM); + temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) & + ASIC_STS_THERM_CURR_TEMP_MASK); + temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) & + ASIC_STS_THERM_LO_TEMP_MASK); + temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) & + ASIC_STS_THERM_HI_TEMP_MASK); + temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) & + ASIC_STS_THERM_CRIT_TEMP_MASK); + /* triggers is a 3-bit value - 1 bit per trigger. 
*/ + temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7); + + return ret; +} + +/** + * get_int_mask - get 64 bit int mask + * @dd - the devdata + * @i - the csr (relative to CCE_INT_MASK) + * + * Returns the mask with the urgent interrupt mask + * bit clear for kernel receive contexts. + */ +static u64 get_int_mask(struct hfi1_devdata *dd, u32 i) +{ + u64 mask = U64_MAX; /* default to no change */ + + if (i >= (IS_RCVURGENT_START / 64) && i < (IS_RCVURGENT_END / 64)) { + int j = (i - (IS_RCVURGENT_START / 64)) * 64; + int k = !j ? IS_RCVURGENT_START % 64 : 0; + + if (j) + j -= IS_RCVURGENT_START % 64; + /* j = 0..dd->first_dyn_alloc_ctxt - 1,k = 0..63 */ + for (; j < dd->first_dyn_alloc_ctxt && k < 64; j++, k++) + /* convert to bit in mask and clear */ + mask &= ~BIT_ULL(k); + } + return mask; +} + +/* ========================================================================= */ + +/* + * Enable/disable chip from delivering interrupts. + */ +void set_intr_state(struct hfi1_devdata *dd, u32 enable) +{ + int i; + + /* + * In HFI, the mask needs to be 1 to allow interrupts. + */ + if (enable) { + /* enable all interrupts but urgent on kernel contexts */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) { + u64 mask = get_int_mask(dd, i); + + write_csr(dd, CCE_INT_MASK + (8 * i), mask); + } + + init_qsfp_int(dd); + } else { + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_MASK + (8 * i), 0ull); + } +} + +/* + * Clear all interrupt sources on the chip. + */ +static void clear_all_interrupts(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_CLEAR + (8 * i), ~(u64)0); + + write_csr(dd, CCE_ERR_CLEAR, ~(u64)0); + write_csr(dd, MISC_ERR_CLEAR, ~(u64)0); + write_csr(dd, RCV_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0); + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0); + for (i = 0; i < dd->chip_sdma_engines; i++) + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0); + + write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0); + write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0); + write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0); +} + +/* Move to pcie.c? */ +static void disable_intx(struct pci_dev *pdev) +{ + pci_intx(pdev, 0); +} + +/** + * hfi1_clean_up_interrupts() - Free all IRQ resources + * @dd: valid device data data structure + * + * Free the MSI or INTx IRQs and assoicated PCI resources, + * if they have been allocated. + */ +void hfi1_clean_up_interrupts(struct hfi1_devdata *dd) +{ + int i; + + /* remove irqs - must happen before disabling/turning off */ + if (dd->num_msix_entries) { + /* MSI-X */ + struct hfi1_msix_entry *me = dd->msix_entries; + + for (i = 0; i < dd->num_msix_entries; i++, me++) { + if (!me->arg) /* => no irq, no affinity */ + continue; + hfi1_put_irq_affinity(dd, me); + pci_free_irq(dd->pcidev, i, me->arg); + } + + /* clean structures */ + kfree(dd->msix_entries); + dd->msix_entries = NULL; + dd->num_msix_entries = 0; + } else { + /* INTx */ + if (dd->requested_intx_irq) { + pci_free_irq(dd->pcidev, 0, dd); + dd->requested_intx_irq = 0; + } + disable_intx(dd->pcidev); + } + + pci_free_irq_vectors(dd->pcidev); +} + +/* + * Remap the interrupt source from the general handler to the given MSI-X + * interrupt. 
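+ * Two steps: clear the source's bit in the general interrupt mask
+ * (gi_mask) so the general handler no longer claims it, then program the
+ * source's 8-bit entry in CCE_INT_MAP to point at the given MSI-X vector.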
+ */ +static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) +{ + u64 reg; + int m, n; + + /* clear from the handled mask of the general interrupt */ + m = isrc / 64; + n = isrc % 64; + if (likely(m < CCE_NUM_INT_CSRS)) { + dd->gi_mask[m] &= ~((u64)1 << n); + } else { + dd_dev_err(dd, "remap interrupt err\n"); + return; + } + + /* direct the chip source to the given MSI-X interrupt */ + m = isrc / 8; + n = isrc % 8; + reg = read_csr(dd, CCE_INT_MAP + (8 * m)); + reg &= ~((u64)0xff << (8 * n)); + reg |= ((u64)msix_intr & 0xff) << (8 * n); + write_csr(dd, CCE_INT_MAP + (8 * m), reg); +} + +static void remap_sdma_interrupts(struct hfi1_devdata *dd, + int engine, int msix_intr) +{ + /* + * SDMA engine interrupt sources grouped by type, rather than + * engine. Per-engine interrupts are as follows: + * SDMA + * SDMAProgress + * SDMAIdle + */ + remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine, + msix_intr); + remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine, + msix_intr); + remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine, + msix_intr); +} + +static int request_intx_irq(struct hfi1_devdata *dd) +{ + int ret; + + ret = pci_request_irq(dd->pcidev, 0, general_interrupt, NULL, dd, + DRIVER_NAME "_%d", dd->unit); + if (ret) + dd_dev_err(dd, "unable to request INTx interrupt, err %d\n", + ret); + else + dd->requested_intx_irq = 1; + return ret; +} + +static int request_msix_irqs(struct hfi1_devdata *dd) +{ + int first_general, last_general; + int first_sdma, last_sdma; + int first_rx, last_rx; + int i, ret = 0; + + /* calculate the ranges we are going to use */ + first_general = 0; + last_general = first_general + 1; + first_sdma = last_general; + last_sdma = first_sdma + dd->num_sdma; + first_rx = last_sdma; + last_rx = first_rx + dd->n_krcv_queues + dd->num_vnic_contexts; + + /* VNIC MSIx interrupts get mapped when VNIC contexts are created */ + dd->first_dyn_msix_idx = first_rx + dd->n_krcv_queues; + + /* + * Sanity check - the code expects all SDMA chip source + * interrupts to be in the same CSR, starting at bit 0. Verify + * that this is true by checking the bit location of the start. + */ + BUILD_BUG_ON(IS_SDMA_START % 64); + + for (i = 0; i < dd->num_msix_entries; i++) { + struct hfi1_msix_entry *me = &dd->msix_entries[i]; + const char *err_info; + irq_handler_t handler; + irq_handler_t thread = NULL; + void *arg = NULL; + int idx; + struct hfi1_ctxtdata *rcd = NULL; + struct sdma_engine *sde = NULL; + char name[MAX_NAME_SIZE]; + + /* obtain the arguments to pci_request_irq */ + if (first_general <= i && i < last_general) { + idx = i - first_general; + handler = general_interrupt; + arg = dd; + snprintf(name, sizeof(name), + DRIVER_NAME "_%d", dd->unit); + err_info = "general"; + me->type = IRQ_GENERAL; + } else if (first_sdma <= i && i < last_sdma) { + idx = i - first_sdma; + sde = &dd->per_sdma[idx]; + handler = sdma_interrupt; + arg = sde; + snprintf(name, sizeof(name), + DRIVER_NAME "_%d sdma%d", dd->unit, idx); + err_info = "sdma"; + remap_sdma_interrupts(dd, idx, i); + me->type = IRQ_SDMA; + } else if (first_rx <= i && i < last_rx) { + idx = i - first_rx; + rcd = hfi1_rcd_get_by_index_safe(dd, idx); + if (rcd) { + /* + * Set the interrupt register and mask for this + * context's interrupt. 
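+ * ireg selects which 64-bit interrupt CSR carries this context's
+ * RcvAvail source; imask is the bit for that source within the CSR.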
+ */ + rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; + rcd->imask = ((u64)1) << + ((IS_RCVAVAIL_START + idx) % 64); + handler = receive_context_interrupt; + thread = receive_context_thread; + arg = rcd; + snprintf(name, sizeof(name), + DRIVER_NAME "_%d kctxt%d", + dd->unit, idx); + err_info = "receive context"; + remap_intr(dd, IS_RCVAVAIL_START + idx, i); + me->type = IRQ_RCVCTXT; + rcd->msix_intr = i; + hfi1_rcd_put(rcd); + } + } else { + /* not in our expected range - complain, then + * ignore it + */ + dd_dev_err(dd, + "Unexpected extra MSI-X interrupt %d\n", i); + continue; + } + /* no argument, no interrupt */ + if (!arg) + continue; + /* make sure the name is terminated */ + name[sizeof(name) - 1] = 0; + me->irq = pci_irq_vector(dd->pcidev, i); + ret = pci_request_irq(dd->pcidev, i, handler, thread, arg, + name); + if (ret) { + dd_dev_err(dd, + "unable to allocate %s interrupt, irq %d, index %d, err %d\n", + err_info, me->irq, idx, ret); + return ret; + } + /* + * assign arg after pci_request_irq call, so it will be + * cleaned up + */ + me->arg = arg; + + ret = hfi1_get_irq_affinity(dd, me); + if (ret) + dd_dev_err(dd, "unable to pin IRQ %d\n", ret); + } + + return ret; +} + +void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd) +{ + int i; + + if (!dd->num_msix_entries) { + synchronize_irq(pci_irq_vector(dd->pcidev, 0)); + return; + } + + for (i = 0; i < dd->vnic.num_ctxt; i++) { + struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; + struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; + + synchronize_irq(me->irq); + } +} + +void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; + + if (!me->arg) /* => no irq, no affinity */ + return; + + hfi1_put_irq_affinity(dd, me); + pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); + + me->arg = NULL; +} + +void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + struct hfi1_msix_entry *me; + int idx = rcd->ctxt; + void *arg = rcd; + int ret; + + rcd->msix_intr = dd->vnic.msix_idx++; + me = &dd->msix_entries[rcd->msix_intr]; + + /* + * Set the interrupt register and mask for this + * context's interrupt. + */ + rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; + rcd->imask = ((u64)1) << + ((IS_RCVAVAIL_START + idx) % 64); + me->type = IRQ_RCVCTXT; + me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr); + remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr); + + ret = pci_request_irq(dd->pcidev, rcd->msix_intr, + receive_context_interrupt, + receive_context_thread, arg, + DRIVER_NAME "_%d kctxt%d", dd->unit, idx); + if (ret) { + dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n", + me->irq, idx, ret); + return; + } + /* + * assign arg after pci_request_irq call, so it will be + * cleaned up + */ + me->arg = arg; + + ret = hfi1_get_irq_affinity(dd, me); + if (ret) { + dd_dev_err(dd, + "unable to pin IRQ %d\n", ret); + pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); + } +} + +/* + * Set the general handler to accept all interrupts, remap all + * chip interrupts back to MSI-X 0. 
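+ * This is the known starting point set_up_interrupts() establishes before
+ * request_intx_irq() or request_msix_irqs() redistributes the sources.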
+ */ +static void reset_interrupts(struct hfi1_devdata *dd) +{ + int i; + + /* all interrupts handled by the general handler */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + dd->gi_mask[i] = ~(u64)0; + + /* all chip interrupts map to MSI-X 0 */ + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP + (8 * i), 0); +} + +static int set_up_interrupts(struct hfi1_devdata *dd) +{ + u32 total; + int ret, request; + int single_interrupt = 0; /* we expect to have all the interrupts */ + + /* + * Interrupt count: + * 1 general, "slow path" interrupt (includes the SDMA engines + * slow source, SDMACleanupDone) + * N interrupts - one per used SDMA engine + * M interrupt - one per kernel receive context + * V interrupt - one for each VNIC context + */ + total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts; + + /* ask for MSI-X interrupts */ + request = request_msix(dd, total); + if (request < 0) { + ret = request; + goto fail; + } else if (request == 0) { + /* using INTx */ + /* dd->num_msix_entries already zero */ + single_interrupt = 1; + dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n"); + } else if (request < total) { + /* using MSI-X, with reduced interrupts */ + dd_dev_err(dd, "reduced interrupt found, wanted %u, got %u\n", + total, request); + ret = -EINVAL; + goto fail; + } else { + dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries), + GFP_KERNEL); + if (!dd->msix_entries) { + ret = -ENOMEM; + goto fail; + } + /* using MSI-X */ + dd->num_msix_entries = total; + dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); + } + + /* mask all interrupts */ + set_intr_state(dd, 0); + /* clear all pending interrupts */ + clear_all_interrupts(dd); + + /* reset general handler mask, chip MSI-X mappings */ + reset_interrupts(dd); + + if (single_interrupt) + ret = request_intx_irq(dd); + else + ret = request_msix_irqs(dd); + if (ret) + goto fail; + + return 0; + +fail: + hfi1_clean_up_interrupts(dd); + return ret; +} + +/* + * Set up context values in dd. Sets: + * + * num_rcv_contexts - number of contexts being used + * n_krcv_queues - number of kernel contexts + * first_dyn_alloc_ctxt - first dynamically allocated context + * in array of contexts + * freectxts - number of free user contexts + * num_send_contexts - number of PIO send contexts being used + * num_vnic_contexts - number of contexts reserved for VNIC + */ +static int set_up_context_variables(struct hfi1_devdata *dd) +{ + unsigned long num_kernel_contexts; + u16 num_vnic_contexts = HFI1_NUM_VNIC_CTXT; + int total_contexts; + int ret; + unsigned ngroups; + int qos_rmt_count; + int user_rmt_reduced; + u32 n_usr_ctxts; + + /* + * Kernel receive contexts: + * - Context 0 - control context (VL15/multicast/error) + * - Context 1 - first kernel context + * - Context 2 - second kernel context + * ... + */ + if (n_krcvqs) + /* + * n_krcvqs is the sum of module parameter kernel receive + * contexts, krcvqs[]. It does not include the control + * context, so add that. + */ + num_kernel_contexts = n_krcvqs + 1; + else + num_kernel_contexts = DEFAULT_KRCVQS + 1; + /* + * Every kernel receive context needs an ACK send context. 
+ * one send context is allocated for each VL{0-7} and VL15 + */ + if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) { + dd_dev_err(dd, + "Reducing # kernel rcv contexts to: %d, from %lu\n", + (int)(dd->chip_send_contexts - num_vls - 1), + num_kernel_contexts); + num_kernel_contexts = dd->chip_send_contexts - num_vls - 1; + } + + /* Accommodate VNIC contexts if possible */ + if ((num_kernel_contexts + num_vnic_contexts) > dd->chip_rcv_contexts) { + dd_dev_err(dd, "No receive contexts available for VNIC\n"); + num_vnic_contexts = 0; + } + total_contexts = num_kernel_contexts + num_vnic_contexts; + + /* + * User contexts: + * - default to 1 user context per real (non-HT) CPU core if + * num_user_contexts is negative + */ + if (num_user_contexts < 0) + n_usr_ctxts = cpumask_weight(&node_affinity.real_cpu_mask); + else + n_usr_ctxts = num_user_contexts; + /* + * Adjust the counts given a global max. + */ + if (total_contexts + n_usr_ctxts > dd->chip_rcv_contexts) { + dd_dev_err(dd, + "Reducing # user receive contexts to: %d, from %u\n", + (int)(dd->chip_rcv_contexts - total_contexts), + n_usr_ctxts); + /* recalculate */ + n_usr_ctxts = dd->chip_rcv_contexts - total_contexts; + } + + /* each user context requires an entry in the RMT */ + qos_rmt_count = qos_rmt_entries(dd, NULL, NULL); + if (qos_rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) { + user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count; + dd_dev_err(dd, + "RMT size is reducing the number of user receive contexts from %u to %d\n", + n_usr_ctxts, + user_rmt_reduced); + /* recalculate */ + n_usr_ctxts = user_rmt_reduced; + } + + total_contexts += n_usr_ctxts; + + /* the first N are kernel contexts, the rest are user/vnic contexts */ + dd->num_rcv_contexts = total_contexts; + dd->n_krcv_queues = num_kernel_contexts; + dd->first_dyn_alloc_ctxt = num_kernel_contexts; + dd->num_vnic_contexts = num_vnic_contexts; + dd->num_user_contexts = n_usr_ctxts; + dd->freectxts = n_usr_ctxts; + dd_dev_info(dd, + "rcv contexts: chip %d, used %d (kernel %d, vnic %u, user %u)\n", + (int)dd->chip_rcv_contexts, + (int)dd->num_rcv_contexts, + (int)dd->n_krcv_queues, + dd->num_vnic_contexts, + dd->num_user_contexts); + + /* + * Receive array allocation: + * All RcvArray entries are divided into groups of 8. This + * is required by the hardware and will speed up writes to + * consecutive entries by using write-combining of the entire + * cacheline. + * + * The number of groups are evenly divided among all contexts. + * any left over groups will be given to the first N user + * contexts. 
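+ * (computed below as rcv_entries.ngroups groups per context, with the
+ * remainder recorded in rcv_entries.nctxt_extra).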
+ */ + dd->rcv_entries.group_size = RCV_INCREMENT; + ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size; + dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts; + dd->rcv_entries.nctxt_extra = ngroups - + (dd->num_rcv_contexts * dd->rcv_entries.ngroups); + dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n", + dd->rcv_entries.ngroups, + dd->rcv_entries.nctxt_extra); + if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size > + MAX_EAGER_ENTRIES * 2) { + dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) / + dd->rcv_entries.group_size; + dd_dev_info(dd, + "RcvArray group count too high, change to %u\n", + dd->rcv_entries.ngroups); + dd->rcv_entries.nctxt_extra = 0; + } + /* + * PIO send contexts + */ + ret = init_sc_pools_and_sizes(dd); + if (ret >= 0) { /* success */ + dd->num_send_contexts = ret; + dd_dev_info( + dd, + "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n", + dd->chip_send_contexts, + dd->num_send_contexts, + dd->sc_sizes[SC_KERNEL].count, + dd->sc_sizes[SC_ACK].count, + dd->sc_sizes[SC_USER].count, + dd->sc_sizes[SC_VL15].count); + ret = 0; /* success */ + } + + return ret; +} + +/* + * Set the device/port partition key table. The MAD code + * will ensure that, at least, the partial management + * partition key is present in the table. + */ +static void set_partition_keys(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg = 0; + int i; + + dd_dev_info(dd, "Setting partition keys\n"); + for (i = 0; i < hfi1_get_npkeys(dd); i++) { + reg |= (ppd->pkeys[i] & + RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) << + ((i % 4) * + RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT); + /* Each register holds 4 PKey values. */ + if ((i % 4) == 3) { + write_csr(dd, RCV_PARTITION_KEY + + ((i - 3) * 2), reg); + reg = 0; + } + } + + /* Always enable HW pkeys check when pkeys table is set */ + add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK); +} + +/* + * These CSRs and memories are uninitialized on reset and must be + * written before reading to set the ECC/parity bits. + * + * NOTE: All user context CSRs that are not mmaped write-only + * (e.g. the TID flows) must be initialized even if the driver never + * reads them. + */ +static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) +{ + int i, j; + + /* CceIntMap */ + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP + (8 * i), 0); + + /* SendCtxtCreditReturnAddr */ + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0); + + /* PIO Send buffers */ + /* SDMA Send buffers */ + /* + * These are not normally read, and (presently) have no method + * to be read, so are not pre-initialized + */ + + /* RcvHdrAddr */ + /* RcvHdrTailAddr */ + /* RcvTidFlowTable */ + for (i = 0; i < dd->chip_rcv_contexts; i++) { + write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0); + write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0); + for (j = 0; j < RXE_NUM_TID_FLOWS; j++) + write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), 0); + } + + /* RcvArray */ + for (i = 0; i < dd->chip_rcv_array_count; i++) + hfi1_put_tid(dd, i, PT_INVALID_FLUSH, 0, 0); + + /* RcvQPMapTable */ + for (i = 0; i < 32; i++) + write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0); +} + +/* + * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus. + */ +static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits, + u64 ctrl_bits) +{ + unsigned long timeout; + u64 reg; + + /* is the condition present? 
*/ + reg = read_csr(dd, CCE_STATUS); + if ((reg & status_bits) == 0) + return; + + /* clear the condition */ + write_csr(dd, CCE_CTRL, ctrl_bits); + + /* wait for the condition to clear */ + timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT); + while (1) { + reg = read_csr(dd, CCE_STATUS); + if ((reg & status_bits) == 0) + return; + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n", + status_bits, reg & status_bits); + return; + } + udelay(1); + } +} + +/* set CCE CSRs to chip reset defaults */ +static void reset_cce_csrs(struct hfi1_devdata *dd) +{ + int i; + + /* CCE_REVISION read-only */ + /* CCE_REVISION2 read-only */ + /* CCE_CTRL - bits clear automatically */ + /* CCE_STATUS read-only, use CceCtrl to clear */ + clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK); + clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK); + clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK); + for (i = 0; i < CCE_NUM_SCRATCH; i++) + write_csr(dd, CCE_SCRATCH + (8 * i), 0); + /* CCE_ERR_STATUS read-only */ + write_csr(dd, CCE_ERR_MASK, 0); + write_csr(dd, CCE_ERR_CLEAR, ~0ull); + /* CCE_ERR_FORCE leave alone */ + for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++) + write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0); + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR); + /* CCE_PCIE_CTRL leave alone */ + for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) { + write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0); + write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i), + CCE_MSIX_TABLE_UPPER_RESETCSR); + } + for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) { + /* CCE_MSIX_PBA read-only */ + write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull); + write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull); + } + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP, 0); + for (i = 0; i < CCE_NUM_INT_CSRS; i++) { + /* CCE_INT_STATUS read-only */ + write_csr(dd, CCE_INT_MASK + (8 * i), 0); + write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull); + /* CCE_INT_FORCE leave alone */ + /* CCE_INT_BLOCKED read-only */ + } + for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++) + write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0); +} + +/* set MISC CSRs to chip reset defaults */ +static void reset_misc_csrs(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < 32; i++) { + write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0); + write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0); + write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0); + } + /* + * MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can + * only be written 128-byte chunks + */ + /* init RSA engine to clear lingering errors */ + write_csr(dd, MISC_CFG_RSA_CMD, 1); + write_csr(dd, MISC_CFG_RSA_MU, 0); + write_csr(dd, MISC_CFG_FW_CTRL, 0); + /* MISC_STS_8051_DIGEST read-only */ + /* MISC_STS_SBM_DIGEST read-only */ + /* MISC_STS_PCIE_DIGEST read-only */ + /* MISC_STS_FAB_DIGEST read-only */ + /* MISC_ERR_STATUS read-only */ + write_csr(dd, MISC_ERR_MASK, 0); + write_csr(dd, MISC_ERR_CLEAR, ~0ull); + /* MISC_ERR_FORCE leave alone */ +} + +/* set TXE CSRs to chip reset defaults */ +static void reset_txe_csrs(struct hfi1_devdata *dd) +{ + int i; + + /* + * TXE Kernel CSRs + */ + write_csr(dd, SEND_CTRL, 0); + __cm_reset(dd, 0); /* reset CM internal state */ + /* SEND_CONTEXTS read-only */ + /* SEND_DMA_ENGINES read-only */ + /* SEND_PIO_MEM_SIZE read-only */ + /* SEND_DMA_MEM_SIZE read-only */ + write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0); + pio_reset_all(dd); /* SEND_PIO_INIT_CTXT */ + /* 
SEND_PIO_ERR_STATUS read-only */ + write_csr(dd, SEND_PIO_ERR_MASK, 0); + write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull); + /* SEND_PIO_ERR_FORCE leave alone */ + /* SEND_DMA_ERR_STATUS read-only */ + write_csr(dd, SEND_DMA_ERR_MASK, 0); + write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull); + /* SEND_DMA_ERR_FORCE leave alone */ + /* SEND_EGRESS_ERR_STATUS read-only */ + write_csr(dd, SEND_EGRESS_ERR_MASK, 0); + write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull); + /* SEND_EGRESS_ERR_FORCE leave alone */ + write_csr(dd, SEND_BTH_QP, 0); + write_csr(dd, SEND_STATIC_RATE_CONTROL, 0); + write_csr(dd, SEND_SC2VLT0, 0); + write_csr(dd, SEND_SC2VLT1, 0); + write_csr(dd, SEND_SC2VLT2, 0); + write_csr(dd, SEND_SC2VLT3, 0); + write_csr(dd, SEND_LEN_CHECK0, 0); + write_csr(dd, SEND_LEN_CHECK1, 0); + /* SEND_ERR_STATUS read-only */ + write_csr(dd, SEND_ERR_MASK, 0); + write_csr(dd, SEND_ERR_CLEAR, ~0ull); + /* SEND_ERR_FORCE read-only */ + for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++) + write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0); + for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++) + write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0); + for (i = 0; i < dd->chip_send_contexts / NUM_CONTEXTS_PER_SET; i++) + write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0); + for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++) + write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0); + for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++) + write_csr(dd, SEND_COUNTER_ARRAY64 + (8 * i), 0); + write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR); + write_csr(dd, SEND_CM_GLOBAL_CREDIT, SEND_CM_GLOBAL_CREDIT_RESETCSR); + /* SEND_CM_CREDIT_USED_STATUS read-only */ + write_csr(dd, SEND_CM_TIMER_CTRL, 0); + write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0); + write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0); + write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0); + write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0); + for (i = 0; i < TXE_NUM_DATA_VL; i++) + write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0); + write_csr(dd, SEND_CM_CREDIT_VL15, 0); + /* SEND_CM_CREDIT_USED_VL read-only */ + /* SEND_CM_CREDIT_USED_VL15 read-only */ + /* SEND_EGRESS_CTXT_STATUS read-only */ + /* SEND_EGRESS_SEND_DMA_STATUS read-only */ + write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull); + /* SEND_EGRESS_ERR_INFO read-only */ + /* SEND_EGRESS_ERR_SOURCE read-only */ + + /* + * TXE Per-Context CSRs + */ + for (i = 0; i < dd->chip_send_contexts; i++) { + write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0); + write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0); + write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0); + } + + /* + * TXE Per-SDMA CSRs + */ + for (i = 0; i < dd->chip_sdma_engines; i++) { + write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0); + /* SEND_DMA_STATUS read-only */ + write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0); + write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0); + write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0); + /* SEND_DMA_HEAD read-only */ + write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0); + write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0); + /* SEND_DMA_IDLE_CNT read-only */ + write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0); + write_kctxt_csr(dd, i, 
SEND_DMA_DESC_CNT, 0); + /* SEND_DMA_DESC_FETCHED_CNT read-only */ + /* SEND_DMA_ENG_ERR_STATUS read-only */ + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0); + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull); + /* SEND_DMA_ENG_ERR_FORCE leave alone */ + write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0); + write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0); + } +} + +/* + * Expect on entry: + * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0 + */ +static void init_rbufs(struct hfi1_devdata *dd) +{ + u64 reg; + int count; + + /* + * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are + * clear. + */ + count = 0; + while (1) { + reg = read_csr(dd, RCV_STATUS); + if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK + | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0) + break; + /* + * Give up after 1ms - maximum wait time. + * + * RBuf size is 136KiB. Slowest possible is PCIe Gen1 x1 at + * 250MB/s bandwidth. Lower rate to 66% for overhead to get: + * 136 KB / (66% * 250MB/s) = 844us + */ + if (count++ > 500) { + dd_dev_err(dd, + "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n", + __func__, reg); + break; + } + udelay(2); /* do not busy-wait the CSR */ + } + + /* start the init - expect RcvCtrl to be 0 */ + write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK); + + /* + * Read to force the write of Rcvtrl.RxRbufInit. There is a brief + * period after the write before RcvStatus.RxRbufInitDone is valid. + * The delay in the first run through the loop below is sufficient and + * required before the first read of RcvStatus.RxRbufInintDone. 
+ */ + read_csr(dd, RCV_CTRL); + + /* wait for the init to finish */ + count = 0; + while (1) { + /* delay is required first time through - see above */ + udelay(2); /* do not busy-wait the CSR */ + reg = read_csr(dd, RCV_STATUS); + if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK)) + break; + + /* give up after 100us - slowest possible at 33MHz is 73us */ + if (count++ > 50) { + dd_dev_err(dd, + "%s: RcvStatus.RxRbufInit not set, continuing\n", + __func__); + break; + } + } +} + +/* set RXE CSRs to chip reset defaults */ +static void reset_rxe_csrs(struct hfi1_devdata *dd) +{ + int i, j; + + /* + * RXE Kernel CSRs + */ + write_csr(dd, RCV_CTRL, 0); + init_rbufs(dd); + /* RCV_STATUS read-only */ + /* RCV_CONTEXTS read-only */ + /* RCV_ARRAY_CNT read-only */ + /* RCV_BUF_SIZE read-only */ + write_csr(dd, RCV_BTH_QP, 0); + write_csr(dd, RCV_MULTICAST, 0); + write_csr(dd, RCV_BYPASS, 0); + write_csr(dd, RCV_VL15, 0); + /* this is a clear-down */ + write_csr(dd, RCV_ERR_INFO, + RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); + /* RCV_ERR_STATUS read-only */ + write_csr(dd, RCV_ERR_MASK, 0); + write_csr(dd, RCV_ERR_CLEAR, ~0ull); + /* RCV_ERR_FORCE leave alone */ + for (i = 0; i < 32; i++) + write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0); + for (i = 0; i < 4; i++) + write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0); + for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++) + write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0); + for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++) + write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0); + for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) + clear_rsm_rule(dd, i); + for (i = 0; i < 32; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0); + + /* + * RXE Kernel and User Per-Context CSRs + */ + for (i = 0; i < dd->chip_rcv_contexts; i++) { + /* kernel */ + write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0); + /* RCV_CTXT_STATUS read-only */ + write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0); + write_kctxt_csr(dd, i, RCV_TID_CTRL, 0); + write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0); + write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0); + write_kctxt_csr(dd, i, RCV_HDR_CNT, 0); + write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0); + write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0); + write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0); + write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0); + write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0); + + /* user */ + /* RCV_HDR_TAIL read-only */ + write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0); + /* RCV_EGR_INDEX_TAIL read-only */ + write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0); + /* RCV_EGR_OFFSET_TAIL read-only */ + for (j = 0; j < RXE_NUM_TID_FLOWS; j++) { + write_uctxt_csr(dd, i, + RCV_TID_FLOW_TABLE + (8 * j), 0); + } + } +} + +/* + * Set sc2vl tables. 
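+ * (the send-side SC-to-VL mapping used by the HFI for transmitted packets
+ * and the DC table used for received packets).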
+ * + * They power on to zeros, so to avoid send context errors + * they need to be set: + * + * SC 0-7 -> VL 0-7 (respectively) + * SC 15 -> VL 15 + * otherwise + * -> VL 0 + */ +static void init_sc2vl_tables(struct hfi1_devdata *dd) +{ + int i; + /* init per architecture spec, constrained by hardware capability */ + + /* HFI maps sent packets */ + write_csr(dd, SEND_SC2VLT0, SC2VL_VAL( + 0, + 0, 0, 1, 1, + 2, 2, 3, 3, + 4, 4, 5, 5, + 6, 6, 7, 7)); + write_csr(dd, SEND_SC2VLT1, SC2VL_VAL( + 1, + 8, 0, 9, 0, + 10, 0, 11, 0, + 12, 0, 13, 0, + 14, 0, 15, 15)); + write_csr(dd, SEND_SC2VLT2, SC2VL_VAL( + 2, + 16, 0, 17, 0, + 18, 0, 19, 0, + 20, 0, 21, 0, + 22, 0, 23, 0)); + write_csr(dd, SEND_SC2VLT3, SC2VL_VAL( + 3, + 24, 0, 25, 0, + 26, 0, 27, 0, + 28, 0, 29, 0, + 30, 0, 31, 0)); + + /* DC maps received packets */ + write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL( + 15_0, + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15)); + write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL( + 31_16, + 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0)); + + /* initialize the cached sc2vl values consistently with h/w */ + for (i = 0; i < 32; i++) { + if (i < 8 || i == 15) + *((u8 *)(dd->sc2vl) + i) = (u8)i; + else + *((u8 *)(dd->sc2vl) + i) = 0; + } +} + +/* + * Read chip sizes and then reset parts to sane, disabled, values. We cannot + * depend on the chip going through a power-on reset - a driver may be loaded + * and unloaded many times. + * + * Do not write any CSR values to the chip in this routine - there may be + * a reset following the (possible) FLR in this routine. + * + */ +static int init_chip(struct hfi1_devdata *dd) +{ + int i; + int ret = 0; + + /* + * Put the HFI CSRs in a known state. + * Combine this with a DC reset. + * + * Stop the device from doing anything while we do a + * reset. We know there are no other active users of + * the device since we are now in charge. Turn off + * off all outbound and inbound traffic and make sure + * the device does not generate any interrupts. + */ + + /* disable send contexts and SDMA engines */ + write_csr(dd, SEND_CTRL, 0); + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0); + for (i = 0; i < dd->chip_sdma_engines; i++) + write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0); + /* disable port (turn off RXE inbound traffic) and contexts */ + write_csr(dd, RCV_CTRL, 0); + for (i = 0; i < dd->chip_rcv_contexts; i++) + write_csr(dd, RCV_CTXT_CTRL, 0); + /* mask all interrupt sources */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_MASK + (8 * i), 0ull); + + /* + * DC Reset: do a full DC reset before the register clear. + * A recommended length of time to hold is one CSR read, + * so reread the CceDcCtrl. Then, hold the DC in reset + * across the clear. + */ + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK); + (void)read_csr(dd, CCE_DC_CTRL); + + if (use_flr) { + /* + * A FLR will reset the SPC core and part of the PCIe. + * The parts that need to be restored have already been + * saved. 
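+ * (the PCI command register and BARs; restore_pci_variables() writes
+ * them back below).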
+ */ + dd_dev_info(dd, "Resetting CSRs with FLR\n"); + + /* do the FLR, the DC reset will remain */ + pcie_flr(dd->pcidev); + + /* restore command and BARs */ + ret = restore_pci_variables(dd); + if (ret) { + dd_dev_err(dd, "%s: Could not restore PCI variables\n", + __func__); + return ret; + } + + if (is_ax(dd)) { + dd_dev_info(dd, "Resetting CSRs with FLR\n"); + pcie_flr(dd->pcidev); + ret = restore_pci_variables(dd); + if (ret) { + dd_dev_err(dd, "%s: Could not restore PCI variables\n", + __func__); + return ret; + } + } + } else { + dd_dev_info(dd, "Resetting CSRs with writes\n"); + reset_cce_csrs(dd); + reset_txe_csrs(dd); + reset_rxe_csrs(dd); + reset_misc_csrs(dd); + } + /* clear the DC reset */ + write_csr(dd, CCE_DC_CTRL, 0); + + /* Set the LED off */ + setextled(dd, 0); + + /* + * Clear the QSFP reset. + * An FLR enforces a 0 on all out pins. The driver does not touch + * ASIC_QSFPn_OUT otherwise. This leaves RESET_N low and + * anything plugged constantly in reset, if it pays attention + * to RESET_N. + * Prime examples of this are optical cables. Set all pins high. + * I2CCLK and I2CDAT will change per direction, and INT_N and + * MODPRS_N are input only and their value is ignored. + */ + write_csr(dd, ASIC_QSFP1_OUT, 0x1f); + write_csr(dd, ASIC_QSFP2_OUT, 0x1f); + init_chip_resources(dd); + return ret; +} + +static void init_early_variables(struct hfi1_devdata *dd) +{ + int i; + + /* assign link credit variables */ + dd->vau = CM_VAU; + dd->link_credits = CM_GLOBAL_CREDITS; + if (is_ax(dd)) + dd->link_credits--; + dd->vcu = cu_to_vcu(hfi1_cu); + /* enough room for 8 MAD packets plus header - 17K */ + dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau); + if (dd->vl15_init > dd->link_credits) + dd->vl15_init = dd->link_credits; + + write_uninitialized_csrs_and_memories(dd); + + if (HFI1_CAP_IS_KSET(PKEY_CHECK)) + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_pportdata *ppd = &dd->pport[i]; + + set_partition_keys(ppd); + } + init_sc2vl_tables(dd); +} + +static void init_kdeth_qp(struct hfi1_devdata *dd) +{ + /* user changed the KDETH_QP */ + if (kdeth_qp != 0 && kdeth_qp >= 0xff) { + /* out of range or illegal value */ + dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring"); + kdeth_qp = 0; + } + if (kdeth_qp == 0) /* not set, or failed range check */ + kdeth_qp = DEFAULT_KDETH_QP; + + write_csr(dd, SEND_BTH_QP, + (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) << + SEND_BTH_QP_KDETH_QP_SHIFT); + + write_csr(dd, RCV_BTH_QP, + (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) << + RCV_BTH_QP_KDETH_QP_SHIFT); +} + +/** + * init_qpmap_table + * @dd - device data + * @first_ctxt - first context + * @last_ctxt - first context + * + * This return sets the qpn mapping table that + * is indexed by qpn[8:1]. + * + * The routine will round robin the 256 settings + * from first_ctxt to last_ctxt. + * + * The first/last looks ahead to having specialized + * receive contexts for mgmt and bypass. Normal + * verbs traffic will assumed to be on a range + * of receive contexts. 
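+ * For example, with first_ctxt == last_ctxt every qpn[8:1] value maps to
+ * that single context; for a span of k contexts, table entry i maps to
+ * first_ctxt + (i % k).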
+ */ +static void init_qpmap_table(struct hfi1_devdata *dd, + u32 first_ctxt, + u32 last_ctxt) +{ + u64 reg = 0; + u64 regno = RCV_QP_MAP_TABLE; + int i; + u64 ctxt = first_ctxt; + + for (i = 0; i < 256; i++) { + reg |= ctxt << (8 * (i % 8)); + ctxt++; + if (ctxt > last_ctxt) + ctxt = first_ctxt; + if (i % 8 == 7) { + write_csr(dd, regno, reg); + reg = 0; + regno += 8; + } + } + + add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK + | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK); +} + +struct rsm_map_table { + u64 map[NUM_MAP_REGS]; + unsigned int used; +}; + +struct rsm_rule_data { + u8 offset; + u8 pkt_type; + u32 field1_off; + u32 field2_off; + u32 index1_off; + u32 index1_width; + u32 index2_off; + u32 index2_width; + u32 mask1; + u32 value1; + u32 mask2; + u32 value2; +}; + +/* + * Return an initialized RMT map table for users to fill in. OK if it + * returns NULL, indicating no table. + */ +static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd) +{ + struct rsm_map_table *rmt; + u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */ + + rmt = kmalloc(sizeof(*rmt), GFP_KERNEL); + if (rmt) { + memset(rmt->map, rxcontext, sizeof(rmt->map)); + rmt->used = 0; + } + + return rmt; +} + +/* + * Write the final RMT map table to the chip and free the table. OK if + * table is NULL. + */ +static void complete_rsm_map_table(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + int i; + + if (rmt) { + /* write table to chip */ + for (i = 0; i < NUM_MAP_REGS; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]); + + /* enable RSM */ + add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); + } +} + +/* + * Add a receive side mapping rule. + */ +static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index, + struct rsm_rule_data *rrd) +{ + write_csr(dd, RCV_RSM_CFG + (8 * rule_index), + (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT | + 1ull << rule_index | /* enable bit */ + (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT); + write_csr(dd, RCV_RSM_SELECT + (8 * rule_index), + (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | + (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | + (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | + (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | + (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | + (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); + write_csr(dd, RCV_RSM_MATCH + (8 * rule_index), + (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT | + (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT | + (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT | + (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT); +} + +/* + * Clear a receive side mapping rule. + */ +static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index) +{ + write_csr(dd, RCV_RSM_CFG + (8 * rule_index), 0); + write_csr(dd, RCV_RSM_SELECT + (8 * rule_index), 0); + write_csr(dd, RCV_RSM_MATCH + (8 * rule_index), 0); +} + +/* return the number of RSM map table entries that will be used for QOS */ +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np) +{ + int i; + unsigned int m, n; + u8 max_by_vl = 0; + + /* is QOS active at all? 
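+ * QOS is in use only when n_krcv_queues is above MIN_KERNEL_KCTXTS,
+ * num_vls is greater than one, and more than one krcvqs[] entry was set.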
*/ + if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || + num_vls == 1 || + krcvqsset <= 1) + goto no_qos; + + /* determine bits for qpn */ + for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++) + if (krcvqs[i] > max_by_vl) + max_by_vl = krcvqs[i]; + if (max_by_vl > 32) + goto no_qos; + m = ilog2(__roundup_pow_of_two(max_by_vl)); + + /* determine bits for vl */ + n = ilog2(__roundup_pow_of_two(num_vls)); + + /* reject if too much is used */ + if ((m + n) > 7) + goto no_qos; + + if (mp) + *mp = m; + if (np) + *np = n; + + return 1 << (m + n); + +no_qos: + if (mp) + *mp = 0; + if (np) + *np = 0; + return 0; +} + +/** + * init_qos - init RX qos + * @dd - device data + * @rmt - RSM map table + * + * This routine initializes Rule 0 and the RSM map table to implement + * quality of service (qos). + * + * If all of the limit tests succeed, qos is applied based on the array + * interpretation of krcvqs where entry 0 is VL0. + * + * The number of vl bits (n) and the number of qpn bits (m) are computed to + * feed both the RSM map table and the single rule. + */ +static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; + unsigned int rmt_entries; + u64 reg; + + if (!rmt) + goto bail; + rmt_entries = qos_rmt_entries(dd, &m, &n); + if (rmt_entries == 0) + goto bail; + qpns_per_vl = 1 << m; + + /* enough room in the map table? */ + rmt_entries = 1 << (m + n); + if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES) + goto bail; + + /* add qos entries to the the RSM map table */ + for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) { + unsigned tctxt; + + for (qpn = 0, tctxt = ctxt; + krcvqs[i] && qpn < qpns_per_vl; qpn++) { + unsigned idx, regoff, regidx; + + /* generate the index the hardware will produce */ + idx = rmt->used + ((qpn << n) ^ i); + regoff = (idx % 8) * 8; + regidx = idx / 8; + /* replace default with context number */ + reg = rmt->map[regidx]; + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK + << regoff); + reg |= (u64)(tctxt++) << regoff; + rmt->map[regidx] = reg; + if (tctxt == ctxt + krcvqs[i]) + tctxt = ctxt; + } + ctxt += krcvqs[i]; + } + + rrd.offset = rmt->used; + rrd.pkt_type = 2; + rrd.field1_off = LRH_BTH_MATCH_OFFSET; + rrd.field2_off = LRH_SC_MATCH_OFFSET; + rrd.index1_off = LRH_SC_SELECT_OFFSET; + rrd.index1_width = n; + rrd.index2_off = QPN_SELECT_OFFSET; + rrd.index2_width = m + n; + rrd.mask1 = LRH_BTH_MASK; + rrd.value1 = LRH_BTH_VALUE; + rrd.mask2 = LRH_SC_MASK; + rrd.value2 = LRH_SC_VALUE; + + /* add rule 0 */ + add_rsm_rule(dd, RSM_INS_VERBS, &rrd); + + /* mark RSM map entries as used */ + rmt->used += rmt_entries; + /* map everything else to the mcast/err/vl15 context */ + init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT); + dd->qos_shift = n + 1; + return; +bail: + dd->qos_shift = 1; + init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1); +} + +static void init_user_fecn_handling(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + u64 reg; + int i, idx, regoff, regidx; + u8 offset; + + /* there needs to be enough room in the map table */ + if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) { + dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n"); + return; + } + + /* + * RSM will extract the destination context as an index into the + * map table. The destination contexts are a sequential block + * in the range first_dyn_alloc_ctxt...num_rcv_contexts-1 (inclusive). 
+ * Map entries are accessed as offset + extracted value. Adjust + * the added offset so this sequence can be placed anywhere in + * the table - as long as the entries themselves do not wrap. + * There are only enough bits in offset for the table size, so + * start with that to allow for a "negative" offset. + */ + offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used - + (int)dd->first_dyn_alloc_ctxt); + + for (i = dd->first_dyn_alloc_ctxt, idx = rmt->used; + i < dd->num_rcv_contexts; i++, idx++) { + /* replace with identity mapping */ + regoff = (idx % 8) * 8; + regidx = idx / 8; + reg = rmt->map[regidx]; + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff); + reg |= (u64)i << regoff; + rmt->map[regidx] = reg; + } + + /* + * For RSM intercept of Expected FECN packets: + * o packet type 0 - expected + * o match on F (bit 95), using select/match 1, and + * o match on SH (bit 133), using select/match 2. + * + * Use index 1 to extract the 8-bit receive context from DestQP + * (start at bit 64). Use that as the RSM map table index. + */ + rrd.offset = offset; + rrd.pkt_type = 0; + rrd.field1_off = 95; + rrd.field2_off = 133; + rrd.index1_off = 64; + rrd.index1_width = 8; + rrd.index2_off = 0; + rrd.index2_width = 0; + rrd.mask1 = 1; + rrd.value1 = 1; + rrd.mask2 = 1; + rrd.value2 = 1; + + /* add rule 1 */ + add_rsm_rule(dd, RSM_INS_FECN, &rrd); + + rmt->used += dd->num_user_contexts; +} + +/* Initialize RSM for VNIC */ +void hfi1_init_vnic_rsm(struct hfi1_devdata *dd) +{ + u8 i, j; + u8 ctx_id = 0; + u64 reg; + u32 regoff; + struct rsm_rule_data rrd; + + if (hfi1_vnic_is_rsm_full(dd, NUM_VNIC_MAP_ENTRIES)) { + dd_dev_err(dd, "Vnic RSM disabled, rmt entries used = %d\n", + dd->vnic.rmt_start); + return; + } + + dev_dbg(&(dd)->pcidev->dev, "Vnic rsm start = %d, end %d\n", + dd->vnic.rmt_start, + dd->vnic.rmt_start + NUM_VNIC_MAP_ENTRIES); + + /* Update RSM mapping table, 32 regs, 256 entries - 1 ctx per byte */ + regoff = RCV_RSM_MAP_TABLE + (dd->vnic.rmt_start / 8) * 8; + reg = read_csr(dd, regoff); + for (i = 0; i < NUM_VNIC_MAP_ENTRIES; i++) { + /* Update map register with vnic context */ + j = (dd->vnic.rmt_start + i) % 8; + reg &= ~(0xffllu << (j * 8)); + reg |= (u64)dd->vnic.ctxt[ctx_id++]->ctxt << (j * 8); + /* Wrap up vnic ctx index */ + ctx_id %= dd->vnic.num_ctxt; + /* Write back map register */ + if (j == 7 || ((i + 1) == NUM_VNIC_MAP_ENTRIES)) { + dev_dbg(&(dd)->pcidev->dev, + "Vnic rsm map reg[%d] =0x%llx\n", + regoff - RCV_RSM_MAP_TABLE, reg); + + write_csr(dd, regoff, reg); + regoff += 8; + if (i < (NUM_VNIC_MAP_ENTRIES - 1)) + reg = read_csr(dd, regoff); + } + } + + /* Add rule for vnic */ + rrd.offset = dd->vnic.rmt_start; + rrd.pkt_type = 4; + /* Match 16B packets */ + rrd.field1_off = L2_TYPE_MATCH_OFFSET; + rrd.mask1 = L2_TYPE_MASK; + rrd.value1 = L2_16B_VALUE; + /* Match ETH L4 packets */ + rrd.field2_off = L4_TYPE_MATCH_OFFSET; + rrd.mask2 = L4_16B_TYPE_MASK; + rrd.value2 = L4_16B_ETH_VALUE; + /* Calc context from veswid and entropy */ + rrd.index1_off = L4_16B_HDR_VESWID_OFFSET; + rrd.index1_width = ilog2(NUM_VNIC_MAP_ENTRIES); + rrd.index2_off = L2_16B_ENTROPY_OFFSET; + rrd.index2_width = ilog2(NUM_VNIC_MAP_ENTRIES); + add_rsm_rule(dd, RSM_INS_VNIC, &rrd); + + /* Enable RSM if not already enabled */ + add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); +} + +void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd) +{ + clear_rsm_rule(dd, RSM_INS_VNIC); + + /* Disable RSM if used only by vnic */ + if (dd->vnic.rmt_start == 0) + clear_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); +} + 
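+/*
+ * Illustrative sketch only, not part of the original driver: the RSM map
+ * table packs one 8-bit receive context per byte of each 64-bit
+ * RCV_RSM_MAP_TABLE register, which is why init_qos(),
+ * init_user_fecn_handling() and hfi1_init_vnic_rsm() above all repeat the
+ * same index -> (register, byte) arithmetic.  A hypothetical helper that
+ * captures the pattern (init_qos() would, e.g., amount to
+ * rmt_map_set(rmt, rmt->used + ((qpn << n) ^ i), tctxt)):
+ */
+static inline void rmt_map_set(struct rsm_map_table *rmt, unsigned int idx,
+                               u8 ctxt)
+{
+        unsigned int regidx = idx / 8;        /* which 64-bit map register */
+        unsigned int regoff = (idx % 8) * 8;  /* bit offset of the byte */
+        u64 reg = rmt->map[regidx];
+
+        reg &= ~(0xffull << regoff);          /* clear the old context byte */
+        reg |= (u64)ctxt << regoff;           /* install the new context */
+        rmt->map[regidx] = reg;
+}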
+static void init_rxe(struct hfi1_devdata *dd) +{ + struct rsm_map_table *rmt; + u64 val; + + /* enable all receive errors */ + write_csr(dd, RCV_ERR_MASK, ~0ull); + + rmt = alloc_rsm_map_table(dd); + /* set up QOS, including the QPN map table */ + init_qos(dd, rmt); + init_user_fecn_handling(dd, rmt); + complete_rsm_map_table(dd, rmt); + /* record number of used rsm map entries for vnic */ + dd->vnic.rmt_start = rmt->used; + kfree(rmt); + + /* + * make sure RcvCtrl.RcvWcb <= PCIe Device Control + * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config + * space, PciCfgCap2.MaxPayloadSize in HFI). There is only one + * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and + * Max_PayLoad_Size set to its minimum of 128. + * + * Presently, RcvCtrl.RcvWcb is not modified from its default of 0 + * (64 bytes). Max_Payload_Size is possibly modified upward in + * tune_pcie_caps() which is called after this routine. + */ + + /* Have 16 bytes (4DW) of bypass header available in header queue */ + val = read_csr(dd, RCV_BYPASS); + val |= (4ull << 16); + write_csr(dd, RCV_BYPASS, val); +} + +static void init_other(struct hfi1_devdata *dd) +{ + /* enable all CCE errors */ + write_csr(dd, CCE_ERR_MASK, ~0ull); + /* enable *some* Misc errors */ + write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK); + /* enable all DC errors, except LCB */ + write_csr(dd, DCC_ERR_FLG_EN, ~0ull); + write_csr(dd, DC_DC8051_ERR_EN, ~0ull); +} + +/* + * Fill out the given AU table using the given CU. A CU is defined in terms + * AUs. The table is a an encoding: given the index, how many AUs does that + * represent? + * + * NOTE: Assumes that the register layout is the same for the + * local and remote tables. + */ +static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu, + u32 csr0to3, u32 csr4to7) +{ + write_csr(dd, csr0to3, + 0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT | + 1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT | + 2ull * cu << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT | + 4ull * cu << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT); + write_csr(dd, csr4to7, + 8ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT | + 16ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT | + 32ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT | + 64ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT); +} + +static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu) +{ + assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3, + SEND_CM_LOCAL_AU_TABLE4_TO7); +} + +void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu) +{ + assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3, + SEND_CM_REMOTE_AU_TABLE4_TO7); +} + +static void init_txe(struct hfi1_devdata *dd) +{ + int i; + + /* enable all PIO, SDMA, general, and Egress errors */ + write_csr(dd, SEND_PIO_ERR_MASK, ~0ull); + write_csr(dd, SEND_DMA_ERR_MASK, ~0ull); + write_csr(dd, SEND_ERR_MASK, ~0ull); + write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull); + + /* enable all per-context and per-SDMA engine errors */ + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull); + for (i = 0; i < dd->chip_sdma_engines; i++) + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull); + + /* set the local CU to AU mapping */ + assign_local_cm_au_table(dd, dd->vcu); + + /* + * Set reasonable default for Credit Return Timer + * Don't set on Simulator - causes it to choke. 
+ */ + if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR) + write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE); +} + +int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd, + u16 jkey) +{ + u8 hw_ctxt; + u64 reg; + + if (!rcd || !rcd->sc) + return -EINVAL; + + hw_ctxt = rcd->sc->hw_context; + reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */ + ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) << + SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT); + /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */ + if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY)) + reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_JOB_KEY, reg); + /* + * Enable send-side J_KEY integrity check, unless this is A0 h/w + */ + if (!is_ax(dd)) { + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); + reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); + } + + /* Enable J_KEY check on receive context. */ + reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK | + ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) << + RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT); + write_kctxt_csr(dd, rcd->ctxt, RCV_KEY_CTRL, reg); + + return 0; +} + +int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + u8 hw_ctxt; + u64 reg; + + if (!rcd || !rcd->sc) + return -EINVAL; + + hw_ctxt = rcd->sc->hw_context; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_JOB_KEY, 0); + /* + * Disable send-side J_KEY integrity check, unless this is A0 h/w. + * This check would not have been enabled for A0 h/w, see + * set_ctxt_jkey(). + */ + if (!is_ax(dd)) { + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); + reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); + } + /* Turn off the J_KEY on the receive side */ + write_kctxt_csr(dd, rcd->ctxt, RCV_KEY_CTRL, 0); + + return 0; +} + +int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd, + u16 pkey) +{ + u8 hw_ctxt; + u64 reg; + + if (!rcd || !rcd->sc) + return -EINVAL; + + hw_ctxt = rcd->sc->hw_context; + reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) << + SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg); + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); + reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); + + return 0; +} + +int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt) +{ + u8 hw_ctxt; + u64 reg; + + if (!ctxt || !ctxt->sc) + return -EINVAL; + + hw_ctxt = ctxt->sc->hw_context; + reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE); + reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg); + write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0); + + return 0; +} + +/* + * Start doing the clean up the the chip. Our clean up happens in multiple + * stages and this is just the first. + */ +void hfi1_start_cleanup(struct hfi1_devdata *dd) +{ + aspm_exit(dd); + free_cntrs(dd); + free_rcverr(dd); + finish_chip_resources(dd); +} + +#define HFI_BASE_GUID(dev) \ + ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT)) + +/* + * Information can be shared between the two HFIs on the same ASIC + * in the same OS. 
This function finds the peer device and sets + * up a shared structure. + */ +static int init_asic_data(struct hfi1_devdata *dd) +{ + unsigned long flags; + struct hfi1_devdata *tmp, *peer = NULL; + struct hfi1_asic_data *asic_data; + int ret = 0; + + /* pre-allocate the asic structure in case we are the first device */ + asic_data = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL); + if (!asic_data) + return -ENOMEM; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + /* Find our peer device */ + list_for_each_entry(tmp, &hfi1_dev_list, list) { + if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) && + dd->unit != tmp->unit) { + peer = tmp; + break; + } + } + + if (peer) { + /* use already allocated structure */ + dd->asic_data = peer->asic_data; + kfree(asic_data); + } else { + dd->asic_data = asic_data; + mutex_init(&dd->asic_data->asic_resource_mutex); + } + dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */ + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + + /* first one through - set up i2c devices */ + if (!peer) + ret = set_up_i2c(dd, dd->asic_data); + + return ret; +} + +/* + * Set dd->boardname. Use a generic name if a name is not returned from + * EFI variable space. + * + * Return 0 on success, -ENOMEM if space could not be allocated. + */ +static int obtain_boardname(struct hfi1_devdata *dd) +{ + /* generic board description */ + const char generic[] = + "Intel Omni-Path Host Fabric Interface Adapter 100 Series"; + unsigned long size; + int ret; + + ret = read_hfi1_efi_var(dd, "description", &size, + (void **)&dd->boardname); + if (ret) { + dd_dev_info(dd, "Board description not found\n"); + /* use generic description */ + dd->boardname = kstrdup(generic, GFP_KERNEL); + if (!dd->boardname) + return -ENOMEM; + } + return 0; +} + +/* + * Check the interrupt registers to make sure that they are mapped correctly. + * It is intended to help user identify any mismapping by VMM when the driver + * is running in a VM. This function should only be called before interrupt + * is set up properly. + * + * Return 0 on success, -EINVAL on failure. + */ +static int check_int_registers(struct hfi1_devdata *dd) +{ + u64 reg; + u64 all_bits = ~(u64)0; + u64 mask; + + /* Clear CceIntMask[0] to avoid raising any interrupts */ + mask = read_csr(dd, CCE_INT_MASK); + write_csr(dd, CCE_INT_MASK, 0ull); + reg = read_csr(dd, CCE_INT_MASK); + if (reg) + goto err_exit; + + /* Clear all interrupt status bits */ + write_csr(dd, CCE_INT_CLEAR, all_bits); + reg = read_csr(dd, CCE_INT_STATUS); + if (reg) + goto err_exit; + + /* Set all interrupt status bits */ + write_csr(dd, CCE_INT_FORCE, all_bits); + reg = read_csr(dd, CCE_INT_STATUS); + if (reg != all_bits) + goto err_exit; + + /* Restore the interrupt mask */ + write_csr(dd, CCE_INT_CLEAR, all_bits); + write_csr(dd, CCE_INT_MASK, mask); + + return 0; +err_exit: + write_csr(dd, CCE_INT_MASK, mask); + dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n"); + return -EINVAL; +} + +/** + * Allocate and initialize the device structure for the hfi. + * @dev: the pci_dev for hfi1_ib device + * @ent: pci_device_id struct for this dev + * + * Also allocates, initializes, and returns the devdata struct for this + * device instance + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. 
+ */ +struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + u64 reg; + int i, ret; + static const char * const inames[] = { /* implementation names */ + "RTL silicon", + "RTL VCS simulation", + "RTL FPGA emulation", + "Functional simulator" + }; + struct pci_dev *parent = pdev->bus->self; + + dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * + sizeof(struct hfi1_pportdata)); + if (IS_ERR(dd)) + goto bail; + ppd = dd->pport; + for (i = 0; i < dd->num_pports; i++, ppd++) { + int vl; + /* init common fields */ + hfi1_init_pportdata(pdev, ppd, dd, 0, 1); + /* DC supports 4 link widths */ + ppd->link_width_supported = + OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X | + OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X; + ppd->link_width_downgrade_supported = + ppd->link_width_supported; + /* start out enabling only 4X */ + ppd->link_width_enabled = OPA_LINK_WIDTH_4X; + ppd->link_width_downgrade_enabled = + ppd->link_width_downgrade_supported; + /* link width active is 0 when link is down */ + /* link width downgrade active is 0 when link is down */ + + if (num_vls < HFI1_MIN_VLS_SUPPORTED || + num_vls > HFI1_MAX_VLS_SUPPORTED) { + dd_dev_err(dd, "Invalid num_vls %u, using %u VLs\n", + num_vls, HFI1_MAX_VLS_SUPPORTED); + num_vls = HFI1_MAX_VLS_SUPPORTED; + } + ppd->vls_supported = num_vls; + ppd->vls_operational = ppd->vls_supported; + /* Set the default MTU. */ + for (vl = 0; vl < num_vls; vl++) + dd->vld[vl].mtu = hfi1_max_mtu; + dd->vld[15].mtu = MAX_MAD_PACKET; + /* + * Set the initial values to reasonable default, will be set + * for real when link is up. + */ + ppd->overrun_threshold = 0x4; + ppd->phy_error_threshold = 0xf; + ppd->port_crc_mode_enabled = link_crc_mask; + /* initialize supported LTP CRC mode */ + ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8; + /* initialize enabled LTP CRC mode */ + ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4; + /* start in offline */ + ppd->host_link_state = HLS_DN_OFFLINE; + init_vl_arb_caches(ppd); + } + + /* + * Do remaining PCIe setup and save PCIe values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped. + */ + ret = hfi1_pcie_ddinit(dd, pdev); + if (ret < 0) + goto bail_free; + + /* Save PCI space registers to rewrite after device reset */ + ret = save_pci_variables(dd); + if (ret < 0) + goto bail_cleanup; + + /* verify that reads actually work, save revision for reset check */ + dd->revision = read_csr(dd, CCE_REVISION); + if (dd->revision == ~(u64)0) { + dd_dev_err(dd, "cannot read chip CSRs\n"); + ret = -EINVAL; + goto bail_cleanup; + } + dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT) + & CCE_REVISION_CHIP_REV_MAJOR_MASK; + dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) + & CCE_REVISION_CHIP_REV_MINOR_MASK; + + /* + * Check interrupt registers mapping if the driver has no access to + * the upstream component. In this case, it is likely that the driver + * is running in a VM. 
+ */ + if (!parent) { + ret = check_int_registers(dd); + if (ret) + goto bail_cleanup; + } + + /* + * obtain the hardware ID - NOT related to unit, which is a + * software enumeration + */ + reg = read_csr(dd, CCE_REVISION2); + dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT) + & CCE_REVISION2_HFI_ID_MASK; + /* the variable size will remove unwanted bits */ + dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT; + dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT; + dd_dev_info(dd, "Implementation: %s, revision 0x%x\n", + dd->icode < ARRAY_SIZE(inames) ? + inames[dd->icode] : "unknown", (int)dd->irev); + + /* speeds the hardware can support */ + dd->pport->link_speed_supported = OPA_LINK_SPEED_25G; + /* speeds allowed to run at */ + dd->pport->link_speed_enabled = dd->pport->link_speed_supported; + /* give a reasonable active value, will be set on link up */ + dd->pport->link_speed_active = OPA_LINK_SPEED_25G; + + dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS); + dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS); + dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES); + dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE); + dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE); + /* fix up link widths for emulation _p */ + ppd = dd->pport; + if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) { + ppd->link_width_supported = + ppd->link_width_enabled = + ppd->link_width_downgrade_supported = + ppd->link_width_downgrade_enabled = + OPA_LINK_WIDTH_1X; + } + /* insure num_vls isn't larger than number of sdma engines */ + if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) { + dd_dev_err(dd, "num_vls %u too large, using %u VLs\n", + num_vls, dd->chip_sdma_engines); + num_vls = dd->chip_sdma_engines; + ppd->vls_supported = dd->chip_sdma_engines; + ppd->vls_operational = ppd->vls_supported; + } + + /* + * Convert the ns parameter to the 64 * cclocks used in the CSR. + * Limit the max if larger than the field holds. If timeout is + * non-zero, then the calculated field will be at least 1. + * + * Must be after icode is set up - the cclock rate depends + * on knowing the hardware being used. + */ + dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64; + if (dd->rcv_intr_timeout_csr > + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK) + dd->rcv_intr_timeout_csr = + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK; + else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout) + dd->rcv_intr_timeout_csr = 1; + + /* needs to be done before we look for the peer device */ + read_guid(dd); + + /* set up shared ASIC data with peer device */ + ret = init_asic_data(dd); + if (ret) + goto bail_cleanup; + + /* obtain chip sizes, reset chip CSRs */ + ret = init_chip(dd); + if (ret) + goto bail_cleanup; + + /* read in the PCIe link speed information */ + ret = pcie_speeds(dd); + if (ret) + goto bail_cleanup; + + /* call before get_platform_config(), after init_chip_resources() */ + ret = eprom_init(dd); + if (ret) + goto bail_free_rcverr; + + /* Needs to be called before hfi1_firmware_init */ + get_platform_config(dd); + + /* read in firmware */ + ret = hfi1_firmware_init(dd); + if (ret) + goto bail_cleanup; + + /* + * In general, the PCIe Gen3 transition must occur after the + * chip has been idled (so it won't initiate any PCIe transactions + * e.g. an interrupt) and before the driver changes any registers + * (the transition will reset the registers). 
+ * + * In particular, place this call after: + * - init_chip() - the chip will not initiate any PCIe transactions + * - pcie_speeds() - reads the current link speed + * - hfi1_firmware_init() - the needed firmware is ready to be + * downloaded + */ + ret = do_pcie_gen3_transition(dd); + if (ret) + goto bail_cleanup; + + /* start setting dd values and adjusting CSRs */ + init_early_variables(dd); + + parse_platform_config(dd); + + ret = obtain_boardname(dd); + if (ret) + goto bail_cleanup; + + snprintf(dd->boardversion, BOARD_VERS_MAX, + "ChipABI %u.%u, ChipRev %u.%u, SW Compat %llu\n", + HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN, + (u32)dd->majrev, + (u32)dd->minrev, + (dd->revision >> CCE_REVISION_SW_SHIFT) + & CCE_REVISION_SW_MASK); + + ret = set_up_context_variables(dd); + if (ret) + goto bail_cleanup; + + /* set initial RXE CSRs */ + init_rxe(dd); + /* set initial TXE CSRs */ + init_txe(dd); + /* set initial non-RXE, non-TXE CSRs */ + init_other(dd); + /* set up KDETH QP prefix in both RX and TX CSRs */ + init_kdeth_qp(dd); + + ret = hfi1_dev_affinity_init(dd); + if (ret) + goto bail_cleanup; + + /* send contexts must be set up before receive contexts */ + ret = init_send_contexts(dd); + if (ret) + goto bail_cleanup; + + ret = hfi1_create_kctxts(dd); + if (ret) + goto bail_cleanup; + + /* + * Initialize aspm, to be done after gen3 transition and setting up + * contexts and before enabling interrupts + */ + aspm_init(dd); + + dd->rcvhdrsize = DEFAULT_RCVHDRSIZE; + /* + * rcd[0] is guaranteed to be valid by this point. Also, all + * context are using the same value, as per the module parameter. + */ + dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32); + + ret = init_pervl_scs(dd); + if (ret) + goto bail_cleanup; + + /* sdma init */ + for (i = 0; i < dd->num_pports; ++i) { + ret = sdma_init(dd, i); + if (ret) + goto bail_cleanup; + } + + /* use contexts created by hfi1_create_kctxts */ + ret = set_up_interrupts(dd); + if (ret) + goto bail_cleanup; + + /* set up LCB access - must be after set_up_interrupts() */ + init_lcb_access(dd); + + /* + * Serial number is created from the base guid: + * [27:24] = base guid [38:35] + * [23: 0] = base guid [23: 0] + */ + snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n", + (dd->base_guid & 0xFFFFFF) | + ((dd->base_guid >> 11) & 0xF000000)); + + dd->oui1 = dd->base_guid >> 56 & 0xFF; + dd->oui2 = dd->base_guid >> 48 & 0xFF; + dd->oui3 = dd->base_guid >> 40 & 0xFF; + + ret = load_firmware(dd); /* asymmetric with dispose_firmware() */ + if (ret) + goto bail_clear_intr; + + thermal_init(dd); + + ret = init_cntrs(dd); + if (ret) + goto bail_clear_intr; + + ret = init_rcverr(dd); + if (ret) + goto bail_free_cntrs; + + init_completion(&dd->user_comp); + + /* The user refcount starts with one to inidicate an active device */ + atomic_set(&dd->user_refcount, 1); + + goto bail; + +bail_free_rcverr: + free_rcverr(dd); +bail_free_cntrs: + free_cntrs(dd); +bail_clear_intr: + hfi1_clean_up_interrupts(dd); +bail_cleanup: + hfi1_pcie_ddcleanup(dd); +bail_free: + hfi1_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} + +static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate, + u32 dw_len) +{ + u32 delta_cycles; + u32 current_egress_rate = ppd->current_egress_rate; + /* rates here are in units of 10^6 bits/sec */ + + if (desired_egress_rate == -1) + return 0; /* shouldn't happen */ + + if (desired_egress_rate >= current_egress_rate) + return 0; /* we can't help go faster, only slower */ + + delta_cycles = 
egress_cycles(dw_len * 4, desired_egress_rate) - + egress_cycles(dw_len * 4, current_egress_rate); + + return (u16)delta_cycles; +} + +/** + * create_pbc - build a pbc for transmission + * @flags: special case flags or-ed in built pbc + * @srate: static rate + * @vl: vl + * @dwlen: dword length (header words + data words + pbc words) + * + * Create a PBC with the given flags, rate, VL, and length. + * + * NOTE: The PBC created will not insert any HCRC - all callers but one are + * for verbs, which does not use this PSM feature. The lone other caller + * is for the diagnostic interface which calls this if the user does not + * supply their own PBC. + */ +u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl, + u32 dw_len) +{ + u64 pbc, delay = 0; + + if (unlikely(srate_mbs)) + delay = delay_cycles(ppd, srate_mbs, dw_len); + + pbc = flags + | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT) + | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT) + | (vl & PBC_VL_MASK) << PBC_VL_SHIFT + | (dw_len & PBC_LENGTH_DWS_MASK) + << PBC_LENGTH_DWS_SHIFT; + + return pbc; +} + +#define SBUS_THERMAL 0x4f +#define SBUS_THERM_MONITOR_MODE 0x1 + +#define THERM_FAILURE(dev, ret, reason) \ + dd_dev_err((dd), \ + "Thermal sensor initialization failed: %s (%d)\n", \ + (reason), (ret)) + +/* + * Initialize the thermal sensor. + * + * After initialization, enable polling of thermal sensor through + * SBus interface. In order for this to work, the SBus Master + * firmware has to be loaded due to the fact that the HW polling + * logic uses SBus interrupts, which are not supported with + * default firmware. Otherwise, no data will be returned through + * the ASIC_STS_THERM CSR. + */ +static int thermal_init(struct hfi1_devdata *dd) +{ + int ret = 0; + + if (dd->icode != ICODE_RTL_SILICON || + check_chip_resource(dd, CR_THERM_INIT, NULL)) + return ret; + + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + THERM_FAILURE(dd, ret, "Acquire SBus"); + return ret; + } + + dd_dev_info(dd, "Initializing thermal sensor\n"); + /* Disable polling of thermal readings */ + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0); + msleep(100); + /* Thermal Sensor Initialization */ + /* Step 1: Reset the Thermal SBus Receiver */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + RESET_SBUS_RECEIVER, 0); + if (ret) { + THERM_FAILURE(dd, ret, "Bus Reset"); + goto done; + } + /* Step 2: Set Reset bit in Thermal block */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + WRITE_SBUS_RECEIVER, 0x1); + if (ret) { + THERM_FAILURE(dd, ret, "Therm Block Reset"); + goto done; + } + /* Step 3: Write clock divider value (100MHz -> 2MHz) */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1, + WRITE_SBUS_RECEIVER, 0x32); + if (ret) { + THERM_FAILURE(dd, ret, "Write Clock Div"); + goto done; + } + /* Step 4: Select temperature mode */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3, + WRITE_SBUS_RECEIVER, + SBUS_THERM_MONITOR_MODE); + if (ret) { + THERM_FAILURE(dd, ret, "Write Mode Sel"); + goto done; + } + /* Step 5: De-assert block reset and start conversion */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + WRITE_SBUS_RECEIVER, 0x2); + if (ret) { + THERM_FAILURE(dd, ret, "Write Reset Deassert"); + goto done; + } + /* Step 5.1: Wait for first conversion (21.5ms per spec) */ + msleep(22); + + /* Enable polling of thermal readings */ + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1); + + /* Set initialized flag */ + ret = acquire_chip_resource(dd, CR_THERM_INIT, 0); + if (ret) + THERM_FAILURE(dd, ret, "Unable to set thermal 
init flag"); + +done: + release_chip_resource(dd, CR_SBUS); + return ret; +} + +static void handle_temp_err(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = &dd->pport[0]; + /* + * Thermal Critical Interrupt + * Put the device into forced freeze mode, take link down to + * offline, and put DC into reset. + */ + dd_dev_emerg(dd, + "Critical temperature reached! Forcing device into freeze mode!\n"); + dd->flags |= HFI1_FORCED_FREEZE; + start_freeze_handling(ppd, FREEZE_SELF | FREEZE_ABORT); + /* + * Shut DC down as much and as quickly as possible. + * + * Step 1: Take the link down to OFFLINE. This will cause the + * 8051 to put the Serdes in reset. However, we don't want to + * go through the entire link state machine since we want to + * shutdown ASAP. Furthermore, this is not a graceful shutdown + * but rather an attempt to save the chip. + * Code below is almost the same as quiet_serdes() but avoids + * all the extra work and the sleeps. + */ + ppd->driver_link_ready = 0; + ppd->link_enabled = 0; + set_physical_link_state(dd, (OPA_LINKDOWN_REASON_SMA_DISABLED << 8) | + PLS_OFFLINE); + /* + * Step 2: Shutdown LCB and 8051 + * After shutdown, do not restore DC_CFG_RESET value. + */ + dc_shutdown(dd); +} diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h new file mode 100644 index 0000000..c0d70f2 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -0,0 +1,1393 @@ +#ifndef _CHIP_H +#define _CHIP_H +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains all of the defines that is specific to the HFI chip + */ + +/* sizes */ +#define CCE_NUM_MSIX_VECTORS 256 +#define CCE_NUM_INT_CSRS 12 +#define CCE_NUM_INT_MAP_CSRS 96 +#define NUM_INTERRUPT_SOURCES 768 +#define RXE_NUM_CONTEXTS 160 +#define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */ +#define RXE_NUM_TID_FLOWS 32 +#define RXE_NUM_DATA_VL 8 +#define TXE_NUM_CONTEXTS 160 +#define TXE_NUM_SDMA_ENGINES 16 +#define NUM_CONTEXTS_PER_SET 8 +#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16 +#define VL_ARB_LOW_PRIO_TABLE_SIZE 16 +#define VL_ARB_TABLE_SIZE 16 +#define TXE_NUM_32_BIT_COUNTER 7 +#define TXE_NUM_64_BIT_COUNTER 30 +#define TXE_NUM_DATA_VL 8 +#define TXE_PIO_SIZE (32 * 0x100000) /* 32 MB */ +#define PIO_BLOCK_SIZE 64 /* bytes */ +#define SDMA_BLOCK_SIZE 64 /* bytes */ +#define RCV_BUF_BLOCK_SIZE 64 /* bytes */ +#define PIO_CMASK 0x7ff /* counter mask for free and fill counters */ +#define MAX_EAGER_ENTRIES 2048 /* max receive eager entries */ +#define MAX_TID_PAIR_ENTRIES 1024 /* max receive expected pairs */ +/* + * Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed + * at 64 bytes for all generation one devices + */ +#define CM_VAU 3 +/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */ +#define CM_GLOBAL_CREDITS 0x880 +/* Number of PKey entries in the HW */ +#define MAX_PKEY_VALUES 16 + +#include "chip_registers.h" + +#define RXE_PER_CONTEXT_USER (RXE + RXE_PER_CONTEXT_OFFSET) +#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET) + +/* PBC flags */ +#define PBC_INTR BIT_ULL(31) +#define PBC_DC_INFO_SHIFT (30) +#define PBC_DC_INFO BIT_ULL(PBC_DC_INFO_SHIFT) +#define PBC_TEST_EBP BIT_ULL(29) +#define PBC_PACKET_BYPASS BIT_ULL(28) +#define PBC_CREDIT_RETURN BIT_ULL(25) +#define PBC_INSERT_BYPASS_ICRC BIT_ULL(24) +#define PBC_TEST_BAD_ICRC BIT_ULL(23) +#define PBC_FECN BIT_ULL(22) + +/* PbcInsertHcrc field settings */ +#define PBC_IHCRC_LKDETH 0x0 /* insert @ local KDETH offset */ +#define PBC_IHCRC_GKDETH 0x1 /* insert @ global KDETH offset */ +#define PBC_IHCRC_NONE 0x2 /* no HCRC inserted */ + +/* PBC fields */ +#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32 +#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull +#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \ + (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \ + PBC_STATIC_RATE_CONTROL_COUNT_SHIFT) + +#define PBC_INSERT_HCRC_SHIFT 26 +#define PBC_INSERT_HCRC_MASK 0x3ull +#define PBC_INSERT_HCRC_SMASK \ + (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT) + +#define PBC_VL_SHIFT 12 +#define PBC_VL_MASK 0xfull +#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT) + +#define PBC_LENGTH_DWS_SHIFT 0 +#define PBC_LENGTH_DWS_MASK 0xfffull +#define PBC_LENGTH_DWS_SMASK \ + (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT) + +/* Credit Return Fields */ +#define CR_COUNTER_SHIFT 0 +#define CR_COUNTER_MASK 0x7ffull +#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT) + +#define CR_STATUS_SHIFT 11 +#define CR_STATUS_MASK 0x1ull +#define CR_STATUS_SMASK (CR_STATUS_MASK << 
CR_STATUS_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12 +#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \ + CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13 +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \ + CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14 +#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \ + CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15 +#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \ + CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT) + +/* interrupt source numbers */ +#define IS_GENERAL_ERR_START 0 +#define IS_SDMAENG_ERR_START 16 +#define IS_SENDCTXT_ERR_START 32 +#define IS_SDMA_START 192 /* includes SDmaProgress,SDmaIdle */ +#define IS_VARIOUS_START 240 +#define IS_DC_START 248 +#define IS_RCVAVAIL_START 256 +#define IS_RCVURGENT_START 416 +#define IS_SENDCREDIT_START 576 +#define IS_RESERVED_START 736 +#define IS_MAX_SOURCES 768 + +/* derived interrupt source values */ +#define IS_GENERAL_ERR_END IS_SDMAENG_ERR_START +#define IS_SDMAENG_ERR_END IS_SENDCTXT_ERR_START +#define IS_SENDCTXT_ERR_END IS_SDMA_START +#define IS_SDMA_END IS_VARIOUS_START +#define IS_VARIOUS_END IS_DC_START +#define IS_DC_END IS_RCVAVAIL_START +#define IS_RCVAVAIL_END IS_RCVURGENT_START +#define IS_RCVURGENT_END IS_SENDCREDIT_START +#define IS_SENDCREDIT_END IS_RESERVED_START +#define IS_RESERVED_END IS_MAX_SOURCES + +/* absolute interrupt numbers for QSFP1Int and QSFP2Int */ +#define QSFP1_INT 242 +#define QSFP2_INT 243 + +/* DCC_CFG_PORT_CONFIG logical link states */ +#define LSTATE_DOWN 0x1 +#define LSTATE_INIT 0x2 +#define LSTATE_ARMED 0x3 +#define LSTATE_ACTIVE 0x4 + +/* DC8051_STS_CUR_STATE port values (physical link states) */ +#define PLS_DISABLED 0x30 +#define PLS_OFFLINE 0x90 +#define PLS_OFFLINE_QUIET 0x90 +#define PLS_OFFLINE_PLANNED_DOWN_INFORM 0x91 +#define PLS_OFFLINE_READY_TO_QUIET_LT 0x92 +#define PLS_OFFLINE_REPORT_FAILURE 0x93 +#define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94 +#define PLS_OFFLINE_QUIET_DURATION 0x95 +#define PLS_POLLING 0x20 +#define PLS_POLLING_QUIET 0x20 +#define PLS_POLLING_ACTIVE 0x21 +#define PLS_CONFIGPHY 0x40 +#define PLS_CONFIGPHY_DEBOUCE 0x40 +#define PLS_CONFIGPHY_ESTCOMM 0x41 +#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT 0x42 +#define PLS_CONFIGPHY_ESTCOMM_LOCAL_COMPLETE 0x43 +#define PLS_CONFIGPHY_OPTEQ 0x44 +#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING 0x44 +#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE 0x45 +#define PLS_CONFIGPHY_VERIFYCAP 0x46 +#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE 0x46 +#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47 +#define PLS_CONFIGLT 0x48 +#define PLS_CONFIGLT_CONFIGURE 0x48 +#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE 0x49 +#define PLS_LINKUP 0x50 +#define PLS_PHYTEST 0xB0 +#define PLS_INTERNAL_SERDES_LOOPBACK 0xe1 +#define PLS_QUICK_LINKUP 0xe2 + +/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */ +#define HCMD_LOAD_CONFIG_DATA 0x01 +#define HCMD_READ_CONFIG_DATA 0x02 +#define HCMD_CHANGE_PHY_STATE 0x03 +#define HCMD_SEND_LCB_IDLE_MSG 0x04 +#define HCMD_MISC 0x05 +#define HCMD_READ_LCB_IDLE_MSG 0x06 +#define HCMD_READ_LCB_CSR 0x07 +#define 
HCMD_WRITE_LCB_CSR 0x08 +#define HCMD_INTERFACE_TEST 0xff + +/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */ +#define HCMD_SUCCESS 2 + +/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */ +#define SPICO_ROM_FAILED BIT(0) +#define UNKNOWN_FRAME BIT(1) +#define TARGET_BER_NOT_MET BIT(2) +#define FAILED_SERDES_INTERNAL_LOOPBACK BIT(3) +#define FAILED_SERDES_INIT BIT(4) +#define FAILED_LNI_POLLING BIT(5) +#define FAILED_LNI_DEBOUNCE BIT(6) +#define FAILED_LNI_ESTBCOMM BIT(7) +#define FAILED_LNI_OPTEQ BIT(8) +#define FAILED_LNI_VERIFY_CAP1 BIT(9) +#define FAILED_LNI_VERIFY_CAP2 BIT(10) +#define FAILED_LNI_CONFIGLT BIT(11) +#define HOST_HANDSHAKE_TIMEOUT BIT(12) +#define EXTERNAL_DEVICE_REQ_TIMEOUT BIT(13) + +#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \ + | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \ + | FAILED_LNI_VERIFY_CAP1 \ + | FAILED_LNI_VERIFY_CAP2 \ + | FAILED_LNI_CONFIGLT | HOST_HANDSHAKE_TIMEOUT \ + | EXTERNAL_DEVICE_REQ_TIMEOUT) + +/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */ +#define HOST_REQ_DONE BIT(0) +#define BC_PWR_MGM_MSG BIT(1) +#define BC_SMA_MSG BIT(2) +#define BC_BCC_UNKNOWN_MSG BIT(3) +#define BC_IDLE_UNKNOWN_MSG BIT(4) +#define EXT_DEVICE_CFG_REQ BIT(5) +#define VERIFY_CAP_FRAME BIT(6) +#define LINKUP_ACHIEVED BIT(7) +#define LINK_GOING_DOWN BIT(8) +#define LINK_WIDTH_DOWNGRADED BIT(9) + +/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */ +#define HREQ_LOAD_CONFIG 0x01 +#define HREQ_SAVE_CONFIG 0x02 +#define HREQ_READ_CONFIG 0x03 +#define HREQ_SET_TX_EQ_ABS 0x04 +#define HREQ_SET_TX_EQ_REL 0x05 +#define HREQ_ENABLE 0x06 +#define HREQ_CONFIG_DONE 0xfe +#define HREQ_INTERFACE_TEST 0xff + +/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */ +#define HREQ_INVALID 0x01 +#define HREQ_SUCCESS 0x02 +#define HREQ_NOT_SUPPORTED 0x03 +#define HREQ_FEATURE_NOT_SUPPORTED 0x04 /* request specific feature */ +#define HREQ_REQUEST_REJECTED 0xfe +#define HREQ_EXECUTION_ONGOING 0xff + +/* MISC host command functions */ +#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1 +#define HCMD_MISC_GRANT_LCB_ACCESS 0x2 + +/* idle flit message types */ +#define IDLE_PHYSICAL_LINK_MGMT 0x1 +#define IDLE_CRU 0x2 +#define IDLE_SMA 0x3 +#define IDLE_POWER_MGMT 0x4 + +/* idle flit message send fields (both send and read) */ +#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */ +#define IDLE_PAYLOAD_SHIFT 8 +#define IDLE_MSG_TYPE_MASK 0xf +#define IDLE_MSG_TYPE_SHIFT 0 + +/* idle flit message read fields */ +#define READ_IDLE_MSG_TYPE_MASK 0xf +#define READ_IDLE_MSG_TYPE_SHIFT 0 + +/* SMA idle flit payload commands */ +#define SMA_IDLE_ARM 1 +#define SMA_IDLE_ACTIVE 2 + +/* DC_DC8051_CFG_MODE.GENERAL bits */ +#define DISABLE_SELF_GUID_CHECK 0x2 + +/* Bad L2 frame error code */ +#define BAD_L2_ERR 0x6 + +/* + * Eager buffer minimum and maximum sizes supported by the hardware. + * All power-of-two sizes in between are supported as well. + * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory + * allocatable for Eager buffer to a single context. All others + * are limits for the RcvArray entries. + */ +#define MIN_EAGER_BUFFER (4 * 1024) +#define MAX_EAGER_BUFFER (256 * 1024) +#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */ +#define MAX_EXPECTED_BUFFER (2048 * 1024) + +/* + * Receive expected base and count and eager base and count increment - + * the CSR fields hold multiples of this value. 
+ */ +#define RCV_SHIFT 3 +#define RCV_INCREMENT BIT(RCV_SHIFT) + +/* + * Receive header queue entry increment - the CSR holds multiples of + * this value. + */ +#define HDRQ_SIZE_SHIFT 5 +#define HDRQ_INCREMENT BIT(HDRQ_SIZE_SHIFT) + +/* + * Freeze handling flags + */ +#define FREEZE_ABORT 0x01 /* do not do recovery */ +#define FREEZE_SELF 0x02 /* initiate the freeze */ +#define FREEZE_LINK_DOWN 0x04 /* link is down */ + +/* + * Chip implementation codes. + */ +#define ICODE_RTL_SILICON 0x00 +#define ICODE_RTL_VCS_SIMULATION 0x01 +#define ICODE_FPGA_EMULATION 0x02 +#define ICODE_FUNCTIONAL_SIMULATOR 0x03 + +/* + * 8051 data memory size. + */ +#define DC8051_DATA_MEM_SIZE 0x1000 + +/* + * 8051 firmware registers + */ +#define NUM_GENERAL_FIELDS 0x17 +#define NUM_LANE_FIELDS 0x8 + +/* 8051 general register Field IDs */ +#define LINK_OPTIMIZATION_SETTINGS 0x00 +#define LINK_TUNING_PARAMETERS 0x02 +#define DC_HOST_COMM_SETTINGS 0x03 +#define TX_SETTINGS 0x06 +#define VERIFY_CAP_LOCAL_PHY 0x07 +#define VERIFY_CAP_LOCAL_FABRIC 0x08 +#define VERIFY_CAP_LOCAL_LINK_WIDTH 0x09 +#define LOCAL_DEVICE_ID 0x0a +#define RESERVED_REGISTERS 0x0b +#define LOCAL_LNI_INFO 0x0c +#define REMOTE_LNI_INFO 0x0d +#define MISC_STATUS 0x0e +#define VERIFY_CAP_REMOTE_PHY 0x0f +#define VERIFY_CAP_REMOTE_FABRIC 0x10 +#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11 +#define LAST_LOCAL_STATE_COMPLETE 0x12 +#define LAST_REMOTE_STATE_COMPLETE 0x13 +#define LINK_QUALITY_INFO 0x14 +#define REMOTE_DEVICE_ID 0x15 +#define LINK_DOWN_REASON 0x16 /* first byte of offset 0x16 */ +#define VERSION_PATCH 0x16 /* last byte of offset 0x16 */ + +/* 8051 lane specific register field IDs */ +#define TX_EQ_SETTINGS 0x00 +#define CHANNEL_LOSS_SETTINGS 0x05 + +/* Lane ID for general configuration registers */ +#define GENERAL_CONFIG 4 + +/* LINK_TUNING_PARAMETERS fields */ +#define TUNING_METHOD_SHIFT 24 + +/* LINK_OPTIMIZATION_SETTINGS fields */ +#define ENABLE_EXT_DEV_CONFIG_SHIFT 24 + +/* LOAD_DATA 8051 command shifts and fields */ +#define LOAD_DATA_FIELD_ID_SHIFT 40 +#define LOAD_DATA_FIELD_ID_MASK 0xfull +#define LOAD_DATA_LANE_ID_SHIFT 32 +#define LOAD_DATA_LANE_ID_MASK 0xfull +#define LOAD_DATA_DATA_SHIFT 0x0 +#define LOAD_DATA_DATA_MASK 0xffffffffull + +/* READ_DATA 8051 command shifts and fields */ +#define READ_DATA_FIELD_ID_SHIFT 40 +#define READ_DATA_FIELD_ID_MASK 0xffull +#define READ_DATA_LANE_ID_SHIFT 32 +#define READ_DATA_LANE_ID_MASK 0xffull +#define READ_DATA_DATA_SHIFT 0x0 +#define READ_DATA_DATA_MASK 0xffffffffull + +/* TX settings fields */ +#define ENABLE_LANE_TX_SHIFT 0 +#define ENABLE_LANE_TX_MASK 0xff +#define TX_POLARITY_INVERSION_SHIFT 8 +#define TX_POLARITY_INVERSION_MASK 0xff +#define RX_POLARITY_INVERSION_SHIFT 16 +#define RX_POLARITY_INVERSION_MASK 0xff +#define MAX_RATE_SHIFT 24 +#define MAX_RATE_MASK 0xff + +/* verify capability PHY fields */ +#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4 +#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK 0x1 +#define POWER_MANAGEMENT_SHIFT 0x0 +#define POWER_MANAGEMENT_MASK 0xf + +/* 8051 lane register Field IDs */ +#define SPICO_FW_VERSION 0x7 /* SPICO firmware version */ + +/* SPICO firmware version fields */ +#define SPICO_ROM_VERSION_SHIFT 0 +#define SPICO_ROM_VERSION_MASK 0xffff +#define SPICO_ROM_PROD_ID_SHIFT 16 +#define SPICO_ROM_PROD_ID_MASK 0xffff + +/* verify capability fabric fields */ +#define VAU_SHIFT 0 +#define VAU_MASK 0x0007 +#define Z_SHIFT 3 +#define Z_MASK 0x0001 +#define VCU_SHIFT 4 +#define VCU_MASK 0x0007 +#define VL15BUF_SHIFT 8 +#define VL15BUF_MASK 
0x0fff +#define CRC_SIZES_SHIFT 20 +#define CRC_SIZES_MASK 0x7 + +/* verify capability local link width fields */ +#define LINK_WIDTH_SHIFT 0 /* also for remote link width */ +#define LINK_WIDTH_MASK 0xffff /* also for remote link width */ +#define LOCAL_FLAG_BITS_SHIFT 16 +#define LOCAL_FLAG_BITS_MASK 0xff +#define MISC_CONFIG_BITS_SHIFT 24 +#define MISC_CONFIG_BITS_MASK 0xff + +/* verify capability remote link width fields */ +#define REMOTE_TX_RATE_SHIFT 16 +#define REMOTE_TX_RATE_MASK 0xff + +/* LOCAL_DEVICE_ID fields */ +#define LOCAL_DEVICE_REV_SHIFT 0 +#define LOCAL_DEVICE_REV_MASK 0xff +#define LOCAL_DEVICE_ID_SHIFT 8 +#define LOCAL_DEVICE_ID_MASK 0xffff + +/* REMOTE_DEVICE_ID fields */ +#define REMOTE_DEVICE_REV_SHIFT 0 +#define REMOTE_DEVICE_REV_MASK 0xff +#define REMOTE_DEVICE_ID_SHIFT 8 +#define REMOTE_DEVICE_ID_MASK 0xffff + +/* local LNI link width fields */ +#define ENABLE_LANE_RX_SHIFT 16 +#define ENABLE_LANE_RX_MASK 0xff + +/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */ +#define MGMT_ALLOWED_SHIFT 23 +#define MGMT_ALLOWED_MASK 0x1 + +/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */ +#define LINK_QUALITY_SHIFT 24 +#define LINK_QUALITY_MASK 0x7 + +/* + * mask, shift for reading 'planned_down_remote_reason_code' + * from LINK_QUALITY_INFO field + */ +#define DOWN_REMOTE_REASON_SHIFT 16 +#define DOWN_REMOTE_REASON_MASK 0xff + +#define HOST_INTERFACE_VERSION 1 +#define HOST_INTERFACE_VERSION_SHIFT 16 +#define HOST_INTERFACE_VERSION_MASK 0xff + +/* verify capability PHY power management bits */ +#define PWRM_BER_CONTROL 0x1 +#define PWRM_BANDWIDTH_CONTROL 0x2 + +/* 8051 link down reasons */ +#define LDR_LINK_TRANSFER_ACTIVE_LOW 0xa +#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb +#define LDR_RECEIVED_HOST_OFFLINE_REQ 0xc + +/* verify capability fabric CRC size bits */ +enum { + CAP_CRC_14B = (1 << 0), /* 14b CRC */ + CAP_CRC_48B = (1 << 1), /* 48b CRC */ + CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */ +}; + +#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B) + +/* misc status version fields */ +#define STS_FM_VERSION_MINOR_SHIFT 16 +#define STS_FM_VERSION_MINOR_MASK 0xff +#define STS_FM_VERSION_MAJOR_SHIFT 24 +#define STS_FM_VERSION_MAJOR_MASK 0xff +#define STS_FM_VERSION_PATCH_SHIFT 24 +#define STS_FM_VERSION_PATCH_MASK 0xff + +/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */ +#define LCB_CRC_16B 0x0 /* 16b CRC */ +#define LCB_CRC_14B 0x1 /* 14b CRC */ +#define LCB_CRC_48B 0x2 /* 48b CRC */ +#define LCB_CRC_12B_16B_PER_LANE 0x3 /* 12b-16b per lane CRC */ + +/* + * the following enum is (almost) a copy/paste of the definition + * in the OPA spec, section 20.2.2.6.8 (PortInfo) + */ +enum { + PORT_LTP_CRC_MODE_NONE = 0, + PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */ + PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */ + PORT_LTP_CRC_MODE_48 = 4, + /* 48-bit overlapping LTP CRC mode (optional) */ + PORT_LTP_CRC_MODE_PER_LANE = 8 + /* 12 to 16 bit per lane LTP CRC mode (optional) */ +}; + +/* timeouts */ +#define LINK_RESTART_DELAY 1000 /* link restart delay, in ms */ +#define TIMEOUT_8051_START 5000 /* 8051 start timeout, in ms */ +#define DC8051_COMMAND_TIMEOUT 1000 /* DC8051 command timeout, in ms */ +#define FREEZE_STATUS_TIMEOUT 20 /* wait for freeze indicators, in ms */ +#define VL_STATUS_CLEAR_TIMEOUT 5000 /* per-VL status clear, in ms */ +#define CCE_STATUS_TIMEOUT 10 /* time to clear CCE Status, in ms */ + +/* cclock tick time, in picoseconds per tick: 1/speed * 10^12 */ +#define 
ASIC_CCLOCK_PS 1242 /* 805 MHz */ +#define FPGA_CCLOCK_PS 30300 /* 33 MHz */ + +/* + * Mask of enabled MISC errors. Do not enable the two RSA engine errors - + * see firmware.c:run_rsa() for details. + */ +#define DRIVER_MISC_MASK \ + (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \ + | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)) + +/* valid values for the loopback module parameter */ +#define LOOPBACK_NONE 0 /* no loopback - default */ +#define LOOPBACK_SERDES 1 +#define LOOPBACK_LCB 2 +#define LOOPBACK_CABLE 3 /* external cable */ + +/* set up serdes bit in MISC_CONFIG_BITS */ +#define LOOPBACK_SERDES_CONFIG_BIT_MASK_SHIFT 0 + +/* read and write hardware registers */ +u64 read_csr(const struct hfi1_devdata *dd, u32 offset); +void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value); + +/* + * The *_kctxt_* flavor of the CSR read/write functions are for + * per-context or per-SDMA CSRs that are not mappable to user-space. + * Their spacing is not a PAGE_SIZE multiple. + */ +static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt, + u32 offset0) +{ + /* kernel per-context CSRs are separated by 0x100 */ + return read_csr(dd, offset0 + (0x100 * ctxt)); +} + +static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt, + u32 offset0, u64 value) +{ + /* kernel per-context CSRs are separated by 0x100 */ + write_csr(dd, offset0 + (0x100 * ctxt), value); +} + +int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data); +int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data); + +void __iomem *get_csr_addr( + const struct hfi1_devdata *dd, + u32 offset); + +static inline void __iomem *get_kctxt_csr_addr( + const struct hfi1_devdata *dd, + int ctxt, + u32 offset0) +{ + return get_csr_addr(dd, offset0 + (0x100 * ctxt)); +} + +/* + * The *_uctxt_* flavor of the CSR read/write functions are for + * per-context CSRs that are mappable to user space. 
All these CSRs + * are spaced by a PAGE_SIZE multiple in order to be mappable to + * different processes without exposing other contexts' CSRs + */ +static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt, + u32 offset0) +{ + /* user per-context CSRs are separated by 0x1000 */ + return read_csr(dd, offset0 + (0x1000 * ctxt)); +} + +static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt, + u32 offset0, u64 value) +{ + /* user per-context CSRs are separated by 0x1000 */ + write_csr(dd, offset0 + (0x1000 * ctxt), value); +} + +u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl, + u32 dw_len); + +/* firmware.c */ +#define SBUS_MASTER_BROADCAST 0xfd +#define NUM_PCIE_SERDES 16 /* number of PCIe serdes on the SBus */ +extern const u8 pcie_serdes_broadcast[]; +extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES]; + +/* SBus commands */ +#define RESET_SBUS_RECEIVER 0x20 +#define WRITE_SBUS_RECEIVER 0x21 +#define READ_SBUS_RECEIVER 0x22 +void sbus_request(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in); +int sbus_request_slow(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in); +void set_sbus_fast_mode(struct hfi1_devdata *dd); +void clear_sbus_fast_mode(struct hfi1_devdata *dd); +int hfi1_firmware_init(struct hfi1_devdata *dd); +int load_pcie_firmware(struct hfi1_devdata *dd); +int load_firmware(struct hfi1_devdata *dd); +void dispose_firmware(void); +int acquire_hw_mutex(struct hfi1_devdata *dd); +void release_hw_mutex(struct hfi1_devdata *dd); + +/* + * Bitmask of dynamic access for ASIC block chip resources. Each HFI has its + * own range of bits for the resource so it can clear its own bits on + * starting and exiting. If either HFI has the resource bit set, the + * resource is in use. The separate bit ranges are: + * HFI0 bits 7:0 + * HFI1 bits 15:8 + */ +#define CR_SBUS 0x01 /* SBUS, THERM, and PCIE registers */ +#define CR_EPROM 0x02 /* EEP, GPIO registers */ +#define CR_I2C1 0x04 /* QSFP1_OE register */ +#define CR_I2C2 0x08 /* QSFP2_OE register */ +#define CR_DYN_SHIFT 8 /* dynamic flag shift */ +#define CR_DYN_MASK ((1ull << CR_DYN_SHIFT) - 1) + +/* + * Bitmask of static ASIC states these are outside of the dynamic ASIC + * block chip resources above. These are to be set once and never cleared. + * Must be holding the SBus dynamic flag when setting. 
+ */ +#define CR_THERM_INIT 0x010000 + +int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait); +void release_chip_resource(struct hfi1_devdata *dd, u32 resource); +bool check_chip_resource(struct hfi1_devdata *dd, u32 resource, + const char *func); +void init_chip_resources(struct hfi1_devdata *dd); +void finish_chip_resources(struct hfi1_devdata *dd); + +/* ms wait time for access to an SBus resoure */ +#define SBUS_TIMEOUT 4000 /* long enough for a FW download and SBR */ + +/* ms wait time for a qsfp (i2c) chain to become available */ +#define QSFP_WAIT 20000 /* long enough for FW update to the F4 uc */ + +void fabric_serdes_reset(struct hfi1_devdata *dd); +int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result); + +/* chip.c */ +void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor, + u8 *ver_patch); +int write_host_interface_version(struct hfi1_devdata *dd, u8 version); +void read_guid(struct hfi1_devdata *dd); +int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout); +void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, + u8 neigh_reason, u8 rem_reason); +int set_link_state(struct hfi1_pportdata *, u32 state); +int port_ltp_to_cap(int port_ltp); +void handle_verify_cap(struct work_struct *work); +void handle_freeze(struct work_struct *work); +void handle_link_up(struct work_struct *work); +void handle_link_down(struct work_struct *work); +void handle_link_downgrade(struct work_struct *work); +void handle_link_bounce(struct work_struct *work); +void handle_start_link(struct work_struct *work); +void handle_sma_message(struct work_struct *work); +int reset_qsfp(struct hfi1_pportdata *ppd); +void qsfp_event(struct work_struct *work); +void start_freeze_handling(struct hfi1_pportdata *ppd, int flags); +int send_idle_sma(struct hfi1_devdata *dd, u64 message); +int load_8051_config(struct hfi1_devdata *, u8, u8, u32); +int read_8051_config(struct hfi1_devdata *, u8, u8, u32 *); +int start_link(struct hfi1_pportdata *ppd); +int bringup_serdes(struct hfi1_pportdata *ppd); +void set_intr_state(struct hfi1_devdata *dd, u32 enable); +bool apply_link_downgrade_policy(struct hfi1_pportdata *ppd, + bool refresh_widths); +void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd, + u32 intr_adjust, u32 npkts); +int stop_drain_data_vls(struct hfi1_devdata *dd); +int open_fill_data_vls(struct hfi1_devdata *dd); +u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns); +u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock); +void get_linkup_link_widths(struct hfi1_pportdata *ppd); +void read_ltp_rtt(struct hfi1_devdata *dd); +void clear_linkup_counters(struct hfi1_devdata *dd); +u32 hdrqempty(struct hfi1_ctxtdata *rcd); +int is_ax(struct hfi1_devdata *dd); +int is_bx(struct hfi1_devdata *dd); +u32 read_physical_state(struct hfi1_devdata *dd); +u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); +const char *opa_lstate_name(u32 lstate); +const char *opa_pstate_name(u32 pstate); +u32 driver_pstate(struct hfi1_pportdata *ppd); +u32 driver_lstate(struct hfi1_pportdata *ppd); + +int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok); +int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok); +#define LCB_START DC_LCB_CSRS +#define LCB_END DC_8051_CSRS /* next block is 8051 */ +static inline int is_lcb_offset(u32 offset) +{ + return (offset >= LCB_START && offset < LCB_END); +} + +extern uint num_vls; + +extern uint disable_integrity; +u64 read_dev_cntr(struct hfi1_devdata *dd, int index, 
int vl); +u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data); +u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl); +u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data); +u32 read_logical_state(struct hfi1_devdata *dd); +void force_recv_intr(struct hfi1_ctxtdata *rcd); + +/* Per VL indexes */ +enum { + C_VL_0 = 0, + C_VL_1, + C_VL_2, + C_VL_3, + C_VL_4, + C_VL_5, + C_VL_6, + C_VL_7, + C_VL_15, + C_VL_COUNT +}; + +static inline int vl_from_idx(int idx) +{ + return (idx == C_VL_15 ? 15 : idx); +} + +static inline int idx_from_vl(int vl) +{ + return (vl == 15 ? C_VL_15 : vl); +} + +/* Per device counter indexes */ +enum { + C_RCV_OVF = 0, + C_RX_TID_FULL, + C_RX_TID_INVALID, + C_RX_TID_FLGMS, + C_RX_CTX_EGRS, + C_RCV_TID_FLSMS, + C_CCE_PCI_CR_ST, + C_CCE_PCI_TR_ST, + C_CCE_PIO_WR_ST, + C_CCE_ERR_INT, + C_CCE_SDMA_INT, + C_CCE_MISC_INT, + C_CCE_RCV_AV_INT, + C_CCE_RCV_URG_INT, + C_CCE_SEND_CR_INT, + C_DC_UNC_ERR, + C_DC_RCV_ERR, + C_DC_FM_CFG_ERR, + C_DC_RMT_PHY_ERR, + C_DC_DROPPED_PKT, + C_DC_MC_XMIT_PKTS, + C_DC_MC_RCV_PKTS, + C_DC_XMIT_CERR, + C_DC_RCV_CERR, + C_DC_RCV_FCC, + C_DC_XMIT_FCC, + C_DC_XMIT_FLITS, + C_DC_RCV_FLITS, + C_DC_XMIT_PKTS, + C_DC_RCV_PKTS, + C_DC_RX_FLIT_VL, + C_DC_RX_PKT_VL, + C_DC_RCV_FCN, + C_DC_RCV_FCN_VL, + C_DC_RCV_BCN, + C_DC_RCV_BCN_VL, + C_DC_RCV_BBL, + C_DC_RCV_BBL_VL, + C_DC_MARK_FECN, + C_DC_MARK_FECN_VL, + C_DC_TOTAL_CRC, + C_DC_CRC_LN0, + C_DC_CRC_LN1, + C_DC_CRC_LN2, + C_DC_CRC_LN3, + C_DC_CRC_MULT_LN, + C_DC_TX_REPLAY, + C_DC_RX_REPLAY, + C_DC_SEQ_CRC_CNT, + C_DC_ESC0_ONLY_CNT, + C_DC_ESC0_PLUS1_CNT, + C_DC_ESC0_PLUS2_CNT, + C_DC_REINIT_FROM_PEER_CNT, + C_DC_SBE_CNT, + C_DC_MISC_FLG_CNT, + C_DC_PRF_GOOD_LTP_CNT, + C_DC_PRF_ACCEPTED_LTP_CNT, + C_DC_PRF_RX_FLIT_CNT, + C_DC_PRF_TX_FLIT_CNT, + C_DC_PRF_CLK_CNTR, + C_DC_PG_DBG_FLIT_CRDTS_CNT, + C_DC_PG_STS_PAUSE_COMPLETE_CNT, + C_DC_PG_STS_TX_SBE_CNT, + C_DC_PG_STS_TX_MBE_CNT, + C_SW_CPU_INTR, + C_SW_CPU_RCV_LIM, + C_SW_VTX_WAIT, + C_SW_PIO_WAIT, + C_SW_PIO_DRAIN, + C_SW_KMEM_WAIT, + C_SW_SEND_SCHED, + C_SDMA_DESC_FETCHED_CNT, + C_SDMA_INT_CNT, + C_SDMA_ERR_CNT, + C_SDMA_IDLE_INT_CNT, + C_SDMA_PROGRESS_INT_CNT, +/* MISC_ERR_STATUS */ + C_MISC_PLL_LOCK_FAIL_ERR, + C_MISC_MBIST_FAIL_ERR, + C_MISC_INVALID_EEP_CMD_ERR, + C_MISC_EFUSE_DONE_PARITY_ERR, + C_MISC_EFUSE_WRITE_ERR, + C_MISC_EFUSE_READ_BAD_ADDR_ERR, + C_MISC_EFUSE_CSR_PARITY_ERR, + C_MISC_FW_AUTH_FAILED_ERR, + C_MISC_KEY_MISMATCH_ERR, + C_MISC_SBUS_WRITE_FAILED_ERR, + C_MISC_CSR_WRITE_BAD_ADDR_ERR, + C_MISC_CSR_READ_BAD_ADDR_ERR, + C_MISC_CSR_PARITY_ERR, +/* CceErrStatus */ + /* + * A special counter that is the aggregate count + * of all the cce_err_status errors. The remainder + * are actual bits in the CceErrStatus register. 
+ */ + C_CCE_ERR_STATUS_AGGREGATED_CNT, + C_CCE_MSIX_CSR_PARITY_ERR, + C_CCE_INT_MAP_UNC_ERR, + C_CCE_INT_MAP_COR_ERR, + C_CCE_MSIX_TABLE_UNC_ERR, + C_CCE_MSIX_TABLE_COR_ERR, + C_CCE_RXDMA_CONV_FIFO_PARITY_ERR, + C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR, + C_CCE_SEG_WRITE_BAD_ADDR_ERR, + C_CCE_SEG_READ_BAD_ADDR_ERR, + C_LA_TRIGGERED, + C_CCE_TRGT_CPL_TIMEOUT_ERR, + C_PCIC_RECEIVE_PARITY_ERR, + C_PCIC_TRANSMIT_BACK_PARITY_ERR, + C_PCIC_TRANSMIT_FRONT_PARITY_ERR, + C_PCIC_CPL_DAT_Q_UNC_ERR, + C_PCIC_CPL_HD_Q_UNC_ERR, + C_PCIC_POST_DAT_Q_UNC_ERR, + C_PCIC_POST_HD_Q_UNC_ERR, + C_PCIC_RETRY_SOT_MEM_UNC_ERR, + C_PCIC_RETRY_MEM_UNC_ERR, + C_PCIC_N_POST_DAT_Q_PARITY_ERR, + C_PCIC_N_POST_H_Q_PARITY_ERR, + C_PCIC_CPL_DAT_Q_COR_ERR, + C_PCIC_CPL_HD_Q_COR_ERR, + C_PCIC_POST_DAT_Q_COR_ERR, + C_PCIC_POST_HD_Q_COR_ERR, + C_PCIC_RETRY_SOT_MEM_COR_ERR, + C_PCIC_RETRY_MEM_COR_ERR, + C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR, + C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR, + C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR, + C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR, + C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR, + C_CCE_CSR_CFG_BUS_PARITY_ERR, + C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR, + C_CCE_RSPD_DATA_PARITY_ERR, + C_CCE_TRGT_ACCESS_ERR, + C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR, + C_CCE_CSR_WRITE_BAD_ADDR_ERR, + C_CCE_CSR_READ_BAD_ADDR_ERR, + C_CCE_CSR_PARITY_ERR, +/* RcvErrStatus */ + C_RX_CSR_PARITY_ERR, + C_RX_CSR_WRITE_BAD_ADDR_ERR, + C_RX_CSR_READ_BAD_ADDR_ERR, + C_RX_DMA_CSR_UNC_ERR, + C_RX_DMA_DQ_FSM_ENCODING_ERR, + C_RX_DMA_EQ_FSM_ENCODING_ERR, + C_RX_DMA_CSR_PARITY_ERR, + C_RX_RBUF_DATA_COR_ERR, + C_RX_RBUF_DATA_UNC_ERR, + C_RX_DMA_DATA_FIFO_RD_COR_ERR, + C_RX_DMA_DATA_FIFO_RD_UNC_ERR, + C_RX_DMA_HDR_FIFO_RD_COR_ERR, + C_RX_DMA_HDR_FIFO_RD_UNC_ERR, + C_RX_RBUF_DESC_PART2_COR_ERR, + C_RX_RBUF_DESC_PART2_UNC_ERR, + C_RX_RBUF_DESC_PART1_COR_ERR, + C_RX_RBUF_DESC_PART1_UNC_ERR, + C_RX_HQ_INTR_FSM_ERR, + C_RX_HQ_INTR_CSR_PARITY_ERR, + C_RX_LOOKUP_CSR_PARITY_ERR, + C_RX_LOOKUP_RCV_ARRAY_COR_ERR, + C_RX_LOOKUP_RCV_ARRAY_UNC_ERR, + C_RX_LOOKUP_DES_PART2_PARITY_ERR, + C_RX_LOOKUP_DES_PART1_UNC_COR_ERR, + C_RX_LOOKUP_DES_PART1_UNC_ERR, + C_RX_RBUF_NEXT_FREE_BUF_COR_ERR, + C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR, + C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR, + C_RX_RBUF_FL_INITDONE_PARITY_ERR, + C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR, + C_RX_RBUF_FL_RD_ADDR_PARITY_ERR, + C_RX_RBUF_EMPTY_ERR, + C_RX_RBUF_FULL_ERR, + C_RX_RBUF_BAD_LOOKUP_ERR, + C_RX_RBUF_CTX_ID_PARITY_ERR, + C_RX_RBUF_CSR_QEOPDW_PARITY_ERR, + C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR, + C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR, + C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR, + C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR, + C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR, + C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR, + C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR, + C_RX_RBUF_BLOCK_LIST_READ_COR_ERR, + C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR, + C_RX_RBUF_LOOKUP_DES_COR_ERR, + C_RX_RBUF_LOOKUP_DES_UNC_ERR, + C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR, + C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR, + C_RX_RBUF_FREE_LIST_COR_ERR, + C_RX_RBUF_FREE_LIST_UNC_ERR, + C_RX_RCV_FSM_ENCODING_ERR, + C_RX_DMA_FLAG_COR_ERR, + C_RX_DMA_FLAG_UNC_ERR, + C_RX_DC_SOP_EOP_PARITY_ERR, + C_RX_RCV_CSR_PARITY_ERR, + C_RX_RCV_QP_MAP_TABLE_COR_ERR, + C_RX_RCV_QP_MAP_TABLE_UNC_ERR, + C_RX_RCV_DATA_COR_ERR, + C_RX_RCV_DATA_UNC_ERR, + C_RX_RCV_HDR_COR_ERR, + C_RX_RCV_HDR_UNC_ERR, + C_RX_DC_INTF_PARITY_ERR, + C_RX_DMA_CSR_COR_ERR, +/* SendPioErrStatus */ + C_PIO_PEC_SOP_HEAD_PARITY_ERR, + C_PIO_PCC_SOP_HEAD_PARITY_ERR, + C_PIO_LAST_RETURNED_CNT_PARITY_ERR, + C_PIO_CURRENT_FREE_CNT_PARITY_ERR, + C_PIO_RSVD_31_ERR, + 
C_PIO_RSVD_30_ERR, + C_PIO_PPMC_SOP_LEN_ERR, + C_PIO_PPMC_BQC_MEM_PARITY_ERR, + C_PIO_VL_FIFO_PARITY_ERR, + C_PIO_VLF_SOP_PARITY_ERR, + C_PIO_VLF_V1_LEN_PARITY_ERR, + C_PIO_BLOCK_QW_COUNT_PARITY_ERR, + C_PIO_WRITE_QW_VALID_PARITY_ERR, + C_PIO_STATE_MACHINE_ERR, + C_PIO_WRITE_DATA_PARITY_ERR, + C_PIO_HOST_ADDR_MEM_COR_ERR, + C_PIO_HOST_ADDR_MEM_UNC_ERR, + C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR, + C_PIO_INIT_SM_IN_ERR, + C_PIO_PPMC_PBL_FIFO_ERR, + C_PIO_CREDIT_RET_FIFO_PARITY_ERR, + C_PIO_V1_LEN_MEM_BANK1_COR_ERR, + C_PIO_V1_LEN_MEM_BANK0_COR_ERR, + C_PIO_V1_LEN_MEM_BANK1_UNC_ERR, + C_PIO_V1_LEN_MEM_BANK0_UNC_ERR, + C_PIO_SM_PKT_RESET_PARITY_ERR, + C_PIO_PKT_EVICT_FIFO_PARITY_ERR, + C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR, + C_PIO_SBRDCTL_CRREL_PARITY_ERR, + C_PIO_PEC_FIFO_PARITY_ERR, + C_PIO_PCC_FIFO_PARITY_ERR, + C_PIO_SB_MEM_FIFO1_ERR, + C_PIO_SB_MEM_FIFO0_ERR, + C_PIO_CSR_PARITY_ERR, + C_PIO_WRITE_ADDR_PARITY_ERR, + C_PIO_WRITE_BAD_CTXT_ERR, +/* SendDmaErrStatus */ + C_SDMA_PCIE_REQ_TRACKING_COR_ERR, + C_SDMA_PCIE_REQ_TRACKING_UNC_ERR, + C_SDMA_CSR_PARITY_ERR, + C_SDMA_RPY_TAG_ERR, +/* SendEgressErrStatus */ + C_TX_READ_PIO_MEMORY_CSR_UNC_ERR, + C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR, + C_TX_EGRESS_FIFO_COR_ERR, + C_TX_READ_PIO_MEMORY_COR_ERR, + C_TX_READ_SDMA_MEMORY_COR_ERR, + C_TX_SB_HDR_COR_ERR, + C_TX_CREDIT_OVERRUN_ERR, + C_TX_LAUNCH_FIFO8_COR_ERR, + C_TX_LAUNCH_FIFO7_COR_ERR, + C_TX_LAUNCH_FIFO6_COR_ERR, + C_TX_LAUNCH_FIFO5_COR_ERR, + C_TX_LAUNCH_FIFO4_COR_ERR, + C_TX_LAUNCH_FIFO3_COR_ERR, + C_TX_LAUNCH_FIFO2_COR_ERR, + C_TX_LAUNCH_FIFO1_COR_ERR, + C_TX_LAUNCH_FIFO0_COR_ERR, + C_TX_CREDIT_RETURN_VL_ERR, + C_TX_HCRC_INSERTION_ERR, + C_TX_EGRESS_FIFI_UNC_ERR, + C_TX_READ_PIO_MEMORY_UNC_ERR, + C_TX_READ_SDMA_MEMORY_UNC_ERR, + C_TX_SB_HDR_UNC_ERR, + C_TX_CREDIT_RETURN_PARITY_ERR, + C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR, + C_TX_SDMA15_DISALLOWED_PACKET_ERR, + C_TX_SDMA14_DISALLOWED_PACKET_ERR, + C_TX_SDMA13_DISALLOWED_PACKET_ERR, + C_TX_SDMA12_DISALLOWED_PACKET_ERR, + C_TX_SDMA11_DISALLOWED_PACKET_ERR, + C_TX_SDMA10_DISALLOWED_PACKET_ERR, + C_TX_SDMA9_DISALLOWED_PACKET_ERR, + C_TX_SDMA8_DISALLOWED_PACKET_ERR, + C_TX_SDMA7_DISALLOWED_PACKET_ERR, + C_TX_SDMA6_DISALLOWED_PACKET_ERR, + C_TX_SDMA5_DISALLOWED_PACKET_ERR, + C_TX_SDMA4_DISALLOWED_PACKET_ERR, + C_TX_SDMA3_DISALLOWED_PACKET_ERR, + C_TX_SDMA2_DISALLOWED_PACKET_ERR, + C_TX_SDMA1_DISALLOWED_PACKET_ERR, + C_TX_SDMA0_DISALLOWED_PACKET_ERR, + C_TX_CONFIG_PARITY_ERR, + C_TX_SBRD_CTL_CSR_PARITY_ERR, + C_TX_LAUNCH_CSR_PARITY_ERR, + C_TX_ILLEGAL_CL_ERR, + C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR, + C_TX_RESERVED_10, + C_TX_RESERVED_9, + C_TX_SDMA_LAUNCH_INTF_PARITY_ERR, + C_TX_PIO_LAUNCH_INTF_PARITY_ERR, + C_TX_RESERVED_6, + C_TX_INCORRECT_LINK_STATE_ERR, + C_TX_LINK_DOWN_ERR, + C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR, + C_TX_RESERVED_2, + C_TX_PKT_INTEGRITY_MEM_UNC_ERR, + C_TX_PKT_INTEGRITY_MEM_COR_ERR, +/* SendErrStatus */ + C_SEND_CSR_WRITE_BAD_ADDR_ERR, + C_SEND_CSR_READ_BAD_ADD_ERR, + C_SEND_CSR_PARITY_ERR, +/* SendCtxtErrStatus */ + C_PIO_WRITE_OUT_OF_BOUNDS_ERR, + C_PIO_WRITE_OVERFLOW_ERR, + C_PIO_WRITE_CROSSES_BOUNDARY_ERR, + C_PIO_DISALLOWED_PACKET_ERR, + C_PIO_INCONSISTENT_SOP_ERR, +/*SendDmaEngErrStatus */ + 
C_SDMA_HEADER_REQUEST_FIFO_COR_ERR, + C_SDMA_HEADER_STORAGE_COR_ERR, + C_SDMA_PACKET_TRACKING_COR_ERR, + C_SDMA_ASSEMBLY_COR_ERR, + C_SDMA_DESC_TABLE_COR_ERR, + C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR, + C_SDMA_HEADER_STORAGE_UNC_ERR, + C_SDMA_PACKET_TRACKING_UNC_ERR, + C_SDMA_ASSEMBLY_UNC_ERR, + C_SDMA_DESC_TABLE_UNC_ERR, + C_SDMA_TIMEOUT_ERR, + C_SDMA_HEADER_LENGTH_ERR, + C_SDMA_HEADER_ADDRESS_ERR, + C_SDMA_HEADER_SELECT_ERR, + C_SMDA_RESERVED_9, + C_SDMA_PACKET_DESC_OVERFLOW_ERR, + C_SDMA_LENGTH_MISMATCH_ERR, + C_SDMA_HALT_ERR, + C_SDMA_MEM_READ_ERR, + C_SDMA_FIRST_DESC_ERR, + C_SDMA_TAIL_OUT_OF_BOUNDS_ERR, + C_SDMA_TOO_LONG_ERR, + C_SDMA_GEN_MISMATCH_ERR, + C_SDMA_WRONG_DW_ERR, + DEV_CNTR_LAST /* Must be kept last */ +}; + +/* Per port counter indexes */ +enum { + C_TX_UNSUP_VL = 0, + C_TX_INVAL_LEN, + C_TX_MM_LEN_ERR, + C_TX_UNDERRUN, + C_TX_FLOW_STALL, + C_TX_DROPPED, + C_TX_HDR_ERR, + C_TX_PKT, + C_TX_WORDS, + C_TX_WAIT, + C_TX_FLIT_VL, + C_TX_PKT_VL, + C_TX_WAIT_VL, + C_RX_PKT, + C_RX_WORDS, + C_SW_LINK_DOWN, + C_SW_LINK_UP, + C_SW_UNKNOWN_FRAME, + C_SW_XMIT_DSCD, + C_SW_XMIT_DSCD_VL, + C_SW_XMIT_CSTR_ERR, + C_SW_RCV_CSTR_ERR, + C_SW_IBP_LOOP_PKTS, + C_SW_IBP_RC_RESENDS, + C_SW_IBP_RNR_NAKS, + C_SW_IBP_OTHER_NAKS, + C_SW_IBP_RC_TIMEOUTS, + C_SW_IBP_PKT_DROPS, + C_SW_IBP_DMA_WAIT, + C_SW_IBP_RC_SEQNAK, + C_SW_IBP_RC_DUPREQ, + C_SW_IBP_RDMA_SEQ, + C_SW_IBP_UNALIGNED, + C_SW_IBP_SEQ_NAK, + C_SW_CPU_RC_ACKS, + C_SW_CPU_RC_QACKS, + C_SW_CPU_RC_DELAYED_COMP, + C_RCV_HDR_OVF_0, + C_RCV_HDR_OVF_1, + C_RCV_HDR_OVF_2, + C_RCV_HDR_OVF_3, + C_RCV_HDR_OVF_4, + C_RCV_HDR_OVF_5, + C_RCV_HDR_OVF_6, + C_RCV_HDR_OVF_7, + C_RCV_HDR_OVF_8, + C_RCV_HDR_OVF_9, + C_RCV_HDR_OVF_10, + C_RCV_HDR_OVF_11, + C_RCV_HDR_OVF_12, + C_RCV_HDR_OVF_13, + C_RCV_HDR_OVF_14, + C_RCV_HDR_OVF_15, + C_RCV_HDR_OVF_16, + C_RCV_HDR_OVF_17, + C_RCV_HDR_OVF_18, + C_RCV_HDR_OVF_19, + C_RCV_HDR_OVF_20, + C_RCV_HDR_OVF_21, + C_RCV_HDR_OVF_22, + C_RCV_HDR_OVF_23, + C_RCV_HDR_OVF_24, + C_RCV_HDR_OVF_25, + C_RCV_HDR_OVF_26, + C_RCV_HDR_OVF_27, + C_RCV_HDR_OVF_28, + C_RCV_HDR_OVF_29, + C_RCV_HDR_OVF_30, + C_RCV_HDR_OVF_31, + C_RCV_HDR_OVF_32, + C_RCV_HDR_OVF_33, + C_RCV_HDR_OVF_34, + C_RCV_HDR_OVF_35, + C_RCV_HDR_OVF_36, + C_RCV_HDR_OVF_37, + C_RCV_HDR_OVF_38, + C_RCV_HDR_OVF_39, + C_RCV_HDR_OVF_40, + C_RCV_HDR_OVF_41, + C_RCV_HDR_OVF_42, + C_RCV_HDR_OVF_43, + C_RCV_HDR_OVF_44, + C_RCV_HDR_OVF_45, + C_RCV_HDR_OVF_46, + C_RCV_HDR_OVF_47, + C_RCV_HDR_OVF_48, + C_RCV_HDR_OVF_49, + C_RCV_HDR_OVF_50, + C_RCV_HDR_OVF_51, + C_RCV_HDR_OVF_52, + C_RCV_HDR_OVF_53, + C_RCV_HDR_OVF_54, + C_RCV_HDR_OVF_55, + C_RCV_HDR_OVF_56, + C_RCV_HDR_OVF_57, + C_RCV_HDR_OVF_58, + C_RCV_HDR_OVF_59, + C_RCV_HDR_OVF_60, + C_RCV_HDR_OVF_61, + C_RCV_HDR_OVF_62, + C_RCV_HDR_OVF_63, + C_RCV_HDR_OVF_64, + C_RCV_HDR_OVF_65, + C_RCV_HDR_OVF_66, + C_RCV_HDR_OVF_67, + C_RCV_HDR_OVF_68, + C_RCV_HDR_OVF_69, + C_RCV_HDR_OVF_70, + C_RCV_HDR_OVF_71, + C_RCV_HDR_OVF_72, + C_RCV_HDR_OVF_73, + C_RCV_HDR_OVF_74, + C_RCV_HDR_OVF_75, + C_RCV_HDR_OVF_76, + C_RCV_HDR_OVF_77, + C_RCV_HDR_OVF_78, + C_RCV_HDR_OVF_79, + C_RCV_HDR_OVF_80, + C_RCV_HDR_OVF_81, + C_RCV_HDR_OVF_82, + C_RCV_HDR_OVF_83, + C_RCV_HDR_OVF_84, + C_RCV_HDR_OVF_85, + C_RCV_HDR_OVF_86, + C_RCV_HDR_OVF_87, + C_RCV_HDR_OVF_88, + C_RCV_HDR_OVF_89, + C_RCV_HDR_OVF_90, + C_RCV_HDR_OVF_91, + C_RCV_HDR_OVF_92, + C_RCV_HDR_OVF_93, + C_RCV_HDR_OVF_94, + C_RCV_HDR_OVF_95, + C_RCV_HDR_OVF_96, + C_RCV_HDR_OVF_97, + C_RCV_HDR_OVF_98, + C_RCV_HDR_OVF_99, + C_RCV_HDR_OVF_100, + C_RCV_HDR_OVF_101, + C_RCV_HDR_OVF_102, + C_RCV_HDR_OVF_103, 
+ C_RCV_HDR_OVF_104, + C_RCV_HDR_OVF_105, + C_RCV_HDR_OVF_106, + C_RCV_HDR_OVF_107, + C_RCV_HDR_OVF_108, + C_RCV_HDR_OVF_109, + C_RCV_HDR_OVF_110, + C_RCV_HDR_OVF_111, + C_RCV_HDR_OVF_112, + C_RCV_HDR_OVF_113, + C_RCV_HDR_OVF_114, + C_RCV_HDR_OVF_115, + C_RCV_HDR_OVF_116, + C_RCV_HDR_OVF_117, + C_RCV_HDR_OVF_118, + C_RCV_HDR_OVF_119, + C_RCV_HDR_OVF_120, + C_RCV_HDR_OVF_121, + C_RCV_HDR_OVF_122, + C_RCV_HDR_OVF_123, + C_RCV_HDR_OVF_124, + C_RCV_HDR_OVF_125, + C_RCV_HDR_OVF_126, + C_RCV_HDR_OVF_127, + C_RCV_HDR_OVF_128, + C_RCV_HDR_OVF_129, + C_RCV_HDR_OVF_130, + C_RCV_HDR_OVF_131, + C_RCV_HDR_OVF_132, + C_RCV_HDR_OVF_133, + C_RCV_HDR_OVF_134, + C_RCV_HDR_OVF_135, + C_RCV_HDR_OVF_136, + C_RCV_HDR_OVF_137, + C_RCV_HDR_OVF_138, + C_RCV_HDR_OVF_139, + C_RCV_HDR_OVF_140, + C_RCV_HDR_OVF_141, + C_RCV_HDR_OVF_142, + C_RCV_HDR_OVF_143, + C_RCV_HDR_OVF_144, + C_RCV_HDR_OVF_145, + C_RCV_HDR_OVF_146, + C_RCV_HDR_OVF_147, + C_RCV_HDR_OVF_148, + C_RCV_HDR_OVF_149, + C_RCV_HDR_OVF_150, + C_RCV_HDR_OVF_151, + C_RCV_HDR_OVF_152, + C_RCV_HDR_OVF_153, + C_RCV_HDR_OVF_154, + C_RCV_HDR_OVF_155, + C_RCV_HDR_OVF_156, + C_RCV_HDR_OVF_157, + C_RCV_HDR_OVF_158, + C_RCV_HDR_OVF_159, + PORT_CNTR_LAST /* Must be kept last */ +}; + +u64 get_all_cpu_total(u64 __percpu *cntr); +void hfi1_start_cleanup(struct hfi1_devdata *dd); +void hfi1_clear_tids(struct hfi1_ctxtdata *rcd); +void hfi1_init_ctxt(struct send_context *sc); +void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, + u32 type, unsigned long pa, u16 order); +void hfi1_quiet_serdes(struct hfi1_pportdata *ppd); +void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, + struct hfi1_ctxtdata *rcd); +u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp); +u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp); +int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which); +int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val); +int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd, + u16 jkey); +int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt); +int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt, + u16 pkey); +int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt); +void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality); +void hfi1_init_vnic_rsm(struct hfi1_devdata *dd); +void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd); + +/* + * Interrupt source table. + * + * Each entry is an interrupt source "type". It is ordered by increasing + * number. + */ +struct is_table { + int start; /* interrupt source type start */ + int end; /* interrupt source type end */ + /* routine that returns the name of the interrupt source */ + char *(*is_name)(char *name, size_t size, unsigned int source); + /* routine to call when receiving an interrupt */ + void (*is_int)(struct hfi1_devdata *dd, unsigned int source); +}; + +#endif /* _CHIP_H */ diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h new file mode 100644 index 0000000..793514f --- /dev/null +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -0,0 +1,1316 @@ +#ifndef DEF_CHIP_REG +#define DEF_CHIP_REG + +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. 
+ * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define CORE 0x000000000000 +#define CCE (CORE + 0x000000000000) +#define ASIC (CORE + 0x000000400000) +#define MISC (CORE + 0x000000500000) +#define DC_TOP_CSRS (CORE + 0x000000600000) +#define CHIP_DEBUG (CORE + 0x000000700000) +#define RXE (CORE + 0x000001000000) +#define TXE (CORE + 0x000001800000) +#define DCC_CSRS (DC_TOP_CSRS + 0x000000000000) +#define DC_LCB_CSRS (DC_TOP_CSRS + 0x000000001000) +#define DC_8051_CSRS (DC_TOP_CSRS + 0x000000002000) +#define PCIE 0 + +#define ASIC_NUM_SCRATCH 4 +#define CCE_ERR_INT_CNT 0 +#define CCE_MISC_INT_CNT 2 +#define CCE_NUM_32_BIT_COUNTERS 3 +#define CCE_NUM_32_BIT_INT_COUNTERS 6 +#define CCE_NUM_INT_CSRS 12 +#define CCE_NUM_INT_MAP_CSRS 96 +#define CCE_NUM_MSIX_PBAS 4 +#define CCE_NUM_MSIX_VECTORS 256 +#define CCE_NUM_SCRATCH 4 +#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2 +#define CCE_PCIE_TRGT_STALL_CNT 0 +#define CCE_PIO_WR_STALL_CNT 1 +#define CCE_RCV_AVAIL_INT_CNT 3 +#define CCE_RCV_URGENT_INT_CNT 4 +#define CCE_SDMA_INT_CNT 1 +#define CCE_SEND_CREDIT_INT_CNT 5 +#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040) +#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull +#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0 +#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull +#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008) +#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010) +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16 +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0 +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull +#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48 +#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull +#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull +#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32 +#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull +#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000) +#define DCC_CFG_RESET_RESET_LCB_SHIFT 0 +#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2 +#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028) +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36 +#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030) +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32 +#define 
DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60 +#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120) +#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050) +#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull +#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull +#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull +#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull +#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull +#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull +#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull +#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull +#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull +#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull +#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull +#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull +#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull +#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060) +#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull +#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull +#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull +#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull +#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull +#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058) +#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull +#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull +#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull +#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull +#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull +#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull +#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull +#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull +#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull +#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull +#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull +#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull +#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull +#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull +#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull +#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull +#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull +#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull +#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull +#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull +#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull +#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull +#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull +#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull +#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull +#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull +#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull +#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull +#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull +#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull +#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110) +#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090) +#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078) +#define 
DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080) +#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088) +#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098) +#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108) +#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118) +#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100) +#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330) +#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290) +#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0) +#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140) +#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198) +#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240) +#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130) +#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8) +#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338) +#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298) +#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8) +#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0) +#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248) +#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8) +#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138) +#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190) +#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128) +#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0) +#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180) +#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188) +#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110) +#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull +#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118) +#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8 +#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16 +#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120) +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16 +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028) +#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16 +#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030) +#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull +#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull +#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull +#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16 +#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038) +#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070) +#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008) +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0 +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 
0x1000000ull +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull +#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000) +#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull +#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull +#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018) +#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull +#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010) +#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020) +#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068) +#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull +#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull +#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull +#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull +#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8) +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16 +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0 +#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8) +#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0) +#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull +#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0) +#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull +#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull +#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull +#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull +#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull +#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull +#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull +#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull +#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull +#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull +#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060) +#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull +#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16 +#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull +#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0 +#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050) +#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull +#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058) +#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040) +#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048) +#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull +#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130) +#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull +#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128) +#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0 +#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058) +#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0 +#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020) +#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull +#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100) +#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120) +#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull +#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull +#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060) +#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8) +#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0 +#define 
DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000) +#define DC_LCB_CFG_RUN_EN_SHIFT 0 +#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018) +#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8 +#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4 +#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0 +#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010) +#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0 +#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008) +#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0 +#define DC_LCB_CFG_REINIT_AS_SLAVE (DC_LCB_CSRS + 0x000000000030) +#define DC_LCB_CFG_CNT_FOR_SKIP_STALL (DC_LCB_CSRS + 0x000000000040) +#define DC_LCB_CFG_CLK_CNTR (DC_LCB_CSRS + 0x000000000110) +#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308) +#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310) +#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300) +#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull +#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull +#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull +#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull +#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull +#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull +#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull +#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull +#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull +#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull +#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull +#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull +#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull +#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull +#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull +#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull +#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull +#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull +#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull +#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull +#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull +#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull +#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull +#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull +#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328) +#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330) +#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338) +#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340) +#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348) +#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368) +#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370) +#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378) +#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390) +#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380) +#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358) +#define DC_LCB_ERR_INFO_SBE_CNT 
(DC_LCB_CSRS + 0x000000000388) +#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360) +#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320) +#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350) +#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580) +#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8) +#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608) +#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600) +#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408) +#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420) +#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400) +#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410) +#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418) +#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468) +#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0) +#define RCV_BUF_OVFL_CNT 10 +#define RCV_CONTEXT_EGR_STALL 22 +#define RCV_DATA_PKT_CNT 0 +#define RCV_DWORD_CNT 1 +#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20 +#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23 +#define RCV_TID_FULL_ERR_CNT 18 +#define RCV_TID_VALID_ERR_CNT 19 +#define RXE_NUM_32_BIT_COUNTERS 24 +#define RXE_NUM_64_BIT_COUNTERS 2 +#define RXE_NUM_RSM_INSTANCES 4 +#define RXE_NUM_TID_FLOWS 32 +#define RXE_PER_CONTEXT_OFFSET 0x0300000 +#define SEND_DATA_PKT_CNT 0 +#define SEND_DATA_PKT_VL0_CNT 12 +#define SEND_DATA_VL0_CNT 3 +#define SEND_DROPPED_PKT_CNT 5 +#define SEND_DWORD_CNT 1 +#define SEND_FLOW_STALL_CNT 4 +#define SEND_HEADERS_ERR_CNT 6 +#define SEND_LEN_ERR_CNT 1 +#define SEND_MAX_MIN_LEN_ERR_CNT 2 +#define SEND_UNDERRUN_CNT 3 +#define SEND_UNSUP_VL_ERR_CNT 0 +#define SEND_WAIT_CNT 2 +#define SEND_WAIT_VL0_CNT 21 +#define TXE_PIO_SEND_OFFSET 0x0800000 +#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048) +#define ASIC_CFG_MUTEX (ASIC + 0x000000000040) +#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008) +#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull +#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull +#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000) +#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16 +#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8 +#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32 +#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0 +#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020) +#define ASIC_CFG_SCRATCH_1 (ASIC_CFG_SCRATCH + 0x08) +#define ASIC_CFG_SCRATCH_2 (ASIC_CFG_SCRATCH + 0x10) +#define ASIC_CFG_SCRATCH_3 (ASIC_CFG_SCRATCH + 0x18) +#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050) +#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308) +#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull +#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300) +#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull +#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8 +#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull +#define ASIC_EEP_DATA (ASIC + 0x000000000310) +#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230) +#define ASIC_GPIO_FORCE (ASIC + 0x000000000238) +#define ASIC_GPIO_IN (ASIC + 0x000000000200) +#define ASIC_GPIO_INVERT (ASIC + 0x000000000210) +#define ASIC_GPIO_MASK (ASIC + 0x000000000220) +#define ASIC_GPIO_OE (ASIC + 0x000000000208) +#define ASIC_GPIO_OUT (ASIC + 0x000000000218) +#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100) +#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0 +#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull +#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2 +#define 
ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull +#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12 +#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108) +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2 +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0 +#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110) +#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118) +#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180) +#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16 +#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0 +#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128) +#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270) +#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278) +#define ASIC_QSFP1_IN (ASIC + 0x000000000240) +#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250) +#define ASIC_QSFP1_MASK (ASIC + 0x000000000260) +#define ASIC_QSFP1_OE (ASIC + 0x000000000248) +#define ASIC_QSFP1_OUT (ASIC + 0x000000000258) +#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268) +#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0) +#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8) +#define ASIC_QSFP2_IN (ASIC + 0x000000000280) +#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290) +#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0) +#define ASIC_QSFP2_OE (ASIC + 0x000000000288) +#define ASIC_QSFP2_OUT (ASIC + 0x000000000298) +#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8) +#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018) +#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull +#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0 +#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull +#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16 +#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010) +#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull +#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull +#define ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT 2 +#define ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK 0x7ull +#define ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT 32 +#define ASIC_STS_SBUS_RESULT_DATA_OUT_MASK 0xFFFFFFFFull +#define ASIC_STS_THERM (ASIC + 0x000000000058) +#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18 +#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2 +#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_HI_TEMP_SHIFT 50 +#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_LO_TEMP_SHIFT 34 +#define ASIC_STS_THERM_LOW_SHIFT 13 +#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060) +#define CCE_CTRL (CCE + 0x000000000010) +#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull +#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull +#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull +#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull +#define CCE_DC_CTRL (CCE + 0x0000000000B8) +#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull +#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull +#define CCE_ERR_CLEAR (CCE + 0x000000000050) +#define CCE_ERR_MASK (CCE + 0x000000000048) +#define CCE_ERR_STATUS (CCE + 0x000000000040) +#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \ + 0x200ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \ + 0x800ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \ + 0x400ull 
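
The *_MASK/*_SHIFT/*_SMASK triplets in this header follow one convention: MASK covers the field width in the low-order bits, SHIFT gives the field's position in the 64-bit CSR, and SMASK is the same mask pre-shifted into place (SMASK == MASK << SHIFT; for example, DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull is 0x7ull << 48). A minimal sketch of decoding such a field follows, using defines from this header; the read_dcc_port_config() helper is hypothetical and stands in for whatever CSR read routine the driver provides, and chip_registers.h is assumed to be on the include path.

#include <linux/types.h>
#include "chip_registers.h"

/* Hypothetical CSR accessor; the real driver reads CSRs through its own
 * helpers, so this is only a stand-in for illustration. */
extern u64 read_dcc_port_config(void);

/* Extract the link-state field from DCC_CFG_PORT_CONFIG. Masking with the
 * pre-shifted SMASK first and then shifting down gives the same result,
 * since SMASK == MASK << SHIFT by construction. */
static inline u64 dcc_link_state(void)
{
	u64 reg = read_dcc_port_config();

	return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT) &
	       DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
}

Single-bit error flags that only define an SMASK (such as the CCE_ERR_STATUS_* values) need no shift and are simply tested with a bitwise AND, e.g. (status & CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK).
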
+#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull +#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull +#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull +#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull +#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull +#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull +#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull +#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull +#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull +#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull +#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull +#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull +#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull +#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull +#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull +#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull +#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull +#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull +#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull +#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull +#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull +#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull +#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull +#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull +#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull +#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull +#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull +#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull +#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull +#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull +#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull +#define CCE_INT_CLEAR (CCE + 0x000000110A00) +#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00) +#define CCE_INT_FORCE (CCE + 0x000000110B00) +#define CCE_INT_MAP (CCE + 0x000000110500) +#define CCE_INT_MASK (CCE + 0x000000110900) +#define CCE_INT_STATUS (CCE + 0x000000110800) +#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200) +#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000) +#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008) +#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull +#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400) +#define CCE_PCIE_CTRL (CCE + 0x0000000000C0) +#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK 0x3ull +#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT 0 +#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK 0xFull +#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT 2 +#define CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT 8 +#define CCE_PCIE_CTRL_XMT_MARGIN_SHIFT 9 +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK 0x1ull +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT 12 +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK 0x7ull +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT 
13 +#define CCE_REVISION (CCE + 0x000000000000) +#define CCE_REVISION2 (CCE + 0x000000000008) +#define CCE_REVISION2_HFI_ID_MASK 0x1ull +#define CCE_REVISION2_HFI_ID_SHIFT 0 +#define CCE_REVISION2_IMPL_CODE_SHIFT 8 +#define CCE_REVISION2_IMPL_REVISION_SHIFT 16 +#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull +#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32 +#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull +#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8 +#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull +#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0 +#define CCE_REVISION_SW_MASK 0xFFull +#define CCE_REVISION_SW_SHIFT 24 +#define CCE_SCRATCH (CCE + 0x000000000020) +#define CCE_STATUS (CCE + 0x000000000018) +#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull +#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull +#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull +#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull +#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull +#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull +#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull +#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull +#define MISC_CFG_FW_CTRL (MISC + 0x000000001000) +#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull +#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2 +#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull +#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08) +#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400) +#define MISC_CFG_RSA_MU (MISC + 0x000000000A10) +#define MISC_CFG_RSA_R2 (MISC + 0x000000000000) +#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200) +#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00) +#define MISC_ERR_CLEAR (MISC + 0x000000002010) +#define MISC_ERR_MASK (MISC + 0x000000002008) +#define MISC_ERR_STATUS (MISC + 0x000000002000) +#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull +#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull +#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull +#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull +#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull +#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull +#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull +#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull +#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull +#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull +#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull +#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0) +#define PCI_CFG_REG1 (PCIE + 0x000000000004) +#define PCI_CFG_REG11 (PCIE + 0x00000000002C) +#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C) +#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150) +#define PCIE_CFG_TPH2 (PCIE + 0x000000000180) +#define RCV_ARRAY (RXE + 0x000000200000) +#define RCV_ARRAY_CNT (RXE + 0x000000000018) +#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull +#define RCV_ARRAY_RT_ADDR_SHIFT 0 +#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36 +#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull +#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050) +#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull +#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0 +#define RCV_BTH_QP (RXE + 0x000000000028) +#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull +#define RCV_BTH_QP_KDETH_QP_SHIFT 16 +#define RCV_BYPASS (RXE + 0x000000000038) +#define RCV_CONTEXTS (RXE + 0x000000000010) +#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400) +#define 
RCV_COUNTER_ARRAY64 (RXE + 0x000000000500) +#define RCV_CTRL (RXE + 0x000000000000) +#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull +#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull +#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull +#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull +#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull +#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull +#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull +#define RCV_CTXT_CTRL (RXE + 0x000000100000) +#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull +#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8 +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull +#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull +#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull +#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull +#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull +#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull +#define RCV_CTXT_STATUS (RXE + 0x000000100008) +#define RCV_EGR_CTRL (RXE + 0x000000100010) +#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull +#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0 +#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull +#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32 +#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018) +#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull +#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0 +#define RCV_ERR_CLEAR (RXE + 0x000000000070) +#define RCV_ERR_INFO (RXE + 0x000000000050) +#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full +#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull +#define RCV_ERR_MASK (RXE + 0x000000000068) +#define RCV_ERR_STATUS (RXE + 0x000000000060) +#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull +#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull +#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \ + 0x4000000000000000ull +#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull +#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull +#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull +#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull +#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \ + 0x40000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \ + 0x20000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \ + 0x800000000000000ull +#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \ + 0x400000000000000ull +#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull +#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull +#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull +#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull +#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull +#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \ + 0x10000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \ + 0x20000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull +#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull +#define 
RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull +#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \ + 0x200000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \ + 0x8000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull +#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull +#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \ + 0x1000000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull +#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull +#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull +#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull +#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull +#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull +#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull +#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull +#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull +#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull +#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull +#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull +#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull +#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull +#define RCV_HDR_ADDR (RXE + 0x000000100028) +#define RCV_HDR_CNT (RXE + 0x000000100030) +#define RCV_HDR_CNT_CNT_MASK 0x1FFull +#define RCV_HDR_CNT_CNT_SHIFT 0 +#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038) +#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull +#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0 +#define RCV_HDR_HEAD (RXE + 0x000000300008) +#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull +#define RCV_HDR_HEAD_COUNTER_SHIFT 32 +#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull +#define RCV_HDR_HEAD_HEAD_SHIFT 0 +#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull +#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058) +#define RCV_HDR_SIZE (RXE + 0x000000100040) +#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full +#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0 +#define RCV_HDR_TAIL (RXE + 0x000000300000) +#define RCV_HDR_TAIL_ADDR (RXE 
+ 0x000000100048) +#define RCV_KEY_CTRL (RXE + 0x000000100020) +#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull +#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull +#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0 +#define RCV_MULTICAST (RXE + 0x000000000030) +#define RCV_PARTITION_KEY (RXE + 0x000000000200) +#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull +#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16 +#define RCV_QP_MAP_TABLE (RXE + 0x000000000100) +#define RCV_RSM_CFG (RXE + 0x000000000600) +#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull +#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0 +#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60 +#define RCV_RSM_CFG_OFFSET_SHIFT 32 +#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900) +#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull +#define RCV_RSM_MATCH (RXE + 0x000000000800) +#define RCV_RSM_MATCH_MASK1_SHIFT 0 +#define RCV_RSM_MATCH_MASK2_SHIFT 16 +#define RCV_RSM_MATCH_VALUE1_SHIFT 8 +#define RCV_RSM_MATCH_VALUE2_SHIFT 24 +#define RCV_RSM_SELECT (RXE + 0x000000000700) +#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0 +#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16 +#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32 +#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44 +#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48 +#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60 +#define RCV_STATUS (RXE + 0x000000000008) +#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull +#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull +#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull +#define RCV_TID_CTRL (RXE + 0x000000100018) +#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull +#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0 +#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull +#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32 +#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800) +#define RCV_VL15 (RXE + 0x000000000048) +#define SEND_BTH_QP (TXE + 0x0000000000A0) +#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull +#define SEND_BTH_QP_KDETH_QP_SHIFT 16 +#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510) +#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \ + 0x1000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \ + 0x8000000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \ + 0x2000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \ + 0x4000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \ + 0x8000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \ + 0x10000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \ + 0x20000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \ + 0x40000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \ + 0x80000000000000ull +#define SEND_CM_CREDIT_VL (TXE + 0x000000000600) +#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678) +#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0 +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0 +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16 +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull +#define SEND_CM_CTRL (TXE + 0x000000000500) +#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull +#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull +#define SEND_CM_GLOBAL_CREDIT (TXE + 
0x000000000508) +#define SEND_CM_GLOBAL_CREDIT_AU_MASK 0x7ull +#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16 +#define SEND_CM_GLOBAL_CREDIT_AU_SMASK 0x70000ull +#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0 +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32 +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull +#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520) +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48 +#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528) +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48 +#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530) +#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538) +#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518) +#define SEND_CONTEXTS (TXE + 0x000000000010) +#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200) +#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300) +#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400) +#define SEND_CTRL (TXE + 0x000000000000) +#define SEND_CTRL_CM_RESET_SMASK 0x4ull +#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull +#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull +#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080) +#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \ + 0x200000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \ + 0x100000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \ + 0x80000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \ + 0x40000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \ + 0x8000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \ + 0x4000ull +#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090) +#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull +#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull +#define 
SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8) +#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8 +#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098) +#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0) +#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull +#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16 +#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088) +#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010) +#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull +#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0 +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull +#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028) +#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull +#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020) +#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull +#define SEND_CTXT_CTRL (TXE + 0x000000100000) +#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull +#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32 +#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull +#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48 +#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull +#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050) +#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048) +#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040) +#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull +#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull +#define SEND_CTXT_STATUS (TXE + 0x000000100008) +#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull +#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010) +#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080) +#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull +#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull +#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull +#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull +#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull +#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull +#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull +#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \ + 0x100000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \ + 0x80000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \ + 0x8000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull +#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090) +#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8) +#define 
SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098) +#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0) +#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull +#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16 +#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull +#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0 +#define SEND_DMA_CHECK_VL (TXE + 0x000000200088) +#define SEND_DMA_CTRL (TXE + 0x000000200000) +#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull +#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull +#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull +#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull +#define SEND_DMA_DESC_CNT (TXE + 0x000000200050) +#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull +#define SEND_DMA_DESC_CNT_CNT_SHIFT 0 +#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070) +#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull +#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18 +#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068) +#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060) +#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \ + 0x40000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \ + 0x20000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \ + 0x100ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \ + 0x10000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull +#define SEND_DMA_ENGINES (TXE + 0x000000000018) +#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070) +#define SEND_DMA_ERR_MASK (TXE + 0x000000000068) +#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060) +#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull +#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull +#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull +#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull +#define SEND_DMA_HEAD (TXE + 0x000000200028) +#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030) +#define SEND_DMA_LEN_GEN (TXE + 0x000000200018) +#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16 +#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6 +#define SEND_DMA_MEMORY (TXE + 0x0000002000B0) +#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16 +#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0 +#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028) +#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038) +#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048) +#define SEND_DMA_STATUS (TXE + 0x000000200008) +#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull +#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull +#define SEND_DMA_TAIL (TXE + 
0x000000200020) +#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800) +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0 +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \ + 0x3FFFull +#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090) +#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00) +#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull +#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull +#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull +#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull +#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull +#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull +#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull +#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull +#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull +#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull +#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull +#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull +#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull +#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull +#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull +#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull +#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull +#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull +#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull +#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088) +#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08) +#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080) +#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \ + 0x200000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \ + 0x20000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \ + 0x800000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \ + 0x2000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \ + 0x200000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \ + 0x8ull +#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \ + 0x400000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull +#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \ + 0x1000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \ + 0x100000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \ + 0x2000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \ + 0x200000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \ + 0x4000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \ + 0x400000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \ + 0x8000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \ + 0x800000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \ + 0x10000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \ + 0x1000000000ull +#define 
SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \ + 0x20000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \ + 0x2000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \ + 0x40000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \ + 0x4000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \ + 0x80000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \ + 0x8000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \ + 0x100000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \ + 0x10000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull +#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull +#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull +#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \ + 0x1000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \ + 0x8000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \ + 0x100000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \ + 0x800000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \ + 0x4000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \ + 0x80000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull +#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \ + 0x800ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \ + 0x10000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \ + 0x4000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \ + 0x8000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \ + 0x10000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \ + 0x20000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \ + 0x40000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \ + 0x80000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \ + 0x20000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \ + 0x40000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \ + 0x80000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \ + 0x100000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \ + 0x200000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \ + 0x400000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \ + 0x800000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \ + 0x1000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \ + 0x2000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \ + 0x100ull +#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00) +#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0 +#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \ + 0x3FFFull +#define SEND_ERR_CLEAR 
(TXE + 0x0000000000F0) +#define SEND_ERR_MASK (TXE + 0x0000000000E8) +#define SEND_ERR_STATUS (TXE + 0x0000000000E0) +#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull +#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030) +#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull +#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0 +#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180) +#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0) +#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull +#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12 +#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8) +#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull +#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48 +#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull +#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12 +#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100) +#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull +#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16 +#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull +#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0 +#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050) +#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull +#define SEND_PIO_ERR_MASK (TXE + 0x000000000048) +#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040) +#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \ + 0x1000000ull +#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull +#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull +#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \ + 0x100000000ull +#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull +#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull +#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull +#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \ + 0x200000000ull +#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull +#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \ + 0x400000000ull +#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull +#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \ + 0x800000000ull +#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull +#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull +#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull +#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull +#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull +#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \ + 0x100ull +#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull +#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull +#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull +#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull 
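/*
 * A minimal sketch (illustrative only, not part of this header's API) of
 * how the *_SHIFT / *_MASK / *_SMASK triplets above are meant to be
 * combined: FIELD_SHIFT is the bit position, FIELD_MASK is the unshifted
 * field mask, and FIELD_SMASK is the same mask already shifted into
 * place.  Using the SEND_CTXT_CREDIT_CTRL threshold field as the example:
 */
static inline u64 example_set_credit_threshold(u64 csr, u64 threshold)
{
	/* clear the field, then deposit the new value */
	csr &= ~SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK;
	csr |= (threshold & SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK)
			<< SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT;
	return csr;
}

static inline u64 example_get_credit_threshold(u64 csr)
{
	return (csr & SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK)
			>> SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT;
}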
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull +#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038) +#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull +#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull +#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8 +#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull +#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull +#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull +#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020) +#define SEND_SC2VLT0 (TXE + 0x0000000000B0) +#define SEND_SC2VLT0_SC0_SHIFT 0 +#define SEND_SC2VLT0_SC1_SHIFT 8 +#define SEND_SC2VLT0_SC2_SHIFT 16 +#define SEND_SC2VLT0_SC3_SHIFT 24 +#define SEND_SC2VLT0_SC4_SHIFT 32 +#define SEND_SC2VLT0_SC5_SHIFT 40 +#define SEND_SC2VLT0_SC6_SHIFT 48 +#define SEND_SC2VLT0_SC7_SHIFT 56 +#define SEND_SC2VLT1 (TXE + 0x0000000000B8) +#define SEND_SC2VLT1_SC10_SHIFT 16 +#define SEND_SC2VLT1_SC11_SHIFT 24 +#define SEND_SC2VLT1_SC12_SHIFT 32 +#define SEND_SC2VLT1_SC13_SHIFT 40 +#define SEND_SC2VLT1_SC14_SHIFT 48 +#define SEND_SC2VLT1_SC15_SHIFT 56 +#define SEND_SC2VLT1_SC8_SHIFT 0 +#define SEND_SC2VLT1_SC9_SHIFT 8 +#define SEND_SC2VLT2 (TXE + 0x0000000000C0) +#define SEND_SC2VLT2_SC16_SHIFT 0 +#define SEND_SC2VLT2_SC17_SHIFT 8 +#define SEND_SC2VLT2_SC18_SHIFT 16 +#define SEND_SC2VLT2_SC19_SHIFT 24 +#define SEND_SC2VLT2_SC20_SHIFT 32 +#define SEND_SC2VLT2_SC21_SHIFT 40 +#define SEND_SC2VLT2_SC22_SHIFT 48 +#define SEND_SC2VLT2_SC23_SHIFT 56 +#define SEND_SC2VLT3 (TXE + 0x0000000000C8) +#define SEND_SC2VLT3_SC24_SHIFT 0 +#define SEND_SC2VLT3_SC25_SHIFT 8 +#define SEND_SC2VLT3_SC26_SHIFT 16 +#define SEND_SC2VLT3_SC27_SHIFT 24 +#define SEND_SC2VLT3_SC28_SHIFT 32 +#define SEND_SC2VLT3_SC29_SHIFT 40 +#define SEND_SC2VLT3_SC30_SHIFT 48 +#define SEND_SC2VLT3_SC31_SHIFT 56 +#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8) +#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0 +#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull +#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708) +#define PCIE_CFG_REG_PL3 (PCIE + 0x00000000070C) +#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT 27 +#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK 0x38000000 +#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898) +#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12 +#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6 +#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0 +#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C) +#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4) +#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull +#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24 +#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890) +#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull +#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894) +#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6 +#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0 +#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8) +#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8 +#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull +#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull +#define CCE_INT_BLOCKED (CCE + 0x000000110C00) +#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040) +#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058) +#define 
CCE_MSIX_PBA_OFFSET 0X0110000 + +#endif /* DEF_CHIP_REG */ diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h new file mode 100644 index 0000000..7108d4d --- /dev/null +++ b/drivers/infiniband/hw/hfi1/common.h @@ -0,0 +1,396 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _COMMON_H +#define _COMMON_H + +#include + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in + * fast path code + * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in fast path code + * _HFI1_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + */ + +/* + * If a packet's QP[23:16] bits match this value, then it is + * a PSM packet and the hardware will expect a KDETH header + * following the BTH. 
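/*
 * A minimal sketch (not taken from the driver) of the rule described
 * above: a packet is handled as a PSM/KDETH packet when bits 23:16 of
 * its destination QP number equal DEFAULT_KDETH_QP, defined just below.
 */
static inline int example_qpn_is_kdeth(u32 qpn)
{
	return ((qpn >> 16) & 0xff) == 0x80;	/* 0x80 == DEFAULT_KDETH_QP */
}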
+ */ +#define DEFAULT_KDETH_QP 0x80 + +/* driver/hw feature set bitmask */ +#define HFI1_CAP_USER_SHIFT 24 +#define HFI1_CAP_MASK ((1UL << HFI1_CAP_USER_SHIFT) - 1) +/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */ +#define HFI1_CAP_LOCKED_SHIFT 63 +#define HFI1_CAP_LOCKED_MASK 0x1ULL +#define HFI1_CAP_LOCKED_SMASK (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT) +/* extra bits used between kernel and user processes */ +#define HFI1_CAP_MISC_SHIFT (HFI1_CAP_USER_SHIFT * 2) +#define HFI1_CAP_MISC_MASK ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \ + HFI1_CAP_MISC_SHIFT)) - 1) + +#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; }) +#define HFI1_CAP_KCLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~HFI1_CAP_##cap; \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_USET(cap) \ + ({ \ + hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_UCLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_SET(cap) \ + ({ \ + hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap << \ + HFI1_CAP_USER_SHIFT)); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_CLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~(HFI1_CAP_##cap | \ + (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_LOCK() \ + ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; }) +#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK)) +/* + * The set of capability bits that can be changed after initial load + * This set is the same for kernel and user contexts. However, for + * user contexts, the set can be further filtered by using the + * HFI1_CAP_RESERVED_MASK bits. + */ +#define HFI1_CAP_WRITABLE_MASK (HFI1_CAP_SDMA_AHG | \ + HFI1_CAP_HDRSUPP | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_NODROP_RHQ_FULL | \ + HFI1_CAP_NODROP_EGR_FULL | \ + HFI1_CAP_ALLOW_PERM_JKEY | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_TID_UNMAP) +/* + * A set of capability bits that are "global" and are not allowed to be + * set in the user bitmask. + */ +#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \ + HFI1_CAP_USE_SDMA_HEAD | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_NO_INTEGRITY | \ + HFI1_CAP_PKEY_CHECK) << \ + HFI1_CAP_USER_SHIFT) +/* + * Set of capabilities that need to be enabled for kernel context in + * order to be allowed for user contexts, as well. + */ +#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL) +/* Default enabled capabilities (both kernel and user) */ +#define HFI1_CAP_MASK_DEFAULT (HFI1_CAP_HDRSUPP | \ + HFI1_CAP_NODROP_RHQ_FULL | \ + HFI1_CAP_NODROP_EGR_FULL | \ + HFI1_CAP_SDMA | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_EXTENDED_PSN | \ + ((HFI1_CAP_HDRSUPP | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_EARLY_CREDIT_RETURN) << \ + HFI1_CAP_USER_SHIFT)) +/* + * A bitmask of kernel/global capabilities that should be communicated + * to user level processes. + */ +#define HFI1_CAP_K2U (HFI1_CAP_SDMA | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_NO_INTEGRITY) + +#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \ + HFI1_USER_SWMINOR) + +#ifndef HFI1_KERN_TYPE +#define HFI1_KERN_TYPE 0 +#endif + +/* + * Similarly, this is the kernel version going back to the user. 
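/*
 * A worked illustration (not driver code) of the hfi1_cap_mask layout
 * implied by the capability macros above; HFI1_CAP_SDMA and
 * hfi1_cap_mask are assumed to come from the uapi header and hfi.h:
 *
 *   bits 23:0   kernel capability bits
 *   bits 47:24  the same capability set, as seen by user contexts
 *   bits 62:48  "misc" bits shared between kernel and user processes
 *   bit  63     locked flag; once set, only HFI1_CAP_WRITABLE_MASK
 *               bits may still change
 */
static inline void example_enable_sdma_caps(void)
{
	/* what HFI1_CAP_SET(SDMA) expands to, written out long-hand */
	hfi1_cap_mask |= HFI1_CAP_SDMA;				/* kernel bit */
	hfi1_cap_mask |= HFI1_CAP_SDMA << HFI1_CAP_USER_SHIFT;	/* user bit */
}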
This kernel version is + * slightly different, in that we want to tell whether the driver was built as + * part of an Intel release, or from the driver sources on openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-Intel and 1 for Intel-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of hfi1_base_info, so the user code can in turn + * check for compatibility with the kernel. + */ +#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION) + +/* + * Define the driver version number. This is something that refers only + * to the driver itself, not the software interfaces it supports. + */ +#ifndef HFI1_DRIVER_VERSION_BASE +#define HFI1_DRIVER_VERSION_BASE "0.9-294" +#endif + +/* create the final driver version string */ +#ifdef HFI1_IDSTR +#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR +#else +#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE +#endif + +/* + * Diagnostics can send a packet by writing the following + * struct to the diag packet special file. + * + * This allows a custom PBC qword, so that special modes and deliberate + * changes to CRCs can be used. + */ +#define _DIAG_PKT_VERS 1 +struct diag_pkt { + __u16 version; /* structure version */ + __u16 unit; /* which device */ + __u16 sw_index; /* send sw index to use */ + __u16 len; /* data length, in bytes */ + __u16 port; /* port number */ + __u16 unused; + __u32 flags; /* call flags */ + __u64 data; /* user data pointer */ + __u64 pbc; /* PBC for the packet */ +}; + +/* diag_pkt flags */ +#define F_DIAGPKT_WAIT 0x1 /* wait until packet is sent */ + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software. 
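/*
 * Before the packet-header defines, a sketch (illustrative; the field
 * values are placeholders) of how a diagnostic tool might fill in
 * struct diag_pkt above before write()ing it to the diag packet
 * special file:
 */
static inline struct diag_pkt example_build_diag_pkt(const void *payload,
						     __u16 nbytes)
{
	struct diag_pkt dp = {
		.version = _DIAG_PKT_VERS,
		.unit = 0,			/* first device */
		.sw_index = 0,			/* send software index to use */
		.len = nbytes,			/* payload length in bytes */
		.port = 1,			/* port number */
		.flags = F_DIAGPKT_WAIT,	/* block until the packet is sent */
		.data = (__u64)(unsigned long)payload,
		.pbc = 0,			/* or a custom PBC qword */
	};

	return dp;
}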
+ */ + +/* + * Receive Header Flags + */ +#define RHF_PKT_LEN_SHIFT 0 +#define RHF_PKT_LEN_MASK 0xfffull +#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT) + +#define RHF_RCV_TYPE_SHIFT 12 +#define RHF_RCV_TYPE_MASK 0x7ull +#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT) + +#define RHF_USE_EGR_BFR_SHIFT 15 +#define RHF_USE_EGR_BFR_MASK 0x1ull +#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT) + +#define RHF_EGR_INDEX_SHIFT 16 +#define RHF_EGR_INDEX_MASK 0x7ffull +#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT) + +#define RHF_DC_INFO_SHIFT 27 +#define RHF_DC_INFO_MASK 0x1ull +#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT) + +#define RHF_RCV_SEQ_SHIFT 28 +#define RHF_RCV_SEQ_MASK 0xfull +#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT) + +#define RHF_EGR_OFFSET_SHIFT 32 +#define RHF_EGR_OFFSET_MASK 0xfffull +#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT) +#define RHF_HDRQ_OFFSET_SHIFT 44 +#define RHF_HDRQ_OFFSET_MASK 0x1ffull +#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT) +#define RHF_K_HDR_LEN_ERR (0x1ull << 53) +#define RHF_DC_UNC_ERR (0x1ull << 54) +#define RHF_DC_ERR (0x1ull << 55) +#define RHF_RCV_TYPE_ERR_SHIFT 56 +#define RHF_RCV_TYPE_ERR_MASK 0x7ul +#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT) +#define RHF_TID_ERR (0x1ull << 59) +#define RHF_LEN_ERR (0x1ull << 60) +#define RHF_ECC_ERR (0x1ull << 61) +#define RHF_VCRC_ERR (0x1ull << 62) +#define RHF_ICRC_ERR (0x1ull << 63) + +#define RHF_ERROR_SMASK 0xffe0000000000000ull /* bits 63:53 */ + +/* RHF receive types */ +#define RHF_RCV_TYPE_EXPECTED 0 +#define RHF_RCV_TYPE_EAGER 1 +#define RHF_RCV_TYPE_IB 2 /* normal IB, IB Raw, or IPv6 */ +#define RHF_RCV_TYPE_ERROR 3 +#define RHF_RCV_TYPE_BYPASS 4 +#define RHF_RCV_TYPE_INVALID5 5 +#define RHF_RCV_TYPE_INVALID6 6 +#define RHF_RCV_TYPE_INVALID7 7 + +/* RHF receive type error - expected packet errors */ +#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR 0x2 +#define RHF_RTE_EXPECTED_FLOW_GEN_ERR 0x4 + +/* RHF receive type error - eager packet errors */ +#define RHF_RTE_EAGER_NO_ERR 0x0 + +/* RHF receive type error - IB packet errors */ +#define RHF_RTE_IB_NO_ERR 0x0 + +/* RHF receive type error - error packet errors */ +#define RHF_RTE_ERROR_NO_ERR 0x0 +#define RHF_RTE_ERROR_OP_CODE_ERR 0x1 +#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2 +#define RHF_RTE_ERROR_KHDR_HCRC_ERR 0x3 +#define RHF_RTE_ERROR_KHDR_KVER_ERR 0x4 +#define RHF_RTE_ERROR_CONTEXT_ERR 0x5 +#define RHF_RTE_ERROR_KHDR_TID_ERR 0x6 + +/* RHF receive type error - bypass packet errors */ +#define RHF_RTE_BYPASS_NO_ERR 0x0 + +/* IB - LRH header constants */ +#define HFI1_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define HFI1_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. 
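/*
 * Usage sketch (illustrative only) for the RHF accessors defined at the
 * end of this header: decode one receive-header-queue entry and
 * classify it.
 */
static inline int example_classify_rhf(const __le32 *rhf_addr)
{
	u64 rhf = rhf_to_cpu(rhf_addr);

	if (rhf_err_flags(rhf))
		return -1;		/* one of error bits 63:53 is set */

	switch (rhf_rcv_type(rhf)) {
	case RHF_RCV_TYPE_EAGER:
		/* payload is in eager buffer rhf_egr_index(rhf) */
		return 0;
	case RHF_RCV_TYPE_EXPECTED:
		/* payload was placed directly via an expected TID */
		return 0;
	default:
		return 1;		/* bypass, error, or unused types */
	}
}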
*/ +#define SC15_PACKET 0xF +#define SIZE_OF_CRC 1 +#define SIZE_OF_LT 1 +#define MAX_16B_PADDING 12 /* CRC = 4, LT = 1, Pad = 0 to 7 bytes */ + +#define LIM_MGMT_P_KEY 0x7FFF +#define FULL_MGMT_P_KEY 0xFFFF + +#define DEFAULT_P_KEY LIM_MGMT_P_KEY + +#define HFI1_PSM_IOC_BASE_SEQ 0x0 + +static inline __u64 rhf_to_cpu(const __le32 *rbuf) +{ + return __le64_to_cpu(*((__le64 *)rbuf)); +} + +static inline u64 rhf_err_flags(u64 rhf) +{ + return rhf & RHF_ERROR_SMASK; +} + +static inline u32 rhf_rcv_type(u64 rhf) +{ + return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK; +} + +static inline u32 rhf_rcv_type_err(u64 rhf) +{ + return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK; +} + +/* return size is in bytes, not DWORDs */ +static inline u32 rhf_pkt_len(u64 rhf) +{ + return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2; +} + +static inline u32 rhf_egr_index(u64 rhf) +{ + return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK; +} + +static inline u32 rhf_rcv_seq(u64 rhf) +{ + return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK; +} + +/* returned offset is in DWORDS */ +static inline u32 rhf_hdrq_offset(u64 rhf) +{ + return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK; +} + +static inline u64 rhf_use_egr_bfr(u64 rhf) +{ + return rhf & RHF_USE_EGR_BFR_SMASK; +} + +static inline u64 rhf_dc_info(u64 rhf) +{ + return rhf & RHF_DC_INFO_SMASK; +} + +static inline u32 rhf_egr_buf_offset(u64 rhf) +{ + return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK; +} +#endif /* _COMMON_H */ diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c new file mode 100644 index 0000000..852173b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -0,0 +1,1585 @@ +/* + * Copyright(c) 2015-2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "trace.h" +#include "debugfs.h" +#include "device.h" +#include "qp.h" +#include "sdma.h" + +static struct dentry *hfi1_dbg_root; + +/* wrappers to enforce srcu in seq file */ +static ssize_t hfi1_seq_read( + struct file *file, + char __user *buf, + size_t size, + loff_t *ppos) +{ + struct dentry *d = file->f_path.dentry; + ssize_t r; + + r = debugfs_file_get(d); + if (unlikely(r)) + return r; + r = seq_read(file, buf, size, ppos); + debugfs_file_put(d); + return r; +} + +static loff_t hfi1_seq_lseek( + struct file *file, + loff_t offset, + int whence) +{ + struct dentry *d = file->f_path.dentry; + loff_t r; + + r = debugfs_file_get(d); + if (unlikely(r)) + return r; + r = seq_lseek(file, offset, whence); + debugfs_file_put(d); + return r; +} + +#define private2dd(file) (file_inode(file)->i_private) +#define private2ppd(file) (file_inode(file)->i_private) + +#define DEBUGFS_SEQ_FILE_OPS(name) \ +static const struct seq_operations _##name##_seq_ops = { \ + .start = _##name##_seq_start, \ + .next = _##name##_seq_next, \ + .stop = _##name##_seq_stop, \ + .show = _##name##_seq_show \ +} + +#define DEBUGFS_SEQ_FILE_OPEN(name) \ +static int _##name##_open(struct inode *inode, struct file *s) \ +{ \ + struct seq_file *seq; \ + int ret; \ + ret = seq_open(s, &_##name##_seq_ops); \ + if (ret) \ + return ret; \ + seq = s->private_data; \ + seq->private = inode->i_private; \ + return 0; \ +} + +#define DEBUGFS_FILE_OPS(name) \ +static const struct file_operations _##name##_file_ops = { \ + .owner = THIS_MODULE, \ + .open = _##name##_open, \ + .read = hfi1_seq_read, \ + .llseek = hfi1_seq_lseek, \ + .release = seq_release \ +} + +#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \ +do { \ + struct dentry *ent; \ + ent = debugfs_create_file(name, mode, parent, \ + data, ops); \ + if (!ent) \ + pr_warn("create of %s failed\n", name); \ +} while (0) + +#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \ + DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO) + +static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void _opcode_stats_seq_stop(struct seq_file *s, void *v) +{ +} + +static int opcode_stats_show(struct seq_file *s, u8 i, u64 packets, u64 bytes) +{ + if (!packets && !bytes) + return SEQ_SKIP; + seq_printf(s, "%02x %llu/%llu\n", i, + (unsigned long long)packets, + (unsigned long long)bytes); + + return 0; +} + +static int _opcode_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + 
struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + + for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) { + rcd = hfi1_rcd_get_by_index(dd, j); + if (rcd) { + n_packets += rcd->opstats->stats[i].n_packets; + n_bytes += rcd->opstats->stats[i].n_bytes; + } + hfi1_rcd_put(rcd); + } + return opcode_stats_show(s, i, n_packets, n_bytes); +} + +DEBUGFS_SEQ_FILE_OPS(opcode_stats); +DEBUGFS_SEQ_FILE_OPEN(opcode_stats) +DEBUGFS_FILE_OPS(opcode_stats); + +static void *_tx_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + return _opcode_stats_seq_start(s, pos); +} + +static void *_tx_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + return _opcode_stats_seq_next(s, v, pos); +} + +static void _tx_opcode_stats_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _tx_opcode_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos; + int j; + u64 n_packets = 0, n_bytes = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + for_each_possible_cpu(j) { + struct hfi1_opcode_stats_perctx *s = + per_cpu_ptr(dd->tx_opstats, j); + n_packets += s->stats[i].n_packets; + n_bytes += s->stats[i].n_bytes; + } + return opcode_stats_show(s, i, n_packets, n_bytes); +} + +DEBUGFS_SEQ_FILE_OPS(tx_opcode_stats); +DEBUGFS_SEQ_FILE_OPEN(tx_opcode_stats) +DEBUGFS_FILE_OPS(tx_opcode_stats); + +static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + if (!*pos) + return SEQ_START_TOKEN; + if (*pos >= dd->first_dyn_alloc_ctxt) + return NULL; + return pos; +} + +static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) + return pos; + + ++*pos; + if (*pos >= dd->first_dyn_alloc_ctxt) + return NULL; + return pos; +} + +static void _ctx_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _ctx_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos; + loff_t i, j; + u64 n_packets = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + + if (v == SEQ_START_TOKEN) { + seq_puts(s, "Ctx:npkts\n"); + return 0; + } + + spos = v; + i = *spos; + + rcd = hfi1_rcd_get_by_index_safe(dd, i); + if (!rcd) + return SEQ_SKIP; + + for (j = 0; j < ARRAY_SIZE(rcd->opstats->stats); j++) + n_packets += rcd->opstats->stats[j].n_packets; + + hfi1_rcd_put(rcd); + + if (!n_packets) + return SEQ_SKIP; + + seq_printf(s, " %llu:%llu\n", i, n_packets); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(ctx_stats); +DEBUGFS_SEQ_FILE_OPEN(ctx_stats) +DEBUGFS_FILE_OPS(ctx_stats); + +static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) + __acquires(RCU) +{ + struct rvt_qp_iter *iter; + loff_t n = *pos; + + iter = rvt_qp_iter_init(s->private, 0, NULL); + + /* stop calls rcu_read_unlock */ + rcu_read_lock(); + + if (!iter) + return NULL; + + do { + if (rvt_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + } while (n--); + + return iter; +} + +static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, + loff_t *pos) + __must_hold(RCU) +{ + struct rvt_qp_iter *iter = iter_ptr; + + (*pos)++; + + if (rvt_qp_iter_next(iter)) { + kfree(iter); + return 
NULL; + } + + return iter; +} + +static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) +{ + struct rvt_qp_iter *iter = iter_ptr; + + if (!iter) + return 0; + + qp_iter_print(s, iter); + + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(qp_stats); +DEBUGFS_SEQ_FILE_OPEN(qp_stats) +DEBUGFS_FILE_OPS(qp_stats); + +static void *_sdes_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_ibdev *ibd; + struct hfi1_devdata *dd; + + ibd = (struct hfi1_ibdev *)s->private; + dd = dd_from_dev(ibd); + if (!dd->per_sdma || *pos >= dd->num_sdma) + return NULL; + return pos; +} + +static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + ++*pos; + if (!dd->per_sdma || *pos >= dd->num_sdma) + return NULL; + return pos; +} + +static void _sdes_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _sdes_seq_show(struct seq_file *s, void *v) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + loff_t *spos = v; + loff_t i = *spos; + + sdma_seqfile_dump_sde(s, &dd->per_sdma[i]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(sdes); +DEBUGFS_SEQ_FILE_OPEN(sdes) +DEBUGFS_FILE_OPS(sdes); + +static void *_rcds_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_ibdev *ibd; + struct hfi1_devdata *dd; + + ibd = (struct hfi1_ibdev *)s->private; + dd = dd_from_dev(ibd); + if (!dd->rcd || *pos >= dd->n_krcv_queues) + return NULL; + return pos; +} + +static void *_rcds_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + ++*pos; + if (!dd->rcd || *pos >= dd->n_krcv_queues) + return NULL; + return pos; +} + +static void _rcds_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _rcds_seq_show(struct seq_file *s, void *v) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + loff_t *spos = v; + loff_t i = *spos; + + rcd = hfi1_rcd_get_by_index_safe(dd, i); + if (rcd) + seqfile_dump_rcd(s, rcd); + hfi1_rcd_put(rcd); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(rcds); +DEBUGFS_SEQ_FILE_OPEN(rcds) +DEBUGFS_FILE_OPS(rcds); + +/* read the per-device counters */ +static ssize_t dev_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + dd = private2dd(file); + avail = hfi1_read_cntrs(dd, NULL, &counters); + rval = simple_read_from_buffer(buf, count, ppos, counters, avail); + return rval; +} + +/* read the per-device counters */ +static ssize_t dev_names_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + dd = private2dd(file); + avail = hfi1_read_cntrs(dd, &names, NULL); + rval = simple_read_from_buffer(buf, count, ppos, names, avail); + return rval; +} + +struct counter_info { + char *name; + const struct file_operations ops; +}; + +/* + * Could use file_inode(file)->i_ino to figure out which file, + * instead of separate routine for each, but for now, this works... 
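/*
 * A sketch (assuming the DEBUGFS_OPS() tables later in this file and an
 * existing per-device debugfs directory) of how these handlers get their
 * device pointer: it is stored in inode->i_private when the file is
 * created and recovered with private2dd()/private2ppd() above.
 */
static inline void example_create_counters_file(struct dentry *parent,
						struct hfi1_devdata *dd,
						const struct file_operations *ops)
{
	/* dd becomes file_inode(file)->i_private in the read handlers */
	debugfs_create_file("counters", 0444, parent, dd, ops);
}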
+ */ + +/* read the per-port names (same for each port) */ +static ssize_t portnames_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + dd = private2dd(file); + avail = hfi1_read_portcntrs(dd->pport, &names, NULL); + rval = simple_read_from_buffer(buf, count, ppos, names, avail); + return rval; +} + +/* read the per-port counters */ +static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct hfi1_pportdata *ppd; + ssize_t rval; + + ppd = private2ppd(file); + avail = hfi1_read_portcntrs(ppd, NULL, &counters); + rval = simple_read_from_buffer(buf, count, ppos, counters, avail); + return rval; +} + +static void check_dyn_flag(u64 scratch0, char *p, int size, int *used, + int this_hfi, int hfi, u32 flag, const char *what) +{ + u32 mask; + + mask = flag << (hfi ? CR_DYN_SHIFT : 0); + if (scratch0 & mask) { + *used += scnprintf(p + *used, size - *used, + " 0x%08x - HFI%d %s in use, %s device\n", + mask, hfi, what, + this_hfi == hfi ? "this" : "other"); + } +} + +static ssize_t asic_flags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u64 scratch0; + char *tmp; + int ret = 0; + int size; + int used; + int i; + + ppd = private2ppd(file); + dd = ppd->dd; + size = PAGE_SIZE; + used = 0; + tmp = kmalloc(size, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + used += scnprintf(tmp + used, size - used, + "Resource flags: 0x%016llx\n", scratch0); + + /* check permanent flag */ + if (scratch0 & CR_THERM_INIT) { + used += scnprintf(tmp + used, size - used, + " 0x%08x - thermal monitoring initialized\n", + (u32)CR_THERM_INIT); + } + + /* check each dynamic flag on each HFI */ + for (i = 0; i < 2; i++) { + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_SBUS, "SBus"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_EPROM, "EPROM"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_I2C1, "i2c chain 1"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_I2C2, "i2c chain 2"); + } + used += scnprintf(tmp + used, size - used, "Write bits to clear\n"); + + ret = simple_read_from_buffer(buf, count, ppos, tmp, used); + kfree(tmp); + return ret; +} + +static ssize_t asic_flags_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + char *buff; + int ret; + unsigned long long value; + u64 scratch0; + u64 clear; + + ppd = private2ppd(file); + dd = ppd->dd; + + /* zero terminate and read the expected integer */ + buff = memdup_user_nul(buf, count); + if (IS_ERR(buff)) + return PTR_ERR(buff); + + ret = kstrtoull(buff, 0, &value); + if (ret) + goto do_free; + clear = value; + + /* obtain exclusive access */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + acquire_hw_mutex(dd); + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + scratch0 &= ~clear; + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + + release_hw_mutex(dd); + mutex_unlock(&dd->asic_data->asic_resource_mutex); + + /* return the number of bytes written */ + ret = count; + + do_free: + kfree(buff); + return ret; +} + +/* read the dc8051 memory */ +static ssize_t dc8051_memory_read(struct file *file, 
char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd = private2ppd(file); + ssize_t rval; + void *tmp; + loff_t start, end; + + /* the checks below expect the position to be positive */ + if (*ppos < 0) + return -EINVAL; + + tmp = kzalloc(DC8051_DATA_MEM_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + /* + * Fill in the requested portion of the temporary buffer from the + * 8051 memory. The 8051 memory read is done in terms of 8 bytes. + * Adjust start and end to fit. Skip reading anything if out of + * range. + */ + start = *ppos & ~0x7; /* round down */ + if (start < DC8051_DATA_MEM_SIZE) { + end = (*ppos + count + 7) & ~0x7; /* round up */ + if (end > DC8051_DATA_MEM_SIZE) + end = DC8051_DATA_MEM_SIZE; + rval = read_8051_data(ppd->dd, start, end - start, + (u64 *)(tmp + start)); + if (rval) + goto done; + } + + rval = simple_read_from_buffer(buf, count, ppos, tmp, + DC8051_DATA_MEM_SIZE); +done: + kfree(tmp); + return rval; +} + +static ssize_t debugfs_lcb_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd = private2ppd(file); + struct hfi1_devdata *dd = ppd->dd; + unsigned long total, csr_off; + u64 data; + + if (*ppos < 0) + return -EINVAL; + /* only read 8 byte quantities */ + if ((count % 8) != 0) + return -EINVAL; + /* offset must be 8-byte aligned */ + if ((*ppos % 8) != 0) + return -EINVAL; + /* do nothing if out of range or zero count */ + if (*ppos >= (LCB_END - LCB_START) || !count) + return 0; + /* reduce count if needed */ + if (*ppos + count > LCB_END - LCB_START) + count = (LCB_END - LCB_START) - *ppos; + + csr_off = LCB_START + *ppos; + for (total = 0; total < count; total += 8, csr_off += 8) { + if (read_lcb_csr(dd, csr_off, (u64 *)&data)) + break; /* failed */ + if (put_user(data, (unsigned long __user *)(buf + total))) + break; + } + *ppos += total; + return total; +} + +static ssize_t debugfs_lcb_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd = private2ppd(file); + struct hfi1_devdata *dd = ppd->dd; + unsigned long total, csr_off, data; + + if (*ppos < 0) + return -EINVAL; + /* only write 8 byte quantities */ + if ((count % 8) != 0) + return -EINVAL; + /* offset must be 8-byte aligned */ + if ((*ppos % 8) != 0) + return -EINVAL; + /* do nothing if out of range or zero count */ + if (*ppos >= (LCB_END - LCB_START) || !count) + return 0; + /* reduce count if needed */ + if (*ppos + count > LCB_END - LCB_START) + count = (LCB_END - LCB_START) - *ppos; + + csr_off = LCB_START + *ppos; + for (total = 0; total < count; total += 8, csr_off += 8) { + if (get_user(data, (unsigned long __user *)(buf + total))) + break; + if (write_lcb_csr(dd, csr_off, data)) + break; /* failed */ + } + *ppos += total; + return total; +} + +/* + * read the per-port QSFP data for ppd + */ +static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + char *tmp; + int ret; + + ppd = private2ppd(file); + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = qsfp_dump(ppd, tmp, PAGE_SIZE); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, ppos, tmp, ret); + kfree(tmp); + return ret; +} + +/* Do an i2c write operation on the chain for the given HFI. 
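/*
 * Worked example (illustrative values only) of the byte-offset encoding
 * the i2c debugfs handlers below expect in the file position: seeking
 * to 0x00501000 selects i2c address 0x50 and target offset 0x1000.
 */
static inline void example_decode_i2c_pos(loff_t ppos, int *i2c_addr, int *offset)
{
	*i2c_addr = (ppos >> 16) & 0xffff;	/* 0x0050 */
	*offset = ppos & 0xffff;		/* 0x1000 */
}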
*/ +static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int i2c_addr; + int offset; + int total_written; + + ppd = private2ppd(file); + + /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */ + i2c_addr = (*ppos >> 16) & 0xffff; + offset = *ppos & 0xffff; + + /* explicitly reject invalid address 0 to catch cp and cat */ + if (i2c_addr == 0) + return -EINVAL; + + buff = memdup_user(buf, count); + if (IS_ERR(buff)) + return PTR_ERR(buff); + + total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count); + if (total_written < 0) { + ret = total_written; + goto _free; + } + + *ppos += total_written; + + ret = total_written; + + _free: + kfree(buff); + return ret; +} + +/* Do an i2c write operation on chain for HFI 0. */ +static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_write(file, buf, count, ppos, 0); +} + +/* Do an i2c write operation on chain for HFI 1. */ +static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_write(file, buf, count, ppos, 1); +} + +/* Do an i2c read operation on the chain for the given HFI. */ +static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int i2c_addr; + int offset; + int total_read; + + ppd = private2ppd(file); + + /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */ + i2c_addr = (*ppos >> 16) & 0xffff; + offset = *ppos & 0xffff; + + /* explicitly reject invalid address 0 to catch cp and cat */ + if (i2c_addr == 0) + return -EINVAL; + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) + return -ENOMEM; + + total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count); + if (total_read < 0) { + ret = total_read; + goto _free; + } + + *ppos += total_read; + + ret = copy_to_user(buf, buff, total_read); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + ret = total_read; + + _free: + kfree(buff); + return ret; +} + +/* Do an i2c read operation on chain for HFI 0. */ +static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_read(file, buf, count, ppos, 0); +} + +/* Do an i2c read operation on chain for HFI 1. */ +static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_read(file, buf, count, ppos, 1); +} + +/* Do a QSFP write operation on the i2c chain for the given HFI. */ +static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int total_written; + + if (*ppos + count > QSFP_PAGESIZE * 4) /* base page + page00-page03 */ + return -EINVAL; + + ppd = private2ppd(file); + + buff = memdup_user(buf, count); + if (IS_ERR(buff)) + return PTR_ERR(buff); + + total_written = qsfp_write(ppd, target, *ppos, buff, count); + if (total_written < 0) { + ret = total_written; + goto _free; + } + + *ppos += total_written; + + ret = total_written; + + _free: + kfree(buff); + return ret; +} + +/* Do a QSFP write operation on i2c chain for HFI 0. 
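/*
 * The QSFP handlers here expose the module memory as one flat range of
 * QSFP_PAGESIZE * 4 bytes (base page plus pages 00-03); a sketch of
 * essentially the bounds check they apply (QSFP_PAGESIZE is assumed
 * from qsfp.h):
 */
static inline bool example_qsfp_range_ok(loff_t pos, size_t count)
{
	return pos >= 0 && pos + count <= QSFP_PAGESIZE * 4;
}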
*/ +static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_write(file, buf, count, ppos, 0); +} + +/* Do a QSFP write operation on i2c chain for HFI 1. */ +static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_write(file, buf, count, ppos, 1); +} + +/* Do a QSFP read operation on the i2c chain for the given HFI. */ +static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int total_read; + + if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */ + ret = -EINVAL; + goto _return; + } + + ppd = private2ppd(file); + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto _return; + } + + total_read = qsfp_read(ppd, target, *ppos, buff, count); + if (total_read < 0) { + ret = total_read; + goto _free; + } + + *ppos += total_read; + + ret = copy_to_user(buf, buff, total_read); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + ret = total_read; + + _free: + kfree(buff); + _return: + return ret; +} + +/* Do a QSFP read operation on i2c chain for HFI 0. */ +static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_read(file, buf, count, ppos, 0); +} + +/* Do a QSFP read operation on i2c chain for HFI 1. */ +static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_read(file, buf, count, ppos, 1); +} + +static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + int ret; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + ppd = private2ppd(fp); + + ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); + if (ret) /* failed - release the module */ + module_put(THIS_MODULE); + + return ret; +} + +static int i2c1_debugfs_open(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_open(in, fp, 0); +} + +static int i2c2_debugfs_open(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_open(in, fp, 1); +} + +static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + + ppd = private2ppd(fp); + + release_chip_resource(ppd->dd, i2c_target(target)); + module_put(THIS_MODULE); + + return 0; +} + +static int i2c1_debugfs_release(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_release(in, fp, 0); +} + +static int i2c2_debugfs_release(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_release(in, fp, 1); +} + +static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + int ret; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + ppd = private2ppd(fp); + + ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); + if (ret) /* failed - release the module */ + module_put(THIS_MODULE); + + return ret; +} + +static int qsfp1_debugfs_open(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_open(in, fp, 0); +} + +static int qsfp2_debugfs_open(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_open(in, fp, 1); +} + +static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + + ppd = private2ppd(fp); + + release_chip_resource(ppd->dd, i2c_target(target)); + 
module_put(THIS_MODULE); + + return 0; +} + +static int qsfp1_debugfs_release(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_release(in, fp, 0); +} + +static int qsfp2_debugfs_release(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_release(in, fp, 1); +} + +#define DEBUGFS_OPS(nm, readroutine, writeroutine) \ +{ \ + .name = nm, \ + .ops = { \ + .read = readroutine, \ + .write = writeroutine, \ + .llseek = generic_file_llseek, \ + }, \ +} + +#define DEBUGFS_XOPS(nm, readf, writef, openf, releasef) \ +{ \ + .name = nm, \ + .ops = { \ + .read = readf, \ + .write = writef, \ + .llseek = generic_file_llseek, \ + .open = openf, \ + .release = releasef \ + }, \ +} + +static const struct counter_info cntr_ops[] = { + DEBUGFS_OPS("counter_names", dev_names_read, NULL), + DEBUGFS_OPS("counters", dev_counters_read, NULL), + DEBUGFS_OPS("portcounter_names", portnames_read, NULL), +}; + +static const struct counter_info port_cntr_ops[] = { + DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL), + DEBUGFS_XOPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write, + i2c1_debugfs_open, i2c1_debugfs_release), + DEBUGFS_XOPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write, + i2c2_debugfs_open, i2c2_debugfs_release), + DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL), + DEBUGFS_XOPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write, + qsfp1_debugfs_open, qsfp1_debugfs_release), + DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write, + qsfp2_debugfs_open, qsfp2_debugfs_release), + DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write), + DEBUGFS_OPS("dc8051_memory", dc8051_memory_read, NULL), + DEBUGFS_OPS("lcb", debugfs_lcb_read, debugfs_lcb_write), +}; + +static void *_sdma_cpu_list_seq_start(struct seq_file *s, loff_t *pos) +{ + if (*pos >= num_online_cpus()) + return NULL; + + return pos; +} + +static void *_sdma_cpu_list_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= num_online_cpus()) + return NULL; + + return pos; +} + +static void _sdma_cpu_list_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _sdma_cpu_list_seq_show(struct seq_file *s, void *v) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + loff_t *spos = v; + loff_t i = *spos; + + sdma_seqfile_dump_cpu_list(s, dd, (unsigned long)i); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(sdma_cpu_list); +DEBUGFS_SEQ_FILE_OPEN(sdma_cpu_list) +DEBUGFS_FILE_OPS(sdma_cpu_list); + +#ifdef CONFIG_FAULT_INJECTION +static void *_fault_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_fault_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void _fault_stats_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _fault_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + + for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) { + rcd = hfi1_rcd_get_by_index(dd, j); + if (rcd) { + n_packets += rcd->opstats->stats[i].n_packets; + n_bytes += rcd->opstats->stats[i].n_bytes; + } + hfi1_rcd_put(rcd); + } + 
for_each_possible_cpu(j) { + struct hfi1_opcode_stats_perctx *sp = + per_cpu_ptr(dd->tx_opstats, j); + + n_packets += sp->stats[i].n_packets; + n_bytes += sp->stats[i].n_bytes; + } + if (!n_packets && !n_bytes) + return SEQ_SKIP; + if (!ibd->fault_opcode->n_rxfaults[i] && + !ibd->fault_opcode->n_txfaults[i]) + return SEQ_SKIP; + seq_printf(s, "%02llx %llu/%llu (faults rx:%llu faults: tx:%llu)\n", i, + (unsigned long long)n_packets, + (unsigned long long)n_bytes, + (unsigned long long)ibd->fault_opcode->n_rxfaults[i], + (unsigned long long)ibd->fault_opcode->n_txfaults[i]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(fault_stats); +DEBUGFS_SEQ_FILE_OPEN(fault_stats); +DEBUGFS_FILE_OPS(fault_stats); + +static void fault_exit_opcode_debugfs(struct hfi1_ibdev *ibd) +{ + debugfs_remove_recursive(ibd->fault_opcode->dir); + kfree(ibd->fault_opcode); + ibd->fault_opcode = NULL; +} + +static int fault_init_opcode_debugfs(struct hfi1_ibdev *ibd) +{ + struct dentry *parent = ibd->hfi1_ibdev_dbg; + + ibd->fault_opcode = kzalloc(sizeof(*ibd->fault_opcode), GFP_KERNEL); + if (!ibd->fault_opcode) + return -ENOMEM; + + ibd->fault_opcode->attr.interval = 1; + ibd->fault_opcode->attr.require_end = ULONG_MAX; + ibd->fault_opcode->attr.stacktrace_depth = 32; + ibd->fault_opcode->attr.dname = NULL; + ibd->fault_opcode->attr.verbose = 0; + ibd->fault_opcode->fault_by_opcode = false; + ibd->fault_opcode->opcode = 0; + ibd->fault_opcode->mask = 0xff; + + ibd->fault_opcode->dir = + fault_create_debugfs_attr("fault_opcode", + parent, + &ibd->fault_opcode->attr); + if (IS_ERR(ibd->fault_opcode->dir)) { + kfree(ibd->fault_opcode); + return -ENOENT; + } + + DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault_opcode->dir, ibd); + if (!debugfs_create_bool("fault_by_opcode", 0600, + ibd->fault_opcode->dir, + &ibd->fault_opcode->fault_by_opcode)) + goto fail; + if (!debugfs_create_x8("opcode", 0600, ibd->fault_opcode->dir, + &ibd->fault_opcode->opcode)) + goto fail; + if (!debugfs_create_x8("mask", 0600, ibd->fault_opcode->dir, + &ibd->fault_opcode->mask)) + goto fail; + + return 0; +fail: + fault_exit_opcode_debugfs(ibd); + return -ENOMEM; +} + +static void fault_exit_packet_debugfs(struct hfi1_ibdev *ibd) +{ + debugfs_remove_recursive(ibd->fault_packet->dir); + kfree(ibd->fault_packet); + ibd->fault_packet = NULL; +} + +static int fault_init_packet_debugfs(struct hfi1_ibdev *ibd) +{ + struct dentry *parent = ibd->hfi1_ibdev_dbg; + + ibd->fault_packet = kzalloc(sizeof(*ibd->fault_packet), GFP_KERNEL); + if (!ibd->fault_packet) + return -ENOMEM; + + ibd->fault_packet->attr.interval = 1; + ibd->fault_packet->attr.require_end = ULONG_MAX; + ibd->fault_packet->attr.stacktrace_depth = 32; + ibd->fault_packet->attr.dname = NULL; + ibd->fault_packet->attr.verbose = 0; + ibd->fault_packet->fault_by_packet = false; + + ibd->fault_packet->dir = + fault_create_debugfs_attr("fault_packet", + parent, + &ibd->fault_opcode->attr); + if (IS_ERR(ibd->fault_packet->dir)) { + kfree(ibd->fault_packet); + return -ENOENT; + } + + if (!debugfs_create_bool("fault_by_packet", 0600, + ibd->fault_packet->dir, + &ibd->fault_packet->fault_by_packet)) + goto fail; + if (!debugfs_create_u64("fault_stats", 0400, + ibd->fault_packet->dir, + &ibd->fault_packet->n_faults)) + goto fail; + + return 0; +fail: + fault_exit_packet_debugfs(ibd); + return -ENOMEM; +} + +static void fault_exit_debugfs(struct hfi1_ibdev *ibd) +{ + fault_exit_opcode_debugfs(ibd); + fault_exit_packet_debugfs(ibd); +} + +static int fault_init_debugfs(struct hfi1_ibdev *ibd) +{ + int ret = 0; 
+ + ret = fault_init_opcode_debugfs(ibd); + if (ret) + return ret; + + ret = fault_init_packet_debugfs(ibd); + if (ret) + fault_exit_opcode_debugfs(ibd); + + return ret; +} + +bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) +{ + return ibd->fault_suppress_err; +} + +bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, u32 opcode, bool rx) +{ + bool ret = false; + struct hfi1_ibdev *ibd = to_idev(qp->ibqp.device); + + if (!ibd->fault_opcode || !ibd->fault_opcode->fault_by_opcode) + return false; + if (ibd->fault_opcode->opcode != (opcode & ibd->fault_opcode->mask)) + return false; + ret = should_fail(&ibd->fault_opcode->attr, 1); + if (ret) { + trace_hfi1_fault_opcode(qp, opcode); + if (rx) + ibd->fault_opcode->n_rxfaults[opcode]++; + else + ibd->fault_opcode->n_txfaults[opcode]++; + } + return ret; +} + +bool hfi1_dbg_fault_packet(struct hfi1_packet *packet) +{ + struct rvt_dev_info *rdi = &packet->rcd->ppd->dd->verbs_dev.rdi; + struct hfi1_ibdev *ibd = dev_from_rdi(rdi); + bool ret = false; + + if (!ibd->fault_packet || !ibd->fault_packet->fault_by_packet) + return false; + + ret = should_fail(&ibd->fault_packet->attr, 1); + if (ret) { + ++ibd->fault_packet->n_faults; + trace_hfi1_fault_packet(packet); + } + return ret; +} +#endif + +void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) +{ + char name[sizeof("port0counters") + 1]; + char link[10]; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_pportdata *ppd; + int unit = dd->unit; + int i, j; + + if (!hfi1_dbg_root) + return; + snprintf(name, sizeof(name), "%s_%d", class_name(), unit); + snprintf(link, sizeof(link), "%d", unit); + ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root); + if (!ibd->hfi1_ibdev_dbg) { + pr_warn("create of %s failed\n", name); + return; + } + ibd->hfi1_ibdev_link = + debugfs_create_symlink(link, hfi1_dbg_root, name); + if (!ibd->hfi1_ibdev_link) { + pr_warn("create of %s symlink failed\n", name); + return; + } + DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(tx_opcode_stats, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(rcds, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(sdma_cpu_list, ibd->hfi1_ibdev_dbg, ibd); + /* dev counter files */ + for (i = 0; i < ARRAY_SIZE(cntr_ops); i++) + DEBUGFS_FILE_CREATE(cntr_ops[i].name, + ibd->hfi1_ibdev_dbg, + dd, + &cntr_ops[i].ops, S_IRUGO); + /* per port files */ + for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++) + for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) { + snprintf(name, + sizeof(name), + port_cntr_ops[i].name, + j + 1); + DEBUGFS_FILE_CREATE(name, + ibd->hfi1_ibdev_dbg, + ppd, + &port_cntr_ops[i].ops, + !port_cntr_ops[i].ops.write ? + S_IRUGO : S_IRUGO | S_IWUSR); + } + +#ifdef CONFIG_FAULT_INJECTION + debugfs_create_bool("fault_suppress_err", 0600, + ibd->hfi1_ibdev_dbg, + &ibd->fault_suppress_err); + fault_init_debugfs(ibd); +#endif +} + +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd) +{ + if (!hfi1_dbg_root) + goto out; +#ifdef CONFIG_FAULT_INJECTION + fault_exit_debugfs(ibd); +#endif + debugfs_remove(ibd->hfi1_ibdev_link); + debugfs_remove_recursive(ibd->hfi1_ibdev_dbg); +out: + ibd->hfi1_ibdev_dbg = NULL; +} + +/* + * driver stats field names, one line per stat, single string. 
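The fault_opcode knobs created above (fault_by_opcode, opcode, mask) select packets by a masked opcode compare before should_fail() is consulted, and the per-opcode rx/tx counters record what was actually faulted. A standalone sketch of just that selection rule, with illustrative opcode values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Mirror of the match rule in hfi1_dbg_fault_opcode(): a packet is a fault
 * candidate when (opcode & mask) equals the configured opcode value.
 */
static bool opcode_selected(uint8_t pkt_opcode, uint8_t cfg_opcode,
			    uint8_t cfg_mask)
{
	return (pkt_opcode & cfg_mask) == cfg_opcode;
}

int main(void)
{
	/* opcode = 0x00 with mask = 0xe0 selects the whole 0x00-0x1f group;
	 * an exact match uses mask = 0xff. */
	printf("%d %d %d\n",
	       opcode_selected(0x0a, 0x00, 0xe0),	/* 1: an opcode in the group  */
	       opcode_selected(0x64, 0x00, 0xe0),	/* 0: outside the group       */
	       opcode_selected(0x64, 0x64, 0xff));	/* 1: exact match             */
	return 0;
}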
Used by + * programs like hfistats to print the stats in a way which works for + * different versions of drivers, without changing program source. + * if hfi1_ib_stats changes, this needs to change. Names need to be + * 12 chars or less (w/o newline), for proper display by hfistats utility. + */ +static const char * const hfi1_statnames[] = { + /* must be element 0*/ + "KernIntr", + "ErrorIntr", + "Tx_Errs", + "Rcv_Errs", + "H/W_Errs", + "NoPIOBufs", + "CtxtsOpen", + "RcvLen_Errs", + "EgrBufFull", + "EgrHdrFull" +}; + +static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void *_driver_stats_names_seq_next( + struct seq_file *s, + void *v, + loff_t *pos) +{ + ++*pos; + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void _driver_stats_names_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _driver_stats_names_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + + seq_printf(s, "%s\n", hfi1_statnames[*spos]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(driver_stats_names); +DEBUGFS_SEQ_FILE_OPEN(driver_stats_names) +DEBUGFS_FILE_OPS(driver_stats_names); + +static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void _driver_stats_seq_stop(struct seq_file *s, void *v) +{ +} + +static u64 hfi1_sps_ints(void) +{ + unsigned long flags; + struct hfi1_devdata *dd; + u64 sps_ints = 0; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + list_for_each_entry(dd, &hfi1_dev_list, list) { + sps_ints += get_all_cpu_total(dd->int_counter); + } + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + return sps_ints; +} + +static int _driver_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + char *buffer; + u64 *stats = (u64 *)&hfi1_stats; + size_t sz = seq_get_buf(s, &buffer); + + if (sz < sizeof(u64)) + return SEQ_SKIP; + /* special case for interrupts */ + if (*spos == 0) + *(u64 *)buffer = hfi1_sps_ints(); + else + *(u64 *)buffer = stats[*spos]; + seq_commit(s, sizeof(u64)); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(driver_stats); +DEBUGFS_SEQ_FILE_OPEN(driver_stats) +DEBUGFS_FILE_OPS(driver_stats); + +void hfi1_dbg_init(void) +{ + hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL); + if (!hfi1_dbg_root) + pr_warn("init of debugfs failed\n"); + DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL); + DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL); +} + +void hfi1_dbg_exit(void) +{ + debugfs_remove_recursive(hfi1_dbg_root); + hfi1_dbg_root = NULL; +} diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h new file mode 100644 index 0000000..38c38a9 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/debugfs.h @@ -0,0 +1,131 @@ +#ifndef _HFI1_DEBUGFS_H +#define _HFI1_DEBUGFS_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
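driver_stats_names emits one short name per line, while driver_stats returns the same number of raw native-endian u64 values through seq_get_buf()/seq_commit(), with element 0 replaced by the summed per-device interrupt count. A small userspace sketch that pairs the two files positionally, the way a tool like hfistats could; the paths assume debugfs is mounted at /sys/kernel/debug and DRIVER_NAME is hfi1:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *names = fopen("/sys/kernel/debug/hfi1/driver_stats_names", "r");
	FILE *stats = fopen("/sys/kernel/debug/hfi1/driver_stats", "rb");
	char name[64];
	uint64_t val;

	if (!names || !stats)
		return 1;
	/* one text name per line, one raw u64 per stat, in the same order */
	while (fgets(name, sizeof(name), names) &&
	       fread(&val, sizeof(val), 1, stats) == 1) {
		name[strcspn(name, "\n")] = '\0';
		printf("%-12s %llu\n", name, (unsigned long long)val);
	}
	fclose(names);
	fclose(stats);
	return 0;
}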
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +struct hfi1_ibdev; +#ifdef CONFIG_DEBUG_FS +void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd); +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd); +void hfi1_dbg_init(void); +void hfi1_dbg_exit(void); + +#ifdef CONFIG_FAULT_INJECTION +#include +struct fault_opcode { + struct fault_attr attr; + struct dentry *dir; + bool fault_by_opcode; + u64 n_rxfaults[256]; + u64 n_txfaults[256]; + u8 opcode; + u8 mask; +}; + +struct fault_packet { + struct fault_attr attr; + struct dentry *dir; + bool fault_by_packet; + u64 n_faults; +}; + +bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, u32 opcode, bool rx); +bool hfi1_dbg_fault_packet(struct hfi1_packet *packet); +bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd); +#else +static inline bool hfi1_dbg_fault_packet(struct hfi1_packet *packet) +{ + return false; +} + +static inline bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, + u32 opcode, bool rx) +{ + return false; +} + +static inline bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) +{ + return false; +} +#endif + +#else +static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) +{ +} + +static inline void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd) +{ +} + +static inline void hfi1_dbg_init(void) +{ +} + +static inline void hfi1_dbg_exit(void) +{ +} + +static inline bool hfi1_dbg_fault_packet(struct hfi1_packet *packet) +{ + return false; +} + +static inline bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, + u32 opcode, bool rx) +{ + return false; +} + +static inline bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) +{ + return false; +} +#endif + +#endif /* _HFI1_DEBUGFS_H */ diff --git a/drivers/infiniband/hw/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c new file mode 100644 index 0000000..bbb6069 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/device.c @@ -0,0 +1,183 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include + +#include "hfi.h" +#include "device.h" + +static struct class *class; +static struct class *user_class; +static dev_t hfi1_dev; + +int hfi1_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev *cdev, struct device **devp, + bool user_accessible, + struct kobject *parent) +{ + const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor); + struct device *device = NULL; + int ret; + + cdev_init(cdev, fops); + cdev->owner = THIS_MODULE; + cdev_set_parent(cdev, parent); + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + pr_err("Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto done; + } + + if (user_accessible) + device = device_create(user_class, NULL, dev, NULL, "%s", name); + else + device = device_create(class, NULL, dev, NULL, "%s", name); + + if (IS_ERR(device)) { + ret = PTR_ERR(device); + device = NULL; + pr_err("Could not create device for minor %d, %s (err %d)\n", + minor, name, -ret); + cdev_del(cdev); + } +done: + *devp = device; + return ret; +} + +void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp) +{ + struct device *device = *devp; + + if (device) { + device_unregister(device); + *devp = NULL; + + cdev_del(cdev); + } +} + +static const char *hfi1_class_name = "hfi1"; + +const char *class_name(void) +{ + return hfi1_class_name; +} + +static char *hfi1_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "%s", dev_name(dev)); +} + +static const char *hfi1_class_name_user = "hfi1_user"; +static const char *class_name_user(void) +{ + return hfi1_class_name_user; +} + +static char *hfi1_user_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "%s", dev_name(dev)); +} + +int __init dev_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME); + if (ret < 0) { + pr_err("Could not allocate chrdev region (err %d)\n", -ret); + goto done; + } + + class = class_create(THIS_MODULE, class_name()); + if (IS_ERR(class)) { + ret = PTR_ERR(class); + pr_err("Could not create device class (err %d)\n", -ret); + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); + goto done; + } + class->devnode = hfi1_devnode; + + user_class = class_create(THIS_MODULE, class_name_user()); + if (IS_ERR(user_class)) { + ret = PTR_ERR(user_class); + pr_err("Could not create device class for user accessible files (err %d)\n", + -ret); + class_destroy(class); + class = NULL; + user_class = NULL; + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); + goto done; + } + user_class->devnode = hfi1_user_devnode; + +done: + return ret; +} + +void dev_cleanup(void) +{ + class_destroy(class); + class = NULL; + + class_destroy(user_class); + user_class = NULL; + + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); +} diff --git a/drivers/infiniband/hw/hfi1/device.h b/drivers/infiniband/hw/hfi1/device.h 
new file mode 100644 index 0000000..c3ec19c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/device.h @@ -0,0 +1,60 @@ +#ifndef _HFI1_DEVICE_H +#define _HFI1_DEVICE_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +int hfi1_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev *cdev, struct device **devp, + bool user_accessible, + struct kobject *parent); +void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp); +const char *class_name(void); +int __init dev_init(void); +void dev_cleanup(void); + +#endif /* _HFI1_DEVICE_H */ diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c new file mode 100644 index 0000000..bd837a0 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -0,0 +1,1749 @@ +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "trace.h" +#include "qp.h" +#include "sdma.h" +#include "debugfs.h" +#include "vnic.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +/* + * The size has to be longer than this string, so we can append + * board/chip information to it in the initialization code. + */ +const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n"; + +DEFINE_SPINLOCK(hfi1_devs_lock); +LIST_HEAD(hfi1_dev_list); +DEFINE_MUTEX(hfi1_mutex); /* general driver use */ + +unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; +module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO); +MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify( + HFI1_DEFAULT_MAX_MTU)); + +unsigned int hfi1_cu = 1; +module_param_named(cu, hfi1_cu, uint, S_IRUGO); +MODULE_PARM_DESC(cu, "Credit return units"); + +unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT; +static int hfi1_caps_set(const char *val, const struct kernel_param *kp); +static int hfi1_caps_get(char *buffer, const struct kernel_param *kp); +static const struct kernel_param_ops cap_ops = { + .set = hfi1_caps_set, + .get = hfi1_caps_get +}; +module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Intel Omni-Path Architecture driver"); + +/* + * MAX_PKT_RCV is the max # if packets processed per receive interrupt. + */ +#define MAX_PKT_RECV 64 +/* + * MAX_PKT_THREAD_RCV is the max # of packets processed before + * the qp_wait_list queue is flushed. 
+ */ +#define MAX_PKT_RECV_THREAD (MAX_PKT_RECV * 4) +#define EGR_HEAD_UPDATE_THRESHOLD 16 + +struct hfi1_ib_stats hfi1_stats; + +static int hfi1_caps_set(const char *val, const struct kernel_param *kp) +{ + int ret = 0; + unsigned long *cap_mask_ptr = (unsigned long *)kp->arg, + cap_mask = *cap_mask_ptr, value, diff, + write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_WRITABLE_MASK); + + ret = kstrtoul(val, 0, &value); + if (ret) { + pr_warn("Invalid module parameter value for 'cap_mask'\n"); + goto done; + } + /* Get the changed bits (except the locked bit) */ + diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK); + + /* Remove any bits that are not allowed to change after driver load */ + if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) { + pr_warn("Ignoring non-writable capability bits %#lx\n", + diff & ~write_mask); + diff &= write_mask; + } + + /* Mask off any reserved bits */ + diff &= ~HFI1_CAP_RESERVED_MASK; + /* Clear any previously set and changing bits */ + cap_mask &= ~diff; + /* Update the bits with the new capability */ + cap_mask |= (value & diff); + /* Check for any kernel/user restrictions */ + diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^ + ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT); + cap_mask &= ~diff; + /* Set the bitmask to the final set */ + *cap_mask_ptr = cap_mask; +done: + return ret; +} + +static int hfi1_caps_get(char *buffer, const struct kernel_param *kp) +{ + unsigned long cap_mask = *(unsigned long *)kp->arg; + + cap_mask &= ~HFI1_CAP_LOCKED_SMASK; + cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT); + + return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask); +} + +struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi) +{ + struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); + struct hfi1_devdata *dd = container_of(ibdev, + struct hfi1_devdata, verbs_dev); + return dd->pcidev; +} + +/* + * Return count of units with at least one port ACTIVE. + */ +int hfi1_count_active_units(void) +{ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + unsigned long flags; + int pidx, nunits_active = 0; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + list_for_each_entry(dd, &hfi1_dev_list, list) { + if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase1) + continue; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && ppd->linkup) { + nunits_active++; + break; + } + } + } + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + return nunits_active; +} + +/* + * Get address of eager buffer from its index (allocated in chunks, not + * contiguous).
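hfi1_caps_set() above is essentially a masked read-modify-write: find the bits that differ, keep only those that are allowed to change, then fold the requested values in. A standalone model of just that core step, with placeholder masks rather than the real HFI1_CAP_* constants; the locked, reserved and kernel/user pairing handling is omitted:

#include <stdio.h>

/*
 * Only bits inside write_mask may change; everything else keeps its
 * current value.
 */
static unsigned long apply_writable(unsigned long cur, unsigned long req,
				    unsigned long write_mask)
{
	unsigned long diff = (cur ^ req) & write_mask;	/* changeable bits that differ */

	return (cur & ~diff) | (req & diff);
}

int main(void)
{
	unsigned long cur = 0x0000f00f, req = 0xffffffff, wmask = 0x000000ff;

	/* only the low byte is writable, so only bits 4..7 turn on */
	printf("0x%lx\n", apply_writable(cur, req, wmask));	/* prints 0xf0ff */
	return 0;
}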
+ */ +static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf, + u8 *update) +{ + u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf); + + *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset; + return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) + + (offset * RCV_BUF_BLOCK_SIZE)); +} + +static inline void *hfi1_get_header(struct hfi1_devdata *dd, + __le32 *rhf_addr) +{ + u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr)); + + return (void *)(rhf_addr - dd->rhf_offset + offset); +} + +static inline struct ib_header *hfi1_get_msgheader(struct hfi1_devdata *dd, + __le32 *rhf_addr) +{ + return (struct ib_header *)hfi1_get_header(dd, rhf_addr); +} + +static inline struct hfi1_16b_header + *hfi1_get_16B_header(struct hfi1_devdata *dd, + __le32 *rhf_addr) +{ + return (struct hfi1_16b_header *)hfi1_get_header(dd, rhf_addr); +} + +/* + * Validate and encode the a given RcvArray Buffer size. + * The function will check whether the given size falls within + * allowed size ranges for the respective type and, optionally, + * return the proper encoding. + */ +int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded) +{ + if (unlikely(!PAGE_ALIGNED(size))) + return 0; + if (unlikely(size < MIN_EAGER_BUFFER)) + return 0; + if (size > + (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER)) + return 0; + if (encoded) + *encoded = ilog2(size / PAGE_SIZE) + 1; + return 1; +} + +static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, + struct hfi1_packet *packet) +{ + struct ib_header *rhdr = packet->hdr; + u32 rte = rhf_rcv_type_err(packet->rhf); + u32 mlid_base; + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_ibdev *verbs_dev = &dd->verbs_dev; + struct rvt_dev_info *rdi = &verbs_dev->rdi; + + if ((packet->rhf & RHF_DC_ERR) && + hfi1_dbg_fault_suppress_err(verbs_dev)) + return; + + if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR)) + return; + + if (packet->etype == RHF_RCV_TYPE_BYPASS) { + goto drop; + } else { + u8 lnh = ib_get_lnh(rhdr); + + mlid_base = be16_to_cpu(IB_MULTICAST_LID_BASE); + if (lnh == HFI1_LRH_BTH) { + packet->ohdr = &rhdr->u.oth; + } else if (lnh == HFI1_LRH_GRH) { + packet->ohdr = &rhdr->u.l.oth; + packet->grh = &rhdr->u.l.grh; + } else { + goto drop; + } + } + + if (packet->rhf & RHF_TID_ERR) { + /* For TIDERR and RC QPs preemptively schedule a NAK */ + u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + u32 dlid = ib_get_dlid(rhdr); + u32 qp_num; + + /* Sanity check packet */ + if (tlen < 24) + goto drop; + + /* Check for GRH */ + if (packet->grh) { + u32 vtf; + struct ib_grh *grh = packet->grh; + + if (grh->next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(grh->version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } + + /* Get the destination QP number. */ + qp_num = ib_bth_get_qpn(packet->ohdr); + if (dlid < mlid_base) { + struct rvt_qp *qp; + unsigned long flags; + + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!qp) { + rcu_read_unlock(); + goto drop; + } + + /* + * Handle only RC QPs - for other QP types drop error + * packet. + */ + spin_lock_irqsave(&qp->r_lock, flags); + + /* Check for valid receive state. 
*/ + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + hfi1_rc_hdrerr(rcd, packet, qp); + break; + default: + /* For now don't handle any other QP types */ + break; + } + + spin_unlock_irqrestore(&qp->r_lock, flags); + rcu_read_unlock(); + } /* Unicast QP */ + } /* Valid packet with TIDErr */ + + /* handle "RcvTypeErr" flags */ + switch (rte) { + case RHF_RTE_ERROR_OP_CODE_ERR: + { + void *ebuf = NULL; + u8 opcode; + + if (rhf_use_egr_bfr(packet->rhf)) + ebuf = packet->ebuf; + + if (!ebuf) + goto drop; /* this should never happen */ + + opcode = ib_bth_get_opcode(packet->ohdr); + if (opcode == IB_OPCODE_CNP) { + /* + * Only in pre-B0 h/w is the CNP_OPCODE handled + * via this code path. + */ + struct rvt_qp *qp = NULL; + u32 lqpn, rqpn; + u16 rlid; + u8 svc_type, sl, sc5; + + sc5 = hfi1_9B_get_sc5(rhdr, packet->rhf); + sl = ibp->sc_to_sl[sc5]; + + lqpn = ib_bth_get_qpn(packet->ohdr); + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn); + if (!qp) { + rcu_read_unlock(); + goto drop; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_UD: + rlid = 0; + rqpn = 0; + svc_type = IB_CC_SVCTYPE_UD; + break; + case IB_QPT_UC: + rlid = ib_get_slid(rhdr); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_UC; + break; + default: + rcu_read_unlock(); + goto drop; + } + + process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); + rcu_read_unlock(); + } + + packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK; + break; + } + default: + break; + } + +drop: + return; +} + +static inline void init_packet(struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet) +{ + packet->rsize = rcd->rcvhdrqentsize; /* words */ + packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */ + packet->rcd = rcd; + packet->updegr = 0; + packet->etail = -1; + packet->rhf_addr = get_rhf_addr(rcd); + packet->rhf = rhf_to_cpu(packet->rhf_addr); + packet->rhqoff = rcd->head; + packet->numpkt = 0; +} + +/* We support only two types - 9B and 16B for now */ +static const hfi1_handle_cnp hfi1_handle_cnp_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &return_cnp, + [HFI1_PKT_TYPE_16B] = &return_cnp_16B +}; + +void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct ib_other_headers *ohdr = pkt->ohdr; + struct ib_grh *grh = pkt->grh; + u32 rqpn = 0, bth1; + u16 pkey; + u32 rlid, slid, dlid = 0; + u8 hdr_type, sc, svc_type; + bool is_mcast = false; + + /* can be called from prescan */ + if (pkt->etype == RHF_RCV_TYPE_BYPASS) { + is_mcast = hfi1_is_16B_mcast(dlid); + pkey = hfi1_16B_get_pkey(pkt->hdr); + sc = hfi1_16B_get_sc(pkt->hdr); + dlid = hfi1_16B_get_dlid(pkt->hdr); + slid = hfi1_16B_get_slid(pkt->hdr); + hdr_type = HFI1_PKT_TYPE_16B; + } else { + is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); + pkey = ib_bth_get_pkey(ohdr); + sc = hfi1_9B_get_sc5(pkt->hdr, pkt->rhf); + dlid = ib_get_dlid(pkt->hdr); + slid = ib_get_slid(pkt->hdr); + hdr_type = HFI1_PKT_TYPE_9B; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_UD: + dlid = ppd->lid; + rlid = slid; + rqpn = ib_get_sqpn(pkt->ohdr); + svc_type = IB_CC_SVCTYPE_UD; + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + rlid = slid; + rqpn = ib_get_sqpn(pkt->ohdr); + svc_type = IB_CC_SVCTYPE_UD; + break; + case IB_QPT_UC: + rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); + rqpn = qp->remote_qpn; + svc_type 
= IB_CC_SVCTYPE_UC; + break; + case IB_QPT_RC: + rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_RC; + break; + default: + return; + } + + bth1 = be32_to_cpu(ohdr->bth[1]); + /* Call appropriate CNP handler */ + if (do_cnp && (bth1 & IB_FECN_SMASK)) + hfi1_handle_cnp_tbl[hdr_type](ibp, qp, rqpn, pkey, + dlid, rlid, sc, grh); + + if (!is_mcast && (bth1 & IB_BECN_SMASK)) { + u32 lqpn = bth1 & RVT_QPN_MASK; + u8 sl = ibp->sc_to_sl[sc]; + + process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); + } + +} + +struct ps_mdata { + struct hfi1_ctxtdata *rcd; + u32 rsize; + u32 maxcnt; + u32 ps_head; + u32 ps_tail; + u32 ps_seq; +}; + +static inline void init_ps_mdata(struct ps_mdata *mdata, + struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + mdata->rcd = rcd; + mdata->rsize = packet->rsize; + mdata->maxcnt = packet->maxcnt; + mdata->ps_head = packet->rhqoff; + + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + mdata->ps_tail = get_rcvhdrtail(rcd); + if (rcd->ctxt == HFI1_CTRL_CTXT) + mdata->ps_seq = rcd->seq_cnt; + else + mdata->ps_seq = 0; /* not used with DMA_RTAIL */ + } else { + mdata->ps_tail = 0; /* used only with DMA_RTAIL*/ + mdata->ps_seq = rcd->seq_cnt; + } +} + +static inline int ps_done(struct ps_mdata *mdata, u64 rhf, + struct hfi1_ctxtdata *rcd) +{ + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) + return mdata->ps_head == mdata->ps_tail; + return mdata->ps_seq != rhf_rcv_seq(rhf); +} + +static inline int ps_skip(struct ps_mdata *mdata, u64 rhf, + struct hfi1_ctxtdata *rcd) +{ + /* + * Control context can potentially receive an invalid rhf. + * Drop such packets. + */ + if ((rcd->ctxt == HFI1_CTRL_CTXT) && (mdata->ps_head != mdata->ps_tail)) + return mdata->ps_seq != rhf_rcv_seq(rhf); + + return 0; +} + +static inline void update_ps_mdata(struct ps_mdata *mdata, + struct hfi1_ctxtdata *rcd) +{ + mdata->ps_head += mdata->rsize; + if (mdata->ps_head >= mdata->maxcnt) + mdata->ps_head = 0; + + /* Control context must do seq counting */ + if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) || + (rcd->ctxt == HFI1_CTRL_CTXT)) { + if (++mdata->ps_seq > 13) + mdata->ps_seq = 1; + } +} + +/* + * prescan_rxq - search through the receive queue looking for packets + * containing Explicit Congestion Notifications (FECNs, or BECNs). + * When an ECN is found, process the Congestion Notification, and toggle + * it off. + * This is declared as a macro to allow quick checking of the port to avoid + * the overhead of a function call if not enabled.
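The ps_mdata walk above advances the scan head by one header-queue entry, wraps at maxcnt, and, when DMA_RTAIL is not in use (or for the control context), tracks an expected RHF sequence number that cycles from 1 to 13. A toy standalone model of that ring walk, with made-up sizes; in the driver the sequence is only maintained in the cases update_ps_mdata() shows:

#include <stdint.h>
#include <stdio.h>

/* An entry is "new" only while its RHF sequence equals the expected one. */
struct hq_walk {
	uint32_t head;		/* current offset, in words */
	uint32_t rsize;		/* entry size, in words */
	uint32_t maxcnt;	/* queue size, in words */
	uint32_t seq;		/* next expected sequence, 1..13 */
};

static void hq_advance(struct hq_walk *w)
{
	w->head += w->rsize;
	if (w->head >= w->maxcnt)
		w->head = 0;
	if (++w->seq > 13)
		w->seq = 1;
}

int main(void)
{
	struct hq_walk w = { .head = 0, .rsize = 32, .maxcnt = 32 * 4, .seq = 13 };
	int i;

	for (i = 0; i < 3; i++) {
		printf("head=%u expect seq %u\n",
		       (unsigned)w.head, (unsigned)w.seq);
		hq_advance(&w);
	}
	return 0;
}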
+ */ +#define prescan_rxq(rcd, packet) \ + do { \ + if (rcd->ppd->cc_prescan) \ + __prescan_rxq(packet); \ + } while (0) +static void __prescan_rxq(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct ps_mdata mdata; + + init_ps_mdata(&mdata, packet); + + while (1) { + struct hfi1_devdata *dd = rcd->dd; + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head + + dd->rhf_offset; + struct rvt_qp *qp; + struct ib_header *hdr; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + u64 rhf = rhf_to_cpu(rhf_addr); + u32 etype = rhf_rcv_type(rhf), qpn, bth1; + int is_ecn = 0; + u8 lnh; + + if (ps_done(&mdata, rhf, rcd)) + break; + + if (ps_skip(&mdata, rhf, rcd)) + goto next; + + if (etype != RHF_RCV_TYPE_IB) + goto next; + + packet->hdr = hfi1_get_msgheader(dd, rhf_addr); + hdr = packet->hdr; + lnh = ib_get_lnh(hdr); + + if (lnh == HFI1_LRH_BTH) { + packet->ohdr = &hdr->u.oth; + packet->grh = NULL; + } else if (lnh == HFI1_LRH_GRH) { + packet->ohdr = &hdr->u.l.oth; + packet->grh = &hdr->u.l.grh; + } else { + goto next; /* just in case */ + } + + bth1 = be32_to_cpu(packet->ohdr->bth[1]); + is_ecn = !!(bth1 & (IB_FECN_SMASK | IB_BECN_SMASK)); + + if (!is_ecn) + goto next; + + qpn = bth1 & RVT_QPN_MASK; + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn); + + if (!qp) { + rcu_read_unlock(); + goto next; + } + + process_ecn(qp, packet, true); + rcu_read_unlock(); + + /* turn off BECN, FECN */ + bth1 &= ~(IB_FECN_SMASK | IB_BECN_SMASK); + packet->ohdr->bth[1] = cpu_to_be32(bth1); +next: + update_ps_mdata(&mdata, rcd); + } +} + +static void process_rcv_qp_work(struct hfi1_packet *packet) +{ + struct rvt_qp *qp, *nqp; + struct hfi1_ctxtdata *rcd = packet->rcd; + + /* + * Iterate over all QPs waiting to respond. + * The list won't change since the IRQ is only run on one CPU. 
+ */ + list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) { + list_del_init(&qp->rspwait); + if (qp->r_flags & RVT_R_RSP_NAK) { + qp->r_flags &= ~RVT_R_RSP_NAK; + packet->qp = qp; + hfi1_send_rc_ack(packet, 0); + } + if (qp->r_flags & RVT_R_RSP_SEND) { + unsigned long flags; + + qp->r_flags &= ~RVT_R_RSP_SEND; + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & + RVT_PROCESS_OR_FLUSH_SEND) + hfi1_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + rvt_put_qp(qp); + } +} + +static noinline int max_packet_exceeded(struct hfi1_packet *packet, int thread) +{ + if (thread) { + if ((packet->numpkt & (MAX_PKT_RECV_THREAD - 1)) == 0) + /* allow defered processing */ + process_rcv_qp_work(packet); + cond_resched(); + return RCV_PKT_OK; + } else { + this_cpu_inc(*packet->rcd->dd->rcv_limit); + return RCV_PKT_LIMIT; + } +} + +static inline int check_max_packet(struct hfi1_packet *packet, int thread) +{ + int ret = RCV_PKT_OK; + + if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) + ret = max_packet_exceeded(packet, thread); + return ret; +} + +static noinline int skip_rcv_packet(struct hfi1_packet *packet, int thread) +{ + int ret; + + /* Set up for the next packet */ + packet->rhqoff += packet->rsize; + if (packet->rhqoff >= packet->maxcnt) + packet->rhqoff = 0; + + packet->numpkt++; + ret = check_max_packet(packet, thread); + + packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + + packet->rcd->dd->rhf_offset; + packet->rhf = rhf_to_cpu(packet->rhf_addr); + + return ret; +} + +static inline int process_rcv_packet(struct hfi1_packet *packet, int thread) +{ + int ret; + + packet->etype = rhf_rcv_type(packet->rhf); + + /* total length */ + packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + /* retrieve eager buffer details */ + packet->ebuf = NULL; + if (rhf_use_egr_bfr(packet->rhf)) { + packet->etail = rhf_egr_index(packet->rhf); + packet->ebuf = get_egrbuf(packet->rcd, packet->rhf, + &packet->updegr); + /* + * Prefetch the contents of the eager buffer. It is + * OK to send a negative length to prefetch_range(). + * The +2 is the size of the RHF. + */ + prefetch_range(packet->ebuf, + packet->tlen - ((packet->rcd->rcvhdrqentsize - + (rhf_hdrq_offset(packet->rhf) + + 2)) * 4)); + } + + /* + * Call a type specific handler for the packet. We + * should be able to trust that etype won't be beyond + * the range of valid indexes. If so something is really + * wrong and we can probably just let things come + * crashing down. There is no need to eat another + * comparison in this performance critical code. + */ + packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet); + packet->numpkt++; + + /* Set up for the next packet */ + packet->rhqoff += packet->rsize; + if (packet->rhqoff >= packet->maxcnt) + packet->rhqoff = 0; + + ret = check_max_packet(packet, thread); + + packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + + packet->rcd->dd->rhf_offset; + packet->rhf = rhf_to_cpu(packet->rhf_addr); + + return ret; +} + +static inline void process_rcv_update(int last, struct hfi1_packet *packet) +{ + /* + * Update head regs etc., every 16 packets, if not last pkt, + * to help prevent rcvhdrq overflows, when many packets + * are processed and queue is nearly full. + * Don't request an interrupt for intermediate updates. 
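check_max_packet() above and the intermediate head update just below both test the packet count with a mask rather than a modulo, which only works because MAX_PKT_RECV (64) and the update interval (16) are powers of two. A quick standalone check of that equivalence:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int n;

	/* (n & (B - 1)) == 0 matches n % B == 0 only for power-of-two B */
	for (n = 0; n < 1024; n++) {
		assert(((n & (64 - 1)) == 0) == (n % 64 == 0));
		assert(((n & 0xf) == 0) == (n % 16 == 0));
	}
	printf("mask checks match modulo for all tested n\n");
	return 0;
}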
+ */ + if (!last && !(packet->numpkt & 0xf)) { + update_usrhead(packet->rcd, packet->rhqoff, packet->updegr, + packet->etail, 0, 0); + packet->updegr = 0; + } + packet->grh = NULL; +} + +static inline void finish_packet(struct hfi1_packet *packet) +{ + /* + * Nothing we need to free for the packet. + * + * The only thing we need to do is a final update and call for an + * interrupt + */ + update_usrhead(packet->rcd, packet->rcd->head, packet->updegr, + packet->etail, rcv_intr_dynamic, packet->numpkt); +} + +/* + * Handle receive interrupts when using the no dma rtail option. + */ +int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread) +{ + u32 seq; + int last = RCV_PKT_OK; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + seq = rhf_rcv_seq(packet.rhf); + if (seq != rcd->seq_cnt) { + last = RCV_PKT_DONE; + goto bail; + } + + prescan_rxq(rcd, &packet); + + while (last == RCV_PKT_OK) { + last = process_rcv_packet(&packet, thread); + seq = rhf_rcv_seq(packet.rhf); + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (seq != rcd->seq_cnt) + last = RCV_PKT_DONE; + process_rcv_update(last, &packet); + } + process_rcv_qp_work(&packet); + rcd->head = packet.rhqoff; +bail: + finish_packet(&packet); + return last; +} + +int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread) +{ + u32 hdrqtail; + int last = RCV_PKT_OK; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + hdrqtail = get_rcvhdrtail(rcd); + if (packet.rhqoff == hdrqtail) { + last = RCV_PKT_DONE; + goto bail; + } + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + + prescan_rxq(rcd, &packet); + + while (last == RCV_PKT_OK) { + last = process_rcv_packet(&packet, thread); + if (packet.rhqoff == hdrqtail) + last = RCV_PKT_DONE; + process_rcv_update(last, &packet); + } + process_rcv_qp_work(&packet); + rcd->head = packet.rhqoff; +bail: + finish_packet(&packet); + return last; +} + +static inline void set_nodma_rtail(struct hfi1_devdata *dd, u16 ctxt) +{ + struct hfi1_ctxtdata *rcd; + u16 i; + + /* + * For dynamically allocated kernel contexts (like vnic) switch + * interrupt handler only for that context. Otherwise, switch + * interrupt handler for all statically allocated kernel contexts. + */ + if (ctxt >= dd->first_dyn_alloc_ctxt) { + rcd = hfi1_rcd_get_by_index_safe(dd, ctxt); + if (rcd) { + rcd->do_interrupt = + &handle_receive_interrupt_nodma_rtail; + hfi1_rcd_put(rcd); + } + return; + } + + for (i = HFI1_CTRL_CTXT + 1; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + rcd->do_interrupt = + &handle_receive_interrupt_nodma_rtail; + hfi1_rcd_put(rcd); + } +} + +static inline void set_dma_rtail(struct hfi1_devdata *dd, u16 ctxt) +{ + struct hfi1_ctxtdata *rcd; + u16 i; + + /* + * For dynamically allocated kernel contexts (like vnic) switch + * interrupt handler only for that context. Otherwise, switch + * interrupt handler for all statically allocated kernel contexts. 
+ */ + if (ctxt >= dd->first_dyn_alloc_ctxt) { + rcd = hfi1_rcd_get_by_index_safe(dd, ctxt); + if (rcd) { + rcd->do_interrupt = + &handle_receive_interrupt_dma_rtail; + hfi1_rcd_put(rcd); + } + return; + } + + for (i = HFI1_CTRL_CTXT + 1; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + rcd->do_interrupt = + &handle_receive_interrupt_dma_rtail; + hfi1_rcd_put(rcd); + } +} + +void set_all_slowpath(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + u16 i; + + /* HFI1_CTRL_CTXT must always use the slow path interrupt handler */ + for (i = HFI1_CTRL_CTXT + 1; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (!rcd) + continue; + if (i < dd->first_dyn_alloc_ctxt || rcd->is_vnic) + rcd->do_interrupt = &handle_receive_interrupt; + + hfi1_rcd_put(rcd); + } +} + +static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet, + struct hfi1_devdata *dd) +{ + struct work_struct *lsaw = &rcd->ppd->linkstate_active_work; + u8 etype = rhf_rcv_type(packet->rhf); + u8 sc = SC15_PACKET; + + if (etype == RHF_RCV_TYPE_IB) { + struct ib_header *hdr = hfi1_get_msgheader(packet->rcd->dd, + packet->rhf_addr); + sc = hfi1_9B_get_sc5(hdr, packet->rhf); + } else if (etype == RHF_RCV_TYPE_BYPASS) { + struct hfi1_16b_header *hdr = hfi1_get_16B_header( + packet->rcd->dd, + packet->rhf_addr); + sc = hfi1_16B_get_sc(hdr); + } + if (sc != SC15_PACKET) { + int hwstate = driver_lstate(rcd->ppd); + + if (hwstate != IB_PORT_ACTIVE) { + dd_dev_info(dd, + "Unexpected link state %s\n", + opa_lstate_name(hwstate)); + return 0; + } + + queue_work(rcd->ppd->link_wq, lsaw); + return 1; + } + return 0; +} + +/* + * handle_receive_interrupt - receive a packet + * @rcd: the context + * + * Called from interrupt handler for errors or receive interrupt. + * This is the slow path interrupt handler. + */ +int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 hdrqtail; + int needset, last = RCV_PKT_OK; + struct hfi1_packet packet; + int skip_pkt = 0; + + /* Control context will always use the slow path interrupt handler */ + needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1; + + init_packet(rcd, &packet); + + if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + u32 seq = rhf_rcv_seq(packet.rhf); + + if (seq != rcd->seq_cnt) { + last = RCV_PKT_DONE; + goto bail; + } + hdrqtail = 0; + } else { + hdrqtail = get_rcvhdrtail(rcd); + if (packet.rhqoff == hdrqtail) { + last = RCV_PKT_DONE; + goto bail; + } + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + + /* + * Control context can potentially receive an invalid + * rhf. Drop such packets. 
+ */ + if (rcd->ctxt == HFI1_CTRL_CTXT) { + u32 seq = rhf_rcv_seq(packet.rhf); + + if (seq != rcd->seq_cnt) + skip_pkt = 1; + } + } + + prescan_rxq(rcd, &packet); + + while (last == RCV_PKT_OK) { + if (unlikely(dd->do_drop && + atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) == + DROP_PACKET_ON)) { + dd->do_drop = 0; + + /* On to the next packet */ + packet.rhqoff += packet.rsize; + packet.rhf_addr = (__le32 *)rcd->rcvhdrq + + packet.rhqoff + + dd->rhf_offset; + packet.rhf = rhf_to_cpu(packet.rhf_addr); + + } else if (skip_pkt) { + last = skip_rcv_packet(&packet, thread); + skip_pkt = 0; + } else { + /* Auto activate link on non-SC15 packet receive */ + if (unlikely(rcd->ppd->host_link_state == + HLS_UP_ARMED) && + set_armed_to_active(rcd, &packet, dd)) + goto bail; + last = process_rcv_packet(&packet, thread); + } + + if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + u32 seq = rhf_rcv_seq(packet.rhf); + + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (seq != rcd->seq_cnt) + last = RCV_PKT_DONE; + if (needset) { + dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n"); + set_nodma_rtail(dd, rcd->ctxt); + needset = 0; + } + } else { + if (packet.rhqoff == hdrqtail) + last = RCV_PKT_DONE; + /* + * Control context can potentially receive an invalid + * rhf. Drop such packets. + */ + if (rcd->ctxt == HFI1_CTRL_CTXT) { + u32 seq = rhf_rcv_seq(packet.rhf); + + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (!last && (seq != rcd->seq_cnt)) + skip_pkt = 1; + } + + if (needset) { + dd_dev_info(dd, + "Switching to DMA_RTAIL\n"); + set_dma_rtail(dd, rcd->ctxt); + needset = 0; + } + } + + process_rcv_update(last, &packet); + } + + process_rcv_qp_work(&packet); + rcd->head = packet.rhqoff; + +bail: + /* + * Always write head at end, and setup rcv interrupt, even + * if no packets were processed. + */ + finish_packet(&packet); + return last; +} + +/* + * We may discover in the interrupt that the hardware link state has + * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet), + * and we need to update the driver's notion of the link state. We cannot + * run set_link_state from interrupt context, so we queue this function on + * a workqueue. + * + * We delay the regular interrupt processing until after the state changes + * so that the link will be in the correct state by the time any application + * we wake up attempts to send a reply to any message it received. + * (Subsequent receive interrupts may possibly force the wakeup before we + * update the link state.) + * + * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup invokes + * dd->f_cleanup(dd) to disable the interrupt handler and flush workqueues, + * so we're safe from use-after-free of the rcd. + */ +void receive_interrupt_work(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + linkstate_active_work); + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_ctxtdata *rcd; + u16 i; + + /* Received non-SC15 packet implies neighbor_normal */ + ppd->neighbor_normal = 1; + set_link_state(ppd, HLS_UP_ACTIVE); + + /* + * Interrupt all statically allocated kernel contexts that could + * have had an interrupt during auto activation. + */ + for (i = HFI1_CTRL_CTXT; i < dd->first_dyn_alloc_ctxt; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (rcd) + force_recv_intr(rcd); + hfi1_rcd_put(rcd); + } +} + +/* + * Convert a given MTU size to the on-wire MAD packet enumeration. + * Return -1 if the size is invalid. 
+ */ +int mtu_to_enum(u32 mtu, int default_if_bad) +{ + switch (mtu) { + case 0: return OPA_MTU_0; + case 256: return OPA_MTU_256; + case 512: return OPA_MTU_512; + case 1024: return OPA_MTU_1024; + case 2048: return OPA_MTU_2048; + case 4096: return OPA_MTU_4096; + case 8192: return OPA_MTU_8192; + case 10240: return OPA_MTU_10240; + } + return default_if_bad; +} + +u16 enum_to_mtu(int mtu) +{ + switch (mtu) { + case OPA_MTU_0: return 0; + case OPA_MTU_256: return 256; + case OPA_MTU_512: return 512; + case OPA_MTU_1024: return 1024; + case OPA_MTU_2048: return 2048; + case OPA_MTU_4096: return 4096; + case OPA_MTU_8192: return 8192; + case OPA_MTU_10240: return 10240; + default: return 0xffff; + } +} + +/* + * set_mtu - set the MTU + * @ppd: the per port data + * + * We can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. We do not deal with what happens + * to programs that are already running when the size changes. + */ +int set_mtu(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + int i, drain, ret = 0, is_up = 0; + + ppd->ibmtu = 0; + for (i = 0; i < ppd->vls_supported; i++) + if (ppd->ibmtu < dd->vld[i].mtu) + ppd->ibmtu = dd->vld[i].mtu; + ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd); + + mutex_lock(&ppd->hls_lock); + if (ppd->host_link_state == HLS_UP_INIT || + ppd->host_link_state == HLS_UP_ARMED || + ppd->host_link_state == HLS_UP_ACTIVE) + is_up = 1; + + drain = !is_ax(dd) && is_up; + + if (drain) + /* + * MTU is specified per-VL. To ensure that no packet gets + * stuck (due, e.g., to the MTU for the packet's VL being + * reduced), empty the per-VL FIFOs before adjusting MTU. + */ + ret = stop_drain_data_vls(dd); + + if (ret) { + dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n", + __func__); + goto err; + } + + hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0); + + if (drain) + open_fill_data_vls(dd); /* reopen all VLs */ + +err: + mutex_unlock(&ppd->hls_lock); + + return ret; +} + +int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc) +{ + struct hfi1_devdata *dd = ppd->dd; + + ppd->lid = lid; + ppd->lmc = lmc; + hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0); + + dd_dev_info(dd, "port %u: got a lid: 0x%x\n", ppd->port, lid); + + return 0; +} + +void shutdown_led_override(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + if (atomic_read(&ppd->led_override_timer_active)) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + /* Ensure the atomic_set is visible to all CPUs */ + smp_wmb(); + } + + /* Hand control of the LED to the DC for normal operation */ + write_csr(dd, DCC_CFG_LED_CNTRL, 0); +} + +static void run_led_override(struct timer_list *t) +{ + struct hfi1_pportdata *ppd = from_timer(ppd, t, led_override_timer); + struct hfi1_devdata *dd = ppd->dd; + unsigned long timeout; + int phase_idx; + + if (!(dd->flags & HFI1_INITTED)) + return; + + phase_idx = ppd->led_override_phase & 1; + + setextled(dd, phase_idx); + + timeout = ppd->led_override_vals[phase_idx]; + + /* Set up for next phase */ + ppd->led_override_phase = !ppd->led_override_phase; + + mod_timer(&ppd->led_override_timer, jiffies + timeout); +} + +/* + * To have the LED blink in a particular pattern, provide timeon and timeoff + * in milliseconds. 
+ * To turn off custom blinking and return to normal operation, use + * shutdown_led_override() + */ +void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, + unsigned int timeoff) +{ + if (!(ppd->dd->flags & HFI1_INITTED)) + return; + + /* Convert to jiffies for direct use in timer */ + ppd->led_override_vals[0] = msecs_to_jiffies(timeoff); + ppd->led_override_vals[1] = msecs_to_jiffies(timeon); + + /* Arbitrarily start from LED on phase */ + ppd->led_override_phase = 1; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the handler will be called soon to look at our request. + */ + if (!timer_pending(&ppd->led_override_timer)) { + timer_setup(&ppd->led_override_timer, run_led_override, 0); + ppd->led_override_timer.expires = jiffies + 1; + add_timer(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 1); + /* Ensure the atomic_set is visible to all CPUs */ + smp_wmb(); + } +} + +/** + * hfi1_reset_device - reset the chip if possible + * @unit: the device to reset + * + * Whether or not reset is successful, we attempt to re-initialize the chip + * (that is, much like a driver unload/reload). We clear the INITTED flag + * so that the various entry points will fail until we reinitialize. For + * now, we only allow this if no user contexts are open that use chip resources + */ +int hfi1_reset_device(int unit) +{ + int ret; + struct hfi1_devdata *dd = hfi1_lookup(unit); + struct hfi1_pportdata *ppd; + int pidx; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + dd_dev_info(dd, "Reset on unit %u requested\n", unit); + + if (!dd->kregbase1 || !(dd->flags & HFI1_PRESENT)) { + dd_dev_info(dd, + "Invalid unit number %u or not initialized or not present\n", + unit); + ret = -ENXIO; + goto bail; + } + + /* If there are any user/vnic contexts, we cannot reset */ + mutex_lock(&hfi1_mutex); + if (dd->rcd) + if (hfi1_stats.sps_ctxts) { + mutex_unlock(&hfi1_mutex); + ret = -EBUSY; + goto bail; + } + mutex_unlock(&hfi1_mutex); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + shutdown_led_override(ppd); + } + if (dd->flags & HFI1_HAS_SEND_DMA) + sdma_exit(dd); + + hfi1_reset_cpu_counters(dd); + + ret = hfi1_init(dd, 1); + + if (ret) + dd_dev_err(dd, + "Reinitialize unit %u after reset failed with %d\n", + unit, ret); + else + dd_dev_info(dd, "Reinitialized unit %u after resetting\n", + unit); + +bail: + return ret; +} + +static inline void hfi1_setup_ib_header(struct hfi1_packet *packet) +{ + packet->hdr = (struct hfi1_ib_message_header *) + hfi1_get_msgheader(packet->rcd->dd, + packet->rhf_addr); + packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; +} + +static int hfi1_bypass_ingress_pkt_check(struct hfi1_packet *packet) +{ + struct hfi1_pportdata *ppd = packet->rcd->ppd; + + /* slid and dlid cannot be 0 */ + if ((!packet->slid) || (!packet->dlid)) + return -EINVAL; + + /* Compare port lid with incoming packet dlid */ + if ((!(hfi1_is_16B_mcast(packet->dlid))) && + (packet->dlid != + opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))) { + if (packet->dlid != ppd->lid) + return -EINVAL; + } + + /* No multicast packets with SC15 */ + if ((hfi1_is_16B_mcast(packet->dlid)) && (packet->sc == 0xF)) + return -EINVAL; + + /* Packets with permissive DLID always on SC15 */ + if ((packet->dlid == opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), + 16B)) && + (packet->sc != 0xF)) + return -EINVAL; + + return 0; +} + +static int hfi1_setup_9B_packet(struct hfi1_packet *packet) +{ + struct 
hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct ib_header *hdr; + u8 lnh; + + hfi1_setup_ib_header(packet); + hdr = packet->hdr; + + lnh = ib_get_lnh(hdr); + if (lnh == HFI1_LRH_BTH) { + packet->ohdr = &hdr->u.oth; + packet->grh = NULL; + } else if (lnh == HFI1_LRH_GRH) { + u32 vtf; + + packet->ohdr = &hdr->u.l.oth; + packet->grh = &hdr->u.l.grh; + if (packet->grh->next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(packet->grh->version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else { + goto drop; + } + + /* Query commonly used fields from packet header */ + packet->payload = packet->ebuf; + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->slid = ib_get_slid(hdr); + packet->dlid = ib_get_dlid(hdr); + if (unlikely((packet->dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (packet->dlid != be16_to_cpu(IB_LID_PERMISSIVE)))) + packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) - + be16_to_cpu(IB_MULTICAST_LID_BASE); + packet->sl = ib_get_sl(hdr); + packet->sc = hfi1_9B_get_sc5(hdr, packet->rhf); + packet->pad = ib_bth_get_pad(packet->ohdr); + packet->extra_byte = 0; + packet->pkey = ib_bth_get_pkey(packet->ohdr); + packet->migrated = ib_bth_is_migration(packet->ohdr); + + return 0; +drop: + ibp->rvp.n_pkt_drops++; + return -EINVAL; +} + +static int hfi1_setup_bypass_packet(struct hfi1_packet *packet) +{ + /* + * Bypass packets have a different header/payload split + * compared to an IB packet. + * Current split is set such that 16 bytes of the actual + * header is in the header buffer and the remining is in + * the eager buffer. We chose 16 since hfi1 driver only + * supports 16B bypass packets and we will be able to + * receive the entire LRH with such a split. + */ + + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + u8 l4; + u8 grh_len; + + packet->hdr = (struct hfi1_16b_header *) + hfi1_get_16B_header(packet->rcd->dd, + packet->rhf_addr); + packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; + + l4 = hfi1_16B_get_l4(packet->hdr); + if (l4 == OPA_16B_L4_IB_LOCAL) { + grh_len = 0; + packet->ohdr = packet->ebuf; + packet->grh = NULL; + } else if (l4 == OPA_16B_L4_IB_GLOBAL) { + u32 vtf; + + grh_len = sizeof(struct ib_grh); + packet->ohdr = packet->ebuf + grh_len; + packet->grh = packet->ebuf; + if (packet->grh->next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(packet->grh->version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else { + goto drop; + } + + /* Query commonly used fields from packet header */ + packet->opcode = ib_bth_get_opcode(packet->ohdr); + /* hdr_len_by_opcode already has an IB LRH factored in */ + packet->hlen = hdr_len_by_opcode[packet->opcode] + + (LRH_16B_BYTES - LRH_9B_BYTES) + grh_len; + packet->payload = packet->ebuf + packet->hlen - LRH_16B_BYTES; + packet->slid = hfi1_16B_get_slid(packet->hdr); + packet->dlid = hfi1_16B_get_dlid(packet->hdr); + if (unlikely(hfi1_is_16B_mcast(packet->dlid))) + packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) - + opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), + 16B); + packet->sc = hfi1_16B_get_sc(packet->hdr); + packet->sl = ibp->sc_to_sl[packet->sc]; + packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); + packet->extra_byte = SIZE_OF_LT; + packet->pkey = hfi1_16B_get_pkey(packet->hdr); + packet->migrated = opa_bth_is_migration(packet->ohdr); + + if (hfi1_bypass_ingress_pkt_check(packet)) + goto drop; + + return 0; +drop: + 
hfi1_cdbg(PKT, "%s: packet dropped\n", __func__); + ibp->rvp.n_pkt_drops++; + return -EINVAL; +} + +void handle_eflags(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + u32 rte = rhf_rcv_type_err(packet->rhf); + + rcv_hdrerr(rcd, rcd->ppd, packet); + if (rhf_err_flags(packet->rhf)) + dd_dev_err(rcd->dd, + "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n", + rcd->ctxt, packet->rhf, + packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "", + packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "", + packet->rhf & RHF_DC_ERR ? "dc " : "", + packet->rhf & RHF_TID_ERR ? "tid " : "", + packet->rhf & RHF_LEN_ERR ? "len " : "", + packet->rhf & RHF_ECC_ERR ? "ecc " : "", + packet->rhf & RHF_VCRC_ERR ? "vcrc " : "", + packet->rhf & RHF_ICRC_ERR ? "icrc " : "", + rte); +} + +/* + * The following functions are called by the interrupt handler. They are type + * specific handlers for each packet type. + */ +int process_receive_ib(struct hfi1_packet *packet) +{ + if (unlikely(hfi1_dbg_fault_packet(packet))) + return RHF_RCV_CONTINUE; + + if (hfi1_setup_9B_packet(packet)) + return RHF_RCV_CONTINUE; + + trace_hfi1_rcvhdr(packet); + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return RHF_RCV_CONTINUE; + } + + hfi1_ib_rcv(packet); + return RHF_RCV_CONTINUE; +} + +static inline bool hfi1_is_vnic_packet(struct hfi1_packet *packet) +{ + /* Packet received in VNIC context via RSM */ + if (packet->rcd->is_vnic) + return true; + + if ((hfi1_16B_get_l2(packet->ebuf) == OPA_16B_L2_TYPE) && + (hfi1_16B_get_l4(packet->ebuf) == OPA_16B_L4_ETHR)) + return true; + + return false; +} + +int process_receive_bypass(struct hfi1_packet *packet) +{ + struct hfi1_devdata *dd = packet->rcd->dd; + + if (hfi1_is_vnic_packet(packet)) { + hfi1_vnic_bypass_rcv(packet); + return RHF_RCV_CONTINUE; + } + + if (hfi1_setup_bypass_packet(packet)) + return RHF_RCV_CONTINUE; + + trace_hfi1_rcvhdr(packet); + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return RHF_RCV_CONTINUE; + } + + if (hfi1_16B_get_l2(packet->hdr) == 0x2) { + hfi1_16B_rcv(packet); + } else { + dd_dev_err(dd, + "Bypass packets other than 16B are not supported in normal operation. Dropping\n"); + incr_cntr64(&dd->sw_rcv_bypass_packet_errors); + if (!(dd->err_info_rcvport.status_and_code & + OPA_EI_STATUS_SMASK)) { + u64 *flits = packet->ebuf; + + if (flits && !(packet->rhf & RHF_LEN_ERR)) { + dd->err_info_rcvport.packet_flit1 = flits[0]; + dd->err_info_rcvport.packet_flit2 = + packet->tlen > sizeof(flits[0]) ? + flits[1] : 0; + } + dd->err_info_rcvport.status_and_code |= + (OPA_EI_STATUS_SMASK | BAD_L2_ERR); + } + } + return RHF_RCV_CONTINUE; +} + +int process_receive_error(struct hfi1_packet *packet) +{ + /* KHdrHCRCErr -- KDETH packet with a bad HCRC */ + if (unlikely( + hfi1_dbg_fault_suppress_err(&packet->rcd->dd->verbs_dev) && + rhf_rcv_type_err(packet->rhf) == 3)) + return RHF_RCV_CONTINUE; + + hfi1_setup_ib_header(packet); + handle_eflags(packet); + + if (unlikely(rhf_err_flags(packet->rhf))) + dd_dev_err(packet->rcd->dd, + "Unhandled error packet received. Dropping.\n"); + + return RHF_RCV_CONTINUE; +} + +int kdeth_process_expected(struct hfi1_packet *packet) +{ + if (unlikely(hfi1_dbg_fault_packet(packet))) + return RHF_RCV_CONTINUE; + + hfi1_setup_ib_header(packet); + if (unlikely(rhf_err_flags(packet->rhf))) + handle_eflags(packet); + + dd_dev_err(packet->rcd->dd, + "Unhandled expected packet received. 
Dropping.\n"); + return RHF_RCV_CONTINUE; +} + +int kdeth_process_eager(struct hfi1_packet *packet) +{ + hfi1_setup_ib_header(packet); + if (unlikely(rhf_err_flags(packet->rhf))) + handle_eflags(packet); + if (unlikely(hfi1_dbg_fault_packet(packet))) + return RHF_RCV_CONTINUE; + + dd_dev_err(packet->rcd->dd, + "Unhandled eager packet received. Dropping.\n"); + return RHF_RCV_CONTINUE; +} + +int process_receive_invalid(struct hfi1_packet *packet) +{ + dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n", + rhf_rcv_type(packet->rhf)); + return RHF_RCV_CONTINUE; +} + +void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd) +{ + struct hfi1_packet packet; + struct ps_mdata mdata; + + seq_printf(s, "Rcd %u: RcvHdr cnt %u entsize %u %s head %llu tail %llu\n", + rcd->ctxt, rcd->rcvhdrq_cnt, rcd->rcvhdrqentsize, + HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? + "dma_rtail" : "nodma_rtail", + read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD) & + RCV_HDR_HEAD_HEAD_MASK, + read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL)); + + init_packet(rcd, &packet); + init_ps_mdata(&mdata, &packet); + + while (1) { + struct hfi1_devdata *dd = rcd->dd; + __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head + + dd->rhf_offset; + struct ib_header *hdr; + u64 rhf = rhf_to_cpu(rhf_addr); + u32 etype = rhf_rcv_type(rhf), qpn; + u8 opcode; + u32 psn; + u8 lnh; + + if (ps_done(&mdata, rhf, rcd)) + break; + + if (ps_skip(&mdata, rhf, rcd)) + goto next; + + if (etype > RHF_RCV_TYPE_IB) + goto next; + + packet.hdr = hfi1_get_msgheader(dd, rhf_addr); + hdr = packet.hdr; + + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + + if (lnh == HFI1_LRH_BTH) + packet.ohdr = &hdr->u.oth; + else if (lnh == HFI1_LRH_GRH) + packet.ohdr = &hdr->u.l.oth; + else + goto next; /* just in case */ + + opcode = (be32_to_cpu(packet.ohdr->bth[0]) >> 24); + qpn = be32_to_cpu(packet.ohdr->bth[1]) & RVT_QPN_MASK; + psn = mask_psn(be32_to_cpu(packet.ohdr->bth[2])); + + seq_printf(s, "\tEnt %u: opcode 0x%x, qpn 0x%x, psn 0x%x\n", + mdata.ps_head, opcode, qpn, psn); +next: + update_ps_mdata(&mdata, rcd); + } +} diff --git a/drivers/infiniband/hw/hfi1/efivar.c b/drivers/infiniband/hw/hfi1/efivar.c new file mode 100644 index 0000000..d106d23 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/efivar.c @@ -0,0 +1,182 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/ctype.h>
+#include "efivar.h"
+
+/* GUID for HFI1 variables in EFI */
+#define HFI1_EFIVAR_GUID EFI_GUID(0xc50a953e, 0xa8b2, 0x42a6, \
+		0xbf, 0x89, 0xd3, 0x33, 0xa6, 0xe9, 0xe6, 0xd4)
+/* largest EFI data size we expect */
+#define EFI_DATA_SIZE 4096
+
+/*
+ * Read the named EFI variable. Return the size of the actual data in *size
+ * and a kmalloc'ed buffer in *return_data. The caller must free the
+ * data. It is guaranteed that *return_data will be NULL and *size = 0
+ * if this routine fails.
+ *
+ * Return 0 on success, -errno on failure.
+ */
+static int read_efi_var(const char *name, unsigned long *size,
+			void **return_data)
+{
+	efi_status_t status;
+	efi_char16_t *uni_name;
+	efi_guid_t guid;
+	unsigned long temp_size;
+	void *temp_buffer;
+	void *data;
+	int i;
+	int ret;
+
+	/* set failure return values */
+	*size = 0;
+	*return_data = NULL;
+
+	if (!efi_enabled(EFI_RUNTIME_SERVICES))
+		return -EOPNOTSUPP;
+
+	uni_name = kcalloc(strlen(name) + 1, sizeof(efi_char16_t), GFP_KERNEL);
+	temp_buffer = kzalloc(EFI_DATA_SIZE, GFP_KERNEL);
+
+	if (!uni_name || !temp_buffer) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	/* input: the size of the buffer */
+	temp_size = EFI_DATA_SIZE;
+
+	/* convert ASCII to unicode - it is a 1:1 mapping */
+	for (i = 0; name[i]; i++)
+		uni_name[i] = name[i];
+
+	/* need a variable for our GUID */
+	guid = HFI1_EFIVAR_GUID;
+
+	/* call into EFI runtime services */
+	status = efi.get_variable(
+			uni_name,
+			&guid,
+			NULL,
+			&temp_size,
+			temp_buffer);
+
+	/*
+	 * It would be nice to call efi_status_to_err() here, but that
+	 * is in the EFIVAR_FS code and may not be compiled in.
+	 * However, even that is insufficient since it does not cover
+	 * EFI_BUFFER_TOO_SMALL which could be an important return.
+	 * For now, just split out success or not found.
+	 */
+	ret = status == EFI_SUCCESS ? 0 :
+	      status == EFI_NOT_FOUND ? -ENOENT :
+	      -EINVAL;
+	if (ret)
+		goto fail;
+
+	/*
+	 * We have successfully read the EFI variable into our
+	 * temporary buffer. Now allocate a correctly sized
+	 * buffer.
+	 */
+	data = kmemdup(temp_buffer, temp_size, GFP_KERNEL);
+	if (!data) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	*size = temp_size;
+	*return_data = data;
+
+fail:
+	kfree(uni_name);
+	kfree(temp_buffer);
+
+	return ret;
+}
+
+/*
+ * Read an HFI1 EFI variable of the form:
+ *	<PCIe address>-<kind>
+ * Return a kmalloc'ed array and size of the data.
+ *
+ * Returns 0 on success, -errno on failure.
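+ *
+ * Illustrative example (hypothetical device address and kind): for a
+ * device at PCI address 0000:af:00.0 and a kind of "platform", the name
+ * built below is "0000:af:00.0-platform"; if that lookup fails, the
+ * uppercase form "0000:AF:00.0-platform" is tried.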
+ */
+int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
+		      unsigned long *size, void **return_data)
+{
+	char prefix_name[64];
+	char name[64];
+	int result;
+	int i;
+
+	/* create a common prefix */
+	snprintf(prefix_name, sizeof(prefix_name), "%04x:%02x:%02x.%x",
+		 pci_domain_nr(dd->pcidev->bus),
+		 dd->pcidev->bus->number,
+		 PCI_SLOT(dd->pcidev->devfn),
+		 PCI_FUNC(dd->pcidev->devfn));
+	snprintf(name, sizeof(name), "%s-%s", prefix_name, kind);
+	result = read_efi_var(name, size, return_data);
+
+	/*
+	 * If reading the lowercase EFI variable fails, read the uppercase
+	 * variable.
+	 */
+	if (result) {
+		/* Converting to uppercase */
+		for (i = 0; prefix_name[i]; i++)
+			if (isalpha(prefix_name[i]))
+				prefix_name[i] = toupper(prefix_name[i]);
+		snprintf(name, sizeof(name), "%s-%s", prefix_name, kind);
+		result = read_efi_var(name, size, return_data);
+	}
+
+	return result;
+}
diff --git a/drivers/infiniband/hw/hfi1/efivar.h b/drivers/infiniband/hw/hfi1/efivar.h
new file mode 100644
index 0000000..94e9e70
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/efivar.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + */ +#ifndef _HFI1_EFIVAR_H +#define _HFI1_EFIVAR_H + +#include + +#include "hfi.h" + +int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind, + unsigned long *size, void **return_data); + +#endif /* _HFI1_EFIVAR_H */ diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c new file mode 100644 index 0000000..1613af1 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/eprom.c @@ -0,0 +1,491 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include "hfi.h" +#include "common.h" +#include "eprom.h" + +/* + * The EPROM is logically divided into three partitions: + * partition 0: the first 128K, visible from PCI ROM BAR + * partition 1: 4K config file (sector size) + * partition 2: the rest + */ +#define P0_SIZE (128 * 1024) +#define P1_SIZE (4 * 1024) +#define P1_START P0_SIZE +#define P2_START (P0_SIZE + P1_SIZE) + +/* controller page size, in bytes */ +#define EP_PAGE_SIZE 256 +#define EP_PAGE_MASK (EP_PAGE_SIZE - 1) +#define EP_PAGE_DWORDS (EP_PAGE_SIZE / sizeof(u32)) + +/* controller commands */ +#define CMD_SHIFT 24 +#define CMD_NOP (0) +#define CMD_READ_DATA(addr) ((0x03 << CMD_SHIFT) | addr) +#define CMD_RELEASE_POWERDOWN_NOID ((0xab << CMD_SHIFT)) + +/* controller interface speeds */ +#define EP_SPEED_FULL 0x2 /* full speed */ + +/* + * How long to wait for the EPROM to become available, in ms. 
+ * The spec 32 Mb EPROM takes around 40s to erase then write. + * Double it for safety. + */ +#define EPROM_TIMEOUT 80000 /* ms */ + +/* + * Read a 256 byte (64 dword) EPROM page. + * All callers have verified the offset is at a page boundary. + */ +static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result) +{ + int i; + + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset)); + for (i = 0; i < EP_PAGE_DWORDS; i++) + result[i] = (u32)read_csr(dd, ASIC_EEP_DATA); + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */ +} + +/* + * Read length bytes starting at offset from the start of the EPROM. + */ +static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, void *dest) +{ + u32 buffer[EP_PAGE_DWORDS]; + u32 end; + u32 start_offset; + u32 read_start; + u32 bytes; + + if (len == 0) + return 0; + + end = start + len; + + /* + * Make sure the read range is not outside of the controller read + * command address range. Note that '>' is correct below - the end + * of the range is OK if it stops at the limit, but no higher. + */ + if (end > (1 << CMD_SHIFT)) + return -EINVAL; + + /* read the first partial page */ + start_offset = start & EP_PAGE_MASK; + if (start_offset) { + /* partial starting page */ + + /* align and read the page that contains the start */ + read_start = start & ~EP_PAGE_MASK; + read_page(dd, read_start, buffer); + + /* the rest of the page is available data */ + bytes = EP_PAGE_SIZE - start_offset; + + if (len <= bytes) { + /* end is within this page */ + memcpy(dest, (u8 *)buffer + start_offset, len); + return 0; + } + + memcpy(dest, (u8 *)buffer + start_offset, bytes); + + start += bytes; + len -= bytes; + dest += bytes; + } + /* start is now page aligned */ + + /* read whole pages */ + while (len >= EP_PAGE_SIZE) { + read_page(dd, start, buffer); + memcpy(dest, buffer, EP_PAGE_SIZE); + + start += EP_PAGE_SIZE; + len -= EP_PAGE_SIZE; + dest += EP_PAGE_SIZE; + } + + /* read the last partial page */ + if (len) { + read_page(dd, start, buffer); + memcpy(dest, buffer, len); + } + + return 0; +} + +/* + * Initialize the EPROM handler. + */ +int eprom_init(struct hfi1_devdata *dd) +{ + int ret = 0; + + /* only the discrete chip has an EPROM */ + if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0) + return 0; + + /* + * It is OK if both HFIs reset the EPROM as long as they don't + * do it at the same time. 
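+ * acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT) below provides
+ * that mutual exclusion; EPROM_TIMEOUT is sized for a worst-case
+ * erase/write cycle, doubled for safety.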
+ */ + ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT); + if (ret) { + dd_dev_err(dd, + "%s: unable to acquire EPROM resource, no EPROM support\n", + __func__); + goto done_asic; + } + + /* reset EPROM to be sure it is in a good state */ + + /* set reset */ + write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK); + /* clear reset, set speed */ + write_csr(dd, ASIC_EEP_CTL_STAT, + EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT); + + /* wake the device with command "release powerdown NoID" */ + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID); + + dd->eprom_available = true; + release_chip_resource(dd, CR_EPROM); +done_asic: + return ret; +} + +/* magic character sequence that begins an image */ +#define IMAGE_START_MAGIC "APO=" + +/* magic character sequence that might trail an image */ +#define IMAGE_TRAIL_MAGIC "egamiAPO" + +/* EPROM file types */ +#define HFI1_EFT_PLATFORM_CONFIG 2 + +/* segment size - 128 KiB */ +#define SEG_SIZE (128 * 1024) + +struct hfi1_eprom_footer { + u32 oprom_size; /* size of the oprom, in bytes */ + u16 num_table_entries; + u16 version; /* version of this footer */ + u32 magic; /* must be last */ +}; + +struct hfi1_eprom_table_entry { + u32 type; /* file type */ + u32 offset; /* file offset from start of EPROM */ + u32 size; /* file size, in bytes */ +}; + +/* + * Calculate the max number of table entries that will fit within a directory + * buffer of size 'dir_size'. + */ +#define MAX_TABLE_ENTRIES(dir_size) \ + (((dir_size) - sizeof(struct hfi1_eprom_footer)) / \ + sizeof(struct hfi1_eprom_table_entry)) + +#define DIRECTORY_SIZE(n) (sizeof(struct hfi1_eprom_footer) + \ + (sizeof(struct hfi1_eprom_table_entry) * (n))) + +#define MAGIC4(a, b, c, d) ((d) << 24 | (c) << 16 | (b) << 8 | (a)) +#define FOOTER_MAGIC MAGIC4('e', 'p', 'r', 'm') +#define FOOTER_VERSION 1 + +/* + * Read all of partition 1. The actual file is at the front. Adjust + * the returned size if a trailing image magic is found. + */ +static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, + u32 *size) +{ + void *buffer; + void *p; + u32 length; + int ret; + + buffer = kmalloc(P1_SIZE, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + ret = read_length(dd, P1_START, P1_SIZE, buffer); + if (ret) { + kfree(buffer); + return ret; + } + + /* config partition is valid only if it starts with IMAGE_START_MAGIC */ + if (memcmp(buffer, IMAGE_START_MAGIC, strlen(IMAGE_START_MAGIC))) { + kfree(buffer); + return -ENOENT; + } + + /* scan for image magic that may trail the actual data */ + p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); + if (p) + length = p - buffer; + else + length = P1_SIZE; + + *data = buffer; + *size = length; + return 0; +} + +/* + * The segment magic has been checked. There is a footer and table of + * contents present. + * + * directory is a u32 aligned buffer of size EP_PAGE_SIZE. 
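+ *
+ * A rough sketch of the segment layout implied by the structures above
+ * (illustrative, not normative): the hfi1_eprom_footer occupies the very
+ * last bytes of the segment with its magic as the final dword, the array
+ * of hfi1_eprom_table_entry records sits immediately before the footer,
+ * and the space in front of that holds the oprom and the files the table
+ * describes.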
+ */ +static int read_segment_platform_config(struct hfi1_devdata *dd, + void *directory, void **data, u32 *size) +{ + struct hfi1_eprom_footer *footer; + struct hfi1_eprom_table_entry *table; + struct hfi1_eprom_table_entry *entry; + void *buffer = NULL; + void *table_buffer = NULL; + int ret, i; + u32 directory_size; + u32 seg_base, seg_offset; + u32 bytes_available, ncopied, to_copy; + + /* the footer is at the end of the directory */ + footer = (struct hfi1_eprom_footer *) + (directory + EP_PAGE_SIZE - sizeof(*footer)); + + /* make sure the structure version is supported */ + if (footer->version != FOOTER_VERSION) + return -EINVAL; + + /* oprom size cannot be larger than a segment */ + if (footer->oprom_size >= SEG_SIZE) + return -EINVAL; + + /* the file table must fit in a segment with the oprom */ + if (footer->num_table_entries > + MAX_TABLE_ENTRIES(SEG_SIZE - footer->oprom_size)) + return -EINVAL; + + /* find the file table start, which precedes the footer */ + directory_size = DIRECTORY_SIZE(footer->num_table_entries); + if (directory_size <= EP_PAGE_SIZE) { + /* the file table fits into the directory buffer handed in */ + table = (struct hfi1_eprom_table_entry *) + (directory + EP_PAGE_SIZE - directory_size); + } else { + /* need to allocate and read more */ + table_buffer = kmalloc(directory_size, GFP_KERNEL); + if (!table_buffer) + return -ENOMEM; + ret = read_length(dd, SEG_SIZE - directory_size, + directory_size, table_buffer); + if (ret) + goto done; + table = table_buffer; + } + + /* look for the platform configuration file in the table */ + for (entry = NULL, i = 0; i < footer->num_table_entries; i++) { + if (table[i].type == HFI1_EFT_PLATFORM_CONFIG) { + entry = &table[i]; + break; + } + } + if (!entry) { + ret = -ENOENT; + goto done; + } + + /* + * Sanity check on the configuration file size - it should never + * be larger than 4 KiB. + */ + if (entry->size > (4 * 1024)) { + dd_dev_err(dd, "Bad configuration file size 0x%x\n", + entry->size); + ret = -EINVAL; + goto done; + } + + /* check for bogus offset and size that wrap when added together */ + if (entry->offset + entry->size < entry->offset) { + dd_dev_err(dd, + "Bad configuration file start + size 0x%x+0x%x\n", + entry->offset, entry->size); + ret = -EINVAL; + goto done; + } + + /* allocate the buffer to return */ + buffer = kmalloc(entry->size, GFP_KERNEL); + if (!buffer) { + ret = -ENOMEM; + goto done; + } + + /* + * Extract the file by looping over segments until it is fully read. + */ + seg_offset = entry->offset % SEG_SIZE; + seg_base = entry->offset - seg_offset; + ncopied = 0; + while (ncopied < entry->size) { + /* calculate data bytes available in this segment */ + + /* start with the bytes from the current offset to the end */ + bytes_available = SEG_SIZE - seg_offset; + /* subtract off footer and table from segment 0 */ + if (seg_base == 0) { + /* + * Sanity check: should not have a starting point + * at or within the directory. + */ + if (bytes_available <= directory_size) { + dd_dev_err(dd, + "Bad configuration file - offset 0x%x within footer+table\n", + entry->offset); + ret = -EINVAL; + goto done; + } + bytes_available -= directory_size; + } + + /* calculate bytes wanted */ + to_copy = entry->size - ncopied; + + /* max out at the available bytes in this segment */ + if (to_copy > bytes_available) + to_copy = bytes_available; + + /* + * Read from the EPROM. + * + * The sanity check for entry->offset is done in read_length(). 
+ * The EPROM offset is validated against what the hardware + * addressing supports. In addition, if the offset is larger + * than the actual EPROM, it silently wraps. It will work + * fine, though the reader may not get what they expected + * from the EPROM. + */ + ret = read_length(dd, seg_base + seg_offset, to_copy, + buffer + ncopied); + if (ret) + goto done; + + ncopied += to_copy; + + /* set up for next segment */ + seg_offset = footer->oprom_size; + seg_base += SEG_SIZE; + } + + /* success */ + ret = 0; + *data = buffer; + *size = entry->size; + +done: + kfree(table_buffer); + if (ret) + kfree(buffer); + return ret; +} + +/* + * Read the platform configuration file from the EPROM. + * + * On success, an allocated buffer containing the data and its size are + * returned. It is up to the caller to free this buffer. + * + * Return value: + * 0 - success + * -ENXIO - no EPROM is available + * -EBUSY - not able to acquire access to the EPROM + * -ENOENT - no recognizable file written + * -ENOMEM - buffer could not be allocated + * -EINVAL - invalid EPROM contentents found + */ +int eprom_read_platform_config(struct hfi1_devdata *dd, void **data, u32 *size) +{ + u32 directory[EP_PAGE_DWORDS]; /* aligned buffer */ + int ret; + + if (!dd->eprom_available) + return -ENXIO; + + ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT); + if (ret) + return -EBUSY; + + /* read the last page of the segment for the EPROM format magic */ + ret = read_length(dd, SEG_SIZE - EP_PAGE_SIZE, EP_PAGE_SIZE, directory); + if (ret) + goto done; + + /* last dword of the segment contains a magic value */ + if (directory[EP_PAGE_DWORDS - 1] == FOOTER_MAGIC) { + /* segment format */ + ret = read_segment_platform_config(dd, directory, data, size); + } else { + /* partition format */ + ret = read_partition_platform_config(dd, data, size); + } + +done: + release_chip_resource(dd, CR_EPROM); + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/eprom.h b/drivers/infiniband/hw/hfi1/eprom.h new file mode 100644 index 0000000..e774184 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/eprom.h @@ -0,0 +1,52 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +struct hfi1_devdata; + +int eprom_init(struct hfi1_devdata *dd); +int eprom_read_platform_config(struct hfi1_devdata *dd, void **buf_ret, + u32 *size_ret); diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.c b/drivers/infiniband/hw/hfi1/exp_rcv.c new file mode 100644 index 0000000..0af9167 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/exp_rcv.c @@ -0,0 +1,114 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include "exp_rcv.h" +#include "trace.h" + +/** + * exp_tid_group_init - initialize exp_tid_set + * @set - the set + */ +void hfi1_exp_tid_group_init(struct exp_tid_set *set) +{ + INIT_LIST_HEAD(&set->list); + set->count = 0; +} + +/** + * alloc_ctxt_rcv_groups - initialize expected receive groups + * @rcd - the context to add the groupings to + */ +int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 tidbase; + struct tid_group *grp; + int i; + + tidbase = rcd->expected_base; + for (i = 0; i < rcd->expected_count / + dd->rcv_entries.group_size; i++) { + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) + goto bail; + grp->size = dd->rcv_entries.group_size; + grp->base = tidbase; + tid_group_add_tail(grp, &rcd->tid_group_list); + tidbase += dd->rcv_entries.group_size; + } + + return 0; +bail: + hfi1_free_ctxt_rcv_groups(rcd); + return -ENOMEM; +} + +/** + * free_ctxt_rcv_groups - free expected receive groups + * @rcd - the context to free + * + * The routine dismantles the expect receive linked + * list and clears any tids associated with the receive + * context. + * + * This should only be called for kernel contexts and the + * a base user context. + */ +void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) +{ + struct tid_group *grp, *gptr; + + WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list)); + WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list)); + + list_for_each_entry_safe(grp, gptr, &rcd->tid_group_list.list, list) { + tid_group_remove(grp, &rcd->tid_group_list); + kfree(grp); + } + + hfi1_clear_tids(rcd); +} diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.h b/drivers/infiniband/hw/hfi1/exp_rcv.h new file mode 100644 index 0000000..0871904 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/exp_rcv.h @@ -0,0 +1,190 @@ +#ifndef _HFI1_EXP_RCV_H +#define _HFI1_EXP_RCV_H +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" + +#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list)) + +#define EXP_TID_TIDLEN_MASK 0x7FFULL +#define EXP_TID_TIDLEN_SHIFT 0 +#define EXP_TID_TIDCTRL_MASK 0x3ULL +#define EXP_TID_TIDCTRL_SHIFT 20 +#define EXP_TID_TIDIDX_MASK 0x3FFULL +#define EXP_TID_TIDIDX_SHIFT 22 +#define EXP_TID_GET(tid, field) \ + (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK) + +#define EXP_TID_SET(field, value) \ + (((value) & EXP_TID_TID##field##_MASK) << \ + EXP_TID_TID##field##_SHIFT) +#define EXP_TID_CLEAR(tid, field) ({ \ + (tid) &= ~(EXP_TID_TID##field##_MASK << \ + EXP_TID_TID##field##_SHIFT); \ + }) +#define EXP_TID_RESET(tid, field, value) do { \ + EXP_TID_CLEAR(tid, field); \ + (tid) |= EXP_TID_SET(field, (value)); \ + } while (0) + +/* + * Define fields in the KDETH header so we can update the header + * template. + */ +#define KDETH_OFFSET_SHIFT 0 +#define KDETH_OFFSET_MASK 0x7fff +#define KDETH_OM_SHIFT 15 +#define KDETH_OM_MASK 0x1 +#define KDETH_TID_SHIFT 16 +#define KDETH_TID_MASK 0x3ff +#define KDETH_TIDCTRL_SHIFT 26 +#define KDETH_TIDCTRL_MASK 0x3 +#define KDETH_INTR_SHIFT 28 +#define KDETH_INTR_MASK 0x1 +#define KDETH_SH_SHIFT 29 +#define KDETH_SH_MASK 0x1 +#define KDETH_KVER_SHIFT 30 +#define KDETH_KVER_MASK 0x3 +#define KDETH_JKEY_SHIFT 0x0 +#define KDETH_JKEY_MASK 0xff +#define KDETH_HCRC_UPPER_SHIFT 16 +#define KDETH_HCRC_UPPER_MASK 0xff +#define KDETH_HCRC_LOWER_SHIFT 24 +#define KDETH_HCRC_LOWER_MASK 0xff + +#define KDETH_GET(val, field) \ + (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK) +#define KDETH_SET(dw, field, val) do { \ + u32 dwval = le32_to_cpu(dw); \ + dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \ + dwval |= (((val) & KDETH_##field##_MASK) << \ + KDETH_##field##_SHIFT); \ + dw = cpu_to_le32(dwval); \ + } while (0) + +#define KDETH_RESET(dw, field, val) ({ dw = 0; KDETH_SET(dw, field, val); }) + +/* KDETH OM multipliers and switch over point */ +#define KDETH_OM_SMALL 4 +#define KDETH_OM_SMALL_SHIFT 2 +#define KDETH_OM_LARGE 64 +#define KDETH_OM_LARGE_SHIFT 6 +#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1)) + +struct tid_group { + struct list_head list; + u32 base; + u8 size; + u8 used; + u8 map; +}; + +/* + * Write an "empty" RcvArray entry. + * This function exists so the TID registaration code can use it + * to write to unused/unneeded entries and still take advantage + * of the WC performance improvements. The HFI will ignore this + * write to the RcvArray entry. + */ +static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index) +{ + /* + * Doing the WC fill writes only makes sense if the device is + * present and the RcvArray has been mapped as WC memory. 
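+ * As a sketch of the mechanism: the entry write below is a plain
+ * writeq() of zero, and flush_wc() pushes the write-combining buffer
+ * out after every fourth entry ((index & 3) == 3).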
+ */ + if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc) { + writeq(0, dd->rcvarray_wc + (index * 8)); + if ((index & 3) == 3) + flush_wc(); + } +} + +static inline void tid_group_add_tail(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_add_tail(&grp->list, &set->list); + set->count++; +} + +static inline void tid_group_remove(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_del_init(&grp->list); + set->count--; +} + +static inline void tid_group_move(struct tid_group *group, + struct exp_tid_set *s1, + struct exp_tid_set *s2) +{ + tid_group_remove(group, s1); + tid_group_add_tail(group, s2); +} + +static inline struct tid_group *tid_group_pop(struct exp_tid_set *set) +{ + struct tid_group *grp = + list_first_entry(&set->list, struct tid_group, list); + list_del_init(&grp->list); + set->count--; + return grp; +} + +static inline u32 rcventry2tidinfo(u32 rcventry) +{ + u32 pair = rcventry & ~0x1; + + return EXP_TID_SET(IDX, pair >> 1) | + EXP_TID_SET(CTRL, 1 << (rcventry - pair)); +} + +int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); +void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); +void hfi1_exp_tid_group_init(struct exp_tid_set *set); + +#endif /* _HFI1_EXP_RCV_H */ diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c new file mode 100644 index 0000000..da4aa1a --- /dev/null +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -0,0 +1,1703 @@ +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include +#include +#include + +#include + +#include "hfi.h" +#include "pio.h" +#include "device.h" +#include "common.h" +#include "trace.h" +#include "mmu_rb.h" +#include "user_sdma.h" +#include "user_exp_rcv.h" +#include "aspm.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */ + +/* + * File operation functions + */ +static int hfi1_file_open(struct inode *inode, struct file *fp); +static int hfi1_file_close(struct inode *inode, struct file *fp); +static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from); +static __poll_t hfi1_poll(struct file *fp, struct poll_table_struct *pt); +static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma); + +static u64 kvirt_to_phys(void *addr); +static int assign_ctxt(struct hfi1_filedata *fd, unsigned long arg, u32 len); +static void init_subctxts(struct hfi1_ctxtdata *uctxt, + const struct hfi1_user_info *uinfo); +static int init_user_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); +static void user_init(struct hfi1_ctxtdata *uctxt); +static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len); +static int get_base_info(struct hfi1_filedata *fd, unsigned long arg, u32 len); +static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg, + u32 len); +static int user_exp_rcv_clear(struct hfi1_filedata *fd, unsigned long arg, + u32 len); +static int user_exp_rcv_invalid(struct hfi1_filedata *fd, unsigned long arg, + u32 len); +static int setup_base_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); +static int setup_subctxt(struct hfi1_ctxtdata *uctxt); + +static int find_sub_ctxt(struct hfi1_filedata *fd, + const struct hfi1_user_info *uinfo); +static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, + struct hfi1_user_info *uinfo, + struct hfi1_ctxtdata **cd); +static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt); +static __poll_t poll_urgent(struct file *fp, struct poll_table_struct *pt); +static __poll_t poll_next(struct file *fp, struct poll_table_struct *pt); +static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt, + unsigned long arg); +static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned long arg); +static int ctxt_reset(struct hfi1_ctxtdata *uctxt); +static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt, + unsigned long arg); +static int vma_fault(struct vm_fault *vmf); +static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg); + +static const struct file_operations hfi1_file_ops = { + .owner = THIS_MODULE, + .write_iter = hfi1_write_iter, + .open = hfi1_file_open, + .release = hfi1_file_close, + .unlocked_ioctl = hfi1_file_ioctl, + .poll = hfi1_poll, + .mmap = hfi1_file_mmap, + .llseek = noop_llseek, +}; + +static const struct vm_operations_struct vm_ops = { + .fault = vma_fault, +}; + +/* + * Types of memories mapped into user processes' space + 
*/ +enum mmap_types { + PIO_BUFS = 1, + PIO_BUFS_SOP, + PIO_CRED, + RCV_HDRQ, + RCV_EGRBUF, + UREGS, + EVENTS, + STATUS, + RTAIL, + SUBCTXT_UREGS, + SUBCTXT_RCV_HDRQ, + SUBCTXT_EGRBUF, + SDMA_COMP +}; + +/* + * Masks and offsets defining the mmap tokens + */ +#define HFI1_MMAP_OFFSET_MASK 0xfffULL +#define HFI1_MMAP_OFFSET_SHIFT 0 +#define HFI1_MMAP_SUBCTXT_MASK 0xfULL +#define HFI1_MMAP_SUBCTXT_SHIFT 12 +#define HFI1_MMAP_CTXT_MASK 0xffULL +#define HFI1_MMAP_CTXT_SHIFT 16 +#define HFI1_MMAP_TYPE_MASK 0xfULL +#define HFI1_MMAP_TYPE_SHIFT 24 +#define HFI1_MMAP_MAGIC_MASK 0xffffffffULL +#define HFI1_MMAP_MAGIC_SHIFT 32 + +#define HFI1_MMAP_MAGIC 0xdabbad00 + +#define HFI1_MMAP_TOKEN_SET(field, val) \ + (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT) +#define HFI1_MMAP_TOKEN_GET(field, token) \ + (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK) +#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr) \ + (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \ + HFI1_MMAP_TOKEN_SET(TYPE, type) | \ + HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \ + HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \ + HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr)))) + +#define dbg(fmt, ...) \ + pr_info(fmt, ##__VA_ARGS__) + +static inline int is_valid_mmap(u64 token) +{ + return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC); +} + +static int hfi1_file_open(struct inode *inode, struct file *fp) +{ + struct hfi1_filedata *fd; + struct hfi1_devdata *dd = container_of(inode->i_cdev, + struct hfi1_devdata, + user_cdev); + + if (!((dd->flags & HFI1_PRESENT) && dd->kregbase1)) + return -EINVAL; + + if (!atomic_inc_not_zero(&dd->user_refcount)) + return -ENXIO; + + /* The real work is performed later in assign_ctxt() */ + + fd = kzalloc(sizeof(*fd), GFP_KERNEL); + + if (fd) { + fd->rec_cpu_num = -1; /* no cpu affinity by default */ + fd->mm = current->mm; + mmgrab(fd->mm); + fd->dd = dd; + kobject_get(&fd->dd->kobj); + fp->private_data = fd; + } else { + fp->private_data = NULL; + + if (atomic_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + + return -ENOMEM; + } + + return 0; +} + +static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + int ret = 0; + int uval = 0; + + hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd); + if (cmd != HFI1_IOCTL_ASSIGN_CTXT && + cmd != HFI1_IOCTL_GET_VERS && + !uctxt) + return -EINVAL; + + switch (cmd) { + case HFI1_IOCTL_ASSIGN_CTXT: + ret = assign_ctxt(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_CTXT_INFO: + ret = get_ctxt_info(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_USER_INFO: + ret = get_base_info(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_CREDIT_UPD: + if (uctxt) + sc_return_credits(uctxt->sc); + break; + + case HFI1_IOCTL_TID_UPDATE: + ret = user_exp_rcv_setup(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_TID_FREE: + ret = user_exp_rcv_clear(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_TID_INVAL_READ: + ret = user_exp_rcv_invalid(fd, arg, _IOC_SIZE(cmd)); + break; + + case HFI1_IOCTL_RECV_CTRL: + ret = manage_rcvq(uctxt, fd->subctxt, arg); + break; + + case HFI1_IOCTL_POLL_TYPE: + if (get_user(uval, (int __user *)arg)) + return -EFAULT; + uctxt->poll_type = (typeof(uctxt->poll_type))uval; + break; + + case HFI1_IOCTL_ACK_EVENT: + ret = user_event_ack(uctxt, fd->subctxt, arg); + break; + + case HFI1_IOCTL_SET_PKEY: + ret = set_ctxt_pkey(uctxt, arg); + break; + + case 
HFI1_IOCTL_CTXT_RESET: + ret = ctxt_reset(uctxt); + break; + + case HFI1_IOCTL_GET_VERS: + uval = HFI1_USER_SWVERSION; + if (put_user(uval, (int __user *)arg)) + return -EFAULT; + break; + + default: + return -EINVAL; + } + + return ret; +} + +static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) +{ + struct hfi1_filedata *fd = kiocb->ki_filp->private_data; + struct hfi1_user_sdma_pkt_q *pq = fd->pq; + struct hfi1_user_sdma_comp_q *cq = fd->cq; + int done = 0, reqs = 0; + unsigned long dim = from->nr_segs; + + if (!cq || !pq) + return -EIO; + + if (!iter_is_iovec(from) || !dim) + return -EINVAL; + + trace_hfi1_sdma_request(fd->dd, fd->uctxt->ctxt, fd->subctxt, dim); + + if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) + return -ENOSPC; + + while (dim) { + int ret; + unsigned long count = 0; + + ret = hfi1_user_sdma_process_request( + fd, (struct iovec *)(from->iov + done), + dim, &count); + if (ret) { + reqs = ret; + break; + } + dim -= count; + done += count; + reqs++; + } + + return reqs; +} + +static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd; + unsigned long flags; + u64 token = vma->vm_pgoff << PAGE_SHIFT, + memaddr = 0; + void *memvirt = NULL; + u8 subctxt, mapio = 0, vmf = 0, type; + ssize_t memlen = 0; + int ret = 0; + u16 ctxt; + + if (!is_valid_mmap(token) || !uctxt || + !(vma->vm_flags & VM_SHARED)) { + ret = -EINVAL; + goto done; + } + dd = uctxt->dd; + ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token); + subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token); + type = HFI1_MMAP_TOKEN_GET(TYPE, token); + if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) { + ret = -EINVAL; + goto done; + } + + flags = vma->vm_flags; + + switch (type) { + case PIO_BUFS: + case PIO_BUFS_SOP: + memaddr = ((dd->physaddr + TXE_PIO_SEND) + + /* chip pio base */ + (uctxt->sc->hw_context * BIT(16))) + + /* 64K PIO space / ctxt */ + (type == PIO_BUFS_SOP ? + (TXE_PIO_SIZE / 2) : 0); /* sop? */ + /* + * Map only the amount allocated to the context, not the + * entire available context's PIO space. + */ + memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE); + flags &= ~VM_MAYREAD; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + mapio = 1; + break; + case PIO_CRED: + if (flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + /* + * The credit return location for this context could be on the + * second or third page allocated for credit returns (if number + * of enabled contexts > 64 and 128 respectively). + */ + memvirt = dd->cr_base[uctxt->numa_id].va; + memaddr = virt_to_phys(memvirt) + + (((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK); + memlen = PAGE_SIZE; + flags &= ~VM_MAYWRITE; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + /* + * The driver has already allocated memory for credit + * returns and programmed it into the chip. Has that + * memory been flagged as non-cached? + */ + /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */ + mapio = 1; + break; + case RCV_HDRQ: + memlen = uctxt->rcvhdrq_size; + memvirt = uctxt->rcvhdrq; + break; + case RCV_EGRBUF: { + unsigned long addr; + int i; + /* + * The RcvEgr buffer need to be handled differently + * as multiple non-contiguous pages need to be mapped + * into the user process. 
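hfi1_file_mmap() above trusts nothing in the token except what it can re-derive from the mask/shift macros: the magic in bits 63:32, the map type in bits 27:24, the context in bits 23:16, the sub-context in bits 15:12 and an in-page offset in bits 11:0. A stand-alone sketch of that encoding, reusing the values defined earlier in this file (the sample field values are made up for illustration):

#include <stdint.h>
#include <stdio.h>

/* Same layout as the HFI1_MMAP_* macros above. */
#define MMAP_OFFSET_MASK   0xfffULL
#define MMAP_OFFSET_SHIFT  0
#define MMAP_SUBCTXT_MASK  0xfULL
#define MMAP_SUBCTXT_SHIFT 12
#define MMAP_CTXT_MASK     0xffULL
#define MMAP_CTXT_SHIFT    16
#define MMAP_TYPE_MASK     0xfULL
#define MMAP_TYPE_SHIFT    24
#define MMAP_MAGIC_MASK    0xffffffffULL
#define MMAP_MAGIC_SHIFT   32
#define MMAP_MAGIC         0xdabbad00ULL

static uint64_t make_token(uint64_t type, uint64_t ctxt,
                           uint64_t subctxt, uint64_t offset)
{
        return ((MMAP_MAGIC & MMAP_MAGIC_MASK) << MMAP_MAGIC_SHIFT) |
               ((type & MMAP_TYPE_MASK) << MMAP_TYPE_SHIFT) |
               ((ctxt & MMAP_CTXT_MASK) << MMAP_CTXT_SHIFT) |
               ((subctxt & MMAP_SUBCTXT_MASK) << MMAP_SUBCTXT_SHIFT) |
               ((offset & MMAP_OFFSET_MASK) << MMAP_OFFSET_SHIFT);
}

int main(void)
{
        /* Hypothetical token: type 4 (RCV_HDRQ), context 3, sub-context 0. */
        uint64_t token = make_token(4, 3, 0, 0);

        printf("token 0x%016llx: magic ok=%d type=%llu ctxt=%llu subctxt=%llu\n",
               (unsigned long long)token,
               ((token >> MMAP_MAGIC_SHIFT) & MMAP_MAGIC_MASK) == MMAP_MAGIC,
               (unsigned long long)((token >> MMAP_TYPE_SHIFT) & MMAP_TYPE_MASK),
               (unsigned long long)((token >> MMAP_CTXT_SHIFT) & MMAP_CTXT_MASK),
               (unsigned long long)((token >> MMAP_SUBCTXT_SHIFT) & MMAP_SUBCTXT_MASK));
        return 0;
}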
+ */ + memlen = uctxt->egrbufs.size; + if ((vma->vm_end - vma->vm_start) != memlen) { + dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n", + (vma->vm_end - vma->vm_start), memlen); + ret = -EINVAL; + goto done; + } + if (vma->vm_flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + vma->vm_flags &= ~VM_MAYWRITE; + addr = vma->vm_start; + for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) { + memlen = uctxt->egrbufs.buffers[i].len; + memvirt = uctxt->egrbufs.buffers[i].addr; + ret = remap_pfn_range( + vma, addr, + /* + * virt_to_pfn() does the same, but + * it's not available on x86_64 + * when CONFIG_MMU is enabled. + */ + PFN_DOWN(__pa(memvirt)), + memlen, + vma->vm_page_prot); + if (ret < 0) + goto done; + addr += memlen; + } + ret = 0; + goto done; + } + case UREGS: + /* + * Map only the page that contains this context's user + * registers. + */ + memaddr = (unsigned long) + (dd->physaddr + RXE_PER_CONTEXT_USER) + + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE); + /* + * TidFlow table is on the same page as the rest of the + * user registers. + */ + memlen = PAGE_SIZE; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + mapio = 1; + break; + case EVENTS: + /* + * Use the page where this context's flags are. User level + * knows where it's own bitmap is within the page. + */ + memaddr = (unsigned long) + (dd->events + uctxt_offset(uctxt)) & PAGE_MASK; + memlen = PAGE_SIZE; + /* + * v3.7 removes VM_RESERVED but the effect is kept by + * using VM_IO. + */ + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case STATUS: + if (flags & (unsigned long)(VM_WRITE | VM_EXEC)) { + ret = -EPERM; + goto done; + } + memaddr = kvirt_to_phys((void *)dd->status); + memlen = PAGE_SIZE; + flags |= VM_IO | VM_DONTEXPAND; + break; + case RTAIL: + if (!HFI1_CAP_IS_USET(DMA_RTAIL)) { + /* + * If the memory allocation failed, the context alloc + * also would have failed, so we would never get here + */ + ret = -EINVAL; + goto done; + } + if (flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + memlen = PAGE_SIZE; + memvirt = (void *)uctxt->rcvhdrtail_kvaddr; + flags &= ~VM_MAYWRITE; + break; + case SUBCTXT_UREGS: + memaddr = (u64)uctxt->subctxt_uregbase; + memlen = PAGE_SIZE; + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case SUBCTXT_RCV_HDRQ: + memaddr = (u64)uctxt->subctxt_rcvhdr_base; + memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt; + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case SUBCTXT_EGRBUF: + memaddr = (u64)uctxt->subctxt_rcvegrbuf; + memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt; + flags |= VM_IO | VM_DONTEXPAND; + flags &= ~VM_MAYWRITE; + vmf = 1; + break; + case SDMA_COMP: { + struct hfi1_user_sdma_comp_q *cq = fd->cq; + + if (!cq) { + ret = -EFAULT; + goto done; + } + memaddr = (u64)cq->comps; + memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries); + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + } + default: + ret = -EINVAL; + break; + } + + if ((vma->vm_end - vma->vm_start) != memlen) { + hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu", + uctxt->ctxt, fd->subctxt, + (vma->vm_end - vma->vm_start), memlen); + ret = -EINVAL; + goto done; + } + + vma->vm_flags = flags; + hfi1_cdbg(PROC, + "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n", + ctxt, subctxt, type, mapio, vmf, memaddr, memlen, + vma->vm_end - vma->vm_start, vma->vm_flags); + if (vmf) { + vma->vm_pgoff = PFN_DOWN(memaddr); + vma->vm_ops = &vm_ops; + ret = 0; + } else if (mapio) { + ret = io_remap_pfn_range(vma, 
vma->vm_start, + PFN_DOWN(memaddr), + memlen, + vma->vm_page_prot); + } else if (memvirt) { + ret = remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(__pa(memvirt)), + memlen, + vma->vm_page_prot); + } else { + ret = remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(memaddr), + memlen, + vma->vm_page_prot); + } +done: + return ret; +} + +/* + * Local (non-chip) user memory is not mapped right away but as it is + * accessed by the user-level code. + */ +static int vma_fault(struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + + get_page(page); + vmf->page = page; + + return 0; +} + +static __poll_t hfi1_poll(struct file *fp, struct poll_table_struct *pt) +{ + struct hfi1_ctxtdata *uctxt; + __poll_t pollflag; + + uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt; + if (!uctxt) + pollflag = EPOLLERR; + else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT) + pollflag = poll_urgent(fp, pt); + else if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV) + pollflag = poll_next(fp, pt); + else /* invalid */ + pollflag = EPOLLERR; + + return pollflag; +} + +static int hfi1_file_close(struct inode *inode, struct file *fp) +{ + struct hfi1_filedata *fdata = fp->private_data; + struct hfi1_ctxtdata *uctxt = fdata->uctxt; + struct hfi1_devdata *dd = container_of(inode->i_cdev, + struct hfi1_devdata, + user_cdev); + unsigned long flags, *ev; + + fp->private_data = NULL; + + if (!uctxt) + goto done; + + hfi1_cdbg(PROC, "closing ctxt %u:%u", uctxt->ctxt, fdata->subctxt); + + flush_wc(); + /* drain user sdma queue */ + hfi1_user_sdma_free_queues(fdata, uctxt); + + /* release the cpu */ + hfi1_put_proc_affinity(fdata->rec_cpu_num); + + /* clean up rcv side */ + hfi1_user_exp_rcv_free(fdata); + + /* + * fdata->uctxt is used in the above cleanup. It is not ready to be + * removed until here. + */ + fdata->uctxt = NULL; + hfi1_rcd_put(uctxt); + + /* + * Clear any left over, unhandled events so the next process that + * gets this context doesn't get confused. + */ + ev = dd->events + uctxt_offset(uctxt) + fdata->subctxt; + *ev = 0; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + __clear_bit(fdata->subctxt, uctxt->in_use_ctxts); + if (!bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) { + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + goto done; + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + /* + * Disable receive context and interrupt available, reset all + * RcvCtxtCtrl bits to default values. + */ + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_TIDFLOW_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS | + HFI1_RCVCTRL_NO_RHQ_DROP_DIS | + HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); + /* Clear the context's J_KEY */ + hfi1_clear_ctxt_jkey(dd, uctxt); + /* + * If a send context is allocated, reset context integrity + * checks to default and disable the send context. + */ + if (uctxt->sc) { + set_pio_integrity(uctxt->sc); + sc_disable(uctxt->sc); + } + + hfi1_free_ctxt_rcv_groups(uctxt); + hfi1_clear_ctxt_pkey(dd, uctxt); + + uctxt->event_flags = 0; + + deallocate_ctxt(uctxt); +done: + mmdrop(fdata->mm); + kobject_put(&dd->kobj); + + if (atomic_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + + kfree(fdata); + return 0; +} + +/* + * Convert kernel *virtual* addresses to physical addresses. + * This is used to vmalloc'ed addresses. 
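Because hfi1_file_mmap() above recovers the token from vma->vm_pgoff << PAGE_SHIFT, requires MAP_SHARED and insists that the mapping length equal memlen exactly, user space is expected to pass a token straight back as the mmap() offset. A minimal sketch of that call pattern, assuming 4 KiB pages; the in-page masking convention follows from the OFFSET field of the token and from mmap()'s page-alignment rule, not from anything stated in this patch:

#include <stdint.h>
#include <sys/mman.h>
#include <sys/types.h>

/*
 * Sketch only: "token" and "len" would come from the HFI1_IOCTL_USER_INFO /
 * HFI1_IOCTL_CTXT_INFO replies and "fd" is the open device descriptor.
 * The low 12 bits of a token carry an in-page offset; mmap() offsets must
 * be page aligned, so that part is masked off here and added back to the
 * returned pointer.  Pass prot = PROT_READ for mappings the driver refuses
 * to map writable (credit return, status page, eager buffers, ...).
 */
static void *map_hfi1_token(int fd, uint64_t token, size_t len, int prot)
{
        uint64_t in_page = token & 0xfffULL;
        void *base = mmap(NULL, len, prot, MAP_SHARED, fd,
                          (off_t)(token & ~0xfffULL));

        return base == MAP_FAILED ? NULL : (char *)base + in_page;
}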
+ */ +static u64 kvirt_to_phys(void *addr) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(addr); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +/** + * complete_subctxt + * @fd: valid filedata pointer + * + * Sub-context info can only be set up after the base context + * has been completed. This is indicated by the clearing of the + * HFI1_CTXT_BASE_UINIT bit. + * + * Wait for the bit to be cleared, and then complete the subcontext + * initialization. + * + */ +static int complete_subctxt(struct hfi1_filedata *fd) +{ + int ret; + unsigned long flags; + + /* + * sub-context info can only be set up after the base context + * has been completed. + */ + ret = wait_event_interruptible( + fd->uctxt->wait, + !test_bit(HFI1_CTXT_BASE_UNINIT, &fd->uctxt->event_flags)); + + if (test_bit(HFI1_CTXT_BASE_FAILED, &fd->uctxt->event_flags)) + ret = -ENOMEM; + + /* Finish the sub-context init */ + if (!ret) { + fd->rec_cpu_num = hfi1_get_proc_affinity(fd->uctxt->numa_id); + ret = init_user_ctxt(fd, fd->uctxt); + } + + if (ret) { + spin_lock_irqsave(&fd->dd->uctxt_lock, flags); + __clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts); + spin_unlock_irqrestore(&fd->dd->uctxt_lock, flags); + hfi1_rcd_put(fd->uctxt); + fd->uctxt = NULL; + } + + return ret; +} + +static int assign_ctxt(struct hfi1_filedata *fd, unsigned long arg, u32 len) +{ + int ret; + unsigned int swmajor; + struct hfi1_ctxtdata *uctxt = NULL; + struct hfi1_user_info uinfo; + + if (fd->uctxt) + return -EINVAL; + + if (sizeof(uinfo) != len) + return -EINVAL; + + if (copy_from_user(&uinfo, (void __user *)arg, sizeof(uinfo))) + return -EFAULT; + + swmajor = uinfo.userversion >> 16; + if (swmajor != HFI1_USER_SWMAJOR) + return -ENODEV; + + if (uinfo.subctxt_cnt > HFI1_MAX_SHARED_CTXTS) + return -EINVAL; + + /* + * Acquire the mutex to protect against multiple creations of what + * could be a shared base context. + */ + mutex_lock(&hfi1_mutex); + /* + * Get a sub context if available (fd->uctxt will be set). + * ret < 0 error, 0 no context, 1 sub-context found + */ + ret = find_sub_ctxt(fd, &uinfo); + + /* + * Allocate a base context if context sharing is not required or a + * sub context wasn't found. + */ + if (!ret) + ret = allocate_ctxt(fd, fd->dd, &uinfo, &uctxt); + + mutex_unlock(&hfi1_mutex); + + /* Depending on the context type, finish the appropriate init */ + switch (ret) { + case 0: + ret = setup_base_ctxt(fd, uctxt); + if (ret) + deallocate_ctxt(uctxt); + break; + case 1: + ret = complete_subctxt(fd); + break; + default: + break; + } + + return ret; +} + +/** + * match_ctxt + * @fd: valid filedata pointer + * @uinfo: user info to compare base context with + * @uctxt: context to compare uinfo to. + * + * Compare the given context with the given information to see if it + * can be used for a sub context. 
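assign_ctxt() above is the first ioctl a process issues after open(), followed by HFI1_IOCTL_CTXT_INFO and HFI1_IOCTL_USER_INFO to learn the context layout. A hedged user-space sketch of that sequence; the device node name and the uapi include path are assumptions, while the ioctl names and struct fields are the ones this file references:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
/* uapi header providing HFI1_IOCTL_* and the info structs; path assumed. */
#include <rdma/hfi/hfi1_user.h>

int main(void)
{
        struct hfi1_user_info uinfo;
        struct hfi1_ctxt_info cinfo;
        struct hfi1_base_info binfo;
        int fd = open("/dev/hfi1_0", O_RDWR);   /* unit 0; node name assumed */

        if (fd < 0)
                return 1;

        memset(&uinfo, 0, sizeof(uinfo));
        uinfo.userversion = HFI1_USER_SWVERSION;
        /*
         * For a shared context, all cooperating processes would also set the
         * same subctxt_cnt, subctxt_id and uuid so that find_sub_ctxt() /
         * match_ctxt() can pair them with the base context.
         */
        if (ioctl(fd, HFI1_IOCTL_ASSIGN_CTXT, &uinfo) ||
            ioctl(fd, HFI1_IOCTL_CTXT_INFO, &cinfo) ||
            ioctl(fd, HFI1_IOCTL_USER_INFO, &binfo)) {
                perror("hfi1 context setup");
                close(fd);
                return 1;
        }
        printf("ctxt %u:%u on unit %u, %u credits\n",
               cinfo.ctxt, cinfo.subctxt, cinfo.unit, cinfo.credits);
        close(fd);
        return 0;
}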
+ */ +static int match_ctxt(struct hfi1_filedata *fd, + const struct hfi1_user_info *uinfo, + struct hfi1_ctxtdata *uctxt) +{ + struct hfi1_devdata *dd = fd->dd; + unsigned long flags; + u16 subctxt; + + /* Skip dynamically allocated kernel contexts */ + if (uctxt->sc && (uctxt->sc->type == SC_KERNEL)) + return 0; + + /* Skip ctxt if it doesn't match the requested one */ + if (memcmp(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)) || + uctxt->jkey != generate_jkey(current_uid()) || + uctxt->subctxt_id != uinfo->subctxt_id || + uctxt->subctxt_cnt != uinfo->subctxt_cnt) + return 0; + + /* Verify the sharing process matches the base */ + if (uctxt->userversion != uinfo->userversion) + return -EINVAL; + + /* Find an unused sub context */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) { + /* context is being closed, do not use */ + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + return 0; + } + + subctxt = find_first_zero_bit(uctxt->in_use_ctxts, + HFI1_MAX_SHARED_CTXTS); + if (subctxt >= uctxt->subctxt_cnt) { + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + return -EBUSY; + } + + fd->subctxt = subctxt; + __set_bit(fd->subctxt, uctxt->in_use_ctxts); + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + fd->uctxt = uctxt; + hfi1_rcd_get(uctxt); + + return 1; +} + +/** + * find_sub_ctxt + * @fd: valid filedata pointer + * @uinfo: matching info to use to find a possible context to share. + * + * The hfi1_mutex must be held when this function is called. It is + * necessary to ensure serialized creation of shared contexts. + * + * Return: + * 0 No sub-context found + * 1 Subcontext found and allocated + * errno EINVAL (incorrect parameters) + * EBUSY (all sub contexts in use) + */ +static int find_sub_ctxt(struct hfi1_filedata *fd, + const struct hfi1_user_info *uinfo) +{ + struct hfi1_ctxtdata *uctxt; + struct hfi1_devdata *dd = fd->dd; + u16 i; + int ret; + + if (!uinfo->subctxt_cnt) + return 0; + + for (i = dd->first_dyn_alloc_ctxt; i < dd->num_rcv_contexts; i++) { + uctxt = hfi1_rcd_get_by_index(dd, i); + if (uctxt) { + ret = match_ctxt(fd, uinfo, uctxt); + hfi1_rcd_put(uctxt); + /* value of != 0 will return */ + if (ret) + return ret; + } + } + + return 0; +} + +static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, + struct hfi1_user_info *uinfo, + struct hfi1_ctxtdata **rcd) +{ + struct hfi1_ctxtdata *uctxt; + int ret, numa; + + if (dd->flags & HFI1_FROZEN) { + /* + * Pick an error that is unique from all other errors + * that are returned so the user process knows that + * it tried to allocate while the SPC was frozen. It + * it should be able to retry with success in a short + * while. + */ + return -EIO; + } + + if (!dd->freectxts) + return -EBUSY; + + /* + * If we don't have a NUMA node requested, preference is towards + * device NUMA node. + */ + fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node); + if (fd->rec_cpu_num != -1) + numa = cpu_to_node(fd->rec_cpu_num); + else + numa = numa_node_id(); + ret = hfi1_create_ctxtdata(dd->pport, numa, &uctxt); + if (ret < 0) { + dd_dev_err(dd, "user ctxtdata allocation failed\n"); + return ret; + } + hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)", + uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num, + uctxt->numa_id); + + /* + * Allocate and enable a PIO send context. 
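match_ctxt() above hands out sub-context slots by scanning in_use_ctxts for the first clear bit below subctxt_cnt and refusing with -EBUSY when none is left. A stand-alone rendering of just that slot-claim arithmetic, using plain C bit operations instead of find_first_zero_bit()/__set_bit() and with no locking (the driver holds dd->uctxt_lock):

#include <stdint.h>
#include <stdio.h>

static int claim_subctxt(uint64_t *in_use, unsigned int subctxt_cnt)
{
        unsigned int i;

        for (i = 0; i < subctxt_cnt; i++) {
                if (!(*in_use & (1ULL << i))) {
                        *in_use |= 1ULL << i;
                        return i;
                }
        }
        return -1;              /* all sub-contexts busy, i.e. -EBUSY */
}

int main(void)
{
        uint64_t in_use = 0x1;  /* assume slot 0 is already taken */

        printf("claimed slot %d\n", claim_subctxt(&in_use, 4));
        return 0;
}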
+ */ + uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize, dd->node); + if (!uctxt->sc) { + ret = -ENOMEM; + goto ctxdata_free; + } + hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index, + uctxt->sc->hw_context); + ret = sc_enable(uctxt->sc); + if (ret) + goto ctxdata_free; + + /* + * Setup sub context information if the user-level has requested + * sub contexts. + * This has to be done here so the rest of the sub-contexts find the + * proper base context. + */ + if (uinfo->subctxt_cnt) + init_subctxts(uctxt, uinfo); + uctxt->userversion = uinfo->userversion; + uctxt->flags = hfi1_cap_mask; /* save current flag state */ + init_waitqueue_head(&uctxt->wait); + strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm)); + memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)); + uctxt->jkey = generate_jkey(current_uid()); + hfi1_stats.sps_ctxts++; + /* + * Disable ASPM when there are open user/PSM contexts to avoid + * issues with ASPM L1 exit latency + */ + if (dd->freectxts-- == dd->num_user_contexts) + aspm_disable_all(dd); + + *rcd = uctxt; + + return 0; + +ctxdata_free: + hfi1_free_ctxt(uctxt); + return ret; +} + +static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt) +{ + mutex_lock(&hfi1_mutex); + hfi1_stats.sps_ctxts--; + if (++uctxt->dd->freectxts == uctxt->dd->num_user_contexts) + aspm_enable_all(uctxt->dd); + mutex_unlock(&hfi1_mutex); + + hfi1_free_ctxt(uctxt); +} + +static void init_subctxts(struct hfi1_ctxtdata *uctxt, + const struct hfi1_user_info *uinfo) +{ + uctxt->subctxt_cnt = uinfo->subctxt_cnt; + uctxt->subctxt_id = uinfo->subctxt_id; + set_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); +} + +static int setup_subctxt(struct hfi1_ctxtdata *uctxt) +{ + int ret = 0; + u16 num_subctxts = uctxt->subctxt_cnt; + + uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE); + if (!uctxt->subctxt_uregbase) + return -ENOMEM; + + /* We can take the size of the RcvHdr Queue from the master */ + uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size * + num_subctxts); + if (!uctxt->subctxt_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size * + num_subctxts); + if (!uctxt->subctxt_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + + return 0; + +bail_rhdr: + vfree(uctxt->subctxt_rcvhdr_base); + uctxt->subctxt_rcvhdr_base = NULL; +bail_ureg: + vfree(uctxt->subctxt_uregbase); + uctxt->subctxt_uregbase = NULL; + + return ret; +} + +static void user_init(struct hfi1_ctxtdata *uctxt) +{ + unsigned int rcvctrl_ops = 0; + + /* initialize poll variables... */ + uctxt->urgent = 0; + uctxt->urgent_poll = 0; + + /* + * Now enable the ctxt for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ctxts, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). 
+ */ + if (uctxt->rcvhdrtail_kvaddr) + clear_rcvhdrtail(uctxt); + + /* Setup J_KEY before enabling the context */ + hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey); + + rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; + if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP)) + rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB; + /* + * Ignore the bit in the flags for now until proper + * support for multiple packet per rcv array entry is + * added. + */ + if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR)) + rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + /* + * The RcvCtxtCtrl.TailUpd bit has to be explicitly written. + * We can't rely on the correct value to be set from prior + * uses of the chip or ctxt. Therefore, add the rcvctrl op + * for both cases. + */ + if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL)) + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; + else + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS; + hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt); +} + +static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len) +{ + struct hfi1_ctxt_info cinfo; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + + if (sizeof(cinfo) != len) + return -EINVAL; + + memset(&cinfo, 0, sizeof(cinfo)); + cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) & + HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_UGET_MASK(uctxt->flags, MASK) | + HFI1_CAP_KGET_MASK(uctxt->flags, K2U); + /* adjust flag if this fd is not able to cache */ + if (!fd->handler) + cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */ + + cinfo.num_active = hfi1_count_active_units(); + cinfo.unit = uctxt->dd->unit; + cinfo.ctxt = uctxt->ctxt; + cinfo.subctxt = fd->subctxt; + cinfo.rcvtids = roundup(uctxt->egrbufs.alloced, + uctxt->dd->rcv_entries.group_size) + + uctxt->expected_count; + cinfo.credits = uctxt->sc->credits; + cinfo.numa_node = uctxt->numa_id; + cinfo.rec_cpu = fd->rec_cpu_num; + cinfo.send_ctxt = uctxt->sc->hw_context; + + cinfo.egrtids = uctxt->egrbufs.alloced; + cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt; + cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2; + cinfo.sdma_ring_size = fd->cq->nentries; + cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size; + + trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, &cinfo); + if (copy_to_user((void __user *)arg, &cinfo, len)) + return -EFAULT; + + return 0; +} + +static int init_user_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) +{ + int ret; + + ret = hfi1_user_sdma_alloc_queues(uctxt, fd); + if (ret) + return ret; + + ret = hfi1_user_exp_rcv_init(fd, uctxt); + if (ret) + hfi1_user_sdma_free_queues(fd, uctxt); + + return ret; +} + +static int setup_base_ctxt(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) +{ + struct hfi1_devdata *dd = uctxt->dd; + int ret = 0; + + hfi1_init_ctxt(uctxt->sc); + + /* Now allocate the RcvHdr queue and eager buffers. */ + ret = hfi1_create_rcvhdrq(dd, uctxt); + if (ret) + goto done; + + ret = hfi1_setup_eagerbufs(uctxt); + if (ret) + goto done; + + /* If sub-contexts are enabled, do the appropriate setup */ + if (uctxt->subctxt_cnt) + ret = setup_subctxt(uctxt); + if (ret) + goto done; + + ret = hfi1_alloc_ctxt_rcv_groups(uctxt); + if (ret) + goto done; + + ret = init_user_ctxt(fd, uctxt); + if (ret) + goto done; + + user_init(uctxt); + + /* Now that the context is set up, the fd can get a reference. 
*/ + fd->uctxt = uctxt; + hfi1_rcd_get(uctxt); + +done: + if (uctxt->subctxt_cnt) { + /* + * On error, set the failed bit so sub-contexts will clean up + * correctly. + */ + if (ret) + set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); + + /* + * Base context is done (successfully or not), notify anybody + * using a sub-context that is waiting for this completion. + */ + clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); + wake_up(&uctxt->wait); + } + + return ret; +} + +static int get_base_info(struct hfi1_filedata *fd, unsigned long arg, u32 len) +{ + struct hfi1_base_info binfo; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned offset; + + trace_hfi1_uctxtdata(uctxt->dd, uctxt, fd->subctxt); + + if (sizeof(binfo) != len) + return -EINVAL; + + memset(&binfo, 0, sizeof(binfo)); + binfo.hw_version = dd->revision; + binfo.sw_version = HFI1_KERN_SWVERSION; + binfo.bthqp = kdeth_qp; + binfo.jkey = uctxt->jkey; + /* + * If more than 64 contexts are enabled the allocated credit + * return will span two or three contiguous pages. Since we only + * map the page containing the context's credit return address, + * we need to calculate the offset in the proper page. + */ + offset = ((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE; + binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt, + fd->subctxt, offset); + binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt, + fd->subctxt, + uctxt->sc->base_addr); + binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP, + uctxt->ctxt, + fd->subctxt, + uctxt->sc->base_addr); + binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt, + fd->subctxt, + uctxt->rcvhdrq); + binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt, + fd->subctxt, + uctxt->egrbufs.rcvtids[0].dma); + binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt, + fd->subctxt, 0); + /* + * user regs are at + * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE)) + */ + binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt, + fd->subctxt, 0); + offset = offset_in_page((uctxt_offset(uctxt) + fd->subctxt) * + sizeof(*dd->events)); + binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt, + fd->subctxt, + offset); + binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt, + fd->subctxt, + dd->status); + if (HFI1_CAP_IS_USET(DMA_RTAIL)) + binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt, + fd->subctxt, 0); + if (uctxt->subctxt_cnt) { + binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS, + uctxt->ctxt, + fd->subctxt, 0); + binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ, + uctxt->ctxt, + fd->subctxt, 0); + binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF, + uctxt->ctxt, + fd->subctxt, 0); + } + + if (copy_to_user((void __user *)arg, &binfo, len)) + return -EFAULT; + + return 0; +} + +/** + * user_exp_rcv_setup - Set up the given tid rcv list + * @fd: file data of the current driver instance + * @arg: ioctl argumnent for user space information + * @len: length of data structure associated with ioctl command + * + * Wrapper to validate ioctl information before doing _rcv_setup. 
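get_base_info() above reports only the in-page offset of this context's credit-return slot because the PIO_CRED case in hfi1_file_mmap() maps a single page, the one that contains sc->hw_free. A toy calculation with made-up addresses shows how the page part and the in-page part fit together:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        /* Made-up addresses, for illustration only. */
        uint64_t cr_base = 0xffff880012340000ULL;   /* dd->cr_base[node].va */
        uint64_t hw_free = cr_base + 0x1040;        /* this context's slot  */

        /* Kernel side: start of the page that gets mapped (PIO_CRED case). */
        uint64_t page_off = (hw_free - cr_base) & PAGE_MASK;    /* 0x1000 */
        /* Token side: in-page offset reported via sc_credits_addr. */
        uint64_t in_page  = (hw_free - cr_base) % PAGE_SIZE;    /* 0x040  */

        printf("mapped page starts at cr_base + 0x%llx, slot at +0x%llx inside it\n",
               (unsigned long long)page_off, (unsigned long long)in_page);
        return 0;
}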
+ * + */ +static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg, + u32 len) +{ + int ret; + unsigned long addr; + struct hfi1_tid_info tinfo; + + if (sizeof(tinfo) != len) + return -EINVAL; + + if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo)))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_setup(fd, &tinfo); + if (!ret) { + /* + * Copy the number of tidlist entries we used + * and the length of the buffer we registered. + */ + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + return -EFAULT; + + addr = arg + offsetof(struct hfi1_tid_info, length); + if (copy_to_user((void __user *)addr, &tinfo.length, + sizeof(tinfo.length))) + ret = -EFAULT; + } + + return ret; +} + +/** + * user_exp_rcv_clear - Clear the given tid rcv list + * @fd: file data of the current driver instance + * @arg: ioctl argumnent for user space information + * @len: length of data structure associated with ioctl command + * + * The hfi1_user_exp_rcv_clear() can be called from the error path. Because + * of this, we need to use this wrapper to copy the user space information + * before doing the clear. + */ +static int user_exp_rcv_clear(struct hfi1_filedata *fd, unsigned long arg, + u32 len) +{ + int ret; + unsigned long addr; + struct hfi1_tid_info tinfo; + + if (sizeof(tinfo) != len) + return -EINVAL; + + if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo)))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_clear(fd, &tinfo); + if (!ret) { + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + return -EFAULT; + } + + return ret; +} + +/** + * user_exp_rcv_invalid - Invalidate the given tid rcv list + * @fd: file data of the current driver instance + * @arg: ioctl argumnent for user space information + * @len: length of data structure associated with ioctl command + * + * Wrapper to validate ioctl information before doing _rcv_invalid. 
+ * + */ +static int user_exp_rcv_invalid(struct hfi1_filedata *fd, unsigned long arg, + u32 len) +{ + int ret; + unsigned long addr; + struct hfi1_tid_info tinfo; + + if (sizeof(tinfo) != len) + return -EINVAL; + + if (!fd->invalid_tids) + return -EINVAL; + + if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo)))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_invalid(fd, &tinfo); + if (ret) + return ret; + + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + ret = -EFAULT; + + return ret; +} + +static __poll_t poll_urgent(struct file *fp, + struct poll_table_struct *pt) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + __poll_t pollflag; + + poll_wait(fp, &uctxt->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (uctxt->urgent != uctxt->urgent_poll) { + pollflag = EPOLLIN | EPOLLRDNORM; + uctxt->urgent_poll = uctxt->urgent; + } else { + pollflag = 0; + set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags); + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static __poll_t poll_next(struct file *fp, + struct poll_table_struct *pt) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + __poll_t pollflag; + + poll_wait(fp, &uctxt->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (hdrqempty(uctxt)) { + set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt); + pollflag = 0; + } else { + pollflag = EPOLLIN | EPOLLRDNORM; + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +/* + * Find all user contexts in use, and set the specified bit in their + * event mask. + * See also find_ctxt() for a similar use, that is specific to send buffers. + */ +int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit) +{ + struct hfi1_ctxtdata *uctxt; + struct hfi1_devdata *dd = ppd->dd; + u16 ctxt; + + if (!dd->events) + return -EINVAL; + + for (ctxt = dd->first_dyn_alloc_ctxt; ctxt < dd->num_rcv_contexts; + ctxt++) { + uctxt = hfi1_rcd_get_by_index(dd, ctxt); + if (uctxt) { + unsigned long *evs; + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + evs = dd->events + uctxt_offset(uctxt); + set_bit(evtbit, evs); + for (i = 1; i < uctxt->subctxt_cnt; i++) + set_bit(evtbit, evs + i); + hfi1_rcd_put(uctxt); + } + } + + return 0; +} + +/** + * manage_rcvq - manage a context's receive queue + * @uctxt: the context + * @subctxt: the sub-context + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the context, for use in queue + * overflow conditions. start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt, + unsigned long arg) +{ + struct hfi1_devdata *dd = uctxt->dd; + unsigned int rcvctrl_op; + int start_stop; + + if (subctxt) + return 0; + + if (get_user(start_stop, (int __user *)arg)) + return -EFAULT; + + /* atomically clear receive enable ctxt. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. 
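poll_next() above arms the receive-available interrupt and reports EPOLLIN once the header queue is non-empty, while poll_urgent() watches the urgent counter. A small user-space sketch of driving that path on an already assigned context; the constant and ioctl names come from this file, the uapi include path is an assumption:

#include <poll.h>
#include <sys/ioctl.h>
/* uapi header providing HFI1_IOCTL_POLL_TYPE and HFI1_POLL_TYPE_*; path assumed. */
#include <rdma/hfi/hfi1_user.h>

/*
 * Select "any receive" polling, then block until the driver reports data
 * or 5 seconds pass.  Returns >0 on data, 0 on timeout, <0 on error.
 */
static int wait_for_rcv(int fd)
{
        int type = HFI1_POLL_TYPE_ANYRCV;
        struct pollfd pfd = { .fd = fd, .events = POLLIN };

        if (ioctl(fd, HFI1_IOCTL_POLL_TYPE, &type))
                return -1;
        return poll(&pfd, 1, 5000);
}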
The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. + */ + if (uctxt->rcvhdrtail_kvaddr) + clear_rcvhdrtail(uctxt); + rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB; + } else { + rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS; + } + hfi1_rcvctrl(dd, rcvctrl_op, uctxt); + /* always; new head should be equal to new tail; see above */ + + return 0; +} + +/* + * clear the event notifier events for this context. + * User process then performs actions appropriate to bit having been + * set, if desired, and checks again in future. + */ +static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt, + unsigned long arg) +{ + int i; + struct hfi1_devdata *dd = uctxt->dd; + unsigned long *evs; + unsigned long events; + + if (!dd->events) + return 0; + + if (get_user(events, (unsigned long __user *)arg)) + return -EFAULT; + + evs = dd->events + uctxt_offset(uctxt) + subctxt; + + for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) { + if (!test_bit(i, &events)) + continue; + clear_bit(i, evs); + } + return 0; +} + +static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned long arg) +{ + int i; + struct hfi1_pportdata *ppd = uctxt->ppd; + struct hfi1_devdata *dd = uctxt->dd; + u16 pkey; + + if (!HFI1_CAP_IS_USET(PKEY_CHECK)) + return -EPERM; + + if (get_user(pkey, (u16 __user *)arg)) + return -EFAULT; + + if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) + if (pkey == ppd->pkeys[i]) + return hfi1_set_ctxt_pkey(dd, uctxt, pkey); + + return -ENOENT; +} + +/** + * ctxt_reset - Reset the user context + * @uctxt: valid user context + */ +static int ctxt_reset(struct hfi1_ctxtdata *uctxt) +{ + struct send_context *sc; + struct hfi1_devdata *dd; + int ret = 0; + + if (!uctxt || !uctxt->dd || !uctxt->sc) + return -EINVAL; + + /* + * There is no protection here. User level has to guarantee that + * no one will be writing to the send context while it is being + * re-initialized. If user level breaks that guarantee, it will + * break it's own context and no one else's. + */ + dd = uctxt->dd; + sc = uctxt->sc; + + /* + * Wait until the interrupt handler has marked the context as + * halted or frozen. Report error if we time out. + */ + wait_event_interruptible_timeout( + sc->halt_wait, (sc->flags & SCF_HALTED), + msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); + if (!(sc->flags & SCF_HALTED)) + return -ENOLCK; + + /* + * If the send context was halted due to a Freeze, wait until the + * device has been "unfrozen" before resetting the context. 
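user_event_ack() above expects the bitmask of events the process has already handled and clears exactly those bits. A hedged sketch of the user side, assuming the EVENTS mapping plus the token's in-page offset yields a pointer to this fd's own event word, which is what the offset calculation in get_base_info() suggests:

#include <sys/ioctl.h>
/* uapi header providing HFI1_IOCTL_ACK_EVENT; include path assumed. */
#include <rdma/hfi/hfi1_user.h>

/*
 * "events" points at this process's event word inside the mapped EVENTS
 * page (for example obtained with map_hfi1_token() from the earlier
 * sketch).  Whatever bits were observed are handed back so the driver can
 * clear them for the next pass.
 */
static int ack_events(int fd, volatile unsigned long *events)
{
        unsigned long pending = *events;

        if (!pending)
                return 0;
        /* ... react to the individual event bits here ... */
        return ioctl(fd, HFI1_IOCTL_ACK_EVENT, &pending);
}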
+ */ + if (sc->flags & SCF_FROZEN) { + wait_event_interruptible_timeout( + dd->event_queue, + !(READ_ONCE(dd->flags) & HFI1_FROZEN), + msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); + if (dd->flags & HFI1_FROZEN) + return -ENOLCK; + + if (dd->flags & HFI1_FORCED_FREEZE) + /* + * Don't allow context reset if we are into + * forced freeze + */ + return -ENODEV; + + sc_disable(sc); + ret = sc_enable(sc); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, uctxt); + } else { + ret = sc_restart(sc); + } + if (!ret) + sc_return_credits(sc); + + return ret; +} + +static void user_remove(struct hfi1_devdata *dd) +{ + + hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device); +} + +static int user_add(struct hfi1_devdata *dd) +{ + char name[10]; + int ret; + + snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit); + ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops, + &dd->user_cdev, &dd->user_device, + true, &dd->kobj); + if (ret) + user_remove(dd); + + return ret; +} + +/* + * Create per-unit files in /dev + */ +int hfi1_device_create(struct hfi1_devdata *dd) +{ + return user_add(dd); +} + +/* + * Remove per-unit files in /dev + * void, core kernel returns no errors for this stuff + */ +void hfi1_device_remove(struct hfi1_devdata *dd) +{ + user_remove(dd); +} diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c new file mode 100644 index 0000000..2b57ba7 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -0,0 +1,2305 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "trace.h" + +/* + * Make it easy to toggle firmware file name and if it gets loaded by + * editing the following. This may be something we do while in development + * but not necessarily something a user would ever need to use. + */ +#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin" +#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw" +#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw" +#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw" +#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw" +#define ALT_FW_8051_NAME_ASIC "hfi1_dc8051_d.fw" +#define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw" +#define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw" +#define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw" + +MODULE_FIRMWARE(DEFAULT_FW_8051_NAME_ASIC); +MODULE_FIRMWARE(DEFAULT_FW_FABRIC_NAME); +MODULE_FIRMWARE(DEFAULT_FW_SBUS_NAME); +MODULE_FIRMWARE(DEFAULT_FW_PCIE_NAME); + +static uint fw_8051_load = 1; +static uint fw_fabric_serdes_load = 1; +static uint fw_pcie_serdes_load = 1; +static uint fw_sbus_load = 1; + +/* Firmware file names get set in hfi1_firmware_init() based on the above */ +static char *fw_8051_name; +static char *fw_fabric_serdes_name; +static char *fw_sbus_name; +static char *fw_pcie_serdes_name; + +#define SBUS_MAX_POLL_COUNT 100 +#define SBUS_COUNTER(reg, name) \ + (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \ + ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK) + +/* + * Firmware security header. 
+ */ +struct css_header { + u32 module_type; + u32 header_len; + u32 header_version; + u32 module_id; + u32 module_vendor; + u32 date; /* BCD yyyymmdd */ + u32 size; /* in DWORDs */ + u32 key_size; /* in DWORDs */ + u32 modulus_size; /* in DWORDs */ + u32 exponent_size; /* in DWORDs */ + u32 reserved[22]; +}; + +/* expected field values */ +#define CSS_MODULE_TYPE 0x00000006 +#define CSS_HEADER_LEN 0x000000a1 +#define CSS_HEADER_VERSION 0x00010000 +#define CSS_MODULE_VENDOR 0x00008086 + +#define KEY_SIZE 256 +#define MU_SIZE 8 +#define EXPONENT_SIZE 4 + +/* size of platform configuration partition */ +#define MAX_PLATFORM_CONFIG_FILE_SIZE 4096 + +/* size of file of plaform configuration encoded in format version 4 */ +#define PLATFORM_CONFIG_FORMAT_4_FILE_SIZE 528 + +/* the file itself */ +struct firmware_file { + struct css_header css_header; + u8 modulus[KEY_SIZE]; + u8 exponent[EXPONENT_SIZE]; + u8 signature[KEY_SIZE]; + u8 firmware[]; +}; + +struct augmented_firmware_file { + struct css_header css_header; + u8 modulus[KEY_SIZE]; + u8 exponent[EXPONENT_SIZE]; + u8 signature[KEY_SIZE]; + u8 r2[KEY_SIZE]; + u8 mu[MU_SIZE]; + u8 firmware[]; +}; + +/* augmented file size difference */ +#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \ + sizeof(struct firmware_file)) + +struct firmware_details { + /* Linux core piece */ + const struct firmware *fw; + + struct css_header *css_header; + u8 *firmware_ptr; /* pointer to binary data */ + u32 firmware_len; /* length in bytes */ + u8 *modulus; /* pointer to the modulus */ + u8 *exponent; /* pointer to the exponent */ + u8 *signature; /* pointer to the signature */ + u8 *r2; /* pointer to r2 */ + u8 *mu; /* pointer to mu */ + struct augmented_firmware_file dummy_header; +}; + +/* + * The mutex protects fw_state, fw_err, and all of the firmware_details + * variables. 
+ */ +static DEFINE_MUTEX(fw_mutex); +enum fw_state { + FW_EMPTY, + FW_TRY, + FW_FINAL, + FW_ERR +}; + +static enum fw_state fw_state = FW_EMPTY; +static int fw_err; +static struct firmware_details fw_8051; +static struct firmware_details fw_fabric; +static struct firmware_details fw_pcie; +static struct firmware_details fw_sbus; + +/* flags for turn_off_spicos() */ +#define SPICO_SBUS 0x1 +#define SPICO_FABRIC 0x2 +#define ENABLE_SPICO_SMASK 0x1 + +/* security block commands */ +#define RSA_CMD_INIT 0x1 +#define RSA_CMD_START 0x2 + +/* security block status */ +#define RSA_STATUS_IDLE 0x0 +#define RSA_STATUS_ACTIVE 0x1 +#define RSA_STATUS_DONE 0x2 +#define RSA_STATUS_FAILED 0x3 + +/* RSA engine timeout, in ms */ +#define RSA_ENGINE_TIMEOUT 100 /* ms */ + +/* hardware mutex timeout, in ms */ +#define HM_TIMEOUT 10 /* ms */ + +/* 8051 memory access timeout, in us */ +#define DC8051_ACCESS_TIMEOUT 100 /* us */ + +/* the number of fabric SerDes on the SBus */ +#define NUM_FABRIC_SERDES 4 + +/* ASIC_STS_SBUS_RESULT.RESULT_CODE value */ +#define SBUS_READ_COMPLETE 0x4 + +/* SBus fabric SerDes addresses, one set per HFI */ +static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = { + { 0x01, 0x02, 0x03, 0x04 }, + { 0x28, 0x29, 0x2a, 0x2b } +}; + +/* SBus PCIe SerDes addresses, one set per HFI */ +static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = { + { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, + 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 }, + { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d, + 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d } +}; + +/* SBus PCIe PCS addresses, one set per HFI */ +const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = { + { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17, + 0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 }, + { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e } +}; + +/* SBus fabric SerDes broadcast addresses, one per HFI */ +static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 }; +static const u8 all_fabric_serdes_broadcast = 0xe1; + +/* SBus PCIe SerDes broadcast addresses, one per HFI */ +const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 }; +static const u8 all_pcie_serdes_broadcast = 0xe0; + +static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = { + 0, + SYSTEM_TABLE_MAX, + PORT_TABLE_MAX, + RX_PRESET_TABLE_MAX, + TX_PRESET_TABLE_MAX, + QSFP_ATTEN_TABLE_MAX, + VARIABLE_SETTINGS_TABLE_MAX +}; + +/* forwards */ +static void dispose_one_firmware(struct firmware_details *fdet); +static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet); +static void dump_fw_version(struct hfi1_devdata *dd); + +/* + * Read a single 64-bit value from 8051 data memory. + * + * Expects: + * o caller to have already set up data read, no auto increment + * o caller to turn off read enable when finished + * + * The address argument is a byte offset. Bits 0:2 in the address are + * ignored - i.e. the hardware will always do aligned 8-byte reads as if + * the lower bits are zero. + * + * Return 0 on success, -ENXIO on a read error (timeout). 
+ */ +static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result) +{ + u64 reg; + int count; + + /* step 1: set the address, clear enable */ + reg = (addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK) + << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg); + /* step 2: enable */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, + reg | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK); + + /* wait until ACCESS_COMPLETED is set */ + count = 0; + while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS) + & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK) + == 0) { + count++; + if (count > DC8051_ACCESS_TIMEOUT) { + dd_dev_err(dd, "timeout reading 8051 data\n"); + return -ENXIO; + } + ndelay(10); + } + + /* gather the data */ + *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA); + + return 0; +} + +/* + * Read 8051 data starting at addr, for len bytes. Will read in 8-byte chunks. + * Return 0 on success, -errno on error. + */ +int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result) +{ + unsigned long flags; + u32 done; + int ret = 0; + + spin_lock_irqsave(&dd->dc8051_memlock, flags); + + /* data read set-up, no auto-increment */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0); + + for (done = 0; done < len; addr += 8, done += 8, result++) { + ret = __read_8051_data(dd, addr, result); + if (ret) + break; + } + + /* turn off read enable */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0); + + spin_unlock_irqrestore(&dd->dc8051_memlock, flags); + + return ret; +} + +/* + * Write data or code to the 8051 code or data RAM. + */ +static int write_8051(struct hfi1_devdata *dd, int code, u32 start, + const u8 *data, u32 len) +{ + u64 reg; + u32 offset; + int aligned, count; + + /* check alignment */ + aligned = ((unsigned long)data & 0x7) == 0; + + /* write set-up */ + reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull) + | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg); + + reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK) + << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT) + | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg); + + /* write */ + for (offset = 0; offset < len; offset += 8) { + int bytes = len - offset; + + if (bytes < 8) { + reg = 0; + memcpy(®, &data[offset], bytes); + } else if (aligned) { + reg = *(u64 *)&data[offset]; + } else { + memcpy(®, &data[offset], 8); + } + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg); + + /* wait until ACCESS_COMPLETED is set */ + count = 0; + while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS) + & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK) + == 0) { + count++; + if (count > DC8051_ACCESS_TIMEOUT) { + dd_dev_err(dd, "timeout writing 8051 data\n"); + return -ENXIO; + } + udelay(1); + } + } + + /* turn off write access, auto increment (also sets to data access) */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0); + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0); + + return 0; +} + +/* return 0 if values match, non-zero and complain otherwise */ +static int invalid_header(struct hfi1_devdata *dd, const char *what, + u32 actual, u32 expected) +{ + if (actual == expected) + return 0; + + dd_dev_err(dd, + "invalid firmware header field %s: expected 0x%x, actual 0x%x\n", + what, expected, actual); + return 1; +} + +/* + * Verify that the static fields in the CSS header match. 
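write_8051() above streams the firmware image to the 8051 RAM eight bytes per CSR write, zero-padding a short tail and copying through memcpy() so an unaligned source buffer is safe. The packing on its own, as a runnable sketch with a dummy 11-byte image:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void pack_qwords(const uint8_t *data, uint32_t len)
{
        uint32_t offset;

        for (offset = 0; offset < len; offset += 8) {
                uint32_t bytes = len - offset;
                uint64_t reg = 0;

                /* Partial tail stays zero-padded, full chunks copy 8 bytes. */
                memcpy(&reg, data + offset, bytes < 8 ? bytes : 8);
                printf("qword %u: 0x%016llx\n", offset / 8,
                       (unsigned long long)reg);
        }
}

int main(void)
{
        const uint8_t img[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };

        pack_qwords(img, sizeof(img));
        return 0;
}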
+ */ +static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css) +{ + /* verify CSS header fields (most sizes are in DW, so add /4) */ + if (invalid_header(dd, "module_type", css->module_type, + CSS_MODULE_TYPE) || + invalid_header(dd, "header_len", css->header_len, + (sizeof(struct firmware_file) / 4)) || + invalid_header(dd, "header_version", css->header_version, + CSS_HEADER_VERSION) || + invalid_header(dd, "module_vendor", css->module_vendor, + CSS_MODULE_VENDOR) || + invalid_header(dd, "key_size", css->key_size, KEY_SIZE / 4) || + invalid_header(dd, "modulus_size", css->modulus_size, + KEY_SIZE / 4) || + invalid_header(dd, "exponent_size", css->exponent_size, + EXPONENT_SIZE / 4)) { + return -EINVAL; + } + return 0; +} + +/* + * Make sure there are at least some bytes after the prefix. + */ +static int payload_check(struct hfi1_devdata *dd, const char *name, + long file_size, long prefix_size) +{ + /* make sure we have some payload */ + if (prefix_size >= file_size) { + dd_dev_err(dd, + "firmware \"%s\", size %ld, must be larger than %ld bytes\n", + name, file_size, prefix_size); + return -EINVAL; + } + + return 0; +} + +/* + * Request the firmware from the system. Extract the pieces and fill in + * fdet. If successful, the caller will need to call dispose_one_firmware(). + * Returns 0 on success, -ERRNO on error. + */ +static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name, + struct firmware_details *fdet) +{ + struct css_header *css; + int ret; + + memset(fdet, 0, sizeof(*fdet)); + + ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev); + if (ret) { + dd_dev_warn(dd, "cannot find firmware \"%s\", err %d\n", + name, ret); + return ret; + } + + /* verify the firmware */ + if (fdet->fw->size < sizeof(struct css_header)) { + dd_dev_err(dd, "firmware \"%s\" is too small\n", name); + ret = -EINVAL; + goto done; + } + css = (struct css_header *)fdet->fw->data; + + hfi1_cdbg(FIRMWARE, "Firmware %s details:", name); + hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size); + hfi1_cdbg(FIRMWARE, "CSS structure:"); + hfi1_cdbg(FIRMWARE, " module_type 0x%x", css->module_type); + hfi1_cdbg(FIRMWARE, " header_len 0x%03x (0x%03x bytes)", + css->header_len, 4 * css->header_len); + hfi1_cdbg(FIRMWARE, " header_version 0x%x", css->header_version); + hfi1_cdbg(FIRMWARE, " module_id 0x%x", css->module_id); + hfi1_cdbg(FIRMWARE, " module_vendor 0x%x", css->module_vendor); + hfi1_cdbg(FIRMWARE, " date 0x%x", css->date); + hfi1_cdbg(FIRMWARE, " size 0x%03x (0x%03x bytes)", + css->size, 4 * css->size); + hfi1_cdbg(FIRMWARE, " key_size 0x%03x (0x%03x bytes)", + css->key_size, 4 * css->key_size); + hfi1_cdbg(FIRMWARE, " modulus_size 0x%03x (0x%03x bytes)", + css->modulus_size, 4 * css->modulus_size); + hfi1_cdbg(FIRMWARE, " exponent_size 0x%03x (0x%03x bytes)", + css->exponent_size, 4 * css->exponent_size); + hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes", + fdet->fw->size - sizeof(struct firmware_file)); + + /* + * If the file does not have a valid CSS header, fail. + * Otherwise, check the CSS size field for an expected size. + * The augmented file has r2 and mu inserted after the header + * was generated, so there will be a known difference between + * the CSS header size and the actual file size. Use this + * difference to identify an augmented file. + * + * Note: css->size is in DWORDs, multiply by 4 to get bytes. 
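The comment above describes how css->size distinguishes a plain firmware file from an augmented one (the augmented layout carries an extra r2[KEY_SIZE] and mu[MU_SIZE], 264 bytes in total). The same check is easy to reproduce offline; a stand-alone checker sketch, assuming a little-endian host so the header can be read in place as the driver does:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the css_header layout defined earlier in this file. */
struct css_header {
        uint32_t module_type;
        uint32_t header_len;
        uint32_t header_version;
        uint32_t module_id;
        uint32_t module_vendor;
        uint32_t date;
        uint32_t size;          /* in DWORDs */
        uint32_t key_size;
        uint32_t modulus_size;
        uint32_t exponent_size;
        uint32_t reserved[22];
};

#define AUGMENT_SIZE (256 + 8)  /* extra r2[] + mu[] in the augmented layout */

int main(int argc, char **argv)
{
        struct css_header css;
        long fsize;
        FILE *f = argc > 1 ? fopen(argv[1], "rb") : NULL;

        if (!f || fread(&css, sizeof(css), 1, f) != 1)
                return 1;
        fseek(f, 0, SEEK_END);
        fsize = ftell(f);
        fclose(f);

        if ((long)css.size * 4 == fsize)
                printf("plain firmware file (%ld bytes)\n", fsize);
        else if ((long)css.size * 4 + AUGMENT_SIZE == fsize)
                printf("augmented firmware file (%ld bytes)\n", fsize);
        else
                printf("css.size does not match the file size\n");
        return 0;
}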
+ */ + ret = verify_css_header(dd, css); + if (ret) { + dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name); + } else if ((css->size * 4) == fdet->fw->size) { + /* non-augmented firmware file */ + struct firmware_file *ff = (struct firmware_file *) + fdet->fw->data; + + /* make sure there are bytes in the payload */ + ret = payload_check(dd, name, fdet->fw->size, + sizeof(struct firmware_file)); + if (ret == 0) { + fdet->css_header = css; + fdet->modulus = ff->modulus; + fdet->exponent = ff->exponent; + fdet->signature = ff->signature; + fdet->r2 = fdet->dummy_header.r2; /* use dummy space */ + fdet->mu = fdet->dummy_header.mu; /* use dummy space */ + fdet->firmware_ptr = ff->firmware; + fdet->firmware_len = fdet->fw->size - + sizeof(struct firmware_file); + /* + * Header does not include r2 and mu - generate here. + * For now, fail. + */ + dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n"); + ret = -EINVAL; + } + } else if ((css->size * 4) + AUGMENT_SIZE == fdet->fw->size) { + /* augmented firmware file */ + struct augmented_firmware_file *aff = + (struct augmented_firmware_file *)fdet->fw->data; + + /* make sure there are bytes in the payload */ + ret = payload_check(dd, name, fdet->fw->size, + sizeof(struct augmented_firmware_file)); + if (ret == 0) { + fdet->css_header = css; + fdet->modulus = aff->modulus; + fdet->exponent = aff->exponent; + fdet->signature = aff->signature; + fdet->r2 = aff->r2; + fdet->mu = aff->mu; + fdet->firmware_ptr = aff->firmware; + fdet->firmware_len = fdet->fw->size - + sizeof(struct augmented_firmware_file); + } + } else { + /* css->size check failed */ + dd_dev_err(dd, + "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n", + fdet->fw->size / 4, + (fdet->fw->size - AUGMENT_SIZE) / 4, + css->size); + + ret = -EINVAL; + } + +done: + /* if returning an error, clean up after ourselves */ + if (ret) + dispose_one_firmware(fdet); + return ret; +} + +static void dispose_one_firmware(struct firmware_details *fdet) +{ + release_firmware(fdet->fw); + /* erase all previous information */ + memset(fdet, 0, sizeof(*fdet)); +} + +/* + * Obtain the 4 firmwares from the OS. All must be obtained at once or not + * at all. If called with the firmware state in FW_TRY, use alternate names. + * On exit, this routine will have set the firmware state to one of FW_TRY, + * FW_FINAL, or FW_ERR. + * + * Must be holding fw_mutex. + */ +static void __obtain_firmware(struct hfi1_devdata *dd) +{ + int err = 0; + + if (fw_state == FW_FINAL) /* nothing more to obtain */ + return; + if (fw_state == FW_ERR) /* already in error */ + return; + + /* fw_state is FW_EMPTY or FW_TRY */ +retry: + if (fw_state == FW_TRY) { + /* + * We tried the original and it failed. Move to the + * alternate. + */ + dd_dev_warn(dd, "using alternate firmware names\n"); + /* + * Let others run. Some systems, when missing firmware, does + * something that holds for 30 seconds. If we do that twice + * in a row it triggers task blocked warning. + */ + cond_resched(); + if (fw_8051_load) + dispose_one_firmware(&fw_8051); + if (fw_fabric_serdes_load) + dispose_one_firmware(&fw_fabric); + if (fw_sbus_load) + dispose_one_firmware(&fw_sbus); + if (fw_pcie_serdes_load) + dispose_one_firmware(&fw_pcie); + fw_8051_name = ALT_FW_8051_NAME_ASIC; + fw_fabric_serdes_name = ALT_FW_FABRIC_NAME; + fw_sbus_name = ALT_FW_SBUS_NAME; + fw_pcie_serdes_name = ALT_FW_PCIE_NAME; + + /* + * Add a delay before obtaining and loading debug firmware. 
+ * Authorization will fail if the delay between firmware + * authorization events is shorter than 50us. Add 100us to + * make a delay time safe. + */ + usleep_range(100, 120); + } + + if (fw_sbus_load) { + err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus); + if (err) + goto done; + } + + if (fw_pcie_serdes_load) { + err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie); + if (err) + goto done; + } + + if (fw_fabric_serdes_load) { + err = obtain_one_firmware(dd, fw_fabric_serdes_name, + &fw_fabric); + if (err) + goto done; + } + + if (fw_8051_load) { + err = obtain_one_firmware(dd, fw_8051_name, &fw_8051); + if (err) + goto done; + } + +done: + if (err) { + /* oops, had problems obtaining a firmware */ + if (fw_state == FW_EMPTY && dd->icode == ICODE_RTL_SILICON) { + /* retry with alternate (RTL only) */ + fw_state = FW_TRY; + goto retry; + } + dd_dev_err(dd, "unable to obtain working firmware\n"); + fw_state = FW_ERR; + fw_err = -ENOENT; + } else { + /* success */ + if (fw_state == FW_EMPTY && + dd->icode != ICODE_FUNCTIONAL_SIMULATOR) + fw_state = FW_TRY; /* may retry later */ + else + fw_state = FW_FINAL; /* cannot try again */ + } +} + +/* + * Called by all HFIs when loading their firmware - i.e. device probe time. + * The first one will do the actual firmware load. Use a mutex to resolve + * any possible race condition. + * + * The call to this routine cannot be moved to driver load because the kernel + * call request_firmware() requires a device which is only available after + * the first device probe. + */ +static int obtain_firmware(struct hfi1_devdata *dd) +{ + unsigned long timeout; + + mutex_lock(&fw_mutex); + + /* 40s delay due to long delay on missing firmware on some systems */ + timeout = jiffies + msecs_to_jiffies(40000); + while (fw_state == FW_TRY) { + /* + * Another device is trying the firmware. Wait until it + * decides what works (or not). + */ + if (time_after(jiffies, timeout)) { + /* waited too long */ + dd_dev_err(dd, "Timeout waiting for firmware try"); + fw_state = FW_ERR; + fw_err = -ETIMEDOUT; + break; + } + mutex_unlock(&fw_mutex); + msleep(20); /* arbitrary delay */ + mutex_lock(&fw_mutex); + } + /* not in FW_TRY state */ + + /* set fw_state to FW_TRY, FW_FINAL, or FW_ERR, and fw_err */ + if (fw_state == FW_EMPTY) + __obtain_firmware(dd); + + mutex_unlock(&fw_mutex); + return fw_err; +} + +/* + * Called when the driver unloads. The timing is asymmetric with its + * counterpart, obtain_firmware(). If called at device remove time, + * then it is conceivable that another device could probe while the + * firmware is being disposed. The mutexes can be moved to do that + * safely, but then the firmware would be requested from the OS multiple + * times. + * + * No mutex is needed as the driver is unloading and there cannot be any + * other callers. + */ +void dispose_firmware(void) +{ + dispose_one_firmware(&fw_8051); + dispose_one_firmware(&fw_fabric); + dispose_one_firmware(&fw_pcie); + dispose_one_firmware(&fw_sbus); + + /* retain the error state, otherwise revert to empty */ + if (fw_state != FW_ERR) + fw_state = FW_EMPTY; +} + +/* + * Called with the result of a firmware download. + * + * Return 1 to retry loading the firmware, 0 to stop. + */ +static int retry_firmware(struct hfi1_devdata *dd, int load_result) +{ + int retry; + + mutex_lock(&fw_mutex); + + if (load_result == 0) { + /* + * The load succeeded, so expect all others to do the same. + * Do not retry again. 
+ */ + if (fw_state == FW_TRY) + fw_state = FW_FINAL; + retry = 0; /* do NOT retry */ + } else if (fw_state == FW_TRY) { + /* load failed, obtain alternate firmware */ + __obtain_firmware(dd); + retry = (fw_state == FW_FINAL); + } else { + /* else in FW_FINAL or FW_ERR, no retry in either case */ + retry = 0; + } + + mutex_unlock(&fw_mutex); + return retry; +} + +/* + * Write a block of data to a given array CSR. All calls will be in + * multiples of 8 bytes. + */ +static void write_rsa_data(struct hfi1_devdata *dd, int what, + const u8 *data, int nbytes) +{ + int qw_size = nbytes / 8; + int i; + + if (((unsigned long)data & 0x7) == 0) { + /* aligned */ + u64 *ptr = (u64 *)data; + + for (i = 0; i < qw_size; i++, ptr++) + write_csr(dd, what + (8 * i), *ptr); + } else { + /* not aligned */ + for (i = 0; i < qw_size; i++, data += 8) { + u64 value; + + memcpy(&value, data, 8); + write_csr(dd, what + (8 * i), value); + } + } +} + +/* + * Write a block of data to a given CSR as a stream of writes. All calls will + * be in multiples of 8 bytes. + */ +static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what, + const u8 *data, int nbytes) +{ + u64 *ptr = (u64 *)data; + int qw_size = nbytes / 8; + + for (; qw_size > 0; qw_size--, ptr++) + write_csr(dd, what, *ptr); +} + +/* + * Download the signature and start the RSA mechanism. Wait for + * RSA_ENGINE_TIMEOUT before giving up. + */ +static int run_rsa(struct hfi1_devdata *dd, const char *who, + const u8 *signature) +{ + unsigned long timeout; + u64 reg; + u32 status; + int ret = 0; + + /* write the signature */ + write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE); + + /* initialize RSA */ + write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT); + + /* + * Make sure the engine is idle and insert a delay between the two + * writes to MISC_CFG_RSA_CMD. + */ + status = (read_csr(dd, MISC_CFG_FW_CTRL) + & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK) + >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT; + if (status != RSA_STATUS_IDLE) { + dd_dev_err(dd, "%s security engine not idle - giving up\n", + who); + return -EBUSY; + } + + /* start RSA */ + write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START); + + /* + * Look for the result. + * + * The RSA engine is hooked up to two MISC errors. The driver + * masks these errors as they do not respond to the standard + * error "clear down" mechanism. Look for these errors here and + * clear them when possible. This routine will exit with the + * errors of the current run still set. + * + * MISC_FW_AUTH_FAILED_ERR + * Firmware authorization failed. This can be cleared by + * re-initializing the RSA engine, then clearing the status bit. + * Do not re-init the RSA angine immediately after a successful + * run - this will reset the current authorization. + * + * MISC_KEY_MISMATCH_ERR + * Key does not match. The only way to clear this is to load + * a matching key then clear the status bit. If this error + * is raised, it will persist outside of this routine until a + * matching key is loaded. 
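+ *
+ * The polling loop below re-reads the RSA status roughly every 20 ms and
+ * gives up after RSA_ENGINE_TIMEOUT, returning -ETIMEDOUT if the engine
+ * is still active at that point.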
+ */ + timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies; + while (1) { + status = (read_csr(dd, MISC_CFG_FW_CTRL) + & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK) + >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT; + + if (status == RSA_STATUS_IDLE) { + /* should not happen */ + dd_dev_err(dd, "%s firmware security bad idle state\n", + who); + ret = -EINVAL; + break; + } else if (status == RSA_STATUS_DONE) { + /* finished successfully */ + break; + } else if (status == RSA_STATUS_FAILED) { + /* finished unsuccessfully */ + ret = -EINVAL; + break; + } + /* else still active */ + + if (time_after(jiffies, timeout)) { + /* + * Timed out while active. We can't reset the engine + * if it is stuck active, but run through the + * error code to see what error bits are set. + */ + dd_dev_err(dd, "%s firmware security time out\n", who); + ret = -ETIMEDOUT; + break; + } + + msleep(20); + } + + /* + * Arrive here on success or failure. Clear all RSA engine + * errors. All current errors will stick - the RSA logic is keeping + * error high. All previous errors will clear - the RSA logic + * is not keeping the error high. + */ + write_csr(dd, MISC_ERR_CLEAR, + MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK | + MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK); + /* + * All that is left are the current errors. Print warnings on + * authorization failure details, if any. Firmware authorization + * can be retried, so these are only warnings. + */ + reg = read_csr(dd, MISC_ERR_STATUS); + if (ret) { + if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK) + dd_dev_warn(dd, "%s firmware authorization failed\n", + who); + if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK) + dd_dev_warn(dd, "%s firmware key mismatch\n", who); + } + + return ret; +} + +static void load_security_variables(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + /* Security variables a. Write the modulus */ + write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE); + /* Security variables b. Write the r2 */ + write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE); + /* Security variables c. Write the mu */ + write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE); + /* Security variables d. Write the header */ + write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD, + (u8 *)fdet->css_header, + sizeof(struct css_header)); +} + +/* return the 8051 firmware state */ +static inline u32 get_firmware_state(struct hfi1_devdata *dd) +{ + u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE); + + return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT) + & DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK; +} + +/* + * Wait until the firmware is up and ready to take host requests. + * Return 0 on success, -ETIMEDOUT on timeout. + */ +int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout) +{ + unsigned long timeout; + + /* in the simulator, the fake 8051 is always ready */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + return 0; + + timeout = msecs_to_jiffies(mstimeout) + jiffies; + while (1) { + if (get_firmware_state(dd) == 0xa0) /* ready */ + return 0; + if (time_after(jiffies, timeout)) /* timed out */ + return -ETIMEDOUT; + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } +} + +/* + * Load the 8051 firmware. 
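+ *
+ * Summary of the steps below: reset the DC8051, load the RSA security
+ * variables, write the firmware image with write_8051(), run the RSA
+ * engine against the image signature, release the 8051 from reset, and
+ * wait for the firmware-ready state before reading back the version.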
+ */ +static int load_8051_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + u64 reg; + int ret; + u8 ver_major; + u8 ver_minor; + u8 ver_patch; + + /* + * DC Reset sequence + * Load DC 8051 firmware + */ + /* + * DC reset step 1: Reset DC8051 + */ + reg = DC_DC8051_CFG_RST_M8051W_SMASK + | DC_DC8051_CFG_RST_CRAM_SMASK + | DC_DC8051_CFG_RST_DRAM_SMASK + | DC_DC8051_CFG_RST_IRAM_SMASK + | DC_DC8051_CFG_RST_SFR_SMASK; + write_csr(dd, DC_DC8051_CFG_RST, reg); + + /* + * DC reset step 2 (optional): Load 8051 data memory with link + * configuration + */ + + /* + * DC reset step 3: Load DC8051 firmware + */ + /* release all but the core reset */ + reg = DC_DC8051_CFG_RST_M8051W_SMASK; + write_csr(dd, DC_DC8051_CFG_RST, reg); + + /* Firmware load step 1 */ + load_security_variables(dd, fdet); + + /* + * Firmware load step 2. Clear MISC_CFG_FW_CTRL.FW_8051_LOADED + */ + write_csr(dd, MISC_CFG_FW_CTRL, 0); + + /* Firmware load steps 3-5 */ + ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr, + fdet->firmware_len); + if (ret) + return ret; + + /* + * DC reset step 4. Host starts the DC8051 firmware + */ + /* + * Firmware load step 6. Set MISC_CFG_FW_CTRL.FW_8051_LOADED + */ + write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK); + + /* Firmware load steps 7-10 */ + ret = run_rsa(dd, "8051", fdet->signature); + if (ret) + return ret; + + /* clear all reset bits, releasing the 8051 */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + + /* + * DC reset step 5. Wait for firmware to be ready to accept host + * requests. + */ + ret = wait_fm_ready(dd, TIMEOUT_8051_START); + if (ret) { /* timed out */ + dd_dev_err(dd, "8051 start timeout, current state 0x%x\n", + get_firmware_state(dd)); + return -ETIMEDOUT; + } + + read_misc_status(dd, &ver_major, &ver_minor, &ver_patch); + dd_dev_info(dd, "8051 firmware version %d.%d.%d\n", + (int)ver_major, (int)ver_minor, (int)ver_patch); + dd->dc8051_ver = dc8051_ver(ver_major, ver_minor, ver_patch); + ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to set host interface version, return 0x%x\n", + ret); + return -EIO; + } + + return 0; +} + +/* + * Write the SBus request register + * + * No need for masking - the arguments are sized exactly. + */ +void sbus_request(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in) +{ + write_csr(dd, ASIC_CFG_SBUS_REQUEST, + ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT) | + ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT) | + ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT) | + ((u64)receiver_addr << + ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT)); +} + +/* + * Read a value from the SBus. 
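+ * The read is indirect: a READ_SBUS_RECEIVER request is issued and
+ * ASIC_STS_SBUS_RESULT is polled (up to ~100 ms) for SBUS_READ_COMPLETE.
+ * On failure the error is logged and 0 is returned.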
+ * + * Requires the caller to be in fast mode + */ +static u32 sbus_read(struct hfi1_devdata *dd, u8 receiver_addr, u8 data_addr, + u32 data_in) +{ + u64 reg; + int retries; + int success = 0; + u32 result = 0; + u32 result_code = 0; + + sbus_request(dd, receiver_addr, data_addr, READ_SBUS_RECEIVER, data_in); + + for (retries = 0; retries < 100; retries++) { + usleep_range(1000, 1200); /* arbitrary */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + result_code = (reg >> ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT) + & ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK; + if (result_code != SBUS_READ_COMPLETE) + continue; + + success = 1; + result = (reg >> ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT) + & ASIC_STS_SBUS_RESULT_DATA_OUT_MASK; + break; + } + + if (!success) { + dd_dev_err(dd, "%s: read failed, result code 0x%x\n", __func__, + result_code); + } + + return result; +} + +/* + * Turn off the SBus and fabric serdes spicos. + * + * + Must be called with Sbus fast mode turned on. + * + Must be called after fabric serdes broadcast is set up. + * + Must be called before the 8051 is loaded - assumes 8051 is not loaded + * when using MISC_CFG_FW_CTRL. + */ +static void turn_off_spicos(struct hfi1_devdata *dd, int flags) +{ + /* only needed on A0 */ + if (!is_ax(dd)) + return; + + dd_dev_info(dd, "Turning off spicos:%s%s\n", + flags & SPICO_SBUS ? " SBus" : "", + flags & SPICO_FABRIC ? " fabric" : ""); + + write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK); + /* disable SBus spico */ + if (flags & SPICO_SBUS) + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01, + WRITE_SBUS_RECEIVER, 0x00000040); + + /* disable the fabric serdes spicos */ + if (flags & SPICO_FABRIC) + sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id], + 0x07, WRITE_SBUS_RECEIVER, 0x00000000); + write_csr(dd, MISC_CFG_FW_CTRL, 0); +} + +/* + * Reset all of the fabric serdes for this HFI in preparation to take the + * link to Polling. + * + * To do a reset, we need to write to to the serdes registers. Unfortunately, + * the fabric serdes download to the other HFI on the ASIC will have turned + * off the firmware validation on this HFI. This means we can't write to the + * registers to reset the serdes. Work around this by performing a complete + * re-download and validation of the fabric serdes firmware. This, as a + * by-product, will reset the serdes. NOTE: the re-download requires that + * the 8051 be in the Offline state. I.e. not actively trying to use the + * serdes. This routine is called at the point where the link is Offline and + * is getting ready to go to Polling. + */ +void fabric_serdes_reset(struct hfi1_devdata *dd) +{ + int ret; + + if (!fw_fabric_serdes_load) + return; + + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, + "Cannot acquire SBus resource to reset fabric SerDes - perhaps you should reboot\n"); + return; + } + set_sbus_fast_mode(dd); + + if (is_ax(dd)) { + /* A0 serdes do not work with a re-download */ + u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; + + /* place SerDes in reset and disable SPICO */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011); + /* wait 100 refclk cycles @ 156.25MHz => 640ns */ + udelay(1); + /* remove SerDes reset */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010); + /* turn SPICO enable on */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002); + } else { + turn_off_spicos(dd, SPICO_FABRIC); + /* + * No need for firmware retry - what to download has already + * been decided. 
+ * No need to pay attention to the load return - the only + * failure is a validation failure, which has already been + * checked by the initial download. + */ + (void)load_fabric_serdes_firmware(dd, &fw_fabric); + } + + clear_sbus_fast_mode(dd); + release_chip_resource(dd, CR_SBUS); +} + +/* Access to the SBus in this routine should probably be serialized */ +int sbus_request_slow(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in) +{ + u64 reg, count = 0; + + /* make sure fast mode is clear */ + clear_sbus_fast_mode(dd); + + sbus_request(dd, receiver_addr, data_addr, command, data_in); + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, + ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK); + /* Wait for both DONE and RCV_DATA_VALID to go high */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) && + (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) { + if (count++ >= SBUS_MAX_POLL_COUNT) { + u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + /* + * If the loop has timed out, we are OK if DONE bit + * is set and RCV_DATA_VALID and EXECUTE counters + * are the same. If not, we cannot proceed. + */ + if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) && + (SBUS_COUNTER(counts, RCV_DATA_VALID) == + SBUS_COUNTER(counts, EXECUTE))) + break; + return -ETIMEDOUT; + } + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + } + count = 0; + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0); + /* Wait for DONE to clear after EXECUTE is cleared */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) { + if (count++ >= SBUS_MAX_POLL_COUNT) + return -ETIME; + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + } + return 0; +} + +static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i, err; + const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */ + + dd_dev_info(dd, "Downloading fabric firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: place SerDes in reset and disable SPICO */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011); + /* wait 100 refclk cycles @ 156.25MHz => 640ns */ + udelay(1); + /* step 3: remove SerDes reset */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010); + /* step 4: assert IMEM override */ + sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000); + /* step 5: download SerDes machine code */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 6: IMEM override off */ + sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000); + /* step 7: turn ECC on */ + sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000); + + /* steps 8-11: run the RSA engine */ + err = run_rsa(dd, "fabric serdes", fdet->signature); + if (err) + return err; + + /* step 12: turn SPICO enable on */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002); + /* step 13: enable core hardware interrupts */ + sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000); + + return 0; +} + +static int load_sbus_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i, err; + const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */ + + dd_dev_info(dd, "Downloading SBus firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: place SPICO into reset and enable off */ + sbus_request(dd, ra, 0x01, 
WRITE_SBUS_RECEIVER, 0x000000c0); + /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240); + /* step 4: set starting IMEM address for burst download */ + sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000); + /* step 5: download the SBus Master machine code */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 6: set IMEM_CNTL_EN off */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040); + /* step 7: turn ECC on */ + sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000); + + /* steps 8-11: run the RSA engine */ + err = run_rsa(dd, "SBus", fdet->signature); + if (err) + return err; + + /* step 12: set SPICO_ENABLE on */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140); + + return 0; +} + +static int load_pcie_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i; + const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */ + + dd_dev_info(dd, "Downloading PCIe firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: assert single step (halts the SBus Master spico) */ + sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001); + /* step 3: enable XDMEM access */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40); + /* step 4: load firmware into SBus Master XDMEM */ + /* + * NOTE: the dmem address, write_en, and wdata are all pre-packed, + * we only need to pick up the bytes and write them + */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 5: disable XDMEM access */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140); + /* step 6: allow SBus Spico to run */ + sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000); + + /* + * steps 7-11: run RSA, if it succeeds, firmware is available to + * be swapped + */ + return run_rsa(dd, "PCIe serdes", fdet->signature); +} + +/* + * Set the given broadcast values on the given list of devices. + */ +static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2, + const u8 *addrs, int count) +{ + while (--count >= 0) { + /* + * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave + * defaults for everything else. Do not read-modify-write, + * per instruction from the manufacturer. + * + * Register 0xfd: + * bits what + * ----- --------------------------------- + * 0 IGNORE_BROADCAST (default 0) + * 11:4 BROADCAST_GROUP_1 (default 0xff) + * 23:16 BROADCAST_GROUP_2 (default 0xff) + */ + sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER, + (u32)bg1 << 4 | (u32)bg2 << 16); + } +} + +int acquire_hw_mutex(struct hfi1_devdata *dd) +{ + unsigned long timeout; + int try = 0; + u8 mask = 1 << dd->hfi1_id; + u8 user = (u8)read_csr(dd, ASIC_CFG_MUTEX); + + if (user == mask) { + dd_dev_info(dd, + "Hardware mutex already acquired, mutex mask %u\n", + (u32)mask); + return 0; + } + +retry: + timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies; + while (1) { + write_csr(dd, ASIC_CFG_MUTEX, mask); + user = (u8)read_csr(dd, ASIC_CFG_MUTEX); + if (user == mask) + return 0; /* success */ + if (time_after(jiffies, timeout)) + break; /* timed out */ + msleep(20); + } + + /* timed out */ + dd_dev_err(dd, + "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n", + (u32)user, (u32)mask, (try == 0) ? 
"retrying" : "giving up"); + + if (try == 0) { + /* break mutex and retry */ + write_csr(dd, ASIC_CFG_MUTEX, 0); + try++; + goto retry; + } + + return -EBUSY; +} + +void release_hw_mutex(struct hfi1_devdata *dd) +{ + u8 mask = 1 << dd->hfi1_id; + u8 user = (u8)read_csr(dd, ASIC_CFG_MUTEX); + + if (user != mask) + dd_dev_warn(dd, + "Unable to release hardware mutex, mutex mask %u, my mask %u\n", + (u32)user, (u32)mask); + else + write_csr(dd, ASIC_CFG_MUTEX, 0); +} + +/* return the given resource bit(s) as a mask for the given HFI */ +static inline u64 resource_mask(u32 hfi1_id, u32 resource) +{ + return ((u64)resource) << (hfi1_id ? CR_DYN_SHIFT : 0); +} + +static void fail_mutex_acquire_message(struct hfi1_devdata *dd, + const char *func) +{ + dd_dev_err(dd, + "%s: hardware mutex stuck - suggest rebooting the machine\n", + func); +} + +/* + * Acquire access to a chip resource. + * + * Return 0 on success, -EBUSY if resource busy, -EIO if mutex acquire failed. + */ +static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource) +{ + u64 scratch0, all_bits, my_bit; + int ret; + + if (resource & CR_DYN_MASK) { + /* a dynamic resource is in use if either HFI has set the bit */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 && + (resource & (CR_I2C1 | CR_I2C2))) { + /* discrete devices must serialize across both chains */ + all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) | + resource_mask(1, CR_I2C1 | CR_I2C2); + } else { + all_bits = resource_mask(0, resource) | + resource_mask(1, resource); + } + my_bit = resource_mask(dd->hfi1_id, resource); + } else { + /* non-dynamic resources are not split between HFIs */ + all_bits = resource; + my_bit = resource; + } + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + ret = acquire_hw_mutex(dd); + if (ret) { + fail_mutex_acquire_message(dd, __func__); + ret = -EIO; + goto done; + } + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if (scratch0 & all_bits) { + ret = -EBUSY; + } else { + write_csr(dd, ASIC_CFG_SCRATCH, scratch0 | my_bit); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + } + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); + return ret; +} + +/* + * Acquire access to a chip resource, wait up to mswait milliseconds for + * the resource to become available. + * + * Return 0 on success, -EBUSY if busy (even after wait), -EIO if mutex + * acquire failed. 
+ */ +int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait) +{ + unsigned long timeout; + int ret; + + timeout = jiffies + msecs_to_jiffies(mswait); + while (1) { + ret = __acquire_chip_resource(dd, resource); + if (ret != -EBUSY) + return ret; + /* resource is busy, check our timeout */ + if (time_after_eq(jiffies, timeout)) + return -EBUSY; + usleep_range(80, 120); /* arbitrary delay */ + } +} + +/* + * Release access to a chip resource + */ +void release_chip_resource(struct hfi1_devdata *dd, u32 resource) +{ + u64 scratch0, bit; + + /* only dynamic resources should ever be cleared */ + if (!(resource & CR_DYN_MASK)) { + dd_dev_err(dd, "%s: invalid resource 0x%x\n", __func__, + resource); + return; + } + bit = resource_mask(dd->hfi1_id, resource); + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + if (acquire_hw_mutex(dd)) { + fail_mutex_acquire_message(dd, __func__); + goto done; + } + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if ((scratch0 & bit) != 0) { + scratch0 &= ~bit; + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + } else { + dd_dev_warn(dd, "%s: id %d, resource 0x%x: bit not set\n", + __func__, dd->hfi1_id, resource); + } + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); +} + +/* + * Return true if resource is set, false otherwise. Print a warning + * if not set and a function is supplied. + */ +bool check_chip_resource(struct hfi1_devdata *dd, u32 resource, + const char *func) +{ + u64 scratch0, bit; + + if (resource & CR_DYN_MASK) + bit = resource_mask(dd->hfi1_id, resource); + else + bit = resource; + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if ((scratch0 & bit) == 0) { + if (func) + dd_dev_warn(dd, + "%s: id %d, resource 0x%x, not acquired!\n", + func, dd->hfi1_id, resource); + return false; + } + return true; +} + +static void clear_chip_resources(struct hfi1_devdata *dd, const char *func) +{ + u64 scratch0; + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + if (acquire_hw_mutex(dd)) { + fail_mutex_acquire_message(dd, func); + goto done; + } + + /* clear all dynamic access bits for this HFI */ + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + scratch0 &= ~resource_mask(dd->hfi1_id, CR_DYN_MASK); + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); +} + +void init_chip_resources(struct hfi1_devdata *dd) +{ + /* clear any holds left by us */ + clear_chip_resources(dd, __func__); +} + +void finish_chip_resources(struct hfi1_devdata *dd) +{ + /* clear any holds left by us */ + clear_chip_resources(dd, __func__); +} + +void set_sbus_fast_mode(struct hfi1_devdata *dd) +{ + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, + ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK); +} + +void clear_sbus_fast_mode(struct hfi1_devdata *dd) +{ + u64 reg, count = 0; + + reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + while (SBUS_COUNTER(reg, EXECUTE) != + SBUS_COUNTER(reg, RCV_DATA_VALID)) { + if (count++ >= SBUS_MAX_POLL_COUNT) + break; + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + } + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0); +} + +int load_firmware(struct hfi1_devdata *dd) +{ + int ret; + + if 
(fw_fabric_serdes_load) { + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) + return ret; + + set_sbus_fast_mode(dd); + + set_serdes_broadcast(dd, all_fabric_serdes_broadcast, + fabric_serdes_broadcast[dd->hfi1_id], + fabric_serdes_addrs[dd->hfi1_id], + NUM_FABRIC_SERDES); + turn_off_spicos(dd, SPICO_FABRIC); + do { + ret = load_fabric_serdes_firmware(dd, &fw_fabric); + } while (retry_firmware(dd, ret)); + + clear_sbus_fast_mode(dd); + release_chip_resource(dd, CR_SBUS); + if (ret) + return ret; + } + + if (fw_8051_load) { + do { + ret = load_8051_firmware(dd, &fw_8051); + } while (retry_firmware(dd, ret)); + if (ret) + return ret; + } + + dump_fw_version(dd); + return 0; +} + +int hfi1_firmware_init(struct hfi1_devdata *dd) +{ + /* only RTL can use these */ + if (dd->icode != ICODE_RTL_SILICON) { + fw_fabric_serdes_load = 0; + fw_pcie_serdes_load = 0; + fw_sbus_load = 0; + } + + /* no 8051 or QSFP on simulator */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + fw_8051_load = 0; + + if (!fw_8051_name) { + if (dd->icode == ICODE_RTL_SILICON) + fw_8051_name = DEFAULT_FW_8051_NAME_ASIC; + else + fw_8051_name = DEFAULT_FW_8051_NAME_FPGA; + } + if (!fw_fabric_serdes_name) + fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME; + if (!fw_sbus_name) + fw_sbus_name = DEFAULT_FW_SBUS_NAME; + if (!fw_pcie_serdes_name) + fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME; + + return obtain_firmware(dd); +} + +/* + * This function is a helper function for parse_platform_config(...) and + * does not check for validity of the platform configuration cache + * (because we know it is invalid as we are building up the cache). + * As such, this should not be called from anywhere other than + * parse_platform_config + */ +static int check_meta_version(struct hfi1_devdata *dd, u32 *system_table) +{ + u32 meta_ver, meta_ver_meta, ver_start, ver_len, mask; + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + + if (!system_table) + return -EINVAL; + + meta_ver_meta = + *(pcfgcache->config_tables[PLATFORM_CONFIG_SYSTEM_TABLE].table_metadata + + SYSTEM_TABLE_META_VERSION); + + mask = ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1); + ver_start = meta_ver_meta & mask; + + meta_ver_meta >>= METADATA_TABLE_FIELD_LEN_SHIFT; + + mask = ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1); + ver_len = meta_ver_meta & mask; + + ver_start /= 8; + meta_ver = *((u8 *)system_table + ver_start) & ((1 << ver_len) - 1); + + if (meta_ver < 4) { + dd_dev_info( + dd, "%s:Please update platform config\n", __func__); + return -EINVAL; + } + return 0; +} + +int parse_platform_config(struct hfi1_devdata *dd) +{ + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + struct hfi1_pportdata *ppd = dd->pport; + u32 *ptr = NULL; + u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0, file_length = 0; + u32 record_idx = 0, table_type = 0, table_length_dwords = 0; + int ret = -EINVAL; /* assume failure */ + + /* + * For integrated devices that did not fall back to the default file, + * the SI tuning information for active channels is acquired from the + * scratch register bitmap, thus there is no platform config to parse. + * Skip parsing in these situations. 
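+ *
+ * For all other cases the cache is built directly from the file image:
+ * the magic number and file length are checked first, then each table is
+ * read from its (header, ~header) record pair, classified as a data or
+ * metadata table, and verified against its trailing CRC32 before the
+ * cache is marked valid.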
+ */ + if (ppd->config_from_scratch) + return 0; + + if (!dd->platform_config.data) { + dd_dev_err(dd, "%s: Missing config file\n", __func__); + goto bail; + } + ptr = (u32 *)dd->platform_config.data; + + magic_num = *ptr; + ptr++; + if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) { + dd_dev_err(dd, "%s: Bad config file\n", __func__); + goto bail; + } + + /* Field is file size in DWORDs */ + file_length = (*ptr) * 4; + + /* + * Length can't be larger than partition size. Assume platform + * config format version 4 is being used. Interpret the file size + * field as header instead by not moving the pointer. + */ + if (file_length > MAX_PLATFORM_CONFIG_FILE_SIZE) { + dd_dev_info(dd, + "%s:File length out of bounds, using alternative format\n", + __func__); + file_length = PLATFORM_CONFIG_FORMAT_4_FILE_SIZE; + } else { + ptr++; + } + + if (file_length > dd->platform_config.size) { + dd_dev_info(dd, "%s:File claims to be larger than read size\n", + __func__); + goto bail; + } else if (file_length < dd->platform_config.size) { + dd_dev_info(dd, + "%s:File claims to be smaller than read size, continuing\n", + __func__); + } + /* exactly equal, perfection */ + + /* + * In both cases where we proceed, using the self-reported file length + * is the safer option. In case of old format a predefined value is + * being used. + */ + while (ptr < (u32 *)(dd->platform_config.data + file_length)) { + header1 = *ptr; + header2 = *(ptr + 1); + if (header1 != ~header2) { + dd_dev_err(dd, "%s: Failed validation at offset %ld\n", + __func__, (ptr - (u32 *) + dd->platform_config.data)); + goto bail; + } + + record_idx = *ptr & + ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1); + + table_length_dwords = (*ptr >> + PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) & + ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1); + + table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) & + ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1); + + /* Done with this set of headers */ + ptr += 2; + + if (record_idx) { + /* data table */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + pcfgcache->config_tables[table_type].num_table = + 1; + ret = check_meta_version(dd, ptr); + if (ret) + goto bail; + break; + case PLATFORM_CONFIG_PORT_TABLE: + pcfgcache->config_tables[table_type].num_table = + 2; + break; + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + pcfgcache->config_tables[table_type].num_table = + table_length_dwords; + break; + default: + dd_dev_err(dd, + "%s: Unknown data table %d, offset %ld\n", + __func__, table_type, + (ptr - (u32 *) + dd->platform_config.data)); + goto bail; /* We don't trust this file now */ + } + pcfgcache->config_tables[table_type].table = ptr; + } else { + /* metadata table */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + /* fall through */ + case PLATFORM_CONFIG_PORT_TABLE: + /* fall through */ + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + break; + default: + dd_dev_err(dd, + "%s: Unknown meta table %d, offset %ld\n", + __func__, table_type, + (ptr - + (u32 *)dd->platform_config.data)); + goto bail; /* We don't trust this file now */ + } + 
pcfgcache->config_tables[table_type].table_metadata = + ptr; + } + + /* Calculate and check table crc */ + crc = crc32_le(~(u32)0, (unsigned char const *)ptr, + (table_length_dwords * 4)); + crc ^= ~(u32)0; + + /* Jump the table */ + ptr += table_length_dwords; + if (crc != *ptr) { + dd_dev_err(dd, "%s: Failed CRC check at offset %ld\n", + __func__, (ptr - + (u32 *)dd->platform_config.data)); + goto bail; + } + /* Jump the CRC DWORD */ + ptr++; + } + + pcfgcache->cache_valid = 1; + return 0; +bail: + memset(pcfgcache, 0, sizeof(struct platform_config_cache)); + return ret; +} + +static void get_integrated_platform_config_field( + struct hfi1_devdata *dd, + enum platform_config_table_type_encoding table_type, + int field_index, u32 *data) +{ + struct hfi1_pportdata *ppd = dd->pport; + u8 *cache = ppd->qsfp_info.cache; + u32 tx_preset = 0; + + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + if (field_index == SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) + *data = ppd->max_power_class; + else if (field_index == SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G) + *data = ppd->default_atten; + break; + case PLATFORM_CONFIG_PORT_TABLE: + if (field_index == PORT_TABLE_PORT_TYPE) + *data = ppd->port_type; + else if (field_index == PORT_TABLE_LOCAL_ATTEN_25G) + *data = ppd->local_atten; + else if (field_index == PORT_TABLE_REMOTE_ATTEN_25G) + *data = ppd->remote_atten; + break; + case PLATFORM_CONFIG_RX_PRESET_TABLE: + if (field_index == RX_PRESET_TABLE_QSFP_RX_CDR_APPLY) + *data = (ppd->rx_preset & QSFP_RX_CDR_APPLY_SMASK) >> + QSFP_RX_CDR_APPLY_SHIFT; + else if (field_index == RX_PRESET_TABLE_QSFP_RX_EMP_APPLY) + *data = (ppd->rx_preset & QSFP_RX_EMP_APPLY_SMASK) >> + QSFP_RX_EMP_APPLY_SHIFT; + else if (field_index == RX_PRESET_TABLE_QSFP_RX_AMP_APPLY) + *data = (ppd->rx_preset & QSFP_RX_AMP_APPLY_SMASK) >> + QSFP_RX_AMP_APPLY_SHIFT; + else if (field_index == RX_PRESET_TABLE_QSFP_RX_CDR) + *data = (ppd->rx_preset & QSFP_RX_CDR_SMASK) >> + QSFP_RX_CDR_SHIFT; + else if (field_index == RX_PRESET_TABLE_QSFP_RX_EMP) + *data = (ppd->rx_preset & QSFP_RX_EMP_SMASK) >> + QSFP_RX_EMP_SHIFT; + else if (field_index == RX_PRESET_TABLE_QSFP_RX_AMP) + *data = (ppd->rx_preset & QSFP_RX_AMP_SMASK) >> + QSFP_RX_AMP_SHIFT; + break; + case PLATFORM_CONFIG_TX_PRESET_TABLE: + if (cache[QSFP_EQ_INFO_OFFS] & 0x4) + tx_preset = ppd->tx_preset_eq; + else + tx_preset = ppd->tx_preset_noeq; + if (field_index == TX_PRESET_TABLE_PRECUR) + *data = (tx_preset & TX_PRECUR_SMASK) >> + TX_PRECUR_SHIFT; + else if (field_index == TX_PRESET_TABLE_ATTN) + *data = (tx_preset & TX_ATTN_SMASK) >> + TX_ATTN_SHIFT; + else if (field_index == TX_PRESET_TABLE_POSTCUR) + *data = (tx_preset & TX_POSTCUR_SMASK) >> + TX_POSTCUR_SHIFT; + else if (field_index == TX_PRESET_TABLE_QSFP_TX_CDR_APPLY) + *data = (tx_preset & QSFP_TX_CDR_APPLY_SMASK) >> + QSFP_TX_CDR_APPLY_SHIFT; + else if (field_index == TX_PRESET_TABLE_QSFP_TX_EQ_APPLY) + *data = (tx_preset & QSFP_TX_EQ_APPLY_SMASK) >> + QSFP_TX_EQ_APPLY_SHIFT; + else if (field_index == TX_PRESET_TABLE_QSFP_TX_CDR) + *data = (tx_preset & QSFP_TX_CDR_SMASK) >> + QSFP_TX_CDR_SHIFT; + else if (field_index == TX_PRESET_TABLE_QSFP_TX_EQ) + *data = (tx_preset & QSFP_TX_EQ_SMASK) >> + QSFP_TX_EQ_SHIFT; + break; + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + default: + break; + } +} + +static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table, + int field, u32 *field_len_bits, + u32 *field_start_bits) +{ + struct platform_config_cache *pcfgcache = 
&dd->pcfg_cache; + u32 *src_ptr = NULL; + + if (!pcfgcache->cache_valid) + return -EINVAL; + + switch (table) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + /* fall through */ + case PLATFORM_CONFIG_PORT_TABLE: + /* fall through */ + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + if (field && field < platform_config_table_limits[table]) + src_ptr = + pcfgcache->config_tables[table].table_metadata + field; + break; + default: + dd_dev_info(dd, "%s: Unknown table\n", __func__); + break; + } + + if (!src_ptr) + return -EINVAL; + + if (field_start_bits) + *field_start_bits = *src_ptr & + ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1); + + if (field_len_bits) + *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT) + & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1); + + return 0; +} + +/* This is the central interface to getting data out of the platform config + * file. It depends on parse_platform_config() having populated the + * platform_config_cache in hfi1_devdata, and checks the cache_valid member to + * validate the sanity of the cache. + * + * The non-obvious parameters: + * @table_index: Acts as a look up key into which instance of the tables the + * relevant field is fetched from. + * + * This applies to the data tables that have multiple instances. The port table + * is an exception to this rule as each HFI only has one port and thus the + * relevant table can be distinguished by hfi_id. + * + * @data: pointer to memory that will be populated with the field requested. + * @len: length of memory pointed by @data in bytes. + */ +int get_platform_config_field(struct hfi1_devdata *dd, + enum platform_config_table_type_encoding + table_type, int table_index, int field_index, + u32 *data, u32 len) +{ + int ret = 0, wlen = 0, seek = 0; + u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL; + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + struct hfi1_pportdata *ppd = dd->pport; + + if (data) + memset(data, 0, len); + else + return -EINVAL; + + if (ppd->config_from_scratch) { + /* + * Use saved configuration from ppd for integrated platforms + */ + get_integrated_platform_config_field(dd, table_type, + field_index, data); + return 0; + } + + ret = get_platform_fw_field_metadata(dd, table_type, field_index, + &field_len_bits, + &field_start_bits); + if (ret) + return -EINVAL; + + /* Convert length to bits */ + len *= 8; + + /* Our metadata function checked cache_valid and field_index for us */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + src_ptr = pcfgcache->config_tables[table_type].table; + + if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) { + if (len < field_len_bits) + return -EINVAL; + + seek = field_start_bits / 8; + wlen = field_len_bits / 8; + + src_ptr = (u32 *)((u8 *)src_ptr + seek); + + /* + * We expect the field to be byte aligned and whole byte + * lengths if we are here + */ + memcpy(data, src_ptr, wlen); + return 0; + } + break; + case PLATFORM_CONFIG_PORT_TABLE: + /* Port table is 4 DWORDS */ + src_ptr = dd->hfi1_id ? 
+ pcfgcache->config_tables[table_type].table + 4 : + pcfgcache->config_tables[table_type].table; + break; + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + src_ptr = pcfgcache->config_tables[table_type].table; + + if (table_index < + pcfgcache->config_tables[table_type].num_table) + src_ptr += table_index; + else + src_ptr = NULL; + break; + default: + dd_dev_info(dd, "%s: Unknown table\n", __func__); + break; + } + + if (!src_ptr || len < field_len_bits) + return -EINVAL; + + src_ptr += (field_start_bits / 32); + *data = (*src_ptr >> (field_start_bits % 32)) & + ((1 << field_len_bits) - 1); + + return 0; +} + +/* + * Download the firmware needed for the Gen3 PCIe SerDes. An update + * to the SBus firmware is needed before updating the PCIe firmware. + * + * Note: caller must be holding the SBus resource. + */ +int load_pcie_firmware(struct hfi1_devdata *dd) +{ + int ret = 0; + + /* both firmware loads below use the SBus */ + set_sbus_fast_mode(dd); + + if (fw_sbus_load) { + turn_off_spicos(dd, SPICO_SBUS); + do { + ret = load_sbus_firmware(dd, &fw_sbus); + } while (retry_firmware(dd, ret)); + if (ret) + goto done; + } + + if (fw_pcie_serdes_load) { + dd_dev_info(dd, "Setting PCIe SerDes broadcast\n"); + set_serdes_broadcast(dd, all_pcie_serdes_broadcast, + pcie_serdes_broadcast[dd->hfi1_id], + pcie_serdes_addrs[dd->hfi1_id], + NUM_PCIE_SERDES); + do { + ret = load_pcie_serdes_firmware(dd, &fw_pcie); + } while (retry_firmware(dd, ret)); + if (ret) + goto done; + } + +done: + clear_sbus_fast_mode(dd); + + return ret; +} + +/* + * Read the GUID from the hardware, store it in dd. 
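+ * The DC block is taken out of reset first so that
+ * DC_DC8051_CFG_LOCAL_GUID returns a valid value.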
+ */ +void read_guid(struct hfi1_devdata *dd) +{ + /* Take the DC out of reset to get a valid GUID value */ + write_csr(dd, CCE_DC_CTRL, 0); + (void)read_csr(dd, CCE_DC_CTRL); + + dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID); + dd_dev_info(dd, "GUID %llx", + (unsigned long long)dd->base_guid); +} + +/* read and display firmware version info */ +static void dump_fw_version(struct hfi1_devdata *dd) +{ + u32 pcie_vers[NUM_PCIE_SERDES]; + u32 fabric_vers[NUM_FABRIC_SERDES]; + u32 sbus_vers; + int i; + int all_same; + int ret; + u8 rcv_addr; + + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, "Unable to acquire SBus to read firmware versions\n"); + return; + } + + /* set fast mode */ + set_sbus_fast_mode(dd); + + /* read version for SBus Master */ + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x02, WRITE_SBUS_RECEIVER, 0); + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x07, WRITE_SBUS_RECEIVER, 0x1); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + sbus_vers = sbus_read(dd, SBUS_MASTER_BROADCAST, 0x08, 0x1); + dd_dev_info(dd, "SBus Master firmware version 0x%08x\n", sbus_vers); + + /* read version for PCIe SerDes */ + all_same = 1; + pcie_vers[0] = 0; + for (i = 0; i < NUM_PCIE_SERDES; i++) { + rcv_addr = pcie_serdes_addrs[dd->hfi1_id][i]; + sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + pcie_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0); + if (i > 0 && pcie_vers[0] != pcie_vers[i]) + all_same = 0; + } + + if (all_same) { + dd_dev_info(dd, "PCIe SerDes firmware version 0x%x\n", + pcie_vers[0]); + } else { + dd_dev_warn(dd, "PCIe SerDes do not have the same firmware version\n"); + for (i = 0; i < NUM_PCIE_SERDES; i++) { + dd_dev_info(dd, + "PCIe SerDes lane %d firmware version 0x%x\n", + i, pcie_vers[i]); + } + } + + /* read version for fabric SerDes */ + all_same = 1; + fabric_vers[0] = 0; + for (i = 0; i < NUM_FABRIC_SERDES; i++) { + rcv_addr = fabric_serdes_addrs[dd->hfi1_id][i]; + sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0); + /* wait for interrupt to be processed */ + usleep_range(10000, 11000); + fabric_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0); + if (i > 0 && fabric_vers[0] != fabric_vers[i]) + all_same = 0; + } + + if (all_same) { + dd_dev_info(dd, "Fabric SerDes firmware version 0x%x\n", + fabric_vers[0]); + } else { + dd_dev_warn(dd, "Fabric SerDes do not have the same firmware version\n"); + for (i = 0; i < NUM_FABRIC_SERDES; i++) { + dd_dev_info(dd, + "Fabric SerDes lane %d firmware version 0x%x\n", + i, fabric_vers[i]); + } + } + + clear_sbus_fast_mode(dd); + release_chip_resource(dd, CR_SBUS); +} diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h new file mode 100644 index 0000000..cac2c62 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -0,0 +1,2448 @@ +#ifndef _HFI1_KERNEL_H +#define _HFI1_KERNEL_H +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "chip_registers.h" +#include "common.h" +#include "verbs.h" +#include "pio.h" +#include "chip.h" +#include "mad.h" +#include "qsfp.h" +#include "platform.h" +#include "affinity.h" + +/* bumped 1 from s/w major version of TrueScale */ +#define HFI1_CHIP_VERS_MAJ 3U + +/* don't care about this except printing */ +#define HFI1_CHIP_VERS_MIN 0U + +/* The Organization Unique Identifier (Mfg code), and its position in GUID */ +#define HFI1_OUI 0x001175 +#define HFI1_OUI_LSB 40 + +#define DROP_PACKET_OFF 0 +#define DROP_PACKET_ON 1 + +#define NEIGHBOR_TYPE_HFI 0 +#define NEIGHBOR_TYPE_SWITCH 1 + +extern unsigned long hfi1_cap_mask; +#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap) +#define HFI1_CAP_UGET_MASK(mask, cap) \ + (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap) +#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap)) +#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap)) +#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap)) +#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap)) +#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \ + HFI1_CAP_MISC_MASK) +/* Offline Disabled Reason is 4-bits */ +#define HFI1_ODR_MASK(rsn) ((rsn) & OPA_PI_MASK_OFFLINE_REASON) + +/* + * Control context is always 0 and handles the error packets. + * It also handles the VL15 and multicast packets. 
+ */ +#define HFI1_CTRL_CTXT 0 + +/* + * Driver context will store software counters for each of the events + * associated with these status registers + */ +#define NUM_CCE_ERR_STATUS_COUNTERS 41 +#define NUM_RCV_ERR_STATUS_COUNTERS 64 +#define NUM_MISC_ERR_STATUS_COUNTERS 13 +#define NUM_SEND_PIO_ERR_STATUS_COUNTERS 36 +#define NUM_SEND_DMA_ERR_STATUS_COUNTERS 4 +#define NUM_SEND_EGRESS_ERR_STATUS_COUNTERS 64 +#define NUM_SEND_ERR_STATUS_COUNTERS 3 +#define NUM_SEND_CTXT_ERR_STATUS_COUNTERS 5 +#define NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS 24 + +/* + * per driver stats, either not device nor port-specific, or + * summed over all of the devices and ports. + * They are described by name via ipathfs filesystem, so layout + * and number of elements can change without breaking compatibility. + * If members are added or deleted hfi1_statnames[] in debugfs.c must + * change to match. + */ +struct hfi1_ib_stats { + __u64 sps_ints; /* number of interrupts handled */ + __u64 sps_errints; /* number of error interrupts */ + __u64 sps_txerrs; /* tx-related packet errors */ + __u64 sps_rcverrs; /* non-crc rcv packet errors */ + __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */ + __u64 sps_nopiobufs; /* no pio bufs avail from kernel */ + __u64 sps_ctxts; /* number of contexts currently open */ + __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */ + __u64 sps_buffull; + __u64 sps_hdrfull; +}; + +extern struct hfi1_ib_stats hfi1_stats; +extern const struct pci_error_handlers hfi1_pci_err_handler; + +/* + * First-cut criterion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000) + +/* + * Below contains all data related to a single context (formerly called port). + */ + +struct hfi1_opcode_stats_perctx; + +struct ctxt_eager_bufs { + ssize_t size; /* total size of eager buffers */ + u32 count; /* size of buffers array */ + u32 numbufs; /* number of buffers allocated */ + u32 alloced; /* number of rcvarray entries used */ + u32 rcvtid_size; /* size of each eager rcv tid */ + u32 threshold; /* head update threshold */ + struct eager_buffer { + void *addr; + dma_addr_t dma; + ssize_t len; + } *buffers; + struct { + void *addr; + dma_addr_t dma; + } *rcvtids; +}; + +struct exp_tid_set { + struct list_head list; + u32 count; +}; + +struct hfi1_ctxtdata { + /* shadow the ctxt's RcvCtrl register */ + u64 rcvctrl; + /* rcvhdrq base, needs mmap before useful */ + void *rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + volatile __le64 *rcvhdrtail_kvaddr; + /* when waiting for rcv or pioavail */ + wait_queue_head_t wait; + /* rcvhdrq size (for freeing) */ + size_t rcvhdrq_size; + /* number of rcvhdrq entries */ + u16 rcvhdrq_cnt; + /* size of each of the rcvhdrq entries */ + u16 rcvhdrqentsize; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t rcvhdrq_dma; + dma_addr_t rcvhdrqtailaddr_dma; + struct ctxt_eager_bufs egrbufs; + /* this receive context's assigned PIO ACK send context */ + struct send_context *sc; + + /* dynamic receive available interrupt timeout */ + u32 rcvavail_timeout; + /* Reference count the base context usage */ + struct kref kref; + + /* Device context index */ + u16 ctxt; + /* + * non-zero if ctxt can be shared, and defines the maximum number of + * sub-contexts for this device context. + */ + u16 subctxt_cnt; + /* non-zero if ctxt is being shared. 
*/ + u16 subctxt_id; + u8 uuid[16]; + /* job key */ + u16 jkey; + /* number of RcvArray groups for this context. */ + u32 rcv_array_groups; + /* index of first eager TID entry. */ + u32 eager_base; + /* number of expected TID entries */ + u32 expected_count; + /* index of first expected TID entry. */ + u32 expected_base; + + struct exp_tid_set tid_group_list; + struct exp_tid_set tid_used_list; + struct exp_tid_set tid_full_list; + + /* lock protecting all Expected TID data */ + struct mutex exp_lock; + /* per-context configuration flags */ + unsigned long flags; + /* per-context event flags for fileops/intr communication */ + unsigned long event_flags; + /* total number of polled urgent packets */ + u32 urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 urgent_poll; + /* same size as task_struct .comm[], command that opened context */ + char comm[TASK_COMM_LEN]; + /* so file ops can get at unit */ + struct hfi1_devdata *dd; + /* so functions that need physical port can get it easily */ + struct hfi1_pportdata *ppd; + /* associated msix interrupt */ + u32 msix_intr; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subctxt_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subctxt_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subctxt_rcvhdr_base; + /* Bitmask of in use context(s) */ + DECLARE_BITMAP(in_use_ctxts, HFI1_MAX_SHARED_CTXTS); + /* The version of the library which opened this ctxt */ + u32 userversion; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* receive packet sequence counter */ + u8 seq_cnt; + /* ctxt rcvhdrq head offset */ + u32 head; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; + /* interrupt handling */ + u64 imask; /* clear interrupt mask */ + int ireg; /* clear interrupt register */ + unsigned numa_id; /* numa node of this context */ + /* verbs rx_stats per rcd */ + struct hfi1_opcode_stats_perctx *opstats; + + /* Is ASPM interrupt supported for this context */ + bool aspm_intr_supported; + /* ASPM state (enabled/disabled) for this context */ + bool aspm_enabled; + /* Timer for re-enabling ASPM if interrupt activity quietens down */ + struct timer_list aspm_timer; + /* Lock to serialize between intr, timer intr and user threads */ + spinlock_t aspm_lock; + /* Is ASPM processing enabled for this context (in intr context) */ + bool aspm_intr_enable; + /* Last interrupt timestamp */ + ktime_t aspm_ts_last_intr; + /* Last timestamp at which we scheduled a timer for this context */ + ktime_t aspm_ts_timer_sched; + + /* + * The interrupt handler for a particular receive context can vary + * throughout it's lifetime. This is not a lock protected data member so + * it must be updated atomically and the prev and new value must always + * be valid. Worst case is we process an extra interrupt and up to 64 + * packets with the wrong interrupt handler. + */ + int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded); + + /* Indicates that this is vnic context */ + bool is_vnic; + + /* vnic queue index this context is mapped to */ + u8 vnic_q_idx; +}; + +/* + * Represents a single packet at a high level. Put commonly computed things in + * here so we do not have to keep doing them over and over. The rule of thumb is + * if something is used one time to derive some value, store that something in + * here. If it is used multiple times, then store the result of that derivation + * in here. 
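+ * For example, the dlid, slid, sc, and sl fields below hold values
+ * already extracted from the packet headers so the receive handlers do
+ * not need to re-derive them.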
+ */ +struct hfi1_packet { + void *ebuf; + void *hdr; + void *payload; + struct hfi1_ctxtdata *rcd; + __le32 *rhf_addr; + struct rvt_qp *qp; + struct ib_other_headers *ohdr; + struct ib_grh *grh; + u64 rhf; + u32 maxcnt; + u32 rhqoff; + u32 dlid; + u32 slid; + u16 tlen; + s16 etail; + u16 pkey; + u8 hlen; + u8 numpkt; + u8 rsize; + u8 updegr; + u8 etype; + u8 extra_byte; + u8 pad; + u8 sc; + u8 sl; + u8 opcode; + bool migrated; +}; + +/* Packet types */ +#define HFI1_PKT_TYPE_9B 0 +#define HFI1_PKT_TYPE_16B 1 + +/* + * OPA 16B Header + */ +#define OPA_16B_L4_MASK 0xFFull +#define OPA_16B_SC_MASK 0x1F00000ull +#define OPA_16B_SC_SHIFT 20 +#define OPA_16B_LID_MASK 0xFFFFFull +#define OPA_16B_DLID_MASK 0xF000ull +#define OPA_16B_DLID_SHIFT 20 +#define OPA_16B_DLID_HIGH_SHIFT 12 +#define OPA_16B_SLID_MASK 0xF00ull +#define OPA_16B_SLID_SHIFT 20 +#define OPA_16B_SLID_HIGH_SHIFT 8 +#define OPA_16B_BECN_MASK 0x80000000ull +#define OPA_16B_BECN_SHIFT 31 +#define OPA_16B_FECN_MASK 0x10000000ull +#define OPA_16B_FECN_SHIFT 28 +#define OPA_16B_L2_MASK 0x60000000ull +#define OPA_16B_L2_SHIFT 29 +#define OPA_16B_PKEY_MASK 0xFFFF0000ull +#define OPA_16B_PKEY_SHIFT 16 +#define OPA_16B_LEN_MASK 0x7FF00000ull +#define OPA_16B_LEN_SHIFT 20 +#define OPA_16B_RC_MASK 0xE000000ull +#define OPA_16B_RC_SHIFT 25 +#define OPA_16B_AGE_MASK 0xFF0000ull +#define OPA_16B_AGE_SHIFT 16 +#define OPA_16B_ENTROPY_MASK 0xFFFFull + +/* + * OPA 16B L2/L4 Encodings + */ +#define OPA_16B_L4_9B 0x00 +#define OPA_16B_L2_TYPE 0x02 +#define OPA_16B_L4_IB_LOCAL 0x09 +#define OPA_16B_L4_IB_GLOBAL 0x0A +#define OPA_16B_L4_ETHR OPA_VNIC_L4_ETHR + +static inline u8 hfi1_16B_get_l4(struct hfi1_16b_header *hdr) +{ + return (u8)(hdr->lrh[2] & OPA_16B_L4_MASK); +} + +static inline u8 hfi1_16B_get_sc(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_SC_MASK) >> OPA_16B_SC_SHIFT); +} + +static inline u32 hfi1_16B_get_dlid(struct hfi1_16b_header *hdr) +{ + return (u32)((hdr->lrh[1] & OPA_16B_LID_MASK) | + (((hdr->lrh[2] & OPA_16B_DLID_MASK) >> + OPA_16B_DLID_HIGH_SHIFT) << OPA_16B_DLID_SHIFT)); +} + +static inline u32 hfi1_16B_get_slid(struct hfi1_16b_header *hdr) +{ + return (u32)((hdr->lrh[0] & OPA_16B_LID_MASK) | + (((hdr->lrh[2] & OPA_16B_SLID_MASK) >> + OPA_16B_SLID_HIGH_SHIFT) << OPA_16B_SLID_SHIFT)); +} + +static inline u8 hfi1_16B_get_becn(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[0] & OPA_16B_BECN_MASK) >> OPA_16B_BECN_SHIFT); +} + +static inline u8 hfi1_16B_get_fecn(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_FECN_MASK) >> OPA_16B_FECN_SHIFT); +} + +static inline u8 hfi1_16B_get_l2(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_L2_MASK) >> OPA_16B_L2_SHIFT); +} + +static inline u16 hfi1_16B_get_pkey(struct hfi1_16b_header *hdr) +{ + return (u16)((hdr->lrh[2] & OPA_16B_PKEY_MASK) >> OPA_16B_PKEY_SHIFT); +} + +static inline u8 hfi1_16B_get_rc(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_RC_MASK) >> OPA_16B_RC_SHIFT); +} + +static inline u8 hfi1_16B_get_age(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[3] & OPA_16B_AGE_MASK) >> OPA_16B_AGE_SHIFT); +} + +static inline u16 hfi1_16B_get_len(struct hfi1_16b_header *hdr) +{ + return (u16)((hdr->lrh[0] & OPA_16B_LEN_MASK) >> OPA_16B_LEN_SHIFT); +} + +static inline u16 hfi1_16B_get_entropy(struct hfi1_16b_header *hdr) +{ + return (u16)(hdr->lrh[3] & OPA_16B_ENTROPY_MASK); +} + +#define OPA_16B_MAKE_QW(low_dw, high_dw) (((u64)(high_dw) << 32) | (low_dw)) + +/* + * BTH + */ +#define 
OPA_16B_BTH_PAD_MASK 7 +static inline u8 hfi1_16B_bth_get_pad(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_PAD_SHIFT) & + OPA_16B_BTH_PAD_MASK); +} + +struct rvt_sge_state; + +/* + * Get/Set IB link-level config parameters for f_get/set_ib_cfg() + * Mostly for MADs that set or query link parameters, also ipath + * config interfaces + */ +#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */ +#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */ +#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */ +#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */ +#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */ +#define HFI1_IB_CFG_SPD 5 /* current Link spd */ +#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */ +#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */ +#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */ +#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */ +#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */ +#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */ +#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */ +#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */ +#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */ +#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */ +#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */ +#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */ +#define HFI1_IB_CFG_VL_HIGH_LIMIT 19 +#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */ +#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */ + +/* + * HFI or Host Link States + * + * These describe the states the driver thinks the logical and physical + * states are in. Used as an argument to set_link_state(). Implemented + * as bits for easy multi-state checking. The actual state can only be + * one. 
+ */ +#define __HLS_UP_INIT_BP 0 +#define __HLS_UP_ARMED_BP 1 +#define __HLS_UP_ACTIVE_BP 2 +#define __HLS_DN_DOWNDEF_BP 3 /* link down default */ +#define __HLS_DN_POLL_BP 4 +#define __HLS_DN_DISABLE_BP 5 +#define __HLS_DN_OFFLINE_BP 6 +#define __HLS_VERIFY_CAP_BP 7 +#define __HLS_GOING_UP_BP 8 +#define __HLS_GOING_OFFLINE_BP 9 +#define __HLS_LINK_COOLDOWN_BP 10 + +#define HLS_UP_INIT BIT(__HLS_UP_INIT_BP) +#define HLS_UP_ARMED BIT(__HLS_UP_ARMED_BP) +#define HLS_UP_ACTIVE BIT(__HLS_UP_ACTIVE_BP) +#define HLS_DN_DOWNDEF BIT(__HLS_DN_DOWNDEF_BP) /* link down default */ +#define HLS_DN_POLL BIT(__HLS_DN_POLL_BP) +#define HLS_DN_DISABLE BIT(__HLS_DN_DISABLE_BP) +#define HLS_DN_OFFLINE BIT(__HLS_DN_OFFLINE_BP) +#define HLS_VERIFY_CAP BIT(__HLS_VERIFY_CAP_BP) +#define HLS_GOING_UP BIT(__HLS_GOING_UP_BP) +#define HLS_GOING_OFFLINE BIT(__HLS_GOING_OFFLINE_BP) +#define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP) + +#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE) +#define HLS_DOWN ~(HLS_UP) + +#define HLS_DEFAULT HLS_DN_POLL + +/* use this MTU size if none other is given */ +#define HFI1_DEFAULT_ACTIVE_MTU 10240 +/* use this MTU size as the default maximum */ +#define HFI1_DEFAULT_MAX_MTU 10240 +/* default partition key */ +#define DEFAULT_PKEY 0xffff + +/* + * Possible fabric manager config parameters for fm_{get,set}_table() + */ +#define FM_TBL_VL_HIGH_ARB 1 /* Get/set VL high prio weights */ +#define FM_TBL_VL_LOW_ARB 2 /* Get/set VL low prio weights */ +#define FM_TBL_BUFFER_CONTROL 3 /* Get/set Buffer Control */ +#define FM_TBL_SC2VLNT 4 /* Get/set SC->VLnt */ +#define FM_TBL_VL_PREEMPT_ELEMS 5 /* Get (no set) VL preempt elems */ +#define FM_TBL_VL_PREEMPT_MATRIX 6 /* Get (no set) VL preempt matrix */ + +/* + * Possible "operations" for f_rcvctrl(ppd, op, ctxt) + * these are bits so they can be combined, e.g. 
+ * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB + */ +#define HFI1_RCVCTRL_TAILUPD_ENB 0x01 +#define HFI1_RCVCTRL_TAILUPD_DIS 0x02 +#define HFI1_RCVCTRL_CTXT_ENB 0x04 +#define HFI1_RCVCTRL_CTXT_DIS 0x08 +#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10 +#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20 +#define HFI1_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */ +#define HFI1_RCVCTRL_PKEY_DIS 0x80 +#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400 +#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800 +#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000 +#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000 +#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000 +#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000 +#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000 +#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000 + +/* partition enforcement flags */ +#define HFI1_PART_ENFORCE_IN 0x1 +#define HFI1_PART_ENFORCE_OUT 0x2 + +/* how often we check for synthetic counter wrap around */ +#define SYNTH_CNT_TIME 3 + +/* Counter flags */ +#define CNTR_NORMAL 0x0 /* Normal counters, just read register */ +#define CNTR_SYNTH 0x1 /* Synthetic counters, saturate at all 1s */ +#define CNTR_DISABLED 0x2 /* Disable this counter */ +#define CNTR_32BIT 0x4 /* Simulate 64 bits for this counter */ +#define CNTR_VL 0x8 /* Per VL counter */ +#define CNTR_SDMA 0x10 +#define CNTR_INVALID_VL -1 /* Specifies invalid VL */ +#define CNTR_MODE_W 0x0 +#define CNTR_MODE_R 0x1 + +/* VLs Supported/Operational */ +#define HFI1_MIN_VLS_SUPPORTED 1 +#define HFI1_MAX_VLS_SUPPORTED 8 + +#define HFI1_GUIDS_PER_PORT 5 +#define HFI1_PORT_GUID_INDEX 0 + +static inline void incr_cntr64(u64 *cntr) +{ + if (*cntr < (u64)-1LL) + (*cntr)++; +} + +static inline void incr_cntr32(u32 *cntr) +{ + if (*cntr < (u32)-1LL) + (*cntr)++; +} + +#define MAX_NAME_SIZE 64 +struct hfi1_msix_entry { + enum irq_type type; + int irq; + void *arg; + cpumask_t mask; + struct irq_affinity_notify notify; +}; + +/* per-SL CCA information */ +struct cca_timer { + struct hrtimer hrtimer; + struct hfi1_pportdata *ppd; /* read-only */ + int sl; /* read-only */ + u16 ccti; /* read/write - current value of CCTI */ +}; + +struct link_down_reason { + /* + * SMA-facing value. Should be set from .latest when + * HLS_UP_* -> HLS_DN_* transition actually occurs. + */ + u8 sma; + u8 latest; +}; + +enum { + LO_PRIO_TABLE, + HI_PRIO_TABLE, + MAX_PRIO_TABLE +}; + +struct vl_arb_cache { + /* protect vl arb cache */ + spinlock_t lock; + struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE]; +}; + +/* + * The structure below encapsulates data relevant to a physical IB Port. + * Current chips support only one such port, but the separation + * clarifies things a bit. Note that to conform to IB conventions, + * port-numbers are one-based. The first or only port is port1. + */ +struct hfi1_pportdata { + struct hfi1_ibport ibport_data; + + struct hfi1_devdata *dd; + struct kobject pport_cc_kobj; + struct kobject sc2vl_kobj; + struct kobject sl2sc_kobj; + struct kobject vl2mtu_kobj; + + /* PHY support */ + struct qsfp_data qsfp_info; + /* Values for SI tuning of SerDes */ + u32 port_type; + u32 tx_preset_eq; + u32 tx_preset_noeq; + u32 rx_preset; + u8 local_atten; + u8 remote_atten; + u8 default_atten; + u8 max_power_class; + + /* did we read platform config from scratch registers? 
*/ + bool config_from_scratch; + + /* GUIDs for this interface, in host order, guids[0] is a port guid */ + u64 guids[HFI1_GUIDS_PER_PORT]; + + /* GUID for peer interface, in host order */ + u64 neighbor_guid; + + /* up or down physical link state */ + u32 linkup; + + /* + * this address is mapped read-only into user processes so they can + * get status cheaply, whenever they want. One qword of status per port + */ + u64 *statusp; + + /* SendDMA related entries */ + + struct workqueue_struct *hfi1_wq; + struct workqueue_struct *link_wq; + + /* move out of interrupt context */ + struct work_struct link_vc_work; + struct work_struct link_up_work; + struct work_struct link_down_work; + struct work_struct sma_message_work; + struct work_struct freeze_work; + struct work_struct link_downgrade_work; + struct work_struct link_bounce_work; + struct delayed_work start_link_work; + /* host link state variables */ + struct mutex hls_lock; + u32 host_link_state; + + /* these are the "32 bit" regs */ + + u32 ibmtu; /* The MTU programmed for this unit */ + /* + * Current max size IB packet (in bytes) including IB headers, that + * we can send. Changes when ibmtu changes. + */ + u32 ibmaxlen; + u32 current_egress_rate; /* units [10^6 bits/sec] */ + /* LID programmed for this instance */ + u32 lid; + /* list of pkeys programmed; 0 if not set */ + u16 pkeys[MAX_PKEY_VALUES]; + u16 link_width_supported; + u16 link_width_downgrade_supported; + u16 link_speed_supported; + u16 link_width_enabled; + u16 link_width_downgrade_enabled; + u16 link_speed_enabled; + u16 link_width_active; + u16 link_width_downgrade_tx_active; + u16 link_width_downgrade_rx_active; + u16 link_speed_active; + u8 vls_supported; + u8 vls_operational; + u8 actual_vls_operational; + /* LID mask control */ + u8 lmc; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 rx_pol_inv; + + u8 hw_pidx; /* physical port index */ + u8 port; /* IB port number and index into dd->pports - 1 */ + /* type of neighbor node */ + u8 neighbor_type; + u8 neighbor_normal; + u8 neighbor_fm_security; /* 1 if firmware checking is disabled */ + u8 neighbor_port_number; + u8 is_sm_config_started; + u8 offline_disabled_reason; + u8 is_active_optimize_enabled; + u8 driver_link_ready; /* driver ready for active link */ + u8 link_enabled; /* link enabled? */ + u8 linkinit_reason; + u8 local_tx_rate; /* rate given to 8051 firmware */ + u8 qsfp_retry_count; + + /* placeholders for IB MAD packet settings */ + u8 overrun_threshold; + u8 phy_error_threshold; + unsigned int is_link_down_queued; + + /* Used to override LED behavior for things like maintenance beaconing*/ + /* + * Alternates per phase of blink + * [0] holds LED off duration, [1] holds LED on duration + */ + unsigned long led_override_vals[2]; + u8 led_override_phase; /* LSB picks from vals[] */ + atomic_t led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list led_override_timer; + + u32 sm_trap_qp; + u32 sa_qp; + + /* + * cca_timer_lock protects access to the per-SL cca_timer + * structures (specifically the ccti member). 
+ */ + spinlock_t cca_timer_lock ____cacheline_aligned_in_smp; + struct cca_timer cca_timer[OPA_MAX_SLS]; + + /* List of congestion control table entries */ + struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX]; + + /* congestion entries, each entry corresponding to a SL */ + struct opa_congestion_setting_entry_shadow + congestion_entries[OPA_MAX_SLS]; + + /* + * cc_state_lock protects (write) access to the per-port + * struct cc_state. + */ + spinlock_t cc_state_lock ____cacheline_aligned_in_smp; + + struct cc_state __rcu *cc_state; + + /* Total number of congestion control table entries */ + u16 total_cct_entry; + + /* Bit map identifying service level */ + u32 cc_sl_control_map; + + /* CA's max number of 64 entry units in the congestion control table */ + u8 cc_max_table_entries; + + /* + * begin congestion log related entries + * cc_log_lock protects all congestion log related data + */ + spinlock_t cc_log_lock ____cacheline_aligned_in_smp; + u8 threshold_cong_event_map[OPA_MAX_SLS / 8]; + u16 threshold_event_counter; + struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS]; + int cc_log_idx; /* index for logging events */ + int cc_mad_idx; /* index for reporting events */ + /* end congestion log related entries */ + + struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE]; + + /* port relative counter buffer */ + u64 *cntrs; + /* port relative synthetic counter buffer */ + u64 *scntrs; + /* port_xmit_discards are synthesized from different egress errors */ + u64 port_xmit_discards; + u64 port_xmit_discards_vl[C_VL_COUNT]; + u64 port_xmit_constraint_errors; + u64 port_rcv_constraint_errors; + /* count of 'link_err' interrupts from DC */ + u64 link_downed; + /* number of times link retrained successfully */ + u64 link_up; + /* number of times a link unknown frame was reported */ + u64 unknown_frame_count; + /* port_ltp_crc_mode is returned in 'portinfo' MADs */ + u16 port_ltp_crc_mode; + /* port_crc_mode_enabled is the crc we support */ + u8 port_crc_mode_enabled; + /* mgmt_allowed is also returned in 'portinfo' MADs */ + u8 mgmt_allowed; + u8 part_enforce; /* partition enforcement flags */ + struct link_down_reason local_link_down_reason; + struct link_down_reason neigh_link_down_reason; + /* Value to be sent to link peer on LinkDown .*/ + u8 remote_link_down_reason; + /* Error events that will cause a port bounce. */ + u32 port_error_action; + struct work_struct linkstate_active_work; + /* Does this port need to prescan for FECNs */ + bool cc_prescan; + /* + * Sample sendWaitCnt & sendWaitVlCnt during link transition + * and counter request. + */ + u64 port_vl_xmit_wait_last[C_VL_COUNT + 1]; + u16 prev_link_width; + u64 vl_xmit_flit_cnt[C_VL_COUNT + 1]; +}; + +typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); + +typedef void (*opcode_handler)(struct hfi1_packet *packet); +typedef void (*hfi1_make_req)(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + + +/* return values for the RHF receive functions */ +#define RHF_RCV_CONTINUE 0 /* keep going */ +#define RHF_RCV_DONE 1 /* stop, this packet processed */ +#define RHF_RCV_REPROCESS 2 /* stop. 
retain this packet */ + +struct rcv_array_data { + u8 group_size; + u16 ngroups; + u16 nctxt_extra; +}; + +struct per_vl_data { + u16 mtu; + struct send_context *sc; +}; + +/* 16 to directly index */ +#define PER_VL_SEND_CONTEXTS 16 + +struct err_info_rcvport { + u8 status_and_code; + u64 packet_flit1; + u64 packet_flit2; +}; + +struct err_info_constraint { + u8 status; + u16 pkey; + u32 slid; +}; + +struct hfi1_temp { + unsigned int curr; /* current temperature */ + unsigned int lo_lim; /* low temperature limit */ + unsigned int hi_lim; /* high temperature limit */ + unsigned int crit_lim; /* critical temperature limit */ + u8 triggers; /* temperature triggers */ +}; + +struct hfi1_i2c_bus { + struct hfi1_devdata *controlling_dd; /* current controlling device */ + struct i2c_adapter adapter; /* bus details */ + struct i2c_algo_bit_data algo; /* bus algorithm details */ + int num; /* bus number, 0 or 1 */ +}; + +/* common data between shared ASIC HFIs */ +struct hfi1_asic_data { + struct hfi1_devdata *dds[2]; /* back pointers */ + struct mutex asic_resource_mutex; + struct hfi1_i2c_bus *i2c_bus0; + struct hfi1_i2c_bus *i2c_bus1; +}; + +/* sizes for both the QP and RSM map tables */ +#define NUM_MAP_ENTRIES 256 +#define NUM_MAP_REGS 32 + +/* + * Number of VNIC contexts used. Ensure it is less than or equal to + * max queues supported by VNIC (HFI1_VNIC_MAX_QUEUE). + */ +#define HFI1_NUM_VNIC_CTXT 8 + +/* Number of VNIC RSM entries */ +#define NUM_VNIC_MAP_ENTRIES 8 + +/* Virtual NIC information */ +struct hfi1_vnic_data { + struct hfi1_ctxtdata *ctxt[HFI1_NUM_VNIC_CTXT]; + struct kmem_cache *txreq_cache; + u8 num_vports; + struct idr vesw_idr; + u8 rmt_start; + u8 num_ctxt; + u32 msix_idx; +}; + +struct hfi1_vnic_vport_info; + +/* device data struct now contains only "general per-device" info. + * fields related to a physical IB port are in a hfi1_pportdata struct. + */ +struct sdma_engine; +struct sdma_vl_map; + +#define BOARD_VERS_MAX 96 /* how long the version string can be */ +#define SERIAL_MAX 16 /* length of the serial number */ + +typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64); +struct hfi1_devdata { + struct hfi1_ibdev verbs_dev; /* must be first */ + struct list_head list; + /* pointers to related structs for this device */ + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev user_cdev; + struct cdev diag_cdev; + struct cdev ui_cdev; + struct device *user_device; + struct device *diag_device; + struct device *ui_device; + + /* first mapping up to RcvArray */ + u8 __iomem *kregbase1; + resource_size_t physaddr; + + /* second uncached mapping from RcvArray to pio send buffers */ + u8 __iomem *kregbase2; + /* for detecting offset above kregbase2 address */ + u32 base2_start; + + /* Per VL data. Enough for all VLs but not all elements are set/used. */ + struct per_vl_data vld[PER_VL_SEND_CONTEXTS]; + /* send context data */ + struct send_context_info *send_contexts; + /* map hardware send contexts to software index */ + u8 *hw_to_sw; + /* spinlock for allocating and releasing send context resources */ + spinlock_t sc_lock; + /* lock for pio_map */ + spinlock_t pio_map_lock; + /* Send Context initialization lock. 
*/ + spinlock_t sc_init_lock; + /* lock for sdma_map */ + spinlock_t sde_map_lock; + /* array of kernel send contexts */ + struct send_context **kernel_send_context; + /* array of vl maps */ + struct pio_vl_map __rcu *pio_map; + /* default flags to last descriptor */ + u64 default_desc1; + + /* fields common to all SDMA engines */ + + volatile __le64 *sdma_heads_dma; /* DMA'ed by chip */ + dma_addr_t sdma_heads_phys; + void *sdma_pad_dma; /* DMA'ed by chip */ + dma_addr_t sdma_pad_phys; + /* for deallocation */ + size_t sdma_heads_size; + /* number from the chip */ + u32 chip_sdma_engines; + /* num used */ + u32 num_sdma; + /* array of engines sized by num_sdma */ + struct sdma_engine *per_sdma; + /* array of vl maps */ + struct sdma_vl_map __rcu *sdma_map; + /* SPC freeze waitqueue and variable */ + wait_queue_head_t sdma_unfreeze_wq; + atomic_t sdma_unfreeze_count; + + u32 lcb_access_count; /* count of LCB users */ + + /* common data between shared ASIC HFIs in this OS */ + struct hfi1_asic_data *asic_data; + + /* mem-mapped pointer to base of PIO buffers */ + void __iomem *piobase; + /* + * write-combining mem-mapped pointer to base of RcvArray + * memory. + */ + void __iomem *rcvarray_wc; + /* + * credit return base - a per-NUMA range of DMA address that + * the chip will use to update the per-context free counter + */ + struct credit_return_base *cr_base; + + /* send context numbers and sizes for each type */ + struct sc_config_sizes sc_sizes[SC_MAX]; + + char *boardname; /* human readable board info */ + + /* reset value */ + u64 z_int_counter; + u64 z_rcv_limit; + u64 z_send_schedule; + + u64 __percpu *send_schedule; + /* number of reserved contexts for VNIC usage */ + u16 num_vnic_contexts; + /* number of receive contexts in use by the driver */ + u32 num_rcv_contexts; + /* number of pio send contexts in use by the driver */ + u32 num_send_contexts; + /* + * number of ctxts available for PSM open + */ + u32 freectxts; + /* total number of available user/PSM contexts */ + u32 num_user_contexts; + /* base receive interrupt timeout, in CSR units */ + u32 rcv_intr_timeout_csr; + + u32 freezelen; /* max length of freezemsg */ + u64 __iomem *egrtidbase; + spinlock_t sendctrl_lock; /* protect changes to SendCtrl */ + spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */ + spinlock_t uctxt_lock; /* protect rcd changes */ + struct mutex dc8051_lock; /* exclusive access to 8051 */ + struct workqueue_struct *update_cntr_wq; + struct work_struct update_cntr_work; + /* exclusive access to 8051 memory */ + spinlock_t dc8051_memlock; + int dc8051_timed_out; /* remember if the 8051 timed out */ + /* + * A page that will hold event notification bitmaps for all + * contexts. This page will be mapped into all processes. 
+ */ + unsigned long *events; + /* + * per unit status, see also portdata statusp + * mapped read-only into user processes so they can get unit and + * IB link status cheaply + */ + struct hfi1_status *status; + + /* revision register shadow */ + u64 revision; + /* Base GUID for device (network order) */ + u64 base_guid; + + /* these are the "32 bit" regs */ + + /* value we put in kr_rcvhdrsize */ + u32 rcvhdrsize; + /* number of receive contexts the chip supports */ + u32 chip_rcv_contexts; + /* number of receive array entries */ + u32 chip_rcv_array_count; + /* number of PIO send contexts the chip supports */ + u32 chip_send_contexts; + /* number of bytes in the PIO memory buffer */ + u32 chip_pio_mem_size; + /* number of bytes in the SDMA memory buffer */ + u32 chip_sdma_mem_size; + + /* size of each rcvegrbuffer */ + u32 rcvegrbufsize; + /* log2 of above */ + u16 rcvegrbufsize_shift; + /* both sides of the PCIe link are gen3 capable */ + u8 link_gen3_capable; + u8 dc_shutdown; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 lbus_width; + /* localbus speed in MHz */ + u32 lbus_speed; + int unit; /* unit # of this chip */ + int node; /* home node of this chip */ + + /* save these PCI fields to restore after a reset */ + u32 pcibar0; + u32 pcibar1; + u32 pci_rom; + u16 pci_command; + u16 pcie_devctl; + u16 pcie_lnkctl; + u16 pcie_devctl2; + u32 pci_msix0; + u32 pci_tph2; + + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer serial number format + */ + u8 serial[SERIAL_MAX]; + /* human readable board version */ + u8 boardversion[BOARD_VERS_MAX]; + u8 lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from CceRevision */ + u8 majrev; + /* chip minor rev, from CceRevision */ + u8 minrev; + /* hardware ID */ + u8 hfi1_id; + /* implementation code */ + u8 icode; + /* vAU of this device */ + u8 vau; + /* vCU of this device */ + u8 vcu; + /* link credits of this device */ + u16 link_credits; + /* initial vl15 credits to use */ + u16 vl15_init; + + /* + * Cached value for vl15buf, read during verify cap interrupt. VL15 + * credits are to be kept at 0 and set when handling the link-up + * interrupt. This removes the possibility of receiving VL15 MAD + * packets before this HFI is ready. + */ + u16 vl15buf_cached; + + /* Misc small ints */ + u8 n_krcv_queues; + u8 qos_shift; + + u16 irev; /* implementation revision */ + u32 dc8051_ver; /* 8051 firmware version */ + + spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */ + struct platform_config platform_config; + struct platform_config_cache pcfg_cache; + + struct diag_client *diag_client; + + /* MSI-X information */ + struct hfi1_msix_entry *msix_entries; + u32 num_msix_entries; + u32 first_dyn_msix_idx; + + /* INTx information */ + u32 requested_intx_irq; /* did we request one? 
*/ + + /* general interrupt: mask of handled interrupts */ + u64 gi_mask[CCE_NUM_INT_CSRS]; + + struct rcv_array_data rcv_entries; + + /* cycle length of PS* counters in HW (in picoseconds) */ + u16 psxmitwait_check_rate; + + /* + * 64 bit synthetic counters + */ + struct timer_list synth_stats_timer; + + /* + * device counters + */ + char *cntrnames; + size_t cntrnameslen; + size_t ndevcntrs; + u64 *cntrs; + u64 *scntrs; + + /* + * remembered values for synthetic counters + */ + u64 last_tx; + u64 last_rx; + + /* + * per-port counters + */ + size_t nportcntrs; + char *portcntrnames; + size_t portcntrnameslen; + + struct err_info_rcvport err_info_rcvport; + struct err_info_constraint err_info_rcv_constraint; + struct err_info_constraint err_info_xmit_constraint; + + atomic_t drop_packet; + u8 do_drop; + u8 err_info_uncorrectable; + u8 err_info_fmconfig; + + /* + * Software counters for the status bits defined by the + * associated error status registers + */ + u64 cce_err_status_cnt[NUM_CCE_ERR_STATUS_COUNTERS]; + u64 rcv_err_status_cnt[NUM_RCV_ERR_STATUS_COUNTERS]; + u64 misc_err_status_cnt[NUM_MISC_ERR_STATUS_COUNTERS]; + u64 send_pio_err_status_cnt[NUM_SEND_PIO_ERR_STATUS_COUNTERS]; + u64 send_dma_err_status_cnt[NUM_SEND_DMA_ERR_STATUS_COUNTERS]; + u64 send_egress_err_status_cnt[NUM_SEND_EGRESS_ERR_STATUS_COUNTERS]; + u64 send_err_status_cnt[NUM_SEND_ERR_STATUS_COUNTERS]; + + /* Software counter that spans all contexts */ + u64 sw_ctxt_err_status_cnt[NUM_SEND_CTXT_ERR_STATUS_COUNTERS]; + /* Software counter that spans all DMA engines */ + u64 sw_send_dma_eng_err_status_cnt[ + NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS]; + /* Software counter that aggregates all cce_err_status errors */ + u64 sw_cce_err_status_aggregate; + /* Software counter that aggregates all bypass packet rcv errors */ + u64 sw_rcv_bypass_packet_errors; + /* receive interrupt function */ + rhf_rcv_function_ptr normal_rhf_rcv_functions[8]; + + /* Save the enabled LCB error bits */ + u64 lcb_err_en; + + /* + * Capability to have different send engines simply by changing a + * pointer value. + */ + send_routine process_pio_send ____cacheline_aligned_in_smp; + send_routine process_dma_send; + void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf, + u64 pbc, const void *from, size_t count); + int (*process_vnic_dma_send)(struct hfi1_devdata *dd, u8 q_idx, + struct hfi1_vnic_vport_info *vinfo, + struct sk_buff *skb, u64 pbc, u8 plen); + /* hfi1_pportdata, points to array of (physical) port-specific + * data structs, indexed by pidx (0..n-1) + */ + struct hfi1_pportdata *pport; + /* receive context data */ + struct hfi1_ctxtdata **rcd; + u64 __percpu *int_counter; + /* verbs tx opcode stats */ + struct hfi1_opcode_stats_perctx __percpu *tx_opstats; + /* device (not port) flags, basically device capabilities */ + u16 flags; + /* Number of physical ports available */ + u8 num_pports; + /* Lowest context number which can be used by user processes or VNIC */ + u8 first_dyn_alloc_ctxt; + /* adding a new field here would make it part of this cacheline */ + + /* seqlock for sc2vl */ + seqlock_t sc2vl_lock ____cacheline_aligned_in_smp; + u64 sc2vl[4]; + /* receive interrupt functions */ + rhf_rcv_function_ptr *rhf_rcv_function_map; + u64 __percpu *rcv_limit; + u16 rhf_offset; /* offset of RHF within receive header entry */ + /* adding a new field here would make it part of this cacheline */ + + /* OUI comes from the HW. Used everywhere as 3 separate bytes. 
*/ + u8 oui1; + u8 oui2; + u8 oui3; + + /* Timer and counter used to detect RcvBufOvflCnt changes */ + struct timer_list rcverr_timer; + + wait_queue_head_t event_queue; + + /* receive context tail dummy address */ + __le64 *rcvhdrtail_dummy_kvaddr; + dma_addr_t rcvhdrtail_dummy_dma; + + u32 rcv_ovfl_cnt; + /* Serialize ASPM enable/disable between multiple verbs contexts */ + spinlock_t aspm_lock; + /* Number of verbs contexts which have disabled ASPM */ + atomic_t aspm_disabled_cnt; + /* Keeps track of user space clients */ + atomic_t user_refcount; + /* Used to wait for outstanding user space clients before dev removal */ + struct completion user_comp; + + bool eprom_available; /* true if EPROM is available for this device */ + bool aspm_supported; /* Does HW support ASPM */ + bool aspm_enabled; /* ASPM state: enabled/disabled */ + struct rhashtable *sdma_rht; + + struct kobject kobj; + + /* vnic data */ + struct hfi1_vnic_data vnic; +}; + +static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare) +{ + return (dd->vnic.rmt_start + spare) > NUM_MAP_ENTRIES; +} + +/* 8051 firmware version helper */ +#define dc8051_ver(a, b, c) ((a) << 16 | (b) << 8 | (c)) +#define dc8051_ver_maj(a) (((a) & 0xff0000) >> 16) +#define dc8051_ver_min(a) (((a) & 0x00ff00) >> 8) +#define dc8051_ver_patch(a) ((a) & 0x0000ff) + +/* f_put_tid types */ +#define PT_EXPECTED 0 +#define PT_EAGER 1 +#define PT_INVALID_FLUSH 2 +#define PT_INVALID 3 + +struct tid_rb_node; +struct mmu_rb_node; +struct mmu_rb_handler; + +/* Private data for file operations */ +struct hfi1_filedata { + struct hfi1_devdata *dd; + struct hfi1_ctxtdata *uctxt; + struct hfi1_user_sdma_comp_q *cq; + struct hfi1_user_sdma_pkt_q *pq; + u16 subctxt; + /* for cpu affinity; -1 if none */ + int rec_cpu_num; + u32 tid_n_pinned; + struct mmu_rb_handler *handler; + struct tid_rb_node **entry_to_rb; + spinlock_t tid_lock; /* protect tid_[limit,used] counters */ + u32 tid_limit; + u32 tid_used; + u32 *invalid_tids; + u32 invalid_tid_idx; + /* protect invalid_tids array and invalid_tid_idx */ + spinlock_t invalid_lock; + struct mm_struct *mm; +}; + +extern struct list_head hfi1_dev_list; +extern spinlock_t hfi1_devs_lock; +struct hfi1_devdata *hfi1_lookup(int unit); + +static inline unsigned long uctxt_offset(struct hfi1_ctxtdata *uctxt) +{ + return (uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) * + HFI1_MAX_SHARED_CTXTS; +} + +int hfi1_init(struct hfi1_devdata *dd, int reinit); +int hfi1_count_active_units(void); + +int hfi1_diag_add(struct hfi1_devdata *dd); +void hfi1_diag_remove(struct hfi1_devdata *dd); +void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup); + +void handle_user_interrupt(struct hfi1_ctxtdata *rcd); + +int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd); +int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd); +int hfi1_create_kctxts(struct hfi1_devdata *dd); +int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, + struct hfi1_ctxtdata **rcd); +void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd); +void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, + struct hfi1_devdata *dd, u8 hw_pidx, u8 port); +void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd); +int hfi1_rcd_put(struct hfi1_ctxtdata *rcd); +void hfi1_rcd_get(struct hfi1_ctxtdata *rcd); +struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd, + u16 ctxt); +struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt); +int 
handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread); +int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread); +int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread); +void set_all_slowpath(struct hfi1_devdata *dd); +void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd); +void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd); +void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd); + +extern const struct pci_device_id hfi1_pci_tbl[]; +void hfi1_make_ud_req_9B(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + +void hfi1_make_ud_req_16B(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + +/* receive packet handler dispositions */ +#define RCV_PKT_OK 0x0 /* keep going */ +#define RCV_PKT_LIMIT 0x1 /* stop, hit limit, start thread */ +#define RCV_PKT_DONE 0x2 /* stop, no more packets detected */ + +/* calculate the current RHF address */ +static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd) +{ + return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->dd->rhf_offset; +} + +int hfi1_reset_device(int); + +void receive_interrupt_work(struct work_struct *work); + +/* extract service channel from header and rhf */ +static inline int hfi1_9B_get_sc5(struct ib_header *hdr, u64 rhf) +{ + return ib_get_sc(hdr) | ((!!(rhf_dc_info(rhf))) << 4); +} + +#define HFI1_JKEY_WIDTH 16 +#define HFI1_JKEY_MASK (BIT(16) - 1) +#define HFI1_ADMIN_JKEY_RANGE 32 + +/* + * J_KEYs are split and allocated in the following groups: + * 0 - 31 - users with administrator privileges + * 32 - 63 - kernel protocols using KDETH packets + * 64 - 65535 - all other users using KDETH packets + */ +static inline u16 generate_jkey(kuid_t uid) +{ + u16 jkey = from_kuid(current_user_ns(), uid) & HFI1_JKEY_MASK; + + if (capable(CAP_SYS_ADMIN)) + jkey &= HFI1_ADMIN_JKEY_RANGE - 1; + else if (jkey < 64) + jkey |= BIT(HFI1_JKEY_WIDTH - 1); + + return jkey; +} + +/* + * active_egress_rate + * + * returns the active egress rate in units of [10^6 bits/sec] + */ +static inline u32 active_egress_rate(struct hfi1_pportdata *ppd) +{ + u16 link_speed = ppd->link_speed_active; + u16 link_width = ppd->link_width_active; + u32 egress_rate; + + if (link_speed == OPA_LINK_SPEED_25G) + egress_rate = 25000; + else /* assume OPA_LINK_SPEED_12_5G */ + egress_rate = 12500; + + switch (link_width) { + case OPA_LINK_WIDTH_4X: + egress_rate *= 4; + break; + case OPA_LINK_WIDTH_3X: + egress_rate *= 3; + break; + case OPA_LINK_WIDTH_2X: + egress_rate *= 2; + break; + default: + /* assume IB_WIDTH_1X */ + break; + } + + return egress_rate; +} + +/* + * egress_cycles + * + * Returns the number of 'fabric clock cycles' to egress a packet + * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock + * rate is (approximately) 805 MHz, the units of the returned value + * are (1/805 MHz). 
+ */ +static inline u32 egress_cycles(u32 len, u32 rate) +{ + u32 cycles; + + /* + * cycles is: + * + * (length) [bits] / (rate) [bits/sec] + * --------------------------------------------------- + * fabric_clock_period == 1 /(805 * 10^6) [cycles/sec] + */ + + cycles = len * 8; /* bits */ + cycles *= 805; + cycles /= rate; + + return cycles; +} + +void set_link_ipg(struct hfi1_pportdata *ppd); +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, + u32 rqpn, u8 svc_type); +void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, + u16 pkey, u32 slid, u32 dlid, u8 sc5, + const struct ib_grh *old_grh); +void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u16 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh); +typedef void (*hfi1_handle_cnp)(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u16 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh); + +#define PKEY_CHECK_INVALID -1 +int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey, + u8 sc5, int8_t s_pkey_index); + +#define PACKET_EGRESS_TIMEOUT 350 +static inline void pause_for_credit_return(struct hfi1_devdata *dd) +{ + /* Pause at least 1us, to ensure chip returns all credits */ + u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000; + + udelay(usec ? usec : 1); +} + +/** + * sc_to_vlt() reverse lookup sc to vl + * @dd - devdata + * @sc5 - 5 bit sc + */ +static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5) +{ + unsigned seq; + u8 rval; + + if (sc5 >= OPA_MAX_SCS) + return (u8)(0xff); + + do { + seq = read_seqbegin(&dd->sc2vl_lock); + rval = *(((u8 *)dd->sc2vl) + sc5); + } while (read_seqretry(&dd->sc2vl_lock, seq)); + + return rval; +} + +#define PKEY_MEMBER_MASK 0x8000 +#define PKEY_LOW_15_MASK 0x7fff + +/* + * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent + * being an entry from the ingress partition key table), return 0 + * otherwise. Use the matching criteria for ingress partition keys + * specified in the OPAv1 spec., section 9.10.14. + */ +static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent) +{ + u16 mkey = pkey & PKEY_LOW_15_MASK; + u16 ment = ent & PKEY_LOW_15_MASK; + + if (mkey == ment) { + /* + * If pkey[15] is clear (limited partition member), + * is bit 15 in the corresponding table element + * clear (limited member)? + */ + if (!(pkey & PKEY_MEMBER_MASK)) + return !!(ent & PKEY_MEMBER_MASK); + return 1; + } + return 0; +} + +/* + * ingress_pkey_table_search - search the entire pkey table for + * an entry which matches 'pkey'. return 0 if a match is found, + * and 1 otherwise. + */ +static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey) +{ + int i; + + for (i = 0; i < MAX_PKEY_VALUES; i++) { + if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i])) + return 0; + } + return 1; +} + +/* + * ingress_pkey_table_fail - record a failure of ingress pkey validation, + * i.e., increment port_rcv_constraint_errors for the port, and record + * the 'error info' for this failure. 
+ */ +static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey, + u32 slid) +{ + struct hfi1_devdata *dd = ppd->dd; + + incr_cntr64(&ppd->port_rcv_constraint_errors); + if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) { + dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK; + dd->err_info_rcv_constraint.slid = slid; + dd->err_info_rcv_constraint.pkey = pkey; + } +} + +/* + * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1 + * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx + * is a hint as to the best place in the partition key table to begin + * searching. This function should not be called on the data path, for + * performance reasons. On the data path, pkey check is expected to be done + * by HW and rcv_pkey_check() should be called instead. + */ +static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, + u8 sc5, u8 idx, u32 slid, bool force) +{ + if (!(force) && !(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + /* Is the pkey = 0x0, or 0x8000? */ + if ((pkey & PKEY_LOW_15_MASK) == 0) + goto bad; + + /* The most likely matching pkey has index 'idx' */ + if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx])) + return 0; + + /* no match - try the whole table */ + if (!ingress_pkey_table_search(ppd, pkey)) + return 0; + +bad: + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +/* + * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1 + * otherwise. It only ensures the pkey is valid for QP0. This function + * should be called on the data path instead of ingress_pkey_check + * as on the data path, pkey check is done by HW (except for QP0). 
+ */ +static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, + u8 sc5, u16 slid) +{ + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + return 0; +bad: + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +/* MTU handling */ + +/* MTU enumeration, 256-4k match IB */ +#define OPA_MTU_0 0 +#define OPA_MTU_256 1 +#define OPA_MTU_512 2 +#define OPA_MTU_1024 3 +#define OPA_MTU_2048 4 +#define OPA_MTU_4096 5 + +u32 lrh_max_header_bytes(struct hfi1_devdata *dd); +int mtu_to_enum(u32 mtu, int default_if_bad); +u16 enum_to_mtu(int mtu); +static inline int valid_ib_mtu(unsigned int mtu) +{ + return mtu == 256 || mtu == 512 || + mtu == 1024 || mtu == 2048 || + mtu == 4096; +} + +static inline int valid_opa_max_mtu(unsigned int mtu) +{ + return mtu >= 2048 && + (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240); +} + +int set_mtu(struct hfi1_pportdata *ppd); + +int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc); +void hfi1_disable_after_error(struct hfi1_devdata *dd); +int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit); +int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encode); + +int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t); +int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t); + +void set_up_vau(struct hfi1_devdata *dd, u8 vau); +void set_up_vl15(struct hfi1_devdata *dd, u16 vl15buf); +void reset_link_credits(struct hfi1_devdata *dd); +void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu); + +int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc); + +static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd) +{ + return ppd->dd; +} + +static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev) +{ + return container_of(dev, struct hfi1_devdata, verbs_dev); +} + +static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev) +{ + return dd_from_dev(to_idev(ibdev)); +} + +static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp) +{ + return container_of(ibp, struct hfi1_pportdata, ibport_data); +} + +static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi) +{ + return container_of(rdi, struct hfi1_ibdev, rdi); +} + +static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + WARN_ON(pidx >= dd->num_pports); + return &dd->pport[pidx].ibport_data; +} + +static inline struct hfi1_ibport *rcd_to_iport(struct hfi1_ctxtdata *rcd) +{ + return &rcd->ppd->ibport_data; +} + +void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp); +static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt, + bool do_cnp) +{ + bool becn; + bool fecn; + + if (pkt->etype == RHF_RCV_TYPE_BYPASS) { + fecn = hfi1_16B_get_fecn(pkt->hdr); + becn = hfi1_16B_get_becn(pkt->hdr); + } else { + fecn = ib_bth_get_fecn(pkt->ohdr); + becn = ib_bth_get_becn(pkt->ohdr); + } + if (unlikely(fecn || becn)) { + hfi1_process_ecn_slowpath(qp, pkt, do_cnp); + return fecn; + } + return false; +} + +/* + * Return the indexed PKEY from the port PKEY table. 
+ */ +static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 ret; + + if (index >= ARRAY_SIZE(ppd->pkeys)) + ret = 0; + else + ret = ppd->pkeys[index]; + + return ret; +} + +/* + * Return the indexed GUID from the port GUIDs table. + */ +static inline __be64 get_sguid(struct hfi1_ibport *ibp, unsigned int index) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + WARN_ON(index >= HFI1_GUIDS_PER_PORT); + return cpu_to_be64(ppd->guids[index]); +} + +/* + * Called by readers of cc_state only, must call under rcu_read_lock(). + */ +static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd) +{ + return rcu_dereference(ppd->cc_state); +} + +/* + * Called by writers of cc_state only, must call under cc_state_lock. + */ +static inline +struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd) +{ + return rcu_dereference_protected(ppd->cc_state, + lockdep_is_held(&ppd->cc_state_lock)); +} + +/* + * values for dd->flags (_device_ related flags) + */ +#define HFI1_INITTED 0x1 /* chip and driver up and initted */ +#define HFI1_PRESENT 0x2 /* chip accesses can be done */ +#define HFI1_FROZEN 0x4 /* chip in SPC freeze */ +#define HFI1_HAS_SDMA_TIMEOUT 0x8 +#define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */ +#define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */ + +/* IB dword length mask in PBC (lower 11 bits); same for all chips */ +#define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1) + +/* ctxt_flag bit offsets */ + /* base context has not finished initializing */ +#define HFI1_CTXT_BASE_UNINIT 1 + /* base context initialization failed */ +#define HFI1_CTXT_BASE_FAILED 2 + /* waiting for a packet to arrive */ +#define HFI1_CTXT_WAITING_RCV 3 + /* waiting for an urgent packet to arrive */ +#define HFI1_CTXT_WAITING_URG 4 + +/* free up any allocated data at close */ +struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, + const struct pci_device_id *ent); +void hfi1_free_devdata(struct hfi1_devdata *dd); +struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra); + +/* LED beaconing functions */ +void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, + unsigned int timeoff); +void shutdown_led_override(struct hfi1_pportdata *ppd); + +#define HFI1_CREDIT_RETURN_RATE (100) + +/* + * The number of words for the KDETH protocol field. If this is + * larger than the actual field used, then part of the payload + * will be in the header. + * + * Optimally, we want this sized so that a typical case will + * use full cache lines. The typical local KDETH header would + * be: + * + * Bytes Field + * 8 LRH + * 12 BTH + * ?? KDETH + * 8 RHF + * --- + * 28 + KDETH + * + * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS + */ +#define DEFAULT_RCVHDRSIZE 9 + +/* + * Maximal header byte count: + * + * Bytes Field + * 8 LRH + * 40 GRH (optional) + * 12 BTH + * ?? KDETH + * 8 RHF + * --- + * 68 + KDETH + * + * We also want to maintain a cache line alignment to assist DMA'ing + * of the header bytes. Round up to a good size. 
+ */ +#define DEFAULT_RCVHDR_ENTSIZE 32 + +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, + u32 nlocked, u32 npages); +int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, + size_t npages, bool writable, struct page **pages); +void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, + size_t npages, bool dirty); + +static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd) +{ + *((u64 *)rcd->rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd) +{ + /* + * volatile because it's a DMA target from the chip, routine is + * inlined, and don't want register caching or reordering. + */ + return (u32)le64_to_cpu(*rcd->rcvhdrtail_kvaddr); +} + +/* + * sysfs interface. + */ + +extern const char ib_hfi1_version[]; + +int hfi1_device_create(struct hfi1_devdata *dd); +void hfi1_device_remove(struct hfi1_devdata *dd); + +int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj); +int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd); +void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd); +/* Hook for sysfs read of QSFP */ +int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len); + +int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent); +void hfi1_clean_up_interrupts(struct hfi1_devdata *dd); +void hfi1_pcie_cleanup(struct pci_dev *pdev); +int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev); +void hfi1_pcie_ddcleanup(struct hfi1_devdata *); +int pcie_speeds(struct hfi1_devdata *dd); +int request_msix(struct hfi1_devdata *dd, u32 msireq); +int restore_pci_variables(struct hfi1_devdata *dd); +int save_pci_variables(struct hfi1_devdata *dd); +int do_pcie_gen3_transition(struct hfi1_devdata *dd); +int parse_platform_config(struct hfi1_devdata *dd); +int get_platform_config_field(struct hfi1_devdata *dd, + enum platform_config_table_type_encoding + table_type, int table_index, int field_index, + u32 *data, u32 len); + +struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +static inline void flush_wc(void) +{ + asm volatile("sfence" : : : "memory"); +} + +void handle_eflags(struct hfi1_packet *packet); +int process_receive_ib(struct hfi1_packet *packet); +int process_receive_bypass(struct hfi1_packet *packet); +int process_receive_error(struct hfi1_packet *packet); +int kdeth_process_expected(struct hfi1_packet *packet); +int kdeth_process_eager(struct hfi1_packet *packet); +int process_receive_invalid(struct hfi1_packet *packet); +void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd); + +/* global module parameter variables */ +extern unsigned int hfi1_max_mtu; +extern unsigned int hfi1_cu; +extern unsigned int user_credit_return_threshold; +extern int num_user_contexts; +extern unsigned long n_krcvqs; +extern uint krcvqs[]; +extern int krcvqsset; +extern uint kdeth_qp; +extern uint loopback; +extern uint quick_linkup; +extern uint rcv_intr_timeout; +extern uint rcv_intr_count; +extern uint rcv_intr_dynamic; +extern ushort link_crc_mask; + +extern struct mutex hfi1_mutex; + +/* Number of seconds before our card status check... 
*/ +#define STATUS_TIMEOUT 60 + +#define DRIVER_NAME "hfi1" +#define HFI1_USER_MINOR_BASE 0 +#define HFI1_TRACE_MINOR 127 +#define HFI1_NMINORS 255 + +#define PCI_VENDOR_ID_INTEL 0x8086 +#define PCI_DEVICE_ID_INTEL0 0x24f0 +#define PCI_DEVICE_ID_INTEL1 0x24f1 + +#define HFI1_PKT_USER_SC_INTEGRITY \ + (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK) + +#define HFI1_PKT_KERNEL_SC_INTEGRITY \ + (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK) + +static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, + u16 ctxt_type) +{ + u64 base_sc_integrity; + + /* No integrity checks if HFI1_CAP_NO_INTEGRITY is set */ + if (HFI1_CAP_IS_KSET(NO_INTEGRITY)) + return 0; + + base_sc_integrity = + SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK; + + if (ctxt_type == SC_USER) + base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY; + else + base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; + + /* turn on send-side job key checks if !A0 */ + if (!is_ax(dd)) + base_sc_integrity |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + + return base_sc_integrity; +} + +static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) +{ + u64 base_sdma_integrity; + + /* No integrity checks if HFI1_CAP_NO_INTEGRITY is set */ + if (HFI1_CAP_IS_KSET(NO_INTEGRITY)) + return 0; + + base_sdma_integrity = + SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK; + + if (!HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)) + base_sdma_integrity |= + SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK; + + /* turn on send-side job key checks if !A0 */ + if (!is_ax(dd)) + base_sdma_integrity |= + SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + + return base_sdma_integrity; +} + +/* + * hfi1_early_err is used (only!) to print early errors before devdata is + * allocated, or when dd->pcidev may not be valid, and at the tail end of + * cleanup when devdata may have been freed, etc. 
hfi1_dev_porterr is + * the same as dd_dev_err, but is used when the message really needs + * the IB port# to be definitive as to what's happening.. + */ +#define hfi1_early_err(dev, fmt, ...) \ + dev_err(dev, fmt, ##__VA_ARGS__) + +#define hfi1_early_info(dev, fmt, ...) \ + dev_info(dev, fmt, ##__VA_ARGS__) + +#define dd_dev_emerg(dd, fmt, ...) \ + dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define dd_dev_err(dd, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define dd_dev_err_ratelimited(dd, fmt, ...) \ + dev_err_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) + +#define dd_dev_warn(dd, fmt, ...) \ + dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define dd_dev_warn_ratelimited(dd, fmt, ...) \ + dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) + +#define dd_dev_info(dd, fmt, ...) \ + dev_info(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define dd_dev_info_ratelimited(dd, fmt, ...) \ + dev_info_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) + +#define dd_dev_dbg(dd, fmt, ...) \ + dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define hfi1_dev_porterr(dd, port, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (port), ##__VA_ARGS__) + +/* + * this is used for formatting hw error messages... + */ +struct hfi1_hwerror_msgs { + u64 mask; + const char *msg; + size_t sz; +}; + +/* in intr.c... */ +void hfi1_format_hwerrors(u64 hwerrs, + const struct hfi1_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t lmsg); + +#define USER_OPCODE_CHECK_VAL 0xC0 +#define USER_OPCODE_CHECK_MASK 0xC0 +#define OPCODE_CHECK_VAL_DISABLED 0x0 +#define OPCODE_CHECK_MASK_DISABLED 0x0 + +static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + dd->z_int_counter = get_all_cpu_total(dd->int_counter); + dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit); + dd->z_send_schedule = get_all_cpu_total(dd->send_schedule); + + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->ibport_data.rvp.z_rc_acks = + get_all_cpu_total(ppd->ibport_data.rvp.rc_acks); + ppd->ibport_data.rvp.z_rc_qacks = + get_all_cpu_total(ppd->ibport_data.rvp.rc_qacks); + } +} + +/* Control LED state */ +static inline void setextled(struct hfi1_devdata *dd, u32 on) +{ + if (on) + write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F); + else + write_csr(dd, DCC_CFG_LED_CNTRL, 0x10); +} + +/* return the i2c resource given the target */ +static inline u32 i2c_target(u32 target) +{ + return target ? CR_I2C2 : CR_I2C1; +} + +/* return the i2c chain chip resource that this HFI uses for QSFP */ +static inline u32 qsfp_resource(struct hfi1_devdata *dd) +{ + return i2c_target(dd->hfi1_id); +} + +/* Is this device integrated or discrete? 
*/ +static inline bool is_integrated(struct hfi1_devdata *dd) +{ + return dd->pcidev->device == PCI_DEVICE_ID_INTEL1; +} + +int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); + +#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) +#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) + +static inline void hfi1_update_ah_attr(struct ib_device *ibdev, + struct rdma_ah_attr *attr) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u32 dlid = rdma_ah_get_dlid(attr); + + /* + * Kernel clients may not have setup GRH information + * Set that here. + */ + ibp = to_iport(ibdev, rdma_ah_get_port_num(attr)); + ppd = ppd_from_ibp(ibp); + if ((((dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))) && + (dlid != be32_to_cpu(OPA_LID_PERMISSIVE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)) && + (!(rdma_ah_get_ah_flags(attr) & IB_AH_GRH))) || + (rdma_ah_get_make_grd(attr))) { + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + rdma_ah_set_interface_id(attr, OPA_MAKE_ID(dlid)); + rdma_ah_set_subnet_prefix(attr, ibp->rvp.gid_prefix); + } +} + +/* + * hfi1_check_mcast- Check if the given lid is + * in the OPA multicast range. + * + * The LID might either reside in ah.dlid or might be + * in the GRH of the address handle as DGID if extended + * addresses are in use. + */ +static inline bool hfi1_check_mcast(u32 lid) +{ + return ((lid >= opa_get_mcast_base(OPA_MCAST_NR)) && + (lid != be32_to_cpu(OPA_LID_PERMISSIVE))); +} + +#define opa_get_lid(lid, format) \ + __opa_get_lid(lid, OPA_PORT_PACKET_FORMAT_##format) + +/* Convert a lid to a specific lid space */ +static inline u32 __opa_get_lid(u32 lid, u8 format) +{ + bool is_mcast = hfi1_check_mcast(lid); + + switch (format) { + case OPA_PORT_PACKET_FORMAT_8B: + case OPA_PORT_PACKET_FORMAT_10B: + if (is_mcast) + return (lid - opa_get_mcast_base(OPA_MCAST_NR) + + 0xF0000); + return lid & 0xFFFFF; + case OPA_PORT_PACKET_FORMAT_16B: + if (is_mcast) + return (lid - opa_get_mcast_base(OPA_MCAST_NR) + + 0xF00000); + return lid & 0xFFFFFF; + case OPA_PORT_PACKET_FORMAT_9B: + if (is_mcast) + return (lid - + opa_get_mcast_base(OPA_MCAST_NR) + + be16_to_cpu(IB_MULTICAST_LID_BASE)); + else + return lid & 0xFFFF; + default: + return lid; + } +} + +/* Return true if the given lid is the OPA 16B multicast range */ +static inline bool hfi1_is_16B_mcast(u32 lid) +{ + return ((lid >= + opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 16B)) && + (lid != opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))); +} + +static inline void hfi1_make_opa_lid(struct rdma_ah_attr *attr) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(attr); + u32 dlid = rdma_ah_get_dlid(attr); + + /* Modify ah_attr.dlid to be in the 32 bit LID space. 
+ * This is how the address will be laid out: + * Assuming MCAST_NR to be 4, + * 32 bit permissive LID = 0xFFFFFFFF + * Multicast LID range = 0xFFFFFFFE to 0xF0000000 + * Unicast LID range = 0xEFFFFFFF to 1 + * Invalid LID = 0 + */ + if (ib_is_opa_gid(&grh->dgid)) + dlid = opa_get_lid_from_gid(&grh->dgid); + else if ((dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)) && + (dlid != be32_to_cpu(OPA_LID_PERMISSIVE))) + dlid = dlid - be16_to_cpu(IB_MULTICAST_LID_BASE) + + opa_get_mcast_base(OPA_MCAST_NR); + else if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) + dlid = be32_to_cpu(OPA_LID_PERMISSIVE); + + rdma_ah_set_dlid(attr, dlid); +} + +static inline u8 hfi1_get_packet_type(u32 lid) +{ + /* 9B if lid > 0xF0000000 */ + if (lid >= opa_get_mcast_base(OPA_MCAST_NR)) + return HFI1_PKT_TYPE_9B; + + /* 16B if lid > 0xC000 */ + if (lid >= opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 9B)) + return HFI1_PKT_TYPE_16B; + + return HFI1_PKT_TYPE_9B; +} + +static inline bool hfi1_get_hdr_type(u32 lid, struct rdma_ah_attr *attr) +{ + /* + * If there was an incoming 16B packet with permissive + * LIDs, OPA GIDs would have been programmed when those + * packets were received. A 16B packet will have to + * be sent in response to that packet. Return a 16B + * header type if that's the case. + */ + if (rdma_ah_get_dlid(attr) == be32_to_cpu(OPA_LID_PERMISSIVE)) + return (ib_is_opa_gid(&rdma_ah_read_grh(attr)->dgid)) ? + HFI1_PKT_TYPE_16B : HFI1_PKT_TYPE_9B; + + /* + * Return a 16B header type if either the destination + * or source lid is extended. + */ + if (hfi1_get_packet_type(rdma_ah_get_dlid(attr)) == HFI1_PKT_TYPE_16B) + return HFI1_PKT_TYPE_16B; + + return hfi1_get_packet_type(lid); +} + +static inline void hfi1_make_ext_grh(struct hfi1_packet *packet, + struct ib_grh *grh, u32 slid, + u32 dlid) +{ + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (!ibp) + return; + + grh->hop_limit = 1; + grh->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; + if (slid == opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B)) + grh->sgid.global.interface_id = + OPA_MAKE_ID(be32_to_cpu(OPA_LID_PERMISSIVE)); + else + grh->sgid.global.interface_id = OPA_MAKE_ID(slid); + + /* + * Upper layers (like mad) may compare the dgid in the + * wc that is obtained here with the sgid_index in + * the wr. Since sgid_index in wr is always 0 for + * extended lids, set the dgid here to the default + * IB gid. 
+ */ + grh->dgid.global.subnet_prefix = ibp->rvp.gid_prefix; + grh->dgid.global.interface_id = + cpu_to_be64(ppd->guids[HFI1_PORT_GUID_INDEX]); +} + +static inline int hfi1_get_16b_padding(u32 hdr_size, u32 payload) +{ + return -(hdr_size + payload + (SIZE_OF_CRC << 2) + + SIZE_OF_LT) & 0x7; +} + +static inline void hfi1_make_ib_hdr(struct ib_header *hdr, + u16 lrh0, u16 len, + u16 dlid, u16 slid) +{ + hdr->lrh[0] = cpu_to_be16(lrh0); + hdr->lrh[1] = cpu_to_be16(dlid); + hdr->lrh[2] = cpu_to_be16(len); + hdr->lrh[3] = cpu_to_be16(slid); +} + +static inline void hfi1_make_16b_hdr(struct hfi1_16b_header *hdr, + u32 slid, u32 dlid, + u16 len, u16 pkey, + bool becn, bool fecn, u8 l4, + u8 sc) +{ + u32 lrh0 = 0; + u32 lrh1 = 0x40000000; + u32 lrh2 = 0; + u32 lrh3 = 0; + + lrh0 = (lrh0 & ~OPA_16B_BECN_MASK) | (becn << OPA_16B_BECN_SHIFT); + lrh0 = (lrh0 & ~OPA_16B_LEN_MASK) | (len << OPA_16B_LEN_SHIFT); + lrh0 = (lrh0 & ~OPA_16B_LID_MASK) | (slid & OPA_16B_LID_MASK); + lrh1 = (lrh1 & ~OPA_16B_FECN_MASK) | (fecn << OPA_16B_FECN_SHIFT); + lrh1 = (lrh1 & ~OPA_16B_SC_MASK) | (sc << OPA_16B_SC_SHIFT); + lrh1 = (lrh1 & ~OPA_16B_LID_MASK) | (dlid & OPA_16B_LID_MASK); + lrh2 = (lrh2 & ~OPA_16B_SLID_MASK) | + ((slid >> OPA_16B_SLID_SHIFT) << OPA_16B_SLID_HIGH_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_DLID_MASK) | + ((dlid >> OPA_16B_DLID_SHIFT) << OPA_16B_DLID_HIGH_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_PKEY_MASK) | ((u32)pkey << OPA_16B_PKEY_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_L4_MASK) | l4; + + hdr->lrh[0] = lrh0; + hdr->lrh[1] = lrh1; + hdr->lrh[2] = lrh2; + hdr->lrh[3] = lrh3; +} +#endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c new file mode 100644 index 0000000..6309edf --- /dev/null +++ b/drivers/infiniband/hw/hfi1/init.c @@ -0,0 +1,2092 @@ +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "device.h" +#include "common.h" +#include "trace.h" +#include "mad.h" +#include "sdma.h" +#include "debugfs.h" +#include "verbs.h" +#include "aspm.h" +#include "affinity.h" +#include "vnic.h" +#include "exp_rcv.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 +/* + * min buffers we want to have per context, after driver + */ +#define HFI1_MIN_USER_CTXT_BUFCNT 7 + +#define HFI1_MIN_HDRQ_EGRBUF_CNT 2 +#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352 +#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ +#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ + +/* + * Number of user receive contexts we are configured to use (to allow for more + * pio buffers per ctxt, etc.) Zero means use one user context per CPU. + */ +int num_user_contexts = -1; +module_param_named(num_user_contexts, num_user_contexts, int, 0444); +MODULE_PARM_DESC( + num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)"); + +uint krcvqs[RXE_NUM_DATA_VL]; +int krcvqsset; +module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO); +MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL"); + +/* computed based on above array */ +unsigned long n_krcvqs; + +static unsigned hfi1_rcvarr_split = 25; +module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO); +MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers"); + +static uint eager_buffer_size = (8 << 20); /* 8MB */ +module_param(eager_buffer_size, uint, S_IRUGO); +MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB"); + +static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */ +module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)"); + +static uint hfi1_hdrq_entsize = 32; +module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO); +MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B"); + +unsigned int user_credit_return_threshold = 33; /* default is 33% */ +module_param(user_credit_return_threshold, uint, S_IRUGO); +MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)"); + +static inline u64 encode_rcv_header_entry_size(u16 size); + +static struct idr hfi1_unit_table; + +static int hfi1_create_kctxt(struct hfi1_devdata *dd, + struct hfi1_pportdata *ppd) +{ + struct hfi1_ctxtdata *rcd; + int ret; + + /* Control context has to be always 0 */ + BUILD_BUG_ON(HFI1_CTRL_CTXT != 0); + + ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd); + if (ret < 0) { + dd_dev_err(dd, "Kernel receive context allocation failed\n"); + return 
ret; + } + + /* + * Set up the kernel context flags here and now because they use + * default values for all receive side memories. User contexts will + * be handled as they are created. + */ + rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | + HFI1_CAP_KGET(NODROP_RHQ_FULL) | + HFI1_CAP_KGET(NODROP_EGR_FULL) | + HFI1_CAP_KGET(DMA_RTAIL); + + /* Control context must use DMA_RTAIL */ + if (rcd->ctxt == HFI1_CTRL_CTXT) + rcd->flags |= HFI1_CAP_DMA_RTAIL; + rcd->seq_cnt = 1; + + rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); + if (!rcd->sc) { + dd_dev_err(dd, "Kernel send context allocation failed\n"); + return -ENOMEM; + } + hfi1_init_ctxt(rcd->sc); + + return 0; +} + +/* + * Create the receive context array and one or more kernel contexts + */ +int hfi1_create_kctxts(struct hfi1_devdata *dd) +{ + u16 i; + int ret; + + dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd), + GFP_KERNEL, dd->node); + if (!dd->rcd) + return -ENOMEM; + + for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { + ret = hfi1_create_kctxt(dd, dd->pport); + if (ret) + goto bail; + } + + return 0; +bail: + for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) + hfi1_free_ctxt(dd->rcd[i]); + + /* All the contexts should be freed, free the array */ + kfree(dd->rcd); + dd->rcd = NULL; + return ret; +} + +/* + * Helper routines for the receive context reference count (rcd and uctxt). + */ +static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd) +{ + kref_init(&rcd->kref); +} + +/** + * hfi1_rcd_free - When reference is zero clean up. + * @kref: pointer to an initialized rcd data structure + * + */ +static void hfi1_rcd_free(struct kref *kref) +{ + unsigned long flags; + struct hfi1_ctxtdata *rcd = + container_of(kref, struct hfi1_ctxtdata, kref); + + hfi1_free_ctxtdata(rcd->dd, rcd); + + spin_lock_irqsave(&rcd->dd->uctxt_lock, flags); + rcd->dd->rcd[rcd->ctxt] = NULL; + spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags); + + kfree(rcd); +} + +/** + * hfi1_rcd_put - decrement reference for rcd + * @rcd: pointer to an initialized rcd data structure + * + * Use this to put a reference after the init. + */ +int hfi1_rcd_put(struct hfi1_ctxtdata *rcd) +{ + if (rcd) + return kref_put(&rcd->kref, hfi1_rcd_free); + + return 0; +} + +/** + * hfi1_rcd_get - increment reference for rcd + * @rcd: pointer to an initialized rcd data structure + * + * Use this to get a reference after the init. + */ +void hfi1_rcd_get(struct hfi1_ctxtdata *rcd) +{ + kref_get(&rcd->kref); +} + +/** + * allocate_rcd_index - allocate an rcd index from the rcd array + * @dd: pointer to a valid devdata structure + * @rcd: rcd data structure to assign + * @index: pointer to index that is allocated + * + * Find an empty index in the rcd array, and assign the given rcd to it. + * If the array is full, we are EBUSY. 
+ * + */ +static int allocate_rcd_index(struct hfi1_devdata *dd, + struct hfi1_ctxtdata *rcd, u16 *index) +{ + unsigned long flags; + u16 ctxt; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++) + if (!dd->rcd[ctxt]) + break; + + if (ctxt < dd->num_rcv_contexts) { + rcd->ctxt = ctxt; + dd->rcd[ctxt] = rcd; + hfi1_rcd_init(rcd); + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + if (ctxt >= dd->num_rcv_contexts) + return -EBUSY; + + *index = ctxt; + + return 0; +} + +/** + * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the + * array + * @dd: pointer to a valid devdata structure + * @ctxt: the index of an possilbe rcd + * + * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given + * ctxt index is valid. + * + * The caller is responsible for making the _put(). + * + */ +struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd, + u16 ctxt) +{ + if (ctxt < dd->num_rcv_contexts) + return hfi1_rcd_get_by_index(dd, ctxt); + + return NULL; +} + +/** + * hfi1_rcd_get_by_index + * @dd: pointer to a valid devdata structure + * @ctxt: the index of an possilbe rcd + * + * We need to protect access to the rcd array. If access is needed to + * one or more index, get the protecting spinlock and then increment the + * kref. + * + * The caller is responsible for making the _put(). + * + */ +struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt) +{ + unsigned long flags; + struct hfi1_ctxtdata *rcd = NULL; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (dd->rcd[ctxt]) { + rcd = dd->rcd[ctxt]; + hfi1_rcd_get(rcd); + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + return rcd; +} + +/* + * Common code for user and kernel context create and setup. + * NOTE: the initial kref is done here (hf1_rcd_init()). + */ +int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, + struct hfi1_ctxtdata **context) +{ + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_ctxtdata *rcd; + unsigned kctxt_ngroups = 0; + u32 base; + + if (dd->rcv_entries.nctxt_extra > + dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt) + kctxt_ngroups = (dd->rcv_entries.nctxt_extra - + (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)); + rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa); + if (rcd) { + u32 rcvtids, max_entries; + u16 ctxt; + int ret; + + ret = allocate_rcd_index(dd, rcd, &ctxt); + if (ret) { + *context = NULL; + kfree(rcd); + return ret; + } + + INIT_LIST_HEAD(&rcd->qp_wait_list); + hfi1_exp_tid_group_init(&rcd->tid_group_list); + hfi1_exp_tid_group_init(&rcd->tid_used_list); + hfi1_exp_tid_group_init(&rcd->tid_full_list); + rcd->ppd = ppd; + rcd->dd = dd; + __set_bit(0, rcd->in_use_ctxts); + rcd->numa_id = numa; + rcd->rcv_array_groups = dd->rcv_entries.ngroups; + + mutex_init(&rcd->exp_lock); + + hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); + + /* + * Calculate the context's RcvArray entry starting point. + * We do this here because we have to take into account all + * the RcvArray entries that previous context would have + * taken and we have to account for any extra groups assigned + * to the static (kernel) or dynamic (vnic/user) contexts. 
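+ *
+ * Worked example with made-up numbers (ngroups = 8, group_size = 8,
+ * kctxt_ngroups = 2): kernel context 0 starts at group 0 with
+ * 8 + 1 groups, context 1 starts at group 9 with another 9 groups,
+ * and context 2, which gets no extra group, starts
+ * 2 + 2 * 8 = 18 groups in, i.e. eager_base = 18 * 8 = 144
+ * RcvArray entries.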
+ */ + if (ctxt < dd->first_dyn_alloc_ctxt) { + if (ctxt < kctxt_ngroups) { + base = ctxt * (dd->rcv_entries.ngroups + 1); + rcd->rcv_array_groups++; + } else { + base = kctxt_ngroups + + (ctxt * dd->rcv_entries.ngroups); + } + } else { + u16 ct = ctxt - dd->first_dyn_alloc_ctxt; + + base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) + + kctxt_ngroups); + if (ct < dd->rcv_entries.nctxt_extra) { + base += ct * (dd->rcv_entries.ngroups + 1); + rcd->rcv_array_groups++; + } else { + base += dd->rcv_entries.nctxt_extra + + (ct * dd->rcv_entries.ngroups); + } + } + rcd->eager_base = base * dd->rcv_entries.group_size; + + rcd->rcvhdrq_cnt = rcvhdrcnt; + rcd->rcvhdrqentsize = hfi1_hdrq_entsize; + /* + * Simple Eager buffer allocation: we have already pre-allocated + * the number of RcvArray entry groups. Each ctxtdata structure + * holds the number of groups for that context. + * + * To follow CSR requirements and maintain cacheline alignment, + * make sure all sizes and bases are multiples of group_size. + * + * The expected entry count is what is left after assigning + * eager. + */ + max_entries = rcd->rcv_array_groups * + dd->rcv_entries.group_size; + rcvtids = ((max_entries * hfi1_rcvarr_split) / 100); + rcd->egrbufs.count = round_down(rcvtids, + dd->rcv_entries.group_size); + if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) { + dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n", + rcd->ctxt); + rcd->egrbufs.count = MAX_EAGER_ENTRIES; + } + hfi1_cdbg(PROC, + "ctxt%u: max Eager buffer RcvArray entries: %u\n", + rcd->ctxt, rcd->egrbufs.count); + + /* + * Allocate array that will hold the eager buffer accounting + * data. + * This will allocate the maximum possible buffer count based + * on the value of the RcvArray split parameter. + * The resulting value will be rounded down to the closest + * multiple of dd->rcv_entries.group_size. + */ + rcd->egrbufs.buffers = + kcalloc_node(rcd->egrbufs.count, + sizeof(*rcd->egrbufs.buffers), + GFP_KERNEL, numa); + if (!rcd->egrbufs.buffers) + goto bail; + rcd->egrbufs.rcvtids = + kcalloc_node(rcd->egrbufs.count, + sizeof(*rcd->egrbufs.rcvtids), + GFP_KERNEL, numa); + if (!rcd->egrbufs.rcvtids) + goto bail; + rcd->egrbufs.size = eager_buffer_size; + /* + * The size of the buffers programmed into the RcvArray + * entries needs to be big enough to handle the highest + * MTU supported. + */ + if (rcd->egrbufs.size < hfi1_max_mtu) { + rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu); + hfi1_cdbg(PROC, + "ctxt%u: eager bufs size too small. Adjusting to %zu\n", + rcd->ctxt, rcd->egrbufs.size); + } + rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE; + + /* Applicable only for statically created kernel contexts */ + if (ctxt < dd->first_dyn_alloc_ctxt) { + rcd->opstats = kzalloc_node(sizeof(*rcd->opstats), + GFP_KERNEL, numa); + if (!rcd->opstats) + goto bail; + } + + *context = rcd; + return 0; + } + +bail: + *context = NULL; + hfi1_free_ctxt(rcd); + return -ENOMEM; +} + +/** + * hfi1_free_ctxt + * @rcd: pointer to an initialized rcd data structure + * + * This wrapper is the free function that matches hfi1_create_ctxtdata(). + * When a context is done being used (kernel or user), this function is called + * for the "final" put to match the kref init from hf1i_create_ctxtdata(). + * Other users of the context do a get/put sequence to make sure that the + * structure isn't removed while in use. 
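+ *
+ * A minimal sketch of that get/put sequence, mirroring how the
+ * loops in this file use it (not a new interface):
+ *
+ *	rcd = hfi1_rcd_get_by_index(dd, i);
+ *	if (rcd) {
+ *		... use rcd ...
+ *		hfi1_rcd_put(rcd);
+ *	}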
+ */ +void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd) +{ + hfi1_rcd_put(rcd); +} + +/* + * Convert a receive header entry size that to the encoding used in the CSR. + * + * Return a zero if the given size is invalid. + */ +static inline u64 encode_rcv_header_entry_size(u16 size) +{ + /* there are only 3 valid receive header entry sizes */ + if (size == 2) + return 1; + if (size == 16) + return 2; + else if (size == 32) + return 4; + return 0; /* invalid */ +} + +/* + * Select the largest ccti value over all SLs to determine the intra- + * packet gap for the link. + * + * called with cca_timer_lock held (to protect access to cca_timer + * array), and rcu_read_lock() (to protect access to cc_state). + */ +void set_link_ipg(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + struct cc_state *cc_state; + int i; + u16 cce, ccti_limit, max_ccti = 0; + u16 shift, mult; + u64 src; + u32 current_egress_rate; /* Mbits /sec */ + u32 max_pkt_time; + /* + * max_pkt_time is the maximum packet egress time in units + * of the fabric clock period 1/(805 MHz). + */ + + cc_state = get_cc_state(ppd); + + if (!cc_state) + /* + * This should _never_ happen - rcu_read_lock() is held, + * and set_link_ipg() should not be called if cc_state + * is NULL. + */ + return; + + for (i = 0; i < OPA_MAX_SLS; i++) { + u16 ccti = ppd->cca_timer[i].ccti; + + if (ccti > max_ccti) + max_ccti = ccti; + } + + ccti_limit = cc_state->cct.ccti_limit; + if (max_ccti > ccti_limit) + max_ccti = ccti_limit; + + cce = cc_state->cct.entries[max_ccti].entry; + shift = (cce & 0xc000) >> 14; + mult = (cce & 0x3fff); + + current_egress_rate = active_egress_rate(ppd); + + max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate); + + src = (max_pkt_time >> shift) * mult; + + src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK; + src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT; + + write_csr(dd, SEND_STATIC_RATE_CONTROL, src); +} + +static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) +{ + struct cca_timer *cca_timer; + struct hfi1_pportdata *ppd; + int sl; + u16 ccti_timer, ccti_min; + struct cc_state *cc_state; + unsigned long flags; + enum hrtimer_restart ret = HRTIMER_NORESTART; + + cca_timer = container_of(t, struct cca_timer, hrtimer); + ppd = cca_timer->ppd; + sl = cca_timer->sl; + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (!cc_state) { + rcu_read_unlock(); + return HRTIMER_NORESTART; + } + + /* + * 1) decrement ccti for SL + * 2) calculate IPG for link (set_link_ipg()) + * 3) restart timer, unless ccti is at min value + */ + + ccti_min = cc_state->cong_setting.entries[sl].ccti_min; + ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; + + spin_lock_irqsave(&ppd->cca_timer_lock, flags); + + if (cca_timer->ccti > ccti_min) { + cca_timer->ccti--; + set_link_ipg(ppd); + } + + if (cca_timer->ccti > ccti_min) { + unsigned long nsec = 1024 * ccti_timer; + /* ccti_timer is in units of 1.024 usec */ + hrtimer_forward_now(t, ns_to_ktime(nsec)); + ret = HRTIMER_RESTART; + } + + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + rcu_read_unlock(); + return ret; +} + +/* + * Common code for initializing the physical port structure. 
+ */ +void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, + struct hfi1_devdata *dd, u8 hw_pidx, u8 port) +{ + int i; + uint default_pkey_idx; + struct cc_state *cc_state; + + ppd->dd = dd; + ppd->hw_pidx = hw_pidx; + ppd->port = port; /* IB port number, not index */ + ppd->prev_link_width = LINK_WIDTH_DEFAULT; + /* + * There are C_VL_COUNT number of PortVLXmitWait counters. + * Adding 1 to C_VL_COUNT to include the PortXmitWait counter. + */ + for (i = 0; i < C_VL_COUNT + 1; i++) { + ppd->port_vl_xmit_wait_last[i] = 0; + ppd->vl_xmit_flit_cnt[i] = 0; + } + + default_pkey_idx = 1; + + ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY; + ppd->part_enforce |= HFI1_PART_ENFORCE_IN; + + if (loopback) { + hfi1_early_err(&pdev->dev, + "Faking data partition 0x8001 in idx %u\n", + !default_pkey_idx); + ppd->pkeys[!default_pkey_idx] = 0x8001; + } + + INIT_WORK(&ppd->link_vc_work, handle_verify_cap); + INIT_WORK(&ppd->link_up_work, handle_link_up); + INIT_WORK(&ppd->link_down_work, handle_link_down); + INIT_WORK(&ppd->freeze_work, handle_freeze); + INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); + INIT_WORK(&ppd->sma_message_work, handle_sma_message); + INIT_WORK(&ppd->link_bounce_work, handle_link_bounce); + INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link); + INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work); + INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event); + + mutex_init(&ppd->hls_lock); + spin_lock_init(&ppd->qsfp_info.qsfp_lock); + + ppd->qsfp_info.ppd = ppd; + ppd->sm_trap_qp = 0x0; + ppd->sa_qp = 0x1; + + ppd->hfi1_wq = NULL; + + spin_lock_init(&ppd->cca_timer_lock); + + for (i = 0; i < OPA_MAX_SLS; i++) { + hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + ppd->cca_timer[i].ppd = ppd; + ppd->cca_timer[i].sl = i; + ppd->cca_timer[i].ccti = 0; + ppd->cca_timer[i].hrtimer.function = cca_timer_fn; + } + + ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; + + spin_lock_init(&ppd->cc_state_lock); + spin_lock_init(&ppd->cc_log_lock); + cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL); + RCU_INIT_POINTER(ppd->cc_state, cc_state); + if (!cc_state) + goto bail; + return; + +bail: + + hfi1_early_err(&pdev->dev, + "Congestion Control Agent disabled for port %d\n", port); +} + +/* + * Do initialization for device that is only needed on + * first detect, not on resets. + */ +static int loadtime_init(struct hfi1_devdata *dd) +{ + return 0; +} + +/** + * init_after_reset - re-initialize after a reset + * @dd: the hfi1_ib device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explicitly, in case reset + * failed + */ +static int init_after_reset(struct hfi1_devdata *dd) +{ + int i; + struct hfi1_ctxtdata *rcd; + /* + * Ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize. This is mostly + * for the driver data structures, not chip registers. 
+ */ + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS, rcd); + hfi1_rcd_put(rcd); + } + pio_send_control(dd, PSC_GLOBAL_DISABLE); + for (i = 0; i < dd->num_send_contexts; i++) + sc_disable(dd->send_contexts[i].sc); + + return 0; +} + +static void enable_chip(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + u32 rcvmask; + u16 i; + + /* enable PIO send */ + pio_send_control(dd, PSC_GLOBAL_ENABLE); + + /* + * Enable kernel ctxts' receive and receive interrupt. + * Other ctxts done as user opens and initializes them. + */ + for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { + rcd = hfi1_rcd_get_by_index(dd, i); + if (!rcd) + continue; + rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; + rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? + HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; + if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) + rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL)) + rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) + rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + hfi1_rcvctrl(dd, rcvmask, rcd); + sc_enable(rcd->sc); + hfi1_rcd_put(rcd); + } +} + +/** + * create_workqueues - create per port workqueues + * @dd: the hfi1_ib device + */ +static int create_workqueues(struct hfi1_devdata *dd) +{ + int pidx; + struct hfi1_pportdata *ppd; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (!ppd->hfi1_wq) { + ppd->hfi1_wq = + alloc_workqueue( + "hfi%d_%d", + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE, + HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES, + dd->unit, pidx); + if (!ppd->hfi1_wq) + goto wq_error; + } + if (!ppd->link_wq) { + /* + * Make the link workqueue single-threaded to enforce + * serialization. + */ + ppd->link_wq = + alloc_workqueue( + "hfi_link_%d_%d", + WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND, + 1, /* max_active */ + dd->unit, pidx); + if (!ppd->link_wq) + goto wq_error; + } + } + return 0; +wq_error: + pr_err("alloc_workqueue failed for port %d\n", pidx + 1); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } + } + return -ENOMEM; +} + +/** + * hfi1_init - do the actual initialization sequence on the chip + * @dd: the hfi1_ib device + * @reinit: re-initializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. 
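+ *
+ * For reference, the two call patterns in this driver are:
+ *
+ *	hfi1_init(dd, 0);	first-time init from the PCI probe path
+ *	hfi1_init(dd, 1);	re-init after a reset, no new allocations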
+ */ +int hfi1_init(struct hfi1_devdata *dd, int reinit) +{ + int ret = 0, pidx, lastfail = 0; + unsigned long len; + u16 i; + struct hfi1_ctxtdata *rcd; + struct hfi1_pportdata *ppd; + + /* Set up recv low level handlers */ + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] = + kdeth_process_expected; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] = + kdeth_process_eager; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] = + process_receive_error; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] = + process_receive_bypass; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] = + process_receive_invalid; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] = + process_receive_invalid; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] = + process_receive_invalid; + dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions; + + /* Set up send low level handlers */ + dd->process_pio_send = hfi1_verbs_send_pio; + dd->process_dma_send = hfi1_verbs_send_dma; + dd->pio_inline_send = pio_copy; + dd->process_vnic_dma_send = hfi1_vnic_send_dma; + + if (is_ax(dd)) { + atomic_set(&dd->drop_packet, DROP_PACKET_ON); + dd->do_drop = 1; + } else { + atomic_set(&dd->drop_packet, DROP_PACKET_OFF); + dd->do_drop = 0; + } + + /* make sure the link is not "up" */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + ppd->linkup = 0; + } + + if (reinit) + ret = init_after_reset(dd); + else + ret = loadtime_init(dd); + if (ret) + goto done; + + /* allocate dummy tail memory for all receive contexts */ + dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent( + &dd->pcidev->dev, sizeof(u64), + &dd->rcvhdrtail_dummy_dma, + GFP_KERNEL); + + if (!dd->rcvhdrtail_dummy_kvaddr) { + dd_dev_err(dd, "cannot allocate dummy tail memory\n"); + ret = -ENOMEM; + goto done; + } + + /* dd->rcd can be NULL if early initialization failed */ + for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) { + /* + * Set up the (kernel) rcvhdr queue and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of ctxt 0 ctxtdata as well. + */ + rcd = hfi1_rcd_get_by_index(dd, i); + if (!rcd) + continue; + + rcd->do_interrupt = &handle_receive_interrupt; + + lastfail = hfi1_create_rcvhdrq(dd, rcd); + if (!lastfail) + lastfail = hfi1_setup_eagerbufs(rcd); + if (lastfail) { + dd_dev_err(dd, + "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); + ret = lastfail; + } + hfi1_rcd_put(rcd); + } + + /* Allocate enough memory for user event notification. */ + len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS * + sizeof(*dd->events)); + dd->events = vmalloc_user(len); + if (!dd->events) + dd_dev_err(dd, "Failed to allocate user events page\n"); + /* + * Allocate a page for device and port status. + * Page will be shared amongst all user processes. + */ + dd->status = vmalloc_user(PAGE_SIZE); + if (!dd->status) + dd_dev_err(dd, "Failed to allocate dev status page\n"); + else + dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) - + sizeof(dd->status->freezemsg)); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (dd->status) + /* Currently, we only have one port */ + ppd->statusp = &dd->status->port; + + set_mtu(ppd); + } + + /* enable chip even if we have an error, so we can debug cause */ + enable_chip(dd); + +done: + /* + * Set status even if port serdes is not initialized + * so that diags will work. 
+ */ + if (dd->status) + dd->status->dev |= HFI1_STATUS_CHIP_PRESENT | + HFI1_STATUS_INITTED; + if (!ret) { + /* enable all interrupts from the chip */ + set_intr_state(dd, 1); + + /* chip is OK for user apps; mark it as initialized */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* + * start the serdes - must be after interrupts are + * enabled so we are notified when the link goes up + */ + lastfail = bringup_serdes(ppd); + if (lastfail) + dd_dev_info(dd, + "Failed to bring up port %u\n", + ppd->port); + + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + if (ppd->statusp) + *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT | + HFI1_STATUS_INITTED; + if (!ppd->link_speed_enabled) + continue; + } + } + + /* if ret is non-zero, we probably should do some cleanup here... */ + return ret; +} + +static inline struct hfi1_devdata *__hfi1_lookup(int unit) +{ + return idr_find(&hfi1_unit_table, unit); +} + +struct hfi1_devdata *hfi1_lookup(int unit) +{ + struct hfi1_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + dd = __hfi1_lookup(unit); + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + + return dd; +} + +/* + * Stop the timers during unit shutdown, or after an error late + * in initialization. + */ +static void stop_timers(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int pidx; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->led_override_timer.function) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + } +} + +/** + * shutdown_device - shut down a device + * @dd: the hfi1_ib device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by hfi1_init(dd, 1) + */ +static void shutdown_device(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ctxtdata *rcd; + unsigned pidx; + int i; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + ppd->linkup = 0; + if (ppd->statusp) + *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | + HFI1_STATUS_IB_READY); + } + dd->flags &= ~HFI1_INITTED; + + /* mask and clean up interrupts, but not errors */ + set_intr_state(dd, 0); + hfi1_clean_up_interrupts(dd); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + for (i = 0; i < dd->num_rcv_contexts; i++) { + rcd = hfi1_rcd_get_by_index(dd, i); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS | + HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_PKEY_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd); + hfi1_rcd_put(rcd); + } + /* + * Gracefully stop all sends allowing any in progress to + * trickle out first. + */ + for (i = 0; i < dd->num_send_contexts; i++) + sc_flush(dd->send_contexts[i].sc); + } + + /* + * Enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(20); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* disable all contexts */ + for (i = 0; i < dd->num_send_contexts; i++) + sc_disable(dd->send_contexts[i].sc); + /* disable the send device */ + pio_send_control(dd, PSC_GLOBAL_DISABLE); + + shutdown_led_override(ppd); + + /* + * Clear SerdesEnable. + * We can't count on interrupts since we are stopping. 
+ */ + hfi1_quiet_serdes(ppd); + + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } + } + sdma_exit(dd); +} + +/** + * hfi1_free_ctxtdata - free a context's allocated data + * @dd: the hfi1_ib device + * @rcd: the ctxtdata structure + * + * free up any allocated data for a context + * It should never change any chip state, or global driver state. + */ +void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + u32 e; + + if (!rcd) + return; + + if (rcd->rcvhdrq) { + dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, + rcd->rcvhdrq, rcd->rcvhdrq_dma); + rcd->rcvhdrq = NULL; + if (rcd->rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *)rcd->rcvhdrtail_kvaddr, + rcd->rcvhdrqtailaddr_dma); + rcd->rcvhdrtail_kvaddr = NULL; + } + } + + /* all the RcvArray entries should have been cleared by now */ + kfree(rcd->egrbufs.rcvtids); + rcd->egrbufs.rcvtids = NULL; + + for (e = 0; e < rcd->egrbufs.alloced; e++) { + if (rcd->egrbufs.buffers[e].dma) + dma_free_coherent(&dd->pcidev->dev, + rcd->egrbufs.buffers[e].len, + rcd->egrbufs.buffers[e].addr, + rcd->egrbufs.buffers[e].dma); + } + kfree(rcd->egrbufs.buffers); + rcd->egrbufs.alloced = 0; + rcd->egrbufs.buffers = NULL; + + sc_free(rcd->sc); + rcd->sc = NULL; + + vfree(rcd->subctxt_uregbase); + vfree(rcd->subctxt_rcvegrbuf); + vfree(rcd->subctxt_rcvhdr_base); + kfree(rcd->opstats); + + rcd->subctxt_uregbase = NULL; + rcd->subctxt_rcvegrbuf = NULL; + rcd->subctxt_rcvhdr_base = NULL; + rcd->opstats = NULL; +} + +/* + * Release our hold on the shared asic data. If we are the last one, + * return the structure to be finalized outside the lock. Must be + * holding hfi1_devs_lock. + */ +static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd) +{ + struct hfi1_asic_data *ad; + int other; + + if (!dd->asic_data) + return NULL; + dd->asic_data->dds[dd->hfi1_id] = NULL; + other = dd->hfi1_id ? 0 : 1; + ad = dd->asic_data; + dd->asic_data = NULL; + /* return NULL if the other dd still has a link */ + return ad->dds[other] ? NULL : ad; +} + +static void finalize_asic_data(struct hfi1_devdata *dd, + struct hfi1_asic_data *ad) +{ + clean_up_i2c(dd, ad); + kfree(ad); +} + +/** + * hfi1_clean_devdata - cleans up per-unit data structure + * @dd: pointer to a valid devdata structure + * + * It cleans up all data structures set up by + * by hfi1_alloc_devdata(). 
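+ *
+ * It is reached through the kobject release path, roughly:
+ *
+ *	hfi1_free_devdata(dd)
+ *	  kobject_put(&dd->kobj)
+ *	    __hfi1_free_devdata(kobj)
+ *	      hfi1_clean_devdata(dd)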
+ */ +static void hfi1_clean_devdata(struct hfi1_devdata *dd) +{ + struct hfi1_asic_data *ad; + unsigned long flags; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + if (!list_empty(&dd->list)) { + idr_remove(&hfi1_unit_table, dd->unit); + list_del_init(&dd->list); + } + ad = release_asic_data(dd); + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + + finalize_asic_data(dd, ad); + free_platform_config(dd); + rcu_barrier(); /* wait for rcu callbacks to complete */ + free_percpu(dd->int_counter); + free_percpu(dd->rcv_limit); + free_percpu(dd->send_schedule); + free_percpu(dd->tx_opstats); + dd->int_counter = NULL; + dd->rcv_limit = NULL; + dd->send_schedule = NULL; + dd->tx_opstats = NULL; + sdma_clean(dd, dd->num_sdma); + rvt_dealloc_device(&dd->verbs_dev.rdi); +} + +static void __hfi1_free_devdata(struct kobject *kobj) +{ + struct hfi1_devdata *dd = + container_of(kobj, struct hfi1_devdata, kobj); + + hfi1_clean_devdata(dd); +} + +static struct kobj_type hfi1_devdata_type = { + .release = __hfi1_free_devdata, +}; + +void hfi1_free_devdata(struct hfi1_devdata *dd) +{ + kobject_put(&dd->kobj); +} + +/* + * Allocate our primary per-unit data structure. Must be done via verbs + * allocator, because the verbs cleanup process both does cleanup and + * free of the data structure. + * "extra" is for chip-specific data. + * + * Use the idr mechanism to get a unit number for this unit. + */ +struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) +{ + unsigned long flags; + struct hfi1_devdata *dd; + int ret, nports; + + /* extra is * number of ports */ + nports = extra / sizeof(struct hfi1_pportdata); + + dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra, + nports); + if (!dd) + return ERR_PTR(-ENOMEM); + dd->num_pports = nports; + dd->pport = (struct hfi1_pportdata *)(dd + 1); + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + INIT_LIST_HEAD(&dd->list); + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&hfi1_devs_lock, flags); + + ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + dd->unit = ret; + list_add(&dd->list, &hfi1_dev_list); + } + + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + idr_preload_end(); + + if (ret < 0) { + hfi1_early_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); + goto bail; + } + rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); + + /* + * Initialize all locks for the device. This needs to be as early as + * possible so locks are usable. 
+ */ + spin_lock_init(&dd->sc_lock); + spin_lock_init(&dd->sendctrl_lock); + spin_lock_init(&dd->rcvctrl_lock); + spin_lock_init(&dd->uctxt_lock); + spin_lock_init(&dd->hfi1_diag_trans_lock); + spin_lock_init(&dd->sc_init_lock); + spin_lock_init(&dd->dc8051_memlock); + seqlock_init(&dd->sc2vl_lock); + spin_lock_init(&dd->sde_map_lock); + spin_lock_init(&dd->pio_map_lock); + mutex_init(&dd->dc8051_lock); + init_waitqueue_head(&dd->event_queue); + + dd->int_counter = alloc_percpu(u64); + if (!dd->int_counter) { + ret = -ENOMEM; + goto bail; + } + + dd->rcv_limit = alloc_percpu(u64); + if (!dd->rcv_limit) { + ret = -ENOMEM; + goto bail; + } + + dd->send_schedule = alloc_percpu(u64); + if (!dd->send_schedule) { + ret = -ENOMEM; + goto bail; + } + + dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx); + if (!dd->tx_opstats) { + ret = -ENOMEM; + goto bail; + } + + kobject_init(&dd->kobj, &hfi1_devdata_type); + return dd; + +bail: + hfi1_clean_devdata(dd); + return ERR_PTR(ret); +} + +/* + * Called from freeze mode handlers, and from PCI error + * reporting code. Should be paranoid about state of + * system and data structures. + */ +void hfi1_disable_after_error(struct hfi1_devdata *dd) +{ + if (dd->flags & HFI1_INITTED) { + u32 pidx; + + dd->flags &= ~HFI1_INITTED; + if (dd->pport) + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct hfi1_pportdata *ppd; + + ppd = dd->pport + pidx; + if (dd->flags & HFI1_PRESENT) + set_link_state(ppd, HLS_DN_DISABLE); + + if (ppd->statusp) + *ppd->statusp &= ~HFI1_STATUS_IB_READY; + } + } + + /* + * Mark as having had an error for driver, and also + * for /sys and status word mapped to user programs. + * This marks unit as not usable, until reset. + */ + if (dd->status) + dd->status->dev |= HFI1_STATUS_HWERROR; +} + +static void remove_one(struct pci_dev *); +static int init_one(struct pci_dev *, const struct pci_device_id *); + +#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: " +#define PFX DRIVER_NAME ": " + +const struct pci_device_id hfi1_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl); + +static struct pci_driver hfi1_pci_driver = { + .name = DRIVER_NAME, + .probe = init_one, + .remove = remove_one, + .id_table = hfi1_pci_tbl, + .err_handler = &hfi1_pci_err_handler, +}; + +static void __init compute_krcvqs(void) +{ + int i; + + for (i = 0; i < krcvqsset; i++) + n_krcvqs += krcvqs[i]; +} + +/* + * Do all the generic driver unit- and chip-independent memory + * allocation and initialization. 
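+ *
+ * Several of the module parameters declared above are sanitized
+ * here; a hypothetical load line using them would be:
+ *
+ *	modprobe hfi1 krcvqs=2,2,2 num_user_contexts=8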
+ */ +static int __init hfi1_mod_init(void) +{ + int ret; + + ret = dev_init(); + if (ret) + goto bail; + + ret = node_affinity_init(); + if (ret) + goto bail; + + /* validate max MTU before any devices start */ + if (!valid_opa_max_mtu(hfi1_max_mtu)) { + pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n", + hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU); + hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; + } + /* valid CUs run from 1-128 in powers of 2 */ + if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu)) + hfi1_cu = 1; + /* valid credit return threshold is 0-100, variable is unsigned */ + if (user_credit_return_threshold > 100) + user_credit_return_threshold = 100; + + compute_krcvqs(); + /* + * sanitize receive interrupt count, time must wait until after + * the hardware type is known + */ + if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK) + rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK; + /* reject invalid combinations */ + if (rcv_intr_count == 0 && rcv_intr_timeout == 0) { + pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n"); + rcv_intr_count = 1; + } + if (rcv_intr_count > 1 && rcv_intr_timeout == 0) { + /* + * Avoid indefinite packet delivery by requiring a timeout + * if count is > 1. + */ + pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n"); + rcv_intr_timeout = 1; + } + if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) { + /* + * The dynamic algorithm expects a non-zero timeout + * and a count > 1. + */ + pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n"); + rcv_intr_dynamic = 0; + } + + /* sanitize link CRC options */ + link_crc_mask &= SUPPORTED_CRCS; + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&hfi1_unit_table); + + hfi1_dbg_init(); + ret = hfi1_wss_init(); + if (ret < 0) + goto bail_wss; + ret = pci_register_driver(&hfi1_pci_driver); + if (ret < 0) { + pr_err("Unable to register driver: error %d\n", -ret); + goto bail_dev; + } + goto bail; /* all OK */ + +bail_dev: + hfi1_wss_exit(); +bail_wss: + hfi1_dbg_exit(); + idr_destroy(&hfi1_unit_table); + dev_cleanup(); +bail: + return ret; +} + +module_init(hfi1_mod_init); + +/* + * Do the non-unit driver cleanup, memory free, etc. at unload. 
+ */ +static void __exit hfi1_mod_cleanup(void) +{ + pci_unregister_driver(&hfi1_pci_driver); + node_affinity_destroy(); + hfi1_wss_exit(); + hfi1_dbg_exit(); + + idr_destroy(&hfi1_unit_table); + dispose_firmware(); /* asymmetric with obtain_firmware() */ + dev_cleanup(); +} + +module_exit(hfi1_mod_cleanup); + +/* this can only be called after a successful initialization */ +static void cleanup_device_data(struct hfi1_devdata *dd) +{ + int ctxt; + int pidx; + + /* users can't do anything more with chip */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct hfi1_pportdata *ppd = &dd->pport[pidx]; + struct cc_state *cc_state; + int i; + + if (ppd->statusp) + *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT; + + for (i = 0; i < OPA_MAX_SLS; i++) + hrtimer_cancel(&ppd->cca_timer[i].hrtimer); + + spin_lock(&ppd->cc_state_lock); + cc_state = get_cc_state_protected(ppd); + RCU_INIT_POINTER(ppd->cc_state, NULL); + spin_unlock(&ppd->cc_state_lock); + + if (cc_state) + kfree_rcu(cc_state, rcu); + } + + free_credit_return(dd); + + if (dd->rcvhdrtail_dummy_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, sizeof(u64), + (void *)dd->rcvhdrtail_dummy_kvaddr, + dd->rcvhdrtail_dummy_dma); + dd->rcvhdrtail_dummy_kvaddr = NULL; + } + + /* + * Free any resources still in use (usually just kernel contexts) + * at unload; we do for ctxtcnt, because that's what we allocate. + */ + for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) { + struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; + + if (rcd) { + hfi1_clear_tids(rcd); + hfi1_free_ctxt(rcd); + } + } + + kfree(dd->rcd); + dd->rcd = NULL; + + free_pio_map(dd); + /* must follow rcv context free - need to remove rcv's hooks */ + for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++) + sc_free(dd->send_contexts[ctxt].sc); + dd->num_send_contexts = 0; + kfree(dd->send_contexts); + dd->send_contexts = NULL; + kfree(dd->hw_to_sw); + dd->hw_to_sw = NULL; + kfree(dd->boardname); + vfree(dd->events); + vfree(dd->status); +} + +/* + * Clean up on unit shutdown, or error during unit load after + * successful initialization. 
+ */ +static void postinit_cleanup(struct hfi1_devdata *dd) +{ + hfi1_start_cleanup(dd); + + hfi1_pcie_ddcleanup(dd); + hfi1_pcie_cleanup(dd->pcidev); + + cleanup_device_data(dd); + + hfi1_free_devdata(dd); +} + +static int init_validate_rcvhdrcnt(struct device *dev, uint thecnt) +{ + if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { + hfi1_early_err(dev, "Receive header queue count too small\n"); + return -EINVAL; + } + + if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { + hfi1_early_err(dev, + "Receive header queue count cannot be greater than %u\n", + HFI1_MAX_HDRQ_EGRBUF_CNT); + return -EINVAL; + } + + if (thecnt % HDRQ_INCREMENT) { + hfi1_early_err(dev, "Receive header queue count %d must be divisible by %lu\n", + thecnt, HDRQ_INCREMENT); + return -EINVAL; + } + + return 0; +} + +static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret = 0, j, pidx, initfail; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + + /* First, lock the non-writable module parameters */ + HFI1_CAP_LOCK(); + + /* Validate dev ids */ + if (!(ent->device == PCI_DEVICE_ID_INTEL0 || + ent->device == PCI_DEVICE_ID_INTEL1)) { + hfi1_early_err(&pdev->dev, + "Failing on unknown Intel deviceid 0x%x\n", + ent->device); + ret = -ENODEV; + goto bail; + } + + /* Validate some global module parameters */ + ret = init_validate_rcvhdrcnt(&pdev->dev, rcvhdrcnt); + if (ret) + goto bail; + + /* use the encoding function as a sanitization check */ + if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { + hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n", + hfi1_hdrq_entsize); + ret = -EINVAL; + goto bail; + } + + /* The receive eager buffer size must be set before the receive + * contexts are created. + * + * Set the eager buffer size. Validate that it falls in a range + * allowed by the hardware - all powers of 2 between the min and + * max. The maximum valid MTU is within the eager buffer range + * so we do not need to cap the max_mtu by an eager buffer size + * setting. + */ + if (eager_buffer_size) { + if (!is_power_of_2(eager_buffer_size)) + eager_buffer_size = + roundup_pow_of_two(eager_buffer_size); + eager_buffer_size = + clamp_val(eager_buffer_size, + MIN_EAGER_BUFFER * 8, + MAX_EAGER_BUFFER_TOTAL); + hfi1_early_info(&pdev->dev, "Eager buffer size %u\n", + eager_buffer_size); + } else { + hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n"); + ret = -EINVAL; + goto bail; + } + + /* restrict value of hfi1_rcvarr_split */ + hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); + + ret = hfi1_pcie_init(pdev, ent); + if (ret) + goto bail; + + /* + * Do device-specific initialization, function table setup, dd + * allocation, etc. + */ + dd = hfi1_init_dd(pdev, ent); + + if (IS_ERR(dd)) { + ret = PTR_ERR(dd); + goto clean_bail; /* error already printed */ + } + + ret = create_workqueues(dd); + if (ret) + goto clean_bail; + + /* do the generic initialization */ + initfail = hfi1_init(dd, 0); + + /* setup vnic */ + hfi1_vnic_setup(dd); + + ret = hfi1_register_ib_device(dd); + + /* + * Now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. If earlier failure, + * we still create devices, so diags, etc. can be used + * to determine cause of problem. 
+ */ + if (!initfail && !ret) { + dd->flags |= HFI1_INITTED; + /* create debufs files after init and ib register */ + hfi1_dbg_ibdev_init(&dd->verbs_dev); + } + + j = hfi1_device_create(dd); + if (j) + dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j); + + if (initfail || ret) { + hfi1_clean_up_interrupts(dd); + stop_timers(dd); + flush_workqueue(ib_wq); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + hfi1_quiet_serdes(dd->pport + pidx); + ppd = dd->pport + pidx; + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + if (ppd->link_wq) { + destroy_workqueue(ppd->link_wq); + ppd->link_wq = NULL; + } + } + if (!j) + hfi1_device_remove(dd); + if (!ret) + hfi1_unregister_ib_device(dd); + hfi1_vnic_cleanup(dd); + postinit_cleanup(dd); + if (initfail) + ret = initfail; + goto bail; /* everything already cleaned */ + } + + sdma_start(dd); + + return 0; + +clean_bail: + hfi1_pcie_cleanup(pdev); +bail: + return ret; +} + +static void wait_for_clients(struct hfi1_devdata *dd) +{ + /* + * Remove the device init value and complete the device if there is + * no clients or wait for active clients to finish. + */ + if (atomic_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + + wait_for_completion(&dd->user_comp); +} + +static void remove_one(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + /* close debugfs files before ib unregister */ + hfi1_dbg_ibdev_exit(&dd->verbs_dev); + + /* remove the /dev hfi1 interface */ + hfi1_device_remove(dd); + + /* wait for existing user space clients to finish */ + wait_for_clients(dd); + + /* unregister from IB core */ + hfi1_unregister_ib_device(dd); + + /* cleanup vnic */ + hfi1_vnic_cleanup(dd); + + /* + * Disable the IB link, disable interrupts on the device, + * clear dma engines, etc. + */ + shutdown_device(dd); + + stop_timers(dd); + + /* wait until all of our (qsfp) queue_work() calls complete */ + flush_workqueue(ib_wq); + + postinit_cleanup(dd); +} + +/** + * hfi1_create_rcvhdrq - create a receive header queue + * @dd: the hfi1_ib device + * @rcd: the context data + * + * This must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). + */ +int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + unsigned amt; + u64 reg; + + if (!rcd->rcvhdrq) { + dma_addr_t dma_hdrqtail; + gfp_t gfp_flags; + + /* + * rcvhdrqentsize is in DWs, so we have to convert to bytes + * (* sizeof(u32)). 
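+ *
+ * With the module defaults declared in this file (rcvhdrcnt = 2048,
+ * hdrq_entsize = 32 DWs) this works out to
+ * PAGE_ALIGN(2048 * 32 * sizeof(u32)) = 256 KiB per context.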
+ */ + amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize * + sizeof(u32)); + + if (rcd->ctxt < dd->first_dyn_alloc_ctxt || rcd->is_vnic) + gfp_flags = GFP_KERNEL; + else + gfp_flags = GFP_USER; + rcd->rcvhdrq = dma_zalloc_coherent( + &dd->pcidev->dev, amt, &rcd->rcvhdrq_dma, + gfp_flags | __GFP_COMP); + + if (!rcd->rcvhdrq) { + dd_dev_err(dd, + "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", + amt, rcd->ctxt); + goto bail; + } + + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &dma_hdrqtail, + gfp_flags); + if (!rcd->rcvhdrtail_kvaddr) + goto bail_free; + rcd->rcvhdrqtailaddr_dma = dma_hdrqtail; + } + + rcd->rcvhdrq_size = amt; + } + /* + * These values are per-context: + * RcvHdrCnt + * RcvHdrEntSize + * RcvHdrSize + */ + reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT) + & RCV_HDR_CNT_CNT_MASK) + << RCV_HDR_CNT_CNT_SHIFT; + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg); + reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize) + & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK) + << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT; + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg); + reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK) + << RCV_HDR_SIZE_HDR_SIZE_SHIFT; + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg); + + /* + * Program dummy tail address for every receive context + * before enabling any receive context + */ + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR, + dd->rcvhdrtail_dummy_dma); + + return 0; + +bail_free: + dd_dev_err(dd, + "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", + rcd->ctxt); + dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, + rcd->rcvhdrq_dma); + rcd->rcvhdrq = NULL; +bail: + return -ENOMEM; +} + +/** + * allocate eager buffers, both kernel and user contexts. + * @rcd: the context we are setting up. + * + * Allocate the eager TID buffers and program them into hip. + * They are no longer completely contiguous, we do multiple allocation + * calls. Otherwise we get the OOM code involved, by asking for too + * much per call, with disastrous results on some kernels. + */ +int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 max_entries, egrtop, alloced_bytes = 0, idx = 0; + gfp_t gfp_flags; + u16 order; + int ret = 0; + u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu); + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. + */ + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; + + /* + * The minimum size of the eager buffers is a groups of MTU-sized + * buffers. + * The global eager_buffer_size parameter is checked against the + * theoretical lower limit of the value. Here, we check against the + * MTU. + */ + if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size)) + rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size; + /* + * If using one-pkt-per-egr-buffer, lower the eager buffer + * size to the max MTU (page-aligned). + */ + if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) + rcd->egrbufs.rcvtid_size = round_mtu; + + /* + * Eager buffers sizes of 1MB or less require smaller TID sizes + * to satisfy the "multiple of 8 RcvArray entries" requirement. 
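+ *
+ * Example with assumed sizes: for egrbufs.size = 1 MiB and a
+ * rounded-up MTU of 8 KiB, rcvtid_size becomes
+ * max(8 KiB, rounddown_pow_of_two(1 MiB / 8)) = 128 KiB, so at
+ * least 8 eager buffers fit in the region.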
+ */ + if (rcd->egrbufs.size <= (1 << 20)) + rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu, + rounddown_pow_of_two(rcd->egrbufs.size / 8)); + + while (alloced_bytes < rcd->egrbufs.size && + rcd->egrbufs.alloced < rcd->egrbufs.count) { + rcd->egrbufs.buffers[idx].addr = + dma_zalloc_coherent(&dd->pcidev->dev, + rcd->egrbufs.rcvtid_size, + &rcd->egrbufs.buffers[idx].dma, + gfp_flags); + if (rcd->egrbufs.buffers[idx].addr) { + rcd->egrbufs.buffers[idx].len = + rcd->egrbufs.rcvtid_size; + rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr = + rcd->egrbufs.buffers[idx].addr; + rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma = + rcd->egrbufs.buffers[idx].dma; + rcd->egrbufs.alloced++; + alloced_bytes += rcd->egrbufs.rcvtid_size; + idx++; + } else { + u32 new_size, i, j; + u64 offset = 0; + + /* + * Fail the eager buffer allocation if: + * - we are already using the lowest acceptable size + * - we are using one-pkt-per-egr-buffer (this implies + * that we are accepting only one size) + */ + if (rcd->egrbufs.rcvtid_size == round_mtu || + !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) { + dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n", + rcd->ctxt); + ret = -ENOMEM; + goto bail_rcvegrbuf_phys; + } + + new_size = rcd->egrbufs.rcvtid_size / 2; + + /* + * If the first attempt to allocate memory failed, don't + * fail everything but continue with the next lower + * size. + */ + if (idx == 0) { + rcd->egrbufs.rcvtid_size = new_size; + continue; + } + + /* + * Re-partition already allocated buffers to a smaller + * size. + */ + rcd->egrbufs.alloced = 0; + for (i = 0, j = 0, offset = 0; j < idx; i++) { + if (i >= rcd->egrbufs.count) + break; + rcd->egrbufs.rcvtids[i].dma = + rcd->egrbufs.buffers[j].dma + offset; + rcd->egrbufs.rcvtids[i].addr = + rcd->egrbufs.buffers[j].addr + offset; + rcd->egrbufs.alloced++; + if ((rcd->egrbufs.buffers[j].dma + offset + + new_size) == + (rcd->egrbufs.buffers[j].dma + + rcd->egrbufs.buffers[j].len)) { + j++; + offset = 0; + } else { + offset += new_size; + } + } + rcd->egrbufs.rcvtid_size = new_size; + } + } + rcd->egrbufs.numbufs = idx; + rcd->egrbufs.size = alloced_bytes; + + hfi1_cdbg(PROC, + "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n", + rcd->ctxt, rcd->egrbufs.alloced, + rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024); + + /* + * Set the contexts rcv array head update threshold to the closest + * power of 2 (so we can use a mask instead of modulo) below half + * the allocated entries. + */ + rcd->egrbufs.threshold = + rounddown_pow_of_two(rcd->egrbufs.alloced / 2); + /* + * Compute the expected RcvArray entry base. This is done after + * allocating the eager buffers in order to maximize the + * expected RcvArray entries for the context. 
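+ *
+ * Worked example with made-up numbers: rcv_array_groups = 8 and
+ * group_size = 8 give max_entries = 64; if 20 eager buffers were
+ * allocated, egrtop = roundup(20, 8) = 24, so expected_count is
+ * 64 - 24 = 40 and expected_base = eager_base + 24.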
+ */ + max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size; + egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size); + rcd->expected_count = max_entries - egrtop; + if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2) + rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2; + + rcd->expected_base = rcd->eager_base + egrtop; + hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n", + rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count, + rcd->eager_base, rcd->expected_base); + + if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) { + hfi1_cdbg(PROC, + "ctxt%u: current Eager buffer size is invalid %u\n", + rcd->ctxt, rcd->egrbufs.rcvtid_size); + ret = -EINVAL; + goto bail_rcvegrbuf_phys; + } + + for (idx = 0; idx < rcd->egrbufs.alloced; idx++) { + hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER, + rcd->egrbufs.rcvtids[idx].dma, order); + cond_resched(); + } + + return 0; + +bail_rcvegrbuf_phys: + for (idx = 0; idx < rcd->egrbufs.alloced && + rcd->egrbufs.buffers[idx].addr; + idx++) { + dma_free_coherent(&dd->pcidev->dev, + rcd->egrbufs.buffers[idx].len, + rcd->egrbufs.buffers[idx].addr, + rcd->egrbufs.buffers[idx].dma); + rcd->egrbufs.buffers[idx].addr = NULL; + rcd->egrbufs.buffers[idx].dma = 0; + rcd->egrbufs.buffers[idx].len = 0; + } + + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c new file mode 100644 index 0000000..387305b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/intr.c @@ -0,0 +1,263 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +#include "hfi.h" +#include "common.h" +#include "sdma.h" + +#define LINK_UP_DELAY 500 /* in microseconds */ + +static void set_mgmt_allowed(struct hfi1_pportdata *ppd) +{ + u32 frame; + struct hfi1_devdata *dd = ppd->dd; + + if (ppd->neighbor_type == NEIGHBOR_TYPE_HFI) { + ppd->mgmt_allowed = 1; + } else { + read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame); + ppd->mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) + & MGMT_ALLOWED_MASK; + } +} + +/* + * Our neighbor has indicated that we are allowed to act as a fabric + * manager, so place the full management partition key in the second + * (0-based) pkey array position. Note that we should already have + * the limited management partition key in array element 1, and also + * that the port is not yet up when add_full_mgmt_pkey() is invoked. + */ +static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* Sanity check - ppd->pkeys[2] should be 0, or already initialized */ + if (!((ppd->pkeys[2] == 0) || (ppd->pkeys[2] == FULL_MGMT_P_KEY))) + dd_dev_warn(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n", + __func__, ppd->pkeys[2], FULL_MGMT_P_KEY); + ppd->pkeys[2] = FULL_MGMT_P_KEY; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); + hfi1_event_pkey_change(ppd->dd, ppd->port); +} + +/** + * format_hwmsg - format a single hwerror message + * @msg message buffer + * @msgl length of message buffer + * @hwmsg message to add to message buffer + */ +static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * hfi1_format_hwerrors - format hardware error messages for display + * @hwerrs hardware errors bit vector + * @hwerrmsgs hardware error descriptions + * @nhwerrmsgs number of hwerrmsgs + * @msg message buffer + * @msgl message buffer length + */ +void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t msgl) +{ + int i; + + for (i = 0; i < nhwerrmsgs; i++) + if (hwerrs & hwerrmsgs[i].mask) + format_hwmsg(msg, msgl, hwerrmsgs[i].msg); +} + +static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev) +{ + struct ib_event event; + struct hfi1_devdata *dd = ppd->dd; + + /* + * Only call ib_dispatch_event() if the IB device has been + * registered. HFI1_INITED is set iff the driver has successfully + * registered with the IB core. + */ + if (!(dd->flags & HFI1_INITTED)) + return; + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = ppd->port; + event.event = ev; + ib_dispatch_event(&event); +} + +/** + * handle_linkup_change - finish linkup/down state changes + * @dd: valid device + * @linkup: link state information + * + * Handle a linkup or link down notification. + * The HW needs time to finish its link up state change. Give it that chance. + * + * This is called outside an interrupt. 
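+ * On link up it reads the neighbor information from the 8051, waits
+ * LINK_UP_DELAY for the hardware to settle, and then applies the
+ * MgmtAllowed/pkey setup; on link down it freezes the device and
+ * signals IB_EVENT_PORT_ERR.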
+ * + */ +void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup) +{ + struct hfi1_pportdata *ppd = &dd->pport[0]; + enum ib_event_type ev; + + if (!(ppd->linkup ^ !!linkup)) + return; /* no change, nothing to do */ + + if (linkup) { + /* + * Quick linkup and all link up on the simulator does not + * trigger or implement: + * - VerifyCap interrupt + * - VerifyCap frames + * But rather moves directly to LinkUp. + * + * Do the work of the VerifyCap interrupt handler, + * handle_verify_cap(), but do not try moving the state to + * LinkUp as we are already there. + * + * NOTE: This uses this device's vAU, vCU, and vl15_init for + * the remote values. Both sides must be using the values. + */ + if (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + set_up_vau(dd, dd->vau); + set_up_vl15(dd, dd->vl15_init); + assign_remote_cm_au_table(dd, dd->vcu); + } + + ppd->neighbor_guid = + read_csr(dd, DC_DC8051_STS_REMOTE_GUID); + ppd->neighbor_type = + read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) & + DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK; + ppd->neighbor_port_number = + read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) & + DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK; + ppd->neighbor_fm_security = + read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) & + DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK; + dd_dev_info(dd, + "Neighbor Guid %llx, Type %d, Port Num %d\n", + ppd->neighbor_guid, ppd->neighbor_type, + ppd->neighbor_port_number); + + /* HW needs LINK_UP_DELAY to settle, give it that chance */ + udelay(LINK_UP_DELAY); + + /* + * 'MgmtAllowed' information, which is exchanged during + * LNI, is available at this point. + */ + set_mgmt_allowed(ppd); + + if (ppd->mgmt_allowed) + add_full_mgmt_pkey(ppd); + + /* physical link went up */ + ppd->linkup = 1; + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); + + /* link widths are not available until the link is fully up */ + get_linkup_link_widths(ppd); + + } else { + /* physical link went down */ + ppd->linkup = 0; + + /* clear HW details of the previous connection */ + ppd->actual_vls_operational = 0; + reset_link_credits(dd); + + /* freeze after a link down to guarantee a clean egress */ + start_freeze_handling(ppd, FREEZE_SELF | FREEZE_LINK_DOWN); + + ev = IB_EVENT_PORT_ERR; + + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT); + + /* if we are down, the neighbor is down */ + ppd->neighbor_normal = 0; + + /* notify IB of the link change */ + signal_ib_event(ppd, ev); + } +} + +/* + * Handle receive or urgent interrupts for user contexts. This means a user + * process was waiting for a packet to arrive, and didn't want to poll. + */ +void handle_user_interrupt(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + unsigned long flags; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (bitmap_empty(rcd->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) + goto done; + + if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) { + wake_up_interruptible(&rcd->wait); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd); + } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG, + &rcd->event_flags)) { + rcd->urgent++; + wake_up_interruptible(&rcd->wait); + } +done: + spin_unlock_irqrestore(&dd->uctxt_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h new file mode 100644 index 0000000..3d9c32c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -0,0 +1,383 @@ +#ifndef _HFI1_IOWAIT_H +#define _HFI1_IOWAIT_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. 
+ * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +#include "sdma_txreq.h" + +/* + * typedef (*restart_t)() - restart callback + * @work: pointer to work structure + */ +typedef void (*restart_t)(struct work_struct *work); + +struct sdma_txreq; +struct sdma_engine; +/** + * struct iowait - linkage for delayed progress/waiting + * @list: used to add/insert into QP/PQ wait lists + * @lock: uses to record the list head lock + * @tx_head: overflow list of sdma_txreq's + * @sleep: no space callback + * @wakeup: space callback wakeup + * @sdma_drained: sdma count drained + * @iowork: workqueue overhead + * @wait_dma: wait for sdma_busy == 0 + * @wait_pio: wait for pio_busy == 0 + * @sdma_busy: # of packets in flight + * @count: total number of descriptors in tx_head'ed list + * @tx_limit: limit for overflow queuing + * @tx_count: number of tx entry's in tx_head'ed list + * + * This is to be embedded in user's state structure + * (QP or PQ). + * + * The sleep and wakeup members are a + * bit misnamed. They do not strictly + * speaking sleep or wake up, but they + * are callbacks for the ULP to implement + * what ever queuing/dequeuing of + * the embedded iowait and its containing struct + * when a resource shortage like SDMA ring space is seen. + * + * Both potentially have locks help + * so sleeping is not allowed. 
+ * + * The wait_dma member along with the iow + * + * The lock field is used by waiters to record + * the seqlock_t that guards the list head. + * Waiters explicity know that, but the destroy + * code that unwaits QPs does not. + */ + +struct iowait { + struct list_head list; + struct list_head tx_head; + int (*sleep)( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent + ); + void (*wakeup)(struct iowait *wait, int reason); + void (*sdma_drained)(struct iowait *wait); + seqlock_t *lock; + struct work_struct iowork; + wait_queue_head_t wait_dma; + wait_queue_head_t wait_pio; + atomic_t sdma_busy; + atomic_t pio_busy; + u32 count; + u32 tx_limit; + u32 tx_count; + u8 starved_cnt; +}; + +#define SDMA_AVAIL_REASON 0 + +/** + * iowait_init() - initialize wait structure + * @wait: wait struct to initialize + * @tx_limit: limit for overflow queuing + * @func: restart function for workqueue + * @sleep: sleep function for no space + * @resume: wakeup function for no space + * + * This function initializes the iowait + * structure embedded in the QP or PQ. + * + */ + +static inline void iowait_init( + struct iowait *wait, + u32 tx_limit, + void (*func)(struct work_struct *work), + int (*sleep)( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)) +{ + wait->count = 0; + wait->lock = NULL; + INIT_LIST_HEAD(&wait->list); + INIT_LIST_HEAD(&wait->tx_head); + INIT_WORK(&wait->iowork, func); + init_waitqueue_head(&wait->wait_dma); + init_waitqueue_head(&wait->wait_pio); + atomic_set(&wait->sdma_busy, 0); + atomic_set(&wait->pio_busy, 0); + wait->tx_limit = tx_limit; + wait->sleep = sleep; + wait->wakeup = wakeup; + wait->sdma_drained = sdma_drained; +} + +/** + * iowait_schedule() - initialize wait structure + * @wait: wait struct to schedule + * @wq: workqueue for schedule + * @cpu: cpu + */ +static inline void iowait_schedule( + struct iowait *wait, + struct workqueue_struct *wq, + int cpu) +{ + queue_work_on(cpu, wq, &wait->iowork); +} + +/** + * iowait_sdma_drain() - wait for DMAs to drain + * + * @wait: iowait structure + * + * This will delay until the iowait sdmas have + * completed. + */ +static inline void iowait_sdma_drain(struct iowait *wait) +{ + wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy)); +} + +/** + * iowait_sdma_pending() - return sdma pending count + * + * @wait: iowait structure + * + */ +static inline int iowait_sdma_pending(struct iowait *wait) +{ + return atomic_read(&wait->sdma_busy); +} + +/** + * iowait_sdma_inc - note sdma io pending + * @wait: iowait structure + */ +static inline void iowait_sdma_inc(struct iowait *wait) +{ + atomic_inc(&wait->sdma_busy); +} + +/** + * iowait_sdma_add - add count to pending + * @wait: iowait structure + */ +static inline void iowait_sdma_add(struct iowait *wait, int count) +{ + atomic_add(count, &wait->sdma_busy); +} + +/** + * iowait_sdma_dec - note sdma complete + * @wait: iowait structure + */ +static inline int iowait_sdma_dec(struct iowait *wait) +{ + return atomic_dec_and_test(&wait->sdma_busy); +} + +/** + * iowait_pio_drain() - wait for pios to drain + * + * @wait: iowait structure + * + * This will delay until the iowait pios have + * completed. 
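+ * The wait is bounded: it gives up after one second (HZ jiffies).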
+ */
+static inline void iowait_pio_drain(struct iowait *wait)
+{
+	wait_event_timeout(wait->wait_pio,
+			   !atomic_read(&wait->pio_busy),
+			   HZ);
+}
+
+/**
+ * iowait_pio_pending() - return pio pending count
+ *
+ * @wait: iowait structure
+ *
+ */
+static inline int iowait_pio_pending(struct iowait *wait)
+{
+	return atomic_read(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_inc - note pio pending
+ * @wait: iowait structure
+ */
+static inline void iowait_pio_inc(struct iowait *wait)
+{
+	atomic_inc(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_dec - note pio complete
+ * @wait: iowait structure
+ */
+static inline int iowait_pio_dec(struct iowait *wait)
+{
+	return atomic_dec_and_test(&wait->pio_busy);
+}
+
+/**
+ * iowait_drain_wakeup() - trigger iowait_drain() waiter
+ *
+ * @wait: iowait structure
+ *
+ * This will trigger any waiters.
+ */
+static inline void iowait_drain_wakeup(struct iowait *wait)
+{
+	wake_up(&wait->wait_dma);
+	wake_up(&wait->wait_pio);
+	if (wait->sdma_drained)
+		wait->sdma_drained(wait);
+}
+
+/**
+ * iowait_get_txhead() - get packet off of iowait list
+ *
+ * @wait: wait structure
+ */
+static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
+{
+	struct sdma_txreq *tx = NULL;
+
+	if (!list_empty(&wait->tx_head)) {
+		tx = list_first_entry(
+			&wait->tx_head,
+			struct sdma_txreq,
+			list);
+		list_del_init(&tx->list);
+	}
+	return tx;
+}
+
+/**
+ * iowait_queue - Put the iowait on a wait queue
+ * @pkts_sent: have some packets been sent before queuing?
+ * @w: the iowait struct
+ * @wait_head: the wait queue
+ *
+ * This function is called to insert an iowait struct into a
+ * wait queue after a resource (eg, sdma descriptor or pio
+ * buffer) runs out.
+ */
+static inline void iowait_queue(bool pkts_sent, struct iowait *w,
+				struct list_head *wait_head)
+{
+	/*
+	 * To play fair, insert the iowait at the tail of the wait queue if it
+	 * has already sent some packets; otherwise, put it at the head.
+	 */
+	if (pkts_sent) {
+		list_add_tail(&w->list, wait_head);
+		w->starved_cnt = 0;
+	} else {
+		list_add(&w->list, wait_head);
+		w->starved_cnt++;
+	}
+}
+
+/**
+ * iowait_starve_clear - clear the wait queue's starve count
+ * @pkts_sent: have some packets been sent?
+ * @w: the iowait struct
+ *
+ * This function is called to clear the starve count. If no
+ * packets have been sent, the starve count will not be cleared.
+ */
+static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w)
+{
+	if (pkts_sent)
+		w->starved_cnt = 0;
+}
+
+/**
+ * iowait_starve_find_max - Find the maximum of the starve count
+ * @w: the iowait struct
+ * @max: a variable containing the max starve count
+ * @idx: the index of the current iowait in an array
+ * @max_idx: a variable containing the array index for the
+ *           iowait entry that has the max starve count
+ *
+ * This function is called to compare the starve count of a
+ * given iowait with the given max starve count. The max starve
+ * count and the index will be updated if the iowait's starve
+ * count is larger.
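+ * Callers are expected to scan a set of waiters with this helper and
+ * then service the entry left in *max_idx, i.e. the most starved one.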
+ */ +static inline void iowait_starve_find_max(struct iowait *w, u8 *max, + uint idx, uint *max_idx) +{ + if (w->starved_cnt > *max) { + *max = w->starved_cnt; + *max_idx = idx; + } +} + +/** + * iowait_packet_queued() - determine if a packet is already built + * @wait: the wait structure + */ +static inline bool iowait_packet_queued(struct iowait *wait) +{ + return !list_empty(&wait->tx_head); +} + +#endif diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c new file mode 100644 index 0000000..e9962c6 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -0,0 +1,4949 @@ +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \ + / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16))) + +#include "hfi.h" +#include "mad.h" +#include "trace.h" +#include "qp.h" +#include "vnic.h" + +/* the reset value from the FM is supposed to be 0xffff, handle both */ +#define OPA_LINK_WIDTH_RESET_OLD 0x0fff +#define OPA_LINK_WIDTH_RESET 0xffff + +struct trap_node { + struct list_head list; + struct opa_mad_notice_attr data; + __be64 tid; + int len; + u32 retry; + u8 in_use; + u8 repress; +}; + +static int smp_length_check(u32 data_size, u32 request_len) +{ + if (unlikely(request_len < data_size)) + return -EINVAL; + + return 0; +} + +static int reply(struct ib_mad_hdr *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static inline void clear_opa_smp_data(struct opa_smp *smp) +{ + void *data = opa_get_smp_data(smp); + size_t size = opa_get_smp_data_size(smp); + + memset(data, 0, size); +} + +static u16 hfi1_lookup_pkey_value(struct hfi1_ibport *ibp, int pkey_idx) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (pkey_idx < ARRAY_SIZE(ppd->pkeys)) + return ppd->pkeys[pkey_idx]; + + return 0; +} + +void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port) +{ + struct ib_event event; + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); +} + +/* + * If the port is down, clean up all pending traps. We need to be careful + * with the given trap, because it may be queued. + */ +static void cleanup_traps(struct hfi1_ibport *ibp, struct trap_node *trap) +{ + struct trap_node *node, *q; + unsigned long flags; + struct list_head trap_list; + int i; + + for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) { + spin_lock_irqsave(&ibp->rvp.lock, flags); + list_replace_init(&ibp->rvp.trap_lists[i].list, &trap_list); + ibp->rvp.trap_lists[i].list_len = 0; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + /* + * Remove all items from the list, freeing all the non-given + * traps. + */ + list_for_each_entry_safe(node, q, &trap_list, list) { + list_del(&node->list); + if (node != trap) + kfree(node); + } + } + + /* + * If this wasn't on one of the lists it would not be freed. If it + * was on the list, it is now safe to free. + */ + kfree(trap); +} + +static struct trap_node *check_and_add_trap(struct hfi1_ibport *ibp, + struct trap_node *trap) +{ + struct trap_node *node; + struct trap_list *trap_list; + unsigned long flags; + unsigned long timeout; + int found = 0; + unsigned int queue_id; + static int trap_count; + + queue_id = trap->data.generic_type & 0x0F; + if (queue_id >= RVT_MAX_TRAP_LISTS) { + trap_count++; + pr_err_ratelimited("hfi1: Invalid trap 0x%0x dropped. Total dropped: %d\n", + trap->data.generic_type, trap_count); + kfree(trap); + return NULL; + } + + /* + * Since the retry (handle timeout) does not remove a trap request + * from the list, all we have to do is compare the node. + */ + spin_lock_irqsave(&ibp->rvp.lock, flags); + trap_list = &ibp->rvp.trap_lists[queue_id]; + + list_for_each_entry(node, &trap_list->list, list) { + if (node == trap) { + node->retry++; + found = 1; + break; + } + } + + /* If it is not on the list, add it, limited to RVT-MAX_TRAP_LEN. 
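+ * Traps beyond that limit are dropped with a rate-limited warning
+ * rather than queued.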
*/ + if (!found) { + if (trap_list->list_len < RVT_MAX_TRAP_LEN) { + trap_list->list_len++; + list_add_tail(&trap->list, &trap_list->list); + } else { + pr_warn_ratelimited("hfi1: Maximum trap limit reached for 0x%0x traps\n", + trap->data.generic_type); + kfree(trap); + } + } + + /* + * Next check to see if there is a timer pending. If not, set it up + * and get the first trap from the list. + */ + node = NULL; + if (!timer_pending(&ibp->rvp.trap_timer)) { + /* + * o14-2 + * If the time out is set we have to wait until it expires + * before the trap can be sent. + * This should be > RVT_TRAP_TIMEOUT + */ + timeout = (RVT_TRAP_TIMEOUT * + (1UL << ibp->rvp.subnet_timeout)) / 1000; + mod_timer(&ibp->rvp.trap_timer, + jiffies + usecs_to_jiffies(timeout)); + node = list_first_entry(&trap_list->list, struct trap_node, + list); + node->in_use = 1; + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + return node; +} + +static void subn_handle_opa_trap_repress(struct hfi1_ibport *ibp, + struct opa_smp *smp) +{ + struct trap_list *trap_list; + struct trap_node *trap; + unsigned long flags; + int i; + + if (smp->attr_id != IB_SMP_ATTR_NOTICE) + return; + + spin_lock_irqsave(&ibp->rvp.lock, flags); + for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) { + trap_list = &ibp->rvp.trap_lists[i]; + trap = list_first_entry_or_null(&trap_list->list, + struct trap_node, list); + if (trap && trap->tid == smp->tid) { + if (trap->in_use) { + trap->repress = 1; + } else { + trap_list->list_len--; + list_del(&trap->list); + kfree(trap); + } + break; + } + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); +} + +static void hfi1_update_sm_ah_attr(struct hfi1_ibport *ibp, + struct rdma_ah_attr *attr, u32 dlid) +{ + rdma_ah_set_dlid(attr, dlid); + rdma_ah_set_port_num(attr, ppd_from_ibp(ibp)->port); + if (dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + grh->sgid_index = 0; + grh->hop_limit = 1; + grh->dgid.global.subnet_prefix = + ibp->rvp.gid_prefix; + grh->dgid.global.interface_id = OPA_MAKE_ID(dlid); + } +} + +static int hfi1_modify_qp0_ah(struct hfi1_ibport *ibp, + struct rvt_ah *ah, u32 dlid) +{ + struct rdma_ah_attr attr; + struct rvt_qp *qp0; + int ret = -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.type = ah->ibah.type; + hfi1_update_sm_ah_attr(ibp, &attr, dlid); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ret = rdma_modify_ah(&ah->ibah, &attr); + rcu_read_unlock(); + return ret; +} + +static struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u32 dlid) +{ + struct rdma_ah_attr attr; + struct ib_ah *ah = ERR_PTR(-EINVAL); + struct rvt_qp *qp0; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = dd_from_ppd(ppd); + u8 port_num = ppd->port; + + memset(&attr, 0, sizeof(attr)); + attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num); + hfi1_update_sm_ah_attr(ibp, &attr, dlid); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ah = rdma_create_ah(qp0->ibqp.pd, &attr); + rcu_read_unlock(); + return ah; +} + +static void send_trap(struct hfi1_ibport *ibp, struct trap_node *trap) +{ + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent; + struct opa_smp *smp; + unsigned long flags; + int pkey_idx; + u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp; + + agent = ibp->rvp.send_agent; + if (!agent) { + cleanup_traps(ibp, trap); + return; + } + + /* o14-3.2.1 */ + if (driver_lstate(ppd_from_ibp(ibp)) != IB_PORT_ACTIVE) { + 
cleanup_traps(ibp, trap); + return; + } + + /* Add the trap to the list if necessary and see if we can send it */ + trap = check_and_add_trap(ibp, trap); + if (!trap) + return; + + pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); + if (pkey_idx < 0) { + pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n", + __func__, hfi1_get_pkey(ibp, 1)); + pkey_idx = 1; + } + + send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0, + IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, IB_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) + return; + + smp = send_buf->mad; + smp->base_version = OPA_MGMT_BASE_VERSION; + smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + smp->class_version = OPA_SM_CLASS_VERSION; + smp->method = IB_MGMT_METHOD_TRAP; + + /* Only update the transaction ID for new traps (o13-5). */ + if (trap->tid == 0) { + ibp->rvp.tid++; + /* make sure that tid != 0 */ + if (ibp->rvp.tid == 0) + ibp->rvp.tid++; + trap->tid = cpu_to_be64(ibp->rvp.tid); + } + smp->tid = trap->tid; + + smp->attr_id = IB_SMP_ATTR_NOTICE; + /* o14-1: smp->mkey = 0; */ + + memcpy(smp->route.lid.data, &trap->data, trap->len); + + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (!ibp->rvp.sm_ah) { + if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { + struct ib_ah *ah; + + ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid); + if (IS_ERR(ah)) { + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + return; + } + send_buf->ah = ah; + ibp->rvp.sm_ah = ibah_to_rvtah(ah); + } else { + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + return; + } + } else { + send_buf->ah = &ibp->rvp.sm_ah->ibah; + } + + /* + * If the trap was repressed while things were getting set up, don't + * bother sending it. This could happen for a retry. + */ + if (trap->repress) { + list_del(&trap->list); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + kfree(trap); + ib_free_send_mad(send_buf); + return; + } + + trap->in_use = 0; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + if (ib_post_send_mad(send_buf, NULL)) + ib_free_send_mad(send_buf); +} + +void hfi1_handle_trap_timer(struct timer_list *t) +{ + struct hfi1_ibport *ibp = from_timer(ibp, t, rvp.trap_timer); + struct trap_node *trap = NULL; + unsigned long flags; + int i; + + /* Find the trap with the highest priority */ + spin_lock_irqsave(&ibp->rvp.lock, flags); + for (i = 0; !trap && i < RVT_MAX_TRAP_LISTS; i++) { + trap = list_first_entry_or_null(&ibp->rvp.trap_lists[i].list, + struct trap_node, list); + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + if (trap) + send_trap(ibp, trap); +} + +static struct trap_node *create_trap_node(u8 type, __be16 trap_num, u32 lid) +{ + struct trap_node *trap; + + trap = kzalloc(sizeof(*trap), GFP_ATOMIC); + if (!trap) + return NULL; + + INIT_LIST_HEAD(&trap->list); + trap->data.generic_type = type; + trap->data.prod_type_lsb = IB_NOTICE_PROD_CA; + trap->data.trap_num = trap_num; + trap->data.issuer_lid = cpu_to_be32(lid); + + return trap; +} + +/* + * Send a bad P_Key trap (ch. 14.3.8). 
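+ * This also bumps the port's pkey_violations and n_pkt_drops counters.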
+ */ +void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, + u32 qp1, u32 qp2, u32 lid1, u32 lid2) +{ + struct trap_node *trap; + u32 lid = ppd_from_ibp(ibp)->lid; + + ibp->rvp.n_pkt_drops++; + ibp->rvp.pkey_violations++; + + trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_P_KEY, + lid); + if (!trap) + return; + + /* Send violation trap */ + trap->data.ntc_257_258.lid1 = cpu_to_be32(lid1); + trap->data.ntc_257_258.lid2 = cpu_to_be32(lid2); + trap->data.ntc_257_258.key = cpu_to_be32(key); + trap->data.ntc_257_258.sl = sl << 3; + trap->data.ntc_257_258.qp1 = cpu_to_be32(qp1); + trap->data.ntc_257_258.qp2 = cpu_to_be32(qp2); + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); +} + +/* + * Send a bad M_Key trap (ch. 14.3.9). + */ +static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, + __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt) +{ + struct trap_node *trap; + u32 lid = ppd_from_ibp(ibp)->lid; + + trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_M_KEY, + lid); + if (!trap) + return; + + /* Send violation trap */ + trap->data.ntc_256.lid = trap->data.issuer_lid; + trap->data.ntc_256.method = mad->method; + trap->data.ntc_256.attr_id = mad->attr_id; + trap->data.ntc_256.attr_mod = mad->attr_mod; + trap->data.ntc_256.mkey = mkey; + if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + trap->data.ntc_256.dr_slid = dr_slid; + trap->data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; + if (hop_cnt > ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path)) { + trap->data.ntc_256.dr_trunc_hop |= + IB_NOTICE_TRAP_DR_TRUNC; + hop_cnt = ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path); + } + trap->data.ntc_256.dr_trunc_hop |= hop_cnt; + memcpy(trap->data.ntc_256.dr_rtn_path, return_path, + hop_cnt); + } + + trap->len = sizeof(trap->data); + + send_trap(ibp, trap); +} + +/* + * Send a Port Capability Mask Changed trap (ch. 14.3.11). + */ +void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) +{ + struct trap_node *trap; + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data; + u32 lid = ppd_from_ibp(ibp)->lid; + + trap = create_trap_node(IB_NOTICE_TYPE_INFO, + OPA_TRAP_CHANGE_CAPABILITY, + lid); + if (!trap) + return; + + trap->data.ntc_144.lid = trap->data.issuer_lid; + trap->data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + trap->data.ntc_144.cap_mask3 = cpu_to_be16(ibp->rvp.port_cap3_flags); + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); +} + +/* + * Send a System Image GUID Changed trap (ch. 14.3.12). + */ +void hfi1_sys_guid_chg(struct hfi1_ibport *ibp) +{ + struct trap_node *trap; + u32 lid = ppd_from_ibp(ibp)->lid; + + trap = create_trap_node(IB_NOTICE_TYPE_INFO, OPA_TRAP_CHANGE_SYSGUID, + lid); + if (!trap) + return; + + trap->data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid; + trap->data.ntc_145.lid = trap->data.issuer_lid; + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); +} + +/* + * Send a Node Description Changed trap (ch. 14.3.13). 
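+ * This is reported as a capability change notice (ntc_144) with the
+ * NODE_DESC_CHG flag set.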
+ */ +void hfi1_node_desc_chg(struct hfi1_ibport *ibp) +{ + struct trap_node *trap; + u32 lid = ppd_from_ibp(ibp)->lid; + + trap = create_trap_node(IB_NOTICE_TYPE_INFO, + OPA_TRAP_CHANGE_CAPABILITY, + lid); + if (!trap) + return; + + trap->data.ntc_144.lid = trap->data.issuer_lid; + trap->data.ntc_144.change_flags = + cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG); + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); +} + +static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u8 port, u32 *resp_len, u32 max_len) +{ + struct opa_node_description *nd; + + if (am || smp_length_check(sizeof(*nd), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + nd = (struct opa_node_description *)data; + + memcpy(nd->data, ibdev->node_desc, sizeof(nd->data)); + + if (resp_len) + *resp_len += sizeof(*nd); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct opa_node_info *ni; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */ + + ni = (struct opa_node_info *)data; + + /* GUID 0 is illegal */ + if (am || pidx >= dd->num_pports || ibdev->node_guid == 0 || + smp_length_check(sizeof(*ni), max_len) || + get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX) == 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ni->port_guid = get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX); + ni->base_version = OPA_MGMT_BASE_VERSION; + ni->class_version = OPA_SM_CLASS_VERSION; + ni->node_type = 1; /* channel adapter */ + ni->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + ni->system_image_guid = ib_hfi1_sys_image_guid; + ni->node_guid = ibdev->node_guid; + ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd)); + ni->device_id = cpu_to_be16(dd->pcidev->device); + ni->revision = cpu_to_be32(dd->minrev); + ni->local_port_num = port; + ni->vendor_id[0] = dd->oui1; + ni->vendor_id[1] = dd->oui2; + ni->vendor_id[2] = dd->oui3; + + if (resp_len) + *resp_len += sizeof(*ni); + + return reply((struct ib_mad_hdr *)smp); +} + +static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct ib_node_info *nip = (struct ib_node_info *)&smp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */ + + /* GUID 0 is illegal */ + if (smp->attr_mod || pidx >= dd->num_pports || + ibdev->node_guid == 0 || + get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX) == 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + nip->port_guid = get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX); + nip->base_version = OPA_MGMT_BASE_VERSION; + nip->class_version = OPA_SM_CLASS_VERSION; + nip->node_type = 1; /* channel adapter */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = ib_hfi1_sys_image_guid; + nip->node_guid = ibdev->node_guid; + nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd)); + nip->device_id = cpu_to_be16(dd->pcidev->device); + nip->revision = cpu_to_be32(dd->minrev); + nip->local_port_num = port; + nip->vendor_id[0] = dd->oui1; + nip->vendor_id[1] = dd->oui2; + nip->vendor_id[2] = dd->oui3; + + return reply((struct ib_mad_hdr *)smp); +} + +static void 
set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w); +} + +static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w); +} + +static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s); +} + +static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, + int mad_flags, __be64 mkey, __be32 dr_slid, + u8 return_path[], u8 hop_cnt) +{ + int valid_mkey = 0; + int ret = 0; + + /* Is the mkey in the process of expiring? */ + if (ibp->rvp.mkey_lease_timeout && + time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) { + /* Clear timeout and mkey protection field. */ + ibp->rvp.mkey_lease_timeout = 0; + ibp->rvp.mkeyprot = 0; + } + + if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->rvp.mkey == 0 || + ibp->rvp.mkey == mkey) + valid_mkey = 1; + + /* Unset lease timeout on any valid Get/Set/TrapRepress */ + if (valid_mkey && ibp->rvp.mkey_lease_timeout && + (mad->method == IB_MGMT_METHOD_GET || + mad->method == IB_MGMT_METHOD_SET || + mad->method == IB_MGMT_METHOD_TRAP_REPRESS)) + ibp->rvp.mkey_lease_timeout = 0; + + if (!valid_mkey) { + switch (mad->method) { + case IB_MGMT_METHOD_GET: + /* Bad mkey not a violation below level 2 */ + if (ibp->rvp.mkeyprot < 2) + break; + /* fall through */ + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (ibp->rvp.mkey_violations != 0xFFFF) + ++ibp->rvp.mkey_violations; + if (!ibp->rvp.mkey_lease_timeout && + ibp->rvp.mkey_lease_period) + ibp->rvp.mkey_lease_timeout = jiffies + + ibp->rvp.mkey_lease_period * HZ; + /* Generate a trap notice. */ + bad_mkey(ibp, mad, mkey, dr_slid, return_path, + hop_cnt); + ret = 1; + } + } + + return ret; +} + +/* + * The SMA caches reads from LCB registers in case the LCB is unavailable. + * (The LCB is unavailable in certain link states, for example.) 
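+ * read_ltp_rtt() refreshes the cache while the LCB is readable; the
+ * Get(PortInfo) handler then reports the cached round trip count.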
+ */ +struct lcb_datum { + u32 off; + u64 val; +}; + +static struct lcb_datum lcb_cache[] = { + { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 }, +}; + +static int write_lcb_cache(u32 off, u64 val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + if (lcb_cache[i].off == off) { + lcb_cache[i].val = val; + return 0; + } + } + + pr_warn("%s bad offset 0x%x\n", __func__, off); + return -1; +} + +static int read_lcb_cache(u32 off, u64 *val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + if (lcb_cache[i].off == off) { + *val = lcb_cache[i].val; + return 0; + } + } + + pr_warn("%s bad offset 0x%x\n", __func__, off); + return -1; +} + +void read_ltp_rtt(struct hfi1_devdata *dd) +{ + u64 reg; + + if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, ®)) + dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__); + else + write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg); +} + +static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + int i; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct opa_port_info *pi = (struct opa_port_info *)data; + u8 mtu; + u8 credit_rate; + u8 is_beaconing_active; + u32 state; + u32 num_ports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 buffer_units; + u64 tmp = 0; + + if (num_ports != 1 || smp_length_check(sizeof(*pi), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + ibp = &ppd->ibport_data; + + if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || + ppd->vls_supported > ARRAY_SIZE(dd->vld)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + pi->lid = cpu_to_be32(ppd->lid); + + /* Only return the mkey if the protection field allows it. 
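+ * That is, hide it only for a Get when the requester's mkey does not
+ * match and mkeyprot is 1.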
*/ + if (!(smp->method == IB_MGMT_METHOD_GET && + ibp->rvp.mkey != smp->mkey && + ibp->rvp.mkeyprot == 1)) + pi->mkey = ibp->rvp.mkey; + + pi->subnet_prefix = ibp->rvp.gid_prefix; + pi->sm_lid = cpu_to_be32(ibp->rvp.sm_lid); + pi->ib_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + pi->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period); + pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp); + pi->sa_qp = cpu_to_be32(ppd->sa_qp); + + pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled); + pi->link_width.supported = cpu_to_be16(ppd->link_width_supported); + pi->link_width.active = cpu_to_be16(ppd->link_width_active); + + pi->link_width_downgrade.supported = + cpu_to_be16(ppd->link_width_downgrade_supported); + pi->link_width_downgrade.enabled = + cpu_to_be16(ppd->link_width_downgrade_enabled); + pi->link_width_downgrade.tx_active = + cpu_to_be16(ppd->link_width_downgrade_tx_active); + pi->link_width_downgrade.rx_active = + cpu_to_be16(ppd->link_width_downgrade_rx_active); + + pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported); + pi->link_speed.active = cpu_to_be16(ppd->link_speed_active); + pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled); + + state = driver_lstate(ppd); + + if (start_of_sm_config && (state == IB_PORT_INIT)) + ppd->is_sm_config_started = 1; + + pi->port_phys_conf = (ppd->port_type & 0xf); + + pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; + pi->port_states.ledenable_offlinereason |= + ppd->is_sm_config_started << 5; + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active); + pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6; + pi->port_states.ledenable_offlinereason |= + ppd->offline_disabled_reason; + + pi->port_states.portphysstate_portstate = + (driver_pstate(ppd) << 4) | state; + + pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc; + + memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu)); + for (i = 0; i < ppd->vls_supported; i++) { + mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU); + if ((i % 2) == 0) + pi->neigh_mtu.pvlx_to_mtu[i / 2] |= (mtu << 4); + else + pi->neigh_mtu.pvlx_to_mtu[i / 2] |= mtu; + } + /* don't forget VL 15 */ + mtu = mtu_to_enum(dd->vld[15].mtu, 2048); + pi->neigh_mtu.pvlx_to_mtu[15 / 2] |= mtu; + pi->smsl = ibp->rvp.sm_sl & OPA_PI_MASK_SMSL; + pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS); + pi->partenforce_filterraw |= + (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON); + if (ppd->part_enforce & HFI1_PART_ENFORCE_IN) + pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN; + if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT) + pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT; + pi->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations); + /* P_KeyViolations are counted by hardware. 
*/ + pi->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations); + pi->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations); + + pi->vl.cap = ppd->vls_supported; + pi->vl.high_limit = cpu_to_be16(ibp->rvp.vl_high_limit); + pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP); + pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP); + + pi->clientrereg_subnettimeout = ibp->rvp.subnet_timeout; + + pi->port_link_mode = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 | + OPA_PORT_LINK_MODE_OPA << 5 | + OPA_PORT_LINK_MODE_OPA); + + pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode); + + pi->port_mode = cpu_to_be16( + ppd->is_active_optimize_enabled ? + OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0); + + pi->port_packet_format.supported = + cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B | + OPA_PORT_PACKET_FORMAT_16B); + pi->port_packet_format.enabled = + cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B | + OPA_PORT_PACKET_FORMAT_16B); + + /* flit_control.interleave is (OPA V1, version .76): + * bits use + * ---- --- + * 2 res + * 2 DistanceSupported + * 2 DistanceEnabled + * 5 MaxNextLevelTxEnabled + * 5 MaxNestLevelRxSupported + * + * HFI supports only "distance mode 1" (see OPA V1, version .76, + * section 9.6.2), so set DistanceSupported, DistanceEnabled + * to 0x1. + */ + pi->flit_control.interleave = cpu_to_be16(0x1400); + + pi->link_down_reason = ppd->local_link_down_reason.sma; + pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma; + pi->port_error_action = cpu_to_be32(ppd->port_error_action); + pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096); + + /* 32.768 usec. response time (guessing) */ + pi->resptimevalue = 3; + + pi->local_port_num = port; + + /* buffer info for FM */ + pi->overall_buffer_space = cpu_to_be16(dd->link_credits); + + pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid); + pi->neigh_port_num = ppd->neighbor_port_number; + pi->port_neigh_mode = + (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) | + (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) | + (ppd->neighbor_fm_security ? + OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0); + + /* HFIs shall always return VL15 credits to their + * neighbor in a timely manner, without any credit return pacing. 
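+ * Hence the VL15 credit rate field below is reported as 0.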
+ */ + credit_rate = 0; + buffer_units = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC; + buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK; + buffer_units |= (credit_rate << 6) & + OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE; + buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT; + pi->buffer_units = cpu_to_be32(buffer_units); + + pi->opa_cap_mask = cpu_to_be16(ibp->rvp.port_cap3_flags); + pi->collectivemask_multicastmask = ((OPA_COLLECTIVE_NR & 0x7) + << 3 | (OPA_MCAST_NR & 0x7)); + + /* HFI supports a replay buffer 128 LTPs in size */ + pi->replay_depth.buffer = 0x80; + /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */ + read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp); + + /* + * this counter is 16 bits wide, but the replay_depth.wire + * variable is only 8 bits + */ + if (tmp > 0xff) + tmp = 0xff; + pi->replay_depth.wire = tmp; + + if (resp_len) + *resp_len += sizeof(struct opa_port_info); + + return reply((struct ib_mad_hdr *)smp); +} + +/** + * get_pkeys - return the PKEY table + * @dd: the hfi1_ib device + * @port: the IB port number + * @pkeys: the pkey table is placed here + */ +static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys) +{ + struct hfi1_pportdata *ppd = dd->pport + port - 1; + + memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys)); + + return 0; +} + +static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 n_blocks_req = OPA_AM_NBLK(am); + u32 start_block = am & 0x7ff; + __be16 *p; + u16 *q; + int i; + u16 n_blocks_avail; + unsigned npkeys = hfi1_get_npkeys(dd); + size_t size; + + if (n_blocks_req == 0) { + pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n", + port, start_block, n_blocks_req); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1; + + size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16); + + if (smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (start_block + n_blocks_req > n_blocks_avail || + n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) { + pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; " + "avail 0x%x; blk/smp 0x%lx\n", + start_block, n_blocks_req, n_blocks_avail, + OPA_NUM_PKEY_BLOCKS_PER_SMP); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + p = (__be16 *)data; + q = (u16 *)data; + /* get the real pkeys if we are requesting the first block */ + if (start_block == 0) { + get_pkeys(dd, port, q); + for (i = 0; i < npkeys; i++) + p[i] = cpu_to_be16(q[i]); + if (resp_len) + *resp_len += size; + } else { + smp->status |= IB_SMP_INVALID_FIELD; + } + return reply((struct ib_mad_hdr *)smp); +} + +enum { + HFI_TRANSITION_DISALLOWED, + HFI_TRANSITION_IGNORED, + HFI_TRANSITION_ALLOWED, + HFI_TRANSITION_UNDEFINED, +}; + +/* + * Use shortened names to improve readability of + * {logical,physical}_state_transitions + */ +enum { + __D = HFI_TRANSITION_DISALLOWED, + __I = HFI_TRANSITION_IGNORED, + __A = HFI_TRANSITION_ALLOWED, + __U = HFI_TRANSITION_UNDEFINED, +}; + +/* + * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are + * represented in physical_state_transitions. 
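+ * That is ten states in all, giving the 10x10 allowed[][] table below.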
+ */ +#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1) + +/* + * Within physical_state_transitions, rows represent "old" states, + * columns "new" states, and physical_state_transitions.allowed[old][new] + * indicates if the transition from old state to new state is legal (see + * OPAg1v1, Table 6-4). + */ +static const struct { + u8 allowed[__N_PHYSTATES][__N_PHYSTATES]; +} physical_state_transitions = { + { + /* 2 3 4 5 6 7 8 9 10 11 */ + /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D }, + /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A }, + /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D }, + /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D }, + /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D }, + /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I }, + } +}; + +/* + * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented + * logical_state_transitions + */ + +#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1) + +/* + * Within logical_state_transitions rows represent "old" states, + * columns "new" states, and logical_state_transitions.allowed[old][new] + * indicates if the transition from old state to new state is legal (see + * OPAg1v1, Table 9-12). + */ +static const struct { + u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES]; +} logical_state_transitions = { + { + /* 1 2 3 4 5 */ + /* 1 */ { __I, __D, __D, __D, __U}, + /* 2 */ { __D, __I, __A, __D, __U}, + /* 3 */ { __D, __D, __I, __A, __U}, + /* 4 */ { __D, __D, __I, __I, __U}, + /* 5 */ { __U, __U, __U, __U, __U}, + } +}; + +static int logical_transition_allowed(int old, int new) +{ + if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER || + new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) { + pr_warn("invalid logical state(s) (old %d new %d)\n", + old, new); + return HFI_TRANSITION_UNDEFINED; + } + + if (new == IB_PORT_NOP) + return HFI_TRANSITION_ALLOWED; /* always allowed */ + + /* adjust states for indexing into logical_state_transitions */ + old -= IB_PORT_DOWN; + new -= IB_PORT_DOWN; + + if (old < 0 || new < 0) + return HFI_TRANSITION_UNDEFINED; + return logical_state_transitions.allowed[old][new]; +} + +static int physical_transition_allowed(int old, int new) +{ + if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX || + new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) { + pr_warn("invalid physical state(s) (old %d new %d)\n", + old, new); + return HFI_TRANSITION_UNDEFINED; + } + + if (new == IB_PORTPHYSSTATE_NOP) + return HFI_TRANSITION_ALLOWED; /* always allowed */ + + /* adjust states for indexing into physical_state_transitions */ + old -= IB_PORTPHYSSTATE_POLLING; + new -= IB_PORTPHYSSTATE_POLLING; + + if (old < 0 || new < 0) + return HFI_TRANSITION_UNDEFINED; + return physical_state_transitions.allowed[old][new]; +} + +static int port_states_transition_allowed(struct hfi1_pportdata *ppd, + u32 logical_new, u32 physical_new) +{ + u32 physical_old = driver_pstate(ppd); + u32 logical_old = driver_lstate(ppd); + int ret, logical_allowed, physical_allowed; + + ret = logical_transition_allowed(logical_old, logical_new); + logical_allowed = ret; + + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + 
pr_warn("invalid logical state transition %s -> %s\n", + opa_lstate_name(logical_old), + opa_lstate_name(logical_new)); + return ret; + } + + ret = physical_transition_allowed(physical_old, physical_new); + physical_allowed = ret; + + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + pr_warn("invalid physical state transition %s -> %s\n", + opa_pstate_name(physical_old), + opa_pstate_name(physical_new)); + return ret; + } + + if (logical_allowed == HFI_TRANSITION_IGNORED && + physical_allowed == HFI_TRANSITION_IGNORED) + return HFI_TRANSITION_IGNORED; + + /* + * A change request of Physical Port State from + * 'Offline' to 'Polling' should be ignored. + */ + if ((physical_old == OPA_PORTPHYSSTATE_OFFLINE) && + (physical_new == IB_PORTPHYSSTATE_POLLING)) + return HFI_TRANSITION_IGNORED; + + /* + * Either physical_allowed or logical_allowed is + * HFI_TRANSITION_ALLOWED. + */ + return HFI_TRANSITION_ALLOWED; +} + +static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, + u32 logical_state, u32 phys_state) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 link_state; + int ret; + + ret = port_states_transition_allowed(ppd, logical_state, phys_state); + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + /* error message emitted above */ + smp->status |= IB_SMP_INVALID_FIELD; + return 0; + } + + if (ret == HFI_TRANSITION_IGNORED) + return 0; + + if ((phys_state != IB_PORTPHYSSTATE_NOP) && + !(logical_state == IB_PORT_DOWN || + logical_state == IB_PORT_NOP)){ + pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n", + logical_state, phys_state); + smp->status |= IB_SMP_INVALID_FIELD; + } + + /* + * Logical state changes are summarized in OPAv1g1 spec., + * Table 9-12; physical state changes are summarized in + * OPAv1g1 spec., Table 6.4. + */ + switch (logical_state) { + case IB_PORT_NOP: + if (phys_state == IB_PORTPHYSSTATE_NOP) + break; + /* FALLTHROUGH */ + case IB_PORT_DOWN: + if (phys_state == IB_PORTPHYSSTATE_NOP) { + link_state = HLS_DN_DOWNDEF; + } else if (phys_state == IB_PORTPHYSSTATE_POLLING) { + link_state = HLS_DN_POLL; + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_FM_BOUNCE, + 0, OPA_LINKDOWN_REASON_FM_BOUNCE); + } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) { + link_state = HLS_DN_DISABLE; + } else { + pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n", + phys_state); + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + + if ((link_state == HLS_DN_POLL || + link_state == HLS_DN_DOWNDEF)) { + /* + * Going to poll. No matter what the current state, + * always move offline first, then tune and start the + * link. This correctly handles a FM link bounce and + * a link enable. Going offline is a no-op if already + * offline. + */ + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } else { + set_link_state(ppd, link_state); + } + if (link_state == HLS_DN_DISABLE && + (ppd->offline_disabled_reason > + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) || + ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED); + /* + * Don't send a reply if the response would be sent + * through the disabled port. 
+ */ + if (link_state == HLS_DN_DISABLE && smp->hop_cnt) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + break; + case IB_PORT_ARMED: + ret = set_link_state(ppd, HLS_UP_ARMED); + if (!ret) + send_idle_sma(dd, SMA_IDLE_ARM); + break; + case IB_PORT_ACTIVE: + if (ppd->neighbor_normal) { + ret = set_link_state(ppd, HLS_UP_ACTIVE); + if (ret == 0) + send_idle_sma(dd, SMA_IDLE_ACTIVE); + } else { + pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n"); + smp->status |= IB_SMP_INVALID_FIELD; + } + break; + default: + pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n", + logical_state); + smp->status |= IB_SMP_INVALID_FIELD; + } + + return 0; +} + +/** + * subn_set_opa_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + */ +static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct opa_port_info *pi = (struct opa_port_info *)data; + struct ib_event event; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u8 clientrereg; + unsigned long flags; + u32 smlid; + u32 lid; + u8 ls_old, ls_new, ps_new; + u8 vls; + u8 msl; + u8 crc_enabled; + u16 lse, lwe, mtu; + u32 num_ports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + int ret, i, invalid = 0, call_set_mtu = 0; + int call_link_downgrade_policy = 0; + + if (num_ports != 1 || + smp_length_check(sizeof(*pi), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + lid = be32_to_cpu(pi->lid); + if (lid & 0xFF000000) { + pr_warn("OPA_PortInfo lid out of range: %X\n", lid); + smp->status |= IB_SMP_INVALID_FIELD; + goto get_only; + } + + + smlid = be32_to_cpu(pi->sm_lid); + if (smlid & 0xFF000000) { + pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid); + smp->status |= IB_SMP_INVALID_FIELD; + goto get_only; + } + + clientrereg = (pi->clientrereg_subnettimeout & + OPA_PI_MASK_CLIENT_REREGISTER); + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + ibp = &ppd->ibport_data; + event.device = ibdev; + event.element.port_num = port; + + ls_old = driver_lstate(ppd); + + ibp->rvp.mkey = pi->mkey; + if (ibp->rvp.gid_prefix != pi->subnet_prefix) { + ibp->rvp.gid_prefix = pi->subnet_prefix; + event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&event); + } + ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period); + + /* Must be a valid unicast LID address. 
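+	 * A LID of 0 is rejected once the port has moved beyond Init, and
+	 * 16B multicast LIDs are always rejected.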
*/ + if ((lid == 0 && ls_old > IB_PORT_INIT) || + (hfi1_is_16B_mcast(lid))) { + smp->status |= IB_SMP_INVALID_FIELD; + pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n", + lid); + } else if (ppd->lid != lid || + ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) { + if (ppd->lid != lid) + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT); + if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT); + hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + + if (HFI1_PORT_GUID_INDEX + 1 < HFI1_GUIDS_PER_PORT) { + /* Manufacture GID from LID to support extended + * addresses + */ + ppd->guids[HFI1_PORT_GUID_INDEX + 1] = + be64_to_cpu(OPA_MAKE_ID(lid)); + event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&event); + } + } + + msl = pi->smsl & OPA_PI_MASK_SMSL; + if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON) + ppd->linkinit_reason = + (pi->partenforce_filterraw & + OPA_PI_MASK_LINKINIT_REASON); + + /* Must be a valid unicast LID address. */ + if ((smlid == 0 && ls_old > IB_PORT_INIT) || + (hfi1_is_16B_mcast(smlid))) { + smp->status |= IB_SMP_INVALID_FIELD; + pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid); + } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) { + pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid); + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (ibp->rvp.sm_ah) { + if (smlid != ibp->rvp.sm_lid) + hfi1_modify_qp0_ah(ibp, ibp->rvp.sm_ah, smlid); + if (msl != ibp->rvp.sm_sl) + rdma_ah_set_sl(&ibp->rvp.sm_ah->attr, msl); + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + if (smlid != ibp->rvp.sm_lid) + ibp->rvp.sm_lid = smlid; + if (msl != ibp->rvp.sm_sl) + ibp->rvp.sm_sl = msl; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + if (pi->link_down_reason == 0) { + ppd->local_link_down_reason.sma = 0; + ppd->local_link_down_reason.latest = 0; + } + + if (pi->neigh_link_down_reason == 0) { + ppd->neigh_link_down_reason.sma = 0; + ppd->neigh_link_down_reason.latest = 0; + } + + ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp); + ppd->sa_qp = be32_to_cpu(pi->sa_qp); + + ppd->port_error_action = be32_to_cpu(pi->port_error_action); + lwe = be16_to_cpu(pi->link_width.enabled); + if (lwe) { + if (lwe == OPA_LINK_WIDTH_RESET || + lwe == OPA_LINK_WIDTH_RESET_OLD) + set_link_width_enabled(ppd, ppd->link_width_supported); + else if ((lwe & ~ppd->link_width_supported) == 0) + set_link_width_enabled(ppd, lwe); + else + smp->status |= IB_SMP_INVALID_FIELD; + } + lwe = be16_to_cpu(pi->link_width_downgrade.enabled); + /* LWD.E is always applied - 0 means "disabled" */ + if (lwe == OPA_LINK_WIDTH_RESET || + lwe == OPA_LINK_WIDTH_RESET_OLD) { + set_link_width_downgrade_enabled(ppd, + ppd-> + link_width_downgrade_supported + ); + } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) { + /* only set and apply if something changed */ + if (lwe != ppd->link_width_downgrade_enabled) { + set_link_width_downgrade_enabled(ppd, lwe); + call_link_downgrade_policy = 1; + } + } else { + smp->status |= IB_SMP_INVALID_FIELD; + } + lse = be16_to_cpu(pi->link_speed.enabled); + if (lse) { + if (lse & be16_to_cpu(pi->link_speed.supported)) + set_link_speed_enabled(ppd, lse); + else + smp->status |= IB_SMP_INVALID_FIELD; + } + + ibp->rvp.mkeyprot = + (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6; + ibp->rvp.vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF; + (void)hfi1_set_ib_cfg(ppd, 
HFI1_IB_CFG_VL_HIGH_LIMIT, + ibp->rvp.vl_high_limit); + + if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || + ppd->vls_supported > ARRAY_SIZE(dd->vld)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + for (i = 0; i < ppd->vls_supported; i++) { + if ((i % 2) == 0) + mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i / 2] >> + 4) & 0xF); + else + mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i / 2] & + 0xF); + if (mtu == 0xffff) { + pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n", + mtu, + (pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF); + smp->status |= IB_SMP_INVALID_FIELD; + mtu = hfi1_max_mtu; /* use a valid MTU */ + } + if (dd->vld[i].mtu != mtu) { + dd_dev_info(dd, + "MTU change on vl %d from %d to %d\n", + i, dd->vld[i].mtu, mtu); + dd->vld[i].mtu = mtu; + call_set_mtu++; + } + } + /* As per OPAV1 spec: VL15 must support and be configured + * for operation with a 2048 or larger MTU. + */ + mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15 / 2] & 0xF); + if (mtu < 2048 || mtu == 0xffff) + mtu = 2048; + if (dd->vld[15].mtu != mtu) { + dd_dev_info(dd, + "MTU change on vl 15 from %d to %d\n", + dd->vld[15].mtu, mtu); + dd->vld[15].mtu = mtu; + call_set_mtu++; + } + if (call_set_mtu) + set_mtu(ppd); + + /* Set operational VLs */ + vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL; + if (vls) { + if (vls > ppd->vls_supported) { + pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n", + pi->operational_vls); + smp->status |= IB_SMP_INVALID_FIELD; + } else { + if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS, + vls) == -EINVAL) + smp->status |= IB_SMP_INVALID_FIELD; + } + } + + if (pi->mkey_violations == 0) + ibp->rvp.mkey_violations = 0; + + if (pi->pkey_violations == 0) + ibp->rvp.pkey_violations = 0; + + if (pi->qkey_violations == 0) + ibp->rvp.qkey_violations = 0; + + ibp->rvp.subnet_timeout = + pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT; + + crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode); + crc_enabled >>= 4; + crc_enabled &= 0xf; + + if (crc_enabled != 0) + ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled); + + ppd->is_active_optimize_enabled = + !!(be16_to_cpu(pi->port_mode) + & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE); + + ls_new = pi->port_states.portphysstate_portstate & + OPA_PI_MASK_PORT_STATE; + ps_new = (pi->port_states.portphysstate_portstate & + OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4; + + if (ls_old == IB_PORT_INIT) { + if (start_of_sm_config) { + if (ls_new == ls_old || (ls_new == IB_PORT_ARMED)) + ppd->is_sm_config_started = 1; + } else if (ls_new == IB_PORT_ARMED) { + if (ppd->is_sm_config_started == 0) { + invalid = 1; + smp->status |= IB_SMP_INVALID_FIELD; + } + } + } + + /* Handle CLIENT_REREGISTER event b/c SM asked us for it */ + if (clientrereg) { + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + + if (!invalid) { + ret = set_port_states(ppd, smp, ls_new, ps_new); + if (ret) + return ret; + } + + ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len, + max_len); + + /* restore re-reg bit per o14-12.2.1 */ + pi->clientrereg_subnettimeout |= clientrereg; + + /* + * Apply the new link downgrade policy. This may result in a link + * bounce. Do this after everything else so things are settled. 
+ * Possible problem: if setting the port state above fails, then + * the policy change is not applied. + */ + if (call_link_downgrade_policy) + apply_link_downgrade_policy(ppd, 0); + + return ret; + +get_only: + return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len, + max_len); +} + +/** + * set_pkeys - set the PKEY table for ctxt 0 + * @dd: the hfi1_ib device + * @port: the IB port number + * @pkeys: the PKEY table + */ +static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys) +{ + struct hfi1_pportdata *ppd; + int i; + int changed = 0; + int update_includes_mgmt_partition = 0; + + /* + * IB port one/two always maps to context zero/one, + * always a kernel context, no locking needed + * If we get here with ppd setup, no need to check + * that rcd is valid. + */ + ppd = dd->pport + (port - 1); + /* + * If the update does not include the management pkey, don't do it. + */ + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (pkeys[i] == LIM_MGMT_P_KEY) { + update_includes_mgmt_partition = 1; + break; + } + } + + if (!update_includes_mgmt_partition) + return 1; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = ppd->pkeys[i]; + + if (key == okey) + continue; + /* + * The SM gives us the complete PKey table. We have + * to ensure that we put the PKeys in the matching + * slots. + */ + ppd->pkeys[i] = key; + changed = 1; + } + + if (changed) { + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); + hfi1_event_pkey_change(dd, port); + } + + return 0; +} + +static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 n_blocks_sent = OPA_AM_NBLK(am); + u32 start_block = am & 0x7ff; + u16 *p = (u16 *)data; + __be16 *q = (__be16 *)data; + int i; + u16 n_blocks_avail; + unsigned npkeys = hfi1_get_npkeys(dd); + u32 size = 0; + + if (n_blocks_sent == 0) { + pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n", + port, start_block, n_blocks_sent); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1; + + size = sizeof(u16) * (n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE); + + if (smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (start_block + n_blocks_sent > n_blocks_avail || + n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) { + pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n", + start_block, n_blocks_sent, n_blocks_avail, + OPA_NUM_PKEY_BLOCKS_PER_SMP); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++) + p[i] = be16_to_cpu(q[i]); + + if (start_block == 0 && set_pkeys(dd, port, p) != 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len, + max_len); +} + +#define ILLEGAL_VL 12 +/* + * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except + * for SC15, which must map to VL15). If we don't remap things this + * way it is possible for VL15 counters to increment when we try to + * send on a SC which is mapped to an invalid VL. + * When getting the table convert ILLEGAL_VL back to VL15. 
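+ * For example, with set == true an SC entry of 0xf (VL15) for any SC
+ * other than SC15 is rewritten to ILLEGAL_VL (12); with set == false
+ * an entry of ILLEGAL_VL is rewritten back to 0xf before the table is
+ * returned.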
+ */ +static void filter_sc2vlt(void *data, bool set) +{ + int i; + u8 *pd = data; + + for (i = 0; i < OPA_MAX_SCS; i++) { + if (i == 15) + continue; + + if (set) { + if ((pd[i] & 0x1f) == 0xf) + pd[i] = ILLEGAL_VL; + } else { + if ((pd[i] & 0x1f) == ILLEGAL_VL) + pd[i] = 0xf; + } + } +} + +static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data) +{ + u64 *val = data; + + filter_sc2vlt(data, true); + + write_csr(dd, SEND_SC2VLT0, *val++); + write_csr(dd, SEND_SC2VLT1, *val++); + write_csr(dd, SEND_SC2VLT2, *val++); + write_csr(dd, SEND_SC2VLT3, *val++); + write_seqlock_irq(&dd->sc2vl_lock); + memcpy(dd->sc2vl, data, sizeof(dd->sc2vl)); + write_sequnlock_irq(&dd->sc2vl_lock); + return 0; +} + +static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data) +{ + u64 *val = (u64 *)data; + + *val++ = read_csr(dd, SEND_SC2VLT0); + *val++ = read_csr(dd, SEND_SC2VLT1); + *val++ = read_csr(dd, SEND_SC2VLT2); + *val++ = read_csr(dd, SEND_SC2VLT3); + + filter_sc2vlt((u64 *)data, false); + return 0; +} + +static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = data; + size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */ + unsigned i; + + if (am || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) + *p++ = ibp->sl_to_sc[i]; + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = data; + size_t size = ARRAY_SIZE(ibp->sl_to_sc); + int i; + u8 sc; + + if (am || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) { + sc = *p++; + if (ibp->sl_to_sc[i] != sc) { + ibp->sl_to_sc[i] = sc; + + /* Put all stale qps into error state */ + hfi1_error_port_qps(ibp, i); + } + } + + return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = data; + size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */ + unsigned i; + + if (am || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++) + *p++ = ibp->sc_to_sl[i]; + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + size_t size = ARRAY_SIZE(ibp->sc_to_sl); + u8 *p = data; + int i; + + if (am || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++) + ibp->sc_to_sl[i] = *p++; + + return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, 
u8 port, + u32 *resp_len, u32 max_len) +{ + u32 n_blocks = OPA_AM_NBLK(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + void *vp = (void *)data; + size_t size = 4 * sizeof(u64); + + if (n_blocks != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + get_sc2vlt_tables(dd, vp); + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + u32 n_blocks = OPA_AM_NBLK(am); + int async_update = OPA_AM_ASYNC(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + void *vp = (void *)data; + struct hfi1_pportdata *ppd; + int lstate; + /* + * set_sc2vlt_tables writes the information contained in *data + * to four 64-bit registers SendSC2VLt[0-3]. We need to make + * sure *max_len is not greater than the total size of the four + * SendSC2VLt[0-3] registers. + */ + size_t size = 4 * sizeof(u64); + + if (n_blocks != 1 || async_update || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + lstate = driver_lstate(ppd); + /* + * it's known that async_update is 0 by this point, but include + * the explicit check for clarity + */ + if (!async_update && + (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + set_sc2vlt_tables(dd, vp); + + return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + u32 n_blocks = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + void *vp = (void *)data; + int size = sizeof(struct sc2vlnt); + + if (n_blocks != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + + fm_get_table(ppd, FM_TBL_SC2VLNT, vp); + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + u32 n_blocks = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + void *vp = (void *)data; + int lstate; + int size = sizeof(struct sc2vlnt); + + if (n_blocks != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + lstate = driver_lstate(ppd); + if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + + fm_set_table(ppd, FM_TBL_SC2VLNT, vp); + + return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len, max_len); +} + +static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + u32 nports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 lstate; + struct hfi1_ibport *ibp; + struct hfi1_pportdata 
*ppd; + struct opa_port_state_info *psi = (struct opa_port_state_info *)data; + + if (nports != 1 || smp_length_check(sizeof(*psi), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ibp = to_iport(ibdev, port); + ppd = ppd_from_ibp(ibp); + + lstate = driver_lstate(ppd); + + if (start_of_sm_config && (lstate == IB_PORT_INIT)) + ppd->is_sm_config_started = 1; + + psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; + psi->port_states.ledenable_offlinereason |= + ppd->is_sm_config_started << 5; + psi->port_states.ledenable_offlinereason |= + ppd->offline_disabled_reason; + + psi->port_states.portphysstate_portstate = + (driver_pstate(ppd) << 4) | (lstate & 0xf); + psi->link_width_downgrade_tx_active = + cpu_to_be16(ppd->link_width_downgrade_tx_active); + psi->link_width_downgrade_rx_active = + cpu_to_be16(ppd->link_width_downgrade_rx_active); + if (resp_len) + *resp_len += sizeof(struct opa_port_state_info); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + u32 nports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 ls_old; + u8 ls_new, ps_new; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct opa_port_state_info *psi = (struct opa_port_state_info *)data; + int ret, invalid = 0; + + if (nports != 1 || smp_length_check(sizeof(*psi), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ibp = to_iport(ibdev, port); + ppd = ppd_from_ibp(ibp); + + ls_old = driver_lstate(ppd); + + ls_new = port_states_to_logical_state(&psi->port_states); + ps_new = port_states_to_phys_state(&psi->port_states); + + if (ls_old == IB_PORT_INIT) { + if (start_of_sm_config) { + if (ls_new == ls_old || (ls_new == IB_PORT_ARMED)) + ppd->is_sm_config_started = 1; + } else if (ls_new == IB_PORT_ARMED) { + if (ppd->is_sm_config_started == 0) { + invalid = 1; + smp->status |= IB_SMP_INVALID_FIELD; + } + } + } + + if (!invalid) { + ret = set_port_states(ppd, smp, ls_new, ps_new); + if (ret) + return ret; + } + + return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 addr = OPA_AM_CI_ADDR(am); + u32 len = OPA_AM_CI_LEN(am) + 1; + int ret; + + if (dd->pport->port_type != PORT_TYPE_QSFP || + smp_length_check(len, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + +#define __CI_PAGE_SIZE BIT(7) /* 128 bytes */ +#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1) +#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK) + + /* + * check that addr is within spec, and + * addr and (addr + len - 1) are on the same "page" + */ + if (addr >= 4096 || + (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ret = get_cable_info(dd, port, addr, len, data); + + if (ret == -ENODEV) { + smp->status |= IB_SMP_UNSUP_METH_ATTR; + return reply((struct ib_mad_hdr *)smp); + } + + /* The address range for the CableInfo SMA query is wider than the + * memory available on the QSFP cable. 
We want to return a valid + * response, albeit zeroed out, for address ranges beyond available + * memory but that are within the CableInfo query spec + */ + if (ret < 0 && ret != -ERANGE) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (resp_len) + *resp_len += len; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, u32 *resp_len, + u32 max_len) +{ + u32 num_ports = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + struct buffer_control *p = (struct buffer_control *)data; + int size = sizeof(struct buffer_control); + + if (num_ports != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p); + trace_bct_get(dd, p); + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, u32 *resp_len, + u32 max_len) +{ + u32 num_ports = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + struct buffer_control *p = (struct buffer_control *)data; + + if (num_ports != 1 || smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + ppd = dd->pport + (port - 1); + trace_bct_set(dd, p); + if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + u32 num_ports = OPA_AM_NPORT(am); + u8 section = (am & 0x00ff0000) >> 16; + u8 *p = data; + int size = 256; + + if (num_ports != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + switch (section) { + case OPA_VLARB_LOW_ELEMENTS: + fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p); + break; + case OPA_VLARB_HIGH_ELEMENTS: + fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p); + break; + case OPA_VLARB_PREEMPT_ELEMENTS: + fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p); + break; + case OPA_VLARB_PREEMPT_MATRIX: + fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p); + break; + default: + pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n", + be32_to_cpu(smp->attr_mod)); + smp->status |= IB_SMP_INVALID_FIELD; + size = 0; + break; + } + + if (size > 0 && resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + u32 num_ports = OPA_AM_NPORT(am); + u8 section = (am & 0x00ff0000) >> 16; + u8 *p = data; + int size = 256; + + if (num_ports != 1 || smp_length_check(size, max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + switch (section) { + case OPA_VLARB_LOW_ELEMENTS: + (void)fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p); + break; + case OPA_VLARB_HIGH_ELEMENTS: + (void)fm_set_table(ppd, 
FM_TBL_VL_HIGH_ARB, p); + break; + /* + * neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX + * can be changed from the default values + */ + case OPA_VLARB_PREEMPT_ELEMENTS: + /* FALLTHROUGH */ + case OPA_VLARB_PREEMPT_MATRIX: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + break; + default: + pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n", + be32_to_cpu(smp->attr_mod)); + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + + return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len, + max_len); +} + +struct opa_pma_mad { + struct ib_mad_hdr mad_hdr; + u8 data[2024]; +} __packed; + +struct opa_port_status_req { + __u8 port_num; + __u8 reserved[3]; + __be32 vl_select_mask; +}; + +#define VL_MASK_ALL 0x000080ff + +struct opa_port_status_rsp { + __u8 port_num; + __u8 reserved[3]; + __be32 vl_select_mask; + + /* Data counters */ + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_pkts; + __be64 port_rcv_pkts; + __be64 port_multicast_xmit_pkts; + __be64 port_multicast_rcv_pkts; + __be64 port_xmit_wait; + __be64 sw_port_congestion; + __be64 port_rcv_fecn; + __be64 port_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_xmit_wasted_bw; + __be64 port_xmit_wait_data; + __be64 port_rcv_bubble; + __be64 port_mark_fecn; + /* Error counters */ + __be64 port_rcv_constraint_errors; + __be64 port_rcv_switch_relay_errors; + __be64 port_xmit_discards; + __be64 port_xmit_constraint_errors; + __be64 port_rcv_remote_physical_errors; + __be64 local_link_integrity_errors; + __be64 port_rcv_errors; + __be64 excessive_buffer_overruns; + __be64 fm_config_errors; + __be32 link_error_recovery; + __be32 link_downed; + u8 uncorrectable_errors; + + u8 link_quality_indicator; /* 5res, 3bit */ + u8 res2[6]; + struct _vls_pctrs { + /* per-VL Data counters */ + __be64 port_vl_xmit_data; + __be64 port_vl_rcv_data; + __be64 port_vl_xmit_pkts; + __be64 port_vl_rcv_pkts; + __be64 port_vl_xmit_wait; + __be64 sw_port_vl_congestion; + __be64 port_vl_rcv_fecn; + __be64 port_vl_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_vl_xmit_wasted_bw; + __be64 port_vl_xmit_wait_data; + __be64 port_vl_rcv_bubble; + __be64 port_vl_mark_fecn; + __be64 port_vl_xmit_discards; + } vls[0]; /* real array size defined by # bits set in vl_select_mask */ +}; + +enum counter_selects { + CS_PORT_XMIT_DATA = (1 << 31), + CS_PORT_RCV_DATA = (1 << 30), + CS_PORT_XMIT_PKTS = (1 << 29), + CS_PORT_RCV_PKTS = (1 << 28), + CS_PORT_MCAST_XMIT_PKTS = (1 << 27), + CS_PORT_MCAST_RCV_PKTS = (1 << 26), + CS_PORT_XMIT_WAIT = (1 << 25), + CS_SW_PORT_CONGESTION = (1 << 24), + CS_PORT_RCV_FECN = (1 << 23), + CS_PORT_RCV_BECN = (1 << 22), + CS_PORT_XMIT_TIME_CONG = (1 << 21), + CS_PORT_XMIT_WASTED_BW = (1 << 20), + CS_PORT_XMIT_WAIT_DATA = (1 << 19), + CS_PORT_RCV_BUBBLE = (1 << 18), + CS_PORT_MARK_FECN = (1 << 17), + CS_PORT_RCV_CONSTRAINT_ERRORS = (1 << 16), + CS_PORT_RCV_SWITCH_RELAY_ERRORS = (1 << 15), + CS_PORT_XMIT_DISCARDS = (1 << 14), + CS_PORT_XMIT_CONSTRAINT_ERRORS = (1 << 13), + CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS = (1 << 12), + CS_LOCAL_LINK_INTEGRITY_ERRORS = (1 << 11), + CS_PORT_RCV_ERRORS = (1 << 10), + CS_EXCESSIVE_BUFFER_OVERRUNS = (1 << 9), + CS_FM_CONFIG_ERRORS = (1 << 8), + CS_LINK_ERROR_RECOVERY = (1 << 7), + CS_LINK_DOWNED = (1 << 6), + CS_UNCORRECTABLE_ERRORS = (1 << 5), +}; + +struct opa_clear_port_status { + __be64 port_select_mask[4]; + __be32 counter_select_mask; +}; + +struct opa_aggregate { + __be16 attr_id; + __be16 err_reqlength; /* 1 bit, 8 res, 7 bit */ + __be32 attr_mod; + u8 data[0]; +}; + 
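+/*
+ * Sizing sketch, for illustration only: a PortStatus response carries one
+ * struct _vls_pctrs per bit set in vl_select_mask, and the whole response
+ * has to fit in the 2024-byte opa_pma_mad payload.  The helper below is
+ * not used by the driver; pma_get_opa_portstatus() does the same
+ * arithmetic inline and fails oversized requests with
+ * OPA_PM_STATUS_REQUEST_TOO_LARGE.
+ */
+static inline size_t opa_portstatus_rsp_size(u32 vl_select_mask)
+{
+	return sizeof(struct opa_port_status_rsp) +
+	       hweight32(vl_select_mask) * sizeof(struct _vls_pctrs);
+}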
+#define MSK_LLI 0x000000f0 +#define MSK_LLI_SFT 4 +#define MSK_LER 0x0000000f +#define MSK_LER_SFT 0 +#define ADD_LLI 8 +#define ADD_LER 2 + +/* Request contains first three fields, response contains those plus the rest */ +struct opa_port_data_counters_msg { + __be64 port_select_mask[4]; + __be32 vl_select_mask; + __be32 resolution; + + /* Response fields follow */ + struct _port_dctrs { + u8 port_number; + u8 reserved2[3]; + __be32 link_quality_indicator; /* 29res, 3bit */ + + /* Data counters */ + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_pkts; + __be64 port_rcv_pkts; + __be64 port_multicast_xmit_pkts; + __be64 port_multicast_rcv_pkts; + __be64 port_xmit_wait; + __be64 sw_port_congestion; + __be64 port_rcv_fecn; + __be64 port_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_xmit_wasted_bw; + __be64 port_xmit_wait_data; + __be64 port_rcv_bubble; + __be64 port_mark_fecn; + + __be64 port_error_counter_summary; + /* Sum of error counts/port */ + + struct _vls_dctrs { + /* per-VL Data counters */ + __be64 port_vl_xmit_data; + __be64 port_vl_rcv_data; + __be64 port_vl_xmit_pkts; + __be64 port_vl_rcv_pkts; + __be64 port_vl_xmit_wait; + __be64 sw_port_vl_congestion; + __be64 port_vl_rcv_fecn; + __be64 port_vl_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_vl_xmit_wasted_bw; + __be64 port_vl_xmit_wait_data; + __be64 port_vl_rcv_bubble; + __be64 port_vl_mark_fecn; + } vls[0]; + /* array size defined by #bits set in vl_select_mask*/ + } port[1]; /* array size defined by #ports in attribute modifier */ +}; + +struct opa_port_error_counters64_msg { + /* + * Request contains first two fields, response contains the + * whole magilla + */ + __be64 port_select_mask[4]; + __be32 vl_select_mask; + + /* Response-only fields follow */ + __be32 reserved1; + struct _port_ectrs { + u8 port_number; + u8 reserved2[7]; + __be64 port_rcv_constraint_errors; + __be64 port_rcv_switch_relay_errors; + __be64 port_xmit_discards; + __be64 port_xmit_constraint_errors; + __be64 port_rcv_remote_physical_errors; + __be64 local_link_integrity_errors; + __be64 port_rcv_errors; + __be64 excessive_buffer_overruns; + __be64 fm_config_errors; + __be32 link_error_recovery; + __be32 link_downed; + u8 uncorrectable_errors; + u8 reserved3[7]; + struct _vls_ectrs { + __be64 port_vl_xmit_discards; + } vls[0]; + /* array size defined by #bits set in vl_select_mask */ + } port[1]; /* array size defined by #ports in attribute modifier */ +}; + +struct opa_port_error_info_msg { + __be64 port_select_mask[4]; + __be32 error_info_select_mask; + __be32 reserved1; + struct _port_ei { + u8 port_number; + u8 reserved2[7]; + + /* PortRcvErrorInfo */ + struct { + u8 status_and_code; + union { + u8 raw[17]; + struct { + /* EI1to12 format */ + u8 packet_flit1[8]; + u8 packet_flit2[8]; + u8 remaining_flit_bits12; + } ei1to12; + struct { + u8 packet_bytes[8]; + u8 remaining_flit_bits; + } ei13; + } ei; + u8 reserved3[6]; + } __packed port_rcv_ei; + + /* ExcessiveBufferOverrunInfo */ + struct { + u8 status_and_sc; + u8 reserved4[7]; + } __packed excessive_buffer_overrun_ei; + + /* PortXmitConstraintErrorInfo */ + struct { + u8 status; + u8 reserved5; + __be16 pkey; + __be32 slid; + } __packed port_xmit_constraint_ei; + + /* PortRcvConstraintErrorInfo */ + struct { + u8 status; + u8 reserved6; + __be16 pkey; + __be32 slid; + } __packed port_rcv_constraint_ei; + + /* PortRcvSwitchRelayErrorInfo */ + struct { + u8 status_and_code; + u8 reserved7[3]; + __u32 error_info; + } __packed port_rcv_switch_relay_ei; + + /* 
UncorrectableErrorInfo */ + struct { + u8 status_and_code; + u8 reserved8; + } __packed uncorrectable_ei; + + /* FMConfigErrorInfo */ + struct { + u8 status_and_code; + u8 error_info; + } __packed fm_config_ei; + __u32 reserved9; + } port[1]; /* actual array size defined by #ports in attr modifier */ +}; + +/* opa_port_error_info_msg error_info_select_mask bit definitions */ +enum error_info_selects { + ES_PORT_RCV_ERROR_INFO = (1 << 31), + ES_EXCESSIVE_BUFFER_OVERRUN_INFO = (1 << 30), + ES_PORT_XMIT_CONSTRAINT_ERROR_INFO = (1 << 29), + ES_PORT_RCV_CONSTRAINT_ERROR_INFO = (1 << 28), + ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO = (1 << 27), + ES_UNCORRECTABLE_ERROR_INFO = (1 << 26), + ES_FM_CONFIG_ERROR_INFO = (1 << 25) +}; + +static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, u32 *resp_len) +{ + struct opa_class_port_info *p = + (struct opa_class_port_info *)pmp->data; + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + p->base_version = OPA_MGMT_BASE_VERSION; + p->class_version = OPA_SM_CLASS_VERSION; + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. + */ + p->cap_mask2_resp_time = cpu_to_be32(18); + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)pmp); +} + +static void a0_portstatus(struct hfi1_pportdata *ppd, + struct opa_port_status_rsp *rsp, u32 vl_select_mask) +{ + if (!is_bx(ppd->dd)) { + unsigned long vl; + u64 sum_vl_xmit_wait = 0; + u32 vl_all_mask = VL_MASK_ALL; + + for_each_set_bit(vl, (unsigned long *)&(vl_all_mask), + 8 * sizeof(vl_all_mask)) { + u64 tmp = sum_vl_xmit_wait + + read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl)); + if (tmp < sum_vl_xmit_wait) { + /* we wrapped */ + sum_vl_xmit_wait = (u64)~0; + break; + } + sum_vl_xmit_wait = tmp; + } + if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait) + rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait); + } +} + +/** + * tx_link_width - convert link width bitmask to integer + * value representing actual link width. + * @link_width: width of active link + * @return: return index of the bit set in link_width var + * + * The function convert and return the index of bit set + * that indicate the current link width. + */ +u16 tx_link_width(u16 link_width) +{ + int n = LINK_WIDTH_DEFAULT; + u16 tx_width = n; + + while (link_width && n) { + if (link_width & (1 << (n - 1))) { + tx_width = n; + break; + } + n--; + } + + return tx_width; +} + +/** + * get_xmit_wait_counters - Convert HFI 's SendWaitCnt/SendWaitVlCnt + * counter in unit of TXE cycle times to flit times. + * @ppd: info of physical Hfi port + * @link_width: width of active link + * @link_speed: speed of active link + * @vl: represent VL0-VL7, VL15 for PortVLXmitWait counters request + * and if vl value is C_VL_COUNT, it represent SendWaitCnt + * counter request + * @return: return SendWaitCnt/SendWaitVlCnt counter value per vl. + * + * Convert SendWaitCnt/SendWaitVlCnt counter from TXE cycle times to + * flit times. Call this function to samples these counters. This + * function will calculate for previous state transition and update + * current state at end of function using ppd->prev_link_width and + * ppd->port_vl_xmit_wait_last to port_vl_xmit_wait_curr and link_width. 
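+ * In other words, each call takes the raw counter delta since the last
+ * sample, converts that delta from TXE cycle times to flit times using
+ * the previously recorded link width and the given link speed, and
+ * accumulates the result in ppd->vl_xmit_flit_cnt[vl], which is the
+ * value returned to the caller.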
+ */ +u64 get_xmit_wait_counters(struct hfi1_pportdata *ppd, + u16 link_width, u16 link_speed, int vl) +{ + u64 port_vl_xmit_wait_curr; + u64 delta_vl_xmit_wait; + u64 xmit_wait_val; + + if (vl > C_VL_COUNT) + return 0; + if (vl < C_VL_COUNT) + port_vl_xmit_wait_curr = + read_port_cntr(ppd, C_TX_WAIT_VL, vl); + else + port_vl_xmit_wait_curr = + read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL); + + xmit_wait_val = + port_vl_xmit_wait_curr - + ppd->port_vl_xmit_wait_last[vl]; + delta_vl_xmit_wait = + convert_xmit_counter(xmit_wait_val, + ppd->prev_link_width, + link_speed); + + ppd->vl_xmit_flit_cnt[vl] += delta_vl_xmit_wait; + ppd->port_vl_xmit_wait_last[vl] = port_vl_xmit_wait_curr; + ppd->prev_link_width = link_width; + + return ppd->vl_xmit_flit_cnt[vl]; +} + +static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct opa_port_status_req *req = + (struct opa_port_status_req *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct opa_port_status_rsp *rsp; + u32 vl_select_mask = be32_to_cpu(req->vl_select_mask); + unsigned long vl; + size_t response_data_size; + u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + u8 port_num = req->port_num; + u8 num_vls = hweight32(vl_select_mask); + struct _vls_pctrs *vlinfo; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + int vfi; + u64 tmp, tmp2; + u16 link_width; + u16 link_speed; + + response_data_size = sizeof(struct opa_port_status_rsp) + + num_vls * sizeof(struct _vls_pctrs); + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE; + return reply((struct ib_mad_hdr *)pmp); + } + + if (nports != 1 || (port_num && port_num != port) || + num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + memset(pmp->data, 0, sizeof(pmp->data)); + + rsp = (struct opa_port_status_rsp *)pmp->data; + if (port_num) + rsp->port_num = port_num; + else + rsp->port_num = port; + + rsp->port_rcv_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL)); + + hfi1_read_link_quality(dd, &rsp->link_quality_indicator); + + rsp->vl_select_mask = cpu_to_be32(vl_select_mask); + rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS, + CNTR_INVALID_VL)); + rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_xmit_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, + CNTR_INVALID_VL)); + /* + * Convert PortXmitWait counter from TXE cycle times + * to flit times. 
+ */ + link_width = + tx_link_width(ppd->link_width_downgrade_tx_active); + link_speed = get_link_speed(ppd->link_speed_active); + rsp->port_xmit_wait = + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, C_VL_COUNT)); + rsp->port_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); + rsp->port_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL)); + rsp->port_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL)); + rsp->port_xmit_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + rsp->local_link_integrity_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL); + if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { + /* overflow/wrapped */ + rsp->link_error_recovery = cpu_to_be32(~0); + } else { + rsp->link_error_recovery = cpu_to_be32(tmp2); + } + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); + rsp->excessive_buffer_overruns = + cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); + rsp->fm_config_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL)); + rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL)); + + /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */ + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; + + vlinfo = &rsp->vls[0]; + vfi = 0; + /* The vl_select_mask has been checked above, and we know + * that it contains only entries which represent valid VLs. + * So in the for_each_set_bit() loop below, we don't need + * any additional checks for vl. + */ + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(vl_select_mask)) { + memset(vlinfo, 0, sizeof(*vlinfo)); + + tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl)); + rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp); + + rsp->vls[vfi].port_vl_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_data = + cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_pkts = + cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, + idx_from_vl(vl))); + /* + * Convert PortVlXmitWait counter from TXE cycle + * times to flit times. 
+ */ + rsp->vls[vfi].port_vl_xmit_wait = + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl))); + vlinfo++; + vfi++; + } + + a0_portstatus(ppd, rsp, vl_select_mask); + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port, + u8 res_lli, u8 res_ler) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u64 error_counter_summary = 0, tmp; + + error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL); + /* port_rcv_switch_relay_errors is 0 for HFIs */ + error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL); + error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL); + /* local link integrity must be right-shifted by the lli resolution */ + error_counter_summary += (read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL) >> res_lli); + /* link error recovery must b right-shifted by the ler resolution */ + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL); + error_counter_summary += (tmp >> res_ler); + error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL); + /* ppd->link_downed is a 32-bit value */ + error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL); + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + /* this is an 8-bit quantity */ + error_counter_summary += tmp < 0x100 ? 
(tmp & 0xff) : 0xff; + + return error_counter_summary; +} + +static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp, + u32 vl_select_mask) +{ + if (!is_bx(ppd->dd)) { + unsigned long vl; + u64 sum_vl_xmit_wait = 0; + u32 vl_all_mask = VL_MASK_ALL; + + for_each_set_bit(vl, (unsigned long *)&(vl_all_mask), + 8 * sizeof(vl_all_mask)) { + u64 tmp = sum_vl_xmit_wait + + read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl)); + if (tmp < sum_vl_xmit_wait) { + /* we wrapped */ + sum_vl_xmit_wait = (u64)~0; + break; + } + sum_vl_xmit_wait = tmp; + } + if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait) + rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait); + } +} + +static void pma_get_opa_port_dctrs(struct ib_device *ibdev, + struct _port_dctrs *rsp) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + + rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS, + CNTR_INVALID_VL)); + rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_xmit_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, + CNTR_INVALID_VL)); +} + +static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct opa_port_data_counters_msg *req = + (struct opa_port_data_counters_msg *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct _port_dctrs *rsp; + struct _vls_dctrs *vlinfo; + size_t response_data_size; + u32 num_ports; + u8 lq, num_vls; + u8 res_lli, res_ler; + u64 port_mask; + u8 port_num; + unsigned long vl; + u32 vl_select_mask; + int vfi; + u16 link_width; + u16 link_speed; + + num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + num_vls = hweight32(be32_to_cpu(req->vl_select_mask)); + vl_select_mask = be32_to_cpu(req->vl_select_mask); + res_lli = (u8)(be32_to_cpu(req->resolution) & MSK_LLI) >> MSK_LLI_SFT; + res_lli = res_lli ? res_lli + ADD_LLI : 0; + res_ler = (u8)(be32_to_cpu(req->resolution) & MSK_LER) >> MSK_LER_SFT; + res_ler = res_ler ? res_ler + ADD_LER : 0; + + if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* Sanity check */ + response_data_size = sizeof(struct opa_port_data_counters_msg) + + num_vls * sizeof(struct _vls_dctrs); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the + * port the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask) * 8); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + rsp = &req->port[0]; + memset(rsp, 0, sizeof(*rsp)); + + rsp->port_number = port; + /* + * Note that link_quality_indicator is a 32 bit quantity in + * 'datacounters' queries (as opposed to 'portinfo' queries, + * where it's a byte). 
+ */ + hfi1_read_link_quality(dd, &lq); + rsp->link_quality_indicator = cpu_to_be32((u32)lq); + pma_get_opa_port_dctrs(ibdev, rsp); + + /* + * Convert PortXmitWait counter from TXE + * cycle times to flit times. + */ + link_width = + tx_link_width(ppd->link_width_downgrade_tx_active); + link_speed = get_link_speed(ppd->link_speed_active); + rsp->port_xmit_wait = + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, C_VL_COUNT)); + rsp->port_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); + rsp->port_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL)); + rsp->port_error_counter_summary = + cpu_to_be64(get_error_counter_summary(ibdev, port, + res_lli, res_ler)); + + vlinfo = &rsp->vls[0]; + vfi = 0; + /* The vl_select_mask has been checked above, and we know + * that it contains only entries which represent valid VLs. + * So in the for_each_set_bit() loop below, we don't need + * any additional checks for vl. + */ + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(req->vl_select_mask)) { + memset(vlinfo, 0, sizeof(*vlinfo)); + + rsp->vls[vfi].port_vl_xmit_data = + cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_data = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_pkts = + cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, + idx_from_vl(vl))); + + /* + * Convert PortVlXmitWait counter from TXE + * cycle times to flit times. + */ + rsp->vls[vfi].port_vl_xmit_wait = + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, + idx_from_vl(vl))); + rsp->vls[vfi].port_vl_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, + idx_from_vl(vl))); + + /* rsp->port_vl_xmit_time_cong is 0 for HFIs */ + /* rsp->port_vl_xmit_wasted_bw ??? */ + /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? 
+ * does this differ from rsp->vls[vfi].port_vl_xmit_wait + */ + /*rsp->vls[vfi].port_vl_mark_fecn = + * cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + * + offset)); + */ + vlinfo++; + vfi++; + } + + a0_datacounters(ppd, rsp, vl_select_mask); + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters_ext *p = (struct ib_pma_portcounters_ext *) + pmp->data; + struct _port_dctrs rsp; + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + memset(&rsp, 0, sizeof(rsp)); + pma_get_opa_port_dctrs(ibdev, &rsp); + + p->port_xmit_data = rsp.port_xmit_data; + p->port_rcv_data = rsp.port_rcv_data; + p->port_xmit_packets = rsp.port_xmit_pkts; + p->port_rcv_packets = rsp.port_rcv_pkts; + p->port_unicast_xmit_packets = 0; + p->port_unicast_rcv_packets = 0; + p->port_multicast_xmit_packets = rsp.port_multicast_xmit_pkts; + p->port_multicast_rcv_packets = rsp.port_multicast_rcv_pkts; + +bail: + return reply((struct ib_mad_hdr *)pmp); +} + +static void pma_get_opa_port_ectrs(struct ib_device *ibdev, + struct _port_ectrs *rsp, u8 port) +{ + u64 tmp, tmp2; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL); + if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { + /* overflow/wrapped */ + rsp->link_error_recovery = cpu_to_be32(~0); + } else { + rsp->link_error_recovery = cpu_to_be32(tmp2); + } + + rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL)); + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_switch_relay_errors = 0; + rsp->port_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL)); + rsp->port_xmit_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->local_link_integrity_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL)); + rsp->excessive_buffer_overruns = + cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); +} + +static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + size_t response_data_size; + struct _port_ectrs *rsp; + u8 port_num; + struct opa_port_error_counters64_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 num_ports; + u8 num_pslm; + u8 num_vls; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct _vls_ectrs *vlinfo; + unsigned long vl; + u64 port_mask, tmp; + u32 vl_select_mask; + int vfi; + + req = (struct opa_port_error_counters64_msg *)pmp->data; + + num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + num_vls = hweight32(be32_to_cpu(req->vl_select_mask)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + response_data_size = 
sizeof(struct opa_port_error_counters64_msg) + + num_vls * sizeof(struct _vls_ectrs); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + /* + * The bit set in the mask needs to be consistent with the + * port the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask) * 8); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + rsp = &req->port[0]; + + ibp = to_iport(ibdev, port_num); + ppd = ppd_from_ibp(ibp); + + memset(rsp, 0, sizeof(*rsp)); + rsp->port_number = port_num; + + pma_get_opa_port_ectrs(ibdev, rsp, port_num); + + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + rsp->fm_config_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + + rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); + vlinfo = &rsp->vls[0]; + vfi = 0; + vl_select_mask = be32_to_cpu(req->vl_select_mask); + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(req->vl_select_mask)) { + memset(vlinfo, 0, sizeof(*vlinfo)); + rsp->vls[vfi].port_vl_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl))); + vlinfo += 1; + vfi++; + } + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_ib_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct _port_ectrs rsp; + u64 temp_link_overrun_errors; + u64 temp_64; + u32 temp_32; + + memset(&rsp, 0, sizeof(rsp)); + pma_get_opa_port_ectrs(ibdev, &rsp, port); + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + p->symbol_error_counter = 0; /* N/A for OPA */ + + temp_32 = be32_to_cpu(rsp.link_error_recovery); + if (temp_32 > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = (u8)temp_32; + + temp_32 = be32_to_cpu(rsp.link_downed); + if (temp_32 > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)temp_32; + + temp_64 = be64_to_cpu(rsp.port_rcv_errors); + if (temp_64 > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_rcv_remote_physical_errors); + if (temp_64 > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_rcv_switch_relay_errors); + p->port_rcv_switch_relay_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_xmit_discards); + if (temp_64 > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_xmit_constraint_errors); + if (temp_64 > 0xFFUL) + p->port_xmit_constraint_errors = 0xFF; + else + p->port_xmit_constraint_errors = (u8)temp_64; + + temp_64 = be64_to_cpu(rsp.port_rcv_constraint_errors); + if (temp_64 > 0xFFUL) + p->port_rcv_constraint_errors = 0xFFUL; + else + 
p->port_rcv_constraint_errors = (u8)temp_64; + + /* LocalLink: 7:4, BufferOverrun: 3:0 */ + temp_64 = be64_to_cpu(rsp.local_link_integrity_errors); + if (temp_64 > 0xFUL) + temp_64 = 0xFUL; + + temp_link_overrun_errors = temp_64 << 4; + + temp_64 = be64_to_cpu(rsp.excessive_buffer_overruns); + if (temp_64 > 0xFUL) + temp_64 = 0xFUL; + temp_link_overrun_errors |= temp_64; + + p->link_overrun_errors = (u8)temp_link_overrun_errors; + + p->vl15_dropped = 0; /* N/A for OPA */ + +bail: + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + size_t response_data_size; + struct _port_ei *rsp; + struct opa_port_error_info_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u64 port_mask; + u32 num_ports; + u8 port_num; + u8 num_pslm; + u64 reg; + + req = (struct opa_port_error_info_msg *)pmp->data; + rsp = &req->port[0]; + + num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + + memset(rsp, 0, sizeof(*rsp)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* Sanity check */ + response_data_size = sizeof(struct opa_port_error_info_msg); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the port + * the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask) * 8); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* PortRcvErrorInfo */ + rsp->port_rcv_ei.status_and_code = + dd->err_info_rcvport.status_and_code; + memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1, + &dd->err_info_rcvport.packet_flit1, sizeof(u64)); + memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2, + &dd->err_info_rcvport.packet_flit2, sizeof(u64)); + + /* ExcessiverBufferOverrunInfo */ + reg = read_csr(dd, RCV_ERR_INFO); + if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) { + /* + * if the RcvExcessBufferOverrun bit is set, save SC of + * first pkt that encountered an excess buffer overrun + */ + u8 tmp = (u8)reg; + + tmp &= RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK; + tmp <<= 2; + rsp->excessive_buffer_overrun_ei.status_and_sc = tmp; + /* set the status bit */ + rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80; + } + + rsp->port_xmit_constraint_ei.status = + dd->err_info_xmit_constraint.status; + rsp->port_xmit_constraint_ei.pkey = + cpu_to_be16(dd->err_info_xmit_constraint.pkey); + rsp->port_xmit_constraint_ei.slid = + cpu_to_be32(dd->err_info_xmit_constraint.slid); + + rsp->port_rcv_constraint_ei.status = + dd->err_info_rcv_constraint.status; + rsp->port_rcv_constraint_ei.pkey = + cpu_to_be16(dd->err_info_rcv_constraint.pkey); + rsp->port_rcv_constraint_ei.slid = + cpu_to_be32(dd->err_info_rcv_constraint.slid); + + /* UncorrectableErrorInfo */ + rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable; + + /* FMConfigErrorInfo */ + rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig; + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 
*resp_len) +{ + struct opa_clear_port_status *req = + (struct opa_clear_port_status *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + u64 portn = be64_to_cpu(req->port_select_mask[3]); + u32 counter_select = be32_to_cpu(req->counter_select_mask); + u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */ + unsigned long vl; + + if ((nports != 1) || (portn != 1 << port)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + /* + * only counters returned by pma_get_opa_portstatus() are + * handled, so when pma_get_opa_portstatus() gets a fix, + * the corresponding change should be made here as well. + */ + + if (counter_select & CS_PORT_XMIT_DATA) + write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_DATA) + write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_PKTS) + write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_PKTS) + write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_MCAST_XMIT_PKTS) + write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_MCAST_RCV_PKTS) + write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_WAIT) { + write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0); + ppd->port_vl_xmit_wait_last[C_VL_COUNT] = 0; + ppd->vl_xmit_flit_cnt[C_VL_COUNT] = 0; + } + /* ignore cs_sw_portCongestion for HFIs */ + + if (counter_select & CS_PORT_RCV_FECN) + write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_BECN) + write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0); + + /* ignore cs_port_xmit_time_cong for HFIs */ + /* ignore cs_port_xmit_wasted_bw for now */ + /* ignore cs_port_xmit_wait_data for now */ + if (counter_select & CS_PORT_RCV_BUBBLE) + write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0); + + /* Only applicable for switch */ + /* if (counter_select & CS_PORT_MARK_FECN) + * write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0); + */ + + if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS) + write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0); + + /* ignore cs_port_rcv_switch_relay_errors for HFIs */ + if (counter_select & CS_PORT_XMIT_DISCARDS) + write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS) + write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS) + write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) + write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); + + if (counter_select & CS_LINK_ERROR_RECOVERY) { + write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL, 0); + } + + if (counter_select & CS_PORT_RCV_ERRORS) + write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) { + write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0); + dd->rcv_ovfl_cnt = 0; + } + + if (counter_select & CS_FM_CONFIG_ERRORS) + write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_LINK_DOWNED) + write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0); + + if 
(counter_select & CS_UNCORRECTABLE_ERRORS) + write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0); + + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(vl_select_mask)) { + if (counter_select & CS_PORT_XMIT_DATA) + write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_DATA) + write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_XMIT_PKTS) + write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_PKTS) + write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_XMIT_WAIT) { + write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0); + ppd->port_vl_xmit_wait_last[idx_from_vl(vl)] = 0; + ppd->vl_xmit_flit_cnt[idx_from_vl(vl)] = 0; + } + + /* sw_port_vl_congestion is 0 for HFIs */ + if (counter_select & CS_PORT_RCV_FECN) + write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_BECN) + write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0); + + /* port_vl_xmit_time_cong is 0 for HFIs */ + /* port_vl_xmit_wasted_bw ??? */ + /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */ + if (counter_select & CS_PORT_RCV_BUBBLE) + write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0); + + /* if (counter_select & CS_PORT_MARK_FECN) + * write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0); + */ + if (counter_select & C_SW_XMIT_DSCD_VL) + write_port_cntr(ppd, C_SW_XMIT_DSCD_VL, + idx_from_vl(vl), 0); + } + + if (resp_len) + *resp_len += sizeof(*req); + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct _port_ei *rsp; + struct opa_port_error_info_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u64 port_mask; + u32 num_ports; + u8 port_num; + u8 num_pslm; + u32 error_info_select; + + req = (struct opa_port_error_info_msg *)pmp->data; + rsp = &req->port[0]; + + num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + + memset(rsp, 0, sizeof(*rsp)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the port + * the request came in on. 
+ */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask) * 8); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + error_info_select = be32_to_cpu(req->error_info_select_mask); + + /* PortRcvErrorInfo */ + if (error_info_select & ES_PORT_RCV_ERROR_INFO) + /* turn off status bit */ + dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK; + + /* ExcessiverBufferOverrunInfo */ + if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO) + /* + * status bit is essentially kept in the h/w - bit 5 of + * RCV_ERR_INFO + */ + write_csr(dd, RCV_ERR_INFO, + RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); + + if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO) + dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK; + + if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO) + dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK; + + /* UncorrectableErrorInfo */ + if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO) + /* turn off status bit */ + dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK; + + /* FMConfigErrorInfo */ + if (error_info_select & ES_FM_CONFIG_ERROR_INFO) + /* turn off status bit */ + dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK; + + if (resp_len) + *resp_len += sizeof(*req); + + return reply((struct ib_mad_hdr *)pmp); +} + +struct opa_congestion_info_attr { + __be16 congestion_info; + u8 control_table_cap; /* Multiple of 64 entry unit CCTs */ + u8 congestion_log_length; +} __packed; + +static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct opa_congestion_info_attr *p = + (struct opa_congestion_info_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + p->congestion_info = 0; + p->control_table_cap = ppd->cc_max_table_entries; + p->congestion_log_length = OPA_CONG_LOG_ELEMS; + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u8 port, u32 *resp_len, u32 max_len) +{ + int i; + struct opa_congestion_setting_attr *p = + (struct opa_congestion_setting_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_congestion_setting_entry_shadow *entries; + struct cc_state *cc_state; + + if (smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (!cc_state) { + rcu_read_unlock(); + return reply((struct ib_mad_hdr *)smp); + } + + entries = cc_state->cong_setting.entries; + p->port_control = cpu_to_be16(cc_state->cong_setting.port_control); + p->control_map = cpu_to_be32(cc_state->cong_setting.control_map); + for (i = 0; i < OPA_MAX_SLS; i++) { + p->entries[i].ccti_increase = entries[i].ccti_increase; + p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer); + p->entries[i].trigger_threshold = + entries[i].trigger_threshold; + p->entries[i].ccti_min = entries[i].ccti_min; + } + + rcu_read_unlock(); + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct 
ib_mad_hdr *)smp); +} + +/* + * Apply congestion control information stored in the ppd to the + * active structure. + */ +static void apply_cc_state(struct hfi1_pportdata *ppd) +{ + struct cc_state *old_cc_state, *new_cc_state; + + new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL); + if (!new_cc_state) + return; + + /* + * Hold the lock for updating *and* to prevent ppd information + * from changing during the update. + */ + spin_lock(&ppd->cc_state_lock); + + old_cc_state = get_cc_state_protected(ppd); + if (!old_cc_state) { + /* never active, or shutting down */ + spin_unlock(&ppd->cc_state_lock); + kfree(new_cc_state); + return; + } + + *new_cc_state = *old_cc_state; + + if (ppd->total_cct_entry) + new_cc_state->cct.ccti_limit = ppd->total_cct_entry - 1; + else + new_cc_state->cct.ccti_limit = 0; + + memcpy(new_cc_state->cct.entries, ppd->ccti_entries, + ppd->total_cct_entry * sizeof(struct ib_cc_table_entry)); + + new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED; + new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map; + memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries, + OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry)); + + rcu_assign_pointer(ppd->cc_state, new_cc_state); + + spin_unlock(&ppd->cc_state_lock); + + kfree_rcu(old_cc_state, rcu); +} + +static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct opa_congestion_setting_attr *p = + (struct opa_congestion_setting_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_congestion_setting_entry_shadow *entries; + int i; + + if (smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* + * Save details from packet into the ppd. Hold the cc_state_lock so + * our information is consistent with anyone trying to apply the state. 
+ */ + spin_lock(&ppd->cc_state_lock); + ppd->cc_sl_control_map = be32_to_cpu(p->control_map); + + entries = ppd->congestion_entries; + for (i = 0; i < OPA_MAX_SLS; i++) { + entries[i].ccti_increase = p->entries[i].ccti_increase; + entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer); + entries[i].trigger_threshold = + p->entries[i].trigger_threshold; + entries[i].ccti_min = p->entries[i].ccti_min; + } + spin_unlock(&ppd->cc_state_lock); + + /* now apply the information */ + apply_cc_state(ppd); + + return __subn_get_opa_cong_setting(smp, am, data, ibdev, port, + resp_len, max_len); +} + +static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u8 port, u32 *resp_len, u32 max_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data; + u64 ts; + int i; + + if (am || smp_length_check(sizeof(*cong_log), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + spin_lock_irq(&ppd->cc_log_lock); + + cong_log->log_type = OPA_CC_LOG_TYPE_HFI; + cong_log->congestion_flags = 0; + cong_log->threshold_event_counter = + cpu_to_be16(ppd->threshold_event_counter); + memcpy(cong_log->threshold_cong_event_map, + ppd->threshold_cong_event_map, + sizeof(cong_log->threshold_cong_event_map)); + /* keep timestamp in units of 1.024 usec */ + ts = ktime_get_ns() / 1024; + cong_log->current_time_stamp = cpu_to_be32(ts); + for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) { + struct opa_hfi1_cong_log_event_internal *cce = + &ppd->cc_events[ppd->cc_mad_idx++]; + if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS) + ppd->cc_mad_idx = 0; + /* + * Entries which are older than twice the time + * required to wrap the counter are supposed to + * be zeroed (CA10-49 IBTA, release 1.2.1, V1). + */ + if ((ts - cce->timestamp) / 2 > U32_MAX) + continue; + memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3); + memcpy(cong_log->events[i].remote_qp_number_cn_entry, + &cce->rqpn, 3); + cong_log->events[i].sl_svc_type_cn_entry = + ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7); + cong_log->events[i].remote_lid_cn_entry = + cpu_to_be32(cce->rlid); + cong_log->events[i].timestamp_cn_entry = + cpu_to_be32(cce->timestamp); + } + + /* + * Reset threshold_cong_event_map, and threshold_event_counter + * to 0 when log is read. 
+ */ + memset(ppd->threshold_cong_event_map, 0x0, + sizeof(ppd->threshold_cong_event_map)); + ppd->threshold_event_counter = 0; + + spin_unlock_irq(&ppd->cc_log_lock); + + if (resp_len) + *resp_len += sizeof(struct opa_hfi1_cong_log); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct ib_cc_table_attr *cc_table_attr = + (struct ib_cc_table_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 start_block = OPA_AM_START_BLK(am); + u32 n_blocks = OPA_AM_NBLK(am); + struct ib_cc_table_entry_shadow *entries; + int i, j; + u32 sentry, eentry; + struct cc_state *cc_state; + u32 size = sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1); + + /* sanity check n_blocks, start_block */ + if (n_blocks == 0 || smp_length_check(size, max_len) || + start_block + n_blocks > ppd->cc_max_table_entries) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (!cc_state) { + rcu_read_unlock(); + return reply((struct ib_mad_hdr *)smp); + } + + sentry = start_block * IB_CCT_ENTRIES; + eentry = sentry + (IB_CCT_ENTRIES * n_blocks); + + cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit); + + entries = cc_state->cct.entries; + + /* return n_blocks, though the last block may not be full */ + for (j = 0, i = sentry; i < eentry; j++, i++) + cc_table_attr->ccti_entries[j].entry = + cpu_to_be16(entries[i].entry); + + rcu_read_unlock(); + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 start_block = OPA_AM_START_BLK(am); + u32 n_blocks = OPA_AM_NBLK(am); + struct ib_cc_table_entry_shadow *entries; + int i, j; + u32 sentry, eentry; + u16 ccti_limit; + u32 size = sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1); + + /* sanity check n_blocks, start_block */ + if (n_blocks == 0 || smp_length_check(size, max_len) || + start_block + n_blocks > ppd->cc_max_table_entries) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + sentry = start_block * IB_CCT_ENTRIES; + eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) + + (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1; + + /* sanity check ccti_limit */ + ccti_limit = be16_to_cpu(p->ccti_limit); + if (ccti_limit + 1 > eentry) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* + * Save details from packet into the ppd. Hold the cc_state_lock so + * our information is consistent with anyone trying to apply the state. 
+ */ + spin_lock(&ppd->cc_state_lock); + ppd->total_cct_entry = ccti_limit + 1; + entries = ppd->ccti_entries; + for (j = 0, i = sentry; i < eentry; j++, i++) + entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry); + spin_unlock(&ppd->cc_state_lock); + + /* now apply the information */ + apply_cc_state(ppd); + + return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len, + max_len); +} + +struct opa_led_info { + __be32 rsvd_led_mask; + __be32 rsvd; +}; + +#define OPA_LED_SHIFT 31 +#define OPA_LED_MASK BIT(OPA_LED_SHIFT) + +static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd = dd->pport; + struct opa_led_info *p = (struct opa_led_info *)data; + u32 nport = OPA_AM_NPORT(am); + u32 is_beaconing_active; + + if (nport != 1 || smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active); + p->rsvd_led_mask = cpu_to_be32(is_beaconing_active << OPA_LED_SHIFT); + + if (resp_len) + *resp_len += sizeof(struct opa_led_info); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct opa_led_info *p = (struct opa_led_info *)data; + u32 nport = OPA_AM_NPORT(am); + int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK); + + if (nport != 1 || smp_length_check(sizeof(*p), max_len)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (on) + hfi1_start_led_override(dd->pport, 2000, 1500); + else + shutdown_led_override(dd->pport); + + return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len, + max_len); +} + +static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + int ret; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + switch (attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_NODE_INFO: + ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_PORT_INFO: + ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_PKEY_TABLE: + ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SL_TO_SC_MAP: + ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_SL_MAP: + ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLT_MAP: + ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: + ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_PORT_STATE_INFO: + ret = __subn_get_opa_psi(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case 
OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: + ret = __subn_get_opa_bct(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_CABLE_INFO: + ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_CONGESTION_INFO: + ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING: + ret = __subn_get_opa_cong_setting(smp, am, data, ibdev, + port, resp_len, max_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_LOG: + ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev, + port, resp_len, max_len); + break; + case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE: + ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_LED_INFO: + ret = __subn_get_opa_led_info(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_SM_INFO: + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + if (ibp->rvp.port_cap_flags & IB_PORT_SM) + return IB_MAD_RESULT_SUCCESS; + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + return ret; +} + +static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, u8 port, + u32 *resp_len, u32 max_len) +{ + int ret; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + switch (attr_id) { + case IB_SMP_ATTR_PORT_INFO: + ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_PKEY_TABLE: + ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SL_TO_SC_MAP: + ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_SL_MAP: + ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLT_MAP: + ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: + ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_PORT_STATE_INFO: + ret = __subn_set_opa_psi(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: + ret = __subn_set_opa_bct(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING: + ret = __subn_set_opa_cong_setting(smp, am, data, ibdev, + port, resp_len, max_len); + break; + case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE: + ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_LED_INFO: + ret = __subn_set_opa_led_info(smp, am, data, ibdev, port, + resp_len, max_len); + break; + case IB_SMP_ATTR_SM_INFO: + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + if (ibp->rvp.port_cap_flags & IB_PORT_SM) + return IB_MAD_RESULT_SUCCESS; + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + return ret; +} + +static inline void 
set_aggr_error(struct opa_aggregate *ag) +{ + ag->err_reqlength |= cpu_to_be16(0x8000); +} + +static int subn_get_opa_aggregate(struct opa_smp *smp, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int i; + u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff; + u8 *next_smp = opa_get_smp_data(smp); + + if (num_attr < 1 || num_attr > 117) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < num_attr; i++) { + struct opa_aggregate *agg; + size_t agg_data_len; + size_t agg_size; + u32 am; + + agg = (struct opa_aggregate *)next_smp; + agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8; + agg_size = sizeof(*agg) + agg_data_len; + am = be32_to_cpu(agg->attr_mod); + + *resp_len += agg_size; + + if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* zero the payload for this segment */ + memset(next_smp + sizeof(*agg), 0, agg_data_len); + + (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data, + ibdev, port, NULL, (u32)agg_data_len); + + if (smp->status & IB_SMP_INVALID_FIELD) + break; + if (smp->status & ~IB_SMP_DIRECTION) { + set_aggr_error(agg); + return reply((struct ib_mad_hdr *)smp); + } + next_smp += agg_size; + } + + return reply((struct ib_mad_hdr *)smp); +} + +static int subn_set_opa_aggregate(struct opa_smp *smp, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int i; + u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff; + u8 *next_smp = opa_get_smp_data(smp); + + if (num_attr < 1 || num_attr > 117) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < num_attr; i++) { + struct opa_aggregate *agg; + size_t agg_data_len; + size_t agg_size; + u32 am; + + agg = (struct opa_aggregate *)next_smp; + agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8; + agg_size = sizeof(*agg) + agg_data_len; + am = be32_to_cpu(agg->attr_mod); + + *resp_len += agg_size; + + if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data, + ibdev, port, NULL, (u32)agg_data_len); + if (smp->status & IB_SMP_INVALID_FIELD) + break; + if (smp->status & ~IB_SMP_DIRECTION) { + set_aggr_error(agg); + return reply((struct ib_mad_hdr *)smp); + } + next_smp += agg_size; + } + + return reply((struct ib_mad_hdr *)smp); +} + +/* + * OPAv1 specifies that, on the transition to link up, these counters + * are cleared: + * PortRcvErrors [*] + * LinkErrorRecovery + * LocalLinkIntegrityErrors + * ExcessiveBufferOverruns [*] + * + * [*] Error info associated with these counters is retained, but the + * error info status is reset to 0. 
+ */ +void clear_linkup_counters(struct hfi1_devdata *dd) +{ + /* PortRcvErrors */ + write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0); + dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK; + /* LinkErrorRecovery */ + write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0); + /* LocalLinkIntegrityErrors */ + write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); + /* ExcessiveBufferOverruns */ + write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0); + dd->rcv_ovfl_cnt = 0; + dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK; +} + +static int is_full_mgmt_pkey_in_table(struct hfi1_ibport *ibp) +{ + unsigned int i; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) + if (ppd->pkeys[i] == FULL_MGMT_P_KEY) + return 1; + + return 0; +} + +/* + * is_local_mad() returns 1 if 'mad' is sent from, and destined to the + * local node, 0 otherwise. + */ +static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad, + const struct ib_wc *in_wc) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + const struct opa_smp *smp = (const struct opa_smp *)mad; + + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + return (smp->hop_cnt == 0 && + smp->route.dr.dr_slid == OPA_LID_PERMISSIVE && + smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE); + } + + return (in_wc->slid == ppd->lid); +} + +/* + * opa_local_smp_check() should only be called on MADs for which + * is_local_mad() returns true. It applies the SMP checks that are + * specific to SMPs which are sent from, and destined to this node. + * opa_local_smp_check() returns 0 if the SMP passes its checks, 1 + * otherwise. + * + * SMPs which arrive from other nodes are instead checked by + * opa_smp_check(). + */ +static int opa_local_smp_check(struct hfi1_ibport *ibp, + const struct ib_wc *in_wc) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 pkey; + + if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys)) + return 1; + + pkey = ppd->pkeys[in_wc->pkey_index]; + /* + * We need to do the "node-local" checks specified in OPAv1, + * rev 0.90, section 9.10.26, which are: + * - pkey is 0x7fff, or 0xffff + * - Source QPN == 0 || Destination QPN == 0 + * - the MAD header's management class is either + * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or + * IB_MGMT_CLASS_SUBN_LID_ROUTED + * - SLID != 0 + * + * However, we know (and so don't need to check again) that, + * for local SMPs, the MAD stack passes MADs with: + * - Source QPN of 0 + * - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or + * our own port's lid + * + */ + if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) + return 0; + ingress_pkey_table_fail(ppd, pkey, in_wc->slid); + return 1; +} + +/** + * hfi1_pkey_validation_pma - It validates PKEYs for incoming PMA MAD packets. + * @ibp: IB port data + * @in_mad: MAD packet with header and data + * @in_wc: Work completion data such as source LID, port number, etc. + * + * These are all the possible logic rules for validating a pkey: + * + * a) If pkey neither FULL_MGMT_P_KEY nor LIM_MGMT_P_KEY, + * and NOT self-originated packet: + * Drop MAD packet as it should always be part of the + * management partition unless it's a self-originated packet. 
+ * + * b) If pkey_index -> FULL_MGMT_P_KEY, and LIM_MGMT_P_KEY in pkey table: + * The packet is coming from a management node and the receiving node + * is also a management node, so it is safe for the packet to go through. + * + * c) If pkey_index -> FULL_MGMT_P_KEY, and LIM_MGMT_P_KEY is NOT in pkey table: + * Drop the packet as LIM_MGMT_P_KEY should always be in the pkey table. + * It could be an FM misconfiguration. + * + * d) If pkey_index -> LIM_MGMT_P_KEY and FULL_MGMT_P_KEY is NOT in pkey table: + * It is safe for the packet to go through since a non-management node is + * talking to another non-management node. + * + * e) If pkey_index -> LIM_MGMT_P_KEY and FULL_MGMT_P_KEY in pkey table: + * Drop the packet because a non-management node is talking to a + * management node, and it could be an attack. + * + * For the implementation, these rules can be simplied to only checking + * for (a) and (e). There's no need to check for rule (b) as + * the packet doesn't need to be dropped. Rule (c) is not possible in + * the driver as LIM_MGMT_P_KEY is always in the pkey table. + * + * Return: + * 0 - pkey is okay, -EINVAL it's a bad pkey + */ +static int hfi1_pkey_validation_pma(struct hfi1_ibport *ibp, + const struct opa_mad *in_mad, + const struct ib_wc *in_wc) +{ + u16 pkey_value = hfi1_lookup_pkey_value(ibp, in_wc->pkey_index); + + /* Rule (a) from above */ + if (!is_local_mad(ibp, in_mad, in_wc) && + pkey_value != LIM_MGMT_P_KEY && + pkey_value != FULL_MGMT_P_KEY) + return -EINVAL; + + /* Rule (e) from above */ + if (pkey_value == LIM_MGMT_P_KEY && + is_full_mgmt_pkey_in_table(ibp)) + return -EINVAL; + + return 0; +} + +static int process_subn_opa(struct ib_device *ibdev, int mad_flags, + u8 port, const struct opa_mad *in_mad, + struct opa_mad *out_mad, + u32 *resp_len) +{ + struct opa_smp *smp = (struct opa_smp *)out_mad; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *data; + u32 am, data_size; + __be16 attr_id; + int ret; + + *out_mad = *in_mad; + data = opa_get_smp_data(smp); + data_size = (u32)opa_get_smp_data_size(smp); + + am = be32_to_cpu(smp->attr_mod); + attr_id = smp->attr_id; + if (smp->class_version != OPA_SM_CLASS_VERSION) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)smp); + return ret; + } + ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey, + smp->route.dr.dr_slid, smp->route.dr.return_path, + smp->hop_cnt); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. 
+ */ + if (attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void)check_mkey(to_iport(ibdev, port_num), + (struct ib_mad_hdr *)smp, 0, + smp->mkey, smp->route.dr.dr_slid, + smp->route.dr.return_path, + smp->hop_cnt); + ret = IB_MAD_RESULT_FAILURE; + return ret; + } + + *resp_len = opa_get_smp_header_size(smp); + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (attr_id) { + default: + clear_opa_smp_data(smp); + ret = subn_get_opa_sma(attr_id, smp, am, data, + ibdev, port, resp_len, + data_size); + break; + case OPA_ATTRIB_ID_AGGREGATE: + ret = subn_get_opa_aggregate(smp, ibdev, port, + resp_len); + break; + } + break; + case IB_MGMT_METHOD_SET: + switch (attr_id) { + default: + ret = subn_set_opa_sma(attr_id, smp, am, data, + ibdev, port, resp_len, + data_size); + break; + case OPA_ATTRIB_ID_AGGREGATE: + ret = subn_set_opa_aggregate(smp, ibdev, port, + resp_len); + break; + } + break; + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + break; + case IB_MGMT_METHOD_TRAP_REPRESS: + subn_handle_opa_trap_repress(ibp, smp); + /* Always successful */ + ret = IB_MAD_RESULT_SUCCESS; + break; + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + + return ret; +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u8 port, const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)smp); + return ret; + } + + ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, + smp->mkey, (__force __be32)smp->dr_slid, + smp->return_path, smp->hop_cnt); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. 
+ */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void)check_mkey(to_iport(ibdev, port_num), + (struct ib_mad_hdr *)smp, 0, + smp->mkey, + (__force __be32)smp->dr_slid, + smp->return_path, smp->hop_cnt); + ret = IB_MAD_RESULT_FAILURE; + return ret; + } + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_INFO: + ret = subn_get_nodeinfo(smp, ibdev, port); + break; + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + break; + } + + return ret; +} + +static int process_perf(struct ib_device *ibdev, u8 port, + const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; + struct ib_class_port_info *cpi = (struct ib_class_port_info *) + &pmp->data; + int ret = IB_MAD_RESULT_FAILURE; + + *out_mad = *in_mad; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)pmp); + return ret; + } + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_PORT_COUNTERS: + ret = pma_get_ib_portcounters(pmp, ibdev, port); + break; + case IB_PMA_PORT_COUNTERS_EXT: + ret = pma_get_ib_portcounters_ext(pmp, ibdev, port); + break; + case IB_PMA_CLASS_PORT_INFO: + cpi->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + ret = reply((struct ib_mad_hdr *)pmp); + break; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + break; + + case IB_MGMT_METHOD_SET: + if (pmp->mad_hdr.attr_id) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + } + break; + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. 
+ */ + ret = IB_MAD_RESULT_SUCCESS; + break; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + + return ret; +} + +static int process_perf_opa(struct ib_device *ibdev, u8 port, + const struct opa_mad *in_mad, + struct opa_mad *out_mad, u32 *resp_len) +{ + struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad; + int ret; + + *out_mad = *in_mad; + + if (pmp->mad_hdr.class_version != OPA_SM_CLASS_VERSION) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + return reply((struct ib_mad_hdr *)pmp); + } + + *resp_len = sizeof(pmp->mad_hdr); + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len); + break; + case OPA_PM_ATTRIB_ID_PORT_STATUS: + ret = pma_get_opa_portstatus(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS: + ret = pma_get_opa_datacounters(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS: + ret = pma_get_opa_porterrors(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_ERROR_INFO: + ret = pma_get_opa_errorinfo(pmp, ibdev, port, + resp_len); + break; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + break; + + case IB_MGMT_METHOD_SET: + switch (pmp->mad_hdr.attr_id) { + case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS: + ret = pma_set_opa_portstatus(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_ERROR_INFO: + ret = pma_set_opa_errorinfo(pmp, ibdev, port, + resp_len); + break; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + break; + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. 
+ */ + ret = IB_MAD_RESULT_SUCCESS; + break; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + + return ret; +} + +static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, + u8 port, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct opa_mad *in_mad, + struct opa_mad *out_mad, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + int ret; + int pkey_idx; + u32 resp_len = 0; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); + if (pkey_idx < 0) { + pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n", + hfi1_get_pkey(ibp, 1)); + pkey_idx = 1; + } + *out_mad_pkey_index = (u16)pkey_idx; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + if (is_local_mad(ibp, in_mad, in_wc)) { + ret = opa_local_smp_check(ibp, in_wc); + if (ret) + return IB_MAD_RESULT_FAILURE; + } + ret = process_subn_opa(ibdev, mad_flags, port, in_mad, + out_mad, &resp_len); + goto bail; + case IB_MGMT_CLASS_PERF_MGMT: + ret = hfi1_pkey_validation_pma(ibp, in_mad, in_wc); + if (ret) + return IB_MAD_RESULT_FAILURE; + + ret = process_perf_opa(ibdev, port, in_mad, out_mad, &resp_len); + goto bail; + + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + if (ret & IB_MAD_RESULT_REPLY) + *out_mad_size = round_up(resp_len, 8); + else if (ret & IB_MAD_RESULT_SUCCESS) + *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh); + + return ret; +} + +static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + int ret; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad); + break; + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port, in_mad, out_mad); + break; + default: + ret = IB_MAD_RESULT_SUCCESS; + break; + } + + return ret; +} + +/** + * hfi1_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. 
+ */ +int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, size_t in_mad_size, + struct ib_mad_hdr *out_mad, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + switch (in_mad->base_version) { + case OPA_MGMT_BASE_VERSION: + if (unlikely(in_mad_size != sizeof(struct opa_mad))) { + dev_err(ibdev->dev.parent, "invalid in_mad_size\n"); + return IB_MAD_RESULT_FAILURE; + } + return hfi1_process_opa_mad(ibdev, mad_flags, port, + in_wc, in_grh, + (struct opa_mad *)in_mad, + (struct opa_mad *)out_mad, + out_mad_size, + out_mad_pkey_index); + case IB_MGMT_BASE_VERSION: + return hfi1_process_ib_mad(ibdev, mad_flags, port, + in_wc, in_grh, + (const struct ib_mad *)in_mad, + (struct ib_mad *)out_mad); + default: + break; + } + + return IB_MAD_RESULT_FAILURE; +} diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h new file mode 100644 index 0000000..2f48e69 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -0,0 +1,478 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#ifndef _HFI1_MAD_H +#define _HFI1_MAD_H + +#include +#include +#include +#include "opa_compat.h" + +/* + * OPA Traps + */ +#define OPA_TRAP_GID_NOW_IN_SERVICE cpu_to_be16(64) +#define OPA_TRAP_GID_OUT_OF_SERVICE cpu_to_be16(65) +#define OPA_TRAP_ADD_MULTICAST_GROUP cpu_to_be16(66) +#define OPA_TRAL_DEL_MULTICAST_GROUP cpu_to_be16(67) +#define OPA_TRAP_UNPATH cpu_to_be16(68) +#define OPA_TRAP_REPATH cpu_to_be16(69) +#define OPA_TRAP_PORT_CHANGE_STATE cpu_to_be16(128) +#define OPA_TRAP_LINK_INTEGRITY cpu_to_be16(129) +#define OPA_TRAP_EXCESSIVE_BUFFER_OVERRUN cpu_to_be16(130) +#define OPA_TRAP_FLOW_WATCHDOG cpu_to_be16(131) +#define OPA_TRAP_CHANGE_CAPABILITY cpu_to_be16(144) +#define OPA_TRAP_CHANGE_SYSGUID cpu_to_be16(145) +#define OPA_TRAP_BAD_M_KEY cpu_to_be16(256) +#define OPA_TRAP_BAD_P_KEY cpu_to_be16(257) +#define OPA_TRAP_BAD_Q_KEY cpu_to_be16(258) +#define OPA_TRAP_SWITCH_BAD_PKEY cpu_to_be16(259) +#define OPA_SMA_TRAP_DATA_LINK_WIDTH cpu_to_be16(2048) + +/* + * Generic trap/notice other local changes flags (trap 144). + */ +#define OPA_NOTICE_TRAP_LWDE_CHG 0x08 /* Link Width Downgrade Enable + * changed + */ +#define OPA_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */ +#define OPA_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */ +#define OPA_NOTICE_TRAP_NODE_DESC_CHG 0x01 + +struct opa_mad_notice_attr { + u8 generic_type; + u8 prod_type_msb; + __be16 prod_type_lsb; + __be16 trap_num; + __be16 toggle_count; + __be32 issuer_lid; + __be32 reserved1; + union ib_gid issuer_gid; + + union { + struct { + u8 details[64]; + } raw_data; + + struct { + union ib_gid gid; + } __packed ntc_64_65_66_67; + + struct { + __be32 lid; + } __packed ntc_128; + + struct { + __be32 lid; /* where violation happened */ + u8 port_num; /* where violation happened */ + } __packed ntc_129_130_131; + + struct { + __be32 lid; /* LID where change occurred */ + __be32 new_cap_mask; /* new capability mask */ + __be16 reserved2; + __be16 cap_mask3; + __be16 change_flags; /* low 4 bits only */ + } __packed ntc_144; + + struct { + __be64 new_sys_guid; + __be32 lid; /* lid where sys guid changed */ + } __packed ntc_145; + + struct { + __be32 lid; + __be32 dr_slid; + u8 method; + u8 dr_trunc_hop; + __be16 attr_id; + __be32 attr_mod; + __be64 mkey; + u8 dr_rtn_path[30]; + } __packed ntc_256; + + struct { + __be32 lid1; + __be32 lid2; + __be32 key; + u8 sl; /* SL: high 5 bits */ + u8 reserved3[3]; + union ib_gid gid1; + union ib_gid gid2; + __be32 qp1; /* high 8 bits reserved */ + __be32 qp2; /* high 8 bits reserved */ + } __packed ntc_257_258; + + struct { + __be16 flags; /* low 8 bits reserved */ + __be16 pkey; + __be32 lid1; + __be32 lid2; + u8 sl; /* SL: high 5 bits */ + u8 reserved4[3]; + union ib_gid gid1; + union ib_gid gid2; + __be32 qp1; /* high 8 bits reserved */ + __be32 qp2; /* high 8 bits reserved */ + } __packed ntc_259; + + struct { + __be32 lid; + } __packed ntc_2048; + + }; + u8 class_data[0]; +}; + +#define IB_VLARB_LOWPRI_0_31 1 +#define IB_VLARB_LOWPRI_32_63 2 +#define IB_VLARB_HIGHPRI_0_31 3 +#define IB_VLARB_HIGHPRI_32_63 4 + +#define OPA_MAX_PREEMPT_CAP 32 +#define OPA_VLARB_LOW_ELEMENTS 0 +#define OPA_VLARB_HIGH_ELEMENTS 1 +#define OPA_VLARB_PREEMPT_ELEMENTS 2 +#define OPA_VLARB_PREEMPT_MATRIX 3 + +#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00) +#define LINK_SPEED_25G 1 +#define LINK_SPEED_12_5G 2 +#define LINK_WIDTH_DEFAULT 4 +#define DECIMAL_FACTORING 1000 +/* + * The default link width is multiplied by 1000 + * to get accurate value after division. 
+ */ +#define FACTOR_LINK_WIDTH (LINK_WIDTH_DEFAULT * DECIMAL_FACTORING) + +struct ib_pma_portcounters_cong { + u8 reserved; + u8 reserved1; + __be16 port_check_rate; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved2; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved3; + __be16 vl15_dropped; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_xmit_wait; + __be64 port_adr_events; +} __packed; + +#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008) +#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C) +#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C) + +#define OPA_MAX_PREEMPT_CAP 32 +#define OPA_VLARB_LOW_ELEMENTS 0 +#define OPA_VLARB_HIGH_ELEMENTS 1 +#define OPA_VLARB_PREEMPT_ELEMENTS 2 +#define OPA_VLARB_PREEMPT_MATRIX 3 + +#define HFI1_XMIT_RATE_UNSUPPORTED 0x0 +#define HFI1_XMIT_RATE_PICO 0x7 +/* number of 4nsec cycles equaling 2secs */ +#define HFI1_CONG_TIMER_PSINTERVAL 0x1DCD64EC + +#define IB_CC_SVCTYPE_RC 0x0 +#define IB_CC_SVCTYPE_UC 0x1 +#define IB_CC_SVCTYPE_RD 0x2 +#define IB_CC_SVCTYPE_UD 0x3 + +/* + * There should be an equivalent IB #define for the following, but + * I cannot find it. + */ +#define OPA_CC_LOG_TYPE_HFI 2 + +struct opa_hfi1_cong_log_event_internal { + u32 lqpn; + u32 rqpn; + u8 sl; + u8 svc_type; + u32 rlid; + u64 timestamp; /* wider than 32 bits to detect 32 bit rollover */ +}; + +struct opa_hfi1_cong_log_event { + u8 local_qp_cn_entry[3]; + u8 remote_qp_number_cn_entry[3]; + u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */ + u8 reserved; + __be32 remote_lid_cn_entry; + __be32 timestamp_cn_entry; +} __packed; + +#define OPA_CONG_LOG_ELEMS 96 + +struct opa_hfi1_cong_log { + u8 log_type; + u8 congestion_flags; + __be16 threshold_event_counter; + __be32 current_time_stamp; + u8 threshold_cong_event_map[OPA_MAX_SLS / 8]; + struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS]; +} __packed; + +#define IB_CC_TABLE_CAP_DEFAULT 31 + +/* Port control flags */ +#define IB_CC_CCS_PC_SL_BASED 0x01 + +struct opa_congestion_setting_entry { + u8 ccti_increase; + u8 reserved; + __be16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct opa_congestion_setting_entry_shadow { + u8 ccti_increase; + u8 reserved; + u16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct opa_congestion_setting_attr { + __be32 control_map; + __be16 port_control; + struct opa_congestion_setting_entry entries[OPA_MAX_SLS]; +} __packed; + +struct opa_congestion_setting_attr_shadow { + u32 control_map; + u16 port_control; + struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS]; +} __packed; + +#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1 +#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1 + +/* 64 Congestion Control table entries in a single MAD */ +#define IB_CCT_ENTRIES 64 +#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2) + +struct ib_cc_table_entry { + __be16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_entry_shadow { + u16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_attr { + __be16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry 
ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +struct ib_cc_table_attr_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +#define CC_TABLE_SHADOW_MAX \ + (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES) + +struct cc_table_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX]; +} __packed; + +/* + * struct cc_state combines the (active) per-port congestion control + * table, and the (active) per-SL congestion settings. cc_state data + * may need to be read in code paths that we want to be fast, so it + * is an RCU protected structure. + */ +struct cc_state { + struct rcu_head rcu; + struct cc_table_shadow cct; + struct opa_congestion_setting_attr_shadow cong_setting; +}; + +/* + * OPA BufferControl MAD + */ + +/* attribute modifier macros */ +#define OPA_AM_NPORT_SHIFT 24 +#define OPA_AM_NPORT_MASK 0xff +#define OPA_AM_NPORT_SMASK (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT) +#define OPA_AM_NPORT(am) (((am) >> OPA_AM_NPORT_SHIFT) & \ + OPA_AM_NPORT_MASK) + +#define OPA_AM_NBLK_SHIFT 24 +#define OPA_AM_NBLK_MASK 0xff +#define OPA_AM_NBLK_SMASK (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT) +#define OPA_AM_NBLK(am) (((am) >> OPA_AM_NBLK_SHIFT) & \ + OPA_AM_NBLK_MASK) + +#define OPA_AM_START_BLK_SHIFT 0 +#define OPA_AM_START_BLK_MASK 0xff +#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \ + OPA_AM_START_BLK_SHIFT) +#define OPA_AM_START_BLK(am) (((am) >> OPA_AM_START_BLK_SHIFT) & \ + OPA_AM_START_BLK_MASK) + +#define OPA_AM_PORTNUM_SHIFT 0 +#define OPA_AM_PORTNUM_MASK 0xff +#define OPA_AM_PORTNUM_SMASK (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT) +#define OPA_AM_PORTNUM(am) (((am) >> OPA_AM_PORTNUM_SHIFT) & \ + OPA_AM_PORTNUM_MASK) + +#define OPA_AM_ASYNC_SHIFT 12 +#define OPA_AM_ASYNC_MASK 0x1 +#define OPA_AM_ASYNC_SMASK (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT) +#define OPA_AM_ASYNC(am) (((am) >> OPA_AM_ASYNC_SHIFT) & \ + OPA_AM_ASYNC_MASK) + +#define OPA_AM_START_SM_CFG_SHIFT 9 +#define OPA_AM_START_SM_CFG_MASK 0x1 +#define OPA_AM_START_SM_CFG_SMASK (OPA_AM_START_SM_CFG_MASK << \ + OPA_AM_START_SM_CFG_SHIFT) +#define OPA_AM_START_SM_CFG(am) (((am) >> OPA_AM_START_SM_CFG_SHIFT) \ + & OPA_AM_START_SM_CFG_MASK) + +#define OPA_AM_CI_ADDR_SHIFT 19 +#define OPA_AM_CI_ADDR_MASK 0xfff +#define OPA_AM_CI_ADDR_SMASK (OPA_AM_CI_ADDR_MASK << OPA_CI_ADDR_SHIFT) +#define OPA_AM_CI_ADDR(am) (((am) >> OPA_AM_CI_ADDR_SHIFT) & \ + OPA_AM_CI_ADDR_MASK) + +#define OPA_AM_CI_LEN_SHIFT 13 +#define OPA_AM_CI_LEN_MASK 0x3f +#define OPA_AM_CI_LEN_SMASK (OPA_AM_CI_LEN_MASK << OPA_CI_LEN_SHIFT) +#define OPA_AM_CI_LEN(am) (((am) >> OPA_AM_CI_LEN_SHIFT) & \ + OPA_AM_CI_LEN_MASK) + +/* error info macros */ +#define OPA_EI_STATUS_SMASK 0x80 +#define OPA_EI_CODE_SMASK 0x0f + +struct vl_limit { + __be16 dedicated; + __be16 shared; +}; + +struct buffer_control { + __be16 reserved; + __be16 overall_shared_limit; + struct vl_limit vl[OPA_MAX_VLS]; +}; + +struct sc2vlnt { + u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */ +}; + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. 
+ */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 \ + cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port); +void hfi1_handle_trap_timer(struct timer_list *t); +u16 tx_link_width(u16 link_width); +u64 get_xmit_wait_counters(struct hfi1_pportdata *ppd, u16 link_width, + u16 link_speed, int vl); +/** + * get_link_speed - determine whether 12.5G or 25G speed + * @link_speed: the speed of the active link + * @return: 2 if the link speed is identified as 12.5G, + * or 1 if the link speed is 25G. + * + * The function computes the link speed factor required by + * convert_xmit_counter(). The xmit counter conversion formula + * divides by the link speed (25G / link_speed), so a 25G link + * yields a factor of 1 and a 12.5G link yields a factor of 2. + * Returning these integer factors avoids floating point math on 12.5. + */ +static inline u16 get_link_speed(u16 link_speed) +{ + return (link_speed == 1) ? + LINK_SPEED_12_5G : LINK_SPEED_25G; +} + +/** + * convert_xmit_counter - calculate flit times for given xmit counter + * value + * @xmit_wait_val: current xmit counter value + * @link_width: width of active link + * @link_speed: speed of active link + * @return: the xmit counter value in flit times. + * + * When the link runs at the default width, FACTOR_LINK_WIDTH / link_width + * reduces to DECIMAL_FACTORING and the result is simply + * xmit_wait_val * 2 * link_speed. + */ +static inline u64 convert_xmit_counter(u64 xmit_wait_val, u16 link_width, + u16 link_speed) +{ + return (xmit_wait_val * 2 * (FACTOR_LINK_WIDTH / link_width) + * link_speed) / DECIMAL_FACTORING; +} +#endif /* _HFI1_MAD_H */ diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c new file mode 100644 index 0000000..70aceef --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -0,0 +1,354 @@ +/* + * Copyright(c) 2016 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include + +#include "mmu_rb.h" +#include "trace.h" + +struct mmu_rb_handler { + struct mmu_notifier mn; + struct rb_root_cached root; + void *ops_arg; + spinlock_t lock; /* protect the RB tree */ + struct mmu_rb_ops *ops; + struct mm_struct *mm; + struct list_head lru_list; + struct work_struct del_work; + struct list_head del_list; + struct workqueue_struct *wq; +}; + +static unsigned long mmu_node_start(struct mmu_rb_node *); +static unsigned long mmu_node_last(struct mmu_rb_node *); +static void mmu_notifier_range_start(struct mmu_notifier *, + struct mm_struct *, + unsigned long, unsigned long); +static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, + unsigned long, unsigned long); +static void do_remove(struct mmu_rb_handler *handler, + struct list_head *del_list); +static void handle_remove(struct work_struct *work); + +static const struct mmu_notifier_ops mn_opts = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, + .invalidate_range_start = mmu_notifier_range_start, +}; + +INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last, + mmu_node_start, mmu_node_last, static, __mmu_int_rb); + +static unsigned long mmu_node_start(struct mmu_rb_node *node) +{ + return node->addr & PAGE_MASK; +} + +static unsigned long mmu_node_last(struct mmu_rb_node *node) +{ + return PAGE_ALIGN(node->addr + node->len) - 1; +} + +int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, + struct mmu_rb_ops *ops, + struct workqueue_struct *wq, + struct mmu_rb_handler **handler) +{ + struct mmu_rb_handler *handlr; + int ret; + + handlr = kmalloc(sizeof(*handlr), GFP_KERNEL); + if (!handlr) + return -ENOMEM; + + handlr->root = RB_ROOT_CACHED; + handlr->ops = ops; + handlr->ops_arg = ops_arg; + INIT_HLIST_NODE(&handlr->mn.hlist); + spin_lock_init(&handlr->lock); + handlr->mn.ops = &mn_opts; + handlr->mm = mm; + INIT_WORK(&handlr->del_work, handle_remove); + INIT_LIST_HEAD(&handlr->del_list); + INIT_LIST_HEAD(&handlr->lru_list); + handlr->wq = wq; + + ret = mmu_notifier_register(&handlr->mn, handlr->mm); + if (ret) { + kfree(handlr); + return ret; + } + + *handler = handlr; + return 0; +} + +void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) +{ + struct mmu_rb_node *rbnode; + struct rb_node *node; + unsigned long flags; + struct list_head del_list; + + /* Unregister first so we don't get any more notifications. */ + mmu_notifier_unregister(&handler->mn, handler->mm); + + /* + * Make sure the wq delete handler is finished running. It will not + * be triggered once the mmu notifiers are unregistered above. 
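+ *
+ * The teardown order below matters: the notifier is already
+ * unregistered, so no new deferred removes can be queued; the
+ * flush drains any removes that were already queued; only then is
+ * the interval tree walked directly and the remaining nodes handed
+ * to do_remove().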
+ */ + flush_work(&handler->del_work); + + INIT_LIST_HEAD(&del_list); + + spin_lock_irqsave(&handler->lock, flags); + while ((node = rb_first_cached(&handler->root))) { + rbnode = rb_entry(node, struct mmu_rb_node, node); + rb_erase_cached(node, &handler->root); + /* move from LRU list to delete list */ + list_move(&rbnode->list, &del_list); + } + spin_unlock_irqrestore(&handler->lock, flags); + + do_remove(handler, &del_list); + + kfree(handler); +} + +int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode) +{ + struct mmu_rb_node *node; + unsigned long flags; + int ret = 0; + + trace_hfi1_mmu_rb_insert(mnode->addr, mnode->len); + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, mnode->addr, mnode->len); + if (node) { + ret = -EINVAL; + goto unlock; + } + __mmu_int_rb_insert(mnode, &handler->root); + list_add(&mnode->list, &handler->lru_list); + + ret = handler->ops->insert(handler->ops_arg, mnode); + if (ret) { + __mmu_int_rb_remove(mnode, &handler->root); + list_del(&mnode->list); /* remove from LRU list */ + } +unlock: + spin_unlock_irqrestore(&handler->lock, flags); + return ret; +} + +/* Caller must hold handler lock */ +static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, + unsigned long addr, + unsigned long len) +{ + struct mmu_rb_node *node = NULL; + + trace_hfi1_mmu_rb_search(addr, len); + if (!handler->ops->filter) { + node = __mmu_int_rb_iter_first(&handler->root, addr, + (addr + len) - 1); + } else { + for (node = __mmu_int_rb_iter_first(&handler->root, addr, + (addr + len) - 1); + node; + node = __mmu_int_rb_iter_next(node, addr, + (addr + len) - 1)) { + if (handler->ops->filter(node, addr, len)) + return node; + } + } + return node; +} + +bool hfi1_mmu_rb_remove_unless_exact(struct mmu_rb_handler *handler, + unsigned long addr, unsigned long len, + struct mmu_rb_node **rb_node) +{ + struct mmu_rb_node *node; + unsigned long flags; + bool ret = false; + + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, addr, len); + if (node) { + if (node->addr == addr && node->len == len) + goto unlock; + __mmu_int_rb_remove(node, &handler->root); + list_del(&node->list); /* remove from LRU list */ + ret = true; + } +unlock: + spin_unlock_irqrestore(&handler->lock, flags); + *rb_node = node; + return ret; +} + +void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) +{ + struct mmu_rb_node *rbnode, *ptr; + struct list_head del_list; + unsigned long flags; + bool stop = false; + + INIT_LIST_HEAD(&del_list); + + spin_lock_irqsave(&handler->lock, flags); + list_for_each_entry_safe_reverse(rbnode, ptr, &handler->lru_list, + list) { + if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg, + &stop)) { + __mmu_int_rb_remove(rbnode, &handler->root); + /* move from LRU list to delete list */ + list_move(&rbnode->list, &del_list); + } + if (stop) + break; + } + spin_unlock_irqrestore(&handler->lock, flags); + + while (!list_empty(&del_list)) { + rbnode = list_first_entry(&del_list, struct mmu_rb_node, list); + list_del(&rbnode->list); + handler->ops->remove(handler->ops_arg, rbnode); + } +} + +/* + * It is up to the caller to ensure that this function does not race with the + * mmu invalidate notifier which may be calling the users remove callback on + * 'node'. + */ +void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, + struct mmu_rb_node *node) +{ + unsigned long flags; + + /* Validity of handler and node pointers has been checked by caller. 
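+ *
+ * Note that ops->remove() is invoked below only after the handler
+ * lock has been dropped, so it is the one mmu_rb_ops callback that
+ * is allowed to sleep (see the note in mmu_rb.h).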
*/ + trace_hfi1_mmu_rb_remove(node->addr, node->len); + spin_lock_irqsave(&handler->lock, flags); + __mmu_int_rb_remove(node, &handler->root); + list_del(&node->list); /* remove from LRU list */ + spin_unlock_irqrestore(&handler->lock, flags); + + handler->ops->remove(handler->ops_arg, node); +} + +static void mmu_notifier_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct mmu_rb_handler *handler = + container_of(mn, struct mmu_rb_handler, mn); + struct rb_root_cached *root = &handler->root; + struct mmu_rb_node *node, *ptr = NULL; + unsigned long flags; + bool added = false; + + spin_lock_irqsave(&handler->lock, flags); + for (node = __mmu_int_rb_iter_first(root, start, end - 1); + node; node = ptr) { + /* Guard against node removal. */ + ptr = __mmu_int_rb_iter_next(node, start, end - 1); + trace_hfi1_mmu_mem_invalidate(node->addr, node->len); + if (handler->ops->invalidate(handler->ops_arg, node)) { + __mmu_int_rb_remove(node, root); + /* move from LRU list to delete list */ + list_move(&node->list, &handler->del_list); + added = true; + } + } + spin_unlock_irqrestore(&handler->lock, flags); + + if (added) + queue_work(handler->wq, &handler->del_work); +} + +/* + * Call the remove function for the given handler and the list. This + * is expected to be called with a delete list extracted from handler. + * The caller should not be holding the handler lock. + */ +static void do_remove(struct mmu_rb_handler *handler, + struct list_head *del_list) +{ + struct mmu_rb_node *node; + + while (!list_empty(del_list)) { + node = list_first_entry(del_list, struct mmu_rb_node, list); + list_del(&node->list); + handler->ops->remove(handler->ops_arg, node); + } +} + +/* + * Work queue function to remove all nodes that have been queued up to + * be removed. The key feature is that mm->mmap_sem is not being held + * and the remove callback can sleep while taking it, if needed. + */ +static void handle_remove(struct work_struct *work) +{ + struct mmu_rb_handler *handler = container_of(work, + struct mmu_rb_handler, + del_work); + struct list_head del_list; + unsigned long flags; + + /* remove anything that is queued to get removed */ + spin_lock_irqsave(&handler->lock, flags); + list_replace_init(&handler->del_list, &del_list); + spin_unlock_irqrestore(&handler->lock, flags); + + do_remove(handler, &del_list); +} diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h new file mode 100644 index 0000000..f04cec1 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -0,0 +1,88 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _HFI1_MMU_RB_H +#define _HFI1_MMU_RB_H + +#include "hfi.h" + +struct mmu_rb_node { + unsigned long addr; + unsigned long len; + unsigned long __last; + struct rb_node node; + struct list_head list; +}; + +/* + * NOTE: filter, insert, invalidate, and evict must not sleep. Only remove is + * allowed to sleep. + */ +struct mmu_rb_ops { + bool (*filter)(struct mmu_rb_node *node, unsigned long addr, + unsigned long len); + int (*insert)(void *ops_arg, struct mmu_rb_node *mnode); + void (*remove)(void *ops_arg, struct mmu_rb_node *mnode); + int (*invalidate)(void *ops_arg, struct mmu_rb_node *node); + int (*evict)(void *ops_arg, struct mmu_rb_node *mnode, + void *evict_arg, bool *stop); +}; + +int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, + struct mmu_rb_ops *ops, + struct workqueue_struct *wq, + struct mmu_rb_handler **handler); +void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); +int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode); +void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg); +void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, + struct mmu_rb_node *mnode); +bool hfi1_mmu_rb_remove_unless_exact(struct mmu_rb_handler *handler, + unsigned long addr, unsigned long len, + struct mmu_rb_node **rb_node); + +#endif /* _HFI1_MMU_RB_H */ diff --git a/drivers/infiniband/hw/hfi1/opa_compat.h b/drivers/infiniband/hw/hfi1/opa_compat.h new file mode 100644 index 0000000..774215b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/opa_compat.h @@ -0,0 +1,128 @@ +#ifndef _LINUX_H +#define _LINUX_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. 
+ * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This header file is for OPA-specific definitions which are + * required by the HFI driver, and which aren't yet in the Linux + * IB core. We'll collect these all here, then merge them into + * the kernel when that's convenient. + */ + +/* OPA SMA attribute IDs */ +#define OPA_ATTRIB_ID_CONGESTION_INFO cpu_to_be16(0x008b) +#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG cpu_to_be16(0x008f) +#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING cpu_to_be16(0x0090) +#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091) + +/* OPA PMA attribute IDs */ +#define OPA_PM_ATTRIB_ID_PORT_STATUS cpu_to_be16(0x0040) +#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS cpu_to_be16(0x0041) +#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS cpu_to_be16(0x0042) +#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS cpu_to_be16(0x0043) +#define OPA_PM_ATTRIB_ID_ERROR_INFO cpu_to_be16(0x0044) + +/* OPA status codes */ +#define OPA_PM_STATUS_REQUEST_TOO_LARGE cpu_to_be16(0x100) + +static inline u8 port_states_to_logical_state(struct opa_port_states *ps) +{ + return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE; +} + +static inline u8 port_states_to_phys_state(struct opa_port_states *ps) +{ + return ((ps->portphysstate_portstate & + OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf; +} + +/* + * OPA port physical states + * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState + * values are the same in OmniPath Architecture. 
OPA leverages some of the same + * concepts as InfiniBand, but has a few other states as well. + * + * When writing, only values 0-3 are valid, other values are ignored. + * When reading, 0 is reserved. + * + * Returned by the ibphys_portstate() routine. + */ +enum opa_port_phys_state { + /* Values 0-7 have the same meaning in OPA as in InfiniBand. */ + + IB_PORTPHYSSTATE_NOP = 0, + /* 1 is reserved */ + IB_PORTPHYSSTATE_POLLING = 2, + IB_PORTPHYSSTATE_DISABLED = 3, + IB_PORTPHYSSTATE_TRAINING = 4, + IB_PORTPHYSSTATE_LINKUP = 5, + IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6, + IB_PORTPHYSSTATE_PHY_TEST = 7, + /* 8 is reserved */ + + /* + * Offline: Port is quiet (transmitters disabled) due to lack of + * physical media, unsupported media, or transition between link up + * and next link up attempt + */ + OPA_PORTPHYSSTATE_OFFLINE = 9, + + /* 10 is reserved */ + + /* + * Phy_Test: Specific test patterns are transmitted, and receiver BER + * can be monitored. This facilitates signal integrity testing for the + * physical layer of the port. + */ + OPA_PORTPHYSSTATE_TEST = 11, + + OPA_PORTPHYSSTATE_MAX = 11, + /* values 12-15 are reserved/ignored */ +}; + +#endif /* _LINUX_H */ diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c new file mode 100644 index 0000000..c1c9829 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -0,0 +1,1479 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "chip_registers.h" +#include "aspm.h" + +/* link speed vector for Gen3 speed - not in Linux headers */ +#define GEN1_SPEED_VECTOR 0x1 +#define GEN2_SPEED_VECTOR 0x2 +#define GEN3_SPEED_VECTOR 0x3 + +/* + * This file contains PCIe utility routines. + */ + +/* + * Code to adjust PCIe capabilities. + */ +static void tune_pcie_caps(struct hfi1_devdata *); + +/* + * Do all the common PCIe setup and initialization. + * devdata is not yet allocated, and is not allocated until after this + * routine returns success. Therefore dd_dev_err() can't be used for error + * printing. + */ +int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret; + + ret = pci_enable_device(pdev); + if (ret) { + /* + * This can happen (in theory) iff: + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. + */ + hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n", + -ret); + goto done; + } + + ret = pci_request_regions(pdev, DRIVER_NAME); + if (ret) { + hfi1_early_err(&pdev->dev, + "pci_request_regions fails: err %d\n", -ret); + goto bail; + } + + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + /* + * If the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. + */ + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) { + hfi1_early_err(&pdev->dev, + "Unable to set DMA mask: %d\n", ret); + goto bail; + } + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + } else { + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + } + if (ret) { + hfi1_early_err(&pdev->dev, + "Unable to set DMA consistent mask: %d\n", ret); + goto bail; + } + + pci_set_master(pdev); + (void)pci_enable_pcie_error_reporting(pdev); + goto done; + +bail: + hfi1_pcie_cleanup(pdev); +done: + return ret; +} + +/* + * Clean what was done in hfi1_pcie_init() + */ +void hfi1_pcie_cleanup(struct pci_dev *pdev) +{ + pci_disable_device(pdev); + /* + * Release regions should be called after the disable. OK to + * call if request regions has not been called or failed. 
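+ *
+ * This mirrors hfi1_pcie_init() above: pci_enable_device() and
+ * pci_request_regions() on the way up, pci_disable_device() and
+ * pci_release_regions() on the way down.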
+ */ + pci_release_regions(pdev); +} + +/* + * Do remaining PCIe setup, once dd is allocated, and save away + * fields required to re-initialize after a chip reset, or for + * various other purposes + */ +int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) +{ + unsigned long len; + resource_size_t addr; + int ret = 0; + + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + + /* + * The TXE PIO buffers are at the tail end of the chip space. + * Cut them off and map them separately. + */ + + /* sanity check vs expectations */ + if (len != TXE_PIO_SEND + TXE_PIO_SIZE) { + dd_dev_err(dd, "chip PIO range does not match\n"); + return -EINVAL; + } + + dd->kregbase1 = ioremap_nocache(addr, RCV_ARRAY); + if (!dd->kregbase1) { + dd_dev_err(dd, "UC mapping of kregbase1 failed\n"); + return -ENOMEM; + } + dd_dev_info(dd, "UC base1: %p for %x\n", dd->kregbase1, RCV_ARRAY); + dd->chip_rcv_array_count = readq(dd->kregbase1 + RCV_ARRAY_CNT); + dd_dev_info(dd, "RcvArray count: %u\n", dd->chip_rcv_array_count); + dd->base2_start = RCV_ARRAY + dd->chip_rcv_array_count * 8; + + dd->kregbase2 = ioremap_nocache( + addr + dd->base2_start, + TXE_PIO_SEND - dd->base2_start); + if (!dd->kregbase2) { + dd_dev_err(dd, "UC mapping of kregbase2 failed\n"); + goto nomem; + } + dd_dev_info(dd, "UC base2: %p for %x\n", dd->kregbase2, + TXE_PIO_SEND - dd->base2_start); + + dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE); + if (!dd->piobase) { + dd_dev_err(dd, "WC mapping of send buffers failed\n"); + goto nomem; + } + dd_dev_info(dd, "WC piobase: %p\n for %x", dd->piobase, TXE_PIO_SIZE); + + dd->physaddr = addr; /* used for io_remap, etc. */ + + /* + * Map the chip's RcvArray as write-combining to allow us + * to write an entire cacheline worth of entries in one shot. + */ + dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY, + dd->chip_rcv_array_count * 8); + if (!dd->rcvarray_wc) { + dd_dev_err(dd, "WC mapping of receive array failed\n"); + goto nomem; + } + dd_dev_info(dd, "WC RcvArray: %p for %x\n", + dd->rcvarray_wc, dd->chip_rcv_array_count * 8); + + dd->flags |= HFI1_PRESENT; /* chip.c CSR routines now work */ + return 0; +nomem: + ret = -ENOMEM; + hfi1_pcie_ddcleanup(dd); + return ret; +} + +/* + * Do PCIe cleanup related to dd, after chip-specific cleanup, etc. Just prior + * to releasing the dd memory. + * Void because all of the core pcie cleanup functions are void. 
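+ *
+ * Clears HFI1_PRESENT first so the chip.c CSR accessors stop using
+ * the mappings, then unmaps the four BAR0 regions established by
+ * hfi1_pcie_ddinit(): kregbase1, kregbase2, the WC RcvArray mapping
+ * and the WC PIO send buffer mapping.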
+ */ +void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd) +{ + dd->flags &= ~HFI1_PRESENT; + if (dd->kregbase1) + iounmap(dd->kregbase1); + dd->kregbase1 = NULL; + if (dd->kregbase2) + iounmap(dd->kregbase2); + dd->kregbase2 = NULL; + if (dd->rcvarray_wc) + iounmap(dd->rcvarray_wc); + dd->rcvarray_wc = NULL; + if (dd->piobase) + iounmap(dd->piobase); + dd->piobase = NULL; +} + +/* return the PCIe link speed from the given link status */ +static u32 extract_speed(u16 linkstat) +{ + u32 speed; + + switch (linkstat & PCI_EXP_LNKSTA_CLS) { + default: /* not defined, assume Gen1 */ + case PCI_EXP_LNKSTA_CLS_2_5GB: + speed = 2500; /* Gen 1, 2.5GHz */ + break; + case PCI_EXP_LNKSTA_CLS_5_0GB: + speed = 5000; /* Gen 2, 5GHz */ + break; + case GEN3_SPEED_VECTOR: + speed = 8000; /* Gen 3, 8GHz */ + break; + } + return speed; +} + +/* return the PCIe link speed from the given link status */ +static u32 extract_width(u16 linkstat) +{ + return (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT; +} + +/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */ +static void update_lbus_info(struct hfi1_devdata *dd) +{ + u16 linkstat; + int ret; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return; + } + + dd->lbus_width = extract_width(linkstat); + dd->lbus_speed = extract_speed(linkstat); + snprintf(dd->lbus_info, sizeof(dd->lbus_info), + "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width); +} + +/* + * Read in the current PCIe link width and speed. Find if the link is + * Gen3 capable. + */ +int pcie_speeds(struct hfi1_devdata *dd) +{ + u32 linkcap; + struct pci_dev *parent = dd->pcidev->bus->self; + int ret; + + if (!pci_is_pcie(dd->pcidev)) { + dd_dev_err(dd, "Can't find PCI Express capability!\n"); + return -EINVAL; + } + + /* find if our max speed is Gen3 and parent supports Gen3 speeds */ + dd->link_gen3_capable = 1; + + ret = pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return ret; + } + + if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) { + dd_dev_info(dd, + "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n", + linkcap & PCI_EXP_LNKCAP_SLS); + dd->link_gen3_capable = 0; + } + + /* + * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed + */ + if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) { + dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n"); + dd->link_gen3_capable = 0; + } + + /* obtain the link width and current speed */ + update_lbus_info(dd); + + dd_dev_info(dd, "%s\n", dd->lbus_info); + + return 0; +} + +/* + * Returns: + * - actual number of interrupts allocated or + * - 0 if fell back to INTx. 
+ * - error + */ +int request_msix(struct hfi1_devdata *dd, u32 msireq) +{ + int nvec; + + nvec = pci_alloc_irq_vectors(dd->pcidev, 1, msireq, + PCI_IRQ_MSIX | PCI_IRQ_LEGACY); + if (nvec < 0) { + dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec); + return nvec; + } + + tune_pcie_caps(dd); + + /* check for legacy IRQ */ + if (nvec == 1 && !dd->pcidev->msix_enabled) + return 0; + + return nvec; +} + +/* restore command and BARs after a reset has wiped them out */ +int restore_pci_variables(struct hfi1_devdata *dd) +{ + int ret = 0; + + ret = pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, + dd->pcibar0); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, + dd->pcibar1); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom); + if (ret) + goto error; + + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, + dd->pcie_devctl); + if (ret) + goto error; + + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, + dd->pcie_lnkctl); + if (ret) + goto error; + + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2, + dd->pcie_devctl2); + if (ret) + goto error; + + ret = pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0); + if (ret) + goto error; + + if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) { + ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, + dd->pci_tph2); + if (ret) + goto error; + } + return 0; + +error: + dd_dev_err(dd, "Unable to write to PCI config\n"); + return ret; +} + +/* Save BARs and command to rewrite after device reset */ +int save_pci_variables(struct hfi1_devdata *dd) +{ + int ret = 0; + + ret = pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, + &dd->pcibar0); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, + &dd->pcibar1); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom); + if (ret) + goto error; + + ret = pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command); + if (ret) + goto error; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, + &dd->pcie_devctl); + if (ret) + goto error; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, + &dd->pcie_lnkctl); + if (ret) + goto error; + + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2, + &dd->pcie_devctl2); + if (ret) + goto error; + + ret = pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0); + if (ret) + goto error; + + if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) { + ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, + &dd->pci_tph2); + if (ret) + goto error; + } + return 0; + +error: + dd_dev_err(dd, "Unable to read from PCI config\n"); + return ret; +} + +/* + * BIOS may not set PCIe bus-utilization parameters for best performance. + * Check and optionally adjust them to maximize our throughput. 
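+ *
+ * The pcie_caps module parameter below packs two 3-bit limits, each
+ * encoded as 128 << value: bits 2:0 cap MaxPayload and bits 6:4 cap
+ * MaxReadReq. For example, pcie_caps=0x51 would limit MPS to 256
+ * bytes and MRRS to 4096 bytes.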
+ */ +static int hfi1_pcie_caps; +module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO); +MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); + +uint aspm_mode = ASPM_MODE_DISABLED; +module_param_named(aspm, aspm_mode, uint, S_IRUGO); +MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); + +static void tune_pcie_caps(struct hfi1_devdata *dd) +{ + struct pci_dev *parent; + u16 rc_mpss, rc_mps, ep_mpss, ep_mps; + u16 rc_mrrs, ep_mrrs, max_mrrs, ectl; + int ret; + + /* + * Turn on extended tags in DevCtl in case the BIOS has turned it off + * to improve WFR SDMA bandwidth + */ + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + if ((!ret) && !(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { + dd_dev_info(dd, "Enabling PCIe extended tags\n"); + ectl |= PCI_EXP_DEVCTL_EXT_TAG; + ret = pcie_capability_write_word(dd->pcidev, + PCI_EXP_DEVCTL, ectl); + if (ret) + dd_dev_info(dd, "Unable to write to PCI config\n"); + } + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; + /* + * The driver cannot perform the tuning if it does not have + * access to the upstream component. + */ + if (!parent) { + dd_dev_info(dd, "Parent not found\n"); + return; + } + if (!pci_is_root_bus(parent->bus)) { + dd_dev_info(dd, "Parent not root\n"); + return; + } + if (!pci_is_pcie(parent)) { + dd_dev_info(dd, "Parent is not PCI Express capable\n"); + return; + } + if (!pci_is_pcie(dd->pcidev)) { + dd_dev_info(dd, "PCI device is not PCI Express capable\n"); + return; + } + rc_mpss = parent->pcie_mpss; + rc_mps = ffs(pcie_get_mps(parent)) - 8; + /* Find out supported and configured values for endpoint (us) */ + ep_mpss = dd->pcidev->pcie_mpss; + ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; + + /* Find max payload supported by root, endpoint */ + if (rc_mpss > ep_mpss) + rc_mpss = ep_mpss; + + /* If Supported greater than limit in module param, limit it */ + if (rc_mpss > (hfi1_pcie_caps & 7)) + rc_mpss = hfi1_pcie_caps & 7; + /* If less than (allowed, supported), bump root payload */ + if (rc_mpss > rc_mps) { + rc_mps = rc_mpss; + pcie_set_mps(parent, 128 << rc_mps); + } + /* If less than (allowed, supported), bump endpoint payload */ + if (rc_mpss > ep_mps) { + ep_mps = rc_mpss; + pcie_set_mps(dd->pcidev, 128 << ep_mps); + } + + /* + * Now the Read Request size. 
+ * No field for max supported, but PCIe spec limits it to 4096, + * which is code '5' (log2(4096) - 7) + */ + max_mrrs = 5; + if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7)) + max_mrrs = (hfi1_pcie_caps >> 4) & 7; + + max_mrrs = 128 << max_mrrs; + rc_mrrs = pcie_get_readrq(parent); + ep_mrrs = pcie_get_readrq(dd->pcidev); + + if (max_mrrs > rc_mrrs) { + rc_mrrs = max_mrrs; + pcie_set_readrq(parent, rc_mrrs); + } + if (max_mrrs > ep_mrrs) { + ep_mrrs = max_mrrs; + pcie_set_readrq(dd->pcidev, ep_mrrs); + } +} + +/* End of PCIe capability tuning */ + +/* + * From here through hfi1_pci_err_handler definition is invoked via + * PCI error infrastructure, registered via pci + */ +static pci_ers_result_t +pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + switch (state) { + case pci_channel_io_normal: + dd_dev_info(dd, "State Normal, ignoring\n"); + break; + + case pci_channel_io_frozen: + dd_dev_info(dd, "State Frozen, requesting reset\n"); + pci_disable_device(pdev); + ret = PCI_ERS_RESULT_NEED_RESET; + break; + + case pci_channel_io_perm_failure: + if (dd) { + dd_dev_info(dd, "State Permanent Failure, disabling\n"); + /* no more register accesses! */ + dd->flags &= ~HFI1_PRESENT; + hfi1_disable_after_error(dd); + } + /* else early, or other problem */ + ret = PCI_ERS_RESULT_DISCONNECT; + break; + + default: /* shouldn't happen */ + dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n", + state); + break; + } + return ret; +} + +static pci_ers_result_t +pci_mmio_enabled(struct pci_dev *pdev) +{ + u64 words = 0U; + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + if (dd && dd->pport) { + words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL); + if (words == ~0ULL) + ret = PCI_ERS_RESULT_NEED_RESET; + dd_dev_info(dd, + "HFI1 mmio_enabled function called, read wordscntr %llx, returning %d\n", + words, ret); + } + return ret; +} + +static pci_ers_result_t +pci_slot_reset(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static void +pci_resume(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + dd_dev_info(dd, "HFI1 resume function called\n"); + pci_cleanup_aer_uncorrect_error_status(pdev); + /* + * Running jobs will fail, since it's asynchronous + * unlike sysfs-requested reset. Better than + * doing nothing. + */ + hfi1_init(dd, 1); /* same as re-init after reset */ +} + +const struct pci_error_handlers hfi1_pci_err_handler = { + .error_detected = pci_error_detected, + .mmio_enabled = pci_mmio_enabled, + .slot_reset = pci_slot_reset, + .resume = pci_resume, +}; + +/*============================================================================*/ +/* PCIe Gen3 support */ + +/* + * This code is separated out because it is expected to be removed in the + * final shipping product. If not, then it will be revisited and items + * will be moved to more standard locations. 
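+ *
+ * Roughly, do_pcie_gen3_transition() below implements the recipe:
+ * download the SBus master and PCIe SerDes firmware, program the
+ * Synopsys port logic EQ registers and the gasket interrupt list,
+ * set the target link speed on both ends of the link, arm the
+ * gasket logic, trigger a secondary bus reset, and then verify the
+ * resulting speed, retrying up to pcie_retry times.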
+ */ + +/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */ +#define DL_STATUS_HFI0 0x1 /* hfi0 firmware download complete */ +#define DL_STATUS_HFI1 0x2 /* hfi1 firmware download complete */ +#define DL_STATUS_BOTH 0x3 /* hfi0 and hfi1 firmware download complete */ + +/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */ +#define DL_ERR_NONE 0x0 /* no error */ +#define DL_ERR_SWAP_PARITY 0x1 /* parity error in SerDes interrupt */ + /* or response data */ +#define DL_ERR_DISABLED 0x2 /* hfi disabled */ +#define DL_ERR_SECURITY 0x3 /* security check failed */ +#define DL_ERR_SBUS 0x4 /* SBus status error */ +#define DL_ERR_XFR_PARITY 0x5 /* parity error during ROM transfer*/ + +/* gasket block secondary bus reset delay */ +#define SBR_DELAY_US 200000 /* 200ms */ + +/* mask for PCIe capability register lnkctl2 target link speed */ +#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf + +static uint pcie_target = 3; +module_param(pcie_target, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)"); + +static uint pcie_force; +module_param(pcie_force, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed"); + +static uint pcie_retry = 5; +module_param(pcie_retry, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed"); + +#define UNSET_PSET 255 +#define DEFAULT_DISCRETE_PSET 2 /* discrete HFI */ +#define DEFAULT_MCP_PSET 6 /* MCP HFI */ +static uint pcie_pset = UNSET_PSET; +module_param(pcie_pset, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10"); + +static uint pcie_ctle = 3; /* discrete on, integrated on */ +module_param(pcie_ctle, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_ctle, "PCIe static CTLE mode, bit 0 - discrete on/off, bit 1 - integrated on/off"); + +/* equalization columns */ +#define PREC 0 +#define ATTN 1 +#define POST 2 + +/* discrete silicon preliminary equalization values */ +static const u8 discrete_preliminary_eq[11][3] = { + /* prec attn post */ + { 0x00, 0x00, 0x12 }, /* p0 */ + { 0x00, 0x00, 0x0c }, /* p1 */ + { 0x00, 0x00, 0x0f }, /* p2 */ + { 0x00, 0x00, 0x09 }, /* p3 */ + { 0x00, 0x00, 0x00 }, /* p4 */ + { 0x06, 0x00, 0x00 }, /* p5 */ + { 0x09, 0x00, 0x00 }, /* p6 */ + { 0x06, 0x00, 0x0f }, /* p7 */ + { 0x09, 0x00, 0x09 }, /* p8 */ + { 0x0c, 0x00, 0x00 }, /* p9 */ + { 0x00, 0x00, 0x18 }, /* p10 */ +}; + +/* integrated silicon preliminary equalization values */ +static const u8 integrated_preliminary_eq[11][3] = { + /* prec attn post */ + { 0x00, 0x1e, 0x07 }, /* p0 */ + { 0x00, 0x1e, 0x05 }, /* p1 */ + { 0x00, 0x1e, 0x06 }, /* p2 */ + { 0x00, 0x1e, 0x04 }, /* p3 */ + { 0x00, 0x1e, 0x00 }, /* p4 */ + { 0x03, 0x1e, 0x00 }, /* p5 */ + { 0x04, 0x1e, 0x00 }, /* p6 */ + { 0x03, 0x1e, 0x06 }, /* p7 */ + { 0x03, 0x1e, 0x04 }, /* p8 */ + { 0x05, 0x1e, 0x00 }, /* p9 */ + { 0x00, 0x1e, 0x0a }, /* p10 */ +}; + +static const u8 discrete_ctle_tunings[11][4] = { + /* DC LF HF BW */ + { 0x48, 0x0b, 0x04, 0x04 }, /* p0 */ + { 0x60, 0x05, 0x0f, 0x0a }, /* p1 */ + { 0x50, 0x09, 0x06, 0x06 }, /* p2 */ + { 0x68, 0x05, 0x0f, 0x0a }, /* p3 */ + { 0x80, 0x05, 0x0f, 0x0a }, /* p4 */ + { 0x70, 0x05, 0x0f, 0x0a }, /* p5 */ + { 0x68, 0x05, 0x0f, 0x0a }, /* p6 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p7 */ + { 0x48, 0x09, 0x06, 0x06 }, /* p8 */ + { 0x60, 0x05, 0x0f, 0x0a }, /* p9 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p10 */ +}; + +static const u8 integrated_ctle_tunings[11][4] = { + /* DC LF HF BW */ + { 0x38, 0x0f, 0x00, 0x00 
}, /* p0 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p1 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p2 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p3 */ + { 0x58, 0x0a, 0x05, 0x05 }, /* p4 */ + { 0x48, 0x0a, 0x05, 0x05 }, /* p5 */ + { 0x40, 0x0a, 0x05, 0x05 }, /* p6 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p7 */ + { 0x38, 0x0f, 0x00, 0x00 }, /* p8 */ + { 0x38, 0x09, 0x06, 0x06 }, /* p9 */ + { 0x38, 0x0e, 0x01, 0x01 }, /* p10 */ +}; + +/* helper to format the value to write to hardware */ +#define eq_value(pre, curr, post) \ + ((((u32)(pre)) << \ + PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \ + | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \ + | (((u32)(post)) << \ + PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT)) + +/* + * Load the given EQ preset table into the PCIe hardware. + */ +static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs, + u8 div) +{ + struct pci_dev *pdev = dd->pcidev; + u32 hit_error = 0; + u32 violation; + u32 i; + u8 c_minus1, c0, c_plus1; + int ret; + + for (i = 0; i < 11; i++) { + /* set index */ + pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i); + /* write the value */ + c_minus1 = eq[i][PREC] / div; + c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div); + c_plus1 = eq[i][POST] / div; + pci_write_config_dword(pdev, PCIE_CFG_REG_PL102, + eq_value(c_minus1, c0, c_plus1)); + /* check if these coefficients violate EQ rules */ + ret = pci_read_config_dword(dd->pcidev, + PCIE_CFG_REG_PL105, &violation); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + hit_error = 1; + break; + } + + if (violation + & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){ + if (hit_error == 0) { + dd_dev_err(dd, + "Gen3 EQ Table Coefficient rule violations\n"); + dd_dev_err(dd, " prec attn post\n"); + } + dd_dev_err(dd, " p%02d: %02x %02x %02x\n", + i, (u32)eq[i][0], (u32)eq[i][1], + (u32)eq[i][2]); + dd_dev_err(dd, " %02x %02x %02x\n", + (u32)c_minus1, (u32)c0, (u32)c_plus1); + hit_error = 1; + } + } + if (hit_error) + return -EINVAL; + return 0; +} + +/* + * Steps to be done after the PCIe firmware is downloaded and + * before the SBR for the Pcie Gen3. + * The SBus resource is already being held. + */ +static void pcie_post_steps(struct hfi1_devdata *dd) +{ + int i; + + set_sbus_fast_mode(dd); + /* + * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1. + * This avoids a spurious framing error that can otherwise be + * generated by the MAC layer. + * + * Use individual addresses since no broadcast is set up. + */ + for (i = 0; i < NUM_PCIE_SERDES; i++) { + sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i], + 0x03, WRITE_SBUS_RECEIVER, 0x00022132); + } + + clear_sbus_fast_mode(dd); +} + +/* + * Trigger a secondary bus reset (SBR) on ourselves using our parent. + * + * Based on pci_parent_bus_reset() which is not exported by the + * kernel core. + */ +static int trigger_sbr(struct hfi1_devdata *dd) +{ + struct pci_dev *dev = dd->pcidev; + struct pci_dev *pdev; + + /* need a parent */ + if (!dev->bus->self) { + dd_dev_err(dd, "%s: no parent device\n", __func__); + return -ENOTTY; + } + + /* should not be anyone else on the bus */ + list_for_each_entry(pdev, &dev->bus->devices, bus_list) + if (pdev != dev) { + dd_dev_err(dd, + "%s: another device is on the same bus\n", + __func__); + return -ENOTTY; + } + + /* + * A secondary bus reset (SBR) issues a hot reset to our device. + * The following routine does a 1s wait after the reset is dropped + * per PCI Trhfa (recovery time). 
PCIe 3.0 section 6.6.1 - + * Conventional Reset, paragraph 3, line 35 also says that a 1s + * delay after a reset is required. Per spec requirements, + * the link is either working or not after that point. + */ + pci_reset_bridge_secondary_bus(dev->bus->self); + + return 0; +} + +/* + * Write the given gasket interrupt register. + */ +static void write_gasket_interrupt(struct hfi1_devdata *dd, int index, + u16 code, u16 data) +{ + write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8), + (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) | + ((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT))); +} + +/* + * Tell the gasket logic how to react to the reset. + */ +static void arm_gasket_logic(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = (((u64)1 << dd->hfi1_id) << + ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT) | + ((u64)pcie_serdes_broadcast[dd->hfi1_id] << + ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT | + ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK | + ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) << + ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT); + write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg); + /* read back to push the write */ + read_csr(dd, ASIC_PCIE_SD_HOST_CMD); +} + +/* + * CCE_PCIE_CTRL long name helpers + * We redefine these shorter macros to use in the code while leaving + * chip_registers.h to be autogenerated from the hardware spec. + */ +#define LANE_BUNDLE_MASK CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK +#define LANE_BUNDLE_SHIFT CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT +#define LANE_DELAY_MASK CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK +#define LANE_DELAY_SHIFT CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT +#define MARGIN_OVERWRITE_ENABLE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT +#define MARGIN_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_SHIFT +#define MARGIN_G1_G2_OVERWRITE_MASK CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK +#define MARGIN_G1_G2_OVERWRITE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT +#define MARGIN_GEN1_GEN2_MASK CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK +#define MARGIN_GEN1_GEN2_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT + + /* + * Write xmt_margin for full-swing (WFR-B) or half-swing (WFR-C). + */ +static void write_xmt_margin(struct hfi1_devdata *dd, const char *fname) +{ + u64 pcie_ctrl; + u64 xmt_margin; + u64 xmt_margin_oe; + u64 lane_delay; + u64 lane_bundle; + + pcie_ctrl = read_csr(dd, CCE_PCIE_CTRL); + + /* + * For Discrete, use full-swing. + * - PCIe TX defaults to full-swing. + * Leave this register as default. + * For Integrated, use half-swing + * - Copy xmt_margin and xmt_margin_oe + * from Gen1/Gen2 to Gen3. + */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL1) { /* integrated */ + /* extract initial fields */ + xmt_margin = (pcie_ctrl >> MARGIN_GEN1_GEN2_SHIFT) + & MARGIN_GEN1_GEN2_MASK; + xmt_margin_oe = (pcie_ctrl >> MARGIN_G1_G2_OVERWRITE_SHIFT) + & MARGIN_G1_G2_OVERWRITE_MASK; + lane_delay = (pcie_ctrl >> LANE_DELAY_SHIFT) & LANE_DELAY_MASK; + lane_bundle = (pcie_ctrl >> LANE_BUNDLE_SHIFT) + & LANE_BUNDLE_MASK; + + /* + * For A0, EFUSE values are not set. Override with the + * correct values. + */ + if (is_ax(dd)) { + /* + * xmt_margin and OverwiteEnabel should be the + * same for Gen1/Gen2 and Gen3 + */ + xmt_margin = 0x5; + xmt_margin_oe = 0x1; + lane_delay = 0xF; /* Delay 240ns. */ + lane_bundle = 0x0; /* Set to 1 lane. 
*/ + } + + /* overwrite existing values */ + pcie_ctrl = (xmt_margin << MARGIN_GEN1_GEN2_SHIFT) + | (xmt_margin_oe << MARGIN_G1_G2_OVERWRITE_SHIFT) + | (xmt_margin << MARGIN_SHIFT) + | (xmt_margin_oe << MARGIN_OVERWRITE_ENABLE_SHIFT) + | (lane_delay << LANE_DELAY_SHIFT) + | (lane_bundle << LANE_BUNDLE_SHIFT); + + write_csr(dd, CCE_PCIE_CTRL, pcie_ctrl); + } + + dd_dev_dbg(dd, "%s: program XMT margin, CcePcieCtrl 0x%llx\n", + fname, pcie_ctrl); +} + +/* + * Do all the steps needed to transition the PCIe link to Gen3 speed. + */ +int do_pcie_gen3_transition(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + u64 fw_ctrl; + u64 reg, therm; + u32 reg32, fs, lf; + u32 status, err; + int ret; + int do_retry, retry_count = 0; + int intnum = 0; + uint default_pset; + uint pset = pcie_pset; + u16 target_vector, target_speed; + u16 lnkctl2, vendor; + u8 div; + const u8 (*eq)[3]; + const u8 (*ctle_tunings)[4]; + uint static_ctle_mode; + int return_error = 0; + + /* PCIe Gen3 is for the ASIC only */ + if (dd->icode != ICODE_RTL_SILICON) + return 0; + + if (pcie_target == 1) { /* target Gen1 */ + target_vector = GEN1_SPEED_VECTOR; + target_speed = 2500; + } else if (pcie_target == 2) { /* target Gen2 */ + target_vector = GEN2_SPEED_VECTOR; + target_speed = 5000; + } else if (pcie_target == 3) { /* target Gen3 */ + target_vector = GEN3_SPEED_VECTOR; + target_speed = 8000; + } else { + /* off or invalid target - skip */ + dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__); + return 0; + } + + /* if already at target speed, done (unless forced) */ + if (dd->lbus_speed == target_speed) { + dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__, + pcie_target, + pcie_force ? "re-doing anyway" : "skipping"); + if (!pcie_force) + return 0; + } + + /* + * The driver cannot do the transition if it has no access to the + * upstream component + */ + if (!parent) { + dd_dev_info(dd, "%s: No upstream, Can't do gen3 transition\n", + __func__); + return 0; + } + + /* + * Do the Gen3 transition. Steps are those of the PCIe Gen3 + * recipe. + */ + + /* step 1: pcie link working in gen1/gen2 */ + + /* step 2: if either side is not capable of Gen3, done */ + if (pcie_target == 3 && !dd->link_gen3_capable) { + dd_dev_err(dd, "The PCIe link is not Gen3 capable\n"); + ret = -ENOSYS; + goto done_no_mutex; + } + + /* hold the SBus resource across the firmware download and SBR */ + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, "%s: unable to acquire SBus resource\n", + __func__); + return ret; + } + + /* make sure thermal polling is not causing interrupts */ + therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN); + if (therm) { + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0); + msleep(100); + dd_dev_info(dd, "%s: Disabled therm polling\n", + __func__); + } + +retry: + /* the SBus download will reset the spico for thermal */ + + /* step 3: download SBus Master firmware */ + /* step 4: download PCIe Gen3 SerDes firmware */ + dd_dev_info(dd, "%s: downloading firmware\n", __func__); + ret = load_pcie_firmware(dd); + if (ret) { + /* do not proceed if the firmware cannot be downloaded */ + return_error = 1; + goto done; + } + + /* step 5: set up device parameter settings */ + dd_dev_info(dd, "%s: setting PCIe registers\n", __func__); + + /* + * PcieCfgSpcie1 - Link Control 3 + * Leave at reset value. No need to set PerfEq - link equalization + * will be performed automatically after the SBR when the target + * speed is 8GT/s. 
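+ *
+ * The Lane Error Status register (PCIE_CFG_SPCIE2) is cleared just
+ * below, presumably so that the per-lane error check performed
+ * after the SBR reports only errors from the Gen3 transition
+ * itself.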
+ */ + + /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */ + pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff); + + /* step 5a: Set Synopsys Port Logic registers */ + + /* + * PcieCfgRegPl2 - Port Force Link + * + * Set the low power field to 0x10 to avoid unnecessary power + * management messages. All other fields are zero. + */ + reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32); + + /* + * PcieCfgRegPl100 - Gen3 Control + * + * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl + * turn on PcieCfgRegPl100.EqEieosCnt + * Everything else zero. + */ + reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32); + + /* + * PcieCfgRegPl101 - Gen3 EQ FS and LF + * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping + * PcieCfgRegPl103 - Gen3 EQ Preset Index + * PcieCfgRegPl105 - Gen3 EQ Status + * + * Give initial EQ settings. + */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */ + /* 1000mV, FS=24, LF = 8 */ + fs = 24; + lf = 8; + div = 3; + eq = discrete_preliminary_eq; + default_pset = DEFAULT_DISCRETE_PSET; + ctle_tunings = discrete_ctle_tunings; + /* bit 0 - discrete on/off */ + static_ctle_mode = pcie_ctle & 0x1; + } else { + /* 400mV, FS=29, LF = 9 */ + fs = 29; + lf = 9; + div = 1; + eq = integrated_preliminary_eq; + default_pset = DEFAULT_MCP_PSET; + ctle_tunings = integrated_ctle_tunings; + /* bit 1 - integrated on/off */ + static_ctle_mode = (pcie_ctle >> 1) & 0x1; + } + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101, + (fs << + PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) | + (lf << + PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT)); + ret = load_eq_table(dd, eq, fs, div); + if (ret) + goto done; + + /* + * PcieCfgRegPl106 - Gen3 EQ Control + * + * Set Gen3EqPsetReqVec, leave other fields 0. + */ + if (pset == UNSET_PSET) + pset = default_pset; + if (pset > 10) { /* valid range is 0-10, inclusive */ + dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n", + __func__, pset, default_pset); + pset = default_pset; + } + dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pset); + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106, + ((1 << pset) << + PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) | + PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK | + PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK); + + /* + * step 5b: Do post firmware download steps via SBus + */ + dd_dev_info(dd, "%s: doing pcie post steps\n", __func__); + pcie_post_steps(dd); + + /* + * step 5c: Program gasket interrupts + */ + /* set the Rx Bit Rate to REFCLK ratio */ + write_gasket_interrupt(dd, intnum++, 0x0006, 0x0050); + /* disable pCal for PCIe Gen3 RX equalization */ + /* select adaptive or static CTLE */ + write_gasket_interrupt(dd, intnum++, 0x0026, + 0x5b01 | (static_ctle_mode << 3)); + /* + * Enable iCal for PCIe Gen3 RX equalization, and set which + * evaluation of RX_EQ_EVAL will launch the iCal procedure. 
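+ *
+ * Each write_gasket_interrupt() call stores one code/data pair in
+ * the ASIC_PCIE_SD_INTRPT_LIST CSR array; the list is terminated
+ * further below with a 0x0000/0x0000 entry.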
+ */ + write_gasket_interrupt(dd, intnum++, 0x0026, 0x5202); + + if (static_ctle_mode) { + /* apply static CTLE tunings */ + u8 pcie_dc, pcie_lf, pcie_hf, pcie_bw; + + pcie_dc = ctle_tunings[pset][0]; + pcie_lf = ctle_tunings[pset][1]; + pcie_hf = ctle_tunings[pset][2]; + pcie_bw = ctle_tunings[pset][3]; + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0200 | pcie_dc); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0100 | pcie_lf); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x0000 | pcie_hf); + write_gasket_interrupt(dd, intnum++, 0x0026, 0x5500 | pcie_bw); + } + + /* terminate list */ + write_gasket_interrupt(dd, intnum++, 0x0000, 0x0000); + + /* + * step 5d: program XMT margin + */ + write_xmt_margin(dd, __func__); + + /* + * step 5e: disable active state power management (ASPM). It + * will be enabled if required later + */ + dd_dev_info(dd, "%s: clearing ASPM\n", __func__); + aspm_hw_disable_l1(dd); + + /* + * step 5f: clear DirectSpeedChange + * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the + * change in the speed target from starting before we are ready. + * This field defaults to 0 and we are not changing it, so nothing + * needs to be done. + */ + + /* step 5g: Set target link speed */ + /* + * Set target link speed to be target on both device and parent. + * On setting the parent: Some system BIOSs "helpfully" set the + * parent target speed to Gen2 to match the ASIC's initial speed. + * We can set the target Gen3 because we have already checked + * that it is Gen3 capable earlier. + */ + dd_dev_info(dd, "%s: setting parent target link speed\n", __func__); + ret = pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return_error = 1; + goto done; + } + + dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, + (u32)lnkctl2); + /* only write to parent if target is not as high as ours */ + if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) { + lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK; + lnkctl2 |= target_vector; + dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, + (u32)lnkctl2); + ret = pcie_capability_write_word(parent, + PCI_EXP_LNKCTL2, lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to write to PCI config\n"); + return_error = 1; + goto done; + } + } else { + dd_dev_info(dd, "%s: ..target speed is OK\n", __func__); + } + + dd_dev_info(dd, "%s: setting target link speed\n", __func__); + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return_error = 1; + goto done; + } + + dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, + (u32)lnkctl2); + lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK; + lnkctl2 |= target_vector; + dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, + (u32)lnkctl2); + ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2); + if (ret) { + dd_dev_err(dd, "Unable to write to PCI config\n"); + return_error = 1; + goto done; + } + + /* step 5h: arm gasket logic */ + /* hold DC in reset across the SBR */ + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK); + (void)read_csr(dd, CCE_DC_CTRL); /* DC reset hold */ + /* save firmware control across the SBR */ + fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL); + + dd_dev_info(dd, "%s: arming gasket logic\n", __func__); + arm_gasket_logic(dd); + + /* + * step 6: quiesce PCIe link + * The chip has already been reset, so there will be no traffic + * from the chip. 
Linux has no easy way to enforce that it will + * not try to access the device, so we just need to hope it doesn't + * do it while we are doing the reset. + */ + + /* + * step 7: initiate the secondary bus reset (SBR) + * step 8: hardware brings the links back up + * step 9: wait for link speed transition to be complete + */ + dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__); + ret = trigger_sbr(dd); + if (ret) + goto done; + + /* step 10: decide what to do next */ + + /* check if we can read PCI space */ + ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor); + if (ret) { + dd_dev_info(dd, + "%s: read of VendorID failed after SBR, err %d\n", + __func__, ret); + return_error = 1; + goto done; + } + if (vendor == 0xffff) { + dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__); + return_error = 1; + ret = -EIO; + goto done; + } + + /* restore PCI space registers we know were reset */ + dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__); + ret = restore_pci_variables(dd); + if (ret) { + dd_dev_err(dd, "%s: Could not restore PCI variables\n", + __func__); + return_error = 1; + goto done; + } + + /* restore firmware control */ + write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl); + + /* + * Check the gasket block status. + * + * This is the first CSR read after the SBR. If the read returns + * all 1s (fails), the link did not make it back. + * + * Once we're sure we can read and write, clear the DC reset after + * the SBR. Then check for any per-lane errors. Then look over + * the status. + */ + reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS); + dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg); + if (reg == ~0ull) { /* PCIe read failed/timeout */ + dd_dev_err(dd, "SBR failed - unable to read from device\n"); + return_error = 1; + ret = -ENOSYS; + goto done; + } + + /* clear the DC reset */ + write_csr(dd, CCE_DC_CTRL, 0); + + /* Set the LED off */ + setextled(dd, 0); + + /* check for any per-lane errors */ + ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32); + if (ret) { + dd_dev_err(dd, "Unable to read from PCI config\n"); + return_error = 1; + goto done; + } + + dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32); + + /* extract status, look for our HFI */ + status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT) + & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK; + if ((status & (1 << dd->hfi1_id)) == 0) { + dd_dev_err(dd, + "%s: gasket status 0x%x, expecting 0x%x\n", + __func__, status, 1 << dd->hfi1_id); + ret = -EIO; + goto done; + } + + /* extract error */ + err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT) + & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK; + if (err) { + dd_dev_err(dd, "%s: gasket error %d\n", __func__, err); + ret = -EIO; + goto done; + } + + /* update our link information cache */ + update_lbus_info(dd); + dd_dev_info(dd, "%s: new speed and width: %s\n", __func__, + dd->lbus_info); + + if (dd->lbus_speed != target_speed) { /* not target */ + /* maybe retry */ + do_retry = retry_count < pcie_retry; + dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n", + pcie_target, do_retry ?
", retrying" : ""); + retry_count++; + if (do_retry) { + msleep(100); /* allow time to settle */ + goto retry; + } + ret = -EIO; + } + +done: + if (therm) { + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1); + msleep(100); + dd_dev_info(dd, "%s: Re-enable therm polling\n", + __func__); + } + release_chip_resource(dd, CR_SBUS); +done_no_mutex: + /* return no error if it is OK to be at current speed */ + if (ret && !return_error) { + dd_dev_err(dd, "Proceeding at current speed PCIe speed\n"); + ret = 0; + } + + dd_dev_info(dd, "%s: done\n", __func__); + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c new file mode 100644 index 0000000..40dac4d --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -0,0 +1,2092 @@ +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include "hfi.h" +#include "qp.h" +#include "trace.h" + +#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */ + +#define SC(name) SEND_CTXT_##name +/* + * Send Context functions + */ +static void sc_wait_for_packet_egress(struct send_context *sc, int pause); + +/* + * Set the CM reset bit and wait for it to clear. Use the provided + * sendctrl register. This routine has no locking. 
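+ *
+ * Editor's note: pio_send_control() invokes this with dd->sendctrl_lock
+ * held (PSC_CM_RESET case), which serializes the set-and-poll sequence
+ * below.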
+ */ +void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl) +{ + write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK); + while (1) { + udelay(1); + sendctrl = read_csr(dd, SEND_CTRL); + if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0) + break; + } +} + +/* defined in header release 48 and higher */ +#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT +#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3 +#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull +#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \ + << SEND_CTRL_UNSUPPORTED_VL_SHIFT) +#endif + +/* global control of PIO send */ +void pio_send_control(struct hfi1_devdata *dd, int op) +{ + u64 reg, mask; + unsigned long flags; + int write = 1; /* write sendctrl back */ + int flush = 0; /* re-read sendctrl to make sure it is flushed */ + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + reg = read_csr(dd, SEND_CTRL); + switch (op) { + case PSC_GLOBAL_ENABLE: + reg |= SEND_CTRL_SEND_ENABLE_SMASK; + /* Fall through */ + case PSC_DATA_VL_ENABLE: + /* Disallow sending on VLs not enabled */ + mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) << + SEND_CTRL_UNSUPPORTED_VL_SHIFT; + reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask; + break; + case PSC_GLOBAL_DISABLE: + reg &= ~SEND_CTRL_SEND_ENABLE_SMASK; + break; + case PSC_GLOBAL_VLARB_ENABLE: + reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK; + break; + case PSC_GLOBAL_VLARB_DISABLE: + reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK; + break; + case PSC_CM_RESET: + __cm_reset(dd, reg); + write = 0; /* CSR already written (and flushed) */ + break; + case PSC_DATA_VL_DISABLE: + reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK; + flush = 1; + break; + default: + dd_dev_err(dd, "%s: invalid control %d\n", __func__, op); + break; + } + + if (write) { + write_csr(dd, SEND_CTRL, reg); + if (flush) + (void)read_csr(dd, SEND_CTRL); /* flush write */ + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* number of send context memory pools */ +#define NUM_SC_POOLS 2 + +/* Send Context Size (SCS) wildcards */ +#define SCS_POOL_0 -1 +#define SCS_POOL_1 -2 + +/* Send Context Count (SCC) wildcards */ +#define SCC_PER_VL -1 +#define SCC_PER_CPU -2 +#define SCC_PER_KRCVQ -3 + +/* Send Context Size (SCS) constants */ +#define SCS_ACK_CREDITS 32 +#define SCS_VL15_CREDITS 102 /* 3 pkts of 2048B data + 128B header */ + +#define PIO_THRESHOLD_CEILING 4096 + +#define PIO_WAIT_BATCH_SIZE 5 + +/* default send context sizes */ +static struct sc_config_sizes sc_config_sizes[SC_MAX] = { + [SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */ + .count = SCC_PER_VL }, /* one per NUMA */ + [SC_ACK] = { .size = SCS_ACK_CREDITS, + .count = SCC_PER_KRCVQ }, + [SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */ + .count = SCC_PER_CPU }, /* one per CPU */ + [SC_VL15] = { .size = SCS_VL15_CREDITS, + .count = 1 }, + +}; + +/* send context memory pool configuration */ +struct mem_pool_config { + int centipercent; /* % of memory, in 100ths of 1% */ + int absolute_blocks; /* absolute block count */ +}; + +/* default memory pool configuration: 100% in pool 0 */ +static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = { + /* centi%, abs blocks */ + { 10000, -1 }, /* pool 0 */ + { 0, -1 }, /* pool 1 */ +}; + +/* memory pool information, used when calculating final sizes */ +struct mem_pool_info { + int centipercent; /* + * 100th of 1% of memory to use, -1 if blocks + * already set + */ + int count; /* count of contexts in the pool */ + int blocks; /* block size of the pool */ + int size; /* context 
size, in blocks */ +}; + +/* + * Convert a pool wildcard to a valid pool index. The wildcards + * start at -1 and increase negatively. Map them as: + * -1 => 0 + * -2 => 1 + * etc. + * + * Return -1 on non-wildcard input, otherwise convert to a pool number. + */ +static int wildcard_to_pool(int wc) +{ + if (wc >= 0) + return -1; /* non-wildcard */ + return -wc - 1; +} + +static const char *sc_type_names[SC_MAX] = { + "kernel", + "ack", + "user", + "vl15" +}; + +static const char *sc_type_name(int index) +{ + if (index < 0 || index >= SC_MAX) + return "unknown"; + return sc_type_names[index]; +} + +/* + * Read the send context memory pool configuration and send context + * size configuration. Replace any wildcards and come up with final + * counts and sizes for the send context types. + */ +int init_sc_pools_and_sizes(struct hfi1_devdata *dd) +{ + struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } }; + int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1; + int total_contexts = 0; + int fixed_blocks; + int pool_blocks; + int used_blocks; + int cp_total; /* centipercent total */ + int ab_total; /* absolute block total */ + int extra; + int i; + + /* + * When SDMA is enabled, kernel context pio packet size is capped by + * "piothreshold". Reduce pio buffer allocation for kernel context by + * setting it to a fixed size. The allocation allows 3-deep buffering + * of the largest pio packets plus up to 128 bytes header, sufficient + * to maintain verbs performance. + * + * When SDMA is disabled, keep the default pooling allocation. + */ + if (HFI1_CAP_IS_KSET(SDMA)) { + u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ? + piothreshold : PIO_THRESHOLD_CEILING; + sc_config_sizes[SC_KERNEL].size = + 3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE; + } + + /* + * Step 0: + * - copy the centipercents/absolute sizes from the pool config + * - sanity check these values + * - add up centipercents, then later check for full value + * - add up absolute blocks, then later check for over-commit + */ + cp_total = 0; + ab_total = 0; + for (i = 0; i < NUM_SC_POOLS; i++) { + int cp = sc_mem_pool_config[i].centipercent; + int ab = sc_mem_pool_config[i].absolute_blocks; + + /* + * A negative value is "unused" or "invalid". 
Both *can* + * be valid, but centipercent wins, so check that first + */ + if (cp >= 0) { /* centipercent valid */ + cp_total += cp; + } else if (ab >= 0) { /* absolute blocks valid */ + ab_total += ab; + } else { /* neither valid */ + dd_dev_err( + dd, + "Send context memory pool %d: both the block count and centipercent are invalid\n", + i); + return -EINVAL; + } + + mem_pool_info[i].centipercent = cp; + mem_pool_info[i].blocks = ab; + } + + /* do not use both % and absolute blocks for different pools */ + if (cp_total != 0 && ab_total != 0) { + dd_dev_err( + dd, + "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n"); + return -EINVAL; + } + + /* if any percentages are present, they must add up to 100% x 100 */ + if (cp_total != 0 && cp_total != 10000) { + dd_dev_err( + dd, + "Send context memory pool centipercent is %d, expecting 10000\n", + cp_total); + return -EINVAL; + } + + /* the absolute pool total cannot be more than the mem total */ + if (ab_total > total_blocks) { + dd_dev_err( + dd, + "Send context memory pool absolute block count %d is larger than the memory size %d\n", + ab_total, total_blocks); + return -EINVAL; + } + + /* + * Step 2: + * - copy from the context size config + * - replace context type wildcard counts with real values + * - add up non-memory pool block sizes + * - add up memory pool user counts + */ + fixed_blocks = 0; + for (i = 0; i < SC_MAX; i++) { + int count = sc_config_sizes[i].count; + int size = sc_config_sizes[i].size; + int pool; + + /* + * Sanity check count: Either a positive value or + * one of the expected wildcards is valid. The positive + * value is checked later when we compare against total + * memory available. + */ + if (i == SC_ACK) { + count = dd->n_krcv_queues; + } else if (i == SC_KERNEL) { + count = INIT_SC_PER_VL * num_vls; + } else if (count == SCC_PER_CPU) { + count = dd->num_rcv_contexts - dd->n_krcv_queues; + } else if (count < 0) { + dd_dev_err( + dd, + "%s send context invalid count wildcard %d\n", + sc_type_name(i), count); + return -EINVAL; + } + if (total_contexts + count > dd->chip_send_contexts) + count = dd->chip_send_contexts - total_contexts; + + total_contexts += count; + + /* + * Sanity check pool: The conversion will return a pool + * number or -1 if a fixed (non-negative) value. The fixed + * value is checked later when we compare against + * total memory available. 
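+ *
+ * For example, a size of SCS_POOL_0 (-1) maps to pool 0 and SCS_POOL_1
+ * (-2) to pool 1, while a fixed size such as SCS_ACK_CREDITS (32) makes
+ * wildcard_to_pool() return -1 and is accumulated into fixed_blocks
+ * instead.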
+ */ + pool = wildcard_to_pool(size); + if (pool == -1) { /* non-wildcard */ + fixed_blocks += size * count; + } else if (pool < NUM_SC_POOLS) { /* valid wildcard */ + mem_pool_info[pool].count += count; + } else { /* invalid wildcard */ + dd_dev_err( + dd, + "%s send context invalid pool wildcard %d\n", + sc_type_name(i), size); + return -EINVAL; + } + + dd->sc_sizes[i].count = count; + dd->sc_sizes[i].size = size; + } + if (fixed_blocks > total_blocks) { + dd_dev_err( + dd, + "Send context fixed block count, %u, larger than total block count %u\n", + fixed_blocks, total_blocks); + return -EINVAL; + } + + /* step 3: calculate the blocks in the pools, and pool context sizes */ + pool_blocks = total_blocks - fixed_blocks; + if (ab_total > pool_blocks) { + dd_dev_err( + dd, + "Send context fixed pool sizes, %u, larger than pool block count %u\n", + ab_total, pool_blocks); + return -EINVAL; + } + /* subtract off the fixed pool blocks */ + pool_blocks -= ab_total; + + for (i = 0; i < NUM_SC_POOLS; i++) { + struct mem_pool_info *pi = &mem_pool_info[i]; + + /* % beats absolute blocks */ + if (pi->centipercent >= 0) + pi->blocks = (pool_blocks * pi->centipercent) / 10000; + + if (pi->blocks == 0 && pi->count != 0) { + dd_dev_err( + dd, + "Send context memory pool %d has %u contexts, but no blocks\n", + i, pi->count); + return -EINVAL; + } + if (pi->count == 0) { + /* warn about wasted blocks */ + if (pi->blocks != 0) + dd_dev_err( + dd, + "Send context memory pool %d has %u blocks, but zero contexts\n", + i, pi->blocks); + pi->size = 0; + } else { + pi->size = pi->blocks / pi->count; + } + } + + /* step 4: fill in the context type sizes from the pool sizes */ + used_blocks = 0; + for (i = 0; i < SC_MAX; i++) { + if (dd->sc_sizes[i].size < 0) { + unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size); + + WARN_ON_ONCE(pool >= NUM_SC_POOLS); + dd->sc_sizes[i].size = mem_pool_info[pool].size; + } + /* make sure we are not larger than what is allowed by the HW */ +#define PIO_MAX_BLOCKS 1024 + if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS) + dd->sc_sizes[i].size = PIO_MAX_BLOCKS; + + /* calculate our total usage */ + used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count; + } + extra = total_blocks - used_blocks; + if (extra != 0) + dd_dev_info(dd, "unused send context blocks: %d\n", extra); + + return total_contexts; +} + +int init_send_contexts(struct hfi1_devdata *dd) +{ + u16 base; + int ret, i, j, context; + + ret = init_credit_return(dd); + if (ret) + return ret; + + dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8), + GFP_KERNEL); + dd->send_contexts = kcalloc(dd->num_send_contexts, + sizeof(struct send_context_info), + GFP_KERNEL); + if (!dd->send_contexts || !dd->hw_to_sw) { + kfree(dd->hw_to_sw); + kfree(dd->send_contexts); + free_credit_return(dd); + return -ENOMEM; + } + + /* hardware context map starts with invalid send context indices */ + for (i = 0; i < TXE_NUM_CONTEXTS; i++) + dd->hw_to_sw[i] = INVALID_SCI; + + /* + * All send contexts have their credit sizes. Allocate credits + * for each context one after another from the global space. + */ + context = 0; + base = 1; + for (i = 0; i < SC_MAX; i++) { + struct sc_config_sizes *scs = &dd->sc_sizes[i]; + + for (j = 0; j < scs->count; j++) { + struct send_context_info *sci = + &dd->send_contexts[context]; + sci->type = i; + sci->base = base; + sci->credits = scs->size; + + context++; + base += scs->size; + } + } + + return 0; +} + +/* + * Allocate a software index and hardware context of the given type. 
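+ * The hardware context is chosen as (dd->chip_send_contexts - index - 1),
+ * a 1:1 mapping that is deliberately different from the software index.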
+ * + * Must be called with dd->sc_lock held. + */ +static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index, + u32 *hw_context) +{ + struct send_context_info *sci; + u32 index; + u32 context; + + for (index = 0, sci = &dd->send_contexts[0]; + index < dd->num_send_contexts; index++, sci++) { + if (sci->type == type && sci->allocated == 0) { + sci->allocated = 1; + /* use a 1:1 mapping, but make them non-equal */ + context = dd->chip_send_contexts - index - 1; + dd->hw_to_sw[context] = index; + *sw_index = index; + *hw_context = context; + return 0; /* success */ + } + } + dd_dev_err(dd, "Unable to locate a free type %d send context\n", type); + return -ENOSPC; +} + +/* + * Free the send context given by its software index. + * + * Must be called with dd->sc_lock held. + */ +static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context) +{ + struct send_context_info *sci; + + sci = &dd->send_contexts[sw_index]; + if (!sci->allocated) { + dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n", + __func__, sw_index, hw_context); + } + sci->allocated = 0; + dd->hw_to_sw[hw_context] = INVALID_SCI; +} + +/* return the base context of a context in a group */ +static inline u32 group_context(u32 context, u32 group) +{ + return (context >> group) << group; +} + +/* return the size of a group */ +static inline u32 group_size(u32 group) +{ + return 1 << group; +} + +/* + * Obtain the credit return addresses, kernel virtual and bus, for the + * given sc. + * + * To understand this routine: + * o va and dma are arrays of struct credit_return. One for each physical + * send context, per NUMA. + * o Each send context always looks in its relative location in a struct + * credit_return for its credit return. + * o Each send context in a group must have its return address CSR programmed + * with the same value. Use the address of the first send context in the + * group. + */ +static void cr_group_addresses(struct send_context *sc, dma_addr_t *dma) +{ + u32 gc = group_context(sc->hw_context, sc->group); + u32 index = sc->hw_context & 0x7; + + sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index]; + *dma = (unsigned long) + &((struct credit_return *)sc->dd->cr_base[sc->node].dma)[gc]; +} + +/* + * Work queue function triggered in error interrupt routine for + * kernel contexts. + */ +static void sc_halted(struct work_struct *work) +{ + struct send_context *sc; + + sc = container_of(work, struct send_context, halt_work); + sc_restart(sc); +} + +/* + * Calculate PIO block threshold for this send context using the given MTU. + * Trigger a return when one MTU plus optional header of credits remain. + * + * Parameter mtu is in bytes. + * Parameter hdrqentsize is in DWORDs. + * + * Return value is what to write into the CSR: trigger return when + * unreturned credits pass this count. + */ +u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize) +{ + u32 release_credits; + u32 threshold; + + /* add in the header size, then divide by the PIO block size */ + mtu += hdrqentsize << 2; + release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE); + + /* check against this context's credits */ + if (sc->credits <= release_credits) + threshold = 1; + else + threshold = sc->credits - release_credits; + + return threshold; +} + +/* + * Calculate credit threshold in terms of percent of the allocated credits. + * Trigger when unreturned credits equal or exceed the percentage of the whole. 
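+ * For example, a context with 160 credits and percent = 50 yields a
+ * threshold of (160 * 50) / 100 = 80.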
+ * + * Return value is what to write into the CSR: trigger return when + * unreturned credits pass this count. + */ +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent) +{ + return (sc->credits * percent) / 100; +} + +/* + * Set the credit return threshold. + */ +void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold) +{ + unsigned long flags; + u32 old_threshold; + int force_return = 0; + + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + + old_threshold = (sc->credit_ctrl >> + SC(CREDIT_CTRL_THRESHOLD_SHIFT)) + & SC(CREDIT_CTRL_THRESHOLD_MASK); + + if (new_threshold != old_threshold) { + sc->credit_ctrl = + (sc->credit_ctrl + & ~SC(CREDIT_CTRL_THRESHOLD_SMASK)) + | ((new_threshold + & SC(CREDIT_CTRL_THRESHOLD_MASK)) + << SC(CREDIT_CTRL_THRESHOLD_SHIFT)); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + + /* force a credit return on change to avoid a possible stall */ + force_return = 1; + } + + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); + + if (force_return) + sc_return_credits(sc); +} + +/* + * set_pio_integrity + * + * Set the CHECK_ENABLE register for the send context 'sc'. + */ +void set_pio_integrity(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + u32 hw_context = sc->hw_context; + int type = sc->type; + + write_kctxt_csr(dd, hw_context, + SC(CHECK_ENABLE), + hfi1_pkt_default_send_ctxt_mask(dd, type)); +} + +static u32 get_buffers_allocated(struct send_context *sc) +{ + int cpu; + u32 ret = 0; + + for_each_possible_cpu(cpu) + ret += *per_cpu_ptr(sc->buffers_allocated, cpu); + return ret; +} + +static void reset_buffers_allocated(struct send_context *sc) +{ + int cpu; + + for_each_possible_cpu(cpu) + (*per_cpu_ptr(sc->buffers_allocated, cpu)) = 0; +} + +/* + * Allocate a NUMA relative send context structure of the given type along + * with a HW context. 
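+ *
+ * The returned context is fully programmed but left disabled; callers
+ * bring it up separately with sc_enable() (see init_pervl_scs() below).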
+ */ +struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, + uint hdrqentsize, int numa) +{ + struct send_context_info *sci; + struct send_context *sc = NULL; + dma_addr_t dma; + unsigned long flags; + u64 reg; + u32 thresh; + u32 sw_index; + u32 hw_context; + int ret; + u8 opval, opmask; + + /* do not allocate while frozen */ + if (dd->flags & HFI1_FROZEN) + return NULL; + + sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa); + if (!sc) + return NULL; + + sc->buffers_allocated = alloc_percpu(u32); + if (!sc->buffers_allocated) { + kfree(sc); + dd_dev_err(dd, + "Cannot allocate buffers_allocated per cpu counters\n" + ); + return NULL; + } + + spin_lock_irqsave(&dd->sc_lock, flags); + ret = sc_hw_alloc(dd, type, &sw_index, &hw_context); + if (ret) { + spin_unlock_irqrestore(&dd->sc_lock, flags); + free_percpu(sc->buffers_allocated); + kfree(sc); + return NULL; + } + + sci = &dd->send_contexts[sw_index]; + sci->sc = sc; + + sc->dd = dd; + sc->node = numa; + sc->type = type; + spin_lock_init(&sc->alloc_lock); + spin_lock_init(&sc->release_lock); + spin_lock_init(&sc->credit_ctrl_lock); + INIT_LIST_HEAD(&sc->piowait); + INIT_WORK(&sc->halt_work, sc_halted); + init_waitqueue_head(&sc->halt_wait); + + /* grouping is always single context for now */ + sc->group = 0; + + sc->sw_index = sw_index; + sc->hw_context = hw_context; + cr_group_addresses(sc, &dma); + sc->credits = sci->credits; + sc->size = sc->credits * PIO_BLOCK_SIZE; + +/* PIO Send Memory Address details */ +#define PIO_ADDR_CONTEXT_MASK 0xfful +#define PIO_ADDR_CONTEXT_SHIFT 16 + sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK) + << PIO_ADDR_CONTEXT_SHIFT); + + /* set base and credits */ + reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK)) + << SC(CTRL_CTXT_DEPTH_SHIFT)) + | ((sci->base & SC(CTRL_CTXT_BASE_MASK)) + << SC(CTRL_CTXT_BASE_SHIFT)); + write_kctxt_csr(dd, hw_context, SC(CTRL), reg); + + set_pio_integrity(sc); + + /* unmask all errors */ + write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1); + + /* set the default partition key */ + write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), + (SC(CHECK_PARTITION_KEY_VALUE_MASK) & + DEFAULT_PKEY) << + SC(CHECK_PARTITION_KEY_VALUE_SHIFT)); + + /* per context type checks */ + if (type == SC_USER) { + opval = USER_OPCODE_CHECK_VAL; + opmask = USER_OPCODE_CHECK_MASK; + } else { + opval = OPCODE_CHECK_VAL_DISABLED; + opmask = OPCODE_CHECK_MASK_DISABLED; + } + + /* set the send context check opcode mask and value */ + write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), + ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) | + ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT))); + + /* set up credit return */ + reg = dma & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK); + write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg); + + /* + * Calculate the initial credit return threshold. + * + * For Ack contexts, set a threshold for half the credits. + * For User contexts use the given percentage. This has been + * sanitized on driver start-up. + * For Kernel contexts, use the default MTU plus a header + * or half the credits, whichever is smaller. This should + * work for both the 3-deep buffering allocation and the + * pooling allocation. 
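+ *
+ * Illustration only (assumes a 64-byte PIO block): a 10240-byte MTU with
+ * a 32-dword header gives DIV_ROUND_UP(10240 + 128, 64) = 162 release
+ * credits, so sc_mtu_to_threshold() returns credits - 162 for a context
+ * larger than that, else 1.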
+ */ + if (type == SC_ACK) { + thresh = sc_percent_to_threshold(sc, 50); + } else if (type == SC_USER) { + thresh = sc_percent_to_threshold(sc, + user_credit_return_threshold); + } else { /* kernel */ + thresh = min(sc_percent_to_threshold(sc, 50), + sc_mtu_to_threshold(sc, hfi1_max_mtu, + hdrqentsize)); + } + reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT); + /* add in early return */ + if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN)) + reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK); + else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */ + reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK); + + /* set up write-through credit_ctrl */ + sc->credit_ctrl = reg; + write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg); + + /* User send contexts should not allow sending on VL15 */ + if (type == SC_USER) { + reg = 1ULL << 15; + write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg); + } + + spin_unlock_irqrestore(&dd->sc_lock, flags); + + /* + * Allocate shadow ring to track outstanding PIO buffers _after_ + * unlocking. We don't know the size until the lock is held and + * we can't allocate while the lock is held. No one is using + * the context yet, so allocate it now. + * + * User contexts do not get a shadow ring. + */ + if (type != SC_USER) { + /* + * Size the shadow ring 1 larger than the number of credits + * so head == tail can mean empty. + */ + sc->sr_size = sci->credits + 1; + sc->sr = kcalloc_node(sc->sr_size, + sizeof(union pio_shadow_ring), + GFP_KERNEL, numa); + if (!sc->sr) { + sc_free(sc); + return NULL; + } + } + + hfi1_cdbg(PIO, + "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n", + sw_index, + hw_context, + sc_type_name(type), + sc->group, + sc->credits, + sc->credit_ctrl, + thresh); + + return sc; +} + +/* free a per-NUMA send context structure */ +void sc_free(struct send_context *sc) +{ + struct hfi1_devdata *dd; + unsigned long flags; + u32 sw_index; + u32 hw_context; + + if (!sc) + return; + + sc->flags |= SCF_IN_FREE; /* ensure no restarts */ + dd = sc->dd; + if (!list_empty(&sc->piowait)) + dd_dev_err(dd, "piowait list not empty!\n"); + sw_index = sc->sw_index; + hw_context = sc->hw_context; + sc_disable(sc); /* make sure the HW is disabled */ + flush_work(&sc->halt_work); + + spin_lock_irqsave(&dd->sc_lock, flags); + dd->send_contexts[sw_index].sc = NULL; + + /* clear/disable all registers set in sc_alloc */ + write_kctxt_csr(dd, hw_context, SC(CTRL), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0); + write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0); + write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0); + write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0); + + /* release the index and context for re-use */ + sc_hw_free(dd, sw_index, hw_context); + spin_unlock_irqrestore(&dd->sc_lock, flags); + + kfree(sc->sr); + free_percpu(sc->buffers_allocated); + kfree(sc); +} + +/* disable the context */ +void sc_disable(struct send_context *sc) +{ + u64 reg; + unsigned long flags; + struct pio_buf *pbuf; + + if (!sc) + return; + + /* do all steps, even if already disabled */ + spin_lock_irqsave(&sc->alloc_lock, flags); + reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL)); + reg &= ~SC(CTRL_CTXT_ENABLE_SMASK); + sc->flags &= ~SCF_ENABLED; + sc_wait_for_packet_egress(sc, 1); + write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg); + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + /* + * Flush any 
waiters. Once the context is disabled, + * credit return interrupts are stopped (although there + * could be one in-process when the context is disabled). + * Wait one microsecond for any lingering interrupts, then + * proceed with the flush. + */ + udelay(1); + spin_lock_irqsave(&sc->release_lock, flags); + if (sc->sr) { /* this context has a shadow ring */ + while (sc->sr_tail != sc->sr_head) { + pbuf = &sc->sr[sc->sr_tail].pbuf; + if (pbuf->cb) + (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE); + sc->sr_tail++; + if (sc->sr_tail >= sc->sr_size) + sc->sr_tail = 0; + } + } + spin_unlock_irqrestore(&sc->release_lock, flags); +} + +/* return SendEgressCtxtStatus.PacketOccupancy */ +#define packet_occupancy(r) \ + (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\ + >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT) + +/* is egress halted on the context? */ +#define egress_halted(r) \ + ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK) + +/* wait for packet egress, optionally pause for credit return */ +static void sc_wait_for_packet_egress(struct send_context *sc, int pause) +{ + struct hfi1_devdata *dd = sc->dd; + u64 reg = 0; + u64 reg_prev; + u32 loop = 0; + + while (1) { + reg_prev = reg; + reg = read_csr(dd, sc->hw_context * 8 + + SEND_EGRESS_CTXT_STATUS); + /* done if egress is stopped */ + if (egress_halted(reg)) + break; + reg = packet_occupancy(reg); + if (reg == 0) + break; + /* counter is reset if occupancy count changes */ + if (reg != reg_prev) + loop = 0; + if (loop > 50000) { + /* timed out - bounce the link */ + dd_dev_err(dd, + "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n", + __func__, sc->sw_index, + sc->hw_context, (u32)reg); + queue_work(dd->pport->link_wq, + &dd->pport->link_bounce_work); + break; + } + loop++; + udelay(1); + } + + if (pause) + /* Add additional delay to ensure chip returns all credits */ + pause_for_credit_return(dd); +} + +void sc_wait(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + struct send_context *sc = dd->send_contexts[i].sc; + + if (!sc) + continue; + sc_wait_for_packet_egress(sc, 0); + } +} + +/* + * Restart a context after it has been halted due to error. + * + * If the first step fails - wait for the halt to be asserted, return early. + * Otherwise complain about timeouts but keep going. + * + * It is expected that allocations (enabled flag bit) have been shut off + * already (only applies to kernel contexts). + */ +int sc_restart(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + u64 reg; + u32 loop; + int count; + + /* bounce off if not halted, or being free'd */ + if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE)) + return -EINVAL; + + dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index, + sc->hw_context); + + /* + * Step 1: Wait for the context to actually halt. + * + * The error interrupt is asynchronous to actually setting halt + * on the context. + */ + loop = 0; + while (1) { + reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS)); + if (reg & SC(STATUS_CTXT_HALTED_SMASK)) + break; + if (loop > 100) { + dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n", + __func__, sc->sw_index, sc->hw_context); + return -ETIME; + } + loop++; + udelay(1); + } + + /* + * Step 2: Ensure no users are still trying to write to PIO. + * + * For kernel contexts, we have already turned off buffer allocation. + * Now wait for the buffer count to go to zero. 
+ * + * For user contexts, the user handling code has cut off write access + * to the context's PIO pages before calling this routine and will + * restore write access after this routine returns. + */ + if (sc->type != SC_USER) { + /* kernel context */ + loop = 0; + while (1) { + count = get_buffers_allocated(sc); + if (count == 0) + break; + if (loop > 100) { + dd_dev_err(dd, + "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n", + __func__, sc->sw_index, + sc->hw_context, count); + } + loop++; + udelay(1); + } + } + + /* + * Step 3: Wait for all packets to egress. + * This is done while disabling the send context + * + * Step 4: Disable the context + * + * This is a superset of the halt. After the disable, the + * errors can be cleared. + */ + sc_disable(sc); + + /* + * Step 5: Enable the context + * + * This enable will clear the halted flag and per-send context + * error flags. + */ + return sc_enable(sc); +} + +/* + * PIO freeze processing. To be called after the TXE block is fully frozen. + * Go through all frozen send contexts and disable them. The contexts are + * already stopped by the freeze. + */ +void pio_freeze(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + /* + * Don't disable unallocated, unfrozen, or user send contexts. + * User send contexts will be disabled when the process + * calls into the driver to reset its context. + */ + if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER) + continue; + + /* only need to disable, the context is already stopped */ + sc_disable(sc); + } +} + +/* + * Unfreeze PIO for kernel send contexts. The precondition for calling this + * is that all PIO send contexts have been disabled and the SPC freeze has + * been cleared. Now perform the last step and re-enable each kernel context. + * User (PSM) processing will occur when PSM calls into the kernel to + * acknowledge the freeze. + */ +void pio_kernel_unfreeze(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER) + continue; + + sc_enable(sc); /* will clear the sc frozen flag */ + } +} + +/* + * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear. + * Returns: + * -ETIMEDOUT - if we wait too long + * -EIO - if there was an error + */ +static int pio_init_wait_progress(struct hfi1_devdata *dd) +{ + u64 reg; + int max, count = 0; + + /* max is the longest possible HW init time / delay */ + max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5; + while (1) { + reg = read_csr(dd, SEND_PIO_INIT_CTXT); + if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK)) + break; + if (count >= max) + return -ETIMEDOUT; + udelay(5); + count++; + } + + return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0; +} + +/* + * Reset all of the send contexts to their power-on state. Used + * only during manual init - no lock against sc_enable needed. 
+ */ +void pio_reset_all(struct hfi1_devdata *dd) +{ + int ret; + + /* make sure the init engine is not busy */ + ret = pio_init_wait_progress(dd); + /* ignore any timeout */ + if (ret == -EIO) { + /* clear the error */ + write_csr(dd, SEND_PIO_ERR_CLEAR, + SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK); + } + + /* reset init all */ + write_csr(dd, SEND_PIO_INIT_CTXT, + SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK); + udelay(2); + ret = pio_init_wait_progress(dd); + if (ret < 0) { + dd_dev_err(dd, + "PIO send context init %s while initializing all PIO blocks\n", + ret == -ETIMEDOUT ? "is stuck" : "had an error"); + } +} + +/* enable the context */ +int sc_enable(struct send_context *sc) +{ + u64 sc_ctrl, reg, pio; + struct hfi1_devdata *dd; + unsigned long flags; + int ret = 0; + + if (!sc) + return -EINVAL; + dd = sc->dd; + + /* + * Obtain the allocator lock to guard against any allocation + * attempts (which should not happen prior to context being + * enabled). On the release/disable side we don't need to + * worry about locking since the releaser will not do anything + * if the context accounting values have not changed. + */ + spin_lock_irqsave(&sc->alloc_lock, flags); + sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL)); + if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK))) + goto unlock; /* already enabled */ + + /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */ + + *sc->hw_free = 0; + sc->free = 0; + sc->alloc_free = 0; + sc->fill = 0; + sc->fill_wrap = 0; + sc->sr_head = 0; + sc->sr_tail = 0; + sc->flags = 0; + /* the alloc lock insures no fast path allocation */ + reset_buffers_allocated(sc); + + /* + * Clear all per-context errors. Some of these will be set when + * we are re-enabling after a context halt. Now that the context + * is disabled, the halt will not clear until after the PIO init + * engine runs below. + */ + reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS)); + if (reg) + write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg); + + /* + * The HW PIO initialization engine can handle only one init + * request at a time. Serialize access to each device's engine. + */ + spin_lock(&dd->sc_init_lock); + /* + * Since access to this code block is serialized and + * each access waits for the initialization to complete + * before releasing the lock, the PIO initialization engine + * should not be in use, so we don't have to wait for the + * InProgress bit to go down. + */ + pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) << + SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) | + SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK; + write_csr(dd, SEND_PIO_INIT_CTXT, pio); + /* + * Wait until the engine is done. Give the chip the required time + * so, hopefully, we read the register just once. + */ + udelay(2); + ret = pio_init_wait_progress(dd); + spin_unlock(&dd->sc_init_lock); + if (ret) { + dd_dev_err(dd, + "sctxt%u(%u): Context not enabled due to init failure %d\n", + sc->sw_index, sc->hw_context, ret); + goto unlock; + } + + /* + * All is well. Enable the context. + */ + sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK); + write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl); + /* + * Read SendCtxtCtrl to force the write out and prevent a timing + * hazard where a PIO write may reach the context before the enable. 
+ */ + read_kctxt_csr(dd, sc->hw_context, SC(CTRL)); + sc->flags |= SCF_ENABLED; + +unlock: + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + return ret; +} + +/* force a credit return on the context */ +void sc_return_credits(struct send_context *sc) +{ + if (!sc) + return; + + /* a 0->1 transition schedules a credit return */ + write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), + SC(CREDIT_FORCE_FORCE_RETURN_SMASK)); + /* + * Ensure that the write is flushed and the credit return is + * scheduled. We care more about the 0 -> 1 transition. + */ + read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE)); + /* set back to 0 for next time */ + write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0); +} + +/* allow all in-flight packets to drain on the context */ +void sc_flush(struct send_context *sc) +{ + if (!sc) + return; + + sc_wait_for_packet_egress(sc, 1); +} + +/* drop all packets on the context, no waiting until they are sent */ +void sc_drop(struct send_context *sc) +{ + if (!sc) + return; + + dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n", + __func__, sc->sw_index, sc->hw_context); +} + +/* + * Start the software reaction to a context halt or SPC freeze: + * - mark the context as halted or frozen + * - stop buffer allocations + * + * Called from the error interrupt. Other work is deferred until + * out of the interrupt. + */ +void sc_stop(struct send_context *sc, int flag) +{ + unsigned long flags; + + /* mark the context */ + sc->flags |= flag; + + /* stop buffer allocations */ + spin_lock_irqsave(&sc->alloc_lock, flags); + sc->flags &= ~SCF_ENABLED; + spin_unlock_irqrestore(&sc->alloc_lock, flags); + wake_up(&sc->halt_wait); +} + +#define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32)) +#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS) + +/* + * The send context buffer "allocator". + * + * @sc: the PIO send context we are allocating from + * @len: length of whole packet - including PBC - in dwords + * @cb: optional callback to call when the buffer is finished sending + * @arg: argument for cb + * + * Return a pointer to a PIO buffer if successful, NULL if not enough room. 
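+ *
+ * Editor's note: a kernel sender that gets NULL back typically queues on
+ * sc->piowait and requests a credit return interrupt via
+ * hfi1_sc_wantpiobuf_intr(); sc_piobufavail() wakes the waiters once
+ * sc_release_update() processes the returned credits.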
+ */ +struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, + pio_release_cb cb, void *arg) +{ + struct pio_buf *pbuf = NULL; + unsigned long flags; + unsigned long avail; + unsigned long blocks = dwords_to_blocks(dw_len); + u32 fill_wrap; + int trycount = 0; + u32 head, next; + + spin_lock_irqsave(&sc->alloc_lock, flags); + if (!(sc->flags & SCF_ENABLED)) { + spin_unlock_irqrestore(&sc->alloc_lock, flags); + goto done; + } + +retry: + avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free); + if (blocks > avail) { + /* not enough room */ + if (unlikely(trycount)) { /* already tried to get more room */ + spin_unlock_irqrestore(&sc->alloc_lock, flags); + goto done; + } + /* copy from receiver cache line and recalculate */ + sc->alloc_free = READ_ONCE(sc->free); + avail = + (unsigned long)sc->credits - + (sc->fill - sc->alloc_free); + if (blocks > avail) { + /* still no room, actively update */ + sc_release_update(sc); + sc->alloc_free = READ_ONCE(sc->free); + trycount++; + goto retry; + } + } + + /* there is enough room */ + + preempt_disable(); + this_cpu_inc(*sc->buffers_allocated); + + /* read this once */ + head = sc->sr_head; + + /* "allocate" the buffer */ + sc->fill += blocks; + fill_wrap = sc->fill_wrap; + sc->fill_wrap += blocks; + if (sc->fill_wrap >= sc->credits) + sc->fill_wrap = sc->fill_wrap - sc->credits; + + /* + * Fill the parts that the releaser looks at before moving the head. + * The only necessary piece is the sent_at field. The credits + * we have just allocated cannot have been returned yet, so the + * cb and arg will not be looked at for a "while". Put them + * on this side of the memory barrier anyway. + */ + pbuf = &sc->sr[head].pbuf; + pbuf->sent_at = sc->fill; + pbuf->cb = cb; + pbuf->arg = arg; + pbuf->sc = sc; /* could be filled in at sc->sr init time */ + /* make sure this is in memory before updating the head */ + + /* calculate next head index, do not store */ + next = head + 1; + if (next >= sc->sr_size) + next = 0; + /* + * update the head - must be last! - the releaser can look at fields + * in pbuf once we move the head + */ + smp_wmb(); + sc->sr_head = next; + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + /* finish filling in the buffer outside the lock */ + pbuf->start = sc->base_addr + fill_wrap * PIO_BLOCK_SIZE; + pbuf->end = sc->base_addr + sc->size; + pbuf->qw_written = 0; + pbuf->carry_bytes = 0; + pbuf->carry.val64 = 0; +done: + return pbuf; +} + +/* + * There are at least two entities that can turn on credit return + * interrupts and they can overlap. Avoid problems by implementing + * a count scheme that is enforced by a lock. The lock is needed because + * the count and CSR write must be paired. + */ + +/* + * Start credit return interrupts. This is managed by a count. If already + * on, just increment the count. + */ +void sc_add_credit_return_intr(struct send_context *sc) +{ + unsigned long flags; + + /* lock must surround both the count change and the CSR update */ + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + if (sc->credit_intr_count == 0) { + sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + } + sc->credit_intr_count++; + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); +} + +/* + * Stop credit return interrupts. This is managed by a count. Decrement the + * count, if the last user, then turn the credit interrupts off. 
+ */ +void sc_del_credit_return_intr(struct send_context *sc) +{ + unsigned long flags; + + WARN_ON(sc->credit_intr_count == 0); + + /* lock must surround both the count change and the CSR update */ + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + sc->credit_intr_count--; + if (sc->credit_intr_count == 0) { + sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + } + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); +} + +/* + * The caller must be careful when calling this. All needint calls + * must be paired with !needint. + */ +void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint) +{ + if (needint) + sc_add_credit_return_intr(sc); + else + sc_del_credit_return_intr(sc); + trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl); + if (needint) { + mmiowb(); + sc_return_credits(sc); + } +} + +/** + * sc_piobufavail - callback when a PIO buffer is available + * @sc: the send context + * + * This is called from the interrupt handler when a PIO buffer is + * available after hfi1_verbs_send() returned an error that no buffers were + * available. Disable the interrupt if there are no more QPs waiting. + */ +static void sc_piobufavail(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + struct hfi1_ibdev *dev = &dd->verbs_dev; + struct list_head *list; + struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE]; + struct rvt_qp *qp; + struct hfi1_qp_priv *priv; + unsigned long flags; + uint i, n = 0, max_idx = 0; + u8 max_starved_cnt = 0; + + if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && + dd->send_contexts[sc->sw_index].type != SC_VL15) + return; + list = &sc->piowait; + /* + * Note: checking that the piowait list is empty and clearing + * the buffer available interrupt needs to be atomic or we + * could end up with QPs on the wait list with the interrupt + * disabled. + */ + write_seqlock_irqsave(&dev->iowait_lock, flags); + while (!list_empty(list)) { + struct iowait *wait; + + if (n == ARRAY_SIZE(qps)) + break; + wait = list_first_entry(list, struct iowait, list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + priv->s_iowait.lock = NULL; + iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx); + /* refcount held until actual wake up */ + qps[n++] = qp; + } + /* + * If there had been waiters and there are more + * insure that we redo the force to avoid a potential hang. + */ + if (n) { + hfi1_sc_wantpiobuf_intr(sc, 0); + if (!list_empty(list)) + hfi1_sc_wantpiobuf_intr(sc, 1); + } + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + + /* Wake up the most starved one first */ + if (n) + hfi1_qp_wakeup(qps[max_idx], + RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN); + for (i = 0; i < n; i++) + if (i != max_idx) + hfi1_qp_wakeup(qps[i], + RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN); +} + +/* translate a send credit update to a bit code of reasons */ +static inline int fill_code(u64 hw_free) +{ + int code = 0; + + if (hw_free & CR_STATUS_SMASK) + code |= PRC_STATUS_ERR; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK) + code |= PRC_PBC; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK) + code |= PRC_THRESHOLD; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK) + code |= PRC_FILL_ERR; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK) + code |= PRC_SC_DISABLE; + return code; +} + +/* use the jiffies compare to get the wrap right */ +#define sent_before(a, b) time_before(a, b) /* a < b */ + +/* + * The send context buffer "releaser". 
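+ *
+ * The free count is advanced by the wrapped difference between the
+ * hardware CR_COUNTER field and the low bits of the previous free value;
+ * completion callbacks then run for every shadow ring entry whose
+ * sent_at has been passed.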
+ */ +void sc_release_update(struct send_context *sc) +{ + struct pio_buf *pbuf; + u64 hw_free; + u32 head, tail; + unsigned long old_free; + unsigned long free; + unsigned long extra; + unsigned long flags; + int code; + + if (!sc) + return; + + spin_lock_irqsave(&sc->release_lock, flags); + /* update free */ + hw_free = le64_to_cpu(*sc->hw_free); /* volatile read */ + old_free = sc->free; + extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT) + - (old_free & CR_COUNTER_MASK)) + & CR_COUNTER_MASK; + free = old_free + extra; + trace_hfi1_piofree(sc, extra); + + /* call sent buffer callbacks */ + code = -1; /* code not yet set */ + head = READ_ONCE(sc->sr_head); /* snapshot the head */ + tail = sc->sr_tail; + while (head != tail) { + pbuf = &sc->sr[tail].pbuf; + + if (sent_before(free, pbuf->sent_at)) { + /* not sent yet */ + break; + } + if (pbuf->cb) { + if (code < 0) /* fill in code on first user */ + code = fill_code(hw_free); + (*pbuf->cb)(pbuf->arg, code); + } + + tail++; + if (tail >= sc->sr_size) + tail = 0; + } + sc->sr_tail = tail; + /* make sure tail is updated before free */ + smp_wmb(); + sc->free = free; + spin_unlock_irqrestore(&sc->release_lock, flags); + sc_piobufavail(sc); +} + +/* + * Send context group releaser. Argument is the send context that caused + * the interrupt. Called from the send context interrupt handler. + * + * Call release on all contexts in the group. + * + * This routine takes the sc_lock without an irqsave because it is only + * called from an interrupt handler. Adjust if that changes. + */ +void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context) +{ + struct send_context *sc; + u32 sw_index; + u32 gc, gc_end; + + spin_lock(&dd->sc_lock); + sw_index = dd->hw_to_sw[hw_context]; + if (unlikely(sw_index >= dd->num_send_contexts)) { + dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n", + __func__, hw_context, sw_index); + goto done; + } + sc = dd->send_contexts[sw_index].sc; + if (unlikely(!sc)) + goto done; + + gc = group_context(hw_context, sc->group); + gc_end = gc + group_size(sc->group); + for (; gc < gc_end; gc++) { + sw_index = dd->hw_to_sw[gc]; + if (unlikely(sw_index >= dd->num_send_contexts)) { + dd_dev_err(dd, + "%s: invalid hw (%u) to sw (%u) mapping\n", + __func__, hw_context, sw_index); + continue; + } + sc_release_update(dd->send_contexts[sw_index].sc); + } +done: + spin_unlock(&dd->sc_lock); +} + +/* + * pio_select_send_context_vl() - select send context + * @dd: devdata + * @selector: a spreading factor + * @vl: this vl + * + * This function returns a send context based on the selector and a vl. + * The mapping fields are protected by RCU + */ +struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd, + u32 selector, u8 vl) +{ + struct pio_vl_map *m; + struct pio_map_elem *e; + struct send_context *rval; + + /* + * NOTE This should only happen if SC->VL changed after the initial + * checks on the QP/AH + * Default will return VL0's send context below + */ + if (unlikely(vl >= num_vls)) { + rval = NULL; + goto done; + } + + rcu_read_lock(); + m = rcu_dereference(dd->pio_map); + if (unlikely(!m)) { + rcu_read_unlock(); + return dd->vld[0].sc; + } + e = m->map[vl & m->mask]; + rval = e->ksc[selector & e->mask]; + rcu_read_unlock(); + +done: + rval = !rval ? 
dd->vld[0].sc : rval; + return rval; +} + +/* + * pio_select_send_context_sc() - select send context + * @dd: devdata + * @selector: a spreading factor + * @sc5: the 5 bit sc + * + * This function returns an send context based on the selector and an sc + */ +struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd, + u32 selector, u8 sc5) +{ + u8 vl = sc_to_vlt(dd, sc5); + + return pio_select_send_context_vl(dd, selector, vl); +} + +/* + * Free the indicated map struct + */ +static void pio_map_free(struct pio_vl_map *m) +{ + int i; + + for (i = 0; m && i < m->actual_vls; i++) + kfree(m->map[i]); + kfree(m); +} + +/* + * Handle RCU callback + */ +static void pio_map_rcu_callback(struct rcu_head *list) +{ + struct pio_vl_map *m = container_of(list, struct pio_vl_map, list); + + pio_map_free(m); +} + +/* + * Set credit return threshold for the kernel send context + */ +static void set_threshold(struct hfi1_devdata *dd, int scontext, int i) +{ + u32 thres; + + thres = min(sc_percent_to_threshold(dd->kernel_send_context[scontext], + 50), + sc_mtu_to_threshold(dd->kernel_send_context[scontext], + dd->vld[i].mtu, + dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->kernel_send_context[scontext], thres); +} + +/* + * pio_map_init - called when #vls change + * @dd: hfi1_devdata + * @port: port number + * @num_vls: number of vls + * @vl_scontexts: per vl send context mapping (optional) + * + * This routine changes the mapping based on the number of vls. + * + * vl_scontexts is used to specify a non-uniform vl/send context + * loading. NULL implies auto computing the loading and giving each + * VL an uniform distribution of send contexts per VL. + * + * The auto algorithm computers the sc_per_vl and the number of extra + * send contexts. Any extra send contexts are added from the last VL + * on down + * + * rcu locking is used here to control access to the mapping fields. + * + * If either the num_vls or num_send_contexts are non-power of 2, the + * array sizes in the struct pio_vl_map and the struct pio_map_elem are + * rounded up to the next highest power of 2 and the first entry is + * reused in a round robin fashion. + * + * If an error occurs the map change is not done and the mapping is not + * chaged. + * + */ +int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts) +{ + int i, j; + int extra, sc_per_vl; + int scontext = 1; + int num_kernel_send_contexts = 0; + u8 lvl_scontexts[OPA_MAX_VLS]; + struct pio_vl_map *oldmap, *newmap; + + if (!vl_scontexts) { + for (i = 0; i < dd->num_send_contexts; i++) + if (dd->send_contexts[i].type == SC_KERNEL) + num_kernel_send_contexts++; + /* truncate divide */ + sc_per_vl = num_kernel_send_contexts / num_vls; + /* extras */ + extra = num_kernel_send_contexts % num_vls; + vl_scontexts = lvl_scontexts; + /* add extras from last vl down */ + for (i = num_vls - 1; i >= 0; i--, extra--) + vl_scontexts[i] = sc_per_vl + (extra > 0 ? 
1 : 0); + } + /* build new map */ + newmap = kzalloc(sizeof(*newmap) + + roundup_pow_of_two(num_vls) * + sizeof(struct pio_map_elem *), + GFP_KERNEL); + if (!newmap) + goto bail; + newmap->actual_vls = num_vls; + newmap->vls = roundup_pow_of_two(num_vls); + newmap->mask = (1 << ilog2(newmap->vls)) - 1; + for (i = 0; i < newmap->vls; i++) { + /* save for wrap around */ + int first_scontext = scontext; + + if (i < newmap->actual_vls) { + int sz = roundup_pow_of_two(vl_scontexts[i]); + + /* only allocate once */ + newmap->map[i] = kzalloc(sizeof(*newmap->map[i]) + + sz * sizeof(struct + send_context *), + GFP_KERNEL); + if (!newmap->map[i]) + goto bail; + newmap->map[i]->mask = (1 << ilog2(sz)) - 1; + /* + * assign send contexts and + * adjust credit return threshold + */ + for (j = 0; j < sz; j++) { + if (dd->kernel_send_context[scontext]) { + newmap->map[i]->ksc[j] = + dd->kernel_send_context[scontext]; + set_threshold(dd, scontext, i); + } + if (++scontext >= first_scontext + + vl_scontexts[i]) + /* wrap back to first send context */ + scontext = first_scontext; + } + } else { + /* just re-use entry without allocating */ + newmap->map[i] = newmap->map[i % num_vls]; + } + scontext = first_scontext + vl_scontexts[i]; + } + /* newmap in hand, save old map */ + spin_lock_irq(&dd->pio_map_lock); + oldmap = rcu_dereference_protected(dd->pio_map, + lockdep_is_held(&dd->pio_map_lock)); + + /* publish newmap */ + rcu_assign_pointer(dd->pio_map, newmap); + + spin_unlock_irq(&dd->pio_map_lock); + /* success, free any old map after grace period */ + if (oldmap) + call_rcu(&oldmap->list, pio_map_rcu_callback); + return 0; +bail: + /* free any partial allocation */ + pio_map_free(newmap); + return -ENOMEM; +} + +void free_pio_map(struct hfi1_devdata *dd) +{ + /* Free PIO map if allocated */ + if (rcu_access_pointer(dd->pio_map)) { + spin_lock_irq(&dd->pio_map_lock); + pio_map_free(rcu_access_pointer(dd->pio_map)); + RCU_INIT_POINTER(dd->pio_map, NULL); + spin_unlock_irq(&dd->pio_map_lock); + synchronize_rcu(); + } + kfree(dd->kernel_send_context); + dd->kernel_send_context = NULL; +} + +int init_pervl_scs(struct hfi1_devdata *dd) +{ + int i; + u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */ + u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */ + u32 ctxt; + struct hfi1_pportdata *ppd = dd->pport; + + dd->vld[15].sc = sc_alloc(dd, SC_VL15, + dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->vld[15].sc) + return -ENOMEM; + + hfi1_init_ctxt(dd->vld[15].sc); + dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048); + + dd->kernel_send_context = kcalloc_node(dd->num_send_contexts, + sizeof(struct send_context *), + GFP_KERNEL, dd->node); + if (!dd->kernel_send_context) + goto freesc15; + + dd->kernel_send_context[0] = dd->vld[15].sc; + + for (i = 0; i < num_vls; i++) { + /* + * Since this function does not deal with a specific + * receive context but we need the RcvHdrQ entry size, + * use the size from rcd[0]. It is guaranteed to be + * valid at this point and will remain the same for all + * receive contexts. 
+ */ + dd->vld[i].sc = sc_alloc(dd, SC_KERNEL, + dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->vld[i].sc) + goto nomem; + dd->kernel_send_context[i + 1] = dd->vld[i].sc; + hfi1_init_ctxt(dd->vld[i].sc); + /* non VL15 start with the max MTU */ + dd->vld[i].mtu = hfi1_max_mtu; + } + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) { + dd->kernel_send_context[i + 1] = + sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->kernel_send_context[i + 1]) + goto nomem; + hfi1_init_ctxt(dd->kernel_send_context[i + 1]); + } + + sc_enable(dd->vld[15].sc); + ctxt = dd->vld[15].sc->hw_context; + mask = all_vl_mask & ~(1LL << 15); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + dd_dev_info(dd, + "Using send context %u(%u) for VL15\n", + dd->vld[15].sc->sw_index, ctxt); + + for (i = 0; i < num_vls; i++) { + sc_enable(dd->vld[i].sc); + ctxt = dd->vld[i].sc->hw_context; + mask = all_vl_mask & ~(data_vls_mask); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + } + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) { + sc_enable(dd->kernel_send_context[i + 1]); + ctxt = dd->kernel_send_context[i + 1]->hw_context; + mask = all_vl_mask & ~(data_vls_mask); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + } + + if (pio_map_init(dd, ppd->port - 1, num_vls, NULL)) + goto nomem; + return 0; + +nomem: + for (i = 0; i < num_vls; i++) { + sc_free(dd->vld[i].sc); + dd->vld[i].sc = NULL; + } + + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) + sc_free(dd->kernel_send_context[i + 1]); + + kfree(dd->kernel_send_context); + dd->kernel_send_context = NULL; + +freesc15: + sc_free(dd->vld[15].sc); + return -ENOMEM; +} + +int init_credit_return(struct hfi1_devdata *dd) +{ + int ret; + int i; + + dd->cr_base = kcalloc( + node_affinity.num_possible_nodes, + sizeof(struct credit_return_base), + GFP_KERNEL); + if (!dd->cr_base) { + ret = -ENOMEM; + goto done; + } + for_each_node_with_cpus(i) { + int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return); + + set_dev_node(&dd->pcidev->dev, i); + dd->cr_base[i].va = dma_zalloc_coherent( + &dd->pcidev->dev, + bytes, + &dd->cr_base[i].dma, + GFP_KERNEL); + if (!dd->cr_base[i].va) { + set_dev_node(&dd->pcidev->dev, dd->node); + dd_dev_err(dd, + "Unable to allocate credit return DMA range for NUMA %d\n", + i); + ret = -ENOMEM; + goto done; + } + } + set_dev_node(&dd->pcidev->dev, dd->node); + + ret = 0; +done: + return ret; +} + +void free_credit_return(struct hfi1_devdata *dd) +{ + int i; + + if (!dd->cr_base) + return; + for (i = 0; i < node_affinity.num_possible_nodes; i++) { + if (dd->cr_base[i].va) { + dma_free_coherent(&dd->pcidev->dev, + TXE_NUM_CONTEXTS * + sizeof(struct credit_return), + dd->cr_base[i].va, + dd->cr_base[i].dma); + } + } + kfree(dd->cr_base); + dd->cr_base = NULL; +} diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h new file mode 100644 index 0000000..058b08f --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pio.h @@ -0,0 +1,330 @@ +#ifndef _PIO_H +#define _PIO_H +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +/* send context types */ +#define SC_KERNEL 0 +#define SC_VL15 1 +#define SC_ACK 2 +#define SC_USER 3 /* must be the last one: it may take all left */ +#define SC_MAX 4 /* count of send context types */ + +/* invalid send context index */ +#define INVALID_SCI 0xff + +/* PIO buffer release callback function */ +typedef void (*pio_release_cb)(void *arg, int code); + +/* PIO release codes - in bits, as there could more than one that apply */ +#define PRC_OK 0 /* no known error */ +#define PRC_STATUS_ERR 0x01 /* credit return due to status error */ +#define PRC_PBC 0x02 /* credit return due to PBC */ +#define PRC_THRESHOLD 0x04 /* credit return due to threshold */ +#define PRC_FILL_ERR 0x08 /* credit return due fill error */ +#define PRC_FORCE 0x10 /* credit return due credit force */ +#define PRC_SC_DISABLE 0x20 /* clean-up after a context disable */ + +/* byte helper */ +union mix { + u64 val64; + u32 val32[2]; + u8 val8[8]; +}; + +/* an allocated PIO buffer */ +struct pio_buf { + struct send_context *sc;/* back pointer to owning send context */ + pio_release_cb cb; /* called when the buffer is released */ + void *arg; /* argument for cb */ + void __iomem *start; /* buffer start address */ + void __iomem *end; /* context end address */ + unsigned long sent_at; /* buffer is sent when <= free */ + union mix carry; /* pending unwritten bytes */ + u16 qw_written; /* QW written so far */ + u8 carry_bytes; /* number of valid bytes in carry */ +}; + +/* cache line aligned pio buffer array */ +union pio_shadow_ring { + struct pio_buf pbuf; +} ____cacheline_aligned; + +/* per-NUMA send context */ +struct send_context { + /* read-only after init */ + struct hfi1_devdata *dd; /* device */ + union pio_shadow_ring *sr; /* shadow ring */ + void __iomem *base_addr; /* start of PIO memory */ + u32 __percpu *buffers_allocated;/* count of buffers allocated */ + u32 size; /* context size, in bytes */ + + int node; /* context home node */ + u32 sr_size; /* size of the shadow ring */ + u16 flags; /* flags */ + u8 type; /* context type */ + u8 sw_index; /* software index number */ + u8 hw_context; /* hardware context number */ + u8 group; /* credit return group */ + + /* allocator fields */ + spinlock_t alloc_lock ____cacheline_aligned_in_smp; + u32 sr_head; /* shadow ring head */ + unsigned long fill; /* official alloc count */ + unsigned long alloc_free; /* copy of free (less cache thrash) */ + u32 fill_wrap; /* tracks fill within ring */ + u32 credits; /* number of blocks in context */ + /* adding a new field here would make it part of this cacheline */ + + /* releaser fields */ + spinlock_t release_lock ____cacheline_aligned_in_smp; + u32 sr_tail; /* shadow ring tail */ + unsigned long free; /* official free count */ + volatile __le64 *hw_free; /* HW free counter */ + /* list for PIO waiters */ + struct list_head piowait ____cacheline_aligned_in_smp; + spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp; + u32 credit_intr_count; /* count of credit intr users */ + u64 credit_ctrl; /* cache for credit control */ + wait_queue_head_t halt_wait; /* wait until kernel sees interrupt */ + struct work_struct halt_work; /* halted context work queue entry */ +}; + +/* send context flags */ +#define SCF_ENABLED 0x01 +#define SCF_IN_FREE 0x02 +#define SCF_HALTED 0x04 +#define SCF_FROZEN 0x08 + +struct send_context_info { + struct send_context *sc; /* allocated working context */ + u16 allocated; /* has this been allocated? 
*/ + u16 type; /* context type */ + u16 base; /* base in PIO array */ + u16 credits; /* size in PIO array */ +}; + +/* DMA credit return, index is always (context & 0x7) */ +struct credit_return { + volatile __le64 cr[8]; +}; + +/* NUMA indexed credit return array */ +struct credit_return_base { + struct credit_return *va; + dma_addr_t dma; +}; + +/* send context configuration sizes (one per type) */ +struct sc_config_sizes { + short int size; + short int count; +}; + +/* + * The diagram below details the relationship of the mapping structures + * + * Since the mapping now allows for non-uniform send contexts per vl, the + * number of send contexts for a vl is either the vl_scontexts[vl] or + * a computation based on num_kernel_send_contexts/num_vls: + * + * For example: + * nactual = vl_scontexts ? vl_scontexts[vl] : num_kernel_send_contexts/num_vls + * + * n = roundup to next highest power of 2 using nactual + * + * In the case where there are num_kernel_send_contexts/num_vls doesn't divide + * evenly, the extras are added from the last vl downward. + * + * For the case where n > nactual, the send contexts are assigned + * in a round robin fashion wrapping back to the first send context + * for a particular vl. + * + * dd->pio_map + * | pio_map_elem[0] + * | +--------------------+ + * v | mask | + * pio_vl_map |--------------------| + * +--------------------------+ | ksc[0] -> sc 1 | + * | list (RCU) | |--------------------| + * |--------------------------| ->| ksc[1] -> sc 2 | + * | mask | --/ |--------------------| + * |--------------------------| -/ | * | + * | actual_vls (max 8) | -/ |--------------------| + * |--------------------------| --/ | ksc[n-1] -> sc n | + * | vls (max 8) | -/ +--------------------+ + * |--------------------------| --/ + * | map[0] |-/ + * |--------------------------| +--------------------+ + * | map[1] |--- | mask | + * |--------------------------| \---- |--------------------| + * | * | \-- | ksc[0] -> sc 1+n | + * | * | \---- |--------------------| + * | * | \->| ksc[1] -> sc 2+n | + * |--------------------------| |--------------------| + * | map[vls - 1] |- | * | + * +--------------------------+ \- |--------------------| + * \- | ksc[m-1] -> sc m+n | + * \ +--------------------+ + * \- + * \ + * \- +----------------------+ + * \- | mask | + * \ |----------------------| + * \- | ksc[0] -> sc 1+m+n | + * \- |----------------------| + * >| ksc[1] -> sc 2+m+n | + * |----------------------| + * | * | + * |----------------------| + * | ksc[o-1] -> sc o+m+n | + * +----------------------+ + * + */ + +/* Initial number of send contexts per VL */ +#define INIT_SC_PER_VL 2 + +/* + * struct pio_map_elem - mapping for a vl + * @mask - selector mask + * @ksc - array of kernel send contexts for this vl + * + * The mask is used to "mod" the selector to + * produce index into the trailing array of + * kscs + */ +struct pio_map_elem { + u32 mask; + struct send_context *ksc[0]; +}; + +/* + * struct pio_vl_map - mapping for a vl + * @list - rcu head for free callback + * @mask - vl mask to "mod" the vl to produce an index to map array + * @actual_vls - number of vls + * @vls - numbers of vls rounded to next power of 2 + * @map - array of pio_map_elem entries + * + * This is the parent mapping structure. The trailing members of the + * struct point to pio_map_elem entries, which in turn point to an + * array of kscs for that vl. 
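+ *
+ * For example (illustrative only, mirroring pio_select_send_context_vl()),
+ * a lookup walks the structures as:
+ *
+ *	m  = rcu_dereference(dd->pio_map);
+ *	e  = m->map[vl & m->mask];
+ *	sc = e->ksc[selector & e->mask];
+ *
+ * Both masks are (power of 2) - 1, so the "mod" is a cheap bitwise AND.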
+ */ +struct pio_vl_map { + struct rcu_head list; + u32 mask; + u8 actual_vls; + u8 vls; + struct pio_map_elem *map[0]; +}; + +int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, + u8 *vl_scontexts); +void free_pio_map(struct hfi1_devdata *dd); +struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd, + u32 selector, u8 vl); +struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd, + u32 selector, u8 sc5); + +/* send context functions */ +int init_credit_return(struct hfi1_devdata *dd); +void free_credit_return(struct hfi1_devdata *dd); +int init_sc_pools_and_sizes(struct hfi1_devdata *dd); +int init_send_contexts(struct hfi1_devdata *dd); +int init_credit_return(struct hfi1_devdata *dd); +int init_pervl_scs(struct hfi1_devdata *dd); +struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, + uint hdrqentsize, int numa); +void sc_free(struct send_context *sc); +int sc_enable(struct send_context *sc); +void sc_disable(struct send_context *sc); +int sc_restart(struct send_context *sc); +void sc_return_credits(struct send_context *sc); +void sc_flush(struct send_context *sc); +void sc_drop(struct send_context *sc); +void sc_stop(struct send_context *sc, int bit); +struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, + pio_release_cb cb, void *arg); +void sc_release_update(struct send_context *sc); +void sc_return_credits(struct send_context *sc); +void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context); +void sc_add_credit_return_intr(struct send_context *sc); +void sc_del_credit_return_intr(struct send_context *sc); +void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold); +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent); +u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize); +void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint); +void sc_wait(struct hfi1_devdata *dd); +void set_pio_integrity(struct send_context *sc); + +/* support functions */ +void pio_reset_all(struct hfi1_devdata *dd); +void pio_freeze(struct hfi1_devdata *dd); +void pio_kernel_unfreeze(struct hfi1_devdata *dd); + +/* global PIO send control operations */ +#define PSC_GLOBAL_ENABLE 0 +#define PSC_GLOBAL_DISABLE 1 +#define PSC_GLOBAL_VLARB_ENABLE 2 +#define PSC_GLOBAL_VLARB_DISABLE 3 +#define PSC_CM_RESET 4 +#define PSC_DATA_VL_ENABLE 5 +#define PSC_DATA_VL_DISABLE 6 + +void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl); +void pio_send_control(struct hfi1_devdata *dd, int op); + +/* PIO copy routines */ +void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, + const void *from, size_t count); +void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, + const void *from, size_t nbytes); +void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes); +void seg_pio_copy_end(struct pio_buf *pbuf); + +#endif /* _PIO_H */ diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c new file mode 100644 index 0000000..03024ce --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pio_copy.c @@ -0,0 +1,757 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" + +/* additive distance between non-SOP and SOP space */ +#define SOP_DISTANCE (TXE_PIO_SIZE / 2) +#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE - 1) +/* number of QUADWORDs in a block */ +#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE / sizeof(u64)) + +/** + * pio_copy - copy data block to MMIO space + * @pbuf: a number of blocks allocated within a PIO send context + * @pbc: PBC to send + * @from: source, must be 8 byte aligned + * @count: number of DWORD (32-bit) quantities to copy from source + * + * Copy data from source to PIO Send Buffer memory, 8 bytes at a time. + * Must always write full BLOCK_SIZE bytes blocks. The first block must + * be written to the corresponding SOP=1 address. + * + * Known: + * o pbuf->start always starts on a block boundary + * o pbuf can wrap only at a block boundary + */ +void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, + const void *from, size_t count) +{ + void __iomem *dest = pbuf->start + SOP_DISTANCE; + void __iomem *send = dest + PIO_BLOCK_SIZE; + void __iomem *dend; /* 8-byte data end */ + + /* write the PBC */ + writeq(pbc, dest); + dest += sizeof(u64); + + /* calculate where the QWORD data ends - in SOP=1 space */ + dend = dest + ((count >> 1) * sizeof(u64)); + + if (dend < send) { + /* + * all QWORD data is within the SOP block, does *not* + * reach the end of the SOP block + */ + + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* + * No boundary checks are needed here: + * 0. We're not on the SOP block boundary + * 1. The possible DWORD dangle will still be within + * the SOP block + * 2. We cannot wrap except on a block boundary. 
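+ *
+ * Illustrative example: count == 7 DWORDs gives three full QWs
+ * copied in the loop above, plus one dangling DWORD that is
+ * zero-extended to a QW further below before the block is padded
+ * out with zero QWs.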
+ */ + } else { + /* QWORD data extends _to_ or beyond the SOP block */ + + /* write 8-byte SOP chunk data */ + while (dest < send) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* drop out of the SOP range */ + dest -= SOP_DISTANCE; + dend -= SOP_DISTANCE; + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written, but we will wrap in + * case there is a dangling DWORD. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->sc->size; + dend -= pbuf->sc->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + } + /* at this point we have wrapped if we are going to wrap */ + + /* write dangling u32, if any */ + if (count & 1) { + union mix val; + + val.val64 = 0; + val.val32[0] = *(u32 *)from; + writeq(val.val64, dest); + dest += sizeof(u64); + } + /* + * fill in rest of block, no need to check pbuf->end + * as we only wrap on a block boundary + */ + while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) { + writeq(0, dest); + dest += sizeof(u64); + } + + /* finished with this buffer */ + this_cpu_dec(*pbuf->sc->buffers_allocated); + preempt_enable(); +} + +/* + * Handle carry bytes using shifts and masks. + * + * NOTE: the value the unused portion of carry is expected to always be zero. + */ + +/* + * "zero" shift - bit shift used to zero out upper bytes. Input is + * the count of LSB bytes to preserve. + */ +#define zshift(x) (8 * (8 - (x))) + +/* + * "merge" shift - bit shift used to merge with carry bytes. Input is + * the LSB byte count to move beyond. + */ +#define mshift(x) (8 * (x)) + +/* + * Jump copy - no-loop copy for < 8 bytes. + */ +static inline void jcopy(u8 *dest, const u8 *src, u32 n) +{ + switch (n) { + case 7: + *dest++ = *src++; + /* fall through */ + case 6: + *dest++ = *src++; + /* fall through */ + case 5: + *dest++ = *src++; + /* fall through */ + case 4: + *dest++ = *src++; + /* fall through */ + case 3: + *dest++ = *src++; + /* fall through */ + case 2: + *dest++ = *src++; + /* fall through */ + case 1: + *dest++ = *src++; + /* fall through */ + } +} + +/* + * Read nbytes from "from" and and place them in the low bytes + * of pbuf->carry. Other bytes are left as-is. Any previous + * value in pbuf->carry is lost. + * + * NOTES: + * o do not read from from if nbytes is zero + * o from may _not_ be u64 aligned. + */ +static inline void read_low_bytes(struct pio_buf *pbuf, const void *from, + unsigned int nbytes) +{ + pbuf->carry.val64 = 0; + jcopy(&pbuf->carry.val8[0], from, nbytes); + pbuf->carry_bytes = nbytes; +} + +/* + * Read nbytes bytes from "from" and put them at the end of pbuf->carry. + * It is expected that the extra read does not overfill carry. + * + * NOTES: + * o from may _not_ be u64 aligned + * o nbytes may span a QW boundary + */ +static inline void read_extra_bytes(struct pio_buf *pbuf, + const void *from, unsigned int nbytes) +{ + jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes); + pbuf->carry_bytes += nbytes; +} + +/* + * Write a quad word using parts of pbuf->carry and the next 8 bytes of src. 
+ * Put the unused part of the next 8 bytes of src into the LSB bytes of + * pbuf->carry with the upper bytes zeroed.. + * + * NOTES: + * o result must keep unused bytes zeroed + * o src must be u64 aligned + */ +static inline void merge_write8( + struct pio_buf *pbuf, + void __iomem *dest, + const void *src) +{ + u64 new, temp; + + new = *(u64 *)src; + temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes)); + writeq(temp, dest); + pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes); +} + +/* + * Write a quad word using all bytes of carry. + */ +static inline void carry8_write8(union mix carry, void __iomem *dest) +{ + writeq(carry.val64, dest); +} + +/* + * Write a quad word using all the valid bytes of carry. If carry + * has zero valid bytes, nothing is written. + * Returns 0 on nothing written, non-zero on quad word written. + */ +static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest) +{ + if (pbuf->carry_bytes) { + /* unused bytes are always kept zeroed, so just write */ + writeq(pbuf->carry.val64, dest); + return 1; + } + + return 0; +} + +/* + * Segmented PIO Copy - start + * + * Start a PIO copy. + * + * @pbuf: destination buffer + * @pbc: the PBC for the PIO buffer + * @from: data source, QWORD aligned + * @nbytes: bytes to copy + */ +void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, + const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + SOP_DISTANCE; + void __iomem *send = dest + PIO_BLOCK_SIZE; + void __iomem *dend; /* 8-byte data end */ + + writeq(pbc, dest); + dest += sizeof(u64); + + /* calculate where the QWORD data ends - in SOP=1 space */ + dend = dest + ((nbytes >> 3) * sizeof(u64)); + + if (dend < send) { + /* + * all QWORD data is within the SOP block, does *not* + * reach the end of the SOP block + */ + + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* + * No boundary checks are needed here: + * 0. We're not on the SOP block boundary + * 1. The possible DWORD dangle will still be within + * the SOP block + * 2. We cannot wrap except on a block boundary. + */ + } else { + /* QWORD data extends _to_ or beyond the SOP block */ + + /* write 8-byte SOP chunk data */ + while (dest < send) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* drop out of the SOP range */ + dest -= SOP_DISTANCE; + dend -= SOP_DISTANCE; + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written, but we will wrap in + * case there is a dangling DWORD. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->sc->size; + dend -= pbuf->sc->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + } + /* at this point we have wrapped if we are going to wrap */ + + /* ...but it doesn't matter as we're done writing */ + + /* save dangling bytes, if any */ + read_low_bytes(pbuf, from, nbytes & 0x7); + + pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3); +} + +/* + * Mid copy helper, "mixed case" - source is 64-bit aligned but carry + * bytes are non-zero. + * + * Whole u64s must be written to the chip, so bytes must be manually merged. 
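+ *
+ * Worked example (illustrative): with pbuf->carry_bytes == 3, each
+ * merge_write8() writes a QW whose low 3 bytes come from the carry and
+ * whose upper 5 bytes are the low 5 bytes of the next source QW; the
+ * remaining 3 source bytes become the new carry (see mshift()/zshift()).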
+ * + * @pbuf: destination buffer + * @from: data source, is QWORD aligned. + * @nbytes: bytes to copy + * + * Must handle nbytes < 8. + */ +static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + void __iomem *dend; /* 8-byte data end */ + unsigned long qw_to_write = nbytes >> 3; + unsigned long bytes_left = nbytes & 0x7; + + /* calculate 8-byte data end */ + dend = dest + (qw_to_write * sizeof(u64)); + + if (pbuf->qw_written < PIO_BLOCK_QWS) { + /* + * Still within SOP block. We don't need to check for + * wrap because we are still in the first block and + * can only wrap on block boundaries. + */ + void __iomem *send; /* SOP end */ + void __iomem *xend; + + /* + * calculate the end of data or end of block, whichever + * comes first + */ + send = pbuf->start + PIO_BLOCK_SIZE; + xend = min(send, dend); + + /* shift up to SOP=1 space */ + dest += SOP_DISTANCE; + xend += SOP_DISTANCE; + + /* write 8-byte chunk data */ + while (dest < xend) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* shift down to SOP=0 space */ + dest -= SOP_DISTANCE; + } + /* + * At this point dest could be (either, both, or neither): + * - at dend + * - at the wrap + */ + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If dest is at the wrap, we will fall into the if, + * not do the loop, when wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->sc->size; + dend -= pbuf->sc->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + pbuf->qw_written += qw_to_write; + + /* handle carry and left-over bytes */ + if (pbuf->carry_bytes + bytes_left >= 8) { + unsigned long nread; + + /* there is enough to fill another qw - fill carry */ + nread = 8 - pbuf->carry_bytes; + read_extra_bytes(pbuf, from, nread); + + /* + * One more write - but need to make sure dest is correct. + * Check for wrap and the possibility the write + * should be in SOP space. + * + * The two checks immediately below cannot both be true, hence + * the else. If we have wrapped, we cannot still be within the + * first block. Conversely, if we are still in the first block, + * we cannot have wrapped. We do the wrap check first as that + * is more likely. + */ + /* adjust if we have wrapped */ + if (dest >= pbuf->end) + dest -= pbuf->sc->size; + /* jump to the SOP range if within the first block */ + else if (pbuf->qw_written < PIO_BLOCK_QWS) + dest += SOP_DISTANCE; + + /* flush out full carry */ + carry8_write8(pbuf->carry, dest); + pbuf->qw_written++; + + /* now adjust and read the rest of the bytes into carry */ + bytes_left -= nread; + from += nread; /* from is now not aligned */ + read_low_bytes(pbuf, from, bytes_left); + } else { + /* not enough to fill another qw, append the rest to carry */ + read_extra_bytes(pbuf, from, bytes_left); + } +} + +/* + * Mid copy helper, "straight case" - source pointer is 64-bit aligned + * with no carry bytes. + * + * @pbuf: destination buffer + * @from: data source, is QWORD aligned + * @nbytes: bytes to copy + * + * Must handle nbytes < 8. 
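+ *
+ * For example (illustrative), nbytes == 20 writes two full QWs and
+ * leaves 4 bytes in pbuf->carry for the next mid or end call.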
+ */ +static void mid_copy_straight(struct pio_buf *pbuf, + const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + void __iomem *dend; /* 8-byte data end */ + + /* calculate 8-byte data end */ + dend = dest + ((nbytes >> 3) * sizeof(u64)); + + if (pbuf->qw_written < PIO_BLOCK_QWS) { + /* + * Still within SOP block. We don't need to check for + * wrap because we are still in the first block and + * can only wrap on block boundaries. + */ + void __iomem *send; /* SOP end */ + void __iomem *xend; + + /* + * calculate the end of data or end of block, whichever + * comes first + */ + send = pbuf->start + PIO_BLOCK_SIZE; + xend = min(send, dend); + + /* shift up to SOP=1 space */ + dest += SOP_DISTANCE; + xend += SOP_DISTANCE; + + /* write 8-byte chunk data */ + while (dest < xend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* shift down to SOP=0 space */ + dest -= SOP_DISTANCE; + } + /* + * At this point dest could be (either, both, or neither): + * - at dend + * - at the wrap + */ + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If dest is at the wrap, we will fall into the if, + * not do the loop, when wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->sc->size; + dend -= pbuf->sc->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* we know carry_bytes was zero on entry to this routine */ + read_low_bytes(pbuf, from, nbytes & 0x7); + + pbuf->qw_written += nbytes >> 3; +} + +/* + * Segmented PIO Copy - middle + * + * Must handle any aligned tail and any aligned source with any byte count. + * + * @pbuf: a number of blocks allocated within a PIO send context + * @from: data source + * @nbytes: number of bytes to copy + */ +void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes) +{ + unsigned long from_align = (unsigned long)from & 0x7; + + if (pbuf->carry_bytes + nbytes < 8) { + /* not enough bytes to fill a QW */ + read_extra_bytes(pbuf, from, nbytes); + return; + } + + if (from_align) { + /* misaligned source pointer - align it */ + unsigned long to_align; + + /* bytes to read to align "from" */ + to_align = 8 - from_align; + + /* + * In the advance-to-alignment logic below, we do not need + * to check if we are using more than nbytes. This is because + * if we are here, we already know that carry+nbytes will + * fill at least one QW. + */ + if (pbuf->carry_bytes + to_align < 8) { + /* not enough align bytes to fill a QW */ + read_extra_bytes(pbuf, from, to_align); + from += to_align; + nbytes -= to_align; + } else { + /* bytes to fill carry */ + unsigned long to_fill = 8 - pbuf->carry_bytes; + /* bytes left over to be read */ + unsigned long extra = to_align - to_fill; + void __iomem *dest; + + /* fill carry... 
*/ + read_extra_bytes(pbuf, from, to_fill); + from += to_fill; + nbytes -= to_fill; + /* may not be enough valid bytes left to align */ + if (extra > nbytes) + extra = nbytes; + + /* ...now write carry */ + dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + + /* + * The two checks immediately below cannot both be + * true, hence the else. If we have wrapped, we + * cannot still be within the first block. + * Conversely, if we are still in the first block, we + * cannot have wrapped. We do the wrap check first + * as that is more likely. + */ + /* adjust if we've wrapped */ + if (dest >= pbuf->end) + dest -= pbuf->sc->size; + /* jump to SOP range if within the first block */ + else if (pbuf->qw_written < PIO_BLOCK_QWS) + dest += SOP_DISTANCE; + + carry8_write8(pbuf->carry, dest); + pbuf->qw_written++; + + /* read any extra bytes to do final alignment */ + /* this will overwrite anything in pbuf->carry */ + read_low_bytes(pbuf, from, extra); + from += extra; + nbytes -= extra; + /* + * If no bytes are left, return early - we are done. + * NOTE: This short-circuit is *required* because + * "extra" may have been reduced in size and "from" + * is not aligned, as required when leaving this + * if block. + */ + if (nbytes == 0) + return; + } + + /* at this point, from is QW aligned */ + } + + if (pbuf->carry_bytes) + mid_copy_mix(pbuf, from, nbytes); + else + mid_copy_straight(pbuf, from, nbytes); +} + +/* + * Segmented PIO Copy - end + * + * Write any remainder (in pbuf->carry) and finish writing the whole block. + * + * @pbuf: a number of blocks allocated within a PIO send context + */ +void seg_pio_copy_end(struct pio_buf *pbuf) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + + /* + * The two checks immediately below cannot both be true, hence the + * else. If we have wrapped, we cannot still be within the first + * block. Conversely, if we are still in the first block, we + * cannot have wrapped. We do the wrap check first as that is + * more likely. + */ + /* adjust if we have wrapped */ + if (dest >= pbuf->end) + dest -= pbuf->sc->size; + /* jump to the SOP range if within the first block */ + else if (pbuf->qw_written < PIO_BLOCK_QWS) + dest += SOP_DISTANCE; + + /* write final bytes, if any */ + if (carry_write8(pbuf, dest)) { + dest += sizeof(u64); + /* + * NOTE: We do not need to recalculate whether dest needs + * SOP_DISTANCE or not. + * + * If we are in the first block and the dangle write + * keeps us in the same block, dest will need + * to retain SOP_DISTANCE in the loop below. + * + * If we are in the first block and the dangle write pushes + * us to the next block, then loop below will not run + * and dest is not used. Hence we do not need to update + * it. + * + * If we are past the first block, then SOP_DISTANCE + * was never added, so there is nothing to do. + */ + } + + /* fill in rest of block */ + while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) { + writeq(0, dest); + dest += sizeof(u64); + } + + /* finished with this buffer */ + this_cpu_dec(*pbuf->sc->buffers_allocated); + preempt_enable(); +} diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c new file mode 100644 index 0000000..cbf7faa --- /dev/null +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -0,0 +1,1077 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. 
+ * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include + +#include "hfi.h" +#include "efivar.h" +#include "eprom.h" + +#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat" + +static int validate_scratch_checksum(struct hfi1_devdata *dd) +{ + u64 checksum = 0, temp_scratch = 0; + int i, j, version; + + temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH); + version = (temp_scratch & BITMAP_VERSION_SMASK) >> BITMAP_VERSION_SHIFT; + + /* Prevent power on default of all zeroes from passing checksum */ + if (!version) { + dd_dev_err(dd, "%s: Config bitmap uninitialized\n", __func__); + dd_dev_err(dd, + "%s: Please update your BIOS to support active channels\n", + __func__); + return 0; + } + + /* + * ASIC scratch 0 only contains the checksum and bitmap version as + * fields of interest, both of which are handled separately from the + * loop below, so skip it + */ + checksum += version; + for (i = 1; i < ASIC_NUM_SCRATCH; i++) { + temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH + (8 * i)); + for (j = sizeof(u64); j != 0; j -= 2) { + checksum += (temp_scratch & 0xFFFF); + temp_scratch >>= 16; + } + } + + while (checksum >> 16) + checksum = (checksum & CHECKSUM_MASK) + (checksum >> 16); + + temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH); + temp_scratch &= CHECKSUM_SMASK; + temp_scratch >>= CHECKSUM_SHIFT; + + if (checksum + temp_scratch == 0xFFFF) + return 1; + + dd_dev_err(dd, "%s: Configuration bitmap corrupted\n", __func__); + return 0; +} + +static void save_platform_config_fields(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 temp_scratch = 0, temp_dest = 0; + + temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH_1); + + temp_dest = temp_scratch & + (dd->hfi1_id ? PORT1_PORT_TYPE_SMASK : + PORT0_PORT_TYPE_SMASK); + ppd->port_type = temp_dest >> + (dd->hfi1_id ? PORT1_PORT_TYPE_SHIFT : + PORT0_PORT_TYPE_SHIFT); + + temp_dest = temp_scratch & + (dd->hfi1_id ? PORT1_LOCAL_ATTEN_SMASK : + PORT0_LOCAL_ATTEN_SMASK); + ppd->local_atten = temp_dest >> + (dd->hfi1_id ? PORT1_LOCAL_ATTEN_SHIFT : + PORT0_LOCAL_ATTEN_SHIFT); + + temp_dest = temp_scratch & + (dd->hfi1_id ? PORT1_REMOTE_ATTEN_SMASK : + PORT0_REMOTE_ATTEN_SMASK); + ppd->remote_atten = temp_dest >> + (dd->hfi1_id ? PORT1_REMOTE_ATTEN_SHIFT : + PORT0_REMOTE_ATTEN_SHIFT); + + temp_dest = temp_scratch & + (dd->hfi1_id ? PORT1_DEFAULT_ATTEN_SMASK : + PORT0_DEFAULT_ATTEN_SMASK); + ppd->default_atten = temp_dest >> + (dd->hfi1_id ? PORT1_DEFAULT_ATTEN_SHIFT : + PORT0_DEFAULT_ATTEN_SHIFT); + + temp_scratch = read_csr(dd, dd->hfi1_id ? 
ASIC_CFG_SCRATCH_3 : + ASIC_CFG_SCRATCH_2); + + ppd->tx_preset_eq = (temp_scratch & TX_EQ_SMASK) >> TX_EQ_SHIFT; + ppd->tx_preset_noeq = (temp_scratch & TX_NO_EQ_SMASK) >> TX_NO_EQ_SHIFT; + ppd->rx_preset = (temp_scratch & RX_SMASK) >> RX_SHIFT; + + ppd->max_power_class = (temp_scratch & QSFP_MAX_POWER_SMASK) >> + QSFP_MAX_POWER_SHIFT; + + ppd->config_from_scratch = true; +} + +void get_platform_config(struct hfi1_devdata *dd) +{ + int ret = 0; + u8 *temp_platform_config = NULL; + u32 esize; + const struct firmware *platform_config_file = NULL; + + if (is_integrated(dd)) { + if (validate_scratch_checksum(dd)) { + save_platform_config_fields(dd); + return; + } + } else { + ret = eprom_read_platform_config(dd, + (void **)&temp_platform_config, + &esize); + if (!ret) { + /* success */ + dd->platform_config.data = temp_platform_config; + dd->platform_config.size = esize; + return; + } + } + dd_dev_err(dd, + "%s: Failed to get platform config, falling back to sub-optimal default file\n", + __func__); + + ret = request_firmware(&platform_config_file, + DEFAULT_PLATFORM_CONFIG_NAME, + &dd->pcidev->dev); + if (ret) { + dd_dev_err(dd, + "%s: No default platform config file found\n", + __func__); + return; + } + + /* + * Allocate separate memory block to store data and free firmware + * structure. This allows free_platform_config to treat EPROM and + * fallback configs in the same manner. + */ + dd->platform_config.data = kmemdup(platform_config_file->data, + platform_config_file->size, + GFP_KERNEL); + dd->platform_config.size = platform_config_file->size; + release_firmware(platform_config_file); +} + +void free_platform_config(struct hfi1_devdata *dd) +{ + /* Release memory allocated for eprom or fallback file read. */ + kfree(dd->platform_config.data); + dd->platform_config.data = NULL; +} + +void get_port_type(struct hfi1_pportdata *ppd) +{ + int ret; + u32 temp; + + ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_PORT_TYPE, &temp, + 4); + if (ret) { + ppd->port_type = PORT_TYPE_UNKNOWN; + return; + } + ppd->port_type = temp; +} + +int set_qsfp_tx(struct hfi1_pportdata *ppd, int on) +{ + u8 tx_ctrl_byte = on ? 
0x0 : 0xF; + int ret = 0; + + ret = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_TX_CTRL_BYTE_OFFS, + &tx_ctrl_byte, 1); + /* we expected 1, so consider 0 an error */ + if (ret == 0) + ret = -EIO; + else if (ret == 1) + ret = 0; + return ret; +} + +static int qual_power(struct hfi1_pportdata *ppd) +{ + u32 cable_power_class = 0, power_class_max = 0; + u8 *cache = ppd->qsfp_info.cache; + int ret = 0; + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0, + SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, &power_class_max, 4); + if (ret) + return ret; + + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class > power_class_max) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) { + dd_dev_err( + ppd->dd, + "%s: Port disabled due to system power restrictions\n", + __func__); + ret = -EPERM; + } + return ret; +} + +static int qual_bitrate(struct hfi1_pportdata *ppd) +{ + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + u8 *cache = ppd->qsfp_info.cache; + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G) && + cache[QSFP_NOM_BIT_RATE_250_OFFS] < 0x64) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY); + + if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G) && + cache[QSFP_NOM_BIT_RATE_100_OFFS] < 0x7D) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY); + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) { + dd_dev_err( + ppd->dd, + "%s: Cable failed bitrate check, disabling port\n", + __func__); + return -EPERM; + } + return 0; +} + +static int set_qsfp_high_power(struct hfi1_pportdata *ppd) +{ + u8 cable_power_class = 0, power_ctrl_byte = 0; + u8 *cache = ppd->qsfp_info.cache; + int ret; + + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class > QSFP_POWER_CLASS_1) { + power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS]; + + power_ctrl_byte |= 1; + power_ctrl_byte &= ~(0x2); + + ret = qsfp_write(ppd, ppd->dd->hfi1_id, + QSFP_PWR_CTRL_BYTE_OFFS, + &power_ctrl_byte, 1); + if (ret != 1) + return -EIO; + + if (cable_power_class > QSFP_POWER_CLASS_4) { + power_ctrl_byte |= (1 << 2); + ret = qsfp_write(ppd, ppd->dd->hfi1_id, + QSFP_PWR_CTRL_BYTE_OFFS, + &power_ctrl_byte, 1); + if (ret != 1) + return -EIO; + } + + /* SFF 8679 rev 1.7 LPMode Deassert time */ + msleep(300); + } + return 0; +} + +static void apply_rx_cdr(struct hfi1_pportdata *ppd, + u32 rx_preset_index, + u8 *cdr_ctrl_byte) +{ + u32 rx_preset; + u8 *cache = ppd->qsfp_info.cache; + int cable_power_class; + + if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) && + (cache[QSFP_CDR_INFO_OFFS] & 0x40))) + return; + + /* RX CDR present, bypass supported */ + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= QSFP_POWER_CLASS_3) { + /* Power class <= 3, ignore config & turn RX CDR on */ + *cdr_ctrl_byte |= 0xF; + return; + } + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info( + ppd->dd, + "%s: RX_CDR_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR, + &rx_preset, 4); + + /* Expand cdr setting 
to all 4 lanes */ + rx_preset = (rx_preset | (rx_preset << 1) | + (rx_preset << 2) | (rx_preset << 3)); + + if (rx_preset) { + *cdr_ctrl_byte |= rx_preset; + } else { + *cdr_ctrl_byte &= rx_preset; + /* Preserve current TX CDR status */ + *cdr_ctrl_byte |= (cache[QSFP_CDR_CTRL_BYTE_OFFS] & 0xF0); + } +} + +static void apply_tx_cdr(struct hfi1_pportdata *ppd, + u32 tx_preset_index, + u8 *cdr_ctrl_byte) +{ + u32 tx_preset; + u8 *cache = ppd->qsfp_info.cache; + int cable_power_class; + + if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) && + (cache[QSFP_CDR_INFO_OFFS] & 0x80))) + return; + + /* TX CDR present, bypass supported */ + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= QSFP_POWER_CLASS_3) { + /* Power class <= 3, ignore config & turn TX CDR on */ + *cdr_ctrl_byte |= 0xF0; + return; + } + + get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index, + TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, &tx_preset, 4); + + if (!tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX_CDR_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, + TX_PRESET_TABLE_QSFP_TX_CDR, &tx_preset, 4); + + /* Expand cdr setting to all 4 lanes */ + tx_preset = (tx_preset | (tx_preset << 1) | + (tx_preset << 2) | (tx_preset << 3)); + + if (tx_preset) + *cdr_ctrl_byte |= (tx_preset << 4); + else + /* Preserve current/determined RX CDR status */ + *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF); +} + +static void apply_cdr_settings( + struct hfi1_pportdata *ppd, u32 rx_preset_index, + u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS]; + + apply_rx_cdr(ppd, rx_preset_index, &cdr_ctrl_byte); + + apply_tx_cdr(ppd, tx_preset_index, &cdr_ctrl_byte); + + qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS, + &cdr_ctrl_byte, 1); +} + +static void apply_tx_eq_auto(struct hfi1_pportdata *ppd) +{ + u8 *cache = ppd->qsfp_info.cache; + u8 tx_eq; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x8)) + return; + /* Disable adaptive TX EQ if present */ + tx_eq = cache[(128 * 3) + 241]; + tx_eq &= 0xF0; + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 241, &tx_eq, 1); +} + +static void apply_tx_eq_prog(struct hfi1_pportdata *ppd, u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + u32 tx_preset; + u8 tx_eq; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x4)) + return; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ_APPLY, + &tx_preset, 4); + if (!tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX_EQ_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ, + &tx_preset, 4); + + if (((cache[(128 * 3) + 224] & 0xF0) >> 4) < tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX EQ %x unsupported\n", + __func__, tx_preset); + + dd_dev_info( + ppd->dd, + "%s: Applying EQ %x\n", + __func__, cache[608] & 0xF0); + + tx_preset = (cache[608] & 0xF0) >> 4; + } + + tx_eq = tx_preset | (tx_preset << 4); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 234, &tx_eq, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 235, &tx_eq, 1); +} + +static void apply_rx_eq_emp(struct hfi1_pportdata *ppd, u32 rx_preset_index) +{ + u32 rx_preset; + u8 rx_eq, *cache = ppd->qsfp_info.cache; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x2)) + return; + get_platform_config_field( + ppd->dd, 
PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info( + ppd->dd, + "%s: RX_EMP_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP, + &rx_preset, 4); + + if ((cache[(128 * 3) + 224] & 0xF) < rx_preset) { + dd_dev_info( + ppd->dd, + "%s: Requested RX EMP %x\n", + __func__, rx_preset); + + dd_dev_info( + ppd->dd, + "%s: Applying supported EMP %x\n", + __func__, cache[608] & 0xF); + + rx_preset = cache[608] & 0xF; + } + + rx_eq = rx_preset | (rx_preset << 4); + + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 236, &rx_eq, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 237, &rx_eq, 1); +} + +static void apply_eq_settings(struct hfi1_pportdata *ppd, + u32 rx_preset_index, u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + + /* no point going on w/o a page 3 */ + if (cache[2] & 4) { + dd_dev_info(ppd->dd, + "%s: Upper page 03 not present\n", + __func__); + return; + } + + apply_tx_eq_auto(ppd); + + apply_tx_eq_prog(ppd, tx_preset_index); + + apply_rx_eq_emp(ppd, rx_preset_index); +} + +static void apply_rx_amplitude_settings( + struct hfi1_pportdata *ppd, u32 rx_preset_index, + u32 tx_preset_index) +{ + u32 rx_preset; + u8 rx_amp = 0, i = 0, preferred = 0, *cache = ppd->qsfp_info.cache; + + /* no point going on w/o a page 3 */ + if (cache[2] & 4) { + dd_dev_info(ppd->dd, + "%s: Upper page 03 not present\n", + __func__); + return; + } + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x1)) { + dd_dev_info(ppd->dd, + "%s: RX_AMP_APPLY is set to disabled\n", + __func__); + return; + } + + get_platform_config_field(ppd->dd, + PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, + RX_PRESET_TABLE_QSFP_RX_AMP_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info(ppd->dd, + "%s: RX_AMP_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field(ppd->dd, + PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, + RX_PRESET_TABLE_QSFP_RX_AMP, + &rx_preset, 4); + + dd_dev_info(ppd->dd, + "%s: Requested RX AMP %x\n", + __func__, + rx_preset); + + for (i = 0; i < 4; i++) { + if (cache[(128 * 3) + 225] & (1 << i)) { + preferred = i; + if (preferred == rx_preset) + break; + } + } + + /* + * Verify that preferred RX amplitude is not just a + * fall through of the default + */ + if (!preferred && !(cache[(128 * 3) + 225] & 0x1)) { + dd_dev_info(ppd->dd, "No supported RX AMP, not applying\n"); + return; + } + + dd_dev_info(ppd->dd, + "%s: Applying RX AMP %x\n", __func__, preferred); + + rx_amp = preferred | (preferred << 4); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 238, &rx_amp, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 239, &rx_amp, 1); +} + +#define OPA_INVALID_INDEX 0xFFF + +static void apply_tx_lanes(struct hfi1_pportdata *ppd, u8 field_id, + u32 config_data, const char *message) +{ + u8 i; + int ret = HCMD_SUCCESS; + + for (i = 0; i < 4; i++) { + ret = load_8051_config(ppd->dd, field_id, i, config_data); + if (ret != HCMD_SUCCESS) { + dd_dev_err( + ppd->dd, + "%s: %s for lane %u failed\n", + message, __func__, i); + } + } +} + +/* + * Return a special SerDes setting for low power AOC cables. The power class + * threshold and setting being used were all found by empirical testing. 
+ * + * Summary of the logic: + * + * if (QSFP and QSFP_TYPE == AOC and QSFP_POWER_CLASS < 4) + * return 0xe + * return 0; // leave at default + */ +static u8 aoc_low_power_setting(struct hfi1_pportdata *ppd) +{ + u8 *cache = ppd->qsfp_info.cache; + int power_class; + + /* QSFP only */ + if (ppd->port_type != PORT_TYPE_QSFP) + return 0; /* leave at default */ + + /* active optical cables only */ + switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) { + case 0x0 ... 0x9: /* fallthrough */ + case 0xC: /* fallthrough */ + case 0xE: + /* active AOC */ + power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + if (power_class < QSFP_POWER_CLASS_4) + return 0xe; + } + return 0; /* leave at default */ +} + +static void apply_tunings( + struct hfi1_pportdata *ppd, u32 tx_preset_index, + u8 tuning_method, u32 total_atten, u8 limiting_active) +{ + int ret = 0; + u32 config_data = 0, tx_preset = 0; + u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0; + u8 *cache = ppd->qsfp_info.cache; + + /* Pass tuning method to 8051 */ + read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, + &config_data); + config_data &= ~(0xff << TUNING_METHOD_SHIFT); + config_data |= ((u32)tuning_method << TUNING_METHOD_SHIFT); + ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, + config_data); + if (ret != HCMD_SUCCESS) + dd_dev_err(ppd->dd, "%s: Failed to set tuning method\n", + __func__); + + /* Set same channel loss for both TX and RX */ + config_data = 0 | (total_atten << 16) | (total_atten << 24); + apply_tx_lanes(ppd, CHANNEL_LOSS_SETTINGS, config_data, + "Setting channel loss"); + + /* Inform 8051 of cable capabilities */ + if (ppd->qsfp_info.cache_valid) { + external_device_config = + ((cache[QSFP_MOD_PWR_OFFS] & 0x4) << 3) | + ((cache[QSFP_MOD_PWR_OFFS] & 0x8) << 2) | + ((cache[QSFP_EQ_INFO_OFFS] & 0x2) << 1) | + (cache[QSFP_EQ_INFO_OFFS] & 0x4); + ret = read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, + GENERAL_CONFIG, &config_data); + /* Clear, then set the external device config field */ + config_data &= ~(u32)0xFF; + config_data |= external_device_config; + ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, + GENERAL_CONFIG, config_data); + if (ret != HCMD_SUCCESS) + dd_dev_err(ppd->dd, + "%s: Failed set ext device config params\n", + __func__); + } + + if (tx_preset_index == OPA_INVALID_INDEX) { + if (ppd->port_type == PORT_TYPE_QSFP && limiting_active) + dd_dev_err(ppd->dd, "%s: Invalid Tx preset index\n", + __func__); + return; + } + + /* Following for limiting active channels only */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index, + TX_PRESET_TABLE_PRECUR, &tx_preset, 4); + precur = tx_preset; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_ATTN, &tx_preset, 4); + attn = tx_preset; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_POSTCUR, &tx_preset, 4); + postcur = tx_preset; + + /* + * NOTES: + * o The aoc_low_power_setting is applied to all lanes even + * though only lane 0's value is examined by the firmware. + * o A lingering low power setting after a cable swap does + * not occur. On cable unplug the 8051 is reset and + * restarted on cable insert. This resets all settings to + * their default, erasing any previous low power setting. 
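+ *
+ * The 32-bit value built below packs one byte per field:
+ * precur in bits [7:0], attn in [15:8], postcur in [23:16] and the
+ * AOC low power setting in [31:24].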
+ */ + config_data = precur | (attn << 8) | (postcur << 16) | + (aoc_low_power_setting(ppd) << 24); + + apply_tx_lanes(ppd, TX_EQ_SETTINGS, config_data, + "Applying TX settings"); +} + +/* Must be holding the QSFP i2c resource */ +static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, + u32 *ptr_rx_preset, u32 *ptr_total_atten) +{ + int ret; + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + u8 *cache = ppd->qsfp_info.cache; + + ppd->qsfp_info.limiting_active = 1; + + ret = set_qsfp_tx(ppd, 0); + if (ret) + return ret; + + ret = qual_power(ppd); + if (ret) + return ret; + + ret = qual_bitrate(ppd); + if (ret) + return ret; + + /* + * We'll change the QSFP memory contents from here on out, thus we set a + * flag here to remind ourselves to reset the QSFP module. This prevents + * reuse of stale settings established in our previous pass through. + */ + if (ppd->qsfp_info.reset_needed) { + ret = reset_qsfp(ppd); + if (ret) + return ret; + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + } else { + ppd->qsfp_info.reset_needed = 1; + } + + ret = set_qsfp_high_power(ppd); + if (ret) + return ret; + + if (cache[QSFP_EQ_INFO_OFFS] & 0x4) { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ, + ptr_tx_preset, 4); + if (ret) { + *ptr_tx_preset = OPA_INVALID_INDEX; + return ret; + } + } else { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ, + ptr_tx_preset, 4); + if (ret) { + *ptr_tx_preset = OPA_INVALID_INDEX; + return ret; + } + } + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4); + if (ret) { + *ptr_rx_preset = OPA_INVALID_INDEX; + return ret; + } + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G)) + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, ptr_total_atten, 4); + else if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G)) + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_12G, ptr_total_atten, 4); + + apply_cdr_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + apply_eq_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + apply_rx_amplitude_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + ret = set_qsfp_tx(ppd, 1); + + return ret; +} + +static int tune_qsfp(struct hfi1_pportdata *ppd, + u32 *ptr_tx_preset, u32 *ptr_rx_preset, + u8 *ptr_tuning_method, u32 *ptr_total_atten) +{ + u32 cable_atten = 0, remote_atten = 0, platform_atten = 0; + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + int ret = 0; + u8 *cache = ppd->qsfp_info.cache; + + switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) { + case 0xA ... 
0xB: + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, + &platform_atten, 4); + if (ret) + return ret; + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G)) + cable_atten = cache[QSFP_CU_ATTEN_12G_OFFS]; + else if ((lss & OPA_LINK_SPEED_12_5G) && + (lse & OPA_LINK_SPEED_12_5G)) + cable_atten = cache[QSFP_CU_ATTEN_7G_OFFS]; + + /* Fallback to configured attenuation if cable memory is bad */ + if (cable_atten == 0 || cable_atten > 36) { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_SYSTEM_TABLE, 0, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G, + &cable_atten, 4); + if (ret) + return ret; + } + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4); + if (ret) + return ret; + + *ptr_total_atten = platform_atten + cable_atten + remote_atten; + + *ptr_tuning_method = OPA_PASSIVE_TUNING; + break; + case 0x0 ... 0x9: /* fallthrough */ + case 0xC: /* fallthrough */ + case 0xE: + ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset, + ptr_total_atten); + if (ret) + return ret; + + *ptr_tuning_method = OPA_ACTIVE_TUNING; + break; + case 0xD: /* fallthrough */ + case 0xF: + default: + dd_dev_warn(ppd->dd, "%s: Unknown/unsupported cable\n", + __func__); + break; + } + return ret; +} + +/* + * This function communicates its success or failure via ppd->driver_link_ready + * Thus, it depends on its association with start_link(...) which checks + * driver_link_ready before proceeding with the link negotiation and + * initialization process. + */ +void tune_serdes(struct hfi1_pportdata *ppd) +{ + int ret = 0; + u32 total_atten = 0; + u32 remote_atten = 0, platform_atten = 0; + u32 rx_preset_index, tx_preset_index; + u8 tuning_method = 0, limiting_active = 0; + struct hfi1_devdata *dd = ppd->dd; + + rx_preset_index = OPA_INVALID_INDEX; + tx_preset_index = OPA_INVALID_INDEX; + + /* the link defaults to enabled */ + ppd->link_enabled = 1; + /* the driver link ready state defaults to not ready */ + ppd->driver_link_ready = 0; + ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); + + /* Skip the tuning for testing (loopback != none) and simulations */ + if (loopback != LOOPBACK_NONE || + ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + ppd->driver_link_ready = 1; + + if (qsfp_mod_present(ppd)) { + ret = acquire_chip_resource(ppd->dd, + qsfp_resource(ppd->dd), + QSFP_WAIT); + if (ret) { + dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", + __func__, (int)ppd->dd->hfi1_id); + goto bail; + } + + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); + } + + return; + } + + switch (ppd->port_type) { + case PORT_TYPE_DISCONNECTED: + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED); + dd_dev_warn(dd, "%s: Port disconnected, disabling port\n", + __func__); + goto bail; + case PORT_TYPE_FIXED: + /* platform_atten, remote_atten pre-zeroed to catch error */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, &platform_atten, 4); + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4); + + total_atten = platform_atten + remote_atten; + + tuning_method = OPA_PASSIVE_TUNING; + break; + case PORT_TYPE_VARIABLE: + if (qsfp_mod_present(ppd)) { + /* + * platform_atten, remote_atten pre-zeroed to + * catch error + */ + get_platform_config_field( + 
ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, + &platform_atten, 4); + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, + &remote_atten, 4); + + total_atten = platform_atten + remote_atten; + + tuning_method = OPA_PASSIVE_TUNING; + } else { + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG); + goto bail; + } + break; + case PORT_TYPE_QSFP: + if (qsfp_mod_present(ppd)) { + ret = acquire_chip_resource(ppd->dd, + qsfp_resource(ppd->dd), + QSFP_WAIT); + if (ret) { + dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", + __func__, (int)ppd->dd->hfi1_id); + goto bail; + } + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + + if (ppd->qsfp_info.cache_valid) { + ret = tune_qsfp(ppd, + &tx_preset_index, + &rx_preset_index, + &tuning_method, + &total_atten); + + /* + * We may have modified the QSFP memory, so + * update the cache to reflect the changes + */ + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + limiting_active = + ppd->qsfp_info.limiting_active; + } else { + dd_dev_err(dd, + "%s: Reading QSFP memory failed\n", + __func__); + ret = -EINVAL; /* a fail indication */ + } + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); + if (ret) + goto bail; + } else { + ppd->offline_disabled_reason = + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED); + goto bail; + } + break; + default: + dd_dev_warn(ppd->dd, "%s: Unknown port type\n", __func__); + ppd->port_type = PORT_TYPE_UNKNOWN; + tuning_method = OPA_UNKNOWN_TUNING; + total_atten = 0; + limiting_active = 0; + tx_preset_index = OPA_INVALID_INDEX; + break; + } + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)) + apply_tunings(ppd, tx_preset_index, tuning_method, + total_atten, limiting_active); + + if (!ret) + ppd->driver_link_ready = 1; + + return; +bail: + ppd->driver_link_ready = 0; +} diff --git a/drivers/infiniband/hw/hfi1/platform.h b/drivers/infiniband/hw/hfi1/platform.h new file mode 100644 index 0000000..eed0aa9 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/platform.h @@ -0,0 +1,412 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef __PLATFORM_H +#define __PLATFORM_H + +#define METADATA_TABLE_FIELD_START_SHIFT 0 +#define METADATA_TABLE_FIELD_START_LEN_BITS 15 +#define METADATA_TABLE_FIELD_LEN_SHIFT 16 +#define METADATA_TABLE_FIELD_LEN_LEN_BITS 16 + +/* Header structure */ +#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT 0 +#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS 6 +#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT 16 +#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS 12 +#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT 28 +#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS 4 + +enum platform_config_table_type_encoding { + PLATFORM_CONFIG_TABLE_RESERVED, + PLATFORM_CONFIG_SYSTEM_TABLE, + PLATFORM_CONFIG_PORT_TABLE, + PLATFORM_CONFIG_RX_PRESET_TABLE, + PLATFORM_CONFIG_TX_PRESET_TABLE, + PLATFORM_CONFIG_QSFP_ATTEN_TABLE, + PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE, + PLATFORM_CONFIG_TABLE_MAX +}; + +enum platform_config_system_table_fields { + SYSTEM_TABLE_RESERVED, + SYSTEM_TABLE_NODE_STRING, + SYSTEM_TABLE_SYSTEM_IMAGE_GUID, + SYSTEM_TABLE_NODE_GUID, + SYSTEM_TABLE_REVISION, + SYSTEM_TABLE_VENDOR_OUI, + SYSTEM_TABLE_META_VERSION, + SYSTEM_TABLE_DEVICE_ID, + SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP, + SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G, + SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT, + SYSTEM_TABLE_MAX +}; + +enum platform_config_port_table_fields { + PORT_TABLE_RESERVED, + PORT_TABLE_PORT_TYPE, + PORT_TABLE_LOCAL_ATTEN_12G, + PORT_TABLE_LOCAL_ATTEN_25G, + PORT_TABLE_LINK_SPEED_SUPPORTED, + PORT_TABLE_LINK_WIDTH_SUPPORTED, + PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED, + PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED, + PORT_TABLE_VL_CAP, + PORT_TABLE_MTU_CAP, + PORT_TABLE_TX_LANE_ENABLE_MASK, + PORT_TABLE_LOCAL_MAX_TIMEOUT, + PORT_TABLE_REMOTE_ATTEN_12G, + PORT_TABLE_REMOTE_ATTEN_25G, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ, + PORT_TABLE_RX_PRESET_IDX, + PORT_TABLE_CABLE_REACH_CLASS, + PORT_TABLE_MAX +}; + +enum platform_config_rx_preset_table_fields { + RX_PRESET_TABLE_RESERVED, + RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, + RX_PRESET_TABLE_QSFP_RX_EMP_APPLY, + RX_PRESET_TABLE_QSFP_RX_AMP_APPLY, + RX_PRESET_TABLE_QSFP_RX_CDR, + RX_PRESET_TABLE_QSFP_RX_EMP, + RX_PRESET_TABLE_QSFP_RX_AMP, + RX_PRESET_TABLE_MAX +}; + +enum platform_config_tx_preset_table_fields { + TX_PRESET_TABLE_RESERVED, + TX_PRESET_TABLE_PRECUR, + TX_PRESET_TABLE_ATTN, + TX_PRESET_TABLE_POSTCUR, + TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, + TX_PRESET_TABLE_QSFP_TX_EQ_APPLY, + TX_PRESET_TABLE_QSFP_TX_CDR, + TX_PRESET_TABLE_QSFP_TX_EQ, + TX_PRESET_TABLE_MAX +}; + +enum platform_config_qsfp_attn_table_fields { + QSFP_ATTEN_TABLE_RESERVED, + 
QSFP_ATTEN_TABLE_TX_PRESET_IDX, + QSFP_ATTEN_TABLE_RX_PRESET_IDX, + QSFP_ATTEN_TABLE_MAX +}; + +enum platform_config_variable_settings_table_fields { + VARIABLE_SETTINGS_TABLE_RESERVED, + VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX, + VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX, + VARIABLE_SETTINGS_TABLE_MAX +}; + +struct platform_config { + size_t size; + const u8 *data; +}; + +struct platform_config_data { + u32 *table; + u32 *table_metadata; + u32 num_table; +}; + +/* + * This struct acts as a quick reference into the platform_data binary image + * and is populated by parse_platform_config(...) depending on the specific + * META_VERSION + */ +struct platform_config_cache { + u8 cache_valid; + struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX]; +}; + +/* This section defines default values and encodings for the + * fields defined for each table above + */ + +/* + * ===================================================== + * System table encodings + * ===================================================== + */ +#define PLATFORM_CONFIG_MAGIC_NUM 0x3d4f5041 +#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN 4 + +/* + * These power classes are the same as defined in SFF 8636 spec rev 2.4 + * describing byte 129 in table 6-16, except enumerated in a different order + */ +enum platform_config_qsfp_power_class_encoding { + QSFP_POWER_CLASS_1 = 1, + QSFP_POWER_CLASS_2, + QSFP_POWER_CLASS_3, + QSFP_POWER_CLASS_4, + QSFP_POWER_CLASS_5, + QSFP_POWER_CLASS_6, + QSFP_POWER_CLASS_7 +}; + +/* + * ==================================================== + * Port table encodings + * ==================================================== + */ +enum platform_config_port_type_encoding { + PORT_TYPE_UNKNOWN, + PORT_TYPE_DISCONNECTED, + PORT_TYPE_FIXED, + PORT_TYPE_VARIABLE, + PORT_TYPE_QSFP, + PORT_TYPE_MAX +}; + +enum platform_config_link_speed_supported_encoding { + LINK_SPEED_SUPP_12G = 1, + LINK_SPEED_SUPP_25G, + LINK_SPEED_SUPP_12G_25G, + LINK_SPEED_SUPP_MAX +}; + +/* + * This is a subset (not strict) of the link downgrades + * supported. 
The link downgrades supported are expected + * to be supplied to the driver by another entity such as + * the fabric manager + */ +enum platform_config_link_width_supported_encoding { + LINK_WIDTH_SUPP_1X = 1, + LINK_WIDTH_SUPP_2X, + LINK_WIDTH_SUPP_2X_1X, + LINK_WIDTH_SUPP_3X, + LINK_WIDTH_SUPP_3X_1X, + LINK_WIDTH_SUPP_3X_2X, + LINK_WIDTH_SUPP_3X_2X_1X, + LINK_WIDTH_SUPP_4X, + LINK_WIDTH_SUPP_4X_1X, + LINK_WIDTH_SUPP_4X_2X, + LINK_WIDTH_SUPP_4X_2X_1X, + LINK_WIDTH_SUPP_4X_3X, + LINK_WIDTH_SUPP_4X_3X_1X, + LINK_WIDTH_SUPP_4X_3X_2X, + LINK_WIDTH_SUPP_4X_3X_2X_1X, + LINK_WIDTH_SUPP_MAX +}; + +enum platform_config_virtual_lane_capability_encoding { + VL_CAP_VL0 = 1, + VL_CAP_VL0_1, + VL_CAP_VL0_2, + VL_CAP_VL0_3, + VL_CAP_VL0_4, + VL_CAP_VL0_5, + VL_CAP_VL0_6, + VL_CAP_VL0_7, + VL_CAP_VL0_8, + VL_CAP_VL0_9, + VL_CAP_VL0_10, + VL_CAP_VL0_11, + VL_CAP_VL0_12, + VL_CAP_VL0_13, + VL_CAP_VL0_14, + VL_CAP_MAX +}; + +/* Max MTU */ +enum platform_config_mtu_capability_encoding { + MTU_CAP_256 = 1, + MTU_CAP_512 = 2, + MTU_CAP_1024 = 3, + MTU_CAP_2048 = 4, + MTU_CAP_4096 = 5, + MTU_CAP_8192 = 6, + MTU_CAP_10240 = 7 +}; + +enum platform_config_local_max_timeout_encoding { + LOCAL_MAX_TIMEOUT_10_MS = 1, + LOCAL_MAX_TIMEOUT_100_MS, + LOCAL_MAX_TIMEOUT_1_S, + LOCAL_MAX_TIMEOUT_10_S, + LOCAL_MAX_TIMEOUT_100_S, + LOCAL_MAX_TIMEOUT_1000_S +}; + +enum link_tuning_encoding { + OPA_PASSIVE_TUNING, + OPA_ACTIVE_TUNING, + OPA_UNKNOWN_TUNING +}; + +/* + * Shifts and masks for the link SI tuning values stuffed into the ASIC scratch + * registers for integrated platforms + */ +#define PORT0_PORT_TYPE_SHIFT 0 +#define PORT0_LOCAL_ATTEN_SHIFT 4 +#define PORT0_REMOTE_ATTEN_SHIFT 10 +#define PORT0_DEFAULT_ATTEN_SHIFT 32 + +#define PORT1_PORT_TYPE_SHIFT 16 +#define PORT1_LOCAL_ATTEN_SHIFT 20 +#define PORT1_REMOTE_ATTEN_SHIFT 26 +#define PORT1_DEFAULT_ATTEN_SHIFT 40 + +#define PORT0_PORT_TYPE_MASK 0xFUL +#define PORT0_LOCAL_ATTEN_MASK 0x3FUL +#define PORT0_REMOTE_ATTEN_MASK 0x3FUL +#define PORT0_DEFAULT_ATTEN_MASK 0xFFUL + +#define PORT1_PORT_TYPE_MASK 0xFUL +#define PORT1_LOCAL_ATTEN_MASK 0x3FUL +#define PORT1_REMOTE_ATTEN_MASK 0x3FUL +#define PORT1_DEFAULT_ATTEN_MASK 0xFFUL + +#define PORT0_PORT_TYPE_SMASK (PORT0_PORT_TYPE_MASK << \ + PORT0_PORT_TYPE_SHIFT) +#define PORT0_LOCAL_ATTEN_SMASK (PORT0_LOCAL_ATTEN_MASK << \ + PORT0_LOCAL_ATTEN_SHIFT) +#define PORT0_REMOTE_ATTEN_SMASK (PORT0_REMOTE_ATTEN_MASK << \ + PORT0_REMOTE_ATTEN_SHIFT) +#define PORT0_DEFAULT_ATTEN_SMASK (PORT0_DEFAULT_ATTEN_MASK << \ + PORT0_DEFAULT_ATTEN_SHIFT) + +#define PORT1_PORT_TYPE_SMASK (PORT1_PORT_TYPE_MASK << \ + PORT1_PORT_TYPE_SHIFT) +#define PORT1_LOCAL_ATTEN_SMASK (PORT1_LOCAL_ATTEN_MASK << \ + PORT1_LOCAL_ATTEN_SHIFT) +#define PORT1_REMOTE_ATTEN_SMASK (PORT1_REMOTE_ATTEN_MASK << \ + PORT1_REMOTE_ATTEN_SHIFT) +#define PORT1_DEFAULT_ATTEN_SMASK (PORT1_DEFAULT_ATTEN_MASK << \ + PORT1_DEFAULT_ATTEN_SHIFT) + +#define QSFP_MAX_POWER_SHIFT 0 +#define TX_NO_EQ_SHIFT 4 +#define TX_EQ_SHIFT 25 +#define RX_SHIFT 46 + +#define QSFP_MAX_POWER_MASK 0xFUL +#define TX_NO_EQ_MASK 0x1FFFFFUL +#define TX_EQ_MASK 0x1FFFFFUL +#define RX_MASK 0xFFFFUL + +#define QSFP_MAX_POWER_SMASK (QSFP_MAX_POWER_MASK << \ + QSFP_MAX_POWER_SHIFT) +#define TX_NO_EQ_SMASK (TX_NO_EQ_MASK << TX_NO_EQ_SHIFT) +#define TX_EQ_SMASK (TX_EQ_MASK << TX_EQ_SHIFT) +#define RX_SMASK (RX_MASK << RX_SHIFT) + +#define TX_PRECUR_SHIFT 0 +#define TX_ATTN_SHIFT 4 +#define QSFP_TX_CDR_APPLY_SHIFT 9 +#define QSFP_TX_EQ_APPLY_SHIFT 10 +#define QSFP_TX_CDR_SHIFT 11 +#define QSFP_TX_EQ_SHIFT 12 
+#define TX_POSTCUR_SHIFT 16 + +#define TX_PRECUR_MASK 0xFUL +#define TX_ATTN_MASK 0x1FUL +#define QSFP_TX_CDR_APPLY_MASK 0x1UL +#define QSFP_TX_EQ_APPLY_MASK 0x1UL +#define QSFP_TX_CDR_MASK 0x1UL +#define QSFP_TX_EQ_MASK 0xFUL +#define TX_POSTCUR_MASK 0x1FUL + +#define TX_PRECUR_SMASK (TX_PRECUR_MASK << TX_PRECUR_SHIFT) +#define TX_ATTN_SMASK (TX_ATTN_MASK << TX_ATTN_SHIFT) +#define QSFP_TX_CDR_APPLY_SMASK (QSFP_TX_CDR_APPLY_MASK << \ + QSFP_TX_CDR_APPLY_SHIFT) +#define QSFP_TX_EQ_APPLY_SMASK (QSFP_TX_EQ_APPLY_MASK << \ + QSFP_TX_EQ_APPLY_SHIFT) +#define QSFP_TX_CDR_SMASK (QSFP_TX_CDR_MASK << QSFP_TX_CDR_SHIFT) +#define QSFP_TX_EQ_SMASK (QSFP_TX_EQ_MASK << QSFP_TX_EQ_SHIFT) +#define TX_POSTCUR_SMASK (TX_POSTCUR_MASK << TX_POSTCUR_SHIFT) + +#define QSFP_RX_CDR_APPLY_SHIFT 0 +#define QSFP_RX_EMP_APPLY_SHIFT 1 +#define QSFP_RX_AMP_APPLY_SHIFT 2 +#define QSFP_RX_CDR_SHIFT 3 +#define QSFP_RX_EMP_SHIFT 4 +#define QSFP_RX_AMP_SHIFT 8 + +#define QSFP_RX_CDR_APPLY_MASK 0x1UL +#define QSFP_RX_EMP_APPLY_MASK 0x1UL +#define QSFP_RX_AMP_APPLY_MASK 0x1UL +#define QSFP_RX_CDR_MASK 0x1UL +#define QSFP_RX_EMP_MASK 0xFUL +#define QSFP_RX_AMP_MASK 0x3UL + +#define QSFP_RX_CDR_APPLY_SMASK (QSFP_RX_CDR_APPLY_MASK << \ + QSFP_RX_CDR_APPLY_SHIFT) +#define QSFP_RX_EMP_APPLY_SMASK (QSFP_RX_EMP_APPLY_MASK << \ + QSFP_RX_EMP_APPLY_SHIFT) +#define QSFP_RX_AMP_APPLY_SMASK (QSFP_RX_AMP_APPLY_MASK << \ + QSFP_RX_AMP_APPLY_SHIFT) +#define QSFP_RX_CDR_SMASK (QSFP_RX_CDR_MASK << QSFP_RX_CDR_SHIFT) +#define QSFP_RX_EMP_SMASK (QSFP_RX_EMP_MASK << QSFP_RX_EMP_SHIFT) +#define QSFP_RX_AMP_SMASK (QSFP_RX_AMP_MASK << QSFP_RX_AMP_SHIFT) + +#define BITMAP_VERSION 1 +#define BITMAP_VERSION_SHIFT 44 +#define BITMAP_VERSION_MASK 0xFUL +#define BITMAP_VERSION_SMASK (BITMAP_VERSION_MASK << \ + BITMAP_VERSION_SHIFT) +#define CHECKSUM_SHIFT 48 +#define CHECKSUM_MASK 0xFFFFUL +#define CHECKSUM_SMASK (CHECKSUM_MASK << CHECKSUM_SHIFT) + +/* platform.c */ +void get_platform_config(struct hfi1_devdata *dd); +void free_platform_config(struct hfi1_devdata *dd); +void get_port_type(struct hfi1_pportdata *ppd); +int set_qsfp_tx(struct hfi1_pportdata *ppd, int on); +void tune_serdes(struct hfi1_pportdata *ppd); + +#endif /*__PLATFORM_H*/ diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c new file mode 100644 index 0000000..1697d96 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -0,0 +1,858 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "qp.h" +#include "trace.h" +#include "verbs_txreq.h" + +unsigned int hfi1_qp_table_size = 256; +module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +static void flush_tx_list(struct rvt_qp *qp); +static int iowait_sleep( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *stx, + unsigned int seq, + bool pkts_sent); +static void iowait_wakeup(struct iowait *wait, int reason); +static void iowait_sdma_drained(struct iowait *wait); +static void qp_pio_drain(struct rvt_qp *qp); + +const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { +[IB_WR_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_RDMA_READ] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC, +}, + +[IB_WR_ATOMIC_CMP_AND_SWP] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_ATOMIC_FETCH_AND_ADD] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_RDMA_WRITE_WITH_IMM] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND_WITH_IMM] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_REG_MR] = { + .length = sizeof(struct ib_reg_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), + .flags = RVT_OPERATION_LOCAL, +}, + +[IB_WR_LOCAL_INV] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), + .flags = RVT_OPERATION_LOCAL, +}, + +[IB_WR_SEND_WITH_INV] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_RC), +}, + +}; + +static void flush_tx_list(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = 
qp->priv; + + while (!list_empty(&priv->s_iowait.tx_head)) { + struct sdma_txreq *tx; + + tx = list_first_entry( + &priv->s_iowait.tx_head, + struct sdma_txreq, + list); + list_del_init(&tx->list); + hfi1_put_txreq( + container_of(tx, struct verbs_txreq, txreq)); + } +} + +static void flush_iowait(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + unsigned long flags; + seqlock_t *lock = priv->s_iowait.lock; + + if (!lock) + return; + write_seqlock_irqsave(lock, flags); + if (!list_empty(&priv->s_iowait.list)) { + list_del_init(&priv->s_iowait.list); + priv->s_iowait.lock = NULL; + rvt_put_qp(qp); + } + write_sequnlock_irqrestore(lock, flags); +} + +static inline int opa_mtu_enum_to_int(int mtu) +{ + switch (mtu) { + case OPA_MTU_8192: return 8192; + case OPA_MTU_10240: return 10240; + default: return -1; + } +} + +/** + * This function is what we would push to the core layer if we wanted to be a + * "first class citizen". Instead we hide this here and rely on Verbs ULPs + * to blindly pass the MTU enum value from the PathRecord to us. + */ +static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu) +{ + int val; + + /* Constraining 10KB packets to 8KB packets */ + if (mtu == (enum ib_mtu)OPA_MTU_10240) + mtu = OPA_MTU_8192; + val = opa_mtu_enum_to_int((int)mtu); + if (val > 0) + return val; + return ib_mtu_enum_to_int(mtu); +} + +int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_ibdev *dev = to_idev(ibqp->device); + struct hfi1_devdata *dd = dd_from_dev(dev); + u8 sc; + + if (attr_mask & IB_QP_AV) { + sc = ah_to_sc(ibqp->device, &attr->ah_attr); + if (sc == 0xf) + return -EINVAL; + + if (!qp_to_sdma_engine(qp, sc) && + dd->flags & HFI1_HAS_SEND_DMA) + return -EINVAL; + + if (!qp_to_send_context(qp, sc)) + return -EINVAL; + } + + if (attr_mask & IB_QP_ALT_PATH) { + sc = ah_to_sc(ibqp->device, &attr->alt_ah_attr); + if (sc == 0xf) + return -EINVAL; + + if (!qp_to_sdma_engine(qp, sc) && + dd->flags & HFI1_HAS_SEND_DMA) + return -EINVAL; + + if (!qp_to_send_context(qp, sc)) + return -EINVAL; + } + + return 0; +} + +/* + * qp_set_16b - Set the hdr_type based on whether the slid or the + * dlid in the connection is extended. Only applicable for RC and UC + * QPs. 
UD QPs determine this on the fly from the ah in the wqe + */ +static inline void qp_set_16b(struct rvt_qp *qp) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct hfi1_qp_priv *priv = qp->priv; + + /* Update ah_attr to account for extended LIDs */ + hfi1_update_ah_attr(qp->ibqp.device, &qp->remote_ah_attr); + + /* Create 32 bit LIDs */ + hfi1_make_opa_lid(&qp->remote_ah_attr); + + if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) + return; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + priv->hdr_type = hfi1_get_hdr_type(ppd->lid, &qp->remote_ah_attr); +} + +void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_qp_priv *priv = qp->priv; + + if (attr_mask & IB_QP_AV) { + priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + qp_set_16b(qp); + } + + if (attr_mask & IB_QP_PATH_MIG_STATE && + attr->path_mig_state == IB_MIG_MIGRATED && + qp->s_mig_state == IB_MIG_ARMED) { + qp->s_flags |= RVT_S_AHG_CLEAR; + priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + qp_set_16b(qp); + } +} + +/** + * hfi1_check_send_wqe - validate wqe + * @qp - The qp + * @wqe - The built wqe + * + * validate wqe. This is called + * prior to inserting the wqe into + * the ring but after the wqe has been + * setup. + * + * Returns 0 on success, -EINVAL on failure + * + */ +int hfi1_check_send_wqe(struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct rvt_ah *ah; + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + if (wqe->length > 0x80000000U) + return -EINVAL; + break; + case IB_QPT_SMI: + ah = ibah_to_rvtah(wqe->ud_wr.ah); + if (wqe->length > (1 << ah->log_pmtu)) + return -EINVAL; + break; + case IB_QPT_GSI: + case IB_QPT_UD: + ah = ibah_to_rvtah(wqe->ud_wr.ah); + if (wqe->length > (1 << ah->log_pmtu)) + return -EINVAL; + if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf) + return -EINVAL; + default: + break; + } + return wqe->length <= piothreshold; +} + +/** + * _hfi1_schedule_send - schedule progress + * @qp: the QP + * + * This schedules qp progress w/o regard to the s_flags. + * + * It is only used in the post send, which doesn't hold + * the s_lock. + */ +void _hfi1_schedule_send(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + + iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? 
+ priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); +} + +static void qp_pio_drain(struct rvt_qp *qp) +{ + struct hfi1_ibdev *dev; + struct hfi1_qp_priv *priv = qp->priv; + + if (!priv->s_sendcontext) + return; + dev = to_idev(qp->ibqp.device); + while (iowait_pio_pending(&priv->s_iowait)) { + write_seqlock_irq(&dev->iowait_lock); + hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1); + write_sequnlock_irq(&dev->iowait_lock); + iowait_pio_drain(&priv->s_iowait); + write_seqlock_irq(&dev->iowait_lock); + hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0); + write_sequnlock_irq(&dev->iowait_lock); + } +} + +/** + * hfi1_schedule_send - schedule progress + * @qp: the QP + * + * This schedules qp progress and caller should hold + * the s_lock. + */ +void hfi1_schedule_send(struct rvt_qp *qp) +{ + lockdep_assert_held(&qp->s_lock); + if (hfi1_send_ok(qp)) + _hfi1_schedule_send(qp); +} + +void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & flag) { + qp->s_flags &= ~flag; + trace_hfi1_qpwakeup(qp, flag); + hfi1_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + /* Notify hfi1_destroy_qp() if it is waiting. */ + rvt_put_qp(qp); +} + +static int iowait_sleep( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *stx, + uint seq, + bool pkts_sent) +{ + struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq); + struct rvt_qp *qp; + struct hfi1_qp_priv *priv; + unsigned long flags; + int ret = 0; + struct hfi1_ibdev *dev; + + qp = tx->qp; + priv = qp->priv; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + /* Make a common routine? */ + dev = &sde->dd->verbs_dev; + list_add_tail(&stx->list, &wait->tx_head); + write_seqlock(&dev->iowait_lock); + if (sdma_progress(sde, seq, stx)) + goto eagain; + if (list_empty(&priv->s_iowait.list)) { + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + + ibp->rvp.n_dmawait++; + qp->s_flags |= RVT_S_WAIT_DMA_DESC; + iowait_queue(pkts_sent, &priv->s_iowait, + &sde->dmawait); + priv->s_iowait.lock = &dev->iowait_lock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC); + rvt_get_qp(qp); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~RVT_S_BUSY; + spin_unlock_irqrestore(&qp->s_lock, flags); + ret = -EBUSY; + } else { + spin_unlock_irqrestore(&qp->s_lock, flags); + hfi1_put_txreq(tx); + } + return ret; +eagain: + write_sequnlock(&dev->iowait_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + list_del_init(&stx->list); + return -EAGAIN; +} + +static void iowait_wakeup(struct iowait *wait, int reason) +{ + struct rvt_qp *qp = iowait_to_qp(wait); + + WARN_ON(reason != SDMA_AVAIL_REASON); + hfi1_qp_wakeup(qp, RVT_S_WAIT_DMA_DESC); +} + +static void iowait_sdma_drained(struct iowait *wait) +{ + struct rvt_qp *qp = iowait_to_qp(wait); + unsigned long flags; + + /* + * This happens when the send engine notes + * a QP in the error state and cannot + * do the flush work until that QP's + * sdma work has finished. 
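+ * Once the descriptors have drained, clear RVT_S_WAIT_DMA and reschedule the send engine so the flush can make progress.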
+ */ + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & RVT_S_WAIT_DMA) { + qp->s_flags &= ~RVT_S_WAIT_DMA; + hfi1_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +/** + * qp_to_sdma_engine - map a qp to a send engine + * @qp: the QP + * @sc5: the 5 bit sc + * + * Return: + * A send engine for the qp or NULL for SMI type qp. + */ +struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct sdma_engine *sde; + + if (!(dd->flags & HFI1_HAS_SEND_DMA)) + return NULL; + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + return NULL; + default: + break; + } + sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5); + return sde; +} + +/* + * qp_to_send_context - map a qp to a send context + * @qp: the QP + * @sc5: the 5 bit sc + * + * Return: + * A send context for the qp + */ +struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + /* SMA packets to VL15 */ + return dd->vld[15].sc; + default: + break; + } + + return pio_select_send_context_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, + sc5); +} + +static const char * const qp_type_str[] = { + "SMI", "GSI", "RC", "UC", "UD", +}; + +static int qp_idle(struct rvt_qp *qp) +{ + return + qp->s_last == qp->s_acked && + qp->s_acked == qp->s_cur && + qp->s_cur == qp->s_tail && + qp->s_tail == qp->s_head; +} + +/** + * qp_iter_print - print the qp information to seq_file + * @s: the seq_file to emit the qp information on + * @iter: the iterator for the qp hash list + */ +void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) +{ + struct rvt_swqe *wqe; + struct rvt_qp *qp = iter->qp; + struct hfi1_qp_priv *priv = qp->priv; + struct sdma_engine *sde; + struct send_context *send_context; + struct rvt_ack_entry *e = NULL; + struct rvt_srq *srq = qp->ibqp.srq ? + ibsrq_to_rvtsrq(qp->ibqp.srq) : NULL; + + sde = qp_to_sdma_engine(qp, priv->s_sc); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + send_context = qp_to_send_context(qp, priv->s_sc); + if (qp->s_ack_queue) + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + seq_printf(s, + "N %d %s QP %x R %u %s %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x S(%u %u %u %u %u %u %u) R(%u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d OS %x %x E %x %x %x RNR %d %s %d\n", + iter->n, + qp_idle(qp) ? "I" : "B", + qp->ibqp.qp_num, + atomic_read(&qp->refcount), + qp_type_str[qp->ibqp.qp_type], + qp->state, + wqe ? wqe->wr.opcode : 0, + qp->s_flags, + iowait_sdma_pending(&priv->s_iowait), + iowait_pio_pending(&priv->s_iowait), + !list_empty(&priv->s_iowait.list), + qp->timeout, + wqe ? wqe->ssn : 0, + qp->s_lsn, + qp->s_last_psn, + qp->s_psn, qp->s_next_psn, + qp->s_sending_psn, qp->s_sending_hpsn, + qp->r_psn, + qp->s_last, qp->s_acked, qp->s_cur, + qp->s_tail, qp->s_head, qp->s_size, + qp->s_avail, + /* ack_queue ring pointers, size */ + qp->s_tail_ack_queue, qp->r_head_ack_queue, + rvt_max_atomic(&to_idev(qp->ibqp.device)->rdi), + /* remote QP info */ + qp->remote_qpn, + rdma_ah_get_dlid(&qp->remote_ah_attr), + rdma_ah_get_sl(&qp->remote_ah_attr), + qp->pmtu, + qp->s_retry, + qp->s_retry_cnt, + qp->s_rnr_retry_cnt, + qp->s_rnr_retry, + sde, + sde ? sde->this_idx : 0, + send_context, + send_context ? 
send_context->sw_index : 0, + ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head, + ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail, + qp->pid, + qp->s_state, + qp->s_ack_state, + /* ack queue information */ + e ? e->opcode : 0, + e ? e->psn : 0, + e ? e->lpsn : 0, + qp->r_min_rnr_timer, + srq ? "SRQ" : "RQ", + srq ? srq->rq.size : qp->r_rq.size + ); +} + +void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv; + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, rdi->dparms.node); + if (!priv) + return ERR_PTR(-ENOMEM); + + priv->owner = qp; + + priv->s_ahg = kzalloc_node(sizeof(*priv->s_ahg), GFP_KERNEL, + rdi->dparms.node); + if (!priv->s_ahg) { + kfree(priv); + return ERR_PTR(-ENOMEM); + } + iowait_init( + &priv->s_iowait, + 1, + _hfi1_do_send, + iowait_sleep, + iowait_wakeup, + iowait_sdma_drained); + return priv; +} + +void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + kfree(priv->s_ahg); + kfree(priv); +} + +unsigned free_all_qps(struct rvt_dev_info *rdi) +{ + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + int n; + unsigned qp_inuse = 0; + + for (n = 0; n < dd->num_pports; n++) { + struct hfi1_ibport *ibp = &dd->pport[n].ibport_data; + + rcu_read_lock(); + if (rcu_dereference(ibp->rvp.qp[0])) + qp_inuse++; + if (rcu_dereference(ibp->rvp.qp[1])) + qp_inuse++; + rcu_read_unlock(); + } + + return qp_inuse; +} + +void flush_qp_waiters(struct rvt_qp *qp) +{ + lockdep_assert_held(&qp->s_lock); + flush_iowait(qp); +} + +void stop_send_queue(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + cancel_work_sync(&priv->s_iowait.iowork); +} + +void quiesce_qp(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + iowait_sdma_drain(&priv->s_iowait); + qp_pio_drain(qp); + flush_tx_list(qp); +} + +void notify_qp_reset(struct rvt_qp *qp) +{ + qp->r_adefered = 0; + clear_ahg(qp); +} + +/* + * Switch to alternate path. + * The QP s_lock should be held and interrupts disabled. 
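+ * The alternate path attributes become the primary ones, the SC/SDMA engine selections are recomputed, and an IB_EVENT_PATH_MIG event is raised.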
+ */ +void hfi1_migrate_qp(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_event ev; + + qp->s_mig_state = IB_MIG_MIGRATED; + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = rdma_ah_get_port_num(&qp->alt_ah_attr); + qp->s_pkey_index = qp->s_alt_pkey_index; + qp->s_flags |= RVT_S_AHG_CLEAR; + priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + qp_set_16b(qp); + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); +} + +int mtu_to_path_mtu(u32 mtu) +{ + return mtu_to_enum(mtu, OPA_MTU_8192); +} + +u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu) +{ + u32 mtu; + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + struct hfi1_ibport *ibp; + u8 sc, vl; + + ibp = &dd->pport[qp->port_num - 1].ibport_data; + sc = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; + vl = sc_to_vlt(dd, sc); + + mtu = verbs_mtu_enum_to_int(qp->ibqp.device, pmtu); + if (vl < PER_VL_SEND_CONTEXTS) + mtu = min_t(u32, mtu, dd->vld[vl].mtu); + return mtu; +} + +int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr) +{ + int mtu, pidx = qp->port_num - 1; + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + mtu = verbs_mtu_enum_to_int(qp->ibqp.device, attr->path_mtu); + if (mtu == -1) + return -1; /* values less than 0 are error */ + + if (mtu > dd->pport[pidx].ibmtu) + return mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048); + else + return attr->path_mtu; +} + +void notify_error_qp(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + seqlock_t *lock = priv->s_iowait.lock; + + if (lock) { + write_seqlock(lock); + if (!list_empty(&priv->s_iowait.list) && + !(qp->s_flags & RVT_S_BUSY)) { + qp->s_flags &= ~RVT_S_ANY_WAIT_IO; + list_del_init(&priv->s_iowait.list); + priv->s_iowait.lock = NULL; + rvt_put_qp(qp); + } + write_sequnlock(lock); + } + + if (!(qp->s_flags & RVT_S_BUSY)) { + if (qp->s_rdma_mr) { + rvt_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + flush_tx_list(qp); + } +} + +/** + * hfi1_qp_iter_cb - callback for iterator + * @qp - the qp + * @v - the sl in low bits of v + * + * This is called from the iterator callback to work + * on an individual qp. + */ +static void hfi1_qp_iter_cb(struct rvt_qp *qp, u64 v) +{ + int lastwqe; + struct ib_event ev; + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u8 sl = (u8)v; + + if (qp->port_num != ppd->port || + (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) || + rdma_ah_get_sl(&qp->remote_ah_attr) != sl || + !(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK)) + return; + + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + if (lastwqe) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +/** + * hfi1_error_port_qps - put a port's RC/UC qps into error state + * @ibp: the ibport. + * @sl: the service level. 
+ * + * This function places all RC/UC qps with a given service level into error + * state. It is generally called to force upper lay apps to abandon stale qps + * after an sl->sc mapping change. + */ +void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_ibdev *dev = &ppd->dd->verbs_dev; + + rvt_qp_iter(&dev->rdi, sl, hfi1_qp_iter_cb); +} diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h new file mode 100644 index 0000000..b2d4cba --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -0,0 +1,133 @@ +#ifndef _QP_H +#define _QP_H +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include "verbs.h" +#include "sdma.h" +#include "verbs_txreq.h" + +extern unsigned int hfi1_qp_table_size; + +extern const struct rvt_operation_params hfi1_post_parms[]; + +/* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. 
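+ * That is: RVT_S_BUSY and RVT_S_ANY_WAIT_IO must be clear, and either a txreq is already queued, a response is pending, or no RVT_S_ANY_WAIT_SEND flag is set.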
+ */ +static inline int hfi1_send_ok(struct rvt_qp *qp) +{ + return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) && + (verbs_txreq_queued(qp) || + (qp->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); +} + +/* + * free_ahg - clear ahg from QP + */ +static inline void clear_ahg(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + priv->s_ahg->ahgcount = 0; + qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR); + if (priv->s_sde && qp->s_ahgidx >= 0) + sdma_ahg_free(priv->s_sde, qp->s_ahgidx); + qp->s_ahgidx = -1; +} + +/** + * hfi1_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: user data for libibverbs.so + * + * Returns the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); + +/** + * hfi1_qp_wakeup - wake up on the indicated event + * @qp: the QP + * @flag: flag the qp on which the qp is stalled + */ +void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag); + +struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5); +struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5); + +void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter); + +void _hfi1_schedule_send(struct rvt_qp *qp); +void hfi1_schedule_send(struct rvt_qp *qp); + +void hfi1_migrate_qp(struct rvt_qp *qp); + +/* + * Functions provided by hfi1 driver for rdmavt to use + */ +void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp); +void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); +unsigned free_all_qps(struct rvt_dev_info *rdi); +void notify_qp_reset(struct rvt_qp *qp); +int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr); +void flush_qp_waiters(struct rvt_qp *qp); +void notify_error_qp(struct rvt_qp *qp); +void stop_send_queue(struct rvt_qp *qp); +void quiesce_qp(struct rvt_qp *qp); +u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu); +int mtu_to_path_mtu(u32 mtu); +void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl); +#endif /* _QP_H */ diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c new file mode 100644 index 0000000..b596699 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qsfp.c @@ -0,0 +1,858 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +#include "hfi.h" + +/* for the given bus number, return the CSR for reading an i2c line */ +static inline u32 i2c_in_csr(u32 bus_num) +{ + return bus_num ? ASIC_QSFP2_IN : ASIC_QSFP1_IN; +} + +/* for the given bus number, return the CSR for writing an i2c line */ +static inline u32 i2c_oe_csr(u32 bus_num) +{ + return bus_num ? ASIC_QSFP2_OE : ASIC_QSFP1_OE; +} + +static void hfi1_setsda(void *data, int state) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + struct hfi1_devdata *dd = bus->controlling_dd; + u64 reg; + u32 target_oe; + + target_oe = i2c_oe_csr(bus->num); + reg = read_csr(dd, target_oe); + /* + * The OE bit value is inverted and connected to the pin. When + * OE is 0 the pin is left to be pulled up, when the OE is 1 + * the pin is driven low. This matches the "open drain" or "open + * collector" convention. + */ + if (state) + reg &= ~QSFP_HFI0_I2CDAT; + else + reg |= QSFP_HFI0_I2CDAT; + write_csr(dd, target_oe, reg); + /* do a read to force the write into the chip */ + (void)read_csr(dd, target_oe); +} + +static void hfi1_setscl(void *data, int state) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + struct hfi1_devdata *dd = bus->controlling_dd; + u64 reg; + u32 target_oe; + + target_oe = i2c_oe_csr(bus->num); + reg = read_csr(dd, target_oe); + /* + * The OE bit value is inverted and connected to the pin. When + * OE is 0 the pin is left to be pulled up, when the OE is 1 + * the pin is driven low. This matches the "open drain" or "open + * collector" convention. 
+ */ + if (state) + reg &= ~QSFP_HFI0_I2CCLK; + else + reg |= QSFP_HFI0_I2CCLK; + write_csr(dd, target_oe, reg); + /* do a read to force the write into the chip */ + (void)read_csr(dd, target_oe); +} + +static int hfi1_getsda(void *data) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + u64 reg; + u32 target_in; + + hfi1_setsda(data, 1); /* clear OE so we do not pull line down */ + udelay(2); /* 1us pull up + 250ns hold */ + + target_in = i2c_in_csr(bus->num); + reg = read_csr(bus->controlling_dd, target_in); + return !!(reg & QSFP_HFI0_I2CDAT); +} + +static int hfi1_getscl(void *data) +{ + struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data; + u64 reg; + u32 target_in; + + hfi1_setscl(data, 1); /* clear OE so we do not pull line down */ + udelay(2); /* 1us pull up + 250ns hold */ + + target_in = i2c_in_csr(bus->num); + reg = read_csr(bus->controlling_dd, target_in); + return !!(reg & QSFP_HFI0_I2CCLK); +} + +/* + * Allocate and initialize the given i2c bus number. + * Returns NULL on failure. + */ +static struct hfi1_i2c_bus *init_i2c_bus(struct hfi1_devdata *dd, + struct hfi1_asic_data *ad, int num) +{ + struct hfi1_i2c_bus *bus; + int ret; + + bus = kzalloc(sizeof(*bus), GFP_KERNEL); + if (!bus) + return NULL; + + bus->controlling_dd = dd; + bus->num = num; /* our bus number */ + + bus->algo.setsda = hfi1_setsda; + bus->algo.setscl = hfi1_setscl; + bus->algo.getsda = hfi1_getsda; + bus->algo.getscl = hfi1_getscl; + bus->algo.udelay = 5; + bus->algo.timeout = usecs_to_jiffies(100000); + bus->algo.data = bus; + + bus->adapter.owner = THIS_MODULE; + bus->adapter.algo_data = &bus->algo; + bus->adapter.dev.parent = &dd->pcidev->dev; + snprintf(bus->adapter.name, sizeof(bus->adapter.name), + "hfi1_i2c%d", num); + + ret = i2c_bit_add_bus(&bus->adapter); + if (ret) { + dd_dev_info(dd, "%s: unable to add i2c bus %d, err %d\n", + __func__, num, ret); + kfree(bus); + return NULL; + } + + return bus; +} + +/* + * Initialize i2c buses. + * Return 0 on success, -errno on error. 
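+ * Both buses are bit-banged through the ASIC_QSFP1/ASIC_QSFP2 OE and IN CSRs via the kernel i2c-algo-bit layer (see init_i2c_bus() above).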
+ */ +int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad) +{ + ad->i2c_bus0 = init_i2c_bus(dd, ad, 0); + ad->i2c_bus1 = init_i2c_bus(dd, ad, 1); + if (!ad->i2c_bus0 || !ad->i2c_bus1) + return -ENOMEM; + return 0; +}; + +static void clean_i2c_bus(struct hfi1_i2c_bus *bus) +{ + if (bus) { + i2c_del_adapter(&bus->adapter); + kfree(bus); + } +} + +void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad) +{ + if (!ad) + return; + clean_i2c_bus(ad->i2c_bus0); + ad->i2c_bus0 = NULL; + clean_i2c_bus(ad->i2c_bus1); + ad->i2c_bus1 = NULL; +} + +static int i2c_bus_write(struct hfi1_devdata *dd, struct hfi1_i2c_bus *i2c, + u8 slave_addr, int offset, int offset_size, + u8 *data, u16 len) +{ + int ret; + int num_msgs; + u8 offset_bytes[2]; + struct i2c_msg msgs[2]; + + switch (offset_size) { + case 0: + num_msgs = 1; + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = len; + msgs[0].buf = data; + break; + case 2: + offset_bytes[1] = (offset >> 8) & 0xff; + /* fall through */ + case 1: + num_msgs = 2; + offset_bytes[0] = offset & 0xff; + + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = offset_size; + msgs[0].buf = offset_bytes; + + msgs[1].addr = slave_addr; + msgs[1].flags = I2C_M_NOSTART, + msgs[1].len = len; + msgs[1].buf = data; + break; + default: + return -EINVAL; + } + + i2c->controlling_dd = dd; + ret = i2c_transfer(&i2c->adapter, msgs, num_msgs); + if (ret != num_msgs) { + dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; write failed, ret %d\n", + __func__, i2c->num, slave_addr, offset, len, ret); + return ret < 0 ? ret : -EIO; + } + return 0; +} + +static int i2c_bus_read(struct hfi1_devdata *dd, struct hfi1_i2c_bus *bus, + u8 slave_addr, int offset, int offset_size, + u8 *data, u16 len) +{ + int ret; + int num_msgs; + u8 offset_bytes[2]; + struct i2c_msg msgs[2]; + + switch (offset_size) { + case 0: + num_msgs = 1; + msgs[0].addr = slave_addr; + msgs[0].flags = I2C_M_RD; + msgs[0].len = len; + msgs[0].buf = data; + break; + case 2: + offset_bytes[1] = (offset >> 8) & 0xff; + /* fall through */ + case 1: + num_msgs = 2; + offset_bytes[0] = offset & 0xff; + + msgs[0].addr = slave_addr; + msgs[0].flags = 0; + msgs[0].len = offset_size; + msgs[0].buf = offset_bytes; + + msgs[1].addr = slave_addr; + msgs[1].flags = I2C_M_RD, + msgs[1].len = len; + msgs[1].buf = data; + break; + default: + return -EINVAL; + } + + bus->controlling_dd = dd; + ret = i2c_transfer(&bus->adapter, msgs, num_msgs); + if (ret != num_msgs) { + dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; read failed, ret %d\n", + __func__, bus->num, slave_addr, offset, len, ret); + return ret < 0 ? ret : -EIO; + } + return 0; +} + +/* + * Raw i2c write. No set-up or lock checking. + * + * Return 0 on success, -errno on error. + */ +static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_i2c_bus *bus; + u8 slave_addr; + int offset_size; + + bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0; + slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */ + offset_size = (i2c_addr >> 8) & 0x3; + return i2c_bus_write(dd, bus, slave_addr, offset, offset_size, bp, len); +} + +/* + * Caller must hold the i2c chain resource. + * + * Return number of bytes written, or -errno. 
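+ * The low byte of i2c_addr carries the 8-bit slave address (converted to 7-bit internally) and bits 8-9 carry the register offset size in bytes (0, 1 or 2).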
+ */ +int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, + void *bp, int len) +{ + int ret; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len); + if (ret) + return ret; + + return len; +} + +/* + * Raw i2c read. No set-up or lock checking. + * + * Return 0 on success, -errno on error. + */ +static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_i2c_bus *bus; + u8 slave_addr; + int offset_size; + + bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0; + slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */ + offset_size = (i2c_addr >> 8) & 0x3; + return i2c_bus_read(dd, bus, slave_addr, offset, offset_size, bp, len); +} + +/* + * Caller must hold the i2c chain resource. + * + * Return number of bytes read, or -errno. + */ +int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, + void *bp, int len) +{ + int ret; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len); + if (ret) + return ret; + + return len; +} + +/* + * Write page n, offset m of QSFP memory as defined by SFF 8636 + * by writing @addr = ((256 * n) + m) + * + * Caller must hold the i2c chain resource. + * + * Return number of bytes written or -errno. + */ +int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + int count = 0; + int offset; + int nwrite; + int ret = 0; + u8 page; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + while (count < len) { + /* + * Set the qsfp page based on a zero-based address + * and a page size of QSFP_PAGESIZE bytes. + */ + page = (u8)(addr / QSFP_PAGESIZE); + + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", + target, ret); + break; + } + + offset = addr % QSFP_PAGESIZE; + nwrite = len - count; + /* truncate write to boundary if crossing boundary */ + if (((addr % QSFP_RW_BOUNDARY) + nwrite) > QSFP_RW_BOUNDARY) + nwrite = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY); + + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + offset, bp + count, nwrite); + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) /* stop on error */ + break; + + count += nwrite; + addr += nwrite; + } + + if (ret < 0) + return ret; + return count; +} + +/* + * Perform a stand-alone single QSFP write. Acquire the resource, do the + * write, then release the resource. + */ +int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 resource = qsfp_resource(dd); + int ret; + + ret = acquire_chip_resource(dd, resource, QSFP_WAIT); + if (ret) + return ret; + ret = qsfp_write(ppd, target, addr, bp, len); + release_chip_resource(dd, resource); + + return ret; +} + +/* + * Access page n, offset m of QSFP memory as defined by SFF 8636 + * by reading @addr = ((256 * n) + m) + * + * Caller must hold the i2c chain resource. + * + * Return the number of bytes read or -errno. 
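+ * + * For example, the first byte of upper page 01h (byte 128) is read at + * @addr = ((256 * 1) + 128) = 384.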
+ */ +int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + int count = 0; + int offset; + int nread; + int ret = 0; + u8 page; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + while (count < len) { + /* + * Set the qsfp page based on a zero-based address + * and a page size of QSFP_PAGESIZE bytes. + */ + page = (u8)(addr / QSFP_PAGESIZE); + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", + target, ret); + break; + } + + offset = addr % QSFP_PAGESIZE; + nread = len - count; + /* truncate read to boundary if crossing boundary */ + if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY) + nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY); + + ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + offset, bp + count, nread); + if (ret) /* stop on error */ + break; + + count += nread; + addr += nread; + } + + if (ret < 0) + return ret; + return count; +} + +/* + * Perform a stand-alone single QSFP read. Acquire the resource, do the + * read, then release the resource. + */ +int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 resource = qsfp_resource(dd); + int ret; + + ret = acquire_chip_resource(dd, resource, QSFP_WAIT); + if (ret) + return ret; + ret = qsfp_read(ppd, target, addr, bp, len); + release_chip_resource(dd, resource); + + return ret; +} + +/* + * This function caches the QSFP memory range in 128 byte chunks. + * As an example, the next byte after address 255 is byte 128 from + * upper page 01H (if existing) rather than byte 0 from lower page 00H. + * Access page n, offset m of QSFP memory as defined by SFF 8636 + * in the cache by reading byte ((128 * n) + m) + * The calls to qsfp_{read,write} in this function correctly handle the + * address map difference between this mapping and the mapping implemented + * by those functions + * + * The caller must be holding the QSFP i2c chain resource. + */ +int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) +{ + u32 target = ppd->dd->hfi1_id; + int ret; + unsigned long flags; + u8 *cache = &cp->cache[0]; + + /* ensure sane contents on invalid reads, for cable swaps */ + memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128)); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + + if (!qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto bail; + } + + ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE); + if (ret != QSFP_PAGESIZE) { + dd_dev_info(ppd->dd, + "%s: Page 0 read failed, expected %d, got %d\n", + __func__, QSFP_PAGESIZE, ret); + goto bail; + } + + /* Is paging enabled? 
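+ * Byte 2 bit 2 is the SFF 8636 flat-memory flag: when it is clear the + * module implements upper pages, which are cached below per byte 195.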
*/ + if (!(cache[2] & 4)) { + /* Paging enabled, page 03 required */ + if ((cache[195] & 0xC0) == 0xC0) { + /* all */ + ret = qsfp_read(ppd, target, 384, cache + 256, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 640, cache + 384, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } else if ((cache[195] & 0x80) == 0x80) { + /* only page 2 and 3 */ + ret = qsfp_read(ppd, target, 640, cache + 384, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } else if ((cache[195] & 0x40) == 0x40) { + /* only page 1 and 3 */ + ret = qsfp_read(ppd, target, 384, cache + 256, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } else { + /* only page 3 */ + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } + } + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 1; + ppd->qsfp_info.cache_refresh_required = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + + return 0; + +bail: + memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128)); + return ret; +} + +const char * const hfi1_qsfp_devtech[16] = { + "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP", + "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML", + "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq", + "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq" +}; + +#define QSFP_DUMP_CHUNK 16 /* Holds longest string */ +#define QSFP_DEFAULT_HDR_CNT 224 + +#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) +#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3) +/* For use with QSFP_HIGH_PWR macro */ +#define QSFP_HIGH_PWR_UNUSED 0 /* Bits [1:0] = 00 implies low power module */ + +/* + * Takes power class byte [Page 00 Byte 129] in SFF 8636 + * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4) + */ +int get_qsfp_power_class(u8 power_byte) +{ + if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED) + /* power classes count from 1, their bit encodings from 0 */ + return (QSFP_PWR(power_byte) + 1); + /* + * 00 in the high power classes stands for unused, bringing + * balance to the off-by-1 offset above, we add 4 here to + * account for the difference between the low and high power + * groups + */ + return (QSFP_HIGH_PWR(power_byte) + 4); +} + +int qsfp_mod_present(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + + reg = read_csr(dd, dd->hfi1_id ? 
ASIC_QSFP2_IN : ASIC_QSFP1_IN); + return !(reg & QSFP_HFI0_MODPRST_N); +} + +/* + * This function maps QSFP memory addresses in 128 byte chunks in the following + * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1 + * spec + * For addr 000-127, lower page 00h + * For addr 128-255, upper page 00h + * For addr 256-383, upper page 01h + * For addr 384-511, upper page 02h + * For addr 512-639, upper page 03h + * + * For addresses beyond this range, it returns the invalid range of data buffer + * set to 0. + * For upper pages that are optional, if they are not valid, returns the + * particular range of bytes in the data buffer set to 0. + */ +int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len, + u8 *data) +{ + struct hfi1_pportdata *ppd; + u32 excess_len = len; + int ret = 0, offset = 0; + + if (port_num > dd->num_pports || port_num < 1) { + dd_dev_info(dd, "%s: Invalid port number %d\n", + __func__, port_num); + ret = -EINVAL; + goto set_zeroes; + } + + ppd = dd->pport + (port_num - 1); + if (!qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto set_zeroes; + } + + if (!ppd->qsfp_info.cache_valid) { + ret = -EINVAL; + goto set_zeroes; + } + + if (addr >= (QSFP_MAX_NUM_PAGES * 128)) { + ret = -ERANGE; + goto set_zeroes; + } + + if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) { + excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128); + memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len)); + data += (len - excess_len); + goto set_zeroes; + } + + memcpy(data, &ppd->qsfp_info.cache[addr], len); + + if (addr <= QSFP_MONITOR_VAL_END && + (addr + len) >= QSFP_MONITOR_VAL_START) { + /* Overlap with the dynamic channel monitor range */ + if (addr < QSFP_MONITOR_VAL_START) { + if (addr + len <= QSFP_MONITOR_VAL_END) + len = addr + len - QSFP_MONITOR_VAL_START; + else + len = QSFP_MONITOR_RANGE; + offset = QSFP_MONITOR_VAL_START - addr; + addr = QSFP_MONITOR_VAL_START; + } else if (addr == QSFP_MONITOR_VAL_START) { + offset = 0; + if (addr + len > QSFP_MONITOR_VAL_END) + len = QSFP_MONITOR_RANGE; + } else { + offset = 0; + if (addr + len > QSFP_MONITOR_VAL_END) + len = QSFP_MONITOR_VAL_END - addr + 1; + } + /* Refresh the values of the dynamic monitors from the cable */ + ret = one_qsfp_read(ppd, dd->hfi1_id, addr, data + offset, len); + if (ret != len) { + ret = -EAGAIN; + goto set_zeroes; + } + } + + return 0; + +set_zeroes: + memset(data, 0, excess_len); + return ret; +} + +static const char *pwr_codes[8] = {"N/AW", + "1.5W", + "2.0W", + "2.5W", + "3.5W", + "4.0W", + "4.5W", + "5.0W" + }; + +int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) +{ + u8 *cache = &ppd->qsfp_info.cache[0]; + u8 bin_buff[QSFP_DUMP_CHUNK]; + char lenstr[6]; + int sofar; + int bidx = 0; + u8 *atten = &cache[QSFP_ATTEN_OFFS]; + u8 *vendor_oui = &cache[QSFP_VOUI_OFFS]; + u8 power_byte = 0; + + sofar = 0; + lenstr[0] = ' '; + lenstr[1] = '\0'; + + if (ppd->qsfp_info.cache_valid) { + if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) + snprintf(lenstr, sizeof(lenstr), "%dM ", + cache[QSFP_MOD_LEN_OFFS]); + + power_byte = cache[QSFP_MOD_PWR_OFFS]; + sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", + pwr_codes[get_qsfp_power_class(power_byte)]); + + sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", + lenstr, + hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]); + + sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n", + QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n", + 
QSFP_OUI(vendor_oui)); + + sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n", + QSFP_PN_LEN, &cache[QSFP_PN_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n", + QSFP_REV_LEN, &cache[QSFP_REV_OFFS]); + + if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) + sofar += scnprintf(buf + sofar, len - sofar, + "Atten:%d, %d\n", + QSFP_ATTEN_SDR(atten), + QSFP_ATTEN_DDR(atten)); + + sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n", + QSFP_SN_LEN, &cache[QSFP_SN_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n", + QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n", + QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]); + + while (bidx < QSFP_DEFAULT_HDR_CNT) { + int iidx; + + memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK); + for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) { + sofar += scnprintf(buf + sofar, len - sofar, + " %02X", bin_buff[iidx]); + } + sofar += scnprintf(buf + sofar, len - sofar, "\n"); + bidx += QSFP_DUMP_CHUNK; + } + } + return sofar; +} diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h new file mode 100644 index 0000000..36cf523 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qsfp.h @@ -0,0 +1,246 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +/* QSFP support common definitions, for hfi driver */ + +#define QSFP_DEV 0xA0 +#define QSFP_PWR_LAG_MSEC 2000 +#define QSFP_MODPRS_LAG_MSEC 20 +/* 128 byte pages, per SFF 8636 rev 2.4 */ +#define QSFP_MAX_NUM_PAGES 5 + +/* + * Below are masks for QSFP pins. Pins are the same for HFI0 and HFI1. + * _N means asserted low + */ +#define QSFP_HFI0_I2CCLK BIT(0) +#define QSFP_HFI0_I2CDAT BIT(1) +#define QSFP_HFI0_RESET_N BIT(2) +#define QSFP_HFI0_INT_N BIT(3) +#define QSFP_HFI0_MODPRST_N BIT(4) + +/* QSFP is paged at 256 bytes */ +#define QSFP_PAGESIZE 256 +/* Reads/writes cannot cross 128 byte boundaries */ +#define QSFP_RW_BOUNDARY 128 + +/* number of bytes in i2c offset for QSFP devices */ +#define __QSFP_OFFSET_SIZE 1 /* num address bytes */ +#define QSFP_OFFSET_SIZE (__QSFP_OFFSET_SIZE << 8) /* shifted value */ + +/* Defined fields that Intel requires of qualified cables */ +/* Byte 0 is Identifier, not checked */ +/* Byte 1 is reserved "status MSB" */ +#define QSFP_MONITOR_VAL_START 22 +#define QSFP_MONITOR_VAL_END 81 +#define QSFP_MONITOR_RANGE (QSFP_MONITOR_VAL_END - QSFP_MONITOR_VAL_START + 1) +#define QSFP_TX_CTRL_BYTE_OFFS 86 +#define QSFP_PWR_CTRL_BYTE_OFFS 93 +#define QSFP_CDR_CTRL_BYTE_OFFS 98 + +#define QSFP_PAGE_SELECT_BYTE_OFFS 127 +/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */ +#define QSFP_MOD_ID_OFFS 128 +/* + * Byte 129 is "Extended Identifier". + * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W + */ +#define QSFP_MOD_PWR_OFFS 129 +/* Byte 130 is Connector type. Not Intel req'd */ +/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */ +/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */ +/* byte 140 is nominal bit-rate, in units of 100Mbits/sec */ +#define QSFP_NOM_BIT_RATE_100_OFFS 140 +/* Byte 141 is Extended Rate Select. Not Intel req'd */ +/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */ +/* Byte 146 is length for Copper. Units of 1 meter */ +#define QSFP_MOD_LEN_OFFS 146 +/* + * Byte 147 is Device technology. D0..3 not Intel req'd + * D4..7 select from 15 choices, translated by table: + */ +#define QSFP_MOD_TECH_OFFS 147 +extern const char *const hfi1_qsfp_devtech[16]; +/* Active Equalization includes fiber, copper full EQ, and copper near Eq */ +#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1) +/* Active Equalization includes fiber, copper full EQ, and copper far Eq */ +#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1) +/* Attenuation should be valid for copper other than full/near Eq */ +#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1) +/* Length is only valid if technology is "copper" */ +#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1) +#define QSFP_TECH_1490 9 + +#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \ + oui[2]) +#define QSFP_OUI_AMPHENOL 0x415048 +#define QSFP_OUI_FINISAR 0x009065 +#define QSFP_OUI_GORE 0x002177 + +/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */ +#define QSFP_VEND_OFFS 148 +#define QSFP_VEND_LEN 16 +/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */ +#define QSFP_IBXCV_OFFS 164 +/* Bytes 165..167 are Vendor OUI number */ +#define QSFP_VOUI_OFFS 165 +#define QSFP_VOUI_LEN 3 +/* Bytes 168..183 are Vendor Part Number, string */ +#define QSFP_PN_OFFS 168 +#define QSFP_PN_LEN 16 +/* Bytes 184,185 are Vendor Rev. 
Left Justified, Blank-filled */ +#define QSFP_REV_OFFS 184 +#define QSFP_REV_LEN 2 +/* + * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd + * If copper, they are attenuation in dB: + * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR) + */ +#define QSFP_ATTEN_OFFS 186 +#define QSFP_ATTEN_LEN 2 +/* + * Bytes 188,189 are Wavelength tolerance, if optical + * If copper, they are attenuation in dB: + * Byte 188 is at 12.5 Gb/s, Byte 189 at 25 Gb/s + */ +#define QSFP_CU_ATTEN_7G_OFFS 188 +#define QSFP_CU_ATTEN_12G_OFFS 189 +/* Byte 190 is Max Case Temp. Not Intel req'd */ +/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */ +#define QSFP_CC_OFFS 191 +#define QSFP_EQ_INFO_OFFS 193 +#define QSFP_CDR_INFO_OFFS 194 +/* Bytes 196..211 are Serial Number, String */ +#define QSFP_SN_OFFS 196 +#define QSFP_SN_LEN 16 +/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */ +#define QSFP_DATE_OFFS 212 +#define QSFP_DATE_LEN 6 +/* Bytes 218,219 are optional lot-code, string */ +#define QSFP_LOT_OFFS 218 +#define QSFP_LOT_LEN 2 +/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */ +/* Byte 222 indicates nominal bitrate in units of 250Mbits/sec */ +#define QSFP_NOM_BIT_RATE_250_OFFS 222 +/* Byte 223 is LSB of sum of bytes 192..222 */ +#define QSFP_CC_EXT_OFFS 223 + +/* + * Interrupt flag masks + */ +#define QSFP_DATA_NOT_READY 0x01 + +#define QSFP_HIGH_TEMP_ALARM 0x80 +#define QSFP_LOW_TEMP_ALARM 0x40 +#define QSFP_HIGH_TEMP_WARNING 0x20 +#define QSFP_LOW_TEMP_WARNING 0x10 + +#define QSFP_HIGH_VCC_ALARM 0x80 +#define QSFP_LOW_VCC_ALARM 0x40 +#define QSFP_HIGH_VCC_WARNING 0x20 +#define QSFP_LOW_VCC_WARNING 0x10 + +#define QSFP_HIGH_POWER_ALARM 0x88 +#define QSFP_LOW_POWER_ALARM 0x44 +#define QSFP_HIGH_POWER_WARNING 0x22 +#define QSFP_LOW_POWER_WARNING 0x11 + +#define QSFP_HIGH_BIAS_ALARM 0x88 +#define QSFP_LOW_BIAS_ALARM 0x44 +#define QSFP_HIGH_BIAS_WARNING 0x22 +#define QSFP_LOW_BIAS_WARNING 0x11 + +#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) +#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) + +/* + * struct qsfp_data encapsulates state of QSFP device for one port. + * it will be part of port-specific data if a board supports QSFP. + * + * Since multiple board-types use QSFP, and their pport_data structs + * differ (in the chip-specific section), we need a pointer to its head. + * + * Avoiding premature optimization, we will have one work_struct per port, + * and let the qsfp_lock arbitrate access to common resources. 
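+ * + * cache[] below holds lower page 00h, upper page 00h and upper pages + * 01h-03h in consecutive 128-byte chunks; see refresh_qsfp_cache().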
+ * + */ +struct qsfp_data { + /* Helps to find our way */ + struct hfi1_pportdata *ppd; + struct work_struct qsfp_work; + u8 cache[QSFP_MAX_NUM_PAGES * 128]; + /* protect qsfp data */ + spinlock_t qsfp_lock; + u8 check_interrupt_flags; + u8 reset_needed; + u8 limiting_active; + u8 cache_valid; + u8 cache_refresh_required; +}; + +int refresh_qsfp_cache(struct hfi1_pportdata *ppd, + struct qsfp_data *cp); +int get_qsfp_power_class(u8 power_byte); +int qsfp_mod_present(struct hfi1_pportdata *ppd); +int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, + u32 len, u8 *data); + +int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len); +int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len); +int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +struct hfi1_asic_data; +int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad); +void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c new file mode 100644 index 0000000..da58046 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -0,0 +1,2519 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +#include "hfi.h" +#include "qp.h" +#include "verbs_txreq.h" +#include "trace.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) RC_OP(x) + +static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = delta_psn(psn, wqe->psn) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ss->total_len = wqe->length; + rvt_skip_sge(ss, len, false); + return wqe->length - len; +} + +/** + * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @dev: the device for this QP + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @ps: the xmit packet state + * + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. + * Note the QP s_lock must be held. + */ +static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, + struct ib_other_headers *ohdr, + struct hfi1_pkt_state *ps) +{ + struct rvt_ack_entry *e; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + int middle = 0; + u32 pmtu = qp->pmtu; + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto bail; + + if (priv->hdr_type == HFI1_PKT_TYPE_9B) + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + else + /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ + hwords = 7; + + switch (qp->s_ack_state) { + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + /* FALLTHROUGH */ + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + /* FALLTHROUGH */ + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & RVT_S_ACK_PENDING) + goto normal; + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* + * If a RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester re-sends it. 
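+ * A non-zero length with a released (NULL) MR below marks that case.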
+ */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + ps->s_txreq->mr = e->rdma_sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + ps->s_txreq->ss = &qp->s_ack_rdma_sge; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = rvt_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else { + /* COMPARE_SWAP or FETCH_ADD */ + ps->s_txreq->ss = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = rvt_compute_aeth(qp); + ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = mask_psn(e->psn); + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + ps->s_txreq->ss = &qp->s_ack_rdma_sge; + ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); + len = qp->s_ack_rdma_sge.sge.sge_length; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + } else { + ohdr->u.aeth = rvt_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + bth2 = mask_psn(qp->s_ack_rdma_psn++); + break; + + default: +normal: + /* + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). + */ + qp->s_ack_state = OP(SEND_ONLY); + qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->ss = NULL; + if (qp->s_nak_state) + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + (qp->s_nak_state << + IB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = rvt_compute_aeth(qp); + hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = mask_psn(qp->s_ack_psn); + } + qp->s_rdma_ack_cnt++; + ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->s_cur_size = len; + ps->s_txreq->hdr_dwords = hwords; + hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps); + return 1; + +bail: + qp->s_ack_state = OP(ACKNOWLEDGE); + /* + * Ensure s_rdma_ack_cnt changes are committed prior to resetting + * RVT_S_RESP_PENDING + */ + smp_wmb(); + qp->s_flags &= ~(RVT_S_RESP_PENDING + | RVT_S_ACK_PENDING + | RVT_S_AHG_VALID); + return 0; +} + +/** + * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * + * Assumes s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_other_headers *ohdr; + struct rvt_sge_state *ss; + struct rvt_swqe *wqe; + u32 hwords; + u32 len; + u32 bth0 = 0; + u32 bth2; + u32 pmtu = qp->pmtu; + char newreq; + int middle = 0; + int delta; + + lockdep_assert_held(&qp->s_lock); + ps->s_txreq = get_txreq(ps->dev, qp); + if (IS_ERR(ps->s_txreq)) + goto bail_no_tx; + + if (priv->hdr_type == HFI1_PKT_TYPE_9B) { + /* header size in 32-bit words LRH+BTH = (8+12)/4. 
*/ + hwords = 5; + if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } else { + /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ + hwords = 7; + if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr)))) + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + } + + /* Sending responses has higher priority over sending requests. */ + if ((qp->s_flags & RVT_S_RESP_PENDING) && + make_rc_ack(dev, qp, ohdr, ps)) + return 1; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + clear_ahg(qp); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + /* will get called again */ + goto done_free_tx; + } + + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) + goto bail; + + if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { + if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { + qp->s_flags |= RVT_S_WAIT_PSN; + goto bail; + } + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + } + + /* Send a request. */ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) + goto bail; + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == READ_ONCE(qp->s_head)) { + clear_ahg(qp); + goto bail; + } + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_FENCE; + goto bail; + } + /* + * Local operations are processed immediately + * after all prior requests have completed + */ + if (wqe->wr.opcode == IB_WR_REG_MR || + wqe->wr.opcode == IB_WR_LOCAL_INV) { + int local_ops = 0; + int err = 0; + + if (qp->s_last != qp->s_cur) + goto bail; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + if (++qp->s_tail == qp->s_size) + qp->s_tail = 0; + if (!(wqe->wr.send_flags & + RVT_SEND_COMPLETION_ONLY)) { + err = rvt_invalidate_rkey( + qp, + wqe->wr.ex.invalidate_rkey); + local_ops = 1; + } + hfi1_send_complete(qp, wqe, + err ? IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); + if (local_ops) + atomic_dec(&qp->local_ops_pending); + goto done_free_tx; + } + + newreq = 1; + qp->s_psn = wqe->psn; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + len = wqe->length; + ss = &qp->s_sge; + bth2 = mask_psn(qp->s_psn); + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + /* If no credit, return. 
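+ * Credit is tracked by comparing the WQE's SSN against the s_lsn limit.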
*/ + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && + rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; + goto bail; + } + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_ONLY); + } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } else { + qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE); + /* Invalidate rkey comes after the BTH */ + ohdr->u.ieth = cpu_to_be32( + wqe->wr.ex.invalidate_rkey); + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + goto no_flow_control; + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. */ + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && + rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; + goto bail; + } +no_flow_control: + put_ib_reth_vaddr( + wqe->rdma_wr.remote_addr, + &ohdr->u.rc.reth); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / sizeof(u32); + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_ONLY); + } else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + put_ib_reth_vaddr( + wqe->rdma_wr.remote_addr, + &ohdr->u.rc.reth); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + /* + * Don't allow more operations to be started + * than the QP limits allow. 
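+ * (at most s_max_rd_atomic outstanding RDMA reads/atomics).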
+ */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + put_ib_ateth_swap(wqe->atomic_wr.swap, + &ohdr->u.atomic_eth); + put_ib_ateth_compare(wqe->atomic_wr.compare_add, + &ohdr->u.atomic_eth); + } else { + qp->s_state = OP(FETCH_ADD); + put_ib_ateth_swap(wqe->atomic_wr.compare_add, + &ohdr->u.atomic_eth); + put_ib_ateth_compare(0, &ohdr->u.atomic_eth); + } + put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr, + &ohdr->u.atomic_eth); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->atomic_wr.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_psn = wqe->lpsn + 1; + else + qp->s_psn++; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_FIRST is used by the ACK processing + * thread to indicate a SEND needs to be restarted from an + * earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = mask_psn(qp->s_psn++); + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_LAST); + } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } else { + qp->s_state = OP(SEND_LAST_WITH_INVALIDATE); + /* invalidate data comes after the BTH */ + ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey); + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_LAST is used by the ACK processing + * thread to indicate a RDMA write needs to be restarted from + * an earlier PSN without interfering with the sending thread. + * See restart_rc(). 
+ */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = mask_psn(qp->s_psn++); + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_LAST); + } else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing + * thread to indicate a RDMA read needs to be restarted from + * an earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu; + put_ib_reth_vaddr( + wqe->rdma_wr.remote_addr + len, + &ohdr->u.rc.reth); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK; + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_sending_hpsn = bth2; + delta = delta_psn(bth2, wqe->psn); + if (delta && delta % HFI1_PSN_CREDIT == 0) + bth2 |= IB_BTH_REQ_ACK; + if (qp->s_flags & RVT_S_SEND_ONE) { + qp->s_flags &= ~RVT_S_SEND_ONE; + qp->s_flags |= RVT_S_WAIT_ACK; + bth2 |= IB_BTH_REQ_ACK; + } + qp->s_len -= len; + ps->s_txreq->hdr_dwords = hwords; + ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->ss = ss; + ps->s_txreq->s_cur_size = len; + hfi1_make_ruc_header( + qp, + ohdr, + bth0 | (qp->s_state << 24), + bth2, + middle, + ps); + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + return 0; +} + +static inline void hfi1_make_bth_aeth(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth1) +{ + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + (qp->r_nak_state << + IB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = rvt_compute_aeth(qp); + + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn)); +} + +static inline void hfi1_queue_rc_ack(struct hfi1_packet *packet, bool is_fecn) +{ + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto unlock; + ibp = rcd_to_iport(packet->rcd); + this_cpu_inc(*ibp->rvp.rc_qacks); + qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + if (is_fecn) + qp->s_flags |= RVT_S_ECN; + + /* Schedule the send tasklet. 
*/ + hfi1_schedule_send(qp); +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet, + struct hfi1_opa_header *opa_hdr, + u8 sc5, bool is_fecn, + u64 *pbc_flags, u32 *hwords, + u32 *nwords) +{ + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct ib_header *hdr = &opa_hdr->ibh; + struct ib_other_headers *ohdr; + u16 lrh0 = HFI1_LRH_BTH; + u16 pkey; + u32 bth0, bth1; + + opa_hdr->hdr_type = HFI1_PKT_TYPE_9B; + ohdr = &hdr->u.oth; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */ + *hwords = 6; + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { + *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh, + rdma_ah_read_grh(&qp->remote_ah_attr), + *hwords - 2, SIZE_OF_CRC); + ohdr = &hdr->u.l.oth; + lrh0 = HFI1_LRH_GRH; + } + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + *pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT); + + /* read pkey_index w/o lock (its atomic) */ + pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + + lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT | + (rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) << + IB_SL_SHIFT; + + hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC, + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B), + ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr)); + + bth0 = pkey | (OP(ACKNOWLEDGE) << 24); + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + bth1 = (!!is_fecn) << IB_BECN_SHIFT; + hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); +} + +static inline void hfi1_make_rc_ack_16B(struct hfi1_packet *packet, + struct hfi1_opa_header *opa_hdr, + u8 sc5, bool is_fecn, + u64 *pbc_flags, u32 *hwords, + u32 *nwords) +{ + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_16b_header *hdr = &opa_hdr->opah; + struct ib_other_headers *ohdr; + u32 bth0, bth1 = 0; + u16 len, pkey; + bool becn = is_fecn; + u8 l4 = OPA_16B_L4_IB_LOCAL; + u8 extra_bytes; + + opa_hdr->hdr_type = HFI1_PKT_TYPE_16B; + ohdr = &hdr->u.oth; + /* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */ + *hwords = 8; + extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0); + *nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2); + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) { + *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh, + rdma_ah_read_grh(&qp->remote_ah_attr), + *hwords - 4, *nwords); + ohdr = &hdr->u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + } + *pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + + /* read pkey_index w/o lock (its atomic) */ + pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + + /* Convert dwords to flits */ + len = (*hwords + *nwords) >> 1; + + hfi1_make_16b_hdr(hdr, ppd->lid | + (rdma_ah_get_path_bits(&qp->remote_ah_attr) & + ((1 << ppd->lmc) - 1)), + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), + 16B), len, pkey, becn, 0, l4, sc5); + + bth0 = pkey | (OP(ACKNOWLEDGE) << 24); + bth0 |= extra_bytes << 20; + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth1 = OPA_BTH_MIG_REQ; + hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); +} + +typedef void (*hfi1_make_rc_ack)(struct hfi1_packet *packet, + struct hfi1_opa_header *opa_hdr, + u8 sc5, bool is_fecn, + u64 *pbc_flags, u32 *hwords, + u32 *nwords); + +/* We support only two types - 9B and 16B for now */ +static const 
hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B +}; + +/** + * hfi1_send_rc_ack - Construct an ACK packet and send it + * @qp: a pointer to the QP + * + * This is called from hfi1_rc_rcv() and handle_receive_interrupt(). + * Note that RDMA reads and atomics are handled in the + * send side QP state and send engine. + */ +void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; + u64 pbc, pbc_flags = 0; + u32 hwords = 0; + u32 nwords = 0; + u32 plen; + struct pio_buf *pbuf; + struct hfi1_opa_header opa_hdr; + + /* clear the defer count */ + qp->r_adefered = 0; + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if (qp->s_flags & RVT_S_RESP_PENDING) { + hfi1_queue_rc_ack(packet, is_fecn); + return; + } + + /* Ensure s_rdma_ack_cnt changes are committed */ + if (qp->s_rdma_ack_cnt) { + hfi1_queue_rc_ack(packet, is_fecn); + return; + } + + /* Don't try to send ACKs if the link isn't ACTIVE */ + if (driver_lstate(ppd) != IB_PORT_ACTIVE) + return; + + /* Make the appropriate header */ + hfi1_make_rc_ack_tbl[priv->hdr_type](packet, &opa_hdr, sc5, is_fecn, + &pbc_flags, &hwords, &nwords); + + plen = 2 /* PBC */ + hwords + nwords; + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, + sc_to_vlt(ppd->dd, sc5), plen); + pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL); + if (!pbuf) { + /* + * We have no room to send at the moment. Pass + * responsibility for sending the ACK to the send engine + * so that when enough buffer space becomes available, + * the ACK is sent ahead of other outgoing packets. + */ + hfi1_queue_rc_ack(packet, is_fecn); + return; + } + trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &opa_hdr, ib_is_sc5(sc5)); + + /* write the pbc and data */ + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + (priv->hdr_type == HFI1_PKT_TYPE_9B ? + (void *)&opa_hdr.ibh : + (void *)&opa_hdr.opah), hwords); + return; +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from hfi1_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct rvt_qp *qp, u32 psn) +{ + u32 n = qp->s_acked; + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); + u32 opcode; + + lockdep_assert_held(&qp->s_lock); + qp->s_cur = n; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (cmp_psn(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + + /* Find the work request opcode corresponding to the given PSN. */ + opcode = wqe->wr.opcode; + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = rvt_get_swqe_ptr(qp, n); + diff = cmp_psn(psn, wqe->psn); + if (diff < 0) + break; + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + opcode = wqe->wr.opcode; + } + + /* + * Set the state to restart in the middle of a request. 
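+ * The RDMA_READ_RESPONSE_* states set below double as restart markers.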
+ * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See hfi1_make_rc_req(). + */ + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +done: + qp->s_psn = psn; + /* + * Set RVT_S_WAIT_PSN as rc_complete() may start the timer + * asynchronously before the send engine can get scheduled. + * Doing it in hfi1_make_rc_req() is too late. + */ + if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) && + (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) + qp->s_flags |= RVT_S_WAIT_PSN; + qp->s_flags &= ~RVT_S_AHG_VALID; +} + +/* + * Back up requester to resend the last un-ACKed request. + * The QP r_lock and s_lock should be held and interrupts disabled. + */ +void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) +{ + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + struct hfi1_ibport *ibp; + + lockdep_assert_held(&qp->r_lock); + lockdep_assert_held(&qp->s_lock); + if (qp->s_retry == 0) { + if (qp->s_mig_state == IB_MIG_ARMED) { + hfi1_migrate_qp(qp); + qp->s_retry = qp->s_retry_cnt; + } else if (qp->s_last == qp->s_acked) { + hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + return; + } else { /* need to handle delayed completion */ + return; + } + } else { + qp->s_retry--; + } + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_RDMA_READ) + ibp->rvp.n_rc_resends++; + else + ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); + + qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | + RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | + RVT_S_WAIT_ACK); + if (wait) + qp->s_flags |= RVT_S_SEND_ONE; + reset_psn(qp, psn); +} + +/* + * Set qp->s_sending_psn to the next PSN after the given one. + * This would be psn+1 except when RDMA reads are present. + */ +static void reset_sending_psn(struct rvt_qp *qp, u32 psn) +{ + struct rvt_swqe *wqe; + u32 n = qp->s_last; + + lockdep_assert_held(&qp->s_lock); + /* Find the work request corresponding to the given PSN. */ + for (;;) { + wqe = rvt_get_swqe_ptr(qp, n); + if (cmp_psn(psn, wqe->lpsn) <= 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_sending_psn = wqe->lpsn + 1; + else + qp->s_sending_psn = psn + 1; + break; + } + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + } +} + +/* + * This should be called with the QP s_lock held and interrupts disabled. 
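+ * + * It advances s_sending_psn for the sent packet, arms the retry timer + * while ACKs are still outstanding, and completes SWQEs whose packets + * have all gone out.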
+ */ +void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) +{ + struct ib_other_headers *ohdr; + struct hfi1_qp_priv *priv = qp->priv; + struct rvt_swqe *wqe; + struct ib_header *hdr = NULL; + struct hfi1_16b_header *hdr_16b = NULL; + u32 opcode; + u32 psn; + + lockdep_assert_held(&qp->s_lock); + if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) + return; + + /* Find out where the BTH is */ + if (priv->hdr_type == HFI1_PKT_TYPE_9B) { + hdr = &opah->ibh; + if (ib_get_lnh(hdr) == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + } else { + u8 l4; + + hdr_16b = &opah->opah; + l4 = hfi1_16B_get_l4(hdr_16b); + if (l4 == OPA_16B_L4_IB_LOCAL) + ohdr = &hdr_16b->u.oth; + else + ohdr = &hdr_16b->u.l.oth; + } + + opcode = ib_bth_get_opcode(ohdr); + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + WARN_ON(!qp->s_rdma_ack_cnt); + qp->s_rdma_ack_cnt--; + return; + } + + psn = ib_bth_get_psn(ohdr); + reset_sending_psn(qp, psn); + + /* + * Start timer after a packet requesting an ACK has been sent and + * there are still requests that haven't been acked. + */ + if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + !(qp->s_flags & + (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + rvt_add_retry_timer(qp); + + while (qp->s_last != qp->s_acked) { + u32 s_last; + + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) + break; + s_last = qp->s_last; + trace_hfi1_qp_send_completion(qp, wqe, s_last); + if (++s_last >= qp->s_size) + s_last = 0; + qp->s_last = s_last; + /* see post_send() */ + barrier(); + rvt_put_swqe(wqe); + rvt_qp_swqe_complete(qp, + wqe, + ib_hfi1_wc_opcode[wqe->wr.opcode], + IB_WC_SUCCESS); + } + /* + * If we were waiting for sends to complete before re-sending, + * and they are now complete, restart sending. + */ + trace_hfi1_sendcomplete(qp, psn); + if (qp->s_flags & RVT_S_WAIT_PSN && + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + qp->s_flags &= ~RVT_S_WAIT_PSN; + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + hfi1_schedule_send(qp); + } +} + +static inline void update_last_psn(struct rvt_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + +/* + * Generate a SWQE completion. + * This is similar to hfi1_send_complete but has to check to be sure + * that the SGEs are not being referenced if the SWQE is being resent. + */ +static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp) +{ + lockdep_assert_held(&qp->s_lock); + /* + * Don't decrement refcount and don't generate a + * completion if the SWQE is being resent until the send + * is finished. + */ + if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + u32 s_last; + + rvt_put_swqe(wqe); + s_last = qp->s_last; + trace_hfi1_qp_send_completion(qp, wqe, s_last); + if (++s_last >= qp->s_size) + s_last = 0; + qp->s_last = s_last; + /* see post_send() */ + barrier(); + rvt_qp_swqe_complete(qp, + wqe, + ib_hfi1_wc_opcode[wqe->wr.opcode], + IB_WC_SUCCESS); + } else { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + this_cpu_inc(*ibp->rvp.rc_delayed_comp); + /* + * If send progress not running attempt to progress + * SDMA queue. 
+ */ + if (ppd->dd->flags & HFI1_HAS_SEND_DMA) { + struct sdma_engine *engine; + u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr); + u8 sc5; + + /* For now use sc to find engine */ + sc5 = ibp->sl_to_sc[sl]; + engine = qp_to_sdma_engine(qp, sc5); + sdma_engine_progress_schedule(engine); + } + } + + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, wqe->lpsn); + + /* + * If we are completing a request which is in the process of + * being resent, we can stop re-sending it since we know the + * responder has already seen it. + */ + if (qp->s_acked == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_acked = qp->s_cur; + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + if (qp->s_acked != qp->s_tail) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } + } else { + if (++qp->s_acked >= qp->s_size) + qp->s_acked = 0; + if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) + qp->s_draining = 0; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + } + return wqe; +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from rc_rcv_resp() to process an incoming RC ACK + * for the given QP. + * May be called at interrupt level, with the QP s_lock held. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct hfi1_ctxtdata *rcd) +{ + struct hfi1_ibport *ibp; + enum ib_wc_status status; + struct rvt_swqe *wqe; + int ret = 0; + u32 ack_psn; + int diff; + + lockdep_assert_held(&qp->s_lock); + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + ack_psn = psn; + if (aeth >> IB_AETH_NAK_SHIFT) + ack_psn--; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + ibp = rcd_to_iport(rcd); + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail_stop; + } + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { + /* Retry this request. 
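+			  * (The response for the outstanding RDMA read or atomic has not been + * seen, so flag the QP with RVT_R_RDMAR_SEQ and restart the requester + * at qp->s_last_psn + 1; the ACK/NAK itself is then ignored.)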
*/ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, + &rcd->qp_wait_list); + } + } + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail_stop; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + u64 *vaddr = wqe->sg_list[0].vaddr; + *vaddr = val; + } + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if ((qp->s_flags & RVT_S_WAIT_FENCE) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~(RVT_S_WAIT_FENCE | + RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } else if (qp->s_flags & RVT_S_WAIT_RDMAR) { + qp->s_flags &= ~(RVT_S_WAIT_RDMAR | + RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } + } + wqe = do_rc_completion(qp, wqe, ibp); + if (qp->s_acked == qp->s_tail) + break; + } + + switch (aeth >> IB_AETH_NAK_SHIFT) { + case 0: /* ACK */ + this_cpu_inc(*ibp->rvp.rc_acks); + if (qp->s_acked != qp->s_tail) { + /* + * We are expecting more ACKs so + * mod the retry timer. + */ + rvt_mod_retry_timer(qp); + /* + * We can stop re-sending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (cmp_psn(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } else { + /* No more acks - kill all timers */ + rvt_stop_rc_timers(qp); + if (cmp_psn(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } + } + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; + hfi1_schedule_send(qp); + } + rvt_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, psn); + return 1; + + case 1: /* RNR NAK */ + ibp->rvp.n_rnr_naks++; + if (qp->s_acked == qp->s_tail) + goto bail_stop; + if (qp->s_flags & RVT_S_WAIT_RNR) + goto bail_stop; + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + + ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); + + reset_psn(qp, psn); + + qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); + rvt_stop_rc_timers(qp); + rvt_add_rnr_timer(qp, aeth); + return 0; + + case 3: /* NAK */ + if (qp->s_acked == qp->s_tail) + goto bail_stop; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + switch ((aeth >> IB_AETH_CREDIT_SHIFT) & + IB_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + ibp->rvp.n_seq_naks++; + /* + * Back up to the responder's expected PSN. + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. 
+ */ + hfi1_restart_rc(qp, psn, 0); + hfi1_schedule_send(qp); + break; + + case 1: /* Invalid Request */ + status = IB_WC_REM_INV_REQ_ERR; + ibp->rvp.n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + status = IB_WC_REM_ACCESS_ERR; + ibp->rvp.n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + status = IB_WC_REM_OP_ERR; + ibp->rvp.n_other_naks++; +class_b: + if (qp->s_last == qp->s_acked) { + hfi1_send_complete(qp, wqe, status); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_retry = qp->s_retry_cnt; + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail_stop; + + default: /* 2: reserved */ +reserved: + /* Ignore reserved NAK codes. */ + goto bail_stop; + } + /* cannot be reached */ +bail_stop: + rvt_stop_rc_timers(qp); + return ret; +} + +/* + * We have seen an out of sequence RDMA read middle or last packet. + * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. + */ +static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, + struct hfi1_ctxtdata *rcd) +{ + struct rvt_swqe *wqe; + + lockdep_assert_held(&qp->s_lock); + /* Remove QP from retry timer */ + rvt_stop_rc_timers(qp); + + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + + while (cmp_psn(psn, wqe->lpsn) > 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + break; + wqe = do_rc_completion(qp, wqe, ibp); + } + + ibp->rvp.n_rdma_seq++; + qp->r_flags |= RVT_R_RDMAR_SEQ; + hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +/** + * rc_rcv_resp - process an incoming RC response packet + * @packet: data packet information + * + * This is called from hfi1_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static void rc_rcv_resp(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + void *data = packet->payload; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp; + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_swqe *wqe; + enum ib_wc_status status; + unsigned long flags; + int diff; + u64 val; + u32 aeth; + u32 psn = ib_bth_get_psn(packet->ohdr); + u32 pmtu = qp->pmtu; + u16 hdrsize = packet->hlen; + u8 opcode = packet->opcode; + u8 pad = packet->pad; + u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); + + spin_lock_irqsave(&qp->s_lock, flags); + trace_hfi1_ack(qp, psn); + + /* Ignore invalid responses. */ + if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + diff = cmp_psn(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + aeth = be32_to_cpu(ohdr->u.aeth); + if ((aeth >> IB_AETH_NAK_SHIFT) == 0) + rvt_get_credit(qp, aeth); + } + goto ack_done; + } + + /* + * Skip everything other than the PSN we expect, if we are waiting + * for a reply to a restarted RDMA read or atomic op. 
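+	 * (The restart resends from qp->s_last_psn + 1, so only a response + * with exactly that PSN clears RVT_R_RDMAR_SEQ; any other PSN is stale + * and is dropped.)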
+ */ + if (qp->r_flags & RVT_R_RDMAR_SEQ) { + if (cmp_psn(psn, qp->s_last_psn + 1) != 0) + goto ack_done; + qp->r_flags &= ~RVT_R_RDMAR_SEQ; + } + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_done; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + status = IB_WC_SUCCESS; + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + aeth = be32_to_cpu(ohdr->u.aeth); + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) + val = ib_u64_get(&ohdr->u.at.atomic_ack_eth); + else + val = 0; + if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; +read_middle: + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + + /* + * We got a response so update the timeout. + * 4.096 usec. * (1 << qp->timeout) + */ + rvt_mod_retry_timer(qp); + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; + hfi1_schedule_send(qp); + } + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. + */ + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); + spin_unlock_irqrestore(&qp->s_lock, flags); + hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false); + goto bail; + + case OP(RDMA_READ_RESPONSE_ONLY): + aeth = be32_to_cpu(ohdr->u.aeth); + if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) + goto ack_done; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for ICRC (4). + */ + if (unlikely(tlen < (hdrsize + extra_bytes))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for ICRC (4). 
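+		 * (i.e. a LAST response must carry at least one payload byte, hence + * the strict inequality in the length check below.)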
+ */ + if (unlikely(tlen <= (hdrsize + extra_bytes))) + goto ack_len_err; +read_last: + tlen -= hdrsize + extra_bytes; + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; + aeth = be32_to_cpu(ohdr->u.aeth); + hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false); + WARN_ON(qp->s_rdma_read_sge.num_sge); + (void)do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0, rcd); + goto ack_done; + } + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_seq_err: + ibp = rcd_to_iport(rcd); + rdma_seq_err(qp, ibp, psn, rcd); + goto ack_done; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + if (qp->s_last == qp->s_acked) { + hfi1_send_complete(qp, wqe, status); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, + struct rvt_qp *qp) +{ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +static inline void rc_cancel_ack(struct rvt_qp *qp) +{ + qp->r_adefered = 0; + if (list_empty(&qp->rspwait)) + return; + list_del_init(&qp->rspwait); + qp->r_flags &= ~RVT_R_RSP_NAK; + rvt_put_qp(qp); +} + +/** + * rc_rcv_error - process an incoming duplicate or error RC packet + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * + * This is called from hfi1_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent. + */ +static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, + struct rvt_qp *qp, u32 opcode, u32 psn, + int diff, struct hfi1_ctxtdata *rcd) +{ + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct rvt_ack_entry *e; + unsigned long flags; + u8 i, prev; + int old_req; + + trace_hfi1_rcv_error(qp, psn); + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if we already sent one. + */ + if (!qp->r_nak_state) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence NAK until all packets + * in the receive queue have been processed. + * Otherwise, we end up propagating congestion. + */ + rc_defered_ack(rcd, qp); + } + goto done; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. + * old_req is true if there is an older response that is scheduled + * to be sent before sending this one. 
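+	 * The loop below scans backward from r_head_ack_queue through the + * previously queued responses until it finds the entry whose PSN range + * covers this duplicate, or gives up if no such entry exists.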
+ */ + e = NULL; + old_req = 1; + ibp->rvp.n_rc_dupreq++; + + spin_lock_irqsave(&qp->s_lock, flags); + + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = HFI1_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (cmp_psn(psn, e->psn) >= 0) { + if (prev == qp->s_tail_ack_queue && + cmp_psn(psn, e->lpsn) <= 0) + old_req = 0; + break; + } + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST)) + goto unlock_done; + /* RETH comes after BTH */ + reth = &ohdr->u.rc.reth; + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. + */ + offset = delta_psn(psn, e->psn) * qp->pmtu; + len = be32_to_cpu(reth->length); + if (unlikely(offset + len != e->rdma_sge.sge_length)) + goto unlock_done; + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + if (len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = get_ib_reth_vaddr(reth); + int ok; + + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock_done; + } else { + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->psn = psn; + if (old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + /* + * If we didn't find the atomic request in the ack queue + * or the send engine is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != (u8)opcode || old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + default: + /* + * Ignore this operation if it doesn't request an ACK + * or an earlier RDMA read or atomic is going to be resent. + */ + if (!(psn & IB_BTH_REQ_ACK) || old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. 
+ */ + qp->s_tail_ack_queue = i; + break; + } + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_flags |= RVT_S_RESP_PENDING; + qp->r_nak_state = 0; + hfi1_schedule_send(qp); + +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; + +send_ack: + return 0; +} + +static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) +{ + unsigned next; + + next = n + 1; + if (next > HFI1_MAX_RDMA_ATOMIC) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, + u32 lqpn, u32 rqpn, u8 svc_type) +{ + struct opa_hfi1_cong_log_event_internal *cc_event; + unsigned long flags; + + if (sl >= OPA_MAX_SLS) + return; + + spin_lock_irqsave(&ppd->cc_log_lock, flags); + + ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8); + ppd->threshold_event_counter++; + + cc_event = &ppd->cc_events[ppd->cc_log_idx++]; + if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS) + ppd->cc_log_idx = 0; + cc_event->lqpn = lqpn & RVT_QPN_MASK; + cc_event->rqpn = rqpn & RVT_QPN_MASK; + cc_event->sl = sl; + cc_event->svc_type = svc_type; + cc_event->rlid = rlid; + /* keep timestamp in units of 1.024 usec */ + cc_event->timestamp = ktime_get_ns() / 1024; + + spin_unlock_irqrestore(&ppd->cc_log_lock, flags); +} + +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, + u32 rqpn, u8 svc_type) +{ + struct cca_timer *cca_timer; + u16 ccti, ccti_incr, ccti_timer, ccti_limit; + u8 trigger_threshold; + struct cc_state *cc_state; + unsigned long flags; + + if (sl >= OPA_MAX_SLS) + return; + + cc_state = get_cc_state(ppd); + + if (!cc_state) + return; + + /* + * 1) increase CCTI (for this SL) + * 2) select IPG (i.e., call set_link_ipg()) + * 3) start timer + */ + ccti_limit = cc_state->cct.ccti_limit; + ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase; + ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; + trigger_threshold = + cc_state->cong_setting.entries[sl].trigger_threshold; + + spin_lock_irqsave(&ppd->cca_timer_lock, flags); + + cca_timer = &ppd->cca_timer[sl]; + if (cca_timer->ccti < ccti_limit) { + if (cca_timer->ccti + ccti_incr <= ccti_limit) + cca_timer->ccti += ccti_incr; + else + cca_timer->ccti = ccti_limit; + set_link_ipg(ppd); + } + + ccti = cca_timer->ccti; + + if (!hrtimer_active(&cca_timer->hrtimer)) { + /* ccti_timer is in units of 1.024 usec */ + unsigned long nsec = 1024 * ccti_timer; + + hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec), + HRTIMER_MODE_REL); + } + + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + + if ((trigger_threshold != 0) && (ccti >= trigger_threshold)) + log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type); +} + +/** + * hfi1_rc_rcv - process an incoming RC packet + * @packet: data packet information + * + * This is called from qp_rcv() to process an incoming RC packet + * for the given QP. + * May be called at interrupt level. 
+ */ +void hfi1_rc_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + void *data = packet->payload; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct ib_other_headers *ohdr = packet->ohdr; + u32 opcode = packet->opcode; + u32 hdrsize = packet->hlen; + u32 psn = ib_bth_get_psn(packet->ohdr); + u32 pad = packet->pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + int diff; + struct ib_reth *reth; + unsigned long flags; + int ret; + bool is_fecn = false; + bool copy_last = false; + u32 rkey; + u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); + + lockdep_assert_held(&qp->r_lock); + + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + is_fecn = process_ecn(qp, packet, false); + + /* + * Process responses (ACKs) before anything else. Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + rc_rcv_resp(packet); + if (is_fecn) + goto send_ack; + return; + } + + /* Compute 24 bits worth of difference. */ + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) + return; + goto send_ack; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(SEND_LAST_WITH_INVALIDATE)) + break; + goto nack_inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(SEND_LAST_WITH_INVALIDATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. + */ + break; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): +send_middle: + /* Check for invalid length PMTU or posted rwqe len. 
*/ + /* + * There will be no padding for 9B packet but 16B packets + * will come in with some padding since we always add + * CRC and LT bytes which will need to be flit aligned + */ + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + ret = hfi1_rvt_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + case OP(SEND_ONLY_WITH_INVALIDATE): + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + if (opcode == OP(SEND_ONLY_WITH_INVALIDATE)) + goto send_last_inv; + /* FALLTHROUGH -- for SEND_ONLY_WITH_IMMEDIATE */ + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST_WITH_INVALIDATE): +send_last_inv: + rkey = be32_to_cpu(ohdr->u.ieth); + if (rvt_invalidate_rkey(qp, rkey)) + goto no_immediate_data; + wc.ex.invalidate_rkey = rkey; + wc.wc_flags = IB_WC_WITH_INVALIDATE; + goto send_last; + case OP(RDMA_WRITE_LAST): + copy_last = rvt_is_user_qp(qp); + /* fall through */ + case OP(SEND_LAST): +no_immediate_data: + wc.wc_flags = 0; + wc.ex.imm_data = 0; +send_last: + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + extra_bytes))) + goto nack_inv; + /* Don't count the CRC(and padding and LT byte for 16B). */ + tlen -= (hdrsize + extra_bytes); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last); + rvt_put_ss(&qp->r_sge); + qp->r_msn++; + if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; + /* + * It seems that IB mandates the presence of an SL in a + * work completion only for the UD transport (see section + * 11.4.2 of IBTA Vol. 1). + * + * However, the way the SL is chosen below is consistent + * with the way that IB/qib works and is trying avoid + * introducing incompatibilities. + * + * See also OPA Vol. 1, section 9.7.6, and table 9-17. + */ + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. 
*/ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + ib_bth_is_solicited(ohdr)); + break; + + case OP(RDMA_WRITE_ONLY): + copy_last = rvt_is_user_qp(qp); + /* fall through */ + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + /* consume RWQE */ + reth = &ohdr->u.rc.reth; + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = get_ib_reth_vaddr(reth); + int ok; + + /* Check rkey & NAK */ + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto nack_acc; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto no_immediate_data; + ret = hfi1_rvt_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) { + /* peer will send again */ + rvt_put_ss(&qp->r_sge); + goto rnr_nak; + } + wc.ex.imm_data = ohdr->u.rc.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + + case OP(RDMA_READ_REQUEST): { + struct rvt_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */ + if (next > HFI1_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + reth = &ohdr->u.rc.reth; + len = be32_to_cpu(reth->length); + if (len) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = get_ib_reth_vaddr(reth); + int ok; + + /* Check rkey & NAK */ + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + qp->r_psn += rvt_div_mtu(qp, len - 1); + } else { + e->rdma_sge.mr = NULL; + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = qp->r_psn; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send engine. 
*/ + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth; + struct rvt_ack_entry *e; + u64 vaddr; + atomic64_t *maddr; + u64 sdata; + u32 rkey; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > HFI1_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + ateth = &ohdr->u.atomic_eth; + vaddr = get_ib_ateth_vaddr(ateth); + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv_unlck; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc_unlck; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; + sdata = get_ib_ateth_swap(ateth); + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, + get_ib_ateth_compare(ateth), + sdata); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = psn; + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send engine. */ + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + } + + default: + /* NAK unknown opcodes. */ + goto nack_inv; + } + qp->r_psn++; + qp->r_state = opcode; + qp->r_ack_psn = psn; + qp->r_nak_state = 0; + /* Send an ACK if requested or required. 
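+	 * ACKs are normally coalesced: r_adefered is bumped and the ACK is + * deferred via rc_defered_ack(); the cases below instead cancel any + * deferred ACK and send one immediately.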
*/ + if (psn & IB_BTH_REQ_ACK) { + if (packet->numpkt == 0) { + rc_cancel_ack(qp); + goto send_ack; + } + if (qp->r_adefered >= HFI1_PSN_CREDIT) { + rc_cancel_ack(qp); + goto send_ack; + } + if (unlikely(is_fecn)) { + rc_cancel_ack(qp); + goto send_ack; + } + qp->r_adefered++; + rc_defered_ack(rcd, qp); + } + return; + +rnr_nak: + qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK; + qp->r_ack_psn = qp->r_psn; + /* Queue RNR NAK for later */ + rc_defered_ack(rcd, qp); + return; + +nack_op_err: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_acc: + rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + hfi1_send_rc_ack(packet, is_fecn); +} + +void hfi1_rc_hdrerr( + struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet, + struct rvt_qp *qp) +{ + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + int diff; + u32 opcode; + u32 psn; + + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + psn = ib_bth_get_psn(packet->ohdr); + opcode = ib_bth_get_opcode(packet->ohdr); + + /* Only deal with RDMA Writes for now */ + if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { + diff = delta_psn(psn, qp->r_psn); + if (!qp->r_nak_state && diff >= 0) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence + * NAK until all packets + * in the receive queue have + * been processed. + * Otherwise, we end up + * propagating congestion. + */ + rc_defered_ack(rcd, qp); + } /* Out of sequence NAK */ + } /* QP Request NAKs */ +} diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c new file mode 100644 index 0000000..c0071ca --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -0,0 +1,1110 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#include "hfi.h" +#include "mad.h" +#include "qp.h" +#include "verbs_txreq.h" +#include "trace.h" + +/* + * Validate a RWQE and fill in the SGE state. + * Return 1 if OK. + */ +static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) +{ + int i, j, ret; + struct ib_wc wc; + struct rvt_lkey_table *rkt; + struct rvt_pd *pd; + struct rvt_sge_state *ss; + + rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table; + pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); + ss = &qp->r_sge; + ss->sg_list = qp->r_sg_list; + qp->r_len = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + NULL, &wqe->sg_list[i], + IB_ACCESS_LOCAL_WRITE); + if (unlikely(ret <= 0)) + goto bad_lkey; + qp->r_len += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ss->total_len = qp->r_len; + ret = 1; + goto bail; + +bad_lkey: + while (j) { + struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; + + rvt_put_mr(sge->mr); + } + ss->num_sge = 0; + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + +/** + * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return -1 if there is a local error, 0 if no RWQE is available, + * otherwise return 1. + * + * Can be called from interrupt level. + */ +int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only) +{ + unsigned long flags; + struct rvt_rq *rq; + struct rvt_rwq *wq; + struct rvt_srq *srq; + struct rvt_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = ibsrq_to_rvtsrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. 
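+	 * (The head index is written by whoever posts receive WQEs; for user + * QPs the queue is mapped into user space, hence the tail validation + * above. The barrier orders the entry read after the head read.)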
*/ + smp_rmb(); + wqe = rvt_get_rwqe_ptr(rq, tail); + /* + * Even though we update the tail index in memory, the verbs + * consumer is not supposed to post more entries until a + * completion is generated. + */ + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + if (!wr_id_only && !init_sge(qp, wqe)) { + ret = -1; + goto unlock; + } + qp->r_wr_id = wqe->wr_id; + + ret = 1; + set_bit(RVT_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * Validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} + +static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) +{ + return (gid->global.interface_id == id && + (gid->global.subnet_prefix == gid_prefix || + gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX)); +} + +/* + * + * This should be called with the QP r_lock held. + * + * The s_lock will be acquired around the hfi1_migrate_qp() call. + */ +int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet) +{ + __be64 guid; + unsigned long flags; + struct rvt_qp *qp = packet->qp; + u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; + u32 dlid = packet->dlid; + u32 slid = packet->slid; + u32 sl = packet->sl; + bool migrated = packet->migrated; + u16 pkey = packet->pkey; + + if (qp->s_mig_state == IB_MIG_ARMED && migrated) { + if (!packet->grh) { + if ((rdma_ah_get_ah_flags(&qp->alt_ah_attr) & + IB_AH_GRH) && + (packet->etype != RHF_RCV_TYPE_BYPASS)) + return 1; + } else { + const struct ib_global_route *grh; + + if (!(rdma_ah_get_ah_flags(&qp->alt_ah_attr) & + IB_AH_GRH)) + return 1; + grh = rdma_ah_read_grh(&qp->alt_ah_attr); + guid = get_sguid(ibp, grh->sgid_index); + if (!gid_ok(&packet->grh->dgid, ibp->rvp.gid_prefix, + guid)) + return 1; + if (!gid_ok( + &packet->grh->sgid, + grh->dgid.global.subnet_prefix, + grh->dgid.global.interface_id)) + return 1; + } + if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), pkey, + sc5, slid))) { + hfi1_bad_pkey(ibp, pkey, sl, 0, qp->ibqp.qp_num, + slid, dlid); + return 1; + } + /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */ + if (slid != rdma_ah_get_dlid(&qp->alt_ah_attr) || + ppd_from_ibp(ibp)->port != + rdma_ah_get_port_num(&qp->alt_ah_attr)) + return 1; + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_migrate_qp(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else { + if (!packet->grh) { + if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & + IB_AH_GRH) && + (packet->etype != RHF_RCV_TYPE_BYPASS)) + return 1; + } else { + const struct ib_global_route *grh; + + if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & + IB_AH_GRH)) + return 1; + grh = rdma_ah_read_grh(&qp->remote_ah_attr); + guid = get_sguid(ibp, grh->sgid_index); + if (!gid_ok(&packet->grh->dgid, ibp->rvp.gid_prefix, + guid)) + return 1; + if (!gid_ok( + &packet->grh->sgid, + grh->dgid.global.subnet_prefix, + grh->dgid.global.interface_id)) + return 1; + } + if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), pkey, + sc5, slid))) { + hfi1_bad_pkey(ibp, pkey, sl, 0, qp->ibqp.qp_num, + slid, dlid); + return 1; + } + /* Validate the SLID. See Ch. 
9.6.1.5 */ + if ((slid != rdma_ah_get_dlid(&qp->remote_ah_attr)) || + ppd_from_ibp(ibp)->port != qp->port_num) + return 1; + if (qp->s_mig_state == IB_MIG_REARM && !migrated) + qp->s_mig_state = IB_MIG_ARMED; + } + + return 0; +} + +/** + * ruc_loopback - handle UC and RC loopback requests + * @sqp: the sending QP + * + * This is called from hfi1_do_send() to + * forward a WQE addressed to the same HFI. + * Note that although we are single threaded due to the send engine, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +static void ruc_loopback(struct rvt_qp *sqp) +{ + struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + bool release; + int ret; + bool copy_last = false; + int local_ops = 0; + + rcu_read_lock(); + + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, + sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || + !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= RVT_S_BUSY; + +again: + if (sqp->s_last == READ_ONCE(sqp->s_head)) + goto clr_busy; + wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work request. */ + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || + qp->ibqp.qp_type != sqp->ibqp.qp_type) { + ibp->rvp.n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. 
+ */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof(wc)); + send_status = IB_WC_SUCCESS; + + release = true; + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_REG_MR: + goto send_comp; + + case IB_WR_LOCAL_INV: + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + if (rvt_invalidate_rkey(sqp, + wqe->wr.ex.invalidate_rkey)) + send_status = IB_WC_LOC_PROT_ERR; + local_ops = 1; + } + goto send_comp; + + case IB_WR_SEND_WITH_INV: + if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { + wc.wc_flags = IB_WC_WITH_INVALIDATE; + wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; + } + goto send; + + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: +send: + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + ret = hfi1_rvt_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + /* skip copy_last set and qp_access_flags recheck */ + goto do_write; + case IB_WR_RDMA_WRITE: + copy_last = rvt_is_user_qp(qp); + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; +do_write: + if (wqe->length == 0) + break; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + qp->r_sge.sg_list = NULL; + qp->r_sge.num_sge = 1; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + release = false; + sqp->s_sge.sg_list = NULL; + sqp->s_sge.num_sge = 1; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->atomic_wr.remote_addr, + wqe->atomic_wr.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; + sdata = wqe->atomic_wr.compare_add; + *(u64 *)sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? 
+ (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, + sdata, wqe->atomic_wr.swap); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (!release) + rvt_put_mr(sge->mr); + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + if (release) + rvt_put_ss(&qp->r_sge); + + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); + ibp->rvp.n_loop_pkts++; +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + hfi1_send_complete(sqp, wqe, send_status); + if (local_ops) { + atomic_dec(&sqp->local_ops_pending); + local_ops = 0; + } + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + ibp->rvp.n_rnr_naks++; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. 
+ */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) + goto clr_busy; + rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << + IB_AETH_CREDIT_SHIFT); + goto clr_busy; + +op_err: + send_status = IB_WC_REM_OP_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + rvt_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + hfi1_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~RVT_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~RVT_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + rcu_read_unlock(); +} + +/** + * hfi1_make_grh - construct a GRH header + * @ibp: a pointer to the IB port + * @hdr: a pointer to the GRH header being constructed + * @grh: the global route address to send to + * @hwords: size of header after grh being sent in dwords + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, + const struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) | + (grh->traffic_class << IB_GRH_TCLASS_SHIFT) | + (grh->flow_label << IB_GRH_FLOW_SHIFT)); + hdr->paylen = cpu_to_be16((hwords + nwords) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = IB_GRH_NEXT_HDR; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; + hdr->sgid.global.interface_id = + grh->sgid_index < HFI1_GUIDS_PER_PORT ? + get_sguid(ibp, grh->sgid_index) : + get_sguid(ibp, HFI1_PORT_GUID_INDEX); + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +#define BTH2_OFFSET (offsetof(struct hfi1_sdma_header, \ + hdr.ibh.u.oth.bth[2]) / 4) + +/** + * build_ahg - create ahg in s_ahg + * @qp: a pointer to QP + * @npsn: the next PSN for the request/response + * + * This routine handles the AHG by allocating an ahg entry and causing the + * copy of the first middle. + * + * Subsequent middles use the copied entry, editing the + * PSN with 1 or 2 edits. 
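+ * (The first edit rewrites the low 16 bits of the PSN in BTH2; a + * second edit is added only when the upper PSN bits have changed since + * the AHG entry was built.)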
+ */ +static inline void build_ahg(struct rvt_qp *qp, u32 npsn) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (unlikely(qp->s_flags & RVT_S_AHG_CLEAR)) + clear_ahg(qp); + if (!(qp->s_flags & RVT_S_AHG_VALID)) { + /* first middle that needs copy */ + if (qp->s_ahgidx < 0) + qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde); + if (qp->s_ahgidx >= 0) { + qp->s_ahgpsn = npsn; + priv->s_ahg->tx_flags |= SDMA_TXREQ_F_AHG_COPY; + /* save to protect a change in another thread */ + priv->s_ahg->ahgidx = qp->s_ahgidx; + qp->s_flags |= RVT_S_AHG_VALID; + } + } else { + /* subsequent middle after valid */ + if (qp->s_ahgidx >= 0) { + priv->s_ahg->tx_flags |= SDMA_TXREQ_F_USE_AHG; + priv->s_ahg->ahgidx = qp->s_ahgidx; + priv->s_ahg->ahgcount++; + priv->s_ahg->ahgdesc[0] = + sdma_build_ahg_descriptor( + (__force u16)cpu_to_be16((u16)npsn), + BTH2_OFFSET, + 16, + 16); + if ((npsn & 0xffff0000) != + (qp->s_ahgpsn & 0xffff0000)) { + priv->s_ahg->ahgcount++; + priv->s_ahg->ahgdesc[1] = + sdma_build_ahg_descriptor( + (__force u16)cpu_to_be16( + (u16)(npsn >> 16)), + BTH2_OFFSET, + 0, + 16); + } + } + } +} + +static inline void hfi1_make_ruc_bth(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth1, u32 bth2) +{ + bth1 |= qp->remote_qpn; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(bth1); + ohdr->bth[2] = cpu_to_be32(bth2); +} + +/** + * hfi1_make_ruc_header_16B - build a 16B header + * @qp: the queue pair + * @ohdr: a pointer to the destination header memory + * @bth0: bth0 passed in from the RC/UC builder + * @bth2: bth2 passed in from the RC/UC builder + * @middle: non zero implies indicates ahg "could" be used + * @ps: the current packet state + * + * This routine may disarm ahg under these situations: + * - packet needs a GRH + * - BECN needed + * - migration state not IB_MIG_MIGRATED + */ +static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = ps->ibp; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 bth1 = 0; + u32 slid; + u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + u8 l4 = OPA_16B_L4_IB_LOCAL; + u8 extra_bytes = hfi1_get_16b_padding( + (ps->s_txreq->hdr_dwords << 2), + ps->s_txreq->s_cur_size); + u32 nwords = SIZE_OF_CRC + ((ps->s_txreq->s_cur_size + + extra_bytes + SIZE_OF_LT) >> 2); + bool becn = false; + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) { + struct ib_grh *grh; + struct ib_global_route *grd = + rdma_ah_retrieve_grh(&qp->remote_ah_attr); + /* + * Ensure OPA GIDs are transformed to IB gids + * before creating the GRH. 
+ */ + if (grd->sgid_index == OPA_GID_INDEX) + grd->sgid_index = 0; + grh = &ps->s_txreq->phdr.hdr.opah.u.l.grh; + l4 = OPA_16B_L4_IB_GLOBAL; + ps->s_txreq->hdr_dwords += + hfi1_make_grh(ibp, grh, grd, + ps->s_txreq->hdr_dwords - LRH_16B_DWORDS, + nwords); + middle = 0; + } + + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth1 |= OPA_BTH_MIG_REQ; + else + middle = 0; + + if (qp->s_flags & RVT_S_ECN) { + qp->s_flags &= ~RVT_S_ECN; + /* we recently received a FECN, so return a BECN */ + becn = true; + middle = 0; + } + if (middle) + build_ahg(qp, bth2); + else + qp->s_flags &= ~RVT_S_AHG_VALID; + + bth0 |= pkey; + bth0 |= extra_bytes << 20; + hfi1_make_ruc_bth(qp, ohdr, bth0, bth1, bth2); + + if (!ppd->lid) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); + else + slid = ppd->lid | + (rdma_ah_get_path_bits(&qp->remote_ah_attr) & + ((1 << ppd->lmc) - 1)); + + hfi1_make_16b_hdr(&ps->s_txreq->phdr.hdr.opah, + slid, + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), + 16B), + (ps->s_txreq->hdr_dwords + nwords) >> 1, + pkey, becn, 0, l4, priv->s_sc); +} + +/** + * hfi1_make_ruc_header_9B - build a 9B header + * @qp: the queue pair + * @ohdr: a pointer to the destination header memory + * @bth0: bth0 passed in from the RC/UC builder + * @bth2: bth2 passed in from the RC/UC builder + * @middle: non zero implies indicates ahg "could" be used + * @ps: the current packet state + * + * This routine may disarm ahg under these situations: + * - packet needs a GRH + * - BECN needed + * - migration state not IB_MIG_MIGRATED + */ +static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = ps->ibp; + u32 bth1 = 0; + u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + u16 lrh0 = HFI1_LRH_BTH; + u8 extra_bytes = -ps->s_txreq->s_cur_size & 3; + u32 nwords = SIZE_OF_CRC + ((ps->s_txreq->s_cur_size + + extra_bytes) >> 2); + + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { + struct ib_grh *grh = &ps->s_txreq->phdr.hdr.ibh.u.l.grh; + + lrh0 = HFI1_LRH_GRH; + ps->s_txreq->hdr_dwords += + hfi1_make_grh(ibp, grh, + rdma_ah_read_grh(&qp->remote_ah_attr), + ps->s_txreq->hdr_dwords - LRH_9B_DWORDS, + nwords); + middle = 0; + } + lrh0 |= (priv->s_sc & 0xf) << 12 | + (rdma_ah_get_sl(&qp->remote_ah_attr) & 0xf) << 4; + + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + else + middle = 0; + + if (qp->s_flags & RVT_S_ECN) { + qp->s_flags &= ~RVT_S_ECN; + /* we recently received a FECN, so return a BECN */ + bth1 |= (IB_BECN_MASK << IB_BECN_SHIFT); + middle = 0; + } + if (middle) + build_ahg(qp, bth2); + else + qp->s_flags &= ~RVT_S_AHG_VALID; + + bth0 |= pkey; + bth0 |= extra_bytes << 20; + hfi1_make_ruc_bth(qp, ohdr, bth0, bth1, bth2); + hfi1_make_ib_hdr(&ps->s_txreq->phdr.hdr.ibh, + lrh0, + ps->s_txreq->hdr_dwords + nwords, + opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B), + ppd_from_ibp(ibp)->lid | + rdma_ah_get_path_bits(&qp->remote_ah_attr)); +} + +typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps); + +/* We support only two types - 9B and 16B for now */ +static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_ruc_header_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_ruc_header_16B +}; + +void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, + u32 bth0, u32 bth2, int 
middle, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + + /* + * reset s_ahg/AHG fields + * + * This insures that the ahgentry/ahgcount + * are at a non-AHG default to protect + * build_verbs_tx_desc() from using + * an include ahgidx. + * + * build_ahg() will modify as appropriate + * to use the AHG feature. + */ + priv->s_ahg->tx_flags = 0; + priv->s_ahg->ahgcount = 0; + priv->s_ahg->ahgidx = 0; + + /* Make the appropriate header */ + hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps); +} + +/* when sending, force a reschedule every one of these periods */ +#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */ + +/** + * schedule_send_yield - test for a yield required for QP send engine + * @timeout: Final time for timeout slice for jiffies + * @qp: a pointer to QP + * @ps: a pointer to a structure with commonly lookup values for + * the the send engine progress + * + * This routine checks if the time slice for the QP has expired + * for RC QPs, if so an additional work entry is queued. At this + * point, other QPs have an opportunity to be scheduled. It + * returns true if a yield is required, otherwise, false + * is returned. + */ +static bool schedule_send_yield(struct rvt_qp *qp, + struct hfi1_pkt_state *ps) +{ + ps->pkts_sent = true; + + if (unlikely(time_after(jiffies, ps->timeout))) { + if (!ps->in_thread || + workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) { + spin_lock_irqsave(&qp->s_lock, ps->flags); + qp->s_flags &= ~RVT_S_BUSY; + hfi1_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, ps->flags); + this_cpu_inc(*ps->ppd->dd->send_schedule); + trace_hfi1_rc_expired_time_slice(qp, true); + return true; + } + + cond_resched(); + this_cpu_inc(*ps->ppd->dd->send_schedule); + ps->timeout = jiffies + ps->timeout_int; + } + + trace_hfi1_rc_expired_time_slice(qp, false); + return false; +} + +void hfi1_do_send_from_rvt(struct rvt_qp *qp) +{ + hfi1_do_send(qp, false); +} + +void _hfi1_do_send(struct work_struct *work) +{ + struct iowait *wait = container_of(work, struct iowait, iowork); + struct rvt_qp *qp = iowait_to_qp(wait); + + hfi1_do_send(qp, true); +} + +/** + * hfi1_do_send - perform a send on a QP + * @work: contains a pointer to the QP + * @in_thread: true if in a workqueue thread + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP. + * Otherwise, two threads could send packets out of order. 
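The time-slice test in schedule_send_yield() above can be reduced to a small standalone model: process items until a deadline passes, then yield and start a new slice. The sketch below is an editorial illustration in userspace C, not part of the patch; now_ns(), process_one() and SLICE_NS are invented stand-ins for the jiffies comparison, the packet build/send step and timeout_int.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define SLICE_NS (5 * 1000 * 1000)          /* 5 ms slice, like timeout_int */

static long long now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static bool process_one(int i)
{
    return i < 100000;                      /* pretend work: stop after 100k items */
}

int main(void)
{
    long long deadline = now_ns() + SLICE_NS;
    int i = 0, yields = 0;

    while (process_one(i++)) {
        if (now_ns() > deadline) {          /* slice expired, like time_after(jiffies, timeout) */
            yields++;                       /* the driver would requeue the QP or cond_resched() here */
            deadline = now_ns() + SLICE_NS; /* start a fresh slice */
        }
    }
    printf("processed %d items, yielded %d times\n", i - 1, yields);
    return 0;
}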
+ */ +void hfi1_do_send(struct rvt_qp *qp, bool in_thread) +{ + struct hfi1_pkt_state ps; + struct hfi1_qp_priv *priv = qp->priv; + int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + + ps.dev = to_idev(qp->ibqp.device); + ps.ibp = to_iport(qp->ibqp.device, qp->port_num); + ps.ppd = ppd_from_ibp(ps.ibp); + ps.in_thread = in_thread; + + trace_hfi1_rc_do_send(qp, in_thread); + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & + ~((1 << ps.ppd->lmc) - 1)) == + ps.ppd->lid)) { + ruc_loopback(qp); + return; + } + make_req = hfi1_make_rc_req; + ps.timeout_int = qp->timeout_jiffies; + break; + case IB_QPT_UC: + if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & + ~((1 << ps.ppd->lmc) - 1)) == + ps.ppd->lid)) { + ruc_loopback(qp); + return; + } + make_req = hfi1_make_uc_req; + ps.timeout_int = SEND_RESCHED_TIMEOUT; + break; + default: + make_req = hfi1_make_ud_req; + ps.timeout_int = SEND_RESCHED_TIMEOUT; + } + + spin_lock_irqsave(&qp->s_lock, ps.flags); + + /* Return if we are already busy processing a work request. */ + if (!hfi1_send_ok(qp)) { + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + return; + } + + qp->s_flags |= RVT_S_BUSY; + + ps.timeout_int = ps.timeout_int / 8; + ps.timeout = jiffies + ps.timeout_int; + ps.cpu = priv->s_sde ? priv->s_sde->cpu : + cpumask_first(cpumask_of_node(ps.ppd->dd->node)); + ps.pkts_sent = false; + + /* insure a pre-built packet is handled */ + ps.s_txreq = get_waiting_verbs_txreq(qp); + do { + /* Check for a constructed packet to be sent. */ + if (ps.s_txreq) { + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + /* + * If the packet cannot be sent now, return and + * the send engine will be woken up later. + */ + if (hfi1_verbs_send(qp, &ps)) + return; + /* allow other tasks to run */ + if (schedule_send_yield(qp, &ps)) + return; + + spin_lock_irqsave(&qp->s_lock, ps.flags); + } + } while (make_req(qp, &ps)); + iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); + spin_unlock_irqrestore(&qp->s_lock, ps.flags); +} + +/* + * This should be called with s_lock held. + */ +void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + return; + + last = qp->s_last; + old_last = last; + trace_hfi1_qp_send_completion(qp, wqe, last); + if (++last >= qp->s_size) + last = 0; + trace_hfi1_qp_send_completion(qp, wqe, last); + qp->s_last = last; + /* See post_send() */ + barrier(); + rvt_put_swqe(wqe); + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); + + rvt_qp_swqe_complete(qp, + wqe, + ib_hfi1_wc_opcode[wqe->wr.opcode], + status); + + if (qp->s_acked == old_last) + qp->s_acked = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c new file mode 100644 index 0000000..1f20330 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -0,0 +1,3426 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. 
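hfi1_send_complete() above advances s_last around the send queue and drags any cursor still parked on the retired slot along with it. Below is a minimal userspace sketch of that circular-index bookkeeping, offered only as an illustration; struct toy_sq and its fields are invented names, not driver structures.

#include <assert.h>
#include <stdio.h>

struct toy_sq {
    unsigned size;          /* number of slots */
    unsigned last;          /* oldest un-completed entry */
    unsigned cur;           /* next entry to transmit */
    unsigned acked;         /* oldest un-acknowledged entry */
};

static void toy_complete_one(struct toy_sq *sq)
{
    unsigned old_last = sq->last;
    unsigned last = old_last + 1;

    if (last >= sq->size)   /* wrap, as in: if (++last >= qp->s_size) last = 0; */
        last = 0;
    sq->last = last;

    /* cursors still parked on the retired slot follow it forward */
    if (sq->acked == old_last)
        sq->acked = last;
    if (sq->cur == old_last)
        sq->cur = last;
}

int main(void)
{
    struct toy_sq sq = { .size = 4, .last = 3, .cur = 3, .acked = 3 };

    toy_complete_one(&sq);
    assert(sq.last == 0 && sq.cur == 0 && sq.acked == 0);
    printf("last=%u cur=%u acked=%u\n", sq.last, sq.cur, sq.acked);
    return 0;
}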
+ * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "common.h" +#include "qp.h" +#include "sdma.h" +#include "iowait.h" +#include "trace.h" + +/* must be a power of 2 >= 64 <= 32768 */ +#define SDMA_DESCQ_CNT 2048 +#define SDMA_DESC_INTR 64 +#define INVALID_TAIL 0xffff + +static uint sdma_descq_cnt = SDMA_DESCQ_CNT; +module_param(sdma_descq_cnt, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries"); + +static uint sdma_idle_cnt = 250; +module_param(sdma_idle_cnt, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)"); + +uint mod_num_sdma; +module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO); +MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use"); + +static uint sdma_desct_intr = SDMA_DESC_INTR; +module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptor before interrupt"); + +#define SDMA_WAIT_BATCH_SIZE 20 +/* max wait time for a SDMA engine to indicate it has halted */ +#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */ +/* all SDMA engine errors that cause a halt */ + +#define SD(name) SEND_DMA_##name +#define ALL_SDMA_ENG_HALT_ERRS \ + (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK)) + +/* sdma_sendctrl operations */ +#define SDMA_SENDCTRL_OP_ENABLE BIT(0) +#define SDMA_SENDCTRL_OP_INTENABLE BIT(1) +#define SDMA_SENDCTRL_OP_HALT BIT(2) +#define SDMA_SENDCTRL_OP_CLEANUP BIT(3) + +/* handle long defines */ +#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \ +SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK +#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \ +SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT + +static const char * const sdma_state_names[] = { + [sdma_state_s00_hw_down] = "s00_HwDown", + [sdma_state_s10_hw_start_up_halt_wait] = "s10_HwStartUpHaltWait", + [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait", + [sdma_state_s20_idle] = "s20_Idle", + [sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait", + [sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait", + [sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait", + [sdma_state_s60_idle_halt_wait] = "s60_IdleHaltWait", + [sdma_state_s80_hw_freeze] = "s80_HwFreeze", + [sdma_state_s82_freeze_sw_clean] = "s82_FreezeSwClean", + [sdma_state_s99_running] = "s99_Running", +}; + +#ifdef CONFIG_SDMA_VERBOSITY +static const char * const sdma_event_names[] = { + [sdma_event_e00_go_hw_down] = "e00_GoHwDown", + [sdma_event_e10_go_hw_start] = "e10_GoHwStart", + [sdma_event_e15_hw_halt_done] = 
"e15_HwHaltDone", + [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone", + [sdma_event_e30_go_running] = "e30_GoRunning", + [sdma_event_e40_sw_cleaned] = "e40_SwCleaned", + [sdma_event_e50_hw_cleaned] = "e50_HwCleaned", + [sdma_event_e60_hw_halted] = "e60_HwHalted", + [sdma_event_e70_go_idle] = "e70_GoIdle", + [sdma_event_e80_hw_freeze] = "e80_HwFreeze", + [sdma_event_e81_hw_frozen] = "e81_HwFrozen", + [sdma_event_e82_hw_unfreeze] = "e82_HwUnfreeze", + [sdma_event_e85_link_down] = "e85_LinkDown", + [sdma_event_e90_sw_halted] = "e90_SwHalted", +}; +#endif + +static const struct sdma_set_state_action sdma_action_table[] = { + [sdma_state_s00_hw_down] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s10_hw_start_up_halt_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 1, + .op_cleanup = 0, + }, + [sdma_state_s15_hw_start_up_clean_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 1, + }, + [sdma_state_s20_idle] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s30_sw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s40_hw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 1, + }, + [sdma_state_s50_hw_halt_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s60_idle_halt_wait] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 1, + .op_cleanup = 0, + }, + [sdma_state_s80_hw_freeze] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s82_freeze_sw_clean] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s99_running] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 0, + .go_s99_running_totrue = 1, + }, +}; + +#define SDMA_TAIL_UPDATE_THRESH 0x1F + +/* declare all statics here rather than keep sorting */ +static void sdma_complete(struct kref *); +static void sdma_finalput(struct sdma_state *); +static void sdma_get(struct sdma_state *); +static void sdma_hw_clean_up_task(unsigned long); +static void sdma_put(struct sdma_state *); +static void sdma_set_state(struct sdma_engine *, enum sdma_states); +static void sdma_start_hw_clean_up(struct sdma_engine *); +static void sdma_sw_clean_up_task(unsigned long); +static void sdma_sendctrl(struct sdma_engine *, unsigned); +static void init_sdma_regs(struct sdma_engine *, u32, uint); +static void sdma_process_event( + struct sdma_engine *sde, + enum sdma_events event); +static void __sdma_process_event( + struct sdma_engine *sde, + enum sdma_events event); +static void dump_sdma_state(struct sdma_engine *sde); +static void sdma_make_progress(struct sdma_engine *sde, u64 status); +static void sdma_desc_avail(struct sdma_engine *sde, uint avail); +static void sdma_flush_descq(struct sdma_engine *sde); + +/** + * sdma_state_name() - return state string from enum + * @state: state + */ +static const char *sdma_state_name(enum sdma_states state) +{ + return sdma_state_names[state]; +} + +static void sdma_get(struct sdma_state *ss) +{ + kref_get(&ss->kref); +} + +static void sdma_complete(struct kref *kref) +{ + struct sdma_state *ss = + container_of(kref, struct sdma_state, kref); + + complete(&ss->comp); +} + +static void sdma_put(struct sdma_state *ss) +{ + kref_put(&ss->kref, 
sdma_complete); +} + +static void sdma_finalput(struct sdma_state *ss) +{ + sdma_put(ss); + wait_for_completion(&ss->comp); +} + +static inline void write_sde_csr( + struct sdma_engine *sde, + u32 offset0, + u64 value) +{ + write_kctxt_csr(sde->dd, sde->this_idx, offset0, value); +} + +static inline u64 read_sde_csr( + struct sdma_engine *sde, + u32 offset0) +{ + return read_kctxt_csr(sde->dd, sde->this_idx, offset0); +} + +/* + * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for + * sdma engine 'sde' to drop to 0. + */ +static void sdma_wait_for_packet_egress(struct sdma_engine *sde, + int pause) +{ + u64 off = 8 * sde->this_idx; + struct hfi1_devdata *dd = sde->dd; + int lcnt = 0; + u64 reg_prev; + u64 reg = 0; + + while (1) { + reg_prev = reg; + reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS); + + reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK; + reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT; + if (reg == 0) + break; + /* counter is reest if accupancy count changes */ + if (reg != reg_prev) + lcnt = 0; + if (lcnt++ > 500) { + /* timed out - bounce the link */ + dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n", + __func__, sde->this_idx, (u32)reg); + queue_work(dd->pport->link_wq, + &dd->pport->link_bounce_work); + break; + } + udelay(1); + } +} + +/* + * sdma_wait() - wait for packet egress to complete for all SDMA engines, + * and pause for credit return. + */ +void sdma_wait(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->num_sdma; i++) { + struct sdma_engine *sde = &dd->per_sdma[i]; + + sdma_wait_for_packet_egress(sde, 0); + } +} + +static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt) +{ + u64 reg; + + if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT)) + return; + reg = cnt; + reg &= SD(DESC_CNT_CNT_MASK); + reg <<= SD(DESC_CNT_CNT_SHIFT); + write_sde_csr(sde, SD(DESC_CNT), reg); +} + +static inline void complete_tx(struct sdma_engine *sde, + struct sdma_txreq *tx, + int res) +{ + /* protect against complete modifying */ + struct iowait *wait = tx->wait; + callback_t complete = tx->complete; + +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + trace_hfi1_sdma_out_sn(sde, tx->sn); + if (WARN_ON_ONCE(sde->head_sn != tx->sn)) + dd_dev_err(sde->dd, "expected %llu got %llu\n", + sde->head_sn, tx->sn); + sde->head_sn++; +#endif + __sdma_txclean(sde->dd, tx); + if (complete) + (*complete)(tx, res); + if (wait && iowait_sdma_dec(wait)) + iowait_drain_wakeup(wait); +} + +/* + * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status + * + * Depending on timing there can be txreqs in two places: + * - in the descq ring + * - in the flush list + * + * To avoid ordering issues the descq ring needs to be flushed + * first followed by the flush list. 
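sdma_wait_for_packet_egress() above polls the egress occupancy and only gives up when the value has stopped changing for a bounded number of polls. A self-contained sketch of that stall-detection pattern, with read_occupancy() as an invented stand-in for the CSR read, might look like this (illustration only, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

static unsigned fake_occupancy = 25;

static unsigned read_occupancy(void)
{
    if (fake_occupancy)            /* pretend one packet egresses per poll */
        fake_occupancy--;
    return fake_occupancy;
}

static bool wait_for_drain(unsigned max_stuck_polls)
{
    unsigned prev = 0, cur = 0, stuck = 0;

    while (1) {
        prev = cur;
        cur = read_occupancy();
        if (cur == 0)
            return true;           /* drained */
        if (cur != prev)
            stuck = 0;             /* progress was made: restart the stall counter */
        if (stuck++ > max_stuck_polls)
            return false;          /* stuck: at this point the driver bounces the link */
    }
}

int main(void)
{
    printf("drained: %s\n", wait_for_drain(500) ? "yes" : "no");
    return 0;
}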
+ * + * This routine is called from two places + * - From a work queue item + * - Directly from the state machine just before setting the + * state to running + * + * Must be called with head_lock held + * + */ +static void sdma_flush(struct sdma_engine *sde) +{ + struct sdma_txreq *txp, *txp_next; + LIST_HEAD(flushlist); + unsigned long flags; + + /* flush from head to tail */ + sdma_flush_descq(sde); + spin_lock_irqsave(&sde->flushlist_lock, flags); + /* copy flush list */ + list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) { + list_del_init(&txp->list); + list_add_tail(&txp->list, &flushlist); + } + spin_unlock_irqrestore(&sde->flushlist_lock, flags); + /* flush from flush list */ + list_for_each_entry_safe(txp, txp_next, &flushlist, list) + complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED); +} + +/* + * Fields a work request for flushing the descq ring + * and the flush list + * + * If the engine has been brought to running during + * the scheduling delay, the flush is ignored, assuming + * that the process of bringing the engine to running + * would have done this flush prior to going to running. + * + */ +static void sdma_field_flush(struct work_struct *work) +{ + unsigned long flags; + struct sdma_engine *sde = + container_of(work, struct sdma_engine, flush_worker); + + write_seqlock_irqsave(&sde->head_lock, flags); + if (!__sdma_running(sde)) + sdma_flush(sde); + write_sequnlock_irqrestore(&sde->head_lock, flags); +} + +static void sdma_err_halt_wait(struct work_struct *work) +{ + struct sdma_engine *sde = container_of(work, struct sdma_engine, + err_halt_worker); + u64 statuscsr; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT); + while (1) { + statuscsr = read_sde_csr(sde, SD(STATUS)); + statuscsr &= SD(STATUS_ENG_HALTED_SMASK); + if (statuscsr) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(sde->dd, + "SDMA engine %d - timeout waiting for engine to halt\n", + sde->this_idx); + /* + * Continue anyway. This could happen if there was + * an uncorrectable error in the wrong spot. 
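The flush-list handling in sdma_flush() above follows a common pattern: detach the pending entries while holding the list lock, then run the completion callbacks on the detached copy with the lock dropped. The sketch below models that pattern in plain C with a mutex and a fixed array standing in for the kernel list; it is illustrative only and every name in it is invented.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define MAX_REQ 8

static pthread_mutex_t flushlist_lock = PTHREAD_MUTEX_INITIALIZER;
static int flushlist[MAX_REQ] = { 11, 12, 13 };
static int flushlist_len = 3;

static void complete_aborted(int req_id)
{
    /* in the driver this is complete_tx(..., SDMA_TXREQ_S_ABORTED) */
    printf("aborted txreq %d\n", req_id);
}

static void flush_pending(void)
{
    int local[MAX_REQ];
    int n;

    pthread_mutex_lock(&flushlist_lock);
    n = flushlist_len;
    memcpy(local, flushlist, n * sizeof(local[0]));
    flushlist_len = 0;                 /* shared list is now empty */
    pthread_mutex_unlock(&flushlist_lock);

    for (int i = 0; i < n; i++)        /* callbacks run without the lock held */
        complete_aborted(local[i]);
}

int main(void)
{
    flush_pending();
    return 0;
}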
+ */ + break; + } + usleep_range(80, 120); + } + + sdma_process_event(sde, sdma_event_e15_hw_halt_done); +} + +static void sdma_err_progress_check_schedule(struct sdma_engine *sde) +{ + if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) { + unsigned index; + struct hfi1_devdata *dd = sde->dd; + + for (index = 0; index < dd->num_sdma; index++) { + struct sdma_engine *curr_sdma = &dd->per_sdma[index]; + + if (curr_sdma != sde) + curr_sdma->progress_check_head = + curr_sdma->descq_head; + } + dd_dev_err(sde->dd, + "SDMA engine %d - check scheduled\n", + sde->this_idx); + mod_timer(&sde->err_progress_check_timer, jiffies + 10); + } +} + +static void sdma_err_progress_check(struct timer_list *t) +{ + unsigned index; + struct sdma_engine *sde = from_timer(sde, t, err_progress_check_timer); + + dd_dev_err(sde->dd, "SDE progress check event\n"); + for (index = 0; index < sde->dd->num_sdma; index++) { + struct sdma_engine *curr_sde = &sde->dd->per_sdma[index]; + unsigned long flags; + + /* check progress on each engine except the current one */ + if (curr_sde == sde) + continue; + /* + * We must lock interrupts when acquiring sde->lock, + * to avoid a deadlock if interrupt triggers and spins on + * the same lock on same CPU + */ + spin_lock_irqsave(&curr_sde->tail_lock, flags); + write_seqlock(&curr_sde->head_lock); + + /* skip non-running queues */ + if (curr_sde->state.current_state != sdma_state_s99_running) { + write_sequnlock(&curr_sde->head_lock); + spin_unlock_irqrestore(&curr_sde->tail_lock, flags); + continue; + } + + if ((curr_sde->descq_head != curr_sde->descq_tail) && + (curr_sde->descq_head == + curr_sde->progress_check_head)) + __sdma_process_event(curr_sde, + sdma_event_e90_sw_halted); + write_sequnlock(&curr_sde->head_lock); + spin_unlock_irqrestore(&curr_sde->tail_lock, flags); + } + schedule_work(&sde->err_halt_worker); +} + +static void sdma_hw_clean_up_task(unsigned long opaque) +{ + struct sdma_engine *sde = (struct sdma_engine *)opaque; + u64 statuscsr; + + while (1) { +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, + __func__); +#endif + statuscsr = read_sde_csr(sde, SD(STATUS)); + statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK); + if (statuscsr) + break; + udelay(10); + } + + sdma_process_event(sde, sdma_event_e25_hw_clean_up_done); +} + +static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde) +{ + return sde->tx_ring[sde->tx_head & sde->sdma_mask]; +} + +/* + * flush ring for recovery + */ +static void sdma_flush_descq(struct sdma_engine *sde) +{ + u16 head, tail; + int progress = 0; + struct sdma_txreq *txp = get_txhead(sde); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. 
+ */ + head = sde->descq_head & sde->sdma_mask; + tail = sde->descq_tail & sde->sdma_mask; + while (head != tail) { + /* advance head, wrap if needed */ + head = ++sde->descq_head & sde->sdma_mask; + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == head) { + /* remove from list */ + sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL; + complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED); + trace_hfi1_sdma_progress(sde, head, tail, txp); + txp = get_txhead(sde); + } + progress++; + } + if (progress) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); +} + +static void sdma_sw_clean_up_task(unsigned long opaque) +{ + struct sdma_engine *sde = (struct sdma_engine *)opaque; + unsigned long flags; + + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + + /* + * At this point, the following should always be true: + * - We are halted, so no more descriptors are getting retired. + * - We are not running, so no one is submitting new work. + * - Only we can send the e40_sw_cleaned, so we can't start + * running again until we say so. So, the active list and + * descq are ours to play with. + */ + + /* + * In the error clean up sequence, software clean must be called + * before the hardware clean so we can use the hardware head in + * the progress routine. A hardware clean or SPC unfreeze will + * reset the hardware head. + * + * Process all retired requests. The progress routine will use the + * latest physical hardware head - we are not running so speed does + * not matter. + */ + sdma_make_progress(sde, 0); + + sdma_flush(sde); + + /* + * Reset our notion of head and tail. + * Note that the HW registers have been reset via an earlier + * clean up. + */ + sde->descq_tail = 0; + sde->descq_head = 0; + sde->desc_avail = sdma_descq_freecnt(sde); + *sde->head_dma = 0; + + __sdma_process_event(sde, sdma_event_e40_sw_cleaned); + + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void sdma_sw_tear_down(struct sdma_engine *sde) +{ + struct sdma_state *ss = &sde->state; + + /* Releasing this reference means the state machine has stopped. 
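sdma_flush_descq() above (and sdma_make_progress() later in this file) walk the descriptor ring by letting head and tail run freely as 16-bit counters and masking them with sdma_mask, which requires the ring size to be a power of two. A tiny standalone illustration of that indexing, with invented constants, is sketched here:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8                    /* must be a power of two */
#define RING_MASK (RING_SIZE - 1)

int main(void)
{
    uint16_t head = 6, tail = 10;      /* free-running: tail has wrapped past the ring end */
    unsigned walked = 0;

    while ((head & RING_MASK) != (tail & RING_MASK)) {
        unsigned slot = head & RING_MASK;

        printf("retiring slot %u (head counter %u)\n", slot, head);
        head++;                        /* advance; the mask handles the wrap */
        walked++;
    }
    assert(walked == 4);               /* slots 6, 7, 0, 1 */
    return 0;
}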
*/ + sdma_put(ss); + + /* stop waiting for all unfreeze events to complete */ + atomic_set(&sde->dd->sdma_unfreeze_count, -1); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); +} + +static void sdma_start_hw_clean_up(struct sdma_engine *sde) +{ + tasklet_hi_schedule(&sde->sdma_hw_clean_up_task); +} + +static void sdma_set_state(struct sdma_engine *sde, + enum sdma_states next_state) +{ + struct sdma_state *ss = &sde->state; + const struct sdma_set_state_action *action = sdma_action_table; + unsigned op = 0; + + trace_hfi1_sdma_state( + sde, + sdma_state_names[ss->current_state], + sdma_state_names[next_state]); + + /* debugging bookkeeping */ + ss->previous_state = ss->current_state; + ss->previous_op = ss->current_op; + ss->current_state = next_state; + + if (ss->previous_state != sdma_state_s99_running && + next_state == sdma_state_s99_running) + sdma_flush(sde); + + if (action[next_state].op_enable) + op |= SDMA_SENDCTRL_OP_ENABLE; + + if (action[next_state].op_intenable) + op |= SDMA_SENDCTRL_OP_INTENABLE; + + if (action[next_state].op_halt) + op |= SDMA_SENDCTRL_OP_HALT; + + if (action[next_state].op_cleanup) + op |= SDMA_SENDCTRL_OP_CLEANUP; + + if (action[next_state].go_s99_running_tofalse) + ss->go_s99_running = 0; + + if (action[next_state].go_s99_running_totrue) + ss->go_s99_running = 1; + + ss->current_op = op; + sdma_sendctrl(sde, ss->current_op); +} + +/** + * sdma_get_descq_cnt() - called when device probed + * + * Return a validated descq count. + * + * This is currently only used in the verbs initialization to build the tx + * list. + * + * This will probably be deleted in favor of a more scalable approach to + * alloc tx's. + * + */ +u16 sdma_get_descq_cnt(void) +{ + u16 count = sdma_descq_cnt; + + if (!count) + return SDMA_DESCQ_CNT; + /* count must be a power of 2 greater than 64 and less than + * 32768. Otherwise return default. + */ + if (!is_power_of_2(count)) + return SDMA_DESCQ_CNT; + if (count < 64 || count > 32768) + return SDMA_DESCQ_CNT; + return count; +} + +/** + * sdma_engine_get_vl() - return vl for a given sdma engine + * @sde: sdma engine + * + * This function returns the vl mapped to a given engine, or an error if + * the mapping can't be found. The mapping fields are protected by RCU. + */ +int sdma_engine_get_vl(struct sdma_engine *sde) +{ + struct hfi1_devdata *dd = sde->dd; + struct sdma_vl_map *m; + u8 vl; + + if (sde->this_idx >= TXE_NUM_SDMA_ENGINES) + return -EINVAL; + + rcu_read_lock(); + m = rcu_dereference(dd->sdma_map); + if (unlikely(!m)) { + rcu_read_unlock(); + return -EINVAL; + } + vl = m->engine_to_vl[sde->this_idx]; + rcu_read_unlock(); + + return vl; +} + +/** + * sdma_select_engine_vl() - select sdma engine + * @dd: devdata + * @selector: a spreading factor + * @vl: this vl + * + * + * This function returns an engine based on the selector and a vl. The + * mapping fields are protected by RCU. + */ +struct sdma_engine *sdma_select_engine_vl( + struct hfi1_devdata *dd, + u32 selector, + u8 vl) +{ + struct sdma_vl_map *m; + struct sdma_map_elem *e; + struct sdma_engine *rval; + + /* NOTE This should only happen if SC->VL changed after the initial + * checks on the QP/AH + * Default will return engine 0 below + */ + if (vl >= num_vls) { + rval = NULL; + goto done; + } + + rcu_read_lock(); + m = rcu_dereference(dd->sdma_map); + if (unlikely(!m)) { + rcu_read_unlock(); + return &dd->per_sdma[0]; + } + e = m->map[vl & m->mask]; + rval = e->sde[selector & e->mask]; + rcu_read_unlock(); + +done: + rval = !rval ? 
&dd->per_sdma[0] : rval; + trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx); + return rval; +} + +/** + * sdma_select_engine_sc() - select sdma engine + * @dd: devdata + * @selector: a spreading factor + * @sc5: the 5 bit sc + * + * + * This function returns an engine based on the selector and an sc. + */ +struct sdma_engine *sdma_select_engine_sc( + struct hfi1_devdata *dd, + u32 selector, + u8 sc5) +{ + u8 vl = sc_to_vlt(dd, sc5); + + return sdma_select_engine_vl(dd, selector, vl); +} + +struct sdma_rht_map_elem { + u32 mask; + u8 ctr; + struct sdma_engine *sde[0]; +}; + +struct sdma_rht_node { + unsigned long cpu_id; + struct sdma_rht_map_elem *map[HFI1_MAX_VLS_SUPPORTED]; + struct rhash_head node; +}; + +#define NR_CPUS_HINT 192 + +static const struct rhashtable_params sdma_rht_params = { + .nelem_hint = NR_CPUS_HINT, + .head_offset = offsetof(struct sdma_rht_node, node), + .key_offset = offsetof(struct sdma_rht_node, cpu_id), + .key_len = FIELD_SIZEOF(struct sdma_rht_node, cpu_id), + .max_size = NR_CPUS, + .min_size = 8, + .automatic_shrinking = true, +}; + +/* + * sdma_select_user_engine() - select sdma engine based on user setup + * @dd: devdata + * @selector: a spreading factor + * @vl: this vl + * + * This function returns an sdma engine for a user sdma request. + * User defined sdma engine affinity setting is honored when applicable, + * otherwise system default sdma engine mapping is used. To ensure correct + * ordering, the mapping from to sde must remain unchanged. + */ +struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, + u32 selector, u8 vl) +{ + struct sdma_rht_node *rht_node; + struct sdma_engine *sde = NULL; + const struct cpumask *current_mask = ¤t->cpus_allowed; + unsigned long cpu_id; + + /* + * To ensure that always the same sdma engine(s) will be + * selected make sure the process is pinned to this CPU only. + */ + if (cpumask_weight(current_mask) != 1) + goto out; + + cpu_id = smp_processor_id(); + rcu_read_lock(); + rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu_id, + sdma_rht_params); + + if (rht_node && rht_node->map[vl]) { + struct sdma_rht_map_elem *map = rht_node->map[vl]; + + sde = map->sde[selector & map->mask]; + } + rcu_read_unlock(); + + if (sde) + return sde; + +out: + return sdma_select_engine_vl(dd, selector, vl); +} + +static void sdma_populate_sde_map(struct sdma_rht_map_elem *map) +{ + int i; + + for (i = 0; i < roundup_pow_of_two(map->ctr ? : 1) - map->ctr; i++) + map->sde[map->ctr + i] = map->sde[i]; +} + +static void sdma_cleanup_sde_map(struct sdma_rht_map_elem *map, + struct sdma_engine *sde) +{ + unsigned int i, pow; + + /* only need to check the first ctr entries for a match */ + for (i = 0; i < map->ctr; i++) { + if (map->sde[i] == sde) { + memmove(&map->sde[i], &map->sde[i + 1], + (map->ctr - i - 1) * sizeof(map->sde[0])); + map->ctr--; + pow = roundup_pow_of_two(map->ctr ? 
: 1); + map->mask = pow - 1; + sdma_populate_sde_map(map); + break; + } + } +} + +/* + * Prevents concurrent reads and writes of the sdma engine cpu_mask + */ +static DEFINE_MUTEX(process_to_sde_mutex); + +ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, + size_t count) +{ + struct hfi1_devdata *dd = sde->dd; + cpumask_var_t mask, new_mask; + unsigned long cpu; + int ret, vl, sz; + + vl = sdma_engine_get_vl(sde); + if (unlikely(vl < 0)) + return -EINVAL; + + ret = zalloc_cpumask_var(&mask, GFP_KERNEL); + if (!ret) + return -ENOMEM; + + ret = zalloc_cpumask_var(&new_mask, GFP_KERNEL); + if (!ret) { + free_cpumask_var(mask); + return -ENOMEM; + } + ret = cpulist_parse(buf, mask); + if (ret) + goto out_free; + + if (!cpumask_subset(mask, cpu_online_mask)) { + dd_dev_warn(sde->dd, "Invalid CPU mask\n"); + ret = -EINVAL; + goto out_free; + } + + sz = sizeof(struct sdma_rht_map_elem) + + (TXE_NUM_SDMA_ENGINES * sizeof(struct sdma_engine *)); + + mutex_lock(&process_to_sde_mutex); + + for_each_cpu(cpu, mask) { + struct sdma_rht_node *rht_node; + + /* Check if we have this already mapped */ + if (cpumask_test_cpu(cpu, &sde->cpu_mask)) { + cpumask_set_cpu(cpu, new_mask); + continue; + } + + if (vl >= ARRAY_SIZE(rht_node->map)) { + ret = -EINVAL; + goto out; + } + + rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu, + sdma_rht_params); + if (!rht_node) { + rht_node = kzalloc(sizeof(*rht_node), GFP_KERNEL); + if (!rht_node) { + ret = -ENOMEM; + goto out; + } + + rht_node->map[vl] = kzalloc(sz, GFP_KERNEL); + if (!rht_node->map[vl]) { + kfree(rht_node); + ret = -ENOMEM; + goto out; + } + rht_node->cpu_id = cpu; + rht_node->map[vl]->mask = 0; + rht_node->map[vl]->ctr = 1; + rht_node->map[vl]->sde[0] = sde; + + ret = rhashtable_insert_fast(dd->sdma_rht, + &rht_node->node, + sdma_rht_params); + if (ret) { + kfree(rht_node->map[vl]); + kfree(rht_node); + dd_dev_err(sde->dd, "Failed to set process to sde affinity for cpu %lu\n", + cpu); + goto out; + } + + } else { + int ctr, pow; + + /* Add new user mappings */ + if (!rht_node->map[vl]) + rht_node->map[vl] = kzalloc(sz, GFP_KERNEL); + + if (!rht_node->map[vl]) { + ret = -ENOMEM; + goto out; + } + + rht_node->map[vl]->ctr++; + ctr = rht_node->map[vl]->ctr; + rht_node->map[vl]->sde[ctr - 1] = sde; + pow = roundup_pow_of_two(ctr); + rht_node->map[vl]->mask = pow - 1; + + /* Populate the sde map table */ + sdma_populate_sde_map(rht_node->map[vl]); + } + cpumask_set_cpu(cpu, new_mask); + } + + /* Clean up old mappings */ + for_each_cpu(cpu, cpu_online_mask) { + struct sdma_rht_node *rht_node; + + /* Don't cleanup sdes that are set in the new mask */ + if (cpumask_test_cpu(cpu, mask)) + continue; + + rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu, + sdma_rht_params); + if (rht_node) { + bool empty = true; + int i; + + /* Remove mappings for old sde */ + for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) + if (rht_node->map[i]) + sdma_cleanup_sde_map(rht_node->map[i], + sde); + + /* Free empty hash table entries */ + for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) { + if (!rht_node->map[i]) + continue; + + if (rht_node->map[i]->ctr) { + empty = false; + break; + } + } + + if (empty) { + ret = rhashtable_remove_fast(dd->sdma_rht, + &rht_node->node, + sdma_rht_params); + WARN_ON(ret); + + for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) + kfree(rht_node->map[i]); + + kfree(rht_node); + } + } + } + + cpumask_copy(&sde->cpu_mask, new_mask); +out: + mutex_unlock(&process_to_sde_mutex); +out_free: + free_cpumask_var(mask); + 
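sdma_populate_sde_map() above keeps the per-VL engine table indexable with a plain "selector & mask" by rounding the entry count up to a power of two and replicating the real entries into the padding slots. The following standalone sketch shows the same trick with integers in place of struct sdma_engine pointers; it is an illustration, not driver code.

#include <stdio.h>

static unsigned roundup_pow2(unsigned v)
{
    unsigned p = 1;

    while (p < v)
        p <<= 1;
    return p;
}

int main(void)
{
    int sde[8] = { 0 };
    unsigned ctr = 3;                          /* three real engines: 10, 11, 12 */
    unsigned pow = roundup_pow2(ctr);          /* 4 */
    unsigned mask = pow - 1;

    for (unsigned i = 0; i < ctr; i++)
        sde[i] = 10 + i;
    for (unsigned i = 0; i < pow - ctr; i++)   /* replicate from the front into the padding */
        sde[ctr + i] = sde[i];

    for (unsigned selector = 0; selector < 8; selector++)
        printf("selector %u -> engine %d\n", selector, sde[selector & mask]);
    return 0;
}

Because the mask is always pow - 1, the lookup never needs a modulo or a bounds check.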
free_cpumask_var(new_mask); + return ret ? : strnlen(buf, PAGE_SIZE); +} + +ssize_t sdma_get_cpu_to_sde_map(struct sdma_engine *sde, char *buf) +{ + mutex_lock(&process_to_sde_mutex); + if (cpumask_empty(&sde->cpu_mask)) + snprintf(buf, PAGE_SIZE, "%s\n", "empty"); + else + cpumap_print_to_pagebuf(true, buf, &sde->cpu_mask); + mutex_unlock(&process_to_sde_mutex); + return strnlen(buf, PAGE_SIZE); +} + +static void sdma_rht_free(void *ptr, void *arg) +{ + struct sdma_rht_node *rht_node = ptr; + int i; + + for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) + kfree(rht_node->map[i]); + + kfree(rht_node); +} + +/** + * sdma_seqfile_dump_cpu_list() - debugfs dump the cpu to sdma mappings + * @s: seq file + * @dd: hfi1_devdata + * @cpuid: cpu id + * + * This routine dumps the process to sde mappings per cpu + */ +void sdma_seqfile_dump_cpu_list(struct seq_file *s, + struct hfi1_devdata *dd, + unsigned long cpuid) +{ + struct sdma_rht_node *rht_node; + int i, j; + + rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpuid, + sdma_rht_params); + if (!rht_node) + return; + + seq_printf(s, "cpu%3lu: ", cpuid); + for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) { + if (!rht_node->map[i] || !rht_node->map[i]->ctr) + continue; + + seq_printf(s, " vl%d: [", i); + + for (j = 0; j < rht_node->map[i]->ctr; j++) { + if (!rht_node->map[i]->sde[j]) + continue; + + if (j > 0) + seq_puts(s, ","); + + seq_printf(s, " sdma%2d", + rht_node->map[i]->sde[j]->this_idx); + } + seq_puts(s, " ]"); + } + + seq_puts(s, "\n"); +} + +/* + * Free the indicated map struct + */ +static void sdma_map_free(struct sdma_vl_map *m) +{ + int i; + + for (i = 0; m && i < m->actual_vls; i++) + kfree(m->map[i]); + kfree(m); +} + +/* + * Handle RCU callback + */ +static void sdma_map_rcu_callback(struct rcu_head *list) +{ + struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list); + + sdma_map_free(m); +} + +/** + * sdma_map_init - called when # vls change + * @dd: hfi1_devdata + * @port: port number + * @num_vls: number of vls + * @vl_engines: per vl engine mapping (optional) + * + * This routine changes the mapping based on the number of vls. + * + * vl_engines is used to specify a non-uniform vl/engine loading. NULL + * implies auto computing the loading and giving each VLs a uniform + * distribution of engines per VL. + * + * The auto algorithm computes the sde_per_vl and the number of extra + * engines. Any extra engines are added from the last VL on down. + * + * rcu locking is used here to control access to the mapping fields. + * + * If either the num_vls or num_sdma are non-power of 2, the array sizes + * in the struct sdma_vl_map and the struct sdma_map_elem are rounded + * up to the next highest power of 2 and the first entry is reused + * in a round robin fashion. + * + * If an error occurs the map change is not done and the mapping is + * not changed. + * + */ +int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines) +{ + int i, j; + int extra, sde_per_vl; + int engine = 0; + u8 lvl_engines[OPA_MAX_VLS]; + struct sdma_vl_map *oldmap, *newmap; + + if (!(dd->flags & HFI1_HAS_SEND_DMA)) + return 0; + + if (!vl_engines) { + /* truncate divide */ + sde_per_vl = dd->num_sdma / num_vls; + /* extras */ + extra = dd->num_sdma % num_vls; + vl_engines = lvl_engines; + /* add extras from last vl down */ + for (i = num_vls - 1; i >= 0; i--, extra--) + vl_engines[i] = sde_per_vl + (extra > 0 ? 
1 : 0); + } + /* build new map */ + newmap = kzalloc( + sizeof(struct sdma_vl_map) + + roundup_pow_of_two(num_vls) * + sizeof(struct sdma_map_elem *), + GFP_KERNEL); + if (!newmap) + goto bail; + newmap->actual_vls = num_vls; + newmap->vls = roundup_pow_of_two(num_vls); + newmap->mask = (1 << ilog2(newmap->vls)) - 1; + /* initialize back-map */ + for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++) + newmap->engine_to_vl[i] = -1; + for (i = 0; i < newmap->vls; i++) { + /* save for wrap around */ + int first_engine = engine; + + if (i < newmap->actual_vls) { + int sz = roundup_pow_of_two(vl_engines[i]); + + /* only allocate once */ + newmap->map[i] = kzalloc( + sizeof(struct sdma_map_elem) + + sz * sizeof(struct sdma_engine *), + GFP_KERNEL); + if (!newmap->map[i]) + goto bail; + newmap->map[i]->mask = (1 << ilog2(sz)) - 1; + /* assign engines */ + for (j = 0; j < sz; j++) { + newmap->map[i]->sde[j] = + &dd->per_sdma[engine]; + if (++engine >= first_engine + vl_engines[i]) + /* wrap back to first engine */ + engine = first_engine; + } + /* assign back-map */ + for (j = 0; j < vl_engines[i]; j++) + newmap->engine_to_vl[first_engine + j] = i; + } else { + /* just re-use entry without allocating */ + newmap->map[i] = newmap->map[i % num_vls]; + } + engine = first_engine + vl_engines[i]; + } + /* newmap in hand, save old map */ + spin_lock_irq(&dd->sde_map_lock); + oldmap = rcu_dereference_protected(dd->sdma_map, + lockdep_is_held(&dd->sde_map_lock)); + + /* publish newmap */ + rcu_assign_pointer(dd->sdma_map, newmap); + + spin_unlock_irq(&dd->sde_map_lock); + /* success, free any old map after grace period */ + if (oldmap) + call_rcu(&oldmap->list, sdma_map_rcu_callback); + return 0; +bail: + /* free any partial allocation */ + sdma_map_free(newmap); + return -ENOMEM; +} + +/** + * sdma_clean() Clean up allocated memory + * @dd: struct hfi1_devdata + * @num_engines: num sdma engines + * + * This routine can be called regardless of the success of + * sdma_init() + */ +void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) +{ + size_t i; + struct sdma_engine *sde; + + if (dd->sdma_pad_dma) { + dma_free_coherent(&dd->pcidev->dev, 4, + (void *)dd->sdma_pad_dma, + dd->sdma_pad_phys); + dd->sdma_pad_dma = NULL; + dd->sdma_pad_phys = 0; + } + if (dd->sdma_heads_dma) { + dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size, + (void *)dd->sdma_heads_dma, + dd->sdma_heads_phys); + dd->sdma_heads_dma = NULL; + dd->sdma_heads_phys = 0; + } + for (i = 0; dd->per_sdma && i < num_engines; ++i) { + sde = &dd->per_sdma[i]; + + sde->head_dma = NULL; + sde->head_phys = 0; + + if (sde->descq) { + dma_free_coherent( + &dd->pcidev->dev, + sde->descq_cnt * sizeof(u64[2]), + sde->descq, + sde->descq_phys + ); + sde->descq = NULL; + sde->descq_phys = 0; + } + kvfree(sde->tx_ring); + sde->tx_ring = NULL; + } + spin_lock_irq(&dd->sde_map_lock); + sdma_map_free(rcu_access_pointer(dd->sdma_map)); + RCU_INIT_POINTER(dd->sdma_map, NULL); + spin_unlock_irq(&dd->sde_map_lock); + synchronize_rcu(); + kfree(dd->per_sdma); + dd->per_sdma = NULL; + + if (dd->sdma_rht) { + rhashtable_free_and_destroy(dd->sdma_rht, sdma_rht_free, NULL); + kfree(dd->sdma_rht); + dd->sdma_rht = NULL; + } +} + +/** + * sdma_init() - called when device probed + * @dd: hfi1_devdata + * @port: port number (currently only zero) + * + * Initializes each sde and its csrs. + * Interrupts are not required to be enabled. 
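When sdma_map_init() is not given an explicit vl_engines[] array it splits the engines across VLs with a truncating divide and hands the remainder out from the last VL downwards, as described in the comment above. The short sketch below reproduces just that arithmetic so the resulting distribution is easy to see (illustration only):

#include <stdio.h>

int main(void)
{
    unsigned num_sdma = 16, num_vls = 5;
    unsigned vl_engines[8];
    unsigned per_vl = num_sdma / num_vls;      /* 3 */
    int extra = num_sdma % num_vls;            /* 1 */

    for (int i = (int)num_vls - 1; i >= 0; i--, extra--)
        vl_engines[i] = per_vl + (extra > 0 ? 1 : 0);

    for (unsigned i = 0; i < num_vls; i++)
        printf("vl %u -> %u engines\n", i, vl_engines[i]);
    /* prints 3, 3, 3, 3, 4: the single extra engine goes to the last VL */
    return 0;
}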
+ * + * Returns: + * 0 - success, -errno on failure + */ +int sdma_init(struct hfi1_devdata *dd, u8 port) +{ + unsigned this_idx; + struct sdma_engine *sde; + struct rhashtable *tmp_sdma_rht; + u16 descq_cnt; + void *curr_head; + struct hfi1_pportdata *ppd = dd->pport + port; + u32 per_sdma_credits; + uint idle_cnt = sdma_idle_cnt; + size_t num_engines = dd->chip_sdma_engines; + int ret = -ENOMEM; + + if (!HFI1_CAP_IS_KSET(SDMA)) { + HFI1_CAP_CLEAR(SDMA_AHG); + return 0; + } + if (mod_num_sdma && + /* can't exceed chip support */ + mod_num_sdma <= dd->chip_sdma_engines && + /* count must be >= vls */ + mod_num_sdma >= num_vls) + num_engines = mod_num_sdma; + + dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma); + dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines); + dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n", + dd->chip_sdma_mem_size); + + per_sdma_credits = + dd->chip_sdma_mem_size / (num_engines * SDMA_BLOCK_SIZE); + + /* set up freeze waitqueue */ + init_waitqueue_head(&dd->sdma_unfreeze_wq); + atomic_set(&dd->sdma_unfreeze_count, 0); + + descq_cnt = sdma_get_descq_cnt(); + dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n", + num_engines, descq_cnt); + + /* alloc memory for array of send engines */ + dd->per_sdma = kcalloc_node(num_engines, sizeof(*dd->per_sdma), + GFP_KERNEL, dd->node); + if (!dd->per_sdma) + return ret; + + idle_cnt = ns_to_cclock(dd, idle_cnt); + if (idle_cnt) + dd->default_desc1 = + SDMA_DESC1_HEAD_TO_HOST_FLAG; + else + dd->default_desc1 = + SDMA_DESC1_INT_REQ_FLAG; + + if (!sdma_desct_intr) + sdma_desct_intr = SDMA_DESC_INTR; + + /* Allocate memory for SendDMA descriptor FIFOs */ + for (this_idx = 0; this_idx < num_engines; ++this_idx) { + sde = &dd->per_sdma[this_idx]; + sde->dd = dd; + sde->ppd = ppd; + sde->this_idx = this_idx; + sde->descq_cnt = descq_cnt; + sde->desc_avail = sdma_descq_freecnt(sde); + sde->sdma_shift = ilog2(descq_cnt); + sde->sdma_mask = (1 << sde->sdma_shift) - 1; + + /* Create a mask specifically for each interrupt source */ + sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES + + this_idx); + sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES + + this_idx); + sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES + + this_idx); + /* Create a combined mask to cover all 3 interrupt sources */ + sde->imask = sde->int_mask | sde->progress_mask | + sde->idle_mask; + + spin_lock_init(&sde->tail_lock); + seqlock_init(&sde->head_lock); + spin_lock_init(&sde->senddmactrl_lock); + spin_lock_init(&sde->flushlist_lock); + /* insure there is always a zero bit */ + sde->ahg_bits = 0xfffffffe00000000ULL; + + sdma_set_state(sde, sdma_state_s00_hw_down); + + /* set up reference counting */ + kref_init(&sde->state.kref); + init_completion(&sde->state.comp); + + INIT_LIST_HEAD(&sde->flushlist); + INIT_LIST_HEAD(&sde->dmawait); + + sde->tail_csr = + get_kctxt_csr_addr(dd, this_idx, SD(TAIL)); + + tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task, + (unsigned long)sde); + + tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task, + (unsigned long)sde); + INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait); + INIT_WORK(&sde->flush_worker, sdma_field_flush); + + sde->progress_check_head = 0; + + timer_setup(&sde->err_progress_check_timer, + sdma_err_progress_check, 0); + + sde->descq = dma_zalloc_coherent( + &dd->pcidev->dev, + descq_cnt * sizeof(u64[2]), + &sde->descq_phys, + GFP_KERNEL + ); + if (!sde->descq) + goto bail; + sde->tx_ring = + kvzalloc_node(sizeof(struct sdma_txreq *) * descq_cnt, + 
GFP_KERNEL, dd->node); + if (!sde->tx_ring) + goto bail; + } + + dd->sdma_heads_size = L1_CACHE_BYTES * num_engines; + /* Allocate memory for DMA of head registers to memory */ + dd->sdma_heads_dma = dma_zalloc_coherent( + &dd->pcidev->dev, + dd->sdma_heads_size, + &dd->sdma_heads_phys, + GFP_KERNEL + ); + if (!dd->sdma_heads_dma) { + dd_dev_err(dd, "failed to allocate SendDMA head memory\n"); + goto bail; + } + + /* Allocate memory for pad */ + dd->sdma_pad_dma = dma_zalloc_coherent( + &dd->pcidev->dev, + sizeof(u32), + &dd->sdma_pad_phys, + GFP_KERNEL + ); + if (!dd->sdma_pad_dma) { + dd_dev_err(dd, "failed to allocate SendDMA pad memory\n"); + goto bail; + } + + /* assign each engine to different cacheline and init registers */ + curr_head = (void *)dd->sdma_heads_dma; + for (this_idx = 0; this_idx < num_engines; ++this_idx) { + unsigned long phys_offset; + + sde = &dd->per_sdma[this_idx]; + + sde->head_dma = curr_head; + curr_head += L1_CACHE_BYTES; + phys_offset = (unsigned long)sde->head_dma - + (unsigned long)dd->sdma_heads_dma; + sde->head_phys = dd->sdma_heads_phys + phys_offset; + init_sdma_regs(sde, per_sdma_credits, idle_cnt); + } + dd->flags |= HFI1_HAS_SEND_DMA; + dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0; + dd->num_sdma = num_engines; + ret = sdma_map_init(dd, port, ppd->vls_operational, NULL); + if (ret < 0) + goto bail; + + tmp_sdma_rht = kzalloc(sizeof(*tmp_sdma_rht), GFP_KERNEL); + if (!tmp_sdma_rht) { + ret = -ENOMEM; + goto bail; + } + + ret = rhashtable_init(tmp_sdma_rht, &sdma_rht_params); + if (ret < 0) + goto bail; + dd->sdma_rht = tmp_sdma_rht; + + dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma); + return 0; + +bail: + sdma_clean(dd, num_engines); + return ret; +} + +/** + * sdma_all_running() - called when the link goes up + * @dd: hfi1_devdata + * + * This routine moves all engines to the running state. + */ +void sdma_all_running(struct hfi1_devdata *dd) +{ + struct sdma_engine *sde; + unsigned int i; + + /* move all engines to running */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e30_go_running); + } +} + +/** + * sdma_all_idle() - called when the link goes down + * @dd: hfi1_devdata + * + * This routine moves all engines to the idle state. + */ +void sdma_all_idle(struct hfi1_devdata *dd) +{ + struct sdma_engine *sde; + unsigned int i; + + /* idle all engines */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e70_go_idle); + } +} + +/** + * sdma_start() - called to kick off state processing for all engines + * @dd: hfi1_devdata + * + * This routine is for kicking off the state processing for all required + * sdma engines. Interrupts need to be working at this point. 
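sdma_init() above derives three per-engine interrupt masks by treating the status space as three banks of TXE_NUM_SDMA_ENGINES bits (general, progress and idle) and giving each engine the bit at its own index in every bank. A standalone sketch of that mask construction follows; N = 16 is an assumption standing in for TXE_NUM_SDMA_ENGINES, and the printed value is for engine index 3.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define N 16    /* assumed number of engines per bank */

int main(void)
{
    unsigned idx = 3;                                       /* engine index */
    uint64_t int_mask      = UINT64_C(1) << (0 * N + idx);  /* general interrupt bit */
    uint64_t progress_mask = UINT64_C(1) << (1 * N + idx);  /* progress interrupt bit */
    uint64_t idle_mask     = UINT64_C(1) << (2 * N + idx);  /* idle interrupt bit */
    uint64_t imask = int_mask | progress_mask | idle_mask;  /* combined, as in sde->imask */

    printf("engine %u imask = 0x%016" PRIx64 "\n", idx, imask);
    /* 0x0000000800080008: bits 3, 19 and 35 */
    return 0;
}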
+ * + */ +void sdma_start(struct hfi1_devdata *dd) +{ + unsigned i; + struct sdma_engine *sde; + + /* kick off the engines state processing */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e10_go_hw_start); + } +} + +/** + * sdma_exit() - used when module is removed + * @dd: hfi1_devdata + */ +void sdma_exit(struct hfi1_devdata *dd) +{ + unsigned this_idx; + struct sdma_engine *sde; + + for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma; + ++this_idx) { + sde = &dd->per_sdma[this_idx]; + if (!list_empty(&sde->dmawait)) + dd_dev_err(dd, "sde %u: dmawait list not empty!\n", + sde->this_idx); + sdma_process_event(sde, sdma_event_e00_go_hw_down); + + del_timer_sync(&sde->err_progress_check_timer); + + /* + * This waits for the state machine to exit so it is not + * necessary to kill the sdma_sw_clean_up_task to make sure + * it is not running. + */ + sdma_finalput(&sde->state); + } +} + +/* + * unmap the indicated descriptor + */ +static inline void sdma_unmap_desc( + struct hfi1_devdata *dd, + struct sdma_desc *descp) +{ + switch (sdma_mapping_type(descp)) { + case SDMA_MAP_SINGLE: + dma_unmap_single( + &dd->pcidev->dev, + sdma_mapping_addr(descp), + sdma_mapping_len(descp), + DMA_TO_DEVICE); + break; + case SDMA_MAP_PAGE: + dma_unmap_page( + &dd->pcidev->dev, + sdma_mapping_addr(descp), + sdma_mapping_len(descp), + DMA_TO_DEVICE); + break; + } +} + +/* + * return the mode as indicated by the first + * descriptor in the tx. + */ +static inline u8 ahg_mode(struct sdma_txreq *tx) +{ + return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK) + >> SDMA_DESC1_HEADER_MODE_SHIFT; +} + +/** + * __sdma_txclean() - clean tx of mappings, descp *kmalloc's + * @dd: hfi1_devdata for unmapping + * @tx: tx request to clean + * + * This is used in the progress routine to clean the tx or + * by the ULP to toss an in-process tx build. + * + * The code can be called multiple times without issue. + * + */ +void __sdma_txclean( + struct hfi1_devdata *dd, + struct sdma_txreq *tx) +{ + u16 i; + + if (tx->num_desc) { + u8 skip = 0, mode = ahg_mode(tx); + + /* unmap first */ + sdma_unmap_desc(dd, &tx->descp[0]); + /* determine number of AHG descriptors to skip */ + if (mode > SDMA_AHG_APPLY_UPDATE1) + skip = mode >> 1; + for (i = 1 + skip; i < tx->num_desc; i++) + sdma_unmap_desc(dd, &tx->descp[i]); + tx->num_desc = 0; + } + kfree(tx->coalesce_buf); + tx->coalesce_buf = NULL; + /* kmalloc'ed descp */ + if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) { + tx->desc_limit = ARRAY_SIZE(tx->descs); + kfree(tx->descp); + } +} + +static inline u16 sdma_gethead(struct sdma_engine *sde) +{ + struct hfi1_devdata *dd = sde->dd; + int use_dmahead; + u16 hwhead; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + +retry: + use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) && + (dd->flags & HFI1_HAS_SDMA_TIMEOUT); + hwhead = use_dmahead ? 
+ (u16)le64_to_cpu(*sde->head_dma) : + (u16)read_sde_csr(sde, SD(HEAD)); + + if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) { + u16 cnt; + u16 swtail; + u16 swhead; + int sane; + + swhead = sde->descq_head & sde->sdma_mask; + /* this code is really bad for cache line trading */ + swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; + cnt = sde->descq_cnt; + + if (swhead < swtail) + /* not wrapped */ + sane = (hwhead >= swhead) & (hwhead <= swtail); + else if (swhead > swtail) + /* wrapped around */ + sane = ((hwhead >= swhead) && (hwhead < cnt)) || + (hwhead <= swtail); + else + /* empty */ + sane = (hwhead == swhead); + + if (unlikely(!sane)) { + dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n", + sde->this_idx, + use_dmahead ? "dma" : "kreg", + hwhead, swhead, swtail, cnt); + if (use_dmahead) { + /* try one more time, using csr */ + use_dmahead = 0; + goto retry; + } + /* proceed as if no progress */ + hwhead = swhead; + } + } + return hwhead; +} + +/* + * This is called when there are send DMA descriptors that might be + * available. + * + * This is called with head_lock held. + */ +static void sdma_desc_avail(struct sdma_engine *sde, uint avail) +{ + struct iowait *wait, *nw; + struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; + uint i, n = 0, seq, max_idx = 0; + struct sdma_txreq *stx; + struct hfi1_ibdev *dev = &sde->dd->verbs_dev; + u8 max_starved_cnt = 0; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(sde->dd, "avail: %u\n", avail); +#endif + + do { + seq = read_seqbegin(&dev->iowait_lock); + if (!list_empty(&sde->dmawait)) { + /* at least one item */ + write_seqlock(&dev->iowait_lock); + /* Harvest waiters wanting DMA descriptors */ + list_for_each_entry_safe( + wait, + nw, + &sde->dmawait, + list) { + u16 num_desc = 0; + + if (!wait->wakeup) + continue; + if (n == ARRAY_SIZE(waits)) + break; + if (!list_empty(&wait->tx_head)) { + stx = list_first_entry( + &wait->tx_head, + struct sdma_txreq, + list); + num_desc = stx->num_desc; + } + if (num_desc > avail) + break; + avail -= num_desc; + /* Find the most starved wait memeber */ + iowait_starve_find_max(wait, &max_starved_cnt, + n, &max_idx); + list_del_init(&wait->list); + waits[n++] = wait; + } + write_sequnlock(&dev->iowait_lock); + break; + } + } while (read_seqretry(&dev->iowait_lock, seq)); + + /* Schedule the most starved one first */ + if (n) + waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON); + + for (i = 0; i < n; i++) + if (i != max_idx) + waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON); +} + +/* head_lock must be held */ +static void sdma_make_progress(struct sdma_engine *sde, u64 status) +{ + struct sdma_txreq *txp = NULL; + int progress = 0; + u16 hwhead, swhead; + int idle_check_done = 0; + + hwhead = sdma_gethead(sde); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. 
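The SDMA_HEAD_CHECK branch of sdma_gethead() above only trusts a hardware head value that lies between the software head and tail, with separate cases for a wrapped ring and for an empty ring. The predicate is easy to lift into a standalone sketch with a few test vectors (illustrative only; head_is_sane() is an invented name):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool head_is_sane(unsigned hwhead, unsigned swhead,
                         unsigned swtail, unsigned cnt)
{
    if (swhead < swtail)            /* not wrapped */
        return hwhead >= swhead && hwhead <= swtail;
    if (swhead > swtail)            /* wrapped around the end of the ring */
        return (hwhead >= swhead && hwhead < cnt) || hwhead <= swtail;
    return hwhead == swhead;        /* empty ring */
}

int main(void)
{
    assert(head_is_sane(5, 2, 9, 16));      /* not wrapped, in range */
    assert(!head_is_sane(12, 2, 9, 16));    /* not wrapped, out of range */
    assert(head_is_sane(14, 12, 3, 16));    /* wrapped, upper segment */
    assert(head_is_sane(1, 12, 3, 16));     /* wrapped, lower segment */
    assert(!head_is_sane(7, 12, 3, 16));    /* wrapped, in the gap */
    printf("all head sanity checks passed\n");
    return 0;
}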
+ */ + +retry: + txp = get_txhead(sde); + swhead = sde->descq_head & sde->sdma_mask; + trace_hfi1_sdma_progress(sde, hwhead, swhead, txp); + while (swhead != hwhead) { + /* advance head, wrap if needed */ + swhead = ++sde->descq_head & sde->sdma_mask; + + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == swhead) { + /* remove from list */ + sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL; + complete_tx(sde, txp, SDMA_TXREQ_S_OK); + /* see if there is another txp */ + txp = get_txhead(sde); + } + trace_hfi1_sdma_progress(sde, hwhead, swhead, txp); + progress++; + } + + /* + * The SDMA idle interrupt is not guaranteed to be ordered with respect + * to updates to the the dma_head location in host memory. The head + * value read might not be fully up to date. If there are pending + * descriptors and the SDMA idle interrupt fired then read from the + * CSR SDMA head instead to get the latest value from the hardware. + * The hardware SDMA head should be read at most once in this invocation + * of sdma_make_progress(..) which is ensured by idle_check_done flag + */ + if ((status & sde->idle_mask) && !idle_check_done) { + u16 swtail; + + swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; + if (swtail != hwhead) { + hwhead = (u16)read_sde_csr(sde, SD(HEAD)); + idle_check_done = 1; + goto retry; + } + } + + sde->last_status = status; + if (progress) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); +} + +/* + * sdma_engine_interrupt() - interrupt handler for engine + * @sde: sdma engine + * @status: sdma interrupt reason + * + * Status is a mask of the 3 possible interrupts for this engine. It will + * contain bits _only_ for this SDMA engine. It will contain at least one + * bit, it may contain more. + */ +void sdma_engine_interrupt(struct sdma_engine *sde, u64 status) +{ + trace_hfi1_sdma_engine_interrupt(sde, status); + write_seqlock(&sde->head_lock); + sdma_set_desc_cnt(sde, sdma_desct_intr); + if (status & sde->idle_mask) + sde->idle_int_cnt++; + else if (status & sde->progress_mask) + sde->progress_int_cnt++; + else if (status & sde->int_mask) + sde->sdma_int_cnt++; + sdma_make_progress(sde, status); + write_sequnlock(&sde->head_lock); +} + +/** + * sdma_engine_error() - error handler for engine + * @sde: sdma engine + * @status: sdma interrupt reason + */ +void sdma_engine_error(struct sdma_engine *sde, u64 status) +{ + unsigned long flags; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n", + sde->this_idx, + (unsigned long long)status, + sdma_state_names[sde->state.current_state]); +#endif + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + if (status & ALL_SDMA_ENG_HALT_ERRS) + __sdma_process_event(sde, sdma_event_e60_hw_halted); + if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) { + dd_dev_err(sde->dd, + "SDMA (%u) engine error: 0x%llx state %s\n", + sde->this_idx, + (unsigned long long)status, + sdma_state_names[sde->state.current_state]); + dump_sdma_state(sde); + } + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void sdma_sendctrl(struct sdma_engine *sde, unsigned op) +{ + u64 set_senddmactrl = 0; + u64 clr_senddmactrl = 0; + unsigned long flags; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n", + sde->this_idx, + (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_HALT) ? 
1 : 0, + (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0); +#endif + + if (op & SDMA_SENDCTRL_OP_ENABLE) + set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK); + + if (op & SDMA_SENDCTRL_OP_INTENABLE) + set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK); + + if (op & SDMA_SENDCTRL_OP_HALT) + set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK); + + spin_lock_irqsave(&sde->senddmactrl_lock, flags); + + sde->p_senddmactrl |= set_senddmactrl; + sde->p_senddmactrl &= ~clr_senddmactrl; + + if (op & SDMA_SENDCTRL_OP_CLEANUP) + write_sde_csr(sde, SD(CTRL), + sde->p_senddmactrl | + SD(CTRL_SDMA_CLEANUP_SMASK)); + else + write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl); + + spin_unlock_irqrestore(&sde->senddmactrl_lock, flags); + +#ifdef CONFIG_SDMA_VERBOSITY + sdma_dumpstate(sde); +#endif +} + +static void sdma_setlengen(struct sdma_engine *sde) +{ +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + /* + * Set SendDmaLenGen and clear-then-set the MSB of the generation + * count to enable generation checking and load the internal + * generation counter. + */ + write_sde_csr(sde, SD(LEN_GEN), + (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)); + write_sde_csr(sde, SD(LEN_GEN), + ((sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)) | + (4ULL << SD(LEN_GEN_GENERATION_SHIFT))); +} + +static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail) +{ + /* Commit writes to memory and advance the tail on the chip */ + smp_wmb(); /* see get_txhead() */ + writeq(tail, sde->tail_csr); +} + +/* + * This is called when changing to state s10_hw_start_up_halt_wait as + * a result of send buffer errors or send DMA descriptor errors. + */ +static void sdma_hw_start_up(struct sdma_engine *sde) +{ + u64 reg; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + sdma_setlengen(sde); + sdma_update_tail(sde, 0); /* Set SendDmaTail */ + *sde->head_dma = 0; + + reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) << + SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT); + write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg); +} + +/* + * set_sdma_integrity + * + * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'. 
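The senddmactrl update above follows a shadow-register pattern: the requested operations are folded into set/clear masks, applied to a software shadow under senddmactrl_lock, and written out once. A minimal standalone sketch of that pattern, with hypothetical bit values standing in for the SD(CTRL_*_SMASK) masks:

#include <stdint.h>

// Hypothetical stand-ins for the SD(CTRL_*_SMASK) bits used above.
#define ENABLE_BIT	(UINT64_C(1) << 0)
#define INT_BIT		(UINT64_C(1) << 1)
#define HALT_BIT	(UINT64_C(1) << 2)

// Fold the requested operations into the shadow and return the value
// that would be written to the control CSR (write_sde_csr() above).
uint64_t apply_sendctrl(uint64_t shadow, int op_enable, int op_int, int op_halt)
{
	uint64_t set = 0, clr = 0;

	if (op_enable) set |= ENABLE_BIT; else clr |= ENABLE_BIT;
	if (op_int)    set |= INT_BIT;    else clr |= INT_BIT;
	if (op_halt)   set |= HALT_BIT;   else clr |= HALT_BIT;

	shadow |= set;
	shadow &= ~clr;
	return shadow;
}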
+ */ +static void set_sdma_integrity(struct sdma_engine *sde) +{ + struct hfi1_devdata *dd = sde->dd; + + write_sde_csr(sde, SD(CHECK_ENABLE), + hfi1_pkt_base_sdma_integrity(dd)); +} + +static void init_sdma_regs( + struct sdma_engine *sde, + u32 credits, + uint idle_cnt) +{ + u8 opval, opmask; +#ifdef CONFIG_SDMA_VERBOSITY + struct hfi1_devdata *dd = sde->dd; + + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys); + sdma_setlengen(sde); + sdma_update_tail(sde, 0); /* Set SendDmaTail */ + write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt); + write_sde_csr(sde, SD(DESC_CNT), 0); + write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys); + write_sde_csr(sde, SD(MEMORY), + ((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) | + ((u64)(credits * sde->this_idx) << + SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT))); + write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull); + set_sdma_integrity(sde); + opmask = OPCODE_CHECK_MASK_DISABLED; + opval = OPCODE_CHECK_VAL_DISABLED; + write_sde_csr(sde, SD(CHECK_OPCODE), + (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) | + (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT)); +} + +#ifdef CONFIG_SDMA_VERBOSITY + +#define sdma_dumpstate_helper0(reg) do { \ + csr = read_csr(sde->dd, reg); \ + dd_dev_err(sde->dd, "%36s 0x%016llx\n", #reg, csr); \ + } while (0) + +#define sdma_dumpstate_helper(reg) do { \ + csr = read_sde_csr(sde, reg); \ + dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \ + #reg, sde->this_idx, csr); \ + } while (0) + +#define sdma_dumpstate_helper2(reg) do { \ + csr = read_csr(sde->dd, reg + (8 * i)); \ + dd_dev_err(sde->dd, "%33s_%02u 0x%016llx\n", \ + #reg, i, csr); \ + } while (0) + +void sdma_dumpstate(struct sdma_engine *sde) +{ + u64 csr; + unsigned i; + + sdma_dumpstate_helper(SD(CTRL)); + sdma_dumpstate_helper(SD(STATUS)); + sdma_dumpstate_helper0(SD(ERR_STATUS)); + sdma_dumpstate_helper0(SD(ERR_MASK)); + sdma_dumpstate_helper(SD(ENG_ERR_STATUS)); + sdma_dumpstate_helper(SD(ENG_ERR_MASK)); + + for (i = 0; i < CCE_NUM_INT_CSRS; ++i) { + sdma_dumpstate_helper2(CCE_INT_STATUS); + sdma_dumpstate_helper2(CCE_INT_MASK); + sdma_dumpstate_helper2(CCE_INT_BLOCKED); + } + + sdma_dumpstate_helper(SD(TAIL)); + sdma_dumpstate_helper(SD(HEAD)); + sdma_dumpstate_helper(SD(PRIORITY_THLD)); + sdma_dumpstate_helper(SD(IDLE_CNT)); + sdma_dumpstate_helper(SD(RELOAD_CNT)); + sdma_dumpstate_helper(SD(DESC_CNT)); + sdma_dumpstate_helper(SD(DESC_FETCHED_CNT)); + sdma_dumpstate_helper(SD(MEMORY)); + sdma_dumpstate_helper0(SD(ENGINES)); + sdma_dumpstate_helper0(SD(MEM_SIZE)); + /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS); */ + sdma_dumpstate_helper(SD(BASE_ADDR)); + sdma_dumpstate_helper(SD(LEN_GEN)); + sdma_dumpstate_helper(SD(HEAD_ADDR)); + sdma_dumpstate_helper(SD(CHECK_ENABLE)); + sdma_dumpstate_helper(SD(CHECK_VL)); + sdma_dumpstate_helper(SD(CHECK_JOB_KEY)); + sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY)); + sdma_dumpstate_helper(SD(CHECK_SLID)); + sdma_dumpstate_helper(SD(CHECK_OPCODE)); +} +#endif + +static void dump_sdma_state(struct sdma_engine *sde) +{ + struct hw_sdma_desc *descqp; + u64 desc[2]; + u64 addr; + u8 gen; + u16 len; + u16 head, tail, cnt; + + head = sde->descq_head & sde->sdma_mask; + tail = sde->descq_tail & sde->sdma_mask; + cnt = sdma_descq_freecnt(sde); + + dd_dev_err(sde->dd, + "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n", + sde->this_idx, head, tail, cnt, + !list_empty(&sde->flushlist)); + + /* print info for each entry in 
the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 0 }; + + descqp = &sde->descq[head]; + desc[0] = le64_to_cpu(descqp->qw[0]); + desc[1] = le64_to_cpu(descqp->qw[1]); + flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? + 'H' : '-'; + flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT) + & SDMA_DESC0_PHY_ADDR_MASK; + gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT) + & SDMA_DESC1_GENERATION_MASK; + len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT) + & SDMA_DESC0_BYTE_COUNT_MASK; + dd_dev_err(sde->dd, + "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", + head, flags, addr, gen, len); + dd_dev_err(sde->dd, + "\tdesc0:0x%016llx desc1 0x%016llx\n", + desc[0], desc[1]); + if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) + dd_dev_err(sde->dd, + "\taidx: %u amode: %u alen: %u\n", + (u8)((desc[1] & + SDMA_DESC1_HEADER_INDEX_SMASK) >> + SDMA_DESC1_HEADER_INDEX_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_MODE_SMASK) >> + SDMA_DESC1_HEADER_MODE_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_DWS_SMASK) >> + SDMA_DESC1_HEADER_DWS_SHIFT)); + head++; + head &= sde->sdma_mask; + } +} + +#define SDE_FMT \ + "SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n" +/** + * sdma_seqfile_dump_sde() - debugfs dump of sde + * @s: seq file + * @sde: send dma engine to dump + * + * This routine dumps the sde to the indicated seq file. + */ +void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde) +{ + u16 head, tail; + struct hw_sdma_desc *descqp; + u64 desc[2]; + u64 addr; + u8 gen; + u16 len; + + head = sde->descq_head & sde->sdma_mask; + tail = READ_ONCE(sde->descq_tail) & sde->sdma_mask; + seq_printf(s, SDE_FMT, sde->this_idx, + sde->cpu, + sdma_state_name(sde->state.current_state), + (unsigned long long)read_sde_csr(sde, SD(CTRL)), + (unsigned long long)read_sde_csr(sde, SD(STATUS)), + (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)), + (unsigned long long)read_sde_csr(sde, SD(TAIL)), tail, + (unsigned long long)read_sde_csr(sde, SD(HEAD)), head, + (unsigned long long)le64_to_cpu(*sde->head_dma), + (unsigned long long)read_sde_csr(sde, SD(MEMORY)), + (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)), + (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)), + (unsigned long long)sde->last_status, + (unsigned long long)sde->ahg_bits, + sde->tx_tail, + sde->tx_head, + sde->descq_tail, + sde->descq_head, + !list_empty(&sde->flushlist), + sde->descq_full_count, + (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID)); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 0 }; + + descqp = &sde->descq[head]; + desc[0] = le64_to_cpu(descqp->qw[0]); + desc[1] = le64_to_cpu(descqp->qw[1]); + flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? + 'H' : '-'; + flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 
'L' : '-'; + addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT) + & SDMA_DESC0_PHY_ADDR_MASK; + gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT) + & SDMA_DESC1_GENERATION_MASK; + len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT) + & SDMA_DESC0_BYTE_COUNT_MASK; + seq_printf(s, + "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", + head, flags, addr, gen, len); + if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) + seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n", + (u8)((desc[1] & + SDMA_DESC1_HEADER_INDEX_SMASK) >> + SDMA_DESC1_HEADER_INDEX_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_MODE_SMASK) >> + SDMA_DESC1_HEADER_MODE_SHIFT)); + head = (head + 1) & sde->sdma_mask; + } +} + +/* + * add the generation number into + * the qw1 and return + */ +static inline u64 add_gen(struct sdma_engine *sde, u64 qw1) +{ + u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3; + + qw1 &= ~SDMA_DESC1_GENERATION_SMASK; + qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + return qw1; +} + +/* + * This routine submits the indicated tx + * + * Space has already been guaranteed and + * tail side of ring is locked. + * + * The hardware tail update is done + * in the caller and that is facilitated + * by returning the new tail. + * + * There is special case logic for ahg + * to not add the generation number for + * up to 2 descriptors that follow the + * first descriptor. + * + */ +static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) +{ + int i; + u16 tail; + struct sdma_desc *descp = tx->descp; + u8 skip = 0, mode = ahg_mode(tx); + + tail = sde->descq_tail & sde->sdma_mask; + sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]); + sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1])); + trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1], + tail, &sde->descq[tail]); + tail = ++sde->descq_tail & sde->sdma_mask; + descp++; + if (mode > SDMA_AHG_APPLY_UPDATE1) + skip = mode >> 1; + for (i = 1; i < tx->num_desc; i++, descp++) { + u64 qw1; + + sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]); + if (skip) { + /* edits don't have generation */ + qw1 = descp->qw[1]; + skip--; + } else { + /* replace generation with real one for non-edits */ + qw1 = add_gen(sde, descp->qw[1]); + } + sde->descq[tail].qw[1] = cpu_to_le64(qw1); + trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1, + tail, &sde->descq[tail]); + tail = ++sde->descq_tail & sde->sdma_mask; + } + tx->next_descq_idx = tail; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); + WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]); +#endif + sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx; + sde->desc_avail -= tx->num_desc; + return tail; +} + +/* + * Check for progress + */ +static int sdma_check_progress( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx, + bool pkts_sent) +{ + int ret; + + sde->desc_avail = sdma_descq_freecnt(sde); + if (tx->num_desc <= sde->desc_avail) + return -EAGAIN; + /* pulse the head_lock */ + if (wait && wait->sleep) { + unsigned seq; + + seq = raw_seqcount_begin( + (const seqcount_t *)&sde->head_lock.seqcount); + ret = wait->sleep(sde, wait, tx, seq, pkts_sent); + if (ret == -EAGAIN) + sde->desc_avail = sdma_descq_freecnt(sde); + } else { + ret = -EBUSY; + } + return ret; +} + +/** + * sdma_send_txreq() - submit a tx req to ring + * @sde: sdma engine to use + * @wait: wait structure to use when full (may be NULL) + * @tx: sdma_txreq to submit + * @pkts_sent: has any packet been sent 
yet? + * + * The call submits the tx into the ring. If a iowait structure is non-NULL + * the packet will be queued to the list in wait. + * + * Return: + * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in + * ring (wait == NULL) + * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state + */ +int sdma_send_txreq(struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx, + bool pkts_sent) +{ + int ret = 0; + u16 tail; + unsigned long flags; + + /* user should have supplied entire packet */ + if (unlikely(tx->tlen)) + return -EINVAL; + tx->wait = wait; + spin_lock_irqsave(&sde->tail_lock, flags); +retry: + if (unlikely(!__sdma_running(sde))) + goto unlock_noconn; + if (unlikely(tx->num_desc > sde->desc_avail)) + goto nodesc; + tail = submit_tx(sde, tx); + if (wait) + iowait_sdma_inc(wait); + sdma_update_tail(sde, tail); +unlock: + spin_unlock_irqrestore(&sde->tail_lock, flags); + return ret; +unlock_noconn: + if (wait) + iowait_sdma_inc(wait); + tx->next_descq_idx = 0; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); +#endif + spin_lock(&sde->flushlist_lock); + list_add_tail(&tx->list, &sde->flushlist); + spin_unlock(&sde->flushlist_lock); + if (wait) { + wait->tx_count++; + wait->count += tx->num_desc; + } + schedule_work(&sde->flush_worker); + ret = -ECOMM; + goto unlock; +nodesc: + ret = sdma_check_progress(sde, wait, tx, pkts_sent); + if (ret == -EAGAIN) { + ret = 0; + goto retry; + } + sde->descq_full_count++; + goto unlock; +} + +/** + * sdma_send_txlist() - submit a list of tx req to ring + * @sde: sdma engine to use + * @wait: wait structure to use when full (may be NULL) + * @tx_list: list of sdma_txreqs to submit + * @count: pointer to a u32 which, after return will contain the total number of + * sdma_txreqs removed from the tx_list. This will include sdma_txreqs + * whose SDMA descriptors are submitted to the ring and the sdma_txreqs + * which are added to SDMA engine flush list if the SDMA engine state is + * not running. + * + * The call submits the list into the ring. + * + * If the iowait structure is non-NULL and not equal to the iowait list + * the unprocessed part of the list will be appended to the list in wait. + * + * In all cases, the tx_list will be updated so the head of the tx_list is + * the list of descriptors that have yet to be transmitted. + * + * The intent of this call is to provide a more efficient + * way of submitting multiple packets to SDMA while holding the tail + * side locking. 
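For illustration, a hypothetical caller of sdma_send_txreq() (the names my_submit and my_tx_complete are not from this patch; the sketch assumes sdma.h is included) handling the return codes documented above:

// Hypothetical caller sketch, not part of this patch. The callback body
// and the sdma_txadd_*() population step are placeholders.
static void my_tx_complete(struct sdma_txreq *tx, int status)
{
	// status is SDMA_TXREQ_S_OK on success, or one of the other
	// SDMA_TXREQ_S_* codes if the engine was halted or shut down.
}

static int my_submit(struct sdma_engine *sde, struct iowait *wait,
		     struct sdma_txreq *tx, u16 pktlen)
{
	int ret;

	ret = sdma_txinit(tx, 0, pktlen, my_tx_complete);
	if (ret)
		return ret;

	// ... sdma_txadd_kvaddr()/sdma_txadd_page()/sdma_txadd_daddr()
	// calls go here until exactly pktlen bytes have been added ...

	ret = sdma_send_txreq(sde, wait, tx, false);
	switch (ret) {
	case 0:
		// Descriptors are on the ring; the completion callback fires later.
		break;
	case -EIOCBQUEUED:
		// Ring was full: tx is parked on 'wait'; its wakeup() runs
		// once sdma_desc_avail() finds room.
		break;
	case -ECOMM:
		// Engine not running: tx was placed on the flush list.
		break;
	default:
		// -EINVAL (tlen not fully supplied) or -EBUSY (full, no wait).
		break;
	}
	return ret;
}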
+ * + * Return: + * 0 - Success, + * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL) + * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state + */ +int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, + struct list_head *tx_list, u32 *count_out) +{ + struct sdma_txreq *tx, *tx_next; + int ret = 0; + unsigned long flags; + u16 tail = INVALID_TAIL; + u32 submit_count = 0, flush_count = 0, total_count; + + spin_lock_irqsave(&sde->tail_lock, flags); +retry: + list_for_each_entry_safe(tx, tx_next, tx_list, list) { + tx->wait = wait; + if (unlikely(!__sdma_running(sde))) + goto unlock_noconn; + if (unlikely(tx->num_desc > sde->desc_avail)) + goto nodesc; + if (unlikely(tx->tlen)) { + ret = -EINVAL; + goto update_tail; + } + list_del_init(&tx->list); + tail = submit_tx(sde, tx); + submit_count++; + if (tail != INVALID_TAIL && + (submit_count & SDMA_TAIL_UPDATE_THRESH) == 0) { + sdma_update_tail(sde, tail); + tail = INVALID_TAIL; + } + } +update_tail: + total_count = submit_count + flush_count; + if (wait) { + iowait_sdma_add(wait, total_count); + iowait_starve_clear(submit_count > 0, wait); + } + if (tail != INVALID_TAIL) + sdma_update_tail(sde, tail); + spin_unlock_irqrestore(&sde->tail_lock, flags); + *count_out = total_count; + return ret; +unlock_noconn: + spin_lock(&sde->flushlist_lock); + list_for_each_entry_safe(tx, tx_next, tx_list, list) { + tx->wait = wait; + list_del_init(&tx->list); + tx->next_descq_idx = 0; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); +#endif + list_add_tail(&tx->list, &sde->flushlist); + flush_count++; + if (wait) { + wait->tx_count++; + wait->count += tx->num_desc; + } + } + spin_unlock(&sde->flushlist_lock); + schedule_work(&sde->flush_worker); + ret = -ECOMM; + goto update_tail; +nodesc: + ret = sdma_check_progress(sde, wait, tx, submit_count > 0); + if (ret == -EAGAIN) { + ret = 0; + goto retry; + } + sde->descq_full_count++; + goto update_tail; +} + +static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event) +{ + unsigned long flags; + + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + + __sdma_process_event(sde, event); + + if (sde->state.current_state == sdma_state_s99_running) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); + + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void __sdma_process_event(struct sdma_engine *sde, + enum sdma_events event) +{ + struct sdma_state *ss = &sde->state; + int need_progress = 0; + + /* CONFIG SDMA temporary */ +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx, + sdma_state_names[ss->current_state], + sdma_event_names[event]); +#endif + + switch (ss->current_state) { + case sdma_state_s00_hw_down: + switch (event) { + case sdma_event_e00_go_hw_down: + break; + case sdma_event_e30_go_running: + /* + * If down, but running requested (usually result + * of link up, then we need to start up. + * This can happen when hw down is requested while + * bringing the link up with traffic active on + * 7220, e.g. 
+ */ + ss->go_s99_running = 1; + /* fall through -- and start dma engine */ + case sdma_event_e10_go_hw_start: + /* This reference means the state machine is started */ + sdma_get(&sde->state); + sdma_set_state(sde, + sdma_state_s10_hw_start_up_halt_wait); + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e40_sw_cleaned: + sdma_sw_tear_down(sde); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s10_hw_start_up_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, + sdma_state_s15_hw_start_up_clean_wait); + sdma_start_hw_clean_up(sde); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s15_hw_start_up_clean_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? 
+ sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s20_idle: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + sdma_set_state(sde, sdma_state_s99_running); + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + sdma_set_state(sde, sdma_state_s50_hw_halt_wait); + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + break; + case sdma_event_e85_link_down: + /* fall through */ + case sdma_event_e80_hw_freeze: + sdma_set_state(sde, sdma_state_s80_hw_freeze); + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s30_sw_clean_up_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait); + sdma_start_hw_clean_up(sde); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s40_hw_clean_up_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? 
+ sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s50_hw_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s60_idle_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s80_hw_freeze: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + sdma_set_state(sde, sdma_state_s82_freeze_sw_clean); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s82_freeze_sw_clean: + switch (event) { + case 
sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + /* notify caller this engine is done cleaning */ + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? + sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s99_running: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + need_progress = 1; + sdma_err_progress_check_schedule(sde); + /* fall through */ + case sdma_event_e90_sw_halted: + /* + * SW initiated halt does not perform engines + * progress check + */ + sdma_set_state(sde, sdma_state_s50_hw_halt_wait); + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + sdma_set_state(sde, sdma_state_s60_idle_halt_wait); + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + /* fall through */ + case sdma_event_e80_hw_freeze: + sdma_set_state(sde, sdma_state_s80_hw_freeze); + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + } + break; + } + + ss->last_event = event; + if (need_progress) + sdma_make_progress(sde, 0); +} + +/* + * _extend_sdma_tx_descs() - helper to extend txreq + * + * This is called once the initial nominal allocation + * of descriptors in the sdma_txreq is exhausted. + * + * The code will bump the allocation up to the max + * of MAX_DESC (64) descriptors. There doesn't seem + * much point in an interim step. The last descriptor + * is reserved for coalesce buffer in order to support + * cases where input packet has >MAX_DESC iovecs. 
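The growth policy described above can be summarized as a small decision function; the sketch below is illustrative only, the real logic is in _extend_sdma_tx_descs():

// Illustrative summary of the descriptor-growth policy described above.
// 64 mirrors MAX_DESC, the hardware limit of descriptors per request.
enum grow_action { GROW_TO_MAX, USE_COALESCE, RELEASE_PAD_SLOT, TOO_BIG };

// num_desc: descriptors already built; remaining_tlen: bytes still to add.
enum grow_action next_step(unsigned num_desc, unsigned remaining_tlen)
{
	if (num_desc == 64 - 1)		// only the reserved last slot is left
		return remaining_tlen ? USE_COALESCE	// copy the rest into one buffer
				      : RELEASE_PAD_SLOT; // padding only, reuse the slot
	if (num_desc == 64)		// nothing left at all
		return TOO_BIG;		// the driver returns -ENOMEM here
	return GROW_TO_MAX;		// kmalloc_array() a MAX_DESC-entry descp
}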
+ * + */ +static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) +{ + int i; + + /* Handle last descriptor */ + if (unlikely((tx->num_desc == (MAX_DESC - 1)))) { + /* if tlen is 0, it is for padding, release last descriptor */ + if (!tx->tlen) { + tx->desc_limit = MAX_DESC; + } else if (!tx->coalesce_buf) { + /* allocate coalesce buffer with space for padding */ + tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32), + GFP_ATOMIC); + if (!tx->coalesce_buf) + goto enomem; + tx->coalesce_idx = 0; + } + return 0; + } + + if (unlikely(tx->num_desc == MAX_DESC)) + goto enomem; + + tx->descp = kmalloc_array( + MAX_DESC, + sizeof(struct sdma_desc), + GFP_ATOMIC); + if (!tx->descp) + goto enomem; + + /* reserve last descriptor for coalescing */ + tx->desc_limit = MAX_DESC - 1; + /* copy ones already built */ + for (i = 0; i < tx->num_desc; i++) + tx->descp[i] = tx->descs[i]; + return 0; +enomem: + __sdma_txclean(dd, tx); + return -ENOMEM; +} + +/* + * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors + * + * This is called once the initial nominal allocation of descriptors + * in the sdma_txreq is exhausted. + * + * This function calls _extend_sdma_tx_descs to extend or allocate + * coalesce buffer. If there is a allocated coalesce buffer, it will + * copy the input packet data into the coalesce buffer. It also adds + * coalesce buffer descriptor once when whole packet is received. + * + * Return: + * <0 - error + * 0 - coalescing, don't populate descriptor + * 1 - continue with populating descriptor + */ +int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, + int type, void *kvaddr, struct page *page, + unsigned long offset, u16 len) +{ + int pad_len, rval; + dma_addr_t addr; + + rval = _extend_sdma_tx_descs(dd, tx); + if (rval) { + __sdma_txclean(dd, tx); + return rval; + } + + /* If coalesce buffer is allocated, copy data into it */ + if (tx->coalesce_buf) { + if (type == SDMA_MAP_NONE) { + __sdma_txclean(dd, tx); + return -EINVAL; + } + + if (type == SDMA_MAP_PAGE) { + kvaddr = kmap(page); + kvaddr += offset; + } else if (WARN_ON(!kvaddr)) { + __sdma_txclean(dd, tx); + return -EINVAL; + } + + memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len); + tx->coalesce_idx += len; + if (type == SDMA_MAP_PAGE) + kunmap(page); + + /* If there is more data, return */ + if (tx->tlen - tx->coalesce_idx) + return 0; + + /* Whole packet is received; add any padding */ + pad_len = tx->packet_len & (sizeof(u32) - 1); + if (pad_len) { + pad_len = sizeof(u32) - pad_len; + memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len); + /* padding is taken care of for coalescing case */ + tx->packet_len += pad_len; + tx->tlen += pad_len; + } + + /* dma map the coalesce buffer */ + addr = dma_map_single(&dd->pcidev->dev, + tx->coalesce_buf, + tx->tlen, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + __sdma_txclean(dd, tx); + return -ENOSPC; + } + + /* Add descriptor for coalesce buffer */ + tx->desc_limit = MAX_DESC; + return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, + addr, tx->tlen); + } + + return 1; +} + +/* Update sdes when the lmc changes */ +void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid) +{ + struct sdma_engine *sde; + int i; + u64 sreg; + + sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) << + SD(CHECK_SLID_MASK_SHIFT)) | + (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) << + SD(CHECK_SLID_VALUE_SHIFT)); + + for (i = 0; i < dd->num_sdma; i++) { + hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x", + 
i, (u32)sreg); + sde = &dd->per_sdma[i]; + write_sde_csr(sde, SD(CHECK_SLID), sreg); + } +} + +/* tx not dword sized - pad */ +int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) +{ + int rval = 0; + + tx->num_desc++; + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = _extend_sdma_tx_descs(dd, tx); + if (rval) { + __sdma_txclean(dd, tx); + return rval; + } + } + /* finish the one just added */ + make_tx_sdma_desc( + tx, + SDMA_MAP_NONE, + dd->sdma_pad_phys, + sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1))); + _sdma_close_tx(dd, tx); + return rval; +} + +/* + * Add ahg to the sdma_txreq + * + * The logic will consume up to 3 + * descriptors at the beginning of + * sdma_txreq. + */ +void _sdma_txreq_ahgadd( + struct sdma_txreq *tx, + u8 num_ahg, + u8 ahg_entry, + u32 *ahg, + u8 ahg_hlen) +{ + u32 i, shift = 0, desc = 0; + u8 mode; + + WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4); + /* compute mode */ + if (num_ahg == 1) + mode = SDMA_AHG_APPLY_UPDATE1; + else if (num_ahg <= 5) + mode = SDMA_AHG_APPLY_UPDATE2; + else + mode = SDMA_AHG_APPLY_UPDATE3; + tx->num_desc++; + /* initialize to consumed descriptors to zero */ + switch (mode) { + case SDMA_AHG_APPLY_UPDATE3: + tx->num_desc++; + tx->descs[2].qw[0] = 0; + tx->descs[2].qw[1] = 0; + /* FALLTHROUGH */ + case SDMA_AHG_APPLY_UPDATE2: + tx->num_desc++; + tx->descs[1].qw[0] = 0; + tx->descs[1].qw[1] = 0; + break; + } + ahg_hlen >>= 2; + tx->descs[0].qw[1] |= + (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK) + << SDMA_DESC1_HEADER_INDEX_SHIFT) | + (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK) + << SDMA_DESC1_HEADER_DWS_SHIFT) | + (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK) + << SDMA_DESC1_HEADER_MODE_SHIFT) | + (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK) + << SDMA_DESC1_HEADER_UPDATE1_SHIFT); + for (i = 0; i < (num_ahg - 1); i++) { + if (!shift && !(i & 2)) + desc++; + tx->descs[desc].qw[!!(i & 2)] |= + (((u64)ahg[i + 1]) + << shift); + shift = (shift + 32) & 63; + } +} + +/** + * sdma_ahg_alloc - allocate an AHG entry + * @sde: engine to allocate from + * + * Return: + * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled, + * -ENOSPC if an entry is not available + */ +int sdma_ahg_alloc(struct sdma_engine *sde) +{ + int nr; + int oldbit; + + if (!sde) { + trace_hfi1_ahg_allocate(sde, -EINVAL); + return -EINVAL; + } + while (1) { + nr = ffz(READ_ONCE(sde->ahg_bits)); + if (nr > 31) { + trace_hfi1_ahg_allocate(sde, -ENOSPC); + return -ENOSPC; + } + oldbit = test_and_set_bit(nr, &sde->ahg_bits); + if (!oldbit) + break; + cpu_relax(); + } + trace_hfi1_ahg_allocate(sde, nr); + return nr; +} + +/** + * sdma_ahg_free - free an AHG entry + * @sde: engine to return AHG entry + * @ahg_index: index to free + * + * This routine frees the indicate AHG entry. + */ +void sdma_ahg_free(struct sdma_engine *sde, int ahg_index) +{ + if (!sde) + return; + trace_hfi1_ahg_deallocate(sde, ahg_index); + if (ahg_index < 0 || ahg_index > 31) + return; + clear_bit(ahg_index, &sde->ahg_bits); +} + +/* + * SPC freeze handling for SDMA engines. Called when the driver knows + * the SPC is going into a freeze but before the freeze is fully + * settled. Generally an error interrupt. + * + * This event will pull the engine out of running so no more entries can be + * added to the engine's queue. + */ +void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down) +{ + int i; + enum sdma_events event = link_down ? 
sdma_event_e85_link_down : + sdma_event_e80_hw_freeze; + + /* set up the wait but do not wait here */ + atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma); + + /* tell all engines to stop running and wait */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], event); + + /* sdma_freeze() will wait for all engines to have stopped */ +} + +/* + * SPC freeze handling for SDMA engines. Called when the driver knows + * the SPC is fully frozen. + */ +void sdma_freeze(struct hfi1_devdata *dd) +{ + int i; + int ret; + + /* + * Make sure all engines have moved out of the running state before + * continuing. + */ + ret = wait_event_interruptible(dd->sdma_unfreeze_wq, + atomic_read(&dd->sdma_unfreeze_count) <= + 0); + /* interrupted or count is negative, then unloading - just exit */ + if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0) + return; + + /* set up the count for the next wait */ + atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma); + + /* tell all engines that the SPC is frozen, they can start cleaning */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen); + + /* + * Wait for everyone to finish software clean before exiting. The + * software clean will read engine CSRs, so must be completed before + * the next step, which will clear the engine CSRs. + */ + (void)wait_event_interruptible(dd->sdma_unfreeze_wq, + atomic_read(&dd->sdma_unfreeze_count) <= 0); + /* no need to check results - done no matter what */ +} + +/* + * SPC freeze handling for the SDMA engines. Called after the SPC is unfrozen. + * + * The SPC freeze acts like a SDMA halt and a hardware clean combined. All + * that is left is a software clean. We could do it after the SPC is fully + * frozen, but then we'd have to add another state to wait for the unfreeze. + * Instead, just defer the software clean until the unfreeze step. + */ +void sdma_unfreeze(struct hfi1_devdata *dd) +{ + int i; + + /* tell all engines start freeze clean up */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], + sdma_event_e82_hw_unfreeze); +} + +/** + * _sdma_engine_progress_schedule() - schedule progress on engine + * @sde: sdma_engine to schedule progress + * + */ +void _sdma_engine_progress_schedule( + struct sdma_engine *sde) +{ + trace_hfi1_sdma_engine_progress(sde, sde->progress_mask); + /* assume we have selected a good cpu */ + write_csr(sde->dd, + CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)), + sde->progress_mask); +} diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h new file mode 100644 index 0000000..46c775f --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -0,0 +1,1101 @@ +#ifndef _HFI1_SDMA_H +#define _HFI1_SDMA_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "verbs.h" +#include "sdma_txreq.h" + +/* Hardware limit */ +#define MAX_DESC 64 +/* Hardware limit for SDMA packet size */ +#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1) + +#define SDMA_TXREQ_S_OK 0 +#define SDMA_TXREQ_S_SENDERROR 1 +#define SDMA_TXREQ_S_ABORTED 2 +#define SDMA_TXREQ_S_SHUTDOWN 3 + +/* flags bits */ +#define SDMA_TXREQ_F_URGENT 0x0001 +#define SDMA_TXREQ_F_AHG_COPY 0x0002 +#define SDMA_TXREQ_F_USE_AHG 0x0004 + +#define SDMA_MAP_NONE 0 +#define SDMA_MAP_SINGLE 1 +#define SDMA_MAP_PAGE 2 + +#define SDMA_AHG_VALUE_MASK 0xffff +#define SDMA_AHG_VALUE_SHIFT 0 +#define SDMA_AHG_INDEX_MASK 0xf +#define SDMA_AHG_INDEX_SHIFT 16 +#define SDMA_AHG_FIELD_LEN_MASK 0xf +#define SDMA_AHG_FIELD_LEN_SHIFT 20 +#define SDMA_AHG_FIELD_START_MASK 0x1f +#define SDMA_AHG_FIELD_START_SHIFT 24 +#define SDMA_AHG_UPDATE_ENABLE_MASK 0x1 +#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31 + +/* AHG modes */ + +/* + * Be aware the ordering and values + * for SDMA_AHG_APPLY_UPDATE[123] + * are assumed in generating a skip + * count in submit_tx() in sdma.c + */ +#define SDMA_AHG_NO_AHG 0 +#define SDMA_AHG_COPY 1 +#define SDMA_AHG_APPLY_UPDATE1 2 +#define SDMA_AHG_APPLY_UPDATE2 3 +#define SDMA_AHG_APPLY_UPDATE3 4 + +/* + * Bits defined in the send DMA descriptor. 
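To make the layout concrete, here is a sketch of how a single-descriptor request would be packed with the masks defined below (illustrative values; the driver does this in make_tx_sdma_desc(), add_gen() and submit_tx()):

// Sketch of assembling one 128-bit descriptor from the fields below.
// 'addr', 'len' and 'gen' are illustrative inputs only.
static inline void pack_desc_example(__le64 qw[2], u64 addr, u64 len, u64 gen)
{
	u64 qw0 = SDMA_DESC0_FIRST_DESC_FLAG | SDMA_DESC0_LAST_DESC_FLAG |
		  ((addr & SDMA_DESC0_PHY_ADDR_MASK) << SDMA_DESC0_PHY_ADDR_SHIFT) |
		  ((len & SDMA_DESC0_BYTE_COUNT_MASK) << SDMA_DESC0_BYTE_COUNT_SHIFT);
	u64 qw1 = (gen & SDMA_DESC1_GENERATION_MASK) << SDMA_DESC1_GENERATION_SHIFT;

	qw[0] = cpu_to_le64(qw0);	// descriptors are little endian in host memory
	qw[1] = cpu_to_le64(qw1);
}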
+ */ +#define SDMA_DESC0_FIRST_DESC_FLAG BIT_ULL(63) +#define SDMA_DESC0_LAST_DESC_FLAG BIT_ULL(62) +#define SDMA_DESC0_BYTE_COUNT_SHIFT 48 +#define SDMA_DESC0_BYTE_COUNT_WIDTH 14 +#define SDMA_DESC0_BYTE_COUNT_MASK \ + ((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1) +#define SDMA_DESC0_BYTE_COUNT_SMASK \ + (SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT) +#define SDMA_DESC0_PHY_ADDR_SHIFT 0 +#define SDMA_DESC0_PHY_ADDR_WIDTH 48 +#define SDMA_DESC0_PHY_ADDR_MASK \ + ((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1) +#define SDMA_DESC0_PHY_ADDR_SMASK \ + (SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT) + +#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32 +#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32 +#define SDMA_DESC1_HEADER_UPDATE1_MASK \ + ((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1) +#define SDMA_DESC1_HEADER_UPDATE1_SMASK \ + (SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT) +#define SDMA_DESC1_HEADER_MODE_SHIFT 13 +#define SDMA_DESC1_HEADER_MODE_WIDTH 3 +#define SDMA_DESC1_HEADER_MODE_MASK \ + ((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1) +#define SDMA_DESC1_HEADER_MODE_SMASK \ + (SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT) +#define SDMA_DESC1_HEADER_INDEX_SHIFT 8 +#define SDMA_DESC1_HEADER_INDEX_WIDTH 5 +#define SDMA_DESC1_HEADER_INDEX_MASK \ + ((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1) +#define SDMA_DESC1_HEADER_INDEX_SMASK \ + (SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT) +#define SDMA_DESC1_HEADER_DWS_SHIFT 4 +#define SDMA_DESC1_HEADER_DWS_WIDTH 4 +#define SDMA_DESC1_HEADER_DWS_MASK \ + ((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1) +#define SDMA_DESC1_HEADER_DWS_SMASK \ + (SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT) +#define SDMA_DESC1_GENERATION_SHIFT 2 +#define SDMA_DESC1_GENERATION_WIDTH 2 +#define SDMA_DESC1_GENERATION_MASK \ + ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1) +#define SDMA_DESC1_GENERATION_SMASK \ + (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT) +#define SDMA_DESC1_INT_REQ_FLAG BIT_ULL(1) +#define SDMA_DESC1_HEAD_TO_HOST_FLAG BIT_ULL(0) + +enum sdma_states { + sdma_state_s00_hw_down, + sdma_state_s10_hw_start_up_halt_wait, + sdma_state_s15_hw_start_up_clean_wait, + sdma_state_s20_idle, + sdma_state_s30_sw_clean_up_wait, + sdma_state_s40_hw_clean_up_wait, + sdma_state_s50_hw_halt_wait, + sdma_state_s60_idle_halt_wait, + sdma_state_s80_hw_freeze, + sdma_state_s82_freeze_sw_clean, + sdma_state_s99_running, +}; + +enum sdma_events { + sdma_event_e00_go_hw_down, + sdma_event_e10_go_hw_start, + sdma_event_e15_hw_halt_done, + sdma_event_e25_hw_clean_up_done, + sdma_event_e30_go_running, + sdma_event_e40_sw_cleaned, + sdma_event_e50_hw_cleaned, + sdma_event_e60_hw_halted, + sdma_event_e70_go_idle, + sdma_event_e80_hw_freeze, + sdma_event_e81_hw_frozen, + sdma_event_e82_hw_unfreeze, + sdma_event_e85_link_down, + sdma_event_e90_sw_halted, +}; + +struct sdma_set_state_action { + unsigned op_enable:1; + unsigned op_intenable:1; + unsigned op_halt:1; + unsigned op_cleanup:1; + unsigned go_s99_running_tofalse:1; + unsigned go_s99_running_totrue:1; +}; + +struct sdma_state { + struct kref kref; + struct completion comp; + enum sdma_states current_state; + unsigned current_op; + unsigned go_s99_running; + /* debugging/development */ + enum sdma_states previous_state; + unsigned previous_op; + enum sdma_events last_event; +}; + +/** + * DOC: sdma exported routines + * + * These sdma routines fit into three categories: + * - The SDMA API for building and submitting packets + * to the ring + * + * - 
Initialization and tear down routines to build up + * and tear down SDMA + * + * - ISR entrances to handle interrupts, state changes + * and errors + */ + +/** + * DOC: sdma PSM/verbs API + * + * The sdma API is designed to be used by both PSM + * and verbs to supply packets to the SDMA ring. + * + * The usage of the API is as follows: + * + * Embed a struct iowait in the QP or + * PQ. The iowait should be initialized with a + * call to iowait_init(). + * + * The user of the API should create an allocation method + * for their version of the txreq. Slabs, pre-allocated lists, + * and dma pools can be used. Once the user's overload of + * the sdma_txreq has been allocated, the sdma_txreq member + * must be initialized with sdma_txinit() or sdma_txinit_ahg(). + * + * The txreq must be declared with the sdma_txreq first. + * + * The tx request, once initialized, is manipulated with calls to + * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr() + * for each disjoint memory location. It is the user's responsibility + * to understand the packet boundaries and page boundaries to do the + * appropriate number of sdma_txadd_* calls. The user + * must be prepared to deal with failures from these routines due to + * either memory allocation or dma_mapping failures. + * + * The mapping specifics for each memory location are recorded + * in the tx. Memory locations added with sdma_txadd_page() + * and sdma_txadd_kvaddr() are automatically mapped when added + * to the tx and unmapped as part of the progress processing in the + * SDMA interrupt handling. + * + * sdma_txadd_daddr() is used to add a dma_addr_t memory to the + * tx. An example of a use case would be a pre-allocated + * set of headers allocated via dma_pool_alloc() or + * dma_alloc_coherent(). For these memory locations, it + * is the responsibility of the user to handle that unmapping. + * (This would usually be at an unload or job termination.) + * + * The routine sdma_send_txreq() is used to submit + * a tx to the ring after the appropriate number of + * sdma_txadd_* calls have been done. + * + * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist() + * can be used to submit a list of packets. + * + * The user is free to use the link overhead in the struct sdma_txreq as + * long as the tx isn't in flight. + * + * The extreme degenerate case of the number of descriptors + * exceeding the ring size is automatically handled as + * memory locations are added. An overflow of the descriptor + * array that is part of the sdma_txreq is also automatically + * handled. + * + */ + +/** + * DOC: Infrastructure calls + * + * sdma_init() is used to initialize data structures and + * CSRs for the desired number of SDMA engines. + * + * sdma_start() is used to kick the SDMA engines initialized + * with sdma_init(). Interrupts must be enabled at this + * point since aspects of the state machine are interrupt + * driven. + * + * sdma_engine_error() and sdma_engine_interrupt() are + * entrances for interrupts. + * + * sdma_map_init() is for the management of the mapping + * table when the number of vls is changed. + * + */ + +/* + * struct hw_sdma_desc - raw 128 bit SDMA descriptor + * + * This is the raw descriptor in the SDMA ring + */ +struct hw_sdma_desc { + /* private: don't use directly */ + __le64 qw[2]; +}; + +/** + * struct sdma_engine - Data pertaining to each SDMA engine.
+ * @dd: a back-pointer to the device data + * @ppd: per port back-pointer + * @imask: mask for irq manipulation + * @idle_mask: mask for determining if an interrupt is due to sdma_idle + * + * This structure has the state for each sdma_engine. + * + * Accessing to non public fields are not supported + * since the private members are subject to change. + */ +struct sdma_engine { + /* read mostly */ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + /* private: */ + void __iomem *tail_csr; + u64 imask; /* clear interrupt mask */ + u64 idle_mask; + u64 progress_mask; + u64 int_mask; + /* private: */ + volatile __le64 *head_dma; /* DMA'ed by chip */ + /* private: */ + dma_addr_t head_phys; + /* private: */ + struct hw_sdma_desc *descq; + /* private: */ + unsigned descq_full_count; + struct sdma_txreq **tx_ring; + /* private: */ + dma_addr_t descq_phys; + /* private */ + u32 sdma_mask; + /* private */ + struct sdma_state state; + /* private */ + int cpu; + /* private: */ + u8 sdma_shift; + /* private: */ + u8 this_idx; /* zero relative engine */ + /* protect changes to senddmactrl shadow */ + spinlock_t senddmactrl_lock; + /* private: */ + u64 p_senddmactrl; /* shadow per-engine SendDmaCtrl */ + + /* read/write using tail_lock */ + spinlock_t tail_lock ____cacheline_aligned_in_smp; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + /* private: */ + u64 tail_sn; +#endif + /* private: */ + u32 descq_tail; + /* private: */ + unsigned long ahg_bits; + /* private: */ + u16 desc_avail; + /* private: */ + u16 tx_tail; + /* private: */ + u16 descq_cnt; + + /* read/write using head_lock */ + /* private: */ + seqlock_t head_lock ____cacheline_aligned_in_smp; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + /* private: */ + u64 head_sn; +#endif + /* private: */ + u32 descq_head; + /* private: */ + u16 tx_head; + /* private: */ + u64 last_status; + /* private */ + u64 err_cnt; + /* private */ + u64 sdma_int_cnt; + u64 idle_int_cnt; + u64 progress_int_cnt; + + /* private: */ + struct list_head dmawait; + + /* CONFIG SDMA for now, just blindly duplicate */ + /* private: */ + struct tasklet_struct sdma_hw_clean_up_task + ____cacheline_aligned_in_smp; + + /* private: */ + struct tasklet_struct sdma_sw_clean_up_task + ____cacheline_aligned_in_smp; + /* private: */ + struct work_struct err_halt_worker; + /* private */ + struct timer_list err_progress_check_timer; + u32 progress_check_head; + /* private: */ + struct work_struct flush_worker; + /* protect flush list */ + spinlock_t flushlist_lock; + /* private: */ + struct list_head flushlist; + struct cpumask cpu_mask; + struct kobject kobj; +}; + +int sdma_init(struct hfi1_devdata *dd, u8 port); +void sdma_start(struct hfi1_devdata *dd); +void sdma_exit(struct hfi1_devdata *dd); +void sdma_clean(struct hfi1_devdata *dd, size_t num_engines); +void sdma_all_running(struct hfi1_devdata *dd); +void sdma_all_idle(struct hfi1_devdata *dd); +void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle); +void sdma_freeze(struct hfi1_devdata *dd); +void sdma_unfreeze(struct hfi1_devdata *dd); +void sdma_wait(struct hfi1_devdata *dd); + +/** + * sdma_empty() - idle engine test + * @engine: sdma engine + * + * Currently used by verbs as a latency optimization. 
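The occupancy helpers that follow treat descq_head and descq_tail as free-running counters; a standalone check of the free-count arithmetic (illustrative values only, not part of the driver):

// Free/used count arithmetic used by sdma_descq_freecnt() below:
// head and tail are free-running counters, so 'tail - head' is the
// number of in-flight descriptors even after the counters wrap.
#include <assert.h>
#include <stdint.h>

static uint16_t freecnt(uint32_t tail, uint32_t head, uint16_t cnt)
{
	return cnt - (uint16_t)(tail - head) - 1;
}

int main(void)
{
	assert(freecnt(0, 0, 256) == 255);		// empty ring: cnt - 1 usable
	assert(freecnt(100, 40, 256) == 195);		// 60 descriptors in flight
	assert(freecnt(UINT32_MAX, UINT32_MAX - 59, 256) == 196); // wrapped counters
	return 0;
}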
+ * + * Return: + * 1 - empty, 0 - non-empty + */ +static inline int sdma_empty(struct sdma_engine *sde) +{ + return sde->descq_tail == sde->descq_head; +} + +static inline u16 sdma_descq_freecnt(struct sdma_engine *sde) +{ + return sde->descq_cnt - + (sde->descq_tail - + READ_ONCE(sde->descq_head)) - 1; +} + +static inline u16 sdma_descq_inprocess(struct sdma_engine *sde) +{ + return sde->descq_cnt - sdma_descq_freecnt(sde); +} + +/* + * Either head_lock or tail lock required to see + * a steady state. + */ +static inline int __sdma_running(struct sdma_engine *engine) +{ + return engine->state.current_state == sdma_state_s99_running; +} + +/** + * sdma_running() - state suitability test + * @engine: sdma engine + * + * sdma_running probes the internal state to determine if it is suitable + * for submitting packets. + * + * Return: + * 1 - ok to submit, 0 - not ok to submit + * + */ +static inline int sdma_running(struct sdma_engine *engine) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&engine->tail_lock, flags); + ret = __sdma_running(engine); + spin_unlock_irqrestore(&engine->tail_lock, flags); + return ret; +} + +void _sdma_txreq_ahgadd( + struct sdma_txreq *tx, + u8 num_ahg, + u8 ahg_entry, + u32 *ahg, + u8 ahg_hlen); + +/** + * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG + * @tx: tx request to initialize + * @flags: flags to key last descriptor additions + * @tlen: total packet length (pbc + headers + data) + * @ahg_entry: ahg entry to use (0 - 31) + * @num_ahg: ahg descriptor for first descriptor (0 - 9) + * @ahg: array of AHG descriptors (up to 9 entries) + * @ahg_hlen: number of bytes from ASIC entry to use + * @cb: callback + * + * The allocation of the sdma_txreq and its enclosing structure is user + * dependent. This routine must be called to initialize the user independent + * fields. + * + * The currently supported flags are SDMA_TXREQ_F_URGENT, + * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG. + * + * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the + * completion is desired as soon as possible. + * + * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be + * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in + * the AHG descriptors into the first 1 to 3 descriptors. + * + * Completions of submitted requests can be gotten on selected + * txreqs by giving a completion routine callback to sdma_txinit() or + * sdma_txinit_ahg(). The environment in which the callback runs + * can be from an ISR, a tasklet, or a thread, so no sleeping + * kernel routines can be used. Aspects of the sdma ring may + * be locked so care should be taken with locking. + * + * The callback pointer can be NULL to avoid any callback for the packet + * being submitted. The callback will be provided this tx, a status, and a flag. + * + * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR, + * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN. + * + * The flag, if the iowait had been used, indicates that the iowait + * sdma_busy count has reached zero. + * + * The user data portion of tlen should be precise. The sdma_txadd_* entrances + * will pad with a descriptor that references 1 - 3 bytes when the number of bytes + * specified in tlen has been supplied to the sdma_txreq. + * + * ahg_hlen is used to determine the number of on-chip entry bytes to + * use as the header. This is for cases where the stored header is + * larger than the header to be used in a packet.
This is typical
+ * for verbs where an RDMA_WRITE_FIRST is larger than the packet in an
+ * RDMA_WRITE_MIDDLE.
+ *
+ */
+static inline int sdma_txinit_ahg(
+        struct sdma_txreq *tx,
+        u16 flags,
+        u16 tlen,
+        u8 ahg_entry,
+        u8 num_ahg,
+        u32 *ahg,
+        u8 ahg_hlen,
+        void (*cb)(struct sdma_txreq *, int))
+{
+        if (tlen == 0)
+                return -ENODATA;
+        if (tlen > MAX_SDMA_PKT_SIZE)
+                return -EMSGSIZE;
+        tx->desc_limit = ARRAY_SIZE(tx->descs);
+        tx->descp = &tx->descs[0];
+        INIT_LIST_HEAD(&tx->list);
+        tx->num_desc = 0;
+        tx->flags = flags;
+        tx->complete = cb;
+        tx->coalesce_buf = NULL;
+        tx->wait = NULL;
+        tx->packet_len = tlen;
+        tx->tlen = tx->packet_len;
+        tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
+        tx->descs[0].qw[1] = 0;
+        if (flags & SDMA_TXREQ_F_AHG_COPY)
+                tx->descs[0].qw[1] |=
+                        (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+                                << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+                        (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
+                                << SDMA_DESC1_HEADER_MODE_SHIFT);
+        else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
+                _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
+        return 0;
+}
+
+/**
+ * sdma_txinit() - initialize an sdma_txreq struct (no AHG)
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @cb: callback pointer
+ *
+ * The allocation of the sdma_txreq and its enclosing structure is user
+ * dependent.  This routine must be called to initialize the user
+ * independent fields.
+ *
+ * The only currently supported flag is SDMA_TXREQ_F_URGENT.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * Completions of submitted requests can be obtained for selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg().  The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used.  Aspects of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted.
+ *
+ * The callback, if non-NULL, will be provided this tx and a status.  The
+ * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
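+ *
+ * A typical submission sequence, sketched for illustration only (the
+ * "priv" container, its fields, and the omitted error handling are
+ * assumptions of the sketch, not names defined by this driver):
+ *
+ *        sdma_txinit(&priv->txreq, 0, pbc_hdr_len + data_len, done_cb);
+ *        sdma_txadd_kvaddr(dd, &priv->txreq, pbc_hdr, pbc_hdr_len);
+ *        sdma_txadd_page(dd, &priv->txreq, page, offset, data_len);
+ *        sdma_send_txreq(sde, &priv->wait, &priv->txreq, true);
+ *
+ * The byte counts passed to the sdma_txadd_* calls must sum to tlen so
+ * that the final descriptor is closed (see _sdma_txadd_daddr() below).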
+ * + */ +static inline int sdma_txinit( + struct sdma_txreq *tx, + u16 flags, + u16 tlen, + void (*cb)(struct sdma_txreq *, int)) +{ + return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb); +} + +/* helpers - don't use */ +static inline int sdma_mapping_type(struct sdma_desc *d) +{ + return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK) + >> SDMA_DESC1_GENERATION_SHIFT; +} + +static inline size_t sdma_mapping_len(struct sdma_desc *d) +{ + return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK) + >> SDMA_DESC0_BYTE_COUNT_SHIFT; +} + +static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d) +{ + return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK) + >> SDMA_DESC0_PHY_ADDR_SHIFT; +} + +static inline void make_tx_sdma_desc( + struct sdma_txreq *tx, + int type, + dma_addr_t addr, + size_t len) +{ + struct sdma_desc *desc = &tx->descp[tx->num_desc]; + + if (!tx->num_desc) { + /* qw[0] zero; qw[1] first, ahg mode already in from init */ + desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + } else { + desc->qw[0] = 0; + desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + } + desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK) + << SDMA_DESC0_PHY_ADDR_SHIFT) | + (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK) + << SDMA_DESC0_BYTE_COUNT_SHIFT); +} + +/* helper to extend txreq */ +int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, + int type, void *kvaddr, struct page *page, + unsigned long offset, u16 len); +int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *); +void __sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *); + +static inline void sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx) +{ + if (tx->num_desc) + __sdma_txclean(dd, tx); +} + +/* helpers used by public routines */ +static inline void _sdma_close_tx(struct hfi1_devdata *dd, + struct sdma_txreq *tx) +{ + tx->descp[tx->num_desc].qw[0] |= + SDMA_DESC0_LAST_DESC_FLAG; + tx->descp[tx->num_desc].qw[1] |= + dd->default_desc1; + if (tx->flags & SDMA_TXREQ_F_URGENT) + tx->descp[tx->num_desc].qw[1] |= + (SDMA_DESC1_HEAD_TO_HOST_FLAG | + SDMA_DESC1_INT_REQ_FLAG); +} + +static inline int _sdma_txadd_daddr( + struct hfi1_devdata *dd, + int type, + struct sdma_txreq *tx, + dma_addr_t addr, + u16 len) +{ + int rval = 0; + + make_tx_sdma_desc( + tx, + type, + addr, len); + WARN_ON(len > tx->tlen); + tx->tlen -= len; + /* special cases for last */ + if (!tx->tlen) { + if (tx->packet_len & (sizeof(u32) - 1)) { + rval = _pad_sdma_tx_descs(dd, tx); + if (rval) + return rval; + } else { + _sdma_close_tx(dd, tx); + } + } + tx->num_desc++; + return rval; +} + +/** + * sdma_txadd_page() - add a page to the sdma_txreq + * @dd: the device to use for mapping + * @tx: tx request to which the page is added + * @page: page to map + * @offset: offset within the page + * @len: length in bytes + * + * This is used to add a page/offset/length descriptor. + * + * The mapping/unmapping of the page/offset/len is automatically handled. 
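+ *
+ * Usage note (a sketch; names other than the calls themselves are
+ * illustrative): on a mapping failure the helper has already called
+ * __sdma_txclean(), so the caller only unwinds its own state:
+ *
+ *        ret = sdma_txadd_page(dd, &req->txreq, page, offset, len);
+ *        if (ret)
+ *                return ret;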
+ * + * Return: + * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't + * extend/coalesce descriptor array + */ +static inline int sdma_txadd_page( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + struct page *page, + unsigned long offset, + u16 len) +{ + dma_addr_t addr; + int rval; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_PAGE, + NULL, page, offset, len); + if (rval <= 0) + return rval; + } + + addr = dma_map_page( + &dd->pcidev->dev, + page, + offset, + len, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + __sdma_txclean(dd, tx); + return -ENOSPC; + } + + return _sdma_txadd_daddr( + dd, SDMA_MAP_PAGE, tx, addr, len); +} + +/** + * sdma_txadd_daddr() - add a dma address to the sdma_txreq + * @dd: the device to use for mapping + * @tx: sdma_txreq to which the page is added + * @addr: dma address mapped by caller + * @len: length in bytes + * + * This is used to add a descriptor for memory that is already dma mapped. + * + * In this case, there is no unmapping as part of the progress processing for + * this memory location. + * + * Return: + * 0 - success, -ENOMEM - couldn't extend descriptor array + */ + +static inline int sdma_txadd_daddr( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + dma_addr_t addr, + u16 len) +{ + int rval; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_NONE, + NULL, NULL, 0, 0); + if (rval <= 0) + return rval; + } + + return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len); +} + +/** + * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq + * @dd: the device to use for mapping + * @tx: sdma_txreq to which the page is added + * @kvaddr: the kernel virtual address + * @len: length in bytes + * + * This is used to add a descriptor referenced by the indicated kvaddr and + * len. + * + * The mapping/unmapping of the kvaddr and len is automatically handled. + * + * Return: + * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend/coalesce + * descriptor array + */ +static inline int sdma_txadd_kvaddr( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + void *kvaddr, + u16 len) +{ + dma_addr_t addr; + int rval; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_SINGLE, + kvaddr, NULL, 0, len); + if (rval <= 0) + return rval; + } + + addr = dma_map_single( + &dd->pcidev->dev, + kvaddr, + len, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + __sdma_txclean(dd, tx); + return -ENOSPC; + } + + return _sdma_txadd_daddr( + dd, SDMA_MAP_SINGLE, tx, addr, len); +} + +struct iowait; + +int sdma_send_txreq(struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx, + bool pkts_sent); +int sdma_send_txlist(struct sdma_engine *sde, + struct iowait *wait, + struct list_head *tx_list, + u32 *count); + +int sdma_ahg_alloc(struct sdma_engine *sde); +void sdma_ahg_free(struct sdma_engine *sde, int ahg_index); + +/** + * sdma_build_ahg - build ahg descriptor + * @data + * @dwindex + * @startbit + * @bits + * + * Build and return a 32 bit descriptor. 
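+ *
+ * For example (values are illustrative only), replacing 8 bits starting
+ * at bit 16 of header dword 3 with the value 0x5a could be encoded as:
+ *
+ *        ahg[0] = sdma_build_ahg_descriptor(0x5a, 3, 16, 8);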
+ */ +static inline u32 sdma_build_ahg_descriptor( + u16 data, + u8 dwindex, + u8 startbit, + u8 bits) +{ + return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT | + ((startbit & SDMA_AHG_FIELD_START_MASK) << + SDMA_AHG_FIELD_START_SHIFT) | + ((bits & SDMA_AHG_FIELD_LEN_MASK) << + SDMA_AHG_FIELD_LEN_SHIFT) | + ((dwindex & SDMA_AHG_INDEX_MASK) << + SDMA_AHG_INDEX_SHIFT) | + ((data & SDMA_AHG_VALUE_MASK) << + SDMA_AHG_VALUE_SHIFT)); +} + +/** + * sdma_progress - use seq number of detect head progress + * @sde: sdma_engine to check + * @seq: base seq count + * @tx: txreq for which we need to check descriptor availability + * + * This is used in the appropriate spot in the sleep routine + * to check for potential ring progress. This routine gets the + * seqcount before queuing the iowait structure for progress. + * + * If the seqcount indicates that progress needs to be checked, + * re-submission is detected by checking whether the descriptor + * queue has enough descriptor for the txreq. + */ +static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq, + struct sdma_txreq *tx) +{ + if (read_seqretry(&sde->head_lock, seq)) { + sde->desc_avail = sdma_descq_freecnt(sde); + if (tx->num_desc > sde->desc_avail) + return 0; + return 1; + } + return 0; +} + +/** + * sdma_iowait_schedule() - initialize wait structure + * @sde: sdma_engine to schedule + * @wait: wait struct to schedule + * + * This function initializes the iowait + * structure embedded in the QP or PQ. + * + */ +static inline void sdma_iowait_schedule( + struct sdma_engine *sde, + struct iowait *wait) +{ + struct hfi1_pportdata *ppd = sde->dd->pport; + + iowait_schedule(wait, ppd->hfi1_wq, sde->cpu); +} + +/* for use by interrupt handling */ +void sdma_engine_error(struct sdma_engine *sde, u64 status); +void sdma_engine_interrupt(struct sdma_engine *sde, u64 status); + +/* + * + * The diagram below details the relationship of the mapping structures + * + * Since the mapping now allows for non-uniform engines per vl, the + * number of engines for a vl is either the vl_engines[vl] or + * a computation based on num_sdma/num_vls: + * + * For example: + * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls + * + * n = roundup to next highest power of 2 using nactual + * + * In the case where there are num_sdma/num_vls doesn't divide + * evenly, the extras are added from the last vl downward. + * + * For the case where n > nactual, the engines are assigned + * in a round robin fashion wrapping back to the first engine + * for a particular vl. 
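+ *
+ * A worked example (numbers are illustrative only): with num_sdma = 12,
+ * num_vls = 4 and no vl_engines[] override, nactual = 12 / 4 = 3 for
+ * every vl and n is rounded up to 4, so each sdma_map_elem carries a
+ * mask of 3 and four sde[] slots, the fourth slot wrapping back to the
+ * first engine assigned to that vl.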
+ * + * dd->sdma_map + * | sdma_map_elem[0] + * | +--------------------+ + * v | mask | + * sdma_vl_map |--------------------| + * +--------------------------+ | sde[0] -> eng 1 | + * | list (RCU) | |--------------------| + * |--------------------------| ->| sde[1] -> eng 2 | + * | mask | --/ |--------------------| + * |--------------------------| -/ | * | + * | actual_vls (max 8) | -/ |--------------------| + * |--------------------------| --/ | sde[n-1] -> eng n | + * | vls (max 8) | -/ +--------------------+ + * |--------------------------| --/ + * | map[0] |-/ + * |--------------------------| +---------------------+ + * | map[1] |--- | mask | + * |--------------------------| \---- |---------------------| + * | * | \-- | sde[0] -> eng 1+n | + * | * | \---- |---------------------| + * | * | \->| sde[1] -> eng 2+n | + * |--------------------------| |---------------------| + * | map[vls - 1] |- | * | + * +--------------------------+ \- |---------------------| + * \- | sde[m-1] -> eng m+n | + * \ +---------------------+ + * \- + * \ + * \- +----------------------+ + * \- | mask | + * \ |----------------------| + * \- | sde[0] -> eng 1+m+n | + * \- |----------------------| + * >| sde[1] -> eng 2+m+n | + * |----------------------| + * | * | + * |----------------------| + * | sde[o-1] -> eng o+m+n| + * +----------------------+ + * + */ + +/** + * struct sdma_map_elem - mapping for a vl + * @mask - selector mask + * @sde - array of engines for this vl + * + * The mask is used to "mod" the selector + * to produce index into the trailing + * array of sdes. + */ +struct sdma_map_elem { + u32 mask; + struct sdma_engine *sde[0]; +}; + +/** + * struct sdma_map_el - mapping for a vl + * @engine_to_vl - map of an engine to a vl + * @list - rcu head for free callback + * @mask - vl mask to "mod" the vl to produce an index to map array + * @actual_vls - number of vls + * @vls - number of vls rounded to next power of 2 + * @map - array of sdma_map_elem entries + * + * This is the parent mapping structure. The trailing + * members of the struct point to sdma_map_elem entries, which + * in turn point to an array of sde's for that vl. + */ +struct sdma_vl_map { + s8 engine_to_vl[TXE_NUM_SDMA_ENGINES]; + struct rcu_head list; + u32 mask; + u8 actual_vls; + u8 vls; + struct sdma_map_elem *map[0]; +}; + +int sdma_map_init( + struct hfi1_devdata *dd, + u8 port, + u8 num_vls, + u8 *vl_engines); + +/* slow path */ +void _sdma_engine_progress_schedule(struct sdma_engine *sde); + +/** + * sdma_engine_progress_schedule() - schedule progress on engine + * @sde: sdma_engine to schedule progress + * + * This is the fast path. 
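+ * It falls through to the slow path, _sdma_engine_progress_schedule(),
+ * only once at least descq_cnt / 8 descriptors are in process, so the
+ * common case reduces to a single comparison.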
+ * + */ +static inline void sdma_engine_progress_schedule( + struct sdma_engine *sde) +{ + if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8)) + return; + _sdma_engine_progress_schedule(sde); +} + +struct sdma_engine *sdma_select_engine_sc( + struct hfi1_devdata *dd, + u32 selector, + u8 sc5); + +struct sdma_engine *sdma_select_engine_vl( + struct hfi1_devdata *dd, + u32 selector, + u8 vl); + +struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, + u32 selector, u8 vl); +ssize_t sdma_get_cpu_to_sde_map(struct sdma_engine *sde, char *buf); +ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, + size_t count); +int sdma_engine_get_vl(struct sdma_engine *sde); +void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *); +void sdma_seqfile_dump_cpu_list(struct seq_file *s, struct hfi1_devdata *dd, + unsigned long cpuid); + +#ifdef CONFIG_SDMA_VERBOSITY +void sdma_dumpstate(struct sdma_engine *); +#endif +static inline char *slashstrip(char *s) +{ + char *r = s; + + while (*s) + if (*s++ == '/') + r = s; + return r; +} + +u16 sdma_get_descq_cnt(void); + +extern uint mod_num_sdma; + +void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid); + +#endif diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h new file mode 100644 index 0000000..bf7d777 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h @@ -0,0 +1,135 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef HFI1_SDMA_TXREQ_H +#define HFI1_SDMA_TXREQ_H + +/* increased for AHG */ +#define NUM_DESC 6 + +/* + * struct sdma_desc - canonical fragment descriptor + * + * This is the descriptor carried in the tx request + * corresponding to each fragment. + * + */ +struct sdma_desc { + /* private: don't use directly */ + u64 qw[2]; +}; + +/** + * struct sdma_txreq - the sdma_txreq structure (one per packet) + * @list: for use by user and by queuing for wait + * + * This is the representation of a packet which consists of some + * number of fragments. Storage is provided to within the structure. + * for all fragments. + * + * The storage for the descriptors are automatically extended as needed + * when the currently allocation is exceeded. + * + * The user (Verbs or PSM) may overload this structure with fields + * specific to their use by putting this struct first in their struct. + * The method of allocation of the overloaded structure is user dependent + * + * The list is the only public field in the structure. + * + */ + +#define SDMA_TXREQ_S_OK 0 +#define SDMA_TXREQ_S_SENDERROR 1 +#define SDMA_TXREQ_S_ABORTED 2 +#define SDMA_TXREQ_S_SHUTDOWN 3 + +/* flags bits */ +#define SDMA_TXREQ_F_URGENT 0x0001 +#define SDMA_TXREQ_F_AHG_COPY 0x0002 +#define SDMA_TXREQ_F_USE_AHG 0x0004 + +struct sdma_txreq; +typedef void (*callback_t)(struct sdma_txreq *, int); + +struct iowait; +struct sdma_txreq { + struct list_head list; + /* private: */ + struct sdma_desc *descp; + /* private: */ + void *coalesce_buf; + /* private: */ + struct iowait *wait; + /* private: */ + callback_t complete; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + u64 sn; +#endif + /* private: - used in coalesce/pad processing */ + u16 packet_len; + /* private: - down-counted to trigger last */ + u16 tlen; + /* private: */ + u16 num_desc; + /* private: */ + u16 desc_limit; + /* private: */ + u16 next_descq_idx; + /* private: */ + u16 coalesce_idx; + /* private: flags */ + u16 flags; + /* private: */ + struct sdma_desc descs[NUM_DESC]; +}; + +static inline int sdma_txreq_built(struct sdma_txreq *tx) +{ + return tx->num_desc; +} + +#endif /* HFI1_SDMA_TXREQ_H */ diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c new file mode 100644 index 0000000..25e8673 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -0,0 +1,887 @@ +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include + +#include "hfi.h" +#include "mad.h" +#include "trace.h" + +/* + * Start of per-port congestion control structures and support code + */ + +/* + * Congestion control table size followed by table entries + */ +static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + struct cc_state *cc_state; + + ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow) + + sizeof(__be16); + + if (pos > ret) + return -EINVAL; + + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + rcu_read_lock(); + cc_state = get_cc_state(ppd); + if (!cc_state) { + rcu_read_unlock(); + return -EINVAL; + } + memcpy(buf, (void *)&cc_state->cct + pos, count); + rcu_read_unlock(); + + return count; +} + +static void port_release(struct kobject *kobj) +{ + /* nothing to do since memory is freed by hfi1_free_devdata() */ +} + +static const struct bin_attribute cc_table_bin_attr = { + .attr = {.name = "cc_table_bin", .mode = 0444}, + .read = read_cc_table_bin, + .size = PAGE_SIZE, +}; + +/* + * Congestion settings: port control, control map and an array of 16 + * entries for the congestion entries - increase, timer, event log + * trigger threshold and the minimum injection rate delay. 
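+ *
+ * The attribute is registered read-only under the per-port "CCMgtA"
+ * kobject created in hfi1_create_port_files() below; the resulting path
+ * is typically /sys/class/infiniband/<ibdev>/ports/<port>/CCMgtA/
+ * cc_settings_bin (shown for illustration).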
+ */ +static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + struct cc_state *cc_state; + + ret = sizeof(struct opa_congestion_setting_attr_shadow); + + if (pos > ret) + return -EINVAL; + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + rcu_read_lock(); + cc_state = get_cc_state(ppd); + if (!cc_state) { + rcu_read_unlock(); + return -EINVAL; + } + memcpy(buf, (void *)&cc_state->cong_setting + pos, count); + rcu_read_unlock(); + + return count; +} + +static const struct bin_attribute cc_setting_bin_attr = { + .attr = {.name = "cc_settings_bin", .mode = 0444}, + .read = read_cc_setting_bin, + .size = PAGE_SIZE, +}; + +struct hfi1_port_attr { + struct attribute attr; + ssize_t (*show)(struct hfi1_pportdata *, char *); + ssize_t (*store)(struct hfi1_pportdata *, const char *, size_t); +}; + +static ssize_t cc_prescan_show(struct hfi1_pportdata *ppd, char *buf) +{ + return sprintf(buf, "%s\n", ppd->cc_prescan ? "on" : "off"); +} + +static ssize_t cc_prescan_store(struct hfi1_pportdata *ppd, const char *buf, + size_t count) +{ + if (!memcmp(buf, "on", 2)) + ppd->cc_prescan = true; + else if (!memcmp(buf, "off", 3)) + ppd->cc_prescan = false; + + return count; +} + +static struct hfi1_port_attr cc_prescan_attr = + __ATTR(cc_prescan, 0600, cc_prescan_show, cc_prescan_store); + +static ssize_t cc_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_port_attr *port_attr = + container_of(attr, struct hfi1_port_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + + return port_attr->show(ppd, buf); +} + +static ssize_t cc_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct hfi1_port_attr *port_attr = + container_of(attr, struct hfi1_port_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + + return port_attr->store(ppd, buf, count); +} + +static const struct sysfs_ops port_cc_sysfs_ops = { + .show = cc_attr_show, + .store = cc_attr_store +}; + +static struct attribute *port_cc_default_attributes[] = { + &cc_prescan_attr.attr, + NULL +}; + +static struct kobj_type port_cc_ktype = { + .release = port_release, + .sysfs_ops = &port_cc_sysfs_ops, + .default_attrs = port_cc_default_attributes +}; + +/* Start sc2vl */ +#define HFI1_SC2VL_ATTR(N) \ + static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .sc = N \ + } + +struct hfi1_sc2vl_attr { + struct attribute attr; + int sc; +}; + +HFI1_SC2VL_ATTR(0); +HFI1_SC2VL_ATTR(1); +HFI1_SC2VL_ATTR(2); +HFI1_SC2VL_ATTR(3); +HFI1_SC2VL_ATTR(4); +HFI1_SC2VL_ATTR(5); +HFI1_SC2VL_ATTR(6); +HFI1_SC2VL_ATTR(7); +HFI1_SC2VL_ATTR(8); +HFI1_SC2VL_ATTR(9); +HFI1_SC2VL_ATTR(10); +HFI1_SC2VL_ATTR(11); +HFI1_SC2VL_ATTR(12); +HFI1_SC2VL_ATTR(13); +HFI1_SC2VL_ATTR(14); +HFI1_SC2VL_ATTR(15); +HFI1_SC2VL_ATTR(16); +HFI1_SC2VL_ATTR(17); +HFI1_SC2VL_ATTR(18); +HFI1_SC2VL_ATTR(19); +HFI1_SC2VL_ATTR(20); +HFI1_SC2VL_ATTR(21); +HFI1_SC2VL_ATTR(22); +HFI1_SC2VL_ATTR(23); +HFI1_SC2VL_ATTR(24); +HFI1_SC2VL_ATTR(25); +HFI1_SC2VL_ATTR(26); +HFI1_SC2VL_ATTR(27); +HFI1_SC2VL_ATTR(28); +HFI1_SC2VL_ATTR(29); +HFI1_SC2VL_ATTR(30); +HFI1_SC2VL_ATTR(31); + +static struct attribute *sc2vl_default_attributes[] = { + 
&hfi1_sc2vl_attr_0.attr, + &hfi1_sc2vl_attr_1.attr, + &hfi1_sc2vl_attr_2.attr, + &hfi1_sc2vl_attr_3.attr, + &hfi1_sc2vl_attr_4.attr, + &hfi1_sc2vl_attr_5.attr, + &hfi1_sc2vl_attr_6.attr, + &hfi1_sc2vl_attr_7.attr, + &hfi1_sc2vl_attr_8.attr, + &hfi1_sc2vl_attr_9.attr, + &hfi1_sc2vl_attr_10.attr, + &hfi1_sc2vl_attr_11.attr, + &hfi1_sc2vl_attr_12.attr, + &hfi1_sc2vl_attr_13.attr, + &hfi1_sc2vl_attr_14.attr, + &hfi1_sc2vl_attr_15.attr, + &hfi1_sc2vl_attr_16.attr, + &hfi1_sc2vl_attr_17.attr, + &hfi1_sc2vl_attr_18.attr, + &hfi1_sc2vl_attr_19.attr, + &hfi1_sc2vl_attr_20.attr, + &hfi1_sc2vl_attr_21.attr, + &hfi1_sc2vl_attr_22.attr, + &hfi1_sc2vl_attr_23.attr, + &hfi1_sc2vl_attr_24.attr, + &hfi1_sc2vl_attr_25.attr, + &hfi1_sc2vl_attr_26.attr, + &hfi1_sc2vl_attr_27.attr, + &hfi1_sc2vl_attr_28.attr, + &hfi1_sc2vl_attr_29.attr, + &hfi1_sc2vl_attr_30.attr, + &hfi1_sc2vl_attr_31.attr, + NULL +}; + +static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_sc2vl_attr *sattr = + container_of(attr, struct hfi1_sc2vl_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, sc2vl_kobj); + struct hfi1_devdata *dd = ppd->dd; + + return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc)); +} + +static const struct sysfs_ops hfi1_sc2vl_ops = { + .show = sc2vl_attr_show, +}; + +static struct kobj_type hfi1_sc2vl_ktype = { + .release = port_release, + .sysfs_ops = &hfi1_sc2vl_ops, + .default_attrs = sc2vl_default_attributes +}; + +/* End sc2vl */ + +/* Start sl2sc */ +#define HFI1_SL2SC_ATTR(N) \ + static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .sl = N \ + } + +struct hfi1_sl2sc_attr { + struct attribute attr; + int sl; +}; + +HFI1_SL2SC_ATTR(0); +HFI1_SL2SC_ATTR(1); +HFI1_SL2SC_ATTR(2); +HFI1_SL2SC_ATTR(3); +HFI1_SL2SC_ATTR(4); +HFI1_SL2SC_ATTR(5); +HFI1_SL2SC_ATTR(6); +HFI1_SL2SC_ATTR(7); +HFI1_SL2SC_ATTR(8); +HFI1_SL2SC_ATTR(9); +HFI1_SL2SC_ATTR(10); +HFI1_SL2SC_ATTR(11); +HFI1_SL2SC_ATTR(12); +HFI1_SL2SC_ATTR(13); +HFI1_SL2SC_ATTR(14); +HFI1_SL2SC_ATTR(15); +HFI1_SL2SC_ATTR(16); +HFI1_SL2SC_ATTR(17); +HFI1_SL2SC_ATTR(18); +HFI1_SL2SC_ATTR(19); +HFI1_SL2SC_ATTR(20); +HFI1_SL2SC_ATTR(21); +HFI1_SL2SC_ATTR(22); +HFI1_SL2SC_ATTR(23); +HFI1_SL2SC_ATTR(24); +HFI1_SL2SC_ATTR(25); +HFI1_SL2SC_ATTR(26); +HFI1_SL2SC_ATTR(27); +HFI1_SL2SC_ATTR(28); +HFI1_SL2SC_ATTR(29); +HFI1_SL2SC_ATTR(30); +HFI1_SL2SC_ATTR(31); + +static struct attribute *sl2sc_default_attributes[] = { + &hfi1_sl2sc_attr_0.attr, + &hfi1_sl2sc_attr_1.attr, + &hfi1_sl2sc_attr_2.attr, + &hfi1_sl2sc_attr_3.attr, + &hfi1_sl2sc_attr_4.attr, + &hfi1_sl2sc_attr_5.attr, + &hfi1_sl2sc_attr_6.attr, + &hfi1_sl2sc_attr_7.attr, + &hfi1_sl2sc_attr_8.attr, + &hfi1_sl2sc_attr_9.attr, + &hfi1_sl2sc_attr_10.attr, + &hfi1_sl2sc_attr_11.attr, + &hfi1_sl2sc_attr_12.attr, + &hfi1_sl2sc_attr_13.attr, + &hfi1_sl2sc_attr_14.attr, + &hfi1_sl2sc_attr_15.attr, + &hfi1_sl2sc_attr_16.attr, + &hfi1_sl2sc_attr_17.attr, + &hfi1_sl2sc_attr_18.attr, + &hfi1_sl2sc_attr_19.attr, + &hfi1_sl2sc_attr_20.attr, + &hfi1_sl2sc_attr_21.attr, + &hfi1_sl2sc_attr_22.attr, + &hfi1_sl2sc_attr_23.attr, + &hfi1_sl2sc_attr_24.attr, + &hfi1_sl2sc_attr_25.attr, + &hfi1_sl2sc_attr_26.attr, + &hfi1_sl2sc_attr_27.attr, + &hfi1_sl2sc_attr_28.attr, + &hfi1_sl2sc_attr_29.attr, + &hfi1_sl2sc_attr_30.attr, + &hfi1_sl2sc_attr_31.attr, + NULL +}; + +static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_sl2sc_attr *sattr = 
+ container_of(attr, struct hfi1_sl2sc_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, sl2sc_kobj); + struct hfi1_ibport *ibp = &ppd->ibport_data; + + return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]); +} + +static const struct sysfs_ops hfi1_sl2sc_ops = { + .show = sl2sc_attr_show, +}; + +static struct kobj_type hfi1_sl2sc_ktype = { + .release = port_release, + .sysfs_ops = &hfi1_sl2sc_ops, + .default_attrs = sl2sc_default_attributes +}; + +/* End sl2sc */ + +/* Start vl2mtu */ + +#define HFI1_VL2MTU_ATTR(N) \ + static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .vl = N \ + } + +struct hfi1_vl2mtu_attr { + struct attribute attr; + int vl; +}; + +HFI1_VL2MTU_ATTR(0); +HFI1_VL2MTU_ATTR(1); +HFI1_VL2MTU_ATTR(2); +HFI1_VL2MTU_ATTR(3); +HFI1_VL2MTU_ATTR(4); +HFI1_VL2MTU_ATTR(5); +HFI1_VL2MTU_ATTR(6); +HFI1_VL2MTU_ATTR(7); +HFI1_VL2MTU_ATTR(8); +HFI1_VL2MTU_ATTR(9); +HFI1_VL2MTU_ATTR(10); +HFI1_VL2MTU_ATTR(11); +HFI1_VL2MTU_ATTR(12); +HFI1_VL2MTU_ATTR(13); +HFI1_VL2MTU_ATTR(14); +HFI1_VL2MTU_ATTR(15); + +static struct attribute *vl2mtu_default_attributes[] = { + &hfi1_vl2mtu_attr_0.attr, + &hfi1_vl2mtu_attr_1.attr, + &hfi1_vl2mtu_attr_2.attr, + &hfi1_vl2mtu_attr_3.attr, + &hfi1_vl2mtu_attr_4.attr, + &hfi1_vl2mtu_attr_5.attr, + &hfi1_vl2mtu_attr_6.attr, + &hfi1_vl2mtu_attr_7.attr, + &hfi1_vl2mtu_attr_8.attr, + &hfi1_vl2mtu_attr_9.attr, + &hfi1_vl2mtu_attr_10.attr, + &hfi1_vl2mtu_attr_11.attr, + &hfi1_vl2mtu_attr_12.attr, + &hfi1_vl2mtu_attr_13.attr, + &hfi1_vl2mtu_attr_14.attr, + &hfi1_vl2mtu_attr_15.attr, + NULL +}; + +static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_vl2mtu_attr *vlattr = + container_of(attr, struct hfi1_vl2mtu_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj); + struct hfi1_devdata *dd = ppd->dd; + + return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu); +} + +static const struct sysfs_ops hfi1_vl2mtu_ops = { + .show = vl2mtu_attr_show, +}; + +static struct kobj_type hfi1_vl2mtu_ktype = { + .release = port_release, + .sysfs_ops = &hfi1_vl2mtu_ops, + .default_attrs = vl2mtu_default_attributes +}; + +/* end of per-port file structures and support code */ + +/* + * Start of per-unit (or driver, in some cases, but replicated + * per unit) functions (these get a device *) + */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + + return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); +} + +static ssize_t show_hfi(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + int ret; + + if (!dd->boardname) + ret = -EINVAL; + else + ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); + return ret; +} + +static ssize_t show_boardversion(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* The string printed here is already newline-terminated. 
*/ + return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); +} + +static ssize_t show_nctxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* + * Return the smaller of send and receive contexts. + * Normally, user level applications would require both a send + * and a receive context, so returning the smaller of the two counts + * give a more accurate picture of total contexts available. + */ + return scnprintf(buf, PAGE_SIZE, "%u\n", + min(dd->num_user_contexts, + (u32)dd->sc_sizes[SC_USER].count)); +} + +static ssize_t show_nfreectxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* Return the number of free user ports (contexts) available. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); +} + +static ssize_t show_serial(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + return scnprintf(buf, PAGE_SIZE, "%s", dd->serial); +} + +static ssize_t store_chip_reset(struct device *device, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) { + ret = -EINVAL; + goto bail; + } + + ret = hfi1_reset_device(dd->unit); +bail: + return ret < 0 ? ret : count; +} + +/* + * Convert the reported temperature from an integer (reported in + * units of 0.25C) to a floating point number. + */ +#define temp2str(temp, buf, size, idx) \ + scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ", \ + ((temp) >> 2), ((temp) & 0x3) * 25) + +/* + * Dump tempsense values, in decimal, to ease shell-scripts. 
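+ *
+ * For example, a raw reading of 97 (0x61) is emitted as "24.25 ":
+ * 97 >> 2 = 24 whole degrees and (97 & 0x3) * 25 = 25 hundredths.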
+ */ +static ssize_t show_tempsense(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + struct hfi1_temp temp; + int ret; + + ret = hfi1_tempsense_rd(dd, &temp); + if (!ret) { + int idx = 0; + + idx += temp2str(temp.curr, buf, PAGE_SIZE, idx); + idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx); + idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx); + idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx); + idx += scnprintf(buf + idx, PAGE_SIZE - idx, + "%u %u %u\n", temp.triggers & 0x1, + temp.triggers & 0x2, temp.triggers & 0x4); + ret = idx; + } + return ret; +} + +/* + * end of per-unit (or driver, in some cases, but replicated + * per unit) functions + */ + +/* start of per-unit file structures and support code */ +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL); +static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); +static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); +static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); +static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); +static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); +static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); + +static struct device_attribute *hfi1_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_board_id, + &dev_attr_nctxts, + &dev_attr_nfreectxts, + &dev_attr_serial, + &dev_attr_boardversion, + &dev_attr_tempsense, + &dev_attr_chip_reset, +}; + +int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int ret; + + if (!port_num || port_num > dd->num_pports) { + dd_dev_err(dd, + "Skipping infiniband class with invalid port %u\n", + port_num); + return -ENODEV; + } + ppd = &dd->pport[port_num - 1]; + + ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj, + "sc2vl"); + if (ret) { + dd_dev_err(dd, + "Skipping sc2vl sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail; + } + kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj, + "sl2sc"); + if (ret) { + dd_dev_err(dd, + "Skipping sl2sc sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_sc2vl; + } + kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj, + "vl2mtu"); + if (ret) { + dd_dev_err(dd, + "Skipping vl2mtu sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_sl2sc; + } + kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype, + kobj, "CCMgtA"); + if (ret) { + dd_dev_err(dd, + "Skipping Congestion Control sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_vl2mtu; + } + + kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD); + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr); + if (ret) { + dd_dev_err(dd, + "Skipping Congestion Control setting sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc; + } + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_table_bin_attr); + if (ret) { + dd_dev_err(dd, + "Skipping Congestion Control table sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc_entry_bin; + } + + dd_dev_info(dd, + "Congestion Control Agent enabled for port %d\n", + port_num); + + return 0; + 
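+        /*
+         * Error unwind: remove the bin file and drop the kobjects in the
+         * reverse order of their creation above.
+         */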
+bail_cc_entry_bin: + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); +bail_cc: + kobject_put(&ppd->pport_cc_kobj); +bail_vl2mtu: + kobject_put(&ppd->vl2mtu_kobj); +bail_sl2sc: + kobject_put(&ppd->sl2sc_kobj); +bail_sc2vl: + kobject_put(&ppd->sc2vl_kobj); +bail: + return ret; +} + +struct sde_attribute { + struct attribute attr; + ssize_t (*show)(struct sdma_engine *sde, char *buf); + ssize_t (*store)(struct sdma_engine *sde, const char *buf, size_t cnt); +}; + +static ssize_t sde_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct sde_attribute *sde_attr = + container_of(attr, struct sde_attribute, attr); + struct sdma_engine *sde = + container_of(kobj, struct sdma_engine, kobj); + + if (!sde_attr->show) + return -EINVAL; + + return sde_attr->show(sde, buf); +} + +static ssize_t sde_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct sde_attribute *sde_attr = + container_of(attr, struct sde_attribute, attr); + struct sdma_engine *sde = + container_of(kobj, struct sdma_engine, kobj); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!sde_attr->store) + return -EINVAL; + + return sde_attr->store(sde, buf, count); +} + +static const struct sysfs_ops sde_sysfs_ops = { + .show = sde_show, + .store = sde_store, +}; + +static struct kobj_type sde_ktype = { + .sysfs_ops = &sde_sysfs_ops, +}; + +#define SDE_ATTR(_name, _mode, _show, _store) \ + struct sde_attribute sde_attr_##_name = \ + __ATTR(_name, _mode, _show, _store) + +static ssize_t sde_show_cpu_to_sde_map(struct sdma_engine *sde, char *buf) +{ + return sdma_get_cpu_to_sde_map(sde, buf); +} + +static ssize_t sde_store_cpu_to_sde_map(struct sdma_engine *sde, + const char *buf, size_t count) +{ + return sdma_set_cpu_to_sde_map(sde, buf, count); +} + +static ssize_t sde_show_vl(struct sdma_engine *sde, char *buf) +{ + int vl; + + vl = sdma_engine_get_vl(sde); + if (vl < 0) + return vl; + + return snprintf(buf, PAGE_SIZE, "%d\n", vl); +} + +static SDE_ATTR(cpu_list, S_IWUSR | S_IRUGO, + sde_show_cpu_to_sde_map, + sde_store_cpu_to_sde_map); +static SDE_ATTR(vl, S_IRUGO, sde_show_vl, NULL); + +static struct sde_attribute *sde_attribs[] = { + &sde_attr_cpu_list, + &sde_attr_vl +}; + +/* + * Register and create our files in /sys/class/infiniband. + */ +int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) +{ + struct ib_device *dev = &dd->verbs_dev.rdi.ibdev; + struct device *class_dev = &dev->dev; + int i, j, ret; + + for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) { + ret = device_create_file(&dev->dev, hfi1_attributes[i]); + if (ret) + goto bail; + } + + for (i = 0; i < dd->num_sdma; i++) { + ret = kobject_init_and_add(&dd->per_sdma[i].kobj, + &sde_ktype, &class_dev->kobj, + "sdma%d", i); + if (ret) + goto bail; + + for (j = 0; j < ARRAY_SIZE(sde_attribs); j++) { + ret = sysfs_create_file(&dd->per_sdma[i].kobj, + &sde_attribs[j]->attr); + if (ret) + goto bail; + } + } + + return 0; +bail: + for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) + device_remove_file(&dev->dev, hfi1_attributes[i]); + + for (i = 0; i < dd->num_sdma; i++) + kobject_del(&dd->per_sdma[i].kobj); + + return ret; +} + +/* + * Unregister and remove our files in /sys/class/infiniband. 
+ */ +void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + for (i = 0; i < dd->num_pports; i++) { + ppd = &dd->pport[i]; + + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_table_bin_attr); + kobject_put(&ppd->pport_cc_kobj); + kobject_put(&ppd->vl2mtu_kobj); + kobject_put(&ppd->sl2sc_kobj); + kobject_put(&ppd->sc2vl_kobj); + } +} diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c new file mode 100644 index 0000000..89bd985 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -0,0 +1,390 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#define CREATE_TRACE_POINTS +#include "trace.h" + +static u8 __get_ib_hdr_len(struct ib_header *hdr) +{ + struct ib_other_headers *ohdr; + u8 opcode; + + if (ib_get_lnh(hdr) == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + opcode = ib_bth_get_opcode(ohdr); + return hdr_len_by_opcode[opcode] == 0 ? + 0 : hdr_len_by_opcode[opcode] - (12 + 8); +} + +static u8 __get_16b_hdr_len(struct hfi1_16b_header *hdr) +{ + struct ib_other_headers *ohdr; + u8 opcode; + + if (hfi1_16B_get_l4(hdr) == OPA_16B_L4_IB_LOCAL) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + opcode = ib_bth_get_opcode(ohdr); + return hdr_len_by_opcode[opcode] == 0 ? 
+ 0 : hdr_len_by_opcode[opcode] - (12 + 8 + 8); +} + +u8 hfi1_trace_packet_hdr_len(struct hfi1_packet *packet) +{ + if (packet->etype != RHF_RCV_TYPE_BYPASS) + return __get_ib_hdr_len(packet->hdr); + else + return __get_16b_hdr_len(packet->hdr); +} + +u8 hfi1_trace_opa_hdr_len(struct hfi1_opa_header *opa_hdr) +{ + if (!opa_hdr->hdr_type) + return __get_ib_hdr_len(&opa_hdr->ibh); + else + return __get_16b_hdr_len(&opa_hdr->opah); +} + +const char *hfi1_trace_get_packet_l4_str(u8 l4) +{ + if (l4) + return "16B"; + else + return "9B"; +} + +const char *hfi1_trace_get_packet_l2_str(u8 l2) +{ + switch (l2) { + case 0: + return "0"; + case 1: + return "1"; + case 2: + return "16B"; + case 3: + return "9B"; + } + return ""; +} + +#define IMM_PRN "imm:%d" +#define RETH_PRN "reth vaddr:0x%.16llx rkey:0x%.8x dlen:0x%.8x" +#define AETH_PRN "aeth syn:0x%.2x %s msn:0x%.8x" +#define DETH_PRN "deth qkey:0x%.8x sqpn:0x%.6x" +#define IETH_PRN "ieth rkey:0x%.8x" +#define ATOMICACKETH_PRN "origdata:%llx" +#define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx" + +#define OP(transport, op) IB_OPCODE_## transport ## _ ## op + +static const char *parse_syndrome(u8 syndrome) +{ + switch (syndrome >> 5) { + case 0: + return "ACK"; + case 1: + return "RNRNAK"; + case 3: + return "NAK"; + } + return ""; +} + +void hfi1_trace_parse_9b_bth(struct ib_other_headers *ohdr, + u8 *ack, bool *becn, bool *fecn, u8 *mig, + u8 *se, u8 *pad, u8 *opcode, u8 *tver, + u16 *pkey, u32 *psn, u32 *qpn) +{ + *ack = ib_bth_get_ackreq(ohdr); + *becn = ib_bth_get_becn(ohdr); + *fecn = ib_bth_get_fecn(ohdr); + *mig = ib_bth_get_migreq(ohdr); + *se = ib_bth_get_se(ohdr); + *pad = ib_bth_get_pad(ohdr); + *opcode = ib_bth_get_opcode(ohdr); + *tver = ib_bth_get_tver(ohdr); + *pkey = ib_bth_get_pkey(ohdr); + *psn = mask_psn(ib_bth_get_psn(ohdr)); + *qpn = ib_bth_get_qpn(ohdr); +} + +void hfi1_trace_parse_16b_bth(struct ib_other_headers *ohdr, + u8 *ack, u8 *mig, u8 *opcode, + u8 *pad, u8 *se, u8 *tver, + u32 *psn, u32 *qpn) +{ + *ack = ib_bth_get_ackreq(ohdr); + *mig = ib_bth_get_migreq(ohdr); + *opcode = ib_bth_get_opcode(ohdr); + *pad = ib_bth_get_pad(ohdr); + *se = ib_bth_get_se(ohdr); + *tver = ib_bth_get_tver(ohdr); + *psn = mask_psn(ib_bth_get_psn(ohdr)); + *qpn = ib_bth_get_qpn(ohdr); +} + +void hfi1_trace_parse_9b_hdr(struct ib_header *hdr, bool sc5, + u8 *lnh, u8 *lver, u8 *sl, u8 *sc, + u16 *len, u32 *dlid, u32 *slid) +{ + *lnh = ib_get_lnh(hdr); + *lver = ib_get_lver(hdr); + *sl = ib_get_sl(hdr); + *sc = ib_get_sc(hdr) | (sc5 << 4); + *len = ib_get_len(hdr); + *dlid = ib_get_dlid(hdr); + *slid = ib_get_slid(hdr); +} + +void hfi1_trace_parse_16b_hdr(struct hfi1_16b_header *hdr, + u8 *age, bool *becn, bool *fecn, + u8 *l4, u8 *rc, u8 *sc, + u16 *entropy, u16 *len, u16 *pkey, + u32 *dlid, u32 *slid) +{ + *age = hfi1_16B_get_age(hdr); + *becn = hfi1_16B_get_becn(hdr); + *fecn = hfi1_16B_get_fecn(hdr); + *l4 = hfi1_16B_get_l4(hdr); + *rc = hfi1_16B_get_rc(hdr); + *sc = hfi1_16B_get_sc(hdr); + *entropy = hfi1_16B_get_entropy(hdr); + *len = hfi1_16B_get_len(hdr); + *pkey = hfi1_16B_get_pkey(hdr); + *dlid = hfi1_16B_get_dlid(hdr); + *slid = hfi1_16B_get_slid(hdr); +} + +#define LRH_PRN "len:%d sc:%d dlid:0x%.4x slid:0x%.4x " +#define LRH_9B_PRN "lnh:%d,%s lver:%d sl:%d" +#define LRH_16B_PRN "age:%d becn:%d fecn:%d l4:%d " \ + "rc:%d sc:%d pkey:0x%.4x entropy:0x%.4x" +const char *hfi1_trace_fmt_lrh(struct trace_seq *p, bool bypass, + u8 age, bool becn, bool fecn, u8 l4, + u8 lnh, const char *lnh_name, u8 lver, + u8 rc, u8 sc, 
u8 sl, u16 entropy, + u16 len, u16 pkey, u32 dlid, u32 slid) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, LRH_PRN, len, sc, dlid, slid); + + if (bypass) + trace_seq_printf(p, LRH_16B_PRN, + age, becn, fecn, l4, rc, sc, pkey, entropy); + + else + trace_seq_printf(p, LRH_9B_PRN, + lnh, lnh_name, lver, sl); + trace_seq_putc(p, 0); + + return ret; +} + +#define BTH_9B_PRN \ + "op:0x%.2x,%s se:%d m:%d pad:%d tver:%d pkey:0x%.4x " \ + "f:%d b:%d qpn:0x%.6x a:%d psn:0x%.8x" +#define BTH_16B_PRN \ + "op:0x%.2x,%s se:%d m:%d pad:%d tver:%d " \ + "qpn:0x%.6x a:%d psn:0x%.8x" +const char *hfi1_trace_fmt_bth(struct trace_seq *p, bool bypass, + u8 ack, bool becn, bool fecn, u8 mig, + u8 se, u8 pad, u8 opcode, const char *opname, + u8 tver, u16 pkey, u32 psn, u32 qpn) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (bypass) + trace_seq_printf(p, BTH_16B_PRN, + opcode, opname, + se, mig, pad, tver, qpn, ack, psn); + + else + trace_seq_printf(p, BTH_9B_PRN, + opcode, opname, + se, mig, pad, tver, pkey, fecn, becn, + qpn, ack, psn); + trace_seq_putc(p, 0); + + return ret; +} + +const char *parse_everbs_hdrs( + struct trace_seq *p, + u8 opcode, + void *ehdrs) +{ + union ib_ehdrs *eh = ehdrs; + const char *ret = trace_seq_buffer_ptr(p); + + switch (opcode) { + /* imm */ + case OP(RC, SEND_LAST_WITH_IMMEDIATE): + case OP(UC, SEND_LAST_WITH_IMMEDIATE): + case OP(RC, SEND_ONLY_WITH_IMMEDIATE): + case OP(UC, SEND_ONLY_WITH_IMMEDIATE): + case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE): + case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE): + trace_seq_printf(p, IMM_PRN, + be32_to_cpu(eh->imm_data)); + break; + /* reth + imm */ + case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE): + case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE): + trace_seq_printf(p, RETH_PRN " " IMM_PRN, + get_ib_reth_vaddr(&eh->rc.reth), + be32_to_cpu(eh->rc.reth.rkey), + be32_to_cpu(eh->rc.reth.length), + be32_to_cpu(eh->rc.imm_data)); + break; + /* reth */ + case OP(RC, RDMA_READ_REQUEST): + case OP(RC, RDMA_WRITE_FIRST): + case OP(UC, RDMA_WRITE_FIRST): + case OP(RC, RDMA_WRITE_ONLY): + case OP(UC, RDMA_WRITE_ONLY): + trace_seq_printf(p, RETH_PRN, + get_ib_reth_vaddr(&eh->rc.reth), + be32_to_cpu(eh->rc.reth.rkey), + be32_to_cpu(eh->rc.reth.length)); + break; + case OP(RC, RDMA_READ_RESPONSE_FIRST): + case OP(RC, RDMA_READ_RESPONSE_LAST): + case OP(RC, RDMA_READ_RESPONSE_ONLY): + case OP(RC, ACKNOWLEDGE): + trace_seq_printf(p, AETH_PRN, be32_to_cpu(eh->aeth) >> 24, + parse_syndrome(be32_to_cpu(eh->aeth) >> 24), + be32_to_cpu(eh->aeth) & IB_MSN_MASK); + break; + /* aeth + atomicacketh */ + case OP(RC, ATOMIC_ACKNOWLEDGE): + trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN, + be32_to_cpu(eh->at.aeth) >> 24, + parse_syndrome(be32_to_cpu(eh->at.aeth) >> 24), + be32_to_cpu(eh->at.aeth) & IB_MSN_MASK, + ib_u64_get(&eh->at.atomic_ack_eth)); + break; + /* atomiceth */ + case OP(RC, COMPARE_SWAP): + case OP(RC, FETCH_ADD): + trace_seq_printf(p, ATOMICETH_PRN, + get_ib_ateth_vaddr(&eh->atomic_eth), + eh->atomic_eth.rkey, + get_ib_ateth_swap(&eh->atomic_eth), + get_ib_ateth_compare(&eh->atomic_eth)); + break; + /* deth */ + case OP(UD, SEND_ONLY): + case OP(UD, SEND_ONLY_WITH_IMMEDIATE): + trace_seq_printf(p, DETH_PRN, + be32_to_cpu(eh->ud.deth[0]), + be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK); + break; + /* ieth */ + case OP(RC, SEND_LAST_WITH_INVALIDATE): + case OP(RC, SEND_ONLY_WITH_INVALIDATE): + trace_seq_printf(p, IETH_PRN, + be32_to_cpu(eh->ieth)); + break; + } + trace_seq_putc(p, 0); + return ret; +} + +const char 
*parse_sdma_flags( + struct trace_seq *p, + u64 desc0, u64 desc1) +{ + const char *ret = trace_seq_buffer_ptr(p); + char flags[5] = { 'x', 'x', 'x', 'x', 0 }; + + flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? 'H' : '-'; + flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + trace_seq_printf(p, "%s", flags); + if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) + trace_seq_printf(p, " amode:%u aidx:%u alen:%u", + (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) & + SDMA_DESC1_HEADER_MODE_MASK), + (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) & + SDMA_DESC1_HEADER_INDEX_MASK), + (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) & + SDMA_DESC1_HEADER_DWS_MASK)); + return ret; +} + +const char *print_u32_array( + struct trace_seq *p, + u32 *arr, int len) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + + for (i = 0; i < len ; i++) + trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]); + trace_seq_putc(p, 0); + return ret; +} + +__hfi1_trace_fn(PKT); +__hfi1_trace_fn(PROC); +__hfi1_trace_fn(SDMA); +__hfi1_trace_fn(LINKVERB); +__hfi1_trace_fn(DEBUG); +__hfi1_trace_fn(SNOOP); +__hfi1_trace_fn(CNTR); +__hfi1_trace_fn(PIO); +__hfi1_trace_fn(DC8051); +__hfi1_trace_fn(FIRMWARE); +__hfi1_trace_fn(RCVCTRL); +__hfi1_trace_fn(TID); +__hfi1_trace_fn(MMU); +__hfi1_trace_fn(IOCTL); diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h new file mode 100644 index 0000000..8540463 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -0,0 +1,64 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } +#define show_packettype(etype) \ +__print_symbolic(etype, \ + packettype_name(EXPECTED), \ + packettype_name(EAGER), \ + packettype_name(IB), \ + packettype_name(ERROR), \ + packettype_name(BYPASS)) + +#include "trace_dbg.h" +#include "trace_misc.h" +#include "trace_ctxts.h" +#include "trace_ibhdrs.h" +#include "trace_rc.h" +#include "trace_rx.h" +#include "trace_tx.h" +#include "trace_mmu.h" diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h new file mode 100644 index 0000000..e00c8a7 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h @@ -0,0 +1,147 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+* +*/ +#if !defined(__HFI1_TRACE_CTXTS_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_CTXTS_H + +#include +#include + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ctxts + +#define UCTXT_FMT \ + "cred:%u, credaddr:0x%llx, piobase:0x%p, rcvhdr_cnt:%u, " \ + "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx, subctxt_cnt:%u" +TRACE_EVENT(hfi1_uctxtdata, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt, + unsigned int subctxt), + TP_ARGS(dd, uctxt, subctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned int, ctxt) + __field(unsigned int, subctxt) + __field(u32, credits) + __field(u64, hw_free) + __field(void __iomem *, piobase) + __field(u16, rcvhdrq_cnt) + __field(u64, rcvhdrq_dma) + __field(u32, eager_cnt) + __field(u64, rcvegr_dma) + __field(unsigned int, subctxt_cnt) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = uctxt->ctxt; + __entry->subctxt = subctxt; + __entry->credits = uctxt->sc->credits; + __entry->hw_free = le64_to_cpu(*uctxt->sc->hw_free); + __entry->piobase = uctxt->sc->base_addr; + __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt; + __entry->rcvhdrq_dma = uctxt->rcvhdrq_dma; + __entry->eager_cnt = uctxt->egrbufs.alloced; + __entry->rcvegr_dma = uctxt->egrbufs.rcvtids[0].dma; + __entry->subctxt_cnt = uctxt->subctxt_cnt; + ), + TP_printk("[%s] ctxt %u:%u " UCTXT_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->credits, + __entry->hw_free, + __entry->piobase, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_dma, + __entry->eager_cnt, + __entry->rcvegr_dma, + __entry->subctxt_cnt + ) +); + +#define CINFO_FMT \ + "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u" +TRACE_EVENT(hfi1_ctxt_info, + TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt, + unsigned int subctxt, + struct hfi1_ctxt_info *cinfo), + TP_ARGS(dd, ctxt, subctxt, cinfo), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned int, ctxt) + __field(unsigned int, subctxt) + __field(u16, egrtids) + __field(u16, rcvhdrq_cnt) + __field(u16, rcvhdrq_size) + __field(u16, sdma_ring_size) + __field(u32, rcvegr_size) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->egrtids = cinfo->egrtids; + __entry->rcvhdrq_cnt = cinfo->rcvhdrq_cnt; + __entry->rcvhdrq_size = cinfo->rcvhdrq_entsize; + __entry->sdma_ring_size = cinfo->sdma_ring_size; + __entry->rcvegr_size = cinfo->rcvegr_size; + ), + TP_printk("[%s] ctxt %u:%u " CINFO_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->egrtids, + __entry->rcvegr_size, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_size, + __entry->sdma_ring_size + ) +); + +#endif /* __HFI1_TRACE_CTXTS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_ctxts +#include diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h new file mode 100644 index 0000000..0e7d929 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -0,0 +1,155 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ +#if !defined(__HFI1_TRACE_EXTRA_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_EXTRA_H + +#include +#include + +#include "hfi.h" + +/* + * Note: + * This produces a REALLY ugly trace in the console output when the string is + * too long. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_dbg + +#define MAX_MSG_LEN 512 + +DECLARE_EVENT_CLASS(hfi1_trace_template, + TP_PROTO(const char *function, struct va_format *vaf), + TP_ARGS(function, vaf), + TP_STRUCT__entry(__string(function, function) + __dynamic_array(char, msg, MAX_MSG_LEN) + ), + TP_fast_assign(__assign_str(function, function); + WARN_ON_ONCE(vsnprintf + (__get_dynamic_array(msg), + MAX_MSG_LEN, vaf->fmt, + *vaf->va) >= + MAX_MSG_LEN); + ), + TP_printk("(%s) %s", + __get_str(function), + __get_str(msg)) +); + +/* + * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an + * actual function to work and can not be in a macro. + */ +#define __hfi1_trace_def(lvl) \ +void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \ + \ +DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \ + TP_PROTO(const char *function, struct va_format *vaf), \ + TP_ARGS(function, vaf)) + +#define __hfi1_trace_fn(lvl) \ +void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \ +{ \ + struct va_format vaf = { \ + .fmt = fmt, \ + }; \ + va_list args; \ + \ + va_start(args, fmt); \ + vaf.va = &args; \ + trace_hfi1_ ##lvl(func, &vaf); \ + va_end(args); \ + return; \ +} + +/* + * To create a new trace level simply define it below and as a __hfi1_trace_fn + * in trace.c. This will create all the hooks for calling + * hfi1_cdbg(LVL, fmt, ...); as well as take care of all + * the debugfs stuff. 
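+ *
+ * Illustrative example only (hypothetical variables, not a call site added
+ * by this patch): a caller would emit a message on the SDMA level with
+ *
+ *	hfi1_cdbg(SDMA, "engine %u stalled at head %u", engine_idx, head);
+ *
+ * which expands to __hfi1_trace_SDMA(__func__, ...) and fires the
+ * hfi1_dbg:hfi1_SDMA tracepoint declared by __hfi1_trace_def(SDMA) below
+ * whenever that event is enabled.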
+ */ +__hfi1_trace_def(PKT); +__hfi1_trace_def(PROC); +__hfi1_trace_def(SDMA); +__hfi1_trace_def(LINKVERB); +__hfi1_trace_def(DEBUG); +__hfi1_trace_def(SNOOP); +__hfi1_trace_def(CNTR); +__hfi1_trace_def(PIO); +__hfi1_trace_def(DC8051); +__hfi1_trace_def(FIRMWARE); +__hfi1_trace_def(RCVCTRL); +__hfi1_trace_def(TID); +__hfi1_trace_def(MMU); +__hfi1_trace_def(IOCTL); + +#define hfi1_cdbg(which, fmt, ...) \ + __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) + +#define hfi1_dbg(fmt, ...) \ + hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__) + +/* + * Define HFI1_EARLY_DBG at compile time or here to enable early trace + * messages. Do not check in an enablement for this. + */ + +#ifdef HFI1_EARLY_DBG +#define hfi1_dbg_early(fmt, ...) \ + trace_printk(fmt, ##__VA_ARGS__) +#else +#define hfi1_dbg_early(fmt, ...) +#endif + +#endif /* __HFI1_TRACE_EXTRA_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_dbg +#include diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h new file mode 100644 index 0000000..2847626 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -0,0 +1,446 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#if !defined(__HFI1_TRACE_IBHDRS_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_IBHDRS_H + +#include +#include + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ibhdrs + +#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } +#define show_ib_opcode(opcode) \ +__print_symbolic(opcode, \ + ib_opcode_name(RC_SEND_FIRST), \ + ib_opcode_name(RC_SEND_MIDDLE), \ + ib_opcode_name(RC_SEND_LAST), \ + ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_SEND_ONLY), \ + ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_FIRST), \ + ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(RC_RDMA_WRITE_LAST), \ + ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_READ_REQUEST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ + ib_opcode_name(RC_ACKNOWLEDGE), \ + ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ + ib_opcode_name(RC_COMPARE_SWAP), \ + ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(UC_SEND_FIRST), \ + ib_opcode_name(UC_SEND_MIDDLE), \ + ib_opcode_name(UC_SEND_LAST), \ + ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_SEND_ONLY), \ + ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_FIRST), \ + ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(UC_RDMA_WRITE_LAST), \ + ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UD_SEND_ONLY), \ + ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(CNP)) + +u8 ibhdr_exhdr_len(struct ib_header *hdr); +const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); +u8 hfi1_trace_opa_hdr_len(struct hfi1_opa_header *opah); +u8 hfi1_trace_packet_hdr_len(struct hfi1_packet *packet); +const char *hfi1_trace_get_packet_l4_str(u8 l4); +void hfi1_trace_parse_9b_bth(struct ib_other_headers *ohdr, + u8 *ack, bool *becn, bool *fecn, u8 *mig, + u8 *se, u8 *pad, u8 *opcode, u8 *tver, + u16 *pkey, u32 *psn, u32 *qpn); +void hfi1_trace_parse_9b_hdr(struct ib_header *hdr, bool sc5, + u8 *lnh, u8 *lver, u8 *sl, u8 *sc, + u16 *len, u32 *dlid, u32 *slid); +void hfi1_trace_parse_16b_bth(struct ib_other_headers *ohdr, + u8 *ack, u8 *mig, u8 *opcode, + u8 *pad, u8 *se, u8 *tver, + u32 *psn, u32 *qpn); +void hfi1_trace_parse_16b_hdr(struct hfi1_16b_header *hdr, + u8 *age, bool *becn, bool *fecn, + u8 *l4, u8 *rc, u8 *sc, + u16 *entropy, u16 *len, u16 *pkey, + u32 *dlid, u32 *slid); + +const char *hfi1_trace_fmt_lrh(struct trace_seq *p, bool bypass, + u8 age, bool becn, bool fecn, u8 l4, + u8 lnh, const char *lnh_name, u8 lver, + u8 rc, u8 sc, u8 sl, u16 entropy, + u16 len, u16 pkey, u32 dlid, u32 slid); + +const char *hfi1_trace_fmt_bth(struct trace_seq *p, bool bypass, + u8 ack, bool becn, bool fecn, u8 mig, + u8 se, u8 pad, u8 opcode, const char *opname, + u8 tver, u16 pkey, u32 psn, u32 qpn); + +const char *hfi1_trace_get_packet_l2_str(u8 l2); + +#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) + +#define lrh_name(lrh) { HFI1_##lrh, #lrh } +#define show_lnh(lrh) \ +__print_symbolic(lrh, \ + lrh_name(LRH_BTH), \ + lrh_name(LRH_GRH)) + +DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, + TP_PROTO(struct hfi1_devdata 
*dd, + struct hfi1_packet *packet, + bool sc5), + TP_ARGS(dd, packet, sc5), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u8, etype) + __field(u8, ack) + __field(u8, age) + __field(bool, becn) + __field(bool, fecn) + __field(u8, l2) + __field(u8, l4) + __field(u8, lnh) + __field(u8, lver) + __field(u8, mig) + __field(u8, opcode) + __field(u8, pad) + __field(u8, rc) + __field(u8, sc) + __field(u8, se) + __field(u8, sl) + __field(u8, tver) + __field(u16, entropy) + __field(u16, len) + __field(u16, pkey) + __field(u32, dlid) + __field(u32, psn) + __field(u32, qpn) + __field(u32, slid) + /* extended headers */ + __dynamic_array(u8, ehdrs, + hfi1_trace_packet_hdr_len(packet)) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + + __entry->etype = packet->etype; + __entry->l2 = hfi1_16B_get_l2(packet->hdr); + if (__entry->etype == RHF_RCV_TYPE_BYPASS) { + hfi1_trace_parse_16b_hdr(packet->hdr, + &__entry->age, + &__entry->becn, + &__entry->fecn, + &__entry->l4, + &__entry->rc, + &__entry->sc, + &__entry->entropy, + &__entry->len, + &__entry->pkey, + &__entry->dlid, + &__entry->slid); + + hfi1_trace_parse_16b_bth(packet->ohdr, + &__entry->ack, + &__entry->mig, + &__entry->opcode, + &__entry->pad, + &__entry->se, + &__entry->tver, + &__entry->psn, + &__entry->qpn); + } else { + hfi1_trace_parse_9b_hdr(packet->hdr, sc5, + &__entry->lnh, + &__entry->lver, + &__entry->sl, + &__entry->sc, + &__entry->len, + &__entry->dlid, + &__entry->slid); + + hfi1_trace_parse_9b_bth(packet->ohdr, + &__entry->ack, + &__entry->becn, + &__entry->fecn, + &__entry->mig, + &__entry->se, + &__entry->pad, + &__entry->opcode, + &__entry->tver, + &__entry->pkey, + &__entry->psn, + &__entry->qpn); + } + /* extended headers */ + memcpy(__get_dynamic_array(ehdrs), + &packet->ohdr->u, + __get_dynamic_array_len(ehdrs)); + ), + TP_printk("[%s] (%s) %s %s hlen:%d %s", + __get_str(dev), + __entry->etype != RHF_RCV_TYPE_BYPASS ? 
+ show_packettype(__entry->etype) : + hfi1_trace_get_packet_l2_str( + __entry->l2), + hfi1_trace_fmt_lrh(p, + __entry->etype == + RHF_RCV_TYPE_BYPASS, + __entry->age, + __entry->becn, + __entry->fecn, + __entry->l4, + __entry->lnh, + show_lnh(__entry->lnh), + __entry->lver, + __entry->rc, + __entry->sc, + __entry->sl, + __entry->entropy, + __entry->len, + __entry->pkey, + __entry->dlid, + __entry->slid), + hfi1_trace_fmt_bth(p, + __entry->etype == + RHF_RCV_TYPE_BYPASS, + __entry->ack, + __entry->becn, + __entry->fecn, + __entry->mig, + __entry->se, + __entry->pad, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->tver, + __entry->pkey, + __entry->psn, + __entry->qpn), + /* extended headers */ + __get_dynamic_array_len(ehdrs), + __parse_ib_ehdrs( + __entry->opcode, + (void *)__get_dynamic_array(ehdrs)) + ) +); + +DEFINE_EVENT(hfi1_input_ibhdr_template, input_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_packet *packet, bool sc5), + TP_ARGS(dd, packet, sc5)); + +DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u8, hdr_type) + __field(u8, ack) + __field(u8, age) + __field(bool, becn) + __field(bool, fecn) + __field(u8, l4) + __field(u8, lnh) + __field(u8, lver) + __field(u8, mig) + __field(u8, opcode) + __field(u8, pad) + __field(u8, rc) + __field(u8, sc) + __field(u8, se) + __field(u8, sl) + __field(u8, tver) + __field(u16, entropy) + __field(u16, len) + __field(u16, pkey) + __field(u32, dlid) + __field(u32, psn) + __field(u32, qpn) + __field(u32, slid) + /* extended headers */ + __dynamic_array(u8, ehdrs, + hfi1_trace_opa_hdr_len(opah)) + ), + TP_fast_assign( + struct ib_other_headers *ohdr; + + DD_DEV_ASSIGN(dd); + + __entry->hdr_type = opah->hdr_type; + if (__entry->hdr_type) { + hfi1_trace_parse_16b_hdr(&opah->opah, + &__entry->age, + &__entry->becn, + &__entry->fecn, + &__entry->l4, + &__entry->rc, + &__entry->sc, + &__entry->entropy, + &__entry->len, + &__entry->pkey, + &__entry->dlid, + &__entry->slid); + + if (__entry->l4 == OPA_16B_L4_IB_LOCAL) + ohdr = &opah->opah.u.oth; + else + ohdr = &opah->opah.u.l.oth; + hfi1_trace_parse_16b_bth(ohdr, + &__entry->ack, + &__entry->mig, + &__entry->opcode, + &__entry->pad, + &__entry->se, + &__entry->tver, + &__entry->psn, + &__entry->qpn); + } else { + __entry->l4 = OPA_16B_L4_9B; + hfi1_trace_parse_9b_hdr(&opah->ibh, sc5, + &__entry->lnh, + &__entry->lver, + &__entry->sl, + &__entry->sc, + &__entry->len, + &__entry->dlid, + &__entry->slid); + if (__entry->lnh == HFI1_LRH_BTH) + ohdr = &opah->ibh.u.oth; + else + ohdr = &opah->ibh.u.l.oth; + hfi1_trace_parse_9b_bth(ohdr, + &__entry->ack, + &__entry->becn, + &__entry->fecn, + &__entry->mig, + &__entry->se, + &__entry->pad, + &__entry->opcode, + &__entry->tver, + &__entry->pkey, + &__entry->psn, + &__entry->qpn); + } + + /* extended headers */ + memcpy(__get_dynamic_array(ehdrs), + &ohdr->u, __get_dynamic_array_len(ehdrs)); + ), + TP_printk("[%s] (%s) %s %s hlen:%d %s", + __get_str(dev), + hfi1_trace_get_packet_l4_str(__entry->l4), + hfi1_trace_fmt_lrh(p, + !!__entry->hdr_type, + __entry->age, + __entry->becn, + __entry->fecn, + __entry->l4, + __entry->lnh, + show_lnh(__entry->lnh), + __entry->lver, + __entry->rc, + __entry->sc, + __entry->sl, + __entry->entropy, + __entry->len, + __entry->pkey, + __entry->dlid, + __entry->slid), + hfi1_trace_fmt_bth(p, + !!__entry->hdr_type, + __entry->ack, + __entry->becn, + 
__entry->fecn, + __entry->mig, + __entry->se, + __entry->pad, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->tver, + __entry->pkey, + __entry->psn, + __entry->qpn), + /* extended headers */ + __get_dynamic_array_len(ehdrs), + __parse_ib_ehdrs( + __entry->opcode, + (void *)__get_dynamic_array(ehdrs)) + ) +); + +DEFINE_EVENT(hfi1_output_ibhdr_template, pio_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5)); + +DEFINE_EVENT(hfi1_output_ibhdr_template, ack_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5)); + +DEFINE_EVENT(hfi1_output_ibhdr_template, sdma_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_opa_header *opah, bool sc5), + TP_ARGS(dd, opah, sc5)); + + +#endif /* __HFI1_TRACE_IBHDRS_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_ibhdrs +#include diff --git a/drivers/infiniband/hw/hfi1/trace_misc.h b/drivers/infiniband/hw/hfi1/trace_misc.h new file mode 100644 index 0000000..8db2253 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_misc.h @@ -0,0 +1,149 @@ +/* +* Copyright(c) 2015, 2016 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+* +*/ +#if !defined(__HFI1_TRACE_MISC_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_MISC_H + +#include +#include + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_misc + +TRACE_EVENT(hfi1_interrupt, + TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry, + int src), + TP_ARGS(dd, is_entry, src), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __array(char, buf, 64) + __field(int, src) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd) + is_entry->is_name(__entry->buf, 64, + src - is_entry->start); + __entry->src = src; + ), + TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf, + __entry->src) +); + +DECLARE_EVENT_CLASS( + hfi1_csr_template, + TP_PROTO(void __iomem *addr, u64 value), + TP_ARGS(addr, value), + TP_STRUCT__entry( + __field(void __iomem *, addr) + __field(u64, value) + ), + TP_fast_assign( + __entry->addr = addr; + __entry->value = value; + ), + TP_printk("addr %p value %llx", __entry->addr, __entry->value) +); + +DEFINE_EVENT( + hfi1_csr_template, hfi1_write_rcvarray, + TP_PROTO(void __iomem *addr, u64 value), + TP_ARGS(addr, value)); + +#ifdef CONFIG_FAULT_INJECTION +TRACE_EVENT(hfi1_fault_opcode, + TP_PROTO(struct rvt_qp *qp, u8 opcode), + TP_ARGS(qp, opcode), + TP_STRUCT__entry(DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u8, opcode) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->opcode = opcode; + ), + TP_printk("[%s] qpn 0x%x opcode 0x%x", + __get_str(dev), __entry->qpn, __entry->opcode) +); + +TRACE_EVENT(hfi1_fault_packet, + TP_PROTO(struct hfi1_packet *packet), + TP_ARGS(packet), + TP_STRUCT__entry(DD_DEV_ENTRY(packet->rcd->ppd->dd) + __field(u64, eflags) + __field(u32, ctxt) + __field(u32, hlen) + __field(u32, tlen) + __field(u32, updegr) + __field(u32, etail) + ), + TP_fast_assign(DD_DEV_ASSIGN(packet->rcd->ppd->dd); + __entry->eflags = rhf_err_flags(packet->rhf); + __entry->ctxt = packet->rcd->ctxt; + __entry->hlen = packet->hlen; + __entry->tlen = packet->tlen; + __entry->updegr = packet->updegr; + __entry->etail = rhf_egr_index(packet->rhf); + ), + TP_printk( + "[%s] ctxt %d eflags 0x%llx hlen %d tlen %d updegr %d etail %d", + __get_str(dev), + __entry->ctxt, + __entry->eflags, + __entry->hlen, + __entry->tlen, + __entry->updegr, + __entry->etail + ) +); +#endif + +#endif /* __HFI1_TRACE_MISC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_misc +#include diff --git a/drivers/infiniband/hw/hfi1/trace_mmu.h b/drivers/infiniband/hw/hfi1/trace_mmu.h new file mode 100644 index 0000000..3b7abbc --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_mmu.h @@ -0,0 +1,95 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#if !defined(__HFI1_TRACE_MMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_MMU_H + +#include +#include + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_mmu + +DECLARE_EVENT_CLASS(hfi1_mmu_rb_template, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len), + TP_STRUCT__entry(__field(unsigned long, addr) + __field(unsigned long, len) + ), + TP_fast_assign(__entry->addr = addr; + __entry->len = len; + ), + TP_printk("MMU node addr 0x%lx, len %lu", + __entry->addr, + __entry->len + ) +); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_insert, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_search, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_remove, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_mem_invalidate, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len)); + +#endif /* __HFI1_TRACE_RC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_mmu +#include diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h new file mode 100644 index 0000000..8ce4765 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_rc.h @@ -0,0 +1,118 @@ +/* +* Copyright(c) 2015, 2016, 2017 Intel Corporation. +* +* This file is provided under a dual BSD/GPLv2 license. When using or +* redistributing this file, you may do so under either license. +* +* GPL LICENSE SUMMARY +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of version 2 of the GNU General Public License as +* published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* BSD LICENSE +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ +#if !defined(__HFI1_TRACE_RC_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_RC_H + +#include +#include + +#include "hfi.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_rc + +DECLARE_EVENT_CLASS(hfi1_rc_template, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, s_flags) + __field(u32, psn) + __field(u32, s_psn) + __field(u32, s_next_psn) + __field(u32, s_sending_psn) + __field(u32, s_sending_hpsn) + __field(u32, r_psn) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + __entry->psn = psn; + __entry->s_psn = qp->s_psn; + __entry->s_next_psn = qp->s_next_psn; + __entry->s_sending_psn = qp->s_sending_psn; + __entry->s_sending_hpsn = qp->s_sending_hpsn; + __entry->r_psn = qp->r_psn; + ), + TP_printk( + "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->s_flags, + __entry->psn, + __entry->s_psn, + __entry->s_next_psn, + __entry->s_sending_psn, + __entry->s_sending_hpsn, + __entry->r_psn + ) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_sendcomplete, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_ack, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +#endif /* __HFI1_TRACE_RC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace_rc +#include diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h new file mode 100644 index 0000000..7eceb57 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_rx.h @@ -0,0 +1,269 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#if !defined(__HFI1_TRACE_RX_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_RX_H + +#include +#include + +#include "hfi.h" + +#define tidtype_name(type) { PT_##type, #type } +#define show_tidtype(type) \ +__print_symbolic(type, \ + tidtype_name(EXPECTED), \ + tidtype_name(EAGER), \ + tidtype_name(INVALID)) \ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_rx + +TRACE_EVENT(hfi1_rcvhdr, + TP_PROTO(struct hfi1_packet *packet), + TP_ARGS(packet), + TP_STRUCT__entry(DD_DEV_ENTRY(packet->rcd->dd) + __field(u64, eflags) + __field(u32, ctxt) + __field(u32, etype) + __field(u32, hlen) + __field(u32, tlen) + __field(u32, updegr) + __field(u32, etail) + ), + TP_fast_assign(DD_DEV_ASSIGN(packet->rcd->dd); + __entry->eflags = rhf_err_flags(packet->rhf); + __entry->ctxt = packet->rcd->ctxt; + __entry->etype = packet->etype; + __entry->hlen = packet->hlen; + __entry->tlen = packet->tlen; + __entry->updegr = packet->updegr; + __entry->etail = rhf_egr_index(packet->rhf); + ), + TP_printk( + "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d", + __get_str(dev), + __entry->ctxt, + __entry->eflags, + __entry->etype, show_packettype(__entry->etype), + __entry->hlen, + __entry->tlen, + __entry->updegr, + __entry->etail + ) +); + +TRACE_EVENT(hfi1_receive_interrupt, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd), + TP_ARGS(dd, rcd), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, ctxt) + __field(u8, slow_path) + __field(u8, dma_rtail) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = rcd->ctxt; + if (rcd->do_interrupt == + &handle_receive_interrupt) { + __entry->slow_path = 1; + __entry->dma_rtail = 0xFF; + } else if (rcd->do_interrupt == + &handle_receive_interrupt_dma_rtail){ + __entry->dma_rtail = 1; + __entry->slow_path = 0; + } else if (rcd->do_interrupt == + &handle_receive_interrupt_nodma_rtail) { + __entry->dma_rtail = 0; + __entry->slow_path = 0; + } + ), + TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d", + __get_str(dev), + __entry->ctxt, + __entry->slow_path, + __entry->dma_rtail + ) +); + +DECLARE_EVENT_CLASS( + hfi1_exp_tid_reg_unreg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, + u32 npages, unsigned long va, unsigned long pa, + dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), + TP_STRUCT__entry( + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(u32, rarr) + __field(u32, npages) + __field(unsigned long, va) + __field(unsigned long, pa) + __field(dma_addr_t, dma) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->va = va; + __entry->pa = pa; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->pa, + __entry->va, + __entry->dma + ) + ); + +DEFINE_EVENT( + hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)); + +DEFINE_EVENT( + hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg, + TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)); + +TRACE_EVENT( + hfi1_put_tid, + TP_PROTO(struct hfi1_devdata *dd, + u32 index, u32 type, unsigned long pa, u16 order), + TP_ARGS(dd, index, type, pa, order), + 
TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(unsigned long, pa); + __field(u32, index); + __field(u32, type); + __field(u16, order); + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->pa = pa; + __entry->index = index; + __entry->type = type; + __entry->order = order; + ), + TP_printk("[%s] type %s pa %lx index %u order %u", + __get_str(dev), + show_tidtype(__entry->type), + __entry->pa, + __entry->index, + __entry->order + ) +); + +TRACE_EVENT(hfi1_exp_tid_inval, + TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr, + u32 npages, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), + TP_STRUCT__entry( + __field(unsigned int, ctxt) + __field(u16, subctxt) + __field(unsigned long, va) + __field(u32, rarr) + __field(u32, npages) + __field(dma_addr_t, dma) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->va = va; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->va, + __entry->dma + ) + ); + +TRACE_EVENT(hfi1_mmu_invalidate, + TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type, + unsigned long start, unsigned long end), + TP_ARGS(ctxt, subctxt, type, start, end), + TP_STRUCT__entry( + __field(unsigned int, ctxt) + __field(u16, subctxt) + __string(type, type) + __field(unsigned long, start) + __field(unsigned long, end) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __assign_str(type, type); + __entry->start = start; + __entry->end = end; + ), + TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx", + __entry->ctxt, + __entry->subctxt, + __get_str(type), + __entry->start, + __entry->end + ) + ); + +#endif /* __HFI1_TRACE_RX_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_rx +#include diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h new file mode 100644 index 0000000..c57af3b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_tx.h @@ -0,0 +1,853 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#if !defined(__HFI1_TRACE_TX_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_TX_H + +#include +#include + +#include "hfi.h" +#include "mad.h" +#include "sdma.h" + +const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1); + +#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_tx + +TRACE_EVENT(hfi1_piofree, + TP_PROTO(struct send_context *sc, int extra), + TP_ARGS(sc, extra), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(int, extra) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->extra = extra; + ), + TP_printk("[%s] ctxt %u(%u) extra %d", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->extra + ) +); + +TRACE_EVENT(hfi1_wantpiointr, + TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl), + TP_ARGS(sc, needint, credit_ctrl), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(u32, needint) + __field(u64, credit_ctrl) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->needint = needint; + __entry->credit_ctrl = credit_ctrl; + ), + TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->needint, + (unsigned long long)__entry->credit_ctrl + ) +); + +DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, flags) + __field(u32, s_flags) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->flags = flags; + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + ), + TP_printk( + "[%s] qpn 0x%x flags 0x%x s_flags 0x%x", + __get_str(dev), + __entry->qpn, + __entry->flags, + __entry->s_flags + ) +); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +TRACE_EVENT(hfi1_sdma_descriptor, + TP_PROTO(struct sdma_engine *sde, + u64 desc0, + u64 desc1, + u16 e, + void *descp), + TP_ARGS(sde, desc0, desc1, e, descp), + 
TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(void *, descp) + __field(u64, desc0) + __field(u64, desc1) + __field(u16, e) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->desc0 = desc0; + __entry->desc1 = desc1; + __entry->idx = sde->this_idx; + __entry->descp = descp; + __entry->e = e; + ), + TP_printk( + "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u", + __get_str(dev), + __entry->idx, + __parse_sdma_flags(__entry->desc0, __entry->desc1), + (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) & + SDMA_DESC0_PHY_ADDR_MASK, + (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) & + SDMA_DESC1_GENERATION_MASK), + (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) & + SDMA_DESC0_BYTE_COUNT_MASK), + __entry->desc0, + __entry->desc1, + __entry->descp, + __entry->e + ) +); + +TRACE_EVENT(hfi1_sdma_engine_select, + TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx), + TP_ARGS(dd, sel, vl, idx), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, sel) + __field(u8, vl) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->sel = sel; + __entry->vl = vl; + __entry->idx = idx; + ), + TP_printk("[%s] selecting SDE %u sel 0x%x vl %u", + __get_str(dev), + __entry->idx, + __entry->sel, + __entry->vl + ) +); + +TRACE_EVENT(hfi1_sdma_user_free_queues, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt), + TP_ARGS(dd, ctxt, subctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + ), + TP_printk("[%s] SDMA [%u:%u] Freeing user SDMA queues", + __get_str(dev), + __entry->ctxt, + __entry->subctxt + ) +); + +TRACE_EVENT(hfi1_sdma_user_process_request, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx), + TP_ARGS(dd, ctxt, subctxt, comp_idx), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(u16, comp_idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->comp_idx = comp_idx; + ), + TP_printk("[%s] SDMA [%u:%u] Using req/comp entry: %u", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->comp_idx + ) +); + +DECLARE_EVENT_CLASS( + hfi1_sdma_value_template, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, u16 comp_idx, + u32 value), + TP_ARGS(dd, ctxt, subctxt, comp_idx, value), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(u16, comp_idx) + __field(u32, value) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->comp_idx = comp_idx; + __entry->value = value; + ), + TP_printk("[%s] SDMA [%u:%u:%u] value: %u", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->comp_idx, + __entry->value + ) +); + +DEFINE_EVENT(hfi1_sdma_value_template, hfi1_sdma_user_initial_tidoffset, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 tidoffset), + TP_ARGS(dd, ctxt, subctxt, comp_idx, tidoffset)); + +DEFINE_EVENT(hfi1_sdma_value_template, hfi1_sdma_user_data_length, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 data_len), + TP_ARGS(dd, ctxt, subctxt, comp_idx, data_len)); + +DEFINE_EVENT(hfi1_sdma_value_template, hfi1_sdma_user_compute_length, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 data_len), + TP_ARGS(dd, ctxt, subctxt, comp_idx, data_len)); + 
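+/*
+ * Illustrative sketch, not part of the original patch: each DEFINE_EVENT
+ * above generates a trace_<event>() inline helper, so a caller (the names
+ * below are hypothetical, not the driver's real call site) records a value
+ * with a single call:
+ *
+ *	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
+ *					 info.comp_idx, req->data_len);
+ *
+ * The event is emitted only when enabled, e.g. via
+ * /sys/kernel/debug/tracing/events/hfi1_tx/hfi1_sdma_user_data_length/enable.
+ */
+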
+TRACE_EVENT(hfi1_sdma_user_tid_info, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + u16 comp_idx, u32 tidoffset, u32 units, u8 shift), + TP_ARGS(dd, ctxt, subctxt, comp_idx, tidoffset, units, shift), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(u16, comp_idx) + __field(u32, tidoffset) + __field(u32, units) + __field(u8, shift) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->comp_idx = comp_idx; + __entry->tidoffset = tidoffset; + __entry->units = units; + __entry->shift = shift; + ), + TP_printk("[%s] SDMA [%u:%u:%u] TID offset %ubytes %uunits om %u", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->comp_idx, + __entry->tidoffset, + __entry->units, + __entry->shift + ) +); + +TRACE_EVENT(hfi1_sdma_request, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u16 subctxt, + unsigned long dim), + TP_ARGS(dd, ctxt, subctxt, dim), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u16, subctxt) + __field(unsigned long, dim) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->dim = dim; + ), + TP_printk("[%s] SDMA from %u:%u (%lu)", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->dim + ) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, status) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->status = status; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) status %llx", + __get_str(dev), + __entry->idx, + (unsigned long long)__entry->status + ) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(int, aidx) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->idx = sde->this_idx; + __entry->aidx = aidx; + ), + TP_printk("[%s] SDE(%u) aidx %d", + __get_str(dev), + __entry->idx, + __entry->aidx + ) +); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx)); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx)); + +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, + u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + __entry->sn = txp ? 
txp->sn : ~0; + ), + TP_printk( + "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->sn, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#else +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + ), + TP_printk( + "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#endif + +DECLARE_EVENT_CLASS(hfi1_sdma_sn, + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->sn = sn; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) sn %llu", + __get_str(dev), + __entry->idx, + __entry->sn + ) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, + TP_PROTO( + struct sdma_engine *sde, + u64 sn + ), + TP_ARGS(sde, sn) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn) +); + +#define USDMA_HDR_FORMAT \ + "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x" + +TRACE_EVENT(hfi1_sdma_user_header, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, + struct hfi1_pkt_header *hdr, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(u32, pbc0) + __field(u32, pbc1) + __field(u32, lrh0) + __field(u32, lrh1) + __field(u32, bth0) + __field(u32, bth1) + __field(u32, bth2) + __field(u32, kdeth0) + __field(u32, kdeth1) + __field(u32, kdeth2) + __field(u32, kdeth3) + __field(u32, kdeth4) + __field(u32, kdeth5) + __field(u32, kdeth6) + __field(u32, kdeth7) + __field(u32, kdeth8) + __field(u32, tidval) + ), + TP_fast_assign( + __le32 *pbc = (__le32 *)hdr->pbc; + __be32 *lrh = (__be32 *)hdr->lrh; + __be32 *bth = (__be32 *)hdr->bth; + __le32 *kdeth = (__le32 *)&hdr->kdeth; + + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->pbc0 = le32_to_cpu(pbc[0]); + __entry->pbc1 = le32_to_cpu(pbc[1]); + __entry->lrh0 = be32_to_cpu(lrh[0]); + __entry->lrh1 = be32_to_cpu(lrh[1]); + __entry->bth0 = be32_to_cpu(bth[0]); + __entry->bth1 = be32_to_cpu(bth[1]); + __entry->bth2 = be32_to_cpu(bth[2]); + __entry->kdeth0 = le32_to_cpu(kdeth[0]); + __entry->kdeth1 = le32_to_cpu(kdeth[1]); + __entry->kdeth2 = le32_to_cpu(kdeth[2]); + __entry->kdeth3 = le32_to_cpu(kdeth[3]); + __entry->kdeth4 = le32_to_cpu(kdeth[4]); + __entry->kdeth5 = le32_to_cpu(kdeth[5]); + __entry->kdeth6 = le32_to_cpu(kdeth[6]); + __entry->kdeth7 = le32_to_cpu(kdeth[7]); + __entry->kdeth8 = le32_to_cpu(kdeth[8]); + __entry->tidval = tidval; + ), + 
TP_printk(USDMA_HDR_FORMAT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->pbc1, + __entry->pbc0, + __entry->lrh0, + __entry->lrh1, + __entry->bth0, + __entry->bth1, + __entry->bth2, + __entry->kdeth0, + __entry->kdeth1, + __entry->kdeth2, + __entry->kdeth3, + __entry->kdeth4, + __entry->kdeth5, + __entry->kdeth6, + __entry->kdeth7, + __entry->kdeth8, + __entry->tidval + ) +); + +#define SDMA_UREQ_FMT \ + "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u" +TRACE_EVENT(hfi1_sdma_user_reqinfo, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i), + TP_ARGS(dd, ctxt, subctxt, i), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd); + __field(u16, ctxt) + __field(u8, subctxt) + __field(u8, ver_opcode) + __field(u8, iovcnt) + __field(u16, npkts) + __field(u16, fragsize) + __field(u16, comp_idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->ver_opcode = i[0] & 0xff; + __entry->iovcnt = (i[0] >> 8) & 0xff; + __entry->npkts = i[1]; + __entry->fragsize = i[2]; + __entry->comp_idx = i[3]; + ), + TP_printk(SDMA_UREQ_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->ver_opcode, + __entry->iovcnt, + __entry->npkts, + __entry->fragsize, + __entry->comp_idx + ) +); + +#define usdma_complete_name(st) { st, #st } +#define show_usdma_complete_state(st) \ + __print_symbolic(st, \ + usdma_complete_name(FREE), \ + usdma_complete_name(QUEUED), \ + usdma_complete_name(COMPLETE), \ + usdma_complete_name(ERROR)) + +TRACE_EVENT(hfi1_sdma_user_completion, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx, + u8 state, int code), + TP_ARGS(dd, ctxt, subctxt, idx, state, code), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, idx) + __field(u8, state) + __field(int, code) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->idx = idx; + __entry->state = state; + __entry->code = code; + ), + TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)", + __get_str(dev), __entry->ctxt, __entry->subctxt, + __entry->idx, show_usdma_complete_state(__entry->state), + __entry->code) +); + +const char *print_u32_array(struct trace_seq *, u32 *, int); +#define __print_u32_hex(arr, len) print_u32_array(p, arr, len) + +TRACE_EVENT(hfi1_sdma_user_header_ahg, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, + u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(u8, sde) + __field(u8, idx) + __field(int, len) + __field(u32, tidval) + __array(u32, ahg, 10) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->sde = sde; + __entry->idx = ahgidx; + __entry->len = len; + __entry->tidval = tidval; + memcpy(__entry->ahg, ahg, len * sizeof(u32)); + ), + TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->sde, + __entry->idx, + __entry->len - 1, + __print_u32_hex(__entry->ahg, __entry->len), + __entry->tidval + ) +); + +TRACE_EVENT(hfi1_sdma_state, + TP_PROTO(struct sdma_engine *sde, + const char *cstate, + const char *nstate + ), + TP_ARGS(sde, cstate, nstate), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __string(curstate, cstate) + 
__string(newstate, nstate) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __assign_str(curstate, cstate); + __assign_str(newstate, nstate); + ), + TP_printk("[%s] current state %s new state %s", + __get_str(dev), + __get_str(curstate), + __get_str(newstate) + ) +); + +#define BCT_FORMAT \ + "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]" + +#define BCT(field) \ + be16_to_cpu( \ + ((struct buffer_control *)__get_dynamic_array(bct))->field \ + ) + +DECLARE_EVENT_CLASS(hfi1_bct_template, + TP_PROTO(struct hfi1_devdata *dd, + struct buffer_control *bc), + TP_ARGS(dd, bc), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __dynamic_array(u8, bct, sizeof(*bc)) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + memcpy(__get_dynamic_array(bct), bc, + sizeof(*bc)); + ), + TP_printk(BCT_FORMAT, + BCT(overall_shared_limit), + + BCT(vl[0].dedicated), + BCT(vl[0].shared), + + BCT(vl[1].dedicated), + BCT(vl[1].shared), + + BCT(vl[2].dedicated), + BCT(vl[2].shared), + + BCT(vl[3].dedicated), + BCT(vl[3].shared), + + BCT(vl[4].dedicated), + BCT(vl[4].shared), + + BCT(vl[5].dedicated), + BCT(vl[5].shared), + + BCT(vl[6].dedicated), + BCT(vl[6].shared), + + BCT(vl[7].dedicated), + BCT(vl[7].shared), + + BCT(vl[15].dedicated), + BCT(vl[15].shared) + ) +); + +DEFINE_EVENT(hfi1_bct_template, bct_set, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +DEFINE_EVENT(hfi1_bct_template, bct_get, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +TRACE_EVENT( + hfi1_qp_send_completion, + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 idx), + TP_ARGS(qp, wqe, idx), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(struct rvt_swqe *, wqe) + __field(u64, wr_id) + __field(u32, qpn) + __field(u32, qpt) + __field(u32, length) + __field(u32, idx) + __field(u32, ssn) + __field(enum ib_wr_opcode, opcode) + __field(int, send_flags) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->wqe = wqe; + __entry->wr_id = wqe->wr.wr_id; + __entry->qpn = qp->ibqp.qp_num; + __entry->qpt = qp->ibqp.qp_type; + __entry->length = wqe->length; + __entry->idx = idx; + __entry->ssn = wqe->ssn; + __entry->opcode = wqe->wr.opcode; + __entry->send_flags = wqe->wr.send_flags; + ), + TP_printk( + "[%s] qpn 0x%x qpt %u wqe %p idx %u wr_id %llx length %u ssn %u opcode %x send_flags %x", + __get_str(dev), + __entry->qpn, + __entry->qpt, + __entry->wqe, + __entry->idx, + __entry->wr_id, + __entry->length, + __entry->ssn, + __entry->opcode, + __entry->send_flags + ) +); + +DECLARE_EVENT_CLASS( + hfi1_do_send_template, + TP_PROTO(struct rvt_qp *qp, bool flag), + TP_ARGS(qp, flag), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(bool, flag) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->flag = flag; + ), + TP_printk( + "[%s] qpn %x flag %d", + __get_str(dev), + __entry->qpn, + __entry->flag + ) +); + +DEFINE_EVENT( + hfi1_do_send_template, hfi1_rc_do_send, + TP_PROTO(struct rvt_qp *qp, bool flag), + TP_ARGS(qp, flag) +); + +DEFINE_EVENT( + hfi1_do_send_template, hfi1_rc_expired_time_slice, + TP_PROTO(struct rvt_qp *qp, bool flag), + TP_ARGS(qp, flag) +); + +#endif /* __HFI1_TRACE_TX_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . 
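The TRACE_INCLUDE_PATH and TRACE_INCLUDE_FILE definitions that close this header, together with the final include of <trace/define_trace.h>, follow the standard kernel tracepoint-header pattern: the header is deliberately written so it can be read twice, and the closing include re-reads it with the TRACE_EVENT() macros redefined so the events expand into real code. The sketch below is a minimal, generic illustration of that pattern only; the event, guard, and file names are made up, and it builds only inside a kernel tree, not standalone.

/* example_trace.h - minimal tracepoint header sketch (illustrative only) */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM example

#if !defined(__EXAMPLE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define __EXAMPLE_TRACE_H

#include <linux/tracepoint.h>

TRACE_EVENT(example_progress,
	TP_PROTO(unsigned int head, unsigned int tail),
	TP_ARGS(head, tail),
	TP_STRUCT__entry(
		__field(unsigned int, head)
		__field(unsigned int, tail)
	),
	TP_fast_assign(
		__entry->head = head;
		__entry->tail = tail;
	),
	TP_printk("head %u tail %u", __entry->head, __entry->tail)
);

#endif /* __EXAMPLE_TRACE_H */

/*
 * The trailer lives outside the include guard so <trace/define_trace.h>
 * can re-include this header and expand the TRACE_EVENT() definitions
 * into real code. Exactly one .c file in the module then defines
 * CREATE_TRACE_POINTS before including this header so the tracepoint
 * bodies are emitted once.
 */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE example_trace
#include <trace/define_trace.h>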
+#define TRACE_INCLUDE_FILE trace_tx +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c new file mode 100644 index 0000000..9d7a311 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -0,0 +1,588 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" +#include "verbs_txreq.h" +#include "qp.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) UC_OP(x) + +/** + * hfi1_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * + * Assume s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rvt_swqe *wqe; + u32 hwords; + u32 bth0 = 0; + u32 len; + u32 pmtu = qp->pmtu; + int middle = 0; + + ps->s_txreq = get_txreq(ps->dev, qp); + if (IS_ERR(ps->s_txreq)) + goto bail_no_tx; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. 
*/ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + clear_ahg(qp); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done_free_tx; + } + + if (priv->hdr_type == HFI1_PKT_TYPE_9B) { + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } else { + /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ + hwords = 7; + if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && + (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr)))) + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + else + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + } + + /* Get the next send request. */ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; + switch (qp->s_state) { + default: + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_NEXT_SEND_OK)) + goto bail; + /* Check if send work queue is empty. */ + if (qp->s_cur == READ_ONCE(qp->s_head)) { + clear_ahg(qp); + goto bail; + } + /* + * Local operations are processed immediately + * after all prior requests have completed. + */ + if (wqe->wr.opcode == IB_WR_REG_MR || + wqe->wr.opcode == IB_WR_LOCAL_INV) { + int local_ops = 0; + int err = 0; + + if (qp->s_last != qp->s_cur) + goto bail; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + err = rvt_invalidate_rkey( + qp, wqe->wr.ex.invalidate_rkey); + local_ops = 1; + } + hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); + if (local_ops) + atomic_dec(&qp->local_ops_pending); + goto done_free_tx; + } + /* + * Start a new request. 
+ */ + qp->s_psn = wqe->psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + len = wqe->length; + qp->s_len = len; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_ONLY); + } else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_ONLY); + } else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_LAST); + } else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_LAST); + } else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_len -= len; + ps->s_txreq->hdr_dwords = hwords; + ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->ss = &qp->s_sge; + ps->s_txreq->s_cur_size = len; + hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), + mask_psn(qp->s_psn++), middle, ps); + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + return 0; +} + +/** + * hfi1_uc_rcv - handle an incoming UC packet + * @ibp: the port the packet came in on + * @hdr: the header of the packet + * @rcv_flags: flags relevant to rcv processing + * @data: the packet data + * @tlen: the length of the packet + * @qp: the QP for this packet. 
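Both sides of the UC path manipulate 24-bit packet sequence numbers that wrap around: the send code above masks the PSN it places in the BTH with mask_psn(qp->s_psn++), and the receive code below orders packets with cmp_psn(psn, qp->r_psn). Those helpers live in the driver and rdmavt headers rather than in this hunk; the following is only a standalone sketch of the arithmetic, with hypothetical names.

#include <stdint.h>
#include <stdio.h>

#define PSN_MASK 0xFFFFFFu			/* IB packet sequence numbers are 24 bits */

/* Keep a PSN within 24 bits, e.g. after incrementing it. */
static uint32_t mask_psn_ex(uint32_t psn)
{
	return psn & PSN_MASK;
}

/*
 * Compare two 24-bit PSNs with wraparound:
 *   < 0  a is earlier than b, == 0 equal, > 0 a is later than b.
 */
static int cmp_psn_ex(uint32_t a, uint32_t b)
{
	uint32_t d = (a - b) & PSN_MASK;	/* distance from b to a, modulo 2^24 */

	if (d == 0)
		return 0;
	return (d & 0x800000u) ? -1 : 1;	/* top bit set: a lags behind b */
}

int main(void)
{
	/* 0x000002 follows 0xFFFFFE even though it is numerically smaller. */
	printf("%d\n", cmp_psn_ex(mask_psn_ex(0xFFFFFEu + 4), 0xFFFFFEu)); /* prints 1 */
	printf("%d\n", cmp_psn_ex(0x000010u, 0x000020u));                  /* prints -1 */
	return 0;
}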
+ * + * This is called from qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. + */ +void hfi1_uc_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + void *data = packet->payload; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + struct ib_other_headers *ohdr = packet->ohdr; + u32 opcode = packet->opcode; + u32 hdrsize = packet->hlen; + u32 psn; + u32 pad = packet->pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + struct ib_reth *reth; + int ret; + u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); + + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + process_ecn(qp, packet, true); + + psn = ib_bth_get_psn(ohdr); + /* Compare the PSN verses the expected PSN. */ + if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; +inv: + if (qp->r_state == OP(SEND_FIRST) || + qp->r_state == OP(SEND_MIDDLE)) { + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; + } else { + rvt_put_ss(&qp->r_sge); + } + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + goto drop; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): +send_first: + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { + qp->r_sge = qp->s_rdma_read_sge; + } else { + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + /* + * qp->s_rdma_read_sge will be the owner + * of the mr references. + */ + qp->s_rdma_read_sge = qp->r_sge; + } + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. 
*/ + /* + * There will be no padding for 9B packet but 16B packets + * will come in with some padding since we always add + * CRC and LT bytes which will need to be flit aligned + */ + if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) + goto rewind; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto rewind; + hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): +no_immediate_data: + wc.ex.imm_data = 0; + wc.wc_flags = 0; +send_last: + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + extra_bytes))) + goto rewind; + /* Don't count the CRC. */ + tlen -= (hdrsize + extra_bytes); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto rewind; + wc.opcode = IB_WC_RECV; + hfi1_copy_sge(&qp->r_sge, data, tlen, false, false); + rvt_put_ss(&qp->s_rdma_read_sge); +last_imm: + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; + /* + * It seems that IB mandates the presence of an SL in a + * work completion only for the UD transport (see section + * 11.4.2 of IBTA Vol. 1). + * + * However, the way the SL is chosen below is consistent + * with the way that IB/qib works and is trying avoid + * introducing incompatibilities. + * + * See also OPA Vol. 1, section 9.7.6, and table 9-17. + */ + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + ib_bth_is_solicited(ohdr)); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ +rdma_first: + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + goto drop; + } + reth = &ohdr->u.rc.reth; + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey */ + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, + vaddr, rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto drop; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_ONLY)) { + goto rdma_last; + } else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) { + wc.ex.imm_data = ohdr->u.rc.imm_data; + goto rdma_last_imm; + } + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto drop; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto drop; + hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + wc.ex.imm_data = ohdr->u.imm_data; +rdma_last_imm: + wc.wc_flags = IB_WC_WITH_IMM; + + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. 
*/ + tlen -= (hdrsize + extra_bytes); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { + rvt_put_ss(&qp->s_rdma_read_sge); + } else { + ret = hfi1_rvt_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + } + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_put_ss(&qp->r_sge); + goto last_imm; + + case OP(RDMA_WRITE_LAST): +rdma_last: + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + extra_bytes); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_put_ss(&qp->r_sge); + break; + + default: + /* Drop packet for unknown opcodes. */ + goto drop; + } + qp->r_psn++; + qp->r_state = opcode; + return; + +rewind: + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; +drop: + ibp->rvp.n_pkt_drops++; + return; + +op_err: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); +} diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c new file mode 100644 index 0000000..69c17a5 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -0,0 +1,1056 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include + +#include "hfi.h" +#include "mad.h" +#include "verbs_txreq.h" +#include "qp.h" + +/* We support only two types - 9B and 16B for now */ +static const hfi1_make_req hfi1_make_ud_req_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_ud_req_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_ud_req_16B +}; + +/** + * ud_loopback - handle send on loopback QPs + * @sqp: the sending QP + * @swqe: the send work request + * + * This is called from hfi1_make_ud_req() to forward a WQE addressed + * to the same HFI. + * Note that the receive interrupt handler may be calling hfi1_ud_rcv() + * while this is being called. + */ +static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) +{ + struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct hfi1_pportdata *ppd; + struct hfi1_qp_priv *priv = sqp->priv; + struct rvt_qp *qp; + struct rdma_ah_attr *ah_attr; + unsigned long flags; + struct rvt_sge_state ssge; + struct rvt_sge *sge; + struct ib_wc wc; + u32 length; + enum ib_qp_type sqptype, dqptype; + + rcu_read_lock(); + + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, + swqe->ud_wr.remote_qpn); + if (!qp) { + ibp->rvp.n_pkt_drops++; + rcu_read_unlock(); + return; + } + + sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : sqp->ibqp.qp_type; + dqptype = qp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : qp->ibqp.qp_type; + + if (dqptype != sqptype || + !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + goto drop; + } + + ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr; + ppd = ppd_from_ibp(ibp); + + if (qp->ibqp.qp_num > 1) { + u16 pkey; + u32 slid; + u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + + pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index); + slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1)); + if (unlikely(ingress_pkey_check(ppd, pkey, sc5, + qp->s_pkey_index, + slid, false))) { + hfi1_bad_pkey(ibp, pkey, + rdma_ah_get_sl(ah_attr), + sqp->ibqp.qp_num, qp->ibqp.qp_num, + slid, rdma_ah_get_dlid(ah_attr)); + goto drop; + } + } + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (qp->ibqp.qp_num) { + u32 qkey; + + qkey = (int)swqe->ud_wr.remote_qkey < 0 ? + sqp->qkey : swqe->ud_wr.remote_qkey; + if (unlikely(qkey != qp->qkey)) + goto drop; /* silently drop per IBTA spec */ + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + length = swqe->length; + memset(&wc, 0, sizeof(wc)); + wc.byte_len = length + sizeof(struct ib_grh); + + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; + } + + spin_lock_irqsave(&qp->r_lock, flags); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & RVT_R_REUSE_SGE) { + qp->r_flags &= ~RVT_R_REUSE_SGE; + } else { + int ret; + + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) { + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + goto bail_unlock; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->rvp.n_vl15_dropped++; + goto bail_unlock; + } + } + /* Silently drop packets which are too big. 
*/ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= RVT_R_REUSE_SGE; + ibp->rvp.n_pkt_drops++; + goto bail_unlock; + } + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + struct ib_grh grh; + struct ib_global_route grd = *(rdma_ah_read_grh(ah_attr)); + + /* + * For loopback packets with extended LIDs, the + * sgid_index in the GRH is 0 and the dgid is + * OPA GID of the sender. While creating a response + * to the loopback packet, IB core creates the new + * sgid_index from the DGID and that will be the + * OPA_GID_INDEX. The new dgid is from the sgid + * index and that will be in the IB GID format. + * + * We now have a case where the sent packet had a + * different sgid_index and dgid compared to the + * one that was received in response. + * + * Fix this inconsistency. + */ + if (priv->hdr_type == HFI1_PKT_TYPE_16B) { + if (grd.sgid_index == 0) + grd.sgid_index = OPA_GID_INDEX; + + if (ib_is_opa_gid(&grd.dgid)) + grd.dgid.global.interface_id = + cpu_to_be64(ppd->guids[HFI1_PORT_GUID_INDEX]); + } + + hfi1_make_grh(ibp, &grh, &grd, 0, 0); + hfi1_copy_sge(&qp->r_sge, &grh, + sizeof(grh), true, false); + wc.wc_flags |= IB_WC_GRH; + } else { + rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); + } + ssge.sg_list = swqe->sg_list + 1; + ssge.sge = *swqe->sg_list; + ssge.num_sge = swqe->wr.num_sge; + sge = &ssge.sge; + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, true, false); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ssge.num_sge) + *sge = *ssge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto bail_unlock; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) { + if (sqp->ibqp.qp_type == IB_QPT_GSI || + sqp->ibqp.qp_type == IB_QPT_SMI) + wc.pkey_index = swqe->ud_wr.pkey_index; + else + wc.pkey_index = sqp->s_pkey_index; + } else { + wc.pkey_index = 0; + } + wc.slid = (ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1))) & U16_MAX; + /* Check for loopback when the port lid is not set */ + if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI) + wc.slid = be16_to_cpu(IB_LID_PERMISSIVE); + wc.sl = rdma_ah_get_sl(ah_attr); + wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. 
*/ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); + ibp->rvp.n_loop_pkts++; +bail_unlock: + spin_unlock_irqrestore(&qp->r_lock, flags); +drop: + rcu_read_unlock(); +} + +static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u16 *pkey, u32 extra_bytes, bool bypass) +{ + u32 bth0; + struct hfi1_ibport *ibp; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else { + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + } + + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth0 |= extra_bytes << 20; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) + *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + else + *pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + if (!bypass) + bth0 |= *pkey; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); + ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? + qp->qkey : wqe->ud_wr.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); +} + +void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe) +{ + u32 nwords, extra_bytes; + u16 len, slid, dlid, pkey; + u16 lrh0 = 0; + u8 sc5; + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct ib_grh *grh; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + + extra_bytes = -wqe->length & 3; + nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC; + /* header size in dwords LRH+BTH+DETH = (8+12+8)/4. */ + ps->s_txreq->hdr_dwords = 7; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + ps->s_txreq->hdr_dwords++; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + grh = &ps->s_txreq->phdr.hdr.ibh.u.l.grh; + ps->s_txreq->hdr_dwords += + hfi1_make_grh(ibp, grh, rdma_ah_read_grh(ah_attr), + ps->s_txreq->hdr_dwords - LRH_9B_DWORDS, + nwords); + lrh0 = HFI1_LRH_GRH; + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + } else { + lrh0 = HFI1_LRH_BTH; + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } + + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) { + lrh0 |= 0xF000; /* Set VL (see ch. 
13.5.3.1) */ + priv->s_sc = 0xf; + } else { + lrh0 |= (sc5 & 0xf) << 12; + priv->s_sc = sc5; + } + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 9B); + if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } else { + u16 lid = (u16)ppd->lid; + + if (lid) { + lid |= rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1); + slid = lid; + } else { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } + } + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, false); + len = ps->s_txreq->hdr_dwords + nwords; + + /* Setup the packet */ + ps->s_txreq->phdr.hdr.hdr_type = HFI1_PKT_TYPE_9B; + hfi1_make_ib_hdr(&ps->s_txreq->phdr.hdr.ibh, + lrh0, len, dlid, slid); +} + +void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u32 dlid, slid, nwords, extra_bytes; + u16 len, pkey; + u8 l4, sc5; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + /* header size in dwords 16B LRH+BTH+DETH = (16+12+8)/4. */ + ps->s_txreq->hdr_dwords = 9; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + ps->s_txreq->hdr_dwords++; + + /* SW provides space for CRC and LT for bypass packets. */ + extra_bytes = hfi1_get_16b_padding((ps->s_txreq->hdr_dwords << 2), + wqe->length); + nwords = ((wqe->length + extra_bytes + SIZE_OF_LT) >> 2) + SIZE_OF_CRC; + + if ((rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) { + struct ib_grh *grh; + struct ib_global_route *grd = rdma_ah_retrieve_grh(ah_attr); + /* + * Ensure OPA GIDs are transformed to IB gids + * before creating the GRH. + */ + if (grd->sgid_index == OPA_GID_INDEX) { + dd_dev_warn(ppd->dd, "Bad sgid_index. sgid_index: %d\n", + grd->sgid_index); + grd->sgid_index = 0; + } + grh = &ps->s_txreq->phdr.hdr.opah.u.l.grh; + ps->s_txreq->hdr_dwords += hfi1_make_grh( + ibp, grh, grd, + ps->s_txreq->hdr_dwords - LRH_16B_DWORDS, + nwords); + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + } else { + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + l4 = OPA_16B_L4_IB_LOCAL; + } + + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + if (qp->ibqp.qp_type == IB_QPT_SMI) + priv->s_sc = 0xf; + else + priv->s_sc = sc5; + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 16B); + if (!ppd->lid) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); + else + slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1)); + + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, true); + /* Convert dwords to flits */ + len = (ps->s_txreq->hdr_dwords + nwords) >> 1; + + /* Setup the packet */ + ps->s_txreq->phdr.hdr.hdr_type = HFI1_PKT_TYPE_16B; + hfi1_make_16b_hdr(&ps->s_txreq->phdr.hdr.opah, + slid, dlid, len, pkey, 0, 0, l4, priv->s_sc); +} + +/** + * hfi1_make_ud_req - construct a UD request packet + * @qp: the QP + * + * Assume s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. 
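In hfi1_make_ud_req_9B() above, extra_bytes = -wqe->length & 3 is the number of pad bytes needed to round the payload up to a whole number of 32-bit words, and nwords then counts those words plus the CRC. A small standalone check of that identity, purely illustrative and not driver code:

#include <stdint.h>
#include <assert.h>

/* Pad bytes needed to round len up to a multiple of 4, written as (-len) & 3. */
static uint32_t pad_to_dword(uint32_t len)
{
	return (0u - len) & 3u;
}

int main(void)
{
	uint32_t len;

	for (len = 0; len < 4096; len++) {
		uint32_t pad = pad_to_dword(len);

		assert(pad == (4 - len % 4) % 4);	/* equivalent formulation */
		assert((len + pad) % 4 == 0);		/* payload + pad is dword aligned */
		assert(pad < 4);
	}
	return 0;
}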
+ */ +int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct rvt_swqe *wqe; + int next_cur; + u32 lid; + + ps->s_txreq = get_txreq(ps->dev, qp); + if (IS_ERR(ps->s_txreq)) + goto bail_no_tx; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done_free_tx; + } + + /* see post_one_send() */ + if (qp->s_cur == READ_ONCE(qp->s_head)) + goto bail; + + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; + + /* Construct the header. */ + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr); + if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) || + (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) { + lid = rdma_ah_get_dlid(ah_attr) & ~((1 << ppd->lmc) - 1); + if (unlikely(!loopback && + ((lid == ppd->lid) || + ((lid == be32_to_cpu(OPA_LID_PERMISSIVE)) && + (qp->ibqp.qp_type == IB_QPT_GSI))))) { + unsigned long tflags = ps->flags; + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, tflags); + ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, tflags); + ps->flags = tflags; + hfi1_send_complete(qp, wqe, IB_WC_SUCCESS); + goto done_free_tx; + } + } + + qp->s_cur = next_cur; + ps->s_txreq->s_cur_size = wqe->length; + ps->s_txreq->ss = &qp->s_sge; + qp->s_srate = rdma_ah_get_static_rate(ah_attr); + qp->srate_mbps = ib_rate_to_mbps(qp->s_srate); + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + + /* Make the appropriate header */ + hfi1_make_ud_req_tbl[priv->hdr_type](qp, ps, qp->s_wqe); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + ps->s_txreq->sde = priv->s_sde; + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + ps->s_txreq->psc = priv->s_sendcontext; + /* disarm any ahg */ + priv->s_ahg->ahgcount = 0; + priv->s_ahg->ahgidx = 0; + priv->s_ahg->tx_flags = 0; + + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + return 0; +} + +/* + * Hardware can't check this so we do it here. + * + * This is a slightly different algorithm than the standard pkey check. It + * special cases the management keys and allows for 0x7fff and 0xffff to be in + * the table at the same time. 
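The loopback test in hfi1_make_ud_req() above leans on the LMC: a port with base LID L and LMC m answers to the 2^m consecutive LIDs that differ only in their low m bits, so clearing those bits of the destination LID and comparing against the base LID decides whether the destination is the local port. A self-contained sketch of that masking, with hypothetical names:

#include <stdint.h>
#include <stdbool.h>
#include <assert.h>

/* Does dlid fall inside the LID range owned by a port with this base LID and LMC? */
static bool lid_is_local(uint32_t dlid, uint32_t base_lid, unsigned int lmc)
{
	uint32_t path_mask = (1u << lmc) - 1;	/* low bits select the path */

	return (dlid & ~path_mask) == base_lid;
}

int main(void)
{
	/* A port with base LID 0x10 and LMC 2 owns LIDs 0x10..0x13. */
	assert(lid_is_local(0x10, 0x10, 2));
	assert(lid_is_local(0x13, 0x10, 2));
	assert(!lid_is_local(0x14, 0x10, 2));
	/* With LMC 0 only the base LID itself matches. */
	assert(lid_is_local(0x10, 0x10, 0));
	assert(!lid_is_local(0x11, 0x10, 0));
	return 0;
}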
+ * + * @returns the index found or -1 if not found + */ +int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + unsigned i; + + if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) { + unsigned lim_idx = -1; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) { + /* here we look for an exact match */ + if (ppd->pkeys[i] == pkey) + return i; + if (ppd->pkeys[i] == LIM_MGMT_P_KEY) + lim_idx = i; + } + + /* did not find 0xffff return 0x7fff idx if found */ + if (pkey == FULL_MGMT_P_KEY) + return lim_idx; + + /* no match... */ + return -1; + } + + pkey &= 0x7fff; /* remove limited/full membership bit */ + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) + if ((ppd->pkeys[i] & 0x7fff) == pkey) + return i; + + /* + * Should not get here, this means hardware failed to validate pkeys. + */ + return -1; +} + +void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u16 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh) +{ + u64 pbc, pbc_flags = 0; + u32 bth0, plen, vl, hwords = 7; + u16 len; + u8 l4; + struct hfi1_16b_header hdr; + struct ib_other_headers *ohdr; + struct pio_buf *pbuf; + struct send_context *ctxt = qp_to_send_context(qp, sc5); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 nwords; + + /* Populate length */ + nwords = ((hfi1_get_16b_padding(hwords << 2, 0) + + SIZE_OF_LT) >> 2) + SIZE_OF_CRC; + if (old_grh) { + struct ib_grh *grh = &hdr.u.l.grh; + + grh->version_tclass_flow = old_grh->version_tclass_flow; + grh->paylen = cpu_to_be16( + (hwords - LRH_16B_DWORDS + nwords) << 2); + grh->hop_limit = 0xff; + grh->sgid = old_grh->dgid; + grh->dgid = old_grh->sgid; + ohdr = &hdr.u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + hwords += sizeof(struct ib_grh) / sizeof(u32); + } else { + ohdr = &hdr.u.oth; + l4 = OPA_16B_L4_IB_LOCAL; + } + + /* BIT 16 to 19 is TVER. 
Bit 20 to 22 is pad cnt */ + bth0 = (IB_OPCODE_CNP << 24) | (1 << 16) | + (hfi1_get_16b_padding(hwords << 2, 0) << 20); + ohdr->bth[0] = cpu_to_be32(bth0); + + ohdr->bth[1] = cpu_to_be32(remote_qpn); + ohdr->bth[2] = 0; /* PSN 0 */ + + /* Convert dwords to flits */ + len = (hwords + nwords) >> 1; + hfi1_make_16b_hdr(&hdr, slid, dlid, len, pkey, 1, 0, l4, sc5); + + plen = 2 /* PBC */ + hwords + nwords; + pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + if (ctxt) { + pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL); + if (pbuf) + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + &hdr, hwords); + } +} + +void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, + u16 pkey, u32 slid, u32 dlid, u8 sc5, + const struct ib_grh *old_grh) +{ + u64 pbc, pbc_flags = 0; + u32 bth0, plen, vl, hwords = 5; + u16 lrh0; + u8 sl = ibp->sc_to_sl[sc5]; + struct ib_header hdr; + struct ib_other_headers *ohdr; + struct pio_buf *pbuf; + struct send_context *ctxt = qp_to_send_context(qp, sc5); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (old_grh) { + struct ib_grh *grh = &hdr.u.l.grh; + + grh->version_tclass_flow = old_grh->version_tclass_flow; + grh->paylen = cpu_to_be16( + (hwords - LRH_9B_DWORDS + SIZE_OF_CRC) << 2); + grh->hop_limit = 0xff; + grh->sgid = old_grh->dgid; + grh->dgid = old_grh->sgid; + ohdr = &hdr.u.l.oth; + lrh0 = HFI1_LRH_GRH; + hwords += sizeof(struct ib_grh) / sizeof(u32); + } else { + ohdr = &hdr.u.oth; + lrh0 = HFI1_LRH_BTH; + } + + lrh0 |= (sc5 & 0xf) << 12 | sl << 4; + + bth0 = pkey | (IB_OPCODE_CNP << 24); + ohdr->bth[0] = cpu_to_be32(bth0); + + ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << IB_BECN_SHIFT)); + ohdr->bth[2] = 0; /* PSN 0 */ + + hfi1_make_ib_hdr(&hdr, lrh0, hwords + SIZE_OF_CRC, dlid, slid); + plen = 2 /* PBC */ + hwords; + pbc_flags |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + if (ctxt) { + pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL); + if (pbuf) + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + &hdr, hwords); + } +} + +/* + * opa_smp_check() - Do the regular pkey checking, and the additional + * checks for SMPs specified in OPAv1 rev 1.0, 9/19/2016 update, section + * 9.10.25 ("SMA Packet Checks"). + * + * Note that: + * - Checks are done using the pkey directly from the packet's BTH, + * and specifically _not_ the pkey that we attach to the completion, + * which may be different. + * - These checks are specifically for "non-local" SMPs (i.e., SMPs + * which originated on another node). SMPs which are sent from, and + * destined to this node are checked in opa_local_smp_check(). + * + * At the point where opa_smp_check() is called, we know: + * - destination QP is QP0 + * + * opa_smp_check() returns 0 if all checks succeed, 1 otherwise. + */ +static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5, + struct rvt_qp *qp, u16 slid, struct opa_smp *smp) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + /* + * I don't think it's possible for us to get here with sc != 0xf, + * but check it to be certain. + */ + if (sc5 != 0xf) + return 1; + + if (rcv_pkey_check(ppd, pkey, sc5, slid)) + return 1; + + /* + * At this point we know (and so don't need to check again) that + * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY + * (see ingress_pkey_check). 
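hfi1_lookup_pkey_idx() above works because bit 15 of a P_Key encodes only the membership type (clear for limited, set for full) while the low 15 bits identify the partition; 0x7fff and 0xffff are the one pair that must nevertheless stay distinct, which is why the management keys are special-cased. A standalone sketch of the matching rule, with made-up names:

#include <stdint.h>
#include <stdbool.h>
#include <assert.h>

#define PKEY_MEMBERSHIP_BIT 0x8000u	/* set: full member, clear: limited member */
#define PKEY_BASE_MASK      0x7fffu	/* low 15 bits name the partition */

/* Do two P_Keys refer to the same partition, ignoring membership? */
static bool pkey_same_partition(uint16_t a, uint16_t b)
{
	return (a & PKEY_BASE_MASK) == (b & PKEY_BASE_MASK);
}

int main(void)
{
	/* 0x8001 (full) and 0x0001 (limited) name the same partition... */
	assert(pkey_same_partition(0x8001, 0x0001));
	/* ...which is exactly why 0xffff and 0x7fff collide under this rule
	 * and the management keys need the special-case handling above. */
	assert(pkey_same_partition(0xffff, 0x7fff));
	assert(!pkey_same_partition(0x8001, 0x0002));
	return 0;
}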
+ */ + if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) { + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + + /* + * SMPs fall into one of four (disjoint) categories: + * SMA request, SMA response, SMA trap, or SMA trap repress. + * Our response depends, in part, on which type of SMP we're + * processing. + * + * If this is an SMA response, skip the check here. + * + * If this is an SMA request or SMA trap repress: + * - pkey != FULL_MGMT_P_KEY => + * increment port recv constraint errors, drop MAD + * + * Otherwise: + * - accept if the port is running an SM + * - drop MAD if it's an SMA trap + * - pkey == FULL_MGMT_P_KEY => + * reply with unsupported method + * - pkey != FULL_MGMT_P_KEY => + * increment port recv constraint errors, drop MAD + */ + switch (smp->method) { + case IB_MGMT_METHOD_GET_RESP: + case IB_MGMT_METHOD_REPORT_RESP: + break; + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (pkey != FULL_MGMT_P_KEY) { + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + break; + default: + if (ibp->rvp.port_cap_flags & IB_PORT_SM) + return 0; + if (smp->method == IB_MGMT_METHOD_TRAP) + return 1; + if (pkey == FULL_MGMT_P_KEY) { + smp->status |= IB_SMP_UNSUP_METHOD; + return 0; + } + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + return 0; +} + +/** + * hfi1_ud_rcv - receive an incoming UD packet + * @ibp: the port the packet came in on + * @hdr: the packet header + * @rcv_flags: flags relevant to rcv processing + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void hfi1_ud_rcv(struct hfi1_packet *packet) +{ + struct ib_other_headers *ohdr = packet->ohdr; + u32 hdrsize = packet->hlen; + struct ib_wc wc; + u32 qkey; + u32 src_qp; + u16 pkey; + int mgmt_pkey_idx = -1; + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + void *data = packet->payload; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + u8 sc5 = packet->sc; + u8 sl_from_sc; + u8 opcode = packet->opcode; + u8 sl = packet->sl; + u32 dlid = packet->dlid; + u32 slid = packet->slid; + u8 extra_bytes; + bool dlid_is_permissive; + bool slid_is_permissive; + + extra_bytes = packet->pad + packet->extra_byte + (SIZE_OF_CRC << 2); + qkey = ib_get_qkey(ohdr); + src_qp = ib_get_sqpn(ohdr); + + if (packet->etype == RHF_RCV_TYPE_BYPASS) { + u32 permissive_lid = + opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B); + + pkey = hfi1_16B_get_pkey(packet->hdr); + dlid_is_permissive = (dlid == permissive_lid); + slid_is_permissive = (slid == permissive_lid); + } else { + pkey = ib_bth_get_pkey(ohdr); + dlid_is_permissive = (dlid == be16_to_cpu(IB_LID_PERMISSIVE)); + slid_is_permissive = (slid == be16_to_cpu(IB_LID_PERMISSIVE)); + } + sl_from_sc = ibp->sc_to_sl[sc5]; + + process_ecn(qp, packet, (opcode != IB_OPCODE_CNP)); + /* + * Get the number of bytes the message was padded by + * and drop incomplete packets. + */ + if (unlikely(tlen < (hdrsize + extra_bytes))) + goto drop; + + tlen -= hdrsize + extra_bytes; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). 
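The Q_Key rule referenced here, and applied in ud_loopback() and hfi1_make_bth_deth() above, is that a work-request Q_Key with the high-order bit set means "use the Q_Key from the QP context instead"; testing (int)qkey < 0 is simply a compact test of bit 31. A minimal sketch of that selection, with hypothetical names:

#include <stdint.h>
#include <assert.h>

/*
 * Pick the Q_Key to place in the DETH: if the high-order bit of the
 * work request's Q_Key is set, fall back to the QP's own Q_Key.
 */
static uint32_t select_qkey(uint32_t wr_qkey, uint32_t qp_qkey)
{
	return ((int32_t)wr_qkey < 0) ? qp_qkey : wr_qkey;
}

int main(void)
{
	assert(select_qkey(0x00001234u, 0xCAFEBABEu) == 0x00001234u);
	assert(select_qkey(0x80000000u, 0xCAFEBABEu) == 0xCAFEBABEu);
	return 0;
}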
+ */ + if (qp->ibqp.qp_num) { + if (unlikely(dlid_is_permissive || slid_is_permissive)) + goto drop; + if (qp->ibqp.qp_num > 1) { + if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) { + /* + * Traps will not be sent for packets dropped + * by the HW. This is fine, as sending trap + * for invalid pkeys is optional according to + * IB spec (release 1.3, section 10.9.4) + */ + hfi1_bad_pkey(ibp, + pkey, sl, + src_qp, qp->ibqp.qp_num, + slid, dlid); + return; + } + } else { + /* GSI packet */ + mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey); + if (mgmt_pkey_idx < 0) + goto drop; + } + if (unlikely(qkey != qp->qkey)) /* Silent drop */ + return; + + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (unlikely(qp->ibqp.qp_num == 1 && + (tlen > 2048 || (sc5 == 0xF)))) + goto drop; + } else { + /* Received on QP0, and so by definition, this is an SMP */ + struct opa_smp *smp = (struct opa_smp *)data; + + if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp)) + goto drop; + + if (tlen > 2048) + goto drop; + if ((dlid_is_permissive || slid_is_permissive) && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + goto drop; + + /* look up SMI pkey */ + mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey); + if (mgmt_pkey_idx < 0) + goto drop; + } + + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + tlen -= sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else { + goto drop; + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & RVT_R_REUSE_SGE) { + qp->r_flags &= ~RVT_R_REUSE_SGE; + } else { + int ret; + + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) { + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->rvp.n_vl15_dropped++; + return; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= RVT_R_REUSE_SGE; + goto drop; + } + if (packet->grh) { + hfi1_copy_sge(&qp->r_sge, packet->grh, + sizeof(struct ib_grh), true, false); + wc.wc_flags |= IB_WC_GRH; + } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { + struct ib_grh grh; + /* + * Assuming we only created 16B on the send side + * if we want to use large LIDs, since GRH was stripped + * out when creating 16B, add back the GRH here. 
+ */ + hfi1_make_ext_grh(packet, &grh, slid, dlid); + hfi1_copy_sge(&qp->r_sge, &grh, + sizeof(struct ib_grh), true, false); + wc.wc_flags |= IB_WC_GRH; + } else { + rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); + } + hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + return; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = src_qp; + + if (qp->ibqp.qp_type == IB_QPT_GSI || + qp->ibqp.qp_type == IB_QPT_SMI) { + if (mgmt_pkey_idx < 0) { + if (net_ratelimit()) { + struct hfi1_devdata *dd = ppd->dd; + + dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n", + qp->ibqp.qp_type); + mgmt_pkey_idx = 0; + } + } + wc.pkey_index = (unsigned)mgmt_pkey_idx; + } else { + wc.pkey_index = 0; + } + if (slid_is_permissive) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); + wc.slid = slid & U16_MAX; + wc.sl = sl_from_sc; + + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = hfi1_check_mcast(dlid) ? 0 : + dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + ib_bth_is_solicited(ohdr)); + return; + +drop: + ibp->rvp.n_pkt_drops++; +} diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c new file mode 100644 index 0000000..0d5330b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -0,0 +1,977 @@ +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include + +#include "mmu_rb.h" +#include "user_exp_rcv.h" +#include "trace.h" + +static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, + struct exp_tid_set *set, + struct hfi1_filedata *fd); +static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages); +static int set_rcvarray_entry(struct hfi1_filedata *fd, + struct tid_user_buf *tbuf, + u32 rcventry, struct tid_group *grp, + u16 pageidx, unsigned int npages); +static int tid_rb_insert(void *arg, struct mmu_rb_node *node); +static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, + struct tid_rb_node *tnode); +static void tid_rb_remove(void *arg, struct mmu_rb_node *node); +static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode); +static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *, + struct tid_group *grp, + unsigned int start, u16 count, + u32 *tidlist, unsigned int *tididx, + unsigned int *pmapped); +static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, + struct tid_group **grp); +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node); + +static struct mmu_rb_ops tid_rb_ops = { + .insert = tid_rb_insert, + .remove = tid_rb_remove, + .invalidate = tid_rb_invalidate +}; + +/* + * Initialize context and file private data needed for Expected + * receive caching. This needs to be done after the context has + * been configured with the eager/expected RcvEntry counts. + */ +int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) +{ + struct hfi1_devdata *dd = uctxt->dd; + int ret = 0; + + spin_lock_init(&fd->tid_lock); + spin_lock_init(&fd->invalid_lock); + + fd->entry_to_rb = kcalloc(uctxt->expected_count, + sizeof(struct rb_node *), + GFP_KERNEL); + if (!fd->entry_to_rb) + return -ENOMEM; + + if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) { + fd->invalid_tid_idx = 0; + fd->invalid_tids = kcalloc(uctxt->expected_count, + sizeof(*fd->invalid_tids), + GFP_KERNEL); + if (!fd->invalid_tids) { + kfree(fd->entry_to_rb); + fd->entry_to_rb = NULL; + return -ENOMEM; + } + + /* + * Register MMU notifier callbacks. If the registration + * fails, continue without TID caching for this context. + */ + ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops, + dd->pport->hfi1_wq, + &fd->handler); + if (ret) { + dd_dev_info(dd, + "Failed MMU notifier registration %d\n", + ret); + ret = 0; + } + } + + /* + * PSM does not have a good way to separate, count, and + * effectively enforce a limit on RcvArray entries used by + * subctxts (when context sharing is used) when TID caching + * is enabled. To help with that, we calculate a per-process + * RcvArray entry share and enforce that. + * If TID caching is not in use, PSM deals with usage on its + * own. In that case, we allow any subctxt to take all of the + * entries. + * + * Make sure that we set the tid counts only after successful + * init. 
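The comment above describes splitting the context's expected-receive entries evenly among subcontexts when TID caching is active; the code that follows performs the division and hands the remainder out one entry at a time to the lowest-numbered subcontexts. A distilled standalone version of that share calculation (the function name is made up):

#include <stdio.h>

/* How many RcvArray entries does subcontext 'subctxt' get? */
static unsigned int tid_share(unsigned int expected_count,
			      unsigned int subctxt_cnt,
			      unsigned int subctxt)
{
	unsigned int share, remainder;

	if (!subctxt_cnt)
		return expected_count;		/* no sharing: take everything */

	share = expected_count / subctxt_cnt;
	remainder = expected_count % subctxt_cnt;
	if (remainder && subctxt < remainder)
		share++;			/* first 'remainder' subctxts get one extra */
	return share;
}

int main(void)
{
	/* 2048 entries over 3 subcontexts: 683 + 683 + 682 = 2048. */
	printf("%u %u %u\n", tid_share(2048, 3, 0),
	       tid_share(2048, 3, 1), tid_share(2048, 3, 2));
	return 0;
}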
+ */ + spin_lock(&fd->tid_lock); + if (uctxt->subctxt_cnt && fd->handler) { + u16 remainder; + + fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt; + remainder = uctxt->expected_count % uctxt->subctxt_cnt; + if (remainder && fd->subctxt < remainder) + fd->tid_limit++; + } else { + fd->tid_limit = uctxt->expected_count; + } + spin_unlock(&fd->tid_lock); + + return ret; +} + +void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + + /* + * The notifier would have been removed when the process'es mm + * was freed. + */ + if (fd->handler) { + hfi1_mmu_rb_unregister(fd->handler); + } else { + if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list)) + unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd); + if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list)) + unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd); + } + + kfree(fd->invalid_tids); + fd->invalid_tids = NULL; + + kfree(fd->entry_to_rb); + fd->entry_to_rb = NULL; +} + +/** + * Release pinned receive buffer pages. + * + * @mapped - true if the pages have been DMA mapped. false otherwise. + * @idx - Index of the first page to unpin. + * @npages - No of pages to unpin. + * + * If the pages have been DMA mapped (indicated by mapped parameter), their + * info will be passed via a struct tid_rb_node. If they haven't been mapped, + * their info will be passed via a struct tid_user_buf. + */ +static void unpin_rcv_pages(struct hfi1_filedata *fd, + struct tid_user_buf *tidbuf, + struct tid_rb_node *node, + unsigned int idx, + unsigned int npages, + bool mapped) +{ + struct page **pages; + struct hfi1_devdata *dd = fd->uctxt->dd; + + if (mapped) { + pci_unmap_single(dd->pcidev, node->dma_addr, + node->mmu.len, PCI_DMA_FROMDEVICE); + pages = &node->pages[idx]; + } else { + pages = &tidbuf->pages[idx]; + } + hfi1_release_user_pages(fd->mm, pages, npages, mapped); + fd->tid_n_pinned -= npages; +} + +/** + * Pin receive buffer pages. + */ +static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) +{ + int pinned; + unsigned int npages; + unsigned long vaddr = tidbuf->vaddr; + struct page **pages = NULL; + struct hfi1_devdata *dd = fd->uctxt->dd; + + /* Get the number of pages the user buffer spans */ + npages = num_user_pages(vaddr, tidbuf->length); + if (!npages) + return -EINVAL; + + if (npages > fd->uctxt->expected_count) { + dd_dev_err(dd, "Expected buffer too big\n"); + return -EINVAL; + } + + /* Verify that access is OK for the user buffer */ + if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, + npages * PAGE_SIZE)) { + dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, npages); + return -EFAULT; + } + /* Allocate the array of struct page pointers needed for pinning */ + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + /* + * Pin all the pages of the user buffer. If we can't pin all the + * pages, accept the amount pinned so far and program only that. + * User space knows how to deal with partially programmed buffers. + */ + if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) { + kfree(pages); + return -ENOMEM; + } + + pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages); + if (pinned <= 0) { + kfree(pages); + return pinned; + } + tidbuf->pages = pages; + tidbuf->npages = npages; + fd->tid_n_pinned += pinned; + return pinned; +} + +/* + * RcvArray entry allocation for Expected Receives is done by the + * following algorithm: + * + * The context keeps 3 lists of groups of RcvArray entries: + * 1. 
List of empty groups - tid_group_list + * This list is created during user context creation and + * contains elements which describe sets (of 8) of empty + * RcvArray entries. + * 2. List of partially used groups - tid_used_list + * This list contains sets of RcvArray entries which are + * not completely used up. Another mapping request could + * use some of all of the remaining entries. + * 3. List of full groups - tid_full_list + * This is the list where sets that are completely used + * up go. + * + * An attempt to optimize the usage of RcvArray entries is + * made by finding all sets of physically contiguous pages in a + * user's buffer. + * These physically contiguous sets are further split into + * sizes supported by the receive engine of the HFI. The + * resulting sets of pages are stored in struct tid_pageset, + * which describes the sets as: + * * .count - number of pages in this set + * * .idx - starting index into struct page ** array + * of this set + * + * From this point on, the algorithm deals with the page sets + * described above. The number of pagesets is divided by the + * RcvArray group size to produce the number of full groups + * needed. + * + * Groups from the 3 lists are manipulated using the following + * rules: + * 1. For each set of 8 pagesets, a complete group from + * tid_group_list is taken, programmed, and moved to + * the tid_full_list list. + * 2. For all remaining pagesets: + * 2.1 If the tid_used_list is empty and the tid_group_list + * is empty, stop processing pageset and return only + * what has been programmed up to this point. + * 2.2 If the tid_used_list is empty and the tid_group_list + * is not empty, move a group from tid_group_list to + * tid_used_list. + * 2.3 For each group is tid_used_group, program as much as + * can fit into the group. If the group becomes fully + * used, move it to tid_full_list. + */ +int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo) +{ + int ret = 0, need_group = 0, pinned; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned int ngroups, pageidx = 0, pageset_count, + tididx = 0, mapped, mapped_pages = 0; + u32 *tidlist = NULL; + struct tid_user_buf *tidbuf; + + tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL); + if (!tidbuf) + return -ENOMEM; + + tidbuf->vaddr = tinfo->vaddr; + tidbuf->length = tinfo->length; + tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets), + GFP_KERNEL); + if (!tidbuf->psets) { + kfree(tidbuf); + return -ENOMEM; + } + + pinned = pin_rcv_pages(fd, tidbuf); + if (pinned <= 0) { + kfree(tidbuf->psets); + kfree(tidbuf); + return pinned; + } + + /* Find sets of physically contiguous pages */ + tidbuf->n_psets = find_phys_blocks(tidbuf, pinned); + + /* + * We don't need to access this under a lock since tid_used is per + * process and the same process cannot be in hfi1_user_exp_rcv_clear() + * and hfi1_user_exp_rcv_setup() at the same time. + */ + spin_lock(&fd->tid_lock); + if (fd->tid_used + tidbuf->n_psets > fd->tid_limit) + pageset_count = fd->tid_limit - fd->tid_used; + else + pageset_count = tidbuf->n_psets; + spin_unlock(&fd->tid_lock); + + if (!pageset_count) + goto bail; + + ngroups = pageset_count / dd->rcv_entries.group_size; + tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); + if (!tidlist) { + ret = -ENOMEM; + goto nomem; + } + + tididx = 0; + + /* + * From this point on, we are going to be using shared (between master + * and subcontexts) context resources. We need to take the lock. 
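A back-of-the-envelope rendering of the grouping arithmetic from the algorithm comment above: complete sets of 8 pagesets each retire one group from tid_group_list to tid_full_list, and whatever is left rides on a partially used group from tid_used_list. The group size of 8 comes from the comment; the pageset count is invented.

#include <stdio.h>

#define TOY_GROUP_SIZE 8    /* RcvArray entries per group */

int main(void)
{
    unsigned int psets = 25;    /* pagesets produced from the pinned buffer */
    unsigned int full = psets / TOY_GROUP_SIZE;
    unsigned int rest = psets % TOY_GROUP_SIZE;

    /* Rule 1: each complete set of 8 pagesets consumes one empty group
     * and retires it to tid_full_list. */
    printf("%u pagesets -> %u complete groups programmed\n", psets, full);

    /* Rule 2: the remainder lands on a group from tid_used_list, which
     * moves to tid_full_list only once all of its entries are in use. */
    printf("%u pagesets left for a partially used group\n", rest);
    return 0;
}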
+ */ + mutex_lock(&uctxt->exp_lock); + /* + * The first step is to program the RcvArray entries which are complete + * groups. + */ + while (ngroups && uctxt->tid_group_list.count) { + struct tid_group *grp = + tid_group_pop(&uctxt->tid_group_list); + + ret = program_rcvarray(fd, tidbuf, grp, + pageidx, dd->rcv_entries.group_size, + tidlist, &tididx, &mapped); + /* + * If there was a failure to program the RcvArray + * entries for the entire group, reset the grp fields + * and add the grp back to the free group list. + */ + if (ret <= 0) { + tid_group_add_tail(grp, &uctxt->tid_group_list); + hfi1_cdbg(TID, + "Failed to program RcvArray group %d", ret); + goto unlock; + } + + tid_group_add_tail(grp, &uctxt->tid_full_list); + ngroups--; + pageidx += ret; + mapped_pages += mapped; + } + + while (pageidx < pageset_count) { + struct tid_group *grp, *ptr; + /* + * If we don't have any partially used tid groups, check + * if we have empty groups. If so, take one from there and + * put in the partially used list. + */ + if (!uctxt->tid_used_list.count || need_group) { + if (!uctxt->tid_group_list.count) + goto unlock; + + grp = tid_group_pop(&uctxt->tid_group_list); + tid_group_add_tail(grp, &uctxt->tid_used_list); + need_group = 0; + } + /* + * There is an optimization opportunity here - instead of + * fitting as many page sets as we can, check for a group + * later on in the list that could fit all of them. + */ + list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list, + list) { + unsigned use = min_t(unsigned, pageset_count - pageidx, + grp->size - grp->used); + + ret = program_rcvarray(fd, tidbuf, grp, + pageidx, use, tidlist, + &tididx, &mapped); + if (ret < 0) { + hfi1_cdbg(TID, + "Failed to program RcvArray entries %d", + ret); + ret = -EFAULT; + goto unlock; + } else if (ret > 0) { + if (grp->used == grp->size) + tid_group_move(grp, + &uctxt->tid_used_list, + &uctxt->tid_full_list); + pageidx += ret; + mapped_pages += mapped; + need_group = 0; + /* Check if we are done so we break out early */ + if (pageidx >= pageset_count) + break; + } else if (WARN_ON(ret == 0)) { + /* + * If ret is 0, we did not program any entries + * into this group, which can only happen if + * we've screwed up the accounting somewhere. + * Warn and try to continue. + */ + need_group = 1; + } + } + } +unlock: + mutex_unlock(&uctxt->exp_lock); +nomem: + hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, + mapped_pages, ret); + if (tididx) { + spin_lock(&fd->tid_lock); + fd->tid_used += tididx; + spin_unlock(&fd->tid_lock); + tinfo->tidcnt = tididx; + tinfo->length = mapped_pages * PAGE_SIZE; + + if (copy_to_user(u64_to_user_ptr(tinfo->tidlist), + tidlist, sizeof(tidlist[0]) * tididx)) { + /* + * On failure to copy to the user level, we need to undo + * everything done so far so we don't leak resources. + */ + tinfo->tidlist = (unsigned long)&tidlist; + hfi1_user_exp_rcv_clear(fd, tinfo); + tinfo->tidlist = 0; + ret = -EFAULT; + goto bail; + } + } + + /* + * If not everything was mapped (due to insufficient RcvArray entries, + * for example), unpin all unmapped pages so we can pin them nex time. + */ + if (mapped_pages != pinned) + unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, + (pinned - mapped_pages), false); +bail: + kfree(tidbuf->psets); + kfree(tidlist); + kfree(tidbuf->pages); + kfree(tidbuf); + return ret > 0 ? 
0 : ret; +} + +int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo) +{ + int ret = 0; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + u32 *tidinfo; + unsigned tididx; + + if (unlikely(tinfo->tidcnt > fd->tid_used)) + return -EINVAL; + + tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist), + sizeof(tidinfo[0]) * tinfo->tidcnt); + if (IS_ERR(tidinfo)) + return PTR_ERR(tidinfo); + + mutex_lock(&uctxt->exp_lock); + for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { + ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL); + if (ret) { + hfi1_cdbg(TID, "Failed to unprogram rcv array %d", + ret); + break; + } + } + spin_lock(&fd->tid_lock); + fd->tid_used -= tididx; + spin_unlock(&fd->tid_lock); + tinfo->tidcnt = tididx; + mutex_unlock(&uctxt->exp_lock); + + kfree(tidinfo); + return ret; +} + +int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + unsigned long *ev = uctxt->dd->events + + (uctxt_offset(uctxt) + fd->subctxt); + u32 *array; + int ret = 0; + + /* + * copy_to_user() can sleep, which will leave the invalid_lock + * locked and cause the MMU notifier to be blocked on the lock + * for a long time. + * Copy the data to a local buffer so we can release the lock. + */ + array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL); + if (!array) + return -EFAULT; + + spin_lock(&fd->invalid_lock); + if (fd->invalid_tid_idx) { + memcpy(array, fd->invalid_tids, sizeof(*array) * + fd->invalid_tid_idx); + memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) * + fd->invalid_tid_idx); + tinfo->tidcnt = fd->invalid_tid_idx; + fd->invalid_tid_idx = 0; + /* + * Reset the user flag while still holding the lock. + * Otherwise, PSM can miss events. + */ + clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev); + } else { + tinfo->tidcnt = 0; + } + spin_unlock(&fd->invalid_lock); + + if (tinfo->tidcnt) { + if (copy_to_user((void __user *)tinfo->tidlist, + array, sizeof(*array) * tinfo->tidcnt)) + ret = -EFAULT; + } + kfree(array); + + return ret; +} + +static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages) +{ + unsigned pagecount, pageidx, setcount = 0, i; + unsigned long pfn, this_pfn; + struct page **pages = tidbuf->pages; + struct tid_pageset *list = tidbuf->psets; + + if (!npages) + return 0; + + /* + * Look for sets of physically contiguous pages in the user buffer. + * This will allow us to optimize Expected RcvArray entry usage by + * using the bigger supported sizes. + */ + pfn = page_to_pfn(pages[0]); + for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { + this_pfn = i < npages ? page_to_pfn(pages[i]) : 0; + + /* + * If the pfn's are not sequential, pages are not physically + * contiguous. + */ + if (this_pfn != ++pfn) { + /* + * At this point we have to loop over the set of + * physically contiguous pages and break them down it + * sizes supported by the HW. + * There are two main constraints: + * 1. The max buffer size is MAX_EXPECTED_BUFFER. + * If the total set size is bigger than that + * program only a MAX_EXPECTED_BUFFER chunk. + * 2. The buffer size has to be a power of two. If + * it is not, round down to the closes power of + * 2 and program that size. 
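A standalone sketch of the splitting rule spelled out above: cap each chunk at the maximum expected buffer size, otherwise round the chunk down to a power-of-two size. The 4 KiB page size and the 1 MiB cap are assumptions made for this example only; the driver takes both from its own headers.

#include <stdio.h>

#define TOY_PAGE_SHIFT  12                  /* assume 4 KiB pages */
#define TOY_PAGE_SIZE   (1u << TOY_PAGE_SHIFT)
#define TOY_MAX_BUFFER  (1024u * 1024u)     /* assumed MAX_EXPECTED_BUFFER */

/* Round a value (>= 1) down to a power of two. */
static unsigned int rounddown_pow2(unsigned int v)
{
    unsigned int p = 1;

    while (p <= v / 2)
        p <<= 1;
    return p;
}

int main(void)
{
    unsigned int pagecount = 13;    /* one physically contiguous run */

    /* Same loop shape as the find_phys_blocks() code just below. */
    while (pagecount) {
        unsigned int maxpages = pagecount;
        unsigned int bufsize = pagecount * TOY_PAGE_SIZE;

        if (bufsize > TOY_MAX_BUFFER)
            maxpages = TOY_MAX_BUFFER >> TOY_PAGE_SHIFT;
        else if (bufsize & (bufsize - 1))       /* not a power of two */
            maxpages = rounddown_pow2(bufsize) >> TOY_PAGE_SHIFT;

        printf("program %u pages\n", maxpages); /* 13 -> 8, 4, 1 */
        pagecount -= maxpages;
    }
    return 0;
}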
+ */ + while (pagecount) { + int maxpages = pagecount; + u32 bufsize = pagecount * PAGE_SIZE; + + if (bufsize > MAX_EXPECTED_BUFFER) + maxpages = + MAX_EXPECTED_BUFFER >> + PAGE_SHIFT; + else if (!is_power_of_2(bufsize)) + maxpages = + rounddown_pow_of_two(bufsize) >> + PAGE_SHIFT; + + list[setcount].idx = pageidx; + list[setcount].count = maxpages; + pagecount -= maxpages; + pageidx += maxpages; + setcount++; + } + pageidx = i; + pagecount = 1; + pfn = this_pfn; + } else { + pagecount++; + } + } + return setcount; +} + +/** + * program_rcvarray() - program an RcvArray group with receive buffers + * @fd: filedata pointer + * @tbuf: pointer to struct tid_user_buf that has the user buffer starting + * virtual address, buffer length, page pointers, pagesets (array of + * struct tid_pageset holding information on physically contiguous + * chunks from the user buffer), and other fields. + * @grp: RcvArray group + * @start: starting index into sets array + * @count: number of struct tid_pageset's to program + * @tidlist: the array of u32 elements when the information about the + * programmed RcvArray entries is to be encoded. + * @tididx: starting offset into tidlist + * @pmapped: (output parameter) number of pages programmed into the RcvArray + * entries. + * + * This function will program up to 'count' number of RcvArray entries from the + * group 'grp'. To make best use of write-combining writes, the function will + * perform writes to the unused RcvArray entries which will be ignored by the + * HW. Each RcvArray entry will be programmed with a physically contiguous + * buffer chunk from the user's virtual buffer. + * + * Return: + * -EINVAL if the requested count is larger than the size of the group, + * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or + * number of RcvArray entries programmed. + */ +static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf, + struct tid_group *grp, + unsigned int start, u16 count, + u32 *tidlist, unsigned int *tididx, + unsigned int *pmapped) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + u16 idx; + u32 tidinfo = 0, rcventry, useidx = 0; + int mapped = 0; + + /* Count should never be larger than the group size */ + if (count > grp->size) + return -EINVAL; + + /* Find the first unused entry in the group */ + for (idx = 0; idx < grp->size; idx++) { + if (!(grp->map & (1 << idx))) { + useidx = idx; + break; + } + rcv_array_wc_fill(dd, grp->base + idx); + } + + idx = 0; + while (idx < count) { + u16 npages, pageidx, setidx = start + idx; + int ret = 0; + + /* + * If this entry in the group is used, move to the next one. + * If we go past the end of the group, exit the loop. 
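The values handed back in tidlist pack an entry index, a pair-control value, and a page count, as the rcventry2tidinfo() / EXP_TID_SET() calls further down build them and unprogram_rcvarray() later takes them apart. A toy encode/decode pair showing only that round trip; the field positions here are made up, and the real masks and shifts live in the driver's headers.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed layout for illustration: LEN in the low bits, then CTRL, then IDX. */
#define TOY_LEN_BITS    11
#define TOY_CTRL_BITS   2

static uint32_t toy_encode(uint32_t rcventry, uint32_t npages)
{
    uint32_t idx = rcventry >> 1;               /* entries come in pairs */
    uint32_t ctrl = (rcventry & 1) ? 2 : 1;     /* which half of the pair */

    return (idx << (TOY_LEN_BITS + TOY_CTRL_BITS)) |
           (ctrl << TOY_LEN_BITS) | npages;
}

static void toy_decode(uint32_t tidinfo, uint32_t *rcventry, uint32_t *npages)
{
    uint32_t ctrl = (tidinfo >> TOY_LEN_BITS) & ((1u << TOY_CTRL_BITS) - 1);
    uint32_t idx = tidinfo >> (TOY_LEN_BITS + TOY_CTRL_BITS);

    /* Mirrors unprogram_rcvarray(): rcventry = (idx << 1) + ctrl - 1. */
    *rcventry = (idx << 1) + (ctrl - 1);
    *npages = tidinfo & ((1u << TOY_LEN_BITS) - 1);
}

int main(void)
{
    uint32_t rcventry, npages;
    uint32_t info = toy_encode(37, 8);  /* entry 37 relative to the base, 8 pages */

    toy_decode(info, &rcventry, &npages);
    assert(rcventry == 37 && npages == 8);
    printf("tidinfo 0x%x -> entry %u, %u pages\n",
           (unsigned)info, (unsigned)rcventry, (unsigned)npages);
    return 0;
}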
+ */ + if (useidx >= grp->size) { + break; + } else if (grp->map & (1 << useidx)) { + rcv_array_wc_fill(dd, grp->base + useidx); + useidx++; + continue; + } + + rcventry = grp->base + useidx; + npages = tbuf->psets[setidx].count; + pageidx = tbuf->psets[setidx].idx; + + ret = set_rcvarray_entry(fd, tbuf, + rcventry, grp, pageidx, + npages); + if (ret) + return ret; + mapped += npages; + + tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) | + EXP_TID_SET(LEN, npages); + tidlist[(*tididx)++] = tidinfo; + grp->used++; + grp->map |= 1 << useidx++; + idx++; + } + + /* Fill the rest of the group with "blank" writes */ + for (; useidx < grp->size; useidx++) + rcv_array_wc_fill(dd, grp->base + useidx); + *pmapped = mapped; + return idx; +} + +static int set_rcvarray_entry(struct hfi1_filedata *fd, + struct tid_user_buf *tbuf, + u32 rcventry, struct tid_group *grp, + u16 pageidx, unsigned int npages) +{ + int ret; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct tid_rb_node *node; + struct hfi1_devdata *dd = uctxt->dd; + dma_addr_t phys; + struct page **pages = tbuf->pages + pageidx; + + /* + * Allocate the node first so we can handle a potential + * failure before we've programmed anything. + */ + node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages), + GFP_KERNEL); + if (!node) + return -ENOMEM; + + phys = pci_map_single(dd->pcidev, + __va(page_to_phys(pages[0])), + npages * PAGE_SIZE, PCI_DMA_FROMDEVICE); + if (dma_mapping_error(&dd->pcidev->dev, phys)) { + dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n", + phys); + kfree(node); + return -EFAULT; + } + + node->mmu.addr = tbuf->vaddr + (pageidx * PAGE_SIZE); + node->mmu.len = npages * PAGE_SIZE; + node->phys = page_to_phys(pages[0]); + node->npages = npages; + node->rcventry = rcventry; + node->dma_addr = phys; + node->grp = grp; + node->freed = false; + memcpy(node->pages, pages, sizeof(struct page *) * npages); + + if (!fd->handler) + ret = tid_rb_insert(fd, &node->mmu); + else + ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu); + + if (ret) { + hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d", + node->rcventry, node->mmu.addr, node->phys, ret); + pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE, + PCI_DMA_FROMDEVICE); + kfree(node); + return -EFAULT; + } + hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1); + trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages, + node->mmu.addr, node->phys, phys); + return 0; +} + +static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, + struct tid_group **grp) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + struct tid_rb_node *node; + u8 tidctrl = EXP_TID_GET(tidinfo, CTRL); + u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry; + + if (tididx >= uctxt->expected_count) { + dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n", + tididx, uctxt->ctxt); + return -EINVAL; + } + + if (tidctrl == 0x3) + return -EINVAL; + + rcventry = tididx + (tidctrl - 1); + + node = fd->entry_to_rb[rcventry]; + if (!node || node->rcventry != (uctxt->expected_base + rcventry)) + return -EBADF; + + if (grp) + *grp = node->grp; + + if (!fd->handler) + cacheless_tid_rb_remove(fd, node); + else + hfi1_mmu_rb_remove(fd->handler, &node->mmu); + + return 0; +} + +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + + trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry, + 
node->npages, node->mmu.addr, node->phys, + node->dma_addr); + + /* + * Make sure device has seen the write before we unpin the + * pages. + */ + hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0); + + unpin_rcv_pages(fd, NULL, node, 0, node->npages, true); + + node->grp->used--; + node->grp->map &= ~(1 << (node->rcventry - node->grp->base)); + + if (node->grp->used == node->grp->size - 1) + tid_group_move(node->grp, &uctxt->tid_full_list, + &uctxt->tid_used_list); + else if (!node->grp->used) + tid_group_move(node->grp, &uctxt->tid_used_list, + &uctxt->tid_group_list); + kfree(node); +} + +/* + * As a simple helper for hfi1_user_exp_rcv_free, this function deals with + * clearing nodes in the non-cached case. + */ +static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, + struct exp_tid_set *set, + struct hfi1_filedata *fd) +{ + struct tid_group *grp, *ptr; + int i; + + list_for_each_entry_safe(grp, ptr, &set->list, list) { + list_del_init(&grp->list); + + for (i = 0; i < grp->size; i++) { + if (grp->map & (1 << i)) { + u16 rcventry = grp->base + i; + struct tid_rb_node *node; + + node = fd->entry_to_rb[rcventry - + uctxt->expected_base]; + if (!node || node->rcventry != rcventry) + continue; + + cacheless_tid_rb_remove(fd, node); + } + } + } +} + +/* + * Always return 0 from this function. A non-zero return indicates that the + * remove operation will be called and that memory should be unpinned. + * However, the driver cannot unpin out from under PSM. Instead, retain the + * memory (by returning 0) and inform PSM that the memory is going away. PSM + * will call back later when it has removed the memory from its list. + */ +static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode) +{ + struct hfi1_filedata *fdata = arg; + struct hfi1_ctxtdata *uctxt = fdata->uctxt; + struct tid_rb_node *node = + container_of(mnode, struct tid_rb_node, mmu); + + if (node->freed) + return 0; + + trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr, + node->rcventry, node->npages, node->dma_addr); + node->freed = true; + + spin_lock(&fdata->invalid_lock); + if (fdata->invalid_tid_idx < uctxt->expected_count) { + fdata->invalid_tids[fdata->invalid_tid_idx] = + rcventry2tidinfo(node->rcventry - uctxt->expected_base); + fdata->invalid_tids[fdata->invalid_tid_idx] |= + EXP_TID_SET(LEN, node->npages); + if (!fdata->invalid_tid_idx) { + unsigned long *ev; + + /* + * hfi1_set_uevent_bits() sets a user event flag + * for all processes. Because calling into the + * driver to process TID cache invalidations is + * expensive and TID cache invalidations are + * handled on a per-process basis, we can + * optimize this to set the flag only for the + * process in question. 
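A condensed model of the bookkeeping that follows: each invalidated TID is queued in a per-file array, and the notification flag is raised only when the array goes from empty to non-empty, which is the per-process optimization described above. The structure name, array size, and the plain bool standing in for the event bit are all invented for this sketch.

#include <stdbool.h>
#include <stdio.h>

#define TOY_EXPECTED_COUNT 4

struct toy_filedata {
    unsigned int invalid_tids[TOY_EXPECTED_COUNT];
    unsigned int invalid_tid_idx;
    bool event_pending;         /* stands in for the per-process event bit */
};

static void toy_invalidate(struct toy_filedata *fd, unsigned int tidinfo)
{
    if (fd->invalid_tid_idx >= TOY_EXPECTED_COUNT)
        return;                 /* bounded, as in the driver */
    fd->invalid_tids[fd->invalid_tid_idx] = tidinfo;
    if (!fd->invalid_tid_idx)
        fd->event_pending = true;   /* raise the event once, on first entry */
    fd->invalid_tid_idx++;
}

int main(void)
{
    struct toy_filedata fd = { { 0 }, 0, false };

    toy_invalidate(&fd, 0x1234);
    toy_invalidate(&fd, 0x5678);
    printf("%u invalid TIDs queued, event_pending=%d\n",
           fd.invalid_tid_idx, (int)fd.event_pending);
    return 0;
}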
+ */ + ev = uctxt->dd->events + + (uctxt_offset(uctxt) + fdata->subctxt); + set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev); + } + fdata->invalid_tid_idx++; + } + spin_unlock(&fdata->invalid_lock); + return 0; +} + +static int tid_rb_insert(void *arg, struct mmu_rb_node *node) +{ + struct hfi1_filedata *fdata = arg; + struct tid_rb_node *tnode = + container_of(node, struct tid_rb_node, mmu); + u32 base = fdata->uctxt->expected_base; + + fdata->entry_to_rb[tnode->rcventry - base] = tnode; + return 0; +} + +static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, + struct tid_rb_node *tnode) +{ + u32 base = fdata->uctxt->expected_base; + + fdata->entry_to_rb[tnode->rcventry - base] = NULL; + clear_tid_node(fdata, tnode); +} + +static void tid_rb_remove(void *arg, struct mmu_rb_node *node) +{ + struct hfi1_filedata *fdata = arg; + struct tid_rb_node *tnode = + container_of(node, struct tid_rb_node, mmu); + + cacheless_tid_rb_remove(fdata, tnode); +} diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h new file mode 100644 index 0000000..e383cc0 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -0,0 +1,98 @@ +#ifndef _HFI1_USER_EXP_RCV_H +#define _HFI1_USER_EXP_RCV_H +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include "hfi.h" + +#include "exp_rcv.h" + +struct tid_pageset { + u16 idx; + u16 count; +}; + +struct tid_user_buf { + unsigned long vaddr; + unsigned long length; + unsigned int npages; + struct page **pages; + struct tid_pageset *psets; + unsigned int n_psets; +}; + +struct tid_rb_node { + struct mmu_rb_node mmu; + unsigned long phys; + struct tid_group *grp; + u32 rcventry; + dma_addr_t dma_addr; + bool freed; + unsigned int npages; + struct page *pages[0]; +}; + +static inline int num_user_pages(unsigned long addr, + unsigned long len) +{ + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); +void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd); +int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo); +int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo); +int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo); + +#endif /* _HFI1_USER_EXP_RCV_H */ diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c new file mode 100644 index 0000000..e341e6d --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -0,0 +1,137 @@ +/* + * Copyright(c) 2015-2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
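Looking back at num_user_pages() in the header above: it counts every page the [addr, addr + len) range touches, including partially covered first and last pages. A quick standalone check of that arithmetic, assuming 4 KiB pages; the toy_ names exist only in this sketch.

#include <assert.h>
#include <stdio.h>

#define TOY_PAGE_SHIFT  12
#define TOY_PAGE_SIZE   (1ul << TOY_PAGE_SHIFT)
#define TOY_PAGE_MASK   (~(TOY_PAGE_SIZE - 1))

static int toy_num_user_pages(unsigned long addr, unsigned long len)
{
    const unsigned long spage = addr & TOY_PAGE_MASK;
    const unsigned long epage = (addr + len - 1) & TOY_PAGE_MASK;

    return 1 + (int)((epage - spage) >> TOY_PAGE_SHIFT);
}

int main(void)
{
    /* 8 KiB starting 256 bytes into a page straddles 3 pages. */
    assert(toy_num_user_pages(0x1000 + 256, 8192) == 3);
    /* A page-aligned 8 KiB buffer needs exactly 2 pages. */
    assert(toy_num_user_pages(0x2000, 8192) == 2);
    printf("page-count checks passed\n");
    return 0;
}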
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include + +#include "hfi.h" + +static unsigned long cache_size = 256; +module_param(cache_size, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)"); + +/* + * Determine whether the caller can pin pages. + * + * This function should be used in the implementation of buffer caches. + * The cache implementation should call this function prior to attempting + * to pin buffer pages in order to determine whether they should do so. + * The function computes cache limits based on the configured ulimit and + * cache size. Use of this function is especially important for caches + * which are not limited in any other way (e.g. by HW resources) and, thus, + * could keeping caching buffers. + * + */ +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, + u32 nlocked, u32 npages) +{ + unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit, + size = (cache_size * (1UL << 20)); /* convert to bytes */ + unsigned int usr_ctxts = + dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt; + bool can_lock = capable(CAP_IPC_LOCK); + + /* + * Calculate per-cache size. The calculation below uses only a quarter + * of the available per-context limit. This leaves space for other + * pinning. Should we worry about shared ctxts? + */ + cache_limit = (ulimit / usr_ctxts) / 4; + + /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */ + if (ulimit != (-1UL) && size > cache_limit) + size = cache_limit; + + /* Convert to number of pages */ + size = DIV_ROUND_UP(size, PAGE_SIZE); + + down_read(&mm->mmap_sem); + pinned = mm->pinned_vm; + up_read(&mm->mmap_sem); + + /* First, check the absolute limit against all pinned pages. */ + if (pinned + npages >= ulimit && !can_lock) + return false; + + return ((nlocked + npages) <= size) || can_lock; +} + +int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages, + bool writable, struct page **pages) +{ + int ret; + + ret = get_user_pages_fast(vaddr, npages, writable, pages); + if (ret < 0) + return ret; + + down_write(&mm->mmap_sem); + mm->pinned_vm += ret; + up_write(&mm->mmap_sem); + + return ret; +} + +void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, + size_t npages, bool dirty) +{ + size_t i; + + for (i = 0; i < npages; i++) { + if (dirty) + set_page_dirty_lock(p[i]); + put_page(p[i]); + } + + if (mm) { /* during close after signal, mm can be NULL */ + down_write(&mm->mmap_sem); + mm->pinned_vm -= npages; + up_write(&mm->mmap_sem); + } +} diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c new file mode 100644 index 0000000..a3a7b33 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -0,0 +1,1533 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. 
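Returning to hfi1_can_pin_pages() above: the budget is a quarter of the per-context RLIMIT_MEMLOCK share, further capped by the cache_size module parameter and then converted to pages. A simplified userspace rendering of that calculation; the 64 MiB limit, 8 contexts, and 256 MB cache_size are made-up inputs.

#include <stdio.h>

#define TOY_PAGE_SIZE 4096ul

/* Per-cache page budget from the memlock ulimit (bytes), the number of
 * user contexts, and the cache_size parameter (MB). */
static unsigned long toy_cache_pages(unsigned long ulimit_bytes,
                                     unsigned int usr_ctxts,
                                     unsigned long cache_size_mb)
{
    unsigned long size = cache_size_mb << 20;               /* MB -> bytes */
    unsigned long cache_limit = (ulimit_bytes / usr_ctxts) / 4;

    if (ulimit_bytes != (unsigned long)-1 && size > cache_limit)
        size = cache_limit;
    return (size + TOY_PAGE_SIZE - 1) / TOY_PAGE_SIZE;      /* DIV_ROUND_UP */
}

int main(void)
{
    /* 64 MiB memlock limit over 8 user contexts: the quarter share is
     * 2 MiB, which undercuts cache_size = 256 MB -> 512 pages. */
    printf("%lu pages\n", toy_cache_pages(64ul << 20, 8, 256));
    return 0;
}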
+ * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "sdma.h" +#include "mmu_rb.h" +#include "user_sdma.h" +#include "verbs.h" /* for the headers */ +#include "common.h" /* for struct hfi1_tid_info */ +#include "trace.h" + +static uint hfi1_sdma_comp_ring_size = 128; +module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. 
Default: 128"); + +static unsigned initial_pkt_count = 8; + +static int user_sdma_send_pkts(struct user_sdma_request *req, + unsigned maxpkts); +static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); +static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); +static void user_sdma_free_request(struct user_sdma_request *req, bool unpin); +static int pin_vector_pages(struct user_sdma_request *req, + struct user_sdma_iovec *iovec); +static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, + unsigned start, unsigned npages); +static int check_header_template(struct user_sdma_request *req, + struct hfi1_pkt_header *hdr, u32 lrhlen, + u32 datalen); +static int set_txreq_header(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 datalen); +static int set_txreq_header_ahg(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 len); +static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, + struct hfi1_user_sdma_comp_q *cq, + u16 idx, enum hfi1_sdma_comp_state state, + int ret); +static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags); +static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); + +static int defer_packet_queue( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *txreq, + uint seq, + bool pkts_sent); +static void activate_packet_queue(struct iowait *wait, int reason); +static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, + unsigned long len); +static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode); +static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, + void *arg2, bool *stop); +static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); +static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode); + +static struct mmu_rb_ops sdma_rb_ops = { + .filter = sdma_rb_filter, + .insert = sdma_rb_insert, + .evict = sdma_rb_evict, + .remove = sdma_rb_remove, + .invalidate = sdma_rb_invalidate +}; + +static int defer_packet_queue( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *txreq, + uint seq, + bool pkts_sent) +{ + struct hfi1_user_sdma_pkt_q *pq = + container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + struct hfi1_ibdev *dev = &pq->dd->verbs_dev; + struct user_sdma_txreq *tx = + container_of(txreq, struct user_sdma_txreq, txreq); + + if (sdma_progress(sde, seq, txreq)) { + if (tx->busycount++ < MAX_DEFER_RETRY_COUNT) + goto eagain; + } + /* + * We are assuming that if the list is enqueued somewhere, it + * is to the dmawait list since that is the only place where + * it is supposed to be enqueued. 
+ */ + xchg(&pq->state, SDMA_PKT_Q_DEFERRED); + write_seqlock(&dev->iowait_lock); + if (list_empty(&pq->busy.list)) + iowait_queue(pkts_sent, &pq->busy, &sde->dmawait); + write_sequnlock(&dev->iowait_lock); + return -EBUSY; +eagain: + return -EAGAIN; +} + +static void activate_packet_queue(struct iowait *wait, int reason) +{ + struct hfi1_user_sdma_pkt_q *pq = + container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + xchg(&pq->state, SDMA_PKT_Q_ACTIVE); + wake_up(&wait->wait_dma); +}; + +int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, + struct hfi1_filedata *fd) +{ + int ret = -ENOMEM; + char buf[64]; + struct hfi1_devdata *dd; + struct hfi1_user_sdma_comp_q *cq; + struct hfi1_user_sdma_pkt_q *pq; + + if (!uctxt || !fd) + return -EBADF; + + if (!hfi1_sdma_comp_ring_size) + return -EINVAL; + + dd = uctxt->dd; + + pq = kzalloc(sizeof(*pq), GFP_KERNEL); + if (!pq) + return -ENOMEM; + + pq->dd = dd; + pq->ctxt = uctxt->ctxt; + pq->subctxt = fd->subctxt; + pq->n_max_reqs = hfi1_sdma_comp_ring_size; + pq->state = SDMA_PKT_Q_INACTIVE; + atomic_set(&pq->n_reqs, 0); + init_waitqueue_head(&pq->wait); + atomic_set(&pq->n_locked, 0); + pq->mm = fd->mm; + + iowait_init(&pq->busy, 0, NULL, defer_packet_queue, + activate_packet_queue, NULL); + pq->reqidx = 0; + + pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, + sizeof(*pq->reqs), + GFP_KERNEL); + if (!pq->reqs) + goto pq_reqs_nomem; + + pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size), + sizeof(*pq->req_in_use), + GFP_KERNEL); + if (!pq->req_in_use) + goto pq_reqs_no_in_use; + + snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, + fd->subctxt); + pq->txreq_cache = kmem_cache_create(buf, + sizeof(struct user_sdma_txreq), + L1_CACHE_BYTES, + SLAB_HWCACHE_ALIGN, + NULL); + if (!pq->txreq_cache) { + dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", + uctxt->ctxt); + goto pq_txreq_nomem; + } + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + goto cq_nomem; + + cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps) + * hfi1_sdma_comp_ring_size)); + if (!cq->comps) + goto cq_comps_nomem; + + cq->nentries = hfi1_sdma_comp_ring_size; + + ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq, + &pq->handler); + if (ret) { + dd_dev_err(dd, "Failed to register with MMU %d", ret); + goto pq_mmu_fail; + } + + fd->pq = pq; + fd->cq = cq; + + return 0; + +pq_mmu_fail: + vfree(cq->comps); +cq_comps_nomem: + kfree(cq); +cq_nomem: + kmem_cache_destroy(pq->txreq_cache); +pq_txreq_nomem: + kfree(pq->req_in_use); +pq_reqs_no_in_use: + kfree(pq->reqs); +pq_reqs_nomem: + kfree(pq); + + return ret; +} + +int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt) +{ + struct hfi1_user_sdma_pkt_q *pq; + + trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt); + + pq = fd->pq; + if (pq) { + if (pq->handler) + hfi1_mmu_rb_unregister(pq->handler); + iowait_sdma_drain(&pq->busy); + /* Wait until all requests have been freed. 
*/ + wait_event_interruptible( + pq->wait, + (READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); + kfree(pq->reqs); + kfree(pq->req_in_use); + kmem_cache_destroy(pq->txreq_cache); + kfree(pq); + fd->pq = NULL; + } + if (fd->cq) { + vfree(fd->cq->comps); + kfree(fd->cq); + fd->cq = NULL; + } + return 0; +} + +static u8 dlid_to_selector(u16 dlid) +{ + static u8 mapping[256]; + static int initialized; + static u8 next; + int hash; + + if (!initialized) { + memset(mapping, 0xFF, 256); + initialized = 1; + } + + hash = ((dlid >> 8) ^ dlid) & 0xFF; + if (mapping[hash] == 0xFF) { + mapping[hash] = next; + next = (next + 1) & 0x7F; + } + + return mapping[hash]; +} + +int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, + struct iovec *iovec, unsigned long dim, + unsigned long *count) +{ + int ret = 0, i; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_user_sdma_pkt_q *pq = fd->pq; + struct hfi1_user_sdma_comp_q *cq = fd->cq; + struct hfi1_devdata *dd = pq->dd; + unsigned long idx = 0; + u8 pcount = initial_pkt_count; + struct sdma_req_info info; + struct user_sdma_request *req; + u8 opcode, sc, vl; + u16 pkey; + u32 slid; + int req_queued = 0; + u16 dlid; + u32 selector; + + if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { + hfi1_cdbg( + SDMA, + "[%u:%u:%u] First vector not big enough for header %lu/%lu", + dd->unit, uctxt->ctxt, fd->subctxt, + iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr)); + return -EINVAL; + } + ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info)); + if (ret) { + hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)", + dd->unit, uctxt->ctxt, fd->subctxt, ret); + return -EFAULT; + } + + trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, + (u16 *)&info); + if (info.comp_idx >= hfi1_sdma_comp_ring_size) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Invalid comp index", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); + return -EINVAL; + } + + /* + * Sanity check the header io vector count. Need at least 1 vector + * (header) and cannot be larger than the actual io vector count. + */ + if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Invalid iov count %d, dim %ld", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx, + req_iovcnt(info.ctrl), dim); + return -EINVAL; + } + + if (!info.fragsize) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Request does not specify fragsize", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); + return -EINVAL; + } + + /* Try to claim the request. */ + if (test_and_set_bit(info.comp_idx, pq->req_in_use)) { + hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use", + dd->unit, uctxt->ctxt, fd->subctxt, + info.comp_idx); + return -EBADSLT; + } + /* + * All safety checks have been done and this request has been claimed. 
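The completion index doubles as a slot claim: test_and_set_bit() on req_in_use checks and reserves the slot in a single step, so a second submission reusing the same comp_idx is rejected. A single-threaded illustration of the same idea on a plain bitmap; the kernel helper is atomic, which this sketch deliberately does not model.

#include <stdio.h>

#define TOY_RING_SIZE     128
#define TOY_BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long toy_in_use[TOY_RING_SIZE / TOY_BITS_PER_LONG];

/* Return the previous value of the bit, then set it. */
static int toy_test_and_set(unsigned int idx)
{
    unsigned long *word = &toy_in_use[idx / TOY_BITS_PER_LONG];
    unsigned long mask = 1ul << (idx % TOY_BITS_PER_LONG);
    int old = !!(*word & mask);

    *word |= mask;
    return old;
}

int main(void)
{
    printf("first claim of slot 5: %s\n",
           toy_test_and_set(5) ? "already in use" : "claimed");
    printf("second claim of slot 5: %s\n",
           toy_test_and_set(5) ? "already in use" : "claimed");
    return 0;
}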
+ */ + trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt, + info.comp_idx); + req = pq->reqs + info.comp_idx; + req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ + req->data_len = 0; + req->pq = pq; + req->cq = cq; + req->status = -1; + req->ahg_idx = -1; + req->iov_idx = 0; + req->sent = 0; + req->seqnum = 0; + req->seqcomp = 0; + req->seqsubmitted = 0; + req->tids = NULL; + req->done = 0; + req->has_error = 0; + INIT_LIST_HEAD(&req->txps); + + memcpy(&req->info, &info, sizeof(info)); + + if (req_opcode(info.ctrl) == EXPECTED) { + /* expected must have a TID info and at least one data vector */ + if (req->data_iovs < 2) { + SDMA_DBG(req, + "Not enough vectors for expected request"); + ret = -EINVAL; + goto free_req; + } + req->data_iovs--; + } + + if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { + SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, + MAX_VECTORS_PER_REQ); + ret = -EINVAL; + goto free_req; + } + /* Copy the header from the user buffer */ + ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), + sizeof(req->hdr)); + if (ret) { + SDMA_DBG(req, "Failed to copy header template (%d)", ret); + ret = -EFAULT; + goto free_req; + } + + /* If Static rate control is not enabled, sanitize the header. */ + if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL)) + req->hdr.pbc[2] = 0; + + /* Validate the opcode. Do not trust packets from user space blindly. */ + opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff; + if ((opcode & USER_OPCODE_CHECK_MASK) != + USER_OPCODE_CHECK_VAL) { + SDMA_DBG(req, "Invalid opcode (%d)", opcode); + ret = -EINVAL; + goto free_req; + } + /* + * Validate the vl. Do not trust packets from user space blindly. + * VL comes from PBC, SC comes from LRH, and the VL needs to + * match the SC look up. + */ + vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF; + sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) | + (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4)); + if (vl >= dd->pport->vls_operational || + vl != sc_to_vlt(dd, sc)) { + SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl); + ret = -EINVAL; + goto free_req; + } + + /* Checking P_KEY for requests from user-space */ + pkey = (u16)be32_to_cpu(req->hdr.bth[0]); + slid = be16_to_cpu(req->hdr.lrh[3]); + if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) { + ret = -EINVAL; + goto free_req; + } + + /* + * Also should check the BTH.lnh. If it says the next header is GRH then + * the RXE parsing will be off and will land in the middle of the KDETH + * or miss it entirely. + */ + if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) { + SDMA_DBG(req, "User tried to pass in a GRH"); + ret = -EINVAL; + goto free_req; + } + + req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); + /* + * Calculate the initial TID offset based on the values of + * KDETH.OFFSET and KDETH.OM that are passed in. + */ + req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * + (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 
+ KDETH_OM_LARGE : KDETH_OM_SMALL); + trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt, + info.comp_idx, req->tidoffset); + idx++; + + /* Save all the IO vector structures */ + for (i = 0; i < req->data_iovs; i++) { + req->iovs[i].offset = 0; + INIT_LIST_HEAD(&req->iovs[i].list); + memcpy(&req->iovs[i].iov, + iovec + idx++, + sizeof(req->iovs[i].iov)); + ret = pin_vector_pages(req, &req->iovs[i]); + if (ret) { + req->data_iovs = i; + req->status = ret; + goto free_req; + } + req->data_len += req->iovs[i].iov.iov_len; + } + trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt, + info.comp_idx, req->data_len); + if (pcount > req->info.npkts) + pcount = req->info.npkts; + /* + * Copy any TID info + * User space will provide the TID info only when the + * request type is EXPECTED. This is true even if there is + * only one packet in the request and the header is already + * setup. The reason for the singular TID case is that the + * driver needs to perform safety checks. + */ + if (req_opcode(req->info.ctrl) == EXPECTED) { + u16 ntids = iovec[idx].iov_len / sizeof(*req->tids); + u32 *tmp; + + if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) { + ret = -EINVAL; + goto free_req; + } + + /* + * We have to copy all of the tids because they may vary + * in size and, therefore, the TID count might not be + * equal to the pkt count. However, there is no way to + * tell at this point. + */ + tmp = memdup_user(iovec[idx].iov_base, + ntids * sizeof(*req->tids)); + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + SDMA_DBG(req, "Failed to copy %d TIDs (%d)", + ntids, ret); + goto free_req; + } + req->tids = tmp; + req->n_tids = ntids; + req->tididx = 0; + idx++; + } + + dlid = be16_to_cpu(req->hdr.lrh[1]); + selector = dlid_to_selector(dlid); + selector += uctxt->ctxt + fd->subctxt; + req->sde = sdma_select_user_engine(dd, selector, vl); + + if (!req->sde || !sdma_running(req->sde)) { + ret = -ECOMM; + goto free_req; + } + + /* We don't need an AHG entry if the request contains only one packet */ + if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) + req->ahg_idx = sdma_ahg_alloc(req->sde); + + set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); + atomic_inc(&pq->n_reqs); + req_queued = 1; + /* Send the first N packets in the request to buy us some time */ + ret = user_sdma_send_pkts(req, pcount); + if (unlikely(ret < 0 && ret != -EBUSY)) { + req->status = ret; + goto free_req; + } + + /* + * It is possible that the SDMA engine would have processed all the + * submitted packets by the time we get here. Therefore, only set + * packet queue state to ACTIVE if there are still uncompleted + * requests. + */ + if (atomic_read(&pq->n_reqs)) + xchg(&pq->state, SDMA_PKT_Q_ACTIVE); + + /* + * This is a somewhat blocking send implementation. + * The driver will block the caller until all packets of the + * request have been submitted to the SDMA engine. However, it + * will not wait for send completions. 
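The loop that follows is the "somewhat blocking" part: keep handing packets to the engine until the whole request is queued, sleeping rather than failing when the ring pushes back with -EBUSY. A stripped-down model of that control flow; toy_send_pkts() is a stand-in that pretends the engine takes at most three packets per call.

#include <stdio.h>

/* Stand-in for user_sdma_send_pkts(): accept at most 3 packets per call. */
static unsigned int toy_send_pkts(unsigned int submitted, unsigned int npkts)
{
    unsigned int burst = 3;

    if (submitted + burst > npkts)
        burst = npkts - submitted;
    return burst;
}

int main(void)
{
    unsigned int submitted = 0, npkts = 10, calls = 0;

    /* Same shape as the driver loop: submit until every packet of the
     * request has been queued.  The driver additionally sleeps on the
     * packet queue when the engine reports -EBUSY instead of spinning. */
    while (submitted != npkts) {
        submitted += toy_send_pkts(submitted, npkts);
        calls++;
    }
    printf("queued %u packets in %u calls\n", npkts, calls);
    return 0;
}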
+ */ + while (req->seqsubmitted != req->info.npkts) { + ret = user_sdma_send_pkts(req, pcount); + if (ret < 0) { + if (ret != -EBUSY) { + req->status = ret; + WRITE_ONCE(req->has_error, 1); + if (READ_ONCE(req->seqcomp) == + req->seqsubmitted - 1) + goto free_req; + return ret; + } + wait_event_interruptible_timeout( + pq->busy.wait_dma, + (pq->state == SDMA_PKT_Q_ACTIVE), + msecs_to_jiffies( + SDMA_IOWAIT_TIMEOUT)); + } + } + *count += idx; + return 0; +free_req: + user_sdma_free_request(req, true); + if (req_queued) + pq_update(pq); + set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); + return ret; +} + +static inline u32 compute_data_length(struct user_sdma_request *req, + struct user_sdma_txreq *tx) +{ + /* + * Determine the proper size of the packet data. + * The size of the data of the first packet is in the header + * template. However, it includes the header and ICRC, which need + * to be subtracted. + * The minimum representable packet data length in a header is 4 bytes, + * therefore, when the data length request is less than 4 bytes, there's + * only one packet, and the packet data length is equal to that of the + * request data length. + * The size of the remaining packets is the minimum of the frag + * size (MTU) or remaining data in the request. + */ + u32 len; + + if (!req->seqnum) { + if (req->data_len < sizeof(u32)) + len = req->data_len; + else + len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - + (sizeof(tx->hdr) - 4)); + } else if (req_opcode(req->info.ctrl) == EXPECTED) { + u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * + PAGE_SIZE; + /* + * Get the data length based on the remaining space in the + * TID pair. + */ + len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); + /* If we've filled up the TID pair, move to the next one. */ + if (unlikely(!len) && ++req->tididx < req->n_tids && + req->tids[req->tididx]) { + tidlen = EXP_TID_GET(req->tids[req->tididx], + LEN) * PAGE_SIZE; + req->tidoffset = 0; + len = min_t(u32, tidlen, req->info.fragsize); + } + /* + * Since the TID pairs map entire pages, make sure that we + * are not going to try to send more data that we have + * remaining. + */ + len = min(len, req->data_len - req->sent); + } else { + len = min(req->data_len - req->sent, (u32)req->info.fragsize); + } + trace_hfi1_sdma_user_compute_length(req->pq->dd, + req->pq->ctxt, + req->pq->subctxt, + req->info.comp_idx, + len); + return len; +} + +static inline u32 pad_len(u32 len) +{ + if (len & (sizeof(u32) - 1)) + len += sizeof(u32) - (len & (sizeof(u32) - 1)); + return len; +} + +static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) +{ + /* (Size of complete header - size of PBC) + 4B ICRC + data length */ + return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); +} + +static int user_sdma_txadd_ahg(struct user_sdma_request *req, + struct user_sdma_txreq *tx, + u32 datalen) +{ + int ret; + u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); + u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen)); + struct hfi1_user_sdma_pkt_q *pq = req->pq; + + /* + * Copy the request header into the tx header + * because the HW needs a cacheline-aligned + * address. + * This copy can be optimized out if the hdr + * member of user_sdma_request were also + * cacheline aligned. 
+ */ + memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); + if (PBC2LRH(pbclen) != lrhlen) { + pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); + tx->hdr.pbc[0] = cpu_to_le16(pbclen); + } + ret = check_header_template(req, &tx->hdr, lrhlen, datalen); + if (ret) + return ret; + ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY, + sizeof(tx->hdr) + datalen, req->ahg_idx, + 0, NULL, 0, user_sdma_txreq_cb); + if (ret) + return ret; + ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr)); + if (ret) + sdma_txclean(pq->dd, &tx->txreq); + return ret; +} + +static int user_sdma_txadd(struct user_sdma_request *req, + struct user_sdma_txreq *tx, + struct user_sdma_iovec *iovec, u32 datalen, + u32 *queued_ptr, u32 *data_sent_ptr, + u64 *iov_offset_ptr) +{ + int ret; + unsigned int pageidx, len; + unsigned long base, offset; + u64 iov_offset = *iov_offset_ptr; + u32 queued = *queued_ptr, data_sent = *data_sent_ptr; + struct hfi1_user_sdma_pkt_q *pq = req->pq; + + base = (unsigned long)iovec->iov.iov_base; + offset = offset_in_page(base + iovec->offset + iov_offset); + pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >> + PAGE_SHIFT); + len = offset + req->info.fragsize > PAGE_SIZE ? + PAGE_SIZE - offset : req->info.fragsize; + len = min((datalen - queued), len); + ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx], + offset, len); + if (ret) { + SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret); + return ret; + } + iov_offset += len; + queued += len; + data_sent += len; + if (unlikely(queued < datalen && pageidx == iovec->npages && + req->iov_idx < req->data_iovs - 1)) { + iovec->offset += iov_offset; + iovec = &req->iovs[++req->iov_idx]; + iov_offset = 0; + } + + *queued_ptr = queued; + *data_sent_ptr = data_sent; + *iov_offset_ptr = iov_offset; + return ret; +} + +static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) +{ + int ret = 0, count; + unsigned npkts = 0; + struct user_sdma_txreq *tx = NULL; + struct hfi1_user_sdma_pkt_q *pq = NULL; + struct user_sdma_iovec *iovec = NULL; + + if (!req->pq) + return -EINVAL; + + pq = req->pq; + + /* If tx completion has reported an error, we are done. */ + if (READ_ONCE(req->has_error)) + return -EFAULT; + + /* + * Check if we might have sent the entire request already + */ + if (unlikely(req->seqnum == req->info.npkts)) { + if (!list_empty(&req->txps)) + goto dosend; + return ret; + } + + if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) + maxpkts = req->info.npkts - req->seqnum; + + while (npkts < maxpkts) { + u32 datalen = 0, queued = 0, data_sent = 0; + u64 iov_offset = 0; + + /* + * Check whether any of the completions have come back + * with errors. If so, we are not going to process any + * more packets from this request. + */ + if (READ_ONCE(req->has_error)) + return -EFAULT; + + tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); + if (!tx) + return -ENOMEM; + + tx->flags = 0; + tx->req = req; + tx->busycount = 0; + INIT_LIST_HEAD(&tx->list); + + /* + * For the last packet set the ACK request + * and disable header suppression. + */ + if (req->seqnum == req->info.npkts - 1) + tx->flags |= (TXREQ_FLAGS_REQ_ACK | + TXREQ_FLAGS_REQ_DISABLE_SH); + + /* + * Calculate the payload size - this is min of the fragment + * (MTU) size or the remaining bytes in the request but only + * if we have payload data. 
+ */ + if (req->data_len) { + iovec = &req->iovs[req->iov_idx]; + if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) { + if (++req->iov_idx == req->data_iovs) { + ret = -EFAULT; + goto free_txreq; + } + iovec = &req->iovs[req->iov_idx]; + WARN_ON(iovec->offset); + } + + datalen = compute_data_length(req, tx); + + /* + * Disable header suppression for the payload <= 8DWS. + * If there is an uncorrectable error in the receive + * data FIFO when the received payload size is less than + * or equal to 8DWS then the RxDmaDataFifoRdUncErr is + * not reported.There is set RHF.EccErr if the header + * is not suppressed. + */ + if (!datalen) { + SDMA_DBG(req, + "Request has data but pkt len is 0"); + ret = -EFAULT; + goto free_tx; + } else if (datalen <= 32) { + tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH; + } + } + + if (req->ahg_idx >= 0) { + if (!req->seqnum) { + ret = user_sdma_txadd_ahg(req, tx, datalen); + if (ret) + goto free_tx; + } else { + int changes; + + changes = set_txreq_header_ahg(req, tx, + datalen); + if (changes < 0) + goto free_tx; + } + } else { + ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + + datalen, user_sdma_txreq_cb); + if (ret) + goto free_tx; + /* + * Modify the header for this packet. This only needs + * to be done if we are not going to use AHG. Otherwise, + * the HW will do it based on the changes we gave it + * during sdma_txinit_ahg(). + */ + ret = set_txreq_header(req, tx, datalen); + if (ret) + goto free_txreq; + } + + /* + * If the request contains any data vectors, add up to + * fragsize bytes to the descriptor. + */ + while (queued < datalen && + (req->sent + data_sent) < req->data_len) { + ret = user_sdma_txadd(req, tx, iovec, datalen, + &queued, &data_sent, &iov_offset); + if (ret) + goto free_txreq; + } + /* + * The txreq was submitted successfully so we can update + * the counters. + */ + req->koffset += datalen; + if (req_opcode(req->info.ctrl) == EXPECTED) + req->tidoffset += datalen; + req->sent += data_sent; + if (req->data_len) + iovec->offset += iov_offset; + list_add_tail(&tx->txreq.list, &req->txps); + /* + * It is important to increment this here as it is used to + * generate the BTH.PSN and, therefore, can't be bulk-updated + * outside of the loop. + */ + tx->seqnum = req->seqnum++; + npkts++; + } +dosend: + ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count); + req->seqsubmitted += count; + if (req->seqsubmitted == req->info.npkts) { + WRITE_ONCE(req->done, 1); + /* + * The txreq has already been submitted to the HW queue + * so we can free the AHG entry now. Corruption will not + * happen due to the sequential manner in which + * descriptors are processed. 
+ */ + if (req->ahg_idx >= 0) + sdma_ahg_free(req->sde, req->ahg_idx); + } + return ret; + +free_txreq: + sdma_txclean(pq->dd, &tx->txreq); +free_tx: + kmem_cache_free(pq->txreq_cache, tx); + return ret; +} + +static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) +{ + struct evict_data evict_data; + + evict_data.cleared = 0; + evict_data.target = npages; + hfi1_mmu_rb_evict(pq->handler, &evict_data); + return evict_data.cleared; +} + +static int pin_sdma_pages(struct user_sdma_request *req, + struct user_sdma_iovec *iovec, + struct sdma_mmu_node *node, + int npages) +{ + int pinned, cleared; + struct page **pages; + struct hfi1_user_sdma_pkt_q *pq = req->pq; + + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + memcpy(pages, node->pages, node->npages * sizeof(*pages)); + + npages -= node->npages; +retry: + if (!hfi1_can_pin_pages(pq->dd, pq->mm, + atomic_read(&pq->n_locked), npages)) { + cleared = sdma_cache_evict(pq, npages); + if (cleared >= npages) + goto retry; + } + pinned = hfi1_acquire_user_pages(pq->mm, + ((unsigned long)iovec->iov.iov_base + + (node->npages * PAGE_SIZE)), npages, 0, + pages + node->npages); + if (pinned < 0) { + kfree(pages); + return pinned; + } + if (pinned != npages) { + unpin_vector_pages(pq->mm, pages, node->npages, pinned); + return -EFAULT; + } + kfree(node->pages); + node->rb.len = iovec->iov.iov_len; + node->pages = pages; + atomic_add(pinned, &pq->n_locked); + return pinned; +} + +static void unpin_sdma_pages(struct sdma_mmu_node *node) +{ + if (node->npages) { + unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); + atomic_sub(node->npages, &node->pq->n_locked); + } +} + +static int pin_vector_pages(struct user_sdma_request *req, + struct user_sdma_iovec *iovec) +{ + int ret = 0, pinned, npages; + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct sdma_mmu_node *node = NULL; + struct mmu_rb_node *rb_node; + struct iovec *iov; + bool extracted; + + extracted = + hfi1_mmu_rb_remove_unless_exact(pq->handler, + (unsigned long) + iovec->iov.iov_base, + iovec->iov.iov_len, &rb_node); + if (rb_node) { + node = container_of(rb_node, struct sdma_mmu_node, rb); + if (!extracted) { + atomic_inc(&node->refcount); + iovec->pages = node->pages; + iovec->npages = node->npages; + iovec->node = node; + return 0; + } + } + + if (!node) { + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + node->rb.addr = (unsigned long)iovec->iov.iov_base; + node->pq = pq; + atomic_set(&node->refcount, 0); + } + + iov = &iovec->iov; + npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len); + if (node->npages < npages) { + pinned = pin_sdma_pages(req, iovec, node, npages); + if (pinned < 0) { + ret = pinned; + goto bail; + } + node->npages += pinned; + npages = node->npages; + } + iovec->pages = node->pages; + iovec->npages = npages; + iovec->node = node; + + ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); + if (ret) { + iovec->node = NULL; + goto bail; + } + return 0; +bail: + unpin_sdma_pages(node); + kfree(node); + return ret; +} + +static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, + unsigned start, unsigned npages) +{ + hfi1_release_user_pages(mm, pages + start, npages, false); + kfree(pages); +} + +static int check_header_template(struct user_sdma_request *req, + struct hfi1_pkt_header *hdr, u32 lrhlen, + u32 datalen) +{ + /* + * Perform safety checks for any type of packet: + * - transfer size is multiple of 64bytes + * - packet length is 
multiple of 4 bytes + * - packet length is not larger than MTU size + * + * These checks are only done for the first packet of the + * transfer since the header is "given" to us by user space. + * For the remainder of the packets we compute the values. + */ + if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || + lrhlen > get_lrh_len(*hdr, req->info.fragsize)) + return -EINVAL; + + if (req_opcode(req->info.ctrl) == EXPECTED) { + /* + * The header is checked only on the first packet. Furthermore, + * we ensure that at least one TID entry is copied when the + * request is submitted. Therefore, we don't have to verify that + * tididx points to something sane. + */ + u32 tidval = req->tids[req->tididx], + tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, + tididx = EXP_TID_GET(tidval, IDX), + tidctrl = EXP_TID_GET(tidval, CTRL), + tidoff; + __le32 kval = hdr->kdeth.ver_tid_offset; + + tidoff = KDETH_GET(kval, OFFSET) * + (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? + KDETH_OM_LARGE : KDETH_OM_SMALL); + /* + * Expected receive packets have the following + * additional checks: + * - offset is not larger than the TID size + * - TIDCtrl values match between header and TID array + * - TID indexes match between header and TID array + */ + if ((tidoff + datalen > tidlen) || + KDETH_GET(kval, TIDCTRL) != tidctrl || + KDETH_GET(kval, TID) != tididx) + return -EINVAL; + } + return 0; +} + +/* + * Correctly set the BTH.PSN field based on type of + * transfer - eager packets can just increment the PSN but + * expected packets encode generation and sequence in the + * BTH.PSN field so just incrementing will result in errors. + */ +static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) +{ + u32 val = be32_to_cpu(bthpsn), + mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull : + 0xffffffull), + psn = val & mask; + if (expct) + psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); + else + psn = psn + frags; + return psn & mask; +} + +static int set_txreq_header(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 datalen) +{ + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct hfi1_pkt_header *hdr = &tx->hdr; + u8 omfactor; /* KDETH.OM */ + u16 pbclen; + int ret; + u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); + + /* Copy the header template to the request before modification */ + memcpy(hdr, &req->hdr, sizeof(*hdr)); + + /* + * Check if the PBC and LRH length are mismatched. If so + * adjust both in the header. + */ + pbclen = le16_to_cpu(hdr->pbc[0]); + if (PBC2LRH(pbclen) != lrhlen) { + pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); + hdr->pbc[0] = cpu_to_le16(pbclen); + hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); + /* + * Third packet + * This is the first packet in the sequence that has + * a "static" size that can be used for the rest of + * the packets (besides the last one). + */ + if (unlikely(req->seqnum == 2)) { + /* + * From this point on the lengths in both the + * PBC and LRH are the same until the last + * packet. + * Adjust the template so we don't have to update + * every packet + */ + req->hdr.pbc[0] = hdr->pbc[0]; + req->hdr.lrh[2] = hdr->lrh[2]; + } + } + /* + * We only have to modify the header if this is not the + * first packet in the request. Otherwise, we use the + * header given to us. 
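A minimal sketch of the expected-receive branch of set_pkt_bth_psn() above, assuming the BTH_SEQ_MASK value (0x7ff) defined later in user_sdma.h: only the low sequence bits advance, so the generation bits above them are preserved.

	#include <stdint.h>

	#define BTH_SEQ_MASK 0x7ffull	/* low bits of BTH.PSN carry the sequence */

	/* Hypothetical stand-alone version of the expected-packet case. */
	static uint32_t expected_psn_advance(uint32_t psn, uint32_t frags)
	{
		return (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
	}

	/*
	 * Example: expected_psn_advance(0x123f0, 0x20) == 0x12410 -- the bits
	 * above BTH_SEQ_MASK (0x12000) stay put while the sequence bits advance
	 * modulo the mask; a plain "psn + frags" is only valid for eager packets.
	 */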
+ */ + if (unlikely(!req->seqnum)) { + ret = check_header_template(req, hdr, lrhlen, datalen); + if (ret) + return ret; + goto done; + } + + hdr->bth[2] = cpu_to_be32( + set_pkt_bth_psn(hdr->bth[2], + (req_opcode(req->info.ctrl) == EXPECTED), + req->seqnum)); + + /* Set ACK request on last packet */ + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) + hdr->bth[2] |= cpu_to_be32(1UL << 31); + + /* Set the new offset */ + hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); + /* Expected packets have to fill in the new TID information */ + if (req_opcode(req->info.ctrl) == EXPECTED) { + tidval = req->tids[req->tididx]; + /* + * If the offset puts us at the end of the current TID, + * advance everything. + */ + if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * + PAGE_SIZE)) { + req->tidoffset = 0; + /* + * Since we don't copy all the TIDs, all at once, + * we have to check again. + */ + if (++req->tididx > req->n_tids - 1 || + !req->tids[req->tididx]) { + return -EINVAL; + } + tidval = req->tids[req->tididx]; + } + omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= + KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT : + KDETH_OM_SMALL_SHIFT; + /* Set KDETH.TIDCtrl based on value for this TID. */ + KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, + EXP_TID_GET(tidval, CTRL)); + /* Set KDETH.TID based on value for this TID */ + KDETH_SET(hdr->kdeth.ver_tid_offset, TID, + EXP_TID_GET(tidval, IDX)); + /* Clear KDETH.SH when DISABLE_SH flag is set */ + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) + KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); + /* + * Set the KDETH.OFFSET and KDETH.OM based on size of + * transfer. + */ + trace_hfi1_sdma_user_tid_info( + pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx, + req->tidoffset, req->tidoffset >> omfactor, + omfactor != KDETH_OM_SMALL_SHIFT); + KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, + req->tidoffset >> omfactor); + KDETH_SET(hdr->kdeth.ver_tid_offset, OM, + omfactor != KDETH_OM_SMALL_SHIFT); + } +done: + trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, + req->info.comp_idx, hdr, tidval); + return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); +} + +static int set_txreq_header_ahg(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 datalen) +{ + u32 ahg[AHG_KDETH_ARRAY_SIZE]; + int idx = 0; + u8 omfactor; /* KDETH.OM */ + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct hfi1_pkt_header *hdr = &req->hdr; + u16 pbclen = le16_to_cpu(hdr->pbc[0]); + u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); + size_t array_size = ARRAY_SIZE(ahg); + + if (PBC2LRH(pbclen) != lrhlen) { + /* PBC.PbcLengthDWs */ + idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12, + (__force u16)cpu_to_le16(LRH2PBC(lrhlen))); + if (idx < 0) + return idx; + /* LRH.PktLen (we need the full 16 bits due to byte swap) */ + idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16, + (__force u16)cpu_to_be16(lrhlen >> 2)); + if (idx < 0) + return idx; + } + + /* + * Do the common updates + */ + /* BTH.PSN and BTH.A */ + val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & + (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 
0x7fffffff : 0xffffff); + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) + val32 |= 1UL << 31; + idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16, + (__force u16)cpu_to_be16(val32 >> 16)); + if (idx < 0) + return idx; + idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16, + (__force u16)cpu_to_be16(val32 & 0xffff)); + if (idx < 0) + return idx; + /* KDETH.Offset */ + idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16, + (__force u16)cpu_to_le16(req->koffset & 0xffff)); + if (idx < 0) + return idx; + idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16, + (__force u16)cpu_to_le16(req->koffset >> 16)); + if (idx < 0) + return idx; + if (req_opcode(req->info.ctrl) == EXPECTED) { + __le16 val; + + tidval = req->tids[req->tididx]; + + /* + * If the offset puts us at the end of the current TID, + * advance everything. + */ + if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * + PAGE_SIZE)) { + req->tidoffset = 0; + /* + * Since we don't copy all the TIDs, all at once, + * we have to check again. + */ + if (++req->tididx > req->n_tids - 1 || + !req->tids[req->tididx]) + return -EINVAL; + tidval = req->tids[req->tididx]; + } + omfactor = ((EXP_TID_GET(tidval, LEN) * + PAGE_SIZE) >= + KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT : + KDETH_OM_SMALL_SHIFT; + /* KDETH.OM and KDETH.OFFSET (TID) */ + idx = ahg_header_set( + ahg, idx, array_size, 7, 0, 16, + ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 | + ((req->tidoffset >> omfactor) + & 0x7fff))); + if (idx < 0) + return idx; + /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */ + val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | + (EXP_TID_GET(tidval, IDX) & 0x3ff)); + + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) { + val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, + INTR) << + AHG_KDETH_INTR_SHIFT)); + } else { + val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ? + cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) : + cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, + INTR) << + AHG_KDETH_INTR_SHIFT)); + } + + idx = ahg_header_set(ahg, idx, array_size, + 7, 16, 14, (__force u16)val); + if (idx < 0) + return idx; + } + + trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, + req->info.comp_idx, req->sde->this_idx, + req->ahg_idx, ahg, idx, tidval); + sdma_txinit_ahg(&tx->txreq, + SDMA_TXREQ_F_USE_AHG, + datalen, req->ahg_idx, idx, + ahg, sizeof(req->hdr), + user_sdma_txreq_cb); + + return idx; +} + +/* + * SDMA tx request completion callback. Called when the SDMA progress + * state machine gets notification that the SDMA descriptors for this + * tx request have been processed by the DMA engine. Called in + * interrupt context. 
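The AHG path above can only patch 16 bits of immediate data per descriptor, which is why 32-bit fields such as BTH[2] and KDETH.Offset are written as two updates. A sketch of that split for a big-endian field, with a hypothetical emit callback standing in for ahg_header_set() and byte-swapping elided:

	#include <stdint.h>

	/*
	 * Hypothetical: emit the two 16-bit halves of a 32-bit header dword,
	 * mirroring the paired ahg_header_set(..., dw, 0, 16, ...) and
	 * ahg_header_set(..., dw, 16, 16, ...) calls above.
	 */
	typedef int (*emit16_t)(uint8_t dw, uint8_t bit, uint16_t value);

	static int ahg_update_dword(emit16_t emit, uint8_t dw, uint32_t val32)
	{
		int ret = emit(dw, 0, (uint16_t)(val32 >> 16));		/* upper half */

		if (ret < 0)
			return ret;
		return emit(dw, 16, (uint16_t)(val32 & 0xffff));	/* lower half */
	}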
+ */ +static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) +{ + struct user_sdma_txreq *tx = + container_of(txreq, struct user_sdma_txreq, txreq); + struct user_sdma_request *req; + struct hfi1_user_sdma_pkt_q *pq; + struct hfi1_user_sdma_comp_q *cq; + u16 idx; + + if (!tx->req) + return; + + req = tx->req; + pq = req->pq; + cq = req->cq; + + if (status != SDMA_TXREQ_S_OK) { + SDMA_DBG(req, "SDMA completion with error %d", + status); + WRITE_ONCE(req->has_error, 1); + } + + req->seqcomp = tx->seqnum; + kmem_cache_free(pq->txreq_cache, tx); + tx = NULL; + + idx = req->info.comp_idx; + if (req->status == -1 && status == SDMA_TXREQ_S_OK) { + if (req->seqcomp == req->info.npkts - 1) { + req->status = 0; + user_sdma_free_request(req, false); + pq_update(pq); + set_comp_state(pq, cq, idx, COMPLETE, 0); + } + } else { + if (status != SDMA_TXREQ_S_OK) + req->status = status; + if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) && + (READ_ONCE(req->done) || + READ_ONCE(req->has_error))) { + user_sdma_free_request(req, false); + pq_update(pq); + set_comp_state(pq, cq, idx, ERROR, req->status); + } + } +} + +static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) +{ + if (atomic_dec_and_test(&pq->n_reqs)) { + xchg(&pq->state, SDMA_PKT_Q_INACTIVE); + wake_up(&pq->wait); + } +} + +static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) +{ + int i; + + if (!list_empty(&req->txps)) { + struct sdma_txreq *t, *p; + + list_for_each_entry_safe(t, p, &req->txps, list) { + struct user_sdma_txreq *tx = + container_of(t, struct user_sdma_txreq, txreq); + list_del_init(&t->list); + sdma_txclean(req->pq->dd, t); + kmem_cache_free(req->pq->txreq_cache, tx); + } + } + + for (i = 0; i < req->data_iovs; i++) { + struct sdma_mmu_node *node = req->iovs[i].node; + + if (!node) + continue; + + if (unpin) + hfi1_mmu_rb_remove(req->pq->handler, + &node->rb); + else + atomic_dec(&node->refcount); + } + + kfree(req->tids); + clear_bit(req->info.comp_idx, req->pq->req_in_use); +} + +static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, + struct hfi1_user_sdma_comp_q *cq, + u16 idx, enum hfi1_sdma_comp_state state, + int ret) +{ + if (state == ERROR) + cq->comps[idx].errcode = -ret; + smp_wmb(); /* make sure errcode is visible first */ + cq->comps[idx].status = state; + trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, + idx, state, ret); +} + +static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, + unsigned long len) +{ + return (bool)(node->addr == addr); +} + +static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + + atomic_inc(&node->refcount); + return 0; +} + +/* + * Return 1 to remove the node from the rb tree and call the remove op. + * + * Called with the rb tree lock held. + */ +static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, + void *evict_arg, bool *stop) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + struct evict_data *evict_data = evict_arg; + + /* is this node still being used? */ + if (atomic_read(&node->refcount)) + return 0; /* keep this node */ + + /* this node will be evicted, add its pages to our count */ + evict_data->cleared += node->npages; + + /* have enough pages been cleared? 
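set_comp_state() above stores the errcode before the status, with an smp_wmb() in between, so that a consumer observing the status also observes a valid errcode. A stand-alone user-space analogue of that publication pattern using C11 release/acquire ordering (illustrative only, not the actual completion-queue ABI):

	#include <stdatomic.h>

	struct comp_entry {
		int errcode;		/* payload, written first */
		_Atomic int status;	/* 0 = pending, 1 = done/error */
	};

	static void publish(struct comp_entry *c, int err)
	{
		c->errcode = err;
		atomic_store_explicit(&c->status, 1, memory_order_release);
	}

	static int consume(struct comp_entry *c, int *err)
	{
		if (atomic_load_explicit(&c->status, memory_order_acquire) != 1)
			return 0;	/* not ready yet */
		*err = c->errcode;	/* safe: ordered after the status load */
		return 1;
	}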
*/ + if (evict_data->cleared >= evict_data->target) + *stop = true; + + return 1; /* remove this node */ +} + +static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + + unpin_sdma_pages(node); + kfree(node); +} + +static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + + if (!atomic_read(&node->refcount)) + return 1; + return 0; +} diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h new file mode 100644 index 0000000..a3d1924 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -0,0 +1,264 @@ +#ifndef _HFI1_USER_SDMA_H +#define _HFI1_USER_SDMA_H +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include + +#include "common.h" +#include "iowait.h" +#include "user_exp_rcv.h" + +/* The maximum number of Data io vectors per message/request */ +#define MAX_VECTORS_PER_REQ 8 +/* + * Maximum number of packet to send from each message/request + * before moving to the next one. 
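The evict callback above follows a simple contract: keep nodes that are still referenced, count the pages of the ones it releases, and ask the walker to stop once the caller's target is met. A stand-alone sketch of that loop over a plain array (the real code walks the mmu rb tree and removes nodes as it goes; names here are illustrative):

	#include <stdint.h>

	struct node { int refcount; uint32_t npages; };

	static uint32_t evict_until(struct node *nodes, int n, uint32_t target)
	{
		uint32_t cleared = 0;

		for (int i = 0; i < n && cleared < target; i++) {
			if (nodes[i].refcount)
				continue;	/* node still in use: keep it */
			cleared += nodes[i].npages;
			/* the real code removes the node from the tree here */
		}
		return cleared;
	}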
+ */ +#define MAX_PKTS_PER_QUEUE 16 + +#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT)) + +#define req_opcode(x) \ + (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) +#define req_version(x) \ + (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) +#define req_iovcnt(x) \ + (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK) + +/* Number of BTH.PSN bits used for sequence number in expected rcvs */ +#define BTH_SEQ_MASK 0x7ffull + +#define AHG_KDETH_INTR_SHIFT 12 +#define AHG_KDETH_SH_SHIFT 13 +#define AHG_KDETH_ARRAY_SIZE 9 + +#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4) +#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff) + +/** + * Build an SDMA AHG header update descriptor and save it to an array. + * @arr - Array to save the descriptor to. + * @idx - Index of the array at which the descriptor will be saved. + * @array_size - Size of the array arr. + * @dw - Update index into the header in DWs. + * @bit - Start bit. + * @width - Field width. + * @value - 16 bits of immediate data to write into the field. + * Returns -ERANGE if idx is invalid. If successful, returns the next index + * (idx + 1) of the array to be used for the next descriptor. + */ +static inline int ahg_header_set(u32 *arr, int idx, size_t array_size, + u8 dw, u8 bit, u8 width, u16 value) +{ + if ((size_t)idx >= array_size) + return -ERANGE; + arr[idx++] = sdma_build_ahg_descriptor(value, dw, bit, width); + return idx; +} + +/* Tx request flag bits */ +#define TXREQ_FLAGS_REQ_ACK BIT(0) /* Set the ACK bit in the header */ +#define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */ + +#define SDMA_PKT_Q_INACTIVE BIT(0) +#define SDMA_PKT_Q_ACTIVE BIT(1) +#define SDMA_PKT_Q_DEFERRED BIT(2) + +/* + * Maximum retry attempts to submit a TX request + * before putting the process to sleep. + */ +#define MAX_DEFER_RETRY_COUNT 1 + +#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ + +#define SDMA_DBG(req, fmt, ...) \ + hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \ + (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \ + ##__VA_ARGS__) + +extern uint extended_psn; + +struct hfi1_user_sdma_pkt_q { + u16 ctxt; + u16 subctxt; + u16 n_max_reqs; + atomic_t n_reqs; + u16 reqidx; + struct hfi1_devdata *dd; + struct kmem_cache *txreq_cache; + struct user_sdma_request *reqs; + unsigned long *req_in_use; + struct iowait busy; + unsigned state; + wait_queue_head_t wait; + unsigned long unpinned; + struct mmu_rb_handler *handler; + atomic_t n_locked; + struct mm_struct *mm; +}; + +struct hfi1_user_sdma_comp_q { + u16 nentries; + struct hfi1_sdma_comp_entry *comps; +}; + +struct sdma_mmu_node { + struct mmu_rb_node rb; + struct hfi1_user_sdma_pkt_q *pq; + atomic_t refcount; + struct page **pages; + unsigned int npages; +}; + +struct user_sdma_iovec { + struct list_head list; + struct iovec iov; + /* number of pages in this vector */ + unsigned int npages; + /* array of pinned pages for this vector */ + struct page **pages; + /* + * offset into the virtual address space of the vector at + * which we last left off. 
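PBC2LRH()/LRH2PBC() above convert between the PBC length field (in dwords) and the LRH packet length (in bytes); the two differ by exactly one dword, so they round-trip. A quick self-check, restating the macros so the snippet stands alone:

	#include <assert.h>
	#include <stdint.h>

	#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)	/* PBC DWs -> LRH bytes */
	#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)	/* LRH bytes -> PBC DWs */

	int main(void)
	{
		uint16_t lrh_bytes = 0x48;	/* example packet length in bytes */

		/* LRH2PBC(0x48) == 0x13, PBC2LRH(0x13) == 0x48 */
		assert(PBC2LRH(LRH2PBC(lrh_bytes)) == lrh_bytes);
		return 0;
	}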
+ */ + u64 offset; + struct sdma_mmu_node *node; +}; + +/* evict operation argument */ +struct evict_data { + u32 cleared; /* count evicted so far */ + u32 target; /* target count to evict */ +}; + +struct user_sdma_request { + /* This is the original header from user space */ + struct hfi1_pkt_header hdr; + + /* Read mostly fields */ + struct hfi1_user_sdma_pkt_q *pq ____cacheline_aligned_in_smp; + struct hfi1_user_sdma_comp_q *cq; + /* + * Pointer to the SDMA engine for this request. + * Since different request could be on different VLs, + * each request will need it's own engine pointer. + */ + struct sdma_engine *sde; + struct sdma_req_info info; + /* TID array values copied from the tid_iov vector */ + u32 *tids; + /* total length of the data in the request */ + u32 data_len; + /* number of elements copied to the tids array */ + u16 n_tids; + /* + * We copy the iovs for this request (based on + * info.iovcnt). These are only the data vectors + */ + u8 data_iovs; + s8 ahg_idx; + + /* Writeable fields shared with interrupt */ + u64 seqcomp ____cacheline_aligned_in_smp; + u64 seqsubmitted; + /* status of the last txreq completed */ + int status; + + /* Send side fields */ + struct list_head txps ____cacheline_aligned_in_smp; + u64 seqnum; + /* + * KDETH.OFFSET (TID) field + * The offset can cover multiple packets, depending on the + * size of the TID entry. + */ + u32 tidoffset; + /* + * KDETH.Offset (Eager) field + * We need to remember the initial value so the headers + * can be updated properly. + */ + u32 koffset; + u32 sent; + /* TID index copied from the tid_iov vector */ + u16 tididx; + /* progress index moving along the iovs array */ + u8 iov_idx; + u8 done; + u8 has_error; + + struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; +} ____cacheline_aligned_in_smp; + +/* + * A single txreq could span up to 3 physical pages when the MTU + * is sufficiently large (> 4K). Each of the IOV pointers also + * needs it's own set of flags so the vector has been handled + * independently of each other. + */ +struct user_sdma_txreq { + /* Packet header for the txreq */ + struct hfi1_pkt_header hdr; + struct sdma_txreq txreq; + struct list_head list; + struct user_sdma_request *req; + u16 flags; + unsigned int busycount; + u64 seqnum; +}; + +int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, + struct hfi1_filedata *fd); +int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, + struct hfi1_ctxtdata *uctxt); +int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, + struct iovec *iovec, unsigned long dim, + unsigned long *count); + +#endif /* _HFI1_USER_SDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c new file mode 100644 index 0000000..c8cf4d4 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -0,0 +1,2042 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "common.h" +#include "device.h" +#include "trace.h" +#include "qp.h" +#include "verbs_txreq.h" +#include "debugfs.h" +#include "vnic.h" + +static unsigned int hfi1_lkey_table_size = 16; +module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int hfi1_max_pds = 0xFFFF; +module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int hfi1_max_ahs = 0xFFFF; +module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int hfi1_max_cqes = 0x2FFFFF; +module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int hfi1_max_cqs = 0x1FFFF; +module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int hfi1_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int hfi1_max_qps = 32768; +module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int hfi1_max_sges = 0x60; +module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int hfi1_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int hfi1_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, 
hfi1_max_mcast_qp_attached, + uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int hfi1_max_srqs = 1024; +module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int hfi1_max_srq_sges = 128; +module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int hfi1_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +unsigned short piothreshold = 256; +module_param(piothreshold, ushort, S_IRUGO); +MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); + +#define COPY_CACHELESS 1 +#define COPY_ADAPTIVE 2 +static unsigned int sge_copy_mode; +module_param(sge_copy_mode, uint, S_IRUGO); +MODULE_PARM_DESC(sge_copy_mode, + "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS"); + +static void verbs_sdma_complete( + struct sdma_txreq *cookie, + int status); + +static int pio_wait(struct rvt_qp *qp, + struct send_context *sc, + struct hfi1_pkt_state *ps, + u32 flag); + +/* Length of buffer to create verbs txreq cache name */ +#define TXREQ_NAME_LEN 24 + +/* 16B trailing buffer */ +static const u8 trail_buf[MAX_16B_PADDING]; + +static uint wss_threshold; +module_param(wss_threshold, uint, S_IRUGO); +MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); +static uint wss_clean_period = 256; +module_param(wss_clean_period, uint, S_IRUGO); +MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); + +/* memory working set size */ +struct hfi1_wss { + unsigned long *entries; + atomic_t total_count; + atomic_t clean_counter; + atomic_t clean_entry; + + int threshold; + int num_entries; + long pages_mask; +}; + +static struct hfi1_wss wss; + +int hfi1_wss_init(void) +{ + long llc_size; + long llc_bits; + long table_size; + long table_bits; + + /* check for a valid percent range - default to 80 if none or invalid */ + if (wss_threshold < 1 || wss_threshold > 100) + wss_threshold = 80; + /* reject a wildly large period */ + if (wss_clean_period > 1000000) + wss_clean_period = 256; + /* reject a zero period */ + if (wss_clean_period == 0) + wss_clean_period = 1; + + /* + * Calculate the table size - the next power of 2 larger than the + * LLC size. LLC size is in KiB. + */ + llc_size = wss_llc_size() * 1024; + table_size = roundup_pow_of_two(llc_size); + + /* one bit per page in rounded up table */ + llc_bits = llc_size / PAGE_SIZE; + table_bits = table_size / PAGE_SIZE; + wss.pages_mask = table_bits - 1; + wss.num_entries = table_bits / BITS_PER_LONG; + + wss.threshold = (llc_bits * wss_threshold) / 100; + if (wss.threshold == 0) + wss.threshold = 1; + + atomic_set(&wss.clean_counter, wss_clean_period); + + wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), + GFP_KERNEL); + if (!wss.entries) { + hfi1_wss_exit(); + return -ENOMEM; + } + + return 0; +} + +void hfi1_wss_exit(void) +{ + /* coded to handle partially initialized and repeat callers */ + kfree(wss.entries); + wss.entries = NULL; +} + +/* + * Advance the clean counter. When the clean period has expired, + * clean an entry. + * + * This is implemented in atomics to avoid locking. 
Because multiple + * variables are involved, it can be racy which can lead to slightly + * inaccurate information. Since this is only a heuristic, this is + * OK. Any innaccuracies will clean themselves out as the counter + * advances. That said, it is unlikely the entry clean operation will + * race - the next possible racer will not start until the next clean + * period. + * + * The clean counter is implemented as a decrement to zero. When zero + * is reached an entry is cleaned. + */ +static void wss_advance_clean_counter(void) +{ + int entry; + int weight; + unsigned long bits; + + /* become the cleaner if we decrement the counter to zero */ + if (atomic_dec_and_test(&wss.clean_counter)) { + /* + * Set, not add, the clean period. This avoids an issue + * where the counter could decrement below the clean period. + * Doing a set can result in lost decrements, slowing the + * clean advance. Since this a heuristic, this possible + * slowdown is OK. + * + * An alternative is to loop, advancing the counter by a + * clean period until the result is > 0. However, this could + * lead to several threads keeping another in the clean loop. + * This could be mitigated by limiting the number of times + * we stay in the loop. + */ + atomic_set(&wss.clean_counter, wss_clean_period); + + /* + * Uniquely grab the entry to clean and move to next. + * The current entry is always the lower bits of + * wss.clean_entry. The table size, wss.num_entries, + * is always a power-of-2. + */ + entry = (atomic_inc_return(&wss.clean_entry) - 1) + & (wss.num_entries - 1); + + /* clear the entry and count the bits */ + bits = xchg(&wss.entries[entry], 0); + weight = hweight64((u64)bits); + /* only adjust the contended total count if needed */ + if (weight) + atomic_sub(weight, &wss.total_count); + } +} + +/* + * Insert the given address into the working set array. + */ +static void wss_insert(void *address) +{ + u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; + u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ + u32 nr = page & (BITS_PER_LONG - 1); + + if (!test_and_set_bit(nr, &wss.entries[entry])) + atomic_inc(&wss.total_count); + + wss_advance_clean_counter(); +} + +/* + * Is the working set larger than the threshold? + */ +static inline bool wss_exceeds_threshold(void) +{ + return atomic_read(&wss.total_count) >= wss.threshold; +} + +/* + * Translate ib_wr_opcode into ib_wc_opcode. 
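wss_insert() above folds the page address into a power-of-two bitmap: one bit per page, addressed by a long-sized entry index plus a bit offset within that entry. A stand-alone sketch of the index math (page size and word width assumed for illustration):

	#include <stdint.h>

	#define PAGE_SHIFT	12	/* assumed 4 KiB pages */
	#define BITS_PER_LONG	64	/* assumed 64-bit longs */

	struct wss_map {
		unsigned long *entries;
		unsigned long pages_mask;	/* table pages - 1, power of two */
	};

	static void wss_locate(const struct wss_map *w, const void *addr,
			       uint32_t *entry, uint32_t *nr)
	{
		uint32_t page = ((uintptr_t)addr >> PAGE_SHIFT) & w->pages_mask;

		*entry = page / BITS_PER_LONG;		/* which long in the table */
		*nr    = page & (BITS_PER_LONG - 1);	/* which bit in that long */
	}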
+ */ +const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD, + [IB_WR_SEND_WITH_INV] = IB_WC_SEND, + [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV, + [IB_WR_REG_MR] = IB_WC_REG_MR +}; + +/* + * Length of header by opcode, 0 --> not supported + */ +const u8 hdr_len_by_opcode[256] = { + /* RC */ + [IB_OPCODE_RC_SEND_FIRST] = 12 + 8, + [IB_OPCODE_RC_SEND_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_SEND_LAST] = 12 + 8, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_SEND_ONLY] = 12 + 8, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = 12 + 8, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = 12 + 8 + 4, + [IB_OPCODE_RC_ACKNOWLEDGE] = 12 + 8 + 4, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = 12 + 8 + 4 + 8, + [IB_OPCODE_RC_COMPARE_SWAP] = 12 + 8 + 28, + [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, + [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, + [IB_OPCODE_UC_SEND_LAST] = 12 + 8, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_SEND_ONLY] = 12 + 8, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = 12 + 8 + 16, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = 12 + 8, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = 12 + 8, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = 12 + 8 + 16, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = 12 + 8 + 8, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 12 +}; + +static const opcode_handler opcode_handler_tbl[256] = { + /* RC */ + [IB_OPCODE_RC_SEND_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_ACKNOWLEDGE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv, + [IB_OPCODE_RC_FETCH_ADD] = 
&hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_LAST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_ONLY] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = &hfi1_ud_rcv, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_ud_rcv, + /* CNP */ + [IB_OPCODE_CNP] = &hfi1_cnp_rcv +}; + +#define OPMASK 0x1f + +static const u32 pio_opmask[BIT(3)] = { + /* RC */ + [IB_OPCODE_RC >> 5] = + BIT(RC_OP(SEND_ONLY) & OPMASK) | + BIT(RC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) | + BIT(RC_OP(RDMA_WRITE_ONLY) & OPMASK) | + BIT(RC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK) | + BIT(RC_OP(RDMA_READ_REQUEST) & OPMASK) | + BIT(RC_OP(ACKNOWLEDGE) & OPMASK) | + BIT(RC_OP(ATOMIC_ACKNOWLEDGE) & OPMASK) | + BIT(RC_OP(COMPARE_SWAP) & OPMASK) | + BIT(RC_OP(FETCH_ADD) & OPMASK), + /* UC */ + [IB_OPCODE_UC >> 5] = + BIT(UC_OP(SEND_ONLY) & OPMASK) | + BIT(UC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) | + BIT(UC_OP(RDMA_WRITE_ONLY) & OPMASK) | + BIT(UC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK), +}; + +/* + * System image GUID. + */ +__be64 ib_hfi1_sys_image_guid; + +/** + * hfi1_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + * @release: boolean to release MR + * @copy_last: do a separate copy of the last 8 bytes + */ +void hfi1_copy_sge( + struct rvt_sge_state *ss, + void *data, u32 length, + bool release, + bool copy_last) +{ + struct rvt_sge *sge = &ss->sge; + int i; + bool in_last = false; + bool cacheless_copy = false; + + if (sge_copy_mode == COPY_CACHELESS) { + cacheless_copy = length >= PAGE_SIZE; + } else if (sge_copy_mode == COPY_ADAPTIVE) { + if (length >= PAGE_SIZE) { + /* + * NOTE: this *assumes*: + * o The first vaddr is the dest. + * o If multiple pages, then vaddr is sequential. + */ + wss_insert(sge->vaddr); + if (length >= (2 * PAGE_SIZE)) + wss_insert(sge->vaddr + PAGE_SIZE); + + cacheless_copy = wss_exceeds_threshold(); + } else { + wss_advance_clean_counter(); + } + } + if (copy_last) { + if (length > 8) { + length -= 8; + } else { + copy_last = false; + in_last = true; + } + } + +again: + while (length) { + u32 len = rvt_get_sge_length(sge, length); + + WARN_ON_ONCE(len == 0); + if (unlikely(in_last)) { + /* enforce byte transfer ordering */ + for (i = 0; i < len; i++) + ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; + } else if (cacheless_copy) { + cacheless_memcpy(sge->vaddr, data, len); + } else { + memcpy(sge->vaddr, data, len); + } + rvt_update_sge(ss, len, release); + data += len; + length -= len; + } + + if (copy_last) { + copy_last = false; + in_last = true; + length = 8; + goto again; + } +} + +/* + * Make sure the QP is ready and able to accept the given opcode. 
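hfi1_copy_sge() above peels off the final 8 bytes when copy_last is set and writes them byte by byte so they land last and in order. A simplified stand-alone sketch of that split; the real code additionally folds in the cacheless-copy heuristic and walks the SGE list:

	#include <stddef.h>
	#include <string.h>

	static void copy_with_ordered_tail(void *dst, const void *src, size_t len)
	{
		const size_t tail = len > 8 ? 8 : len;	/* at most the last 8 bytes */
		const size_t head = len - tail;

		memcpy(dst, src, head);			/* bulk of the payload */
		for (size_t i = 0; i < tail; i++)	/* ordered tail, byte by byte */
			((char *)dst)[head + i] = ((const char *)src)[head + i];
	}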
+ */ +static inline opcode_handler qp_ok(struct hfi1_packet *packet) +{ + if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) + return NULL; + if (((packet->opcode & RVT_OPCODE_QP_MASK) == + packet->qp->allowed_ops) || + (packet->opcode == IB_OPCODE_CNP)) + return opcode_handler_tbl[packet->opcode]; + + return NULL; +} + +static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) +{ +#ifdef CONFIG_FAULT_INJECTION + if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) + /* + * In order to drop non-IB traffic we + * set PbcInsertHrc to NONE (0x2). + * The packet will still be delivered + * to the receiving node but a + * KHdrHCRCErr (KDETH packet with a bad + * HCRC) will be triggered and the + * packet will not be delivered to the + * correct context. + */ + pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT; + else + /* + * In order to drop regular verbs + * traffic we set the PbcTestEbp + * flag. The packet will still be + * delivered to the receiving node but + * a 'late ebp error' will be + * triggered and will be dropped. + */ + pbc |= PBC_TEST_EBP; +#endif + return pbc; +} + +static int hfi1_do_pkey_check(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_16b_header *hdr = packet->hdr; + u16 pkey; + + /* Pkey check needed only for bypass packets */ + if (packet->etype != RHF_RCV_TYPE_BYPASS) + return 0; + + /* Perform pkey check */ + pkey = hfi1_16B_get_pkey(hdr); + return ingress_pkey_check(ppd, pkey, packet->sc, + packet->qp->s_pkey_index, + packet->slid, true); +} + +static inline void hfi1_handle_packet(struct hfi1_packet *packet, + bool is_mcast) +{ + u32 qp_num; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = rcd_to_iport(rcd); + struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + opcode_handler packet_handler; + unsigned long flags; + + inc_opstats(packet->tlen, &rcd->opstats->stats[packet->opcode]); + + if (unlikely(is_mcast)) { + struct rvt_mcast *mcast; + struct rvt_mcast_qp *p; + + if (!packet->grh) + goto drop; + mcast = rvt_mcast_find(&ibp->rvp, + &packet->grh->dgid, + opa_get_lid(packet->dlid, 9B)); + if (!mcast) + goto drop; + list_for_each_entry_rcu(p, &mcast->qp_list, list) { + packet->qp = p->qp; + if (hfi1_do_pkey_check(packet)) + goto drop; + spin_lock_irqsave(&packet->qp->r_lock, flags); + packet_handler = qp_ok(packet); + if (likely(packet_handler)) + packet_handler(packet); + else + ibp->rvp.n_pkt_drops++; + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + } + /* + * Notify rvt_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + /* Get the destination QP number. 
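qp_ok() above accepts a packet only when the transport bits of the opcode match the QP's allowed_ops, with CNP as the single exception. A sketch of just that test; the mask and CNP values below are assumptions for illustration, the real constants come from the rdmavt/hfi1 headers:

	#include <stdbool.h>
	#include <stdint.h>

	#define RVT_OPCODE_QP_MASK	0xe0	/* assumed: transport bits 7..5 */
	#define IB_OPCODE_CNP		0x81	/* assumed value, illustration only */

	static bool opcode_allowed(uint8_t opcode, uint8_t allowed_ops)
	{
		return (opcode & RVT_OPCODE_QP_MASK) == allowed_ops ||
		       opcode == IB_OPCODE_CNP;
	}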
*/ + qp_num = ib_bth_get_qpn(packet->ohdr); + rcu_read_lock(); + packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!packet->qp) + goto unlock_drop; + + if (hfi1_do_pkey_check(packet)) + goto unlock_drop; + + if (unlikely(hfi1_dbg_fault_opcode(packet->qp, packet->opcode, + true))) + goto unlock_drop; + + spin_lock_irqsave(&packet->qp->r_lock, flags); + packet_handler = qp_ok(packet); + if (likely(packet_handler)) + packet_handler(packet); + else + ibp->rvp.n_pkt_drops++; + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + rcu_read_unlock(); + } + return; +unlock_drop: + rcu_read_unlock(); +drop: + ibp->rvp.n_pkt_drops++; +} + +/** + * hfi1_ib_rcv - process an incoming packet + * @packet: data packet information + * + * This is called to process an incoming packet at interrupt level. + */ +void hfi1_ib_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); +} + +void hfi1_16B_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + trace_input_ibhdr(rcd->dd, packet, false); + hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); +} + +/* + * This is called from a timer to check for QPs + * which need kernel memory in order to send a packet. + */ +static void mem_timer(struct timer_list *t) +{ + struct hfi1_ibdev *dev = from_timer(dev, t, mem_timer); + struct list_head *list = &dev->memwait; + struct rvt_qp *qp = NULL; + struct iowait *wait; + unsigned long flags; + struct hfi1_qp_priv *priv; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + if (!list_empty(list)) { + wait = list_first_entry(list, struct iowait, list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + priv->s_iowait.lock = NULL; + /* refcount held until actual wake up */ + if (!list_empty(list)) + mod_timer(&dev->mem_timer, jiffies + 1); + } + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + + if (qp) + hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM); +} + +/* + * This is called with progress side lock held. + */ +/* New API */ +static void verbs_sdma_complete( + struct sdma_txreq *cookie, + int status) +{ + struct verbs_txreq *tx = + container_of(cookie, struct verbs_txreq, txreq); + struct rvt_qp *qp = tx->qp; + + spin_lock(&qp->s_lock); + if (tx->wqe) { + hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + struct hfi1_opa_header *hdr; + + hdr = &tx->phdr.hdr; + hfi1_rc_send_complete(qp, hdr); + } + spin_unlock(&qp->s_lock); + + hfi1_put_txreq(tx); +} + +static int wait_kmem(struct hfi1_ibdev *dev, + struct rvt_qp *qp, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + write_seqlock(&dev->iowait_lock); + list_add_tail(&ps->s_txreq->txreq.list, + &priv->s_iowait.tx_head); + if (list_empty(&priv->s_iowait.list)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= RVT_S_WAIT_KMEM; + list_add_tail(&priv->s_iowait.list, &dev->memwait); + priv->s_iowait.lock = &dev->iowait_lock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); + rvt_get_qp(qp); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~RVT_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + return ret; +} + +/* + * This routine calls txadds for each sg entry. 
+ * + * Add failures will revert the sge cursor + */ +static noinline int build_verbs_ulp_payload( + struct sdma_engine *sde, + u32 length, + struct verbs_txreq *tx) +{ + struct rvt_sge_state *ss = tx->ss; + struct rvt_sge *sg_list = ss->sg_list; + struct rvt_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 len; + int ret = 0; + + while (length) { + len = ss->sge.length; + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + WARN_ON_ONCE(len == 0); + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + ss->sge.vaddr, + len); + if (ret) + goto bail_txadd; + rvt_update_sge(ss, len, false); + length -= len; + } + return ret; +bail_txadd: + /* unwind cursor */ + ss->sge = sge; + ss->num_sge = num_sge; + ss->sg_list = sg_list; + return ret; +} + +/** + * update_tx_opstats - record stats by opcode + * @qp; the qp + * @ps: transmit packet state + * @plen: the plen in dwords + * + * This is a routine to record the tx opstats after a + * packet has been presented to the egress mechanism. + */ +static void update_tx_opstats(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u32 plen) +{ +#ifdef CONFIG_DEBUG_FS + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_opcode_stats_perctx *s = get_cpu_ptr(dd->tx_opstats); + + inc_opstats(plen * 4, &s->stats[ps->opcode]); + put_cpu_ptr(s); +#endif +} + +/* + * Build the number of DMA descriptors needed to send length bytes of data. + * + * NOTE: DMA mapping is held in the tx until completed in the ring or + * the tx desc is freed without having been submitted to the ring + * + * This routine ensures all the helper routine calls succeed. + */ +/* New API */ +static int build_verbs_tx_desc( + struct sdma_engine *sde, + u32 length, + struct verbs_txreq *tx, + struct hfi1_ahg_info *ahg_info, + u64 pbc) +{ + int ret = 0; + struct hfi1_sdma_header *phdr = &tx->phdr; + u16 hdrbytes = (tx->hdr_dwords + sizeof(pbc) / 4) << 2; + u8 extra_bytes = 0; + + if (tx->phdr.hdr.hdr_type) { + /* + * hdrbytes accounts for PBC. Need to subtract 8 bytes + * before calculating padding. + */ + extra_bytes = hfi1_get_16b_padding(hdrbytes - 8, length) + + (SIZE_OF_CRC << 2) + SIZE_OF_LT; + } + if (!ahg_info->ahgcount) { + ret = sdma_txinit_ahg( + &tx->txreq, + ahg_info->tx_flags, + hdrbytes + length + + extra_bytes, + ahg_info->ahgidx, + 0, + NULL, + 0, + verbs_sdma_complete); + if (ret) + goto bail_txadd; + phdr->pbc = cpu_to_le64(pbc); + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + phdr, + hdrbytes); + if (ret) + goto bail_txadd; + } else { + ret = sdma_txinit_ahg( + &tx->txreq, + ahg_info->tx_flags, + length, + ahg_info->ahgidx, + ahg_info->ahgcount, + ahg_info->ahgdesc, + hdrbytes, + verbs_sdma_complete); + if (ret) + goto bail_txadd; + } + /* add the ulp payload - if any. 
tx->ss can be NULL for acks */ + if (tx->ss) { + ret = build_verbs_ulp_payload(sde, length, tx); + if (ret) + goto bail_txadd; + } + + /* add icrc, lt byte, and padding to flit */ + if (extra_bytes) + ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq, + (void *)trail_buf, extra_bytes); + +bail_txadd: + return ret; +} + +int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ahg_info *ahg_info = priv->s_ahg; + u32 hdrwords = ps->s_txreq->hdr_dwords; + u32 len = ps->s_txreq->s_cur_size; + u32 plen; + struct hfi1_ibdev *dev = ps->dev; + struct hfi1_pportdata *ppd = ps->ppd; + struct verbs_txreq *tx; + u8 sc5 = priv->s_sc; + int ret; + u32 dwords; + + if (ps->s_txreq->phdr.hdr.hdr_type) { + u8 extra_bytes = hfi1_get_16b_padding((hdrwords << 2), len); + + dwords = (len + extra_bytes + (SIZE_OF_CRC << 2) + + SIZE_OF_LT) >> 2; + } else { + dwords = (len + 3) >> 2; + } + plen = hdrwords + dwords + sizeof(pbc) / 4; + + tx = ps->s_txreq; + if (!sdma_txreq_built(&tx->txreq)) { + if (likely(pbc == 0)) { + u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + + /* No vl15 here */ + /* set PBC_DC_INFO bit (aka SC[4]) in pbc */ + if (ps->s_txreq->phdr.hdr.hdr_type) + pbc |= PBC_PACKET_BYPASS | + PBC_INSERT_BYPASS_ICRC; + else + pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); + + if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, + false))) + pbc = hfi1_fault_tx(qp, ps->opcode, pbc); + pbc = create_pbc(ppd, + pbc, + qp->srate_mbps, + vl, + plen); + } + tx->wqe = qp->s_wqe; + ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc); + if (unlikely(ret)) + goto bail_build; + } + ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq, + ps->pkts_sent); + if (unlikely(ret < 0)) { + if (ret == -ECOMM) + goto bail_ecomm; + return ret; + } + + update_tx_opstats(qp, ps, plen); + trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5)); + return ret; + +bail_ecomm: + /* The current one got "sent" */ + return 0; +bail_build: + ret = wait_kmem(dev, qp, ps); + if (!ret) { + /* free txreq - bad state */ + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + } + return ret; +} + +/* + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int pio_wait(struct rvt_qp *qp, + struct send_context *sc, + struct hfi1_pkt_state *ps, + u32 flag) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_devdata *dd = sc->dd; + struct hfi1_ibdev *dev = &dd->verbs_dev; + unsigned long flags; + int ret = 0; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, sc_piobufavail() + * could be called. Therefore, put QP on the I/O wait list before + * enabling the PIO avail interrupt. 
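For the 9B case in hfi1_verbs_send_dma() above, the payload is rounded up to whole dwords and the 2-dword PBC is added on top of the header dwords to form plen. A worked restatement of that arithmetic (the 16B padding/CRC/LT handling is omitted):

	#include <stdint.h>

	static uint32_t pkt_plen_9b(uint32_t hdrwords, uint32_t payload_bytes)
	{
		uint32_t dwords = (payload_bytes + 3) >> 2;	/* round up to DWs */

		return hdrwords + dwords + 2;			/* + sizeof(pbc) / 4 */
	}

	/* e.g. hdrwords = 7, payload_bytes = 9  ->  7 + 3 + 2 = 12 dwords */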
+ */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + write_seqlock(&dev->iowait_lock); + list_add_tail(&ps->s_txreq->txreq.list, + &priv->s_iowait.tx_head); + if (list_empty(&priv->s_iowait.list)) { + struct hfi1_ibdev *dev = &dd->verbs_dev; + int was_empty; + + dev->n_piowait += !!(flag & RVT_S_WAIT_PIO); + dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN); + qp->s_flags |= flag; + was_empty = list_empty(&sc->piowait); + iowait_queue(ps->pkts_sent, &priv->s_iowait, + &sc->piowait); + priv->s_iowait.lock = &dev->iowait_lock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO); + rvt_get_qp(qp); + /* counting: only call wantpiobuf_intr if first user */ + if (was_empty) + hfi1_sc_wantpiobuf_intr(sc, 1); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~RVT_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +static void verbs_pio_complete(void *arg, int code) +{ + struct rvt_qp *qp = (struct rvt_qp *)arg; + struct hfi1_qp_priv *priv = qp->priv; + + if (iowait_pio_dec(&priv->s_iowait)) + iowait_drain_wakeup(&priv->s_iowait); +} + +int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc) +{ + struct hfi1_qp_priv *priv = qp->priv; + u32 hdrwords = ps->s_txreq->hdr_dwords; + struct rvt_sge_state *ss = ps->s_txreq->ss; + u32 len = ps->s_txreq->s_cur_size; + u32 dwords; + u32 plen; + struct hfi1_pportdata *ppd = ps->ppd; + u32 *hdr; + u8 sc5; + unsigned long flags = 0; + struct send_context *sc; + struct pio_buf *pbuf; + int wc_status = IB_WC_SUCCESS; + int ret = 0; + pio_release_cb cb = NULL; + u8 extra_bytes = 0; + + if (ps->s_txreq->phdr.hdr.hdr_type) { + u8 pad_size = hfi1_get_16b_padding((hdrwords << 2), len); + + extra_bytes = pad_size + (SIZE_OF_CRC << 2) + SIZE_OF_LT; + dwords = (len + extra_bytes) >> 2; + hdr = (u32 *)&ps->s_txreq->phdr.hdr.opah; + } else { + dwords = (len + 3) >> 2; + hdr = (u32 *)&ps->s_txreq->phdr.hdr.ibh; + } + plen = hdrwords + dwords + sizeof(pbc) / 4; + + /* only RC/UC use complete */ + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + cb = verbs_pio_complete; + break; + default: + break; + } + + /* vl15 special case taken care of in ud.c */ + sc5 = priv->s_sc; + sc = ps->s_txreq->psc; + + if (likely(pbc == 0)) { + u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + + /* set PBC_DC_INFO bit (aka SC[4]) in pbc */ + if (ps->s_txreq->phdr.hdr.hdr_type) + pbc |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + else + pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); + if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, false))) + pbc = hfi1_fault_tx(qp, ps->opcode, pbc); + pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); + } + if (cb) + iowait_pio_inc(&priv->s_iowait); + pbuf = sc_buffer_alloc(sc, plen, cb, qp); + if (unlikely(!pbuf)) { + if (cb) + verbs_pio_complete(qp, 0); + if (ppd->host_link_state != HLS_UP_ACTIVE) { + /* + * If we have filled the PIO buffers to capacity and are + * not in an active state this request is not going to + * go out to so just complete it with an error or else a + * ULP or the core may be stuck waiting. + */ + hfi1_cdbg( + PIO, + "alloc failed. state not active, completing"); + wc_status = IB_WC_GENERAL_ERR; + goto pio_bail; + } else { + /* + * This is a normal occurrence. The PIO buffs are full + * up but we are still happily sending, well we could be + * so lets continue to queue the request. + */ + hfi1_cdbg(PIO, "alloc failed. 
state active, queuing"); + ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO); + if (!ret) + /* txreq not queued - free */ + goto bail; + /* tx consumed in wait */ + return ret; + } + } + + if (dwords == 0) { + pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords); + } else { + seg_pio_copy_start(pbuf, pbc, + hdr, hdrwords * 4); + if (ss) { + while (len) { + void *addr = ss->sge.vaddr; + u32 slen = ss->sge.length; + + if (slen > len) + slen = len; + rvt_update_sge(ss, slen, false); + seg_pio_copy_mid(pbuf, addr, slen); + len -= slen; + } + } + /* add icrc, lt byte, and padding to flit */ + if (extra_bytes) + seg_pio_copy_mid(pbuf, trail_buf, extra_bytes); + + seg_pio_copy_end(pbuf); + } + + update_tx_opstats(qp, ps, plen); + trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5)); + +pio_bail: + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_send_complete(qp, qp->s_wqe, wc_status); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + + ret = 0; + +bail: + hfi1_put_txreq(ps->s_txreq); + return ret; +} + +/* + * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent + * being an entry from the partition key table), return 0 + * otherwise. Use the matching criteria for egress partition keys + * specified in the OPAv1 spec., section 9.1l.7. + */ +static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) +{ + u16 mkey = pkey & PKEY_LOW_15_MASK; + u16 mentry = ent & PKEY_LOW_15_MASK; + + if (mkey == mentry) { + /* + * If pkey[15] is set (full partition member), + * is bit 15 in the corresponding table element + * clear (limited member)? + */ + if (pkey & PKEY_MEMBER_MASK) + return !!(ent & PKEY_MEMBER_MASK); + return 1; + } + return 0; +} + +/** + * egress_pkey_check - check P_KEY of a packet + * @ppd: Physical IB port data + * @slid: SLID for packet + * @bkey: PKEY for header + * @sc5: SC for packet + * @s_pkey_index: It will be used for look up optimization for kernel contexts + * only. If it is negative value, then it means user contexts is calling this + * function. + * + * It checks if hdr's pkey is valid. + * + * Return: 0 on success, otherwise, 1 + */ +int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey, + u8 sc5, int8_t s_pkey_index) +{ + struct hfi1_devdata *dd; + int i; + int is_user_ctxt_mechanism = (s_pkey_index < 0); + + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + /* Is the pkey = 0x0, or 0x8000? */ + if ((pkey & PKEY_LOW_15_MASK) == 0) + goto bad; + + /* + * For the kernel contexts only, if a qp is passed into the function, + * the most likely matching pkey has index qp->s_pkey_index + */ + if (!is_user_ctxt_mechanism && + egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) { + return 0; + } + + for (i = 0; i < MAX_PKEY_VALUES; i++) { + if (egress_pkey_matches_entry(pkey, ppd->pkeys[i])) + return 0; + } +bad: + /* + * For the user-context mechanism, the P_KEY check would only happen + * once per SDMA request, not once per packet. Therefore, there's no + * need to increment the counter for the user-context mechanism. 
+ */ + if (!is_user_ctxt_mechanism) { + incr_cntr64(&ppd->port_xmit_constraint_errors); + dd = ppd->dd; + if (!(dd->err_info_xmit_constraint.status & + OPA_EI_STATUS_SMASK)) { + dd->err_info_xmit_constraint.status |= + OPA_EI_STATUS_SMASK; + dd->err_info_xmit_constraint.slid = slid; + dd->err_info_xmit_constraint.pkey = pkey; + } + } + return 1; +} + +/** + * get_send_routine - choose an egress routine + * + * Choose an egress routine based on QP type + * and size + */ +static inline send_routine get_send_routine(struct rvt_qp *qp, + struct hfi1_pkt_state *ps) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + struct verbs_txreq *tx = ps->s_txreq; + + if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA))) + return dd->process_pio_send; + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + return dd->process_pio_send; + case IB_QPT_GSI: + case IB_QPT_UD: + break; + case IB_QPT_UC: + case IB_QPT_RC: { + if (piothreshold && + tx->s_cur_size <= min(piothreshold, qp->pmtu) && + (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) && + iowait_sdma_pending(&priv->s_iowait) == 0 && + !sdma_txreq_built(&tx->txreq)) + return dd->process_pio_send; + break; + } + default: + break; + } + return dd->process_dma_send; +} + +/** + * hfi1_verbs_send - send a packet + * @qp: the QP to send on + * @ps: the state of the packet to send + * + * Return zero if packet is sent or queued OK. + * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise. + */ +int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + send_routine sr; + int ret; + u16 pkey; + u32 slid; + + /* locate the pkey within the headers */ + if (ps->s_txreq->phdr.hdr.hdr_type) { + struct hfi1_16b_header *hdr = &ps->s_txreq->phdr.hdr.opah; + u8 l4 = hfi1_16B_get_l4(hdr); + + if (l4 == OPA_16B_L4_IB_GLOBAL) + ohdr = &hdr->u.l.oth; + else + ohdr = &hdr->u.oth; + slid = hfi1_16B_get_slid(hdr); + pkey = hfi1_16B_get_pkey(hdr); + } else { + struct ib_header *hdr = &ps->s_txreq->phdr.hdr.ibh; + u8 lnh = ib_get_lnh(hdr); + + if (lnh == HFI1_LRH_GRH) + ohdr = &hdr->u.l.oth; + else + ohdr = &hdr->u.oth; + slid = ib_get_slid(hdr); + pkey = ib_bth_get_pkey(ohdr); + } + + ps->opcode = ib_bth_get_opcode(ohdr); + sr = get_send_routine(qp, ps); + ret = egress_pkey_check(dd->pport, slid, pkey, + priv->s_sc, qp->s_pkey_index); + if (unlikely(ret)) { + /* + * The value we are returning here does not get propagated to + * the verbs caller. Thus we need to complete the request with + * error otherwise the caller could be sitting waiting on the + * completion event. Only do this for PIO. SDMA has its own + * mechanism for handling the errors. So for SDMA we can just + * return. + */ + if (sr == dd->process_pio_send) { + unsigned long flags; + + hfi1_cdbg(PIO, "%s() Failed. Completing with err", + __func__); + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + return -EINVAL; + } + if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait)) + return pio_wait(qp, + ps->s_txreq->psc, + ps, + RVT_S_WAIT_PIO_DRAIN); + return sr(qp, ps, 0); +} + +/** + * hfi1_fill_device_attr - Fill in rvt dev info device attributes. 
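get_send_routine() above chooses between the two egress engines, and for RC/UC the choice reduces to a single predicate. A condensed sketch of that predicate follows; prefer_pio() and its parameter names are illustrative stand-ins for the txreq and iowait state the real routine inspects, and the 256-byte threshold in the usage lines is only an example value.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Condensed form of the RC/UC branch of get_send_routine(): small packets
 * that fit under both the PIO threshold and the path MTU, with no SDMA
 * work pending and no SDMA descriptors already built, take the PIO engine;
 * everything else goes to SDMA.
 */
static bool prefer_pio(uint32_t payload, uint32_t piothreshold, uint32_t pmtu,
		       bool opcode_allows_pio, bool sdma_pending, bool sdma_built)
{
	uint32_t limit = piothreshold < pmtu ? piothreshold : pmtu;

	return piothreshold && payload <= limit && opcode_allows_pio &&
	       !sdma_pending && !sdma_built;
}

int main(void)
{
	/* with a hypothetical 256-byte threshold and a 4K PMTU */
	printf("%d\n", prefer_pio(128, 256, 4096, true, false, false));	/* 1 */
	printf("%d\n", prefer_pio(8192, 256, 4096, true, false, false));	/* 0 */
	return 0;
}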
+ * @dd: the device data structure + */ +static void hfi1_fill_device_attr(struct hfi1_devdata *dd) +{ + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + u32 ver = dd->dc8051_ver; + + memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props)); + + rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 32) | + ((u64)(dc8051_ver_min(ver)) << 16) | + (u64)dc8051_ver_patch(ver); + + rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE | + IB_DEVICE_MEM_MGT_EXTENSIONS | + IB_DEVICE_RDMA_NETDEV_OPA_VNIC; + rdi->dparms.props.page_size_cap = PAGE_SIZE; + rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; + rdi->dparms.props.vendor_part_id = dd->pcidev->device; + rdi->dparms.props.hw_ver = dd->minrev; + rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid; + rdi->dparms.props.max_mr_size = U64_MAX; + rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; + rdi->dparms.props.max_qp = hfi1_max_qps; + rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; + rdi->dparms.props.max_sge = hfi1_max_sges; + rdi->dparms.props.max_sge_rd = hfi1_max_sges; + rdi->dparms.props.max_cq = hfi1_max_cqs; + rdi->dparms.props.max_ah = hfi1_max_ahs; + rdi->dparms.props.max_cqe = hfi1_max_cqes; + rdi->dparms.props.max_mr = rdi->lkey_table.max; + rdi->dparms.props.max_fmr = rdi->lkey_table.max; + rdi->dparms.props.max_map_per_fmr = 32767; + rdi->dparms.props.max_pd = hfi1_max_pds; + rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC; + rdi->dparms.props.max_qp_init_rd_atom = 255; + rdi->dparms.props.max_srq = hfi1_max_srqs; + rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs; + rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges; + rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB; + rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd); + rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps; + rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached; + rdi->dparms.props.max_total_mcast_qp_attach = + rdi->dparms.props.max_mcast_qp_attach * + rdi->dparms.props.max_mcast_grp; +} + +static inline u16 opa_speed_to_ib(u16 in) +{ + u16 out = 0; + + if (in & OPA_LINK_SPEED_25G) + out |= IB_SPEED_EDR; + if (in & OPA_LINK_SPEED_12_5G) + out |= IB_SPEED_FDR; + + return out; +} + +/* + * Convert a single OPA link width (no multiple flags) to an IB value. + * A zero OPA link width means link down, which means the IB width value + * is a don't care. + */ +static inline u16 opa_width_to_ib(u16 in) +{ + switch (in) { + case OPA_LINK_WIDTH_1X: + /* map 2x and 3x to 1x as they don't exist in IB */ + case OPA_LINK_WIDTH_2X: + case OPA_LINK_WIDTH_3X: + return IB_WIDTH_1X; + default: /* link down or unknown, return our largest width */ + case OPA_LINK_WIDTH_4X: + return IB_WIDTH_4X; + } +} + +static int query_port(struct rvt_dev_info *rdi, u8 port_num, + struct ib_port_attr *props) +{ + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; + u32 lid = ppd->lid; + + /* props being zeroed by the caller, avoid zeroing it here */ + props->lid = lid ? 
lid : 0; + props->lmc = ppd->lmc; + /* OPA logical states match IB logical states */ + props->state = driver_lstate(ppd); + props->phys_state = driver_pstate(ppd); + props->gid_tbl_len = HFI1_GUIDS_PER_PORT; + props->active_width = (u8)opa_width_to_ib(ppd->link_width_active); + /* see rate_show() in ib core/sysfs.c */ + props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active); + props->max_vl_num = ppd->vls_supported; + + /* Once we are a "first class" citizen and have added the OPA MTUs to + * the core we can advertise the larger MTU enum to the ULPs, for now + * advertise only 4K. + * + * Those applications which are either OPA aware or pass the MTU enum + * from the Path Records to us will get the new 8k MTU. Those that + * attempt to process the MTU enum may fail in various ways. + */ + props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ? + 4096 : hfi1_max_mtu), IB_MTU_4096); + props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : + mtu_to_enum(ppd->ibmtu, IB_MTU_4096); + + /* + * sm_lid of 0xFFFF needs special handling so that it can + * be differentiated from a permissve LID of 0xFFFF. + * We set the grh_required flag here so the SA can program + * the DGID in the address handle appropriately + */ + if (props->sm_lid == be16_to_cpu(IB_LID_PERMISSIVE)) + props->grh_required = true; + + return 0; +} + +static int modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + struct hfi1_devdata *dd = dd_from_ibdev(device); + unsigned i; + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(device->node_desc, device_modify->node_desc, + IB_DEVICE_NODE_DESC_MAX); + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_ibport *ibp = &dd->pport[i].ibport_data; + + hfi1_node_desc_chg(ibp); + } + } + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { + ib_hfi1_sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_ibport *ibp = &dd->pport[i].ibport_data; + + hfi1_sys_guid_chg(ibp); + } + } + + ret = 0; + +bail: + return ret; +} + +static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num) +{ + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; + int ret; + + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0, + OPA_LINKDOWN_REASON_UNKNOWN); + ret = set_link_state(ppd, HLS_DN_DOWNDEF); + return ret; +} + +static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, + int guid_index, __be64 *guid) +{ + struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp); + + if (guid_index >= HFI1_GUIDS_PER_PORT) + return -EINVAL; + + *guid = get_sguid(ibp, guid_index); + return 0; +} + +/* + * convert ah port,sl to sc + */ +u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, rdma_ah_get_port_num(ah)); + + return ibp->sl_to_sc[rdma_ah_get_sl(ah)]; +} + +static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) +{ + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u8 sc5; + + if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + + /* test the mapping for validity */ + ibp = 
to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); + ppd = ppd_from_ibp(ibp); + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + dd = dd_from_ppd(ppd); + if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf) + return -EINVAL; + return 0; +} + +static void hfi1_notify_new_ah(struct ib_device *ibdev, + struct rdma_ah_attr *ah_attr, + struct rvt_ah *ah) +{ + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u8 sc5; + struct rdma_ah_attr *attr = &ah->attr; + + /* + * Do not trust reading anything from rvt_ah at this point as it is not + * done being setup. We can however modify things which we need to set. + */ + + ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); + ppd = ppd_from_ibp(ibp); + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)]; + hfi1_update_ah_attr(ibdev, attr); + hfi1_make_opa_lid(attr); + dd = dd_from_ppd(ppd); + ah->vl = sc_to_vlt(dd, sc5); + if (ah->vl < num_vls || ah->vl == 15) + ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu); +} + +/** + * hfi1_get_npkeys - return the size of the PKEY table for context 0 + * @dd: the hfi1_ib device + */ +unsigned hfi1_get_npkeys(struct hfi1_devdata *dd) +{ + return ARRAY_SIZE(dd->pport[0].pkeys); +} + +static void init_ibport(struct hfi1_pportdata *ppd) +{ + struct hfi1_ibport *ibp = &ppd->ibport_data; + size_t sz = ARRAY_SIZE(ibp->sl_to_sc); + int i; + + for (i = 0; i < sz; i++) { + ibp->sl_to_sc[i] = i; + ibp->sc_to_sl[i] = i; + } + + for (i = 0; i < RVT_MAX_TRAP_LISTS ; i++) + INIT_LIST_HEAD(&ibp->rvp.trap_lists[i].list); + timer_setup(&ibp->rvp.trap_timer, hfi1_handle_trap_timer, 0); + + spin_lock_init(&ibp->rvp.lock); + /* Set the prefix to the default value (see ch. 4.1.1) */ + ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; + ibp->rvp.sm_lid = 0; + /* + * Below should only set bits defined in OPA PortInfo.CapabilityMask + * and PortInfo.CapabilityMask3 + */ + ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP | + IB_PORT_CAP_MASK_NOTICE_SUP; + ibp->rvp.port_cap3_flags = OPA_CAP_MASK3_IsSharedSpaceSupported; + ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + RCU_INIT_POINTER(ibp->rvp.qp[0], NULL); + RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); +} + +static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct hfi1_ibdev *dev = dev_from_rdi(rdi); + u32 ver = dd_from_dev(dev)->dc8051_ver; + + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u", dc8051_ver_maj(ver), + dc8051_ver_min(ver), dc8051_ver_patch(ver)); +} + +static const char * const driver_cntr_names[] = { + /* must be element 0*/ + "DRIVER_KernIntr", + "DRIVER_ErrorIntr", + "DRIVER_Tx_Errs", + "DRIVER_Rcv_Errs", + "DRIVER_HW_Errs", + "DRIVER_NoPIOBufs", + "DRIVER_CtxtsOpen", + "DRIVER_RcvLen_Errs", + "DRIVER_EgrBufFull", + "DRIVER_EgrHdrFull" +}; + +static DEFINE_MUTEX(cntr_names_lock); /* protects the *_cntr_names bufers */ +static const char **dev_cntr_names; +static const char **port_cntr_names; +static int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names); +static int num_dev_cntrs; +static int num_port_cntrs; +static int cntr_names_initialized; + +/* + * Convert a list of names separated by '\n' into an array of NULL terminated + * strings. Optionally some entries can be reserved in the array to hold extra + * external strings. 
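The function that follows performs this conversion in a single allocation: the pointer array is laid out first and the copied name blob directly behind it, with every '\n' rewritten to '\0'. A minimal user-space sketch of the same layout is shown here; split_names() and the sample counter list are hypothetical, and the reserved "extra" slots used for the driver counters are omitted for brevity.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* one buffer: n pointers, then the '\0'-separated copy of the names */
static int split_names(const char *in, size_t len, int *num, const char ***out)
{
	char *buf, *p, **q;
	int i, n = 0;

	for (i = 0; i < (int)len; i++)
		if (in[i] == '\n')
			n++;

	buf = malloc(n * sizeof(char *) + len);
	if (!buf)
		return -1;

	p = buf + n * sizeof(char *);	/* strings live after the pointers */
	memcpy(p, in, len);

	q = (char **)buf;
	for (i = 0; i < n; i++) {
		q[i] = p;
		p = strchr(p, '\n');
		*p++ = '\0';
	}

	*num = n;
	*out = (const char **)buf;
	return 0;
}

int main(void)
{
	const char names[] = "RxPkts\nTxPkts\n";	/* hypothetical list */
	const char **v;
	int n;

	if (!split_names(names, sizeof(names) - 1, &n, &v)) {
		printf("%d names: %s, %s\n", n, v[0], v[1]);
		free((void *)v);
	}
	return 0;
}

A single kfree() in the teardown path is then enough to release both the pointer table and the strings it points at, which is why hfi1_unregister_ib_device() only frees dev_cntr_names and port_cntr_names.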
+ */ +static int init_cntr_names(const char *names_in, + const size_t names_len, + int num_extra_names, + int *num_cntrs, + const char ***cntr_names) +{ + char *names_out, *p, **q; + int i, n; + + n = 0; + for (i = 0; i < names_len; i++) + if (names_in[i] == '\n') + n++; + + names_out = kmalloc((n + num_extra_names) * sizeof(char *) + names_len, + GFP_KERNEL); + if (!names_out) { + *num_cntrs = 0; + *cntr_names = NULL; + return -ENOMEM; + } + + p = names_out + (n + num_extra_names) * sizeof(char *); + memcpy(p, names_in, names_len); + + q = (char **)names_out; + for (i = 0; i < n; i++) { + q[i] = p; + p = strchr(p, '\n'); + *p++ = '\0'; + } + + *num_cntrs = n; + *cntr_names = (const char **)names_out; + return 0; +} + +static struct rdma_hw_stats *alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + int i, err; + + mutex_lock(&cntr_names_lock); + if (!cntr_names_initialized) { + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + + err = init_cntr_names(dd->cntrnames, + dd->cntrnameslen, + num_driver_cntrs, + &num_dev_cntrs, + &dev_cntr_names); + if (err) { + mutex_unlock(&cntr_names_lock); + return NULL; + } + + for (i = 0; i < num_driver_cntrs; i++) + dev_cntr_names[num_dev_cntrs + i] = + driver_cntr_names[i]; + + err = init_cntr_names(dd->portcntrnames, + dd->portcntrnameslen, + 0, + &num_port_cntrs, + &port_cntr_names); + if (err) { + kfree(dev_cntr_names); + dev_cntr_names = NULL; + mutex_unlock(&cntr_names_lock); + return NULL; + } + cntr_names_initialized = 1; + } + mutex_unlock(&cntr_names_lock); + + if (!port_num) + return rdma_alloc_hw_stats_struct( + dev_cntr_names, + num_dev_cntrs + num_driver_cntrs, + RDMA_HW_STATS_DEFAULT_LIFESPAN); + else + return rdma_alloc_hw_stats_struct( + port_cntr_names, + num_port_cntrs, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static u64 hfi1_sps_ints(void) +{ + unsigned long flags; + struct hfi1_devdata *dd; + u64 sps_ints = 0; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + list_for_each_entry(dd, &hfi1_dev_list, list) { + sps_ints += get_all_cpu_total(dd->int_counter); + } + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + return sps_ints; +} + +static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u8 port, int index) +{ + u64 *values; + int count; + + if (!port) { + u64 *stats = (u64 *)&hfi1_stats; + int i; + + hfi1_read_cntrs(dd_from_ibdev(ibdev), NULL, &values); + values[num_dev_cntrs] = hfi1_sps_ints(); + for (i = 1; i < num_driver_cntrs; i++) + values[num_dev_cntrs + i] = stats[i]; + count = num_dev_cntrs + num_driver_cntrs; + } else { + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + hfi1_read_portcntrs(ppd_from_ibp(ibp), NULL, &values); + count = num_port_cntrs; + } + + memcpy(stats->value, values, count * sizeof(u64)); + return count; +} + +/** + * hfi1_register_ib_device - register our device with the infiniband core + * @dd: the device data structure + * Return 0 if successful, errno if unsuccessful. + */ +int hfi1_register_ib_device(struct hfi1_devdata *dd) +{ + struct hfi1_ibdev *dev = &dd->verbs_dev; + struct ib_device *ibdev = &dev->rdi.ibdev; + struct hfi1_pportdata *ppd = dd->pport; + struct hfi1_ibport *ibp = &ppd->ibport_data; + unsigned i; + int ret; + + for (i = 0; i < dd->num_pports; i++) + init_ibport(ppd + i); + + /* Only need to initialize non-zero fields. 
*/ + + timer_setup(&dev->mem_timer, mem_timer, 0); + + seqlock_init(&dev->iowait_lock); + seqlock_init(&dev->txwait_lock); + INIT_LIST_HEAD(&dev->txwait); + INIT_LIST_HEAD(&dev->memwait); + + ret = verbs_txreq_init(dev); + if (ret) + goto err_verbs_txreq; + + /* Use first-port GUID as node guid */ + ibdev->node_guid = get_sguid(ibp, HFI1_PORT_GUID_INDEX); + + /* + * The system image GUID is supposed to be the same for all + * HFIs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. + */ + if (!ib_hfi1_sys_image_guid) + ib_hfi1_sys_image_guid = ibdev->node_guid; + ibdev->owner = THIS_MODULE; + ibdev->phys_port_cnt = dd->num_pports; + ibdev->dev.parent = &dd->pcidev->dev; + ibdev->modify_device = modify_device; + ibdev->alloc_hw_stats = alloc_hw_stats; + ibdev->get_hw_stats = get_hw_stats; + ibdev->alloc_rdma_netdev = hfi1_vnic_alloc_rn; + + /* keep process mad in the driver */ + ibdev->process_mad = hfi1_process_mad; + ibdev->get_dev_fw_str = hfi1_get_dev_fw_str; + + strncpy(ibdev->node_desc, init_utsname()->nodename, + sizeof(ibdev->node_desc)); + + /* + * Fill in rvt info object. + */ + dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files; + dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev; + dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah; + dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah; + dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be; + dd->verbs_dev.rdi.driver_f.query_port_state = query_port; + dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port; + dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg; + /* + * Fill in rvt info device attributes. + */ + hfi1_fill_device_attr(dd); + + /* queue pair */ + dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size; + dd->verbs_dev.rdi.dparms.qpn_start = 0; + dd->verbs_dev.rdi.dparms.qpn_inc = 1; + dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift; + dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16; + dd->verbs_dev.rdi.dparms.qpn_res_end = + dd->verbs_dev.rdi.dparms.qpn_res_start + 65535; + dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC; + dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK; + dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT; + dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK; + dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA | + RDMA_CORE_CAP_OPA_AH; + dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE; + + dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc; + dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free; + dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps; + dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset; + dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send_from_rvt; + dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send; + dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send; + dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr; + dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp; + dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters; + dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue; + dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp; + dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp; + dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp; + dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu; + dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; + dd->verbs_dev.rdi.driver_f.modify_qp = 
hfi1_modify_qp; + dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; + dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + + /* completeion queue */ + snprintf(dd->verbs_dev.rdi.dparms.cq_name, + sizeof(dd->verbs_dev.rdi.dparms.cq_name), + "hfi1_cq%d", dd->unit); + dd->verbs_dev.rdi.dparms.node = dd->node; + + /* misc settings */ + dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */ + dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; + dd->verbs_dev.rdi.dparms.nports = dd->num_pports; + dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + + /* post send table */ + dd->verbs_dev.rdi.post_parms = hfi1_post_parms; + + ppd = dd->pport; + for (i = 0; i < dd->num_pports; i++, ppd++) + rvt_init_port(&dd->verbs_dev.rdi, + &ppd->ibport_data.rvp, + i, + ppd->pkeys); + + ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1); + if (ret) + goto err_verbs_txreq; + + ret = hfi1_verbs_register_sysfs(dd); + if (ret) + goto err_class; + + return ret; + +err_class: + rvt_unregister_device(&dd->verbs_dev.rdi); +err_verbs_txreq: + verbs_txreq_exit(dev); + dd_dev_err(dd, "cannot register verbs: %d!\n", -ret); + return ret; +} + +void hfi1_unregister_ib_device(struct hfi1_devdata *dd) +{ + struct hfi1_ibdev *dev = &dd->verbs_dev; + + hfi1_verbs_unregister_sysfs(dd); + + rvt_unregister_device(&dd->verbs_dev.rdi); + + if (!list_empty(&dev->txwait)) + dd_dev_err(dd, "txwait list not empty!\n"); + if (!list_empty(&dev->memwait)) + dd_dev_err(dd, "memwait list not empty!\n"); + + del_timer_sync(&dev->mem_timer); + verbs_txreq_exit(dev); + + mutex_lock(&cntr_names_lock); + kfree(dev_cntr_names); + kfree(port_cntr_names); + dev_cntr_names = NULL; + port_cntr_names = NULL; + cntr_names_initialized = 0; + mutex_unlock(&cntr_names_lock); +} + +void hfi1_cnp_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct ib_header *hdr = packet->hdr; + struct rvt_qp *qp = packet->qp; + u32 lqpn, rqpn = 0; + u16 rlid = 0; + u8 sl, sc5, svc_type; + + switch (packet->qp->ibqp.qp_type) { + case IB_QPT_UC: + rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_UC; + break; + case IB_QPT_RC: + rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_RC; + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + svc_type = IB_CC_SVCTYPE_UD; + break; + default: + ibp->rvp.n_pkt_drops++; + return; + } + + sc5 = hfi1_9B_get_sc5(hdr, packet->rhf); + sl = ibp->sc_to_sl[sc5]; + lqpn = qp->ibqp.qp_num; + + process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); +} diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h new file mode 100644 index 0000000..2d787b8 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -0,0 +1,449 @@ +/* + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef HFI1_VERBS_H +#define HFI1_VERBS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct hfi1_ctxtdata; +struct hfi1_pportdata; +struct hfi1_devdata; +struct hfi1_packet; + +#include "iowait.h" + +#define HFI1_MAX_RDMA_ATOMIC 16 + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define HFI1_UVERBS_ABI_VERSION 2 + +/* IB Performance Manager status values */ +#define IB_PMA_SAMPLE_STATUS_DONE 0x00 +#define IB_PMA_SAMPLE_STATUS_STARTED 0x01 +#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02 + +/* Mandatory IB performance counter select values. 
*/ +#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003) +#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004) +#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005) + +#define HFI1_VENDOR_IPG cpu_to_be16(0xFFA0) + +#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +#define OPA_BTH_MIG_REQ BIT(31) + +#define RC_OP(x) IB_OPCODE_RC_##x +#define UC_OP(x) IB_OPCODE_UC_##x + +/* flags passed by hfi1_ib_rcv() */ +enum { + HFI1_HAS_GRH = (1 << 0), +}; + +#define LRH_16B_BYTES (FIELD_SIZEOF(struct hfi1_16b_header, lrh)) +#define LRH_16B_DWORDS (LRH_16B_BYTES / sizeof(u32)) +#define LRH_9B_BYTES (FIELD_SIZEOF(struct ib_header, lrh)) +#define LRH_9B_DWORDS (LRH_9B_BYTES / sizeof(u32)) + +struct hfi1_16b_header { + u32 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ib_other_headers oth; + } l; + struct ib_other_headers oth; + } u; +} __packed; + +struct hfi1_opa_header { + union { + struct ib_header ibh; /* 9B header */ + struct hfi1_16b_header opah; /* 16B header */ + }; + u8 hdr_type; /* 9B or 16B */ +} __packed; + +struct hfi1_ahg_info { + u32 ahgdesc[2]; + u16 tx_flags; + u8 ahgcount; + u8 ahgidx; +}; + +struct hfi1_sdma_header { + __le64 pbc; + struct hfi1_opa_header hdr; +} __packed; + +/* + * hfi1 specific data structures that will be hidden from rvt after the queue + * pair is made common + */ +struct hfi1_qp_priv { + struct hfi1_ahg_info *s_ahg; /* ahg info for next header */ + struct sdma_engine *s_sde; /* current sde */ + struct send_context *s_sendcontext; /* current sendcontext */ + u8 s_sc; /* SC[0..4] for next packet */ + struct iowait s_iowait; + struct rvt_qp *owner; + u8 hdr_type; /* 9B or 16B */ +}; + +/* + * This structure is used to hold commonly lookedup and computed values during + * the send engine progress. 
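The hfi1_opa_header union defined above behaves as a tagged union: hdr_type records whether the 9B ib_header or the 16B header member is live, and the send routines branch on it before touching format-specific LRH fields. The stripped-down model below illustrates the dispatch; the struct and type names are stubs and carry none of the real header contents.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct ib_hdr_9b   { uint32_t lrh[2]; };	/* 8-byte 9B LRH   */
struct opa_hdr_16b { uint32_t lrh[4]; };	/* 16-byte 16B LRH */

struct opa_header_model {
	union {
		struct ib_hdr_9b   ibh;
		struct opa_hdr_16b opah;
	};
	uint8_t hdr_type;			/* 0 = 9B, else 16B */
};

/* dispatch on the tag, as the send path does before reading the LRH */
static size_t lrh_dwords(const struct opa_header_model *h)
{
	return h->hdr_type ? sizeof(h->opah.lrh) / sizeof(uint32_t)
			   : sizeof(h->ibh.lrh) / sizeof(uint32_t);
}

int main(void)
{
	struct opa_header_model h = { .hdr_type = 1 };

	printf("%zu\n", lrh_dwords(&h));	/* 4 dwords for a 16B packet */
	return 0;
}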
+ */ +struct hfi1_pkt_state { + struct hfi1_ibdev *dev; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct verbs_txreq *s_txreq; + unsigned long flags; + unsigned long timeout; + unsigned long timeout_int; + int cpu; + u8 opcode; + bool in_thread; + bool pkts_sent; +}; + +#define HFI1_PSN_CREDIT 16 + +struct hfi1_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct hfi1_opcode_stats_perctx { + struct hfi1_opcode_stats stats[256]; +}; + +static inline void inc_opstats( + u32 tlen, + struct hfi1_opcode_stats *stats) +{ +#ifdef CONFIG_DEBUG_FS + stats->n_bytes += tlen; + stats->n_packets++; +#endif +} + +struct hfi1_ibport { + struct rvt_qp __rcu *qp[2]; + struct rvt_ibport rvp; + + /* the first 16 entries are sl_to_vl for !OPA */ + u8 sl_to_sc[32]; + u8 sc_to_sl[32]; +}; + +struct hfi1_ibdev { + struct rvt_dev_info rdi; /* Must be first */ + + /* QP numbers are shared by all IB ports */ + /* protect txwait list */ + seqlock_t txwait_lock ____cacheline_aligned_in_smp; + struct list_head txwait; /* list for wait verbs_txreq */ + struct list_head memwait; /* list for wait kernel memory */ + struct kmem_cache *verbs_txreq_cache; + u64 n_txwait; + u64 n_kmem_wait; + + /* protect iowait lists */ + seqlock_t iowait_lock ____cacheline_aligned_in_smp; + u64 n_piowait; + u64 n_piodrain; + struct timer_list mem_timer; + +#ifdef CONFIG_DEBUG_FS + /* per HFI debugfs */ + struct dentry *hfi1_ibdev_dbg; + /* per HFI symlinks to above */ + struct dentry *hfi1_ibdev_link; +#ifdef CONFIG_FAULT_INJECTION + struct fault_opcode *fault_opcode; + struct fault_packet *fault_packet; + bool fault_suppress_err; +#endif +#endif +}; + +static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev) +{ + struct rvt_dev_info *rdi; + + rdi = container_of(ibdev, struct rvt_dev_info, ibdev); + return container_of(rdi, struct hfi1_ibdev, rdi); +} + +static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) +{ + struct hfi1_qp_priv *priv; + + priv = container_of(s_iowait, struct hfi1_qp_priv, s_iowait); + return priv->owner; +} + +/* + * This must be called with s_lock held. + */ +void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, + u32 qp1, u32 qp2, u32 lid1, u32 lid2); +void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num); +void hfi1_sys_guid_chg(struct hfi1_ibport *ibp); +void hfi1_node_desc_chg(struct hfi1_ibport *ibp); +int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, size_t in_mad_size, + struct ib_mad_hdr *out_mad, size_t *out_mad_size, + u16 *out_mad_pkey_index); + +/* + * The PSN_MASK and PSN_SHIFT allow for + * 1) comparing two PSNs + * 2) returning the PSN with any upper bits masked + * 3) returning the difference between to PSNs + * + * The number of significant bits in the PSN must + * necessarily be at least one bit less than + * the container holding the PSN. + */ +#define PSN_MASK 0x7FFFFFFF +#define PSN_SHIFT 1 +#define PSN_MODIFY_MASK 0xFFFFFF + +/* + * Compare two PSNs + * Returns an integer <, ==, or > than zero. 
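The shift trick used by cmp_psn() and delta_psn() below deserves a worked example: because PSNs occupy only 31 bits, shifting the signed 32-bit difference left by PSN_SHIFT discards the unused top bit, so ordering is evaluated modulo the 31-bit sequence space and wraps correctly at PSN_MASK. A small user-space check follows; like the kernel, it assumes gcc's usual semantics for shifting signed values.

#include <stdint.h>
#include <stdio.h>

#define PSN_MASK 0x7FFFFFFF
#define PSN_SHIFT 1

/* same expression as the driver's cmp_psn() */
static int cmp_psn(uint32_t a, uint32_t b)
{
	return ((int32_t)a - (int32_t)b) << PSN_SHIFT;
}

int main(void)
{
	/* 5 is three packets ahead of 2 */
	printf("%d\n", cmp_psn(5, 2) > 0);			/* 1 */
	/* PSN 0 immediately follows 0x7FFFFFFF after a wrap */
	printf("%d\n", cmp_psn(0, PSN_MASK) > 0);		/* 1 */
	/* without the shift, the wrapped case would look "behind" */
	printf("%d\n", ((int32_t)0 - (int32_t)PSN_MASK) > 0);	/* 0 */
	return 0;
}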
+ */ +static inline int cmp_psn(u32 a, u32 b) +{ + return (((int)a) - ((int)b)) << PSN_SHIFT; +} + +/* + * Return masked PSN + */ +static inline u32 mask_psn(u32 a) +{ + return a & PSN_MASK; +} + +/* + * Return delta between two PSNs + */ +static inline u32 delta_psn(u32 a, u32 b) +{ + return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; +} + +struct verbs_txreq; +void hfi1_put_txreq(struct verbs_txreq *tx); + +int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, + bool release, bool copy_last); + +void hfi1_cnp_rcv(struct hfi1_packet *packet); + +void hfi1_uc_rcv(struct hfi1_packet *packet); + +void hfi1_rc_rcv(struct hfi1_packet *packet); + +void hfi1_rc_hdrerr( + struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet, + struct rvt_qp *qp); + +u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); + +void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah); + +void hfi1_ud_rcv(struct hfi1_packet *packet); + +int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey); + +int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only); + +void hfi1_migrate_qp(struct rvt_qp *qp); + +int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait); +int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); + +extern const u32 rc_only_opcode; +extern const u32 uc_only_opcode; + +int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet); + +u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, + const struct ib_global_route *grh, u32 hwords, u32 nwords); + +void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps); + +void _hfi1_do_send(struct work_struct *work); + +void hfi1_do_send_from_rvt(struct rvt_qp *qp); + +void hfi1_do_send(struct rvt_qp *qp, bool in_thread); + +void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status); + +void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn); + +int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +int hfi1_register_ib_device(struct hfi1_devdata *); + +void hfi1_unregister_ib_device(struct hfi1_devdata *); + +void hfi1_ib_rcv(struct hfi1_packet *packet); + +void hfi1_16B_rcv(struct hfi1_packet *packet); + +unsigned hfi1_get_npkeys(struct hfi1_devdata *); + +int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc); + +int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc); + +int hfi1_wss_init(void); +void hfi1_wss_exit(void); + +/* platform specific: return the lowest level cache (llc) size, in KiB */ +static inline int wss_llc_size(void) +{ + /* assume that the boot CPU value is universal for all CPUs */ + return boot_cpu_data.x86_cache_size; +} + +/* platform specific: cacheless copy */ +static inline void cacheless_memcpy(void *dst, void *src, size_t n) +{ + /* + * Use the only available X64 cacheless copy. Add a __user cast + * to quiet sparse. The src agument is already in the kernel so + * there are no security issues. 
The extra fault recovery machinery + * is not invoked. + */ + __copy_user_nocache(dst, (void __user *)src, n, 0); +} + +static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) +{ + return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ); +} + +extern const enum ib_wc_opcode ib_hfi1_wc_opcode[]; + +extern const u8 hdr_len_by_opcode[]; + +extern const int ib_rvt_state_ops[]; + +extern __be64 ib_hfi1_sys_image_guid; /* in network order */ + +extern unsigned int hfi1_max_cqes; + +extern unsigned int hfi1_max_cqs; + +extern unsigned int hfi1_max_qp_wrs; + +extern unsigned int hfi1_max_qps; + +extern unsigned int hfi1_max_sges; + +extern unsigned int hfi1_max_mcast_grps; + +extern unsigned int hfi1_max_mcast_qp_attached; + +extern unsigned int hfi1_max_srqs; + +extern unsigned int hfi1_max_srq_sges; + +extern unsigned int hfi1_max_srq_wrs; + +extern unsigned short piothreshold; + +extern const u32 ib_hfi1_rnr_table[]; + +#endif /* HFI1_VERBS_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c new file mode 100644 index 0000000..873e48e --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.c @@ -0,0 +1,141 @@ +/* + * Copyright(c) 2016 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include "hfi.h" +#include "verbs_txreq.h" +#include "qp.h" +#include "trace.h" + +#define TXREQ_LEN 24 + +void hfi1_put_txreq(struct verbs_txreq *tx) +{ + struct hfi1_ibdev *dev; + struct rvt_qp *qp; + unsigned long flags; + unsigned int seq; + struct hfi1_qp_priv *priv; + + qp = tx->qp; + dev = to_idev(qp->ibqp.device); + + if (tx->mr) + rvt_put_mr(tx->mr); + + sdma_txclean(dd_from_dev(dev), &tx->txreq); + + /* Free verbs_txreq and return to slab cache */ + kmem_cache_free(dev->verbs_txreq_cache, tx); + + do { + seq = read_seqbegin(&dev->txwait_lock); + if (!list_empty(&dev->txwait)) { + struct iowait *wait; + + write_seqlock_irqsave(&dev->txwait_lock, flags); + wait = list_first_entry(&dev->txwait, struct iowait, + list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + /* refcount held until actual wake up */ + write_sequnlock_irqrestore(&dev->txwait_lock, flags); + hfi1_qp_wakeup(qp, RVT_S_WAIT_TX); + break; + } + } while (read_seqretry(&dev->txwait_lock, seq)); +} + +struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + struct verbs_txreq *tx = ERR_PTR(-EBUSY); + + write_seqlock(&dev->txwait_lock); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + struct hfi1_qp_priv *priv; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); + if (tx) + goto out; + priv = qp->priv; + if (list_empty(&priv->s_iowait.list)) { + dev->n_txwait++; + qp->s_flags |= RVT_S_WAIT_TX; + list_add_tail(&priv->s_iowait.list, &dev->txwait); + priv->s_iowait.lock = &dev->txwait_lock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX); + rvt_get_qp(qp); + } + qp->s_flags &= ~RVT_S_BUSY; + } +out: + write_sequnlock(&dev->txwait_lock); + return tx; +} + +int verbs_txreq_init(struct hfi1_ibdev *dev) +{ + char buf[TXREQ_LEN]; + struct hfi1_devdata *dd = dd_from_dev(dev); + + snprintf(buf, sizeof(buf), "hfi1_%u_vtxreq_cache", dd->unit); + dev->verbs_txreq_cache = kmem_cache_create(buf, + sizeof(struct verbs_txreq), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!dev->verbs_txreq_cache) + return -ENOMEM; + return 0; +} + +void verbs_txreq_exit(struct hfi1_ibdev *dev) +{ + kmem_cache_destroy(dev->verbs_txreq_cache); + dev->verbs_txreq_cache = NULL; +} diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h new file mode 100644 index 0000000..729244c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -0,0 +1,127 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef HFI1_VERBS_TXREQ_H +#define HFI1_VERBS_TXREQ_H + +#include +#include + +#include "verbs.h" +#include "sdma_txreq.h" +#include "iowait.h" + +struct verbs_txreq { + struct hfi1_sdma_header phdr; + struct sdma_txreq txreq; + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_mregion *mr; + struct rvt_sge_state *ss; + struct sdma_engine *sde; + struct send_context *psc; + u16 hdr_dwords; + u16 s_cur_size; +}; + +struct hfi1_ibdev; +struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp); + +static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp) + __must_hold(&qp->slock) +{ + struct verbs_txreq *tx; + struct hfi1_qp_priv *priv = qp->priv; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); + if (unlikely(!tx)) { + /* call slow path to get the lock */ + tx = __get_txreq(dev, qp); + if (IS_ERR(tx)) + return tx; + } + tx->qp = qp; + tx->mr = NULL; + tx->sde = priv->s_sde; + tx->psc = priv->s_sendcontext; + /* so that we can test if the sdma decriptors are there */ + tx->txreq.num_desc = 0; + /* Set the header type */ + tx->phdr.hdr.hdr_type = priv->hdr_type; + return tx; +} + +static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx) +{ + return &tx->txreq; +} + +static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp) +{ + struct sdma_txreq *stx; + struct hfi1_qp_priv *priv = qp->priv; + + stx = iowait_get_txhead(&priv->s_iowait); + if (stx) + return container_of(stx, struct verbs_txreq, txreq); + return NULL; +} + +static inline bool verbs_txreq_queued(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + return iowait_packet_queued(&priv->s_iowait); +} + +void hfi1_put_txreq(struct verbs_txreq *tx); +int verbs_txreq_init(struct hfi1_ibdev *dev); +void verbs_txreq_exit(struct hfi1_ibdev *dev); + +#endif /* HFI1_VERBS_TXREQ_H */ diff --git a/drivers/infiniband/hw/hfi1/vnic.h b/drivers/infiniband/hw/hfi1/vnic.h new file mode 100644 index 0000000..5ae7815 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/vnic.h @@ -0,0 +1,169 @@ +#ifndef _HFI1_VNIC_H +#define _HFI1_VNIC_H +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. 
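get_txreq() above is the fast path: a GFP_ATOMIC slab allocation with no lock taken. Only when that fails does __get_txreq() retry under the txwait seqlock and, if memory is still unavailable, park the QP on the txwait list so a later hfi1_put_txreq() can wake it. The rough outline below sketches that shape in user space; get_txreq_sketch() and the simulated out-of-memory flag are hypothetical, and an ordinary mutex plus a counter stand in for the seqlock and the wait list.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct txreq { int id; };

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static int n_waiting;			/* stand-in for the txwait list */

/* fast path: lock-free, may fail under memory pressure (simulated here) */
static struct txreq *alloc_fast(bool oom)
{
	return oom ? NULL : calloc(1, sizeof(struct txreq));
}

static struct txreq *get_txreq_sketch(bool oom)
{
	struct txreq *tx = alloc_fast(oom);

	if (tx)
		return tx;
	/*
	 * Slow path: retry under the same lock the free path takes, so a
	 * free racing with the failed allocation cannot be missed, and only
	 * then queue the requester (here merely counted).
	 */
	pthread_mutex_lock(&wait_lock);
	tx = alloc_fast(oom);
	if (!tx)
		n_waiting++;
	pthread_mutex_unlock(&wait_lock);
	return tx;
}

int main(void)
{
	struct txreq *tx = get_txreq_sketch(false);

	printf("fast path ok: %d, waiters: %d\n", tx != NULL, n_waiting);
	free(tx);
	get_txreq_sketch(true);
	printf("after simulated OOM, waiters: %d\n", n_waiting);
	return 0;
}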
+ * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include "hfi.h" +#include "sdma.h" + +#define HFI1_VNIC_MAX_TXQ 16 +#define HFI1_VNIC_MAX_PAD 12 + +/* L4 header definitions */ +#define HFI1_VNIC_L4_HDR_OFFSET OPA_VNIC_L2_HDR_LEN + +#define HFI1_VNIC_GET_L4_HDR(data) \ + (*((u16 *)((u8 *)(data) + HFI1_VNIC_L4_HDR_OFFSET))) + +#define HFI1_VNIC_GET_VESWID(data) \ + (HFI1_VNIC_GET_L4_HDR(data) & 0xFFF) + +/* Service class */ +#define HFI1_VNIC_SC_OFFSET_LOW 6 +#define HFI1_VNIC_SC_OFFSET_HI 7 +#define HFI1_VNIC_SC_SHIFT 4 + +#define HFI1_VNIC_MAX_QUEUE 16 + +/** + * struct hfi1_vnic_sdma - VNIC per Tx ring SDMA information + * @dd - device data pointer + * @sde - sdma engine + * @vinfo - vnic info pointer + * @wait - iowait structure + * @stx - sdma tx request + * @state - vnic Tx ring SDMA state + * @q_idx - vnic Tx queue index + */ +struct hfi1_vnic_sdma { + struct hfi1_devdata *dd; + struct sdma_engine *sde; + struct hfi1_vnic_vport_info *vinfo; + struct iowait wait; + struct sdma_txreq stx; + unsigned int state; + u8 q_idx; + bool pkts_sent; +}; + +/** + * struct hfi1_vnic_rx_queue - HFI1 VNIC receive queue + * @idx: queue index + * @vinfo: pointer to vport information + * @netdev: network device + * @napi: netdev napi structure + * @skbq: queue of received socket buffers + */ +struct hfi1_vnic_rx_queue { + u8 idx; + struct hfi1_vnic_vport_info *vinfo; + struct net_device *netdev; + struct napi_struct napi; + struct sk_buff_head skbq; +}; + +/** + * struct hfi1_vnic_vport_info - HFI1 VNIC virtual port information + * @dd: device data pointer + * @netdev: net device pointer + * @flags: state flags + * @lock: vport lock + * @num_tx_q: number of transmit queues + * @num_rx_q: number of receive queues + * @vesw_id: virtual switch id + * @rxq: Array of receive queues + * @stats: per queue stats + * @sdma: VNIC SDMA structure per TXQ + */ +struct hfi1_vnic_vport_info { + struct hfi1_devdata *dd; + struct net_device *netdev; + unsigned long flags; + + /* Lock used around state updates */ + struct mutex lock; + + u8 num_tx_q; + u8 num_rx_q; + u16 vesw_id; + struct hfi1_vnic_rx_queue rxq[HFI1_NUM_VNIC_CTXT]; + + struct opa_vnic_stats stats[HFI1_VNIC_MAX_QUEUE]; + struct hfi1_vnic_sdma sdma[HFI1_VNIC_MAX_TXQ]; +}; + +#define v_dbg(format, arg...) \ + netdev_dbg(vinfo->netdev, format, ## arg) +#define v_err(format, arg...) \ + netdev_err(vinfo->netdev, format, ## arg) +#define v_info(format, arg...) 
\ + netdev_info(vinfo->netdev, format, ## arg) + +/* vnic hfi1 internal functions */ +void hfi1_vnic_setup(struct hfi1_devdata *dd); +void hfi1_vnic_cleanup(struct hfi1_devdata *dd); +int hfi1_vnic_txreq_init(struct hfi1_devdata *dd); +void hfi1_vnic_txreq_deinit(struct hfi1_devdata *dd); + +void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet); +void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo); +bool hfi1_vnic_sdma_write_avail(struct hfi1_vnic_vport_info *vinfo, + u8 q_idx); + +/* vnic rdma netdev operations */ +struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, + u8 port_num, + enum rdma_netdev_t type, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)); +int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, + struct hfi1_vnic_vport_info *vinfo, + struct sk_buff *skb, u64 pbc, u8 plen); + +#endif /* _HFI1_VNIC_H */ diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c new file mode 100644 index 0000000..5d65582 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -0,0 +1,859 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +/* + * This file contains HFI1 support for VNIC functionality + */ + +#include +#include + +#include "vnic.h" + +#define HFI_TX_TIMEOUT_MS 1000 + +#define HFI1_VNIC_RCV_Q_SIZE 1024 + +#define HFI1_VNIC_UP 0 + +static DEFINE_SPINLOCK(vport_cntr_lock); + +static int setup_vnic_ctxt(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt) +{ + unsigned int rcvctrl_ops = 0; + int ret; + + uctxt->do_interrupt = &handle_receive_interrupt; + + /* Now allocate the RcvHdr queue and eager buffers. */ + ret = hfi1_create_rcvhdrq(dd, uctxt); + if (ret) + goto done; + + ret = hfi1_setup_eagerbufs(uctxt); + if (ret) + goto done; + + if (uctxt->rcvhdrtail_kvaddr) + clear_rcvhdrtail(uctxt); + + rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; + rcvctrl_ops |= HFI1_RCVCTRL_INTRAVAIL_ENB; + + if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR)) + rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL)) + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; + + hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt); +done: + return ret; +} + +static int allocate_vnic_ctxt(struct hfi1_devdata *dd, + struct hfi1_ctxtdata **vnic_ctxt) +{ + struct hfi1_ctxtdata *uctxt; + int ret; + + if (dd->flags & HFI1_FROZEN) + return -EIO; + + ret = hfi1_create_ctxtdata(dd->pport, dd->node, &uctxt); + if (ret < 0) { + dd_dev_err(dd, "Unable to create ctxtdata, failing open\n"); + return -ENOMEM; + } + + uctxt->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | + HFI1_CAP_KGET(NODROP_RHQ_FULL) | + HFI1_CAP_KGET(NODROP_EGR_FULL) | + HFI1_CAP_KGET(DMA_RTAIL); + uctxt->seq_cnt = 1; + uctxt->is_vnic = true; + + if (dd->num_msix_entries) + hfi1_set_vnic_msix_info(uctxt); + + hfi1_stats.sps_ctxts++; + dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt); + *vnic_ctxt = uctxt; + + return 0; +} + +static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, + struct hfi1_ctxtdata *uctxt) +{ + dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt); + flush_wc(); + + if (dd->num_msix_entries) + hfi1_reset_vnic_msix_info(uctxt); + + /* + * Disable receive context and interrupt available, reset all + * RcvCtxtCtrl bits to default values. 
+ */ + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_TIDFLOW_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS | + HFI1_RCVCTRL_NO_RHQ_DROP_DIS | + HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); + + uctxt->event_flags = 0; + + hfi1_clear_tids(uctxt); + hfi1_clear_ctxt_pkey(dd, uctxt); + + hfi1_stats.sps_ctxts--; + + hfi1_free_ctxt(uctxt); +} + +void hfi1_vnic_setup(struct hfi1_devdata *dd) +{ + idr_init(&dd->vnic.vesw_idr); +} + +void hfi1_vnic_cleanup(struct hfi1_devdata *dd) +{ + idr_destroy(&dd->vnic.vesw_idr); +} + +#define SUM_GRP_COUNTERS(stats, qstats, x_grp) do { \ + u64 *src64, *dst64; \ + for (src64 = &qstats->x_grp.unicast, \ + dst64 = &stats->x_grp.unicast; \ + dst64 <= &stats->x_grp.s_1519_max;) { \ + *dst64++ += *src64++; \ + } \ + } while (0) + +/* hfi1_vnic_update_stats - update statistics */ +static void hfi1_vnic_update_stats(struct hfi1_vnic_vport_info *vinfo, + struct opa_vnic_stats *stats) +{ + struct net_device *netdev = vinfo->netdev; + u8 i; + + /* add tx counters on different queues */ + for (i = 0; i < vinfo->num_tx_q; i++) { + struct opa_vnic_stats *qstats = &vinfo->stats[i]; + struct rtnl_link_stats64 *qnstats = &vinfo->stats[i].netstats; + + stats->netstats.tx_fifo_errors += qnstats->tx_fifo_errors; + stats->netstats.tx_carrier_errors += qnstats->tx_carrier_errors; + stats->tx_drop_state += qstats->tx_drop_state; + stats->tx_dlid_zero += qstats->tx_dlid_zero; + + SUM_GRP_COUNTERS(stats, qstats, tx_grp); + stats->netstats.tx_packets += qnstats->tx_packets; + stats->netstats.tx_bytes += qnstats->tx_bytes; + } + + /* add rx counters on different queues */ + for (i = 0; i < vinfo->num_rx_q; i++) { + struct opa_vnic_stats *qstats = &vinfo->stats[i]; + struct rtnl_link_stats64 *qnstats = &vinfo->stats[i].netstats; + + stats->netstats.rx_fifo_errors += qnstats->rx_fifo_errors; + stats->netstats.rx_nohandler += qnstats->rx_nohandler; + stats->rx_drop_state += qstats->rx_drop_state; + stats->rx_oversize += qstats->rx_oversize; + stats->rx_runt += qstats->rx_runt; + + SUM_GRP_COUNTERS(stats, qstats, rx_grp); + stats->netstats.rx_packets += qnstats->rx_packets; + stats->netstats.rx_bytes += qnstats->rx_bytes; + } + + stats->netstats.tx_errors = stats->netstats.tx_fifo_errors + + stats->netstats.tx_carrier_errors + + stats->tx_drop_state + stats->tx_dlid_zero; + stats->netstats.tx_dropped = stats->netstats.tx_errors; + + stats->netstats.rx_errors = stats->netstats.rx_fifo_errors + + stats->netstats.rx_nohandler + + stats->rx_drop_state + stats->rx_oversize + + stats->rx_runt; + stats->netstats.rx_dropped = stats->netstats.rx_errors; + + netdev->stats.tx_packets = stats->netstats.tx_packets; + netdev->stats.tx_bytes = stats->netstats.tx_bytes; + netdev->stats.tx_fifo_errors = stats->netstats.tx_fifo_errors; + netdev->stats.tx_carrier_errors = stats->netstats.tx_carrier_errors; + netdev->stats.tx_errors = stats->netstats.tx_errors; + netdev->stats.tx_dropped = stats->netstats.tx_dropped; + + netdev->stats.rx_packets = stats->netstats.rx_packets; + netdev->stats.rx_bytes = stats->netstats.rx_bytes; + netdev->stats.rx_fifo_errors = stats->netstats.rx_fifo_errors; + netdev->stats.multicast = stats->rx_grp.mcastbcast; + netdev->stats.rx_length_errors = stats->rx_oversize + stats->rx_runt; + netdev->stats.rx_errors = stats->netstats.rx_errors; + netdev->stats.rx_dropped = stats->netstats.rx_dropped; +} + +/* update_len_counters - update pkt's len histogram counters */ +static inline void update_len_counters(struct opa_vnic_grp_stats *grp, + int len) +{ + /* 
account for 4 byte FCS */ + if (len >= 1515) + grp->s_1519_max++; + else if (len >= 1020) + grp->s_1024_1518++; + else if (len >= 508) + grp->s_512_1023++; + else if (len >= 252) + grp->s_256_511++; + else if (len >= 124) + grp->s_128_255++; + else if (len >= 61) + grp->s_65_127++; + else + grp->s_64++; +} + +/* hfi1_vnic_update_tx_counters - update transmit counters */ +static void hfi1_vnic_update_tx_counters(struct hfi1_vnic_vport_info *vinfo, + u8 q_idx, struct sk_buff *skb, int err) +{ + struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); + struct opa_vnic_stats *stats = &vinfo->stats[q_idx]; + struct opa_vnic_grp_stats *tx_grp = &stats->tx_grp; + u16 vlan_tci; + + stats->netstats.tx_packets++; + stats->netstats.tx_bytes += skb->len + ETH_FCS_LEN; + + update_len_counters(tx_grp, skb->len); + + /* rest of the counts are for good packets only */ + if (unlikely(err)) + return; + + if (is_multicast_ether_addr(mac_hdr->h_dest)) + tx_grp->mcastbcast++; + else + tx_grp->unicast++; + + if (!__vlan_get_tag(skb, &vlan_tci)) + tx_grp->vlan++; + else + tx_grp->untagged++; +} + +/* hfi1_vnic_update_rx_counters - update receive counters */ +static void hfi1_vnic_update_rx_counters(struct hfi1_vnic_vport_info *vinfo, + u8 q_idx, struct sk_buff *skb, int err) +{ + struct ethhdr *mac_hdr = (struct ethhdr *)skb->data; + struct opa_vnic_stats *stats = &vinfo->stats[q_idx]; + struct opa_vnic_grp_stats *rx_grp = &stats->rx_grp; + u16 vlan_tci; + + stats->netstats.rx_packets++; + stats->netstats.rx_bytes += skb->len + ETH_FCS_LEN; + + update_len_counters(rx_grp, skb->len); + + /* rest of the counts are for good packets only */ + if (unlikely(err)) + return; + + if (is_multicast_ether_addr(mac_hdr->h_dest)) + rx_grp->mcastbcast++; + else + rx_grp->unicast++; + + if (!__vlan_get_tag(skb, &vlan_tci)) + rx_grp->vlan++; + else + rx_grp->untagged++; +} + +/* This function is overloaded for opa_vnic specific implementation */ +static void hfi1_vnic_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct opa_vnic_stats *vstats = (struct opa_vnic_stats *)stats; + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + + hfi1_vnic_update_stats(vinfo, vstats); +} + +static u64 create_bypass_pbc(u32 vl, u32 dw_len) +{ + u64 pbc; + + pbc = ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT) + | PBC_INSERT_BYPASS_ICRC | PBC_CREDIT_RETURN + | PBC_PACKET_BYPASS + | ((vl & PBC_VL_MASK) << PBC_VL_SHIFT) + | (dw_len & PBC_LENGTH_DWS_MASK) << PBC_LENGTH_DWS_SHIFT; + + return pbc; +} + +/* hfi1_vnic_maybe_stop_tx - stop tx queue if required */ +static void hfi1_vnic_maybe_stop_tx(struct hfi1_vnic_vport_info *vinfo, + u8 q_idx) +{ + netif_stop_subqueue(vinfo->netdev, q_idx); + if (!hfi1_vnic_sdma_write_avail(vinfo, q_idx)) + return; + + netif_start_subqueue(vinfo->netdev, q_idx); +} + +static netdev_tx_t hfi1_netdev_start_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + u8 pad_len, q_idx = skb->queue_mapping; + struct hfi1_devdata *dd = vinfo->dd; + struct opa_vnic_skb_mdata *mdata; + u32 pkt_len, total_len; + int err = -EINVAL; + u64 pbc; + + v_dbg("xmit: queue %d skb len %d\n", q_idx, skb->len); + if (unlikely(!netif_oper_up(netdev))) { + vinfo->stats[q_idx].tx_drop_state++; + goto tx_finish; + } + + /* take out meta data */ + mdata = (struct opa_vnic_skb_mdata *)skb->data; + skb_pull(skb, sizeof(*mdata)); + if (unlikely(mdata->flags & OPA_VNIC_SKB_MDATA_ENCAP_ERR)) { + vinfo->stats[q_idx].tx_dlid_zero++; 
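+ /* err is still -EINVAL here; hfi1_vnic_update_tx_counters() will skip the good-packet counters */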
+ goto tx_finish; + } + + /* add tail padding (for 8 bytes size alignment) and icrc */ + pad_len = -(skb->len + OPA_VNIC_ICRC_TAIL_LEN) & 0x7; + pad_len += OPA_VNIC_ICRC_TAIL_LEN; + + /* + * pkt_len is how much data we have to write, includes header and data. + * total_len is length of the packet in Dwords plus the PBC should not + * include the CRC. + */ + pkt_len = (skb->len + pad_len) >> 2; + total_len = pkt_len + 2; /* PBC + packet */ + + pbc = create_bypass_pbc(mdata->vl, total_len); + + skb_get(skb); + v_dbg("pbc 0x%016llX len %d pad_len %d\n", pbc, skb->len, pad_len); + err = dd->process_vnic_dma_send(dd, q_idx, vinfo, skb, pbc, pad_len); + if (unlikely(err)) { + if (err == -ENOMEM) + vinfo->stats[q_idx].netstats.tx_fifo_errors++; + else if (err != -EBUSY) + vinfo->stats[q_idx].netstats.tx_carrier_errors++; + } + /* remove the header before updating tx counters */ + skb_pull(skb, OPA_VNIC_HDR_LEN); + + if (unlikely(err == -EBUSY)) { + hfi1_vnic_maybe_stop_tx(vinfo, q_idx); + dev_kfree_skb_any(skb); + return NETDEV_TX_BUSY; + } + +tx_finish: + /* update tx counters */ + hfi1_vnic_update_tx_counters(vinfo, q_idx, skb, err); + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; +} + +static u16 hfi1_vnic_select_queue(struct net_device *netdev, + struct sk_buff *skb, + void *accel_priv, + select_queue_fallback_t fallback) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + struct opa_vnic_skb_mdata *mdata; + struct sdma_engine *sde; + + mdata = (struct opa_vnic_skb_mdata *)skb->data; + sde = sdma_select_engine_vl(vinfo->dd, mdata->entropy, mdata->vl); + return sde->this_idx; +} + +/* hfi1_vnic_decap_skb - strip OPA header from the skb (ethernet) packet */ +static inline int hfi1_vnic_decap_skb(struct hfi1_vnic_rx_queue *rxq, + struct sk_buff *skb) +{ + struct hfi1_vnic_vport_info *vinfo = rxq->vinfo; + int max_len = vinfo->netdev->mtu + VLAN_ETH_HLEN; + int rc = -EFAULT; + + skb_pull(skb, OPA_VNIC_HDR_LEN); + + /* Validate Packet length */ + if (unlikely(skb->len > max_len)) + vinfo->stats[rxq->idx].rx_oversize++; + else if (unlikely(skb->len < ETH_ZLEN)) + vinfo->stats[rxq->idx].rx_runt++; + else + rc = 0; + return rc; +} + +static inline struct sk_buff *hfi1_vnic_get_skb(struct hfi1_vnic_rx_queue *rxq) +{ + unsigned char *pad_info; + struct sk_buff *skb; + + skb = skb_dequeue(&rxq->skbq); + if (unlikely(!skb)) + return NULL; + + /* remove tail padding and icrc */ + pad_info = skb->data + skb->len - 1; + skb_trim(skb, (skb->len - OPA_VNIC_ICRC_TAIL_LEN - + ((*pad_info) & 0x7))); + + return skb; +} + +/* hfi1_vnic_handle_rx - handle skb receive */ +static void hfi1_vnic_handle_rx(struct hfi1_vnic_rx_queue *rxq, + int *work_done, int work_to_do) +{ + struct hfi1_vnic_vport_info *vinfo = rxq->vinfo; + struct sk_buff *skb; + int rc; + + while (1) { + if (*work_done >= work_to_do) + break; + + skb = hfi1_vnic_get_skb(rxq); + if (unlikely(!skb)) + break; + + rc = hfi1_vnic_decap_skb(rxq, skb); + /* update rx counters */ + hfi1_vnic_update_rx_counters(vinfo, rxq->idx, skb, rc); + if (unlikely(rc)) { + dev_kfree_skb_any(skb); + continue; + } + + skb_checksum_none_assert(skb); + skb->protocol = eth_type_trans(skb, rxq->netdev); + + napi_gro_receive(&rxq->napi, skb); + (*work_done)++; + } +} + +/* hfi1_vnic_napi - napi receive polling callback function */ +static int hfi1_vnic_napi(struct napi_struct *napi, int budget) +{ + struct hfi1_vnic_rx_queue *rxq = container_of(napi, + struct hfi1_vnic_rx_queue, napi); + struct hfi1_vnic_vport_info *vinfo = rxq->vinfo; + int work_done = 0; 
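+ /* drain up to 'budget' packets from the rx queue; NAPI is completed only when fewer than budget were handled */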
+ + v_dbg("napi %d budget %d\n", rxq->idx, budget); + hfi1_vnic_handle_rx(rxq, &work_done, budget); + + v_dbg("napi %d work_done %d\n", rxq->idx, work_done); + if (work_done < budget) + napi_complete(napi); + + return work_done; +} + +void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) +{ + struct hfi1_devdata *dd = packet->rcd->dd; + struct hfi1_vnic_vport_info *vinfo = NULL; + struct hfi1_vnic_rx_queue *rxq; + struct sk_buff *skb; + int l4_type, vesw_id = -1; + u8 q_idx; + + l4_type = hfi1_16B_get_l4(packet->ebuf); + if (likely(l4_type == OPA_16B_L4_ETHR)) { + vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf); + vinfo = idr_find(&dd->vnic.vesw_idr, vesw_id); + + /* + * In case of invalid vesw id, count the error on + * the first available vport. + */ + if (unlikely(!vinfo)) { + struct hfi1_vnic_vport_info *vinfo_tmp; + int id_tmp = 0; + + vinfo_tmp = idr_get_next(&dd->vnic.vesw_idr, &id_tmp); + if (vinfo_tmp) { + spin_lock(&vport_cntr_lock); + vinfo_tmp->stats[0].netstats.rx_nohandler++; + spin_unlock(&vport_cntr_lock); + } + } + } + + if (unlikely(!vinfo)) { + dd_dev_warn(dd, "vnic rcv err: l4 %d vesw id %d ctx %d\n", + l4_type, vesw_id, packet->rcd->ctxt); + return; + } + + q_idx = packet->rcd->vnic_q_idx; + rxq = &vinfo->rxq[q_idx]; + if (unlikely(!netif_oper_up(vinfo->netdev))) { + vinfo->stats[q_idx].rx_drop_state++; + skb_queue_purge(&rxq->skbq); + return; + } + + if (unlikely(skb_queue_len(&rxq->skbq) > HFI1_VNIC_RCV_Q_SIZE)) { + vinfo->stats[q_idx].netstats.rx_fifo_errors++; + return; + } + + skb = netdev_alloc_skb(vinfo->netdev, packet->tlen); + if (unlikely(!skb)) { + vinfo->stats[q_idx].netstats.rx_fifo_errors++; + return; + } + + memcpy(skb->data, packet->ebuf, packet->tlen); + skb_put(skb, packet->tlen); + skb_queue_tail(&rxq->skbq, skb); + + if (napi_schedule_prep(&rxq->napi)) { + v_dbg("napi %d scheduling\n", q_idx); + __napi_schedule(&rxq->napi); + } +} + +static int hfi1_vnic_up(struct hfi1_vnic_vport_info *vinfo) +{ + struct hfi1_devdata *dd = vinfo->dd; + struct net_device *netdev = vinfo->netdev; + int i, rc; + + /* ensure virtual eth switch id is valid */ + if (!vinfo->vesw_id) + return -EINVAL; + + rc = idr_alloc(&dd->vnic.vesw_idr, vinfo, vinfo->vesw_id, + vinfo->vesw_id + 1, GFP_NOWAIT); + if (rc < 0) + return rc; + + for (i = 0; i < vinfo->num_rx_q; i++) { + struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i]; + + skb_queue_head_init(&rxq->skbq); + napi_enable(&rxq->napi); + } + + netif_carrier_on(netdev); + netif_tx_start_all_queues(netdev); + set_bit(HFI1_VNIC_UP, &vinfo->flags); + + return 0; +} + +static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo) +{ + struct hfi1_devdata *dd = vinfo->dd; + u8 i; + + clear_bit(HFI1_VNIC_UP, &vinfo->flags); + netif_carrier_off(vinfo->netdev); + netif_tx_disable(vinfo->netdev); + idr_remove(&dd->vnic.vesw_idr, vinfo->vesw_id); + + /* ensure irqs see the change */ + hfi1_vnic_synchronize_irq(dd); + + /* remove unread skbs */ + for (i = 0; i < vinfo->num_rx_q; i++) { + struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i]; + + napi_disable(&rxq->napi); + skb_queue_purge(&rxq->skbq); + } +} + +static int hfi1_netdev_open(struct net_device *netdev) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + int rc; + + mutex_lock(&vinfo->lock); + rc = hfi1_vnic_up(vinfo); + mutex_unlock(&vinfo->lock); + return rc; +} + +static int hfi1_netdev_close(struct net_device *netdev) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + + mutex_lock(&vinfo->lock); + if (test_bit(HFI1_VNIC_UP, &vinfo->flags)) + 
hfi1_vnic_down(vinfo); + mutex_unlock(&vinfo->lock); + return 0; +} + +static int hfi1_vnic_allot_ctxt(struct hfi1_devdata *dd, + struct hfi1_ctxtdata **vnic_ctxt) +{ + int rc; + + rc = allocate_vnic_ctxt(dd, vnic_ctxt); + if (rc) { + dd_dev_err(dd, "vnic ctxt alloc failed %d\n", rc); + return rc; + } + + rc = setup_vnic_ctxt(dd, *vnic_ctxt); + if (rc) { + dd_dev_err(dd, "vnic ctxt setup failed %d\n", rc); + deallocate_vnic_ctxt(dd, *vnic_ctxt); + *vnic_ctxt = NULL; + } + + return rc; +} + +static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo) +{ + struct hfi1_devdata *dd = vinfo->dd; + int i, rc = 0; + + mutex_lock(&hfi1_mutex); + if (!dd->vnic.num_vports) { + rc = hfi1_vnic_txreq_init(dd); + if (rc) + goto txreq_fail; + + dd->vnic.msix_idx = dd->first_dyn_msix_idx; + } + + for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) { + rc = hfi1_vnic_allot_ctxt(dd, &dd->vnic.ctxt[i]); + if (rc) + break; + hfi1_rcd_get(dd->vnic.ctxt[i]); + dd->vnic.ctxt[i]->vnic_q_idx = i; + } + + if (i < vinfo->num_rx_q) { + /* + * If required amount of contexts is not + * allocated successfully then remaining contexts + * are released. + */ + while (i-- > dd->vnic.num_ctxt) { + deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]); + hfi1_rcd_put(dd->vnic.ctxt[i]); + dd->vnic.ctxt[i] = NULL; + } + goto alloc_fail; + } + + if (dd->vnic.num_ctxt != i) { + dd->vnic.num_ctxt = i; + hfi1_init_vnic_rsm(dd); + } + + dd->vnic.num_vports++; + hfi1_vnic_sdma_init(vinfo); +alloc_fail: + if (!dd->vnic.num_vports) + hfi1_vnic_txreq_deinit(dd); +txreq_fail: + mutex_unlock(&hfi1_mutex); + return rc; +} + +static void hfi1_vnic_deinit(struct hfi1_vnic_vport_info *vinfo) +{ + struct hfi1_devdata *dd = vinfo->dd; + int i; + + mutex_lock(&hfi1_mutex); + if (--dd->vnic.num_vports == 0) { + for (i = 0; i < dd->vnic.num_ctxt; i++) { + deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]); + hfi1_rcd_put(dd->vnic.ctxt[i]); + dd->vnic.ctxt[i] = NULL; + } + hfi1_deinit_vnic_rsm(dd); + dd->vnic.num_ctxt = 0; + hfi1_vnic_txreq_deinit(dd); + } + mutex_unlock(&hfi1_mutex); +} + +static void hfi1_vnic_set_vesw_id(struct net_device *netdev, int id) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + bool reopen = false; + + /* + * If vesw_id is being changed, and if the vnic port is up, + * reset the vnic port to ensure new vesw_id gets picked up + */ + if (id != vinfo->vesw_id) { + mutex_lock(&vinfo->lock); + if (test_bit(HFI1_VNIC_UP, &vinfo->flags)) { + hfi1_vnic_down(vinfo); + reopen = true; + } + + vinfo->vesw_id = id; + if (reopen) + hfi1_vnic_up(vinfo); + + mutex_unlock(&vinfo->lock); + } +} + +/* netdev ops */ +static const struct net_device_ops hfi1_netdev_ops = { + .ndo_open = hfi1_netdev_open, + .ndo_stop = hfi1_netdev_close, + .ndo_start_xmit = hfi1_netdev_start_xmit, + .ndo_select_queue = hfi1_vnic_select_queue, + .ndo_get_stats64 = hfi1_vnic_get_stats64, +}; + +static void hfi1_vnic_free_rn(struct net_device *netdev) +{ + struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); + + hfi1_vnic_deinit(vinfo); + mutex_destroy(&vinfo->lock); + free_netdev(netdev); +} + +struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, + u8 port_num, + enum rdma_netdev_t type, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)) +{ + struct hfi1_devdata *dd = dd_from_ibdev(device); + struct hfi1_vnic_vport_info *vinfo; + struct net_device *netdev; + struct rdma_netdev *rn; + int i, size, rc; + + if (!dd->num_vnic_contexts) + return ERR_PTR(-ENOMEM); + + if (!port_num || (port_num > 
dd->num_pports)) + return ERR_PTR(-EINVAL); + + if (type != RDMA_NETDEV_OPA_VNIC) + return ERR_PTR(-EOPNOTSUPP); + + size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo); + netdev = alloc_netdev_mqs(size, name, name_assign_type, setup, + dd->chip_sdma_engines, dd->num_vnic_contexts); + if (!netdev) + return ERR_PTR(-ENOMEM); + + rn = netdev_priv(netdev); + vinfo = opa_vnic_dev_priv(netdev); + vinfo->dd = dd; + vinfo->num_tx_q = dd->chip_sdma_engines; + vinfo->num_rx_q = dd->num_vnic_contexts; + vinfo->netdev = netdev; + rn->free_rdma_netdev = hfi1_vnic_free_rn; + rn->set_id = hfi1_vnic_set_vesw_id; + + netdev->features = NETIF_F_HIGHDMA | NETIF_F_SG; + netdev->hw_features = netdev->features; + netdev->vlan_features = netdev->features; + netdev->watchdog_timeo = msecs_to_jiffies(HFI_TX_TIMEOUT_MS); + netdev->netdev_ops = &hfi1_netdev_ops; + mutex_init(&vinfo->lock); + + for (i = 0; i < vinfo->num_rx_q; i++) { + struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i]; + + rxq->idx = i; + rxq->vinfo = vinfo; + rxq->netdev = netdev; + netif_napi_add(netdev, &rxq->napi, hfi1_vnic_napi, 64); + } + + rc = hfi1_vnic_init(vinfo); + if (rc) + goto init_fail; + + return netdev; +init_fail: + mutex_destroy(&vinfo->lock); + free_netdev(netdev); + return ERR_PTR(rc); +} diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c new file mode 100644 index 0000000..c3c96c5 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -0,0 +1,324 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains HFI1 support for VNIC SDMA functionality + */ + +#include "sdma.h" +#include "vnic.h" + +#define HFI1_VNIC_SDMA_Q_ACTIVE BIT(0) +#define HFI1_VNIC_SDMA_Q_DEFERRED BIT(1) + +#define HFI1_VNIC_TXREQ_NAME_LEN 32 +#define HFI1_VNIC_SDMA_DESC_WTRMRK 64 +#define HFI1_VNIC_SDMA_RETRY_COUNT 1 + +/* + * struct vnic_txreq - VNIC transmit descriptor + * @txreq: sdma transmit request + * @sdma: vnic sdma pointer + * @skb: skb to send + * @pad: pad buffer + * @plen: pad length + * @pbc_val: pbc value + * @retry_count: tx retry count + */ +struct vnic_txreq { + struct sdma_txreq txreq; + struct hfi1_vnic_sdma *sdma; + + struct sk_buff *skb; + unsigned char pad[HFI1_VNIC_MAX_PAD]; + u16 plen; + __le64 pbc_val; + + u32 retry_count; +}; + +static void vnic_sdma_complete(struct sdma_txreq *txreq, + int status) +{ + struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq); + struct hfi1_vnic_sdma *vnic_sdma = tx->sdma; + + sdma_txclean(vnic_sdma->dd, txreq); + dev_kfree_skb_any(tx->skb); + kmem_cache_free(vnic_sdma->dd->vnic.txreq_cache, tx); +} + +static noinline int build_vnic_ulp_payload(struct sdma_engine *sde, + struct vnic_txreq *tx) +{ + int i, ret = 0; + + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + tx->skb->data, + skb_headlen(tx->skb)); + if (unlikely(ret)) + goto bail_txadd; + + for (i = 0; i < skb_shinfo(tx->skb)->nr_frags; i++) { + struct skb_frag_struct *frag = &skb_shinfo(tx->skb)->frags[i]; + + /* combine physically continuous fragments later? 
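(for now each fragment gets its own descriptor)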
*/ + ret = sdma_txadd_page(sde->dd, + &tx->txreq, + skb_frag_page(frag), + frag->page_offset, + skb_frag_size(frag)); + if (unlikely(ret)) + goto bail_txadd; + } + + if (tx->plen) + ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq, + tx->pad + HFI1_VNIC_MAX_PAD - tx->plen, + tx->plen); + +bail_txadd: + return ret; +} + +static int build_vnic_tx_desc(struct sdma_engine *sde, + struct vnic_txreq *tx, + u64 pbc) +{ + int ret = 0; + u16 hdrbytes = 2 << 2; /* PBC */ + + ret = sdma_txinit_ahg( + &tx->txreq, + 0, + hdrbytes + tx->skb->len + tx->plen, + 0, + 0, + NULL, + 0, + vnic_sdma_complete); + if (unlikely(ret)) + goto bail_txadd; + + /* add pbc */ + tx->pbc_val = cpu_to_le64(pbc); + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + &tx->pbc_val, + hdrbytes); + if (unlikely(ret)) + goto bail_txadd; + + /* add the ulp payload */ + ret = build_vnic_ulp_payload(sde, tx); +bail_txadd: + return ret; +} + +/* setup the last plen bypes of pad */ +static inline void hfi1_vnic_update_pad(unsigned char *pad, u8 plen) +{ + pad[HFI1_VNIC_MAX_PAD - 1] = plen - OPA_VNIC_ICRC_TAIL_LEN; +} + +int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, + struct hfi1_vnic_vport_info *vinfo, + struct sk_buff *skb, u64 pbc, u8 plen) +{ + struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[q_idx]; + struct sdma_engine *sde = vnic_sdma->sde; + struct vnic_txreq *tx; + int ret = -ECOMM; + + if (unlikely(READ_ONCE(vnic_sdma->state) != HFI1_VNIC_SDMA_Q_ACTIVE)) + goto tx_err; + + if (unlikely(!sde || !sdma_running(sde))) + goto tx_err; + + tx = kmem_cache_alloc(dd->vnic.txreq_cache, GFP_ATOMIC); + if (unlikely(!tx)) { + ret = -ENOMEM; + goto tx_err; + } + + tx->sdma = vnic_sdma; + tx->skb = skb; + hfi1_vnic_update_pad(tx->pad, plen); + tx->plen = plen; + ret = build_vnic_tx_desc(sde, tx, pbc); + if (unlikely(ret)) + goto free_desc; + tx->retry_count = 0; + + ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq, + vnic_sdma->pkts_sent); + /* When -ECOMM, sdma callback will be called with ABORT status */ + if (unlikely(ret && unlikely(ret != -ECOMM))) + goto free_desc; + + if (!ret) { + vnic_sdma->pkts_sent = true; + iowait_starve_clear(vnic_sdma->pkts_sent, &vnic_sdma->wait); + } + return ret; + +free_desc: + sdma_txclean(dd, &tx->txreq); + kmem_cache_free(dd->vnic.txreq_cache, tx); +tx_err: + if (ret != -EBUSY) + dev_kfree_skb_any(skb); + else + vnic_sdma->pkts_sent = false; + return ret; +} + +/* + * hfi1_vnic_sdma_sleep - vnic sdma sleep function + * + * This function gets called from sdma_send_txreq() when there are not enough + * sdma descriptors available to send the packet. It adds Tx queue's wait + * structure to sdma engine's dmawait list to be woken up when descriptors + * become available. 
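+ * Returns -EAGAIN while the engine is still making progress and retries remain, otherwise marks the queue deferred and returns -EBUSY.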
+ */ +static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *txreq, + uint seq, + bool pkts_sent) +{ + struct hfi1_vnic_sdma *vnic_sdma = + container_of(wait, struct hfi1_vnic_sdma, wait); + struct hfi1_ibdev *dev = &vnic_sdma->dd->verbs_dev; + struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq); + + if (sdma_progress(sde, seq, txreq)) + if (tx->retry_count++ < HFI1_VNIC_SDMA_RETRY_COUNT) + return -EAGAIN; + + vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; + write_seqlock(&dev->iowait_lock); + if (list_empty(&vnic_sdma->wait.list)) + iowait_queue(pkts_sent, wait, &sde->dmawait); + write_sequnlock(&dev->iowait_lock); + return -EBUSY; +} + +/* + * hfi1_vnic_sdma_wakeup - vnic sdma wakeup function + * + * This function gets called when SDMA descriptors becomes available and Tx + * queue's wait structure was previously added to sdma engine's dmawait list. + * It notifies the upper driver about Tx queue wakeup. + */ +static void hfi1_vnic_sdma_wakeup(struct iowait *wait, int reason) +{ + struct hfi1_vnic_sdma *vnic_sdma = + container_of(wait, struct hfi1_vnic_sdma, wait); + struct hfi1_vnic_vport_info *vinfo = vnic_sdma->vinfo; + + vnic_sdma->state = HFI1_VNIC_SDMA_Q_ACTIVE; + if (__netif_subqueue_stopped(vinfo->netdev, vnic_sdma->q_idx)) + netif_wake_subqueue(vinfo->netdev, vnic_sdma->q_idx); +}; + +inline bool hfi1_vnic_sdma_write_avail(struct hfi1_vnic_vport_info *vinfo, + u8 q_idx) +{ + struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[q_idx]; + + return (READ_ONCE(vnic_sdma->state) == HFI1_VNIC_SDMA_Q_ACTIVE); +} + +void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) +{ + int i; + + for (i = 0; i < vinfo->num_tx_q; i++) { + struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i]; + + iowait_init(&vnic_sdma->wait, 0, NULL, hfi1_vnic_sdma_sleep, + hfi1_vnic_sdma_wakeup, NULL); + vnic_sdma->sde = &vinfo->dd->per_sdma[i]; + vnic_sdma->dd = vinfo->dd; + vnic_sdma->vinfo = vinfo; + vnic_sdma->q_idx = i; + vnic_sdma->state = HFI1_VNIC_SDMA_Q_ACTIVE; + + /* Add a free descriptor watermark for wakeups */ + if (vnic_sdma->sde->descq_cnt > HFI1_VNIC_SDMA_DESC_WTRMRK) { + INIT_LIST_HEAD(&vnic_sdma->stx.list); + vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK; + list_add_tail(&vnic_sdma->stx.list, + &vnic_sdma->wait.tx_head); + } + } +} + +int hfi1_vnic_txreq_init(struct hfi1_devdata *dd) +{ + char buf[HFI1_VNIC_TXREQ_NAME_LEN]; + + snprintf(buf, sizeof(buf), "hfi1_%u_vnic_txreq_cache", dd->unit); + dd->vnic.txreq_cache = kmem_cache_create(buf, + sizeof(struct vnic_txreq), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!dd->vnic.txreq_cache) + return -ENOMEM; + return 0; +} + +void hfi1_vnic_txreq_deinit(struct hfi1_devdata *dd) +{ + kmem_cache_destroy(dd->vnic.txreq_cache); + dd->vnic.txreq_cache = NULL; +} diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig new file mode 100644 index 0000000..fddb5fd --- /dev/null +++ b/drivers/infiniband/hw/hns/Kconfig @@ -0,0 +1,31 @@ +config INFINIBAND_HNS + tristate "HNS RoCE Driver" + depends on NET_VENDOR_HISILICON + depends on ARM64 || (COMPILE_TEST && 64BIT) + ---help--- + This is a RoCE/RDMA driver for the Hisilicon RoCE engine. The engine + is used in Hisilicon Hip06 and more further ICT SoC based on + platform device. + + To compile this driver as a module, choose M here: the module + will be called hns-roce. 
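+ +# The hardware-specific backends below depend on the common INFINIBAND_HNS option above.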
+ +config INFINIBAND_HNS_HIP06 + tristate "Hisilicon Hip06 Family RoCE support" + depends on INFINIBAND_HNS && HNS && HNS_DSAF && HNS_ENET + ---help--- + RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip06 and + Hip07 SoC. These RoCE engines are platform devices. + + To compile this driver as a module, choose M here: the module + will be called hns-roce-hw-v1. + +config INFINIBAND_HNS_HIP08 + tristate "Hisilicon Hip08 Family RoCE support" + depends on INFINIBAND_HNS && PCI && HNS3 + ---help--- + RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip08 SoC. + The RoCE engine is a PCI device. + + To compile this driver as a module, choose M here: the module + will be called hns-roce-hw-v2. diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile new file mode 100644 index 0000000..cf03404 --- /dev/null +++ b/drivers/infiniband/hw/hns/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for the Hisilicon RoCE drivers. +# + +ccflags-y := -Idrivers/net/ethernet/hisilicon/hns3 + +obj-$(CONFIG_INFINIBAND_HNS) += hns-roce.o +hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ + hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ + hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o +obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o +hns-roce-hw-v1-objs := hns_roce_hw_v1.o +obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o +hns-roce-hw-v2-objs := hns_roce_hw_v2.o diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c new file mode 100644 index 0000000..d749286 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "hns_roce_device.h" + +#define HNS_ROCE_PORT_NUM_SHIFT 24 +#define HNS_ROCE_VLAN_SL_BIT_MASK 7 +#define HNS_ROCE_VLAN_SL_SHIFT 13 + +struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibpd->device); + struct device *dev = hr_dev->dev; + struct ib_gid_attr gid_attr; + struct hns_roce_ah *ah; + u16 vlan_tag = 0xffff; + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + union ib_gid sgid; + int ret; + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + /* Get mac address */ + memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN); + + /* Get source gid */ + ret = ib_get_cached_gid(ibpd->device, rdma_ah_get_port_num(ah_attr), + grh->sgid_index, &sgid, &gid_attr); + if (ret) { + dev_err(dev, "get sgid failed! ret = %d\n", ret); + kfree(ah); + return ERR_PTR(ret); + } + + if (is_vlan_dev(gid_attr.ndev)) + vlan_tag = vlan_dev_vlan_id(gid_attr.ndev); + dev_put(gid_attr.ndev); + + if (vlan_tag < 0x1000) + vlan_tag |= (rdma_ah_get_sl(ah_attr) & + HNS_ROCE_VLAN_SL_BIT_MASK) << + HNS_ROCE_VLAN_SL_SHIFT; + + ah->av.port_pd = cpu_to_be32(to_hr_pd(ibpd)->pdn | + (rdma_ah_get_port_num(ah_attr) << + HNS_ROCE_PORT_NUM_SHIFT)); + ah->av.gid_index = grh->sgid_index; + ah->av.vlan = cpu_to_le16(vlan_tag); + dev_dbg(dev, "gid_index = 0x%x,vlan = 0x%x\n", ah->av.gid_index, + ah->av.vlan); + + if (rdma_ah_get_static_rate(ah_attr)) + ah->av.stat_rate = IB_RATE_10_GBPS; + + memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE); + ah->av.sl_tclass_flowlabel = cpu_to_le32(rdma_ah_get_sl(ah_attr) << + HNS_ROCE_SL_SHIFT); + + return &ah->ibah; +} + +int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct hns_roce_ah *ah = to_hr_ah(ibah); + + memset(ah_attr, 0, sizeof(*ah_attr)); + + rdma_ah_set_sl(ah_attr, (le32_to_cpu(ah->av.sl_tclass_flowlabel) >> + HNS_ROCE_SL_SHIFT)); + rdma_ah_set_port_num(ah_attr, (le32_to_cpu(ah->av.port_pd) >> + HNS_ROCE_PORT_NUM_SHIFT)); + rdma_ah_set_static_rate(ah_attr, ah->av.stat_rate); + rdma_ah_set_grh(ah_attr, NULL, + (le32_to_cpu(ah->av.sl_tclass_flowlabel) & + HNS_ROCE_FLOW_LABLE_MASK), ah->av.gid_index, + ah->av.hop_limit, + (le32_to_cpu(ah->av.sl_tclass_flowlabel) >> + HNS_ROCE_TCLASS_SHIFT)); + rdma_ah_set_dgid_raw(ah_attr, ah->av.dgid); + + return 0; +} + +int hns_roce_destroy_ah(struct ib_ah *ah) +{ + kfree(to_hr_ah(ah)); + + return 0; +} diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c new file mode 100644 index 0000000..a40ec93 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "hns_roce_device.h" + +int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj) +{ + int ret = 0; + + spin_lock(&bitmap->lock); + *obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last); + if (*obj >= bitmap->max) { + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) + & bitmap->mask; + *obj = find_first_zero_bit(bitmap->table, bitmap->max); + } + + if (*obj < bitmap->max) { + set_bit(*obj, bitmap->table); + bitmap->last = (*obj + 1); + if (bitmap->last == bitmap->max) + bitmap->last = 0; + *obj |= bitmap->top; + } else { + ret = -1; + } + + spin_unlock(&bitmap->lock); + + return ret; +} + +void hns_roce_bitmap_free(struct hns_roce_bitmap *bitmap, unsigned long obj, + int rr) +{ + hns_roce_bitmap_free_range(bitmap, obj, 1, rr); +} +EXPORT_SYMBOL_GPL(hns_roce_bitmap_free); + +int hns_roce_bitmap_alloc_range(struct hns_roce_bitmap *bitmap, int cnt, + int align, unsigned long *obj) +{ + int ret = 0; + int i; + + if (likely(cnt == 1 && align == 1)) + return hns_roce_bitmap_alloc(bitmap, obj); + + spin_lock(&bitmap->lock); + + *obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max, + bitmap->last, cnt, align - 1); + if (*obj >= bitmap->max) { + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) + & bitmap->mask; + *obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max, 0, + cnt, align - 1); + } + + if (*obj < bitmap->max) { + for (i = 0; i < cnt; i++) + set_bit(*obj + i, bitmap->table); + + if (*obj == bitmap->last) { + bitmap->last = (*obj + cnt); + if (bitmap->last >= bitmap->max) + bitmap->last = 0; + } + *obj |= bitmap->top; + } else { + ret = -1; + } + + spin_unlock(&bitmap->lock); + + return ret; +} + +void hns_roce_bitmap_free_range(struct hns_roce_bitmap *bitmap, + unsigned long obj, int cnt, + int rr) +{ + int i; + + obj &= bitmap->max + bitmap->reserved_top - 1; + + spin_lock(&bitmap->lock); + for (i = 0; i < cnt; i++) + clear_bit(obj + i, bitmap->table); + + if (!rr) + bitmap->last = min(bitmap->last, obj); + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) + & bitmap->mask; + spin_unlock(&bitmap->lock); +} + +int hns_roce_bitmap_init(struct hns_roce_bitmap *bitmap, u32 num, u32 mask, + u32 reserved_bot, u32 reserved_top) +{ + u32 i; + + if (num != roundup_pow_of_two(num)) + return -EINVAL; + + bitmap->last = 0; + bitmap->top = 0; + bitmap->max = num - reserved_top; + bitmap->mask = mask; + bitmap->reserved_top = reserved_top; + spin_lock_init(&bitmap->lock); + bitmap->table = kcalloc(BITS_TO_LONGS(bitmap->max), sizeof(long), + GFP_KERNEL); + if (!bitmap->table) + return -ENOMEM; + + for (i = 0; i < reserved_bot; ++i) + set_bit(i, bitmap->table); + + return 0; +} + +void hns_roce_bitmap_cleanup(struct hns_roce_bitmap *bitmap) +{ + 
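+ /* free the table allocated by hns_roce_bitmap_init() */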
kfree(bitmap->table); +} + +void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size, + struct hns_roce_buf *buf) +{ + int i; + struct device *dev = hr_dev->dev; + + if (buf->nbufs == 1) { + dma_free_coherent(dev, size, buf->direct.buf, buf->direct.map); + } else { + for (i = 0; i < buf->nbufs; ++i) + if (buf->page_list[i].buf) + dma_free_coherent(dev, 1 << buf->page_shift, + buf->page_list[i].buf, + buf->page_list[i].map); + kfree(buf->page_list); + } +} +EXPORT_SYMBOL_GPL(hns_roce_buf_free); + +int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, + struct hns_roce_buf *buf, u32 page_shift) +{ + int i = 0; + dma_addr_t t; + struct device *dev = hr_dev->dev; + u32 page_size = 1 << page_shift; + u32 order; + + /* SQ/RQ buf lease than one page, SQ + RQ = 8K */ + if (size <= max_direct) { + buf->nbufs = 1; + /* Npages calculated by page_size */ + order = get_order(size); + if (order <= page_shift - PAGE_SHIFT) + order = 0; + else + order -= page_shift - PAGE_SHIFT; + buf->npages = 1 << order; + buf->page_shift = page_shift; + /* MTT PA must be recorded in 4k alignment, t is 4k aligned */ + buf->direct.buf = dma_alloc_coherent(dev, size, &t, GFP_KERNEL); + if (!buf->direct.buf) + return -ENOMEM; + + buf->direct.map = t; + + while (t & ((1 << buf->page_shift) - 1)) { + --buf->page_shift; + buf->npages *= 2; + } + + memset(buf->direct.buf, 0, size); + } else { + buf->nbufs = (size + page_size - 1) / page_size; + buf->npages = buf->nbufs; + buf->page_shift = page_shift; + buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), + GFP_KERNEL); + + if (!buf->page_list) + return -ENOMEM; + + for (i = 0; i < buf->nbufs; ++i) { + buf->page_list[i].buf = dma_alloc_coherent(dev, + page_size, &t, + GFP_KERNEL); + + if (!buf->page_list[i].buf) + goto err_free; + + buf->page_list[i].map = t; + memset(buf->page_list[i].buf, 0, page_size); + } + } + + return 0; + +err_free: + hns_roce_buf_free(hr_dev, size, buf); + return -ENOMEM; +} + +void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev) +{ + hns_roce_cleanup_qp_table(hr_dev); + hns_roce_cleanup_cq_table(hr_dev); + hns_roce_cleanup_mr_table(hr_dev); + hns_roce_cleanup_pd_table(hr_dev); + hns_roce_cleanup_uar_table(hr_dev); +} diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c new file mode 100644 index 0000000..9ebe839 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "hns_roce_common.h" +#include "hns_roce_device.h" +#include "hns_roce_cmd.h" + +#define CMD_POLL_TOKEN 0xffff +#define CMD_MAX_NUM 32 +#define CMD_TOKEN_MASK 0x1f + +static int hns_roce_cmd_mbox_post_hw(struct hns_roce_dev *hr_dev, u64 in_param, + u64 out_param, u32 in_modifier, + u8 op_modifier, u16 op, u16 token, + int event) +{ + struct hns_roce_cmdq *cmd = &hr_dev->cmd; + int ret; + + mutex_lock(&cmd->hcr_mutex); + ret = hr_dev->hw->post_mbox(hr_dev, in_param, out_param, in_modifier, + op_modifier, op, token, event); + mutex_unlock(&cmd->hcr_mutex); + + return ret; +} + +/* this should be called with "poll_sem" */ +static int __hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, + u64 out_param, unsigned long in_modifier, + u8 op_modifier, u16 op, + unsigned long timeout) +{ + struct device *dev = hr_dev->dev; + int ret; + + ret = hns_roce_cmd_mbox_post_hw(hr_dev, in_param, out_param, + in_modifier, op_modifier, op, + CMD_POLL_TOKEN, 0); + if (ret) { + dev_err(dev, "[cmd_poll]hns_roce_cmd_mbox_post_hw failed\n"); + return ret; + } + + return hr_dev->hw->chk_mbox(hr_dev, timeout); +} + +static int hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, + u64 out_param, unsigned long in_modifier, + u8 op_modifier, u16 op, unsigned long timeout) +{ + int ret; + + down(&hr_dev->cmd.poll_sem); + ret = __hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param, in_modifier, + op_modifier, op, timeout); + up(&hr_dev->cmd.poll_sem); + + return ret; +} + +void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status, + u64 out_param) +{ + struct hns_roce_cmd_context + *context = &hr_dev->cmd.context[token & hr_dev->cmd.token_mask]; + + if (token != context->token) + return; + + context->result = (status == HNS_ROCE_CMD_SUCCESS) ? 
0 : (-EIO); + context->out_param = out_param; + complete(&context->done); +} +EXPORT_SYMBOL_GPL(hns_roce_cmd_event); + +/* this should be called with "use_events" */ +static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, + u64 out_param, unsigned long in_modifier, + u8 op_modifier, u16 op, + unsigned long timeout) +{ + struct hns_roce_cmdq *cmd = &hr_dev->cmd; + struct hns_roce_cmd_context *context; + struct device *dev = hr_dev->dev; + int ret; + + spin_lock(&cmd->context_lock); + WARN_ON(cmd->free_head < 0); + context = &cmd->context[cmd->free_head]; + context->token += cmd->token_mask + 1; + cmd->free_head = context->next; + spin_unlock(&cmd->context_lock); + + init_completion(&context->done); + + ret = hns_roce_cmd_mbox_post_hw(hr_dev, in_param, out_param, + in_modifier, op_modifier, op, + context->token, 1); + if (ret) + goto out; + + /* + * It is timeout when wait_for_completion_timeout return 0 + * The return value is the time limit set in advance + * how many seconds showing + */ + if (!wait_for_completion_timeout(&context->done, + msecs_to_jiffies(timeout))) { + dev_err(dev, "[cmd]wait_for_completion_timeout timeout\n"); + ret = -EBUSY; + goto out; + } + + ret = context->result; + if (ret) { + dev_err(dev, "[cmd]event mod cmd process error!err=%d\n", ret); + goto out; + } + +out: + spin_lock(&cmd->context_lock); + context->next = cmd->free_head; + cmd->free_head = context - cmd->context; + spin_unlock(&cmd->context_lock); + + return ret; +} + +static int hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, + u64 out_param, unsigned long in_modifier, + u8 op_modifier, u16 op, unsigned long timeout) +{ + int ret = 0; + + down(&hr_dev->cmd.event_sem); + ret = __hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, + in_modifier, op_modifier, op, timeout); + up(&hr_dev->cmd.event_sem); + + return ret; +} + +int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, + unsigned long in_modifier, u8 op_modifier, u16 op, + unsigned long timeout) +{ + if (hr_dev->cmd.use_events) + return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, + in_modifier, op_modifier, op, + timeout); + else + return hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param, + in_modifier, op_modifier, op, + timeout); +} +EXPORT_SYMBOL_GPL(hns_roce_cmd_mbox); + +int hns_roce_cmd_init(struct hns_roce_dev *hr_dev) +{ + struct device *dev = hr_dev->dev; + + mutex_init(&hr_dev->cmd.hcr_mutex); + sema_init(&hr_dev->cmd.poll_sem, 1); + hr_dev->cmd.use_events = 0; + hr_dev->cmd.toggle = 1; + hr_dev->cmd.max_cmds = CMD_MAX_NUM; + hr_dev->cmd.pool = dma_pool_create("hns_roce_cmd", dev, + HNS_ROCE_MAILBOX_SIZE, + HNS_ROCE_MAILBOX_SIZE, 0); + if (!hr_dev->cmd.pool) + return -ENOMEM; + + return 0; +} + +void hns_roce_cmd_cleanup(struct hns_roce_dev *hr_dev) +{ + dma_pool_destroy(hr_dev->cmd.pool); +} + +int hns_roce_cmd_use_events(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_cmdq *hr_cmd = &hr_dev->cmd; + int i; + + hr_cmd->context = kmalloc_array(hr_cmd->max_cmds, + sizeof(*hr_cmd->context), + GFP_KERNEL); + if (!hr_cmd->context) + return -ENOMEM; + + for (i = 0; i < hr_cmd->max_cmds; ++i) { + hr_cmd->context[i].token = i; + hr_cmd->context[i].next = i + 1; + } + + hr_cmd->context[hr_cmd->max_cmds - 1].next = -1; + hr_cmd->free_head = 0; + + sema_init(&hr_cmd->event_sem, hr_cmd->max_cmds); + spin_lock_init(&hr_cmd->context_lock); + + hr_cmd->token_mask = CMD_TOKEN_MASK; + hr_cmd->use_events = 1; + + down(&hr_cmd->poll_sem); + + return 0; +} + +void 
hns_roce_cmd_use_polling(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_cmdq *hr_cmd = &hr_dev->cmd; + int i; + + hr_cmd->use_events = 0; + + for (i = 0; i < hr_cmd->max_cmds; ++i) + down(&hr_cmd->event_sem); + + kfree(hr_cmd->context); + up(&hr_cmd->poll_sem); +} + +struct hns_roce_cmd_mailbox + *hns_roce_alloc_cmd_mailbox(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_cmd_mailbox *mailbox; + + mailbox = kmalloc(sizeof(*mailbox), GFP_KERNEL); + if (!mailbox) + return ERR_PTR(-ENOMEM); + + mailbox->buf = dma_pool_alloc(hr_dev->cmd.pool, GFP_KERNEL, + &mailbox->dma); + if (!mailbox->buf) { + kfree(mailbox); + return ERR_PTR(-ENOMEM); + } + + return mailbox; +} +EXPORT_SYMBOL_GPL(hns_roce_alloc_cmd_mailbox); + +void hns_roce_free_cmd_mailbox(struct hns_roce_dev *hr_dev, + struct hns_roce_cmd_mailbox *mailbox) +{ + if (!mailbox) + return; + + dma_pool_free(hr_dev->cmd.pool, mailbox->buf, mailbox->dma); + kfree(mailbox); +} +EXPORT_SYMBOL_GPL(hns_roce_free_cmd_mailbox); diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h new file mode 100644 index 0000000..9549ae5 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _HNS_ROCE_CMD_H +#define _HNS_ROCE_CMD_H + +#define HNS_ROCE_MAILBOX_SIZE 4096 +#define HNS_ROCE_CMD_TIMEOUT_MSECS 10000 + +enum { + /* QPC BT commands */ + HNS_ROCE_CMD_WRITE_QPC_BT0 = 0x0, + HNS_ROCE_CMD_WRITE_QPC_BT1 = 0x1, + HNS_ROCE_CMD_WRITE_QPC_BT2 = 0x2, + HNS_ROCE_CMD_READ_QPC_BT0 = 0x4, + HNS_ROCE_CMD_READ_QPC_BT1 = 0x5, + HNS_ROCE_CMD_READ_QPC_BT2 = 0x6, + HNS_ROCE_CMD_DESTROY_QPC_BT0 = 0x8, + HNS_ROCE_CMD_DESTROY_QPC_BT1 = 0x9, + HNS_ROCE_CMD_DESTROY_QPC_BT2 = 0xa, + + /* QPC operation */ + HNS_ROCE_CMD_MODIFY_QPC = 0x41, + HNS_ROCE_CMD_QUERY_QPC = 0x42, + + HNS_ROCE_CMD_MODIFY_CQC = 0x52, + /* CQC BT commands */ + HNS_ROCE_CMD_WRITE_CQC_BT0 = 0x10, + HNS_ROCE_CMD_WRITE_CQC_BT1 = 0x11, + HNS_ROCE_CMD_WRITE_CQC_BT2 = 0x12, + HNS_ROCE_CMD_READ_CQC_BT0 = 0x14, + HNS_ROCE_CMD_READ_CQC_BT1 = 0x15, + HNS_ROCE_CMD_READ_CQC_BT2 = 0x1b, + HNS_ROCE_CMD_DESTROY_CQC_BT0 = 0x18, + HNS_ROCE_CMD_DESTROY_CQC_BT1 = 0x19, + HNS_ROCE_CMD_DESTROY_CQC_BT2 = 0x1a, + + /* MPT BT commands */ + HNS_ROCE_CMD_WRITE_MPT_BT0 = 0x20, + HNS_ROCE_CMD_WRITE_MPT_BT1 = 0x21, + HNS_ROCE_CMD_WRITE_MPT_BT2 = 0x22, + HNS_ROCE_CMD_READ_MPT_BT0 = 0x24, + HNS_ROCE_CMD_READ_MPT_BT1 = 0x25, + HNS_ROCE_CMD_READ_MPT_BT2 = 0x26, + HNS_ROCE_CMD_DESTROY_MPT_BT0 = 0x28, + HNS_ROCE_CMD_DESTROY_MPT_BT1 = 0x29, + HNS_ROCE_CMD_DESTROY_MPT_BT2 = 0x2a, + + /* MPT commands */ + HNS_ROCE_CMD_QUERY_MPT = 0x62, + + /* SRQC BT commands */ + HNS_ROCE_CMD_WRITE_SRQC_BT0 = 0x30, + HNS_ROCE_CMD_WRITE_SRQC_BT1 = 0x31, + HNS_ROCE_CMD_WRITE_SRQC_BT2 = 0x32, + HNS_ROCE_CMD_READ_SRQC_BT0 = 0x34, + HNS_ROCE_CMD_READ_SRQC_BT1 = 0x35, + HNS_ROCE_CMD_READ_SRQC_BT2 = 0x36, + HNS_ROCE_CMD_DESTROY_SRQC_BT0 = 0x38, + HNS_ROCE_CMD_DESTROY_SRQC_BT1 = 0x39, + HNS_ROCE_CMD_DESTROY_SRQC_BT2 = 0x3a, + + /* EQC commands */ + HNS_ROCE_CMD_CREATE_AEQC = 0x80, + HNS_ROCE_CMD_MODIFY_AEQC = 0x81, + HNS_ROCE_CMD_QUERY_AEQC = 0x82, + HNS_ROCE_CMD_DESTROY_AEQC = 0x83, + HNS_ROCE_CMD_CREATE_CEQC = 0x90, + HNS_ROCE_CMD_MODIFY_CEQC = 0x91, + HNS_ROCE_CMD_QUERY_CEQC = 0x92, + HNS_ROCE_CMD_DESTROY_CEQC = 0x93, +}; + +enum { + /* TPT commands */ + HNS_ROCE_CMD_SW2HW_MPT = 0xd, + HNS_ROCE_CMD_HW2SW_MPT = 0xf, + + /* CQ commands */ + HNS_ROCE_CMD_SW2HW_CQ = 0x16, + HNS_ROCE_CMD_HW2SW_CQ = 0x17, + + /* QP/EE commands */ + HNS_ROCE_CMD_RST2INIT_QP = 0x19, + HNS_ROCE_CMD_INIT2RTR_QP = 0x1a, + HNS_ROCE_CMD_RTR2RTS_QP = 0x1b, + HNS_ROCE_CMD_RTS2RTS_QP = 0x1c, + HNS_ROCE_CMD_2ERR_QP = 0x1e, + HNS_ROCE_CMD_RTS2SQD_QP = 0x1f, + HNS_ROCE_CMD_SQD2SQD_QP = 0x38, + HNS_ROCE_CMD_SQD2RTS_QP = 0x20, + HNS_ROCE_CMD_2RST_QP = 0x21, + HNS_ROCE_CMD_QUERY_QP = 0x22, +}; + +int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, + unsigned long in_modifier, u8 op_modifier, u16 op, + unsigned long timeout); + +struct hns_roce_cmd_mailbox + *hns_roce_alloc_cmd_mailbox(struct hns_roce_dev *hr_dev); +void hns_roce_free_cmd_mailbox(struct hns_roce_dev *hr_dev, + struct hns_roce_cmd_mailbox *mailbox); + +#endif /* _HNS_ROCE_CMD_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h new file mode 100644 index 0000000..319cb74 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_common.h @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _HNS_ROCE_COMMON_H +#define _HNS_ROCE_COMMON_H + +#ifndef assert +#define assert(cond) +#endif + +#define roce_write(dev, reg, val) writel((val), (dev)->reg_base + (reg)) +#define roce_read(dev, reg) readl((dev)->reg_base + (reg)) +#define roce_raw_write(value, addr) \ + __raw_writel((__force u32)cpu_to_le32(value), (addr)) + +#define roce_get_field(origin, mask, shift) \ + (((le32_to_cpu(origin)) & (mask)) >> (shift)) + +#define roce_get_bit(origin, shift) \ + roce_get_field((origin), (1ul << (shift)), (shift)) + +#define roce_set_field(origin, mask, shift, val) \ + do { \ + (origin) &= ~cpu_to_le32(mask); \ + (origin) |= cpu_to_le32(((u32)(val) << (shift)) & (mask)); \ + } while (0) + +#define roce_set_bit(origin, shift, val) \ + roce_set_field((origin), (1ul << (shift)), (shift), (val)) + +/* + * roce_hw_index_cmp_lt - Compare two hardware index values in hisilicon + * SOC, check if a is less than b. + * @a: hardware index value + * @b: hardware index value + * @bits: the number of bits of a and b, range: 0~31. + * + * Hardware index increases continuously till max value, and then restart + * from zero, again and again. Because the bits of reg field is often + * limited, the reg field can only hold the low bits of the hardware index + * in hisilicon SOC. + * In some scenes we need to compare two values(a,b) getted from two reg + * fields in this driver, for example: + * If a equals 0xfffe, b equals 0x1 and bits equals 16, we think b has + * incresed from 0xffff to 0x1 and a is less than b. + * If a equals 0xfffe, b equals 0x0xf001 and bits equals 16, we think a + * is bigger than b. + * + * Return true on a less than b, otherwise false. 
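+ * + * The macro takes (a - b) modulo 2^bits and shifts it so its top bit becomes the sign bit of a 32-bit int; a negative result means a is less than b in wraparound order. For the first example above: (0xfffe - 0x1) & 0xffff = 0xfffd, which is negative after shifting left by 16, so a is reported as less than b.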
+ */ +#define roce_hw_index_mask(bits) ((1ul << (bits)) - 1) +#define roce_hw_index_shift(bits) (32 - (bits)) +#define roce_hw_index_cmp_lt(a, b, bits) \ + ((int)((((a) - (b)) & roce_hw_index_mask(bits)) << \ + roce_hw_index_shift(bits)) < 0) + +#define ROCEE_GLB_CFG_ROCEE_DB_SQ_MODE_S 3 +#define ROCEE_GLB_CFG_ROCEE_DB_OTH_MODE_S 4 + +#define ROCEE_GLB_CFG_SQ_EXT_DB_MODE_S 5 + +#define ROCEE_GLB_CFG_OTH_EXT_DB_MODE_S 6 + +#define ROCEE_GLB_CFG_ROCEE_PORT_ST_S 10 +#define ROCEE_GLB_CFG_ROCEE_PORT_ST_M \ + (((1UL << 6) - 1) << ROCEE_GLB_CFG_ROCEE_PORT_ST_S) + +#define ROCEE_GLB_CFG_TRP_RAQ_DROP_EN_S 16 + +#define ROCEE_DMAE_USER_CFG1_ROCEE_STREAM_ID_TB_CFG_S 0 +#define ROCEE_DMAE_USER_CFG1_ROCEE_STREAM_ID_TB_CFG_M \ + (((1UL << 24) - 1) << ROCEE_DMAE_USER_CFG1_ROCEE_STREAM_ID_TB_CFG_S) + +#define ROCEE_DMAE_USER_CFG1_ROCEE_CACHE_TB_CFG_S 24 +#define ROCEE_DMAE_USER_CFG1_ROCEE_CACHE_TB_CFG_M \ + (((1UL << 4) - 1) << ROCEE_DMAE_USER_CFG1_ROCEE_CACHE_TB_CFG_S) + +#define ROCEE_DMAE_USER_CFG2_ROCEE_STREAM_ID_PKT_CFG_S 0 +#define ROCEE_DMAE_USER_CFG2_ROCEE_STREAM_ID_PKT_CFG_M \ + (((1UL << 24) - 1) << ROCEE_DMAE_USER_CFG2_ROCEE_STREAM_ID_PKT_CFG_S) + +#define ROCEE_DMAE_USER_CFG2_ROCEE_CACHE_PKT_CFG_S 24 +#define ROCEE_DMAE_USER_CFG2_ROCEE_CACHE_PKT_CFG_M \ + (((1UL << 4) - 1) << ROCEE_DMAE_USER_CFG2_ROCEE_CACHE_PKT_CFG_S) + +#define ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_S 0 +#define ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_M \ + (((1UL << 16) - 1) << ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_S) + +#define ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_EMPTY_S 16 +#define ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_EMPTY_M \ + (((1UL << 16) - 1) << ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_EMPTY_S) + +#define ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_S 0 +#define ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_M \ + (((1UL << 16) - 1) << ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_S) + +#define ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_EMPTY_S 16 +#define ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_EMPTY_M \ + (((1UL << 16) - 1) << ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_EMPTY_S) + +#define ROCEE_RAQ_WL_ROCEE_RAQ_WL_S 0 +#define ROCEE_RAQ_WL_ROCEE_RAQ_WL_M \ + (((1UL << 8) - 1) << ROCEE_RAQ_WL_ROCEE_RAQ_WL_S) + +#define ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_POL_TIME_INTERVAL_S 0 +#define ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_POL_TIME_INTERVAL_M \ + (((1UL << 15) - 1) << \ + ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_POL_TIME_INTERVAL_S) + +#define ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_RAQ_TIMEOUT_CHK_CFG_S 16 +#define ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_RAQ_TIMEOUT_CHK_CFG_M \ + (((1UL << 4) - 1) << \ + ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_RAQ_TIMEOUT_CHK_CFG_S) + +#define ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_RAQ_TIMEOUT_CHK_EN_S 20 + +#define ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_EXT_RAQ_MODE 21 + +#define ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_SHIFT_S 0 +#define ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_SHIFT_M \ + (((1UL << 5) - 1) << ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_SHIFT_S) + +#define ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_BA_H_S 5 +#define ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_BA_H_M \ + (((1UL << 5) - 1) << ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_BA_H_S) + +#define ROCEE_EXT_DB_OTH_H_EXT_DB_OTH_SHIFT_S 0 +#define ROCEE_EXT_DB_OTH_H_EXT_DB_OTH_SHIFT_M \ + (((1UL << 5) - 1) << ROCEE_EXT_DB_OTH_H_EXT_DB_OTH_SHIFT_S) + +#define ROCEE_EXT_DB_SQ_H_EXT_DB_OTH_BA_H_S 5 +#define ROCEE_EXT_DB_SQ_H_EXT_DB_OTH_BA_H_M \ + (((1UL << 5) - 1) << ROCEE_EXT_DB_SQ_H_EXT_DB_OTH_BA_H_S) + +#define ROCEE_EXT_RAQ_H_EXT_RAQ_SHIFT_S 0 +#define ROCEE_EXT_RAQ_H_EXT_RAQ_SHIFT_M \ + (((1UL << 5) - 1) << ROCEE_EXT_RAQ_H_EXT_RAQ_SHIFT_S) + +#define ROCEE_EXT_RAQ_H_EXT_RAQ_BA_H_S 8 +#define ROCEE_EXT_RAQ_H_EXT_RAQ_BA_H_M \ + (((1UL << 
5) - 1) << ROCEE_EXT_RAQ_H_EXT_RAQ_BA_H_S) + +#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_S 0 +#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_M \ + (((1UL << 19) - 1) << ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_S) + +#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_S 19 + +#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S 20 +#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M \ + (((1UL << 2) - 1) << ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S) + +#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S 22 +#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_M \ + (((1UL << 5) - 1) << ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S) + +#define ROCEE_BT_CMD_H_ROCEE_BT_CMD_HW_SYNS_S 31 + +#define ROCEE_QP1C_CFG0_0_ROCEE_QP1C_QP_ST_S 0 +#define ROCEE_QP1C_CFG0_0_ROCEE_QP1C_QP_ST_M \ + (((1UL << 3) - 1) << ROCEE_QP1C_CFG0_0_ROCEE_QP1C_QP_ST_S) + +#define ROCEE_QP1C_CFG3_0_ROCEE_QP1C_RQ_HEAD_S 0 +#define ROCEE_QP1C_CFG3_0_ROCEE_QP1C_RQ_HEAD_M \ + (((1UL << 15) - 1) << ROCEE_QP1C_CFG3_0_ROCEE_QP1C_RQ_HEAD_S) + +#define ROCEE_MB6_ROCEE_MB_CMD_S 0 +#define ROCEE_MB6_ROCEE_MB_CMD_M \ + (((1UL << 8) - 1) << ROCEE_MB6_ROCEE_MB_CMD_S) + +#define ROCEE_MB6_ROCEE_MB_CMD_MDF_S 8 +#define ROCEE_MB6_ROCEE_MB_CMD_MDF_M \ + (((1UL << 4) - 1) << ROCEE_MB6_ROCEE_MB_CMD_MDF_S) + +#define ROCEE_MB6_ROCEE_MB_EVENT_S 14 + +#define ROCEE_MB6_ROCEE_MB_HW_RUN_S 15 + +#define ROCEE_MB6_ROCEE_MB_TOKEN_S 16 +#define ROCEE_MB6_ROCEE_MB_TOKEN_M \ + (((1UL << 16) - 1) << ROCEE_MB6_ROCEE_MB_TOKEN_S) + +#define ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_INP_H_S 0 +#define ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_INP_H_M \ + (((1UL << 24) - 1) << ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_INP_H_S) + +#define ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_MDF_S 24 +#define ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_MDF_M \ + (((1UL << 4) - 1) << ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_MDF_S) + +#define ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_S 28 +#define ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_M \ + (((1UL << 3) - 1) << ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_S) + +#define ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_HW_SYNS_S 31 + +#define ROCEE_SMAC_H_ROCEE_SMAC_H_S 0 +#define ROCEE_SMAC_H_ROCEE_SMAC_H_M \ + (((1UL << 16) - 1) << ROCEE_SMAC_H_ROCEE_SMAC_H_S) + +#define ROCEE_SMAC_H_ROCEE_PORT_MTU_S 16 +#define ROCEE_SMAC_H_ROCEE_PORT_MTU_M \ + (((1UL << 4) - 1) << ROCEE_SMAC_H_ROCEE_PORT_MTU_S) + +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S 0 +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M \ + (((1UL << 2) - 1) << ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S) + +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S 8 +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M \ + (((1UL << 4) - 1) << ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S) + +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQ_ALM_OVF_INT_ST_S 17 + +#define ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S 0 +#define ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M \ + (((1UL << 5) - 1) << ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S) + +#define ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S 16 +#define ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M \ + (((1UL << 16) - 1) << ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S) + +#define ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S 0 +#define ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M \ + (((1UL << 16) - 1) << ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S) + +#define ROCEE_CAEP_CEQC_SHIFT_CAEP_CEQ_ALM_OVF_INT_ST_S 16 +#define ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S 1 +#define ROCEE_CAEP_CEQ_ALM_OVF_CAEP_CEQ_ALM_OVF_S 0 + +#define ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S 0 +#define ROCEE_CAEP_AE_MASK_CAEP_AE_IRQ_MASK_S 1 + +#define ROCEE_CAEP_AE_ST_CAEP_AEQ_ALM_OVF_S 0 + +#define 
ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_S 0 +#define ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_M \ + (((1UL << 28) - 1) << ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_S) + +#define ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S 0 +#define ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M \ + (((1UL << 28) - 1) << ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) + +#define ROCEE_SDB_PTR_CMP_BITS 28 + +#define ROCEE_SDB_INV_CNT_SDB_INV_CNT_S 0 +#define ROCEE_SDB_INV_CNT_SDB_INV_CNT_M \ + (((1UL << 16) - 1) << ROCEE_SDB_INV_CNT_SDB_INV_CNT_S) + +#define ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S 0 +#define ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M \ + (((1UL << 16) - 1) << ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S) + +#define ROCEE_SDB_CNT_CMP_BITS 16 + +#define ROCEE_TSP_BP_ST_QH_FIFO_ENTRY_S 20 + +#define ROCEE_CNT_CLR_CE_CNT_CLR_CE_S 0 + +/*************ROCEE_REG DEFINITION****************/ +#define ROCEE_VENDOR_ID_REG 0x0 +#define ROCEE_VENDOR_PART_ID_REG 0x4 + +#define ROCEE_SYS_IMAGE_GUID_L_REG 0xC +#define ROCEE_SYS_IMAGE_GUID_H_REG 0x10 + +#define ROCEE_PORT_GID_L_0_REG 0x50 +#define ROCEE_PORT_GID_ML_0_REG 0x54 +#define ROCEE_PORT_GID_MH_0_REG 0x58 +#define ROCEE_PORT_GID_H_0_REG 0x5C + +#define ROCEE_BT_CMD_H_REG 0x204 + +#define ROCEE_SMAC_L_0_REG 0x240 +#define ROCEE_SMAC_H_0_REG 0x244 + +#define ROCEE_QP1C_CFG3_0_REG 0x27C + +#define ROCEE_CAEP_AEQE_CONS_IDX_REG 0x3AC +#define ROCEE_CAEP_CEQC_CONS_IDX_0_REG 0x3BC + +#define ROCEE_ECC_UCERR_ALM1_REG 0xB38 +#define ROCEE_ECC_UCERR_ALM2_REG 0xB3C +#define ROCEE_ECC_CERR_ALM1_REG 0xB44 +#define ROCEE_ECC_CERR_ALM2_REG 0xB48 + +#define ROCEE_ACK_DELAY_REG 0x14 +#define ROCEE_GLB_CFG_REG 0x18 + +#define ROCEE_DMAE_USER_CFG1_REG 0x40 +#define ROCEE_DMAE_USER_CFG2_REG 0x44 + +#define ROCEE_DB_SQ_WL_REG 0x154 +#define ROCEE_DB_OTHERS_WL_REG 0x158 +#define ROCEE_RAQ_WL_REG 0x15C +#define ROCEE_WRMS_POL_TIME_INTERVAL_REG 0x160 +#define ROCEE_EXT_DB_SQ_REG 0x164 +#define ROCEE_EXT_DB_SQ_H_REG 0x168 +#define ROCEE_EXT_DB_OTH_REG 0x16C + +#define ROCEE_EXT_DB_OTH_H_REG 0x170 +#define ROCEE_EXT_DB_SQ_WL_EMPTY_REG 0x174 +#define ROCEE_EXT_DB_SQ_WL_REG 0x178 +#define ROCEE_EXT_DB_OTHERS_WL_EMPTY_REG 0x17C +#define ROCEE_EXT_DB_OTHERS_WL_REG 0x180 +#define ROCEE_EXT_RAQ_REG 0x184 +#define ROCEE_EXT_RAQ_H_REG 0x188 + +#define ROCEE_CAEP_CE_INTERVAL_CFG_REG 0x190 +#define ROCEE_CAEP_CE_BURST_NUM_CFG_REG 0x194 +#define ROCEE_BT_CMD_L_REG 0x200 + +#define ROCEE_MB1_REG 0x210 +#define ROCEE_MB6_REG 0x224 +#define ROCEE_DB_SQ_L_0_REG 0x230 +#define ROCEE_DB_OTHERS_L_0_REG 0x238 +#define ROCEE_QP1C_CFG0_0_REG 0x270 + +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_REG 0x3A0 +#define ROCEE_CAEP_CEQC_SHIFT_0_REG 0x3B0 +#define ROCEE_CAEP_CE_IRQ_MASK_0_REG 0x3C0 +#define ROCEE_CAEP_CEQ_ALM_OVF_0_REG 0x3C4 +#define ROCEE_CAEP_AE_MASK_REG 0x6C8 +#define ROCEE_CAEP_AE_ST_REG 0x6CC + +#define ROCEE_SDB_ISSUE_PTR_REG 0x758 +#define ROCEE_SDB_SEND_PTR_REG 0x75C +#define ROCEE_CAEP_CQE_WCMD_EMPTY 0x850 +#define ROCEE_SCAEP_WR_CQE_CNT 0x8D0 +#define ROCEE_SDB_INV_CNT_REG 0x9A4 +#define ROCEE_SDB_RETRY_CNT_REG 0x9AC +#define ROCEE_TSP_BP_ST_REG 0x9EC +#define ROCEE_ECC_UCERR_ALM0_REG 0xB34 +#define ROCEE_ECC_CERR_ALM0_REG 0xB40 + +/* V2 ROCEE REG */ +#define ROCEE_TX_CMQ_BASEADDR_L_REG 0x07000 +#define ROCEE_TX_CMQ_BASEADDR_H_REG 0x07004 +#define ROCEE_TX_CMQ_DEPTH_REG 0x07008 +#define ROCEE_TX_CMQ_TAIL_REG 0x07010 +#define ROCEE_TX_CMQ_HEAD_REG 0x07014 + +#define ROCEE_RX_CMQ_BASEADDR_L_REG 0x07018 +#define ROCEE_RX_CMQ_BASEADDR_H_REG 0x0701c +#define ROCEE_RX_CMQ_DEPTH_REG 0x07020 +#define ROCEE_RX_CMQ_TAIL_REG 0x07024 +#define ROCEE_RX_CMQ_HEAD_REG 0x07028 + 
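+/*
+ * Usage sketch (illustrative only): every register field in this file is
+ * described by a shift/mask pair (_S/_M) that is consumed by the
+ * roce_get_field()/roce_set_field() helpers defined at the top of this
+ * header, e.g. for hypothetical locals "db" and "token":
+ *
+ *	roce_set_field(db, ROCEE_MB6_ROCEE_MB_TOKEN_M,
+ *		       ROCEE_MB6_ROCEE_MB_TOKEN_S, token);
+ *
+ * hns_roce_set_hem() in hns_roce_hem.c uses the ROCEE_BT_CMD_H_* fields
+ * above in exactly this way.
+ */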
+#define ROCEE_VF_MB_CFG0_REG 0x40 +#define ROCEE_VF_MB_STATUS_REG 0x58 + +#define ROCEE_VF_EQ_DB_CFG0_REG 0x238 +#define ROCEE_VF_EQ_DB_CFG1_REG 0x23C + +#define ROCEE_VF_SMAC_CFG0_REG 0x12000 +#define ROCEE_VF_SMAC_CFG1_REG 0x12004 + +#define ROCEE_VF_SGID_CFG0_REG 0x10000 +#define ROCEE_VF_SGID_CFG1_REG 0x10004 +#define ROCEE_VF_SGID_CFG2_REG 0x10008 +#define ROCEE_VF_SGID_CFG3_REG 0x1000c +#define ROCEE_VF_SGID_CFG4_REG 0x10010 + +#define ROCEE_VF_ABN_INT_CFG_REG 0x13000 +#define ROCEE_VF_ABN_INT_ST_REG 0x13004 +#define ROCEE_VF_ABN_INT_EN_REG 0x13008 +#define ROCEE_VF_EVENT_INT_EN_REG 0x1300c + +#endif /* _HNS_ROCE_COMMON_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c new file mode 100644 index 0000000..3a485f5 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -0,0 +1,543 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include "hns_roce_device.h" +#include "hns_roce_cmd.h" +#include "hns_roce_hem.h" +#include +#include "hns_roce_common.h" + +static void hns_roce_ib_cq_comp(struct hns_roce_cq *hr_cq) +{ + struct ib_cq *ibcq = &hr_cq->ib_cq; + + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void hns_roce_ib_cq_event(struct hns_roce_cq *hr_cq, + enum hns_roce_event event_type) +{ + struct hns_roce_dev *hr_dev; + struct ib_event event; + struct ib_cq *ibcq; + + ibcq = &hr_cq->ib_cq; + hr_dev = to_hr_dev(ibcq->device); + + if (event_type != HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID && + event_type != HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR && + event_type != HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW) { + dev_err(hr_dev->dev, + "hns_roce_ib: Unexpected event type 0x%x on CQ %06lx\n", + event_type, hr_cq->cqn); + return; + } + + if (ibcq->event_handler) { + event.device = ibcq->device; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static int hns_roce_sw2hw_cq(struct hns_roce_dev *dev, + struct hns_roce_cmd_mailbox *mailbox, + unsigned long cq_num) +{ + return hns_roce_cmd_mbox(dev, mailbox->dma, 0, cq_num, 0, + HNS_ROCE_CMD_SW2HW_CQ, HNS_ROCE_CMD_TIMEOUT_MSECS); +} + +static int hns_roce_cq_alloc(struct hns_roce_dev *hr_dev, int nent, + struct hns_roce_mtt *hr_mtt, + struct hns_roce_uar *hr_uar, + struct hns_roce_cq *hr_cq, int vector) +{ + struct hns_roce_cmd_mailbox *mailbox; + struct hns_roce_hem_table *mtt_table; + struct hns_roce_cq_table *cq_table; + struct device *dev = hr_dev->dev; + dma_addr_t dma_handle; + u64 *mtts; + int ret; + + cq_table = &hr_dev->cq_table; + + /* Get the physical address of cq buf */ + if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) + mtt_table = &hr_dev->mr_table.mtt_cqe_table; + else + mtt_table = &hr_dev->mr_table.mtt_table; + + mtts = hns_roce_table_find(hr_dev, mtt_table, + hr_mtt->first_seg, &dma_handle); + if (!mtts) { + dev_err(dev, "CQ alloc.Failed to find cq buf addr.\n"); + return -EINVAL; + } + + if (vector >= hr_dev->caps.num_comp_vectors) { + dev_err(dev, "CQ alloc.Invalid vector.\n"); + return -EINVAL; + } + hr_cq->vector = vector; + + ret = hns_roce_bitmap_alloc(&cq_table->bitmap, &hr_cq->cqn); + if (ret == -1) { + dev_err(dev, "CQ alloc.Failed to alloc index.\n"); + return -ENOMEM; + } + + /* Get CQC memory HEM(Hardware Entry Memory) table */ + ret = hns_roce_table_get(hr_dev, &cq_table->table, hr_cq->cqn); + if (ret) { + dev_err(dev, "CQ alloc.Failed to get context mem.\n"); + goto err_out; + } + + /* The cq insert radix tree */ + spin_lock_irq(&cq_table->lock); + /* Radix_tree: The associated pointer and long integer key value like */ + ret = radix_tree_insert(&cq_table->tree, hr_cq->cqn, hr_cq); + spin_unlock_irq(&cq_table->lock); + if (ret) { + dev_err(dev, "CQ alloc.Failed to radix_tree_insert.\n"); + goto err_put; + } + + /* Allocate mailbox memory */ + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) { + ret = PTR_ERR(mailbox); + goto err_radix; + } + + hr_dev->hw->write_cqc(hr_dev, hr_cq, mailbox->buf, mtts, dma_handle, + nent, vector); + + /* Send mailbox to hw */ + ret = hns_roce_sw2hw_cq(hr_dev, mailbox, hr_cq->cqn); + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + if (ret) { + dev_err(dev, "CQ alloc.Failed to cmd mailbox.\n"); + goto err_radix; + } + + hr_cq->cons_index = 0; + hr_cq->arm_sn = 1; + hr_cq->uar = hr_uar; + + atomic_set(&hr_cq->refcount, 1); + init_completion(&hr_cq->free); + + return 0; + +err_radix: + 
spin_lock_irq(&cq_table->lock); + radix_tree_delete(&cq_table->tree, hr_cq->cqn); + spin_unlock_irq(&cq_table->lock); + +err_put: + hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn); + +err_out: + hns_roce_bitmap_free(&cq_table->bitmap, hr_cq->cqn, BITMAP_NO_RR); + return ret; +} + +static int hns_roce_hw2sw_cq(struct hns_roce_dev *dev, + struct hns_roce_cmd_mailbox *mailbox, + unsigned long cq_num) +{ + return hns_roce_cmd_mbox(dev, 0, mailbox ? mailbox->dma : 0, cq_num, + mailbox ? 0 : 1, HNS_ROCE_CMD_HW2SW_CQ, + HNS_ROCE_CMD_TIMEOUT_MSECS); +} + +void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) +{ + struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; + struct device *dev = hr_dev->dev; + int ret; + + ret = hns_roce_hw2sw_cq(hr_dev, NULL, hr_cq->cqn); + if (ret) + dev_err(dev, "HW2SW_CQ failed (%d) for CQN %06lx\n", ret, + hr_cq->cqn); + + /* Waiting interrupt process procedure carried out */ + synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq); + + /* wait for all interrupt processed */ + if (atomic_dec_and_test(&hr_cq->refcount)) + complete(&hr_cq->free); + wait_for_completion(&hr_cq->free); + + spin_lock_irq(&cq_table->lock); + radix_tree_delete(&cq_table->tree, hr_cq->cqn); + spin_unlock_irq(&cq_table->lock); + + hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn); + hns_roce_bitmap_free(&cq_table->bitmap, hr_cq->cqn, BITMAP_NO_RR); +} +EXPORT_SYMBOL_GPL(hns_roce_free_cq); + +static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev, + struct ib_ucontext *context, + struct hns_roce_cq_buf *buf, + struct ib_umem **umem, u64 buf_addr, int cqe) +{ + int ret; + u32 page_shift; + u32 npages; + + *umem = ib_umem_get(context, buf_addr, cqe * hr_dev->caps.cq_entry_sz, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(*umem)) + return PTR_ERR(*umem); + + if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) + buf->hr_mtt.mtt_type = MTT_TYPE_CQE; + else + buf->hr_mtt.mtt_type = MTT_TYPE_WQE; + + if (hr_dev->caps.cqe_buf_pg_sz) { + npages = (ib_umem_page_count(*umem) + + (1 << hr_dev->caps.cqe_buf_pg_sz) - 1) / + (1 << hr_dev->caps.cqe_buf_pg_sz); + page_shift = PAGE_SHIFT + hr_dev->caps.cqe_buf_pg_sz; + ret = hns_roce_mtt_init(hr_dev, npages, page_shift, + &buf->hr_mtt); + } else { + ret = hns_roce_mtt_init(hr_dev, ib_umem_page_count(*umem), + (*umem)->page_shift, + &buf->hr_mtt); + } + if (ret) + goto err_buf; + + ret = hns_roce_ib_umem_write_mtt(hr_dev, &buf->hr_mtt, *umem); + if (ret) + goto err_mtt; + + return 0; + +err_mtt: + hns_roce_mtt_cleanup(hr_dev, &buf->hr_mtt); + +err_buf: + ib_umem_release(*umem); + return ret; +} + +static int hns_roce_ib_alloc_cq_buf(struct hns_roce_dev *hr_dev, + struct hns_roce_cq_buf *buf, u32 nent) +{ + int ret; + u32 page_shift = PAGE_SHIFT + hr_dev->caps.cqe_buf_pg_sz; + + ret = hns_roce_buf_alloc(hr_dev, nent * hr_dev->caps.cq_entry_sz, + (1 << page_shift) * 2, &buf->hr_buf, + page_shift); + if (ret) + goto out; + + if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) + buf->hr_mtt.mtt_type = MTT_TYPE_CQE; + else + buf->hr_mtt.mtt_type = MTT_TYPE_WQE; + + ret = hns_roce_mtt_init(hr_dev, buf->hr_buf.npages, + buf->hr_buf.page_shift, &buf->hr_mtt); + if (ret) + goto err_buf; + + ret = hns_roce_buf_write_mtt(hr_dev, &buf->hr_mtt, &buf->hr_buf); + if (ret) + goto err_mtt; + + return 0; + +err_mtt: + hns_roce_mtt_cleanup(hr_dev, &buf->hr_mtt); + +err_buf: + hns_roce_buf_free(hr_dev, nent * hr_dev->caps.cq_entry_sz, + &buf->hr_buf); +out: + return ret; +} + +static void hns_roce_ib_free_cq_buf(struct 
hns_roce_dev *hr_dev, + struct hns_roce_cq_buf *buf, int cqe) +{ + hns_roce_buf_free(hr_dev, (cqe + 1) * hr_dev->caps.cq_entry_sz, + &buf->hr_buf); +} + +struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + struct device *dev = hr_dev->dev; + struct hns_roce_ib_create_cq ucmd; + struct hns_roce_ib_create_cq_resp resp = {}; + struct hns_roce_cq *hr_cq = NULL; + struct hns_roce_uar *uar = NULL; + int vector = attr->comp_vector; + int cq_entries = attr->cqe; + int ret; + + if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) { + dev_err(dev, "Create CQ failed. entries=%d, max=%d\n", + cq_entries, hr_dev->caps.max_cqes); + return ERR_PTR(-EINVAL); + } + + hr_cq = kzalloc(sizeof(*hr_cq), GFP_KERNEL); + if (!hr_cq) + return ERR_PTR(-ENOMEM); + + if (hr_dev->caps.min_cqes) + cq_entries = max(cq_entries, hr_dev->caps.min_cqes); + + cq_entries = roundup_pow_of_two((unsigned int)cq_entries); + hr_cq->ib_cq.cqe = cq_entries - 1; + spin_lock_init(&hr_cq->lock); + + if (context) { + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + dev_err(dev, "Failed to copy_from_udata.\n"); + ret = -EFAULT; + goto err_cq; + } + + /* Get user space address, write it into mtt table */ + ret = hns_roce_ib_get_cq_umem(hr_dev, context, &hr_cq->hr_buf, + &hr_cq->umem, ucmd.buf_addr, + cq_entries); + if (ret) { + dev_err(dev, "Failed to get_cq_umem.\n"); + goto err_cq; + } + + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + (udata->outlen >= sizeof(resp))) { + ret = hns_roce_db_map_user(to_hr_ucontext(context), + ucmd.db_addr, &hr_cq->db); + if (ret) { + dev_err(dev, "cq record doorbell map failed!\n"); + goto err_mtt; + } + hr_cq->db_en = 1; + resp.cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB; + } + + /* Get user space parameters */ + uar = &to_hr_ucontext(context)->uar; + } else { + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { + ret = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1); + if (ret) + goto err_cq; + + hr_cq->set_ci_db = hr_cq->db.db_record; + *hr_cq->set_ci_db = 0; + hr_cq->db_en = 1; + } + + /* Init mtt table and write buff address to mtt table */ + ret = hns_roce_ib_alloc_cq_buf(hr_dev, &hr_cq->hr_buf, + cq_entries); + if (ret) { + dev_err(dev, "Failed to alloc_cq_buf.\n"); + goto err_db; + } + + uar = &hr_dev->priv_uar; + hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset + + DB_REG_OFFSET * uar->index; + } + + /* Allocate cq index, fill cq_context */ + ret = hns_roce_cq_alloc(hr_dev, cq_entries, &hr_cq->hr_buf.hr_mtt, uar, + hr_cq, vector); + if (ret) { + dev_err(dev, "Create CQ failed. Failed to cq_alloc.\n"); + goto err_dbmap; + } + + /* + * For the CQ created by kernel space, tptr value should be initialized + * to zero; For the CQ created by user space, it will cause synchronous + * problems if tptr is set to zero here, so we initialize it in user + * space.
+ */ + if (!context && hr_cq->tptr_addr) + *hr_cq->tptr_addr = 0; + + /* Get created cq handler and carry out event */ + hr_cq->comp = hns_roce_ib_cq_comp; + hr_cq->event = hns_roce_ib_cq_event; + hr_cq->cq_depth = cq_entries; + + if (context) { + resp.cqn = hr_cq->cqn; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) + goto err_cqc; + } + + return &hr_cq->ib_cq; + +err_cqc: + hns_roce_free_cq(hr_dev, hr_cq); + +err_dbmap: + if (context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + (udata->outlen >= sizeof(resp))) + hns_roce_db_unmap_user(to_hr_ucontext(context), + &hr_cq->db); + +err_mtt: + hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); + if (context) + ib_umem_release(hr_cq->umem); + else + hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, + hr_cq->ib_cq.cqe); + +err_db: + if (!context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)) + hns_roce_free_db(hr_dev, &hr_cq->db); + +err_cq: + kfree(hr_cq); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq); + +int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); + struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); + int ret = 0; + + if (hr_dev->hw->destroy_cq) { + ret = hr_dev->hw->destroy_cq(ib_cq); + } else { + hns_roce_free_cq(hr_dev, hr_cq); + hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); + + if (ib_cq->uobject) { + ib_umem_release(hr_cq->umem); + + if (hr_cq->db_en == 1) + hns_roce_db_unmap_user( + to_hr_ucontext(ib_cq->uobject->context), + &hr_cq->db); + } else { + /* Free the buff of stored cq */ + hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, + ib_cq->cqe); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) + hns_roce_free_db(hr_dev, &hr_cq->db); + } + + kfree(hr_cq); + } + + return ret; +} +EXPORT_SYMBOL_GPL(hns_roce_ib_destroy_cq); + +void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_cq *cq; + + cq = radix_tree_lookup(&hr_dev->cq_table.tree, + cqn & (hr_dev->caps.num_cqs - 1)); + if (!cq) { + dev_warn(dev, "Completion event for bogus CQ 0x%08x\n", cqn); + return; + } + + ++cq->arm_sn; + cq->comp(cq); +} +EXPORT_SYMBOL_GPL(hns_roce_cq_completion); + +void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) +{ + struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; + struct device *dev = hr_dev->dev; + struct hns_roce_cq *cq; + + cq = radix_tree_lookup(&cq_table->tree, + cqn & (hr_dev->caps.num_cqs - 1)); + if (cq) + atomic_inc(&cq->refcount); + + if (!cq) { + dev_warn(dev, "Async event for bogus CQ %08x\n", cqn); + return; + } + + cq->event(cq, (enum hns_roce_event)event_type); + + if (atomic_dec_and_test(&cq->refcount)) + complete(&cq->free); +} +EXPORT_SYMBOL_GPL(hns_roce_cq_event); + +int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; + + spin_lock_init(&cq_table->lock); + INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC); + + return hns_roce_bitmap_init(&cq_table->bitmap, hr_dev->caps.num_cqs, + hr_dev->caps.num_cqs - 1, + hr_dev->caps.reserved_cqs, 0); +} + +void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev) +{ + hns_roce_bitmap_cleanup(&hr_dev->cq_table.bitmap); +} diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c new file mode 100644 index 0000000..ebee278 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_db.c @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */ +/* + * 
Copyright (c) 2017 Hisilicon Limited. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + */ + +#include +#include +#include "hns_roce_device.h" + +int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, + struct hns_roce_db *db) +{ + struct hns_roce_user_db_page *page; + int ret = 0; + + mutex_lock(&context->page_mutex); + + list_for_each_entry(page, &context->page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto out; + } + + refcount_set(&page->refcount, 1); + page->user_virt = (virt & PAGE_MASK); + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + ret = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + + (virt & ~PAGE_MASK); + db->u.user_page = page; + refcount_inc(&page->refcount); + +out: + mutex_unlock(&context->page_mutex); + + return ret; +} +EXPORT_SYMBOL(hns_roce_db_map_user); + +void hns_roce_db_unmap_user(struct hns_roce_ucontext *context, + struct hns_roce_db *db) +{ + mutex_lock(&context->page_mutex); + + refcount_dec(&db->u.user_page->refcount); + if (refcount_dec_if_one(&db->u.user_page->refcount)) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->page_mutex); +} +EXPORT_SYMBOL(hns_roce_db_unmap_user); + +static struct hns_roce_db_pgdir *hns_roce_alloc_db_pgdir( + struct device *dma_device) +{ + struct hns_roce_db_pgdir *pgdir; + + pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL); + if (!pgdir) + return NULL; + + bitmap_fill(pgdir->order1, HNS_ROCE_DB_PER_PAGE / 2); + pgdir->bits[0] = pgdir->order0; + pgdir->bits[1] = pgdir->order1; + pgdir->page = dma_alloc_coherent(dma_device, PAGE_SIZE, + &pgdir->db_dma, GFP_KERNEL); + if (!pgdir->page) { + kfree(pgdir); + return NULL; + } + + return pgdir; +} + +static int hns_roce_alloc_db_from_pgdir(struct hns_roce_db_pgdir *pgdir, + struct hns_roce_db *db, int order) +{ + int o; + int i; + + for (o = order; o <= 1; ++o) { + i = find_first_bit(pgdir->bits[o], HNS_ROCE_DB_PER_PAGE >> o); + if (i < HNS_ROCE_DB_PER_PAGE >> o) + goto found; + } + + return -ENOMEM; + +found: + clear_bit(i, pgdir->bits[o]); + + i <<= o; + + if (o > order) + set_bit(i ^ 1, pgdir->bits[order]); + + db->u.pgdir = pgdir; + db->index = i; + db->db_record = pgdir->page + db->index; + db->dma = pgdir->db_dma + db->index * 4; + db->order = order; + + return 0; +} + +int hns_roce_alloc_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db, + int order) +{ + struct hns_roce_db_pgdir *pgdir; + int ret = 0; + + mutex_lock(&hr_dev->pgdir_mutex); + + list_for_each_entry(pgdir, &hr_dev->pgdir_list, list) + if (!hns_roce_alloc_db_from_pgdir(pgdir, db, order)) + goto out; + + pgdir = hns_roce_alloc_db_pgdir(hr_dev->dev); + if (!pgdir) { + ret = -ENOMEM; + goto out; + } + + list_add(&pgdir->list, &hr_dev->pgdir_list); + + /* This should never fail -- we just allocated an empty page: */ + WARN_ON(hns_roce_alloc_db_from_pgdir(pgdir, db, order)); + +out: + mutex_unlock(&hr_dev->pgdir_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(hns_roce_alloc_db); + +void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db) +{ + int o; + int i; + + mutex_lock(&hr_dev->pgdir_mutex); + + o = db->order; + i = db->index; + + if (db->order == 0 && test_bit(i ^ 1, 
db->u.pgdir->order0)) { + clear_bit(i ^ 1, db->u.pgdir->order0); + ++o; + } + + i >>= o; + set_bit(i, db->u.pgdir->bits[o]); + + if (bitmap_full(db->u.pgdir->order1, HNS_ROCE_DB_PER_PAGE / 2)) { + dma_free_coherent(hr_dev->dev, PAGE_SIZE, db->u.pgdir->page, + db->u.pgdir->db_dma); + list_del(&db->u.pgdir->list); + kfree(db->u.pgdir); + } + + mutex_unlock(&hr_dev->pgdir_mutex); +} +EXPORT_SYMBOL_GPL(hns_roce_free_db); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h new file mode 100644 index 0000000..fb305b7 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -0,0 +1,997 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _HNS_ROCE_DEVICE_H +#define _HNS_ROCE_DEVICE_H + +#include + +#define DRV_NAME "hns_roce" + +#define HNS_ROCE_HW_VER1 ('h' << 24 | 'i' << 16 | '0' << 8 | '6') + +#define MAC_ADDR_OCTET_NUM 6 +#define HNS_ROCE_MAX_MSG_LEN 0x80000000 + +#define HNS_ROCE_ALOGN_UP(a, b) ((((a) + (b) - 1) / (b)) * (b)) + +#define HNS_ROCE_IB_MIN_SQ_STRIDE 6 + +#define HNS_ROCE_BA_SIZE (32 * 4096) + +/* Hardware specification only for v1 engine */ +#define HNS_ROCE_MIN_CQE_NUM 0x40 +#define HNS_ROCE_MIN_WQE_NUM 0x20 + +/* Hardware specification only for v1 engine */ +#define HNS_ROCE_MAX_INNER_MTPT_NUM 0x7 +#define HNS_ROCE_MAX_MTPT_PBL_NUM 0x100000 + +#define HNS_ROCE_EACH_FREE_CQ_WAIT_MSECS 20 +#define HNS_ROCE_MAX_FREE_CQ_WAIT_CNT \ + (5000 / HNS_ROCE_EACH_FREE_CQ_WAIT_MSECS) +#define HNS_ROCE_CQE_WCMD_EMPTY_BIT 0x2 +#define HNS_ROCE_MIN_CQE_CNT 16 + +#define HNS_ROCE_MAX_IRQ_NUM 128 + +#define EQ_ENABLE 1 +#define EQ_DISABLE 0 + +#define HNS_ROCE_CEQ 0 +#define HNS_ROCE_AEQ 1 + +#define HNS_ROCE_CEQ_ENTRY_SIZE 0x4 +#define HNS_ROCE_AEQ_ENTRY_SIZE 0x10 + +/* 4G/4K = 1M */ +#define HNS_ROCE_SL_SHIFT 28 +#define HNS_ROCE_TCLASS_SHIFT 20 +#define HNS_ROCE_FLOW_LABLE_MASK 0xfffff + +#define HNS_ROCE_MAX_PORTS 6 +#define HNS_ROCE_MAX_GID_NUM 16 +#define HNS_ROCE_GID_SIZE 16 + +#define HNS_ROCE_HOP_NUM_0 0xff + +#define BITMAP_NO_RR 0 +#define BITMAP_RR 1 + +#define MR_TYPE_MR 0x00 +#define MR_TYPE_DMA 0x03 + +#define PKEY_ID 0xffff +#define GUID_LEN 8 +#define NODE_DESC_SIZE 64 +#define DB_REG_OFFSET 0x1000 + +#define SERV_TYPE_RC 0 +#define SERV_TYPE_RD 1 +#define SERV_TYPE_UC 2 +#define SERV_TYPE_UD 3 + +#define PAGES_SHIFT_8 8 +#define PAGES_SHIFT_16 16 +#define PAGES_SHIFT_24 24 +#define PAGES_SHIFT_32 32 + +enum { + HNS_ROCE_SUPPORT_RQ_RECORD_DB = 1 << 0, +}; + +enum { + HNS_ROCE_SUPPORT_CQ_RECORD_DB = 1 << 0, +}; + +enum hns_roce_qp_state { + HNS_ROCE_QP_STATE_RST, + HNS_ROCE_QP_STATE_INIT, + HNS_ROCE_QP_STATE_RTR, + HNS_ROCE_QP_STATE_RTS, + HNS_ROCE_QP_STATE_SQD, + HNS_ROCE_QP_STATE_ERR, + HNS_ROCE_QP_NUM_STATE, +}; + +enum hns_roce_event { + HNS_ROCE_EVENT_TYPE_PATH_MIG = 0x01, + HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED = 0x02, + HNS_ROCE_EVENT_TYPE_COMM_EST = 0x03, + HNS_ROCE_EVENT_TYPE_SQ_DRAINED = 0x04, + HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR = 0x05, + HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR = 0x06, + HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR = 0x07, + HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH = 0x08, + HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH = 0x09, + HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR = 0x0a, + HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR = 0x0b, + HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW = 0x0c, + HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID = 0x0d, + HNS_ROCE_EVENT_TYPE_PORT_CHANGE = 0x0f, + /* 0x10 and 0x11 is unused in currently application case */ + HNS_ROCE_EVENT_TYPE_DB_OVERFLOW = 0x12, + HNS_ROCE_EVENT_TYPE_MB = 0x13, + HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW = 0x14, + HNS_ROCE_EVENT_TYPE_FLR = 0x15, +}; + +/* Local Work Queue Catastrophic Error,SUBTYPE 0x5 */ +enum { + HNS_ROCE_LWQCE_QPC_ERROR = 1, + HNS_ROCE_LWQCE_MTU_ERROR = 2, + HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR = 3, + HNS_ROCE_LWQCE_WQE_ADDR_ERROR = 4, + HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR = 5, + HNS_ROCE_LWQCE_SL_ERROR = 6, + HNS_ROCE_LWQCE_PORT_ERROR = 7, +}; + +/* Local Access Violation Work Queue Error,SUBTYPE 0x7 */ +enum { + HNS_ROCE_LAVWQE_R_KEY_VIOLATION = 1, + HNS_ROCE_LAVWQE_LENGTH_ERROR = 2, + HNS_ROCE_LAVWQE_VA_ERROR = 3, + HNS_ROCE_LAVWQE_PD_ERROR = 4, + HNS_ROCE_LAVWQE_RW_ACC_ERROR = 5, + HNS_ROCE_LAVWQE_KEY_STATE_ERROR = 6, + 
HNS_ROCE_LAVWQE_MR_OPERATION_ERROR = 7, +}; + +/* DOORBELL overflow subtype */ +enum { + HNS_ROCE_DB_SUBTYPE_SDB_OVF = 1, + HNS_ROCE_DB_SUBTYPE_SDB_ALM_OVF = 2, + HNS_ROCE_DB_SUBTYPE_ODB_OVF = 3, + HNS_ROCE_DB_SUBTYPE_ODB_ALM_OVF = 4, + HNS_ROCE_DB_SUBTYPE_SDB_ALM_EMP = 5, + HNS_ROCE_DB_SUBTYPE_ODB_ALM_EMP = 6, +}; + +enum { + /* RQ&SRQ related operations */ + HNS_ROCE_OPCODE_SEND_DATA_RECEIVE = 0x06, + HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE = 0x07, +}; + +enum { + HNS_ROCE_CAP_FLAG_REREG_MR = BIT(0), + HNS_ROCE_CAP_FLAG_ROCE_V1_V2 = BIT(1), + HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2), + HNS_ROCE_CAP_FLAG_RECORD_DB = BIT(3) +}; + +enum hns_roce_mtt_type { + MTT_TYPE_WQE, + MTT_TYPE_CQE, +}; + +enum { + HNS_ROCE_DB_PER_PAGE = PAGE_SIZE / 4 +}; + +#define HNS_ROCE_CMD_SUCCESS 1 + +#define HNS_ROCE_PORT_DOWN 0 +#define HNS_ROCE_PORT_UP 1 + +#define HNS_ROCE_MTT_ENTRY_PER_SEG 8 + +#define PAGE_ADDR_SHIFT 12 + +struct hns_roce_uar { + u64 pfn; + unsigned long index; +}; + +struct hns_roce_ucontext { + struct ib_ucontext ibucontext; + struct hns_roce_uar uar; + struct list_head page_list; + struct mutex page_mutex; +}; + +struct hns_roce_pd { + struct ib_pd ibpd; + unsigned long pdn; +}; + +struct hns_roce_bitmap { + /* Bitmap Traversal last a bit which is 1 */ + unsigned long last; + unsigned long top; + unsigned long max; + unsigned long reserved_top; + unsigned long mask; + spinlock_t lock; + unsigned long *table; +}; + +/* Order bitmap length -- bit num compute formula: 1 << (max_order - order) */ +/* Order = 0: bitmap is biggest, order = max bitmap is least (only a bit) */ +/* Every bit repesent to a partner free/used status in bitmap */ +/* + * Initial, bits of other bitmap are all 0 except that a bit of max_order is 1 + * Bit = 1 represent to idle and available; bit = 0: not available + */ +struct hns_roce_buddy { + /* Members point to every order level bitmap */ + unsigned long **bits; + /* Represent to avail bits of the order level bitmap */ + u32 *num_free; + int max_order; + spinlock_t lock; +}; + +/* For Hardware Entry Memory */ +struct hns_roce_hem_table { + /* HEM type: 0 = qpc, 1 = mtt, 2 = cqc, 3 = srq, 4 = other */ + u32 type; + /* HEM array elment num */ + unsigned long num_hem; + /* HEM entry record obj total num */ + unsigned long num_obj; + /*Single obj size */ + unsigned long obj_size; + unsigned long table_chunk_size; + int lowmem; + struct mutex mutex; + struct hns_roce_hem **hem; + u64 **bt_l1; + dma_addr_t *bt_l1_dma_addr; + u64 **bt_l0; + dma_addr_t *bt_l0_dma_addr; +}; + +struct hns_roce_mtt { + unsigned long first_seg; + int order; + int page_shift; + enum hns_roce_mtt_type mtt_type; +}; + +/* Only support 4K page size for mr register */ +#define MR_SIZE_4K 0 + +struct hns_roce_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + u64 iova; /* MR's virtual orignal addr */ + u64 size; /* Address range of MR */ + u32 key; /* Key of MR */ + u32 pd; /* PD num of MR */ + u32 access;/* Access permission of MR */ + int enabled; /* MR's active status */ + int type; /* MR's register type */ + u64 *pbl_buf;/* MR's PBL space */ + dma_addr_t pbl_dma_addr; /* MR's PBL space PA */ + u32 pbl_size;/* PA number in the PBL */ + u64 pbl_ba;/* page table address */ + u32 l0_chunk_last_num;/* L0 last number */ + u32 l1_chunk_last_num;/* L1 last number */ + u64 **pbl_bt_l2;/* PBL BT L2 */ + u64 **pbl_bt_l1;/* PBL BT L1 */ + u64 *pbl_bt_l0;/* PBL BT L0 */ + dma_addr_t *pbl_l2_dma_addr;/* PBL BT L2 dma addr */ + dma_addr_t *pbl_l1_dma_addr;/* PBL BT L1 dma addr */ + dma_addr_t pbl_l0_dma_addr;/* 
PBL BT L0 dma addr */ + u32 pbl_ba_pg_sz;/* BT chunk page size */ + u32 pbl_buf_pg_sz;/* buf chunk page size */ + u32 pbl_hop_num;/* multi-hop number */ +}; + +struct hns_roce_mr_table { + struct hns_roce_bitmap mtpt_bitmap; + struct hns_roce_buddy mtt_buddy; + struct hns_roce_hem_table mtt_table; + struct hns_roce_hem_table mtpt_table; + struct hns_roce_buddy mtt_cqe_buddy; + struct hns_roce_hem_table mtt_cqe_table; +}; + +struct hns_roce_wq { + u64 *wrid; /* Work request ID */ + spinlock_t lock; + int wqe_cnt; /* WQE num */ + u32 max_post; + int max_gs; + int offset; + int wqe_shift;/* WQE size */ + u32 head; + u32 tail; + void __iomem *db_reg_l; +}; + +struct hns_roce_sge { + int sge_cnt; /* SGE num */ + int offset; + int sge_shift;/* SGE size */ +}; + +struct hns_roce_buf_list { + void *buf; + dma_addr_t map; +}; + +struct hns_roce_buf { + struct hns_roce_buf_list direct; + struct hns_roce_buf_list *page_list; + int nbufs; + u32 npages; + int page_shift; +}; + +struct hns_roce_db_pgdir { + struct list_head list; + DECLARE_BITMAP(order0, HNS_ROCE_DB_PER_PAGE); + DECLARE_BITMAP(order1, HNS_ROCE_DB_PER_PAGE / 2); + unsigned long *bits[2]; + u32 *page; + dma_addr_t db_dma; +}; + +struct hns_roce_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + refcount_t refcount; +}; + +struct hns_roce_db { + u32 *db_record; + union { + struct hns_roce_db_pgdir *pgdir; + struct hns_roce_user_db_page *user_page; + } u; + dma_addr_t dma; + int index; + int order; +}; + +struct hns_roce_cq_buf { + struct hns_roce_buf hr_buf; + struct hns_roce_mtt hr_mtt; +}; + +struct hns_roce_cq { + struct ib_cq ib_cq; + struct hns_roce_cq_buf hr_buf; + struct hns_roce_db db; + u8 db_en; + spinlock_t lock; + struct ib_umem *umem; + void (*comp)(struct hns_roce_cq *cq); + void (*event)(struct hns_roce_cq *cq, enum hns_roce_event event_type); + + struct hns_roce_uar *uar; + u32 cq_depth; + u32 cons_index; + u32 *set_ci_db; + void __iomem *cq_db_l; + u16 *tptr_addr; + int arm_sn; + unsigned long cqn; + u32 vector; + atomic_t refcount; + struct completion free; +}; + +struct hns_roce_srq { + struct ib_srq ibsrq; + int srqn; +}; + +struct hns_roce_uar_table { + struct hns_roce_bitmap bitmap; +}; + +struct hns_roce_qp_table { + struct hns_roce_bitmap bitmap; + spinlock_t lock; + struct hns_roce_hem_table qp_table; + struct hns_roce_hem_table irrl_table; + struct hns_roce_hem_table trrl_table; +}; + +struct hns_roce_cq_table { + struct hns_roce_bitmap bitmap; + spinlock_t lock; + struct radix_tree_root tree; + struct hns_roce_hem_table table; +}; + +struct hns_roce_raq_table { + struct hns_roce_buf_list *e_raq_buf; +}; + +struct hns_roce_av { + __le32 port_pd; + u8 gid_index; + u8 stat_rate; + u8 hop_limit; + __le32 sl_tclass_flowlabel; + u8 dgid[HNS_ROCE_GID_SIZE]; + u8 mac[6]; + __le16 vlan; +}; + +struct hns_roce_ah { + struct ib_ah ibah; + struct hns_roce_av av; +}; + +struct hns_roce_cmd_context { + struct completion done; + int result; + int next; + u64 out_param; + u16 token; +}; + +struct hns_roce_cmdq { + struct dma_pool *pool; + struct mutex hcr_mutex; + struct semaphore poll_sem; + /* + * Event mode: cmd register mutex protection, + * ensure to not exceed max_cmds and user use limit region + */ + struct semaphore event_sem; + int max_cmds; + spinlock_t context_lock; + int free_head; + struct hns_roce_cmd_context *context; + /* + * Result of get integer part + * which max_comds compute according a power of 2 + */ + u16 token_mask; + /* + * Process whether use event mode, init 
default non-zero + * After the event queue of cmd event ready, + * can switch into event mode + * close device, switch into poll mode(non event mode) + */ + u8 use_events; + u8 toggle; +}; + +struct hns_roce_cmd_mailbox { + void *buf; + dma_addr_t dma; +}; + +struct hns_roce_dev; + +struct hns_roce_rinl_sge { + void *addr; + u32 len; +}; + +struct hns_roce_rinl_wqe { + struct hns_roce_rinl_sge *sg_list; + u32 sge_cnt; +}; + +struct hns_roce_rinl_buf { + struct hns_roce_rinl_wqe *wqe_list; + u32 wqe_cnt; +}; + +struct hns_roce_qp { + struct ib_qp ibqp; + struct hns_roce_buf hr_buf; + struct hns_roce_wq rq; + struct hns_roce_db rdb; + u8 rdb_en; + u32 doorbell_qpn; + __le32 sq_signal_bits; + u32 sq_next_wqe; + int sq_max_wqes_per_wr; + int sq_spare_wqes; + struct hns_roce_wq sq; + + struct ib_umem *umem; + struct hns_roce_mtt mtt; + u32 buff_size; + struct mutex mutex; + u8 port; + u8 phy_port; + u8 sl; + u8 resp_depth; + u8 state; + u32 access_flags; + u32 atomic_rd_en; + u32 pkey_index; + u32 qkey; + void (*event)(struct hns_roce_qp *qp, + enum hns_roce_event event_type); + unsigned long qpn; + + atomic_t refcount; + struct completion free; + + struct hns_roce_sge sge; + u32 next_sge; + + struct hns_roce_rinl_buf rq_inl_buf; +}; + +struct hns_roce_sqp { + struct hns_roce_qp hr_qp; +}; + +struct hns_roce_ib_iboe { + spinlock_t lock; + struct net_device *netdevs[HNS_ROCE_MAX_PORTS]; + struct notifier_block nb; + u8 phy_port[HNS_ROCE_MAX_PORTS]; +}; + +enum { + HNS_ROCE_EQ_STAT_INVALID = 0, + HNS_ROCE_EQ_STAT_VALID = 2, +}; + +struct hns_roce_ceqe { + u32 comp; +}; + +struct hns_roce_aeqe { + u32 asyn; + union { + struct { + u32 qp; + u32 rsv0; + u32 rsv1; + } qp_event; + + struct { + u32 cq; + u32 rsv0; + u32 rsv1; + } cq_event; + + struct { + u32 ceqe; + u32 rsv0; + u32 rsv1; + } ce_event; + + struct { + __le64 out_param; + __le16 token; + u8 status; + u8 rsv0; + } __packed cmd; + } event; +}; + +struct hns_roce_eq { + struct hns_roce_dev *hr_dev; + void __iomem *doorbell; + + int type_flag;/* Aeq:1 ceq:0 */ + int eqn; + u32 entries; + int log_entries; + int eqe_size; + int irq; + int log_page_size; + int cons_index; + struct hns_roce_buf_list *buf_list; + int over_ignore; + int coalesce; + int arm_st; + u64 eqe_ba; + int eqe_ba_pg_sz; + int eqe_buf_pg_sz; + int hop_num; + u64 *bt_l0; /* Base address table for L0 */ + u64 **bt_l1; /* Base address table for L1 */ + u64 **buf; + dma_addr_t l0_dma; + dma_addr_t *l1_dma; + dma_addr_t *buf_dma; + u32 l0_last_num; /* L0 last chunk num */ + u32 l1_last_num; /* L1 last chunk num */ + int eq_max_cnt; + int eq_period; + int shift; + dma_addr_t cur_eqe_ba; + dma_addr_t nxt_eqe_ba; +}; + +struct hns_roce_eq_table { + struct hns_roce_eq *eq; + void __iomem **eqc_base; /* only for hw v1 */ +}; + +struct hns_roce_caps { + u8 num_ports; + int gid_table_len[HNS_ROCE_MAX_PORTS]; + int pkey_table_len[HNS_ROCE_MAX_PORTS]; + int local_ca_ack_delay; + int num_uars; + u32 phy_num_uars; + u32 max_sq_sg; /* 2 */ + u32 max_sq_inline; /* 32 */ + u32 max_rq_sg; /* 2 */ + int num_qps; /* 256k */ + u32 max_wqes; /* 16k */ + u32 max_sq_desc_sz; /* 64 */ + u32 max_rq_desc_sz; /* 64 */ + u32 max_srq_desc_sz; + int max_qp_init_rdma; + int max_qp_dest_rdma; + int num_cqs; + int max_cqes; + int min_cqes; + u32 min_wqes; + int reserved_cqs; + int num_aeq_vectors; /* 1 */ + int num_comp_vectors; + int num_other_vectors; + int num_mtpts; + u32 num_mtt_segs; + u32 num_cqe_segs; + int reserved_mrws; + int reserved_uars; + int num_pds; + int reserved_pds; + u32 mtt_entry_sz; + u32 
cq_entry_sz; + u32 page_size_cap; + u32 reserved_lkey; + int mtpt_entry_sz; + int qpc_entry_sz; + int irrl_entry_sz; + int trrl_entry_sz; + int cqc_entry_sz; + u32 pbl_ba_pg_sz; + u32 pbl_buf_pg_sz; + u32 pbl_hop_num; + int aeqe_depth; + int ceqe_depth; + enum ib_mtu max_mtu; + u32 qpc_bt_num; + u32 srqc_bt_num; + u32 cqc_bt_num; + u32 mpt_bt_num; + u32 qpc_ba_pg_sz; + u32 qpc_buf_pg_sz; + u32 qpc_hop_num; + u32 srqc_ba_pg_sz; + u32 srqc_buf_pg_sz; + u32 srqc_hop_num; + u32 cqc_ba_pg_sz; + u32 cqc_buf_pg_sz; + u32 cqc_hop_num; + u32 mpt_ba_pg_sz; + u32 mpt_buf_pg_sz; + u32 mpt_hop_num; + u32 mtt_ba_pg_sz; + u32 mtt_buf_pg_sz; + u32 mtt_hop_num; + u32 cqe_ba_pg_sz; + u32 cqe_buf_pg_sz; + u32 cqe_hop_num; + u32 eqe_ba_pg_sz; + u32 eqe_buf_pg_sz; + u32 eqe_hop_num; + u32 chunk_sz; /* chunk size in non multihop mode*/ + u64 flags; +}; + +struct hns_roce_hw { + int (*reset)(struct hns_roce_dev *hr_dev, bool enable); + int (*cmq_init)(struct hns_roce_dev *hr_dev); + void (*cmq_exit)(struct hns_roce_dev *hr_dev); + int (*hw_profile)(struct hns_roce_dev *hr_dev); + int (*hw_init)(struct hns_roce_dev *hr_dev); + void (*hw_exit)(struct hns_roce_dev *hr_dev); + int (*post_mbox)(struct hns_roce_dev *hr_dev, u64 in_param, + u64 out_param, u32 in_modifier, u8 op_modifier, u16 op, + u16 token, int event); + int (*chk_mbox)(struct hns_roce_dev *hr_dev, unsigned long timeout); + int (*set_gid)(struct hns_roce_dev *hr_dev, u8 port, int gid_index, + union ib_gid *gid, const struct ib_gid_attr *attr); + int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr); + void (*set_mtu)(struct hns_roce_dev *hr_dev, u8 phy_port, + enum ib_mtu mtu); + int (*write_mtpt)(void *mb_buf, struct hns_roce_mr *mr, + unsigned long mtpt_idx); + int (*rereg_write_mtpt)(struct hns_roce_dev *hr_dev, + struct hns_roce_mr *mr, int flags, u32 pdn, + int mr_access_flags, u64 iova, u64 size, + void *mb_buf); + void (*write_cqc)(struct hns_roce_dev *hr_dev, + struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, + dma_addr_t dma_handle, int nent, u32 vector); + int (*set_hem)(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, int obj, int step_idx); + int (*clear_hem)(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, int obj, + int step_idx); + int (*query_qp)(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); + int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr, + int attr_mask, enum ib_qp_state cur_state, + enum ib_qp_state new_state); + int (*destroy_qp)(struct ib_qp *ibqp); + int (*post_send)(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); + int (*post_recv)(struct ib_qp *qp, struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + int (*req_notify_cq)(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); + int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); + int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr); + int (*destroy_cq)(struct ib_cq *ibcq); + int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); + int (*init_eq)(struct hns_roce_dev *hr_dev); + void (*cleanup_eq)(struct hns_roce_dev *hr_dev); +}; + +struct hns_roce_dev { + struct ib_device ib_dev; + struct platform_device *pdev; + struct pci_dev *pci_dev; + struct device *dev; + struct hns_roce_uar priv_uar; + const char *irq_names[HNS_ROCE_MAX_IRQ_NUM]; + spinlock_t sm_lock; + spinlock_t bt_cmd_lock; + struct hns_roce_ib_iboe iboe; + + struct list_head pgdir_list; + struct mutex 
pgdir_mutex; + int irq[HNS_ROCE_MAX_IRQ_NUM]; + u8 __iomem *reg_base; + struct hns_roce_caps caps; + struct radix_tree_root qp_table_tree; + + unsigned char dev_addr[HNS_ROCE_MAX_PORTS][MAC_ADDR_OCTET_NUM]; + u64 sys_image_guid; + u32 vendor_id; + u32 vendor_part_id; + u32 hw_rev; + void __iomem *priv_addr; + + struct hns_roce_cmdq cmd; + struct hns_roce_bitmap pd_bitmap; + struct hns_roce_uar_table uar_table; + struct hns_roce_mr_table mr_table; + struct hns_roce_cq_table cq_table; + struct hns_roce_qp_table qp_table; + struct hns_roce_eq_table eq_table; + + int cmd_mod; + int loop_idc; + u32 sdb_offset; + u32 odb_offset; + dma_addr_t tptr_dma_addr; /*only for hw v1*/ + u32 tptr_size; /*only for hw v1*/ + const struct hns_roce_hw *hw; + void *priv; +}; + +static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev) +{ + return container_of(ib_dev, struct hns_roce_dev, ib_dev); +} + +static inline struct hns_roce_ucontext + *to_hr_ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct hns_roce_ucontext, ibucontext); +} + +static inline struct hns_roce_pd *to_hr_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct hns_roce_pd, ibpd); +} + +static inline struct hns_roce_ah *to_hr_ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct hns_roce_ah, ibah); +} + +static inline struct hns_roce_mr *to_hr_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct hns_roce_mr, ibmr); +} + +static inline struct hns_roce_qp *to_hr_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct hns_roce_qp, ibqp); +} + +static inline struct hns_roce_cq *to_hr_cq(struct ib_cq *ib_cq) +{ + return container_of(ib_cq, struct hns_roce_cq, ib_cq); +} + +static inline struct hns_roce_srq *to_hr_srq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct hns_roce_srq, ibsrq); +} + +static inline struct hns_roce_sqp *hr_to_hr_sqp(struct hns_roce_qp *hr_qp) +{ + return container_of(hr_qp, struct hns_roce_sqp, hr_qp); +} + +static inline void hns_roce_write64_k(__be32 val[2], void __iomem *dest) +{ + __raw_writeq(*(u64 *) val, dest); +} + +static inline struct hns_roce_qp + *__hns_roce_qp_lookup(struct hns_roce_dev *hr_dev, u32 qpn) +{ + return radix_tree_lookup(&hr_dev->qp_table_tree, + qpn & (hr_dev->caps.num_qps - 1)); +} + +static inline void *hns_roce_buf_offset(struct hns_roce_buf *buf, int offset) +{ + u32 page_size = 1 << buf->page_shift; + + if (buf->nbufs == 1) + return (char *)(buf->direct.buf) + offset; + else + return (char *)(buf->page_list[offset >> buf->page_shift].buf) + + (offset & (page_size - 1)); +} + +int hns_roce_init_uar_table(struct hns_roce_dev *dev); +int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar); +void hns_roce_uar_free(struct hns_roce_dev *dev, struct hns_roce_uar *uar); +void hns_roce_cleanup_uar_table(struct hns_roce_dev *dev); + +int hns_roce_cmd_init(struct hns_roce_dev *hr_dev); +void hns_roce_cmd_cleanup(struct hns_roce_dev *hr_dev); +void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status, + u64 out_param); +int hns_roce_cmd_use_events(struct hns_roce_dev *hr_dev); +void hns_roce_cmd_use_polling(struct hns_roce_dev *hr_dev); + +int hns_roce_mtt_init(struct hns_roce_dev *hr_dev, int npages, int page_shift, + struct hns_roce_mtt *mtt); +void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, + struct hns_roce_mtt *mtt); +int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev, + struct hns_roce_mtt *mtt, struct hns_roce_buf *buf); + +int hns_roce_init_pd_table(struct 
hns_roce_dev *hr_dev); +int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev); +int hns_roce_init_eq_table(struct hns_roce_dev *hr_dev); +int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev); +int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev); + +void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev); +void hns_roce_cleanup_mr_table(struct hns_roce_dev *hr_dev); +void hns_roce_cleanup_eq_table(struct hns_roce_dev *hr_dev); +void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev); +void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev); + +int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj); +void hns_roce_bitmap_free(struct hns_roce_bitmap *bitmap, unsigned long obj, + int rr); +int hns_roce_bitmap_init(struct hns_roce_bitmap *bitmap, u32 num, u32 mask, + u32 reserved_bot, u32 resetrved_top); +void hns_roce_bitmap_cleanup(struct hns_roce_bitmap *bitmap); +void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev); +int hns_roce_bitmap_alloc_range(struct hns_roce_bitmap *bitmap, int cnt, + int align, unsigned long *obj); +void hns_roce_bitmap_free_range(struct hns_roce_bitmap *bitmap, + unsigned long obj, int cnt, + int rr); + +struct ib_ah *hns_roce_create_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); +int hns_roce_destroy_ah(struct ib_ah *ah); + +struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev, + struct ib_ucontext *context, + struct ib_udata *udata); +int hns_roce_dealloc_pd(struct ib_pd *pd); + +struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int hns_roce_rereg_user_mr(struct ib_mr *mr, int flags, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata); +int hns_roce_dereg_mr(struct ib_mr *ibmr); +int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev, + struct hns_roce_cmd_mailbox *mailbox, + unsigned long mpt_index); +unsigned long key_to_hw_index(u32 key); + +void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size, + struct hns_roce_buf *buf); +int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, + struct hns_roce_buf *buf, u32 page_shift); + +int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev, + struct hns_roce_mtt *mtt, struct ib_umem *umem); + +struct ib_qp *hns_roce_create_qp(struct ib_pd *ib_pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +void *get_recv_wqe(struct hns_roce_qp *hr_qp, int n); +void *get_send_wqe(struct hns_roce_qp *hr_qp, int n); +void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n); +bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq, + struct ib_cq *ib_cq); +enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state); +void hns_roce_lock_cqs(struct hns_roce_cq *send_cq, + struct hns_roce_cq *recv_cq); +void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, + struct hns_roce_cq *recv_cq); +void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); +void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); +void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn, + int cnt); +__be32 send_ieth(struct ib_send_wr *wr); +int to_hr_qp_type(int qp_type); + 
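+/*
+ * Descriptive sketch of the CQ entry points declared below (bodies in
+ * hns_roce_cq.c): hns_roce_ib_create_cq() sizes the CQE buffer and then
+ * calls hns_roce_cq_alloc(), which reserves a CQN, fills the CQ context
+ * via hw->write_cqc() and posts an HNS_ROCE_CMD_SW2HW_CQ mailbox;
+ * hns_roce_ib_destroy_cq()/hns_roce_free_cq() reverse this with an
+ * HNS_ROCE_CMD_HW2SW_CQ mailbox and wait for outstanding CQ events
+ * before the CQN is released.
+ */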
+struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); + +int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq); +void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq); + +int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, + struct hns_roce_db *db); +void hns_roce_db_unmap_user(struct hns_roce_ucontext *context, + struct hns_roce_db *db); +int hns_roce_alloc_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db, + int order); +void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db); + +void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn); +void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type); +void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type); +int hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index); +int hns_roce_init(struct hns_roce_dev *hr_dev); +void hns_roce_exit(struct hns_roce_dev *hr_dev); + +#endif /* _HNS_ROCE_DEVICE_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c new file mode 100644 index 0000000..63b5b3e --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -0,0 +1,1052 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "hns_roce_device.h" +#include "hns_roce_hem.h" +#include "hns_roce_common.h" + +#define DMA_ADDR_T_SHIFT 12 +#define BT_BA_SHIFT 32 + +bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type) +{ + if ((hr_dev->caps.qpc_hop_num && type == HEM_TYPE_QPC) || + (hr_dev->caps.mpt_hop_num && type == HEM_TYPE_MTPT) || + (hr_dev->caps.cqc_hop_num && type == HEM_TYPE_CQC) || + (hr_dev->caps.srqc_hop_num && type == HEM_TYPE_SRQC) || + (hr_dev->caps.cqe_hop_num && type == HEM_TYPE_CQE) || + (hr_dev->caps.mtt_hop_num && type == HEM_TYPE_MTT)) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(hns_roce_check_whether_mhop); + +static bool hns_roce_check_hem_null(struct hns_roce_hem **hem, u64 start_idx, + u32 bt_chunk_num) +{ + int i; + + for (i = 0; i < bt_chunk_num; i++) + if (hem[start_idx + i]) + return false; + + return true; +} + +static bool hns_roce_check_bt_null(u64 **bt, u64 start_idx, u32 bt_chunk_num) +{ + int i; + + for (i = 0; i < bt_chunk_num; i++) + if (bt[start_idx + i]) + return false; + + return true; +} + +static int hns_roce_get_bt_num(u32 table_type, u32 hop_num) +{ + if (check_whether_bt_num_3(table_type, hop_num)) + return 3; + else if (check_whether_bt_num_2(table_type, hop_num)) + return 2; + else if (check_whether_bt_num_1(table_type, hop_num)) + return 1; + else + return 0; +} + +int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, unsigned long *obj, + struct hns_roce_hem_mhop *mhop) +{ + struct device *dev = hr_dev->dev; + u32 chunk_ba_num; + u32 table_idx; + u32 bt_num; + u32 chunk_size; + + switch (table->type) { + case HEM_TYPE_QPC: + mhop->buf_chunk_size = 1 << (hr_dev->caps.qpc_buf_pg_sz + + PAGE_SHIFT); + mhop->bt_chunk_size = 1 << (hr_dev->caps.qpc_ba_pg_sz + + PAGE_SHIFT); + mhop->ba_l0_num = hr_dev->caps.qpc_bt_num; + mhop->hop_num = hr_dev->caps.qpc_hop_num; + break; + case HEM_TYPE_MTPT: + mhop->buf_chunk_size = 1 << (hr_dev->caps.mpt_buf_pg_sz + + PAGE_SHIFT); + mhop->bt_chunk_size = 1 << (hr_dev->caps.mpt_ba_pg_sz + + PAGE_SHIFT); + mhop->ba_l0_num = hr_dev->caps.mpt_bt_num; + mhop->hop_num = hr_dev->caps.mpt_hop_num; + break; + case HEM_TYPE_CQC: + mhop->buf_chunk_size = 1 << (hr_dev->caps.cqc_buf_pg_sz + + PAGE_SHIFT); + mhop->bt_chunk_size = 1 << (hr_dev->caps.cqc_ba_pg_sz + + PAGE_SHIFT); + mhop->ba_l0_num = hr_dev->caps.cqc_bt_num; + mhop->hop_num = hr_dev->caps.cqc_hop_num; + break; + case HEM_TYPE_SRQC: + mhop->buf_chunk_size = 1 << (hr_dev->caps.srqc_buf_pg_sz + + PAGE_SHIFT); + mhop->bt_chunk_size = 1 << (hr_dev->caps.srqc_ba_pg_sz + + PAGE_SHIFT); + mhop->ba_l0_num = hr_dev->caps.srqc_bt_num; + mhop->hop_num = hr_dev->caps.srqc_hop_num; + break; + case HEM_TYPE_MTT: + mhop->buf_chunk_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + + PAGE_SHIFT); + mhop->bt_chunk_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + + PAGE_SHIFT); + mhop->ba_l0_num = mhop->bt_chunk_size / 8; + mhop->hop_num = hr_dev->caps.mtt_hop_num; + break; + case HEM_TYPE_CQE: + mhop->buf_chunk_size = 1 << (hr_dev->caps.cqe_buf_pg_sz + + PAGE_SHIFT); + mhop->bt_chunk_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + + PAGE_SHIFT); + mhop->ba_l0_num = mhop->bt_chunk_size / 8; + mhop->hop_num = hr_dev->caps.cqe_hop_num; + break; + default: + dev_err(dev, "Table %d not support multi-hop addressing!\n", + table->type); + return -EINVAL; + } + + if (!obj) + return 0; + + /* + * QPC/MTPT/CQC/SRQC alloc hem for buffer pages. + * MTT/CQE alloc hem for bt pages. 
+ */ + bt_num = hns_roce_get_bt_num(table->type, mhop->hop_num); + chunk_ba_num = mhop->bt_chunk_size / 8; + chunk_size = table->type < HEM_TYPE_MTT ? mhop->buf_chunk_size : + mhop->bt_chunk_size; + table_idx = (*obj & (table->num_obj - 1)) / + (chunk_size / table->obj_size); + switch (bt_num) { + case 3: + mhop->l2_idx = table_idx & (chunk_ba_num - 1); + mhop->l1_idx = table_idx / chunk_ba_num & (chunk_ba_num - 1); + mhop->l0_idx = table_idx / chunk_ba_num / chunk_ba_num; + break; + case 2: + mhop->l1_idx = table_idx & (chunk_ba_num - 1); + mhop->l0_idx = table_idx / chunk_ba_num; + break; + case 1: + mhop->l0_idx = table_idx; + break; + default: + dev_err(dev, "Table %d not support hop_num = %d!\n", + table->type, mhop->hop_num); + return -EINVAL; + } + if (mhop->l0_idx >= mhop->ba_l0_num) + mhop->l0_idx %= mhop->ba_l0_num; + + return 0; +} +EXPORT_SYMBOL_GPL(hns_roce_calc_hem_mhop); + +static struct hns_roce_hem *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev, + int npages, + unsigned long hem_alloc_size, + gfp_t gfp_mask) +{ + struct hns_roce_hem_chunk *chunk = NULL; + struct hns_roce_hem *hem; + struct scatterlist *mem; + int order; + void *buf; + + WARN_ON(gfp_mask & __GFP_HIGHMEM); + + hem = kmalloc(sizeof(*hem), + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!hem) + return NULL; + + hem->refcount = 0; + INIT_LIST_HEAD(&hem->chunk_list); + + order = get_order(hem_alloc_size); + + while (npages > 0) { + if (!chunk) { + chunk = kmalloc(sizeof(*chunk), + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!chunk) + goto fail; + + sg_init_table(chunk->mem, HNS_ROCE_HEM_CHUNK_LEN); + chunk->npages = 0; + chunk->nsg = 0; + memset(chunk->buf, 0, sizeof(chunk->buf)); + list_add_tail(&chunk->list, &hem->chunk_list); + } + + while (1 << order > npages) + --order; + + /* + * Alloc memory one time. If failed, don't alloc small block + * memory, directly return fail. 
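+ * (Each loop iteration grabs the largest power-of-two run of pages still
+ * needed: order starts from get_order(hem_alloc_size) and is trimmed until
+ * 1 << order <= npages, then PAGE_SIZE << order bytes are requested in a
+ * single dma_alloc_coherent() call.)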
+ */ + mem = &chunk->mem[chunk->npages]; + buf = dma_alloc_coherent(hr_dev->dev, PAGE_SIZE << order, + &sg_dma_address(mem), gfp_mask); + if (!buf) + goto fail; + + chunk->buf[chunk->npages] = buf; + sg_dma_len(mem) = PAGE_SIZE << order; + + ++chunk->npages; + ++chunk->nsg; + npages -= 1 << order; + } + + return hem; + +fail: + hns_roce_free_hem(hr_dev, hem); + return NULL; +} + +void hns_roce_free_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem *hem) +{ + struct hns_roce_hem_chunk *chunk, *tmp; + int i; + + if (!hem) + return; + + list_for_each_entry_safe(chunk, tmp, &hem->chunk_list, list) { + for (i = 0; i < chunk->npages; ++i) + dma_free_coherent(hr_dev->dev, + sg_dma_len(&chunk->mem[i]), + chunk->buf[i], + sg_dma_address(&chunk->mem[i])); + kfree(chunk); + } + + kfree(hem); +} + +static int hns_roce_set_hem(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, unsigned long obj) +{ + spinlock_t *lock = &hr_dev->bt_cmd_lock; + struct device *dev = hr_dev->dev; + unsigned long end = 0; + unsigned long flags; + struct hns_roce_hem_iter iter; + void __iomem *bt_cmd; + u32 bt_cmd_h_val = 0; + u32 bt_cmd_val[2]; + u32 bt_cmd_l = 0; + u64 bt_ba = 0; + int ret = 0; + + /* Find the HEM(Hardware Entry Memory) entry */ + unsigned long i = (obj & (table->num_obj - 1)) / + (table->table_chunk_size / table->obj_size); + + switch (table->type) { + case HEM_TYPE_QPC: + roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, HEM_TYPE_QPC); + break; + case HEM_TYPE_MTPT: + roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, + HEM_TYPE_MTPT); + break; + case HEM_TYPE_CQC: + roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, HEM_TYPE_CQC); + break; + case HEM_TYPE_SRQC: + roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, + HEM_TYPE_SRQC); + break; + default: + return ret; + } + roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_S, obj); + roce_set_bit(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_S, 0); + roce_set_bit(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_HW_SYNS_S, 1); + + /* Currently iter only a chunk */ + for (hns_roce_hem_first(table->hem[i], &iter); + !hns_roce_hem_last(&iter); hns_roce_hem_next(&iter)) { + bt_ba = hns_roce_hem_addr(&iter) >> DMA_ADDR_T_SHIFT; + + spin_lock_irqsave(lock, flags); + + bt_cmd = hr_dev->reg_base + ROCEE_BT_CMD_H_REG; + + end = msecs_to_jiffies(HW_SYNC_TIMEOUT_MSECS) + jiffies; + while (1) { + if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) { + if (!(time_before(jiffies, end))) { + dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n"); + spin_unlock_irqrestore(lock, flags); + return -EBUSY; + } + } else { + break; + } + msleep(HW_SYNC_SLEEP_TIME_INTERVAL); + } + + bt_cmd_l = (u32)bt_ba; + roce_set_field(bt_cmd_h_val, ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S, + bt_ba >> BT_BA_SHIFT); + + bt_cmd_val[0] = bt_cmd_l; + bt_cmd_val[1] = bt_cmd_h_val; + hns_roce_write64_k(bt_cmd_val, + hr_dev->reg_base + ROCEE_BT_CMD_L_REG); + spin_unlock_irqrestore(lock, flags); + } + + return ret; +} + +static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, + unsigned long obj) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_hem_mhop mhop; + struct hns_roce_hem_iter iter; + u32 buf_chunk_size; + u32 bt_chunk_size; + u32 chunk_ba_num; + u32 hop_num; + u32 
size; + u32 bt_num; + u64 hem_idx; + u64 bt_l1_idx = 0; + u64 bt_l0_idx = 0; + u64 bt_ba; + unsigned long mhop_obj = obj; + int bt_l1_allocated = 0; + int bt_l0_allocated = 0; + int step_idx; + int ret; + + ret = hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop); + if (ret) + return ret; + + buf_chunk_size = mhop.buf_chunk_size; + bt_chunk_size = mhop.bt_chunk_size; + hop_num = mhop.hop_num; + chunk_ba_num = bt_chunk_size / 8; + + bt_num = hns_roce_get_bt_num(table->type, hop_num); + switch (bt_num) { + case 3: + hem_idx = mhop.l0_idx * chunk_ba_num * chunk_ba_num + + mhop.l1_idx * chunk_ba_num + mhop.l2_idx; + bt_l1_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx; + bt_l0_idx = mhop.l0_idx; + break; + case 2: + hem_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx; + bt_l0_idx = mhop.l0_idx; + break; + case 1: + hem_idx = mhop.l0_idx; + break; + default: + dev_err(dev, "Table %d not support hop_num = %d!\n", + table->type, hop_num); + return -EINVAL; + } + + mutex_lock(&table->mutex); + + if (table->hem[hem_idx]) { + ++table->hem[hem_idx]->refcount; + goto out; + } + + /* alloc L1 BA's chunk */ + if ((check_whether_bt_num_3(table->type, hop_num) || + check_whether_bt_num_2(table->type, hop_num)) && + !table->bt_l0[bt_l0_idx]) { + table->bt_l0[bt_l0_idx] = dma_alloc_coherent(dev, bt_chunk_size, + &(table->bt_l0_dma_addr[bt_l0_idx]), + GFP_KERNEL); + if (!table->bt_l0[bt_l0_idx]) { + ret = -ENOMEM; + goto out; + } + bt_l0_allocated = 1; + + /* set base address to hardware */ + if (table->type < HEM_TYPE_MTT) { + step_idx = 0; + if (hr_dev->hw->set_hem(hr_dev, table, obj, step_idx)) { + ret = -ENODEV; + dev_err(dev, "set HEM base address to HW failed!\n"); + goto err_dma_alloc_l1; + } + } + } + + /* alloc L2 BA's chunk */ + if (check_whether_bt_num_3(table->type, hop_num) && + !table->bt_l1[bt_l1_idx]) { + table->bt_l1[bt_l1_idx] = dma_alloc_coherent(dev, bt_chunk_size, + &(table->bt_l1_dma_addr[bt_l1_idx]), + GFP_KERNEL); + if (!table->bt_l1[bt_l1_idx]) { + ret = -ENOMEM; + goto err_dma_alloc_l1; + } + bt_l1_allocated = 1; + *(table->bt_l0[bt_l0_idx] + mhop.l1_idx) = + table->bt_l1_dma_addr[bt_l1_idx]; + + /* set base address to hardware */ + step_idx = 1; + if (hr_dev->hw->set_hem(hr_dev, table, obj, step_idx)) { + ret = -ENODEV; + dev_err(dev, "set HEM base address to HW failed!\n"); + goto err_alloc_hem_buf; + } + } + + /* + * alloc buffer space chunk for QPC/MTPT/CQC/SRQC. + * alloc bt space chunk for MTT/CQE. + */ + size = table->type < HEM_TYPE_MTT ? buf_chunk_size : bt_chunk_size; + table->hem[hem_idx] = hns_roce_alloc_hem(hr_dev, + size >> PAGE_SHIFT, + size, + (table->lowmem ? 
GFP_KERNEL : + GFP_HIGHUSER) | __GFP_NOWARN); + if (!table->hem[hem_idx]) { + ret = -ENOMEM; + goto err_alloc_hem_buf; + } + + hns_roce_hem_first(table->hem[hem_idx], &iter); + bt_ba = hns_roce_hem_addr(&iter); + + if (table->type < HEM_TYPE_MTT) { + if (hop_num == 2) { + *(table->bt_l1[bt_l1_idx] + mhop.l2_idx) = bt_ba; + step_idx = 2; + } else if (hop_num == 1) { + *(table->bt_l0[bt_l0_idx] + mhop.l1_idx) = bt_ba; + step_idx = 1; + } else if (hop_num == HNS_ROCE_HOP_NUM_0) { + step_idx = 0; + } + + /* set HEM base address to hardware */ + if (hr_dev->hw->set_hem(hr_dev, table, obj, step_idx)) { + ret = -ENODEV; + dev_err(dev, "set HEM base address to HW failed!\n"); + goto err_alloc_hem_buf; + } + } else if (hop_num == 2) { + *(table->bt_l0[bt_l0_idx] + mhop.l1_idx) = bt_ba; + } + + ++table->hem[hem_idx]->refcount; + goto out; + +err_alloc_hem_buf: + if (bt_l1_allocated) { + dma_free_coherent(dev, bt_chunk_size, table->bt_l1[bt_l1_idx], + table->bt_l1_dma_addr[bt_l1_idx]); + table->bt_l1[bt_l1_idx] = NULL; + } + +err_dma_alloc_l1: + if (bt_l0_allocated) { + dma_free_coherent(dev, bt_chunk_size, table->bt_l0[bt_l0_idx], + table->bt_l0_dma_addr[bt_l0_idx]); + table->bt_l0[bt_l0_idx] = NULL; + } + +out: + mutex_unlock(&table->mutex); + return ret; +} + +int hns_roce_table_get(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, unsigned long obj) +{ + struct device *dev = hr_dev->dev; + int ret = 0; + unsigned long i; + + if (hns_roce_check_whether_mhop(hr_dev, table->type)) + return hns_roce_table_mhop_get(hr_dev, table, obj); + + i = (obj & (table->num_obj - 1)) / (table->table_chunk_size / + table->obj_size); + + mutex_lock(&table->mutex); + + if (table->hem[i]) { + ++table->hem[i]->refcount; + goto out; + } + + table->hem[i] = hns_roce_alloc_hem(hr_dev, + table->table_chunk_size >> PAGE_SHIFT, + table->table_chunk_size, + (table->lowmem ? 
GFP_KERNEL : + GFP_HIGHUSER) | __GFP_NOWARN); + if (!table->hem[i]) { + ret = -ENOMEM; + goto out; + } + + /* Set HEM base address(128K/page, pa) to Hardware */ + if (hns_roce_set_hem(hr_dev, table, obj)) { + hns_roce_free_hem(hr_dev, table->hem[i]); + table->hem[i] = NULL; + ret = -ENODEV; + dev_err(dev, "set HEM base address to HW failed.\n"); + goto out; + } + + ++table->hem[i]->refcount; +out: + mutex_unlock(&table->mutex); + return ret; +} + +static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, + unsigned long obj, + int check_refcount) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_hem_mhop mhop; + unsigned long mhop_obj = obj; + u32 bt_chunk_size; + u32 chunk_ba_num; + u32 hop_num; + u32 start_idx; + u32 bt_num; + u64 hem_idx; + u64 bt_l1_idx = 0; + int ret; + + ret = hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop); + if (ret) + return; + + bt_chunk_size = mhop.bt_chunk_size; + hop_num = mhop.hop_num; + chunk_ba_num = bt_chunk_size / 8; + + bt_num = hns_roce_get_bt_num(table->type, hop_num); + switch (bt_num) { + case 3: + hem_idx = mhop.l0_idx * chunk_ba_num * chunk_ba_num + + mhop.l1_idx * chunk_ba_num + mhop.l2_idx; + bt_l1_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx; + break; + case 2: + hem_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx; + break; + case 1: + hem_idx = mhop.l0_idx; + break; + default: + dev_err(dev, "Table %d not support hop_num = %d!\n", + table->type, hop_num); + return; + } + + mutex_lock(&table->mutex); + + if (check_refcount && (--table->hem[hem_idx]->refcount > 0)) { + mutex_unlock(&table->mutex); + return; + } + + if (table->type < HEM_TYPE_MTT && hop_num == 1) { + if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1)) + dev_warn(dev, "Clear HEM base address failed.\n"); + } else if (table->type < HEM_TYPE_MTT && hop_num == 2) { + if (hr_dev->hw->clear_hem(hr_dev, table, obj, 2)) + dev_warn(dev, "Clear HEM base address failed.\n"); + } else if (table->type < HEM_TYPE_MTT && + hop_num == HNS_ROCE_HOP_NUM_0) { + if (hr_dev->hw->clear_hem(hr_dev, table, obj, 0)) + dev_warn(dev, "Clear HEM base address failed.\n"); + } + + /* + * free buffer space chunk for QPC/MTPT/CQC/SRQC. + * free bt space chunk for MTT/CQE. 
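+ * (Once the leaf HEM chunk is released, the covering L1/L0 base address
+ * tables are freed only when every entry they track is NULL, which is what
+ * the hns_roce_check_hem_null()/hns_roce_check_bt_null() scans below verify
+ * before calling dma_free_coherent().)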
+ */ + hns_roce_free_hem(hr_dev, table->hem[hem_idx]); + table->hem[hem_idx] = NULL; + + if (check_whether_bt_num_2(table->type, hop_num)) { + start_idx = mhop.l0_idx * chunk_ba_num; + if (hns_roce_check_hem_null(table->hem, start_idx, + chunk_ba_num)) { + if (table->type < HEM_TYPE_MTT && + hr_dev->hw->clear_hem(hr_dev, table, obj, 0)) + dev_warn(dev, "Clear HEM base address failed.\n"); + + dma_free_coherent(dev, bt_chunk_size, + table->bt_l0[mhop.l0_idx], + table->bt_l0_dma_addr[mhop.l0_idx]); + table->bt_l0[mhop.l0_idx] = NULL; + } + } else if (check_whether_bt_num_3(table->type, hop_num)) { + start_idx = mhop.l0_idx * chunk_ba_num * chunk_ba_num + + mhop.l1_idx * chunk_ba_num; + if (hns_roce_check_hem_null(table->hem, start_idx, + chunk_ba_num)) { + if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1)) + dev_warn(dev, "Clear HEM base address failed.\n"); + + dma_free_coherent(dev, bt_chunk_size, + table->bt_l1[bt_l1_idx], + table->bt_l1_dma_addr[bt_l1_idx]); + table->bt_l1[bt_l1_idx] = NULL; + + start_idx = mhop.l0_idx * chunk_ba_num; + if (hns_roce_check_bt_null(table->bt_l1, start_idx, + chunk_ba_num)) { + if (hr_dev->hw->clear_hem(hr_dev, table, obj, + 0)) + dev_warn(dev, "Clear HEM base address failed.\n"); + + dma_free_coherent(dev, bt_chunk_size, + table->bt_l0[mhop.l0_idx], + table->bt_l0_dma_addr[mhop.l0_idx]); + table->bt_l0[mhop.l0_idx] = NULL; + } + } + } + + mutex_unlock(&table->mutex); +} + +void hns_roce_table_put(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, unsigned long obj) +{ + struct device *dev = hr_dev->dev; + unsigned long i; + + if (hns_roce_check_whether_mhop(hr_dev, table->type)) { + hns_roce_table_mhop_put(hr_dev, table, obj, 1); + return; + } + + i = (obj & (table->num_obj - 1)) / + (table->table_chunk_size / table->obj_size); + + mutex_lock(&table->mutex); + + if (--table->hem[i]->refcount == 0) { + /* Clear HEM base address */ + if (hr_dev->hw->clear_hem(hr_dev, table, obj, 0)) + dev_warn(dev, "Clear HEM base address failed.\n"); + + hns_roce_free_hem(hr_dev, table->hem[i]); + table->hem[i] = NULL; + } + + mutex_unlock(&table->mutex); +} + +void *hns_roce_table_find(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, + unsigned long obj, dma_addr_t *dma_handle) +{ + struct hns_roce_hem_chunk *chunk; + struct hns_roce_hem_mhop mhop; + struct hns_roce_hem *hem; + void *addr = NULL; + unsigned long mhop_obj = obj; + unsigned long obj_per_chunk; + unsigned long idx_offset; + int offset, dma_offset; + int length; + int i, j; + u32 hem_idx = 0; + + if (!table->lowmem) + return NULL; + + mutex_lock(&table->mutex); + + if (!hns_roce_check_whether_mhop(hr_dev, table->type)) { + obj_per_chunk = table->table_chunk_size / table->obj_size; + hem = table->hem[(obj & (table->num_obj - 1)) / obj_per_chunk]; + idx_offset = (obj & (table->num_obj - 1)) % obj_per_chunk; + dma_offset = offset = idx_offset * table->obj_size; + } else { + hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop); + /* mtt mhop */ + i = mhop.l0_idx; + j = mhop.l1_idx; + if (mhop.hop_num == 2) + hem_idx = i * (mhop.bt_chunk_size / 8) + j; + else if (mhop.hop_num == 1 || + mhop.hop_num == HNS_ROCE_HOP_NUM_0) + hem_idx = i; + + hem = table->hem[hem_idx]; + dma_offset = offset = (obj & (table->num_obj - 1)) * + table->obj_size % mhop.bt_chunk_size; + if (mhop.hop_num == 2) + dma_offset = offset = 0; + } + + if (!hem) + goto out; + + list_for_each_entry(chunk, &hem->chunk_list, list) { + for (i = 0; i < chunk->npages; ++i) { + length = sg_dma_len(&chunk->mem[i]); + if 
(dma_handle && dma_offset >= 0) { + if (length > (u32)dma_offset) + *dma_handle = sg_dma_address( + &chunk->mem[i]) + dma_offset; + dma_offset -= length; + } + + if (length > (u32)offset) { + addr = chunk->buf[i] + offset; + goto out; + } + offset -= length; + } + } + +out: + mutex_unlock(&table->mutex); + return addr; +} +EXPORT_SYMBOL_GPL(hns_roce_table_find); + +int hns_roce_table_get_range(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, + unsigned long start, unsigned long end) +{ + struct hns_roce_hem_mhop mhop; + unsigned long inc = table->table_chunk_size / table->obj_size; + unsigned long i; + int ret; + + if (hns_roce_check_whether_mhop(hr_dev, table->type)) { + hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop); + inc = mhop.bt_chunk_size / table->obj_size; + } + + /* Allocate MTT entry memory according to chunk(128K) */ + for (i = start; i <= end; i += inc) { + ret = hns_roce_table_get(hr_dev, table, i); + if (ret) + goto fail; + } + + return 0; + +fail: + while (i > start) { + i -= inc; + hns_roce_table_put(hr_dev, table, i); + } + return ret; +} + +void hns_roce_table_put_range(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, + unsigned long start, unsigned long end) +{ + struct hns_roce_hem_mhop mhop; + unsigned long inc = table->table_chunk_size / table->obj_size; + unsigned long i; + + if (hns_roce_check_whether_mhop(hr_dev, table->type)) { + hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop); + inc = mhop.bt_chunk_size / table->obj_size; + } + + for (i = start; i <= end; i += inc) + hns_roce_table_put(hr_dev, table, i); +} + +int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, u32 type, + unsigned long obj_size, unsigned long nobj, + int use_lowmem) +{ + struct device *dev = hr_dev->dev; + unsigned long obj_per_chunk; + unsigned long num_hem; + + if (!hns_roce_check_whether_mhop(hr_dev, type)) { + table->table_chunk_size = hr_dev->caps.chunk_sz; + obj_per_chunk = table->table_chunk_size / obj_size; + num_hem = (nobj + obj_per_chunk - 1) / obj_per_chunk; + + table->hem = kcalloc(num_hem, sizeof(*table->hem), GFP_KERNEL); + if (!table->hem) + return -ENOMEM; + } else { + unsigned long buf_chunk_size; + unsigned long bt_chunk_size; + unsigned long bt_chunk_num; + unsigned long num_bt_l0 = 0; + u32 hop_num; + + switch (type) { + case HEM_TYPE_QPC: + buf_chunk_size = 1 << (hr_dev->caps.qpc_buf_pg_sz + + PAGE_SHIFT); + bt_chunk_size = 1 << (hr_dev->caps.qpc_ba_pg_sz + + PAGE_SHIFT); + num_bt_l0 = hr_dev->caps.qpc_bt_num; + hop_num = hr_dev->caps.qpc_hop_num; + break; + case HEM_TYPE_MTPT: + buf_chunk_size = 1 << (hr_dev->caps.mpt_buf_pg_sz + + PAGE_SHIFT); + bt_chunk_size = 1 << (hr_dev->caps.mpt_ba_pg_sz + + PAGE_SHIFT); + num_bt_l0 = hr_dev->caps.mpt_bt_num; + hop_num = hr_dev->caps.mpt_hop_num; + break; + case HEM_TYPE_CQC: + buf_chunk_size = 1 << (hr_dev->caps.cqc_buf_pg_sz + + PAGE_SHIFT); + bt_chunk_size = 1 << (hr_dev->caps.cqc_ba_pg_sz + + PAGE_SHIFT); + num_bt_l0 = hr_dev->caps.cqc_bt_num; + hop_num = hr_dev->caps.cqc_hop_num; + break; + case HEM_TYPE_SRQC: + buf_chunk_size = 1 << (hr_dev->caps.srqc_buf_pg_sz + + PAGE_SHIFT); + bt_chunk_size = 1 << (hr_dev->caps.srqc_ba_pg_sz + + PAGE_SHIFT); + num_bt_l0 = hr_dev->caps.srqc_bt_num; + hop_num = hr_dev->caps.srqc_hop_num; + break; + case HEM_TYPE_MTT: + buf_chunk_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + + PAGE_SHIFT); + bt_chunk_size = buf_chunk_size; + hop_num = hr_dev->caps.mtt_hop_num; + break; + case HEM_TYPE_CQE: + buf_chunk_size = 1 << 
(hr_dev->caps.cqe_ba_pg_sz + + PAGE_SHIFT); + bt_chunk_size = buf_chunk_size; + hop_num = hr_dev->caps.cqe_hop_num; + break; + default: + dev_err(dev, + "Table %d not support to init hem table here!\n", + type); + return -EINVAL; + } + obj_per_chunk = buf_chunk_size / obj_size; + num_hem = (nobj + obj_per_chunk - 1) / obj_per_chunk; + bt_chunk_num = bt_chunk_size / 8; + if (type >= HEM_TYPE_MTT) + num_bt_l0 = bt_chunk_num; + + table->hem = kcalloc(num_hem, sizeof(*table->hem), + GFP_KERNEL); + if (!table->hem) + goto err_kcalloc_hem_buf; + + if (check_whether_bt_num_3(type, hop_num)) { + unsigned long num_bt_l1; + + num_bt_l1 = (num_hem + bt_chunk_num - 1) / + bt_chunk_num; + table->bt_l1 = kcalloc(num_bt_l1, + sizeof(*table->bt_l1), + GFP_KERNEL); + if (!table->bt_l1) + goto err_kcalloc_bt_l1; + + table->bt_l1_dma_addr = kcalloc(num_bt_l1, + sizeof(*table->bt_l1_dma_addr), + GFP_KERNEL); + + if (!table->bt_l1_dma_addr) + goto err_kcalloc_l1_dma; + } + + if (check_whether_bt_num_2(type, hop_num) || + check_whether_bt_num_3(type, hop_num)) { + table->bt_l0 = kcalloc(num_bt_l0, sizeof(*table->bt_l0), + GFP_KERNEL); + if (!table->bt_l0) + goto err_kcalloc_bt_l0; + + table->bt_l0_dma_addr = kcalloc(num_bt_l0, + sizeof(*table->bt_l0_dma_addr), + GFP_KERNEL); + if (!table->bt_l0_dma_addr) + goto err_kcalloc_l0_dma; + } + } + + table->type = type; + table->num_hem = num_hem; + table->num_obj = nobj; + table->obj_size = obj_size; + table->lowmem = use_lowmem; + mutex_init(&table->mutex); + + return 0; + +err_kcalloc_l0_dma: + kfree(table->bt_l0); + table->bt_l0 = NULL; + +err_kcalloc_bt_l0: + kfree(table->bt_l1_dma_addr); + table->bt_l1_dma_addr = NULL; + +err_kcalloc_l1_dma: + kfree(table->bt_l1); + table->bt_l1 = NULL; + +err_kcalloc_bt_l1: + kfree(table->hem); + table->hem = NULL; + +err_kcalloc_hem_buf: + return -ENOMEM; +} + +static void hns_roce_cleanup_mhop_hem_table(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table) +{ + struct hns_roce_hem_mhop mhop; + u32 buf_chunk_size; + int i; + u64 obj; + + hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop); + buf_chunk_size = table->type < HEM_TYPE_MTT ? 
mhop.buf_chunk_size : + mhop.bt_chunk_size; + + for (i = 0; i < table->num_hem; ++i) { + obj = i * buf_chunk_size / table->obj_size; + if (table->hem[i]) + hns_roce_table_mhop_put(hr_dev, table, obj, 0); + } + + kfree(table->hem); + table->hem = NULL; + kfree(table->bt_l1); + table->bt_l1 = NULL; + kfree(table->bt_l1_dma_addr); + table->bt_l1_dma_addr = NULL; + kfree(table->bt_l0); + table->bt_l0 = NULL; + kfree(table->bt_l0_dma_addr); + table->bt_l0_dma_addr = NULL; +} + +void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table) +{ + struct device *dev = hr_dev->dev; + unsigned long i; + + if (hns_roce_check_whether_mhop(hr_dev, table->type)) { + hns_roce_cleanup_mhop_hem_table(hr_dev, table); + return; + } + + for (i = 0; i < table->num_hem; ++i) + if (table->hem[i]) { + if (hr_dev->hw->clear_hem(hr_dev, table, + i * table->table_chunk_size / table->obj_size, 0)) + dev_err(dev, "Clear HEM base address failed.\n"); + + hns_roce_free_hem(hr_dev, table->hem[i]); + } + + kfree(table->hem); +} + +void hns_roce_cleanup_hem(struct hns_roce_dev *hr_dev) +{ + hns_roce_cleanup_hem_table(hr_dev, &hr_dev->cq_table.table); + if (hr_dev->caps.trrl_entry_sz) + hns_roce_cleanup_hem_table(hr_dev, + &hr_dev->qp_table.trrl_table); + hns_roce_cleanup_hem_table(hr_dev, &hr_dev->qp_table.irrl_table); + hns_roce_cleanup_hem_table(hr_dev, &hr_dev->qp_table.qp_table); + hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtpt_table); + if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) + hns_roce_cleanup_hem_table(hr_dev, + &hr_dev->mr_table.mtt_cqe_table); + hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtt_table); +} diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h new file mode 100644 index 0000000..e8850d5 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _HNS_ROCE_HEM_H +#define _HNS_ROCE_HEM_H + +#define HW_SYNC_TIMEOUT_MSECS 500 +#define HW_SYNC_SLEEP_TIME_INTERVAL 20 +#define BT_CMD_SYNC_SHIFT 31 + +enum { + /* MAP HEM(Hardware Entry Memory) */ + HEM_TYPE_QPC = 0, + HEM_TYPE_MTPT, + HEM_TYPE_CQC, + HEM_TYPE_SRQC, + + /* UNMAP HEM */ + HEM_TYPE_MTT, + HEM_TYPE_CQE, + HEM_TYPE_IRRL, + HEM_TYPE_TRRL, +}; + +#define HNS_ROCE_HEM_CHUNK_LEN \ + ((256 - sizeof(struct list_head) - 2 * sizeof(int)) / \ + (sizeof(struct scatterlist))) + +#define check_whether_bt_num_3(type, hop_num) \ + (type < HEM_TYPE_MTT && hop_num == 2) + +#define check_whether_bt_num_2(type, hop_num) \ + ((type < HEM_TYPE_MTT && hop_num == 1) || \ + (type >= HEM_TYPE_MTT && hop_num == 2)) + +#define check_whether_bt_num_1(type, hop_num) \ + ((type < HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0) || \ + (type >= HEM_TYPE_MTT && hop_num == 1) || \ + (type >= HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0)) + +enum { + HNS_ROCE_HEM_PAGE_SHIFT = 12, + HNS_ROCE_HEM_PAGE_SIZE = 1 << HNS_ROCE_HEM_PAGE_SHIFT, +}; + +struct hns_roce_hem_chunk { + struct list_head list; + int npages; + int nsg; + struct scatterlist mem[HNS_ROCE_HEM_CHUNK_LEN]; + void *buf[HNS_ROCE_HEM_CHUNK_LEN]; +}; + +struct hns_roce_hem { + struct list_head chunk_list; + int refcount; +}; + +struct hns_roce_hem_iter { + struct hns_roce_hem *hem; + struct hns_roce_hem_chunk *chunk; + int page_idx; +}; + +struct hns_roce_hem_mhop { + u32 hop_num; + u32 buf_chunk_size; + u32 bt_chunk_size; + u32 ba_l0_num; + u32 l0_idx;/* level 0 base address table index */ + u32 l1_idx;/* level 1 base address table index */ + u32 l2_idx;/* level 2 base address table index */ +}; + +void hns_roce_free_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem *hem); +int hns_roce_table_get(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, unsigned long obj); +void hns_roce_table_put(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, unsigned long obj); +void *hns_roce_table_find(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, unsigned long obj, + dma_addr_t *dma_handle); +int hns_roce_table_get_range(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, + unsigned long start, unsigned long end); +void hns_roce_table_put_range(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, + unsigned long start, unsigned long end); +int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, u32 type, + unsigned long obj_size, unsigned long nobj, + int use_lowmem); +void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table); +void hns_roce_cleanup_hem(struct hns_roce_dev *hr_dev); +int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, unsigned long *obj, + struct hns_roce_hem_mhop *mhop); +bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type); + +static inline void hns_roce_hem_first(struct hns_roce_hem *hem, + struct hns_roce_hem_iter *iter) +{ + iter->hem = hem; + iter->chunk = list_empty(&hem->chunk_list) ? 
NULL : + list_entry(hem->chunk_list.next, + struct hns_roce_hem_chunk, list); + iter->page_idx = 0; +} + +static inline int hns_roce_hem_last(struct hns_roce_hem_iter *iter) +{ + return !iter->chunk; +} + +static inline void hns_roce_hem_next(struct hns_roce_hem_iter *iter) +{ + if (++iter->page_idx >= iter->chunk->nsg) { + if (iter->chunk->list.next == &iter->hem->chunk_list) { + iter->chunk = NULL; + return; + } + + iter->chunk = list_entry(iter->chunk->list.next, + struct hns_roce_hem_chunk, list); + iter->page_idx = 0; + } +} + +static inline dma_addr_t hns_roce_hem_addr(struct hns_roce_hem_iter *iter) +{ + return sg_dma_address(&iter->chunk->mem[iter->page_idx]); +} + +#endif /*_HNS_ROCE_HEM_H*/ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c new file mode 100644 index 0000000..8013d69 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -0,0 +1,4989 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "hns_roce_common.h" +#include "hns_roce_device.h" +#include "hns_roce_cmd.h" +#include "hns_roce_hem.h" +#include "hns_roce_hw_v1.h" + +static void set_data_seg(struct hns_roce_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->lkey = cpu_to_le32(sg->lkey); + dseg->addr = cpu_to_le64(sg->addr); + dseg->len = cpu_to_le32(sg->length); +} + +static void set_raddr_seg(struct hns_roce_wqe_raddr_seg *rseg, u64 remote_addr, + u32 rkey) +{ + rseg->raddr = cpu_to_le64(remote_addr); + rseg->rkey = cpu_to_le32(rkey); + rseg->len = 0; +} + +static int hns_roce_v1_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah); + struct hns_roce_ud_send_wqe *ud_sq_wqe = NULL; + struct hns_roce_wqe_ctrl_seg *ctrl = NULL; + struct hns_roce_wqe_data_seg *dseg = NULL; + struct hns_roce_qp *qp = to_hr_qp(ibqp); + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_sq_db sq_db; + int ps_opcode = 0, i = 0; + unsigned long flags = 0; + void *wqe = NULL; + u32 doorbell[2]; + int nreq = 0; + u32 ind = 0; + int ret = 0; + u8 *smac; + int loopback; + + if (unlikely(ibqp->qp_type != IB_QPT_GSI && + ibqp->qp_type != IB_QPT_RC)) { + dev_err(dev, "un-supported QP type\n"); + *bad_wr = NULL; + return -EOPNOTSUPP; + } + + spin_lock_irqsave(&qp->sq.lock, flags); + ind = qp->sq_next_wqe; + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (hns_roce_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + ret = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + dev_err(dev, "num_sge=%d > qp->sq.max_gs=%d\n", + wr->num_sge, qp->sq.max_gs); + ret = -EINVAL; + *bad_wr = wr; + goto out; + } + + wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = + wr->wr_id; + + /* Corresponding to the RC and RD type wqe process separately */ + if (ibqp->qp_type == IB_QPT_GSI) { + ud_sq_wqe = wqe; + roce_set_field(ud_sq_wqe->dmac_h, + UD_SEND_WQE_U32_4_DMAC_0_M, + UD_SEND_WQE_U32_4_DMAC_0_S, + ah->av.mac[0]); + roce_set_field(ud_sq_wqe->dmac_h, + UD_SEND_WQE_U32_4_DMAC_1_M, + UD_SEND_WQE_U32_4_DMAC_1_S, + ah->av.mac[1]); + roce_set_field(ud_sq_wqe->dmac_h, + UD_SEND_WQE_U32_4_DMAC_2_M, + UD_SEND_WQE_U32_4_DMAC_2_S, + ah->av.mac[2]); + roce_set_field(ud_sq_wqe->dmac_h, + UD_SEND_WQE_U32_4_DMAC_3_M, + UD_SEND_WQE_U32_4_DMAC_3_S, + ah->av.mac[3]); + + roce_set_field(ud_sq_wqe->u32_8, + UD_SEND_WQE_U32_8_DMAC_4_M, + UD_SEND_WQE_U32_8_DMAC_4_S, + ah->av.mac[4]); + roce_set_field(ud_sq_wqe->u32_8, + UD_SEND_WQE_U32_8_DMAC_5_M, + UD_SEND_WQE_U32_8_DMAC_5_S, + ah->av.mac[5]); + + smac = (u8 *)hr_dev->dev_addr[qp->port]; + loopback = ether_addr_equal_unaligned(ah->av.mac, + smac) ? 1 : 0; + roce_set_bit(ud_sq_wqe->u32_8, + UD_SEND_WQE_U32_8_LOOPBACK_INDICATOR_S, + loopback); + + roce_set_field(ud_sq_wqe->u32_8, + UD_SEND_WQE_U32_8_OPERATION_TYPE_M, + UD_SEND_WQE_U32_8_OPERATION_TYPE_S, + HNS_ROCE_WQE_OPCODE_SEND); + roce_set_field(ud_sq_wqe->u32_8, + UD_SEND_WQE_U32_8_NUMBER_OF_DATA_SEG_M, + UD_SEND_WQE_U32_8_NUMBER_OF_DATA_SEG_S, + 2); + roce_set_bit(ud_sq_wqe->u32_8, + UD_SEND_WQE_U32_8_SEND_GL_ROUTING_HDR_FLAG_S, + 1); + + ud_sq_wqe->u32_8 |= (wr->send_flags & IB_SEND_SIGNALED ? + cpu_to_le32(HNS_ROCE_WQE_CQ_NOTIFY) : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? 
+ cpu_to_le32(HNS_ROCE_WQE_SE) : 0) | + ((wr->opcode == IB_WR_SEND_WITH_IMM) ? + cpu_to_le32(HNS_ROCE_WQE_IMM) : 0); + + roce_set_field(ud_sq_wqe->u32_16, + UD_SEND_WQE_U32_16_DEST_QP_M, + UD_SEND_WQE_U32_16_DEST_QP_S, + ud_wr(wr)->remote_qpn); + roce_set_field(ud_sq_wqe->u32_16, + UD_SEND_WQE_U32_16_MAX_STATIC_RATE_M, + UD_SEND_WQE_U32_16_MAX_STATIC_RATE_S, + ah->av.stat_rate); + + roce_set_field(ud_sq_wqe->u32_36, + UD_SEND_WQE_U32_36_FLOW_LABEL_M, + UD_SEND_WQE_U32_36_FLOW_LABEL_S, 0); + roce_set_field(ud_sq_wqe->u32_36, + UD_SEND_WQE_U32_36_PRIORITY_M, + UD_SEND_WQE_U32_36_PRIORITY_S, + ah->av.sl_tclass_flowlabel >> + HNS_ROCE_SL_SHIFT); + roce_set_field(ud_sq_wqe->u32_36, + UD_SEND_WQE_U32_36_SGID_INDEX_M, + UD_SEND_WQE_U32_36_SGID_INDEX_S, + hns_get_gid_index(hr_dev, qp->phy_port, + ah->av.gid_index)); + + roce_set_field(ud_sq_wqe->u32_40, + UD_SEND_WQE_U32_40_HOP_LIMIT_M, + UD_SEND_WQE_U32_40_HOP_LIMIT_S, + ah->av.hop_limit); + roce_set_field(ud_sq_wqe->u32_40, + UD_SEND_WQE_U32_40_TRAFFIC_CLASS_M, + UD_SEND_WQE_U32_40_TRAFFIC_CLASS_S, 0); + + memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0], GID_LEN); + + ud_sq_wqe->va0_l = + cpu_to_le32((u32)wr->sg_list[0].addr); + ud_sq_wqe->va0_h = + cpu_to_le32((wr->sg_list[0].addr) >> 32); + ud_sq_wqe->l_key0 = + cpu_to_le32(wr->sg_list[0].lkey); + + ud_sq_wqe->va1_l = + cpu_to_le32((u32)wr->sg_list[1].addr); + ud_sq_wqe->va1_h = + cpu_to_le32((wr->sg_list[1].addr) >> 32); + ud_sq_wqe->l_key1 = + cpu_to_le32(wr->sg_list[1].lkey); + ind++; + } else if (ibqp->qp_type == IB_QPT_RC) { + u32 tmp_len = 0; + + ctrl = wqe; + memset(ctrl, 0, sizeof(struct hns_roce_wqe_ctrl_seg)); + for (i = 0; i < wr->num_sge; i++) + tmp_len += wr->sg_list[i].length; + + ctrl->msg_length = + cpu_to_le32(le32_to_cpu(ctrl->msg_length) + tmp_len); + + ctrl->sgl_pa_h = 0; + ctrl->flag = 0; + + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + ctrl->imm_data = wr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + ctrl->inv_key = + cpu_to_le32(wr->ex.invalidate_rkey); + break; + default: + ctrl->imm_data = 0; + break; + } + + /*Ctrl field, ctrl set type: sig, solic, imm, fence */ + /* SO wait for conforming application scenarios */ + ctrl->flag |= (wr->send_flags & IB_SEND_SIGNALED ? + cpu_to_le32(HNS_ROCE_WQE_CQ_NOTIFY) : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + cpu_to_le32(HNS_ROCE_WQE_SE) : 0) | + ((wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) ? + cpu_to_le32(HNS_ROCE_WQE_IMM) : 0) | + (wr->send_flags & IB_SEND_FENCE ? 
+ (cpu_to_le32(HNS_ROCE_WQE_FENCE)) : 0); + + wqe += sizeof(struct hns_roce_wqe_ctrl_seg); + + switch (wr->opcode) { + case IB_WR_RDMA_READ: + ps_opcode = HNS_ROCE_WQE_OPCODE_RDMA_READ; + set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ps_opcode = HNS_ROCE_WQE_OPCODE_RDMA_WRITE; + set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + break; + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + case IB_WR_SEND_WITH_IMM: + ps_opcode = HNS_ROCE_WQE_OPCODE_SEND; + break; + case IB_WR_LOCAL_INV: + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_LSO: + default: + ps_opcode = HNS_ROCE_WQE_OPCODE_MASK; + break; + } + ctrl->flag |= cpu_to_le32(ps_opcode); + wqe += sizeof(struct hns_roce_wqe_raddr_seg); + + dseg = wqe; + if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) { + if (le32_to_cpu(ctrl->msg_length) > + hr_dev->caps.max_sq_inline) { + ret = -EINVAL; + *bad_wr = wr; + dev_err(dev, "inline len(1-%d)=%d, illegal", + ctrl->msg_length, + hr_dev->caps.max_sq_inline); + goto out; + } + for (i = 0; i < wr->num_sge; i++) { + memcpy(wqe, ((void *) (uintptr_t) + wr->sg_list[i].addr), + wr->sg_list[i].length); + wqe += wr->sg_list[i].length; + } + ctrl->flag |= cpu_to_le32(HNS_ROCE_WQE_INLINE); + } else { + /*sqe num is two */ + for (i = 0; i < wr->num_sge; i++) + set_data_seg(dseg + i, wr->sg_list + i); + + ctrl->flag |= cpu_to_le32(wr->num_sge << + HNS_ROCE_WQE_SGE_NUM_BIT); + } + ind++; + } + } + +out: + /* Set DB return */ + if (likely(nreq)) { + qp->sq.head += nreq; + /* Memory barrier */ + wmb(); + + sq_db.u32_4 = 0; + sq_db.u32_8 = 0; + roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_SQ_HEAD_M, + SQ_DOORBELL_U32_4_SQ_HEAD_S, + (qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1))); + roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_SL_M, + SQ_DOORBELL_U32_4_SL_S, qp->sl); + roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_PORT_M, + SQ_DOORBELL_U32_4_PORT_S, qp->phy_port); + roce_set_field(sq_db.u32_8, SQ_DOORBELL_U32_8_QPN_M, + SQ_DOORBELL_U32_8_QPN_S, qp->doorbell_qpn); + roce_set_bit(sq_db.u32_8, SQ_DOORBELL_HW_SYNC_S, 1); + + doorbell[0] = le32_to_cpu(sq_db.u32_4); + doorbell[1] = le32_to_cpu(sq_db.u32_8); + + hns_roce_write64_k(doorbell, qp->sq.db_reg_l); + qp->sq_next_wqe = ind; + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return ret; +} + +static int hns_roce_v1_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int ret = 0; + int nreq = 0; + int ind = 0; + int i = 0; + u32 reg_val = 0; + unsigned long flags = 0; + struct hns_roce_rq_wqe_ctrl *ctrl = NULL; + struct hns_roce_wqe_data_seg *scat = NULL; + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_rq_db rq_db; + uint32_t doorbell[2] = {0}; + + spin_lock_irqsave(&hr_qp->rq.lock, flags); + ind = hr_qp->rq.head & (hr_qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (hns_roce_wq_overflow(&hr_qp->rq, nreq, + hr_qp->ibqp.recv_cq)) { + ret = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > hr_qp->rq.max_gs)) { + dev_err(dev, "rq:num_sge=%d > qp->sq.max_gs=%d\n", + wr->num_sge, hr_qp->rq.max_gs); + ret = -EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl = get_recv_wqe(hr_qp, ind); + + roce_set_field(ctrl->rwqe_byte_12, + RQ_WQE_CTRL_RWQE_BYTE_12_RWQE_SGE_NUM_M, + RQ_WQE_CTRL_RWQE_BYTE_12_RWQE_SGE_NUM_S, + wr->num_sge); 
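+ /*
+ * The receive WQE built here is a hns_roce_rq_wqe_ctrl header that records
+ * the SGE count, followed immediately by wr->num_sge data segments
+ * (lkey/addr/len written by set_data_seg()), which is why scat points at
+ * (ctrl + 1) just below.
+ */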
+ + scat = (struct hns_roce_wqe_data_seg *)(ctrl + 1); + + for (i = 0; i < wr->num_sge; i++) + set_data_seg(scat + i, wr->sg_list + i); + + hr_qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (hr_qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + hr_qp->rq.head += nreq; + /* Memory barrier */ + wmb(); + + if (ibqp->qp_type == IB_QPT_GSI) { + /* SW update GSI rq header */ + reg_val = roce_read(to_hr_dev(ibqp->device), + ROCEE_QP1C_CFG3_0_REG + + QP1C_CFGN_OFFSET * hr_qp->phy_port); + roce_set_field(reg_val, + ROCEE_QP1C_CFG3_0_ROCEE_QP1C_RQ_HEAD_M, + ROCEE_QP1C_CFG3_0_ROCEE_QP1C_RQ_HEAD_S, + hr_qp->rq.head); + roce_write(to_hr_dev(ibqp->device), + ROCEE_QP1C_CFG3_0_REG + + QP1C_CFGN_OFFSET * hr_qp->phy_port, reg_val); + } else { + rq_db.u32_4 = 0; + rq_db.u32_8 = 0; + + roce_set_field(rq_db.u32_4, RQ_DOORBELL_U32_4_RQ_HEAD_M, + RQ_DOORBELL_U32_4_RQ_HEAD_S, + hr_qp->rq.head); + roce_set_field(rq_db.u32_8, RQ_DOORBELL_U32_8_QPN_M, + RQ_DOORBELL_U32_8_QPN_S, hr_qp->qpn); + roce_set_field(rq_db.u32_8, RQ_DOORBELL_U32_8_CMD_M, + RQ_DOORBELL_U32_8_CMD_S, 1); + roce_set_bit(rq_db.u32_8, RQ_DOORBELL_U32_8_HW_SYNC_S, + 1); + + doorbell[0] = le32_to_cpu(rq_db.u32_4); + doorbell[1] = le32_to_cpu(rq_db.u32_8); + + hns_roce_write64_k(doorbell, hr_qp->rq.db_reg_l); + } + } + spin_unlock_irqrestore(&hr_qp->rq.lock, flags); + + return ret; +} + +static void hns_roce_set_db_event_mode(struct hns_roce_dev *hr_dev, + int sdb_mode, int odb_mode) +{ + u32 val; + + val = roce_read(hr_dev, ROCEE_GLB_CFG_REG); + roce_set_bit(val, ROCEE_GLB_CFG_ROCEE_DB_SQ_MODE_S, sdb_mode); + roce_set_bit(val, ROCEE_GLB_CFG_ROCEE_DB_OTH_MODE_S, odb_mode); + roce_write(hr_dev, ROCEE_GLB_CFG_REG, val); +} + +static void hns_roce_set_db_ext_mode(struct hns_roce_dev *hr_dev, u32 sdb_mode, + u32 odb_mode) +{ + u32 val; + + /* Configure SDB/ODB extend mode */ + val = roce_read(hr_dev, ROCEE_GLB_CFG_REG); + roce_set_bit(val, ROCEE_GLB_CFG_SQ_EXT_DB_MODE_S, sdb_mode); + roce_set_bit(val, ROCEE_GLB_CFG_OTH_EXT_DB_MODE_S, odb_mode); + roce_write(hr_dev, ROCEE_GLB_CFG_REG, val); +} + +static void hns_roce_set_sdb(struct hns_roce_dev *hr_dev, u32 sdb_alept, + u32 sdb_alful) +{ + u32 val; + + /* Configure SDB */ + val = roce_read(hr_dev, ROCEE_DB_SQ_WL_REG); + roce_set_field(val, ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_M, + ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_S, sdb_alful); + roce_set_field(val, ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_EMPTY_M, + ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_EMPTY_S, sdb_alept); + roce_write(hr_dev, ROCEE_DB_SQ_WL_REG, val); +} + +static void hns_roce_set_odb(struct hns_roce_dev *hr_dev, u32 odb_alept, + u32 odb_alful) +{ + u32 val; + + /* Configure ODB */ + val = roce_read(hr_dev, ROCEE_DB_OTHERS_WL_REG); + roce_set_field(val, ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_M, + ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_S, odb_alful); + roce_set_field(val, ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_EMPTY_M, + ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_EMPTY_S, odb_alept); + roce_write(hr_dev, ROCEE_DB_OTHERS_WL_REG, val); +} + +static void hns_roce_set_sdb_ext(struct hns_roce_dev *hr_dev, u32 ext_sdb_alept, + u32 ext_sdb_alful) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_v1_priv *priv; + struct hns_roce_db_table *db; + dma_addr_t sdb_dma_addr; + u32 val; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + db = &priv->db_table; + + /* Configure extend SDB threshold */ + roce_write(hr_dev, ROCEE_EXT_DB_SQ_WL_EMPTY_REG, ext_sdb_alept); + roce_write(hr_dev, ROCEE_EXT_DB_SQ_WL_REG, ext_sdb_alful); + + /* Configure extend SDB base addr */ + sdb_dma_addr = 
db->ext_db->sdb_buf_list->map; + roce_write(hr_dev, ROCEE_EXT_DB_SQ_REG, (u32)(sdb_dma_addr >> 12)); + + /* Configure extend SDB depth */ + val = roce_read(hr_dev, ROCEE_EXT_DB_SQ_H_REG); + roce_set_field(val, ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_SHIFT_M, + ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_SHIFT_S, + db->ext_db->esdb_dep); + /* + * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of + * using 4K page, and shift more 32 because of + * caculating the high 32 bit value evaluated to hardware. + */ + roce_set_field(val, ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_BA_H_M, + ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_BA_H_S, sdb_dma_addr >> 44); + roce_write(hr_dev, ROCEE_EXT_DB_SQ_H_REG, val); + + dev_dbg(dev, "ext SDB depth: 0x%x\n", db->ext_db->esdb_dep); + dev_dbg(dev, "ext SDB threshold: epmty: 0x%x, ful: 0x%x\n", + ext_sdb_alept, ext_sdb_alful); +} + +static void hns_roce_set_odb_ext(struct hns_roce_dev *hr_dev, u32 ext_odb_alept, + u32 ext_odb_alful) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_v1_priv *priv; + struct hns_roce_db_table *db; + dma_addr_t odb_dma_addr; + u32 val; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + db = &priv->db_table; + + /* Configure extend ODB threshold */ + roce_write(hr_dev, ROCEE_EXT_DB_OTHERS_WL_EMPTY_REG, ext_odb_alept); + roce_write(hr_dev, ROCEE_EXT_DB_OTHERS_WL_REG, ext_odb_alful); + + /* Configure extend ODB base addr */ + odb_dma_addr = db->ext_db->odb_buf_list->map; + roce_write(hr_dev, ROCEE_EXT_DB_OTH_REG, (u32)(odb_dma_addr >> 12)); + + /* Configure extend ODB depth */ + val = roce_read(hr_dev, ROCEE_EXT_DB_OTH_H_REG); + roce_set_field(val, ROCEE_EXT_DB_OTH_H_EXT_DB_OTH_SHIFT_M, + ROCEE_EXT_DB_OTH_H_EXT_DB_OTH_SHIFT_S, + db->ext_db->eodb_dep); + roce_set_field(val, ROCEE_EXT_DB_SQ_H_EXT_DB_OTH_BA_H_M, + ROCEE_EXT_DB_SQ_H_EXT_DB_OTH_BA_H_S, + db->ext_db->eodb_dep); + roce_write(hr_dev, ROCEE_EXT_DB_OTH_H_REG, val); + + dev_dbg(dev, "ext ODB depth: 0x%x\n", db->ext_db->eodb_dep); + dev_dbg(dev, "ext ODB threshold: empty: 0x%x, ful: 0x%x\n", + ext_odb_alept, ext_odb_alful); +} + +static int hns_roce_db_ext_init(struct hns_roce_dev *hr_dev, u32 sdb_ext_mod, + u32 odb_ext_mod) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_v1_priv *priv; + struct hns_roce_db_table *db; + dma_addr_t sdb_dma_addr; + dma_addr_t odb_dma_addr; + int ret = 0; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + db = &priv->db_table; + + db->ext_db = kmalloc(sizeof(*db->ext_db), GFP_KERNEL); + if (!db->ext_db) + return -ENOMEM; + + if (sdb_ext_mod) { + db->ext_db->sdb_buf_list = kmalloc( + sizeof(*db->ext_db->sdb_buf_list), GFP_KERNEL); + if (!db->ext_db->sdb_buf_list) { + ret = -ENOMEM; + goto ext_sdb_buf_fail_out; + } + + db->ext_db->sdb_buf_list->buf = dma_alloc_coherent(dev, + HNS_ROCE_V1_EXT_SDB_SIZE, + &sdb_dma_addr, GFP_KERNEL); + if (!db->ext_db->sdb_buf_list->buf) { + ret = -ENOMEM; + goto alloc_sq_db_buf_fail; + } + db->ext_db->sdb_buf_list->map = sdb_dma_addr; + + db->ext_db->esdb_dep = ilog2(HNS_ROCE_V1_EXT_SDB_DEPTH); + hns_roce_set_sdb_ext(hr_dev, HNS_ROCE_V1_EXT_SDB_ALEPT, + HNS_ROCE_V1_EXT_SDB_ALFUL); + } else + hns_roce_set_sdb(hr_dev, HNS_ROCE_V1_SDB_ALEPT, + HNS_ROCE_V1_SDB_ALFUL); + + if (odb_ext_mod) { + db->ext_db->odb_buf_list = kmalloc( + sizeof(*db->ext_db->odb_buf_list), GFP_KERNEL); + if (!db->ext_db->odb_buf_list) { + ret = -ENOMEM; + goto ext_odb_buf_fail_out; + } + + db->ext_db->odb_buf_list->buf = dma_alloc_coherent(dev, + HNS_ROCE_V1_EXT_ODB_SIZE, + &odb_dma_addr, GFP_KERNEL); + if (!db->ext_db->odb_buf_list->buf) { + ret = 
-ENOMEM; + goto alloc_otr_db_buf_fail; + } + db->ext_db->odb_buf_list->map = odb_dma_addr; + + db->ext_db->eodb_dep = ilog2(HNS_ROCE_V1_EXT_ODB_DEPTH); + hns_roce_set_odb_ext(hr_dev, HNS_ROCE_V1_EXT_ODB_ALEPT, + HNS_ROCE_V1_EXT_ODB_ALFUL); + } else + hns_roce_set_odb(hr_dev, HNS_ROCE_V1_ODB_ALEPT, + HNS_ROCE_V1_ODB_ALFUL); + + hns_roce_set_db_ext_mode(hr_dev, sdb_ext_mod, odb_ext_mod); + + return 0; + +alloc_otr_db_buf_fail: + kfree(db->ext_db->odb_buf_list); + +ext_odb_buf_fail_out: + if (sdb_ext_mod) { + dma_free_coherent(dev, HNS_ROCE_V1_EXT_SDB_SIZE, + db->ext_db->sdb_buf_list->buf, + db->ext_db->sdb_buf_list->map); + } + +alloc_sq_db_buf_fail: + if (sdb_ext_mod) + kfree(db->ext_db->sdb_buf_list); + +ext_sdb_buf_fail_out: + kfree(db->ext_db); + return ret; +} + +static struct hns_roce_qp *hns_roce_v1_create_lp_qp(struct hns_roce_dev *hr_dev, + struct ib_pd *pd) +{ + struct device *dev = &hr_dev->pdev->dev; + struct ib_qp_init_attr init_attr; + struct ib_qp *qp; + + memset(&init_attr, 0, sizeof(struct ib_qp_init_attr)); + init_attr.qp_type = IB_QPT_RC; + init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr.cap.max_recv_wr = HNS_ROCE_MIN_WQE_NUM; + init_attr.cap.max_send_wr = HNS_ROCE_MIN_WQE_NUM; + + qp = hns_roce_create_qp(pd, &init_attr, NULL); + if (IS_ERR(qp)) { + dev_err(dev, "Create loop qp for mr free failed!"); + return NULL; + } + + return to_hr_qp(qp); +} + +static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_caps *caps = &hr_dev->caps; + struct device *dev = &hr_dev->pdev->dev; + struct ib_cq_init_attr cq_init_attr; + struct hns_roce_free_mr *free_mr; + struct ib_qp_attr attr = { 0 }; + struct hns_roce_v1_priv *priv; + struct hns_roce_qp *hr_qp; + struct ib_cq *cq; + struct ib_pd *pd; + union ib_gid dgid; + u64 subnet_prefix; + int attr_mask = 0; + int i, j; + int ret; + u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 }; + u8 phy_port; + u8 port = 0; + u8 sl; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + free_mr = &priv->free_mr; + + /* Reserved cq for loop qp */ + cq_init_attr.cqe = HNS_ROCE_MIN_WQE_NUM * 2; + cq_init_attr.comp_vector = 0; + cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL, NULL); + if (IS_ERR(cq)) { + dev_err(dev, "Create cq for reseved loop qp failed!"); + return -ENOMEM; + } + free_mr->mr_free_cq = to_hr_cq(cq); + free_mr->mr_free_cq->ib_cq.device = &hr_dev->ib_dev; + free_mr->mr_free_cq->ib_cq.uobject = NULL; + free_mr->mr_free_cq->ib_cq.comp_handler = NULL; + free_mr->mr_free_cq->ib_cq.event_handler = NULL; + free_mr->mr_free_cq->ib_cq.cq_context = NULL; + atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0); + + pd = hns_roce_alloc_pd(&hr_dev->ib_dev, NULL, NULL); + if (IS_ERR(pd)) { + dev_err(dev, "Create pd for reseved loop qp failed!"); + ret = -ENOMEM; + goto alloc_pd_failed; + } + free_mr->mr_free_pd = to_hr_pd(pd); + free_mr->mr_free_pd->ibpd.device = &hr_dev->ib_dev; + free_mr->mr_free_pd->ibpd.uobject = NULL; + free_mr->mr_free_pd->ibpd.__internal_mr = NULL; + atomic_set(&free_mr->mr_free_pd->ibpd.usecnt, 0); + + attr.qp_access_flags = IB_ACCESS_REMOTE_WRITE; + attr.pkey_index = 0; + attr.min_rnr_timer = 0; + /* Disable read ability */ + attr.max_dest_rd_atomic = 0; + attr.max_rd_atomic = 0; + /* Use arbitrary values as rq_psn and sq_psn */ + attr.rq_psn = 0x0808; + attr.sq_psn = 0x0808; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.timeout = 0x12; + attr.path_mtu = IB_MTU_256; + attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; + rdma_ah_set_grh(&attr.ah_attr, NULL, 0, 0, 1, 0); + 
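+ /*
+ * The destination GID programmed further below is a link-local address
+ * derived from the port MAC in EUI-64 style: the fe80::/64 prefix, the
+ * first three MAC octets, 0xff/0xfe, then the last three MAC octets, with
+ * the universal/local bit flipped via dgid.raw[8] ^= 2, so every reserved
+ * QP loops back to its own port.
+ */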
rdma_ah_set_static_rate(&attr.ah_attr, 3); + + subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + for (i = 0; i < HNS_ROCE_V1_RESV_QP; i++) { + phy_port = (i >= HNS_ROCE_MAX_PORTS) ? (i - 2) : + (i % HNS_ROCE_MAX_PORTS); + sl = i / HNS_ROCE_MAX_PORTS; + + for (j = 0; j < caps->num_ports; j++) { + if (hr_dev->iboe.phy_port[j] == phy_port) { + queue_en[i] = 1; + port = j; + break; + } + } + + if (!queue_en[i]) + continue; + + free_mr->mr_free_qp[i] = hns_roce_v1_create_lp_qp(hr_dev, pd); + if (!free_mr->mr_free_qp[i]) { + dev_err(dev, "Create loop qp failed!\n"); + goto create_lp_qp_failed; + } + hr_qp = free_mr->mr_free_qp[i]; + + hr_qp->port = port; + hr_qp->phy_port = phy_port; + hr_qp->ibqp.qp_type = IB_QPT_RC; + hr_qp->ibqp.device = &hr_dev->ib_dev; + hr_qp->ibqp.uobject = NULL; + atomic_set(&hr_qp->ibqp.usecnt, 0); + hr_qp->ibqp.pd = pd; + hr_qp->ibqp.recv_cq = cq; + hr_qp->ibqp.send_cq = cq; + + rdma_ah_set_port_num(&attr.ah_attr, port + 1); + rdma_ah_set_sl(&attr.ah_attr, sl); + attr.port_num = port + 1; + + attr.dest_qp_num = hr_qp->qpn; + memcpy(rdma_ah_retrieve_dmac(&attr.ah_attr), + hr_dev->dev_addr[port], + MAC_ADDR_OCTET_NUM); + + memcpy(&dgid.raw, &subnet_prefix, sizeof(u64)); + memcpy(&dgid.raw[8], hr_dev->dev_addr[port], 3); + memcpy(&dgid.raw[13], hr_dev->dev_addr[port] + 3, 3); + dgid.raw[11] = 0xff; + dgid.raw[12] = 0xfe; + dgid.raw[8] ^= 2; + rdma_ah_set_dgid_raw(&attr.ah_attr, dgid.raw); + + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, attr_mask, + IB_QPS_RESET, IB_QPS_INIT); + if (ret) { + dev_err(dev, "modify qp failed(%d)!\n", ret); + goto create_lp_qp_failed; + } + + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, IB_QP_DEST_QPN, + IB_QPS_INIT, IB_QPS_RTR); + if (ret) { + dev_err(dev, "modify qp failed(%d)!\n", ret); + goto create_lp_qp_failed; + } + + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, attr_mask, + IB_QPS_RTR, IB_QPS_RTS); + if (ret) { + dev_err(dev, "modify qp failed(%d)!\n", ret); + goto create_lp_qp_failed; + } + } + + return 0; + +create_lp_qp_failed: + for (i -= 1; i >= 0; i--) { + hr_qp = free_mr->mr_free_qp[i]; + if (hns_roce_v1_destroy_qp(&hr_qp->ibqp)) + dev_err(dev, "Destroy qp %d for mr free failed!\n", i); + } + + if (hns_roce_dealloc_pd(pd)) + dev_err(dev, "Destroy pd for create_lp_qp failed!\n"); + +alloc_pd_failed: + if (hns_roce_ib_destroy_cq(cq)) + dev_err(dev, "Destroy cq for create_lp_qp failed!\n"); + + return -EINVAL; +} + +static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_free_mr *free_mr; + struct hns_roce_v1_priv *priv; + struct hns_roce_qp *hr_qp; + int ret; + int i; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + free_mr = &priv->free_mr; + + for (i = 0; i < HNS_ROCE_V1_RESV_QP; i++) { + hr_qp = free_mr->mr_free_qp[i]; + if (!hr_qp) + continue; + + ret = hns_roce_v1_destroy_qp(&hr_qp->ibqp); + if (ret) + dev_err(dev, "Destroy qp %d for mr free failed(%d)!\n", + i, ret); + } + + ret = hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq); + if (ret) + dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret); + + ret = hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd); + if (ret) + dev_err(dev, "Destroy pd for mr_free failed(%d)!\n", ret); +} + +static int hns_roce_db_init(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_v1_priv *priv; + struct hns_roce_db_table *db; + u32 sdb_ext_mod; + u32 odb_ext_mod; + u32 sdb_evt_mod; + u32 odb_evt_mod; + int ret = 0; + + priv = (struct hns_roce_v1_priv 
*)hr_dev->priv; + db = &priv->db_table; + + memset(db, 0, sizeof(*db)); + + /* Default DB mode */ + sdb_ext_mod = HNS_ROCE_SDB_EXTEND_MODE; + odb_ext_mod = HNS_ROCE_ODB_EXTEND_MODE; + sdb_evt_mod = HNS_ROCE_SDB_NORMAL_MODE; + odb_evt_mod = HNS_ROCE_ODB_POLL_MODE; + + db->sdb_ext_mod = sdb_ext_mod; + db->odb_ext_mod = odb_ext_mod; + + /* Init extend DB */ + ret = hns_roce_db_ext_init(hr_dev, sdb_ext_mod, odb_ext_mod); + if (ret) { + dev_err(dev, "Failed in extend DB configuration.\n"); + return ret; + } + + hns_roce_set_db_event_mode(hr_dev, sdb_evt_mod, odb_evt_mod); + + return 0; +} + +static void hns_roce_v1_recreate_lp_qp_work_fn(struct work_struct *work) +{ + struct hns_roce_recreate_lp_qp_work *lp_qp_work; + struct hns_roce_dev *hr_dev; + + lp_qp_work = container_of(work, struct hns_roce_recreate_lp_qp_work, + work); + hr_dev = to_hr_dev(lp_qp_work->ib_dev); + + hns_roce_v1_release_lp_qp(hr_dev); + + if (hns_roce_v1_rsv_lp_qp(hr_dev)) + dev_err(&hr_dev->pdev->dev, "create reserved qp failed\n"); + + if (lp_qp_work->comp_flag) + complete(lp_qp_work->comp); + + kfree(lp_qp_work); +} + +static int hns_roce_v1_recreate_lp_qp(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_recreate_lp_qp_work *lp_qp_work; + struct hns_roce_free_mr *free_mr; + struct hns_roce_v1_priv *priv; + struct completion comp; + unsigned long end = + msecs_to_jiffies(HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS) + jiffies; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + free_mr = &priv->free_mr; + + lp_qp_work = kzalloc(sizeof(struct hns_roce_recreate_lp_qp_work), + GFP_KERNEL); + if (!lp_qp_work) + return -ENOMEM; + + INIT_WORK(&(lp_qp_work->work), hns_roce_v1_recreate_lp_qp_work_fn); + + lp_qp_work->ib_dev = &(hr_dev->ib_dev); + lp_qp_work->comp = &comp; + lp_qp_work->comp_flag = 1; + + init_completion(lp_qp_work->comp); + + queue_work(free_mr->free_mr_wq, &(lp_qp_work->work)); + + while (time_before_eq(jiffies, end)) { + if (try_wait_for_completion(&comp)) + return 0; + msleep(HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE); + } + + lp_qp_work->comp_flag = 0; + if (try_wait_for_completion(&comp)) + return 0; + + dev_warn(dev, "recreate lp qp failed 20s timeout and return failed!\n"); + return -ETIMEDOUT; +} + +static int hns_roce_v1_send_lp_wqe(struct hns_roce_qp *hr_qp) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); + struct device *dev = &hr_dev->pdev->dev; + struct ib_send_wr send_wr, *bad_wr; + int ret; + + memset(&send_wr, 0, sizeof(send_wr)); + send_wr.next = NULL; + send_wr.num_sge = 0; + send_wr.send_flags = 0; + send_wr.sg_list = NULL; + send_wr.wr_id = (unsigned long long)&send_wr; + send_wr.opcode = IB_WR_RDMA_WRITE; + + ret = hns_roce_v1_post_send(&hr_qp->ibqp, &send_wr, &bad_wr); + if (ret) { + dev_err(dev, "Post write wqe for mr free failed(%d)!", ret); + return ret; + } + + return 0; +} + +static void hns_roce_v1_mr_free_work_fn(struct work_struct *work) +{ + struct hns_roce_mr_free_work *mr_work; + struct ib_wc wc[HNS_ROCE_V1_RESV_QP]; + struct hns_roce_free_mr *free_mr; + struct hns_roce_cq *mr_free_cq; + struct hns_roce_v1_priv *priv; + struct hns_roce_dev *hr_dev; + struct hns_roce_mr *hr_mr; + struct hns_roce_qp *hr_qp; + struct device *dev; + unsigned long end = + msecs_to_jiffies(HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS) + jiffies; + int i; + int ret; + int ne = 0; + + mr_work = container_of(work, struct hns_roce_mr_free_work, work); + hr_mr = (struct hns_roce_mr *)mr_work->mr; + hr_dev = to_hr_dev(mr_work->ib_dev); + dev = &hr_dev->pdev->dev; +
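+ /*
+ * Flush path used by hns_roce_v1_dereg_mr(): post one zero-length
+ * RDMA_WRITE on each reserved loopback QP, then poll the shared
+ * mr_free_cq until all of those completions return (or the timeout
+ * expires) before the MR's PBL and key are finally released.
+ */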
priv = (struct hns_roce_v1_priv *)hr_dev->priv; + free_mr = &priv->free_mr; + mr_free_cq = free_mr->mr_free_cq; + + for (i = 0; i < HNS_ROCE_V1_RESV_QP; i++) { + hr_qp = free_mr->mr_free_qp[i]; + if (!hr_qp) + continue; + ne++; + + ret = hns_roce_v1_send_lp_wqe(hr_qp); + if (ret) { + dev_err(dev, + "Send wqe (qp:0x%lx) for mr free failed(%d)!\n", + hr_qp->qpn, ret); + goto free_work; + } + } + + if (!ne) { + dev_err(dev, "Reserved loop qp is absent!\n"); + goto free_work; + } + + do { + ret = hns_roce_v1_poll_cq(&mr_free_cq->ib_cq, ne, wc); + if (ret < 0 && hr_qp) { + dev_err(dev, + "(qp:0x%lx) starts, Poll cqe failed(%d) for mr 0x%x free! Remain %d cqe\n", + hr_qp->qpn, ret, hr_mr->key, ne); + goto free_work; + } + ne -= ret; + usleep_range(HNS_ROCE_V1_FREE_MR_WAIT_VALUE * 1000, + (1 + HNS_ROCE_V1_FREE_MR_WAIT_VALUE) * 1000); + } while (ne && time_before_eq(jiffies, end)); + + if (ne != 0) + dev_err(dev, + "Poll cqe for mr 0x%x free timeout! Remain %d cqe\n", + hr_mr->key, ne); + +free_work: + if (mr_work->comp_flag) + complete(mr_work->comp); + kfree(mr_work); +} + +static int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev, + struct hns_roce_mr *mr) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_mr_free_work *mr_work; + struct hns_roce_free_mr *free_mr; + struct hns_roce_v1_priv *priv; + struct completion comp; + unsigned long end = + msecs_to_jiffies(HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS) + jiffies; + unsigned long start = jiffies; + int npages; + int ret = 0; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + free_mr = &priv->free_mr; + + if (mr->enabled) { + if (hns_roce_hw2sw_mpt(hr_dev, NULL, key_to_hw_index(mr->key) + & (hr_dev->caps.num_mtpts - 1))) + dev_warn(dev, "HW2SW_MPT failed!\n"); + } + + mr_work = kzalloc(sizeof(*mr_work), GFP_KERNEL); + if (!mr_work) { + ret = -ENOMEM; + goto free_mr; + } + + INIT_WORK(&(mr_work->work), hns_roce_v1_mr_free_work_fn); + + mr_work->ib_dev = &(hr_dev->ib_dev); + mr_work->comp = ∁ + mr_work->comp_flag = 1; + mr_work->mr = (void *)mr; + init_completion(mr_work->comp); + + queue_work(free_mr->free_mr_wq, &(mr_work->work)); + + while (time_before_eq(jiffies, end)) { + if (try_wait_for_completion(&comp)) + goto free_mr; + msleep(HNS_ROCE_V1_FREE_MR_WAIT_VALUE); + } + + mr_work->comp_flag = 0; + if (try_wait_for_completion(&comp)) + goto free_mr; + + dev_warn(dev, "Free mr work 0x%x over 50s and failed!\n", mr->key); + ret = -ETIMEDOUT; + +free_mr: + dev_dbg(dev, "Free mr 0x%x use 0x%x us.\n", + mr->key, jiffies_to_usecs(jiffies) - jiffies_to_usecs(start)); + + if (mr->size != ~0ULL) { + npages = ib_umem_page_count(mr->umem); + dma_free_coherent(dev, npages * 8, mr->pbl_buf, + mr->pbl_dma_addr); + } + + hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, + key_to_hw_index(mr->key), 0); + + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr); + + return ret; +} + +static void hns_roce_db_free(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_v1_priv *priv; + struct hns_roce_db_table *db; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + db = &priv->db_table; + + if (db->sdb_ext_mod) { + dma_free_coherent(dev, HNS_ROCE_V1_EXT_SDB_SIZE, + db->ext_db->sdb_buf_list->buf, + db->ext_db->sdb_buf_list->map); + kfree(db->ext_db->sdb_buf_list); + } + + if (db->odb_ext_mod) { + dma_free_coherent(dev, HNS_ROCE_V1_EXT_ODB_SIZE, + db->ext_db->odb_buf_list->buf, + db->ext_db->odb_buf_list->map); + kfree(db->ext_db->odb_buf_list); + } + + kfree(db->ext_db); +} + +static int 
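+/*
+ * Bring up the extended RAQ: a DMA-coherent buffer of HNS_ROCE_V1_RAQ_SIZE
+ * is handed to hardware as a 4K-aligned base (bits 12 and up via
+ * ROCEE_EXT_RAQ_REG, the bits above 44 via ROCEE_EXT_RAQ_H_REG), then the
+ * entry shift, the watermark and the extended-mode/drop enables are
+ * programmed.
+ */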
hns_roce_raq_init(struct hns_roce_dev *hr_dev) +{ + int ret; + int raq_shift = 0; + dma_addr_t addr; + u32 val; + struct hns_roce_v1_priv *priv; + struct hns_roce_raq_table *raq; + struct device *dev = &hr_dev->pdev->dev; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + raq = &priv->raq_table; + + raq->e_raq_buf = kzalloc(sizeof(*(raq->e_raq_buf)), GFP_KERNEL); + if (!raq->e_raq_buf) + return -ENOMEM; + + raq->e_raq_buf->buf = dma_alloc_coherent(dev, HNS_ROCE_V1_RAQ_SIZE, + &addr, GFP_KERNEL); + if (!raq->e_raq_buf->buf) { + ret = -ENOMEM; + goto err_dma_alloc_raq; + } + raq->e_raq_buf->map = addr; + + /* Configure raq extended address. 48bit 4K align*/ + roce_write(hr_dev, ROCEE_EXT_RAQ_REG, raq->e_raq_buf->map >> 12); + + /* Configure raq_shift */ + raq_shift = ilog2(HNS_ROCE_V1_RAQ_SIZE / HNS_ROCE_V1_RAQ_ENTRY); + val = roce_read(hr_dev, ROCEE_EXT_RAQ_H_REG); + roce_set_field(val, ROCEE_EXT_RAQ_H_EXT_RAQ_SHIFT_M, + ROCEE_EXT_RAQ_H_EXT_RAQ_SHIFT_S, raq_shift); + /* + * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of + * using 4K page, and shift more 32 because of + * caculating the high 32 bit value evaluated to hardware. + */ + roce_set_field(val, ROCEE_EXT_RAQ_H_EXT_RAQ_BA_H_M, + ROCEE_EXT_RAQ_H_EXT_RAQ_BA_H_S, + raq->e_raq_buf->map >> 44); + roce_write(hr_dev, ROCEE_EXT_RAQ_H_REG, val); + dev_dbg(dev, "Configure raq_shift 0x%x.\n", val); + + /* Configure raq threshold */ + val = roce_read(hr_dev, ROCEE_RAQ_WL_REG); + roce_set_field(val, ROCEE_RAQ_WL_ROCEE_RAQ_WL_M, + ROCEE_RAQ_WL_ROCEE_RAQ_WL_S, + HNS_ROCE_V1_EXT_RAQ_WF); + roce_write(hr_dev, ROCEE_RAQ_WL_REG, val); + dev_dbg(dev, "Configure raq_wl 0x%x.\n", val); + + /* Enable extend raq */ + val = roce_read(hr_dev, ROCEE_WRMS_POL_TIME_INTERVAL_REG); + roce_set_field(val, + ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_POL_TIME_INTERVAL_M, + ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_POL_TIME_INTERVAL_S, + POL_TIME_INTERVAL_VAL); + roce_set_bit(val, ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_EXT_RAQ_MODE, 1); + roce_set_field(val, + ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_RAQ_TIMEOUT_CHK_CFG_M, + ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_RAQ_TIMEOUT_CHK_CFG_S, + 2); + roce_set_bit(val, + ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_RAQ_TIMEOUT_CHK_EN_S, 1); + roce_write(hr_dev, ROCEE_WRMS_POL_TIME_INTERVAL_REG, val); + dev_dbg(dev, "Configure WrmsPolTimeInterval 0x%x.\n", val); + + /* Enable raq drop */ + val = roce_read(hr_dev, ROCEE_GLB_CFG_REG); + roce_set_bit(val, ROCEE_GLB_CFG_TRP_RAQ_DROP_EN_S, 1); + roce_write(hr_dev, ROCEE_GLB_CFG_REG, val); + dev_dbg(dev, "Configure GlbCfg = 0x%x.\n", val); + + return 0; + +err_dma_alloc_raq: + kfree(raq->e_raq_buf); + return ret; +} + +static void hns_roce_raq_free(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_v1_priv *priv; + struct hns_roce_raq_table *raq; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + raq = &priv->raq_table; + + dma_free_coherent(dev, HNS_ROCE_V1_RAQ_SIZE, raq->e_raq_buf->buf, + raq->e_raq_buf->map); + kfree(raq->e_raq_buf); +} + +static void hns_roce_port_enable(struct hns_roce_dev *hr_dev, int enable_flag) +{ + u32 val; + + if (enable_flag) { + val = roce_read(hr_dev, ROCEE_GLB_CFG_REG); + /* Open all ports */ + roce_set_field(val, ROCEE_GLB_CFG_ROCEE_PORT_ST_M, + ROCEE_GLB_CFG_ROCEE_PORT_ST_S, + ALL_PORT_VAL_OPEN); + roce_write(hr_dev, ROCEE_GLB_CFG_REG, val); + } else { + val = roce_read(hr_dev, ROCEE_GLB_CFG_REG); + /* Close all ports */ + roce_set_field(val, ROCEE_GLB_CFG_ROCEE_PORT_ST_M, + ROCEE_GLB_CFG_ROCEE_PORT_ST_S, 0x0); + 
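+		/* commit the cleared ROCEE_PORT_ST field so all ports go down */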
roce_write(hr_dev, ROCEE_GLB_CFG_REG, val); + } +} + +static int hns_roce_bt_init(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_v1_priv *priv; + int ret; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + + priv->bt_table.qpc_buf.buf = dma_alloc_coherent(dev, + HNS_ROCE_BT_RSV_BUF_SIZE, &priv->bt_table.qpc_buf.map, + GFP_KERNEL); + if (!priv->bt_table.qpc_buf.buf) + return -ENOMEM; + + priv->bt_table.mtpt_buf.buf = dma_alloc_coherent(dev, + HNS_ROCE_BT_RSV_BUF_SIZE, &priv->bt_table.mtpt_buf.map, + GFP_KERNEL); + if (!priv->bt_table.mtpt_buf.buf) { + ret = -ENOMEM; + goto err_failed_alloc_mtpt_buf; + } + + priv->bt_table.cqc_buf.buf = dma_alloc_coherent(dev, + HNS_ROCE_BT_RSV_BUF_SIZE, &priv->bt_table.cqc_buf.map, + GFP_KERNEL); + if (!priv->bt_table.cqc_buf.buf) { + ret = -ENOMEM; + goto err_failed_alloc_cqc_buf; + } + + return 0; + +err_failed_alloc_cqc_buf: + dma_free_coherent(dev, HNS_ROCE_BT_RSV_BUF_SIZE, + priv->bt_table.mtpt_buf.buf, priv->bt_table.mtpt_buf.map); + +err_failed_alloc_mtpt_buf: + dma_free_coherent(dev, HNS_ROCE_BT_RSV_BUF_SIZE, + priv->bt_table.qpc_buf.buf, priv->bt_table.qpc_buf.map); + + return ret; +} + +static void hns_roce_bt_free(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_v1_priv *priv; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + + dma_free_coherent(dev, HNS_ROCE_BT_RSV_BUF_SIZE, + priv->bt_table.cqc_buf.buf, priv->bt_table.cqc_buf.map); + + dma_free_coherent(dev, HNS_ROCE_BT_RSV_BUF_SIZE, + priv->bt_table.mtpt_buf.buf, priv->bt_table.mtpt_buf.map); + + dma_free_coherent(dev, HNS_ROCE_BT_RSV_BUF_SIZE, + priv->bt_table.qpc_buf.buf, priv->bt_table.qpc_buf.map); +} + +static int hns_roce_tptr_init(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_buf_list *tptr_buf; + struct hns_roce_v1_priv *priv; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + tptr_buf = &priv->tptr_table.tptr_buf; + + /* + * This buffer will be used for CQ's tptr(tail pointer), also + * named ci(customer index). Every CQ will use 2 bytes to save + * cqe ci in hip06. Hardware will read this area to get new ci + * when the queue is almost full. 
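+	 * Each CQ owns a HNS_ROCE_V1_TPTR_ENTRY_SIZE-byte slot at offset
+	 * (cqn * HNS_ROCE_V1_TPTR_ENTRY_SIZE) in this buffer;
+	 * hns_roce_v1_write_cqc() points the CQ context at that slot and
+	 * hns_roce_v1_poll_cq() keeps it updated with the software consumer
+	 * index.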
+ */ + tptr_buf->buf = dma_alloc_coherent(dev, HNS_ROCE_V1_TPTR_BUF_SIZE, + &tptr_buf->map, GFP_KERNEL); + if (!tptr_buf->buf) + return -ENOMEM; + + hr_dev->tptr_dma_addr = tptr_buf->map; + hr_dev->tptr_size = HNS_ROCE_V1_TPTR_BUF_SIZE; + + return 0; +} + +static void hns_roce_tptr_free(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_buf_list *tptr_buf; + struct hns_roce_v1_priv *priv; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + tptr_buf = &priv->tptr_table.tptr_buf; + + dma_free_coherent(dev, HNS_ROCE_V1_TPTR_BUF_SIZE, + tptr_buf->buf, tptr_buf->map); +} + +static int hns_roce_free_mr_init(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_free_mr *free_mr; + struct hns_roce_v1_priv *priv; + int ret = 0; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + free_mr = &priv->free_mr; + + free_mr->free_mr_wq = create_singlethread_workqueue("hns_roce_free_mr"); + if (!free_mr->free_mr_wq) { + dev_err(dev, "Create free mr workqueue failed!\n"); + return -ENOMEM; + } + + ret = hns_roce_v1_rsv_lp_qp(hr_dev); + if (ret) { + dev_err(dev, "Reserved loop qp failed(%d)!\n", ret); + flush_workqueue(free_mr->free_mr_wq); + destroy_workqueue(free_mr->free_mr_wq); + } + + return ret; +} + +static void hns_roce_free_mr_free(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_free_mr *free_mr; + struct hns_roce_v1_priv *priv; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + free_mr = &priv->free_mr; + + flush_workqueue(free_mr->free_mr_wq); + destroy_workqueue(free_mr->free_mr_wq); + + hns_roce_v1_release_lp_qp(hr_dev); +} + +/** + * hns_roce_v1_reset - reset RoCE + * @hr_dev: RoCE device struct pointer + * @enable: true -- drop reset, false -- reset + * return 0 - success , negative --fail + */ +static int hns_roce_v1_reset(struct hns_roce_dev *hr_dev, bool dereset) +{ + struct device_node *dsaf_node; + struct device *dev = &hr_dev->pdev->dev; + struct device_node *np = dev->of_node; + struct fwnode_handle *fwnode; + int ret; + + /* check if this is DT/ACPI case */ + if (dev_of_node(dev)) { + dsaf_node = of_parse_phandle(np, "dsaf-handle", 0); + if (!dsaf_node) { + dev_err(dev, "could not find dsaf-handle\n"); + return -EINVAL; + } + fwnode = &dsaf_node->fwnode; + } else if (is_acpi_device_node(dev->fwnode)) { + struct acpi_reference_args args; + + ret = acpi_node_get_property_reference(dev->fwnode, + "dsaf-handle", 0, &args); + if (ret) { + dev_err(dev, "could not find dsaf-handle\n"); + return ret; + } + fwnode = acpi_fwnode_handle(args.adev); + } else { + dev_err(dev, "cannot read data from DT or ACPI\n"); + return -ENXIO; + } + + ret = hns_dsaf_roce_reset(fwnode, false); + if (ret) + return ret; + + if (dereset) { + msleep(SLEEP_TIME_INTERVAL); + ret = hns_dsaf_roce_reset(fwnode, true); + } + + return ret; +} + +static int hns_roce_des_qp_init(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_v1_priv *priv; + struct hns_roce_des_qp *des_qp; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + des_qp = &priv->des_qp; + + des_qp->requeue_flag = 1; + des_qp->qp_wq = create_singlethread_workqueue("hns_roce_destroy_qp"); + if (!des_qp->qp_wq) { + dev_err(dev, "Create destroy qp workqueue failed!\n"); + return -ENOMEM; + } + + return 0; +} + +static void hns_roce_des_qp_free(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v1_priv *priv; + struct hns_roce_des_qp *des_qp; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + des_qp = &priv->des_qp; + + 
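+	/*
+	 * Clear requeue_flag before draining: the queued destroy-QP work
+	 * appears to use it to decide whether to re-queue itself, so this
+	 * stops new work from being generated while the workqueue is
+	 * flushed and destroyed.
+	 */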
des_qp->requeue_flag = 0; + flush_workqueue(des_qp->qp_wq); + destroy_workqueue(des_qp->qp_wq); +} + +static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev) +{ + int i = 0; + struct hns_roce_caps *caps = &hr_dev->caps; + + hr_dev->vendor_id = le32_to_cpu(roce_read(hr_dev, ROCEE_VENDOR_ID_REG)); + hr_dev->vendor_part_id = le32_to_cpu(roce_read(hr_dev, + ROCEE_VENDOR_PART_ID_REG)); + hr_dev->sys_image_guid = le32_to_cpu(roce_read(hr_dev, + ROCEE_SYS_IMAGE_GUID_L_REG)) | + ((u64)le32_to_cpu(roce_read(hr_dev, + ROCEE_SYS_IMAGE_GUID_H_REG)) << 32); + hr_dev->hw_rev = HNS_ROCE_HW_VER1; + + caps->num_qps = HNS_ROCE_V1_MAX_QP_NUM; + caps->max_wqes = HNS_ROCE_V1_MAX_WQE_NUM; + caps->min_wqes = HNS_ROCE_MIN_WQE_NUM; + caps->num_cqs = HNS_ROCE_V1_MAX_CQ_NUM; + caps->min_cqes = HNS_ROCE_MIN_CQE_NUM; + caps->max_cqes = HNS_ROCE_V1_MAX_CQE_NUM; + caps->max_sq_sg = HNS_ROCE_V1_SG_NUM; + caps->max_rq_sg = HNS_ROCE_V1_SG_NUM; + caps->max_sq_inline = HNS_ROCE_V1_INLINE_SIZE; + caps->num_uars = HNS_ROCE_V1_UAR_NUM; + caps->phy_num_uars = HNS_ROCE_V1_PHY_UAR_NUM; + caps->num_aeq_vectors = HNS_ROCE_V1_AEQE_VEC_NUM; + caps->num_comp_vectors = HNS_ROCE_V1_COMP_VEC_NUM; + caps->num_other_vectors = HNS_ROCE_V1_ABNORMAL_VEC_NUM; + caps->num_mtpts = HNS_ROCE_V1_MAX_MTPT_NUM; + caps->num_mtt_segs = HNS_ROCE_V1_MAX_MTT_SEGS; + caps->num_pds = HNS_ROCE_V1_MAX_PD_NUM; + caps->max_qp_init_rdma = HNS_ROCE_V1_MAX_QP_INIT_RDMA; + caps->max_qp_dest_rdma = HNS_ROCE_V1_MAX_QP_DEST_RDMA; + caps->max_sq_desc_sz = HNS_ROCE_V1_MAX_SQ_DESC_SZ; + caps->max_rq_desc_sz = HNS_ROCE_V1_MAX_RQ_DESC_SZ; + caps->qpc_entry_sz = HNS_ROCE_V1_QPC_ENTRY_SIZE; + caps->irrl_entry_sz = HNS_ROCE_V1_IRRL_ENTRY_SIZE; + caps->cqc_entry_sz = HNS_ROCE_V1_CQC_ENTRY_SIZE; + caps->mtpt_entry_sz = HNS_ROCE_V1_MTPT_ENTRY_SIZE; + caps->mtt_entry_sz = HNS_ROCE_V1_MTT_ENTRY_SIZE; + caps->cq_entry_sz = HNS_ROCE_V1_CQE_ENTRY_SIZE; + caps->page_size_cap = HNS_ROCE_V1_PAGE_SIZE_SUPPORT; + caps->reserved_lkey = 0; + caps->reserved_pds = 0; + caps->reserved_mrws = 1; + caps->reserved_uars = 0; + caps->reserved_cqs = 0; + caps->chunk_sz = HNS_ROCE_V1_TABLE_CHUNK_SIZE; + + for (i = 0; i < caps->num_ports; i++) + caps->pkey_table_len[i] = 1; + + for (i = 0; i < caps->num_ports; i++) { + /* Six ports shared 16 GID in v1 engine */ + if (i >= (HNS_ROCE_V1_GID_NUM % caps->num_ports)) + caps->gid_table_len[i] = HNS_ROCE_V1_GID_NUM / + caps->num_ports; + else + caps->gid_table_len[i] = HNS_ROCE_V1_GID_NUM / + caps->num_ports + 1; + } + + caps->ceqe_depth = HNS_ROCE_V1_COMP_EQE_NUM; + caps->aeqe_depth = HNS_ROCE_V1_ASYNC_EQE_NUM; + caps->local_ca_ack_delay = le32_to_cpu(roce_read(hr_dev, + ROCEE_ACK_DELAY_REG)); + caps->max_mtu = IB_MTU_2048; + + return 0; +} + +static int hns_roce_v1_init(struct hns_roce_dev *hr_dev) +{ + int ret; + u32 val; + struct device *dev = &hr_dev->pdev->dev; + + /* DMAE user config */ + val = roce_read(hr_dev, ROCEE_DMAE_USER_CFG1_REG); + roce_set_field(val, ROCEE_DMAE_USER_CFG1_ROCEE_CACHE_TB_CFG_M, + ROCEE_DMAE_USER_CFG1_ROCEE_CACHE_TB_CFG_S, 0xf); + roce_set_field(val, ROCEE_DMAE_USER_CFG1_ROCEE_STREAM_ID_TB_CFG_M, + ROCEE_DMAE_USER_CFG1_ROCEE_STREAM_ID_TB_CFG_S, + 1 << PAGES_SHIFT_16); + roce_write(hr_dev, ROCEE_DMAE_USER_CFG1_REG, val); + + val = roce_read(hr_dev, ROCEE_DMAE_USER_CFG2_REG); + roce_set_field(val, ROCEE_DMAE_USER_CFG2_ROCEE_CACHE_PKT_CFG_M, + ROCEE_DMAE_USER_CFG2_ROCEE_CACHE_PKT_CFG_S, 0xf); + roce_set_field(val, ROCEE_DMAE_USER_CFG2_ROCEE_STREAM_ID_PKT_CFG_M, + ROCEE_DMAE_USER_CFG2_ROCEE_STREAM_ID_PKT_CFG_S, + 1 << 
PAGES_SHIFT_16); + + ret = hns_roce_db_init(hr_dev); + if (ret) { + dev_err(dev, "doorbell init failed!\n"); + return ret; + } + + ret = hns_roce_raq_init(hr_dev); + if (ret) { + dev_err(dev, "raq init failed!\n"); + goto error_failed_raq_init; + } + + ret = hns_roce_bt_init(hr_dev); + if (ret) { + dev_err(dev, "bt init failed!\n"); + goto error_failed_bt_init; + } + + ret = hns_roce_tptr_init(hr_dev); + if (ret) { + dev_err(dev, "tptr init failed!\n"); + goto error_failed_tptr_init; + } + + ret = hns_roce_des_qp_init(hr_dev); + if (ret) { + dev_err(dev, "des qp init failed!\n"); + goto error_failed_des_qp_init; + } + + ret = hns_roce_free_mr_init(hr_dev); + if (ret) { + dev_err(dev, "free mr init failed!\n"); + goto error_failed_free_mr_init; + } + + hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_UP); + + return 0; + +error_failed_free_mr_init: + hns_roce_des_qp_free(hr_dev); + +error_failed_des_qp_init: + hns_roce_tptr_free(hr_dev); + +error_failed_tptr_init: + hns_roce_bt_free(hr_dev); + +error_failed_bt_init: + hns_roce_raq_free(hr_dev); + +error_failed_raq_init: + hns_roce_db_free(hr_dev); + return ret; +} + +static void hns_roce_v1_exit(struct hns_roce_dev *hr_dev) +{ + hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_DOWN); + hns_roce_free_mr_free(hr_dev); + hns_roce_des_qp_free(hr_dev); + hns_roce_tptr_free(hr_dev); + hns_roce_bt_free(hr_dev); + hns_roce_raq_free(hr_dev); + hns_roce_db_free(hr_dev); +} + +static int hns_roce_v1_cmd_pending(struct hns_roce_dev *hr_dev) +{ + u32 status = readl(hr_dev->reg_base + ROCEE_MB6_REG); + + return (!!(status & (1 << HCR_GO_BIT))); +} + +static int hns_roce_v1_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, + u64 out_param, u32 in_modifier, u8 op_modifier, + u16 op, u16 token, int event) +{ + u32 __iomem *hcr = (u32 __iomem *)(hr_dev->reg_base + ROCEE_MB1_REG); + unsigned long end; + u32 val = 0; + + end = msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS) + jiffies; + while (hns_roce_v1_cmd_pending(hr_dev)) { + if (time_after(jiffies, end)) { + dev_err(hr_dev->dev, "jiffies=%d end=%d\n", + (int)jiffies, (int)end); + return -EAGAIN; + } + cond_resched(); + } + + roce_set_field(val, ROCEE_MB6_ROCEE_MB_CMD_M, ROCEE_MB6_ROCEE_MB_CMD_S, + op); + roce_set_field(val, ROCEE_MB6_ROCEE_MB_CMD_MDF_M, + ROCEE_MB6_ROCEE_MB_CMD_MDF_S, op_modifier); + roce_set_bit(val, ROCEE_MB6_ROCEE_MB_EVENT_S, event); + roce_set_bit(val, ROCEE_MB6_ROCEE_MB_HW_RUN_S, 1); + roce_set_field(val, ROCEE_MB6_ROCEE_MB_TOKEN_M, + ROCEE_MB6_ROCEE_MB_TOKEN_S, token); + + writeq(in_param, hcr + 0); + writeq(out_param, hcr + 2); + writel(in_modifier, hcr + 4); + /* Memory barrier */ + wmb(); + + writel(val, hcr + 5); + + mmiowb(); + + return 0; +} + +static int hns_roce_v1_chk_mbox(struct hns_roce_dev *hr_dev, + unsigned long timeout) +{ + u8 __iomem *hcr = hr_dev->reg_base + ROCEE_MB1_REG; + unsigned long end = 0; + u32 status = 0; + + end = msecs_to_jiffies(timeout) + jiffies; + while (hns_roce_v1_cmd_pending(hr_dev) && time_before(jiffies, end)) + cond_resched(); + + if (hns_roce_v1_cmd_pending(hr_dev)) { + dev_err(hr_dev->dev, "[cmd_poll]hw run cmd TIMEDOUT!\n"); + return -ETIMEDOUT; + } + + status = le32_to_cpu((__force __be32) + __raw_readl(hcr + HCR_STATUS_OFFSET)); + if ((status & STATUS_MASK) != 0x1) { + dev_err(hr_dev->dev, "mailbox status 0x%x!\n", status); + return -EBUSY; + } + + return 0; +} + +static int hns_roce_v1_set_gid(struct hns_roce_dev *hr_dev, u8 port, + int gid_index, union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + u32 *p = NULL; + u8 gid_idx = 0; + + gid_idx = 
hns_get_gid_index(hr_dev, port, gid_index); + + p = (u32 *)&gid->raw[0]; + roce_raw_write(*p, hr_dev->reg_base + ROCEE_PORT_GID_L_0_REG + + (HNS_ROCE_V1_GID_NUM * gid_idx)); + + p = (u32 *)&gid->raw[4]; + roce_raw_write(*p, hr_dev->reg_base + ROCEE_PORT_GID_ML_0_REG + + (HNS_ROCE_V1_GID_NUM * gid_idx)); + + p = (u32 *)&gid->raw[8]; + roce_raw_write(*p, hr_dev->reg_base + ROCEE_PORT_GID_MH_0_REG + + (HNS_ROCE_V1_GID_NUM * gid_idx)); + + p = (u32 *)&gid->raw[0xc]; + roce_raw_write(*p, hr_dev->reg_base + ROCEE_PORT_GID_H_0_REG + + (HNS_ROCE_V1_GID_NUM * gid_idx)); + + return 0; +} + +static int hns_roce_v1_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, + u8 *addr) +{ + u32 reg_smac_l; + u16 reg_smac_h; + u16 *p_h; + u32 *p; + u32 val; + + /* + * When mac changed, loopback may fail + * because of smac not equal to dmac. + * We Need to release and create reserved qp again. + */ + if (hr_dev->hw->dereg_mr) { + int ret; + + ret = hns_roce_v1_recreate_lp_qp(hr_dev); + if (ret && ret != -ETIMEDOUT) + return ret; + } + + p = (u32 *)(&addr[0]); + reg_smac_l = *p; + roce_raw_write(reg_smac_l, hr_dev->reg_base + ROCEE_SMAC_L_0_REG + + PHY_PORT_OFFSET * phy_port); + + val = roce_read(hr_dev, + ROCEE_SMAC_H_0_REG + phy_port * PHY_PORT_OFFSET); + p_h = (u16 *)(&addr[4]); + reg_smac_h = *p_h; + roce_set_field(val, ROCEE_SMAC_H_ROCEE_SMAC_H_M, + ROCEE_SMAC_H_ROCEE_SMAC_H_S, reg_smac_h); + roce_write(hr_dev, ROCEE_SMAC_H_0_REG + phy_port * PHY_PORT_OFFSET, + val); + + return 0; +} + +static void hns_roce_v1_set_mtu(struct hns_roce_dev *hr_dev, u8 phy_port, + enum ib_mtu mtu) +{ + u32 val; + + val = roce_read(hr_dev, + ROCEE_SMAC_H_0_REG + phy_port * PHY_PORT_OFFSET); + roce_set_field(val, ROCEE_SMAC_H_ROCEE_PORT_MTU_M, + ROCEE_SMAC_H_ROCEE_PORT_MTU_S, mtu); + roce_write(hr_dev, ROCEE_SMAC_H_0_REG + phy_port * PHY_PORT_OFFSET, + val); +} + +static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, + unsigned long mtpt_idx) +{ + struct hns_roce_v1_mpt_entry *mpt_entry; + struct scatterlist *sg; + u64 *pages; + int entry; + int i; + + /* MPT filled into mailbox buf */ + mpt_entry = (struct hns_roce_v1_mpt_entry *)mb_buf; + memset(mpt_entry, 0, sizeof(*mpt_entry)); + + roce_set_field(mpt_entry->mpt_byte_4, MPT_BYTE_4_KEY_STATE_M, + MPT_BYTE_4_KEY_STATE_S, KEY_VALID); + roce_set_field(mpt_entry->mpt_byte_4, MPT_BYTE_4_KEY_M, + MPT_BYTE_4_KEY_S, mr->key); + roce_set_field(mpt_entry->mpt_byte_4, MPT_BYTE_4_PAGE_SIZE_M, + MPT_BYTE_4_PAGE_SIZE_S, MR_SIZE_4K); + roce_set_bit(mpt_entry->mpt_byte_4, MPT_BYTE_4_MW_TYPE_S, 0); + roce_set_bit(mpt_entry->mpt_byte_4, MPT_BYTE_4_MW_BIND_ENABLE_S, + (mr->access & IB_ACCESS_MW_BIND ? 1 : 0)); + roce_set_bit(mpt_entry->mpt_byte_4, MPT_BYTE_4_OWN_S, 0); + roce_set_field(mpt_entry->mpt_byte_4, MPT_BYTE_4_MEMORY_LOCATION_TYPE_M, + MPT_BYTE_4_MEMORY_LOCATION_TYPE_S, mr->type); + roce_set_bit(mpt_entry->mpt_byte_4, MPT_BYTE_4_REMOTE_ATOMIC_S, 0); + roce_set_bit(mpt_entry->mpt_byte_4, MPT_BYTE_4_LOCAL_WRITE_S, + (mr->access & IB_ACCESS_LOCAL_WRITE ? 1 : 0)); + roce_set_bit(mpt_entry->mpt_byte_4, MPT_BYTE_4_REMOTE_WRITE_S, + (mr->access & IB_ACCESS_REMOTE_WRITE ? 1 : 0)); + roce_set_bit(mpt_entry->mpt_byte_4, MPT_BYTE_4_REMOTE_READ_S, + (mr->access & IB_ACCESS_REMOTE_READ ? 
1 : 0)); + roce_set_bit(mpt_entry->mpt_byte_4, MPT_BYTE_4_REMOTE_INVAL_ENABLE_S, + 0); + roce_set_bit(mpt_entry->mpt_byte_4, MPT_BYTE_4_ADDRESS_TYPE_S, 0); + + roce_set_field(mpt_entry->mpt_byte_12, MPT_BYTE_12_PBL_ADDR_H_M, + MPT_BYTE_12_PBL_ADDR_H_S, 0); + roce_set_field(mpt_entry->mpt_byte_12, MPT_BYTE_12_MW_BIND_COUNTER_M, + MPT_BYTE_12_MW_BIND_COUNTER_S, 0); + + mpt_entry->virt_addr_l = (u32)mr->iova; + mpt_entry->virt_addr_h = (u32)(mr->iova >> 32); + mpt_entry->length = (u32)mr->size; + + roce_set_field(mpt_entry->mpt_byte_28, MPT_BYTE_28_PD_M, + MPT_BYTE_28_PD_S, mr->pd); + roce_set_field(mpt_entry->mpt_byte_28, MPT_BYTE_28_L_KEY_IDX_L_M, + MPT_BYTE_28_L_KEY_IDX_L_S, mtpt_idx); + roce_set_field(mpt_entry->mpt_byte_64, MPT_BYTE_64_L_KEY_IDX_H_M, + MPT_BYTE_64_L_KEY_IDX_H_S, mtpt_idx >> MTPT_IDX_SHIFT); + + /* DMA memory register */ + if (mr->type == MR_TYPE_DMA) + return 0; + + pages = (u64 *) __get_free_page(GFP_KERNEL); + if (!pages) + return -ENOMEM; + + i = 0; + for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { + pages[i] = ((u64)sg_dma_address(sg)) >> 12; + + /* Directly record to MTPT table firstly 7 entry */ + if (i >= HNS_ROCE_MAX_INNER_MTPT_NUM) + break; + i++; + } + + /* Register user mr */ + for (i = 0; i < HNS_ROCE_MAX_INNER_MTPT_NUM; i++) { + switch (i) { + case 0: + mpt_entry->pa0_l = cpu_to_le32((u32)(pages[i])); + roce_set_field(mpt_entry->mpt_byte_36, + MPT_BYTE_36_PA0_H_M, + MPT_BYTE_36_PA0_H_S, + cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_32))); + break; + case 1: + roce_set_field(mpt_entry->mpt_byte_36, + MPT_BYTE_36_PA1_L_M, + MPT_BYTE_36_PA1_L_S, + cpu_to_le32((u32)(pages[i]))); + roce_set_field(mpt_entry->mpt_byte_40, + MPT_BYTE_40_PA1_H_M, + MPT_BYTE_40_PA1_H_S, + cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_24))); + break; + case 2: + roce_set_field(mpt_entry->mpt_byte_40, + MPT_BYTE_40_PA2_L_M, + MPT_BYTE_40_PA2_L_S, + cpu_to_le32((u32)(pages[i]))); + roce_set_field(mpt_entry->mpt_byte_44, + MPT_BYTE_44_PA2_H_M, + MPT_BYTE_44_PA2_H_S, + cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_16))); + break; + case 3: + roce_set_field(mpt_entry->mpt_byte_44, + MPT_BYTE_44_PA3_L_M, + MPT_BYTE_44_PA3_L_S, + cpu_to_le32((u32)(pages[i]))); + roce_set_field(mpt_entry->mpt_byte_48, + MPT_BYTE_48_PA3_H_M, + MPT_BYTE_48_PA3_H_S, + cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_8))); + break; + case 4: + mpt_entry->pa4_l = cpu_to_le32((u32)(pages[i])); + roce_set_field(mpt_entry->mpt_byte_56, + MPT_BYTE_56_PA4_H_M, + MPT_BYTE_56_PA4_H_S, + cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_32))); + break; + case 5: + roce_set_field(mpt_entry->mpt_byte_56, + MPT_BYTE_56_PA5_L_M, + MPT_BYTE_56_PA5_L_S, + cpu_to_le32((u32)(pages[i]))); + roce_set_field(mpt_entry->mpt_byte_60, + MPT_BYTE_60_PA5_H_M, + MPT_BYTE_60_PA5_H_S, + cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_24))); + break; + case 6: + roce_set_field(mpt_entry->mpt_byte_60, + MPT_BYTE_60_PA6_L_M, + MPT_BYTE_60_PA6_L_S, + cpu_to_le32((u32)(pages[i]))); + roce_set_field(mpt_entry->mpt_byte_64, + MPT_BYTE_64_PA6_H_M, + MPT_BYTE_64_PA6_H_S, + cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_16))); + break; + default: + break; + } + } + + free_page((unsigned long) pages); + + mpt_entry->pbl_addr_l = (u32)(mr->pbl_dma_addr); + + roce_set_field(mpt_entry->mpt_byte_12, MPT_BYTE_12_PBL_ADDR_H_M, + MPT_BYTE_12_PBL_ADDR_H_S, + ((u32)(mr->pbl_dma_addr >> 32))); + + return 0; +} + +static void *get_cqe(struct hns_roce_cq *hr_cq, int n) +{ + return hns_roce_buf_offset(&hr_cq->hr_buf.hr_buf, + n * HNS_ROCE_V1_CQE_ENTRY_SIZE); +} + +static void 
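+/*
+ * A CQE belongs to software when its owner bit differs from the "pass"
+ * bit of the consumer index (the bit just above the index mask, which
+ * flips on every wrap of the ring), so entries left over from the
+ * previous pass are never returned.
+ */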
*get_sw_cqe(struct hns_roce_cq *hr_cq, int n) +{ + struct hns_roce_cqe *hr_cqe = get_cqe(hr_cq, n & hr_cq->ib_cq.cqe); + + /* Get cqe when Owner bit is Conversely with the MSB of cons_idx */ + return (roce_get_bit(hr_cqe->cqe_byte_4, CQE_BYTE_4_OWNER_S) ^ + !!(n & (hr_cq->ib_cq.cqe + 1))) ? hr_cqe : NULL; +} + +static struct hns_roce_cqe *next_cqe_sw(struct hns_roce_cq *hr_cq) +{ + return get_sw_cqe(hr_cq, hr_cq->cons_index); +} + +static void hns_roce_v1_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index) +{ + u32 doorbell[2]; + + doorbell[0] = cons_index & ((hr_cq->cq_depth << 1) - 1); + doorbell[1] = 0; + roce_set_bit(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_HW_SYNS_S, 1); + roce_set_field(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_M, + ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_S, 3); + roce_set_field(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_MDF_M, + ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_MDF_S, 0); + roce_set_field(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_INP_H_M, + ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_INP_H_S, hr_cq->cqn); + + hns_roce_write64_k(doorbell, hr_cq->cq_db_l); +} + +static void __hns_roce_v1_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn, + struct hns_roce_srq *srq) +{ + struct hns_roce_cqe *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + for (prod_index = hr_cq->cons_index; get_sw_cqe(hr_cq, prod_index); + ++prod_index) { + if (prod_index == hr_cq->cons_index + hr_cq->ib_cq.cqe) + break; + } + + /* + * Now backwards through the CQ, removing CQ entries + * that match our QP by overwriting them with next entries. + */ + while ((int) --prod_index - (int) hr_cq->cons_index >= 0) { + cqe = get_cqe(hr_cq, prod_index & hr_cq->ib_cq.cqe); + if ((roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M, + CQE_BYTE_16_LOCAL_QPN_S) & + HNS_ROCE_CQE_QPN_MASK) == qpn) { + /* In v1 engine, not support SRQ */ + ++nfreed; + } else if (nfreed) { + dest = get_cqe(hr_cq, (prod_index + nfreed) & + hr_cq->ib_cq.cqe); + owner_bit = roce_get_bit(dest->cqe_byte_4, + CQE_BYTE_4_OWNER_S); + memcpy(dest, cqe, sizeof(*cqe)); + roce_set_bit(dest->cqe_byte_4, CQE_BYTE_4_OWNER_S, + owner_bit); + } + } + + if (nfreed) { + hr_cq->cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + + hns_roce_v1_cq_set_ci(hr_cq, hr_cq->cons_index); + } +} + +static void hns_roce_v1_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn, + struct hns_roce_srq *srq) +{ + spin_lock_irq(&hr_cq->lock); + __hns_roce_v1_cq_clean(hr_cq, qpn, srq); + spin_unlock_irq(&hr_cq->lock); +} + +static void hns_roce_v1_write_cqc(struct hns_roce_dev *hr_dev, + struct hns_roce_cq *hr_cq, void *mb_buf, + u64 *mtts, dma_addr_t dma_handle, int nent, + u32 vector) +{ + struct hns_roce_cq_context *cq_context = NULL; + struct hns_roce_buf_list *tptr_buf; + struct hns_roce_v1_priv *priv; + dma_addr_t tptr_dma_addr; + int offset; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + tptr_buf = &priv->tptr_table.tptr_buf; + + cq_context = mb_buf; + memset(cq_context, 0, sizeof(*cq_context)); + + /* Get the tptr for this CQ. 
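+	 * The slot comes from the buffer allocated in hns_roce_tptr_init();
+	 * the same per-CQ offset yields both the kernel virtual address kept
+	 * in hr_cq->tptr_addr and the DMA address programmed into the CQ
+	 * context below.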
*/ + offset = hr_cq->cqn * HNS_ROCE_V1_TPTR_ENTRY_SIZE; + tptr_dma_addr = tptr_buf->map + offset; + hr_cq->tptr_addr = (u16 *)(tptr_buf->buf + offset); + + /* Register cq_context members */ + roce_set_field(cq_context->cqc_byte_4, + CQ_CONTEXT_CQC_BYTE_4_CQC_STATE_M, + CQ_CONTEXT_CQC_BYTE_4_CQC_STATE_S, CQ_STATE_VALID); + roce_set_field(cq_context->cqc_byte_4, CQ_CONTEXT_CQC_BYTE_4_CQN_M, + CQ_CONTEXT_CQC_BYTE_4_CQN_S, hr_cq->cqn); + cq_context->cqc_byte_4 = cpu_to_le32(cq_context->cqc_byte_4); + + cq_context->cq_bt_l = (u32)dma_handle; + cq_context->cq_bt_l = cpu_to_le32(cq_context->cq_bt_l); + + roce_set_field(cq_context->cqc_byte_12, + CQ_CONTEXT_CQC_BYTE_12_CQ_BT_H_M, + CQ_CONTEXT_CQC_BYTE_12_CQ_BT_H_S, + ((u64)dma_handle >> 32)); + roce_set_field(cq_context->cqc_byte_12, + CQ_CONTEXT_CQC_BYTE_12_CQ_CQE_SHIFT_M, + CQ_CONTEXT_CQC_BYTE_12_CQ_CQE_SHIFT_S, + ilog2((unsigned int)nent)); + roce_set_field(cq_context->cqc_byte_12, CQ_CONTEXT_CQC_BYTE_12_CEQN_M, + CQ_CONTEXT_CQC_BYTE_12_CEQN_S, vector); + cq_context->cqc_byte_12 = cpu_to_le32(cq_context->cqc_byte_12); + + cq_context->cur_cqe_ba0_l = (u32)(mtts[0]); + cq_context->cur_cqe_ba0_l = cpu_to_le32(cq_context->cur_cqe_ba0_l); + + roce_set_field(cq_context->cqc_byte_20, + CQ_CONTEXT_CQC_BYTE_20_CUR_CQE_BA0_H_M, + CQ_CONTEXT_CQC_BYTE_20_CUR_CQE_BA0_H_S, + cpu_to_le32((mtts[0]) >> 32)); + /* Dedicated hardware, directly set 0 */ + roce_set_field(cq_context->cqc_byte_20, + CQ_CONTEXT_CQC_BYTE_20_CQ_CUR_INDEX_M, + CQ_CONTEXT_CQC_BYTE_20_CQ_CUR_INDEX_S, 0); + /** + * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of + * using 4K page, and shift more 32 because of + * caculating the high 32 bit value evaluated to hardware. + */ + roce_set_field(cq_context->cqc_byte_20, + CQ_CONTEXT_CQC_BYTE_20_CQE_TPTR_ADDR_H_M, + CQ_CONTEXT_CQC_BYTE_20_CQE_TPTR_ADDR_H_S, + tptr_dma_addr >> 44); + cq_context->cqc_byte_20 = cpu_to_le32(cq_context->cqc_byte_20); + + cq_context->cqe_tptr_addr_l = (u32)(tptr_dma_addr >> 12); + + roce_set_field(cq_context->cqc_byte_32, + CQ_CONTEXT_CQC_BYTE_32_CUR_CQE_BA1_H_M, + CQ_CONTEXT_CQC_BYTE_32_CUR_CQE_BA1_H_S, 0); + roce_set_bit(cq_context->cqc_byte_32, + CQ_CONTEXT_CQC_BYTE_32_SE_FLAG_S, 0); + roce_set_bit(cq_context->cqc_byte_32, + CQ_CONTEXT_CQC_BYTE_32_CE_FLAG_S, 0); + roce_set_bit(cq_context->cqc_byte_32, + CQ_CONTEXT_CQC_BYTE_32_NOTIFICATION_FLAG_S, 0); + roce_set_bit(cq_context->cqc_byte_32, + CQ_CQNTEXT_CQC_BYTE_32_TYPE_OF_COMPLETION_NOTIFICATION_S, + 0); + /* The initial value of cq's ci is 0 */ + roce_set_field(cq_context->cqc_byte_32, + CQ_CONTEXT_CQC_BYTE_32_CQ_CONS_IDX_M, + CQ_CONTEXT_CQC_BYTE_32_CQ_CONS_IDX_S, 0); + cq_context->cqc_byte_32 = cpu_to_le32(cq_context->cqc_byte_32); +} + +static int hns_roce_v1_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + return -EOPNOTSUPP; +} + +static int hns_roce_v1_req_notify_cq(struct ib_cq *ibcq, + enum ib_cq_notify_flags flags) +{ + struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); + u32 notification_flag; + u32 doorbell[2]; + + notification_flag = (flags & IB_CQ_SOLICITED_MASK) == + IB_CQ_SOLICITED ? 
CQ_DB_REQ_NOT : CQ_DB_REQ_NOT_SOL; + /* + * flags = 0; Notification Flag = 1, next + * flags = 1; Notification Flag = 0, solocited + */ + doorbell[0] = hr_cq->cons_index & ((hr_cq->cq_depth << 1) - 1); + roce_set_bit(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_HW_SYNS_S, 1); + roce_set_field(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_M, + ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_S, 3); + roce_set_field(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_MDF_M, + ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_MDF_S, 1); + roce_set_field(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_INP_H_M, + ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_INP_H_S, + hr_cq->cqn | notification_flag); + + hns_roce_write64_k(doorbell, hr_cq->cq_db_l); + + return 0; +} + +static int hns_roce_v1_poll_one(struct hns_roce_cq *hr_cq, + struct hns_roce_qp **cur_qp, struct ib_wc *wc) +{ + int qpn; + int is_send; + u16 wqe_ctr; + u32 status; + u32 opcode; + struct hns_roce_cqe *cqe; + struct hns_roce_qp *hr_qp; + struct hns_roce_wq *wq; + struct hns_roce_wqe_ctrl_seg *sq_wqe; + struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device); + struct device *dev = &hr_dev->pdev->dev; + + /* Find cqe according consumer index */ + cqe = next_cqe_sw(hr_cq); + if (!cqe) + return -EAGAIN; + + ++hr_cq->cons_index; + /* Memory barrier */ + rmb(); + /* 0->SQ, 1->RQ */ + is_send = !(roce_get_bit(cqe->cqe_byte_4, CQE_BYTE_4_SQ_RQ_FLAG_S)); + + /* Local_qpn in UD cqe is always 1, so it needs to compute new qpn */ + if (roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M, + CQE_BYTE_16_LOCAL_QPN_S) <= 1) { + qpn = roce_get_field(cqe->cqe_byte_20, CQE_BYTE_20_PORT_NUM_M, + CQE_BYTE_20_PORT_NUM_S) + + roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M, + CQE_BYTE_16_LOCAL_QPN_S) * + HNS_ROCE_MAX_PORTS; + } else { + qpn = roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M, + CQE_BYTE_16_LOCAL_QPN_S); + } + + if (!*cur_qp || (qpn & HNS_ROCE_CQE_QPN_MASK) != (*cur_qp)->qpn) { + hr_qp = __hns_roce_qp_lookup(hr_dev, qpn); + if (unlikely(!hr_qp)) { + dev_err(dev, "CQ %06lx with entry for unknown QPN %06x\n", + hr_cq->cqn, (qpn & HNS_ROCE_CQE_QPN_MASK)); + return -EINVAL; + } + + *cur_qp = hr_qp; + } + + wc->qp = &(*cur_qp)->ibqp; + wc->vendor_err = 0; + + status = roce_get_field(cqe->cqe_byte_4, + CQE_BYTE_4_STATUS_OF_THE_OPERATION_M, + CQE_BYTE_4_STATUS_OF_THE_OPERATION_S) & + HNS_ROCE_CQE_STATUS_MASK; + switch (status) { + case HNS_ROCE_CQE_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case HNS_ROCE_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_WR_FLUSH_ERR: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_MEM_MANAGE_OPERATE_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = 
IB_WC_RNR_RETRY_EXC_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + /* CQE status error, directly return */ + if (wc->status != IB_WC_SUCCESS) + return 0; + + if (is_send) { + /* SQ conrespond to CQE */ + sq_wqe = get_send_wqe(*cur_qp, roce_get_field(cqe->cqe_byte_4, + CQE_BYTE_4_WQE_INDEX_M, + CQE_BYTE_4_WQE_INDEX_S)& + ((*cur_qp)->sq.wqe_cnt-1)); + switch (le32_to_cpu(sq_wqe->flag) & HNS_ROCE_WQE_OPCODE_MASK) { + case HNS_ROCE_WQE_OPCODE_SEND: + wc->opcode = IB_WC_SEND; + break; + case HNS_ROCE_WQE_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = le32_to_cpu(cqe->byte_cnt); + break; + case HNS_ROCE_WQE_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case HNS_ROCE_WQE_OPCODE_LOCAL_INV: + wc->opcode = IB_WC_LOCAL_INV; + break; + case HNS_ROCE_WQE_OPCODE_UD_SEND: + wc->opcode = IB_WC_SEND; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + wc->wc_flags = (le32_to_cpu(sq_wqe->flag) & HNS_ROCE_WQE_IMM ? + IB_WC_WITH_IMM : 0); + + wq = &(*cur_qp)->sq; + if ((*cur_qp)->sq_signal_bits) { + /* + * If sg_signal_bit is 1, + * firstly tail pointer updated to wqe + * which current cqe correspond to + */ + wqe_ctr = (u16)roce_get_field(cqe->cqe_byte_4, + CQE_BYTE_4_WQE_INDEX_M, + CQE_BYTE_4_WQE_INDEX_S); + wq->tail += (wqe_ctr - (u16)wq->tail) & + (wq->wqe_cnt - 1); + } + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else { + /* RQ conrespond to CQE */ + wc->byte_len = le32_to_cpu(cqe->byte_cnt); + opcode = roce_get_field(cqe->cqe_byte_4, + CQE_BYTE_4_OPERATION_TYPE_M, + CQE_BYTE_4_OPERATION_TYPE_S) & + HNS_ROCE_CQE_OPCODE_MASK; + switch (opcode) { + case HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = + cpu_to_be32(le32_to_cpu(cqe->immediate_data)); + break; + case HNS_ROCE_OPCODE_SEND_DATA_RECEIVE: + if (roce_get_bit(cqe->cqe_byte_4, + CQE_BYTE_4_IMM_INDICATOR_S)) { + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cpu_to_be32( + le32_to_cpu(cqe->immediate_data)); + } else { + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + } + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + /* Update tail pointer, record wr_id */ + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + wc->sl = (u8)roce_get_field(cqe->cqe_byte_20, CQE_BYTE_20_SL_M, + CQE_BYTE_20_SL_S); + wc->src_qp = (u8)roce_get_field(cqe->cqe_byte_20, + CQE_BYTE_20_REMOTE_QPN_M, + CQE_BYTE_20_REMOTE_QPN_S); + wc->wc_flags |= (roce_get_bit(cqe->cqe_byte_20, + CQE_BYTE_20_GRH_PRESENT_S) ? 
+ IB_WC_GRH : 0); + wc->pkey_index = (u16)roce_get_field(cqe->cqe_byte_28, + CQE_BYTE_28_P_KEY_IDX_M, + CQE_BYTE_28_P_KEY_IDX_S); + } + + return 0; +} + +int hns_roce_v1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); + struct hns_roce_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + int ret = 0; + + spin_lock_irqsave(&hr_cq->lock, flags); + + for (npolled = 0; npolled < num_entries; ++npolled) { + ret = hns_roce_v1_poll_one(hr_cq, &cur_qp, wc + npolled); + if (ret) + break; + } + + if (npolled) { + *hr_cq->tptr_addr = hr_cq->cons_index & + ((hr_cq->cq_depth << 1) - 1); + + /* Memroy barrier */ + wmb(); + hns_roce_v1_cq_set_ci(hr_cq, hr_cq->cons_index); + } + + spin_unlock_irqrestore(&hr_cq->lock, flags); + + if (ret == 0 || ret == -EAGAIN) + return npolled; + else + return ret; +} + +static int hns_roce_v1_clear_hem(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, int obj, + int step_idx) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_v1_priv *priv; + unsigned long end = 0, flags = 0; + uint32_t bt_cmd_val[2] = {0}; + void __iomem *bt_cmd; + u64 bt_ba = 0; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + + switch (table->type) { + case HEM_TYPE_QPC: + roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, HEM_TYPE_QPC); + bt_ba = priv->bt_table.qpc_buf.map >> 12; + break; + case HEM_TYPE_MTPT: + roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, HEM_TYPE_MTPT); + bt_ba = priv->bt_table.mtpt_buf.map >> 12; + break; + case HEM_TYPE_CQC: + roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_MDF_S, HEM_TYPE_CQC); + bt_ba = priv->bt_table.cqc_buf.map >> 12; + break; + case HEM_TYPE_SRQC: + dev_dbg(dev, "HEM_TYPE_SRQC not support.\n"); + return -EINVAL; + default: + return 0; + } + roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_IN_MDF_S, obj); + roce_set_bit(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_S, 0); + roce_set_bit(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_HW_SYNS_S, 1); + + spin_lock_irqsave(&hr_dev->bt_cmd_lock, flags); + + bt_cmd = hr_dev->reg_base + ROCEE_BT_CMD_H_REG; + + end = msecs_to_jiffies(HW_SYNC_TIMEOUT_MSECS) + jiffies; + while (1) { + if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) { + if (!(time_before(jiffies, end))) { + dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n"); + spin_unlock_irqrestore(&hr_dev->bt_cmd_lock, + flags); + return -EBUSY; + } + } else { + break; + } + msleep(HW_SYNC_SLEEP_TIME_INTERVAL); + } + + bt_cmd_val[0] = (uint32_t)bt_ba; + roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_M, + ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S, bt_ba >> 32); + hns_roce_write64_k(bt_cmd_val, hr_dev->reg_base + ROCEE_BT_CMD_L_REG); + + spin_unlock_irqrestore(&hr_dev->bt_cmd_lock, flags); + + return 0; +} + +static int hns_roce_v1_qp_modify(struct hns_roce_dev *hr_dev, + struct hns_roce_mtt *mtt, + enum hns_roce_qp_state cur_state, + enum hns_roce_qp_state new_state, + struct hns_roce_qp_context *context, + struct hns_roce_qp *hr_qp) +{ + static const u16 + op[HNS_ROCE_QP_NUM_STATE][HNS_ROCE_QP_NUM_STATE] = { + [HNS_ROCE_QP_STATE_RST] = { + [HNS_ROCE_QP_STATE_RST] = HNS_ROCE_CMD_2RST_QP, + [HNS_ROCE_QP_STATE_ERR] = HNS_ROCE_CMD_2ERR_QP, + [HNS_ROCE_QP_STATE_INIT] = HNS_ROCE_CMD_RST2INIT_QP, + }, + [HNS_ROCE_QP_STATE_INIT] = { + [HNS_ROCE_QP_STATE_RST] = 
HNS_ROCE_CMD_2RST_QP, + [HNS_ROCE_QP_STATE_ERR] = HNS_ROCE_CMD_2ERR_QP, + /* Note: In v1 engine, HW doesn't support RST2INIT. + * We use RST2INIT cmd instead of INIT2INIT. + */ + [HNS_ROCE_QP_STATE_INIT] = HNS_ROCE_CMD_RST2INIT_QP, + [HNS_ROCE_QP_STATE_RTR] = HNS_ROCE_CMD_INIT2RTR_QP, + }, + [HNS_ROCE_QP_STATE_RTR] = { + [HNS_ROCE_QP_STATE_RST] = HNS_ROCE_CMD_2RST_QP, + [HNS_ROCE_QP_STATE_ERR] = HNS_ROCE_CMD_2ERR_QP, + [HNS_ROCE_QP_STATE_RTS] = HNS_ROCE_CMD_RTR2RTS_QP, + }, + [HNS_ROCE_QP_STATE_RTS] = { + [HNS_ROCE_QP_STATE_RST] = HNS_ROCE_CMD_2RST_QP, + [HNS_ROCE_QP_STATE_ERR] = HNS_ROCE_CMD_2ERR_QP, + [HNS_ROCE_QP_STATE_RTS] = HNS_ROCE_CMD_RTS2RTS_QP, + [HNS_ROCE_QP_STATE_SQD] = HNS_ROCE_CMD_RTS2SQD_QP, + }, + [HNS_ROCE_QP_STATE_SQD] = { + [HNS_ROCE_QP_STATE_RST] = HNS_ROCE_CMD_2RST_QP, + [HNS_ROCE_QP_STATE_ERR] = HNS_ROCE_CMD_2ERR_QP, + [HNS_ROCE_QP_STATE_RTS] = HNS_ROCE_CMD_SQD2RTS_QP, + [HNS_ROCE_QP_STATE_SQD] = HNS_ROCE_CMD_SQD2SQD_QP, + }, + [HNS_ROCE_QP_STATE_ERR] = { + [HNS_ROCE_QP_STATE_RST] = HNS_ROCE_CMD_2RST_QP, + [HNS_ROCE_QP_STATE_ERR] = HNS_ROCE_CMD_2ERR_QP, + } + }; + + struct hns_roce_cmd_mailbox *mailbox; + struct device *dev = &hr_dev->pdev->dev; + int ret = 0; + + if (cur_state >= HNS_ROCE_QP_NUM_STATE || + new_state >= HNS_ROCE_QP_NUM_STATE || + !op[cur_state][new_state]) { + dev_err(dev, "[modify_qp]not support state %d to %d\n", + cur_state, new_state); + return -EINVAL; + } + + if (op[cur_state][new_state] == HNS_ROCE_CMD_2RST_QP) + return hns_roce_cmd_mbox(hr_dev, 0, 0, hr_qp->qpn, 2, + HNS_ROCE_CMD_2RST_QP, + HNS_ROCE_CMD_TIMEOUT_MSECS); + + if (op[cur_state][new_state] == HNS_ROCE_CMD_2ERR_QP) + return hns_roce_cmd_mbox(hr_dev, 0, 0, hr_qp->qpn, 2, + HNS_ROCE_CMD_2ERR_QP, + HNS_ROCE_CMD_TIMEOUT_MSECS); + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, context, sizeof(*context)); + + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_qp->qpn, 0, + op[cur_state][new_state], + HNS_ROCE_CMD_TIMEOUT_MSECS); + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + +static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, + int attr_mask, enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_sqp_context *context; + struct device *dev = &hr_dev->pdev->dev; + dma_addr_t dma_handle = 0; + int rq_pa_start; + u32 reg_val; + u64 *mtts; + u32 __iomem *addr; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + /* Search QP buf's MTTs */ + mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table, + hr_qp->mtt.first_seg, &dma_handle); + if (!mtts) { + dev_err(dev, "qp buf pa find failed\n"); + goto out; + } + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + roce_set_field(context->qp1c_bytes_4, + QP1C_BYTES_4_SQ_WQE_SHIFT_M, + QP1C_BYTES_4_SQ_WQE_SHIFT_S, + ilog2((unsigned int)hr_qp->sq.wqe_cnt)); + roce_set_field(context->qp1c_bytes_4, + QP1C_BYTES_4_RQ_WQE_SHIFT_M, + QP1C_BYTES_4_RQ_WQE_SHIFT_S, + ilog2((unsigned int)hr_qp->rq.wqe_cnt)); + roce_set_field(context->qp1c_bytes_4, QP1C_BYTES_4_PD_M, + QP1C_BYTES_4_PD_S, to_hr_pd(ibqp->pd)->pdn); + + context->sq_rq_bt_l = (u32)(dma_handle); + roce_set_field(context->qp1c_bytes_12, + QP1C_BYTES_12_SQ_RQ_BT_H_M, + QP1C_BYTES_12_SQ_RQ_BT_H_S, + ((u32)(dma_handle >> 32))); + + roce_set_field(context->qp1c_bytes_16, QP1C_BYTES_16_RQ_HEAD_M, + 
QP1C_BYTES_16_RQ_HEAD_S, hr_qp->rq.head); + roce_set_field(context->qp1c_bytes_16, QP1C_BYTES_16_PORT_NUM_M, + QP1C_BYTES_16_PORT_NUM_S, hr_qp->phy_port); + roce_set_bit(context->qp1c_bytes_16, + QP1C_BYTES_16_SIGNALING_TYPE_S, + hr_qp->sq_signal_bits); + roce_set_bit(context->qp1c_bytes_16, QP1C_BYTES_16_RQ_BA_FLG_S, + 1); + roce_set_bit(context->qp1c_bytes_16, QP1C_BYTES_16_SQ_BA_FLG_S, + 1); + roce_set_bit(context->qp1c_bytes_16, QP1C_BYTES_16_QP1_ERR_S, + 0); + + roce_set_field(context->qp1c_bytes_20, QP1C_BYTES_20_SQ_HEAD_M, + QP1C_BYTES_20_SQ_HEAD_S, hr_qp->sq.head); + roce_set_field(context->qp1c_bytes_20, QP1C_BYTES_20_PKEY_IDX_M, + QP1C_BYTES_20_PKEY_IDX_S, attr->pkey_index); + + rq_pa_start = (u32)hr_qp->rq.offset / PAGE_SIZE; + context->cur_rq_wqe_ba_l = (u32)(mtts[rq_pa_start]); + + roce_set_field(context->qp1c_bytes_28, + QP1C_BYTES_28_CUR_RQ_WQE_BA_H_M, + QP1C_BYTES_28_CUR_RQ_WQE_BA_H_S, + (mtts[rq_pa_start]) >> 32); + roce_set_field(context->qp1c_bytes_28, + QP1C_BYTES_28_RQ_CUR_IDX_M, + QP1C_BYTES_28_RQ_CUR_IDX_S, 0); + + roce_set_field(context->qp1c_bytes_32, + QP1C_BYTES_32_RX_CQ_NUM_M, + QP1C_BYTES_32_RX_CQ_NUM_S, + to_hr_cq(ibqp->recv_cq)->cqn); + roce_set_field(context->qp1c_bytes_32, + QP1C_BYTES_32_TX_CQ_NUM_M, + QP1C_BYTES_32_TX_CQ_NUM_S, + to_hr_cq(ibqp->send_cq)->cqn); + + context->cur_sq_wqe_ba_l = (u32)mtts[0]; + + roce_set_field(context->qp1c_bytes_40, + QP1C_BYTES_40_CUR_SQ_WQE_BA_H_M, + QP1C_BYTES_40_CUR_SQ_WQE_BA_H_S, + (mtts[0]) >> 32); + roce_set_field(context->qp1c_bytes_40, + QP1C_BYTES_40_SQ_CUR_IDX_M, + QP1C_BYTES_40_SQ_CUR_IDX_S, 0); + + /* Copy context to QP1C register */ + addr = (u32 __iomem *)(hr_dev->reg_base + + ROCEE_QP1C_CFG0_0_REG + + hr_qp->phy_port * sizeof(*context)); + + writel(context->qp1c_bytes_4, addr); + writel(context->sq_rq_bt_l, addr + 1); + writel(context->qp1c_bytes_12, addr + 2); + writel(context->qp1c_bytes_16, addr + 3); + writel(context->qp1c_bytes_20, addr + 4); + writel(context->cur_rq_wqe_ba_l, addr + 5); + writel(context->qp1c_bytes_28, addr + 6); + writel(context->qp1c_bytes_32, addr + 7); + writel(context->cur_sq_wqe_ba_l, addr + 8); + writel(context->qp1c_bytes_40, addr + 9); + } + + /* Modify QP1C status */ + reg_val = roce_read(hr_dev, ROCEE_QP1C_CFG0_0_REG + + hr_qp->phy_port * sizeof(*context)); + roce_set_field(reg_val, ROCEE_QP1C_CFG0_0_ROCEE_QP1C_QP_ST_M, + ROCEE_QP1C_CFG0_0_ROCEE_QP1C_QP_ST_S, new_state); + roce_write(hr_dev, ROCEE_QP1C_CFG0_0_REG + + hr_qp->phy_port * sizeof(*context), reg_val); + + hr_qp->state = new_state; + if (new_state == IB_QPS_RESET) { + hns_roce_v1_cq_clean(to_hr_cq(ibqp->recv_cq), hr_qp->qpn, + ibqp->srq ? 
to_hr_srq(ibqp->srq) : NULL); + if (ibqp->send_cq != ibqp->recv_cq) + hns_roce_v1_cq_clean(to_hr_cq(ibqp->send_cq), + hr_qp->qpn, NULL); + + hr_qp->rq.head = 0; + hr_qp->rq.tail = 0; + hr_qp->sq.head = 0; + hr_qp->sq.tail = 0; + hr_qp->sq_next_wqe = 0; + } + + kfree(context); + return 0; + +out: + kfree(context); + return -EINVAL; +} + +static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, + int attr_mask, enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_qp_context *context; + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + dma_addr_t dma_handle_2 = 0; + dma_addr_t dma_handle = 0; + uint32_t doorbell[2] = {0}; + int rq_pa_start = 0; + u64 *mtts_2 = NULL; + int ret = -EINVAL; + u64 *mtts = NULL; + int port; + u8 port_num; + u8 *dmac; + u8 *smac; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + /* Search qp buf's mtts */ + mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table, + hr_qp->mtt.first_seg, &dma_handle); + if (mtts == NULL) { + dev_err(dev, "qp buf pa find failed\n"); + goto out; + } + + /* Search IRRL's mtts */ + mtts_2 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table, + hr_qp->qpn, &dma_handle_2); + if (mtts_2 == NULL) { + dev_err(dev, "qp irrl_table find failed\n"); + goto out; + } + + /* + * Reset to init + * Mandatory param: + * IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS + * Optional param: NA + */ + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + roce_set_field(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTES_4_TRANSPORT_SERVICE_TYPE_M, + QP_CONTEXT_QPC_BYTES_4_TRANSPORT_SERVICE_TYPE_S, + to_hr_qp_type(hr_qp->ibqp.qp_type)); + + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_ENABLE_FPMR_S, 0); + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_RDMA_READ_ENABLE_S, + !!(attr->qp_access_flags & IB_ACCESS_REMOTE_READ)); + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_RDMA_WRITE_ENABLE_S, + !!(attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + ); + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_ATOMIC_OPERATION_ENABLE_S, + !!(attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) + ); + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_RDMAR_USE_S, 1); + roce_set_field(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTES_4_SQ_WQE_SHIFT_M, + QP_CONTEXT_QPC_BYTES_4_SQ_WQE_SHIFT_S, + ilog2((unsigned int)hr_qp->sq.wqe_cnt)); + roce_set_field(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTES_4_RQ_WQE_SHIFT_M, + QP_CONTEXT_QPC_BYTES_4_RQ_WQE_SHIFT_S, + ilog2((unsigned int)hr_qp->rq.wqe_cnt)); + roce_set_field(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTES_4_PD_M, + QP_CONTEXT_QPC_BYTES_4_PD_S, + to_hr_pd(ibqp->pd)->pdn); + hr_qp->access_flags = attr->qp_access_flags; + roce_set_field(context->qpc_bytes_8, + QP_CONTEXT_QPC_BYTES_8_TX_COMPLETION_M, + QP_CONTEXT_QPC_BYTES_8_TX_COMPLETION_S, + to_hr_cq(ibqp->send_cq)->cqn); + roce_set_field(context->qpc_bytes_8, + QP_CONTEXT_QPC_BYTES_8_RX_COMPLETION_M, + QP_CONTEXT_QPC_BYTES_8_RX_COMPLETION_S, + to_hr_cq(ibqp->recv_cq)->cqn); + + if (ibqp->srq) + roce_set_field(context->qpc_bytes_12, + QP_CONTEXT_QPC_BYTES_12_SRQ_NUMBER_M, + QP_CONTEXT_QPC_BYTES_12_SRQ_NUMBER_S, + to_hr_srq(ibqp->srq)->srqn); + + roce_set_field(context->qpc_bytes_12, + QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_M, + 
QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_S, + attr->pkey_index); + hr_qp->pkey_index = attr->pkey_index; + roce_set_field(context->qpc_bytes_16, + QP_CONTEXT_QPC_BYTES_16_QP_NUM_M, + QP_CONTEXT_QPC_BYTES_16_QP_NUM_S, hr_qp->qpn); + + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { + roce_set_field(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTES_4_TRANSPORT_SERVICE_TYPE_M, + QP_CONTEXT_QPC_BYTES_4_TRANSPORT_SERVICE_TYPE_S, + to_hr_qp_type(hr_qp->ibqp.qp_type)); + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_ENABLE_FPMR_S, 0); + if (attr_mask & IB_QP_ACCESS_FLAGS) { + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_RDMA_READ_ENABLE_S, + !!(attr->qp_access_flags & + IB_ACCESS_REMOTE_READ)); + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_RDMA_WRITE_ENABLE_S, + !!(attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE)); + } else { + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_RDMA_READ_ENABLE_S, + !!(hr_qp->access_flags & + IB_ACCESS_REMOTE_READ)); + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_RDMA_WRITE_ENABLE_S, + !!(hr_qp->access_flags & + IB_ACCESS_REMOTE_WRITE)); + } + + roce_set_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_RDMAR_USE_S, 1); + roce_set_field(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTES_4_SQ_WQE_SHIFT_M, + QP_CONTEXT_QPC_BYTES_4_SQ_WQE_SHIFT_S, + ilog2((unsigned int)hr_qp->sq.wqe_cnt)); + roce_set_field(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTES_4_RQ_WQE_SHIFT_M, + QP_CONTEXT_QPC_BYTES_4_RQ_WQE_SHIFT_S, + ilog2((unsigned int)hr_qp->rq.wqe_cnt)); + roce_set_field(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTES_4_PD_M, + QP_CONTEXT_QPC_BYTES_4_PD_S, + to_hr_pd(ibqp->pd)->pdn); + + roce_set_field(context->qpc_bytes_8, + QP_CONTEXT_QPC_BYTES_8_TX_COMPLETION_M, + QP_CONTEXT_QPC_BYTES_8_TX_COMPLETION_S, + to_hr_cq(ibqp->send_cq)->cqn); + roce_set_field(context->qpc_bytes_8, + QP_CONTEXT_QPC_BYTES_8_RX_COMPLETION_M, + QP_CONTEXT_QPC_BYTES_8_RX_COMPLETION_S, + to_hr_cq(ibqp->recv_cq)->cqn); + + if (ibqp->srq) + roce_set_field(context->qpc_bytes_12, + QP_CONTEXT_QPC_BYTES_12_SRQ_NUMBER_M, + QP_CONTEXT_QPC_BYTES_12_SRQ_NUMBER_S, + to_hr_srq(ibqp->srq)->srqn); + if (attr_mask & IB_QP_PKEY_INDEX) + roce_set_field(context->qpc_bytes_12, + QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_M, + QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_S, + attr->pkey_index); + else + roce_set_field(context->qpc_bytes_12, + QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_M, + QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_S, + hr_qp->pkey_index); + + roce_set_field(context->qpc_bytes_16, + QP_CONTEXT_QPC_BYTES_16_QP_NUM_M, + QP_CONTEXT_QPC_BYTES_16_QP_NUM_S, hr_qp->qpn); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + if ((attr_mask & IB_QP_ALT_PATH) || + (attr_mask & IB_QP_ACCESS_FLAGS) || + (attr_mask & IB_QP_PKEY_INDEX) || + (attr_mask & IB_QP_QKEY)) { + dev_err(dev, "INIT2RTR attr_mask error\n"); + goto out; + } + + dmac = (u8 *)attr->ah_attr.roce.dmac; + + context->sq_rq_bt_l = (u32)(dma_handle); + roce_set_field(context->qpc_bytes_24, + QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_M, + QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_S, + ((u32)(dma_handle >> 32))); + roce_set_bit(context->qpc_bytes_24, + QP_CONTEXT_QPC_BYTE_24_REMOTE_ENABLE_E2E_CREDITS_S, + 1); + roce_set_field(context->qpc_bytes_24, + QP_CONTEXT_QPC_BYTES_24_MINIMUM_RNR_NAK_TIMER_M, + QP_CONTEXT_QPC_BYTES_24_MINIMUM_RNR_NAK_TIMER_S, + attr->min_rnr_timer); + context->irrl_ba_l = (u32)(dma_handle_2); + roce_set_field(context->qpc_bytes_32, + QP_CONTEXT_QPC_BYTES_32_IRRL_BA_H_M, + 
QP_CONTEXT_QPC_BYTES_32_IRRL_BA_H_S, + ((u32)(dma_handle_2 >> 32)) & + QP_CONTEXT_QPC_BYTES_32_IRRL_BA_H_M); + roce_set_field(context->qpc_bytes_32, + QP_CONTEXT_QPC_BYTES_32_MIG_STATE_M, + QP_CONTEXT_QPC_BYTES_32_MIG_STATE_S, 0); + roce_set_bit(context->qpc_bytes_32, + QP_CONTEXT_QPC_BYTE_32_LOCAL_ENABLE_E2E_CREDITS_S, + 1); + roce_set_bit(context->qpc_bytes_32, + QP_CONTEXT_QPC_BYTE_32_SIGNALING_TYPE_S, + hr_qp->sq_signal_bits); + + port = (attr_mask & IB_QP_PORT) ? (attr->port_num - 1) : + hr_qp->port; + smac = (u8 *)hr_dev->dev_addr[port]; + /* when dmac equals smac or loop_idc is 1, it should loopback */ + if (ether_addr_equal_unaligned(dmac, smac) || + hr_dev->loop_idc == 0x1) + roce_set_bit(context->qpc_bytes_32, + QP_CONTEXT_QPC_BYTE_32_LOOPBACK_INDICATOR_S, 1); + + roce_set_bit(context->qpc_bytes_32, + QP_CONTEXT_QPC_BYTE_32_GLOBAL_HEADER_S, + rdma_ah_get_ah_flags(&attr->ah_attr)); + roce_set_field(context->qpc_bytes_32, + QP_CONTEXT_QPC_BYTES_32_RESPONDER_RESOURCES_M, + QP_CONTEXT_QPC_BYTES_32_RESPONDER_RESOURCES_S, + ilog2((unsigned int)attr->max_dest_rd_atomic)); + + if (attr_mask & IB_QP_DEST_QPN) + roce_set_field(context->qpc_bytes_36, + QP_CONTEXT_QPC_BYTES_36_DEST_QP_M, + QP_CONTEXT_QPC_BYTES_36_DEST_QP_S, + attr->dest_qp_num); + + /* Configure GID index */ + port_num = rdma_ah_get_port_num(&attr->ah_attr); + roce_set_field(context->qpc_bytes_36, + QP_CONTEXT_QPC_BYTES_36_SGID_INDEX_M, + QP_CONTEXT_QPC_BYTES_36_SGID_INDEX_S, + hns_get_gid_index(hr_dev, + port_num - 1, + grh->sgid_index)); + + memcpy(&(context->dmac_l), dmac, 4); + + roce_set_field(context->qpc_bytes_44, + QP_CONTEXT_QPC_BYTES_44_DMAC_H_M, + QP_CONTEXT_QPC_BYTES_44_DMAC_H_S, + *((u16 *)(&dmac[4]))); + roce_set_field(context->qpc_bytes_44, + QP_CONTEXT_QPC_BYTES_44_MAXIMUM_STATIC_RATE_M, + QP_CONTEXT_QPC_BYTES_44_MAXIMUM_STATIC_RATE_S, + rdma_ah_get_static_rate(&attr->ah_attr)); + roce_set_field(context->qpc_bytes_44, + QP_CONTEXT_QPC_BYTES_44_HOPLMT_M, + QP_CONTEXT_QPC_BYTES_44_HOPLMT_S, + grh->hop_limit); + + roce_set_field(context->qpc_bytes_48, + QP_CONTEXT_QPC_BYTES_48_FLOWLABEL_M, + QP_CONTEXT_QPC_BYTES_48_FLOWLABEL_S, + grh->flow_label); + roce_set_field(context->qpc_bytes_48, + QP_CONTEXT_QPC_BYTES_48_TCLASS_M, + QP_CONTEXT_QPC_BYTES_48_TCLASS_S, + grh->traffic_class); + roce_set_field(context->qpc_bytes_48, + QP_CONTEXT_QPC_BYTES_48_MTU_M, + QP_CONTEXT_QPC_BYTES_48_MTU_S, attr->path_mtu); + + memcpy(context->dgid, grh->dgid.raw, + sizeof(grh->dgid.raw)); + + dev_dbg(dev, "dmac:%x :%lx\n", context->dmac_l, + roce_get_field(context->qpc_bytes_44, + QP_CONTEXT_QPC_BYTES_44_DMAC_H_M, + QP_CONTEXT_QPC_BYTES_44_DMAC_H_S)); + + roce_set_field(context->qpc_bytes_68, + QP_CONTEXT_QPC_BYTES_68_RQ_HEAD_M, + QP_CONTEXT_QPC_BYTES_68_RQ_HEAD_S, + hr_qp->rq.head); + roce_set_field(context->qpc_bytes_68, + QP_CONTEXT_QPC_BYTES_68_RQ_CUR_INDEX_M, + QP_CONTEXT_QPC_BYTES_68_RQ_CUR_INDEX_S, 0); + + rq_pa_start = (u32)hr_qp->rq.offset / PAGE_SIZE; + context->cur_rq_wqe_ba_l = (u32)(mtts[rq_pa_start]); + + roce_set_field(context->qpc_bytes_76, + QP_CONTEXT_QPC_BYTES_76_CUR_RQ_WQE_BA_H_M, + QP_CONTEXT_QPC_BYTES_76_CUR_RQ_WQE_BA_H_S, + mtts[rq_pa_start] >> 32); + roce_set_field(context->qpc_bytes_76, + QP_CONTEXT_QPC_BYTES_76_RX_REQ_MSN_M, + QP_CONTEXT_QPC_BYTES_76_RX_REQ_MSN_S, 0); + + context->rx_rnr_time = 0; + + roce_set_field(context->qpc_bytes_84, + QP_CONTEXT_QPC_BYTES_84_LAST_ACK_PSN_M, + QP_CONTEXT_QPC_BYTES_84_LAST_ACK_PSN_S, + attr->rq_psn - 1); + roce_set_field(context->qpc_bytes_84, + 
QP_CONTEXT_QPC_BYTES_84_TRRL_HEAD_M, + QP_CONTEXT_QPC_BYTES_84_TRRL_HEAD_S, 0); + + roce_set_field(context->qpc_bytes_88, + QP_CONTEXT_QPC_BYTES_88_RX_REQ_EPSN_M, + QP_CONTEXT_QPC_BYTES_88_RX_REQ_EPSN_S, + attr->rq_psn); + roce_set_bit(context->qpc_bytes_88, + QP_CONTEXT_QPC_BYTES_88_RX_REQ_PSN_ERR_FLAG_S, 0); + roce_set_bit(context->qpc_bytes_88, + QP_CONTEXT_QPC_BYTES_88_RX_LAST_OPCODE_FLG_S, 0); + roce_set_field(context->qpc_bytes_88, + QP_CONTEXT_QPC_BYTES_88_RQ_REQ_LAST_OPERATION_TYPE_M, + QP_CONTEXT_QPC_BYTES_88_RQ_REQ_LAST_OPERATION_TYPE_S, + 0); + roce_set_field(context->qpc_bytes_88, + QP_CONTEXT_QPC_BYTES_88_RQ_REQ_RDMA_WR_FLAG_M, + QP_CONTEXT_QPC_BYTES_88_RQ_REQ_RDMA_WR_FLAG_S, + 0); + + context->dma_length = 0; + context->r_key = 0; + context->va_l = 0; + context->va_h = 0; + + roce_set_field(context->qpc_bytes_108, + QP_CONTEXT_QPC_BYTES_108_TRRL_SDB_PSN_M, + QP_CONTEXT_QPC_BYTES_108_TRRL_SDB_PSN_S, 0); + roce_set_bit(context->qpc_bytes_108, + QP_CONTEXT_QPC_BYTES_108_TRRL_SDB_PSN_FLG_S, 0); + roce_set_bit(context->qpc_bytes_108, + QP_CONTEXT_QPC_BYTES_108_TRRL_TDB_PSN_FLG_S, 0); + + roce_set_field(context->qpc_bytes_112, + QP_CONTEXT_QPC_BYTES_112_TRRL_TDB_PSN_M, + QP_CONTEXT_QPC_BYTES_112_TRRL_TDB_PSN_S, 0); + roce_set_field(context->qpc_bytes_112, + QP_CONTEXT_QPC_BYTES_112_TRRL_TAIL_M, + QP_CONTEXT_QPC_BYTES_112_TRRL_TAIL_S, 0); + + /* For chip resp ack */ + roce_set_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_PORT_NUM_M, + QP_CONTEXT_QPC_BYTES_156_PORT_NUM_S, + hr_qp->phy_port); + roce_set_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_SL_M, + QP_CONTEXT_QPC_BYTES_156_SL_S, + rdma_ah_get_sl(&attr->ah_attr)); + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + } else if (cur_state == IB_QPS_RTR && + new_state == IB_QPS_RTS) { + /* If exist optional param, return error */ + if ((attr_mask & IB_QP_ALT_PATH) || + (attr_mask & IB_QP_ACCESS_FLAGS) || + (attr_mask & IB_QP_QKEY) || + (attr_mask & IB_QP_PATH_MIG_STATE) || + (attr_mask & IB_QP_CUR_STATE) || + (attr_mask & IB_QP_MIN_RNR_TIMER)) { + dev_err(dev, "RTR2RTS attr_mask error\n"); + goto out; + } + + context->rx_cur_sq_wqe_ba_l = (u32)(mtts[0]); + + roce_set_field(context->qpc_bytes_120, + QP_CONTEXT_QPC_BYTES_120_RX_CUR_SQ_WQE_BA_H_M, + QP_CONTEXT_QPC_BYTES_120_RX_CUR_SQ_WQE_BA_H_S, + (mtts[0]) >> 32); + + roce_set_field(context->qpc_bytes_124, + QP_CONTEXT_QPC_BYTES_124_RX_ACK_MSN_M, + QP_CONTEXT_QPC_BYTES_124_RX_ACK_MSN_S, 0); + roce_set_field(context->qpc_bytes_124, + QP_CONTEXT_QPC_BYTES_124_IRRL_MSG_IDX_M, + QP_CONTEXT_QPC_BYTES_124_IRRL_MSG_IDX_S, 0); + + roce_set_field(context->qpc_bytes_128, + QP_CONTEXT_QPC_BYTES_128_RX_ACK_EPSN_M, + QP_CONTEXT_QPC_BYTES_128_RX_ACK_EPSN_S, + attr->sq_psn); + roce_set_bit(context->qpc_bytes_128, + QP_CONTEXT_QPC_BYTES_128_RX_ACK_PSN_ERR_FLG_S, 0); + roce_set_field(context->qpc_bytes_128, + QP_CONTEXT_QPC_BYTES_128_ACK_LAST_OPERATION_TYPE_M, + QP_CONTEXT_QPC_BYTES_128_ACK_LAST_OPERATION_TYPE_S, + 0); + roce_set_bit(context->qpc_bytes_128, + QP_CONTEXT_QPC_BYTES_128_IRRL_PSN_VLD_FLG_S, 0); + + roce_set_field(context->qpc_bytes_132, + QP_CONTEXT_QPC_BYTES_132_IRRL_PSN_M, + QP_CONTEXT_QPC_BYTES_132_IRRL_PSN_S, 0); + roce_set_field(context->qpc_bytes_132, + QP_CONTEXT_QPC_BYTES_132_IRRL_TAIL_M, + QP_CONTEXT_QPC_BYTES_132_IRRL_TAIL_S, 0); + + roce_set_field(context->qpc_bytes_136, + QP_CONTEXT_QPC_BYTES_136_RETRY_MSG_PSN_M, + QP_CONTEXT_QPC_BYTES_136_RETRY_MSG_PSN_S, + attr->sq_psn); + roce_set_field(context->qpc_bytes_136, + 
QP_CONTEXT_QPC_BYTES_136_RETRY_MSG_FPKT_PSN_L_M, + QP_CONTEXT_QPC_BYTES_136_RETRY_MSG_FPKT_PSN_L_S, + attr->sq_psn); + + roce_set_field(context->qpc_bytes_140, + QP_CONTEXT_QPC_BYTES_140_RETRY_MSG_FPKT_PSN_H_M, + QP_CONTEXT_QPC_BYTES_140_RETRY_MSG_FPKT_PSN_H_S, + (attr->sq_psn >> SQ_PSN_SHIFT)); + roce_set_field(context->qpc_bytes_140, + QP_CONTEXT_QPC_BYTES_140_RETRY_MSG_MSN_M, + QP_CONTEXT_QPC_BYTES_140_RETRY_MSG_MSN_S, 0); + roce_set_bit(context->qpc_bytes_140, + QP_CONTEXT_QPC_BYTES_140_RNR_RETRY_FLG_S, 0); + + roce_set_field(context->qpc_bytes_148, + QP_CONTEXT_QPC_BYTES_148_CHECK_FLAG_M, + QP_CONTEXT_QPC_BYTES_148_CHECK_FLAG_S, 0); + roce_set_field(context->qpc_bytes_148, + QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_M, + QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_S, + attr->retry_cnt); + roce_set_field(context->qpc_bytes_148, + QP_CONTEXT_QPC_BYTES_148_RNR_RETRY_COUNT_M, + QP_CONTEXT_QPC_BYTES_148_RNR_RETRY_COUNT_S, + attr->rnr_retry); + roce_set_field(context->qpc_bytes_148, + QP_CONTEXT_QPC_BYTES_148_LSN_M, + QP_CONTEXT_QPC_BYTES_148_LSN_S, 0x100); + + context->rnr_retry = 0; + + roce_set_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_RETRY_COUNT_INIT_M, + QP_CONTEXT_QPC_BYTES_156_RETRY_COUNT_INIT_S, + attr->retry_cnt); + if (attr->timeout < 0x12) { + dev_info(dev, "ack timeout value(0x%x) must bigger than 0x12.\n", + attr->timeout); + roce_set_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_ACK_TIMEOUT_M, + QP_CONTEXT_QPC_BYTES_156_ACK_TIMEOUT_S, + 0x12); + } else { + roce_set_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_ACK_TIMEOUT_M, + QP_CONTEXT_QPC_BYTES_156_ACK_TIMEOUT_S, + attr->timeout); + } + roce_set_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_RNR_RETRY_COUNT_INIT_M, + QP_CONTEXT_QPC_BYTES_156_RNR_RETRY_COUNT_INIT_S, + attr->rnr_retry); + roce_set_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_PORT_NUM_M, + QP_CONTEXT_QPC_BYTES_156_PORT_NUM_S, + hr_qp->phy_port); + roce_set_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_SL_M, + QP_CONTEXT_QPC_BYTES_156_SL_S, + rdma_ah_get_sl(&attr->ah_attr)); + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + roce_set_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_INITIATOR_DEPTH_M, + QP_CONTEXT_QPC_BYTES_156_INITIATOR_DEPTH_S, + ilog2((unsigned int)attr->max_rd_atomic)); + roce_set_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_ACK_REQ_IND_M, + QP_CONTEXT_QPC_BYTES_156_ACK_REQ_IND_S, 0); + context->pkt_use_len = 0; + + roce_set_field(context->qpc_bytes_164, + QP_CONTEXT_QPC_BYTES_164_SQ_PSN_M, + QP_CONTEXT_QPC_BYTES_164_SQ_PSN_S, attr->sq_psn); + roce_set_field(context->qpc_bytes_164, + QP_CONTEXT_QPC_BYTES_164_IRRL_HEAD_M, + QP_CONTEXT_QPC_BYTES_164_IRRL_HEAD_S, 0); + + roce_set_field(context->qpc_bytes_168, + QP_CONTEXT_QPC_BYTES_168_RETRY_SQ_PSN_M, + QP_CONTEXT_QPC_BYTES_168_RETRY_SQ_PSN_S, + attr->sq_psn); + roce_set_field(context->qpc_bytes_168, + QP_CONTEXT_QPC_BYTES_168_SGE_USE_FLA_M, + QP_CONTEXT_QPC_BYTES_168_SGE_USE_FLA_S, 0); + roce_set_field(context->qpc_bytes_168, + QP_CONTEXT_QPC_BYTES_168_DB_TYPE_M, + QP_CONTEXT_QPC_BYTES_168_DB_TYPE_S, 0); + roce_set_bit(context->qpc_bytes_168, + QP_CONTEXT_QPC_BYTES_168_MSG_LP_IND_S, 0); + roce_set_bit(context->qpc_bytes_168, + QP_CONTEXT_QPC_BYTES_168_CSDB_LP_IND_S, 0); + roce_set_bit(context->qpc_bytes_168, + QP_CONTEXT_QPC_BYTES_168_QP_ERR_FLG_S, 0); + context->sge_use_len = 0; + + roce_set_field(context->qpc_bytes_176, + QP_CONTEXT_QPC_BYTES_176_DB_CUR_INDEX_M, + QP_CONTEXT_QPC_BYTES_176_DB_CUR_INDEX_S, 0); + 
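/* Reset the retry doorbell cursor and the SQ current index/head for the new send state */ +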
roce_set_field(context->qpc_bytes_176, + QP_CONTEXT_QPC_BYTES_176_RETRY_DB_CUR_INDEX_M, + QP_CONTEXT_QPC_BYTES_176_RETRY_DB_CUR_INDEX_S, + 0); + roce_set_field(context->qpc_bytes_180, + QP_CONTEXT_QPC_BYTES_180_SQ_CUR_INDEX_M, + QP_CONTEXT_QPC_BYTES_180_SQ_CUR_INDEX_S, 0); + roce_set_field(context->qpc_bytes_180, + QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_M, + QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_S, 0); + + context->tx_cur_sq_wqe_ba_l = (u32)(mtts[0]); + + roce_set_field(context->qpc_bytes_188, + QP_CONTEXT_QPC_BYTES_188_TX_CUR_SQ_WQE_BA_H_M, + QP_CONTEXT_QPC_BYTES_188_TX_CUR_SQ_WQE_BA_H_S, + (mtts[0]) >> 32); + roce_set_bit(context->qpc_bytes_188, + QP_CONTEXT_QPC_BYTES_188_PKT_RETRY_FLG_S, 0); + roce_set_field(context->qpc_bytes_188, + QP_CONTEXT_QPC_BYTES_188_TX_RETRY_CUR_INDEX_M, + QP_CONTEXT_QPC_BYTES_188_TX_RETRY_CUR_INDEX_S, + 0); + } else if (!((cur_state == IB_QPS_INIT && new_state == IB_QPS_RESET) || + (cur_state == IB_QPS_INIT && new_state == IB_QPS_ERR) || + (cur_state == IB_QPS_RTR && new_state == IB_QPS_RESET) || + (cur_state == IB_QPS_RTR && new_state == IB_QPS_ERR) || + (cur_state == IB_QPS_RTS && new_state == IB_QPS_RESET) || + (cur_state == IB_QPS_RTS && new_state == IB_QPS_ERR) || + (cur_state == IB_QPS_ERR && new_state == IB_QPS_RESET) || + (cur_state == IB_QPS_ERR && new_state == IB_QPS_ERR))) { + dev_err(dev, "not support this status migration\n"); + goto out; + } + + /* Every status migrate must change state */ + roce_set_field(context->qpc_bytes_144, + QP_CONTEXT_QPC_BYTES_144_QP_STATE_M, + QP_CONTEXT_QPC_BYTES_144_QP_STATE_S, new_state); + + /* SW pass context to HW */ + ret = hns_roce_v1_qp_modify(hr_dev, &hr_qp->mtt, + to_hns_roce_state(cur_state), + to_hns_roce_state(new_state), context, + hr_qp); + if (ret) { + dev_err(dev, "hns_roce_qp_modify failed\n"); + goto out; + } + + /* + * Use rst2init to instead of init2init with drv, + * need to hw to flash RQ HEAD by DB again + */ + if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { + /* Memory barrier */ + wmb(); + + roce_set_field(doorbell[0], RQ_DOORBELL_U32_4_RQ_HEAD_M, + RQ_DOORBELL_U32_4_RQ_HEAD_S, hr_qp->rq.head); + roce_set_field(doorbell[1], RQ_DOORBELL_U32_8_QPN_M, + RQ_DOORBELL_U32_8_QPN_S, hr_qp->qpn); + roce_set_field(doorbell[1], RQ_DOORBELL_U32_8_CMD_M, + RQ_DOORBELL_U32_8_CMD_S, 1); + roce_set_bit(doorbell[1], RQ_DOORBELL_U32_8_HW_SYNC_S, 1); + + if (ibqp->uobject) { + hr_qp->rq.db_reg_l = hr_dev->reg_base + + hr_dev->odb_offset + + DB_REG_OFFSET * hr_dev->priv_uar.index; + } + + hns_roce_write64_k(doorbell, hr_qp->rq.db_reg_l); + } + + hr_qp->state = new_state; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + hr_qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) { + hr_qp->port = attr->port_num - 1; + hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port]; + } + + if (new_state == IB_QPS_RESET && !ibqp->uobject) { + hns_roce_v1_cq_clean(to_hr_cq(ibqp->recv_cq), hr_qp->qpn, + ibqp->srq ? 
to_hr_srq(ibqp->srq) : NULL); + if (ibqp->send_cq != ibqp->recv_cq) + hns_roce_v1_cq_clean(to_hr_cq(ibqp->send_cq), + hr_qp->qpn, NULL); + + hr_qp->rq.head = 0; + hr_qp->rq.tail = 0; + hr_qp->sq.head = 0; + hr_qp->sq.tail = 0; + hr_qp->sq_next_wqe = 0; + } +out: + kfree(context); + return ret; +} + +static int hns_roce_v1_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) + return hns_roce_v1_m_sqp(ibqp, attr, attr_mask, cur_state, + new_state); + else + return hns_roce_v1_m_qp(ibqp, attr, attr_mask, cur_state, + new_state); +} + +static enum ib_qp_state to_ib_qp_state(enum hns_roce_qp_state state) +{ + switch (state) { + case HNS_ROCE_QP_STATE_RST: + return IB_QPS_RESET; + case HNS_ROCE_QP_STATE_INIT: + return IB_QPS_INIT; + case HNS_ROCE_QP_STATE_RTR: + return IB_QPS_RTR; + case HNS_ROCE_QP_STATE_RTS: + return IB_QPS_RTS; + case HNS_ROCE_QP_STATE_SQD: + return IB_QPS_SQD; + case HNS_ROCE_QP_STATE_ERR: + return IB_QPS_ERR; + default: + return IB_QPS_ERR; + } +} + +static int hns_roce_v1_query_qpc(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_qp_context *hr_context) +{ + struct hns_roce_cmd_mailbox *mailbox; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, hr_qp->qpn, 0, + HNS_ROCE_CMD_QUERY_QP, + HNS_ROCE_CMD_TIMEOUT_MSECS); + if (!ret) + memcpy(hr_context, mailbox->buf, sizeof(*hr_context)); + else + dev_err(&hr_dev->pdev->dev, "QUERY QP cmd process error\n"); + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return ret; +} + +static int hns_roce_v1_q_sqp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_sqp_context context; + u32 addr; + + mutex_lock(&hr_qp->mutex); + + if (hr_qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + addr = ROCEE_QP1C_CFG0_0_REG + + hr_qp->port * sizeof(struct hns_roce_sqp_context); + context.qp1c_bytes_4 = roce_read(hr_dev, addr); + context.sq_rq_bt_l = roce_read(hr_dev, addr + 1); + context.qp1c_bytes_12 = roce_read(hr_dev, addr + 2); + context.qp1c_bytes_16 = roce_read(hr_dev, addr + 3); + context.qp1c_bytes_20 = roce_read(hr_dev, addr + 4); + context.cur_rq_wqe_ba_l = roce_read(hr_dev, addr + 5); + context.qp1c_bytes_28 = roce_read(hr_dev, addr + 6); + context.qp1c_bytes_32 = roce_read(hr_dev, addr + 7); + context.cur_sq_wqe_ba_l = roce_read(hr_dev, addr + 8); + context.qp1c_bytes_40 = roce_read(hr_dev, addr + 9); + + hr_qp->state = roce_get_field(context.qp1c_bytes_4, + QP1C_BYTES_4_QP_STATE_M, + QP1C_BYTES_4_QP_STATE_S); + qp_attr->qp_state = hr_qp->state; + qp_attr->path_mtu = IB_MTU_256; + qp_attr->path_mig_state = IB_MIG_ARMED; + qp_attr->qkey = QKEY_VAL; + qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; + qp_attr->rq_psn = 0; + qp_attr->sq_psn = 0; + qp_attr->dest_qp_num = 1; + qp_attr->qp_access_flags = 6; + + qp_attr->pkey_index = roce_get_field(context.qp1c_bytes_20, + QP1C_BYTES_20_PKEY_IDX_M, + QP1C_BYTES_20_PKEY_IDX_S); + qp_attr->port_num = hr_qp->port + 1; + qp_attr->sq_draining = 0; + qp_attr->max_rd_atomic = 0; + qp_attr->max_dest_rd_atomic = 0; + qp_attr->min_rnr_timer = 0; + qp_attr->timeout = 0; + qp_attr->retry_cnt = 0; + 
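/* rnr_retry and alt_timeout are likewise reported as zero for the special QP */ +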
qp_attr->rnr_retry = 0; + qp_attr->alt_timeout = 0; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = hr_qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = hr_qp->rq.max_gs; + qp_attr->cap.max_send_wr = hr_qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = hr_qp->sq.max_gs; + qp_attr->cap.max_inline_data = 0; + qp_init_attr->cap = qp_attr->cap; + qp_init_attr->create_flags = 0; + + mutex_unlock(&hr_qp->mutex); + + return 0; +} + +static int hns_roce_v1_q_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_qp_context *context; + int tmp_qp_state = 0; + int ret = 0; + int state; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + mutex_lock(&hr_qp->mutex); + + if (hr_qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + ret = hns_roce_v1_query_qpc(hr_dev, hr_qp, context); + if (ret) { + dev_err(dev, "query qpc error\n"); + ret = -EINVAL; + goto out; + } + + state = roce_get_field(context->qpc_bytes_144, + QP_CONTEXT_QPC_BYTES_144_QP_STATE_M, + QP_CONTEXT_QPC_BYTES_144_QP_STATE_S); + tmp_qp_state = (int)to_ib_qp_state((enum hns_roce_qp_state)state); + if (tmp_qp_state == -1) { + dev_err(dev, "to_ib_qp_state error\n"); + ret = -EINVAL; + goto out; + } + hr_qp->state = (u8)tmp_qp_state; + qp_attr->qp_state = (enum ib_qp_state)hr_qp->state; + qp_attr->path_mtu = (enum ib_mtu)roce_get_field(context->qpc_bytes_48, + QP_CONTEXT_QPC_BYTES_48_MTU_M, + QP_CONTEXT_QPC_BYTES_48_MTU_S); + qp_attr->path_mig_state = IB_MIG_ARMED; + qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; + if (hr_qp->ibqp.qp_type == IB_QPT_UD) + qp_attr->qkey = QKEY_VAL; + + qp_attr->rq_psn = roce_get_field(context->qpc_bytes_88, + QP_CONTEXT_QPC_BYTES_88_RX_REQ_EPSN_M, + QP_CONTEXT_QPC_BYTES_88_RX_REQ_EPSN_S); + qp_attr->sq_psn = (u32)roce_get_field(context->qpc_bytes_164, + QP_CONTEXT_QPC_BYTES_164_SQ_PSN_M, + QP_CONTEXT_QPC_BYTES_164_SQ_PSN_S); + qp_attr->dest_qp_num = (u8)roce_get_field(context->qpc_bytes_36, + QP_CONTEXT_QPC_BYTES_36_DEST_QP_M, + QP_CONTEXT_QPC_BYTES_36_DEST_QP_S); + qp_attr->qp_access_flags = ((roce_get_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_RDMA_READ_ENABLE_S)) << 2) | + ((roce_get_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_RDMA_WRITE_ENABLE_S)) << 1) | + ((roce_get_bit(context->qpc_bytes_4, + QP_CONTEXT_QPC_BYTE_4_ATOMIC_OPERATION_ENABLE_S)) << 3); + + if (hr_qp->ibqp.qp_type == IB_QPT_RC || + hr_qp->ibqp.qp_type == IB_QPT_UC) { + struct ib_global_route *grh = + rdma_ah_retrieve_grh(&qp_attr->ah_attr); + + rdma_ah_set_sl(&qp_attr->ah_attr, + roce_get_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_SL_M, + QP_CONTEXT_QPC_BYTES_156_SL_S)); + rdma_ah_set_ah_flags(&qp_attr->ah_attr, IB_AH_GRH); + grh->flow_label = + roce_get_field(context->qpc_bytes_48, + QP_CONTEXT_QPC_BYTES_48_FLOWLABEL_M, + QP_CONTEXT_QPC_BYTES_48_FLOWLABEL_S); + grh->sgid_index = + roce_get_field(context->qpc_bytes_36, + QP_CONTEXT_QPC_BYTES_36_SGID_INDEX_M, + QP_CONTEXT_QPC_BYTES_36_SGID_INDEX_S); + grh->hop_limit = + roce_get_field(context->qpc_bytes_44, + QP_CONTEXT_QPC_BYTES_44_HOPLMT_M, + QP_CONTEXT_QPC_BYTES_44_HOPLMT_S); + grh->traffic_class = + roce_get_field(context->qpc_bytes_48, + 
QP_CONTEXT_QPC_BYTES_48_TCLASS_M, + QP_CONTEXT_QPC_BYTES_48_TCLASS_S); + + memcpy(grh->dgid.raw, context->dgid, + sizeof(grh->dgid.raw)); + } + + qp_attr->pkey_index = roce_get_field(context->qpc_bytes_12, + QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_M, + QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_S); + qp_attr->port_num = hr_qp->port + 1; + qp_attr->sq_draining = 0; + qp_attr->max_rd_atomic = 1 << roce_get_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_INITIATOR_DEPTH_M, + QP_CONTEXT_QPC_BYTES_156_INITIATOR_DEPTH_S); + qp_attr->max_dest_rd_atomic = 1 << roce_get_field(context->qpc_bytes_32, + QP_CONTEXT_QPC_BYTES_32_RESPONDER_RESOURCES_M, + QP_CONTEXT_QPC_BYTES_32_RESPONDER_RESOURCES_S); + qp_attr->min_rnr_timer = (u8)(roce_get_field(context->qpc_bytes_24, + QP_CONTEXT_QPC_BYTES_24_MINIMUM_RNR_NAK_TIMER_M, + QP_CONTEXT_QPC_BYTES_24_MINIMUM_RNR_NAK_TIMER_S)); + qp_attr->timeout = (u8)(roce_get_field(context->qpc_bytes_156, + QP_CONTEXT_QPC_BYTES_156_ACK_TIMEOUT_M, + QP_CONTEXT_QPC_BYTES_156_ACK_TIMEOUT_S)); + qp_attr->retry_cnt = roce_get_field(context->qpc_bytes_148, + QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_M, + QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_S); + qp_attr->rnr_retry = context->rnr_retry; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = hr_qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = hr_qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = hr_qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = hr_qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + qp_init_attr->cap = qp_attr->cap; + +out: + mutex_unlock(&hr_qp->mutex); + kfree(context); + return ret; +} + +static int hns_roce_v1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + + return hr_qp->doorbell_qpn <= 1 ? 
+ hns_roce_v1_q_sqp(ibqp, qp_attr, qp_attr_mask, qp_init_attr) : + hns_roce_v1_q_qp(ibqp, qp_attr, qp_attr_mask, qp_init_attr); +} + +static void hns_roce_check_sdb_status(struct hns_roce_dev *hr_dev, + u32 *old_send, u32 *old_retry, + u32 *tsp_st, u32 *success_flags) +{ + u32 sdb_retry_cnt; + u32 sdb_send_ptr; + u32 cur_cnt, old_cnt; + u32 send_ptr; + + sdb_send_ptr = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG); + sdb_retry_cnt = roce_read(hr_dev, ROCEE_SDB_RETRY_CNT_REG); + cur_cnt = roce_get_field(sdb_send_ptr, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) + + roce_get_field(sdb_retry_cnt, + ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M, + ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S); + if (!roce_get_bit(*tsp_st, ROCEE_CNT_CLR_CE_CNT_CLR_CE_S)) { + old_cnt = roce_get_field(*old_send, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) + + roce_get_field(*old_retry, + ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M, + ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S); + if (cur_cnt - old_cnt > SDB_ST_CMP_VAL) + *success_flags = 1; + } else { + old_cnt = roce_get_field(*old_send, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S); + if (cur_cnt - old_cnt > SDB_ST_CMP_VAL) { + *success_flags = 1; + } else { + send_ptr = roce_get_field(*old_send, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) + + roce_get_field(sdb_retry_cnt, + ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M, + ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S); + roce_set_field(*old_send, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S, + send_ptr); + } + } +} + +static int check_qp_db_process_status(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + u32 sdb_issue_ptr, + u32 *sdb_inv_cnt, + u32 *wait_stage) +{ + struct device *dev = &hr_dev->pdev->dev; + u32 sdb_send_ptr, old_send; + u32 success_flags = 0; + unsigned long end; + u32 old_retry; + u32 inv_cnt; + u32 tsp_st; + + if (*wait_stage > HNS_ROCE_V1_DB_STAGE2 || + *wait_stage < HNS_ROCE_V1_DB_STAGE1) { + dev_err(dev, "QP(0x%lx) db status wait stage(%d) error!\n", + hr_qp->qpn, *wait_stage); + return -EINVAL; + } + + /* Calculate the total timeout for the entire verification process */ + end = msecs_to_jiffies(HNS_ROCE_V1_CHECK_DB_TIMEOUT_MSECS) + jiffies; + + if (*wait_stage == HNS_ROCE_V1_DB_STAGE1) { + /* Query db process status, until hw process completely */ + sdb_send_ptr = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG); + while (roce_hw_index_cmp_lt(sdb_send_ptr, sdb_issue_ptr, + ROCEE_SDB_PTR_CMP_BITS)) { + if (!time_before(jiffies, end)) { + dev_dbg(dev, "QP(0x%lx) db process stage1 timeout. 
issue 0x%x send 0x%x.\n", + hr_qp->qpn, sdb_issue_ptr, + sdb_send_ptr); + return 0; + } + + msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS); + sdb_send_ptr = roce_read(hr_dev, + ROCEE_SDB_SEND_PTR_REG); + } + + if (roce_get_field(sdb_issue_ptr, + ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_M, + ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_S) == + roce_get_field(sdb_send_ptr, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, + ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S)) { + old_send = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG); + old_retry = roce_read(hr_dev, ROCEE_SDB_RETRY_CNT_REG); + + do { + tsp_st = roce_read(hr_dev, ROCEE_TSP_BP_ST_REG); + if (roce_get_bit(tsp_st, + ROCEE_TSP_BP_ST_QH_FIFO_ENTRY_S) == 1) { + *wait_stage = HNS_ROCE_V1_DB_WAIT_OK; + return 0; + } + + if (!time_before(jiffies, end)) { + dev_dbg(dev, "QP(0x%lx) db process stage1 timeout when send ptr equals issue ptr.\n" + "issue 0x%x send 0x%x.\n", + hr_qp->qpn, sdb_issue_ptr, + sdb_send_ptr); + return 0; + } + + msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS); + + hns_roce_check_sdb_status(hr_dev, &old_send, + &old_retry, &tsp_st, + &success_flags); + } while (!success_flags); + } + + *wait_stage = HNS_ROCE_V1_DB_STAGE2; + + /* Get list pointer */ + *sdb_inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG); + dev_dbg(dev, "QP(0x%lx) db process stage2. inv cnt = 0x%x.\n", + hr_qp->qpn, *sdb_inv_cnt); + } + + if (*wait_stage == HNS_ROCE_V1_DB_STAGE2) { + /* Query db's list status, until hw reversal */ + inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG); + while (roce_hw_index_cmp_lt(inv_cnt, + *sdb_inv_cnt + SDB_INV_CNT_OFFSET, + ROCEE_SDB_CNT_CMP_BITS)) { + if (!time_before(jiffies, end)) { + dev_dbg(dev, "QP(0x%lx) db process stage2 timeout. inv cnt 0x%x.\n", + hr_qp->qpn, inv_cnt); + return 0; + } + + msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS); + inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG); + } + + *wait_stage = HNS_ROCE_V1_DB_WAIT_OK; + } + + return 0; +} + +static int check_qp_reset_state(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_qp_work *qp_work_entry, + int *is_timeout) +{ + struct device *dev = &hr_dev->pdev->dev; + u32 sdb_issue_ptr; + int ret; + + if (hr_qp->state != IB_QPS_RESET) { + /* Set qp to ERR, waiting for hw complete processing all dbs */ + ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state, + IB_QPS_ERR); + if (ret) { + dev_err(dev, "Modify QP(0x%lx) to ERR failed!\n", + hr_qp->qpn); + return ret; + } + + /* Record issued doorbell */ + sdb_issue_ptr = roce_read(hr_dev, ROCEE_SDB_ISSUE_PTR_REG); + qp_work_entry->sdb_issue_ptr = sdb_issue_ptr; + qp_work_entry->db_wait_stage = HNS_ROCE_V1_DB_STAGE1; + + /* Query db process status, until hw process completely */ + ret = check_qp_db_process_status(hr_dev, hr_qp, sdb_issue_ptr, + &qp_work_entry->sdb_inv_cnt, + &qp_work_entry->db_wait_stage); + if (ret) { + dev_err(dev, "Check QP(0x%lx) db process status failed!\n", + hr_qp->qpn); + return ret; + } + + if (qp_work_entry->db_wait_stage != HNS_ROCE_V1_DB_WAIT_OK) { + qp_work_entry->sche_cnt = 0; + *is_timeout = 1; + return 0; + } + + /* Modify qp to reset before destroying qp */ + ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state, + IB_QPS_RESET); + if (ret) { + dev_err(dev, "Modify QP(0x%lx) to RST failed!\n", + hr_qp->qpn); + return ret; + } + } + + return 0; +} + +static void hns_roce_v1_destroy_qp_work_fn(struct work_struct *work) +{ + struct hns_roce_qp_work *qp_work_entry; + struct hns_roce_v1_priv *priv; + struct hns_roce_dev *hr_dev; + struct hns_roce_qp *hr_qp; + struct device *dev; + unsigned 
long qpn; + int ret; + + qp_work_entry = container_of(work, struct hns_roce_qp_work, work); + hr_dev = to_hr_dev(qp_work_entry->ib_dev); + dev = &hr_dev->pdev->dev; + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + hr_qp = qp_work_entry->qp; + qpn = hr_qp->qpn; + + dev_dbg(dev, "Schedule destroy QP(0x%lx) work.\n", qpn); + + qp_work_entry->sche_cnt++; + + /* Query db process status, until hw process completely */ + ret = check_qp_db_process_status(hr_dev, hr_qp, + qp_work_entry->sdb_issue_ptr, + &qp_work_entry->sdb_inv_cnt, + &qp_work_entry->db_wait_stage); + if (ret) { + dev_err(dev, "Check QP(0x%lx) db process status failed!\n", + qpn); + return; + } + + if (qp_work_entry->db_wait_stage != HNS_ROCE_V1_DB_WAIT_OK && + priv->des_qp.requeue_flag) { + queue_work(priv->des_qp.qp_wq, work); + return; + } + + /* Modify qp to reset before destroying qp */ + ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state, + IB_QPS_RESET); + if (ret) { + dev_err(dev, "Modify QP(0x%lx) to RST failed!\n", qpn); + return; + } + + hns_roce_qp_remove(hr_dev, hr_qp); + hns_roce_qp_free(hr_dev, hr_qp); + + if (hr_qp->ibqp.qp_type == IB_QPT_RC) { + /* RC QP, release QPN */ + hns_roce_release_range_qp(hr_dev, qpn, 1); + kfree(hr_qp); + } else + kfree(hr_to_hr_sqp(hr_qp)); + + kfree(qp_work_entry); + + dev_dbg(dev, "Accomplished destroy QP(0x%lx) work.\n", qpn); +} + +int hns_roce_v1_destroy_qp(struct ib_qp *ibqp) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_qp_work qp_work_entry; + struct hns_roce_qp_work *qp_work; + struct hns_roce_v1_priv *priv; + struct hns_roce_cq *send_cq, *recv_cq; + int is_user = !!ibqp->pd->uobject; + int is_timeout = 0; + int ret; + + ret = check_qp_reset_state(hr_dev, hr_qp, &qp_work_entry, &is_timeout); + if (ret) { + dev_err(dev, "QP reset state check failed(%d)!\n", ret); + return ret; + } + + send_cq = to_hr_cq(hr_qp->ibqp.send_cq); + recv_cq = to_hr_cq(hr_qp->ibqp.recv_cq); + + hns_roce_lock_cqs(send_cq, recv_cq); + if (!is_user) { + __hns_roce_v1_cq_clean(recv_cq, hr_qp->qpn, hr_qp->ibqp.srq ? 
+ to_hr_srq(hr_qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __hns_roce_v1_cq_clean(send_cq, hr_qp->qpn, NULL); + } + hns_roce_unlock_cqs(send_cq, recv_cq); + + if (!is_timeout) { + hns_roce_qp_remove(hr_dev, hr_qp); + hns_roce_qp_free(hr_dev, hr_qp); + + /* RC QP, release QPN */ + if (hr_qp->ibqp.qp_type == IB_QPT_RC) + hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1); + } + + hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); + + if (is_user) + ib_umem_release(hr_qp->umem); + else { + kfree(hr_qp->sq.wrid); + kfree(hr_qp->rq.wrid); + + hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); + } + + if (!is_timeout) { + if (hr_qp->ibqp.qp_type == IB_QPT_RC) + kfree(hr_qp); + else + kfree(hr_to_hr_sqp(hr_qp)); + } else { + qp_work = kzalloc(sizeof(*qp_work), GFP_KERNEL); + if (!qp_work) + return -ENOMEM; + + INIT_WORK(&qp_work->work, hns_roce_v1_destroy_qp_work_fn); + qp_work->ib_dev = &hr_dev->ib_dev; + qp_work->qp = hr_qp; + qp_work->db_wait_stage = qp_work_entry.db_wait_stage; + qp_work->sdb_issue_ptr = qp_work_entry.sdb_issue_ptr; + qp_work->sdb_inv_cnt = qp_work_entry.sdb_inv_cnt; + qp_work->sche_cnt = qp_work_entry.sche_cnt; + + priv = (struct hns_roce_v1_priv *)hr_dev->priv; + queue_work(priv->des_qp.qp_wq, &qp_work->work); + dev_dbg(dev, "Begin destroy QP(0x%lx) work.\n", hr_qp->qpn); + } + + return 0; +} + +static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device); + struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); + struct device *dev = &hr_dev->pdev->dev; + u32 cqe_cnt_ori; + u32 cqe_cnt_cur; + u32 cq_buf_size; + int wait_time = 0; + int ret = 0; + + hns_roce_free_cq(hr_dev, hr_cq); + + /* + * Before freeing cq buffer, we need to ensure that the outstanding CQE + * have been written by checking the CQE counter. 
+ */ + cqe_cnt_ori = roce_read(hr_dev, ROCEE_SCAEP_WR_CQE_CNT); + while (1) { + if (roce_read(hr_dev, ROCEE_CAEP_CQE_WCMD_EMPTY) & + HNS_ROCE_CQE_WCMD_EMPTY_BIT) + break; + + cqe_cnt_cur = roce_read(hr_dev, ROCEE_SCAEP_WR_CQE_CNT); + if ((cqe_cnt_cur - cqe_cnt_ori) >= HNS_ROCE_MIN_CQE_CNT) + break; + + msleep(HNS_ROCE_EACH_FREE_CQ_WAIT_MSECS); + if (wait_time > HNS_ROCE_MAX_FREE_CQ_WAIT_CNT) { + dev_warn(dev, "Destroy cq 0x%lx timeout!\n", + hr_cq->cqn); + ret = -ETIMEDOUT; + break; + } + wait_time++; + } + + hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); + + if (ibcq->uobject) + ib_umem_release(hr_cq->umem); + else { + /* Free the buff of stored cq */ + cq_buf_size = (ibcq->cqe + 1) * hr_dev->caps.cq_entry_sz; + hns_roce_buf_free(hr_dev, cq_buf_size, &hr_cq->hr_buf.hr_buf); + } + + kfree(hr_cq); + + return ret; +} + +static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not) +{ + roce_raw_write((eq->cons_index & HNS_ROCE_V1_CONS_IDX_M) | + (req_not << eq->log_entries), eq->doorbell); +} + +static void hns_roce_v1_wq_catas_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, int qpn) +{ + struct device *dev = &hr_dev->pdev->dev; + + dev_warn(dev, "Local Work Queue Catastrophic Error.\n"); + switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { + case HNS_ROCE_LWQCE_QPC_ERROR: + dev_warn(dev, "QP %d, QPC error.\n", qpn); + break; + case HNS_ROCE_LWQCE_MTU_ERROR: + dev_warn(dev, "QP %d, MTU error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: + dev_warn(dev, "QP %d, WQE shift error\n", qpn); + break; + case HNS_ROCE_LWQCE_SL_ERROR: + dev_warn(dev, "QP %d, SL error.\n", qpn); + break; + case HNS_ROCE_LWQCE_PORT_ERROR: + dev_warn(dev, "QP %d, port error.\n", qpn); + break; + default: + break; + } +} + +static void hns_roce_v1_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int qpn) +{ + struct device *dev = &hr_dev->pdev->dev; + + dev_warn(dev, "Local Access Violation Work Queue Error.\n"); + switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { + case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: + dev_warn(dev, "QP %d, R_key violation.\n", qpn); + break; + case HNS_ROCE_LAVWQE_LENGTH_ERROR: + dev_warn(dev, "QP %d, length error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_VA_ERROR: + dev_warn(dev, "QP %d, VA error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_PD_ERROR: + dev_err(dev, "QP %d, PD error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_RW_ACC_ERROR: + dev_warn(dev, "QP %d, rw acc error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: + dev_warn(dev, "QP %d, key state error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: + dev_warn(dev, "QP %d, MR operation error.\n", qpn); + break; + default: + break; + } +} + +static void hns_roce_v1_qp_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = &hr_dev->pdev->dev; + int phy_port; + int qpn; + + qpn = roce_get_field(aeqe->event.qp_event.qp, + HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M, + HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S); + phy_port = roce_get_field(aeqe->event.qp_event.qp, + HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M, + HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S); + if (qpn 
<= 1) + qpn = HNS_ROCE_MAX_PORTS * qpn + phy_port; + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + dev_warn(dev, "Invalid Req Local Work Queue Error.\n" + "QP %d, phy_port %d.\n", qpn, phy_port); + break; + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + hns_roce_v1_wq_catas_err_handle(hr_dev, aeqe, qpn); + break; + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v1_local_wq_access_err_handle(hr_dev, aeqe, qpn); + break; + default: + break; + } + + hns_roce_qp_event(hr_dev, qpn, event_type); +} + +static void hns_roce_v1_cq_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = &hr_dev->pdev->dev; + u32 cqn; + + cqn = le32_to_cpu(roce_get_field(aeqe->event.cq_event.cq, + HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M, + HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S)); + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + dev_warn(dev, "CQ 0x%x access err.\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + dev_warn(dev, "CQ 0x%x overflow\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID: + dev_warn(dev, "CQ 0x%x ID invalid.\n", cqn); + break; + default: + break; + } + + hns_roce_cq_event(hr_dev, cqn, event_type); +} + +static void hns_roce_v1_db_overflow_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe) +{ + struct device *dev = &hr_dev->pdev->dev; + + switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { + case HNS_ROCE_DB_SUBTYPE_SDB_OVF: + dev_warn(dev, "SDB overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_SDB_ALM_OVF: + dev_warn(dev, "SDB almost overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_SDB_ALM_EMP: + dev_warn(dev, "SDB almost empty.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_ODB_OVF: + dev_warn(dev, "ODB overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_ODB_ALM_OVF: + dev_warn(dev, "ODB almost overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_ODB_ALM_EMP: + dev_warn(dev, "SDB almost empty.\n"); + break; + default: + break; + } +} + +static struct hns_roce_aeqe *get_aeqe_v1(struct hns_roce_eq *eq, u32 entry) +{ + unsigned long off = (entry & (eq->entries - 1)) * + HNS_ROCE_AEQ_ENTRY_SIZE; + + return (struct hns_roce_aeqe *)((u8 *) + (eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) + + off % HNS_ROCE_BA_SIZE); +} + +static struct hns_roce_aeqe *next_aeqe_sw_v1(struct hns_roce_eq *eq) +{ + struct hns_roce_aeqe *aeqe = get_aeqe_v1(eq, eq->cons_index); + + return (roce_get_bit(aeqe->asyn, HNS_ROCE_AEQE_U32_4_OWNER_S) ^ + !!(eq->cons_index & eq->entries)) ? 
aeqe : NULL; +} + +static int hns_roce_v1_aeq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_aeqe *aeqe; + int aeqes_found = 0; + int event_type; + + while ((aeqe = next_aeqe_sw_v1(eq))) { + + /* Make sure we read the AEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe, + roce_get_field(aeqe->asyn, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S)); + event_type = roce_get_field(aeqe->asyn, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S); + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + dev_warn(dev, "PATH MIG not supported\n"); + break; + case HNS_ROCE_EVENT_TYPE_COMM_EST: + dev_warn(dev, "COMMUNICATION established\n"); + break; + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + dev_warn(dev, "SQ DRAINED not supported\n"); + break; + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + dev_warn(dev, "PATH MIG failed\n"); + break; + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v1_qp_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + dev_warn(dev, "SRQ not support!\n"); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID: + hns_roce_v1_cq_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_PORT_CHANGE: + dev_warn(dev, "port change.\n"); + break; + case HNS_ROCE_EVENT_TYPE_MB: + hns_roce_cmd_event(hr_dev, + le16_to_cpu(aeqe->event.cmd.token), + aeqe->event.cmd.status, + le64_to_cpu(aeqe->event.cmd.out_param + )); + break; + case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: + hns_roce_v1_db_overflow_handle(hr_dev, aeqe); + break; + case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW: + dev_warn(dev, "CEQ 0x%lx overflow.\n", + roce_get_field(aeqe->event.ce_event.ceqe, + HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M, + HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S)); + break; + default: + dev_warn(dev, "Unhandled event %d on EQ %d at idx %u.\n", + event_type, eq->eqn, eq->cons_index); + break; + } + + eq->cons_index++; + aeqes_found = 1; + + if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1) { + dev_warn(dev, "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v1(eq, 0); + + return aeqes_found; +} + +static struct hns_roce_ceqe *get_ceqe_v1(struct hns_roce_eq *eq, u32 entry) +{ + unsigned long off = (entry & (eq->entries - 1)) * + HNS_ROCE_CEQ_ENTRY_SIZE; + + return (struct hns_roce_ceqe *)((u8 *) + (eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) + + off % HNS_ROCE_BA_SIZE); +} + +static struct hns_roce_ceqe *next_ceqe_sw_v1(struct hns_roce_eq *eq) +{ + struct hns_roce_ceqe *ceqe = get_ceqe_v1(eq, eq->cons_index); + + return (!!(roce_get_bit(ceqe->comp, + HNS_ROCE_CEQE_CEQE_COMP_OWNER_S))) ^ + (!!(eq->cons_index & eq->entries)) ? 
ceqe : NULL; +} + +static int hns_roce_v1_ceq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct hns_roce_ceqe *ceqe; + int ceqes_found = 0; + u32 cqn; + + while ((ceqe = next_ceqe_sw_v1(eq))) { + + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + cqn = roce_get_field(ceqe->comp, + HNS_ROCE_CEQE_CEQE_COMP_CQN_M, + HNS_ROCE_CEQE_CEQE_COMP_CQN_S); + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ceqes_found = 1; + + if (eq->cons_index > 2 * hr_dev->caps.ceqe_depth - 1) { + dev_warn(&eq->hr_dev->pdev->dev, + "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v1(eq, 0); + + return ceqes_found; +} + +static irqreturn_t hns_roce_v1_msix_interrupt_eq(int irq, void *eq_ptr) +{ + struct hns_roce_eq *eq = eq_ptr; + struct hns_roce_dev *hr_dev = eq->hr_dev; + int int_work = 0; + + if (eq->type_flag == HNS_ROCE_CEQ) + /* CEQ irq routine, CEQ is pulse irq, not clear */ + int_work = hns_roce_v1_ceq_int(hr_dev, eq); + else + /* AEQ irq routine, AEQ is pulse irq, not clear */ + int_work = hns_roce_v1_aeq_int(hr_dev, eq); + + return IRQ_RETVAL(int_work); +} + +static irqreturn_t hns_roce_v1_msix_interrupt_abn(int irq, void *dev_id) +{ + struct hns_roce_dev *hr_dev = dev_id; + struct device *dev = &hr_dev->pdev->dev; + int int_work = 0; + u32 caepaemask_val; + u32 cealmovf_val; + u32 caepaest_val; + u32 aeshift_val; + u32 ceshift_val; + u32 cemask_val; + int i; + + /* + * Abnormal interrupt: + * AEQ overflow, ECC multi-bit err, CEQ overflow must clear + * interrupt, mask irq, clear irq, cancel mask operation + */ + aeshift_val = roce_read(hr_dev, ROCEE_CAEP_AEQC_AEQE_SHIFT_REG); + + /* AEQE overflow */ + if (roce_get_bit(aeshift_val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQ_ALM_OVF_INT_ST_S) == 1) { + dev_warn(dev, "AEQ overflow!\n"); + + /* Set mask */ + caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); + roce_set_bit(caepaemask_val, + ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_ENABLE); + roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); + + /* Clear int state(INT_WC : write 1 clear) */ + caepaest_val = roce_read(hr_dev, ROCEE_CAEP_AE_ST_REG); + roce_set_bit(caepaest_val, + ROCEE_CAEP_AE_ST_CAEP_AEQ_ALM_OVF_S, 1); + roce_write(hr_dev, ROCEE_CAEP_AE_ST_REG, caepaest_val); + + /* Clear mask */ + caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); + roce_set_bit(caepaemask_val, + ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_DISABLE); + roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); + } + + /* CEQ almost overflow */ + for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { + ceshift_val = roce_read(hr_dev, ROCEE_CAEP_CEQC_SHIFT_0_REG + + i * CEQ_REG_OFFSET); + + if (roce_get_bit(ceshift_val, + ROCEE_CAEP_CEQC_SHIFT_CAEP_CEQ_ALM_OVF_INT_ST_S) == 1) { + dev_warn(dev, "CEQ[%d] almost overflow!\n", i); + int_work++; + + /* Set mask */ + cemask_val = roce_read(hr_dev, + ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET); + roce_set_bit(cemask_val, + ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_ENABLE); + roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET, cemask_val); + + /* Clear int state(INT_WC : write 1 clear) */ + cealmovf_val = roce_read(hr_dev, + ROCEE_CAEP_CEQ_ALM_OVF_0_REG + + i * CEQ_REG_OFFSET); + roce_set_bit(cealmovf_val, + ROCEE_CAEP_CEQ_ALM_OVF_CAEP_CEQ_ALM_OVF_S, + 1); + roce_write(hr_dev, ROCEE_CAEP_CEQ_ALM_OVF_0_REG + + i * CEQ_REG_OFFSET, cealmovf_val); + 
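+ /* Re-arm the CEQ interrupt once the overflow status has been acknowledged */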
+ /* Clear mask */ + cemask_val = roce_read(hr_dev, + ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET); + roce_set_bit(cemask_val, + ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_DISABLE); + roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET, cemask_val); + } + } + + /* ECC multi-bit error alarm */ + dev_warn(dev, "ECC UCERR ALARM: 0x%x, 0x%x, 0x%x\n", + roce_read(hr_dev, ROCEE_ECC_UCERR_ALM0_REG), + roce_read(hr_dev, ROCEE_ECC_UCERR_ALM1_REG), + roce_read(hr_dev, ROCEE_ECC_UCERR_ALM2_REG)); + + dev_warn(dev, "ECC CERR ALARM: 0x%x, 0x%x, 0x%x\n", + roce_read(hr_dev, ROCEE_ECC_CERR_ALM0_REG), + roce_read(hr_dev, ROCEE_ECC_CERR_ALM1_REG), + roce_read(hr_dev, ROCEE_ECC_CERR_ALM2_REG)); + + return IRQ_RETVAL(int_work); +} + +static void hns_roce_v1_int_mask_enable(struct hns_roce_dev *hr_dev) +{ + u32 aemask_val; + int masken = 0; + int i; + + /* AEQ INT */ + aemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); + roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + masken); + roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AE_IRQ_MASK_S, masken); + roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, aemask_val); + + /* CEQ INT */ + for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { + /* IRQ mask */ + roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET, masken); + } +} + +static void hns_roce_v1_free_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + int npages = (PAGE_ALIGN(eq->eqe_size * eq->entries) + + HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE; + int i; + + if (!eq->buf_list) + return; + + for (i = 0; i < npages; ++i) + dma_free_coherent(&hr_dev->pdev->dev, HNS_ROCE_BA_SIZE, + eq->buf_list[i].buf, eq->buf_list[i].map); + + kfree(eq->buf_list); +} + +static void hns_roce_v1_enable_eq(struct hns_roce_dev *hr_dev, int eq_num, + int enable_flag) +{ + void __iomem *eqc = hr_dev->eq_table.eqc_base[eq_num]; + u32 val; + + val = readl(eqc); + + if (enable_flag) + roce_set_field(val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, + HNS_ROCE_EQ_STAT_VALID); + else + roce_set_field(val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, + HNS_ROCE_EQ_STAT_INVALID); + writel(val, eqc); +} + +static int hns_roce_v1_create_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + void __iomem *eqc = hr_dev->eq_table.eqc_base[eq->eqn]; + struct device *dev = &hr_dev->pdev->dev; + dma_addr_t tmp_dma_addr; + u32 eqconsindx_val = 0; + u32 eqcuridx_val = 0; + u32 eqshift_val = 0; + int num_bas; + int ret; + int i; + + num_bas = (PAGE_ALIGN(eq->entries * eq->eqe_size) + + HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE; + + if ((eq->entries * eq->eqe_size) > HNS_ROCE_BA_SIZE) { + dev_err(dev, "[error]eq buf %d gt ba size(%d) need bas=%d\n", + (eq->entries * eq->eqe_size), HNS_ROCE_BA_SIZE, + num_bas); + return -EINVAL; + } + + eq->buf_list = kcalloc(num_bas, sizeof(*eq->buf_list), GFP_KERNEL); + if (!eq->buf_list) + return -ENOMEM; + + for (i = 0; i < num_bas; ++i) { + eq->buf_list[i].buf = dma_alloc_coherent(dev, HNS_ROCE_BA_SIZE, + &tmp_dma_addr, + GFP_KERNEL); + if (!eq->buf_list[i].buf) { + ret = -ENOMEM; + goto err_out_free_pages; + } + + eq->buf_list[i].map = tmp_dma_addr; + memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE); + } + eq->cons_index = 0; + roce_set_field(eqshift_val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, + HNS_ROCE_EQ_STAT_INVALID); + 
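/* Encode log2(entries) and write the first EQ context word */ +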
roce_set_field(eqshift_val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S, + eq->log_entries); + writel(eqshift_val, eqc); + + /* Configure eq extended address 12~44bit */ + writel((u32)(eq->buf_list[0].map >> 12), eqc + 4); + + /* + * Configure eq extended address 45~49 bit. + * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of + * using 4K page, and shift more 32 because of + * caculating the high 32 bit value evaluated to hardware. + */ + roce_set_field(eqcuridx_val, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M, + ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S, + eq->buf_list[0].map >> 44); + roce_set_field(eqcuridx_val, + ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M, + ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S, 0); + writel(eqcuridx_val, eqc + 8); + + /* Configure eq consumer index */ + roce_set_field(eqconsindx_val, + ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M, + ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S, 0); + writel(eqconsindx_val, eqc + 0xc); + + return 0; + +err_out_free_pages: + for (i -= 1; i >= 0; i--) + dma_free_coherent(dev, HNS_ROCE_BA_SIZE, eq->buf_list[i].buf, + eq->buf_list[i].map); + + kfree(eq->buf_list); + return ret; +} + +static int hns_roce_v1_init_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_eq *eq; + int irq_num; + int eq_num; + int ret; + int i, j; + + eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; + irq_num = eq_num + hr_dev->caps.num_other_vectors; + + eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL); + if (!eq_table->eq) + return -ENOMEM; + + eq_table->eqc_base = kcalloc(eq_num, sizeof(*eq_table->eqc_base), + GFP_KERNEL); + if (!eq_table->eqc_base) { + ret = -ENOMEM; + goto err_eqc_base_alloc_fail; + } + + for (i = 0; i < eq_num; i++) { + eq = &eq_table->eq[i]; + eq->hr_dev = hr_dev; + eq->eqn = i; + eq->irq = hr_dev->irq[i]; + eq->log_page_size = PAGE_SHIFT; + + if (i < hr_dev->caps.num_comp_vectors) { + /* CEQ */ + eq_table->eqc_base[i] = hr_dev->reg_base + + ROCEE_CAEP_CEQC_SHIFT_0_REG + + CEQ_REG_OFFSET * i; + eq->type_flag = HNS_ROCE_CEQ; + eq->doorbell = hr_dev->reg_base + + ROCEE_CAEP_CEQC_CONS_IDX_0_REG + + CEQ_REG_OFFSET * i; + eq->entries = hr_dev->caps.ceqe_depth; + eq->log_entries = ilog2(eq->entries); + eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE; + } else { + /* AEQ */ + eq_table->eqc_base[i] = hr_dev->reg_base + + ROCEE_CAEP_AEQC_AEQE_SHIFT_REG; + eq->type_flag = HNS_ROCE_AEQ; + eq->doorbell = hr_dev->reg_base + + ROCEE_CAEP_AEQE_CONS_IDX_REG; + eq->entries = hr_dev->caps.aeqe_depth; + eq->log_entries = ilog2(eq->entries); + eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE; + } + } + + /* Disable irq */ + hns_roce_v1_int_mask_enable(hr_dev); + + /* Configure ce int interval */ + roce_write(hr_dev, ROCEE_CAEP_CE_INTERVAL_CFG_REG, + HNS_ROCE_CEQ_DEFAULT_INTERVAL); + + /* Configure ce int burst num */ + roce_write(hr_dev, ROCEE_CAEP_CE_BURST_NUM_CFG_REG, + HNS_ROCE_CEQ_DEFAULT_BURST_NUM); + + for (i = 0; i < eq_num; i++) { + ret = hns_roce_v1_create_eq(hr_dev, &eq_table->eq[i]); + if (ret) { + dev_err(dev, "eq create failed\n"); + goto err_create_eq_fail; + } + } + + for (j = 0; j < irq_num; j++) { + if (j < eq_num) + ret = request_irq(hr_dev->irq[j], + hns_roce_v1_msix_interrupt_eq, 0, + hr_dev->irq_names[j], + &eq_table->eq[j]); + else + ret = request_irq(hr_dev->irq[j], + hns_roce_v1_msix_interrupt_abn, 0, + hr_dev->irq_names[j], 
hr_dev); + + if (ret) { + dev_err(dev, "request irq error!\n"); + goto err_request_irq_fail; + } + } + + for (i = 0; i < eq_num; i++) + hns_roce_v1_enable_eq(hr_dev, i, EQ_ENABLE); + + return 0; + +err_request_irq_fail: + for (j -= 1; j >= 0; j--) + free_irq(hr_dev->irq[j], &eq_table->eq[j]); + +err_create_eq_fail: + for (i -= 1; i >= 0; i--) + hns_roce_v1_free_eq(hr_dev, &eq_table->eq[i]); + + kfree(eq_table->eqc_base); + +err_eqc_base_alloc_fail: + kfree(eq_table->eq); + + return ret; +} + +static void hns_roce_v1_cleanup_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + int irq_num; + int eq_num; + int i; + + eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; + irq_num = eq_num + hr_dev->caps.num_other_vectors; + for (i = 0; i < eq_num; i++) { + /* Disable EQ */ + hns_roce_v1_enable_eq(hr_dev, i, EQ_DISABLE); + + free_irq(hr_dev->irq[i], &eq_table->eq[i]); + + hns_roce_v1_free_eq(hr_dev, &eq_table->eq[i]); + } + for (i = eq_num; i < irq_num; i++) + free_irq(hr_dev->irq[i], hr_dev); + + kfree(eq_table->eqc_base); + kfree(eq_table->eq); +} + +static const struct hns_roce_hw hns_roce_hw_v1 = { + .reset = hns_roce_v1_reset, + .hw_profile = hns_roce_v1_profile, + .hw_init = hns_roce_v1_init, + .hw_exit = hns_roce_v1_exit, + .post_mbox = hns_roce_v1_post_mbox, + .chk_mbox = hns_roce_v1_chk_mbox, + .set_gid = hns_roce_v1_set_gid, + .set_mac = hns_roce_v1_set_mac, + .set_mtu = hns_roce_v1_set_mtu, + .write_mtpt = hns_roce_v1_write_mtpt, + .write_cqc = hns_roce_v1_write_cqc, + .modify_cq = hns_roce_v1_modify_cq, + .clear_hem = hns_roce_v1_clear_hem, + .modify_qp = hns_roce_v1_modify_qp, + .query_qp = hns_roce_v1_query_qp, + .destroy_qp = hns_roce_v1_destroy_qp, + .post_send = hns_roce_v1_post_send, + .post_recv = hns_roce_v1_post_recv, + .req_notify_cq = hns_roce_v1_req_notify_cq, + .poll_cq = hns_roce_v1_poll_cq, + .dereg_mr = hns_roce_v1_dereg_mr, + .destroy_cq = hns_roce_v1_destroy_cq, + .init_eq = hns_roce_v1_init_eq_table, + .cleanup_eq = hns_roce_v1_cleanup_eq_table, +}; + +static const struct of_device_id hns_roce_of_match[] = { + { .compatible = "hisilicon,hns-roce-v1", .data = &hns_roce_hw_v1, }, + {}, +}; +MODULE_DEVICE_TABLE(of, hns_roce_of_match); + +static const struct acpi_device_id hns_roce_acpi_match[] = { + { "HISI00D1", (kernel_ulong_t)&hns_roce_hw_v1 }, + {}, +}; +MODULE_DEVICE_TABLE(acpi, hns_roce_acpi_match); + +static int hns_roce_node_match(struct device *dev, void *fwnode) +{ + return dev->fwnode == fwnode; +} + +static struct +platform_device *hns_roce_find_pdev(struct fwnode_handle *fwnode) +{ + struct device *dev; + + /* get the 'device' corresponding to the matching 'fwnode' */ + dev = bus_find_device(&platform_bus_type, NULL, + fwnode, hns_roce_node_match); + /* get the platform device */ + return dev ? 
to_platform_device(dev) : NULL; +} + +static int hns_roce_get_cfg(struct hns_roce_dev *hr_dev) +{ + struct device *dev = &hr_dev->pdev->dev; + struct platform_device *pdev = NULL; + struct net_device *netdev = NULL; + struct device_node *net_node; + struct resource *res; + int port_cnt = 0; + u8 phy_port; + int ret; + int i; + + /* check if we are compatible with the underlying SoC */ + if (dev_of_node(dev)) { + const struct of_device_id *of_id; + + of_id = of_match_node(hns_roce_of_match, dev->of_node); + if (!of_id) { + dev_err(dev, "device is not compatible!\n"); + return -ENXIO; + } + hr_dev->hw = (const struct hns_roce_hw *)of_id->data; + if (!hr_dev->hw) { + dev_err(dev, "couldn't get H/W specific DT data!\n"); + return -ENXIO; + } + } else if (is_acpi_device_node(dev->fwnode)) { + const struct acpi_device_id *acpi_id; + + acpi_id = acpi_match_device(hns_roce_acpi_match, dev); + if (!acpi_id) { + dev_err(dev, "device is not compatible!\n"); + return -ENXIO; + } + hr_dev->hw = (const struct hns_roce_hw *) acpi_id->driver_data; + if (!hr_dev->hw) { + dev_err(dev, "couldn't get H/W specific ACPI data!\n"); + return -ENXIO; + } + } else { + dev_err(dev, "can't read compatibility data from DT or ACPI\n"); + return -ENXIO; + } + + /* get the mapped register base address */ + res = platform_get_resource(hr_dev->pdev, IORESOURCE_MEM, 0); + hr_dev->reg_base = devm_ioremap_resource(dev, res); + if (IS_ERR(hr_dev->reg_base)) + return PTR_ERR(hr_dev->reg_base); + + /* read the node_guid of IB device from the DT or ACPI */ + ret = device_property_read_u8_array(dev, "node-guid", + (u8 *)&hr_dev->ib_dev.node_guid, + GUID_LEN); + if (ret) { + dev_err(dev, "couldn't get node_guid from DT or ACPI!\n"); + return ret; + } + + /* get the RoCE associated ethernet ports or netdevices */ + for (i = 0; i < HNS_ROCE_MAX_PORTS; i++) { + if (dev_of_node(dev)) { + net_node = of_parse_phandle(dev->of_node, "eth-handle", + i); + if (!net_node) + continue; + pdev = of_find_device_by_node(net_node); + } else if (is_acpi_device_node(dev->fwnode)) { + struct acpi_reference_args args; + struct fwnode_handle *fwnode; + + ret = acpi_node_get_property_reference(dev->fwnode, + "eth-handle", + i, &args); + if (ret) + continue; + fwnode = acpi_fwnode_handle(args.adev); + pdev = hns_roce_find_pdev(fwnode); + } else { + dev_err(dev, "cannot read data from DT or ACPI\n"); + return -ENXIO; + } + + if (pdev) { + netdev = platform_get_drvdata(pdev); + phy_port = (u8)i; + if (netdev) { + hr_dev->iboe.netdevs[port_cnt] = netdev; + hr_dev->iboe.phy_port[port_cnt] = phy_port; + } else { + dev_err(dev, "no netdev found with pdev %s\n", + pdev->name); + return -ENODEV; + } + port_cnt++; + } + } + + if (port_cnt == 0) { + dev_err(dev, "unable to get eth-handle for available ports!\n"); + return -EINVAL; + } + + hr_dev->caps.num_ports = port_cnt; + + /* cmd issue mode: 0 is poll, 1 is event */ + hr_dev->cmd_mod = 1; + hr_dev->loop_idc = 0; + hr_dev->sdb_offset = ROCEE_DB_SQ_L_0_REG; + hr_dev->odb_offset = ROCEE_DB_OTHERS_L_0_REG; + + /* read the interrupt names from the DT or ACPI */ + ret = device_property_read_string_array(dev, "interrupt-names", + hr_dev->irq_names, + HNS_ROCE_V1_MAX_IRQ_NUM); + if (ret < 0) { + dev_err(dev, "couldn't get interrupt names from DT or ACPI!\n"); + return ret; + } + + /* fetch the interrupt numbers */ + for (i = 0; i < HNS_ROCE_V1_MAX_IRQ_NUM; i++) { + hr_dev->irq[i] = platform_get_irq(hr_dev->pdev, i); + if (hr_dev->irq[i] <= 0) { + dev_err(dev, "platform get of irq[=%d] failed!\n", i); + return -EINVAL; 
+ } + } + + return 0; +} + +/** + * hns_roce_probe - RoCE driver entrance + * @pdev: pointer to platform device + * Return : int + * + */ +static int hns_roce_probe(struct platform_device *pdev) +{ + int ret; + struct hns_roce_dev *hr_dev; + struct device *dev = &pdev->dev; + + hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev)); + if (!hr_dev) + return -ENOMEM; + + hr_dev->priv = kzalloc(sizeof(struct hns_roce_v1_priv), GFP_KERNEL); + if (!hr_dev->priv) { + ret = -ENOMEM; + goto error_failed_kzalloc; + } + + hr_dev->pdev = pdev; + hr_dev->dev = dev; + platform_set_drvdata(pdev, hr_dev); + + if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64ULL)) && + dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32ULL))) { + dev_err(dev, "Not usable DMA addressing mode\n"); + ret = -EIO; + goto error_failed_get_cfg; + } + + ret = hns_roce_get_cfg(hr_dev); + if (ret) { + dev_err(dev, "Get Configuration failed!\n"); + goto error_failed_get_cfg; + } + + ret = hns_roce_init(hr_dev); + if (ret) { + dev_err(dev, "RoCE engine init failed!\n"); + goto error_failed_get_cfg; + } + + return 0; + +error_failed_get_cfg: + kfree(hr_dev->priv); + +error_failed_kzalloc: + ib_dealloc_device(&hr_dev->ib_dev); + + return ret; +} + +/** + * hns_roce_remove - remove RoCE device + * @pdev: pointer to platform device + */ +static int hns_roce_remove(struct platform_device *pdev) +{ + struct hns_roce_dev *hr_dev = platform_get_drvdata(pdev); + + hns_roce_exit(hr_dev); + kfree(hr_dev->priv); + ib_dealloc_device(&hr_dev->ib_dev); + + return 0; +} + +static struct platform_driver hns_roce_driver = { + .probe = hns_roce_probe, + .remove = hns_roce_remove, + .driver = { + .name = DRV_NAME, + .of_match_table = hns_roce_of_match, + .acpi_match_table = ACPI_PTR(hns_roce_acpi_match), + }, +}; + +module_platform_driver(hns_roce_driver); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Wei Hu "); +MODULE_AUTHOR("Nenglong Zhao "); +MODULE_AUTHOR("Lijun Ou "); +MODULE_DESCRIPTION("Hisilicon Hip06 Family RoCE Driver"); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h new file mode 100644 index 0000000..e9a2717 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h @@ -0,0 +1,1111 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _HNS_ROCE_HW_V1_H +#define _HNS_ROCE_HW_V1_H + +#define CQ_STATE_VALID 2 + +#define HNS_ROCE_V1_MAX_PD_NUM 0x8000 +#define HNS_ROCE_V1_MAX_CQ_NUM 0x10000 +#define HNS_ROCE_V1_MAX_CQE_NUM 0x8000 + +#define HNS_ROCE_V1_MAX_QP_NUM 0x40000 +#define HNS_ROCE_V1_MAX_WQE_NUM 0x4000 + +#define HNS_ROCE_V1_MAX_MTPT_NUM 0x80000 + +#define HNS_ROCE_V1_MAX_MTT_SEGS 0x100000 + +#define HNS_ROCE_V1_MAX_QP_INIT_RDMA 128 +#define HNS_ROCE_V1_MAX_QP_DEST_RDMA 128 + +#define HNS_ROCE_V1_MAX_SQ_DESC_SZ 64 +#define HNS_ROCE_V1_MAX_RQ_DESC_SZ 64 +#define HNS_ROCE_V1_SG_NUM 2 +#define HNS_ROCE_V1_INLINE_SIZE 32 + +#define HNS_ROCE_V1_UAR_NUM 256 +#define HNS_ROCE_V1_PHY_UAR_NUM 8 + +#define HNS_ROCE_V1_GID_NUM 16 +#define HNS_ROCE_V1_RESV_QP 8 + +#define HNS_ROCE_V1_MAX_IRQ_NUM 34 +#define HNS_ROCE_V1_COMP_VEC_NUM 32 +#define HNS_ROCE_V1_AEQE_VEC_NUM 1 +#define HNS_ROCE_V1_ABNORMAL_VEC_NUM 1 + +#define HNS_ROCE_V1_COMP_EQE_NUM 0x8000 +#define HNS_ROCE_V1_ASYNC_EQE_NUM 0x400 + +#define HNS_ROCE_V1_QPC_ENTRY_SIZE 256 +#define HNS_ROCE_V1_IRRL_ENTRY_SIZE 8 +#define HNS_ROCE_V1_CQC_ENTRY_SIZE 64 +#define HNS_ROCE_V1_MTPT_ENTRY_SIZE 64 +#define HNS_ROCE_V1_MTT_ENTRY_SIZE 64 + +#define HNS_ROCE_V1_CQE_ENTRY_SIZE 32 +#define HNS_ROCE_V1_PAGE_SIZE_SUPPORT 0xFFFFF000 + +#define HNS_ROCE_V1_TABLE_CHUNK_SIZE (1 << 17) + +#define HNS_ROCE_V1_EXT_RAQ_WF 8 +#define HNS_ROCE_V1_RAQ_ENTRY 64 +#define HNS_ROCE_V1_RAQ_DEPTH 32768 +#define HNS_ROCE_V1_RAQ_SIZE (HNS_ROCE_V1_RAQ_ENTRY * HNS_ROCE_V1_RAQ_DEPTH) + +#define HNS_ROCE_V1_SDB_DEPTH 0x400 +#define HNS_ROCE_V1_ODB_DEPTH 0x400 + +#define HNS_ROCE_V1_DB_RSVD 0x80 + +#define HNS_ROCE_V1_SDB_ALEPT HNS_ROCE_V1_DB_RSVD +#define HNS_ROCE_V1_SDB_ALFUL (HNS_ROCE_V1_SDB_DEPTH - HNS_ROCE_V1_DB_RSVD) +#define HNS_ROCE_V1_ODB_ALEPT HNS_ROCE_V1_DB_RSVD +#define HNS_ROCE_V1_ODB_ALFUL (HNS_ROCE_V1_ODB_DEPTH - HNS_ROCE_V1_DB_RSVD) + +#define HNS_ROCE_V1_EXT_SDB_DEPTH 0x4000 +#define HNS_ROCE_V1_EXT_ODB_DEPTH 0x4000 +#define HNS_ROCE_V1_EXT_SDB_ENTRY 16 +#define HNS_ROCE_V1_EXT_ODB_ENTRY 16 +#define HNS_ROCE_V1_EXT_SDB_SIZE \ + (HNS_ROCE_V1_EXT_SDB_DEPTH * HNS_ROCE_V1_EXT_SDB_ENTRY) +#define HNS_ROCE_V1_EXT_ODB_SIZE \ + (HNS_ROCE_V1_EXT_ODB_DEPTH * HNS_ROCE_V1_EXT_ODB_ENTRY) + +#define HNS_ROCE_V1_EXT_SDB_ALEPT HNS_ROCE_V1_DB_RSVD +#define HNS_ROCE_V1_EXT_SDB_ALFUL \ + (HNS_ROCE_V1_EXT_SDB_DEPTH - HNS_ROCE_V1_DB_RSVD) +#define HNS_ROCE_V1_EXT_ODB_ALEPT HNS_ROCE_V1_DB_RSVD +#define HNS_ROCE_V1_EXT_ODB_ALFUL \ + (HNS_ROCE_V1_EXT_ODB_DEPTH - HNS_ROCE_V1_DB_RSVD) + +#define HNS_ROCE_V1_DB_WAIT_OK 0 +#define HNS_ROCE_V1_DB_STAGE1 1 +#define HNS_ROCE_V1_DB_STAGE2 2 +#define HNS_ROCE_V1_CHECK_DB_TIMEOUT_MSECS 10000 +#define HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS 20 +#define HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS 50000 +#define HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS 10000 +#define HNS_ROCE_V1_FREE_MR_WAIT_VALUE 5 +#define HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE 20 + +#define HNS_ROCE_BT_RSV_BUF_SIZE (1 << 17) + +#define HNS_ROCE_V1_TPTR_ENTRY_SIZE 2 +#define HNS_ROCE_V1_TPTR_BUF_SIZE \ + (HNS_ROCE_V1_TPTR_ENTRY_SIZE * HNS_ROCE_V1_MAX_CQ_NUM) + +#define HNS_ROCE_ODB_POLL_MODE 0 + +#define HNS_ROCE_SDB_NORMAL_MODE 0 +#define HNS_ROCE_SDB_EXTEND_MODE 1 + +#define HNS_ROCE_ODB_EXTEND_MODE 1 + +#define KEY_VALID 0x02 + +#define 
HNS_ROCE_CQE_QPN_MASK 0x3ffff +#define HNS_ROCE_CQE_STATUS_MASK 0x1f +#define HNS_ROCE_CQE_OPCODE_MASK 0xf + +#define HNS_ROCE_CQE_SUCCESS 0x00 +#define HNS_ROCE_CQE_SYNDROME_LOCAL_LENGTH_ERR 0x01 +#define HNS_ROCE_CQE_SYNDROME_LOCAL_QP_OP_ERR 0x02 +#define HNS_ROCE_CQE_SYNDROME_LOCAL_PROT_ERR 0x03 +#define HNS_ROCE_CQE_SYNDROME_WR_FLUSH_ERR 0x04 +#define HNS_ROCE_CQE_SYNDROME_MEM_MANAGE_OPERATE_ERR 0x05 +#define HNS_ROCE_CQE_SYNDROME_BAD_RESP_ERR 0x06 +#define HNS_ROCE_CQE_SYNDROME_LOCAL_ACCESS_ERR 0x07 +#define HNS_ROCE_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR 0x08 +#define HNS_ROCE_CQE_SYNDROME_REMOTE_ACCESS_ERR 0x09 +#define HNS_ROCE_CQE_SYNDROME_REMOTE_OP_ERR 0x0a +#define HNS_ROCE_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR 0x0b +#define HNS_ROCE_CQE_SYNDROME_RNR_RETRY_EXC_ERR 0x0c + +#define QP1C_CFGN_OFFSET 0x28 +#define PHY_PORT_OFFSET 0x8 +#define MTPT_IDX_SHIFT 16 +#define ALL_PORT_VAL_OPEN 0x3f +#define POL_TIME_INTERVAL_VAL 0x80 +#define SLEEP_TIME_INTERVAL 20 +#define SQ_PSN_SHIFT 8 +#define QKEY_VAL 0x80010000 +#define SDB_INV_CNT_OFFSET 8 +#define SDB_ST_CMP_VAL 8 + +#define HNS_ROCE_CEQ_DEFAULT_INTERVAL 0x10 +#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM 0x10 + +#define HNS_ROCE_INT_MASK_DISABLE 0 +#define HNS_ROCE_INT_MASK_ENABLE 1 + +#define CEQ_REG_OFFSET 0x18 + +#define HNS_ROCE_CEQE_CEQE_COMP_OWNER_S 0 + +#define HNS_ROCE_V1_CONS_IDX_M GENMASK(15, 0) + +#define HNS_ROCE_CEQE_CEQE_COMP_CQN_S 16 +#define HNS_ROCE_CEQE_CEQE_COMP_CQN_M GENMASK(31, 16) + +#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S 16 +#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M GENMASK(23, 16) + +#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S 24 +#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M GENMASK(30, 24) + +#define HNS_ROCE_AEQE_U32_4_OWNER_S 31 + +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S 0 +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M GENMASK(23, 0) + +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S 25 +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M GENMASK(27, 25) + +#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S 0 +#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M GENMASK(15, 0) + +#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S 0 +#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M GENMASK(4, 0) + +struct hns_roce_cq_context { + __le32 cqc_byte_4; + __le32 cq_bt_l; + __le32 cqc_byte_12; + __le32 cur_cqe_ba0_l; + __le32 cqc_byte_20; + __le32 cqe_tptr_addr_l; + __le32 cur_cqe_ba1_l; + __le32 cqc_byte_32; +}; + +#define CQ_CONTEXT_CQC_BYTE_4_CQC_STATE_S 0 +#define CQ_CONTEXT_CQC_BYTE_4_CQC_STATE_M \ + (((1UL << 2) - 1) << CQ_CONTEXT_CQC_BYTE_4_CQC_STATE_S) + +#define CQ_CONTEXT_CQC_BYTE_4_CQN_S 16 +#define CQ_CONTEXT_CQC_BYTE_4_CQN_M \ + (((1UL << 16) - 1) << CQ_CONTEXT_CQC_BYTE_4_CQN_S) + +#define CQ_CONTEXT_CQC_BYTE_12_CQ_BT_H_S 0 +#define CQ_CONTEXT_CQC_BYTE_12_CQ_BT_H_M \ + (((1UL << 17) - 1) << CQ_CONTEXT_CQC_BYTE_12_CQ_BT_H_S) + +#define CQ_CONTEXT_CQC_BYTE_12_CQ_CQE_SHIFT_S 20 +#define CQ_CONTEXT_CQC_BYTE_12_CQ_CQE_SHIFT_M \ + (((1UL << 4) - 1) << CQ_CONTEXT_CQC_BYTE_12_CQ_CQE_SHIFT_S) + +#define CQ_CONTEXT_CQC_BYTE_12_CEQN_S 24 +#define CQ_CONTEXT_CQC_BYTE_12_CEQN_M \ + (((1UL << 5) - 1) << CQ_CONTEXT_CQC_BYTE_12_CEQN_S) + +#define CQ_CONTEXT_CQC_BYTE_20_CUR_CQE_BA0_H_S 0 +#define CQ_CONTEXT_CQC_BYTE_20_CUR_CQE_BA0_H_M \ + (((1UL << 5) - 1) << CQ_CONTEXT_CQC_BYTE_20_CUR_CQE_BA0_H_S) + +#define CQ_CONTEXT_CQC_BYTE_20_CQ_CUR_INDEX_S 16 +#define CQ_CONTEXT_CQC_BYTE_20_CQ_CUR_INDEX_M \ + (((1UL << 16) - 1) << CQ_CONTEXT_CQC_BYTE_20_CQ_CUR_INDEX_S) + +#define CQ_CONTEXT_CQC_BYTE_20_CQE_TPTR_ADDR_H_S 8 +#define 
CQ_CONTEXT_CQC_BYTE_20_CQE_TPTR_ADDR_H_M \ + (((1UL << 5) - 1) << CQ_CONTEXT_CQC_BYTE_20_CQE_TPTR_ADDR_H_S) + +#define CQ_CONTEXT_CQC_BYTE_32_CUR_CQE_BA1_H_S 0 +#define CQ_CONTEXT_CQC_BYTE_32_CUR_CQE_BA1_H_M \ + (((1UL << 5) - 1) << CQ_CONTEXT_CQC_BYTE_32_CUR_CQE_BA1_H_S) + +#define CQ_CONTEXT_CQC_BYTE_32_SE_FLAG_S 9 + +#define CQ_CONTEXT_CQC_BYTE_32_CE_FLAG_S 8 +#define CQ_CONTEXT_CQC_BYTE_32_NOTIFICATION_FLAG_S 14 +#define CQ_CQNTEXT_CQC_BYTE_32_TYPE_OF_COMPLETION_NOTIFICATION_S 15 + +#define CQ_CONTEXT_CQC_BYTE_32_CQ_CONS_IDX_S 16 +#define CQ_CONTEXT_CQC_BYTE_32_CQ_CONS_IDX_M \ + (((1UL << 16) - 1) << CQ_CONTEXT_CQC_BYTE_32_CQ_CONS_IDX_S) + +struct hns_roce_cqe { + __le32 cqe_byte_4; + union { + __le32 r_key; + __be32 immediate_data; + }; + __le32 byte_cnt; + __le32 cqe_byte_16; + __le32 cqe_byte_20; + __le32 s_mac_l; + __le32 cqe_byte_28; + __le32 reserved; +}; + +#define CQE_BYTE_4_OWNER_S 7 +#define CQE_BYTE_4_SQ_RQ_FLAG_S 14 + +#define CQE_BYTE_4_STATUS_OF_THE_OPERATION_S 8 +#define CQE_BYTE_4_STATUS_OF_THE_OPERATION_M \ + (((1UL << 5) - 1) << CQE_BYTE_4_STATUS_OF_THE_OPERATION_S) + +#define CQE_BYTE_4_WQE_INDEX_S 16 +#define CQE_BYTE_4_WQE_INDEX_M (((1UL << 14) - 1) << CQE_BYTE_4_WQE_INDEX_S) + +#define CQE_BYTE_4_OPERATION_TYPE_S 0 +#define CQE_BYTE_4_OPERATION_TYPE_M \ + (((1UL << 4) - 1) << CQE_BYTE_4_OPERATION_TYPE_S) + +#define CQE_BYTE_4_IMM_INDICATOR_S 15 + +#define CQE_BYTE_16_LOCAL_QPN_S 0 +#define CQE_BYTE_16_LOCAL_QPN_M (((1UL << 24) - 1) << CQE_BYTE_16_LOCAL_QPN_S) + +#define CQE_BYTE_20_PORT_NUM_S 26 +#define CQE_BYTE_20_PORT_NUM_M (((1UL << 3) - 1) << CQE_BYTE_20_PORT_NUM_S) + +#define CQE_BYTE_20_SL_S 24 +#define CQE_BYTE_20_SL_M (((1UL << 2) - 1) << CQE_BYTE_20_SL_S) + +#define CQE_BYTE_20_REMOTE_QPN_S 0 +#define CQE_BYTE_20_REMOTE_QPN_M \ + (((1UL << 24) - 1) << CQE_BYTE_20_REMOTE_QPN_S) + +#define CQE_BYTE_20_GRH_PRESENT_S 29 + +#define CQE_BYTE_28_P_KEY_IDX_S 16 +#define CQE_BYTE_28_P_KEY_IDX_M (((1UL << 16) - 1) << CQE_BYTE_28_P_KEY_IDX_S) + +#define CQ_DB_REQ_NOT_SOL 0 +#define CQ_DB_REQ_NOT (1 << 16) + +struct hns_roce_v1_mpt_entry { + __le32 mpt_byte_4; + __le32 pbl_addr_l; + __le32 mpt_byte_12; + __le32 virt_addr_l; + __le32 virt_addr_h; + __le32 length; + __le32 mpt_byte_28; + __le32 pa0_l; + __le32 mpt_byte_36; + __le32 mpt_byte_40; + __le32 mpt_byte_44; + __le32 mpt_byte_48; + __le32 pa4_l; + __le32 mpt_byte_56; + __le32 mpt_byte_60; + __le32 mpt_byte_64; +}; + +#define MPT_BYTE_4_KEY_STATE_S 0 +#define MPT_BYTE_4_KEY_STATE_M (((1UL << 2) - 1) << MPT_BYTE_4_KEY_STATE_S) + +#define MPT_BYTE_4_KEY_S 8 +#define MPT_BYTE_4_KEY_M (((1UL << 8) - 1) << MPT_BYTE_4_KEY_S) + +#define MPT_BYTE_4_PAGE_SIZE_S 16 +#define MPT_BYTE_4_PAGE_SIZE_M (((1UL << 2) - 1) << MPT_BYTE_4_PAGE_SIZE_S) + +#define MPT_BYTE_4_MW_TYPE_S 20 + +#define MPT_BYTE_4_MW_BIND_ENABLE_S 21 + +#define MPT_BYTE_4_OWN_S 22 + +#define MPT_BYTE_4_MEMORY_LOCATION_TYPE_S 24 +#define MPT_BYTE_4_MEMORY_LOCATION_TYPE_M \ + (((1UL << 2) - 1) << MPT_BYTE_4_MEMORY_LOCATION_TYPE_S) + +#define MPT_BYTE_4_REMOTE_ATOMIC_S 26 +#define MPT_BYTE_4_LOCAL_WRITE_S 27 +#define MPT_BYTE_4_REMOTE_WRITE_S 28 +#define MPT_BYTE_4_REMOTE_READ_S 29 +#define MPT_BYTE_4_REMOTE_INVAL_ENABLE_S 30 +#define MPT_BYTE_4_ADDRESS_TYPE_S 31 + +#define MPT_BYTE_12_PBL_ADDR_H_S 0 +#define MPT_BYTE_12_PBL_ADDR_H_M \ + (((1UL << 17) - 1) << MPT_BYTE_12_PBL_ADDR_H_S) + +#define MPT_BYTE_12_MW_BIND_COUNTER_S 17 +#define MPT_BYTE_12_MW_BIND_COUNTER_M \ + (((1UL << 15) - 1) << MPT_BYTE_12_MW_BIND_COUNTER_S) + +#define MPT_BYTE_28_PD_S 0 
+#define MPT_BYTE_28_PD_M (((1UL << 16) - 1) << MPT_BYTE_28_PD_S) + +#define MPT_BYTE_28_L_KEY_IDX_L_S 16 +#define MPT_BYTE_28_L_KEY_IDX_L_M \ + (((1UL << 16) - 1) << MPT_BYTE_28_L_KEY_IDX_L_S) + +#define MPT_BYTE_36_PA0_H_S 0 +#define MPT_BYTE_36_PA0_H_M (((1UL << 5) - 1) << MPT_BYTE_36_PA0_H_S) + +#define MPT_BYTE_36_PA1_L_S 8 +#define MPT_BYTE_36_PA1_L_M (((1UL << 24) - 1) << MPT_BYTE_36_PA1_L_S) + +#define MPT_BYTE_40_PA1_H_S 0 +#define MPT_BYTE_40_PA1_H_M (((1UL << 13) - 1) << MPT_BYTE_40_PA1_H_S) + +#define MPT_BYTE_40_PA2_L_S 16 +#define MPT_BYTE_40_PA2_L_M (((1UL << 16) - 1) << MPT_BYTE_40_PA2_L_S) + +#define MPT_BYTE_44_PA2_H_S 0 +#define MPT_BYTE_44_PA2_H_M (((1UL << 21) - 1) << MPT_BYTE_44_PA2_H_S) + +#define MPT_BYTE_44_PA3_L_S 24 +#define MPT_BYTE_44_PA3_L_M (((1UL << 8) - 1) << MPT_BYTE_44_PA3_L_S) + +#define MPT_BYTE_48_PA3_H_S 0 +#define MPT_BYTE_48_PA3_H_M (((1UL << 29) - 1) << MPT_BYTE_48_PA3_H_S) + +#define MPT_BYTE_56_PA4_H_S 0 +#define MPT_BYTE_56_PA4_H_M (((1UL << 5) - 1) << MPT_BYTE_56_PA4_H_S) + +#define MPT_BYTE_56_PA5_L_S 8 +#define MPT_BYTE_56_PA5_L_M (((1UL << 24) - 1) << MPT_BYTE_56_PA5_L_S) + +#define MPT_BYTE_60_PA5_H_S 0 +#define MPT_BYTE_60_PA5_H_M (((1UL << 13) - 1) << MPT_BYTE_60_PA5_H_S) + +#define MPT_BYTE_60_PA6_L_S 16 +#define MPT_BYTE_60_PA6_L_M (((1UL << 16) - 1) << MPT_BYTE_60_PA6_L_S) + +#define MPT_BYTE_64_PA6_H_S 0 +#define MPT_BYTE_64_PA6_H_M (((1UL << 21) - 1) << MPT_BYTE_64_PA6_H_S) + +#define MPT_BYTE_64_L_KEY_IDX_H_S 24 +#define MPT_BYTE_64_L_KEY_IDX_H_M \ + (((1UL << 8) - 1) << MPT_BYTE_64_L_KEY_IDX_H_S) + +struct hns_roce_wqe_ctrl_seg { + __le32 sgl_pa_h; + __le32 flag; + union { + __be32 imm_data; + __le32 inv_key; + }; + __le32 msg_length; +}; + +struct hns_roce_wqe_data_seg { + __le64 addr; + __le32 lkey; + __le32 len; +}; + +struct hns_roce_wqe_raddr_seg { + __le32 rkey; + __le32 len;/* reserved */ + __le64 raddr; +}; + +struct hns_roce_rq_wqe_ctrl { + __le32 rwqe_byte_4; + __le32 rocee_sgl_ba_l; + __le32 rwqe_byte_12; + __le32 reserved[5]; +}; + +#define RQ_WQE_CTRL_RWQE_BYTE_12_RWQE_SGE_NUM_S 16 +#define RQ_WQE_CTRL_RWQE_BYTE_12_RWQE_SGE_NUM_M \ + (((1UL << 6) - 1) << RQ_WQE_CTRL_RWQE_BYTE_12_RWQE_SGE_NUM_S) + +#define HNS_ROCE_QP_DESTROY_TIMEOUT_MSECS 10000 + +#define GID_LEN 16 + +struct hns_roce_ud_send_wqe { + __le32 dmac_h; + __le32 u32_8; + __le32 immediate_data; + + __le32 u32_16; + union { + unsigned char dgid[GID_LEN]; + struct { + __le32 u32_20; + __le32 u32_24; + __le32 u32_28; + __le32 u32_32; + }; + }; + + __le32 u32_36; + __le32 u32_40; + + __le32 va0_l; + __le32 va0_h; + __le32 l_key0; + + __le32 va1_l; + __le32 va1_h; + __le32 l_key1; +}; + +#define UD_SEND_WQE_U32_4_DMAC_0_S 0 +#define UD_SEND_WQE_U32_4_DMAC_0_M \ + (((1UL << 8) - 1) << UD_SEND_WQE_U32_4_DMAC_0_S) + +#define UD_SEND_WQE_U32_4_DMAC_1_S 8 +#define UD_SEND_WQE_U32_4_DMAC_1_M \ + (((1UL << 8) - 1) << UD_SEND_WQE_U32_4_DMAC_1_S) + +#define UD_SEND_WQE_U32_4_DMAC_2_S 16 +#define UD_SEND_WQE_U32_4_DMAC_2_M \ + (((1UL << 8) - 1) << UD_SEND_WQE_U32_4_DMAC_2_S) + +#define UD_SEND_WQE_U32_4_DMAC_3_S 24 +#define UD_SEND_WQE_U32_4_DMAC_3_M \ + (((1UL << 8) - 1) << UD_SEND_WQE_U32_4_DMAC_3_S) + +#define UD_SEND_WQE_U32_8_DMAC_4_S 0 +#define UD_SEND_WQE_U32_8_DMAC_4_M \ + (((1UL << 8) - 1) << UD_SEND_WQE_U32_8_DMAC_4_S) + +#define UD_SEND_WQE_U32_8_DMAC_5_S 8 +#define UD_SEND_WQE_U32_8_DMAC_5_M \ + (((1UL << 8) - 1) << UD_SEND_WQE_U32_8_DMAC_5_S) + +#define UD_SEND_WQE_U32_8_LOOPBACK_INDICATOR_S 22 + +#define UD_SEND_WQE_U32_8_OPERATION_TYPE_S 16 +#define 
UD_SEND_WQE_U32_8_OPERATION_TYPE_M \ + (((1UL << 4) - 1) << UD_SEND_WQE_U32_8_OPERATION_TYPE_S) + +#define UD_SEND_WQE_U32_8_NUMBER_OF_DATA_SEG_S 24 +#define UD_SEND_WQE_U32_8_NUMBER_OF_DATA_SEG_M \ + (((1UL << 6) - 1) << UD_SEND_WQE_U32_8_NUMBER_OF_DATA_SEG_S) + +#define UD_SEND_WQE_U32_8_SEND_GL_ROUTING_HDR_FLAG_S 31 + +#define UD_SEND_WQE_U32_16_DEST_QP_S 0 +#define UD_SEND_WQE_U32_16_DEST_QP_M \ + (((1UL << 24) - 1) << UD_SEND_WQE_U32_16_DEST_QP_S) + +#define UD_SEND_WQE_U32_16_MAX_STATIC_RATE_S 24 +#define UD_SEND_WQE_U32_16_MAX_STATIC_RATE_M \ + (((1UL << 8) - 1) << UD_SEND_WQE_U32_16_MAX_STATIC_RATE_S) + +#define UD_SEND_WQE_U32_36_FLOW_LABEL_S 0 +#define UD_SEND_WQE_U32_36_FLOW_LABEL_M \ + (((1UL << 20) - 1) << UD_SEND_WQE_U32_36_FLOW_LABEL_S) + +#define UD_SEND_WQE_U32_36_PRIORITY_S 20 +#define UD_SEND_WQE_U32_36_PRIORITY_M \ + (((1UL << 4) - 1) << UD_SEND_WQE_U32_36_PRIORITY_S) + +#define UD_SEND_WQE_U32_36_SGID_INDEX_S 24 +#define UD_SEND_WQE_U32_36_SGID_INDEX_M \ + (((1UL << 8) - 1) << UD_SEND_WQE_U32_36_SGID_INDEX_S) + +#define UD_SEND_WQE_U32_40_HOP_LIMIT_S 0 +#define UD_SEND_WQE_U32_40_HOP_LIMIT_M \ + (((1UL << 8) - 1) << UD_SEND_WQE_U32_40_HOP_LIMIT_S) + +#define UD_SEND_WQE_U32_40_TRAFFIC_CLASS_S 8 +#define UD_SEND_WQE_U32_40_TRAFFIC_CLASS_M \ + (((1UL << 8) - 1) << UD_SEND_WQE_U32_40_TRAFFIC_CLASS_S) + +struct hns_roce_sqp_context { + __le32 qp1c_bytes_4; + __le32 sq_rq_bt_l; + __le32 qp1c_bytes_12; + __le32 qp1c_bytes_16; + __le32 qp1c_bytes_20; + __le32 cur_rq_wqe_ba_l; + __le32 qp1c_bytes_28; + __le32 qp1c_bytes_32; + __le32 cur_sq_wqe_ba_l; + __le32 qp1c_bytes_40; +}; + +#define QP1C_BYTES_4_QP_STATE_S 0 +#define QP1C_BYTES_4_QP_STATE_M \ + (((1UL << 3) - 1) << QP1C_BYTES_4_QP_STATE_S) + +#define QP1C_BYTES_4_SQ_WQE_SHIFT_S 8 +#define QP1C_BYTES_4_SQ_WQE_SHIFT_M \ + (((1UL << 4) - 1) << QP1C_BYTES_4_SQ_WQE_SHIFT_S) + +#define QP1C_BYTES_4_RQ_WQE_SHIFT_S 12 +#define QP1C_BYTES_4_RQ_WQE_SHIFT_M \ + (((1UL << 4) - 1) << QP1C_BYTES_4_RQ_WQE_SHIFT_S) + +#define QP1C_BYTES_4_PD_S 16 +#define QP1C_BYTES_4_PD_M (((1UL << 16) - 1) << QP1C_BYTES_4_PD_S) + +#define QP1C_BYTES_12_SQ_RQ_BT_H_S 0 +#define QP1C_BYTES_12_SQ_RQ_BT_H_M \ + (((1UL << 17) - 1) << QP1C_BYTES_12_SQ_RQ_BT_H_S) + +#define QP1C_BYTES_16_RQ_HEAD_S 0 +#define QP1C_BYTES_16_RQ_HEAD_M (((1UL << 15) - 1) << QP1C_BYTES_16_RQ_HEAD_S) + +#define QP1C_BYTES_16_PORT_NUM_S 16 +#define QP1C_BYTES_16_PORT_NUM_M \ + (((1UL << 3) - 1) << QP1C_BYTES_16_PORT_NUM_S) + +#define QP1C_BYTES_16_SIGNALING_TYPE_S 27 +#define QP1C_BYTES_16_LOCAL_ENABLE_E2E_CREDIT_S 28 +#define QP1C_BYTES_16_RQ_BA_FLG_S 29 +#define QP1C_BYTES_16_SQ_BA_FLG_S 30 +#define QP1C_BYTES_16_QP1_ERR_S 31 + +#define QP1C_BYTES_20_SQ_HEAD_S 0 +#define QP1C_BYTES_20_SQ_HEAD_M (((1UL << 15) - 1) << QP1C_BYTES_20_SQ_HEAD_S) + +#define QP1C_BYTES_20_PKEY_IDX_S 16 +#define QP1C_BYTES_20_PKEY_IDX_M \ + (((1UL << 16) - 1) << QP1C_BYTES_20_PKEY_IDX_S) + +#define QP1C_BYTES_28_CUR_RQ_WQE_BA_H_S 0 +#define QP1C_BYTES_28_CUR_RQ_WQE_BA_H_M \ + (((1UL << 5) - 1) << QP1C_BYTES_28_CUR_RQ_WQE_BA_H_S) + +#define QP1C_BYTES_28_RQ_CUR_IDX_S 16 +#define QP1C_BYTES_28_RQ_CUR_IDX_M \ + (((1UL << 15) - 1) << QP1C_BYTES_28_RQ_CUR_IDX_S) + +#define QP1C_BYTES_32_TX_CQ_NUM_S 0 +#define QP1C_BYTES_32_TX_CQ_NUM_M \ + (((1UL << 16) - 1) << QP1C_BYTES_32_TX_CQ_NUM_S) + +#define QP1C_BYTES_32_RX_CQ_NUM_S 16 +#define QP1C_BYTES_32_RX_CQ_NUM_M \ + (((1UL << 16) - 1) << QP1C_BYTES_32_RX_CQ_NUM_S) + +#define QP1C_BYTES_40_CUR_SQ_WQE_BA_H_S 0 +#define QP1C_BYTES_40_CUR_SQ_WQE_BA_H_M \ + (((1UL << 
5) - 1) << QP1C_BYTES_40_CUR_SQ_WQE_BA_H_S) + +#define QP1C_BYTES_40_SQ_CUR_IDX_S 16 +#define QP1C_BYTES_40_SQ_CUR_IDX_M \ + (((1UL << 15) - 1) << QP1C_BYTES_40_SQ_CUR_IDX_S) + +#define HNS_ROCE_WQE_INLINE (1UL<<31) +#define HNS_ROCE_WQE_SE (1UL<<30) + +#define HNS_ROCE_WQE_SGE_NUM_BIT 24 +#define HNS_ROCE_WQE_IMM (1UL<<23) +#define HNS_ROCE_WQE_FENCE (1UL<<21) +#define HNS_ROCE_WQE_CQ_NOTIFY (1UL<<20) + +#define HNS_ROCE_WQE_OPCODE_SEND (0<<16) +#define HNS_ROCE_WQE_OPCODE_RDMA_READ (1<<16) +#define HNS_ROCE_WQE_OPCODE_RDMA_WRITE (2<<16) +#define HNS_ROCE_WQE_OPCODE_LOCAL_INV (4<<16) +#define HNS_ROCE_WQE_OPCODE_UD_SEND (7<<16) +#define HNS_ROCE_WQE_OPCODE_MASK (15<<16) + +struct hns_roce_qp_context { + __le32 qpc_bytes_4; + __le32 qpc_bytes_8; + __le32 qpc_bytes_12; + __le32 qpc_bytes_16; + __le32 sq_rq_bt_l; + __le32 qpc_bytes_24; + __le32 irrl_ba_l; + __le32 qpc_bytes_32; + __le32 qpc_bytes_36; + __le32 dmac_l; + __le32 qpc_bytes_44; + __le32 qpc_bytes_48; + u8 dgid[16]; + __le32 qpc_bytes_68; + __le32 cur_rq_wqe_ba_l; + __le32 qpc_bytes_76; + __le32 rx_rnr_time; + __le32 qpc_bytes_84; + __le32 qpc_bytes_88; + union { + __le32 rx_sge_len; + __le32 dma_length; + }; + union { + __le32 rx_sge_num; + __le32 rx_send_pktn; + __le32 r_key; + }; + __le32 va_l; + __le32 va_h; + __le32 qpc_bytes_108; + __le32 qpc_bytes_112; + __le32 rx_cur_sq_wqe_ba_l; + __le32 qpc_bytes_120; + __le32 qpc_bytes_124; + __le32 qpc_bytes_128; + __le32 qpc_bytes_132; + __le32 qpc_bytes_136; + __le32 qpc_bytes_140; + __le32 qpc_bytes_144; + __le32 qpc_bytes_148; + union { + __le32 rnr_retry; + __le32 ack_time; + }; + __le32 qpc_bytes_156; + __le32 pkt_use_len; + __le32 qpc_bytes_164; + __le32 qpc_bytes_168; + union { + __le32 sge_use_len; + __le32 pa_use_len; + }; + __le32 qpc_bytes_176; + __le32 qpc_bytes_180; + __le32 tx_cur_sq_wqe_ba_l; + __le32 qpc_bytes_188; + __le32 rvd21; +}; + +#define QP_CONTEXT_QPC_BYTES_4_TRANSPORT_SERVICE_TYPE_S 0 +#define QP_CONTEXT_QPC_BYTES_4_TRANSPORT_SERVICE_TYPE_M \ + (((1UL << 3) - 1) << QP_CONTEXT_QPC_BYTES_4_TRANSPORT_SERVICE_TYPE_S) + +#define QP_CONTEXT_QPC_BYTE_4_ENABLE_FPMR_S 3 +#define QP_CONTEXT_QPC_BYTE_4_RDMA_READ_ENABLE_S 4 +#define QP_CONTEXT_QPC_BYTE_4_RDMA_WRITE_ENABLE_S 5 +#define QP_CONTEXT_QPC_BYTE_4_ATOMIC_OPERATION_ENABLE_S 6 +#define QP_CONTEXT_QPC_BYTE_4_RDMAR_USE_S 7 + +#define QP_CONTEXT_QPC_BYTES_4_SQ_WQE_SHIFT_S 8 +#define QP_CONTEXT_QPC_BYTES_4_SQ_WQE_SHIFT_M \ + (((1UL << 4) - 1) << QP_CONTEXT_QPC_BYTES_4_SQ_WQE_SHIFT_S) + +#define QP_CONTEXT_QPC_BYTES_4_RQ_WQE_SHIFT_S 12 +#define QP_CONTEXT_QPC_BYTES_4_RQ_WQE_SHIFT_M \ + (((1UL << 4) - 1) << QP_CONTEXT_QPC_BYTES_4_RQ_WQE_SHIFT_S) + +#define QP_CONTEXT_QPC_BYTES_4_PD_S 16 +#define QP_CONTEXT_QPC_BYTES_4_PD_M \ + (((1UL << 16) - 1) << QP_CONTEXT_QPC_BYTES_4_PD_S) + +#define QP_CONTEXT_QPC_BYTES_8_TX_COMPLETION_S 0 +#define QP_CONTEXT_QPC_BYTES_8_TX_COMPLETION_M \ + (((1UL << 16) - 1) << QP_CONTEXT_QPC_BYTES_8_TX_COMPLETION_S) + +#define QP_CONTEXT_QPC_BYTES_8_RX_COMPLETION_S 16 +#define QP_CONTEXT_QPC_BYTES_8_RX_COMPLETION_M \ + (((1UL << 16) - 1) << QP_CONTEXT_QPC_BYTES_8_RX_COMPLETION_S) + +#define QP_CONTEXT_QPC_BYTES_12_SRQ_NUMBER_S 0 +#define QP_CONTEXT_QPC_BYTES_12_SRQ_NUMBER_M \ + (((1UL << 16) - 1) << QP_CONTEXT_QPC_BYTES_12_SRQ_NUMBER_S) + +#define QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_S 16 +#define QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_M \ + (((1UL << 16) - 1) << QP_CONTEXT_QPC_BYTES_12_P_KEY_INDEX_S) + +#define QP_CONTEXT_QPC_BYTES_16_QP_NUM_S 0 +#define QP_CONTEXT_QPC_BYTES_16_QP_NUM_M \ + (((1UL 
<< 24) - 1) << QP_CONTEXT_QPC_BYTES_16_QP_NUM_S) + +#define QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_S 0 +#define QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_M \ + (((1UL << 17) - 1) << QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_S) + +#define QP_CONTEXT_QPC_BYTES_24_MINIMUM_RNR_NAK_TIMER_S 18 +#define QP_CONTEXT_QPC_BYTES_24_MINIMUM_RNR_NAK_TIMER_M \ + (((1UL << 5) - 1) << QP_CONTEXT_QPC_BYTES_24_MINIMUM_RNR_NAK_TIMER_S) + +#define QP_CONTEXT_QPC_BYTE_24_REMOTE_ENABLE_E2E_CREDITS_S 23 + +#define QP_CONTEXT_QPC_BYTES_32_IRRL_BA_H_S 0 +#define QP_CONTEXT_QPC_BYTES_32_IRRL_BA_H_M \ + (((1UL << 17) - 1) << QP_CONTEXT_QPC_BYTES_32_IRRL_BA_H_S) + +#define QP_CONTEXT_QPC_BYTES_32_MIG_STATE_S 18 +#define QP_CONTEXT_QPC_BYTES_32_MIG_STATE_M \ + (((1UL << 2) - 1) << QP_CONTEXT_QPC_BYTES_32_MIG_STATE_S) + +#define QP_CONTEXT_QPC_BYTE_32_LOCAL_ENABLE_E2E_CREDITS_S 20 +#define QP_CONTEXT_QPC_BYTE_32_SIGNALING_TYPE_S 21 +#define QP_CONTEXT_QPC_BYTE_32_LOOPBACK_INDICATOR_S 22 +#define QP_CONTEXT_QPC_BYTE_32_GLOBAL_HEADER_S 23 + +#define QP_CONTEXT_QPC_BYTES_32_RESPONDER_RESOURCES_S 24 +#define QP_CONTEXT_QPC_BYTES_32_RESPONDER_RESOURCES_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_32_RESPONDER_RESOURCES_S) + +#define QP_CONTEXT_QPC_BYTES_36_DEST_QP_S 0 +#define QP_CONTEXT_QPC_BYTES_36_DEST_QP_M \ + (((1UL << 24) - 1) << QP_CONTEXT_QPC_BYTES_36_DEST_QP_S) + +#define QP_CONTEXT_QPC_BYTES_36_SGID_INDEX_S 24 +#define QP_CONTEXT_QPC_BYTES_36_SGID_INDEX_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_36_SGID_INDEX_S) + +#define QP_CONTEXT_QPC_BYTES_44_DMAC_H_S 0 +#define QP_CONTEXT_QPC_BYTES_44_DMAC_H_M \ + (((1UL << 16) - 1) << QP_CONTEXT_QPC_BYTES_44_DMAC_H_S) + +#define QP_CONTEXT_QPC_BYTES_44_MAXIMUM_STATIC_RATE_S 16 +#define QP_CONTEXT_QPC_BYTES_44_MAXIMUM_STATIC_RATE_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_44_MAXIMUM_STATIC_RATE_S) + +#define QP_CONTEXT_QPC_BYTES_44_HOPLMT_S 24 +#define QP_CONTEXT_QPC_BYTES_44_HOPLMT_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_44_HOPLMT_S) + +#define QP_CONTEXT_QPC_BYTES_48_FLOWLABEL_S 0 +#define QP_CONTEXT_QPC_BYTES_48_FLOWLABEL_M \ + (((1UL << 20) - 1) << QP_CONTEXT_QPC_BYTES_48_FLOWLABEL_S) + +#define QP_CONTEXT_QPC_BYTES_48_TCLASS_S 20 +#define QP_CONTEXT_QPC_BYTES_48_TCLASS_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_48_TCLASS_S) + +#define QP_CONTEXT_QPC_BYTES_48_MTU_S 28 +#define QP_CONTEXT_QPC_BYTES_48_MTU_M \ + (((1UL << 4) - 1) << QP_CONTEXT_QPC_BYTES_48_MTU_S) + +#define QP_CONTEXT_QPC_BYTES_68_RQ_HEAD_S 0 +#define QP_CONTEXT_QPC_BYTES_68_RQ_HEAD_M \ + (((1UL << 15) - 1) << QP_CONTEXT_QPC_BYTES_68_RQ_HEAD_S) + +#define QP_CONTEXT_QPC_BYTES_68_RQ_CUR_INDEX_S 16 +#define QP_CONTEXT_QPC_BYTES_68_RQ_CUR_INDEX_M \ + (((1UL << 15) - 1) << QP_CONTEXT_QPC_BYTES_68_RQ_CUR_INDEX_S) + +#define QP_CONTEXT_QPC_BYTES_76_CUR_RQ_WQE_BA_H_S 0 +#define QP_CONTEXT_QPC_BYTES_76_CUR_RQ_WQE_BA_H_M \ + (((1UL << 5) - 1) << QP_CONTEXT_QPC_BYTES_76_CUR_RQ_WQE_BA_H_S) + +#define QP_CONTEXT_QPC_BYTES_76_RX_REQ_MSN_S 8 +#define QP_CONTEXT_QPC_BYTES_76_RX_REQ_MSN_M \ + (((1UL << 24) - 1) << QP_CONTEXT_QPC_BYTES_76_RX_REQ_MSN_S) + +#define QP_CONTEXT_QPC_BYTES_84_LAST_ACK_PSN_S 0 +#define QP_CONTEXT_QPC_BYTES_84_LAST_ACK_PSN_M \ + (((1UL << 24) - 1) << QP_CONTEXT_QPC_BYTES_84_LAST_ACK_PSN_S) + +#define QP_CONTEXT_QPC_BYTES_84_TRRL_HEAD_S 24 +#define QP_CONTEXT_QPC_BYTES_84_TRRL_HEAD_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_84_TRRL_HEAD_S) + +#define QP_CONTEXT_QPC_BYTES_88_RX_REQ_EPSN_S 0 +#define QP_CONTEXT_QPC_BYTES_88_RX_REQ_EPSN_M \ + (((1UL << 24) - 1) << 
QP_CONTEXT_QPC_BYTES_88_RX_REQ_EPSN_S) + +#define QP_CONTEXT_QPC_BYTES_88_RX_REQ_PSN_ERR_FLAG_S 24 +#define QP_CONTEXT_QPC_BYTES_88_RX_LAST_OPCODE_FLG_S 25 + +#define QP_CONTEXT_QPC_BYTES_88_RQ_REQ_LAST_OPERATION_TYPE_S 26 +#define QP_CONTEXT_QPC_BYTES_88_RQ_REQ_LAST_OPERATION_TYPE_M \ + (((1UL << 2) - 1) << \ + QP_CONTEXT_QPC_BYTES_88_RQ_REQ_LAST_OPERATION_TYPE_S) + +#define QP_CONTEXT_QPC_BYTES_88_RQ_REQ_RDMA_WR_FLAG_S 29 +#define QP_CONTEXT_QPC_BYTES_88_RQ_REQ_RDMA_WR_FLAG_M \ + (((1UL << 2) - 1) << QP_CONTEXT_QPC_BYTES_88_RQ_REQ_RDMA_WR_FLAG_S) + +#define QP_CONTEXT_QPC_BYTES_108_TRRL_SDB_PSN_S 0 +#define QP_CONTEXT_QPC_BYTES_108_TRRL_SDB_PSN_M \ + (((1UL << 24) - 1) << QP_CONTEXT_QPC_BYTES_108_TRRL_SDB_PSN_S) + +#define QP_CONTEXT_QPC_BYTES_108_TRRL_SDB_PSN_FLG_S 24 +#define QP_CONTEXT_QPC_BYTES_108_TRRL_TDB_PSN_FLG_S 25 + +#define QP_CONTEXT_QPC_BYTES_112_TRRL_TDB_PSN_S 0 +#define QP_CONTEXT_QPC_BYTES_112_TRRL_TDB_PSN_M \ + (((1UL << 24) - 1) << QP_CONTEXT_QPC_BYTES_112_TRRL_TDB_PSN_S) + +#define QP_CONTEXT_QPC_BYTES_112_TRRL_TAIL_S 24 +#define QP_CONTEXT_QPC_BYTES_112_TRRL_TAIL_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_112_TRRL_TAIL_S) + +#define QP_CONTEXT_QPC_BYTES_120_RX_CUR_SQ_WQE_BA_H_S 0 +#define QP_CONTEXT_QPC_BYTES_120_RX_CUR_SQ_WQE_BA_H_M \ + (((1UL << 5) - 1) << QP_CONTEXT_QPC_BYTES_120_RX_CUR_SQ_WQE_BA_H_S) + +#define QP_CONTEXT_QPC_BYTES_124_RX_ACK_MSN_S 0 +#define QP_CONTEXT_QPC_BYTES_124_RX_ACK_MSN_M \ + (((1UL << 15) - 1) << QP_CONTEXT_QPC_BYTES_124_RX_ACK_MSN_S) + +#define QP_CONTEXT_QPC_BYTES_124_IRRL_MSG_IDX_S 16 +#define QP_CONTEXT_QPC_BYTES_124_IRRL_MSG_IDX_M \ + (((1UL << 15) - 1) << QP_CONTEXT_QPC_BYTES_124_IRRL_MSG_IDX_S) + +#define QP_CONTEXT_QPC_BYTES_128_RX_ACK_EPSN_S 0 +#define QP_CONTEXT_QPC_BYTES_128_RX_ACK_EPSN_M \ + (((1UL << 24) - 1) << QP_CONTEXT_QPC_BYTES_128_RX_ACK_EPSN_S) + +#define QP_CONTEXT_QPC_BYTES_128_RX_ACK_PSN_ERR_FLG_S 24 + +#define QP_CONTEXT_QPC_BYTES_128_ACK_LAST_OPERATION_TYPE_S 25 +#define QP_CONTEXT_QPC_BYTES_128_ACK_LAST_OPERATION_TYPE_M \ + (((1UL << 2) - 1) << QP_CONTEXT_QPC_BYTES_128_ACK_LAST_OPERATION_TYPE_S) + +#define QP_CONTEXT_QPC_BYTES_128_IRRL_PSN_VLD_FLG_S 27 + +#define QP_CONTEXT_QPC_BYTES_132_IRRL_PSN_S 0 +#define QP_CONTEXT_QPC_BYTES_132_IRRL_PSN_M \ + (((1UL << 24) - 1) << QP_CONTEXT_QPC_BYTES_132_IRRL_PSN_S) + +#define QP_CONTEXT_QPC_BYTES_132_IRRL_TAIL_S 24 +#define QP_CONTEXT_QPC_BYTES_132_IRRL_TAIL_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_132_IRRL_TAIL_S) + +#define QP_CONTEXT_QPC_BYTES_136_RETRY_MSG_PSN_S 0 +#define QP_CONTEXT_QPC_BYTES_136_RETRY_MSG_PSN_M \ + (((1UL << 24) - 1) << QP_CONTEXT_QPC_BYTES_136_RETRY_MSG_PSN_S) + +#define QP_CONTEXT_QPC_BYTES_136_RETRY_MSG_FPKT_PSN_L_S 24 +#define QP_CONTEXT_QPC_BYTES_136_RETRY_MSG_FPKT_PSN_L_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_136_RETRY_MSG_FPKT_PSN_L_S) + +#define QP_CONTEXT_QPC_BYTES_140_RETRY_MSG_FPKT_PSN_H_S 0 +#define QP_CONTEXT_QPC_BYTES_140_RETRY_MSG_FPKT_PSN_H_M \ + (((1UL << 16) - 1) << QP_CONTEXT_QPC_BYTES_140_RETRY_MSG_FPKT_PSN_H_S) + +#define QP_CONTEXT_QPC_BYTES_140_RETRY_MSG_MSN_S 16 +#define QP_CONTEXT_QPC_BYTES_140_RETRY_MSG_MSN_M \ + (((1UL << 15) - 1) << QP_CONTEXT_QPC_BYTES_140_RETRY_MSG_MSN_S) + +#define QP_CONTEXT_QPC_BYTES_140_RNR_RETRY_FLG_S 31 + +#define QP_CONTEXT_QPC_BYTES_144_QP_STATE_S 0 +#define QP_CONTEXT_QPC_BYTES_144_QP_STATE_M \ + (((1UL << 3) - 1) << QP_CONTEXT_QPC_BYTES_144_QP_STATE_S) + +#define QP_CONTEXT_QPC_BYTES_148_CHECK_FLAG_S 0 +#define QP_CONTEXT_QPC_BYTES_148_CHECK_FLAG_M \ + (((1UL << 2) - 1) << 
QP_CONTEXT_QPC_BYTES_148_CHECK_FLAG_S) + +#define QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_S 2 +#define QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_M \ + (((1UL << 3) - 1) << QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_S) + +#define QP_CONTEXT_QPC_BYTES_148_RNR_RETRY_COUNT_S 5 +#define QP_CONTEXT_QPC_BYTES_148_RNR_RETRY_COUNT_M \ + (((1UL << 3) - 1) << QP_CONTEXT_QPC_BYTES_148_RNR_RETRY_COUNT_S) + +#define QP_CONTEXT_QPC_BYTES_148_LSN_S 8 +#define QP_CONTEXT_QPC_BYTES_148_LSN_M \ + (((1UL << 16) - 1) << QP_CONTEXT_QPC_BYTES_148_LSN_S) + +#define QP_CONTEXT_QPC_BYTES_156_RETRY_COUNT_INIT_S 0 +#define QP_CONTEXT_QPC_BYTES_156_RETRY_COUNT_INIT_M \ + (((1UL << 3) - 1) << QP_CONTEXT_QPC_BYTES_156_RETRY_COUNT_INIT_S) + +#define QP_CONTEXT_QPC_BYTES_156_ACK_TIMEOUT_S 3 +#define QP_CONTEXT_QPC_BYTES_156_ACK_TIMEOUT_M \ + (((1UL << 5) - 1) << QP_CONTEXT_QPC_BYTES_156_ACK_TIMEOUT_S) + +#define QP_CONTEXT_QPC_BYTES_156_RNR_RETRY_COUNT_INIT_S 8 +#define QP_CONTEXT_QPC_BYTES_156_RNR_RETRY_COUNT_INIT_M \ + (((1UL << 3) - 1) << QP_CONTEXT_QPC_BYTES_156_RNR_RETRY_COUNT_INIT_S) + +#define QP_CONTEXT_QPC_BYTES_156_PORT_NUM_S 11 +#define QP_CONTEXT_QPC_BYTES_156_PORT_NUM_M \ + (((1UL << 3) - 1) << QP_CONTEXT_QPC_BYTES_156_PORT_NUM_S) + +#define QP_CONTEXT_QPC_BYTES_156_SL_S 14 +#define QP_CONTEXT_QPC_BYTES_156_SL_M \ + (((1UL << 2) - 1) << QP_CONTEXT_QPC_BYTES_156_SL_S) + +#define QP_CONTEXT_QPC_BYTES_156_INITIATOR_DEPTH_S 16 +#define QP_CONTEXT_QPC_BYTES_156_INITIATOR_DEPTH_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_156_INITIATOR_DEPTH_S) + +#define QP_CONTEXT_QPC_BYTES_156_ACK_REQ_IND_S 24 +#define QP_CONTEXT_QPC_BYTES_156_ACK_REQ_IND_M \ + (((1UL << 2) - 1) << QP_CONTEXT_QPC_BYTES_156_ACK_REQ_IND_S) + +#define QP_CONTEXT_QPC_BYTES_164_SQ_PSN_S 0 +#define QP_CONTEXT_QPC_BYTES_164_SQ_PSN_M \ + (((1UL << 24) - 1) << QP_CONTEXT_QPC_BYTES_164_SQ_PSN_S) + +#define QP_CONTEXT_QPC_BYTES_164_IRRL_HEAD_S 24 +#define QP_CONTEXT_QPC_BYTES_164_IRRL_HEAD_M \ + (((1UL << 8) - 1) << QP_CONTEXT_QPC_BYTES_164_IRRL_HEAD_S) + +#define QP_CONTEXT_QPC_BYTES_168_RETRY_SQ_PSN_S 0 +#define QP_CONTEXT_QPC_BYTES_168_RETRY_SQ_PSN_M \ + (((1UL << 24) - 1) << QP_CONTEXT_QPC_BYTES_168_RETRY_SQ_PSN_S) + +#define QP_CONTEXT_QPC_BYTES_168_SGE_USE_FLA_S 24 +#define QP_CONTEXT_QPC_BYTES_168_SGE_USE_FLA_M \ + (((1UL << 2) - 1) << QP_CONTEXT_QPC_BYTES_168_SGE_USE_FLA_S) + +#define QP_CONTEXT_QPC_BYTES_168_DB_TYPE_S 26 +#define QP_CONTEXT_QPC_BYTES_168_DB_TYPE_M \ + (((1UL << 2) - 1) << QP_CONTEXT_QPC_BYTES_168_DB_TYPE_S) + +#define QP_CONTEXT_QPC_BYTES_168_MSG_LP_IND_S 28 +#define QP_CONTEXT_QPC_BYTES_168_CSDB_LP_IND_S 29 +#define QP_CONTEXT_QPC_BYTES_168_QP_ERR_FLG_S 30 + +#define QP_CONTEXT_QPC_BYTES_176_DB_CUR_INDEX_S 0 +#define QP_CONTEXT_QPC_BYTES_176_DB_CUR_INDEX_M \ + (((1UL << 15) - 1) << QP_CONTEXT_QPC_BYTES_176_DB_CUR_INDEX_S) + +#define QP_CONTEXT_QPC_BYTES_176_RETRY_DB_CUR_INDEX_S 16 +#define QP_CONTEXT_QPC_BYTES_176_RETRY_DB_CUR_INDEX_M \ + (((1UL << 15) - 1) << QP_CONTEXT_QPC_BYTES_176_RETRY_DB_CUR_INDEX_S) + +#define QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_S 0 +#define QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_M \ + (((1UL << 15) - 1) << QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_S) + +#define QP_CONTEXT_QPC_BYTES_180_SQ_CUR_INDEX_S 16 +#define QP_CONTEXT_QPC_BYTES_180_SQ_CUR_INDEX_M \ + (((1UL << 15) - 1) << QP_CONTEXT_QPC_BYTES_180_SQ_CUR_INDEX_S) + +#define QP_CONTEXT_QPC_BYTES_188_TX_CUR_SQ_WQE_BA_H_S 0 +#define QP_CONTEXT_QPC_BYTES_188_TX_CUR_SQ_WQE_BA_H_M \ + (((1UL << 5) - 1) << QP_CONTEXT_QPC_BYTES_188_TX_CUR_SQ_WQE_BA_H_S) + +#define 
QP_CONTEXT_QPC_BYTES_188_PKT_RETRY_FLG_S 8 + +#define QP_CONTEXT_QPC_BYTES_188_TX_RETRY_CUR_INDEX_S 16 +#define QP_CONTEXT_QPC_BYTES_188_TX_RETRY_CUR_INDEX_M \ + (((1UL << 15) - 1) << QP_CONTEXT_QPC_BYTES_188_TX_RETRY_CUR_INDEX_S) + +#define STATUS_MASK 0xff +#define GO_BIT_TIMEOUT_MSECS 10000 +#define HCR_STATUS_OFFSET 0x18 +#define HCR_GO_BIT 15 + +struct hns_roce_rq_db { + __le32 u32_4; + __le32 u32_8; +}; + +#define RQ_DOORBELL_U32_4_RQ_HEAD_S 0 +#define RQ_DOORBELL_U32_4_RQ_HEAD_M \ + (((1UL << 15) - 1) << RQ_DOORBELL_U32_4_RQ_HEAD_S) + +#define RQ_DOORBELL_U32_8_QPN_S 0 +#define RQ_DOORBELL_U32_8_QPN_M (((1UL << 24) - 1) << RQ_DOORBELL_U32_8_QPN_S) + +#define RQ_DOORBELL_U32_8_CMD_S 28 +#define RQ_DOORBELL_U32_8_CMD_M (((1UL << 3) - 1) << RQ_DOORBELL_U32_8_CMD_S) + +#define RQ_DOORBELL_U32_8_HW_SYNC_S 31 + +struct hns_roce_sq_db { + __le32 u32_4; + __le32 u32_8; +}; + +#define SQ_DOORBELL_U32_4_SQ_HEAD_S 0 +#define SQ_DOORBELL_U32_4_SQ_HEAD_M \ + (((1UL << 15) - 1) << SQ_DOORBELL_U32_4_SQ_HEAD_S) + +#define SQ_DOORBELL_U32_4_SL_S 16 +#define SQ_DOORBELL_U32_4_SL_M \ + (((1UL << 2) - 1) << SQ_DOORBELL_U32_4_SL_S) + +#define SQ_DOORBELL_U32_4_PORT_S 18 +#define SQ_DOORBELL_U32_4_PORT_M (((1UL << 3) - 1) << SQ_DOORBELL_U32_4_PORT_S) + +#define SQ_DOORBELL_U32_8_QPN_S 0 +#define SQ_DOORBELL_U32_8_QPN_M (((1UL << 24) - 1) << SQ_DOORBELL_U32_8_QPN_S) + +#define SQ_DOORBELL_HW_SYNC_S 31 + +struct hns_roce_ext_db { + int esdb_dep; + int eodb_dep; + struct hns_roce_buf_list *sdb_buf_list; + struct hns_roce_buf_list *odb_buf_list; +}; + +struct hns_roce_db_table { + int sdb_ext_mod; + int odb_ext_mod; + struct hns_roce_ext_db *ext_db; +}; + +struct hns_roce_bt_table { + struct hns_roce_buf_list qpc_buf; + struct hns_roce_buf_list mtpt_buf; + struct hns_roce_buf_list cqc_buf; +}; + +struct hns_roce_tptr_table { + struct hns_roce_buf_list tptr_buf; +}; + +struct hns_roce_qp_work { + struct work_struct work; + struct ib_device *ib_dev; + struct hns_roce_qp *qp; + u32 db_wait_stage; + u32 sdb_issue_ptr; + u32 sdb_inv_cnt; + u32 sche_cnt; +}; + +struct hns_roce_des_qp { + struct workqueue_struct *qp_wq; + int requeue_flag; +}; + +struct hns_roce_mr_free_work { + struct work_struct work; + struct ib_device *ib_dev; + struct completion *comp; + int comp_flag; + void *mr; +}; + +struct hns_roce_recreate_lp_qp_work { + struct work_struct work; + struct ib_device *ib_dev; + struct completion *comp; + int comp_flag; +}; + +struct hns_roce_free_mr { + struct workqueue_struct *free_mr_wq; + struct hns_roce_qp *mr_free_qp[HNS_ROCE_V1_RESV_QP]; + struct hns_roce_cq *mr_free_cq; + struct hns_roce_pd *mr_free_pd; +}; + +struct hns_roce_v1_priv { + struct hns_roce_db_table db_table; + struct hns_roce_raq_table raq_table; + struct hns_roce_bt_table bt_table; + struct hns_roce_tptr_table tptr_table; + struct hns_roce_des_qp des_qp; + struct hns_roce_free_mr free_mr; +}; + +int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, bool dereset); +int hns_roce_v1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int hns_roce_v1_destroy_qp(struct ib_qp *ibqp); + +#endif diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c new file mode 100644 index 0000000..1f0965b --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -0,0 +1,4836 @@ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "hnae3.h" +#include "hns_roce_common.h" +#include "hns_roce_device.h" +#include "hns_roce_cmd.h" +#include "hns_roce_hem.h" +#include "hns_roce_hw_v2.h" + +static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, + struct ib_sge *sg) +{ + dseg->lkey = cpu_to_le32(sg->lkey); + dseg->addr = cpu_to_le64(sg->addr); + dseg->len = cpu_to_le32(sg->length); +} + +static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, + void *wqe, unsigned int *sge_ind, + struct ib_send_wr **bad_wr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_v2_wqe_data_seg *dseg = wqe; + struct hns_roce_qp *qp = to_hr_qp(ibqp); + int i; + + if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) { + if (le32_to_cpu(rc_sq_wqe->msg_len) > + hr_dev->caps.max_sq_inline) { + *bad_wr = wr; + dev_err(hr_dev->dev, "inline len(1-%d)=%d, illegal", + rc_sq_wqe->msg_len, hr_dev->caps.max_sq_inline); + return -EINVAL; + } + + if (wr->opcode == IB_WR_RDMA_READ) { + dev_err(hr_dev->dev, "Not support inline data!\n"); + return -EINVAL; + } + + for (i = 0; i < wr->num_sge; i++) { + memcpy(wqe, ((void *)wr->sg_list[i].addr), + wr->sg_list[i].length); + wqe += wr->sg_list[i].length; + } + + roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_INLINE_S, + 1); + } else { + if (wr->num_sge <= 2) { + for (i = 0; i < wr->num_sge; i++) { + if (likely(wr->sg_list[i].length)) { + set_data_seg_v2(dseg, wr->sg_list + i); + dseg++; + } + } + } else { + roce_set_field(rc_sq_wqe->byte_20, + V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M, + V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, + (*sge_ind) & (qp->sge.sge_cnt - 1)); + + for (i = 0; i < 2; i++) { + if (likely(wr->sg_list[i].length)) { + set_data_seg_v2(dseg, wr->sg_list + i); + dseg++; + } + } + + dseg = get_send_extend_sge(qp, + (*sge_ind) & (qp->sge.sge_cnt - 1)); + + for (i = 0; i < wr->num_sge - 2; i++) { + if (likely(wr->sg_list[i + 2].length)) { + set_data_seg_v2(dseg, + wr->sg_list + 2 + i); + dseg++; + (*sge_ind)++; + } + } + } + + roce_set_field(rc_sq_wqe->byte_16, + V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, + V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, wr->num_sge); + } + + return 0; +} + +static int 
hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah); + struct hns_roce_v2_ud_send_wqe *ud_sq_wqe; + struct hns_roce_v2_rc_send_wqe *rc_sq_wqe; + struct hns_roce_qp *qp = to_hr_qp(ibqp); + struct hns_roce_v2_wqe_data_seg *dseg; + struct device *dev = hr_dev->dev; + struct hns_roce_v2_db sq_db; + unsigned int sge_ind = 0; + unsigned int owner_bit; + unsigned long flags; + unsigned int ind; + void *wqe = NULL; + bool loopback; + u32 tmp_len; + int ret = 0; + u8 *smac; + int nreq; + int i; + + if (unlikely(ibqp->qp_type != IB_QPT_RC && + ibqp->qp_type != IB_QPT_GSI && + ibqp->qp_type != IB_QPT_UD)) { + dev_err(dev, "Not supported QP(0x%x)type!\n", ibqp->qp_type); + *bad_wr = wr; + return -EOPNOTSUPP; + } + + if (unlikely(qp->state == IB_QPS_RESET || qp->state == IB_QPS_INIT || + qp->state == IB_QPS_RTR)) { + dev_err(dev, "Post WQE fail, QP state %d err!\n", qp->state); + *bad_wr = wr; + return -EINVAL; + } + + spin_lock_irqsave(&qp->sq.lock, flags); + ind = qp->sq_next_wqe; + sge_ind = qp->next_sge; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (hns_roce_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + ret = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + dev_err(dev, "num_sge=%d > qp->sq.max_gs=%d\n", + wr->num_sge, qp->sq.max_gs); + ret = -EINVAL; + *bad_wr = wr; + goto out; + } + + wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = + wr->wr_id; + + owner_bit = + ~(((qp->sq.head + nreq) >> ilog2(qp->sq.wqe_cnt)) & 0x1); + tmp_len = 0; + + /* Corresponding to the QP type, wqe process separately */ + if (ibqp->qp_type == IB_QPT_GSI) { + ud_sq_wqe = wqe; + memset(ud_sq_wqe, 0, sizeof(*ud_sq_wqe)); + + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_0_M, + V2_UD_SEND_WQE_DMAC_0_S, ah->av.mac[0]); + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_1_M, + V2_UD_SEND_WQE_DMAC_1_S, ah->av.mac[1]); + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_2_M, + V2_UD_SEND_WQE_DMAC_2_S, ah->av.mac[2]); + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_3_M, + V2_UD_SEND_WQE_DMAC_3_S, ah->av.mac[3]); + roce_set_field(ud_sq_wqe->byte_48, + V2_UD_SEND_WQE_BYTE_48_DMAC_4_M, + V2_UD_SEND_WQE_BYTE_48_DMAC_4_S, + ah->av.mac[4]); + roce_set_field(ud_sq_wqe->byte_48, + V2_UD_SEND_WQE_BYTE_48_DMAC_5_M, + V2_UD_SEND_WQE_BYTE_48_DMAC_5_S, + ah->av.mac[5]); + + /* MAC loopback */ + smac = (u8 *)hr_dev->dev_addr[qp->port]; + loopback = ether_addr_equal_unaligned(ah->av.mac, + smac) ? 1 : 0; + + roce_set_bit(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_LBI_S, loopback); + + roce_set_field(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_OPCODE_M, + V2_UD_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_SEND); + + for (i = 0; i < wr->num_sge; i++) + tmp_len += wr->sg_list[i].length; + + ud_sq_wqe->msg_len = + cpu_to_le32(le32_to_cpu(ud_sq_wqe->msg_len) + tmp_len); + + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + ud_sq_wqe->immtdata = wr->ex.imm_data; + break; + default: + ud_sq_wqe->immtdata = 0; + break; + } + + /* Set sig attr */ + roce_set_bit(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_CQE_S, + (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0); + + /* Set se attr */ + roce_set_bit(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_SE_S, + (wr->send_flags & IB_SEND_SOLICITED) ? 
1 : 0); + + roce_set_bit(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_OWNER_S, owner_bit); + + roce_set_field(ud_sq_wqe->byte_16, + V2_UD_SEND_WQE_BYTE_16_PD_M, + V2_UD_SEND_WQE_BYTE_16_PD_S, + to_hr_pd(ibqp->pd)->pdn); + + roce_set_field(ud_sq_wqe->byte_16, + V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M, + V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S, + wr->num_sge); + + roce_set_field(ud_sq_wqe->byte_20, + V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M, + V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, + sge_ind & (qp->sge.sge_cnt - 1)); + + roce_set_field(ud_sq_wqe->byte_24, + V2_UD_SEND_WQE_BYTE_24_UDPSPN_M, + V2_UD_SEND_WQE_BYTE_24_UDPSPN_S, 0); + ud_sq_wqe->qkey = + cpu_to_le32(ud_wr(wr)->remote_qkey & 0x80000000 ? + qp->qkey : ud_wr(wr)->remote_qkey); + roce_set_field(ud_sq_wqe->byte_32, + V2_UD_SEND_WQE_BYTE_32_DQPN_M, + V2_UD_SEND_WQE_BYTE_32_DQPN_S, + ud_wr(wr)->remote_qpn); + + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_VLAN_M, + V2_UD_SEND_WQE_BYTE_36_VLAN_S, + le16_to_cpu(ah->av.vlan)); + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M, + V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S, + ah->av.hop_limit); + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_TCLASS_M, + V2_UD_SEND_WQE_BYTE_36_TCLASS_S, + 0); + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_TCLASS_M, + V2_UD_SEND_WQE_BYTE_36_TCLASS_S, + 0); + roce_set_field(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M, + V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S, 0); + roce_set_field(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_SL_M, + V2_UD_SEND_WQE_BYTE_40_SL_S, + le32_to_cpu(ah->av.sl_tclass_flowlabel) >> + HNS_ROCE_SL_SHIFT); + roce_set_field(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_PORTN_M, + V2_UD_SEND_WQE_BYTE_40_PORTN_S, + qp->port); + + roce_set_field(ud_sq_wqe->byte_48, + V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M, + V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S, + hns_get_gid_index(hr_dev, qp->phy_port, + ah->av.gid_index)); + + memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0], + GID_LEN_V2); + + dseg = get_send_extend_sge(qp, + sge_ind & (qp->sge.sge_cnt - 1)); + for (i = 0; i < wr->num_sge; i++) { + set_data_seg_v2(dseg + i, wr->sg_list + i); + sge_ind++; + } + + ind++; + } else if (ibqp->qp_type == IB_QPT_RC) { + rc_sq_wqe = wqe; + memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe)); + for (i = 0; i < wr->num_sge; i++) + tmp_len += wr->sg_list[i].length; + + rc_sq_wqe->msg_len = + cpu_to_le32(le32_to_cpu(rc_sq_wqe->msg_len) + tmp_len); + + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + rc_sq_wqe->immtdata = wr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + rc_sq_wqe->inv_key = + cpu_to_le32(wr->ex.invalidate_rkey); + break; + default: + rc_sq_wqe->immtdata = 0; + break; + } + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_FENCE_S, + (wr->send_flags & IB_SEND_FENCE) ? 1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_SE_S, + (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_CQE_S, + (wr->send_flags & IB_SEND_SIGNALED) ? 
1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OWNER_S, owner_bit); + + switch (wr->opcode) { + case IB_WR_RDMA_READ: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_RDMA_READ); + rc_sq_wqe->rkey = + cpu_to_le32(rdma_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(rdma_wr(wr)->remote_addr); + break; + case IB_WR_RDMA_WRITE: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_RDMA_WRITE); + rc_sq_wqe->rkey = + cpu_to_le32(rdma_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(rdma_wr(wr)->remote_addr); + break; + case IB_WR_RDMA_WRITE_WITH_IMM: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM); + rc_sq_wqe->rkey = + cpu_to_le32(rdma_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(rdma_wr(wr)->remote_addr); + break; + case IB_WR_SEND: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_SEND); + break; + case IB_WR_SEND_WITH_INV: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_SEND_WITH_INV); + break; + case IB_WR_SEND_WITH_IMM: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM); + break; + case IB_WR_LOCAL_INV: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_LOCAL_INV); + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP); + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD); + break; + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP); + break; + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD); + break; + default: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_MASK); + break; + } + + wqe += sizeof(struct hns_roce_v2_rc_send_wqe); + dseg = wqe; + + ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe, + &sge_ind, bad_wr); + if (ret) + goto out; + ind++; + } else { + dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type); + spin_unlock_irqrestore(&qp->sq.lock, flags); + *bad_wr = wr; + return -EOPNOTSUPP; + } + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + /* Memory barrier */ + wmb(); + + sq_db.byte_4 = 0; + sq_db.parameter = 0; + + roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_TAG_M, + V2_DB_BYTE_4_TAG_S, qp->doorbell_qpn); + roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_CMD_M, + V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_SQ_DB); + roce_set_field(sq_db.parameter, V2_DB_PARAMETER_CONS_IDX_M, + V2_DB_PARAMETER_CONS_IDX_S, + qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1)); + roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M, + V2_DB_PARAMETER_SL_S, qp->sl); + + hns_roce_write64_k((__le32 *)&sq_db, qp->sq.db_reg_l); + + 
qp->sq_next_wqe = ind; + qp->next_sge = sge_ind; + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return ret; +} + +static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_v2_wqe_data_seg *dseg; + struct hns_roce_rinl_sge *sge_list; + struct device *dev = hr_dev->dev; + unsigned long flags; + void *wqe = NULL; + int ret = 0; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&hr_qp->rq.lock, flags); + ind = hr_qp->rq.head & (hr_qp->rq.wqe_cnt - 1); + + if (hr_qp->state == IB_QPS_RESET) { + spin_unlock_irqrestore(&hr_qp->rq.lock, flags); + *bad_wr = wr; + return -EINVAL; + } + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (hns_roce_wq_overflow(&hr_qp->rq, nreq, + hr_qp->ibqp.recv_cq)) { + ret = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > hr_qp->rq.max_gs)) { + dev_err(dev, "rq:num_sge=%d > qp->sq.max_gs=%d\n", + wr->num_sge, hr_qp->rq.max_gs); + ret = -EINVAL; + *bad_wr = wr; + goto out; + } + + wqe = get_recv_wqe(hr_qp, ind); + dseg = (struct hns_roce_v2_wqe_data_seg *)wqe; + for (i = 0; i < wr->num_sge; i++) { + if (!wr->sg_list[i].length) + continue; + set_data_seg_v2(dseg, wr->sg_list + i); + dseg++; + } + + if (i < hr_qp->rq.max_gs) { + dseg->lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY); + dseg->addr = 0; + } + + /* rq support inline data */ + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { + sge_list = hr_qp->rq_inl_buf.wqe_list[ind].sg_list; + hr_qp->rq_inl_buf.wqe_list[ind].sge_cnt = + (u32)wr->num_sge; + for (i = 0; i < wr->num_sge; i++) { + sge_list[i].addr = + (void *)(u64)wr->sg_list[i].addr; + sge_list[i].len = wr->sg_list[i].length; + } + } + + hr_qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (hr_qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + hr_qp->rq.head += nreq; + /* Memory barrier */ + wmb(); + + *hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff; + } + spin_unlock_irqrestore(&hr_qp->rq.lock, flags); + + return ret; +} + +static int hns_roce_cmq_space(struct hns_roce_v2_cmq_ring *ring) +{ + int ntu = ring->next_to_use; + int ntc = ring->next_to_clean; + int used = (ntu - ntc + ring->desc_num) % ring->desc_num; + + return ring->desc_num - used - 1; +} + +static int hns_roce_alloc_cmq_desc(struct hns_roce_dev *hr_dev, + struct hns_roce_v2_cmq_ring *ring) +{ + int size = ring->desc_num * sizeof(struct hns_roce_cmq_desc); + + ring->desc = kzalloc(size, GFP_KERNEL); + if (!ring->desc) + return -ENOMEM; + + ring->desc_dma_addr = dma_map_single(hr_dev->dev, ring->desc, size, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(hr_dev->dev, ring->desc_dma_addr)) { + ring->desc_dma_addr = 0; + kfree(ring->desc); + ring->desc = NULL; + return -ENOMEM; + } + + return 0; +} + +static void hns_roce_free_cmq_desc(struct hns_roce_dev *hr_dev, + struct hns_roce_v2_cmq_ring *ring) +{ + dma_unmap_single(hr_dev->dev, ring->desc_dma_addr, + ring->desc_num * sizeof(struct hns_roce_cmq_desc), + DMA_BIDIRECTIONAL); + + ring->desc_dma_addr = 0; + kfree(ring->desc); +} + +static int hns_roce_init_cmq_ring(struct hns_roce_dev *hr_dev, bool ring_type) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_cmq_ring *ring = (ring_type == TYPE_CSQ) ? 
+ &priv->cmq.csq : &priv->cmq.crq; + + ring->flag = ring_type; + ring->next_to_clean = 0; + ring->next_to_use = 0; + + return hns_roce_alloc_cmq_desc(hr_dev, ring); +} + +static void hns_roce_cmq_init_regs(struct hns_roce_dev *hr_dev, bool ring_type) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_cmq_ring *ring = (ring_type == TYPE_CSQ) ? + &priv->cmq.csq : &priv->cmq.crq; + dma_addr_t dma = ring->desc_dma_addr; + + if (ring_type == TYPE_CSQ) { + roce_write(hr_dev, ROCEE_TX_CMQ_BASEADDR_L_REG, (u32)dma); + roce_write(hr_dev, ROCEE_TX_CMQ_BASEADDR_H_REG, + upper_32_bits(dma)); + roce_write(hr_dev, ROCEE_TX_CMQ_DEPTH_REG, + (ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S) | + HNS_ROCE_CMQ_ENABLE); + roce_write(hr_dev, ROCEE_TX_CMQ_HEAD_REG, 0); + roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, 0); + } else { + roce_write(hr_dev, ROCEE_RX_CMQ_BASEADDR_L_REG, (u32)dma); + roce_write(hr_dev, ROCEE_RX_CMQ_BASEADDR_H_REG, + upper_32_bits(dma)); + roce_write(hr_dev, ROCEE_RX_CMQ_DEPTH_REG, + (ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S) | + HNS_ROCE_CMQ_ENABLE); + roce_write(hr_dev, ROCEE_RX_CMQ_HEAD_REG, 0); + roce_write(hr_dev, ROCEE_RX_CMQ_TAIL_REG, 0); + } +} + +static int hns_roce_v2_cmq_init(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + int ret; + + /* Setup the queue entries for command queue */ + priv->cmq.csq.desc_num = 1024; + priv->cmq.crq.desc_num = 1024; + + /* Setup the lock for command queue */ + spin_lock_init(&priv->cmq.csq.lock); + spin_lock_init(&priv->cmq.crq.lock); + + /* Setup Tx write back timeout */ + priv->cmq.tx_timeout = HNS_ROCE_CMQ_TX_TIMEOUT; + + /* Init CSQ */ + ret = hns_roce_init_cmq_ring(hr_dev, TYPE_CSQ); + if (ret) { + dev_err(hr_dev->dev, "Init CSQ error, ret = %d.\n", ret); + return ret; + } + + /* Init CRQ */ + ret = hns_roce_init_cmq_ring(hr_dev, TYPE_CRQ); + if (ret) { + dev_err(hr_dev->dev, "Init CRQ error, ret = %d.\n", ret); + goto err_crq; + } + + /* Init CSQ REG */ + hns_roce_cmq_init_regs(hr_dev, TYPE_CSQ); + + /* Init CRQ REG */ + hns_roce_cmq_init_regs(hr_dev, TYPE_CRQ); + + return 0; + +err_crq: + hns_roce_free_cmq_desc(hr_dev, &priv->cmq.csq); + + return ret; +} + +static void hns_roce_v2_cmq_exit(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + + hns_roce_free_cmq_desc(hr_dev, &priv->cmq.csq); + hns_roce_free_cmq_desc(hr_dev, &priv->cmq.crq); +} + +static void hns_roce_cmq_setup_basic_desc(struct hns_roce_cmq_desc *desc, + enum hns_roce_opcode_type opcode, + bool is_read) +{ + memset((void *)desc, 0, sizeof(struct hns_roce_cmq_desc)); + desc->opcode = cpu_to_le16(opcode); + desc->flag = + cpu_to_le16(HNS_ROCE_CMD_FLAG_NO_INTR | HNS_ROCE_CMD_FLAG_IN); + if (is_read) + desc->flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_WR); + else + desc->flag &= cpu_to_le16(~HNS_ROCE_CMD_FLAG_WR); +} + +static int hns_roce_cmq_csq_done(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + u32 head = roce_read(hr_dev, ROCEE_TX_CMQ_HEAD_REG); + + return head == priv->cmq.csq.next_to_use; +} + +static int hns_roce_cmq_csq_clean(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; + struct hns_roce_cmq_desc *desc; + u16 ntc = csq->next_to_clean; + u32 head; + int clean = 0; + + desc = &csq->desc[ntc]; + head = roce_read(hr_dev, ROCEE_TX_CMQ_HEAD_REG); + 
while (head != ntc) { + memset(desc, 0, sizeof(*desc)); + ntc++; + if (ntc == csq->desc_num) + ntc = 0; + desc = &csq->desc[ntc]; + clean++; + } + csq->next_to_clean = ntc; + + return clean; +} + +static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc, int num) +{ + struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; + struct hns_roce_cmq_desc *desc_to_use; + bool complete = false; + u32 timeout = 0; + int handle = 0; + u16 desc_ret; + int ret = 0; + int ntc; + + spin_lock_bh(&csq->lock); + + if (num > hns_roce_cmq_space(csq)) { + spin_unlock_bh(&csq->lock); + return -EBUSY; + } + + /* + * Record the location of desc in the cmq for this time + * which will be use for hardware to write back + */ + ntc = csq->next_to_use; + + while (handle < num) { + desc_to_use = &csq->desc[csq->next_to_use]; + *desc_to_use = desc[handle]; + dev_dbg(hr_dev->dev, "set cmq desc:\n"); + csq->next_to_use++; + if (csq->next_to_use == csq->desc_num) + csq->next_to_use = 0; + handle++; + } + + /* Write to hardware */ + roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, csq->next_to_use); + + /* + * If the command is sync, wait for the firmware to write back, + * if multi descriptors to be sent, use the first one to check + */ + if ((desc->flag) & HNS_ROCE_CMD_FLAG_NO_INTR) { + do { + if (hns_roce_cmq_csq_done(hr_dev)) + break; + udelay(1); + timeout++; + } while (timeout < priv->cmq.tx_timeout); + } + + if (hns_roce_cmq_csq_done(hr_dev)) { + complete = true; + handle = 0; + while (handle < num) { + /* get the result of hardware write back */ + desc_to_use = &csq->desc[ntc]; + desc[handle] = *desc_to_use; + dev_dbg(hr_dev->dev, "Get cmq desc:\n"); + desc_ret = desc[handle].retval; + if (desc_ret == CMD_EXEC_SUCCESS) + ret = 0; + else + ret = -EIO; + priv->cmq.last_status = desc_ret; + ntc++; + handle++; + if (ntc == csq->desc_num) + ntc = 0; + } + } + + if (!complete) + ret = -EAGAIN; + + /* clean the command send queue */ + handle = hns_roce_cmq_csq_clean(hr_dev); + if (handle != num) + dev_warn(hr_dev->dev, "Cleaned %d, need to clean %d\n", + handle, num); + + spin_unlock_bh(&csq->lock); + + return ret; +} + +static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_query_version *resp; + struct hns_roce_cmq_desc desc; + int ret; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_HW_VER, true); + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) + return ret; + + resp = (struct hns_roce_query_version *)desc.data; + hr_dev->hw_rev = le32_to_cpu(resp->rocee_hw_version); + hr_dev->vendor_id = le32_to_cpu(resp->rocee_vendor_id); + + return 0; +} + +static int hns_roce_config_global_param(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_cfg_global_param *req; + struct hns_roce_cmq_desc desc; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GLOBAL_PARAM, + false); + + req = (struct hns_roce_cfg_global_param *)desc.data; + memset(req, 0, sizeof(*req)); + roce_set_field(req->time_cfg_udp_port, + CFG_GLOBAL_PARAM_DATA_0_ROCEE_TIME_1US_CFG_M, + CFG_GLOBAL_PARAM_DATA_0_ROCEE_TIME_1US_CFG_S, 0x3e8); + roce_set_field(req->time_cfg_udp_port, + CFG_GLOBAL_PARAM_DATA_0_ROCEE_UDP_PORT_M, + CFG_GLOBAL_PARAM_DATA_0_ROCEE_UDP_PORT_S, 0x12b7); + + return hns_roce_cmq_send(hr_dev, &desc, 1); +} + +static int hns_roce_query_pf_resource(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_cmq_desc desc[2]; + struct hns_roce_pf_res *res; + int ret; + int i; + + for (i = 0; i < 2; i++) { + 
hns_roce_cmq_setup_basic_desc(&desc[i], + HNS_ROCE_OPC_QUERY_PF_RES, true); + + if (i == 0) + desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); + else + desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); + } + + ret = hns_roce_cmq_send(hr_dev, desc, 2); + if (ret) + return ret; + + res = (struct hns_roce_pf_res *)desc[0].data; + + hr_dev->caps.qpc_bt_num = roce_get_field(res->qpc_bt_idx_num, + PF_RES_DATA_1_PF_QPC_BT_NUM_M, + PF_RES_DATA_1_PF_QPC_BT_NUM_S); + hr_dev->caps.srqc_bt_num = roce_get_field(res->srqc_bt_idx_num, + PF_RES_DATA_2_PF_SRQC_BT_NUM_M, + PF_RES_DATA_2_PF_SRQC_BT_NUM_S); + hr_dev->caps.cqc_bt_num = roce_get_field(res->cqc_bt_idx_num, + PF_RES_DATA_3_PF_CQC_BT_NUM_M, + PF_RES_DATA_3_PF_CQC_BT_NUM_S); + hr_dev->caps.mpt_bt_num = roce_get_field(res->mpt_bt_idx_num, + PF_RES_DATA_4_PF_MPT_BT_NUM_M, + PF_RES_DATA_4_PF_MPT_BT_NUM_S); + + return 0; +} + +static int hns_roce_alloc_vf_resource(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_cmq_desc desc[2]; + struct hns_roce_vf_res_a *req_a; + struct hns_roce_vf_res_b *req_b; + int i; + + req_a = (struct hns_roce_vf_res_a *)desc[0].data; + req_b = (struct hns_roce_vf_res_b *)desc[1].data; + memset(req_a, 0, sizeof(*req_a)); + memset(req_b, 0, sizeof(*req_b)); + for (i = 0; i < 2; i++) { + hns_roce_cmq_setup_basic_desc(&desc[i], + HNS_ROCE_OPC_ALLOC_VF_RES, false); + + if (i == 0) + desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); + else + desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); + + if (i == 0) { + roce_set_field(req_a->vf_qpc_bt_idx_num, + VF_RES_A_DATA_1_VF_QPC_BT_IDX_M, + VF_RES_A_DATA_1_VF_QPC_BT_IDX_S, 0); + roce_set_field(req_a->vf_qpc_bt_idx_num, + VF_RES_A_DATA_1_VF_QPC_BT_NUM_M, + VF_RES_A_DATA_1_VF_QPC_BT_NUM_S, + HNS_ROCE_VF_QPC_BT_NUM); + + roce_set_field(req_a->vf_srqc_bt_idx_num, + VF_RES_A_DATA_2_VF_SRQC_BT_IDX_M, + VF_RES_A_DATA_2_VF_SRQC_BT_IDX_S, 0); + roce_set_field(req_a->vf_srqc_bt_idx_num, + VF_RES_A_DATA_2_VF_SRQC_BT_NUM_M, + VF_RES_A_DATA_2_VF_SRQC_BT_NUM_S, + HNS_ROCE_VF_SRQC_BT_NUM); + + roce_set_field(req_a->vf_cqc_bt_idx_num, + VF_RES_A_DATA_3_VF_CQC_BT_IDX_M, + VF_RES_A_DATA_3_VF_CQC_BT_IDX_S, 0); + roce_set_field(req_a->vf_cqc_bt_idx_num, + VF_RES_A_DATA_3_VF_CQC_BT_NUM_M, + VF_RES_A_DATA_3_VF_CQC_BT_NUM_S, + HNS_ROCE_VF_CQC_BT_NUM); + + roce_set_field(req_a->vf_mpt_bt_idx_num, + VF_RES_A_DATA_4_VF_MPT_BT_IDX_M, + VF_RES_A_DATA_4_VF_MPT_BT_IDX_S, 0); + roce_set_field(req_a->vf_mpt_bt_idx_num, + VF_RES_A_DATA_4_VF_MPT_BT_NUM_M, + VF_RES_A_DATA_4_VF_MPT_BT_NUM_S, + HNS_ROCE_VF_MPT_BT_NUM); + + roce_set_field(req_a->vf_eqc_bt_idx_num, + VF_RES_A_DATA_5_VF_EQC_IDX_M, + VF_RES_A_DATA_5_VF_EQC_IDX_S, 0); + roce_set_field(req_a->vf_eqc_bt_idx_num, + VF_RES_A_DATA_5_VF_EQC_NUM_M, + VF_RES_A_DATA_5_VF_EQC_NUM_S, + HNS_ROCE_VF_EQC_NUM); + } else { + roce_set_field(req_b->vf_smac_idx_num, + VF_RES_B_DATA_1_VF_SMAC_IDX_M, + VF_RES_B_DATA_1_VF_SMAC_IDX_S, 0); + roce_set_field(req_b->vf_smac_idx_num, + VF_RES_B_DATA_1_VF_SMAC_NUM_M, + VF_RES_B_DATA_1_VF_SMAC_NUM_S, + HNS_ROCE_VF_SMAC_NUM); + + roce_set_field(req_b->vf_sgid_idx_num, + VF_RES_B_DATA_2_VF_SGID_IDX_M, + VF_RES_B_DATA_2_VF_SGID_IDX_S, 0); + roce_set_field(req_b->vf_sgid_idx_num, + VF_RES_B_DATA_2_VF_SGID_NUM_M, + VF_RES_B_DATA_2_VF_SGID_NUM_S, + HNS_ROCE_VF_SGID_NUM); + + roce_set_field(req_b->vf_qid_idx_sl_num, + VF_RES_B_DATA_3_VF_QID_IDX_M, + VF_RES_B_DATA_3_VF_QID_IDX_S, 0); + roce_set_field(req_b->vf_qid_idx_sl_num, + VF_RES_B_DATA_3_VF_SL_NUM_M, + VF_RES_B_DATA_3_VF_SL_NUM_S, + HNS_ROCE_VF_SL_NUM); + } + } + + return 
hns_roce_cmq_send(hr_dev, desc, 2); +} + +static int hns_roce_v2_set_bt(struct hns_roce_dev *hr_dev) +{ + u8 srqc_hop_num = hr_dev->caps.srqc_hop_num; + u8 qpc_hop_num = hr_dev->caps.qpc_hop_num; + u8 cqc_hop_num = hr_dev->caps.cqc_hop_num; + u8 mpt_hop_num = hr_dev->caps.mpt_hop_num; + struct hns_roce_cfg_bt_attr *req; + struct hns_roce_cmq_desc desc; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_BT_ATTR, false); + req = (struct hns_roce_cfg_bt_attr *)desc.data; + memset(req, 0, sizeof(*req)); + + roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_M, + CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_S, + hr_dev->caps.qpc_ba_pg_sz); + roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_M, + CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_S, + hr_dev->caps.qpc_buf_pg_sz); + roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_M, + CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_S, + qpc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : qpc_hop_num); + + roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_M, + CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_S, + hr_dev->caps.srqc_ba_pg_sz); + roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_M, + CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_S, + hr_dev->caps.srqc_buf_pg_sz); + roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_M, + CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_S, + srqc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : srqc_hop_num); + + roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_M, + CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_S, + hr_dev->caps.cqc_ba_pg_sz); + roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_M, + CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_S, + hr_dev->caps.cqc_buf_pg_sz); + roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_M, + CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_S, + cqc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : cqc_hop_num); + + roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_M, + CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_S, + hr_dev->caps.mpt_ba_pg_sz); + roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_M, + CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_S, + hr_dev->caps.mpt_buf_pg_sz); + roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_M, + CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S, + mpt_hop_num == HNS_ROCE_HOP_NUM_0 ? 
0 : mpt_hop_num); + + return hns_roce_cmq_send(hr_dev, &desc, 1); +} + +static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_caps *caps = &hr_dev->caps; + int ret; + + ret = hns_roce_cmq_query_hw_info(hr_dev); + if (ret) { + dev_err(hr_dev->dev, "Query firmware version fail, ret = %d.\n", + ret); + return ret; + } + + ret = hns_roce_config_global_param(hr_dev); + if (ret) { + dev_err(hr_dev->dev, "Configure global param fail, ret = %d.\n", + ret); + return ret; + } + + /* Get pf resource owned by every pf */ + ret = hns_roce_query_pf_resource(hr_dev); + if (ret) { + dev_err(hr_dev->dev, "Query pf resource fail, ret = %d.\n", + ret); + return ret; + } + + ret = hns_roce_alloc_vf_resource(hr_dev); + if (ret) { + dev_err(hr_dev->dev, "Allocate vf resource fail, ret = %d.\n", + ret); + return ret; + } + + hr_dev->vendor_part_id = 0; + hr_dev->sys_image_guid = 0; + + caps->num_qps = HNS_ROCE_V2_MAX_QP_NUM; + caps->max_wqes = HNS_ROCE_V2_MAX_WQE_NUM; + caps->num_cqs = HNS_ROCE_V2_MAX_CQ_NUM; + caps->max_cqes = HNS_ROCE_V2_MAX_CQE_NUM; + caps->max_sq_sg = HNS_ROCE_V2_MAX_SQ_SGE_NUM; + caps->max_rq_sg = HNS_ROCE_V2_MAX_RQ_SGE_NUM; + caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE; + caps->num_uars = HNS_ROCE_V2_UAR_NUM; + caps->phy_num_uars = HNS_ROCE_V2_PHY_UAR_NUM; + caps->num_aeq_vectors = HNS_ROCE_V2_AEQE_VEC_NUM; + caps->num_comp_vectors = HNS_ROCE_V2_COMP_VEC_NUM; + caps->num_other_vectors = HNS_ROCE_V2_ABNORMAL_VEC_NUM; + caps->num_mtpts = HNS_ROCE_V2_MAX_MTPT_NUM; + caps->num_mtt_segs = HNS_ROCE_V2_MAX_MTT_SEGS; + caps->num_cqe_segs = HNS_ROCE_V2_MAX_CQE_SEGS; + caps->num_pds = HNS_ROCE_V2_MAX_PD_NUM; + caps->max_qp_init_rdma = HNS_ROCE_V2_MAX_QP_INIT_RDMA; + caps->max_qp_dest_rdma = HNS_ROCE_V2_MAX_QP_DEST_RDMA; + caps->max_sq_desc_sz = HNS_ROCE_V2_MAX_SQ_DESC_SZ; + caps->max_rq_desc_sz = HNS_ROCE_V2_MAX_RQ_DESC_SZ; + caps->max_srq_desc_sz = HNS_ROCE_V2_MAX_SRQ_DESC_SZ; + caps->qpc_entry_sz = HNS_ROCE_V2_QPC_ENTRY_SZ; + caps->irrl_entry_sz = HNS_ROCE_V2_IRRL_ENTRY_SZ; + caps->trrl_entry_sz = HNS_ROCE_V2_TRRL_ENTRY_SZ; + caps->cqc_entry_sz = HNS_ROCE_V2_CQC_ENTRY_SZ; + caps->mtpt_entry_sz = HNS_ROCE_V2_MTPT_ENTRY_SZ; + caps->mtt_entry_sz = HNS_ROCE_V2_MTT_ENTRY_SZ; + caps->cq_entry_sz = HNS_ROCE_V2_CQE_ENTRY_SIZE; + caps->page_size_cap = HNS_ROCE_V2_PAGE_SIZE_SUPPORTED; + caps->reserved_lkey = 0; + caps->reserved_pds = 0; + caps->reserved_mrws = 1; + caps->reserved_uars = 0; + caps->reserved_cqs = 0; + + caps->qpc_ba_pg_sz = 0; + caps->qpc_buf_pg_sz = 0; + caps->qpc_hop_num = HNS_ROCE_CONTEXT_HOP_NUM; + caps->srqc_ba_pg_sz = 0; + caps->srqc_buf_pg_sz = 0; + caps->srqc_hop_num = HNS_ROCE_HOP_NUM_0; + caps->cqc_ba_pg_sz = 0; + caps->cqc_buf_pg_sz = 0; + caps->cqc_hop_num = HNS_ROCE_CONTEXT_HOP_NUM; + caps->mpt_ba_pg_sz = 0; + caps->mpt_buf_pg_sz = 0; + caps->mpt_hop_num = HNS_ROCE_CONTEXT_HOP_NUM; + caps->pbl_ba_pg_sz = 0; + caps->pbl_buf_pg_sz = 0; + caps->pbl_hop_num = HNS_ROCE_PBL_HOP_NUM; + caps->mtt_ba_pg_sz = 0; + caps->mtt_buf_pg_sz = 0; + caps->mtt_hop_num = HNS_ROCE_MTT_HOP_NUM; + caps->cqe_ba_pg_sz = 0; + caps->cqe_buf_pg_sz = 0; + caps->cqe_hop_num = HNS_ROCE_CQE_HOP_NUM; + caps->eqe_ba_pg_sz = 0; + caps->eqe_buf_pg_sz = 0; + caps->eqe_hop_num = HNS_ROCE_EQE_HOP_NUM; + caps->chunk_sz = HNS_ROCE_V2_TABLE_CHUNK_SIZE; + + caps->flags = HNS_ROCE_CAP_FLAG_REREG_MR | + HNS_ROCE_CAP_FLAG_ROCE_V1_V2 | + HNS_ROCE_CAP_FLAG_RQ_INLINE | + HNS_ROCE_CAP_FLAG_RECORD_DB; + caps->pkey_table_len[0] = 1; + caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; + 
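Every hardware word in this file is packed with the roce_set_field()/roce_get_field() helpers and a paired _M/_S macro, as in the CFG_BT_ATTR_* configuration just above. A minimal user-space sketch of that idiom follows, assuming only that each _M macro is the bit mask already positioned inside the 32-bit word and each _S macro is the index of its lowest bit; the field macro below is a hypothetical stand-in, not a driver definition.

#include <stdint.h>
#include <stdio.h>

#define VF_QPC_HOPNUM_S 8                      /* hypothetical field: bits 8..9 */
#define VF_QPC_HOPNUM_M (0x3u << VF_QPC_HOPNUM_S)

static void set_field(uint32_t *word, uint32_t mask, uint32_t shift, uint32_t val)
{
	*word &= ~mask;                         /* clear the old field bits */
	*word |= (val << shift) & mask;         /* install the new value */
}

static uint32_t get_field(uint32_t word, uint32_t mask, uint32_t shift)
{
	return (word & mask) >> shift;
}

int main(void)
{
	uint32_t cfg = 0;

	set_field(&cfg, VF_QPC_HOPNUM_M, VF_QPC_HOPNUM_S, 2);
	printf("cfg=0x%08x hopnum=%u\n", cfg,
	       get_field(cfg, VF_QPC_HOPNUM_M, VF_QPC_HOPNUM_S));
	return 0;
}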
caps->ceqe_depth = HNS_ROCE_V2_COMP_EQE_NUM; + caps->aeqe_depth = HNS_ROCE_V2_ASYNC_EQE_NUM; + caps->local_ca_ack_delay = 0; + caps->max_mtu = IB_MTU_4096; + + ret = hns_roce_v2_set_bt(hr_dev); + if (ret) + dev_err(hr_dev->dev, "Configure bt attribute fail, ret = %d.\n", + ret); + + return ret; +} + +static int hns_roce_v2_cmd_pending(struct hns_roce_dev *hr_dev) +{ + u32 status = readl(hr_dev->reg_base + ROCEE_VF_MB_STATUS_REG); + + return status >> HNS_ROCE_HW_RUN_BIT_SHIFT; +} + +static int hns_roce_v2_cmd_complete(struct hns_roce_dev *hr_dev) +{ + u32 status = readl(hr_dev->reg_base + ROCEE_VF_MB_STATUS_REG); + + return status & HNS_ROCE_HW_MB_STATUS_MASK; +} + +static int hns_roce_v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, + u64 out_param, u32 in_modifier, u8 op_modifier, + u16 op, u16 token, int event) +{ + struct device *dev = hr_dev->dev; + u32 __iomem *hcr = (u32 __iomem *)(hr_dev->reg_base + + ROCEE_VF_MB_CFG0_REG); + unsigned long end; + u32 val0 = 0; + u32 val1 = 0; + + end = msecs_to_jiffies(HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS) + jiffies; + while (hns_roce_v2_cmd_pending(hr_dev)) { + if (time_after(jiffies, end)) { + dev_dbg(dev, "jiffies=%d end=%d\n", (int)jiffies, + (int)end); + return -EAGAIN; + } + cond_resched(); + } + + roce_set_field(val0, HNS_ROCE_VF_MB4_TAG_MASK, + HNS_ROCE_VF_MB4_TAG_SHIFT, in_modifier); + roce_set_field(val0, HNS_ROCE_VF_MB4_CMD_MASK, + HNS_ROCE_VF_MB4_CMD_SHIFT, op); + roce_set_field(val1, HNS_ROCE_VF_MB5_EVENT_MASK, + HNS_ROCE_VF_MB5_EVENT_SHIFT, event); + roce_set_field(val1, HNS_ROCE_VF_MB5_TOKEN_MASK, + HNS_ROCE_VF_MB5_TOKEN_SHIFT, token); + + writeq(in_param, hcr + 0); + writeq(out_param, hcr + 2); + + /* Memory barrier */ + wmb(); + + writel(val0, hcr + 4); + writel(val1, hcr + 5); + + mmiowb(); + + return 0; +} + +static int hns_roce_v2_chk_mbox(struct hns_roce_dev *hr_dev, + unsigned long timeout) +{ + struct device *dev = hr_dev->dev; + unsigned long end = 0; + u32 status; + + end = msecs_to_jiffies(timeout) + jiffies; + while (hns_roce_v2_cmd_pending(hr_dev) && time_before(jiffies, end)) + cond_resched(); + + if (hns_roce_v2_cmd_pending(hr_dev)) { + dev_err(dev, "[cmd_poll]hw run cmd TIMEDOUT!\n"); + return -ETIMEDOUT; + } + + status = hns_roce_v2_cmd_complete(hr_dev); + if (status != 0x1) { + dev_err(dev, "mailbox status 0x%x!\n", status); + return -EBUSY; + } + + return 0; +} + +static int hns_roce_v2_set_gid(struct hns_roce_dev *hr_dev, u8 port, + int gid_index, union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + enum hns_roce_sgid_type sgid_type = GID_TYPE_FLAG_ROCE_V1; + u32 *p; + u32 val; + + if (!gid || !attr) + return -EINVAL; + + if (attr->gid_type == IB_GID_TYPE_ROCE) + sgid_type = GID_TYPE_FLAG_ROCE_V1; + + if (attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + if (ipv6_addr_v4mapped((void *)gid)) + sgid_type = GID_TYPE_FLAG_ROCE_V2_IPV4; + else + sgid_type = GID_TYPE_FLAG_ROCE_V2_IPV6; + } + + p = (u32 *)&gid->raw[0]; + roce_raw_write(*p, hr_dev->reg_base + ROCEE_VF_SGID_CFG0_REG + + 0x20 * gid_index); + + p = (u32 *)&gid->raw[4]; + roce_raw_write(*p, hr_dev->reg_base + ROCEE_VF_SGID_CFG1_REG + + 0x20 * gid_index); + + p = (u32 *)&gid->raw[8]; + roce_raw_write(*p, hr_dev->reg_base + ROCEE_VF_SGID_CFG2_REG + + 0x20 * gid_index); + + p = (u32 *)&gid->raw[0xc]; + roce_raw_write(*p, hr_dev->reg_base + ROCEE_VF_SGID_CFG3_REG + + 0x20 * gid_index); + + val = roce_read(hr_dev, ROCEE_VF_SGID_CFG4_REG + 0x20 * gid_index); + roce_set_field(val, ROCEE_VF_SGID_CFG4_SGID_TYPE_M, + ROCEE_VF_SGID_CFG4_SGID_TYPE_S, sgid_type); 
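hns_roce_v2_set_gid() above picks the SGID type from the GID attribute: RoCE v1 as-is, and for RoCE v2 (UDP encap) it uses ipv6_addr_v4mapped() to tell IPv4-mapped GIDs from native IPv6 ones. A self-contained sketch of that classification, assuming only the standard ::ffff:a.b.c.d layout of an IPv4-mapped address (ten zero bytes followed by 0xff 0xff):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

enum sgid_type { SGID_ROCE_V1, SGID_ROCE_V2_IPV4, SGID_ROCE_V2_IPV6 };

/* true if the 16-byte GID is an IPv4-mapped IPv6 address (::ffff:a.b.c.d) */
static bool gid_is_v4mapped(const uint8_t gid[16])
{
	static const uint8_t prefix[12] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff
	};

	return memcmp(gid, prefix, sizeof(prefix)) == 0;
}

/* roce_v2 mirrors the IB_GID_TYPE_ROCE_UDP_ENCAP branch in the code above */
static enum sgid_type classify_gid(const uint8_t gid[16], bool roce_v2)
{
	if (!roce_v2)
		return SGID_ROCE_V1;
	return gid_is_v4mapped(gid) ? SGID_ROCE_V2_IPV4 : SGID_ROCE_V2_IPV6;
}

int main(void)
{
	uint8_t gid[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff,
			    192, 0, 2, 1 };	/* ::ffff:192.0.2.1 */

	return classify_gid(gid, true) == SGID_ROCE_V2_IPV4 ? 0 : 1;
}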
+ + roce_write(hr_dev, ROCEE_VF_SGID_CFG4_REG + 0x20 * gid_index, val); + + return 0; +} + +static int hns_roce_v2_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, + u8 *addr) +{ + u16 reg_smac_h; + u32 reg_smac_l; + u32 val; + + reg_smac_l = *(u32 *)(&addr[0]); + roce_raw_write(reg_smac_l, hr_dev->reg_base + ROCEE_VF_SMAC_CFG0_REG + + 0x08 * phy_port); + val = roce_read(hr_dev, ROCEE_VF_SMAC_CFG1_REG + 0x08 * phy_port); + + reg_smac_h = *(u16 *)(&addr[4]); + roce_set_field(val, ROCEE_VF_SMAC_CFG1_VF_SMAC_H_M, + ROCEE_VF_SMAC_CFG1_VF_SMAC_H_S, reg_smac_h); + roce_write(hr_dev, ROCEE_VF_SMAC_CFG1_REG + 0x08 * phy_port, val); + + return 0; +} + +static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, + unsigned long mtpt_idx) +{ + struct hns_roce_v2_mpt_entry *mpt_entry; + struct scatterlist *sg; + u64 page_addr; + u64 *pages; + int i, j; + int len; + int entry; + + mpt_entry = mb_buf; + memset(mpt_entry, 0, sizeof(*mpt_entry)); + + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M, + V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_VALID); + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_HOP_NUM_M, + V2_MPT_BYTE_4_PBL_HOP_NUM_S, mr->pbl_hop_num == + HNS_ROCE_HOP_NUM_0 ? 0 : mr->pbl_hop_num); + roce_set_field(mpt_entry->byte_4_pd_hop_st, + V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, + V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, mr->pbl_ba_pg_sz); + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, + V2_MPT_BYTE_4_PD_S, mr->pd); + mpt_entry->byte_4_pd_hop_st = cpu_to_le32(mpt_entry->byte_4_pd_hop_st); + + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 0); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 0); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_BIND_EN_S, + (mr->access & IB_ACCESS_MW_BIND ? 1 : 0)); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_ATOMIC_EN_S, 0); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RR_EN_S, + (mr->access & IB_ACCESS_REMOTE_READ ? 1 : 0)); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RW_EN_S, + (mr->access & IB_ACCESS_REMOTE_WRITE ? 1 : 0)); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_LW_EN_S, + (mr->access & IB_ACCESS_LOCAL_WRITE ? 1 : 0)); + mpt_entry->byte_8_mw_cnt_en = cpu_to_le32(mpt_entry->byte_8_mw_cnt_en); + + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, + mr->type == MR_TYPE_MR ? 
0 : 1); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_INNER_PA_VLD_S, + 1); + mpt_entry->byte_12_mw_pa = cpu_to_le32(mpt_entry->byte_12_mw_pa); + + mpt_entry->len_l = cpu_to_le32(lower_32_bits(mr->size)); + mpt_entry->len_h = cpu_to_le32(upper_32_bits(mr->size)); + mpt_entry->lkey = cpu_to_le32(mr->key); + mpt_entry->va_l = cpu_to_le32(lower_32_bits(mr->iova)); + mpt_entry->va_h = cpu_to_le32(upper_32_bits(mr->iova)); + + if (mr->type == MR_TYPE_DMA) + return 0; + + mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size); + + mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3)); + roce_set_field(mpt_entry->byte_48_mode_ba, V2_MPT_BYTE_48_PBL_BA_H_M, + V2_MPT_BYTE_48_PBL_BA_H_S, + upper_32_bits(mr->pbl_ba >> 3)); + mpt_entry->byte_48_mode_ba = cpu_to_le32(mpt_entry->byte_48_mode_ba); + + pages = (u64 *)__get_free_page(GFP_KERNEL); + if (!pages) + return -ENOMEM; + + i = 0; + for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> PAGE_SHIFT; + for (j = 0; j < len; ++j) { + page_addr = sg_dma_address(sg) + + (j << mr->umem->page_shift); + pages[i] = page_addr >> 6; + + /* Record the first 2 entry directly to MTPT table */ + if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1) + goto found; + i++; + } + } + +found: + mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0])); + roce_set_field(mpt_entry->byte_56_pa0_h, V2_MPT_BYTE_56_PA0_H_M, + V2_MPT_BYTE_56_PA0_H_S, + upper_32_bits(pages[0])); + mpt_entry->byte_56_pa0_h = cpu_to_le32(mpt_entry->byte_56_pa0_h); + + mpt_entry->pa1_l = cpu_to_le32(lower_32_bits(pages[1])); + roce_set_field(mpt_entry->byte_64_buf_pa1, V2_MPT_BYTE_64_PA1_H_M, + V2_MPT_BYTE_64_PA1_H_S, upper_32_bits(pages[1])); + + free_page((unsigned long)pages); + + roce_set_field(mpt_entry->byte_64_buf_pa1, + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, mr->pbl_buf_pg_sz); + mpt_entry->byte_64_buf_pa1 = cpu_to_le32(mpt_entry->byte_64_buf_pa1); + + return 0; +} + +static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, + struct hns_roce_mr *mr, int flags, + u32 pdn, int mr_access_flags, u64 iova, + u64 size, void *mb_buf) +{ + struct hns_roce_v2_mpt_entry *mpt_entry = mb_buf; + + if (flags & IB_MR_REREG_PD) { + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, + V2_MPT_BYTE_4_PD_S, pdn); + mr->pd = pdn; + } + + if (flags & IB_MR_REREG_ACCESS) { + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, + V2_MPT_BYTE_8_BIND_EN_S, + (mr_access_flags & IB_ACCESS_MW_BIND ? 1 : 0)); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, + V2_MPT_BYTE_8_ATOMIC_EN_S, + (mr_access_flags & IB_ACCESS_REMOTE_ATOMIC ? 1 : 0)); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RR_EN_S, + (mr_access_flags & IB_ACCESS_REMOTE_READ ? 1 : 0)); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RW_EN_S, + (mr_access_flags & IB_ACCESS_REMOTE_WRITE ? 1 : 0)); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_LW_EN_S, + (mr_access_flags & IB_ACCESS_LOCAL_WRITE ? 
1 : 0)); + } + + if (flags & IB_MR_REREG_TRANS) { + mpt_entry->va_l = cpu_to_le32(lower_32_bits(iova)); + mpt_entry->va_h = cpu_to_le32(upper_32_bits(iova)); + mpt_entry->len_l = cpu_to_le32(lower_32_bits(size)); + mpt_entry->len_h = cpu_to_le32(upper_32_bits(size)); + + mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size); + mpt_entry->pbl_ba_l = + cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3)); + roce_set_field(mpt_entry->byte_48_mode_ba, + V2_MPT_BYTE_48_PBL_BA_H_M, + V2_MPT_BYTE_48_PBL_BA_H_S, + upper_32_bits(mr->pbl_ba >> 3)); + mpt_entry->byte_48_mode_ba = + cpu_to_le32(mpt_entry->byte_48_mode_ba); + + mr->iova = iova; + mr->size = size; + } + + return 0; +} + +static void *get_cqe_v2(struct hns_roce_cq *hr_cq, int n) +{ + return hns_roce_buf_offset(&hr_cq->hr_buf.hr_buf, + n * HNS_ROCE_V2_CQE_ENTRY_SIZE); +} + +static void *get_sw_cqe_v2(struct hns_roce_cq *hr_cq, int n) +{ + struct hns_roce_v2_cqe *cqe = get_cqe_v2(hr_cq, n & hr_cq->ib_cq.cqe); + + /* Get cqe when Owner bit is Conversely with the MSB of cons_idx */ + return (roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_OWNER_S) ^ + !!(n & (hr_cq->ib_cq.cqe + 1))) ? cqe : NULL; +} + +static struct hns_roce_v2_cqe *next_cqe_sw_v2(struct hns_roce_cq *hr_cq) +{ + return get_sw_cqe_v2(hr_cq, hr_cq->cons_index); +} + +static void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index) +{ + *hr_cq->set_ci_db = cons_index & 0xffffff; +} + +static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn, + struct hns_roce_srq *srq) +{ + struct hns_roce_v2_cqe *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + for (prod_index = hr_cq->cons_index; get_sw_cqe_v2(hr_cq, prod_index); + ++prod_index) { + if (prod_index == hr_cq->cons_index + hr_cq->ib_cq.cqe) + break; + } + + /* + * Now backwards through the CQ, removing CQ entries + * that match our QP by overwriting them with next entries. + */ + while ((int) --prod_index - (int) hr_cq->cons_index >= 0) { + cqe = get_cqe_v2(hr_cq, prod_index & hr_cq->ib_cq.cqe); + if ((roce_get_field(cqe->byte_16, V2_CQE_BYTE_16_LCL_QPN_M, + V2_CQE_BYTE_16_LCL_QPN_S) & + HNS_ROCE_V2_CQE_QPN_MASK) == qpn) { + /* In v1 engine, not support SRQ */ + ++nfreed; + } else if (nfreed) { + dest = get_cqe_v2(hr_cq, (prod_index + nfreed) & + hr_cq->ib_cq.cqe); + owner_bit = roce_get_bit(dest->byte_4, + V2_CQE_BYTE_4_OWNER_S); + memcpy(dest, cqe, sizeof(*cqe)); + roce_set_bit(dest->byte_4, V2_CQE_BYTE_4_OWNER_S, + owner_bit); + } + } + + if (nfreed) { + hr_cq->cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. 
+ */ + wmb(); + hns_roce_v2_cq_set_ci(hr_cq, hr_cq->cons_index); + } +} + +static void hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn, + struct hns_roce_srq *srq) +{ + spin_lock_irq(&hr_cq->lock); + __hns_roce_v2_cq_clean(hr_cq, qpn, srq); + spin_unlock_irq(&hr_cq->lock); +} + +static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, + struct hns_roce_cq *hr_cq, void *mb_buf, + u64 *mtts, dma_addr_t dma_handle, int nent, + u32 vector) +{ + struct hns_roce_v2_cq_context *cq_context; + + cq_context = mb_buf; + memset(cq_context, 0, sizeof(*cq_context)); + + roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_CQ_ST_M, + V2_CQC_BYTE_4_CQ_ST_S, V2_CQ_STATE_VALID); + roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_ARM_ST_M, + V2_CQC_BYTE_4_ARM_ST_S, REG_NXT_CEQE); + roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_SHIFT_M, + V2_CQC_BYTE_4_SHIFT_S, ilog2((unsigned int)nent)); + roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_CEQN_M, + V2_CQC_BYTE_4_CEQN_S, vector); + cq_context->byte_4_pg_ceqn = cpu_to_le32(cq_context->byte_4_pg_ceqn); + + roce_set_field(cq_context->byte_8_cqn, V2_CQC_BYTE_8_CQN_M, + V2_CQC_BYTE_8_CQN_S, hr_cq->cqn); + + cq_context->cqe_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT); + cq_context->cqe_cur_blk_addr = + cpu_to_le32(cq_context->cqe_cur_blk_addr); + + roce_set_field(cq_context->byte_16_hop_addr, + V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_M, + V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_S, + cpu_to_le32((mtts[0]) >> (32 + PAGE_ADDR_SHIFT))); + roce_set_field(cq_context->byte_16_hop_addr, + V2_CQC_BYTE_16_CQE_HOP_NUM_M, + V2_CQC_BYTE_16_CQE_HOP_NUM_S, hr_dev->caps.cqe_hop_num == + HNS_ROCE_HOP_NUM_0 ? 0 : hr_dev->caps.cqe_hop_num); + + cq_context->cqe_nxt_blk_addr = (u32)(mtts[1] >> PAGE_ADDR_SHIFT); + roce_set_field(cq_context->byte_24_pgsz_addr, + V2_CQC_BYTE_24_CQE_NXT_BLK_ADDR_M, + V2_CQC_BYTE_24_CQE_NXT_BLK_ADDR_S, + cpu_to_le32((mtts[1]) >> (32 + PAGE_ADDR_SHIFT))); + roce_set_field(cq_context->byte_24_pgsz_addr, + V2_CQC_BYTE_24_CQE_BA_PG_SZ_M, + V2_CQC_BYTE_24_CQE_BA_PG_SZ_S, + hr_dev->caps.cqe_ba_pg_sz); + roce_set_field(cq_context->byte_24_pgsz_addr, + V2_CQC_BYTE_24_CQE_BUF_PG_SZ_M, + V2_CQC_BYTE_24_CQE_BUF_PG_SZ_S, + hr_dev->caps.cqe_buf_pg_sz); + + cq_context->cqe_ba = (u32)(dma_handle >> 3); + + roce_set_field(cq_context->byte_40_cqe_ba, V2_CQC_BYTE_40_CQE_BA_M, + V2_CQC_BYTE_40_CQE_BA_S, (dma_handle >> (32 + 3))); + + if (hr_cq->db_en) + roce_set_bit(cq_context->byte_44_db_record, + V2_CQC_BYTE_44_DB_RECORD_EN_S, 1); + + roce_set_field(cq_context->byte_44_db_record, + V2_CQC_BYTE_44_DB_RECORD_ADDR_M, + V2_CQC_BYTE_44_DB_RECORD_ADDR_S, + ((u32)hr_cq->db.dma) >> 1); + cq_context->db_record_addr = hr_cq->db.dma >> 32; + + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_MAX_CNT_M, + V2_CQC_BYTE_56_CQ_MAX_CNT_S, + HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM); + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_PERIOD_M, + V2_CQC_BYTE_56_CQ_PERIOD_S, + HNS_ROCE_V2_CQ_DEFAULT_INTERVAL); +} + +static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, + enum ib_cq_notify_flags flags) +{ + struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); + u32 notification_flag; + u32 doorbell[2]; + + doorbell[0] = 0; + doorbell[1] = 0; + + notification_flag = (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? 
+ V2_CQ_DB_REQ_NOT : V2_CQ_DB_REQ_NOT_SOL; + /* + * flags = 0; Notification Flag = 1, next + * flags = 1; Notification Flag = 0, solocited + */ + roce_set_field(doorbell[0], V2_CQ_DB_BYTE_4_TAG_M, V2_DB_BYTE_4_TAG_S, + hr_cq->cqn); + roce_set_field(doorbell[0], V2_CQ_DB_BYTE_4_CMD_M, V2_DB_BYTE_4_CMD_S, + HNS_ROCE_V2_CQ_DB_NTR); + roce_set_field(doorbell[1], V2_CQ_DB_PARAMETER_CONS_IDX_M, + V2_CQ_DB_PARAMETER_CONS_IDX_S, + hr_cq->cons_index & ((hr_cq->cq_depth << 1) - 1)); + roce_set_field(doorbell[1], V2_CQ_DB_PARAMETER_CMD_SN_M, + V2_CQ_DB_PARAMETER_CMD_SN_S, hr_cq->arm_sn & 0x3); + roce_set_bit(doorbell[1], V2_CQ_DB_PARAMETER_NOTIFY_S, + notification_flag); + + hns_roce_write64_k(doorbell, hr_cq->cq_db_l); + + return 0; +} + +static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe, + struct hns_roce_qp **cur_qp, + struct ib_wc *wc) +{ + struct hns_roce_rinl_sge *sge_list; + u32 wr_num, wr_cnt, sge_num; + u32 sge_cnt, data_len, size; + void *wqe_buf; + + wr_num = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_WQE_INDX_M, + V2_CQE_BYTE_4_WQE_INDX_S) & 0xffff; + wr_cnt = wr_num & ((*cur_qp)->rq.wqe_cnt - 1); + + sge_list = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sg_list; + sge_num = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sge_cnt; + wqe_buf = get_recv_wqe(*cur_qp, wr_cnt); + data_len = wc->byte_len; + + for (sge_cnt = 0; (sge_cnt < sge_num) && (data_len); sge_cnt++) { + size = min(sge_list[sge_cnt].len, data_len); + memcpy((void *)sge_list[sge_cnt].addr, wqe_buf, size); + + data_len -= size; + wqe_buf += size; + } + + if (data_len) { + wc->status = IB_WC_LOC_LEN_ERR; + return -EAGAIN; + } + + return 0; +} + +static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, + struct hns_roce_qp **cur_qp, struct ib_wc *wc) +{ + struct hns_roce_dev *hr_dev; + struct hns_roce_v2_cqe *cqe; + struct hns_roce_qp *hr_qp; + struct hns_roce_wq *wq; + int is_send; + u16 wqe_ctr; + u32 opcode; + u32 status; + int qpn; + int ret; + + /* Find cqe according to consumer index */ + cqe = next_cqe_sw_v2(hr_cq); + if (!cqe) + return -EAGAIN; + + ++hr_cq->cons_index; + /* Memory barrier */ + rmb(); + + /* 0->SQ, 1->RQ */ + is_send = !roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_S_R_S); + + qpn = roce_get_field(cqe->byte_16, V2_CQE_BYTE_16_LCL_QPN_M, + V2_CQE_BYTE_16_LCL_QPN_S); + + if (!*cur_qp || (qpn & HNS_ROCE_V2_CQE_QPN_MASK) != (*cur_qp)->qpn) { + hr_dev = to_hr_dev(hr_cq->ib_cq.device); + hr_qp = __hns_roce_qp_lookup(hr_dev, qpn); + if (unlikely(!hr_qp)) { + dev_err(hr_dev->dev, "CQ %06lx with entry for unknown QPN %06x\n", + hr_cq->cqn, (qpn & HNS_ROCE_V2_CQE_QPN_MASK)); + return -EINVAL; + } + *cur_qp = hr_qp; + } + + wc->qp = &(*cur_qp)->ibqp; + wc->vendor_err = 0; + + status = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_STATUS_M, + V2_CQE_BYTE_4_STATUS_S); + switch (status & HNS_ROCE_V2_CQE_STATUS_MASK) { + case HNS_ROCE_CQE_V2_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case HNS_ROCE_CQE_V2_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case HNS_ROCE_CQE_V2_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case HNS_ROCE_CQE_V2_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case HNS_ROCE_CQE_V2_WR_FLUSH_ERR: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case HNS_ROCE_CQE_V2_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case HNS_ROCE_CQE_V2_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case HNS_ROCE_CQE_V2_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case HNS_ROCE_CQE_V2_REMOTE_INVAL_REQ_ERR: + wc->status = 
IB_WC_REM_INV_REQ_ERR; + break; + case HNS_ROCE_CQE_V2_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case HNS_ROCE_CQE_V2_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case HNS_ROCE_CQE_V2_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + break; + case HNS_ROCE_CQE_V2_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case HNS_ROCE_CQE_V2_REMOTE_ABORT_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + /* CQE status error, directly return */ + if (wc->status != IB_WC_SUCCESS) + return 0; + + if (is_send) { + wc->wc_flags = 0; + /* SQ corresponding to CQE */ + switch (roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_OPCODE_M, + V2_CQE_BYTE_4_OPCODE_S) & 0x1f) { + case HNS_ROCE_SQ_OPCODE_SEND: + wc->opcode = IB_WC_SEND; + break; + case HNS_ROCE_SQ_OPCODE_SEND_WITH_INV: + wc->opcode = IB_WC_SEND; + break; + case HNS_ROCE_SQ_OPCODE_SEND_WITH_IMM: + wc->opcode = IB_WC_SEND; + wc->wc_flags |= IB_WC_WITH_IMM; + break; + case HNS_ROCE_SQ_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = le32_to_cpu(cqe->byte_cnt); + break; + case HNS_ROCE_SQ_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case HNS_ROCE_SQ_OPCODE_RDMA_WRITE_WITH_IMM: + wc->opcode = IB_WC_RDMA_WRITE; + wc->wc_flags |= IB_WC_WITH_IMM; + break; + case HNS_ROCE_SQ_OPCODE_LOCAL_INV: + wc->opcode = IB_WC_LOCAL_INV; + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + break; + case HNS_ROCE_SQ_OPCODE_ATOMIC_COMP_AND_SWAP: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case HNS_ROCE_SQ_OPCODE_ATOMIC_FETCH_AND_ADD: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_COMP_AND_SWAP: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_FETCH_AND_ADD: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case HNS_ROCE_SQ_OPCODE_FAST_REG_WR: + wc->opcode = IB_WC_REG_MR; + break; + case HNS_ROCE_SQ_OPCODE_BIND_MW: + wc->opcode = IB_WC_REG_MR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wq = &(*cur_qp)->sq; + if ((*cur_qp)->sq_signal_bits) { + /* + * If sg_signal_bit is 1, + * firstly tail pointer updated to wqe + * which current cqe correspond to + */ + wqe_ctr = (u16)roce_get_field(cqe->byte_4, + V2_CQE_BYTE_4_WQE_INDX_M, + V2_CQE_BYTE_4_WQE_INDX_S); + wq->tail += (wqe_ctr - (u16)wq->tail) & + (wq->wqe_cnt - 1); + } + + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else { + /* RQ correspond to CQE */ + wc->byte_len = le32_to_cpu(cqe->byte_cnt); + + opcode = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_OPCODE_M, + V2_CQE_BYTE_4_OPCODE_S); + switch (opcode & 0x1f) { + case HNS_ROCE_V2_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immtdata; + break; + case HNS_ROCE_V2_OPCODE_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case HNS_ROCE_V2_OPCODE_SEND_WITH_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immtdata; + break; + case HNS_ROCE_V2_OPCODE_SEND_WITH_INV: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = le32_to_cpu(cqe->rkey); + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + if ((wc->qp->qp_type == IB_QPT_RC || + wc->qp->qp_type == IB_QPT_UC) && + (opcode == HNS_ROCE_V2_OPCODE_SEND || + opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_IMM 
|| + opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_INV) && + (roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_RQ_INLINE_S))) { + ret = hns_roce_handle_recv_inl_wqe(cqe, cur_qp, wc); + if (ret) + return -EAGAIN; + } + + /* Update tail pointer, record wr_id */ + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + + wc->sl = (u8)roce_get_field(cqe->byte_32, V2_CQE_BYTE_32_SL_M, + V2_CQE_BYTE_32_SL_S); + wc->src_qp = (u8)roce_get_field(cqe->byte_32, + V2_CQE_BYTE_32_RMT_QPN_M, + V2_CQE_BYTE_32_RMT_QPN_S); + wc->wc_flags |= (roce_get_bit(cqe->byte_32, + V2_CQE_BYTE_32_GRH_S) ? + IB_WC_GRH : 0); + wc->port_num = roce_get_field(cqe->byte_32, + V2_CQE_BYTE_32_PORTN_M, V2_CQE_BYTE_32_PORTN_S); + wc->pkey_index = 0; + memcpy(wc->smac, cqe->smac, 4); + wc->smac[4] = roce_get_field(cqe->byte_28, + V2_CQE_BYTE_28_SMAC_4_M, + V2_CQE_BYTE_28_SMAC_4_S); + wc->smac[5] = roce_get_field(cqe->byte_28, + V2_CQE_BYTE_28_SMAC_5_M, + V2_CQE_BYTE_28_SMAC_5_S); + wc->vlan_id = 0xffff; + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + wc->network_hdr_type = roce_get_field(cqe->byte_28, + V2_CQE_BYTE_28_PORT_TYPE_M, + V2_CQE_BYTE_28_PORT_TYPE_S); + } + + return 0; +} + +static int hns_roce_v2_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc) +{ + struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); + struct hns_roce_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + + spin_lock_irqsave(&hr_cq->lock, flags); + + for (npolled = 0; npolled < num_entries; ++npolled) { + if (hns_roce_v2_poll_one(hr_cq, &cur_qp, wc + npolled)) + break; + } + + if (npolled) { + /* Memory barrier */ + wmb(); + hns_roce_v2_cq_set_ci(hr_cq, hr_cq->cons_index); + } + + spin_unlock_irqrestore(&hr_cq->lock, flags); + + return npolled; +} + +static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, int obj, + int step_idx) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_cmd_mailbox *mailbox; + struct hns_roce_hem_iter iter; + struct hns_roce_hem_mhop mhop; + struct hns_roce_hem *hem; + unsigned long mhop_obj = obj; + int i, j, k; + int ret = 0; + u64 hem_idx = 0; + u64 l1_idx = 0; + u64 bt_ba = 0; + u32 chunk_ba_num; + u32 hop_num; + u16 op = 0xff; + + if (!hns_roce_check_whether_mhop(hr_dev, table->type)) + return 0; + + hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop); + i = mhop.l0_idx; + j = mhop.l1_idx; + k = mhop.l2_idx; + hop_num = mhop.hop_num; + chunk_ba_num = mhop.bt_chunk_size / 8; + + if (hop_num == 2) { + hem_idx = i * chunk_ba_num * chunk_ba_num + j * chunk_ba_num + + k; + l1_idx = i * chunk_ba_num + j; + } else if (hop_num == 1) { + hem_idx = i * chunk_ba_num + j; + } else if (hop_num == HNS_ROCE_HOP_NUM_0) { + hem_idx = i; + } + + switch (table->type) { + case HEM_TYPE_QPC: + op = HNS_ROCE_CMD_WRITE_QPC_BT0; + break; + case HEM_TYPE_MTPT: + op = HNS_ROCE_CMD_WRITE_MPT_BT0; + break; + case HEM_TYPE_CQC: + op = HNS_ROCE_CMD_WRITE_CQC_BT0; + break; + case HEM_TYPE_SRQC: + op = HNS_ROCE_CMD_WRITE_SRQC_BT0; + break; + default: + dev_warn(dev, "Table %d not to be written by mailbox!\n", + table->type); + return 0; + } + op += step_idx; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (check_whether_last_step(hop_num, step_idx)) { + hem = table->hem[hem_idx]; + for (hns_roce_hem_first(hem, &iter); + !hns_roce_hem_last(&iter); hns_roce_hem_next(&iter)) { + bt_ba = hns_roce_hem_addr(&iter); + + /* configure the ba, tag, and op */ + ret = hns_roce_cmd_mbox(hr_dev, bt_ba, mailbox->dma, + 
obj, 0, op, + HNS_ROCE_CMD_TIMEOUT_MSECS); + } + } else { + if (step_idx == 0) + bt_ba = table->bt_l0_dma_addr[i]; + else if (step_idx == 1 && hop_num == 2) + bt_ba = table->bt_l1_dma_addr[l1_idx]; + + /* configure the ba, tag, and op */ + ret = hns_roce_cmd_mbox(hr_dev, bt_ba, mailbox->dma, obj, + 0, op, HNS_ROCE_CMD_TIMEOUT_MSECS); + } + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + +static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev, + struct hns_roce_hem_table *table, int obj, + int step_idx) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_cmd_mailbox *mailbox; + int ret = 0; + u16 op = 0xff; + + if (!hns_roce_check_whether_mhop(hr_dev, table->type)) + return 0; + + switch (table->type) { + case HEM_TYPE_QPC: + op = HNS_ROCE_CMD_DESTROY_QPC_BT0; + break; + case HEM_TYPE_MTPT: + op = HNS_ROCE_CMD_DESTROY_MPT_BT0; + break; + case HEM_TYPE_CQC: + op = HNS_ROCE_CMD_DESTROY_CQC_BT0; + break; + case HEM_TYPE_SRQC: + op = HNS_ROCE_CMD_DESTROY_SRQC_BT0; + break; + default: + dev_warn(dev, "Table %d not to be destroyed by mailbox!\n", + table->type); + return 0; + } + op += step_idx; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + /* configure the tag and op */ + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, obj, 0, op, + HNS_ROCE_CMD_TIMEOUT_MSECS); + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + +static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev, + struct hns_roce_mtt *mtt, + enum ib_qp_state cur_state, + enum ib_qp_state new_state, + struct hns_roce_v2_qp_context *context, + struct hns_roce_qp *hr_qp) +{ + struct hns_roce_cmd_mailbox *mailbox; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, context, sizeof(*context) * 2); + + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_qp->qpn, 0, + HNS_ROCE_CMD_MODIFY_QPC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return ret; +} + +static void set_access_flags(struct hns_roce_qp *hr_qp, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask, + const struct ib_qp_attr *attr, int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + + dest_rd_atomic = (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) ? + attr->max_dest_rd_atomic : hr_qp->resp_depth; + + access_flags = (attr_mask & IB_QP_ACCESS_FLAGS) ? + attr->qp_access_flags : hr_qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, + !!(access_flags & IB_ACCESS_REMOTE_READ)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, 0); + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, + !!(access_flags & IB_ACCESS_REMOTE_WRITE)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, 0); + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, + !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0); +} + +static void modify_qp_reset_to_init(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, + int attr_mask, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + + /* + * In v2 engine, software pass context and context mask to hardware + * when modifying qp. 
If software need modify some fields in context, + * we should set all bits of the relevant fields in context mask to + * 0 at the same time, else set them to 0x1. + */ + roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M, + V2_QPC_BYTE_4_TST_S, to_hr_qp_type(hr_qp->ibqp.qp_type)); + roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M, + V2_QPC_BYTE_4_TST_S, 0); + + if (ibqp->qp_type == IB_QPT_GSI) + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, + ilog2((unsigned int)hr_qp->sge.sge_cnt)); + else + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, + hr_qp->sq.max_gs > 2 ? + ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + + roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, 0); + + roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M, + V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn); + roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M, + V2_QPC_BYTE_4_SQPN_S, 0); + + roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M, + V2_QPC_BYTE_16_PD_S, to_hr_pd(ibqp->pd)->pdn); + roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M, + V2_QPC_BYTE_16_PD_S, 0); + + roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQWS_M, + V2_QPC_BYTE_20_RQWS_S, ilog2(hr_qp->rq.max_gs)); + roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQWS_M, + V2_QPC_BYTE_20_RQWS_S, 0); + + roce_set_field(context->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, + ilog2((unsigned int)hr_qp->sq.wqe_cnt)); + roce_set_field(qpc_mask->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, 0); + + roce_set_field(context->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, + ilog2((unsigned int)hr_qp->rq.wqe_cnt)); + roce_set_field(qpc_mask->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0); + + /* No VLAN need to set 0xFFF */ + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_IDX_M, + V2_QPC_BYTE_24_VLAN_IDX_S, 0xfff); + roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_IDX_M, + V2_QPC_BYTE_24_VLAN_IDX_S, 0); + + /* + * Set some fields in context to zero, Because the default values + * of all fields in context are zero, we need not set them to 0 again. + * but we should set the relevant fields of context mask to 0. 
+ */ + roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_SQ_TX_ERR_S, 0); + roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_SQ_RX_ERR_S, 0); + roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_TX_ERR_S, 0); + roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_RX_ERR_S, 0); + + roce_set_field(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_MAPID_M, + V2_QPC_BYTE_60_MAPID_S, 0); + + roce_set_bit(qpc_mask->byte_60_qpst_mapid, + V2_QPC_BYTE_60_INNER_MAP_IND_S, 0); + roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_MAP_IND_S, + 0); + roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_RQ_MAP_IND_S, + 0); + roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_EXT_MAP_IND_S, + 0); + roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_RLS_IND_S, + 0); + roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_EXT_IND_S, + 0); + roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0); + roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0); + + if (attr_mask & IB_QP_QKEY) { + context->qkey_xrcd = attr->qkey; + qpc_mask->qkey_xrcd = 0; + hr_qp->qkey = attr->qkey; + } + + if (hr_qp->rdb_en) { + roce_set_bit(context->byte_68_rq_db, + V2_QPC_BYTE_68_RQ_RECORD_EN_S, 1); + roce_set_bit(qpc_mask->byte_68_rq_db, + V2_QPC_BYTE_68_RQ_RECORD_EN_S, 0); + } + + roce_set_field(context->byte_68_rq_db, + V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_M, + V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_S, + ((u32)hr_qp->rdb.dma) >> 1); + roce_set_field(qpc_mask->byte_68_rq_db, + V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_M, + V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_S, 0); + context->rq_db_record_addr = hr_qp->rdb.dma >> 32; + qpc_mask->rq_db_record_addr = 0; + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, + (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) ? 
1 : 0); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 0); + + roce_set_field(context->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M, + V2_QPC_BYTE_80_RX_CQN_S, to_hr_cq(ibqp->recv_cq)->cqn); + roce_set_field(qpc_mask->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M, + V2_QPC_BYTE_80_RX_CQN_S, 0); + if (ibqp->srq) { + roce_set_field(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, + to_hr_srq(ibqp->srq)->srqn); + roce_set_field(qpc_mask->byte_76_srqn_op_en, + V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, 0); + roce_set_bit(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_SRQ_EN_S, 1); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, + V2_QPC_BYTE_76_SRQ_EN_S, 0); + } + + roce_set_field(qpc_mask->byte_84_rq_ci_pi, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0); + roce_set_field(qpc_mask->byte_84_rq_ci_pi, + V2_QPC_BYTE_84_RQ_CONSUMER_IDX_M, + V2_QPC_BYTE_84_RQ_CONSUMER_IDX_S, 0); + + roce_set_field(qpc_mask->byte_92_srq_info, V2_QPC_BYTE_92_SRQ_INFO_M, + V2_QPC_BYTE_92_SRQ_INFO_S, 0); + + roce_set_field(qpc_mask->byte_96_rx_reqmsn, V2_QPC_BYTE_96_RX_REQ_MSN_M, + V2_QPC_BYTE_96_RX_REQ_MSN_S, 0); + + roce_set_field(qpc_mask->byte_104_rq_sge, + V2_QPC_BYTE_104_RQ_CUR_WQE_SGE_NUM_M, + V2_QPC_BYTE_104_RQ_CUR_WQE_SGE_NUM_S, 0); + + roce_set_bit(qpc_mask->byte_108_rx_reqepsn, + V2_QPC_BYTE_108_RX_REQ_PSN_ERR_S, 0); + roce_set_field(qpc_mask->byte_108_rx_reqepsn, + V2_QPC_BYTE_108_RX_REQ_LAST_OPTYPE_M, + V2_QPC_BYTE_108_RX_REQ_LAST_OPTYPE_S, 0); + roce_set_bit(qpc_mask->byte_108_rx_reqepsn, + V2_QPC_BYTE_108_RX_REQ_RNR_S, 0); + + qpc_mask->rq_rnr_timer = 0; + qpc_mask->rx_msg_len = 0; + qpc_mask->rx_rkey_pkt_info = 0; + qpc_mask->rx_va = 0; + + roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_HEAD_MAX_M, + V2_QPC_BYTE_132_TRRL_HEAD_MAX_S, 0); + roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M, + V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0); + + roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RSVD_RAQ_MAP_S, 0); + roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M, + V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S, 0); + roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_TAIL_M, + V2_QPC_BYTE_140_RAQ_TRRL_TAIL_S, 0); + + roce_set_field(qpc_mask->byte_144_raq, + V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M, + V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S, 0); + roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_RTY_INI_IND_S, + 0); + roce_set_field(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_CREDIT_M, + V2_QPC_BYTE_144_RAQ_CREDIT_S, 0); + roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RESP_RTY_FLG_S, 0); + + roce_set_field(qpc_mask->byte_148_raq, V2_QPC_BYTE_148_RQ_MSN_M, + V2_QPC_BYTE_148_RQ_MSN_S, 0); + roce_set_field(qpc_mask->byte_148_raq, V2_QPC_BYTE_148_RAQ_SYNDROME_M, + V2_QPC_BYTE_148_RAQ_SYNDROME_S, 0); + + roce_set_field(qpc_mask->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M, + V2_QPC_BYTE_152_RAQ_PSN_S, 0); + roce_set_field(qpc_mask->byte_152_raq, + V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_M, + V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_S, 0); + + roce_set_field(qpc_mask->byte_156_raq, V2_QPC_BYTE_156_RAQ_USE_PKTN_M, + V2_QPC_BYTE_156_RAQ_USE_PKTN_S, 0); + + roce_set_field(qpc_mask->byte_160_sq_ci_pi, + V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M, + V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0); + roce_set_field(qpc_mask->byte_160_sq_ci_pi, + V2_QPC_BYTE_160_SQ_CONSUMER_IDX_M, + V2_QPC_BYTE_160_SQ_CONSUMER_IDX_S, 0); + + roce_set_field(context->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, + 
V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, + ilog2((unsigned int)hr_qp->sq.wqe_cnt)); + roce_set_field(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, + V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, 0); + + roce_set_bit(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_MSG_RTY_LP_FLG_S, 0); + roce_set_bit(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_INVLD_FLG_S, 0); + roce_set_field(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_IRRL_IDX_LSB_M, + V2_QPC_BYTE_168_IRRL_IDX_LSB_S, 0); + + roce_set_field(context->byte_172_sq_psn, V2_QPC_BYTE_172_ACK_REQ_FREQ_M, + V2_QPC_BYTE_172_ACK_REQ_FREQ_S, 4); + roce_set_field(qpc_mask->byte_172_sq_psn, + V2_QPC_BYTE_172_ACK_REQ_FREQ_M, + V2_QPC_BYTE_172_ACK_REQ_FREQ_S, 0); + + roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_MSG_RNR_FLG_S, + 0); + + roce_set_field(qpc_mask->byte_176_msg_pktn, + V2_QPC_BYTE_176_MSG_USE_PKTN_M, + V2_QPC_BYTE_176_MSG_USE_PKTN_S, 0); + roce_set_field(qpc_mask->byte_176_msg_pktn, + V2_QPC_BYTE_176_IRRL_HEAD_PRE_M, + V2_QPC_BYTE_176_IRRL_HEAD_PRE_S, 0); + + roce_set_field(qpc_mask->byte_184_irrl_idx, + V2_QPC_BYTE_184_IRRL_IDX_MSB_M, + V2_QPC_BYTE_184_IRRL_IDX_MSB_S, 0); + + qpc_mask->cur_sge_offset = 0; + + roce_set_field(qpc_mask->byte_192_ext_sge, + V2_QPC_BYTE_192_CUR_SGE_IDX_M, + V2_QPC_BYTE_192_CUR_SGE_IDX_S, 0); + roce_set_field(qpc_mask->byte_192_ext_sge, + V2_QPC_BYTE_192_EXT_SGE_NUM_LEFT_M, + V2_QPC_BYTE_192_EXT_SGE_NUM_LEFT_S, 0); + + roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_IRRL_HEAD_M, + V2_QPC_BYTE_196_IRRL_HEAD_S, 0); + + roce_set_field(qpc_mask->byte_200_sq_max, V2_QPC_BYTE_200_SQ_MAX_IDX_M, + V2_QPC_BYTE_200_SQ_MAX_IDX_S, 0); + roce_set_field(qpc_mask->byte_200_sq_max, + V2_QPC_BYTE_200_LCL_OPERATED_CNT_M, + V2_QPC_BYTE_200_LCL_OPERATED_CNT_S, 0); + + roce_set_bit(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_PKT_RNR_FLG_S, 0); + roce_set_bit(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_PKT_RTY_FLG_S, 0); + + roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_CHECK_FLG_M, + V2_QPC_BYTE_212_CHECK_FLG_S, 0); + + qpc_mask->sq_timer = 0; + + roce_set_field(qpc_mask->byte_220_retry_psn_msn, + V2_QPC_BYTE_220_RETRY_MSG_MSN_M, + V2_QPC_BYTE_220_RETRY_MSG_MSN_S, 0); + roce_set_field(qpc_mask->byte_232_irrl_sge, + V2_QPC_BYTE_232_IRRL_SGE_IDX_M, + V2_QPC_BYTE_232_IRRL_SGE_IDX_S, 0); + + qpc_mask->irrl_cur_sge_offset = 0; + + roce_set_field(qpc_mask->byte_240_irrl_tail, + V2_QPC_BYTE_240_IRRL_TAIL_REAL_M, + V2_QPC_BYTE_240_IRRL_TAIL_REAL_S, 0); + roce_set_field(qpc_mask->byte_240_irrl_tail, + V2_QPC_BYTE_240_IRRL_TAIL_RD_M, + V2_QPC_BYTE_240_IRRL_TAIL_RD_S, 0); + roce_set_field(qpc_mask->byte_240_irrl_tail, + V2_QPC_BYTE_240_RX_ACK_MSN_M, + V2_QPC_BYTE_240_RX_ACK_MSN_S, 0); + + roce_set_field(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_IRRL_PSN_M, + V2_QPC_BYTE_248_IRRL_PSN_S, 0); + roce_set_bit(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_ACK_PSN_ERR_S, + 0); + roce_set_field(qpc_mask->byte_248_ack_psn, + V2_QPC_BYTE_248_ACK_LAST_OPTYPE_M, + V2_QPC_BYTE_248_ACK_LAST_OPTYPE_S, 0); + roce_set_bit(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_IRRL_PSN_VLD_S, + 0); + roce_set_bit(qpc_mask->byte_248_ack_psn, + V2_QPC_BYTE_248_RNR_RETRY_FLAG_S, 0); + roce_set_bit(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_CQ_ERR_IND_S, + 0); + + hr_qp->access_flags = attr->qp_access_flags; + hr_qp->pkey_index = attr->pkey_index; + roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M, + V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn); + roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M, + 
V2_QPC_BYTE_252_TX_CQN_S, 0); + + roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_ERR_TYPE_M, + V2_QPC_BYTE_252_ERR_TYPE_S, 0); + + roce_set_field(qpc_mask->byte_256_sqflush_rqcqe, + V2_QPC_BYTE_256_RQ_CQE_IDX_M, + V2_QPC_BYTE_256_RQ_CQE_IDX_S, 0); + roce_set_field(qpc_mask->byte_256_sqflush_rqcqe, + V2_QPC_BYTE_256_SQ_FLUSH_IDX_M, + V2_QPC_BYTE_256_SQ_FLUSH_IDX_S, 0); +} + +static void modify_qp_init_to_init(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask) +{ + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + + /* + * In v2 engine, software pass context and context mask to hardware + * when modifying qp. If software need modify some fields in context, + * we should set all bits of the relevant fields in context mask to + * 0 at the same time, else set them to 0x1. + */ + roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M, + V2_QPC_BYTE_4_TST_S, to_hr_qp_type(hr_qp->ibqp.qp_type)); + roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M, + V2_QPC_BYTE_4_TST_S, 0); + + if (ibqp->qp_type == IB_QPT_GSI) + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, + ilog2((unsigned int)hr_qp->sge.sge_cnt)); + else + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ? + ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + + roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, 0); + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, + !!(attr->qp_access_flags & IB_ACCESS_REMOTE_READ)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, + 0); + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, + !!(attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, + 0); + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, + !!(attr->qp_access_flags & + IB_ACCESS_REMOTE_ATOMIC)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, + 0); + } else { + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, + !!(hr_qp->access_flags & IB_ACCESS_REMOTE_READ)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, + 0); + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, + !!(hr_qp->access_flags & IB_ACCESS_REMOTE_WRITE)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, + 0); + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, + !!(hr_qp->access_flags & IB_ACCESS_REMOTE_ATOMIC)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, + 0); + } + + roce_set_field(context->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, + ilog2((unsigned int)hr_qp->sq.wqe_cnt)); + roce_set_field(qpc_mask->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, 0); + + roce_set_field(context->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, + ilog2((unsigned int)hr_qp->rq.wqe_cnt)); + roce_set_field(qpc_mask->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0); + + roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M, + V2_QPC_BYTE_16_PD_S, to_hr_pd(ibqp->pd)->pdn); + roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M, + V2_QPC_BYTE_16_PD_S, 0); + + 
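The recurring comment in these modify_qp_* helpers says the driver hands the hardware both a context and a context mask, clearing the mask bits of every field it wants to change and leaving the rest at 1. As an editorial reading of that convention (a sketch, not driver code), a merge on the hardware side would behave like the function below, where a 0 mask bit takes the bit from the new context and a 1 mask bit preserves what the QPC already holds:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative merge of one QP-context word, assuming the mask semantics
 * described in the comment above: 0 = apply the bit from the new context,
 * 1 = keep the bit the hardware currently holds.
 */
static uint32_t merge_qpc_word(uint32_t current, uint32_t context, uint32_t mask)
{
	return (current & mask) | (context & ~mask);
}

int main(void)
{
	uint32_t current = 0xaaaa5555;	/* what the hardware holds now   */
	uint32_t context = 0x000000ff;	/* new value for the low byte    */
	uint32_t mask    = 0xffffff00;	/* low byte cleared => modified  */

	printf("merged=0x%08x\n", merge_qpc_word(current, context, mask));
	return 0;
}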
roce_set_field(context->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M, + V2_QPC_BYTE_80_RX_CQN_S, to_hr_cq(ibqp->recv_cq)->cqn); + roce_set_field(qpc_mask->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M, + V2_QPC_BYTE_80_RX_CQN_S, 0); + + roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M, + V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn); + roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M, + V2_QPC_BYTE_252_TX_CQN_S, 0); + + if (ibqp->srq) { + roce_set_bit(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_SRQ_EN_S, 1); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, + V2_QPC_BYTE_76_SRQ_EN_S, 0); + roce_set_field(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, + to_hr_srq(ibqp->srq)->srqn); + roce_set_field(qpc_mask->byte_76_srqn_op_en, + V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, 0); + } + + if (attr_mask & IB_QP_QKEY) { + context->qkey_xrcd = attr->qkey; + qpc_mask->qkey_xrcd = 0; + } + + roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M, + V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn); + roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M, + V2_QPC_BYTE_4_SQPN_S, 0); + + if (attr_mask & IB_QP_DEST_QPN) { + roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_DQPN_M, + V2_QPC_BYTE_56_DQPN_S, hr_qp->qpn); + roce_set_field(qpc_mask->byte_56_dqpn_err, + V2_QPC_BYTE_56_DQPN_M, V2_QPC_BYTE_56_DQPN_S, 0); + } + roce_set_field(context->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, + V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, + ilog2((unsigned int)hr_qp->sq.wqe_cnt)); + roce_set_field(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, + V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, 0); +} + +static int modify_qp_init_to_rtr(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct device *dev = hr_dev->dev; + dma_addr_t dma_handle_3; + dma_addr_t dma_handle_2; + dma_addr_t dma_handle; + u32 page_size; + u8 port_num; + u64 *mtts_3; + u64 *mtts_2; + u64 *mtts; + u8 *dmac; + u8 *smac; + int port; + + /* Search qp buf's mtts */ + mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table, + hr_qp->mtt.first_seg, &dma_handle); + if (!mtts) { + dev_err(dev, "qp buf pa find failed\n"); + return -EINVAL; + } + + /* Search IRRL's mtts */ + mtts_2 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table, + hr_qp->qpn, &dma_handle_2); + if (!mtts_2) { + dev_err(dev, "qp irrl_table find failed\n"); + return -EINVAL; + } + + /* Search TRRL's mtts */ + mtts_3 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.trrl_table, + hr_qp->qpn, &dma_handle_3); + if (!mtts_3) { + dev_err(dev, "qp trrl_table find failed\n"); + return -EINVAL; + } + + if (attr_mask & IB_QP_ALT_PATH) { + dev_err(dev, "INIT2RTR attr_mask (0x%x) error\n", attr_mask); + return -EINVAL; + } + + dmac = (u8 *)attr->ah_attr.roce.dmac; + context->wqe_sge_ba = (u32)(dma_handle >> 3); + qpc_mask->wqe_sge_ba = 0; + + /* + * In v2 engine, software pass context and context mask to hardware + * when modifying qp. If software need modify some fields in context, + * we should set all bits of the relevant fields in context mask to + * 0 at the same time, else set them to 0x1. 
+ */ + roce_set_field(context->byte_12_sq_hop, V2_QPC_BYTE_12_WQE_SGE_BA_M, + V2_QPC_BYTE_12_WQE_SGE_BA_S, dma_handle >> (32 + 3)); + roce_set_field(qpc_mask->byte_12_sq_hop, V2_QPC_BYTE_12_WQE_SGE_BA_M, + V2_QPC_BYTE_12_WQE_SGE_BA_S, 0); + + roce_set_field(context->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M, + V2_QPC_BYTE_12_SQ_HOP_NUM_S, + hr_dev->caps.mtt_hop_num == HNS_ROCE_HOP_NUM_0 ? + 0 : hr_dev->caps.mtt_hop_num); + roce_set_field(qpc_mask->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M, + V2_QPC_BYTE_12_SQ_HOP_NUM_S, 0); + + roce_set_field(context->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SGE_HOP_NUM_M, + V2_QPC_BYTE_20_SGE_HOP_NUM_S, + ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? + hr_dev->caps.mtt_hop_num : 0); + roce_set_field(qpc_mask->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SGE_HOP_NUM_M, + V2_QPC_BYTE_20_SGE_HOP_NUM_S, 0); + + roce_set_field(context->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_RQ_HOP_NUM_M, + V2_QPC_BYTE_20_RQ_HOP_NUM_S, + hr_dev->caps.mtt_hop_num == HNS_ROCE_HOP_NUM_0 ? + 0 : hr_dev->caps.mtt_hop_num); + roce_set_field(qpc_mask->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_RQ_HOP_NUM_M, + V2_QPC_BYTE_20_RQ_HOP_NUM_S, 0); + + roce_set_field(context->byte_16_buf_ba_pg_sz, + V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M, + V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, + hr_dev->caps.mtt_ba_pg_sz); + roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, + V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M, + V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, 0); + + roce_set_field(context->byte_16_buf_ba_pg_sz, + V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M, + V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, + hr_dev->caps.mtt_buf_pg_sz); + roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, + V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M, + V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, 0); + + roce_set_field(context->byte_80_rnr_rx_cqn, + V2_QPC_BYTE_80_MIN_RNR_TIME_M, + V2_QPC_BYTE_80_MIN_RNR_TIME_S, attr->min_rnr_timer); + roce_set_field(qpc_mask->byte_80_rnr_rx_cqn, + V2_QPC_BYTE_80_MIN_RNR_TIME_M, + V2_QPC_BYTE_80_MIN_RNR_TIME_S, 0); + + page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); + context->rq_cur_blk_addr = (u32)(mtts[hr_qp->rq.offset / page_size] + >> PAGE_ADDR_SHIFT); + qpc_mask->rq_cur_blk_addr = 0; + + roce_set_field(context->byte_92_srq_info, + V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M, + V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S, + mtts[hr_qp->rq.offset / page_size] + >> (32 + PAGE_ADDR_SHIFT)); + roce_set_field(qpc_mask->byte_92_srq_info, + V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M, + V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S, 0); + + context->rq_nxt_blk_addr = (u32)(mtts[hr_qp->rq.offset / page_size + 1] + >> PAGE_ADDR_SHIFT); + qpc_mask->rq_nxt_blk_addr = 0; + + roce_set_field(context->byte_104_rq_sge, + V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M, + V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, + mtts[hr_qp->rq.offset / page_size + 1] + >> (32 + PAGE_ADDR_SHIFT)); + roce_set_field(qpc_mask->byte_104_rq_sge, + V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M, + V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, 0); + + roce_set_field(context->byte_108_rx_reqepsn, + V2_QPC_BYTE_108_RX_REQ_EPSN_M, + V2_QPC_BYTE_108_RX_REQ_EPSN_S, attr->rq_psn); + roce_set_field(qpc_mask->byte_108_rx_reqepsn, + V2_QPC_BYTE_108_RX_REQ_EPSN_M, + V2_QPC_BYTE_108_RX_REQ_EPSN_S, 0); + + roce_set_field(context->byte_132_trrl, V2_QPC_BYTE_132_TRRL_BA_M, + V2_QPC_BYTE_132_TRRL_BA_S, dma_handle_3 >> 4); + roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_BA_M, + V2_QPC_BYTE_132_TRRL_BA_S, 0); + context->trrl_ba = (u32)(dma_handle_3 >> (16 + 4)); + qpc_mask->trrl_ba = 0; + roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_TRRL_BA_M, + 
V2_QPC_BYTE_140_TRRL_BA_S, + (u32)(dma_handle_3 >> (32 + 16 + 4))); + roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_TRRL_BA_M, + V2_QPC_BYTE_140_TRRL_BA_S, 0); + + context->irrl_ba = (u32)(dma_handle_2 >> 6); + qpc_mask->irrl_ba = 0; + roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_IRRL_BA_M, + V2_QPC_BYTE_208_IRRL_BA_S, + dma_handle_2 >> (32 + 6)); + roce_set_field(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_IRRL_BA_M, + V2_QPC_BYTE_208_IRRL_BA_S, 0); + + roce_set_bit(context->byte_208_irrl, V2_QPC_BYTE_208_RMT_E2E_S, 1); + roce_set_bit(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_RMT_E2E_S, 0); + + roce_set_bit(context->byte_252_err_txcqn, V2_QPC_BYTE_252_SIG_TYPE_S, + hr_qp->sq_signal_bits); + roce_set_bit(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_SIG_TYPE_S, + 0); + + port = (attr_mask & IB_QP_PORT) ? (attr->port_num - 1) : hr_qp->port; + + smac = (u8 *)hr_dev->dev_addr[port]; + /* when dmac equals smac or loop_idc is 1, it should loopback */ + if (ether_addr_equal_unaligned(dmac, smac) || + hr_dev->loop_idc == 0x1) { + roce_set_bit(context->byte_28_at_fl, V2_QPC_BYTE_28_LBI_S, 1); + roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_LBI_S, 0); + } + + if ((attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) && + attr->max_dest_rd_atomic) { + roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M, + V2_QPC_BYTE_140_RR_MAX_S, + fls(attr->max_dest_rd_atomic - 1)); + roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M, + V2_QPC_BYTE_140_RR_MAX_S, 0); + } + + if (attr_mask & IB_QP_DEST_QPN) { + roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_DQPN_M, + V2_QPC_BYTE_56_DQPN_S, attr->dest_qp_num); + roce_set_field(qpc_mask->byte_56_dqpn_err, + V2_QPC_BYTE_56_DQPN_M, V2_QPC_BYTE_56_DQPN_S, 0); + } + + /* Configure GID index */ + port_num = rdma_ah_get_port_num(&attr->ah_attr); + roce_set_field(context->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SGID_IDX_M, + V2_QPC_BYTE_20_SGID_IDX_S, + hns_get_gid_index(hr_dev, port_num - 1, + grh->sgid_index)); + roce_set_field(qpc_mask->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SGID_IDX_M, + V2_QPC_BYTE_20_SGID_IDX_S, 0); + memcpy(&(context->dmac), dmac, 4); + roce_set_field(context->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M, + V2_QPC_BYTE_52_DMAC_S, *((u16 *)(&dmac[4]))); + qpc_mask->dmac = 0; + roce_set_field(qpc_mask->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M, + V2_QPC_BYTE_52_DMAC_S, 0); + + roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M, + V2_QPC_BYTE_56_LP_PKTN_INI_S, 4); + roce_set_field(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M, + V2_QPC_BYTE_56_LP_PKTN_INI_S, 0); + + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M, + V2_QPC_BYTE_24_HOP_LIMIT_S, grh->hop_limit); + roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M, + V2_QPC_BYTE_24_HOP_LIMIT_S, 0); + + roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M, + V2_QPC_BYTE_28_FL_S, grh->flow_label); + roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_FL_M, + V2_QPC_BYTE_28_FL_S, 0); + + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, + V2_QPC_BYTE_24_TC_S, grh->traffic_class); + roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, + V2_QPC_BYTE_24_TC_S, 0); + + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD) + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, + V2_QPC_BYTE_24_MTU_S, IB_MTU_4096); + else if (attr_mask & IB_QP_PATH_MTU) + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, + V2_QPC_BYTE_24_MTU_S, attr->path_mtu); + + 
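+	/* Clear the mask bits so hardware applies the MTU value set above. */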
roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, + V2_QPC_BYTE_24_MTU_S, 0); + + memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); + memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw)); + + roce_set_field(context->byte_84_rq_ci_pi, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, hr_qp->rq.head); + roce_set_field(qpc_mask->byte_84_rq_ci_pi, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0); + + roce_set_field(qpc_mask->byte_84_rq_ci_pi, + V2_QPC_BYTE_84_RQ_CONSUMER_IDX_M, + V2_QPC_BYTE_84_RQ_CONSUMER_IDX_S, 0); + roce_set_bit(qpc_mask->byte_108_rx_reqepsn, + V2_QPC_BYTE_108_RX_REQ_PSN_ERR_S, 0); + roce_set_field(qpc_mask->byte_96_rx_reqmsn, V2_QPC_BYTE_96_RX_REQ_MSN_M, + V2_QPC_BYTE_96_RX_REQ_MSN_S, 0); + roce_set_field(qpc_mask->byte_108_rx_reqepsn, + V2_QPC_BYTE_108_RX_REQ_LAST_OPTYPE_M, + V2_QPC_BYTE_108_RX_REQ_LAST_OPTYPE_S, 0); + + context->rq_rnr_timer = 0; + qpc_mask->rq_rnr_timer = 0; + + roce_set_field(context->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M, + V2_QPC_BYTE_152_RAQ_PSN_S, attr->rq_psn - 1); + roce_set_field(qpc_mask->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M, + V2_QPC_BYTE_152_RAQ_PSN_S, 0); + + roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_HEAD_MAX_M, + V2_QPC_BYTE_132_TRRL_HEAD_MAX_S, 0); + roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M, + V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0); + + roce_set_field(context->byte_168_irrl_idx, + V2_QPC_BYTE_168_LP_SGEN_INI_M, + V2_QPC_BYTE_168_LP_SGEN_INI_S, 3); + roce_set_field(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_LP_SGEN_INI_M, + V2_QPC_BYTE_168_LP_SGEN_INI_S, 0); + + roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, + V2_QPC_BYTE_28_SL_S, rdma_ah_get_sl(&attr->ah_attr)); + roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, + V2_QPC_BYTE_28_SL_S, 0); + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + + return 0; +} + +static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct device *dev = hr_dev->dev; + dma_addr_t dma_handle; + u32 page_size; + u64 *mtts; + + /* Search qp buf's mtts */ + mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table, + hr_qp->mtt.first_seg, &dma_handle); + if (!mtts) { + dev_err(dev, "qp buf pa find failed\n"); + return -EINVAL; + } + + /* Not support alternate path and path migration */ + if ((attr_mask & IB_QP_ALT_PATH) || + (attr_mask & IB_QP_PATH_MIG_STATE)) { + dev_err(dev, "RTR2RTS attr_mask (0x%x)error\n", attr_mask); + return -EINVAL; + } + + /* + * In v2 engine, software pass context and context mask to hardware + * when modifying qp. If software need modify some fields in context, + * we should set all bits of the relevant fields in context mask to + * 0 at the same time, else set them to 0x1. 
+ */ + roce_set_field(context->byte_60_qpst_mapid, + V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M, + V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S, attr->retry_cnt); + roce_set_field(qpc_mask->byte_60_qpst_mapid, + V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M, + V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S, 0); + + context->sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT); + roce_set_field(context->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, + V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, + mtts[0] >> (32 + PAGE_ADDR_SHIFT)); + qpc_mask->sq_cur_blk_addr = 0; + roce_set_field(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, + V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0); + + page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); + context->sq_cur_sge_blk_addr = + ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? + ((u32)(mtts[hr_qp->sge.offset / page_size] + >> PAGE_ADDR_SHIFT)) : 0; + roce_set_field(context->byte_184_irrl_idx, + V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, + V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, + ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? + (mtts[hr_qp->sge.offset / page_size] >> + (32 + PAGE_ADDR_SHIFT)) : 0); + qpc_mask->sq_cur_sge_blk_addr = 0; + roce_set_field(qpc_mask->byte_184_irrl_idx, + V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, + V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, 0); + + context->rx_sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT); + roce_set_field(context->byte_232_irrl_sge, + V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M, + V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S, + mtts[0] >> (32 + PAGE_ADDR_SHIFT)); + qpc_mask->rx_sq_cur_blk_addr = 0; + roce_set_field(qpc_mask->byte_232_irrl_sge, + V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M, + V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S, 0); + + /* + * Set some fields in context to zero, Because the default values + * of all fields in context are zero, we need not set them to 0 again. + * but we should set the relevant fields of context mask to 0. 
+ */ + roce_set_field(qpc_mask->byte_232_irrl_sge, + V2_QPC_BYTE_232_IRRL_SGE_IDX_M, + V2_QPC_BYTE_232_IRRL_SGE_IDX_S, 0); + + roce_set_field(qpc_mask->byte_240_irrl_tail, + V2_QPC_BYTE_240_RX_ACK_MSN_M, + V2_QPC_BYTE_240_RX_ACK_MSN_S, 0); + + roce_set_field(context->byte_244_rnr_rxack, + V2_QPC_BYTE_244_RX_ACK_EPSN_M, + V2_QPC_BYTE_244_RX_ACK_EPSN_S, attr->sq_psn); + roce_set_field(qpc_mask->byte_244_rnr_rxack, + V2_QPC_BYTE_244_RX_ACK_EPSN_M, + V2_QPC_BYTE_244_RX_ACK_EPSN_S, 0); + + roce_set_field(qpc_mask->byte_248_ack_psn, + V2_QPC_BYTE_248_ACK_LAST_OPTYPE_M, + V2_QPC_BYTE_248_ACK_LAST_OPTYPE_S, 0); + roce_set_bit(qpc_mask->byte_248_ack_psn, + V2_QPC_BYTE_248_IRRL_PSN_VLD_S, 0); + roce_set_field(qpc_mask->byte_248_ack_psn, + V2_QPC_BYTE_248_IRRL_PSN_M, + V2_QPC_BYTE_248_IRRL_PSN_S, 0); + + roce_set_field(qpc_mask->byte_240_irrl_tail, + V2_QPC_BYTE_240_IRRL_TAIL_REAL_M, + V2_QPC_BYTE_240_IRRL_TAIL_REAL_S, 0); + + roce_set_field(context->byte_220_retry_psn_msn, + V2_QPC_BYTE_220_RETRY_MSG_PSN_M, + V2_QPC_BYTE_220_RETRY_MSG_PSN_S, attr->sq_psn); + roce_set_field(qpc_mask->byte_220_retry_psn_msn, + V2_QPC_BYTE_220_RETRY_MSG_PSN_M, + V2_QPC_BYTE_220_RETRY_MSG_PSN_S, 0); + + roce_set_field(context->byte_224_retry_msg, + V2_QPC_BYTE_224_RETRY_MSG_PSN_M, + V2_QPC_BYTE_224_RETRY_MSG_PSN_S, attr->sq_psn >> 16); + roce_set_field(qpc_mask->byte_224_retry_msg, + V2_QPC_BYTE_224_RETRY_MSG_PSN_M, + V2_QPC_BYTE_224_RETRY_MSG_PSN_S, 0); + + roce_set_field(context->byte_224_retry_msg, + V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M, + V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S, attr->sq_psn); + roce_set_field(qpc_mask->byte_224_retry_msg, + V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M, + V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S, 0); + + roce_set_field(qpc_mask->byte_220_retry_psn_msn, + V2_QPC_BYTE_220_RETRY_MSG_MSN_M, + V2_QPC_BYTE_220_RETRY_MSG_MSN_S, 0); + + roce_set_bit(qpc_mask->byte_248_ack_psn, + V2_QPC_BYTE_248_RNR_RETRY_FLAG_S, 0); + + roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_CHECK_FLG_M, + V2_QPC_BYTE_212_CHECK_FLG_S, 0); + + roce_set_field(context->byte_212_lsn, V2_QPC_BYTE_212_RETRY_CNT_M, + V2_QPC_BYTE_212_RETRY_CNT_S, attr->retry_cnt); + roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_RETRY_CNT_M, + V2_QPC_BYTE_212_RETRY_CNT_S, 0); + + roce_set_field(context->byte_212_lsn, V2_QPC_BYTE_212_RETRY_NUM_INIT_M, + V2_QPC_BYTE_212_RETRY_NUM_INIT_S, attr->retry_cnt); + roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_RETRY_NUM_INIT_M, + V2_QPC_BYTE_212_RETRY_NUM_INIT_S, 0); + + roce_set_field(context->byte_244_rnr_rxack, + V2_QPC_BYTE_244_RNR_NUM_INIT_M, + V2_QPC_BYTE_244_RNR_NUM_INIT_S, attr->rnr_retry); + roce_set_field(qpc_mask->byte_244_rnr_rxack, + V2_QPC_BYTE_244_RNR_NUM_INIT_M, + V2_QPC_BYTE_244_RNR_NUM_INIT_S, 0); + + roce_set_field(context->byte_244_rnr_rxack, V2_QPC_BYTE_244_RNR_CNT_M, + V2_QPC_BYTE_244_RNR_CNT_S, attr->rnr_retry); + roce_set_field(qpc_mask->byte_244_rnr_rxack, V2_QPC_BYTE_244_RNR_CNT_M, + V2_QPC_BYTE_244_RNR_CNT_S, 0); + + roce_set_field(context->byte_212_lsn, V2_QPC_BYTE_212_LSN_M, + V2_QPC_BYTE_212_LSN_S, 0x100); + roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_LSN_M, + V2_QPC_BYTE_212_LSN_S, 0); + + if (attr_mask & IB_QP_TIMEOUT) { + roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_AT_M, + V2_QPC_BYTE_28_AT_S, attr->timeout); + roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_AT_M, + V2_QPC_BYTE_28_AT_S, 0); + } + + roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, + V2_QPC_BYTE_28_SL_S, + rdma_ah_get_sl(&attr->ah_attr)); + 
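+	/* Clear the SL mask bits and remember the service level on the QP. */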
roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, + V2_QPC_BYTE_28_SL_S, 0); + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + + roce_set_field(context->byte_172_sq_psn, V2_QPC_BYTE_172_SQ_CUR_PSN_M, + V2_QPC_BYTE_172_SQ_CUR_PSN_S, attr->sq_psn); + roce_set_field(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_SQ_CUR_PSN_M, + V2_QPC_BYTE_172_SQ_CUR_PSN_S, 0); + + roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_IRRL_HEAD_M, + V2_QPC_BYTE_196_IRRL_HEAD_S, 0); + roce_set_field(context->byte_196_sq_psn, V2_QPC_BYTE_196_SQ_MAX_PSN_M, + V2_QPC_BYTE_196_SQ_MAX_PSN_S, attr->sq_psn); + roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_SQ_MAX_PSN_M, + V2_QPC_BYTE_196_SQ_MAX_PSN_S, 0); + + if ((attr_mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) { + roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M, + V2_QPC_BYTE_208_SR_MAX_S, + fls(attr->max_rd_atomic - 1)); + roce_set_field(qpc_mask->byte_208_irrl, + V2_QPC_BYTE_208_SR_MAX_M, + V2_QPC_BYTE_208_SR_MAX_S, 0); + } + return 0; +} + +static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, + int attr_mask, enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_v2_qp_context *context; + struct hns_roce_v2_qp_context *qpc_mask; + struct device *dev = hr_dev->dev; + int ret = -EINVAL; + + context = kzalloc(2 * sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + qpc_mask = context + 1; + /* + * In v2 engine, software pass context and context mask to hardware + * when modifying qp. If software need modify some fields in context, + * we should set all bits of the relevant fields in context mask to + * 0 at the same time, else set them to 0x1. 
+ */ + memset(qpc_mask, 0xff, sizeof(*qpc_mask)); + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + modify_qp_reset_to_init(ibqp, attr, attr_mask, context, + qpc_mask); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { + modify_qp_init_to_init(ibqp, attr, attr_mask, context, + qpc_mask); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + ret = modify_qp_init_to_rtr(ibqp, attr, attr_mask, context, + qpc_mask); + if (ret) + goto out; + } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { + ret = modify_qp_rtr_to_rts(ibqp, attr, attr_mask, context, + qpc_mask); + if (ret) + goto out; + } else if ((cur_state == IB_QPS_RTS && new_state == IB_QPS_RTS) || + (cur_state == IB_QPS_SQE && new_state == IB_QPS_RTS) || + (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD) || + (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD) || + (cur_state == IB_QPS_SQD && new_state == IB_QPS_RTS) || + (cur_state == IB_QPS_INIT && new_state == IB_QPS_RESET) || + (cur_state == IB_QPS_RTR && new_state == IB_QPS_RESET) || + (cur_state == IB_QPS_RTS && new_state == IB_QPS_RESET) || + (cur_state == IB_QPS_ERR && new_state == IB_QPS_RESET) || + (cur_state == IB_QPS_INIT && new_state == IB_QPS_ERR) || + (cur_state == IB_QPS_RTR && new_state == IB_QPS_ERR) || + (cur_state == IB_QPS_RTS && new_state == IB_QPS_ERR) || + (cur_state == IB_QPS_SQD && new_state == IB_QPS_ERR) || + (cur_state == IB_QPS_SQE && new_state == IB_QPS_ERR) || + (cur_state == IB_QPS_ERR && new_state == IB_QPS_ERR)) { + /* Nothing */ + ; + } else { + dev_err(dev, "Illegal state for QP!\n"); + goto out; + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask); + + /* Every status migrate must change state */ + roce_set_field(context->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M, + V2_QPC_BYTE_60_QP_ST_S, new_state); + roce_set_field(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M, + V2_QPC_BYTE_60_QP_ST_S, 0); + + /* SW pass context to HW */ + ret = hns_roce_v2_qp_modify(hr_dev, &hr_qp->mtt, cur_state, new_state, + context, hr_qp); + if (ret) { + dev_err(dev, "hns_roce_qp_modify failed(%d)\n", ret); + goto out; + } + + hr_qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + hr_qp->atomic_rd_en = attr->qp_access_flags; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + hr_qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) { + hr_qp->port = attr->port_num - 1; + hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port]; + } + + if (new_state == IB_QPS_RESET && !ibqp->uobject) { + hns_roce_v2_cq_clean(to_hr_cq(ibqp->recv_cq), hr_qp->qpn, + ibqp->srq ? 
to_hr_srq(ibqp->srq) : NULL); + if (ibqp->send_cq != ibqp->recv_cq) + hns_roce_v2_cq_clean(to_hr_cq(ibqp->send_cq), + hr_qp->qpn, NULL); + + hr_qp->rq.head = 0; + hr_qp->rq.tail = 0; + hr_qp->sq.head = 0; + hr_qp->sq.tail = 0; + hr_qp->sq_next_wqe = 0; + hr_qp->next_sge = 0; + if (hr_qp->rq.wqe_cnt) + *hr_qp->rdb.db_record = 0; + } + +out: + kfree(context); + return ret; +} + +static inline enum ib_qp_state to_ib_qp_st(enum hns_roce_v2_qp_state state) +{ + switch (state) { + case HNS_ROCE_QP_ST_RST: return IB_QPS_RESET; + case HNS_ROCE_QP_ST_INIT: return IB_QPS_INIT; + case HNS_ROCE_QP_ST_RTR: return IB_QPS_RTR; + case HNS_ROCE_QP_ST_RTS: return IB_QPS_RTS; + case HNS_ROCE_QP_ST_SQ_DRAINING: + case HNS_ROCE_QP_ST_SQD: return IB_QPS_SQD; + case HNS_ROCE_QP_ST_SQER: return IB_QPS_SQE; + case HNS_ROCE_QP_ST_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static int hns_roce_v2_query_qpc(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_v2_qp_context *hr_context) +{ + struct hns_roce_cmd_mailbox *mailbox; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, hr_qp->qpn, 0, + HNS_ROCE_CMD_QUERY_QPC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + if (ret) { + dev_err(hr_dev->dev, "QUERY QP cmd process error\n"); + goto out; + } + + memcpy(hr_context, mailbox->buf, sizeof(*hr_context)); + +out: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + +static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct hns_roce_v2_qp_context *context; + struct device *dev = hr_dev->dev; + int tmp_qp_state; + int state; + int ret; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + mutex_lock(&hr_qp->mutex); + + if (hr_qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + ret = 0; + goto done; + } + + ret = hns_roce_v2_query_qpc(hr_dev, hr_qp, context); + if (ret) { + dev_err(dev, "query qpc error\n"); + ret = -EINVAL; + goto out; + } + + state = roce_get_field(context->byte_60_qpst_mapid, + V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S); + tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state); + if (tmp_qp_state == -1) { + dev_err(dev, "Illegal ib_qp_state\n"); + ret = -EINVAL; + goto out; + } + hr_qp->state = (u8)tmp_qp_state; + qp_attr->qp_state = (enum ib_qp_state)hr_qp->state; + qp_attr->path_mtu = (enum ib_mtu)roce_get_field(context->byte_24_mtu_tc, + V2_QPC_BYTE_24_MTU_M, + V2_QPC_BYTE_24_MTU_S); + qp_attr->path_mig_state = IB_MIG_ARMED; + qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; + if (hr_qp->ibqp.qp_type == IB_QPT_UD) + qp_attr->qkey = V2_QKEY_VAL; + + qp_attr->rq_psn = roce_get_field(context->byte_108_rx_reqepsn, + V2_QPC_BYTE_108_RX_REQ_EPSN_M, + V2_QPC_BYTE_108_RX_REQ_EPSN_S); + qp_attr->sq_psn = (u32)roce_get_field(context->byte_172_sq_psn, + V2_QPC_BYTE_172_SQ_CUR_PSN_M, + V2_QPC_BYTE_172_SQ_CUR_PSN_S); + qp_attr->dest_qp_num = (u8)roce_get_field(context->byte_56_dqpn_err, + V2_QPC_BYTE_56_DQPN_M, + V2_QPC_BYTE_56_DQPN_S); + qp_attr->qp_access_flags = ((roce_get_bit(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_RRE_S)) << 2) | + ((roce_get_bit(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_RWE_S)) << 1) | + 
((roce_get_bit(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_ATE_S)) << 3); + if (hr_qp->ibqp.qp_type == IB_QPT_RC || + hr_qp->ibqp.qp_type == IB_QPT_UC) { + struct ib_global_route *grh = + rdma_ah_retrieve_grh(&qp_attr->ah_attr); + + rdma_ah_set_sl(&qp_attr->ah_attr, + roce_get_field(context->byte_28_at_fl, + V2_QPC_BYTE_28_SL_M, + V2_QPC_BYTE_28_SL_S)); + grh->flow_label = roce_get_field(context->byte_28_at_fl, + V2_QPC_BYTE_28_FL_M, + V2_QPC_BYTE_28_FL_S); + grh->sgid_index = roce_get_field(context->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SGID_IDX_M, + V2_QPC_BYTE_20_SGID_IDX_S); + grh->hop_limit = roce_get_field(context->byte_24_mtu_tc, + V2_QPC_BYTE_24_HOP_LIMIT_M, + V2_QPC_BYTE_24_HOP_LIMIT_S); + grh->traffic_class = roce_get_field(context->byte_24_mtu_tc, + V2_QPC_BYTE_24_TC_M, + V2_QPC_BYTE_24_TC_S); + + memcpy(grh->dgid.raw, context->dgid, sizeof(grh->dgid.raw)); + } + + qp_attr->port_num = hr_qp->port + 1; + qp_attr->sq_draining = 0; + qp_attr->max_rd_atomic = 1 << roce_get_field(context->byte_208_irrl, + V2_QPC_BYTE_208_SR_MAX_M, + V2_QPC_BYTE_208_SR_MAX_S); + qp_attr->max_dest_rd_atomic = 1 << roce_get_field(context->byte_140_raq, + V2_QPC_BYTE_140_RR_MAX_M, + V2_QPC_BYTE_140_RR_MAX_S); + qp_attr->min_rnr_timer = (u8)roce_get_field(context->byte_80_rnr_rx_cqn, + V2_QPC_BYTE_80_MIN_RNR_TIME_M, + V2_QPC_BYTE_80_MIN_RNR_TIME_S); + qp_attr->timeout = (u8)roce_get_field(context->byte_28_at_fl, + V2_QPC_BYTE_28_AT_M, + V2_QPC_BYTE_28_AT_S); + qp_attr->retry_cnt = roce_get_field(context->byte_212_lsn, + V2_QPC_BYTE_212_RETRY_CNT_M, + V2_QPC_BYTE_212_RETRY_CNT_S); + qp_attr->rnr_retry = context->rq_rnr_timer; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = hr_qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = hr_qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = hr_qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = hr_qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + qp_init_attr->cap = qp_attr->cap; + +out: + mutex_unlock(&hr_qp->mutex); + kfree(context); + return ret; +} + +static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + int is_user) +{ + struct hns_roce_cq *send_cq, *recv_cq; + struct device *dev = hr_dev->dev; + int ret; + + if (hr_qp->ibqp.qp_type == IB_QPT_RC && hr_qp->state != IB_QPS_RESET) { + /* Modify qp to reset before destroying qp */ + ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0, + hr_qp->state, IB_QPS_RESET); + if (ret) { + dev_err(dev, "modify QP %06lx to ERR failed.\n", + hr_qp->qpn); + return ret; + } + } + + send_cq = to_hr_cq(hr_qp->ibqp.send_cq); + recv_cq = to_hr_cq(hr_qp->ibqp.recv_cq); + + hns_roce_lock_cqs(send_cq, recv_cq); + + if (!is_user) { + __hns_roce_v2_cq_clean(recv_cq, hr_qp->qpn, hr_qp->ibqp.srq ? 
+ to_hr_srq(hr_qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __hns_roce_v2_cq_clean(send_cq, hr_qp->qpn, NULL); + } + + hns_roce_qp_remove(hr_dev, hr_qp); + + hns_roce_unlock_cqs(send_cq, recv_cq); + + hns_roce_qp_free(hr_dev, hr_qp); + + /* Not special_QP, free their QPN */ + if ((hr_qp->ibqp.qp_type == IB_QPT_RC) || + (hr_qp->ibqp.qp_type == IB_QPT_UC) || + (hr_qp->ibqp.qp_type == IB_QPT_UD)) + hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1); + + hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); + + if (is_user) { + if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1)) + hns_roce_db_unmap_user( + to_hr_ucontext(hr_qp->ibqp.uobject->context), + &hr_qp->rdb); + ib_umem_release(hr_qp->umem); + } else { + kfree(hr_qp->sq.wrid); + kfree(hr_qp->rq.wrid); + hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); + if (hr_qp->rq.wqe_cnt) + hns_roce_free_db(hr_dev, &hr_qp->rdb); + } + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { + kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); + kfree(hr_qp->rq_inl_buf.wqe_list); + } + + return 0; +} + +static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + int ret; + + ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, !!ibqp->pd->uobject); + if (ret) { + dev_err(hr_dev->dev, "Destroy qp failed(%d)\n", ret); + return ret; + } + + if (hr_qp->ibqp.qp_type == IB_QPT_GSI) + kfree(hr_to_hr_sqp(hr_qp)); + else + kfree(hr_qp); + + return 0; +} + +static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(cq->device); + struct hns_roce_v2_cq_context *cq_context; + struct hns_roce_cq *hr_cq = to_hr_cq(cq); + struct hns_roce_v2_cq_context *cqc_mask; + struct hns_roce_cmd_mailbox *mailbox; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cq_context = mailbox->buf; + cqc_mask = (struct hns_roce_v2_cq_context *)mailbox->buf + 1; + + memset(cqc_mask, 0xff, sizeof(*cqc_mask)); + + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_MAX_CNT_M, V2_CQC_BYTE_56_CQ_MAX_CNT_S, + cq_count); + roce_set_field(cqc_mask->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_MAX_CNT_M, V2_CQC_BYTE_56_CQ_MAX_CNT_S, + 0); + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_PERIOD_M, V2_CQC_BYTE_56_CQ_PERIOD_S, + cq_period); + roce_set_field(cqc_mask->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_PERIOD_M, V2_CQC_BYTE_56_CQ_PERIOD_S, + 0); + + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, 1, + HNS_ROCE_CMD_MODIFY_CQC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + if (ret) + dev_err(hr_dev->dev, "MODIFY CQ Failed to cmd mailbox.\n"); + + return ret; +} + +static void set_eq_cons_index_v2(struct hns_roce_eq *eq) +{ + u32 doorbell[2]; + + doorbell[0] = 0; + doorbell[1] = 0; + + if (eq->type_flag == HNS_ROCE_AEQ) { + roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M, + HNS_ROCE_V2_EQ_DB_CMD_S, + eq->arm_st == HNS_ROCE_V2_EQ_ALWAYS_ARMED ? + HNS_ROCE_EQ_DB_CMD_AEQ : + HNS_ROCE_EQ_DB_CMD_AEQ_ARMED); + } else { + roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_TAG_M, + HNS_ROCE_V2_EQ_DB_TAG_S, eq->eqn); + + roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M, + HNS_ROCE_V2_EQ_DB_CMD_S, + eq->arm_st == HNS_ROCE_V2_EQ_ALWAYS_ARMED ? 
+ HNS_ROCE_EQ_DB_CMD_CEQ : + HNS_ROCE_EQ_DB_CMD_CEQ_ARMED); + } + + roce_set_field(doorbell[1], HNS_ROCE_V2_EQ_DB_PARA_M, + HNS_ROCE_V2_EQ_DB_PARA_S, + (eq->cons_index & HNS_ROCE_V2_CONS_IDX_M)); + + hns_roce_write64_k(doorbell, eq->doorbell); +} + +static void hns_roce_v2_wq_catas_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + u32 qpn) +{ + struct device *dev = hr_dev->dev; + int sub_type; + + dev_warn(dev, "Local work queue catastrophic error.\n"); + sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M, + HNS_ROCE_V2_AEQE_SUB_TYPE_S); + switch (sub_type) { + case HNS_ROCE_LWQCE_QPC_ERROR: + dev_warn(dev, "QP %d, QPC error.\n", qpn); + break; + case HNS_ROCE_LWQCE_MTU_ERROR: + dev_warn(dev, "QP %d, MTU error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: + dev_warn(dev, "QP %d, WQE shift error.\n", qpn); + break; + default: + dev_err(dev, "Unhandled sub_event type %d.\n", sub_type); + break; + } +} + +static void hns_roce_v2_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, u32 qpn) +{ + struct device *dev = hr_dev->dev; + int sub_type; + + dev_warn(dev, "Local access violation work queue error.\n"); + sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M, + HNS_ROCE_V2_AEQE_SUB_TYPE_S); + switch (sub_type) { + case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: + dev_warn(dev, "QP %d, R_key violation.\n", qpn); + break; + case HNS_ROCE_LAVWQE_LENGTH_ERROR: + dev_warn(dev, "QP %d, length error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_VA_ERROR: + dev_warn(dev, "QP %d, VA error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_PD_ERROR: + dev_err(dev, "QP %d, PD error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_RW_ACC_ERROR: + dev_warn(dev, "QP %d, rw acc error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: + dev_warn(dev, "QP %d, key state error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: + dev_warn(dev, "QP %d, MR operation error.\n", qpn); + break; + default: + dev_err(dev, "Unhandled sub_event type %d.\n", sub_type); + break; + } +} + +static void hns_roce_v2_qp_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = hr_dev->dev; + u32 qpn; + + qpn = roce_get_field(aeqe->event.qp_event.qp, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_COMM_EST: + dev_warn(dev, "Communication established.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + dev_warn(dev, "Send queue drained.\n"); + break; + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + hns_roce_v2_wq_catas_err_handle(hr_dev, aeqe, qpn); + break; + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + dev_warn(dev, "Invalid request local work queue error.\n"); + break; + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v2_local_wq_access_err_handle(hr_dev, aeqe, qpn); + break; + default: + break; + } + + hns_roce_qp_event(hr_dev, qpn, event_type); +} + +static void hns_roce_v2_cq_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = hr_dev->dev; + u32 cqn; + + cqn = roce_get_field(aeqe->event.cq_event.cq, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); + + switch (event_type) { + case 
HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + dev_warn(dev, "CQ 0x%x access err.\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + dev_warn(dev, "CQ 0x%x overflow\n", cqn); + break; + default: + break; + } + + hns_roce_cq_event(hr_dev, cqn, event_type); +} + +static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE; + + return (struct hns_roce_aeqe *)((char *)(eq->buf_list->buf) + + off % buf_chk_sz); +} + +static struct hns_roce_aeqe *mhop_get_aeqe(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + + off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE; + + if (eq->hop_num == HNS_ROCE_HOP_NUM_0) + return (struct hns_roce_aeqe *)((u8 *)(eq->bt_l0) + + off % buf_chk_sz); + else + return (struct hns_roce_aeqe *)((u8 *) + (eq->buf[off / buf_chk_sz]) + off % buf_chk_sz); +} + +static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) +{ + struct hns_roce_aeqe *aeqe; + + if (!eq->hop_num) + aeqe = get_aeqe_v2(eq, eq->cons_index); + else + aeqe = mhop_get_aeqe(eq, eq->cons_index); + + return (roce_get_bit(aeqe->asyn, HNS_ROCE_V2_AEQ_AEQE_OWNER_S) ^ + !!(eq->cons_index & eq->entries)) ? aeqe : NULL; +} + +static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_aeqe *aeqe; + int aeqe_found = 0; + int event_type; + + while ((aeqe = next_aeqe_sw_v2(eq))) { + + /* Make sure we read AEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + event_type = roce_get_field(aeqe->asyn, + HNS_ROCE_V2_AEQE_EVENT_TYPE_M, + HNS_ROCE_V2_AEQE_EVENT_TYPE_S); + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + dev_warn(dev, "Path migrated succeeded.\n"); + break; + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + dev_warn(dev, "Path migration failed.\n"); + break; + case HNS_ROCE_EVENT_TYPE_COMM_EST: + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v2_qp_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + dev_warn(dev, "SRQ not support.\n"); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + hns_roce_v2_cq_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: + dev_warn(dev, "DB overflow.\n"); + break; + case HNS_ROCE_EVENT_TYPE_MB: + hns_roce_cmd_event(hr_dev, + le16_to_cpu(aeqe->event.cmd.token), + aeqe->event.cmd.status, + le64_to_cpu(aeqe->event.cmd.out_param)); + break; + case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW: + dev_warn(dev, "CEQ overflow.\n"); + break; + case HNS_ROCE_EVENT_TYPE_FLR: + dev_warn(dev, "Function level reset.\n"); + break; + default: + dev_err(dev, "Unhandled event %d on EQ %d at idx %u.\n", + event_type, eq->eqn, eq->cons_index); + break; + }; + + ++eq->cons_index; + aeqe_found = 1; + + if (eq->cons_index > (2 * eq->entries - 1)) { + dev_warn(dev, "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v2(eq); + return aeqe_found; +} + +static struct hns_roce_ceqe *get_ceqe_v2(struct hns_roce_eq *eq, u32 entry) +{ + u32 
buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE; + + return (struct hns_roce_ceqe *)((char *)(eq->buf_list->buf) + + off % buf_chk_sz); +} + +static struct hns_roce_ceqe *mhop_get_ceqe(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + + off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE; + + if (eq->hop_num == HNS_ROCE_HOP_NUM_0) + return (struct hns_roce_ceqe *)((u8 *)(eq->bt_l0) + + off % buf_chk_sz); + else + return (struct hns_roce_ceqe *)((u8 *)(eq->buf[off / + buf_chk_sz]) + off % buf_chk_sz); +} + +static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) +{ + struct hns_roce_ceqe *ceqe; + + if (!eq->hop_num) + ceqe = get_ceqe_v2(eq, eq->cons_index); + else + ceqe = mhop_get_ceqe(eq, eq->cons_index); + + return (!!(roce_get_bit(ceqe->comp, HNS_ROCE_V2_CEQ_CEQE_OWNER_S))) ^ + (!!(eq->cons_index & eq->entries)) ? ceqe : NULL; +} + +static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_ceqe *ceqe; + int ceqe_found = 0; + u32 cqn; + + while ((ceqe = next_ceqe_sw_v2(eq))) { + + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + cqn = roce_get_field(ceqe->comp, + HNS_ROCE_V2_CEQE_COMP_CQN_M, + HNS_ROCE_V2_CEQE_COMP_CQN_S); + + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ceqe_found = 1; + + if (eq->cons_index > (2 * eq->entries - 1)) { + dev_warn(dev, "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v2(eq); + + return ceqe_found; +} + +static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) +{ + struct hns_roce_eq *eq = eq_ptr; + struct hns_roce_dev *hr_dev = eq->hr_dev; + int int_work = 0; + + if (eq->type_flag == HNS_ROCE_CEQ) + /* Completion event interrupt */ + int_work = hns_roce_v2_ceq_int(hr_dev, eq); + else + /* Asychronous event interrupt */ + int_work = hns_roce_v2_aeq_int(hr_dev, eq); + + return IRQ_RETVAL(int_work); +} + +static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +{ + struct hns_roce_dev *hr_dev = dev_id; + struct device *dev = hr_dev->dev; + int int_work = 0; + u32 int_st; + u32 int_en; + + /* Abnormal interrupt */ + int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); + int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG); + + if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) { + dev_err(dev, "AEQ overflow!\n"); + + roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + + roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); + + int_work = 1; + } else if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S)) { + dev_err(dev, "BUS ERR!\n"); + + roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + + roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); + + int_work = 1; + } else if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S)) { + dev_err(dev, "OTHER ERR!\n"); + + roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + + roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); + roce_write(hr_dev, 
ROCEE_VF_ABN_INT_EN_REG, int_en); + + int_work = 1; + } else + dev_err(dev, "There is no abnormal irq found!\n"); + + return IRQ_RETVAL(int_work); +} + +static void hns_roce_v2_int_mask_enable(struct hns_roce_dev *hr_dev, + int eq_num, int enable_flag) +{ + int i; + + if (enable_flag == EQ_ENABLE) { + for (i = 0; i < eq_num; i++) + roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG + + i * EQ_REG_OFFSET, + HNS_ROCE_V2_VF_EVENT_INT_EN_M); + + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, + HNS_ROCE_V2_VF_ABN_INT_EN_M); + roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG, + HNS_ROCE_V2_VF_ABN_INT_CFG_M); + } else { + for (i = 0; i < eq_num; i++) + roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG + + i * EQ_REG_OFFSET, + HNS_ROCE_V2_VF_EVENT_INT_EN_M & 0x0); + + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, + HNS_ROCE_V2_VF_ABN_INT_EN_M & 0x0); + roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG, + HNS_ROCE_V2_VF_ABN_INT_CFG_M & 0x0); + } +} + +static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, int eqn) +{ + struct device *dev = hr_dev->dev; + int ret; + + if (eqn < hr_dev->caps.num_comp_vectors) + ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, + 0, HNS_ROCE_CMD_DESTROY_CEQC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + else + ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, + 0, HNS_ROCE_CMD_DESTROY_AEQC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + if (ret) + dev_err(dev, "[mailbox cmd] destroy eqc(%d) failed.\n", eqn); +} + +static void hns_roce_mhop_free_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + u64 idx; + u64 size; + u32 buf_chk_sz; + u32 bt_chk_sz; + u32 mhop_num; + int eqe_alloc; + int ba_num; + int i = 0; + int j = 0; + + mhop_num = hr_dev->caps.eqe_hop_num; + buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); + bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT); + ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1) / + buf_chk_sz; + + /* hop_num = 0 */ + if (mhop_num == HNS_ROCE_HOP_NUM_0) { + dma_free_coherent(dev, (unsigned int)(eq->entries * + eq->eqe_size), eq->bt_l0, eq->l0_dma); + return; + } + + /* hop_num = 1 or hop = 2 */ + dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma); + if (mhop_num == 1) { + for (i = 0; i < eq->l0_last_num; i++) { + if (i == eq->l0_last_num - 1) { + eqe_alloc = i * (buf_chk_sz / eq->eqe_size); + size = (eq->entries - eqe_alloc) * eq->eqe_size; + dma_free_coherent(dev, size, eq->buf[i], + eq->buf_dma[i]); + break; + } + dma_free_coherent(dev, buf_chk_sz, eq->buf[i], + eq->buf_dma[i]); + } + } else if (mhop_num == 2) { + for (i = 0; i < eq->l0_last_num; i++) { + dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], + eq->l1_dma[i]); + + for (j = 0; j < bt_chk_sz / 8; j++) { + idx = i * (bt_chk_sz / 8) + j; + if ((i == eq->l0_last_num - 1) + && j == eq->l1_last_num - 1) { + eqe_alloc = (buf_chk_sz / eq->eqe_size) + * idx; + size = (eq->entries - eqe_alloc) + * eq->eqe_size; + dma_free_coherent(dev, size, + eq->buf[idx], + eq->buf_dma[idx]); + break; + } + dma_free_coherent(dev, buf_chk_sz, eq->buf[idx], + eq->buf_dma[idx]); + } + } + } + kfree(eq->buf_dma); + kfree(eq->buf); + kfree(eq->l1_dma); + kfree(eq->bt_l1); + eq->buf_dma = NULL; + eq->buf = NULL; + eq->l1_dma = NULL; + eq->bt_l1 = NULL; +} + +static void hns_roce_v2_free_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + u32 buf_chk_sz; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + + if (hr_dev->caps.eqe_hop_num) { + hns_roce_mhop_free_eq(hr_dev, eq); + return; + } + + if (eq->buf_list) + 
dma_free_coherent(hr_dev->dev, buf_chk_sz, + eq->buf_list->buf, eq->buf_list->map); +} + +static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq, + void *mb_buf) +{ + struct hns_roce_eq_context *eqc; + + eqc = mb_buf; + memset(eqc, 0, sizeof(struct hns_roce_eq_context)); + + /* init eqc */ + eq->doorbell = hr_dev->reg_base + ROCEE_VF_EQ_DB_CFG0_REG; + eq->hop_num = hr_dev->caps.eqe_hop_num; + eq->cons_index = 0; + eq->over_ignore = HNS_ROCE_V2_EQ_OVER_IGNORE_0; + eq->coalesce = HNS_ROCE_V2_EQ_COALESCE_0; + eq->arm_st = HNS_ROCE_V2_EQ_ALWAYS_ARMED; + eq->eqe_ba_pg_sz = hr_dev->caps.eqe_ba_pg_sz; + eq->eqe_buf_pg_sz = hr_dev->caps.eqe_buf_pg_sz; + eq->shift = ilog2((unsigned int)eq->entries); + + if (!eq->hop_num) + eq->eqe_ba = eq->buf_list->map; + else + eq->eqe_ba = eq->l0_dma; + + /* set eqc state */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_EQ_ST_M, + HNS_ROCE_EQC_EQ_ST_S, + HNS_ROCE_V2_EQ_STATE_VALID); + + /* set eqe hop num */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_HOP_NUM_M, + HNS_ROCE_EQC_HOP_NUM_S, eq->hop_num); + + /* set eqc over_ignore */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_OVER_IGNORE_M, + HNS_ROCE_EQC_OVER_IGNORE_S, eq->over_ignore); + + /* set eqc coalesce */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_COALESCE_M, + HNS_ROCE_EQC_COALESCE_S, eq->coalesce); + + /* set eqc arm_state */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_ARM_ST_M, + HNS_ROCE_EQC_ARM_ST_S, eq->arm_st); + + /* set eqn */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_EQN_M, + HNS_ROCE_EQC_EQN_S, eq->eqn); + + /* set eqe_cnt */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_EQE_CNT_M, + HNS_ROCE_EQC_EQE_CNT_S, + HNS_ROCE_EQ_INIT_EQE_CNT); + + /* set eqe_ba_pg_sz */ + roce_set_field(eqc->byte_8, + HNS_ROCE_EQC_BA_PG_SZ_M, + HNS_ROCE_EQC_BA_PG_SZ_S, eq->eqe_ba_pg_sz); + + /* set eqe_buf_pg_sz */ + roce_set_field(eqc->byte_8, + HNS_ROCE_EQC_BUF_PG_SZ_M, + HNS_ROCE_EQC_BUF_PG_SZ_S, eq->eqe_buf_pg_sz); + + /* set eq_producer_idx */ + roce_set_field(eqc->byte_8, + HNS_ROCE_EQC_PROD_INDX_M, + HNS_ROCE_EQC_PROD_INDX_S, + HNS_ROCE_EQ_INIT_PROD_IDX); + + /* set eq_max_cnt */ + roce_set_field(eqc->byte_12, + HNS_ROCE_EQC_MAX_CNT_M, + HNS_ROCE_EQC_MAX_CNT_S, eq->eq_max_cnt); + + /* set eq_period */ + roce_set_field(eqc->byte_12, + HNS_ROCE_EQC_PERIOD_M, + HNS_ROCE_EQC_PERIOD_S, eq->eq_period); + + /* set eqe_report_timer */ + roce_set_field(eqc->eqe_report_timer, + HNS_ROCE_EQC_REPORT_TIMER_M, + HNS_ROCE_EQC_REPORT_TIMER_S, + HNS_ROCE_EQ_INIT_REPORT_TIMER); + + /* set eqe_ba [34:3] */ + roce_set_field(eqc->eqe_ba0, + HNS_ROCE_EQC_EQE_BA_L_M, + HNS_ROCE_EQC_EQE_BA_L_S, eq->eqe_ba >> 3); + + /* set eqe_ba [64:35] */ + roce_set_field(eqc->eqe_ba1, + HNS_ROCE_EQC_EQE_BA_H_M, + HNS_ROCE_EQC_EQE_BA_H_S, eq->eqe_ba >> 35); + + /* set eq shift */ + roce_set_field(eqc->byte_28, + HNS_ROCE_EQC_SHIFT_M, + HNS_ROCE_EQC_SHIFT_S, eq->shift); + + /* set eq MSI_IDX */ + roce_set_field(eqc->byte_28, + HNS_ROCE_EQC_MSI_INDX_M, + HNS_ROCE_EQC_MSI_INDX_S, + HNS_ROCE_EQ_INIT_MSI_IDX); + + /* set cur_eqe_ba [27:12] */ + roce_set_field(eqc->byte_28, + HNS_ROCE_EQC_CUR_EQE_BA_L_M, + HNS_ROCE_EQC_CUR_EQE_BA_L_S, eq->cur_eqe_ba >> 12); + + /* set cur_eqe_ba [59:28] */ + roce_set_field(eqc->byte_32, + HNS_ROCE_EQC_CUR_EQE_BA_M_M, + HNS_ROCE_EQC_CUR_EQE_BA_M_S, eq->cur_eqe_ba >> 28); + + /* set cur_eqe_ba [63:60] */ + roce_set_field(eqc->byte_36, + HNS_ROCE_EQC_CUR_EQE_BA_H_M, + HNS_ROCE_EQC_CUR_EQE_BA_H_S, eq->cur_eqe_ba >> 60); + + /* set eq consumer idx */ + roce_set_field(eqc->byte_36, + 
HNS_ROCE_EQC_CONS_INDX_M, + HNS_ROCE_EQC_CONS_INDX_S, + HNS_ROCE_EQ_INIT_CONS_IDX); + + /* set nex_eqe_ba[43:12] */ + roce_set_field(eqc->nxt_eqe_ba0, + HNS_ROCE_EQC_NXT_EQE_BA_L_M, + HNS_ROCE_EQC_NXT_EQE_BA_L_S, eq->nxt_eqe_ba >> 12); + + /* set nex_eqe_ba[63:44] */ + roce_set_field(eqc->nxt_eqe_ba1, + HNS_ROCE_EQC_NXT_EQE_BA_H_M, + HNS_ROCE_EQC_NXT_EQE_BA_H_S, eq->nxt_eqe_ba >> 44); +} + +static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + int eq_alloc_done = 0; + int eq_buf_cnt = 0; + int eqe_alloc; + u32 buf_chk_sz; + u32 bt_chk_sz; + u32 mhop_num; + u64 size; + u64 idx; + int ba_num; + int bt_num; + int record_i; + int record_j; + int i = 0; + int j = 0; + + mhop_num = hr_dev->caps.eqe_hop_num; + buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); + bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT); + + ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1) + / buf_chk_sz; + bt_num = (ba_num + bt_chk_sz / 8 - 1) / (bt_chk_sz / 8); + + /* hop_num = 0 */ + if (mhop_num == HNS_ROCE_HOP_NUM_0) { + if (eq->entries > buf_chk_sz / eq->eqe_size) { + dev_err(dev, "eq entries %d is larger than buf_pg_sz!", + eq->entries); + return -EINVAL; + } + eq->bt_l0 = dma_alloc_coherent(dev, eq->entries * eq->eqe_size, + &(eq->l0_dma), GFP_KERNEL); + if (!eq->bt_l0) + return -ENOMEM; + + eq->cur_eqe_ba = eq->l0_dma; + eq->nxt_eqe_ba = 0; + + memset(eq->bt_l0, 0, eq->entries * eq->eqe_size); + + return 0; + } + + eq->buf_dma = kcalloc(ba_num, sizeof(*eq->buf_dma), GFP_KERNEL); + if (!eq->buf_dma) + return -ENOMEM; + eq->buf = kcalloc(ba_num, sizeof(*eq->buf), GFP_KERNEL); + if (!eq->buf) + goto err_kcalloc_buf; + + if (mhop_num == 2) { + eq->l1_dma = kcalloc(bt_num, sizeof(*eq->l1_dma), GFP_KERNEL); + if (!eq->l1_dma) + goto err_kcalloc_l1_dma; + + eq->bt_l1 = kcalloc(bt_num, sizeof(*eq->bt_l1), GFP_KERNEL); + if (!eq->bt_l1) + goto err_kcalloc_bt_l1; + } + + /* alloc L0 BT */ + eq->bt_l0 = dma_alloc_coherent(dev, bt_chk_sz, &eq->l0_dma, GFP_KERNEL); + if (!eq->bt_l0) + goto err_dma_alloc_l0; + + if (mhop_num == 1) { + if (ba_num > (bt_chk_sz / 8)) + dev_err(dev, "ba_num %d is too large for 1 hop\n", + ba_num); + + /* alloc buf */ + for (i = 0; i < bt_chk_sz / 8; i++) { + if (eq_buf_cnt + 1 < ba_num) { + size = buf_chk_sz; + } else { + eqe_alloc = i * (buf_chk_sz / eq->eqe_size); + size = (eq->entries - eqe_alloc) * eq->eqe_size; + } + eq->buf[i] = dma_alloc_coherent(dev, size, + &(eq->buf_dma[i]), + GFP_KERNEL); + if (!eq->buf[i]) + goto err_dma_alloc_buf; + + memset(eq->buf[i], 0, size); + *(eq->bt_l0 + i) = eq->buf_dma[i]; + + eq_buf_cnt++; + if (eq_buf_cnt >= ba_num) + break; + } + eq->cur_eqe_ba = eq->buf_dma[0]; + eq->nxt_eqe_ba = eq->buf_dma[1]; + + } else if (mhop_num == 2) { + /* alloc L1 BT and buf */ + for (i = 0; i < bt_chk_sz / 8; i++) { + eq->bt_l1[i] = dma_alloc_coherent(dev, bt_chk_sz, + &(eq->l1_dma[i]), + GFP_KERNEL); + if (!eq->bt_l1[i]) + goto err_dma_alloc_l1; + *(eq->bt_l0 + i) = eq->l1_dma[i]; + + for (j = 0; j < bt_chk_sz / 8; j++) { + idx = i * bt_chk_sz / 8 + j; + if (eq_buf_cnt + 1 < ba_num) { + size = buf_chk_sz; + } else { + eqe_alloc = (buf_chk_sz / eq->eqe_size) + * idx; + size = (eq->entries - eqe_alloc) + * eq->eqe_size; + } + eq->buf[idx] = dma_alloc_coherent(dev, size, + &(eq->buf_dma[idx]), + GFP_KERNEL); + if (!eq->buf[idx]) + goto err_dma_alloc_buf; + + memset(eq->buf[idx], 0, size); + *(eq->bt_l1[i] + j) = eq->buf_dma[idx]; + + eq_buf_cnt++; + if (eq_buf_cnt >= 
ba_num) { + eq_alloc_done = 1; + break; + } + } + + if (eq_alloc_done) + break; + } + eq->cur_eqe_ba = eq->buf_dma[0]; + eq->nxt_eqe_ba = eq->buf_dma[1]; + } + + eq->l0_last_num = i + 1; + if (mhop_num == 2) + eq->l1_last_num = j + 1; + + return 0; + +err_dma_alloc_l1: + dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma); + eq->bt_l0 = NULL; + eq->l0_dma = 0; + for (i -= 1; i >= 0; i--) { + dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], + eq->l1_dma[i]); + + for (j = 0; j < bt_chk_sz / 8; j++) { + idx = i * bt_chk_sz / 8 + j; + dma_free_coherent(dev, buf_chk_sz, eq->buf[idx], + eq->buf_dma[idx]); + } + } + goto err_dma_alloc_l0; + +err_dma_alloc_buf: + dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma); + eq->bt_l0 = NULL; + eq->l0_dma = 0; + + if (mhop_num == 1) + for (i -= 1; i >= 0; i--) + dma_free_coherent(dev, buf_chk_sz, eq->buf[i], + eq->buf_dma[i]); + else if (mhop_num == 2) { + record_i = i; + record_j = j; + for (; i >= 0; i--) { + dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], + eq->l1_dma[i]); + + for (j = 0; j < bt_chk_sz / 8; j++) { + if (i == record_i && j >= record_j) + break; + + idx = i * bt_chk_sz / 8 + j; + dma_free_coherent(dev, buf_chk_sz, + eq->buf[idx], + eq->buf_dma[idx]); + } + } + } + +err_dma_alloc_l0: + kfree(eq->bt_l1); + eq->bt_l1 = NULL; + +err_kcalloc_bt_l1: + kfree(eq->l1_dma); + eq->l1_dma = NULL; + +err_kcalloc_l1_dma: + kfree(eq->buf); + eq->buf = NULL; + +err_kcalloc_buf: + kfree(eq->buf_dma); + eq->buf_dma = NULL; + + return -ENOMEM; +} + +static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq, + unsigned int eq_cmd) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_cmd_mailbox *mailbox; + u32 buf_chk_sz = 0; + int ret; + + /* Allocate mailbox memory */ + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (!hr_dev->caps.eqe_hop_num) { + buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); + + eq->buf_list = kzalloc(sizeof(struct hns_roce_buf_list), + GFP_KERNEL); + if (!eq->buf_list) { + ret = -ENOMEM; + goto free_cmd_mbox; + } + + eq->buf_list->buf = dma_alloc_coherent(dev, buf_chk_sz, + &(eq->buf_list->map), + GFP_KERNEL); + if (!eq->buf_list->buf) { + ret = -ENOMEM; + goto err_alloc_buf; + } + + memset(eq->buf_list->buf, 0, buf_chk_sz); + } else { + ret = hns_roce_mhop_alloc_eq(hr_dev, eq); + if (ret) { + ret = -ENOMEM; + goto free_cmd_mbox; + } + } + + hns_roce_config_eqc(hr_dev, eq, mailbox->buf); + + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, 0, + eq_cmd, HNS_ROCE_CMD_TIMEOUT_MSECS); + if (ret) { + dev_err(dev, "[mailbox cmd] create eqc failed.\n"); + goto err_cmd_mbox; + } + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return 0; + +err_cmd_mbox: + if (!hr_dev->caps.eqe_hop_num) + dma_free_coherent(dev, buf_chk_sz, eq->buf_list->buf, + eq->buf_list->map); + else { + hns_roce_mhop_free_eq(hr_dev, eq); + goto free_cmd_mbox; + } + +err_alloc_buf: + kfree(eq->buf_list); + +free_cmd_mbox: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return ret; +} + +static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + struct device *dev = hr_dev->dev; + struct hns_roce_eq *eq; + unsigned int eq_cmd; + int irq_num; + int eq_num; + int other_num; + int comp_num; + int aeq_num; + int i, j, k; + int ret; + + other_num = hr_dev->caps.num_other_vectors; + comp_num = hr_dev->caps.num_comp_vectors; + aeq_num = hr_dev->caps.num_aeq_vectors; + + eq_num = comp_num + 
aeq_num; + irq_num = eq_num + other_num; + + eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL); + if (!eq_table->eq) + return -ENOMEM; + + for (i = 0; i < irq_num; i++) { + hr_dev->irq_names[i] = kzalloc(HNS_ROCE_INT_NAME_LEN, + GFP_KERNEL); + if (!hr_dev->irq_names[i]) { + ret = -ENOMEM; + goto err_failed_kzalloc; + } + } + + /* create eq */ + for (j = 0; j < eq_num; j++) { + eq = &eq_table->eq[j]; + eq->hr_dev = hr_dev; + eq->eqn = j; + if (j < comp_num) { + /* CEQ */ + eq_cmd = HNS_ROCE_CMD_CREATE_CEQC; + eq->type_flag = HNS_ROCE_CEQ; + eq->entries = hr_dev->caps.ceqe_depth; + eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE; + eq->irq = hr_dev->irq[j + other_num + aeq_num]; + eq->eq_max_cnt = HNS_ROCE_CEQ_DEFAULT_BURST_NUM; + eq->eq_period = HNS_ROCE_CEQ_DEFAULT_INTERVAL; + } else { + /* AEQ */ + eq_cmd = HNS_ROCE_CMD_CREATE_AEQC; + eq->type_flag = HNS_ROCE_AEQ; + eq->entries = hr_dev->caps.aeqe_depth; + eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE; + eq->irq = hr_dev->irq[j - comp_num + other_num]; + eq->eq_max_cnt = HNS_ROCE_AEQ_DEFAULT_BURST_NUM; + eq->eq_period = HNS_ROCE_AEQ_DEFAULT_INTERVAL; + } + + ret = hns_roce_v2_create_eq(hr_dev, eq, eq_cmd); + if (ret) { + dev_err(dev, "eq create failed.\n"); + goto err_create_eq_fail; + } + } + + /* enable irq */ + hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_ENABLE); + + /* irq contains: abnormal + AEQ + CEQ*/ + for (k = 0; k < irq_num; k++) + if (k < other_num) + snprintf((char *)hr_dev->irq_names[k], + HNS_ROCE_INT_NAME_LEN, "hns-abn-%d", k); + else if (k < (other_num + aeq_num)) + snprintf((char *)hr_dev->irq_names[k], + HNS_ROCE_INT_NAME_LEN, "hns-aeq-%d", + k - other_num); + else + snprintf((char *)hr_dev->irq_names[k], + HNS_ROCE_INT_NAME_LEN, "hns-ceq-%d", + k - other_num - aeq_num); + + for (k = 0; k < irq_num; k++) { + if (k < other_num) + ret = request_irq(hr_dev->irq[k], + hns_roce_v2_msix_interrupt_abn, + 0, hr_dev->irq_names[k], hr_dev); + + else if (k < (other_num + comp_num)) + ret = request_irq(eq_table->eq[k - other_num].irq, + hns_roce_v2_msix_interrupt_eq, + 0, hr_dev->irq_names[k + aeq_num], + &eq_table->eq[k - other_num]); + else + ret = request_irq(eq_table->eq[k - other_num].irq, + hns_roce_v2_msix_interrupt_eq, + 0, hr_dev->irq_names[k - comp_num], + &eq_table->eq[k - other_num]); + if (ret) { + dev_err(dev, "Request irq error!\n"); + goto err_request_irq_fail; + } + } + + return 0; + +err_request_irq_fail: + for (k -= 1; k >= 0; k--) + if (k < other_num) + free_irq(hr_dev->irq[k], hr_dev); + else + free_irq(eq_table->eq[k - other_num].irq, + &eq_table->eq[k - other_num]); + +err_create_eq_fail: + for (j -= 1; j >= 0; j--) + hns_roce_v2_free_eq(hr_dev, &eq_table->eq[j]); + +err_failed_kzalloc: + for (i -= 1; i >= 0; i--) + kfree(hr_dev->irq_names[i]); + kfree(eq_table->eq); + + return ret; +} + +static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + int irq_num; + int eq_num; + int i; + + eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; + irq_num = eq_num + hr_dev->caps.num_other_vectors; + + /* Disable irq */ + hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_DISABLE); + + for (i = 0; i < hr_dev->caps.num_other_vectors; i++) + free_irq(hr_dev->irq[i], hr_dev); + + for (i = 0; i < eq_num; i++) { + hns_roce_v2_destroy_eqc(hr_dev, i); + + free_irq(eq_table->eq[i].irq, &eq_table->eq[i]); + + hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]); + } + + for (i = 0; i < irq_num; i++) + kfree(hr_dev->irq_names[i]); + + 
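+	/* Finally release the EQ descriptor array itself. */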
kfree(eq_table->eq); +} + +static const struct hns_roce_hw hns_roce_hw_v2 = { + .cmq_init = hns_roce_v2_cmq_init, + .cmq_exit = hns_roce_v2_cmq_exit, + .hw_profile = hns_roce_v2_profile, + .post_mbox = hns_roce_v2_post_mbox, + .chk_mbox = hns_roce_v2_chk_mbox, + .set_gid = hns_roce_v2_set_gid, + .set_mac = hns_roce_v2_set_mac, + .write_mtpt = hns_roce_v2_write_mtpt, + .rereg_write_mtpt = hns_roce_v2_rereg_write_mtpt, + .write_cqc = hns_roce_v2_write_cqc, + .set_hem = hns_roce_v2_set_hem, + .clear_hem = hns_roce_v2_clear_hem, + .modify_qp = hns_roce_v2_modify_qp, + .query_qp = hns_roce_v2_query_qp, + .destroy_qp = hns_roce_v2_destroy_qp, + .modify_cq = hns_roce_v2_modify_cq, + .post_send = hns_roce_v2_post_send, + .post_recv = hns_roce_v2_post_recv, + .req_notify_cq = hns_roce_v2_req_notify_cq, + .poll_cq = hns_roce_v2_poll_cq, + .init_eq = hns_roce_v2_init_eq_table, + .cleanup_eq = hns_roce_v2_cleanup_eq_table, +}; + +static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = { + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), 0}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, + /* required last entry */ + {0, } +}; + +MODULE_DEVICE_TABLE(pci, hns_roce_hw_v2_pci_tbl); + +static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev, + struct hnae3_handle *handle) +{ + const struct pci_device_id *id; + int i; + + id = pci_match_id(hns_roce_hw_v2_pci_tbl, hr_dev->pci_dev); + if (!id) { + dev_err(hr_dev->dev, "device is not compatible!\n"); + return -ENXIO; + } + + hr_dev->hw = &hns_roce_hw_v2; + hr_dev->sdb_offset = ROCEE_DB_SQ_L_0_REG; + hr_dev->odb_offset = hr_dev->sdb_offset; + + /* Get info from NIC driver. */ + hr_dev->reg_base = handle->rinfo.roce_io_base; + hr_dev->caps.num_ports = 1; + hr_dev->iboe.netdevs[0] = handle->rinfo.netdev; + hr_dev->iboe.phy_port[0] = 0; + + addrconf_addr_eui48((u8 *)&hr_dev->ib_dev.node_guid, + hr_dev->iboe.netdevs[0]->dev_addr); + + for (i = 0; i < HNS_ROCE_V2_MAX_IRQ_NUM; i++) + hr_dev->irq[i] = pci_irq_vector(handle->pdev, + i + handle->rinfo.base_vector); + + /* cmd issue mode: 0 is poll, 1 is event */ + hr_dev->cmd_mod = 1; + hr_dev->loop_idc = 0; + + return 0; +} + +static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) +{ + struct hns_roce_dev *hr_dev; + int ret; + + hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev)); + if (!hr_dev) + return -ENOMEM; + + hr_dev->priv = kzalloc(sizeof(struct hns_roce_v2_priv), GFP_KERNEL); + if (!hr_dev->priv) { + ret = -ENOMEM; + goto error_failed_kzalloc; + } + + hr_dev->pci_dev = handle->pdev; + hr_dev->dev = &handle->pdev->dev; + handle->priv = hr_dev; + + ret = hns_roce_hw_v2_get_cfg(hr_dev, handle); + if (ret) { + dev_err(hr_dev->dev, "Get Configuration failed!\n"); + goto error_failed_get_cfg; + } + + ret = hns_roce_init(hr_dev); + if (ret) { + dev_err(hr_dev->dev, "RoCE Engine init failed!\n"); + goto error_failed_get_cfg; + } + + return 0; + +error_failed_get_cfg: + kfree(hr_dev->priv); + +error_failed_kzalloc: + ib_dealloc_device(&hr_dev->ib_dev); + + return ret; +} + +static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, + bool reset) +{ + struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + + hns_roce_exit(hr_dev); + kfree(hr_dev->priv); + ib_dealloc_device(&hr_dev->ib_dev); +} + +static const struct hnae3_client_ops hns_roce_hw_v2_ops = { + .init_instance = hns_roce_hw_v2_init_instance, + .uninit_instance = hns_roce_hw_v2_uninit_instance, +}; + +static struct hnae3_client 
hns_roce_hw_v2_client = { + .name = "hns_roce_hw_v2", + .type = HNAE3_CLIENT_ROCE, + .ops = &hns_roce_hw_v2_ops, +}; + +static int __init hns_roce_hw_v2_init(void) +{ + return hnae3_register_client(&hns_roce_hw_v2_client); +} + +static void __exit hns_roce_hw_v2_exit(void) +{ + hnae3_unregister_client(&hns_roce_hw_v2_client); +} + +module_init(hns_roce_hw_v2_init); +module_exit(hns_roce_hw_v2_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Wei Hu "); +MODULE_AUTHOR("Lijun Ou "); +MODULE_AUTHOR("Shaobo Xu "); +MODULE_DESCRIPTION("Hisilicon Hip08 Family RoCE Driver"); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h new file mode 100644 index 0000000..182b672 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -0,0 +1,1456 @@ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _HNS_ROCE_HW_V2_H +#define _HNS_ROCE_HW_V2_H + +#include + +#define HNS_ROCE_VF_QPC_BT_NUM 256 +#define HNS_ROCE_VF_SRQC_BT_NUM 64 +#define HNS_ROCE_VF_CQC_BT_NUM 64 +#define HNS_ROCE_VF_MPT_BT_NUM 64 +#define HNS_ROCE_VF_EQC_NUM 64 +#define HNS_ROCE_VF_SMAC_NUM 32 +#define HNS_ROCE_VF_SGID_NUM 32 +#define HNS_ROCE_VF_SL_NUM 8 + +#define HNS_ROCE_V2_MAX_QP_NUM 0x2000 +#define HNS_ROCE_V2_MAX_WQE_NUM 0x8000 +#define HNS_ROCE_V2_MAX_CQ_NUM 0x8000 +#define HNS_ROCE_V2_MAX_CQE_NUM 0x10000 +#define HNS_ROCE_V2_MAX_RQ_SGE_NUM 0x100 +#define HNS_ROCE_V2_MAX_SQ_SGE_NUM 0xff +#define HNS_ROCE_V2_MAX_SQ_INLINE 0x20 +#define HNS_ROCE_V2_UAR_NUM 256 +#define HNS_ROCE_V2_PHY_UAR_NUM 1 +#define HNS_ROCE_V2_MAX_IRQ_NUM 65 +#define HNS_ROCE_V2_COMP_VEC_NUM 63 +#define HNS_ROCE_V2_AEQE_VEC_NUM 1 +#define HNS_ROCE_V2_ABNORMAL_VEC_NUM 1 +#define HNS_ROCE_V2_MAX_MTPT_NUM 0x8000 +#define HNS_ROCE_V2_MAX_MTT_SEGS 0x1000000 +#define HNS_ROCE_V2_MAX_CQE_SEGS 0x1000000 +#define HNS_ROCE_V2_MAX_PD_NUM 0x1000000 +#define HNS_ROCE_V2_MAX_QP_INIT_RDMA 128 +#define HNS_ROCE_V2_MAX_QP_DEST_RDMA 128 +#define HNS_ROCE_V2_MAX_SQ_DESC_SZ 64 +#define HNS_ROCE_V2_MAX_RQ_DESC_SZ 16 +#define HNS_ROCE_V2_MAX_SRQ_DESC_SZ 64 +#define HNS_ROCE_V2_QPC_ENTRY_SZ 256 +#define HNS_ROCE_V2_IRRL_ENTRY_SZ 64 +#define HNS_ROCE_V2_TRRL_ENTRY_SZ 48 +#define HNS_ROCE_V2_CQC_ENTRY_SZ 64 +#define HNS_ROCE_V2_MTPT_ENTRY_SZ 64 +#define HNS_ROCE_V2_MTT_ENTRY_SZ 64 +#define HNS_ROCE_V2_CQE_ENTRY_SIZE 32 +#define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED 0xFFFFF000 +#define HNS_ROCE_V2_MAX_INNER_MTPT_NUM 2 +#define HNS_ROCE_INVALID_LKEY 0x100 +#define HNS_ROCE_CMQ_TX_TIMEOUT 200 + +#define HNS_ROCE_CONTEXT_HOP_NUM 1 +#define HNS_ROCE_MTT_HOP_NUM 1 +#define HNS_ROCE_CQE_HOP_NUM 1 +#define HNS_ROCE_PBL_HOP_NUM 2 +#define HNS_ROCE_EQE_HOP_NUM 2 + +#define HNS_ROCE_V2_GID_INDEX_NUM 256 + +#define HNS_ROCE_V2_TABLE_CHUNK_SIZE (1 << 18) + +#define HNS_ROCE_CMD_FLAG_IN_VALID_SHIFT 0 +#define HNS_ROCE_CMD_FLAG_OUT_VALID_SHIFT 1 +#define HNS_ROCE_CMD_FLAG_NEXT_SHIFT 2 +#define HNS_ROCE_CMD_FLAG_WR_OR_RD_SHIFT 3 +#define HNS_ROCE_CMD_FLAG_NO_INTR_SHIFT 4 +#define HNS_ROCE_CMD_FLAG_ERR_INTR_SHIFT 5 + +#define HNS_ROCE_CMD_FLAG_IN BIT(HNS_ROCE_CMD_FLAG_IN_VALID_SHIFT) +#define HNS_ROCE_CMD_FLAG_OUT BIT(HNS_ROCE_CMD_FLAG_OUT_VALID_SHIFT) +#define HNS_ROCE_CMD_FLAG_NEXT BIT(HNS_ROCE_CMD_FLAG_NEXT_SHIFT) +#define HNS_ROCE_CMD_FLAG_WR BIT(HNS_ROCE_CMD_FLAG_WR_OR_RD_SHIFT) +#define HNS_ROCE_CMD_FLAG_NO_INTR BIT(HNS_ROCE_CMD_FLAG_NO_INTR_SHIFT) +#define HNS_ROCE_CMD_FLAG_ERR_INTR BIT(HNS_ROCE_CMD_FLAG_ERR_INTR_SHIFT) + +#define HNS_ROCE_CMQ_DESC_NUM_S 3 +#define HNS_ROCE_CMQ_EN_B 16 +#define HNS_ROCE_CMQ_ENABLE BIT(HNS_ROCE_CMQ_EN_B) + +#define check_whether_last_step(hop_num, step_idx) \ + ((step_idx == 0 && hop_num == HNS_ROCE_HOP_NUM_0) || \ + (step_idx == 1 && hop_num == 1) || \ + (step_idx == 2 && hop_num == 2)) + +enum { + NO_ARMED = 0x0, + REG_NXT_CEQE = 0x2, + REG_NXT_SE_CEQE = 0x3 +}; + +#define V2_CQ_DB_REQ_NOT_SOL 0 +#define V2_CQ_DB_REQ_NOT 1 + +#define V2_CQ_STATE_VALID 1 +#define V2_QKEY_VAL 0x80010000 + +#define GID_LEN_V2 16 + +#define HNS_ROCE_V2_CQE_QPN_MASK 0x3ffff + +enum { + HNS_ROCE_V2_WQE_OP_SEND = 0x0, + HNS_ROCE_V2_WQE_OP_SEND_WITH_INV = 0x1, + HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM = 0x2, + HNS_ROCE_V2_WQE_OP_RDMA_WRITE = 0x3, + HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM = 0x4, + HNS_ROCE_V2_WQE_OP_RDMA_READ = 0x5, + HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP = 0x6, + HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD = 0x7, + 
HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP = 0x8, + HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD = 0x9, + HNS_ROCE_V2_WQE_OP_FAST_REG_PMR = 0xa, + HNS_ROCE_V2_WQE_OP_LOCAL_INV = 0xb, + HNS_ROCE_V2_WQE_OP_BIND_MW_TYPE = 0xc, + HNS_ROCE_V2_WQE_OP_MASK = 0x1f, +}; + +enum { + HNS_ROCE_SQ_OPCODE_SEND = 0x0, + HNS_ROCE_SQ_OPCODE_SEND_WITH_INV = 0x1, + HNS_ROCE_SQ_OPCODE_SEND_WITH_IMM = 0x2, + HNS_ROCE_SQ_OPCODE_RDMA_WRITE = 0x3, + HNS_ROCE_SQ_OPCODE_RDMA_WRITE_WITH_IMM = 0x4, + HNS_ROCE_SQ_OPCODE_RDMA_READ = 0x5, + HNS_ROCE_SQ_OPCODE_ATOMIC_COMP_AND_SWAP = 0x6, + HNS_ROCE_SQ_OPCODE_ATOMIC_FETCH_AND_ADD = 0x7, + HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_COMP_AND_SWAP = 0x8, + HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_FETCH_AND_ADD = 0x9, + HNS_ROCE_SQ_OPCODE_FAST_REG_WR = 0xa, + HNS_ROCE_SQ_OPCODE_LOCAL_INV = 0xb, + HNS_ROCE_SQ_OPCODE_BIND_MW = 0xc, +}; + +enum { + /* rq operations */ + HNS_ROCE_V2_OPCODE_RDMA_WRITE_IMM = 0x0, + HNS_ROCE_V2_OPCODE_SEND = 0x1, + HNS_ROCE_V2_OPCODE_SEND_WITH_IMM = 0x2, + HNS_ROCE_V2_OPCODE_SEND_WITH_INV = 0x3, +}; + +enum { + HNS_ROCE_V2_SQ_DB = 0x0, + HNS_ROCE_V2_RQ_DB = 0x1, + HNS_ROCE_V2_SRQ_DB = 0x2, + HNS_ROCE_V2_CQ_DB_PTR = 0x3, + HNS_ROCE_V2_CQ_DB_NTR = 0x4, +}; + +enum { + HNS_ROCE_CQE_V2_SUCCESS = 0x00, + HNS_ROCE_CQE_V2_LOCAL_LENGTH_ERR = 0x01, + HNS_ROCE_CQE_V2_LOCAL_QP_OP_ERR = 0x02, + HNS_ROCE_CQE_V2_LOCAL_PROT_ERR = 0x04, + HNS_ROCE_CQE_V2_WR_FLUSH_ERR = 0x05, + HNS_ROCE_CQE_V2_MW_BIND_ERR = 0x06, + HNS_ROCE_CQE_V2_BAD_RESP_ERR = 0x10, + HNS_ROCE_CQE_V2_LOCAL_ACCESS_ERR = 0x11, + HNS_ROCE_CQE_V2_REMOTE_INVAL_REQ_ERR = 0x12, + HNS_ROCE_CQE_V2_REMOTE_ACCESS_ERR = 0x13, + HNS_ROCE_CQE_V2_REMOTE_OP_ERR = 0x14, + HNS_ROCE_CQE_V2_TRANSPORT_RETRY_EXC_ERR = 0x15, + HNS_ROCE_CQE_V2_RNR_RETRY_EXC_ERR = 0x16, + HNS_ROCE_CQE_V2_REMOTE_ABORT_ERR = 0x22, + + HNS_ROCE_V2_CQE_STATUS_MASK = 0xff, +}; + +/* CMQ command */ +enum hns_roce_opcode_type { + HNS_ROCE_OPC_QUERY_HW_VER = 0x8000, + HNS_ROCE_OPC_CFG_GLOBAL_PARAM = 0x8001, + HNS_ROCE_OPC_ALLOC_PF_RES = 0x8004, + HNS_ROCE_OPC_QUERY_PF_RES = 0x8400, + HNS_ROCE_OPC_ALLOC_VF_RES = 0x8401, + HNS_ROCE_OPC_CFG_BT_ATTR = 0x8506, +}; + +enum { + TYPE_CRQ, + TYPE_CSQ, +}; + +enum hns_roce_cmd_return_status { + CMD_EXEC_SUCCESS = 0, + CMD_NO_AUTH = 1, + CMD_NOT_EXEC = 2, + CMD_QUEUE_FULL = 3, +}; + +enum hns_roce_sgid_type { + GID_TYPE_FLAG_ROCE_V1 = 0, + GID_TYPE_FLAG_ROCE_V2_IPV4, + GID_TYPE_FLAG_ROCE_V2_IPV6, +}; + +struct hns_roce_v2_cq_context { + __le32 byte_4_pg_ceqn; + __le32 byte_8_cqn; + __le32 cqe_cur_blk_addr; + __le32 byte_16_hop_addr; + __le32 cqe_nxt_blk_addr; + __le32 byte_24_pgsz_addr; + __le32 byte_28_cq_pi; + __le32 byte_32_cq_ci; + __le32 cqe_ba; + __le32 byte_40_cqe_ba; + __le32 byte_44_db_record; + __le32 db_record_addr; + __le32 byte_52_cqe_cnt; + __le32 byte_56_cqe_period_maxcnt; + __le32 cqe_report_timer; + __le32 byte_64_se_cqe_idx; +}; +#define HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM 0x0 +#define HNS_ROCE_V2_CQ_DEFAULT_INTERVAL 0x0 + +#define V2_CQC_BYTE_4_CQ_ST_S 0 +#define V2_CQC_BYTE_4_CQ_ST_M GENMASK(1, 0) + +#define V2_CQC_BYTE_4_POLL_S 2 + +#define V2_CQC_BYTE_4_SE_S 3 + +#define V2_CQC_BYTE_4_OVER_IGNORE_S 4 + +#define V2_CQC_BYTE_4_COALESCE_S 5 + +#define V2_CQC_BYTE_4_ARM_ST_S 6 +#define V2_CQC_BYTE_4_ARM_ST_M GENMASK(7, 6) + +#define V2_CQC_BYTE_4_SHIFT_S 8 +#define V2_CQC_BYTE_4_SHIFT_M GENMASK(12, 8) + +#define V2_CQC_BYTE_4_CMD_SN_S 13 +#define V2_CQC_BYTE_4_CMD_SN_M GENMASK(14, 13) + +#define V2_CQC_BYTE_4_CEQN_S 15 +#define V2_CQC_BYTE_4_CEQN_M GENMASK(23, 15) + +#define V2_CQC_BYTE_4_PAGE_OFFSET_S 24 +#define 
V2_CQC_BYTE_4_PAGE_OFFSET_M GENMASK(31, 24) + +#define V2_CQC_BYTE_8_CQN_S 0 +#define V2_CQC_BYTE_8_CQN_M GENMASK(23, 0) + +#define V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_S 0 +#define V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_M GENMASK(19, 0) + +#define V2_CQC_BYTE_16_CQE_HOP_NUM_S 30 +#define V2_CQC_BYTE_16_CQE_HOP_NUM_M GENMASK(31, 30) + +#define V2_CQC_BYTE_24_CQE_NXT_BLK_ADDR_S 0 +#define V2_CQC_BYTE_24_CQE_NXT_BLK_ADDR_M GENMASK(19, 0) + +#define V2_CQC_BYTE_24_CQE_BA_PG_SZ_S 24 +#define V2_CQC_BYTE_24_CQE_BA_PG_SZ_M GENMASK(27, 24) + +#define V2_CQC_BYTE_24_CQE_BUF_PG_SZ_S 28 +#define V2_CQC_BYTE_24_CQE_BUF_PG_SZ_M GENMASK(31, 28) + +#define V2_CQC_BYTE_28_CQ_PRODUCER_IDX_S 0 +#define V2_CQC_BYTE_28_CQ_PRODUCER_IDX_M GENMASK(23, 0) + +#define V2_CQC_BYTE_32_CQ_CONSUMER_IDX_S 0 +#define V2_CQC_BYTE_32_CQ_CONSUMER_IDX_M GENMASK(23, 0) + +#define V2_CQC_BYTE_40_CQE_BA_S 0 +#define V2_CQC_BYTE_40_CQE_BA_M GENMASK(28, 0) + +#define V2_CQC_BYTE_44_DB_RECORD_EN_S 0 + +#define V2_CQC_BYTE_44_DB_RECORD_ADDR_S 1 +#define V2_CQC_BYTE_44_DB_RECORD_ADDR_M GENMASK(31, 1) + +#define V2_CQC_BYTE_52_CQE_CNT_S 0 +#define V2_CQC_BYTE_52_CQE_CNT_M GENMASK(23, 0) + +#define V2_CQC_BYTE_56_CQ_MAX_CNT_S 0 +#define V2_CQC_BYTE_56_CQ_MAX_CNT_M GENMASK(15, 0) + +#define V2_CQC_BYTE_56_CQ_PERIOD_S 16 +#define V2_CQC_BYTE_56_CQ_PERIOD_M GENMASK(31, 16) + +#define V2_CQC_BYTE_64_SE_CQE_IDX_S 0 +#define V2_CQC_BYTE_64_SE_CQE_IDX_M GENMASK(23, 0) + +enum{ + V2_MPT_ST_VALID = 0x1, +}; + +enum hns_roce_v2_qp_state { + HNS_ROCE_QP_ST_RST, + HNS_ROCE_QP_ST_INIT, + HNS_ROCE_QP_ST_RTR, + HNS_ROCE_QP_ST_RTS, + HNS_ROCE_QP_ST_SQER, + HNS_ROCE_QP_ST_SQD, + HNS_ROCE_QP_ST_ERR, + HNS_ROCE_QP_ST_SQ_DRAINING, + HNS_ROCE_QP_NUM_ST +}; + +struct hns_roce_v2_qp_context { + __le32 byte_4_sqpn_tst; + __le32 wqe_sge_ba; + __le32 byte_12_sq_hop; + __le32 byte_16_buf_ba_pg_sz; + __le32 byte_20_smac_sgid_idx; + __le32 byte_24_mtu_tc; + __le32 byte_28_at_fl; + u8 dgid[GID_LEN_V2]; + __le32 dmac; + __le32 byte_52_udpspn_dmac; + __le32 byte_56_dqpn_err; + __le32 byte_60_qpst_mapid; + __le32 qkey_xrcd; + __le32 byte_68_rq_db; + __le32 rq_db_record_addr; + __le32 byte_76_srqn_op_en; + __le32 byte_80_rnr_rx_cqn; + __le32 byte_84_rq_ci_pi; + __le32 rq_cur_blk_addr; + __le32 byte_92_srq_info; + __le32 byte_96_rx_reqmsn; + __le32 rq_nxt_blk_addr; + __le32 byte_104_rq_sge; + __le32 byte_108_rx_reqepsn; + __le32 rq_rnr_timer; + __le32 rx_msg_len; + __le32 rx_rkey_pkt_info; + __le64 rx_va; + __le32 byte_132_trrl; + __le32 trrl_ba; + __le32 byte_140_raq; + __le32 byte_144_raq; + __le32 byte_148_raq; + __le32 byte_152_raq; + __le32 byte_156_raq; + __le32 byte_160_sq_ci_pi; + __le32 sq_cur_blk_addr; + __le32 byte_168_irrl_idx; + __le32 byte_172_sq_psn; + __le32 byte_176_msg_pktn; + __le32 sq_cur_sge_blk_addr; + __le32 byte_184_irrl_idx; + __le32 cur_sge_offset; + __le32 byte_192_ext_sge; + __le32 byte_196_sq_psn; + __le32 byte_200_sq_max; + __le32 irrl_ba; + __le32 byte_208_irrl; + __le32 byte_212_lsn; + __le32 sq_timer; + __le32 byte_220_retry_psn_msn; + __le32 byte_224_retry_msg; + __le32 rx_sq_cur_blk_addr; + __le32 byte_232_irrl_sge; + __le32 irrl_cur_sge_offset; + __le32 byte_240_irrl_tail; + __le32 byte_244_rnr_rxack; + __le32 byte_248_ack_psn; + __le32 byte_252_err_txcqn; + __le32 byte_256_sqflush_rqcqe; +}; + +#define V2_QPC_BYTE_4_TST_S 0 +#define V2_QPC_BYTE_4_TST_M GENMASK(2, 0) + +#define V2_QPC_BYTE_4_SGE_SHIFT_S 3 +#define V2_QPC_BYTE_4_SGE_SHIFT_M GENMASK(7, 3) + +#define V2_QPC_BYTE_4_SQPN_S 8 +#define V2_QPC_BYTE_4_SQPN_M GENMASK(31, 8) + +#define 
V2_QPC_BYTE_12_WQE_SGE_BA_S 0 +#define V2_QPC_BYTE_12_WQE_SGE_BA_M GENMASK(28, 0) + +#define V2_QPC_BYTE_12_SQ_HOP_NUM_S 29 +#define V2_QPC_BYTE_12_SQ_HOP_NUM_M GENMASK(30, 29) + +#define V2_QPC_BYTE_12_RSVD_LKEY_EN_S 31 + +#define V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S 0 +#define V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M GENMASK(3, 0) + +#define V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S 4 +#define V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M GENMASK(7, 4) + +#define V2_QPC_BYTE_16_PD_S 8 +#define V2_QPC_BYTE_16_PD_M GENMASK(31, 8) + +#define V2_QPC_BYTE_20_RQ_HOP_NUM_S 0 +#define V2_QPC_BYTE_20_RQ_HOP_NUM_M GENMASK(1, 0) + +#define V2_QPC_BYTE_20_SGE_HOP_NUM_S 2 +#define V2_QPC_BYTE_20_SGE_HOP_NUM_M GENMASK(3, 2) + +#define V2_QPC_BYTE_20_RQWS_S 4 +#define V2_QPC_BYTE_20_RQWS_M GENMASK(7, 4) + +#define V2_QPC_BYTE_20_SQ_SHIFT_S 8 +#define V2_QPC_BYTE_20_SQ_SHIFT_M GENMASK(11, 8) + +#define V2_QPC_BYTE_20_RQ_SHIFT_S 12 +#define V2_QPC_BYTE_20_RQ_SHIFT_M GENMASK(15, 12) + +#define V2_QPC_BYTE_20_SGID_IDX_S 16 +#define V2_QPC_BYTE_20_SGID_IDX_M GENMASK(23, 16) + +#define V2_QPC_BYTE_20_SMAC_IDX_S 24 +#define V2_QPC_BYTE_20_SMAC_IDX_M GENMASK(31, 24) + +#define V2_QPC_BYTE_24_HOP_LIMIT_S 0 +#define V2_QPC_BYTE_24_HOP_LIMIT_M GENMASK(7, 0) + +#define V2_QPC_BYTE_24_TC_S 8 +#define V2_QPC_BYTE_24_TC_M GENMASK(15, 8) + +#define V2_QPC_BYTE_24_VLAN_IDX_S 16 +#define V2_QPC_BYTE_24_VLAN_IDX_M GENMASK(27, 16) + +#define V2_QPC_BYTE_24_MTU_S 28 +#define V2_QPC_BYTE_24_MTU_M GENMASK(31, 28) + +#define V2_QPC_BYTE_28_FL_S 0 +#define V2_QPC_BYTE_28_FL_M GENMASK(19, 0) + +#define V2_QPC_BYTE_28_SL_S 20 +#define V2_QPC_BYTE_28_SL_M GENMASK(23, 20) + +#define V2_QPC_BYTE_28_CNP_TX_FLAG_S 24 + +#define V2_QPC_BYTE_28_CE_FLAG_S 25 + +#define V2_QPC_BYTE_28_LBI_S 26 + +#define V2_QPC_BYTE_28_AT_S 27 +#define V2_QPC_BYTE_28_AT_M GENMASK(31, 27) + +#define V2_QPC_BYTE_52_DMAC_S 0 +#define V2_QPC_BYTE_52_DMAC_M GENMASK(15, 0) + +#define V2_QPC_BYTE_52_UDPSPN_S 16 +#define V2_QPC_BYTE_52_UDPSPN_M GENMASK(31, 16) + +#define V2_QPC_BYTE_56_DQPN_S 0 +#define V2_QPC_BYTE_56_DQPN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_56_SQ_TX_ERR_S 24 +#define V2_QPC_BYTE_56_SQ_RX_ERR_S 25 +#define V2_QPC_BYTE_56_RQ_TX_ERR_S 26 +#define V2_QPC_BYTE_56_RQ_RX_ERR_S 27 + +#define V2_QPC_BYTE_56_LP_PKTN_INI_S 28 +#define V2_QPC_BYTE_56_LP_PKTN_INI_M GENMASK(31, 28) + +#define V2_QPC_BYTE_60_MAPID_S 0 +#define V2_QPC_BYTE_60_MAPID_M GENMASK(12, 0) + +#define V2_QPC_BYTE_60_INNER_MAP_IND_S 13 + +#define V2_QPC_BYTE_60_SQ_MAP_IND_S 14 + +#define V2_QPC_BYTE_60_RQ_MAP_IND_S 15 + +#define V2_QPC_BYTE_60_TEMPID_S 16 +#define V2_QPC_BYTE_60_TEMPID_M GENMASK(22, 16) + +#define V2_QPC_BYTE_60_EXT_MAP_IND_S 23 + +#define V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S 24 +#define V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M GENMASK(26, 24) + +#define V2_QPC_BYTE_60_SQ_RLS_IND_S 27 + +#define V2_QPC_BYTE_60_SQ_EXT_IND_S 28 + +#define V2_QPC_BYTE_60_QP_ST_S 29 +#define V2_QPC_BYTE_60_QP_ST_M GENMASK(31, 29) + +#define V2_QPC_BYTE_68_RQ_RECORD_EN_S 0 + +#define V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_S 1 +#define V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_M GENMASK(31, 1) + +#define V2_QPC_BYTE_76_SRQN_S 0 +#define V2_QPC_BYTE_76_SRQN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_76_SRQ_EN_S 24 + +#define V2_QPC_BYTE_76_RRE_S 25 + +#define V2_QPC_BYTE_76_RWE_S 26 + +#define V2_QPC_BYTE_76_ATE_S 27 + +#define V2_QPC_BYTE_76_RQIE_S 28 + +#define V2_QPC_BYTE_80_RX_CQN_S 0 +#define V2_QPC_BYTE_80_RX_CQN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_80_MIN_RNR_TIME_S 27 +#define V2_QPC_BYTE_80_MIN_RNR_TIME_M GENMASK(31, 27) + +#define 
V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S 0 +#define V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M GENMASK(15, 0) + +#define V2_QPC_BYTE_84_RQ_CONSUMER_IDX_S 16 +#define V2_QPC_BYTE_84_RQ_CONSUMER_IDX_M GENMASK(31, 16) + +#define V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S 0 +#define V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M GENMASK(19, 0) + +#define V2_QPC_BYTE_92_SRQ_INFO_S 20 +#define V2_QPC_BYTE_92_SRQ_INFO_M GENMASK(31, 20) + +#define V2_QPC_BYTE_96_RX_REQ_MSN_S 0 +#define V2_QPC_BYTE_96_RX_REQ_MSN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S 0 +#define V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M GENMASK(19, 0) + +#define V2_QPC_BYTE_104_RQ_CUR_WQE_SGE_NUM_S 24 +#define V2_QPC_BYTE_104_RQ_CUR_WQE_SGE_NUM_M GENMASK(31, 24) + +#define V2_QPC_BYTE_108_INV_CREDIT_S 0 + +#define V2_QPC_BYTE_108_RX_REQ_PSN_ERR_S 3 + +#define V2_QPC_BYTE_108_RX_REQ_LAST_OPTYPE_S 4 +#define V2_QPC_BYTE_108_RX_REQ_LAST_OPTYPE_M GENMASK(6, 4) + +#define V2_QPC_BYTE_108_RX_REQ_RNR_S 7 + +#define V2_QPC_BYTE_108_RX_REQ_EPSN_S 8 +#define V2_QPC_BYTE_108_RX_REQ_EPSN_M GENMASK(31, 8) + +#define V2_QPC_BYTE_132_TRRL_HEAD_MAX_S 0 +#define V2_QPC_BYTE_132_TRRL_HEAD_MAX_M GENMASK(7, 0) + +#define V2_QPC_BYTE_132_TRRL_TAIL_MAX_S 8 +#define V2_QPC_BYTE_132_TRRL_TAIL_MAX_M GENMASK(15, 8) + +#define V2_QPC_BYTE_132_TRRL_BA_S 16 +#define V2_QPC_BYTE_132_TRRL_BA_M GENMASK(31, 16) + +#define V2_QPC_BYTE_140_TRRL_BA_S 0 +#define V2_QPC_BYTE_140_TRRL_BA_M GENMASK(11, 0) + +#define V2_QPC_BYTE_140_RR_MAX_S 12 +#define V2_QPC_BYTE_140_RR_MAX_M GENMASK(14, 12) + +#define V2_QPC_BYTE_140_RSVD_RAQ_MAP_S 15 + +#define V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S 16 +#define V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M GENMASK(23, 16) + +#define V2_QPC_BYTE_140_RAQ_TRRL_TAIL_S 24 +#define V2_QPC_BYTE_140_RAQ_TRRL_TAIL_M GENMASK(31, 24) + +#define V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S 0 +#define V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_144_RAQ_RTY_INI_IND_S 24 + +#define V2_QPC_BYTE_144_RAQ_CREDIT_S 25 +#define V2_QPC_BYTE_144_RAQ_CREDIT_M GENMASK(29, 25) + +#define V2_QPC_BYTE_144_RESP_RTY_FLG_S 31 + +#define V2_QPC_BYTE_148_RQ_MSN_S 0 +#define V2_QPC_BYTE_148_RQ_MSN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_148_RAQ_SYNDROME_S 24 +#define V2_QPC_BYTE_148_RAQ_SYNDROME_M GENMASK(31, 24) + +#define V2_QPC_BYTE_152_RAQ_PSN_S 8 +#define V2_QPC_BYTE_152_RAQ_PSN_M GENMASK(31, 8) + +#define V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_S 24 +#define V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_M GENMASK(31, 24) + +#define V2_QPC_BYTE_156_RAQ_USE_PKTN_S 0 +#define V2_QPC_BYTE_156_RAQ_USE_PKTN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S 0 +#define V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M GENMASK(15, 0) + +#define V2_QPC_BYTE_160_SQ_CONSUMER_IDX_S 16 +#define V2_QPC_BYTE_160_SQ_CONSUMER_IDX_M GENMASK(31, 16) + +#define V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S 0 +#define V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M GENMASK(19, 0) + +#define V2_QPC_BYTE_168_MSG_RTY_LP_FLG_S 20 + +#define V2_QPC_BYTE_168_SQ_INVLD_FLG_S 21 + +#define V2_QPC_BYTE_168_LP_SGEN_INI_S 22 +#define V2_QPC_BYTE_168_LP_SGEN_INI_M GENMASK(23, 22) + +#define V2_QPC_BYTE_168_SQ_SHIFT_BAK_S 24 +#define V2_QPC_BYTE_168_SQ_SHIFT_BAK_M GENMASK(27, 24) + +#define V2_QPC_BYTE_168_IRRL_IDX_LSB_S 28 +#define V2_QPC_BYTE_168_IRRL_IDX_LSB_M GENMASK(31, 28) + +#define V2_QPC_BYTE_172_ACK_REQ_FREQ_S 0 +#define V2_QPC_BYTE_172_ACK_REQ_FREQ_M GENMASK(5, 0) + +#define V2_QPC_BYTE_172_MSG_RNR_FLG_S 6 + +#define V2_QPC_BYTE_172_FRE_S 7 + +#define V2_QPC_BYTE_172_SQ_CUR_PSN_S 8 +#define V2_QPC_BYTE_172_SQ_CUR_PSN_M GENMASK(31, 8) + +#define 
V2_QPC_BYTE_176_MSG_USE_PKTN_S 0 +#define V2_QPC_BYTE_176_MSG_USE_PKTN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_176_IRRL_HEAD_PRE_S 24 +#define V2_QPC_BYTE_176_IRRL_HEAD_PRE_M GENMASK(31, 24) + +#define V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S 0 +#define V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M GENMASK(19, 0) + +#define V2_QPC_BYTE_184_IRRL_IDX_MSB_S 20 +#define V2_QPC_BYTE_184_IRRL_IDX_MSB_M GENMASK(31, 20) + +#define V2_QPC_BYTE_192_CUR_SGE_IDX_S 0 +#define V2_QPC_BYTE_192_CUR_SGE_IDX_M GENMASK(23, 0) + +#define V2_QPC_BYTE_192_EXT_SGE_NUM_LEFT_S 24 +#define V2_QPC_BYTE_192_EXT_SGE_NUM_LEFT_M GENMASK(31, 24) + +#define V2_QPC_BYTE_196_IRRL_HEAD_S 0 +#define V2_QPC_BYTE_196_IRRL_HEAD_M GENMASK(7, 0) + +#define V2_QPC_BYTE_196_SQ_MAX_PSN_S 8 +#define V2_QPC_BYTE_196_SQ_MAX_PSN_M GENMASK(31, 8) + +#define V2_QPC_BYTE_200_SQ_MAX_IDX_S 0 +#define V2_QPC_BYTE_200_SQ_MAX_IDX_M GENMASK(15, 0) + +#define V2_QPC_BYTE_200_LCL_OPERATED_CNT_S 16 +#define V2_QPC_BYTE_200_LCL_OPERATED_CNT_M GENMASK(31, 16) + +#define V2_QPC_BYTE_208_IRRL_BA_S 0 +#define V2_QPC_BYTE_208_IRRL_BA_M GENMASK(25, 0) + +#define V2_QPC_BYTE_208_PKT_RNR_FLG_S 26 + +#define V2_QPC_BYTE_208_PKT_RTY_FLG_S 27 + +#define V2_QPC_BYTE_208_RMT_E2E_S 28 + +#define V2_QPC_BYTE_208_SR_MAX_S 29 +#define V2_QPC_BYTE_208_SR_MAX_M GENMASK(31, 29) + +#define V2_QPC_BYTE_212_LSN_S 0 +#define V2_QPC_BYTE_212_LSN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_212_RETRY_NUM_INIT_S 24 +#define V2_QPC_BYTE_212_RETRY_NUM_INIT_M GENMASK(26, 24) + +#define V2_QPC_BYTE_212_CHECK_FLG_S 27 +#define V2_QPC_BYTE_212_CHECK_FLG_M GENMASK(28, 27) + +#define V2_QPC_BYTE_212_RETRY_CNT_S 29 +#define V2_QPC_BYTE_212_RETRY_CNT_M GENMASK(31, 29) + +#define V2_QPC_BYTE_220_RETRY_MSG_MSN_S 0 +#define V2_QPC_BYTE_220_RETRY_MSG_MSN_M GENMASK(15, 0) + +#define V2_QPC_BYTE_220_RETRY_MSG_PSN_S 16 +#define V2_QPC_BYTE_220_RETRY_MSG_PSN_M GENMASK(31, 16) + +#define V2_QPC_BYTE_224_RETRY_MSG_PSN_S 0 +#define V2_QPC_BYTE_224_RETRY_MSG_PSN_M GENMASK(7, 0) + +#define V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S 8 +#define V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M GENMASK(31, 8) + +#define V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S 0 +#define V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M GENMASK(19, 0) + +#define V2_QPC_BYTE_232_IRRL_SGE_IDX_S 20 +#define V2_QPC_BYTE_232_IRRL_SGE_IDX_M GENMASK(28, 20) + +#define V2_QPC_BYTE_240_IRRL_TAIL_REAL_S 0 +#define V2_QPC_BYTE_240_IRRL_TAIL_REAL_M GENMASK(7, 0) + +#define V2_QPC_BYTE_240_IRRL_TAIL_RD_S 8 +#define V2_QPC_BYTE_240_IRRL_TAIL_RD_M GENMASK(15, 8) + +#define V2_QPC_BYTE_240_RX_ACK_MSN_S 16 +#define V2_QPC_BYTE_240_RX_ACK_MSN_M GENMASK(31, 16) + +#define V2_QPC_BYTE_244_RX_ACK_EPSN_S 0 +#define V2_QPC_BYTE_244_RX_ACK_EPSN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_244_RNR_NUM_INIT_S 24 +#define V2_QPC_BYTE_244_RNR_NUM_INIT_M GENMASK(26, 24) + +#define V2_QPC_BYTE_244_RNR_CNT_S 27 +#define V2_QPC_BYTE_244_RNR_CNT_M GENMASK(29, 27) + +#define V2_QPC_BYTE_248_IRRL_PSN_S 0 +#define V2_QPC_BYTE_248_IRRL_PSN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_248_ACK_PSN_ERR_S 24 + +#define V2_QPC_BYTE_248_ACK_LAST_OPTYPE_S 25 +#define V2_QPC_BYTE_248_ACK_LAST_OPTYPE_M GENMASK(26, 25) + +#define V2_QPC_BYTE_248_IRRL_PSN_VLD_S 27 + +#define V2_QPC_BYTE_248_RNR_RETRY_FLAG_S 28 + +#define V2_QPC_BYTE_248_CQ_ERR_IND_S 31 + +#define V2_QPC_BYTE_252_TX_CQN_S 0 +#define V2_QPC_BYTE_252_TX_CQN_M GENMASK(23, 0) + +#define V2_QPC_BYTE_252_SIG_TYPE_S 24 + +#define V2_QPC_BYTE_252_ERR_TYPE_S 25 +#define V2_QPC_BYTE_252_ERR_TYPE_M GENMASK(31, 25) + +#define V2_QPC_BYTE_256_RQ_CQE_IDX_S 0 +#define 
V2_QPC_BYTE_256_RQ_CQE_IDX_M GENMASK(15, 0) + +#define V2_QPC_BYTE_256_SQ_FLUSH_IDX_S 16 +#define V2_QPC_BYTE_256_SQ_FLUSH_IDX_M GENMASK(31, 16) + +struct hns_roce_v2_cqe { + __le32 byte_4; + union { + __le32 rkey; + __be32 immtdata; + }; + __le32 byte_12; + __le32 byte_16; + __le32 byte_cnt; + u8 smac[4]; + __le32 byte_28; + __le32 byte_32; +}; + +#define V2_CQE_BYTE_4_OPCODE_S 0 +#define V2_CQE_BYTE_4_OPCODE_M GENMASK(4, 0) + +#define V2_CQE_BYTE_4_RQ_INLINE_S 5 + +#define V2_CQE_BYTE_4_S_R_S 6 + +#define V2_CQE_BYTE_4_OWNER_S 7 + +#define V2_CQE_BYTE_4_STATUS_S 8 +#define V2_CQE_BYTE_4_STATUS_M GENMASK(15, 8) + +#define V2_CQE_BYTE_4_WQE_INDX_S 16 +#define V2_CQE_BYTE_4_WQE_INDX_M GENMASK(31, 16) + +#define V2_CQE_BYTE_12_XRC_SRQN_S 0 +#define V2_CQE_BYTE_12_XRC_SRQN_M GENMASK(23, 0) + +#define V2_CQE_BYTE_16_LCL_QPN_S 0 +#define V2_CQE_BYTE_16_LCL_QPN_M GENMASK(23, 0) + +#define V2_CQE_BYTE_16_SUB_STATUS_S 24 +#define V2_CQE_BYTE_16_SUB_STATUS_M GENMASK(31, 24) + +#define V2_CQE_BYTE_28_SMAC_4_S 0 +#define V2_CQE_BYTE_28_SMAC_4_M GENMASK(7, 0) + +#define V2_CQE_BYTE_28_SMAC_5_S 8 +#define V2_CQE_BYTE_28_SMAC_5_M GENMASK(15, 8) + +#define V2_CQE_BYTE_28_PORT_TYPE_S 16 +#define V2_CQE_BYTE_28_PORT_TYPE_M GENMASK(17, 16) + +#define V2_CQE_BYTE_32_RMT_QPN_S 0 +#define V2_CQE_BYTE_32_RMT_QPN_M GENMASK(23, 0) + +#define V2_CQE_BYTE_32_SL_S 24 +#define V2_CQE_BYTE_32_SL_M GENMASK(26, 24) + +#define V2_CQE_BYTE_32_PORTN_S 27 +#define V2_CQE_BYTE_32_PORTN_M GENMASK(29, 27) + +#define V2_CQE_BYTE_32_GRH_S 30 + +#define V2_CQE_BYTE_32_LPK_S 31 + +struct hns_roce_v2_mpt_entry { + __le32 byte_4_pd_hop_st; + __le32 byte_8_mw_cnt_en; + __le32 byte_12_mw_pa; + __le32 bound_lkey; + __le32 len_l; + __le32 len_h; + __le32 lkey; + __le32 va_l; + __le32 va_h; + __le32 pbl_size; + __le32 pbl_ba_l; + __le32 byte_48_mode_ba; + __le32 pa0_l; + __le32 byte_56_pa0_h; + __le32 pa1_l; + __le32 byte_64_buf_pa1; +}; + +#define V2_MPT_BYTE_4_MPT_ST_S 0 +#define V2_MPT_BYTE_4_MPT_ST_M GENMASK(1, 0) + +#define V2_MPT_BYTE_4_PBL_HOP_NUM_S 2 +#define V2_MPT_BYTE_4_PBL_HOP_NUM_M GENMASK(3, 2) + +#define V2_MPT_BYTE_4_PBL_BA_PG_SZ_S 4 +#define V2_MPT_BYTE_4_PBL_BA_PG_SZ_M GENMASK(7, 4) + +#define V2_MPT_BYTE_4_PD_S 8 +#define V2_MPT_BYTE_4_PD_M GENMASK(31, 8) + +#define V2_MPT_BYTE_8_RA_EN_S 0 + +#define V2_MPT_BYTE_8_R_INV_EN_S 1 + +#define V2_MPT_BYTE_8_L_INV_EN_S 2 + +#define V2_MPT_BYTE_8_BIND_EN_S 3 + +#define V2_MPT_BYTE_8_ATOMIC_EN_S 4 + +#define V2_MPT_BYTE_8_RR_EN_S 5 + +#define V2_MPT_BYTE_8_RW_EN_S 6 + +#define V2_MPT_BYTE_8_LW_EN_S 7 + +#define V2_MPT_BYTE_12_PA_S 1 + +#define V2_MPT_BYTE_12_INNER_PA_VLD_S 7 + +#define V2_MPT_BYTE_12_MW_BIND_QPN_S 8 +#define V2_MPT_BYTE_12_MW_BIND_QPN_M GENMASK(31, 8) + +#define V2_MPT_BYTE_48_PBL_BA_H_S 0 +#define V2_MPT_BYTE_48_PBL_BA_H_M GENMASK(28, 0) + +#define V2_MPT_BYTE_48_BLK_MODE_S 29 + +#define V2_MPT_BYTE_56_PA0_H_S 0 +#define V2_MPT_BYTE_56_PA0_H_M GENMASK(25, 0) + +#define V2_MPT_BYTE_64_PA1_H_S 0 +#define V2_MPT_BYTE_64_PA1_H_M GENMASK(25, 0) + +#define V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S 28 +#define V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M GENMASK(31, 28) + +#define V2_DB_BYTE_4_TAG_S 0 +#define V2_DB_BYTE_4_TAG_M GENMASK(23, 0) + +#define V2_DB_BYTE_4_CMD_S 24 +#define V2_DB_BYTE_4_CMD_M GENMASK(27, 24) + +#define V2_DB_PARAMETER_CONS_IDX_S 0 +#define V2_DB_PARAMETER_CONS_IDX_M GENMASK(15, 0) + +#define V2_DB_PARAMETER_SL_S 16 +#define V2_DB_PARAMETER_SL_M GENMASK(18, 16) + +struct hns_roce_v2_cq_db { + __le32 byte_4; + __le32 parameter; +}; + +#define V2_CQ_DB_BYTE_4_TAG_S 0 
+#define V2_CQ_DB_BYTE_4_TAG_M GENMASK(23, 0) + +#define V2_CQ_DB_BYTE_4_CMD_S 24 +#define V2_CQ_DB_BYTE_4_CMD_M GENMASK(27, 24) + +#define V2_CQ_DB_PARAMETER_CONS_IDX_S 0 +#define V2_CQ_DB_PARAMETER_CONS_IDX_M GENMASK(23, 0) + +#define V2_CQ_DB_PARAMETER_CMD_SN_S 25 +#define V2_CQ_DB_PARAMETER_CMD_SN_M GENMASK(26, 25) + +#define V2_CQ_DB_PARAMETER_NOTIFY_S 24 + +struct hns_roce_v2_ud_send_wqe { + __le32 byte_4; + __le32 msg_len; + __be32 immtdata; + __le32 byte_16; + __le32 byte_20; + __le32 byte_24; + __le32 qkey; + __le32 byte_32; + __le32 byte_36; + __le32 byte_40; + __le32 dmac; + __le32 byte_48; + u8 dgid[GID_LEN_V2]; + +}; +#define V2_UD_SEND_WQE_BYTE_4_OPCODE_S 0 +#define V2_UD_SEND_WQE_BYTE_4_OPCODE_M GENMASK(4, 0) + +#define V2_UD_SEND_WQE_BYTE_4_OWNER_S 7 + +#define V2_UD_SEND_WQE_BYTE_4_CQE_S 8 + +#define V2_UD_SEND_WQE_BYTE_4_SE_S 11 + +#define V2_UD_SEND_WQE_BYTE_16_PD_S 0 +#define V2_UD_SEND_WQE_BYTE_16_PD_M GENMASK(23, 0) + +#define V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S 24 +#define V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M GENMASK(31, 24) + +#define V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0 +#define V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0) + +#define V2_UD_SEND_WQE_BYTE_24_UDPSPN_S 16 +#define V2_UD_SEND_WQE_BYTE_24_UDPSPN_M GENMASK(31, 16) + +#define V2_UD_SEND_WQE_BYTE_32_DQPN_S 0 +#define V2_UD_SEND_WQE_BYTE_32_DQPN_M GENMASK(23, 0) + +#define V2_UD_SEND_WQE_BYTE_36_VLAN_S 0 +#define V2_UD_SEND_WQE_BYTE_36_VLAN_M GENMASK(15, 0) + +#define V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S 16 +#define V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M GENMASK(23, 16) + +#define V2_UD_SEND_WQE_BYTE_36_TCLASS_S 24 +#define V2_UD_SEND_WQE_BYTE_36_TCLASS_M GENMASK(31, 24) + +#define V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S 0 +#define V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M GENMASK(19, 0) + +#define V2_UD_SEND_WQE_BYTE_40_SL_S 20 +#define V2_UD_SEND_WQE_BYTE_40_SL_M GENMASK(23, 20) + +#define V2_UD_SEND_WQE_BYTE_40_PORTN_S 24 +#define V2_UD_SEND_WQE_BYTE_40_PORTN_M GENMASK(26, 24) + +#define V2_UD_SEND_WQE_BYTE_40_LBI_S 31 + +#define V2_UD_SEND_WQE_DMAC_0_S 0 +#define V2_UD_SEND_WQE_DMAC_0_M GENMASK(7, 0) + +#define V2_UD_SEND_WQE_DMAC_1_S 8 +#define V2_UD_SEND_WQE_DMAC_1_M GENMASK(15, 8) + +#define V2_UD_SEND_WQE_DMAC_2_S 16 +#define V2_UD_SEND_WQE_DMAC_2_M GENMASK(23, 16) + +#define V2_UD_SEND_WQE_DMAC_3_S 24 +#define V2_UD_SEND_WQE_DMAC_3_M GENMASK(31, 24) + +#define V2_UD_SEND_WQE_BYTE_48_DMAC_4_S 0 +#define V2_UD_SEND_WQE_BYTE_48_DMAC_4_M GENMASK(7, 0) + +#define V2_UD_SEND_WQE_BYTE_48_DMAC_5_S 8 +#define V2_UD_SEND_WQE_BYTE_48_DMAC_5_M GENMASK(15, 8) + +#define V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S 16 +#define V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M GENMASK(23, 16) + +#define V2_UD_SEND_WQE_BYTE_48_SMAC_INDX_S 24 +#define V2_UD_SEND_WQE_BYTE_48_SMAC_INDX_M GENMASK(31, 24) + +struct hns_roce_v2_rc_send_wqe { + __le32 byte_4; + __le32 msg_len; + union { + __le32 inv_key; + __be32 immtdata; + }; + __le32 byte_16; + __le32 byte_20; + __le32 rkey; + __le64 va; +}; + +#define V2_RC_SEND_WQE_BYTE_4_OPCODE_S 0 +#define V2_RC_SEND_WQE_BYTE_4_OPCODE_M GENMASK(4, 0) + +#define V2_RC_SEND_WQE_BYTE_4_OWNER_S 7 + +#define V2_RC_SEND_WQE_BYTE_4_CQE_S 8 + +#define V2_RC_SEND_WQE_BYTE_4_FENCE_S 9 + +#define V2_RC_SEND_WQE_BYTE_4_SO_S 10 + +#define V2_RC_SEND_WQE_BYTE_4_SE_S 11 + +#define V2_RC_SEND_WQE_BYTE_4_INLINE_S 12 + +#define V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_S 0 +#define V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_M GENMASK(23, 0) + +#define V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S 24 +#define V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M GENMASK(31, 24) + 
+#define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0 +#define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0) + +struct hns_roce_v2_wqe_data_seg { + __le32 len; + __le32 lkey; + __le64 addr; +}; + +struct hns_roce_v2_db { + __le32 byte_4; + __le32 parameter; +}; + +struct hns_roce_query_version { + __le16 rocee_vendor_id; + __le16 rocee_hw_version; + __le32 rsv[5]; +}; + +struct hns_roce_cfg_global_param { + __le32 time_cfg_udp_port; + __le32 rsv[5]; +}; + +#define CFG_GLOBAL_PARAM_DATA_0_ROCEE_TIME_1US_CFG_S 0 +#define CFG_GLOBAL_PARAM_DATA_0_ROCEE_TIME_1US_CFG_M GENMASK(9, 0) + +#define CFG_GLOBAL_PARAM_DATA_0_ROCEE_UDP_PORT_S 16 +#define CFG_GLOBAL_PARAM_DATA_0_ROCEE_UDP_PORT_M GENMASK(31, 16) + +struct hns_roce_pf_res { + __le32 rsv; + __le32 qpc_bt_idx_num; + __le32 srqc_bt_idx_num; + __le32 cqc_bt_idx_num; + __le32 mpt_bt_idx_num; + __le32 eqc_bt_idx_num; +}; + +#define PF_RES_DATA_1_PF_QPC_BT_IDX_S 0 +#define PF_RES_DATA_1_PF_QPC_BT_IDX_M GENMASK(10, 0) + +#define PF_RES_DATA_1_PF_QPC_BT_NUM_S 16 +#define PF_RES_DATA_1_PF_QPC_BT_NUM_M GENMASK(27, 16) + +#define PF_RES_DATA_2_PF_SRQC_BT_IDX_S 0 +#define PF_RES_DATA_2_PF_SRQC_BT_IDX_M GENMASK(8, 0) + +#define PF_RES_DATA_2_PF_SRQC_BT_NUM_S 16 +#define PF_RES_DATA_2_PF_SRQC_BT_NUM_M GENMASK(25, 16) + +#define PF_RES_DATA_3_PF_CQC_BT_IDX_S 0 +#define PF_RES_DATA_3_PF_CQC_BT_IDX_M GENMASK(8, 0) + +#define PF_RES_DATA_3_PF_CQC_BT_NUM_S 16 +#define PF_RES_DATA_3_PF_CQC_BT_NUM_M GENMASK(25, 16) + +#define PF_RES_DATA_4_PF_MPT_BT_IDX_S 0 +#define PF_RES_DATA_4_PF_MPT_BT_IDX_M GENMASK(8, 0) + +#define PF_RES_DATA_4_PF_MPT_BT_NUM_S 16 +#define PF_RES_DATA_4_PF_MPT_BT_NUM_M GENMASK(25, 16) + +#define PF_RES_DATA_5_PF_EQC_BT_IDX_S 0 +#define PF_RES_DATA_5_PF_EQC_BT_IDX_M GENMASK(8, 0) + +#define PF_RES_DATA_5_PF_EQC_BT_NUM_S 16 +#define PF_RES_DATA_5_PF_EQC_BT_NUM_M GENMASK(25, 16) + +struct hns_roce_vf_res_a { + __le32 vf_id; + __le32 vf_qpc_bt_idx_num; + __le32 vf_srqc_bt_idx_num; + __le32 vf_cqc_bt_idx_num; + __le32 vf_mpt_bt_idx_num; + __le32 vf_eqc_bt_idx_num; +}; + +#define VF_RES_A_DATA_1_VF_QPC_BT_IDX_S 0 +#define VF_RES_A_DATA_1_VF_QPC_BT_IDX_M GENMASK(10, 0) + +#define VF_RES_A_DATA_1_VF_QPC_BT_NUM_S 16 +#define VF_RES_A_DATA_1_VF_QPC_BT_NUM_M GENMASK(27, 16) + +#define VF_RES_A_DATA_2_VF_SRQC_BT_IDX_S 0 +#define VF_RES_A_DATA_2_VF_SRQC_BT_IDX_M GENMASK(8, 0) + +#define VF_RES_A_DATA_2_VF_SRQC_BT_NUM_S 16 +#define VF_RES_A_DATA_2_VF_SRQC_BT_NUM_M GENMASK(25, 16) + +#define VF_RES_A_DATA_3_VF_CQC_BT_IDX_S 0 +#define VF_RES_A_DATA_3_VF_CQC_BT_IDX_M GENMASK(8, 0) + +#define VF_RES_A_DATA_3_VF_CQC_BT_NUM_S 16 +#define VF_RES_A_DATA_3_VF_CQC_BT_NUM_M GENMASK(25, 16) + +#define VF_RES_A_DATA_4_VF_MPT_BT_IDX_S 0 +#define VF_RES_A_DATA_4_VF_MPT_BT_IDX_M GENMASK(8, 0) + +#define VF_RES_A_DATA_4_VF_MPT_BT_NUM_S 16 +#define VF_RES_A_DATA_4_VF_MPT_BT_NUM_M GENMASK(25, 16) + +#define VF_RES_A_DATA_5_VF_EQC_IDX_S 0 +#define VF_RES_A_DATA_5_VF_EQC_IDX_M GENMASK(8, 0) + +#define VF_RES_A_DATA_5_VF_EQC_NUM_S 16 +#define VF_RES_A_DATA_5_VF_EQC_NUM_M GENMASK(25, 16) + +struct hns_roce_vf_res_b { + __le32 rsv0; + __le32 vf_smac_idx_num; + __le32 vf_sgid_idx_num; + __le32 vf_qid_idx_sl_num; + __le32 rsv[2]; +}; + +#define VF_RES_B_DATA_0_VF_ID_S 0 +#define VF_RES_B_DATA_0_VF_ID_M GENMASK(7, 0) + +#define VF_RES_B_DATA_1_VF_SMAC_IDX_S 0 +#define VF_RES_B_DATA_1_VF_SMAC_IDX_M GENMASK(7, 0) + +#define VF_RES_B_DATA_1_VF_SMAC_NUM_S 8 +#define VF_RES_B_DATA_1_VF_SMAC_NUM_M GENMASK(16, 8) + +#define VF_RES_B_DATA_2_VF_SGID_IDX_S 0 +#define 
VF_RES_B_DATA_2_VF_SGID_IDX_M GENMASK(7, 0) + +#define VF_RES_B_DATA_2_VF_SGID_NUM_S 8 +#define VF_RES_B_DATA_2_VF_SGID_NUM_M GENMASK(16, 8) + +#define VF_RES_B_DATA_3_VF_QID_IDX_S 0 +#define VF_RES_B_DATA_3_VF_QID_IDX_M GENMASK(9, 0) + +#define VF_RES_B_DATA_3_VF_SL_NUM_S 16 +#define VF_RES_B_DATA_3_VF_SL_NUM_M GENMASK(19, 16) + +/* Reg field definition */ +#define ROCEE_VF_SMAC_CFG1_VF_SMAC_H_S 0 +#define ROCEE_VF_SMAC_CFG1_VF_SMAC_H_M GENMASK(15, 0) + +#define ROCEE_VF_SGID_CFG4_SGID_TYPE_S 0 +#define ROCEE_VF_SGID_CFG4_SGID_TYPE_M GENMASK(1, 0) + +struct hns_roce_cfg_bt_attr { + __le32 vf_qpc_cfg; + __le32 vf_srqc_cfg; + __le32 vf_cqc_cfg; + __le32 vf_mpt_cfg; + __le32 rsv[2]; +}; + +#define CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_S 0 +#define CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_M GENMASK(3, 0) + +#define CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_S 4 +#define CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_M GENMASK(7, 4) + +#define CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_S 8 +#define CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_M GENMASK(9, 8) + +#define CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_S 0 +#define CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_M GENMASK(3, 0) + +#define CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_S 4 +#define CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_M GENMASK(7, 4) + +#define CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_S 8 +#define CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_M GENMASK(9, 8) + +#define CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_S 0 +#define CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_M GENMASK(3, 0) + +#define CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_S 4 +#define CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_M GENMASK(7, 4) + +#define CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_S 8 +#define CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_M GENMASK(9, 8) + +#define CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_S 0 +#define CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_M GENMASK(3, 0) + +#define CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_S 4 +#define CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_M GENMASK(7, 4) + +#define CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S 8 +#define CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_M GENMASK(9, 8) + +struct hns_roce_cmq_desc { + __le16 opcode; + __le16 flag; + __le16 retval; + __le16 rsv; + __le32 data[6]; +}; + +#define HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS 10000 + +#define HNS_ROCE_HW_RUN_BIT_SHIFT 31 +#define HNS_ROCE_HW_MB_STATUS_MASK 0xFF + +#define HNS_ROCE_VF_MB4_TAG_MASK 0xFFFFFF00 +#define HNS_ROCE_VF_MB4_TAG_SHIFT 8 + +#define HNS_ROCE_VF_MB4_CMD_MASK 0xFF +#define HNS_ROCE_VF_MB4_CMD_SHIFT 0 + +#define HNS_ROCE_VF_MB5_EVENT_MASK 0x10000 +#define HNS_ROCE_VF_MB5_EVENT_SHIFT 16 + +#define HNS_ROCE_VF_MB5_TOKEN_MASK 0xFFFF +#define HNS_ROCE_VF_MB5_TOKEN_SHIFT 0 + +struct hns_roce_v2_cmq_ring { + dma_addr_t desc_dma_addr; + struct hns_roce_cmq_desc *desc; + u32 head; + u32 tail; + + u16 buf_size; + u16 desc_num; + int next_to_use; + int next_to_clean; + u8 flag; + spinlock_t lock; /* command queue lock */ +}; + +struct hns_roce_v2_cmq { + struct hns_roce_v2_cmq_ring csq; + struct hns_roce_v2_cmq_ring crq; + u16 tx_timeout; + u16 last_status; +}; + +struct hns_roce_v2_priv { + struct hns_roce_v2_cmq cmq; +}; + +struct hns_roce_eq_context { + __le32 byte_4; + __le32 byte_8; + __le32 byte_12; + __le32 eqe_report_timer; + __le32 eqe_ba0; + __le32 eqe_ba1; + __le32 byte_28; + __le32 byte_32; + __le32 byte_36; + __le32 nxt_eqe_ba0; + __le32 nxt_eqe_ba1; + __le32 rsv[5]; +}; + +#define HNS_ROCE_AEQ_DEFAULT_BURST_NUM 0x0 +#define HNS_ROCE_AEQ_DEFAULT_INTERVAL 0x0 +#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM 0x0 +#define HNS_ROCE_CEQ_DEFAULT_INTERVAL 0x0 + +#define HNS_ROCE_V2_EQ_STATE_INVALID 0 +#define HNS_ROCE_V2_EQ_STATE_VALID 1 
+#define HNS_ROCE_V2_EQ_STATE_OVERFLOW 2 +#define HNS_ROCE_V2_EQ_STATE_FAILURE 3 + +#define HNS_ROCE_V2_EQ_OVER_IGNORE_0 0 +#define HNS_ROCE_V2_EQ_OVER_IGNORE_1 1 + +#define HNS_ROCE_V2_EQ_COALESCE_0 0 +#define HNS_ROCE_V2_EQ_COALESCE_1 1 + +#define HNS_ROCE_V2_EQ_FIRED 0 +#define HNS_ROCE_V2_EQ_ARMED 1 +#define HNS_ROCE_V2_EQ_ALWAYS_ARMED 3 + +#define HNS_ROCE_EQ_INIT_EQE_CNT 0 +#define HNS_ROCE_EQ_INIT_PROD_IDX 0 +#define HNS_ROCE_EQ_INIT_REPORT_TIMER 0 +#define HNS_ROCE_EQ_INIT_MSI_IDX 0 +#define HNS_ROCE_EQ_INIT_CONS_IDX 0 +#define HNS_ROCE_EQ_INIT_NXT_EQE_BA 0 + +#define HNS_ROCE_V2_CEQ_CEQE_OWNER_S 31 +#define HNS_ROCE_V2_AEQ_AEQE_OWNER_S 31 + +#define HNS_ROCE_V2_COMP_EQE_NUM 0x1000 +#define HNS_ROCE_V2_ASYNC_EQE_NUM 0x1000 + +#define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S 0 +#define HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S 1 +#define HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S 2 + +#define HNS_ROCE_EQ_DB_CMD_AEQ 0x0 +#define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED 0x1 +#define HNS_ROCE_EQ_DB_CMD_CEQ 0x2 +#define HNS_ROCE_EQ_DB_CMD_CEQ_ARMED 0x3 + +#define EQ_ENABLE 1 +#define EQ_DISABLE 0 + +#define EQ_REG_OFFSET 0x4 + +#define HNS_ROCE_INT_NAME_LEN 32 +#define HNS_ROCE_V2_EQN_M GENMASK(23, 0) + +#define HNS_ROCE_V2_CONS_IDX_M GENMASK(23, 0) + +#define HNS_ROCE_V2_VF_ABN_INT_EN_S 0 +#define HNS_ROCE_V2_VF_ABN_INT_EN_M GENMASK(0, 0) +#define HNS_ROCE_V2_VF_ABN_INT_ST_M GENMASK(2, 0) +#define HNS_ROCE_V2_VF_ABN_INT_CFG_M GENMASK(2, 0) +#define HNS_ROCE_V2_VF_EVENT_INT_EN_M GENMASK(0, 0) + +/* WORD0 */ +#define HNS_ROCE_EQC_EQ_ST_S 0 +#define HNS_ROCE_EQC_EQ_ST_M GENMASK(1, 0) + +#define HNS_ROCE_EQC_HOP_NUM_S 2 +#define HNS_ROCE_EQC_HOP_NUM_M GENMASK(3, 2) + +#define HNS_ROCE_EQC_OVER_IGNORE_S 4 +#define HNS_ROCE_EQC_OVER_IGNORE_M GENMASK(4, 4) + +#define HNS_ROCE_EQC_COALESCE_S 5 +#define HNS_ROCE_EQC_COALESCE_M GENMASK(5, 5) + +#define HNS_ROCE_EQC_ARM_ST_S 6 +#define HNS_ROCE_EQC_ARM_ST_M GENMASK(7, 6) + +#define HNS_ROCE_EQC_EQN_S 8 +#define HNS_ROCE_EQC_EQN_M GENMASK(15, 8) + +#define HNS_ROCE_EQC_EQE_CNT_S 16 +#define HNS_ROCE_EQC_EQE_CNT_M GENMASK(31, 16) + +/* WORD1 */ +#define HNS_ROCE_EQC_BA_PG_SZ_S 0 +#define HNS_ROCE_EQC_BA_PG_SZ_M GENMASK(3, 0) + +#define HNS_ROCE_EQC_BUF_PG_SZ_S 4 +#define HNS_ROCE_EQC_BUF_PG_SZ_M GENMASK(7, 4) + +#define HNS_ROCE_EQC_PROD_INDX_S 8 +#define HNS_ROCE_EQC_PROD_INDX_M GENMASK(31, 8) + +/* WORD2 */ +#define HNS_ROCE_EQC_MAX_CNT_S 0 +#define HNS_ROCE_EQC_MAX_CNT_M GENMASK(15, 0) + +#define HNS_ROCE_EQC_PERIOD_S 16 +#define HNS_ROCE_EQC_PERIOD_M GENMASK(31, 16) + +/* WORD3 */ +#define HNS_ROCE_EQC_REPORT_TIMER_S 0 +#define HNS_ROCE_EQC_REPORT_TIMER_M GENMASK(31, 0) + +/* WORD4 */ +#define HNS_ROCE_EQC_EQE_BA_L_S 0 +#define HNS_ROCE_EQC_EQE_BA_L_M GENMASK(31, 0) + +/* WORD5 */ +#define HNS_ROCE_EQC_EQE_BA_H_S 0 +#define HNS_ROCE_EQC_EQE_BA_H_M GENMASK(28, 0) + +/* WORD6 */ +#define HNS_ROCE_EQC_SHIFT_S 0 +#define HNS_ROCE_EQC_SHIFT_M GENMASK(7, 0) + +#define HNS_ROCE_EQC_MSI_INDX_S 8 +#define HNS_ROCE_EQC_MSI_INDX_M GENMASK(15, 8) + +#define HNS_ROCE_EQC_CUR_EQE_BA_L_S 16 +#define HNS_ROCE_EQC_CUR_EQE_BA_L_M GENMASK(31, 16) + +/* WORD7 */ +#define HNS_ROCE_EQC_CUR_EQE_BA_M_S 0 +#define HNS_ROCE_EQC_CUR_EQE_BA_M_M GENMASK(31, 0) + +/* WORD8 */ +#define HNS_ROCE_EQC_CUR_EQE_BA_H_S 0 +#define HNS_ROCE_EQC_CUR_EQE_BA_H_M GENMASK(3, 0) + +#define HNS_ROCE_EQC_CONS_INDX_S 8 +#define HNS_ROCE_EQC_CONS_INDX_M GENMASK(31, 8) + +/* WORD9 */ +#define HNS_ROCE_EQC_NXT_EQE_BA_L_S 0 +#define HNS_ROCE_EQC_NXT_EQE_BA_L_M GENMASK(31, 0) + +/* WORD10 */ +#define HNS_ROCE_EQC_NXT_EQE_BA_H_S 0 
+#define HNS_ROCE_EQC_NXT_EQE_BA_H_M GENMASK(19, 0) + +#define HNS_ROCE_V2_CEQE_COMP_CQN_S 0 +#define HNS_ROCE_V2_CEQE_COMP_CQN_M GENMASK(23, 0) + +#define HNS_ROCE_V2_AEQE_EVENT_TYPE_S 0 +#define HNS_ROCE_V2_AEQE_EVENT_TYPE_M GENMASK(7, 0) + +#define HNS_ROCE_V2_AEQE_SUB_TYPE_S 8 +#define HNS_ROCE_V2_AEQE_SUB_TYPE_M GENMASK(15, 8) + +#define HNS_ROCE_V2_EQ_DB_CMD_S 16 +#define HNS_ROCE_V2_EQ_DB_CMD_M GENMASK(17, 16) + +#define HNS_ROCE_V2_EQ_DB_TAG_S 0 +#define HNS_ROCE_V2_EQ_DB_TAG_M GENMASK(7, 0) + +#define HNS_ROCE_V2_EQ_DB_PARA_S 0 +#define HNS_ROCE_V2_EQ_DB_PARA_M GENMASK(23, 0) + +#define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0 +#define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0) + +#endif diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c new file mode 100644 index 0000000..96fb6a9 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -0,0 +1,852 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include "hns_roce_common.h" +#include "hns_roce_device.h" +#include +#include "hns_roce_hem.h" + +/** + * hns_get_gid_index - Get gid index. + * @hr_dev: pointer to structure hns_roce_dev. 
+ * @port: port, value range: 0 ~ MAX + * @gid_index: gid_index, value range: 0 ~ MAX + * Description: + * N ports shared gids, allocation method as follow: + * GID[0][0], GID[1][0],.....GID[N - 1][0], + * GID[0][0], GID[1][0],.....GID[N - 1][0], + * And so on + */ +int hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index) +{ + return gid_index * hr_dev->caps.num_ports + port; +} +EXPORT_SYMBOL_GPL(hns_get_gid_index); + +static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u8 port, u8 *addr) +{ + u8 phy_port; + u32 i = 0; + + if (!memcmp(hr_dev->dev_addr[port], addr, MAC_ADDR_OCTET_NUM)) + return 0; + + for (i = 0; i < MAC_ADDR_OCTET_NUM; i++) + hr_dev->dev_addr[port][i] = addr[i]; + + phy_port = hr_dev->iboe.phy_port[port]; + return hr_dev->hw->set_mac(hr_dev, phy_port, addr); +} + +static int hns_roce_add_gid(const union ib_gid *gid, + const struct ib_gid_attr *attr, void **context) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(attr->device); + u8 port = attr->port_num - 1; + unsigned long flags; + int ret; + + if (port >= hr_dev->caps.num_ports) + return -EINVAL; + + spin_lock_irqsave(&hr_dev->iboe.lock, flags); + + ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, + (union ib_gid *)gid, attr); + + spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); + + return ret; +} + +static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(attr->device); + struct ib_gid_attr zattr = { }; + union ib_gid zgid = { {0} }; + u8 port = attr->port_num - 1; + unsigned long flags; + int ret; + + if (port >= hr_dev->caps.num_ports) + return -EINVAL; + + spin_lock_irqsave(&hr_dev->iboe.lock, flags); + + ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, &zgid, &zattr); + + spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); + + return ret; +} + +static int handle_en_event(struct hns_roce_dev *hr_dev, u8 port, + unsigned long event) +{ + struct device *dev = hr_dev->dev; + struct net_device *netdev; + int ret = 0; + + netdev = hr_dev->iboe.netdevs[port]; + if (!netdev) { + dev_err(dev, "port(%d) can't find netdev\n", port); + return -ENODEV; + } + + switch (event) { + case NETDEV_UP: + case NETDEV_CHANGE: + case NETDEV_REGISTER: + case NETDEV_CHANGEADDR: + ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr); + break; + case NETDEV_DOWN: + /* + * In v1 engine, only support all ports closed together. 
+ */ + break; + default: + dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(event)); + break; + } + + return ret; +} + +static int hns_roce_netdev_event(struct notifier_block *self, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct hns_roce_ib_iboe *iboe = NULL; + struct hns_roce_dev *hr_dev = NULL; + u8 port = 0; + int ret = 0; + + hr_dev = container_of(self, struct hns_roce_dev, iboe.nb); + iboe = &hr_dev->iboe; + + for (port = 0; port < hr_dev->caps.num_ports; port++) { + if (dev == iboe->netdevs[port]) { + ret = handle_en_event(hr_dev, port, event); + if (ret) + return NOTIFY_DONE; + break; + } + } + + return NOTIFY_DONE; +} + +static int hns_roce_setup_mtu_mac(struct hns_roce_dev *hr_dev) +{ + int ret; + u8 i; + + for (i = 0; i < hr_dev->caps.num_ports; i++) { + if (hr_dev->hw->set_mtu) + hr_dev->hw->set_mtu(hr_dev, hr_dev->iboe.phy_port[i], + hr_dev->caps.max_mtu); + ret = hns_roce_set_mac(hr_dev, i, + hr_dev->iboe.netdevs[i]->dev_addr); + if (ret) + return ret; + } + + return 0; +} + +static int hns_roce_query_device(struct ib_device *ib_dev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + + memset(props, 0, sizeof(*props)); + + props->sys_image_guid = cpu_to_be64(hr_dev->sys_image_guid); + props->max_mr_size = (u64)(~(0ULL)); + props->page_size_cap = hr_dev->caps.page_size_cap; + props->vendor_id = hr_dev->vendor_id; + props->vendor_part_id = hr_dev->vendor_part_id; + props->hw_ver = hr_dev->hw_rev; + props->max_qp = hr_dev->caps.num_qps; + props->max_qp_wr = hr_dev->caps.max_wqes; + props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_RC_RNR_NAK_GEN; + props->max_sge = max(hr_dev->caps.max_sq_sg, hr_dev->caps.max_rq_sg); + props->max_sge_rd = 1; + props->max_cq = hr_dev->caps.num_cqs; + props->max_cqe = hr_dev->caps.max_cqes; + props->max_mr = hr_dev->caps.num_mtpts; + props->max_pd = hr_dev->caps.num_pds; + props->max_qp_rd_atom = hr_dev->caps.max_qp_dest_rdma; + props->max_qp_init_rd_atom = hr_dev->caps.max_qp_init_rdma; + props->atomic_cap = IB_ATOMIC_NONE; + props->max_pkeys = 1; + props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay; + + return 0; +} + +static struct net_device *hns_roce_get_netdev(struct ib_device *ib_dev, + u8 port_num) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + struct net_device *ndev; + + if (port_num < 1 || port_num > hr_dev->caps.num_ports) + return NULL; + + rcu_read_lock(); + + ndev = hr_dev->iboe.netdevs[port_num - 1]; + if (ndev) + dev_hold(ndev); + + rcu_read_unlock(); + return ndev; +} + +static int hns_roce_query_port(struct ib_device *ib_dev, u8 port_num, + struct ib_port_attr *props) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + struct device *dev = hr_dev->dev; + struct net_device *net_dev; + unsigned long flags; + enum ib_mtu mtu; + u8 port; + + assert(port_num > 0); + port = port_num - 1; + + /* props being zeroed by the caller, avoid zeroing it here */ + + props->max_mtu = hr_dev->caps.max_mtu; + props->gid_tbl_len = hr_dev->caps.gid_table_len[port]; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | + IB_PORT_VENDOR_CLASS_SUP | + IB_PORT_BOOT_MGMT_SUP; + props->max_msg_sz = HNS_ROCE_MAX_MSG_LEN; + props->pkey_tbl_len = 1; + props->active_width = IB_WIDTH_4X; + props->active_speed = 1; + + spin_lock_irqsave(&hr_dev->iboe.lock, flags); + + net_dev = hr_dev->iboe.netdevs[port]; + if (!net_dev) { + spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); + dev_err(dev, "find 
netdev %d failed!\r\n", port); + return -EINVAL; + } + + mtu = iboe_get_mtu(net_dev->mtu); + props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256; + props->state = (netif_running(net_dev) && netif_carrier_ok(net_dev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + props->phys_state = (props->state == IB_PORT_ACTIVE) ? 5 : 3; + + spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); + + return 0; +} + +static enum rdma_link_layer hns_roce_get_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int hns_roce_query_pkey(struct ib_device *ib_dev, u8 port, u16 index, + u16 *pkey) +{ + *pkey = PKEY_ID; + + return 0; +} + +static int hns_roce_modify_device(struct ib_device *ib_dev, int mask, + struct ib_device_modify *props) +{ + unsigned long flags; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + spin_lock_irqsave(&to_hr_dev(ib_dev)->sm_lock, flags); + memcpy(ib_dev->node_desc, props->node_desc, NODE_DESC_SIZE); + spin_unlock_irqrestore(&to_hr_dev(ib_dev)->sm_lock, flags); + } + + return 0; +} + +static int hns_roce_modify_port(struct ib_device *ib_dev, u8 port_num, int mask, + struct ib_port_modify *props) +{ + return 0; +} + +static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, + struct ib_udata *udata) +{ + int ret = 0; + struct hns_roce_ucontext *context; + struct hns_roce_ib_alloc_ucontext_resp resp = {}; + struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + + resp.qp_tab_size = hr_dev->caps.num_qps; + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + ret = hns_roce_uar_alloc(hr_dev, &context->uar); + if (ret) + goto error_fail_uar_alloc; + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { + INIT_LIST_HEAD(&context->page_list); + mutex_init(&context->page_mutex); + } + + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) + goto error_fail_copy_to_udata; + + return &context->ibucontext; + +error_fail_copy_to_udata: + hns_roce_uar_free(hr_dev, &context->uar); + +error_fail_uar_alloc: + kfree(context); + + return ERR_PTR(ret); +} + +static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); + + hns_roce_uar_free(to_hr_dev(ibcontext->device), &context->uar); + kfree(context); + + return 0; +} + +static int hns_roce_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(context->device); + + if (((vma->vm_end - vma->vm_start) % PAGE_SIZE) != 0) + return -EINVAL; + + if (vma->vm_pgoff == 0) { + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, + to_hr_ucontext(context)->uar.pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + } else if (vma->vm_pgoff == 1 && hr_dev->tptr_dma_addr && + hr_dev->tptr_size) { + /* vm_pgoff: 1 -- TPTR */ + if (io_remap_pfn_range(vma, vma->vm_start, + hr_dev->tptr_dma_addr >> PAGE_SHIFT, + hr_dev->tptr_size, + vma->vm_page_prot)) + return -EAGAIN; + } else + return -EINVAL; + + return 0; +} + +static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int ret; + + ret = ib_query_port(ib_dev, port_num, &attr); + if (ret) + return ret; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + immutable->core_cap_flags 
= RDMA_CORE_PORT_IBA_ROCE; + if (to_hr_dev(ib_dev)->caps.flags & HNS_ROCE_CAP_FLAG_ROCE_V1_V2) + immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + return 0; +} + +static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; + + unregister_netdevice_notifier(&iboe->nb); + ib_unregister_device(&hr_dev->ib_dev); +} + +static int hns_roce_register_device(struct hns_roce_dev *hr_dev) +{ + int ret; + struct hns_roce_ib_iboe *iboe = NULL; + struct ib_device *ib_dev = NULL; + struct device *dev = hr_dev->dev; + + iboe = &hr_dev->iboe; + spin_lock_init(&iboe->lock); + + ib_dev = &hr_dev->ib_dev; + strlcpy(ib_dev->name, "hns_%d", IB_DEVICE_NAME_MAX); + + ib_dev->owner = THIS_MODULE; + ib_dev->node_type = RDMA_NODE_IB_CA; + ib_dev->dev.parent = dev; + + ib_dev->phys_port_cnt = hr_dev->caps.num_ports; + ib_dev->local_dma_lkey = hr_dev->caps.reserved_lkey; + ib_dev->num_comp_vectors = hr_dev->caps.num_comp_vectors; + ib_dev->uverbs_abi_ver = 1; + ib_dev->uverbs_cmd_mask = + (1ULL << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ULL << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ULL << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ULL << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ULL << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ULL << IB_USER_VERBS_CMD_REG_MR) | + (1ULL << IB_USER_VERBS_CMD_DEREG_MR) | + (1ULL << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ULL << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ULL << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ULL << IB_USER_VERBS_CMD_CREATE_QP) | + (1ULL << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ULL << IB_USER_VERBS_CMD_QUERY_QP) | + (1ULL << IB_USER_VERBS_CMD_DESTROY_QP); + + /* HCA||device||port */ + ib_dev->modify_device = hns_roce_modify_device; + ib_dev->query_device = hns_roce_query_device; + ib_dev->query_port = hns_roce_query_port; + ib_dev->modify_port = hns_roce_modify_port; + ib_dev->get_link_layer = hns_roce_get_link_layer; + ib_dev->get_netdev = hns_roce_get_netdev; + ib_dev->add_gid = hns_roce_add_gid; + ib_dev->del_gid = hns_roce_del_gid; + ib_dev->query_pkey = hns_roce_query_pkey; + ib_dev->alloc_ucontext = hns_roce_alloc_ucontext; + ib_dev->dealloc_ucontext = hns_roce_dealloc_ucontext; + ib_dev->mmap = hns_roce_mmap; + + /* PD */ + ib_dev->alloc_pd = hns_roce_alloc_pd; + ib_dev->dealloc_pd = hns_roce_dealloc_pd; + + /* AH */ + ib_dev->create_ah = hns_roce_create_ah; + ib_dev->query_ah = hns_roce_query_ah; + ib_dev->destroy_ah = hns_roce_destroy_ah; + + /* QP */ + ib_dev->create_qp = hns_roce_create_qp; + ib_dev->modify_qp = hns_roce_modify_qp; + ib_dev->query_qp = hr_dev->hw->query_qp; + ib_dev->destroy_qp = hr_dev->hw->destroy_qp; + ib_dev->post_send = hr_dev->hw->post_send; + ib_dev->post_recv = hr_dev->hw->post_recv; + + /* CQ */ + ib_dev->create_cq = hns_roce_ib_create_cq; + ib_dev->modify_cq = hr_dev->hw->modify_cq; + ib_dev->destroy_cq = hns_roce_ib_destroy_cq; + ib_dev->req_notify_cq = hr_dev->hw->req_notify_cq; + ib_dev->poll_cq = hr_dev->hw->poll_cq; + + /* MR */ + ib_dev->get_dma_mr = hns_roce_get_dma_mr; + ib_dev->reg_user_mr = hns_roce_reg_user_mr; + ib_dev->dereg_mr = hns_roce_dereg_mr; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_REREG_MR) { + ib_dev->rereg_user_mr = hns_roce_rereg_user_mr; + ib_dev->uverbs_cmd_mask |= (1ULL << IB_USER_VERBS_CMD_REREG_MR); + } + + /* OTHERS */ + ib_dev->get_port_immutable = hns_roce_port_immutable; + + ib_dev->driver_id = RDMA_DRIVER_HNS; + ret = ib_register_device(ib_dev, NULL); + if (ret) { + dev_err(dev, "ib_register_device failed!\n"); + return ret; + } + + ret = 
hns_roce_setup_mtu_mac(hr_dev); + if (ret) { + dev_err(dev, "setup_mtu_mac failed!\n"); + goto error_failed_setup_mtu_mac; + } + + iboe->nb.notifier_call = hns_roce_netdev_event; + ret = register_netdevice_notifier(&iboe->nb); + if (ret) { + dev_err(dev, "register_netdevice_notifier failed!\n"); + goto error_failed_setup_mtu_mac; + } + + return 0; + +error_failed_setup_mtu_mac: + ib_unregister_device(ib_dev); + + return ret; +} + +static int hns_roce_init_hem(struct hns_roce_dev *hr_dev) +{ + int ret; + struct device *dev = hr_dev->dev; + + ret = hns_roce_init_hem_table(hr_dev, &hr_dev->mr_table.mtt_table, + HEM_TYPE_MTT, hr_dev->caps.mtt_entry_sz, + hr_dev->caps.num_mtt_segs, 1); + if (ret) { + dev_err(dev, "Failed to init MTT context memory, aborting.\n"); + return ret; + } + + if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) { + ret = hns_roce_init_hem_table(hr_dev, + &hr_dev->mr_table.mtt_cqe_table, + HEM_TYPE_CQE, hr_dev->caps.mtt_entry_sz, + hr_dev->caps.num_cqe_segs, 1); + if (ret) { + dev_err(dev, "Failed to init MTT CQE context memory, aborting.\n"); + goto err_unmap_cqe; + } + } + + ret = hns_roce_init_hem_table(hr_dev, &hr_dev->mr_table.mtpt_table, + HEM_TYPE_MTPT, hr_dev->caps.mtpt_entry_sz, + hr_dev->caps.num_mtpts, 1); + if (ret) { + dev_err(dev, "Failed to init MTPT context memory, aborting.\n"); + goto err_unmap_mtt; + } + + ret = hns_roce_init_hem_table(hr_dev, &hr_dev->qp_table.qp_table, + HEM_TYPE_QPC, hr_dev->caps.qpc_entry_sz, + hr_dev->caps.num_qps, 1); + if (ret) { + dev_err(dev, "Failed to init QP context memory, aborting.\n"); + goto err_unmap_dmpt; + } + + ret = hns_roce_init_hem_table(hr_dev, &hr_dev->qp_table.irrl_table, + HEM_TYPE_IRRL, + hr_dev->caps.irrl_entry_sz * + hr_dev->caps.max_qp_init_rdma, + hr_dev->caps.num_qps, 1); + if (ret) { + dev_err(dev, "Failed to init irrl_table memory, aborting.\n"); + goto err_unmap_qp; + } + + if (hr_dev->caps.trrl_entry_sz) { + ret = hns_roce_init_hem_table(hr_dev, + &hr_dev->qp_table.trrl_table, + HEM_TYPE_TRRL, + hr_dev->caps.trrl_entry_sz * + hr_dev->caps.max_qp_dest_rdma, + hr_dev->caps.num_qps, 1); + if (ret) { + dev_err(dev, + "Failed to init trrl_table memory, aborting.\n"); + goto err_unmap_irrl; + } + } + + ret = hns_roce_init_hem_table(hr_dev, &hr_dev->cq_table.table, + HEM_TYPE_CQC, hr_dev->caps.cqc_entry_sz, + hr_dev->caps.num_cqs, 1); + if (ret) { + dev_err(dev, "Failed to init CQ context memory, aborting.\n"); + goto err_unmap_trrl; + } + + return 0; + +err_unmap_trrl: + if (hr_dev->caps.trrl_entry_sz) + hns_roce_cleanup_hem_table(hr_dev, + &hr_dev->qp_table.trrl_table); + +err_unmap_irrl: + hns_roce_cleanup_hem_table(hr_dev, &hr_dev->qp_table.irrl_table); + +err_unmap_qp: + hns_roce_cleanup_hem_table(hr_dev, &hr_dev->qp_table.qp_table); + +err_unmap_dmpt: + hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtpt_table); + +err_unmap_mtt: + if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) + hns_roce_cleanup_hem_table(hr_dev, + &hr_dev->mr_table.mtt_cqe_table); + +err_unmap_cqe: + hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtt_table); + + return ret; +} + +/** + * hns_roce_setup_hca - setup host channel adapter + * @hr_dev: pointer to hns roce device + * Return : int + */ +static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev) +{ + int ret; + struct device *dev = hr_dev->dev; + + spin_lock_init(&hr_dev->sm_lock); + spin_lock_init(&hr_dev->bt_cmd_lock); + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { + INIT_LIST_HEAD(&hr_dev->pgdir_list); + 
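+ /* pgdir_list, protected by pgdir_mutex, backs kernel record-doorbell allocation (see hns_roce_alloc_db() use below). */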
mutex_init(&hr_dev->pgdir_mutex); + } + + ret = hns_roce_init_uar_table(hr_dev); + if (ret) { + dev_err(dev, "Failed to initialize uar table. aborting\n"); + return ret; + } + + ret = hns_roce_uar_alloc(hr_dev, &hr_dev->priv_uar); + if (ret) { + dev_err(dev, "Failed to allocate priv_uar.\n"); + goto err_uar_table_free; + } + + ret = hns_roce_init_pd_table(hr_dev); + if (ret) { + dev_err(dev, "Failed to init protected domain table.\n"); + goto err_uar_alloc_free; + } + + ret = hns_roce_init_mr_table(hr_dev); + if (ret) { + dev_err(dev, "Failed to init memory region table.\n"); + goto err_pd_table_free; + } + + ret = hns_roce_init_cq_table(hr_dev); + if (ret) { + dev_err(dev, "Failed to init completion queue table.\n"); + goto err_mr_table_free; + } + + ret = hns_roce_init_qp_table(hr_dev); + if (ret) { + dev_err(dev, "Failed to init queue pair table.\n"); + goto err_cq_table_free; + } + + return 0; + +err_cq_table_free: + hns_roce_cleanup_cq_table(hr_dev); + +err_mr_table_free: + hns_roce_cleanup_mr_table(hr_dev); + +err_pd_table_free: + hns_roce_cleanup_pd_table(hr_dev); + +err_uar_alloc_free: + hns_roce_uar_free(hr_dev, &hr_dev->priv_uar); + +err_uar_table_free: + hns_roce_cleanup_uar_table(hr_dev); + return ret; +} + +int hns_roce_init(struct hns_roce_dev *hr_dev) +{ + int ret; + struct device *dev = hr_dev->dev; + + if (hr_dev->hw->reset) { + ret = hr_dev->hw->reset(hr_dev, true); + if (ret) { + dev_err(dev, "Reset RoCE engine failed!\n"); + return ret; + } + } + + if (hr_dev->hw->cmq_init) { + ret = hr_dev->hw->cmq_init(hr_dev); + if (ret) { + dev_err(dev, "Init RoCE Command Queue failed!\n"); + goto error_failed_cmq_init; + } + } + + ret = hr_dev->hw->hw_profile(hr_dev); + if (ret) { + dev_err(dev, "Get RoCE engine profile failed!\n"); + goto error_failed_cmd_init; + } + + ret = hns_roce_cmd_init(hr_dev); + if (ret) { + dev_err(dev, "cmd init failed!\n"); + goto error_failed_cmd_init; + } + + ret = hr_dev->hw->init_eq(hr_dev); + if (ret) { + dev_err(dev, "eq init failed!\n"); + goto error_failed_eq_table; + } + + if (hr_dev->cmd_mod) { + ret = hns_roce_cmd_use_events(hr_dev); + if (ret) { + dev_err(dev, "Switch to event-driven cmd failed!\n"); + goto error_failed_use_event; + } + } + + ret = hns_roce_init_hem(hr_dev); + if (ret) { + dev_err(dev, "init HEM(Hardware Entry Memory) failed!\n"); + goto error_failed_init_hem; + } + + ret = hns_roce_setup_hca(hr_dev); + if (ret) { + dev_err(dev, "setup hca failed!\n"); + goto error_failed_setup_hca; + } + + if (hr_dev->hw->hw_init) { + ret = hr_dev->hw->hw_init(hr_dev); + if (ret) { + dev_err(dev, "hw_init failed!\n"); + goto error_failed_engine_init; + } + } + + ret = hns_roce_register_device(hr_dev); + if (ret) + goto error_failed_register_device; + + return 0; + +error_failed_register_device: + if (hr_dev->hw->hw_exit) + hr_dev->hw->hw_exit(hr_dev); + +error_failed_engine_init: + hns_roce_cleanup_bitmap(hr_dev); + +error_failed_setup_hca: + hns_roce_cleanup_hem(hr_dev); + +error_failed_init_hem: + if (hr_dev->cmd_mod) + hns_roce_cmd_use_polling(hr_dev); + +error_failed_use_event: + hr_dev->hw->cleanup_eq(hr_dev); + +error_failed_eq_table: + hns_roce_cmd_cleanup(hr_dev); + +error_failed_cmd_init: + if (hr_dev->hw->cmq_exit) + hr_dev->hw->cmq_exit(hr_dev); + +error_failed_cmq_init: + if (hr_dev->hw->reset) { + ret = hr_dev->hw->reset(hr_dev, false); + if (ret) + dev_err(dev, "Dereset RoCE engine failed!\n"); + } + + return ret; +} +EXPORT_SYMBOL_GPL(hns_roce_init); + +void hns_roce_exit(struct hns_roce_dev *hr_dev) +{ + 
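+ /* Teardown mirrors hns_roce_init() in reverse: unregister the IB device, then hw_exit, bitmap and HEM cleanup, command-path shutdown and, finally, the optional engine de-reset. */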
hns_roce_unregister_device(hr_dev); + if (hr_dev->hw->hw_exit) + hr_dev->hw->hw_exit(hr_dev); + hns_roce_cleanup_bitmap(hr_dev); + hns_roce_cleanup_hem(hr_dev); + + if (hr_dev->cmd_mod) + hns_roce_cmd_use_polling(hr_dev); + + hr_dev->hw->cleanup_eq(hr_dev); + hns_roce_cmd_cleanup(hr_dev); + if (hr_dev->hw->cmq_exit) + hr_dev->hw->cmq_exit(hr_dev); + if (hr_dev->hw->reset) + hr_dev->hw->reset(hr_dev, false); +} +EXPORT_SYMBOL_GPL(hns_roce_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Wei Hu "); +MODULE_AUTHOR("Nenglong Zhao "); +MODULE_AUTHOR("Lijun Ou "); +MODULE_DESCRIPTION("HNS RoCE Driver"); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c new file mode 100644 index 0000000..f7256d8 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -0,0 +1,1209 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "hns_roce_device.h" +#include "hns_roce_cmd.h" +#include "hns_roce_hem.h" + +static u32 hw_index_to_key(unsigned long ind) +{ + return (u32)(ind >> 24) | (ind << 8); +} + +unsigned long key_to_hw_index(u32 key) +{ + return (key << 24) | (key >> 8); +} +EXPORT_SYMBOL_GPL(key_to_hw_index); + +static int hns_roce_sw2hw_mpt(struct hns_roce_dev *hr_dev, + struct hns_roce_cmd_mailbox *mailbox, + unsigned long mpt_index) +{ + return hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, mpt_index, 0, + HNS_ROCE_CMD_SW2HW_MPT, + HNS_ROCE_CMD_TIMEOUT_MSECS); +} + +int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev, + struct hns_roce_cmd_mailbox *mailbox, + unsigned long mpt_index) +{ + return hns_roce_cmd_mbox(hr_dev, 0, mailbox ? 
mailbox->dma : 0, + mpt_index, !mailbox, HNS_ROCE_CMD_HW2SW_MPT, + HNS_ROCE_CMD_TIMEOUT_MSECS); +} +EXPORT_SYMBOL_GPL(hns_roce_hw2sw_mpt); + +static int hns_roce_buddy_alloc(struct hns_roce_buddy *buddy, int order, + unsigned long *seg) +{ + int o; + u32 m; + + spin_lock(&buddy->lock); + + for (o = order; o <= buddy->max_order; ++o) { + if (buddy->num_free[o]) { + m = 1 << (buddy->max_order - o); + *seg = find_first_bit(buddy->bits[o], m); + if (*seg < m) + goto found; + } + } + spin_unlock(&buddy->lock); + return -1; + + found: + clear_bit(*seg, buddy->bits[o]); + --buddy->num_free[o]; + + while (o > order) { + --o; + *seg <<= 1; + set_bit(*seg ^ 1, buddy->bits[o]); + ++buddy->num_free[o]; + } + + spin_unlock(&buddy->lock); + + *seg <<= order; + return 0; +} + +static void hns_roce_buddy_free(struct hns_roce_buddy *buddy, unsigned long seg, + int order) +{ + seg >>= order; + + spin_lock(&buddy->lock); + + while (test_bit(seg ^ 1, buddy->bits[order])) { + clear_bit(seg ^ 1, buddy->bits[order]); + --buddy->num_free[order]; + seg >>= 1; + ++order; + } + + set_bit(seg, buddy->bits[order]); + ++buddy->num_free[order]; + + spin_unlock(&buddy->lock); +} + +static int hns_roce_buddy_init(struct hns_roce_buddy *buddy, int max_order) +{ + int i, s; + + buddy->max_order = max_order; + spin_lock_init(&buddy->lock); + buddy->bits = kcalloc(buddy->max_order + 1, + sizeof(*buddy->bits), + GFP_KERNEL); + buddy->num_free = kcalloc(buddy->max_order + 1, + sizeof(*buddy->num_free), + GFP_KERNEL); + if (!buddy->bits || !buddy->num_free) + goto err_out; + + for (i = 0; i <= buddy->max_order; ++i) { + s = BITS_TO_LONGS(1 << (buddy->max_order - i)); + buddy->bits[i] = kcalloc(s, sizeof(long), GFP_KERNEL | + __GFP_NOWARN); + if (!buddy->bits[i]) { + buddy->bits[i] = vzalloc(s * sizeof(long)); + if (!buddy->bits[i]) + goto err_out_free; + } + } + + set_bit(0, buddy->bits[buddy->max_order]); + buddy->num_free[buddy->max_order] = 1; + + return 0; + +err_out_free: + for (i = 0; i <= buddy->max_order; ++i) + kvfree(buddy->bits[i]); + +err_out: + kfree(buddy->bits); + kfree(buddy->num_free); + return -ENOMEM; +} + +static void hns_roce_buddy_cleanup(struct hns_roce_buddy *buddy) +{ + int i; + + for (i = 0; i <= buddy->max_order; ++i) + kvfree(buddy->bits[i]); + + kfree(buddy->bits); + kfree(buddy->num_free); +} + +static int hns_roce_alloc_mtt_range(struct hns_roce_dev *hr_dev, int order, + unsigned long *seg, u32 mtt_type) +{ + struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; + struct hns_roce_hem_table *table; + struct hns_roce_buddy *buddy; + int ret; + + if (mtt_type == MTT_TYPE_WQE) { + buddy = &mr_table->mtt_buddy; + table = &mr_table->mtt_table; + } else { + buddy = &mr_table->mtt_cqe_buddy; + table = &mr_table->mtt_cqe_table; + } + + ret = hns_roce_buddy_alloc(buddy, order, seg); + if (ret == -1) + return -1; + + if (hns_roce_table_get_range(hr_dev, table, *seg, + *seg + (1 << order) - 1)) { + hns_roce_buddy_free(buddy, *seg, order); + return -1; + } + + return 0; +} + +int hns_roce_mtt_init(struct hns_roce_dev *hr_dev, int npages, int page_shift, + struct hns_roce_mtt *mtt) +{ + int ret; + int i; + + /* Page num is zero, correspond to DMA memory register */ + if (!npages) { + mtt->order = -1; + mtt->page_shift = HNS_ROCE_HEM_PAGE_SHIFT; + return 0; + } + + /* Note: if page_shift is zero, FAST memory register */ + mtt->page_shift = page_shift; + + /* Compute MTT entry necessary */ + for (mtt->order = 0, i = HNS_ROCE_MTT_ENTRY_PER_SEG; i < npages; + i <<= 1) + ++mtt->order; + + /* Allocate MTT entry */ + 
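+ /* first_seg comes from the per-type buddy allocator (WQE vs. CQE MTT); see hns_roce_alloc_mtt_range(). */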
ret = hns_roce_alloc_mtt_range(hr_dev, mtt->order, &mtt->first_seg, + mtt->mtt_type); + if (ret == -1) + return -ENOMEM; + + return 0; +} + +void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt) +{ + struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; + + if (mtt->order < 0) + return; + + if (mtt->mtt_type == MTT_TYPE_WQE) { + hns_roce_buddy_free(&mr_table->mtt_buddy, mtt->first_seg, + mtt->order); + hns_roce_table_put_range(hr_dev, &mr_table->mtt_table, + mtt->first_seg, + mtt->first_seg + (1 << mtt->order) - 1); + } else { + hns_roce_buddy_free(&mr_table->mtt_cqe_buddy, mtt->first_seg, + mtt->order); + hns_roce_table_put_range(hr_dev, &mr_table->mtt_cqe_table, + mtt->first_seg, + mtt->first_seg + (1 << mtt->order) - 1); + } +} +EXPORT_SYMBOL_GPL(hns_roce_mtt_cleanup); + +static void hns_roce_loop_free(struct hns_roce_dev *hr_dev, + struct hns_roce_mr *mr, int err_loop_index, + int loop_i, int loop_j) +{ + struct device *dev = hr_dev->dev; + u32 mhop_num; + u32 pbl_bt_sz; + u64 bt_idx; + int i, j; + + pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); + mhop_num = hr_dev->caps.pbl_hop_num; + + i = loop_i; + if (mhop_num == 3 && err_loop_index == 2) { + for (; i >= 0; i--) { + dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], + mr->pbl_l1_dma_addr[i]); + + for (j = 0; j < pbl_bt_sz / 8; j++) { + if (i == loop_i && j >= loop_j) + break; + + bt_idx = i * pbl_bt_sz / 8 + j; + dma_free_coherent(dev, pbl_bt_sz, + mr->pbl_bt_l2[bt_idx], + mr->pbl_l2_dma_addr[bt_idx]); + } + } + } else if (mhop_num == 3 && err_loop_index == 1) { + for (i -= 1; i >= 0; i--) { + dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], + mr->pbl_l1_dma_addr[i]); + + for (j = 0; j < pbl_bt_sz / 8; j++) { + bt_idx = i * pbl_bt_sz / 8 + j; + dma_free_coherent(dev, pbl_bt_sz, + mr->pbl_bt_l2[bt_idx], + mr->pbl_l2_dma_addr[bt_idx]); + } + } + } else if (mhop_num == 2 && err_loop_index == 1) { + for (i -= 1; i >= 0; i--) + dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], + mr->pbl_l1_dma_addr[i]); + } else { + dev_warn(dev, "not support: mhop_num=%d, err_loop_index=%d.", + mhop_num, err_loop_index); + return; + } + + dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l0, mr->pbl_l0_dma_addr); + mr->pbl_bt_l0 = NULL; + mr->pbl_l0_dma_addr = 0; +} + +/* PBL multi hop addressing */ +static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages, + struct hns_roce_mr *mr) +{ + struct device *dev = hr_dev->dev; + int mr_alloc_done = 0; + int npages_allocated; + int i = 0, j = 0; + u32 pbl_bt_sz; + u32 mhop_num; + u64 pbl_last_bt_num; + u64 pbl_bt_cnt = 0; + u64 bt_idx; + u64 size; + + mhop_num = hr_dev->caps.pbl_hop_num; + pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); + pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8); + + if (mhop_num == HNS_ROCE_HOP_NUM_0) + return 0; + + /* hop_num = 1 */ + if (mhop_num == 1) { + if (npages > pbl_bt_sz / 8) { + dev_err(dev, "npages %d is larger than buf_pg_sz!", + npages); + return -EINVAL; + } + mr->pbl_buf = dma_alloc_coherent(dev, npages * 8, + &(mr->pbl_dma_addr), + GFP_KERNEL); + if (!mr->pbl_buf) + return -ENOMEM; + + mr->pbl_size = npages; + mr->pbl_ba = mr->pbl_dma_addr; + mr->pbl_hop_num = hr_dev->caps.pbl_hop_num; + mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz; + mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz; + return 0; + } + + mr->pbl_l1_dma_addr = kcalloc(pbl_bt_sz / 8, + sizeof(*mr->pbl_l1_dma_addr), + GFP_KERNEL); + if (!mr->pbl_l1_dma_addr) + return -ENOMEM; + + mr->pbl_bt_l1 = kcalloc(pbl_bt_sz / 8, 
sizeof(*mr->pbl_bt_l1), + GFP_KERNEL); + if (!mr->pbl_bt_l1) + goto err_kcalloc_bt_l1; + + if (mhop_num == 3) { + mr->pbl_l2_dma_addr = kcalloc(pbl_last_bt_num, + sizeof(*mr->pbl_l2_dma_addr), + GFP_KERNEL); + if (!mr->pbl_l2_dma_addr) + goto err_kcalloc_l2_dma; + + mr->pbl_bt_l2 = kcalloc(pbl_last_bt_num, + sizeof(*mr->pbl_bt_l2), + GFP_KERNEL); + if (!mr->pbl_bt_l2) + goto err_kcalloc_bt_l2; + } + + /* alloc L0 BT */ + mr->pbl_bt_l0 = dma_alloc_coherent(dev, pbl_bt_sz, + &(mr->pbl_l0_dma_addr), + GFP_KERNEL); + if (!mr->pbl_bt_l0) + goto err_dma_alloc_l0; + + if (mhop_num == 2) { + /* alloc L1 BT */ + for (i = 0; i < pbl_bt_sz / 8; i++) { + if (pbl_bt_cnt + 1 < pbl_last_bt_num) { + size = pbl_bt_sz; + } else { + npages_allocated = i * (pbl_bt_sz / 8); + size = (npages - npages_allocated) * 8; + } + mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, size, + &(mr->pbl_l1_dma_addr[i]), + GFP_KERNEL); + if (!mr->pbl_bt_l1[i]) { + hns_roce_loop_free(hr_dev, mr, 1, i, 0); + goto err_dma_alloc_l0; + } + + *(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i]; + + pbl_bt_cnt++; + if (pbl_bt_cnt >= pbl_last_bt_num) + break; + } + } else if (mhop_num == 3) { + /* alloc L1, L2 BT */ + for (i = 0; i < pbl_bt_sz / 8; i++) { + mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, pbl_bt_sz, + &(mr->pbl_l1_dma_addr[i]), + GFP_KERNEL); + if (!mr->pbl_bt_l1[i]) { + hns_roce_loop_free(hr_dev, mr, 1, i, 0); + goto err_dma_alloc_l0; + } + + *(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i]; + + for (j = 0; j < pbl_bt_sz / 8; j++) { + bt_idx = i * pbl_bt_sz / 8 + j; + + if (pbl_bt_cnt + 1 < pbl_last_bt_num) { + size = pbl_bt_sz; + } else { + npages_allocated = bt_idx * + (pbl_bt_sz / 8); + size = (npages - npages_allocated) * 8; + } + mr->pbl_bt_l2[bt_idx] = dma_alloc_coherent( + dev, size, + &(mr->pbl_l2_dma_addr[bt_idx]), + GFP_KERNEL); + if (!mr->pbl_bt_l2[bt_idx]) { + hns_roce_loop_free(hr_dev, mr, 2, i, j); + goto err_dma_alloc_l0; + } + + *(mr->pbl_bt_l1[i] + j) = + mr->pbl_l2_dma_addr[bt_idx]; + + pbl_bt_cnt++; + if (pbl_bt_cnt >= pbl_last_bt_num) { + mr_alloc_done = 1; + break; + } + } + + if (mr_alloc_done) + break; + } + } + + mr->l0_chunk_last_num = i + 1; + if (mhop_num == 3) + mr->l1_chunk_last_num = j + 1; + + mr->pbl_size = npages; + mr->pbl_ba = mr->pbl_l0_dma_addr; + mr->pbl_hop_num = hr_dev->caps.pbl_hop_num; + mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz; + mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz; + + return 0; + +err_dma_alloc_l0: + kfree(mr->pbl_bt_l2); + mr->pbl_bt_l2 = NULL; + +err_kcalloc_bt_l2: + kfree(mr->pbl_l2_dma_addr); + mr->pbl_l2_dma_addr = NULL; + +err_kcalloc_l2_dma: + kfree(mr->pbl_bt_l1); + mr->pbl_bt_l1 = NULL; + +err_kcalloc_bt_l1: + kfree(mr->pbl_l1_dma_addr); + mr->pbl_l1_dma_addr = NULL; + + return -ENOMEM; +} + +static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova, + u64 size, u32 access, int npages, + struct hns_roce_mr *mr) +{ + struct device *dev = hr_dev->dev; + unsigned long index = 0; + int ret = 0; + + /* Allocate a key for mr from mr_table */ + ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index); + if (ret == -1) + return -ENOMEM; + + mr->iova = iova; /* MR va starting addr */ + mr->size = size; /* MR addr range */ + mr->pd = pd; /* MR num */ + mr->access = access; /* MR access permit */ + mr->enabled = 0; /* MR active status */ + mr->key = hw_index_to_key(index); /* MR key */ + + if (size == ~0ull) { + mr->type = MR_TYPE_DMA; + mr->pbl_buf = NULL; + mr->pbl_dma_addr = 0; + /* PBL multi-hop addressing parameters */ + mr->pbl_bt_l2 = NULL; + 
mr->pbl_bt_l1 = NULL; + mr->pbl_bt_l0 = NULL; + mr->pbl_l2_dma_addr = NULL; + mr->pbl_l1_dma_addr = NULL; + mr->pbl_l0_dma_addr = 0; + } else { + mr->type = MR_TYPE_MR; + if (!hr_dev->caps.pbl_hop_num) { + mr->pbl_buf = dma_alloc_coherent(dev, npages * 8, + &(mr->pbl_dma_addr), + GFP_KERNEL); + if (!mr->pbl_buf) + return -ENOMEM; + } else { + ret = hns_roce_mhop_alloc(hr_dev, npages, mr); + } + } + + return ret; +} + +static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev, + struct hns_roce_mr *mr) +{ + struct device *dev = hr_dev->dev; + int npages_allocated; + int npages; + int i, j; + u32 pbl_bt_sz; + u32 mhop_num; + u64 bt_idx; + + npages = ib_umem_page_count(mr->umem); + pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); + mhop_num = hr_dev->caps.pbl_hop_num; + + if (mhop_num == HNS_ROCE_HOP_NUM_0) + return; + + /* hop_num = 1 */ + if (mhop_num == 1) { + dma_free_coherent(dev, (unsigned int)(npages * 8), + mr->pbl_buf, mr->pbl_dma_addr); + return; + } + + dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l0, + mr->pbl_l0_dma_addr); + + if (mhop_num == 2) { + for (i = 0; i < mr->l0_chunk_last_num; i++) { + if (i == mr->l0_chunk_last_num - 1) { + npages_allocated = i * (pbl_bt_sz / 8); + + dma_free_coherent(dev, + (npages - npages_allocated) * 8, + mr->pbl_bt_l1[i], + mr->pbl_l1_dma_addr[i]); + + break; + } + + dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], + mr->pbl_l1_dma_addr[i]); + } + } else if (mhop_num == 3) { + for (i = 0; i < mr->l0_chunk_last_num; i++) { + dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], + mr->pbl_l1_dma_addr[i]); + + for (j = 0; j < pbl_bt_sz / 8; j++) { + bt_idx = i * (pbl_bt_sz / 8) + j; + + if ((i == mr->l0_chunk_last_num - 1) + && j == mr->l1_chunk_last_num - 1) { + npages_allocated = bt_idx * + (pbl_bt_sz / 8); + + dma_free_coherent(dev, + (npages - npages_allocated) * 8, + mr->pbl_bt_l2[bt_idx], + mr->pbl_l2_dma_addr[bt_idx]); + + break; + } + + dma_free_coherent(dev, pbl_bt_sz, + mr->pbl_bt_l2[bt_idx], + mr->pbl_l2_dma_addr[bt_idx]); + } + } + } + + kfree(mr->pbl_bt_l1); + kfree(mr->pbl_l1_dma_addr); + mr->pbl_bt_l1 = NULL; + mr->pbl_l1_dma_addr = NULL; + if (mhop_num == 3) { + kfree(mr->pbl_bt_l2); + kfree(mr->pbl_l2_dma_addr); + mr->pbl_bt_l2 = NULL; + mr->pbl_l2_dma_addr = NULL; + } +} + +static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, + struct hns_roce_mr *mr) +{ + struct device *dev = hr_dev->dev; + int npages = 0; + int ret; + + if (mr->enabled) { + ret = hns_roce_hw2sw_mpt(hr_dev, NULL, key_to_hw_index(mr->key) + & (hr_dev->caps.num_mtpts - 1)); + if (ret) + dev_warn(dev, "HW2SW_MPT failed (%d)\n", ret); + } + + if (mr->size != ~0ULL) { + npages = ib_umem_page_count(mr->umem); + + if (!hr_dev->caps.pbl_hop_num) + dma_free_coherent(dev, (unsigned int)(npages * 8), + mr->pbl_buf, mr->pbl_dma_addr); + else + hns_roce_mhop_free(hr_dev, mr); + } + + if (mr->enabled) + hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table, + key_to_hw_index(mr->key)); + + hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, + key_to_hw_index(mr->key), BITMAP_NO_RR); +} + +static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, + struct hns_roce_mr *mr) +{ + int ret; + unsigned long mtpt_idx = key_to_hw_index(mr->key); + struct device *dev = hr_dev->dev; + struct hns_roce_cmd_mailbox *mailbox; + struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; + + /* Prepare HEM entry memory */ + ret = hns_roce_table_get(hr_dev, &mr_table->mtpt_table, mtpt_idx); + if (ret) + return ret; + + /* Allocate mailbox memory */ + mailbox = 
hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) { + ret = PTR_ERR(mailbox); + goto err_table; + } + + ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx); + if (ret) { + dev_err(dev, "Write mtpt fail!\n"); + goto err_page; + } + + ret = hns_roce_sw2hw_mpt(hr_dev, mailbox, + mtpt_idx & (hr_dev->caps.num_mtpts - 1)); + if (ret) { + dev_err(dev, "SW2HW_MPT failed (%d)\n", ret); + goto err_page; + } + + mr->enabled = 1; + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return 0; + +err_page: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + +err_table: + hns_roce_table_put(hr_dev, &mr_table->mtpt_table, mtpt_idx); + return ret; +} + +static int hns_roce_write_mtt_chunk(struct hns_roce_dev *hr_dev, + struct hns_roce_mtt *mtt, u32 start_index, + u32 npages, u64 *page_list) +{ + struct hns_roce_hem_table *table; + dma_addr_t dma_handle; + __le64 *mtts; + u32 s = start_index * sizeof(u64); + u32 bt_page_size; + u32 i; + + if (mtt->mtt_type == MTT_TYPE_WQE) + bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT); + else + bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT); + + /* All MTTs must fit in the same page */ + if (start_index / (bt_page_size / sizeof(u64)) != + (start_index + npages - 1) / (bt_page_size / sizeof(u64))) + return -EINVAL; + + if (start_index & (HNS_ROCE_MTT_ENTRY_PER_SEG - 1)) + return -EINVAL; + + if (mtt->mtt_type == MTT_TYPE_WQE) + table = &hr_dev->mr_table.mtt_table; + else + table = &hr_dev->mr_table.mtt_cqe_table; + + mtts = hns_roce_table_find(hr_dev, table, + mtt->first_seg + s / hr_dev->caps.mtt_entry_sz, + &dma_handle); + if (!mtts) + return -ENOMEM; + + /* Save page addr, low 12 bits : 0 */ + for (i = 0; i < npages; ++i) { + if (!hr_dev->caps.mtt_hop_num) + mtts[i] = cpu_to_le64(page_list[i] >> PAGE_ADDR_SHIFT); + else + mtts[i] = cpu_to_le64(page_list[i]); + } + + return 0; +} + +static int hns_roce_write_mtt(struct hns_roce_dev *hr_dev, + struct hns_roce_mtt *mtt, u32 start_index, + u32 npages, u64 *page_list) +{ + int chunk; + int ret; + u32 bt_page_size; + + if (mtt->order < 0) + return -EINVAL; + + if (mtt->mtt_type == MTT_TYPE_WQE) + bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT); + else + bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT); + + while (npages > 0) { + chunk = min_t(int, bt_page_size / sizeof(u64), npages); + + ret = hns_roce_write_mtt_chunk(hr_dev, mtt, start_index, chunk, + page_list); + if (ret) + return ret; + + npages -= chunk; + start_index += chunk; + page_list += chunk; + } + + return 0; +} + +int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev, + struct hns_roce_mtt *mtt, struct hns_roce_buf *buf) +{ + u64 *page_list; + int ret; + u32 i; + + page_list = kmalloc_array(buf->npages, sizeof(*page_list), GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + for (i = 0; i < buf->npages; ++i) { + if (buf->nbufs == 1) + page_list[i] = buf->direct.map + (i << buf->page_shift); + else + page_list[i] = buf->page_list[i].map; + + } + ret = hns_roce_write_mtt(hr_dev, mtt, 0, buf->npages, page_list); + + kfree(page_list); + + return ret; +} + +int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; + int ret; + + ret = hns_roce_bitmap_init(&mr_table->mtpt_bitmap, + hr_dev->caps.num_mtpts, + hr_dev->caps.num_mtpts - 1, + hr_dev->caps.reserved_mrws, 0); + if (ret) + return ret; + + ret = hns_roce_buddy_init(&mr_table->mtt_buddy, + ilog2(hr_dev->caps.num_mtt_segs)); + if (ret) + goto err_buddy; + + if 
(hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) { + ret = hns_roce_buddy_init(&mr_table->mtt_cqe_buddy, + ilog2(hr_dev->caps.num_cqe_segs)); + if (ret) + goto err_buddy_cqe; + } + return 0; + +err_buddy_cqe: + hns_roce_buddy_cleanup(&mr_table->mtt_buddy); + +err_buddy: + hns_roce_bitmap_cleanup(&mr_table->mtpt_bitmap); + return ret; +} + +void hns_roce_cleanup_mr_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; + + hns_roce_buddy_cleanup(&mr_table->mtt_buddy); + if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) + hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy); + hns_roce_bitmap_cleanup(&mr_table->mtpt_bitmap); +} + +struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct hns_roce_mr *mr; + int ret; + + mr = kmalloc(sizeof(*mr), GFP_KERNEL); + if (mr == NULL) + return ERR_PTR(-ENOMEM); + + /* Allocate memory region key */ + ret = hns_roce_mr_alloc(to_hr_dev(pd->device), to_hr_pd(pd)->pdn, 0, + ~0ULL, acc, 0, mr); + if (ret) + goto err_free; + + ret = hns_roce_mr_enable(to_hr_dev(pd->device), mr); + if (ret) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->key; + mr->umem = NULL; + + return &mr->ibmr; + +err_mr: + hns_roce_mr_free(to_hr_dev(pd->device), mr); + +err_free: + kfree(mr); + return ERR_PTR(ret); +} + +int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev, + struct hns_roce_mtt *mtt, struct ib_umem *umem) +{ + struct device *dev = hr_dev->dev; + struct scatterlist *sg; + unsigned int order; + int i, k, entry; + int npage = 0; + int ret = 0; + int len; + u64 page_addr; + u64 *pages; + u32 bt_page_size; + u32 n; + + order = mtt->mtt_type == MTT_TYPE_WQE ? hr_dev->caps.mtt_ba_pg_sz : + hr_dev->caps.cqe_ba_pg_sz; + bt_page_size = 1 << (order + PAGE_SHIFT); + + pages = (u64 *) __get_free_pages(GFP_KERNEL, order); + if (!pages) + return -ENOMEM; + + i = n = 0; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> PAGE_SHIFT; + for (k = 0; k < len; ++k) { + page_addr = + sg_dma_address(sg) + (k << umem->page_shift); + if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) { + if (page_addr & ((1 << mtt->page_shift) - 1)) { + dev_err(dev, "page_addr 0x%llx is not page_shift %d alignment!\n", + page_addr, mtt->page_shift); + ret = -EINVAL; + goto out; + } + pages[i++] = page_addr; + } + npage++; + if (i == bt_page_size / sizeof(u64)) { + ret = hns_roce_write_mtt(hr_dev, mtt, n, i, + pages); + if (ret) + goto out; + n += i; + i = 0; + } + } + } + + if (i) + ret = hns_roce_write_mtt(hr_dev, mtt, n, i, pages); + +out: + free_pages((unsigned long) pages, order); + return ret; +} + +static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev, + struct hns_roce_mr *mr, + struct ib_umem *umem) +{ + struct scatterlist *sg; + int i = 0, j = 0, k; + int entry; + int len; + u64 page_addr; + u32 pbl_bt_sz; + + if (hr_dev->caps.pbl_hop_num == HNS_ROCE_HOP_NUM_0) + return 0; + + pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> PAGE_SHIFT; + for (k = 0; k < len; ++k) { + page_addr = sg_dma_address(sg) + + (k << umem->page_shift); + + if (!hr_dev->caps.pbl_hop_num) { + mr->pbl_buf[i++] = page_addr >> 12; + } else if (hr_dev->caps.pbl_hop_num == 1) { + mr->pbl_buf[i++] = page_addr; + } else { + if (hr_dev->caps.pbl_hop_num == 2) + mr->pbl_bt_l1[i][j] = page_addr; + else if (hr_dev->caps.pbl_hop_num == 3) + mr->pbl_bt_l2[i][j] = page_addr; + + j++; + if (j >= (pbl_bt_sz / 8)) { + i++; + j = 0; + 
} + } + } + } + + /* Memory barrier */ + mb(); + + return 0; +} + +struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(pd->device); + struct device *dev = hr_dev->dev; + struct hns_roce_mr *mr; + int bt_size; + int ret; + int n; + int i; + + mr = kmalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = ib_umem_get(pd->uobject->context, start, length, + access_flags, 0); + if (IS_ERR(mr->umem)) { + ret = PTR_ERR(mr->umem); + goto err_free; + } + + n = ib_umem_page_count(mr->umem); + if (mr->umem->page_shift != HNS_ROCE_HEM_PAGE_SHIFT) { + dev_err(dev, "Just support 4K page size but is 0x%lx now!\n", + BIT(mr->umem->page_shift)); + ret = -EINVAL; + goto err_umem; + } + + if (!hr_dev->caps.pbl_hop_num) { + if (n > HNS_ROCE_MAX_MTPT_PBL_NUM) { + dev_err(dev, + " MR len %lld err. MR is limited to 4G at most!\n", + length); + ret = -EINVAL; + goto err_umem; + } + } else { + int pbl_size = 1; + + bt_size = (1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT)) / 8; + for (i = 0; i < hr_dev->caps.pbl_hop_num; i++) + pbl_size *= bt_size; + if (n > pbl_size) { + dev_err(dev, + " MR len %lld err. MR page num is limited to %d!\n", + length, pbl_size); + ret = -EINVAL; + goto err_umem; + } + } + + ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, virt_addr, length, + access_flags, n, mr); + if (ret) + goto err_umem; + + ret = hns_roce_ib_umem_write_mr(hr_dev, mr, mr->umem); + if (ret) + goto err_mr; + + ret = hns_roce_mr_enable(hr_dev, mr); + if (ret) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->key; + + return &mr->ibmr; + +err_mr: + hns_roce_mr_free(hr_dev, mr); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + return ERR_PTR(ret); +} + +int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); + struct hns_roce_mr *mr = to_hr_mr(ibmr); + struct hns_roce_cmd_mailbox *mailbox; + struct device *dev = hr_dev->dev; + unsigned long mtpt_idx; + u32 pdn = 0; + int npages; + int ret; + + if (!mr->enabled) + return -EINVAL; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + mtpt_idx = key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1); + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, mtpt_idx, 0, + HNS_ROCE_CMD_QUERY_MPT, + HNS_ROCE_CMD_TIMEOUT_MSECS); + if (ret) + goto free_cmd_mbox; + + ret = hns_roce_hw2sw_mpt(hr_dev, NULL, mtpt_idx); + if (ret) + dev_warn(dev, "HW2SW_MPT failed (%d)\n", ret); + + mr->enabled = 0; + + if (flags & IB_MR_REREG_PD) + pdn = to_hr_pd(pd)->pdn; + + if (flags & IB_MR_REREG_TRANS) { + if (mr->size != ~0ULL) { + npages = ib_umem_page_count(mr->umem); + + if (hr_dev->caps.pbl_hop_num) + hns_roce_mhop_free(hr_dev, mr); + else + dma_free_coherent(dev, npages * 8, mr->pbl_buf, + mr->pbl_dma_addr); + } + ib_umem_release(mr->umem); + + mr->umem = ib_umem_get(ibmr->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(mr->umem)) { + ret = PTR_ERR(mr->umem); + mr->umem = NULL; + goto free_cmd_mbox; + } + npages = ib_umem_page_count(mr->umem); + + if (hr_dev->caps.pbl_hop_num) { + ret = hns_roce_mhop_alloc(hr_dev, npages, mr); + if (ret) + goto release_umem; + } else { + mr->pbl_buf = dma_alloc_coherent(dev, npages * 8, + &(mr->pbl_dma_addr), + GFP_KERNEL); + if 
(!mr->pbl_buf) { + ret = -ENOMEM; + goto release_umem; + } + } + } + + ret = hr_dev->hw->rereg_write_mtpt(hr_dev, mr, flags, pdn, + mr_access_flags, virt_addr, + length, mailbox->buf); + if (ret) { + if (flags & IB_MR_REREG_TRANS) + goto release_umem; + else + goto free_cmd_mbox; + } + + if (flags & IB_MR_REREG_TRANS) { + ret = hns_roce_ib_umem_write_mr(hr_dev, mr, mr->umem); + if (ret) { + if (mr->size != ~0ULL) { + npages = ib_umem_page_count(mr->umem); + + if (hr_dev->caps.pbl_hop_num) + hns_roce_mhop_free(hr_dev, mr); + else + dma_free_coherent(dev, npages * 8, + mr->pbl_buf, + mr->pbl_dma_addr); + } + + goto release_umem; + } + } + + ret = hns_roce_sw2hw_mpt(hr_dev, mailbox, mtpt_idx); + if (ret) { + dev_err(dev, "SW2HW_MPT failed (%d)\n", ret); + goto release_umem; + } + + mr->enabled = 1; + if (flags & IB_MR_REREG_ACCESS) + mr->access = mr_access_flags; + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return 0; + +release_umem: + ib_umem_release(mr->umem); + +free_cmd_mbox: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return ret; +} + +int hns_roce_dereg_mr(struct ib_mr *ibmr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); + struct hns_roce_mr *mr = to_hr_mr(ibmr); + int ret = 0; + + if (hr_dev->hw->dereg_mr) { + ret = hr_dev->hw->dereg_mr(hr_dev, mr); + } else { + hns_roce_mr_free(hr_dev, mr); + + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr); + } + + return ret; +} diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c new file mode 100644 index 0000000..4b41e04 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "hns_roce_device.h" + +static int hns_roce_pd_alloc(struct hns_roce_dev *hr_dev, unsigned long *pdn) +{ + return hns_roce_bitmap_alloc(&hr_dev->pd_bitmap, pdn); +} + +static void hns_roce_pd_free(struct hns_roce_dev *hr_dev, unsigned long pdn) +{ + hns_roce_bitmap_free(&hr_dev->pd_bitmap, pdn, BITMAP_NO_RR); +} + +int hns_roce_init_pd_table(struct hns_roce_dev *hr_dev) +{ + return hns_roce_bitmap_init(&hr_dev->pd_bitmap, hr_dev->caps.num_pds, + hr_dev->caps.num_pds - 1, + hr_dev->caps.reserved_pds, 0); +} + +void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev) +{ + hns_roce_bitmap_cleanup(&hr_dev->pd_bitmap); +} + +struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + struct device *dev = hr_dev->dev; + struct hns_roce_pd *pd; + int ret; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + ret = hns_roce_pd_alloc(to_hr_dev(ib_dev), &pd->pdn); + if (ret) { + kfree(pd); + dev_err(dev, "[alloc_pd]hns_roce_pd_alloc failed!\n"); + return ERR_PTR(ret); + } + + if (context) { + struct hns_roce_ib_alloc_pd_resp uresp = {.pdn = pd->pdn}; + + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { + hns_roce_pd_free(to_hr_dev(ib_dev), pd->pdn); + dev_err(dev, "[alloc_pd]ib_copy_to_udata failed!\n"); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } + + return &pd->ibpd; +} +EXPORT_SYMBOL_GPL(hns_roce_alloc_pd); + +int hns_roce_dealloc_pd(struct ib_pd *pd) +{ + hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn); + kfree(to_hr_pd(pd)); + + return 0; +} +EXPORT_SYMBOL_GPL(hns_roce_dealloc_pd); + +int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar) +{ + struct resource *res; + int ret = 0; + + /* Using bitmap to manager UAR index */ + ret = hns_roce_bitmap_alloc(&hr_dev->uar_table.bitmap, &uar->index); + if (ret == -1) + return -ENOMEM; + + if (uar->index > 0) + uar->index = (uar->index - 1) % + (hr_dev->caps.phy_num_uars - 1) + 1; + + if (!dev_is_pci(hr_dev->dev)) { + res = platform_get_resource(hr_dev->pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&hr_dev->pdev->dev, "memory resource not found!\n"); + return -EINVAL; + } + uar->pfn = ((res->start) >> PAGE_SHIFT) + uar->index; + } else { + uar->pfn = ((pci_resource_start(hr_dev->pci_dev, 2)) + >> PAGE_SHIFT); + } + + return 0; +} + +void hns_roce_uar_free(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar) +{ + hns_roce_bitmap_free(&hr_dev->uar_table.bitmap, uar->index, + BITMAP_NO_RR); +} + +int hns_roce_init_uar_table(struct hns_roce_dev *hr_dev) +{ + return hns_roce_bitmap_init(&hr_dev->uar_table.bitmap, + hr_dev->caps.num_uars, + hr_dev->caps.num_uars - 1, + hr_dev->caps.reserved_uars, 0); +} + +void hns_roce_cleanup_uar_table(struct hns_roce_dev *hr_dev) +{ + hns_roce_bitmap_cleanup(&hr_dev->uar_table.bitmap); +} diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c new file mode 100644 index 0000000..baaf906 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -0,0 +1,1081 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "hns_roce_common.h" +#include "hns_roce_device.h" +#include "hns_roce_hem.h" +#include + +#define SQP_NUM (2 * HNS_ROCE_MAX_PORTS) + +void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type) +{ + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; + struct device *dev = hr_dev->dev; + struct hns_roce_qp *qp; + + spin_lock(&qp_table->lock); + + qp = __hns_roce_qp_lookup(hr_dev, qpn); + if (qp) + atomic_inc(&qp->refcount); + + spin_unlock(&qp_table->lock); + + if (!qp) { + dev_warn(dev, "Async event for bogus QP %08x\n", qpn); + return; + } + + qp->event(qp, (enum hns_roce_event)event_type); + + if (atomic_dec_and_test(&qp->refcount)) + complete(&qp->free); +} +EXPORT_SYMBOL_GPL(hns_roce_qp_event); + +static void hns_roce_ib_qp_event(struct hns_roce_qp *hr_qp, + enum hns_roce_event type) +{ + struct ib_event event; + struct ib_qp *ibqp = &hr_qp->ibqp; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case HNS_ROCE_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + dev_dbg(ibqp->device->dev.parent, "roce_ib: Unexpected event type %d on QP %06lx\n", + type, hr_qp->qpn); + return; + } + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int hns_roce_reserve_range_qp(struct hns_roce_dev *hr_dev, int cnt, + int align, unsigned long *base) +{ + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; + + return hns_roce_bitmap_alloc_range(&qp_table->bitmap, cnt, align, base); +} + +enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state) +{ + switch (state) { + case 
IB_QPS_RESET: + return HNS_ROCE_QP_STATE_RST; + case IB_QPS_INIT: + return HNS_ROCE_QP_STATE_INIT; + case IB_QPS_RTR: + return HNS_ROCE_QP_STATE_RTR; + case IB_QPS_RTS: + return HNS_ROCE_QP_STATE_RTS; + case IB_QPS_SQD: + return HNS_ROCE_QP_STATE_SQD; + case IB_QPS_ERR: + return HNS_ROCE_QP_STATE_ERR; + default: + return HNS_ROCE_QP_NUM_STATE; + } +} +EXPORT_SYMBOL_GPL(to_hns_roce_state); + +static int hns_roce_gsi_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn, + struct hns_roce_qp *hr_qp) +{ + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; + int ret; + + if (!qpn) + return -EINVAL; + + hr_qp->qpn = qpn; + + spin_lock_irq(&qp_table->lock); + ret = radix_tree_insert(&hr_dev->qp_table_tree, + hr_qp->qpn & (hr_dev->caps.num_qps - 1), hr_qp); + spin_unlock_irq(&qp_table->lock); + if (ret) { + dev_err(hr_dev->dev, "QPC radix_tree_insert failed\n"); + goto err_put_irrl; + } + + atomic_set(&hr_qp->refcount, 1); + init_completion(&hr_qp->free); + + return 0; + +err_put_irrl: + + return ret; +} + +static int hns_roce_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn, + struct hns_roce_qp *hr_qp) +{ + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; + struct device *dev = hr_dev->dev; + int ret; + + if (!qpn) + return -EINVAL; + + hr_qp->qpn = qpn; + + /* Alloc memory for QPC */ + ret = hns_roce_table_get(hr_dev, &qp_table->qp_table, hr_qp->qpn); + if (ret) { + dev_err(dev, "QPC table get failed\n"); + goto err_out; + } + + /* Alloc memory for IRRL */ + ret = hns_roce_table_get(hr_dev, &qp_table->irrl_table, hr_qp->qpn); + if (ret) { + dev_err(dev, "IRRL table get failed\n"); + goto err_put_qp; + } + + if (hr_dev->caps.trrl_entry_sz) { + /* Alloc memory for TRRL */ + ret = hns_roce_table_get(hr_dev, &qp_table->trrl_table, + hr_qp->qpn); + if (ret) { + dev_err(dev, "TRRL table get failed\n"); + goto err_put_irrl; + } + } + + spin_lock_irq(&qp_table->lock); + ret = radix_tree_insert(&hr_dev->qp_table_tree, + hr_qp->qpn & (hr_dev->caps.num_qps - 1), hr_qp); + spin_unlock_irq(&qp_table->lock); + if (ret) { + dev_err(dev, "QPC radix_tree_insert failed\n"); + goto err_put_trrl; + } + + atomic_set(&hr_qp->refcount, 1); + init_completion(&hr_qp->free); + + return 0; + +err_put_trrl: + if (hr_dev->caps.trrl_entry_sz) + hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn); + +err_put_irrl: + hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn); + +err_put_qp: + hns_roce_table_put(hr_dev, &qp_table->qp_table, hr_qp->qpn); + +err_out: + return ret; +} + +void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) +{ + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; + unsigned long flags; + + spin_lock_irqsave(&qp_table->lock, flags); + radix_tree_delete(&hr_dev->qp_table_tree, + hr_qp->qpn & (hr_dev->caps.num_qps - 1)); + spin_unlock_irqrestore(&qp_table->lock, flags); +} +EXPORT_SYMBOL_GPL(hns_roce_qp_remove); + +void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) +{ + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; + + if (atomic_dec_and_test(&hr_qp->refcount)) + complete(&hr_qp->free); + wait_for_completion(&hr_qp->free); + + if ((hr_qp->ibqp.qp_type) != IB_QPT_GSI) { + if (hr_dev->caps.trrl_entry_sz) + hns_roce_table_put(hr_dev, &qp_table->trrl_table, + hr_qp->qpn); + hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn); + hns_roce_table_put(hr_dev, &qp_table->qp_table, hr_qp->qpn); + } +} +EXPORT_SYMBOL_GPL(hns_roce_qp_free); + +void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, 
int base_qpn, + int cnt) +{ + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; + + if (base_qpn < SQP_NUM) + return; + + hns_roce_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, BITMAP_RR); +} +EXPORT_SYMBOL_GPL(hns_roce_release_range_qp); + +static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev, + struct ib_qp_cap *cap, int is_user, int has_srq, + struct hns_roce_qp *hr_qp) +{ + struct device *dev = hr_dev->dev; + u32 max_cnt; + + /* Check the validity of QP support capacity */ + if (cap->max_recv_wr > hr_dev->caps.max_wqes || + cap->max_recv_sge > hr_dev->caps.max_rq_sg) { + dev_err(dev, "RQ WR or sge error!max_recv_wr=%d max_recv_sge=%d\n", + cap->max_recv_wr, cap->max_recv_sge); + return -EINVAL; + } + + /* If srq exit, set zero for relative number of rq */ + if (has_srq) { + if (cap->max_recv_wr) { + dev_dbg(dev, "srq no need config max_recv_wr\n"); + return -EINVAL; + } + + hr_qp->rq.wqe_cnt = hr_qp->rq.max_gs = 0; + } else { + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) { + dev_err(dev, "user space no need config max_recv_wr max_recv_sge\n"); + return -EINVAL; + } + + if (hr_dev->caps.min_wqes) + max_cnt = max(cap->max_recv_wr, hr_dev->caps.min_wqes); + else + max_cnt = cap->max_recv_wr; + + hr_qp->rq.wqe_cnt = roundup_pow_of_two(max_cnt); + + if ((u32)hr_qp->rq.wqe_cnt > hr_dev->caps.max_wqes) { + dev_err(dev, "while setting rq size, rq.wqe_cnt too large\n"); + return -EINVAL; + } + + max_cnt = max(1U, cap->max_recv_sge); + hr_qp->rq.max_gs = roundup_pow_of_two(max_cnt); + if (hr_dev->caps.max_rq_sg <= 2) + hr_qp->rq.wqe_shift = + ilog2(hr_dev->caps.max_rq_desc_sz); + else + hr_qp->rq.wqe_shift = + ilog2(hr_dev->caps.max_rq_desc_sz + * hr_qp->rq.max_gs); + } + + cap->max_recv_wr = hr_qp->rq.max_post = hr_qp->rq.wqe_cnt; + cap->max_recv_sge = hr_qp->rq.max_gs; + + return 0; +} + +static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, + struct ib_qp_cap *cap, + struct hns_roce_qp *hr_qp, + struct hns_roce_ib_create_qp *ucmd) +{ + u32 roundup_sq_stride = roundup_pow_of_two(hr_dev->caps.max_sq_desc_sz); + u8 max_sq_stride = ilog2(roundup_sq_stride); + u32 page_size; + u32 max_cnt; + + /* Sanity check SQ size before proceeding */ + if ((u32)(1 << ucmd->log_sq_bb_count) > hr_dev->caps.max_wqes || + ucmd->log_sq_stride > max_sq_stride || + ucmd->log_sq_stride < HNS_ROCE_IB_MIN_SQ_STRIDE) { + dev_err(hr_dev->dev, "check SQ size error!\n"); + return -EINVAL; + } + + if (cap->max_send_sge > hr_dev->caps.max_sq_sg) { + dev_err(hr_dev->dev, "SQ sge error! 
max_send_sge=%d\n", + cap->max_send_sge); + return -EINVAL; + } + + hr_qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; + hr_qp->sq.wqe_shift = ucmd->log_sq_stride; + + max_cnt = max(1U, cap->max_send_sge); + if (hr_dev->caps.max_sq_sg <= 2) + hr_qp->sq.max_gs = roundup_pow_of_two(max_cnt); + else + hr_qp->sq.max_gs = max_cnt; + + if (hr_qp->sq.max_gs > 2) + hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * + (hr_qp->sq.max_gs - 2)); + hr_qp->sge.sge_shift = 4; + + /* Get buf size, SQ and RQ are aligned to page_szie */ + if (hr_dev->caps.max_sq_sg <= 2) { + hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt << + hr_qp->rq.wqe_shift), PAGE_SIZE) + + HNS_ROCE_ALOGN_UP((hr_qp->sq.wqe_cnt << + hr_qp->sq.wqe_shift), PAGE_SIZE); + + hr_qp->sq.offset = 0; + hr_qp->rq.offset = HNS_ROCE_ALOGN_UP((hr_qp->sq.wqe_cnt << + hr_qp->sq.wqe_shift), PAGE_SIZE); + } else { + page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); + hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt << + hr_qp->rq.wqe_shift), page_size) + + HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt << + hr_qp->sge.sge_shift), page_size) + + HNS_ROCE_ALOGN_UP((hr_qp->sq.wqe_cnt << + hr_qp->sq.wqe_shift), page_size); + + hr_qp->sq.offset = 0; + if (hr_qp->sge.sge_cnt) { + hr_qp->sge.offset = HNS_ROCE_ALOGN_UP( + (hr_qp->sq.wqe_cnt << + hr_qp->sq.wqe_shift), + page_size); + hr_qp->rq.offset = hr_qp->sge.offset + + HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt << + hr_qp->sge.sge_shift), + page_size); + } else { + hr_qp->rq.offset = HNS_ROCE_ALOGN_UP( + (hr_qp->sq.wqe_cnt << + hr_qp->sq.wqe_shift), + page_size); + } + } + + return 0; +} + +static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, + struct ib_qp_cap *cap, + struct hns_roce_qp *hr_qp) +{ + struct device *dev = hr_dev->dev; + u32 page_size; + u32 max_cnt; + int size; + + if (cap->max_send_wr > hr_dev->caps.max_wqes || + cap->max_send_sge > hr_dev->caps.max_sq_sg || + cap->max_inline_data > hr_dev->caps.max_sq_inline) { + dev_err(dev, "SQ WR or sge or inline data error!\n"); + return -EINVAL; + } + + hr_qp->sq.wqe_shift = ilog2(hr_dev->caps.max_sq_desc_sz); + hr_qp->sq_max_wqes_per_wr = 1; + hr_qp->sq_spare_wqes = 0; + + if (hr_dev->caps.min_wqes) + max_cnt = max(cap->max_send_wr, hr_dev->caps.min_wqes); + else + max_cnt = cap->max_send_wr; + + hr_qp->sq.wqe_cnt = roundup_pow_of_two(max_cnt); + if ((u32)hr_qp->sq.wqe_cnt > hr_dev->caps.max_wqes) { + dev_err(dev, "while setting kernel sq size, sq.wqe_cnt too large\n"); + return -EINVAL; + } + + /* Get data_seg numbers */ + max_cnt = max(1U, cap->max_send_sge); + if (hr_dev->caps.max_sq_sg <= 2) + hr_qp->sq.max_gs = roundup_pow_of_two(max_cnt); + else + hr_qp->sq.max_gs = max_cnt; + + if (hr_qp->sq.max_gs > 2) { + hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * + (hr_qp->sq.max_gs - 2)); + hr_qp->sge.sge_shift = 4; + } + + /* ud sqwqe's sge use extend sge */ + if (hr_dev->caps.max_sq_sg > 2 && hr_qp->ibqp.qp_type == IB_QPT_GSI) { + hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * + hr_qp->sq.max_gs); + hr_qp->sge.sge_shift = 4; + } + + /* Get buf size, SQ and RQ are aligned to PAGE_SIZE */ + page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); + hr_qp->sq.offset = 0; + size = HNS_ROCE_ALOGN_UP(hr_qp->sq.wqe_cnt << hr_qp->sq.wqe_shift, + page_size); + + if (hr_dev->caps.max_sq_sg > 2 && hr_qp->sge.sge_cnt) { + hr_qp->sge.offset = size; + size += HNS_ROCE_ALOGN_UP(hr_qp->sge.sge_cnt << + hr_qp->sge.sge_shift, page_size); + } + + hr_qp->rq.offset = size; + size += 
HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt << hr_qp->rq.wqe_shift), + page_size); + hr_qp->buff_size = size; + + /* Get wr and sge number which send */ + cap->max_send_wr = hr_qp->sq.max_post = hr_qp->sq.wqe_cnt; + cap->max_send_sge = hr_qp->sq.max_gs; + + /* We don't support inline sends for kernel QPs (yet) */ + cap->max_inline_data = 0; + + return 0; +} + +static int hns_roce_qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq) + return 0; + + return 1; +} + +static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, + struct ib_pd *ib_pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, unsigned long sqpn, + struct hns_roce_qp *hr_qp) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_ib_create_qp ucmd; + struct hns_roce_ib_create_qp_resp resp = {}; + unsigned long qpn = 0; + int ret = 0; + u32 page_shift; + u32 npages; + int i; + + mutex_init(&hr_qp->mutex); + spin_lock_init(&hr_qp->sq.lock); + spin_lock_init(&hr_qp->rq.lock); + + hr_qp->state = IB_QPS_RESET; + + hr_qp->ibqp.qp_type = init_attr->qp_type; + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + hr_qp->sq_signal_bits = cpu_to_le32(IB_SIGNAL_ALL_WR); + else + hr_qp->sq_signal_bits = cpu_to_le32(IB_SIGNAL_REQ_WR); + + ret = hns_roce_set_rq_size(hr_dev, &init_attr->cap, !!ib_pd->uobject, + !!init_attr->srq, hr_qp); + if (ret) { + dev_err(dev, "hns_roce_set_rq_size failed\n"); + goto err_out; + } + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { + /* allocate recv inline buf */ + hr_qp->rq_inl_buf.wqe_list = kcalloc(hr_qp->rq.wqe_cnt, + sizeof(struct hns_roce_rinl_wqe), + GFP_KERNEL); + if (!hr_qp->rq_inl_buf.wqe_list) { + ret = -ENOMEM; + goto err_out; + } + + hr_qp->rq_inl_buf.wqe_cnt = hr_qp->rq.wqe_cnt; + + /* Firstly, allocate a list of sge space buffer */ + hr_qp->rq_inl_buf.wqe_list[0].sg_list = + kcalloc(hr_qp->rq_inl_buf.wqe_cnt, + init_attr->cap.max_recv_sge * + sizeof(struct hns_roce_rinl_sge), + GFP_KERNEL); + if (!hr_qp->rq_inl_buf.wqe_list[0].sg_list) { + ret = -ENOMEM; + goto err_wqe_list; + } + + for (i = 1; i < hr_qp->rq_inl_buf.wqe_cnt; i++) + /* Secondly, reallocate the buffer */ + hr_qp->rq_inl_buf.wqe_list[i].sg_list = + &hr_qp->rq_inl_buf.wqe_list[0].sg_list[i * + init_attr->cap.max_recv_sge]; + } + + if (ib_pd->uobject) { + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + dev_err(dev, "ib_copy_from_udata error for create qp\n"); + ret = -EFAULT; + goto err_rq_sge_list; + } + + ret = hns_roce_set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, + &ucmd); + if (ret) { + dev_err(dev, "hns_roce_set_user_sq_size error for create qp\n"); + goto err_rq_sge_list; + } + + hr_qp->umem = ib_umem_get(ib_pd->uobject->context, + ucmd.buf_addr, hr_qp->buff_size, 0, + 0); + if (IS_ERR(hr_qp->umem)) { + dev_err(dev, "ib_umem_get error for create qp\n"); + ret = PTR_ERR(hr_qp->umem); + goto err_rq_sge_list; + } + + hr_qp->mtt.mtt_type = MTT_TYPE_WQE; + if (hr_dev->caps.mtt_buf_pg_sz) { + npages = (ib_umem_page_count(hr_qp->umem) + + (1 << hr_dev->caps.mtt_buf_pg_sz) - 1) / + (1 << hr_dev->caps.mtt_buf_pg_sz); + page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz; + ret = hns_roce_mtt_init(hr_dev, npages, + page_shift, + &hr_qp->mtt); + } else { + ret = hns_roce_mtt_init(hr_dev, + ib_umem_page_count(hr_qp->umem), + hr_qp->umem->page_shift, + &hr_qp->mtt); + } + if (ret) { + dev_err(dev, "hns_roce_mtt_init error for create qp\n"); + goto err_buf; + } + + ret = hns_roce_ib_umem_write_mtt(hr_dev, &hr_qp->mtt, + 
hr_qp->umem); + if (ret) { + dev_err(dev, "hns_roce_ib_umem_write_mtt error for create qp\n"); + goto err_mtt; + } + + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + (udata->outlen >= sizeof(resp)) && + hns_roce_qp_has_rq(init_attr)) { + ret = hns_roce_db_map_user( + to_hr_ucontext(ib_pd->uobject->context), + ucmd.db_addr, &hr_qp->rdb); + if (ret) { + dev_err(dev, "rq record doorbell map failed!\n"); + goto err_mtt; + } + } + } else { + if (init_attr->create_flags & + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + dev_err(dev, "init_attr->create_flags error!\n"); + ret = -EINVAL; + goto err_rq_sge_list; + } + + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) { + dev_err(dev, "init_attr->create_flags error!\n"); + ret = -EINVAL; + goto err_rq_sge_list; + } + + /* Set SQ size */ + ret = hns_roce_set_kernel_sq_size(hr_dev, &init_attr->cap, + hr_qp); + if (ret) { + dev_err(dev, "hns_roce_set_kernel_sq_size error!\n"); + goto err_rq_sge_list; + } + + /* QP doorbell register address */ + hr_qp->sq.db_reg_l = hr_dev->reg_base + hr_dev->sdb_offset + + DB_REG_OFFSET * hr_dev->priv_uar.index; + hr_qp->rq.db_reg_l = hr_dev->reg_base + hr_dev->odb_offset + + DB_REG_OFFSET * hr_dev->priv_uar.index; + + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + hns_roce_qp_has_rq(init_attr)) { + ret = hns_roce_alloc_db(hr_dev, &hr_qp->rdb, 0); + if (ret) { + dev_err(dev, "rq record doorbell alloc failed!\n"); + goto err_rq_sge_list; + } + *hr_qp->rdb.db_record = 0; + hr_qp->rdb_en = 1; + } + + /* Allocate QP buf */ + page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz; + if (hns_roce_buf_alloc(hr_dev, hr_qp->buff_size, + (1 << page_shift) * 2, + &hr_qp->hr_buf, page_shift)) { + dev_err(dev, "hns_roce_buf_alloc error!\n"); + ret = -ENOMEM; + goto err_db; + } + + hr_qp->mtt.mtt_type = MTT_TYPE_WQE; + /* Write MTT */ + ret = hns_roce_mtt_init(hr_dev, hr_qp->hr_buf.npages, + hr_qp->hr_buf.page_shift, &hr_qp->mtt); + if (ret) { + dev_err(dev, "hns_roce_mtt_init error for kernel create qp\n"); + goto err_buf; + } + + ret = hns_roce_buf_write_mtt(hr_dev, &hr_qp->mtt, + &hr_qp->hr_buf); + if (ret) { + dev_err(dev, "hns_roce_buf_write_mtt error for kernel create qp\n"); + goto err_mtt; + } + + hr_qp->sq.wrid = kmalloc_array(hr_qp->sq.wqe_cnt, sizeof(u64), + GFP_KERNEL); + hr_qp->rq.wrid = kmalloc_array(hr_qp->rq.wqe_cnt, sizeof(u64), + GFP_KERNEL); + if (!hr_qp->sq.wrid || !hr_qp->rq.wrid) { + ret = -ENOMEM; + goto err_wrid; + } + } + + if (sqpn) { + qpn = sqpn; + } else { + /* Get QPN */ + ret = hns_roce_reserve_range_qp(hr_dev, 1, 1, &qpn); + if (ret) { + dev_err(dev, "hns_roce_reserve_range_qp alloc qpn error\n"); + goto err_wrid; + } + } + + if (init_attr->qp_type == IB_QPT_GSI && + hr_dev->hw_rev == HNS_ROCE_HW_VER1) { + /* In v1 engine, GSI QP context in RoCE engine's register */ + ret = hns_roce_gsi_qp_alloc(hr_dev, qpn, hr_qp); + if (ret) { + dev_err(dev, "hns_roce_qp_alloc failed!\n"); + goto err_qpn; + } + } else { + ret = hns_roce_qp_alloc(hr_dev, qpn, hr_qp); + if (ret) { + dev_err(dev, "hns_roce_qp_alloc failed!\n"); + goto err_qpn; + } + } + + if (sqpn) + hr_qp->doorbell_qpn = 1; + else + hr_qp->doorbell_qpn = cpu_to_le64(hr_qp->qpn); + + if (ib_pd->uobject && (udata->outlen >= sizeof(resp)) && + (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)) { + + /* indicate kernel supports record db */ + resp.cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB; + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) + goto err_qp; + + hr_qp->rdb_en = 1; + } + hr_qp->event = 
hns_roce_ib_qp_event; + + return 0; + +err_qp: + if (init_attr->qp_type == IB_QPT_GSI && + hr_dev->hw_rev == HNS_ROCE_HW_VER1) + hns_roce_qp_remove(hr_dev, hr_qp); + else + hns_roce_qp_free(hr_dev, hr_qp); + +err_qpn: + if (!sqpn) + hns_roce_release_range_qp(hr_dev, qpn, 1); + +err_wrid: + if (ib_pd->uobject) { + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && + (udata->outlen >= sizeof(resp)) && + hns_roce_qp_has_rq(init_attr)) + hns_roce_db_unmap_user( + to_hr_ucontext(ib_pd->uobject->context), + &hr_qp->rdb); + } else { + kfree(hr_qp->sq.wrid); + kfree(hr_qp->rq.wrid); + } + +err_mtt: + hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); + +err_buf: + if (ib_pd->uobject) + ib_umem_release(hr_qp->umem); + else + hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); + +err_db: + if (!ib_pd->uobject && hns_roce_qp_has_rq(init_attr) && + (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)) + hns_roce_free_db(hr_dev, &hr_qp->rdb); + +err_rq_sge_list: + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) + kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); + +err_wqe_list: + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) + kfree(hr_qp->rq_inl_buf.wqe_list); + +err_out: + return ret; +} + +struct ib_qp *hns_roce_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(pd->device); + struct device *dev = hr_dev->dev; + struct hns_roce_sqp *hr_sqp; + struct hns_roce_qp *hr_qp; + int ret; + + switch (init_attr->qp_type) { + case IB_QPT_RC: { + hr_qp = kzalloc(sizeof(*hr_qp), GFP_KERNEL); + if (!hr_qp) + return ERR_PTR(-ENOMEM); + + ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, 0, + hr_qp); + if (ret) { + dev_err(dev, "Create RC QP failed\n"); + kfree(hr_qp); + return ERR_PTR(ret); + } + + hr_qp->ibqp.qp_num = hr_qp->qpn; + + break; + } + case IB_QPT_GSI: { + /* Userspace is not allowed to create special QPs: */ + if (pd->uobject) { + dev_err(dev, "not support usr space GSI\n"); + return ERR_PTR(-EINVAL); + } + + hr_sqp = kzalloc(sizeof(*hr_sqp), GFP_KERNEL); + if (!hr_sqp) + return ERR_PTR(-ENOMEM); + + hr_qp = &hr_sqp->hr_qp; + hr_qp->port = init_attr->port_num - 1; + hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port]; + + /* when hw version is v1, the sqpn is allocated */ + if (hr_dev->caps.max_sq_sg <= 2) + hr_qp->ibqp.qp_num = HNS_ROCE_MAX_PORTS + + hr_dev->iboe.phy_port[hr_qp->port]; + else + hr_qp->ibqp.qp_num = 1; + + ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, + hr_qp->ibqp.qp_num, hr_qp); + if (ret) { + dev_err(dev, "Create GSI QP failed!\n"); + kfree(hr_sqp); + return ERR_PTR(ret); + } + + break; + } + default:{ + dev_err(dev, "not support QP type %d\n", init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + } + + return &hr_qp->ibqp; +} +EXPORT_SYMBOL_GPL(hns_roce_create_qp); + +int to_hr_qp_type(int qp_type) +{ + int transport_type; + + if (qp_type == IB_QPT_RC) + transport_type = SERV_TYPE_RC; + else if (qp_type == IB_QPT_UC) + transport_type = SERV_TYPE_UC; + else if (qp_type == IB_QPT_UD) + transport_type = SERV_TYPE_UD; + else if (qp_type == IB_QPT_GSI) + transport_type = SERV_TYPE_UD; + else + transport_type = -1; + + return transport_type; +} +EXPORT_SYMBOL_GPL(to_hr_qp_type); + +int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + enum ib_qp_state cur_state, new_state; + struct device *dev = 
hr_dev->dev; + int ret = -EINVAL; + int p; + enum ib_mtu active_mtu; + + mutex_lock(&hr_qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? + attr->cur_qp_state : (enum ib_qp_state)hr_qp->state; + new_state = attr_mask & IB_QP_STATE ? + attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_ETHERNET)) { + dev_err(dev, "ib_modify_qp_is_ok failed\n"); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > hr_dev->caps.num_ports)) { + dev_err(dev, "attr port_num invalid.attr->port_num=%d\n", + attr->port_num); + goto out; + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port; + if (attr->pkey_index >= hr_dev->caps.pkey_table_len[p]) { + dev_err(dev, "attr pkey_index invalid.attr->pkey_index=%d\n", + attr->pkey_index); + goto out; + } + } + + if (attr_mask & IB_QP_PATH_MTU) { + p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port; + active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu); + + if ((hr_dev->caps.max_mtu == IB_MTU_4096 && + attr->path_mtu > IB_MTU_4096) || + (hr_dev->caps.max_mtu == IB_MTU_2048 && + attr->path_mtu > IB_MTU_2048) || + attr->path_mtu < IB_MTU_256 || + attr->path_mtu > active_mtu) { + dev_err(dev, "attr path_mtu(%d)invalid while modify qp", + attr->path_mtu); + goto out; + } + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > hr_dev->caps.max_qp_init_rdma) { + dev_err(dev, "attr max_rd_atomic invalid.attr->max_rd_atomic=%d\n", + attr->max_rd_atomic); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > hr_dev->caps.max_qp_dest_rdma) { + dev_err(dev, "attr max_dest_rd_atomic invalid.attr->max_dest_rd_atomic=%d\n", + attr->max_dest_rd_atomic); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + if (hr_dev->caps.min_wqes) { + ret = -EPERM; + dev_err(dev, "cur_state=%d new_state=%d\n", cur_state, + new_state); + } else { + ret = 0; + } + + goto out; + } + + ret = hr_dev->hw->modify_qp(ibqp, attr, attr_mask, cur_state, + new_state); + +out: + mutex_unlock(&hr_qp->mutex); + + return ret; +} + +void hns_roce_lock_cqs(struct hns_roce_cq *send_cq, struct hns_roce_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} +EXPORT_SYMBOL_GPL(hns_roce_lock_cqs); + +void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, + struct hns_roce_cq *recv_cq) __releases(&send_cq->lock) + __releases(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } +} +EXPORT_SYMBOL_GPL(hns_roce_unlock_cqs); + +static void *get_wqe(struct hns_roce_qp *hr_qp, int offset) +{ + + return hns_roce_buf_offset(&hr_qp->hr_buf, offset); +} + +void *get_recv_wqe(struct hns_roce_qp *hr_qp, int n) +{ + return get_wqe(hr_qp, hr_qp->rq.offset + (n << hr_qp->rq.wqe_shift)); +} +EXPORT_SYMBOL_GPL(get_recv_wqe); + +void *get_send_wqe(struct hns_roce_qp *hr_qp, int n) +{ + 
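/* Descriptive note (added): WQE n of the send queue starts at sq.offset + (n << sq.wqe_shift) bytes into the QP buffer, i.e. wqe_shift is log2 of the per-WQE stride. */ +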
return get_wqe(hr_qp, hr_qp->sq.offset + (n << hr_qp->sq.wqe_shift)); +} +EXPORT_SYMBOL_GPL(get_send_wqe); + +void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n) +{ + return hns_roce_buf_offset(&hr_qp->hr_buf, hr_qp->sge.offset + + (n << hr_qp->sge.sge_shift)); +} +EXPORT_SYMBOL_GPL(get_send_extend_sge); + +bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq, + struct ib_cq *ib_cq) +{ + struct hns_roce_cq *hr_cq; + u32 cur; + + cur = hr_wq->head - hr_wq->tail; + if (likely(cur + nreq < hr_wq->max_post)) + return false; + + hr_cq = to_hr_cq(ib_cq); + spin_lock(&hr_cq->lock); + cur = hr_wq->head - hr_wq->tail; + spin_unlock(&hr_cq->lock); + + return cur + nreq >= hr_wq->max_post; +} +EXPORT_SYMBOL_GPL(hns_roce_wq_overflow); + +int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; + int reserved_from_top = 0; + int ret; + + spin_lock_init(&qp_table->lock); + INIT_RADIX_TREE(&hr_dev->qp_table_tree, GFP_ATOMIC); + + /* A port include two SQP, six port total 12 */ + ret = hns_roce_bitmap_init(&qp_table->bitmap, hr_dev->caps.num_qps, + hr_dev->caps.num_qps - 1, SQP_NUM, + reserved_from_top); + if (ret) { + dev_err(hr_dev->dev, "qp bitmap init failed!error=%d\n", + ret); + return ret; + } + + return 0; +} + +void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev) +{ + hns_roce_bitmap_cleanup(&hr_dev->qp_table.bitmap); +} diff --git a/drivers/infiniband/hw/i40iw/Kconfig b/drivers/infiniband/hw/i40iw/Kconfig new file mode 100644 index 0000000..2962979 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/Kconfig @@ -0,0 +1,7 @@ +config INFINIBAND_I40IW + tristate "Intel(R) Ethernet X722 iWARP Driver" + depends on INET && I40E + depends on PCI + select GENERIC_ALLOCATOR + ---help--- + Intel(R) Ethernet X722 iWARP Driver diff --git a/drivers/infiniband/hw/i40iw/Makefile b/drivers/infiniband/hw/i40iw/Makefile new file mode 100644 index 0000000..5a8a7a3 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +ccflags-y := -Idrivers/net/ethernet/intel/i40e + +obj-$(CONFIG_INFINIBAND_I40IW) += i40iw.o + +i40iw-objs :=\ + i40iw_cm.o i40iw_ctrl.o \ + i40iw_hmc.o i40iw_hw.o i40iw_main.o \ + i40iw_pble.o i40iw_puda.o i40iw_uk.o i40iw_utils.o \ + i40iw_verbs.o i40iw_virtchnl.o i40iw_vf.o diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h new file mode 100644 index 0000000..2f2b442 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -0,0 +1,602 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_IW_H +#define I40IW_IW_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "i40iw_status.h" +#include "i40iw_osdep.h" +#include "i40iw_d.h" +#include "i40iw_hmc.h" + +#include +#include "i40iw_type.h" +#include "i40iw_p.h" +#include +#include "i40iw_pble.h" +#include "i40iw_verbs.h" +#include "i40iw_cm.h" +#include "i40iw_user.h" +#include "i40iw_puda.h" + +#define I40IW_FW_VERSION 2 +#define I40IW_HW_VERSION 2 + +#define I40IW_ARP_ADD 1 +#define I40IW_ARP_DELETE 2 +#define I40IW_ARP_RESOLVE 3 + +#define I40IW_MACIP_ADD 1 +#define I40IW_MACIP_DELETE 2 + +#define IW_CCQ_SIZE (I40IW_CQP_SW_SQSIZE_2048 + 1) +#define IW_CEQ_SIZE 2048 +#define IW_AEQ_SIZE 2048 + +#define RX_BUF_SIZE (1536 + 8) +#define IW_REG0_SIZE (4 * 1024) +#define IW_TX_TIMEOUT (6 * HZ) +#define IW_FIRST_QPN 1 +#define IW_SW_CONTEXT_ALIGN 1024 + +#define MAX_DPC_ITERATIONS 128 + +#define I40IW_EVENT_TIMEOUT 100000 +#define I40IW_VCHNL_EVENT_TIMEOUT 100000 + +#define I40IW_NO_VLAN 0xffff +#define I40IW_NO_QSET 0xffff + +/* access to mcast filter list */ +#define IW_ADD_MCAST false +#define IW_DEL_MCAST true + +#define I40IW_DRV_OPT_ENABLE_MPA_VER_0 0x00000001 +#define I40IW_DRV_OPT_DISABLE_MPA_CRC 0x00000002 +#define I40IW_DRV_OPT_DISABLE_FIRST_WRITE 0x00000004 +#define I40IW_DRV_OPT_DISABLE_INTF 0x00000008 +#define I40IW_DRV_OPT_ENABLE_MSI 0x00000010 +#define I40IW_DRV_OPT_DUAL_LOGICAL_PORT 0x00000020 +#define I40IW_DRV_OPT_NO_INLINE_DATA 0x00000080 +#define I40IW_DRV_OPT_DISABLE_INT_MOD 0x00000100 +#define I40IW_DRV_OPT_DISABLE_VIRT_WQ 0x00000200 +#define I40IW_DRV_OPT_ENABLE_PAU 0x00000400 +#define I40IW_DRV_OPT_MCAST_LOGPORT_MAP 0x00000800 + +#define IW_HMC_OBJ_TYPE_NUM ARRAY_SIZE(iw_hmc_obj_types) +#define IW_CFG_FPM_QP_COUNT 32768 +#define I40IW_MAX_PAGES_PER_FMR 512 +#define I40IW_MIN_PAGES_PER_FMR 1 +#define I40IW_CQP_COMPL_RQ_WQE_FLUSHED 2 +#define I40IW_CQP_COMPL_SQ_WQE_FLUSHED 3 +#define I40IW_CQP_COMPL_RQ_SQ_WQE_FLUSHED 4 + +struct i40iw_cqp_compl_info { + u32 op_ret_val; + u16 maj_err_code; + u16 min_err_code; + bool error; + u8 op_code; +}; + +#define i40iw_pr_err(fmt, args ...) pr_err("%s: "fmt, __func__, ## args) + +#define i40iw_pr_info(fmt, args ...) pr_info("%s: " fmt, __func__, ## args) + +#define i40iw_pr_warn(fmt, args ...) 
pr_warn("%s: " fmt, __func__, ## args) + +struct i40iw_cqp_request { + struct cqp_commands_info info; + wait_queue_head_t waitq; + struct list_head list; + atomic_t refcount; + void (*callback_fcn)(struct i40iw_cqp_request*, u32); + void *param; + struct i40iw_cqp_compl_info compl_info; + bool waiting; + bool request_done; + bool dynamic; +}; + +struct i40iw_cqp { + struct i40iw_sc_cqp sc_cqp; + spinlock_t req_lock; /*cqp request list */ + wait_queue_head_t waitq; + struct i40iw_dma_mem sq; + struct i40iw_dma_mem host_ctx; + u64 *scratch_array; + struct i40iw_cqp_request *cqp_requests; + struct list_head cqp_avail_reqs; + struct list_head cqp_pending_reqs; +}; + +struct i40iw_device; + +struct i40iw_ccq { + struct i40iw_sc_cq sc_cq; + spinlock_t lock; /* ccq control */ + wait_queue_head_t waitq; + struct i40iw_dma_mem mem_cq; + struct i40iw_dma_mem shadow_area; +}; + +struct i40iw_ceq { + struct i40iw_sc_ceq sc_ceq; + struct i40iw_dma_mem mem; + u32 irq; + u32 msix_idx; + struct i40iw_device *iwdev; + struct tasklet_struct dpc_tasklet; +}; + +struct i40iw_aeq { + struct i40iw_sc_aeq sc_aeq; + struct i40iw_dma_mem mem; +}; + +struct i40iw_arp_entry { + u32 ip_addr[4]; + u8 mac_addr[ETH_ALEN]; +}; + +enum init_completion_state { + INVALID_STATE = 0, + INITIAL_STATE, + CQP_CREATED, + HMC_OBJS_CREATED, + PBLE_CHUNK_MEM, + CCQ_CREATED, + AEQ_CREATED, + CEQ_CREATED, + ILQ_CREATED, + IEQ_CREATED, + IP_ADDR_REGISTERED, + RDMA_DEV_REGISTERED +}; + +struct i40iw_msix_vector { + u32 idx; + u32 irq; + u32 cpu_affinity; + u32 ceq_id; + cpumask_t mask; +}; + +struct l2params_work { + struct work_struct work; + struct i40iw_device *iwdev; + struct i40iw_l2params l2params; +}; + +#define I40IW_MSIX_TABLE_SIZE 65 + +struct virtchnl_work { + struct work_struct work; + union { + struct i40iw_cqp_request *cqp_request; + struct i40iw_virtchnl_work_info work_info; + }; +}; + +struct i40e_qvlist_info; + +struct i40iw_device { + struct i40iw_ib_device *iwibdev; + struct net_device *netdev; + wait_queue_head_t vchnl_waitq; + struct i40iw_sc_dev sc_dev; + struct i40iw_sc_vsi vsi; + struct i40iw_handler *hdl; + struct i40e_info *ldev; + struct i40e_client *client; + struct i40iw_hw hw; + struct i40iw_cm_core cm_core; + u8 *mem_resources; + unsigned long *allocated_qps; + unsigned long *allocated_cqs; + unsigned long *allocated_mrs; + unsigned long *allocated_pds; + unsigned long *allocated_arps; + struct i40iw_qp **qp_table; + bool msix_shared; + u32 msix_count; + struct i40iw_msix_vector *iw_msixtbl; + struct i40e_qvlist_info *iw_qvlist; + + struct i40iw_hmc_pble_rsrc *pble_rsrc; + struct i40iw_arp_entry *arp_table; + struct i40iw_cqp cqp; + struct i40iw_ccq ccq; + u32 ceqs_count; + struct i40iw_ceq *ceqlist; + struct i40iw_aeq aeq; + u32 arp_table_size; + u32 next_arp_index; + spinlock_t resource_lock; /* hw resource access */ + spinlock_t qptable_lock; + u32 vendor_id; + u32 vendor_part_id; + u32 of_device_registered; + + u32 device_cap_flags; + unsigned long db_start; + u8 resource_profile; + u8 max_rdma_vfs; + u8 max_enabled_vfs; + u8 max_sge; + u8 iw_status; + u8 send_term_ok; + bool push_mode; /* Initialized from parameter passed to driver */ + + /* x710 specific */ + struct mutex pbl_mutex; + struct tasklet_struct dpc_tasklet; + struct workqueue_struct *virtchnl_wq; + struct virtchnl_work virtchnl_w[I40IW_MAX_PE_ENABLED_VF_COUNT]; + struct i40iw_dma_mem obj_mem; + struct i40iw_dma_mem obj_next; + u8 *hmc_info_mem; + u32 sd_type; + struct workqueue_struct *param_wq; + atomic_t params_busy; + enum 
init_completion_state init_state; + u16 mac_ip_table_idx; + atomic_t vchnl_msgs; + u32 max_mr; + u32 max_qp; + u32 max_cq; + u32 max_pd; + u32 next_qp; + u32 next_cq; + u32 next_pd; + u32 max_mr_size; + u32 max_qp_wr; + u32 max_cqe; + u32 mr_stagmask; + u32 mpa_version; + bool dcb; + bool closing; + bool reset; + u32 used_pds; + u32 used_cqs; + u32 used_mrs; + u32 used_qps; + wait_queue_head_t close_wq; + atomic64_t use_count; +}; + +struct i40iw_ib_device { + struct ib_device ibdev; + struct i40iw_device *iwdev; +}; + +struct i40iw_handler { + struct list_head list; + struct i40e_client *client; + struct i40iw_device device; + struct i40e_info ldev; +}; + +/** + * to_iwdev - get device + * @ibdev: ib device + **/ +static inline struct i40iw_device *to_iwdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct i40iw_ib_device, ibdev)->iwdev; +} + +/** + * to_ucontext - get user context + * @ibucontext: ib user context + **/ +static inline struct i40iw_ucontext *to_ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct i40iw_ucontext, ibucontext); +} + +/** + * to_iwpd - get protection domain + * @ibpd: ib pd + **/ +static inline struct i40iw_pd *to_iwpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct i40iw_pd, ibpd); +} + +/** + * to_iwmr - get device memory region + * @ibdev: ib memory region + **/ +static inline struct i40iw_mr *to_iwmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct i40iw_mr, ibmr); +} + +/** + * to_iwmr_from_ibfmr - get device memory region + * @ibfmr: ib fmr + **/ +static inline struct i40iw_mr *to_iwmr_from_ibfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct i40iw_mr, ibfmr); +} + +/** + * to_iwmw - get device memory window + * @ibmw: ib memory window + **/ +static inline struct i40iw_mr *to_iwmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct i40iw_mr, ibmw); +} + +/** + * to_iwcq - get completion queue + * @ibcq: ib cqdevice + **/ +static inline struct i40iw_cq *to_iwcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct i40iw_cq, ibcq); +} + +/** + * to_iwqp - get device qp + * @ibqp: ib qp + **/ +static inline struct i40iw_qp *to_iwqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct i40iw_qp, ibqp); +} + +/* i40iw.c */ +void i40iw_add_ref(struct ib_qp *); +void i40iw_rem_ref(struct ib_qp *); +struct ib_qp *i40iw_get_qp(struct ib_device *, int); + +void i40iw_flush_wqes(struct i40iw_device *iwdev, + struct i40iw_qp *qp); + +void i40iw_manage_arp_cache(struct i40iw_device *iwdev, + unsigned char *mac_addr, + u32 *ip_addr, + bool ipv4, + u32 action); + +int i40iw_manage_apbvt(struct i40iw_device *iwdev, + u16 accel_local_port, + bool add_port); + +struct i40iw_cqp_request *i40iw_get_cqp_request(struct i40iw_cqp *cqp, bool wait); +void i40iw_free_cqp_request(struct i40iw_cqp *cqp, struct i40iw_cqp_request *cqp_request); +void i40iw_put_cqp_request(struct i40iw_cqp *cqp, struct i40iw_cqp_request *cqp_request); + +/** + * i40iw_alloc_resource - allocate a resource + * @iwdev: device pointer + * @resource_array: resource bit array: + * @max_resources: maximum resource number + * @req_resources_num: Allocated resource number + * @next: next free id + **/ +static inline int i40iw_alloc_resource(struct i40iw_device *iwdev, + unsigned long *resource_array, + u32 max_resources, + u32 *req_resource_num, + u32 *next) +{ + u32 resource_num; + unsigned long flags; + + spin_lock_irqsave(&iwdev->resource_lock, flags); + resource_num = find_next_zero_bit(resource_array, 
max_resources, *next); + if (resource_num >= max_resources) { + resource_num = find_first_zero_bit(resource_array, max_resources); + if (resource_num >= max_resources) { + spin_unlock_irqrestore(&iwdev->resource_lock, flags); + return -EOVERFLOW; + } + } + set_bit(resource_num, resource_array); + *next = resource_num + 1; + if (*next == max_resources) + *next = 0; + *req_resource_num = resource_num; + spin_unlock_irqrestore(&iwdev->resource_lock, flags); + + return 0; +} + +/** + * i40iw_is_resource_allocated - detrmine if resource is + * allocated + * @iwdev: device pointer + * @resource_array: resource array for the resource_num + * @resource_num: resource number to check + **/ +static inline bool i40iw_is_resource_allocated(struct i40iw_device *iwdev, + unsigned long *resource_array, + u32 resource_num) +{ + bool bit_is_set; + unsigned long flags; + + spin_lock_irqsave(&iwdev->resource_lock, flags); + + bit_is_set = test_bit(resource_num, resource_array); + spin_unlock_irqrestore(&iwdev->resource_lock, flags); + + return bit_is_set; +} + +/** + * i40iw_free_resource - free a resource + * @iwdev: device pointer + * @resource_array: resource array for the resource_num + * @resource_num: resource number to free + **/ +static inline void i40iw_free_resource(struct i40iw_device *iwdev, + unsigned long *resource_array, + u32 resource_num) +{ + unsigned long flags; + + spin_lock_irqsave(&iwdev->resource_lock, flags); + clear_bit(resource_num, resource_array); + spin_unlock_irqrestore(&iwdev->resource_lock, flags); +} + +/** + * to_iwhdl - Get the handler from the device pointer + * @iwdev: device pointer + **/ +static inline struct i40iw_handler *to_iwhdl(struct i40iw_device *iw_dev) +{ + return container_of(iw_dev, struct i40iw_handler, device); +} + +struct i40iw_handler *i40iw_find_netdev(struct net_device *netdev); + +/** + * iw_init_resources - + */ +u32 i40iw_initialize_hw_resources(struct i40iw_device *iwdev); + +int i40iw_register_rdma_device(struct i40iw_device *iwdev); +void i40iw_port_ibevent(struct i40iw_device *iwdev); +void i40iw_cm_disconn(struct i40iw_qp *iwqp); +void i40iw_cm_disconn_worker(void *); +int mini_cm_recv_pkt(struct i40iw_cm_core *, struct i40iw_device *, + struct sk_buff *); + +enum i40iw_status_code i40iw_handle_cqp_op(struct i40iw_device *iwdev, + struct i40iw_cqp_request *cqp_request); +enum i40iw_status_code i40iw_add_mac_addr(struct i40iw_device *iwdev, + u8 *mac_addr, u8 *mac_index); +int i40iw_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *); +void i40iw_cq_wq_destroy(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq); + +void i40iw_cleanup_pending_cqp_op(struct i40iw_device *iwdev); +void i40iw_rem_pdusecount(struct i40iw_pd *iwpd, struct i40iw_device *iwdev); +void i40iw_add_pdusecount(struct i40iw_pd *iwpd); +void i40iw_rem_devusecount(struct i40iw_device *iwdev); +void i40iw_add_devusecount(struct i40iw_device *iwdev); +void i40iw_hw_modify_qp(struct i40iw_device *iwdev, struct i40iw_qp *iwqp, + struct i40iw_modify_qp_info *info, bool wait); + +void i40iw_qp_suspend_resume(struct i40iw_sc_dev *dev, + struct i40iw_sc_qp *qp, + bool suspend); +enum i40iw_status_code i40iw_manage_qhash(struct i40iw_device *iwdev, + struct i40iw_cm_info *cminfo, + enum i40iw_quad_entry_type etype, + enum i40iw_quad_hash_manage_type mtype, + void *cmnode, + bool wait); +void i40iw_receive_ilq(struct i40iw_sc_vsi *vsi, struct i40iw_puda_buf *rbuf); +void i40iw_free_sqbuf(struct i40iw_sc_vsi *vsi, void *bufp); +void i40iw_free_qp_resources(struct 
i40iw_device *iwdev, + struct i40iw_qp *iwqp, + u32 qp_num); +enum i40iw_status_code i40iw_obj_aligned_mem(struct i40iw_device *iwdev, + struct i40iw_dma_mem *memptr, + u32 size, u32 mask); + +void i40iw_request_reset(struct i40iw_device *iwdev); +void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev); +void i40iw_setup_cm_core(struct i40iw_device *iwdev); +void i40iw_cleanup_cm_core(struct i40iw_cm_core *cm_core); +void i40iw_process_ceq(struct i40iw_device *, struct i40iw_ceq *iwceq); +void i40iw_process_aeq(struct i40iw_device *); +void i40iw_next_iw_state(struct i40iw_qp *iwqp, + u8 state, u8 del_hash, + u8 term, u8 term_len); +int i40iw_send_syn(struct i40iw_cm_node *cm_node, u32 sendack); +int i40iw_send_reset(struct i40iw_cm_node *cm_node); +struct i40iw_cm_node *i40iw_find_node(struct i40iw_cm_core *cm_core, + u16 rem_port, + u32 *rem_addr, + u16 loc_port, + u32 *loc_addr, + bool add_refcnt, + bool accelerated_list); + +enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev, + struct i40iw_sc_qp *qp, + struct i40iw_qp_flush_info *info, + bool wait); + +void i40iw_gen_ae(struct i40iw_device *iwdev, + struct i40iw_sc_qp *qp, + struct i40iw_gen_ae_info *info, + bool wait); + +void i40iw_copy_ip_ntohl(u32 *dst, __be32 *src); +struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *ib_pd, + u64 addr, + u64 size, + int acc, + u64 *iova_start); + +int i40iw_inetaddr_event(struct notifier_block *notifier, + unsigned long event, + void *ptr); +int i40iw_inet6addr_event(struct notifier_block *notifier, + unsigned long event, + void *ptr); +int i40iw_net_event(struct notifier_block *notifier, + unsigned long event, + void *ptr); +int i40iw_netdevice_event(struct notifier_block *notifier, + unsigned long event, + void *ptr); + +#endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c new file mode 100644 index 0000000..f7c6fd9 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -0,0 +1,4403 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "i40iw.h" + +static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *); +static void i40iw_cm_post_event(struct i40iw_cm_event *event); +static void i40iw_disconnect_worker(struct work_struct *work); + +/** + * i40iw_free_sqbuf - put back puda buffer if refcount = 0 + * @vsi: pointer to vsi structure + * @buf: puda buffer to free + */ +void i40iw_free_sqbuf(struct i40iw_sc_vsi *vsi, void *bufp) +{ + struct i40iw_puda_buf *buf = (struct i40iw_puda_buf *)bufp; + struct i40iw_puda_rsrc *ilq = vsi->ilq; + + if (!atomic_dec_return(&buf->refcount)) + i40iw_puda_ret_bufpool(ilq, buf); +} + +/** + * i40iw_derive_hw_ird_setting - Calculate IRD + * + * @cm_ird: IRD of connection's node + * + * The ird from the connection is rounded to a supported HW + * setting (2,8,32,64) and then encoded for ird_size field of + * qp_ctx + */ +static u8 i40iw_derive_hw_ird_setting(u16 cm_ird) +{ + u8 encoded_ird_size; + + /* ird_size field is encoded in qp_ctx */ + switch (cm_ird ? roundup_pow_of_two(cm_ird) : 0) { + case I40IW_HW_IRD_SETTING_64: + encoded_ird_size = 3; + break; + case I40IW_HW_IRD_SETTING_32: + case I40IW_HW_IRD_SETTING_16: + encoded_ird_size = 2; + break; + case I40IW_HW_IRD_SETTING_8: + case I40IW_HW_IRD_SETTING_4: + encoded_ird_size = 1; + break; + case I40IW_HW_IRD_SETTING_2: + default: + encoded_ird_size = 0; + break; + } + return encoded_ird_size; +} + +/** + * i40iw_record_ird_ord - Record IRD/ORD passed in + * @cm_node: connection's node + * @conn_ird: connection IRD + * @conn_ord: connection ORD + */ +static void i40iw_record_ird_ord(struct i40iw_cm_node *cm_node, u32 conn_ird, + u32 conn_ord) +{ + if (conn_ird > I40IW_MAX_IRD_SIZE) + conn_ird = I40IW_MAX_IRD_SIZE; + + if (conn_ord > I40IW_MAX_ORD_SIZE) + conn_ord = I40IW_MAX_ORD_SIZE; + else if (!conn_ord && cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO) + conn_ord = 1; + + cm_node->ird_size = conn_ird; + cm_node->ord_size = conn_ord; +} + +/** + * i40iw_copy_ip_ntohl - change network to host ip + * @dst: host ip + * @src: big endian + */ +void i40iw_copy_ip_ntohl(u32 *dst, __be32 *src) +{ + *dst++ = ntohl(*src++); + *dst++ = ntohl(*src++); + *dst++ = ntohl(*src++); + *dst = ntohl(*src); +} + +/** + * i40iw_copy_ip_htonl - change host addr to network ip + * @dst: host ip + * @src: little endian + */ +static inline void i40iw_copy_ip_htonl(__be32 *dst, u32 *src) +{ + *dst++ = htonl(*src++); + *dst++ = htonl(*src++); + *dst++ = htonl(*src++); + *dst = htonl(*src); +} + +/** + * i40iw_fill_sockaddr4 - get addr info for passive connection + * @cm_node: connection's node + * @event: upper layer's cm event + */ +static inline void i40iw_fill_sockaddr4(struct i40iw_cm_node *cm_node, + struct iw_cm_event *event) +{ + struct sockaddr_in *laddr = (struct sockaddr_in *)&event->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&event->remote_addr; + + laddr->sin_family = AF_INET; + raddr->sin_family = AF_INET; + + laddr->sin_port = htons(cm_node->loc_port); + raddr->sin_port = htons(cm_node->rem_port); + + laddr->sin_addr.s_addr = htonl(cm_node->loc_addr[0]); + raddr->sin_addr.s_addr = htonl(cm_node->rem_addr[0]); +} + +/** + * i40iw_fill_sockaddr6 - get ipv6 addr info for 
passive side + * @cm_node: connection's node + * @event: upper layer's cm event + */ +static inline void i40iw_fill_sockaddr6(struct i40iw_cm_node *cm_node, + struct iw_cm_event *event) +{ + struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *)&event->local_addr; + struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *)&event->remote_addr; + + laddr6->sin6_family = AF_INET6; + raddr6->sin6_family = AF_INET6; + + laddr6->sin6_port = htons(cm_node->loc_port); + raddr6->sin6_port = htons(cm_node->rem_port); + + i40iw_copy_ip_htonl(laddr6->sin6_addr.in6_u.u6_addr32, + cm_node->loc_addr); + i40iw_copy_ip_htonl(raddr6->sin6_addr.in6_u.u6_addr32, + cm_node->rem_addr); +} + +/** + * i40iw_get_addr_info + * @cm_node: contains ip/tcp info + * @cm_info: to get a copy of the cm_node ip/tcp info +*/ +static void i40iw_get_addr_info(struct i40iw_cm_node *cm_node, + struct i40iw_cm_info *cm_info) +{ + cm_info->ipv4 = cm_node->ipv4; + cm_info->vlan_id = cm_node->vlan_id; + memcpy(cm_info->loc_addr, cm_node->loc_addr, sizeof(cm_info->loc_addr)); + memcpy(cm_info->rem_addr, cm_node->rem_addr, sizeof(cm_info->rem_addr)); + cm_info->loc_port = cm_node->loc_port; + cm_info->rem_port = cm_node->rem_port; + cm_info->user_pri = cm_node->user_pri; +} + +/** + * i40iw_get_cmevent_info - for cm event upcall + * @cm_node: connection's node + * @cm_id: upper layers cm struct for the event + * @event: upper layer's cm event + */ +static inline void i40iw_get_cmevent_info(struct i40iw_cm_node *cm_node, + struct iw_cm_id *cm_id, + struct iw_cm_event *event) +{ + memcpy(&event->local_addr, &cm_id->m_local_addr, + sizeof(event->local_addr)); + memcpy(&event->remote_addr, &cm_id->m_remote_addr, + sizeof(event->remote_addr)); + if (cm_node) { + event->private_data = (void *)cm_node->pdata_buf; + event->private_data_len = (u8)cm_node->pdata.size; + event->ird = cm_node->ird_size; + event->ord = cm_node->ord_size; + } +} + +/** + * i40iw_send_cm_event - upcall cm's event handler + * @cm_node: connection's node + * @cm_id: upper layer's cm info struct + * @type: Event type to indicate + * @status: status for the event type + */ +static int i40iw_send_cm_event(struct i40iw_cm_node *cm_node, + struct iw_cm_id *cm_id, + enum iw_cm_event_type type, + int status) +{ + struct iw_cm_event event; + + memset(&event, 0, sizeof(event)); + event.event = type; + event.status = status; + switch (type) { + case IW_CM_EVENT_CONNECT_REQUEST: + if (cm_node->ipv4) + i40iw_fill_sockaddr4(cm_node, &event); + else + i40iw_fill_sockaddr6(cm_node, &event); + event.provider_data = (void *)cm_node; + event.private_data = (void *)cm_node->pdata_buf; + event.private_data_len = (u8)cm_node->pdata.size; + event.ird = cm_node->ird_size; + break; + case IW_CM_EVENT_CONNECT_REPLY: + i40iw_get_cmevent_info(cm_node, cm_id, &event); + break; + case IW_CM_EVENT_ESTABLISHED: + event.ird = cm_node->ird_size; + event.ord = cm_node->ord_size; + break; + case IW_CM_EVENT_DISCONNECT: + break; + case IW_CM_EVENT_CLOSE: + break; + default: + i40iw_pr_err("event type received type = %d\n", type); + return -1; + } + return cm_id->event_handler(cm_id, &event); +} + +/** + * i40iw_create_event - create cm event + * @cm_node: connection's node + * @type: Event type to generate + */ +static struct i40iw_cm_event *i40iw_create_event(struct i40iw_cm_node *cm_node, + enum i40iw_cm_event_type type) +{ + struct i40iw_cm_event *event; + + if (!cm_node->cm_id) + return NULL; + + event = kzalloc(sizeof(*event), GFP_ATOMIC); + + if (!event) + return NULL; + + event->type = type; + 
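/* Descriptive note (added): copy the connection's addresses, ports and cm_id into the event before posting it for deferred processing */ +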
event->cm_node = cm_node; + memcpy(event->cm_info.rem_addr, cm_node->rem_addr, sizeof(event->cm_info.rem_addr)); + memcpy(event->cm_info.loc_addr, cm_node->loc_addr, sizeof(event->cm_info.loc_addr)); + event->cm_info.rem_port = cm_node->rem_port; + event->cm_info.loc_port = cm_node->loc_port; + event->cm_info.cm_id = cm_node->cm_id; + + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "node=%p event=%p type=%u dst=%pI4 src=%pI4\n", + cm_node, + event, + type, + event->cm_info.loc_addr, + event->cm_info.rem_addr); + + i40iw_cm_post_event(event); + return event; +} + +/** + * i40iw_free_retrans_entry - free send entry + * @cm_node: connection's node + */ +static void i40iw_free_retrans_entry(struct i40iw_cm_node *cm_node) +{ + struct i40iw_device *iwdev = cm_node->iwdev; + struct i40iw_timer_entry *send_entry; + + send_entry = cm_node->send_entry; + if (send_entry) { + cm_node->send_entry = NULL; + i40iw_free_sqbuf(&iwdev->vsi, (void *)send_entry->sqbuf); + kfree(send_entry); + atomic_dec(&cm_node->ref_count); + } +} + +/** + * i40iw_cleanup_retrans_entry - free send entry with lock + * @cm_node: connection's node + */ +static void i40iw_cleanup_retrans_entry(struct i40iw_cm_node *cm_node) +{ + unsigned long flags; + + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + i40iw_free_retrans_entry(cm_node); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); +} + +/** + * i40iw_form_cm_frame - get a free packet and build frame + * @cm_node: connection's node ionfo to use in frame + * @options: pointer to options info + * @hdr: pointer mpa header + * @pdata: pointer to private data + * @flags: indicates FIN or ACK + */ +static struct i40iw_puda_buf *i40iw_form_cm_frame(struct i40iw_cm_node *cm_node, + struct i40iw_kmem_info *options, + struct i40iw_kmem_info *hdr, + struct i40iw_kmem_info *pdata, + u8 flags) +{ + struct i40iw_puda_buf *sqbuf; + struct i40iw_sc_vsi *vsi = &cm_node->iwdev->vsi; + u8 *buf; + + struct tcphdr *tcph; + struct iphdr *iph; + struct ipv6hdr *ip6h; + struct ethhdr *ethh; + u16 packetsize; + u16 eth_hlen = ETH_HLEN; + u32 opts_len = 0; + u32 pd_len = 0; + u32 hdr_len = 0; + u16 vtag; + + sqbuf = i40iw_puda_get_bufpool(vsi->ilq); + if (!sqbuf) + return NULL; + buf = sqbuf->mem.va; + + if (options) + opts_len = (u32)options->size; + + if (hdr) + hdr_len = hdr->size; + + if (pdata) + pd_len = pdata->size; + + if (cm_node->vlan_id < VLAN_TAG_PRESENT) + eth_hlen += 4; + + if (cm_node->ipv4) + packetsize = sizeof(*iph) + sizeof(*tcph); + else + packetsize = sizeof(*ip6h) + sizeof(*tcph); + packetsize += opts_len + hdr_len + pd_len; + + memset(buf, 0x00, eth_hlen + packetsize); + + sqbuf->totallen = packetsize + eth_hlen; + sqbuf->maclen = eth_hlen; + sqbuf->tcphlen = sizeof(*tcph) + opts_len; + sqbuf->scratch = (void *)cm_node; + + ethh = (struct ethhdr *)buf; + buf += eth_hlen; + + if (cm_node->ipv4) { + sqbuf->ipv4 = true; + + iph = (struct iphdr *)buf; + buf += sizeof(*iph); + tcph = (struct tcphdr *)buf; + buf += sizeof(*tcph); + + ether_addr_copy(ethh->h_dest, cm_node->rem_mac); + ether_addr_copy(ethh->h_source, cm_node->loc_mac); + if (cm_node->vlan_id < VLAN_TAG_PRESENT) { + ((struct vlan_ethhdr *)ethh)->h_vlan_proto = htons(ETH_P_8021Q); + vtag = (cm_node->user_pri << VLAN_PRIO_SHIFT) | cm_node->vlan_id; + ((struct vlan_ethhdr *)ethh)->h_vlan_TCI = htons(vtag); + + ((struct vlan_ethhdr *)ethh)->h_vlan_encapsulated_proto = htons(ETH_P_IP); + } else { + ethh->h_proto = htons(ETH_P_IP); + } + + iph->version = IPVERSION; + iph->ihl = 5; /* 5 * 4Byte words, IP 
headr len */ + iph->tos = cm_node->tos; + iph->tot_len = htons(packetsize); + iph->id = htons(++cm_node->tcp_cntxt.loc_id); + + iph->frag_off = htons(0x4000); + iph->ttl = 0x40; + iph->protocol = IPPROTO_TCP; + iph->saddr = htonl(cm_node->loc_addr[0]); + iph->daddr = htonl(cm_node->rem_addr[0]); + } else { + sqbuf->ipv4 = false; + ip6h = (struct ipv6hdr *)buf; + buf += sizeof(*ip6h); + tcph = (struct tcphdr *)buf; + buf += sizeof(*tcph); + + ether_addr_copy(ethh->h_dest, cm_node->rem_mac); + ether_addr_copy(ethh->h_source, cm_node->loc_mac); + if (cm_node->vlan_id < VLAN_TAG_PRESENT) { + ((struct vlan_ethhdr *)ethh)->h_vlan_proto = htons(ETH_P_8021Q); + vtag = (cm_node->user_pri << VLAN_PRIO_SHIFT) | cm_node->vlan_id; + ((struct vlan_ethhdr *)ethh)->h_vlan_TCI = htons(vtag); + ((struct vlan_ethhdr *)ethh)->h_vlan_encapsulated_proto = htons(ETH_P_IPV6); + } else { + ethh->h_proto = htons(ETH_P_IPV6); + } + ip6h->version = 6; + ip6h->priority = cm_node->tos >> 4; + ip6h->flow_lbl[0] = cm_node->tos << 4; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + ip6h->payload_len = htons(packetsize - sizeof(*ip6h)); + ip6h->nexthdr = 6; + ip6h->hop_limit = 128; + i40iw_copy_ip_htonl(ip6h->saddr.in6_u.u6_addr32, + cm_node->loc_addr); + i40iw_copy_ip_htonl(ip6h->daddr.in6_u.u6_addr32, + cm_node->rem_addr); + } + + tcph->source = htons(cm_node->loc_port); + tcph->dest = htons(cm_node->rem_port); + + tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num); + + if (flags & SET_ACK) { + cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt; + tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num); + tcph->ack = 1; + } else { + tcph->ack_seq = 0; + } + + if (flags & SET_SYN) { + cm_node->tcp_cntxt.loc_seq_num++; + tcph->syn = 1; + } else { + cm_node->tcp_cntxt.loc_seq_num += hdr_len + pd_len; + } + + if (flags & SET_FIN) { + cm_node->tcp_cntxt.loc_seq_num++; + tcph->fin = 1; + } + + if (flags & SET_RST) + tcph->rst = 1; + + tcph->doff = (u16)((sizeof(*tcph) + opts_len + 3) >> 2); + sqbuf->tcphlen = tcph->doff << 2; + tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd); + tcph->urg_ptr = 0; + + if (opts_len) { + memcpy(buf, options->addr, opts_len); + buf += opts_len; + } + + if (hdr_len) { + memcpy(buf, hdr->addr, hdr_len); + buf += hdr_len; + } + + if (pdata && pdata->addr) + memcpy(buf, pdata->addr, pdata->size); + + atomic_set(&sqbuf->refcount, 1); + + return sqbuf; +} + +/** + * i40iw_send_reset - Send RST packet + * @cm_node: connection's node + */ +int i40iw_send_reset(struct i40iw_cm_node *cm_node) +{ + struct i40iw_puda_buf *sqbuf; + int flags = SET_RST | SET_ACK; + + sqbuf = i40iw_form_cm_frame(cm_node, NULL, NULL, NULL, flags); + if (!sqbuf) { + i40iw_pr_err("no sqbuf\n"); + return -1; + } + + return i40iw_schedule_cm_timer(cm_node, sqbuf, I40IW_TIMER_TYPE_SEND, 0, 1); +} + +/** + * i40iw_active_open_err - send event for active side cm error + * @cm_node: connection's node + * @reset: Flag to send reset or not + */ +static void i40iw_active_open_err(struct i40iw_cm_node *cm_node, bool reset) +{ + i40iw_cleanup_retrans_entry(cm_node); + cm_node->cm_core->stats_connect_errs++; + if (reset) { + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "%s cm_node=%p state=%d\n", + __func__, + cm_node, + cm_node->state); + atomic_inc(&cm_node->ref_count); + i40iw_send_reset(cm_node); + } + + cm_node->state = I40IW_CM_STATE_CLOSED; + i40iw_create_event(cm_node, I40IW_CM_EVENT_ABORTED); +} + +/** + * i40iw_passive_open_err - handle passive side cm error + * @cm_node: connection's node + * @reset: send reset or just 
free cm_node + */ +static void i40iw_passive_open_err(struct i40iw_cm_node *cm_node, bool reset) +{ + i40iw_cleanup_retrans_entry(cm_node); + cm_node->cm_core->stats_passive_errs++; + cm_node->state = I40IW_CM_STATE_CLOSED; + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "%s cm_node=%p state =%d\n", + __func__, + cm_node, + cm_node->state); + if (reset) + i40iw_send_reset(cm_node); + else + i40iw_rem_ref_cm_node(cm_node); +} + +/** + * i40iw_event_connect_error - to create connect error event + * @event: cm information for connect event + */ +static void i40iw_event_connect_error(struct i40iw_cm_event *event) +{ + struct i40iw_qp *iwqp; + struct iw_cm_id *cm_id; + + cm_id = event->cm_node->cm_id; + if (!cm_id) + return; + + iwqp = cm_id->provider_data; + + if (!iwqp || !iwqp->iwdev) + return; + + iwqp->cm_id = NULL; + cm_id->provider_data = NULL; + i40iw_send_cm_event(event->cm_node, cm_id, + IW_CM_EVENT_CONNECT_REPLY, + -ECONNRESET); + cm_id->rem_ref(cm_id); + i40iw_rem_ref_cm_node(event->cm_node); +} + +/** + * i40iw_process_options + * @cm_node: connection's node + * @optionsloc: point to start of options + * @optionsize: size of all options + * @syn_packet: flag if syn packet + */ +static int i40iw_process_options(struct i40iw_cm_node *cm_node, + u8 *optionsloc, + u32 optionsize, + u32 syn_packet) +{ + u32 tmp; + u32 offset = 0; + union all_known_options *all_options; + char got_mss_option = 0; + + while (offset < optionsize) { + all_options = (union all_known_options *)(optionsloc + offset); + switch (all_options->as_base.optionnum) { + case OPTION_NUMBER_END: + offset = optionsize; + break; + case OPTION_NUMBER_NONE: + offset += 1; + continue; + case OPTION_NUMBER_MSS: + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "%s: MSS Length: %d Offset: %d Size: %d\n", + __func__, + all_options->as_mss.length, + offset, + optionsize); + got_mss_option = 1; + if (all_options->as_mss.length != 4) + return -1; + tmp = ntohs(all_options->as_mss.mss); + if (tmp > 0 && tmp < cm_node->tcp_cntxt.mss) + cm_node->tcp_cntxt.mss = tmp; + break; + case OPTION_NUMBER_WINDOW_SCALE: + cm_node->tcp_cntxt.snd_wscale = + all_options->as_windowscale.shiftcount; + break; + default: + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "TCP Option not understood: %x\n", + all_options->as_base.optionnum); + break; + } + offset += all_options->as_base.length; + } + if (!got_mss_option && syn_packet) + cm_node->tcp_cntxt.mss = I40IW_CM_DEFAULT_MSS; + return 0; +} + +/** + * i40iw_handle_tcp_options - + * @cm_node: connection's node + * @tcph: pointer tcp header + * @optionsize: size of options rcvd + * @passive: active or passive flag + */ +static int i40iw_handle_tcp_options(struct i40iw_cm_node *cm_node, + struct tcphdr *tcph, + int optionsize, + int passive) +{ + u8 *optionsloc = (u8 *)&tcph[1]; + + if (optionsize) { + if (i40iw_process_options(cm_node, + optionsloc, + optionsize, + (u32)tcph->syn)) { + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "%s: Node %p, Sending RESET\n", + __func__, + cm_node); + if (passive) + i40iw_passive_open_err(cm_node, true); + else + i40iw_active_open_err(cm_node, true); + return -1; + } + } + + cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) << + cm_node->tcp_cntxt.snd_wscale; + + if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd) + cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd; + return 0; +} + +/** + * i40iw_build_mpa_v1 - build a MPA V1 frame + * @cm_node: connection's node + * @mpa_key: to do read0 or write0 + */ +static void i40iw_build_mpa_v1(struct 
i40iw_cm_node *cm_node, + void *start_addr, + u8 mpa_key) +{ + struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr; + + switch (mpa_key) { + case MPA_KEY_REQUEST: + memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE); + break; + case MPA_KEY_REPLY: + memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); + break; + default: + break; + } + mpa_frame->flags = IETF_MPA_FLAGS_CRC; + mpa_frame->rev = cm_node->mpa_frame_rev; + mpa_frame->priv_data_len = htons(cm_node->pdata.size); +} + +/** + * i40iw_build_mpa_v2 - build a MPA V2 frame + * @cm_node: connection's node + * @start_addr: buffer start address + * @mpa_key: to do read0 or write0 + */ +static void i40iw_build_mpa_v2(struct i40iw_cm_node *cm_node, + void *start_addr, + u8 mpa_key) +{ + struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr; + struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg; + u16 ctrl_ird, ctrl_ord; + + /* initialize the upper 5 bytes of the frame */ + i40iw_build_mpa_v1(cm_node, start_addr, mpa_key); + mpa_frame->flags |= IETF_MPA_V2_FLAG; + mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE); + + /* initialize RTR msg */ + if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + ctrl_ird = IETF_NO_IRD_ORD; + ctrl_ord = IETF_NO_IRD_ORD; + } else { + ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? + IETF_NO_IRD_ORD : cm_node->ird_size; + ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? + IETF_NO_IRD_ORD : cm_node->ord_size; + } + + ctrl_ird |= IETF_PEER_TO_PEER; + + switch (mpa_key) { + case MPA_KEY_REQUEST: + ctrl_ord |= IETF_RDMA0_WRITE; + ctrl_ord |= IETF_RDMA0_READ; + break; + case MPA_KEY_REPLY: + switch (cm_node->send_rdma0_op) { + case SEND_RDMA_WRITE_ZERO: + ctrl_ord |= IETF_RDMA0_WRITE; + break; + case SEND_RDMA_READ_ZERO: + ctrl_ord |= IETF_RDMA0_READ; + break; + } + break; + default: + break; + } + rtr_msg->ctrl_ird = htons(ctrl_ird); + rtr_msg->ctrl_ord = htons(ctrl_ord); +} + +/** + * i40iw_cm_build_mpa_frame - build mpa frame for mpa version 1 or version 2 + * @cm_node: connection's node + * @mpa: mpa: data buffer + * @mpa_key: to do read0 or write0 + */ +static int i40iw_cm_build_mpa_frame(struct i40iw_cm_node *cm_node, + struct i40iw_kmem_info *mpa, + u8 mpa_key) +{ + int hdr_len = 0; + + switch (cm_node->mpa_frame_rev) { + case IETF_MPA_V1: + hdr_len = sizeof(struct ietf_mpa_v1); + i40iw_build_mpa_v1(cm_node, mpa->addr, mpa_key); + break; + case IETF_MPA_V2: + hdr_len = sizeof(struct ietf_mpa_v2); + i40iw_build_mpa_v2(cm_node, mpa->addr, mpa_key); + break; + default: + break; + } + + return hdr_len; +} + +/** + * i40iw_send_mpa_request - active node send mpa request to passive node + * @cm_node: connection's node + */ +static int i40iw_send_mpa_request(struct i40iw_cm_node *cm_node) +{ + struct i40iw_puda_buf *sqbuf; + + if (!cm_node) { + i40iw_pr_err("cm_node == NULL\n"); + return -1; + } + + cm_node->mpa_hdr.addr = &cm_node->mpa_frame; + cm_node->mpa_hdr.size = i40iw_cm_build_mpa_frame(cm_node, + &cm_node->mpa_hdr, + MPA_KEY_REQUEST); + if (!cm_node->mpa_hdr.size) { + i40iw_pr_err("mpa size = %d\n", cm_node->mpa_hdr.size); + return -1; + } + + sqbuf = i40iw_form_cm_frame(cm_node, + NULL, + &cm_node->mpa_hdr, + &cm_node->pdata, + SET_ACK); + if (!sqbuf) { + i40iw_pr_err("sq_buf == NULL\n"); + return -1; + } + return i40iw_schedule_cm_timer(cm_node, sqbuf, I40IW_TIMER_TYPE_SEND, 1, 0); +} + +/** + * i40iw_send_mpa_reject - + * @cm_node: connection's node + * @pdata: reject data for connection + * @plen: length of reject data + */ +static int 
i40iw_send_mpa_reject(struct i40iw_cm_node *cm_node, + const void *pdata, + u8 plen) +{ + struct i40iw_puda_buf *sqbuf; + struct i40iw_kmem_info priv_info; + + cm_node->mpa_hdr.addr = &cm_node->mpa_frame; + cm_node->mpa_hdr.size = i40iw_cm_build_mpa_frame(cm_node, + &cm_node->mpa_hdr, + MPA_KEY_REPLY); + + cm_node->mpa_frame.flags |= IETF_MPA_FLAGS_REJECT; + priv_info.addr = (void *)pdata; + priv_info.size = plen; + + sqbuf = i40iw_form_cm_frame(cm_node, + NULL, + &cm_node->mpa_hdr, + &priv_info, + SET_ACK | SET_FIN); + if (!sqbuf) { + i40iw_pr_err("no sqbuf\n"); + return -ENOMEM; + } + cm_node->state = I40IW_CM_STATE_FIN_WAIT1; + return i40iw_schedule_cm_timer(cm_node, sqbuf, I40IW_TIMER_TYPE_SEND, 1, 0); +} + +/** + * recv_mpa - process an IETF MPA frame + * @cm_node: connection's node + * @buffer: Data pointer + * @type: to return accept or reject + * @len: Len of mpa buffer + */ +static int i40iw_parse_mpa(struct i40iw_cm_node *cm_node, u8 *buffer, u32 *type, u32 len) +{ + struct ietf_mpa_v1 *mpa_frame; + struct ietf_mpa_v2 *mpa_v2_frame; + struct ietf_rtr_msg *rtr_msg; + int mpa_hdr_len; + int priv_data_len; + + *type = I40IW_MPA_REQUEST_ACCEPT; + + if (len < sizeof(struct ietf_mpa_v1)) { + i40iw_pr_err("ietf buffer small (%x)\n", len); + return -1; + } + + mpa_frame = (struct ietf_mpa_v1 *)buffer; + mpa_hdr_len = sizeof(struct ietf_mpa_v1); + priv_data_len = ntohs(mpa_frame->priv_data_len); + + if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) { + i40iw_pr_err("large pri_data %d\n", priv_data_len); + return -1; + } + if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) { + i40iw_pr_err("unsupported mpa rev = %d\n", mpa_frame->rev); + return -1; + } + if (mpa_frame->rev > cm_node->mpa_frame_rev) { + i40iw_pr_err("rev %d\n", mpa_frame->rev); + return -1; + } + cm_node->mpa_frame_rev = mpa_frame->rev; + + if (cm_node->state != I40IW_CM_STATE_MPAREQ_SENT) { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) { + i40iw_pr_err("Unexpected MPA Key received\n"); + return -1; + } + } else { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) { + i40iw_pr_err("Unexpected MPA Key received\n"); + return -1; + } + } + + if (priv_data_len + mpa_hdr_len > len) { + i40iw_pr_err("ietf buffer len(%x + %x != %x)\n", + priv_data_len, mpa_hdr_len, len); + return -1; + } + if (len > MAX_CM_BUFFER) { + i40iw_pr_err("ietf buffer large len = %d\n", len); + return -1; + } + + switch (mpa_frame->rev) { + case IETF_MPA_V2:{ + u16 ird_size; + u16 ord_size; + u16 ctrl_ord; + u16 ctrl_ird; + + mpa_v2_frame = (struct ietf_mpa_v2 *)buffer; + mpa_hdr_len += IETF_RTR_MSG_SIZE; + rtr_msg = &mpa_v2_frame->rtr_msg; + + /* parse rtr message */ + ctrl_ord = ntohs(rtr_msg->ctrl_ord); + ctrl_ird = ntohs(rtr_msg->ctrl_ird); + ird_size = ctrl_ird & IETF_NO_IRD_ORD; + ord_size = ctrl_ord & IETF_NO_IRD_ORD; + + if (!(ctrl_ird & IETF_PEER_TO_PEER)) + return -1; + + if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD) { + cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD; + goto negotiate_done; + } + + if (cm_node->state != I40IW_CM_STATE_MPAREQ_SENT) { + /* responder */ + if (!ord_size && (ctrl_ord & IETF_RDMA0_READ)) + cm_node->ird_size = 1; + if (cm_node->ord_size > ird_size) + cm_node->ord_size = ird_size; + } else { + /* initiator */ + if (!ird_size && (ctrl_ord & IETF_RDMA0_READ)) + return -1; + if (cm_node->ord_size > ird_size) + cm_node->ord_size = ird_size; + + if (cm_node->ird_size < ord_size) + /* no resources available */ + return -1; + } + +negotiate_done: + if (ctrl_ord & 
IETF_RDMA0_READ) + cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + else if (ctrl_ord & IETF_RDMA0_WRITE) + cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO; + else /* Not supported RDMA0 operation */ + return -1; + i40iw_debug(cm_node->dev, I40IW_DEBUG_CM, + "MPAV2: Negotiated ORD: %d, IRD: %d\n", + cm_node->ord_size, cm_node->ird_size); + break; + } + break; + case IETF_MPA_V1: + default: + break; + } + + memcpy(cm_node->pdata_buf, buffer + mpa_hdr_len, priv_data_len); + cm_node->pdata.size = priv_data_len; + + if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT) + *type = I40IW_MPA_REQUEST_REJECT; + + if (mpa_frame->flags & IETF_MPA_FLAGS_MARKERS) + cm_node->snd_mark_en = true; + + return 0; +} + +/** + * i40iw_schedule_cm_timer + * @@cm_node: connection's node + * @sqbuf: buffer to send + * @type: if it is send or close + * @send_retrans: if rexmits to be done + * @close_when_complete: is cm_node to be removed + * + * note - cm_node needs to be protected before calling this. Encase in: + * i40iw_rem_ref_cm_node(cm_core, cm_node); + * i40iw_schedule_cm_timer(...) + * atomic_inc(&cm_node->ref_count); + */ +int i40iw_schedule_cm_timer(struct i40iw_cm_node *cm_node, + struct i40iw_puda_buf *sqbuf, + enum i40iw_timer_type type, + int send_retrans, + int close_when_complete) +{ + struct i40iw_sc_vsi *vsi = &cm_node->iwdev->vsi; + struct i40iw_cm_core *cm_core = cm_node->cm_core; + struct i40iw_timer_entry *new_send; + int ret = 0; + u32 was_timer_set; + unsigned long flags; + + new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC); + if (!new_send) { + if (type != I40IW_TIMER_TYPE_CLOSE) + i40iw_free_sqbuf(vsi, (void *)sqbuf); + return -ENOMEM; + } + new_send->retrycount = I40IW_DEFAULT_RETRYS; + new_send->retranscount = I40IW_DEFAULT_RETRANS; + new_send->sqbuf = sqbuf; + new_send->timetosend = jiffies; + new_send->type = type; + new_send->send_retrans = send_retrans; + new_send->close_when_complete = close_when_complete; + + if (type == I40IW_TIMER_TYPE_CLOSE) { + new_send->timetosend += (HZ / 10); + if (cm_node->close_entry) { + kfree(new_send); + i40iw_pr_err("already close entry\n"); + return -EINVAL; + } + cm_node->close_entry = new_send; + } + + if (type == I40IW_TIMER_TYPE_SEND) { + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + cm_node->send_entry = new_send; + atomic_inc(&cm_node->ref_count); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + new_send->timetosend = jiffies + I40IW_RETRY_TIMEOUT; + + atomic_inc(&sqbuf->refcount); + i40iw_puda_send_buf(vsi->ilq, sqbuf); + if (!send_retrans) { + i40iw_cleanup_retrans_entry(cm_node); + if (close_when_complete) + i40iw_rem_ref_cm_node(cm_node); + return ret; + } + } + + spin_lock_irqsave(&cm_core->ht_lock, flags); + was_timer_set = timer_pending(&cm_core->tcp_timer); + + if (!was_timer_set) { + cm_core->tcp_timer.expires = new_send->timetosend; + add_timer(&cm_core->tcp_timer); + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + return ret; +} + +/** + * i40iw_retrans_expired - Could not rexmit the packet + * @cm_node: connection's node + */ +static void i40iw_retrans_expired(struct i40iw_cm_node *cm_node) +{ + struct iw_cm_id *cm_id = cm_node->cm_id; + enum i40iw_cm_node_state state = cm_node->state; + + cm_node->state = I40IW_CM_STATE_CLOSED; + switch (state) { + case I40IW_CM_STATE_SYN_RCVD: + case I40IW_CM_STATE_CLOSING: + i40iw_rem_ref_cm_node(cm_node); + break; + case I40IW_CM_STATE_FIN_WAIT1: + case I40IW_CM_STATE_LAST_ACK: + if (cm_node->cm_id) + cm_id->rem_ref(cm_id); + i40iw_send_reset(cm_node); + break; + 
default: + atomic_inc(&cm_node->ref_count); + i40iw_send_reset(cm_node); + i40iw_create_event(cm_node, I40IW_CM_EVENT_ABORTED); + break; + } +} + +/** + * i40iw_handle_close_entry - for handling retry/timeouts + * @cm_node: connection's node + * @rem_node: flag for remove cm_node + */ +static void i40iw_handle_close_entry(struct i40iw_cm_node *cm_node, u32 rem_node) +{ + struct i40iw_timer_entry *close_entry = cm_node->close_entry; + struct iw_cm_id *cm_id = cm_node->cm_id; + struct i40iw_qp *iwqp; + unsigned long flags; + + if (!close_entry) + return; + iwqp = (struct i40iw_qp *)close_entry->sqbuf; + if (iwqp) { + spin_lock_irqsave(&iwqp->lock, flags); + if (iwqp->cm_id) { + iwqp->hw_tcp_state = I40IW_TCP_STATE_CLOSED; + iwqp->hw_iwarp_state = I40IW_QP_STATE_ERROR; + iwqp->last_aeq = I40IW_AE_RESET_SENT; + iwqp->ibqp_state = IB_QPS_ERR; + spin_unlock_irqrestore(&iwqp->lock, flags); + i40iw_cm_disconn(iwqp); + } else { + spin_unlock_irqrestore(&iwqp->lock, flags); + } + } else if (rem_node) { + /* TIME_WAIT state */ + i40iw_rem_ref_cm_node(cm_node); + } + if (cm_id) + cm_id->rem_ref(cm_id); + kfree(close_entry); + cm_node->close_entry = NULL; +} + +/** + * i40iw_build_timer_list - Add cm_nodes to timer list + * @timer_list: ptr to timer list + * @hte: ptr to accelerated or non-accelerated list + */ +static void i40iw_build_timer_list(struct list_head *timer_list, + struct list_head *hte) +{ + struct i40iw_cm_node *cm_node; + struct list_head *list_core_temp, *list_node; + + list_for_each_safe(list_node, list_core_temp, hte) { + cm_node = container_of(list_node, struct i40iw_cm_node, list); + if (cm_node->close_entry || cm_node->send_entry) { + atomic_inc(&cm_node->ref_count); + list_add(&cm_node->timer_entry, timer_list); + } + } +} + +/** + * i40iw_cm_timer_tick - system's timer expired callback + * @pass: Pointing to cm_core + */ +static void i40iw_cm_timer_tick(struct timer_list *t) +{ + unsigned long nexttimeout = jiffies + I40IW_LONG_TIME; + struct i40iw_cm_node *cm_node; + struct i40iw_timer_entry *send_entry, *close_entry; + struct list_head *list_core_temp; + struct i40iw_sc_vsi *vsi; + struct list_head *list_node; + struct i40iw_cm_core *cm_core = from_timer(cm_core, t, tcp_timer); + u32 settimer = 0; + unsigned long timetosend; + unsigned long flags; + + struct list_head timer_list; + + INIT_LIST_HEAD(&timer_list); + + spin_lock_irqsave(&cm_core->ht_lock, flags); + i40iw_build_timer_list(&timer_list, &cm_core->non_accelerated_list); + i40iw_build_timer_list(&timer_list, &cm_core->accelerated_list); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + list_for_each_safe(list_node, list_core_temp, &timer_list) { + cm_node = container_of(list_node, + struct i40iw_cm_node, + timer_entry); + close_entry = cm_node->close_entry; + + if (close_entry) { + if (time_after(close_entry->timetosend, jiffies)) { + if (nexttimeout > close_entry->timetosend || + !settimer) { + nexttimeout = close_entry->timetosend; + settimer = 1; + } + } else { + i40iw_handle_close_entry(cm_node, 1); + } + } + + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + + send_entry = cm_node->send_entry; + if (!send_entry) + goto done; + if (time_after(send_entry->timetosend, jiffies)) { + if (cm_node->state != I40IW_CM_STATE_OFFLOADED) { + if ((nexttimeout > send_entry->timetosend) || + !settimer) { + nexttimeout = send_entry->timetosend; + settimer = 1; + } + } else { + i40iw_free_retrans_entry(cm_node); + } + goto done; + } + + if ((cm_node->state == I40IW_CM_STATE_OFFLOADED) || + (cm_node->state == 
I40IW_CM_STATE_CLOSED)) { + i40iw_free_retrans_entry(cm_node); + goto done; + } + + if (!send_entry->retranscount || !send_entry->retrycount) { + i40iw_free_retrans_entry(cm_node); + + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + i40iw_retrans_expired(cm_node); + cm_node->state = I40IW_CM_STATE_CLOSED; + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + goto done; + } + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + + vsi = &cm_node->iwdev->vsi; + + if (!cm_node->ack_rcvd) { + atomic_inc(&send_entry->sqbuf->refcount); + i40iw_puda_send_buf(vsi->ilq, send_entry->sqbuf); + cm_node->cm_core->stats_pkt_retrans++; + } + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + if (send_entry->send_retrans) { + send_entry->retranscount--; + timetosend = (I40IW_RETRY_TIMEOUT << + (I40IW_DEFAULT_RETRANS - + send_entry->retranscount)); + + send_entry->timetosend = jiffies + + min(timetosend, I40IW_MAX_TIMEOUT); + if (nexttimeout > send_entry->timetosend || !settimer) { + nexttimeout = send_entry->timetosend; + settimer = 1; + } + } else { + int close_when_complete; + + close_when_complete = send_entry->close_when_complete; + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "cm_node=%p state=%d\n", + cm_node, + cm_node->state); + i40iw_free_retrans_entry(cm_node); + if (close_when_complete) + i40iw_rem_ref_cm_node(cm_node); + } +done: + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + i40iw_rem_ref_cm_node(cm_node); + } + + if (settimer) { + spin_lock_irqsave(&cm_core->ht_lock, flags); + if (!timer_pending(&cm_core->tcp_timer)) { + cm_core->tcp_timer.expires = nexttimeout; + add_timer(&cm_core->tcp_timer); + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + } +} + +/** + * i40iw_send_syn - send SYN packet + * @cm_node: connection's node + * @sendack: flag to set ACK bit or not + */ +int i40iw_send_syn(struct i40iw_cm_node *cm_node, u32 sendack) +{ + struct i40iw_puda_buf *sqbuf; + int flags = SET_SYN; + char optionsbuffer[sizeof(struct option_mss) + + sizeof(struct option_windowscale) + + sizeof(struct option_base) + TCP_OPTIONS_PADDING]; + struct i40iw_kmem_info opts; + + int optionssize = 0; + /* Sending MSS option */ + union all_known_options *options; + + opts.addr = optionsbuffer; + if (!cm_node) { + i40iw_pr_err("no cm_node\n"); + return -EINVAL; + } + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_mss.optionnum = OPTION_NUMBER_MSS; + options->as_mss.length = sizeof(struct option_mss); + options->as_mss.mss = htons(cm_node->tcp_cntxt.mss); + optionssize += sizeof(struct option_mss); + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE; + options->as_windowscale.length = sizeof(struct option_windowscale); + options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale; + optionssize += sizeof(struct option_windowscale); + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_end = OPTION_NUMBER_END; + optionssize += 1; + + if (sendack) + flags |= SET_ACK; + + opts.size = optionssize; + + sqbuf = i40iw_form_cm_frame(cm_node, &opts, NULL, NULL, flags); + if (!sqbuf) { + i40iw_pr_err("no sqbuf\n"); + return -1; + } + return i40iw_schedule_cm_timer(cm_node, sqbuf, I40IW_TIMER_TYPE_SEND, 1, 0); +} + +/** + * i40iw_send_ack - Send ACK packet + * @cm_node: connection's node + */ +static void i40iw_send_ack(struct i40iw_cm_node *cm_node) +{ + struct i40iw_puda_buf *sqbuf; + struct i40iw_sc_vsi *vsi = 
&cm_node->iwdev->vsi; + + sqbuf = i40iw_form_cm_frame(cm_node, NULL, NULL, NULL, SET_ACK); + if (sqbuf) + i40iw_puda_send_buf(vsi->ilq, sqbuf); + else + i40iw_pr_err("no sqbuf\n"); +} + +/** + * i40iw_send_fin - Send FIN pkt + * @cm_node: connection's node + */ +static int i40iw_send_fin(struct i40iw_cm_node *cm_node) +{ + struct i40iw_puda_buf *sqbuf; + + sqbuf = i40iw_form_cm_frame(cm_node, NULL, NULL, NULL, SET_ACK | SET_FIN); + if (!sqbuf) { + i40iw_pr_err("no sqbuf\n"); + return -1; + } + return i40iw_schedule_cm_timer(cm_node, sqbuf, I40IW_TIMER_TYPE_SEND, 1, 0); +} + +/** + * i40iw_find_node - find a cm node that matches the reference cm node + * @cm_core: cm's core + * @rem_port: remote tcp port num + * @rem_addr: remote ip addr + * @loc_port: local tcp port num + * @loc_addr: loc ip addr + * @add_refcnt: flag to increment refcount of cm_node + * @accelerated_list: flag for accelerated vs non-accelerated list to search + */ +struct i40iw_cm_node *i40iw_find_node(struct i40iw_cm_core *cm_core, + u16 rem_port, + u32 *rem_addr, + u16 loc_port, + u32 *loc_addr, + bool add_refcnt, + bool accelerated_list) +{ + struct list_head *hte; + struct i40iw_cm_node *cm_node; + unsigned long flags; + + hte = accelerated_list ? + &cm_core->accelerated_list : &cm_core->non_accelerated_list; + + /* walk list and find cm_node associated with this session ID */ + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_entry(cm_node, hte, list) { + if (!memcmp(cm_node->loc_addr, loc_addr, sizeof(cm_node->loc_addr)) && + (cm_node->loc_port == loc_port) && + !memcmp(cm_node->rem_addr, rem_addr, sizeof(cm_node->rem_addr)) && + (cm_node->rem_port == rem_port)) { + if (add_refcnt) + atomic_inc(&cm_node->ref_count); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + return cm_node; + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + /* no owner node */ + return NULL; +} + +/** + * i40iw_find_listener - find a cm node listening on this addr-port pair + * @cm_core: cm's core + * @dst_port: listener tcp port num + * @dst_addr: listener ip addr + * @listener_state: state to match with listen node's + */ +static struct i40iw_cm_listener *i40iw_find_listener( + struct i40iw_cm_core *cm_core, + u32 *dst_addr, + u16 dst_port, + u16 vlan_id, + enum i40iw_cm_listener_state + listener_state) +{ + struct i40iw_cm_listener *listen_node; + static const u32 ip_zero[4] = { 0, 0, 0, 0 }; + u32 listen_addr[4]; + u16 listen_port; + unsigned long flags; + + /* walk list and find cm_node associated with this session ID */ + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { + memcpy(listen_addr, listen_node->loc_addr, sizeof(listen_addr)); + listen_port = listen_node->loc_port; + /* compare node pair, return node handle if a match */ + if ((!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) || + !memcmp(listen_addr, ip_zero, sizeof(listen_addr))) && + (listen_port == dst_port) && + (listener_state & listen_node->listener_state)) { + atomic_inc(&listen_node->ref_count); + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + return listen_node; + } + } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + return NULL; +} + +/** + * i40iw_add_hte_node - add a cm node to the hash table + * @cm_core: cm's core + * @cm_node: connection's node + */ +static void i40iw_add_hte_node(struct i40iw_cm_core *cm_core, + struct i40iw_cm_node *cm_node) +{ + unsigned long flags; + + if (!cm_node || !cm_core) { + i40iw_pr_err("cm_node or 
cm_core == NULL\n"); + return; + } + + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_add_tail(&cm_node->list, &cm_core->non_accelerated_list); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); +} + +/** + * i40iw_find_port - find port that matches reference port + * @port: port number + * @accelerated_list: flag for accelerated vs non-accelerated list + */ +static bool i40iw_find_port(struct i40iw_cm_core *cm_core, u16 port, + bool accelerated_list) +{ + struct list_head *hte; + struct i40iw_cm_node *cm_node; + + hte = accelerated_list ? + &cm_core->accelerated_list : &cm_core->non_accelerated_list; + + list_for_each_entry(cm_node, hte, list) { + if (cm_node->loc_port == port) + return true; + } + return false; +} + +/** + * i40iw_port_in_use - determine if port is in use + * @port: port number + * @active_side: flag for listener side vs active side + */ +static bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port, bool active_side) +{ + struct i40iw_cm_listener *listen_node; + unsigned long flags; + bool ret = false; + + if (active_side) { + spin_lock_irqsave(&cm_core->ht_lock, flags); + ret = i40iw_find_port(cm_core, port, true); + if (!ret) + ret = i40iw_find_port(cm_core, port, false); + if (!ret) + clear_bit(port, cm_core->active_side_ports); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + } else { + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { + if (listen_node->loc_port == port) { + ret = true; + break; + } + } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + } + + return ret; +} + +/** + * i40iw_del_multiple_qhash - Remove qhash and child listens + * @iwdev: iWarp device + * @cm_info: CM info for parent listen node + * @cm_parent_listen_node: The parent listen node + */ +static enum i40iw_status_code i40iw_del_multiple_qhash( + struct i40iw_device *iwdev, + struct i40iw_cm_info *cm_info, + struct i40iw_cm_listener *cm_parent_listen_node) +{ + struct i40iw_cm_listener *child_listen_node; + enum i40iw_status_code ret = I40IW_ERR_CONFIG; + struct list_head *pos, *tpos; + unsigned long flags; + + spin_lock_irqsave(&iwdev->cm_core.listen_list_lock, flags); + list_for_each_safe(pos, tpos, &cm_parent_listen_node->child_listen_list) { + child_listen_node = list_entry(pos, struct i40iw_cm_listener, child_listen_list); + if (child_listen_node->ipv4) + i40iw_debug(&iwdev->sc_dev, + I40IW_DEBUG_CM, + "removing child listen for IP=%pI4, port=%d, vlan=%d\n", + child_listen_node->loc_addr, + child_listen_node->loc_port, + child_listen_node->vlan_id); + else + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, + "removing child listen for IP=%pI6, port=%d, vlan=%d\n", + child_listen_node->loc_addr, + child_listen_node->loc_port, + child_listen_node->vlan_id); + list_del(pos); + memcpy(cm_info->loc_addr, child_listen_node->loc_addr, + sizeof(cm_info->loc_addr)); + cm_info->vlan_id = child_listen_node->vlan_id; + if (child_listen_node->qhash_set) { + ret = i40iw_manage_qhash(iwdev, cm_info, + I40IW_QHASH_TYPE_TCP_SYN, + I40IW_QHASH_MANAGE_TYPE_DELETE, + NULL, false); + child_listen_node->qhash_set = false; + } else { + ret = I40IW_SUCCESS; + } + i40iw_debug(&iwdev->sc_dev, + I40IW_DEBUG_CM, + "freed pointer = %p\n", + child_listen_node); + kfree(child_listen_node); + cm_parent_listen_node->cm_core->stats_listen_nodes_destroyed++; + } + spin_unlock_irqrestore(&iwdev->cm_core.listen_list_lock, flags); + + return ret; +} + +/** + * i40iw_netdev_vlan_ipv6 - Gets the netdev and vlan + * @addr: local 
IPv6 address + * @vlan_id: vlan id for the given IPv6 address + * + * Returns the net_device of the IPv6 address and also sets the + * vlan id for that address. + */ +static struct net_device *i40iw_netdev_vlan_ipv6(u32 *addr, u16 *vlan_id) +{ + struct net_device *ip_dev = NULL; + struct in6_addr laddr6; + + if (!IS_ENABLED(CONFIG_IPV6)) + return NULL; + i40iw_copy_ip_htonl(laddr6.in6_u.u6_addr32, addr); + if (vlan_id) + *vlan_id = I40IW_NO_VLAN; + rcu_read_lock(); + for_each_netdev_rcu(&init_net, ip_dev) { + if (ipv6_chk_addr(&init_net, &laddr6, ip_dev, 1)) { + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(ip_dev); + break; + } + } + rcu_read_unlock(); + return ip_dev; +} + +/** + * i40iw_get_vlan_ipv4 - Returns the vlan_id for IPv4 address + * @addr: local IPv4 address + */ +static u16 i40iw_get_vlan_ipv4(u32 *addr) +{ + struct net_device *netdev; + u16 vlan_id = I40IW_NO_VLAN; + + netdev = ip_dev_find(&init_net, htonl(addr[0])); + if (netdev) { + vlan_id = rdma_vlan_dev_vlan_id(netdev); + dev_put(netdev); + } + return vlan_id; +} + +/** + * i40iw_add_mqh_6 - Adds multiple qhashes for IPv6 + * @iwdev: iWarp device + * @cm_info: CM info for parent listen node + * @cm_parent_listen_node: The parent listen node + * + * Adds a qhash and a child listen node for every IPv6 address + * on the adapter and adds the associated qhash filter + */ +static enum i40iw_status_code i40iw_add_mqh_6(struct i40iw_device *iwdev, + struct i40iw_cm_info *cm_info, + struct i40iw_cm_listener *cm_parent_listen_node) +{ + struct net_device *ip_dev; + struct inet6_dev *idev; + struct inet6_ifaddr *ifp, *tmp; + enum i40iw_status_code ret = 0; + struct i40iw_cm_listener *child_listen_node; + unsigned long flags; + + rtnl_lock(); + for_each_netdev_rcu(&init_net, ip_dev) { + if ((((rdma_vlan_dev_vlan_id(ip_dev) < I40IW_NO_VLAN) && + (rdma_vlan_dev_real_dev(ip_dev) == iwdev->netdev)) || + (ip_dev == iwdev->netdev)) && (ip_dev->flags & IFF_UP)) { + idev = __in6_dev_get(ip_dev); + if (!idev) { + i40iw_pr_err("idev == NULL\n"); + break; + } + list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { + i40iw_debug(&iwdev->sc_dev, + I40IW_DEBUG_CM, + "IP=%pI6, vlan_id=%d, MAC=%pM\n", + &ifp->addr, + rdma_vlan_dev_vlan_id(ip_dev), + ip_dev->dev_addr); + child_listen_node = + kzalloc(sizeof(*child_listen_node), GFP_ATOMIC); + i40iw_debug(&iwdev->sc_dev, + I40IW_DEBUG_CM, + "Allocating child listener %p\n", + child_listen_node); + if (!child_listen_node) { + ret = I40IW_ERR_NO_MEMORY; + goto exit; + } + cm_info->vlan_id = rdma_vlan_dev_vlan_id(ip_dev); + cm_parent_listen_node->vlan_id = cm_info->vlan_id; + + memcpy(child_listen_node, cm_parent_listen_node, + sizeof(*child_listen_node)); + + i40iw_copy_ip_ntohl(child_listen_node->loc_addr, + ifp->addr.in6_u.u6_addr32); + memcpy(cm_info->loc_addr, child_listen_node->loc_addr, + sizeof(cm_info->loc_addr)); + + ret = i40iw_manage_qhash(iwdev, cm_info, + I40IW_QHASH_TYPE_TCP_SYN, + I40IW_QHASH_MANAGE_TYPE_ADD, + NULL, true); + if (!ret) { + child_listen_node->qhash_set = true; + spin_lock_irqsave(&iwdev->cm_core.listen_list_lock, flags); + list_add(&child_listen_node->child_listen_list, + &cm_parent_listen_node->child_listen_list); + spin_unlock_irqrestore(&iwdev->cm_core.listen_list_lock, flags); + cm_parent_listen_node->cm_core->stats_listen_nodes_created++; + } else { + kfree(child_listen_node); + } + } + } + } +exit: + rtnl_unlock(); + return ret; +} + +/** + * i40iw_add_mqh_4 - Adds multiple qhashes for IPv4 + * @iwdev: iWarp device + * @cm_info: CM info for parent 
listen node + * @cm_parent_listen_node: The parent listen node + * + * Adds a qhash and a child listen node for every IPv4 address + * on the adapter and adds the associated qhash filter + */ +static enum i40iw_status_code i40iw_add_mqh_4( + struct i40iw_device *iwdev, + struct i40iw_cm_info *cm_info, + struct i40iw_cm_listener *cm_parent_listen_node) +{ + struct net_device *dev; + struct in_device *idev; + struct i40iw_cm_listener *child_listen_node; + enum i40iw_status_code ret = 0; + unsigned long flags; + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + if ((((rdma_vlan_dev_vlan_id(dev) < I40IW_NO_VLAN) && + (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) || + (dev == iwdev->netdev)) && (dev->flags & IFF_UP)) { + idev = in_dev_get(dev); + for_ifa(idev) { + i40iw_debug(&iwdev->sc_dev, + I40IW_DEBUG_CM, + "Allocating child CM Listener forIP=%pI4, vlan_id=%d, MAC=%pM\n", + &ifa->ifa_address, + rdma_vlan_dev_vlan_id(dev), + dev->dev_addr); + child_listen_node = kzalloc(sizeof(*child_listen_node), GFP_ATOMIC); + cm_parent_listen_node->cm_core->stats_listen_nodes_created++; + i40iw_debug(&iwdev->sc_dev, + I40IW_DEBUG_CM, + "Allocating child listener %p\n", + child_listen_node); + if (!child_listen_node) { + in_dev_put(idev); + ret = I40IW_ERR_NO_MEMORY; + goto exit; + } + cm_info->vlan_id = rdma_vlan_dev_vlan_id(dev); + cm_parent_listen_node->vlan_id = cm_info->vlan_id; + memcpy(child_listen_node, + cm_parent_listen_node, + sizeof(*child_listen_node)); + + child_listen_node->loc_addr[0] = ntohl(ifa->ifa_address); + memcpy(cm_info->loc_addr, child_listen_node->loc_addr, + sizeof(cm_info->loc_addr)); + + ret = i40iw_manage_qhash(iwdev, + cm_info, + I40IW_QHASH_TYPE_TCP_SYN, + I40IW_QHASH_MANAGE_TYPE_ADD, + NULL, + true); + if (!ret) { + child_listen_node->qhash_set = true; + spin_lock_irqsave(&iwdev->cm_core.listen_list_lock, flags); + list_add(&child_listen_node->child_listen_list, + &cm_parent_listen_node->child_listen_list); + spin_unlock_irqrestore(&iwdev->cm_core.listen_list_lock, flags); + } else { + kfree(child_listen_node); + cm_parent_listen_node->cm_core->stats_listen_nodes_created--; + } + } + endfor_ifa(idev); + in_dev_put(idev); + } + } +exit: + rtnl_unlock(); + return ret; +} + +/** + * i40iw_dec_refcnt_listen - delete listener and associated cm nodes + * @cm_core: cm's core + * @free_hanging_nodes: to free associated cm_nodes + * @apbvt_del: flag to delete the apbvt + */ +static int i40iw_dec_refcnt_listen(struct i40iw_cm_core *cm_core, + struct i40iw_cm_listener *listener, + int free_hanging_nodes, bool apbvt_del) +{ + int ret = -EINVAL; + int err = 0; + struct list_head *list_pos; + struct list_head *list_temp; + struct i40iw_cm_node *cm_node; + struct list_head reset_list; + struct i40iw_cm_info nfo; + struct i40iw_cm_node *loopback; + enum i40iw_cm_node_state old_state; + unsigned long flags; + + /* free non-accelerated child nodes for this listener */ + INIT_LIST_HEAD(&reset_list); + if (free_hanging_nodes) { + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_safe(list_pos, + list_temp, &cm_core->non_accelerated_list) { + cm_node = container_of(list_pos, struct i40iw_cm_node, list); + if ((cm_node->listener == listener) && + !cm_node->accelerated) { + atomic_inc(&cm_node->ref_count); + list_add(&cm_node->reset_entry, &reset_list); + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + } + + list_for_each_safe(list_pos, list_temp, &reset_list) { + cm_node = container_of(list_pos, struct i40iw_cm_node, reset_entry); + loopback = 
cm_node->loopbackpartner; + if (cm_node->state >= I40IW_CM_STATE_FIN_WAIT1) { + i40iw_rem_ref_cm_node(cm_node); + } else { + if (!loopback) { + i40iw_cleanup_retrans_entry(cm_node); + err = i40iw_send_reset(cm_node); + if (err) { + cm_node->state = I40IW_CM_STATE_CLOSED; + i40iw_pr_err("send reset\n"); + } else { + old_state = cm_node->state; + cm_node->state = I40IW_CM_STATE_LISTENER_DESTROYED; + if (old_state != I40IW_CM_STATE_MPAREQ_RCVD) + i40iw_rem_ref_cm_node(cm_node); + } + } else { + struct i40iw_cm_event event; + + event.cm_node = loopback; + memcpy(event.cm_info.rem_addr, + loopback->rem_addr, sizeof(event.cm_info.rem_addr)); + memcpy(event.cm_info.loc_addr, + loopback->loc_addr, sizeof(event.cm_info.loc_addr)); + event.cm_info.rem_port = loopback->rem_port; + event.cm_info.loc_port = loopback->loc_port; + event.cm_info.cm_id = loopback->cm_id; + event.cm_info.ipv4 = loopback->ipv4; + atomic_inc(&loopback->ref_count); + loopback->state = I40IW_CM_STATE_CLOSED; + i40iw_event_connect_error(&event); + cm_node->state = I40IW_CM_STATE_LISTENER_DESTROYED; + i40iw_rem_ref_cm_node(cm_node); + } + } + } + + if (!atomic_dec_return(&listener->ref_count)) { + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_del(&listener->list); + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + + if (listener->iwdev) { + if (apbvt_del && !i40iw_port_in_use(cm_core, listener->loc_port, false)) + i40iw_manage_apbvt(listener->iwdev, + listener->loc_port, + I40IW_MANAGE_APBVT_DEL); + + memcpy(nfo.loc_addr, listener->loc_addr, sizeof(nfo.loc_addr)); + nfo.loc_port = listener->loc_port; + nfo.ipv4 = listener->ipv4; + nfo.vlan_id = listener->vlan_id; + nfo.user_pri = listener->user_pri; + + if (!list_empty(&listener->child_listen_list)) { + i40iw_del_multiple_qhash(listener->iwdev, &nfo, listener); + } else { + if (listener->qhash_set) + i40iw_manage_qhash(listener->iwdev, + &nfo, + I40IW_QHASH_TYPE_TCP_SYN, + I40IW_QHASH_MANAGE_TYPE_DELETE, + NULL, + false); + } + } + + cm_core->stats_listen_destroyed++; + kfree(listener); + cm_core->stats_listen_nodes_destroyed++; + listener = NULL; + ret = 0; + } + + if (listener) { + if (atomic_read(&listener->pend_accepts_cnt) > 0) + i40iw_debug(cm_core->dev, + I40IW_DEBUG_CM, + "%s: listener (%p) pending accepts=%u\n", + __func__, + listener, + atomic_read(&listener->pend_accepts_cnt)); + } + + return ret; +} + +/** + * i40iw_cm_del_listen - delete a linstener + * @cm_core: cm's core + * @listener: passive connection's listener + * @apbvt_del: flag to delete apbvt + */ +static int i40iw_cm_del_listen(struct i40iw_cm_core *cm_core, + struct i40iw_cm_listener *listener, + bool apbvt_del) +{ + listener->listener_state = I40IW_CM_LISTENER_PASSIVE_STATE; + listener->cm_id = NULL; /* going to be destroyed pretty soon */ + return i40iw_dec_refcnt_listen(cm_core, listener, 1, apbvt_del); +} + +/** + * i40iw_addr_resolve_neigh - resolve neighbor address + * @iwdev: iwarp device structure + * @src_ip: local ip address + * @dst_ip: remote ip address + * @arpindex: if there is an arp entry + */ +static int i40iw_addr_resolve_neigh(struct i40iw_device *iwdev, + u32 src_ip, + u32 dst_ip, + int arpindex) +{ + struct rtable *rt; + struct neighbour *neigh; + int rc = arpindex; + struct net_device *netdev = iwdev->netdev; + __be32 dst_ipaddr = htonl(dst_ip); + __be32 src_ipaddr = htonl(src_ip); + + rt = ip_route_output(&init_net, dst_ipaddr, src_ipaddr, 0, 0); + if (IS_ERR(rt)) { + i40iw_pr_err("ip_route_output\n"); + return rc; + } + + if (netif_is_bond_slave(netdev)) 
+ netdev = netdev_master_upper_dev_get(netdev); + + neigh = dst_neigh_lookup(&rt->dst, &dst_ipaddr); + + rcu_read_lock(); + if (neigh) { + if (neigh->nud_state & NUD_VALID) { + if (arpindex >= 0) { + if (ether_addr_equal(iwdev->arp_table[arpindex].mac_addr, + neigh->ha)) + /* Mac address same as arp table */ + goto resolve_neigh_exit; + i40iw_manage_arp_cache(iwdev, + iwdev->arp_table[arpindex].mac_addr, + &dst_ip, + true, + I40IW_ARP_DELETE); + } + + i40iw_manage_arp_cache(iwdev, neigh->ha, &dst_ip, true, I40IW_ARP_ADD); + rc = i40iw_arp_table(iwdev, &dst_ip, true, NULL, I40IW_ARP_RESOLVE); + } else { + neigh_event_send(neigh, NULL); + } + } + resolve_neigh_exit: + + rcu_read_unlock(); + if (neigh) + neigh_release(neigh); + + ip_rt_put(rt); + return rc; +} + +/** + * i40iw_get_dst_ipv6 + */ +static struct dst_entry *i40iw_get_dst_ipv6(struct sockaddr_in6 *src_addr, + struct sockaddr_in6 *dst_addr) +{ + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = dst_addr->sin6_addr; + fl6.saddr = src_addr->sin6_addr; + if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = dst_addr->sin6_scope_id; + + dst = ip6_route_output(&init_net, NULL, &fl6); + return dst; +} + +/** + * i40iw_addr_resolve_neigh_ipv6 - resolve neighbor ipv6 address + * @iwdev: iwarp device structure + * @dst_ip: remote ip address + * @arpindex: if there is an arp entry + */ +static int i40iw_addr_resolve_neigh_ipv6(struct i40iw_device *iwdev, + u32 *src, + u32 *dest, + int arpindex) +{ + struct neighbour *neigh; + int rc = arpindex; + struct net_device *netdev = iwdev->netdev; + struct dst_entry *dst; + struct sockaddr_in6 dst_addr; + struct sockaddr_in6 src_addr; + + memset(&dst_addr, 0, sizeof(dst_addr)); + dst_addr.sin6_family = AF_INET6; + i40iw_copy_ip_htonl(dst_addr.sin6_addr.in6_u.u6_addr32, dest); + memset(&src_addr, 0, sizeof(src_addr)); + src_addr.sin6_family = AF_INET6; + i40iw_copy_ip_htonl(src_addr.sin6_addr.in6_u.u6_addr32, src); + dst = i40iw_get_dst_ipv6(&src_addr, &dst_addr); + if (!dst || dst->error) { + if (dst) { + dst_release(dst); + i40iw_pr_err("ip6_route_output returned dst->error = %d\n", + dst->error); + } + return rc; + } + + if (netif_is_bond_slave(netdev)) + netdev = netdev_master_upper_dev_get(netdev); + + neigh = dst_neigh_lookup(dst, dst_addr.sin6_addr.in6_u.u6_addr32); + + rcu_read_lock(); + if (neigh) { + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, "dst_neigh_lookup MAC=%pM\n", neigh->ha); + if (neigh->nud_state & NUD_VALID) { + if (arpindex >= 0) { + if (ether_addr_equal + (iwdev->arp_table[arpindex].mac_addr, + neigh->ha)) { + /* Mac address same as in arp table */ + goto resolve_neigh_exit6; + } + i40iw_manage_arp_cache(iwdev, + iwdev->arp_table[arpindex].mac_addr, + dest, + false, + I40IW_ARP_DELETE); + } + i40iw_manage_arp_cache(iwdev, + neigh->ha, + dest, + false, + I40IW_ARP_ADD); + rc = i40iw_arp_table(iwdev, + dest, + false, + NULL, + I40IW_ARP_RESOLVE); + } else { + neigh_event_send(neigh, NULL); + } + } + + resolve_neigh_exit6: + rcu_read_unlock(); + if (neigh) + neigh_release(neigh); + dst_release(dst); + return rc; +} + +/** + * i40iw_ipv4_is_loopback - check if loopback + * @loc_addr: local addr to compare + * @rem_addr: remote address + */ +static bool i40iw_ipv4_is_loopback(u32 loc_addr, u32 rem_addr) +{ + return ipv4_is_loopback(htonl(rem_addr)) || (loc_addr == rem_addr); +} + +/** + * i40iw_ipv6_is_loopback - check if loopback + * @loc_addr: local addr to compare + * @rem_addr: remote address + */ +static bool 
i40iw_ipv6_is_loopback(u32 *loc_addr, u32 *rem_addr) +{ + struct in6_addr raddr6; + + i40iw_copy_ip_htonl(raddr6.in6_u.u6_addr32, rem_addr); + return !memcmp(loc_addr, rem_addr, 16) || ipv6_addr_loopback(&raddr6); +} + +/** + * i40iw_make_cm_node - create a new instance of a cm node + * @cm_core: cm's core + * @iwdev: iwarp device structure + * @cm_info: quad info for connection + * @listener: passive connection's listener + */ +static struct i40iw_cm_node *i40iw_make_cm_node( + struct i40iw_cm_core *cm_core, + struct i40iw_device *iwdev, + struct i40iw_cm_info *cm_info, + struct i40iw_cm_listener *listener) +{ + struct i40iw_cm_node *cm_node; + struct timespec ts; + int oldarpindex; + int arpindex; + struct net_device *netdev = iwdev->netdev; + + /* create an hte and cm_node for this instance */ + cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC); + if (!cm_node) + return NULL; + + /* set our node specific transport info */ + cm_node->ipv4 = cm_info->ipv4; + cm_node->vlan_id = cm_info->vlan_id; + if ((cm_node->vlan_id == I40IW_NO_VLAN) && iwdev->dcb) + cm_node->vlan_id = 0; + cm_node->tos = cm_info->tos; + cm_node->user_pri = cm_info->user_pri; + if (listener) { + if (listener->tos != cm_info->tos) + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_DCB, + "application TOS[%d] and remote client TOS[%d] mismatch\n", + listener->tos, cm_info->tos); + cm_node->tos = max(listener->tos, cm_info->tos); + cm_node->user_pri = rt_tos2priority(cm_node->tos); + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_DCB, "listener: TOS:[%d] UP:[%d]\n", + cm_node->tos, cm_node->user_pri); + } + memcpy(cm_node->loc_addr, cm_info->loc_addr, sizeof(cm_node->loc_addr)); + memcpy(cm_node->rem_addr, cm_info->rem_addr, sizeof(cm_node->rem_addr)); + cm_node->loc_port = cm_info->loc_port; + cm_node->rem_port = cm_info->rem_port; + + cm_node->mpa_frame_rev = iwdev->mpa_version; + cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + cm_node->ird_size = I40IW_MAX_IRD_SIZE; + cm_node->ord_size = I40IW_MAX_ORD_SIZE; + + cm_node->listener = listener; + cm_node->cm_id = cm_info->cm_id; + ether_addr_copy(cm_node->loc_mac, netdev->dev_addr); + spin_lock_init(&cm_node->retrans_list_lock); + cm_node->ack_rcvd = false; + + atomic_set(&cm_node->ref_count, 1); + /* associate our parent CM core */ + cm_node->cm_core = cm_core; + cm_node->tcp_cntxt.loc_id = I40IW_CM_DEF_LOCAL_ID; + cm_node->tcp_cntxt.rcv_wscale = I40IW_CM_DEFAULT_RCV_WND_SCALE; + cm_node->tcp_cntxt.rcv_wnd = + I40IW_CM_DEFAULT_RCV_WND_SCALED >> I40IW_CM_DEFAULT_RCV_WND_SCALE; + ts = current_kernel_time(); + cm_node->tcp_cntxt.loc_seq_num = ts.tv_nsec; + cm_node->tcp_cntxt.mss = (cm_node->ipv4) ? 
(iwdev->vsi.mtu - I40IW_MTU_TO_MSS_IPV4) : + (iwdev->vsi.mtu - I40IW_MTU_TO_MSS_IPV6); + + cm_node->iwdev = iwdev; + cm_node->dev = &iwdev->sc_dev; + + if ((cm_node->ipv4 && + i40iw_ipv4_is_loopback(cm_node->loc_addr[0], cm_node->rem_addr[0])) || + (!cm_node->ipv4 && i40iw_ipv6_is_loopback(cm_node->loc_addr, + cm_node->rem_addr))) { + arpindex = i40iw_arp_table(iwdev, + cm_node->rem_addr, + false, + NULL, + I40IW_ARP_RESOLVE); + } else { + oldarpindex = i40iw_arp_table(iwdev, + cm_node->rem_addr, + false, + NULL, + I40IW_ARP_RESOLVE); + if (cm_node->ipv4) + arpindex = i40iw_addr_resolve_neigh(iwdev, + cm_info->loc_addr[0], + cm_info->rem_addr[0], + oldarpindex); + else if (IS_ENABLED(CONFIG_IPV6)) + arpindex = i40iw_addr_resolve_neigh_ipv6(iwdev, + cm_info->loc_addr, + cm_info->rem_addr, + oldarpindex); + else + arpindex = -EINVAL; + } + if (arpindex < 0) { + i40iw_pr_err("cm_node arpindex\n"); + kfree(cm_node); + return NULL; + } + ether_addr_copy(cm_node->rem_mac, iwdev->arp_table[arpindex].mac_addr); + i40iw_add_hte_node(cm_core, cm_node); + cm_core->stats_nodes_created++; + return cm_node; +} + +/** + * i40iw_rem_ref_cm_node - destroy an instance of a cm node + * @cm_node: connection's node + */ +static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node) +{ + struct i40iw_cm_core *cm_core = cm_node->cm_core; + struct i40iw_qp *iwqp; + struct i40iw_cm_info nfo; + unsigned long flags; + + spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags); + if (atomic_dec_return(&cm_node->ref_count)) { + spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); + return; + } + list_del(&cm_node->list); + spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); + + /* if the node is destroyed before connection was accelerated */ + if (!cm_node->accelerated && cm_node->accept_pend) { + pr_err("node destroyed before established\n"); + atomic_dec(&cm_node->listener->pend_accepts_cnt); + } + if (cm_node->close_entry) + i40iw_handle_close_entry(cm_node, 0); + if (cm_node->listener) { + i40iw_dec_refcnt_listen(cm_core, cm_node->listener, 0, true); + } else { + if (!i40iw_port_in_use(cm_core, cm_node->loc_port, true) && cm_node->apbvt_set) { + i40iw_manage_apbvt(cm_node->iwdev, + cm_node->loc_port, + I40IW_MANAGE_APBVT_DEL); + cm_node->apbvt_set = 0; + } + i40iw_get_addr_info(cm_node, &nfo); + if (cm_node->qhash_set) { + i40iw_manage_qhash(cm_node->iwdev, + &nfo, + I40IW_QHASH_TYPE_TCP_ESTABLISHED, + I40IW_QHASH_MANAGE_TYPE_DELETE, + NULL, + false); + cm_node->qhash_set = 0; + } + } + + iwqp = cm_node->iwqp; + if (iwqp) { + iwqp->cm_node = NULL; + i40iw_rem_ref(&iwqp->ibqp); + cm_node->iwqp = NULL; + } else if (cm_node->qhash_set) { + i40iw_get_addr_info(cm_node, &nfo); + i40iw_manage_qhash(cm_node->iwdev, + &nfo, + I40IW_QHASH_TYPE_TCP_ESTABLISHED, + I40IW_QHASH_MANAGE_TYPE_DELETE, + NULL, + false); + cm_node->qhash_set = 0; + } + + cm_node->cm_core->stats_nodes_destroyed++; + kfree(cm_node); +} + +/** + * i40iw_handle_fin_pkt - FIN packet received + * @cm_node: connection's node + */ +static void i40iw_handle_fin_pkt(struct i40iw_cm_node *cm_node) +{ + u32 ret; + + switch (cm_node->state) { + case I40IW_CM_STATE_SYN_RCVD: + case I40IW_CM_STATE_SYN_SENT: + case I40IW_CM_STATE_ESTABLISHED: + case I40IW_CM_STATE_MPAREJ_RCVD: + cm_node->tcp_cntxt.rcv_nxt++; + i40iw_cleanup_retrans_entry(cm_node); + cm_node->state = I40IW_CM_STATE_LAST_ACK; + i40iw_send_fin(cm_node); + break; + case I40IW_CM_STATE_MPAREQ_SENT: + i40iw_create_event(cm_node, I40IW_CM_EVENT_ABORTED); + cm_node->tcp_cntxt.rcv_nxt++; + 
i40iw_cleanup_retrans_entry(cm_node); + cm_node->state = I40IW_CM_STATE_CLOSED; + atomic_inc(&cm_node->ref_count); + i40iw_send_reset(cm_node); + break; + case I40IW_CM_STATE_FIN_WAIT1: + cm_node->tcp_cntxt.rcv_nxt++; + i40iw_cleanup_retrans_entry(cm_node); + cm_node->state = I40IW_CM_STATE_CLOSING; + i40iw_send_ack(cm_node); + /* + * Wait for ACK as this is simultaneous close. + * After we receive ACK, do not send anything. + * Just rm the node. + */ + break; + case I40IW_CM_STATE_FIN_WAIT2: + cm_node->tcp_cntxt.rcv_nxt++; + i40iw_cleanup_retrans_entry(cm_node); + cm_node->state = I40IW_CM_STATE_TIME_WAIT; + i40iw_send_ack(cm_node); + ret = + i40iw_schedule_cm_timer(cm_node, NULL, I40IW_TIMER_TYPE_CLOSE, 1, 0); + if (ret) + i40iw_pr_err("node %p state = %d\n", cm_node, cm_node->state); + break; + case I40IW_CM_STATE_TIME_WAIT: + cm_node->tcp_cntxt.rcv_nxt++; + i40iw_cleanup_retrans_entry(cm_node); + cm_node->state = I40IW_CM_STATE_CLOSED; + i40iw_rem_ref_cm_node(cm_node); + break; + case I40IW_CM_STATE_OFFLOADED: + default: + i40iw_pr_err("bad state node %p state = %d\n", cm_node, cm_node->state); + break; + } +} + +/** + * i40iw_handle_rst_pkt - process received RST packet + * @cm_node: connection's node + * @rbuf: receive buffer + */ +static void i40iw_handle_rst_pkt(struct i40iw_cm_node *cm_node, + struct i40iw_puda_buf *rbuf) +{ + i40iw_cleanup_retrans_entry(cm_node); + switch (cm_node->state) { + case I40IW_CM_STATE_SYN_SENT: + case I40IW_CM_STATE_MPAREQ_SENT: + switch (cm_node->mpa_frame_rev) { + case IETF_MPA_V2: + cm_node->mpa_frame_rev = IETF_MPA_V1; + /* send a syn and goto syn sent state */ + cm_node->state = I40IW_CM_STATE_SYN_SENT; + if (i40iw_send_syn(cm_node, 0)) + i40iw_active_open_err(cm_node, false); + break; + case IETF_MPA_V1: + default: + i40iw_active_open_err(cm_node, false); + break; + } + break; + case I40IW_CM_STATE_MPAREQ_RCVD: + atomic_add_return(1, &cm_node->passive_state); + break; + case I40IW_CM_STATE_ESTABLISHED: + case I40IW_CM_STATE_SYN_RCVD: + case I40IW_CM_STATE_LISTENING: + i40iw_pr_err("Bad state state = %d\n", cm_node->state); + i40iw_passive_open_err(cm_node, false); + break; + case I40IW_CM_STATE_OFFLOADED: + i40iw_active_open_err(cm_node, false); + break; + case I40IW_CM_STATE_CLOSED: + break; + case I40IW_CM_STATE_FIN_WAIT2: + case I40IW_CM_STATE_FIN_WAIT1: + case I40IW_CM_STATE_LAST_ACK: + cm_node->cm_id->rem_ref(cm_node->cm_id); + /* fall through */ + case I40IW_CM_STATE_TIME_WAIT: + cm_node->state = I40IW_CM_STATE_CLOSED; + i40iw_rem_ref_cm_node(cm_node); + break; + default: + break; + } +} + +/** + * i40iw_handle_rcv_mpa - Process a recv'd mpa buffer + * @cm_node: connection's node + * @rbuf: receive buffer + */ +static void i40iw_handle_rcv_mpa(struct i40iw_cm_node *cm_node, + struct i40iw_puda_buf *rbuf) +{ + int ret; + int datasize = rbuf->datalen; + u8 *dataloc = rbuf->data; + + enum i40iw_cm_event_type type = I40IW_CM_EVENT_UNKNOWN; + u32 res_type; + + ret = i40iw_parse_mpa(cm_node, dataloc, &res_type, datasize); + if (ret) { + if (cm_node->state == I40IW_CM_STATE_MPAREQ_SENT) + i40iw_active_open_err(cm_node, true); + else + i40iw_passive_open_err(cm_node, true); + return; + } + + switch (cm_node->state) { + case I40IW_CM_STATE_ESTABLISHED: + if (res_type == I40IW_MPA_REQUEST_REJECT) + i40iw_pr_err("state for reject\n"); + cm_node->state = I40IW_CM_STATE_MPAREQ_RCVD; + type = I40IW_CM_EVENT_MPA_REQ; + i40iw_send_ack(cm_node); /* ACK received MPA request */ + atomic_set(&cm_node->passive_state, + I40IW_PASSIVE_STATE_INDICATED); + break; + 
case I40IW_CM_STATE_MPAREQ_SENT: + i40iw_cleanup_retrans_entry(cm_node); + if (res_type == I40IW_MPA_REQUEST_REJECT) { + type = I40IW_CM_EVENT_MPA_REJECT; + cm_node->state = I40IW_CM_STATE_MPAREJ_RCVD; + } else { + type = I40IW_CM_EVENT_CONNECTED; + cm_node->state = I40IW_CM_STATE_OFFLOADED; + } + i40iw_send_ack(cm_node); + break; + default: + pr_err("%s wrong cm_node state =%d\n", __func__, cm_node->state); + break; + } + i40iw_create_event(cm_node, type); +} + +/** + * i40iw_indicate_pkt_err - Send up err event to cm + * @cm_node: connection's node + */ +static void i40iw_indicate_pkt_err(struct i40iw_cm_node *cm_node) +{ + switch (cm_node->state) { + case I40IW_CM_STATE_SYN_SENT: + case I40IW_CM_STATE_MPAREQ_SENT: + i40iw_active_open_err(cm_node, true); + break; + case I40IW_CM_STATE_ESTABLISHED: + case I40IW_CM_STATE_SYN_RCVD: + i40iw_passive_open_err(cm_node, true); + break; + case I40IW_CM_STATE_OFFLOADED: + default: + break; + } +} + +/** + * i40iw_check_syn - Check for error on received syn ack + * @cm_node: connection's node + * @tcph: pointer tcp header + */ +static int i40iw_check_syn(struct i40iw_cm_node *cm_node, struct tcphdr *tcph) +{ + int err = 0; + + if (ntohl(tcph->ack_seq) != cm_node->tcp_cntxt.loc_seq_num) { + err = 1; + i40iw_active_open_err(cm_node, true); + } + return err; +} + +/** + * i40iw_check_seq - check seq numbers if OK + * @cm_node: connection's node + * @tcph: pointer tcp header + */ +static int i40iw_check_seq(struct i40iw_cm_node *cm_node, struct tcphdr *tcph) +{ + int err = 0; + u32 seq; + u32 ack_seq; + u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num; + u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt; + u32 rcv_wnd; + + seq = ntohl(tcph->seq); + ack_seq = ntohl(tcph->ack_seq); + rcv_wnd = cm_node->tcp_cntxt.rcv_wnd; + if (ack_seq != loc_seq_num) + err = -1; + else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd))) + err = -1; + if (err) { + i40iw_pr_err("seq number\n"); + i40iw_indicate_pkt_err(cm_node); + } + return err; +} + +/** + * i40iw_handle_syn_pkt - is for Passive node + * @cm_node: connection's node + * @rbuf: receive buffer + */ +static void i40iw_handle_syn_pkt(struct i40iw_cm_node *cm_node, + struct i40iw_puda_buf *rbuf) +{ + struct tcphdr *tcph = (struct tcphdr *)rbuf->tcph; + int ret; + u32 inc_sequence; + int optionsize; + struct i40iw_cm_info nfo; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + inc_sequence = ntohl(tcph->seq); + + switch (cm_node->state) { + case I40IW_CM_STATE_SYN_SENT: + case I40IW_CM_STATE_MPAREQ_SENT: + /* Rcvd syn on active open connection */ + i40iw_active_open_err(cm_node, 1); + break; + case I40IW_CM_STATE_LISTENING: + /* Passive OPEN */ + if (atomic_read(&cm_node->listener->pend_accepts_cnt) > + cm_node->listener->backlog) { + cm_node->cm_core->stats_backlog_drops++; + i40iw_passive_open_err(cm_node, false); + break; + } + ret = i40iw_handle_tcp_options(cm_node, tcph, optionsize, 1); + if (ret) { + i40iw_passive_open_err(cm_node, false); + /* drop pkt */ + break; + } + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1; + cm_node->accept_pend = 1; + atomic_inc(&cm_node->listener->pend_accepts_cnt); + + cm_node->state = I40IW_CM_STATE_SYN_RCVD; + i40iw_get_addr_info(cm_node, &nfo); + ret = i40iw_manage_qhash(cm_node->iwdev, + &nfo, + I40IW_QHASH_TYPE_TCP_ESTABLISHED, + I40IW_QHASH_MANAGE_TYPE_ADD, + (void *)cm_node, + false); + cm_node->qhash_set = true; + break; + case I40IW_CM_STATE_CLOSED: + i40iw_cleanup_retrans_entry(cm_node); + atomic_inc(&cm_node->ref_count); + i40iw_send_reset(cm_node); + break; + case 
I40IW_CM_STATE_OFFLOADED: + case I40IW_CM_STATE_ESTABLISHED: + case I40IW_CM_STATE_FIN_WAIT1: + case I40IW_CM_STATE_FIN_WAIT2: + case I40IW_CM_STATE_MPAREQ_RCVD: + case I40IW_CM_STATE_LAST_ACK: + case I40IW_CM_STATE_CLOSING: + case I40IW_CM_STATE_UNKNOWN: + default: + break; + } +} + +/** + * i40iw_handle_synack_pkt - Process SYN+ACK packet (active side) + * @cm_node: connection's node + * @rbuf: receive buffer + */ +static void i40iw_handle_synack_pkt(struct i40iw_cm_node *cm_node, + struct i40iw_puda_buf *rbuf) +{ + struct tcphdr *tcph = (struct tcphdr *)rbuf->tcph; + int ret; + u32 inc_sequence; + int optionsize; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + inc_sequence = ntohl(tcph->seq); + switch (cm_node->state) { + case I40IW_CM_STATE_SYN_SENT: + i40iw_cleanup_retrans_entry(cm_node); + /* active open */ + if (i40iw_check_syn(cm_node, tcph)) { + i40iw_pr_err("check syn fail\n"); + return; + } + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + /* setup options */ + ret = i40iw_handle_tcp_options(cm_node, tcph, optionsize, 0); + if (ret) { + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "cm_node=%p tcp_options failed\n", + cm_node); + break; + } + i40iw_cleanup_retrans_entry(cm_node); + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1; + i40iw_send_ack(cm_node); /* ACK for the syn_ack */ + ret = i40iw_send_mpa_request(cm_node); + if (ret) { + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "cm_node=%p i40iw_send_mpa_request failed\n", + cm_node); + break; + } + cm_node->state = I40IW_CM_STATE_MPAREQ_SENT; + break; + case I40IW_CM_STATE_MPAREQ_RCVD: + i40iw_passive_open_err(cm_node, true); + break; + case I40IW_CM_STATE_LISTENING: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + i40iw_cleanup_retrans_entry(cm_node); + cm_node->state = I40IW_CM_STATE_CLOSED; + i40iw_send_reset(cm_node); + break; + case I40IW_CM_STATE_CLOSED: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + i40iw_cleanup_retrans_entry(cm_node); + atomic_inc(&cm_node->ref_count); + i40iw_send_reset(cm_node); + break; + case I40IW_CM_STATE_ESTABLISHED: + case I40IW_CM_STATE_FIN_WAIT1: + case I40IW_CM_STATE_FIN_WAIT2: + case I40IW_CM_STATE_LAST_ACK: + case I40IW_CM_STATE_OFFLOADED: + case I40IW_CM_STATE_CLOSING: + case I40IW_CM_STATE_UNKNOWN: + case I40IW_CM_STATE_MPAREQ_SENT: + default: + break; + } +} + +/** + * i40iw_handle_ack_pkt - process packet with ACK + * @cm_node: connection's node + * @rbuf: receive buffer + */ +static int i40iw_handle_ack_pkt(struct i40iw_cm_node *cm_node, + struct i40iw_puda_buf *rbuf) +{ + struct tcphdr *tcph = (struct tcphdr *)rbuf->tcph; + u32 inc_sequence; + int ret = 0; + int optionsize; + u32 datasize = rbuf->datalen; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + + if (i40iw_check_seq(cm_node, tcph)) + return -EINVAL; + + inc_sequence = ntohl(tcph->seq); + switch (cm_node->state) { + case I40IW_CM_STATE_SYN_RCVD: + i40iw_cleanup_retrans_entry(cm_node); + ret = i40iw_handle_tcp_options(cm_node, tcph, optionsize, 1); + if (ret) + break; + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + cm_node->state = I40IW_CM_STATE_ESTABLISHED; + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + i40iw_handle_rcv_mpa(cm_node, rbuf); + } + break; + case I40IW_CM_STATE_ESTABLISHED: + i40iw_cleanup_retrans_entry(cm_node); + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + i40iw_handle_rcv_mpa(cm_node, rbuf); + } + break; + case I40IW_CM_STATE_MPAREQ_SENT: + cm_node->tcp_cntxt.rem_ack_num = 
ntohl(tcph->ack_seq); + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + cm_node->ack_rcvd = false; + i40iw_handle_rcv_mpa(cm_node, rbuf); + } else { + cm_node->ack_rcvd = true; + } + break; + case I40IW_CM_STATE_LISTENING: + i40iw_cleanup_retrans_entry(cm_node); + cm_node->state = I40IW_CM_STATE_CLOSED; + i40iw_send_reset(cm_node); + break; + case I40IW_CM_STATE_CLOSED: + i40iw_cleanup_retrans_entry(cm_node); + atomic_inc(&cm_node->ref_count); + i40iw_send_reset(cm_node); + break; + case I40IW_CM_STATE_LAST_ACK: + case I40IW_CM_STATE_CLOSING: + i40iw_cleanup_retrans_entry(cm_node); + cm_node->state = I40IW_CM_STATE_CLOSED; + if (!cm_node->accept_pend) + cm_node->cm_id->rem_ref(cm_node->cm_id); + i40iw_rem_ref_cm_node(cm_node); + break; + case I40IW_CM_STATE_FIN_WAIT1: + i40iw_cleanup_retrans_entry(cm_node); + cm_node->state = I40IW_CM_STATE_FIN_WAIT2; + break; + case I40IW_CM_STATE_SYN_SENT: + case I40IW_CM_STATE_FIN_WAIT2: + case I40IW_CM_STATE_OFFLOADED: + case I40IW_CM_STATE_MPAREQ_RCVD: + case I40IW_CM_STATE_UNKNOWN: + default: + i40iw_cleanup_retrans_entry(cm_node); + break; + } + return ret; +} + +/** + * i40iw_process_packet - process cm packet + * @cm_node: connection's node + * @rbuf: receive buffer + */ +static void i40iw_process_packet(struct i40iw_cm_node *cm_node, + struct i40iw_puda_buf *rbuf) +{ + enum i40iw_tcpip_pkt_type pkt_type = I40IW_PKT_TYPE_UNKNOWN; + struct tcphdr *tcph = (struct tcphdr *)rbuf->tcph; + u32 fin_set = 0; + int ret; + + if (tcph->rst) { + pkt_type = I40IW_PKT_TYPE_RST; + } else if (tcph->syn) { + pkt_type = I40IW_PKT_TYPE_SYN; + if (tcph->ack) + pkt_type = I40IW_PKT_TYPE_SYNACK; + } else if (tcph->ack) { + pkt_type = I40IW_PKT_TYPE_ACK; + } + if (tcph->fin) + fin_set = 1; + + switch (pkt_type) { + case I40IW_PKT_TYPE_SYN: + i40iw_handle_syn_pkt(cm_node, rbuf); + break; + case I40IW_PKT_TYPE_SYNACK: + i40iw_handle_synack_pkt(cm_node, rbuf); + break; + case I40IW_PKT_TYPE_ACK: + ret = i40iw_handle_ack_pkt(cm_node, rbuf); + if (fin_set && !ret) + i40iw_handle_fin_pkt(cm_node); + break; + case I40IW_PKT_TYPE_RST: + i40iw_handle_rst_pkt(cm_node, rbuf); + break; + default: + if (fin_set && + (!i40iw_check_seq(cm_node, (struct tcphdr *)rbuf->tcph))) + i40iw_handle_fin_pkt(cm_node); + break; + } +} + +/** + * i40iw_make_listen_node - create a listen node with params + * @cm_core: cm's core + * @iwdev: iwarp device structure + * @cm_info: quad info for connection + */ +static struct i40iw_cm_listener *i40iw_make_listen_node( + struct i40iw_cm_core *cm_core, + struct i40iw_device *iwdev, + struct i40iw_cm_info *cm_info) +{ + struct i40iw_cm_listener *listener; + unsigned long flags; + + /* cannot have multiple matching listeners */ + listener = i40iw_find_listener(cm_core, cm_info->loc_addr, + cm_info->loc_port, + cm_info->vlan_id, + I40IW_CM_LISTENER_EITHER_STATE); + if (listener && + (listener->listener_state == I40IW_CM_LISTENER_ACTIVE_STATE)) { + atomic_dec(&listener->ref_count); + i40iw_debug(cm_core->dev, + I40IW_DEBUG_CM, + "Not creating listener since it already exists\n"); + return NULL; + } + + if (!listener) { + /* create a CM listen node (1/2 node to compare incoming traffic to) */ + listener = kzalloc(sizeof(*listener), GFP_ATOMIC); + if (!listener) + return NULL; + cm_core->stats_listen_nodes_created++; + memcpy(listener->loc_addr, cm_info->loc_addr, sizeof(listener->loc_addr)); + listener->loc_port = cm_info->loc_port; + + INIT_LIST_HEAD(&listener->child_listen_list); + + atomic_set(&listener->ref_count, 1); + } else { + 
listener->reused_node = 1; + } + + listener->cm_id = cm_info->cm_id; + listener->ipv4 = cm_info->ipv4; + listener->vlan_id = cm_info->vlan_id; + atomic_set(&listener->pend_accepts_cnt, 0); + listener->cm_core = cm_core; + listener->iwdev = iwdev; + + listener->backlog = cm_info->backlog; + listener->listener_state = I40IW_CM_LISTENER_ACTIVE_STATE; + + if (!listener->reused_node) { + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_add(&listener->list, &cm_core->listen_nodes); + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + } + + return listener; +} + +/** + * i40iw_create_cm_node - make a connection node with params + * @cm_core: cm's core + * @iwdev: iwarp device structure + * @conn_param: upper layer connection parameters + * @cm_info: quad info for connection + */ +static struct i40iw_cm_node *i40iw_create_cm_node( + struct i40iw_cm_core *cm_core, + struct i40iw_device *iwdev, + struct iw_cm_conn_param *conn_param, + struct i40iw_cm_info *cm_info) +{ + struct i40iw_cm_node *cm_node; + struct i40iw_cm_listener *loopback_remotelistener; + struct i40iw_cm_node *loopback_remotenode; + struct i40iw_cm_info loopback_cm_info; + + u16 private_data_len = conn_param->private_data_len; + const void *private_data = conn_param->private_data; + + /* create a CM connection node */ + cm_node = i40iw_make_cm_node(cm_core, iwdev, cm_info, NULL); + if (!cm_node) + return ERR_PTR(-ENOMEM); + /* set our node side to client (active) side */ + cm_node->tcp_cntxt.client = 1; + cm_node->tcp_cntxt.rcv_wscale = I40IW_CM_DEFAULT_RCV_WND_SCALE; + + i40iw_record_ird_ord(cm_node, conn_param->ird, conn_param->ord); + + if (!memcmp(cm_info->loc_addr, cm_info->rem_addr, sizeof(cm_info->loc_addr))) { + loopback_remotelistener = i40iw_find_listener( + cm_core, + cm_info->rem_addr, + cm_node->rem_port, + cm_node->vlan_id, + I40IW_CM_LISTENER_ACTIVE_STATE); + if (!loopback_remotelistener) { + i40iw_rem_ref_cm_node(cm_node); + return ERR_PTR(-ECONNREFUSED); + } else { + loopback_cm_info = *cm_info; + loopback_cm_info.loc_port = cm_info->rem_port; + loopback_cm_info.rem_port = cm_info->loc_port; + loopback_cm_info.cm_id = loopback_remotelistener->cm_id; + loopback_cm_info.ipv4 = cm_info->ipv4; + loopback_remotenode = i40iw_make_cm_node(cm_core, + iwdev, + &loopback_cm_info, + loopback_remotelistener); + if (!loopback_remotenode) { + i40iw_rem_ref_cm_node(cm_node); + return ERR_PTR(-ENOMEM); + } + cm_core->stats_loopbacks++; + loopback_remotenode->loopbackpartner = cm_node; + loopback_remotenode->tcp_cntxt.rcv_wscale = + I40IW_CM_DEFAULT_RCV_WND_SCALE; + cm_node->loopbackpartner = loopback_remotenode; + memcpy(loopback_remotenode->pdata_buf, private_data, + private_data_len); + loopback_remotenode->pdata.size = private_data_len; + + if (loopback_remotenode->ord_size > cm_node->ird_size) + loopback_remotenode->ord_size = + cm_node->ird_size; + + cm_node->state = I40IW_CM_STATE_OFFLOADED; + cm_node->tcp_cntxt.rcv_nxt = + loopback_remotenode->tcp_cntxt.loc_seq_num; + loopback_remotenode->tcp_cntxt.rcv_nxt = + cm_node->tcp_cntxt.loc_seq_num; + cm_node->tcp_cntxt.max_snd_wnd = + loopback_remotenode->tcp_cntxt.rcv_wnd; + loopback_remotenode->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.rcv_wnd; + cm_node->tcp_cntxt.snd_wnd = loopback_remotenode->tcp_cntxt.rcv_wnd; + loopback_remotenode->tcp_cntxt.snd_wnd = cm_node->tcp_cntxt.rcv_wnd; + cm_node->tcp_cntxt.snd_wscale = loopback_remotenode->tcp_cntxt.rcv_wscale; + loopback_remotenode->tcp_cntxt.snd_wscale = cm_node->tcp_cntxt.rcv_wscale; + } + return cm_node; + 
} + + cm_node->pdata.size = private_data_len; + cm_node->pdata.addr = cm_node->pdata_buf; + + memcpy(cm_node->pdata_buf, private_data, private_data_len); + + cm_node->state = I40IW_CM_STATE_SYN_SENT; + return cm_node; +} + +/** + * i40iw_cm_reject - reject and teardown a connection + * @cm_node: connection's node + * @pdate: ptr to private data for reject + * @plen: size of private data + */ +static int i40iw_cm_reject(struct i40iw_cm_node *cm_node, const void *pdata, u8 plen) +{ + int ret = 0; + int err; + int passive_state; + struct iw_cm_id *cm_id = cm_node->cm_id; + struct i40iw_cm_node *loopback = cm_node->loopbackpartner; + + if (cm_node->tcp_cntxt.client) + return ret; + i40iw_cleanup_retrans_entry(cm_node); + + if (!loopback) { + passive_state = atomic_add_return(1, &cm_node->passive_state); + if (passive_state == I40IW_SEND_RESET_EVENT) { + cm_node->state = I40IW_CM_STATE_CLOSED; + i40iw_rem_ref_cm_node(cm_node); + } else { + if (cm_node->state == I40IW_CM_STATE_LISTENER_DESTROYED) { + i40iw_rem_ref_cm_node(cm_node); + } else { + ret = i40iw_send_mpa_reject(cm_node, pdata, plen); + if (ret) { + cm_node->state = I40IW_CM_STATE_CLOSED; + err = i40iw_send_reset(cm_node); + if (err) + i40iw_pr_err("send reset failed\n"); + } else { + cm_id->add_ref(cm_id); + } + } + } + } else { + cm_node->cm_id = NULL; + if (cm_node->state == I40IW_CM_STATE_LISTENER_DESTROYED) { + i40iw_rem_ref_cm_node(cm_node); + i40iw_rem_ref_cm_node(loopback); + } else { + ret = i40iw_send_cm_event(loopback, + loopback->cm_id, + IW_CM_EVENT_CONNECT_REPLY, + -ECONNREFUSED); + i40iw_rem_ref_cm_node(cm_node); + loopback->state = I40IW_CM_STATE_CLOSING; + + cm_id = loopback->cm_id; + i40iw_rem_ref_cm_node(loopback); + cm_id->rem_ref(cm_id); + } + } + + return ret; +} + +/** + * i40iw_cm_close - close of cm connection + * @cm_node: connection's node + */ +static int i40iw_cm_close(struct i40iw_cm_node *cm_node) +{ + int ret = 0; + + if (!cm_node) + return -EINVAL; + + switch (cm_node->state) { + case I40IW_CM_STATE_SYN_RCVD: + case I40IW_CM_STATE_SYN_SENT: + case I40IW_CM_STATE_ONE_SIDE_ESTABLISHED: + case I40IW_CM_STATE_ESTABLISHED: + case I40IW_CM_STATE_ACCEPTING: + case I40IW_CM_STATE_MPAREQ_SENT: + case I40IW_CM_STATE_MPAREQ_RCVD: + i40iw_cleanup_retrans_entry(cm_node); + i40iw_send_reset(cm_node); + break; + case I40IW_CM_STATE_CLOSE_WAIT: + cm_node->state = I40IW_CM_STATE_LAST_ACK; + i40iw_send_fin(cm_node); + break; + case I40IW_CM_STATE_FIN_WAIT1: + case I40IW_CM_STATE_FIN_WAIT2: + case I40IW_CM_STATE_LAST_ACK: + case I40IW_CM_STATE_TIME_WAIT: + case I40IW_CM_STATE_CLOSING: + ret = -1; + break; + case I40IW_CM_STATE_LISTENING: + i40iw_cleanup_retrans_entry(cm_node); + i40iw_send_reset(cm_node); + break; + case I40IW_CM_STATE_MPAREJ_RCVD: + case I40IW_CM_STATE_UNKNOWN: + case I40IW_CM_STATE_INITED: + case I40IW_CM_STATE_CLOSED: + case I40IW_CM_STATE_LISTENER_DESTROYED: + i40iw_rem_ref_cm_node(cm_node); + break; + case I40IW_CM_STATE_OFFLOADED: + if (cm_node->send_entry) + i40iw_pr_err("send_entry\n"); + i40iw_rem_ref_cm_node(cm_node); + break; + } + return ret; +} + +/** + * i40iw_receive_ilq - recv an ETHERNET packet, and process it + * through CM + * @vsi: pointer to the vsi structure + * @rbuf: receive buffer + */ +void i40iw_receive_ilq(struct i40iw_sc_vsi *vsi, struct i40iw_puda_buf *rbuf) +{ + struct i40iw_cm_node *cm_node; + struct i40iw_cm_listener *listener; + struct iphdr *iph; + struct ipv6hdr *ip6h; + struct tcphdr *tcph; + struct i40iw_cm_info cm_info; + struct i40iw_sc_dev *dev = vsi->dev; + struct 
i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + struct i40iw_cm_core *cm_core = &iwdev->cm_core; + struct vlan_ethhdr *ethh; + u16 vtag; + + /* if vlan, then maclen = 18 else 14 */ + iph = (struct iphdr *)rbuf->iph; + memset(&cm_info, 0, sizeof(cm_info)); + + i40iw_debug_buf(dev, + I40IW_DEBUG_ILQ, + "RECEIVE ILQ BUFFER", + rbuf->mem.va, + rbuf->totallen); + ethh = (struct vlan_ethhdr *)rbuf->mem.va; + + if (ethh->h_vlan_proto == htons(ETH_P_8021Q)) { + vtag = ntohs(ethh->h_vlan_TCI); + cm_info.user_pri = (vtag & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + cm_info.vlan_id = vtag & VLAN_VID_MASK; + i40iw_debug(cm_core->dev, + I40IW_DEBUG_CM, + "%s vlan_id=%d\n", + __func__, + cm_info.vlan_id); + } else { + cm_info.vlan_id = I40IW_NO_VLAN; + } + tcph = (struct tcphdr *)rbuf->tcph; + + if (rbuf->ipv4) { + cm_info.loc_addr[0] = ntohl(iph->daddr); + cm_info.rem_addr[0] = ntohl(iph->saddr); + cm_info.ipv4 = true; + cm_info.tos = iph->tos; + } else { + ip6h = (struct ipv6hdr *)rbuf->iph; + i40iw_copy_ip_ntohl(cm_info.loc_addr, + ip6h->daddr.in6_u.u6_addr32); + i40iw_copy_ip_ntohl(cm_info.rem_addr, + ip6h->saddr.in6_u.u6_addr32); + cm_info.ipv4 = false; + cm_info.tos = (ip6h->priority << 4) | (ip6h->flow_lbl[0] >> 4); + } + cm_info.loc_port = ntohs(tcph->dest); + cm_info.rem_port = ntohs(tcph->source); + cm_node = i40iw_find_node(cm_core, + cm_info.rem_port, + cm_info.rem_addr, + cm_info.loc_port, + cm_info.loc_addr, + true, + false); + + if (!cm_node) { + /* Only type of packet accepted are for */ + /* the PASSIVE open (syn only) */ + if (!tcph->syn || tcph->ack) + return; + listener = + i40iw_find_listener(cm_core, + cm_info.loc_addr, + cm_info.loc_port, + cm_info.vlan_id, + I40IW_CM_LISTENER_ACTIVE_STATE); + if (!listener) { + cm_info.cm_id = NULL; + i40iw_debug(cm_core->dev, + I40IW_DEBUG_CM, + "%s no listener found\n", + __func__); + return; + } + cm_info.cm_id = listener->cm_id; + cm_node = i40iw_make_cm_node(cm_core, iwdev, &cm_info, listener); + if (!cm_node) { + i40iw_debug(cm_core->dev, + I40IW_DEBUG_CM, + "%s allocate node failed\n", + __func__); + atomic_dec(&listener->ref_count); + return; + } + if (!tcph->rst && !tcph->fin) { + cm_node->state = I40IW_CM_STATE_LISTENING; + } else { + i40iw_rem_ref_cm_node(cm_node); + return; + } + atomic_inc(&cm_node->ref_count); + } else if (cm_node->state == I40IW_CM_STATE_OFFLOADED) { + i40iw_rem_ref_cm_node(cm_node); + return; + } + i40iw_process_packet(cm_node, rbuf); + i40iw_rem_ref_cm_node(cm_node); +} + +/** + * i40iw_setup_cm_core - allocate a top level instance of a cm + * core + * @iwdev: iwarp device structure + */ +void i40iw_setup_cm_core(struct i40iw_device *iwdev) +{ + struct i40iw_cm_core *cm_core = &iwdev->cm_core; + + cm_core->iwdev = iwdev; + cm_core->dev = &iwdev->sc_dev; + + INIT_LIST_HEAD(&cm_core->accelerated_list); + INIT_LIST_HEAD(&cm_core->non_accelerated_list); + INIT_LIST_HEAD(&cm_core->listen_nodes); + + timer_setup(&cm_core->tcp_timer, i40iw_cm_timer_tick, 0); + + spin_lock_init(&cm_core->ht_lock); + spin_lock_init(&cm_core->listen_list_lock); + + cm_core->event_wq = alloc_ordered_workqueue("iwewq", + WQ_MEM_RECLAIM); + + cm_core->disconn_wq = alloc_ordered_workqueue("iwdwq", + WQ_MEM_RECLAIM); +} + +/** + * i40iw_cleanup_cm_core - deallocate a top level instance of a + * cm core + * @cm_core: cm's core + */ +void i40iw_cleanup_cm_core(struct i40iw_cm_core *cm_core) +{ + unsigned long flags; + + if (!cm_core) + return; + + spin_lock_irqsave(&cm_core->ht_lock, flags); + if (timer_pending(&cm_core->tcp_timer)) + 
del_timer_sync(&cm_core->tcp_timer); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + destroy_workqueue(cm_core->event_wq); + destroy_workqueue(cm_core->disconn_wq); +} + +/** + * i40iw_init_tcp_ctx - setup qp context + * @cm_node: connection's node + * @tcp_info: offload info for tcp + * @iwqp: associate qp for the connection + */ +static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node, + struct i40iw_tcp_offload_info *tcp_info, + struct i40iw_qp *iwqp) +{ + tcp_info->ipv4 = cm_node->ipv4; + tcp_info->drop_ooo_seg = true; + tcp_info->wscale = true; + tcp_info->ignore_tcp_opt = true; + tcp_info->ignore_tcp_uns_opt = true; + tcp_info->no_nagle = false; + + tcp_info->ttl = I40IW_DEFAULT_TTL; + tcp_info->rtt_var = cpu_to_le32(I40IW_DEFAULT_RTT_VAR); + tcp_info->ss_thresh = cpu_to_le32(I40IW_DEFAULT_SS_THRESH); + tcp_info->rexmit_thresh = I40IW_DEFAULT_REXMIT_THRESH; + + tcp_info->tcp_state = I40IW_TCP_STATE_ESTABLISHED; + tcp_info->snd_wscale = cm_node->tcp_cntxt.snd_wscale; + tcp_info->rcv_wscale = cm_node->tcp_cntxt.rcv_wscale; + + tcp_info->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + tcp_info->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd); + tcp_info->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); + tcp_info->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + + tcp_info->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + tcp_info->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss); + tcp_info->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); + tcp_info->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + tcp_info->max_snd_window = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd); + tcp_info->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd << + cm_node->tcp_cntxt.rcv_wscale); + + tcp_info->flow_label = 0; + tcp_info->snd_mss = cpu_to_le32(((u32)cm_node->tcp_cntxt.mss)); + if (cm_node->vlan_id < VLAN_TAG_PRESENT) { + tcp_info->insert_vlan_tag = true; + tcp_info->vlan_tag = cpu_to_le16(((u16)cm_node->user_pri << I40IW_VLAN_PRIO_SHIFT) | + cm_node->vlan_id); + } + if (cm_node->ipv4) { + tcp_info->src_port = cpu_to_le16(cm_node->loc_port); + tcp_info->dst_port = cpu_to_le16(cm_node->rem_port); + + tcp_info->dest_ip_addr3 = cpu_to_le32(cm_node->rem_addr[0]); + tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[0]); + tcp_info->arp_idx = + cpu_to_le16((u16)i40iw_arp_table( + iwqp->iwdev, + &tcp_info->dest_ip_addr3, + true, + NULL, + I40IW_ARP_RESOLVE)); + } else { + tcp_info->src_port = cpu_to_le16(cm_node->loc_port); + tcp_info->dst_port = cpu_to_le16(cm_node->rem_port); + tcp_info->dest_ip_addr0 = cpu_to_le32(cm_node->rem_addr[0]); + tcp_info->dest_ip_addr1 = cpu_to_le32(cm_node->rem_addr[1]); + tcp_info->dest_ip_addr2 = cpu_to_le32(cm_node->rem_addr[2]); + tcp_info->dest_ip_addr3 = cpu_to_le32(cm_node->rem_addr[3]); + tcp_info->local_ipaddr0 = cpu_to_le32(cm_node->loc_addr[0]); + tcp_info->local_ipaddr1 = cpu_to_le32(cm_node->loc_addr[1]); + tcp_info->local_ipaddr2 = cpu_to_le32(cm_node->loc_addr[2]); + tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[3]); + tcp_info->arp_idx = + cpu_to_le16((u16)i40iw_arp_table( + iwqp->iwdev, + &tcp_info->dest_ip_addr0, + false, + NULL, + I40IW_ARP_RESOLVE)); + } +} + +/** + * i40iw_cm_init_tsa_conn - setup qp for RTS + * @iwqp: associate qp for the connection + * @cm_node: connection's node + */ +static void i40iw_cm_init_tsa_conn(struct i40iw_qp *iwqp, + struct i40iw_cm_node *cm_node) +{ + struct i40iw_tcp_offload_info tcp_info; + struct i40iwarp_offload_info *iwarp_info; + struct 
i40iw_qp_host_ctx_info *ctx_info; + struct i40iw_device *iwdev = iwqp->iwdev; + struct i40iw_sc_dev *dev = &iwqp->iwdev->sc_dev; + + memset(&tcp_info, 0x00, sizeof(struct i40iw_tcp_offload_info)); + iwarp_info = &iwqp->iwarp_info; + ctx_info = &iwqp->ctx_info; + + ctx_info->tcp_info = &tcp_info; + ctx_info->send_cq_num = iwqp->iwscq->sc_cq.cq_uk.cq_id; + ctx_info->rcv_cq_num = iwqp->iwrcq->sc_cq.cq_uk.cq_id; + + iwarp_info->ord_size = cm_node->ord_size; + iwarp_info->ird_size = i40iw_derive_hw_ird_setting(cm_node->ird_size); + + if (iwarp_info->ord_size == 1) + iwarp_info->ord_size = 2; + + iwarp_info->rd_enable = true; + iwarp_info->rdmap_ver = 1; + iwarp_info->ddp_ver = 1; + + iwarp_info->pd_id = iwqp->iwpd->sc_pd.pd_id; + + ctx_info->tcp_info_valid = true; + ctx_info->iwarp_info_valid = true; + ctx_info->add_to_qoslist = true; + ctx_info->user_pri = cm_node->user_pri; + + i40iw_init_tcp_ctx(cm_node, &tcp_info, iwqp); + if (cm_node->snd_mark_en) { + iwarp_info->snd_mark_en = true; + iwarp_info->snd_mark_offset = (tcp_info.snd_nxt & + SNDMARKER_SEQNMASK) + cm_node->lsmm_size; + } + + cm_node->state = I40IW_CM_STATE_OFFLOADED; + tcp_info.tcp_state = I40IW_TCP_STATE_ESTABLISHED; + tcp_info.src_mac_addr_idx = iwdev->mac_ip_table_idx; + tcp_info.tos = cm_node->tos; + + dev->iw_priv_qp_ops->qp_setctx(&iwqp->sc_qp, (u64 *)(iwqp->host_ctx.va), ctx_info); + + /* once tcp_info is set, no need to do it again */ + ctx_info->tcp_info_valid = false; + ctx_info->iwarp_info_valid = false; + ctx_info->add_to_qoslist = false; +} + +/** + * i40iw_cm_disconn - when a connection is being closed + * @iwqp: associate qp for the connection + */ +void i40iw_cm_disconn(struct i40iw_qp *iwqp) +{ + struct disconn_work *work; + struct i40iw_device *iwdev = iwqp->iwdev; + struct i40iw_cm_core *cm_core = &iwdev->cm_core; + unsigned long flags; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return; /* Timer will clean up */ + + spin_lock_irqsave(&iwdev->qptable_lock, flags); + if (!iwdev->qp_table[iwqp->ibqp.qp_num]) { + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, + "%s qp_id %d is already freed\n", + __func__, iwqp->ibqp.qp_num); + kfree(work); + return; + } + i40iw_add_ref(&iwqp->ibqp); + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); + + work->iwqp = iwqp; + INIT_WORK(&work->work, i40iw_disconnect_worker); + queue_work(cm_core->disconn_wq, &work->work); + return; +} + +/** + * i40iw_qp_disconnect - free qp and close cm + * @iwqp: associate qp for the connection + */ +static void i40iw_qp_disconnect(struct i40iw_qp *iwqp) +{ + struct i40iw_device *iwdev; + struct i40iw_ib_device *iwibdev; + + iwdev = to_iwdev(iwqp->ibqp.device); + if (!iwdev) { + i40iw_pr_err("iwdev == NULL\n"); + return; + } + + iwibdev = iwdev->iwibdev; + + if (iwqp->active_conn) { + /* indicate this connection is NOT active */ + iwqp->active_conn = 0; + } else { + /* Need to free the Last Streaming Mode Message */ + if (iwqp->ietf_mem.va) { + if (iwqp->lsmm_mr) + iwibdev->ibdev.dereg_mr(iwqp->lsmm_mr); + i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwqp->ietf_mem); + } + } + + /* close the CM node down if it is still active */ + if (iwqp->cm_node) { + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, "%s Call close API\n", __func__); + i40iw_cm_close(iwqp->cm_node); + } +} + +/** + * i40iw_cm_disconn_true - called by worker thread to disconnect qp + * @iwqp: associate qp for the connection + */ +static void i40iw_cm_disconn_true(struct i40iw_qp *iwqp) +{ + struct iw_cm_id *cm_id; + 
struct i40iw_device *iwdev; + struct i40iw_sc_qp *qp = &iwqp->sc_qp; + u16 last_ae; + u8 original_hw_tcp_state; + u8 original_ibqp_state; + int disconn_status = 0; + int issue_disconn = 0; + int issue_close = 0; + int issue_flush = 0; + struct ib_event ibevent; + unsigned long flags; + int ret; + + if (!iwqp) { + i40iw_pr_err("iwqp == NULL\n"); + return; + } + + spin_lock_irqsave(&iwqp->lock, flags); + cm_id = iwqp->cm_id; + /* make sure we havent already closed this connection */ + if (!cm_id) { + spin_unlock_irqrestore(&iwqp->lock, flags); + return; + } + + iwdev = to_iwdev(iwqp->ibqp.device); + + original_hw_tcp_state = iwqp->hw_tcp_state; + original_ibqp_state = iwqp->ibqp_state; + last_ae = iwqp->last_aeq; + + if (qp->term_flags) { + issue_disconn = 1; + issue_close = 1; + iwqp->cm_id = NULL; + /*When term timer expires after cm_timer, don't want + *terminate-handler to issue cm_disconn which can re-free + *a QP even after its refcnt=0. + */ + i40iw_terminate_del_timer(qp); + if (!iwqp->flush_issued) { + iwqp->flush_issued = 1; + issue_flush = 1; + } + } else if ((original_hw_tcp_state == I40IW_TCP_STATE_CLOSE_WAIT) || + ((original_ibqp_state == IB_QPS_RTS) && + (last_ae == I40IW_AE_LLP_CONNECTION_RESET))) { + issue_disconn = 1; + if (last_ae == I40IW_AE_LLP_CONNECTION_RESET) + disconn_status = -ECONNRESET; + } + + if (((original_hw_tcp_state == I40IW_TCP_STATE_CLOSED) || + (original_hw_tcp_state == I40IW_TCP_STATE_TIME_WAIT) || + (last_ae == I40IW_AE_RDMAP_ROE_BAD_LLP_CLOSE) || + (last_ae == I40IW_AE_LLP_CONNECTION_RESET) || + iwdev->reset)) { + issue_close = 1; + iwqp->cm_id = NULL; + if (!iwqp->flush_issued) { + iwqp->flush_issued = 1; + issue_flush = 1; + } + } + + spin_unlock_irqrestore(&iwqp->lock, flags); + if (issue_flush && !iwqp->destroyed) { + /* Flush the queues */ + i40iw_flush_wqes(iwdev, iwqp); + + if (qp->term_flags && iwqp->ibqp.event_handler) { + ibevent.device = iwqp->ibqp.device; + ibevent.event = (qp->eventtype == TERM_EVENT_QP_FATAL) ? 
+ IB_EVENT_QP_FATAL : IB_EVENT_QP_ACCESS_ERR; + ibevent.element.qp = &iwqp->ibqp; + iwqp->ibqp.event_handler(&ibevent, iwqp->ibqp.qp_context); + } + } + + if (cm_id && cm_id->event_handler) { + if (issue_disconn) { + ret = i40iw_send_cm_event(NULL, + cm_id, + IW_CM_EVENT_DISCONNECT, + disconn_status); + + if (ret) + i40iw_debug(&iwdev->sc_dev, + I40IW_DEBUG_CM, + "disconnect event failed %s: - cm_id = %p\n", + __func__, cm_id); + } + if (issue_close) { + i40iw_qp_disconnect(iwqp); + cm_id->provider_data = iwqp; + ret = i40iw_send_cm_event(NULL, cm_id, IW_CM_EVENT_CLOSE, 0); + if (ret) + i40iw_debug(&iwdev->sc_dev, + I40IW_DEBUG_CM, + "close event failed %s: - cm_id = %p\n", + __func__, cm_id); + cm_id->rem_ref(cm_id); + } + } +} + +/** + * i40iw_disconnect_worker - worker for connection close + * @work: points or disconn structure + */ +static void i40iw_disconnect_worker(struct work_struct *work) +{ + struct disconn_work *dwork = container_of(work, struct disconn_work, work); + struct i40iw_qp *iwqp = dwork->iwqp; + + kfree(dwork); + i40iw_cm_disconn_true(iwqp); + i40iw_rem_ref(&iwqp->ibqp); +} + +/** + * i40iw_accept - registered call for connection to be accepted + * @cm_id: cm information for passive connection + * @conn_param: accpet parameters + */ +int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct ib_qp *ibqp; + struct i40iw_qp *iwqp; + struct i40iw_device *iwdev; + struct i40iw_sc_dev *dev; + struct i40iw_cm_core *cm_core; + struct i40iw_cm_node *cm_node; + struct ib_qp_attr attr; + int passive_state; + struct ib_mr *ibmr; + struct i40iw_pd *iwpd; + u16 buf_len = 0; + struct i40iw_kmem_info accept; + enum i40iw_status_code status; + u64 tagged_offset; + unsigned long flags; + + memset(&attr, 0, sizeof(attr)); + ibqp = i40iw_get_qp(cm_id->device, conn_param->qpn); + if (!ibqp) + return -EINVAL; + + iwqp = to_iwqp(ibqp); + iwdev = iwqp->iwdev; + dev = &iwdev->sc_dev; + cm_core = &iwdev->cm_core; + cm_node = (struct i40iw_cm_node *)cm_id->provider_data; + + if (((struct sockaddr_in *)&cm_id->local_addr)->sin_family == AF_INET) { + cm_node->ipv4 = true; + cm_node->vlan_id = i40iw_get_vlan_ipv4(cm_node->loc_addr); + } else { + cm_node->ipv4 = false; + i40iw_netdev_vlan_ipv6(cm_node->loc_addr, &cm_node->vlan_id); + } + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "Accept vlan_id=%d\n", + cm_node->vlan_id); + if (cm_node->state == I40IW_CM_STATE_LISTENER_DESTROYED) { + if (cm_node->loopbackpartner) + i40iw_rem_ref_cm_node(cm_node->loopbackpartner); + i40iw_rem_ref_cm_node(cm_node); + return -EINVAL; + } + + passive_state = atomic_add_return(1, &cm_node->passive_state); + if (passive_state == I40IW_SEND_RESET_EVENT) { + i40iw_rem_ref_cm_node(cm_node); + return -ECONNRESET; + } + + cm_node->cm_core->stats_accepts++; + iwqp->cm_node = (void *)cm_node; + cm_node->iwqp = iwqp; + + buf_len = conn_param->private_data_len + I40IW_MAX_IETF_SIZE; + + status = i40iw_allocate_dma_mem(dev->hw, &iwqp->ietf_mem, buf_len, 1); + + if (status) + return -ENOMEM; + cm_node->pdata.size = conn_param->private_data_len; + accept.addr = iwqp->ietf_mem.va; + accept.size = i40iw_cm_build_mpa_frame(cm_node, &accept, MPA_KEY_REPLY); + memcpy(accept.addr + accept.size, conn_param->private_data, + conn_param->private_data_len); + + /* setup our first outgoing iWarp send WQE (the IETF frame response) */ + if ((cm_node->ipv4 && + !i40iw_ipv4_is_loopback(cm_node->loc_addr[0], cm_node->rem_addr[0])) || + (!cm_node->ipv4 && + !i40iw_ipv6_is_loopback(cm_node->loc_addr, 
cm_node->rem_addr))) { + iwpd = iwqp->iwpd; + tagged_offset = (uintptr_t)iwqp->ietf_mem.va; + ibmr = i40iw_reg_phys_mr(&iwpd->ibpd, + iwqp->ietf_mem.pa, + buf_len, + IB_ACCESS_LOCAL_WRITE, + &tagged_offset); + if (IS_ERR(ibmr)) { + i40iw_free_dma_mem(dev->hw, &iwqp->ietf_mem); + return -ENOMEM; + } + + ibmr->pd = &iwpd->ibpd; + ibmr->device = iwpd->ibpd.device; + iwqp->lsmm_mr = ibmr; + if (iwqp->page) + iwqp->sc_qp.qp_uk.sq_base = kmap(iwqp->page); + dev->iw_priv_qp_ops->qp_send_lsmm(&iwqp->sc_qp, + iwqp->ietf_mem.va, + (accept.size + conn_param->private_data_len), + ibmr->lkey); + + } else { + if (iwqp->page) + iwqp->sc_qp.qp_uk.sq_base = kmap(iwqp->page); + dev->iw_priv_qp_ops->qp_send_lsmm(&iwqp->sc_qp, NULL, 0, 0); + } + + if (iwqp->page) + kunmap(iwqp->page); + + iwqp->cm_id = cm_id; + cm_node->cm_id = cm_id; + + cm_id->provider_data = (void *)iwqp; + iwqp->active_conn = 0; + + cm_node->lsmm_size = accept.size + conn_param->private_data_len; + i40iw_cm_init_tsa_conn(iwqp, cm_node); + cm_id->add_ref(cm_id); + i40iw_add_ref(&iwqp->ibqp); + + attr.qp_state = IB_QPS_RTS; + cm_node->qhash_set = false; + i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL); + + cm_node->accelerated = true; + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_move_tail(&cm_node->list, &cm_core->accelerated_list); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + status = + i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_ESTABLISHED, 0); + if (status) + i40iw_debug(dev, I40IW_DEBUG_CM, "error sending cm event - ESTABLISHED\n"); + + if (cm_node->loopbackpartner) { + cm_node->loopbackpartner->pdata.size = conn_param->private_data_len; + + /* copy entire MPA frame to our cm_node's frame */ + memcpy(cm_node->loopbackpartner->pdata_buf, + conn_param->private_data, + conn_param->private_data_len); + i40iw_create_event(cm_node->loopbackpartner, I40IW_CM_EVENT_CONNECTED); + } + + if (cm_node->accept_pend) { + atomic_dec(&cm_node->listener->pend_accepts_cnt); + cm_node->accept_pend = 0; + } + return 0; +} + +/** + * i40iw_reject - registered call for connection to be rejected + * @cm_id: cm information for passive connection + * @pdata: private data to be sent + * @pdata_len: private data length + */ +int i40iw_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + struct i40iw_device *iwdev; + struct i40iw_cm_node *cm_node; + struct i40iw_cm_node *loopback; + + cm_node = (struct i40iw_cm_node *)cm_id->provider_data; + loopback = cm_node->loopbackpartner; + cm_node->cm_id = cm_id; + cm_node->pdata.size = pdata_len; + + iwdev = to_iwdev(cm_id->device); + if (!iwdev) + return -EINVAL; + cm_node->cm_core->stats_rejects++; + + if (pdata_len + sizeof(struct ietf_mpa_v2) > MAX_CM_BUFFER) + return -EINVAL; + + if (loopback) { + memcpy(&loopback->pdata_buf, pdata, pdata_len); + loopback->pdata.size = pdata_len; + } + + return i40iw_cm_reject(cm_node, pdata, pdata_len); +} + +/** + * i40iw_connect - registered call for connection to be established + * @cm_id: cm information for passive connection + * @conn_param: Information about the connection + */ +int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct ib_qp *ibqp; + struct i40iw_qp *iwqp; + struct i40iw_device *iwdev; + struct i40iw_cm_node *cm_node; + struct i40iw_cm_info cm_info; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in6 *laddr6; + struct sockaddr_in6 *raddr6; + int ret = 0; + unsigned long flags; + + ibqp = i40iw_get_qp(cm_id->device, conn_param->qpn); + if (!ibqp) + return 
-EINVAL; + iwqp = to_iwqp(ibqp); + if (!iwqp) + return -EINVAL; + iwdev = to_iwdev(iwqp->ibqp.device); + if (!iwdev) + return -EINVAL; + + laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; + laddr6 = (struct sockaddr_in6 *)&cm_id->m_local_addr; + raddr6 = (struct sockaddr_in6 *)&cm_id->m_remote_addr; + + if (!(laddr->sin_port) || !(raddr->sin_port)) + return -EINVAL; + + iwqp->active_conn = 1; + iwqp->cm_id = NULL; + cm_id->provider_data = iwqp; + + /* set up the connection params for the node */ + if (cm_id->remote_addr.ss_family == AF_INET) { + cm_info.ipv4 = true; + memset(cm_info.loc_addr, 0, sizeof(cm_info.loc_addr)); + memset(cm_info.rem_addr, 0, sizeof(cm_info.rem_addr)); + cm_info.loc_addr[0] = ntohl(laddr->sin_addr.s_addr); + cm_info.rem_addr[0] = ntohl(raddr->sin_addr.s_addr); + cm_info.loc_port = ntohs(laddr->sin_port); + cm_info.rem_port = ntohs(raddr->sin_port); + cm_info.vlan_id = i40iw_get_vlan_ipv4(cm_info.loc_addr); + } else { + cm_info.ipv4 = false; + i40iw_copy_ip_ntohl(cm_info.loc_addr, + laddr6->sin6_addr.in6_u.u6_addr32); + i40iw_copy_ip_ntohl(cm_info.rem_addr, + raddr6->sin6_addr.in6_u.u6_addr32); + cm_info.loc_port = ntohs(laddr6->sin6_port); + cm_info.rem_port = ntohs(raddr6->sin6_port); + i40iw_netdev_vlan_ipv6(cm_info.loc_addr, &cm_info.vlan_id); + } + cm_info.cm_id = cm_id; + cm_info.tos = cm_id->tos; + cm_info.user_pri = rt_tos2priority(cm_id->tos); + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_DCB, "%s TOS:[%d] UP:[%d]\n", + __func__, cm_id->tos, cm_info.user_pri); + cm_id->add_ref(cm_id); + cm_node = i40iw_create_cm_node(&iwdev->cm_core, iwdev, + conn_param, &cm_info); + + if (IS_ERR(cm_node)) { + ret = PTR_ERR(cm_node); + cm_id->rem_ref(cm_id); + return ret; + } + + if ((cm_info.ipv4 && (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr)) || + (!cm_info.ipv4 && memcmp(laddr6->sin6_addr.in6_u.u6_addr32, + raddr6->sin6_addr.in6_u.u6_addr32, + sizeof(laddr6->sin6_addr.in6_u.u6_addr32)))) { + if (i40iw_manage_qhash(iwdev, &cm_info, I40IW_QHASH_TYPE_TCP_ESTABLISHED, + I40IW_QHASH_MANAGE_TYPE_ADD, NULL, true)) { + ret = -EINVAL; + goto err; + } + cm_node->qhash_set = true; + } + + spin_lock_irqsave(&iwdev->cm_core.ht_lock, flags); + if (!test_and_set_bit(cm_info.loc_port, iwdev->cm_core.active_side_ports)) { + spin_unlock_irqrestore(&iwdev->cm_core.ht_lock, flags); + if (i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD)) { + ret = -EINVAL; + goto err; + } + } else { + spin_unlock_irqrestore(&iwdev->cm_core.ht_lock, flags); + } + + cm_node->apbvt_set = true; + iwqp->cm_node = cm_node; + cm_node->iwqp = iwqp; + iwqp->cm_id = cm_id; + i40iw_add_ref(&iwqp->ibqp); + + if (cm_node->state != I40IW_CM_STATE_OFFLOADED) { + cm_node->state = I40IW_CM_STATE_SYN_SENT; + ret = i40iw_send_syn(cm_node, 0); + if (ret) + goto err; + } + + if (cm_node->loopbackpartner) { + cm_node->loopbackpartner->state = I40IW_CM_STATE_MPAREQ_RCVD; + i40iw_create_event(cm_node->loopbackpartner, + I40IW_CM_EVENT_MPA_REQ); + } + + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "Api - connect(): port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_port, + cm_node, + cm_node->cm_id); + + return 0; + +err: + if (cm_info.ipv4) + i40iw_debug(&iwdev->sc_dev, + I40IW_DEBUG_CM, + "Api - connect() FAILED: dest addr=%pI4", + cm_info.rem_addr); + else + i40iw_debug(&iwdev->sc_dev, + I40IW_DEBUG_CM, + "Api - connect() FAILED: dest addr=%pI6", + cm_info.rem_addr); + + i40iw_rem_ref_cm_node(cm_node); + cm_id->rem_ref(cm_id); + 
iwdev->cm_core.stats_connect_errs++; + return ret; +} + +/** + * i40iw_create_listen - registered call creating listener + * @cm_id: cm information for passive connection + * @backlog: to max accept pending count + */ +int i40iw_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + struct i40iw_device *iwdev; + struct i40iw_cm_listener *cm_listen_node; + struct i40iw_cm_info cm_info; + enum i40iw_status_code ret; + struct sockaddr_in *laddr; + struct sockaddr_in6 *laddr6; + bool wildcard = false; + + iwdev = to_iwdev(cm_id->device); + if (!iwdev) + return -EINVAL; + + laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + laddr6 = (struct sockaddr_in6 *)&cm_id->m_local_addr; + memset(&cm_info, 0, sizeof(cm_info)); + if (laddr->sin_family == AF_INET) { + cm_info.ipv4 = true; + cm_info.loc_addr[0] = ntohl(laddr->sin_addr.s_addr); + cm_info.loc_port = ntohs(laddr->sin_port); + + if (laddr->sin_addr.s_addr != INADDR_ANY) + cm_info.vlan_id = i40iw_get_vlan_ipv4(cm_info.loc_addr); + else + wildcard = true; + + } else { + cm_info.ipv4 = false; + i40iw_copy_ip_ntohl(cm_info.loc_addr, + laddr6->sin6_addr.in6_u.u6_addr32); + cm_info.loc_port = ntohs(laddr6->sin6_port); + if (ipv6_addr_type(&laddr6->sin6_addr) != IPV6_ADDR_ANY) + i40iw_netdev_vlan_ipv6(cm_info.loc_addr, + &cm_info.vlan_id); + else + wildcard = true; + } + cm_info.backlog = backlog; + cm_info.cm_id = cm_id; + + cm_listen_node = i40iw_make_listen_node(&iwdev->cm_core, iwdev, &cm_info); + if (!cm_listen_node) { + i40iw_pr_err("cm_listen_node == NULL\n"); + return -ENOMEM; + } + + cm_id->provider_data = cm_listen_node; + + cm_listen_node->tos = cm_id->tos; + cm_listen_node->user_pri = rt_tos2priority(cm_id->tos); + cm_info.user_pri = cm_listen_node->user_pri; + + if (!cm_listen_node->reused_node) { + if (wildcard) { + if (cm_info.ipv4) + ret = i40iw_add_mqh_4(iwdev, + &cm_info, + cm_listen_node); + else + ret = i40iw_add_mqh_6(iwdev, + &cm_info, + cm_listen_node); + if (ret) + goto error; + + ret = i40iw_manage_apbvt(iwdev, + cm_info.loc_port, + I40IW_MANAGE_APBVT_ADD); + + if (ret) + goto error; + } else { + ret = i40iw_manage_qhash(iwdev, + &cm_info, + I40IW_QHASH_TYPE_TCP_SYN, + I40IW_QHASH_MANAGE_TYPE_ADD, + NULL, + true); + if (ret) + goto error; + cm_listen_node->qhash_set = true; + ret = i40iw_manage_apbvt(iwdev, + cm_info.loc_port, + I40IW_MANAGE_APBVT_ADD); + if (ret) + goto error; + } + } + cm_id->add_ref(cm_id); + cm_listen_node->cm_core->stats_listen_created++; + return 0; + error: + i40iw_cm_del_listen(&iwdev->cm_core, (void *)cm_listen_node, false); + return -EINVAL; +} + +/** + * i40iw_destroy_listen - registered call to destroy listener + * @cm_id: cm information for passive connection + */ +int i40iw_destroy_listen(struct iw_cm_id *cm_id) +{ + struct i40iw_device *iwdev; + + iwdev = to_iwdev(cm_id->device); + if (cm_id->provider_data) + i40iw_cm_del_listen(&iwdev->cm_core, cm_id->provider_data, true); + else + i40iw_pr_err("cm_id->provider_data was NULL\n"); + + cm_id->rem_ref(cm_id); + + return 0; +} + +/** + * i40iw_cm_event_connected - handle connected active node + * @event: the info for cm_node of connection + */ +static void i40iw_cm_event_connected(struct i40iw_cm_event *event) +{ + struct i40iw_qp *iwqp; + struct i40iw_device *iwdev; + struct i40iw_cm_core *cm_core; + struct i40iw_cm_node *cm_node; + struct i40iw_sc_dev *dev; + struct ib_qp_attr attr; + struct iw_cm_id *cm_id; + unsigned long flags; + int status; + bool read0; + + cm_node = event->cm_node; + cm_id = cm_node->cm_id; + iwqp = (struct i40iw_qp 
*)cm_id->provider_data; + iwdev = to_iwdev(iwqp->ibqp.device); + dev = &iwdev->sc_dev; + cm_core = &iwdev->cm_core; + + if (iwqp->destroyed) { + status = -ETIMEDOUT; + goto error; + } + i40iw_cm_init_tsa_conn(iwqp, cm_node); + read0 = (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO); + if (iwqp->page) + iwqp->sc_qp.qp_uk.sq_base = kmap(iwqp->page); + dev->iw_priv_qp_ops->qp_send_rtt(&iwqp->sc_qp, read0); + if (iwqp->page) + kunmap(iwqp->page); + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_RTS; + cm_node->qhash_set = false; + i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL); + + cm_node->accelerated = true; + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_move_tail(&cm_node->list, &cm_core->accelerated_list); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + status = i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_CONNECT_REPLY, + 0); + if (status) + i40iw_debug(dev, I40IW_DEBUG_CM, "error sending cm event - CONNECT_REPLY\n"); + + return; + +error: + iwqp->cm_id = NULL; + cm_id->provider_data = NULL; + i40iw_send_cm_event(event->cm_node, + cm_id, + IW_CM_EVENT_CONNECT_REPLY, + status); + cm_id->rem_ref(cm_id); + i40iw_rem_ref_cm_node(event->cm_node); +} + +/** + * i40iw_cm_event_reset - handle reset + * @event: the info for cm_node of connection + */ +static void i40iw_cm_event_reset(struct i40iw_cm_event *event) +{ + struct i40iw_cm_node *cm_node = event->cm_node; + struct iw_cm_id *cm_id = cm_node->cm_id; + struct i40iw_qp *iwqp; + + if (!cm_id) + return; + + iwqp = cm_id->provider_data; + if (!iwqp) + return; + + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "reset event %p - cm_id = %p\n", + event->cm_node, cm_id); + iwqp->cm_id = NULL; + + i40iw_send_cm_event(cm_node, cm_node->cm_id, IW_CM_EVENT_DISCONNECT, -ECONNRESET); + i40iw_send_cm_event(cm_node, cm_node->cm_id, IW_CM_EVENT_CLOSE, 0); +} + +/** + * i40iw_cm_event_handler - worker thread callback to send event to cm upper layer + * @work: pointer of cm event info. 
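+ *
+ * Dispatches the queued CM event to the iw_cm upper layer according to its
+ * type (MPA request, reset, connected, MPA reject or aborted), then drops the
+ * cm_id and cm_node references taken when the event was posted.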
+ */ +static void i40iw_cm_event_handler(struct work_struct *work) +{ + struct i40iw_cm_event *event = container_of(work, + struct i40iw_cm_event, + event_work); + struct i40iw_cm_node *cm_node; + + if (!event || !event->cm_node || !event->cm_node->cm_core) + return; + + cm_node = event->cm_node; + + switch (event->type) { + case I40IW_CM_EVENT_MPA_REQ: + i40iw_send_cm_event(cm_node, + cm_node->cm_id, + IW_CM_EVENT_CONNECT_REQUEST, + 0); + break; + case I40IW_CM_EVENT_RESET: + i40iw_cm_event_reset(event); + break; + case I40IW_CM_EVENT_CONNECTED: + if (!event->cm_node->cm_id || + (event->cm_node->state != I40IW_CM_STATE_OFFLOADED)) + break; + i40iw_cm_event_connected(event); + break; + case I40IW_CM_EVENT_MPA_REJECT: + if (!event->cm_node->cm_id || + (cm_node->state == I40IW_CM_STATE_OFFLOADED)) + break; + i40iw_send_cm_event(cm_node, + cm_node->cm_id, + IW_CM_EVENT_CONNECT_REPLY, + -ECONNREFUSED); + break; + case I40IW_CM_EVENT_ABORTED: + if (!event->cm_node->cm_id || + (event->cm_node->state == I40IW_CM_STATE_OFFLOADED)) + break; + i40iw_event_connect_error(event); + break; + default: + i40iw_pr_err("event type = %d\n", event->type); + break; + } + + event->cm_info.cm_id->rem_ref(event->cm_info.cm_id); + i40iw_rem_ref_cm_node(event->cm_node); + kfree(event); +} + +/** + * i40iw_cm_post_event - queue event request for worker thread + * @event: cm node's info for up event call + */ +static void i40iw_cm_post_event(struct i40iw_cm_event *event) +{ + atomic_inc(&event->cm_node->ref_count); + event->cm_info.cm_id->add_ref(event->cm_info.cm_id); + INIT_WORK(&event->event_work, i40iw_cm_event_handler); + + queue_work(event->cm_node->cm_core->event_wq, &event->event_work); +} + +/** + * i40iw_qhash_ctrl - enable/disable qhash for list + * @iwdev: device pointer + * @parent_listen_node: parent listen node + * @nfo: cm info node + * @ipaddr: Pointer to IPv4 or IPv6 address + * @ipv4: flag indicating IPv4 when true + * @ifup: flag indicating interface up when true + * + * Enables or disables the qhash for the node in the child + * listen list that matches ipaddr. If no matching IP was found + * it will allocate and add a new child listen node to the + * parent listen node. The listen_list_lock is assumed to be + * held when called. + */ +static void i40iw_qhash_ctrl(struct i40iw_device *iwdev, + struct i40iw_cm_listener *parent_listen_node, + struct i40iw_cm_info *nfo, + u32 *ipaddr, bool ipv4, bool ifup) +{ + struct list_head *child_listen_list = &parent_listen_node->child_listen_list; + struct i40iw_cm_listener *child_listen_node; + struct list_head *pos, *tpos; + enum i40iw_status_code ret; + bool node_allocated = false; + enum i40iw_quad_hash_manage_type op = + ifup ? I40IW_QHASH_MANAGE_TYPE_ADD : I40IW_QHASH_MANAGE_TYPE_DELETE; + + list_for_each_safe(pos, tpos, child_listen_list) { + child_listen_node = + list_entry(pos, + struct i40iw_cm_listener, + child_listen_list); + if (!memcmp(child_listen_node->loc_addr, ipaddr, ipv4 ? 4 : 16)) + goto set_qhash; + } + + /* if not found then add a child listener if interface is going up */ + if (!ifup) + return; + child_listen_node = kzalloc(sizeof(*child_listen_node), GFP_ATOMIC); + if (!child_listen_node) + return; + node_allocated = true; + memcpy(child_listen_node, parent_listen_node, sizeof(*child_listen_node)); + + memcpy(child_listen_node->loc_addr, ipaddr, ipv4 ? 
4 : 16); + +set_qhash: + memcpy(nfo->loc_addr, + child_listen_node->loc_addr, + sizeof(nfo->loc_addr)); + nfo->vlan_id = child_listen_node->vlan_id; + ret = i40iw_manage_qhash(iwdev, nfo, + I40IW_QHASH_TYPE_TCP_SYN, + op, + NULL, false); + if (!ret) { + child_listen_node->qhash_set = ifup; + if (node_allocated) + list_add(&child_listen_node->child_listen_list, + &parent_listen_node->child_listen_list); + } else if (node_allocated) { + kfree(child_listen_node); + } +} + +/** + * i40iw_cm_teardown_connections - teardown QPs + * @iwdev: device pointer + * @ipaddr: Pointer to IPv4 or IPv6 address + * @ipv4: flag indicating IPv4 when true + * @disconnect_all: flag indicating disconnect all QPs + * teardown QPs where source or destination addr matches ip addr + */ +void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr, + struct i40iw_cm_info *nfo, + bool disconnect_all) +{ + struct i40iw_cm_core *cm_core = &iwdev->cm_core; + struct list_head *list_core_temp; + struct list_head *list_node; + struct i40iw_cm_node *cm_node; + unsigned long flags; + struct list_head teardown_list; + struct ib_qp_attr attr; + + INIT_LIST_HEAD(&teardown_list); + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_safe(list_node, list_core_temp, + &cm_core->accelerated_list) { + cm_node = container_of(list_node, struct i40iw_cm_node, list); + if (disconnect_all || + (nfo->vlan_id == cm_node->vlan_id && + (!memcmp(cm_node->loc_addr, ipaddr, nfo->ipv4 ? 4 : 16) || + !memcmp(cm_node->rem_addr, ipaddr, nfo->ipv4 ? 4 : 16)))) { + atomic_inc(&cm_node->ref_count); + list_add(&cm_node->teardown_entry, &teardown_list); + } + } + list_for_each_safe(list_node, list_core_temp, + &cm_core->non_accelerated_list) { + cm_node = container_of(list_node, struct i40iw_cm_node, list); + if (disconnect_all || + (nfo->vlan_id == cm_node->vlan_id && + (!memcmp(cm_node->loc_addr, ipaddr, nfo->ipv4 ? 4 : 16) || + !memcmp(cm_node->rem_addr, ipaddr, nfo->ipv4 ? 4 : 16)))) { + atomic_inc(&cm_node->ref_count); + list_add(&cm_node->teardown_entry, &teardown_list); + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + list_for_each_safe(list_node, list_core_temp, &teardown_list) { + cm_node = container_of(list_node, struct i40iw_cm_node, + teardown_entry); + attr.qp_state = IB_QPS_ERR; + i40iw_modify_qp(&cm_node->iwqp->ibqp, &attr, IB_QP_STATE, NULL); + if (iwdev->reset) + i40iw_cm_disconn(cm_node->iwqp); + i40iw_rem_ref_cm_node(cm_node); + } +} + +/** + * i40iw_ifdown_notify - process an ifdown on an interface + * @iwdev: device pointer + * @ipaddr: Pointer to IPv4 or IPv6 address + * @ipv4: flag indicating IPv4 when true + * @ifup: flag indicating interface up when true + */ +void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, + u32 *ipaddr, bool ipv4, bool ifup) +{ + struct i40iw_cm_core *cm_core = &iwdev->cm_core; + unsigned long flags; + struct i40iw_cm_listener *listen_node; + static const u32 ip_zero[4] = { 0, 0, 0, 0 }; + struct i40iw_cm_info nfo; + u16 vlan_id = rdma_vlan_dev_vlan_id(netdev); + enum i40iw_status_code ret; + enum i40iw_quad_hash_manage_type op = + ifup ? I40IW_QHASH_MANAGE_TYPE_ADD : I40IW_QHASH_MANAGE_TYPE_DELETE; + + nfo.vlan_id = vlan_id; + nfo.ipv4 = ipv4; + + /* Disable or enable qhash for listeners */ + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { + if (vlan_id == listen_node->vlan_id && + (!memcmp(listen_node->loc_addr, ipaddr, ipv4 ? 
4 : 16) || + !memcmp(listen_node->loc_addr, ip_zero, ipv4 ? 4 : 16))) { + memcpy(nfo.loc_addr, listen_node->loc_addr, + sizeof(nfo.loc_addr)); + nfo.loc_port = listen_node->loc_port; + nfo.user_pri = listen_node->user_pri; + if (!list_empty(&listen_node->child_listen_list)) { + i40iw_qhash_ctrl(iwdev, + listen_node, + &nfo, + ipaddr, ipv4, ifup); + } else if (memcmp(listen_node->loc_addr, ip_zero, + ipv4 ? 4 : 16)) { + ret = i40iw_manage_qhash(iwdev, + &nfo, + I40IW_QHASH_TYPE_TCP_SYN, + op, + NULL, + false); + if (!ret) + listen_node->qhash_set = ifup; + } + } + } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + + /* teardown connected qp's on ifdown */ + if (!ifup) + i40iw_cm_teardown_connections(iwdev, ipaddr, &nfo, false); +} diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h new file mode 100644 index 0000000..78ba36a --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -0,0 +1,460 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +*******************************************************************************/ + +#ifndef I40IW_CM_H +#define I40IW_CM_H + +#define QUEUE_EVENTS + +#define I40IW_MANAGE_APBVT_DEL 0 +#define I40IW_MANAGE_APBVT_ADD 1 + +#define I40IW_MPA_REQUEST_ACCEPT 1 +#define I40IW_MPA_REQUEST_REJECT 2 + +/* IETF MPA -- defines, enums, structs */ +#define IEFT_MPA_KEY_REQ "MPA ID Req Frame" +#define IEFT_MPA_KEY_REP "MPA ID Rep Frame" +#define IETF_MPA_KEY_SIZE 16 +#define IETF_MPA_VERSION 1 +#define IETF_MAX_PRIV_DATA_LEN 512 +#define IETF_MPA_FRAME_SIZE 20 +#define IETF_RTR_MSG_SIZE 4 +#define IETF_MPA_V2_FLAG 0x10 +#define SNDMARKER_SEQNMASK 0x000001FF + +#define I40IW_MAX_IETF_SIZE 32 + +/* IETF RTR MSG Fields */ +#define IETF_PEER_TO_PEER 0x8000 +#define IETF_FLPDU_ZERO_LEN 0x4000 +#define IETF_RDMA0_WRITE 0x8000 +#define IETF_RDMA0_READ 0x4000 +#define IETF_NO_IRD_ORD 0x3FFF + +/* HW-supported IRD sizes*/ +#define I40IW_HW_IRD_SETTING_2 2 +#define I40IW_HW_IRD_SETTING_4 4 +#define I40IW_HW_IRD_SETTING_8 8 +#define I40IW_HW_IRD_SETTING_16 16 +#define I40IW_HW_IRD_SETTING_32 32 +#define I40IW_HW_IRD_SETTING_64 64 + +#define MAX_PORTS 65536 +#define I40IW_VLAN_PRIO_SHIFT 13 + +enum ietf_mpa_flags { + IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ + IETF_MPA_FLAGS_CRC = 0x40, /* receive Markers */ + IETF_MPA_FLAGS_REJECT = 0x20, /* Reject */ +}; + +struct ietf_mpa_v1 { + u8 key[IETF_MPA_KEY_SIZE]; + u8 flags; + u8 rev; + __be16 priv_data_len; + u8 priv_data[0]; +}; + +#define ietf_mpa_req_resp_frame ietf_mpa_frame + +struct ietf_rtr_msg { + __be16 ctrl_ird; + __be16 ctrl_ord; +}; + +struct ietf_mpa_v2 { + u8 key[IETF_MPA_KEY_SIZE]; + u8 flags; + u8 rev; + __be16 priv_data_len; + struct ietf_rtr_msg rtr_msg; + u8 priv_data[0]; +}; + +struct i40iw_cm_node; +enum i40iw_timer_type { + I40IW_TIMER_TYPE_SEND, + I40IW_TIMER_TYPE_RECV, + I40IW_TIMER_NODE_CLEANUP, + I40IW_TIMER_TYPE_CLOSE, +}; + +#define I40IW_PASSIVE_STATE_INDICATED 0 +#define I40IW_DO_NOT_SEND_RESET_EVENT 1 +#define I40IW_SEND_RESET_EVENT 2 + +#define MAX_I40IW_IFS 4 + +#define SET_ACK 0x1 +#define SET_SYN 0x2 +#define SET_FIN 0x4 +#define SET_RST 0x8 + +#define TCP_OPTIONS_PADDING 3 + +struct option_base { + u8 optionnum; + u8 length; +}; + +enum option_numbers { + OPTION_NUMBER_END, + OPTION_NUMBER_NONE, + OPTION_NUMBER_MSS, + OPTION_NUMBER_WINDOW_SCALE, + OPTION_NUMBER_SACK_PERM, + OPTION_NUMBER_SACK, + OPTION_NUMBER_WRITE0 = 0xbc +}; + +struct option_mss { + u8 optionnum; + u8 length; + __be16 mss; +}; + +struct option_windowscale { + u8 optionnum; + u8 length; + u8 shiftcount; +}; + +union all_known_options { + char as_end; + struct option_base as_base; + struct option_mss as_mss; + struct option_windowscale as_windowscale; +}; + +struct i40iw_timer_entry { + struct list_head list; + unsigned long timetosend; /* jiffies */ + struct i40iw_puda_buf *sqbuf; + u32 type; + u32 retrycount; + u32 retranscount; + u32 context; + u32 send_retrans; + int close_when_complete; +}; + +#define I40IW_DEFAULT_RETRYS 64 +#define I40IW_DEFAULT_RETRANS 8 +#define I40IW_DEFAULT_TTL 0x40 +#define I40IW_DEFAULT_RTT_VAR 0x6 +#define I40IW_DEFAULT_SS_THRESH 0x3FFFFFFF +#define I40IW_DEFAULT_REXMIT_THRESH 8 + +#define I40IW_RETRY_TIMEOUT HZ +#define I40IW_SHORT_TIME 10 +#define I40IW_LONG_TIME (2 * HZ) +#define I40IW_MAX_TIMEOUT ((unsigned long)(12 * HZ)) + +#define I40IW_CM_HASHTABLE_SIZE 1024 +#define I40IW_CM_TCP_TIMER_INTERVAL 3000 +#define I40IW_CM_DEFAULT_MTU 1540 +#define I40IW_CM_DEFAULT_FRAME_CNT 10 +#define I40IW_CM_THREAD_STACK_SIZE 256 
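+/* default TCP receive window and window scaling values used by the CM */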
+#define I40IW_CM_DEFAULT_RCV_WND 64240 +#define I40IW_CM_DEFAULT_RCV_WND_SCALED 0x3fffc +#define I40IW_CM_DEFAULT_RCV_WND_SCALE 2 +#define I40IW_CM_DEFAULT_FREE_PKTS 0x000A +#define I40IW_CM_FREE_PKT_LO_WATERMARK 2 + +#define I40IW_CM_DEFAULT_MSS 536 + +#define I40IW_CM_DEF_SEQ 0x159bf75f +#define I40IW_CM_DEF_LOCAL_ID 0x3b47 + +#define I40IW_CM_DEF_SEQ2 0x18ed5740 +#define I40IW_CM_DEF_LOCAL_ID2 0xb807 +#define MAX_CM_BUFFER (I40IW_MAX_IETF_SIZE + IETF_MAX_PRIV_DATA_LEN) + +typedef u32 i40iw_addr_t; + +#define i40iw_cm_tsa_context i40iw_qp_context + +struct i40iw_qp; + +/* cm node transition states */ +enum i40iw_cm_node_state { + I40IW_CM_STATE_UNKNOWN, + I40IW_CM_STATE_INITED, + I40IW_CM_STATE_LISTENING, + I40IW_CM_STATE_SYN_RCVD, + I40IW_CM_STATE_SYN_SENT, + I40IW_CM_STATE_ONE_SIDE_ESTABLISHED, + I40IW_CM_STATE_ESTABLISHED, + I40IW_CM_STATE_ACCEPTING, + I40IW_CM_STATE_MPAREQ_SENT, + I40IW_CM_STATE_MPAREQ_RCVD, + I40IW_CM_STATE_MPAREJ_RCVD, + I40IW_CM_STATE_OFFLOADED, + I40IW_CM_STATE_FIN_WAIT1, + I40IW_CM_STATE_FIN_WAIT2, + I40IW_CM_STATE_CLOSE_WAIT, + I40IW_CM_STATE_TIME_WAIT, + I40IW_CM_STATE_LAST_ACK, + I40IW_CM_STATE_CLOSING, + I40IW_CM_STATE_LISTENER_DESTROYED, + I40IW_CM_STATE_CLOSED +}; + +enum mpa_frame_version { + IETF_MPA_V1 = 1, + IETF_MPA_V2 = 2 +}; + +enum mpa_frame_key { + MPA_KEY_REQUEST, + MPA_KEY_REPLY +}; + +enum send_rdma0 { + SEND_RDMA_READ_ZERO = 1, + SEND_RDMA_WRITE_ZERO = 2 +}; + +enum i40iw_tcpip_pkt_type { + I40IW_PKT_TYPE_UNKNOWN, + I40IW_PKT_TYPE_SYN, + I40IW_PKT_TYPE_SYNACK, + I40IW_PKT_TYPE_ACK, + I40IW_PKT_TYPE_FIN, + I40IW_PKT_TYPE_RST +}; + +/* CM context params */ +struct i40iw_cm_tcp_context { + u8 client; + + u32 loc_seq_num; + u32 loc_ack_num; + u32 rem_ack_num; + u32 rcv_nxt; + + u32 loc_id; + u32 rem_id; + + u32 snd_wnd; + u32 max_snd_wnd; + + u32 rcv_wnd; + u32 mss; + u8 snd_wscale; + u8 rcv_wscale; +}; + +enum i40iw_cm_listener_state { + I40IW_CM_LISTENER_PASSIVE_STATE = 1, + I40IW_CM_LISTENER_ACTIVE_STATE = 2, + I40IW_CM_LISTENER_EITHER_STATE = 3 +}; + +struct i40iw_cm_listener { + struct list_head list; + struct i40iw_cm_core *cm_core; + u8 loc_mac[ETH_ALEN]; + u32 loc_addr[4]; + u16 loc_port; + struct iw_cm_id *cm_id; + atomic_t ref_count; + struct i40iw_device *iwdev; + atomic_t pend_accepts_cnt; + int backlog; + enum i40iw_cm_listener_state listener_state; + u32 reused_node; + u8 user_pri; + u8 tos; + u16 vlan_id; + bool qhash_set; + bool ipv4; + struct list_head child_listen_list; + +}; + +struct i40iw_kmem_info { + void *addr; + u32 size; +}; + +/* per connection node and node state information */ +struct i40iw_cm_node { + u32 loc_addr[4], rem_addr[4]; + u16 loc_port, rem_port; + u16 vlan_id; + enum i40iw_cm_node_state state; + u8 loc_mac[ETH_ALEN]; + u8 rem_mac[ETH_ALEN]; + atomic_t ref_count; + struct i40iw_qp *iwqp; + struct i40iw_device *iwdev; + struct i40iw_sc_dev *dev; + struct i40iw_cm_tcp_context tcp_cntxt; + struct i40iw_cm_core *cm_core; + struct i40iw_cm_node *loopbackpartner; + struct i40iw_timer_entry *send_entry; + struct i40iw_timer_entry *close_entry; + spinlock_t retrans_list_lock; /* cm transmit packet */ + enum send_rdma0 send_rdma0_op; + u16 ird_size; + u16 ord_size; + u16 mpav2_ird_ord; + struct iw_cm_id *cm_id; + struct list_head list; + bool accelerated; + struct i40iw_cm_listener *listener; + int apbvt_set; + int accept_pend; + struct list_head timer_entry; + struct list_head reset_entry; + struct list_head teardown_entry; + atomic_t passive_state; + bool qhash_set; + u8 user_pri; + u8 tos; + bool ipv4; + bool 
snd_mark_en; + u16 lsmm_size; + enum mpa_frame_version mpa_frame_rev; + struct i40iw_kmem_info pdata; + union { + struct ietf_mpa_v1 mpa_frame; + struct ietf_mpa_v2 mpa_v2_frame; + }; + + u8 pdata_buf[IETF_MAX_PRIV_DATA_LEN]; + struct i40iw_kmem_info mpa_hdr; + bool ack_rcvd; +}; + +/* structure for client or CM to fill when making CM api calls. */ +/* - only need to set relevant data, based on op. */ +struct i40iw_cm_info { + struct iw_cm_id *cm_id; + u16 loc_port; + u16 rem_port; + u32 loc_addr[4]; + u32 rem_addr[4]; + u16 vlan_id; + int backlog; + u8 user_pri; + u8 tos; + bool ipv4; +}; + +/* CM event codes */ +enum i40iw_cm_event_type { + I40IW_CM_EVENT_UNKNOWN, + I40IW_CM_EVENT_ESTABLISHED, + I40IW_CM_EVENT_MPA_REQ, + I40IW_CM_EVENT_MPA_CONNECT, + I40IW_CM_EVENT_MPA_ACCEPT, + I40IW_CM_EVENT_MPA_REJECT, + I40IW_CM_EVENT_MPA_ESTABLISHED, + I40IW_CM_EVENT_CONNECTED, + I40IW_CM_EVENT_RESET, + I40IW_CM_EVENT_ABORTED +}; + +/* event to post to CM event handler */ +struct i40iw_cm_event { + enum i40iw_cm_event_type type; + struct i40iw_cm_info cm_info; + struct work_struct event_work; + struct i40iw_cm_node *cm_node; +}; + +struct i40iw_cm_core { + struct i40iw_device *iwdev; + struct i40iw_sc_dev *dev; + + struct list_head listen_nodes; + struct list_head accelerated_list; + struct list_head non_accelerated_list; + + struct timer_list tcp_timer; + + struct workqueue_struct *event_wq; + struct workqueue_struct *disconn_wq; + + spinlock_t ht_lock; /* manage hash table */ + spinlock_t listen_list_lock; /* listen list */ + + unsigned long active_side_ports[BITS_TO_LONGS(MAX_PORTS)]; + + u64 stats_nodes_created; + u64 stats_nodes_destroyed; + u64 stats_listen_created; + u64 stats_listen_destroyed; + u64 stats_listen_nodes_created; + u64 stats_listen_nodes_destroyed; + u64 stats_loopbacks; + u64 stats_accepts; + u64 stats_rejects; + u64 stats_connect_errs; + u64 stats_passive_errs; + u64 stats_pkt_retrans; + u64 stats_backlog_drops; +}; + +int i40iw_schedule_cm_timer(struct i40iw_cm_node *cm_node, + struct i40iw_puda_buf *sqbuf, + enum i40iw_timer_type type, + int send_retrans, + int close_when_complete); + +int i40iw_accept(struct iw_cm_id *, struct iw_cm_conn_param *); +int i40iw_reject(struct iw_cm_id *, const void *, u8); +int i40iw_connect(struct iw_cm_id *, struct iw_cm_conn_param *); +int i40iw_create_listen(struct iw_cm_id *, int); +int i40iw_destroy_listen(struct iw_cm_id *); + +int i40iw_cm_start(struct i40iw_device *); +int i40iw_cm_stop(struct i40iw_device *); + +int i40iw_arp_table(struct i40iw_device *iwdev, + u32 *ip_addr, + bool ipv4, + u8 *mac_addr, + u32 action); + +void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, + u32 *ipaddr, bool ipv4, bool ifup); +void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr, + struct i40iw_cm_info *nfo, + bool disconnect_all); +#endif /* I40IW_CM_H */ diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c new file mode 100644 index 0000000..4d841a3 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -0,0 +1,5198 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. 
You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#include "i40iw_osdep.h" +#include "i40iw_register.h" +#include "i40iw_status.h" +#include "i40iw_hmc.h" + +#include "i40iw_d.h" +#include "i40iw_type.h" +#include "i40iw_p.h" +#include "i40iw_vf.h" +#include "i40iw_virtchnl.h" + +/** + * i40iw_insert_wqe_hdr - write wqe header + * @wqe: cqp wqe for header + * @header: header for the cqp wqe + */ +void i40iw_insert_wqe_hdr(u64 *wqe, u64 header) +{ + wmb(); /* make sure WQE is populated before polarity is set */ + set_64bit_val(wqe, 24, header); +} + +void i40iw_check_cqp_progress(struct i40iw_cqp_timeout *cqp_timeout, struct i40iw_sc_dev *dev) +{ + if (cqp_timeout->compl_cqp_cmds != dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS]) { + cqp_timeout->compl_cqp_cmds = dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS]; + cqp_timeout->count = 0; + } else { + if (dev->cqp_cmd_stats[OP_REQUESTED_COMMANDS] != cqp_timeout->compl_cqp_cmds) + cqp_timeout->count++; + } +} + +/** + * i40iw_get_cqp_reg_info - get head and tail for cqp using registers + * @cqp: struct for cqp hw + * @val: cqp tail register value + * @tail:wqtail register value + * @error: cqp processing err + */ +static inline void i40iw_get_cqp_reg_info(struct i40iw_sc_cqp *cqp, + u32 *val, + u32 *tail, + u32 *error) +{ + if (cqp->dev->is_pf) { + *val = i40iw_rd32(cqp->dev->hw, I40E_PFPE_CQPTAIL); + *tail = RS_32(*val, I40E_PFPE_CQPTAIL_WQTAIL); + *error = RS_32(*val, I40E_PFPE_CQPTAIL_CQP_OP_ERR); + } else { + *val = i40iw_rd32(cqp->dev->hw, I40E_VFPE_CQPTAIL1); + *tail = RS_32(*val, I40E_VFPE_CQPTAIL_WQTAIL); + *error = RS_32(*val, I40E_VFPE_CQPTAIL_CQP_OP_ERR); + } +} + +/** + * i40iw_cqp_poll_registers - poll cqp registers + * @cqp: struct for cqp hw + * @tail:wqtail register value + * @count: how many times to try for completion + */ +static enum i40iw_status_code i40iw_cqp_poll_registers( + struct i40iw_sc_cqp *cqp, + u32 tail, + u32 count) +{ + u32 i = 0; + u32 newtail, error, val; + + while (i < count) { + i++; + i40iw_get_cqp_reg_info(cqp, &val, &newtail, &error); + if (error) { + error = (cqp->dev->is_pf) ? 
+ i40iw_rd32(cqp->dev->hw, I40E_PFPE_CQPERRCODES) : + i40iw_rd32(cqp->dev->hw, I40E_VFPE_CQPERRCODES1); + return I40IW_ERR_CQP_COMPL_ERROR; + } + if (newtail != tail) { + /* SUCCESS */ + I40IW_RING_MOVE_TAIL(cqp->sq_ring); + cqp->dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS]++; + return 0; + } + udelay(I40IW_SLEEP_COUNT); + } + return I40IW_ERR_TIMEOUT; +} + +/** + * i40iw_sc_parse_fpm_commit_buf - parse fpm commit buffer + * @buf: ptr to fpm commit buffer + * @info: ptr to i40iw_hmc_obj_info struct + * @sd: number of SDs for HMC objects + * + * parses fpm commit info and copy base value + * of hmc objects in hmc_info + */ +static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf( + u64 *buf, + struct i40iw_hmc_obj_info *info, + u32 *sd) +{ + u64 temp; + u64 size; + u64 base = 0; + u32 i, j; + u32 k = 0; + + /* copy base values in obj_info */ + for (i = I40IW_HMC_IW_QP, j = 0; i <= I40IW_HMC_IW_PBLE; i++, j += 8) { + if ((i == I40IW_HMC_IW_SRQ) || + (i == I40IW_HMC_IW_FSIMC) || + (i == I40IW_HMC_IW_FSIAV)) { + info[i].base = 0; + info[i].cnt = 0; + continue; + } + get_64bit_val(buf, j, &temp); + info[i].base = RS_64_1(temp, 32) * 512; + if (info[i].base > base) { + base = info[i].base; + k = i; + } + if (i == I40IW_HMC_IW_APBVT_ENTRY) { + info[i].cnt = 1; + continue; + } + if (i == I40IW_HMC_IW_QP) + info[i].cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_QPS); + else if (i == I40IW_HMC_IW_CQ) + info[i].cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_CQS); + else + info[i].cnt = (u32)(temp); + } + size = info[k].cnt * info[k].size + info[k].base; + if (size & 0x1FFFFF) + *sd = (u32)((size >> 21) + 1); /* add 1 for remainder */ + else + *sd = (u32)(size >> 21); + + return 0; +} + +/** + * i40iw_sc_decode_fpm_query() - Decode a 64 bit value into max count and size + * @buf: ptr to fpm query buffer + * @buf_idx: index into buf + * @info: ptr to i40iw_hmc_obj_info struct + * @rsrc_idx: resource index into info + * + * Decode a 64 bit value from fpm query buffer into max count and size + */ +static u64 i40iw_sc_decode_fpm_query(u64 *buf, + u32 buf_idx, + struct i40iw_hmc_obj_info *obj_info, + u32 rsrc_idx) +{ + u64 temp; + u32 size; + + get_64bit_val(buf, buf_idx, &temp); + obj_info[rsrc_idx].max_cnt = (u32)temp; + size = (u32)RS_64_1(temp, 32); + obj_info[rsrc_idx].size = LS_64_1(1, size); + + return temp; +} + +/** + * i40iw_sc_parse_fpm_query_buf() - parses fpm query buffer + * @buf: ptr to fpm query buffer + * @info: ptr to i40iw_hmc_obj_info struct + * @hmc_fpm_misc: ptr to fpm data + * + * parses fpm query buffer and copy max_cnt and + * size value of hmc objects in hmc_info + */ +static enum i40iw_status_code i40iw_sc_parse_fpm_query_buf( + u64 *buf, + struct i40iw_hmc_info *hmc_info, + struct i40iw_hmc_fpm_misc *hmc_fpm_misc) +{ + struct i40iw_hmc_obj_info *obj_info; + u64 temp; + u32 size; + u16 max_pe_sds; + + obj_info = hmc_info->hmc_obj; + + get_64bit_val(buf, 0, &temp); + hmc_info->first_sd_index = (u16)RS_64(temp, I40IW_QUERY_FPM_FIRST_PE_SD_INDEX); + max_pe_sds = (u16)RS_64(temp, I40IW_QUERY_FPM_MAX_PE_SDS); + + /* Reduce SD count for VFs by 1 to account for PBLE backing page rounding */ + if (hmc_info->hmc_fn_id >= I40IW_FIRST_VF_FPM_ID) + max_pe_sds--; + hmc_fpm_misc->max_sds = max_pe_sds; + hmc_info->sd_table.sd_cnt = max_pe_sds + hmc_info->first_sd_index; + + get_64bit_val(buf, 8, &temp); + obj_info[I40IW_HMC_IW_QP].max_cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_QPS); + size = (u32)RS_64_1(temp, 32); + obj_info[I40IW_HMC_IW_QP].size = LS_64_1(1, size); + + get_64bit_val(buf, 16, &temp); 
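+ /* offset 16 holds the maximum CQ count and the CQ object size */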
+ obj_info[I40IW_HMC_IW_CQ].max_cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_CQS); + size = (u32)RS_64_1(temp, 32); + obj_info[I40IW_HMC_IW_CQ].size = LS_64_1(1, size); + + i40iw_sc_decode_fpm_query(buf, 32, obj_info, I40IW_HMC_IW_HTE); + i40iw_sc_decode_fpm_query(buf, 40, obj_info, I40IW_HMC_IW_ARP); + + obj_info[I40IW_HMC_IW_APBVT_ENTRY].size = 8192; + obj_info[I40IW_HMC_IW_APBVT_ENTRY].max_cnt = 1; + + i40iw_sc_decode_fpm_query(buf, 48, obj_info, I40IW_HMC_IW_MR); + i40iw_sc_decode_fpm_query(buf, 56, obj_info, I40IW_HMC_IW_XF); + + get_64bit_val(buf, 64, &temp); + obj_info[I40IW_HMC_IW_XFFL].max_cnt = (u32)temp; + obj_info[I40IW_HMC_IW_XFFL].size = 4; + hmc_fpm_misc->xf_block_size = RS_64(temp, I40IW_QUERY_FPM_XFBLOCKSIZE); + if (!hmc_fpm_misc->xf_block_size) + return I40IW_ERR_INVALID_SIZE; + + i40iw_sc_decode_fpm_query(buf, 72, obj_info, I40IW_HMC_IW_Q1); + + get_64bit_val(buf, 80, &temp); + obj_info[I40IW_HMC_IW_Q1FL].max_cnt = (u32)temp; + obj_info[I40IW_HMC_IW_Q1FL].size = 4; + hmc_fpm_misc->q1_block_size = RS_64(temp, I40IW_QUERY_FPM_Q1BLOCKSIZE); + if (!hmc_fpm_misc->q1_block_size) + return I40IW_ERR_INVALID_SIZE; + + i40iw_sc_decode_fpm_query(buf, 88, obj_info, I40IW_HMC_IW_TIMER); + + get_64bit_val(buf, 112, &temp); + obj_info[I40IW_HMC_IW_PBLE].max_cnt = (u32)temp; + obj_info[I40IW_HMC_IW_PBLE].size = 8; + + get_64bit_val(buf, 120, &temp); + hmc_fpm_misc->max_ceqs = (u8)RS_64(temp, I40IW_QUERY_FPM_MAX_CEQS); + hmc_fpm_misc->ht_multiplier = RS_64(temp, I40IW_QUERY_FPM_HTMULTIPLIER); + hmc_fpm_misc->timer_bucket = RS_64(temp, I40IW_QUERY_FPM_TIMERBUCKET); + + return 0; +} + +/** + * i40iw_fill_qos_list - Change all unknown qs handles to available ones + * @qs_list: list of qs_handles to be fixed with valid qs_handles + */ +static void i40iw_fill_qos_list(u16 *qs_list) +{ + u16 qshandle = qs_list[0]; + int i; + + for (i = 0; i < I40IW_MAX_USER_PRIORITY; i++) { + if (qs_list[i] == QS_HANDLE_UNKNOWN) + qs_list[i] = qshandle; + else + qshandle = qs_list[i]; + } +} + +/** + * i40iw_qp_from_entry - Given entry, get to the qp structure + * @entry: Points to list of qp structure + */ +static struct i40iw_sc_qp *i40iw_qp_from_entry(struct list_head *entry) +{ + if (!entry) + return NULL; + + return (struct i40iw_sc_qp *)((char *)entry - offsetof(struct i40iw_sc_qp, list)); +} + +/** + * i40iw_get_qp - get the next qp from the list given current qp + * @head: Listhead of qp's + * @qp: current qp + */ +static struct i40iw_sc_qp *i40iw_get_qp(struct list_head *head, struct i40iw_sc_qp *qp) +{ + struct list_head *entry = NULL; + struct list_head *lastentry; + + if (list_empty(head)) + return NULL; + + if (!qp) { + entry = head->next; + } else { + lastentry = &qp->list; + entry = (lastentry != head) ? 
lastentry->next : NULL; + } + + return i40iw_qp_from_entry(entry); +} + +/** + * i40iw_change_l2params - given the new l2 parameters, change all qp + * @vsi: pointer to the vsi structure + * @l2params: New paramaters from l2 + */ +void i40iw_change_l2params(struct i40iw_sc_vsi *vsi, struct i40iw_l2params *l2params) +{ + struct i40iw_sc_dev *dev = vsi->dev; + struct i40iw_sc_qp *qp = NULL; + bool qs_handle_change = false; + unsigned long flags; + u16 qs_handle; + int i; + + if (vsi->mtu != l2params->mtu) { + vsi->mtu = l2params->mtu; + i40iw_reinitialize_ieq(dev); + } + + i40iw_fill_qos_list(l2params->qs_handle_list); + for (i = 0; i < I40IW_MAX_USER_PRIORITY; i++) { + qs_handle = l2params->qs_handle_list[i]; + if (vsi->qos[i].qs_handle != qs_handle) + qs_handle_change = true; + spin_lock_irqsave(&vsi->qos[i].lock, flags); + qp = i40iw_get_qp(&vsi->qos[i].qplist, qp); + while (qp) { + if (qs_handle_change) { + qp->qs_handle = qs_handle; + /* issue cqp suspend command */ + i40iw_qp_suspend_resume(dev, qp, true); + } + qp = i40iw_get_qp(&vsi->qos[i].qplist, qp); + } + spin_unlock_irqrestore(&vsi->qos[i].lock, flags); + vsi->qos[i].qs_handle = qs_handle; + } +} + +/** + * i40iw_qp_rem_qos - remove qp from qos lists during destroy qp + * @qp: qp to be removed from qos + */ +void i40iw_qp_rem_qos(struct i40iw_sc_qp *qp) +{ + struct i40iw_sc_vsi *vsi = qp->vsi; + unsigned long flags; + + if (!qp->on_qoslist) + return; + spin_lock_irqsave(&vsi->qos[qp->user_pri].lock, flags); + list_del(&qp->list); + spin_unlock_irqrestore(&vsi->qos[qp->user_pri].lock, flags); +} + +/** + * i40iw_qp_add_qos - called during setctx fot qp to be added to qos + * @qp: qp to be added to qos + */ +void i40iw_qp_add_qos(struct i40iw_sc_qp *qp) +{ + struct i40iw_sc_vsi *vsi = qp->vsi; + unsigned long flags; + + if (qp->on_qoslist) + return; + spin_lock_irqsave(&vsi->qos[qp->user_pri].lock, flags); + qp->qs_handle = vsi->qos[qp->user_pri].qs_handle; + list_add(&qp->list, &vsi->qos[qp->user_pri].qplist); + qp->on_qoslist = true; + spin_unlock_irqrestore(&vsi->qos[qp->user_pri].lock, flags); +} + +/** + * i40iw_sc_pd_init - initialize sc pd struct + * @dev: sc device struct + * @pd: sc pd ptr + * @pd_id: pd_id for allocated pd + * @abi_ver: ABI version from user context, -1 if not valid + */ +static void i40iw_sc_pd_init(struct i40iw_sc_dev *dev, + struct i40iw_sc_pd *pd, + u16 pd_id, + int abi_ver) +{ + pd->size = sizeof(*pd); + pd->pd_id = pd_id; + pd->abi_ver = abi_ver; + pd->dev = dev; +} + +/** + * i40iw_get_encoded_wqe_size - given wq size, returns hardware encoded size + * @wqsize: size of the wq (sq, rq, srq) to encoded_size + * @cqpsq: encoded size for sq for cqp as its encoded size is 1+ other wq's + */ +u8 i40iw_get_encoded_wqe_size(u32 wqsize, bool cqpsq) +{ + u8 encoded_size = 0; + + /* cqp sq's hw coded value starts from 1 for size of 4 + * while it starts from 0 for qp' wq's. + */ + if (cqpsq) + encoded_size = 1; + wqsize >>= 2; + while (wqsize >>= 1) + encoded_size++; + return encoded_size; +} + +/** + * i40iw_sc_cqp_init - Initialize buffers for a control Queue Pair + * @cqp: IWARP control queue pair pointer + * @info: IWARP control queue pair init info pointer + * + * Initializes the object and context buffers for a control Queue Pair. 
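+ *
+ * The SQ depth in @info->sq_size must be a power of two between
+ * I40IW_CQP_SW_SQSIZE_4 and I40IW_CQP_SW_SQSIZE_2048; it is stored in
+ * hardware-encoded form (see i40iw_get_encoded_wqe_size()) and written
+ * into the CQP host context by i40iw_sc_cqp_create().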
+ */ +static enum i40iw_status_code i40iw_sc_cqp_init(struct i40iw_sc_cqp *cqp, + struct i40iw_cqp_init_info *info) +{ + u8 hw_sq_size; + + if ((info->sq_size > I40IW_CQP_SW_SQSIZE_2048) || + (info->sq_size < I40IW_CQP_SW_SQSIZE_4) || + ((info->sq_size & (info->sq_size - 1)))) + return I40IW_ERR_INVALID_SIZE; + + hw_sq_size = i40iw_get_encoded_wqe_size(info->sq_size, true); + cqp->size = sizeof(*cqp); + cqp->sq_size = info->sq_size; + cqp->hw_sq_size = hw_sq_size; + cqp->sq_base = info->sq; + cqp->host_ctx = info->host_ctx; + cqp->sq_pa = info->sq_pa; + cqp->host_ctx_pa = info->host_ctx_pa; + cqp->dev = info->dev; + cqp->struct_ver = info->struct_ver; + cqp->scratch_array = info->scratch_array; + cqp->polarity = 0; + cqp->en_datacenter_tcp = info->en_datacenter_tcp; + cqp->enabled_vf_count = info->enabled_vf_count; + cqp->hmc_profile = info->hmc_profile; + info->dev->cqp = cqp; + + I40IW_RING_INIT(cqp->sq_ring, cqp->sq_size); + cqp->dev->cqp_cmd_stats[OP_REQUESTED_COMMANDS] = 0; + cqp->dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS] = 0; + INIT_LIST_HEAD(&cqp->dev->cqp_cmd_head); /* for the cqp commands backlog. */ + + i40iw_wr32(cqp->dev->hw, I40E_PFPE_CQPTAIL, 0); + i40iw_wr32(cqp->dev->hw, I40E_PFPE_CQPDB, 0); + + i40iw_debug(cqp->dev, I40IW_DEBUG_WQE, + "%s: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%llxh] cqp[%p] polarity[x%04X]\n", + __func__, cqp->sq_size, cqp->hw_sq_size, + cqp->sq_base, cqp->sq_pa, cqp, cqp->polarity); + return 0; +} + +/** + * i40iw_sc_cqp_create - create cqp during bringup + * @cqp: struct for cqp hw + * @maj_err: If error, major err number + * @min_err: If error, minor err number + */ +static enum i40iw_status_code i40iw_sc_cqp_create(struct i40iw_sc_cqp *cqp, + u16 *maj_err, + u16 *min_err) +{ + u64 temp; + u32 cnt = 0, p1, p2, val = 0, err_code; + enum i40iw_status_code ret_code; + + *maj_err = 0; + *min_err = 0; + + ret_code = i40iw_allocate_dma_mem(cqp->dev->hw, + &cqp->sdbuf, + I40IW_UPDATE_SD_BUF_SIZE * cqp->sq_size, + I40IW_SD_BUF_ALIGNMENT); + + if (ret_code) + goto exit; + + temp = LS_64(cqp->hw_sq_size, I40IW_CQPHC_SQSIZE) | + LS_64(cqp->struct_ver, I40IW_CQPHC_SVER); + + set_64bit_val(cqp->host_ctx, 0, temp); + set_64bit_val(cqp->host_ctx, 8, cqp->sq_pa); + temp = LS_64(cqp->enabled_vf_count, I40IW_CQPHC_ENABLED_VFS) | + LS_64(cqp->hmc_profile, I40IW_CQPHC_HMC_PROFILE); + set_64bit_val(cqp->host_ctx, 16, temp); + set_64bit_val(cqp->host_ctx, 24, (uintptr_t)cqp); + set_64bit_val(cqp->host_ctx, 32, 0); + set_64bit_val(cqp->host_ctx, 40, 0); + set_64bit_val(cqp->host_ctx, 48, 0); + set_64bit_val(cqp->host_ctx, 56, 0); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CQP_HOST_CTX", + cqp->host_ctx, I40IW_CQP_CTX_SIZE * 8); + + p1 = RS_32_1(cqp->host_ctx_pa, 32); + p2 = (u32)cqp->host_ctx_pa; + + if (cqp->dev->is_pf) { + i40iw_wr32(cqp->dev->hw, I40E_PFPE_CCQPHIGH, p1); + i40iw_wr32(cqp->dev->hw, I40E_PFPE_CCQPLOW, p2); + } else { + i40iw_wr32(cqp->dev->hw, I40E_VFPE_CCQPHIGH1, p1); + i40iw_wr32(cqp->dev->hw, I40E_VFPE_CCQPLOW1, p2); + } + do { + if (cnt++ > I40IW_DONE_COUNT) { + i40iw_free_dma_mem(cqp->dev->hw, &cqp->sdbuf); + ret_code = I40IW_ERR_TIMEOUT; + /* + * read PFPE_CQPERRORCODES register to get the minor + * and major error code + */ + if (cqp->dev->is_pf) + err_code = i40iw_rd32(cqp->dev->hw, I40E_PFPE_CQPERRCODES); + else + err_code = i40iw_rd32(cqp->dev->hw, I40E_VFPE_CQPERRCODES1); + *min_err = RS_32(err_code, I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE); + *maj_err = RS_32(err_code, I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE); + goto exit; + } + 
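+ /*
+  * Not timed out yet: back off briefly and re-read the CCQPSTATUS
+  * register; hardware sets it non-zero once the CQP has been created.
+  */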
udelay(I40IW_SLEEP_COUNT); + if (cqp->dev->is_pf) + val = i40iw_rd32(cqp->dev->hw, I40E_PFPE_CCQPSTATUS); + else + val = i40iw_rd32(cqp->dev->hw, I40E_VFPE_CCQPSTATUS1); + } while (!val); + +exit: + if (!ret_code) + cqp->process_cqp_sds = i40iw_update_sds_noccq; + return ret_code; +} + +/** + * i40iw_sc_cqp_post_sq - post of cqp's sq + * @cqp: struct for cqp hw + */ +void i40iw_sc_cqp_post_sq(struct i40iw_sc_cqp *cqp) +{ + if (cqp->dev->is_pf) + i40iw_wr32(cqp->dev->hw, I40E_PFPE_CQPDB, I40IW_RING_GETCURRENT_HEAD(cqp->sq_ring)); + else + i40iw_wr32(cqp->dev->hw, I40E_VFPE_CQPDB1, I40IW_RING_GETCURRENT_HEAD(cqp->sq_ring)); + + i40iw_debug(cqp->dev, + I40IW_DEBUG_WQE, + "%s: HEAD_TAIL[%04d,%04d,%04d]\n", + __func__, + cqp->sq_ring.head, + cqp->sq_ring.tail, + cqp->sq_ring.size); +} + +/** + * i40iw_sc_cqp_get_next_send_wqe_idx - get next WQE on CQP SQ and pass back the index + * @cqp: pointer to CQP structure + * @scratch: private data for CQP WQE + * @wqe_idx: WQE index for next WQE on CQP SQ + */ +static u64 *i40iw_sc_cqp_get_next_send_wqe_idx(struct i40iw_sc_cqp *cqp, + u64 scratch, u32 *wqe_idx) +{ + u64 *wqe = NULL; + enum i40iw_status_code ret_code; + + if (I40IW_RING_FULL_ERR(cqp->sq_ring)) { + i40iw_debug(cqp->dev, + I40IW_DEBUG_WQE, + "%s: ring is full head %x tail %x size %x\n", + __func__, + cqp->sq_ring.head, + cqp->sq_ring.tail, + cqp->sq_ring.size); + return NULL; + } + I40IW_ATOMIC_RING_MOVE_HEAD(cqp->sq_ring, *wqe_idx, ret_code); + cqp->dev->cqp_cmd_stats[OP_REQUESTED_COMMANDS]++; + if (ret_code) + return NULL; + if (!*wqe_idx) + cqp->polarity = !cqp->polarity; + + wqe = cqp->sq_base[*wqe_idx].elem; + cqp->scratch_array[*wqe_idx] = scratch; + I40IW_CQP_INIT_WQE(wqe); + + return wqe; +} + +/** + * i40iw_sc_cqp_get_next_send_wqe - get next wqe on cqp sq + * @cqp: struct for cqp hw + * @scratch: private data for CQP WQE + */ +u64 *i40iw_sc_cqp_get_next_send_wqe(struct i40iw_sc_cqp *cqp, u64 scratch) +{ + u32 wqe_idx; + + return i40iw_sc_cqp_get_next_send_wqe_idx(cqp, scratch, &wqe_idx); +} + +/** + * i40iw_sc_cqp_destroy - destroy cqp during close + * @cqp: struct for cqp hw + */ +static enum i40iw_status_code i40iw_sc_cqp_destroy(struct i40iw_sc_cqp *cqp) +{ + u32 cnt = 0, val = 1; + enum i40iw_status_code ret_code = 0; + u32 cqpstat_addr; + + if (cqp->dev->is_pf) { + i40iw_wr32(cqp->dev->hw, I40E_PFPE_CCQPHIGH, 0); + i40iw_wr32(cqp->dev->hw, I40E_PFPE_CCQPLOW, 0); + cqpstat_addr = I40E_PFPE_CCQPSTATUS; + } else { + i40iw_wr32(cqp->dev->hw, I40E_VFPE_CCQPHIGH1, 0); + i40iw_wr32(cqp->dev->hw, I40E_VFPE_CCQPLOW1, 0); + cqpstat_addr = I40E_VFPE_CCQPSTATUS1; + } + do { + if (cnt++ > I40IW_DONE_COUNT) { + ret_code = I40IW_ERR_TIMEOUT; + break; + } + udelay(I40IW_SLEEP_COUNT); + val = i40iw_rd32(cqp->dev->hw, cqpstat_addr); + } while (val); + + i40iw_free_dma_mem(cqp->dev->hw, &cqp->sdbuf); + return ret_code; +} + +/** + * i40iw_sc_ccq_arm - enable intr for control cq + * @ccq: ccq sc struct + */ +static void i40iw_sc_ccq_arm(struct i40iw_sc_cq *ccq) +{ + u64 temp_val; + u16 sw_cq_sel; + u8 arm_next_se; + u8 arm_seq_num; + + /* write to cq doorbell shadow area */ + /* arm next se should always be zero */ + get_64bit_val(ccq->cq_uk.shadow_area, 32, &temp_val); + + sw_cq_sel = (u16)RS_64(temp_val, I40IW_CQ_DBSA_SW_CQ_SELECT); + arm_next_se = (u8)RS_64(temp_val, I40IW_CQ_DBSA_ARM_NEXT_SE); + + arm_seq_num = (u8)RS_64(temp_val, I40IW_CQ_DBSA_ARM_SEQ_NUM); + arm_seq_num++; + + temp_val = LS_64(arm_seq_num, I40IW_CQ_DBSA_ARM_SEQ_NUM) | + LS_64(sw_cq_sel, I40IW_CQ_DBSA_SW_CQ_SELECT) | + 
LS_64(arm_next_se, I40IW_CQ_DBSA_ARM_NEXT_SE) | + LS_64(1, I40IW_CQ_DBSA_ARM_NEXT); + + set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val); + + wmb(); /* make sure shadow area is updated before arming */ + + if (ccq->dev->is_pf) + i40iw_wr32(ccq->dev->hw, I40E_PFPE_CQARM, ccq->cq_uk.cq_id); + else + i40iw_wr32(ccq->dev->hw, I40E_VFPE_CQARM1, ccq->cq_uk.cq_id); +} + +/** + * i40iw_sc_ccq_get_cqe_info - get ccq's cq entry + * @ccq: ccq sc struct + * @info: completion q entry to return + */ +static enum i40iw_status_code i40iw_sc_ccq_get_cqe_info( + struct i40iw_sc_cq *ccq, + struct i40iw_ccq_cqe_info *info) +{ + u64 qp_ctx, temp, temp1; + u64 *cqe; + struct i40iw_sc_cqp *cqp; + u32 wqe_idx; + u8 polarity; + enum i40iw_status_code ret_code = 0; + + if (ccq->cq_uk.avoid_mem_cflct) + cqe = (u64 *)I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(&ccq->cq_uk); + else + cqe = (u64 *)I40IW_GET_CURRENT_CQ_ELEMENT(&ccq->cq_uk); + + get_64bit_val(cqe, 24, &temp); + polarity = (u8)RS_64(temp, I40IW_CQ_VALID); + if (polarity != ccq->cq_uk.polarity) + return I40IW_ERR_QUEUE_EMPTY; + + get_64bit_val(cqe, 8, &qp_ctx); + cqp = (struct i40iw_sc_cqp *)(unsigned long)qp_ctx; + info->error = (bool)RS_64(temp, I40IW_CQ_ERROR); + info->min_err_code = (u16)RS_64(temp, I40IW_CQ_MINERR); + if (info->error) { + info->maj_err_code = (u16)RS_64(temp, I40IW_CQ_MAJERR); + info->min_err_code = (u16)RS_64(temp, I40IW_CQ_MINERR); + } + wqe_idx = (u32)RS_64(temp, I40IW_CQ_WQEIDX); + info->scratch = cqp->scratch_array[wqe_idx]; + + get_64bit_val(cqe, 16, &temp1); + info->op_ret_val = (u32)RS_64(temp1, I40IW_CCQ_OPRETVAL); + get_64bit_val(cqp->sq_base[wqe_idx].elem, 24, &temp1); + info->op_code = (u8)RS_64(temp1, I40IW_CQPSQ_OPCODE); + info->cqp = cqp; + + /* move the head for cq */ + I40IW_RING_MOVE_HEAD(ccq->cq_uk.cq_ring, ret_code); + if (I40IW_RING_GETCURRENT_HEAD(ccq->cq_uk.cq_ring) == 0) + ccq->cq_uk.polarity ^= 1; + + /* update cq tail in cq shadow memory also */ + I40IW_RING_MOVE_TAIL(ccq->cq_uk.cq_ring); + set_64bit_val(ccq->cq_uk.shadow_area, + 0, + I40IW_RING_GETCURRENT_HEAD(ccq->cq_uk.cq_ring)); + wmb(); /* write shadow area before tail */ + I40IW_RING_MOVE_TAIL(cqp->sq_ring); + ccq->dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS]++; + + return ret_code; +} + +/** + * i40iw_sc_poll_for_cqp_op_done - Waits for last write to complete in CQP SQ + * @cqp: struct for cqp hw + * @op_code: cqp opcode for completion + * @info: completion q entry to return + */ +static enum i40iw_status_code i40iw_sc_poll_for_cqp_op_done( + struct i40iw_sc_cqp *cqp, + u8 op_code, + struct i40iw_ccq_cqe_info *compl_info) +{ + struct i40iw_ccq_cqe_info info; + struct i40iw_sc_cq *ccq; + enum i40iw_status_code ret_code = 0; + u32 cnt = 0; + + memset(&info, 0, sizeof(info)); + ccq = cqp->dev->ccq; + while (1) { + if (cnt++ > I40IW_DONE_COUNT) + return I40IW_ERR_TIMEOUT; + + if (i40iw_sc_ccq_get_cqe_info(ccq, &info)) { + udelay(I40IW_SLEEP_COUNT); + continue; + } + + if (info.error) { + ret_code = I40IW_ERR_CQP_COMPL_ERROR; + break; + } + /* check if opcode is cq create */ + if (op_code != info.op_code) { + i40iw_debug(cqp->dev, I40IW_DEBUG_WQE, + "%s: opcode mismatch for my op code 0x%x, returned opcode %x\n", + __func__, op_code, info.op_code); + } + /* success, exit out of the loop */ + if (op_code == info.op_code) + break; + } + + if (compl_info) + memcpy(compl_info, &info, sizeof(*compl_info)); + + return ret_code; +} + +/** + * i40iw_sc_manage_push_page - Handle push page + * @cqp: struct for cqp hw + * @info: push page info + * @scratch: u64 saved to be 
used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_manage_push_page( + struct i40iw_sc_cqp *cqp, + struct i40iw_cqp_manage_push_page_info *info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + u64 header; + + if (info->push_idx >= I40IW_MAX_PUSH_PAGE_COUNT) + return I40IW_ERR_INVALID_PUSH_PAGE_INDEX; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + set_64bit_val(wqe, 16, info->qs_handle); + + header = LS_64(info->push_idx, I40IW_CQPSQ_MPP_PPIDX) | + LS_64(I40IW_CQP_OP_MANAGE_PUSH_PAGES, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID) | + LS_64(info->free_page, I40IW_CQPSQ_MPP_FREE_PAGE); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE_PUSH_PAGES WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_manage_hmc_pm_func_table - manage of function table + * @cqp: struct for cqp hw + * @scratch: u64 saved to be used during cqp completion + * @vf_index: vf index for cqp + * @free_pm_fcn: function number + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_manage_hmc_pm_func_table( + struct i40iw_sc_cqp *cqp, + u64 scratch, + u8 vf_index, + bool free_pm_fcn, + bool post_sq) +{ + u64 *wqe; + u64 header; + + if (vf_index >= I40IW_MAX_VF_PER_PF) + return I40IW_ERR_INVALID_VF_ID; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + header = LS_64(vf_index, I40IW_CQPSQ_MHMC_VFIDX) | + LS_64(I40IW_CQP_OP_MANAGE_HMC_PM_FUNC_TABLE, I40IW_CQPSQ_OPCODE) | + LS_64(free_pm_fcn, I40IW_CQPSQ_MHMC_FREEPMFN) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE_HMC_PM_FUNC_TABLE WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_set_hmc_resource_profile - cqp wqe for hmc profile + * @cqp: struct for cqp hw + * @scratch: u64 saved to be used during cqp completion + * @hmc_profile_type: type of profile to set + * @vf_num: vf number for profile + * @post_sq: flag for cqp db to ring + * @poll_registers: flag to poll register for cqp completion + */ +static enum i40iw_status_code i40iw_sc_set_hmc_resource_profile( + struct i40iw_sc_cqp *cqp, + u64 scratch, + u8 hmc_profile_type, + u8 vf_num, bool post_sq, + bool poll_registers) +{ + u64 *wqe; + u64 header; + u32 val, tail, error; + enum i40iw_status_code ret_code = 0; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + set_64bit_val(wqe, 16, + (LS_64(hmc_profile_type, I40IW_CQPSQ_SHMCRP_HMC_PROFILE) | + LS_64(vf_num, I40IW_CQPSQ_SHMCRP_VFNUM))); + + header = LS_64(I40IW_CQP_OP_SET_HMC_RESOURCE_PROFILE, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE_HMC_PM_FUNC_TABLE WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + i40iw_get_cqp_reg_info(cqp, &val, &tail, &error); + if (error) + return I40IW_ERR_CQP_COMPL_ERROR; + + if (post_sq) { + i40iw_sc_cqp_post_sq(cqp); + if (poll_registers) + ret_code = i40iw_cqp_poll_registers(cqp, tail, 1000000); + else + ret_code = i40iw_sc_poll_for_cqp_op_done(cqp, + I40IW_CQP_OP_SHMC_PAGES_ALLOCATED, + NULL); + } + + return ret_code; +} + +/** + * i40iw_sc_manage_hmc_pm_func_table_done - wait for cqp wqe completion for 
function table + * @cqp: struct for cqp hw + */ +static enum i40iw_status_code i40iw_sc_manage_hmc_pm_func_table_done(struct i40iw_sc_cqp *cqp) +{ + return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_MANAGE_HMC_PM_FUNC_TABLE, NULL); +} + +/** + * i40iw_sc_commit_fpm_values_done - wait for cqp eqe completion for fpm commit + * @cqp: struct for cqp hw + */ +static enum i40iw_status_code i40iw_sc_commit_fpm_values_done(struct i40iw_sc_cqp *cqp) +{ + return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_COMMIT_FPM_VALUES, NULL); +} + +/** + * i40iw_sc_commit_fpm_values - cqp wqe for commit fpm values + * @cqp: struct for cqp hw + * @scratch: u64 saved to be used during cqp completion + * @hmc_fn_id: hmc function id + * @commit_fpm_mem; Memory for fpm values + * @post_sq: flag for cqp db to ring + * @wait_type: poll ccq or cqp registers for cqp completion + */ +static enum i40iw_status_code i40iw_sc_commit_fpm_values( + struct i40iw_sc_cqp *cqp, + u64 scratch, + u8 hmc_fn_id, + struct i40iw_dma_mem *commit_fpm_mem, + bool post_sq, + u8 wait_type) +{ + u64 *wqe; + u64 header; + u32 tail, val, error; + enum i40iw_status_code ret_code = 0; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + set_64bit_val(wqe, 16, hmc_fn_id); + set_64bit_val(wqe, 32, commit_fpm_mem->pa); + + header = LS_64(I40IW_CQP_OP_COMMIT_FPM_VALUES, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "COMMIT_FPM_VALUES WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + i40iw_get_cqp_reg_info(cqp, &val, &tail, &error); + if (error) + return I40IW_ERR_CQP_COMPL_ERROR; + + if (post_sq) { + i40iw_sc_cqp_post_sq(cqp); + + if (wait_type == I40IW_CQP_WAIT_POLL_REGS) + ret_code = i40iw_cqp_poll_registers(cqp, tail, I40IW_DONE_COUNT); + else if (wait_type == I40IW_CQP_WAIT_POLL_CQ) + ret_code = i40iw_sc_commit_fpm_values_done(cqp); + } + + return ret_code; +} + +/** + * i40iw_sc_query_fpm_values_done - poll for cqp wqe completion for query fpm + * @cqp: struct for cqp hw + */ +static enum i40iw_status_code i40iw_sc_query_fpm_values_done(struct i40iw_sc_cqp *cqp) +{ + return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_QUERY_FPM_VALUES, NULL); +} + +/** + * i40iw_sc_query_fpm_values - cqp wqe query fpm values + * @cqp: struct for cqp hw + * @scratch: u64 saved to be used during cqp completion + * @hmc_fn_id: hmc function id + * @query_fpm_mem: memory for return fpm values + * @post_sq: flag for cqp db to ring + * @wait_type: poll ccq or cqp registers for cqp completion + */ +static enum i40iw_status_code i40iw_sc_query_fpm_values( + struct i40iw_sc_cqp *cqp, + u64 scratch, + u8 hmc_fn_id, + struct i40iw_dma_mem *query_fpm_mem, + bool post_sq, + u8 wait_type) +{ + u64 *wqe; + u64 header; + u32 tail, val, error; + enum i40iw_status_code ret_code = 0; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + set_64bit_val(wqe, 16, hmc_fn_id); + set_64bit_val(wqe, 32, query_fpm_mem->pa); + + header = LS_64(I40IW_CQP_OP_QUERY_FPM_VALUES, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QUERY_FPM WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + /* read the tail from CQP_TAIL register */ + i40iw_get_cqp_reg_info(cqp, &val, &tail, &error); + + if (error) + return I40IW_ERR_CQP_COMPL_ERROR; + + if (post_sq) { + i40iw_sc_cqp_post_sq(cqp); + if (wait_type == 
I40IW_CQP_WAIT_POLL_REGS) + ret_code = i40iw_cqp_poll_registers(cqp, tail, I40IW_DONE_COUNT); + else if (wait_type == I40IW_CQP_WAIT_POLL_CQ) + ret_code = i40iw_sc_query_fpm_values_done(cqp); + } + + return ret_code; +} + +/** + * i40iw_sc_add_arp_cache_entry - cqp wqe add arp cache entry + * @cqp: struct for cqp hw + * @info: arp entry information + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_add_arp_cache_entry( + struct i40iw_sc_cqp *cqp, + struct i40iw_add_arp_cache_entry_info *info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + u64 temp, header; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 8, info->reach_max); + + temp = info->mac_addr[5] | + LS_64_1(info->mac_addr[4], 8) | + LS_64_1(info->mac_addr[3], 16) | + LS_64_1(info->mac_addr[2], 24) | + LS_64_1(info->mac_addr[1], 32) | + LS_64_1(info->mac_addr[0], 40); + + set_64bit_val(wqe, 16, temp); + + header = info->arp_index | + LS_64(I40IW_CQP_OP_MANAGE_ARP, I40IW_CQPSQ_OPCODE) | + LS_64((info->permanent ? 1 : 0), I40IW_CQPSQ_MAT_PERMANENT) | + LS_64(1, I40IW_CQPSQ_MAT_ENTRYVALID) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "ARP_CACHE_ENTRY WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_del_arp_cache_entry - dele arp cache entry + * @cqp: struct for cqp hw + * @scratch: u64 saved to be used during cqp completion + * @arp_index: arp index to delete arp entry + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_del_arp_cache_entry( + struct i40iw_sc_cqp *cqp, + u64 scratch, + u16 arp_index, + bool post_sq) +{ + u64 *wqe; + u64 header; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + header = arp_index | + LS_64(I40IW_CQP_OP_MANAGE_ARP, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "ARP_CACHE_DEL_ENTRY WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_query_arp_cache_entry - cqp wqe to query arp and arp index + * @cqp: struct for cqp hw + * @scratch: u64 saved to be used during cqp completion + * @arp_index: arp index to delete arp entry + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_query_arp_cache_entry( + struct i40iw_sc_cqp *cqp, + u64 scratch, + u16 arp_index, + bool post_sq) +{ + u64 *wqe; + u64 header; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + header = arp_index | + LS_64(I40IW_CQP_OP_MANAGE_ARP, I40IW_CQPSQ_OPCODE) | + LS_64(1, I40IW_CQPSQ_MAT_QUERY) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QUERY_ARP_CACHE_ENTRY WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_manage_apbvt_entry - for adding and deleting apbvt entries + * @cqp: struct for cqp hw + * @info: info for apbvt entry to add or delete + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_manage_apbvt_entry( + struct i40iw_sc_cqp *cqp, + struct 
i40iw_apbvt_info *info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + u64 header; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + set_64bit_val(wqe, 16, info->port); + + header = LS_64(I40IW_CQP_OP_MANAGE_APBVT, I40IW_CQPSQ_OPCODE) | + LS_64(info->add, I40IW_CQPSQ_MAPT_ADDPORT) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE_APBVT WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_manage_qhash_table_entry - manage quad hash entries + * @cqp: struct for cqp hw + * @info: info for quad hash to manage + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + * + * This is called before connection establishment is started. For passive connections, when + * listener is created, it will call with entry type of I40IW_QHASH_TYPE_TCP_SYN with local + * ip address and tcp port. When SYN is received (passive connections) or + * sent (active connections), this routine is called with entry type of + * I40IW_QHASH_TYPE_TCP_ESTABLISHED and quad is passed in info. + * + * When iwarp connection is done and its state moves to RTS, the quad hash entry in + * the hardware will point to iwarp's qp number and requires no calls from the driver. + */ +static enum i40iw_status_code i40iw_sc_manage_qhash_table_entry( + struct i40iw_sc_cqp *cqp, + struct i40iw_qhash_table_info *info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + u64 qw1 = 0; + u64 qw2 = 0; + u64 temp; + struct i40iw_sc_vsi *vsi = info->vsi; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + temp = info->mac_addr[5] | + LS_64_1(info->mac_addr[4], 8) | + LS_64_1(info->mac_addr[3], 16) | + LS_64_1(info->mac_addr[2], 24) | + LS_64_1(info->mac_addr[1], 32) | + LS_64_1(info->mac_addr[0], 40); + + set_64bit_val(wqe, 0, temp); + + qw1 = LS_64(info->qp_num, I40IW_CQPSQ_QHASH_QPN) | + LS_64(info->dest_port, I40IW_CQPSQ_QHASH_DEST_PORT); + if (info->ipv4_valid) { + set_64bit_val(wqe, + 48, + LS_64(info->dest_ip[0], I40IW_CQPSQ_QHASH_ADDR3)); + } else { + set_64bit_val(wqe, + 56, + LS_64(info->dest_ip[0], I40IW_CQPSQ_QHASH_ADDR0) | + LS_64(info->dest_ip[1], I40IW_CQPSQ_QHASH_ADDR1)); + + set_64bit_val(wqe, + 48, + LS_64(info->dest_ip[2], I40IW_CQPSQ_QHASH_ADDR2) | + LS_64(info->dest_ip[3], I40IW_CQPSQ_QHASH_ADDR3)); + } + qw2 = LS_64(vsi->qos[info->user_pri].qs_handle, I40IW_CQPSQ_QHASH_QS_HANDLE); + if (info->vlan_valid) + qw2 |= LS_64(info->vlan_id, I40IW_CQPSQ_QHASH_VLANID); + set_64bit_val(wqe, 16, qw2); + if (info->entry_type == I40IW_QHASH_TYPE_TCP_ESTABLISHED) { + qw1 |= LS_64(info->src_port, I40IW_CQPSQ_QHASH_SRC_PORT); + if (!info->ipv4_valid) { + set_64bit_val(wqe, + 40, + LS_64(info->src_ip[0], I40IW_CQPSQ_QHASH_ADDR0) | + LS_64(info->src_ip[1], I40IW_CQPSQ_QHASH_ADDR1)); + set_64bit_val(wqe, + 32, + LS_64(info->src_ip[2], I40IW_CQPSQ_QHASH_ADDR2) | + LS_64(info->src_ip[3], I40IW_CQPSQ_QHASH_ADDR3)); + } else { + set_64bit_val(wqe, + 32, + LS_64(info->src_ip[0], I40IW_CQPSQ_QHASH_ADDR3)); + } + } + + set_64bit_val(wqe, 8, qw1); + temp = LS_64(cqp->polarity, I40IW_CQPSQ_QHASH_WQEVALID) | + LS_64(I40IW_CQP_OP_MANAGE_QUAD_HASH_TABLE_ENTRY, I40IW_CQPSQ_QHASH_OPCODE) | + LS_64(info->manage, I40IW_CQPSQ_QHASH_MANAGE) | + LS_64(info->ipv4_valid, I40IW_CQPSQ_QHASH_IPV4VALID) | + LS_64(info->vlan_valid, I40IW_CQPSQ_QHASH_VLANVALID) | + 
LS_64(info->entry_type, I40IW_CQPSQ_QHASH_ENTRYTYPE); + + i40iw_insert_wqe_hdr(wqe, temp); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE_QHASH WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_alloc_local_mac_ipaddr_entry - cqp wqe for loc mac entry + * @cqp: struct for cqp hw + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_alloc_local_mac_ipaddr_entry( + struct i40iw_sc_cqp *cqp, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + u64 header; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + header = LS_64(I40IW_CQP_OP_ALLOCATE_LOC_MAC_IP_TABLE_ENTRY, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "ALLOCATE_LOCAL_MAC_IPADDR WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_add_local_mac_ipaddr_entry - add mac enry + * @cqp: struct for cqp hw + * @info:mac addr info + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_add_local_mac_ipaddr_entry( + struct i40iw_sc_cqp *cqp, + struct i40iw_local_mac_ipaddr_entry_info *info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + u64 temp, header; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + temp = info->mac_addr[5] | + LS_64_1(info->mac_addr[4], 8) | + LS_64_1(info->mac_addr[3], 16) | + LS_64_1(info->mac_addr[2], 24) | + LS_64_1(info->mac_addr[1], 32) | + LS_64_1(info->mac_addr[0], 40); + + set_64bit_val(wqe, 32, temp); + + header = LS_64(info->entry_idx, I40IW_CQPSQ_MLIPA_IPTABLEIDX) | + LS_64(I40IW_CQP_OP_MANAGE_LOC_MAC_IP_TABLE, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "ADD_LOCAL_MAC_IPADDR WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_del_local_mac_ipaddr_entry - cqp wqe to dele local mac + * @cqp: struct for cqp hw + * @scratch: u64 saved to be used during cqp completion + * @entry_idx: index of mac entry + * @ ignore_ref_count: to force mac adde delete + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_del_local_mac_ipaddr_entry( + struct i40iw_sc_cqp *cqp, + u64 scratch, + u8 entry_idx, + u8 ignore_ref_count, + bool post_sq) +{ + u64 *wqe; + u64 header; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + header = LS_64(entry_idx, I40IW_CQPSQ_MLIPA_IPTABLEIDX) | + LS_64(I40IW_CQP_OP_MANAGE_LOC_MAC_IP_TABLE, I40IW_CQPSQ_OPCODE) | + LS_64(1, I40IW_CQPSQ_MLIPA_FREEENTRY) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID) | + LS_64(ignore_ref_count, I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "DEL_LOCAL_MAC_IPADDR WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_cqp_nop - send a nop wqe + * @cqp: struct for cqp hw + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_cqp_nop(struct i40iw_sc_cqp *cqp, + u64 scratch, + bool post_sq) +{ + u64 
*wqe; + u64 header; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + header = LS_64(I40IW_CQP_OP_NOP, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + i40iw_insert_wqe_hdr(wqe, header); + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "NOP WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_ceq_init - initialize ceq + * @ceq: ceq sc structure + * @info: ceq initialization info + */ +static enum i40iw_status_code i40iw_sc_ceq_init(struct i40iw_sc_ceq *ceq, + struct i40iw_ceq_init_info *info) +{ + u32 pble_obj_cnt; + + if ((info->elem_cnt < I40IW_MIN_CEQ_ENTRIES) || + (info->elem_cnt > I40IW_MAX_CEQ_ENTRIES)) + return I40IW_ERR_INVALID_SIZE; + + if (info->ceq_id >= I40IW_MAX_CEQID) + return I40IW_ERR_INVALID_CEQ_ID; + + pble_obj_cnt = info->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt; + + if (info->virtual_map && (info->first_pm_pbl_idx >= pble_obj_cnt)) + return I40IW_ERR_INVALID_PBLE_INDEX; + + ceq->size = sizeof(*ceq); + ceq->ceqe_base = (struct i40iw_ceqe *)info->ceqe_base; + ceq->ceq_id = info->ceq_id; + ceq->dev = info->dev; + ceq->elem_cnt = info->elem_cnt; + ceq->ceq_elem_pa = info->ceqe_pa; + ceq->virtual_map = info->virtual_map; + + ceq->pbl_chunk_size = (ceq->virtual_map ? info->pbl_chunk_size : 0); + ceq->first_pm_pbl_idx = (ceq->virtual_map ? info->first_pm_pbl_idx : 0); + ceq->pbl_list = (ceq->virtual_map ? info->pbl_list : NULL); + + ceq->tph_en = info->tph_en; + ceq->tph_val = info->tph_val; + ceq->polarity = 1; + I40IW_RING_INIT(ceq->ceq_ring, ceq->elem_cnt); + ceq->dev->ceq[info->ceq_id] = ceq; + + return 0; +} + +/** + * i40iw_sc_ceq_create - create ceq wqe + * @ceq: ceq sc structure + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_ceq_create(struct i40iw_sc_ceq *ceq, + u64 scratch, + bool post_sq) +{ + struct i40iw_sc_cqp *cqp; + u64 *wqe; + u64 header; + + cqp = ceq->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 16, ceq->elem_cnt); + set_64bit_val(wqe, 32, (ceq->virtual_map ? 0 : ceq->ceq_elem_pa)); + set_64bit_val(wqe, 48, (ceq->virtual_map ? 
ceq->first_pm_pbl_idx : 0)); + set_64bit_val(wqe, 56, LS_64(ceq->tph_val, I40IW_CQPSQ_TPHVAL)); + + header = ceq->ceq_id | + LS_64(I40IW_CQP_OP_CREATE_CEQ, I40IW_CQPSQ_OPCODE) | + LS_64(ceq->pbl_chunk_size, I40IW_CQPSQ_CEQ_LPBLSIZE) | + LS_64(ceq->virtual_map, I40IW_CQPSQ_CEQ_VMAP) | + LS_64(ceq->tph_en, I40IW_CQPSQ_TPHEN) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CEQ_CREATE WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_cceq_create_done - poll for control ceq wqe to complete + * @ceq: ceq sc structure + */ +static enum i40iw_status_code i40iw_sc_cceq_create_done(struct i40iw_sc_ceq *ceq) +{ + struct i40iw_sc_cqp *cqp; + + cqp = ceq->dev->cqp; + return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_CREATE_CEQ, NULL); +} + +/** + * i40iw_sc_cceq_destroy_done - poll for destroy cceq to complete + * @ceq: ceq sc structure + */ +static enum i40iw_status_code i40iw_sc_cceq_destroy_done(struct i40iw_sc_ceq *ceq) +{ + struct i40iw_sc_cqp *cqp; + + cqp = ceq->dev->cqp; + cqp->process_cqp_sds = i40iw_update_sds_noccq; + return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_DESTROY_CEQ, NULL); +} + +/** + * i40iw_sc_cceq_create - create cceq + * @ceq: ceq sc structure + * @scratch: u64 saved to be used during cqp completion + */ +static enum i40iw_status_code i40iw_sc_cceq_create(struct i40iw_sc_ceq *ceq, u64 scratch) +{ + enum i40iw_status_code ret_code; + + ret_code = i40iw_sc_ceq_create(ceq, scratch, true); + if (!ret_code) + ret_code = i40iw_sc_cceq_create_done(ceq); + return ret_code; +} + +/** + * i40iw_sc_ceq_destroy - destroy ceq + * @ceq: ceq sc structure + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_ceq_destroy(struct i40iw_sc_ceq *ceq, + u64 scratch, + bool post_sq) +{ + struct i40iw_sc_cqp *cqp; + u64 *wqe; + u64 header; + + cqp = ceq->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 16, ceq->elem_cnt); + set_64bit_val(wqe, 48, ceq->first_pm_pbl_idx); + header = ceq->ceq_id | + LS_64(I40IW_CQP_OP_DESTROY_CEQ, I40IW_CQPSQ_OPCODE) | + LS_64(ceq->pbl_chunk_size, I40IW_CQPSQ_CEQ_LPBLSIZE) | + LS_64(ceq->virtual_map, I40IW_CQPSQ_CEQ_VMAP) | + LS_64(ceq->tph_en, I40IW_CQPSQ_TPHEN) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + i40iw_insert_wqe_hdr(wqe, header); + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CEQ_DESTROY WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_process_ceq - process ceq + * @dev: sc device struct + * @ceq: ceq sc structure + */ +static void *i40iw_sc_process_ceq(struct i40iw_sc_dev *dev, struct i40iw_sc_ceq *ceq) +{ + u64 temp; + u64 *ceqe; + struct i40iw_sc_cq *cq = NULL; + u8 polarity; + + ceqe = (u64 *)I40IW_GET_CURRENT_CEQ_ELEMENT(ceq); + get_64bit_val(ceqe, 0, &temp); + polarity = (u8)RS_64(temp, I40IW_CEQE_VALID); + if (polarity != ceq->polarity) + return cq; + + cq = (struct i40iw_sc_cq *)(unsigned long)LS_64_1(temp, 1); + + I40IW_RING_MOVE_TAIL(ceq->ceq_ring); + if (I40IW_RING_GETCURRENT_TAIL(ceq->ceq_ring) == 0) + ceq->polarity ^= 1; + + if (dev->is_pf) + i40iw_wr32(dev->hw, I40E_PFPE_CQACK, cq->cq_uk.cq_id); + else + i40iw_wr32(dev->hw, I40E_VFPE_CQACK1, cq->cq_uk.cq_id); + + return cq; +} + +/** + * i40iw_sc_aeq_init - initialize aeq + * @aeq: aeq structure 
ptr + * @info: aeq initialization info + */ +static enum i40iw_status_code i40iw_sc_aeq_init(struct i40iw_sc_aeq *aeq, + struct i40iw_aeq_init_info *info) +{ + u32 pble_obj_cnt; + + if ((info->elem_cnt < I40IW_MIN_AEQ_ENTRIES) || + (info->elem_cnt > I40IW_MAX_AEQ_ENTRIES)) + return I40IW_ERR_INVALID_SIZE; + pble_obj_cnt = info->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt; + + if (info->virtual_map && (info->first_pm_pbl_idx >= pble_obj_cnt)) + return I40IW_ERR_INVALID_PBLE_INDEX; + + aeq->size = sizeof(*aeq); + aeq->polarity = 1; + aeq->aeqe_base = (struct i40iw_sc_aeqe *)info->aeqe_base; + aeq->dev = info->dev; + aeq->elem_cnt = info->elem_cnt; + + aeq->aeq_elem_pa = info->aeq_elem_pa; + I40IW_RING_INIT(aeq->aeq_ring, aeq->elem_cnt); + info->dev->aeq = aeq; + + aeq->virtual_map = info->virtual_map; + aeq->pbl_list = (aeq->virtual_map ? info->pbl_list : NULL); + aeq->pbl_chunk_size = (aeq->virtual_map ? info->pbl_chunk_size : 0); + aeq->first_pm_pbl_idx = (aeq->virtual_map ? info->first_pm_pbl_idx : 0); + info->dev->aeq = aeq; + return 0; +} + +/** + * i40iw_sc_aeq_create - create aeq + * @aeq: aeq structure ptr + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_aeq_create(struct i40iw_sc_aeq *aeq, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + + cqp = aeq->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 16, aeq->elem_cnt); + set_64bit_val(wqe, 32, + (aeq->virtual_map ? 0 : aeq->aeq_elem_pa)); + set_64bit_val(wqe, 48, + (aeq->virtual_map ? aeq->first_pm_pbl_idx : 0)); + + header = LS_64(I40IW_CQP_OP_CREATE_AEQ, I40IW_CQPSQ_OPCODE) | + LS_64(aeq->pbl_chunk_size, I40IW_CQPSQ_AEQ_LPBLSIZE) | + LS_64(aeq->virtual_map, I40IW_CQPSQ_AEQ_VMAP) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "AEQ_CREATE WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_aeq_destroy - destroy aeq during close + * @aeq: aeq structure ptr + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_aeq_destroy(struct i40iw_sc_aeq *aeq, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + + cqp = aeq->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 16, aeq->elem_cnt); + set_64bit_val(wqe, 48, aeq->first_pm_pbl_idx); + header = LS_64(I40IW_CQP_OP_DESTROY_AEQ, I40IW_CQPSQ_OPCODE) | + LS_64(aeq->pbl_chunk_size, I40IW_CQPSQ_AEQ_LPBLSIZE) | + LS_64(aeq->virtual_map, I40IW_CQPSQ_AEQ_VMAP) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "AEQ_DESTROY WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_get_next_aeqe - get next aeq entry + * @aeq: aeq structure ptr + * @info: aeqe info to be returned + */ +static enum i40iw_status_code i40iw_sc_get_next_aeqe(struct i40iw_sc_aeq *aeq, + struct i40iw_aeqe_info *info) +{ + u64 temp, compl_ctx; + u64 *aeqe; + u16 wqe_idx; + u8 ae_src; + u8 polarity; + + aeqe = (u64 *)I40IW_GET_CURRENT_AEQ_ELEMENT(aeq); + get_64bit_val(aeqe, 0, &compl_ctx); + get_64bit_val(aeqe, 8, &temp); + polarity = 
(u8)RS_64(temp, I40IW_AEQE_VALID); + + if (aeq->polarity != polarity) + return I40IW_ERR_QUEUE_EMPTY; + + i40iw_debug_buf(aeq->dev, I40IW_DEBUG_WQE, "AEQ_ENTRY", aeqe, 16); + + ae_src = (u8)RS_64(temp, I40IW_AEQE_AESRC); + wqe_idx = (u16)RS_64(temp, I40IW_AEQE_WQDESCIDX); + info->qp_cq_id = (u32)RS_64(temp, I40IW_AEQE_QPCQID); + info->ae_id = (u16)RS_64(temp, I40IW_AEQE_AECODE); + info->tcp_state = (u8)RS_64(temp, I40IW_AEQE_TCPSTATE); + info->iwarp_state = (u8)RS_64(temp, I40IW_AEQE_IWSTATE); + info->q2_data_written = (u8)RS_64(temp, I40IW_AEQE_Q2DATA); + info->aeqe_overflow = (bool)RS_64(temp, I40IW_AEQE_OVERFLOW); + + switch (info->ae_id) { + case I40IW_AE_PRIV_OPERATION_DENIED: + case I40IW_AE_UDA_XMIT_DGRAM_TOO_LONG: + case I40IW_AE_UDA_XMIT_DGRAM_TOO_SHORT: + case I40IW_AE_BAD_CLOSE: + case I40IW_AE_RDMAP_ROE_BAD_LLP_CLOSE: + case I40IW_AE_RDMA_READ_WHILE_ORD_ZERO: + case I40IW_AE_STAG_ZERO_INVALID: + case I40IW_AE_IB_RREQ_AND_Q1_FULL: + case I40IW_AE_WQE_UNEXPECTED_OPCODE: + case I40IW_AE_DDP_UBE_INVALID_DDP_VERSION: + case I40IW_AE_DDP_UBE_INVALID_MO: + case I40IW_AE_DDP_UBE_INVALID_QN: + case I40IW_AE_DDP_NO_L_BIT: + case I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION: + case I40IW_AE_RDMAP_ROE_UNEXPECTED_OPCODE: + case I40IW_AE_ROE_INVALID_RDMA_READ_REQUEST: + case I40IW_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: + case I40IW_AE_INVALID_ARP_ENTRY: + case I40IW_AE_INVALID_TCP_OPTION_RCVD: + case I40IW_AE_STALE_ARP_ENTRY: + case I40IW_AE_LLP_CLOSE_COMPLETE: + case I40IW_AE_LLP_CONNECTION_RESET: + case I40IW_AE_LLP_FIN_RECEIVED: + case I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR: + case I40IW_AE_LLP_SEGMENT_TOO_SMALL: + case I40IW_AE_LLP_SYN_RECEIVED: + case I40IW_AE_LLP_TERMINATE_RECEIVED: + case I40IW_AE_LLP_TOO_MANY_RETRIES: + case I40IW_AE_LLP_DOUBT_REACHABILITY: + case I40IW_AE_RESET_SENT: + case I40IW_AE_TERMINATE_SENT: + case I40IW_AE_RESET_NOT_SENT: + case I40IW_AE_LCE_QP_CATASTROPHIC: + case I40IW_AE_QP_SUSPEND_COMPLETE: + info->qp = true; + info->compl_ctx = compl_ctx; + ae_src = I40IW_AE_SOURCE_RSVD; + break; + case I40IW_AE_LCE_CQ_CATASTROPHIC: + info->cq = true; + info->compl_ctx = LS_64_1(compl_ctx, 1); + ae_src = I40IW_AE_SOURCE_RSVD; + break; + } + + switch (ae_src) { + case I40IW_AE_SOURCE_RQ: + case I40IW_AE_SOURCE_RQ_0011: + info->qp = true; + info->wqe_idx = wqe_idx; + info->compl_ctx = compl_ctx; + break; + case I40IW_AE_SOURCE_CQ: + case I40IW_AE_SOURCE_CQ_0110: + case I40IW_AE_SOURCE_CQ_1010: + case I40IW_AE_SOURCE_CQ_1110: + info->cq = true; + info->compl_ctx = LS_64_1(compl_ctx, 1); + break; + case I40IW_AE_SOURCE_SQ: + case I40IW_AE_SOURCE_SQ_0111: + info->qp = true; + info->sq = true; + info->wqe_idx = wqe_idx; + info->compl_ctx = compl_ctx; + break; + case I40IW_AE_SOURCE_IN_RR_WR: + case I40IW_AE_SOURCE_IN_RR_WR_1011: + info->qp = true; + info->compl_ctx = compl_ctx; + info->in_rdrsp_wr = true; + break; + case I40IW_AE_SOURCE_OUT_RR: + case I40IW_AE_SOURCE_OUT_RR_1111: + info->qp = true; + info->compl_ctx = compl_ctx; + info->out_rdrsp = true; + break; + case I40IW_AE_SOURCE_RSVD: + /* fallthrough */ + default: + break; + } + I40IW_RING_MOVE_TAIL(aeq->aeq_ring); + if (I40IW_RING_GETCURRENT_TAIL(aeq->aeq_ring) == 0) + aeq->polarity ^= 1; + return 0; +} + +/** + * i40iw_sc_repost_aeq_entries - repost completed aeq entries + * @dev: sc device struct + * @count: allocate count + */ +static enum i40iw_status_code i40iw_sc_repost_aeq_entries(struct i40iw_sc_dev *dev, + u32 count) +{ + + if (dev->is_pf) + i40iw_wr32(dev->hw, I40E_PFPE_AEQALLOC, count); + else + 
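+ /* VF reposts through its own AEQ allocate register */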
i40iw_wr32(dev->hw, I40E_VFPE_AEQALLOC1, count); + + return 0; +} + +/** + * i40iw_sc_aeq_create_done - create aeq + * @aeq: aeq structure ptr + */ +static enum i40iw_status_code i40iw_sc_aeq_create_done(struct i40iw_sc_aeq *aeq) +{ + struct i40iw_sc_cqp *cqp; + + cqp = aeq->dev->cqp; + return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_CREATE_AEQ, NULL); +} + +/** + * i40iw_sc_aeq_destroy_done - destroy of aeq during close + * @aeq: aeq structure ptr + */ +static enum i40iw_status_code i40iw_sc_aeq_destroy_done(struct i40iw_sc_aeq *aeq) +{ + struct i40iw_sc_cqp *cqp; + + cqp = aeq->dev->cqp; + return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_DESTROY_AEQ, NULL); +} + +/** + * i40iw_sc_ccq_init - initialize control cq + * @cq: sc's cq ctruct + * @info: info for control cq initialization + */ +static enum i40iw_status_code i40iw_sc_ccq_init(struct i40iw_sc_cq *cq, + struct i40iw_ccq_init_info *info) +{ + u32 pble_obj_cnt; + + if (info->num_elem < I40IW_MIN_CQ_SIZE || info->num_elem > I40IW_MAX_CQ_SIZE) + return I40IW_ERR_INVALID_SIZE; + + if (info->ceq_id > I40IW_MAX_CEQID) + return I40IW_ERR_INVALID_CEQ_ID; + + pble_obj_cnt = info->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt; + + if (info->virtual_map && (info->first_pm_pbl_idx >= pble_obj_cnt)) + return I40IW_ERR_INVALID_PBLE_INDEX; + + cq->cq_pa = info->cq_pa; + cq->cq_uk.cq_base = info->cq_base; + cq->shadow_area_pa = info->shadow_area_pa; + cq->cq_uk.shadow_area = info->shadow_area; + cq->shadow_read_threshold = info->shadow_read_threshold; + cq->dev = info->dev; + cq->ceq_id = info->ceq_id; + cq->cq_uk.cq_size = info->num_elem; + cq->cq_type = I40IW_CQ_TYPE_CQP; + cq->ceqe_mask = info->ceqe_mask; + I40IW_RING_INIT(cq->cq_uk.cq_ring, info->num_elem); + + cq->cq_uk.cq_id = 0; /* control cq is id 0 always */ + cq->ceq_id_valid = info->ceq_id_valid; + cq->tph_en = info->tph_en; + cq->tph_val = info->tph_val; + cq->cq_uk.avoid_mem_cflct = info->avoid_mem_cflct; + + cq->pbl_list = info->pbl_list; + cq->virtual_map = info->virtual_map; + cq->pbl_chunk_size = info->pbl_chunk_size; + cq->first_pm_pbl_idx = info->first_pm_pbl_idx; + cq->cq_uk.polarity = true; + + /* following are only for iw cqs so initialize them to zero */ + cq->cq_uk.cqe_alloc_reg = NULL; + info->dev->ccq = cq; + return 0; +} + +/** + * i40iw_sc_ccq_create_done - poll cqp for ccq create + * @ccq: ccq sc struct + */ +static enum i40iw_status_code i40iw_sc_ccq_create_done(struct i40iw_sc_cq *ccq) +{ + struct i40iw_sc_cqp *cqp; + + cqp = ccq->dev->cqp; + return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_CREATE_CQ, NULL); +} + +/** + * i40iw_sc_ccq_create - create control cq + * @ccq: ccq sc struct + * @scratch: u64 saved to be used during cqp completion + * @check_overflow: overlow flag for ccq + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_ccq_create(struct i40iw_sc_cq *ccq, + u64 scratch, + bool check_overflow, + bool post_sq) +{ + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + enum i40iw_status_code ret_code; + + cqp = ccq->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 0, ccq->cq_uk.cq_size); + set_64bit_val(wqe, 8, RS_64_1(ccq, 1)); + set_64bit_val(wqe, 16, + LS_64(ccq->shadow_read_threshold, I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD)); + set_64bit_val(wqe, 32, (ccq->virtual_map ? 0 : ccq->cq_pa)); + set_64bit_val(wqe, 40, ccq->shadow_area_pa); + set_64bit_val(wqe, 48, + (ccq->virtual_map ? 
ccq->first_pm_pbl_idx : 0)); + set_64bit_val(wqe, 56, + LS_64(ccq->tph_val, I40IW_CQPSQ_TPHVAL)); + + header = ccq->cq_uk.cq_id | + LS_64((ccq->ceq_id_valid ? ccq->ceq_id : 0), I40IW_CQPSQ_CQ_CEQID) | + LS_64(I40IW_CQP_OP_CREATE_CQ, I40IW_CQPSQ_OPCODE) | + LS_64(ccq->pbl_chunk_size, I40IW_CQPSQ_CQ_LPBLSIZE) | + LS_64(check_overflow, I40IW_CQPSQ_CQ_CHKOVERFLOW) | + LS_64(ccq->virtual_map, I40IW_CQPSQ_CQ_VIRTMAP) | + LS_64(ccq->ceqe_mask, I40IW_CQPSQ_CQ_ENCEQEMASK) | + LS_64(ccq->ceq_id_valid, I40IW_CQPSQ_CQ_CEQIDVALID) | + LS_64(ccq->tph_en, I40IW_CQPSQ_TPHEN) | + LS_64(ccq->cq_uk.avoid_mem_cflct, I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CCQ_CREATE WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) { + i40iw_sc_cqp_post_sq(cqp); + ret_code = i40iw_sc_ccq_create_done(ccq); + if (ret_code) + return ret_code; + } + cqp->process_cqp_sds = i40iw_cqp_sds_cmd; + + return 0; +} + +/** + * i40iw_sc_ccq_destroy - destroy ccq during close + * @ccq: ccq sc struct + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_ccq_destroy(struct i40iw_sc_cq *ccq, + u64 scratch, + bool post_sq) +{ + struct i40iw_sc_cqp *cqp; + u64 *wqe; + u64 header; + enum i40iw_status_code ret_code = 0; + u32 tail, val, error; + + cqp = ccq->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 0, ccq->cq_uk.cq_size); + set_64bit_val(wqe, 8, RS_64_1(ccq, 1)); + set_64bit_val(wqe, 40, ccq->shadow_area_pa); + + header = ccq->cq_uk.cq_id | + LS_64((ccq->ceq_id_valid ? ccq->ceq_id : 0), I40IW_CQPSQ_CQ_CEQID) | + LS_64(I40IW_CQP_OP_DESTROY_CQ, I40IW_CQPSQ_OPCODE) | + LS_64(ccq->ceqe_mask, I40IW_CQPSQ_CQ_ENCEQEMASK) | + LS_64(ccq->ceq_id_valid, I40IW_CQPSQ_CQ_CEQIDVALID) | + LS_64(ccq->tph_en, I40IW_CQPSQ_TPHEN) | + LS_64(ccq->cq_uk.avoid_mem_cflct, I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CCQ_DESTROY WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + i40iw_get_cqp_reg_info(cqp, &val, &tail, &error); + if (error) + return I40IW_ERR_CQP_COMPL_ERROR; + + if (post_sq) { + i40iw_sc_cqp_post_sq(cqp); + ret_code = i40iw_cqp_poll_registers(cqp, tail, 1000); + } + + cqp->process_cqp_sds = i40iw_update_sds_noccq; + + return ret_code; +} + +/** + * i40iw_sc_cq_init - initialize completion q + * @cq: cq struct + * @info: cq initialization info + */ +static enum i40iw_status_code i40iw_sc_cq_init(struct i40iw_sc_cq *cq, + struct i40iw_cq_init_info *info) +{ + u32 __iomem *cqe_alloc_reg = NULL; + enum i40iw_status_code ret_code; + u32 pble_obj_cnt; + u32 arm_offset; + + pble_obj_cnt = info->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt; + + if (info->virtual_map && (info->first_pm_pbl_idx >= pble_obj_cnt)) + return I40IW_ERR_INVALID_PBLE_INDEX; + + cq->cq_pa = info->cq_base_pa; + cq->dev = info->dev; + cq->ceq_id = info->ceq_id; + arm_offset = (info->dev->is_pf) ? 
I40E_PFPE_CQARM : I40E_VFPE_CQARM1; + if (i40iw_get_hw_addr(cq->dev)) + cqe_alloc_reg = (u32 __iomem *)(i40iw_get_hw_addr(cq->dev) + + arm_offset); + info->cq_uk_init_info.cqe_alloc_reg = cqe_alloc_reg; + ret_code = i40iw_cq_uk_init(&cq->cq_uk, &info->cq_uk_init_info); + if (ret_code) + return ret_code; + cq->virtual_map = info->virtual_map; + cq->pbl_chunk_size = info->pbl_chunk_size; + cq->ceqe_mask = info->ceqe_mask; + cq->cq_type = (info->type) ? info->type : I40IW_CQ_TYPE_IWARP; + + cq->shadow_area_pa = info->shadow_area_pa; + cq->shadow_read_threshold = info->shadow_read_threshold; + + cq->ceq_id_valid = info->ceq_id_valid; + cq->tph_en = info->tph_en; + cq->tph_val = info->tph_val; + + cq->first_pm_pbl_idx = info->first_pm_pbl_idx; + + return 0; +} + +/** + * i40iw_sc_cq_create - create completion q + * @cq: cq struct + * @scratch: u64 saved to be used during cqp completion + * @check_overflow: flag for overflow check + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_cq_create(struct i40iw_sc_cq *cq, + u64 scratch, + bool check_overflow, + bool post_sq) +{ + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + + if (cq->cq_uk.cq_id > I40IW_MAX_CQID) + return I40IW_ERR_INVALID_CQ_ID; + + if (cq->ceq_id > I40IW_MAX_CEQID) + return I40IW_ERR_INVALID_CEQ_ID; + + cqp = cq->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + set_64bit_val(wqe, 0, cq->cq_uk.cq_size); + set_64bit_val(wqe, 8, RS_64_1(cq, 1)); + set_64bit_val(wqe, + 16, + LS_64(cq->shadow_read_threshold, I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD)); + + set_64bit_val(wqe, 32, (cq->virtual_map ? 0 : cq->cq_pa)); + + set_64bit_val(wqe, 40, cq->shadow_area_pa); + set_64bit_val(wqe, 48, (cq->virtual_map ? cq->first_pm_pbl_idx : 0)); + set_64bit_val(wqe, 56, LS_64(cq->tph_val, I40IW_CQPSQ_TPHVAL)); + + header = cq->cq_uk.cq_id | + LS_64((cq->ceq_id_valid ? cq->ceq_id : 0), I40IW_CQPSQ_CQ_CEQID) | + LS_64(I40IW_CQP_OP_CREATE_CQ, I40IW_CQPSQ_OPCODE) | + LS_64(cq->pbl_chunk_size, I40IW_CQPSQ_CQ_LPBLSIZE) | + LS_64(check_overflow, I40IW_CQPSQ_CQ_CHKOVERFLOW) | + LS_64(cq->virtual_map, I40IW_CQPSQ_CQ_VIRTMAP) | + LS_64(cq->ceqe_mask, I40IW_CQPSQ_CQ_ENCEQEMASK) | + LS_64(cq->ceq_id_valid, I40IW_CQPSQ_CQ_CEQIDVALID) | + LS_64(cq->tph_en, I40IW_CQPSQ_TPHEN) | + LS_64(cq->cq_uk.avoid_mem_cflct, I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CQ_CREATE WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_cq_destroy - destroy completion q + * @cq: cq struct + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_cq_destroy(struct i40iw_sc_cq *cq, + u64 scratch, + bool post_sq) +{ + struct i40iw_sc_cqp *cqp; + u64 *wqe; + u64 header; + + cqp = cq->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 0, cq->cq_uk.cq_size); + set_64bit_val(wqe, 8, RS_64_1(cq, 1)); + set_64bit_val(wqe, 40, cq->shadow_area_pa); + set_64bit_val(wqe, 48, (cq->virtual_map ? cq->first_pm_pbl_idx : 0)); + + header = cq->cq_uk.cq_id | + LS_64((cq->ceq_id_valid ? 
cq->ceq_id : 0), I40IW_CQPSQ_CQ_CEQID) | + LS_64(I40IW_CQP_OP_DESTROY_CQ, I40IW_CQPSQ_OPCODE) | + LS_64(cq->pbl_chunk_size, I40IW_CQPSQ_CQ_LPBLSIZE) | + LS_64(cq->virtual_map, I40IW_CQPSQ_CQ_VIRTMAP) | + LS_64(cq->ceqe_mask, I40IW_CQPSQ_CQ_ENCEQEMASK) | + LS_64(cq->ceq_id_valid, I40IW_CQPSQ_CQ_CEQIDVALID) | + LS_64(cq->tph_en, I40IW_CQPSQ_TPHEN) | + LS_64(cq->cq_uk.avoid_mem_cflct, I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CQ_DESTROY WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_cq_modify - modify a Completion Queue + * @cq: cq struct + * @info: modification info struct + * @scratch: + * @post_sq: flag to post to sq + */ +static enum i40iw_status_code i40iw_sc_cq_modify(struct i40iw_sc_cq *cq, + struct i40iw_modify_cq_info *info, + u64 scratch, + bool post_sq) +{ + struct i40iw_sc_cqp *cqp; + u64 *wqe; + u64 header; + u32 cq_size, ceq_id, first_pm_pbl_idx; + u8 pbl_chunk_size; + bool virtual_map, ceq_id_valid, check_overflow; + u32 pble_obj_cnt; + + if (info->ceq_valid && (info->ceq_id > I40IW_MAX_CEQID)) + return I40IW_ERR_INVALID_CEQ_ID; + + pble_obj_cnt = cq->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt; + + if (info->cq_resize && info->virtual_map && + (info->first_pm_pbl_idx >= pble_obj_cnt)) + return I40IW_ERR_INVALID_PBLE_INDEX; + + cqp = cq->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + cq->pbl_list = info->pbl_list; + cq->cq_pa = info->cq_pa; + cq->first_pm_pbl_idx = info->first_pm_pbl_idx; + + cq_size = info->cq_resize ? info->cq_size : cq->cq_uk.cq_size; + if (info->ceq_change) { + ceq_id_valid = true; + ceq_id = info->ceq_id; + } else { + ceq_id_valid = cq->ceq_id_valid; + ceq_id = ceq_id_valid ? cq->ceq_id : 0; + } + virtual_map = info->cq_resize ? info->virtual_map : cq->virtual_map; + first_pm_pbl_idx = (info->cq_resize ? + (info->virtual_map ? info->first_pm_pbl_idx : 0) : + (cq->virtual_map ? cq->first_pm_pbl_idx : 0)); + pbl_chunk_size = (info->cq_resize ? + (info->virtual_map ? info->pbl_chunk_size : 0) : + (cq->virtual_map ? cq->pbl_chunk_size : 0)); + check_overflow = info->check_overflow_change ? info->check_overflow : + cq->check_overflow; + cq->cq_uk.cq_size = cq_size; + cq->ceq_id_valid = ceq_id_valid; + cq->ceq_id = ceq_id; + cq->virtual_map = virtual_map; + cq->first_pm_pbl_idx = first_pm_pbl_idx; + cq->pbl_chunk_size = pbl_chunk_size; + cq->check_overflow = check_overflow; + + set_64bit_val(wqe, 0, cq_size); + set_64bit_val(wqe, 8, RS_64_1(cq, 1)); + set_64bit_val(wqe, 16, + LS_64(info->shadow_read_threshold, I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD)); + set_64bit_val(wqe, 32, (cq->virtual_map ? 0 : cq->cq_pa)); + set_64bit_val(wqe, 40, cq->shadow_area_pa); + set_64bit_val(wqe, 48, (cq->virtual_map ? 
first_pm_pbl_idx : 0)); + set_64bit_val(wqe, 56, LS_64(cq->tph_val, I40IW_CQPSQ_TPHVAL)); + + header = cq->cq_uk.cq_id | + LS_64(ceq_id, I40IW_CQPSQ_CQ_CEQID) | + LS_64(I40IW_CQP_OP_MODIFY_CQ, I40IW_CQPSQ_OPCODE) | + LS_64(info->cq_resize, I40IW_CQPSQ_CQ_CQRESIZE) | + LS_64(pbl_chunk_size, I40IW_CQPSQ_CQ_LPBLSIZE) | + LS_64(check_overflow, I40IW_CQPSQ_CQ_CHKOVERFLOW) | + LS_64(virtual_map, I40IW_CQPSQ_CQ_VIRTMAP) | + LS_64(cq->ceqe_mask, I40IW_CQPSQ_CQ_ENCEQEMASK) | + LS_64(ceq_id_valid, I40IW_CQPSQ_CQ_CEQIDVALID) | + LS_64(cq->tph_en, I40IW_CQPSQ_TPHEN) | + LS_64(cq->cq_uk.avoid_mem_cflct, I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CQ_MODIFY WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_qp_init - initialize qp + * @qp: sc qp + * @info: initialization qp info + */ +static enum i40iw_status_code i40iw_sc_qp_init(struct i40iw_sc_qp *qp, + struct i40iw_qp_init_info *info) +{ + u32 __iomem *wqe_alloc_reg = NULL; + enum i40iw_status_code ret_code; + u32 pble_obj_cnt; + u8 wqe_size; + u32 offset; + + qp->dev = info->pd->dev; + qp->vsi = info->vsi; + qp->sq_pa = info->sq_pa; + qp->rq_pa = info->rq_pa; + qp->hw_host_ctx_pa = info->host_ctx_pa; + qp->q2_pa = info->q2_pa; + qp->shadow_area_pa = info->shadow_area_pa; + + qp->q2_buf = info->q2; + qp->pd = info->pd; + qp->hw_host_ctx = info->host_ctx; + offset = (qp->pd->dev->is_pf) ? I40E_PFPE_WQEALLOC : I40E_VFPE_WQEALLOC1; + if (i40iw_get_hw_addr(qp->pd->dev)) + wqe_alloc_reg = (u32 __iomem *)(i40iw_get_hw_addr(qp->pd->dev) + + offset); + + info->qp_uk_init_info.wqe_alloc_reg = wqe_alloc_reg; + info->qp_uk_init_info.abi_ver = qp->pd->abi_ver; + ret_code = i40iw_qp_uk_init(&qp->qp_uk, &info->qp_uk_init_info); + if (ret_code) + return ret_code; + qp->virtual_map = info->virtual_map; + + pble_obj_cnt = info->pd->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt; + + if ((info->virtual_map && (info->sq_pa >= pble_obj_cnt)) || + (info->virtual_map && (info->rq_pa >= pble_obj_cnt))) + return I40IW_ERR_INVALID_PBLE_INDEX; + + qp->llp_stream_handle = (void *)(-1); + qp->qp_type = (info->type) ? 
info->type : I40IW_QP_TYPE_IWARP; + + qp->hw_sq_size = i40iw_get_encoded_wqe_size(qp->qp_uk.sq_ring.size, + false); + i40iw_debug(qp->dev, I40IW_DEBUG_WQE, "%s: hw_sq_size[%04d] sq_ring.size[%04d]\n", + __func__, qp->hw_sq_size, qp->qp_uk.sq_ring.size); + + switch (qp->pd->abi_ver) { + case 4: + ret_code = i40iw_fragcnt_to_wqesize_rq(qp->qp_uk.max_rq_frag_cnt, + &wqe_size); + if (ret_code) + return ret_code; + break; + case 5: /* fallthrough until next ABI version */ + default: + if (qp->qp_uk.max_rq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT) + return I40IW_ERR_INVALID_FRAG_COUNT; + wqe_size = I40IW_MAX_WQE_SIZE_RQ; + break; + } + qp->hw_rq_size = i40iw_get_encoded_wqe_size(qp->qp_uk.rq_size * + (wqe_size / I40IW_QP_WQE_MIN_SIZE), false); + i40iw_debug(qp->dev, I40IW_DEBUG_WQE, + "%s: hw_rq_size[%04d] qp_uk.rq_size[%04d] wqe_size[%04d]\n", + __func__, qp->hw_rq_size, qp->qp_uk.rq_size, wqe_size); + qp->sq_tph_val = info->sq_tph_val; + qp->rq_tph_val = info->rq_tph_val; + qp->sq_tph_en = info->sq_tph_en; + qp->rq_tph_en = info->rq_tph_en; + qp->rcv_tph_en = info->rcv_tph_en; + qp->xmit_tph_en = info->xmit_tph_en; + qp->qs_handle = qp->vsi->qos[qp->user_pri].qs_handle; + + return 0; +} + +/** + * i40iw_sc_qp_create - create qp + * @qp: sc qp + * @info: qp create info + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_qp_create( + struct i40iw_sc_qp *qp, + struct i40iw_create_qp_info *info, + u64 scratch, + bool post_sq) +{ + struct i40iw_sc_cqp *cqp; + u64 *wqe; + u64 header; + + if ((qp->qp_uk.qp_id < I40IW_MIN_IW_QP_ID) || + (qp->qp_uk.qp_id > I40IW_MAX_IW_QP_ID)) + return I40IW_ERR_INVALID_QP_ID; + + cqp = qp->pd->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + set_64bit_val(wqe, 16, qp->hw_host_ctx_pa); + + set_64bit_val(wqe, 40, qp->shadow_area_pa); + + header = qp->qp_uk.qp_id | + LS_64(I40IW_CQP_OP_CREATE_QP, I40IW_CQPSQ_OPCODE) | + LS_64((info->ord_valid ? 
1 : 0), I40IW_CQPSQ_QP_ORDVALID) | + LS_64(info->tcp_ctx_valid, I40IW_CQPSQ_QP_TOECTXVALID) | + LS_64(qp->qp_type, I40IW_CQPSQ_QP_QPTYPE) | + LS_64(qp->virtual_map, I40IW_CQPSQ_QP_VQ) | + LS_64(info->cq_num_valid, I40IW_CQPSQ_QP_CQNUMVALID) | + LS_64(info->arp_cache_idx_valid, I40IW_CQPSQ_QP_ARPTABIDXVALID) | + LS_64(info->next_iwarp_state, I40IW_CQPSQ_QP_NEXTIWSTATE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QP_CREATE WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_qp_modify - modify qp cqp wqe + * @qp: sc qp + * @info: modify qp info + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_qp_modify( + struct i40iw_sc_qp *qp, + struct i40iw_modify_qp_info *info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + u8 term_actions = 0; + u8 term_len = 0; + + cqp = qp->pd->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + if (info->next_iwarp_state == I40IW_QP_STATE_TERMINATE) { + if (info->dont_send_fin) + term_actions += I40IWQP_TERM_SEND_TERM_ONLY; + if (info->dont_send_term) + term_actions += I40IWQP_TERM_SEND_FIN_ONLY; + if ((term_actions == I40IWQP_TERM_SEND_TERM_AND_FIN) || + (term_actions == I40IWQP_TERM_SEND_TERM_ONLY)) + term_len = info->termlen; + } + + set_64bit_val(wqe, + 8, + LS_64(term_len, I40IW_CQPSQ_QP_TERMLEN)); + + set_64bit_val(wqe, 16, qp->hw_host_ctx_pa); + set_64bit_val(wqe, 40, qp->shadow_area_pa); + + header = qp->qp_uk.qp_id | + LS_64(I40IW_CQP_OP_MODIFY_QP, I40IW_CQPSQ_OPCODE) | + LS_64(info->ord_valid, I40IW_CQPSQ_QP_ORDVALID) | + LS_64(info->tcp_ctx_valid, I40IW_CQPSQ_QP_TOECTXVALID) | + LS_64(info->cached_var_valid, I40IW_CQPSQ_QP_CACHEDVARVALID) | + LS_64(qp->virtual_map, I40IW_CQPSQ_QP_VQ) | + LS_64(info->cq_num_valid, I40IW_CQPSQ_QP_CQNUMVALID) | + LS_64(info->force_loopback, I40IW_CQPSQ_QP_FORCELOOPBACK) | + LS_64(qp->qp_type, I40IW_CQPSQ_QP_QPTYPE) | + LS_64(info->remove_hash_idx, I40IW_CQPSQ_QP_REMOVEHASHENTRY) | + LS_64(term_actions, I40IW_CQPSQ_QP_TERMACT) | + LS_64(info->reset_tcp_conn, I40IW_CQPSQ_QP_RESETCON) | + LS_64(info->arp_cache_idx_valid, I40IW_CQPSQ_QP_ARPTABIDXVALID) | + LS_64(info->next_iwarp_state, I40IW_CQPSQ_QP_NEXTIWSTATE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QP_MODIFY WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_qp_destroy - cqp destroy qp + * @qp: sc qp + * @scratch: u64 saved to be used during cqp completion + * @remove_hash_idx: flag if to remove hash idx + * @ignore_mw_bnd: memory window bind flag + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_qp_destroy( + struct i40iw_sc_qp *qp, + u64 scratch, + bool remove_hash_idx, + bool ignore_mw_bnd, + bool post_sq) +{ + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + + i40iw_qp_rem_qos(qp); + cqp = qp->pd->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 16, qp->hw_host_ctx_pa); + set_64bit_val(wqe, 40, qp->shadow_area_pa); + + header = qp->qp_uk.qp_id | + LS_64(I40IW_CQP_OP_DESTROY_QP, I40IW_CQPSQ_OPCODE) | + LS_64(qp->qp_type, I40IW_CQPSQ_QP_QPTYPE) | + 
LS_64(ignore_mw_bnd, I40IW_CQPSQ_QP_IGNOREMWBOUND) | + LS_64(remove_hash_idx, I40IW_CQPSQ_QP_REMOVEHASHENTRY) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QP_DESTROY WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_qp_flush_wqes - flush qp's wqe + * @qp: sc qp + * @info: dlush information + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_qp_flush_wqes( + struct i40iw_sc_qp *qp, + struct i40iw_qp_flush_info *info, + u64 scratch, + bool post_sq) +{ + u64 temp = 0; + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + bool flush_sq = false, flush_rq = false; + + if (info->rq && !qp->flush_rq) + flush_rq = true; + + if (info->sq && !qp->flush_sq) + flush_sq = true; + + qp->flush_sq |= flush_sq; + qp->flush_rq |= flush_rq; + if (!flush_sq && !flush_rq) + return 0; + + cqp = qp->pd->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + if (info->userflushcode) { + if (flush_rq) { + temp |= LS_64(info->rq_minor_code, I40IW_CQPSQ_FWQE_RQMNERR) | + LS_64(info->rq_major_code, I40IW_CQPSQ_FWQE_RQMJERR); + } + if (flush_sq) { + temp |= LS_64(info->sq_minor_code, I40IW_CQPSQ_FWQE_SQMNERR) | + LS_64(info->sq_major_code, I40IW_CQPSQ_FWQE_SQMJERR); + } + } + set_64bit_val(wqe, 16, temp); + + temp = (info->generate_ae) ? + info->ae_code | LS_64(info->ae_source, I40IW_CQPSQ_FWQE_AESOURCE) : 0; + + set_64bit_val(wqe, 8, temp); + + header = qp->qp_uk.qp_id | + LS_64(I40IW_CQP_OP_FLUSH_WQES, I40IW_CQPSQ_OPCODE) | + LS_64(info->generate_ae, I40IW_CQPSQ_FWQE_GENERATE_AE) | + LS_64(info->userflushcode, I40IW_CQPSQ_FWQE_USERFLCODE) | + LS_64(flush_sq, I40IW_CQPSQ_FWQE_FLUSHSQ) | + LS_64(flush_rq, I40IW_CQPSQ_FWQE_FLUSHRQ) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QP_FLUSH WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_gen_ae - generate AE, currently uses flush WQE CQP OP + * @qp: sc qp + * @info: gen ae information + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_gen_ae( + struct i40iw_sc_qp *qp, + struct i40iw_gen_ae_info *info, + u64 scratch, + bool post_sq) +{ + u64 temp; + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + + cqp = qp->pd->dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + temp = info->ae_code | + LS_64(info->ae_source, I40IW_CQPSQ_FWQE_AESOURCE); + + set_64bit_val(wqe, 8, temp); + + header = qp->qp_uk.qp_id | + LS_64(I40IW_CQP_OP_GEN_AE, I40IW_CQPSQ_OPCODE) | + LS_64(1, I40IW_CQPSQ_FWQE_GENERATE_AE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "GEN_AE WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_qp_upload_context - upload qp's context + * @dev: sc device struct + * @info: upload context info ptr for return + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_qp_upload_context( + struct i40iw_sc_dev *dev, + struct i40iw_upload_context_info 
*info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + + cqp = dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 16, info->buf_pa); + + header = LS_64(info->qp_id, I40IW_CQPSQ_UCTX_QPID) | + LS_64(I40IW_CQP_OP_UPLOAD_CONTEXT, I40IW_CQPSQ_OPCODE) | + LS_64(info->qp_type, I40IW_CQPSQ_UCTX_QPTYPE) | + LS_64(info->raw_format, I40IW_CQPSQ_UCTX_RAWFORMAT) | + LS_64(info->freeze_qp, I40IW_CQPSQ_UCTX_FREEZEQP) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "QP_UPLOAD_CTX WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_qp_setctx - set qp's context + * @qp: sc qp + * @qp_ctx: context ptr + * @info: ctx info + */ +static enum i40iw_status_code i40iw_sc_qp_setctx( + struct i40iw_sc_qp *qp, + u64 *qp_ctx, + struct i40iw_qp_host_ctx_info *info) +{ + struct i40iwarp_offload_info *iw; + struct i40iw_tcp_offload_info *tcp; + struct i40iw_sc_vsi *vsi; + struct i40iw_sc_dev *dev; + u64 qw0, qw3, qw7 = 0; + + iw = info->iwarp_info; + tcp = info->tcp_info; + vsi = qp->vsi; + dev = qp->dev; + if (info->add_to_qoslist) { + qp->user_pri = info->user_pri; + i40iw_qp_add_qos(qp); + i40iw_debug(qp->dev, I40IW_DEBUG_DCB, "%s qp[%d] UP[%d] qset[%d]\n", + __func__, qp->qp_uk.qp_id, qp->user_pri, qp->qs_handle); + } + qw0 = LS_64(qp->qp_uk.rq_wqe_size, I40IWQPC_RQWQESIZE) | + LS_64(info->err_rq_idx_valid, I40IWQPC_ERR_RQ_IDX_VALID) | + LS_64(qp->rcv_tph_en, I40IWQPC_RCVTPHEN) | + LS_64(qp->xmit_tph_en, I40IWQPC_XMITTPHEN) | + LS_64(qp->rq_tph_en, I40IWQPC_RQTPHEN) | + LS_64(qp->sq_tph_en, I40IWQPC_SQTPHEN) | + LS_64(info->push_idx, I40IWQPC_PPIDX) | + LS_64(info->push_mode_en, I40IWQPC_PMENA); + + set_64bit_val(qp_ctx, 8, qp->sq_pa); + set_64bit_val(qp_ctx, 16, qp->rq_pa); + + qw3 = LS_64(qp->src_mac_addr_idx, I40IWQPC_SRCMACADDRIDX) | + LS_64(qp->hw_rq_size, I40IWQPC_RQSIZE) | + LS_64(qp->hw_sq_size, I40IWQPC_SQSIZE); + + set_64bit_val(qp_ctx, + 128, + LS_64(info->err_rq_idx, I40IWQPC_ERR_RQ_IDX)); + + set_64bit_val(qp_ctx, + 136, + LS_64(info->send_cq_num, I40IWQPC_TXCQNUM) | + LS_64(info->rcv_cq_num, I40IWQPC_RXCQNUM)); + + set_64bit_val(qp_ctx, + 168, + LS_64(info->qp_compl_ctx, I40IWQPC_QPCOMPCTX)); + set_64bit_val(qp_ctx, + 176, + LS_64(qp->sq_tph_val, I40IWQPC_SQTPHVAL) | + LS_64(qp->rq_tph_val, I40IWQPC_RQTPHVAL) | + LS_64(qp->qs_handle, I40IWQPC_QSHANDLE) | + LS_64(vsi->exception_lan_queue, I40IWQPC_EXCEPTION_LAN_QUEUE)); + + if (info->iwarp_info_valid) { + qw0 |= LS_64(iw->ddp_ver, I40IWQPC_DDP_VER) | + LS_64(iw->rdmap_ver, I40IWQPC_RDMAP_VER); + + qw7 |= LS_64(iw->pd_id, I40IWQPC_PDIDX); + set_64bit_val(qp_ctx, + 144, + LS_64(qp->q2_pa, I40IWQPC_Q2ADDR) | + LS_64(vsi->fcn_id, I40IWQPC_STAT_INDEX)); + set_64bit_val(qp_ctx, + 152, + LS_64(iw->last_byte_sent, I40IWQPC_LASTBYTESENT)); + + set_64bit_val(qp_ctx, + 160, + LS_64(iw->ord_size, I40IWQPC_ORDSIZE) | + LS_64(iw->ird_size, I40IWQPC_IRDSIZE) | + LS_64(iw->wr_rdresp_en, I40IWQPC_WRRDRSPOK) | + LS_64(iw->rd_enable, I40IWQPC_RDOK) | + LS_64(iw->snd_mark_en, I40IWQPC_SNDMARKERS) | + LS_64(iw->bind_en, I40IWQPC_BINDEN) | + LS_64(iw->fast_reg_en, I40IWQPC_FASTREGEN) | + LS_64(iw->priv_mode_en, I40IWQPC_PRIVEN) | + LS_64((((vsi->stats_fcn_id_alloc) && + (dev->is_pf) && (vsi->fcn_id >= I40IW_FIRST_NON_PF_STAT)) ? 
1 : 0), + I40IWQPC_USESTATSINSTANCE) | + LS_64(1, I40IWQPC_IWARPMODE) | + LS_64(iw->rcv_mark_en, I40IWQPC_RCVMARKERS) | + LS_64(iw->align_hdrs, I40IWQPC_ALIGNHDRS) | + LS_64(iw->rcv_no_mpa_crc, I40IWQPC_RCVNOMPACRC) | + LS_64(iw->rcv_mark_offset, I40IWQPC_RCVMARKOFFSET) | + LS_64(iw->snd_mark_offset, I40IWQPC_SNDMARKOFFSET)); + } + if (info->tcp_info_valid) { + qw0 |= LS_64(tcp->ipv4, I40IWQPC_IPV4) | + LS_64(tcp->no_nagle, I40IWQPC_NONAGLE) | + LS_64(tcp->insert_vlan_tag, I40IWQPC_INSERTVLANTAG) | + LS_64(tcp->time_stamp, I40IWQPC_TIMESTAMP) | + LS_64(tcp->cwnd_inc_limit, I40IWQPC_LIMIT) | + LS_64(tcp->drop_ooo_seg, I40IWQPC_DROPOOOSEG) | + LS_64(tcp->dup_ack_thresh, I40IWQPC_DUPACK_THRESH); + + qw3 |= LS_64(tcp->ttl, I40IWQPC_TTL) | + LS_64(tcp->src_mac_addr_idx, I40IWQPC_SRCMACADDRIDX) | + LS_64(tcp->avoid_stretch_ack, I40IWQPC_AVOIDSTRETCHACK) | + LS_64(tcp->tos, I40IWQPC_TOS) | + LS_64(tcp->src_port, I40IWQPC_SRCPORTNUM) | + LS_64(tcp->dst_port, I40IWQPC_DESTPORTNUM); + + qp->src_mac_addr_idx = tcp->src_mac_addr_idx; + set_64bit_val(qp_ctx, + 32, + LS_64(tcp->dest_ip_addr2, I40IWQPC_DESTIPADDR2) | + LS_64(tcp->dest_ip_addr3, I40IWQPC_DESTIPADDR3)); + + set_64bit_val(qp_ctx, + 40, + LS_64(tcp->dest_ip_addr0, I40IWQPC_DESTIPADDR0) | + LS_64(tcp->dest_ip_addr1, I40IWQPC_DESTIPADDR1)); + + set_64bit_val(qp_ctx, + 48, + LS_64(tcp->snd_mss, I40IWQPC_SNDMSS) | + LS_64(tcp->vlan_tag, I40IWQPC_VLANTAG) | + LS_64(tcp->arp_idx, I40IWQPC_ARPIDX)); + + qw7 |= LS_64(tcp->flow_label, I40IWQPC_FLOWLABEL) | + LS_64(tcp->wscale, I40IWQPC_WSCALE) | + LS_64(tcp->ignore_tcp_opt, I40IWQPC_IGNORE_TCP_OPT) | + LS_64(tcp->ignore_tcp_uns_opt, I40IWQPC_IGNORE_TCP_UNS_OPT) | + LS_64(tcp->tcp_state, I40IWQPC_TCPSTATE) | + LS_64(tcp->rcv_wscale, I40IWQPC_RCVSCALE) | + LS_64(tcp->snd_wscale, I40IWQPC_SNDSCALE); + + set_64bit_val(qp_ctx, + 72, + LS_64(tcp->time_stamp_recent, I40IWQPC_TIMESTAMP_RECENT) | + LS_64(tcp->time_stamp_age, I40IWQPC_TIMESTAMP_AGE)); + set_64bit_val(qp_ctx, + 80, + LS_64(tcp->snd_nxt, I40IWQPC_SNDNXT) | + LS_64(tcp->snd_wnd, I40IWQPC_SNDWND)); + + set_64bit_val(qp_ctx, + 88, + LS_64(tcp->rcv_nxt, I40IWQPC_RCVNXT) | + LS_64(tcp->rcv_wnd, I40IWQPC_RCVWND)); + set_64bit_val(qp_ctx, + 96, + LS_64(tcp->snd_max, I40IWQPC_SNDMAX) | + LS_64(tcp->snd_una, I40IWQPC_SNDUNA)); + set_64bit_val(qp_ctx, + 104, + LS_64(tcp->srtt, I40IWQPC_SRTT) | + LS_64(tcp->rtt_var, I40IWQPC_RTTVAR)); + set_64bit_val(qp_ctx, + 112, + LS_64(tcp->ss_thresh, I40IWQPC_SSTHRESH) | + LS_64(tcp->cwnd, I40IWQPC_CWND)); + set_64bit_val(qp_ctx, + 120, + LS_64(tcp->snd_wl1, I40IWQPC_SNDWL1) | + LS_64(tcp->snd_wl2, I40IWQPC_SNDWL2)); + set_64bit_val(qp_ctx, + 128, + LS_64(tcp->max_snd_window, I40IWQPC_MAXSNDWND) | + LS_64(tcp->rexmit_thresh, I40IWQPC_REXMIT_THRESH)); + set_64bit_val(qp_ctx, + 184, + LS_64(tcp->local_ipaddr3, I40IWQPC_LOCAL_IPADDR3) | + LS_64(tcp->local_ipaddr2, I40IWQPC_LOCAL_IPADDR2)); + set_64bit_val(qp_ctx, + 192, + LS_64(tcp->local_ipaddr1, I40IWQPC_LOCAL_IPADDR1) | + LS_64(tcp->local_ipaddr0, I40IWQPC_LOCAL_IPADDR0)); + } + + set_64bit_val(qp_ctx, 0, qw0); + set_64bit_val(qp_ctx, 24, qw3); + set_64bit_val(qp_ctx, 56, qw7); + + i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "QP_HOST)CTX WQE", + qp_ctx, I40IW_QP_CTX_SIZE); + return 0; +} + +/** + * i40iw_sc_alloc_stag - mr stag alloc + * @dev: sc device struct + * @info: stag info + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_alloc_stag( + struct i40iw_sc_dev *dev, + struct 
i40iw_allocate_stag_info *info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + enum i40iw_page_size page_size; + + page_size = (info->page_size == 0x200000) ? I40IW_PAGE_SIZE_2M : I40IW_PAGE_SIZE_4K; + cqp = dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, + 8, + LS_64(info->pd_id, I40IW_CQPSQ_STAG_PDID) | + LS_64(info->total_len, I40IW_CQPSQ_STAG_STAGLEN)); + set_64bit_val(wqe, + 16, + LS_64(info->stag_idx, I40IW_CQPSQ_STAG_IDX)); + set_64bit_val(wqe, + 40, + LS_64(info->hmc_fcn_index, I40IW_CQPSQ_STAG_HMCFNIDX)); + + header = LS_64(I40IW_CQP_OP_ALLOC_STAG, I40IW_CQPSQ_OPCODE) | + LS_64(1, I40IW_CQPSQ_STAG_MR) | + LS_64(info->access_rights, I40IW_CQPSQ_STAG_ARIGHTS) | + LS_64(info->chunk_size, I40IW_CQPSQ_STAG_LPBLSIZE) | + LS_64(page_size, I40IW_CQPSQ_STAG_HPAGESIZE) | + LS_64(info->remote_access, I40IW_CQPSQ_STAG_REMACCENABLED) | + LS_64(info->use_hmc_fcn_index, I40IW_CQPSQ_STAG_USEHMCFNIDX) | + LS_64(info->use_pf_rid, I40IW_CQPSQ_STAG_USEPFRID) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "ALLOC_STAG WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_mr_reg_non_shared - non-shared mr registration + * @dev: sc device struct + * @info: mr info + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_mr_reg_non_shared( + struct i40iw_sc_dev *dev, + struct i40iw_reg_ns_stag_info *info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + u64 temp; + struct i40iw_sc_cqp *cqp; + u64 header; + u32 pble_obj_cnt; + bool remote_access; + u8 addr_type; + enum i40iw_page_size page_size; + + page_size = (info->page_size == 0x200000) ? I40IW_PAGE_SIZE_2M : I40IW_PAGE_SIZE_4K; + if (info->access_rights & (I40IW_ACCESS_FLAGS_REMOTEREAD_ONLY | + I40IW_ACCESS_FLAGS_REMOTEWRITE_ONLY)) + remote_access = true; + else + remote_access = false; + + pble_obj_cnt = dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt; + + if (info->chunk_size && (info->first_pm_pbl_index >= pble_obj_cnt)) + return I40IW_ERR_INVALID_PBLE_INDEX; + + cqp = dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + temp = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? (uintptr_t)info->va : info->fbo; + set_64bit_val(wqe, 0, temp); + + set_64bit_val(wqe, + 8, + LS_64(info->total_len, I40IW_CQPSQ_STAG_STAGLEN) | + LS_64(info->pd_id, I40IW_CQPSQ_STAG_PDID)); + + set_64bit_val(wqe, + 16, + LS_64(info->stag_key, I40IW_CQPSQ_STAG_KEY) | + LS_64(info->stag_idx, I40IW_CQPSQ_STAG_IDX)); + if (!info->chunk_size) { + set_64bit_val(wqe, 32, info->reg_addr_pa); + set_64bit_val(wqe, 48, 0); + } else { + set_64bit_val(wqe, 32, 0); + set_64bit_val(wqe, 48, info->first_pm_pbl_index); + } + set_64bit_val(wqe, 40, info->hmc_fcn_index); + set_64bit_val(wqe, 56, 0); + + addr_type = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? 
1 : 0; + header = LS_64(I40IW_CQP_OP_REG_MR, I40IW_CQPSQ_OPCODE) | + LS_64(1, I40IW_CQPSQ_STAG_MR) | + LS_64(info->chunk_size, I40IW_CQPSQ_STAG_LPBLSIZE) | + LS_64(page_size, I40IW_CQPSQ_STAG_HPAGESIZE) | + LS_64(info->access_rights, I40IW_CQPSQ_STAG_ARIGHTS) | + LS_64(remote_access, I40IW_CQPSQ_STAG_REMACCENABLED) | + LS_64(addr_type, I40IW_CQPSQ_STAG_VABASEDTO) | + LS_64(info->use_hmc_fcn_index, I40IW_CQPSQ_STAG_USEHMCFNIDX) | + LS_64(info->use_pf_rid, I40IW_CQPSQ_STAG_USEPFRID) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "MR_REG_NS WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_mr_reg_shared - registered shared memory region + * @dev: sc device struct + * @info: info for shared memory registeration + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_mr_reg_shared( + struct i40iw_sc_dev *dev, + struct i40iw_register_shared_stag *info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 temp, va64, fbo, header; + u32 va32; + bool remote_access; + u8 addr_type; + + if (info->access_rights & (I40IW_ACCESS_FLAGS_REMOTEREAD_ONLY | + I40IW_ACCESS_FLAGS_REMOTEWRITE_ONLY)) + remote_access = true; + else + remote_access = false; + cqp = dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + va64 = (uintptr_t)(info->va); + va32 = (u32)(va64 & 0x00000000FFFFFFFF); + fbo = (u64)(va32 & (4096 - 1)); + + set_64bit_val(wqe, + 0, + (info->addr_type == I40IW_ADDR_TYPE_VA_BASED ? (uintptr_t)info->va : fbo)); + + set_64bit_val(wqe, + 8, + LS_64(info->pd_id, I40IW_CQPSQ_STAG_PDID)); + temp = LS_64(info->new_stag_key, I40IW_CQPSQ_STAG_KEY) | + LS_64(info->new_stag_idx, I40IW_CQPSQ_STAG_IDX) | + LS_64(info->parent_stag_idx, I40IW_CQPSQ_STAG_PARENTSTAGIDX); + set_64bit_val(wqe, 16, temp); + + addr_type = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? 
1 : 0; + header = LS_64(I40IW_CQP_OP_REG_SMR, I40IW_CQPSQ_OPCODE) | + LS_64(1, I40IW_CQPSQ_STAG_MR) | + LS_64(info->access_rights, I40IW_CQPSQ_STAG_ARIGHTS) | + LS_64(remote_access, I40IW_CQPSQ_STAG_REMACCENABLED) | + LS_64(addr_type, I40IW_CQPSQ_STAG_VABASEDTO) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "MR_REG_SHARED WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_dealloc_stag - deallocate stag + * @dev: sc device struct + * @info: dealloc stag info + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_dealloc_stag( + struct i40iw_sc_dev *dev, + struct i40iw_dealloc_stag_info *info, + u64 scratch, + bool post_sq) +{ + u64 header; + u64 *wqe; + struct i40iw_sc_cqp *cqp; + + cqp = dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, + 8, + LS_64(info->pd_id, I40IW_CQPSQ_STAG_PDID)); + set_64bit_val(wqe, + 16, + LS_64(info->stag_idx, I40IW_CQPSQ_STAG_IDX)); + + header = LS_64(I40IW_CQP_OP_DEALLOC_STAG, I40IW_CQPSQ_OPCODE) | + LS_64(info->mr, I40IW_CQPSQ_STAG_MR) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "DEALLOC_STAG WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_query_stag - query hardware for stag + * @dev: sc device struct + * @scratch: u64 saved to be used during cqp completion + * @stag_index: stag index for query + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_query_stag(struct i40iw_sc_dev *dev, + u64 scratch, + u32 stag_index, + bool post_sq) +{ + u64 header; + u64 *wqe; + struct i40iw_sc_cqp *cqp; + + cqp = dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, + 16, + LS_64(stag_index, I40IW_CQPSQ_QUERYSTAG_IDX)); + + header = LS_64(I40IW_CQP_OP_QUERY_STAG, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "QUERY_STAG WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_mw_alloc - mw allocate + * @dev: sc device struct + * @scratch: u64 saved to be used during cqp completion + * @mw_stag_index:stag index + * @pd_id: pd is for this mw + * @post_sq: flag for cqp db to ring + */ +static enum i40iw_status_code i40iw_sc_mw_alloc( + struct i40iw_sc_dev *dev, + u64 scratch, + u32 mw_stag_index, + u16 pd_id, + bool post_sq) +{ + u64 header; + struct i40iw_sc_cqp *cqp; + u64 *wqe; + + cqp = dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, 8, LS_64(pd_id, I40IW_CQPSQ_STAG_PDID)); + set_64bit_val(wqe, + 16, + LS_64(mw_stag_index, I40IW_CQPSQ_STAG_IDX)); + + header = LS_64(I40IW_CQP_OP_ALLOC_STAG, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "MW_ALLOC WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_mr_fast_register - Posts RDMA fast register mr WR to iwarp qp + * @qp: sc qp struct + * @info: fast mr info + * 
@post_sq: flag for cqp db to ring + */ +enum i40iw_status_code i40iw_sc_mr_fast_register( + struct i40iw_sc_qp *qp, + struct i40iw_fast_reg_stag_info *info, + bool post_sq) +{ + u64 temp, header; + u64 *wqe; + u32 wqe_idx; + enum i40iw_page_size page_size; + + page_size = (info->page_size == 0x200000) ? I40IW_PAGE_SIZE_2M : I40IW_PAGE_SIZE_4K; + wqe = i40iw_qp_get_next_send_wqe(&qp->qp_uk, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, + 0, info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + i40iw_debug(qp->dev, I40IW_DEBUG_MR, "%s: wr_id[%llxh] wqe_idx[%04d] location[%p]\n", + __func__, info->wr_id, wqe_idx, + &qp->qp_uk.sq_wrtrk_array[wqe_idx].wrid); + temp = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? (uintptr_t)info->va : info->fbo; + set_64bit_val(wqe, 0, temp); + + temp = RS_64(info->first_pm_pbl_index >> 16, I40IWQPSQ_FIRSTPMPBLIDXHI); + set_64bit_val(wqe, + 8, + LS_64(temp, I40IWQPSQ_FIRSTPMPBLIDXHI) | + LS_64(info->reg_addr_pa >> I40IWQPSQ_PBLADDR_SHIFT, I40IWQPSQ_PBLADDR)); + + set_64bit_val(wqe, + 16, + info->total_len | + LS_64(info->first_pm_pbl_index, I40IWQPSQ_FIRSTPMPBLIDXLO)); + + header = LS_64(info->stag_key, I40IWQPSQ_STAGKEY) | + LS_64(info->stag_idx, I40IWQPSQ_STAGINDEX) | + LS_64(I40IWQP_OP_FAST_REGISTER, I40IWQPSQ_OPCODE) | + LS_64(info->chunk_size, I40IWQPSQ_LPBLSIZE) | + LS_64(page_size, I40IWQPSQ_HPAGESIZE) | + LS_64(info->access_rights, I40IWQPSQ_STAGRIGHTS) | + LS_64(info->addr_type, I40IWQPSQ_VABASEDTO) | + LS_64(info->read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "FAST_REG WQE", + wqe, I40IW_QP_WQE_MIN_SIZE); + + if (post_sq) + i40iw_qp_post_wr(&qp->qp_uk); + return 0; +} + +/** + * i40iw_sc_send_lsmm - send last streaming mode message + * @qp: sc qp struct + * @lsmm_buf: buffer with lsmm message + * @size: size of lsmm buffer + * @stag: stag of lsmm buffer + */ +static void i40iw_sc_send_lsmm(struct i40iw_sc_qp *qp, + void *lsmm_buf, + u32 size, + i40iw_stag stag) +{ + u64 *wqe; + u64 header; + struct i40iw_qp_uk *qp_uk; + + qp_uk = &qp->qp_uk; + wqe = qp_uk->sq_base->elem; + + set_64bit_val(wqe, 0, (uintptr_t)lsmm_buf); + + set_64bit_val(wqe, 8, (size | LS_64(stag, I40IWQPSQ_FRAG_STAG))); + + set_64bit_val(wqe, 16, 0); + + header = LS_64(I40IWQP_OP_RDMA_SEND, I40IWQPSQ_OPCODE) | + LS_64(1, I40IWQPSQ_STREAMMODE) | + LS_64(1, I40IWQPSQ_WAITFORRCVPDU) | + LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(qp->dev, I40IW_DEBUG_QP, "SEND_LSMM WQE", + wqe, I40IW_QP_WQE_MIN_SIZE); +} + +/** + * i40iw_sc_send_lsmm_nostag - for privilege qp + * @qp: sc qp struct + * @lsmm_buf: buffer with lsmm message + * @size: size of lsmm buffer + */ +static void i40iw_sc_send_lsmm_nostag(struct i40iw_sc_qp *qp, + void *lsmm_buf, + u32 size) +{ + u64 *wqe; + u64 header; + struct i40iw_qp_uk *qp_uk; + + qp_uk = &qp->qp_uk; + wqe = qp_uk->sq_base->elem; + + set_64bit_val(wqe, 0, (uintptr_t)lsmm_buf); + + set_64bit_val(wqe, 8, size); + + set_64bit_val(wqe, 16, 0); + + header = LS_64(I40IWQP_OP_RDMA_SEND, I40IWQPSQ_OPCODE) | + LS_64(1, I40IWQPSQ_STREAMMODE) | + LS_64(1, I40IWQPSQ_WAITFORRCVPDU) | + LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "SEND_LSMM_NOSTAG WQE", + wqe, I40IW_QP_WQE_MIN_SIZE); +} + +/** + * 
i40iw_sc_send_rtt - send last read0 or write0 + * @qp: sc qp struct + * @read: Do read0 or write0 + */ +static void i40iw_sc_send_rtt(struct i40iw_sc_qp *qp, bool read) +{ + u64 *wqe; + u64 header; + struct i40iw_qp_uk *qp_uk; + + qp_uk = &qp->qp_uk; + wqe = qp_uk->sq_base->elem; + + set_64bit_val(wqe, 0, 0); + set_64bit_val(wqe, 8, 0); + set_64bit_val(wqe, 16, 0); + if (read) { + header = LS_64(0x1234, I40IWQPSQ_REMSTAG) | + LS_64(I40IWQP_OP_RDMA_READ, I40IWQPSQ_OPCODE) | + LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID); + set_64bit_val(wqe, 8, ((u64)0xabcd << 32)); + } else { + header = LS_64(I40IWQP_OP_RDMA_WRITE, I40IWQPSQ_OPCODE) | + LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID); + } + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "RTR WQE", + wqe, I40IW_QP_WQE_MIN_SIZE); +} + +/** + * i40iw_sc_post_wqe0 - send wqe with opcode + * @qp: sc qp struct + * @opcode: opcode to use for wqe0 + */ +static enum i40iw_status_code i40iw_sc_post_wqe0(struct i40iw_sc_qp *qp, u8 opcode) +{ + u64 *wqe; + u64 header; + struct i40iw_qp_uk *qp_uk; + + qp_uk = &qp->qp_uk; + wqe = qp_uk->sq_base->elem; + + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + switch (opcode) { + case I40IWQP_OP_NOP: + set_64bit_val(wqe, 0, 0); + set_64bit_val(wqe, 8, 0); + set_64bit_val(wqe, 16, 0); + header = LS_64(I40IWQP_OP_NOP, I40IWQPSQ_OPCODE) | + LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID); + + i40iw_insert_wqe_hdr(wqe, header); + break; + case I40IWQP_OP_RDMA_SEND: + set_64bit_val(wqe, 0, 0); + set_64bit_val(wqe, 8, 0); + set_64bit_val(wqe, 16, 0); + header = LS_64(I40IWQP_OP_RDMA_SEND, I40IWQPSQ_OPCODE) | + LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID) | + LS_64(1, I40IWQPSQ_STREAMMODE) | + LS_64(1, I40IWQPSQ_WAITFORRCVPDU); + + i40iw_insert_wqe_hdr(wqe, header); + break; + default: + i40iw_debug(qp->dev, I40IW_DEBUG_QP, "%s: Invalid WQE zero opcode\n", + __func__); + break; + } + return 0; +} + +/** + * i40iw_sc_init_iw_hmc() - queries fpm values using cqp and populates hmc_info + * @dev : ptr to i40iw_dev struct + * @hmc_fn_id: hmc function id + */ +enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev, u8 hmc_fn_id) +{ + struct i40iw_hmc_info *hmc_info; + struct i40iw_dma_mem query_fpm_mem; + struct i40iw_virt_mem virt_mem; + struct i40iw_vfdev *vf_dev = NULL; + u32 mem_size; + enum i40iw_status_code ret_code = 0; + bool poll_registers = true; + u16 iw_vf_idx; + u8 wait_type; + + if (hmc_fn_id >= I40IW_MAX_VF_FPM_ID || + (dev->hmc_fn_id != hmc_fn_id && hmc_fn_id < I40IW_FIRST_VF_FPM_ID)) + return I40IW_ERR_INVALID_HMCFN_ID; + + i40iw_debug(dev, I40IW_DEBUG_HMC, "hmc_fn_id %u, dev->hmc_fn_id %u\n", hmc_fn_id, + dev->hmc_fn_id); + if (hmc_fn_id == dev->hmc_fn_id) { + hmc_info = dev->hmc_info; + query_fpm_mem.pa = dev->fpm_query_buf_pa; + query_fpm_mem.va = dev->fpm_query_buf; + } else { + vf_dev = i40iw_vfdev_from_fpm(dev, hmc_fn_id); + if (!vf_dev) + return I40IW_ERR_INVALID_VF_ID; + + hmc_info = &vf_dev->hmc_info; + iw_vf_idx = vf_dev->iw_vf_idx; + i40iw_debug(dev, I40IW_DEBUG_HMC, "vf_dev %p, hmc_info %p, hmc_obj %p\n", vf_dev, + hmc_info, hmc_info->hmc_obj); + if (!vf_dev->fpm_query_buf) { + if (!dev->vf_fpm_query_buf[iw_vf_idx].va) { + ret_code = i40iw_alloc_query_fpm_buf(dev, + &dev->vf_fpm_query_buf[iw_vf_idx]); + if (ret_code) + return ret_code; + } + vf_dev->fpm_query_buf = dev->vf_fpm_query_buf[iw_vf_idx].va; + vf_dev->fpm_query_buf_pa = dev->vf_fpm_query_buf[iw_vf_idx].pa; + } + query_fpm_mem.pa = vf_dev->fpm_query_buf_pa; + query_fpm_mem.va 
= vf_dev->fpm_query_buf; + /** + * It is HARDWARE specific: + * this call is done by PF for VF and + * i40iw_sc_query_fpm_values needs ccq poll + * because PF ccq is already created. + */ + poll_registers = false; + } + + hmc_info->hmc_fn_id = hmc_fn_id; + + if (hmc_fn_id != dev->hmc_fn_id) { + ret_code = + i40iw_cqp_query_fpm_values_cmd(dev, &query_fpm_mem, hmc_fn_id); + } else { + wait_type = poll_registers ? (u8)I40IW_CQP_WAIT_POLL_REGS : + (u8)I40IW_CQP_WAIT_POLL_CQ; + + ret_code = i40iw_sc_query_fpm_values( + dev->cqp, + 0, + hmc_info->hmc_fn_id, + &query_fpm_mem, + true, + wait_type); + } + if (ret_code) + return ret_code; + + /* parse the fpm_query_buf and fill hmc obj info */ + ret_code = + i40iw_sc_parse_fpm_query_buf((u64 *)query_fpm_mem.va, + hmc_info, + &dev->hmc_fpm_misc); + if (ret_code) + return ret_code; + i40iw_debug_buf(dev, I40IW_DEBUG_HMC, "QUERY FPM BUFFER", + query_fpm_mem.va, I40IW_QUERY_FPM_BUF_SIZE); + + if (hmc_fn_id != dev->hmc_fn_id) { + i40iw_cqp_commit_fpm_values_cmd(dev, &query_fpm_mem, hmc_fn_id); + + /* parse the fpm_commit_buf and fill hmc obj info */ + i40iw_sc_parse_fpm_commit_buf((u64 *)query_fpm_mem.va, hmc_info->hmc_obj, &hmc_info->sd_table.sd_cnt); + mem_size = sizeof(struct i40iw_hmc_sd_entry) * + (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index); + ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size); + if (ret_code) + return ret_code; + hmc_info->sd_table.sd_entry = virt_mem.va; + } + + return ret_code; +} + +/** + * i40iw_sc_configure_iw_fpm() - commits hmc obj cnt values using cqp command and + * populates fpm base address in hmc_info + * @dev : ptr to i40iw_dev struct + * @hmc_fn_id: hmc function id + */ +static enum i40iw_status_code i40iw_sc_configure_iw_fpm(struct i40iw_sc_dev *dev, + u8 hmc_fn_id) +{ + struct i40iw_hmc_info *hmc_info; + struct i40iw_hmc_obj_info *obj_info; + u64 *buf; + struct i40iw_dma_mem commit_fpm_mem; + u32 i, j; + enum i40iw_status_code ret_code = 0; + bool poll_registers = true; + u8 wait_type; + + if (hmc_fn_id >= I40IW_MAX_VF_FPM_ID || + (dev->hmc_fn_id != hmc_fn_id && hmc_fn_id < I40IW_FIRST_VF_FPM_ID)) + return I40IW_ERR_INVALID_HMCFN_ID; + + if (hmc_fn_id == dev->hmc_fn_id) { + hmc_info = dev->hmc_info; + } else { + hmc_info = i40iw_vf_hmcinfo_from_fpm(dev, hmc_fn_id); + poll_registers = false; + } + if (!hmc_info) + return I40IW_ERR_BAD_PTR; + + obj_info = hmc_info->hmc_obj; + buf = dev->fpm_commit_buf; + + /* copy cnt values in commit buf */ + for (i = I40IW_HMC_IW_QP, j = 0; i <= I40IW_HMC_IW_PBLE; + i++, j += 8) + set_64bit_val(buf, j, (u64)obj_info[i].cnt); + + set_64bit_val(buf, 40, 0); /* APBVT rsvd */ + + commit_fpm_mem.pa = dev->fpm_commit_buf_pa; + commit_fpm_mem.va = dev->fpm_commit_buf; + wait_type = poll_registers ? 
(u8)I40IW_CQP_WAIT_POLL_REGS : + (u8)I40IW_CQP_WAIT_POLL_CQ; + ret_code = i40iw_sc_commit_fpm_values( + dev->cqp, + 0, + hmc_info->hmc_fn_id, + &commit_fpm_mem, + true, + wait_type); + + /* parse the fpm_commit_buf and fill hmc obj info */ + if (!ret_code) + ret_code = i40iw_sc_parse_fpm_commit_buf(dev->fpm_commit_buf, + hmc_info->hmc_obj, + &hmc_info->sd_table.sd_cnt); + + i40iw_debug_buf(dev, I40IW_DEBUG_HMC, "COMMIT FPM BUFFER", + commit_fpm_mem.va, I40IW_COMMIT_FPM_BUF_SIZE); + + return ret_code; +} + +/** + * cqp_sds_wqe_fill - fill cqp wqe doe sd + * @cqp: struct for cqp hw + * @info; sd info for wqe + * @scratch: u64 saved to be used during cqp completion + */ +static enum i40iw_status_code cqp_sds_wqe_fill(struct i40iw_sc_cqp *cqp, + struct i40iw_update_sds_info *info, + u64 scratch) +{ + u64 data; + u64 header; + u64 *wqe; + int mem_entries, wqe_entries; + struct i40iw_dma_mem *sdbuf = &cqp->sdbuf; + u64 offset; + u32 wqe_idx; + + wqe = i40iw_sc_cqp_get_next_send_wqe_idx(cqp, scratch, &wqe_idx); + if (!wqe) + return I40IW_ERR_RING_FULL; + + I40IW_CQP_INIT_WQE(wqe); + wqe_entries = (info->cnt > 3) ? 3 : info->cnt; + mem_entries = info->cnt - wqe_entries; + + header = LS_64(I40IW_CQP_OP_UPDATE_PE_SDS, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID) | + LS_64(mem_entries, I40IW_CQPSQ_UPESD_ENTRY_COUNT); + + if (mem_entries) { + offset = wqe_idx * I40IW_UPDATE_SD_BUF_SIZE; + memcpy((char *)sdbuf->va + offset, &info->entry[3], + mem_entries << 4); + data = (u64)sdbuf->pa + offset; + } else { + data = 0; + } + data |= LS_64(info->hmc_fn_id, I40IW_CQPSQ_UPESD_HMCFNID); + + set_64bit_val(wqe, 16, data); + + switch (wqe_entries) { + case 3: + set_64bit_val(wqe, 48, + (LS_64(info->entry[2].cmd, I40IW_CQPSQ_UPESD_SDCMD) | + LS_64(1, I40IW_CQPSQ_UPESD_ENTRY_VALID))); + + set_64bit_val(wqe, 56, info->entry[2].data); + /* fallthrough */ + case 2: + set_64bit_val(wqe, 32, + (LS_64(info->entry[1].cmd, I40IW_CQPSQ_UPESD_SDCMD) | + LS_64(1, I40IW_CQPSQ_UPESD_ENTRY_VALID))); + + set_64bit_val(wqe, 40, info->entry[1].data); + /* fallthrough */ + case 1: + set_64bit_val(wqe, 0, + LS_64(info->entry[0].cmd, I40IW_CQPSQ_UPESD_SDCMD)); + + set_64bit_val(wqe, 8, info->entry[0].data); + break; + default: + break; + } + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "UPDATE_PE_SDS WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + return 0; +} + +/** + * i40iw_update_pe_sds - cqp wqe for sd + * @dev: ptr to i40iw_dev struct + * @info: sd info for sd's + * @scratch: u64 saved to be used during cqp completion + */ +static enum i40iw_status_code i40iw_update_pe_sds(struct i40iw_sc_dev *dev, + struct i40iw_update_sds_info *info, + u64 scratch) +{ + struct i40iw_sc_cqp *cqp = dev->cqp; + enum i40iw_status_code ret_code; + + ret_code = cqp_sds_wqe_fill(cqp, info, scratch); + if (!ret_code) + i40iw_sc_cqp_post_sq(cqp); + + return ret_code; +} + +/** + * i40iw_update_sds_noccq - update sd before ccq created + * @dev: sc device struct + * @info: sd info for sd's + */ +enum i40iw_status_code i40iw_update_sds_noccq(struct i40iw_sc_dev *dev, + struct i40iw_update_sds_info *info) +{ + u32 error, val, tail; + struct i40iw_sc_cqp *cqp = dev->cqp; + enum i40iw_status_code ret_code; + + ret_code = cqp_sds_wqe_fill(cqp, info, 0); + if (ret_code) + return ret_code; + i40iw_get_cqp_reg_info(cqp, &val, &tail, &error); + if (error) + return I40IW_ERR_CQP_COMPL_ERROR; + + i40iw_sc_cqp_post_sq(cqp); + ret_code = i40iw_cqp_poll_registers(cqp, tail, I40IW_DONE_COUNT); + + return ret_code; 
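+	/*
+	 * Note on the two SD-update paths above: both build the UPDATE_PE_SDS
+	 * WQE through cqp_sds_wqe_fill(); they differ only in completion
+	 * handling.  i40iw_update_pe_sds() posts to the CQP SQ and relies on
+	 * normal CQP completion processing, while i40iw_update_sds_noccq()
+	 * runs before the CCQ is created, so it posts the WQE and then polls
+	 * the CQP tail register (i40iw_cqp_poll_registers) up to
+	 * I40IW_DONE_COUNT times to detect completion.
+	 */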
+} + +/** + * i40iw_sc_suspend_qp - suspend qp for param change + * @cqp: struct for cqp hw + * @qp: sc qp struct + * @scratch: u64 saved to be used during cqp completion + */ +enum i40iw_status_code i40iw_sc_suspend_qp(struct i40iw_sc_cqp *cqp, + struct i40iw_sc_qp *qp, + u64 scratch) +{ + u64 header; + u64 *wqe; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + header = LS_64(qp->qp_uk.qp_id, I40IW_CQPSQ_SUSPENDQP_QPID) | + LS_64(I40IW_CQP_OP_SUSPEND_QP, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "SUSPEND_QP WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_resume_qp - resume qp after suspend + * @cqp: struct for cqp hw + * @qp: sc qp struct + * @scratch: u64 saved to be used during cqp completion + */ +enum i40iw_status_code i40iw_sc_resume_qp(struct i40iw_sc_cqp *cqp, + struct i40iw_sc_qp *qp, + u64 scratch) +{ + u64 header; + u64 *wqe; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, + 16, + LS_64(qp->qs_handle, I40IW_CQPSQ_RESUMEQP_QSHANDLE)); + + header = LS_64(qp->qp_uk.qp_id, I40IW_CQPSQ_RESUMEQP_QPID) | + LS_64(I40IW_CQP_OP_RESUME_QP, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "RESUME_QP WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +/** + * i40iw_sc_static_hmc_pages_allocated - cqp wqe to allocate hmc pages + * @cqp: struct for cqp hw + * @scratch: u64 saved to be used during cqp completion + * @hmc_fn_id: hmc function id + * @post_sq: flag for cqp db to ring + * @poll_registers: flag to poll register for cqp completion + */ +enum i40iw_status_code i40iw_sc_static_hmc_pages_allocated( + struct i40iw_sc_cqp *cqp, + u64 scratch, + u8 hmc_fn_id, + bool post_sq, + bool poll_registers) +{ + u64 header; + u64 *wqe; + u32 tail, val, error; + enum i40iw_status_code ret_code = 0; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + set_64bit_val(wqe, + 16, + LS_64(hmc_fn_id, I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID)); + + header = LS_64(I40IW_CQP_OP_SHMC_PAGES_ALLOCATED, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "SHMC_PAGES_ALLOCATED WQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + i40iw_get_cqp_reg_info(cqp, &val, &tail, &error); + if (error) { + ret_code = I40IW_ERR_CQP_COMPL_ERROR; + return ret_code; + } + if (post_sq) { + i40iw_sc_cqp_post_sq(cqp); + if (poll_registers) + /* check for cqp sq tail update */ + ret_code = i40iw_cqp_poll_registers(cqp, tail, 1000); + else + ret_code = i40iw_sc_poll_for_cqp_op_done(cqp, + I40IW_CQP_OP_SHMC_PAGES_ALLOCATED, + NULL); + } + + return ret_code; +} + +/** + * i40iw_ring_full - check if cqp ring is full + * @cqp: struct for cqp hw + */ +static bool i40iw_ring_full(struct i40iw_sc_cqp *cqp) +{ + return I40IW_RING_FULL_ERR(cqp->sq_ring); +} + +/** + * i40iw_est_sd - returns approximate number of SDs for HMC + * @dev: sc device struct + * @hmc_info: hmc structure, size and count for HMC objects + */ +static u64 i40iw_est_sd(struct i40iw_sc_dev *dev, struct i40iw_hmc_info *hmc_info) +{ + int i; + u64 size = 0; + u64 sd; + + for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_PBLE; i++) + 
size += hmc_info->hmc_obj[i].cnt * hmc_info->hmc_obj[i].size; + + if (dev->is_pf) + size += hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size; + + if (size & 0x1FFFFF) + sd = (size >> 21) + 1; /* add 1 for remainder */ + else + sd = size >> 21; + + if (!dev->is_pf) { + /* 2MB alignment for VF PBLE HMC */ + size = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size; + if (size & 0x1FFFFF) + sd += (size >> 21) + 1; /* add 1 for remainder */ + else + sd += size >> 21; + } + + return sd; +} + +/** + * i40iw_config_fpm_values - configure HMC objects + * @dev: sc device struct + * @qp_count: desired qp count + */ +enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_count) +{ + struct i40iw_virt_mem virt_mem; + u32 i, mem_size; + u32 qpwantedoriginal, qpwanted, mrwanted, pblewanted; + u64 sd_needed; + u32 loop_count = 0; + + struct i40iw_hmc_info *hmc_info; + struct i40iw_hmc_fpm_misc *hmc_fpm_misc; + enum i40iw_status_code ret_code = 0; + + hmc_info = dev->hmc_info; + hmc_fpm_misc = &dev->hmc_fpm_misc; + + ret_code = i40iw_sc_init_iw_hmc(dev, dev->hmc_fn_id); + if (ret_code) { + i40iw_debug(dev, I40IW_DEBUG_HMC, + "i40iw_sc_init_iw_hmc returned error_code = %d\n", + ret_code); + return ret_code; + } + + for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) + hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt; + sd_needed = i40iw_est_sd(dev, hmc_info); + i40iw_debug(dev, I40IW_DEBUG_HMC, + "%s: FW initial max sd_count[%08lld] first_sd_index[%04d]\n", + __func__, sd_needed, hmc_info->first_sd_index); + i40iw_debug(dev, I40IW_DEBUG_HMC, + "%s: sd count %d where max sd is %d\n", + __func__, hmc_info->sd_table.sd_cnt, + hmc_fpm_misc->max_sds); + + qpwanted = min(qp_count, hmc_info->hmc_obj[I40IW_HMC_IW_QP].max_cnt); + qpwantedoriginal = qpwanted; + mrwanted = hmc_info->hmc_obj[I40IW_HMC_IW_MR].max_cnt; + pblewanted = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].max_cnt; + + i40iw_debug(dev, I40IW_DEBUG_HMC, + "req_qp=%d max_sd=%d, max_qp = %d, max_cq=%d, max_mr=%d, max_pble=%d\n", + qp_count, hmc_fpm_misc->max_sds, + hmc_info->hmc_obj[I40IW_HMC_IW_QP].max_cnt, + hmc_info->hmc_obj[I40IW_HMC_IW_CQ].max_cnt, + hmc_info->hmc_obj[I40IW_HMC_IW_MR].max_cnt, + hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].max_cnt); + + do { + ++loop_count; + hmc_info->hmc_obj[I40IW_HMC_IW_QP].cnt = qpwanted; + hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt = + min(2 * qpwanted, hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt); + hmc_info->hmc_obj[I40IW_HMC_IW_SRQ].cnt = 0x00; /* Reserved */ + hmc_info->hmc_obj[I40IW_HMC_IW_HTE].cnt = + qpwanted * hmc_fpm_misc->ht_multiplier; + hmc_info->hmc_obj[I40IW_HMC_IW_ARP].cnt = + hmc_info->hmc_obj[I40IW_HMC_IW_ARP].max_cnt; + hmc_info->hmc_obj[I40IW_HMC_IW_APBVT_ENTRY].cnt = 1; + hmc_info->hmc_obj[I40IW_HMC_IW_MR].cnt = mrwanted; + + hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt = + roundup_pow_of_two(I40IW_MAX_WQ_ENTRIES * qpwanted); + hmc_info->hmc_obj[I40IW_HMC_IW_Q1].cnt = + roundup_pow_of_two(2 * I40IW_MAX_IRD_SIZE * qpwanted); + hmc_info->hmc_obj[I40IW_HMC_IW_XFFL].cnt = + hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt / hmc_fpm_misc->xf_block_size; + hmc_info->hmc_obj[I40IW_HMC_IW_Q1FL].cnt = + hmc_info->hmc_obj[I40IW_HMC_IW_Q1].cnt / hmc_fpm_misc->q1_block_size; + hmc_info->hmc_obj[I40IW_HMC_IW_TIMER].cnt = + ((qpwanted) / 512 + 1) * hmc_fpm_misc->timer_bucket; + hmc_info->hmc_obj[I40IW_HMC_IW_FSIMC].cnt = 0x00; + hmc_info->hmc_obj[I40IW_HMC_IW_FSIAV].cnt = 0x00; + hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt = pblewanted; + + /* 
How much memory is needed for all the objects. */ + sd_needed = i40iw_est_sd(dev, hmc_info); + if ((loop_count > 1000) || + ((!(loop_count % 10)) && + (qpwanted > qpwantedoriginal * 2 / 3))) { + if (qpwanted > FPM_MULTIPLIER) + qpwanted = roundup_pow_of_two(qpwanted - + FPM_MULTIPLIER); + qpwanted >>= 1; + } + if (mrwanted > FPM_MULTIPLIER * 10) + mrwanted -= FPM_MULTIPLIER * 10; + if (pblewanted > FPM_MULTIPLIER * 1000) + pblewanted -= FPM_MULTIPLIER * 1000; + } while (sd_needed > hmc_fpm_misc->max_sds && loop_count < 2000); + + i40iw_debug(dev, I40IW_DEBUG_HMC, + "loop_cnt=%d, sd_needed=%lld, qpcnt = %d, cqcnt=%d, mrcnt=%d, pblecnt=%d\n", + loop_count, sd_needed, + hmc_info->hmc_obj[I40IW_HMC_IW_QP].cnt, + hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt, + hmc_info->hmc_obj[I40IW_HMC_IW_MR].cnt, + hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt); + + ret_code = i40iw_sc_configure_iw_fpm(dev, dev->hmc_fn_id); + if (ret_code) { + i40iw_debug(dev, I40IW_DEBUG_HMC, + "configure_iw_fpm returned error_code[x%08X]\n", + i40iw_rd32(dev->hw, dev->is_pf ? I40E_PFPE_CQPERRCODES : I40E_VFPE_CQPERRCODES1)); + return ret_code; + } + + mem_size = sizeof(struct i40iw_hmc_sd_entry) * + (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index + 1); + ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size); + if (ret_code) { + i40iw_debug(dev, I40IW_DEBUG_HMC, + "%s: failed to allocate memory for sd_entry buffer\n", + __func__); + return ret_code; + } + hmc_info->sd_table.sd_entry = virt_mem.va; + + return ret_code; +} + +/** + * i40iw_exec_cqp_cmd - execute cqp cmd when wqe are available + * @dev: rdma device + * @pcmdinfo: cqp command info + */ +static enum i40iw_status_code i40iw_exec_cqp_cmd(struct i40iw_sc_dev *dev, + struct cqp_commands_info *pcmdinfo) +{ + enum i40iw_status_code status; + struct i40iw_dma_mem values_mem; + + dev->cqp_cmd_stats[pcmdinfo->cqp_cmd]++; + switch (pcmdinfo->cqp_cmd) { + case OP_DELETE_LOCAL_MAC_IPADDR_ENTRY: + status = i40iw_sc_del_local_mac_ipaddr_entry( + pcmdinfo->in.u.del_local_mac_ipaddr_entry.cqp, + pcmdinfo->in.u.del_local_mac_ipaddr_entry.scratch, + pcmdinfo->in.u.del_local_mac_ipaddr_entry.entry_idx, + pcmdinfo->in.u.del_local_mac_ipaddr_entry.ignore_ref_count, + pcmdinfo->post_sq); + break; + case OP_CEQ_DESTROY: + status = i40iw_sc_ceq_destroy(pcmdinfo->in.u.ceq_destroy.ceq, + pcmdinfo->in.u.ceq_destroy.scratch, + pcmdinfo->post_sq); + break; + case OP_AEQ_DESTROY: + status = i40iw_sc_aeq_destroy(pcmdinfo->in.u.aeq_destroy.aeq, + pcmdinfo->in.u.aeq_destroy.scratch, + pcmdinfo->post_sq); + + break; + case OP_DELETE_ARP_CACHE_ENTRY: + status = i40iw_sc_del_arp_cache_entry( + pcmdinfo->in.u.del_arp_cache_entry.cqp, + pcmdinfo->in.u.del_arp_cache_entry.scratch, + pcmdinfo->in.u.del_arp_cache_entry.arp_index, + pcmdinfo->post_sq); + break; + case OP_MANAGE_APBVT_ENTRY: + status = i40iw_sc_manage_apbvt_entry( + pcmdinfo->in.u.manage_apbvt_entry.cqp, + &pcmdinfo->in.u.manage_apbvt_entry.info, + pcmdinfo->in.u.manage_apbvt_entry.scratch, + pcmdinfo->post_sq); + break; + case OP_CEQ_CREATE: + status = i40iw_sc_ceq_create(pcmdinfo->in.u.ceq_create.ceq, + pcmdinfo->in.u.ceq_create.scratch, + pcmdinfo->post_sq); + break; + case OP_AEQ_CREATE: + status = i40iw_sc_aeq_create(pcmdinfo->in.u.aeq_create.aeq, + pcmdinfo->in.u.aeq_create.scratch, + pcmdinfo->post_sq); + break; + case OP_ALLOC_LOCAL_MAC_IPADDR_ENTRY: + status = i40iw_sc_alloc_local_mac_ipaddr_entry( + pcmdinfo->in.u.alloc_local_mac_ipaddr_entry.cqp, + pcmdinfo->in.u.alloc_local_mac_ipaddr_entry.scratch, + 
pcmdinfo->post_sq); + break; + case OP_ADD_LOCAL_MAC_IPADDR_ENTRY: + status = i40iw_sc_add_local_mac_ipaddr_entry( + pcmdinfo->in.u.add_local_mac_ipaddr_entry.cqp, + &pcmdinfo->in.u.add_local_mac_ipaddr_entry.info, + pcmdinfo->in.u.add_local_mac_ipaddr_entry.scratch, + pcmdinfo->post_sq); + break; + case OP_MANAGE_QHASH_TABLE_ENTRY: + status = i40iw_sc_manage_qhash_table_entry( + pcmdinfo->in.u.manage_qhash_table_entry.cqp, + &pcmdinfo->in.u.manage_qhash_table_entry.info, + pcmdinfo->in.u.manage_qhash_table_entry.scratch, + pcmdinfo->post_sq); + + break; + case OP_QP_MODIFY: + status = i40iw_sc_qp_modify( + pcmdinfo->in.u.qp_modify.qp, + &pcmdinfo->in.u.qp_modify.info, + pcmdinfo->in.u.qp_modify.scratch, + pcmdinfo->post_sq); + + break; + case OP_QP_UPLOAD_CONTEXT: + status = i40iw_sc_qp_upload_context( + pcmdinfo->in.u.qp_upload_context.dev, + &pcmdinfo->in.u.qp_upload_context.info, + pcmdinfo->in.u.qp_upload_context.scratch, + pcmdinfo->post_sq); + + break; + case OP_CQ_CREATE: + status = i40iw_sc_cq_create( + pcmdinfo->in.u.cq_create.cq, + pcmdinfo->in.u.cq_create.scratch, + pcmdinfo->in.u.cq_create.check_overflow, + pcmdinfo->post_sq); + break; + case OP_CQ_DESTROY: + status = i40iw_sc_cq_destroy( + pcmdinfo->in.u.cq_destroy.cq, + pcmdinfo->in.u.cq_destroy.scratch, + pcmdinfo->post_sq); + + break; + case OP_QP_CREATE: + status = i40iw_sc_qp_create( + pcmdinfo->in.u.qp_create.qp, + &pcmdinfo->in.u.qp_create.info, + pcmdinfo->in.u.qp_create.scratch, + pcmdinfo->post_sq); + break; + case OP_QP_DESTROY: + status = i40iw_sc_qp_destroy( + pcmdinfo->in.u.qp_destroy.qp, + pcmdinfo->in.u.qp_destroy.scratch, + pcmdinfo->in.u.qp_destroy.remove_hash_idx, + pcmdinfo->in.u.qp_destroy. + ignore_mw_bnd, + pcmdinfo->post_sq); + + break; + case OP_ALLOC_STAG: + status = i40iw_sc_alloc_stag( + pcmdinfo->in.u.alloc_stag.dev, + &pcmdinfo->in.u.alloc_stag.info, + pcmdinfo->in.u.alloc_stag.scratch, + pcmdinfo->post_sq); + break; + case OP_MR_REG_NON_SHARED: + status = i40iw_sc_mr_reg_non_shared( + pcmdinfo->in.u.mr_reg_non_shared.dev, + &pcmdinfo->in.u.mr_reg_non_shared.info, + pcmdinfo->in.u.mr_reg_non_shared.scratch, + pcmdinfo->post_sq); + + break; + case OP_DEALLOC_STAG: + status = i40iw_sc_dealloc_stag( + pcmdinfo->in.u.dealloc_stag.dev, + &pcmdinfo->in.u.dealloc_stag.info, + pcmdinfo->in.u.dealloc_stag.scratch, + pcmdinfo->post_sq); + + break; + case OP_MW_ALLOC: + status = i40iw_sc_mw_alloc( + pcmdinfo->in.u.mw_alloc.dev, + pcmdinfo->in.u.mw_alloc.scratch, + pcmdinfo->in.u.mw_alloc.mw_stag_index, + pcmdinfo->in.u.mw_alloc.pd_id, + pcmdinfo->post_sq); + + break; + case OP_QP_FLUSH_WQES: + status = i40iw_sc_qp_flush_wqes( + pcmdinfo->in.u.qp_flush_wqes.qp, + &pcmdinfo->in.u.qp_flush_wqes.info, + pcmdinfo->in.u.qp_flush_wqes. 
+ scratch, pcmdinfo->post_sq); + break; + case OP_GEN_AE: + status = i40iw_sc_gen_ae( + pcmdinfo->in.u.gen_ae.qp, + &pcmdinfo->in.u.gen_ae.info, + pcmdinfo->in.u.gen_ae.scratch, + pcmdinfo->post_sq); + break; + case OP_ADD_ARP_CACHE_ENTRY: + status = i40iw_sc_add_arp_cache_entry( + pcmdinfo->in.u.add_arp_cache_entry.cqp, + &pcmdinfo->in.u.add_arp_cache_entry.info, + pcmdinfo->in.u.add_arp_cache_entry.scratch, + pcmdinfo->post_sq); + break; + case OP_MANAGE_PUSH_PAGE: + status = i40iw_sc_manage_push_page( + pcmdinfo->in.u.manage_push_page.cqp, + &pcmdinfo->in.u.manage_push_page.info, + pcmdinfo->in.u.manage_push_page.scratch, + pcmdinfo->post_sq); + break; + case OP_UPDATE_PE_SDS: + /* case I40IW_CQP_OP_UPDATE_PE_SDS */ + status = i40iw_update_pe_sds( + pcmdinfo->in.u.update_pe_sds.dev, + &pcmdinfo->in.u.update_pe_sds.info, + pcmdinfo->in.u.update_pe_sds. + scratch); + + break; + case OP_MANAGE_HMC_PM_FUNC_TABLE: + status = i40iw_sc_manage_hmc_pm_func_table( + pcmdinfo->in.u.manage_hmc_pm.dev->cqp, + pcmdinfo->in.u.manage_hmc_pm.scratch, + (u8)pcmdinfo->in.u.manage_hmc_pm.info.vf_id, + pcmdinfo->in.u.manage_hmc_pm.info.free_fcn, + true); + break; + case OP_SUSPEND: + status = i40iw_sc_suspend_qp( + pcmdinfo->in.u.suspend_resume.cqp, + pcmdinfo->in.u.suspend_resume.qp, + pcmdinfo->in.u.suspend_resume.scratch); + break; + case OP_RESUME: + status = i40iw_sc_resume_qp( + pcmdinfo->in.u.suspend_resume.cqp, + pcmdinfo->in.u.suspend_resume.qp, + pcmdinfo->in.u.suspend_resume.scratch); + break; + case OP_MANAGE_VF_PBLE_BP: + status = i40iw_manage_vf_pble_bp( + pcmdinfo->in.u.manage_vf_pble_bp.cqp, + &pcmdinfo->in.u.manage_vf_pble_bp.info, + pcmdinfo->in.u.manage_vf_pble_bp.scratch, true); + break; + case OP_QUERY_FPM_VALUES: + values_mem.pa = pcmdinfo->in.u.query_fpm_values.fpm_values_pa; + values_mem.va = pcmdinfo->in.u.query_fpm_values.fpm_values_va; + status = i40iw_sc_query_fpm_values( + pcmdinfo->in.u.query_fpm_values.cqp, + pcmdinfo->in.u.query_fpm_values.scratch, + pcmdinfo->in.u.query_fpm_values.hmc_fn_id, + &values_mem, true, I40IW_CQP_WAIT_EVENT); + break; + case OP_COMMIT_FPM_VALUES: + values_mem.pa = pcmdinfo->in.u.commit_fpm_values.fpm_values_pa; + values_mem.va = pcmdinfo->in.u.commit_fpm_values.fpm_values_va; + status = i40iw_sc_commit_fpm_values( + pcmdinfo->in.u.commit_fpm_values.cqp, + pcmdinfo->in.u.commit_fpm_values.scratch, + pcmdinfo->in.u.commit_fpm_values.hmc_fn_id, + &values_mem, + true, + I40IW_CQP_WAIT_EVENT); + break; + default: + status = I40IW_NOT_SUPPORTED; + break; + } + + return status; +} + +/** + * i40iw_process_cqp_cmd - process all cqp commands + * @dev: sc device struct + * @pcmdinfo: cqp command info + */ +enum i40iw_status_code i40iw_process_cqp_cmd(struct i40iw_sc_dev *dev, + struct cqp_commands_info *pcmdinfo) +{ + enum i40iw_status_code status = 0; + unsigned long flags; + + spin_lock_irqsave(&dev->cqp_lock, flags); + if (list_empty(&dev->cqp_cmd_head) && !i40iw_ring_full(dev->cqp)) + status = i40iw_exec_cqp_cmd(dev, pcmdinfo); + else + list_add_tail(&pcmdinfo->cqp_cmd_entry, &dev->cqp_cmd_head); + spin_unlock_irqrestore(&dev->cqp_lock, flags); + return status; +} + +/** + * i40iw_process_bh - called from tasklet for cqp list + * @dev: sc device struct + */ +enum i40iw_status_code i40iw_process_bh(struct i40iw_sc_dev *dev) +{ + enum i40iw_status_code status = 0; + struct cqp_commands_info *pcmdinfo; + unsigned long flags; + + spin_lock_irqsave(&dev->cqp_lock, flags); + while (!list_empty(&dev->cqp_cmd_head) && !i40iw_ring_full(dev->cqp)) { + pcmdinfo = 
(struct cqp_commands_info *)i40iw_remove_head(&dev->cqp_cmd_head); + + status = i40iw_exec_cqp_cmd(dev, pcmdinfo); + if (status) + break; + } + spin_unlock_irqrestore(&dev->cqp_lock, flags); + return status; +} + +/** + * i40iw_iwarp_opcode - determine if incoming is rdma layer + * @info: aeq info for the packet + * @pkt: packet for error + */ +static u32 i40iw_iwarp_opcode(struct i40iw_aeqe_info *info, u8 *pkt) +{ + __be16 *mpa; + u32 opcode = 0xffffffff; + + if (info->q2_data_written) { + mpa = (__be16 *)pkt; + opcode = ntohs(mpa[1]) & 0xf; + } + return opcode; +} + +/** + * i40iw_locate_mpa - return pointer to mpa in the pkt + * @pkt: packet with data + */ +static u8 *i40iw_locate_mpa(u8 *pkt) +{ + /* skip over ethernet header */ + pkt += I40IW_MAC_HLEN; + + /* Skip over IP and TCP headers */ + pkt += 4 * (pkt[0] & 0x0f); + pkt += 4 * ((pkt[12] >> 4) & 0x0f); + return pkt; +} + +/** + * i40iw_setup_termhdr - termhdr for terminate pkt + * @qp: sc qp ptr for pkt + * @hdr: term hdr + * @opcode: flush opcode for termhdr + * @layer_etype: error layer + error type + * @err: error cod ein the header + */ +static void i40iw_setup_termhdr(struct i40iw_sc_qp *qp, + struct i40iw_terminate_hdr *hdr, + enum i40iw_flush_opcode opcode, + u8 layer_etype, + u8 err) +{ + qp->flush_code = opcode; + hdr->layer_etype = layer_etype; + hdr->error_code = err; +} + +/** + * i40iw_bld_terminate_hdr - build terminate message header + * @qp: qp associated with received terminate AE + * @info: the struct contiaing AE information + */ +static int i40iw_bld_terminate_hdr(struct i40iw_sc_qp *qp, + struct i40iw_aeqe_info *info) +{ + u8 *pkt = qp->q2_buf + Q2_BAD_FRAME_OFFSET; + u16 ddp_seg_len; + int copy_len = 0; + u8 is_tagged = 0; + u32 opcode; + struct i40iw_terminate_hdr *termhdr; + + termhdr = (struct i40iw_terminate_hdr *)qp->q2_buf; + memset(termhdr, 0, Q2_BAD_FRAME_OFFSET); + + if (info->q2_data_written) { + /* Use data from offending packet to fill in ddp & rdma hdrs */ + pkt = i40iw_locate_mpa(pkt); + ddp_seg_len = ntohs(*(__be16 *)pkt); + if (ddp_seg_len) { + copy_len = 2; + termhdr->hdrct = DDP_LEN_FLAG; + if (pkt[2] & 0x80) { + is_tagged = 1; + if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) { + copy_len += TERM_DDP_LEN_TAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + } else { + if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) { + copy_len += TERM_DDP_LEN_UNTAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + + if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) { + if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) { + copy_len += TERM_RDMA_LEN; + termhdr->hdrct |= RDMA_HDR_FLAG; + } + } + } + } + } + + opcode = i40iw_iwarp_opcode(info, pkt); + + switch (info->ae_id) { + case I40IW_AE_AMP_UNALLOCATED_STAG: + qp->eventtype = TERM_EVENT_QP_ACCESS_ERR; + if (opcode == I40IW_OP_TYPE_RDMA_WRITE) + i40iw_setup_termhdr(qp, termhdr, FLUSH_PROT_ERR, + (LAYER_DDP << 4) | DDP_TAGGED_BUFFER, DDP_TAGGED_INV_STAG); + else + i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_INV_STAG); + break; + case I40IW_AE_AMP_BOUNDS_VIOLATION: + qp->eventtype = TERM_EVENT_QP_ACCESS_ERR; + if (info->q2_data_written) + i40iw_setup_termhdr(qp, termhdr, FLUSH_PROT_ERR, + (LAYER_DDP << 4) | DDP_TAGGED_BUFFER, DDP_TAGGED_BOUNDS); + else + i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_INV_BOUNDS); + break; + case I40IW_AE_AMP_BAD_PD: + switch (opcode) { + case I40IW_OP_TYPE_RDMA_WRITE: + i40iw_setup_termhdr(qp, termhdr, FLUSH_PROT_ERR, + 
(LAYER_DDP << 4) | DDP_TAGGED_BUFFER, DDP_TAGGED_UNASSOC_STAG); + break; + case I40IW_OP_TYPE_SEND_INV: + case I40IW_OP_TYPE_SEND_SOL_INV: + i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_CANT_INV_STAG); + break; + default: + i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_UNASSOC_STAG); + } + break; + case I40IW_AE_AMP_INVALID_STAG: + qp->eventtype = TERM_EVENT_QP_ACCESS_ERR; + i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_INV_STAG); + break; + case I40IW_AE_AMP_BAD_QP: + i40iw_setup_termhdr(qp, termhdr, FLUSH_LOC_QP_OP_ERR, + (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_QN); + break; + case I40IW_AE_AMP_BAD_STAG_KEY: + case I40IW_AE_AMP_BAD_STAG_INDEX: + qp->eventtype = TERM_EVENT_QP_ACCESS_ERR; + switch (opcode) { + case I40IW_OP_TYPE_SEND_INV: + case I40IW_OP_TYPE_SEND_SOL_INV: + i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_OP_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_OP, RDMAP_CANT_INV_STAG); + break; + default: + i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_OP, RDMAP_INV_STAG); + } + break; + case I40IW_AE_AMP_RIGHTS_VIOLATION: + case I40IW_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case I40IW_AE_PRIV_OPERATION_DENIED: + qp->eventtype = TERM_EVENT_QP_ACCESS_ERR; + i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_ACCESS); + break; + case I40IW_AE_AMP_TO_WRAP: + qp->eventtype = TERM_EVENT_QP_ACCESS_ERR; + i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_TO_WRAP); + break; + case I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR: + i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR, + (LAYER_MPA << 4) | DDP_LLP, MPA_CRC); + break; + case I40IW_AE_LLP_SEGMENT_TOO_LARGE: + case I40IW_AE_LLP_SEGMENT_TOO_SMALL: + i40iw_setup_termhdr(qp, termhdr, FLUSH_LOC_LEN_ERR, + (LAYER_DDP << 4) | DDP_CATASTROPHIC, DDP_CATASTROPHIC_LOCAL); + break; + case I40IW_AE_LCE_QP_CATASTROPHIC: + case I40IW_AE_DDP_NO_L_BIT: + i40iw_setup_termhdr(qp, termhdr, FLUSH_FATAL_ERR, + (LAYER_DDP << 4) | DDP_CATASTROPHIC, DDP_CATASTROPHIC_LOCAL); + break; + case I40IW_AE_DDP_INVALID_MSN_GAP_IN_MSN: + i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR, + (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_MSN_RANGE); + break; + case I40IW_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + qp->eventtype = TERM_EVENT_QP_ACCESS_ERR; + i40iw_setup_termhdr(qp, termhdr, FLUSH_LOC_LEN_ERR, + (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_TOO_LONG); + break; + case I40IW_AE_DDP_UBE_INVALID_DDP_VERSION: + if (is_tagged) + i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR, + (LAYER_DDP << 4) | DDP_TAGGED_BUFFER, DDP_TAGGED_INV_DDP_VER); + else + i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR, + (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_DDP_VER); + break; + case I40IW_AE_DDP_UBE_INVALID_MO: + i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR, + (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_MO); + break; + case I40IW_AE_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_OP_ERR, + (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_MSN_NO_BUF); + break; + case I40IW_AE_DDP_UBE_INVALID_QN: + i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR, + (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_QN); + break; + case 
I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION: + i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_OP, RDMAP_INV_RDMAP_VER); + break; + case I40IW_AE_RDMAP_ROE_UNEXPECTED_OPCODE: + i40iw_setup_termhdr(qp, termhdr, FLUSH_LOC_QP_OP_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_OP, RDMAP_UNEXPECTED_OP); + break; + default: + i40iw_setup_termhdr(qp, termhdr, FLUSH_FATAL_ERR, + (LAYER_RDMA << 4) | RDMAP_REMOTE_OP, RDMAP_UNSPECIFIED); + break; + } + + if (copy_len) + memcpy(termhdr + 1, pkt, copy_len); + + return sizeof(struct i40iw_terminate_hdr) + copy_len; +} + +/** + * i40iw_terminate_send_fin() - Send fin for terminate message + * @qp: qp associated with received terminate AE + */ +void i40iw_terminate_send_fin(struct i40iw_sc_qp *qp) +{ + /* Send the fin only */ + i40iw_term_modify_qp(qp, + I40IW_QP_STATE_TERMINATE, + I40IWQP_TERM_SEND_FIN_ONLY, + 0); +} + +/** + * i40iw_terminate_connection() - Bad AE and send terminate to remote QP + * @qp: qp associated with received terminate AE + * @info: the struct contiaing AE information + */ +void i40iw_terminate_connection(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info) +{ + u8 termlen = 0; + + if (qp->term_flags & I40IW_TERM_SENT) + return; /* Sanity check */ + + /* Eventtype can change from bld_terminate_hdr */ + qp->eventtype = TERM_EVENT_QP_FATAL; + termlen = i40iw_bld_terminate_hdr(qp, info); + i40iw_terminate_start_timer(qp); + qp->term_flags |= I40IW_TERM_SENT; + i40iw_term_modify_qp(qp, I40IW_QP_STATE_TERMINATE, + I40IWQP_TERM_SEND_TERM_ONLY, termlen); +} + +/** + * i40iw_terminate_received - handle terminate received AE + * @qp: qp associated with received terminate AE + * @info: the struct contiaing AE information + */ +void i40iw_terminate_received(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info) +{ + u8 *pkt = qp->q2_buf + Q2_BAD_FRAME_OFFSET; + __be32 *mpa; + u8 ddp_ctl; + u8 rdma_ctl; + u16 aeq_id = 0; + struct i40iw_terminate_hdr *termhdr; + + mpa = (__be32 *)i40iw_locate_mpa(pkt); + if (info->q2_data_written) { + /* did not validate the frame - do it now */ + ddp_ctl = (ntohl(mpa[0]) >> 8) & 0xff; + rdma_ctl = ntohl(mpa[0]) & 0xff; + if ((ddp_ctl & 0xc0) != 0x40) + aeq_id = I40IW_AE_LCE_QP_CATASTROPHIC; + else if ((ddp_ctl & 0x03) != 1) + aeq_id = I40IW_AE_DDP_UBE_INVALID_DDP_VERSION; + else if (ntohl(mpa[2]) != 2) + aeq_id = I40IW_AE_DDP_UBE_INVALID_QN; + else if (ntohl(mpa[3]) != 1) + aeq_id = I40IW_AE_DDP_INVALID_MSN_GAP_IN_MSN; + else if (ntohl(mpa[4]) != 0) + aeq_id = I40IW_AE_DDP_UBE_INVALID_MO; + else if ((rdma_ctl & 0xc0) != 0x40) + aeq_id = I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION; + + info->ae_id = aeq_id; + if (info->ae_id) { + /* Bad terminate recvd - send back a terminate */ + i40iw_terminate_connection(qp, info); + return; + } + } + + qp->term_flags |= I40IW_TERM_RCVD; + qp->eventtype = TERM_EVENT_QP_FATAL; + termhdr = (struct i40iw_terminate_hdr *)&mpa[5]; + if (termhdr->layer_etype == RDMAP_REMOTE_PROT || + termhdr->layer_etype == RDMAP_REMOTE_OP) { + i40iw_terminate_done(qp, 0); + } else { + i40iw_terminate_start_timer(qp); + i40iw_terminate_send_fin(qp); + } +} + +/** + * i40iw_sc_vsi_init - Initialize virtual device + * @vsi: pointer to the vsi structure + * @info: parameters to initialize vsi + **/ +void i40iw_sc_vsi_init(struct i40iw_sc_vsi *vsi, struct i40iw_vsi_init_info *info) +{ + int i; + + vsi->dev = info->dev; + vsi->back_vsi = info->back_vsi; + vsi->mtu = info->params->mtu; + vsi->exception_lan_queue = info->exception_lan_queue; + 
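	/*
	 * (Editorial annotation, not part of the original patch.) The loop
	 * below copies each user priority's qset handle from
	 * info->params->qs_handle_list into vsi->qos[] and initializes the
	 * per-priority qplist and its spinlock, so later per-traffic-class
	 * QP bookkeeping has one list/lock pair per priority.
	 */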
i40iw_fill_qos_list(info->params->qs_handle_list); + + for (i = 0; i < I40IW_MAX_USER_PRIORITY; i++) { + vsi->qos[i].qs_handle = info->params->qs_handle_list[i]; + i40iw_debug(vsi->dev, I40IW_DEBUG_DCB, "qset[%d]: %d\n", i, + vsi->qos[i].qs_handle); + spin_lock_init(&vsi->qos[i].lock); + INIT_LIST_HEAD(&vsi->qos[i].qplist); + } +} + +/** + * i40iw_hw_stats_init - Initiliaze HW stats table + * @stats: pestat struct + * @fcn_idx: PCI fn id + * @is_pf: Is it a PF? + * + * Populate the HW stats table with register offset addr for each + * stats. And start the perioidic stats timer. + */ +void i40iw_hw_stats_init(struct i40iw_vsi_pestat *stats, u8 fcn_idx, bool is_pf) +{ + u32 stats_reg_offset; + u32 stats_index; + struct i40iw_dev_hw_stats_offsets *stats_table = + &stats->hw_stats_offsets; + struct i40iw_dev_hw_stats *last_rd_stats = &stats->last_read_hw_stats; + + if (is_pf) { + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4RXDISCARD] = + I40E_GLPES_PFIP4RXDISCARD(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4RXTRUNC] = + I40E_GLPES_PFIP4RXTRUNC(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] = + I40E_GLPES_PFIP4TXNOROUTE(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6RXDISCARD] = + I40E_GLPES_PFIP6RXDISCARD(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6RXTRUNC] = + I40E_GLPES_PFIP6RXTRUNC(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6TXNOROUTE] = + I40E_GLPES_PFIP6TXNOROUTE(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRTXSEG] = + I40E_GLPES_PFTCPRTXSEG(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRXOPTERR] = + I40E_GLPES_PFTCPRXOPTERR(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRXPROTOERR] = + I40E_GLPES_PFTCPRXPROTOERR(fcn_idx); + + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXOCTS] = + I40E_GLPES_PFIP4RXOCTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] = + I40E_GLPES_PFIP4RXPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXFRAGS] = + I40E_GLPES_PFIP4RXFRAGSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXMCPKTS] = + I40E_GLPES_PFIP4RXMCPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXOCTS] = + I40E_GLPES_PFIP4TXOCTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXPKTS] = + I40E_GLPES_PFIP4TXPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXFRAGS] = + I40E_GLPES_PFIP4TXFRAGSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXMCPKTS] = + I40E_GLPES_PFIP4TXMCPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXOCTS] = + I40E_GLPES_PFIP6RXOCTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXPKTS] = + I40E_GLPES_PFIP6RXPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXFRAGS] = + I40E_GLPES_PFIP6RXFRAGSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXMCPKTS] = + I40E_GLPES_PFIP6RXMCPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXOCTS] = + I40E_GLPES_PFIP6TXOCTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXPKTS] = + I40E_GLPES_PFIP6TXPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXPKTS] = + I40E_GLPES_PFIP6TXPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXFRAGS] = + I40E_GLPES_PFIP6TXFRAGSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_TCPRXSEGS] = + 
I40E_GLPES_PFTCPRXSEGSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_TCPTXSEG] = + I40E_GLPES_PFTCPTXSEGLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXRDS] = + I40E_GLPES_PFRDMARXRDSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXSNDS] = + I40E_GLPES_PFRDMARXSNDSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXWRS] = + I40E_GLPES_PFRDMARXWRSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXRDS] = + I40E_GLPES_PFRDMATXRDSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXSNDS] = + I40E_GLPES_PFRDMATXSNDSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXWRS] = + I40E_GLPES_PFRDMATXWRSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMAVBND] = + I40E_GLPES_PFRDMAVBNDLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMAVINV] = + I40E_GLPES_PFRDMAVINVLO(fcn_idx); + } else { + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4RXDISCARD] = + I40E_GLPES_VFIP4RXDISCARD(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4RXTRUNC] = + I40E_GLPES_VFIP4RXTRUNC(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] = + I40E_GLPES_VFIP4TXNOROUTE(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6RXDISCARD] = + I40E_GLPES_VFIP6RXDISCARD(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6RXTRUNC] = + I40E_GLPES_VFIP6RXTRUNC(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6TXNOROUTE] = + I40E_GLPES_VFIP6TXNOROUTE(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRTXSEG] = + I40E_GLPES_VFTCPRTXSEG(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRXOPTERR] = + I40E_GLPES_VFTCPRXOPTERR(fcn_idx); + stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRXPROTOERR] = + I40E_GLPES_VFTCPRXPROTOERR(fcn_idx); + + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXOCTS] = + I40E_GLPES_VFIP4RXOCTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] = + I40E_GLPES_VFIP4RXPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXFRAGS] = + I40E_GLPES_VFIP4RXFRAGSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXMCPKTS] = + I40E_GLPES_VFIP4RXMCPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXOCTS] = + I40E_GLPES_VFIP4TXOCTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXPKTS] = + I40E_GLPES_VFIP4TXPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXFRAGS] = + I40E_GLPES_VFIP4TXFRAGSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXMCPKTS] = + I40E_GLPES_VFIP4TXMCPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXOCTS] = + I40E_GLPES_VFIP6RXOCTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXPKTS] = + I40E_GLPES_VFIP6RXPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXFRAGS] = + I40E_GLPES_VFIP6RXFRAGSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXMCPKTS] = + I40E_GLPES_VFIP6RXMCPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXOCTS] = + I40E_GLPES_VFIP6TXOCTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXPKTS] = + I40E_GLPES_VFIP6TXPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXPKTS] = + I40E_GLPES_VFIP6TXPKTSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXFRAGS] = + I40E_GLPES_VFIP6TXFRAGSLO(fcn_idx); + 
stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_TCPRXSEGS] = + I40E_GLPES_VFTCPRXSEGSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_TCPTXSEG] = + I40E_GLPES_VFTCPTXSEGLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXRDS] = + I40E_GLPES_VFRDMARXRDSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXSNDS] = + I40E_GLPES_VFRDMARXSNDSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXWRS] = + I40E_GLPES_VFRDMARXWRSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXRDS] = + I40E_GLPES_VFRDMATXRDSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXSNDS] = + I40E_GLPES_VFRDMATXSNDSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXWRS] = + I40E_GLPES_VFRDMATXWRSLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMAVBND] = + I40E_GLPES_VFRDMAVBNDLO(fcn_idx); + stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMAVINV] = + I40E_GLPES_VFRDMAVINVLO(fcn_idx); + } + + for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_64; + stats_index++) { + stats_reg_offset = stats_table->stats_offset_64[stats_index]; + last_rd_stats->stats_value_64[stats_index] = + readq(stats->hw->hw_addr + stats_reg_offset); + } + + for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_32; + stats_index++) { + stats_reg_offset = stats_table->stats_offset_32[stats_index]; + last_rd_stats->stats_value_32[stats_index] = + i40iw_rd32(stats->hw, stats_reg_offset); + } +} + +/** + * i40iw_hw_stats_read_32 - Read 32-bit HW stats counters and accommodates for roll-overs. + * @stat: pestat struct + * @index: index in HW stats table which contains offset reg-addr + * @value: hw stats value + */ +void i40iw_hw_stats_read_32(struct i40iw_vsi_pestat *stats, + enum i40iw_hw_stats_index_32b index, + u64 *value) +{ + struct i40iw_dev_hw_stats_offsets *stats_table = + &stats->hw_stats_offsets; + struct i40iw_dev_hw_stats *last_rd_stats = &stats->last_read_hw_stats; + struct i40iw_dev_hw_stats *hw_stats = &stats->hw_stats; + u64 new_stats_value = 0; + u32 stats_reg_offset = stats_table->stats_offset_32[index]; + + new_stats_value = i40iw_rd32(stats->hw, stats_reg_offset); + /*roll-over case */ + if (new_stats_value < last_rd_stats->stats_value_32[index]) + hw_stats->stats_value_32[index] += new_stats_value; + else + hw_stats->stats_value_32[index] += + new_stats_value - last_rd_stats->stats_value_32[index]; + last_rd_stats->stats_value_32[index] = new_stats_value; + *value = hw_stats->stats_value_32[index]; +} + +/** + * i40iw_hw_stats_read_64 - Read HW stats counters (greater than 32-bit) and accommodates for roll-overs. 
+ * @stats: pestat struct + * @index: index in HW stats table which contains offset reg-addr + * @value: hw stats value + */ +void i40iw_hw_stats_read_64(struct i40iw_vsi_pestat *stats, + enum i40iw_hw_stats_index_64b index, + u64 *value) +{ + struct i40iw_dev_hw_stats_offsets *stats_table = + &stats->hw_stats_offsets; + struct i40iw_dev_hw_stats *last_rd_stats = &stats->last_read_hw_stats; + struct i40iw_dev_hw_stats *hw_stats = &stats->hw_stats; + u64 new_stats_value = 0; + u32 stats_reg_offset = stats_table->stats_offset_64[index]; + + new_stats_value = readq(stats->hw->hw_addr + stats_reg_offset); + /*roll-over case */ + if (new_stats_value < last_rd_stats->stats_value_64[index]) + hw_stats->stats_value_64[index] += new_stats_value; + else + hw_stats->stats_value_64[index] += + new_stats_value - last_rd_stats->stats_value_64[index]; + last_rd_stats->stats_value_64[index] = new_stats_value; + *value = hw_stats->stats_value_64[index]; +} + +/** + * i40iw_hw_stats_read_all - read all HW stat counters + * @stats: pestat struct + * @stats_values: hw stats structure + * + * Read all the HW stat counters and populates hw_stats structure + * of passed-in vsi's pestat as well as copy created in stat_values. + */ +void i40iw_hw_stats_read_all(struct i40iw_vsi_pestat *stats, + struct i40iw_dev_hw_stats *stats_values) +{ + u32 stats_index; + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + + for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_32; + stats_index++) + i40iw_hw_stats_read_32(stats, stats_index, + &stats_values->stats_value_32[stats_index]); + for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_64; + stats_index++) + i40iw_hw_stats_read_64(stats, stats_index, + &stats_values->stats_value_64[stats_index]); + spin_unlock_irqrestore(&stats->lock, flags); +} + +/** + * i40iw_hw_stats_refresh_all - Update all HW stats structs + * @stats: pestat struct + * + * Read all the HW stats counters to refresh values in hw_stats structure + * of passed-in dev's pestat + */ +void i40iw_hw_stats_refresh_all(struct i40iw_vsi_pestat *stats) +{ + u64 stats_value; + u32 stats_index; + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + + for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_32; + stats_index++) + i40iw_hw_stats_read_32(stats, stats_index, &stats_value); + for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_64; + stats_index++) + i40iw_hw_stats_read_64(stats, stats_index, &stats_value); + spin_unlock_irqrestore(&stats->lock, flags); +} + +/** + * i40iw_get_fcn_id - Return the function id + * @dev: pointer to the device + */ +static u8 i40iw_get_fcn_id(struct i40iw_sc_dev *dev) +{ + u8 fcn_id = I40IW_INVALID_FCN_ID; + u8 i; + + for (i = I40IW_FIRST_NON_PF_STAT; i < I40IW_MAX_STATS_COUNT; i++) + if (!dev->fcn_id_array[i]) { + fcn_id = i; + dev->fcn_id_array[i] = true; + break; + } + return fcn_id; +} + +/** + * i40iw_vsi_stats_init - Initialize the vsi statistics + * @vsi: pointer to the vsi structure + * @info: The info structure used for initialization + */ +enum i40iw_status_code i40iw_vsi_stats_init(struct i40iw_sc_vsi *vsi, struct i40iw_vsi_stats_info *info) +{ + u8 fcn_id = info->fcn_id; + + if (info->alloc_fcn_id) + fcn_id = i40iw_get_fcn_id(vsi->dev); + + if (fcn_id == I40IW_INVALID_FCN_ID) + return I40IW_ERR_NOT_READY; + + vsi->pestat = info->pestat; + vsi->pestat->hw = vsi->dev->hw; + vsi->pestat->vsi = vsi; + + if (info->stats_initialize) { + i40iw_hw_stats_init(vsi->pestat, fcn_id, true); + 
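	/*
	 * (Editorial annotation, not part of the original patch.) With the
	 * offset table and baseline counter values captured by
	 * i40iw_hw_stats_init() above, the pestat lock is initialized and the
	 * periodic stats timer is started so the hardware counters are
	 * sampled regularly and the roll-over handling in
	 * i40iw_hw_stats_read_32/64 can accumulate them into 64-bit
	 * software totals.
	 */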
spin_lock_init(&vsi->pestat->lock); + i40iw_hw_stats_start_timer(vsi); + } + vsi->stats_fcn_id_alloc = info->alloc_fcn_id; + vsi->fcn_id = fcn_id; + return I40IW_SUCCESS; +} + +/** + * i40iw_vsi_stats_free - Free the vsi stats + * @vsi: pointer to the vsi structure + */ +void i40iw_vsi_stats_free(struct i40iw_sc_vsi *vsi) +{ + u8 fcn_id = vsi->fcn_id; + + if (vsi->stats_fcn_id_alloc && fcn_id < I40IW_MAX_STATS_COUNT) + vsi->dev->fcn_id_array[fcn_id] = false; + i40iw_hw_stats_stop_timer(vsi); +} + +static struct i40iw_cqp_ops iw_cqp_ops = { + .cqp_init = i40iw_sc_cqp_init, + .cqp_create = i40iw_sc_cqp_create, + .cqp_post_sq = i40iw_sc_cqp_post_sq, + .cqp_get_next_send_wqe = i40iw_sc_cqp_get_next_send_wqe, + .cqp_destroy = i40iw_sc_cqp_destroy, + .poll_for_cqp_op_done = i40iw_sc_poll_for_cqp_op_done +}; + +static struct i40iw_ccq_ops iw_ccq_ops = { + .ccq_init = i40iw_sc_ccq_init, + .ccq_create = i40iw_sc_ccq_create, + .ccq_destroy = i40iw_sc_ccq_destroy, + .ccq_create_done = i40iw_sc_ccq_create_done, + .ccq_get_cqe_info = i40iw_sc_ccq_get_cqe_info, + .ccq_arm = i40iw_sc_ccq_arm +}; + +static struct i40iw_ceq_ops iw_ceq_ops = { + .ceq_init = i40iw_sc_ceq_init, + .ceq_create = i40iw_sc_ceq_create, + .cceq_create_done = i40iw_sc_cceq_create_done, + .cceq_destroy_done = i40iw_sc_cceq_destroy_done, + .cceq_create = i40iw_sc_cceq_create, + .ceq_destroy = i40iw_sc_ceq_destroy, + .process_ceq = i40iw_sc_process_ceq +}; + +static struct i40iw_aeq_ops iw_aeq_ops = { + .aeq_init = i40iw_sc_aeq_init, + .aeq_create = i40iw_sc_aeq_create, + .aeq_destroy = i40iw_sc_aeq_destroy, + .get_next_aeqe = i40iw_sc_get_next_aeqe, + .repost_aeq_entries = i40iw_sc_repost_aeq_entries, + .aeq_create_done = i40iw_sc_aeq_create_done, + .aeq_destroy_done = i40iw_sc_aeq_destroy_done +}; + +/* iwarp pd ops */ +static struct i40iw_pd_ops iw_pd_ops = { + .pd_init = i40iw_sc_pd_init, +}; + +static struct i40iw_priv_qp_ops iw_priv_qp_ops = { + .qp_init = i40iw_sc_qp_init, + .qp_create = i40iw_sc_qp_create, + .qp_modify = i40iw_sc_qp_modify, + .qp_destroy = i40iw_sc_qp_destroy, + .qp_flush_wqes = i40iw_sc_qp_flush_wqes, + .qp_upload_context = i40iw_sc_qp_upload_context, + .qp_setctx = i40iw_sc_qp_setctx, + .qp_send_lsmm = i40iw_sc_send_lsmm, + .qp_send_lsmm_nostag = i40iw_sc_send_lsmm_nostag, + .qp_send_rtt = i40iw_sc_send_rtt, + .qp_post_wqe0 = i40iw_sc_post_wqe0, + .iw_mr_fast_register = i40iw_sc_mr_fast_register +}; + +static struct i40iw_priv_cq_ops iw_priv_cq_ops = { + .cq_init = i40iw_sc_cq_init, + .cq_create = i40iw_sc_cq_create, + .cq_destroy = i40iw_sc_cq_destroy, + .cq_modify = i40iw_sc_cq_modify, +}; + +static struct i40iw_mr_ops iw_mr_ops = { + .alloc_stag = i40iw_sc_alloc_stag, + .mr_reg_non_shared = i40iw_sc_mr_reg_non_shared, + .mr_reg_shared = i40iw_sc_mr_reg_shared, + .dealloc_stag = i40iw_sc_dealloc_stag, + .query_stag = i40iw_sc_query_stag, + .mw_alloc = i40iw_sc_mw_alloc +}; + +static struct i40iw_cqp_misc_ops iw_cqp_misc_ops = { + .manage_push_page = i40iw_sc_manage_push_page, + .manage_hmc_pm_func_table = i40iw_sc_manage_hmc_pm_func_table, + .set_hmc_resource_profile = i40iw_sc_set_hmc_resource_profile, + .commit_fpm_values = i40iw_sc_commit_fpm_values, + .query_fpm_values = i40iw_sc_query_fpm_values, + .static_hmc_pages_allocated = i40iw_sc_static_hmc_pages_allocated, + .add_arp_cache_entry = i40iw_sc_add_arp_cache_entry, + .del_arp_cache_entry = i40iw_sc_del_arp_cache_entry, + .query_arp_cache_entry = i40iw_sc_query_arp_cache_entry, + .manage_apbvt_entry = i40iw_sc_manage_apbvt_entry, + 
.manage_qhash_table_entry = i40iw_sc_manage_qhash_table_entry, + .alloc_local_mac_ipaddr_table_entry = i40iw_sc_alloc_local_mac_ipaddr_entry, + .add_local_mac_ipaddr_entry = i40iw_sc_add_local_mac_ipaddr_entry, + .del_local_mac_ipaddr_entry = i40iw_sc_del_local_mac_ipaddr_entry, + .cqp_nop = i40iw_sc_cqp_nop, + .commit_fpm_values_done = i40iw_sc_commit_fpm_values_done, + .query_fpm_values_done = i40iw_sc_query_fpm_values_done, + .manage_hmc_pm_func_table_done = i40iw_sc_manage_hmc_pm_func_table_done, + .update_suspend_qp = i40iw_sc_suspend_qp, + .update_resume_qp = i40iw_sc_resume_qp +}; + +static struct i40iw_hmc_ops iw_hmc_ops = { + .init_iw_hmc = i40iw_sc_init_iw_hmc, + .parse_fpm_query_buf = i40iw_sc_parse_fpm_query_buf, + .configure_iw_fpm = i40iw_sc_configure_iw_fpm, + .parse_fpm_commit_buf = i40iw_sc_parse_fpm_commit_buf, + .create_hmc_object = i40iw_sc_create_hmc_obj, + .del_hmc_object = i40iw_sc_del_hmc_obj +}; + +/** + * i40iw_device_init - Initialize IWARP device + * @dev: IWARP device pointer + * @info: IWARP init info + */ +enum i40iw_status_code i40iw_device_init(struct i40iw_sc_dev *dev, + struct i40iw_device_init_info *info) +{ + u32 val; + u32 vchnl_ver = 0; + u16 hmc_fcn = 0; + enum i40iw_status_code ret_code = 0; + u8 db_size; + + spin_lock_init(&dev->cqp_lock); + + i40iw_device_init_uk(&dev->dev_uk); + + dev->debug_mask = info->debug_mask; + + dev->hmc_fn_id = info->hmc_fn_id; + dev->is_pf = info->is_pf; + + dev->fpm_query_buf_pa = info->fpm_query_buf_pa; + dev->fpm_query_buf = info->fpm_query_buf; + + dev->fpm_commit_buf_pa = info->fpm_commit_buf_pa; + dev->fpm_commit_buf = info->fpm_commit_buf; + + dev->hw = info->hw; + dev->hw->hw_addr = info->bar0; + + if (dev->is_pf) { + val = i40iw_rd32(dev->hw, I40E_GLPCI_DREVID); + dev->hw_rev = (u8)RS_32(val, I40E_GLPCI_DREVID_DEFAULT_REVID); + + val = i40iw_rd32(dev->hw, I40E_GLPCI_LBARCTRL); + db_size = (u8)RS_32(val, I40E_GLPCI_LBARCTRL_PE_DB_SIZE); + if ((db_size != I40IW_PE_DB_SIZE_4M) && + (db_size != I40IW_PE_DB_SIZE_8M)) { + i40iw_debug(dev, I40IW_DEBUG_DEV, + "%s: PE doorbell is not enabled in CSR val 0x%x\n", + __func__, val); + ret_code = I40IW_ERR_PE_DOORBELL_NOT_ENABLED; + return ret_code; + } + dev->db_addr = dev->hw->hw_addr + I40IW_DB_ADDR_OFFSET; + dev->vchnl_if.vchnl_recv = i40iw_vchnl_recv_pf; + } else { + dev->db_addr = dev->hw->hw_addr + I40IW_VF_DB_ADDR_OFFSET; + } + + dev->cqp_ops = &iw_cqp_ops; + dev->ccq_ops = &iw_ccq_ops; + dev->ceq_ops = &iw_ceq_ops; + dev->aeq_ops = &iw_aeq_ops; + dev->cqp_misc_ops = &iw_cqp_misc_ops; + dev->iw_pd_ops = &iw_pd_ops; + dev->iw_priv_qp_ops = &iw_priv_qp_ops; + dev->iw_priv_cq_ops = &iw_priv_cq_ops; + dev->mr_ops = &iw_mr_ops; + dev->hmc_ops = &iw_hmc_ops; + dev->vchnl_if.vchnl_send = info->vchnl_send; + if (dev->vchnl_if.vchnl_send) + dev->vchnl_up = true; + else + dev->vchnl_up = false; + if (!dev->is_pf) { + dev->vchnl_if.vchnl_recv = i40iw_vchnl_recv_vf; + ret_code = i40iw_vchnl_vf_get_ver(dev, &vchnl_ver); + if (!ret_code) { + i40iw_debug(dev, I40IW_DEBUG_DEV, + "%s: Get Channel version rc = 0x%0x, version is %u\n", + __func__, ret_code, vchnl_ver); + ret_code = i40iw_vchnl_vf_get_hmc_fcn(dev, &hmc_fcn); + if (!ret_code) { + i40iw_debug(dev, I40IW_DEBUG_DEV, + "%s Get HMC function rc = 0x%0x, hmc fcn is %u\n", + __func__, ret_code, hmc_fcn); + dev->hmc_fn_id = (u8)hmc_fcn; + } + } + } + dev->iw_vf_cqp_ops = &iw_vf_cqp_ops; + + return ret_code; +} diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h new file mode 100644 index 
0000000..6ddaeec --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_d.h @@ -0,0 +1,1737 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_D_H +#define I40IW_D_H + +#define I40IW_FIRST_USER_QP_ID 2 + +#define I40IW_DB_ADDR_OFFSET (4 * 1024 * 1024 - 64 * 1024) +#define I40IW_VF_DB_ADDR_OFFSET (64 * 1024) + +#define I40IW_PUSH_OFFSET (4 * 1024 * 1024) +#define I40IW_PF_FIRST_PUSH_PAGE_INDEX 16 +#define I40IW_VF_PUSH_OFFSET ((8 + 64) * 1024) +#define I40IW_VF_FIRST_PUSH_PAGE_INDEX 2 + +#define I40IW_PE_DB_SIZE_4M 1 +#define I40IW_PE_DB_SIZE_8M 2 + +#define I40IW_DDP_VER 1 +#define I40IW_RDMAP_VER 1 + +#define I40IW_RDMA_MODE_RDMAC 0 +#define I40IW_RDMA_MODE_IETF 1 + +#define I40IW_QP_STATE_INVALID 0 +#define I40IW_QP_STATE_IDLE 1 +#define I40IW_QP_STATE_RTS 2 +#define I40IW_QP_STATE_CLOSING 3 +#define I40IW_QP_STATE_RESERVED 4 +#define I40IW_QP_STATE_TERMINATE 5 +#define I40IW_QP_STATE_ERROR 6 + +#define I40IW_STAG_STATE_INVALID 0 +#define I40IW_STAG_STATE_VALID 1 + +#define I40IW_STAG_TYPE_SHARED 0 +#define I40IW_STAG_TYPE_NONSHARED 1 + +#define I40IW_MAX_USER_PRIORITY 8 +#define I40IW_MAX_STATS_COUNT 16 +#define I40IW_FIRST_NON_PF_STAT 4 + + +#define I40IW_MTU_TO_MSS_IPV4 40 +#define I40IW_MTU_TO_MSS_IPV6 60 +#define I40IW_DEFAULT_MTU 1500 + +#define LS_64_1(val, bits) ((u64)(uintptr_t)val << bits) +#define RS_64_1(val, bits) ((u64)(uintptr_t)val >> bits) +#define LS_32_1(val, bits) (u32)(val << bits) +#define RS_32_1(val, bits) (u32)(val >> bits) +#define I40E_HI_DWORD(x) ((u32)((((x) >> 16) >> 16) & 0xFFFFFFFF)) + +#define QS_HANDLE_UNKNOWN 0xffff + +#define LS_64(val, field) (((u64)val << field ## _SHIFT) & (field ## _MASK)) + +#define RS_64(val, field) ((u64)(val & field ## _MASK) >> field ## _SHIFT) +#define LS_32(val, field) ((val << field ## _SHIFT) & (field ## _MASK)) +#define RS_32(val, field) ((val & field ## _MASK) >> field ## _SHIFT) + +#define TERM_DDP_LEN_TAGGED 14 +#define TERM_DDP_LEN_UNTAGGED 18 +#define TERM_RDMA_LEN 28 +#define RDMA_OPCODE_MASK 0x0f +#define RDMA_READ_REQ_OPCODE 1 
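/*
 * (Editorial sketch, not part of the original patch.) The LS_64/RS_64 and
 * LS_32/RS_32 helpers above pair with the *_SHIFT/*_MASK definitions that
 * follow in this header: LS_* shifts a value into a register field and masks
 * off any overflow, while RS_* masks a register word and shifts the field
 * back down. A minimal illustration with a hypothetical 6-bit field at bit
 * 32 (EX_FIELD_* is not a real i40iw definition; u64/u32 are the usual
 * kernel types from <linux/types.h>):
 */
#define EX_FIELD_SHIFT 32
#define EX_FIELD_MASK (0x3fULL << EX_FIELD_SHIFT)

static inline u64 ex_set_field(u64 qword, u32 val)
{
	/* clear the field, then merge in the new value via LS_64 */
	return (qword & ~EX_FIELD_MASK) | LS_64(val, EX_FIELD);
}

static inline u32 ex_get_field(u64 qword)
{
	/* extract the 6-bit field value via RS_64 */
	return (u32)RS_64(qword, EX_FIELD);
}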
+#define Q2_BAD_FRAME_OFFSET 72 +#define Q2_FPSN_OFFSET 64 +#define CQE_MAJOR_DRV 0x8000 + +#define I40IW_TERM_SENT 0x01 +#define I40IW_TERM_RCVD 0x02 +#define I40IW_TERM_DONE 0x04 +#define I40IW_MAC_HLEN 14 + +#define I40IW_INVALID_WQE_INDEX 0xffffffff + +#define I40IW_CQP_WAIT_POLL_REGS 1 +#define I40IW_CQP_WAIT_POLL_CQ 2 +#define I40IW_CQP_WAIT_EVENT 3 + +#define I40IW_CQP_INIT_WQE(wqe) memset(wqe, 0, 64) + +#define I40IW_GET_CURRENT_CQ_ELEMENT(_cq) \ + ( \ + &((_cq)->cq_base[I40IW_RING_GETCURRENT_HEAD((_cq)->cq_ring)]) \ + ) +#define I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(_cq) \ + ( \ + &(((struct i40iw_extended_cqe *) \ + ((_cq)->cq_base))[I40IW_RING_GETCURRENT_HEAD((_cq)->cq_ring)]) \ + ) + +#define I40IW_GET_CURRENT_AEQ_ELEMENT(_aeq) \ + ( \ + &_aeq->aeqe_base[I40IW_RING_GETCURRENT_TAIL(_aeq->aeq_ring)] \ + ) + +#define I40IW_GET_CURRENT_CEQ_ELEMENT(_ceq) \ + ( \ + &_ceq->ceqe_base[I40IW_RING_GETCURRENT_TAIL(_ceq->ceq_ring)] \ + ) + +#define I40IW_AE_SOURCE_RSVD 0x0 +#define I40IW_AE_SOURCE_RQ 0x1 +#define I40IW_AE_SOURCE_RQ_0011 0x3 + +#define I40IW_AE_SOURCE_CQ 0x2 +#define I40IW_AE_SOURCE_CQ_0110 0x6 +#define I40IW_AE_SOURCE_CQ_1010 0xA +#define I40IW_AE_SOURCE_CQ_1110 0xE + +#define I40IW_AE_SOURCE_SQ 0x5 +#define I40IW_AE_SOURCE_SQ_0111 0x7 + +#define I40IW_AE_SOURCE_IN_RR_WR 0x9 +#define I40IW_AE_SOURCE_IN_RR_WR_1011 0xB +#define I40IW_AE_SOURCE_OUT_RR 0xD +#define I40IW_AE_SOURCE_OUT_RR_1111 0xF + +#define I40IW_TCP_STATE_NON_EXISTENT 0 +#define I40IW_TCP_STATE_CLOSED 1 +#define I40IW_TCP_STATE_LISTEN 2 +#define I40IW_STATE_SYN_SEND 3 +#define I40IW_TCP_STATE_SYN_RECEIVED 4 +#define I40IW_TCP_STATE_ESTABLISHED 5 +#define I40IW_TCP_STATE_CLOSE_WAIT 6 +#define I40IW_TCP_STATE_FIN_WAIT_1 7 +#define I40IW_TCP_STATE_CLOSING 8 +#define I40IW_TCP_STATE_LAST_ACK 9 +#define I40IW_TCP_STATE_FIN_WAIT_2 10 +#define I40IW_TCP_STATE_TIME_WAIT 11 +#define I40IW_TCP_STATE_RESERVED_1 12 +#define I40IW_TCP_STATE_RESERVED_2 13 +#define I40IW_TCP_STATE_RESERVED_3 14 +#define I40IW_TCP_STATE_RESERVED_4 15 + +/* ILQ CQP hash table fields */ +#define I40IW_CQPSQ_QHASH_VLANID_SHIFT 32 +#define I40IW_CQPSQ_QHASH_VLANID_MASK \ + ((u64)0xfff << I40IW_CQPSQ_QHASH_VLANID_SHIFT) + +#define I40IW_CQPSQ_QHASH_QPN_SHIFT 32 +#define I40IW_CQPSQ_QHASH_QPN_MASK \ + ((u64)0x3ffff << I40IW_CQPSQ_QHASH_QPN_SHIFT) + +#define I40IW_CQPSQ_QHASH_QS_HANDLE_SHIFT 0 +#define I40IW_CQPSQ_QHASH_QS_HANDLE_MASK ((u64)0x3ff << I40IW_CQPSQ_QHASH_QS_HANDLE_SHIFT) + +#define I40IW_CQPSQ_QHASH_SRC_PORT_SHIFT 16 +#define I40IW_CQPSQ_QHASH_SRC_PORT_MASK \ + ((u64)0xffff << I40IW_CQPSQ_QHASH_SRC_PORT_SHIFT) + +#define I40IW_CQPSQ_QHASH_DEST_PORT_SHIFT 0 +#define I40IW_CQPSQ_QHASH_DEST_PORT_MASK \ + ((u64)0xffff << I40IW_CQPSQ_QHASH_DEST_PORT_SHIFT) + +#define I40IW_CQPSQ_QHASH_ADDR0_SHIFT 32 +#define I40IW_CQPSQ_QHASH_ADDR0_MASK \ + ((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR0_SHIFT) + +#define I40IW_CQPSQ_QHASH_ADDR1_SHIFT 0 +#define I40IW_CQPSQ_QHASH_ADDR1_MASK \ + ((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR1_SHIFT) + +#define I40IW_CQPSQ_QHASH_ADDR2_SHIFT 32 +#define I40IW_CQPSQ_QHASH_ADDR2_MASK \ + ((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR2_SHIFT) + +#define I40IW_CQPSQ_QHASH_ADDR3_SHIFT 0 +#define I40IW_CQPSQ_QHASH_ADDR3_MASK \ + ((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR3_SHIFT) + +#define I40IW_CQPSQ_QHASH_WQEVALID_SHIFT 63 +#define I40IW_CQPSQ_QHASH_WQEVALID_MASK \ + ((u64)0x1 << I40IW_CQPSQ_QHASH_WQEVALID_SHIFT) +#define I40IW_CQPSQ_QHASH_OPCODE_SHIFT 32 +#define I40IW_CQPSQ_QHASH_OPCODE_MASK \ + ((u64)0x3f << 
I40IW_CQPSQ_QHASH_OPCODE_SHIFT) + +#define I40IW_CQPSQ_QHASH_MANAGE_SHIFT 61 +#define I40IW_CQPSQ_QHASH_MANAGE_MASK \ + ((u64)0x3 << I40IW_CQPSQ_QHASH_MANAGE_SHIFT) + +#define I40IW_CQPSQ_QHASH_IPV4VALID_SHIFT 60 +#define I40IW_CQPSQ_QHASH_IPV4VALID_MASK \ + ((u64)0x1 << I40IW_CQPSQ_QHASH_IPV4VALID_SHIFT) + +#define I40IW_CQPSQ_QHASH_VLANVALID_SHIFT 59 +#define I40IW_CQPSQ_QHASH_VLANVALID_MASK \ + ((u64)0x1 << I40IW_CQPSQ_QHASH_VLANVALID_SHIFT) + +#define I40IW_CQPSQ_QHASH_ENTRYTYPE_SHIFT 42 +#define I40IW_CQPSQ_QHASH_ENTRYTYPE_MASK \ + ((u64)0x7 << I40IW_CQPSQ_QHASH_ENTRYTYPE_SHIFT) +/* CQP Host Context */ +#define I40IW_CQPHC_EN_DC_TCP_SHIFT 0 +#define I40IW_CQPHC_EN_DC_TCP_MASK (1UL << I40IW_CQPHC_EN_DC_TCP_SHIFT) + +#define I40IW_CQPHC_SQSIZE_SHIFT 8 +#define I40IW_CQPHC_SQSIZE_MASK (0xfUL << I40IW_CQPHC_SQSIZE_SHIFT) + +#define I40IW_CQPHC_DISABLE_PFPDUS_SHIFT 1 +#define I40IW_CQPHC_DISABLE_PFPDUS_MASK (0x1UL << I40IW_CQPHC_DISABLE_PFPDUS_SHIFT) + +#define I40IW_CQPHC_ENABLED_VFS_SHIFT 32 +#define I40IW_CQPHC_ENABLED_VFS_MASK (0x3fULL << I40IW_CQPHC_ENABLED_VFS_SHIFT) + +#define I40IW_CQPHC_HMC_PROFILE_SHIFT 0 +#define I40IW_CQPHC_HMC_PROFILE_MASK (0x7ULL << I40IW_CQPHC_HMC_PROFILE_SHIFT) + +#define I40IW_CQPHC_SVER_SHIFT 24 +#define I40IW_CQPHC_SVER_MASK (0xffUL << I40IW_CQPHC_SVER_SHIFT) + +#define I40IW_CQPHC_SQBASE_SHIFT 9 +#define I40IW_CQPHC_SQBASE_MASK \ + (0xfffffffffffffeULL << I40IW_CQPHC_SQBASE_SHIFT) + +#define I40IW_CQPHC_QPCTX_SHIFT 0 +#define I40IW_CQPHC_QPCTX_MASK \ + (0xffffffffffffffffULL << I40IW_CQPHC_QPCTX_SHIFT) +#define I40IW_CQPHC_SVER 1 + +#define I40IW_CQP_SW_SQSIZE_4 4 +#define I40IW_CQP_SW_SQSIZE_2048 2048 + +/* iWARP QP Doorbell shadow area */ +#define I40IW_QP_DBSA_HW_SQ_TAIL_SHIFT 0 +#define I40IW_QP_DBSA_HW_SQ_TAIL_MASK \ + (0x3fffUL << I40IW_QP_DBSA_HW_SQ_TAIL_SHIFT) + +/* Completion Queue Doorbell shadow area */ +#define I40IW_CQ_DBSA_CQEIDX_SHIFT 0 +#define I40IW_CQ_DBSA_CQEIDX_MASK (0xfffffUL << I40IW_CQ_DBSA_CQEIDX_SHIFT) + +#define I40IW_CQ_DBSA_SW_CQ_SELECT_SHIFT 0 +#define I40IW_CQ_DBSA_SW_CQ_SELECT_MASK \ + (0x3fffUL << I40IW_CQ_DBSA_SW_CQ_SELECT_SHIFT) + +#define I40IW_CQ_DBSA_ARM_NEXT_SHIFT 14 +#define I40IW_CQ_DBSA_ARM_NEXT_MASK (1UL << I40IW_CQ_DBSA_ARM_NEXT_SHIFT) + +#define I40IW_CQ_DBSA_ARM_NEXT_SE_SHIFT 15 +#define I40IW_CQ_DBSA_ARM_NEXT_SE_MASK (1UL << I40IW_CQ_DBSA_ARM_NEXT_SE_SHIFT) + +#define I40IW_CQ_DBSA_ARM_SEQ_NUM_SHIFT 16 +#define I40IW_CQ_DBSA_ARM_SEQ_NUM_MASK \ + (0x3UL << I40IW_CQ_DBSA_ARM_SEQ_NUM_SHIFT) + +/* CQP and iWARP Completion Queue */ +#define I40IW_CQ_QPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQ_QPCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CCQ_OPRETVAL_SHIFT 0 +#define I40IW_CCQ_OPRETVAL_MASK (0xffffffffUL << I40IW_CCQ_OPRETVAL_SHIFT) + +#define I40IW_CQ_MINERR_SHIFT 0 +#define I40IW_CQ_MINERR_MASK (0xffffUL << I40IW_CQ_MINERR_SHIFT) + +#define I40IW_CQ_MAJERR_SHIFT 16 +#define I40IW_CQ_MAJERR_MASK (0xffffUL << I40IW_CQ_MAJERR_SHIFT) + +#define I40IW_CQ_WQEIDX_SHIFT 32 +#define I40IW_CQ_WQEIDX_MASK (0x3fffULL << I40IW_CQ_WQEIDX_SHIFT) + +#define I40IW_CQ_ERROR_SHIFT 55 +#define I40IW_CQ_ERROR_MASK (1ULL << I40IW_CQ_ERROR_SHIFT) + +#define I40IW_CQ_SQ_SHIFT 62 +#define I40IW_CQ_SQ_MASK (1ULL << I40IW_CQ_SQ_SHIFT) + +#define I40IW_CQ_VALID_SHIFT 63 +#define I40IW_CQ_VALID_MASK (1ULL << I40IW_CQ_VALID_SHIFT) + +#define I40IWCQ_PAYLDLEN_SHIFT 0 +#define I40IWCQ_PAYLDLEN_MASK (0xffffffffUL << I40IWCQ_PAYLDLEN_SHIFT) + +#define I40IWCQ_TCPSEQNUM_SHIFT 32 +#define I40IWCQ_TCPSEQNUM_MASK (0xffffffffULL << 
I40IWCQ_TCPSEQNUM_SHIFT) + +#define I40IWCQ_INVSTAG_SHIFT 0 +#define I40IWCQ_INVSTAG_MASK (0xffffffffUL << I40IWCQ_INVSTAG_SHIFT) + +#define I40IWCQ_QPID_SHIFT 32 +#define I40IWCQ_QPID_MASK (0x3ffffULL << I40IWCQ_QPID_SHIFT) + +#define I40IWCQ_PSHDROP_SHIFT 51 +#define I40IWCQ_PSHDROP_MASK (1ULL << I40IWCQ_PSHDROP_SHIFT) + +#define I40IWCQ_SRQ_SHIFT 52 +#define I40IWCQ_SRQ_MASK (1ULL << I40IWCQ_SRQ_SHIFT) + +#define I40IWCQ_STAG_SHIFT 53 +#define I40IWCQ_STAG_MASK (1ULL << I40IWCQ_STAG_SHIFT) + +#define I40IWCQ_SOEVENT_SHIFT 54 +#define I40IWCQ_SOEVENT_MASK (1ULL << I40IWCQ_SOEVENT_SHIFT) + +#define I40IWCQ_OP_SHIFT 56 +#define I40IWCQ_OP_MASK (0x3fULL << I40IWCQ_OP_SHIFT) + +/* CEQE format */ +#define I40IW_CEQE_CQCTX_SHIFT 0 +#define I40IW_CEQE_CQCTX_MASK \ + (0x7fffffffffffffffULL << I40IW_CEQE_CQCTX_SHIFT) + +#define I40IW_CEQE_VALID_SHIFT 63 +#define I40IW_CEQE_VALID_MASK (1ULL << I40IW_CEQE_VALID_SHIFT) + +/* AEQE format */ +#define I40IW_AEQE_COMPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_AEQE_COMPCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_AEQE_QPCQID_SHIFT 0 +#define I40IW_AEQE_QPCQID_MASK (0x3ffffUL << I40IW_AEQE_QPCQID_SHIFT) + +#define I40IW_AEQE_WQDESCIDX_SHIFT 18 +#define I40IW_AEQE_WQDESCIDX_MASK (0x3fffULL << I40IW_AEQE_WQDESCIDX_SHIFT) + +#define I40IW_AEQE_OVERFLOW_SHIFT 33 +#define I40IW_AEQE_OVERFLOW_MASK (1ULL << I40IW_AEQE_OVERFLOW_SHIFT) + +#define I40IW_AEQE_AECODE_SHIFT 34 +#define I40IW_AEQE_AECODE_MASK (0xffffULL << I40IW_AEQE_AECODE_SHIFT) + +#define I40IW_AEQE_AESRC_SHIFT 50 +#define I40IW_AEQE_AESRC_MASK (0xfULL << I40IW_AEQE_AESRC_SHIFT) + +#define I40IW_AEQE_IWSTATE_SHIFT 54 +#define I40IW_AEQE_IWSTATE_MASK (0x7ULL << I40IW_AEQE_IWSTATE_SHIFT) + +#define I40IW_AEQE_TCPSTATE_SHIFT 57 +#define I40IW_AEQE_TCPSTATE_MASK (0xfULL << I40IW_AEQE_TCPSTATE_SHIFT) + +#define I40IW_AEQE_Q2DATA_SHIFT 61 +#define I40IW_AEQE_Q2DATA_MASK (0x3ULL << I40IW_AEQE_Q2DATA_SHIFT) + +#define I40IW_AEQE_VALID_SHIFT 63 +#define I40IW_AEQE_VALID_MASK (1ULL << I40IW_AEQE_VALID_SHIFT) + +/* CQP SQ WQES */ +#define I40IW_QP_TYPE_IWARP 1 +#define I40IW_QP_TYPE_UDA 2 +#define I40IW_QP_TYPE_CQP 4 + +#define I40IW_CQ_TYPE_IWARP 1 +#define I40IW_CQ_TYPE_ILQ 2 +#define I40IW_CQ_TYPE_IEQ 3 +#define I40IW_CQ_TYPE_CQP 4 + +#define I40IWQP_TERM_SEND_TERM_AND_FIN 0 +#define I40IWQP_TERM_SEND_TERM_ONLY 1 +#define I40IWQP_TERM_SEND_FIN_ONLY 2 +#define I40IWQP_TERM_DONOT_SEND_TERM_OR_FIN 3 + +#define I40IW_CQP_OP_CREATE_QP 0 +#define I40IW_CQP_OP_MODIFY_QP 0x1 +#define I40IW_CQP_OP_DESTROY_QP 0x02 +#define I40IW_CQP_OP_CREATE_CQ 0x03 +#define I40IW_CQP_OP_MODIFY_CQ 0x04 +#define I40IW_CQP_OP_DESTROY_CQ 0x05 +#define I40IW_CQP_OP_CREATE_SRQ 0x06 +#define I40IW_CQP_OP_MODIFY_SRQ 0x07 +#define I40IW_CQP_OP_DESTROY_SRQ 0x08 +#define I40IW_CQP_OP_ALLOC_STAG 0x09 +#define I40IW_CQP_OP_REG_MR 0x0a +#define I40IW_CQP_OP_QUERY_STAG 0x0b +#define I40IW_CQP_OP_REG_SMR 0x0c +#define I40IW_CQP_OP_DEALLOC_STAG 0x0d +#define I40IW_CQP_OP_MANAGE_LOC_MAC_IP_TABLE 0x0e +#define I40IW_CQP_OP_MANAGE_ARP 0x0f +#define I40IW_CQP_OP_MANAGE_VF_PBLE_BP 0x10 +#define I40IW_CQP_OP_MANAGE_PUSH_PAGES 0x11 +#define I40IW_CQP_OP_MANAGE_PE_TEAM 0x12 +#define I40IW_CQP_OP_UPLOAD_CONTEXT 0x13 +#define I40IW_CQP_OP_ALLOCATE_LOC_MAC_IP_TABLE_ENTRY 0x14 +#define I40IW_CQP_OP_MANAGE_HMC_PM_FUNC_TABLE 0x15 +#define I40IW_CQP_OP_CREATE_CEQ 0x16 +#define I40IW_CQP_OP_DESTROY_CEQ 0x18 +#define I40IW_CQP_OP_CREATE_AEQ 0x19 +#define I40IW_CQP_OP_DESTROY_AEQ 0x1b +#define I40IW_CQP_OP_CREATE_ADDR_VECT 0x1c +#define 
I40IW_CQP_OP_MODIFY_ADDR_VECT 0x1d +#define I40IW_CQP_OP_DESTROY_ADDR_VECT 0x1e +#define I40IW_CQP_OP_UPDATE_PE_SDS 0x1f +#define I40IW_CQP_OP_QUERY_FPM_VALUES 0x20 +#define I40IW_CQP_OP_COMMIT_FPM_VALUES 0x21 +#define I40IW_CQP_OP_FLUSH_WQES 0x22 +/* I40IW_CQP_OP_GEN_AE is the same value as I40IW_CQP_OP_FLUSH_WQES */ +#define I40IW_CQP_OP_GEN_AE 0x22 +#define I40IW_CQP_OP_MANAGE_APBVT 0x23 +#define I40IW_CQP_OP_NOP 0x24 +#define I40IW_CQP_OP_MANAGE_QUAD_HASH_TABLE_ENTRY 0x25 +#define I40IW_CQP_OP_CREATE_UDA_MCAST_GROUP 0x26 +#define I40IW_CQP_OP_MODIFY_UDA_MCAST_GROUP 0x27 +#define I40IW_CQP_OP_DESTROY_UDA_MCAST_GROUP 0x28 +#define I40IW_CQP_OP_SUSPEND_QP 0x29 +#define I40IW_CQP_OP_RESUME_QP 0x2a +#define I40IW_CQP_OP_SHMC_PAGES_ALLOCATED 0x2b +#define I40IW_CQP_OP_SET_HMC_RESOURCE_PROFILE 0x2d + +#define I40IW_UDA_QPSQ_NEXT_HEADER_SHIFT 16 +#define I40IW_UDA_QPSQ_NEXT_HEADER_MASK ((u64)0xff << I40IW_UDA_QPSQ_NEXT_HEADER_SHIFT) + +#define I40IW_UDA_QPSQ_OPCODE_SHIFT 32 +#define I40IW_UDA_QPSQ_OPCODE_MASK ((u64)0x3f << I40IW_UDA_QPSQ_OPCODE_SHIFT) + +#define I40IW_UDA_QPSQ_MACLEN_SHIFT 56 +#define I40IW_UDA_QPSQ_MACLEN_MASK \ + ((u64)0x7f << I40IW_UDA_QPSQ_MACLEN_SHIFT) + +#define I40IW_UDA_QPSQ_IPLEN_SHIFT 48 +#define I40IW_UDA_QPSQ_IPLEN_MASK \ + ((u64)0x7f << I40IW_UDA_QPSQ_IPLEN_SHIFT) + +#define I40IW_UDA_QPSQ_L4T_SHIFT 30 +#define I40IW_UDA_QPSQ_L4T_MASK \ + ((u64)0x3 << I40IW_UDA_QPSQ_L4T_SHIFT) + +#define I40IW_UDA_QPSQ_IIPT_SHIFT 28 +#define I40IW_UDA_QPSQ_IIPT_MASK \ + ((u64)0x3 << I40IW_UDA_QPSQ_IIPT_SHIFT) + +#define I40IW_UDA_QPSQ_L4LEN_SHIFT 24 +#define I40IW_UDA_QPSQ_L4LEN_MASK ((u64)0xf << I40IW_UDA_QPSQ_L4LEN_SHIFT) + +#define I40IW_UDA_QPSQ_AVIDX_SHIFT 0 +#define I40IW_UDA_QPSQ_AVIDX_MASK ((u64)0xffff << I40IW_UDA_QPSQ_AVIDX_SHIFT) + +#define I40IW_UDA_QPSQ_VALID_SHIFT 63 +#define I40IW_UDA_QPSQ_VALID_MASK \ + ((u64)0x1 << I40IW_UDA_QPSQ_VALID_SHIFT) + +#define I40IW_UDA_QPSQ_SIGCOMPL_SHIFT 62 +#define I40IW_UDA_QPSQ_SIGCOMPL_MASK ((u64)0x1 << I40IW_UDA_QPSQ_SIGCOMPL_SHIFT) + +#define I40IW_UDA_PAYLOADLEN_SHIFT 0 +#define I40IW_UDA_PAYLOADLEN_MASK ((u64)0x3fff << I40IW_UDA_PAYLOADLEN_SHIFT) + +#define I40IW_UDA_HDRLEN_SHIFT 16 +#define I40IW_UDA_HDRLEN_MASK ((u64)0x1ff << I40IW_UDA_HDRLEN_SHIFT) + +#define I40IW_VLAN_TAG_VALID_SHIFT 50 +#define I40IW_VLAN_TAG_VALID_MASK ((u64)0x1 << I40IW_VLAN_TAG_VALID_SHIFT) + +#define I40IW_UDA_L3PROTO_SHIFT 0 +#define I40IW_UDA_L3PROTO_MASK ((u64)0x3 << I40IW_UDA_L3PROTO_SHIFT) + +#define I40IW_UDA_L4PROTO_SHIFT 16 +#define I40IW_UDA_L4PROTO_MASK ((u64)0x3 << I40IW_UDA_L4PROTO_SHIFT) + +#define I40IW_UDA_QPSQ_DOLOOPBACK_SHIFT 44 +#define I40IW_UDA_QPSQ_DOLOOPBACK_MASK \ + ((u64)0x1 << I40IW_UDA_QPSQ_DOLOOPBACK_SHIFT) + +/* CQP SQ WQE common fields */ +#define I40IW_CQPSQ_OPCODE_SHIFT 32 +#define I40IW_CQPSQ_OPCODE_MASK (0x3fULL << I40IW_CQPSQ_OPCODE_SHIFT) + +#define I40IW_CQPSQ_WQEVALID_SHIFT 63 +#define I40IW_CQPSQ_WQEVALID_MASK (1ULL << I40IW_CQPSQ_WQEVALID_SHIFT) + +#define I40IW_CQPSQ_TPHVAL_SHIFT 0 +#define I40IW_CQPSQ_TPHVAL_MASK (0xffUL << I40IW_CQPSQ_TPHVAL_SHIFT) + +#define I40IW_CQPSQ_TPHEN_SHIFT 60 +#define I40IW_CQPSQ_TPHEN_MASK (1ULL << I40IW_CQPSQ_TPHEN_SHIFT) + +#define I40IW_CQPSQ_PBUFADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_PBUFADDR_MASK I40IW_CQPHC_QPCTX_MASK + +/* Create/Modify/Destroy QP */ + +#define I40IW_CQPSQ_QP_NEWMSS_SHIFT 32 +#define I40IW_CQPSQ_QP_NEWMSS_MASK (0x3fffULL << I40IW_CQPSQ_QP_NEWMSS_SHIFT) + +#define I40IW_CQPSQ_QP_TERMLEN_SHIFT 48 +#define I40IW_CQPSQ_QP_TERMLEN_MASK (0xfULL << 
I40IW_CQPSQ_QP_TERMLEN_SHIFT) + +#define I40IW_CQPSQ_QP_QPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_QP_QPCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_QP_QPID_SHIFT 0 +#define I40IW_CQPSQ_QP_QPID_MASK (0x3FFFFUL) +/* I40IWCQ_QPID_MASK */ + +#define I40IW_CQPSQ_QP_OP_SHIFT 32 +#define I40IW_CQPSQ_QP_OP_MASK I40IWCQ_OP_MASK + +#define I40IW_CQPSQ_QP_ORDVALID_SHIFT 42 +#define I40IW_CQPSQ_QP_ORDVALID_MASK (1ULL << I40IW_CQPSQ_QP_ORDVALID_SHIFT) + +#define I40IW_CQPSQ_QP_TOECTXVALID_SHIFT 43 +#define I40IW_CQPSQ_QP_TOECTXVALID_MASK \ + (1ULL << I40IW_CQPSQ_QP_TOECTXVALID_SHIFT) + +#define I40IW_CQPSQ_QP_CACHEDVARVALID_SHIFT 44 +#define I40IW_CQPSQ_QP_CACHEDVARVALID_MASK \ + (1ULL << I40IW_CQPSQ_QP_CACHEDVARVALID_SHIFT) + +#define I40IW_CQPSQ_QP_VQ_SHIFT 45 +#define I40IW_CQPSQ_QP_VQ_MASK (1ULL << I40IW_CQPSQ_QP_VQ_SHIFT) + +#define I40IW_CQPSQ_QP_FORCELOOPBACK_SHIFT 46 +#define I40IW_CQPSQ_QP_FORCELOOPBACK_MASK \ + (1ULL << I40IW_CQPSQ_QP_FORCELOOPBACK_SHIFT) + +#define I40IW_CQPSQ_QP_CQNUMVALID_SHIFT 47 +#define I40IW_CQPSQ_QP_CQNUMVALID_MASK \ + (1ULL << I40IW_CQPSQ_QP_CQNUMVALID_SHIFT) + +#define I40IW_CQPSQ_QP_QPTYPE_SHIFT 48 +#define I40IW_CQPSQ_QP_QPTYPE_MASK (0x3ULL << I40IW_CQPSQ_QP_QPTYPE_SHIFT) + +#define I40IW_CQPSQ_QP_MSSCHANGE_SHIFT 52 +#define I40IW_CQPSQ_QP_MSSCHANGE_MASK (1ULL << I40IW_CQPSQ_QP_MSSCHANGE_SHIFT) + +#define I40IW_CQPSQ_QP_IGNOREMWBOUND_SHIFT 54 +#define I40IW_CQPSQ_QP_IGNOREMWBOUND_MASK \ + (1ULL << I40IW_CQPSQ_QP_IGNOREMWBOUND_SHIFT) + +#define I40IW_CQPSQ_QP_REMOVEHASHENTRY_SHIFT 55 +#define I40IW_CQPSQ_QP_REMOVEHASHENTRY_MASK \ + (1ULL << I40IW_CQPSQ_QP_REMOVEHASHENTRY_SHIFT) + +#define I40IW_CQPSQ_QP_TERMACT_SHIFT 56 +#define I40IW_CQPSQ_QP_TERMACT_MASK (0x3ULL << I40IW_CQPSQ_QP_TERMACT_SHIFT) + +#define I40IW_CQPSQ_QP_RESETCON_SHIFT 58 +#define I40IW_CQPSQ_QP_RESETCON_MASK (1ULL << I40IW_CQPSQ_QP_RESETCON_SHIFT) + +#define I40IW_CQPSQ_QP_ARPTABIDXVALID_SHIFT 59 +#define I40IW_CQPSQ_QP_ARPTABIDXVALID_MASK \ + (1ULL << I40IW_CQPSQ_QP_ARPTABIDXVALID_SHIFT) + +#define I40IW_CQPSQ_QP_NEXTIWSTATE_SHIFT 60 +#define I40IW_CQPSQ_QP_NEXTIWSTATE_MASK \ + (0x7ULL << I40IW_CQPSQ_QP_NEXTIWSTATE_SHIFT) + +#define I40IW_CQPSQ_QP_DBSHADOWADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_QP_DBSHADOWADDR_MASK I40IW_CQPHC_QPCTX_MASK + +/* Create/Modify/Destroy CQ */ +#define I40IW_CQPSQ_CQ_CQSIZE_SHIFT 0 +#define I40IW_CQPSQ_CQ_CQSIZE_MASK (0x3ffffUL << I40IW_CQPSQ_CQ_CQSIZE_SHIFT) + +#define I40IW_CQPSQ_CQ_CQCTX_SHIFT 0 +#define I40IW_CQPSQ_CQ_CQCTX_MASK \ + (0x7fffffffffffffffULL << I40IW_CQPSQ_CQ_CQCTX_SHIFT) + +#define I40IW_CQPSQ_CQ_CQCTX_SHIFT 0 +#define I40IW_CQPSQ_CQ_CQCTX_MASK \ + (0x7fffffffffffffffULL << I40IW_CQPSQ_CQ_CQCTX_SHIFT) + +#define I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD_SHIFT 0 +#define I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD_MASK \ + (0x3ffff << I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD_SHIFT) + +#define I40IW_CQPSQ_CQ_CEQID_SHIFT 24 +#define I40IW_CQPSQ_CQ_CEQID_MASK (0x7fUL << I40IW_CQPSQ_CQ_CEQID_SHIFT) + +#define I40IW_CQPSQ_CQ_OP_SHIFT 32 +#define I40IW_CQPSQ_CQ_OP_MASK (0x3fULL << I40IW_CQPSQ_CQ_OP_SHIFT) + +#define I40IW_CQPSQ_CQ_CQRESIZE_SHIFT 43 +#define I40IW_CQPSQ_CQ_CQRESIZE_MASK (1ULL << I40IW_CQPSQ_CQ_CQRESIZE_SHIFT) + +#define I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT 44 +#define I40IW_CQPSQ_CQ_LPBLSIZE_MASK (3ULL << I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT) + +#define I40IW_CQPSQ_CQ_CHKOVERFLOW_SHIFT 46 +#define I40IW_CQPSQ_CQ_CHKOVERFLOW_MASK \ + (1ULL << I40IW_CQPSQ_CQ_CHKOVERFLOW_SHIFT) + +#define I40IW_CQPSQ_CQ_VIRTMAP_SHIFT 47 +#define 
I40IW_CQPSQ_CQ_VIRTMAP_MASK (1ULL << I40IW_CQPSQ_CQ_VIRTMAP_SHIFT) + +#define I40IW_CQPSQ_CQ_ENCEQEMASK_SHIFT 48 +#define I40IW_CQPSQ_CQ_ENCEQEMASK_MASK \ + (1ULL << I40IW_CQPSQ_CQ_ENCEQEMASK_SHIFT) + +#define I40IW_CQPSQ_CQ_CEQIDVALID_SHIFT 49 +#define I40IW_CQPSQ_CQ_CEQIDVALID_MASK \ + (1ULL << I40IW_CQPSQ_CQ_CEQIDVALID_SHIFT) + +#define I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT_SHIFT 61 +#define I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT_MASK \ + (1ULL << I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT_SHIFT) + +/* Create/Modify/Destroy Shared Receive Queue */ + +#define I40IW_CQPSQ_SRQ_RQSIZE_SHIFT 0 +#define I40IW_CQPSQ_SRQ_RQSIZE_MASK (0xfUL << I40IW_CQPSQ_SRQ_RQSIZE_SHIFT) + +#define I40IW_CQPSQ_SRQ_RQWQESIZE_SHIFT 4 +#define I40IW_CQPSQ_SRQ_RQWQESIZE_MASK \ + (0x7UL << I40IW_CQPSQ_SRQ_RQWQESIZE_SHIFT) + +#define I40IW_CQPSQ_SRQ_SRQLIMIT_SHIFT 32 +#define I40IW_CQPSQ_SRQ_SRQLIMIT_MASK \ + (0xfffULL << I40IW_CQPSQ_SRQ_SRQLIMIT_SHIFT) + +#define I40IW_CQPSQ_SRQ_SRQCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_SRQ_SRQCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_SRQ_PDID_SHIFT 16 +#define I40IW_CQPSQ_SRQ_PDID_MASK \ + (0x7fffULL << I40IW_CQPSQ_SRQ_PDID_SHIFT) + +#define I40IW_CQPSQ_SRQ_SRQID_SHIFT 0 +#define I40IW_CQPSQ_SRQ_SRQID_MASK (0x7fffUL << I40IW_CQPSQ_SRQ_SRQID_SHIFT) + +#define I40IW_CQPSQ_SRQ_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT +#define I40IW_CQPSQ_SRQ_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK + +#define I40IW_CQPSQ_SRQ_VIRTMAP_SHIFT I40IW_CQPSQ_CQ_VIRTMAP_SHIFT +#define I40IW_CQPSQ_SRQ_VIRTMAP_MASK I40IW_CQPSQ_CQ_VIRTMAP_MASK + +#define I40IW_CQPSQ_SRQ_TPHEN_SHIFT I40IW_CQPSQ_TPHEN_SHIFT +#define I40IW_CQPSQ_SRQ_TPHEN_MASK I40IW_CQPSQ_TPHEN_MASK + +#define I40IW_CQPSQ_SRQ_ARMLIMITEVENT_SHIFT 61 +#define I40IW_CQPSQ_SRQ_ARMLIMITEVENT_MASK \ + (1ULL << I40IW_CQPSQ_SRQ_ARMLIMITEVENT_SHIFT) + +#define I40IW_CQPSQ_SRQ_DBSHADOWAREA_SHIFT 6 +#define I40IW_CQPSQ_SRQ_DBSHADOWAREA_MASK \ + (0x3ffffffffffffffULL << I40IW_CQPSQ_SRQ_DBSHADOWAREA_SHIFT) + +#define I40IW_CQPSQ_SRQ_FIRSTPMPBLIDX_SHIFT 0 +#define I40IW_CQPSQ_SRQ_FIRSTPMPBLIDX_MASK \ + (0xfffffffUL << I40IW_CQPSQ_SRQ_FIRSTPMPBLIDX_SHIFT) + +/* Allocate/Register/Register Shared/Deallocate Stag */ +#define I40IW_CQPSQ_STAG_VA_FBO_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_STAG_VA_FBO_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_STAG_STAGLEN_SHIFT 0 +#define I40IW_CQPSQ_STAG_STAGLEN_MASK \ + (0x3fffffffffffULL << I40IW_CQPSQ_STAG_STAGLEN_SHIFT) + +#define I40IW_CQPSQ_STAG_PDID_SHIFT 48 +#define I40IW_CQPSQ_STAG_PDID_MASK (0x7fffULL << I40IW_CQPSQ_STAG_PDID_SHIFT) + +#define I40IW_CQPSQ_STAG_KEY_SHIFT 0 +#define I40IW_CQPSQ_STAG_KEY_MASK (0xffUL << I40IW_CQPSQ_STAG_KEY_SHIFT) + +#define I40IW_CQPSQ_STAG_IDX_SHIFT 8 +#define I40IW_CQPSQ_STAG_IDX_MASK (0xffffffUL << I40IW_CQPSQ_STAG_IDX_SHIFT) + +#define I40IW_CQPSQ_STAG_PARENTSTAGIDX_SHIFT 32 +#define I40IW_CQPSQ_STAG_PARENTSTAGIDX_MASK \ + (0xffffffULL << I40IW_CQPSQ_STAG_PARENTSTAGIDX_SHIFT) + +#define I40IW_CQPSQ_STAG_MR_SHIFT 43 +#define I40IW_CQPSQ_STAG_MR_MASK (1ULL << I40IW_CQPSQ_STAG_MR_SHIFT) + +#define I40IW_CQPSQ_STAG_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT +#define I40IW_CQPSQ_STAG_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK + +#define I40IW_CQPSQ_STAG_HPAGESIZE_SHIFT 46 +#define I40IW_CQPSQ_STAG_HPAGESIZE_MASK \ + (1ULL << I40IW_CQPSQ_STAG_HPAGESIZE_SHIFT) + +#define I40IW_CQPSQ_STAG_ARIGHTS_SHIFT 48 +#define I40IW_CQPSQ_STAG_ARIGHTS_MASK \ + (0x1fULL << I40IW_CQPSQ_STAG_ARIGHTS_SHIFT) + +#define I40IW_CQPSQ_STAG_REMACCENABLED_SHIFT 53 +#define 
I40IW_CQPSQ_STAG_REMACCENABLED_MASK \ + (1ULL << I40IW_CQPSQ_STAG_REMACCENABLED_SHIFT) + +#define I40IW_CQPSQ_STAG_VABASEDTO_SHIFT 59 +#define I40IW_CQPSQ_STAG_VABASEDTO_MASK \ + (1ULL << I40IW_CQPSQ_STAG_VABASEDTO_SHIFT) + +#define I40IW_CQPSQ_STAG_USEHMCFNIDX_SHIFT 60 +#define I40IW_CQPSQ_STAG_USEHMCFNIDX_MASK \ + (1ULL << I40IW_CQPSQ_STAG_USEHMCFNIDX_SHIFT) + +#define I40IW_CQPSQ_STAG_USEPFRID_SHIFT 61 +#define I40IW_CQPSQ_STAG_USEPFRID_MASK \ + (1ULL << I40IW_CQPSQ_STAG_USEPFRID_SHIFT) + +#define I40IW_CQPSQ_STAG_PBA_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_STAG_PBA_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_STAG_HMCFNIDX_SHIFT 0 +#define I40IW_CQPSQ_STAG_HMCFNIDX_MASK \ + (0x3fUL << I40IW_CQPSQ_STAG_HMCFNIDX_SHIFT) + +#define I40IW_CQPSQ_STAG_FIRSTPMPBLIDX_SHIFT 0 +#define I40IW_CQPSQ_STAG_FIRSTPMPBLIDX_MASK \ + (0xfffffffUL << I40IW_CQPSQ_STAG_FIRSTPMPBLIDX_SHIFT) + +/* Query stag */ +#define I40IW_CQPSQ_QUERYSTAG_IDX_SHIFT I40IW_CQPSQ_STAG_IDX_SHIFT +#define I40IW_CQPSQ_QUERYSTAG_IDX_MASK I40IW_CQPSQ_STAG_IDX_MASK + +/* Allocate Local IP Address Entry */ + +/* Manage Local IP Address Table - MLIPA */ +#define I40IW_CQPSQ_MLIPA_IPV6LO_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_MLIPA_IPV6LO_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_MLIPA_IPV6HI_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_MLIPA_IPV6HI_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_MLIPA_IPV4_SHIFT 0 +#define I40IW_CQPSQ_MLIPA_IPV4_MASK \ + (0xffffffffUL << I40IW_CQPSQ_MLIPA_IPV4_SHIFT) + +#define I40IW_CQPSQ_MLIPA_IPTABLEIDX_SHIFT 0 +#define I40IW_CQPSQ_MLIPA_IPTABLEIDX_MASK \ + (0x3fUL << I40IW_CQPSQ_MLIPA_IPTABLEIDX_SHIFT) + +#define I40IW_CQPSQ_MLIPA_IPV4VALID_SHIFT 42 +#define I40IW_CQPSQ_MLIPA_IPV4VALID_MASK \ + (1ULL << I40IW_CQPSQ_MLIPA_IPV4VALID_SHIFT) + +#define I40IW_CQPSQ_MLIPA_IPV6VALID_SHIFT 43 +#define I40IW_CQPSQ_MLIPA_IPV6VALID_MASK \ + (1ULL << I40IW_CQPSQ_MLIPA_IPV6VALID_SHIFT) + +#define I40IW_CQPSQ_MLIPA_FREEENTRY_SHIFT 62 +#define I40IW_CQPSQ_MLIPA_FREEENTRY_MASK \ + (1ULL << I40IW_CQPSQ_MLIPA_FREEENTRY_SHIFT) + +#define I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT_SHIFT 61 +#define I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT_MASK \ + (1ULL << I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC0_SHIFT 0 +#define I40IW_CQPSQ_MLIPA_MAC0_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC0_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC1_SHIFT 8 +#define I40IW_CQPSQ_MLIPA_MAC1_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC1_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC2_SHIFT 16 +#define I40IW_CQPSQ_MLIPA_MAC2_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC2_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC3_SHIFT 24 +#define I40IW_CQPSQ_MLIPA_MAC3_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC3_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC4_SHIFT 32 +#define I40IW_CQPSQ_MLIPA_MAC4_MASK (0xffULL << I40IW_CQPSQ_MLIPA_MAC4_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC5_SHIFT 40 +#define I40IW_CQPSQ_MLIPA_MAC5_MASK (0xffULL << I40IW_CQPSQ_MLIPA_MAC5_SHIFT) + +/* Manage ARP Table - MAT */ +#define I40IW_CQPSQ_MAT_REACHMAX_SHIFT 0 +#define I40IW_CQPSQ_MAT_REACHMAX_MASK \ + (0xffffffffUL << I40IW_CQPSQ_MAT_REACHMAX_SHIFT) + +#define I40IW_CQPSQ_MAT_MACADDR_SHIFT 0 +#define I40IW_CQPSQ_MAT_MACADDR_MASK \ + (0xffffffffffffULL << I40IW_CQPSQ_MAT_MACADDR_SHIFT) + +#define I40IW_CQPSQ_MAT_ARPENTRYIDX_SHIFT 0 +#define I40IW_CQPSQ_MAT_ARPENTRYIDX_MASK \ + (0xfffUL << I40IW_CQPSQ_MAT_ARPENTRYIDX_SHIFT) + +#define I40IW_CQPSQ_MAT_ENTRYVALID_SHIFT 42 +#define I40IW_CQPSQ_MAT_ENTRYVALID_MASK \ + (1ULL << I40IW_CQPSQ_MAT_ENTRYVALID_SHIFT) + +#define 
I40IW_CQPSQ_MAT_PERMANENT_SHIFT 43 +#define I40IW_CQPSQ_MAT_PERMANENT_MASK \ + (1ULL << I40IW_CQPSQ_MAT_PERMANENT_SHIFT) + +#define I40IW_CQPSQ_MAT_QUERY_SHIFT 44 +#define I40IW_CQPSQ_MAT_QUERY_MASK (1ULL << I40IW_CQPSQ_MAT_QUERY_SHIFT) + +/* Manage VF PBLE Backing Pages - MVPBP*/ +#define I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT_SHIFT 0 +#define I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT_MASK \ + (0x3ffULL << I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT_SHIFT) + +#define I40IW_CQPSQ_MVPBP_FIRST_PD_INX_SHIFT 16 +#define I40IW_CQPSQ_MVPBP_FIRST_PD_INX_MASK \ + (0x1ffULL << I40IW_CQPSQ_MVPBP_FIRST_PD_INX_SHIFT) + +#define I40IW_CQPSQ_MVPBP_SD_INX_SHIFT 32 +#define I40IW_CQPSQ_MVPBP_SD_INX_MASK \ + (0xfffULL << I40IW_CQPSQ_MVPBP_SD_INX_SHIFT) + +#define I40IW_CQPSQ_MVPBP_INV_PD_ENT_SHIFT 62 +#define I40IW_CQPSQ_MVPBP_INV_PD_ENT_MASK \ + (0x1ULL << I40IW_CQPSQ_MVPBP_INV_PD_ENT_SHIFT) + +#define I40IW_CQPSQ_MVPBP_PD_PLPBA_SHIFT 3 +#define I40IW_CQPSQ_MVPBP_PD_PLPBA_MASK \ + (0x1fffffffffffffffULL << I40IW_CQPSQ_MVPBP_PD_PLPBA_SHIFT) + +/* Manage Push Page - MPP */ +#define I40IW_INVALID_PUSH_PAGE_INDEX 0xffff + +#define I40IW_CQPSQ_MPP_QS_HANDLE_SHIFT 0 +#define I40IW_CQPSQ_MPP_QS_HANDLE_MASK (0xffffUL << \ + I40IW_CQPSQ_MPP_QS_HANDLE_SHIFT) + +#define I40IW_CQPSQ_MPP_PPIDX_SHIFT 0 +#define I40IW_CQPSQ_MPP_PPIDX_MASK (0x3ffUL << I40IW_CQPSQ_MPP_PPIDX_SHIFT) + +#define I40IW_CQPSQ_MPP_FREE_PAGE_SHIFT 62 +#define I40IW_CQPSQ_MPP_FREE_PAGE_MASK (1ULL << I40IW_CQPSQ_MPP_FREE_PAGE_SHIFT) + +/* Upload Context - UCTX */ +#define I40IW_CQPSQ_UCTX_QPCTXADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_UCTX_QPCTXADDR_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_UCTX_QPID_SHIFT 0 +#define I40IW_CQPSQ_UCTX_QPID_MASK (0x3ffffUL << I40IW_CQPSQ_UCTX_QPID_SHIFT) + +#define I40IW_CQPSQ_UCTX_QPTYPE_SHIFT 48 +#define I40IW_CQPSQ_UCTX_QPTYPE_MASK (0xfULL << I40IW_CQPSQ_UCTX_QPTYPE_SHIFT) + +#define I40IW_CQPSQ_UCTX_RAWFORMAT_SHIFT 61 +#define I40IW_CQPSQ_UCTX_RAWFORMAT_MASK \ + (1ULL << I40IW_CQPSQ_UCTX_RAWFORMAT_SHIFT) + +#define I40IW_CQPSQ_UCTX_FREEZEQP_SHIFT 62 +#define I40IW_CQPSQ_UCTX_FREEZEQP_MASK \ + (1ULL << I40IW_CQPSQ_UCTX_FREEZEQP_SHIFT) + +/* Manage HMC PM Function Table - MHMC */ +#define I40IW_CQPSQ_MHMC_VFIDX_SHIFT 0 +#define I40IW_CQPSQ_MHMC_VFIDX_MASK (0x7fUL << I40IW_CQPSQ_MHMC_VFIDX_SHIFT) + +#define I40IW_CQPSQ_MHMC_FREEPMFN_SHIFT 62 +#define I40IW_CQPSQ_MHMC_FREEPMFN_MASK \ + (1ULL << I40IW_CQPSQ_MHMC_FREEPMFN_SHIFT) + +/* Set HMC Resource Profile - SHMCRP */ +#define I40IW_CQPSQ_SHMCRP_HMC_PROFILE_SHIFT 0 +#define I40IW_CQPSQ_SHMCRP_HMC_PROFILE_MASK \ + (0x7ULL << I40IW_CQPSQ_SHMCRP_HMC_PROFILE_SHIFT) +#define I40IW_CQPSQ_SHMCRP_VFNUM_SHIFT 32 +#define I40IW_CQPSQ_SHMCRP_VFNUM_MASK (0x3fULL << I40IW_CQPSQ_SHMCRP_VFNUM_SHIFT) + +/* Create/Destroy CEQ */ +#define I40IW_CQPSQ_CEQ_CEQSIZE_SHIFT 0 +#define I40IW_CQPSQ_CEQ_CEQSIZE_MASK \ + (0x1ffffUL << I40IW_CQPSQ_CEQ_CEQSIZE_SHIFT) + +#define I40IW_CQPSQ_CEQ_CEQID_SHIFT 0 +#define I40IW_CQPSQ_CEQ_CEQID_MASK (0x7fUL << I40IW_CQPSQ_CEQ_CEQID_SHIFT) + +#define I40IW_CQPSQ_CEQ_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT +#define I40IW_CQPSQ_CEQ_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK + +#define I40IW_CQPSQ_CEQ_VMAP_SHIFT 47 +#define I40IW_CQPSQ_CEQ_VMAP_MASK (1ULL << I40IW_CQPSQ_CEQ_VMAP_SHIFT) + +#define I40IW_CQPSQ_CEQ_FIRSTPMPBLIDX_SHIFT 0 +#define I40IW_CQPSQ_CEQ_FIRSTPMPBLIDX_MASK \ + (0xfffffffUL << I40IW_CQPSQ_CEQ_FIRSTPMPBLIDX_SHIFT) + +/* Create/Destroy AEQ */ +#define I40IW_CQPSQ_AEQ_AEQECNT_SHIFT 0 +#define I40IW_CQPSQ_AEQ_AEQECNT_MASK \ + (0x7ffffUL << 
I40IW_CQPSQ_AEQ_AEQECNT_SHIFT) + +#define I40IW_CQPSQ_AEQ_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT +#define I40IW_CQPSQ_AEQ_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK + +#define I40IW_CQPSQ_AEQ_VMAP_SHIFT 47 +#define I40IW_CQPSQ_AEQ_VMAP_MASK (1ULL << I40IW_CQPSQ_AEQ_VMAP_SHIFT) + +#define I40IW_CQPSQ_AEQ_FIRSTPMPBLIDX_SHIFT 0 +#define I40IW_CQPSQ_AEQ_FIRSTPMPBLIDX_MASK \ + (0xfffffffUL << I40IW_CQPSQ_AEQ_FIRSTPMPBLIDX_SHIFT) + +/* Commit FPM Values - CFPM */ +#define I40IW_CQPSQ_CFPM_HMCFNID_SHIFT 0 +#define I40IW_CQPSQ_CFPM_HMCFNID_MASK (0x3fUL << I40IW_CQPSQ_CFPM_HMCFNID_SHIFT) + +/* Flush WQEs - FWQE */ +#define I40IW_CQPSQ_FWQE_AECODE_SHIFT 0 +#define I40IW_CQPSQ_FWQE_AECODE_MASK (0xffffUL << I40IW_CQPSQ_FWQE_AECODE_SHIFT) + +#define I40IW_CQPSQ_FWQE_AESOURCE_SHIFT 16 +#define I40IW_CQPSQ_FWQE_AESOURCE_MASK \ + (0xfUL << I40IW_CQPSQ_FWQE_AESOURCE_SHIFT) + +#define I40IW_CQPSQ_FWQE_RQMNERR_SHIFT 0 +#define I40IW_CQPSQ_FWQE_RQMNERR_MASK \ + (0xffffUL << I40IW_CQPSQ_FWQE_RQMNERR_SHIFT) + +#define I40IW_CQPSQ_FWQE_RQMJERR_SHIFT 16 +#define I40IW_CQPSQ_FWQE_RQMJERR_MASK \ + (0xffffUL << I40IW_CQPSQ_FWQE_RQMJERR_SHIFT) + +#define I40IW_CQPSQ_FWQE_SQMNERR_SHIFT 32 +#define I40IW_CQPSQ_FWQE_SQMNERR_MASK \ + (0xffffULL << I40IW_CQPSQ_FWQE_SQMNERR_SHIFT) + +#define I40IW_CQPSQ_FWQE_SQMJERR_SHIFT 48 +#define I40IW_CQPSQ_FWQE_SQMJERR_MASK \ + (0xffffULL << I40IW_CQPSQ_FWQE_SQMJERR_SHIFT) + +#define I40IW_CQPSQ_FWQE_QPID_SHIFT 0 +#define I40IW_CQPSQ_FWQE_QPID_MASK (0x3ffffULL << I40IW_CQPSQ_FWQE_QPID_SHIFT) + +#define I40IW_CQPSQ_FWQE_GENERATE_AE_SHIFT 59 +#define I40IW_CQPSQ_FWQE_GENERATE_AE_MASK (1ULL << \ + I40IW_CQPSQ_FWQE_GENERATE_AE_SHIFT) + +#define I40IW_CQPSQ_FWQE_USERFLCODE_SHIFT 60 +#define I40IW_CQPSQ_FWQE_USERFLCODE_MASK \ + (1ULL << I40IW_CQPSQ_FWQE_USERFLCODE_SHIFT) + +#define I40IW_CQPSQ_FWQE_FLUSHSQ_SHIFT 61 +#define I40IW_CQPSQ_FWQE_FLUSHSQ_MASK (1ULL << I40IW_CQPSQ_FWQE_FLUSHSQ_SHIFT) + +#define I40IW_CQPSQ_FWQE_FLUSHRQ_SHIFT 62 +#define I40IW_CQPSQ_FWQE_FLUSHRQ_MASK (1ULL << I40IW_CQPSQ_FWQE_FLUSHRQ_SHIFT) + +/* Manage Accelerated Port Table - MAPT */ +#define I40IW_CQPSQ_MAPT_PORT_SHIFT 0 +#define I40IW_CQPSQ_MAPT_PORT_MASK (0xffffUL << I40IW_CQPSQ_MAPT_PORT_SHIFT) + +#define I40IW_CQPSQ_MAPT_ADDPORT_SHIFT 62 +#define I40IW_CQPSQ_MAPT_ADDPORT_MASK (1ULL << I40IW_CQPSQ_MAPT_ADDPORT_SHIFT) + +/* Update Protocol Engine SDs */ +#define I40IW_CQPSQ_UPESD_SDCMD_SHIFT 0 +#define I40IW_CQPSQ_UPESD_SDCMD_MASK (0xffffffffUL << I40IW_CQPSQ_UPESD_SDCMD_SHIFT) + +#define I40IW_CQPSQ_UPESD_SDDATALOW_SHIFT 0 +#define I40IW_CQPSQ_UPESD_SDDATALOW_MASK \ + (0xffffffffUL << I40IW_CQPSQ_UPESD_SDDATALOW_SHIFT) + +#define I40IW_CQPSQ_UPESD_SDDATAHI_SHIFT 32 +#define I40IW_CQPSQ_UPESD_SDDATAHI_MASK \ + (0xffffffffULL << I40IW_CQPSQ_UPESD_SDDATAHI_SHIFT) +#define I40IW_CQPSQ_UPESD_HMCFNID_SHIFT 0 +#define I40IW_CQPSQ_UPESD_HMCFNID_MASK \ + (0x3fUL << I40IW_CQPSQ_UPESD_HMCFNID_SHIFT) + +#define I40IW_CQPSQ_UPESD_ENTRY_VALID_SHIFT 63 +#define I40IW_CQPSQ_UPESD_ENTRY_VALID_MASK \ + ((u64)1 << I40IW_CQPSQ_UPESD_ENTRY_VALID_SHIFT) + +#define I40IW_CQPSQ_UPESD_ENTRY_COUNT_SHIFT 0 +#define I40IW_CQPSQ_UPESD_ENTRY_COUNT_MASK \ + (0xfUL << I40IW_CQPSQ_UPESD_ENTRY_COUNT_SHIFT) + +#define I40IW_CQPSQ_UPESD_SKIP_ENTRY_SHIFT 7 +#define I40IW_CQPSQ_UPESD_SKIP_ENTRY_MASK \ + (0x1UL << I40IW_CQPSQ_UPESD_SKIP_ENTRY_SHIFT) + +/* Suspend QP */ +#define I40IW_CQPSQ_SUSPENDQP_QPID_SHIFT 0 +#define I40IW_CQPSQ_SUSPENDQP_QPID_MASK (0x3FFFFUL) +/* I40IWCQ_QPID_MASK */ + +/* Resume QP */ +#define 
I40IW_CQPSQ_RESUMEQP_QSHANDLE_SHIFT 0 +#define I40IW_CQPSQ_RESUMEQP_QSHANDLE_MASK \ + (0xffffffffUL << I40IW_CQPSQ_RESUMEQP_QSHANDLE_SHIFT) + +#define I40IW_CQPSQ_RESUMEQP_QPID_SHIFT 0 +#define I40IW_CQPSQ_RESUMEQP_QPID_MASK (0x3FFFFUL) +/* I40IWCQ_QPID_MASK */ + +/* IW QP Context */ +#define I40IWQPC_DDP_VER_SHIFT 0 +#define I40IWQPC_DDP_VER_MASK (3UL << I40IWQPC_DDP_VER_SHIFT) + +#define I40IWQPC_SNAP_SHIFT 2 +#define I40IWQPC_SNAP_MASK (1UL << I40IWQPC_SNAP_SHIFT) + +#define I40IWQPC_IPV4_SHIFT 3 +#define I40IWQPC_IPV4_MASK (1UL << I40IWQPC_IPV4_SHIFT) + +#define I40IWQPC_NONAGLE_SHIFT 4 +#define I40IWQPC_NONAGLE_MASK (1UL << I40IWQPC_NONAGLE_SHIFT) + +#define I40IWQPC_INSERTVLANTAG_SHIFT 5 +#define I40IWQPC_INSERTVLANTAG_MASK (1 << I40IWQPC_INSERTVLANTAG_SHIFT) + +#define I40IWQPC_USESRQ_SHIFT 6 +#define I40IWQPC_USESRQ_MASK (1UL << I40IWQPC_USESRQ_SHIFT) + +#define I40IWQPC_TIMESTAMP_SHIFT 7 +#define I40IWQPC_TIMESTAMP_MASK (1UL << I40IWQPC_TIMESTAMP_SHIFT) + +#define I40IWQPC_RQWQESIZE_SHIFT 8 +#define I40IWQPC_RQWQESIZE_MASK (3UL << I40IWQPC_RQWQESIZE_SHIFT) + +#define I40IWQPC_INSERTL2TAG2_SHIFT 11 +#define I40IWQPC_INSERTL2TAG2_MASK (1UL << I40IWQPC_INSERTL2TAG2_SHIFT) + +#define I40IWQPC_LIMIT_SHIFT 12 +#define I40IWQPC_LIMIT_MASK (3UL << I40IWQPC_LIMIT_SHIFT) + +#define I40IWQPC_DROPOOOSEG_SHIFT 15 +#define I40IWQPC_DROPOOOSEG_MASK (1UL << I40IWQPC_DROPOOOSEG_SHIFT) + +#define I40IWQPC_DUPACK_THRESH_SHIFT 16 +#define I40IWQPC_DUPACK_THRESH_MASK (7UL << I40IWQPC_DUPACK_THRESH_SHIFT) + +#define I40IWQPC_ERR_RQ_IDX_VALID_SHIFT 19 +#define I40IWQPC_ERR_RQ_IDX_VALID_MASK (1UL << I40IWQPC_ERR_RQ_IDX_VALID_SHIFT) + +#define I40IWQPC_DIS_VLAN_CHECKS_SHIFT 19 +#define I40IWQPC_DIS_VLAN_CHECKS_MASK (7UL << I40IWQPC_DIS_VLAN_CHECKS_SHIFT) + +#define I40IWQPC_RCVTPHEN_SHIFT 28 +#define I40IWQPC_RCVTPHEN_MASK (1UL << I40IWQPC_RCVTPHEN_SHIFT) + +#define I40IWQPC_XMITTPHEN_SHIFT 29 +#define I40IWQPC_XMITTPHEN_MASK (1ULL << I40IWQPC_XMITTPHEN_SHIFT) + +#define I40IWQPC_RQTPHEN_SHIFT 30 +#define I40IWQPC_RQTPHEN_MASK (1UL << I40IWQPC_RQTPHEN_SHIFT) + +#define I40IWQPC_SQTPHEN_SHIFT 31 +#define I40IWQPC_SQTPHEN_MASK (1ULL << I40IWQPC_SQTPHEN_SHIFT) + +#define I40IWQPC_PPIDX_SHIFT 32 +#define I40IWQPC_PPIDX_MASK (0x3ffULL << I40IWQPC_PPIDX_SHIFT) + +#define I40IWQPC_PMENA_SHIFT 47 +#define I40IWQPC_PMENA_MASK (1ULL << I40IWQPC_PMENA_SHIFT) + +#define I40IWQPC_RDMAP_VER_SHIFT 62 +#define I40IWQPC_RDMAP_VER_MASK (3ULL << I40IWQPC_RDMAP_VER_SHIFT) + +#define I40IWQPC_SQADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPC_SQADDR_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPC_RQADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPC_RQADDR_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPC_TTL_SHIFT 0 +#define I40IWQPC_TTL_MASK (0xffUL << I40IWQPC_TTL_SHIFT) + +#define I40IWQPC_RQSIZE_SHIFT 8 +#define I40IWQPC_RQSIZE_MASK (0xfUL << I40IWQPC_RQSIZE_SHIFT) + +#define I40IWQPC_SQSIZE_SHIFT 12 +#define I40IWQPC_SQSIZE_MASK (0xfUL << I40IWQPC_SQSIZE_SHIFT) + +#define I40IWQPC_SRCMACADDRIDX_SHIFT 16 +#define I40IWQPC_SRCMACADDRIDX_MASK (0x3fUL << I40IWQPC_SRCMACADDRIDX_SHIFT) + +#define I40IWQPC_AVOIDSTRETCHACK_SHIFT 23 +#define I40IWQPC_AVOIDSTRETCHACK_MASK (1UL << I40IWQPC_AVOIDSTRETCHACK_SHIFT) + +#define I40IWQPC_TOS_SHIFT 24 +#define I40IWQPC_TOS_MASK (0xffUL << I40IWQPC_TOS_SHIFT) + +#define I40IWQPC_SRCPORTNUM_SHIFT 32 +#define I40IWQPC_SRCPORTNUM_MASK (0xffffULL << I40IWQPC_SRCPORTNUM_SHIFT) + +#define I40IWQPC_DESTPORTNUM_SHIFT 48 +#define I40IWQPC_DESTPORTNUM_MASK (0xffffULL << 
I40IWQPC_DESTPORTNUM_SHIFT) + +#define I40IWQPC_DESTIPADDR0_SHIFT 32 +#define I40IWQPC_DESTIPADDR0_MASK \ + (0xffffffffULL << I40IWQPC_DESTIPADDR0_SHIFT) + +#define I40IWQPC_DESTIPADDR1_SHIFT 0 +#define I40IWQPC_DESTIPADDR1_MASK \ + (0xffffffffULL << I40IWQPC_DESTIPADDR1_SHIFT) + +#define I40IWQPC_DESTIPADDR2_SHIFT 32 +#define I40IWQPC_DESTIPADDR2_MASK \ + (0xffffffffULL << I40IWQPC_DESTIPADDR2_SHIFT) + +#define I40IWQPC_DESTIPADDR3_SHIFT 0 +#define I40IWQPC_DESTIPADDR3_MASK \ + (0xffffffffULL << I40IWQPC_DESTIPADDR3_SHIFT) + +#define I40IWQPC_SNDMSS_SHIFT 16 +#define I40IWQPC_SNDMSS_MASK (0x3fffUL << I40IWQPC_SNDMSS_SHIFT) + +#define I40IW_UDA_QPC_MAXFRAMESIZE_SHIFT 16 +#define I40IW_UDA_QPC_MAXFRAMESIZE_MASK (0x3fffUL << I40IW_UDA_QPC_MAXFRAMESIZE_SHIFT) + +#define I40IWQPC_VLANTAG_SHIFT 32 +#define I40IWQPC_VLANTAG_MASK (0xffffULL << I40IWQPC_VLANTAG_SHIFT) + +#define I40IWQPC_ARPIDX_SHIFT 48 +#define I40IWQPC_ARPIDX_MASK (0xffffULL << I40IWQPC_ARPIDX_SHIFT) + +#define I40IWQPC_FLOWLABEL_SHIFT 0 +#define I40IWQPC_FLOWLABEL_MASK (0xfffffUL << I40IWQPC_FLOWLABEL_SHIFT) + +#define I40IWQPC_WSCALE_SHIFT 20 +#define I40IWQPC_WSCALE_MASK (1UL << I40IWQPC_WSCALE_SHIFT) + +#define I40IWQPC_KEEPALIVE_SHIFT 21 +#define I40IWQPC_KEEPALIVE_MASK (1UL << I40IWQPC_KEEPALIVE_SHIFT) + +#define I40IWQPC_IGNORE_TCP_OPT_SHIFT 22 +#define I40IWQPC_IGNORE_TCP_OPT_MASK (1UL << I40IWQPC_IGNORE_TCP_OPT_SHIFT) + +#define I40IWQPC_IGNORE_TCP_UNS_OPT_SHIFT 23 +#define I40IWQPC_IGNORE_TCP_UNS_OPT_MASK \ + (1UL << I40IWQPC_IGNORE_TCP_UNS_OPT_SHIFT) + +#define I40IWQPC_TCPSTATE_SHIFT 28 +#define I40IWQPC_TCPSTATE_MASK (0xfUL << I40IWQPC_TCPSTATE_SHIFT) + +#define I40IWQPC_RCVSCALE_SHIFT 32 +#define I40IWQPC_RCVSCALE_MASK (0xfULL << I40IWQPC_RCVSCALE_SHIFT) + +#define I40IWQPC_SNDSCALE_SHIFT 40 +#define I40IWQPC_SNDSCALE_MASK (0xfULL << I40IWQPC_SNDSCALE_SHIFT) + +#define I40IWQPC_PDIDX_SHIFT 48 +#define I40IWQPC_PDIDX_MASK (0x7fffULL << I40IWQPC_PDIDX_SHIFT) + +#define I40IWQPC_KALIVE_TIMER_MAX_PROBES_SHIFT 16 +#define I40IWQPC_KALIVE_TIMER_MAX_PROBES_MASK \ + (0xffUL << I40IWQPC_KALIVE_TIMER_MAX_PROBES_SHIFT) + +#define I40IWQPC_KEEPALIVE_INTERVAL_SHIFT 24 +#define I40IWQPC_KEEPALIVE_INTERVAL_MASK \ + (0xffUL << I40IWQPC_KEEPALIVE_INTERVAL_SHIFT) + +#define I40IWQPC_TIMESTAMP_RECENT_SHIFT 0 +#define I40IWQPC_TIMESTAMP_RECENT_MASK \ + (0xffffffffUL << I40IWQPC_TIMESTAMP_RECENT_SHIFT) + +#define I40IWQPC_TIMESTAMP_AGE_SHIFT 32 +#define I40IWQPC_TIMESTAMP_AGE_MASK \ + (0xffffffffULL << I40IWQPC_TIMESTAMP_AGE_SHIFT) + +#define I40IWQPC_SNDNXT_SHIFT 0 +#define I40IWQPC_SNDNXT_MASK (0xffffffffUL << I40IWQPC_SNDNXT_SHIFT) + +#define I40IWQPC_SNDWND_SHIFT 32 +#define I40IWQPC_SNDWND_MASK (0xffffffffULL << I40IWQPC_SNDWND_SHIFT) + +#define I40IWQPC_RCVNXT_SHIFT 0 +#define I40IWQPC_RCVNXT_MASK (0xffffffffUL << I40IWQPC_RCVNXT_SHIFT) + +#define I40IWQPC_RCVWND_SHIFT 32 +#define I40IWQPC_RCVWND_MASK (0xffffffffULL << I40IWQPC_RCVWND_SHIFT) + +#define I40IWQPC_SNDMAX_SHIFT 0 +#define I40IWQPC_SNDMAX_MASK (0xffffffffUL << I40IWQPC_SNDMAX_SHIFT) + +#define I40IWQPC_SNDUNA_SHIFT 32 +#define I40IWQPC_SNDUNA_MASK (0xffffffffULL << I40IWQPC_SNDUNA_SHIFT) + +#define I40IWQPC_SRTT_SHIFT 0 +#define I40IWQPC_SRTT_MASK (0xffffffffUL << I40IWQPC_SRTT_SHIFT) + +#define I40IWQPC_RTTVAR_SHIFT 32 +#define I40IWQPC_RTTVAR_MASK (0xffffffffULL << I40IWQPC_RTTVAR_SHIFT) + +#define I40IWQPC_SSTHRESH_SHIFT 0 +#define I40IWQPC_SSTHRESH_MASK (0xffffffffUL << I40IWQPC_SSTHRESH_SHIFT) + +#define I40IWQPC_CWND_SHIFT 32 +#define I40IWQPC_CWND_MASK 
(0xffffffffULL << I40IWQPC_CWND_SHIFT) + +#define I40IWQPC_SNDWL1_SHIFT 0 +#define I40IWQPC_SNDWL1_MASK (0xffffffffUL << I40IWQPC_SNDWL1_SHIFT) + +#define I40IWQPC_SNDWL2_SHIFT 32 +#define I40IWQPC_SNDWL2_MASK (0xffffffffULL << I40IWQPC_SNDWL2_SHIFT) + +#define I40IWQPC_ERR_RQ_IDX_SHIFT 32 +#define I40IWQPC_ERR_RQ_IDX_MASK (0x3fffULL << I40IWQPC_ERR_RQ_IDX_SHIFT) + +#define I40IWQPC_MAXSNDWND_SHIFT 0 +#define I40IWQPC_MAXSNDWND_MASK (0xffffffffUL << I40IWQPC_MAXSNDWND_SHIFT) + +#define I40IWQPC_REXMIT_THRESH_SHIFT 48 +#define I40IWQPC_REXMIT_THRESH_MASK (0x3fULL << I40IWQPC_REXMIT_THRESH_SHIFT) + +#define I40IWQPC_TXCQNUM_SHIFT 0 +#define I40IWQPC_TXCQNUM_MASK (0x1ffffUL << I40IWQPC_TXCQNUM_SHIFT) + +#define I40IWQPC_RXCQNUM_SHIFT 32 +#define I40IWQPC_RXCQNUM_MASK (0x1ffffULL << I40IWQPC_RXCQNUM_SHIFT) + +#define I40IWQPC_STAT_INDEX_SHIFT 0 +#define I40IWQPC_STAT_INDEX_MASK (0x1fULL << I40IWQPC_STAT_INDEX_SHIFT) + +#define I40IWQPC_Q2ADDR_SHIFT 0 +#define I40IWQPC_Q2ADDR_MASK (0xffffffffffffff00ULL << I40IWQPC_Q2ADDR_SHIFT) + +#define I40IWQPC_LASTBYTESENT_SHIFT 0 +#define I40IWQPC_LASTBYTESENT_MASK (0xffUL << I40IWQPC_LASTBYTESENT_SHIFT) + +#define I40IWQPC_SRQID_SHIFT 32 +#define I40IWQPC_SRQID_MASK (0xffULL << I40IWQPC_SRQID_SHIFT) + +#define I40IWQPC_ORDSIZE_SHIFT 0 +#define I40IWQPC_ORDSIZE_MASK (0x7fUL << I40IWQPC_ORDSIZE_SHIFT) + +#define I40IWQPC_IRDSIZE_SHIFT 16 +#define I40IWQPC_IRDSIZE_MASK (0x3UL << I40IWQPC_IRDSIZE_SHIFT) + +#define I40IWQPC_WRRDRSPOK_SHIFT 20 +#define I40IWQPC_WRRDRSPOK_MASK (1UL << I40IWQPC_WRRDRSPOK_SHIFT) + +#define I40IWQPC_RDOK_SHIFT 21 +#define I40IWQPC_RDOK_MASK (1UL << I40IWQPC_RDOK_SHIFT) + +#define I40IWQPC_SNDMARKERS_SHIFT 22 +#define I40IWQPC_SNDMARKERS_MASK (1UL << I40IWQPC_SNDMARKERS_SHIFT) + +#define I40IWQPC_BINDEN_SHIFT 23 +#define I40IWQPC_BINDEN_MASK (1UL << I40IWQPC_BINDEN_SHIFT) + +#define I40IWQPC_FASTREGEN_SHIFT 24 +#define I40IWQPC_FASTREGEN_MASK (1UL << I40IWQPC_FASTREGEN_SHIFT) + +#define I40IWQPC_PRIVEN_SHIFT 25 +#define I40IWQPC_PRIVEN_MASK (1UL << I40IWQPC_PRIVEN_SHIFT) + +#define I40IWQPC_USESTATSINSTANCE_SHIFT 26 +#define I40IWQPC_USESTATSINSTANCE_MASK (1UL << I40IWQPC_USESTATSINSTANCE_SHIFT) + +#define I40IWQPC_IWARPMODE_SHIFT 28 +#define I40IWQPC_IWARPMODE_MASK (1UL << I40IWQPC_IWARPMODE_SHIFT) + +#define I40IWQPC_RCVMARKERS_SHIFT 29 +#define I40IWQPC_RCVMARKERS_MASK (1UL << I40IWQPC_RCVMARKERS_SHIFT) + +#define I40IWQPC_ALIGNHDRS_SHIFT 30 +#define I40IWQPC_ALIGNHDRS_MASK (1UL << I40IWQPC_ALIGNHDRS_SHIFT) + +#define I40IWQPC_RCVNOMPACRC_SHIFT 31 +#define I40IWQPC_RCVNOMPACRC_MASK (1UL << I40IWQPC_RCVNOMPACRC_SHIFT) + +#define I40IWQPC_RCVMARKOFFSET_SHIFT 33 +#define I40IWQPC_RCVMARKOFFSET_MASK (0x1ffULL << I40IWQPC_RCVMARKOFFSET_SHIFT) + +#define I40IWQPC_SNDMARKOFFSET_SHIFT 48 +#define I40IWQPC_SNDMARKOFFSET_MASK (0x1ffULL << I40IWQPC_SNDMARKOFFSET_SHIFT) + +#define I40IWQPC_QPCOMPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPC_QPCOMPCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPC_SQTPHVAL_SHIFT 0 +#define I40IWQPC_SQTPHVAL_MASK (0xffUL << I40IWQPC_SQTPHVAL_SHIFT) + +#define I40IWQPC_RQTPHVAL_SHIFT 8 +#define I40IWQPC_RQTPHVAL_MASK (0xffUL << I40IWQPC_RQTPHVAL_SHIFT) + +#define I40IWQPC_QSHANDLE_SHIFT 16 +#define I40IWQPC_QSHANDLE_MASK (0x3ffUL << I40IWQPC_QSHANDLE_SHIFT) + +#define I40IWQPC_EXCEPTION_LAN_QUEUE_SHIFT 32 +#define I40IWQPC_EXCEPTION_LAN_QUEUE_MASK (0xfffULL << \ + I40IWQPC_EXCEPTION_LAN_QUEUE_SHIFT) + +#define I40IWQPC_LOCAL_IPADDR3_SHIFT 0 +#define I40IWQPC_LOCAL_IPADDR3_MASK \ + (0xffffffffUL << 
I40IWQPC_LOCAL_IPADDR3_SHIFT) + +#define I40IWQPC_LOCAL_IPADDR2_SHIFT 32 +#define I40IWQPC_LOCAL_IPADDR2_MASK \ + (0xffffffffULL << I40IWQPC_LOCAL_IPADDR2_SHIFT) + +#define I40IWQPC_LOCAL_IPADDR1_SHIFT 0 +#define I40IWQPC_LOCAL_IPADDR1_MASK \ + (0xffffffffUL << I40IWQPC_LOCAL_IPADDR1_SHIFT) + +#define I40IWQPC_LOCAL_IPADDR0_SHIFT 32 +#define I40IWQPC_LOCAL_IPADDR0_MASK \ + (0xffffffffULL << I40IWQPC_LOCAL_IPADDR0_SHIFT) + +/* wqe size considering 32 bytes per wqe*/ +#define I40IW_QP_SW_MIN_WQSIZE 4 /*in WRs*/ +#define I40IW_SQ_RSVD 2 +#define I40IW_RQ_RSVD 1 +#define I40IW_MAX_QUANTAS_PER_WR 2 +#define I40IW_QP_SW_MAX_SQ_QUANTAS 2048 +#define I40IW_QP_SW_MAX_RQ_QUANTAS 16384 +#define I40IW_MAX_QP_WRS ((I40IW_QP_SW_MAX_SQ_QUANTAS / I40IW_MAX_QUANTAS_PER_WR) - 1) + +#define I40IWQP_OP_RDMA_WRITE 0 +#define I40IWQP_OP_RDMA_READ 1 +#define I40IWQP_OP_RDMA_SEND 3 +#define I40IWQP_OP_RDMA_SEND_INV 4 +#define I40IWQP_OP_RDMA_SEND_SOL_EVENT 5 +#define I40IWQP_OP_RDMA_SEND_SOL_EVENT_INV 6 +#define I40IWQP_OP_BIND_MW 8 +#define I40IWQP_OP_FAST_REGISTER 9 +#define I40IWQP_OP_LOCAL_INVALIDATE 10 +#define I40IWQP_OP_RDMA_READ_LOC_INV 11 +#define I40IWQP_OP_NOP 12 + +#define I40IW_RSVD_SHIFT 41 +#define I40IW_RSVD_MASK (0x7fffULL << I40IW_RSVD_SHIFT) + +/* iwarp QP SQ WQE common fields */ +#define I40IWQPSQ_OPCODE_SHIFT 32 +#define I40IWQPSQ_OPCODE_MASK (0x3fULL << I40IWQPSQ_OPCODE_SHIFT) + +#define I40IWQPSQ_ADDFRAGCNT_SHIFT 38 +#define I40IWQPSQ_ADDFRAGCNT_MASK (0x7ULL << I40IWQPSQ_ADDFRAGCNT_SHIFT) + +#define I40IWQPSQ_PUSHWQE_SHIFT 56 +#define I40IWQPSQ_PUSHWQE_MASK (1ULL << I40IWQPSQ_PUSHWQE_SHIFT) + +#define I40IWQPSQ_STREAMMODE_SHIFT 58 +#define I40IWQPSQ_STREAMMODE_MASK (1ULL << I40IWQPSQ_STREAMMODE_SHIFT) + +#define I40IWQPSQ_WAITFORRCVPDU_SHIFT 59 +#define I40IWQPSQ_WAITFORRCVPDU_MASK (1ULL << I40IWQPSQ_WAITFORRCVPDU_SHIFT) + +#define I40IWQPSQ_READFENCE_SHIFT 60 +#define I40IWQPSQ_READFENCE_MASK (1ULL << I40IWQPSQ_READFENCE_SHIFT) + +#define I40IWQPSQ_LOCALFENCE_SHIFT 61 +#define I40IWQPSQ_LOCALFENCE_MASK (1ULL << I40IWQPSQ_LOCALFENCE_SHIFT) + +#define I40IWQPSQ_SIGCOMPL_SHIFT 62 +#define I40IWQPSQ_SIGCOMPL_MASK (1ULL << I40IWQPSQ_SIGCOMPL_SHIFT) + +#define I40IWQPSQ_VALID_SHIFT 63 +#define I40IWQPSQ_VALID_MASK (1ULL << I40IWQPSQ_VALID_SHIFT) + +#define I40IWQPSQ_FRAG_TO_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPSQ_FRAG_TO_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPSQ_FRAG_LEN_SHIFT 0 +#define I40IWQPSQ_FRAG_LEN_MASK (0xffffffffUL << I40IWQPSQ_FRAG_LEN_SHIFT) + +#define I40IWQPSQ_FRAG_STAG_SHIFT 32 +#define I40IWQPSQ_FRAG_STAG_MASK (0xffffffffULL << I40IWQPSQ_FRAG_STAG_SHIFT) + +#define I40IWQPSQ_REMSTAGINV_SHIFT 0 +#define I40IWQPSQ_REMSTAGINV_MASK (0xffffffffUL << I40IWQPSQ_REMSTAGINV_SHIFT) + +#define I40IWQPSQ_INLINEDATAFLAG_SHIFT 57 +#define I40IWQPSQ_INLINEDATAFLAG_MASK (1ULL << I40IWQPSQ_INLINEDATAFLAG_SHIFT) + +#define I40IWQPSQ_INLINEDATALEN_SHIFT 48 +#define I40IWQPSQ_INLINEDATALEN_MASK \ + (0x7fULL << I40IWQPSQ_INLINEDATALEN_SHIFT) + +/* iwarp send with push mode */ +#define I40IWQPSQ_WQDESCIDX_SHIFT 0 +#define I40IWQPSQ_WQDESCIDX_MASK (0x3fffUL << I40IWQPSQ_WQDESCIDX_SHIFT) + +/* rdma write */ +#define I40IWQPSQ_REMSTAG_SHIFT 0 +#define I40IWQPSQ_REMSTAG_MASK (0xffffffffUL << I40IWQPSQ_REMSTAG_SHIFT) + +#define I40IWQPSQ_REMTO_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPSQ_REMTO_MASK I40IW_CQPHC_QPCTX_MASK + +/* memory window */ +#define I40IWQPSQ_STAGRIGHTS_SHIFT 48 +#define I40IWQPSQ_STAGRIGHTS_MASK (0x1fULL << I40IWQPSQ_STAGRIGHTS_SHIFT) + +#define 
I40IWQPSQ_VABASEDTO_SHIFT 53 +#define I40IWQPSQ_VABASEDTO_MASK (1ULL << I40IWQPSQ_VABASEDTO_SHIFT) + +#define I40IWQPSQ_MWLEN_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPSQ_MWLEN_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPSQ_PARENTMRSTAG_SHIFT 0 +#define I40IWQPSQ_PARENTMRSTAG_MASK \ + (0xffffffffUL << I40IWQPSQ_PARENTMRSTAG_SHIFT) + +#define I40IWQPSQ_MWSTAG_SHIFT 32 +#define I40IWQPSQ_MWSTAG_MASK (0xffffffffULL << I40IWQPSQ_MWSTAG_SHIFT) + +#define I40IWQPSQ_BASEVA_TO_FBO_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPSQ_BASEVA_TO_FBO_MASK I40IW_CQPHC_QPCTX_MASK + +/* Local Invalidate */ +#define I40IWQPSQ_LOCSTAG_SHIFT 32 +#define I40IWQPSQ_LOCSTAG_MASK (0xffffffffULL << I40IWQPSQ_LOCSTAG_SHIFT) + +/* Fast Register */ +#define I40IWQPSQ_STAGKEY_SHIFT 0 +#define I40IWQPSQ_STAGKEY_MASK (0xffUL << I40IWQPSQ_STAGKEY_SHIFT) + +#define I40IWQPSQ_STAGINDEX_SHIFT 8 +#define I40IWQPSQ_STAGINDEX_MASK (0xffffffUL << I40IWQPSQ_STAGINDEX_SHIFT) + +#define I40IWQPSQ_COPYHOSTPBLS_SHIFT 43 +#define I40IWQPSQ_COPYHOSTPBLS_MASK (1ULL << I40IWQPSQ_COPYHOSTPBLS_SHIFT) + +#define I40IWQPSQ_LPBLSIZE_SHIFT 44 +#define I40IWQPSQ_LPBLSIZE_MASK (3ULL << I40IWQPSQ_LPBLSIZE_SHIFT) + +#define I40IWQPSQ_HPAGESIZE_SHIFT 46 +#define I40IWQPSQ_HPAGESIZE_MASK (3ULL << I40IWQPSQ_HPAGESIZE_SHIFT) + +#define I40IWQPSQ_STAGLEN_SHIFT 0 +#define I40IWQPSQ_STAGLEN_MASK (0x1ffffffffffULL << I40IWQPSQ_STAGLEN_SHIFT) + +#define I40IWQPSQ_FIRSTPMPBLIDXLO_SHIFT 48 +#define I40IWQPSQ_FIRSTPMPBLIDXLO_MASK \ + (0xffffULL << I40IWQPSQ_FIRSTPMPBLIDXLO_SHIFT) + +#define I40IWQPSQ_FIRSTPMPBLIDXHI_SHIFT 0 +#define I40IWQPSQ_FIRSTPMPBLIDXHI_MASK \ + (0xfffUL << I40IWQPSQ_FIRSTPMPBLIDXHI_SHIFT) + +#define I40IWQPSQ_PBLADDR_SHIFT 12 +#define I40IWQPSQ_PBLADDR_MASK (0xfffffffffffffULL << I40IWQPSQ_PBLADDR_SHIFT) + +/* iwarp QP RQ WQE common fields */ +#define I40IWQPRQ_ADDFRAGCNT_SHIFT I40IWQPSQ_ADDFRAGCNT_SHIFT +#define I40IWQPRQ_ADDFRAGCNT_MASK I40IWQPSQ_ADDFRAGCNT_MASK + +#define I40IWQPRQ_VALID_SHIFT I40IWQPSQ_VALID_SHIFT +#define I40IWQPRQ_VALID_MASK I40IWQPSQ_VALID_MASK + +#define I40IWQPRQ_COMPLCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPRQ_COMPLCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPRQ_FRAG_LEN_SHIFT I40IWQPSQ_FRAG_LEN_SHIFT +#define I40IWQPRQ_FRAG_LEN_MASK I40IWQPSQ_FRAG_LEN_MASK + +#define I40IWQPRQ_STAG_SHIFT I40IWQPSQ_FRAG_STAG_SHIFT +#define I40IWQPRQ_STAG_MASK I40IWQPSQ_FRAG_STAG_MASK + +#define I40IWQPRQ_TO_SHIFT I40IWQPSQ_FRAG_TO_SHIFT +#define I40IWQPRQ_TO_MASK I40IWQPSQ_FRAG_TO_MASK + +/* Query FPM CQP buf */ +#define I40IW_QUERY_FPM_MAX_QPS_SHIFT 0 +#define I40IW_QUERY_FPM_MAX_QPS_MASK \ + (0x7ffffUL << I40IW_QUERY_FPM_MAX_QPS_SHIFT) + +#define I40IW_QUERY_FPM_MAX_CQS_SHIFT 0 +#define I40IW_QUERY_FPM_MAX_CQS_MASK \ + (0x3ffffUL << I40IW_QUERY_FPM_MAX_CQS_SHIFT) + +#define I40IW_QUERY_FPM_FIRST_PE_SD_INDEX_SHIFT 0 +#define I40IW_QUERY_FPM_FIRST_PE_SD_INDEX_MASK \ + (0x3fffUL << I40IW_QUERY_FPM_FIRST_PE_SD_INDEX_SHIFT) + +#define I40IW_QUERY_FPM_MAX_PE_SDS_SHIFT 32 +#define I40IW_QUERY_FPM_MAX_PE_SDS_MASK \ + (0x3fffULL << I40IW_QUERY_FPM_MAX_PE_SDS_SHIFT) + +#define I40IW_QUERY_FPM_MAX_QPS_SHIFT 0 +#define I40IW_QUERY_FPM_MAX_QPS_MASK \ + (0x7ffffUL << I40IW_QUERY_FPM_MAX_QPS_SHIFT) + +#define I40IW_QUERY_FPM_MAX_CQS_SHIFT 0 +#define I40IW_QUERY_FPM_MAX_CQS_MASK \ + (0x3ffffUL << I40IW_QUERY_FPM_MAX_CQS_SHIFT) + +#define I40IW_QUERY_FPM_MAX_CEQS_SHIFT 0 +#define I40IW_QUERY_FPM_MAX_CEQS_MASK \ + (0xffUL << I40IW_QUERY_FPM_MAX_CEQS_SHIFT) + +#define I40IW_QUERY_FPM_XFBLOCKSIZE_SHIFT 32 +#define 
I40IW_QUERY_FPM_XFBLOCKSIZE_MASK \ + (0xffffffffULL << I40IW_QUERY_FPM_XFBLOCKSIZE_SHIFT) + +#define I40IW_QUERY_FPM_Q1BLOCKSIZE_SHIFT 32 +#define I40IW_QUERY_FPM_Q1BLOCKSIZE_MASK \ + (0xffffffffULL << I40IW_QUERY_FPM_Q1BLOCKSIZE_SHIFT) + +#define I40IW_QUERY_FPM_HTMULTIPLIER_SHIFT 16 +#define I40IW_QUERY_FPM_HTMULTIPLIER_MASK \ + (0xfUL << I40IW_QUERY_FPM_HTMULTIPLIER_SHIFT) + +#define I40IW_QUERY_FPM_TIMERBUCKET_SHIFT 32 +#define I40IW_QUERY_FPM_TIMERBUCKET_MASK \ + (0xffFFULL << I40IW_QUERY_FPM_TIMERBUCKET_SHIFT) + +/* Static HMC pages allocated buf */ +#define I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID_SHIFT 0 +#define I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID_MASK \ + (0x3fUL << I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID_SHIFT) + +#define I40IW_HW_PAGE_SIZE 4096 +#define I40IW_DONE_COUNT 1000 +#define I40IW_SLEEP_COUNT 10 + +enum { + I40IW_QUEUES_ALIGNMENT_MASK = (128 - 1), + I40IW_AEQ_ALIGNMENT_MASK = (256 - 1), + I40IW_Q2_ALIGNMENT_MASK = (256 - 1), + I40IW_CEQ_ALIGNMENT_MASK = (256 - 1), + I40IW_CQ0_ALIGNMENT_MASK = (256 - 1), + I40IW_HOST_CTX_ALIGNMENT_MASK = (4 - 1), + I40IW_SHADOWAREA_MASK = (128 - 1), + I40IW_FPM_QUERY_BUF_ALIGNMENT_MASK = (4 - 1), + I40IW_FPM_COMMIT_BUF_ALIGNMENT_MASK = (4 - 1) +}; + +enum i40iw_alignment { + I40IW_CQP_ALIGNMENT = 0x200, + I40IW_AEQ_ALIGNMENT = 0x100, + I40IW_CEQ_ALIGNMENT = 0x100, + I40IW_CQ0_ALIGNMENT = 0x100, + I40IW_SD_BUF_ALIGNMENT = 0x80 +}; + +#define I40IW_WQE_SIZE_64 64 + +#define I40IW_QP_WQE_MIN_SIZE 32 +#define I40IW_QP_WQE_MAX_SIZE 128 + +#define I40IW_UPDATE_SD_BUF_SIZE 128 + +#define I40IW_CQE_QTYPE_RQ 0 +#define I40IW_CQE_QTYPE_SQ 1 + +#define I40IW_RING_INIT(_ring, _size) \ + { \ + (_ring).head = 0; \ + (_ring).tail = 0; \ + (_ring).size = (_size); \ + } +#define I40IW_RING_GETSIZE(_ring) ((_ring).size) +#define I40IW_RING_GETCURRENT_HEAD(_ring) ((_ring).head) +#define I40IW_RING_GETCURRENT_TAIL(_ring) ((_ring).tail) + +#define I40IW_RING_MOVE_HEAD(_ring, _retcode) \ + { \ + register u32 size; \ + size = (_ring).size; \ + if (!I40IW_RING_FULL_ERR(_ring)) { \ + (_ring).head = ((_ring).head + 1) % size; \ + (_retcode) = 0; \ + } else { \ + (_retcode) = I40IW_ERR_RING_FULL; \ + } \ + } + +#define I40IW_RING_MOVE_HEAD_BY_COUNT(_ring, _count, _retcode) \ + { \ + register u32 size; \ + size = (_ring).size; \ + if ((I40IW_RING_WORK_AVAILABLE(_ring) + (_count)) < size) { \ + (_ring).head = ((_ring).head + (_count)) % size; \ + (_retcode) = 0; \ + } else { \ + (_retcode) = I40IW_ERR_RING_FULL; \ + } \ + } + +#define I40IW_RING_MOVE_TAIL(_ring) \ + (_ring).tail = ((_ring).tail + 1) % (_ring).size + +#define I40IW_RING_MOVE_HEAD_NOCHECK(_ring) \ + (_ring).head = ((_ring).head + 1) % (_ring).size + +#define I40IW_RING_MOVE_TAIL_BY_COUNT(_ring, _count) \ + (_ring).tail = ((_ring).tail + (_count)) % (_ring).size + +#define I40IW_RING_SET_TAIL(_ring, _pos) \ + (_ring).tail = (_pos) % (_ring).size + +#define I40IW_RING_FULL_ERR(_ring) \ + ( \ + (I40IW_RING_WORK_AVAILABLE(_ring) == ((_ring).size - 1)) \ + ) + +#define I40IW_ERR_RING_FULL2(_ring) \ + ( \ + (I40IW_RING_WORK_AVAILABLE(_ring) == ((_ring).size - 2)) \ + ) + +#define I40IW_ERR_RING_FULL3(_ring) \ + ( \ + (I40IW_RING_WORK_AVAILABLE(_ring) == ((_ring).size - 3)) \ + ) + +#define I40IW_RING_MORE_WORK(_ring) \ + ( \ + (I40IW_RING_WORK_AVAILABLE(_ring) != 0) \ + ) + +#define I40IW_RING_WORK_AVAILABLE(_ring) \ + ( \ + (((_ring).head + (_ring).size - (_ring).tail) % (_ring).size) \ + ) + +#define I40IW_RING_GET_WQES_AVAILABLE(_ring) \ + ( \ + ((_ring).size - I40IW_RING_WORK_AVAILABLE(_ring) - 1) \ + ) + 
+#define I40IW_ATOMIC_RING_MOVE_HEAD(_ring, index, _retcode) \ + { \ + index = I40IW_RING_GETCURRENT_HEAD(_ring); \ + I40IW_RING_MOVE_HEAD(_ring, _retcode); \ + } + +/* Async Events codes */ +#define I40IW_AE_AMP_UNALLOCATED_STAG 0x0102 +#define I40IW_AE_AMP_INVALID_STAG 0x0103 +#define I40IW_AE_AMP_BAD_QP 0x0104 +#define I40IW_AE_AMP_BAD_PD 0x0105 +#define I40IW_AE_AMP_BAD_STAG_KEY 0x0106 +#define I40IW_AE_AMP_BAD_STAG_INDEX 0x0107 +#define I40IW_AE_AMP_BOUNDS_VIOLATION 0x0108 +#define I40IW_AE_AMP_RIGHTS_VIOLATION 0x0109 +#define I40IW_AE_AMP_TO_WRAP 0x010a +#define I40IW_AE_AMP_FASTREG_SHARED 0x010b +#define I40IW_AE_AMP_FASTREG_VALID_STAG 0x010c +#define I40IW_AE_AMP_FASTREG_MW_STAG 0x010d +#define I40IW_AE_AMP_FASTREG_INVALID_RIGHTS 0x010e +#define I40IW_AE_AMP_FASTREG_PBL_TABLE_OVERFLOW 0x010f +#define I40IW_AE_AMP_FASTREG_INVALID_LENGTH 0x0110 +#define I40IW_AE_AMP_INVALIDATE_SHARED 0x0111 +#define I40IW_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS 0x0112 +#define I40IW_AE_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS 0x0113 +#define I40IW_AE_AMP_MWBIND_VALID_STAG 0x0114 +#define I40IW_AE_AMP_MWBIND_OF_MR_STAG 0x0115 +#define I40IW_AE_AMP_MWBIND_TO_ZERO_BASED_STAG 0x0116 +#define I40IW_AE_AMP_MWBIND_TO_MW_STAG 0x0117 +#define I40IW_AE_AMP_MWBIND_INVALID_RIGHTS 0x0118 +#define I40IW_AE_AMP_MWBIND_INVALID_BOUNDS 0x0119 +#define I40IW_AE_AMP_MWBIND_TO_INVALID_PARENT 0x011a +#define I40IW_AE_AMP_MWBIND_BIND_DISABLED 0x011b +#define I40IW_AE_UDA_XMIT_DGRAM_TOO_LONG 0x0132 +#define I40IW_AE_UDA_XMIT_DGRAM_TOO_SHORT 0x0134 +#define I40IW_AE_BAD_CLOSE 0x0201 +#define I40IW_AE_RDMAP_ROE_BAD_LLP_CLOSE 0x0202 +#define I40IW_AE_CQ_OPERATION_ERROR 0x0203 +#define I40IW_AE_PRIV_OPERATION_DENIED 0x011c +#define I40IW_AE_RDMA_READ_WHILE_ORD_ZERO 0x0205 +#define I40IW_AE_STAG_ZERO_INVALID 0x0206 +#define I40IW_AE_IB_RREQ_AND_Q1_FULL 0x0207 +#define I40IW_AE_WQE_UNEXPECTED_OPCODE 0x020a +#define I40IW_AE_WQE_INVALID_PARAMETER 0x020b +#define I40IW_AE_WQE_LSMM_TOO_LONG 0x0220 +#define I40IW_AE_DDP_INVALID_MSN_GAP_IN_MSN 0x0301 +#define I40IW_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER 0x0303 +#define I40IW_AE_DDP_UBE_INVALID_DDP_VERSION 0x0304 +#define I40IW_AE_DDP_UBE_INVALID_MO 0x0305 +#define I40IW_AE_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE 0x0306 +#define I40IW_AE_DDP_UBE_INVALID_QN 0x0307 +#define I40IW_AE_DDP_NO_L_BIT 0x0308 +#define I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION 0x0311 +#define I40IW_AE_RDMAP_ROE_UNEXPECTED_OPCODE 0x0312 +#define I40IW_AE_ROE_INVALID_RDMA_READ_REQUEST 0x0313 +#define I40IW_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP 0x0314 +#define I40IW_AE_INVALID_ARP_ENTRY 0x0401 +#define I40IW_AE_INVALID_TCP_OPTION_RCVD 0x0402 +#define I40IW_AE_STALE_ARP_ENTRY 0x0403 +#define I40IW_AE_INVALID_MAC_ENTRY 0x0405 +#define I40IW_AE_LLP_CLOSE_COMPLETE 0x0501 +#define I40IW_AE_LLP_CONNECTION_RESET 0x0502 +#define I40IW_AE_LLP_FIN_RECEIVED 0x0503 +#define I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR 0x0505 +#define I40IW_AE_LLP_SEGMENT_TOO_LARGE 0x0506 +#define I40IW_AE_LLP_SEGMENT_TOO_SMALL 0x0507 +#define I40IW_AE_LLP_SYN_RECEIVED 0x0508 +#define I40IW_AE_LLP_TERMINATE_RECEIVED 0x0509 +#define I40IW_AE_LLP_TOO_MANY_RETRIES 0x050a +#define I40IW_AE_LLP_TOO_MANY_KEEPALIVE_RETRIES 0x050b +#define I40IW_AE_LLP_DOUBT_REACHABILITY 0x050c +#define I40IW_AE_LLP_RX_VLAN_MISMATCH 0x050d +#define I40IW_AE_RESOURCE_EXHAUSTION 0x0520 +#define I40IW_AE_RESET_SENT 0x0601 +#define I40IW_AE_TERMINATE_SENT 0x0602 +#define I40IW_AE_RESET_NOT_SENT 0x0603 +#define I40IW_AE_LCE_QP_CATASTROPHIC 0x0700 +#define 
I40IW_AE_LCE_FUNCTION_CATASTROPHIC 0x0701 +#define I40IW_AE_LCE_CQ_CATASTROPHIC 0x0702 +#define I40IW_AE_QP_SUSPEND_COMPLETE 0x0900 + +#define OP_DELETE_LOCAL_MAC_IPADDR_ENTRY 1 +#define OP_CEQ_DESTROY 2 +#define OP_AEQ_DESTROY 3 +#define OP_DELETE_ARP_CACHE_ENTRY 4 +#define OP_MANAGE_APBVT_ENTRY 5 +#define OP_CEQ_CREATE 6 +#define OP_AEQ_CREATE 7 +#define OP_ALLOC_LOCAL_MAC_IPADDR_ENTRY 8 +#define OP_ADD_LOCAL_MAC_IPADDR_ENTRY 9 +#define OP_MANAGE_QHASH_TABLE_ENTRY 10 +#define OP_QP_MODIFY 11 +#define OP_QP_UPLOAD_CONTEXT 12 +#define OP_CQ_CREATE 13 +#define OP_CQ_DESTROY 14 +#define OP_QP_CREATE 15 +#define OP_QP_DESTROY 16 +#define OP_ALLOC_STAG 17 +#define OP_MR_REG_NON_SHARED 18 +#define OP_DEALLOC_STAG 19 +#define OP_MW_ALLOC 20 +#define OP_QP_FLUSH_WQES 21 +#define OP_ADD_ARP_CACHE_ENTRY 22 +#define OP_MANAGE_PUSH_PAGE 23 +#define OP_UPDATE_PE_SDS 24 +#define OP_MANAGE_HMC_PM_FUNC_TABLE 25 +#define OP_SUSPEND 26 +#define OP_RESUME 27 +#define OP_MANAGE_VF_PBLE_BP 28 +#define OP_QUERY_FPM_VALUES 29 +#define OP_COMMIT_FPM_VALUES 30 +#define OP_REQUESTED_COMMANDS 31 +#define OP_COMPLETED_COMMANDS 32 +#define OP_GEN_AE 33 +#define OP_SIZE_CQP_STAT_ARRAY 34 + +#endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_hmc.c b/drivers/infiniband/hw/i40iw/i40iw_hmc.c new file mode 100644 index 0000000..5484cbf --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_hmc.c @@ -0,0 +1,821 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +*******************************************************************************/ + +#include "i40iw_osdep.h" +#include "i40iw_register.h" +#include "i40iw_status.h" +#include "i40iw_hmc.h" +#include "i40iw_d.h" +#include "i40iw_type.h" +#include "i40iw_p.h" +#include "i40iw_vf.h" +#include "i40iw_virtchnl.h" + +/** + * i40iw_find_sd_index_limit - finds segment descriptor index limit + * @hmc_info: pointer to the HMC configuration information structure + * @type: type of HMC resources we're searching + * @index: starting index for the object + * @cnt: number of objects we're trying to create + * @sd_idx: pointer to return index of the segment descriptor in question + * @sd_limit: pointer to return the maximum number of segment descriptors + * + * This function calculates the segment descriptor index and index limit + * for the resource defined by i40iw_hmc_rsrc_type. + */ + +static inline void i40iw_find_sd_index_limit(struct i40iw_hmc_info *hmc_info, + u32 type, + u32 idx, + u32 cnt, + u32 *sd_idx, + u32 *sd_limit) +{ + u64 fpm_addr, fpm_limit; + + fpm_addr = hmc_info->hmc_obj[(type)].base + + hmc_info->hmc_obj[type].size * idx; + fpm_limit = fpm_addr + hmc_info->hmc_obj[type].size * cnt; + *sd_idx = (u32)(fpm_addr / I40IW_HMC_DIRECT_BP_SIZE); + *sd_limit = (u32)((fpm_limit - 1) / I40IW_HMC_DIRECT_BP_SIZE); + *sd_limit += 1; +} + +/** + * i40iw_find_pd_index_limit - finds page descriptor index limit + * @hmc_info: pointer to the HMC configuration information struct + * @type: HMC resource type we're examining + * @idx: starting index for the object + * @cnt: number of objects we're trying to create + * @pd_index: pointer to return page descriptor index + * @pd_limit: pointer to return page descriptor index limit + * + * Calculates the page descriptor index and index limit for the resource + * defined by i40iw_hmc_rsrc_type. + */ + +static inline void i40iw_find_pd_index_limit(struct i40iw_hmc_info *hmc_info, + u32 type, + u32 idx, + u32 cnt, + u32 *pd_idx, + u32 *pd_limit) +{ + u64 fpm_adr, fpm_limit; + + fpm_adr = hmc_info->hmc_obj[type].base + + hmc_info->hmc_obj[type].size * idx; + fpm_limit = fpm_adr + (hmc_info)->hmc_obj[(type)].size * (cnt); + *(pd_idx) = (u32)(fpm_adr / I40IW_HMC_PAGED_BP_SIZE); + *(pd_limit) = (u32)((fpm_limit - 1) / I40IW_HMC_PAGED_BP_SIZE); + *(pd_limit) += 1; +} + +/** + * i40iw_set_sd_entry - setup entry for sd programming + * @pa: physical addr + * @idx: sd index + * @type: paged or direct sd + * @entry: sd entry ptr + */ +static inline void i40iw_set_sd_entry(u64 pa, + u32 idx, + enum i40iw_sd_entry_type type, + struct update_sd_entry *entry) +{ + entry->data = pa | (I40IW_HMC_MAX_BP_COUNT << I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) | + (((type == I40IW_SD_TYPE_PAGED) ? 0 : 1) << + I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT) | + (1 << I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT); + entry->cmd = (idx | (1 << I40E_PFHMC_SDCMD_PMSDWR_SHIFT) | (1 << 15)); +} + +/** + * i40iw_clr_sd_entry - setup entry for sd clear + * @idx: sd index + * @type: paged or direct sd + * @entry: sd entry ptr + */ +static inline void i40iw_clr_sd_entry(u32 idx, enum i40iw_sd_entry_type type, + struct update_sd_entry *entry) +{ + entry->data = (I40IW_HMC_MAX_BP_COUNT << + I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) | + (((type == I40IW_SD_TYPE_PAGED) ? 
0 : 1) << + I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT); + entry->cmd = (idx | (1 << I40E_PFHMC_SDCMD_PMSDWR_SHIFT) | (1 << 15)); +} + +/** + * i40iw_hmc_sd_one - setup 1 sd entry for cqp + * @dev: pointer to the device structure + * @hmc_fn_id: hmc's function id + * @pa: physical addr + * @sd_idx: sd index + * @type: paged or direct sd + * @setsd: flag to set or clear sd + */ +enum i40iw_status_code i40iw_hmc_sd_one(struct i40iw_sc_dev *dev, + u8 hmc_fn_id, + u64 pa, u32 sd_idx, + enum i40iw_sd_entry_type type, + bool setsd) +{ + struct i40iw_update_sds_info sdinfo; + + sdinfo.cnt = 1; + sdinfo.hmc_fn_id = hmc_fn_id; + if (setsd) + i40iw_set_sd_entry(pa, sd_idx, type, sdinfo.entry); + else + i40iw_clr_sd_entry(sd_idx, type, sdinfo.entry); + + return dev->cqp->process_cqp_sds(dev, &sdinfo); +} + +/** + * i40iw_hmc_sd_grp - setup group od sd entries for cqp + * @dev: pointer to the device structure + * @hmc_info: pointer to the HMC configuration information struct + * @sd_index: sd index + * @sd_cnt: number of sd entries + * @setsd: flag to set or clear sd + */ +static enum i40iw_status_code i40iw_hmc_sd_grp(struct i40iw_sc_dev *dev, + struct i40iw_hmc_info *hmc_info, + u32 sd_index, + u32 sd_cnt, + bool setsd) +{ + struct i40iw_hmc_sd_entry *sd_entry; + struct i40iw_update_sds_info sdinfo; + u64 pa; + u32 i; + enum i40iw_status_code ret_code = 0; + + memset(&sdinfo, 0, sizeof(sdinfo)); + sdinfo.hmc_fn_id = hmc_info->hmc_fn_id; + for (i = sd_index; i < sd_index + sd_cnt; i++) { + sd_entry = &hmc_info->sd_table.sd_entry[i]; + if (!sd_entry || + (!sd_entry->valid && setsd) || + (sd_entry->valid && !setsd)) + continue; + if (setsd) { + pa = (sd_entry->entry_type == I40IW_SD_TYPE_PAGED) ? + sd_entry->u.pd_table.pd_page_addr.pa : + sd_entry->u.bp.addr.pa; + i40iw_set_sd_entry(pa, i, sd_entry->entry_type, + &sdinfo.entry[sdinfo.cnt]); + } else { + i40iw_clr_sd_entry(i, sd_entry->entry_type, + &sdinfo.entry[sdinfo.cnt]); + } + sdinfo.cnt++; + if (sdinfo.cnt == I40IW_MAX_SD_ENTRIES) { + ret_code = dev->cqp->process_cqp_sds(dev, &sdinfo); + if (ret_code) { + i40iw_debug(dev, I40IW_DEBUG_HMC, + "i40iw_hmc_sd_grp: sd_programming failed err=%d\n", + ret_code); + return ret_code; + } + sdinfo.cnt = 0; + } + } + if (sdinfo.cnt) + ret_code = dev->cqp->process_cqp_sds(dev, &sdinfo); + + return ret_code; +} + +/** + * i40iw_vfdev_from_fpm - return vf dev ptr for hmc function id + * @dev: pointer to the device structure + * @hmc_fn_id: hmc's function id + */ +struct i40iw_vfdev *i40iw_vfdev_from_fpm(struct i40iw_sc_dev *dev, u8 hmc_fn_id) +{ + struct i40iw_vfdev *vf_dev = NULL; + u16 idx; + + for (idx = 0; idx < I40IW_MAX_PE_ENABLED_VF_COUNT; idx++) { + if (dev->vf_dev[idx] && + ((u8)dev->vf_dev[idx]->pmf_index == hmc_fn_id)) { + vf_dev = dev->vf_dev[idx]; + break; + } + } + return vf_dev; +} + +/** + * i40iw_vf_hmcinfo_from_fpm - get ptr to hmc for func_id + * @dev: pointer to the device structure + * @hmc_fn_id: hmc's function id + */ +struct i40iw_hmc_info *i40iw_vf_hmcinfo_from_fpm(struct i40iw_sc_dev *dev, + u8 hmc_fn_id) +{ + struct i40iw_hmc_info *hmc_info = NULL; + u16 idx; + + for (idx = 0; idx < I40IW_MAX_PE_ENABLED_VF_COUNT; idx++) { + if (dev->vf_dev[idx] && + ((u8)dev->vf_dev[idx]->pmf_index == hmc_fn_id)) { + hmc_info = &dev->vf_dev[idx]->hmc_info; + break; + } + } + return hmc_info; +} + +/** + * i40iw_hmc_finish_add_sd_reg - program sd entries for objects + * @dev: pointer to the device structure + * @info: create obj info + */ +static enum i40iw_status_code i40iw_hmc_finish_add_sd_reg(struct 
i40iw_sc_dev *dev, + struct i40iw_hmc_create_obj_info *info) +{ + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) + return I40IW_ERR_INVALID_HMC_OBJ_INDEX; + + if ((info->start_idx + info->count) > + info->hmc_info->hmc_obj[info->rsrc_type].cnt) + return I40IW_ERR_INVALID_HMC_OBJ_COUNT; + + if (!info->add_sd_cnt) + return 0; + + return i40iw_hmc_sd_grp(dev, info->hmc_info, + info->hmc_info->sd_indexes[0], + info->add_sd_cnt, true); +} + +/** + * i40iw_create_iw_hmc_obj - allocate backing store for hmc objects + * @dev: pointer to the device structure + * @info: pointer to i40iw_hmc_iw_create_obj_info struct + * + * This will allocate memory for PDs and backing pages and populate + * the sd and pd entries. + */ +enum i40iw_status_code i40iw_sc_create_hmc_obj(struct i40iw_sc_dev *dev, + struct i40iw_hmc_create_obj_info *info) +{ + struct i40iw_hmc_sd_entry *sd_entry; + u32 sd_idx, sd_lmt; + u32 pd_idx = 0, pd_lmt = 0; + u32 pd_idx1 = 0, pd_lmt1 = 0; + u32 i, j; + bool pd_error = false; + enum i40iw_status_code ret_code = 0; + + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) + return I40IW_ERR_INVALID_HMC_OBJ_INDEX; + + if ((info->start_idx + info->count) > + info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + i40iw_debug(dev, I40IW_DEBUG_HMC, + "%s: error type %u, start = %u, req cnt %u, cnt = %u\n", + __func__, info->rsrc_type, info->start_idx, info->count, + info->hmc_info->hmc_obj[info->rsrc_type].cnt); + return I40IW_ERR_INVALID_HMC_OBJ_COUNT; + } + + if (!dev->is_pf) + return i40iw_vchnl_vf_add_hmc_objs(dev, info->rsrc_type, 0, info->count); + + i40iw_find_sd_index_limit(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, + &sd_idx, &sd_lmt); + if (sd_idx >= info->hmc_info->sd_table.sd_cnt || + sd_lmt > info->hmc_info->sd_table.sd_cnt) { + return I40IW_ERR_INVALID_SD_INDEX; + } + i40iw_find_pd_index_limit(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, &pd_idx, &pd_lmt); + + for (j = sd_idx; j < sd_lmt; j++) { + ret_code = i40iw_add_sd_table_entry(dev->hw, info->hmc_info, + j, + info->entry_type, + I40IW_HMC_DIRECT_BP_SIZE); + if (ret_code) + goto exit_sd_error; + sd_entry = &info->hmc_info->sd_table.sd_entry[j]; + + if ((sd_entry->entry_type == I40IW_SD_TYPE_PAGED) && + ((dev->hmc_info == info->hmc_info) && + (info->rsrc_type != I40IW_HMC_IW_PBLE))) { + pd_idx1 = max(pd_idx, (j * I40IW_HMC_MAX_BP_COUNT)); + pd_lmt1 = min(pd_lmt, + (j + 1) * I40IW_HMC_MAX_BP_COUNT); + for (i = pd_idx1; i < pd_lmt1; i++) { + /* update the pd table entry */ + ret_code = i40iw_add_pd_table_entry(dev->hw, info->hmc_info, + i, NULL); + if (ret_code) { + pd_error = true; + break; + } + } + if (pd_error) { + while (i && (i > pd_idx1)) { + i40iw_remove_pd_bp(dev->hw, info->hmc_info, (i - 1), + info->is_pf); + i--; + } + } + } + if (sd_entry->valid) + continue; + + info->hmc_info->sd_indexes[info->add_sd_cnt] = (u16)j; + info->add_sd_cnt++; + sd_entry->valid = true; + } + return i40iw_hmc_finish_add_sd_reg(dev, info); + +exit_sd_error: + while (j && (j > sd_idx)) { + sd_entry = &info->hmc_info->sd_table.sd_entry[j - 1]; + switch (sd_entry->entry_type) { + case I40IW_SD_TYPE_PAGED: + pd_idx1 = max(pd_idx, + (j - 1) * I40IW_HMC_MAX_BP_COUNT); + pd_lmt1 = min(pd_lmt, (j * I40IW_HMC_MAX_BP_COUNT)); + for (i = pd_idx1; i < pd_lmt1; i++) + i40iw_prep_remove_pd_page(info->hmc_info, i); + break; + case I40IW_SD_TYPE_DIRECT: + i40iw_prep_remove_pd_page(info->hmc_info, (j - 1)); + break; + default: + ret_code = I40IW_ERR_INVALID_SD_TYPE; + break; + } + 
j--; + } + + return ret_code; +} + +/** + * i40iw_finish_del_sd_reg - delete sd entries for objects + * @dev: pointer to the device structure + * @info: dele obj info + * @reset: true if called before reset + */ +static enum i40iw_status_code i40iw_finish_del_sd_reg(struct i40iw_sc_dev *dev, + struct i40iw_hmc_del_obj_info *info, + bool reset) +{ + struct i40iw_hmc_sd_entry *sd_entry; + enum i40iw_status_code ret_code = 0; + u32 i, sd_idx; + struct i40iw_dma_mem *mem; + + if (dev->is_pf && !reset) + ret_code = i40iw_hmc_sd_grp(dev, info->hmc_info, + info->hmc_info->sd_indexes[0], + info->del_sd_cnt, false); + + if (ret_code) + i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: error cqp sd sd_grp\n", __func__); + + for (i = 0; i < info->del_sd_cnt; i++) { + sd_idx = info->hmc_info->sd_indexes[i]; + sd_entry = &info->hmc_info->sd_table.sd_entry[sd_idx]; + if (!sd_entry) + continue; + mem = (sd_entry->entry_type == I40IW_SD_TYPE_PAGED) ? + &sd_entry->u.pd_table.pd_page_addr : + &sd_entry->u.bp.addr; + + if (!mem || !mem->va) + i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: error cqp sd mem\n", __func__); + else + i40iw_free_dma_mem(dev->hw, mem); + } + return ret_code; +} + +/** + * i40iw_del_iw_hmc_obj - remove pe hmc objects + * @dev: pointer to the device structure + * @info: pointer to i40iw_hmc_del_obj_info struct + * @reset: true if called before reset + * + * This will de-populate the SDs and PDs. It frees + * the memory for PDS and backing storage. After this function is returned, + * caller should deallocate memory allocated previously for + * book-keeping information about PDs and backing storage. + */ +enum i40iw_status_code i40iw_sc_del_hmc_obj(struct i40iw_sc_dev *dev, + struct i40iw_hmc_del_obj_info *info, + bool reset) +{ + struct i40iw_hmc_pd_table *pd_table; + u32 sd_idx, sd_lmt; + u32 pd_idx, pd_lmt, rel_pd_idx; + u32 i, j; + enum i40iw_status_code ret_code = 0; + + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + i40iw_debug(dev, I40IW_DEBUG_HMC, + "%s: error start_idx[%04d] >= [type %04d].cnt[%04d]\n", + __func__, info->start_idx, info->rsrc_type, + info->hmc_info->hmc_obj[info->rsrc_type].cnt); + return I40IW_ERR_INVALID_HMC_OBJ_INDEX; + } + + if ((info->start_idx + info->count) > + info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + i40iw_debug(dev, I40IW_DEBUG_HMC, + "%s: error start_idx[%04d] + count %04d >= [type %04d].cnt[%04d]\n", + __func__, info->start_idx, info->count, + info->rsrc_type, + info->hmc_info->hmc_obj[info->rsrc_type].cnt); + return I40IW_ERR_INVALID_HMC_OBJ_COUNT; + } + if (!dev->is_pf) { + ret_code = i40iw_vchnl_vf_del_hmc_obj(dev, info->rsrc_type, 0, + info->count); + if (info->rsrc_type != I40IW_HMC_IW_PBLE) + return ret_code; + } + + i40iw_find_pd_index_limit(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, &pd_idx, &pd_lmt); + + for (j = pd_idx; j < pd_lmt; j++) { + sd_idx = j / I40IW_HMC_PD_CNT_IN_SD; + + if (info->hmc_info->sd_table.sd_entry[sd_idx].entry_type != + I40IW_SD_TYPE_PAGED) + continue; + + rel_pd_idx = j % I40IW_HMC_PD_CNT_IN_SD; + pd_table = &info->hmc_info->sd_table.sd_entry[sd_idx].u.pd_table; + if (pd_table->pd_entry[rel_pd_idx].valid) { + ret_code = i40iw_remove_pd_bp(dev->hw, info->hmc_info, j, + info->is_pf); + if (ret_code) { + i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: error\n", __func__); + return ret_code; + } + } + } + + i40iw_find_sd_index_limit(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, &sd_idx, &sd_lmt); + if (sd_idx >= info->hmc_info->sd_table.sd_cnt || + sd_lmt > 
info->hmc_info->sd_table.sd_cnt) { + i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: error invalid sd_idx\n", __func__); + return I40IW_ERR_INVALID_SD_INDEX; + } + + for (i = sd_idx; i < sd_lmt; i++) { + if (!info->hmc_info->sd_table.sd_entry[i].valid) + continue; + switch (info->hmc_info->sd_table.sd_entry[i].entry_type) { + case I40IW_SD_TYPE_DIRECT: + ret_code = i40iw_prep_remove_sd_bp(info->hmc_info, i); + if (!ret_code) { + info->hmc_info->sd_indexes[info->del_sd_cnt] = (u16)i; + info->del_sd_cnt++; + } + break; + case I40IW_SD_TYPE_PAGED: + ret_code = i40iw_prep_remove_pd_page(info->hmc_info, i); + if (!ret_code) { + info->hmc_info->sd_indexes[info->del_sd_cnt] = (u16)i; + info->del_sd_cnt++; + } + break; + default: + break; + } + } + return i40iw_finish_del_sd_reg(dev, info, reset); +} + +/** + * i40iw_add_sd_table_entry - Adds a segment descriptor to the table + * @hw: pointer to our hw struct + * @hmc_info: pointer to the HMC configuration information struct + * @sd_index: segment descriptor index to manipulate + * @type: what type of segment descriptor we're manipulating + * @direct_mode_sz: size to alloc in direct mode + */ +enum i40iw_status_code i40iw_add_sd_table_entry(struct i40iw_hw *hw, + struct i40iw_hmc_info *hmc_info, + u32 sd_index, + enum i40iw_sd_entry_type type, + u64 direct_mode_sz) +{ + enum i40iw_status_code ret_code = 0; + struct i40iw_hmc_sd_entry *sd_entry; + bool dma_mem_alloc_done = false; + struct i40iw_dma_mem mem; + u64 alloc_len; + + sd_entry = &hmc_info->sd_table.sd_entry[sd_index]; + if (!sd_entry->valid) { + if (type == I40IW_SD_TYPE_PAGED) + alloc_len = I40IW_HMC_PAGED_BP_SIZE; + else + alloc_len = direct_mode_sz; + + /* allocate a 4K pd page or 2M backing page */ + ret_code = i40iw_allocate_dma_mem(hw, &mem, alloc_len, + I40IW_HMC_PD_BP_BUF_ALIGNMENT); + if (ret_code) + goto exit; + dma_mem_alloc_done = true; + if (type == I40IW_SD_TYPE_PAGED) { + ret_code = i40iw_allocate_virt_mem(hw, + &sd_entry->u.pd_table.pd_entry_virt_mem, + sizeof(struct i40iw_hmc_pd_entry) * 512); + if (ret_code) + goto exit; + sd_entry->u.pd_table.pd_entry = (struct i40iw_hmc_pd_entry *) + sd_entry->u.pd_table.pd_entry_virt_mem.va; + + memcpy(&sd_entry->u.pd_table.pd_page_addr, &mem, sizeof(struct i40iw_dma_mem)); + } else { + memcpy(&sd_entry->u.bp.addr, &mem, sizeof(struct i40iw_dma_mem)); + sd_entry->u.bp.sd_pd_index = sd_index; + } + + hmc_info->sd_table.sd_entry[sd_index].entry_type = type; + + I40IW_INC_SD_REFCNT(&hmc_info->sd_table); + } + if (sd_entry->entry_type == I40IW_SD_TYPE_DIRECT) + I40IW_INC_BP_REFCNT(&sd_entry->u.bp); +exit: + if (ret_code) + if (dma_mem_alloc_done) + i40iw_free_dma_mem(hw, &mem); + + return ret_code; +} + +/** + * i40iw_add_pd_table_entry - Adds page descriptor to the specified table + * @hw: pointer to our HW structure + * @hmc_info: pointer to the HMC configuration information structure + * @pd_index: which page descriptor index to manipulate + * @rsrc_pg: if not NULL, use preallocated page instead of allocating new one. + * + * This function: + * 1. Initializes the pd entry + * 2. Adds pd_entry in the pd_table + * 3. Mark the entry valid in i40iw_hmc_pd_entry structure + * 4. Initializes the pd_entry's ref count to 1 + * assumptions: + * 1. The memory for pd should be pinned down, physically contiguous and + * aligned on 4K boundary and zeroed memory. + * 2. It should be 4K in size. 
+ */ +enum i40iw_status_code i40iw_add_pd_table_entry(struct i40iw_hw *hw, + struct i40iw_hmc_info *hmc_info, + u32 pd_index, + struct i40iw_dma_mem *rsrc_pg) +{ + enum i40iw_status_code ret_code = 0; + struct i40iw_hmc_pd_table *pd_table; + struct i40iw_hmc_pd_entry *pd_entry; + struct i40iw_dma_mem mem; + struct i40iw_dma_mem *page = &mem; + u32 sd_idx, rel_pd_idx; + u64 *pd_addr; + u64 page_desc; + + if (pd_index / I40IW_HMC_PD_CNT_IN_SD >= hmc_info->sd_table.sd_cnt) + return I40IW_ERR_INVALID_PAGE_DESC_INDEX; + + sd_idx = (pd_index / I40IW_HMC_PD_CNT_IN_SD); + if (hmc_info->sd_table.sd_entry[sd_idx].entry_type != I40IW_SD_TYPE_PAGED) + return 0; + + rel_pd_idx = (pd_index % I40IW_HMC_PD_CNT_IN_SD); + pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table; + pd_entry = &pd_table->pd_entry[rel_pd_idx]; + if (!pd_entry->valid) { + if (rsrc_pg) { + pd_entry->rsrc_pg = true; + page = rsrc_pg; + } else { + ret_code = i40iw_allocate_dma_mem(hw, page, + I40IW_HMC_PAGED_BP_SIZE, + I40IW_HMC_PD_BP_BUF_ALIGNMENT); + if (ret_code) + return ret_code; + pd_entry->rsrc_pg = false; + } + + memcpy(&pd_entry->bp.addr, page, sizeof(struct i40iw_dma_mem)); + pd_entry->bp.sd_pd_index = pd_index; + pd_entry->bp.entry_type = I40IW_SD_TYPE_PAGED; + page_desc = page->pa | 0x1; + + pd_addr = (u64 *)pd_table->pd_page_addr.va; + pd_addr += rel_pd_idx; + + memcpy(pd_addr, &page_desc, sizeof(*pd_addr)); + + pd_entry->sd_index = sd_idx; + pd_entry->valid = true; + I40IW_INC_PD_REFCNT(pd_table); + if (hmc_info->hmc_fn_id < I40IW_FIRST_VF_FPM_ID) + I40IW_INVALIDATE_PF_HMC_PD(hw, sd_idx, rel_pd_idx); + else if (hw->hmc.hmc_fn_id != hmc_info->hmc_fn_id) + I40IW_INVALIDATE_VF_HMC_PD(hw, sd_idx, rel_pd_idx, + hmc_info->hmc_fn_id); + } + I40IW_INC_BP_REFCNT(&pd_entry->bp); + + return 0; +} + +/** + * i40iw_remove_pd_bp - remove a backing page from a page descriptor + * @hw: pointer to our HW structure + * @hmc_info: pointer to the HMC configuration information structure + * @idx: the page index + * @is_pf: distinguishes a VF from a PF + * + * This function: + * 1. Marks the entry in pd table (for paged address mode) or in sd table + * (for direct address mode) invalid. + * 2. Write to register PMPDINV to invalidate the backing page in FV cache + * 3. Decrement the ref count for the pd _entry + * assumptions: + * 1. Caller can deallocate the memory used by backing storage after this + * function returns. 
+ */ +enum i40iw_status_code i40iw_remove_pd_bp(struct i40iw_hw *hw, + struct i40iw_hmc_info *hmc_info, + u32 idx, + bool is_pf) +{ + struct i40iw_hmc_pd_entry *pd_entry; + struct i40iw_hmc_pd_table *pd_table; + struct i40iw_hmc_sd_entry *sd_entry; + u32 sd_idx, rel_pd_idx; + struct i40iw_dma_mem *mem; + u64 *pd_addr; + + sd_idx = idx / I40IW_HMC_PD_CNT_IN_SD; + rel_pd_idx = idx % I40IW_HMC_PD_CNT_IN_SD; + if (sd_idx >= hmc_info->sd_table.sd_cnt) + return I40IW_ERR_INVALID_PAGE_DESC_INDEX; + + sd_entry = &hmc_info->sd_table.sd_entry[sd_idx]; + if (sd_entry->entry_type != I40IW_SD_TYPE_PAGED) + return I40IW_ERR_INVALID_SD_TYPE; + + pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table; + pd_entry = &pd_table->pd_entry[rel_pd_idx]; + I40IW_DEC_BP_REFCNT(&pd_entry->bp); + if (pd_entry->bp.ref_cnt) + return 0; + + pd_entry->valid = false; + I40IW_DEC_PD_REFCNT(pd_table); + pd_addr = (u64 *)pd_table->pd_page_addr.va; + pd_addr += rel_pd_idx; + memset(pd_addr, 0, sizeof(u64)); + if (is_pf) + I40IW_INVALIDATE_PF_HMC_PD(hw, sd_idx, idx); + else + I40IW_INVALIDATE_VF_HMC_PD(hw, sd_idx, idx, + hmc_info->hmc_fn_id); + + if (!pd_entry->rsrc_pg) { + mem = &pd_entry->bp.addr; + if (!mem || !mem->va) + return I40IW_ERR_PARAM; + i40iw_free_dma_mem(hw, mem); + } + if (!pd_table->ref_cnt) + i40iw_free_virt_mem(hw, &pd_table->pd_entry_virt_mem); + + return 0; +} + +/** + * i40iw_prep_remove_sd_bp - Prepares to remove a backing page from a sd entry + * @hmc_info: pointer to the HMC configuration information structure + * @idx: the page index + */ +enum i40iw_status_code i40iw_prep_remove_sd_bp(struct i40iw_hmc_info *hmc_info, u32 idx) +{ + struct i40iw_hmc_sd_entry *sd_entry; + + sd_entry = &hmc_info->sd_table.sd_entry[idx]; + I40IW_DEC_BP_REFCNT(&sd_entry->u.bp); + if (sd_entry->u.bp.ref_cnt) + return I40IW_ERR_NOT_READY; + + I40IW_DEC_SD_REFCNT(&hmc_info->sd_table); + sd_entry->valid = false; + + return 0; +} + +/** + * i40iw_prep_remove_pd_page - Prepares to remove a PD page from sd entry. + * @hmc_info: pointer to the HMC configuration information structure + * @idx: segment descriptor index to find the relevant page descriptor + */ +enum i40iw_status_code i40iw_prep_remove_pd_page(struct i40iw_hmc_info *hmc_info, + u32 idx) +{ + struct i40iw_hmc_sd_entry *sd_entry; + + sd_entry = &hmc_info->sd_table.sd_entry[idx]; + + if (sd_entry->u.pd_table.ref_cnt) + return I40IW_ERR_NOT_READY; + + sd_entry->valid = false; + I40IW_DEC_SD_REFCNT(&hmc_info->sd_table); + + return 0; +} + +/** + * i40iw_pf_init_vfhmc - + * @vf_cnt_array: array of cnt values of iwarp hmc objects + * @vf_hmc_fn_id: hmc function id ofr vf driver + * @dev: pointer to i40iw_dev struct + * + * Called by pf driver to initialize hmc_info for vf driver instance. 
+ */ +enum i40iw_status_code i40iw_pf_init_vfhmc(struct i40iw_sc_dev *dev, + u8 vf_hmc_fn_id, + u32 *vf_cnt_array) +{ + struct i40iw_hmc_info *hmc_info; + enum i40iw_status_code ret_code = 0; + u32 i; + + if ((vf_hmc_fn_id < I40IW_FIRST_VF_FPM_ID) || + (vf_hmc_fn_id >= I40IW_FIRST_VF_FPM_ID + + I40IW_MAX_PE_ENABLED_VF_COUNT)) { + i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: invalid vf_hmc_fn_id 0x%x\n", + __func__, vf_hmc_fn_id); + return I40IW_ERR_INVALID_HMCFN_ID; + } + + ret_code = i40iw_sc_init_iw_hmc(dev, vf_hmc_fn_id); + if (ret_code) + return ret_code; + + hmc_info = i40iw_vf_hmcinfo_from_fpm(dev, vf_hmc_fn_id); + + for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) + if (vf_cnt_array) + hmc_info->hmc_obj[i].cnt = + vf_cnt_array[i - I40IW_HMC_IW_QP]; + else + hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt; + + return 0; +} diff --git a/drivers/infiniband/hw/i40iw/i40iw_hmc.h b/drivers/infiniband/hw/i40iw/i40iw_hmc.h new file mode 100644 index 0000000..4c3fdd8 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_hmc.h @@ -0,0 +1,241 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +*******************************************************************************/ + +#ifndef I40IW_HMC_H +#define I40IW_HMC_H + +#include "i40iw_d.h" + +struct i40iw_hw; +enum i40iw_status_code; + +#define I40IW_HMC_MAX_BP_COUNT 512 +#define I40IW_MAX_SD_ENTRIES 11 +#define I40IW_HW_DBG_HMC_INVALID_BP_MARK 0xCA + +#define I40IW_HMC_INFO_SIGNATURE 0x484D5347 +#define I40IW_HMC_PD_CNT_IN_SD 512 +#define I40IW_HMC_DIRECT_BP_SIZE 0x200000 +#define I40IW_HMC_MAX_SD_COUNT 4096 +#define I40IW_HMC_PAGED_BP_SIZE 4096 +#define I40IW_HMC_PD_BP_BUF_ALIGNMENT 4096 +#define I40IW_FIRST_VF_FPM_ID 16 +#define FPM_MULTIPLIER 1024 + +#define I40IW_INC_SD_REFCNT(sd_table) ((sd_table)->ref_cnt++) +#define I40IW_INC_PD_REFCNT(pd_table) ((pd_table)->ref_cnt++) +#define I40IW_INC_BP_REFCNT(bp) ((bp)->ref_cnt++) + +#define I40IW_DEC_SD_REFCNT(sd_table) ((sd_table)->ref_cnt--) +#define I40IW_DEC_PD_REFCNT(pd_table) ((pd_table)->ref_cnt--) +#define I40IW_DEC_BP_REFCNT(bp) ((bp)->ref_cnt--) + +/** + * I40IW_INVALIDATE_PF_HMC_PD - Invalidates the pd cache in the hardware + * @hw: pointer to our hw struct + * @sd_idx: segment descriptor index + * @pd_idx: page descriptor index + */ +#define I40IW_INVALIDATE_PF_HMC_PD(hw, sd_idx, pd_idx) \ + i40iw_wr32((hw), I40E_PFHMC_PDINV, \ + (((sd_idx) << I40E_PFHMC_PDINV_PMSDIDX_SHIFT) | \ + (0x1 << I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT) | \ + ((pd_idx) << I40E_PFHMC_PDINV_PMPDIDX_SHIFT))) + +/** + * I40IW_INVALIDATE_VF_HMC_PD - Invalidates the pd cache in the hardware + * @hw: pointer to our hw struct + * @sd_idx: segment descriptor index + * @pd_idx: page descriptor index + * @hmc_fn_id: VF's function id + */ +#define I40IW_INVALIDATE_VF_HMC_PD(hw, sd_idx, pd_idx, hmc_fn_id) \ + i40iw_wr32(hw, I40E_GLHMC_VFPDINV(hmc_fn_id - I40IW_FIRST_VF_FPM_ID), \ + ((sd_idx << I40E_PFHMC_PDINV_PMSDIDX_SHIFT) | \ + (pd_idx << I40E_PFHMC_PDINV_PMPDIDX_SHIFT))) + +struct i40iw_hmc_obj_info { + u64 base; + u32 max_cnt; + u32 cnt; + u64 size; +}; + +enum i40iw_sd_entry_type { + I40IW_SD_TYPE_INVALID = 0, + I40IW_SD_TYPE_PAGED = 1, + I40IW_SD_TYPE_DIRECT = 2 +}; + +struct i40iw_hmc_bp { + enum i40iw_sd_entry_type entry_type; + struct i40iw_dma_mem addr; + u32 sd_pd_index; + u32 ref_cnt; +}; + +struct i40iw_hmc_pd_entry { + struct i40iw_hmc_bp bp; + u32 sd_index; + bool rsrc_pg; + bool valid; +}; + +struct i40iw_hmc_pd_table { + struct i40iw_dma_mem pd_page_addr; + struct i40iw_hmc_pd_entry *pd_entry; + struct i40iw_virt_mem pd_entry_virt_mem; + u32 ref_cnt; + u32 sd_index; +}; + +struct i40iw_hmc_sd_entry { + enum i40iw_sd_entry_type entry_type; + bool valid; + + union { + struct i40iw_hmc_pd_table pd_table; + struct i40iw_hmc_bp bp; + } u; +}; + +struct i40iw_hmc_sd_table { + struct i40iw_virt_mem addr; + u32 sd_cnt; + u32 ref_cnt; + struct i40iw_hmc_sd_entry *sd_entry; +}; + +struct i40iw_hmc_info { + u32 signature; + u8 hmc_fn_id; + u16 first_sd_index; + + struct i40iw_hmc_obj_info *hmc_obj; + struct i40iw_virt_mem hmc_obj_virt_mem; + struct i40iw_hmc_sd_table sd_table; + u16 sd_indexes[I40IW_HMC_MAX_SD_COUNT]; +}; + +struct update_sd_entry { + u64 cmd; + u64 data; +}; + +struct i40iw_update_sds_info { + u32 cnt; + u8 hmc_fn_id; + struct update_sd_entry entry[I40IW_MAX_SD_ENTRIES]; +}; + +struct i40iw_ccq_cqe_info; +struct i40iw_hmc_fcn_info { + void (*callback_fcn)(struct i40iw_sc_dev *, void *, + struct i40iw_ccq_cqe_info *); + void *cqp_callback_param; + u32 vf_id; + u16 iw_vf_idx; + bool free_fcn; +}; + +enum i40iw_hmc_rsrc_type { + I40IW_HMC_IW_QP = 0, + I40IW_HMC_IW_CQ = 1, + 
I40IW_HMC_IW_SRQ = 2, + I40IW_HMC_IW_HTE = 3, + I40IW_HMC_IW_ARP = 4, + I40IW_HMC_IW_APBVT_ENTRY = 5, + I40IW_HMC_IW_MR = 6, + I40IW_HMC_IW_XF = 7, + I40IW_HMC_IW_XFFL = 8, + I40IW_HMC_IW_Q1 = 9, + I40IW_HMC_IW_Q1FL = 10, + I40IW_HMC_IW_TIMER = 11, + I40IW_HMC_IW_FSIMC = 12, + I40IW_HMC_IW_FSIAV = 13, + I40IW_HMC_IW_PBLE = 14, + I40IW_HMC_IW_MAX = 15, +}; + +struct i40iw_hmc_create_obj_info { + struct i40iw_hmc_info *hmc_info; + struct i40iw_virt_mem add_sd_virt_mem; + u32 rsrc_type; + u32 start_idx; + u32 count; + u32 add_sd_cnt; + enum i40iw_sd_entry_type entry_type; + bool is_pf; +}; + +struct i40iw_hmc_del_obj_info { + struct i40iw_hmc_info *hmc_info; + struct i40iw_virt_mem del_sd_virt_mem; + u32 rsrc_type; + u32 start_idx; + u32 count; + u32 del_sd_cnt; + bool is_pf; +}; + +enum i40iw_status_code i40iw_copy_dma_mem(struct i40iw_hw *hw, void *dest_buf, + struct i40iw_dma_mem *src_mem, u64 src_offset, u64 size); +enum i40iw_status_code i40iw_sc_create_hmc_obj(struct i40iw_sc_dev *dev, + struct i40iw_hmc_create_obj_info *info); +enum i40iw_status_code i40iw_sc_del_hmc_obj(struct i40iw_sc_dev *dev, + struct i40iw_hmc_del_obj_info *info, + bool reset); +enum i40iw_status_code i40iw_hmc_sd_one(struct i40iw_sc_dev *dev, u8 hmc_fn_id, + u64 pa, u32 sd_idx, enum i40iw_sd_entry_type type, + bool setsd); +enum i40iw_status_code i40iw_update_sds_noccq(struct i40iw_sc_dev *dev, + struct i40iw_update_sds_info *info); +struct i40iw_vfdev *i40iw_vfdev_from_fpm(struct i40iw_sc_dev *dev, u8 hmc_fn_id); +struct i40iw_hmc_info *i40iw_vf_hmcinfo_from_fpm(struct i40iw_sc_dev *dev, + u8 hmc_fn_id); +enum i40iw_status_code i40iw_add_sd_table_entry(struct i40iw_hw *hw, + struct i40iw_hmc_info *hmc_info, u32 sd_index, + enum i40iw_sd_entry_type type, u64 direct_mode_sz); +enum i40iw_status_code i40iw_add_pd_table_entry(struct i40iw_hw *hw, + struct i40iw_hmc_info *hmc_info, u32 pd_index, + struct i40iw_dma_mem *rsrc_pg); +enum i40iw_status_code i40iw_remove_pd_bp(struct i40iw_hw *hw, + struct i40iw_hmc_info *hmc_info, u32 idx, bool is_pf); +enum i40iw_status_code i40iw_prep_remove_sd_bp(struct i40iw_hmc_info *hmc_info, u32 idx); +enum i40iw_status_code i40iw_prep_remove_pd_page(struct i40iw_hmc_info *hmc_info, u32 idx); + +#define ENTER_SHARED_FUNCTION() +#define EXIT_SHARED_FUNCTION() + +#endif /* I40IW_HMC_H */ diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c new file mode 100644 index 0000000..c9f62ca --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c @@ -0,0 +1,805 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/if_vlan.h> + +#include "i40iw.h" + +/** + * i40iw_initialize_hw_resources - initialize hw resource during open + * @iwdev: iwarp device + */ +u32 i40iw_initialize_hw_resources(struct i40iw_device *iwdev) +{ + unsigned long num_pds; + u32 resources_size; + u32 max_mr; + u32 max_qp; + u32 max_cq; + u32 arp_table_size; + u32 mrdrvbits; + void *resource_ptr; + + max_qp = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_QP].cnt; + max_cq = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt; + max_mr = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_MR].cnt; + arp_table_size = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_ARP].cnt; + iwdev->max_cqe = 0xFFFFF; + num_pds = I40IW_MAX_PDS; + resources_size = sizeof(struct i40iw_arp_entry) * arp_table_size; + resources_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp); + resources_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr); + resources_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq); + resources_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds); + resources_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size); + resources_size += sizeof(struct i40iw_qp **) * max_qp; + iwdev->mem_resources = kzalloc(resources_size, GFP_KERNEL); + + if (!iwdev->mem_resources) + return -ENOMEM; + + iwdev->max_qp = max_qp; + iwdev->max_mr = max_mr; + iwdev->max_cq = max_cq; + iwdev->max_pd = num_pds; + iwdev->arp_table_size = arp_table_size; + iwdev->arp_table = (struct i40iw_arp_entry *)iwdev->mem_resources; + resource_ptr = iwdev->mem_resources + (sizeof(struct i40iw_arp_entry) * arp_table_size); + + iwdev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_MGT_EXTENSIONS; + + iwdev->allocated_qps = resource_ptr; + iwdev->allocated_cqs = &iwdev->allocated_qps[BITS_TO_LONGS(max_qp)]; + iwdev->allocated_mrs = &iwdev->allocated_cqs[BITS_TO_LONGS(max_cq)]; + iwdev->allocated_pds = &iwdev->allocated_mrs[BITS_TO_LONGS(max_mr)]; + iwdev->allocated_arps = &iwdev->allocated_pds[BITS_TO_LONGS(num_pds)]; + iwdev->qp_table = (struct i40iw_qp **)(&iwdev->allocated_arps[BITS_TO_LONGS(arp_table_size)]); + set_bit(0, iwdev->allocated_mrs); + set_bit(0, iwdev->allocated_qps); + set_bit(0, iwdev->allocated_cqs); + set_bit(0, iwdev->allocated_pds); + set_bit(0, iwdev->allocated_arps); + + /* Following for ILQ/IEQ */ + set_bit(1, iwdev->allocated_qps); + set_bit(1, iwdev->allocated_cqs); + set_bit(1, iwdev->allocated_pds); + set_bit(2, iwdev->allocated_cqs); + set_bit(2, iwdev->allocated_pds); + + spin_lock_init(&iwdev->resource_lock); + spin_lock_init(&iwdev->qptable_lock); + /* stag index mask has a minimum of 14 bits */ + mrdrvbits = 24 - max(get_count_order(iwdev->max_mr), 14); + iwdev->mr_stagmask = ~(((1 << mrdrvbits) - 1) << (32 - mrdrvbits)); + return 0; +} + +/** + * i40iw_cqp_ce_handler - handle cqp completions + * @iwdev: iwarp device + * @arm: flag to arm after completions + * @cq: cq for cqp
completions + */ +static void i40iw_cqp_ce_handler(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq, bool arm) +{ + struct i40iw_cqp_request *cqp_request; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + u32 cqe_count = 0; + struct i40iw_ccq_cqe_info info; + int ret; + + do { + memset(&info, 0, sizeof(info)); + ret = dev->ccq_ops->ccq_get_cqe_info(cq, &info); + if (ret) + break; + cqp_request = (struct i40iw_cqp_request *)(unsigned long)info.scratch; + if (info.error) + i40iw_pr_err("opcode = 0x%x maj_err_code = 0x%x min_err_code = 0x%x\n", + info.op_code, info.maj_err_code, info.min_err_code); + if (cqp_request) { + cqp_request->compl_info.maj_err_code = info.maj_err_code; + cqp_request->compl_info.min_err_code = info.min_err_code; + cqp_request->compl_info.op_ret_val = info.op_ret_val; + cqp_request->compl_info.error = info.error; + + if (cqp_request->waiting) { + cqp_request->request_done = true; + wake_up(&cqp_request->waitq); + i40iw_put_cqp_request(&iwdev->cqp, cqp_request); + } else { + if (cqp_request->callback_fcn) + cqp_request->callback_fcn(cqp_request, 1); + i40iw_put_cqp_request(&iwdev->cqp, cqp_request); + } + } + + cqe_count++; + } while (1); + + if (arm && cqe_count) { + i40iw_process_bh(dev); + dev->ccq_ops->ccq_arm(cq); + } +} + +/** + * i40iw_iwarp_ce_handler - handle iwarp completions + * @iwdev: iwarp device + * @iwcp: iwarp cq receiving event + */ +static void i40iw_iwarp_ce_handler(struct i40iw_device *iwdev, + struct i40iw_sc_cq *iwcq) +{ + struct i40iw_cq *i40iwcq = iwcq->back_cq; + + if (i40iwcq->ibcq.comp_handler) + i40iwcq->ibcq.comp_handler(&i40iwcq->ibcq, + i40iwcq->ibcq.cq_context); +} + +/** + * i40iw_puda_ce_handler - handle puda completion events + * @iwdev: iwarp device + * @cq: puda completion q for event + */ +static void i40iw_puda_ce_handler(struct i40iw_device *iwdev, + struct i40iw_sc_cq *cq) +{ + struct i40iw_sc_dev *dev = (struct i40iw_sc_dev *)&iwdev->sc_dev; + enum i40iw_status_code status; + u32 compl_error; + + do { + status = i40iw_puda_poll_completion(dev, cq, &compl_error); + if (status == I40IW_ERR_QUEUE_EMPTY) + break; + if (status) { + i40iw_pr_err("puda status = %d\n", status); + break; + } + if (compl_error) { + i40iw_pr_err("puda compl_err =0x%x\n", compl_error); + break; + } + } while (1); + + dev->ccq_ops->ccq_arm(cq); +} + +/** + * i40iw_process_ceq - handle ceq for completions + * @iwdev: iwarp device + * @ceq: ceq having cq for completion + */ +void i40iw_process_ceq(struct i40iw_device *iwdev, struct i40iw_ceq *ceq) +{ + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_sc_ceq *sc_ceq; + struct i40iw_sc_cq *cq; + bool arm = true; + + sc_ceq = &ceq->sc_ceq; + do { + cq = dev->ceq_ops->process_ceq(dev, sc_ceq); + if (!cq) + break; + + if (cq->cq_type == I40IW_CQ_TYPE_CQP) + i40iw_cqp_ce_handler(iwdev, cq, arm); + else if (cq->cq_type == I40IW_CQ_TYPE_IWARP) + i40iw_iwarp_ce_handler(iwdev, cq); + else if ((cq->cq_type == I40IW_CQ_TYPE_ILQ) || + (cq->cq_type == I40IW_CQ_TYPE_IEQ)) + i40iw_puda_ce_handler(iwdev, cq); + } while (1); +} + +/** + * i40iw_next_iw_state - modify qp state + * @iwqp: iwarp qp to modify + * @state: next state for qp + * @del_hash: del hash + * @term: term message + * @termlen: length of term message + */ +void i40iw_next_iw_state(struct i40iw_qp *iwqp, + u8 state, + u8 del_hash, + u8 term, + u8 termlen) +{ + struct i40iw_modify_qp_info info; + + memset(&info, 0, sizeof(info)); + info.next_iwarp_state = state; + info.remove_hash_idx = del_hash; + info.cq_num_valid = true; + info.arp_cache_idx_valid = 
true; + info.dont_send_term = true; + info.dont_send_fin = true; + info.termlen = termlen; + + if (term & I40IWQP_TERM_SEND_TERM_ONLY) + info.dont_send_term = false; + if (term & I40IWQP_TERM_SEND_FIN_ONLY) + info.dont_send_fin = false; + if (iwqp->sc_qp.term_flags && (state == I40IW_QP_STATE_ERROR)) + info.reset_tcp_conn = true; + iwqp->hw_iwarp_state = state; + i40iw_hw_modify_qp(iwqp->iwdev, iwqp, &info, 0); +} + +/** + * i40iw_process_aeq - handle aeq events + * @iwdev: iwarp device + */ +void i40iw_process_aeq(struct i40iw_device *iwdev) +{ + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_aeq *aeq = &iwdev->aeq; + struct i40iw_sc_aeq *sc_aeq = &aeq->sc_aeq; + struct i40iw_aeqe_info aeinfo; + struct i40iw_aeqe_info *info = &aeinfo; + int ret; + struct i40iw_qp *iwqp = NULL; + struct i40iw_sc_cq *cq = NULL; + struct i40iw_cq *iwcq = NULL; + struct i40iw_sc_qp *qp = NULL; + struct i40iw_qp_host_ctx_info *ctx_info = NULL; + unsigned long flags; + + u32 aeqcnt = 0; + + if (!sc_aeq->size) + return; + + do { + memset(info, 0, sizeof(*info)); + ret = dev->aeq_ops->get_next_aeqe(sc_aeq, info); + if (ret) + break; + + aeqcnt++; + i40iw_debug(dev, I40IW_DEBUG_AEQ, + "%s ae_id = 0x%x bool qp=%d qp_id = %d\n", + __func__, info->ae_id, info->qp, info->qp_cq_id); + if (info->qp) { + spin_lock_irqsave(&iwdev->qptable_lock, flags); + iwqp = iwdev->qp_table[info->qp_cq_id]; + if (!iwqp) { + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); + i40iw_debug(dev, I40IW_DEBUG_AEQ, + "%s qp_id %d is already freed\n", + __func__, info->qp_cq_id); + continue; + } + i40iw_add_ref(&iwqp->ibqp); + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); + qp = &iwqp->sc_qp; + spin_lock_irqsave(&iwqp->lock, flags); + iwqp->hw_tcp_state = info->tcp_state; + iwqp->hw_iwarp_state = info->iwarp_state; + iwqp->last_aeq = info->ae_id; + spin_unlock_irqrestore(&iwqp->lock, flags); + ctx_info = &iwqp->ctx_info; + ctx_info->err_rq_idx_valid = true; + } else { + if (info->ae_id != I40IW_AE_CQ_OPERATION_ERROR) + continue; + } + + switch (info->ae_id) { + case I40IW_AE_LLP_FIN_RECEIVED: + if (qp->term_flags) + break; + if (atomic_inc_return(&iwqp->close_timer_started) == 1) { + iwqp->hw_tcp_state = I40IW_TCP_STATE_CLOSE_WAIT; + if ((iwqp->hw_tcp_state == I40IW_TCP_STATE_CLOSE_WAIT) && + (iwqp->ibqp_state == IB_QPS_RTS)) { + i40iw_next_iw_state(iwqp, + I40IW_QP_STATE_CLOSING, 0, 0, 0); + i40iw_cm_disconn(iwqp); + } + iwqp->cm_id->add_ref(iwqp->cm_id); + i40iw_schedule_cm_timer(iwqp->cm_node, + (struct i40iw_puda_buf *)iwqp, + I40IW_TIMER_TYPE_CLOSE, 1, 0); + } + break; + case I40IW_AE_LLP_CLOSE_COMPLETE: + if (qp->term_flags) + i40iw_terminate_done(qp, 0); + else + i40iw_cm_disconn(iwqp); + break; + case I40IW_AE_BAD_CLOSE: + /* fall through */ + case I40IW_AE_RESET_SENT: + i40iw_next_iw_state(iwqp, I40IW_QP_STATE_ERROR, 1, 0, 0); + i40iw_cm_disconn(iwqp); + break; + case I40IW_AE_LLP_CONNECTION_RESET: + if (atomic_read(&iwqp->close_timer_started)) + break; + i40iw_cm_disconn(iwqp); + break; + case I40IW_AE_QP_SUSPEND_COMPLETE: + i40iw_qp_suspend_resume(dev, &iwqp->sc_qp, false); + break; + case I40IW_AE_TERMINATE_SENT: + i40iw_terminate_send_fin(qp); + break; + case I40IW_AE_LLP_TERMINATE_RECEIVED: + i40iw_terminate_received(qp, info); + break; + case I40IW_AE_CQ_OPERATION_ERROR: + i40iw_pr_err("Processing an iWARP related AE for CQ misc = 0x%04X\n", + info->ae_id); + cq = (struct i40iw_sc_cq *)(unsigned long)info->compl_ctx; + iwcq = (struct i40iw_cq *)cq->back_cq; + + if (iwcq->ibcq.event_handler) { + struct 
ib_event ibevent; + + ibevent.device = iwcq->ibcq.device; + ibevent.event = IB_EVENT_CQ_ERR; + ibevent.element.cq = &iwcq->ibcq; + iwcq->ibcq.event_handler(&ibevent, iwcq->ibcq.cq_context); + } + break; + case I40IW_AE_LLP_DOUBT_REACHABILITY: + break; + case I40IW_AE_PRIV_OPERATION_DENIED: + case I40IW_AE_STAG_ZERO_INVALID: + case I40IW_AE_IB_RREQ_AND_Q1_FULL: + case I40IW_AE_DDP_UBE_INVALID_DDP_VERSION: + case I40IW_AE_DDP_UBE_INVALID_MO: + case I40IW_AE_DDP_UBE_INVALID_QN: + case I40IW_AE_DDP_NO_L_BIT: + case I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION: + case I40IW_AE_RDMAP_ROE_UNEXPECTED_OPCODE: + case I40IW_AE_ROE_INVALID_RDMA_READ_REQUEST: + case I40IW_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: + case I40IW_AE_INVALID_ARP_ENTRY: + case I40IW_AE_INVALID_TCP_OPTION_RCVD: + case I40IW_AE_STALE_ARP_ENTRY: + case I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR: + case I40IW_AE_LLP_SEGMENT_TOO_SMALL: + case I40IW_AE_LLP_SYN_RECEIVED: + case I40IW_AE_LLP_TOO_MANY_RETRIES: + case I40IW_AE_LCE_QP_CATASTROPHIC: + case I40IW_AE_LCE_FUNCTION_CATASTROPHIC: + case I40IW_AE_LCE_CQ_CATASTROPHIC: + case I40IW_AE_UDA_XMIT_DGRAM_TOO_LONG: + case I40IW_AE_UDA_XMIT_DGRAM_TOO_SHORT: + ctx_info->err_rq_idx_valid = false; + /* fall through */ + default: + if (!info->sq && ctx_info->err_rq_idx_valid) { + ctx_info->err_rq_idx = info->wqe_idx; + ctx_info->tcp_info_valid = false; + ctx_info->iwarp_info_valid = false; + ret = dev->iw_priv_qp_ops->qp_setctx(&iwqp->sc_qp, + iwqp->host_ctx.va, + ctx_info); + } + i40iw_terminate_connection(qp, info); + break; + } + if (info->qp) + i40iw_rem_ref(&iwqp->ibqp); + } while (1); + + if (aeqcnt) + dev->aeq_ops->repost_aeq_entries(dev, aeqcnt); +} + +/** + * i40iw_manage_apbvt - add or delete tcp port + * @iwdev: iwarp device + * @accel_local_port: port for apbvt + * @add_port: add or delete port + */ +int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool add_port) +{ + struct i40iw_apbvt_info *info; + enum i40iw_status_code status; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, add_port); + if (!cqp_request) + return -ENOMEM; + + cqp_info = &cqp_request->info; + info = &cqp_info->in.u.manage_apbvt_entry.info; + + memset(info, 0, sizeof(*info)); + info->add = add_port; + info->port = cpu_to_le16(accel_local_port); + + cqp_info->cqp_cmd = OP_MANAGE_APBVT_ENTRY; + cqp_info->post_sq = 1; + cqp_info->in.u.manage_apbvt_entry.cqp = &iwdev->cqp.sc_cqp; + cqp_info->in.u.manage_apbvt_entry.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Manage APBVT entry fail"); + return status; +} + +/** + * i40iw_manage_arp_cache - manage hw arp cache + * @iwdev: iwarp device + * @mac_addr: mac address ptr + * @ip_addr: ip addr for arp cache + * @action: add, delete or modify + */ +void i40iw_manage_arp_cache(struct i40iw_device *iwdev, + unsigned char *mac_addr, + u32 *ip_addr, + bool ipv4, + u32 action) +{ + struct i40iw_add_arp_cache_entry_info *info; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + int arp_index; + + arp_index = i40iw_arp_table(iwdev, ip_addr, ipv4, mac_addr, action); + if (arp_index == -1) + return; + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + if (action == I40IW_ARP_ADD) { + cqp_info->cqp_cmd = OP_ADD_ARP_CACHE_ENTRY; + info = &cqp_info->in.u.add_arp_cache_entry.info; + memset(info, 0, 
sizeof(*info)); + info->arp_index = cpu_to_le16((u16)arp_index); + info->permanent = true; + ether_addr_copy(info->mac_addr, mac_addr); + cqp_info->in.u.add_arp_cache_entry.scratch = (uintptr_t)cqp_request; + cqp_info->in.u.add_arp_cache_entry.cqp = &iwdev->cqp.sc_cqp; + } else { + cqp_info->cqp_cmd = OP_DELETE_ARP_CACHE_ENTRY; + cqp_info->in.u.del_arp_cache_entry.scratch = (uintptr_t)cqp_request; + cqp_info->in.u.del_arp_cache_entry.cqp = &iwdev->cqp.sc_cqp; + cqp_info->in.u.del_arp_cache_entry.arp_index = arp_index; + } + + cqp_info->in.u.add_arp_cache_entry.cqp = &iwdev->cqp.sc_cqp; + cqp_info->in.u.add_arp_cache_entry.scratch = (uintptr_t)cqp_request; + cqp_info->post_sq = 1; + if (i40iw_handle_cqp_op(iwdev, cqp_request)) + i40iw_pr_err("CQP-OP Add/Del Arp Cache entry fail"); +} + +/** + * i40iw_send_syn_cqp_callback - do syn/ack after qhash + * @cqp_request: qhash cqp completion + * @send_ack: flag send ack + */ +static void i40iw_send_syn_cqp_callback(struct i40iw_cqp_request *cqp_request, u32 send_ack) +{ + i40iw_send_syn(cqp_request->param, send_ack); +} + +/** + * i40iw_manage_qhash - add or modify qhash + * @iwdev: iwarp device + * @cminfo: cm info for qhash + * @etype: type (syn or quad) + * @mtype: type of qhash + * @cmnode: cmnode associated with connection + * @wait: wait for completion + * @user_pri:user pri of the connection + */ +enum i40iw_status_code i40iw_manage_qhash(struct i40iw_device *iwdev, + struct i40iw_cm_info *cminfo, + enum i40iw_quad_entry_type etype, + enum i40iw_quad_hash_manage_type mtype, + void *cmnode, + bool wait) +{ + struct i40iw_qhash_table_info *info; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_sc_vsi *vsi = &iwdev->vsi; + enum i40iw_status_code status; + struct i40iw_cqp *iwcqp = &iwdev->cqp; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + cqp_request = i40iw_get_cqp_request(iwcqp, wait); + if (!cqp_request) + return I40IW_ERR_NO_MEMORY; + cqp_info = &cqp_request->info; + info = &cqp_info->in.u.manage_qhash_table_entry.info; + memset(info, 0, sizeof(*info)); + + info->vsi = &iwdev->vsi; + info->manage = mtype; + info->entry_type = etype; + if (cminfo->vlan_id != 0xFFFF) { + info->vlan_valid = true; + info->vlan_id = cpu_to_le16(cminfo->vlan_id); + } else { + info->vlan_valid = false; + } + + info->ipv4_valid = cminfo->ipv4; + info->user_pri = cminfo->user_pri; + ether_addr_copy(info->mac_addr, iwdev->netdev->dev_addr); + info->qp_num = cpu_to_le32(vsi->ilq->qp_id); + info->dest_port = cpu_to_le16(cminfo->loc_port); + info->dest_ip[0] = cpu_to_le32(cminfo->loc_addr[0]); + info->dest_ip[1] = cpu_to_le32(cminfo->loc_addr[1]); + info->dest_ip[2] = cpu_to_le32(cminfo->loc_addr[2]); + info->dest_ip[3] = cpu_to_le32(cminfo->loc_addr[3]); + if (etype == I40IW_QHASH_TYPE_TCP_ESTABLISHED) { + info->src_port = cpu_to_le16(cminfo->rem_port); + info->src_ip[0] = cpu_to_le32(cminfo->rem_addr[0]); + info->src_ip[1] = cpu_to_le32(cminfo->rem_addr[1]); + info->src_ip[2] = cpu_to_le32(cminfo->rem_addr[2]); + info->src_ip[3] = cpu_to_le32(cminfo->rem_addr[3]); + } + if (cmnode) { + cqp_request->callback_fcn = i40iw_send_syn_cqp_callback; + cqp_request->param = (void *)cmnode; + } + + if (info->ipv4_valid) + i40iw_debug(dev, I40IW_DEBUG_CM, + "%s:%s IP=%pI4, port=%d, mac=%pM, vlan_id=%d\n", + __func__, (!mtype) ? "DELETE" : "ADD", + info->dest_ip, + info->dest_port, info->mac_addr, cminfo->vlan_id); + else + i40iw_debug(dev, I40IW_DEBUG_CM, + "%s:%s IP=%pI6, port=%d, mac=%pM, vlan_id=%d\n", + __func__, (!mtype) ? 
"DELETE" : "ADD", + info->dest_ip, + info->dest_port, info->mac_addr, cminfo->vlan_id); + cqp_info->in.u.manage_qhash_table_entry.cqp = &iwdev->cqp.sc_cqp; + cqp_info->in.u.manage_qhash_table_entry.scratch = (uintptr_t)cqp_request; + cqp_info->cqp_cmd = OP_MANAGE_QHASH_TABLE_ENTRY; + cqp_info->post_sq = 1; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Manage Qhash Entry fail"); + return status; +} + +/** + * i40iw_hw_flush_wqes - flush qp's wqe + * @iwdev: iwarp device + * @qp: hardware control qp + * @info: info for flush + * @wait: flag wait for completion + */ +enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev, + struct i40iw_sc_qp *qp, + struct i40iw_qp_flush_info *info, + bool wait) +{ + enum i40iw_status_code status; + struct i40iw_qp_flush_info *hw_info; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + struct i40iw_qp *iwqp = (struct i40iw_qp *)qp->back_qp; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, wait); + if (!cqp_request) + return I40IW_ERR_NO_MEMORY; + + cqp_info = &cqp_request->info; + hw_info = &cqp_request->info.in.u.qp_flush_wqes.info; + memcpy(hw_info, info, sizeof(*hw_info)); + + cqp_info->cqp_cmd = OP_QP_FLUSH_WQES; + cqp_info->post_sq = 1; + cqp_info->in.u.qp_flush_wqes.qp = qp; + cqp_info->in.u.qp_flush_wqes.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) { + i40iw_pr_err("CQP-OP Flush WQE's fail"); + complete(&iwqp->sq_drained); + complete(&iwqp->rq_drained); + return status; + } + if (!cqp_request->compl_info.maj_err_code) { + switch (cqp_request->compl_info.min_err_code) { + case I40IW_CQP_COMPL_RQ_WQE_FLUSHED: + complete(&iwqp->sq_drained); + break; + case I40IW_CQP_COMPL_SQ_WQE_FLUSHED: + complete(&iwqp->rq_drained); + break; + case I40IW_CQP_COMPL_RQ_SQ_WQE_FLUSHED: + break; + default: + complete(&iwqp->sq_drained); + complete(&iwqp->rq_drained); + break; + } + } + + return 0; +} + +/** + * i40iw_gen_ae - generate AE + * @iwdev: iwarp device + * @qp: qp associated with AE + * @info: info for ae + * @wait: wait for completion + */ +void i40iw_gen_ae(struct i40iw_device *iwdev, + struct i40iw_sc_qp *qp, + struct i40iw_gen_ae_info *info, + bool wait) +{ + struct i40iw_gen_ae_info *ae_info; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, wait); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + ae_info = &cqp_request->info.in.u.gen_ae.info; + memcpy(ae_info, info, sizeof(*ae_info)); + + cqp_info->cqp_cmd = OP_GEN_AE; + cqp_info->post_sq = 1; + cqp_info->in.u.gen_ae.qp = qp; + cqp_info->in.u.gen_ae.scratch = (uintptr_t)cqp_request; + if (i40iw_handle_cqp_op(iwdev, cqp_request)) + i40iw_pr_err("CQP OP failed attempting to generate ae_code=0x%x\n", + info->ae_code); +} + +/** + * i40iw_hw_manage_vf_pble_bp - manage vf pbles + * @iwdev: iwarp device + * @info: info for managing pble + * @wait: flag wait for completion + */ +enum i40iw_status_code i40iw_hw_manage_vf_pble_bp(struct i40iw_device *iwdev, + struct i40iw_manage_vf_pble_info *info, + bool wait) +{ + enum i40iw_status_code status; + struct i40iw_manage_vf_pble_info *hw_info; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + if ((iwdev->init_state < CCQ_CREATED) && wait) + wait = false; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, wait); + if (!cqp_request) + return I40IW_ERR_NO_MEMORY; + + cqp_info = &cqp_request->info; 
+ hw_info = &cqp_request->info.in.u.manage_vf_pble_bp.info; + memcpy(hw_info, info, sizeof(*hw_info)); + + cqp_info->cqp_cmd = OP_MANAGE_VF_PBLE_BP; + cqp_info->post_sq = 1; + cqp_info->in.u.manage_vf_pble_bp.cqp = &iwdev->cqp.sc_cqp; + cqp_info->in.u.manage_vf_pble_bp.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Manage VF pble_bp fail"); + return status; +} + +/** + * i40iw_get_ib_wc - return change flush code to IB's + * @opcode: iwarp flush code + */ +static enum ib_wc_status i40iw_get_ib_wc(enum i40iw_flush_opcode opcode) +{ + switch (opcode) { + case FLUSH_PROT_ERR: + return IB_WC_LOC_PROT_ERR; + case FLUSH_REM_ACCESS_ERR: + return IB_WC_REM_ACCESS_ERR; + case FLUSH_LOC_QP_OP_ERR: + return IB_WC_LOC_QP_OP_ERR; + case FLUSH_REM_OP_ERR: + return IB_WC_REM_OP_ERR; + case FLUSH_LOC_LEN_ERR: + return IB_WC_LOC_LEN_ERR; + case FLUSH_GENERAL_ERR: + return IB_WC_GENERAL_ERR; + case FLUSH_FATAL_ERR: + default: + return IB_WC_FATAL_ERR; + } +} + +/** + * i40iw_set_flush_info - set flush info + * @pinfo: set flush info + * @min: minor err + * @maj: major err + * @opcode: flush error code + */ +static void i40iw_set_flush_info(struct i40iw_qp_flush_info *pinfo, + u16 *min, + u16 *maj, + enum i40iw_flush_opcode opcode) +{ + *min = (u16)i40iw_get_ib_wc(opcode); + *maj = CQE_MAJOR_DRV; + pinfo->userflushcode = true; +} + +/** + * i40iw_flush_wqes - flush wqe for qp + * @iwdev: iwarp device + * @iwqp: qp to flush wqes + */ +void i40iw_flush_wqes(struct i40iw_device *iwdev, struct i40iw_qp *iwqp) +{ + struct i40iw_qp_flush_info info; + struct i40iw_qp_flush_info *pinfo = &info; + + struct i40iw_sc_qp *qp = &iwqp->sc_qp; + + memset(pinfo, 0, sizeof(*pinfo)); + info.sq = true; + info.rq = true; + if (qp->term_flags) { + i40iw_set_flush_info(pinfo, &pinfo->sq_minor_code, + &pinfo->sq_major_code, qp->flush_code); + i40iw_set_flush_info(pinfo, &pinfo->rq_minor_code, + &pinfo->rq_major_code, qp->flush_code); + } + (void)i40iw_hw_flush_wqes(iwdev, &iwqp->sc_qp, &info, true); +} diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c new file mode 100644 index 0000000..05001e6 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -0,0 +1,2063 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/if_vlan.h> +#include <net/addrconf.h> + +#include "i40iw.h" +#include "i40iw_register.h" +#include <net/netevent.h> +#define CLIENT_IW_INTERFACE_VERSION_MAJOR 0 +#define CLIENT_IW_INTERFACE_VERSION_MINOR 01 +#define CLIENT_IW_INTERFACE_VERSION_BUILD 00 + +#define DRV_VERSION_MAJOR 0 +#define DRV_VERSION_MINOR 5 +#define DRV_VERSION_BUILD 123 +#define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \ + __stringify(DRV_VERSION_MINOR) "." __stringify(DRV_VERSION_BUILD) + +static int push_mode; +module_param(push_mode, int, 0644); +MODULE_PARM_DESC(push_mode, "Low latency mode: 0=disabled (default), 1=enabled)"); + +static int debug; +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "debug flags: 0=disabled (default), 0x7fffffff=all"); + +static int resource_profile; +module_param(resource_profile, int, 0644); +MODULE_PARM_DESC(resource_profile, + "Resource Profile: 0=no VF RDMA support (default), 1=Weighted VF, 2=Even Distribution"); + +static int max_rdma_vfs = 32; +module_param(max_rdma_vfs, int, 0644); +MODULE_PARM_DESC(max_rdma_vfs, "Maximum VF count: 0-32 32=default"); +static int mpa_version = 2; +module_param(mpa_version, int, 0644); +MODULE_PARM_DESC(mpa_version, "MPA version to be used in MPA Req/Resp 1 or 2"); + +MODULE_AUTHOR("Intel Corporation, <e1000-rdma@lists.sourceforge.net>"); +MODULE_DESCRIPTION("Intel(R) Ethernet Connection X722 iWARP RDMA Driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +static struct i40e_client i40iw_client; +static char i40iw_client_name[I40E_CLIENT_STR_LENGTH] = "i40iw"; + +static LIST_HEAD(i40iw_handlers); +static spinlock_t i40iw_handler_lock; + +static enum i40iw_status_code i40iw_virtchnl_send(struct i40iw_sc_dev *dev, + u32 vf_id, u8 *msg, u16 len); + +static struct notifier_block i40iw_inetaddr_notifier = { + .notifier_call = i40iw_inetaddr_event +}; + +static struct notifier_block i40iw_inetaddr6_notifier = { + .notifier_call = i40iw_inet6addr_event +}; + +static struct notifier_block i40iw_net_notifier = { + .notifier_call = i40iw_net_event +}; + +static struct notifier_block i40iw_netdevice_notifier = { + .notifier_call = i40iw_netdevice_event +}; + +/** + * i40iw_find_i40e_handler - find a handler given a client info + * @ldev: pointer to a client info + */ +static struct i40iw_handler *i40iw_find_i40e_handler(struct i40e_info *ldev) +{ + struct i40iw_handler *hdl; + unsigned long flags; + + spin_lock_irqsave(&i40iw_handler_lock, flags); + list_for_each_entry(hdl, &i40iw_handlers, list) { + if (hdl->ldev.netdev == ldev->netdev) { + spin_unlock_irqrestore(&i40iw_handler_lock, flags); + return hdl; + } + } + spin_unlock_irqrestore(&i40iw_handler_lock, flags); + return NULL; +} + +/** + * i40iw_find_netdev - find a handler given a netdev + * @netdev: pointer to net_device + */ +struct i40iw_handler *i40iw_find_netdev(struct net_device *netdev) +{ + struct i40iw_handler *hdl; + unsigned long flags; + + spin_lock_irqsave(&i40iw_handler_lock, flags); + list_for_each_entry(hdl, &i40iw_handlers, list) { + if (hdl->ldev.netdev == netdev) { + spin_unlock_irqrestore(&i40iw_handler_lock, flags); + return hdl; + } + } + spin_unlock_irqrestore(&i40iw_handler_lock, flags); + return NULL; +} + +/** + *
i40iw_add_handler - add a handler to the list + * @hdl: handler to be added to the handler list + */ +static void i40iw_add_handler(struct i40iw_handler *hdl) +{ + unsigned long flags; + + spin_lock_irqsave(&i40iw_handler_lock, flags); + list_add(&hdl->list, &i40iw_handlers); + spin_unlock_irqrestore(&i40iw_handler_lock, flags); +} + +/** + * i40iw_del_handler - delete a handler from the list + * @hdl: handler to be deleted from the handler list + */ +static int i40iw_del_handler(struct i40iw_handler *hdl) +{ + unsigned long flags; + + spin_lock_irqsave(&i40iw_handler_lock, flags); + list_del(&hdl->list); + spin_unlock_irqrestore(&i40iw_handler_lock, flags); + return 0; +} + +/** + * i40iw_enable_intr - set up device interrupts + * @dev: hardware control device structure + * @msix_id: id of the interrupt to be enabled + */ +static void i40iw_enable_intr(struct i40iw_sc_dev *dev, u32 msix_id) +{ + u32 val; + + val = I40E_PFINT_DYN_CTLN_INTENA_MASK | + I40E_PFINT_DYN_CTLN_CLEARPBA_MASK | + (3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT); + if (dev->is_pf) + i40iw_wr32(dev->hw, I40E_PFINT_DYN_CTLN(msix_id - 1), val); + else + i40iw_wr32(dev->hw, I40E_VFINT_DYN_CTLN1(msix_id - 1), val); +} + +/** + * i40iw_dpc - tasklet for aeq and ceq 0 + * @data: iwarp device + */ +static void i40iw_dpc(unsigned long data) +{ + struct i40iw_device *iwdev = (struct i40iw_device *)data; + + if (iwdev->msix_shared) + i40iw_process_ceq(iwdev, iwdev->ceqlist); + i40iw_process_aeq(iwdev); + i40iw_enable_intr(&iwdev->sc_dev, iwdev->iw_msixtbl[0].idx); +} + +/** + * i40iw_ceq_dpc - dpc handler for CEQ + * @data: data points to CEQ + */ +static void i40iw_ceq_dpc(unsigned long data) +{ + struct i40iw_ceq *iwceq = (struct i40iw_ceq *)data; + struct i40iw_device *iwdev = iwceq->iwdev; + + i40iw_process_ceq(iwdev, iwceq); + i40iw_enable_intr(&iwdev->sc_dev, iwceq->msix_idx); +} + +/** + * i40iw_irq_handler - interrupt handler for aeq and ceq0 + * @irq: Interrupt request number + * @data: iwarp device + */ +static irqreturn_t i40iw_irq_handler(int irq, void *data) +{ + struct i40iw_device *iwdev = (struct i40iw_device *)data; + + tasklet_schedule(&iwdev->dpc_tasklet); + return IRQ_HANDLED; +} + +/** + * i40iw_destroy_cqp - destroy control qp + * @iwdev: iwarp device + * @create_done: 1 if cqp create poll was success + * + * Issue destroy cqp request and + * free the resources associated with the cqp + */ +static void i40iw_destroy_cqp(struct i40iw_device *iwdev, bool free_hwcqp) +{ + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_cqp *cqp = &iwdev->cqp; + + if (free_hwcqp) + dev->cqp_ops->cqp_destroy(dev->cqp); + + i40iw_cleanup_pending_cqp_op(iwdev); + + i40iw_free_dma_mem(dev->hw, &cqp->sq); + kfree(cqp->scratch_array); + iwdev->cqp.scratch_array = NULL; + + kfree(cqp->cqp_requests); + cqp->cqp_requests = NULL; +} + +/** + * i40iw_disable_irqs - disable device interrupts + * @dev: hardware control device structure + * @msic_vec: msix vector to disable irq + * @dev_id: parameter to pass to free_irq (used during irq setup) + * + * The function is called when destroying aeq/ceq + */ +static void i40iw_disable_irq(struct i40iw_sc_dev *dev, + struct i40iw_msix_vector *msix_vec, + void *dev_id) +{ + if (dev->is_pf) + i40iw_wr32(dev->hw, I40E_PFINT_DYN_CTLN(msix_vec->idx - 1), 0); + else + i40iw_wr32(dev->hw, I40E_VFINT_DYN_CTLN1(msix_vec->idx - 1), 0); + irq_set_affinity_hint(msix_vec->irq, NULL); + free_irq(msix_vec->irq, dev_id); +} + +/** + * i40iw_destroy_aeq - destroy aeq + * @iwdev: iwarp device + * + * Issue a 
destroy aeq request and + * free the resources associated with the aeq + * The function is called during driver unload + */ +static void i40iw_destroy_aeq(struct i40iw_device *iwdev) +{ + enum i40iw_status_code status = I40IW_ERR_NOT_READY; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_aeq *aeq = &iwdev->aeq; + + if (!iwdev->msix_shared) + i40iw_disable_irq(dev, iwdev->iw_msixtbl, (void *)iwdev); + if (iwdev->reset) + goto exit; + + if (!dev->aeq_ops->aeq_destroy(&aeq->sc_aeq, 0, 1)) + status = dev->aeq_ops->aeq_destroy_done(&aeq->sc_aeq); + if (status) + i40iw_pr_err("destroy aeq failed %d\n", status); + +exit: + i40iw_free_dma_mem(dev->hw, &aeq->mem); +} + +/** + * i40iw_destroy_ceq - destroy ceq + * @iwdev: iwarp device + * @iwceq: ceq to be destroyed + * + * Issue a destroy ceq request and + * free the resources associated with the ceq + */ +static void i40iw_destroy_ceq(struct i40iw_device *iwdev, + struct i40iw_ceq *iwceq) +{ + enum i40iw_status_code status; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + + if (iwdev->reset) + goto exit; + + status = dev->ceq_ops->ceq_destroy(&iwceq->sc_ceq, 0, 1); + if (status) { + i40iw_pr_err("ceq destroy command failed %d\n", status); + goto exit; + } + + status = dev->ceq_ops->cceq_destroy_done(&iwceq->sc_ceq); + if (status) + i40iw_pr_err("ceq destroy completion failed %d\n", status); +exit: + i40iw_free_dma_mem(dev->hw, &iwceq->mem); +} + +/** + * i40iw_dele_ceqs - destroy all ceq's + * @iwdev: iwarp device + * + * Go through all of the device ceq's and for each ceq + * disable the ceq interrupt and destroy the ceq + */ +static void i40iw_dele_ceqs(struct i40iw_device *iwdev) +{ + u32 i = 0; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_ceq *iwceq = iwdev->ceqlist; + struct i40iw_msix_vector *msix_vec = iwdev->iw_msixtbl; + + if (iwdev->msix_shared) { + i40iw_disable_irq(dev, msix_vec, (void *)iwdev); + i40iw_destroy_ceq(iwdev, iwceq); + iwceq++; + i++; + } + + for (msix_vec++; i < iwdev->ceqs_count; i++, msix_vec++, iwceq++) { + i40iw_disable_irq(dev, msix_vec, (void *)iwceq); + i40iw_destroy_ceq(iwdev, iwceq); + } + + iwdev->sc_dev.ceq_valid = false; +} + +/** + * i40iw_destroy_ccq - destroy control cq + * @iwdev: iwarp device + * + * Issue destroy ccq request and + * free the resources associated with the ccq + */ +static void i40iw_destroy_ccq(struct i40iw_device *iwdev) +{ + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_ccq *ccq = &iwdev->ccq; + enum i40iw_status_code status = 0; + + if (!iwdev->reset) + status = dev->ccq_ops->ccq_destroy(dev->ccq, 0, true); + if (status) + i40iw_pr_err("ccq destroy failed %d\n", status); + i40iw_free_dma_mem(dev->hw, &ccq->mem_cq); +} + +/* types of hmc objects */ +static enum i40iw_hmc_rsrc_type iw_hmc_obj_types[] = { + I40IW_HMC_IW_QP, + I40IW_HMC_IW_CQ, + I40IW_HMC_IW_HTE, + I40IW_HMC_IW_ARP, + I40IW_HMC_IW_APBVT_ENTRY, + I40IW_HMC_IW_MR, + I40IW_HMC_IW_XF, + I40IW_HMC_IW_XFFL, + I40IW_HMC_IW_Q1, + I40IW_HMC_IW_Q1FL, + I40IW_HMC_IW_TIMER, +}; + +/** + * i40iw_close_hmc_objects_type - delete hmc objects of a given type + * @iwdev: iwarp device + * @obj_type: the hmc object type to be deleted + * @is_pf: true if the function is PF otherwise false + * @reset: true if called before reset + */ +static void i40iw_close_hmc_objects_type(struct i40iw_sc_dev *dev, + enum i40iw_hmc_rsrc_type obj_type, + struct i40iw_hmc_info *hmc_info, + bool is_pf, + bool reset) +{ + struct i40iw_hmc_del_obj_info info; + + memset(&info, 0, sizeof(info)); + info.hmc_info = hmc_info; + 
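/* count covers every object of this type; start_idx is left at 0 by the memset above */ +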
info.rsrc_type = obj_type; + info.count = hmc_info->hmc_obj[obj_type].cnt; + info.is_pf = is_pf; + if (dev->hmc_ops->del_hmc_object(dev, &info, reset)) + i40iw_pr_err("del obj of type %d failed\n", obj_type); +} + +/** + * i40iw_del_hmc_objects - remove all device hmc objects + * @dev: iwarp device + * @hmc_info: hmc_info to free + * @is_pf: true if hmc_info belongs to PF, not vf nor allocated + * by PF on behalf of VF + * @reset: true if called before reset + */ +static void i40iw_del_hmc_objects(struct i40iw_sc_dev *dev, + struct i40iw_hmc_info *hmc_info, + bool is_pf, + bool reset) +{ + unsigned int i; + + for (i = 0; i < IW_HMC_OBJ_TYPE_NUM; i++) + i40iw_close_hmc_objects_type(dev, iw_hmc_obj_types[i], hmc_info, is_pf, reset); +} + +/** + * i40iw_ceq_handler - interrupt handler for ceq + * @data: ceq pointer + */ +static irqreturn_t i40iw_ceq_handler(int irq, void *data) +{ + struct i40iw_ceq *iwceq = (struct i40iw_ceq *)data; + + if (iwceq->irq != irq) + i40iw_pr_err("expected irq = %d received irq = %d\n", iwceq->irq, irq); + tasklet_schedule(&iwceq->dpc_tasklet); + return IRQ_HANDLED; +} + +/** + * i40iw_create_hmc_obj_type - create hmc object of a given type + * @dev: hardware control device structure + * @info: information for the hmc object to create + */ +static enum i40iw_status_code i40iw_create_hmc_obj_type(struct i40iw_sc_dev *dev, + struct i40iw_hmc_create_obj_info *info) +{ + return dev->hmc_ops->create_hmc_object(dev, info); +} + +/** + * i40iw_create_hmc_objs - create all hmc objects for the device + * @iwdev: iwarp device + * @is_pf: true if the function is PF otherwise false + * + * Create the device hmc objects and allocate hmc pages + * Return 0 if successful, otherwise clean up and return error + */ +static enum i40iw_status_code i40iw_create_hmc_objs(struct i40iw_device *iwdev, + bool is_pf) +{ + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_hmc_create_obj_info info; + enum i40iw_status_code status; + int i; + + memset(&info, 0, sizeof(info)); + info.hmc_info = dev->hmc_info; + info.is_pf = is_pf; + info.entry_type = iwdev->sd_type; + for (i = 0; i < IW_HMC_OBJ_TYPE_NUM; i++) { + info.rsrc_type = iw_hmc_obj_types[i]; + info.count = dev->hmc_info->hmc_obj[info.rsrc_type].cnt; + info.add_sd_cnt = 0; + status = i40iw_create_hmc_obj_type(dev, &info); + if (status) { + i40iw_pr_err("create obj type %d status = %d\n", + iw_hmc_obj_types[i], status); + break; + } + } + if (!status) + return (dev->cqp_misc_ops->static_hmc_pages_allocated(dev->cqp, 0, + dev->hmc_fn_id, + true, true)); + + while (i) { + i--; + /* destroy the hmc objects of a given type */ + i40iw_close_hmc_objects_type(dev, + iw_hmc_obj_types[i], + dev->hmc_info, + is_pf, + false); + } + return status; +} + +/** + * i40iw_obj_aligned_mem - get aligned memory from device allocated memory + * @iwdev: iwarp device + * @memptr: points to the memory addresses + * @size: size of memory needed + * @mask: mask for the aligned memory + * + * Get aligned memory of the requested size and + * update the memptr to point to the new aligned memory + * Return 0 if successful, otherwise return no memory error + */ +enum i40iw_status_code i40iw_obj_aligned_mem(struct i40iw_device *iwdev, + struct i40iw_dma_mem *memptr, + u32 size, + u32 mask) +{ + unsigned long va, newva; + unsigned long extra; + + va = (unsigned long)iwdev->obj_next.va; + newva = va; + if (mask) + newva = ALIGN(va, (mask + 1)); + extra = newva - va; + memptr->va = (u8 *)va + extra; + memptr->pa = iwdev->obj_next.pa + extra; + memptr->size = size; 
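+ /* fail if the aligned carve-out would run past the end of obj_mem */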
+ if ((memptr->va + size) > (iwdev->obj_mem.va + iwdev->obj_mem.size)) + return I40IW_ERR_NO_MEMORY; + + iwdev->obj_next.va = memptr->va + size; + iwdev->obj_next.pa = memptr->pa + size; + return 0; +} + +/** + * i40iw_create_cqp - create control qp + * @iwdev: iwarp device + * + * Return 0, if the cqp and all the resources associated with it + * are successfully created, otherwise return error + */ +static enum i40iw_status_code i40iw_create_cqp(struct i40iw_device *iwdev) +{ + enum i40iw_status_code status; + u32 sqsize = I40IW_CQP_SW_SQSIZE_2048; + struct i40iw_dma_mem mem; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_cqp_init_info cqp_init_info; + struct i40iw_cqp *cqp = &iwdev->cqp; + u16 maj_err, min_err; + int i; + + cqp->cqp_requests = kcalloc(sqsize, sizeof(*cqp->cqp_requests), GFP_KERNEL); + if (!cqp->cqp_requests) + return I40IW_ERR_NO_MEMORY; + cqp->scratch_array = kcalloc(sqsize, sizeof(*cqp->scratch_array), GFP_KERNEL); + if (!cqp->scratch_array) { + kfree(cqp->cqp_requests); + return I40IW_ERR_NO_MEMORY; + } + dev->cqp = &cqp->sc_cqp; + dev->cqp->dev = dev; + memset(&cqp_init_info, 0, sizeof(cqp_init_info)); + status = i40iw_allocate_dma_mem(dev->hw, &cqp->sq, + (sizeof(struct i40iw_cqp_sq_wqe) * sqsize), + I40IW_CQP_ALIGNMENT); + if (status) + goto exit; + status = i40iw_obj_aligned_mem(iwdev, &mem, sizeof(struct i40iw_cqp_ctx), + I40IW_HOST_CTX_ALIGNMENT_MASK); + if (status) + goto exit; + dev->cqp->host_ctx_pa = mem.pa; + dev->cqp->host_ctx = mem.va; + /* populate the cqp init info */ + cqp_init_info.dev = dev; + cqp_init_info.sq_size = sqsize; + cqp_init_info.sq = cqp->sq.va; + cqp_init_info.sq_pa = cqp->sq.pa; + cqp_init_info.host_ctx_pa = mem.pa; + cqp_init_info.host_ctx = mem.va; + cqp_init_info.hmc_profile = iwdev->resource_profile; + cqp_init_info.enabled_vf_count = iwdev->max_rdma_vfs; + cqp_init_info.scratch_array = cqp->scratch_array; + status = dev->cqp_ops->cqp_init(dev->cqp, &cqp_init_info); + if (status) { + i40iw_pr_err("cqp init status %d\n", status); + goto exit; + } + status = dev->cqp_ops->cqp_create(dev->cqp, &maj_err, &min_err); + if (status) { + i40iw_pr_err("cqp create status %d maj_err %d min_err %d\n", + status, maj_err, min_err); + goto exit; + } + spin_lock_init(&cqp->req_lock); + INIT_LIST_HEAD(&cqp->cqp_avail_reqs); + INIT_LIST_HEAD(&cqp->cqp_pending_reqs); + /* init the waitq of the cqp_requests and add them to the list */ + for (i = 0; i < sqsize; i++) { + init_waitqueue_head(&cqp->cqp_requests[i].waitq); + list_add_tail(&cqp->cqp_requests[i].list, &cqp->cqp_avail_reqs); + } + return 0; +exit: + /* clean up the created resources */ + i40iw_destroy_cqp(iwdev, false); + return status; +} + +/** + * i40iw_create_ccq - create control cq + * @iwdev: iwarp device + * + * Return 0, if the ccq and the resources associated with it + * are successfully created, otherwise return error + */ +static enum i40iw_status_code i40iw_create_ccq(struct i40iw_device *iwdev) +{ + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_dma_mem mem; + enum i40iw_status_code status; + struct i40iw_ccq_init_info info; + struct i40iw_ccq *ccq = &iwdev->ccq; + + memset(&info, 0, sizeof(info)); + dev->ccq = &ccq->sc_cq; + dev->ccq->dev = dev; + info.dev = dev; + ccq->shadow_area.size = sizeof(struct i40iw_cq_shadow_area); + ccq->mem_cq.size = sizeof(struct i40iw_cqe) * IW_CCQ_SIZE; + status = i40iw_allocate_dma_mem(dev->hw, &ccq->mem_cq, + ccq->mem_cq.size, I40IW_CQ0_ALIGNMENT); + if (status) + goto exit; + status = i40iw_obj_aligned_mem(iwdev, &mem, 
ccq->shadow_area.size, + I40IW_SHADOWAREA_MASK); + if (status) + goto exit; + ccq->sc_cq.back_cq = (void *)ccq; + /* populate the ccq init info */ + info.cq_base = ccq->mem_cq.va; + info.cq_pa = ccq->mem_cq.pa; + info.num_elem = IW_CCQ_SIZE; + info.shadow_area = mem.va; + info.shadow_area_pa = mem.pa; + info.ceqe_mask = false; + info.ceq_id_valid = true; + info.shadow_read_threshold = 16; + status = dev->ccq_ops->ccq_init(dev->ccq, &info); + if (!status) + status = dev->ccq_ops->ccq_create(dev->ccq, 0, true, true); +exit: + if (status) + i40iw_free_dma_mem(dev->hw, &ccq->mem_cq); + return status; +} + +/** + * i40iw_configure_ceq_vector - set up the msix interrupt vector for ceq + * @iwdev: iwarp device + * @msix_vec: interrupt vector information + * @iwceq: ceq associated with the vector + * @ceq_id: the id number of the iwceq + * + * Allocate interrupt resources and enable irq handling + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_configure_ceq_vector(struct i40iw_device *iwdev, + struct i40iw_ceq *iwceq, + u32 ceq_id, + struct i40iw_msix_vector *msix_vec) +{ + enum i40iw_status_code status; + + if (iwdev->msix_shared && !ceq_id) { + tasklet_init(&iwdev->dpc_tasklet, i40iw_dpc, (unsigned long)iwdev); + status = request_irq(msix_vec->irq, i40iw_irq_handler, 0, "AEQCEQ", iwdev); + } else { + tasklet_init(&iwceq->dpc_tasklet, i40iw_ceq_dpc, (unsigned long)iwceq); + status = request_irq(msix_vec->irq, i40iw_ceq_handler, 0, "CEQ", iwceq); + } + + cpumask_clear(&msix_vec->mask); + cpumask_set_cpu(msix_vec->cpu_affinity, &msix_vec->mask); + irq_set_affinity_hint(msix_vec->irq, &msix_vec->mask); + + if (status) { + i40iw_pr_err("ceq irq config fail\n"); + return I40IW_ERR_CONFIG; + } + msix_vec->ceq_id = ceq_id; + + return 0; +} + +/** + * i40iw_create_ceq - create completion event queue + * @iwdev: iwarp device + * @iwceq: pointer to the ceq resources to be created + * @ceq_id: the id number of the iwceq + * + * Return 0, if the ceq and the resources associated with it + * are successfully created, otherwise return error + */ +static enum i40iw_status_code i40iw_create_ceq(struct i40iw_device *iwdev, + struct i40iw_ceq *iwceq, + u32 ceq_id) +{ + enum i40iw_status_code status; + struct i40iw_ceq_init_info info; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + u64 scratch; + + memset(&info, 0, sizeof(info)); + info.ceq_id = ceq_id; + iwceq->iwdev = iwdev; + iwceq->mem.size = sizeof(struct i40iw_ceqe) * + iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt; + status = i40iw_allocate_dma_mem(dev->hw, &iwceq->mem, iwceq->mem.size, + I40IW_CEQ_ALIGNMENT); + if (status) + goto exit; + info.ceq_id = ceq_id; + info.ceqe_base = iwceq->mem.va; + info.ceqe_pa = iwceq->mem.pa; + + info.elem_cnt = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt; + iwceq->sc_ceq.ceq_id = ceq_id; + info.dev = dev; + scratch = (uintptr_t)&iwdev->cqp.sc_cqp; + status = dev->ceq_ops->ceq_init(&iwceq->sc_ceq, &info); + if (!status) + status = dev->ceq_ops->cceq_create(&iwceq->sc_ceq, scratch); + +exit: + if (status) + i40iw_free_dma_mem(dev->hw, &iwceq->mem); + return status; +} + +void i40iw_request_reset(struct i40iw_device *iwdev) +{ + struct i40e_info *ldev = iwdev->ldev; + + ldev->ops->request_reset(ldev, iwdev->client, 1); +} + +/** + * i40iw_setup_ceqs - manage the device ceq's and their interrupt resources + * @iwdev: iwarp device + * @ldev: i40e lan device + * + * Allocate a list for all device completion event queues + * Create the ceq's and configure their msix 
interrupt vectors + * Return 0, if at least one ceq is successfully set up, otherwise return error + */ +static enum i40iw_status_code i40iw_setup_ceqs(struct i40iw_device *iwdev, + struct i40e_info *ldev) +{ + u32 i; + u32 ceq_id; + struct i40iw_ceq *iwceq; + struct i40iw_msix_vector *msix_vec; + enum i40iw_status_code status = 0; + u32 num_ceqs; + + if (ldev && ldev->ops && ldev->ops->setup_qvlist) { + status = ldev->ops->setup_qvlist(ldev, &i40iw_client, + iwdev->iw_qvlist); + if (status) + goto exit; + } else { + status = I40IW_ERR_BAD_PTR; + goto exit; + } + + num_ceqs = min(iwdev->msix_count, iwdev->sc_dev.hmc_fpm_misc.max_ceqs); + iwdev->ceqlist = kcalloc(num_ceqs, sizeof(*iwdev->ceqlist), GFP_KERNEL); + if (!iwdev->ceqlist) { + status = I40IW_ERR_NO_MEMORY; + goto exit; + } + i = (iwdev->msix_shared) ? 0 : 1; + for (ceq_id = 0; i < num_ceqs; i++, ceq_id++) { + iwceq = &iwdev->ceqlist[ceq_id]; + status = i40iw_create_ceq(iwdev, iwceq, ceq_id); + if (status) { + i40iw_pr_err("create ceq status = %d\n", status); + break; + } + + msix_vec = &iwdev->iw_msixtbl[i]; + iwceq->irq = msix_vec->irq; + iwceq->msix_idx = msix_vec->idx; + status = i40iw_configure_ceq_vector(iwdev, iwceq, ceq_id, msix_vec); + if (status) { + i40iw_destroy_ceq(iwdev, iwceq); + break; + } + i40iw_enable_intr(&iwdev->sc_dev, msix_vec->idx); + iwdev->ceqs_count++; + } +exit: + if (status && !iwdev->ceqs_count) { + kfree(iwdev->ceqlist); + iwdev->ceqlist = NULL; + return status; + } else { + iwdev->sc_dev.ceq_valid = true; + return 0; + } + +} + +/** + * i40iw_configure_aeq_vector - set up the msix vector for aeq + * @iwdev: iwarp device + * + * Allocate interrupt resources and enable irq handling + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_configure_aeq_vector(struct i40iw_device *iwdev) +{ + struct i40iw_msix_vector *msix_vec = iwdev->iw_msixtbl; + u32 ret = 0; + + if (!iwdev->msix_shared) { + tasklet_init(&iwdev->dpc_tasklet, i40iw_dpc, (unsigned long)iwdev); + ret = request_irq(msix_vec->irq, i40iw_irq_handler, 0, "i40iw", iwdev); + } + if (ret) { + i40iw_pr_err("aeq irq config fail\n"); + return I40IW_ERR_CONFIG; + } + + return 0; +} + +/** + * i40iw_create_aeq - create async event queue + * @iwdev: iwarp device + * + * Return 0, if the aeq and the resources associated with it + * are successfully created, otherwise return error + */ +static enum i40iw_status_code i40iw_create_aeq(struct i40iw_device *iwdev) +{ + enum i40iw_status_code status; + struct i40iw_aeq_init_info info; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_aeq *aeq = &iwdev->aeq; + u64 scratch = 0; + u32 aeq_size; + + aeq_size = 2 * iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_QP].cnt + + iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt; + memset(&info, 0, sizeof(info)); + aeq->mem.size = sizeof(struct i40iw_sc_aeqe) * aeq_size; + status = i40iw_allocate_dma_mem(dev->hw, &aeq->mem, aeq->mem.size, + I40IW_AEQ_ALIGNMENT); + if (status) + goto exit; + + info.aeqe_base = aeq->mem.va; + info.aeq_elem_pa = aeq->mem.pa; + info.elem_cnt = aeq_size; + info.dev = dev; + status = dev->aeq_ops->aeq_init(&aeq->sc_aeq, &info); + if (status) + goto exit; + status = dev->aeq_ops->aeq_create(&aeq->sc_aeq, scratch, 1); + if (!status) + status = dev->aeq_ops->aeq_create_done(&aeq->sc_aeq); +exit: + if (status) + i40iw_free_dma_mem(dev->hw, &aeq->mem); + return status; +} + +/** + * i40iw_setup_aeq - set up the device aeq + * @iwdev: iwarp device + * + * Create the aeq and configure its msix 
interrupt vector + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_setup_aeq(struct i40iw_device *iwdev) +{ + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + enum i40iw_status_code status; + + status = i40iw_create_aeq(iwdev); + if (status) + return status; + + status = i40iw_configure_aeq_vector(iwdev); + if (status) { + i40iw_destroy_aeq(iwdev); + return status; + } + + if (!iwdev->msix_shared) + i40iw_enable_intr(dev, iwdev->iw_msixtbl[0].idx); + return 0; +} + +/** + * i40iw_initialize_ilq - create iwarp local queue for cm + * @iwdev: iwarp device + * + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_initialize_ilq(struct i40iw_device *iwdev) +{ + struct i40iw_puda_rsrc_info info; + enum i40iw_status_code status; + + memset(&info, 0, sizeof(info)); + info.type = I40IW_PUDA_RSRC_TYPE_ILQ; + info.cq_id = 1; + info.qp_id = 0; + info.count = 1; + info.pd_id = 1; + info.sq_size = 8192; + info.rq_size = 8192; + info.buf_size = 1024; + info.tx_buf_cnt = 16384; + info.receive = i40iw_receive_ilq; + info.xmit_complete = i40iw_free_sqbuf; + status = i40iw_puda_create_rsrc(&iwdev->vsi, &info); + if (status) + i40iw_pr_err("ilq create fail\n"); + return status; +} + +/** + * i40iw_initialize_ieq - create iwarp exception queue + * @iwdev: iwarp device + * + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_initialize_ieq(struct i40iw_device *iwdev) +{ + struct i40iw_puda_rsrc_info info; + enum i40iw_status_code status; + + memset(&info, 0, sizeof(info)); + info.type = I40IW_PUDA_RSRC_TYPE_IEQ; + info.cq_id = 2; + info.qp_id = iwdev->vsi.exception_lan_queue; + info.count = 1; + info.pd_id = 2; + info.sq_size = 8192; + info.rq_size = 8192; + info.buf_size = iwdev->vsi.mtu + VLAN_ETH_HLEN; + info.tx_buf_cnt = 4096; + status = i40iw_puda_create_rsrc(&iwdev->vsi, &info); + if (status) + i40iw_pr_err("ieq create fail\n"); + return status; +} + +/** + * i40iw_reinitialize_ieq - destroy and re-create ieq + * @dev: iwarp device + */ +void i40iw_reinitialize_ieq(struct i40iw_sc_dev *dev) +{ + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + + i40iw_puda_dele_resources(&iwdev->vsi, I40IW_PUDA_RSRC_TYPE_IEQ, false); + if (i40iw_initialize_ieq(iwdev)) { + iwdev->reset = true; + i40iw_request_reset(iwdev); + } +} + +/** + * i40iw_hmc_setup - create hmc objects for the device + * @iwdev: iwarp device + * + * Set up the device private memory space for the number and size of + * the hmc objects and create the objects + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_hmc_setup(struct i40iw_device *iwdev) +{ + enum i40iw_status_code status; + + iwdev->sd_type = I40IW_SD_TYPE_DIRECT; + status = i40iw_config_fpm_values(&iwdev->sc_dev, IW_CFG_FPM_QP_COUNT); + if (status) + goto exit; + status = i40iw_create_hmc_objs(iwdev, true); + if (status) + goto exit; + iwdev->init_state = HMC_OBJS_CREATED; +exit: + return status; +} + +/** + * i40iw_del_init_mem - deallocate memory resources + * @iwdev: iwarp device + */ +static void i40iw_del_init_mem(struct i40iw_device *iwdev) +{ + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + + i40iw_free_dma_mem(&iwdev->hw, &iwdev->obj_mem); + kfree(dev->hmc_info->sd_table.sd_entry); + dev->hmc_info->sd_table.sd_entry = NULL; + kfree(iwdev->mem_resources); + iwdev->mem_resources = NULL; + kfree(iwdev->ceqlist); + iwdev->ceqlist = NULL; + kfree(iwdev->iw_msixtbl); + iwdev->iw_msixtbl = NULL; + 
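
For reference, a minimal, self-contained sketch of the per-vector pattern used by i40iw_configure_ceq_vector() and i40iw_configure_aeq_vector() above: bind a tasklet for the bottom half, request the MSI-X interrupt, then express a CPU affinity preference. All "demo_" names are hypothetical; unlike the driver code, the sketch checks the request_irq() return value before touching affinity, which is the usual ordering.

#include <linux/interrupt.h>
#include <linux/cpumask.h>

struct demo_vec {
	int irq;
	unsigned int cpu;
	cpumask_t mask;
	struct tasklet_struct dpc;
};

static void demo_dpc(unsigned long data)
{
	/* deferred (bottom-half) queue processing would go here */
}

static irqreturn_t demo_irq_handler(int irq, void *arg)
{
	struct demo_vec *vec = arg;

	tasklet_schedule(&vec->dpc);
	return IRQ_HANDLED;
}

static int demo_setup_vector(struct demo_vec *vec)
{
	int err;

	tasklet_init(&vec->dpc, demo_dpc, (unsigned long)vec);
	err = request_irq(vec->irq, demo_irq_handler, 0, "demo-ceq", vec);
	if (err)
		return err;

	cpumask_clear(&vec->mask);
	cpumask_set_cpu(vec->cpu, &vec->mask);
	irq_set_affinity_hint(vec->irq, &vec->mask);
	/* teardown would free_irq() and clear the hint with
	 * irq_set_affinity_hint(vec->irq, NULL)
	 */
	return 0;
}
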
kfree(iwdev->hmc_info_mem); + iwdev->hmc_info_mem = NULL; +} + +/** + * i40iw_del_macip_entry - remove a mac ip address entry from the hw table + * @iwdev: iwarp device + * @idx: the index of the mac ip address to delete + */ +static void i40iw_del_macip_entry(struct i40iw_device *iwdev, u8 idx) +{ + struct i40iw_cqp *iwcqp = &iwdev->cqp; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + enum i40iw_status_code status = 0; + + cqp_request = i40iw_get_cqp_request(iwcqp, true); + if (!cqp_request) { + i40iw_pr_err("cqp_request memory failed\n"); + return; + } + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = OP_DELETE_LOCAL_MAC_IPADDR_ENTRY; + cqp_info->post_sq = 1; + cqp_info->in.u.del_local_mac_ipaddr_entry.cqp = &iwcqp->sc_cqp; + cqp_info->in.u.del_local_mac_ipaddr_entry.scratch = (uintptr_t)cqp_request; + cqp_info->in.u.del_local_mac_ipaddr_entry.entry_idx = idx; + cqp_info->in.u.del_local_mac_ipaddr_entry.ignore_ref_count = 0; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Del MAC Ip entry fail"); +} + +/** + * i40iw_add_mac_ipaddr_entry - add a mac ip address entry to the hw table + * @iwdev: iwarp device + * @mac_addr: pointer to mac address + * @idx: the index of the mac ip address to add + */ +static enum i40iw_status_code i40iw_add_mac_ipaddr_entry(struct i40iw_device *iwdev, + u8 *mac_addr, + u8 idx) +{ + struct i40iw_local_mac_ipaddr_entry_info *info; + struct i40iw_cqp *iwcqp = &iwdev->cqp; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + enum i40iw_status_code status = 0; + + cqp_request = i40iw_get_cqp_request(iwcqp, true); + if (!cqp_request) { + i40iw_pr_err("cqp_request memory failed\n"); + return I40IW_ERR_NO_MEMORY; + } + + cqp_info = &cqp_request->info; + + cqp_info->post_sq = 1; + info = &cqp_info->in.u.add_local_mac_ipaddr_entry.info; + ether_addr_copy(info->mac_addr, mac_addr); + info->entry_idx = idx; + cqp_info->in.u.add_local_mac_ipaddr_entry.scratch = (uintptr_t)cqp_request; + cqp_info->cqp_cmd = OP_ADD_LOCAL_MAC_IPADDR_ENTRY; + cqp_info->in.u.add_local_mac_ipaddr_entry.cqp = &iwcqp->sc_cqp; + cqp_info->in.u.add_local_mac_ipaddr_entry.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Add MAC Ip entry fail"); + return status; +} + +/** + * i40iw_alloc_local_mac_ipaddr_entry - allocate a mac ip address entry + * @iwdev: iwarp device + * @mac_ip_tbl_idx: the index of the new mac ip address + * + * Allocate a mac ip address entry and update the mac_ip_tbl_idx + * to hold the index of the newly created mac ip address + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_alloc_local_mac_ipaddr_entry(struct i40iw_device *iwdev, + u16 *mac_ip_tbl_idx) +{ + struct i40iw_cqp *iwcqp = &iwdev->cqp; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + enum i40iw_status_code status = 0; + + cqp_request = i40iw_get_cqp_request(iwcqp, true); + if (!cqp_request) { + i40iw_pr_err("cqp_request memory failed\n"); + return I40IW_ERR_NO_MEMORY; + } + + /* increment refcount, because we need the cqp request ret value */ + atomic_inc(&cqp_request->refcount); + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = OP_ALLOC_LOCAL_MAC_IPADDR_ENTRY; + cqp_info->post_sq = 1; + cqp_info->in.u.alloc_local_mac_ipaddr_entry.cqp = &iwcqp->sc_cqp; + cqp_info->in.u.alloc_local_mac_ipaddr_entry.scratch = (uintptr_t)cqp_request; + status = 
i40iw_handle_cqp_op(iwdev, cqp_request); + if (!status) + *mac_ip_tbl_idx = cqp_request->compl_info.op_ret_val; + else + i40iw_pr_err("CQP-OP Alloc MAC Ip entry fail"); + /* decrement refcount and free the cqp request, if no longer used */ + i40iw_put_cqp_request(iwcqp, cqp_request); + return status; +} + +/** + * i40iw_alloc_set_mac_ipaddr - set up a mac ip address table entry + * @iwdev: iwarp device + * @macaddr: pointer to mac address + * + * Allocate a mac ip address entry and add it to the hw table + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_alloc_set_mac_ipaddr(struct i40iw_device *iwdev, + u8 *macaddr) +{ + enum i40iw_status_code status; + + status = i40iw_alloc_local_mac_ipaddr_entry(iwdev, &iwdev->mac_ip_table_idx); + if (!status) { + status = i40iw_add_mac_ipaddr_entry(iwdev, macaddr, + (u8)iwdev->mac_ip_table_idx); + if (status) + i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx); + } + return status; +} + +/** + * i40iw_add_ipv6_addr - add ipv6 address to the hw arp table + * @iwdev: iwarp device + */ +static void i40iw_add_ipv6_addr(struct i40iw_device *iwdev) +{ + struct net_device *ip_dev; + struct inet6_dev *idev; + struct inet6_ifaddr *ifp, *tmp; + u32 local_ipaddr6[4]; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, ip_dev) { + if ((((rdma_vlan_dev_vlan_id(ip_dev) < 0xFFFF) && + (rdma_vlan_dev_real_dev(ip_dev) == iwdev->netdev)) || + (ip_dev == iwdev->netdev)) && (ip_dev->flags & IFF_UP)) { + idev = __in6_dev_get(ip_dev); + if (!idev) { + i40iw_pr_err("ipv6 inet device not found\n"); + break; + } + list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { + i40iw_pr_info("IP=%pI6, vlan_id=%d, MAC=%pM\n", &ifp->addr, + rdma_vlan_dev_vlan_id(ip_dev), ip_dev->dev_addr); + i40iw_copy_ip_ntohl(local_ipaddr6, + ifp->addr.in6_u.u6_addr32); + i40iw_manage_arp_cache(iwdev, + ip_dev->dev_addr, + local_ipaddr6, + false, + I40IW_ARP_ADD); + } + } + } + rcu_read_unlock(); +} + +/** + * i40iw_add_ipv4_addr - add ipv4 address to the hw arp table + * @iwdev: iwarp device + */ +static void i40iw_add_ipv4_addr(struct i40iw_device *iwdev) +{ + struct net_device *dev; + struct in_device *idev; + bool got_lock = true; + u32 ip_addr; + + if (!rtnl_trylock()) + got_lock = false; + + for_each_netdev(&init_net, dev) { + if ((((rdma_vlan_dev_vlan_id(dev) < 0xFFFF) && + (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) || + (dev == iwdev->netdev)) && (dev->flags & IFF_UP)) { + idev = in_dev_get(dev); + for_ifa(idev) { + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, + "IP=%pI4, vlan_id=%d, MAC=%pM\n", &ifa->ifa_address, + rdma_vlan_dev_vlan_id(dev), dev->dev_addr); + + ip_addr = ntohl(ifa->ifa_address); + i40iw_manage_arp_cache(iwdev, + dev->dev_addr, + &ip_addr, + true, + I40IW_ARP_ADD); + } + endfor_ifa(idev); + in_dev_put(idev); + } + } + if (got_lock) + rtnl_unlock(); +} + +/** + * i40iw_add_mac_ip - add mac and ip addresses + * @iwdev: iwarp device + * + * Create and add a mac ip address entry to the hw table and + * ipv4/ipv6 addresses to the arp cache + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_add_mac_ip(struct i40iw_device *iwdev) +{ + struct net_device *netdev = iwdev->netdev; + enum i40iw_status_code status; + + status = i40iw_alloc_set_mac_ipaddr(iwdev, (u8 *)netdev->dev_addr); + if (status) + return status; + i40iw_add_ipv4_addr(iwdev); + i40iw_add_ipv6_addr(iwdev); + return 0; +} + +/** + * i40iw_wait_pe_ready - Check if firmware is ready + * @hw: provides access to 
registers + */ +static void i40iw_wait_pe_ready(struct i40iw_hw *hw) +{ + u32 statusfw; + u32 statuscpu0; + u32 statuscpu1; + u32 statuscpu2; + u32 retrycount = 0; + + do { + statusfw = i40iw_rd32(hw, I40E_GLPE_FWLDSTATUS); + i40iw_pr_info("[%04d] fm load status[x%04X]\n", __LINE__, statusfw); + statuscpu0 = i40iw_rd32(hw, I40E_GLPE_CPUSTATUS0); + i40iw_pr_info("[%04d] CSR_CQP status[x%04X]\n", __LINE__, statuscpu0); + statuscpu1 = i40iw_rd32(hw, I40E_GLPE_CPUSTATUS1); + i40iw_pr_info("[%04d] I40E_GLPE_CPUSTATUS1 status[x%04X]\n", + __LINE__, statuscpu1); + statuscpu2 = i40iw_rd32(hw, I40E_GLPE_CPUSTATUS2); + i40iw_pr_info("[%04d] I40E_GLPE_CPUSTATUS2 status[x%04X]\n", + __LINE__, statuscpu2); + if ((statuscpu0 == 0x80) && (statuscpu1 == 0x80) && (statuscpu2 == 0x80)) + break; /* SUCCESS */ + msleep(1000); + retrycount++; + } while (retrycount < 14); + i40iw_wr32(hw, 0xb4040, 0x4C104C5); +} + +/** + * i40iw_initialize_dev - initialize device + * @iwdev: iwarp device + * @ldev: lan device information + * + * Allocate memory for the hmc objects and initialize iwdev + * Return 0 if successful, otherwise clean up the resources + * and return error + */ +static enum i40iw_status_code i40iw_initialize_dev(struct i40iw_device *iwdev, + struct i40e_info *ldev) +{ + enum i40iw_status_code status; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_device_init_info info; + struct i40iw_vsi_init_info vsi_info; + struct i40iw_dma_mem mem; + struct i40iw_l2params l2params; + u32 size; + struct i40iw_vsi_stats_info stats_info; + u16 last_qset = I40IW_NO_QSET; + u16 qset; + u32 i; + + memset(&l2params, 0, sizeof(l2params)); + memset(&info, 0, sizeof(info)); + size = sizeof(struct i40iw_hmc_pble_rsrc) + sizeof(struct i40iw_hmc_info) + + (sizeof(struct i40iw_hmc_obj_info) * I40IW_HMC_IW_MAX); + iwdev->hmc_info_mem = kzalloc(size, GFP_KERNEL); + if (!iwdev->hmc_info_mem) + return I40IW_ERR_NO_MEMORY; + + iwdev->pble_rsrc = (struct i40iw_hmc_pble_rsrc *)iwdev->hmc_info_mem; + dev->hmc_info = &iwdev->hw.hmc; + dev->hmc_info->hmc_obj = (struct i40iw_hmc_obj_info *)(iwdev->pble_rsrc + 1); + status = i40iw_obj_aligned_mem(iwdev, &mem, I40IW_QUERY_FPM_BUF_SIZE, + I40IW_FPM_QUERY_BUF_ALIGNMENT_MASK); + if (status) + goto error; + info.fpm_query_buf_pa = mem.pa; + info.fpm_query_buf = mem.va; + status = i40iw_obj_aligned_mem(iwdev, &mem, I40IW_COMMIT_FPM_BUF_SIZE, + I40IW_FPM_COMMIT_BUF_ALIGNMENT_MASK); + if (status) + goto error; + info.fpm_commit_buf_pa = mem.pa; + info.fpm_commit_buf = mem.va; + info.hmc_fn_id = ldev->fid; + info.is_pf = (ldev->ftype) ? false : true; + info.bar0 = ldev->hw_addr; + info.hw = &iwdev->hw; + info.debug_mask = debug; + l2params.mtu = + (ldev->params.mtu) ? 
ldev->params.mtu : I40IW_DEFAULT_MTU; + for (i = 0; i < I40E_CLIENT_MAX_USER_PRIORITY; i++) { + qset = ldev->params.qos.prio_qos[i].qs_handle; + l2params.qs_handle_list[i] = qset; + if (last_qset == I40IW_NO_QSET) + last_qset = qset; + else if ((qset != last_qset) && (qset != I40IW_NO_QSET)) + iwdev->dcb = true; + } + i40iw_pr_info("DCB is set/clear = %d\n", iwdev->dcb); + info.vchnl_send = i40iw_virtchnl_send; + status = i40iw_device_init(&iwdev->sc_dev, &info); + + if (status) + goto error; + memset(&vsi_info, 0, sizeof(vsi_info)); + vsi_info.dev = &iwdev->sc_dev; + vsi_info.back_vsi = (void *)iwdev; + vsi_info.params = &l2params; + vsi_info.exception_lan_queue = 1; + i40iw_sc_vsi_init(&iwdev->vsi, &vsi_info); + + if (dev->is_pf) { + memset(&stats_info, 0, sizeof(stats_info)); + stats_info.fcn_id = ldev->fid; + stats_info.pestat = kzalloc(sizeof(*stats_info.pestat), GFP_KERNEL); + if (!stats_info.pestat) { + status = I40IW_ERR_NO_MEMORY; + goto error; + } + stats_info.stats_initialize = true; + if (stats_info.pestat) + i40iw_vsi_stats_init(&iwdev->vsi, &stats_info); + } + return status; +error: + kfree(iwdev->hmc_info_mem); + iwdev->hmc_info_mem = NULL; + return status; +} + +/** + * i40iw_register_notifiers - register tcp ip notifiers + */ +static void i40iw_register_notifiers(void) +{ + register_inetaddr_notifier(&i40iw_inetaddr_notifier); + register_inet6addr_notifier(&i40iw_inetaddr6_notifier); + register_netevent_notifier(&i40iw_net_notifier); + register_netdevice_notifier(&i40iw_netdevice_notifier); +} + +/** + * i40iw_unregister_notifiers - unregister tcp ip notifiers + */ + +static void i40iw_unregister_notifiers(void) +{ + unregister_netevent_notifier(&i40iw_net_notifier); + unregister_inetaddr_notifier(&i40iw_inetaddr_notifier); + unregister_inet6addr_notifier(&i40iw_inetaddr6_notifier); + unregister_netdevice_notifier(&i40iw_netdevice_notifier); +} + +/** + * i40iw_save_msix_info - copy msix vector information to iwarp device + * @iwdev: iwarp device + * @ldev: lan device information + * + * Allocate iwdev msix table and copy the ldev msix info to the table + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_save_msix_info(struct i40iw_device *iwdev, + struct i40e_info *ldev) +{ + struct i40e_qvlist_info *iw_qvlist; + struct i40e_qv_info *iw_qvinfo; + u32 ceq_idx; + u32 i; + u32 size; + + if (!ldev->msix_count) { + i40iw_pr_err("No MSI-X vectors\n"); + return I40IW_ERR_CONFIG; + } + + iwdev->msix_count = ldev->msix_count; + + size = sizeof(struct i40iw_msix_vector) * iwdev->msix_count; + size += sizeof(struct i40e_qvlist_info); + size += sizeof(struct i40e_qv_info) * iwdev->msix_count - 1; + iwdev->iw_msixtbl = kzalloc(size, GFP_KERNEL); + + if (!iwdev->iw_msixtbl) + return I40IW_ERR_NO_MEMORY; + iwdev->iw_qvlist = (struct i40e_qvlist_info *)(&iwdev->iw_msixtbl[iwdev->msix_count]); + iw_qvlist = iwdev->iw_qvlist; + iw_qvinfo = iw_qvlist->qv_info; + iw_qvlist->num_vectors = iwdev->msix_count; + if (iwdev->msix_count <= num_online_cpus()) + iwdev->msix_shared = true; + for (i = 0, ceq_idx = 0; i < iwdev->msix_count; i++, iw_qvinfo++) { + iwdev->iw_msixtbl[i].idx = ldev->msix_entries[i].entry; + iwdev->iw_msixtbl[i].irq = ldev->msix_entries[i].vector; + iwdev->iw_msixtbl[i].cpu_affinity = ceq_idx; + if (i == 0) { + iw_qvinfo->aeq_idx = 0; + if (iwdev->msix_shared) + iw_qvinfo->ceq_idx = ceq_idx++; + else + iw_qvinfo->ceq_idx = I40E_QUEUE_INVALID_IDX; + } else { + iw_qvinfo->aeq_idx = I40E_QUEUE_INVALID_IDX; + iw_qvinfo->ceq_idx = 
ceq_idx++; + } + iw_qvinfo->itr_idx = 3; + iw_qvinfo->v_idx = iwdev->iw_msixtbl[i].idx; + } + return 0; +} + +/** + * i40iw_deinit_device - clean up the device resources + * @iwdev: iwarp device + * + * Destroy the ib device interface, remove the mac ip entry and ipv4/ipv6 addresses, + * destroy the device queues and free the pble and the hmc objects + */ +static void i40iw_deinit_device(struct i40iw_device *iwdev) +{ + struct i40e_info *ldev = iwdev->ldev; + + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + + i40iw_pr_info("state = %d\n", iwdev->init_state); + if (iwdev->param_wq) + destroy_workqueue(iwdev->param_wq); + + switch (iwdev->init_state) { + case RDMA_DEV_REGISTERED: + iwdev->iw_status = 0; + i40iw_port_ibevent(iwdev); + i40iw_destroy_rdma_device(iwdev->iwibdev); + /* fallthrough */ + case IP_ADDR_REGISTERED: + if (!iwdev->reset) + i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx); + /* fallthrough */ + /* fallthrough */ + case PBLE_CHUNK_MEM: + i40iw_destroy_pble_pool(dev, iwdev->pble_rsrc); + /* fallthrough */ + case CEQ_CREATED: + i40iw_dele_ceqs(iwdev); + /* fallthrough */ + case AEQ_CREATED: + i40iw_destroy_aeq(iwdev); + /* fallthrough */ + case IEQ_CREATED: + i40iw_puda_dele_resources(&iwdev->vsi, I40IW_PUDA_RSRC_TYPE_IEQ, iwdev->reset); + /* fallthrough */ + case ILQ_CREATED: + i40iw_puda_dele_resources(&iwdev->vsi, I40IW_PUDA_RSRC_TYPE_ILQ, iwdev->reset); + /* fallthrough */ + case CCQ_CREATED: + i40iw_destroy_ccq(iwdev); + /* fallthrough */ + case HMC_OBJS_CREATED: + i40iw_del_hmc_objects(dev, dev->hmc_info, true, iwdev->reset); + /* fallthrough */ + case CQP_CREATED: + i40iw_destroy_cqp(iwdev, true); + /* fallthrough */ + case INITIAL_STATE: + i40iw_cleanup_cm_core(&iwdev->cm_core); + if (iwdev->vsi.pestat) { + i40iw_vsi_stats_free(&iwdev->vsi); + kfree(iwdev->vsi.pestat); + } + i40iw_del_init_mem(iwdev); + break; + case INVALID_STATE: + /* fallthrough */ + default: + i40iw_pr_err("bad init_state = %d\n", iwdev->init_state); + break; + } + + i40iw_del_handler(i40iw_find_i40e_handler(ldev)); + kfree(iwdev->hdl); +} + +/** + * i40iw_setup_init_state - set up the initial device struct + * @hdl: handler for iwarp device - one per instance + * @ldev: lan device information + * @client: iwarp client information, provided during registration + * + * Initialize the iwarp device and its hdl information + * using the ldev and client information + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl, + struct i40e_info *ldev, + struct i40e_client *client) +{ + struct i40iw_device *iwdev = &hdl->device; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + enum i40iw_status_code status; + + memcpy(&hdl->ldev, ldev, sizeof(*ldev)); + + iwdev->mpa_version = mpa_version; + iwdev->resource_profile = (resource_profile < I40IW_HMC_PROFILE_EQUAL) ? + (u8)resource_profile + I40IW_HMC_PROFILE_DEFAULT : + I40IW_HMC_PROFILE_DEFAULT; + iwdev->max_rdma_vfs = + (iwdev->resource_profile != I40IW_HMC_PROFILE_DEFAULT) ? 
max_rdma_vfs : 0; + iwdev->max_enabled_vfs = iwdev->max_rdma_vfs; + iwdev->netdev = ldev->netdev; + hdl->client = client; + if (!ldev->ftype) + iwdev->db_start = pci_resource_start(ldev->pcidev, 0) + I40IW_DB_ADDR_OFFSET; + else + iwdev->db_start = pci_resource_start(ldev->pcidev, 0) + I40IW_VF_DB_ADDR_OFFSET; + + status = i40iw_save_msix_info(iwdev, ldev); + if (status) + return status; + iwdev->hw.dev_context = (void *)ldev->pcidev; + iwdev->hw.hw_addr = ldev->hw_addr; + status = i40iw_allocate_dma_mem(&iwdev->hw, + &iwdev->obj_mem, 8192, 4096); + if (status) + goto exit; + iwdev->obj_next = iwdev->obj_mem; + iwdev->push_mode = push_mode; + + init_waitqueue_head(&iwdev->vchnl_waitq); + init_waitqueue_head(&dev->vf_reqs); + init_waitqueue_head(&iwdev->close_wq); + + status = i40iw_initialize_dev(iwdev, ldev); +exit: + if (status) { + kfree(iwdev->iw_msixtbl); + i40iw_free_dma_mem(dev->hw, &iwdev->obj_mem); + iwdev->iw_msixtbl = NULL; + } + return status; +} + +/** + * i40iw_get_used_rsrc - determine resources used internally + * @iwdev: iwarp device + * + * Called after internal allocations + */ +static void i40iw_get_used_rsrc(struct i40iw_device *iwdev) +{ + iwdev->used_pds = find_next_zero_bit(iwdev->allocated_pds, iwdev->max_pd, 0); + iwdev->used_qps = find_next_zero_bit(iwdev->allocated_qps, iwdev->max_qp, 0); + iwdev->used_cqs = find_next_zero_bit(iwdev->allocated_cqs, iwdev->max_cq, 0); + iwdev->used_mrs = find_next_zero_bit(iwdev->allocated_mrs, iwdev->max_mr, 0); +} + +/** + * i40iw_open - client interface operation open for iwarp/uda device + * @ldev: lan device information + * @client: iwarp client information, provided during registration + * + * Called by the lan driver during the processing of client register + * Create device resources, set up queues, pble and hmc objects and + * register the device with the ib verbs interface + * Return 0 if successful, otherwise return error + */ +static int i40iw_open(struct i40e_info *ldev, struct i40e_client *client) +{ + struct i40iw_device *iwdev; + struct i40iw_sc_dev *dev; + enum i40iw_status_code status; + struct i40iw_handler *hdl; + + hdl = i40iw_find_netdev(ldev->netdev); + if (hdl) + return 0; + + hdl = kzalloc(sizeof(*hdl), GFP_KERNEL); + if (!hdl) + return -ENOMEM; + iwdev = &hdl->device; + iwdev->hdl = hdl; + dev = &iwdev->sc_dev; + i40iw_setup_cm_core(iwdev); + + dev->back_dev = (void *)iwdev; + iwdev->ldev = &hdl->ldev; + iwdev->client = client; + mutex_init(&iwdev->pbl_mutex); + i40iw_add_handler(hdl); + + do { + status = i40iw_setup_init_state(hdl, ldev, client); + if (status) + break; + iwdev->init_state = INITIAL_STATE; + if (dev->is_pf) + i40iw_wait_pe_ready(dev->hw); + status = i40iw_create_cqp(iwdev); + if (status) + break; + iwdev->init_state = CQP_CREATED; + status = i40iw_hmc_setup(iwdev); + if (status) + break; + status = i40iw_create_ccq(iwdev); + if (status) + break; + iwdev->init_state = CCQ_CREATED; + status = i40iw_initialize_ilq(iwdev); + if (status) + break; + iwdev->init_state = ILQ_CREATED; + status = i40iw_initialize_ieq(iwdev); + if (status) + break; + iwdev->init_state = IEQ_CREATED; + status = i40iw_setup_aeq(iwdev); + if (status) + break; + iwdev->init_state = AEQ_CREATED; + status = i40iw_setup_ceqs(iwdev, ldev); + if (status) + break; + iwdev->init_state = CEQ_CREATED; + status = i40iw_initialize_hw_resources(iwdev); + if (status) + break; + i40iw_get_used_rsrc(iwdev); + dev->ccq_ops->ccq_arm(dev->ccq); + status = i40iw_hmc_init_pble(&iwdev->sc_dev, iwdev->pble_rsrc); + if (status) + break; + 
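
The open path above relies on a staged-bringup idiom: each successful step in i40iw_open() advances init_state, and i40iw_deinit_device() later switches on the last state reached, falling through so completed stages are undone in reverse order. A stripped-down, illustrative-only sketch of that idiom (the states and step helpers here are hypothetical, not the driver's):

enum demo_state {
	DEMO_INVALID = 0,
	DEMO_INITIAL,
	DEMO_CQP_CREATED,
	DEMO_CCQ_CREATED,
};

struct demo_dev {
	enum demo_state state;
};

static int demo_step_a(struct demo_dev *d) { return 0; }
static int demo_step_b(struct demo_dev *d) { return 0; }
static int demo_step_c(struct demo_dev *d) { return 0; }

static void demo_teardown(struct demo_dev *d)
{
	switch (d->state) {
	case DEMO_CCQ_CREATED:
		/* undo step c */
		/* fall through */
	case DEMO_CQP_CREATED:
		/* undo step b */
		/* fall through */
	case DEMO_INITIAL:
		/* undo step a */
		break;
	default:
		break;
	}
}

static int demo_open(struct demo_dev *d)
{
	int err;

	do {
		err = demo_step_a(d);
		if (err)
			break;
		d->state = DEMO_INITIAL;

		err = demo_step_b(d);
		if (err)
			break;
		d->state = DEMO_CQP_CREATED;

		err = demo_step_c(d);
		if (err)
			break;
		d->state = DEMO_CCQ_CREATED;
		return 0;
	} while (0);

	demo_teardown(d);
	return err;
}
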
iwdev->init_state = PBLE_CHUNK_MEM; + iwdev->virtchnl_wq = alloc_ordered_workqueue("iwvch", WQ_MEM_RECLAIM); + status = i40iw_add_mac_ip(iwdev); + if (status) + break; + iwdev->init_state = IP_ADDR_REGISTERED; + if (i40iw_register_rdma_device(iwdev)) { + i40iw_pr_err("register rdma device fail\n"); + break; + }; + + iwdev->init_state = RDMA_DEV_REGISTERED; + iwdev->iw_status = 1; + i40iw_port_ibevent(iwdev); + iwdev->param_wq = alloc_ordered_workqueue("l2params", WQ_MEM_RECLAIM); + if(iwdev->param_wq == NULL) + break; + i40iw_pr_info("i40iw_open completed\n"); + return 0; + } while (0); + + i40iw_pr_err("status = %d last completion = %d\n", status, iwdev->init_state); + i40iw_deinit_device(iwdev); + return -ERESTART; +} + +/** + * i40iw_l2params_worker - worker for l2 params change + * @work: work pointer for l2 params + */ +static void i40iw_l2params_worker(struct work_struct *work) +{ + struct l2params_work *dwork = + container_of(work, struct l2params_work, work); + struct i40iw_device *iwdev = dwork->iwdev; + + i40iw_change_l2params(&iwdev->vsi, &dwork->l2params); + atomic_dec(&iwdev->params_busy); + kfree(work); +} + +/** + * i40iw_l2param_change - handle qs handles for qos and mss change + * @ldev: lan device information + * @client: client for paramater change + * @params: new parameters from L2 + */ +static void i40iw_l2param_change(struct i40e_info *ldev, struct i40e_client *client, + struct i40e_params *params) +{ + struct i40iw_handler *hdl; + struct i40iw_l2params *l2params; + struct l2params_work *work; + struct i40iw_device *iwdev; + int i; + + hdl = i40iw_find_i40e_handler(ldev); + if (!hdl) + return; + + iwdev = &hdl->device; + + if (atomic_read(&iwdev->params_busy)) + return; + + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return; + + atomic_inc(&iwdev->params_busy); + + work->iwdev = iwdev; + l2params = &work->l2params; + for (i = 0; i < I40E_CLIENT_MAX_USER_PRIORITY; i++) + l2params->qs_handle_list[i] = params->qos.prio_qos[i].qs_handle; + + l2params->mtu = (params->mtu) ? 
params->mtu : iwdev->vsi.mtu; + + INIT_WORK(&work->work, i40iw_l2params_worker); + queue_work(iwdev->param_wq, &work->work); +} + +/** + * i40iw_close - client interface operation close for iwarp/uda device + * @ldev: lan device information + * @client: client to close + * + * Called by the lan driver during the processing of client unregister + * Destroy and clean up the driver resources + */ +static void i40iw_close(struct i40e_info *ldev, struct i40e_client *client, bool reset) +{ + struct i40iw_device *iwdev; + struct i40iw_handler *hdl; + + hdl = i40iw_find_i40e_handler(ldev); + if (!hdl) + return; + + iwdev = &hdl->device; + iwdev->closing = true; + + if (reset) + iwdev->reset = true; + + i40iw_cm_teardown_connections(iwdev, NULL, NULL, true); + destroy_workqueue(iwdev->virtchnl_wq); + i40iw_deinit_device(iwdev); +} + +/** + * i40iw_vf_reset - process VF reset + * @ldev: lan device information + * @client: client interface instance + * @vf_id: virtual function id + * + * Called when a VF is reset by the PF + * Destroy and clean up the VF resources + */ +static void i40iw_vf_reset(struct i40e_info *ldev, struct i40e_client *client, u32 vf_id) +{ + struct i40iw_handler *hdl; + struct i40iw_sc_dev *dev; + struct i40iw_hmc_fcn_info hmc_fcn_info; + struct i40iw_virt_mem vf_dev_mem; + struct i40iw_vfdev *tmp_vfdev; + unsigned int i; + unsigned long flags; + struct i40iw_device *iwdev; + + hdl = i40iw_find_i40e_handler(ldev); + if (!hdl) + return; + + dev = &hdl->device.sc_dev; + iwdev = (struct i40iw_device *)dev->back_dev; + + for (i = 0; i < I40IW_MAX_PE_ENABLED_VF_COUNT; i++) { + if (!dev->vf_dev[i] || (dev->vf_dev[i]->vf_id != vf_id)) + continue; + /* free all resources allocated on behalf of vf */ + tmp_vfdev = dev->vf_dev[i]; + spin_lock_irqsave(&iwdev->vsi.pestat->lock, flags); + dev->vf_dev[i] = NULL; + spin_unlock_irqrestore(&iwdev->vsi.pestat->lock, flags); + i40iw_del_hmc_objects(dev, &tmp_vfdev->hmc_info, false, false); + /* remove vf hmc function */ + memset(&hmc_fcn_info, 0, sizeof(hmc_fcn_info)); + hmc_fcn_info.vf_id = vf_id; + hmc_fcn_info.iw_vf_idx = tmp_vfdev->iw_vf_idx; + hmc_fcn_info.free_fcn = true; + i40iw_cqp_manage_hmc_fcn_cmd(dev, &hmc_fcn_info); + /* free vf_dev */ + vf_dev_mem.va = tmp_vfdev; + vf_dev_mem.size = sizeof(struct i40iw_vfdev) + + sizeof(struct i40iw_hmc_obj_info) * I40IW_HMC_IW_MAX; + i40iw_free_virt_mem(dev->hw, &vf_dev_mem); + break; + } +} + +/** + * i40iw_vf_enable - enable a number of VFs + * @ldev: lan device information + * @client: client interface instance + * @num_vfs: number of VFs for the PF + * + * Called when the number of VFs changes + */ +static void i40iw_vf_enable(struct i40e_info *ldev, + struct i40e_client *client, + u32 num_vfs) +{ + struct i40iw_handler *hdl; + + hdl = i40iw_find_i40e_handler(ldev); + if (!hdl) + return; + + if (num_vfs > I40IW_MAX_PE_ENABLED_VF_COUNT) + hdl->device.max_enabled_vfs = I40IW_MAX_PE_ENABLED_VF_COUNT; + else + hdl->device.max_enabled_vfs = num_vfs; +} + +/** + * i40iw_vf_capable - check if VF capable + * @ldev: lan device information + * @client: client interface instance + * @vf_id: virtual function id + * + * Return 1 if a VF slot is available or if VF is already RDMA enabled + * Return 0 otherwise + */ +static int i40iw_vf_capable(struct i40e_info *ldev, + struct i40e_client *client, + u32 vf_id) +{ + struct i40iw_handler *hdl; + struct i40iw_sc_dev *dev; + unsigned int i; + + hdl = i40iw_find_i40e_handler(ldev); + if (!hdl) + return 0; + + dev = &hdl->device.sc_dev; + + for (i = 0; i < 
hdl->device.max_enabled_vfs; i++) { + if (!dev->vf_dev[i] || (dev->vf_dev[i]->vf_id == vf_id)) + return 1; + } + + return 0; +} + +/** + * i40iw_virtchnl_receive - receive a message through the virtual channel + * @ldev: lan device information + * @client: client interface instance + * @vf_id: virtual function id associated with the message + * @msg: message buffer pointer + * @len: length of the message + * + * Invoke virtual channel receive operation for the given msg + * Return 0 if successful, otherwise return error + */ +static int i40iw_virtchnl_receive(struct i40e_info *ldev, + struct i40e_client *client, + u32 vf_id, + u8 *msg, + u16 len) +{ + struct i40iw_handler *hdl; + struct i40iw_sc_dev *dev; + struct i40iw_device *iwdev; + int ret_code = I40IW_NOT_SUPPORTED; + + if (!len || !msg) + return I40IW_ERR_PARAM; + + hdl = i40iw_find_i40e_handler(ldev); + if (!hdl) + return I40IW_ERR_PARAM; + + dev = &hdl->device.sc_dev; + iwdev = dev->back_dev; + + if (dev->vchnl_if.vchnl_recv) { + ret_code = dev->vchnl_if.vchnl_recv(dev, vf_id, msg, len); + if (!dev->is_pf) { + atomic_dec(&iwdev->vchnl_msgs); + wake_up(&iwdev->vchnl_waitq); + } + } + return ret_code; +} + +/** + * i40iw_vf_clear_to_send - wait to send virtual channel message + * @dev: iwarp device * + * Wait for until virtual channel is clear + * before sending the next message + * + * Returns false if error + * Returns true if clear to send + */ +bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev) +{ + struct i40iw_device *iwdev; + wait_queue_entry_t wait; + + iwdev = dev->back_dev; + + if (!wq_has_sleeper(&dev->vf_reqs) && + (atomic_read(&iwdev->vchnl_msgs) == 0)) + return true; /* virtual channel is clear */ + + init_wait(&wait); + add_wait_queue_exclusive(&dev->vf_reqs, &wait); + + if (!wait_event_timeout(dev->vf_reqs, + (atomic_read(&iwdev->vchnl_msgs) == 0), + I40IW_VCHNL_EVENT_TIMEOUT)) + dev->vchnl_up = false; + + remove_wait_queue(&dev->vf_reqs, &wait); + + return dev->vchnl_up; +} + +/** + * i40iw_virtchnl_send - send a message through the virtual channel + * @dev: iwarp device + * @vf_id: virtual function id associated with the message + * @msg: virtual channel message buffer pointer + * @len: length of the message + * + * Invoke virtual channel send operation for the given msg + * Return 0 if successful, otherwise return error + */ +static enum i40iw_status_code i40iw_virtchnl_send(struct i40iw_sc_dev *dev, + u32 vf_id, + u8 *msg, + u16 len) +{ + struct i40iw_device *iwdev; + struct i40e_info *ldev; + + if (!dev || !dev->back_dev) + return I40IW_ERR_BAD_PTR; + + iwdev = dev->back_dev; + ldev = iwdev->ldev; + + if (ldev && ldev->ops && ldev->ops->virtchnl_send) + return ldev->ops->virtchnl_send(ldev, &i40iw_client, vf_id, msg, len); + return I40IW_ERR_BAD_PTR; +} + +/* client interface functions */ +static const struct i40e_client_ops i40e_ops = { + .open = i40iw_open, + .close = i40iw_close, + .l2_param_change = i40iw_l2param_change, + .virtchnl_receive = i40iw_virtchnl_receive, + .vf_reset = i40iw_vf_reset, + .vf_enable = i40iw_vf_enable, + .vf_capable = i40iw_vf_capable +}; + +/** + * i40iw_init_module - driver initialization function + * + * First function to call when the driver is loaded + * Register the driver as i40e client and port mapper client + */ +static int __init i40iw_init_module(void) +{ + int ret; + + memset(&i40iw_client, 0, sizeof(i40iw_client)); + i40iw_client.version.major = CLIENT_IW_INTERFACE_VERSION_MAJOR; + i40iw_client.version.minor = CLIENT_IW_INTERFACE_VERSION_MINOR; + 
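
The gating logic in i40iw_vf_clear_to_send() above boils down to waiting for the outstanding virtual-channel message count to drain before allowing the next send. A simpler sketch of the same idea using the common wait_event_timeout() form; the "demo_" structure and the 2-second timeout are illustrative assumptions, not values from the driver:

#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/jiffies.h>

struct demo_chan {
	wait_queue_head_t reqs;		/* init_waitqueue_head() before use */
	atomic_t msgs;			/* in-flight messages */
	bool up;			/* channel still operational */
};

static bool demo_clear_to_send(struct demo_chan *ch)
{
	/* returns 0 only if the timeout expired with messages still pending */
	if (!wait_event_timeout(ch->reqs,
				atomic_read(&ch->msgs) == 0,
				msecs_to_jiffies(2000)))
		ch->up = false;	/* treat a timed-out channel as down */

	return ch->up;
}
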
i40iw_client.version.build = CLIENT_IW_INTERFACE_VERSION_BUILD; + i40iw_client.ops = &i40e_ops; + memcpy(i40iw_client.name, i40iw_client_name, I40E_CLIENT_STR_LENGTH); + i40iw_client.type = I40E_CLIENT_IWARP; + spin_lock_init(&i40iw_handler_lock); + ret = i40e_register_client(&i40iw_client); + i40iw_register_notifiers(); + + return ret; +} + +/** + * i40iw_exit_module - driver exit clean up function + * + * The function is called just before the driver is unloaded + * Unregister the driver as i40e client and port mapper client + */ +static void __exit i40iw_exit_module(void) +{ + i40iw_unregister_notifiers(); + i40e_unregister_client(&i40iw_client); +} + +module_init(i40iw_init_module); +module_exit(i40iw_exit_module); diff --git a/drivers/infiniband/hw/i40iw/i40iw_osdep.h b/drivers/infiniband/hw/i40iw/i40iw_osdep.h new file mode 100644 index 0000000..f27be3e --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_osdep.h @@ -0,0 +1,217 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +*******************************************************************************/ + +#ifndef I40IW_OSDEP_H +#define I40IW_OSDEP_H + +#include +#include +#include +#include +#include +/* get readq/writeq support for 32 bit kernels, use the low-first version */ +#include + +#define STATS_TIMER_DELAY 1000 + +static inline void set_64bit_val(u64 *wqe_words, u32 byte_index, u64 value) +{ + wqe_words[byte_index >> 3] = value; +} + +/** + * set_32bit_val - set 32 value to hw wqe + * @wqe_words: wqe addr to write + * @byte_index: index in wqe + * @value: value to write + **/ +static inline void set_32bit_val(u32 *wqe_words, u32 byte_index, u32 value) +{ + wqe_words[byte_index >> 2] = value; +} + +/** + * get_64bit_val - read 64 bit value from wqe + * @wqe_words: wqe addr + * @byte_index: index to read from + * @value: read value + **/ +static inline void get_64bit_val(u64 *wqe_words, u32 byte_index, u64 *value) +{ + *value = wqe_words[byte_index >> 3]; +} + +/** + * get_32bit_val - read 32 bit value from wqe + * @wqe_words: wqe addr + * @byte_index: index to reaad from + * @value: return 32 bit value + **/ +static inline void get_32bit_val(u32 *wqe_words, u32 byte_index, u32 *value) +{ + *value = wqe_words[byte_index >> 2]; +} + +struct i40iw_dma_mem { + void *va; + dma_addr_t pa; + u32 size; +} __packed; + +struct i40iw_virt_mem { + void *va; + u32 size; +} __packed; + +#define i40iw_debug(h, m, s, ...) \ +do { \ + if (((m) & (h)->debug_mask)) \ + pr_info("i40iw " s, ##__VA_ARGS__); \ +} while (0) + +#define i40iw_flush(a) readl((a)->hw_addr + I40E_GLGEN_STAT) + +#define I40E_GLHMC_VFSDCMD(_i) (0x000C8000 + ((_i) * 4)) \ + /* _i=0...31 */ +#define I40E_GLHMC_VFSDCMD_MAX_INDEX 31 +#define I40E_GLHMC_VFSDCMD_PMSDIDX_SHIFT 0 +#define I40E_GLHMC_VFSDCMD_PMSDIDX_MASK (0xFFF \ + << I40E_GLHMC_VFSDCMD_PMSDIDX_SHIFT) +#define I40E_GLHMC_VFSDCMD_PF_SHIFT 16 +#define I40E_GLHMC_VFSDCMD_PF_MASK (0xF << I40E_GLHMC_VFSDCMD_PF_SHIFT) +#define I40E_GLHMC_VFSDCMD_VF_SHIFT 20 +#define I40E_GLHMC_VFSDCMD_VF_MASK (0x1FF << I40E_GLHMC_VFSDCMD_VF_SHIFT) +#define I40E_GLHMC_VFSDCMD_PMF_TYPE_SHIFT 29 +#define I40E_GLHMC_VFSDCMD_PMF_TYPE_MASK (0x3 \ + << I40E_GLHMC_VFSDCMD_PMF_TYPE_SHIFT) +#define I40E_GLHMC_VFSDCMD_PMSDWR_SHIFT 31 +#define I40E_GLHMC_VFSDCMD_PMSDWR_MASK (0x1 << I40E_GLHMC_VFSDCMD_PMSDWR_SHIFT) + +#define I40E_GLHMC_VFSDDATAHIGH(_i) (0x000C8200 + ((_i) * 4)) \ + /* _i=0...31 */ +#define I40E_GLHMC_VFSDDATAHIGH_MAX_INDEX 31 +#define I40E_GLHMC_VFSDDATAHIGH_PMSDDATAHIGH_SHIFT 0 +#define I40E_GLHMC_VFSDDATAHIGH_PMSDDATAHIGH_MASK (0xFFFFFFFF \ + << I40E_GLHMC_VFSDDATAHIGH_PMSDDATAHIGH_SHIFT) + +#define I40E_GLHMC_VFSDDATALOW(_i) (0x000C8100 + ((_i) * 4)) \ + /* _i=0...31 */ +#define I40E_GLHMC_VFSDDATALOW_MAX_INDEX 31 +#define I40E_GLHMC_VFSDDATALOW_PMSDVALID_SHIFT 0 +#define I40E_GLHMC_VFSDDATALOW_PMSDVALID_MASK (0x1 \ + << I40E_GLHMC_VFSDDATALOW_PMSDVALID_SHIFT) +#define I40E_GLHMC_VFSDDATALOW_PMSDTYPE_SHIFT 1 +#define I40E_GLHMC_VFSDDATALOW_PMSDTYPE_MASK (0x1 \ + << I40E_GLHMC_VFSDDATALOW_PMSDTYPE_SHIFT) +#define I40E_GLHMC_VFSDDATALOW_PMSDBPCOUNT_SHIFT 2 +#define I40E_GLHMC_VFSDDATALOW_PMSDBPCOUNT_MASK (0x3FF \ + << I40E_GLHMC_VFSDDATALOW_PMSDBPCOUNT_SHIFT) +#define I40E_GLHMC_VFSDDATALOW_PMSDDATALOW_SHIFT 12 +#define I40E_GLHMC_VFSDDATALOW_PMSDDATALOW_MASK (0xFFFFF \ + << I40E_GLHMC_VFSDDATALOW_PMSDDATALOW_SHIFT) + +#define I40E_GLPE_FWLDSTATUS 0x0000D200 +#define I40E_GLPE_FWLDSTATUS_LOAD_REQUESTED_SHIFT 0 +#define I40E_GLPE_FWLDSTATUS_LOAD_REQUESTED_MASK (0x1 \ + << 
I40E_GLPE_FWLDSTATUS_LOAD_REQUESTED_SHIFT) +#define I40E_GLPE_FWLDSTATUS_DONE_SHIFT 1 +#define I40E_GLPE_FWLDSTATUS_DONE_MASK (0x1 << I40E_GLPE_FWLDSTATUS_DONE_SHIFT) +#define I40E_GLPE_FWLDSTATUS_CQP_FAIL_SHIFT 2 +#define I40E_GLPE_FWLDSTATUS_CQP_FAIL_MASK (0x1 \ + << I40E_GLPE_FWLDSTATUS_CQP_FAIL_SHIFT) +#define I40E_GLPE_FWLDSTATUS_TEP_FAIL_SHIFT 3 +#define I40E_GLPE_FWLDSTATUS_TEP_FAIL_MASK (0x1 \ + << I40E_GLPE_FWLDSTATUS_TEP_FAIL_SHIFT) +#define I40E_GLPE_FWLDSTATUS_OOP_FAIL_SHIFT 4 +#define I40E_GLPE_FWLDSTATUS_OOP_FAIL_MASK (0x1 \ + << I40E_GLPE_FWLDSTATUS_OOP_FAIL_SHIFT) + +struct i40iw_sc_dev; +struct i40iw_sc_qp; +struct i40iw_puda_buf; +struct i40iw_puda_completion_info; +struct i40iw_update_sds_info; +struct i40iw_hmc_fcn_info; +struct i40iw_virtchnl_work_info; +struct i40iw_manage_vf_pble_info; +struct i40iw_device; +struct i40iw_hmc_info; +struct i40iw_hw; + +u8 __iomem *i40iw_get_hw_addr(void *dev); +void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp); +enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev); +bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev); +enum i40iw_status_code i40iw_ieq_check_mpacrc(struct shash_desc *desc, void *addr, + u32 length, u32 value); +struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev, struct i40iw_puda_buf *buf); +void i40iw_ieq_update_tcpip_info(struct i40iw_puda_buf *buf, u16 length, u32 seqnum); +void i40iw_free_hash_desc(struct shash_desc *); +enum i40iw_status_code i40iw_init_hash_desc(struct shash_desc **); +enum i40iw_status_code i40iw_puda_get_tcpip_info(struct i40iw_puda_completion_info *info, + struct i40iw_puda_buf *buf); +enum i40iw_status_code i40iw_cqp_sds_cmd(struct i40iw_sc_dev *dev, + struct i40iw_update_sds_info *info); +enum i40iw_status_code i40iw_cqp_manage_hmc_fcn_cmd(struct i40iw_sc_dev *dev, + struct i40iw_hmc_fcn_info *hmcfcninfo); +enum i40iw_status_code i40iw_cqp_query_fpm_values_cmd(struct i40iw_sc_dev *dev, + struct i40iw_dma_mem *values_mem, + u8 hmc_fn_id); +enum i40iw_status_code i40iw_cqp_commit_fpm_values_cmd(struct i40iw_sc_dev *dev, + struct i40iw_dma_mem *values_mem, + u8 hmc_fn_id); +enum i40iw_status_code i40iw_alloc_query_fpm_buf(struct i40iw_sc_dev *dev, + struct i40iw_dma_mem *mem); +enum i40iw_status_code i40iw_cqp_manage_vf_pble_bp(struct i40iw_sc_dev *dev, + struct i40iw_manage_vf_pble_info *info); +void i40iw_cqp_spawn_worker(struct i40iw_sc_dev *dev, + struct i40iw_virtchnl_work_info *work_info, u32 iw_vf_idx); +void *i40iw_remove_head(struct list_head *list); +void i40iw_qp_suspend_resume(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp, bool suspend); + +void i40iw_term_modify_qp(struct i40iw_sc_qp *qp, u8 next_state, u8 term, u8 term_len); +void i40iw_terminate_done(struct i40iw_sc_qp *qp, int timeout_occurred); +void i40iw_terminate_start_timer(struct i40iw_sc_qp *qp); +void i40iw_terminate_del_timer(struct i40iw_sc_qp *qp); + +enum i40iw_status_code i40iw_hw_manage_vf_pble_bp(struct i40iw_device *iwdev, + struct i40iw_manage_vf_pble_info *info, + bool wait); +struct i40iw_sc_vsi; +void i40iw_hw_stats_start_timer(struct i40iw_sc_vsi *vsi); +void i40iw_hw_stats_stop_timer(struct i40iw_sc_vsi *vsi); +#define i40iw_mmiowb() mmiowb() +void i40iw_wr32(struct i40iw_hw *hw, u32 reg, u32 value); +u32 i40iw_rd32(struct i40iw_hw *hw, u32 reg); +#endif /* _I40IW_OSDEP_H_ */ diff --git a/drivers/infiniband/hw/i40iw/i40iw_p.h b/drivers/infiniband/hw/i40iw/i40iw_p.h new file mode 100644 index 0000000..11d3a2a --- /dev/null +++ 
b/drivers/infiniband/hw/i40iw/i40iw_p.h @@ -0,0 +1,128 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_P_H +#define I40IW_P_H + +#define PAUSE_TIMER_VALUE 0xFFFF +#define REFRESH_THRESHOLD 0x7FFF +#define HIGH_THRESHOLD 0x800 +#define LOW_THRESHOLD 0x200 +#define ALL_TC2PFC 0xFF +#define CQP_COMPL_WAIT_TIME 0x3E8 +#define CQP_TIMEOUT_THRESHOLD 5 + +void i40iw_debug_buf(struct i40iw_sc_dev *dev, enum i40iw_debug_flag mask, + char *desc, u64 *buf, u32 size); +/* init operations */ +enum i40iw_status_code i40iw_device_init(struct i40iw_sc_dev *dev, + struct i40iw_device_init_info *info); + +void i40iw_sc_cqp_post_sq(struct i40iw_sc_cqp *cqp); + +u64 *i40iw_sc_cqp_get_next_send_wqe(struct i40iw_sc_cqp *cqp, u64 scratch); + +void i40iw_check_cqp_progress(struct i40iw_cqp_timeout *cqp_timeout, struct i40iw_sc_dev *dev); + +enum i40iw_status_code i40iw_sc_mr_fast_register(struct i40iw_sc_qp *qp, + struct i40iw_fast_reg_stag_info *info, + bool post_sq); + +void i40iw_insert_wqe_hdr(u64 *wqe, u64 header); + +/* HMC/FPM functions */ +enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev, + u8 hmc_fn_id); + +enum i40iw_status_code i40iw_pf_init_vfhmc(struct i40iw_sc_dev *dev, u8 vf_hmc_fn_id, + u32 *vf_cnt_array); + +/* stats functions */ +void i40iw_hw_stats_refresh_all(struct i40iw_vsi_pestat *stats); +void i40iw_hw_stats_read_all(struct i40iw_vsi_pestat *stats, struct i40iw_dev_hw_stats *stats_values); +void i40iw_hw_stats_read_32(struct i40iw_vsi_pestat *stats, + enum i40iw_hw_stats_index_32b index, + u64 *value); +void i40iw_hw_stats_read_64(struct i40iw_vsi_pestat *stats, + enum i40iw_hw_stats_index_64b index, + u64 *value); +void i40iw_hw_stats_init(struct i40iw_vsi_pestat *stats, u8 index, bool is_pf); + +/* vsi misc functions */ +enum i40iw_status_code i40iw_vsi_stats_init(struct i40iw_sc_vsi *vsi, struct i40iw_vsi_stats_info *info); +void i40iw_vsi_stats_free(struct i40iw_sc_vsi *vsi); +void i40iw_sc_vsi_init(struct i40iw_sc_vsi *vsi, struct i40iw_vsi_init_info *info); + +void 
i40iw_change_l2params(struct i40iw_sc_vsi *vsi, struct i40iw_l2params *l2params); +void i40iw_qp_add_qos(struct i40iw_sc_qp *qp); +void i40iw_qp_rem_qos(struct i40iw_sc_qp *qp); +void i40iw_terminate_send_fin(struct i40iw_sc_qp *qp); + +void i40iw_terminate_connection(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info); + +void i40iw_terminate_received(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info); + +enum i40iw_status_code i40iw_sc_suspend_qp(struct i40iw_sc_cqp *cqp, + struct i40iw_sc_qp *qp, u64 scratch); + +enum i40iw_status_code i40iw_sc_resume_qp(struct i40iw_sc_cqp *cqp, + struct i40iw_sc_qp *qp, u64 scratch); + +enum i40iw_status_code i40iw_sc_static_hmc_pages_allocated(struct i40iw_sc_cqp *cqp, + u64 scratch, u8 hmc_fn_id, + bool post_sq, + bool poll_registers); + +enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_count); + +void free_sd_mem(struct i40iw_sc_dev *dev); + +enum i40iw_status_code i40iw_process_cqp_cmd(struct i40iw_sc_dev *dev, + struct cqp_commands_info *pcmdinfo); + +enum i40iw_status_code i40iw_process_bh(struct i40iw_sc_dev *dev); + +/* prototype for functions used for dynamic memory allocation */ +enum i40iw_status_code i40iw_allocate_dma_mem(struct i40iw_hw *hw, + struct i40iw_dma_mem *mem, u64 size, + u32 alignment); +void i40iw_free_dma_mem(struct i40iw_hw *hw, struct i40iw_dma_mem *mem); +enum i40iw_status_code i40iw_allocate_virt_mem(struct i40iw_hw *hw, + struct i40iw_virt_mem *mem, u32 size); +enum i40iw_status_code i40iw_free_virt_mem(struct i40iw_hw *hw, + struct i40iw_virt_mem *mem); +u8 i40iw_get_encoded_wqe_size(u32 wqsize, bool cqpsq); +void i40iw_reinitialize_ieq(struct i40iw_sc_dev *dev); + +#endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.c b/drivers/infiniband/hw/i40iw/i40iw_pble.c new file mode 100644 index 0000000..540aab5 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_pble.c @@ -0,0 +1,612 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +*******************************************************************************/ + +#include "i40iw_status.h" +#include "i40iw_osdep.h" +#include "i40iw_register.h" +#include "i40iw_hmc.h" + +#include "i40iw_d.h" +#include "i40iw_type.h" +#include "i40iw_p.h" + +#include +#include +#include +#include "i40iw_pble.h" +#include "i40iw.h" + +struct i40iw_device; +static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev, + struct i40iw_hmc_pble_rsrc *pble_rsrc); +static void i40iw_free_vmalloc_mem(struct i40iw_hw *hw, struct i40iw_chunk *chunk); + +/** + * i40iw_destroy_pble_pool - destroy pool during module unload + * @pble_rsrc: pble resources + */ +void i40iw_destroy_pble_pool(struct i40iw_sc_dev *dev, struct i40iw_hmc_pble_rsrc *pble_rsrc) +{ + struct list_head *clist; + struct list_head *tlist; + struct i40iw_chunk *chunk; + struct i40iw_pble_pool *pinfo = &pble_rsrc->pinfo; + + if (pinfo->pool) { + list_for_each_safe(clist, tlist, &pinfo->clist) { + chunk = list_entry(clist, struct i40iw_chunk, list); + if (chunk->type == I40IW_VMALLOC) + i40iw_free_vmalloc_mem(dev->hw, chunk); + kfree(chunk); + } + gen_pool_destroy(pinfo->pool); + } +} + +/** + * i40iw_hmc_init_pble - Initialize pble resources during module load + * @dev: i40iw_sc_dev struct + * @pble_rsrc: pble resources + */ +enum i40iw_status_code i40iw_hmc_init_pble(struct i40iw_sc_dev *dev, + struct i40iw_hmc_pble_rsrc *pble_rsrc) +{ + struct i40iw_hmc_info *hmc_info; + u32 fpm_idx = 0; + + hmc_info = dev->hmc_info; + pble_rsrc->fpm_base_addr = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].base; + /* Now start the pble' on 4k boundary */ + if (pble_rsrc->fpm_base_addr & 0xfff) + fpm_idx = (PAGE_SIZE - (pble_rsrc->fpm_base_addr & 0xfff)) >> 3; + + pble_rsrc->unallocated_pble = + hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt - fpm_idx; + pble_rsrc->next_fpm_addr = pble_rsrc->fpm_base_addr + (fpm_idx << 3); + + pble_rsrc->pinfo.pool_shift = POOL_SHIFT; + pble_rsrc->pinfo.pool = gen_pool_create(pble_rsrc->pinfo.pool_shift, -1); + INIT_LIST_HEAD(&pble_rsrc->pinfo.clist); + if (!pble_rsrc->pinfo.pool) + goto error; + + if (add_pble_pool(dev, pble_rsrc)) + goto error; + + return 0; + + error:i40iw_destroy_pble_pool(dev, pble_rsrc); + return I40IW_ERR_NO_MEMORY; +} + +/** + * get_sd_pd_idx - Returns sd index, pd index and rel_pd_idx from fpm address + * @ pble_rsrc: structure containing fpm address + * @ idx: where to return indexes + */ +static inline void get_sd_pd_idx(struct i40iw_hmc_pble_rsrc *pble_rsrc, + struct sd_pd_idx *idx) +{ + idx->sd_idx = (u32)(pble_rsrc->next_fpm_addr) / I40IW_HMC_DIRECT_BP_SIZE; + idx->pd_idx = (u32)(pble_rsrc->next_fpm_addr) / I40IW_HMC_PAGED_BP_SIZE; + idx->rel_pd_idx = (idx->pd_idx % I40IW_HMC_PD_CNT_IN_SD); +} + +/** + * add_sd_direct - add sd direct for pble + * @dev: hardware control device structure + * @pble_rsrc: pble resource ptr + * @info: page info for sd + */ +static enum i40iw_status_code add_sd_direct(struct i40iw_sc_dev *dev, + struct i40iw_hmc_pble_rsrc *pble_rsrc, + struct i40iw_add_page_info *info) +{ + enum i40iw_status_code ret_code = 0; + struct sd_pd_idx *idx = &info->idx; + struct i40iw_chunk *chunk = info->chunk; + struct i40iw_hmc_info *hmc_info = info->hmc_info; + struct i40iw_hmc_sd_entry *sd_entry = info->sd_entry; + u32 offset = 0; + + if (!sd_entry->valid) { + if (dev->is_pf) { + ret_code = i40iw_add_sd_table_entry(dev->hw, hmc_info, + info->idx.sd_idx, + I40IW_SD_TYPE_DIRECT, + I40IW_HMC_DIRECT_BP_SIZE); + if (ret_code) + return ret_code; + chunk->type = I40IW_DMA_COHERENT; 
+ } + } + offset = idx->rel_pd_idx << I40IW_HMC_PAGED_BP_SHIFT; + chunk->size = info->pages << I40IW_HMC_PAGED_BP_SHIFT; + chunk->vaddr = ((u8 *)sd_entry->u.bp.addr.va + offset); + chunk->fpm_addr = pble_rsrc->next_fpm_addr; + i40iw_debug(dev, I40IW_DEBUG_PBLE, "chunk_size[%d] = 0x%x vaddr=%p fpm_addr = %llx\n", + chunk->size, chunk->size, chunk->vaddr, chunk->fpm_addr); + return 0; +} + +/** + * i40iw_free_vmalloc_mem - free vmalloc during close + * @hw: hw struct + * @chunk: chunk information for vmalloc + */ +static void i40iw_free_vmalloc_mem(struct i40iw_hw *hw, struct i40iw_chunk *chunk) +{ + struct pci_dev *pcidev = (struct pci_dev *)hw->dev_context; + int i; + + if (!chunk->pg_cnt) + goto done; + for (i = 0; i < chunk->pg_cnt; i++) + dma_unmap_page(&pcidev->dev, chunk->dmaaddrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL); + + done: + kfree(chunk->dmaaddrs); + chunk->dmaaddrs = NULL; + vfree(chunk->vaddr); + chunk->vaddr = NULL; + chunk->type = 0; +} + +/** + * i40iw_get_vmalloc_mem - get 2M page for sd + * @hw: hardware address + * @chunk: chunk to adf + * @pg_cnt: #of 4 K pages + */ +static enum i40iw_status_code i40iw_get_vmalloc_mem(struct i40iw_hw *hw, + struct i40iw_chunk *chunk, + int pg_cnt) +{ + struct pci_dev *pcidev = (struct pci_dev *)hw->dev_context; + struct page *page; + u8 *addr; + u32 size; + int i; + + chunk->dmaaddrs = kzalloc(pg_cnt << 3, GFP_KERNEL); + if (!chunk->dmaaddrs) + return I40IW_ERR_NO_MEMORY; + size = PAGE_SIZE * pg_cnt; + chunk->vaddr = vmalloc(size); + if (!chunk->vaddr) { + kfree(chunk->dmaaddrs); + chunk->dmaaddrs = NULL; + return I40IW_ERR_NO_MEMORY; + } + chunk->size = size; + addr = (u8 *)chunk->vaddr; + for (i = 0; i < pg_cnt; i++) { + page = vmalloc_to_page((void *)addr); + if (!page) + break; + chunk->dmaaddrs[i] = dma_map_page(&pcidev->dev, page, 0, + PAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(&pcidev->dev, chunk->dmaaddrs[i])) + break; + addr += PAGE_SIZE; + } + + chunk->pg_cnt = i; + chunk->type = I40IW_VMALLOC; + if (i == pg_cnt) + return 0; + + i40iw_free_vmalloc_mem(hw, chunk); + return I40IW_ERR_NO_MEMORY; +} + +/** + * fpm_to_idx - given fpm address, get pble index + * @pble_rsrc: pble resource management + * @addr: fpm address for index + */ +static inline u32 fpm_to_idx(struct i40iw_hmc_pble_rsrc *pble_rsrc, u64 addr) +{ + return (addr - (pble_rsrc->fpm_base_addr)) >> 3; +} + +/** + * add_bp_pages - add backing pages for sd + * @dev: hardware control device structure + * @pble_rsrc: pble resource management + * @info: page info for sd + */ +static enum i40iw_status_code add_bp_pages(struct i40iw_sc_dev *dev, + struct i40iw_hmc_pble_rsrc *pble_rsrc, + struct i40iw_add_page_info *info) +{ + u8 *addr; + struct i40iw_dma_mem mem; + struct i40iw_hmc_pd_entry *pd_entry; + struct i40iw_hmc_sd_entry *sd_entry = info->sd_entry; + struct i40iw_hmc_info *hmc_info = info->hmc_info; + struct i40iw_chunk *chunk = info->chunk; + struct i40iw_manage_vf_pble_info vf_pble_info; + enum i40iw_status_code status = 0; + u32 rel_pd_idx = info->idx.rel_pd_idx; + u32 pd_idx = info->idx.pd_idx; + u32 i; + + status = i40iw_get_vmalloc_mem(dev->hw, chunk, info->pages); + if (status) + return I40IW_ERR_NO_MEMORY; + status = i40iw_add_sd_table_entry(dev->hw, hmc_info, + info->idx.sd_idx, I40IW_SD_TYPE_PAGED, + I40IW_HMC_DIRECT_BP_SIZE); + if (status) + goto error; + if (!dev->is_pf) { + status = i40iw_vchnl_vf_add_hmc_objs(dev, I40IW_HMC_IW_PBLE, + fpm_to_idx(pble_rsrc, + pble_rsrc->next_fpm_addr), + (info->pages << PBLE_512_SHIFT)); + if (status) { + 
i40iw_pr_err("allocate PBLEs in the PF. Error %i\n", status); + goto error; + } + } + addr = chunk->vaddr; + for (i = 0; i < info->pages; i++) { + mem.pa = chunk->dmaaddrs[i]; + mem.size = PAGE_SIZE; + mem.va = (void *)(addr); + pd_entry = &sd_entry->u.pd_table.pd_entry[rel_pd_idx++]; + if (!pd_entry->valid) { + status = i40iw_add_pd_table_entry(dev->hw, hmc_info, pd_idx++, &mem); + if (status) + goto error; + addr += PAGE_SIZE; + } else { + i40iw_pr_err("pd entry is valid expecting to be invalid\n"); + } + } + if (!dev->is_pf) { + vf_pble_info.first_pd_index = info->idx.rel_pd_idx; + vf_pble_info.inv_pd_ent = false; + vf_pble_info.pd_entry_cnt = PBLE_PER_PAGE; + vf_pble_info.pd_pl_pba = sd_entry->u.pd_table.pd_page_addr.pa; + vf_pble_info.sd_index = info->idx.sd_idx; + status = i40iw_hw_manage_vf_pble_bp(dev->back_dev, + &vf_pble_info, true); + if (status) { + i40iw_pr_err("CQP manage VF PBLE BP failed. %i\n", status); + goto error; + } + } + chunk->fpm_addr = pble_rsrc->next_fpm_addr; + return 0; +error: + i40iw_free_vmalloc_mem(dev->hw, chunk); + return status; +} + +/** + * add_pble_pool - add a sd entry for pble resoure + * @dev: hardware control device structure + * @pble_rsrc: pble resource management + */ +static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev, + struct i40iw_hmc_pble_rsrc *pble_rsrc) +{ + struct i40iw_hmc_sd_entry *sd_entry; + struct i40iw_hmc_info *hmc_info; + struct i40iw_chunk *chunk; + struct i40iw_add_page_info info; + struct sd_pd_idx *idx = &info.idx; + enum i40iw_status_code ret_code = 0; + enum i40iw_sd_entry_type sd_entry_type; + u64 sd_reg_val = 0; + u32 pages; + + if (pble_rsrc->unallocated_pble < PBLE_PER_PAGE) + return I40IW_ERR_NO_MEMORY; + if (pble_rsrc->next_fpm_addr & 0xfff) { + i40iw_pr_err("next fpm_addr %llx\n", pble_rsrc->next_fpm_addr); + return I40IW_ERR_INVALID_PAGE_DESC_INDEX; + } + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + return I40IW_ERR_NO_MEMORY; + hmc_info = dev->hmc_info; + chunk->fpm_addr = pble_rsrc->next_fpm_addr; + get_sd_pd_idx(pble_rsrc, idx); + sd_entry = &hmc_info->sd_table.sd_entry[idx->sd_idx]; + pages = (idx->rel_pd_idx) ? (I40IW_HMC_PD_CNT_IN_SD - + idx->rel_pd_idx) : I40IW_HMC_PD_CNT_IN_SD; + pages = min(pages, pble_rsrc->unallocated_pble >> PBLE_512_SHIFT); + info.chunk = chunk; + info.hmc_info = hmc_info; + info.pages = pages; + info.sd_entry = sd_entry; + if (!sd_entry->valid) { + sd_entry_type = (!idx->rel_pd_idx && + (pages == I40IW_HMC_PD_CNT_IN_SD) && + dev->is_pf) ? 
I40IW_SD_TYPE_DIRECT : I40IW_SD_TYPE_PAGED; + } else { + sd_entry_type = sd_entry->entry_type; + } + i40iw_debug(dev, I40IW_DEBUG_PBLE, + "pages = %d, unallocated_pble[%u] current_fpm_addr = %llx\n", + pages, pble_rsrc->unallocated_pble, pble_rsrc->next_fpm_addr); + i40iw_debug(dev, I40IW_DEBUG_PBLE, "sd_entry_type = %d sd_entry valid = %d\n", + sd_entry_type, sd_entry->valid); + + if (sd_entry_type == I40IW_SD_TYPE_DIRECT) + ret_code = add_sd_direct(dev, pble_rsrc, &info); + if (ret_code) + sd_entry_type = I40IW_SD_TYPE_PAGED; + else + pble_rsrc->stats_direct_sds++; + + if (sd_entry_type == I40IW_SD_TYPE_PAGED) { + ret_code = add_bp_pages(dev, pble_rsrc, &info); + if (ret_code) + goto error; + else + pble_rsrc->stats_paged_sds++; + } + + if (gen_pool_add_virt(pble_rsrc->pinfo.pool, (unsigned long)chunk->vaddr, + (phys_addr_t)chunk->fpm_addr, chunk->size, -1)) { + i40iw_pr_err("could not allocate memory by gen_pool_addr_virt()\n"); + ret_code = I40IW_ERR_NO_MEMORY; + goto error; + } + pble_rsrc->next_fpm_addr += chunk->size; + i40iw_debug(dev, I40IW_DEBUG_PBLE, "next_fpm_addr = %llx chunk_size[%u] = 0x%x\n", + pble_rsrc->next_fpm_addr, chunk->size, chunk->size); + pble_rsrc->unallocated_pble -= (chunk->size >> 3); + list_add(&chunk->list, &pble_rsrc->pinfo.clist); + sd_reg_val = (sd_entry_type == I40IW_SD_TYPE_PAGED) ? + sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa; + if (sd_entry->valid) + return 0; + if (dev->is_pf) { + ret_code = i40iw_hmc_sd_one(dev, hmc_info->hmc_fn_id, + sd_reg_val, idx->sd_idx, + sd_entry->entry_type, true); + if (ret_code) { + i40iw_pr_err("cqp cmd failed for sd (pbles)\n"); + goto error; + } + } + + sd_entry->valid = true; + return 0; + error: + kfree(chunk); + return ret_code; +} + +/** + * free_lvl2 - fee level 2 pble + * @pble_rsrc: pble resource management + * @palloc: level 2 pble allocation + */ +static void free_lvl2(struct i40iw_hmc_pble_rsrc *pble_rsrc, + struct i40iw_pble_alloc *palloc) +{ + u32 i; + struct gen_pool *pool; + struct i40iw_pble_level2 *lvl2 = &palloc->level2; + struct i40iw_pble_info *root = &lvl2->root; + struct i40iw_pble_info *leaf = lvl2->leaf; + + pool = pble_rsrc->pinfo.pool; + + for (i = 0; i < lvl2->leaf_cnt; i++, leaf++) { + if (leaf->addr) + gen_pool_free(pool, leaf->addr, (leaf->cnt << 3)); + else + break; + } + + if (root->addr) + gen_pool_free(pool, root->addr, (root->cnt << 3)); + + kfree(lvl2->leaf); + lvl2->leaf = NULL; +} + +/** + * get_lvl2_pble - get level 2 pble resource + * @pble_rsrc: pble resource management + * @palloc: level 2 pble allocation + * @pool: pool pointer + */ +static enum i40iw_status_code get_lvl2_pble(struct i40iw_hmc_pble_rsrc *pble_rsrc, + struct i40iw_pble_alloc *palloc, + struct gen_pool *pool) +{ + u32 lf4k, lflast, total, i; + u32 pblcnt = PBLE_PER_PAGE; + u64 *addr; + struct i40iw_pble_level2 *lvl2 = &palloc->level2; + struct i40iw_pble_info *root = &lvl2->root; + struct i40iw_pble_info *leaf; + + /* number of full 512 (4K) leafs) */ + lf4k = palloc->total_cnt >> 9; + lflast = palloc->total_cnt % PBLE_PER_PAGE; + total = (lflast == 0) ? 
lf4k : lf4k + 1; + lvl2->leaf_cnt = total; + + leaf = kzalloc((sizeof(*leaf) * total), GFP_ATOMIC); + if (!leaf) + return I40IW_ERR_NO_MEMORY; + lvl2->leaf = leaf; + /* allocate pbles for the root */ + root->addr = gen_pool_alloc(pool, (total << 3)); + if (!root->addr) { + kfree(lvl2->leaf); + lvl2->leaf = NULL; + return I40IW_ERR_NO_MEMORY; + } + root->idx = fpm_to_idx(pble_rsrc, + (u64)gen_pool_virt_to_phys(pool, root->addr)); + root->cnt = total; + addr = (u64 *)root->addr; + for (i = 0; i < total; i++, leaf++) { + pblcnt = (lflast && ((i + 1) == total)) ? lflast : PBLE_PER_PAGE; + leaf->addr = gen_pool_alloc(pool, (pblcnt << 3)); + if (!leaf->addr) + goto error; + leaf->idx = fpm_to_idx(pble_rsrc, (u64)gen_pool_virt_to_phys(pool, leaf->addr)); + + leaf->cnt = pblcnt; + *addr = (u64)leaf->idx; + addr++; + } + palloc->level = I40IW_LEVEL_2; + pble_rsrc->stats_lvl2++; + return 0; + error: + free_lvl2(pble_rsrc, palloc); + return I40IW_ERR_NO_MEMORY; +} + +/** + * get_lvl1_pble - get level 1 pble resource + * @dev: hardware control device structure + * @pble_rsrc: pble resource management + * @palloc: level 1 pble allocation + */ +static enum i40iw_status_code get_lvl1_pble(struct i40iw_sc_dev *dev, + struct i40iw_hmc_pble_rsrc *pble_rsrc, + struct i40iw_pble_alloc *palloc) +{ + u64 *addr; + struct gen_pool *pool; + struct i40iw_pble_info *lvl1 = &palloc->level1; + + pool = pble_rsrc->pinfo.pool; + addr = (u64 *)gen_pool_alloc(pool, (palloc->total_cnt << 3)); + + if (!addr) + return I40IW_ERR_NO_MEMORY; + + palloc->level = I40IW_LEVEL_1; + lvl1->addr = (unsigned long)addr; + lvl1->idx = fpm_to_idx(pble_rsrc, (u64)gen_pool_virt_to_phys(pool, + (unsigned long)addr)); + lvl1->cnt = palloc->total_cnt; + pble_rsrc->stats_lvl1++; + return 0; +} + +/** + * get_lvl1_lvl2_pble - calls get_lvl1 and get_lvl2 pble routine + * @dev: i40iw_sc_dev struct + * @pble_rsrc: pble resources + * @palloc: contains all inforamtion regarding pble (idx + pble addr) + * @pool: pointer to general purpose special memory pool descriptor + */ +static inline enum i40iw_status_code get_lvl1_lvl2_pble(struct i40iw_sc_dev *dev, + struct i40iw_hmc_pble_rsrc *pble_rsrc, + struct i40iw_pble_alloc *palloc, + struct gen_pool *pool) +{ + enum i40iw_status_code status = 0; + + status = get_lvl1_pble(dev, pble_rsrc, palloc); + if (status && (palloc->total_cnt > PBLE_PER_PAGE)) + status = get_lvl2_pble(pble_rsrc, palloc, pool); + return status; +} + +/** + * i40iw_get_pble - allocate pbles from the pool + * @dev: i40iw_sc_dev struct + * @pble_rsrc: pble resources + * @palloc: contains all inforamtion regarding pble (idx + pble addr) + * @pble_cnt: #of pbles requested + */ +enum i40iw_status_code i40iw_get_pble(struct i40iw_sc_dev *dev, + struct i40iw_hmc_pble_rsrc *pble_rsrc, + struct i40iw_pble_alloc *palloc, + u32 pble_cnt) +{ + struct gen_pool *pool; + enum i40iw_status_code status = 0; + u32 max_sds = 0; + int i; + + pool = pble_rsrc->pinfo.pool; + palloc->total_cnt = pble_cnt; + palloc->level = I40IW_LEVEL_0; + /*check first to see if we can get pble's without acquiring additional sd's */ + status = get_lvl1_lvl2_pble(dev, pble_rsrc, palloc, pool); + if (!status) + goto exit; + max_sds = (palloc->total_cnt >> 18) + 1; + for (i = 0; i < max_sds; i++) { + status = add_pble_pool(dev, pble_rsrc); + if (status) + break; + status = get_lvl1_lvl2_pble(dev, pble_rsrc, palloc, pool); + if (!status) + break; + } +exit: + if (!status) + pble_rsrc->stats_alloc_ok++; + else + pble_rsrc->stats_alloc_fail++; + + return status; +} + +/** + * 
i40iw_free_pble - put pbles back into pool + * @pble_rsrc: pble resources + * @palloc: contains all inforamtion regarding pble resource being freed + */ +void i40iw_free_pble(struct i40iw_hmc_pble_rsrc *pble_rsrc, + struct i40iw_pble_alloc *palloc) +{ + struct gen_pool *pool; + + pool = pble_rsrc->pinfo.pool; + if (palloc->level == I40IW_LEVEL_2) + free_lvl2(pble_rsrc, palloc); + else + gen_pool_free(pool, palloc->level1.addr, + (palloc->level1.cnt << 3)); + pble_rsrc->stats_alloc_freed++; +} diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.h b/drivers/infiniband/hw/i40iw/i40iw_pble.h new file mode 100644 index 0000000..7b1851d --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_pble.h @@ -0,0 +1,131 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +*******************************************************************************/ + +#ifndef I40IW_PBLE_H +#define I40IW_PBLE_H + +#define POOL_SHIFT 6 +#define PBLE_PER_PAGE 512 +#define I40IW_HMC_PAGED_BP_SHIFT 12 +#define PBLE_512_SHIFT 9 + +enum i40iw_pble_level { + I40IW_LEVEL_0 = 0, + I40IW_LEVEL_1 = 1, + I40IW_LEVEL_2 = 2 +}; + +enum i40iw_alloc_type { + I40IW_NO_ALLOC = 0, + I40IW_DMA_COHERENT = 1, + I40IW_VMALLOC = 2 +}; + +struct i40iw_pble_info { + unsigned long addr; + u32 idx; + u32 cnt; +}; + +struct i40iw_pble_level2 { + struct i40iw_pble_info root; + struct i40iw_pble_info *leaf; + u32 leaf_cnt; +}; + +struct i40iw_pble_alloc { + u32 total_cnt; + enum i40iw_pble_level level; + union { + struct i40iw_pble_info level1; + struct i40iw_pble_level2 level2; + }; +}; + +struct sd_pd_idx { + u32 sd_idx; + u32 pd_idx; + u32 rel_pd_idx; +}; + +struct i40iw_add_page_info { + struct i40iw_chunk *chunk; + struct i40iw_hmc_sd_entry *sd_entry; + struct i40iw_hmc_info *hmc_info; + struct sd_pd_idx idx; + u32 pages; +}; + +struct i40iw_chunk { + struct list_head list; + u32 size; + void *vaddr; + u64 fpm_addr; + u32 pg_cnt; + dma_addr_t *dmaaddrs; + enum i40iw_alloc_type type; +}; + +struct i40iw_pble_pool { + struct gen_pool *pool; + struct list_head clist; + u32 total_pble_alloc; + u32 free_pble_cnt; + u32 pool_shift; +}; + +struct i40iw_hmc_pble_rsrc { + u32 unallocated_pble; + u64 fpm_base_addr; + u64 next_fpm_addr; + struct i40iw_pble_pool pinfo; + + u32 stats_direct_sds; + u32 stats_paged_sds; + u64 stats_alloc_ok; + u64 stats_alloc_fail; + u64 stats_alloc_freed; + u64 stats_lvl1; + u64 stats_lvl2; +}; + +void i40iw_destroy_pble_pool(struct i40iw_sc_dev *dev, struct i40iw_hmc_pble_rsrc *pble_rsrc); +enum i40iw_status_code i40iw_hmc_init_pble(struct i40iw_sc_dev *dev, + struct i40iw_hmc_pble_rsrc *pble_rsrc); +void i40iw_free_pble(struct i40iw_hmc_pble_rsrc *pble_rsrc, struct i40iw_pble_alloc *palloc); +enum i40iw_status_code i40iw_get_pble(struct i40iw_sc_dev *dev, + struct i40iw_hmc_pble_rsrc *pble_rsrc, + struct i40iw_pble_alloc *palloc, + u32 pble_cnt); +#endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c new file mode 100644 index 0000000..d9c7ae6 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -0,0 +1,1493 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#include "i40iw_osdep.h" +#include "i40iw_register.h" +#include "i40iw_status.h" +#include "i40iw_hmc.h" + +#include "i40iw_d.h" +#include "i40iw_type.h" +#include "i40iw_p.h" +#include "i40iw_puda.h" + +static void i40iw_ieq_receive(struct i40iw_sc_vsi *vsi, + struct i40iw_puda_buf *buf); +static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid); +static void i40iw_ilq_putback_rcvbuf(struct i40iw_sc_qp *qp, u32 wqe_idx); +static enum i40iw_status_code i40iw_puda_replenish_rq(struct i40iw_puda_rsrc + *rsrc, bool initial); +/** + * i40iw_puda_get_listbuf - get buffer from puda list + * @list: list to use for buffers (ILQ or IEQ) + */ +static struct i40iw_puda_buf *i40iw_puda_get_listbuf(struct list_head *list) +{ + struct i40iw_puda_buf *buf = NULL; + + if (!list_empty(list)) { + buf = (struct i40iw_puda_buf *)list->next; + list_del((struct list_head *)&buf->list); + } + return buf; +} + +/** + * i40iw_puda_get_bufpool - return buffer from resource + * @rsrc: resource to use for buffer + */ +struct i40iw_puda_buf *i40iw_puda_get_bufpool(struct i40iw_puda_rsrc *rsrc) +{ + struct i40iw_puda_buf *buf = NULL; + struct list_head *list = &rsrc->bufpool; + unsigned long flags; + + spin_lock_irqsave(&rsrc->bufpool_lock, flags); + buf = i40iw_puda_get_listbuf(list); + if (buf) + rsrc->avail_buf_count--; + else + rsrc->stats_buf_alloc_fail++; + spin_unlock_irqrestore(&rsrc->bufpool_lock, flags); + return buf; +} + +/** + * i40iw_puda_ret_bufpool - return buffer to rsrc list + * @rsrc: resource to use for buffer + * @buf: buffe to return to resouce + */ +void i40iw_puda_ret_bufpool(struct i40iw_puda_rsrc *rsrc, + struct i40iw_puda_buf *buf) +{ + unsigned long flags; + + spin_lock_irqsave(&rsrc->bufpool_lock, flags); + list_add(&buf->list, &rsrc->bufpool); + spin_unlock_irqrestore(&rsrc->bufpool_lock, flags); + rsrc->avail_buf_count++; +} + +/** + * i40iw_puda_post_recvbuf - set wqe for rcv buffer + * @rsrc: resource ptr + * @wqe_idx: wqe index to use + * @buf: puda buffer for rcv q + * @initial: flag if during init time + */ +static void i40iw_puda_post_recvbuf(struct i40iw_puda_rsrc *rsrc, u32 wqe_idx, + struct i40iw_puda_buf *buf, bool initial) +{ + u64 *wqe; + struct i40iw_sc_qp *qp = &rsrc->qp; + u64 offset24 = 0; + + qp->qp_uk.rq_wrid_array[wqe_idx] = (uintptr_t)buf; + wqe = qp->qp_uk.rq_base[wqe_idx].elem; + i40iw_debug(rsrc->dev, I40IW_DEBUG_PUDA, + "%s: wqe_idx= %d buf = %p wqe = %p\n", __func__, + wqe_idx, buf, wqe); + if (!initial) + get_64bit_val(wqe, 24, &offset24); + + offset24 = (offset24) ? 
0 : LS_64(1, I40IWQPSQ_VALID); + + set_64bit_val(wqe, 0, buf->mem.pa); + set_64bit_val(wqe, 8, + LS_64(buf->mem.size, I40IWQPSQ_FRAG_LEN)); + i40iw_insert_wqe_hdr(wqe, offset24); +} + +/** + * i40iw_puda_replenish_rq - post rcv buffers + * @rsrc: resource to use for buffer + * @initial: flag if during init time + */ +static enum i40iw_status_code i40iw_puda_replenish_rq(struct i40iw_puda_rsrc *rsrc, + bool initial) +{ + u32 i; + u32 invalid_cnt = rsrc->rxq_invalid_cnt; + struct i40iw_puda_buf *buf = NULL; + + for (i = 0; i < invalid_cnt; i++) { + buf = i40iw_puda_get_bufpool(rsrc); + if (!buf) + return I40IW_ERR_list_empty; + i40iw_puda_post_recvbuf(rsrc, rsrc->rx_wqe_idx, buf, + initial); + rsrc->rx_wqe_idx = + ((rsrc->rx_wqe_idx + 1) % rsrc->rq_size); + rsrc->rxq_invalid_cnt--; + } + return 0; +} + +/** + * i40iw_puda_alloc_buf - allocate mem for buffer + * @dev: iwarp device + * @length: length of buffer + */ +static struct i40iw_puda_buf *i40iw_puda_alloc_buf(struct i40iw_sc_dev *dev, + u32 length) +{ + struct i40iw_puda_buf *buf = NULL; + struct i40iw_virt_mem buf_mem; + enum i40iw_status_code ret; + + ret = i40iw_allocate_virt_mem(dev->hw, &buf_mem, + sizeof(struct i40iw_puda_buf)); + if (ret) { + i40iw_debug(dev, I40IW_DEBUG_PUDA, + "%s: error mem for buf\n", __func__); + return NULL; + } + buf = (struct i40iw_puda_buf *)buf_mem.va; + ret = i40iw_allocate_dma_mem(dev->hw, &buf->mem, length, 1); + if (ret) { + i40iw_debug(dev, I40IW_DEBUG_PUDA, + "%s: error dma mem for buf\n", __func__); + i40iw_free_virt_mem(dev->hw, &buf_mem); + return NULL; + } + buf->buf_mem.va = buf_mem.va; + buf->buf_mem.size = buf_mem.size; + return buf; +} + +/** + * i40iw_puda_dele_buf - delete buffer back to system + * @dev: iwarp device + * @buf: buffer to free + */ +static void i40iw_puda_dele_buf(struct i40iw_sc_dev *dev, + struct i40iw_puda_buf *buf) +{ + i40iw_free_dma_mem(dev->hw, &buf->mem); + i40iw_free_virt_mem(dev->hw, &buf->buf_mem); +} + +/** + * i40iw_puda_get_next_send_wqe - return next wqe for processing + * @qp: puda qp for wqe + * @wqe_idx: wqe index for caller + */ +static u64 *i40iw_puda_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx) +{ + u64 *wqe = NULL; + enum i40iw_status_code ret_code = 0; + + *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + if (!*wqe_idx) + qp->swqe_polarity = !qp->swqe_polarity; + I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code); + if (ret_code) + return wqe; + wqe = qp->sq_base[*wqe_idx].elem; + + return wqe; +} + +/** + * i40iw_puda_poll_info - poll cq for completion + * @cq: cq for poll + * @info: info return for successful completion + */ +static enum i40iw_status_code i40iw_puda_poll_info(struct i40iw_sc_cq *cq, + struct i40iw_puda_completion_info *info) +{ + u64 qword0, qword2, qword3; + u64 *cqe; + u64 comp_ctx; + bool valid_bit; + u32 major_err, minor_err; + bool error; + + cqe = (u64 *)I40IW_GET_CURRENT_CQ_ELEMENT(&cq->cq_uk); + get_64bit_val(cqe, 24, &qword3); + valid_bit = (bool)RS_64(qword3, I40IW_CQ_VALID); + + if (valid_bit != cq->cq_uk.polarity) + return I40IW_ERR_QUEUE_EMPTY; + + i40iw_debug_buf(cq->dev, I40IW_DEBUG_PUDA, "PUDA CQE", cqe, 32); + error = (bool)RS_64(qword3, I40IW_CQ_ERROR); + if (error) { + i40iw_debug(cq->dev, I40IW_DEBUG_PUDA, "%s receive error\n", __func__); + major_err = (u32)(RS_64(qword3, I40IW_CQ_MAJERR)); + minor_err = (u32)(RS_64(qword3, I40IW_CQ_MINERR)); + info->compl_error = major_err << 16 | minor_err; + return I40IW_ERR_CQ_COMPL_ERROR; + } + + get_64bit_val(cqe, 0, &qword0); + get_64bit_val(cqe, 16, &qword2); + 
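+ /*
+  * Decode the remaining CQE words: qword3 carries the queue type (SQ vs RQ)
+  * and the WQE index, qword2 the QP id and L3/L4 protocol, qword0 the
+  * payload length, and the completion context at offset 8 points back to
+  * the posting i40iw_qp_uk.
+  */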
+ info->q_type = (u8)RS_64(qword3, I40IW_CQ_SQ); + info->qp_id = (u32)RS_64(qword2, I40IWCQ_QPID); + + get_64bit_val(cqe, 8, &comp_ctx); + info->qp = (struct i40iw_qp_uk *)(unsigned long)comp_ctx; + info->wqe_idx = (u32)RS_64(qword3, I40IW_CQ_WQEIDX); + + if (info->q_type == I40IW_CQE_QTYPE_RQ) { + info->vlan_valid = (bool)RS_64(qword3, I40IW_VLAN_TAG_VALID); + info->l4proto = (u8)RS_64(qword2, I40IW_UDA_L4PROTO); + info->l3proto = (u8)RS_64(qword2, I40IW_UDA_L3PROTO); + info->payload_len = (u16)RS_64(qword0, I40IW_UDA_PAYLOADLEN); + } + + return 0; +} + +/** + * i40iw_puda_poll_completion - processes completion for cq + * @dev: iwarp device + * @cq: cq getting interrupt + * @compl_err: return any completion err + */ +enum i40iw_status_code i40iw_puda_poll_completion(struct i40iw_sc_dev *dev, + struct i40iw_sc_cq *cq, u32 *compl_err) +{ + struct i40iw_qp_uk *qp; + struct i40iw_cq_uk *cq_uk = &cq->cq_uk; + struct i40iw_puda_completion_info info; + enum i40iw_status_code ret = 0; + struct i40iw_puda_buf *buf; + struct i40iw_puda_rsrc *rsrc; + void *sqwrid; + u8 cq_type = cq->cq_type; + unsigned long flags; + + if ((cq_type == I40IW_CQ_TYPE_ILQ) || (cq_type == I40IW_CQ_TYPE_IEQ)) { + rsrc = (cq_type == I40IW_CQ_TYPE_ILQ) ? cq->vsi->ilq : cq->vsi->ieq; + } else { + i40iw_debug(dev, I40IW_DEBUG_PUDA, "%s qp_type error\n", __func__); + return I40IW_ERR_BAD_PTR; + } + memset(&info, 0, sizeof(info)); + ret = i40iw_puda_poll_info(cq, &info); + *compl_err = info.compl_error; + if (ret == I40IW_ERR_QUEUE_EMPTY) + return ret; + if (ret) + goto done; + + qp = info.qp; + if (!qp || !rsrc) { + ret = I40IW_ERR_BAD_PTR; + goto done; + } + + if (qp->qp_id != rsrc->qp_id) { + ret = I40IW_ERR_BAD_PTR; + goto done; + } + + if (info.q_type == I40IW_CQE_QTYPE_RQ) { + buf = (struct i40iw_puda_buf *)(uintptr_t)qp->rq_wrid_array[info.wqe_idx]; + /* Get all the tcpip information in the buf header */ + ret = i40iw_puda_get_tcpip_info(&info, buf); + if (ret) { + rsrc->stats_rcvd_pkt_err++; + if (cq_type == I40IW_CQ_TYPE_ILQ) { + i40iw_ilq_putback_rcvbuf(&rsrc->qp, + info.wqe_idx); + } else { + i40iw_puda_ret_bufpool(rsrc, buf); + i40iw_puda_replenish_rq(rsrc, false); + } + goto done; + } + + rsrc->stats_pkt_rcvd++; + rsrc->compl_rxwqe_idx = info.wqe_idx; + i40iw_debug(dev, I40IW_DEBUG_PUDA, "%s RQ completion\n", __func__); + rsrc->receive(rsrc->vsi, buf); + if (cq_type == I40IW_CQ_TYPE_ILQ) + i40iw_ilq_putback_rcvbuf(&rsrc->qp, info.wqe_idx); + else + i40iw_puda_replenish_rq(rsrc, false); + + } else { + i40iw_debug(dev, I40IW_DEBUG_PUDA, "%s SQ completion\n", __func__); + sqwrid = (void *)(uintptr_t)qp->sq_wrtrk_array[info.wqe_idx].wrid; + I40IW_RING_SET_TAIL(qp->sq_ring, info.wqe_idx); + rsrc->xmit_complete(rsrc->vsi, sqwrid); + spin_lock_irqsave(&rsrc->bufpool_lock, flags); + rsrc->tx_wqe_avail_cnt++; + spin_unlock_irqrestore(&rsrc->bufpool_lock, flags); + if (!list_empty(&rsrc->txpend)) + i40iw_puda_send_buf(rsrc, NULL); + } + +done: + I40IW_RING_MOVE_HEAD(cq_uk->cq_ring, ret); + if (I40IW_RING_GETCURRENT_HEAD(cq_uk->cq_ring) == 0) + cq_uk->polarity = !cq_uk->polarity; + /* update cq tail in cq shadow memory also */ + I40IW_RING_MOVE_TAIL(cq_uk->cq_ring); + set_64bit_val(cq_uk->shadow_area, 0, + I40IW_RING_GETCURRENT_HEAD(cq_uk->cq_ring)); + return 0; +} + +/** + * i40iw_puda_send - complete send wqe for transmit + * @qp: puda qp for send + * @info: buffer information for transmit + */ +enum i40iw_status_code i40iw_puda_send(struct i40iw_sc_qp *qp, + struct i40iw_puda_send_info *info) +{ + u64 *wqe; + u32 iplen, 
l4len; + u64 header[2]; + u32 wqe_idx; + u8 iipt; + + /* number of 32 bits DWORDS in header */ + l4len = info->tcplen >> 2; + if (info->ipv4) { + iipt = 3; + iplen = 5; + } else { + iipt = 1; + iplen = 10; + } + + wqe = i40iw_puda_get_next_send_wqe(&qp->qp_uk, &wqe_idx); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + qp->qp_uk.sq_wrtrk_array[wqe_idx].wrid = (uintptr_t)info->scratch; + /* Third line of WQE descriptor */ + /* maclen is in words */ + header[0] = LS_64((info->maclen >> 1), I40IW_UDA_QPSQ_MACLEN) | + LS_64(iplen, I40IW_UDA_QPSQ_IPLEN) | LS_64(1, I40IW_UDA_QPSQ_L4T) | + LS_64(iipt, I40IW_UDA_QPSQ_IIPT) | + LS_64(l4len, I40IW_UDA_QPSQ_L4LEN); + /* Forth line of WQE descriptor */ + header[1] = LS_64(I40IW_OP_TYPE_SEND, I40IW_UDA_QPSQ_OPCODE) | + LS_64(1, I40IW_UDA_QPSQ_SIGCOMPL) | + LS_64(info->doloopback, I40IW_UDA_QPSQ_DOLOOPBACK) | + LS_64(qp->qp_uk.swqe_polarity, I40IW_UDA_QPSQ_VALID); + + set_64bit_val(wqe, 0, info->paddr); + set_64bit_val(wqe, 8, LS_64(info->len, I40IWQPSQ_FRAG_LEN)); + set_64bit_val(wqe, 16, header[0]); + + i40iw_insert_wqe_hdr(wqe, header[1]); + + i40iw_debug_buf(qp->dev, I40IW_DEBUG_PUDA, "PUDA SEND WQE", wqe, 32); + i40iw_qp_post_wr(&qp->qp_uk); + return 0; +} + +/** + * i40iw_puda_send_buf - transmit puda buffer + * @rsrc: resource to use for buffer + * @buf: puda buffer to transmit + */ +void i40iw_puda_send_buf(struct i40iw_puda_rsrc *rsrc, struct i40iw_puda_buf *buf) +{ + struct i40iw_puda_send_info info; + enum i40iw_status_code ret = 0; + unsigned long flags; + + spin_lock_irqsave(&rsrc->bufpool_lock, flags); + /* if no wqe available or not from a completion and we have + * pending buffers, we must queue new buffer + */ + if (!rsrc->tx_wqe_avail_cnt || (buf && !list_empty(&rsrc->txpend))) { + list_add_tail(&buf->list, &rsrc->txpend); + spin_unlock_irqrestore(&rsrc->bufpool_lock, flags); + rsrc->stats_sent_pkt_q++; + if (rsrc->type == I40IW_PUDA_RSRC_TYPE_ILQ) + i40iw_debug(rsrc->dev, I40IW_DEBUG_PUDA, + "%s: adding to txpend\n", __func__); + return; + } + rsrc->tx_wqe_avail_cnt--; + /* if we are coming from a completion and have pending buffers + * then Get one from pending list + */ + if (!buf) { + buf = i40iw_puda_get_listbuf(&rsrc->txpend); + if (!buf) + goto done; + } + + info.scratch = (void *)buf; + info.paddr = buf->mem.pa; + info.len = buf->totallen; + info.tcplen = buf->tcphlen; + info.maclen = buf->maclen; + info.ipv4 = buf->ipv4; + info.doloopback = (rsrc->type == I40IW_PUDA_RSRC_TYPE_IEQ); + + ret = i40iw_puda_send(&rsrc->qp, &info); + if (ret) { + rsrc->tx_wqe_avail_cnt++; + rsrc->stats_sent_pkt_q++; + list_add(&buf->list, &rsrc->txpend); + if (rsrc->type == I40IW_PUDA_RSRC_TYPE_ILQ) + i40iw_debug(rsrc->dev, I40IW_DEBUG_PUDA, + "%s: adding to puda_send\n", __func__); + } else { + rsrc->stats_pkt_sent++; + } +done: + spin_unlock_irqrestore(&rsrc->bufpool_lock, flags); +} + +/** + * i40iw_puda_qp_setctx - during init, set qp's context + * @rsrc: qp's resource + */ +static void i40iw_puda_qp_setctx(struct i40iw_puda_rsrc *rsrc) +{ + struct i40iw_sc_qp *qp = &rsrc->qp; + u64 *qp_ctx = qp->hw_host_ctx; + + set_64bit_val(qp_ctx, 8, qp->sq_pa); + set_64bit_val(qp_ctx, 16, qp->rq_pa); + + set_64bit_val(qp_ctx, 24, + LS_64(qp->hw_rq_size, I40IWQPC_RQSIZE) | + LS_64(qp->hw_sq_size, I40IWQPC_SQSIZE)); + + set_64bit_val(qp_ctx, 48, LS_64(rsrc->buf_size, I40IW_UDA_QPC_MAXFRAMESIZE)); + set_64bit_val(qp_ctx, 56, 0); + set_64bit_val(qp_ctx, 64, 1); + + set_64bit_val(qp_ctx, 136, + LS_64(rsrc->cq_id, I40IWQPC_TXCQNUM) | + LS_64(rsrc->cq_id, 
I40IWQPC_RXCQNUM)); + + set_64bit_val(qp_ctx, 160, LS_64(1, I40IWQPC_PRIVEN)); + + set_64bit_val(qp_ctx, 168, + LS_64((uintptr_t)qp, I40IWQPC_QPCOMPCTX)); + + set_64bit_val(qp_ctx, 176, + LS_64(qp->sq_tph_val, I40IWQPC_SQTPHVAL) | + LS_64(qp->rq_tph_val, I40IWQPC_RQTPHVAL) | + LS_64(qp->qs_handle, I40IWQPC_QSHANDLE)); + + i40iw_debug_buf(rsrc->dev, I40IW_DEBUG_PUDA, "PUDA QP CONTEXT", + qp_ctx, I40IW_QP_CTX_SIZE); +} + +/** + * i40iw_puda_qp_wqe - setup wqe for qp create + * @rsrc: resource for qp + */ +static enum i40iw_status_code i40iw_puda_qp_wqe(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp) +{ + struct i40iw_sc_cqp *cqp; + u64 *wqe; + u64 header; + struct i40iw_ccq_cqe_info compl_info; + enum i40iw_status_code status = 0; + + cqp = dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, 0); + if (!wqe) + return I40IW_ERR_RING_FULL; + + set_64bit_val(wqe, 16, qp->hw_host_ctx_pa); + set_64bit_val(wqe, 40, qp->shadow_area_pa); + header = qp->qp_uk.qp_id | + LS_64(I40IW_CQP_OP_CREATE_QP, I40IW_CQPSQ_OPCODE) | + LS_64(I40IW_QP_TYPE_UDA, I40IW_CQPSQ_QP_QPTYPE) | + LS_64(1, I40IW_CQPSQ_QP_CQNUMVALID) | + LS_64(2, I40IW_CQPSQ_QP_NEXTIWSTATE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_PUDA, "PUDA CQE", wqe, 32); + i40iw_sc_cqp_post_sq(cqp); + status = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp, + I40IW_CQP_OP_CREATE_QP, + &compl_info); + return status; +} + +/** + * i40iw_puda_qp_create - create qp for resource + * @rsrc: resource to use for buffer + */ +static enum i40iw_status_code i40iw_puda_qp_create(struct i40iw_puda_rsrc *rsrc) +{ + struct i40iw_sc_qp *qp = &rsrc->qp; + struct i40iw_qp_uk *ukqp = &qp->qp_uk; + enum i40iw_status_code ret = 0; + u32 sq_size, rq_size, t_size; + struct i40iw_dma_mem *mem; + + sq_size = rsrc->sq_size * I40IW_QP_WQE_MIN_SIZE; + rq_size = rsrc->rq_size * I40IW_QP_WQE_MIN_SIZE; + t_size = (sq_size + rq_size + (I40IW_SHADOW_AREA_SIZE << 3) + + I40IW_QP_CTX_SIZE); + /* Get page aligned memory */ + ret = + i40iw_allocate_dma_mem(rsrc->dev->hw, &rsrc->qpmem, t_size, + I40IW_HW_PAGE_SIZE); + if (ret) { + i40iw_debug(rsrc->dev, I40IW_DEBUG_PUDA, "%s: error dma mem\n", __func__); + return ret; + } + + mem = &rsrc->qpmem; + memset(mem->va, 0, t_size); + qp->hw_sq_size = i40iw_get_encoded_wqe_size(rsrc->sq_size, false); + qp->hw_rq_size = i40iw_get_encoded_wqe_size(rsrc->rq_size, false); + qp->pd = &rsrc->sc_pd; + qp->qp_type = I40IW_QP_TYPE_UDA; + qp->dev = rsrc->dev; + qp->back_qp = (void *)rsrc; + qp->sq_pa = mem->pa; + qp->rq_pa = qp->sq_pa + sq_size; + qp->vsi = rsrc->vsi; + ukqp->sq_base = mem->va; + ukqp->rq_base = &ukqp->sq_base[rsrc->sq_size]; + ukqp->shadow_area = ukqp->rq_base[rsrc->rq_size].elem; + qp->shadow_area_pa = qp->rq_pa + rq_size; + qp->hw_host_ctx = ukqp->shadow_area + I40IW_SHADOW_AREA_SIZE; + qp->hw_host_ctx_pa = + qp->shadow_area_pa + (I40IW_SHADOW_AREA_SIZE << 3); + ukqp->qp_id = rsrc->qp_id; + ukqp->sq_wrtrk_array = rsrc->sq_wrtrk_array; + ukqp->rq_wrid_array = rsrc->rq_wrid_array; + + ukqp->qp_id = rsrc->qp_id; + ukqp->sq_size = rsrc->sq_size; + ukqp->rq_size = rsrc->rq_size; + + I40IW_RING_INIT(ukqp->sq_ring, ukqp->sq_size); + I40IW_RING_INIT(ukqp->initial_ring, ukqp->sq_size); + I40IW_RING_INIT(ukqp->rq_ring, ukqp->rq_size); + + if (qp->pd->dev->is_pf) + ukqp->wqe_alloc_reg = (u32 __iomem *)(i40iw_get_hw_addr(qp->pd->dev) + + I40E_PFPE_WQEALLOC); + else + ukqp->wqe_alloc_reg = (u32 __iomem *)(i40iw_get_hw_addr(qp->pd->dev) + + I40E_VFPE_WQEALLOC1); + + 
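+ /*
+  * The rings live in the single page-aligned qpmem allocation set up above
+  * (SQ, then RQ, then shadow area, then the hardware QP context). What
+  * remains is to program the QP context and issue the create through the
+  * CQP: as a command when a CEQ is available, otherwise by posting the
+  * CQP WQE directly.
+  */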
qp->user_pri = 0; + i40iw_qp_add_qos(qp); + i40iw_puda_qp_setctx(rsrc); + if (rsrc->dev->ceq_valid) + ret = i40iw_cqp_qp_create_cmd(rsrc->dev, qp); + else + ret = i40iw_puda_qp_wqe(rsrc->dev, qp); + if (ret) { + i40iw_qp_rem_qos(qp); + i40iw_free_dma_mem(rsrc->dev->hw, &rsrc->qpmem); + } + return ret; +} + +/** + * i40iw_puda_cq_wqe - setup wqe for cq create + * @rsrc: resource for cq + */ +static enum i40iw_status_code i40iw_puda_cq_wqe(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq) +{ + u64 *wqe; + struct i40iw_sc_cqp *cqp; + u64 header; + struct i40iw_ccq_cqe_info compl_info; + enum i40iw_status_code status = 0; + + cqp = dev->cqp; + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, 0); + if (!wqe) + return I40IW_ERR_RING_FULL; + + set_64bit_val(wqe, 0, cq->cq_uk.cq_size); + set_64bit_val(wqe, 8, RS_64_1(cq, 1)); + set_64bit_val(wqe, 16, + LS_64(cq->shadow_read_threshold, + I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD)); + set_64bit_val(wqe, 32, cq->cq_pa); + + set_64bit_val(wqe, 40, cq->shadow_area_pa); + + header = cq->cq_uk.cq_id | + LS_64(I40IW_CQP_OP_CREATE_CQ, I40IW_CQPSQ_OPCODE) | + LS_64(1, I40IW_CQPSQ_CQ_CHKOVERFLOW) | + LS_64(1, I40IW_CQPSQ_CQ_ENCEQEMASK) | + LS_64(1, I40IW_CQPSQ_CQ_CEQIDVALID) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(dev, I40IW_DEBUG_PUDA, "PUDA CQE", + wqe, I40IW_CQP_WQE_SIZE * 8); + + i40iw_sc_cqp_post_sq(dev->cqp); + status = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp, + I40IW_CQP_OP_CREATE_CQ, + &compl_info); + return status; +} + +/** + * i40iw_puda_cq_create - create cq for resource + * @rsrc: resource for which cq to create + */ +static enum i40iw_status_code i40iw_puda_cq_create(struct i40iw_puda_rsrc *rsrc) +{ + struct i40iw_sc_dev *dev = rsrc->dev; + struct i40iw_sc_cq *cq = &rsrc->cq; + enum i40iw_status_code ret = 0; + u32 tsize, cqsize; + struct i40iw_dma_mem *mem; + struct i40iw_cq_init_info info; + struct i40iw_cq_uk_init_info *init_info = &info.cq_uk_init_info; + + cq->vsi = rsrc->vsi; + cqsize = rsrc->cq_size * (sizeof(struct i40iw_cqe)); + tsize = cqsize + sizeof(struct i40iw_cq_shadow_area); + ret = i40iw_allocate_dma_mem(dev->hw, &rsrc->cqmem, tsize, + I40IW_CQ0_ALIGNMENT); + if (ret) + return ret; + + mem = &rsrc->cqmem; + memset(&info, 0, sizeof(info)); + info.dev = dev; + info.type = (rsrc->type == I40IW_PUDA_RSRC_TYPE_ILQ) ? 
+ I40IW_CQ_TYPE_ILQ : I40IW_CQ_TYPE_IEQ; + info.shadow_read_threshold = rsrc->cq_size >> 2; + info.ceq_id_valid = true; + info.cq_base_pa = mem->pa; + info.shadow_area_pa = mem->pa + cqsize; + init_info->cq_base = mem->va; + init_info->shadow_area = (u64 *)((u8 *)mem->va + cqsize); + init_info->cq_size = rsrc->cq_size; + init_info->cq_id = rsrc->cq_id; + info.ceqe_mask = true; + info.ceq_id_valid = true; + ret = dev->iw_priv_cq_ops->cq_init(cq, &info); + if (ret) + goto error; + if (rsrc->dev->ceq_valid) + ret = i40iw_cqp_cq_create_cmd(dev, cq); + else + ret = i40iw_puda_cq_wqe(dev, cq); +error: + if (ret) + i40iw_free_dma_mem(dev->hw, &rsrc->cqmem); + return ret; +} + +/** + * i40iw_puda_free_qp - free qp for resource + * @rsrc: resource for which qp to free + */ +static void i40iw_puda_free_qp(struct i40iw_puda_rsrc *rsrc) +{ + enum i40iw_status_code ret; + struct i40iw_ccq_cqe_info compl_info; + struct i40iw_sc_dev *dev = rsrc->dev; + + if (rsrc->dev->ceq_valid) { + i40iw_cqp_qp_destroy_cmd(dev, &rsrc->qp); + return; + } + + ret = dev->iw_priv_qp_ops->qp_destroy(&rsrc->qp, + 0, false, true, true); + if (ret) + i40iw_debug(dev, I40IW_DEBUG_PUDA, + "%s error puda qp destroy wqe\n", + __func__); + + if (!ret) { + ret = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp, + I40IW_CQP_OP_DESTROY_QP, + &compl_info); + if (ret) + i40iw_debug(dev, I40IW_DEBUG_PUDA, + "%s error puda qp destroy failed\n", + __func__); + } +} + +/** + * i40iw_puda_free_cq - free cq for resource + * @rsrc: resource for which cq to free + */ +static void i40iw_puda_free_cq(struct i40iw_puda_rsrc *rsrc) +{ + enum i40iw_status_code ret; + struct i40iw_ccq_cqe_info compl_info; + struct i40iw_sc_dev *dev = rsrc->dev; + + if (rsrc->dev->ceq_valid) { + i40iw_cqp_cq_destroy_cmd(dev, &rsrc->cq); + return; + } + ret = dev->iw_priv_cq_ops->cq_destroy(&rsrc->cq, 0, true); + + if (ret) + i40iw_debug(dev, I40IW_DEBUG_PUDA, + "%s error ieq cq destroy\n", + __func__); + + if (!ret) { + ret = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp, + I40IW_CQP_OP_DESTROY_CQ, + &compl_info); + if (ret) + i40iw_debug(dev, I40IW_DEBUG_PUDA, + "%s error ieq qp destroy done\n", + __func__); + } +} + +/** + * i40iw_puda_dele_resources - delete all resources during close + * @dev: iwarp device + * @type: type of resource to dele + * @reset: true if reset chip + */ +void i40iw_puda_dele_resources(struct i40iw_sc_vsi *vsi, + enum puda_resource_type type, + bool reset) +{ + struct i40iw_sc_dev *dev = vsi->dev; + struct i40iw_puda_rsrc *rsrc; + struct i40iw_puda_buf *buf = NULL; + struct i40iw_puda_buf *nextbuf = NULL; + struct i40iw_virt_mem *vmem; + + switch (type) { + case I40IW_PUDA_RSRC_TYPE_ILQ: + rsrc = vsi->ilq; + vmem = &vsi->ilq_mem; + break; + case I40IW_PUDA_RSRC_TYPE_IEQ: + rsrc = vsi->ieq; + vmem = &vsi->ieq_mem; + break; + default: + i40iw_debug(dev, I40IW_DEBUG_PUDA, "%s: error resource type = 0x%x\n", + __func__, type); + return; + } + + switch (rsrc->completion) { + case PUDA_HASH_CRC_COMPLETE: + i40iw_free_hash_desc(rsrc->hash_desc); + /* fall through */ + case PUDA_QP_CREATED: + if (!reset) + i40iw_puda_free_qp(rsrc); + + i40iw_free_dma_mem(dev->hw, &rsrc->qpmem); + /* fallthrough */ + case PUDA_CQ_CREATED: + if (!reset) + i40iw_puda_free_cq(rsrc); + + i40iw_free_dma_mem(dev->hw, &rsrc->cqmem); + break; + default: + i40iw_debug(rsrc->dev, I40IW_DEBUG_PUDA, "%s error no resources\n", __func__); + break; + } + /* Free all allocated puda buffers for both tx and rx */ + buf = rsrc->alloclist; + while (buf) { + nextbuf = buf->next; + 
i40iw_puda_dele_buf(dev, buf); + buf = nextbuf; + rsrc->alloc_buf_count--; + } + i40iw_free_virt_mem(dev->hw, vmem); +} + +/** + * i40iw_puda_allocbufs - allocate buffers for resource + * @rsrc: resource for buffer allocation + * @count: number of buffers to create + */ +static enum i40iw_status_code i40iw_puda_allocbufs(struct i40iw_puda_rsrc *rsrc, + u32 count) +{ + u32 i; + struct i40iw_puda_buf *buf; + struct i40iw_puda_buf *nextbuf; + + for (i = 0; i < count; i++) { + buf = i40iw_puda_alloc_buf(rsrc->dev, rsrc->buf_size); + if (!buf) { + rsrc->stats_buf_alloc_fail++; + return I40IW_ERR_NO_MEMORY; + } + i40iw_puda_ret_bufpool(rsrc, buf); + rsrc->alloc_buf_count++; + if (!rsrc->alloclist) { + rsrc->alloclist = buf; + } else { + nextbuf = rsrc->alloclist; + rsrc->alloclist = buf; + buf->next = nextbuf; + } + } + rsrc->avail_buf_count = rsrc->alloc_buf_count; + return 0; +} + +/** + * i40iw_puda_create_rsrc - create resouce (ilq or ieq) + * @dev: iwarp device + * @info: resource information + */ +enum i40iw_status_code i40iw_puda_create_rsrc(struct i40iw_sc_vsi *vsi, + struct i40iw_puda_rsrc_info *info) +{ + struct i40iw_sc_dev *dev = vsi->dev; + enum i40iw_status_code ret = 0; + struct i40iw_puda_rsrc *rsrc; + u32 pudasize; + u32 sqwridsize, rqwridsize; + struct i40iw_virt_mem *vmem; + + info->count = 1; + pudasize = sizeof(struct i40iw_puda_rsrc); + sqwridsize = info->sq_size * sizeof(struct i40iw_sq_uk_wr_trk_info); + rqwridsize = info->rq_size * 8; + switch (info->type) { + case I40IW_PUDA_RSRC_TYPE_ILQ: + vmem = &vsi->ilq_mem; + break; + case I40IW_PUDA_RSRC_TYPE_IEQ: + vmem = &vsi->ieq_mem; + break; + default: + return I40IW_NOT_SUPPORTED; + } + ret = + i40iw_allocate_virt_mem(dev->hw, vmem, + pudasize + sqwridsize + rqwridsize); + if (ret) + return ret; + rsrc = (struct i40iw_puda_rsrc *)vmem->va; + spin_lock_init(&rsrc->bufpool_lock); + if (info->type == I40IW_PUDA_RSRC_TYPE_ILQ) { + vsi->ilq = (struct i40iw_puda_rsrc *)vmem->va; + vsi->ilq_count = info->count; + rsrc->receive = info->receive; + rsrc->xmit_complete = info->xmit_complete; + } else { + vmem = &vsi->ieq_mem; + vsi->ieq_count = info->count; + vsi->ieq = (struct i40iw_puda_rsrc *)vmem->va; + rsrc->receive = i40iw_ieq_receive; + rsrc->xmit_complete = i40iw_ieq_tx_compl; + } + + rsrc->type = info->type; + rsrc->sq_wrtrk_array = (struct i40iw_sq_uk_wr_trk_info *)((u8 *)vmem->va + pudasize); + rsrc->rq_wrid_array = (u64 *)((u8 *)vmem->va + pudasize + sqwridsize); + /* Initialize all ieq lists */ + INIT_LIST_HEAD(&rsrc->bufpool); + INIT_LIST_HEAD(&rsrc->txpend); + + rsrc->tx_wqe_avail_cnt = info->sq_size - 1; + dev->iw_pd_ops->pd_init(dev, &rsrc->sc_pd, info->pd_id, -1); + rsrc->qp_id = info->qp_id; + rsrc->cq_id = info->cq_id; + rsrc->sq_size = info->sq_size; + rsrc->rq_size = info->rq_size; + rsrc->cq_size = info->rq_size + info->sq_size; + rsrc->buf_size = info->buf_size; + rsrc->dev = dev; + rsrc->vsi = vsi; + + ret = i40iw_puda_cq_create(rsrc); + if (!ret) { + rsrc->completion = PUDA_CQ_CREATED; + ret = i40iw_puda_qp_create(rsrc); + } + if (ret) { + i40iw_debug(dev, I40IW_DEBUG_PUDA, "[%s] error qp_create\n", + __func__); + goto error; + } + rsrc->completion = PUDA_QP_CREATED; + + ret = i40iw_puda_allocbufs(rsrc, info->tx_buf_cnt + info->rq_size); + if (ret) { + i40iw_debug(dev, I40IW_DEBUG_PUDA, "[%s] error alloc_buf\n", + __func__); + goto error; + } + + rsrc->rxq_invalid_cnt = info->rq_size; + ret = i40iw_puda_replenish_rq(rsrc, true); + if (ret) + goto error; + + if (info->type == I40IW_PUDA_RSRC_TYPE_IEQ) { + if 
(!i40iw_init_hash_desc(&rsrc->hash_desc)) { + rsrc->check_crc = true; + rsrc->completion = PUDA_HASH_CRC_COMPLETE; + ret = 0; + } + } + + dev->ccq_ops->ccq_arm(&rsrc->cq); + return ret; + error: + i40iw_puda_dele_resources(vsi, info->type, false); + + return ret; +} + +/** + * i40iw_ilq_putback_rcvbuf - ilq buffer to put back on rq + * @qp: ilq's qp resource + * @wqe_idx: wqe index of completed rcvbuf + */ +static void i40iw_ilq_putback_rcvbuf(struct i40iw_sc_qp *qp, u32 wqe_idx) +{ + u64 *wqe; + u64 offset24; + + wqe = qp->qp_uk.rq_base[wqe_idx].elem; + get_64bit_val(wqe, 24, &offset24); + offset24 = (offset24) ? 0 : LS_64(1, I40IWQPSQ_VALID); + set_64bit_val(wqe, 24, offset24); +} + +/** + * i40iw_ieq_get_fpdu - given length return fpdu length + * @length: length if fpdu + */ +static u16 i40iw_ieq_get_fpdu_length(u16 length) +{ + u16 fpdu_len; + + fpdu_len = length + I40IW_IEQ_MPA_FRAMING; + fpdu_len = (fpdu_len + 3) & 0xfffffffc; + return fpdu_len; +} + +/** + * i40iw_ieq_copy_to_txbuf - copydata from rcv buf to tx buf + * @buf: rcv buffer with partial + * @txbuf: tx buffer for sendign back + * @buf_offset: rcv buffer offset to copy from + * @txbuf_offset: at offset in tx buf to copy + * @length: length of data to copy + */ +static void i40iw_ieq_copy_to_txbuf(struct i40iw_puda_buf *buf, + struct i40iw_puda_buf *txbuf, + u16 buf_offset, u32 txbuf_offset, + u32 length) +{ + void *mem1 = (u8 *)buf->mem.va + buf_offset; + void *mem2 = (u8 *)txbuf->mem.va + txbuf_offset; + + memcpy(mem2, mem1, length); +} + +/** + * i40iw_ieq_setup_tx_buf - setup tx buffer for partial handling + * @buf: reeive buffer with partial + * @txbuf: buffer to prepare + */ +static void i40iw_ieq_setup_tx_buf(struct i40iw_puda_buf *buf, + struct i40iw_puda_buf *txbuf) +{ + txbuf->maclen = buf->maclen; + txbuf->tcphlen = buf->tcphlen; + txbuf->ipv4 = buf->ipv4; + txbuf->hdrlen = buf->hdrlen; + i40iw_ieq_copy_to_txbuf(buf, txbuf, 0, 0, buf->hdrlen); +} + +/** + * i40iw_ieq_check_first_buf - check if rcv buffer's seq is in range + * @buf: receive exception buffer + * @fps: first partial sequence number + */ +static void i40iw_ieq_check_first_buf(struct i40iw_puda_buf *buf, u32 fps) +{ + u32 offset; + + if (buf->seqnum < fps) { + offset = fps - buf->seqnum; + if (offset > buf->datalen) + return; + buf->data += offset; + buf->datalen -= (u16)offset; + buf->seqnum = fps; + } +} + +/** + * i40iw_ieq_compl_pfpdu - write txbuf with full fpdu + * @ieq: ieq resource + * @rxlist: ieq's received buffer list + * @pbufl: temporary list for buffers for fpddu + * @txbuf: tx buffer for fpdu + * @fpdu_len: total length of fpdu + */ +static void i40iw_ieq_compl_pfpdu(struct i40iw_puda_rsrc *ieq, + struct list_head *rxlist, + struct list_head *pbufl, + struct i40iw_puda_buf *txbuf, + u16 fpdu_len) +{ + struct i40iw_puda_buf *buf; + u32 nextseqnum; + u16 txoffset, bufoffset; + + buf = i40iw_puda_get_listbuf(pbufl); + if (!buf) + return; + nextseqnum = buf->seqnum + fpdu_len; + txbuf->totallen = buf->hdrlen + fpdu_len; + txbuf->data = (u8 *)txbuf->mem.va + buf->hdrlen; + i40iw_ieq_setup_tx_buf(buf, txbuf); + + txoffset = buf->hdrlen; + bufoffset = (u16)(buf->data - (u8 *)buf->mem.va); + + do { + if (buf->datalen >= fpdu_len) { + /* copied full fpdu */ + i40iw_ieq_copy_to_txbuf(buf, txbuf, bufoffset, txoffset, fpdu_len); + buf->datalen -= fpdu_len; + buf->data += fpdu_len; + buf->seqnum = nextseqnum; + break; + } + /* copy partial fpdu */ + i40iw_ieq_copy_to_txbuf(buf, txbuf, bufoffset, txoffset, buf->datalen); + txoffset += buf->datalen; + 
fpdu_len -= buf->datalen; + i40iw_puda_ret_bufpool(ieq, buf); + buf = i40iw_puda_get_listbuf(pbufl); + if (!buf) + return; + bufoffset = (u16)(buf->data - (u8 *)buf->mem.va); + } while (1); + + /* last buffer on the list*/ + if (buf->datalen) + list_add(&buf->list, rxlist); + else + i40iw_puda_ret_bufpool(ieq, buf); +} + +/** + * i40iw_ieq_create_pbufl - create buffer list for single fpdu + * @rxlist: resource list for receive ieq buffes + * @pbufl: temp. list for buffers for fpddu + * @buf: first receive buffer + * @fpdu_len: total length of fpdu + */ +static enum i40iw_status_code i40iw_ieq_create_pbufl( + struct i40iw_pfpdu *pfpdu, + struct list_head *rxlist, + struct list_head *pbufl, + struct i40iw_puda_buf *buf, + u16 fpdu_len) +{ + enum i40iw_status_code status = 0; + struct i40iw_puda_buf *nextbuf; + u32 nextseqnum; + u16 plen = fpdu_len - buf->datalen; + bool done = false; + + nextseqnum = buf->seqnum + buf->datalen; + do { + nextbuf = i40iw_puda_get_listbuf(rxlist); + if (!nextbuf) { + status = I40IW_ERR_list_empty; + break; + } + list_add_tail(&nextbuf->list, pbufl); + if (nextbuf->seqnum != nextseqnum) { + pfpdu->bad_seq_num++; + status = I40IW_ERR_SEQ_NUM; + break; + } + if (nextbuf->datalen >= plen) { + done = true; + } else { + plen -= nextbuf->datalen; + nextseqnum = nextbuf->seqnum + nextbuf->datalen; + } + + } while (!done); + + return status; +} + +/** + * i40iw_ieq_handle_partial - process partial fpdu buffer + * @ieq: ieq resource + * @pfpdu: partial management per user qp + * @buf: receive buffer + * @fpdu_len: fpdu len in the buffer + */ +static enum i40iw_status_code i40iw_ieq_handle_partial(struct i40iw_puda_rsrc *ieq, + struct i40iw_pfpdu *pfpdu, + struct i40iw_puda_buf *buf, + u16 fpdu_len) +{ + enum i40iw_status_code status = 0; + u8 *crcptr; + u32 mpacrc; + u32 seqnum = buf->seqnum; + struct list_head pbufl; /* partial buffer list */ + struct i40iw_puda_buf *txbuf = NULL; + struct list_head *rxlist = &pfpdu->rxlist; + + INIT_LIST_HEAD(&pbufl); + list_add(&buf->list, &pbufl); + + status = i40iw_ieq_create_pbufl(pfpdu, rxlist, &pbufl, buf, fpdu_len); + if (status) + goto error; + + txbuf = i40iw_puda_get_bufpool(ieq); + if (!txbuf) { + pfpdu->no_tx_bufs++; + status = I40IW_ERR_NO_TXBUFS; + goto error; + } + + i40iw_ieq_compl_pfpdu(ieq, rxlist, &pbufl, txbuf, fpdu_len); + i40iw_ieq_update_tcpip_info(txbuf, fpdu_len, seqnum); + crcptr = txbuf->data + fpdu_len - 4; + mpacrc = *(u32 *)crcptr; + if (ieq->check_crc) { + status = i40iw_ieq_check_mpacrc(ieq->hash_desc, txbuf->data, + (fpdu_len - 4), mpacrc); + if (status) { + i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ, + "%s: error bad crc\n", __func__); + goto error; + } + } + + i40iw_debug_buf(ieq->dev, I40IW_DEBUG_IEQ, "IEQ TX BUFFER", + txbuf->mem.va, txbuf->totallen); + i40iw_puda_send_buf(ieq, txbuf); + pfpdu->rcv_nxt = seqnum + fpdu_len; + return status; + error: + while (!list_empty(&pbufl)) { + buf = (struct i40iw_puda_buf *)(pbufl.prev); + list_del(&buf->list); + list_add(&buf->list, rxlist); + } + if (txbuf) + i40iw_puda_ret_bufpool(ieq, txbuf); + return status; +} + +/** + * i40iw_ieq_process_buf - process buffer rcvd for ieq + * @ieq: ieq resource + * @pfpdu: partial management per user qp + * @buf: receive buffer + */ +static enum i40iw_status_code i40iw_ieq_process_buf(struct i40iw_puda_rsrc *ieq, + struct i40iw_pfpdu *pfpdu, + struct i40iw_puda_buf *buf) +{ + u16 fpdu_len = 0; + u16 datalen = buf->datalen; + u8 *datap = buf->data; + u8 *crcptr; + u16 ioffset = 0; + u32 mpacrc; + u32 seqnum = buf->seqnum; + u16 
length = 0; + u16 full = 0; + bool partial = false; + struct i40iw_puda_buf *txbuf; + struct list_head *rxlist = &pfpdu->rxlist; + enum i40iw_status_code ret = 0; + enum i40iw_status_code status = 0; + + ioffset = (u16)(buf->data - (u8 *)buf->mem.va); + while (datalen) { + fpdu_len = i40iw_ieq_get_fpdu_length(ntohs(*(__be16 *)datap)); + if (fpdu_len > pfpdu->max_fpdu_data) { + i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ, + "%s: error bad fpdu_len\n", __func__); + status = I40IW_ERR_MPA_CRC; + list_add(&buf->list, rxlist); + return status; + } + + if (datalen < fpdu_len) { + partial = true; + break; + } + crcptr = datap + fpdu_len - 4; + mpacrc = *(u32 *)crcptr; + if (ieq->check_crc) + ret = i40iw_ieq_check_mpacrc(ieq->hash_desc, + datap, fpdu_len - 4, mpacrc); + if (ret) { + status = I40IW_ERR_MPA_CRC; + list_add(&buf->list, rxlist); + return status; + } + full++; + pfpdu->fpdu_processed++; + datap += fpdu_len; + length += fpdu_len; + datalen -= fpdu_len; + } + if (full) { + /* copy full pdu's in the txbuf and send them out */ + txbuf = i40iw_puda_get_bufpool(ieq); + if (!txbuf) { + pfpdu->no_tx_bufs++; + status = I40IW_ERR_NO_TXBUFS; + list_add(&buf->list, rxlist); + return status; + } + /* modify txbuf's buffer header */ + i40iw_ieq_setup_tx_buf(buf, txbuf); + /* copy full fpdu's to new buffer */ + i40iw_ieq_copy_to_txbuf(buf, txbuf, ioffset, buf->hdrlen, + length); + txbuf->totallen = buf->hdrlen + length; + + i40iw_ieq_update_tcpip_info(txbuf, length, buf->seqnum); + i40iw_puda_send_buf(ieq, txbuf); + + if (!datalen) { + pfpdu->rcv_nxt = buf->seqnum + length; + i40iw_puda_ret_bufpool(ieq, buf); + return status; + } + buf->data = datap; + buf->seqnum = seqnum + length; + buf->datalen = datalen; + pfpdu->rcv_nxt = buf->seqnum; + } + if (partial) + status = i40iw_ieq_handle_partial(ieq, pfpdu, buf, fpdu_len); + + return status; +} + +/** + * i40iw_ieq_process_fpdus - process fpdu's buffers on its list + * @qp: qp for which partial fpdus + * @ieq: ieq resource + */ +static void i40iw_ieq_process_fpdus(struct i40iw_sc_qp *qp, + struct i40iw_puda_rsrc *ieq) +{ + struct i40iw_pfpdu *pfpdu = &qp->pfpdu; + struct list_head *rxlist = &pfpdu->rxlist; + struct i40iw_puda_buf *buf; + enum i40iw_status_code status; + + do { + if (list_empty(rxlist)) + break; + buf = i40iw_puda_get_listbuf(rxlist); + if (!buf) { + i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ, + "%s: error no buf\n", __func__); + break; + } + if (buf->seqnum != pfpdu->rcv_nxt) { + /* This could be out of order or missing packet */ + pfpdu->out_of_order++; + list_add(&buf->list, rxlist); + break; + } + /* keep processing buffers from the head of the list */ + status = i40iw_ieq_process_buf(ieq, pfpdu, buf); + if (status == I40IW_ERR_MPA_CRC) { + pfpdu->mpa_crc_err = true; + while (!list_empty(rxlist)) { + buf = i40iw_puda_get_listbuf(rxlist); + i40iw_puda_ret_bufpool(ieq, buf); + pfpdu->crc_err++; + } + /* create CQP for AE */ + i40iw_ieq_mpa_crc_ae(ieq->dev, qp); + } + } while (!status); +} + +/** + * i40iw_ieq_handle_exception - handle qp's exception + * @ieq: ieq resource + * @qp: qp receiving excpetion + * @buf: receive buffer + */ +static void i40iw_ieq_handle_exception(struct i40iw_puda_rsrc *ieq, + struct i40iw_sc_qp *qp, + struct i40iw_puda_buf *buf) +{ + struct i40iw_puda_buf *tmpbuf = NULL; + struct i40iw_pfpdu *pfpdu = &qp->pfpdu; + u32 *hw_host_ctx = (u32 *)qp->hw_host_ctx; + u32 rcv_wnd = hw_host_ctx[23]; + /* first partial seq # in q2 */ + u32 fps = *(u32 *)(qp->q2_buf + Q2_FPSN_OFFSET); + struct list_head *rxlist = &pfpdu->rxlist; + 
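+ /* plist is used below to walk rxlist so exception buffers stay ordered by sequence number */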
struct list_head *plist; + + pfpdu->total_ieq_bufs++; + + if (pfpdu->mpa_crc_err) { + pfpdu->crc_err++; + goto error; + } + if (pfpdu->mode && (fps != pfpdu->fps)) { + /* clean up qp as it is new partial sequence */ + i40iw_ieq_cleanup_qp(ieq, qp); + i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ, + "%s: restarting new partial\n", __func__); + pfpdu->mode = false; + } + + if (!pfpdu->mode) { + i40iw_debug_buf(ieq->dev, I40IW_DEBUG_IEQ, "Q2 BUFFER", (u64 *)qp->q2_buf, 128); + /* First_Partial_Sequence_Number check */ + pfpdu->rcv_nxt = fps; + pfpdu->fps = fps; + pfpdu->mode = true; + pfpdu->max_fpdu_data = (buf->ipv4) ? (ieq->vsi->mtu - I40IW_MTU_TO_MSS_IPV4) : + (ieq->vsi->mtu - I40IW_MTU_TO_MSS_IPV6); + pfpdu->pmode_count++; + INIT_LIST_HEAD(rxlist); + i40iw_ieq_check_first_buf(buf, fps); + } + + if (!(rcv_wnd >= (buf->seqnum - pfpdu->rcv_nxt))) { + pfpdu->bad_seq_num++; + goto error; + } + + if (!list_empty(rxlist)) { + tmpbuf = (struct i40iw_puda_buf *)rxlist->next; + while ((struct list_head *)tmpbuf != rxlist) { + if ((int)(buf->seqnum - tmpbuf->seqnum) < 0) + break; + plist = &tmpbuf->list; + tmpbuf = (struct i40iw_puda_buf *)plist->next; + } + /* Insert buf before tmpbuf */ + list_add_tail(&buf->list, &tmpbuf->list); + } else { + list_add_tail(&buf->list, rxlist); + } + i40iw_ieq_process_fpdus(qp, ieq); + return; + error: + i40iw_puda_ret_bufpool(ieq, buf); +} + +/** + * i40iw_ieq_receive - received exception buffer + * @dev: iwarp device + * @buf: exception buffer received + */ +static void i40iw_ieq_receive(struct i40iw_sc_vsi *vsi, + struct i40iw_puda_buf *buf) +{ + struct i40iw_puda_rsrc *ieq = vsi->ieq; + struct i40iw_sc_qp *qp = NULL; + u32 wqe_idx = ieq->compl_rxwqe_idx; + + qp = i40iw_ieq_get_qp(vsi->dev, buf); + if (!qp) { + ieq->stats_bad_qp_id++; + i40iw_puda_ret_bufpool(ieq, buf); + } else { + i40iw_ieq_handle_exception(ieq, qp, buf); + } + /* + * ieq->rx_wqe_idx is used by i40iw_puda_replenish_rq() + * on which wqe_idx to start replenish rq + */ + if (!ieq->rxq_invalid_cnt) + ieq->rx_wqe_idx = wqe_idx; + ieq->rxq_invalid_cnt++; +} + +/** + * i40iw_ieq_tx_compl - put back after sending completed exception buffer + * @vsi: pointer to the vsi structure + * @sqwrid: pointer to puda buffer + */ +static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid) +{ + struct i40iw_puda_rsrc *ieq = vsi->ieq; + struct i40iw_puda_buf *buf = (struct i40iw_puda_buf *)sqwrid; + + i40iw_puda_ret_bufpool(ieq, buf); +} + +/** + * i40iw_ieq_cleanup_qp - qp is being destroyed + * @ieq: ieq resource + * @qp: all pending fpdu buffers + */ +void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp) +{ + struct i40iw_puda_buf *buf; + struct i40iw_pfpdu *pfpdu = &qp->pfpdu; + struct list_head *rxlist = &pfpdu->rxlist; + + if (!pfpdu->mode) + return; + while (!list_empty(rxlist)) { + buf = i40iw_puda_get_listbuf(rxlist); + i40iw_puda_ret_bufpool(ieq, buf); + } +} diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.h b/drivers/infiniband/hw/i40iw/i40iw_puda.h new file mode 100644 index 0000000..53a7d58 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.h @@ -0,0 +1,188 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. 
You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_PUDA_H +#define I40IW_PUDA_H + +#define I40IW_IEQ_MPA_FRAMING 6 + +struct i40iw_sc_dev; +struct i40iw_sc_qp; +struct i40iw_sc_cq; + +enum puda_resource_type { + I40IW_PUDA_RSRC_TYPE_ILQ = 1, + I40IW_PUDA_RSRC_TYPE_IEQ +}; + +enum puda_rsrc_complete { + PUDA_CQ_CREATED = 1, + PUDA_QP_CREATED, + PUDA_TX_COMPLETE, + PUDA_RX_COMPLETE, + PUDA_HASH_CRC_COMPLETE +}; + +struct i40iw_puda_completion_info { + struct i40iw_qp_uk *qp; + u8 q_type; + u8 vlan_valid; + u8 l3proto; + u8 l4proto; + u16 payload_len; + u32 compl_error; /* No_err=0, else major and minor err code */ + u32 qp_id; + u32 wqe_idx; +}; + +struct i40iw_puda_send_info { + u64 paddr; /* Physical address */ + u32 len; + u8 tcplen; + u8 maclen; + bool ipv4; + bool doloopback; + void *scratch; +}; + +struct i40iw_puda_buf { + struct list_head list; /* MUST be first entry */ + struct i40iw_dma_mem mem; /* DMA memory for the buffer */ + struct i40iw_puda_buf *next; /* for alloclist in rsrc struct */ + struct i40iw_virt_mem buf_mem; /* Buffer memory for this buffer */ + void *scratch; + u8 *iph; + u8 *tcph; + u8 *data; + u16 datalen; + u16 vlan_id; + u8 tcphlen; /* tcp length in bytes */ + u8 maclen; /* mac length in bytes */ + u32 totallen; /* machlen+iphlen+tcphlen+datalen */ + atomic_t refcount; + u8 hdrlen; + bool ipv4; + u32 seqnum; +}; + +struct i40iw_puda_rsrc_info { + enum puda_resource_type type; /* ILQ or IEQ */ + u32 count; + u16 pd_id; + u32 cq_id; + u32 qp_id; + u32 sq_size; + u32 rq_size; + u16 buf_size; + u16 mss; + u32 tx_buf_cnt; /* total bufs allocated will be rq_size + tx_buf_cnt */ + void (*receive)(struct i40iw_sc_vsi *, struct i40iw_puda_buf *); + void (*xmit_complete)(struct i40iw_sc_vsi *, void *); +}; + +struct i40iw_puda_rsrc { + struct i40iw_sc_cq cq; + struct i40iw_sc_qp qp; + struct i40iw_sc_pd sc_pd; + struct i40iw_sc_dev *dev; + struct i40iw_sc_vsi *vsi; + struct i40iw_dma_mem cqmem; + struct i40iw_dma_mem qpmem; + struct i40iw_virt_mem ilq_mem; + enum puda_rsrc_complete completion; + enum puda_resource_type type; + u16 buf_size; /*buffer must be max datalen + tcpip hdr + mac */ + u16 mss; + u32 cq_id; + u32 qp_id; + u32 sq_size; + u32 rq_size; + u32 cq_size; + struct i40iw_sq_uk_wr_trk_info *sq_wrtrk_array; 
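+ /* one entry per RQ WQE; holds the i40iw_puda_buf posted at that index */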
+ u64 *rq_wrid_array; + u32 compl_rxwqe_idx; + u32 rx_wqe_idx; + u32 rxq_invalid_cnt; + u32 tx_wqe_avail_cnt; + bool check_crc; + struct shash_desc *hash_desc; + struct list_head txpend; + struct list_head bufpool; /* free buffers pool list for recv and xmit */ + u32 alloc_buf_count; + u32 avail_buf_count; /* snapshot of currently available buffers */ + spinlock_t bufpool_lock; + struct i40iw_puda_buf *alloclist; + void (*receive)(struct i40iw_sc_vsi *, struct i40iw_puda_buf *); + void (*xmit_complete)(struct i40iw_sc_vsi *, void *); + /* puda stats */ + u64 stats_buf_alloc_fail; + u64 stats_pkt_rcvd; + u64 stats_pkt_sent; + u64 stats_rcvd_pkt_err; + u64 stats_sent_pkt_q; + u64 stats_bad_qp_id; +}; + +struct i40iw_puda_buf *i40iw_puda_get_bufpool(struct i40iw_puda_rsrc *rsrc); +void i40iw_puda_ret_bufpool(struct i40iw_puda_rsrc *rsrc, + struct i40iw_puda_buf *buf); +void i40iw_puda_send_buf(struct i40iw_puda_rsrc *rsrc, + struct i40iw_puda_buf *buf); +enum i40iw_status_code i40iw_puda_send(struct i40iw_sc_qp *qp, + struct i40iw_puda_send_info *info); +enum i40iw_status_code i40iw_puda_create_rsrc(struct i40iw_sc_vsi *vsi, + struct i40iw_puda_rsrc_info *info); +void i40iw_puda_dele_resources(struct i40iw_sc_vsi *vsi, + enum puda_resource_type type, + bool reset); +enum i40iw_status_code i40iw_puda_poll_completion(struct i40iw_sc_dev *dev, + struct i40iw_sc_cq *cq, u32 *compl_err); + +struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev, + struct i40iw_puda_buf *buf); +enum i40iw_status_code i40iw_puda_get_tcpip_info(struct i40iw_puda_completion_info *info, + struct i40iw_puda_buf *buf); +enum i40iw_status_code i40iw_ieq_check_mpacrc(struct shash_desc *desc, + void *addr, u32 length, u32 value); +enum i40iw_status_code i40iw_init_hash_desc(struct shash_desc **desc); +void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp); +void i40iw_free_hash_desc(struct shash_desc *desc); +void i40iw_ieq_update_tcpip_info(struct i40iw_puda_buf *buf, u16 length, + u32 seqnum); +enum i40iw_status_code i40iw_cqp_qp_create_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp); +enum i40iw_status_code i40iw_cqp_cq_create_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq); +void i40iw_cqp_qp_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp); +void i40iw_cqp_cq_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq); +void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp); +#endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_register.h b/drivers/infiniband/hw/i40iw/i40iw_register.h new file mode 100644 index 0000000..5776818 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_register.h @@ -0,0 +1,1030 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. 
+* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_REGISTER_H +#define I40IW_REGISTER_H + +#define I40E_GLGEN_STAT 0x000B612C /* Reset: POR */ + +#define I40E_PFHMC_PDINV 0x000C0300 /* Reset: PFR */ +#define I40E_PFHMC_PDINV_PMSDIDX_SHIFT 0 +#define I40E_PFHMC_PDINV_PMSDIDX_MASK (0xFFF << I40E_PFHMC_PDINV_PMSDIDX_SHIFT) +#define I40E_PFHMC_PDINV_PMPDIDX_SHIFT 16 +#define I40E_PFHMC_PDINV_PMPDIDX_MASK (0x1FF << I40E_PFHMC_PDINV_PMPDIDX_SHIFT) +#define I40E_PFHMC_SDCMD_PMSDWR_SHIFT 31 +#define I40E_PFHMC_SDCMD_PMSDWR_MASK (0x1 << I40E_PFHMC_SDCMD_PMSDWR_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT 0 +#define I40E_PFHMC_SDDATALOW_PMSDVALID_MASK (0x1 << I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT 1 +#define I40E_PFHMC_SDDATALOW_PMSDTYPE_MASK (0x1 << I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT 2 +#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_MASK (0x3FF << I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) + +#define I40E_PFINT_DYN_CTLN(_INTPF) (0x00034800 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: PFR */ +#define I40E_PFINT_DYN_CTLN_INTENA_SHIFT 0 +#define I40E_PFINT_DYN_CTLN_INTENA_MASK (0x1 << I40E_PFINT_DYN_CTLN_INTENA_SHIFT) +#define I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT 1 +#define I40E_PFINT_DYN_CTLN_CLEARPBA_MASK (0x1 << I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT) +#define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT 3 +#define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK (0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) + +#define I40E_VFINT_DYN_CTLN1(_INTVF) (0x00003800 + ((_INTVF) * 4)) /* _i=0...15 */ /* Reset: VFR */ +#define I40E_GLHMC_VFPDINV(_i) (0x000C8300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ + +#define I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT 15 +#define I40E_PFHMC_PDINV_PMSDPARTSEL_MASK (0x1 << I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT) +#define I40E_GLPCI_LBARCTRL 0x000BE484 /* Reset: POR */ +#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT 4 +#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_MASK (0x3 << I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT) +#define I40E_GLPCI_DREVID 0x0009C480 /* Reset: PCIR */ +#define I40E_GLPCI_DREVID_DEFAULT_REVID_SHIFT 0 +#define I40E_GLPCI_DREVID_DEFAULT_REVID_MASK 0xFF + +#define I40E_PFPE_AEQALLOC 0x00131180 /* Reset: PFR */ +#define I40E_PFPE_AEQALLOC_AECOUNT_SHIFT 0 +#define I40E_PFPE_AEQALLOC_AECOUNT_MASK (0xFFFFFFFF << I40E_PFPE_AEQALLOC_AECOUNT_SHIFT) +#define I40E_PFPE_CCQPHIGH 0x00008200 /* Reset: PFR */ +#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0 +#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_MASK (0xFFFFFFFF << I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT) +#define I40E_PFPE_CCQPLOW 0x00008180 /* Reset: PFR */ +#define I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT 0 +#define I40E_PFPE_CCQPLOW_PECCQPLOW_MASK (0xFFFFFFFF << I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT) +#define I40E_PFPE_CCQPSTATUS 0x00008100 /* 
Reset: PFR */ +#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT 0 +#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_MASK (0x1 << I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT) +#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4 +#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_MASK (0x7 << I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT) +#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16 +#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_MASK (0x3F << I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT) +#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT 31 +#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_MASK (0x1 << I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT) +#define I40E_PFPE_CQACK 0x00131100 /* Reset: PFR */ +#define I40E_PFPE_CQACK_PECQID_SHIFT 0 +#define I40E_PFPE_CQACK_PECQID_MASK (0x1FFFF << I40E_PFPE_CQACK_PECQID_SHIFT) +#define I40E_PFPE_CQARM 0x00131080 /* Reset: PFR */ +#define I40E_PFPE_CQARM_PECQID_SHIFT 0 +#define I40E_PFPE_CQARM_PECQID_MASK (0x1FFFF << I40E_PFPE_CQARM_PECQID_SHIFT) +#define I40E_PFPE_CQPDB 0x00008000 /* Reset: PFR */ +#define I40E_PFPE_CQPDB_WQHEAD_SHIFT 0 +#define I40E_PFPE_CQPDB_WQHEAD_MASK (0x7FF << I40E_PFPE_CQPDB_WQHEAD_SHIFT) +#define I40E_PFPE_CQPERRCODES 0x00008880 /* Reset: PFR */ +#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0 +#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_MASK (0xFFFF << I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT) +#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK (0xFFFF << I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT) +#define I40E_PFPE_CQPTAIL 0x00008080 /* Reset: PFR */ +#define I40E_PFPE_CQPTAIL_WQTAIL_SHIFT 0 +#define I40E_PFPE_CQPTAIL_WQTAIL_MASK (0x7FF << I40E_PFPE_CQPTAIL_WQTAIL_SHIFT) +#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31 +#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_MASK (0x1 << I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT) +#define I40E_PFPE_FLMQ1ALLOCERR 0x00008980 /* Reset: PFR */ +#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_MASK (0xFFFF << I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_PFPE_FLMXMITALLOCERR 0x00008900 /* Reset: PFR */ +#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_MASK (0xFFFF << I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_PFPE_IPCONFIG0 0x00008280 /* Reset: PFR */ +#define I40E_PFPE_IPCONFIG0_PEIPID_SHIFT 0 +#define I40E_PFPE_IPCONFIG0_PEIPID_MASK (0xFFFF << I40E_PFPE_IPCONFIG0_PEIPID_SHIFT) +#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16 +#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_MASK (0x1 << I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT) +#define I40E_PFPE_MRTEIDXMASK 0x00008600 /* Reset: PFR */ +#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK (0x1F << I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT) +#define I40E_PFPE_RCVUNEXPECTEDERROR 0x00008680 /* Reset: PFR */ +#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK (0xFFFFFF << I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_PFPE_TCPNOWTIMER 0x00008580 /* Reset: PFR */ +#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0 +#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_MASK (0xFFFFFFFF << I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT) + +#define I40E_PFPE_WQEALLOC 0x00138C00 /* Reset: PFR */ +#define I40E_PFPE_WQEALLOC_PEQPID_SHIFT 0 +#define I40E_PFPE_WQEALLOC_PEQPID_MASK (0x3FFFF << I40E_PFPE_WQEALLOC_PEQPID_SHIFT) +#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20 +#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_MASK (0xFFF << 
I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT) + +#define I40E_VFPE_AEQALLOC(_VF) (0x00130C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_AEQALLOC_MAX_INDEX 127 +#define I40E_VFPE_AEQALLOC_AECOUNT_SHIFT 0 +#define I40E_VFPE_AEQALLOC_AECOUNT_MASK (0xFFFFFFFF << I40E_VFPE_AEQALLOC_AECOUNT_SHIFT) +#define I40E_VFPE_CCQPHIGH(_VF) (0x00001000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPHIGH_MAX_INDEX 127 +#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0 +#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_MASK (0xFFFFFFFF << I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT) +#define I40E_VFPE_CCQPLOW(_VF) (0x00000C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPLOW_MAX_INDEX 127 +#define I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT 0 +#define I40E_VFPE_CCQPLOW_PECCQPLOW_MASK (0xFFFFFFFF << I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT) +#define I40E_VFPE_CCQPSTATUS(_VF) (0x00000800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPSTATUS_MAX_INDEX 127 +#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT 0 +#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_MASK (0x1 << I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT) +#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4 +#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_MASK (0x7 << I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT) +#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16 +#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_MASK (0x3F << I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT) +#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT 31 +#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_MASK (0x1 << I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT) +#define I40E_VFPE_CQACK(_VF) (0x00130800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQACK_MAX_INDEX 127 +#define I40E_VFPE_CQACK_PECQID_SHIFT 0 +#define I40E_VFPE_CQACK_PECQID_MASK (0x1FFFF << I40E_VFPE_CQACK_PECQID_SHIFT) +#define I40E_VFPE_CQARM(_VF) (0x00130400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQARM_MAX_INDEX 127 +#define I40E_VFPE_CQARM_PECQID_SHIFT 0 +#define I40E_VFPE_CQARM_PECQID_MASK (0x1FFFF << I40E_VFPE_CQARM_PECQID_SHIFT) +#define I40E_VFPE_CQPDB(_VF) (0x00000000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPDB_MAX_INDEX 127 +#define I40E_VFPE_CQPDB_WQHEAD_SHIFT 0 +#define I40E_VFPE_CQPDB_WQHEAD_MASK (0x7FF << I40E_VFPE_CQPDB_WQHEAD_SHIFT) +#define I40E_VFPE_CQPERRCODES(_VF) (0x00001800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPERRCODES_MAX_INDEX 127 +#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0 +#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_MASK (0xFFFF << I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT) +#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK (0xFFFF << I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT) +#define I40E_VFPE_CQPTAIL(_VF) (0x00000400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPTAIL_MAX_INDEX 127 +#define I40E_VFPE_CQPTAIL_WQTAIL_SHIFT 0 +#define I40E_VFPE_CQPTAIL_WQTAIL_MASK (0x7FF << I40E_VFPE_CQPTAIL_WQTAIL_SHIFT) +#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31 +#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_MASK (0x1 << I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT) +#define I40E_VFPE_IPCONFIG0(_VF) (0x00001400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_IPCONFIG0_MAX_INDEX 127 +#define I40E_VFPE_IPCONFIG0_PEIPID_SHIFT 0 +#define I40E_VFPE_IPCONFIG0_PEIPID_MASK (0xFFFF << I40E_VFPE_IPCONFIG0_PEIPID_SHIFT) +#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16 +#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_MASK (0x1 << 
I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT) +#define I40E_VFPE_MRTEIDXMASK(_VF) (0x00003000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_MRTEIDXMASK_MAX_INDEX 127 +#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK (0x1F << I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT) +#define I40E_VFPE_RCVUNEXPECTEDERROR(_VF) (0x00003400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_RCVUNEXPECTEDERROR_MAX_INDEX 127 +#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK (0xFFFFFF << I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_VFPE_TCPNOWTIMER(_VF) (0x00002C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_TCPNOWTIMER_MAX_INDEX 127 +#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0 +#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_MASK (0xFFFFFFFF << I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT) +#define I40E_VFPE_WQEALLOC(_VF) (0x00138000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_WQEALLOC_MAX_INDEX 127 +#define I40E_VFPE_WQEALLOC_PEQPID_SHIFT 0 +#define I40E_VFPE_WQEALLOC_PEQPID_MASK (0x3FFFF << I40E_VFPE_WQEALLOC_PEQPID_SHIFT) +#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20 +#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_MASK (0xFFF << I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT) + +#define I40E_GLPE_CPUSTATUS0 0x0000D040 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT 0 +#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_MASK (0xFFFFFFFF << I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT) +#define I40E_GLPE_CPUSTATUS1 0x0000D044 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT 0 +#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_MASK (0xFFFFFFFF << I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT) +#define I40E_GLPE_CPUSTATUS2 0x0000D048 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT 0 +#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_MASK (0xFFFFFFFF << I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT) +#define I40E_GLPE_CPUTRIG0 0x0000D060 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT 0 +#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_MASK (0xFFFF << I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT) +#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT 17 +#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_MASK (0x1 << I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT) +#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT 18 +#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_MASK (0x1 << I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT) +#define I40E_GLPE_DUAL40_RUPM 0x0000DA04 /* Reset: PE_CORER */ +#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT 0 +#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_MASK (0x1 << I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT) +#define I40E_GLPE_PFAEQEDROPCNT(_i) (0x00131440 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFAEQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_MASK (0xFFFF << I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT) +#define I40E_GLPE_PFCEQEDROPCNT(_i) (0x001313C0 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFCEQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_MASK (0xFFFF << I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT) +#define I40E_GLPE_PFCQEDROPCNT(_i) (0x00131340 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFCQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT 0 +#define 
I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_MASK (0xFFFF << I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT) +#define I40E_GLPE_RUPM_CQPPOOL 0x0000DACC /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_MASK (0xFF << I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT) +#define I40E_GLPE_RUPM_FLRPOOL 0x0000DAC4 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_MASK (0xFF << I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT) +#define I40E_GLPE_RUPM_GCTL 0x0000DA00 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT 0 +#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_MASK (0xFF << I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT 26 +#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_MASK (0x1 << I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT 27 +#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_MASK (0x1 << I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT 28 +#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_MASK (0x1 << I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT 29 +#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_MASK (0x1 << I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT 30 +#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_MASK (0x1 << I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT 31 +#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_MASK (0x1 << I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT) +#define I40E_GLPE_RUPM_PTXPOOL 0x0000DAC8 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_MASK (0xFF << I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT) +#define I40E_GLPE_RUPM_PUSHPOOL 0x0000DAC0 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_MASK (0xFF << I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT) +#define I40E_GLPE_RUPM_TXHOST_EN 0x0000DA08 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT 0 +#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_MASK (0x1 << I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT) +#define I40E_GLPE_VFAEQEDROPCNT(_i) (0x00132540 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFAEQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_MASK (0xFFFF << I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT) +#define I40E_GLPE_VFCEQEDROPCNT(_i) (0x00132440 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFCEQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_MASK (0xFFFF << I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT) +#define I40E_GLPE_VFCQEDROPCNT(_i) (0x00132340 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFCQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_MASK (0xFFFF << I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT) +#define I40E_GLPE_VFFLMOBJCTRL(_i) (0x0000D400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMOBJCTRL_MAX_INDEX 31 +#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT 0 +#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_MASK (0x7 << I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT) +#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT 8 +#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_MASK (0x7 << I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT) +#define 
I40E_GLPE_VFFLMQ1ALLOCERR(_i) (0x0000C700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMQ1ALLOCERR_MAX_INDEX 31 +#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_MASK (0xFFFF << I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_GLPE_VFFLMXMITALLOCERR(_i) (0x0000C600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMXMITALLOCERR_MAX_INDEX 31 +#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_MASK (0xFFFF << I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_GLPE_VFUDACTRL(_i) (0x0000C000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFUDACTRL_MAX_INDEX 31 +#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT 0 +#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_MASK (0x1 << I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT 1 +#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_MASK (0x1 << I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT 2 +#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_MASK (0x1 << I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT 3 +#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_MASK (0x1 << I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT 4 +#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_MASK (0x1 << I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT) +#define I40E_GLPE_VFUDAUCFBQPN(_i) (0x0000C100 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFUDAUCFBQPN_MAX_INDEX 31 +#define I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT 0 +#define I40E_GLPE_VFUDAUCFBQPN_QPN_MASK (0x3FFFF << I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT) +#define I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT 31 +#define I40E_GLPE_VFUDAUCFBQPN_VALID_MASK (0x1 << I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT) + +#define I40E_GLPES_PFIP4RXDISCARD(_i) (0x00010600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXDISCARD_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0 +#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT) +#define I40E_GLPES_PFIP4RXFRAGSHI(_i) (0x00010804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK (0xFFFF << I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP4RXFRAGSLO(_i) (0x00010800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP4RXMCOCTSHI(_i) (0x00010A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXMCOCTSLO(_i) (0x00010A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK (0xFFFFFFFF << 
I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXMCPKTSHI(_i) (0x00010C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXMCPKTSLO(_i) (0x00010C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXOCTSHI(_i) (0x00010204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXOCTSLO(_i) (0x00010200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXPKTSHI(_i) (0x00010404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXPKTSLO(_i) (0x00010400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXTRUNC(_i) (0x00010700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXTRUNC_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0 +#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT) +#define I40E_GLPES_PFIP4TXFRAGSHI(_i) (0x00011E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK (0xFFFF << I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP4TXFRAGSLO(_i) (0x00011E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP4TXMCOCTSHI(_i) (0x00012004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXMCOCTSLO(_i) (0x00012000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT) +#define 
I40E_GLPES_PFIP4TXMCPKTSHI(_i) (0x00012204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXMCPKTSLO(_i) (0x00012200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXNOROUTE(_i) (0x00012E00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXNOROUTE_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0 +#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_MASK (0xFFFFFF << I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT) +#define I40E_GLPES_PFIP4TXOCTSHI(_i) (0x00011A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXOCTSLO(_i) (0x00011A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXPKTSHI(_i) (0x00011C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXPKTSLO(_i) (0x00011C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXDISCARD(_i) (0x00011200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXDISCARD_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0 +#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT) +#define I40E_GLPES_PFIP6RXFRAGSHI(_i) (0x00011404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK (0xFFFF << I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP6RXFRAGSLO(_i) (0x00011400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP6RXMCOCTSHI(_i) (0x00011604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXMCOCTSLO(_i) (0x00011600 + ((_i) * 
8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXMCPKTSHI(_i) (0x00011804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXMCPKTSLO(_i) (0x00011800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXOCTSHI(_i) (0x00010E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXOCTSLO(_i) (0x00010E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXPKTSHI(_i) (0x00011004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXPKTSLO(_i) (0x00011000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXTRUNC(_i) (0x00011300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXTRUNC_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0 +#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT) +#define I40E_GLPES_PFIP6TXFRAGSHI(_i) (0x00012804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK (0xFFFF << I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP6TXFRAGSLO(_i) (0x00012800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP6TXMCOCTSHI(_i) (0x00012A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXMCOCTSLO(_i) (0x00012A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define 
I40E_GLPES_PFIP6TXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXMCPKTSHI(_i) (0x00012C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXMCPKTSLO(_i) (0x00012C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXNOROUTE(_i) (0x00012F00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXNOROUTE_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0 +#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_MASK (0xFFFFFF << I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT) +#define I40E_GLPES_PFIP6TXOCTSHI(_i) (0x00012404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXOCTSLO(_i) (0x00012400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXPKTSHI(_i) (0x00012604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXPKTSLO(_i) (0x00012600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT) +#define I40E_GLPES_PFRDMARXRDSHI(_i) (0x00013E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXRDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_MASK (0xFFFF << I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_PFRDMARXRDSLO(_i) (0x00013E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXRDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_PFRDMARXSNDSHI(_i) (0x00014004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXSNDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_MASK (0xFFFF << I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_PFRDMARXSNDSLO(_i) (0x00014000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXSNDSLO_MAX_INDEX 15 +#define 
I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_PFRDMARXWRSHI(_i) (0x00013C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXWRSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_MASK (0xFFFF << I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_PFRDMARXWRSLO(_i) (0x00013C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXWRSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_PFRDMATXRDSHI(_i) (0x00014404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXRDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_MASK (0xFFFF << I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_PFRDMATXRDSLO(_i) (0x00014400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXRDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_PFRDMATXSNDSHI(_i) (0x00014604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXSNDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_MASK (0xFFFF << I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_PFRDMATXSNDSLO(_i) (0x00014600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXSNDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_PFRDMATXWRSHI(_i) (0x00014204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXWRSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_MASK (0xFFFF << I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_PFRDMATXWRSLO(_i) (0x00014200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXWRSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_PFRDMAVBNDHI(_i) (0x00014804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVBNDHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0 +#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT) +#define I40E_GLPES_PFRDMAVBNDLO(_i) (0x00014800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVBNDLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0 +#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT) +#define I40E_GLPES_PFRDMAVINVHI(_i) (0x00014A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVINVHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT 0 +#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_MASK (0xFFFFFFFF << 
I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT) +#define I40E_GLPES_PFRDMAVINVLO(_i) (0x00014A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVINVLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT 0 +#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT) +#define I40E_GLPES_PFRXVLANERR(_i) (0x00010000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRXVLANERR_MAX_INDEX 15 +#define I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT 0 +#define I40E_GLPES_PFRXVLANERR_RXVLANERR_MASK (0xFFFFFF << I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT) +#define I40E_GLPES_PFTCPRTXSEG(_i) (0x00013600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRTXSEG_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT 0 +#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_MASK (0xFFFFFFFF << I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT) +#define I40E_GLPES_PFTCPRXOPTERR(_i) (0x00013200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXOPTERR_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0 +#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_MASK (0xFFFFFF << I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT) +#define I40E_GLPES_PFTCPRXPROTOERR(_i) (0x00013300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXPROTOERR_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0 +#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_MASK (0xFFFFFF << I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT) +#define I40E_GLPES_PFTCPRXSEGSHI(_i) (0x00013004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXSEGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0 +#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_MASK (0xFFFF << I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT) +#define I40E_GLPES_PFTCPRXSEGSLO(_i) (0x00013000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXSEGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0 +#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT) +#define I40E_GLPES_PFTCPTXSEGHI(_i) (0x00013404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPTXSEGHI_MAX_INDEX 15 +#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0 +#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_MASK (0xFFFF << I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT) +#define I40E_GLPES_PFTCPTXSEGLO(_i) (0x00013400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPTXSEGLO_MAX_INDEX 15 +#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0 +#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_MASK (0xFFFFFFFF << I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT) +#define I40E_GLPES_PFUDPRXPKTSHI(_i) (0x00013804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPRXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT) +#define I40E_GLPES_PFUDPRXPKTSLO(_i) (0x00013800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPRXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT) +#define I40E_GLPES_PFUDPTXPKTSHI(_i) (0x00013A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define 
I40E_GLPES_PFUDPTXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT) +#define I40E_GLPES_PFUDPTXPKTSLO(_i) (0x00013A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPTXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT) +#define I40E_GLPES_RDMARXMULTFPDUSHI 0x0001E014 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT 0 +#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_MASK (0xFFFFFF << I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT) +#define I40E_GLPES_RDMARXMULTFPDUSLO 0x0001E010 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT 0 +#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_MASK (0xFFFFFFFF << I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT) +#define I40E_GLPES_RDMARXOOODDPHI 0x0001E01C /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT 0 +#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_MASK (0xFFFFFF << I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT) +#define I40E_GLPES_RDMARXOOODDPLO 0x0001E018 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT 0 +#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_MASK (0xFFFFFFFF << I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT) +#define I40E_GLPES_RDMARXOOONOMARK 0x0001E004 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT 0 +#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_MASK (0xFFFFFFFF << I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT) +#define I40E_GLPES_RDMARXUNALIGN 0x0001E000 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT 0 +#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_MASK (0xFFFFFFFF << I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT) +#define I40E_GLPES_TCPRXFOURHOLEHI 0x0001E044 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_MASK (0xFFFFFF << I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXFOURHOLELO 0x0001E040 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_MASK (0xFFFFFFFF << I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT) +#define I40E_GLPES_TCPRXONEHOLEHI 0x0001E02C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_MASK (0xFFFFFF << I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXONEHOLELO 0x0001E028 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_MASK (0xFFFFFFFF << I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT) +#define I40E_GLPES_TCPRXPUREACKHI 0x0001E024 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT 0 +#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_MASK (0xFFFFFF << I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT) +#define I40E_GLPES_TCPRXPUREACKSLO 0x0001E020 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT 0 +#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_MASK (0xFFFFFFFF << I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT) +#define I40E_GLPES_TCPRXTHREEHOLEHI 0x0001E03C /* Reset: PE_CORER 
*/ +#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_MASK (0xFFFFFF << I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXTHREEHOLELO 0x0001E038 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_MASK (0xFFFFFFFF << I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT) +#define I40E_GLPES_TCPRXTWOHOLEHI 0x0001E034 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_MASK (0xFFFFFF << I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXTWOHOLELO 0x0001E030 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_MASK (0xFFFFFFFF << I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT) +#define I40E_GLPES_TCPTXRETRANSFASTHI 0x0001E04C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT 0 +#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_MASK (0xFFFFFF << I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT) +#define I40E_GLPES_TCPTXRETRANSFASTLO 0x0001E048 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT 0 +#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_MASK (0xFFFFFFFF << I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT) +#define I40E_GLPES_TCPTXTOUTSFASTHI 0x0001E054 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_MASK (0xFFFFFF << I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT) +#define I40E_GLPES_TCPTXTOUTSFASTLO 0x0001E050 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_MASK (0xFFFFFFFF << I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT) +#define I40E_GLPES_TCPTXTOUTSHI 0x0001E05C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_MASK (0xFFFFFF << I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT) +#define I40E_GLPES_TCPTXTOUTSLO 0x0001E058 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_MASK (0xFFFFFFFF << I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXDISCARD(_i) (0x00018600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXDISCARD_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0 +#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT) +#define I40E_GLPES_VFIP4RXFRAGSHI(_i) (0x00018804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK (0xFFFF << I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP4RXFRAGSLO(_i) (0x00018800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP4RXMCOCTSHI(_i) (0x00018A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define 
I40E_GLPES_VFIP4RXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXMCOCTSLO(_i) (0x00018A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXMCPKTSHI(_i) (0x00018C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXMCPKTSLO(_i) (0x00018C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXOCTSHI(_i) (0x00018204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXOCTSLO(_i) (0x00018200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXPKTSHI(_i) (0x00018404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXPKTSLO(_i) (0x00018400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXTRUNC(_i) (0x00018700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXTRUNC_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0 +#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT) +#define I40E_GLPES_VFIP4TXFRAGSHI(_i) (0x00019E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK (0xFFFF << I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP4TXFRAGSLO(_i) (0x00019E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP4TXMCOCTSHI(_i) (0x0001A004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCOCTSHI_MAX_INDEX 31 +#define 
I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXMCOCTSLO(_i) (0x0001A000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXMCPKTSHI(_i) (0x0001A204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXMCPKTSLO(_i) (0x0001A200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXNOROUTE(_i) (0x0001AE00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXNOROUTE_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0 +#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_MASK (0xFFFFFF << I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT) +#define I40E_GLPES_VFIP4TXOCTSHI(_i) (0x00019A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXOCTSLO(_i) (0x00019A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXPKTSHI(_i) (0x00019C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXPKTSLO(_i) (0x00019C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXDISCARD(_i) (0x00019200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXDISCARD_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0 +#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT) +#define I40E_GLPES_VFIP6RXFRAGSHI(_i) (0x00019404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK (0xFFFF << I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP6RXFRAGSLO(_i) (0x00019400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0 
+#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP6RXMCOCTSHI(_i) (0x00019604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXMCOCTSLO(_i) (0x00019600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXMCPKTSHI(_i) (0x00019804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXMCPKTSLO(_i) (0x00019800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXOCTSHI(_i) (0x00018E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXOCTSLO(_i) (0x00018E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXPKTSHI(_i) (0x00019004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXPKTSLO(_i) (0x00019000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXTRUNC(_i) (0x00019300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXTRUNC_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0 +#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT) +#define I40E_GLPES_VFIP6TXFRAGSHI(_i) (0x0001A804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK (0xFFFF << I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP6TXFRAGSLO(_i) (0x0001A800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK 
(0xFFFFFFFF << I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP6TXMCOCTSHI(_i) (0x0001AA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXMCOCTSLO(_i) (0x0001AA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXMCPKTSHI(_i) (0x0001AC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXMCPKTSLO(_i) (0x0001AC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXNOROUTE(_i) (0x0001AF00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXNOROUTE_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0 +#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_MASK (0xFFFFFF << I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT) +#define I40E_GLPES_VFIP6TXOCTSHI(_i) (0x0001A404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXOCTSLO(_i) (0x0001A400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXPKTSHI(_i) (0x0001A604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXPKTSLO(_i) (0x0001A600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT) +#define I40E_GLPES_VFRDMARXRDSHI(_i) (0x0001BE04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXRDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_MASK (0xFFFF << I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_VFRDMARXRDSLO(_i) (0x0001BE00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXRDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT) 
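+
+/*
+ * Usage sketch for the GLPES VF statistics above: each counter is exposed
+ * as a LO/HI register pair (lower 32 bits in the LO register, upper bits in
+ * the HI register).  The helper below is hypothetical -- its name, and the
+ * use of readl() from <linux/io.h> on the i40iw_hw::hw_addr mapping, are
+ * assumptions for illustration only -- and shows how the SHIFT/MASK macros
+ * would typically be combined to assemble one 64-bit per-VF counter value:
+ *
+ *	static u64 i40iw_read_vf_rdmarxrds(struct i40iw_hw *hw, u32 vf_id)
+ *	{
+ *		u32 lo = readl(hw->hw_addr + I40E_GLPES_VFRDMARXRDSLO(vf_id));
+ *		u32 hi = readl(hw->hw_addr + I40E_GLPES_VFRDMARXRDSHI(vf_id));
+ *
+ *		lo = (lo & I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_MASK) >>
+ *		     I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT;
+ *		hi = (hi & I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_MASK) >>
+ *		     I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT;
+ *		return ((u64)hi << 32) | lo;
+ *	}
+ */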
+#define I40E_GLPES_VFRDMARXSNDSHI(_i) (0x0001C004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXSNDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_MASK (0xFFFF << I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_VFRDMARXSNDSLO(_i) (0x0001C000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXSNDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_VFRDMARXWRSHI(_i) (0x0001BC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXWRSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_MASK (0xFFFF << I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_VFRDMARXWRSLO(_i) (0x0001BC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXWRSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_VFRDMATXRDSHI(_i) (0x0001C404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXRDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_MASK (0xFFFF << I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_VFRDMATXRDSLO(_i) (0x0001C400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXRDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_VFRDMATXSNDSHI(_i) (0x0001C604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXSNDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_MASK (0xFFFF << I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_VFRDMATXSNDSLO(_i) (0x0001C600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXSNDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_VFRDMATXWRSHI(_i) (0x0001C204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXWRSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_MASK (0xFFFF << I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_VFRDMATXWRSLO(_i) (0x0001C200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXWRSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_VFRDMAVBNDHI(_i) (0x0001C804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVBNDHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0 +#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT) +#define I40E_GLPES_VFRDMAVBNDLO(_i) (0x0001C800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ 
+#define I40E_GLPES_VFRDMAVBNDLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0 +#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT) +#define I40E_GLPES_VFRDMAVINVHI(_i) (0x0001CA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVINVHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT 0 +#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT) +#define I40E_GLPES_VFRDMAVINVLO(_i) (0x0001CA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVINVLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT 0 +#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT) +#define I40E_GLPES_VFRXVLANERR(_i) (0x00018000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRXVLANERR_MAX_INDEX 31 +#define I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT 0 +#define I40E_GLPES_VFRXVLANERR_RXVLANERR_MASK (0xFFFFFF << I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT) +#define I40E_GLPES_VFTCPRTXSEG(_i) (0x0001B600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRTXSEG_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT 0 +#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_MASK (0xFFFFFFFF << I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT) +#define I40E_GLPES_VFTCPRXOPTERR(_i) (0x0001B200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXOPTERR_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0 +#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_MASK (0xFFFFFF << I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT) +#define I40E_GLPES_VFTCPRXPROTOERR(_i) (0x0001B300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXPROTOERR_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0 +#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_MASK (0xFFFFFF << I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT) +#define I40E_GLPES_VFTCPRXSEGSHI(_i) (0x0001B004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXSEGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0 +#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_MASK (0xFFFF << I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT) +#define I40E_GLPES_VFTCPRXSEGSLO(_i) (0x0001B000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXSEGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0 +#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT) +#define I40E_GLPES_VFTCPTXSEGHI(_i) (0x0001B404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPTXSEGHI_MAX_INDEX 31 +#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0 +#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_MASK (0xFFFF << I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT) +#define I40E_GLPES_VFTCPTXSEGLO(_i) (0x0001B400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPTXSEGLO_MAX_INDEX 31 +#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0 +#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_MASK (0xFFFFFFFF << I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT) +#define I40E_GLPES_VFUDPRXPKTSHI(_i) (0x0001B804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPRXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_MASK (0xFFFF << 
I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT) +#define I40E_GLPES_VFUDPRXPKTSLO(_i) (0x0001B800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPRXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT) +#define I40E_GLPES_VFUDPTXPKTSHI(_i) (0x0001BA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPTXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT) +#define I40E_GLPES_VFUDPTXPKTSLO(_i) (0x0001BA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPTXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT) + +#define I40E_VFPE_AEQALLOC1 0x0000A400 /* Reset: VFR */ +#define I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT 0 +#define I40E_VFPE_AEQALLOC1_AECOUNT_MASK (0xFFFFFFFF << I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT) +#define I40E_VFPE_CCQPHIGH1 0x00009800 /* Reset: VFR */ +#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT 0 +#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_MASK (0xFFFFFFFF << I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT) +#define I40E_VFPE_CCQPLOW1 0x0000AC00 /* Reset: VFR */ +#define I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT 0 +#define I40E_VFPE_CCQPLOW1_PECCQPLOW_MASK (0xFFFFFFFF << I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT) +#define I40E_VFPE_CCQPSTATUS1 0x0000B800 /* Reset: VFR */ +#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT 0 +#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_MASK (0x1 << I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT 4 +#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_MASK (0x7 << I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT 16 +#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_MASK (0x3F << I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT 31 +#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_MASK (0x1 << I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT) +#define I40E_VFPE_CQACK1 0x0000B000 /* Reset: VFR */ +#define I40E_VFPE_CQACK1_PECQID_SHIFT 0 +#define I40E_VFPE_CQACK1_PECQID_MASK (0x1FFFF << I40E_VFPE_CQACK1_PECQID_SHIFT) +#define I40E_VFPE_CQARM1 0x0000B400 /* Reset: VFR */ +#define I40E_VFPE_CQARM1_PECQID_SHIFT 0 +#define I40E_VFPE_CQARM1_PECQID_MASK (0x1FFFF << I40E_VFPE_CQARM1_PECQID_SHIFT) +#define I40E_VFPE_CQPDB1 0x0000BC00 /* Reset: VFR */ +#define I40E_VFPE_CQPDB1_WQHEAD_SHIFT 0 +#define I40E_VFPE_CQPDB1_WQHEAD_MASK (0x7FF << I40E_VFPE_CQPDB1_WQHEAD_SHIFT) +#define I40E_VFPE_CQPERRCODES1 0x00009C00 /* Reset: VFR */ +#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT 0 +#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_MASK (0xFFFF << I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT) +#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_MASK (0xFFFF << I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT) +#define I40E_VFPE_CQPTAIL1 0x0000A000 /* Reset: VFR */ +#define I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT 0 +#define I40E_VFPE_CQPTAIL1_WQTAIL_MASK (0x7FF << I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT) +#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT 31 +#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_MASK (0x1 << I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT) +#define I40E_VFPE_IPCONFIG01 0x00008C00 /* Reset: VFR */ +#define I40E_VFPE_IPCONFIG01_PEIPID_SHIFT 0 
+#define I40E_VFPE_IPCONFIG01_PEIPID_MASK (0xFFFF << I40E_VFPE_IPCONFIG01_PEIPID_SHIFT) +#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT 16 +#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_MASK (0x1 << I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT) +#define I40E_VFPE_MRTEIDXMASK1 0x00009000 /* Reset: VFR */ +#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_MASK (0x1F << I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT) +#define I40E_VFPE_RCVUNEXPECTEDERROR1 0x00009400 /* Reset: VFR */ +#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_MASK (0xFFFFFF << I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_VFPE_TCPNOWTIMER1 0x0000A800 /* Reset: VFR */ +#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT 0 +#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_MASK (0xFFFFFFFF << I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT) +#define I40E_VFPE_WQEALLOC1 0x0000C000 /* Reset: VFR */ +#define I40E_VFPE_WQEALLOC1_PEQPID_SHIFT 0 +#define I40E_VFPE_WQEALLOC1_PEQPID_MASK (0x3FFFF << I40E_VFPE_WQEALLOC1_PEQPID_SHIFT) +#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT 20 +#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_MASK (0xFFF << I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT) +#endif /* I40IW_REGISTER_H */ diff --git a/drivers/infiniband/hw/i40iw/i40iw_status.h b/drivers/infiniband/hw/i40iw/i40iw_status.h new file mode 100644 index 0000000..f7013f1 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_status.h @@ -0,0 +1,101 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +*******************************************************************************/ + +#ifndef I40IW_STATUS_H +#define I40IW_STATUS_H + +/* Error Codes */ +enum i40iw_status_code { + I40IW_SUCCESS = 0, + I40IW_ERR_NVM = -1, + I40IW_ERR_NVM_CHECKSUM = -2, + I40IW_ERR_CONFIG = -4, + I40IW_ERR_PARAM = -5, + I40IW_ERR_DEVICE_NOT_SUPPORTED = -6, + I40IW_ERR_RESET_FAILED = -7, + I40IW_ERR_SWFW_SYNC = -8, + I40IW_ERR_NO_MEMORY = -9, + I40IW_ERR_BAD_PTR = -10, + I40IW_ERR_INVALID_PD_ID = -11, + I40IW_ERR_INVALID_QP_ID = -12, + I40IW_ERR_INVALID_CQ_ID = -13, + I40IW_ERR_INVALID_CEQ_ID = -14, + I40IW_ERR_INVALID_AEQ_ID = -15, + I40IW_ERR_INVALID_SIZE = -16, + I40IW_ERR_INVALID_ARP_INDEX = -17, + I40IW_ERR_INVALID_FPM_FUNC_ID = -18, + I40IW_ERR_QP_INVALID_MSG_SIZE = -19, + I40IW_ERR_QP_TOOMANY_WRS_POSTED = -20, + I40IW_ERR_INVALID_FRAG_COUNT = -21, + I40IW_ERR_QUEUE_EMPTY = -22, + I40IW_ERR_INVALID_ALIGNMENT = -23, + I40IW_ERR_FLUSHED_QUEUE = -24, + I40IW_ERR_INVALID_PUSH_PAGE_INDEX = -25, + I40IW_ERR_INVALID_INLINE_DATA_SIZE = -26, + I40IW_ERR_TIMEOUT = -27, + I40IW_ERR_OPCODE_MISMATCH = -28, + I40IW_ERR_CQP_COMPL_ERROR = -29, + I40IW_ERR_INVALID_VF_ID = -30, + I40IW_ERR_INVALID_HMCFN_ID = -31, + I40IW_ERR_BACKING_PAGE_ERROR = -32, + I40IW_ERR_NO_PBLCHUNKS_AVAILABLE = -33, + I40IW_ERR_INVALID_PBLE_INDEX = -34, + I40IW_ERR_INVALID_SD_INDEX = -35, + I40IW_ERR_INVALID_PAGE_DESC_INDEX = -36, + I40IW_ERR_INVALID_SD_TYPE = -37, + I40IW_ERR_MEMCPY_FAILED = -38, + I40IW_ERR_INVALID_HMC_OBJ_INDEX = -39, + I40IW_ERR_INVALID_HMC_OBJ_COUNT = -40, + I40IW_ERR_INVALID_SRQ_ARM_LIMIT = -41, + I40IW_ERR_SRQ_ENABLED = -42, + I40IW_ERR_BUF_TOO_SHORT = -43, + I40IW_ERR_BAD_IWARP_CQE = -44, + I40IW_ERR_NVM_BLANK_MODE = -45, + I40IW_ERR_NOT_IMPLEMENTED = -46, + I40IW_ERR_PE_DOORBELL_NOT_ENABLED = -47, + I40IW_ERR_NOT_READY = -48, + I40IW_NOT_SUPPORTED = -49, + I40IW_ERR_FIRMWARE_API_VERSION = -50, + I40IW_ERR_RING_FULL = -51, + I40IW_ERR_MPA_CRC = -61, + I40IW_ERR_NO_TXBUFS = -62, + I40IW_ERR_SEQ_NUM = -63, + I40IW_ERR_list_empty = -64, + I40IW_ERR_INVALID_MAC_ADDR = -65, + I40IW_ERR_BAD_STAG = -66, + I40IW_ERR_CQ_COMPL_ERROR = -67, + I40IW_ERR_QUEUE_DESTROYED = -68 + +}; +#endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h new file mode 100644 index 0000000..adc8d2e --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_type.h @@ -0,0 +1,1363 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_TYPE_H +#define I40IW_TYPE_H +#include "i40iw_user.h" +#include "i40iw_hmc.h" +#include "i40iw_vf.h" +#include "i40iw_virtchnl.h" + +struct i40iw_cqp_sq_wqe { + u64 buf[I40IW_CQP_WQE_SIZE]; +}; + +struct i40iw_sc_aeqe { + u64 buf[I40IW_AEQE_SIZE]; +}; + +struct i40iw_ceqe { + u64 buf[I40IW_CEQE_SIZE]; +}; + +struct i40iw_cqp_ctx { + u64 buf[I40IW_CQP_CTX_SIZE]; +}; + +struct i40iw_cq_shadow_area { + u64 buf[I40IW_SHADOW_AREA_SIZE]; +}; + +struct i40iw_sc_dev; +struct i40iw_hmc_info; +struct i40iw_vsi_pestat; + +struct i40iw_cqp_ops; +struct i40iw_ccq_ops; +struct i40iw_ceq_ops; +struct i40iw_aeq_ops; +struct i40iw_mr_ops; +struct i40iw_cqp_misc_ops; +struct i40iw_pd_ops; +struct i40iw_priv_qp_ops; +struct i40iw_priv_cq_ops; +struct i40iw_hmc_ops; + +enum i40iw_page_size { + I40IW_PAGE_SIZE_4K, + I40IW_PAGE_SIZE_2M +}; + +enum i40iw_resource_indicator_type { + I40IW_RSRC_INDICATOR_TYPE_ADAPTER = 0, + I40IW_RSRC_INDICATOR_TYPE_CQ, + I40IW_RSRC_INDICATOR_TYPE_QP, + I40IW_RSRC_INDICATOR_TYPE_SRQ +}; + +enum i40iw_hdrct_flags { + DDP_LEN_FLAG = 0x80, + DDP_HDR_FLAG = 0x40, + RDMA_HDR_FLAG = 0x20 +}; + +enum i40iw_term_layers { + LAYER_RDMA = 0, + LAYER_DDP = 1, + LAYER_MPA = 2 +}; + +enum i40iw_term_error_types { + RDMAP_REMOTE_PROT = 1, + RDMAP_REMOTE_OP = 2, + DDP_CATASTROPHIC = 0, + DDP_TAGGED_BUFFER = 1, + DDP_UNTAGGED_BUFFER = 2, + DDP_LLP = 3 +}; + +enum i40iw_term_rdma_errors { + RDMAP_INV_STAG = 0x00, + RDMAP_INV_BOUNDS = 0x01, + RDMAP_ACCESS = 0x02, + RDMAP_UNASSOC_STAG = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_RDMAP_VER = 0x05, + RDMAP_UNEXPECTED_OP = 0x06, + RDMAP_CATASTROPHIC_LOCAL = 0x07, + RDMAP_CATASTROPHIC_GLOBAL = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum i40iw_term_ddp_errors { + DDP_CATASTROPHIC_LOCAL = 0x00, + DDP_TAGGED_INV_STAG = 0x00, + DDP_TAGGED_BOUNDS = 0x01, + DDP_TAGGED_UNASSOC_STAG = 0x02, + DDP_TAGGED_TO_WRAP = 0x03, + DDP_TAGGED_INV_DDP_VER = 0x04, + DDP_UNTAGGED_INV_QN = 0x01, + DDP_UNTAGGED_INV_MSN_NO_BUF = 0x02, + DDP_UNTAGGED_INV_MSN_RANGE = 0x03, + DDP_UNTAGGED_INV_MO = 0x04, + DDP_UNTAGGED_INV_TOO_LONG = 0x05, + DDP_UNTAGGED_INV_DDP_VER = 0x06 +}; + +enum i40iw_term_mpa_errors { + MPA_CLOSED = 0x01, + MPA_CRC = 0x02, + MPA_MARKER = 0x03, + MPA_REQ_RSP = 0x04, +}; + +enum i40iw_flush_opcode { + FLUSH_INVALID = 0, + FLUSH_PROT_ERR, + FLUSH_REM_ACCESS_ERR, + FLUSH_LOC_QP_OP_ERR, + FLUSH_REM_OP_ERR, + FLUSH_LOC_LEN_ERR, + FLUSH_GENERAL_ERR, + FLUSH_FATAL_ERR +}; + +enum i40iw_term_eventtypes { + TERM_EVENT_QP_FATAL, + TERM_EVENT_QP_ACCESS_ERR +}; + +struct i40iw_terminate_hdr { + u8 layer_etype; + u8 error_code; + u8 hdrct; + u8 rsvd; +}; + +enum i40iw_debug_flag { + I40IW_DEBUG_NONE = 0x00000000, + I40IW_DEBUG_ERR = 0x00000001, + I40IW_DEBUG_INIT = 0x00000002, + I40IW_DEBUG_DEV = 0x00000004, + I40IW_DEBUG_CM = 0x00000008, + I40IW_DEBUG_VERBS = 0x00000010, + I40IW_DEBUG_PUDA = 0x00000020, + I40IW_DEBUG_ILQ = 0x00000040, + I40IW_DEBUG_IEQ = 
0x00000080, + I40IW_DEBUG_QP = 0x00000100, + I40IW_DEBUG_CQ = 0x00000200, + I40IW_DEBUG_MR = 0x00000400, + I40IW_DEBUG_PBLE = 0x00000800, + I40IW_DEBUG_WQE = 0x00001000, + I40IW_DEBUG_AEQ = 0x00002000, + I40IW_DEBUG_CQP = 0x00004000, + I40IW_DEBUG_HMC = 0x00008000, + I40IW_DEBUG_USER = 0x00010000, + I40IW_DEBUG_VIRT = 0x00020000, + I40IW_DEBUG_DCB = 0x00040000, + I40IW_DEBUG_CQE = 0x00800000, + I40IW_DEBUG_ALL = 0xFFFFFFFF +}; + +enum i40iw_hw_stats_index_32b { + I40IW_HW_STAT_INDEX_IP4RXDISCARD = 0, + I40IW_HW_STAT_INDEX_IP4RXTRUNC, + I40IW_HW_STAT_INDEX_IP4TXNOROUTE, + I40IW_HW_STAT_INDEX_IP6RXDISCARD, + I40IW_HW_STAT_INDEX_IP6RXTRUNC, + I40IW_HW_STAT_INDEX_IP6TXNOROUTE, + I40IW_HW_STAT_INDEX_TCPRTXSEG, + I40IW_HW_STAT_INDEX_TCPRXOPTERR, + I40IW_HW_STAT_INDEX_TCPRXPROTOERR, + I40IW_HW_STAT_INDEX_MAX_32 +}; + +enum i40iw_hw_stats_index_64b { + I40IW_HW_STAT_INDEX_IP4RXOCTS = 0, + I40IW_HW_STAT_INDEX_IP4RXPKTS, + I40IW_HW_STAT_INDEX_IP4RXFRAGS, + I40IW_HW_STAT_INDEX_IP4RXMCPKTS, + I40IW_HW_STAT_INDEX_IP4TXOCTS, + I40IW_HW_STAT_INDEX_IP4TXPKTS, + I40IW_HW_STAT_INDEX_IP4TXFRAGS, + I40IW_HW_STAT_INDEX_IP4TXMCPKTS, + I40IW_HW_STAT_INDEX_IP6RXOCTS, + I40IW_HW_STAT_INDEX_IP6RXPKTS, + I40IW_HW_STAT_INDEX_IP6RXFRAGS, + I40IW_HW_STAT_INDEX_IP6RXMCPKTS, + I40IW_HW_STAT_INDEX_IP6TXOCTS, + I40IW_HW_STAT_INDEX_IP6TXPKTS, + I40IW_HW_STAT_INDEX_IP6TXFRAGS, + I40IW_HW_STAT_INDEX_IP6TXMCPKTS, + I40IW_HW_STAT_INDEX_TCPRXSEGS, + I40IW_HW_STAT_INDEX_TCPTXSEG, + I40IW_HW_STAT_INDEX_RDMARXRDS, + I40IW_HW_STAT_INDEX_RDMARXSNDS, + I40IW_HW_STAT_INDEX_RDMARXWRS, + I40IW_HW_STAT_INDEX_RDMATXRDS, + I40IW_HW_STAT_INDEX_RDMATXSNDS, + I40IW_HW_STAT_INDEX_RDMATXWRS, + I40IW_HW_STAT_INDEX_RDMAVBND, + I40IW_HW_STAT_INDEX_RDMAVINV, + I40IW_HW_STAT_INDEX_MAX_64 +}; + +struct i40iw_dev_hw_stats_offsets { + u32 stats_offset_32[I40IW_HW_STAT_INDEX_MAX_32]; + u32 stats_offset_64[I40IW_HW_STAT_INDEX_MAX_64]; +}; + +struct i40iw_dev_hw_stats { + u64 stats_value_32[I40IW_HW_STAT_INDEX_MAX_32]; + u64 stats_value_64[I40IW_HW_STAT_INDEX_MAX_64]; +}; + +struct i40iw_vsi_pestat { + struct i40iw_hw *hw; + struct i40iw_dev_hw_stats hw_stats; + struct i40iw_dev_hw_stats last_read_hw_stats; + struct i40iw_dev_hw_stats_offsets hw_stats_offsets; + struct timer_list stats_timer; + struct i40iw_sc_vsi *vsi; + spinlock_t lock; /* rdma stats lock */ +}; + +struct i40iw_hw { + u8 __iomem *hw_addr; + void *dev_context; + struct i40iw_hmc_info hmc; +}; + +struct i40iw_pfpdu { + struct list_head rxlist; + u32 rcv_nxt; + u32 fps; + u32 max_fpdu_data; + bool mode; + bool mpa_crc_err; + u64 total_ieq_bufs; + u64 fpdu_processed; + u64 bad_seq_num; + u64 crc_err; + u64 no_tx_bufs; + u64 tx_err; + u64 out_of_order; + u64 pmode_count; +}; + +struct i40iw_sc_pd { + u32 size; + struct i40iw_sc_dev *dev; + u16 pd_id; + int abi_ver; +}; + +struct i40iw_cqp_quanta { + u64 elem[I40IW_CQP_WQE_SIZE]; +}; + +struct i40iw_sc_cqp { + u32 size; + u64 sq_pa; + u64 host_ctx_pa; + void *back_cqp; + struct i40iw_sc_dev *dev; + enum i40iw_status_code (*process_cqp_sds)(struct i40iw_sc_dev *, + struct i40iw_update_sds_info *); + struct i40iw_dma_mem sdbuf; + struct i40iw_ring sq_ring; + struct i40iw_cqp_quanta *sq_base; + u64 *host_ctx; + u64 *scratch_array; + u32 cqp_id; + u32 sq_size; + u32 hw_sq_size; + u8 struct_ver; + u8 polarity; + bool en_datacenter_tcp; + u8 hmc_profile; + u8 enabled_vf_count; + u8 timeout_count; +}; + +struct i40iw_sc_aeq { + u32 size; + u64 aeq_elem_pa; + struct i40iw_sc_dev *dev; + struct i40iw_sc_aeqe *aeqe_base; + void *pbl_list; + u32 
elem_cnt; + struct i40iw_ring aeq_ring; + bool virtual_map; + u8 pbl_chunk_size; + u32 first_pm_pbl_idx; + u8 polarity; +}; + +struct i40iw_sc_ceq { + u32 size; + u64 ceq_elem_pa; + struct i40iw_sc_dev *dev; + struct i40iw_ceqe *ceqe_base; + void *pbl_list; + u32 ceq_id; + u32 elem_cnt; + struct i40iw_ring ceq_ring; + bool virtual_map; + u8 pbl_chunk_size; + bool tph_en; + u8 tph_val; + u32 first_pm_pbl_idx; + u8 polarity; +}; + +struct i40iw_sc_cq { + struct i40iw_cq_uk cq_uk; + u64 cq_pa; + u64 shadow_area_pa; + struct i40iw_sc_dev *dev; + struct i40iw_sc_vsi *vsi; + void *pbl_list; + void *back_cq; + u32 ceq_id; + u32 shadow_read_threshold; + bool ceqe_mask; + bool virtual_map; + u8 pbl_chunk_size; + u8 cq_type; + bool ceq_id_valid; + bool tph_en; + u8 tph_val; + u32 first_pm_pbl_idx; + bool check_overflow; +}; + +struct i40iw_sc_qp { + struct i40iw_qp_uk qp_uk; + u64 sq_pa; + u64 rq_pa; + u64 hw_host_ctx_pa; + u64 shadow_area_pa; + u64 q2_pa; + struct i40iw_sc_dev *dev; + struct i40iw_sc_vsi *vsi; + struct i40iw_sc_pd *pd; + u64 *hw_host_ctx; + void *llp_stream_handle; + void *back_qp; + struct i40iw_pfpdu pfpdu; + u8 *q2_buf; + u64 qp_compl_ctx; + u16 qs_handle; + u16 push_idx; + u8 sq_tph_val; + u8 rq_tph_val; + u8 qp_state; + u8 qp_type; + u8 hw_sq_size; + u8 hw_rq_size; + u8 src_mac_addr_idx; + bool sq_tph_en; + bool rq_tph_en; + bool rcv_tph_en; + bool xmit_tph_en; + bool virtual_map; + bool flush_sq; + bool flush_rq; + u8 user_pri; + struct list_head list; + bool on_qoslist; + bool sq_flush; + enum i40iw_flush_opcode flush_code; + enum i40iw_term_eventtypes eventtype; + u8 term_flags; +}; + +struct i40iw_hmc_fpm_misc { + u32 max_ceqs; + u32 max_sds; + u32 xf_block_size; + u32 q1_block_size; + u32 ht_multiplier; + u32 timer_bucket; +}; + +struct i40iw_vchnl_if { + enum i40iw_status_code (*vchnl_recv)(struct i40iw_sc_dev *, u32, u8 *, u16); + enum i40iw_status_code (*vchnl_send)(struct i40iw_sc_dev *dev, u32, u8 *, u16); +}; + +#define I40IW_VCHNL_MAX_VF_MSG_SIZE 512 + +struct i40iw_vchnl_vf_msg_buffer { + struct i40iw_virtchnl_op_buf vchnl_msg; + char parm_buffer[I40IW_VCHNL_MAX_VF_MSG_SIZE - 1]; +}; + +struct i40iw_qos { + struct list_head qplist; + spinlock_t lock; /* qos list */ + u16 qs_handle; +}; + +struct i40iw_vfdev { + struct i40iw_sc_dev *pf_dev; + u8 *hmc_info_mem; + struct i40iw_vsi_pestat pestat; + struct i40iw_hmc_pble_info *pble_info; + struct i40iw_hmc_info hmc_info; + struct i40iw_vchnl_vf_msg_buffer vf_msg_buffer; + u64 fpm_query_buf_pa; + u64 *fpm_query_buf; + u32 vf_id; + u32 msg_count; + bool pf_hmc_initialized; + u16 pmf_index; + u16 iw_vf_idx; /* VF Device table index */ + bool stats_initialized; +}; + +#define I40IW_INVALID_FCN_ID 0xff +struct i40iw_sc_vsi { + struct i40iw_sc_dev *dev; + void *back_vsi; /* Owned by OS */ + u32 ilq_count; + struct i40iw_virt_mem ilq_mem; + struct i40iw_puda_rsrc *ilq; + u32 ieq_count; + struct i40iw_virt_mem ieq_mem; + struct i40iw_puda_rsrc *ieq; + u16 exception_lan_queue; + u16 mtu; + u8 fcn_id; + bool stats_fcn_id_alloc; + struct i40iw_qos qos[I40IW_MAX_USER_PRIORITY]; + struct i40iw_vsi_pestat *pestat; +}; + +struct i40iw_sc_dev { + struct list_head cqp_cmd_head; /* head of the CQP command list */ + spinlock_t cqp_lock; /* cqp list sync */ + struct i40iw_dev_uk dev_uk; + bool fcn_id_array[I40IW_MAX_STATS_COUNT]; + struct i40iw_dma_mem vf_fpm_query_buf[I40IW_MAX_PE_ENABLED_VF_COUNT]; + u64 fpm_query_buf_pa; + u64 fpm_commit_buf_pa; + u64 *fpm_query_buf; + u64 *fpm_commit_buf; + void *back_dev; + struct i40iw_hw *hw; + u8 
__iomem *db_addr; + struct i40iw_hmc_info *hmc_info; + struct i40iw_hmc_pble_info *pble_info; + struct i40iw_vfdev *vf_dev[I40IW_MAX_PE_ENABLED_VF_COUNT]; + struct i40iw_sc_cqp *cqp; + struct i40iw_sc_aeq *aeq; + struct i40iw_sc_ceq *ceq[I40IW_CEQ_MAX_COUNT]; + struct i40iw_sc_cq *ccq; + struct i40iw_cqp_ops *cqp_ops; + struct i40iw_ccq_ops *ccq_ops; + struct i40iw_ceq_ops *ceq_ops; + struct i40iw_aeq_ops *aeq_ops; + struct i40iw_pd_ops *iw_pd_ops; + struct i40iw_priv_qp_ops *iw_priv_qp_ops; + struct i40iw_priv_cq_ops *iw_priv_cq_ops; + struct i40iw_mr_ops *mr_ops; + struct i40iw_cqp_misc_ops *cqp_misc_ops; + struct i40iw_hmc_ops *hmc_ops; + struct i40iw_vchnl_if vchnl_if; + const struct i40iw_vf_cqp_ops *iw_vf_cqp_ops; + + struct i40iw_hmc_fpm_misc hmc_fpm_misc; + u32 debug_mask; + u8 hmc_fn_id; + bool is_pf; + bool vchnl_up; + bool ceq_valid; + u8 vf_id; + wait_queue_head_t vf_reqs; + u64 cqp_cmd_stats[OP_SIZE_CQP_STAT_ARRAY]; + struct i40iw_vchnl_vf_msg_buffer vchnl_vf_msg_buf; + u8 hw_rev; +}; + +struct i40iw_modify_cq_info { + u64 cq_pa; + struct i40iw_cqe *cq_base; + void *pbl_list; + u32 ceq_id; + u32 cq_size; + u32 shadow_read_threshold; + bool virtual_map; + u8 pbl_chunk_size; + bool check_overflow; + bool cq_resize; + bool ceq_change; + bool check_overflow_change; + u32 first_pm_pbl_idx; + bool ceq_valid; +}; + +struct i40iw_create_qp_info { + u8 next_iwarp_state; + bool ord_valid; + bool tcp_ctx_valid; + bool cq_num_valid; + bool arp_cache_idx_valid; +}; + +struct i40iw_modify_qp_info { + u64 rx_win0; + u64 rx_win1; + u8 next_iwarp_state; + u8 termlen; + bool ord_valid; + bool tcp_ctx_valid; + bool cq_num_valid; + bool arp_cache_idx_valid; + bool reset_tcp_conn; + bool remove_hash_idx; + bool dont_send_term; + bool dont_send_fin; + bool cached_var_valid; + bool force_loopback; +}; + +struct i40iw_ccq_cqe_info { + struct i40iw_sc_cqp *cqp; + u64 scratch; + u32 op_ret_val; + u16 maj_err_code; + u16 min_err_code; + u8 op_code; + bool error; +}; + +struct i40iw_l2params { + u16 qs_handle_list[I40IW_MAX_USER_PRIORITY]; + u16 mtu; +}; + +struct i40iw_vsi_init_info { + struct i40iw_sc_dev *dev; + void *back_vsi; + struct i40iw_l2params *params; + u16 exception_lan_queue; +}; + +struct i40iw_vsi_stats_info { + struct i40iw_vsi_pestat *pestat; + u8 fcn_id; + bool alloc_fcn_id; + bool stats_initialize; +}; + +struct i40iw_device_init_info { + u64 fpm_query_buf_pa; + u64 fpm_commit_buf_pa; + u64 *fpm_query_buf; + u64 *fpm_commit_buf; + struct i40iw_hw *hw; + void __iomem *bar0; + enum i40iw_status_code (*vchnl_send)(struct i40iw_sc_dev *, u32, u8 *, u16); + u8 hmc_fn_id; + bool is_pf; + u32 debug_mask; +}; + +enum i40iw_cqp_hmc_profile { + I40IW_HMC_PROFILE_DEFAULT = 1, + I40IW_HMC_PROFILE_FAVOR_VF = 2, + I40IW_HMC_PROFILE_EQUAL = 3, +}; + +struct i40iw_cqp_init_info { + u64 cqp_compl_ctx; + u64 host_ctx_pa; + u64 sq_pa; + struct i40iw_sc_dev *dev; + struct i40iw_cqp_quanta *sq; + u64 *host_ctx; + u64 *scratch_array; + u32 sq_size; + u8 struct_ver; + bool en_datacenter_tcp; + u8 hmc_profile; + u8 enabled_vf_count; +}; + +struct i40iw_ceq_init_info { + u64 ceqe_pa; + struct i40iw_sc_dev *dev; + u64 *ceqe_base; + void *pbl_list; + u32 elem_cnt; + u32 ceq_id; + bool virtual_map; + u8 pbl_chunk_size; + bool tph_en; + u8 tph_val; + u32 first_pm_pbl_idx; +}; + +struct i40iw_aeq_init_info { + u64 aeq_elem_pa; + struct i40iw_sc_dev *dev; + u32 *aeqe_base; + void *pbl_list; + u32 elem_cnt; + bool virtual_map; + u8 pbl_chunk_size; + u32 first_pm_pbl_idx; +}; + +struct i40iw_ccq_init_info { + u64 
cq_pa; + u64 shadow_area_pa; + struct i40iw_sc_dev *dev; + struct i40iw_cqe *cq_base; + u64 *shadow_area; + void *pbl_list; + u32 num_elem; + u32 ceq_id; + u32 shadow_read_threshold; + bool ceqe_mask; + bool ceq_id_valid; + bool tph_en; + u8 tph_val; + bool avoid_mem_cflct; + bool virtual_map; + u8 pbl_chunk_size; + u32 first_pm_pbl_idx; +}; + +struct i40iwarp_offload_info { + u16 rcv_mark_offset; + u16 snd_mark_offset; + u16 pd_id; + u8 ddp_ver; + u8 rdmap_ver; + u8 ord_size; + u8 ird_size; + bool wr_rdresp_en; + bool rd_enable; + bool snd_mark_en; + bool rcv_mark_en; + bool bind_en; + bool fast_reg_en; + bool priv_mode_en; + bool lsmm_present; + u8 iwarp_mode; + bool align_hdrs; + bool rcv_no_mpa_crc; + + u8 last_byte_sent; +}; + +struct i40iw_tcp_offload_info { + bool ipv4; + bool no_nagle; + bool insert_vlan_tag; + bool time_stamp; + u8 cwnd_inc_limit; + bool drop_ooo_seg; + u8 dup_ack_thresh; + u8 ttl; + u8 src_mac_addr_idx; + bool avoid_stretch_ack; + u8 tos; + u16 src_port; + u16 dst_port; + u32 dest_ip_addr0; + u32 dest_ip_addr1; + u32 dest_ip_addr2; + u32 dest_ip_addr3; + u32 snd_mss; + u16 vlan_tag; + u16 arp_idx; + u32 flow_label; + bool wscale; + u8 tcp_state; + u8 snd_wscale; + u8 rcv_wscale; + u32 time_stamp_recent; + u32 time_stamp_age; + u32 snd_nxt; + u32 snd_wnd; + u32 rcv_nxt; + u32 rcv_wnd; + u32 snd_max; + u32 snd_una; + u32 srtt; + u32 rtt_var; + u32 ss_thresh; + u32 cwnd; + u32 snd_wl1; + u32 snd_wl2; + u32 max_snd_window; + u8 rexmit_thresh; + u32 local_ipaddr0; + u32 local_ipaddr1; + u32 local_ipaddr2; + u32 local_ipaddr3; + bool ignore_tcp_opt; + bool ignore_tcp_uns_opt; +}; + +struct i40iw_qp_host_ctx_info { + u64 qp_compl_ctx; + struct i40iw_tcp_offload_info *tcp_info; + struct i40iwarp_offload_info *iwarp_info; + u32 send_cq_num; + u32 rcv_cq_num; + u16 push_idx; + bool push_mode_en; + bool tcp_info_valid; + bool iwarp_info_valid; + bool err_rq_idx_valid; + u16 err_rq_idx; + bool add_to_qoslist; + u8 user_pri; +}; + +struct i40iw_aeqe_info { + u64 compl_ctx; + u32 qp_cq_id; + u16 ae_id; + u16 wqe_idx; + u8 tcp_state; + u8 iwarp_state; + bool qp; + bool cq; + bool sq; + bool in_rdrsp_wr; + bool out_rdrsp; + u8 q2_data_written; + bool aeqe_overflow; +}; + +struct i40iw_allocate_stag_info { + u64 total_len; + u32 chunk_size; + u32 stag_idx; + u32 page_size; + u16 pd_id; + u16 access_rights; + bool remote_access; + bool use_hmc_fcn_index; + u8 hmc_fcn_index; + bool use_pf_rid; +}; + +struct i40iw_reg_ns_stag_info { + u64 reg_addr_pa; + u64 fbo; + void *va; + u64 total_len; + u32 page_size; + u32 chunk_size; + u32 first_pm_pbl_index; + enum i40iw_addressing_type addr_type; + i40iw_stag_index stag_idx; + u16 access_rights; + u16 pd_id; + i40iw_stag_key stag_key; + bool use_hmc_fcn_index; + u8 hmc_fcn_index; + bool use_pf_rid; +}; + +struct i40iw_fast_reg_stag_info { + u64 wr_id; + u64 reg_addr_pa; + u64 fbo; + void *va; + u64 total_len; + u32 page_size; + u32 chunk_size; + u32 first_pm_pbl_index; + enum i40iw_addressing_type addr_type; + i40iw_stag_index stag_idx; + u16 access_rights; + u16 pd_id; + i40iw_stag_key stag_key; + bool local_fence; + bool read_fence; + bool signaled; + bool use_hmc_fcn_index; + u8 hmc_fcn_index; + bool use_pf_rid; + bool defer_flag; +}; + +struct i40iw_dealloc_stag_info { + u32 stag_idx; + u16 pd_id; + bool mr; + bool dealloc_pbl; +}; + +struct i40iw_register_shared_stag { + void *va; + enum i40iw_addressing_type addr_type; + i40iw_stag_index new_stag_idx; + i40iw_stag_index parent_stag_idx; + u32 access_rights; + u16 pd_id; + 
i40iw_stag_key new_stag_key; +}; + +struct i40iw_qp_init_info { + struct i40iw_qp_uk_init_info qp_uk_init_info; + struct i40iw_sc_pd *pd; + struct i40iw_sc_vsi *vsi; + u64 *host_ctx; + u8 *q2; + u64 sq_pa; + u64 rq_pa; + u64 host_ctx_pa; + u64 q2_pa; + u64 shadow_area_pa; + int abi_ver; + u8 sq_tph_val; + u8 rq_tph_val; + u8 type; + bool sq_tph_en; + bool rq_tph_en; + bool rcv_tph_en; + bool xmit_tph_en; + bool virtual_map; +}; + +struct i40iw_cq_init_info { + struct i40iw_sc_dev *dev; + u64 cq_base_pa; + u64 shadow_area_pa; + u32 ceq_id; + u32 shadow_read_threshold; + bool virtual_map; + bool ceqe_mask; + u8 pbl_chunk_size; + u32 first_pm_pbl_idx; + bool ceq_id_valid; + bool tph_en; + u8 tph_val; + u8 type; + struct i40iw_cq_uk_init_info cq_uk_init_info; +}; + +struct i40iw_upload_context_info { + u64 buf_pa; + bool freeze_qp; + bool raw_format; + u32 qp_id; + u8 qp_type; +}; + +struct i40iw_add_arp_cache_entry_info { + u8 mac_addr[6]; + u32 reach_max; + u16 arp_index; + bool permanent; +}; + +struct i40iw_apbvt_info { + u16 port; + bool add; +}; + +enum i40iw_quad_entry_type { + I40IW_QHASH_TYPE_TCP_ESTABLISHED = 1, + I40IW_QHASH_TYPE_TCP_SYN, +}; + +enum i40iw_quad_hash_manage_type { + I40IW_QHASH_MANAGE_TYPE_DELETE = 0, + I40IW_QHASH_MANAGE_TYPE_ADD, + I40IW_QHASH_MANAGE_TYPE_MODIFY +}; + +struct i40iw_qhash_table_info { + struct i40iw_sc_vsi *vsi; + enum i40iw_quad_hash_manage_type manage; + enum i40iw_quad_entry_type entry_type; + bool vlan_valid; + bool ipv4_valid; + u8 mac_addr[6]; + u16 vlan_id; + u8 user_pri; + u32 qp_num; + u32 dest_ip[4]; + u32 src_ip[4]; + u16 dest_port; + u16 src_port; +}; + +struct i40iw_local_mac_ipaddr_entry_info { + u8 mac_addr[6]; + u8 entry_idx; +}; + +struct i40iw_cqp_manage_push_page_info { + u32 push_idx; + u16 qs_handle; + u8 free_page; +}; + +struct i40iw_qp_flush_info { + u16 sq_minor_code; + u16 sq_major_code; + u16 rq_minor_code; + u16 rq_major_code; + u16 ae_code; + u8 ae_source; + bool sq; + bool rq; + bool userflushcode; + bool generate_ae; +}; + +struct i40iw_cqp_commit_fpm_values { + u64 qp_base; + u64 cq_base; + u32 hte_base; + u32 arp_base; + u32 apbvt_inuse_base; + u32 mr_base; + u32 xf_base; + u32 xffl_base; + u32 q1_base; + u32 q1fl_base; + u32 fsimc_base; + u32 fsiav_base; + u32 pbl_base; + + u32 qp_cnt; + u32 cq_cnt; + u32 hte_cnt; + u32 arp_cnt; + u32 mr_cnt; + u32 xf_cnt; + u32 xffl_cnt; + u32 q1_cnt; + u32 q1fl_cnt; + u32 fsimc_cnt; + u32 fsiav_cnt; + u32 pbl_cnt; +}; + +struct i40iw_cqp_query_fpm_values { + u16 first_pe_sd_index; + u32 qp_objsize; + u32 cq_objsize; + u32 hte_objsize; + u32 arp_objsize; + u32 mr_objsize; + u32 xf_objsize; + u32 q1_objsize; + u32 fsimc_objsize; + u32 fsiav_objsize; + + u32 qp_max; + u32 cq_max; + u32 hte_max; + u32 arp_max; + u32 mr_max; + u32 xf_max; + u32 xffl_max; + u32 q1_max; + u32 q1fl_max; + u32 fsimc_max; + u32 fsiav_max; + u32 pbl_max; +}; + +struct i40iw_gen_ae_info { + u16 ae_code; + u8 ae_source; +}; + +struct i40iw_cqp_ops { + enum i40iw_status_code (*cqp_init)(struct i40iw_sc_cqp *, + struct i40iw_cqp_init_info *); + enum i40iw_status_code (*cqp_create)(struct i40iw_sc_cqp *, u16 *, u16 *); + void (*cqp_post_sq)(struct i40iw_sc_cqp *); + u64 *(*cqp_get_next_send_wqe)(struct i40iw_sc_cqp *, u64 scratch); + enum i40iw_status_code (*cqp_destroy)(struct i40iw_sc_cqp *); + enum i40iw_status_code (*poll_for_cqp_op_done)(struct i40iw_sc_cqp *, u8, + struct i40iw_ccq_cqe_info *); +}; + +struct i40iw_ccq_ops { + enum i40iw_status_code (*ccq_init)(struct i40iw_sc_cq *, + struct 
i40iw_ccq_init_info *); + enum i40iw_status_code (*ccq_create)(struct i40iw_sc_cq *, u64, bool, bool); + enum i40iw_status_code (*ccq_destroy)(struct i40iw_sc_cq *, u64, bool); + enum i40iw_status_code (*ccq_create_done)(struct i40iw_sc_cq *); + enum i40iw_status_code (*ccq_get_cqe_info)(struct i40iw_sc_cq *, + struct i40iw_ccq_cqe_info *); + void (*ccq_arm)(struct i40iw_sc_cq *); +}; + +struct i40iw_ceq_ops { + enum i40iw_status_code (*ceq_init)(struct i40iw_sc_ceq *, + struct i40iw_ceq_init_info *); + enum i40iw_status_code (*ceq_create)(struct i40iw_sc_ceq *, u64, bool); + enum i40iw_status_code (*cceq_create_done)(struct i40iw_sc_ceq *); + enum i40iw_status_code (*cceq_destroy_done)(struct i40iw_sc_ceq *); + enum i40iw_status_code (*cceq_create)(struct i40iw_sc_ceq *, u64); + enum i40iw_status_code (*ceq_destroy)(struct i40iw_sc_ceq *, u64, bool); + void *(*process_ceq)(struct i40iw_sc_dev *, struct i40iw_sc_ceq *); +}; + +struct i40iw_aeq_ops { + enum i40iw_status_code (*aeq_init)(struct i40iw_sc_aeq *, + struct i40iw_aeq_init_info *); + enum i40iw_status_code (*aeq_create)(struct i40iw_sc_aeq *, u64, bool); + enum i40iw_status_code (*aeq_destroy)(struct i40iw_sc_aeq *, u64, bool); + enum i40iw_status_code (*get_next_aeqe)(struct i40iw_sc_aeq *, + struct i40iw_aeqe_info *); + enum i40iw_status_code (*repost_aeq_entries)(struct i40iw_sc_dev *, u32); + enum i40iw_status_code (*aeq_create_done)(struct i40iw_sc_aeq *); + enum i40iw_status_code (*aeq_destroy_done)(struct i40iw_sc_aeq *); +}; + +struct i40iw_pd_ops { + void (*pd_init)(struct i40iw_sc_dev *, struct i40iw_sc_pd *, u16, int); +}; + +struct i40iw_priv_qp_ops { + enum i40iw_status_code (*qp_init)(struct i40iw_sc_qp *, struct i40iw_qp_init_info *); + enum i40iw_status_code (*qp_create)(struct i40iw_sc_qp *, + struct i40iw_create_qp_info *, u64, bool); + enum i40iw_status_code (*qp_modify)(struct i40iw_sc_qp *, + struct i40iw_modify_qp_info *, u64, bool); + enum i40iw_status_code (*qp_destroy)(struct i40iw_sc_qp *, u64, bool, bool, bool); + enum i40iw_status_code (*qp_flush_wqes)(struct i40iw_sc_qp *, + struct i40iw_qp_flush_info *, u64, bool); + enum i40iw_status_code (*qp_upload_context)(struct i40iw_sc_dev *, + struct i40iw_upload_context_info *, + u64, bool); + enum i40iw_status_code (*qp_setctx)(struct i40iw_sc_qp *, u64 *, + struct i40iw_qp_host_ctx_info *); + + void (*qp_send_lsmm)(struct i40iw_sc_qp *, void *, u32, i40iw_stag); + void (*qp_send_lsmm_nostag)(struct i40iw_sc_qp *, void *, u32); + void (*qp_send_rtt)(struct i40iw_sc_qp *, bool); + enum i40iw_status_code (*qp_post_wqe0)(struct i40iw_sc_qp *, u8); + enum i40iw_status_code (*iw_mr_fast_register)(struct i40iw_sc_qp *, + struct i40iw_fast_reg_stag_info *, + bool); +}; + +struct i40iw_priv_cq_ops { + enum i40iw_status_code (*cq_init)(struct i40iw_sc_cq *, struct i40iw_cq_init_info *); + enum i40iw_status_code (*cq_create)(struct i40iw_sc_cq *, u64, bool, bool); + enum i40iw_status_code (*cq_destroy)(struct i40iw_sc_cq *, u64, bool); + enum i40iw_status_code (*cq_modify)(struct i40iw_sc_cq *, + struct i40iw_modify_cq_info *, u64, bool); +}; + +struct i40iw_mr_ops { + enum i40iw_status_code (*alloc_stag)(struct i40iw_sc_dev *, + struct i40iw_allocate_stag_info *, u64, bool); + enum i40iw_status_code (*mr_reg_non_shared)(struct i40iw_sc_dev *, + struct i40iw_reg_ns_stag_info *, + u64, bool); + enum i40iw_status_code (*mr_reg_shared)(struct i40iw_sc_dev *, + struct i40iw_register_shared_stag *, + u64, bool); + enum i40iw_status_code (*dealloc_stag)(struct i40iw_sc_dev 
*, + struct i40iw_dealloc_stag_info *, + u64, bool); + enum i40iw_status_code (*query_stag)(struct i40iw_sc_dev *, u64, u32, bool); + enum i40iw_status_code (*mw_alloc)(struct i40iw_sc_dev *, u64, u32, u16, bool); +}; + +struct i40iw_cqp_misc_ops { + enum i40iw_status_code (*manage_push_page)(struct i40iw_sc_cqp *, + struct i40iw_cqp_manage_push_page_info *, + u64, bool); + enum i40iw_status_code (*manage_hmc_pm_func_table)(struct i40iw_sc_cqp *, + u64, u8, bool, bool); + enum i40iw_status_code (*set_hmc_resource_profile)(struct i40iw_sc_cqp *, + u64, u8, u8, bool, bool); + enum i40iw_status_code (*commit_fpm_values)(struct i40iw_sc_cqp *, u64, u8, + struct i40iw_dma_mem *, bool, u8); + enum i40iw_status_code (*query_fpm_values)(struct i40iw_sc_cqp *, u64, u8, + struct i40iw_dma_mem *, bool, u8); + enum i40iw_status_code (*static_hmc_pages_allocated)(struct i40iw_sc_cqp *, + u64, u8, bool, bool); + enum i40iw_status_code (*add_arp_cache_entry)(struct i40iw_sc_cqp *, + struct i40iw_add_arp_cache_entry_info *, + u64, bool); + enum i40iw_status_code (*del_arp_cache_entry)(struct i40iw_sc_cqp *, u64, u16, bool); + enum i40iw_status_code (*query_arp_cache_entry)(struct i40iw_sc_cqp *, u64, u16, bool); + enum i40iw_status_code (*manage_apbvt_entry)(struct i40iw_sc_cqp *, + struct i40iw_apbvt_info *, u64, bool); + enum i40iw_status_code (*manage_qhash_table_entry)(struct i40iw_sc_cqp *, + struct i40iw_qhash_table_info *, u64, bool); + enum i40iw_status_code (*alloc_local_mac_ipaddr_table_entry)(struct i40iw_sc_cqp *, u64, bool); + enum i40iw_status_code (*add_local_mac_ipaddr_entry)(struct i40iw_sc_cqp *, + struct i40iw_local_mac_ipaddr_entry_info *, + u64, bool); + enum i40iw_status_code (*del_local_mac_ipaddr_entry)(struct i40iw_sc_cqp *, u64, u8, u8, bool); + enum i40iw_status_code (*cqp_nop)(struct i40iw_sc_cqp *, u64, bool); + enum i40iw_status_code (*commit_fpm_values_done)(struct i40iw_sc_cqp + *); + enum i40iw_status_code (*query_fpm_values_done)(struct i40iw_sc_cqp *); + enum i40iw_status_code (*manage_hmc_pm_func_table_done)(struct i40iw_sc_cqp *); + enum i40iw_status_code (*update_suspend_qp)(struct i40iw_sc_cqp *, struct i40iw_sc_qp *, u64); + enum i40iw_status_code (*update_resume_qp)(struct i40iw_sc_cqp *, struct i40iw_sc_qp *, u64); +}; + +struct i40iw_hmc_ops { + enum i40iw_status_code (*init_iw_hmc)(struct i40iw_sc_dev *, u8); + enum i40iw_status_code (*parse_fpm_query_buf)(u64 *, struct i40iw_hmc_info *, + struct i40iw_hmc_fpm_misc *); + enum i40iw_status_code (*configure_iw_fpm)(struct i40iw_sc_dev *, u8); + enum i40iw_status_code (*parse_fpm_commit_buf)(u64 *, struct i40iw_hmc_obj_info *, u32 *sd); + enum i40iw_status_code (*create_hmc_object)(struct i40iw_sc_dev *dev, + struct i40iw_hmc_create_obj_info *); + enum i40iw_status_code (*del_hmc_object)(struct i40iw_sc_dev *dev, + struct i40iw_hmc_del_obj_info *, + bool reset); + enum i40iw_status_code (*pf_init_vfhmc)(struct i40iw_sc_dev *, u8, u32 *); + enum i40iw_status_code (*vf_configure_vffpm)(struct i40iw_sc_dev *, u32 *); +}; + +struct cqp_info { + union { + struct { + struct i40iw_sc_qp *qp; + struct i40iw_create_qp_info info; + u64 scratch; + } qp_create; + + struct { + struct i40iw_sc_qp *qp; + struct i40iw_modify_qp_info info; + u64 scratch; + } qp_modify; + + struct { + struct i40iw_sc_qp *qp; + u64 scratch; + bool remove_hash_idx; + bool ignore_mw_bnd; + } qp_destroy; + + struct { + struct i40iw_sc_cq *cq; + u64 scratch; + bool check_overflow; + } cq_create; + + struct { + struct i40iw_sc_cq *cq; + u64 scratch; + } 
cq_destroy; + + struct { + struct i40iw_sc_dev *dev; + struct i40iw_allocate_stag_info info; + u64 scratch; + } alloc_stag; + + struct { + struct i40iw_sc_dev *dev; + u64 scratch; + u32 mw_stag_index; + u16 pd_id; + } mw_alloc; + + struct { + struct i40iw_sc_dev *dev; + struct i40iw_reg_ns_stag_info info; + u64 scratch; + } mr_reg_non_shared; + + struct { + struct i40iw_sc_dev *dev; + struct i40iw_dealloc_stag_info info; + u64 scratch; + } dealloc_stag; + + struct { + struct i40iw_sc_cqp *cqp; + struct i40iw_local_mac_ipaddr_entry_info info; + u64 scratch; + } add_local_mac_ipaddr_entry; + + struct { + struct i40iw_sc_cqp *cqp; + struct i40iw_add_arp_cache_entry_info info; + u64 scratch; + } add_arp_cache_entry; + + struct { + struct i40iw_sc_cqp *cqp; + u64 scratch; + u8 entry_idx; + u8 ignore_ref_count; + } del_local_mac_ipaddr_entry; + + struct { + struct i40iw_sc_cqp *cqp; + u64 scratch; + u16 arp_index; + } del_arp_cache_entry; + + struct { + struct i40iw_sc_cqp *cqp; + struct i40iw_manage_vf_pble_info info; + u64 scratch; + } manage_vf_pble_bp; + + struct { + struct i40iw_sc_cqp *cqp; + struct i40iw_cqp_manage_push_page_info info; + u64 scratch; + } manage_push_page; + + struct { + struct i40iw_sc_dev *dev; + struct i40iw_upload_context_info info; + u64 scratch; + } qp_upload_context; + + struct { + struct i40iw_sc_cqp *cqp; + u64 scratch; + } alloc_local_mac_ipaddr_entry; + + struct { + struct i40iw_sc_dev *dev; + struct i40iw_hmc_fcn_info info; + u64 scratch; + } manage_hmc_pm; + + struct { + struct i40iw_sc_ceq *ceq; + u64 scratch; + } ceq_create; + + struct { + struct i40iw_sc_ceq *ceq; + u64 scratch; + } ceq_destroy; + + struct { + struct i40iw_sc_aeq *aeq; + u64 scratch; + } aeq_create; + + struct { + struct i40iw_sc_aeq *aeq; + u64 scratch; + } aeq_destroy; + + struct { + struct i40iw_sc_qp *qp; + struct i40iw_qp_flush_info info; + u64 scratch; + } qp_flush_wqes; + + struct { + struct i40iw_sc_qp *qp; + struct i40iw_gen_ae_info info; + u64 scratch; + } gen_ae; + + struct { + struct i40iw_sc_cqp *cqp; + void *fpm_values_va; + u64 fpm_values_pa; + u8 hmc_fn_id; + u64 scratch; + } query_fpm_values; + + struct { + struct i40iw_sc_cqp *cqp; + void *fpm_values_va; + u64 fpm_values_pa; + u8 hmc_fn_id; + u64 scratch; + } commit_fpm_values; + + struct { + struct i40iw_sc_cqp *cqp; + struct i40iw_apbvt_info info; + u64 scratch; + } manage_apbvt_entry; + + struct { + struct i40iw_sc_cqp *cqp; + struct i40iw_qhash_table_info info; + u64 scratch; + } manage_qhash_table_entry; + + struct { + struct i40iw_sc_dev *dev; + struct i40iw_update_sds_info info; + u64 scratch; + } update_pe_sds; + + struct { + struct i40iw_sc_cqp *cqp; + struct i40iw_sc_qp *qp; + u64 scratch; + } suspend_resume; + } u; +}; + +struct cqp_commands_info { + struct list_head cqp_cmd_entry; + u8 cqp_cmd; + u8 post_sq; + struct cqp_info in; +}; + +struct i40iw_virtchnl_work_info { + void (*callback_fcn)(void *vf_dev); + void *worker_vf_dev; +}; + +struct i40iw_cqp_timeout { + u64 compl_cqp_cmds; + u8 count; +}; + +#endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c new file mode 100644 index 0000000..8afa5a6 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c @@ -0,0 +1,1232 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. 
You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#include "i40iw_osdep.h" +#include "i40iw_status.h" +#include "i40iw_d.h" +#include "i40iw_user.h" +#include "i40iw_register.h" + +static u32 nop_signature = 0x55550000; + +/** + * i40iw_nop_1 - insert a nop wqe and move head. no post work + * @qp: hw qp ptr + */ +static enum i40iw_status_code i40iw_nop_1(struct i40iw_qp_uk *qp) +{ + u64 header, *wqe; + u64 *wqe_0 = NULL; + u32 wqe_idx, peek_head; + bool signaled = false; + + if (!qp->sq_ring.head) + return I40IW_ERR_PARAM; + + wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + wqe = qp->sq_base[wqe_idx].elem; + + qp->sq_wrtrk_array[wqe_idx].wqe_size = I40IW_QP_WQE_MIN_SIZE; + + peek_head = (qp->sq_ring.head + 1) % qp->sq_ring.size; + wqe_0 = qp->sq_base[peek_head].elem; + if (peek_head) + wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID); + else + wqe_0[3] = LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + set_64bit_val(wqe, 0, 0); + set_64bit_val(wqe, 8, 0); + set_64bit_val(wqe, 16, 0); + + header = LS_64(I40IWQP_OP_NOP, I40IWQPSQ_OPCODE) | + LS_64(signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID) | nop_signature++; + + wmb(); /* Memory barrier to ensure data is written before valid bit is set */ + + set_64bit_val(wqe, 24, header); + return 0; +} + +/** + * i40iw_qp_post_wr - post wr to hrdware + * @qp: hw qp ptr + */ +void i40iw_qp_post_wr(struct i40iw_qp_uk *qp) +{ + u64 temp; + u32 hw_sq_tail; + u32 sw_sq_head; + + mb(); /* valid bit is written and loads completed before reading shadow */ + + /* read the doorbell shadow area */ + get_64bit_val(qp->shadow_area, 0, &temp); + + hw_sq_tail = (u32)RS_64(temp, I40IW_QP_DBSA_HW_SQ_TAIL); + sw_sq_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + if (sw_sq_head != hw_sq_tail) { + if (sw_sq_head > qp->initial_ring.head) { + if ((hw_sq_tail >= qp->initial_ring.head) && + (hw_sq_tail < sw_sq_head)) { + writel(qp->qp_id, qp->wqe_alloc_reg); + } + } else if (sw_sq_head != qp->initial_ring.head) { + if ((hw_sq_tail >= qp->initial_ring.head) || + (hw_sq_tail < sw_sq_head)) { + writel(qp->qp_id, qp->wqe_alloc_reg); + } + } + } + + qp->initial_ring.head = qp->sq_ring.head; +} + +/** + * i40iw_qp_ring_push_db - ring qp doorbell + * @qp: hw qp ptr + * @wqe_idx: wqe index + */ +static 
void i40iw_qp_ring_push_db(struct i40iw_qp_uk *qp, u32 wqe_idx) +{ + set_32bit_val(qp->push_db, 0, LS_32((wqe_idx >> 2), I40E_PFPE_WQEALLOC_WQE_DESC_INDEX) | qp->qp_id); + qp->initial_ring.head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); +} + +/** + * i40iw_qp_get_next_send_wqe - return next wqe ptr + * @qp: hw qp ptr + * @wqe_idx: return wqe index + * @wqe_size: size of sq wqe + */ +u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, + u32 *wqe_idx, + u8 wqe_size, + u32 total_size, + u64 wr_id + ) +{ + u64 *wqe = NULL; + u64 wqe_ptr; + u32 peek_head = 0; + u16 offset; + enum i40iw_status_code ret_code = 0; + u8 nop_wqe_cnt = 0, i; + u64 *wqe_0 = NULL; + + *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + + if (!*wqe_idx) + qp->swqe_polarity = !qp->swqe_polarity; + wqe_ptr = (uintptr_t)qp->sq_base[*wqe_idx].elem; + offset = (u16)(wqe_ptr) & 0x7F; + if ((offset + wqe_size) > I40IW_QP_WQE_MAX_SIZE) { + nop_wqe_cnt = (u8)(I40IW_QP_WQE_MAX_SIZE - offset) / I40IW_QP_WQE_MIN_SIZE; + for (i = 0; i < nop_wqe_cnt; i++) { + i40iw_nop_1(qp); + I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code); + if (ret_code) + return NULL; + } + + *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + if (!*wqe_idx) + qp->swqe_polarity = !qp->swqe_polarity; + } + + if (((*wqe_idx & 3) == 1) && (wqe_size == I40IW_WQE_SIZE_64)) { + i40iw_nop_1(qp); + I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code); + if (ret_code) + return NULL; + *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + if (!*wqe_idx) + qp->swqe_polarity = !qp->swqe_polarity; + } + I40IW_RING_MOVE_HEAD_BY_COUNT(qp->sq_ring, + wqe_size / I40IW_QP_WQE_MIN_SIZE, ret_code); + if (ret_code) + return NULL; + + wqe = qp->sq_base[*wqe_idx].elem; + + peek_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + wqe_0 = qp->sq_base[peek_head].elem; + + if (((peek_head & 3) == 1) || ((peek_head & 3) == 3)) { + if (RS_64(wqe_0[3], I40IWQPSQ_VALID) != !qp->swqe_polarity) + wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID); + } + + qp->sq_wrtrk_array[*wqe_idx].wrid = wr_id; + qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size; + qp->sq_wrtrk_array[*wqe_idx].wqe_size = wqe_size; + return wqe; +} + +/** + * i40iw_set_fragment - set fragment in wqe + * @wqe: wqe for setting fragment + * @offset: offset value + * @sge: sge length and stag + */ +static void i40iw_set_fragment(u64 *wqe, u32 offset, struct i40iw_sge *sge) +{ + if (sge) { + set_64bit_val(wqe, offset, LS_64(sge->tag_off, I40IWQPSQ_FRAG_TO)); + set_64bit_val(wqe, (offset + 8), + (LS_64(sge->len, I40IWQPSQ_FRAG_LEN) | + LS_64(sge->stag, I40IWQPSQ_FRAG_STAG))); + } +} + +/** + * i40iw_qp_get_next_recv_wqe - get next qp's rcv wqe + * @qp: hw qp ptr + * @wqe_idx: return wqe index + */ +u64 *i40iw_qp_get_next_recv_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx) +{ + u64 *wqe = NULL; + enum i40iw_status_code ret_code; + + if (I40IW_RING_FULL_ERR(qp->rq_ring)) + return NULL; + + I40IW_ATOMIC_RING_MOVE_HEAD(qp->rq_ring, *wqe_idx, ret_code); + if (ret_code) + return NULL; + if (!*wqe_idx) + qp->rwqe_polarity = !qp->rwqe_polarity; + /* rq_wqe_size_multiplier is no of qwords in one rq wqe */ + wqe = qp->rq_base[*wqe_idx * (qp->rq_wqe_size_multiplier >> 2)].elem; + + return wqe; +} + +/** + * i40iw_rdma_write - rdma write operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + bool post_sq) +{ + u64 header; + u64 *wqe; + struct i40iw_rdma_write *op_info; + u32 i, wqe_idx; + u32 total_size = 
0, byte_off; + enum i40iw_status_code ret_code; + bool read_fence = false; + u8 wqe_size; + + op_info = &info->op.rdma_write; + if (op_info->num_lo_sges > qp->max_sq_frag_cnt) + return I40IW_ERR_INVALID_FRAG_COUNT; + + for (i = 0; i < op_info->num_lo_sges; i++) + total_size += op_info->lo_sg_list[i].len; + + if (total_size > I40IW_MAX_OUTBOUND_MESSAGE_SIZE) + return I40IW_ERR_QP_INVALID_MSG_SIZE; + + read_fence |= info->read_fence; + + ret_code = i40iw_fragcnt_to_wqesize_sq(op_info->num_lo_sges, &wqe_size); + if (ret_code) + return ret_code; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + set_64bit_val(wqe, 16, + LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); + if (!op_info->rem_addr.stag) + return I40IW_ERR_BAD_STAG; + + header = LS_64(op_info->rem_addr.stag, I40IWQPSQ_REMSTAG) | + LS_64(I40IWQP_OP_RDMA_WRITE, I40IWQPSQ_OPCODE) | + LS_64((op_info->num_lo_sges > 1 ? (op_info->num_lo_sges - 1) : 0), I40IWQPSQ_ADDFRAGCNT) | + LS_64(read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + i40iw_set_fragment(wqe, 0, op_info->lo_sg_list); + + for (i = 1, byte_off = 32; i < op_info->num_lo_sges; i++) { + i40iw_set_fragment(wqe, byte_off, &op_info->lo_sg_list[i]); + byte_off += 16; + } + + wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, header); + + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_rdma_read - rdma read command + * @qp: hw qp ptr + * @info: post sq information + * @inv_stag: flag for inv_stag + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_rdma_read(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + bool inv_stag, + bool post_sq) +{ + u64 *wqe; + struct i40iw_rdma_read *op_info; + u64 header; + u32 wqe_idx; + enum i40iw_status_code ret_code; + u8 wqe_size; + bool local_fence = false; + + op_info = &info->op.rdma_read; + ret_code = i40iw_fragcnt_to_wqesize_sq(1, &wqe_size); + if (ret_code) + return ret_code; + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->lo_addr.len, info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + local_fence |= info->local_fence; + + set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); + header = LS_64(op_info->rem_addr.stag, I40IWQPSQ_REMSTAG) | + LS_64((inv_stag ? 
I40IWQP_OP_RDMA_READ_LOC_INV : I40IWQP_OP_RDMA_READ), I40IWQPSQ_OPCODE) | + LS_64(info->read_fence, I40IWQPSQ_READFENCE) | + LS_64(local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + i40iw_set_fragment(wqe, 0, &op_info->lo_addr); + + wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, header); + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_send - rdma send command + * @qp: hw qp ptr + * @info: post sq information + * @stag_to_inv: stag_to_inv value + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + u32 stag_to_inv, + bool post_sq) +{ + u64 *wqe; + struct i40iw_post_send *op_info; + u64 header; + u32 i, wqe_idx, total_size = 0, byte_off; + enum i40iw_status_code ret_code; + bool read_fence = false; + u8 wqe_size; + + op_info = &info->op.send; + if (qp->max_sq_frag_cnt < op_info->num_sges) + return I40IW_ERR_INVALID_FRAG_COUNT; + + for (i = 0; i < op_info->num_sges; i++) + total_size += op_info->sg_list[i].len; + ret_code = i40iw_fragcnt_to_wqesize_sq(op_info->num_sges, &wqe_size); + if (ret_code) + return ret_code; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + read_fence |= info->read_fence; + set_64bit_val(wqe, 16, 0); + header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) | + LS_64(info->op_type, I40IWQPSQ_OPCODE) | + LS_64((op_info->num_sges > 1 ? (op_info->num_sges - 1) : 0), + I40IWQPSQ_ADDFRAGCNT) | + LS_64(read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + i40iw_set_fragment(wqe, 0, op_info->sg_list); + + for (i = 1, byte_off = 32; i < op_info->num_sges; i++) { + i40iw_set_fragment(wqe, byte_off, &op_info->sg_list[i]); + byte_off += 16; + } + + wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, header); + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_inline_rdma_write - inline rdma write operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_inline_rdma_write(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + bool post_sq) +{ + u64 *wqe; + u8 *dest, *src; + struct i40iw_inline_rdma_write *op_info; + u64 *push; + u64 header = 0; + u32 wqe_idx; + enum i40iw_status_code ret_code; + bool read_fence = false; + u8 wqe_size; + + op_info = &info->op.inline_rdma_write; + if (op_info->len > I40IW_MAX_INLINE_DATA_SIZE) + return I40IW_ERR_INVALID_INLINE_DATA_SIZE; + + ret_code = i40iw_inline_data_size_to_wqesize(op_info->len, &wqe_size); + if (ret_code) + return ret_code; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + read_fence |= info->read_fence; + set_64bit_val(wqe, 16, + LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); + + header = LS_64(op_info->rem_addr.stag, I40IWQPSQ_REMSTAG) | + LS_64(I40IWQP_OP_RDMA_WRITE, I40IWQPSQ_OPCODE) | + LS_64(op_info->len, I40IWQPSQ_INLINEDATALEN) | + LS_64(1, I40IWQPSQ_INLINEDATAFLAG) | + LS_64((qp->push_db ? 
1 : 0), I40IWQPSQ_PUSHWQE) | + LS_64(read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + dest = (u8 *)wqe; + src = (u8 *)(op_info->data); + + if (op_info->len <= 16) { + memcpy(dest, src, op_info->len); + } else { + memcpy(dest, src, 16); + src += 16; + dest = (u8 *)wqe + 32; + memcpy(dest, src, op_info->len - 16); + } + + wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, header); + + if (qp->push_db) { + push = (u64 *)((uintptr_t)qp->push_wqe + (wqe_idx & 0x3) * 0x20); + memcpy(push, wqe, (op_info->len > 16) ? op_info->len + 16 : 32); + i40iw_qp_ring_push_db(qp, wqe_idx); + } else { + if (post_sq) + i40iw_qp_post_wr(qp); + } + + return 0; +} + +/** + * i40iw_inline_send - inline send operation + * @qp: hw qp ptr + * @info: post sq information + * @stag_to_inv: remote stag + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_inline_send(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + u32 stag_to_inv, + bool post_sq) +{ + u64 *wqe; + u8 *dest, *src; + struct i40iw_post_inline_send *op_info; + u64 header; + u32 wqe_idx; + enum i40iw_status_code ret_code; + bool read_fence = false; + u8 wqe_size; + u64 *push; + + op_info = &info->op.inline_send; + if (op_info->len > I40IW_MAX_INLINE_DATA_SIZE) + return I40IW_ERR_INVALID_INLINE_DATA_SIZE; + + ret_code = i40iw_inline_data_size_to_wqesize(op_info->len, &wqe_size); + if (ret_code) + return ret_code; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + read_fence |= info->read_fence; + header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) | + LS_64(info->op_type, I40IWQPSQ_OPCODE) | + LS_64(op_info->len, I40IWQPSQ_INLINEDATALEN) | + LS_64(1, I40IWQPSQ_INLINEDATAFLAG) | + LS_64((qp->push_db ? 1 : 0), I40IWQPSQ_PUSHWQE) | + LS_64(read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + dest = (u8 *)wqe; + src = (u8 *)(op_info->data); + + if (op_info->len <= 16) { + memcpy(dest, src, op_info->len); + } else { + memcpy(dest, src, 16); + src += 16; + dest = (u8 *)wqe + 32; + memcpy(dest, src, op_info->len - 16); + } + + wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, header); + + if (qp->push_db) { + push = (u64 *)((uintptr_t)qp->push_wqe + (wqe_idx & 0x3) * 0x20); + memcpy(push, wqe, (op_info->len > 16) ? 
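/*
 * Editor's note: illustrative layout, not part of the original patch.
 * The inline paths above split the payload around the two control qwords
 * of the WQE: the first 16 data bytes go to WQE offset 0, bytes 16..31
 * are left for the remote tagged offset (RDMA write) and the WQE header,
 * and the remainder of the data (48 bytes maximum in total,
 * I40IW_MAX_INLINE_DATA_SIZE) continues at offset 32. A 40-byte inline
 * RDMA write is therefore laid out as:
 *
 *   bytes  0..15   data[0..15]
 *   bytes 16..23   remote tagged offset (I40IWQPSQ_FRAG_TO)
 *   bytes 24..31   WQE header (opcode, flags, valid bit)
 *   bytes 32..55   data[16..39]
 *
 * When a push page is mapped (qp->push_db is non-NULL) the completed WQE
 * is additionally copied into push-page slot (wqe_idx & 3) * 0x20 and the
 * push doorbell is rung via i40iw_qp_ring_push_db() instead of the normal
 * WQE-allocate doorbell.
 */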
op_info->len + 16 : 32); + i40iw_qp_ring_push_db(qp, wqe_idx); + } else { + if (post_sq) + i40iw_qp_post_wr(qp); + } + + return 0; +} + +/** + * i40iw_stag_local_invalidate - stag invalidate operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_stag_local_invalidate(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + bool post_sq) +{ + u64 *wqe; + struct i40iw_inv_local_stag *op_info; + u64 header; + u32 wqe_idx; + bool local_fence = false; + + op_info = &info->op.inv_local_stag; + local_fence = info->local_fence; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + set_64bit_val(wqe, 0, 0); + set_64bit_val(wqe, 8, + LS_64(op_info->target_stag, I40IWQPSQ_LOCSTAG)); + set_64bit_val(wqe, 16, 0); + header = LS_64(I40IW_OP_TYPE_INV_STAG, I40IWQPSQ_OPCODE) | + LS_64(info->read_fence, I40IWQPSQ_READFENCE) | + LS_64(local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, header); + + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_mw_bind - Memory Window bind operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_mw_bind(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + bool post_sq) +{ + u64 *wqe; + struct i40iw_bind_window *op_info; + u64 header; + u32 wqe_idx; + bool local_fence = false; + + op_info = &info->op.bind_window; + + local_fence |= info->local_fence; + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + set_64bit_val(wqe, 0, (uintptr_t)op_info->va); + set_64bit_val(wqe, 8, + LS_64(op_info->mr_stag, I40IWQPSQ_PARENTMRSTAG) | + LS_64(op_info->mw_stag, I40IWQPSQ_MWSTAG)); + set_64bit_val(wqe, 16, op_info->bind_length); + header = LS_64(I40IW_OP_TYPE_BIND_MW, I40IWQPSQ_OPCODE) | + LS_64(((op_info->enable_reads << 2) | + (op_info->enable_writes << 3)), + I40IWQPSQ_STAGRIGHTS) | + LS_64((op_info->addressing_type == I40IW_ADDR_TYPE_VA_BASED ? 1 : 0), + I40IWQPSQ_VABASEDTO) | + LS_64(info->read_fence, I40IWQPSQ_READFENCE) | + LS_64(local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, header); + + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_post_receive - post receive wqe + * @qp: hw qp ptr + * @info: post rq information + */ +static enum i40iw_status_code i40iw_post_receive(struct i40iw_qp_uk *qp, + struct i40iw_post_rq_info *info) +{ + u64 *wqe; + u64 header; + u32 total_size = 0, wqe_idx, i, byte_off; + + if (qp->max_rq_frag_cnt < info->num_sges) + return I40IW_ERR_INVALID_FRAG_COUNT; + for (i = 0; i < info->num_sges; i++) + total_size += info->sg_list[i].len; + wqe = i40iw_qp_get_next_recv_wqe(qp, &wqe_idx); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + qp->rq_wrid_array[wqe_idx] = info->wr_id; + set_64bit_val(wqe, 16, 0); + + header = LS_64((info->num_sges > 1 ? 
(info->num_sges - 1) : 0), + I40IWQPSQ_ADDFRAGCNT) | + LS_64(qp->rwqe_polarity, I40IWQPSQ_VALID); + + i40iw_set_fragment(wqe, 0, info->sg_list); + + for (i = 1, byte_off = 32; i < info->num_sges; i++) { + i40iw_set_fragment(wqe, byte_off, &info->sg_list[i]); + byte_off += 16; + } + + wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, header); + + return 0; +} + +/** + * i40iw_cq_request_notification - cq notification request (door bell) + * @cq: hw cq + * @cq_notify: notification type + */ +static void i40iw_cq_request_notification(struct i40iw_cq_uk *cq, + enum i40iw_completion_notify cq_notify) +{ + u64 temp_val; + u16 sw_cq_sel; + u8 arm_next_se = 0; + u8 arm_next = 0; + u8 arm_seq_num; + + get_64bit_val(cq->shadow_area, 32, &temp_val); + arm_seq_num = (u8)RS_64(temp_val, I40IW_CQ_DBSA_ARM_SEQ_NUM); + arm_seq_num++; + + sw_cq_sel = (u16)RS_64(temp_val, I40IW_CQ_DBSA_SW_CQ_SELECT); + arm_next_se = (u8)RS_64(temp_val, I40IW_CQ_DBSA_ARM_NEXT_SE); + arm_next_se |= 1; + if (cq_notify == IW_CQ_COMPL_EVENT) + arm_next = 1; + temp_val = LS_64(arm_seq_num, I40IW_CQ_DBSA_ARM_SEQ_NUM) | + LS_64(sw_cq_sel, I40IW_CQ_DBSA_SW_CQ_SELECT) | + LS_64(arm_next_se, I40IW_CQ_DBSA_ARM_NEXT_SE) | + LS_64(arm_next, I40IW_CQ_DBSA_ARM_NEXT); + + set_64bit_val(cq->shadow_area, 32, temp_val); + + wmb(); /* make sure WQE is populated before valid bit is set */ + + writel(cq->cq_id, cq->cqe_alloc_reg); +} + +/** + * i40iw_cq_post_entries - update tail in shadow memory + * @cq: hw cq + * @count: # of entries processed + */ +static enum i40iw_status_code i40iw_cq_post_entries(struct i40iw_cq_uk *cq, + u8 count) +{ + I40IW_RING_MOVE_TAIL_BY_COUNT(cq->cq_ring, count); + set_64bit_val(cq->shadow_area, 0, + I40IW_RING_GETCURRENT_HEAD(cq->cq_ring)); + return 0; +} + +/** + * i40iw_cq_poll_completion - get cq completion info + * @cq: hw cq + * @info: cq poll information returned + * @post_cq: update cq tail + */ +static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, + struct i40iw_cq_poll_info *info) +{ + u64 comp_ctx, qword0, qword2, qword3, wqe_qword; + u64 *cqe, *sw_wqe; + struct i40iw_qp_uk *qp; + struct i40iw_ring *pring = NULL; + u32 wqe_idx, q_type, array_idx = 0; + enum i40iw_status_code ret_code = 0; + bool move_cq_head = true; + u8 polarity; + u8 addl_wqes = 0; + + if (cq->avoid_mem_cflct) + cqe = (u64 *)I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(cq); + else + cqe = (u64 *)I40IW_GET_CURRENT_CQ_ELEMENT(cq); + + get_64bit_val(cqe, 24, &qword3); + polarity = (u8)RS_64(qword3, I40IW_CQ_VALID); + + if (polarity != cq->polarity) + return I40IW_ERR_QUEUE_EMPTY; + + q_type = (u8)RS_64(qword3, I40IW_CQ_SQ); + info->error = (bool)RS_64(qword3, I40IW_CQ_ERROR); + info->push_dropped = (bool)RS_64(qword3, I40IWCQ_PSHDROP); + if (info->error) { + info->comp_status = I40IW_COMPL_STATUS_FLUSHED; + info->major_err = (bool)RS_64(qword3, I40IW_CQ_MAJERR); + info->minor_err = (bool)RS_64(qword3, I40IW_CQ_MINERR); + } else { + info->comp_status = I40IW_COMPL_STATUS_SUCCESS; + } + + get_64bit_val(cqe, 0, &qword0); + get_64bit_val(cqe, 16, &qword2); + + info->tcp_seq_num = (u32)RS_64(qword0, I40IWCQ_TCPSEQNUM); + + info->qp_id = (u32)RS_64(qword2, I40IWCQ_QPID); + + get_64bit_val(cqe, 8, &comp_ctx); + + info->solicited_event = (bool)RS_64(qword3, I40IWCQ_SOEVENT); + info->is_srq = (bool)RS_64(qword3, I40IWCQ_SRQ); + + qp = (struct i40iw_qp_uk *)(unsigned long)comp_ctx; + if (!qp) { + ret_code = I40IW_ERR_QUEUE_DESTROYED; + goto exit; + } + wqe_idx = (u32)RS_64(qword3, 
I40IW_CQ_WQEIDX); + info->qp_handle = (i40iw_qp_handle)(unsigned long)qp; + + if (q_type == I40IW_CQE_QTYPE_RQ) { + array_idx = (wqe_idx * 4) / qp->rq_wqe_size_multiplier; + if (info->comp_status == I40IW_COMPL_STATUS_FLUSHED) { + info->wr_id = qp->rq_wrid_array[qp->rq_ring.tail]; + array_idx = qp->rq_ring.tail; + } else { + info->wr_id = qp->rq_wrid_array[array_idx]; + } + + info->op_type = I40IW_OP_TYPE_REC; + if (qword3 & I40IWCQ_STAG_MASK) { + info->stag_invalid_set = true; + info->inv_stag = (u32)RS_64(qword2, I40IWCQ_INVSTAG); + } else { + info->stag_invalid_set = false; + } + info->bytes_xfered = (u32)RS_64(qword0, I40IWCQ_PAYLDLEN); + I40IW_RING_SET_TAIL(qp->rq_ring, array_idx + 1); + pring = &qp->rq_ring; + } else { + if (qp->first_sq_wq) { + qp->first_sq_wq = false; + if (!wqe_idx && (qp->sq_ring.head == qp->sq_ring.tail)) { + I40IW_RING_MOVE_HEAD_NOCHECK(cq->cq_ring); + I40IW_RING_MOVE_TAIL(cq->cq_ring); + set_64bit_val(cq->shadow_area, 0, + I40IW_RING_GETCURRENT_HEAD(cq->cq_ring)); + memset(info, 0, sizeof(struct i40iw_cq_poll_info)); + return i40iw_cq_poll_completion(cq, info); + } + } + + if (info->comp_status != I40IW_COMPL_STATUS_FLUSHED) { + info->wr_id = qp->sq_wrtrk_array[wqe_idx].wrid; + info->bytes_xfered = qp->sq_wrtrk_array[wqe_idx].wr_len; + + info->op_type = (u8)RS_64(qword3, I40IWCQ_OP); + sw_wqe = qp->sq_base[wqe_idx].elem; + get_64bit_val(sw_wqe, 24, &wqe_qword); + + addl_wqes = qp->sq_wrtrk_array[wqe_idx].wqe_size / I40IW_QP_WQE_MIN_SIZE; + I40IW_RING_SET_TAIL(qp->sq_ring, (wqe_idx + addl_wqes)); + } else { + do { + u8 op_type; + u32 tail; + + tail = qp->sq_ring.tail; + sw_wqe = qp->sq_base[tail].elem; + get_64bit_val(sw_wqe, 24, &wqe_qword); + op_type = (u8)RS_64(wqe_qword, I40IWQPSQ_OPCODE); + info->op_type = op_type; + addl_wqes = qp->sq_wrtrk_array[tail].wqe_size / I40IW_QP_WQE_MIN_SIZE; + I40IW_RING_SET_TAIL(qp->sq_ring, (tail + addl_wqes)); + if (op_type != I40IWQP_OP_NOP) { + info->wr_id = qp->sq_wrtrk_array[tail].wrid; + info->bytes_xfered = qp->sq_wrtrk_array[tail].wr_len; + break; + } + } while (1); + } + pring = &qp->sq_ring; + } + + ret_code = 0; + +exit: + if (!ret_code && + (info->comp_status == I40IW_COMPL_STATUS_FLUSHED)) + if (pring && (I40IW_RING_MORE_WORK(*pring))) + move_cq_head = false; + + if (move_cq_head) { + I40IW_RING_MOVE_HEAD_NOCHECK(cq->cq_ring); + + if (I40IW_RING_GETCURRENT_HEAD(cq->cq_ring) == 0) + cq->polarity ^= 1; + + I40IW_RING_MOVE_TAIL(cq->cq_ring); + set_64bit_val(cq->shadow_area, 0, + I40IW_RING_GETCURRENT_HEAD(cq->cq_ring)); + } else { + if (info->is_srq) + return ret_code; + qword3 &= ~I40IW_CQ_WQEIDX_MASK; + qword3 |= LS_64(pring->tail, I40IW_CQ_WQEIDX); + set_64bit_val(cqe, 24, qword3); + } + + return ret_code; +} + +/** + * i40iw_get_wqe_shift - get shift count for maximum wqe size + * @sge: Maximum Scatter Gather Elements wqe + * @inline_data: Maximum inline data size + * @shift: Returns the shift needed based on sge + * + * Shift can be used to left shift the wqe size based on number of SGEs and inlind data size. + * For 1 SGE or inline data <= 16, shift = 0 (wqe size of 32 bytes). + * For 2 or 3 SGEs or inline data <= 48, shift = 1 (wqe size of 64 bytes). + * Shift of 2 otherwise (wqe size of 128 bytes). + */ +void i40iw_get_wqe_shift(u32 sge, u32 inline_data, u8 *shift) +{ + *shift = 0; + if (sge > 1 || inline_data > 16) + *shift = (sge < 4 && inline_data <= 48) ? 
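/*
 * Editor's note: worked example, not part of the original patch; the
 * numeric values are hypothetical. Putting i40iw_get_wqe_shift() together
 * with the depth helpers that follow, for a QP created with 3 SQ
 * fragments, 48 bytes of inline data and a requested sq_size of 100:
 *
 *   u8  shift;
 *   u32 sqdepth;
 *
 *   i40iw_get_wqe_shift(3, 48, &shift);
 *           2 or 3 SGEs with at most 48 inline bytes gives shift = 1,
 *           i.e. 64-byte WQEs
 *   i40iw_get_sqdepth(100, shift, &sqdepth);
 *           (100 << 1) + I40IW_SQ_RSVD rounded up to a power of two,
 *           so 256 quanta assuming I40IW_SQ_RSVD stays below 56
 *
 * The depth is expressed in 32-byte quanta, raised to at least
 * I40IW_QP_SW_MIN_WQSIZE << shift and rejected above
 * I40IW_QP_SW_MAX_SQ_QUANTAS.
 */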
1 : 2; +} + +/* + * i40iw_get_sqdepth - get SQ depth (quantas) + * @sq_size: SQ size + * @shift: shift which determines size of WQE + * @sqdepth: depth of SQ + * + */ +enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth) +{ + *sqdepth = roundup_pow_of_two((sq_size << shift) + I40IW_SQ_RSVD); + + if (*sqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift)) + *sqdepth = I40IW_QP_SW_MIN_WQSIZE << shift; + else if (*sqdepth > I40IW_QP_SW_MAX_SQ_QUANTAS) + return I40IW_ERR_INVALID_SIZE; + + return 0; +} + +/* + * i40iw_get_rq_depth - get RQ depth (quantas) + * @rq_size: RQ size + * @shift: shift which determines size of WQE + * @rqdepth: depth of RQ + * + */ +enum i40iw_status_code i40iw_get_rqdepth(u32 rq_size, u8 shift, u32 *rqdepth) +{ + *rqdepth = roundup_pow_of_two((rq_size << shift) + I40IW_RQ_RSVD); + + if (*rqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift)) + *rqdepth = I40IW_QP_SW_MIN_WQSIZE << shift; + else if (*rqdepth > I40IW_QP_SW_MAX_RQ_QUANTAS) + return I40IW_ERR_INVALID_SIZE; + + return 0; +} + +static const struct i40iw_qp_uk_ops iw_qp_uk_ops = { + .iw_qp_post_wr = i40iw_qp_post_wr, + .iw_qp_ring_push_db = i40iw_qp_ring_push_db, + .iw_rdma_write = i40iw_rdma_write, + .iw_rdma_read = i40iw_rdma_read, + .iw_send = i40iw_send, + .iw_inline_rdma_write = i40iw_inline_rdma_write, + .iw_inline_send = i40iw_inline_send, + .iw_stag_local_invalidate = i40iw_stag_local_invalidate, + .iw_mw_bind = i40iw_mw_bind, + .iw_post_receive = i40iw_post_receive, + .iw_post_nop = i40iw_nop +}; + +static const struct i40iw_cq_ops iw_cq_ops = { + .iw_cq_request_notification = i40iw_cq_request_notification, + .iw_cq_poll_completion = i40iw_cq_poll_completion, + .iw_cq_post_entries = i40iw_cq_post_entries, + .iw_cq_clean = i40iw_clean_cq +}; + +static const struct i40iw_device_uk_ops iw_device_uk_ops = { + .iwarp_cq_uk_init = i40iw_cq_uk_init, + .iwarp_qp_uk_init = i40iw_qp_uk_init, +}; + +/** + * i40iw_qp_uk_init - initialize shared qp + * @qp: hw qp (user and kernel) + * @info: qp initialization info + * + * initializes the vars used in both user and kernel mode. + * size of the wqe depends on numbers of max. fragements + * allowed. Then size of wqe * the number of wqes should be the + * amount of memory allocated for sq and rq. 
If srq is used, + * then rq_base will point to one rq wqe only (not the whole + * array of wqes) + */ +enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp, + struct i40iw_qp_uk_init_info *info) +{ + enum i40iw_status_code ret_code = 0; + u32 sq_ring_size; + u8 sqshift, rqshift; + + if (info->max_sq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT) + return I40IW_ERR_INVALID_FRAG_COUNT; + + if (info->max_rq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT) + return I40IW_ERR_INVALID_FRAG_COUNT; + i40iw_get_wqe_shift(info->max_sq_frag_cnt, info->max_inline_data, &sqshift); + + qp->sq_base = info->sq; + qp->rq_base = info->rq; + qp->shadow_area = info->shadow_area; + qp->sq_wrtrk_array = info->sq_wrtrk_array; + qp->rq_wrid_array = info->rq_wrid_array; + + qp->wqe_alloc_reg = info->wqe_alloc_reg; + qp->qp_id = info->qp_id; + + qp->sq_size = info->sq_size; + qp->push_db = info->push_db; + qp->push_wqe = info->push_wqe; + + qp->max_sq_frag_cnt = info->max_sq_frag_cnt; + sq_ring_size = qp->sq_size << sqshift; + + I40IW_RING_INIT(qp->sq_ring, sq_ring_size); + I40IW_RING_INIT(qp->initial_ring, sq_ring_size); + I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code); + I40IW_RING_MOVE_TAIL(qp->sq_ring); + I40IW_RING_MOVE_HEAD(qp->initial_ring, ret_code); + qp->swqe_polarity = 1; + qp->first_sq_wq = true; + qp->swqe_polarity_deferred = 1; + qp->rwqe_polarity = 0; + + if (!qp->use_srq) { + qp->rq_size = info->rq_size; + qp->max_rq_frag_cnt = info->max_rq_frag_cnt; + I40IW_RING_INIT(qp->rq_ring, qp->rq_size); + switch (info->abi_ver) { + case 4: + i40iw_get_wqe_shift(info->max_rq_frag_cnt, 0, &rqshift); + break; + case 5: /* fallthrough until next ABI version */ + default: + rqshift = I40IW_MAX_RQ_WQE_SHIFT; + break; + } + qp->rq_wqe_size = rqshift; + qp->rq_wqe_size_multiplier = 4 << rqshift; + } + qp->ops = iw_qp_uk_ops; + + return ret_code; +} + +/** + * i40iw_cq_uk_init - initialize shared cq (user and kernel) + * @cq: hw cq + * @info: hw cq initialization info + */ +enum i40iw_status_code i40iw_cq_uk_init(struct i40iw_cq_uk *cq, + struct i40iw_cq_uk_init_info *info) +{ + if ((info->cq_size < I40IW_MIN_CQ_SIZE) || + (info->cq_size > I40IW_MAX_CQ_SIZE)) + return I40IW_ERR_INVALID_SIZE; + cq->cq_base = (struct i40iw_cqe *)info->cq_base; + cq->cq_id = info->cq_id; + cq->cq_size = info->cq_size; + cq->cqe_alloc_reg = info->cqe_alloc_reg; + cq->shadow_area = info->shadow_area; + cq->avoid_mem_cflct = info->avoid_mem_cflct; + + I40IW_RING_INIT(cq->cq_ring, cq->cq_size); + cq->polarity = 1; + cq->ops = iw_cq_ops; + + return 0; +} + +/** + * i40iw_device_init_uk - setup routines for iwarp shared device + * @dev: iwarp shared (user and kernel) + */ +void i40iw_device_init_uk(struct i40iw_dev_uk *dev) +{ + dev->ops_uk = iw_device_uk_ops; +} + +/** + * i40iw_clean_cq - clean cq entries + * @ queue completion context + * @cq: cq to clean + */ +void i40iw_clean_cq(void *queue, struct i40iw_cq_uk *cq) +{ + u64 *cqe; + u64 qword3, comp_ctx; + u32 cq_head; + u8 polarity, temp; + + cq_head = cq->cq_ring.head; + temp = cq->polarity; + do { + if (cq->avoid_mem_cflct) + cqe = (u64 *)&(((struct i40iw_extended_cqe *)cq->cq_base)[cq_head]); + else + cqe = (u64 *)&cq->cq_base[cq_head]; + get_64bit_val(cqe, 24, &qword3); + polarity = (u8)RS_64(qword3, I40IW_CQ_VALID); + + if (polarity != temp) + break; + + get_64bit_val(cqe, 8, &comp_ctx); + if ((void *)(unsigned long)comp_ctx == queue) + set_64bit_val(cqe, 8, 0); + + cq_head = (cq_head + 1) % cq->cq_ring.size; + if (!cq_head) + temp ^= 1; + } while (true); +} + +/** + * i40iw_nop - send a nop 
+ * @qp: hw qp ptr + * @wr_id: work request id + * @signaled: flag if signaled for completion + * @post_sq: flag to post sq + */ +enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, + u64 wr_id, + bool signaled, + bool post_sq) +{ + u64 header, *wqe; + u32 wqe_idx; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + set_64bit_val(wqe, 0, 0); + set_64bit_val(wqe, 8, 0); + set_64bit_val(wqe, 16, 0); + + header = LS_64(I40IWQP_OP_NOP, I40IWQPSQ_OPCODE) | + LS_64(signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, header); + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_fragcnt_to_wqesize_sq - calculate wqe size based on fragment count for SQ + * @frag_cnt: number of fragments + * @wqe_size: size of sq wqe returned + */ +enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size) +{ + switch (frag_cnt) { + case 0: + case 1: + *wqe_size = I40IW_QP_WQE_MIN_SIZE; + break; + case 2: + case 3: + *wqe_size = 64; + break; + case 4: + case 5: + *wqe_size = 96; + break; + case 6: + case 7: + *wqe_size = 128; + break; + default: + return I40IW_ERR_INVALID_FRAG_COUNT; + } + + return 0; +} + +/** + * i40iw_fragcnt_to_wqesize_rq - calculate wqe size based on fragment count for RQ + * @frag_cnt: number of fragments + * @wqe_size: size of rq wqe returned + */ +enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size) +{ + switch (frag_cnt) { + case 0: + case 1: + *wqe_size = 32; + break; + case 2: + case 3: + *wqe_size = 64; + break; + case 4: + case 5: + case 6: + case 7: + *wqe_size = 128; + break; + default: + return I40IW_ERR_INVALID_FRAG_COUNT; + } + + return 0; +} + +/** + * i40iw_inline_data_size_to_wqesize - based on inline data, wqe size + * @data_size: data size for inline + * @wqe_size: size of sq wqe returned + */ +enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size, + u8 *wqe_size) +{ + if (data_size > I40IW_MAX_INLINE_DATA_SIZE) + return I40IW_ERR_INVALID_INLINE_DATA_SIZE; + + if (data_size <= 16) + *wqe_size = I40IW_QP_WQE_MIN_SIZE; + else + *wqe_size = 64; + + return 0; +} diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h new file mode 100644 index 0000000..b125925 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_user.h @@ -0,0 +1,430 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_USER_H +#define I40IW_USER_H + +enum i40iw_device_capabilities_const { + I40IW_WQE_SIZE = 4, + I40IW_CQP_WQE_SIZE = 8, + I40IW_CQE_SIZE = 4, + I40IW_EXTENDED_CQE_SIZE = 8, + I40IW_AEQE_SIZE = 2, + I40IW_CEQE_SIZE = 1, + I40IW_CQP_CTX_SIZE = 8, + I40IW_SHADOW_AREA_SIZE = 8, + I40IW_CEQ_MAX_COUNT = 256, + I40IW_QUERY_FPM_BUF_SIZE = 128, + I40IW_COMMIT_FPM_BUF_SIZE = 128, + I40IW_MIN_IW_QP_ID = 1, + I40IW_MAX_IW_QP_ID = 262143, + I40IW_MIN_CEQID = 0, + I40IW_MAX_CEQID = 256, + I40IW_MIN_CQID = 0, + I40IW_MAX_CQID = 131071, + I40IW_MIN_AEQ_ENTRIES = 1, + I40IW_MAX_AEQ_ENTRIES = 524287, + I40IW_MIN_CEQ_ENTRIES = 1, + I40IW_MAX_CEQ_ENTRIES = 131071, + I40IW_MIN_CQ_SIZE = 1, + I40IW_MAX_CQ_SIZE = 1048575, + I40IW_DB_ID_ZERO = 0, + I40IW_MAX_WQ_FRAGMENT_COUNT = 3, + I40IW_MAX_SGE_RD = 1, + I40IW_MAX_OUTBOUND_MESSAGE_SIZE = 2147483647, + I40IW_MAX_INBOUND_MESSAGE_SIZE = 2147483647, + I40IW_MAX_PUSH_PAGE_COUNT = 4096, + I40IW_MAX_PE_ENABLED_VF_COUNT = 32, + I40IW_MAX_VF_FPM_ID = 47, + I40IW_MAX_VF_PER_PF = 127, + I40IW_MAX_SQ_PAYLOAD_SIZE = 2145386496, + I40IW_MAX_INLINE_DATA_SIZE = 48, + I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE = 48, + I40IW_MAX_IRD_SIZE = 64, + I40IW_MAX_ORD_SIZE = 127, + I40IW_MAX_WQ_ENTRIES = 2048, + I40IW_Q2_BUFFER_SIZE = (248 + 100), + I40IW_MAX_WQE_SIZE_RQ = 128, + I40IW_QP_CTX_SIZE = 248, + I40IW_MAX_PDS = 32768 +}; + +#define i40iw_handle void * +#define i40iw_adapter_handle i40iw_handle +#define i40iw_qp_handle i40iw_handle +#define i40iw_cq_handle i40iw_handle +#define i40iw_srq_handle i40iw_handle +#define i40iw_pd_id i40iw_handle +#define i40iw_stag_handle i40iw_handle +#define i40iw_stag_index u32 +#define i40iw_stag u32 +#define i40iw_stag_key u8 + +#define i40iw_tagged_offset u64 +#define i40iw_access_privileges u32 +#define i40iw_physical_fragment u64 +#define i40iw_address_list u64 * + +#define I40IW_MAX_MR_SIZE 0x10000000000L +#define I40IW_MAX_RQ_WQE_SHIFT 2 + +struct i40iw_qp_uk; +struct i40iw_cq_uk; +struct i40iw_srq_uk; +struct i40iw_qp_uk_init_info; +struct i40iw_cq_uk_init_info; +struct i40iw_srq_uk_init_info; + +struct i40iw_sge { + i40iw_tagged_offset tag_off; + u32 len; + i40iw_stag stag; +}; + +#define i40iw_sgl struct i40iw_sge * + +struct i40iw_ring { + u32 head; + u32 tail; + u32 size; +}; + +struct i40iw_cqe { + u64 buf[I40IW_CQE_SIZE]; +}; + +struct i40iw_extended_cqe { + u64 buf[I40IW_EXTENDED_CQE_SIZE]; +}; + +struct i40iw_wqe { + u64 buf[I40IW_WQE_SIZE]; +}; + +struct i40iw_qp_uk_ops; + +enum i40iw_addressing_type { + I40IW_ADDR_TYPE_ZERO_BASED = 0, + I40IW_ADDR_TYPE_VA_BASED = 1, +}; + +#define I40IW_ACCESS_FLAGS_LOCALREAD 0x01 +#define I40IW_ACCESS_FLAGS_LOCALWRITE 0x02 +#define I40IW_ACCESS_FLAGS_REMOTEREAD_ONLY 0x04 +#define I40IW_ACCESS_FLAGS_REMOTEREAD 0x05 +#define I40IW_ACCESS_FLAGS_REMOTEWRITE_ONLY 0x08 +#define I40IW_ACCESS_FLAGS_REMOTEWRITE 0x0a +#define I40IW_ACCESS_FLAGS_BIND_WINDOW 0x10 +#define I40IW_ACCESS_FLAGS_ALL 0x1F + +#define I40IW_OP_TYPE_RDMA_WRITE 
0 +#define I40IW_OP_TYPE_RDMA_READ 1 +#define I40IW_OP_TYPE_SEND 3 +#define I40IW_OP_TYPE_SEND_INV 4 +#define I40IW_OP_TYPE_SEND_SOL 5 +#define I40IW_OP_TYPE_SEND_SOL_INV 6 +#define I40IW_OP_TYPE_REC 7 +#define I40IW_OP_TYPE_BIND_MW 8 +#define I40IW_OP_TYPE_FAST_REG_NSMR 9 +#define I40IW_OP_TYPE_INV_STAG 10 +#define I40IW_OP_TYPE_RDMA_READ_INV_STAG 11 +#define I40IW_OP_TYPE_NOP 12 + +enum i40iw_completion_status { + I40IW_COMPL_STATUS_SUCCESS = 0, + I40IW_COMPL_STATUS_FLUSHED, + I40IW_COMPL_STATUS_INVALID_WQE, + I40IW_COMPL_STATUS_QP_CATASTROPHIC, + I40IW_COMPL_STATUS_REMOTE_TERMINATION, + I40IW_COMPL_STATUS_INVALID_STAG, + I40IW_COMPL_STATUS_BASE_BOUND_VIOLATION, + I40IW_COMPL_STATUS_ACCESS_VIOLATION, + I40IW_COMPL_STATUS_INVALID_PD_ID, + I40IW_COMPL_STATUS_WRAP_ERROR, + I40IW_COMPL_STATUS_STAG_INVALID_PDID, + I40IW_COMPL_STATUS_RDMA_READ_ZERO_ORD, + I40IW_COMPL_STATUS_QP_NOT_PRIVLEDGED, + I40IW_COMPL_STATUS_STAG_NOT_INVALID, + I40IW_COMPL_STATUS_INVALID_PHYS_BUFFER_SIZE, + I40IW_COMPL_STATUS_INVALID_PHYS_BUFFER_ENTRY, + I40IW_COMPL_STATUS_INVALID_FBO, + I40IW_COMPL_STATUS_INVALID_LENGTH, + I40IW_COMPL_STATUS_INVALID_ACCESS, + I40IW_COMPL_STATUS_PHYS_BUFFER_LIST_TOO_LONG, + I40IW_COMPL_STATUS_INVALID_VIRT_ADDRESS, + I40IW_COMPL_STATUS_INVALID_REGION, + I40IW_COMPL_STATUS_INVALID_WINDOW, + I40IW_COMPL_STATUS_INVALID_TOTAL_LENGTH +}; + +enum i40iw_completion_notify { + IW_CQ_COMPL_EVENT = 0, + IW_CQ_COMPL_SOLICITED = 1 +}; + +struct i40iw_post_send { + i40iw_sgl sg_list; + u32 num_sges; +}; + +struct i40iw_post_inline_send { + void *data; + u32 len; +}; + +struct i40iw_rdma_write { + i40iw_sgl lo_sg_list; + u32 num_lo_sges; + struct i40iw_sge rem_addr; +}; + +struct i40iw_inline_rdma_write { + void *data; + u32 len; + struct i40iw_sge rem_addr; +}; + +struct i40iw_rdma_read { + struct i40iw_sge lo_addr; + struct i40iw_sge rem_addr; +}; + +struct i40iw_bind_window { + i40iw_stag mr_stag; + u64 bind_length; + void *va; + enum i40iw_addressing_type addressing_type; + bool enable_reads; + bool enable_writes; + i40iw_stag mw_stag; +}; + +struct i40iw_inv_local_stag { + i40iw_stag target_stag; +}; + +struct i40iw_post_sq_info { + u64 wr_id; + u8 op_type; + bool signaled; + bool read_fence; + bool local_fence; + bool inline_data; + bool defer_flag; + union { + struct i40iw_post_send send; + struct i40iw_rdma_write rdma_write; + struct i40iw_rdma_read rdma_read; + struct i40iw_rdma_read rdma_read_inv; + struct i40iw_bind_window bind_window; + struct i40iw_inv_local_stag inv_local_stag; + struct i40iw_inline_rdma_write inline_rdma_write; + struct i40iw_post_inline_send inline_send; + } op; +}; + +struct i40iw_post_rq_info { + u64 wr_id; + i40iw_sgl sg_list; + u32 num_sges; +}; + +struct i40iw_cq_poll_info { + u64 wr_id; + i40iw_qp_handle qp_handle; + u32 bytes_xfered; + u32 tcp_seq_num; + u32 qp_id; + i40iw_stag inv_stag; + enum i40iw_completion_status comp_status; + u16 major_err; + u16 minor_err; + u8 op_type; + bool stag_invalid_set; + bool push_dropped; + bool error; + bool is_srq; + bool solicited_event; +}; + +struct i40iw_qp_uk_ops { + void (*iw_qp_post_wr)(struct i40iw_qp_uk *); + void (*iw_qp_ring_push_db)(struct i40iw_qp_uk *, u32); + enum i40iw_status_code (*iw_rdma_write)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, bool); + enum i40iw_status_code (*iw_rdma_read)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, bool, bool); + enum i40iw_status_code (*iw_send)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, u32, bool); + enum i40iw_status_code 
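/*
 * Editor's note: illustrative sketch, not part of the original patch.
 * A kernel caller describes a work request to this user/kernel shared
 * layer by filling the structures declared above and going through the
 * QP ops table; the local_va, local_lkey, remote_va, rkey and wr_id
 * values below are hypothetical:
 *
 *   struct i40iw_sge sge = {
 *           .tag_off = local_va,
 *           .len     = 4096,
 *           .stag    = local_lkey,
 *   };
 *   struct i40iw_post_sq_info info = {
 *           .wr_id    = wr_id,
 *           .op_type  = I40IW_OP_TYPE_RDMA_WRITE,
 *           .signaled = true,
 *   };
 *
 *   info.op.rdma_write.lo_sg_list       = &sge;
 *   info.op.rdma_write.num_lo_sges      = 1;
 *   info.op.rdma_write.rem_addr.tag_off = remote_va;
 *   info.op.rdma_write.rem_addr.stag    = rkey;
 *
 *   ret = qp->ops.iw_rdma_write(qp, &info, true);
 *
 * qp here is a struct i40iw_qp_uk set up by i40iw_qp_uk_init(), and the
 * final bool asks the library to ring the doorbell right away through
 * i40iw_qp_post_wr().
 */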
(*iw_inline_rdma_write)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, bool); + enum i40iw_status_code (*iw_inline_send)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, u32, bool); + enum i40iw_status_code (*iw_stag_local_invalidate)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, bool); + enum i40iw_status_code (*iw_mw_bind)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, bool); + enum i40iw_status_code (*iw_post_receive)(struct i40iw_qp_uk *, + struct i40iw_post_rq_info *); + enum i40iw_status_code (*iw_post_nop)(struct i40iw_qp_uk *, u64, bool, bool); +}; + +struct i40iw_cq_ops { + void (*iw_cq_request_notification)(struct i40iw_cq_uk *, + enum i40iw_completion_notify); + enum i40iw_status_code (*iw_cq_poll_completion)(struct i40iw_cq_uk *, + struct i40iw_cq_poll_info *); + enum i40iw_status_code (*iw_cq_post_entries)(struct i40iw_cq_uk *, u8 count); + void (*iw_cq_clean)(void *, struct i40iw_cq_uk *); +}; + +struct i40iw_dev_uk; + +struct i40iw_device_uk_ops { + enum i40iw_status_code (*iwarp_cq_uk_init)(struct i40iw_cq_uk *, + struct i40iw_cq_uk_init_info *); + enum i40iw_status_code (*iwarp_qp_uk_init)(struct i40iw_qp_uk *, + struct i40iw_qp_uk_init_info *); +}; + +struct i40iw_dev_uk { + struct i40iw_device_uk_ops ops_uk; +}; + +struct i40iw_sq_uk_wr_trk_info { + u64 wrid; + u32 wr_len; + u8 wqe_size; + u8 reserved[3]; +}; + +struct i40iw_qp_quanta { + u64 elem[I40IW_WQE_SIZE]; +}; + +struct i40iw_qp_uk { + struct i40iw_qp_quanta *sq_base; + struct i40iw_qp_quanta *rq_base; + u32 __iomem *wqe_alloc_reg; + struct i40iw_sq_uk_wr_trk_info *sq_wrtrk_array; + u64 *rq_wrid_array; + u64 *shadow_area; + u32 *push_db; + u64 *push_wqe; + struct i40iw_ring sq_ring; + struct i40iw_ring rq_ring; + struct i40iw_ring initial_ring; + u32 qp_id; + u32 sq_size; + u32 rq_size; + u32 max_sq_frag_cnt; + u32 max_rq_frag_cnt; + struct i40iw_qp_uk_ops ops; + bool use_srq; + u8 swqe_polarity; + u8 swqe_polarity_deferred; + u8 rwqe_polarity; + u8 rq_wqe_size; + u8 rq_wqe_size_multiplier; + bool first_sq_wq; + bool deferred_flag; +}; + +struct i40iw_cq_uk { + struct i40iw_cqe *cq_base; + u32 __iomem *cqe_alloc_reg; + u64 *shadow_area; + u32 cq_id; + u32 cq_size; + struct i40iw_ring cq_ring; + u8 polarity; + bool avoid_mem_cflct; + + struct i40iw_cq_ops ops; +}; + +struct i40iw_qp_uk_init_info { + struct i40iw_qp_quanta *sq; + struct i40iw_qp_quanta *rq; + u32 __iomem *wqe_alloc_reg; + u64 *shadow_area; + struct i40iw_sq_uk_wr_trk_info *sq_wrtrk_array; + u64 *rq_wrid_array; + u32 *push_db; + u64 *push_wqe; + u32 qp_id; + u32 sq_size; + u32 rq_size; + u32 max_sq_frag_cnt; + u32 max_rq_frag_cnt; + u32 max_inline_data; + int abi_ver; +}; + +struct i40iw_cq_uk_init_info { + u32 __iomem *cqe_alloc_reg; + struct i40iw_cqe *cq_base; + u64 *shadow_area; + u32 cq_size; + u32 cq_id; + bool avoid_mem_cflct; +}; + +void i40iw_device_init_uk(struct i40iw_dev_uk *dev); + +void i40iw_qp_post_wr(struct i40iw_qp_uk *qp); +u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx, + u8 wqe_size, + u32 total_size, + u64 wr_id + ); +u64 *i40iw_qp_get_next_recv_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx); +u64 *i40iw_qp_get_next_srq_wqe(struct i40iw_srq_uk *srq, u32 *wqe_idx); + +enum i40iw_status_code i40iw_cq_uk_init(struct i40iw_cq_uk *cq, + struct i40iw_cq_uk_init_info *info); +enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp, + struct i40iw_qp_uk_init_info *info); + +void i40iw_clean_cq(void *queue, struct i40iw_cq_uk *cq); +enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, 
u64 wr_id, + bool signaled, bool post_sq); +enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size); +enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size); +enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size, + u8 *wqe_size); +void i40iw_get_wqe_shift(u32 sge, u32 inline_data, u8 *shift); +enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth); +enum i40iw_status_code i40iw_get_rqdepth(u32 rq_size, u8 shift, u32 *rqdepth); +#endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c new file mode 100644 index 0000000..a9ea966 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -0,0 +1,1544 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "i40iw.h" + +/** + * i40iw_arp_table - manage arp table + * @iwdev: iwarp device + * @ip_addr: ip address for device + * @mac_addr: mac address ptr + * @action: modify, delete or add + */ +int i40iw_arp_table(struct i40iw_device *iwdev, + u32 *ip_addr, + bool ipv4, + u8 *mac_addr, + u32 action) +{ + int arp_index; + int err; + u32 ip[4]; + + if (ipv4) { + memset(ip, 0, sizeof(ip)); + ip[0] = *ip_addr; + } else { + memcpy(ip, ip_addr, sizeof(ip)); + } + + for (arp_index = 0; (u32)arp_index < iwdev->arp_table_size; arp_index++) + if (memcmp(iwdev->arp_table[arp_index].ip_addr, ip, sizeof(ip)) == 0) + break; + switch (action) { + case I40IW_ARP_ADD: + if (arp_index != iwdev->arp_table_size) + return -1; + + arp_index = 0; + err = i40iw_alloc_resource(iwdev, iwdev->allocated_arps, + iwdev->arp_table_size, + (u32 *)&arp_index, + &iwdev->next_arp_index); + + if (err) + return err; + + memcpy(iwdev->arp_table[arp_index].ip_addr, ip, sizeof(ip)); + ether_addr_copy(iwdev->arp_table[arp_index].mac_addr, mac_addr); + break; + case I40IW_ARP_RESOLVE: + if (arp_index == iwdev->arp_table_size) + return -1; + break; + case I40IW_ARP_DELETE: + if (arp_index == iwdev->arp_table_size) + return -1; + memset(iwdev->arp_table[arp_index].ip_addr, 0, + sizeof(iwdev->arp_table[arp_index].ip_addr)); + eth_zero_addr(iwdev->arp_table[arp_index].mac_addr); + i40iw_free_resource(iwdev, iwdev->allocated_arps, arp_index); + break; + default: + return -1; + } + return arp_index; +} + +/** + * i40iw_wr32 - write 32 bits to hw register + * @hw: hardware information including registers + * @reg: register offset + * @value: vvalue to write to register + */ +inline void i40iw_wr32(struct i40iw_hw *hw, u32 reg, u32 value) +{ + writel(value, hw->hw_addr + reg); +} + +/** + * i40iw_rd32 - read a 32 bit hw register + * @hw: hardware information including registers + * @reg: register offset + * + * Return value of register content + */ +inline u32 i40iw_rd32(struct i40iw_hw *hw, u32 reg) +{ + return readl(hw->hw_addr + reg); +} + +/** + * i40iw_inetaddr_event - system notifier for ipv4 addr events + * @notfier: not used + * @event: event for notifier + * @ptr: if address + */ +int i40iw_inetaddr_event(struct notifier_block *notifier, + unsigned long event, + void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct net_device *event_netdev = ifa->ifa_dev->dev; + struct net_device *netdev; + struct net_device *upper_dev; + struct i40iw_device *iwdev; + struct i40iw_handler *hdl; + u32 local_ipaddr; + u32 action = I40IW_ARP_ADD; + + hdl = i40iw_find_netdev(event_netdev); + if (!hdl) + return NOTIFY_DONE; + + iwdev = &hdl->device; + if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing) + return NOTIFY_DONE; + + netdev = iwdev->ldev->netdev; + upper_dev = netdev_master_upper_dev_get(netdev); + if (netdev != event_netdev) + return NOTIFY_DONE; + + if (upper_dev) { + struct in_device *in; + + rcu_read_lock(); + in = __in_dev_get_rcu(upper_dev); + local_ipaddr = ntohl(in->ifa_list->ifa_address); + rcu_read_unlock(); + } else { + local_ipaddr = ntohl(ifa->ifa_address); + } + switch (event) { + case NETDEV_DOWN: + action = I40IW_ARP_DELETE; + /* Fall through */ + case NETDEV_UP: + /* Fall through */ + case NETDEV_CHANGEADDR: + i40iw_manage_arp_cache(iwdev, + 
netdev->dev_addr, + &local_ipaddr, + true, + action); + i40iw_if_notify(iwdev, netdev, &local_ipaddr, true, + (action == I40IW_ARP_ADD) ? true : false); + break; + default: + break; + } + return NOTIFY_DONE; +} + +/** + * i40iw_inet6addr_event - system notifier for ipv6 addr events + * @notfier: not used + * @event: event for notifier + * @ptr: if address + */ +int i40iw_inet6addr_event(struct notifier_block *notifier, + unsigned long event, + void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + struct net_device *event_netdev = ifa->idev->dev; + struct net_device *netdev; + struct i40iw_device *iwdev; + struct i40iw_handler *hdl; + u32 local_ipaddr6[4]; + u32 action = I40IW_ARP_ADD; + + hdl = i40iw_find_netdev(event_netdev); + if (!hdl) + return NOTIFY_DONE; + + iwdev = &hdl->device; + if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing) + return NOTIFY_DONE; + + netdev = iwdev->ldev->netdev; + if (netdev != event_netdev) + return NOTIFY_DONE; + + i40iw_copy_ip_ntohl(local_ipaddr6, ifa->addr.in6_u.u6_addr32); + switch (event) { + case NETDEV_DOWN: + action = I40IW_ARP_DELETE; + /* Fall through */ + case NETDEV_UP: + /* Fall through */ + case NETDEV_CHANGEADDR: + i40iw_manage_arp_cache(iwdev, + netdev->dev_addr, + local_ipaddr6, + false, + action); + i40iw_if_notify(iwdev, netdev, local_ipaddr6, false, + (action == I40IW_ARP_ADD) ? true : false); + break; + default: + break; + } + return NOTIFY_DONE; +} + +/** + * i40iw_net_event - system notifier for netevents + * @notfier: not used + * @event: event for notifier + * @ptr: neighbor + */ +int i40iw_net_event(struct notifier_block *notifier, unsigned long event, void *ptr) +{ + struct neighbour *neigh = ptr; + struct i40iw_device *iwdev; + struct i40iw_handler *iwhdl; + __be32 *p; + u32 local_ipaddr[4]; + + switch (event) { + case NETEVENT_NEIGH_UPDATE: + iwhdl = i40iw_find_netdev((struct net_device *)neigh->dev); + if (!iwhdl) + return NOTIFY_DONE; + iwdev = &iwhdl->device; + if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing) + return NOTIFY_DONE; + p = (__be32 *)neigh->primary_key; + i40iw_copy_ip_ntohl(local_ipaddr, p); + if (neigh->nud_state & NUD_VALID) { + i40iw_manage_arp_cache(iwdev, + neigh->ha, + local_ipaddr, + false, + I40IW_ARP_ADD); + + } else { + i40iw_manage_arp_cache(iwdev, + neigh->ha, + local_ipaddr, + false, + I40IW_ARP_DELETE); + } + break; + default: + break; + } + return NOTIFY_DONE; +} + +/** + * i40iw_netdevice_event - system notifier for netdev events + * @notfier: not used + * @event: event for notifier + * @ptr: netdev + */ +int i40iw_netdevice_event(struct notifier_block *notifier, + unsigned long event, + void *ptr) +{ + struct net_device *event_netdev; + struct net_device *netdev; + struct i40iw_device *iwdev; + struct i40iw_handler *hdl; + + event_netdev = netdev_notifier_info_to_dev(ptr); + + hdl = i40iw_find_netdev(event_netdev); + if (!hdl) + return NOTIFY_DONE; + + iwdev = &hdl->device; + if (iwdev->init_state < RDMA_DEV_REGISTERED || iwdev->closing) + return NOTIFY_DONE; + + netdev = iwdev->ldev->netdev; + if (netdev != event_netdev) + return NOTIFY_DONE; + + iwdev->iw_status = 1; + + switch (event) { + case NETDEV_DOWN: + iwdev->iw_status = 0; + /* Fall through */ + case NETDEV_UP: + i40iw_port_ibevent(iwdev); + break; + default: + break; + } + return NOTIFY_DONE; +} + +/** + * i40iw_get_cqp_request - get cqp struct + * @cqp: device cqp ptr + * @wait: cqp to be used in wait mode + */ +struct i40iw_cqp_request *i40iw_get_cqp_request(struct i40iw_cqp *cqp, bool 
wait) +{ + struct i40iw_cqp_request *cqp_request = NULL; + unsigned long flags; + + spin_lock_irqsave(&cqp->req_lock, flags); + if (!list_empty(&cqp->cqp_avail_reqs)) { + cqp_request = list_entry(cqp->cqp_avail_reqs.next, + struct i40iw_cqp_request, list); + list_del_init(&cqp_request->list); + } + spin_unlock_irqrestore(&cqp->req_lock, flags); + if (!cqp_request) { + cqp_request = kzalloc(sizeof(*cqp_request), GFP_ATOMIC); + if (cqp_request) { + cqp_request->dynamic = true; + INIT_LIST_HEAD(&cqp_request->list); + init_waitqueue_head(&cqp_request->waitq); + } + } + if (!cqp_request) { + i40iw_pr_err("CQP Request Fail: No Memory"); + return NULL; + } + + if (wait) { + atomic_set(&cqp_request->refcount, 2); + cqp_request->waiting = true; + } else { + atomic_set(&cqp_request->refcount, 1); + } + return cqp_request; +} + +/** + * i40iw_free_cqp_request - free cqp request + * @cqp: cqp ptr + * @cqp_request: to be put back in cqp list + */ +void i40iw_free_cqp_request(struct i40iw_cqp *cqp, struct i40iw_cqp_request *cqp_request) +{ + struct i40iw_device *iwdev = container_of(cqp, struct i40iw_device, cqp); + unsigned long flags; + + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + cqp_request->request_done = false; + cqp_request->callback_fcn = NULL; + cqp_request->waiting = false; + + spin_lock_irqsave(&cqp->req_lock, flags); + list_add_tail(&cqp_request->list, &cqp->cqp_avail_reqs); + spin_unlock_irqrestore(&cqp->req_lock, flags); + } + wake_up(&iwdev->close_wq); +} + +/** + * i40iw_put_cqp_request - dec ref count and free if 0 + * @cqp: cqp ptr + * @cqp_request: to be put back in cqp list + */ +void i40iw_put_cqp_request(struct i40iw_cqp *cqp, + struct i40iw_cqp_request *cqp_request) +{ + if (atomic_dec_and_test(&cqp_request->refcount)) + i40iw_free_cqp_request(cqp, cqp_request); +} + +/** + * i40iw_free_pending_cqp_request -free pending cqp request objs + * @cqp: cqp ptr + * @cqp_request: to be put back in cqp list + */ +static void i40iw_free_pending_cqp_request(struct i40iw_cqp *cqp, + struct i40iw_cqp_request *cqp_request) +{ + struct i40iw_device *iwdev = container_of(cqp, struct i40iw_device, cqp); + + if (cqp_request->waiting) { + cqp_request->compl_info.error = true; + cqp_request->request_done = true; + wake_up(&cqp_request->waitq); + } + i40iw_put_cqp_request(cqp, cqp_request); + wait_event_timeout(iwdev->close_wq, + !atomic_read(&cqp_request->refcount), + 1000); +} + +/** + * i40iw_cleanup_pending_cqp_op - clean-up cqp with no completions + * @iwdev: iwarp device + */ +void i40iw_cleanup_pending_cqp_op(struct i40iw_device *iwdev) +{ + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_cqp *cqp = &iwdev->cqp; + struct i40iw_cqp_request *cqp_request = NULL; + struct cqp_commands_info *pcmdinfo = NULL; + u32 i, pending_work, wqe_idx; + + pending_work = I40IW_RING_WORK_AVAILABLE(cqp->sc_cqp.sq_ring); + wqe_idx = I40IW_RING_GETCURRENT_TAIL(cqp->sc_cqp.sq_ring); + for (i = 0; i < pending_work; i++) { + cqp_request = (struct i40iw_cqp_request *)(unsigned long)cqp->scratch_array[wqe_idx]; + if (cqp_request) + i40iw_free_pending_cqp_request(cqp, cqp_request); + wqe_idx = (wqe_idx + 1) % I40IW_RING_GETSIZE(cqp->sc_cqp.sq_ring); + } + + while (!list_empty(&dev->cqp_cmd_head)) { + pcmdinfo = (struct cqp_commands_info *)i40iw_remove_head(&dev->cqp_cmd_head); + cqp_request = container_of(pcmdinfo, struct i40iw_cqp_request, info); + if (cqp_request) + i40iw_free_pending_cqp_request(cqp, cqp_request); + } +} + +/** + * i40iw_free_qp - callback after destroy cqp completes + * 
@cqp_request: cqp request for destroy qp + * @num: not used + */ +static void i40iw_free_qp(struct i40iw_cqp_request *cqp_request, u32 num) +{ + struct i40iw_sc_qp *qp = (struct i40iw_sc_qp *)cqp_request->param; + struct i40iw_qp *iwqp = (struct i40iw_qp *)qp->back_qp; + struct i40iw_device *iwdev; + u32 qp_num = iwqp->ibqp.qp_num; + + iwdev = iwqp->iwdev; + + i40iw_rem_pdusecount(iwqp->iwpd, iwdev); + i40iw_free_qp_resources(iwdev, iwqp, qp_num); + i40iw_rem_devusecount(iwdev); +} + +/** + * i40iw_wait_event - wait for completion + * @iwdev: iwarp device + * @cqp_request: cqp request to wait + */ +static int i40iw_wait_event(struct i40iw_device *iwdev, + struct i40iw_cqp_request *cqp_request) +{ + struct cqp_commands_info *info = &cqp_request->info; + struct i40iw_cqp *iwcqp = &iwdev->cqp; + struct i40iw_cqp_timeout cqp_timeout; + bool cqp_error = false; + int err_code = 0; + memset(&cqp_timeout, 0, sizeof(cqp_timeout)); + cqp_timeout.compl_cqp_cmds = iwdev->sc_dev.cqp_cmd_stats[OP_COMPLETED_COMMANDS]; + do { + if (wait_event_timeout(cqp_request->waitq, + cqp_request->request_done, CQP_COMPL_WAIT_TIME)) + break; + + i40iw_check_cqp_progress(&cqp_timeout, &iwdev->sc_dev); + + if (cqp_timeout.count < CQP_TIMEOUT_THRESHOLD) + continue; + + i40iw_pr_err("error cqp command 0x%x timed out", info->cqp_cmd); + err_code = -ETIME; + if (!iwdev->reset) { + iwdev->reset = true; + i40iw_request_reset(iwdev); + } + goto done; + } while (1); + cqp_error = cqp_request->compl_info.error; + if (cqp_error) { + i40iw_pr_err("error cqp command 0x%x completion maj = 0x%x min=0x%x\n", + info->cqp_cmd, cqp_request->compl_info.maj_err_code, + cqp_request->compl_info.min_err_code); + err_code = -EPROTO; + goto done; + } +done: + i40iw_put_cqp_request(iwcqp, cqp_request); + return err_code; +} + +/** + * i40iw_handle_cqp_op - process cqp command + * @iwdev: iwarp device + * @cqp_request: cqp request to process + */ +enum i40iw_status_code i40iw_handle_cqp_op(struct i40iw_device *iwdev, + struct i40iw_cqp_request + *cqp_request) +{ + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + enum i40iw_status_code status; + struct cqp_commands_info *info = &cqp_request->info; + int err_code = 0; + + if (iwdev->reset) { + i40iw_free_cqp_request(&iwdev->cqp, cqp_request); + return I40IW_ERR_CQP_COMPL_ERROR; + } + + status = i40iw_process_cqp_cmd(dev, info); + if (status) { + i40iw_pr_err("error cqp command 0x%x failed\n", info->cqp_cmd); + i40iw_free_cqp_request(&iwdev->cqp, cqp_request); + return status; + } + if (cqp_request->waiting) + err_code = i40iw_wait_event(iwdev, cqp_request); + if (err_code) + status = I40IW_ERR_CQP_COMPL_ERROR; + return status; +} + +/** + * i40iw_add_devusecount - add dev refcount + * @iwdev: dev for refcount + */ +void i40iw_add_devusecount(struct i40iw_device *iwdev) +{ + atomic64_inc(&iwdev->use_count); +} + +/** + * i40iw_rem_devusecount - decrement refcount for dev + * @iwdev: device + */ +void i40iw_rem_devusecount(struct i40iw_device *iwdev) +{ + if (!atomic64_dec_and_test(&iwdev->use_count)) + return; + wake_up(&iwdev->close_wq); +} + +/** + * i40iw_add_pdusecount - add pd refcount + * @iwpd: pd for refcount + */ +void i40iw_add_pdusecount(struct i40iw_pd *iwpd) +{ + atomic_inc(&iwpd->usecount); +} + +/** + * i40iw_rem_pdusecount - decrement refcount for pd and free if 0 + * @iwpd: pd for refcount + * @iwdev: iwarp device + */ +void i40iw_rem_pdusecount(struct i40iw_pd *iwpd, struct i40iw_device *iwdev) +{ + if (!atomic_dec_and_test(&iwpd->usecount)) + return; + i40iw_free_resource(iwdev, 
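/*
 * Editor's note: illustrative sketch, not part of the original patch.
 * The CQP helpers above are always used with the same lifecycle: get a
 * request (a waiting request starts with a refcount of 2 so both the
 * issuer and the completion side hold it), fill in its cqp_commands_info,
 * and pass it to i40iw_handle_cqp_op(), which submits the command and,
 * for waiting requests, sleeps in i40iw_wait_event() and drops the
 * issuer's reference when done. Reduced to its skeleton, the pattern used
 * by callers such as i40iw_cqp_sds_cmd() below is:
 *
 *   struct i40iw_cqp_request *cqp_request;
 *   struct cqp_commands_info *cqp_info;
 *
 *   cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
 *   if (!cqp_request)
 *           return I40IW_ERR_NO_MEMORY;
 *
 *   cqp_info = &cqp_request->info;
 *   cqp_info->cqp_cmd = OP_UPDATE_PE_SDS;
 *   cqp_info->post_sq = 1;
 *   ... fill cqp_info->in.u for the chosen command, including
 *       .scratch = (uintptr_t)cqp_request ...
 *
 *   status = i40iw_handle_cqp_op(iwdev, cqp_request);
 */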
iwdev->allocated_pds, iwpd->sc_pd.pd_id); + kfree(iwpd); +} + +/** + * i40iw_add_ref - add refcount for qp + * @ibqp: iwarp qp + */ +void i40iw_add_ref(struct ib_qp *ibqp) +{ + struct i40iw_qp *iwqp = (struct i40iw_qp *)ibqp; + + atomic_inc(&iwqp->refcount); +} + +/** + * i40iw_rem_ref - rem refcount for qp and free if 0 + * @ibqp: iwarp qp + */ +void i40iw_rem_ref(struct ib_qp *ibqp) +{ + struct i40iw_qp *iwqp; + enum i40iw_status_code status; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + struct i40iw_device *iwdev; + u32 qp_num; + unsigned long flags; + + iwqp = to_iwqp(ibqp); + iwdev = iwqp->iwdev; + spin_lock_irqsave(&iwdev->qptable_lock, flags); + if (!atomic_dec_and_test(&iwqp->refcount)) { + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); + return; + } + + qp_num = iwqp->ibqp.qp_num; + iwdev->qp_table[qp_num] = NULL; + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false); + if (!cqp_request) + return; + + cqp_request->callback_fcn = i40iw_free_qp; + cqp_request->param = (void *)&iwqp->sc_qp; + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = OP_QP_DESTROY; + cqp_info->post_sq = 1; + cqp_info->in.u.qp_destroy.qp = &iwqp->sc_qp; + cqp_info->in.u.qp_destroy.scratch = (uintptr_t)cqp_request; + cqp_info->in.u.qp_destroy.remove_hash_idx = true; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (!status) + return; + + i40iw_rem_pdusecount(iwqp->iwpd, iwdev); + i40iw_free_qp_resources(iwdev, iwqp, qp_num); + i40iw_rem_devusecount(iwdev); +} + +/** + * i40iw_get_qp - get qp address + * @device: iwarp device + * @qpn: qp number + */ +struct ib_qp *i40iw_get_qp(struct ib_device *device, int qpn) +{ + struct i40iw_device *iwdev = to_iwdev(device); + + if ((qpn < IW_FIRST_QPN) || (qpn >= iwdev->max_qp)) + return NULL; + + return &iwdev->qp_table[qpn]->ibqp; +} + +/** + * i40iw_debug_buf - print debug msg and buffer if mask set + * @dev: hardware control device structure + * @mask: mask to compare if to print debug buffer + * @desc: identifying string printed before the buffer + * @buf: points to buffer addr + * @size: size of buffer to print + */ +void i40iw_debug_buf(struct i40iw_sc_dev *dev, + enum i40iw_debug_flag mask, + char *desc, + u64 *buf, + u32 size) +{ + u32 i; + + if (!(dev->debug_mask & mask)) + return; + i40iw_debug(dev, mask, "%s\n", desc); + i40iw_debug(dev, mask, "starting address virt=%p phy=%llxh\n", buf, + (unsigned long long)virt_to_phys(buf)); + + for (i = 0; i < size; i += 8) + i40iw_debug(dev, mask, "index %03d val: %016llx\n", i, buf[i / 8]); +} + +/** + * i40iw_get_hw_addr - return hw addr + * @par: points to shared dev + */ +u8 __iomem *i40iw_get_hw_addr(void *par) +{ + struct i40iw_sc_dev *dev = (struct i40iw_sc_dev *)par; + + return dev->hw->hw_addr; +} + +/** + * i40iw_remove_head - return head entry and remove from list + * @list: list for entry + */ +void *i40iw_remove_head(struct list_head *list) +{ + struct list_head *entry; + + if (list_empty(list)) + return NULL; + + entry = (void *)list->next; + list_del(entry); + return (void *)entry; +} + +/** + * i40iw_allocate_dma_mem - Memory alloc helper fn + * @hw: pointer to the HW structure + * @mem: ptr to mem struct to fill out + * @size: size of memory requested + * @alignment: what to align the allocation to + */ +enum i40iw_status_code i40iw_allocate_dma_mem(struct i40iw_hw *hw, + struct i40iw_dma_mem *mem, + u64 size, + u32 alignment) +{ + struct pci_dev *pcidev = (struct pci_dev *)hw->dev_context; + + if (!mem) + return I40IW_ERR_PARAM; + mem->size = 
ALIGN(size, alignment); + mem->va = dma_zalloc_coherent(&pcidev->dev, mem->size, + (dma_addr_t *)&mem->pa, GFP_KERNEL); + if (!mem->va) + return I40IW_ERR_NO_MEMORY; + return 0; +} + +/** + * i40iw_free_dma_mem - Memory free helper fn + * @hw: pointer to the HW structure + * @mem: ptr to mem struct to free + */ +void i40iw_free_dma_mem(struct i40iw_hw *hw, struct i40iw_dma_mem *mem) +{ + struct pci_dev *pcidev = (struct pci_dev *)hw->dev_context; + + if (!mem || !mem->va) + return; + + dma_free_coherent(&pcidev->dev, mem->size, + mem->va, (dma_addr_t)mem->pa); + mem->va = NULL; +} + +/** + * i40iw_allocate_virt_mem - virtual memory alloc helper fn + * @hw: pointer to the HW structure + * @mem: ptr to mem struct to fill out + * @size: size of memory requested + */ +enum i40iw_status_code i40iw_allocate_virt_mem(struct i40iw_hw *hw, + struct i40iw_virt_mem *mem, + u32 size) +{ + if (!mem) + return I40IW_ERR_PARAM; + + mem->size = size; + mem->va = kzalloc(size, GFP_KERNEL); + + if (mem->va) + return 0; + else + return I40IW_ERR_NO_MEMORY; +} + +/** + * i40iw_free_virt_mem - virtual memory free helper fn + * @hw: pointer to the HW structure + * @mem: ptr to mem struct to free + */ +enum i40iw_status_code i40iw_free_virt_mem(struct i40iw_hw *hw, + struct i40iw_virt_mem *mem) +{ + if (!mem) + return I40IW_ERR_PARAM; + /* + * mem->va points to the parent of mem, so both mem and mem->va + * can not be touched once mem->va is freed + */ + kfree(mem->va); + return 0; +} + +/** + * i40iw_cqp_sds_cmd - create cqp command for sd + * @dev: hardware control device structure + * @sd_info: information for sd cqp + * + */ +enum i40iw_status_code i40iw_cqp_sds_cmd(struct i40iw_sc_dev *dev, + struct i40iw_update_sds_info *sdinfo) +{ + enum i40iw_status_code status; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) + return I40IW_ERR_NO_MEMORY; + cqp_info = &cqp_request->info; + memcpy(&cqp_info->in.u.update_pe_sds.info, sdinfo, + sizeof(cqp_info->in.u.update_pe_sds.info)); + cqp_info->cqp_cmd = OP_UPDATE_PE_SDS; + cqp_info->post_sq = 1; + cqp_info->in.u.update_pe_sds.dev = dev; + cqp_info->in.u.update_pe_sds.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Update SD's fail"); + return status; +} + +/** + * i40iw_qp_suspend_resume - cqp command for suspend/resume + * @dev: hardware control device structure + * @qp: hardware control qp + * @suspend: flag if suspend or resume + */ +void i40iw_qp_suspend_resume(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp, bool suspend) +{ + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + struct i40iw_cqp_request *cqp_request; + struct i40iw_sc_cqp *cqp = dev->cqp; + struct cqp_commands_info *cqp_info; + enum i40iw_status_code status; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = (suspend) ? 
OP_SUSPEND : OP_RESUME;
+	cqp_info->in.u.suspend_resume.cqp = cqp;
+	cqp_info->in.u.suspend_resume.qp = qp;
+	cqp_info->in.u.suspend_resume.scratch = (uintptr_t)cqp_request;
+	status = i40iw_handle_cqp_op(iwdev, cqp_request);
+	if (status)
+		i40iw_pr_err("CQP-OP QP Suspend/Resume fail");
+}
+
+/**
+ * i40iw_term_modify_qp - modify qp for term message
+ * @qp: hardware control qp
+ * @next_state: qp's next state
+ * @term: terminate code
+ * @term_len: length of terminate message
+ */
+void i40iw_term_modify_qp(struct i40iw_sc_qp *qp, u8 next_state, u8 term, u8 term_len)
+{
+	struct i40iw_qp *iwqp;
+
+	iwqp = (struct i40iw_qp *)qp->back_qp;
+	i40iw_next_iw_state(iwqp, next_state, 0, term, term_len);
+}
+
+/**
+ * i40iw_terminate_done - after terminate is completed
+ * @qp: hardware control qp
+ * @timeout_occurred: indicates if terminate timer expired
+ */
+void i40iw_terminate_done(struct i40iw_sc_qp *qp, int timeout_occurred)
+{
+	struct i40iw_qp *iwqp;
+	u32 next_iwarp_state = I40IW_QP_STATE_ERROR;
+	u8 hte = 0;
+	bool first_time;
+	unsigned long flags;
+
+	iwqp = (struct i40iw_qp *)qp->back_qp;
+	spin_lock_irqsave(&iwqp->lock, flags);
+	if (iwqp->hte_added) {
+		iwqp->hte_added = 0;
+		hte = 1;
+	}
+	first_time = !(qp->term_flags & I40IW_TERM_DONE);
+	qp->term_flags |= I40IW_TERM_DONE;
+	spin_unlock_irqrestore(&iwqp->lock, flags);
+	if (first_time) {
+		if (!timeout_occurred)
+			i40iw_terminate_del_timer(qp);
+		else
+			next_iwarp_state = I40IW_QP_STATE_CLOSING;
+
+		i40iw_next_iw_state(iwqp, next_iwarp_state, hte, 0, 0);
+		i40iw_cm_disconn(iwqp);
+	}
+}
+
+/**
+ * i40iw_terminate_timeout - timeout happened
+ * @t: timer_list pointer of the qp terminate timer
+ */
+static void i40iw_terminate_timeout(struct timer_list *t)
+{
+	struct i40iw_qp *iwqp = from_timer(iwqp, t, terminate_timer);
+	struct i40iw_sc_qp *qp = (struct i40iw_sc_qp *)&iwqp->sc_qp;
+
+	i40iw_terminate_done(qp, 1);
+	i40iw_rem_ref(&iwqp->ibqp);
+}
+
+/**
+ * i40iw_terminate_start_timer - start terminate timeout
+ * @qp: hardware control qp
+ */
+void i40iw_terminate_start_timer(struct i40iw_sc_qp *qp)
+{
+	struct i40iw_qp *iwqp;
+
+	iwqp = (struct i40iw_qp *)qp->back_qp;
+	i40iw_add_ref(&iwqp->ibqp);
+	timer_setup(&iwqp->terminate_timer, i40iw_terminate_timeout, 0);
+	iwqp->terminate_timer.expires = jiffies + HZ;
+	add_timer(&iwqp->terminate_timer);
+}
+
+/**
+ * i40iw_terminate_del_timer - delete terminate timeout
+ * @qp: hardware control qp
+ */
+void i40iw_terminate_del_timer(struct i40iw_sc_qp *qp)
+{
+	struct i40iw_qp *iwqp;
+
+	iwqp = (struct i40iw_qp *)qp->back_qp;
+	if (del_timer(&iwqp->terminate_timer))
+		i40iw_rem_ref(&iwqp->ibqp);
+}
+
+/**
+ * i40iw_cqp_generic_worker - generic worker for cqp
+ * @work: work pointer
+ */
+static void i40iw_cqp_generic_worker(struct work_struct *work)
+{
+	struct i40iw_virtchnl_work_info *work_info =
+	    &((struct virtchnl_work *)work)->work_info;
+
+	if (work_info->worker_vf_dev)
+		work_info->callback_fcn(work_info->worker_vf_dev);
+}
+
+/**
+ * i40iw_cqp_spawn_worker - spawn worker thread
+ * @dev: hardware control device structure
+ * @work_info: work request info
+ * @iw_vf_idx: virtual function index
+ */
+void i40iw_cqp_spawn_worker(struct i40iw_sc_dev *dev,
+			    struct i40iw_virtchnl_work_info *work_info,
+			    u32 iw_vf_idx)
+{
+	struct virtchnl_work *work;
+	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
+
+	work = &iwdev->virtchnl_w[iw_vf_idx];
+	memcpy(&work->work_info, work_info, sizeof(*work_info));
+	INIT_WORK(&work->work, i40iw_cqp_generic_worker);
+	queue_work(iwdev->virtchnl_wq, &work->work);
+}
+
+/**
+
* i40iw_cqp_manage_hmc_fcn_worker - + * @work: work pointer for hmc info + */ +static void i40iw_cqp_manage_hmc_fcn_worker(struct work_struct *work) +{ + struct i40iw_cqp_request *cqp_request = + ((struct virtchnl_work *)work)->cqp_request; + struct i40iw_ccq_cqe_info ccq_cqe_info; + struct i40iw_hmc_fcn_info *hmcfcninfo = + &cqp_request->info.in.u.manage_hmc_pm.info; + struct i40iw_device *iwdev = + (struct i40iw_device *)cqp_request->info.in.u.manage_hmc_pm.dev->back_dev; + + ccq_cqe_info.cqp = NULL; + ccq_cqe_info.maj_err_code = cqp_request->compl_info.maj_err_code; + ccq_cqe_info.min_err_code = cqp_request->compl_info.min_err_code; + ccq_cqe_info.op_code = cqp_request->compl_info.op_code; + ccq_cqe_info.op_ret_val = cqp_request->compl_info.op_ret_val; + ccq_cqe_info.scratch = 0; + ccq_cqe_info.error = cqp_request->compl_info.error; + hmcfcninfo->callback_fcn(cqp_request->info.in.u.manage_hmc_pm.dev, + hmcfcninfo->cqp_callback_param, &ccq_cqe_info); + i40iw_put_cqp_request(&iwdev->cqp, cqp_request); +} + +/** + * i40iw_cqp_manage_hmc_fcn_callback - called function after cqp completion + * @cqp_request: cqp request info struct for hmc fun + * @unused: unused param of callback + */ +static void i40iw_cqp_manage_hmc_fcn_callback(struct i40iw_cqp_request *cqp_request, + u32 unused) +{ + struct virtchnl_work *work; + struct i40iw_hmc_fcn_info *hmcfcninfo = + &cqp_request->info.in.u.manage_hmc_pm.info; + struct i40iw_device *iwdev = + (struct i40iw_device *)cqp_request->info.in.u.manage_hmc_pm.dev-> + back_dev; + + if (hmcfcninfo && hmcfcninfo->callback_fcn) { + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_HMC, "%s1\n", __func__); + atomic_inc(&cqp_request->refcount); + work = &iwdev->virtchnl_w[hmcfcninfo->iw_vf_idx]; + work->cqp_request = cqp_request; + INIT_WORK(&work->work, i40iw_cqp_manage_hmc_fcn_worker); + queue_work(iwdev->virtchnl_wq, &work->work); + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_HMC, "%s2\n", __func__); + } else { + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_HMC, "%s: Something wrong\n", __func__); + } +} + +/** + * i40iw_cqp_manage_hmc_fcn_cmd - issue cqp command to manage hmc + * @dev: hardware control device structure + * @hmcfcninfo: info for hmc + */ +enum i40iw_status_code i40iw_cqp_manage_hmc_fcn_cmd(struct i40iw_sc_dev *dev, + struct i40iw_hmc_fcn_info *hmcfcninfo) +{ + enum i40iw_status_code status; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + + i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_HMC, "%s\n", __func__); + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false); + if (!cqp_request) + return I40IW_ERR_NO_MEMORY; + cqp_info = &cqp_request->info; + cqp_request->callback_fcn = i40iw_cqp_manage_hmc_fcn_callback; + cqp_request->param = hmcfcninfo; + memcpy(&cqp_info->in.u.manage_hmc_pm.info, hmcfcninfo, + sizeof(*hmcfcninfo)); + cqp_info->in.u.manage_hmc_pm.dev = dev; + cqp_info->cqp_cmd = OP_MANAGE_HMC_PM_FUNC_TABLE; + cqp_info->post_sq = 1; + cqp_info->in.u.manage_hmc_pm.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Manage HMC fail"); + return status; +} + +/** + * i40iw_cqp_query_fpm_values_cmd - send cqp command for fpm + * @iwdev: function device struct + * @values_mem: buffer for fpm + * @hmc_fn_id: function id for fpm + */ +enum i40iw_status_code i40iw_cqp_query_fpm_values_cmd(struct i40iw_sc_dev *dev, + struct i40iw_dma_mem *values_mem, + u8 hmc_fn_id) +{ + enum i40iw_status_code status; 
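+	/*
+	 * Query FPM values is issued below as a waited CQP command: the
+	 * hardware returns the function's HMC resource counts in
+	 * values_mem, so the DMA buffer must remain valid until
+	 * i40iw_handle_cqp_op() completes.
+	 */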
+ struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) + return I40IW_ERR_NO_MEMORY; + cqp_info = &cqp_request->info; + cqp_request->param = NULL; + cqp_info->in.u.query_fpm_values.cqp = dev->cqp; + cqp_info->in.u.query_fpm_values.fpm_values_pa = values_mem->pa; + cqp_info->in.u.query_fpm_values.fpm_values_va = values_mem->va; + cqp_info->in.u.query_fpm_values.hmc_fn_id = hmc_fn_id; + cqp_info->cqp_cmd = OP_QUERY_FPM_VALUES; + cqp_info->post_sq = 1; + cqp_info->in.u.query_fpm_values.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Query FPM fail"); + return status; +} + +/** + * i40iw_cqp_commit_fpm_values_cmd - commit fpm values in hw + * @dev: hardware control device structure + * @values_mem: buffer with fpm values + * @hmc_fn_id: function id for fpm + */ +enum i40iw_status_code i40iw_cqp_commit_fpm_values_cmd(struct i40iw_sc_dev *dev, + struct i40iw_dma_mem *values_mem, + u8 hmc_fn_id) +{ + enum i40iw_status_code status; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) + return I40IW_ERR_NO_MEMORY; + cqp_info = &cqp_request->info; + cqp_request->param = NULL; + cqp_info->in.u.commit_fpm_values.cqp = dev->cqp; + cqp_info->in.u.commit_fpm_values.fpm_values_pa = values_mem->pa; + cqp_info->in.u.commit_fpm_values.fpm_values_va = values_mem->va; + cqp_info->in.u.commit_fpm_values.hmc_fn_id = hmc_fn_id; + cqp_info->cqp_cmd = OP_COMMIT_FPM_VALUES; + cqp_info->post_sq = 1; + cqp_info->in.u.commit_fpm_values.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Commit FPM fail"); + return status; +} + +/** + * i40iw_vf_wait_vchnl_resp - wait for channel msg + * @iwdev: function's device struct + */ +enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev) +{ + struct i40iw_device *iwdev = dev->back_dev; + int timeout_ret; + + i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s[%u] dev %p, iwdev %p\n", + __func__, __LINE__, dev, iwdev); + + atomic_set(&iwdev->vchnl_msgs, 2); + timeout_ret = wait_event_timeout(iwdev->vchnl_waitq, + (atomic_read(&iwdev->vchnl_msgs) == 1), + I40IW_VCHNL_EVENT_TIMEOUT); + atomic_dec(&iwdev->vchnl_msgs); + if (!timeout_ret) { + i40iw_pr_err("virt channel completion timeout = 0x%x\n", timeout_ret); + atomic_set(&iwdev->vchnl_msgs, 0); + dev->vchnl_up = false; + return I40IW_ERR_TIMEOUT; + } + wake_up(&dev->vf_reqs); + return 0; +} + +/** + * i40iw_cqp_cq_create_cmd - create a cq for the cqp + * @dev: device pointer + * @cq: pointer to created cq + */ +enum i40iw_status_code i40iw_cqp_cq_create_cmd(struct i40iw_sc_dev *dev, + struct i40iw_sc_cq *cq) +{ + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + struct i40iw_cqp *iwcqp = &iwdev->cqp; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + enum i40iw_status_code status; + + cqp_request = i40iw_get_cqp_request(iwcqp, true); + if (!cqp_request) + return I40IW_ERR_NO_MEMORY; + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = OP_CQ_CREATE; + cqp_info->post_sq = 1; + cqp_info->in.u.cq_create.cq = cq; + cqp_info->in.u.cq_create.scratch = (uintptr_t)cqp_request; + status = 
i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Create QP fail"); + + return status; +} + +/** + * i40iw_cqp_qp_create_cmd - create a qp for the cqp + * @dev: device pointer + * @qp: pointer to created qp + */ +enum i40iw_status_code i40iw_cqp_qp_create_cmd(struct i40iw_sc_dev *dev, + struct i40iw_sc_qp *qp) +{ + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + struct i40iw_cqp *iwcqp = &iwdev->cqp; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + struct i40iw_create_qp_info *qp_info; + enum i40iw_status_code status; + + cqp_request = i40iw_get_cqp_request(iwcqp, true); + if (!cqp_request) + return I40IW_ERR_NO_MEMORY; + + cqp_info = &cqp_request->info; + qp_info = &cqp_request->info.in.u.qp_create.info; + + memset(qp_info, 0, sizeof(*qp_info)); + + qp_info->cq_num_valid = true; + qp_info->next_iwarp_state = I40IW_QP_STATE_RTS; + + cqp_info->cqp_cmd = OP_QP_CREATE; + cqp_info->post_sq = 1; + cqp_info->in.u.qp_create.qp = qp; + cqp_info->in.u.qp_create.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP QP create fail"); + return status; +} + +/** + * i40iw_cqp_cq_destroy_cmd - destroy the cqp cq + * @dev: device pointer + * @cq: pointer to cq + */ +void i40iw_cqp_cq_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq) +{ + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + + i40iw_cq_wq_destroy(iwdev, cq); +} + +/** + * i40iw_cqp_qp_destroy_cmd - destroy the cqp + * @dev: device pointer + * @qp: pointer to qp + */ +void i40iw_cqp_qp_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp) +{ + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + struct i40iw_cqp *iwcqp = &iwdev->cqp; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + enum i40iw_status_code status; + + cqp_request = i40iw_get_cqp_request(iwcqp, true); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + memset(cqp_info, 0, sizeof(*cqp_info)); + + cqp_info->cqp_cmd = OP_QP_DESTROY; + cqp_info->post_sq = 1; + cqp_info->in.u.qp_destroy.qp = qp; + cqp_info->in.u.qp_destroy.scratch = (uintptr_t)cqp_request; + cqp_info->in.u.qp_destroy.remove_hash_idx = true; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP QP_DESTROY fail"); +} + + +/** + * i40iw_ieq_mpa_crc_ae - generate AE for crc error + * @dev: hardware control device structure + * @qp: hardware control qp + */ +void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp) +{ + struct i40iw_gen_ae_info info; + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + + i40iw_debug(dev, I40IW_DEBUG_AEQ, "%s entered\n", __func__); + info.ae_code = I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR; + info.ae_source = I40IW_AE_SOURCE_RQ; + i40iw_gen_ae(iwdev, qp, &info, false); +} + +/** + * i40iw_init_hash_desc - initialize hash for crc calculation + * @desc: cryption type + */ +enum i40iw_status_code i40iw_init_hash_desc(struct shash_desc **desc) +{ + struct crypto_shash *tfm; + struct shash_desc *tdesc; + + tfm = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(tfm)) + return I40IW_ERR_MPA_CRC; + + tdesc = kzalloc(sizeof(*tdesc) + crypto_shash_descsize(tfm), + GFP_KERNEL); + if (!tdesc) { + crypto_free_shash(tfm); + return I40IW_ERR_MPA_CRC; + } + tdesc->tfm = tfm; + *desc = tdesc; + + return 0; +} + +/** + * i40iw_free_hash_desc - free hash desc + * @desc: to be freed + */ +void 
i40iw_free_hash_desc(struct shash_desc *desc) +{ + if (desc) { + crypto_free_shash(desc->tfm); + kfree(desc); + } +} + +/** + * i40iw_alloc_query_fpm_buf - allocate buffer for fpm + * @dev: hardware control device structure + * @mem: buffer ptr for fpm to be allocated + * @return: memory allocation status + */ +enum i40iw_status_code i40iw_alloc_query_fpm_buf(struct i40iw_sc_dev *dev, + struct i40iw_dma_mem *mem) +{ + enum i40iw_status_code status; + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + + status = i40iw_obj_aligned_mem(iwdev, mem, I40IW_QUERY_FPM_BUF_SIZE, + I40IW_FPM_QUERY_BUF_ALIGNMENT_MASK); + return status; +} + +/** + * i40iw_ieq_check_mpacrc - check if mpa crc is OK + * @desc: desc for hash + * @addr: address of buffer for crc + * @length: length of buffer + * @value: value to be compared + */ +enum i40iw_status_code i40iw_ieq_check_mpacrc(struct shash_desc *desc, + void *addr, + u32 length, + u32 value) +{ + u32 crc = 0; + int ret; + enum i40iw_status_code ret_code = 0; + + crypto_shash_init(desc); + ret = crypto_shash_update(desc, addr, length); + if (!ret) + crypto_shash_final(desc, (u8 *)&crc); + if (crc != value) { + i40iw_pr_err("mpa crc check fail\n"); + ret_code = I40IW_ERR_MPA_CRC; + } + return ret_code; +} + +/** + * i40iw_ieq_get_qp - get qp based on quad in puda buffer + * @dev: hardware control device structure + * @buf: receive puda buffer on exception q + */ +struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev, + struct i40iw_puda_buf *buf) +{ + struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev; + struct i40iw_qp *iwqp; + struct i40iw_cm_node *cm_node; + u32 loc_addr[4], rem_addr[4]; + u16 loc_port, rem_port; + struct ipv6hdr *ip6h; + struct iphdr *iph = (struct iphdr *)buf->iph; + struct tcphdr *tcph = (struct tcphdr *)buf->tcph; + + if (iph->version == 4) { + memset(loc_addr, 0, sizeof(loc_addr)); + loc_addr[0] = ntohl(iph->daddr); + memset(rem_addr, 0, sizeof(rem_addr)); + rem_addr[0] = ntohl(iph->saddr); + } else { + ip6h = (struct ipv6hdr *)buf->iph; + i40iw_copy_ip_ntohl(loc_addr, ip6h->daddr.in6_u.u6_addr32); + i40iw_copy_ip_ntohl(rem_addr, ip6h->saddr.in6_u.u6_addr32); + } + loc_port = ntohs(tcph->dest); + rem_port = ntohs(tcph->source); + + cm_node = i40iw_find_node(&iwdev->cm_core, rem_port, rem_addr, loc_port, + loc_addr, false, true); + if (!cm_node) + return NULL; + iwqp = cm_node->iwqp; + return &iwqp->sc_qp; +} + +/** + * i40iw_ieq_update_tcpip_info - update tcpip in the buffer + * @buf: puda to update + * @length: length of buffer + * @seqnum: seq number for tcp + */ +void i40iw_ieq_update_tcpip_info(struct i40iw_puda_buf *buf, u16 length, u32 seqnum) +{ + struct tcphdr *tcph; + struct iphdr *iph; + u16 iphlen; + u16 packetsize; + u8 *addr = (u8 *)buf->mem.va; + + iphlen = (buf->ipv4) ? 
20 : 40; + iph = (struct iphdr *)(addr + buf->maclen); + tcph = (struct tcphdr *)(addr + buf->maclen + iphlen); + packetsize = length + buf->tcphlen + iphlen; + + iph->tot_len = htons(packetsize); + tcph->seq = htonl(seqnum); +} + +/** + * i40iw_puda_get_tcpip_info - get tcpip info from puda buffer + * @info: to get information + * @buf: puda buffer + */ +enum i40iw_status_code i40iw_puda_get_tcpip_info(struct i40iw_puda_completion_info *info, + struct i40iw_puda_buf *buf) +{ + struct iphdr *iph; + struct ipv6hdr *ip6h; + struct tcphdr *tcph; + u16 iphlen; + u16 pkt_len; + u8 *mem = (u8 *)buf->mem.va; + struct ethhdr *ethh = (struct ethhdr *)buf->mem.va; + + if (ethh->h_proto == htons(0x8100)) { + info->vlan_valid = true; + buf->vlan_id = ntohs(((struct vlan_ethhdr *)ethh)->h_vlan_TCI) & VLAN_VID_MASK; + } + buf->maclen = (info->vlan_valid) ? 18 : 14; + iphlen = (info->l3proto) ? 40 : 20; + buf->ipv4 = (info->l3proto) ? false : true; + buf->iph = mem + buf->maclen; + iph = (struct iphdr *)buf->iph; + + buf->tcph = buf->iph + iphlen; + tcph = (struct tcphdr *)buf->tcph; + + if (buf->ipv4) { + pkt_len = ntohs(iph->tot_len); + } else { + ip6h = (struct ipv6hdr *)buf->iph; + pkt_len = ntohs(ip6h->payload_len) + iphlen; + } + + buf->totallen = pkt_len + buf->maclen; + + if (info->payload_len < buf->totallen) { + i40iw_pr_err("payload_len = 0x%x totallen expected0x%x\n", + info->payload_len, buf->totallen); + return I40IW_ERR_INVALID_SIZE; + } + + buf->tcphlen = (tcph->doff) << 2; + buf->datalen = pkt_len - iphlen - buf->tcphlen; + buf->data = (buf->datalen) ? buf->tcph + buf->tcphlen : NULL; + buf->hdrlen = buf->maclen + iphlen + buf->tcphlen; + buf->seqnum = ntohl(tcph->seq); + return 0; +} + +/** + * i40iw_hw_stats_timeout - Stats timer-handler which updates all HW stats + * @vsi: pointer to the vsi structure + */ +static void i40iw_hw_stats_timeout(struct timer_list *t) +{ + struct i40iw_vsi_pestat *pf_devstat = from_timer(pf_devstat, t, + stats_timer); + struct i40iw_sc_vsi *sc_vsi = pf_devstat->vsi; + struct i40iw_sc_dev *pf_dev = sc_vsi->dev; + struct i40iw_vsi_pestat *vf_devstat = NULL; + u16 iw_vf_idx; + unsigned long flags; + + /*PF*/ + i40iw_hw_stats_read_all(pf_devstat, &pf_devstat->hw_stats); + + for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT; iw_vf_idx++) { + spin_lock_irqsave(&pf_devstat->lock, flags); + if (pf_dev->vf_dev[iw_vf_idx]) { + if (pf_dev->vf_dev[iw_vf_idx]->stats_initialized) { + vf_devstat = &pf_dev->vf_dev[iw_vf_idx]->pestat; + i40iw_hw_stats_read_all(vf_devstat, &vf_devstat->hw_stats); + } + } + spin_unlock_irqrestore(&pf_devstat->lock, flags); + } + + mod_timer(&pf_devstat->stats_timer, + jiffies + msecs_to_jiffies(STATS_TIMER_DELAY)); +} + +/** + * i40iw_hw_stats_start_timer - Start periodic stats timer + * @vsi: pointer to the vsi structure + */ +void i40iw_hw_stats_start_timer(struct i40iw_sc_vsi *vsi) +{ + struct i40iw_vsi_pestat *devstat = vsi->pestat; + + timer_setup(&devstat->stats_timer, i40iw_hw_stats_timeout, 0); + mod_timer(&devstat->stats_timer, + jiffies + msecs_to_jiffies(STATS_TIMER_DELAY)); +} + +/** + * i40iw_hw_stats_stop_timer - Delete periodic stats timer + * @vsi: pointer to the vsi structure + */ +void i40iw_hw_stats_stop_timer(struct i40iw_sc_vsi *vsi) +{ + struct i40iw_vsi_pestat *devstat = vsi->pestat; + + del_timer_sync(&devstat->stats_timer); +} diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c new file mode 100644 index 0000000..68679ad --- /dev/null +++ 
b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -0,0 +1,2961 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "i40iw.h" + +/** + * i40iw_query_device - get device attributes + * @ibdev: device pointer from stack + * @props: returning device attributes + * @udata: user data + */ +static int i40iw_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + + if (udata->inlen || udata->outlen) + return -EINVAL; + memset(props, 0, sizeof(*props)); + ether_addr_copy((u8 *)&props->sys_image_guid, iwdev->netdev->dev_addr); + props->fw_ver = I40IW_FW_VERSION; + props->device_cap_flags = iwdev->device_cap_flags; + props->vendor_id = iwdev->ldev->pcidev->vendor; + props->vendor_part_id = iwdev->ldev->pcidev->device; + props->hw_ver = (u32)iwdev->sc_dev.hw_rev; + props->max_mr_size = I40IW_MAX_OUTBOUND_MESSAGE_SIZE; + props->max_qp = iwdev->max_qp - iwdev->used_qps; + props->max_qp_wr = I40IW_MAX_QP_WRS; + props->max_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + props->max_cq = iwdev->max_cq - iwdev->used_cqs; + props->max_cqe = iwdev->max_cqe; + props->max_mr = iwdev->max_mr - iwdev->used_mrs; + props->max_pd = iwdev->max_pd - iwdev->used_pds; + props->max_sge_rd = I40IW_MAX_SGE_RD; + props->max_qp_rd_atom = I40IW_MAX_IRD_SIZE; + props->max_qp_init_rd_atom = props->max_qp_rd_atom; + props->atomic_cap = IB_ATOMIC_NONE; + props->max_map_per_fmr = 1; + props->max_fast_reg_page_list_len = I40IW_MAX_PAGES_PER_FMR; + return 0; +} + +/** + * i40iw_query_port - get port attrubutes + * @ibdev: device pointer from stack + * @port: port number for query + * @props: returning device attributes + */ +static int i40iw_query_port(struct ib_device *ibdev, + u8 port, + struct ib_port_attr *props) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + struct net_device *netdev = iwdev->netdev; + + /* props being zeroed by the caller, 
avoid zeroing it here */ + props->max_mtu = IB_MTU_4096; + props->active_mtu = ib_mtu_int_to_enum(netdev->mtu); + + props->lid = 1; + if (netif_carrier_ok(iwdev->netdev)) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_DOWN; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_width = IB_WIDTH_4X; + props->active_speed = 1; + props->max_msg_sz = I40IW_MAX_OUTBOUND_MESSAGE_SIZE; + return 0; +} + +/** + * i40iw_alloc_ucontext - Allocate the user context data structure + * @ibdev: device pointer from stack + * @udata: user data + * + * This keeps track of all objects associated with a particular + * user-mode client. + */ +static struct ib_ucontext *i40iw_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_alloc_ucontext_req req; + struct i40iw_alloc_ucontext_resp uresp; + struct i40iw_ucontext *ucontext; + + if (ib_copy_from_udata(&req, udata, sizeof(req))) + return ERR_PTR(-EINVAL); + + if (req.userspace_ver < 4 || req.userspace_ver > I40IW_ABI_VER) { + i40iw_pr_err("Unsupported provider library version %u.\n", req.userspace_ver); + return ERR_PTR(-EINVAL); + } + + memset(&uresp, 0, sizeof(uresp)); + uresp.max_qps = iwdev->max_qp; + uresp.max_pds = iwdev->max_pd; + uresp.wq_size = iwdev->max_qp_wr * 2; + uresp.kernel_ver = req.userspace_ver; + + ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL); + if (!ucontext) + return ERR_PTR(-ENOMEM); + + ucontext->iwdev = iwdev; + ucontext->abi_ver = req.userspace_ver; + + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { + kfree(ucontext); + return ERR_PTR(-EFAULT); + } + + INIT_LIST_HEAD(&ucontext->cq_reg_mem_list); + spin_lock_init(&ucontext->cq_reg_mem_list_lock); + INIT_LIST_HEAD(&ucontext->qp_reg_mem_list); + spin_lock_init(&ucontext->qp_reg_mem_list_lock); + + return &ucontext->ibucontext; +} + +/** + * i40iw_dealloc_ucontext - deallocate the user context data structure + * @context: user context created during alloc + */ +static int i40iw_dealloc_ucontext(struct ib_ucontext *context) +{ + struct i40iw_ucontext *ucontext = to_ucontext(context); + unsigned long flags; + + spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags); + if (!list_empty(&ucontext->cq_reg_mem_list)) { + spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags); + return -EBUSY; + } + spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags); + spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags); + if (!list_empty(&ucontext->qp_reg_mem_list)) { + spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); + return -EBUSY; + } + spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); + + kfree(ucontext); + return 0; +} + +/** + * i40iw_mmap - user memory map + * @context: context created during alloc + * @vma: kernel info for user memory map + */ +static int i40iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct i40iw_ucontext *ucontext; + u64 db_addr_offset; + u64 push_offset; + + ucontext = to_ucontext(context); + if (ucontext->iwdev->sc_dev.is_pf) { + db_addr_offset = I40IW_DB_ADDR_OFFSET; + push_offset = I40IW_PUSH_OFFSET; + if (vma->vm_pgoff) + vma->vm_pgoff += I40IW_PF_FIRST_PUSH_PAGE_INDEX - 1; + } else { + db_addr_offset = I40IW_VF_DB_ADDR_OFFSET; + push_offset = I40IW_VF_PUSH_OFFSET; + if (vma->vm_pgoff) + vma->vm_pgoff += I40IW_VF_FIRST_PUSH_PAGE_INDEX - 1; + } + + vma->vm_pgoff += 
db_addr_offset >> PAGE_SHIFT; + + if (vma->vm_pgoff == (db_addr_offset >> PAGE_SHIFT)) { + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_private_data = ucontext; + } else { + if ((vma->vm_pgoff - (push_offset >> PAGE_SHIFT)) % 2) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + } + + if (io_remap_pfn_range(vma, vma->vm_start, + vma->vm_pgoff + (pci_resource_start(ucontext->iwdev->ldev->pcidev, 0) >> PAGE_SHIFT), + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +/** + * i40iw_alloc_push_page - allocate a push page for qp + * @iwdev: iwarp device + * @qp: hardware control qp + */ +static void i40iw_alloc_push_page(struct i40iw_device *iwdev, struct i40iw_sc_qp *qp) +{ + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + enum i40iw_status_code status; + + if (qp->push_idx != I40IW_INVALID_PUSH_PAGE_INDEX) + return; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) + return; + + atomic_inc(&cqp_request->refcount); + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = OP_MANAGE_PUSH_PAGE; + cqp_info->post_sq = 1; + + cqp_info->in.u.manage_push_page.info.qs_handle = qp->qs_handle; + cqp_info->in.u.manage_push_page.info.free_page = 0; + cqp_info->in.u.manage_push_page.cqp = &iwdev->cqp.sc_cqp; + cqp_info->in.u.manage_push_page.scratch = (uintptr_t)cqp_request; + + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (!status) + qp->push_idx = cqp_request->compl_info.op_ret_val; + else + i40iw_pr_err("CQP-OP Push page fail"); + i40iw_put_cqp_request(&iwdev->cqp, cqp_request); +} + +/** + * i40iw_dealloc_push_page - free a push page for qp + * @iwdev: iwarp device + * @qp: hardware control qp + */ +static void i40iw_dealloc_push_page(struct i40iw_device *iwdev, struct i40iw_sc_qp *qp) +{ + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + enum i40iw_status_code status; + + if (qp->push_idx == I40IW_INVALID_PUSH_PAGE_INDEX) + return; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = OP_MANAGE_PUSH_PAGE; + cqp_info->post_sq = 1; + + cqp_info->in.u.manage_push_page.info.push_idx = qp->push_idx; + cqp_info->in.u.manage_push_page.info.qs_handle = qp->qs_handle; + cqp_info->in.u.manage_push_page.info.free_page = 1; + cqp_info->in.u.manage_push_page.cqp = &iwdev->cqp.sc_cqp; + cqp_info->in.u.manage_push_page.scratch = (uintptr_t)cqp_request; + + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (!status) + qp->push_idx = I40IW_INVALID_PUSH_PAGE_INDEX; + else + i40iw_pr_err("CQP-OP Push page fail"); +} + +/** + * i40iw_alloc_pd - allocate protection domain + * @ibdev: device pointer from stack + * @context: user context created during alloc + * @udata: user data + */ +static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct i40iw_pd *iwpd; + struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_alloc_pd_resp uresp; + struct i40iw_sc_pd *sc_pd; + struct i40iw_ucontext *ucontext; + u32 pd_id = 0; + int err; + + if (iwdev->closing) + return ERR_PTR(-ENODEV); + + err = i40iw_alloc_resource(iwdev, iwdev->allocated_pds, + iwdev->max_pd, &pd_id, &iwdev->next_pd); + if (err) { + i40iw_pr_err("alloc resource failed\n"); + return ERR_PTR(err); + } + + iwpd = kzalloc(sizeof(*iwpd), 
GFP_KERNEL); + if (!iwpd) { + err = -ENOMEM; + goto free_res; + } + + sc_pd = &iwpd->sc_pd; + + if (context) { + ucontext = to_ucontext(context); + dev->iw_pd_ops->pd_init(dev, sc_pd, pd_id, ucontext->abi_ver); + memset(&uresp, 0, sizeof(uresp)); + uresp.pd_id = pd_id; + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { + err = -EFAULT; + goto error; + } + } else { + dev->iw_pd_ops->pd_init(dev, sc_pd, pd_id, -1); + } + + i40iw_add_pdusecount(iwpd); + return &iwpd->ibpd; +error: + kfree(iwpd); +free_res: + i40iw_free_resource(iwdev, iwdev->allocated_pds, pd_id); + return ERR_PTR(err); +} + +/** + * i40iw_dealloc_pd - deallocate pd + * @ibpd: ptr of pd to be deallocated + */ +static int i40iw_dealloc_pd(struct ib_pd *ibpd) +{ + struct i40iw_pd *iwpd = to_iwpd(ibpd); + struct i40iw_device *iwdev = to_iwdev(ibpd->device); + + i40iw_rem_pdusecount(iwpd, iwdev); + return 0; +} + +/** + * i40iw_get_pbl - Retrieve pbl from a list given a virtual + * address + * @va: user virtual address + * @pbl_list: pbl list to search in (QP's or CQ's) + */ +static struct i40iw_pbl *i40iw_get_pbl(unsigned long va, + struct list_head *pbl_list) +{ + struct i40iw_pbl *iwpbl; + + list_for_each_entry(iwpbl, pbl_list, list) { + if (iwpbl->user_base == va) { + iwpbl->on_list = false; + list_del(&iwpbl->list); + return iwpbl; + } + } + return NULL; +} + +/** + * i40iw_free_qp_resources - free up memory resources for qp + * @iwdev: iwarp device + * @iwqp: qp ptr (user or kernel) + * @qp_num: qp number assigned + */ +void i40iw_free_qp_resources(struct i40iw_device *iwdev, + struct i40iw_qp *iwqp, + u32 qp_num) +{ + struct i40iw_pbl *iwpbl = &iwqp->iwpbl; + + i40iw_ieq_cleanup_qp(iwdev->vsi.ieq, &iwqp->sc_qp); + i40iw_dealloc_push_page(iwdev, &iwqp->sc_qp); + if (qp_num) + i40iw_free_resource(iwdev, iwdev->allocated_qps, qp_num); + if (iwpbl->pbl_allocated) + i40iw_free_pble(iwdev->pble_rsrc, &iwpbl->pble_alloc); + i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwqp->q2_ctx_mem); + i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwqp->kqp.dma_mem); + kfree(iwqp->kqp.wrid_mem); + iwqp->kqp.wrid_mem = NULL; + kfree(iwqp->allocated_buffer); +} + +/** + * i40iw_clean_cqes - clean cq entries for qp + * @iwqp: qp ptr (user or kernel) + * @iwcq: cq ptr + */ +static void i40iw_clean_cqes(struct i40iw_qp *iwqp, struct i40iw_cq *iwcq) +{ + struct i40iw_cq_uk *ukcq = &iwcq->sc_cq.cq_uk; + + ukcq->ops.iw_cq_clean(&iwqp->sc_qp.qp_uk, ukcq); +} + +/** + * i40iw_destroy_qp - destroy qp + * @ibqp: qp's ib pointer also to get to device's qp address + */ +static int i40iw_destroy_qp(struct ib_qp *ibqp) +{ + struct i40iw_qp *iwqp = to_iwqp(ibqp); + + iwqp->destroyed = 1; + + if (iwqp->ibqp_state >= IB_QPS_INIT && iwqp->ibqp_state < IB_QPS_RTS) + i40iw_next_iw_state(iwqp, I40IW_QP_STATE_ERROR, 0, 0, 0); + + if (!iwqp->user_mode) { + if (iwqp->iwscq) { + i40iw_clean_cqes(iwqp, iwqp->iwscq); + if (iwqp->iwrcq != iwqp->iwscq) + i40iw_clean_cqes(iwqp, iwqp->iwrcq); + } + } + + i40iw_rem_ref(&iwqp->ibqp); + return 0; +} + +/** + * i40iw_setup_virt_qp - setup for allocation of virtual qp + * @dev: iwarp device + * @qp: qp ptr + * @init_info: initialize info to return + */ +static int i40iw_setup_virt_qp(struct i40iw_device *iwdev, + struct i40iw_qp *iwqp, + struct i40iw_qp_init_info *init_info) +{ + struct i40iw_pbl *iwpbl = &iwqp->iwpbl; + struct i40iw_qp_mr *qpmr = &iwpbl->qp_mr; + + iwqp->page = qpmr->sq_page; + init_info->shadow_area_pa = cpu_to_le64(qpmr->shadow); + if (iwpbl->pbl_allocated) { + init_info->virtual_map = true; + init_info->sq_pa = 
qpmr->sq_pbl.idx; + init_info->rq_pa = qpmr->rq_pbl.idx; + } else { + init_info->sq_pa = qpmr->sq_pbl.addr; + init_info->rq_pa = qpmr->rq_pbl.addr; + } + return 0; +} + +/** + * i40iw_setup_kmode_qp - setup initialization for kernel mode qp + * @iwdev: iwarp device + * @iwqp: qp ptr (user or kernel) + * @info: initialize info to return + */ +static int i40iw_setup_kmode_qp(struct i40iw_device *iwdev, + struct i40iw_qp *iwqp, + struct i40iw_qp_init_info *info) +{ + struct i40iw_dma_mem *mem = &iwqp->kqp.dma_mem; + u32 sqdepth, rqdepth; + u8 sqshift; + u32 size; + enum i40iw_status_code status; + struct i40iw_qp_uk_init_info *ukinfo = &info->qp_uk_init_info; + + i40iw_get_wqe_shift(ukinfo->max_sq_frag_cnt, ukinfo->max_inline_data, &sqshift); + status = i40iw_get_sqdepth(ukinfo->sq_size, sqshift, &sqdepth); + if (status) + return -ENOMEM; + + status = i40iw_get_rqdepth(ukinfo->rq_size, I40IW_MAX_RQ_WQE_SHIFT, &rqdepth); + if (status) + return -ENOMEM; + + size = sqdepth * sizeof(struct i40iw_sq_uk_wr_trk_info) + (rqdepth << 3); + iwqp->kqp.wrid_mem = kzalloc(size, GFP_KERNEL); + + ukinfo->sq_wrtrk_array = (struct i40iw_sq_uk_wr_trk_info *)iwqp->kqp.wrid_mem; + if (!ukinfo->sq_wrtrk_array) + return -ENOMEM; + + ukinfo->rq_wrid_array = (u64 *)&ukinfo->sq_wrtrk_array[sqdepth]; + + size = (sqdepth + rqdepth) * I40IW_QP_WQE_MIN_SIZE; + size += (I40IW_SHADOW_AREA_SIZE << 3); + + status = i40iw_allocate_dma_mem(iwdev->sc_dev.hw, mem, size, 256); + if (status) { + kfree(ukinfo->sq_wrtrk_array); + ukinfo->sq_wrtrk_array = NULL; + return -ENOMEM; + } + + ukinfo->sq = mem->va; + info->sq_pa = mem->pa; + + ukinfo->rq = &ukinfo->sq[sqdepth]; + info->rq_pa = info->sq_pa + (sqdepth * I40IW_QP_WQE_MIN_SIZE); + + ukinfo->shadow_area = ukinfo->rq[rqdepth].elem; + info->shadow_area_pa = info->rq_pa + (rqdepth * I40IW_QP_WQE_MIN_SIZE); + + ukinfo->sq_size = sqdepth >> sqshift; + ukinfo->rq_size = rqdepth >> I40IW_MAX_RQ_WQE_SHIFT; + ukinfo->qp_id = iwqp->ibqp.qp_num; + return 0; +} + +/** + * i40iw_create_qp - create qp + * @ibpd: ptr of pd + * @init_attr: attributes for qp + * @udata: user data for create qp + */ +static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct i40iw_pd *iwpd = to_iwpd(ibpd); + struct i40iw_device *iwdev = to_iwdev(ibpd->device); + struct i40iw_cqp *iwcqp = &iwdev->cqp; + struct i40iw_qp *iwqp; + struct i40iw_ucontext *ucontext; + struct i40iw_create_qp_req req; + struct i40iw_create_qp_resp uresp; + u32 qp_num = 0; + void *mem; + enum i40iw_status_code ret; + int err_code; + int sq_size; + int rq_size; + struct i40iw_sc_qp *qp; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_qp_init_info init_info; + struct i40iw_create_qp_info *qp_info; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + struct i40iw_qp_host_ctx_info *ctx_info; + struct i40iwarp_offload_info *iwarp_info; + unsigned long flags; + + if (iwdev->closing) + return ERR_PTR(-ENODEV); + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + if (init_attr->cap.max_inline_data > I40IW_MAX_INLINE_DATA_SIZE) + init_attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE; + + if (init_attr->cap.max_send_sge > I40IW_MAX_WQ_FRAGMENT_COUNT) + init_attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + + if (init_attr->cap.max_recv_sge > I40IW_MAX_WQ_FRAGMENT_COUNT) + init_attr->cap.max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + + memset(&init_info, 0, sizeof(init_info)); + + sq_size = init_attr->cap.max_send_wr; 
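+	/*
+	 * The requested WR counts captured here, together with the SGE and
+	 * inline limits clamped above, seed qp_uk_init_info; kernel-mode
+	 * QPs later round them up to hardware queue depths in
+	 * i40iw_setup_kmode_qp().
+	 */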
+ rq_size = init_attr->cap.max_recv_wr; + + init_info.vsi = &iwdev->vsi; + init_info.qp_uk_init_info.sq_size = sq_size; + init_info.qp_uk_init_info.rq_size = rq_size; + init_info.qp_uk_init_info.max_sq_frag_cnt = init_attr->cap.max_send_sge; + init_info.qp_uk_init_info.max_rq_frag_cnt = init_attr->cap.max_recv_sge; + init_info.qp_uk_init_info.max_inline_data = init_attr->cap.max_inline_data; + + mem = kzalloc(sizeof(*iwqp), GFP_KERNEL); + if (!mem) + return ERR_PTR(-ENOMEM); + + iwqp = (struct i40iw_qp *)mem; + iwqp->allocated_buffer = mem; + qp = &iwqp->sc_qp; + qp->back_qp = (void *)iwqp; + qp->push_idx = I40IW_INVALID_PUSH_PAGE_INDEX; + + iwqp->ctx_info.iwarp_info = &iwqp->iwarp_info; + + if (i40iw_allocate_dma_mem(dev->hw, + &iwqp->q2_ctx_mem, + I40IW_Q2_BUFFER_SIZE + I40IW_QP_CTX_SIZE, + 256)) { + i40iw_pr_err("dma_mem failed\n"); + err_code = -ENOMEM; + goto error; + } + + init_info.q2 = iwqp->q2_ctx_mem.va; + init_info.q2_pa = iwqp->q2_ctx_mem.pa; + + init_info.host_ctx = (void *)init_info.q2 + I40IW_Q2_BUFFER_SIZE; + init_info.host_ctx_pa = init_info.q2_pa + I40IW_Q2_BUFFER_SIZE; + + err_code = i40iw_alloc_resource(iwdev, iwdev->allocated_qps, iwdev->max_qp, + &qp_num, &iwdev->next_qp); + if (err_code) { + i40iw_pr_err("qp resource\n"); + goto error; + } + + iwqp->iwdev = iwdev; + iwqp->iwpd = iwpd; + iwqp->ibqp.qp_num = qp_num; + qp = &iwqp->sc_qp; + iwqp->iwscq = to_iwcq(init_attr->send_cq); + iwqp->iwrcq = to_iwcq(init_attr->recv_cq); + + iwqp->host_ctx.va = init_info.host_ctx; + iwqp->host_ctx.pa = init_info.host_ctx_pa; + iwqp->host_ctx.size = I40IW_QP_CTX_SIZE; + + init_info.pd = &iwpd->sc_pd; + init_info.qp_uk_init_info.qp_id = iwqp->ibqp.qp_num; + iwqp->ctx_info.qp_compl_ctx = (uintptr_t)qp; + + if (init_attr->qp_type != IB_QPT_RC) { + err_code = -EINVAL; + goto error; + } + if (iwdev->push_mode) + i40iw_alloc_push_page(iwdev, qp); + if (udata) { + err_code = ib_copy_from_udata(&req, udata, sizeof(req)); + if (err_code) { + i40iw_pr_err("ib_copy_from_data\n"); + goto error; + } + iwqp->ctx_info.qp_compl_ctx = req.user_compl_ctx; + if (ibpd->uobject && ibpd->uobject->context) { + iwqp->user_mode = 1; + ucontext = to_ucontext(ibpd->uobject->context); + + if (req.user_wqe_buffers) { + struct i40iw_pbl *iwpbl; + + spin_lock_irqsave( + &ucontext->qp_reg_mem_list_lock, flags); + iwpbl = i40iw_get_pbl( + (unsigned long)req.user_wqe_buffers, + &ucontext->qp_reg_mem_list); + spin_unlock_irqrestore( + &ucontext->qp_reg_mem_list_lock, flags); + + if (!iwpbl) { + err_code = -ENODATA; + i40iw_pr_err("no pbl info\n"); + goto error; + } + memcpy(&iwqp->iwpbl, iwpbl, sizeof(iwqp->iwpbl)); + } + } + err_code = i40iw_setup_virt_qp(iwdev, iwqp, &init_info); + } else { + err_code = i40iw_setup_kmode_qp(iwdev, iwqp, &init_info); + } + + if (err_code) { + i40iw_pr_err("setup qp failed\n"); + goto error; + } + + init_info.type = I40IW_QP_TYPE_IWARP; + ret = dev->iw_priv_qp_ops->qp_init(qp, &init_info); + if (ret) { + err_code = -EPROTO; + i40iw_pr_err("qp_init fail\n"); + goto error; + } + ctx_info = &iwqp->ctx_info; + iwarp_info = &iwqp->iwarp_info; + iwarp_info->rd_enable = true; + iwarp_info->wr_rdresp_en = true; + if (!iwqp->user_mode) { + iwarp_info->fast_reg_en = true; + iwarp_info->priv_mode_en = true; + } + iwarp_info->ddp_ver = 1; + iwarp_info->rdmap_ver = 1; + + ctx_info->iwarp_info_valid = true; + ctx_info->send_cq_num = iwqp->iwscq->sc_cq.cq_uk.cq_id; + ctx_info->rcv_cq_num = iwqp->iwrcq->sc_cq.cq_uk.cq_id; + if (qp->push_idx == I40IW_INVALID_PUSH_PAGE_INDEX) { + 
ctx_info->push_mode_en = false; + } else { + ctx_info->push_mode_en = true; + ctx_info->push_idx = qp->push_idx; + } + + ret = dev->iw_priv_qp_ops->qp_setctx(&iwqp->sc_qp, + (u64 *)iwqp->host_ctx.va, + ctx_info); + ctx_info->iwarp_info_valid = false; + cqp_request = i40iw_get_cqp_request(iwcqp, true); + if (!cqp_request) { + err_code = -ENOMEM; + goto error; + } + cqp_info = &cqp_request->info; + qp_info = &cqp_request->info.in.u.qp_create.info; + + memset(qp_info, 0, sizeof(*qp_info)); + + qp_info->cq_num_valid = true; + qp_info->next_iwarp_state = I40IW_QP_STATE_IDLE; + + cqp_info->cqp_cmd = OP_QP_CREATE; + cqp_info->post_sq = 1; + cqp_info->in.u.qp_create.qp = qp; + cqp_info->in.u.qp_create.scratch = (uintptr_t)cqp_request; + ret = i40iw_handle_cqp_op(iwdev, cqp_request); + if (ret) { + i40iw_pr_err("CQP-OP QP create fail"); + err_code = -EACCES; + goto error; + } + + i40iw_add_ref(&iwqp->ibqp); + spin_lock_init(&iwqp->lock); + iwqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) ? 1 : 0; + iwdev->qp_table[qp_num] = iwqp; + i40iw_add_pdusecount(iwqp->iwpd); + i40iw_add_devusecount(iwdev); + if (ibpd->uobject && udata) { + memset(&uresp, 0, sizeof(uresp)); + uresp.actual_sq_size = sq_size; + uresp.actual_rq_size = rq_size; + uresp.qp_id = qp_num; + uresp.push_idx = qp->push_idx; + err_code = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (err_code) { + i40iw_pr_err("copy_to_udata failed\n"); + i40iw_destroy_qp(&iwqp->ibqp); + /* let the completion of the qp destroy free the qp */ + return ERR_PTR(err_code); + } + } + init_completion(&iwqp->sq_drained); + init_completion(&iwqp->rq_drained); + + return &iwqp->ibqp; +error: + i40iw_free_qp_resources(iwdev, iwqp, qp_num); + return ERR_PTR(err_code); +} + +/** + * i40iw_query - query qp attributes + * @ibqp: qp pointer + * @attr: attributes pointer + * @attr_mask: Not used + * @init_attr: qp attributes to return + */ +static int i40iw_query_qp(struct ib_qp *ibqp, + struct ib_qp_attr *attr, + int attr_mask, + struct ib_qp_init_attr *init_attr) +{ + struct i40iw_qp *iwqp = to_iwqp(ibqp); + struct i40iw_sc_qp *qp = &iwqp->sc_qp; + + attr->qp_access_flags = 0; + attr->cap.max_send_wr = qp->qp_uk.sq_size; + attr->cap.max_recv_wr = qp->qp_uk.rq_size; + attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE; + attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + attr->cap.max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + attr->port_num = 1; + init_attr->event_handler = iwqp->ibqp.event_handler; + init_attr->qp_context = iwqp->ibqp.qp_context; + init_attr->send_cq = iwqp->ibqp.send_cq; + init_attr->recv_cq = iwqp->ibqp.recv_cq; + init_attr->srq = iwqp->ibqp.srq; + init_attr->cap = attr->cap; + init_attr->port_num = 1; + return 0; +} + +/** + * i40iw_hw_modify_qp - setup cqp for modify qp + * @iwdev: iwarp device + * @iwqp: qp ptr (user or kernel) + * @info: info for modify qp + * @wait: flag to wait or not for modify qp completion + */ +void i40iw_hw_modify_qp(struct i40iw_device *iwdev, struct i40iw_qp *iwqp, + struct i40iw_modify_qp_info *info, bool wait) +{ + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + struct i40iw_modify_qp_info *m_info; + struct i40iw_gen_ae_info ae_info; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, wait); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + m_info = &cqp_info->in.u.qp_modify.info; + memcpy(m_info, info, sizeof(*m_info)); + cqp_info->cqp_cmd = OP_QP_MODIFY; + cqp_info->post_sq = 1; + cqp_info->in.u.qp_modify.qp = &iwqp->sc_qp; + 
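	/*
+	 * The scratch value below is echoed back in the CCQ completion so
+	 * the completion handler can locate this cqp_request.  If posting
+	 * the modify fails, the switch that follows raises a bad-close AE
+	 * for the transitional states so the QP is still torn down.
+	 */
+	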
cqp_info->in.u.qp_modify.scratch = (uintptr_t)cqp_request; + if (!i40iw_handle_cqp_op(iwdev, cqp_request)) + return; + + switch (m_info->next_iwarp_state) { + case I40IW_QP_STATE_RTS: + if (iwqp->iwarp_state == I40IW_QP_STATE_IDLE) + i40iw_send_reset(iwqp->cm_node); + /* fall through */ + case I40IW_QP_STATE_IDLE: + case I40IW_QP_STATE_TERMINATE: + case I40IW_QP_STATE_CLOSING: + ae_info.ae_code = I40IW_AE_BAD_CLOSE; + ae_info.ae_source = 0; + i40iw_gen_ae(iwdev, &iwqp->sc_qp, &ae_info, false); + break; + case I40IW_QP_STATE_ERROR: + default: + break; + } +} + +/** + * i40iw_modify_qp - modify qp request + * @ibqp: qp's pointer for modify + * @attr: access attributes + * @attr_mask: state mask + * @udata: user data + */ +int i40iw_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct i40iw_qp *iwqp = to_iwqp(ibqp); + struct i40iw_device *iwdev = iwqp->iwdev; + struct i40iw_qp_host_ctx_info *ctx_info; + struct i40iwarp_offload_info *iwarp_info; + struct i40iw_modify_qp_info info; + u8 issue_modify_qp = 0; + u8 dont_wait = 0; + u32 err; + unsigned long flags; + + memset(&info, 0, sizeof(info)); + ctx_info = &iwqp->ctx_info; + iwarp_info = &iwqp->iwarp_info; + + spin_lock_irqsave(&iwqp->lock, flags); + + if (attr_mask & IB_QP_STATE) { + if (iwdev->closing && attr->qp_state != IB_QPS_ERR) { + err = -EINVAL; + goto exit; + } + + switch (attr->qp_state) { + case IB_QPS_INIT: + case IB_QPS_RTR: + if (iwqp->iwarp_state > (u32)I40IW_QP_STATE_IDLE) { + err = -EINVAL; + goto exit; + } + if (iwqp->iwarp_state == I40IW_QP_STATE_INVALID) { + info.next_iwarp_state = I40IW_QP_STATE_IDLE; + issue_modify_qp = 1; + } + break; + case IB_QPS_RTS: + if ((iwqp->iwarp_state > (u32)I40IW_QP_STATE_RTS) || + (!iwqp->cm_id)) { + err = -EINVAL; + goto exit; + } + + issue_modify_qp = 1; + iwqp->hw_tcp_state = I40IW_TCP_STATE_ESTABLISHED; + iwqp->hte_added = 1; + info.next_iwarp_state = I40IW_QP_STATE_RTS; + info.tcp_ctx_valid = true; + info.ord_valid = true; + info.arp_cache_idx_valid = true; + info.cq_num_valid = true; + break; + case IB_QPS_SQD: + if (iwqp->hw_iwarp_state > (u32)I40IW_QP_STATE_RTS) { + err = 0; + goto exit; + } + if ((iwqp->iwarp_state == (u32)I40IW_QP_STATE_CLOSING) || + (iwqp->iwarp_state < (u32)I40IW_QP_STATE_RTS)) { + err = 0; + goto exit; + } + if (iwqp->iwarp_state > (u32)I40IW_QP_STATE_CLOSING) { + err = -EINVAL; + goto exit; + } + info.next_iwarp_state = I40IW_QP_STATE_CLOSING; + issue_modify_qp = 1; + break; + case IB_QPS_SQE: + if (iwqp->iwarp_state >= (u32)I40IW_QP_STATE_TERMINATE) { + err = -EINVAL; + goto exit; + } + info.next_iwarp_state = I40IW_QP_STATE_TERMINATE; + issue_modify_qp = 1; + break; + case IB_QPS_ERR: + case IB_QPS_RESET: + if (iwqp->iwarp_state == (u32)I40IW_QP_STATE_ERROR) { + err = -EINVAL; + goto exit; + } + if (iwqp->sc_qp.term_flags) + i40iw_terminate_del_timer(&iwqp->sc_qp); + info.next_iwarp_state = I40IW_QP_STATE_ERROR; + if ((iwqp->hw_tcp_state > I40IW_TCP_STATE_CLOSED) && + iwdev->iw_status && + (iwqp->hw_tcp_state != I40IW_TCP_STATE_TIME_WAIT)) + info.reset_tcp_conn = true; + else + dont_wait = 1; + issue_modify_qp = 1; + info.next_iwarp_state = I40IW_QP_STATE_ERROR; + break; + default: + err = -EINVAL; + goto exit; + } + + iwqp->ibqp_state = attr->qp_state; + + } + if (attr_mask & IB_QP_ACCESS_FLAGS) { + ctx_info->iwarp_info_valid = true; + if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) + iwarp_info->wr_rdresp_en = true; + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + iwarp_info->wr_rdresp_en = 
true; + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + iwarp_info->rd_enable = true; + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) + iwarp_info->bind_en = true; + + if (iwqp->user_mode) { + iwarp_info->rd_enable = true; + iwarp_info->wr_rdresp_en = true; + iwarp_info->priv_mode_en = false; + } + } + + if (ctx_info->iwarp_info_valid) { + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + int ret; + + ctx_info->send_cq_num = iwqp->iwscq->sc_cq.cq_uk.cq_id; + ctx_info->rcv_cq_num = iwqp->iwrcq->sc_cq.cq_uk.cq_id; + ret = dev->iw_priv_qp_ops->qp_setctx(&iwqp->sc_qp, + (u64 *)iwqp->host_ctx.va, + ctx_info); + if (ret) { + i40iw_pr_err("setting QP context\n"); + err = -EINVAL; + goto exit; + } + } + + spin_unlock_irqrestore(&iwqp->lock, flags); + + if (issue_modify_qp) { + i40iw_hw_modify_qp(iwdev, iwqp, &info, true); + + spin_lock_irqsave(&iwqp->lock, flags); + iwqp->iwarp_state = info.next_iwarp_state; + spin_unlock_irqrestore(&iwqp->lock, flags); + } + + if (issue_modify_qp && (iwqp->ibqp_state > IB_QPS_RTS)) { + if (dont_wait) { + if (iwqp->cm_id && iwqp->hw_tcp_state) { + spin_lock_irqsave(&iwqp->lock, flags); + iwqp->hw_tcp_state = I40IW_TCP_STATE_CLOSED; + iwqp->last_aeq = I40IW_AE_RESET_SENT; + spin_unlock_irqrestore(&iwqp->lock, flags); + i40iw_cm_disconn(iwqp); + } + } else { + spin_lock_irqsave(&iwqp->lock, flags); + if (iwqp->cm_id) { + if (atomic_inc_return(&iwqp->close_timer_started) == 1) { + iwqp->cm_id->add_ref(iwqp->cm_id); + i40iw_schedule_cm_timer(iwqp->cm_node, + (struct i40iw_puda_buf *)iwqp, + I40IW_TIMER_TYPE_CLOSE, 1, 0); + } + } + spin_unlock_irqrestore(&iwqp->lock, flags); + } + } + return 0; +exit: + spin_unlock_irqrestore(&iwqp->lock, flags); + return err; +} + +/** + * cq_free_resources - free up recources for cq + * @iwdev: iwarp device + * @iwcq: cq ptr + */ +static void cq_free_resources(struct i40iw_device *iwdev, struct i40iw_cq *iwcq) +{ + struct i40iw_sc_cq *cq = &iwcq->sc_cq; + + if (!iwcq->user_mode) + i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwcq->kmem); + i40iw_free_resource(iwdev, iwdev->allocated_cqs, cq->cq_uk.cq_id); +} + +/** + * i40iw_cq_wq_destroy - send cq destroy cqp + * @iwdev: iwarp device + * @cq: hardware control cq + */ +void i40iw_cq_wq_destroy(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq) +{ + enum i40iw_status_code status; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + + cqp_info->cqp_cmd = OP_CQ_DESTROY; + cqp_info->post_sq = 1; + cqp_info->in.u.cq_destroy.cq = cq; + cqp_info->in.u.cq_destroy.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP Destroy QP fail"); +} + +/** + * i40iw_destroy_cq - destroy cq + * @ib_cq: cq pointer + */ +static int i40iw_destroy_cq(struct ib_cq *ib_cq) +{ + struct i40iw_cq *iwcq; + struct i40iw_device *iwdev; + struct i40iw_sc_cq *cq; + + if (!ib_cq) { + i40iw_pr_err("ib_cq == NULL\n"); + return 0; + } + + iwcq = to_iwcq(ib_cq); + iwdev = to_iwdev(ib_cq->device); + cq = &iwcq->sc_cq; + i40iw_cq_wq_destroy(iwdev, cq); + cq_free_resources(iwdev, iwcq); + kfree(iwcq); + i40iw_rem_devusecount(iwdev); + return 0; +} + +/** + * i40iw_create_cq - create cq + * @ibdev: device pointer from stack + * @attr: attributes for cq + * @context: user context created during alloc + * @udata: user data + */ +static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev, + const struct 
ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_cq *iwcq; + struct i40iw_pbl *iwpbl; + u32 cq_num = 0; + struct i40iw_sc_cq *cq; + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_cq_init_info info; + enum i40iw_status_code status; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + struct i40iw_cq_uk_init_info *ukinfo = &info.cq_uk_init_info; + unsigned long flags; + int err_code; + int entries = attr->cqe; + + if (iwdev->closing) + return ERR_PTR(-ENODEV); + + if (entries > iwdev->max_cqe) + return ERR_PTR(-EINVAL); + + iwcq = kzalloc(sizeof(*iwcq), GFP_KERNEL); + if (!iwcq) + return ERR_PTR(-ENOMEM); + + memset(&info, 0, sizeof(info)); + + err_code = i40iw_alloc_resource(iwdev, iwdev->allocated_cqs, + iwdev->max_cq, &cq_num, + &iwdev->next_cq); + if (err_code) + goto error; + + cq = &iwcq->sc_cq; + cq->back_cq = (void *)iwcq; + spin_lock_init(&iwcq->lock); + + info.dev = dev; + ukinfo->cq_size = max(entries, 4); + ukinfo->cq_id = cq_num; + iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size; + info.ceqe_mask = 0; + if (attr->comp_vector < iwdev->ceqs_count) + info.ceq_id = attr->comp_vector; + info.ceq_id_valid = true; + info.ceqe_mask = 1; + info.type = I40IW_CQ_TYPE_IWARP; + if (context) { + struct i40iw_ucontext *ucontext; + struct i40iw_create_cq_req req; + struct i40iw_cq_mr *cqmr; + + memset(&req, 0, sizeof(req)); + iwcq->user_mode = true; + ucontext = to_ucontext(context); + if (ib_copy_from_udata(&req, udata, sizeof(struct i40iw_create_cq_req))) { + err_code = -EFAULT; + goto cq_free_resources; + } + + spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags); + iwpbl = i40iw_get_pbl((unsigned long)req.user_cq_buffer, + &ucontext->cq_reg_mem_list); + spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags); + if (!iwpbl) { + err_code = -EPROTO; + goto cq_free_resources; + } + + iwcq->iwpbl = iwpbl; + iwcq->cq_mem_size = 0; + cqmr = &iwpbl->cq_mr; + info.shadow_area_pa = cpu_to_le64(cqmr->shadow); + if (iwpbl->pbl_allocated) { + info.virtual_map = true; + info.pbl_chunk_size = 1; + info.first_pm_pbl_idx = cqmr->cq_pbl.idx; + } else { + info.cq_base_pa = cqmr->cq_pbl.addr; + } + } else { + /* Kmode allocations */ + int rsize; + int shadow; + + rsize = info.cq_uk_init_info.cq_size * sizeof(struct i40iw_cqe); + rsize = round_up(rsize, 256); + shadow = I40IW_SHADOW_AREA_SIZE << 3; + status = i40iw_allocate_dma_mem(dev->hw, &iwcq->kmem, + rsize + shadow, 256); + if (status) { + err_code = -ENOMEM; + goto cq_free_resources; + } + ukinfo->cq_base = iwcq->kmem.va; + info.cq_base_pa = iwcq->kmem.pa; + info.shadow_area_pa = info.cq_base_pa + rsize; + ukinfo->shadow_area = iwcq->kmem.va + rsize; + } + + if (dev->iw_priv_cq_ops->cq_init(cq, &info)) { + i40iw_pr_err("init cq fail\n"); + err_code = -EPROTO; + goto cq_free_resources; + } + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) { + err_code = -ENOMEM; + goto cq_free_resources; + } + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = OP_CQ_CREATE; + cqp_info->post_sq = 1; + cqp_info->in.u.cq_create.cq = cq; + cqp_info->in.u.cq_create.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) { + i40iw_pr_err("CQP-OP Create QP fail"); + err_code = -EPROTO; + goto cq_free_resources; + } + + if (context) { + struct i40iw_create_cq_resp resp; + + memset(&resp, 0, sizeof(resp)); + resp.cq_id = info.cq_uk_init_info.cq_id; + 
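		/*
+		 * The id and actual size of the new CQ are returned to the
+		 * user library; if the copy-out fails the CQ is unwound via
+		 * the cq_destroy label below.
+		 */
+		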
resp.cq_size = info.cq_uk_init_info.cq_size; + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + i40iw_pr_err("copy to user data\n"); + err_code = -EPROTO; + goto cq_destroy; + } + } + + i40iw_add_devusecount(iwdev); + return (struct ib_cq *)iwcq; + +cq_destroy: + i40iw_cq_wq_destroy(iwdev, cq); +cq_free_resources: + cq_free_resources(iwdev, iwcq); +error: + kfree(iwcq); + return ERR_PTR(err_code); +} + +/** + * i40iw_get_user_access - get hw access from IB access + * @acc: IB access to return hw access + */ +static inline u16 i40iw_get_user_access(int acc) +{ + u16 access = 0; + + access |= (acc & IB_ACCESS_LOCAL_WRITE) ? I40IW_ACCESS_FLAGS_LOCALWRITE : 0; + access |= (acc & IB_ACCESS_REMOTE_WRITE) ? I40IW_ACCESS_FLAGS_REMOTEWRITE : 0; + access |= (acc & IB_ACCESS_REMOTE_READ) ? I40IW_ACCESS_FLAGS_REMOTEREAD : 0; + access |= (acc & IB_ACCESS_MW_BIND) ? I40IW_ACCESS_FLAGS_BIND_WINDOW : 0; + return access; +} + +/** + * i40iw_free_stag - free stag resource + * @iwdev: iwarp device + * @stag: stag to free + */ +static void i40iw_free_stag(struct i40iw_device *iwdev, u32 stag) +{ + u32 stag_idx; + + stag_idx = (stag & iwdev->mr_stagmask) >> I40IW_CQPSQ_STAG_IDX_SHIFT; + i40iw_free_resource(iwdev, iwdev->allocated_mrs, stag_idx); + i40iw_rem_devusecount(iwdev); +} + +/** + * i40iw_create_stag - create random stag + * @iwdev: iwarp device + */ +static u32 i40iw_create_stag(struct i40iw_device *iwdev) +{ + u32 stag = 0; + u32 stag_index = 0; + u32 next_stag_index; + u32 driver_key; + u32 random; + u8 consumer_key; + int ret; + + get_random_bytes(&random, sizeof(random)); + consumer_key = (u8)random; + + driver_key = random & ~iwdev->mr_stagmask; + next_stag_index = (random & iwdev->mr_stagmask) >> 8; + next_stag_index %= iwdev->max_mr; + + ret = i40iw_alloc_resource(iwdev, + iwdev->allocated_mrs, iwdev->max_mr, + &stag_index, &next_stag_index); + if (!ret) { + stag = stag_index << I40IW_CQPSQ_STAG_IDX_SHIFT; + stag |= driver_key; + stag += (u32)consumer_key; + i40iw_add_devusecount(iwdev); + } + return stag; +} + +/** + * i40iw_next_pbl_addr - Get next pbl address + * @pbl: pointer to a pble + * @pinfo: info pointer + * @idx: index + */ +static inline u64 *i40iw_next_pbl_addr(u64 *pbl, + struct i40iw_pble_info **pinfo, + u32 *idx) +{ + *idx += 1; + if ((!(*pinfo)) || (*idx != (*pinfo)->cnt)) + return ++pbl; + *idx = 0; + (*pinfo)++; + return (u64 *)(*pinfo)->addr; +} + +/** + * i40iw_copy_user_pgaddrs - copy user page address to pble's os locally + * @iwmr: iwmr for IB's user page addresses + * @pbl: ple pointer to save 1 level or 0 level pble + * @level: indicated level 0, 1 or 2 + */ +static void i40iw_copy_user_pgaddrs(struct i40iw_mr *iwmr, + u64 *pbl, + enum i40iw_pble_level level) +{ + struct ib_umem *region = iwmr->region; + struct i40iw_pbl *iwpbl = &iwmr->iwpbl; + int chunk_pages, entry, i; + struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc; + struct i40iw_pble_info *pinfo; + struct scatterlist *sg; + u64 pg_addr = 0; + u32 idx = 0; + + pinfo = (level == I40IW_LEVEL_1) ? 
NULL : palloc->level2.leaf; + + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + chunk_pages = sg_dma_len(sg) >> region->page_shift; + if ((iwmr->type == IW_MEMREG_TYPE_QP) && + !iwpbl->qp_mr.sq_page) + iwpbl->qp_mr.sq_page = sg_page(sg); + for (i = 0; i < chunk_pages; i++) { + pg_addr = sg_dma_address(sg) + + (i << region->page_shift); + + if ((entry + i) == 0) + *pbl = cpu_to_le64(pg_addr & iwmr->page_msk); + else if (!(pg_addr & ~iwmr->page_msk)) + *pbl = cpu_to_le64(pg_addr); + else + continue; + pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx); + } + } +} + +/** + * i40iw_set_hugetlb_values - set MR pg size and mask to huge pg values. + * @addr: virtual address + * @iwmr: mr pointer for this memory registration + */ +static void i40iw_set_hugetlb_values(u64 addr, struct i40iw_mr *iwmr) +{ + struct vm_area_struct *vma; + struct hstate *h; + + vma = find_vma(current->mm, addr); + if (vma && is_vm_hugetlb_page(vma)) { + h = hstate_vma(vma); + if (huge_page_size(h) == 0x200000) { + iwmr->page_size = huge_page_size(h); + iwmr->page_msk = huge_page_mask(h); + } + } +} + +/** + * i40iw_check_mem_contiguous - check if pbls stored in arr are contiguous + * @arr: lvl1 pbl array + * @npages: page count + * @pg_size: page size + * + */ +static bool i40iw_check_mem_contiguous(u64 *arr, u32 npages, u32 pg_size) +{ + u32 pg_idx; + + for (pg_idx = 0; pg_idx < npages; pg_idx++) { + if ((*arr + (pg_size * pg_idx)) != arr[pg_idx]) + return false; + } + return true; +} + +/** + * i40iw_check_mr_contiguous - check if MR is physically contiguous + * @palloc: pbl allocation struct + * @pg_size: page size + */ +static bool i40iw_check_mr_contiguous(struct i40iw_pble_alloc *palloc, u32 pg_size) +{ + struct i40iw_pble_level2 *lvl2 = &palloc->level2; + struct i40iw_pble_info *leaf = lvl2->leaf; + u64 *arr = NULL; + u64 *start_addr = NULL; + int i; + bool ret; + + if (palloc->level == I40IW_LEVEL_1) { + arr = (u64 *)palloc->level1.addr; + ret = i40iw_check_mem_contiguous(arr, palloc->total_cnt, pg_size); + return ret; + } + + start_addr = (u64 *)leaf->addr; + + for (i = 0; i < lvl2->leaf_cnt; i++, leaf++) { + arr = (u64 *)leaf->addr; + if ((*start_addr + (i * pg_size * PBLE_PER_PAGE)) != *arr) + return false; + ret = i40iw_check_mem_contiguous(arr, leaf->cnt, pg_size); + if (!ret) + return false; + } + + return true; +} + +/** + * i40iw_setup_pbles - copy user pg address to pble's + * @iwdev: iwarp device + * @iwmr: mr pointer for this memory registration + * @use_pbles: flag indicating whether to use pble's + */ +static int i40iw_setup_pbles(struct i40iw_device *iwdev, + struct i40iw_mr *iwmr, + bool use_pbles) +{ + struct i40iw_pbl *iwpbl = &iwmr->iwpbl; + struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc; + struct i40iw_pble_info *pinfo; + u64 *pbl; + enum i40iw_status_code status; + enum i40iw_pble_level level = I40IW_LEVEL_1; + + if (use_pbles) { + mutex_lock(&iwdev->pbl_mutex); + status = i40iw_get_pble(&iwdev->sc_dev, iwdev->pble_rsrc, palloc, iwmr->page_cnt); + mutex_unlock(&iwdev->pbl_mutex); + if (status) + return -ENOMEM; + + iwpbl->pbl_allocated = true; + level = palloc->level; + pinfo = (level == I40IW_LEVEL_1) ? 
&palloc->level1 : palloc->level2.leaf; + pbl = (u64 *)pinfo->addr; + } else { + pbl = iwmr->pgaddrmem; + } + + i40iw_copy_user_pgaddrs(iwmr, pbl, level); + + if (use_pbles) + iwmr->pgaddrmem[0] = *pbl; + + return 0; +} + +/** + * i40iw_handle_q_mem - handle memory for qp and cq + * @iwdev: iwarp device + * @req: information for q memory management + * @iwpbl: pble struct + * @use_pbles: flag to use pble + */ +static int i40iw_handle_q_mem(struct i40iw_device *iwdev, + struct i40iw_mem_reg_req *req, + struct i40iw_pbl *iwpbl, + bool use_pbles) +{ + struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc; + struct i40iw_mr *iwmr = iwpbl->iwmr; + struct i40iw_qp_mr *qpmr = &iwpbl->qp_mr; + struct i40iw_cq_mr *cqmr = &iwpbl->cq_mr; + struct i40iw_hmc_pble *hmc_p; + u64 *arr = iwmr->pgaddrmem; + u32 pg_size; + int err; + int total; + bool ret = true; + + total = req->sq_pages + req->rq_pages + req->cq_pages; + pg_size = iwmr->page_size; + + err = i40iw_setup_pbles(iwdev, iwmr, use_pbles); + if (err) + return err; + + if (use_pbles && (palloc->level != I40IW_LEVEL_1)) { + i40iw_free_pble(iwdev->pble_rsrc, palloc); + iwpbl->pbl_allocated = false; + return -ENOMEM; + } + + if (use_pbles) + arr = (u64 *)palloc->level1.addr; + + if (iwmr->type == IW_MEMREG_TYPE_QP) { + hmc_p = &qpmr->sq_pbl; + qpmr->shadow = (dma_addr_t)arr[total]; + + if (use_pbles) { + ret = i40iw_check_mem_contiguous(arr, req->sq_pages, pg_size); + if (ret) + ret = i40iw_check_mem_contiguous(&arr[req->sq_pages], req->rq_pages, pg_size); + } + + if (!ret) { + hmc_p->idx = palloc->level1.idx; + hmc_p = &qpmr->rq_pbl; + hmc_p->idx = palloc->level1.idx + req->sq_pages; + } else { + hmc_p->addr = arr[0]; + hmc_p = &qpmr->rq_pbl; + hmc_p->addr = arr[req->sq_pages]; + } + } else { /* CQ */ + hmc_p = &cqmr->cq_pbl; + cqmr->shadow = (dma_addr_t)arr[total]; + + if (use_pbles) + ret = i40iw_check_mem_contiguous(arr, req->cq_pages, pg_size); + + if (!ret) + hmc_p->idx = palloc->level1.idx; + else + hmc_p->addr = arr[0]; + } + + if (use_pbles && ret) { + i40iw_free_pble(iwdev->pble_rsrc, palloc); + iwpbl->pbl_allocated = false; + } + + return err; +} + +/** + * i40iw_hw_alloc_stag - cqp command to allocate stag + * @iwdev: iwarp device + * @iwmr: iwarp mr pointer + */ +static int i40iw_hw_alloc_stag(struct i40iw_device *iwdev, struct i40iw_mr *iwmr) +{ + struct i40iw_allocate_stag_info *info; + struct i40iw_pd *iwpd = to_iwpd(iwmr->ibmr.pd); + enum i40iw_status_code status; + int err = 0; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) + return -ENOMEM; + + cqp_info = &cqp_request->info; + info = &cqp_info->in.u.alloc_stag.info; + memset(info, 0, sizeof(*info)); + info->page_size = PAGE_SIZE; + info->stag_idx = iwmr->stag >> I40IW_CQPSQ_STAG_IDX_SHIFT; + info->pd_id = iwpd->sc_pd.pd_id; + info->total_len = iwmr->length; + info->remote_access = true; + cqp_info->cqp_cmd = OP_ALLOC_STAG; + cqp_info->post_sq = 1; + cqp_info->in.u.alloc_stag.dev = &iwdev->sc_dev; + cqp_info->in.u.alloc_stag.scratch = (uintptr_t)cqp_request; + + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) { + err = -ENOMEM; + i40iw_pr_err("CQP-OP MR Reg fail"); + } + return err; +} + +/** + * i40iw_alloc_mr - register stag for fast memory registration + * @pd: ibpd pointer + * @mr_type: memory for stag registration + * @max_num_sg: max number of pages + */ +static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ 
+ struct i40iw_pd *iwpd = to_iwpd(pd); + struct i40iw_device *iwdev = to_iwdev(pd->device); + struct i40iw_pble_alloc *palloc; + struct i40iw_pbl *iwpbl; + struct i40iw_mr *iwmr; + enum i40iw_status_code status; + u32 stag; + int err_code = -ENOMEM; + + iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL); + if (!iwmr) + return ERR_PTR(-ENOMEM); + + stag = i40iw_create_stag(iwdev); + if (!stag) { + err_code = -EOVERFLOW; + goto err; + } + stag &= ~I40IW_CQPSQ_STAG_KEY_MASK; + iwmr->stag = stag; + iwmr->ibmr.rkey = stag; + iwmr->ibmr.lkey = stag; + iwmr->ibmr.pd = pd; + iwmr->ibmr.device = pd->device; + iwpbl = &iwmr->iwpbl; + iwpbl->iwmr = iwmr; + iwmr->type = IW_MEMREG_TYPE_MEM; + palloc = &iwpbl->pble_alloc; + iwmr->page_cnt = max_num_sg; + mutex_lock(&iwdev->pbl_mutex); + status = i40iw_get_pble(&iwdev->sc_dev, iwdev->pble_rsrc, palloc, iwmr->page_cnt); + mutex_unlock(&iwdev->pbl_mutex); + if (status) + goto err1; + + if (palloc->level != I40IW_LEVEL_1) + goto err2; + err_code = i40iw_hw_alloc_stag(iwdev, iwmr); + if (err_code) + goto err2; + iwpbl->pbl_allocated = true; + i40iw_add_pdusecount(iwpd); + return &iwmr->ibmr; +err2: + i40iw_free_pble(iwdev->pble_rsrc, palloc); +err1: + i40iw_free_stag(iwdev, stag); +err: + kfree(iwmr); + return ERR_PTR(err_code); +} + +/** + * i40iw_set_page - populate pbl list for fmr + * @ibmr: ib mem to access iwarp mr pointer + * @addr: page dma address fro pbl list + */ +static int i40iw_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct i40iw_mr *iwmr = to_iwmr(ibmr); + struct i40iw_pbl *iwpbl = &iwmr->iwpbl; + struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc; + u64 *pbl; + + if (unlikely(iwmr->npages == iwmr->page_cnt)) + return -ENOMEM; + + pbl = (u64 *)palloc->level1.addr; + pbl[iwmr->npages++] = cpu_to_le64(addr); + return 0; +} + +/** + * i40iw_map_mr_sg - map of sg list for fmr + * @ibmr: ib mem to access iwarp mr pointer + * @sg: scatter gather list for fmr + * @sg_nents: number of sg pages + */ +static int i40iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct i40iw_mr *iwmr = to_iwmr(ibmr); + + iwmr->npages = 0; + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, i40iw_set_page); +} + +/** + * i40iw_drain_sq - drain the send queue + * @ibqp: ib qp pointer + */ +static void i40iw_drain_sq(struct ib_qp *ibqp) +{ + struct i40iw_qp *iwqp = to_iwqp(ibqp); + struct i40iw_sc_qp *qp = &iwqp->sc_qp; + + if (I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring)) + wait_for_completion(&iwqp->sq_drained); +} + +/** + * i40iw_drain_rq - drain the receive queue + * @ibqp: ib qp pointer + */ +static void i40iw_drain_rq(struct ib_qp *ibqp) +{ + struct i40iw_qp *iwqp = to_iwqp(ibqp); + struct i40iw_sc_qp *qp = &iwqp->sc_qp; + + if (I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring)) + wait_for_completion(&iwqp->rq_drained); +} + +/** + * i40iw_hwreg_mr - send cqp command for memory registration + * @iwdev: iwarp device + * @iwmr: iwarp mr pointer + * @access: access for MR + */ +static int i40iw_hwreg_mr(struct i40iw_device *iwdev, + struct i40iw_mr *iwmr, + u16 access) +{ + struct i40iw_pbl *iwpbl = &iwmr->iwpbl; + struct i40iw_reg_ns_stag_info *stag_info; + struct i40iw_pd *iwpd = to_iwpd(iwmr->ibmr.pd); + struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc; + enum i40iw_status_code status; + int err = 0; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) + return -ENOMEM; + + cqp_info = &cqp_request->info; + stag_info = 
&cqp_info->in.u.mr_reg_non_shared.info; + memset(stag_info, 0, sizeof(*stag_info)); + stag_info->va = (void *)(unsigned long)iwpbl->user_base; + stag_info->stag_idx = iwmr->stag >> I40IW_CQPSQ_STAG_IDX_SHIFT; + stag_info->stag_key = (u8)iwmr->stag; + stag_info->total_len = iwmr->length; + stag_info->access_rights = access; + stag_info->pd_id = iwpd->sc_pd.pd_id; + stag_info->addr_type = I40IW_ADDR_TYPE_VA_BASED; + stag_info->page_size = iwmr->page_size; + + if (iwpbl->pbl_allocated) { + if (palloc->level == I40IW_LEVEL_1) { + stag_info->first_pm_pbl_index = palloc->level1.idx; + stag_info->chunk_size = 1; + } else { + stag_info->first_pm_pbl_index = palloc->level2.root.idx; + stag_info->chunk_size = 3; + } + } else { + stag_info->reg_addr_pa = iwmr->pgaddrmem[0]; + } + + cqp_info->cqp_cmd = OP_MR_REG_NON_SHARED; + cqp_info->post_sq = 1; + cqp_info->in.u.mr_reg_non_shared.dev = &iwdev->sc_dev; + cqp_info->in.u.mr_reg_non_shared.scratch = (uintptr_t)cqp_request; + + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) { + err = -ENOMEM; + i40iw_pr_err("CQP-OP MR Reg fail"); + } + return err; +} + +/** + * i40iw_reg_user_mr - Register a user memory region + * @pd: ptr of pd + * @start: virtual start address + * @length: length of mr + * @virt: virtual address + * @acc: access of mr + * @udata: user data + */ +static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, + u64 start, + u64 length, + u64 virt, + int acc, + struct ib_udata *udata) +{ + struct i40iw_pd *iwpd = to_iwpd(pd); + struct i40iw_device *iwdev = to_iwdev(pd->device); + struct i40iw_ucontext *ucontext; + struct i40iw_pble_alloc *palloc; + struct i40iw_pbl *iwpbl; + struct i40iw_mr *iwmr; + struct ib_umem *region; + struct i40iw_mem_reg_req req; + u64 pbl_depth = 0; + u32 stag = 0; + u16 access; + u64 region_length; + bool use_pbles = false; + unsigned long flags; + int err = -ENOSYS; + int ret; + int pg_shift; + + if (iwdev->closing) + return ERR_PTR(-ENODEV); + + if (length > I40IW_MAX_MR_SIZE) + return ERR_PTR(-EINVAL); + region = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(region)) + return (struct ib_mr *)region; + + if (ib_copy_from_udata(&req, udata, sizeof(req))) { + ib_umem_release(region); + return ERR_PTR(-EFAULT); + } + + iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL); + if (!iwmr) { + ib_umem_release(region); + return ERR_PTR(-ENOMEM); + } + + iwpbl = &iwmr->iwpbl; + iwpbl->iwmr = iwmr; + iwmr->region = region; + iwmr->ibmr.pd = pd; + iwmr->ibmr.device = pd->device; + ucontext = to_ucontext(pd->uobject->context); + + iwmr->page_size = PAGE_SIZE; + iwmr->page_msk = PAGE_MASK; + + if (region->hugetlb && (req.reg_type == IW_MEMREG_TYPE_MEM)) + i40iw_set_hugetlb_values(start, iwmr); + + region_length = region->length + (start & (iwmr->page_size - 1)); + pg_shift = ffs(iwmr->page_size) - 1; + pbl_depth = region_length >> pg_shift; + pbl_depth += (region_length & (iwmr->page_size - 1)) ? 
1 : 0; + iwmr->length = region->length; + + iwpbl->user_base = virt; + palloc = &iwpbl->pble_alloc; + + iwmr->type = req.reg_type; + iwmr->page_cnt = (u32)pbl_depth; + + switch (req.reg_type) { + case IW_MEMREG_TYPE_QP: + use_pbles = ((req.sq_pages + req.rq_pages) > 2); + err = i40iw_handle_q_mem(iwdev, &req, iwpbl, use_pbles); + if (err) + goto error; + spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags); + list_add_tail(&iwpbl->list, &ucontext->qp_reg_mem_list); + iwpbl->on_list = true; + spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); + break; + case IW_MEMREG_TYPE_CQ: + use_pbles = (req.cq_pages > 1); + err = i40iw_handle_q_mem(iwdev, &req, iwpbl, use_pbles); + if (err) + goto error; + + spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags); + list_add_tail(&iwpbl->list, &ucontext->cq_reg_mem_list); + iwpbl->on_list = true; + spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags); + break; + case IW_MEMREG_TYPE_MEM: + use_pbles = (iwmr->page_cnt != 1); + access = I40IW_ACCESS_FLAGS_LOCALREAD; + + err = i40iw_setup_pbles(iwdev, iwmr, use_pbles); + if (err) + goto error; + + if (use_pbles) { + ret = i40iw_check_mr_contiguous(palloc, iwmr->page_size); + if (ret) { + i40iw_free_pble(iwdev->pble_rsrc, palloc); + iwpbl->pbl_allocated = false; + } + } + + access |= i40iw_get_user_access(acc); + stag = i40iw_create_stag(iwdev); + if (!stag) { + err = -ENOMEM; + goto error; + } + + iwmr->stag = stag; + iwmr->ibmr.rkey = stag; + iwmr->ibmr.lkey = stag; + + err = i40iw_hwreg_mr(iwdev, iwmr, access); + if (err) { + i40iw_free_stag(iwdev, stag); + goto error; + } + + break; + default: + goto error; + } + + iwmr->type = req.reg_type; + if (req.reg_type == IW_MEMREG_TYPE_MEM) + i40iw_add_pdusecount(iwpd); + return &iwmr->ibmr; + +error: + if (palloc->level != I40IW_LEVEL_0 && iwpbl->pbl_allocated) + i40iw_free_pble(iwdev->pble_rsrc, palloc); + ib_umem_release(region); + kfree(iwmr); + return ERR_PTR(err); +} + +/** + * i40iw_reg_phys_mr - register kernel physical memory + * @pd: ibpd pointer + * @addr: physical address of memory to register + * @size: size of memory to register + * @acc: Access rights + * @iova_start: start of virtual address for physical buffers + */ +struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *pd, + u64 addr, + u64 size, + int acc, + u64 *iova_start) +{ + struct i40iw_pd *iwpd = to_iwpd(pd); + struct i40iw_device *iwdev = to_iwdev(pd->device); + struct i40iw_pbl *iwpbl; + struct i40iw_mr *iwmr; + enum i40iw_status_code status; + u32 stag; + u16 access = I40IW_ACCESS_FLAGS_LOCALREAD; + int ret; + + iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL); + if (!iwmr) + return ERR_PTR(-ENOMEM); + iwmr->ibmr.pd = pd; + iwmr->ibmr.device = pd->device; + iwpbl = &iwmr->iwpbl; + iwpbl->iwmr = iwmr; + iwmr->type = IW_MEMREG_TYPE_MEM; + iwpbl->user_base = *iova_start; + stag = i40iw_create_stag(iwdev); + if (!stag) { + ret = -EOVERFLOW; + goto err; + } + access |= i40iw_get_user_access(acc); + iwmr->stag = stag; + iwmr->ibmr.rkey = stag; + iwmr->ibmr.lkey = stag; + iwmr->page_cnt = 1; + iwmr->pgaddrmem[0] = addr; + iwmr->length = size; + status = i40iw_hwreg_mr(iwdev, iwmr, access); + if (status) { + i40iw_free_stag(iwdev, stag); + ret = -ENOMEM; + goto err; + } + + i40iw_add_pdusecount(iwpd); + return &iwmr->ibmr; + err: + kfree(iwmr); + return ERR_PTR(ret); +} + +/** + * i40iw_get_dma_mr - register physical mem + * @pd: ptr of pd + * @acc: access for memory + */ +static struct ib_mr *i40iw_get_dma_mr(struct ib_pd *pd, int acc) +{ + u64 kva = 0; + + return 
i40iw_reg_phys_mr(pd, 0, 0, acc, &kva); +} + +/** + * i40iw_del_memlist - Deleting pbl list entries for CQ/QP + * @iwmr: iwmr for IB's user page addresses + * @ucontext: ptr to user context + */ +static void i40iw_del_memlist(struct i40iw_mr *iwmr, + struct i40iw_ucontext *ucontext) +{ + struct i40iw_pbl *iwpbl = &iwmr->iwpbl; + unsigned long flags; + + switch (iwmr->type) { + case IW_MEMREG_TYPE_CQ: + spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags); + if (iwpbl->on_list) { + iwpbl->on_list = false; + list_del(&iwpbl->list); + } + spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags); + break; + case IW_MEMREG_TYPE_QP: + spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags); + if (iwpbl->on_list) { + iwpbl->on_list = false; + list_del(&iwpbl->list); + } + spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); + break; + default: + break; + } +} + +/** + * i40iw_dereg_mr - deregister mr + * @ib_mr: mr ptr for dereg + */ +static int i40iw_dereg_mr(struct ib_mr *ib_mr) +{ + struct ib_pd *ibpd = ib_mr->pd; + struct i40iw_pd *iwpd = to_iwpd(ibpd); + struct i40iw_mr *iwmr = to_iwmr(ib_mr); + struct i40iw_device *iwdev = to_iwdev(ib_mr->device); + enum i40iw_status_code status; + struct i40iw_dealloc_stag_info *info; + struct i40iw_pbl *iwpbl = &iwmr->iwpbl; + struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + u32 stag_idx; + + if (iwmr->region) + ib_umem_release(iwmr->region); + + if (iwmr->type != IW_MEMREG_TYPE_MEM) { + if (ibpd->uobject) { + struct i40iw_ucontext *ucontext; + + ucontext = to_ucontext(ibpd->uobject->context); + i40iw_del_memlist(iwmr, ucontext); + } + if (iwpbl->pbl_allocated && iwmr->type != IW_MEMREG_TYPE_QP) + i40iw_free_pble(iwdev->pble_rsrc, palloc); + kfree(iwmr); + return 0; + } + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) + return -ENOMEM; + + cqp_info = &cqp_request->info; + info = &cqp_info->in.u.dealloc_stag.info; + memset(info, 0, sizeof(*info)); + + info->pd_id = cpu_to_le32(iwpd->sc_pd.pd_id & 0x00007fff); + info->stag_idx = RS_64_1(ib_mr->rkey, I40IW_CQPSQ_STAG_IDX_SHIFT); + stag_idx = info->stag_idx; + info->mr = true; + if (iwpbl->pbl_allocated) + info->dealloc_pbl = true; + + cqp_info->cqp_cmd = OP_DEALLOC_STAG; + cqp_info->post_sq = 1; + cqp_info->in.u.dealloc_stag.dev = &iwdev->sc_dev; + cqp_info->in.u.dealloc_stag.scratch = (uintptr_t)cqp_request; + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) + i40iw_pr_err("CQP-OP dealloc failed for stag_idx = 0x%x\n", stag_idx); + i40iw_rem_pdusecount(iwpd, iwdev); + i40iw_free_stag(iwdev, iwmr->stag); + if (iwpbl->pbl_allocated) + i40iw_free_pble(iwdev->pble_rsrc, palloc); + kfree(iwmr); + return 0; +} + +/** + * i40iw_show_rev + */ +static ssize_t i40iw_show_rev(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i40iw_ib_device *iwibdev = container_of(dev, + struct i40iw_ib_device, + ibdev.dev); + u32 hw_rev = iwibdev->iwdev->sc_dev.hw_rev; + + return sprintf(buf, "%x\n", hw_rev); +} + +/** + * i40iw_show_hca + */ +static ssize_t i40iw_show_hca(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "I40IW\n"); +} + +/** + * i40iw_show_board + */ +static ssize_t i40iw_show_board(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%.*s\n", 32, "I40IW Board ID"); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, i40iw_show_rev, NULL); +static DEVICE_ATTR(hca_type, 
S_IRUGO, i40iw_show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, i40iw_show_board, NULL); + +static struct device_attribute *i40iw_dev_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +/** + * i40iw_copy_sg_list - copy sg list for qp + * @sg_list: copied into sg_list + * @sgl: copy from sgl + * @num_sges: count of sg entries + */ +static void i40iw_copy_sg_list(struct i40iw_sge *sg_list, struct ib_sge *sgl, int num_sges) +{ + unsigned int i; + + for (i = 0; (i < num_sges) && (i < I40IW_MAX_WQ_FRAGMENT_COUNT); i++) { + sg_list[i].tag_off = sgl[i].addr; + sg_list[i].len = sgl[i].length; + sg_list[i].stag = sgl[i].lkey; + } +} + +/** + * i40iw_post_send - kernel application wr + * @ibqp: qp ptr for wr + * @ib_wr: work request ptr + * @bad_wr: return of bad wr if err + */ +static int i40iw_post_send(struct ib_qp *ibqp, + struct ib_send_wr *ib_wr, + struct ib_send_wr **bad_wr) +{ + struct i40iw_qp *iwqp; + struct i40iw_qp_uk *ukqp; + struct i40iw_post_sq_info info; + enum i40iw_status_code ret; + int err = 0; + unsigned long flags; + bool inv_stag; + + iwqp = (struct i40iw_qp *)ibqp; + ukqp = &iwqp->sc_qp.qp_uk; + + spin_lock_irqsave(&iwqp->lock, flags); + + if (iwqp->flush_issued) { + err = -EINVAL; + goto out; + } + + while (ib_wr) { + inv_stag = false; + memset(&info, 0, sizeof(info)); + info.wr_id = (u64)(ib_wr->wr_id); + if ((ib_wr->send_flags & IB_SEND_SIGNALED) || iwqp->sig_all) + info.signaled = true; + if (ib_wr->send_flags & IB_SEND_FENCE) + info.read_fence = true; + + switch (ib_wr->opcode) { + case IB_WR_SEND: + /* fall-through */ + case IB_WR_SEND_WITH_INV: + if (ib_wr->opcode == IB_WR_SEND) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + info.op_type = I40IW_OP_TYPE_SEND_SOL; + else + info.op_type = I40IW_OP_TYPE_SEND; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + info.op_type = I40IW_OP_TYPE_SEND_SOL_INV; + else + info.op_type = I40IW_OP_TYPE_SEND_INV; + } + + if (ib_wr->send_flags & IB_SEND_INLINE) { + info.op.inline_send.data = (void *)(unsigned long)ib_wr->sg_list[0].addr; + info.op.inline_send.len = ib_wr->sg_list[0].length; + ret = ukqp->ops.iw_inline_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false); + } else { + info.op.send.num_sges = ib_wr->num_sge; + info.op.send.sg_list = (struct i40iw_sge *)ib_wr->sg_list; + ret = ukqp->ops.iw_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false); + } + + if (ret) { + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; + } + break; + case IB_WR_RDMA_WRITE: + info.op_type = I40IW_OP_TYPE_RDMA_WRITE; + + if (ib_wr->send_flags & IB_SEND_INLINE) { + info.op.inline_rdma_write.data = (void *)(unsigned long)ib_wr->sg_list[0].addr; + info.op.inline_rdma_write.len = ib_wr->sg_list[0].length; + info.op.inline_rdma_write.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; + info.op.inline_rdma_write.rem_addr.stag = rdma_wr(ib_wr)->rkey; + ret = ukqp->ops.iw_inline_rdma_write(ukqp, &info, false); + } else { + info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list; + info.op.rdma_write.num_lo_sges = ib_wr->num_sge; + info.op.rdma_write.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; + info.op.rdma_write.rem_addr.stag = rdma_wr(ib_wr)->rkey; + ret = ukqp->ops.iw_rdma_write(ukqp, &info, false); + } + + if (ret) { + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; + } + break; + case IB_WR_RDMA_READ_WITH_INV: + inv_stag = true; + /* fall-through*/ + case IB_WR_RDMA_READ: + if (ib_wr->num_sge > I40IW_MAX_SGE_RD) { + err = -EINVAL; + 
break; + } + info.op_type = I40IW_OP_TYPE_RDMA_READ; + info.op.rdma_read.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; + info.op.rdma_read.rem_addr.stag = rdma_wr(ib_wr)->rkey; + info.op.rdma_read.lo_addr.tag_off = ib_wr->sg_list->addr; + info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey; + info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length; + ret = ukqp->ops.iw_rdma_read(ukqp, &info, inv_stag, false); + if (ret) { + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; + } + break; + case IB_WR_LOCAL_INV: + info.op_type = I40IW_OP_TYPE_INV_STAG; + info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey; + ret = ukqp->ops.iw_stag_local_invalidate(ukqp, &info, true); + if (ret) + err = -ENOMEM; + break; + case IB_WR_REG_MR: + { + struct i40iw_mr *iwmr = to_iwmr(reg_wr(ib_wr)->mr); + int flags = reg_wr(ib_wr)->access; + struct i40iw_pble_alloc *palloc = &iwmr->iwpbl.pble_alloc; + struct i40iw_sc_dev *dev = &iwqp->iwdev->sc_dev; + struct i40iw_fast_reg_stag_info info; + + memset(&info, 0, sizeof(info)); + info.access_rights = I40IW_ACCESS_FLAGS_LOCALREAD; + info.access_rights |= i40iw_get_user_access(flags); + info.stag_key = reg_wr(ib_wr)->key & 0xff; + info.stag_idx = reg_wr(ib_wr)->key >> 8; + info.page_size = reg_wr(ib_wr)->mr->page_size; + info.wr_id = ib_wr->wr_id; + + info.addr_type = I40IW_ADDR_TYPE_VA_BASED; + info.va = (void *)(uintptr_t)iwmr->ibmr.iova; + info.total_len = iwmr->ibmr.length; + info.reg_addr_pa = *(u64 *)palloc->level1.addr; + info.first_pm_pbl_index = palloc->level1.idx; + info.local_fence = ib_wr->send_flags & IB_SEND_FENCE; + info.signaled = ib_wr->send_flags & IB_SEND_SIGNALED; + + if (iwmr->npages > I40IW_MIN_PAGES_PER_FMR) + info.chunk_size = 1; + + ret = dev->iw_priv_qp_ops->iw_mr_fast_register(&iwqp->sc_qp, &info, true); + if (ret) + err = -ENOMEM; + break; + } + default: + err = -EINVAL; + i40iw_pr_err(" upost_send bad opcode = 0x%x\n", + ib_wr->opcode); + break; + } + + if (err) + break; + ib_wr = ib_wr->next; + } + +out: + if (err) + *bad_wr = ib_wr; + else + ukqp->ops.iw_qp_post_wr(ukqp); + spin_unlock_irqrestore(&iwqp->lock, flags); + + return err; +} + +/** + * i40iw_post_recv - post receive wr for kernel application + * @ibqp: ib qp pointer + * @ib_wr: work request for receive + * @bad_wr: bad wr caused an error + */ +static int i40iw_post_recv(struct ib_qp *ibqp, + struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr) +{ + struct i40iw_qp *iwqp; + struct i40iw_qp_uk *ukqp; + struct i40iw_post_rq_info post_recv; + struct i40iw_sge sg_list[I40IW_MAX_WQ_FRAGMENT_COUNT]; + enum i40iw_status_code ret = 0; + unsigned long flags; + int err = 0; + + iwqp = (struct i40iw_qp *)ibqp; + ukqp = &iwqp->sc_qp.qp_uk; + + memset(&post_recv, 0, sizeof(post_recv)); + spin_lock_irqsave(&iwqp->lock, flags); + + if (iwqp->flush_issued) { + err = -EINVAL; + goto out; + } + + while (ib_wr) { + post_recv.num_sges = ib_wr->num_sge; + post_recv.wr_id = ib_wr->wr_id; + i40iw_copy_sg_list(sg_list, ib_wr->sg_list, ib_wr->num_sge); + post_recv.sg_list = sg_list; + ret = ukqp->ops.iw_post_receive(ukqp, &post_recv); + if (ret) { + i40iw_pr_err(" post_recv err %d\n", ret); + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; + *bad_wr = ib_wr; + goto out; + } + ib_wr = ib_wr->next; + } + out: + spin_unlock_irqrestore(&iwqp->lock, flags); + return err; +} + +/** + * i40iw_poll_cq - poll cq for completion (kernel apps) + * @ibcq: cq to poll + * @num_entries: number of entries to poll + * @entry: wr of 
entry completed + */ +static int i40iw_poll_cq(struct ib_cq *ibcq, + int num_entries, + struct ib_wc *entry) +{ + struct i40iw_cq *iwcq; + int cqe_count = 0; + struct i40iw_cq_poll_info cq_poll_info; + enum i40iw_status_code ret; + struct i40iw_cq_uk *ukcq; + struct i40iw_sc_qp *qp; + struct i40iw_qp *iwqp; + unsigned long flags; + + iwcq = (struct i40iw_cq *)ibcq; + ukcq = &iwcq->sc_cq.cq_uk; + + spin_lock_irqsave(&iwcq->lock, flags); + while (cqe_count < num_entries) { + ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info); + if (ret == I40IW_ERR_QUEUE_EMPTY) { + break; + } else if (ret == I40IW_ERR_QUEUE_DESTROYED) { + continue; + } else if (ret) { + if (!cqe_count) + cqe_count = -1; + break; + } + entry->wc_flags = 0; + entry->wr_id = cq_poll_info.wr_id; + if (cq_poll_info.error) { + entry->status = IB_WC_WR_FLUSH_ERR; + entry->vendor_err = cq_poll_info.major_err << 16 | cq_poll_info.minor_err; + } else { + entry->status = IB_WC_SUCCESS; + } + + switch (cq_poll_info.op_type) { + case I40IW_OP_TYPE_RDMA_WRITE: + entry->opcode = IB_WC_RDMA_WRITE; + break; + case I40IW_OP_TYPE_RDMA_READ_INV_STAG: + case I40IW_OP_TYPE_RDMA_READ: + entry->opcode = IB_WC_RDMA_READ; + break; + case I40IW_OP_TYPE_SEND_SOL: + case I40IW_OP_TYPE_SEND_SOL_INV: + case I40IW_OP_TYPE_SEND_INV: + case I40IW_OP_TYPE_SEND: + entry->opcode = IB_WC_SEND; + break; + case I40IW_OP_TYPE_REC: + entry->opcode = IB_WC_RECV; + break; + default: + entry->opcode = IB_WC_RECV; + break; + } + + entry->ex.imm_data = 0; + qp = (struct i40iw_sc_qp *)cq_poll_info.qp_handle; + entry->qp = (struct ib_qp *)qp->back_qp; + entry->src_qp = cq_poll_info.qp_id; + iwqp = (struct i40iw_qp *)qp->back_qp; + if (iwqp->iwarp_state > I40IW_QP_STATE_RTS) { + if (!I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring)) + complete(&iwqp->sq_drained); + if (!I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring)) + complete(&iwqp->rq_drained); + } + entry->byte_len = cq_poll_info.bytes_xfered; + entry++; + cqe_count++; + } + spin_unlock_irqrestore(&iwcq->lock, flags); + return cqe_count; +} + +/** + * i40iw_req_notify_cq - arm cq kernel application + * @ibcq: cq to arm + * @notify_flags: notification flags + */ +static int i40iw_req_notify_cq(struct ib_cq *ibcq, + enum ib_cq_notify_flags notify_flags) +{ + struct i40iw_cq *iwcq; + struct i40iw_cq_uk *ukcq; + unsigned long flags; + enum i40iw_completion_notify cq_notify = IW_CQ_COMPL_EVENT; + + iwcq = (struct i40iw_cq *)ibcq; + ukcq = &iwcq->sc_cq.cq_uk; + if (notify_flags == IB_CQ_SOLICITED) + cq_notify = IW_CQ_COMPL_SOLICITED; + spin_lock_irqsave(&iwcq->lock, flags); + ukcq->ops.iw_cq_request_notification(ukcq, cq_notify); + spin_unlock_irqrestore(&iwcq->lock, flags); + return 0; +} + +/** + * i40iw_port_immutable - return port's immutable data + * @ibdev: ib dev struct + * @port_num: port number + * @immutable: immutable data for the port return + */ +static int i40iw_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + err = ib_query_port(ibdev, port_num, &attr); + + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static const char * const i40iw_hw_stat_names[] = { + // 32bit names + [I40IW_HW_STAT_INDEX_IP4RXDISCARD] = "ip4InDiscards", + [I40IW_HW_STAT_INDEX_IP4RXTRUNC] = "ip4InTruncatedPkts", + [I40IW_HW_STAT_INDEX_IP4TXNOROUTE] = "ip4OutNoRoutes", + [I40IW_HW_STAT_INDEX_IP6RXDISCARD] = 
"ip6InDiscards", + [I40IW_HW_STAT_INDEX_IP6RXTRUNC] = "ip6InTruncatedPkts", + [I40IW_HW_STAT_INDEX_IP6TXNOROUTE] = "ip6OutNoRoutes", + [I40IW_HW_STAT_INDEX_TCPRTXSEG] = "tcpRetransSegs", + [I40IW_HW_STAT_INDEX_TCPRXOPTERR] = "tcpInOptErrors", + [I40IW_HW_STAT_INDEX_TCPRXPROTOERR] = "tcpInProtoErrors", + // 64bit names + [I40IW_HW_STAT_INDEX_IP4RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InOctets", + [I40IW_HW_STAT_INDEX_IP4RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InPkts", + [I40IW_HW_STAT_INDEX_IP4RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InReasmRqd", + [I40IW_HW_STAT_INDEX_IP4RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InMcastPkts", + [I40IW_HW_STAT_INDEX_IP4TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutOctets", + [I40IW_HW_STAT_INDEX_IP4TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutPkts", + [I40IW_HW_STAT_INDEX_IP4TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutSegRqd", + [I40IW_HW_STAT_INDEX_IP4TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutMcastPkts", + [I40IW_HW_STAT_INDEX_IP6RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InOctets", + [I40IW_HW_STAT_INDEX_IP6RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InPkts", + [I40IW_HW_STAT_INDEX_IP6RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InReasmRqd", + [I40IW_HW_STAT_INDEX_IP6RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InMcastPkts", + [I40IW_HW_STAT_INDEX_IP6TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutOctets", + [I40IW_HW_STAT_INDEX_IP6TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutPkts", + [I40IW_HW_STAT_INDEX_IP6TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutSegRqd", + [I40IW_HW_STAT_INDEX_IP6TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutMcastPkts", + [I40IW_HW_STAT_INDEX_TCPRXSEGS + I40IW_HW_STAT_INDEX_MAX_32] = + "tcpInSegs", + [I40IW_HW_STAT_INDEX_TCPTXSEG + I40IW_HW_STAT_INDEX_MAX_32] = + "tcpOutSegs", + [I40IW_HW_STAT_INDEX_RDMARXRDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwInRdmaReads", + [I40IW_HW_STAT_INDEX_RDMARXSNDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwInRdmaSends", + [I40IW_HW_STAT_INDEX_RDMARXWRS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwInRdmaWrites", + [I40IW_HW_STAT_INDEX_RDMATXRDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwOutRdmaReads", + [I40IW_HW_STAT_INDEX_RDMATXSNDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwOutRdmaSends", + [I40IW_HW_STAT_INDEX_RDMATXWRS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwOutRdmaWrites", + [I40IW_HW_STAT_INDEX_RDMAVBND + I40IW_HW_STAT_INDEX_MAX_32] = + "iwRdmaBnd", + [I40IW_HW_STAT_INDEX_RDMAVINV + I40IW_HW_STAT_INDEX_MAX_32] = + "iwRdmaInv" +}; + +static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str) +{ + u32 firmware_version = I40IW_FW_VERSION; + + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", firmware_version, + (firmware_version & 0x000000ff)); +} + +/** + * i40iw_alloc_hw_stats - Allocate a hw stats structure + * @ibdev: device pointer from stack + * @port_num: port number + */ +static struct rdma_hw_stats *i40iw_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + int num_counters = I40IW_HW_STAT_INDEX_MAX_32 + + I40IW_HW_STAT_INDEX_MAX_64; + unsigned long lifespan = RDMA_HW_STATS_DEFAULT_LIFESPAN; + + BUILD_BUG_ON(ARRAY_SIZE(i40iw_hw_stat_names) != + (I40IW_HW_STAT_INDEX_MAX_32 + + I40IW_HW_STAT_INDEX_MAX_64)); + + /* + * PFs get the default update lifespan, but VFs only update once + * per second + */ + if (!dev->is_pf) + lifespan = 1000; + return rdma_alloc_hw_stats_struct(i40iw_hw_stat_names, num_counters, + lifespan); +} + +/** + * i40iw_get_hw_stats - Populates 
the rdma_hw_stats structure + * @ibdev: device pointer from stack + * @stats: stats pointer from stack + * @port_num: port number + * @index: which hw counter the stack is requesting we update + */ +static int i40iw_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + struct i40iw_vsi_pestat *devstat = iwdev->vsi.pestat; + struct i40iw_dev_hw_stats *hw_stats = &devstat->hw_stats; + + if (dev->is_pf) { + i40iw_hw_stats_read_all(devstat, &devstat->hw_stats); + } else { + if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats)) + return -ENOSYS; + } + + memcpy(&stats->value[0], hw_stats, sizeof(*hw_stats)); + + return stats->num_counters; +} + +/** + * i40iw_query_gid - Query port GID + * @ibdev: device pointer from stack + * @port: port number + * @index: Entry index + * @gid: Global ID + */ +static int i40iw_query_gid(struct ib_device *ibdev, + u8 port, + int index, + union ib_gid *gid) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + + memset(gid->raw, 0, sizeof(gid->raw)); + ether_addr_copy(gid->raw, iwdev->netdev->dev_addr); + return 0; +} + +/** + * i40iw_modify_port Modify port properties + * @ibdev: device pointer from stack + * @port: port number + * @port_modify_mask: mask for port modifications + * @props: port properties + */ +static int i40iw_modify_port(struct ib_device *ibdev, + u8 port, + int port_modify_mask, + struct ib_port_modify *props) +{ + return -ENOSYS; +} + +/** + * i40iw_query_pkey - Query partition key + * @ibdev: device pointer from stack + * @port: port number + * @index: index of pkey + * @pkey: pointer to store the pkey + */ +static int i40iw_query_pkey(struct ib_device *ibdev, + u8 port, + u16 index, + u16 *pkey) +{ + *pkey = 0; + return 0; +} + +/** + * i40iw_create_ah - create address handle + * @ibpd: ptr of pd + * @ah_attr: address handle attributes + */ +static struct ib_ah *i40iw_create_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *attr, + struct ib_udata *udata) + +{ + return ERR_PTR(-ENOSYS); +} + +/** + * i40iw_destroy_ah - Destroy address handle + * @ah: pointer to address handle + */ +static int i40iw_destroy_ah(struct ib_ah *ah) +{ + return -ENOSYS; +} + +/** + * i40iw_get_vector_affinity - report IRQ affinity mask + * @ibdev: IB device + * @comp_vector: completion vector index + */ +static const struct cpumask *i40iw_get_vector_affinity(struct ib_device *ibdev, + int comp_vector) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_msix_vector *msix_vec; + + if (iwdev->msix_shared) + msix_vec = &iwdev->iw_msixtbl[comp_vector]; + else + msix_vec = &iwdev->iw_msixtbl[comp_vector + 1]; + + return irq_get_affinity_mask(msix_vec->irq); +} + +/** + * i40iw_init_rdma_device - initialization of iwarp device + * @iwdev: iwarp device + */ +static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev) +{ + struct i40iw_ib_device *iwibdev; + struct net_device *netdev = iwdev->netdev; + struct pci_dev *pcidev = (struct pci_dev *)iwdev->hw.dev_context; + + iwibdev = (struct i40iw_ib_device *)ib_alloc_device(sizeof(*iwibdev)); + if (!iwibdev) { + i40iw_pr_err("iwdev == NULL\n"); + return NULL; + } + strlcpy(iwibdev->ibdev.name, "i40iw%d", IB_DEVICE_NAME_MAX); + iwibdev->ibdev.owner = THIS_MODULE; + iwdev->iwibdev = iwibdev; + iwibdev->iwdev = iwdev; + + iwibdev->ibdev.node_type = RDMA_NODE_RNIC; + ether_addr_copy((u8 *)&iwibdev->ibdev.node_guid, netdev->dev_addr); + + 
iwibdev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_POST_SEND); + iwibdev->ibdev.phys_port_cnt = 1; + iwibdev->ibdev.num_comp_vectors = iwdev->ceqs_count; + iwibdev->ibdev.dev.parent = &pcidev->dev; + iwibdev->ibdev.query_port = i40iw_query_port; + iwibdev->ibdev.modify_port = i40iw_modify_port; + iwibdev->ibdev.query_pkey = i40iw_query_pkey; + iwibdev->ibdev.query_gid = i40iw_query_gid; + iwibdev->ibdev.alloc_ucontext = i40iw_alloc_ucontext; + iwibdev->ibdev.dealloc_ucontext = i40iw_dealloc_ucontext; + iwibdev->ibdev.mmap = i40iw_mmap; + iwibdev->ibdev.alloc_pd = i40iw_alloc_pd; + iwibdev->ibdev.dealloc_pd = i40iw_dealloc_pd; + iwibdev->ibdev.create_qp = i40iw_create_qp; + iwibdev->ibdev.modify_qp = i40iw_modify_qp; + iwibdev->ibdev.query_qp = i40iw_query_qp; + iwibdev->ibdev.destroy_qp = i40iw_destroy_qp; + iwibdev->ibdev.create_cq = i40iw_create_cq; + iwibdev->ibdev.destroy_cq = i40iw_destroy_cq; + iwibdev->ibdev.get_dma_mr = i40iw_get_dma_mr; + iwibdev->ibdev.reg_user_mr = i40iw_reg_user_mr; + iwibdev->ibdev.dereg_mr = i40iw_dereg_mr; + iwibdev->ibdev.alloc_hw_stats = i40iw_alloc_hw_stats; + iwibdev->ibdev.get_hw_stats = i40iw_get_hw_stats; + iwibdev->ibdev.query_device = i40iw_query_device; + iwibdev->ibdev.create_ah = i40iw_create_ah; + iwibdev->ibdev.destroy_ah = i40iw_destroy_ah; + iwibdev->ibdev.drain_sq = i40iw_drain_sq; + iwibdev->ibdev.drain_rq = i40iw_drain_rq; + iwibdev->ibdev.alloc_mr = i40iw_alloc_mr; + iwibdev->ibdev.map_mr_sg = i40iw_map_mr_sg; + iwibdev->ibdev.iwcm = kzalloc(sizeof(*iwibdev->ibdev.iwcm), GFP_KERNEL); + if (!iwibdev->ibdev.iwcm) { + ib_dealloc_device(&iwibdev->ibdev); + return NULL; + } + + iwibdev->ibdev.iwcm->add_ref = i40iw_add_ref; + iwibdev->ibdev.iwcm->rem_ref = i40iw_rem_ref; + iwibdev->ibdev.iwcm->get_qp = i40iw_get_qp; + iwibdev->ibdev.iwcm->connect = i40iw_connect; + iwibdev->ibdev.iwcm->accept = i40iw_accept; + iwibdev->ibdev.iwcm->reject = i40iw_reject; + iwibdev->ibdev.iwcm->create_listen = i40iw_create_listen; + iwibdev->ibdev.iwcm->destroy_listen = i40iw_destroy_listen; + memcpy(iwibdev->ibdev.iwcm->ifname, netdev->name, + sizeof(iwibdev->ibdev.iwcm->ifname)); + iwibdev->ibdev.get_port_immutable = i40iw_port_immutable; + iwibdev->ibdev.get_dev_fw_str = i40iw_get_dev_fw_str; + iwibdev->ibdev.poll_cq = i40iw_poll_cq; + iwibdev->ibdev.req_notify_cq = i40iw_req_notify_cq; + iwibdev->ibdev.post_send = i40iw_post_send; + iwibdev->ibdev.post_recv = i40iw_post_recv; + iwibdev->ibdev.get_vector_affinity = i40iw_get_vector_affinity; + + return iwibdev; +} + +/** + * i40iw_port_ibevent - indicate port event + * @iwdev: iwarp device + */ +void i40iw_port_ibevent(struct i40iw_device *iwdev) +{ + struct i40iw_ib_device *iwibdev = iwdev->iwibdev; 
+ struct ib_event event; + + event.device = &iwibdev->ibdev; + event.element.port_num = 1; + event.event = iwdev->iw_status ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + ib_dispatch_event(&event); +} + +/** + * i40iw_unregister_rdma_device - unregister of iwarp from IB + * @iwibdev: rdma device ptr + */ +static void i40iw_unregister_rdma_device(struct i40iw_ib_device *iwibdev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(i40iw_dev_attributes); ++i) + device_remove_file(&iwibdev->ibdev.dev, + i40iw_dev_attributes[i]); + ib_unregister_device(&iwibdev->ibdev); +} + +/** + * i40iw_destroy_rdma_device - destroy rdma device and free resources + * @iwibdev: IB device ptr + */ +void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev) +{ + if (!iwibdev) + return; + + i40iw_unregister_rdma_device(iwibdev); + kfree(iwibdev->ibdev.iwcm); + iwibdev->ibdev.iwcm = NULL; + wait_event_timeout(iwibdev->iwdev->close_wq, + !atomic64_read(&iwibdev->iwdev->use_count), + I40IW_EVENT_TIMEOUT); + ib_dealloc_device(&iwibdev->ibdev); +} + +/** + * i40iw_register_rdma_device - register iwarp device to IB + * @iwdev: iwarp device + */ +int i40iw_register_rdma_device(struct i40iw_device *iwdev) +{ + int i, ret; + struct i40iw_ib_device *iwibdev; + + iwdev->iwibdev = i40iw_init_rdma_device(iwdev); + if (!iwdev->iwibdev) + return -ENOMEM; + iwibdev = iwdev->iwibdev; + + iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW; + ret = ib_register_device(&iwibdev->ibdev, NULL); + if (ret) + goto error; + + for (i = 0; i < ARRAY_SIZE(i40iw_dev_attributes); ++i) { + ret = + device_create_file(&iwibdev->ibdev.dev, + i40iw_dev_attributes[i]); + if (ret) { + while (i > 0) { + i--; + device_remove_file(&iwibdev->ibdev.dev, i40iw_dev_attributes[i]); + } + ib_unregister_device(&iwibdev->ibdev); + goto error; + } + } + return 0; +error: + kfree(iwdev->iwibdev->ibdev.iwcm); + iwdev->iwibdev->ibdev.iwcm = NULL; + ib_dealloc_device(&iwdev->iwibdev->ibdev); + return ret; +} diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h new file mode 100644 index 0000000..76cf173 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.h @@ -0,0 +1,180 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_VERBS_H +#define I40IW_VERBS_H + +struct i40iw_ucontext { + struct ib_ucontext ibucontext; + struct i40iw_device *iwdev; + struct list_head cq_reg_mem_list; + spinlock_t cq_reg_mem_list_lock; /* memory list for cq's */ + struct list_head qp_reg_mem_list; + spinlock_t qp_reg_mem_list_lock; /* memory list for qp's */ + int abi_ver; +}; + +struct i40iw_pd { + struct ib_pd ibpd; + struct i40iw_sc_pd sc_pd; + atomic_t usecount; +}; + +struct i40iw_hmc_pble { + union { + u32 idx; + dma_addr_t addr; + }; +}; + +struct i40iw_cq_mr { + struct i40iw_hmc_pble cq_pbl; + dma_addr_t shadow; +}; + +struct i40iw_qp_mr { + struct i40iw_hmc_pble sq_pbl; + struct i40iw_hmc_pble rq_pbl; + dma_addr_t shadow; + struct page *sq_page; +}; + +struct i40iw_pbl { + struct list_head list; + union { + struct i40iw_qp_mr qp_mr; + struct i40iw_cq_mr cq_mr; + }; + + bool pbl_allocated; + bool on_list; + u64 user_base; + struct i40iw_pble_alloc pble_alloc; + struct i40iw_mr *iwmr; +}; + +#define MAX_SAVE_PAGE_ADDRS 4 +struct i40iw_mr { + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + struct ib_fmr ibfmr; + }; + struct ib_umem *region; + u16 type; + u32 page_cnt; + u32 page_size; + u64 page_msk; + u32 npages; + u32 stag; + u64 length; + u64 pgaddrmem[MAX_SAVE_PAGE_ADDRS]; + struct i40iw_pbl iwpbl; +}; + +struct i40iw_cq { + struct ib_cq ibcq; + struct i40iw_sc_cq sc_cq; + u16 cq_head; + u16 cq_size; + u16 cq_number; + bool user_mode; + u32 polled_completions; + u32 cq_mem_size; + struct i40iw_dma_mem kmem; + spinlock_t lock; /* for poll cq */ + struct i40iw_pbl *iwpbl; +}; + +struct disconn_work { + struct work_struct work; + struct i40iw_qp *iwqp; +}; + +struct iw_cm_id; +struct ietf_mpa_frame; +struct i40iw_ud_file; + +struct i40iw_qp_kmode { + struct i40iw_dma_mem dma_mem; + u64 *wrid_mem; +}; + +struct i40iw_qp { + struct ib_qp ibqp; + struct i40iw_sc_qp sc_qp; + struct i40iw_device *iwdev; + struct i40iw_cq *iwscq; + struct i40iw_cq *iwrcq; + struct i40iw_pd *iwpd; + struct i40iw_qp_host_ctx_info ctx_info; + struct i40iwarp_offload_info iwarp_info; + void *allocated_buffer; + atomic_t refcount; + struct iw_cm_id *cm_id; + void *cm_node; + struct ib_mr *lsmm_mr; + struct work_struct work; + enum ib_qp_state ibqp_state; + u32 iwarp_state; + u32 qp_mem_size; + u32 last_aeq; + atomic_t close_timer_started; + spinlock_t lock; /* for post work requests */ + struct i40iw_qp_context *iwqp_context; + void *pbl_vbase; + dma_addr_t pbl_pbase; + struct page *page; + u8 active_conn:1; + u8 user_mode:1; + u8 hte_added:1; + u8 flush_issued:1; + u8 destroyed:1; + u8 sig_all:1; + u8 pau_mode:1; + u8 rsvd:1; + u16 term_sq_flush_code; + u16 term_rq_flush_code; + u8 hw_iwarp_state; + u8 hw_tcp_state; + struct i40iw_qp_kmode kqp; + struct i40iw_dma_mem host_ctx; + struct timer_list terminate_timer; + struct i40iw_pbl iwpbl; + struct i40iw_dma_mem q2_ctx_mem; + struct i40iw_dma_mem ietf_mem; + struct completion sq_drained; + struct completion rq_drained; +}; +#endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.c b/drivers/infiniband/hw/i40iw/i40iw_vf.c new file mode 100644 index 0000000..e33d481 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_vf.c @@ -0,0 
+1,85 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#include "i40iw_osdep.h" +#include "i40iw_register.h" +#include "i40iw_status.h" +#include "i40iw_hmc.h" +#include "i40iw_d.h" +#include "i40iw_type.h" +#include "i40iw_p.h" +#include "i40iw_vf.h" + +/** + * i40iw_manage_vf_pble_bp - manage vf pble + * @cqp: cqp for cqp' sq wqe + * @info: pble info + * @scratch: pointer for completion + * @post_sq: to post and ring + */ +enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp, + struct i40iw_manage_vf_pble_info *info, + u64 scratch, + bool post_sq) +{ + u64 *wqe; + u64 temp, header, pd_pl_pba = 0; + + wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return I40IW_ERR_RING_FULL; + + temp = LS_64(info->pd_entry_cnt, I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT) | + LS_64(info->first_pd_index, I40IW_CQPSQ_MVPBP_FIRST_PD_INX) | + LS_64(info->sd_index, I40IW_CQPSQ_MVPBP_SD_INX); + set_64bit_val(wqe, 16, temp); + + header = LS_64((info->inv_pd_ent ? 1 : 0), I40IW_CQPSQ_MVPBP_INV_PD_ENT) | + LS_64(I40IW_CQP_OP_MANAGE_VF_PBLE_BP, I40IW_CQPSQ_OPCODE) | + LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); + set_64bit_val(wqe, 24, header); + + pd_pl_pba = LS_64(info->pd_pl_pba >> 3, I40IW_CQPSQ_MVPBP_PD_PLPBA); + set_64bit_val(wqe, 32, pd_pl_pba); + + i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE VF_PBLE_BP WQE", wqe, I40IW_CQP_WQE_SIZE * 8); + + if (post_sq) + i40iw_sc_cqp_post_sq(cqp); + return 0; +} + +const struct i40iw_vf_cqp_ops iw_vf_cqp_ops = { + i40iw_manage_vf_pble_bp +}; diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.h b/drivers/infiniband/hw/i40iw/i40iw_vf.h new file mode 100644 index 0000000..4359559 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_vf.h @@ -0,0 +1,62 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. 
You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_VF_H +#define I40IW_VF_H + +struct i40iw_sc_cqp; + +struct i40iw_manage_vf_pble_info { + u32 sd_index; + u16 first_pd_index; + u16 pd_entry_cnt; + u8 inv_pd_ent; + u64 pd_pl_pba; +}; + +struct i40iw_vf_cqp_ops { + enum i40iw_status_code (*manage_vf_pble_bp)(struct i40iw_sc_cqp *, + struct i40iw_manage_vf_pble_info *, + u64, + bool); +}; + +enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp, + struct i40iw_manage_vf_pble_info *info, + u64 scratch, + bool post_sq); + +extern const struct i40iw_vf_cqp_ops iw_vf_cqp_ops; + +#endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c new file mode 100644 index 0000000..48fd327 --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c @@ -0,0 +1,756 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#include "i40iw_osdep.h" +#include "i40iw_register.h" +#include "i40iw_status.h" +#include "i40iw_hmc.h" +#include "i40iw_d.h" +#include "i40iw_type.h" +#include "i40iw_p.h" +#include "i40iw_virtchnl.h" + +/** + * vchnl_vf_send_get_ver_req - Request Channel version + * @dev: IWARP device pointer + * @vchnl_req: Virtual channel message request pointer + */ +static enum i40iw_status_code vchnl_vf_send_get_ver_req(struct i40iw_sc_dev *dev, + struct i40iw_virtchnl_req *vchnl_req) +{ + enum i40iw_status_code ret_code = I40IW_ERR_NOT_READY; + struct i40iw_virtchnl_op_buf *vchnl_msg = vchnl_req->vchnl_msg; + + if (!dev->vchnl_up) + return ret_code; + + memset(vchnl_msg, 0, sizeof(*vchnl_msg)); + vchnl_msg->iw_chnl_op_ctx = (uintptr_t)vchnl_req; + vchnl_msg->iw_chnl_buf_len = sizeof(*vchnl_msg); + vchnl_msg->iw_op_code = I40IW_VCHNL_OP_GET_VER; + vchnl_msg->iw_op_ver = I40IW_VCHNL_OP_GET_VER_V0; + ret_code = dev->vchnl_if.vchnl_send(dev, 0, (u8 *)vchnl_msg, vchnl_msg->iw_chnl_buf_len); + if (ret_code) + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: virt channel send failed 0x%x\n", __func__, ret_code); + return ret_code; +} + +/** + * vchnl_vf_send_get_hmc_fcn_req - Request HMC Function from VF + * @dev: IWARP device pointer + * @vchnl_req: Virtual channel message request pointer + */ +static enum i40iw_status_code vchnl_vf_send_get_hmc_fcn_req(struct i40iw_sc_dev *dev, + struct i40iw_virtchnl_req *vchnl_req) +{ + enum i40iw_status_code ret_code = I40IW_ERR_NOT_READY; + struct i40iw_virtchnl_op_buf *vchnl_msg = vchnl_req->vchnl_msg; + + if (!dev->vchnl_up) + return ret_code; + + memset(vchnl_msg, 0, sizeof(*vchnl_msg)); + vchnl_msg->iw_chnl_op_ctx = (uintptr_t)vchnl_req; + vchnl_msg->iw_chnl_buf_len = sizeof(*vchnl_msg); + vchnl_msg->iw_op_code = I40IW_VCHNL_OP_GET_HMC_FCN; + vchnl_msg->iw_op_ver = I40IW_VCHNL_OP_GET_HMC_FCN_V0; + ret_code = dev->vchnl_if.vchnl_send(dev, 0, (u8 *)vchnl_msg, vchnl_msg->iw_chnl_buf_len); + if (ret_code) + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: virt channel send failed 0x%x\n", __func__, ret_code); + return ret_code; +} + +/** + * vchnl_vf_send_get_pe_stats_req - Request PE stats from VF + * @dev: IWARP device pointer + * @vchnl_req: Virtual channel message request pointer + */ +static enum i40iw_status_code vchnl_vf_send_get_pe_stats_req(struct i40iw_sc_dev *dev, + struct i40iw_virtchnl_req *vchnl_req) +{ + enum i40iw_status_code ret_code = I40IW_ERR_NOT_READY; + struct i40iw_virtchnl_op_buf *vchnl_msg = vchnl_req->vchnl_msg; + + if (!dev->vchnl_up) + return ret_code; + + memset(vchnl_msg, 0, sizeof(*vchnl_msg)); + vchnl_msg->iw_chnl_op_ctx = (uintptr_t)vchnl_req; + vchnl_msg->iw_chnl_buf_len = sizeof(*vchnl_msg) + sizeof(struct i40iw_dev_hw_stats) - 1; + vchnl_msg->iw_op_code = I40IW_VCHNL_OP_GET_STATS; + vchnl_msg->iw_op_ver = I40IW_VCHNL_OP_GET_STATS_V0; + ret_code = dev->vchnl_if.vchnl_send(dev, 0, (u8 *)vchnl_msg, vchnl_msg->iw_chnl_buf_len); + if (ret_code) + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: virt channel send failed 0x%x\n", __func__, ret_code); + return ret_code; +} + +/** + * vchnl_vf_send_add_hmc_objs_req - Add HMC objects + * @dev: IWARP device pointer + * @vchnl_req: Virtual channel message 
request pointer + */ +static enum i40iw_status_code vchnl_vf_send_add_hmc_objs_req(struct i40iw_sc_dev *dev, + struct i40iw_virtchnl_req *vchnl_req, + enum i40iw_hmc_rsrc_type rsrc_type, + u32 start_index, + u32 rsrc_count) +{ + enum i40iw_status_code ret_code = I40IW_ERR_NOT_READY; + struct i40iw_virtchnl_op_buf *vchnl_msg = vchnl_req->vchnl_msg; + struct i40iw_virtchnl_hmc_obj_range *add_hmc_obj; + + if (!dev->vchnl_up) + return ret_code; + + add_hmc_obj = (struct i40iw_virtchnl_hmc_obj_range *)vchnl_msg->iw_chnl_buf; + memset(vchnl_msg, 0, sizeof(*vchnl_msg)); + memset(add_hmc_obj, 0, sizeof(*add_hmc_obj)); + vchnl_msg->iw_chnl_op_ctx = (uintptr_t)vchnl_req; + vchnl_msg->iw_chnl_buf_len = sizeof(*vchnl_msg) + sizeof(struct i40iw_virtchnl_hmc_obj_range) - 1; + vchnl_msg->iw_op_code = I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE; + vchnl_msg->iw_op_ver = I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE_V0; + add_hmc_obj->obj_type = (u16)rsrc_type; + add_hmc_obj->start_index = start_index; + add_hmc_obj->obj_count = rsrc_count; + ret_code = dev->vchnl_if.vchnl_send(dev, 0, (u8 *)vchnl_msg, vchnl_msg->iw_chnl_buf_len); + if (ret_code) + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: virt channel send failed 0x%x\n", __func__, ret_code); + return ret_code; +} + +/** + * vchnl_vf_send_del_hmc_objs_req - del HMC objects + * @dev: IWARP device pointer + * @vchnl_req: Virtual channel message request pointer + * @ rsrc_type - resource type to delete + * @ start_index - starting index for resource + * @ rsrc_count - number of resource type to delete + */ +static enum i40iw_status_code vchnl_vf_send_del_hmc_objs_req(struct i40iw_sc_dev *dev, + struct i40iw_virtchnl_req *vchnl_req, + enum i40iw_hmc_rsrc_type rsrc_type, + u32 start_index, + u32 rsrc_count) +{ + enum i40iw_status_code ret_code = I40IW_ERR_NOT_READY; + struct i40iw_virtchnl_op_buf *vchnl_msg = vchnl_req->vchnl_msg; + struct i40iw_virtchnl_hmc_obj_range *add_hmc_obj; + + if (!dev->vchnl_up) + return ret_code; + + add_hmc_obj = (struct i40iw_virtchnl_hmc_obj_range *)vchnl_msg->iw_chnl_buf; + memset(vchnl_msg, 0, sizeof(*vchnl_msg)); + memset(add_hmc_obj, 0, sizeof(*add_hmc_obj)); + vchnl_msg->iw_chnl_op_ctx = (uintptr_t)vchnl_req; + vchnl_msg->iw_chnl_buf_len = sizeof(*vchnl_msg) + sizeof(struct i40iw_virtchnl_hmc_obj_range) - 1; + vchnl_msg->iw_op_code = I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE; + vchnl_msg->iw_op_ver = I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE_V0; + add_hmc_obj->obj_type = (u16)rsrc_type; + add_hmc_obj->start_index = start_index; + add_hmc_obj->obj_count = rsrc_count; + ret_code = dev->vchnl_if.vchnl_send(dev, 0, (u8 *)vchnl_msg, vchnl_msg->iw_chnl_buf_len); + if (ret_code) + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: virt channel send failed 0x%x\n", __func__, ret_code); + return ret_code; +} + +/** + * vchnl_pf_send_get_ver_resp - Send channel version to VF + * @dev: IWARP device pointer + * @vf_id: Virtual function ID associated with the message + * @vchnl_msg: Virtual channel message buffer pointer + */ +static void vchnl_pf_send_get_ver_resp(struct i40iw_sc_dev *dev, + u32 vf_id, + struct i40iw_virtchnl_op_buf *vchnl_msg) +{ + enum i40iw_status_code ret_code; + u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(u32) - 1]; + struct i40iw_virtchnl_resp_buf *vchnl_msg_resp = (struct i40iw_virtchnl_resp_buf *)resp_buffer; + + memset(resp_buffer, 0, sizeof(*resp_buffer)); + vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx; + vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer); + vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS; + *((u32 
*)vchnl_msg_resp->iw_chnl_buf) = I40IW_VCHNL_CHNL_VER_V0; + ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer)); + if (ret_code) + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: virt channel send failed 0x%x\n", __func__, ret_code); +} + +/** + * vchnl_pf_send_get_hmc_fcn_resp - Send HMC Function to VF + * @dev: IWARP device pointer + * @vf_id: Virtual function ID associated with the message + * @vchnl_msg: Virtual channel message buffer pointer + */ +static void vchnl_pf_send_get_hmc_fcn_resp(struct i40iw_sc_dev *dev, + u32 vf_id, + struct i40iw_virtchnl_op_buf *vchnl_msg, + u16 hmc_fcn) +{ + enum i40iw_status_code ret_code; + u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(u16) - 1]; + struct i40iw_virtchnl_resp_buf *vchnl_msg_resp = (struct i40iw_virtchnl_resp_buf *)resp_buffer; + + memset(resp_buffer, 0, sizeof(*resp_buffer)); + vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx; + vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer); + vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS; + *((u16 *)vchnl_msg_resp->iw_chnl_buf) = hmc_fcn; + ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer)); + if (ret_code) + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: virt channel send failed 0x%x\n", __func__, ret_code); +} + +/** + * vchnl_pf_send_get_pe_stats_resp - Send PE Stats to VF + * @dev: IWARP device pointer + * @vf_id: Virtual function ID associated with the message + * @vchnl_msg: Virtual channel message buffer pointer + * @hw_stats: HW Stats struct + */ + +static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev, + u32 vf_id, + struct i40iw_virtchnl_op_buf *vchnl_msg, + struct i40iw_dev_hw_stats *hw_stats) +{ + enum i40iw_status_code ret_code; + u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(struct i40iw_dev_hw_stats) - 1]; + struct i40iw_virtchnl_resp_buf *vchnl_msg_resp = (struct i40iw_virtchnl_resp_buf *)resp_buffer; + + memset(resp_buffer, 0, sizeof(*resp_buffer)); + vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx; + vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer); + vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS; + *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = *hw_stats; + ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer)); + if (ret_code) + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: virt channel send failed 0x%x\n", __func__, ret_code); +} + +/** + * vchnl_pf_send_error_resp - Send an error response to VF + * @dev: IWARP device pointer + * @vf_id: Virtual function ID associated with the message + * @vchnl_msg: Virtual channel message buffer pointer + */ +static void vchnl_pf_send_error_resp(struct i40iw_sc_dev *dev, u32 vf_id, + struct i40iw_virtchnl_op_buf *vchnl_msg, + u16 op_ret_code) +{ + enum i40iw_status_code ret_code; + u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf)]; + struct i40iw_virtchnl_resp_buf *vchnl_msg_resp = (struct i40iw_virtchnl_resp_buf *)resp_buffer; + + memset(resp_buffer, 0, sizeof(resp_buffer)); + vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx; + vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer); + vchnl_msg_resp->iw_op_ret_code = (u16)op_ret_code; + ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer)); + if (ret_code) + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: virt channel send failed 0x%x\n", __func__, ret_code); +} + +/** + * pf_cqp_get_hmc_fcn_callback - Callback for Get HMC Fcn + * @cqp_req_param: CQP Request param value + * @not_used: 
unused CQP callback parameter + */ +static void pf_cqp_get_hmc_fcn_callback(struct i40iw_sc_dev *dev, void *callback_param, + struct i40iw_ccq_cqe_info *cqe_info) +{ + struct i40iw_vfdev *vf_dev = callback_param; + struct i40iw_virt_mem vf_dev_mem; + + if (cqe_info->error) { + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "CQP Completion Error on Get HMC Function. Maj = 0x%04x, Minor = 0x%04x\n", + cqe_info->maj_err_code, cqe_info->min_err_code); + dev->vf_dev[vf_dev->iw_vf_idx] = NULL; + vchnl_pf_send_error_resp(dev, vf_dev->vf_id, &vf_dev->vf_msg_buffer.vchnl_msg, + (u16)I40IW_ERR_CQP_COMPL_ERROR); + vf_dev_mem.va = vf_dev; + vf_dev_mem.size = sizeof(*vf_dev); + i40iw_free_virt_mem(dev->hw, &vf_dev_mem); + } else { + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "CQP Completion Operation Return information = 0x%08x\n", + cqe_info->op_ret_val); + vf_dev->pmf_index = (u16)cqe_info->op_ret_val; + vf_dev->msg_count--; + vchnl_pf_send_get_hmc_fcn_resp(dev, + vf_dev->vf_id, + &vf_dev->vf_msg_buffer.vchnl_msg, + vf_dev->pmf_index); + } +} + +/** + * pf_add_hmc_obj - Callback for Add HMC Object + * @vf_dev: pointer to the VF Device + */ +static void pf_add_hmc_obj_callback(void *work_vf_dev) +{ + struct i40iw_vfdev *vf_dev = (struct i40iw_vfdev *)work_vf_dev; + struct i40iw_hmc_info *hmc_info = &vf_dev->hmc_info; + struct i40iw_virtchnl_op_buf *vchnl_msg = &vf_dev->vf_msg_buffer.vchnl_msg; + struct i40iw_hmc_create_obj_info info; + struct i40iw_virtchnl_hmc_obj_range *add_hmc_obj; + enum i40iw_status_code ret_code; + + if (!vf_dev->pf_hmc_initialized) { + ret_code = i40iw_pf_init_vfhmc(vf_dev->pf_dev, (u8)vf_dev->pmf_index, NULL); + if (ret_code) + goto add_out; + vf_dev->pf_hmc_initialized = true; + } + + add_hmc_obj = (struct i40iw_virtchnl_hmc_obj_range *)vchnl_msg->iw_chnl_buf; + + memset(&info, 0, sizeof(info)); + info.hmc_info = hmc_info; + info.is_pf = false; + info.rsrc_type = (u32)add_hmc_obj->obj_type; + info.entry_type = (info.rsrc_type == I40IW_HMC_IW_PBLE) ? I40IW_SD_TYPE_PAGED : I40IW_SD_TYPE_DIRECT; + info.start_idx = add_hmc_obj->start_index; + info.count = add_hmc_obj->obj_count; + i40iw_debug(vf_dev->pf_dev, I40IW_DEBUG_VIRT, + "I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE. Add %u type %u objects\n", + info.count, info.rsrc_type); + ret_code = i40iw_sc_create_hmc_obj(vf_dev->pf_dev, &info); + if (!ret_code) + vf_dev->hmc_info.hmc_obj[add_hmc_obj->obj_type].cnt = add_hmc_obj->obj_count; +add_out: + vf_dev->msg_count--; + vchnl_pf_send_error_resp(vf_dev->pf_dev, vf_dev->vf_id, vchnl_msg, (u16)ret_code); +} + +/** + * pf_del_hmc_obj_callback - Callback for delete HMC Object + * @work_vf_dev: pointer to the VF Device + */ +static void pf_del_hmc_obj_callback(void *work_vf_dev) +{ + struct i40iw_vfdev *vf_dev = (struct i40iw_vfdev *)work_vf_dev; + struct i40iw_hmc_info *hmc_info = &vf_dev->hmc_info; + struct i40iw_virtchnl_op_buf *vchnl_msg = &vf_dev->vf_msg_buffer.vchnl_msg; + struct i40iw_hmc_del_obj_info info; + struct i40iw_virtchnl_hmc_obj_range *del_hmc_obj; + enum i40iw_status_code ret_code = I40IW_SUCCESS; + + if (!vf_dev->pf_hmc_initialized) + goto del_out; + + del_hmc_obj = (struct i40iw_virtchnl_hmc_obj_range *)vchnl_msg->iw_chnl_buf; + + memset(&info, 0, sizeof(info)); + info.hmc_info = hmc_info; + info.is_pf = false; + info.rsrc_type = (u32)del_hmc_obj->obj_type; + info.start_idx = del_hmc_obj->start_index; + info.count = del_hmc_obj->obj_count; + i40iw_debug(vf_dev->pf_dev, I40IW_DEBUG_VIRT, + "I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE. 
Delete %u type %u objects\n", + info.count, info.rsrc_type); + ret_code = i40iw_sc_del_hmc_obj(vf_dev->pf_dev, &info, false); +del_out: + vf_dev->msg_count--; + vchnl_pf_send_error_resp(vf_dev->pf_dev, vf_dev->vf_id, vchnl_msg, (u16)ret_code); +} + +/** + * i40iw_vf_init_pestat - Initialize stats for VF + * @devL pointer to the VF Device + * @stats: Statistics structure pointer + * @index: Stats index + */ +static void i40iw_vf_init_pestat(struct i40iw_sc_dev *dev, struct i40iw_vsi_pestat *stats, u16 index) +{ + stats->hw = dev->hw; + i40iw_hw_stats_init(stats, (u8)index, false); + spin_lock_init(&stats->lock); +} + +/** + * i40iw_vchnl_recv_pf - Receive PF virtual channel messages + * @dev: IWARP device pointer + * @vf_id: Virtual function ID associated with the message + * @msg: Virtual channel message buffer pointer + * @len: Length of the virtual channels message + */ +enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev, + u32 vf_id, + u8 *msg, + u16 len) +{ + struct i40iw_virtchnl_op_buf *vchnl_msg = (struct i40iw_virtchnl_op_buf *)msg; + struct i40iw_vfdev *vf_dev = NULL; + struct i40iw_hmc_fcn_info hmc_fcn_info; + u16 iw_vf_idx; + u16 first_avail_iw_vf = I40IW_MAX_PE_ENABLED_VF_COUNT; + struct i40iw_virt_mem vf_dev_mem; + struct i40iw_virtchnl_work_info work_info; + struct i40iw_vsi_pestat *stats; + enum i40iw_status_code ret_code; + + if (!dev || !msg || !len) + return I40IW_ERR_PARAM; + + if (!dev->vchnl_up) + return I40IW_ERR_NOT_READY; + if (vchnl_msg->iw_op_code == I40IW_VCHNL_OP_GET_VER) { + vchnl_pf_send_get_ver_resp(dev, vf_id, vchnl_msg); + return I40IW_SUCCESS; + } + for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT; iw_vf_idx++) { + if (!dev->vf_dev[iw_vf_idx]) { + if (first_avail_iw_vf == I40IW_MAX_PE_ENABLED_VF_COUNT) + first_avail_iw_vf = iw_vf_idx; + continue; + } + if (dev->vf_dev[iw_vf_idx]->vf_id == vf_id) { + vf_dev = dev->vf_dev[iw_vf_idx]; + break; + } + } + if (vf_dev) { + if (!vf_dev->msg_count) { + vf_dev->msg_count++; + } else { + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "VF%u already has a channel message in progress.\n", + vf_id); + return I40IW_SUCCESS; + } + } + switch (vchnl_msg->iw_op_code) { + case I40IW_VCHNL_OP_GET_HMC_FCN: + if (!vf_dev && + (first_avail_iw_vf != I40IW_MAX_PE_ENABLED_VF_COUNT)) { + ret_code = i40iw_allocate_virt_mem(dev->hw, &vf_dev_mem, sizeof(struct i40iw_vfdev) + + (sizeof(struct i40iw_hmc_obj_info) * I40IW_HMC_IW_MAX)); + if (!ret_code) { + vf_dev = vf_dev_mem.va; + vf_dev->stats_initialized = false; + vf_dev->pf_dev = dev; + vf_dev->msg_count = 1; + vf_dev->vf_id = vf_id; + vf_dev->iw_vf_idx = first_avail_iw_vf; + vf_dev->pf_hmc_initialized = false; + vf_dev->hmc_info.hmc_obj = (struct i40iw_hmc_obj_info *)(&vf_dev[1]); + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "vf_dev %p, hmc_info %p, hmc_obj %p\n", + vf_dev, &vf_dev->hmc_info, vf_dev->hmc_info.hmc_obj); + dev->vf_dev[first_avail_iw_vf] = vf_dev; + iw_vf_idx = first_avail_iw_vf; + } else { + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "VF%u Unable to allocate a VF device structure.\n", + vf_id); + vchnl_pf_send_error_resp(dev, vf_id, vchnl_msg, (u16)I40IW_ERR_NO_MEMORY); + return I40IW_SUCCESS; + } + memcpy(&vf_dev->vf_msg_buffer.vchnl_msg, vchnl_msg, len); + hmc_fcn_info.callback_fcn = pf_cqp_get_hmc_fcn_callback; + hmc_fcn_info.vf_id = vf_id; + hmc_fcn_info.iw_vf_idx = vf_dev->iw_vf_idx; + hmc_fcn_info.cqp_callback_param = vf_dev; + hmc_fcn_info.free_fcn = false; + ret_code = i40iw_cqp_manage_hmc_fcn_cmd(dev, &hmc_fcn_info); + if (ret_code) + i40iw_debug(dev, 
I40IW_DEBUG_VIRT, + "VF%u error CQP HMC Function operation.\n", + vf_id); + i40iw_vf_init_pestat(dev, &vf_dev->pestat, vf_dev->pmf_index); + vf_dev->stats_initialized = true; + } else { + if (vf_dev) { + vf_dev->msg_count--; + vchnl_pf_send_get_hmc_fcn_resp(dev, vf_id, vchnl_msg, vf_dev->pmf_index); + } else { + vchnl_pf_send_error_resp(dev, vf_id, vchnl_msg, + (u16)I40IW_ERR_NO_MEMORY); + } + } + break; + case I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE: + if (!vf_dev) + return I40IW_ERR_BAD_PTR; + work_info.worker_vf_dev = vf_dev; + work_info.callback_fcn = pf_add_hmc_obj_callback; + memcpy(&vf_dev->vf_msg_buffer.vchnl_msg, vchnl_msg, len); + i40iw_cqp_spawn_worker(dev, &work_info, vf_dev->iw_vf_idx); + break; + case I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE: + if (!vf_dev) + return I40IW_ERR_BAD_PTR; + work_info.worker_vf_dev = vf_dev; + work_info.callback_fcn = pf_del_hmc_obj_callback; + memcpy(&vf_dev->vf_msg_buffer.vchnl_msg, vchnl_msg, len); + i40iw_cqp_spawn_worker(dev, &work_info, vf_dev->iw_vf_idx); + break; + case I40IW_VCHNL_OP_GET_STATS: + if (!vf_dev) + return I40IW_ERR_BAD_PTR; + stats = &vf_dev->pestat; + i40iw_hw_stats_read_all(stats, &stats->hw_stats); + vf_dev->msg_count--; + vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, &stats->hw_stats); + break; + default: + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "40iw_vchnl_recv_pf: Invalid OpCode 0x%x\n", + vchnl_msg->iw_op_code); + vchnl_pf_send_error_resp(dev, vf_id, + vchnl_msg, (u16)I40IW_ERR_NOT_IMPLEMENTED); + } + return I40IW_SUCCESS; +} + +/** + * i40iw_vchnl_recv_vf - Receive VF virtual channel messages + * @dev: IWARP device pointer + * @vf_id: Virtual function ID associated with the message + * @msg: Virtual channel message buffer pointer + * @len: Length of the virtual channels message + */ +enum i40iw_status_code i40iw_vchnl_recv_vf(struct i40iw_sc_dev *dev, + u32 vf_id, + u8 *msg, + u16 len) +{ + struct i40iw_virtchnl_resp_buf *vchnl_msg_resp = (struct i40iw_virtchnl_resp_buf *)msg; + struct i40iw_virtchnl_req *vchnl_req; + + vchnl_req = (struct i40iw_virtchnl_req *)(uintptr_t)vchnl_msg_resp->iw_chnl_op_ctx; + vchnl_req->ret_code = (enum i40iw_status_code)vchnl_msg_resp->iw_op_ret_code; + if (len == (sizeof(*vchnl_msg_resp) + vchnl_req->parm_len - 1)) { + if (vchnl_req->parm_len && vchnl_req->parm) + memcpy(vchnl_req->parm, vchnl_msg_resp->iw_chnl_buf, vchnl_req->parm_len); + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: Got response, data size %u\n", __func__, + vchnl_req->parm_len); + } else { + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s: error length on response, Got %u, expected %u\n", __func__, + len, (u32)(sizeof(*vchnl_msg_resp) + vchnl_req->parm_len - 1)); + } + + return I40IW_SUCCESS; +} + +/** + * i40iw_vchnl_vf_get_ver - Request Channel version + * @dev: IWARP device pointer + * @vchnl_ver: Virtual channel message version pointer + */ +enum i40iw_status_code i40iw_vchnl_vf_get_ver(struct i40iw_sc_dev *dev, + u32 *vchnl_ver) +{ + struct i40iw_virtchnl_req vchnl_req; + enum i40iw_status_code ret_code; + + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; + memset(&vchnl_req, 0, sizeof(vchnl_req)); + vchnl_req.dev = dev; + vchnl_req.parm = vchnl_ver; + vchnl_req.parm_len = sizeof(*vchnl_ver); + vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + + ret_code = vchnl_vf_send_get_ver_req(dev, &vchnl_req); + if (ret_code) { + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; + } + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; 
+ else + return vchnl_req.ret_code; +} + +/** + * i40iw_vchnl_vf_get_hmc_fcn - Request HMC Function + * @dev: IWARP device pointer + * @hmc_fcn: HMC function index pointer + */ +enum i40iw_status_code i40iw_vchnl_vf_get_hmc_fcn(struct i40iw_sc_dev *dev, + u16 *hmc_fcn) +{ + struct i40iw_virtchnl_req vchnl_req; + enum i40iw_status_code ret_code; + + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; + memset(&vchnl_req, 0, sizeof(vchnl_req)); + vchnl_req.dev = dev; + vchnl_req.parm = hmc_fcn; + vchnl_req.parm_len = sizeof(*hmc_fcn); + vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + + ret_code = vchnl_vf_send_get_hmc_fcn_req(dev, &vchnl_req); + if (ret_code) { + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; + } + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; +} + +/** + * i40iw_vchnl_vf_add_hmc_objs - Add HMC Object + * @dev: IWARP device pointer + * @rsrc_type: HMC Resource type + * @start_index: Starting index of the objects to be added + * @rsrc_count: Number of resources to be added + */ +enum i40iw_status_code i40iw_vchnl_vf_add_hmc_objs(struct i40iw_sc_dev *dev, + enum i40iw_hmc_rsrc_type rsrc_type, + u32 start_index, + u32 rsrc_count) +{ + struct i40iw_virtchnl_req vchnl_req; + enum i40iw_status_code ret_code; + + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; + memset(&vchnl_req, 0, sizeof(vchnl_req)); + vchnl_req.dev = dev; + vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + + ret_code = vchnl_vf_send_add_hmc_objs_req(dev, + &vchnl_req, + rsrc_type, + start_index, + rsrc_count); + if (ret_code) { + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; + } + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; +} + +/** + * i40iw_vchnl_vf_del_hmc_obj - del HMC obj + * @dev: IWARP device pointer + * @rsrc_type: HMC Resource type + * @start_index: Starting index of the object to delete + * @rsrc_count: Number of resources to be delete + */ +enum i40iw_status_code i40iw_vchnl_vf_del_hmc_obj(struct i40iw_sc_dev *dev, + enum i40iw_hmc_rsrc_type rsrc_type, + u32 start_index, + u32 rsrc_count) +{ + struct i40iw_virtchnl_req vchnl_req; + enum i40iw_status_code ret_code; + + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; + memset(&vchnl_req, 0, sizeof(vchnl_req)); + vchnl_req.dev = dev; + vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + + ret_code = vchnl_vf_send_del_hmc_objs_req(dev, + &vchnl_req, + rsrc_type, + start_index, + rsrc_count); + if (ret_code) { + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; + } + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; +} + +/** + * i40iw_vchnl_vf_get_pe_stats - Get PE stats + * @dev: IWARP device pointer + * @hw_stats: HW stats struct + */ +enum i40iw_status_code i40iw_vchnl_vf_get_pe_stats(struct i40iw_sc_dev *dev, + struct i40iw_dev_hw_stats *hw_stats) +{ + struct i40iw_virtchnl_req vchnl_req; + enum i40iw_status_code ret_code; + + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; + memset(&vchnl_req, 0, sizeof(vchnl_req)); + vchnl_req.dev = dev; + vchnl_req.parm = hw_stats; + vchnl_req.parm_len = sizeof(*hw_stats); + vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + + ret_code = 
vchnl_vf_send_get_pe_stats_req(dev, &vchnl_req); + if (ret_code) { + i40iw_debug(dev, I40IW_DEBUG_VIRT, + "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; + } + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; +} diff --git a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.h b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.h new file mode 100644 index 0000000..24886ef --- /dev/null +++ b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.h @@ -0,0 +1,124 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
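Each i40iw_vchnl_vf_*() helper above follows the same shape: bail out if the channel is not clear to send, post the request, wait for the PF, and only when both transport steps succeed return the per-operation code that i40iw_vchnl_recv_vf() copied out of iw_op_ret_code. A self-contained sketch of that layered error handling; demo_send() and demo_wait_resp() are stand-ins for the channel send callback and i40iw_vf_wait_vchnl_resp(), not the driver's functions:

#include <stdio.h>

struct demo_vchnl_req {
	unsigned int *parm;	/* caller-supplied result buffer */
	int ret_code;		/* set from iw_op_ret_code on receive */
};

static int demo_send(struct demo_vchnl_req *req)
{
	(void)req;
	return 0;		/* 0 == message posted to the channel */
}

static int demo_wait_resp(struct demo_vchnl_req *req)
{
	/* pretend the receive path copied the payload and return code */
	*req->parm = 0;		/* e.g. channel version 0 */
	req->ret_code = 0;
	return 0;
}

static int demo_get_ver(unsigned int *ver)
{
	struct demo_vchnl_req req = { .parm = ver, .ret_code = 0 };
	int ret = demo_send(&req);

	if (ret)
		return ret;		/* could not send at all */
	ret = demo_wait_resp(&req);
	if (ret)
		return ret;		/* timed out waiting for the PF */
	return req.ret_code;		/* PF's verdict on the operation */
}

int main(void)
{
	unsigned int ver = 0;
	int ret = demo_get_ver(&ver);

	printf("get_ver -> %d, version %u\n", ret, ver);
	return ret;
}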
+* +*******************************************************************************/ + +#ifndef I40IW_VIRTCHNL_H +#define I40IW_VIRTCHNL_H + +#include "i40iw_hmc.h" + +#pragma pack(push, 1) + +struct i40iw_virtchnl_op_buf { + u16 iw_op_code; + u16 iw_op_ver; + u16 iw_chnl_buf_len; + u16 rsvd; + u64 iw_chnl_op_ctx; + /* Member alignment MUST be maintained above this location */ + u8 iw_chnl_buf[1]; +}; + +struct i40iw_virtchnl_resp_buf { + u64 iw_chnl_op_ctx; + u16 iw_chnl_buf_len; + s16 iw_op_ret_code; + /* Member alignment MUST be maintained above this location */ + u16 rsvd[2]; + u8 iw_chnl_buf[1]; +}; + +enum i40iw_virtchnl_ops { + I40IW_VCHNL_OP_GET_VER = 0, + I40IW_VCHNL_OP_GET_HMC_FCN, + I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE, + I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE, + I40IW_VCHNL_OP_GET_STATS +}; + +#define I40IW_VCHNL_OP_GET_VER_V0 0 +#define I40IW_VCHNL_OP_GET_HMC_FCN_V0 0 +#define I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE_V0 0 +#define I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE_V0 0 +#define I40IW_VCHNL_OP_GET_STATS_V0 0 +#define I40IW_VCHNL_CHNL_VER_V0 0 + +struct i40iw_dev_hw_stats; + +struct i40iw_virtchnl_hmc_obj_range { + u16 obj_type; + u16 rsvd; + u32 start_index; + u32 obj_count; +}; + +enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev, + u32 vf_id, + u8 *msg, + u16 len); + +enum i40iw_status_code i40iw_vchnl_recv_vf(struct i40iw_sc_dev *dev, + u32 vf_id, + u8 *msg, + u16 len); + +struct i40iw_virtchnl_req { + struct i40iw_sc_dev *dev; + struct i40iw_virtchnl_op_buf *vchnl_msg; + void *parm; + u32 vf_id; + u16 parm_len; + s16 ret_code; +}; + +#pragma pack(pop) + +enum i40iw_status_code i40iw_vchnl_vf_get_ver(struct i40iw_sc_dev *dev, + u32 *vchnl_ver); + +enum i40iw_status_code i40iw_vchnl_vf_get_hmc_fcn(struct i40iw_sc_dev *dev, + u16 *hmc_fcn); + +enum i40iw_status_code i40iw_vchnl_vf_add_hmc_objs(struct i40iw_sc_dev *dev, + enum i40iw_hmc_rsrc_type rsrc_type, + u32 start_index, + u32 rsrc_count); + +enum i40iw_status_code i40iw_vchnl_vf_del_hmc_obj(struct i40iw_sc_dev *dev, + enum i40iw_hmc_rsrc_type rsrc_type, + u32 start_index, + u32 rsrc_count); + +enum i40iw_status_code i40iw_vchnl_vf_get_pe_stats(struct i40iw_sc_dev *dev, + struct i40iw_dev_hw_stats *hw_stats); +#endif diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig new file mode 100644 index 0000000..db4aa13 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/Kconfig @@ -0,0 +1,11 @@ +config MLX4_INFINIBAND + tristate "Mellanox ConnectX HCA support" + depends on NETDEVICES && ETHERNET && PCI && INET + depends on MAY_USE_DEVLINK + select NET_VENDOR_MELLANOX + select MLX4_CORE + ---help--- + This driver provides low-level InfiniBand support for + Mellanox ConnectX PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile new file mode 100644 index 0000000..f4213b3 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o + +mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o alias_GUID.o sysfs.o diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c new file mode 100644 index 0000000..9345d5b --- /dev/null +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
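The packed i40iw_virtchnl_op_buf/_resp_buf layouts above end in a one-byte iw_chnl_buf[1] placeholder, which is why the senders earlier in this patch size messages as sizeof(*vchnl_msg) + payload - 1 and the VF receive path checks len against sizeof(*vchnl_msg_resp) + parm_len - 1. The trimmed stand-in below (not the full driver layout) shows that length math:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#pragma pack(push, 1)
struct demo_op_buf {
	uint16_t op_code;
	uint16_t buf_len;
	uint8_t  buf[1];	/* payload really starts here */
};
#pragma pack(pop)

int main(void)
{
	uint32_t payload = 0x12345678;
	/* the trailing buf[1] byte is already counted by sizeof(), hence -1 */
	uint16_t len = sizeof(struct demo_op_buf) + sizeof(payload) - 1;
	uint8_t raw[sizeof(struct demo_op_buf) + sizeof(payload) - 1];
	struct demo_op_buf *msg = (struct demo_op_buf *)raw;

	memset(raw, 0, sizeof(raw));
	msg->op_code = 0;	/* e.g. I40IW_VCHNL_OP_GET_VER */
	msg->buf_len = len;
	memcpy(msg->buf, &payload, sizeof(payload));

	printf("header %zu bytes, wire length %u bytes\n",
	       sizeof(struct demo_op_buf) - 1, (unsigned int)msg->buf_len);
	return 0;
}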
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include +#include +#include + +#include "mlx4_ib.h" + +static struct ib_ah *create_ib_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_dev *dev = to_mdev(pd->device)->dev; + + ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | + (rdma_ah_get_port_num(ah_attr) << 24)); + ah->av.ib.g_slid = rdma_ah_get_path_bits(ah_attr); + ah->av.ib.sl_tclass_flowlabel = + cpu_to_be32(rdma_ah_get_sl(ah_attr) << 28); + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + + ah->av.ib.g_slid |= 0x80; + ah->av.ib.gid_index = grh->sgid_index; + ah->av.ib.hop_limit = grh->hop_limit; + ah->av.ib.sl_tclass_flowlabel |= + cpu_to_be32((grh->traffic_class << 20) | + grh->flow_label); + memcpy(ah->av.ib.dgid, grh->dgid.raw, 16); + } + + ah->av.ib.dlid = cpu_to_be16(rdma_ah_get_dlid(ah_attr)); + if (rdma_ah_get_static_rate(ah_attr)) { + u8 static_rate = rdma_ah_get_static_rate(ah_attr) + + MLX4_STAT_RATE_OFFSET; + + while (static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << static_rate & dev->caps.stat_rate_support)) + --static_rate; + ah->av.ib.stat_rate = static_rate; + } + + return &ah->ibah; +} + +static struct ib_ah *create_iboe_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct mlx4_ib_ah *ah) +{ + struct mlx4_ib_dev *ibdev = to_mdev(pd->device); + struct mlx4_dev *dev = ibdev->dev; + int is_mcast = 0; + struct in6_addr in6; + u16 vlan_tag = 0xffff; + union ib_gid sgid; + struct ib_gid_attr gid_attr; + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + int ret; + + memcpy(&in6, grh->dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) + is_mcast = 1; + + memcpy(ah->av.eth.mac, ah_attr->roce.dmac, ETH_ALEN); + ret = ib_get_cached_gid(pd->device, rdma_ah_get_port_num(ah_attr), + grh->sgid_index, &sgid, &gid_attr); + if (ret) + return ERR_PTR(ret); + eth_zero_addr(ah->av.eth.s_mac); + if (is_vlan_dev(gid_attr.ndev)) + vlan_tag = vlan_dev_vlan_id(gid_attr.ndev); + memcpy(ah->av.eth.s_mac, gid_attr.ndev->dev_addr, ETH_ALEN); + dev_put(gid_attr.ndev); + if (vlan_tag < 0x1000) + vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13; + ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | + 
(rdma_ah_get_port_num(ah_attr) << 24)); + ret = mlx4_ib_gid_index_to_real_index(ibdev, + rdma_ah_get_port_num(ah_attr), + grh->sgid_index); + if (ret < 0) + return ERR_PTR(ret); + ah->av.eth.gid_index = ret; + ah->av.eth.vlan = cpu_to_be16(vlan_tag); + ah->av.eth.hop_limit = grh->hop_limit; + if (rdma_ah_get_static_rate(ah_attr)) { + ah->av.eth.stat_rate = rdma_ah_get_static_rate(ah_attr) + + MLX4_STAT_RATE_OFFSET; + while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support)) + --ah->av.eth.stat_rate; + } + ah->av.eth.sl_tclass_flowlabel |= + cpu_to_be32((grh->traffic_class << 20) | + grh->flow_label); + /* + * HW requires multicast LID so we just choose one. + */ + if (is_mcast) + ah->av.ib.dlid = cpu_to_be16(0xc000); + + memcpy(ah->av.eth.dgid, grh->dgid.raw, 16); + ah->av.eth.sl_tclass_flowlabel |= cpu_to_be32(rdma_ah_get_sl(ah_attr) + << 29); + return &ah->ibah; +} + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) + +{ + struct mlx4_ib_ah *ah; + struct ib_ah *ret; + + ah = kzalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) { + ret = ERR_PTR(-EINVAL); + } else { + /* + * TBD: need to handle the case when we get + * called in an atomic context and there we + * might sleep. We don't expect this + * currently since we're working with link + * local addresses which we can translate + * without going to sleep. + */ + ret = create_iboe_ah(pd, ah_attr, ah); + } + + if (IS_ERR(ret)) + kfree(ah); + + return ret; + } else + return create_ib_ah(pd, ah_attr, ah); /* never fails */ +} + +int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct mlx4_ib_ah *ah = to_mah(ibah); + int port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->type = ibah->type; + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + rdma_ah_set_dlid(ah_attr, 0); + rdma_ah_set_sl(ah_attr, + be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) + >> 29); + } else { + rdma_ah_set_dlid(ah_attr, be16_to_cpu(ah->av.ib.dlid)); + rdma_ah_set_sl(ah_attr, + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) + >> 28); + } + + rdma_ah_set_port_num(ah_attr, port_num); + if (ah->av.ib.stat_rate) + rdma_ah_set_static_rate(ah_attr, + ah->av.ib.stat_rate - + MLX4_STAT_RATE_OFFSET); + rdma_ah_set_path_bits(ah_attr, ah->av.ib.g_slid & 0x7F); + if (mlx4_ib_ah_grh_present(ah)) { + u32 tc_fl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel); + + rdma_ah_set_grh(ah_attr, NULL, + tc_fl & 0xfffff, ah->av.ib.gid_index, + ah->av.ib.hop_limit, + tc_fl >> 20); + rdma_ah_set_dgid_raw(ah_attr, ah->av.ib.dgid); + } + + return 0; +} + +int mlx4_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c new file mode 100644 index 0000000..155b4df --- /dev/null +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -0,0 +1,905 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
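create_ib_ah(), create_iboe_ah() and mlx4_ib_query_ah() above fold SL, traffic class and flow label into the single sl_tclass_flowlabel word: SL in the top bits (shift 28 for IB, 29 for RoCE), traffic class at bit 20, flow label in the low 20 bits. A host-endian sketch of the IB-side pack/unpack (the driver itself stores the word big-endian via cpu_to_be32()):

#include <stdint.h>
#include <stdio.h>

/* SL << 28, traffic class << 20, flow label in bits 0..19 */
static uint32_t pack_ib(uint32_t sl, uint32_t tclass, uint32_t flow_label)
{
	return (sl << 28) | (tclass << 20) | (flow_label & 0xfffff);
}

int main(void)
{
	uint32_t w = pack_ib(3, 0x2a, 0x12345);

	/* mirrors the unpacking done in mlx4_ib_query_ah() */
	printf("sl=%u tclass=0x%x flow_label=0x%05x\n",
	       (unsigned int)(w >> 28),
	       (unsigned int)((w >> 20) & 0xff),
	       (unsigned int)(w & 0xfffff));
	return 0;
}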
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /***********************************************************/ +/*This file support the handling of the Alias GUID feature. */ +/***********************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx4_ib.h" + +/* +The driver keeps the current state of all guids, as they are in the HW. +Whenever we receive an smp mad GUIDInfo record, the data will be cached. +*/ + +struct mlx4_alias_guid_work_context { + u8 port; + struct mlx4_ib_dev *dev ; + struct ib_sa_query *sa_query; + struct completion done; + int query_id; + struct list_head list; + int block_num; + ib_sa_comp_mask guid_indexes; + u8 method; +}; + +struct mlx4_next_alias_guid_work { + u8 port; + u8 block_num; + u8 method; + struct mlx4_sriov_alias_guid_info_rec_det rec_det; +}; + +static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, + int *resched_delay_sec); + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, + u8 port_num, u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + int port_index = port_num - 1; + + if (!mlx4_is_master(dev->dev)) + return; + + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. 
+ all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* The location of the specific index starts from bit number 4 + * until bit num 11 */ + if (test_bit(i + 4, (unsigned long *)&guid_indexes)) { + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->num_slaves) { + pr_debug("The last slave: %d\n", slave_id); + return; + } + + /* cache the guid: */ + memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id], + &p_data[i * GUID_REC_SIZE], + GUID_REC_SIZE); + } else + pr_debug("Guid number: %d in block: %d" + " was not updated\n", i, block_num); + } +} + +static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index) +{ + if (index >= NUM_ALIAS_GUID_PER_PORT) { + pr_err("%s: ERROR: asked for index:%d\n", __func__, index); + return (__force __be64) -1; + } + return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index]; +} + + +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index) +{ + return IB_SA_COMP_MASK(4 + index); +} + +void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, + int port, int slave_init) +{ + __be64 curr_guid, required_guid; + int record_num = slave / 8; + int index = slave % 8; + int port_index = port - 1; + unsigned long flags; + int do_work = 0; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + if (dev->sriov.alias_guid.ports_guid[port_index].state_flags & + GUID_STATE_NEED_PORT_INIT) + goto unlock; + if (!slave_init) { + curr_guid = *(__be64 *)&dev->sriov. + alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * index]; + if (curr_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL) || + !curr_guid) + goto unlock; + required_guid = cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL); + } else { + required_guid = mlx4_get_admin_guid(dev->dev, slave, port); + if (required_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + goto unlock; + } + *(__be64 *)&dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * index] = required_guid; + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].guid_indexes + |= mlx4_ib_get_aguid_comp_mask_from_ix(index); + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE; + /* set to run immediately */ + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].time_to_run = 0; + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + guids_retry_schedule[index] = 0; + do_work = 1; +unlock: + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + + if (do_work) + mlx4_ib_init_alias_guid_work(dev, port_index); +} + +/* + * Whenever new GUID is set/unset (guid table change) create event and + * notify the relevant slave (master also should be notified). + * If the GUID value is not as we have in the cache the slave will not be + * updated; in this case it waits for the smp_snoop or the port management + * event to call the function and to update the slave. 
+ * block_number - the index of the block (16 blocks available) + * port_number - 1 or 2 + */ +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data) +{ + int i; + u64 guid_indexes; + int slave_id, slave_port; + enum slave_port_state new_state; + enum slave_port_state prev_state; + __be64 tmp_cur_ag, form_cache_ag; + enum slave_port_gen_event gen_event; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags; + __be64 required_value; + + if (!mlx4_is_master(dev->dev)) + return; + + rec = &dev->sriov.alias_guid.ports_guid[port_num - 1]. + all_rec_per_port[block_num]; + guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num].guid_indexes); + pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes); + + /*calculate the slaves and notify them*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /* the location of the specific index runs from bits 4..11 */ + if (!(test_bit(i + 4, (unsigned long *)&guid_indexes))) + continue; + + slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; + if (slave_id >= dev->dev->persist->num_vfs + 1) + return; + + slave_port = mlx4_phys_to_slave_port(dev->dev, slave_id, port_num); + if (slave_port < 0) /* this port isn't available for the VF */ + continue; + + tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE]; + form_cache_ag = get_cached_alias_guid(dev, port_num, + (NUM_ALIAS_GUID_IN_REC * block_num) + i); + /* + * Check if guid is not the same as in the cache, + * If it is different, wait for the snoop_smp or the port mgmt + * change event to update the slave on its port state change + */ + if (tmp_cur_ag != form_cache_ag) + continue; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + required_value = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]; + + if (required_value == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + required_value = 0; + + if (tmp_cur_ag == required_value) { + rec->guid_indexes = rec->guid_indexes & + ~mlx4_ib_get_aguid_comp_mask_from_ix(i); + } else { + /* may notify port down if value is 0 */ + if (tmp_cur_ag != MLX4_NOT_SET_GUID) { + spin_unlock_irqrestore(&dev->sriov. 
+ alias_guid.ag_work_lock, flags); + continue; + } + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, + flags); + mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num); + /*2 cases: Valid GUID, and Invalid Guid*/ + + if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/ + prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num); + new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + &gen_event); + pr_debug("slave: %d, port: %d prev_port_state: %d," + " new_port_state: %d, gen_event: %d\n", + slave_id, port_num, prev_state, new_state, gen_event); + if (gen_event == SLAVE_PORT_GEN_EVENT_UP) { + pr_debug("sending PORT_UP event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, + port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE); + } + } else { /* request to invalidate GUID */ + set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, + &gen_event); + if (gen_event == SLAVE_PORT_GEN_EVENT_DOWN) { + pr_debug("sending PORT DOWN event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, + slave_id, + port_num, + MLX4_PORT_CHANGE_SUBTYPE_DOWN); + } + } + } +} + +static void aliasguid_query_handler(int status, + struct ib_sa_guidinfo_rec *guid_rec, + void *context) +{ + struct mlx4_ib_dev *dev; + struct mlx4_alias_guid_work_context *cb_ctx = context; + u8 port_index ; + int i; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags, flags1; + ib_sa_comp_mask declined_guid_indexes = 0; + ib_sa_comp_mask applied_guid_indexes = 0; + unsigned int resched_delay_sec = 0; + + if (!context) + return; + + dev = cb_ctx->dev; + port_index = cb_ctx->port - 1; + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[cb_ctx->block_num]; + + if (status) { + pr_debug("(port: %d) failed: status = %d\n", + cb_ctx->port, status); + rec->time_to_run = ktime_get_boot_ns() + 1 * NSEC_PER_SEC; + goto out; + } + + if (guid_rec->block_num != cb_ctx->block_num) { + pr_err("block num mismatch: %d != %d\n", + cb_ctx->block_num, guid_rec->block_num); + goto out; + } + + pr_debug("lid/port: %d/%d, block_num: %d\n", + be16_to_cpu(guid_rec->lid), cb_ctx->port, + guid_rec->block_num); + + rec = &dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[guid_rec->block_num]; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) { + __be64 sm_response, required_val; + + if (!(cb_ctx->guid_indexes & + mlx4_ib_get_aguid_comp_mask_from_ix(i))) + continue; + sm_response = *(__be64 *)&guid_rec->guid_info_list + [i * GUID_REC_SIZE]; + required_val = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]; + if (cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE) { + if (required_val == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + goto next_entry; + + /* A new value was set till we got the response */ + pr_debug("need to set new value %llx, record num %d, block_num:%d\n", + be64_to_cpu(required_val), + i, guid_rec->block_num); + goto entry_declined; + } + + /* check if the SM didn't assign one of the records. + * if it didn't, re-ask for. + */ + if (sm_response == MLX4_NOT_SET_GUID) { + if (rec->guids_retry_schedule[i] == 0) + mlx4_ib_warn(&dev->ib_dev, + "%s:Record num %d in block_num: %d was declined by SM\n", + __func__, i, + guid_rec->block_num); + goto entry_declined; + } else { + /* properly assigned record. 
*/ + /* We save the GUID we just got from the SM in the + * admin_guid in order to be persistent, and in the + * request from the sm the process will ask for the same GUID */ + if (required_val && + sm_response != required_val) { + /* Warn only on first retry */ + if (rec->guids_retry_schedule[i] == 0) + mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set" + " admin guid after SysAdmin " + "configuration. " + "Record num %d in block_num:%d " + "was declined by SM, " + "new val(0x%llx) was kept, SM returned (0x%llx)\n", + __func__, i, + guid_rec->block_num, + be64_to_cpu(required_val), + be64_to_cpu(sm_response)); + goto entry_declined; + } else { + *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] = + sm_response; + if (required_val == 0) + mlx4_set_admin_guid(dev->dev, + sm_response, + (guid_rec->block_num + * NUM_ALIAS_GUID_IN_REC) + i, + cb_ctx->port); + goto next_entry; + } + } +entry_declined: + declined_guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + rec->guids_retry_schedule[i] = + (rec->guids_retry_schedule[i] == 0) ? 1 : + min((unsigned int)60, + rec->guids_retry_schedule[i] * 2); + /* using the minimum value among all entries in that record */ + resched_delay_sec = (resched_delay_sec == 0) ? + rec->guids_retry_schedule[i] : + min(resched_delay_sec, + rec->guids_retry_schedule[i]); + continue; + +next_entry: + rec->guids_retry_schedule[i] = 0; + } + + applied_guid_indexes = cb_ctx->guid_indexes & ~declined_guid_indexes; + if (declined_guid_indexes || + rec->guid_indexes & ~(applied_guid_indexes)) { + pr_debug("record=%d wasn't fully set, guid_indexes=0x%llx applied_indexes=0x%llx, declined_indexes=0x%llx\n", + guid_rec->block_num, + be64_to_cpu((__force __be64)rec->guid_indexes), + be64_to_cpu((__force __be64)applied_guid_indexes), + be64_to_cpu((__force __be64)declined_guid_indexes)); + rec->time_to_run = ktime_get_boot_ns() + + resched_delay_sec * NSEC_PER_SEC; + } else { + rec->status = MLX4_GUID_INFO_STATUS_SET; + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + /* + The func is call here to close the cases when the + sm doesn't send smp, so in the sa response the driver + notifies the slave. + */ + mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num, + cb_ctx->port, + guid_rec->guid_info_list); +out: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) { + get_low_record_time_index(dev, port_index, &resched_delay_sec); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq, + &dev->sriov.alias_guid.ports_guid[port_index]. + alias_guid_work, + msecs_to_jiffies(resched_delay_sec * 1000)); + } + if (cb_ctx->sa_query) { + list_del(&cb_ctx->list); + kfree(cb_ctx); + } else + complete(&cb_ctx->done); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) +{ + int i; + u64 cur_admin_val; + ib_sa_comp_mask comp_mask = 0; + + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status + = MLX4_GUID_INFO_STATUS_SET; + + /* calculate the comp_mask for that record.*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + cur_admin_val = + *(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. 
+ all_rec_per_port[index].all_recs[GUID_REC_SIZE * i]; + /* + check the admin value: if it's for delete (~00LL) or + it is the first guid of the first record (hw guid) or + the records is not in ownership of the sysadmin and the sm doesn't + need to assign GUIDs, then don't put it up for assignment. + */ + if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val || + (!index && !i)) + continue; + comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + } + dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes |= comp_mask; + if (dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes) + dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].status = MLX4_GUID_INFO_STATUS_IDLE; + +} + +static int set_guid_rec(struct ib_device *ibdev, + struct mlx4_next_alias_guid_work *rec) +{ + int err; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_guidinfo_rec guid_info_rec; + ib_sa_comp_mask comp_mask; + struct ib_port_attr attr; + struct mlx4_alias_guid_work_context *callback_context; + unsigned long resched_delay, flags, flags1; + u8 port = rec->port + 1; + int index = rec->block_num; + struct mlx4_sriov_alias_guid_info_rec_det *rec_det = &rec->rec_det; + struct list_head *head = + &dev->sriov.alias_guid.ports_guid[port - 1].cb_list; + + memset(&attr, 0, sizeof(attr)); + err = __mlx4_ib_query_port(ibdev, port, &attr, 1); + if (err) { + pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n", + err, port); + return err; + } + /*check the port was configured by the sm, otherwise no need to send */ + if (attr.state != IB_PORT_ACTIVE) { + pr_debug("port %d not active...rescheduling\n", port); + resched_delay = 5 * HZ; + err = -EAGAIN; + goto new_schedule; + } + + callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL); + if (!callback_context) { + err = -ENOMEM; + resched_delay = HZ * 5; + goto new_schedule; + } + callback_context->port = port; + callback_context->dev = dev; + callback_context->block_num = index; + callback_context->guid_indexes = rec_det->guid_indexes; + callback_context->method = rec->method; + + memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); + + guid_info_rec.lid = ib_lid_be16(attr.lid); + guid_info_rec.block_num = index; + + memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, + GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC); + comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM | + rec_det->guid_indexes; + + init_completion(&callback_context->done); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_add_tail(&callback_context->list, head); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + + callback_context->query_id = + ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client, + ibdev, port, &guid_info_rec, + comp_mask, rec->method, 1000, + GFP_KERNEL, aliasguid_query_handler, + callback_context, + &callback_context->sa_query); + if (callback_context->query_id < 0) { + pr_debug("ib_sa_guid_info_rec_query failed, query_id: " + "%d. 
will reschedule to the next 1 sec.\n", + callback_context->query_id); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_del(&callback_context->list); + kfree(callback_context); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + resched_delay = 1 * HZ; + err = -EAGAIN; + goto new_schedule; + } + err = 0; + goto out; + +new_schedule: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + invalidate_guid_record(dev, port, index); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + resched_delay); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + +out: + return err; +} + +static void mlx4_ib_guid_port_init(struct mlx4_ib_dev *dev, int port) +{ + int j, k, entry; + __be64 guid; + + /*Check if the SM doesn't need to assign the GUIDs*/ + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) { + entry = j * NUM_ALIAS_GUID_IN_REC + k; + /* no request for the 0 entry (hw guid) */ + if (!entry || entry > dev->dev->persist->num_vfs || + !mlx4_is_slave_active(dev->dev, entry)) + continue; + guid = mlx4_get_admin_guid(dev->dev, entry, port); + *(__be64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[j].all_recs + [GUID_REC_SIZE * k] = guid; + pr_debug("guid was set, entry=%d, val=0x%llx, port=%d\n", + entry, + be64_to_cpu(guid), + port); + } + } +} +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) +{ + int i; + unsigned long flags, flags1; + + pr_debug("port %d\n", port); + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + + if (dev->sriov.alias_guid.ports_guid[port - 1].state_flags & + GUID_STATE_NEED_PORT_INIT) { + mlx4_ib_guid_port_init(dev, port); + dev->sriov.alias_guid.ports_guid[port - 1].state_flags &= + (~GUID_STATE_NEED_PORT_INIT); + } + for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++) + invalidate_guid_record(dev, port, i); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) { + /* + make sure no work waits in the queue, if the work is already + queued(not on the timer) the cancel will fail. That is not a problem + because we just want the work started. + */ + cancel_delayed_work(&dev->sriov.alias_guid. + ports_guid[port - 1].alias_guid_work); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void set_required_record(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *next_rec, + int record_index) +{ + int i; + int lowset_time_entry = -1; + int lowest_time = 0; + ib_sa_comp_mask delete_guid_indexes = 0; + ib_sa_comp_mask set_guid_indexes = 0; + struct mlx4_sriov_alias_guid_info_rec_det *rec = + &dev->sriov.alias_guid.ports_guid[port]. 
+ all_rec_per_port[record_index]; + + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + if (!(rec->guid_indexes & + mlx4_ib_get_aguid_comp_mask_from_ix(i))) + continue; + + if (*(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + delete_guid_indexes |= + mlx4_ib_get_aguid_comp_mask_from_ix(i); + else + set_guid_indexes |= + mlx4_ib_get_aguid_comp_mask_from_ix(i); + + if (lowset_time_entry == -1 || rec->guids_retry_schedule[i] <= + lowest_time) { + lowset_time_entry = i; + lowest_time = rec->guids_retry_schedule[i]; + } + } + + memcpy(&next_rec->rec_det, rec, sizeof(*rec)); + next_rec->port = port; + next_rec->block_num = record_index; + + if (*(__be64 *)&rec->all_recs[lowset_time_entry * GUID_REC_SIZE] == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) { + next_rec->rec_det.guid_indexes = delete_guid_indexes; + next_rec->method = MLX4_GUID_INFO_RECORD_DELETE; + } else { + next_rec->rec_det.guid_indexes = set_guid_indexes; + next_rec->method = MLX4_GUID_INFO_RECORD_SET; + } +} + +/* return index of record that should be updated based on lowest + * rescheduled time + */ +static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, + int *resched_delay_sec) +{ + int record_index = -1; + u64 low_record_time = 0; + struct mlx4_sriov_alias_guid_info_rec_det rec; + int j; + + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + rec = dev->sriov.alias_guid.ports_guid[port]. + all_rec_per_port[j]; + if (rec.status == MLX4_GUID_INFO_STATUS_IDLE && + rec.guid_indexes) { + if (record_index == -1 || + rec.time_to_run < low_record_time) { + record_index = j; + low_record_time = rec.time_to_run; + } + } + } + if (resched_delay_sec) { + u64 curr_time = ktime_get_boot_ns(); + + *resched_delay_sec = (low_record_time < curr_time) ? 
0 : + div_u64((low_record_time - curr_time), NSEC_PER_SEC); + } + + return record_index; +} + +/* The function returns the next record that was + * not configured (or failed to be configured) */ +static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *rec) +{ + unsigned long flags; + int record_index; + int ret = 0; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + record_index = get_low_record_time_index(dev, port, NULL); + + if (record_index < 0) { + ret = -ENOENT; + goto out; + } + + set_required_record(dev, port, rec, record_index); +out: + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + return ret; +} + +static void alias_guid_work(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + int ret = 0; + struct mlx4_next_alias_guid_work *rec; + struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port = + container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det, + alias_guid_work); + struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent; + struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid, + struct mlx4_ib_sriov, + alias_guid); + struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov); + + rec = kzalloc(sizeof *rec, GFP_KERNEL); + if (!rec) + return; + + pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1); + ret = get_next_record_to_update(dev, sriov_alias_port->port, rec); + if (ret) { + pr_debug("No more records to update.\n"); + goto out; + } + + set_guid_rec(&dev->ib_dev, rec); +out: + kfree(rec); +} + + +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port) +{ + unsigned long flags, flags1; + + if (!mlx4_is_master(dev->dev)) + return; + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) { + /* If there is pending one should cancel then run, otherwise + * won't run till previous one is ended as same work + * struct is used. + */ + cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[port]. 
+ alias_guid_work); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq, + &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) +{ + int i; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct mlx4_alias_guid_work_context *cb_ctx; + struct mlx4_sriov_alias_guid_port_rec_det *det; + struct ib_sa_query *sa_query; + unsigned long flags; + + for (i = 0 ; i < dev->num_ports; i++) { + cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work); + det = &sriov->alias_guid.ports_guid[i]; + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + while (!list_empty(&det->cb_list)) { + cb_ctx = list_entry(det->cb_list.next, + struct mlx4_alias_guid_work_context, + list); + sa_query = cb_ctx->sa_query; + cb_ctx->sa_query = NULL; + list_del(&cb_ctx->list); + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + ib_sa_cancel_query(cb_ctx->query_id, sa_query); + wait_for_completion(&cb_ctx->done); + kfree(cb_ctx); + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + } + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + } + for (i = 0 ; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + } + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); +} + +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) +{ + char alias_wq_name[15]; + int ret = 0; + int i, j; + union ib_gid gid; + + if (!mlx4_is_master(dev->dev)) + return 0; + dev->sriov.alias_guid.sa_client = + kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL); + if (!dev->sriov.alias_guid.sa_client) + return -ENOMEM; + + ib_sa_register_client(dev->sriov.alias_guid.sa_client); + + spin_lock_init(&dev->sriov.alias_guid.ag_work_lock); + + for (i = 1; i <= dev->num_ports; ++i) { + if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) { + ret = -EFAULT; + goto err_unregister; + } + } + + for (i = 0 ; i < dev->num_ports; i++) { + memset(&dev->sriov.alias_guid.ports_guid[i], 0, + sizeof (struct mlx4_sriov_alias_guid_port_rec_det)); + dev->sriov.alias_guid.ports_guid[i].state_flags |= + GUID_STATE_NEED_PORT_INIT; + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + /* mark each val as it was deleted */ + memset(dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j].all_recs, 0xFF, + sizeof(dev->sriov.alias_guid.ports_guid[i]. 
+ all_rec_per_port[j].all_recs)); + } + INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list); + /*prepare the records, set them to be allocated by sm*/ + if (mlx4_ib_sm_guid_assign) + for (j = 1; j < NUM_ALIAS_GUID_PER_PORT; j++) + mlx4_set_admin_guid(dev->dev, 0, j, i + 1); + for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) + invalidate_guid_record(dev, i + 1, j); + + dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; + dev->sriov.alias_guid.ports_guid[i].port = i; + + snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); + dev->sriov.alias_guid.ports_guid[i].wq = + alloc_ordered_workqueue(alias_wq_name, WQ_MEM_RECLAIM); + if (!dev->sriov.alias_guid.ports_guid[i].wq) { + ret = -ENOMEM; + goto err_thread; + } + INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work, + alias_guid_work); + } + return 0; + +err_thread: + for (--i; i >= 0; i--) { + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + dev->sriov.alias_guid.ports_guid[i].wq = NULL; + } + +err_unregister: + ib_sa_unregister_client(dev->sriov.alias_guid.sa_client); + kfree(dev->sriov.alias_guid.sa_client); + dev->sriov.alias_guid.sa_client = NULL; + pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret); + return ret; +} diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c new file mode 100644 index 0000000..fedaf82 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <rdma/ib_mad.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
+#include <linux/idr.h>
+#include <rdma/ib_cm.h>
+
+#include "mlx4_ib.h"
+
+#define CM_CLEANUP_CACHE_TIMEOUT (5 * HZ)
+
+struct id_map_entry {
+	struct rb_node node;
+
+	u32 sl_cm_id;
+	u32 pv_cm_id;
+	int slave_id;
+	int scheduled_delete;
+	struct mlx4_ib_dev *dev;
+
+	struct list_head list;
+	struct delayed_work timeout;
+};
+
+struct cm_generic_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+};
+
+struct cm_sidr_generic_msg {
+	struct ib_mad_hdr hdr;
+	__be32 request_id;
+};
+
+struct cm_req_msg {
+	unsigned char unused[0x60];
+	union ib_gid primary_path_sgid;
+};
+
+
+static void set_local_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+	if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		struct cm_sidr_generic_msg *msg =
+			(struct cm_sidr_generic_msg *)mad;
+		msg->request_id = cpu_to_be32(cm_id);
+	} else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+		pr_err("trying to set local_comm_id in SIDR_REP\n");
+		return;
+	} else {
+		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+		msg->local_comm_id = cpu_to_be32(cm_id);
+	}
+}
+
+static u32 get_local_comm_id(struct ib_mad *mad)
+{
+	if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		struct cm_sidr_generic_msg *msg =
+			(struct cm_sidr_generic_msg *)mad;
+		return be32_to_cpu(msg->request_id);
+	} else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+		pr_err("trying to set local_comm_id in SIDR_REP\n");
+		return -1;
+	} else {
+		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+		return be32_to_cpu(msg->local_comm_id);
+	}
+}
+
+static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+	if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+		struct cm_sidr_generic_msg *msg =
+			(struct cm_sidr_generic_msg *)mad;
+		msg->request_id = cpu_to_be32(cm_id);
+	} else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		pr_err("trying to set remote_comm_id in SIDR_REQ\n");
+		return;
+	} else {
+		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+		msg->remote_comm_id = cpu_to_be32(cm_id);
+	}
+}
+
+static u32 get_remote_comm_id(struct ib_mad *mad)
+{
+	if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+		struct cm_sidr_generic_msg *msg =
+			(struct cm_sidr_generic_msg *)mad;
+		return be32_to_cpu(msg->request_id);
+	} else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		pr_err("trying to set remote_comm_id in SIDR_REQ\n");
+		return -1;
+	} else {
+		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+		return be32_to_cpu(msg->remote_comm_id);
+	}
+}
+
+static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad)
+{
+	struct cm_req_msg *msg = (struct cm_req_msg *)mad;
+
+	return msg->primary_path_sgid;
+}
+
+/* Lock should be taken before called */
+static struct id_map_entry *
+id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id)
+{
+	struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+	struct rb_node *node = sl_id_map->rb_node;
+
+	while (node) {
+		struct id_map_entry *id_map_entry =
+			rb_entry(node, struct id_map_entry, node);
+
+		if (id_map_entry->sl_cm_id > sl_cm_id)
+			node = node->rb_left;
+		else if (id_map_entry->sl_cm_id < sl_cm_id)
+			node = node->rb_right;
+		else if (id_map_entry->slave_id > slave_id)
+			node = node->rb_left;
+		else if (id_map_entry->slave_id < slave_id)
+			node = node->rb_right;
+		else
+			return id_map_entry;
+	}
+	return NULL;
+}
+
+static void id_map_ent_timeout(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct id_map_entry *ent =
container_of(delay, struct id_map_entry, timeout); + struct id_map_entry *db_ent, *found_ent; + struct mlx4_ib_dev *dev = ent->dev; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + int pv_id = (int) ent->pv_cm_id; + + spin_lock(&sriov->id_map_lock); + db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id); + if (!db_ent) + goto out; + found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_id); + +out: + list_del(&ent->list); + spin_unlock(&sriov->id_map_lock); + kfree(ent); +} + +static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct id_map_entry *ent, *found_ent; + + spin_lock(&sriov->id_map_lock); + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id); + if (!ent) + goto out; + found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_cm_id); +out: + spin_unlock(&sriov->id_map_lock); +} + +static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node **link = &sl_id_map->rb_node, *parent = NULL; + struct id_map_entry *ent; + int slave_id = new->slave_id; + int sl_cm_id = new->sl_cm_id; + + ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id); + if (ent) { + pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n", + sl_cm_id); + + rb_replace_node(&ent->node, &new->node, sl_id_map); + return; + } + + /* Go to the bottom of the tree */ + while (*link) { + parent = *link; + ent = rb_entry(parent, struct id_map_entry, node); + + if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id)) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, sl_id_map); +} + +static struct id_map_entry * +id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) +{ + int ret; + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL); + if (!ent) + return ERR_PTR(-ENOMEM); + + ent->sl_cm_id = sl_cm_id; + ent->slave_id = slave_id; + ent->scheduled_delete = 0; + ent->dev = to_mdev(ibdev); + INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout); + + idr_preload(GFP_KERNEL); + spin_lock(&to_mdev(ibdev)->sriov.id_map_lock); + + ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + ent->pv_cm_id = (u32)ret; + sl_id_map_add(ibdev, ent); + list_add_tail(&ent->list, &sriov->cm_list); + } + + spin_unlock(&sriov->id_map_lock); + idr_preload_end(); + + if (ret >= 0) + return ent; + + /*error flow*/ + kfree(ent); + mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret); + return ERR_PTR(-ENOMEM); +} + +static struct id_map_entry * +id_map_get(struct ib_device *ibdev, int *pv_cm_id, int slave_id, int sl_cm_id) +{ + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + spin_lock(&sriov->id_map_lock); + if (*pv_cm_id == -1) { + ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id); + if (ent) + *pv_cm_id = (int) ent->pv_cm_id; + } else + ent = (struct id_map_entry 
*)idr_find(&sriov->pv_id_table, *pv_cm_id); + spin_unlock(&sriov->id_map_lock); + + return ent; +} + +static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + unsigned long flags; + + spin_lock(&sriov->id_map_lock); + spin_lock_irqsave(&sriov->going_down_lock, flags); + /*make sure that there is no schedule inside the scheduled work.*/ + if (!sriov->is_going_down) { + id->scheduled_delete = 1; + schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); + } + spin_unlock_irqrestore(&sriov->going_down_lock, flags); + spin_unlock(&sriov->id_map_lock); +} + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad) +{ + struct id_map_entry *id; + u32 sl_cm_id; + int pv_cm_id = -1; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_REP_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + sl_cm_id = get_local_comm_id(mad); + id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); + if (id) + goto cont; + id = id_map_alloc(ibdev, slave_id, sl_cm_id); + if (IS_ERR(id)) { + mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", + __func__, slave_id, sl_cm_id); + return PTR_ERR(id); + } + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + return 0; + } else { + sl_cm_id = get_local_comm_id(mad); + id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); + } + + if (!id) { + pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n", + slave_id, sl_cm_id); + return -EINVAL; + } + +cont: + set_local_comm_id(mad, id->pv_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) + id_map_find_del(ibdev, pv_cm_id); + + return 0; +} + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad) +{ + u32 pv_cm_id; + struct id_map_entry *id; + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + union ib_gid gid; + + if (!slave) + return 0; + + gid = gid_from_req_msg(ibdev, mad); + *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); + if (*slave < 0) { + mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n", + be64_to_cpu(gid.global.interface_id)); + return -ENOENT; + } + return 0; + } + + pv_cm_id = get_remote_comm_id(mad); + id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1); + + if (!id) { + pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id); + return -ENOENT; + } + + if (slave) + *slave = id->slave_id; + set_remote_comm_id(mad, id->sl_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) + schedule_delayed(ibdev, id); + else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) { + id_map_find_del(ibdev, (int) pv_cm_id); + } + + return 0; +} + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev) +{ + spin_lock_init(&dev->sriov.id_map_lock); + INIT_LIST_HEAD(&dev->sriov.cm_list); + dev->sriov.sl_id_map = RB_ROOT; + idr_init(&dev->sriov.pv_id_table); +} + +/* slave = -1 ==> all slaves */ +/* TBD -- call paravirt clean for single slave. 
Need for slave RESET event */ +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) +{ + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct list_head lh; + struct rb_node *nd; + int need_flush = 0; + struct id_map_entry *map, *tmp_map; + /* cancel all delayed work queue entries */ + INIT_LIST_HEAD(&lh); + spin_lock(&sriov->id_map_lock); + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave < 0 || slave == map->slave_id) { + if (map->scheduled_delete) + need_flush |= !cancel_delayed_work(&map->timeout); + } + } + + spin_unlock(&sriov->id_map_lock); + + if (need_flush) + flush_scheduled_work(); /* make sure all timers were flushed */ + + /* now, remove all leftover entries from databases*/ + spin_lock(&sriov->id_map_lock); + if (slave < 0) { + while (rb_first(sl_id_map)) { + struct id_map_entry *ent = + rb_entry(rb_first(sl_id_map), + struct id_map_entry, node); + + rb_erase(&ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id); + } + list_splice_init(&dev->sriov.cm_list, &lh); + } else { + /* first, move nodes belonging to slave to db remove list */ + nd = rb_first(sl_id_map); + while (nd) { + struct id_map_entry *ent = + rb_entry(nd, struct id_map_entry, node); + nd = rb_next(nd); + if (ent->slave_id == slave) + list_move_tail(&ent->list, &lh); + } + /* remove those nodes from databases */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + rb_erase(&map->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id); + } + + /* add remaining nodes from cm_list */ + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave == map->slave_id) + list_move_tail(&map->list, &lh); + } + } + + spin_unlock(&sriov->id_map_lock); + + /* free any map entries left behind due to cancel_delayed_work above */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + list_del(&map->list); + kfree(map); + } +} diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c new file mode 100644 index 0000000..82adc0d --- /dev/null +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -0,0 +1,980 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+#include <rdma/mlx4-abi.h>
+
+static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
+{
+	struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+	ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
+{
+	struct ib_event event;
+	struct ib_cq *ibcq;
+
+	if (type != MLX4_EVENT_TYPE_CQ_ERROR) {
+		pr_warn("Unexpected event type %d "
+			"on CQ %06x\n", type, cq->cqn);
+		return;
+	}
+
+	ibcq = &to_mibcq(cq)->ibcq;
+	if (ibcq->event_handler) {
+		event.device = ibcq->device;
+		event.event = IB_EVENT_CQ_ERR;
+		event.element.cq = ibcq;
+		ibcq->event_handler(&event, ibcq->cq_context);
+	}
+}
+
+static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
+{
+	return mlx4_buf_offset(&buf->buf, n * buf->entry_size);
+}
+
+static void *get_cqe(struct mlx4_ib_cq *cq, int n)
+{
+	return get_cqe_from_buf(&cq->buf, n);
+}
+
+static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
+{
+	struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+	struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe);
+
+	return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+		!!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+}
+
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
+{
+	return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+	struct mlx4_ib_cq *mcq = to_mcq(cq);
+	struct mlx4_ib_dev *dev = to_mdev(cq->device);
+
+	return mlx4_cq_modify(dev->dev, &mcq->mcq, cq_count, cq_period);
+}
+
+static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent)
+{
+	int err;
+
+	err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size,
+			     PAGE_SIZE * 2, &buf->buf);
+
+	if (err)
+		goto out;
+
+	buf->entry_size = dev->dev->caps.cqe_size;
+	err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift,
+			    &buf->mtt);
+	if (err)
+		goto err_buf;
+
+	err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf);
+	if (err)
+		goto err_mtt;
+
+	return 0;
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+err_buf:
+	mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf);
+
+out:
+	return err;
+}
+
+static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe)
+{
+	mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
+}
+
+static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
+			       struct mlx4_ib_cq_buf *buf, struct ib_umem **umem,
+			       u64 buf_addr, int cqe)
+{
+	int err;
+	int cqe_size = dev->dev->caps.cqe_size;
+	int shift;
+	int n;
+
+	*umem = ib_umem_get(context, buf_addr, cqe * cqe_size,
+			    IB_ACCESS_LOCAL_WRITE, 1);
+	if (IS_ERR(*umem))
+		return PTR_ERR(*umem);
+
+	n = ib_umem_page_count(*umem);
+	shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n);
+	err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt);
+
+	if (err)
+		goto err_buf;
+
+	err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, *umem);
+	if (err)
+		goto err_mtt;
+
+	return 0;
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+err_buf:
+	ib_umem_release(*umem);
+
+	return err;
+}
+
+#define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION
+struct ib_cq *mlx4_ib_create_cq(struct ib_device
*ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + int vector = attr->comp_vector; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_cq *cq; + struct mlx4_uar *uar; + int err; + + if (entries < 1 || entries > dev->dev->caps.max_cqes) + return ERR_PTR(-EINVAL); + + if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED) + return ERR_PTR(-EINVAL); + + cq = kmalloc(sizeof *cq, GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + entries = roundup_pow_of_two(entries + 1); + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + cq->create_flags = attr->flags; + INIT_LIST_HEAD(&cq->send_qp_list); + INIT_LIST_HEAD(&cq->recv_qp_list); + + if (context) { + struct mlx4_ib_create_cq ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_cq; + } + + err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem, + ucmd.buf_addr, entries); + if (err) + goto err_cq; + + err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_mtt; + + uar = &to_mucontext(context)->uar; + cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS; + } else { + err = mlx4_db_alloc(dev->dev, &cq->db, 1); + if (err) + goto err_cq; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + *cq->mcq.set_ci_db = 0; + *cq->mcq.arm_db = 0; + + err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries); + if (err) + goto err_db; + + uar = &dev->priv_uar; + cq->mcq.usage = MLX4_RES_USAGE_DRIVER; + } + + if (dev->eq_table) + vector = dev->eq_table[vector % ibdev->num_comp_vectors]; + + err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, + cq->db.dma, &cq->mcq, vector, 0, + !!(cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)); + if (err) + goto err_dbmap; + + if (context) + cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp; + else + cq->mcq.comp = mlx4_ib_cq_comp; + cq->mcq.event = mlx4_ib_cq_event; + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) { + err = -EFAULT; + goto err_cq_free; + } + + return &cq->ibcq; + +err_cq_free: + mlx4_cq_free(dev->dev, &cq->mcq); + +err_dbmap: + if (context) + mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt); + + if (context) + ib_umem_release(cq->umem); + else + mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + +err_db: + if (!context) + mlx4_db_free(dev->dev, &cq->db); + +err_cq: + kfree(cq); + + return ERR_PTR(err); +} + +static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, + int entries) +{ + int err; + + if (cq->resize_buf) + return -EBUSY; + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = mlx4_ib_alloc_cq_buf(dev, &cq->resize_buf->buf, entries); + if (err) { + kfree(cq->resize_buf); + cq->resize_buf = NULL; + return err; + } + + cq->resize_buf->cqe = entries - 1; + + return 0; +} + +static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, + int entries, struct ib_udata *udata) +{ + struct mlx4_ib_resize_cq ucmd; + int err; + + if (cq->resize_umem) + return -EBUSY; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return -EFAULT; + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf, + &cq->resize_umem, 
ucmd.buf_addr, entries); + if (err) { + kfree(cq->resize_buf); + cq->resize_buf = NULL; + return err; + } + + cq->resize_buf->cqe = entries - 1; + + return 0; +} + +static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) +{ + u32 i; + + i = cq->mcq.cons_index; + while (get_sw_cqe(cq, i)) + ++i; + + return i - cq->mcq.cons_index; +} + +static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq) +{ + struct mlx4_cqe *cqe, *new_cqe; + int i; + int cqe_size = cq->buf.entry_size; + int cqe_inc = cqe_size == 64 ? 1 : 0; + + i = cq->mcq.cons_index; + cqe = get_cqe(cq, i & cq->ibcq.cqe); + cqe += cqe_inc; + + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { + new_cqe = get_cqe_from_buf(&cq->resize_buf->buf, + (i + 1) & cq->resize_buf->cqe); + memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size); + new_cqe += cqe_inc; + + new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | + (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); + cqe = get_cqe(cq, ++i & cq->ibcq.cqe); + cqe += cqe_inc; + } + ++cq->mcq.cons_index; +} + +int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibcq->device); + struct mlx4_ib_cq *cq = to_mcq(ibcq); + struct mlx4_mtt mtt; + int outst_cqe; + int err; + + mutex_lock(&cq->resize_mutex); + if (entries < 1 || entries > dev->dev->caps.max_cqes) { + err = -EINVAL; + goto out; + } + + entries = roundup_pow_of_two(entries + 1); + if (entries == ibcq->cqe + 1) { + err = 0; + goto out; + } + + if (entries > dev->dev->caps.max_cqes + 1) { + err = -EINVAL; + goto out; + } + + if (ibcq->uobject) { + err = mlx4_alloc_resize_umem(dev, cq, entries, udata); + if (err) + goto out; + } else { + /* Can't be smaller than the number of outstanding CQEs */ + outst_cqe = mlx4_ib_get_outstanding_cqes(cq); + if (entries < outst_cqe + 1) { + err = -EINVAL; + goto out; + } + + err = mlx4_alloc_resize_buf(dev, cq, entries); + if (err) + goto out; + } + + mtt = cq->buf.mtt; + + err = mlx4_cq_resize(dev->dev, &cq->mcq, entries, &cq->resize_buf->buf.mtt); + if (err) + goto err_buf; + + mlx4_mtt_cleanup(dev->dev, &mtt); + if (ibcq->uobject) { + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + ib_umem_release(cq->umem); + cq->umem = cq->resize_umem; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + } else { + struct mlx4_ib_cq_buf tmp_buf; + int tmp_cqe = 0; + + spin_lock_irq(&cq->lock); + if (cq->resize_buf) { + mlx4_ib_cq_resize_copy_cqes(cq); + tmp_buf = cq->buf; + tmp_cqe = cq->ibcq.cqe; + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + } + spin_unlock_irq(&cq->lock); + + if (tmp_cqe) + mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe); + } + + goto out; + +err_buf: + mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt); + if (!ibcq->uobject) + mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf, + cq->resize_buf->cqe); + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + + if (cq->resize_umem) { + ib_umem_release(cq->resize_umem); + cq->resize_umem = NULL; + } + +out: + mutex_unlock(&cq->resize_mutex); + + return err; +} + +int mlx4_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx4_ib_dev *dev = to_mdev(cq->device); + struct mlx4_ib_cq *mcq = to_mcq(cq); + + mlx4_cq_free(dev->dev, &mcq->mcq); + mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt); + + if (cq->uobject) { + mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db); + 
ib_umem_release(mcq->umem); + } else { + mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); + mlx4_db_free(dev->dev, &mcq->db); + } + + kfree(mcq); + + return 0; +} + +static void dump_cqe(void *cqe) +{ + __be32 *buf = cqe; + + pr_debug("CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", + be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]), + be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]), + be32_to_cpu(buf[6]), be32_to_cpu(buf[7])); +} + +static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe, + struct ib_wc *wc) +{ + if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) { + pr_debug("local QP operation err " + "(QPN %06x, WQE index %x, vendor syndrome %02x, " + "opcode = %02x)\n", + be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index), + cqe->vendor_err_syndrome, + cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + dump_cqe(cqe); + } + + switch (cqe->syndrome) { + case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX4_CQE_SYNDROME_WR_FLUSH_ERR: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX4_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX4_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_syndrome; +} + +static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) +{ + return ((status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 | + MLX4_CQE_STATUS_IPV4F | + MLX4_CQE_STATUS_IPV4OPT | + MLX4_CQE_STATUS_IPV6 | + MLX4_CQE_STATUS_IPOK)) == + cpu_to_be16(MLX4_CQE_STATUS_IPV4 | + MLX4_CQE_STATUS_IPOK)) && + (status & cpu_to_be16(MLX4_CQE_STATUS_UDP | + MLX4_CQE_STATUS_TCP)) && + checksum == cpu_to_be16(0xffff); +} + +static void use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, + unsigned tail, struct mlx4_cqe *cqe, int is_eth) +{ + struct mlx4_ib_proxy_sqp_hdr *hdr; + + ib_dma_sync_single_for_cpu(qp->ibqp.device, + qp->sqp_proxy_rcv[tail].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); + wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); + wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; + wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? 
(IB_WC_GRH) : 0; + wc->dlid_path_bits = 0; + + if (is_eth) { + wc->slid = 0; + wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); + memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); + memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); + wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); + } +} + +static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled, int is_send) +{ + struct mlx4_ib_wq *wq; + unsigned cur; + int i; + + wq = is_send ? &qp->sq : &qp->rq; + cur = wq->head - wq->tail; + + if (cur == 0) + return; + + for (i = 0; i < cur && *npolled < num_entries; i++) { + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX4_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + (*npolled)++; + wc->qp = &qp->ibqp; + wc++; + } +} + +static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx4_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and return + * simulated FLUSH_ERR completions + */ + list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 1); + if (*npolled >= num_entries) + goto out; + } + + list_for_each_entry(qp, &cq->recv_qp_list, cq_recv_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 0); + if (*npolled >= num_entries) + goto out; + } + +out: + return; +} + +static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, + struct mlx4_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx4_cqe *cqe; + struct mlx4_qp *mqp; + struct mlx4_ib_wq *wq; + struct mlx4_ib_srq *srq; + struct mlx4_srq *msrq = NULL; + int is_send; + int is_error; + int is_eth; + u32 g_mlpath_rqpn; + u16 wqe_ctr; + unsigned tail = 0; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + if (cq->buf.entry_size == 64) + cqe++; + + ++cq->mcq.cons_index; + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR; + + /* Resize CQ in progress */ + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) { + if (cq->resize_buf) { + struct mlx4_ib_dev *dev = to_mdev(cq->ibcq.device); + + mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + } + + goto repoll; + } + + if (!*cur_qp || + (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. 
+ */ + mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev, + be32_to_cpu(cqe->vlan_my_qpn)); + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + + if (wc->qp->qp_type == IB_QPT_XRC_TGT) { + u32 srq_num; + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + srq_num = g_mlpath_rqpn & 0xffffff; + /* SRQ is also in the radix tree */ + msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev, + srq_num); + } + + if (is_send) { + wq = &(*cur_qp)->sq; + if (!(*cur_qp)->sq_signal_bits) { + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wq->tail += (u16) (wqe_ctr - (u16) wq->tail); + } + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else if (msrq) { + srq = to_mibsrq(msrq); + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + tail = wq->tail & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[tail]; + ++wq->tail; + } + + if (unlikely(is_error)) { + mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); + return 0; + } + + wc->status = IB_WC_SUCCESS; + + if (is_send) { + wc->wc_flags = 0; + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + /* fall through */ + case MLX4_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX4_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + /* fall through */ + case MLX4_OPCODE_SEND: + case MLX4_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX4_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX4_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_MASKED_ATOMIC_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_MASKED_ATOMIC_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_LSO: + wc->opcode = IB_WC_LSO; + break; + case MLX4_OPCODE_FMR: + wc->opcode = IB_WC_REG_MR; + break; + case MLX4_OPCODE_LOCAL_INVAL: + wc->opcode = IB_WC_LOCAL_INV; + break; + } + } else { + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immed_rss_invalid; + break; + case MLX4_RECV_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid); + break; + case MLX4_RECV_OPCODE_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + break; + case MLX4_RECV_OPCODE_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immed_rss_invalid; + break; + } + + is_eth = (rdma_port_get_link_layer(wc->qp->device, + (*cur_qp)->port) == + IB_LINK_LAYER_ETHERNET); + if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { + if ((*cur_qp)->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + use_tunnel_data(*cur_qp, cq, wc, tail, cqe, + is_eth); + return 0; + } + } + + g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); + wc->src_qp = g_mlpath_rqpn & 0xffffff; + 
wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; + wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0; + wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; + wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, + cqe->checksum) ? IB_WC_IP_CSUM_OK : 0; + if (is_eth) { + wc->slid = 0; + wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; + if (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_CVLAN_PRESENT_MASK) { + wc->vlan_id = be16_to_cpu(cqe->sl_vid) & + MLX4_CQE_VID_MASK; + } else { + wc->vlan_id = 0xffff; + } + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->slid = be16_to_cpu(cqe->rlid); + wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; + wc->vlan_id = 0xffff; + } + } + + return 0; +} + +int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx4_ib_cq *cq = to_mcq(ibcq); + struct mlx4_ib_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device); + + spin_lock_irqsave(&cq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mlx4_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + goto out; + } + + for (npolled = 0; npolled < num_entries; ++npolled) { + if (mlx4_ib_poll_one(cq, &cur_qp, wc + npolled)) + break; + } + + mlx4_cq_set_ci(&cq->mcq); + +out: + spin_unlock_irqrestore(&cq->lock, flags); + + return npolled; +} + +int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + mlx4_cq_arm(&to_mcq(ibcq)->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT, + to_mdev(ibcq->device)->uar_map, + MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock)); + + return 0; +} + +void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) +{ + u32 prod_index; + int nfreed = 0; + struct mlx4_cqe *cqe, *dest; + u8 owner_bit; + int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe += cqe_inc; + + if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { + if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest += cqe_inc; + + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; + memcpy(dest, cqe, sizeof *cqe); + dest->owner_sr_opcode = owner_bit | + (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. 
+ */ + wmb(); + mlx4_cq_set_ci(&cq->mcq); + } +} + +void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) +{ + spin_lock_irq(&cq->lock); + __mlx4_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c new file mode 100644 index 0000000..c517409 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mlx4_ib.h" + +struct mlx4_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; +}; + +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, + struct mlx4_db *db) +{ + struct mlx4_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if (page->user_virt == (virt & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof *page, GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c new file mode 100644 index 0000000..0793a21 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -0,0 +1,2384 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include "mlx4_ib.h" + +enum { + MLX4_IB_VENDOR_CLASS1 = 0x9, + MLX4_IB_VENDOR_CLASS2 = 0xa +}; + +#define MLX4_TUN_SEND_WRID_SHIFT 34 +#define MLX4_TUN_QPN_SHIFT 32 +#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT) +#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT) + +#define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1) +#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3) + + /* Port mgmt change event handling */ + +#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr) +#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask) +#define NUM_IDX_IN_PKEY_TBL_BLK 32 +#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */ +#define GUID_TBL_BLK_NUM_ENTRIES 8 +#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES) + +struct mlx4_mad_rcv_buf { + struct ib_grh grh; + u8 payload[256]; +} __packed; + +struct mlx4_mad_snd_buf { + u8 payload[256]; +} __packed; + +struct mlx4_tunnel_mad { + struct ib_grh grh; + struct mlx4_ib_tunnel_header hdr; + struct ib_mad mad; +} __packed; + +struct mlx4_rcv_tunnel_mad { + struct mlx4_rcv_tunnel_hdr hdr; + struct ib_grh grh; + struct ib_mad mad; +} __packed; + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num); +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num); +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap); + +__be64 mlx4_ib_gen_node_guid(void) +{ +#define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40)) + return cpu_to_be64(NODE_GUID_HI | prandom_u32()); +} + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) +{ + return cpu_to_be64(atomic_inc_return(&ctx->tid)) | + cpu_to_be64(0xff00000000000000LL); +} + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, + int port, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const void *in_mad, void *response_mad) +{ + struct mlx4_cmd_mailbox *inmailbox, 
*outmailbox; + void *inbox; + int err; + u32 in_modifier = port; + u8 op_modifier = 0; + + inmailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + inbox = inmailbox->buf; + + outmailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(outmailbox)) { + mlx4_free_cmd_mailbox(dev->dev, inmailbox); + return PTR_ERR(outmailbox); + } + + memcpy(inbox, in_mad, 256); + + /* + * Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc) + op_modifier |= 0x1; + if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc) + op_modifier |= 0x2; + if (mlx4_is_mfunc(dev->dev) && + (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc)) + op_modifier |= 0x8; + + if (in_wc) { + struct { + __be32 my_qpn; + u32 reserved1; + __be32 rqpn; + u8 sl; + u8 g_path; + u16 reserved2[2]; + __be16 pkey; + u32 reserved3[11]; + u8 grh[40]; + } *ext_info; + + memset(inbox + 256, 0, 256); + ext_info = inbox + 256; + + ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num); + ext_info->rqpn = cpu_to_be32(in_wc->src_qp); + ext_info->sl = in_wc->sl << 4; + ext_info->g_path = in_wc->dlid_path_bits | + (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); + ext_info->pkey = cpu_to_be16(in_wc->pkey_index); + + if (in_grh) + memcpy(ext_info->grh, in_grh, 40); + + op_modifier |= 0x4; + + in_modifier |= ib_lid_cpu16(in_wc->slid) << 16; + } + + err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, + mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED); + + if (!err) + memcpy(response_mad, outmailbox->buf, 256); + + mlx4_free_cmd_mailbox(dev->dev, inmailbox); + mlx4_free_cmd_mailbox(dev->dev, outmailbox); + + return err; +} + +static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) +{ + struct ib_ah *new_ah; + struct rdma_ah_attr ah_attr; + unsigned long flags; + + if (!dev->send_agent[port_num - 1][0]) + return; + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.type = rdma_ah_find_type(&dev->ib_dev, port_num); + rdma_ah_set_dlid(&ah_attr, lid); + rdma_ah_set_sl(&ah_attr, sl); + rdma_ah_set_port_num(&ah_attr, port_num); + + new_ah = rdma_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, + &ah_attr); + if (IS_ERR(new_ah)) + return; + + spin_lock_irqsave(&dev->sm_lock, flags); + if (dev->sm_ah[port_num - 1]) + rdma_destroy_ah(dev->sm_ah[port_num - 1]); + dev->sm_ah[port_num - 1] = new_ah; + spin_unlock_irqrestore(&dev->sm_lock, flags); +} + +/* + * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can + * synthesize LID change, Client-Rereg, GID change, and P_Key change events. 
+ */ +static void smp_snoop(struct ib_device *ibdev, u8 port_num, const struct ib_mad *mad, + u16 prev_lid) +{ + struct ib_port_info *pinfo; + u16 lid; + __be16 *base; + u32 bn, pkey_change_bitmap; + int i; + + + struct mlx4_ib_dev *dev = to_mdev(ibdev); + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_SET) + switch (mad->mad_hdr.attr_id) { + case IB_SMP_ATTR_PORT_INFO: + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) + return; + pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data; + lid = be16_to_cpu(pinfo->lid); + + update_sm_ah(dev, port_num, + be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); + + if (pinfo->clientrereg_resv_subnetto & 0x80) + handle_client_rereg_event(dev, port_num); + + if (prev_lid != lid) + handle_lid_change_event(dev, port_num); + break; + + case IB_SMP_ATTR_PKEY_TABLE: + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) + return; + if (!mlx4_is_mfunc(dev->dev)) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + break; + } + + /* at this point, we are running in the master. + * Slaves do not receive SMPs. + */ + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF; + base = (__be16 *) &(((struct ib_smp *)mad)->data[0]); + pkey_change_bitmap = 0; + for (i = 0; i < 32; i++) { + pr_debug("PKEY[%d] = x%x\n", + i + bn*32, be16_to_cpu(base[i])); + if (be16_to_cpu(base[i]) != + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) { + pkey_change_bitmap |= (1 << i); + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] = + be16_to_cpu(base[i]); + } + } + pr_debug("PKEY Change event: port=%d, " + "block=0x%x, change_bitmap=0x%x\n", + port_num, bn, pkey_change_bitmap); + + if (pkey_change_bitmap) { + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_PKEY_CHANGE); + if (!dev->sriov.is_going_down) + __propagate_pkey_ev(dev, port_num, bn, + pkey_change_bitmap); + } + break; + + case IB_SMP_ATTR_GUID_INFO: + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) + return; + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port_num, + IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + if (mlx4_is_master(dev->dev) && + !dev->sriov.is_going_down) { + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod); + mlx4_ib_update_cache_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + } + break; + + case IB_SMP_ATTR_SL_TO_VL_TABLE: + /* cache sl to vl mapping changes for use in + * filling QP1 LRH VL field when sending packets + */ + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV && + dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT) + return; + if (!mlx4_is_slave(dev->dev)) { + union sl2vl_tbl_to_u64 sl2vl64; + int jj; + + for (jj = 0; jj < 8; jj++) { + sl2vl64.sl8[jj] = ((struct ib_smp *)mad)->data[jj]; + pr_debug("port %u, sl2vl[%d] = %02x\n", + port_num, jj, sl2vl64.sl8[jj]); + } + atomic64_set(&dev->sl2vl[port_num - 1], sl2vl64.sl64); + } + break; + + default: + break; + } +} + +static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap) +{ + int i, ix, slave, err; + int have_event = 0; + + for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) { + if (slave == 
mlx4_master_func_num(dev->dev)) + continue; + if (!mlx4_is_slave_active(dev->dev, slave)) + continue; + + have_event = 0; + for (i = 0; i < 32; i++) { + if (!(change_bitmap & (1 << i))) + continue; + for (ix = 0; + ix < dev->dev->caps.pkey_table_len[port_num]; ix++) { + if (dev->pkeys.virt2phys_pkey[slave][port_num - 1] + [ix] == i + 32 * block) { + err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num); + pr_debug("propagate_pkey_ev: slave %d," + " port %d, ix %d (%d)\n", + slave, port_num, ix, err); + have_event = 1; + break; + } + } + if (have_event) + break; + } + } +} + +static void node_desc_override(struct ib_device *dev, + struct ib_mad *mad) +{ + unsigned long flags; + + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && + mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { + spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags); + memcpy(((struct ib_smp *) mad)->data, dev->node_desc, + IB_DEVICE_NODE_DESC_MAX); + spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags); + } +} + +static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, const struct ib_mad *mad) +{ + int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; + int ret; + unsigned long flags; + + if (agent) { + send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC, + IB_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) + return; + /* + * We rely here on the fact that MLX QPs don't use the + * address handle after the send is posted (this is + * wrong following the IB spec strictly, but we know + * it's OK for our devices). 
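+ * The SM address handle cached by update_sm_ah() is therefore only borrowed
+ * under sm_lock for the duration of the post; if no SM AH has been cached
+ * yet, the trap is dropped.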
+ */ + spin_lock_irqsave(&dev->sm_lock, flags); + memcpy(send_buf->mad, mad, sizeof *mad); + if ((send_buf->ah = dev->sm_ah[port_num - 1])) + ret = ib_post_send_mad(send_buf, NULL); + else + ret = -EINVAL; + spin_unlock_irqrestore(&dev->sm_lock, flags); + + if (ret) + ib_free_send_mad(send_buf); + } +} + +static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int i; + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (dev->sriov.demux[port - 1].guid_cache[i] == guid) + return i; + } + return -1; +} + + +static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave, + u8 port, u16 pkey, u16 *ix) +{ + int i, ret; + u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF; + u16 slot_pkey; + + if (slave == mlx4_master_func_num(dev->dev)) + return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix); + + unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1; + + for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { + if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix) + continue; + + pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i]; + + ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey); + if (ret) + continue; + if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) { + if (slot_pkey & 0x8000) { + *ix = (u16) pkey_ix; + return 0; + } else { + /* take first partial pkey index found */ + if (partial_ix == 0xFF) + partial_ix = pkey_ix; + } + } + } + + if (partial_ix < 0xFF) { + *ix = (u16) partial_ix; + return 0; + } + + return -EINVAL; +} + +static int get_gids_from_l3_hdr(struct ib_grh *grh, union ib_gid *sgid, + union ib_gid *dgid) +{ + int version = ib_get_rdma_header_version((const union rdma_network_hdr *)grh); + enum rdma_network_type net_type; + + if (version == 4) + net_type = RDMA_NETWORK_IPV4; + else if (version == 6) + net_type = RDMA_NETWORK_IPV6; + else + return -EINVAL; + + return ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type, + sgid, dgid); +} + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_ud_wr wr; + struct ib_send_wr *bad_wr; + struct mlx4_ib_demux_pv_ctx *tun_ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_rcv_tunnel_mad *tun_mad; + struct rdma_ah_attr attr; + struct ib_ah *ah; + struct ib_qp *src_qp = NULL; + unsigned tun_tx_ix = 0; + int dqpn; + int ret = 0; + u16 tun_pkey_ix; + u16 cached_pkey; + u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; + + if (dest_qpt > IB_QPT_GSI) + return -EINVAL; + + tun_ctx = dev->sriov.demux[port-1].tun[slave]; + + /* check if proxy qp created */ + if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + if (!dest_qpt) + tun_qp = &tun_ctx->qp[0]; + else + tun_qp = &tun_ctx->qp[1]; + + /* compute P_Key index to put in tunnel header for slave */ + if (dest_qpt) { + u16 pkey_ix; + ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey); + if (ret) + return -EINVAL; + + ret = find_slave_port_pkey_ix(dev, slave, port, 
cached_pkey, &pkey_ix); + if (ret) + return -EINVAL; + tun_pkey_ix = pkey_ix; + } else + tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + + dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1; + + /* get tunnel tx data buf for slave */ + src_qp = tun_qp->qp; + + /* create ah. Just need an empty one with the port num for the post send. + * The driver will set the force loopback bit in post_send */ + memset(&attr, 0, sizeof attr); + attr.type = rdma_ah_find_type(&dev->ib_dev, port); + + rdma_ah_set_port_num(&attr, port); + if (is_eth) { + union ib_gid sgid; + union ib_gid dgid; + + if (get_gids_from_l3_hdr(grh, &sgid, &dgid)) + return -EINVAL; + rdma_ah_set_grh(&attr, &dgid, 0, 0, 0, 0); + } + ah = rdma_create_ah(tun_ctx->pd, &attr); + if (IS_ERR(ah)) + return -ENOMEM; + + /* allocate tunnel tx buf after pass failure returns */ + spin_lock(&tun_qp->tx_lock); + if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&tun_qp->tx_lock); + if (ret) + goto end; + + tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); + if (tun_qp->tx_ring[tun_tx_ix].ah) + rdma_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah); + tun_qp->tx_ring[tun_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + /* copy over to tunnel buffer */ + if (grh) + memcpy(&tun_mad->grh, grh, sizeof *grh); + memcpy(&tun_mad->mad, mad, sizeof *mad); + + /* adjust tunnel data */ + tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); + tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); + tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; + + if (is_eth) { + u16 vlan = 0; + if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, + NULL)) { + /* VST mode */ + if (vlan != wc->vlan_id) + /* Packet vlan is not the VST-assigned vlan. + * Drop the packet. + */ + goto out; + else + /* Remove the vlan tag before forwarding + * the packet to the VF. 
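+ * 0xffff acts as the "no VLAN" marker in the tunnel header, so the
+ * VF sees the MAD as untagged.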
+ */ + vlan = 0xffff; + } else { + vlan = wc->vlan_id; + } + + tun_mad->hdr.sl_vid = cpu_to_be16(vlan); + memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); + memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); + } else { + tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); + tun_mad->hdr.slid_mac_47_32 = ib_lid_be16(wc->slid); + } + + ib_dma_sync_single_for_device(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map; + list.length = sizeof (struct mlx4_rcv_tunnel_mad); + list.lkey = tun_ctx->pd->local_dma_lkey; + + wr.ah = ah; + wr.port_num = port; + wr.remote_qkey = IB_QP_SET_QKEY; + wr.remote_qpn = dqpn; + wr.wr.next = NULL; + wr.wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt); + wr.wr.sg_list = &list; + wr.wr.num_sge = 1; + wr.wr.opcode = IB_WR_SEND; + wr.wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(src_qp, &wr.wr, &bad_wr); + if (!ret) + return 0; + out: + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + tun_qp->tx_ring[tun_tx_ix].ah = NULL; +end: + rdma_destroy_ah(ah); + return ret; +} + +static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, + struct ib_wc *wc, struct ib_grh *grh, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err, other_port; + int slave = -1; + u8 *slave_id; + int is_eth = 0; + + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + is_eth = 0; + else + is_eth = 1; + + if (is_eth) { + union ib_gid dgid; + union ib_gid sgid; + + if (get_gids_from_l3_hdr(grh, &sgid, &dgid)) + return -EINVAL; + if (!(wc->wc_flags & IB_WC_GRH)) { + mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); + return -EINVAL; + } + if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { + mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); + return -EINVAL; + } + err = mlx4_get_slave_from_roce_gid(dev->dev, port, dgid.raw, &slave); + if (err && mlx4_is_mf_bonded(dev->dev)) { + other_port = (port == 1) ? 
2 : 1; + err = mlx4_get_slave_from_roce_gid(dev->dev, other_port, dgid.raw, &slave); + if (!err) { + port = other_port; + pr_debug("resolved slave %d from gid %pI6 wire port %d other %d\n", + slave, grh->dgid.raw, port, other_port); + } + } + if (err) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad)) + return 0; + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; + } + + /* Initially assume that this mad is for us */ + slave = mlx4_master_func_num(dev->dev); + + /* See if the slave id is encoded in a response mad */ + if (mad->mad_hdr.method & 0x80) { + slave_id = (u8 *) &mad->mad_hdr.tid; + slave = *slave_id; + if (slave != 255) /*255 indicates the dom0*/ + *slave_id = 0; /* remap tid */ + } + + /* If a grh is present, we demux according to it */ + if (wc->wc_flags & IB_WC_GRH) { + if (grh->dgid.global.interface_id == + cpu_to_be64(IB_SA_WELL_KNOWN_GUID) && + grh->dgid.global.subnet_prefix == cpu_to_be64( + atomic64_read(&dev->sriov.demux[port - 1].subnet_prefix))) { + slave = 0; + } else { + slave = mlx4_ib_find_real_gid(ibdev, port, + grh->dgid.global.interface_id); + if (slave < 0) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + } + } + /* Class-specific handling */ + switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + /* 255 indicates the dom0 */ + if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) { + if (!mlx4_vf_smi_enabled(dev->dev, slave, port)) + return -EPERM; + /* for a VF. drop unsolicited MADs */ + if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) { + mlx4_ib_warn(ibdev, "demux QP0. 
rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n", + slave, mad->mad_hdr.mgmt_class, + mad->mad_hdr.method); + return -EINVAL; + } + } + break; + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_demux_sa_handler(ibdev, port, slave, + (struct ib_sa_mad *) mad)) + return 0; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad)) + return 0; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP) + return 0; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + pr_debug("dropping unsupported ingress mad from class:%d " + "for slave:%d\n", mad->mad_hdr.mgmt_class, slave); + return 0; + } + } + /*make sure that no slave==255 was not handled yet.*/ + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; +} + +static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid, prev_lid = 0; + int err; + struct ib_port_attr pattr; + + if (in_wc && in_wc->qp->qp_num) { + pr_debug("received MAD: slid:%d sqpn:%d " + "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", + in_wc->slid, in_wc->src_qp, + in_wc->dlid_path_bits, + in_wc->qp->qp_num, + in_wc->wc_flags, + in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, + be16_to_cpu(in_mad->mad_hdr.attr_id)); + if (in_wc->wc_flags & IB_WC_GRH) { + pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->sgid.global.subnet_prefix), + be64_to_cpu(in_grh->sgid.global.interface_id)); + pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n", + be64_to_cpu(in_grh->dgid.global.subnet_prefix), + be64_to_cpu(in_grh->dgid.global.interface_id)); + } + } + + slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { + forward_trap(to_mdev(ibdev), port_num, in_mad); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* + * Don't process SMInfo queries -- the SMA can't handle them. 
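+ * Returning SUCCESS without the REPLY flag means no response is
+ * generated from out_mad for these queries.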
+ */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else + return IB_MAD_RESULT_SUCCESS; + + if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && + in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + !ib_query_port(ibdev, port_num, &pattr)) + prev_lid = ib_lid_cpu16(pattr.lid); + + err = mlx4_MAD_IFC(to_mdev(ibdev), + (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | + (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) | + MLX4_MAD_IFC_NET_VIEW, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + if (!out_mad->mad_hdr.status) { + smp_snoop(ibdev, port_num, in_mad, prev_lid); + /* slaves get node desc from FW */ + if (!mlx4_is_slave(to_mdev(ibdev)->dev)) + node_desc_override(ibdev, out_mad); + } + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void edit_counter(struct mlx4_counter *cnt, void *counters, + __be16 attr_id) +{ + switch (attr_id) { + case IB_PMA_PORT_COUNTERS: + { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)counters; + + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data, + (be64_to_cpu(cnt->tx_bytes) >> 2)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data, + (be64_to_cpu(cnt->rx_bytes) >> 2)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets, + be64_to_cpu(cnt->tx_frames)); + ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets, + be64_to_cpu(cnt->rx_frames)); + break; + } + case IB_PMA_PORT_COUNTERS_EXT: + { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)counters; + + pma_cnt_ext->port_xmit_data = + cpu_to_be64(be64_to_cpu(cnt->tx_bytes) >> 2); + pma_cnt_ext->port_rcv_data = + cpu_to_be64(be64_to_cpu(cnt->rx_bytes) >> 2); + pma_cnt_ext->port_xmit_packets = cnt->tx_frames; + pma_cnt_ext->port_rcv_packets = cnt->rx_frames; + break; + } + } +} + +static int iboe_process_mad_port_info(void *out_mad) +{ + struct ib_class_port_info cpi = {}; + + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy(out_mad, &cpi, sizeof(cpi)); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx4_counter counter_stats; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct counter_index *tmp_counter; + int err = IB_MAD_RESULT_FAILURE, stats_avail = 0; + + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return -EINVAL; + + if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) + return iboe_process_mad_port_info((void *)(out_mad->data + 40)); + + 
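+ /* Answer the PMA query from the HCA flow counters bound to this port;
+ * edit_counter() below translates the result into the PMA counters layout.
+ */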
memset(&counter_stats, 0, sizeof(counter_stats)); + mutex_lock(&dev->counters_table[port_num - 1].mutex); + list_for_each_entry(tmp_counter, + &dev->counters_table[port_num - 1].counters_list, + list) { + err = mlx4_get_counter_stats(dev->dev, + tmp_counter->index, + &counter_stats, 0); + if (err) { + err = IB_MAD_RESULT_FAILURE; + stats_avail = 0; + break; + } + stats_avail = 1; + } + mutex_unlock(&dev->counters_table[port_num - 1].mutex); + if (stats_avail) { + memset(out_mad->data, 0, sizeof out_mad->data); + switch (counter_stats.counter_mode & 0xf) { + case 0: + edit_counter(&counter_stats, + (void *)(out_mad->data + 40), + in_mad->mad_hdr.attr_id); + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + break; + default: + err = IB_MAD_RESULT_FAILURE; + } + } + + return err; +} + +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, port_num); + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + /* iboe_process_mad() which uses the HCA flow-counters to implement IB PMA + * queries, should be called only by VFs and for that specific purpose + */ + if (link == IB_LINK_LAYER_INFINIBAND) { + if (mlx4_is_slave(dev->dev) && + (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS || + in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT || + in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO))) + return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + + return ib_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + } + + if (link == IB_LINK_LAYER_ETHERNET) + return iboe_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + + return -EINVAL; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + if (mad_send_wc->send_buf->context[0]) + rdma_destroy_ah(mad_send_wc->send_buf->context[0]); + ib_free_send_mad(mad_send_wc->send_buf); +} + +int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + int ret; + enum rdma_link_layer ll; + + for (p = 0; p < dev->num_ports; ++p) { + ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1); + for (q = 0; q <= 1; ++q) { + if (ll == IB_LINK_LAYER_INFINIBAND) { + agent = ib_register_mad_agent(&dev->ib_dev, p + 1, + q ? 
IB_QPT_GSI : IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + dev->send_agent[p][q] = agent; + } else + dev->send_agent[p][q] = NULL; + } + } + + return 0; + +err: + for (p = 0; p < dev->num_ports; ++p) + for (q = 0; q <= 1; ++q) + if (dev->send_agent[p][q]) + ib_unregister_mad_agent(dev->send_agent[p][q]); + + return ret; +} + +void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + + for (p = 0; p < dev->num_ports; ++p) { + for (q = 0; q <= 1; ++q) { + agent = dev->send_agent[p][q]; + if (agent) { + dev->send_agent[p][q] = NULL; + ib_unregister_mad_agent(agent); + } + } + + if (dev->sm_ah[p]) + rdma_destroy_ah(dev->sm_ah[p]); + } +} + +static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE); + + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK); +} + +static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + /* re-configure the alias-guid and mcg's */ + if (mlx4_is_master(dev->dev)) { + mlx4_ib_invalidate_all_guid_record(dev, port_num); + + if (!dev->sriov.is_going_down) { + mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0); + mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num, + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK); + } + } + + /* Update the sl to vl table from inside client rereg + * only if in secure-host mode (snooping is not possible) + * and the sl-to-vl change event is not generated by FW. + */ + if (!mlx4_is_slave(dev->dev) && + dev->dev->flags & MLX4_FLAG_SECURE_HOST && + !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT)) { + if (mlx4_is_master(dev->dev)) + /* already in work queue from mlx4_ib_event queueing + * mlx4_handle_port_mgmt_change_event, which calls + * this procedure. Therefore, call sl2vl_update directly. 
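+ * In the native (non-multi-function) case the update is deferred to a
+ * work queue instead, since it issues a MAD_IFC query of its own.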
+ */ + mlx4_ib_sl2vl_update(dev, port_num); + else + mlx4_sched_ib_sl2vl_update_work(dev, port_num); + } + mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER); +} + +static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + struct mlx4_eqe *eqe) +{ + __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe), + GET_MASK_FROM_EQE(eqe)); +} + +static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num, + u32 guid_tbl_blk_num, u32 change_bitmap) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + u16 i; + + if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev)) + return; + + in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + guid_tbl_blk_num *= 4; + + for (i = 0; i < 4; i++) { + if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff))) + continue; + memset(in_mad, 0, sizeof *in_mad); + memset(out_mad, 0, sizeof *out_mad); + + in_mad->base_version = 1; + in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + in_mad->class_version = 1; + in_mad->method = IB_MGMT_METHOD_GET; + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i); + + if (mlx4_MAD_IFC(dev, + MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW, + port_num, NULL, NULL, in_mad, out_mad)) { + mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n"); + goto out; + } + + mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i, + port_num, + (u8 *)(&((struct ib_smp *)out_mad)->data)); + } + +out: + kfree(in_mad); + kfree(out_mad); + return; +} + +void handle_port_mgmt_change_event(struct work_struct *work) +{ + struct ib_event_work *ew = container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *dev = ew->ib_dev; + struct mlx4_eqe *eqe = &(ew->ib_eqe); + u8 port = eqe->event.port_mgmt_change.port; + u32 changed_attr; + u32 tbl_block; + u32 change_bitmap; + + switch (eqe->subtype) { + case MLX4_DEV_PMC_SUBTYPE_PORT_INFO: + changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr); + + /* Update the SM ah - This should be done before handling + the other changed attributes so that MADs can be sent to the SM */ + if (changed_attr & MSTR_SM_CHANGE_MASK) { + u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); + u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; + update_sm_ah(dev, port, lid, sl); + } + + /* Check if it is a lid change event */ + if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK) + handle_lid_change_event(dev, port); + + /* Generate GUID changed event */ + if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) { + if (mlx4_is_master(dev->dev)) { + union ib_gid gid; + int err = 0; + + if (!eqe->event.port_mgmt_change.params.port_info.gid_prefix) + err = __mlx4_ib_query_gid(&dev->ib_dev, port, 0, &gid, 1); + else + gid.global.subnet_prefix = + eqe->event.port_mgmt_change.params.port_info.gid_prefix; + if (err) { + pr_warn("Could not change QP1 subnet prefix for port %d: query_gid error (%d)\n", + port, err); + } else { + pr_debug("Changing QP1 subnet prefix for port %d. old=0x%llx. 
new=0x%llx\n", + port, + (u64)atomic64_read(&dev->sriov.demux[port - 1].subnet_prefix), + be64_to_cpu(gid.global.subnet_prefix)); + atomic64_set(&dev->sriov.demux[port - 1].subnet_prefix, + be64_to_cpu(gid.global.subnet_prefix)); + } + } + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify all slaves*/ + if (mlx4_is_master(dev->dev)) + mlx4_gen_slaves_port_mgt_ev(dev->dev, port, + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK); + } + + if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK) + handle_client_rereg_event(dev, port); + break; + + case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE: + mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE); + if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) + propagate_pkey_ev(dev, port, eqe); + break; + case MLX4_DEV_PMC_SUBTYPE_GUID_INFO: + /* paravirtualized master's guid is guid 0 -- does not change */ + if (!mlx4_is_master(dev->dev)) + mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE); + /*if master, notify relevant slaves*/ + else if (!dev->sriov.is_going_down) { + tbl_block = GET_BLK_PTR_FROM_EQE(eqe); + change_bitmap = GET_MASK_FROM_EQE(eqe); + handle_slaves_guid_change(dev, port, tbl_block, change_bitmap); + } + break; + + case MLX4_DEV_PMC_SUBTYPE_SL_TO_VL_MAP: + /* cache sl to vl mapping changes for use in + * filling QP1 LRH VL field when sending packets + */ + if (!mlx4_is_slave(dev->dev)) { + union sl2vl_tbl_to_u64 sl2vl64; + int jj; + + for (jj = 0; jj < 8; jj++) { + sl2vl64.sl8[jj] = + eqe->event.port_mgmt_change.params.sl2vl_tbl_change_info.sl2vl_table[jj]; + pr_debug("port %u, sl2vl[%d] = %02x\n", + port, jj, sl2vl64.sl8[jj]); + } + atomic64_set(&dev->sl2vl[port - 1], sl2vl64.sl64); + } + break; + default: + pr_warn("Unsupported subtype 0x%x for " + "Port Management Change event\n", eqe->subtype); + } + + kfree(ew); +} + +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type) +{ + struct ib_event event; + + event.device = &dev->ib_dev; + event.element.port_num = port_num; + event.event = type; + + ib_dispatch_event(&event); +} + +static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg) +{ + unsigned long flags; + struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context; + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) + queue_work(ctx->wq, &ctx->work); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx, + struct mlx4_ib_demux_pv_qp *tun_qp, + int index) +{ + struct ib_sge sg_list; + struct ib_recv_wr recv_wr, *bad_recv_wr; + int size; + + size = (tun_qp->qp->qp_type == IB_QPT_UD) ? 
+ sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf); + + sg_list.addr = tun_qp->ring[index].map; + sg_list.length = size; + sg_list.lkey = ctx->pd->local_dma_lkey; + + recv_wr.next = NULL; + recv_wr.sg_list = &sg_list; + recv_wr.num_sge = 1; + recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV | + MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt); + ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map, + size, DMA_FROM_DEVICE); + return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr); +} + +static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) +{ + int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave; + + return (qpn >= proxy_start && qpn <= proxy_start + 1); +} + + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, + u32 remote_qpn, u32 qkey, struct rdma_ah_attr *attr, + u8 *s_mac, u16 vlan_id, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_ud_wr wr; + struct ib_send_wr *bad_wr; + struct mlx4_ib_demux_pv_ctx *sqp_ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct mlx4_mad_snd_buf *sqp_mad; + struct ib_ah *ah; + struct ib_qp *send_qp = NULL; + struct ib_global_route *grh; + unsigned wire_tx_ix = 0; + int ret = 0; + u16 wire_pkey_ix; + int src_qpnum; + u8 sgid_index; + + + sqp_ctx = dev->sriov.sqps[port-1]; + + /* check if proxy qp created */ + if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + if (dest_qpt == IB_QPT_SMI) { + src_qpnum = 0; + sqp = &sqp_ctx->qp[0]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + } else { + src_qpnum = 1; + sqp = &sqp_ctx->qp[1]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index]; + } + + send_qp = sqp->qp; + + /* create ah */ + grh = rdma_ah_retrieve_grh(attr); + sgid_index = grh->sgid_index; + grh->sgid_index = 0; + ah = rdma_create_ah(sqp_ctx->pd, attr); + if (IS_ERR(ah)) + return -ENOMEM; + grh->sgid_index = sgid_index; + to_mah(ah)->av.ib.gid_index = sgid_index; + /* get rid of force-loopback bit */ + to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); + spin_lock(&sqp->tx_lock); + if (sqp->tx_ix_head - sqp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&sqp->tx_lock); + if (ret) + goto out; + + sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr); + if (sqp->tx_ring[wire_tx_ix].ah) + rdma_destroy_ah(sqp->tx_ring[wire_tx_ix].ah); + sqp->tx_ring[wire_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + memcpy(&sqp_mad->payload, mad, sizeof *mad); + + ib_dma_sync_single_for_device(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + list.addr = sqp->tx_ring[wire_tx_ix].buf.map; + list.length = sizeof (struct mlx4_mad_snd_buf); + list.lkey = sqp_ctx->pd->local_dma_lkey; + + wr.ah = ah; + wr.port_num = port; + wr.pkey_index = wire_pkey_ix; + wr.remote_qkey = qkey; + wr.remote_qpn = remote_qpn; + wr.wr.next = NULL; + wr.wr.wr_id 
= ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum); + wr.wr.sg_list = &list; + wr.wr.num_sge = 1; + wr.wr.opcode = IB_WR_SEND; + wr.wr.send_flags = IB_SEND_SIGNALED; + if (s_mac) + memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); + if (vlan_id < 0x1000) + vlan_id |= (rdma_ah_get_sl(attr) & 7) << 13; + to_mah(ah)->av.eth.vlan = cpu_to_be16(vlan_id); + + + ret = ib_post_send(send_qp, &wr.wr, &bad_wr); + if (!ret) + return 0; + + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + sqp->tx_ring[wire_tx_ix].ah = NULL; +out: + rdma_destroy_ah(ah); + return ret; +} + +static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + return slave; + return mlx4_get_base_gid_ix(dev->dev, slave, port); +} + +static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, + struct rdma_ah_attr *ah_attr) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr); + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + grh->sgid_index = slave; + else + grh->sgid_index += get_slave_base_gid_ix(dev, slave, port); +} + +static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) +{ + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)]; + int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1); + struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr; + struct mlx4_ib_ah ah; + struct rdma_ah_attr ah_attr; + u8 *slave_id; + int slave; + int port; + u16 vlan_id; + u8 qos; + u8 *dmac; + + /* Get slave that sent this packet */ + if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn || + wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX || + (wc->src_qp & 0x1) != ctx->port - 1 || + wc->src_qp & 0x4) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp); + return; + } + slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8; + if (slave != ctx->slave) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " + "belongs to another slave\n", wc->src_qp); + return; + } + + /* Map transaction ID */ + ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, + sizeof (struct mlx4_tunnel_mad), + DMA_FROM_DEVICE); + switch (tunnel->mad.mad_hdr.method) { + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_REPORT: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_DELETE: + case IB_SA_METHOD_GET_MULTI: + case IB_SA_METHOD_GET_TRACE_TBL: + slave_id = (u8 *) &tunnel->mad.mad_hdr.tid; + if (*slave_id) { + mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d " + "class:%d slave:%d\n", *slave_id, + tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } else + *slave_id = slave; + default: + /* nothing */; + } + + /* Class-specific handling */ + switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + if (slave != mlx4_master_func_num(dev->dev) && + !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port)) + return; + break; + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_sa_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_DEVICE_MGMT: + if (tunnel->mad.mad_hdr.method != 
IB_MGMT_METHOD_GET && + tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET) + return; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != mlx4_master_func_num(dev->dev)) { + mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d " + "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } + } + + /* We are using standard ib_core services to send the mad, so generate a + * stadard address handle by decoding the tunnelled mlx4_ah fields */ + memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); + ah.ibah.device = ctx->ib_dev; + + port = be32_to_cpu(ah.av.ib.port_pd) >> 24; + port = mlx4_slave_convert_port(dev->dev, slave, port); + if (port < 0) + return; + ah.av.ib.port_pd = cpu_to_be32(port << 24 | (be32_to_cpu(ah.av.ib.port_pd) & 0xffffff)); + ah.ibah.type = rdma_ah_find_type(&dev->ib_dev, port); + + mlx4_ib_query_ah(&ah.ibah, &ah_attr); + if (rdma_ah_get_ah_flags(&ah_attr) & IB_AH_GRH) + fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); + dmac = rdma_ah_retrieve_dmac(&ah_attr); + if (dmac) + memcpy(dmac, tunnel->hdr.mac, ETH_ALEN); + vlan_id = be16_to_cpu(tunnel->hdr.vlan); + /* if slave have default vlan use it */ + if (mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, + &vlan_id, &qos)) + rdma_ah_set_sl(&ah_attr, qos); + + mlx4_ib_send_to_wire(dev, slave, ctx->port, + is_proxy_qp0(dev, wc->src_qp, slave) ? + IB_QPT_SMI : IB_QPT_GSI, + be16_to_cpu(tunnel->hdr.pkey_index), + be32_to_cpu(tunnel->hdr.remote_qpn), + be32_to_cpu(tunnel->hdr.qkey), + &ah_attr, wc->smac, vlan_id, &tunnel->mad); +} + +static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS, + GFP_KERNEL); + if (!tun_qp->ring) + return -ENOMEM; + + tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS, + sizeof (struct mlx4_ib_tun_tx_buf), + GFP_KERNEL); + if (!tun_qp->tx_ring) { + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; + } + + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL); + if (!tun_qp->ring[i].addr) + goto err; + tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev, + tun_qp->ring[i].addr, + rx_buf_size, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ctx->ib_dev, tun_qp->ring[i].map)) { + kfree(tun_qp->ring[i].addr); + goto err; + } + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->tx_ring[i].buf.addr = + kmalloc(tx_buf_size, GFP_KERNEL); + if (!tun_qp->tx_ring[i].buf.addr) + goto tx_err; + tun_qp->tx_ring[i].buf.map = + ib_dma_map_single(ctx->ib_dev, + tun_qp->tx_ring[i].buf.addr, + tx_buf_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ctx->ib_dev, + tun_qp->tx_ring[i].buf.map)) { + kfree(tun_qp->tx_ring[i].buf.addr); + goto tx_err; + } + tun_qp->tx_ring[i].ah = NULL; + } + spin_lock_init(&tun_qp->tx_lock); + tun_qp->tx_ix_head = 0; + tun_qp->tx_ix_tail = 0; + tun_qp->proxy_qpt = qp_type; + + return 0; + +tx_err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + 
kfree(tun_qp->tx_ring[i].buf.addr); + } + kfree(tun_qp->tx_ring); + tun_qp->tx_ring = NULL; + i = MLX4_NUM_TUNNEL_BUFS; +err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; +} + +static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return; + + tun_qp = &ctx->qp[qp_type]; + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + if (tun_qp->tx_ring[i].ah) + rdma_destroy_ah(tun_qp->tx_ring[i].ah); + } + kfree(tun_qp->tx_ring); + kfree(tun_qp->ring); +} + +static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct ib_wc wc; + int ret; + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (ib_poll_cq(ctx->cq, 1, &wc) == 1) { + tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_RECV: + mlx4_ib_multiplex_mad(ctx, &wc); + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, + wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)); + if (ret) + pr_err("Failed reposting tunnel " + "buf:%lld\n", wc.wr_id); + break; + case IB_WC_SEND: + pr_debug("received tunnel send completion:" + "wrid=0x%llx, status=0x%x\n", + wc.wr_id, wc.status); + rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + + break; + default: + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + } + } + } +} + +static void pv_qp_event_handler(struct ib_event *event, void *qp_context) +{ + struct mlx4_ib_demux_pv_ctx *sqp = qp_context; + + /* It's worse than that! He's dead, Jim! 
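+ * A fatal async event on a tunnel/proxy MAD QP is only logged here; the
+ * QP itself is torn down later by the demux teardown path.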
*/ + pr_err("Fatal error (%d) on a MAD QP on port %d\n", + event->event, sqp->port); +} + +static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int create_tun) +{ + int i, ret; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_ib_qp_tunnel_init_attr qp_init_attr; + struct ib_qp_attr attr; + int qp_attr_mask_INIT; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + memset(&qp_init_attr, 0, sizeof qp_init_attr); + qp_init_attr.init_attr.send_cq = ctx->cq; + qp_init_attr.init_attr.recv_cq = ctx->cq; + qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_send_sge = 1; + qp_init_attr.init_attr.cap.max_recv_sge = 1; + if (create_tun) { + qp_init_attr.init_attr.qp_type = IB_QPT_UD; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP; + qp_init_attr.port = ctx->port; + qp_init_attr.slave = ctx->slave; + qp_init_attr.proxy_qp_type = qp_type; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_QKEY | IB_QP_PORT; + } else { + qp_init_attr.init_attr.qp_type = qp_type; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY; + } + qp_init_attr.init_attr.port_num = ctx->port; + qp_init_attr.init_attr.qp_context = ctx; + qp_init_attr.init_attr.event_handler = pv_qp_event_handler; + tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); + if (IS_ERR(tun_qp->qp)) { + ret = PTR_ERR(tun_qp->qp); + tun_qp->qp = NULL; + pr_err("Couldn't create %s QP (%d)\n", + create_tun ? "tunnel" : "special", ret); + return ret; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IB_QPS_INIT; + ret = 0; + if (create_tun) + ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave, + ctx->port, IB_DEFAULT_PKEY_FULL, + &attr.pkey_index); + if (ret || !create_tun) + attr.pkey_index = + to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; + attr.qkey = IB_QP1_QKEY; + attr.port_num = ctx->port; + ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT); + if (ret) { + pr_err("Couldn't change %s qp state to INIT (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE); + if (ret) { + pr_err("Couldn't change %s qp state to RTR (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + pr_err("Couldn't change %s qp state to RTS (%d)\n", + create_tun ? 
"tunnel" : "special", ret); + goto err_qp; + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i); + if (ret) { + pr_err(" mlx4_ib_post_pv_buf error" + " (err = %d, i = %d)\n", ret, i); + goto err_qp; + } + } + return 0; + +err_qp: + ib_destroy_qp(tun_qp->qp); + tun_qp->qp = NULL; + return ret; +} + +/* + * IB MAD completion callback for real SQPs + */ +static void mlx4_ib_sqp_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct ib_wc wc; + struct ib_grh *grh; + struct ib_mad *mad; + + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) { + sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_SEND: + rdma_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + break; + case IB_WC_RECV: + mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload); + grh = &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh); + mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad); + if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1))) + pr_err("Failed reposting SQP " + "buf:%lld\n", wc.wr_id); + break; + default: + BUG_ON(1); + break; + } + } else { + pr_debug("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx\n", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + rdma_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + } + } + } +} + +static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx **ret_ctx) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + + *ret_ctx = NULL; + ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ib_dev = &dev->ib_dev; + ctx->port = port; + ctx->slave = slave; + *ret_ctx = ctx; + return 0; +} + +static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (dev->sriov.demux[port - 1].tun[slave]) { + kfree(dev->sriov.demux[port - 1].tun[slave]); + dev->sriov.demux[port - 1].tun[slave] = NULL; + } +} + +static int create_pv_resources(struct ib_device *ibdev, int slave, int port, + int create_tun, struct mlx4_ib_demux_pv_ctx *ctx) +{ + int ret, cq_size; + struct ib_cq_init_attr cq_attr = {}; + + if (ctx->state != DEMUX_PV_STATE_DOWN) + return -EEXIST; + + ctx->state = DEMUX_PV_STATE_STARTING; + /* have QP0 only if link layer is IB */ + if (rdma_port_get_link_layer(ibdev, ctx->port) == + IB_LINK_LAYER_INFINIBAND) + ctx->has_smi = 1; + + if (ctx->has_smi) { + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret); + goto err_out; + } + } + + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret); + goto err_out_qp0; + } + + cq_size = 2 * MLX4_NUM_TUNNEL_BUFS; + if (ctx->has_smi) + cq_size *= 2; + + cq_attr.cqe = cq_size; + ctx->cq = 
ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler, + NULL, ctx, &cq_attr); + if (IS_ERR(ctx->cq)) { + ret = PTR_ERR(ctx->cq); + pr_err("Couldn't create tunnel CQ (%d)\n", ret); + goto err_buf; + } + + ctx->pd = ib_alloc_pd(ctx->ib_dev, 0); + if (IS_ERR(ctx->pd)) { + ret = PTR_ERR(ctx->pd); + pr_err("Couldn't create tunnel PD (%d)\n", ret); + goto err_cq; + } + + if (ctx->has_smi) { + ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP0 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_pd; + } + } + + ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun); + if (ret) { + pr_err("Couldn't create %s QP1 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_qp0; + } + + if (create_tun) + INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker); + else + INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker); + + ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq; + + ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + if (ret) { + pr_err("Couldn't arm tunnel cq (%d)\n", ret); + goto err_wq; + } + ctx->state = DEMUX_PV_STATE_ACTIVE; + return 0; + +err_wq: + ctx->wq = NULL; + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + + +err_qp0: + if (ctx->has_smi) + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + +err_pd: + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + +err_cq: + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + +err_buf: + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun); + +err_out_qp0: + if (ctx->has_smi) + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun); +err_out: + ctx->state = DEMUX_PV_STATE_DOWN; + return ret; +} + +static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx *ctx, int flush) +{ + if (!ctx) + return; + if (ctx->state > DEMUX_PV_STATE_DOWN) { + ctx->state = DEMUX_PV_STATE_DOWNING; + if (flush) + flush_workqueue(ctx->wq); + if (ctx->has_smi) { + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1); + } + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1); + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave, + int port, int do_init) +{ + int ret = 0; + + if (!do_init) { + clean_vf_mcast(&dev->sriov.demux[port - 1], slave); + /* for master, destroy real sqp resources */ + if (slave == mlx4_master_func_num(dev->dev)) + destroy_pv_resources(dev, slave, port, + dev->sriov.sqps[port - 1], 1); + /* destroy the tunnel qp resources */ + destroy_pv_resources(dev, slave, port, + dev->sriov.demux[port - 1].tun[slave], 1); + return 0; + } + + /* create the tunnel qp resources */ + ret = create_pv_resources(&dev->ib_dev, slave, port, 1, + dev->sriov.demux[port - 1].tun[slave]); + + /* for master, create the real sqp resources */ + if (!ret && slave == mlx4_master_func_num(dev->dev)) + ret = create_pv_resources(&dev->ib_dev, slave, port, 0, + dev->sriov.sqps[port - 1]); + return ret; +} + +void mlx4_ib_tunnels_update_work(struct work_struct *work) +{ + struct mlx4_ib_demux_work *dmxw; + + dmxw = container_of(work, struct mlx4_ib_demux_work, work); + mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port, + dmxw->do_init); + kfree(dmxw); + return; +} + +static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, + struct mlx4_ib_demux_ctx *ctx, + int port) +{ + char name[12]; + int ret = 0; + int i; + + ctx->tun = 
kcalloc(dev->dev->caps.sqp_demux, + sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL); + if (!ctx->tun) + return -ENOMEM; + + ctx->dev = dev; + ctx->port = port; + ctx->ib_dev = &dev->ib_dev; + + for (i = 0; + i < min(dev->dev->caps.sqp_demux, + (u16)(dev->dev->persist->num_vfs + 1)); + i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev->dev, i); + + if (!test_bit(port - 1, actv_ports.ports)) + continue; + + ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); + if (ret) { + ret = -ENOMEM; + goto err_mcg; + } + } + + ret = mlx4_ib_mcg_port_init(ctx); + if (ret) { + pr_err("Failed initializing mcg para-virt (%d)\n", ret); + goto err_mcg; + } + + snprintf(name, sizeof name, "mlx4_ibt%d", port); + ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (!ctx->wq) { + pr_err("Failed to create tunnelling WQ for port %d\n", port); + ret = -ENOMEM; + goto err_wq; + } + + snprintf(name, sizeof name, "mlx4_ibud%d", port); + ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (!ctx->ud_wq) { + pr_err("Failed to create up/down WQ for port %d\n", port); + ret = -ENOMEM; + goto err_udwq; + } + + return 0; + +err_udwq: + destroy_workqueue(ctx->wq); + ctx->wq = NULL; + +err_wq: + mlx4_ib_mcg_port_cleanup(ctx, 1); +err_mcg: + for (i = 0; i < dev->dev->caps.sqp_demux; i++) + free_pv_object(dev, i, port); + kfree(ctx->tun); + ctx->tun = NULL; + return ret; +} + +static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx) +{ + if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) { + sqp_ctx->state = DEMUX_PV_STATE_DOWNING; + flush_workqueue(sqp_ctx->wq); + if (sqp_ctx->has_smi) { + ib_destroy_qp(sqp_ctx->qp[0].qp); + sqp_ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0); + } + ib_destroy_qp(sqp_ctx->qp[1].qp); + sqp_ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0); + ib_dealloc_pd(sqp_ctx->pd); + sqp_ctx->pd = NULL; + ib_destroy_cq(sqp_ctx->cq); + sqp_ctx->cq = NULL; + sqp_ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx) +{ + int i; + if (ctx) { + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + mlx4_ib_mcg_port_cleanup(ctx, 1); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (!ctx->tun[i]) + continue; + if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN) + ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING; + } + flush_workqueue(ctx->wq); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0); + free_pv_object(dev, i, ctx->port); + } + kfree(ctx->tun); + destroy_workqueue(ctx->ud_wq); + destroy_workqueue(ctx->wq); + } +} + +static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init) +{ + int i; + + if (!mlx4_is_master(dev->dev)) + return; + /* initialize or tear down tunnel QPs for the master */ + for (i = 0; i < dev->dev->caps.num_ports; i++) + mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init); + return; +} + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev) +{ + int i = 0; + int err; + + if (!mlx4_is_mfunc(dev->dev)) + return 0; + + dev->sriov.is_going_down = 0; + spin_lock_init(&dev->sriov.going_down_lock); + mlx4_ib_cm_paravirt_init(dev); + + mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n"); + + if (mlx4_is_slave(dev->dev)) { + mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n"); + return 0; + } + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (i == mlx4_master_func_num(dev->dev)) + mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid); + 
else + mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid()); + } + + err = mlx4_ib_init_alias_guid_service(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n"); + goto paravirt_err; + } + err = mlx4_ib_device_register_sysfs(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n"); + goto sysfs_err; + } + + mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n", + dev->dev->caps.sqp_demux); + for (i = 0; i < dev->num_ports; i++) { + union ib_gid gid; + err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1); + if (err) + goto demux_err; + dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id; + atomic64_set(&dev->sriov.demux[i].subnet_prefix, + be64_to_cpu(gid.global.subnet_prefix)); + err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1, + &dev->sriov.sqps[i]); + if (err) + goto demux_err; + err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1); + if (err) + goto free_pv; + } + mlx4_ib_master_tunnels(dev, 1); + return 0; + +free_pv: + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); +demux_err: + while (--i >= 0) { + free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1); + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + mlx4_ib_device_unregister_sysfs(dev); + +sysfs_err: + mlx4_ib_destroy_alias_guid_service(dev); + +paravirt_err: + mlx4_ib_cm_paravirt_clean(dev, -1); + + return err; +} + +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev) +{ + int i; + unsigned long flags; + + if (!mlx4_is_mfunc(dev->dev)) + return; + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + dev->sriov.is_going_down = 1; + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + if (mlx4_is_master(dev->dev)) { + for (i = 0; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.demux[i].ud_wq); + mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]); + kfree(dev->sriov.sqps[i]); + dev->sriov.sqps[i] = NULL; + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + + mlx4_ib_cm_paravirt_clean(dev, -1); + mlx4_ib_destroy_alias_guid_service(dev); + mlx4_ib_device_unregister_sysfs(dev); + } +} diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c new file mode 100644 index 0000000..5b70744 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/main.c @@ -0,0 +1,3472 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "mlx4_ib.h" +#include + +#define DRV_NAME MLX4_IB_DRV_NAME +#define DRV_VERSION "4.0-0" + +#define MLX4_IB_FLOW_MAX_PRIO 0xFFF +#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF +#define MLX4_IB_CARD_REV_A0 0xA0 + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +int mlx4_ib_sm_guid_assign = 0; +module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); +MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 0)"); + +static const char mlx4_ib_version[] = + DRV_NAME ": Mellanox ConnectX InfiniBand driver v" + DRV_VERSION "\n"; + +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); +static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, + u8 port_num); + +static struct workqueue_struct *wq; + +static void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static int check_flow_steering_support(struct mlx4_dev *dev) +{ + int eth_num_ports = 0; + int ib_num_ports = 0; + + int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; + + if (dmfs) { + int i; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + eth_num_ports++; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + dmfs &= (!ib_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && + (!eth_num_ports || + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); + if (ib_num_ports && mlx4_is_mfunc(dev)) { + pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n"); + dmfs = 0; + } + } + return dmfs; +} + +static int num_ib_ports(struct mlx4_dev *dev) +{ + int ib_ports = 0; + int i; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_ports++; + + return ib_ports; +} + +static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num) +{ + struct mlx4_ib_dev *ibdev = to_mdev(device); + struct net_device *dev; + + rcu_read_lock(); + dev = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num); + + if (dev) { + if (mlx4_is_bonded(ibdev->dev)) { + struct net_device *upper = NULL; + + upper = netdev_master_upper_dev_get_rcu(dev); + if (upper) { + struct net_device *active; + + active = bond_option_active_slave_get_rcu(netdev_priv(upper)); + if (active) + dev = active; + } + } + } + if (dev) + dev_hold(dev); + + rcu_read_unlock(); + return dev; +} + +static int mlx4_ib_update_gids_v1(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + struct mlx4_dev *dev = ibdev->dev; + int i; + union ib_gid *gid_tbl; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + + gid_tbl = mailbox->buf; + + for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) + memcpy(&gid_tbl[i], &gids[i].gid, sizeof(union ib_gid)); + + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | 
port_num, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (mlx4_is_bonded(dev)) + err += mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_GID_TABLE << 8 | 2, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + struct mlx4_dev *dev = ibdev->dev; + int i; + struct { + union ib_gid gid; + __be32 rsrvd1[2]; + __be16 rsrvd2; + u8 type; + u8 version; + __be32 rsrvd3; + } *gid_tbl; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + + gid_tbl = mailbox->buf; + for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { + memcpy(&gid_tbl[i].gid, &gids[i].gid, sizeof(union ib_gid)); + if (gids[i].gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + gid_tbl[i].version = 2; + if (!ipv6_addr_v4mapped((struct in6_addr *)&gids[i].gid)) + gid_tbl[i].type = 1; + } + } + + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_ROCE_ADDR << 8 | port_num, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (mlx4_is_bonded(dev)) + err += mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_ROCE_ADDR << 8 | 2, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int mlx4_ib_update_gids(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) +{ + if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + return mlx4_ib_update_gids_v1_v2(gids, ibdev, port_num); + + return mlx4_ib_update_gids_v1(gids, ibdev, port_num); +} + +static int mlx4_ib_add_gid(const union ib_gid *gid, + const struct ib_gid_attr *attr, + void **context) +{ + struct mlx4_ib_dev *ibdev = to_mdev(attr->device); + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + struct mlx4_port_gid_table *port_gid_table; + int free = -1, found = -1; + int ret = 0; + int hw_update = 0; + int i; + struct gid_entry *gids = NULL; + + if (!rdma_cap_roce_gid_table(attr->device, attr->port_num)) + return -EINVAL; + + if (attr->port_num > MLX4_MAX_PORTS) + return -EINVAL; + + if (!context) + return -EINVAL; + + port_gid_table = &iboe->gids[attr->port_num - 1]; + spin_lock_bh(&iboe->lock); + for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { + if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid)) && + (port_gid_table->gids[i].gid_type == attr->gid_type)) { + found = i; + break; + } + if (free < 0 && !memcmp(&port_gid_table->gids[i].gid, &zgid, sizeof(*gid))) + free = i; /* HW has space */ + } + + if (found < 0) { + if (free < 0) { + ret = -ENOSPC; + } else { + port_gid_table->gids[free].ctx = kmalloc(sizeof(*port_gid_table->gids[free].ctx), GFP_ATOMIC); + if (!port_gid_table->gids[free].ctx) { + ret = -ENOMEM; + } else { + *context = port_gid_table->gids[free].ctx; + memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid)); + port_gid_table->gids[free].gid_type = attr->gid_type; + port_gid_table->gids[free].ctx->real_index = free; + port_gid_table->gids[free].ctx->refcount = 1; + hw_update = 1; + } + } + } else { + struct gid_cache_context *ctx = port_gid_table->gids[found].ctx; + *context = ctx; + ctx->refcount++; + } + if (!ret && hw_update) { + gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC); + if (!gids) { + ret = -ENOMEM; + } else { + for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) { + memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid)); + gids[i].gid_type = 
port_gid_table->gids[i].gid_type; + } + } + } + spin_unlock_bh(&iboe->lock); + + if (!ret && hw_update) { + ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num); + kfree(gids); + } + + return ret; +} + +static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) +{ + struct gid_cache_context *ctx = *context; + struct mlx4_ib_dev *ibdev = to_mdev(attr->device); + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + struct mlx4_port_gid_table *port_gid_table; + int ret = 0; + int hw_update = 0; + struct gid_entry *gids = NULL; + + if (!rdma_cap_roce_gid_table(attr->device, attr->port_num)) + return -EINVAL; + + if (attr->port_num > MLX4_MAX_PORTS) + return -EINVAL; + + port_gid_table = &iboe->gids[attr->port_num - 1]; + spin_lock_bh(&iboe->lock); + if (ctx) { + ctx->refcount--; + if (!ctx->refcount) { + unsigned int real_index = ctx->real_index; + + memcpy(&port_gid_table->gids[real_index].gid, &zgid, sizeof(zgid)); + kfree(port_gid_table->gids[real_index].ctx); + port_gid_table->gids[real_index].ctx = NULL; + hw_update = 1; + } + } + if (!ret && hw_update) { + int i; + + gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC); + if (!gids) { + ret = -ENOMEM; + } else { + for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) { + memcpy(&gids[i].gid, + &port_gid_table->gids[i].gid, + sizeof(union ib_gid)); + gids[i].gid_type = + port_gid_table->gids[i].gid_type; + } + } + } + spin_unlock_bh(&iboe->lock); + + if (!ret && hw_update) { + ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num); + kfree(gids); + } + return ret; +} + +int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, + u8 port_num, int index) +{ + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + struct gid_cache_context *ctx = NULL; + union ib_gid gid; + struct mlx4_port_gid_table *port_gid_table; + int real_index = -EINVAL; + int i; + int ret; + unsigned long flags; + struct ib_gid_attr attr; + + if (port_num > MLX4_MAX_PORTS) + return -EINVAL; + + if (mlx4_is_bonded(ibdev->dev)) + port_num = 1; + + if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num)) + return index; + + ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, &attr); + if (ret) + return ret; + + if (attr.ndev) + dev_put(attr.ndev); + + spin_lock_irqsave(&iboe->lock, flags); + port_gid_table = &iboe->gids[port_num - 1]; + + for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) + if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid)) && + attr.gid_type == port_gid_table->gids[i].gid_type) { + ctx = port_gid_table->gids[i].ctx; + break; + } + if (ctx) + real_index = ctx->real_index; + spin_unlock_irqrestore(&iboe->lock, flags); + return real_index; +} + +#define field_avail(type, fld, sz) (offsetof(type, fld) + \ + sizeof(((type *)0)->fld) <= (sz)) + +static int mlx4_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err; + int have_ib_ports; + struct mlx4_uverbs_ex_query_device cmd; + struct mlx4_uverbs_ex_query_device_resp resp = {.comp_mask = 0}; + struct mlx4_clock_params clock_params; + + if (uhw->inlen) { + if (uhw->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, uhw, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + } + + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof 
*out_mad, GFP_KERNEL); + err = -ENOMEM; + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memset(props, 0, sizeof *props); + + have_ib_ports = num_ib_ports(dev->dev); + + props->fw_ver = dev->dev->caps.fw_ver; + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) + props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + if (dev->dev->caps.max_gso_sz && + (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) && + (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)) + props->device_cap_flags |= IB_DEVICE_UD_TSO; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && + (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && + (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) + props->device_cap_flags |= IB_DEVICE_XRC; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; + else + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; + } + if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + + props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + + props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = dev->dev->persist->pdev->device; + props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = dev->dev->caps.page_size_cap; + props->max_qp = dev->dev->quotas.qp; + props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; + props->max_sge = min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg); + props->max_sge_rd = MLX4_MAX_SGE_RD; + props->max_cq = dev->dev->quotas.cq; + props->max_cqe = dev->dev->caps.max_cqes; + props->max_mr = dev->dev->quotas.mpt; + props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; + props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; + props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq = dev->dev->quotas.srq; + props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; + props->max_srq_sge = dev->dev->caps.max_srq_sge; + props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; + props->local_ca_ack_delay = 
dev->dev->caps.local_ca_ack_delay; + props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? + IB_ATOMIC_HCA : IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + props->max_pkeys = dev->dev->caps.pkey_table_len[1]; + props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; + props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; + props->hca_core_clock = dev->dev->caps.hca_core_clock * 1000UL; + props->timestamp_mask = 0xFFFFFFFFFFFFULL; + props->max_ah = INT_MAX; + + if (mlx4_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET || + mlx4_ib_port_link_layer(ibdev, 2) == IB_LINK_LAYER_ETHERNET) { + if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { + props->rss_caps.max_rwq_indirection_tables = + props->max_qp; + props->rss_caps.max_rwq_indirection_table_size = + dev->dev->caps.max_rss_tbl_sz; + props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; + props->max_wq_type_rq = props->max_qp; + } + + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) + props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS; + } + + props->cq_caps.max_cq_moderation_count = MLX4_MAX_CQ_COUNT; + props->cq_caps.max_cq_moderation_period = MLX4_MAX_CQ_PERIOD; + + if (!mlx4_is_slave(dev->dev)) + err = mlx4_get_internal_clock_params(dev->dev, &clock_params); + + if (uhw->outlen >= resp.response_length + sizeof(resp.hca_core_clock_offset)) { + resp.response_length += sizeof(resp.hca_core_clock_offset); + if (!err && !mlx4_is_slave(dev->dev)) { + resp.comp_mask |= MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET; + resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE; + } + } + + if (uhw->outlen >= resp.response_length + + sizeof(resp.max_inl_recv_sz)) { + resp.response_length += sizeof(resp.max_inl_recv_sz); + resp.max_inl_recv_sz = dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg); + } + + if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { + if (props->rss_caps.supported_qpts) { + resp.rss_caps.rx_hash_function = + MLX4_IB_RX_HASH_FUNC_TOEPLITZ; + + resp.rss_caps.rx_hash_fields_mask = + MLX4_IB_RX_HASH_SRC_IPV4 | + MLX4_IB_RX_HASH_DST_IPV4 | + MLX4_IB_RX_HASH_SRC_IPV6 | + MLX4_IB_RX_HASH_DST_IPV6 | + MLX4_IB_RX_HASH_SRC_PORT_TCP | + MLX4_IB_RX_HASH_DST_PORT_TCP | + MLX4_IB_RX_HASH_SRC_PORT_UDP | + MLX4_IB_RX_HASH_DST_PORT_UDP; + + if (dev->dev->caps.tunnel_offload_mode == + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + resp.rss_caps.rx_hash_fields_mask |= + MLX4_IB_RX_HASH_INNER; + } + resp.response_length = offsetof(typeof(resp), rss_caps) + + sizeof(resp.rss_caps); + } + + if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { + if (dev->dev->caps.max_gso_sz && + ((mlx4_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET) || + (mlx4_ib_port_link_layer(ibdev, 2) == + IB_LINK_LAYER_ETHERNET))) { + resp.tso_caps.max_tso = dev->dev->caps.max_gso_sz; + resp.tso_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + } + resp.response_length = offsetof(typeof(resp), tso_caps) + + sizeof(resp.tso_caps); + } + + if (uhw->outlen) { + err = ib_copy_to_udata(uhw, &resp, resp.response_length); + if (err) + goto out; + } +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} + +static enum rdma_link_layer +mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx4_dev *dev = to_mdev(device)->dev; + + return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? 
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; +} + +static int ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + + props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); + if (netw_view) + props->gid_tbl_len = out_mad->data[50]; + else + props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; + props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; + props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; + props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = IB_SPEED_FDR; + break; + case 2: + props->active_speed = IB_SPEED_EDR; + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == IB_SPEED_QDR) { + init_query_mad(in_mad); + in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = IB_SPEED_FDR10; + } + + /* Avoid wrong speed value returned by FW if the IB link is down. */ + if (props->state == IB_PORT_DOWN) + props->active_speed = IB_SPEED_SDR; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static u8 state_to_phys_state(enum ib_port_state state) +{ + return state == IB_PORT_ACTIVE ? 
5 : 3; +} + +static int eth_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + struct mlx4_ib_iboe *iboe = &mdev->iboe; + struct net_device *ndev; + enum ib_mtu tmp; + struct mlx4_cmd_mailbox *mailbox; + int err = 0; + int is_bonded = mlx4_is_bonded(mdev->dev); + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) || + (((u8 *)mailbox->buf)[5] == 0x20 /*56Gb*/) ? + IB_WIDTH_4X : IB_WIDTH_1X; + props->active_speed = (((u8 *)mailbox->buf)[5] == 0x20 /*56Gb*/) ? + IB_SPEED_FDR : IB_SPEED_QDR; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; + props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; + props->max_msg_sz = mdev->dev->caps.max_msg_sz; + props->pkey_tbl_len = 1; + props->max_mtu = IB_MTU_4096; + props->max_vl_num = 2; + props->state = IB_PORT_DOWN; + props->phys_state = state_to_phys_state(props->state); + props->active_mtu = IB_MTU_256; + spin_lock_bh(&iboe->lock); + ndev = iboe->netdevs[port - 1]; + if (ndev && is_bonded) { + rcu_read_lock(); /* required to get upper dev */ + ndev = netdev_master_upper_dev_get_rcu(ndev); + rcu_read_unlock(); + } + if (!ndev) + goto out_unlock; + + tmp = iboe_get_mtu(ndev->mtu); + props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; + + props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + props->phys_state = state_to_phys_state(props->state); +out_unlock: + spin_unlock_bh(&iboe->lock); +out: + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return err; +} + +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view) +{ + int err; + + /* props being zeroed by the caller, avoid zeroing it here */ + + err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? 
+ ib_link_query_port(ibdev, port, props, netw_view) : + eth_link_query_port(ibdev, port, props); + + return err; +} + +static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + /* returns host view */ + return __mlx4_ib_query_port(ibdev, port, props, 0); +} + +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int clear = 0; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + if (mlx4_is_mfunc(dev->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + if (mlx4_is_mfunc(dev->dev) && !netw_view) { + if (index) { + /* For any index > 0, return the null guid */ + err = 0; + clear = 1; + goto out; + } + } + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + if (clear) + memset(gid->raw + 8, 0, 8); + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + if (rdma_protocol_ib(ibdev, port)) + return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); + return 0; +} + +static int mlx4_ib_query_sl2vl(struct ib_device *ibdev, u8 port, u64 *sl2vl_tbl) +{ + union sl2vl_tbl_to_u64 sl2vl64; + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + int jj; + + if (mlx4_is_slave(to_mdev(ibdev)->dev)) { + *sl2vl_tbl = 0; + return 0; + } + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_SL_TO_VL_TABLE; + in_mad->attr_mod = 0; + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev)) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + for (jj = 0; jj < 8; jj++) + sl2vl64.sl8[jj] = ((struct ib_smp *)out_mad)->data[jj]; + *sl2vl_tbl = sl2vl64.sl64; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static void mlx4_init_sl2vl_tbl(struct mlx4_ib_dev *mdev) +{ + u64 sl2vl; + int i; + int err; + + for (i = 1; i <= mdev->dev->caps.num_ports; i++) { + if (mdev->dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) + continue; + err = mlx4_ib_query_sl2vl(&mdev->ib_dev, i, &sl2vl); + if (err) { + pr_err("Unable to get default sl to vl mapping for port %d. 
Using all zeroes (%d)\n", + i, err); + sl2vl = 0; + } + atomic64_set(&mdev->sl2vl[i - 1], sl2vl); + } +} + +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, + in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); +} + +static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx4_cmd_mailbox *mailbox; + unsigned long flags; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + if (mlx4_is_slave(to_mdev(ibdev)->dev)) + return -EOPNOTSUPP; + + spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); + memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX); + spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); + if (IS_ERR(mailbox)) + return 0; + + memcpy(mailbox->buf, props->node_desc, IB_DEVICE_NODE_DESC_MAX); + mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, + MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); + + return 0; +} + +static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, + u32 cap_mask) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; + ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); + } else { + ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; + ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); + } + + err = mlx4_cmd(dev->dev, mailbox->dma, port, MLX4_SET_PORT_IB_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev->dev, mailbox); + return err; +} + +static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; + struct ib_port_attr attr; + u32 cap_mask; + int err; + + /* return OK if this is RoCE. CM calls ib_modify_port() regardless + * of whether port link layer is ETH or IB. For ETH ports, qkey + * violations and port capabilities are not meaningful. 
+ */ + if (is_eth) + return 0; + + mutex_lock(&mdev->cap_mask_mutex); + + err = ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx4_ib_SET_PORT(mdev, port, + !!(mask & IB_PORT_RESET_QKEY_CNTR), + cap_mask); + +out: + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + return err; +} + +static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_ucontext *context; + struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; + struct mlx4_ib_alloc_ucontext_resp resp; + int err; + + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + resp_v3.qp_tab_size = dev->dev->caps.num_qps; + resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; + resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + } else { + resp.dev_caps = dev->dev->caps.userspace_caps; + resp.qp_tab_size = dev->dev->caps.num_qps; + resp.bf_reg_size = dev->dev->caps.bf_reg_size; + resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + resp.cqe_size = dev->dev->caps.cqe_size; + } + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); + if (err) { + kfree(context); + return ERR_PTR(err); + } + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + INIT_LIST_HEAD(&context->wqn_ranges_list); + mutex_init(&context->wqn_ranges_mutex); + + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) + err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); + else + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + + if (err) { + mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); + kfree(context); + return ERR_PTR(-EFAULT); + } + + return &context->ibucontext; +} + +static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); + + mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); + kfree(context); + + return 0; +} + +static void mlx4_ib_vma_open(struct vm_area_struct *area) +{ + /* vma_open is called when a new VMA is created on top of our VMA. + * This is done through either mremap flow or split_vma (usually due + * to mlock, madvise, munmap, etc.). We do not support a clone of the + * vma, as this VMA is strongly hardware related. Therefore we set the + * vm_ops of the newly created/cloned VMA to NULL, to prevent it from + * calling us again and trying to do incorrect actions. We assume that + * the original vma size is exactly a single page that there will be no + * "splitting" operations on. + */ + area->vm_ops = NULL; +} + +static void mlx4_ib_vma_close(struct vm_area_struct *area) +{ + struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data; + + /* It's guaranteed that all VMAs opened on a FD are closed before the + * file itself is closed, therefore no sync is needed with the regular + * closing flow. (e.g. mlx4_ib_dealloc_ucontext) However need a sync + * with accessing the vma as part of mlx4_ib_disassociate_ucontext. + * The close operation is usually called under mm->mmap_sem except when + * process is exiting. The exiting case is handled explicitly as part + * of mlx4_ib_disassociate_ucontext. 
+ */ + mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *) + area->vm_private_data; + + /* set the vma context pointer to null in the mlx4_ib driver's private + * data to protect against a race condition in mlx4_ib_dissassociate_ucontext(). + */ + mlx4_ib_vma_priv_data->vma = NULL; +} + +static const struct vm_operations_struct mlx4_ib_vm_ops = { + .open = mlx4_ib_vma_open, + .close = mlx4_ib_vma_close +}; + +static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + int i; + int ret = 0; + struct vm_area_struct *vma; + struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); + if (!owning_process) + return; + + owning_mm = get_task_mm(owning_process); + if (!owning_mm) { + pr_info("no mm, disassociate ucontext is pending task termination\n"); + while (1) { + /* make sure that task is dead before returning, it may + * prevent a rare case of module down in parallel to a + * call to mlx4_ib_vma_close. + */ + put_task_struct(owning_process); + usleep_range(1000, 2000); + owning_process = get_pid_task(ibcontext->tgid, + PIDTYPE_PID); + if (!owning_process || + owning_process->state == TASK_DEAD) { + pr_info("disassociate ucontext done, task was terminated\n"); + /* in case task was dead need to release the task struct */ + if (owning_process) + put_task_struct(owning_process); + return; + } + } + } + + /* need to protect from a race on closing the vma as part of + * mlx4_ib_vma_close(). + */ + down_write(&owning_mm->mmap_sem); + for (i = 0; i < HW_BAR_COUNT; i++) { + vma = context->hw_bar_info[i].vma; + if (!vma) + continue; + + ret = zap_vma_ptes(context->hw_bar_info[i].vma, + context->hw_bar_info[i].vma->vm_start, + PAGE_SIZE); + if (ret) { + pr_err("Error: zap_vma_ptes failed for index=%d, ret=%d\n", i, ret); + BUG_ON(1); + } + + context->hw_bar_info[i].vma->vm_flags &= + ~(VM_SHARED | VM_MAYSHARE); + /* context going to be destroyed, should not access ops any more */ + context->hw_bar_info[i].vma->vm_ops = NULL; + } + + up_write(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); +} + +static void mlx4_ib_set_vma_data(struct vm_area_struct *vma, + struct mlx4_ib_vma_private_data *vma_private_data) +{ + vma_private_data->vma = vma; + vma->vm_private_data = vma_private_data; + vma->vm_ops = &mlx4_ib_vm_ops; +} + +static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct mlx4_ib_dev *dev = to_mdev(context->device); + struct mlx4_ib_ucontext *mucontext = to_mucontext(context); + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_pgoff == 0) { + /* We prevent double mmaping on same context */ + if (mucontext->hw_bar_info[HW_BAR_DB].vma) + return -EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]); + + } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { + /* We prevent double mmaping on same context */ + if (mucontext->hw_bar_info[HW_BAR_BF].vma) + return -EINVAL; + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn + + dev->dev->caps.num_uars, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + 
mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]); + + } else if (vma->vm_pgoff == 3) { + struct mlx4_clock_params params; + int ret; + + /* We prevent double mmaping on same context */ + if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma) + return -EINVAL; + + ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); + + if (ret) + return ret; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, + (pci_resource_start(dev->dev->persist->pdev, + params.bar) + + params.offset) + >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx4_ib_set_vma_data(vma, + &mucontext->hw_bar_info[HW_BAR_CLOCK]); + } else { + return -EINVAL; + } + + return 0; +} + +static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_pd *pd; + int err; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) + if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { + mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + return &pd->ibpd; +} + +static int mlx4_ib_dealloc_pd(struct ib_pd *pd) +{ + mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); + kfree(pd); + + return 0; +} + +static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_xrcd *xrcd; + struct ib_cq_init_attr cq_attr = {}; + int err; + + if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); + if (err) + goto err1; + + xrcd->pd = ib_alloc_pd(ibdev, 0); + if (IS_ERR(xrcd->pd)) { + err = PTR_ERR(xrcd->pd); + goto err2; + } + + cq_attr.cqe = 1; + xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, &cq_attr); + if (IS_ERR(xrcd->cq)) { + err = PTR_ERR(xrcd->cq); + goto err3; + } + + return &xrcd->ibxrcd; + +err3: + ib_dealloc_pd(xrcd->pd); +err2: + mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); +err1: + kfree(xrcd); + return ERR_PTR(err); +} + +static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + ib_destroy_cq(to_mxrcd(xrcd)->cq); + ib_dealloc_pd(to_mxrcd(xrcd)->pd); + mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); + kfree(xrcd); + + return 0; +} + +static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) +{ + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_ib_gid_entry *ge; + + ge = kzalloc(sizeof *ge, GFP_KERNEL); + if (!ge) + return -ENOMEM; + + ge->gid = *gid; + if (mlx4_ib_add_mc(mdev, mqp, gid)) { + ge->port = mqp->port; + ge->added = 1; + } + + mutex_lock(&mqp->mutex); + list_add_tail(&ge->list, &mqp->gid_list); + mutex_unlock(&mqp->mutex); + + return 0; +} + +static void mlx4_ib_delete_counters_table(struct mlx4_ib_dev *ibdev, + struct mlx4_ib_counters *ctr_table) +{ + struct counter_index *counter, *tmp_count; + + mutex_lock(&ctr_table->mutex); + list_for_each_entry_safe(counter, tmp_count, &ctr_table->counters_list, + list) { + if (counter->allocated) + mlx4_counter_free(ibdev->dev, counter->index); + list_del(&counter->list); + kfree(counter); + } + mutex_unlock(&ctr_table->mutex); +} + +int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + 
union ib_gid *gid) +{ + struct net_device *ndev; + int ret = 0; + + if (!mqp->port) + return 0; + + spin_lock_bh(&mdev->iboe.lock); + ndev = mdev->iboe.netdevs[mqp->port - 1]; + if (ndev) + dev_hold(ndev); + spin_unlock_bh(&mdev->iboe.lock); + + if (ndev) { + ret = 1; + dev_put(ndev); + } + + return ret; +} + +struct mlx4_ib_steering { + struct list_head list; + struct mlx4_flow_reg_id reg_id; + union ib_gid gid; +}; + +#define LAST_ETH_FIELD vlan_tag +#define LAST_IB_FIELD sl +#define LAST_IPV4_FIELD dst_ip +#define LAST_TCP_UDP_FIELD src_port + +/* Field is the last supported field */ +#define FIELDS_NOT_SUPPORTED(filter, field)\ + memchr_inv((void *)&filter.field +\ + sizeof(filter.field), 0,\ + sizeof(filter) -\ + offsetof(typeof(filter), field) -\ + sizeof(filter.field)) + +static int parse_flow_attr(struct mlx4_dev *dev, + u32 qp_num, + union ib_flow_spec *ib_spec, + struct _rule_hw *mlx4_spec) +{ + enum mlx4_net_trans_rule_id type; + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) + return -ENOTSUPP; + + type = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, + ETH_ALEN); + memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, + ETH_ALEN); + mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; + mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; + break; + case IB_FLOW_SPEC_IB: + if (FIELDS_NOT_SUPPORTED(ib_spec->ib.mask, LAST_IB_FIELD)) + return -ENOTSUPP; + + type = MLX4_NET_TRANS_RULE_ID_IB; + mlx4_spec->ib.l3_qpn = + cpu_to_be32(qp_num); + mlx4_spec->ib.qpn_mask = + cpu_to_be32(MLX4_IB_FLOW_QPN_MASK); + break; + + + case IB_FLOW_SPEC_IPV4: + if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) + return -ENOTSUPP; + + type = MLX4_NET_TRANS_RULE_ID_IPV4; + mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; + mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; + mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; + mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; + break; + + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) + return -ENOTSUPP; + + type = ib_spec->type == IB_FLOW_SPEC_TCP ? 
+ MLX4_NET_TRANS_RULE_ID_TCP : + MLX4_NET_TRANS_RULE_ID_UDP; + mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; + mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; + mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; + mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; + break; + + default: + return -EINVAL; + } + if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 || + mlx4_hw_rule_sz(dev, type) < 0) + return -EINVAL; + mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type)); + mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2; + return mlx4_hw_rule_sz(dev, type); +} + +struct default_rules { + __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS]; + __u8 link_layer; +}; +static const struct default_rules default_table[] = { + { + .mandatory_fields = {IB_FLOW_SPEC_IPV4}, + .mandatory_not_fields = {IB_FLOW_SPEC_ETH}, + .rules_create_list = {IB_FLOW_SPEC_IB}, + .link_layer = IB_LINK_LAYER_INFINIBAND + } +}; + +static int __mlx4_ib_default_rules_match(struct ib_qp *qp, + struct ib_flow_attr *flow_attr) +{ + int i, j, k; + void *ib_flow; + const struct default_rules *pdefault_rules = default_table; + u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); + + for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) { + __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; + memset(&field_types, 0, sizeof(field_types)); + + if (link_layer != pdefault_rules->link_layer) + continue; + + ib_flow = flow_attr + 1; + /* we assume the specs are sorted */ + for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS && + j < flow_attr->num_of_specs; k++) { + union ib_flow_spec *current_flow = + (union ib_flow_spec *)ib_flow; + + /* same layer but different type */ + if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) == + (pdefault_rules->mandatory_fields[k] & + IB_FLOW_SPEC_LAYER_MASK)) && + (current_flow->type != + pdefault_rules->mandatory_fields[k])) + goto out; + + /* same layer, try match next one */ + if (current_flow->type == + pdefault_rules->mandatory_fields[k]) { + j++; + ib_flow += + ((union ib_flow_spec *)ib_flow)->size; + } + } + + ib_flow = flow_attr + 1; + for (j = 0; j < flow_attr->num_of_specs; + j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size) + for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++) + /* same layer and same type */ + if (((union ib_flow_spec *)ib_flow)->type == + pdefault_rules->mandatory_not_fields[k]) + goto out; + + return i; + } +out: + return -1; +} + +static int __mlx4_ib_create_default_rules( + struct mlx4_ib_dev *mdev, + struct ib_qp *qp, + const struct default_rules *pdefault_rules, + struct _rule_hw *mlx4_spec) { + int size = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) { + int ret; + union ib_flow_spec ib_spec; + switch (pdefault_rules->rules_create_list[i]) { + case 0: + /* no rule */ + continue; + case IB_FLOW_SPEC_IB: + ib_spec.type = IB_FLOW_SPEC_IB; + ib_spec.size = sizeof(struct ib_flow_spec_ib); + + break; + default: + /* invalid rule */ + return -EINVAL; + } + /* We must put empty rule, qpn is being ignored */ + ret = parse_flow_attr(mdev->dev, 0, &ib_spec, + mlx4_spec); + if (ret < 0) { + pr_info("invalid parsing\n"); + return -EINVAL; + } + + mlx4_spec = (void *)mlx4_spec + ret; + size += ret; + } + return size; +} + +static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + int domain, + enum 
mlx4_net_trans_promisc_mode flow_type, + u64 *reg_id) +{ + int ret, i; + int size = 0; + void *ib_flow; + struct mlx4_ib_dev *mdev = to_mdev(qp->device); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + int default_flow; + + static const u16 __mlx4_domain[] = { + [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, + [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, + [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, + [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, + }; + + if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { + pr_err("Invalid priority value %d\n", flow_attr->priority); + return -EINVAL; + } + + if (domain >= IB_FLOW_DOMAIN_NUM) { + pr_err("Invalid domain value %d\n", domain); + return -EINVAL; + } + + if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + ctrl = mailbox->buf; + + ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | + flow_attr->priority); + ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type); + ctrl->port = flow_attr->port; + ctrl->qpn = cpu_to_be32(qp->qp_num); + + ib_flow = flow_attr + 1; + size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + /* Add default flows */ + default_flow = __mlx4_ib_default_rules_match(qp, flow_attr); + if (default_flow >= 0) { + ret = __mlx4_ib_create_default_rules( + mdev, qp, default_table + default_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + size += ret; + } + for (i = 0; i < flow_attr->num_of_specs; i++) { + ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow, + mailbox->buf + size); + if (ret < 0) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return -EINVAL; + } + ib_flow += ((union ib_flow_spec *) ib_flow)->size; + size += ret; + } + + if (mlx4_is_master(mdev->dev) && flow_type == MLX4_FS_REGULAR && + flow_attr->num_of_specs == 1) { + struct _rule_hw *rule_header = (struct _rule_hw *)(ctrl + 1); + enum ib_flow_spec_type header_spec = + ((union ib_flow_spec *)(flow_attr + 1))->type; + + if (header_spec == IB_FLOW_SPEC_ETH) + mlx4_handle_eth_header_mcast_prio(ctrl, rule_header); + } + + ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (ret == -ENOMEM) + pr_err("mcg table is full. Fail to register network rule.\n"); + else if (ret == -ENXIO) + pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); + else if (ret) + pr_err("Invalid argument. Fail to register network rule.\n"); + + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return ret; +} + +static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) +{ + int err; + err = mlx4_cmd(dev, reg_id, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + pr_err("Fail to detach network rule. 
registration id = 0x%llx\n", + reg_id); + return err; +} + +static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr, + u64 *reg_id) +{ + void *ib_flow; + union ib_flow_spec *ib_spec; + struct mlx4_dev *dev = to_mdev(qp->device)->dev; + int err = 0; + + if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN || + dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) + return 0; /* do nothing */ + + ib_flow = flow_attr + 1; + ib_spec = (union ib_flow_spec *)ib_flow; + + if (ib_spec->type != IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1) + return 0; /* do nothing */ + + err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac, + flow_attr->port, qp->qp_num, + MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff), + reg_id); + return err; +} + +static int mlx4_ib_add_dont_trap_rule(struct mlx4_dev *dev, + struct ib_flow_attr *flow_attr, + enum mlx4_net_trans_promisc_mode *type) +{ + int err = 0; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER) || + (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) || + (flow_attr->num_of_specs > 1) || (flow_attr->priority != 0)) { + return -EOPNOTSUPP; + } + + if (flow_attr->num_of_specs == 0) { + type[0] = MLX4_FS_MC_SNIFFER; + type[1] = MLX4_FS_UC_SNIFFER; + } else { + union ib_flow_spec *ib_spec; + + ib_spec = (union ib_flow_spec *)(flow_attr + 1); + if (ib_spec->type != IB_FLOW_SPEC_ETH) + return -EINVAL; + + /* if all is zero than MC and UC */ + if (is_zero_ether_addr(ib_spec->eth.mask.dst_mac)) { + type[0] = MLX4_FS_MC_SNIFFER; + type[1] = MLX4_FS_UC_SNIFFER; + } else { + u8 mac[ETH_ALEN] = {ib_spec->eth.mask.dst_mac[0] ^ 0x01, + ib_spec->eth.mask.dst_mac[1], + ib_spec->eth.mask.dst_mac[2], + ib_spec->eth.mask.dst_mac[3], + ib_spec->eth.mask.dst_mac[4], + ib_spec->eth.mask.dst_mac[5]}; + + /* Above xor was only on MC bit, non empty mask is valid + * only if this bit is set and rest are zero. 
+ */ + if (!is_zero_ether_addr(&mac[0])) + return -EINVAL; + + if (is_multicast_ether_addr(ib_spec->eth.val.dst_mac)) + type[0] = MLX4_FS_MC_SNIFFER; + else + type[0] = MLX4_FS_UC_SNIFFER; + } + } + + return err; +} + +static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + int err = 0, i = 0, j = 0; + struct mlx4_ib_flow *mflow; + enum mlx4_net_trans_promisc_mode type[2]; + struct mlx4_dev *dev = (to_mdev(qp->device))->dev; + int is_bonded = mlx4_is_bonded(dev); + + if (flow_attr->port < 1 || flow_attr->port > qp->device->phys_port_cnt) + return ERR_PTR(-EINVAL); + + if (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP) + return ERR_PTR(-EOPNOTSUPP); + + if ((flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + (flow_attr->type != IB_FLOW_ATTR_NORMAL)) + return ERR_PTR(-EOPNOTSUPP); + + memset(type, 0, sizeof(type)); + + mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); + if (!mflow) { + err = -ENOMEM; + goto err_free; + } + + switch (flow_attr->type) { + case IB_FLOW_ATTR_NORMAL: + /* If dont trap flag (continue match) is set, under specific + * condition traffic be replicated to given qp, + * without stealing it + */ + if (unlikely(flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)) { + err = mlx4_ib_add_dont_trap_rule(dev, + flow_attr, + type); + if (err) + goto err_free; + } else { + type[0] = MLX4_FS_REGULAR; + } + break; + + case IB_FLOW_ATTR_ALL_DEFAULT: + type[0] = MLX4_FS_ALL_DEFAULT; + break; + + case IB_FLOW_ATTR_MC_DEFAULT: + type[0] = MLX4_FS_MC_DEFAULT; + break; + + case IB_FLOW_ATTR_SNIFFER: + type[0] = MLX4_FS_MIRROR_RX_PORT; + type[1] = MLX4_FS_MIRROR_SX_PORT; + break; + + default: + err = -EINVAL; + goto err_free; + } + + while (i < ARRAY_SIZE(type) && type[i]) { + err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], + &mflow->reg_id[i].id); + if (err) + goto err_create_flow; + if (is_bonded) { + /* Application always sees one port so the mirror rule + * must be on port #2 + */ + flow_attr->port = 2; + err = __mlx4_ib_create_flow(qp, flow_attr, + domain, type[j], + &mflow->reg_id[j].mirror); + flow_attr->port = 1; + if (err) + goto err_create_flow; + j++; + } + + i++; + } + + if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) { + err = mlx4_ib_tunnel_steer_add(qp, flow_attr, + &mflow->reg_id[i].id); + if (err) + goto err_create_flow; + + if (is_bonded) { + flow_attr->port = 2; + err = mlx4_ib_tunnel_steer_add(qp, flow_attr, + &mflow->reg_id[j].mirror); + flow_attr->port = 1; + if (err) + goto err_create_flow; + j++; + } + /* function to create mirror rule */ + i++; + } + + return &mflow->ibflow; + +err_create_flow: + while (i) { + (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, + mflow->reg_id[i].id); + i--; + } + + while (j) { + (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, + mflow->reg_id[j].mirror); + j--; + } +err_free: + kfree(mflow); + return ERR_PTR(err); +} + +static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) +{ + int err, ret = 0; + int i = 0; + struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); + struct mlx4_ib_flow *mflow = to_mflow(flow_id); + + while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i].id) { + err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i].id); + if (err) + ret = err; + if (mflow->reg_id[i].mirror) { + err = __mlx4_ib_destroy_flow(mdev->dev, + mflow->reg_id[i].mirror); + if (err) + ret = err; + } + i++; + } + + kfree(mflow); + return ret; +} + +static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + int 
err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_dev *dev = mdev->dev; + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct mlx4_ib_steering *ib_steering = NULL; + enum mlx4_protocol prot = MLX4_PROT_IB_IPV6; + struct mlx4_flow_reg_id reg_id; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); + if (!ib_steering) + return -ENOMEM; + } + + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + prot, ®_id.id); + if (err) { + pr_err("multicast attach op failed, err %d\n", err); + goto err_malloc; + } + + reg_id.mirror = 0; + if (mlx4_is_bonded(dev)) { + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, + (mqp->port == 1) ? 2 : 1, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + prot, ®_id.mirror); + if (err) + goto err_add; + } + + err = add_gid_entry(ibqp, gid); + if (err) + goto err_add; + + if (ib_steering) { + memcpy(ib_steering->gid.raw, gid->raw, 16); + ib_steering->reg_id = reg_id; + mutex_lock(&mqp->mutex); + list_add(&ib_steering->list, &mqp->steering_rules); + mutex_unlock(&mqp->mutex); + } + return 0; + +err_add: + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.id); + if (reg_id.mirror) + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.mirror); +err_malloc: + kfree(ib_steering); + + return err; +} + +static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) +{ + struct mlx4_ib_gid_entry *ge; + struct mlx4_ib_gid_entry *tmp; + struct mlx4_ib_gid_entry *ret = NULL; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + if (!memcmp(raw, ge->gid.raw, 16)) { + ret = ge; + break; + } + } + + return ret; +} + +static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + int err; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_dev *dev = mdev->dev; + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + struct net_device *ndev; + struct mlx4_ib_gid_entry *ge; + struct mlx4_flow_reg_id reg_id = {0, 0}; + enum mlx4_protocol prot = MLX4_PROT_IB_IPV6; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + struct mlx4_ib_steering *ib_steering; + + mutex_lock(&mqp->mutex); + list_for_each_entry(ib_steering, &mqp->steering_rules, list) { + if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) { + list_del(&ib_steering->list); + break; + } + } + mutex_unlock(&mqp->mutex); + if (&ib_steering->list == &mqp->steering_rules) { + pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n"); + return -EINVAL; + } + reg_id = ib_steering->reg_id; + kfree(ib_steering); + } + + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.id); + if (err) + return err; + + if (mlx4_is_bonded(dev)) { + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.mirror); + if (err) + return err; + } + + mutex_lock(&mqp->mutex); + ge = find_gid_entry(mqp, gid->raw); + if (ge) { + spin_lock_bh(&mdev->iboe.lock); + ndev = ge->added ? 
mdev->iboe.netdevs[ge->port - 1] : NULL; + if (ndev) + dev_hold(ndev); + spin_unlock_bh(&mdev->iboe.lock); + if (ndev) + dev_put(ndev); + list_del(&ge->list); + kfree(ge); + } else + pr_warn("could not find mgid entry\n"); + + mutex_unlock(&mqp->mutex); + + return 0; +} + +static int init_node_data(struct mlx4_ib_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + if (mlx4_is_master(dev->dev)) + mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, IB_DEVICE_NODE_DESC_MAX); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->dev->rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, + dev->dev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *mlx4_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +struct diag_counter { + const char *name; + u32 offset; +}; + +#define DIAG_COUNTER(_name, _offset) \ + { .name = #_name, .offset = _offset } + +static const struct diag_counter diag_basic[] = { + DIAG_COUNTER(rq_num_lle, 0x00), + DIAG_COUNTER(sq_num_lle, 0x04), + DIAG_COUNTER(rq_num_lqpoe, 0x08), + DIAG_COUNTER(sq_num_lqpoe, 0x0C), + DIAG_COUNTER(rq_num_lpe, 0x18), + DIAG_COUNTER(sq_num_lpe, 0x1C), + DIAG_COUNTER(rq_num_wrfe, 0x20), + DIAG_COUNTER(sq_num_wrfe, 0x24), + DIAG_COUNTER(sq_num_mwbe, 0x2C), + DIAG_COUNTER(sq_num_bre, 0x34), + DIAG_COUNTER(sq_num_rire, 0x44), + DIAG_COUNTER(rq_num_rire, 0x48), + DIAG_COUNTER(sq_num_rae, 0x4C), + DIAG_COUNTER(rq_num_rae, 0x50), + DIAG_COUNTER(sq_num_roe, 0x54), + DIAG_COUNTER(sq_num_tree, 0x5C), + DIAG_COUNTER(sq_num_rree, 0x64), + DIAG_COUNTER(rq_num_rnr, 0x68), + DIAG_COUNTER(sq_num_rnr, 0x6C), + DIAG_COUNTER(rq_num_oos, 0x100), + DIAG_COUNTER(sq_num_oos, 0x104), +}; + +static const struct diag_counter diag_ext[] = { + DIAG_COUNTER(rq_num_dup, 0x130), + DIAG_COUNTER(sq_num_to, 0x134), +}; + +static const struct diag_counter diag_device_only[] = { + DIAG_COUNTER(num_cqovf, 0x1A0), + DIAG_COUNTER(rq_num_udsdprd, 0x118), +}; + +static struct rdma_hw_stats *mlx4_ib_alloc_hw_stats(struct ib_device 
*ibdev, + u8 port_num) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_diag_counters *diag = dev->diag_counters; + + if (!diag[!!port_num].name) + return NULL; + + return rdma_alloc_hw_stats_struct(diag[!!port_num].name, + diag[!!port_num].num_counters, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int mlx4_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_diag_counters *diag = dev->diag_counters; + u32 hw_value[ARRAY_SIZE(diag_device_only) + + ARRAY_SIZE(diag_ext) + ARRAY_SIZE(diag_basic)] = {}; + int ret; + int i; + + ret = mlx4_query_diag_counters(dev->dev, + MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS, + diag[!!port].offset, hw_value, + diag[!!port].num_counters, port); + + if (ret) + return ret; + + for (i = 0; i < diag[!!port].num_counters; i++) + stats->value[i] = hw_value[i]; + + return diag[!!port].num_counters; +} + +static int __mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev, + const char ***name, + u32 **offset, + u32 *num, + bool port) +{ + u32 num_counters; + + num_counters = ARRAY_SIZE(diag_basic); + + if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) + num_counters += ARRAY_SIZE(diag_ext); + + if (!port) + num_counters += ARRAY_SIZE(diag_device_only); + + *name = kcalloc(num_counters, sizeof(**name), GFP_KERNEL); + if (!*name) + return -ENOMEM; + + *offset = kcalloc(num_counters, sizeof(**offset), GFP_KERNEL); + if (!*offset) + goto err_name; + + *num = num_counters; + + return 0; + +err_name: + kfree(*name); + return -ENOMEM; +} + +static void mlx4_ib_fill_diag_counters(struct mlx4_ib_dev *ibdev, + const char **name, + u32 *offset, + bool port) +{ + int i; + int j; + + for (i = 0, j = 0; i < ARRAY_SIZE(diag_basic); i++, j++) { + name[i] = diag_basic[i].name; + offset[i] = diag_basic[i].offset; + } + + if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) { + for (i = 0; i < ARRAY_SIZE(diag_ext); i++, j++) { + name[j] = diag_ext[i].name; + offset[j] = diag_ext[i].offset; + } + } + + if (!port) { + for (i = 0; i < ARRAY_SIZE(diag_device_only); i++, j++) { + name[j] = diag_device_only[i].name; + offset[j] = diag_device_only[i].offset; + } + } +} + +static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev) +{ + struct mlx4_ib_diag_counters *diag = ibdev->diag_counters; + int i; + int ret; + bool per_port = !!(ibdev->dev->caps.flags2 & + MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT); + + if (mlx4_is_slave(ibdev->dev)) + return 0; + + for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) { + /* i == 1 means we are building port counters */ + if (i && !per_port) + continue; + + ret = __mlx4_ib_alloc_diag_counters(ibdev, &diag[i].name, + &diag[i].offset, + &diag[i].num_counters, i); + if (ret) + goto err_alloc; + + mlx4_ib_fill_diag_counters(ibdev, diag[i].name, + diag[i].offset, i); + } + + ibdev->ib_dev.get_hw_stats = mlx4_ib_get_hw_stats; + ibdev->ib_dev.alloc_hw_stats = mlx4_ib_alloc_hw_stats; + + return 0; + +err_alloc: + if (i) { + kfree(diag[i - 1].name); + kfree(diag[i - 1].offset); + } + + return ret; +} + +static void mlx4_ib_diag_cleanup(struct mlx4_ib_dev *ibdev) +{ + int i; + + for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) { + kfree(ibdev->diag_counters[i].offset); + kfree(ibdev->diag_counters[i].name); + } +} + +#define MLX4_IB_INVALID_MAC ((u64)-1) +static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + int port) +{ + u64 new_smac = 0; + u64 release_mac = MLX4_IB_INVALID_MAC; + struct mlx4_ib_qp 
*qp; + + read_lock(&dev_base_lock); + new_smac = mlx4_mac_to_u64(dev->dev_addr); + read_unlock(&dev_base_lock); + + atomic64_set(&ibdev->iboe.mac[port - 1], new_smac); + + /* no need for update QP1 and mac registration in non-SRIOV */ + if (!mlx4_is_mfunc(ibdev->dev)) + return; + + mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); + qp = ibdev->qp1_proxy[port - 1]; + if (qp) { + int new_smac_index; + u64 old_smac; + struct mlx4_update_qp_params update_params; + + mutex_lock(&qp->mutex); + old_smac = qp->pri.smac; + if (new_smac == old_smac) + goto unlock; + + new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac); + + if (new_smac_index < 0) + goto unlock; + + update_params.smac_index = new_smac_index; + if (mlx4_update_qp(ibdev->dev, qp->mqp.qpn, MLX4_UPDATE_QP_SMAC, + &update_params)) { + release_mac = new_smac; + goto unlock; + } + /* if old port was zero, no mac was yet registered for this QP */ + if (qp->pri.smac_port) + release_mac = old_smac; + qp->pri.smac = new_smac; + qp->pri.smac_port = port; + qp->pri.smac_index = new_smac_index; + } + +unlock: + if (release_mac != MLX4_IB_INVALID_MAC) + mlx4_unregister_mac(ibdev->dev, port, release_mac); + if (qp) + mutex_unlock(&qp->mutex); + mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); +} + +static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, + struct net_device *dev, + unsigned long event) + +{ + struct mlx4_ib_iboe *iboe; + int update_qps_port = -1; + int port; + + ASSERT_RTNL(); + + iboe = &ibdev->iboe; + + spin_lock_bh(&iboe->lock); + mlx4_foreach_ib_transport_port(port, ibdev->dev) { + + iboe->netdevs[port - 1] = + mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); + + if (dev == iboe->netdevs[port - 1] && + (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || + event == NETDEV_UP || event == NETDEV_CHANGE)) + update_qps_port = port; + + } + spin_unlock_bh(&iboe->lock); + + if (update_qps_port > 0) + mlx4_ib_update_qps(ibdev, dev, update_qps_port); +} + +static int mlx4_ib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mlx4_ib_dev *ibdev; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); + mlx4_ib_scan_netdevs(ibdev, dev, event); + + return NOTIFY_DONE; +} + +static void init_pkeys(struct mlx4_ib_dev *ibdev) +{ + int port; + int slave; + int i; + + if (mlx4_is_master(ibdev->dev)) { + for (slave = 0; slave <= ibdev->dev->persist->num_vfs; + ++slave) { + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) { + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = + /* master has the identity virt2phys pkey mapping */ + (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : + ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; + mlx4_sync_pkey_table(ibdev->dev, slave, port, i, + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); + } + } + } + /* initialize pkey cache */ + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; + i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; + ++i) + ibdev->pkeys.phys_pkey_cache[port-1][i] = + (i) ? 
0 : 0xFFFF; + } + } +} + +static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) +{ + int i, j, eq = 0, total_eqs = 0; + + ibdev->eq_table = kcalloc(dev->caps.num_comp_vectors, + sizeof(ibdev->eq_table[0]), GFP_KERNEL); + if (!ibdev->eq_table) + return; + + for (i = 1; i <= dev->caps.num_ports; i++) { + for (j = 0; j < mlx4_get_eqs_per_port(dev, i); + j++, total_eqs++) { + if (i > 1 && mlx4_is_eq_shared(dev, total_eqs)) + continue; + ibdev->eq_table[eq] = total_eqs; + if (!mlx4_assign_eq(dev, i, + &ibdev->eq_table[eq])) + eq++; + else + ibdev->eq_table[eq] = -1; + } + } + + for (i = eq; i < dev->caps.num_comp_vectors; + ibdev->eq_table[i++] = -1) + ; + + /* Advertise the new number of EQs to clients */ + ibdev->ib_dev.num_comp_vectors = eq; +} + +static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) +{ + int i; + int total_eqs = ibdev->ib_dev.num_comp_vectors; + + /* no eqs were allocated */ + if (!ibdev->eq_table) + return; + + /* Reset the advertised EQ number */ + ibdev->ib_dev.num_comp_vectors = 0; + + for (i = 0; i < total_eqs; i++) + mlx4_release_eq(dev, ibdev->eq_table[i]); + + kfree(ibdev->eq_table); + ibdev->eq_table = NULL; +} + +static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + int err; + + if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) { + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } else { + if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE | + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->core_cap_flags |= RDMA_CORE_PORT_RAW_PACKET; + if (immutable->core_cap_flags & (RDMA_CORE_PORT_IBA_ROCE | + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP)) + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static void get_fw_ver_str(struct ib_device *device, char *str) +{ + struct mlx4_ib_dev *dev = + container_of(device, struct mlx4_ib_dev, ib_dev); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", + (int) (dev->dev->caps.fw_ver >> 32), + (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, + (int) dev->dev->caps.fw_ver & 0xffff); +} + +static void *mlx4_ib_add(struct mlx4_dev *dev) +{ + struct mlx4_ib_dev *ibdev; + int num_ports = 0; + int i, j; + int err; + struct mlx4_ib_iboe *iboe; + int ib_num_ports = 0; + int num_req_counters; + int allocated; + u32 counter_index; + struct counter_index *new_counter_index = NULL; + + pr_info_once("%s", mlx4_ib_version); + + num_ports = 0; + mlx4_foreach_ib_transport_port(i, dev) + num_ports++; + + /* No point in registering a device with no ports... 
*/ + if (num_ports == 0) + return NULL; + + ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); + if (!ibdev) { + dev_err(&dev->persist->pdev->dev, + "Device struct alloc failed\n"); + return NULL; + } + + iboe = &ibdev->iboe; + + if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) + goto err_dealloc; + + if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) + goto err_pd; + + ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT, + PAGE_SIZE); + if (!ibdev->uar_map) + goto err_uar; + MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); + + ibdev->dev = dev; + ibdev->bond_next_port = 0; + + strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); + ibdev->ib_dev.owner = THIS_MODULE; + ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; + ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; + ibdev->num_ports = num_ports; + ibdev->ib_dev.phys_port_cnt = mlx4_is_bonded(dev) ? + 1 : ibdev->num_ports; + ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; + ibdev->ib_dev.dev.parent = &dev->persist->pdev->dev; + ibdev->ib_dev.get_netdev = mlx4_ib_get_netdev; + ibdev->ib_dev.add_gid = mlx4_ib_add_gid; + ibdev->ib_dev.del_gid = mlx4_ib_del_gid; + + if (dev->caps.userspace_caps) + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; + else + ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; + + ibdev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + ibdev->ib_dev.query_device = mlx4_ib_query_device; + ibdev->ib_dev.query_port = mlx4_ib_query_port; + ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; + ibdev->ib_dev.query_gid = mlx4_ib_query_gid; + ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; + ibdev->ib_dev.modify_device = mlx4_ib_modify_device; + ibdev->ib_dev.modify_port = mlx4_ib_modify_port; + ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; + ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; + ibdev->ib_dev.mmap = mlx4_ib_mmap; + ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; + ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; + ibdev->ib_dev.create_ah = mlx4_ib_create_ah; + ibdev->ib_dev.query_ah = mlx4_ib_query_ah; + ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; + ibdev->ib_dev.create_srq = mlx4_ib_create_srq; + ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; + ibdev->ib_dev.query_srq = mlx4_ib_query_srq; + ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; + ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; + ibdev->ib_dev.create_qp = mlx4_ib_create_qp; + ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; + ibdev->ib_dev.query_qp = mlx4_ib_query_qp; + 
ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; + ibdev->ib_dev.post_send = mlx4_ib_post_send; + ibdev->ib_dev.post_recv = mlx4_ib_post_recv; + ibdev->ib_dev.create_cq = mlx4_ib_create_cq; + ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; + ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; + ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; + ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; + ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; + ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; + ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; + ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr; + ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; + ibdev->ib_dev.alloc_mr = mlx4_ib_alloc_mr; + ibdev->ib_dev.map_mr_sg = mlx4_ib_map_mr_sg; + ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; + ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; + ibdev->ib_dev.process_mad = mlx4_ib_process_mad; + ibdev->ib_dev.get_port_immutable = mlx4_port_immutable; + ibdev->ib_dev.get_dev_fw_str = get_fw_ver_str; + ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext; + + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ); + + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) && + ((mlx4_ib_port_link_layer(&ibdev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) || + (mlx4_ib_port_link_layer(&ibdev->ib_dev, 2) == + IB_LINK_LAYER_ETHERNET))) { + ibdev->ib_dev.create_wq = mlx4_ib_create_wq; + ibdev->ib_dev.modify_wq = mlx4_ib_modify_wq; + ibdev->ib_dev.destroy_wq = mlx4_ib_destroy_wq; + ibdev->ib_dev.create_rwq_ind_table = + mlx4_ib_create_rwq_ind_table; + ibdev->ib_dev.destroy_rwq_ind_table = + mlx4_ib_destroy_rwq_ind_table; + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); + } + + if (!mlx4_is_slave(ibdev->dev)) { + ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; + ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; + ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; + ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || + dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { + ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; + ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; + + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { + ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; + ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + if (check_flow_steering_support(dev)) { + ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED; + ibdev->ib_dev.create_flow = mlx4_ib_create_flow; + ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; + + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + } + + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP); + + mlx4_ib_alloc_eqs(dev, ibdev); + + spin_lock_init(&iboe->lock); + + if (init_node_data(ibdev)) + goto err_map; + mlx4_init_sl2vl_tbl(ibdev); + + for (i = 0; i < ibdev->num_ports; ++i) { + mutex_init(&ibdev->counters_table[i].mutex); + 
INIT_LIST_HEAD(&ibdev->counters_table[i].counters_list); + } + + num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports; + for (i = 0; i < num_req_counters; ++i) { + mutex_init(&ibdev->qp1_proxy_lock[i]); + allocated = 0; + if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == + IB_LINK_LAYER_ETHERNET) { + err = mlx4_counter_alloc(ibdev->dev, &counter_index, + MLX4_RES_USAGE_DRIVER); + /* if failed to allocate a new counter, use default */ + if (err) + counter_index = + mlx4_get_default_counter_index(dev, + i + 1); + else + allocated = 1; + } else { /* IB_LINK_LAYER_INFINIBAND use the default counter */ + counter_index = mlx4_get_default_counter_index(dev, + i + 1); + } + new_counter_index = kmalloc(sizeof(*new_counter_index), + GFP_KERNEL); + if (!new_counter_index) { + if (allocated) + mlx4_counter_free(ibdev->dev, counter_index); + goto err_counter; + } + new_counter_index->index = counter_index; + new_counter_index->allocated = allocated; + list_add_tail(&new_counter_index->list, + &ibdev->counters_table[i].counters_list); + ibdev->counters_table[i].default_counter = counter_index; + pr_info("counter index %d for port %d allocated %d\n", + counter_index, i + 1, allocated); + } + if (mlx4_is_bonded(dev)) + for (i = 1; i < ibdev->num_ports ; ++i) { + new_counter_index = + kmalloc(sizeof(struct counter_index), + GFP_KERNEL); + if (!new_counter_index) + goto err_counter; + new_counter_index->index = counter_index; + new_counter_index->allocated = 0; + list_add_tail(&new_counter_index->list, + &ibdev->counters_table[i].counters_list); + ibdev->counters_table[i].default_counter = + counter_index; + } + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_num_ports++; + + spin_lock_init(&ibdev->sm_lock); + mutex_init(&ibdev->cap_mask_mutex); + INIT_LIST_HEAD(&ibdev->qp_list); + spin_lock_init(&ibdev->reset_flow_resource_lock); + + if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && + ib_num_ports) { + ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; + err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, + MLX4_IB_UC_STEER_QPN_ALIGN, + &ibdev->steer_qpn_base, 0, + MLX4_RES_USAGE_DRIVER); + if (err) + goto err_counter; + + ibdev->ib_uc_qpns_bitmap = + kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * + sizeof(long), + GFP_KERNEL); + if (!ibdev->ib_uc_qpns_bitmap) + goto err_steer_qp_release; + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB) { + bitmap_zero(ibdev->ib_uc_qpns_bitmap, + ibdev->steer_qpn_count); + err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE( + dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_base + + ibdev->steer_qpn_count - 1); + if (err) + goto err_steer_free_bitmap; + } else { + bitmap_fill(ibdev->ib_uc_qpns_bitmap, + ibdev->steer_qpn_count); + } + } + + for (j = 1; j <= ibdev->dev->caps.num_ports; j++) + atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]); + + if (mlx4_ib_alloc_diag_counters(ibdev)) + goto err_steer_free_bitmap; + + ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; + if (ib_register_device(&ibdev->ib_dev, NULL)) + goto err_diag_counters; + + if (mlx4_ib_mad_init(ibdev)) + goto err_reg; + + if (mlx4_ib_init_sriov(ibdev)) + goto err_mad; + + if (!iboe->nb.notifier_call) { + iboe->nb.notifier_call = mlx4_ib_netdev_event; + err = register_netdevice_notifier(&iboe->nb); + if (err) { + iboe->nb.notifier_call = NULL; + goto err_notif; + } + } + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + err = mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT); + if (err) + goto err_notif; + } + + for (j = 0; j < 
ARRAY_SIZE(mlx4_class_attributes); ++j) { + if (device_create_file(&ibdev->ib_dev.dev, + mlx4_class_attributes[j])) + goto err_notif; + } + + ibdev->ib_active = true; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + devlink_port_type_ib_set(mlx4_get_devlink_port(dev, i), + &ibdev->ib_dev); + + if (mlx4_is_mfunc(ibdev->dev)) + init_pkeys(ibdev); + + /* create paravirt contexts for any VFs which are active */ + if (mlx4_is_master(ibdev->dev)) { + for (j = 0; j < MLX4_MFUNC_MAX; j++) { + if (j == mlx4_master_func_num(ibdev->dev)) + continue; + if (mlx4_is_slave_active(ibdev->dev, j)) + do_slave_init(ibdev, j, 1); + } + } + return ibdev; + +err_notif: + if (ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + flush_workqueue(wq); + + mlx4_ib_close_sriov(ibdev); + +err_mad: + mlx4_ib_mad_cleanup(ibdev); + +err_reg: + ib_unregister_device(&ibdev->ib_dev); + +err_diag_counters: + mlx4_ib_diag_cleanup(ibdev); + +err_steer_free_bitmap: + kfree(ibdev->ib_uc_qpns_bitmap); + +err_steer_qp_release: + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); +err_counter: + for (i = 0; i < ibdev->num_ports; ++i) + mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[i]); + +err_map: + mlx4_ib_free_eqs(dev, ibdev); + iounmap(ibdev->uar_map); + +err_uar: + mlx4_uar_free(dev, &ibdev->priv_uar); + +err_pd: + mlx4_pd_free(dev, ibdev->priv_pdn); + +err_dealloc: + ib_dealloc_device(&ibdev->ib_dev); + + return NULL; +} + +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) +{ + int offset; + + WARN_ON(!dev->ib_uc_qpns_bitmap); + + offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, + dev->steer_qpn_count, + get_count_order(count)); + if (offset < 0) + return offset; + + *qpn = dev->steer_qpn_base + offset; + return 0; +} + +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) +{ + if (!qpn || + dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) + return; + + BUG_ON(qpn < dev->steer_qpn_base); + + bitmap_release_region(dev->ib_uc_qpns_bitmap, + qpn - dev->steer_qpn_base, + get_count_order(count)); +} + +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach) +{ + int err; + size_t flow_size; + struct ib_flow_attr *flow = NULL; + struct ib_flow_spec_ib *ib_spec; + + if (is_attach) { + flow_size = sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_ib); + flow = kzalloc(flow_size, GFP_KERNEL); + if (!flow) + return -ENOMEM; + flow->port = mqp->port; + flow->num_of_specs = 1; + flow->size = flow_size; + ib_spec = (struct ib_flow_spec_ib *)(flow + 1); + ib_spec->type = IB_FLOW_SPEC_IB; + ib_spec->size = sizeof(struct ib_flow_spec_ib); + /* Add an empty rule for IB L2 */ + memset(&ib_spec->mask, 0, sizeof(ib_spec->mask)); + + err = __mlx4_ib_create_flow(&mqp->ibqp, flow, + IB_FLOW_DOMAIN_NIC, + MLX4_FS_REGULAR, + &mqp->reg_id); + } else { + err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); + } + kfree(flow); + return err; +} + +static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) +{ + struct mlx4_ib_dev *ibdev = ibdev_ptr; + int p; + int i; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + devlink_port_type_clear(mlx4_get_devlink_port(dev, i)); + ibdev->ib_active = false; + flush_workqueue(wq); + + mlx4_ib_close_sriov(ibdev); + mlx4_ib_mad_cleanup(ibdev); + ib_unregister_device(&ibdev->ib_dev); + mlx4_ib_diag_cleanup(ibdev); + if 
(ibdev->iboe.nb.notifier_call) { + if (unregister_netdevice_notifier(&ibdev->iboe.nb)) + pr_warn("failure unregistering notifier\n"); + ibdev->iboe.nb.notifier_call = NULL; + } + + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); + kfree(ibdev->ib_uc_qpns_bitmap); + + iounmap(ibdev->uar_map); + for (p = 0; p < ibdev->num_ports; ++p) + mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[p]); + + mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) + mlx4_CLOSE_PORT(dev, p); + + mlx4_ib_free_eqs(dev, ibdev); + + mlx4_uar_free(dev, &ibdev->priv_uar); + mlx4_pd_free(dev, ibdev->priv_pdn); + ib_dealloc_device(&ibdev->ib_dev); +} + +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) +{ + struct mlx4_ib_demux_work **dm = NULL; + struct mlx4_dev *dev = ibdev->dev; + int i; + unsigned long flags; + struct mlx4_active_ports actv_ports; + unsigned int ports; + unsigned int first_port; + + if (!mlx4_is_master(dev)) + return; + + actv_ports = mlx4_get_active_ports(dev, slave); + ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports); + first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + + dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC); + if (!dm) + return; + + for (i = 0; i < ports; i++) { + dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); + if (!dm[i]) { + while (--i >= 0) + kfree(dm[i]); + goto out; + } + INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); + dm[i]->port = first_port + i + 1; + dm[i]->slave = slave; + dm[i]->do_init = do_init; + dm[i]->dev = ibdev; + } + /* initialize or tear down tunnel QPs for the slave */ + spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); + if (!ibdev->sriov.is_going_down) { + for (i = 0; i < ports; i++) + queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); + spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + } else { + spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + for (i = 0; i < ports; i++) + kfree(dm[i]); + } +out: + kfree(dm); + return; +} + +static void mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev) +{ + struct mlx4_ib_qp *mqp; + unsigned long flags_qp; + unsigned long flags_cq; + struct mlx4_ib_cq *send_mcq, *recv_mcq; + struct list_head cq_notify_list; + struct mlx4_cq *mcq; + unsigned long flags; + + pr_warn("mlx4_ib_handle_catas_error was started\n"); + INIT_LIST_HEAD(&cq_notify_list); + + /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + /* Now, handle the QP's receive queue */ + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + 
list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + + list_for_each_entry(mcq, &cq_notify_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); + pr_warn("mlx4_ib_handle_catas_error ended\n"); +} + +static void handle_bonded_port_state_event(struct work_struct *work) +{ + struct ib_event_work *ew = + container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *ibdev = ew->ib_dev; + enum ib_port_state bonded_port_state = IB_PORT_NOP; + int i; + struct ib_event ibev; + + kfree(ew); + spin_lock_bh(&ibdev->iboe.lock); + for (i = 0; i < MLX4_MAX_PORTS; ++i) { + struct net_device *curr_netdev = ibdev->iboe.netdevs[i]; + enum ib_port_state curr_port_state; + + if (!curr_netdev) + continue; + + curr_port_state = + (netif_running(curr_netdev) && + netif_carrier_ok(curr_netdev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + + bonded_port_state = (bonded_port_state != IB_PORT_ACTIVE) ? + curr_port_state : IB_PORT_ACTIVE; + } + spin_unlock_bh(&ibdev->iboe.lock); + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = 1; + ibev.event = (bonded_port_state == IB_PORT_ACTIVE) ? + IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + + ib_dispatch_event(&ibev); +} + +void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port) +{ + u64 sl2vl; + int err; + + err = mlx4_ib_query_sl2vl(&mdev->ib_dev, port, &sl2vl); + if (err) { + pr_err("Unable to get current sl to vl mapping for port %d. Using all zeroes (%d)\n", + port, err); + sl2vl = 0; + } + atomic64_set(&mdev->sl2vl[port - 1], sl2vl); +} + +static void ib_sl2vl_update_work(struct work_struct *work) +{ + struct ib_event_work *ew = container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *mdev = ew->ib_dev; + int port = ew->port; + + mlx4_ib_sl2vl_update(mdev, port); + + kfree(ew); +} + +void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, + int port) +{ + struct ib_event_work *ew; + + ew = kmalloc(sizeof(*ew), GFP_ATOMIC); + if (ew) { + INIT_WORK(&ew->work, ib_sl2vl_update_work); + ew->port = port; + ew->ib_dev = ibdev; + queue_work(wq, &ew->work); + } +} + +static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, + enum mlx4_dev_event event, unsigned long param) +{ + struct ib_event ibev; + struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); + struct mlx4_eqe *eqe = NULL; + struct ib_event_work *ew; + int p = 0; + + if (mlx4_is_bonded(dev) && + ((event == MLX4_DEV_EVENT_PORT_UP) || + (event == MLX4_DEV_EVENT_PORT_DOWN))) { + ew = kmalloc(sizeof(*ew), GFP_ATOMIC); + if (!ew) + return; + INIT_WORK(&ew->work, handle_bonded_port_state_event); + ew->ib_dev = ibdev; + queue_work(wq, &ew->work); + return; + } + + if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) + eqe = (struct mlx4_eqe *)param; + else + p = (int) param; + + switch (event) { + case MLX4_DEV_EVENT_PORT_UP: + if (p > ibdev->num_ports) + return; + if (!mlx4_is_slave(dev) && + rdma_port_get_link_layer(&ibdev->ib_dev, p) == + IB_LINK_LAYER_INFINIBAND) { + if (mlx4_is_master(dev)) + mlx4_ib_invalidate_all_guid_record(ibdev, p); + if (ibdev->dev->flags & MLX4_FLAG_SECURE_HOST && + !(ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT)) + mlx4_sched_ib_sl2vl_update_work(ibdev, p); + } + ibev.event = IB_EVENT_PORT_ACTIVE; + break; + + case MLX4_DEV_EVENT_PORT_DOWN: + if (p > ibdev->num_ports) + return; + ibev.event = IB_EVENT_PORT_ERR; + break; + + case 
MLX4_DEV_EVENT_CATASTROPHIC_ERROR: + ibdev->ib_active = false; + ibev.event = IB_EVENT_DEVICE_FATAL; + mlx4_ib_handle_catas_error(ibdev); + break; + + case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: + ew = kmalloc(sizeof *ew, GFP_ATOMIC); + if (!ew) + break; + + INIT_WORK(&ew->work, handle_port_mgmt_change_event); + memcpy(&ew->ib_eqe, eqe, sizeof *eqe); + ew->ib_dev = ibdev; + /* need to queue only for port owner, which uses GEN_EQE */ + if (mlx4_is_master(dev)) + queue_work(wq, &ew->work); + else + handle_port_mgmt_change_event(&ew->work); + return; + + case MLX4_DEV_EVENT_SLAVE_INIT: + /* here, p is the slave id */ + do_slave_init(ibdev, p, 1); + if (mlx4_is_master(dev)) { + int i; + + for (i = 1; i <= ibdev->num_ports; i++) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) + == IB_LINK_LAYER_INFINIBAND) + mlx4_ib_slave_alias_guid_event(ibdev, + p, i, + 1); + } + } + return; + + case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + if (mlx4_is_master(dev)) { + int i; + + for (i = 1; i <= ibdev->num_ports; i++) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) + == IB_LINK_LAYER_INFINIBAND) + mlx4_ib_slave_alias_guid_event(ibdev, + p, i, + 0); + } + } + /* here, p is the slave id */ + do_slave_init(ibdev, p, 0); + return; + + default: + return; + } + + ibev.device = ibdev_ptr; + ibev.element.port_num = mlx4_is_bonded(ibdev->dev) ? 1 : (u8)p; + + ib_dispatch_event(&ibev); +} + +static struct mlx4_interface mlx4_ib_interface = { + .add = mlx4_ib_add, + .remove = mlx4_ib_remove, + .event = mlx4_ib_event, + .protocol = MLX4_PROT_IB_IPV6, + .flags = MLX4_INTFF_BONDING +}; + +static int __init mlx4_ib_init(void) +{ + int err; + + wq = alloc_ordered_workqueue("mlx4_ib", WQ_MEM_RECLAIM); + if (!wq) + return -ENOMEM; + + err = mlx4_ib_mcg_init(); + if (err) + goto clean_wq; + + err = mlx4_register_interface(&mlx4_ib_interface); + if (err) + goto clean_mcg; + + return 0; + +clean_mcg: + mlx4_ib_mcg_destroy(); + +clean_wq: + destroy_workqueue(wq); + return err; +} + +static void __exit mlx4_ib_cleanup(void) +{ + mlx4_unregister_interface(&mlx4_ib_interface); + mlx4_ib_mcg_destroy(); + destroy_workqueue(wq); +} + +module_init(mlx4_ib_init); +module_exit(mlx4_ib_cleanup); diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c new file mode 100644 index 0000000..81ffc00 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -0,0 +1,1257 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "mlx4_ib.h" + +#define MAX_VFS 80 +#define MAX_PEND_REQS_PER_FUNC 4 +#define MAD_TIMEOUT_MS 2000 + +#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg) +#define mcg_error(fmt, arg...) pr_err(fmt, ##arg) +#define mcg_warn_group(group, format, arg...) \ + pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ + (group)->name, group->demux->port, ## arg) + +#define mcg_debug_group(group, format, arg...) \ + pr_debug("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ + (group)->name, (group)->demux->port, ## arg) + +#define mcg_error_group(group, format, arg...) \ + pr_err(" %16s: " format, (group)->name, ## arg) + + +static union ib_gid mgid0; + +static struct workqueue_struct *clean_wq; + +enum mcast_state { + MCAST_NOT_MEMBER = 0, + MCAST_MEMBER, +}; + +enum mcast_group_state { + MCAST_IDLE, + MCAST_JOIN_SENT, + MCAST_LEAVE_SENT, + MCAST_RESP_READY +}; + +struct mcast_member { + enum mcast_state state; + uint8_t join_state; + int num_pend_reqs; + struct list_head pending; +}; + +struct ib_sa_mcmember_data { + union ib_gid mgid; + union ib_gid port_gid; + __be32 qkey; + __be16 mlid; + u8 mtusel_mtu; + u8 tclass; + __be16 pkey; + u8 ratesel_rate; + u8 lifetmsel_lifetm; + __be32 sl_flowlabel_hoplimit; + u8 scope_join_state; + u8 proxy_join; + u8 reserved[2]; +} __packed __aligned(4); + +struct mcast_group { + struct ib_sa_mcmember_data rec; + struct rb_node node; + struct list_head mgid0_list; + struct mlx4_ib_demux_ctx *demux; + struct mcast_member func[MAX_VFS]; + struct mutex lock; + struct work_struct work; + struct list_head pending_list; + int members[3]; + enum mcast_group_state state; + enum mcast_group_state prev_state; + struct ib_sa_mad response_sa_mad; + __be64 last_req_tid; + + char name[33]; /* MGID string */ + struct device_attribute dentry; + + /* refcount is the reference count for the following: + 1. Each queued request + 2. Each invocation of the worker thread + 3. 
Membership of the port at the SA + */ + atomic_t refcount; + + /* delayed work to clean pending SM request */ + struct delayed_work timeout_work; + struct list_head cleanup_list; +}; + +struct mcast_req { + int func; + struct ib_sa_mad sa_mad; + struct list_head group_list; + struct list_head func_list; + struct mcast_group *group; + int clean; +}; + + +#define safe_atomic_dec(ref) \ + do {\ + if (atomic_dec_and_test(ref)) \ + mcg_warn_group(group, "did not expect to reach zero\n"); \ + } while (0) + +static const char *get_state_string(enum mcast_group_state state) +{ + switch (state) { + case MCAST_IDLE: + return "MCAST_IDLE"; + case MCAST_JOIN_SENT: + return "MCAST_JOIN_SENT"; + case MCAST_LEAVE_SENT: + return "MCAST_LEAVE_SENT"; + case MCAST_RESP_READY: + return "MCAST_RESP_READY"; + } + return "Invalid State"; +} + +static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid) +{ + struct rb_node *node = ctx->mcg_table.rb_node; + struct mcast_group *group; + int ret; + + while (node) { + group = rb_entry(node, struct mcast_group, node); + ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); + if (!ret) + return group; + + if (ret < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx, + struct mcast_group *group) +{ + struct rb_node **link = &ctx->mcg_table.rb_node; + struct rb_node *parent = NULL; + struct mcast_group *cur_group; + int ret; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct mcast_group, node); + + ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, + sizeof group->rec.mgid); + if (ret < 0) + link = &(*link)->rb_left; + else if (ret > 0) + link = &(*link)->rb_right; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &ctx->mcg_table); + return NULL; +} + +static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct rdma_ah_attr ah_attr; + unsigned long flags; + + spin_lock_irqsave(&dev->sm_lock, flags); + if (!dev->sm_ah[ctx->port - 1]) { + /* port is not yet Active, sm_ah not ready */ + spin_unlock_irqrestore(&dev->sm_lock, flags); + return -EAGAIN; + } + mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + spin_unlock_irqrestore(&dev->sm_lock, flags); + return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), + ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, + &ah_attr, NULL, 0xffff, mad); +} + +static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1]; + struct ib_wc wc; + struct rdma_ah_attr ah_attr; + + /* Our agent might not yet be registered when mads start to arrive */ + if (!agent) + return -EAGAIN; + + rdma_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + + if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index)) + return -EINVAL; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = ctx->port; + wc.slid = rdma_ah_get_dlid(&ah_attr); /* opensm lid */ + wc.src_qp = 1; + return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad); +} + +static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + /* we rely on a mad request as arrived 
from a VF */ + memcpy(&mad, sa_mad, sizeof mad); + + /* fix port GID to be the real one (slave 0) */ + sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0]; + + /* assign our own TID */ + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_leave_to_wire(struct mcast_group *group, u8 join_state) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_SA_METHOD_DELETE; + mad.mad_hdr.status = cpu_to_be16(0); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = 0x0; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE; + + *sa_data = group->rec; + sa_data->scope_join_state = join_state; + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + if (ret) + group->state = MCAST_IDLE; + + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_reply_to_slave(int slave, struct mcast_group *group, + struct ib_sa_mad *req_sa_mad, u16 status) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + mad.mad_hdr.status = cpu_to_be16(status); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid; + *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */ + + *sa_data = group->rec; + + /* reconstruct VF's requested join_state and port_gid */ + sa_data->scope_join_state &= 0xf0; + sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f); + memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid); + + ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad); + return ret; +} + +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 src_value, u8 dst_value) +{ + int err; + u8 selector = dst_value >> 6; + dst_value &= 0x3f; + src_value &= 0x3f; + + if (!(comp_mask & selector_mask) || !(comp_mask & 
value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +static u16 cmp_rec(struct ib_sa_mcmember_data *src, + struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask) +{ + /* src is group record, dst is request record */ + /* MGID must already match */ + /* Port_GID we always replace to our Port_GID, so it is a match */ + +#define MAD_STATUS_REQ_INVALID 0x0200 + if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, + src->mtusel_mtu, dst->mtusel_mtu)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && + src->tclass != dst->tclass) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, + src->ratesel_rate, dst->ratesel_rate)) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + src->lifetmsel_lifetm, dst->lifetmsel_lifetm)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && + (src->scope_join_state & 0xf0) != + (dst->scope_join_state & 0xf0)) + return MAD_STATUS_REQ_INVALID; + + /* join_state checked separately, proxy_join ignored */ + + return 0; +} + +/* release group, return 1 if this was last release and group is destroyed + * timout work is canceled sync */ +static int release_group(struct mcast_group *group, int from_timeout_handler) +{ + struct mlx4_ib_demux_ctx *ctx = group->demux; + int nzgroup; + + mutex_lock(&ctx->mcg_table_lock); + mutex_lock(&group->lock); + if (atomic_dec_and_test(&group->refcount)) { + if (!from_timeout_handler) { + if (group->state != MCAST_IDLE && + !cancel_delayed_work(&group->timeout_work)) { + atomic_inc(&group->refcount); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return 0; + } + } + + nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0); + if (nzgroup) + del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + if (!list_empty(&group->pending_list)) + mcg_warn_group(group, "releasing a group with non empty pending list\n"); + if (nzgroup) + rb_erase(&group->node, &ctx->mcg_table); + list_del_init(&group->mgid0_list); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return 1; + } else { + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + } + return 0; +} + +static void 
adjust_membership(struct mcast_group *group, u8 join_state, int inc) +{ + int i; + + for (i = 0; i < 3; i++, join_state >>= 1) + if (join_state & 0x1) + group->members[i] += inc; +} + +static u8 get_leave_state(struct mcast_group *group) +{ + u8 leave_state = 0; + int i; + + for (i = 0; i < 3; i++) + if (!group->members[i]) + leave_state |= (1 << i); + + return leave_state & (group->rec.scope_join_state & 0xf); +} + +static int join_group(struct mcast_group *group, int slave, u8 join_mask) +{ + int ret = 0; + u8 join_state; + + /* remove bits that slave is already member of, and adjust */ + join_state = join_mask & (~group->func[slave].join_state); + adjust_membership(group, join_state, 1); + group->func[slave].join_state |= join_state; + if (group->func[slave].state != MCAST_MEMBER && join_state) { + group->func[slave].state = MCAST_MEMBER; + ret = 1; + } + return ret; +} + +static int leave_group(struct mcast_group *group, int slave, u8 leave_state) +{ + int ret = 0; + + adjust_membership(group, leave_state, -1); + group->func[slave].join_state &= ~leave_state; + if (!group->func[slave].join_state) { + group->func[slave].state = MCAST_NOT_MEMBER; + ret = 1; + } + return ret; +} + +static int check_leave(struct mcast_group *group, int slave, u8 leave_mask) +{ + if (group->func[slave].state != MCAST_MEMBER) + return MAD_STATUS_REQ_INVALID; + + /* make sure we're not deleting unset bits */ + if (~group->func[slave].join_state & leave_mask) + return MAD_STATUS_REQ_INVALID; + + if (!leave_mask) + return MAD_STATUS_REQ_INVALID; + + return 0; +} + +static void mlx4_ib_mcg_timeout_handler(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mcast_group *group; + struct mcast_req *req = NULL; + + group = container_of(delay, typeof(*group), timeout_work); + + mutex_lock(&group->lock); + if (group->state == MCAST_JOIN_SENT) { + if (!list_empty(&group->pending_list)) { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + list_del(&req->group_list); + list_del(&req->func_list); + --group->func[req->func].num_pend_reqs; + mutex_unlock(&group->lock); + kfree(req); + if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) { + if (release_group(group, 1)) + return; + } else { + kfree(group); + return; + } + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "DRIVER BUG\n"); + } else if (group->state == MCAST_LEAVE_SENT) { + if (group->rec.scope_join_state & 0xf) + group->rec.scope_join_state &= 0xf0; + group->state = MCAST_IDLE; + mutex_unlock(&group->lock); + if (release_group(group, 1)) + return; + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state)); + group->state = MCAST_IDLE; + atomic_inc(&group->refcount); + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + + mutex_unlock(&group->lock); +} + +static int handle_leave_req(struct mcast_group *group, u8 leave_mask, + struct mcast_req *req) +{ + u16 status; + + if (req->clean) + leave_mask = group->func[req->func].join_state; + + status = check_leave(group, req->func, leave_mask); + if (!status) + leave_group(group, req->func, leave_mask); + + if (!req->clean) + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + return 1; +} + +static int handle_join_req(struct mcast_group *group, u8 join_mask, + struct mcast_req *req) +{ + u8 group_join_state = 
group->rec.scope_join_state & 0xf; + int ref = 0; + u16 status; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + + if (join_mask == (group_join_state & join_mask)) { + /* port's membership need not change */ + status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask); + if (!status) + join_group(group, req->func, join_mask); + + --group->func[req->func].num_pend_reqs; + send_reply_to_slave(req->func, group, &req->sa_mad, status); + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++ref; + } else { + /* port's membership needs to be updated */ + group->prev_state = group->state; + if (send_join_to_wire(group, &req->sa_mad)) { + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ref = 1; + group->state = group->prev_state; + } else + group->state = MCAST_JOIN_SENT; + } + + return ref; +} + +static void mlx4_ib_mcg_work_handler(struct work_struct *work) +{ + struct mcast_group *group; + struct mcast_req *req = NULL; + struct ib_sa_mcmember_data *sa_data; + u8 req_join_state; + int rc = 1; /* release_count - this is for the scheduled work */ + u16 status; + u8 method; + + group = container_of(work, typeof(*group), work); + + mutex_lock(&group->lock); + + /* First, let's see if a response from SM is waiting regarding this group. + * If so, we need to update the group's REC. If this is a bad response, we + * may need to send a bad response to a VF waiting for it. If VF is waiting + * and this is a good response, the VF will be answered later in this func. */ + if (group->state == MCAST_RESP_READY) { + /* cancels mlx4_ib_mcg_timeout_handler */ + cancel_delayed_work(&group->timeout_work); + status = be16_to_cpu(group->response_sa_mad.mad_hdr.status); + method = group->response_sa_mad.mad_hdr.method; + if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) { + mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n", + be64_to_cpu(group->response_sa_mad.mad_hdr.tid), + be64_to_cpu(group->last_req_tid)); + group->state = group->prev_state; + goto process_requests; + } + if (status) { + if (!list_empty(&group->pending_list)) + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + if ((method == IB_MGMT_METHOD_GET_RESP)) { + if (req) { + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++rc; + } else + mcg_warn_group(group, "no request for failed join\n"); + } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing) + ++rc; + } else { + u8 resp_join_state; + u8 cur_join_state; + + resp_join_state = ((struct ib_sa_mcmember_data *) + group->response_sa_mad.data)->scope_join_state & 0xf; + cur_join_state = group->rec.scope_join_state & 0xf; + + if (method == IB_MGMT_METHOD_GET_RESP) { + /* successfull join */ + if (!cur_join_state && resp_join_state) + --rc; + } else if (!resp_join_state) + ++rc; + memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec); + } + group->state = MCAST_IDLE; + } + +process_requests: + /* We should now go over pending join/leave requests, as long as we are idle. 
*/ + while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) { + req = list_first_entry(&group->pending_list, struct mcast_req, + group_list); + sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + req_join_state = sa_data->scope_join_state & 0xf; + + /* For a leave request, we will immediately answer the VF, and + * update our internal counters. The actual leave will be sent + * to SM later, if at all needed. We dequeue the request now. */ + if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE) + rc += handle_leave_req(group, req_join_state, req); + else + rc += handle_join_req(group, req_join_state, req); + } + + /* Handle leaves */ + if (group->state == MCAST_IDLE) { + req_join_state = get_leave_state(group); + if (req_join_state) { + group->rec.scope_join_state &= ~req_join_state; + group->prev_state = group->state; + if (send_leave_to_wire(group, req_join_state)) { + group->state = group->prev_state; + ++rc; + } else + group->state = MCAST_LEAVE_SENT; + } + } + + if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) + goto process_requests; + mutex_unlock(&group->lock); + + while (rc--) + release_group(group, 0); +} + +static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx, + __be64 tid, + union ib_gid *new_mgid) +{ + struct mcast_group *group = NULL, *cur_group, *n; + struct mcast_req *req; + + mutex_lock(&ctx->mcg_table_lock); + list_for_each_entry_safe(group, n, &ctx->mcg_mgid0_list, mgid0_list) { + mutex_lock(&group->lock); + if (group->last_req_tid == tid) { + if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { + group->rec.mgid = *new_mgid; + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + list_del_init(&group->mgid0_list); + cur_group = mcast_insert(ctx, group); + if (cur_group) { + /* A race between our code and SM. 
Silently cleaning the new one */ + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + release_group(group, 0); + return NULL; + } + + atomic_inc(&group->refcount); + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return group; + } else { + struct mcast_req *tmp1, *tmp2; + + list_del(&group->mgid0_list); + if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE) + cancel_delayed_work_sync(&group->timeout_work); + + list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) { + list_del(&tmp1->group_list); + kfree(tmp1); + } + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return NULL; + } + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); + + return NULL; +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf); + +static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid, int create) +{ + struct mcast_group *group, *cur_group; + int is_mgid0; + int i; + + is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); + if (!is_mgid0) { + group = mcast_find(ctx, mgid); + if (group) + goto found; + } + + if (!create) + return ERR_PTR(-ENOENT); + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + group->demux = ctx; + group->rec.mgid = *mgid; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->mgid0_list); + for (i = 0; i < MAX_VFS; ++i) + INIT_LIST_HEAD(&group->func[i].pending); + INIT_WORK(&group->work, mlx4_ib_mcg_work_handler); + INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler); + mutex_init(&group->lock); + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + sysfs_attr_init(&group->dentry.attr); + group->dentry.show = sysfs_show_group; + group->dentry.store = NULL; + group->dentry.attr.name = group->name; + group->dentry.attr.mode = 0400; + group->state = MCAST_IDLE; + + if (is_mgid0) { + list_add(&group->mgid0_list, &ctx->mcg_mgid0_list); + goto found; + } + + cur_group = mcast_insert(ctx, group); + if (cur_group) { + mcg_warn("group just showed up %s - confused\n", cur_group->name); + kfree(group); + return ERR_PTR(-EINVAL); + } + + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + +found: + atomic_inc(&group->refcount); + return group; +} + +static void queue_req(struct mcast_req *req) +{ + struct mcast_group *group = req->group; + + atomic_inc(&group->refcount); /* for the request */ + atomic_inc(&group->refcount); /* for scheduling the work */ + list_add_tail(&req->group_list, &group->pending_list); + list_add_tail(&req->func_list, &group->func[req->func].pending); + /* calls mlx4_ib_mcg_work_handler */ + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); +} + +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + + switch (mad->mad_hdr.method) { + case 
IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_DELETE_RESP: + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, 0); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) { + __be64 tid = mad->mad_hdr.tid; + *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */ + group = search_relocate_mgid0_group(ctx, tid, &rec->mgid); + } else + group = NULL; + } + + if (!group) + return 1; + + mutex_lock(&group->lock); + group->response_sa_mad = *mad; + group->prev_state = group->state; + group->state = MCAST_RESP_READY; + /* calls mlx4_ib_mcg_work_handler */ + atomic_inc(&group->refcount); + if (!queue_work(ctx->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_MGMT_METHOD_SET: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE: + return 0; /* not consumed, pass-through to guest over tunnel */ + default: + mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n", + port, mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + struct mcast_req *req; + int may_create = 0; + + if (ctx->flushing) + return -EAGAIN; + + switch (sa_mad->mad_hdr.method) { + case IB_MGMT_METHOD_SET: + may_create = 1; + /* fall through */ + case IB_SA_METHOD_DELETE: + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->func = slave; + req->sa_mad = *sa_mad; + + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, may_create); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + kfree(req); + return PTR_ERR(group); + } + mutex_lock(&group->lock); + if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) { + mutex_unlock(&group->lock); + mcg_debug_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n", + port, slave, MAX_PEND_REQS_PER_FUNC); + release_group(group, 0); + kfree(req); + return -ENOMEM; + } + ++group->func[slave].num_pend_reqs; + req->group = group; + queue_req(req); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_SA_METHOD_GET_TABLE: + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE_RESP: + return 0; /* not consumed, pass-through */ + default: + mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n", + port, slave, sa_mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mcast_group *group = + container_of(attr, struct mcast_group, dentry); + struct mcast_req *req = NULL; + char pending_str[40]; + char state_str[40]; + ssize_t len = 0; + int f; + + if (group->state == MCAST_IDLE) + sprintf(state_str, "%s", get_state_string(group->state)); + else + sprintf(state_str, "%s(TID=0x%llx)", + get_state_string(group->state), + be64_to_cpu(group->last_req_tid)); + if (list_empty(&group->pending_list)) { + sprintf(pending_str, "No"); + } else { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + 
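/* a request is pending: report the TID of the oldest (first queued) request */ + 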
sprintf(pending_str, "Yes(TID=0x%llx)", + be64_to_cpu(req->sa_mad.mad_hdr.tid)); + } + len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ", + group->rec.scope_join_state & 0xf, + group->members[2], group->members[1], group->members[0], + atomic_read(&group->refcount), + pending_str, + state_str); + for (f = 0; f < MAX_VFS; ++f) + if (group->func[f].state == MCAST_MEMBER) + len += sprintf(buf + len, "%d[%1x] ", + f, group->func[f].join_state); + + len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x " + "%4x %4x %2x %2x)\n", + be16_to_cpu(group->rec.pkey), + be32_to_cpu(group->rec.qkey), + (group->rec.mtusel_mtu & 0xc0) >> 6, + group->rec.mtusel_mtu & 0x3f, + group->rec.tclass, + (group->rec.ratesel_rate & 0xc0) >> 6, + group->rec.ratesel_rate & 0x3f, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8, + be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff, + group->rec.proxy_join); + + return len; +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx) +{ + char name[20]; + + atomic_set(&ctx->tid, 0); + sprintf(name, "mlx4_ib_mcg%d", ctx->port); + ctx->mcg_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (!ctx->mcg_wq) + return -ENOMEM; + + mutex_init(&ctx->mcg_table_lock); + ctx->mcg_table = RB_ROOT; + INIT_LIST_HEAD(&ctx->mcg_mgid0_list); + ctx->flushing = 0; + + return 0; +} + +static void force_clean_group(struct mcast_group *group) +{ + struct mcast_req *req, *tmp + ; + list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) { + list_del(&req->group_list); + kfree(req); + } + del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr); + rb_erase(&group->node, &group->demux->mcg_table); + kfree(group); +} + +static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + int i; + struct rb_node *p; + struct mcast_group *group; + unsigned long end; + int count; + + for (i = 0; i < MAX_VFS; ++i) + clean_vf_mcast(ctx, i); + + end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000); + do { + count = 0; + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) + ++count; + mutex_unlock(&ctx->mcg_table_lock); + if (!count) + break; + + usleep_range(1000, 2000); + } while (time_after(end, jiffies)); + + flush_workqueue(ctx->mcg_wq); + if (destroy_wq) + destroy_workqueue(ctx->mcg_wq); + + mutex_lock(&ctx->mcg_table_lock); + while ((p = rb_first(&ctx->mcg_table)) != NULL) { + group = rb_entry(p, struct mcast_group, node); + if (atomic_read(&group->refcount)) + mcg_debug_group(group, "group refcount %d!!! 
(pointer %p)\n", + atomic_read(&group->refcount), group); + + force_clean_group(group); + } + mutex_unlock(&ctx->mcg_table_lock); +} + +struct clean_work { + struct work_struct work; + struct mlx4_ib_demux_ctx *ctx; + int destroy_wq; +}; + +static void mcg_clean_task(struct work_struct *work) +{ + struct clean_work *cw = container_of(work, struct clean_work, work); + + _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq); + cw->ctx->flushing = 0; + kfree(cw); +} + +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + struct clean_work *work; + + if (ctx->flushing) + return; + + ctx->flushing = 1; + + if (destroy_wq) { + _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq); + ctx->flushing = 0; + return; + } + + work = kmalloc(sizeof *work, GFP_KERNEL); + if (!work) { + ctx->flushing = 0; + return; + } + + work->ctx = ctx; + work->destroy_wq = destroy_wq; + INIT_WORK(&work->work, mcg_clean_task); + queue_work(clean_wq, &work->work); +} + +static void build_leave_mad(struct mcast_req *req) +{ + struct ib_sa_mad *mad = &req->sa_mad; + + mad->mad_hdr.method = IB_SA_METHOD_DELETE; +} + + +static void clear_pending_reqs(struct mcast_group *group, int vf) +{ + struct mcast_req *req, *tmp, *group_first = NULL; + int clear; + int pend = 0; + + if (!list_empty(&group->pending_list)) + group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list); + + list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) { + clear = 1; + if (group_first == req && + (group->state == MCAST_JOIN_SENT || + group->state == MCAST_LEAVE_SENT)) { + clear = cancel_delayed_work(&group->timeout_work); + pend = !clear; + group->state = MCAST_IDLE; + } + if (clear) { + --group->func[vf].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + atomic_dec(&group->refcount); + } + } + + if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) { + mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n", + list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs); + } +} + +static int push_deleteing_req(struct mcast_group *group, int slave) +{ + struct mcast_req *req; + struct mcast_req *pend_req; + + if (!group->func[slave].join_state) + return 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + if (!list_empty(&group->func[slave].pending)) { + pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list); + if (pend_req->clean) { + kfree(req); + return 0; + } + } + + req->clean = 1; + req->func = slave; + req->group = group; + ++group->func[slave].num_pend_reqs; + build_leave_mad(req); + queue_req(req); + return 0; +} + +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave) +{ + struct mcast_group *group; + struct rb_node *p; + + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) { + group = rb_entry(p, struct mcast_group, node); + mutex_lock(&group->lock); + if (atomic_read(&group->refcount)) { + /* clear pending requests of this VF */ + clear_pending_reqs(group, slave); + push_deleteing_req(group, slave); + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); +} + + +int mlx4_ib_mcg_init(void) +{ + clean_wq = alloc_ordered_workqueue("mlx4_ib_mcg", WQ_MEM_RECLAIM); + if (!clean_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_ib_mcg_destroy(void) +{ + destroy_workqueue(clean_wq); +} diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h 
b/drivers/infiniband/hw/mlx4/mlx4_ib.h new file mode 100644 index 0000000..7b14299 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -0,0 +1,925 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_IB_H +#define MLX4_IB_H + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MLX4_IB_DRV_NAME "mlx4_ib" + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__ + +#define mlx4_ib_warn(ibdev, format, arg...) 
\ + dev_warn((ibdev)->dev.parent, MLX4_IB_DRV_NAME ": " format, ## arg) + +enum { + MLX4_IB_SQ_MIN_WQE_SHIFT = 6, + MLX4_IB_MAX_HEADROOM = 2048 +}; + +#define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1) +#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) + +/*module param to indicate if SM assigns the alias_GUID*/ +extern int mlx4_ib_sm_guid_assign; + +#define MLX4_IB_UC_STEER_QPN_ALIGN 1 +#define MLX4_IB_UC_MAX_NUM_QPS 256 + +enum hw_bar_type { + HW_BAR_BF, + HW_BAR_DB, + HW_BAR_CLOCK, + HW_BAR_COUNT +}; + +struct mlx4_ib_vma_private_data { + struct vm_area_struct *vma; +}; + +struct mlx4_ib_ucontext { + struct ib_ucontext ibucontext; + struct mlx4_uar uar; + struct list_head db_page_list; + struct mutex db_page_mutex; + struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT]; + struct list_head wqn_ranges_list; + struct mutex wqn_ranges_mutex; /* protect wqn_ranges_list */ +}; + +struct mlx4_ib_pd { + struct ib_pd ibpd; + u32 pdn; +}; + +struct mlx4_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; + struct ib_pd *pd; + struct ib_cq *cq; +}; + +struct mlx4_ib_cq_buf { + struct mlx4_buf buf; + struct mlx4_mtt mtt; + int entry_size; +}; + +struct mlx4_ib_cq_resize { + struct mlx4_ib_cq_buf buf; + int cqe; +}; + +struct mlx4_ib_cq { + struct ib_cq ibcq; + struct mlx4_cq mcq; + struct mlx4_ib_cq_buf buf; + struct mlx4_ib_cq_resize *resize_buf; + struct mlx4_db db; + spinlock_t lock; + struct mutex resize_mutex; + struct ib_umem *umem; + struct ib_umem *resize_umem; + int create_flags; + /* List of qps that it serves.*/ + struct list_head send_qp_list; + struct list_head recv_qp_list; +}; + +#define MLX4_MR_PAGES_ALIGN 0x40 + +struct mlx4_ib_mr { + struct ib_mr ibmr; + __be64 *pages; + dma_addr_t page_map; + u32 npages; + u32 max_pages; + struct mlx4_mr mmr; + struct ib_umem *umem; + size_t page_map_size; +}; + +struct mlx4_ib_mw { + struct ib_mw ibmw; + struct mlx4_mw mmw; +}; + +struct mlx4_ib_fmr { + struct ib_fmr ibfmr; + struct mlx4_fmr mfmr; +}; + +#define MAX_REGS_PER_FLOW 2 + +struct mlx4_flow_reg_id { + u64 id; + u64 mirror; +}; + +struct mlx4_ib_flow { + struct ib_flow ibflow; + /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */ + struct mlx4_flow_reg_id reg_id[MAX_REGS_PER_FLOW]; +}; + +struct mlx4_ib_wq { + u64 *wrid; + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; +}; + +enum { + MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START +}; + +enum mlx4_ib_qp_flags { + MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, + MLX4_IB_QP_SCATTER_FCS = IB_QP_CREATE_SCATTER_FCS, + + /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */ + MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI, + MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, + MLX4_IB_SRIOV_SQP = 1 << 31, +}; + +struct mlx4_ib_gid_entry { + struct list_head list; + union ib_gid gid; + int added; + u8 port; +}; + +enum mlx4_ib_qp_type { + /* + * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries + * here (and in that order) since the MAD layer uses them as + * indices into a 2-entry table. 
+ */ + MLX4_IB_QPT_SMI = IB_QPT_SMI, + MLX4_IB_QPT_GSI = IB_QPT_GSI, + + MLX4_IB_QPT_RC = IB_QPT_RC, + MLX4_IB_QPT_UC = IB_QPT_UC, + MLX4_IB_QPT_UD = IB_QPT_UD, + MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6, + MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE, + MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET, + MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI, + MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT, + + MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16, + MLX4_IB_QPT_PROXY_SMI = 1 << 17, + MLX4_IB_QPT_PROXY_GSI = 1 << 18, + MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19, + MLX4_IB_QPT_TUN_SMI = 1 << 20, + MLX4_IB_QPT_TUN_GSI = 1 << 21, +}; + +#define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \ + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \ + MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI) + +enum mlx4_ib_mad_ifc_flags { + MLX4_MAD_IFC_IGNORE_MKEY = 1, + MLX4_MAD_IFC_IGNORE_BKEY = 2, + MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY | + MLX4_MAD_IFC_IGNORE_BKEY), + MLX4_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX4_NUM_TUNNEL_BUFS = 256, +}; + +struct mlx4_ib_tunnel_header { + struct mlx4_av av; + __be32 remote_qpn; + __be32 qkey; + __be16 vlan; + u8 mac[6]; + __be16 pkey_index; + u8 reserved[6]; +}; + +struct mlx4_ib_buf { + void *addr; + dma_addr_t map; +}; + +struct mlx4_rcv_tunnel_hdr { + __be32 flags_src_qp; /* flags[6:5] is defined for VLANs: + * 0x0 - no vlan was in the packet + * 0x01 - C-VLAN was in the packet */ + u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */ + u8 reserved; + __be16 pkey_index; + __be16 sl_vid; + __be16 slid_mac_47_32; + __be32 mac_31_0; +}; + +struct mlx4_ib_proxy_sqp_hdr { + struct ib_grh grh; + struct mlx4_rcv_tunnel_hdr tun; +} __packed; + +struct mlx4_roce_smac_vlan_info { + u64 smac; + int smac_index; + int smac_port; + u64 candidate_smac; + int candidate_smac_index; + int candidate_smac_port; + u16 vid; + int vlan_index; + int vlan_port; + u16 candidate_vid; + int candidate_vlan_index; + int candidate_vlan_port; + int update_vid; +}; + +struct mlx4_wqn_range { + int base_wqn; + int size; + int refcount; + bool dirty; + struct list_head list; +}; + +struct mlx4_ib_rss { + unsigned int base_qpn_tbl_sz; + u8 flags; + u8 rss_key[MLX4_EN_RSS_KEY_SIZE]; +}; + +struct mlx4_ib_qp { + union { + struct ib_qp ibqp; + struct ib_wq ibwq; + }; + struct mlx4_qp mqp; + struct mlx4_buf buf; + + struct mlx4_db db; + struct mlx4_ib_wq rq; + + u32 doorbell_qpn; + __be32 sq_signal_bits; + unsigned sq_next_wqe; + int sq_max_wqes_per_wr; + int sq_spare_wqes; + struct mlx4_ib_wq sq; + + enum mlx4_ib_qp_type mlx4_ib_qp_type; + struct ib_umem *umem; + struct mlx4_mtt mtt; + int buf_size; + struct mutex mutex; + u16 xrcdn; + u32 flags; + u8 port; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; + u8 sq_no_prefetch; + u8 state; + int mlx_type; + u32 inl_recv_sz; + struct list_head gid_list; + struct list_head steering_rules; + struct mlx4_ib_buf *sqp_proxy_rcv; + struct mlx4_roce_smac_vlan_info pri; + struct mlx4_roce_smac_vlan_info alt; + u64 reg_id; + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; + struct counter_index *counter_index; + struct mlx4_wqn_range *wqn_range; + /* Number of RSS QP parents that uses this WQ */ + u32 rss_usecnt; + struct mlx4_ib_rss *rss_ctx; +}; + +struct mlx4_ib_srq { + struct ib_srq ibsrq; + struct mlx4_srq msrq; + struct mlx4_buf buf; + struct mlx4_db db; + u64 *wrid; + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + struct mlx4_mtt mtt; + struct mutex mutex; 
+}; + +struct mlx4_ib_ah { + struct ib_ah ibah; + union mlx4_ext_av av; +}; + +/****************************************/ +/* alias guid support */ +/****************************************/ +#define NUM_PORT_ALIAS_GUID 2 +#define NUM_ALIAS_GUID_IN_REC 8 +#define NUM_ALIAS_GUID_REC_IN_PORT 16 +#define GUID_REC_SIZE 8 +#define NUM_ALIAS_GUID_PER_PORT 128 +#define MLX4_NOT_SET_GUID (0x00LL) +#define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL)) + +enum mlx4_guid_alias_rec_status { + MLX4_GUID_INFO_STATUS_IDLE, + MLX4_GUID_INFO_STATUS_SET, +}; + +#define GUID_STATE_NEED_PORT_INIT 0x01 + +enum mlx4_guid_alias_rec_method { + MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, + MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE, +}; + +struct mlx4_sriov_alias_guid_info_rec_det { + u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; + ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/ + enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ + unsigned int guids_retry_schedule[NUM_ALIAS_GUID_IN_REC]; + u64 time_to_run; +}; + +struct mlx4_sriov_alias_guid_port_rec_det { + struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT]; + struct workqueue_struct *wq; + struct delayed_work alias_guid_work; + u8 port; + u32 state_flags; + struct mlx4_sriov_alias_guid *parent; + struct list_head cb_list; +}; + +struct mlx4_sriov_alias_guid { + struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS]; + spinlock_t ag_work_lock; + struct ib_sa_client *sa_client; +}; + +struct mlx4_ib_demux_work { + struct work_struct work; + struct mlx4_ib_dev *dev; + int slave; + int do_init; + u8 port; + +}; + +struct mlx4_ib_tun_tx_buf { + struct mlx4_ib_buf buf; + struct ib_ah *ah; +}; + +struct mlx4_ib_demux_pv_qp { + struct ib_qp *qp; + enum ib_qp_type proxy_qpt; + struct mlx4_ib_buf *ring; + struct mlx4_ib_tun_tx_buf *tx_ring; + spinlock_t tx_lock; + unsigned tx_ix_head; + unsigned tx_ix_tail; +}; + +enum mlx4_ib_demux_pv_state { + DEMUX_PV_STATE_DOWN, + DEMUX_PV_STATE_STARTING, + DEMUX_PV_STATE_ACTIVE, + DEMUX_PV_STATE_DOWNING, +}; + +struct mlx4_ib_demux_pv_ctx { + int port; + int slave; + enum mlx4_ib_demux_pv_state state; + int has_smi; + struct ib_device *ib_dev; + struct ib_cq *cq; + struct ib_pd *pd; + struct work_struct work; + struct workqueue_struct *wq; + struct mlx4_ib_demux_pv_qp qp[2]; +}; + +struct mlx4_ib_demux_ctx { + struct ib_device *ib_dev; + int port; + struct workqueue_struct *wq; + struct workqueue_struct *ud_wq; + spinlock_t ud_lock; + atomic64_t subnet_prefix; + __be64 guid_cache[128]; + struct mlx4_ib_dev *dev; + /* the following lock protects both mcg_table and mcg_mgid0_list */ + struct mutex mcg_table_lock; + struct rb_root mcg_table; + struct list_head mcg_mgid0_list; + struct workqueue_struct *mcg_wq; + struct mlx4_ib_demux_pv_ctx **tun; + atomic_t tid; + int flushing; /* flushing the work queue */ +}; + +struct mlx4_ib_sriov { + struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS]; + struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS]; + /* when using this spinlock you should use "irq" because + * it may be called from interrupt context.*/ + spinlock_t going_down_lock; + int is_going_down; + + struct mlx4_sriov_alias_guid alias_guid; + + /* CM paravirtualization fields */ + struct list_head cm_list; + spinlock_t id_map_lock; + struct rb_root sl_id_map; + struct idr pv_id_table; +}; + +struct gid_cache_context { + int real_index; + int refcount; +}; + +struct gid_entry { + union ib_gid gid; + enum 
ib_gid_type gid_type; + struct gid_cache_context *ctx; +}; + +struct mlx4_port_gid_table { + struct gid_entry gids[MLX4_MAX_PORT_GIDS]; +}; + +struct mlx4_ib_iboe { + spinlock_t lock; + struct net_device *netdevs[MLX4_MAX_PORTS]; + atomic64_t mac[MLX4_MAX_PORTS]; + struct notifier_block nb; + struct mlx4_port_gid_table gids[MLX4_MAX_PORTS]; +}; + +struct pkey_mgt { + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct list_head pkey_port_list[MLX4_MFUNC_MAX]; + struct kobject *device_parent[MLX4_MFUNC_MAX]; +}; + +struct mlx4_ib_iov_sysfs_attr { + void *ctx; + struct kobject *kobj; + unsigned long data; + u32 entry_num; + char name[15]; + struct device_attribute dentry; + struct device *dev; +}; + +struct mlx4_ib_iov_sysfs_attr_ar { + struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1]; +}; + +struct mlx4_ib_iov_port { + char name[100]; + u8 num; + struct mlx4_ib_dev *dev; + struct list_head list; + struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar; + struct ib_port_attr attr; + struct kobject *cur_port; + struct kobject *admin_alias_parent; + struct kobject *gids_parent; + struct kobject *pkeys_parent; + struct kobject *mcgs_parent; + struct mlx4_ib_iov_sysfs_attr mcg_dentry; +}; + +struct counter_index { + struct list_head list; + u32 index; + u8 allocated; +}; + +struct mlx4_ib_counters { + struct list_head counters_list; + struct mutex mutex; /* mutex for accessing counters list */ + u32 default_counter; +}; + +#define MLX4_DIAG_COUNTERS_TYPES 2 + +struct mlx4_ib_diag_counters { + const char **name; + u32 *offset; + u32 num_counters; +}; + +struct mlx4_ib_dev { + struct ib_device ib_dev; + struct mlx4_dev *dev; + int num_ports; + void __iomem *uar_map; + + struct mlx4_uar priv_uar; + u32 priv_pdn; + MLX4_DECLARE_DOORBELL_LOCK(uar_lock); + + struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2]; + struct ib_ah *sm_ah[MLX4_MAX_PORTS]; + spinlock_t sm_lock; + atomic64_t sl2vl[MLX4_MAX_PORTS]; + struct mlx4_ib_sriov sriov; + + struct mutex cap_mask_mutex; + bool ib_active; + struct mlx4_ib_iboe iboe; + struct mlx4_ib_counters counters_table[MLX4_MAX_PORTS]; + int *eq_table; + struct kobject *iov_parent; + struct kobject *ports_parent; + struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; + struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; + struct pkey_mgt pkeys; + unsigned long *ib_uc_qpns_bitmap; + int steer_qpn_count; + int steer_qpn_base; + int steering_support; + struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS]; + /* lock when destroying qp1_proxy and getting netdev events */ + struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; + u8 bond_next_port; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; + struct mlx4_ib_diag_counters diag_counters[MLX4_DIAG_COUNTERS_TYPES]; +}; + +struct ib_event_work { + struct work_struct work; + struct mlx4_ib_dev *ib_dev; + struct mlx4_eqe ib_eqe; + int port; +}; + +struct mlx4_ib_qp_tunnel_init_attr { + struct ib_qp_init_attr init_attr; + int slave; + enum ib_qp_type proxy_qp_type; + u8 port; +}; + +struct mlx4_uverbs_ex_query_device { + __u32 comp_mask; + __u32 reserved; +}; + +static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx4_ib_dev, ib_dev); +} + +static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext); +} + +static inline struct 
mlx4_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx4_ib_pd, ibpd); +} + +static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd); +} + +static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx4_ib_cq, ibcq); +} + +static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq) +{ + return container_of(mcq, struct mlx4_ib_cq, mcq); +} + +static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx4_ib_mr, ibmr); +} + +static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx4_ib_mw, ibmw); +} + +static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); +} + +static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) +{ + return container_of(ibflow, struct mlx4_ib_flow, ibflow); +} + +static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx4_ib_qp, ibqp); +} + +static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_qp, mqp); +} + +static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx4_ib_srq, ibsrq); +} + +static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq) +{ + return container_of(msrq, struct mlx4_ib_srq, msrq); +} + +static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx4_ib_ah, ibah); +} + +static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev) +{ + dev->bond_next_port = (dev->bond_next_port + 1) % dev->num_ports; + + return dev->bond_next_port + 1; +} + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); + +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, + struct mlx4_db *db); +void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); + +struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc); +int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem); +struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int mlx4_ib_dereg_mr(struct ib_mr *mr); +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); +int mlx4_ib_dealloc_mw(struct ib_mw *mw); +struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); +int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx4_ib_destroy_cq(struct ib_cq *cq); +int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); +void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); + +struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata 
*udata); +int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); +int mlx4_ib_destroy_ah(struct ib_ah *ah); + +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int mlx4_ib_destroy_srq(struct ib_srq *srq); +void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index); +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_qp(struct ib_qp *qp); +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, + int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const void *in_mad, void *response_mad); +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); +int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); +void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); + +struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); +int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, + u64 iova); +int mlx4_ib_unmap_fmr(struct list_head *fmr_list); +int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); +int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, int netw_view); +int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey, int netw_view); + +int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid, int netw_view); + +static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) +{ + u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; + + if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) + return true; + + return !!(ah->av.ib.g_slid & 0x80); +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave); +int mlx4_ib_mcg_init(void); +void mlx4_ib_mcg_destroy(void); + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid); + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad); +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad); + +int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + union ib_gid *gid); + +void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num, + enum ib_event_type type); + +void mlx4_ib_tunnels_update_work(struct work_struct *work); + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 
port, + enum ib_qp_type qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad); + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, + u32 qkey, struct rdma_ah_attr *attr, u8 *s_mac, + u16 vlan_id, struct ib_mad *mad); + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad); + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad); + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev); +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id); + +/* alias guid support */ +void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port); +int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev); +void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port); + +void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, + u8 port_num, u8 *p_data); + +void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, + int block_num, u8 port_num, + u8 *p_data); + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); +void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, + int port, int slave_init); + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); + +__be64 mlx4_ib_gen_node_guid(void); + +int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); +void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); +int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, + int is_attach); +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata); +int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, + u8 port_num, int index); + +void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, + int port); + +void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port); + +struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_wq(struct ib_wq *wq); +int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); + +struct ib_rwq_ind_table +*mlx4_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); +int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, + int *num_of_mtts); + +#endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c new file mode 100644 index 0000000..61d8b06 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -0,0 +1,789 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mlx4_ib.h" + +static u32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) | + MLX4_PERM_LOCAL_READ; +} + +static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type) +{ + switch (type) { + case IB_MW_TYPE_1: return MLX4_MW_TYPE_1; + case IB_MW_TYPE_2: return MLX4_MW_TYPE_2; + default: return -1; + } +} + +struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx4_ib_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0, + ~0ull, convert_access(acc), 0, 0, &mr->mmr); + if (err) + goto err_free; + + err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +enum { + MLX4_MAX_MTT_SHIFT = 31 +}; + +static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev, + struct mlx4_mtt *mtt, + u64 mtt_size, u64 mtt_shift, u64 len, + u64 cur_start_addr, u64 *pages, + int *start_index, int *npages) +{ + u64 cur_end_addr = cur_start_addr + len; + u64 cur_end_addr_aligned = 0; + u64 mtt_entries; + int err = 0; + int k; + + len += (cur_start_addr & (mtt_size - 1ULL)); + cur_end_addr_aligned = round_up(cur_end_addr, mtt_size); + len += (cur_end_addr_aligned - cur_end_addr); + if (len & (mtt_size - 1ULL)) { + pr_warn("write_block: len %llx is not aligned to mtt_size %llx\n", + len, mtt_size); + return -EINVAL; + } + + mtt_entries = (len >> mtt_shift); + + /* + * Align the MTT start address to the mtt_size. + * Required to handle cases when the MR starts in the middle of an MTT + * record. Was not required in old code since the physical addresses + * provided by the dma subsystem were page aligned, which was also the + * MTT size. 
+ */ + cur_start_addr = round_down(cur_start_addr, mtt_size); + /* A new block is started ... */ + for (k = 0; k < mtt_entries; ++k) { + pages[*npages] = cur_start_addr + (mtt_size * k); + (*npages)++; + /* + * Be friendly to mlx4_write_mtt() and pass it chunks of + * appropriate size. + */ + if (*npages == PAGE_SIZE / sizeof(u64)) { + err = mlx4_write_mtt(dev->dev, mtt, *start_index, + *npages, pages); + if (err) + return err; + + (*start_index) += *npages; + *npages = 0; + } + } + + return 0; +} + +static inline u64 alignment_of(u64 ptr) +{ + return ilog2(ptr & (~(ptr - 1))); +} + +static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start, + u64 current_block_end, + u64 block_shift) +{ + /* Check whether the alignment of the new block is aligned as well as + * the previous block. + * Block address must start with zeros till size of entity_size. + */ + if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0) + /* + * It is not as well aligned as the previous block-reduce the + * mtt size accordingly. Here we take the last right bit which + * is 1. + */ + block_shift = alignment_of(next_block_start); + + /* + * Check whether the alignment of the end of previous block - is it + * aligned as well as the start of the block + */ + if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0) + /* + * It is not as well aligned as the start of the block - + * reduce the mtt size accordingly. + */ + block_shift = alignment_of(current_block_end); + + return block_shift; +} + +int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, + struct ib_umem *umem) +{ + u64 *pages; + u64 len = 0; + int err = 0; + u64 mtt_size; + u64 cur_start_addr = 0; + u64 mtt_shift; + int start_index = 0; + int npages = 0; + struct scatterlist *sg; + int i; + + pages = (u64 *) __get_free_page(GFP_KERNEL); + if (!pages) + return -ENOMEM; + + mtt_shift = mtt->page_shift; + mtt_size = 1ULL << mtt_shift; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { + if (cur_start_addr + len == sg_dma_address(sg)) { + /* still the same block */ + len += sg_dma_len(sg); + continue; + } + /* + * A new block is started ... + * If len is malaligned, write an extra mtt entry to cover the + * misaligned area (round up the division) + */ + err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, + mtt_shift, len, + cur_start_addr, + pages, &start_index, + &npages); + if (err) + goto out; + + cur_start_addr = sg_dma_address(sg); + len = sg_dma_len(sg); + } + + /* Handle the last block */ + if (len > 0) { + /* + * If len is malaligned, write an extra mtt entry to cover + * the misaligned area (round up the division) + */ + err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, + mtt_shift, len, + cur_start_addr, pages, + &start_index, &npages); + if (err) + goto out; + } + + if (npages) + err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages); + +out: + free_page((unsigned long) pages); + return err; +} + +/* + * Calculate optimal mtt size based on contiguous pages. + * Function will return also the number of pages that are not aligned to the + * calculated mtt_size to be added to total number of pages. For that we should + * check the first chunk length & last chunk length and if not aligned to + * mtt_size we should increment the non_aligned_pages number. All chunks in the + * middle already handled as part of mtt shift calculation for both their start + * & end addresses. 
+ */ +int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, + int *num_of_mtts) +{ + u64 block_shift = MLX4_MAX_MTT_SHIFT; + u64 min_shift = umem->page_shift; + u64 last_block_aligned_end = 0; + u64 current_block_start = 0; + u64 first_block_start = 0; + u64 current_block_len = 0; + u64 last_block_end = 0; + struct scatterlist *sg; + u64 current_block_end; + u64 misalignment_bits; + u64 next_block_start; + u64 total_len = 0; + int i; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { + /* + * Initialization - save the first chunk start as the + * current_block_start - block means contiguous pages. + */ + if (current_block_len == 0 && current_block_start == 0) { + current_block_start = sg_dma_address(sg); + first_block_start = current_block_start; + /* + * Find the bits that are different between the physical + * address and the virtual address for the start of the + * MR. + * umem_get aligned the start_va to a page boundary. + * Therefore, we need to align the start va to the same + * boundary. + * misalignment_bits is needed to handle the case of a + * single memory region. In this case, the rest of the + * logic will not reduce the block size. If we use a + * block size which is bigger than the alignment of the + * misalignment bits, we might use the virtual page + * number instead of the physical page number, resulting + * in access to the wrong data. + */ + misalignment_bits = + (start_va & (~(((u64)(BIT(umem->page_shift))) - 1ULL))) + ^ current_block_start; + block_shift = min(alignment_of(misalignment_bits), + block_shift); + } + + /* + * Go over the scatter entries and check if they continue the + * previous scatter entry. + */ + next_block_start = sg_dma_address(sg); + current_block_end = current_block_start + current_block_len; + /* If we have a split (non-contig.) between two blocks */ + if (current_block_end != next_block_start) { + block_shift = mlx4_ib_umem_calc_block_mtt + (next_block_start, + current_block_end, + block_shift); + + /* + * If we reached the minimum shift for 4k page we stop + * the loop. + */ + if (block_shift <= min_shift) + goto end; + + /* + * If not saved yet we are in first block - we save the + * length of first block to calculate the + * non_aligned_pages number at the end. + */ + total_len += current_block_len; + + /* Start a new block */ + current_block_start = next_block_start; + current_block_len = sg_dma_len(sg); + continue; + } + /* The scatter entry is another part of the current block, + * increase the block size. + * An entry in the scatter can be larger than 4k (page) as of + * dma mapping which merge some blocks together. + */ + current_block_len += sg_dma_len(sg); + } + + /* Account for the last block in the total len */ + total_len += current_block_len; + /* Add to the first block the misalignment that it suffers from. */ + total_len += (first_block_start & ((1ULL << block_shift) - 1ULL)); + last_block_end = current_block_start + current_block_len; + last_block_aligned_end = round_up(last_block_end, 1ULL << block_shift); + total_len += (last_block_aligned_end - last_block_end); + + if (total_len & ((1ULL << block_shift) - 1ULL)) + pr_warn("misaligned total length detected (%llu, %llu)!", + total_len, block_shift); + + *num_of_mtts = total_len >> block_shift; +end: + if (block_shift < min_shift) { + /* + * If shift is less than the min we set a warning and return the + * min shift. 
+ */ + pr_warn("umem_calc_optimal_mtt_size - unexpected shift %lld\n", block_shift); + + block_shift = min_shift; + } + return block_shift; +} + +struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mr *mr; + int shift; + int err; + int n; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + /* Force registering the memory as writable. */ + /* Used for memory re-registeration. HCA protects the access */ + mr->umem = ib_umem_get(pd->uobject->context, start, length, + access_flags | IB_ACCESS_LOCAL_WRITE, 0); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + goto err_free; + } + + n = ib_umem_page_count(mr->umem); + shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n); + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, + convert_access(access_flags), n, shift, &mr->mmr); + if (err) + goto err_umem; + + err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem); + if (err) + goto err_mr; + + err = mlx4_mr_enable(dev->dev, &mr->mmr); + if (err) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->ibmr.length = length; + mr->ibmr.iova = virt_addr; + mr->ibmr.page_size = 1U << shift; + + return &mr->ibmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(mr->device); + struct mlx4_ib_mr *mmr = to_mmr(mr); + struct mlx4_mpt_entry *mpt_entry; + struct mlx4_mpt_entry **pmpt_entry = &mpt_entry; + int err; + + /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs, + * we assume that the calls can't run concurrently. Otherwise, a + * race exists. + */ + err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry); + + if (err) + return err; + + if (flags & IB_MR_REREG_PD) { + err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry, + to_mpd(pd)->pdn); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_ACCESS) { + err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, + convert_access(mr_access_flags)); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_TRANS) { + int shift; + int n; + + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + mmr->umem = ib_umem_get(mr->uobject->context, start, length, + mr_access_flags | + IB_ACCESS_LOCAL_WRITE, + 0); + if (IS_ERR(mmr->umem)) { + err = PTR_ERR(mmr->umem); + /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */ + mmr->umem = NULL; + goto release_mpt_entry; + } + n = ib_umem_page_count(mmr->umem); + shift = mmr->umem->page_shift; + + err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr, + virt_addr, length, n, shift, + *pmpt_entry); + if (err) { + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + mmr->mmr.iova = virt_addr; + mmr->mmr.size = length; + + err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem); + if (err) { + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + } + + /* If we couldn't transfer the MR to the HCA, just remember to + * return a failure. But dereg_mr will free the resources. 
+ */ + err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry); + if (!err && flags & IB_MR_REREG_ACCESS) + mmr->mmr.access = mr_access_flags; + +release_mpt_entry: + mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry); + + return err; +} + +static int +mlx4_alloc_priv_pages(struct ib_device *device, + struct mlx4_ib_mr *mr, + int max_pages) +{ + int ret; + + /* Ensure that size is aligned to DMA cacheline + * requirements. + * max_pages is limited to MLX4_MAX_FAST_REG_PAGES + * so page_map_size will never cross PAGE_SIZE. + */ + mr->page_map_size = roundup(max_pages * sizeof(u64), + MLX4_MR_PAGES_ALIGN); + + /* Prevent cross page boundary allocation. */ + mr->pages = (__be64 *)get_zeroed_page(GFP_KERNEL); + if (!mr->pages) + return -ENOMEM; + + mr->page_map = dma_map_single(device->dev.parent, mr->pages, + mr->page_map_size, DMA_TO_DEVICE); + + if (dma_mapping_error(device->dev.parent, mr->page_map)) { + ret = -ENOMEM; + goto err; + } + + return 0; + +err: + free_page((unsigned long)mr->pages); + return ret; +} + +static void +mlx4_free_priv_pages(struct mlx4_ib_mr *mr) +{ + if (mr->pages) { + struct ib_device *device = mr->ibmr.device; + + dma_unmap_single(device->dev.parent, mr->page_map, + mr->page_map_size, DMA_TO_DEVICE); + free_page((unsigned long)mr->pages); + mr->pages = NULL; + } +} + +int mlx4_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx4_ib_mr *mr = to_mmr(ibmr); + int ret; + + mlx4_free_priv_pages(mr); + + ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr); + if (ret) + return ret; + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + + return 0; +} + +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mw *mw; + int err; + + mw = kmalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + return ERR_PTR(-ENOMEM); + + err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, + to_mlx4_type(type), &mw->mmw); + if (err) + goto err_free; + + err = mlx4_mw_enable(dev->dev, &mw->mmw); + if (err) + goto err_mw; + + mw->ibmw.rkey = mw->mmw.key; + + return &mw->ibmw; + +err_mw: + mlx4_mw_free(dev->dev, &mw->mmw); + +err_free: + kfree(mw); + + return ERR_PTR(err); +} + +int mlx4_ib_dealloc_mw(struct ib_mw *ibmw) +{ + struct mlx4_ib_mw *mw = to_mmw(ibmw); + + mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw); + kfree(mw); + + return 0; +} + +struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mr *mr; + int err; + + if (mr_type != IB_MR_TYPE_MEM_REG || + max_num_sg > MLX4_MAX_FAST_REG_PAGES) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0, + max_num_sg, 0, &mr->mmr); + if (err) + goto err_free; + + err = mlx4_alloc_priv_pages(pd->device, mr, max_num_sg); + if (err) + goto err_free_mr; + + mr->max_pages = max_num_sg; + err = mlx4_mr_enable(dev->dev, &mr->mmr); + if (err) + goto err_free_pl; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_free_pl: + mr->ibmr.device = pd->device; + mlx4_free_priv_pages(mr); +err_free_mr: + (void) mlx4_mr_free(dev->dev, &mr->mmr); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, + struct ib_fmr_attr *fmr_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_fmr *fmr; + int err = -ENOMEM; + + fmr = 
kmalloc(sizeof *fmr, GFP_KERNEL); + if (!fmr) + return ERR_PTR(-ENOMEM); + + err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc), + fmr_attr->max_pages, fmr_attr->max_maps, + fmr_attr->page_shift, &fmr->mfmr); + if (err) + goto err_free; + + err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr); + if (err) + goto err_mr; + + fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key; + + return &fmr->ibfmr; + +err_mr: + (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); + +err_free: + kfree(fmr); + + return ERR_PTR(err); +} + +int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int npages, u64 iova) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device); + + return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova, + &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); +} + +int mlx4_ib_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *ibfmr; + int err; + struct mlx4_dev *mdev = NULL; + + list_for_each_entry(ibfmr, fmr_list, list) { + if (mdev && to_mdev(ibfmr->device)->dev != mdev) + return -EINVAL; + mdev = to_mdev(ibfmr->device)->dev; + } + + if (!mdev) + return 0; + + list_for_each_entry(ibfmr, fmr_list, list) { + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + + mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); + } + + /* + * Make sure all MPT status updates are visible before issuing + * SYNC_TPT firmware command. + */ + wmb(); + + err = mlx4_SYNC_TPT(mdev); + if (err) + pr_warn("SYNC_TPT error %d when " + "unmapping FMRs\n", err); + + return 0; +} + +int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + struct mlx4_ib_dev *dev = to_mdev(ibfmr->device); + int err; + + err = mlx4_fmr_free(dev->dev, &ifmr->mfmr); + + if (!err) + kfree(ifmr); + + return err; +} + +static int mlx4_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct mlx4_ib_mr *mr = to_mmr(ibmr); + + if (unlikely(mr->npages == mr->max_pages)) + return -ENOMEM; + + mr->pages[mr->npages++] = cpu_to_be64(addr | MLX4_MTT_FLAG_PRESENT); + + return 0; +} + +int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct mlx4_ib_mr *mr = to_mmr(ibmr); + int rc; + + mr->npages = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, mr->page_map, + mr->page_map_size, DMA_TO_DEVICE); + + rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx4_set_page); + + ib_dma_sync_single_for_device(ibmr->device, mr->page_map, + mr->page_map_size, DMA_TO_DEVICE); + + return rc; +} diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c new file mode 100644 index 0000000..199648a --- /dev/null +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -0,0 +1,4467 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/log2.h> +#include <linux/etherdevice.h> +#include <net/ip.h> +#include <linux/slab.h> +#include <linux/netdevice.h> + +#include <rdma/ib_cache.h> +#include <rdma/ib_pack.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_mad.h> + +#include <linux/mlx4/driver.h> +#include <linux/mlx4/qp.h> + +#include "mlx4_ib.h" +#include <rdma/mlx4-abi.h> + +static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state); + +enum { + MLX4_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX4_IB_LINK_TYPE_IB = 0, + MLX4_IB_LINK_TYPE_ETH = 1 +}; + +enum { + /* + * Largest possible UD header: send with GRH and immediate + * data plus 18 bytes for an Ethernet header with VLAN/802.1Q + * tag. (LRH would only use 8 bytes, so Ethernet is the + * biggest case) + */ + MLX4_IB_UD_HEADER_SIZE = 82, + MLX4_IB_LSO_HEADER_SPARE = 128, +}; + +struct mlx4_ib_sqp { + struct mlx4_ib_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; + struct ib_qp *roce_v2_gsi; +}; + +enum { + MLX4_IB_MIN_SQ_STRIDE = 6, + MLX4_IB_CACHE_LINE_SIZE = 64, +}; + +enum { + MLX4_RAW_QP_MTU = 7, + MLX4_RAW_QP_MSGMAX = 31, +}; + +#ifndef ETH_ALEN +#define ETH_ALEN 6 +#endif + +static const __be32 mlx4_ib_opcode[] = { + [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), + [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), + [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM), + [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE), + [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM), + [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ), + [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), + [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), + [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL), + [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL), + [IB_WR_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), +}; + +enum mlx4_ib_source_type { + MLX4_IB_QP_SRC = 0, + MLX4_IB_RWQ_SRC = 1, +}; + +static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_sqp, qp); +} + +static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (!mlx4_is_master(dev->dev)) + return 0; + + return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn && + qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn + + 8 * MLX4_MFUNC_MAX; +} + +static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + int proxy_sqp = 0; + int real_sqp = 0; + int i; + /* PPF or Native -- real SQP */ + real_sqp = ((mlx4_is_master(dev->dev) ||
!mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3); + if (real_sqp) + return 1; + /* VF or PF -- proxy SQP */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy || + qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp1_proxy) { + proxy_sqp = 1; + break; + } + } + } + if (proxy_sqp) + return 1; + + return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP); +} + +/* used for INIT/CLOSE port logic */ +static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + int proxy_qp0 = 0; + int real_qp0 = 0; + int i; + /* PPF or Native -- real QP0 */ + real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) && + qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn && + qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1); + if (real_qp0) + return 1; + /* VF or PF -- proxy QP0 */ + if (mlx4_is_mfunc(dev->dev)) { + for (i = 0; i < dev->dev->caps.num_ports; i++) { + if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy) { + proxy_qp0 = 1; + break; + } + } + } + return proxy_qp0; +} + +static void *get_wqe(struct mlx4_ib_qp *qp, int offset) +{ + return mlx4_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift)); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with + * 0x7FFFFFF | (invalid_ownership_value << 31). + * + * When the max work request size is less than or equal to the WQE + * basic block size, as an optimization, we can stamp all WQEs with + * 0xffffffff, and skip the very first chunk of each WQE. + */ +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + __be32 *wqe; + int i; + int s; + int ind; + void *buf; + __be32 stamp; + struct mlx4_wqe_ctrl_seg *ctrl; + + if (qp->sq_max_wqes_per_wr > 1) { + s = roundup(size, 1U << qp->sq.wqe_shift); + for (i = 0; i < s; i += 64) { + ind = (i >> qp->sq.wqe_shift) + n; + stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) : + cpu_to_be32(0xffffffff); + buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); + *wqe = stamp; + } + } else { + ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4; + for (i = 64; i < s; i += 64) { + wqe = buf + i; + *wqe = cpu_to_be32(0xffffffff); + } + } +} + +static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_inline_seg *inl; + void *wqe; + int s; + + ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = sizeof(struct mlx4_wqe_ctrl_seg); + + if (qp->ibqp.qp_type == IB_QPT_UD) { + struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; + struct mlx4_av *av = (struct mlx4_av *)dgram->av; + memset(dgram, 0, sizeof *dgram); + av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); + s += sizeof(struct mlx4_wqe_datagram_seg); + } + + /* Pad the remainder of the WQE with an inline data segment. 
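+ * Its byte_count has bit 31 set (the inline-data flag), so the remaining space is accounted for as inline data rather than being parsed as scatter/gather entries.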
*/ + if (size > s) { + inl = wqe + s; + inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); + } + ctrl->srcrb_flags = 0; + ctrl->qpn_vlan.fence_size = size / 16; + /* + * Make sure descriptor is fully written before setting ownership bit + * (because HW can start executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | + (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + + stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); +} + +/* Post NOP WQE to prevent wrap-around in the middle of WR */ +static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) +{ + unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); + if (unlikely(s < qp->sq_max_wqes_per_wr)) { + post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); + ind += s; + } + return ind; +} + +static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + struct ib_event event; + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + + if (type == MLX4_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX4_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX4_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX4_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("Unexpected event type %d " + "on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + pr_warn_ratelimited("Unexpected event type %d on WQ 0x%06x. Events are not supported for WQs\n", + type, qp->qpn); +} + +static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) +{ + /* + * UD WQEs must have a datagram segment. + * RC and UC WQEs might have a remote address segment. + * MLX WQEs need two extra inline data segments (for the UD + * header and space for the ICRC). + */ + switch (type) { + case MLX4_IB_QPT_UD: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + + ((flags & MLX4_IB_QP_LSO) ? 
MLX4_IB_LSO_HEADER_SPARE : 0); + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + 64; + case MLX4_IB_QPT_TUN_SMI_OWNER: + case MLX4_IB_QPT_TUN_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg); + + case MLX4_IB_QPT_UC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case MLX4_IB_QPT_RC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_masked_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_UD_HEADER_SIZE + + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, + MLX4_INLINE_ALIGN) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) + + ALIGN(4 + + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)); + default: + return sizeof (struct mlx4_wqe_ctrl_seg); + } +} + +static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + int is_user, int has_rq, struct mlx4_ib_qp *qp, + u32 inl_recv_sz) +{ + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || + cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) + return -EINVAL; + + if (!has_rq) { + if (cap->max_recv_wr || inl_recv_sz) + return -EINVAL; + + qp->rq.wqe_cnt = qp->rq.max_gs = 0; + } else { + u32 max_inl_recv_sz = dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg); + u32 wqe_size; + + /* HW requires >= 1 RQ entry with >= 1 gather entry */ + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge || + inl_recv_sz > max_inl_recv_sz)) + return -EINVAL; + + qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); + qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); + wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg); + qp->rq.wqe_shift = ilog2(max_t(u32, wqe_size, inl_recv_sz)); + } + + /* leave userspace return values as they were, so as not to break ABI */ + if (is_user) { + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + } else { + cap->max_recv_wr = qp->rq.max_post = + min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); + cap->max_recv_sge = min(qp->rq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + } + + return 0; +} + +static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp, + bool shrink_wqe) +{ + int s; + + /* Sanity check SQ size before proceeding */ + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || + cap->max_inline_data + send_wqe_overhead(type, qp->flags) + + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * For MLX transport we need 2 extra S/G entries: + * one for the header and one for the checksum at the end + */ + if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI || + type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) && + cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) + return -EINVAL; + + s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), + cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + + send_wqe_overhead(type, qp->flags); + + if (s > 
dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * Hermon supports shrinking WQEs, such that a single work + * request can include multiple units of 1 << wqe_shift. This + * way, work requests can differ in size, and do not have to + * be a power of 2 in size, saving memory and speeding up send + * WR posting. Unfortunately, if we do this then the + * wqe_index field in CQEs can't be used to look up the WR ID + * anymore, so we do this only if selective signaling is off. + * + * Further, on 32-bit platforms, we can't use vmap() to make + * the QP buffer virtually contiguous. Thus we have to use + * constant-sized WRs to make sure a WR is always fully within + * a single page-sized chunk. + * + * Finally, we use NOP work requests to pad the end of the + * work queue, to avoid wrap-around in the middle of WR. We + * set NEC bit to avoid getting completions with error for + * these NOP WRs, but since NEC is only supported starting + * with firmware 2.2.232, we use constant-sized WRs for older + * firmware. + * + * And, since MLX QPs only support SEND, we use constant-sized + * WRs in this case. + * + * We look for the smallest value of wqe_shift such that the + * resulting number of wqes does not exceed device + * capabilities. + * + * We set WQE size to at least 64 bytes, this way stamping + * invalidates each WQE. + */ + if (shrink_wqe && dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && + qp->sq_signal_bits && BITS_PER_LONG == 64 && + type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && + !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) + qp->sq.wqe_shift = ilog2(64); + else + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); + + for (;;) { + qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); + + /* + * We need to leave 2 KB + 1 WR of headroom in the SQ to + * allow HW to prefetch. 
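+ * sq_spare_wqes below is exactly that headroom, expressed in basic blocks of 1 << wqe_shift bytes.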
+ */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * + qp->sq_max_wqes_per_wr + + qp->sq_spare_wqes); + + if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) + break; + + if (qp->sq_max_wqes_per_wr <= 1) + return -EINVAL; + + ++qp->sq.wqe_shift; + } + + qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz, + (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) - + send_wqe_overhead(type, qp->flags)) / + sizeof (struct mlx4_wqe_data_seg); + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + cap->max_send_wr = qp->sq.max_post = + (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; + cap->max_send_sge = min(qp->sq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + /* We don't support inline sends for kernel QPs (yet) */ + cap->max_inline_data = 0; + + return 0; +} + +static int set_user_sq_size(struct mlx4_ib_dev *dev, + struct mlx4_ib_qp *qp, + struct mlx4_ib_create_qp *ucmd) +{ + /* Sanity check SQ size before proceeding */ + if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes || + ucmd->log_sq_stride > + ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) || + ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) + return -EINVAL; + + qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; + qp->sq.wqe_shift = ucmd->log_sq_stride; + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + + return 0; +} + +static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + qp->sqp_proxy_rcv = + kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt, + GFP_KERNEL); + if (!qp->sqp_proxy_rcv) + return -ENOMEM; + for (i = 0; i < qp->rq.wqe_cnt; i++) { + qp->sqp_proxy_rcv[i].addr = + kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr), + GFP_KERNEL); + if (!qp->sqp_proxy_rcv[i].addr) + goto err; + qp->sqp_proxy_rcv[i].map = + ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map)) { + kfree(qp->sqp_proxy_rcv[i].addr); + goto err; + } + } + return 0; + +err: + while (i > 0) { + --i; + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); + qp->sqp_proxy_rcv = NULL; + return -ENOMEM; +} + +static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + for (i = 0; i < qp->rq.wqe_cnt; i++) { + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) + return 0; + + return !attr->srq; +} + +static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.spec_qps[i].qp0_proxy) + return !!dev->caps.spec_qps[i].qp0_qkey; + } + return 0; +} + +static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev, + struct mlx4_ib_qp *qp) +{ + mutex_lock(&dev->counters_table[qp->port - 1].mutex); + mlx4_counter_free(dev->dev, 
qp->counter_index->index); + list_del(&qp->counter_index->list); + mutex_unlock(&dev->counters_table[qp->port - 1].mutex); + + kfree(qp->counter_index); + qp->counter_index = NULL; +} + +static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, + struct ib_qp_init_attr *init_attr, + struct mlx4_ib_create_qp_rss *ucmd) +{ + rss_ctx->base_qpn_tbl_sz = init_attr->rwq_ind_tbl->ind_tbl[0]->wq_num | + (init_attr->rwq_ind_tbl->log_ind_tbl_size << 24); + + if ((ucmd->rx_hash_function == MLX4_IB_RX_HASH_FUNC_TOEPLITZ) && + (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) { + memcpy(rss_ctx->rss_key, ucmd->rx_hash_key, + MLX4_EN_RSS_KEY_SIZE); + } else { + pr_debug("RX Hash function is not supported\n"); + return (-EOPNOTSUPP); + } + + if (ucmd->rx_hash_fields_mask & ~(MLX4_IB_RX_HASH_SRC_IPV4 | + MLX4_IB_RX_HASH_DST_IPV4 | + MLX4_IB_RX_HASH_SRC_IPV6 | + MLX4_IB_RX_HASH_DST_IPV6 | + MLX4_IB_RX_HASH_SRC_PORT_TCP | + MLX4_IB_RX_HASH_DST_PORT_TCP | + MLX4_IB_RX_HASH_SRC_PORT_UDP | + MLX4_IB_RX_HASH_DST_PORT_UDP | + MLX4_IB_RX_HASH_INNER)) { + pr_debug("RX Hash fields_mask has unsupported mask (0x%llx)\n", + ucmd->rx_hash_fields_mask); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { + rss_ctx->flags = MLX4_RSS_IPV4; + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { + pr_debug("RX Hash fields_mask is not supported - both IPv4 SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) { + rss_ctx->flags |= MLX4_RSS_IPV6; + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) { + pr_debug("RX Hash fields_mask is not supported - both IPv6 SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) { + if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) { + pr_debug("RX Hash fields_mask for UDP is not supported\n"); + return (-EOPNOTSUPP); + } + + if (rss_ctx->flags & MLX4_RSS_IPV4) + rss_ctx->flags |= MLX4_RSS_UDP_IPV4; + if (rss_ctx->flags & MLX4_RSS_IPV6) + rss_ctx->flags |= MLX4_RSS_UDP_IPV6; + if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) { + pr_debug("RX Hash fields_mask is not supported - UDP must be set with IPv4 or IPv6\n"); + return (-EOPNOTSUPP); + } + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) { + pr_debug("RX Hash fields_mask is not supported - both UDP SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { + if (rss_ctx->flags & MLX4_RSS_IPV4) + rss_ctx->flags |= MLX4_RSS_TCP_IPV4; + if (rss_ctx->flags & MLX4_RSS_IPV6) + rss_ctx->flags |= MLX4_RSS_TCP_IPV6; + if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) { + pr_debug("RX Hash fields_mask is not supported - TCP must be set with IPv4 or IPv6\n"); + return (-EOPNOTSUPP); + } + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { + pr_debug("RX Hash fields_mask is not supported - both TCP 
SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_INNER) { + if (dev->dev->caps.tunnel_offload_mode == + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + /* + * Hash according to inner headers if exist, otherwise + * according to outer headers. + */ + rss_ctx->flags |= MLX4_RSS_BY_INNER_HEADERS_IPONLY; + } else { + pr_debug("RSS Hash for inner headers isn't supported\n"); + return (-EOPNOTSUPP); + } + } + + return 0; +} + +static int create_qp_rss(struct mlx4_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx4_ib_create_qp_rss *ucmd, + struct mlx4_ib_qp *qp) +{ + int qpn; + int err; + + qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS; + + err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, 0, qp->mqp.usage); + if (err) + return err; + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + if (err) + goto err_qpn; + + mutex_init(&qp->mutex); + + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_PACKET; + qp->state = IB_QPS_RESET; + + /* Set dummy send resources to be compatible with HV and PRM */ + qp->sq_no_prefetch = 1; + qp->sq.wqe_cnt = 1; + qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE; + qp->buf_size = qp->sq.wqe_cnt << MLX4_IB_MIN_SQ_STRIDE; + qp->mtt = (to_mqp( + (struct ib_qp *)init_attr->rwq_ind_tbl->ind_tbl[0]))->mtt; + + qp->rss_ctx = kzalloc(sizeof(*qp->rss_ctx), GFP_KERNEL); + if (!qp->rss_ctx) { + err = -ENOMEM; + goto err_qp_alloc; + } + + err = set_qp_rss(dev, qp->rss_ctx, init_attr, ucmd); + if (err) + goto err; + + return 0; + +err: + kfree(qp->rss_ctx); + +err_qp_alloc: + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + +err_qpn: + mlx4_qp_release_range(dev->dev, qpn, 1); + return err; +} + +static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp; + struct mlx4_ib_create_qp_rss ucmd = {}; + size_t required_cmd_sz; + int err; + + if (!udata) { + pr_debug("RSS QP with NULL udata\n"); + return ERR_PTR(-EINVAL); + } + + if (udata->outlen) + return ERR_PTR(-EOPNOTSUPP); + + required_cmd_sz = offsetof(typeof(ucmd), reserved1) + + sizeof(ucmd.reserved1); + if (udata->inlen < required_cmd_sz) { + pr_debug("invalid inlen\n"); + return ERR_PTR(-EINVAL); + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + pr_debug("copy failed\n"); + return ERR_PTR(-EFAULT); + } + + if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved))) + return ERR_PTR(-EOPNOTSUPP); + + if (ucmd.comp_mask || ucmd.reserved1) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + pr_debug("inlen is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + pr_debug("RSS QP with unsupported QP type %d\n", + init_attr->qp_type); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->create_flags) { + pr_debug("RSS QP doesn't support create flags\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->send_cq || init_attr->cap.max_send_wr) { + pr_debug("RSS QP with unsupported send attributes\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + + err = create_qp_rss(to_mdev(pd->device), init_attr, &ucmd, qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibqp.qp_num = qp->mqp.qpn; + + return 
&qp->ibqp; +} + +/* + * This function allocates a WQN from a range which is consecutive and aligned + * to its size. In case the range is full, then it creates a new range and + * allocates WQN from it. The new range will be used for following allocations. + */ +static int mlx4_ib_alloc_wqn(struct mlx4_ib_ucontext *context, + struct mlx4_ib_qp *qp, int range_size, int *wqn) +{ + struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device); + struct mlx4_wqn_range *range; + int err = 0; + + mutex_lock(&context->wqn_ranges_mutex); + + range = list_first_entry_or_null(&context->wqn_ranges_list, + struct mlx4_wqn_range, list); + + if (!range || (range->refcount == range->size) || range->dirty) { + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + err = -ENOMEM; + goto out; + } + + err = mlx4_qp_reserve_range(dev->dev, range_size, + range_size, &range->base_wqn, 0, + qp->mqp.usage); + if (err) { + kfree(range); + goto out; + } + + range->size = range_size; + list_add(&range->list, &context->wqn_ranges_list); + } else if (range_size != 1) { + /* + * Requesting a new range (>1) when last range is still open, is + * not valid. + */ + err = -EINVAL; + goto out; + } + + qp->wqn_range = range; + + *wqn = range->base_wqn + range->refcount; + + range->refcount++; + +out: + mutex_unlock(&context->wqn_ranges_mutex); + + return err; +} + +static void mlx4_ib_release_wqn(struct mlx4_ib_ucontext *context, + struct mlx4_ib_qp *qp, bool dirty_release) +{ + struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device); + struct mlx4_wqn_range *range; + + mutex_lock(&context->wqn_ranges_mutex); + + range = qp->wqn_range; + + range->refcount--; + if (!range->refcount) { + mlx4_qp_release_range(dev->dev, range->base_wqn, + range->size); + list_del(&range->list); + kfree(range); + } else if (dirty_release) { + /* + * A range which one of its WQNs is destroyed, won't be able to be + * reused for further WQN allocations. + * The next created WQ will allocate a new range. 
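+ * The range itself is only released back to the device once its refcount drops to zero.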
+ */ + range->dirty = 1; + } + + mutex_unlock(&context->wqn_ranges_mutex); +} + +static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + enum mlx4_ib_source_type src, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int sqpn, + struct mlx4_ib_qp **caller_qp) +{ + int qpn; + int err; + struct ib_qp_cap backup_cap; + struct mlx4_ib_sqp *sqp = NULL; + struct mlx4_ib_qp *qp; + enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; + struct mlx4_ib_cq *mcq; + unsigned long flags; + int range_size = 0; + + /* When tunneling special qps, we use a plain UD qp */ + if (sqpn) { + if (mlx4_is_mfunc(dev->dev) && + (!mlx4_is_master(dev->dev) || + !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) { + if (init_attr->qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_PROXY_GSI; + else { + if (mlx4_is_master(dev->dev) || + qp0_enabled_vf(dev->dev, sqpn)) + qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_PROXY_SMI; + } + } + qpn = sqpn; + /* add extra sg entry for tunneling */ + init_attr->cap.max_recv_sge++; + } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) { + struct mlx4_ib_qp_tunnel_init_attr *tnl_init = + container_of(init_attr, + struct mlx4_ib_qp_tunnel_init_attr, init_attr); + if ((tnl_init->proxy_qp_type != IB_QPT_SMI && + tnl_init->proxy_qp_type != IB_QPT_GSI) || + !mlx4_is_master(dev->dev)) + return -EINVAL; + if (tnl_init->proxy_qp_type == IB_QPT_GSI) + qp_type = MLX4_IB_QPT_TUN_GSI; + else if (tnl_init->slave == mlx4_master_func_num(dev->dev) || + mlx4_vf_smi_enabled(dev->dev, tnl_init->slave, + tnl_init->port)) + qp_type = MLX4_IB_QPT_TUN_SMI_OWNER; + else + qp_type = MLX4_IB_QPT_TUN_SMI; + /* we are definitely in the PPF here, since we are creating + * tunnel QPs. base_tunnel_sqpn is therefore valid. */ + qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave + + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1; + sqpn = qpn; + } + + if (!*caller_qp) { + if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || + (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { + sqp = kzalloc(sizeof(struct mlx4_ib_sqp), GFP_KERNEL); + if (!sqp) + return -ENOMEM; + qp = &sqp->qp; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + } else { + qp = kzalloc(sizeof(struct mlx4_ib_qp), GFP_KERNEL); + if (!qp) + return -ENOMEM; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + } + } else + qp = *caller_qp; + + qp->mlx4_ib_qp_type = qp_type; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->state = IB_QPS_RESET; + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + + if (pd->uobject) { + union { + struct mlx4_ib_create_qp qp; + struct mlx4_ib_create_wq wq; + } ucmd; + size_t copy_len; + int shift; + int n; + + copy_len = (src == MLX4_IB_QP_SRC) ? 
+ sizeof(struct mlx4_ib_create_qp) : + min(sizeof(struct mlx4_ib_create_wq), udata->inlen); + + if (ib_copy_from_udata(&ucmd, udata, copy_len)) { + err = -EFAULT; + goto err; + } + + if (src == MLX4_IB_RWQ_SRC) { + if (ucmd.wq.comp_mask || ucmd.wq.reserved[0] || + ucmd.wq.reserved[1] || ucmd.wq.reserved[2]) { + pr_debug("user command isn't supported\n"); + err = -EOPNOTSUPP; + goto err; + } + + if (ucmd.wq.log_range_size > + ilog2(dev->dev->caps.max_rss_tbl_sz)) { + pr_debug("WQN range size must be equal or smaller than %d\n", + dev->dev->caps.max_rss_tbl_sz); + err = -EOPNOTSUPP; + goto err; + } + range_size = 1 << ucmd.wq.log_range_size; + } else { + qp->inl_recv_sz = ucmd.qp.inl_recv_sz; + } + + if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { + if (!(dev->dev->caps.flags & + MLX4_DEV_CAP_FLAG_FCS_KEEP)) { + pr_debug("scatter FCS is unsupported\n"); + err = -EOPNOTSUPP; + goto err; + } + + qp->flags |= MLX4_IB_QP_SCATTER_FCS; + } + + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, qp->inl_recv_sz); + if (err) + goto err; + + if (src == MLX4_IB_QP_SRC) { + qp->sq_no_prefetch = ucmd.qp.sq_no_prefetch; + + err = set_user_sq_size(dev, qp, + (struct mlx4_ib_create_qp *) + &ucmd); + if (err) + goto err; + } else { + qp->sq_no_prefetch = 1; + qp->sq.wqe_cnt = 1; + qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE; + /* Allocated buffer expects to have at least that SQ + * size. + */ + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + } + + qp->umem = ib_umem_get(pd->uobject->context, + (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr : + ucmd.wq.buf_addr, qp->buf_size, 0, 0); + if (IS_ERR(qp->umem)) { + err = PTR_ERR(qp->umem); + goto err; + } + + n = ib_umem_page_count(qp->umem); + shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n); + err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); + + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem); + if (err) + goto err_mtt; + + if (qp_has_rq(init_attr)) { + err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + (src == MLX4_IB_QP_SRC) ? 
ucmd.qp.db_addr : + ucmd.wq.db_addr, &qp->db); + if (err) + goto err_mtt; + } + qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS; + } else { + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, 0); + if (err) + goto err; + + qp->sq_no_prefetch = 0; + + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + qp->flags |= MLX4_IB_QP_LSO; + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (dev->steering_support == + MLX4_STEERING_MODE_DEVICE_MANAGED) + qp->flags |= MLX4_IB_QP_NETIF; + else + goto err; + } + + memcpy(&backup_cap, &init_attr->cap, sizeof(backup_cap)); + err = set_kernel_sq_size(dev, &init_attr->cap, + qp_type, qp, true); + if (err) + goto err; + + if (qp_has_rq(init_attr)) { + err = mlx4_db_alloc(dev->dev, &qp->db, 0); + if (err) + goto err; + + *qp->db.db = 0; + } + + if (mlx4_buf_alloc(dev->dev, qp->buf_size, qp->buf_size, + &qp->buf)) { + memcpy(&init_attr->cap, &backup_cap, + sizeof(backup_cap)); + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, + qp, false); + if (err) + goto err_db; + + if (mlx4_buf_alloc(dev->dev, qp->buf_size, + PAGE_SIZE * 2, &qp->buf)) { + err = -ENOMEM; + goto err_db; + } + } + + err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, + &qp->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); + if (err) + goto err_mtt; + + qp->sq.wrid = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(u64), GFP_KERNEL); + qp->rq.wrid = kvmalloc_array(qp->rq.wqe_cnt, + sizeof(u64), GFP_KERNEL); + if (!qp->sq.wrid || !qp->rq.wrid) { + err = -ENOMEM; + goto err_wrid; + } + qp->mqp.usage = MLX4_RES_USAGE_DRIVER; + } + + if (sqpn) { + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + if (alloc_proxy_bufs(pd->device, qp)) { + err = -ENOMEM; + goto err_wrid; + } + } + } else if (src == MLX4_IB_RWQ_SRC) { + err = mlx4_ib_alloc_wqn(to_mucontext(pd->uobject->context), qp, + range_size, &qpn); + if (err) + goto err_wrid; + } else { + /* Raw packet QPNs may not have bits 6,7 set in their qp_num; + * otherwise, the WQE BlueFlame setup flow wrongly causes + * VLAN insertion. */ + if (init_attr->qp_type == IB_QPT_RAW_PACKET) + err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, + (init_attr->cap.max_send_wr ? + MLX4_RESERVE_ETH_BF_QP : 0) | + (init_attr->cap.max_recv_wr ? + MLX4_RESERVE_A0_QP : 0), + qp->mqp.usage); + else + if (qp->flags & MLX4_IB_QP_NETIF) + err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); + else + err = mlx4_qp_reserve_range(dev->dev, 1, 1, + &qpn, 0, qp->mqp.usage); + if (err) + goto err_proxy; + } + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + if (err) + goto err_qpn; + + if (init_attr->qp_type == IB_QPT_XRC_TGT) + qp->mqp.qpn |= (1 << 23); + + /* + * Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = (src == MLX4_IB_QP_SRC) ? 
mlx4_ib_qp_event : + mlx4_ib_wq_event; + + if (!*caller_qp) + *caller_qp = qp; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + /* Maintain device to QPs access, needed for further handling + * via reset flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling + * via reset flow + */ + mcq = to_mcq(init_attr->send_cq); + list_add_tail(&qp->cq_send_list, &mcq->send_qp_list); + mcq = to_mcq(init_attr->recv_cq); + list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list); + mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + return 0; + +err_qpn: + if (!sqpn) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qpn, 1); + else if (src == MLX4_IB_RWQ_SRC) + mlx4_ib_release_wqn(to_mucontext(pd->uobject->context), + qp, 0); + else + mlx4_qp_release_range(dev->dev, qpn, 1); + } +err_proxy: + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + free_proxy_bufs(pd->device, qp); +err_wrid: + if (pd->uobject) { + if (qp_has_rq(init_attr)) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); + } else { + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); + } + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(qp->umem); + else + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + +err_db: + if (!pd->uobject && qp_has_rq(init_attr)) + mlx4_db_free(dev->dev, &qp->db); + +err: + if (sqp) + kfree(sqp); + else if (!*caller_qp) + kfree(qp); + return err; +} + +static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX4_QP_STATE_RST; + case IB_QPS_INIT: return MLX4_QP_STATE_INIT; + case IB_QPS_RTR: return MLX4_QP_STATE_RTR; + case IB_QPS_RTS: return MLX4_QP_STATE_RTS; + case IB_QPS_SQD: return MLX4_QP_STATE_SQD; + case IB_QPS_SQE: return MLX4_QP_STATE_SQER; + case IB_QPS_ERR: return MLX4_QP_STATE_ERR; + default: return -1; + } +} + +static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } +} + +static void del_gid_entries(struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + list_del(&ge->list); + kfree(ge); + } +} + +static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_XRC_TGT) + return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd); + else + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src, + struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) +{ + 
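/* XRC targets use the XRCD's CQ for both directions, XRC initiators reuse their send CQ, and RWQ-sourced QPs use the WQ's CQ for both. */ +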
switch (qp->ibqp.qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq); + *recv_cq = *send_cq; + break; + case IB_QPT_XRC_INI: + *send_cq = to_mcq(qp->ibqp.send_cq); + *recv_cq = *send_cq; + break; + default: + *recv_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.recv_cq) : + to_mcq(qp->ibwq.cq); + *send_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.send_cq) : + *recv_cq; + break; + } +} + +static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (qp->state != IB_QPS_RESET) { + int i; + + for (i = 0; i < (1 << qp->ibqp.rwq_ind_tbl->log_ind_tbl_size); + i++) { + struct ib_wq *ibwq = qp->ibqp.rwq_ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } + + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) + pr_warn("modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + } + + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + del_gid_entries(qp); + kfree(qp->rss_ctx); +} + +static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + enum mlx4_ib_source_type src, int is_user) +{ + struct mlx4_ib_cq *send_cq, *recv_cq; + unsigned long flags; + + if (qp->state != IB_QPS_RESET) { + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) + pr_warn("modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + qp->pri.smac_port = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } + + get_cqs(qp, src, &send_cq, &recv_cq); + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx4_ib_lock_cqs(send_cq, recv_cq); + + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + list_del(&qp->cq_send_list); + list_del(&qp->cq_recv_list); + if (!is_user) { + __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL); + if (send_cq != recv_cq) + __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + } + + mlx4_qp_remove(dev->dev, &qp->mqp); + + mlx4_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + mlx4_qp_free(dev->dev, &qp->mqp); + + if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); + else if (src == MLX4_IB_RWQ_SRC) + mlx4_ib_release_wqn(to_mucontext( + qp->ibwq.uobject->context), qp, 1); + else + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + } + + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + + if (is_user) { + if (qp->rq.wqe_cnt) { + struct mlx4_ib_ucontext *mcontext = !src ? 
+ to_mucontext(qp->ibqp.uobject->context) : + to_mucontext(qp->ibwq.uobject->context); + mlx4_ib_db_unmap_user(mcontext, &qp->db); + } + ib_umem_release(qp->umem); + } else { + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) + free_proxy_bufs(&dev->ib_dev, qp); + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + if (qp->rq.wqe_cnt) + mlx4_db_free(dev->dev, &qp->db); + } + + del_gid_entries(qp); +} + +static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) +{ + /* Native or PPF */ + if (!mlx4_is_mfunc(dev->dev) || + (mlx4_is_master(dev->dev) && + attr->create_flags & MLX4_IB_SRIOV_SQP)) { + return dev->dev->phys_caps.base_sqpn + + (attr->qp_type == IB_QPT_SMI ? 0 : 2) + + attr->port_num - 1; + } + /* PF or VF -- creating proxies */ + if (attr->qp_type == IB_QPT_SMI) + return dev->dev->caps.spec_qps[attr->port_num - 1].qp0_proxy; + else + return dev->dev->caps.spec_qps[attr->port_num - 1].qp1_proxy; +} + +static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp = NULL; + int err; + int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + u16 xrcdn = 0; + + if (init_attr->rwq_ind_tbl) + return _mlx4_ib_create_qp_rss(pd, init_attr, udata); + + /* + * We only support LSO, vendor flag1, and multicast loopback blocking, + * and only for kernel UD QPs. + */ + if (init_attr->create_flags & ~(MLX4_IB_QP_LSO | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | + MLX4_IB_SRIOV_TUNNEL_QP | + MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_ROCE_V2_GSI)) + return ERR_PTR(-EINVAL); + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (init_attr->qp_type != IB_QPT_UD) + return ERR_PTR(-EINVAL); + } + + if (init_attr->create_flags) { + if (udata && init_attr->create_flags & ~(sup_u_create_flags)) + return ERR_PTR(-EINVAL); + + if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_CREATE_ROCE_V2_GSI | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) && + init_attr->qp_type != IB_QPT_UD) || + (init_attr->create_flags & MLX4_IB_SRIOV_SQP && + init_attr->qp_type > IB_QPT_GSI) || + (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI && + init_attr->qp_type != IB_QPT_GSI)) + return ERR_PTR(-EINVAL); + } + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + pd = to_mxrcd(init_attr->xrcd)->pd; + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq; + /* fall through */ + case IB_QPT_XRC_INI: + if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + init_attr->recv_cq = init_attr->send_cq; + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_RAW_PACKET: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + /* fall through */ + case IB_QPT_UD: + { + err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC, + init_attr, udata, 0, &qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibqp.qp_num = qp->mqp.qpn; + qp->xrcdn = xrcdn; + + break; + } + case IB_QPT_SMI: + case IB_QPT_GSI: + { + int sqpn; + + /* Userspace is not allowed to create special QPs: */ + if (udata) + return ERR_PTR(-EINVAL); + if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) { + int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, + 1, 1, &sqpn, 0, + MLX4_RES_USAGE_DRIVER); + + if (res) + 
return ERR_PTR(res); + } else { + sqpn = get_sqp_num(to_mdev(pd->device), init_attr); + } + + err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC, + init_attr, udata, sqpn, &qp); + if (err) + return ERR_PTR(err); + + qp->port = init_attr->port_num; + qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : + init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1; + break; + } + default: + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + return &qp->ibqp; +} + +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) { + struct ib_device *device = pd ? pd->device : init_attr->xrcd->device; + struct ib_qp *ibqp; + struct mlx4_ib_dev *dev = to_mdev(device); + + ibqp = _mlx4_ib_create_qp(pd, init_attr, udata); + + if (!IS_ERR(ibqp) && + (init_attr->qp_type == IB_QPT_GSI) && + !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) { + struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp))); + int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num); + + if (is_eth && + dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI; + sqp->roce_v2_gsi = ib_create_qp(pd, init_attr); + + if (IS_ERR(sqp->roce_v2_gsi)) { + pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi)); + sqp->roce_v2_gsi = NULL; + } else { + sqp = to_msqp(to_mqp(sqp->roce_v2_gsi)); + sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP; + } + + init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI; + } + } + return ibqp; +} + +static int _mlx4_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx4_ib_dev *dev = to_mdev(qp->device); + struct mlx4_ib_qp *mqp = to_mqp(qp); + + if (is_qp0(dev, mqp)) + mlx4_CLOSE_PORT(dev->dev, mqp->port); + + if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI && + dev->qp1_proxy[mqp->port - 1] == mqp) { + mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]); + dev->qp1_proxy[mqp->port - 1] = NULL; + mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]); + } + + if (mqp->counter_index) + mlx4_ib_free_qp_counter(dev, mqp); + + if (qp->rwq_ind_tbl) { + destroy_qp_rss(dev, mqp); + } else { + struct mlx4_ib_pd *pd; + + pd = get_pd(mqp); + destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject); + } + + if (is_sqp(dev, mqp)) + kfree(to_msqp(mqp)); + else + kfree(mqp); + + return 0; +} + +int mlx4_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx4_ib_qp *mqp = to_mqp(qp); + + if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(mqp); + + if (sqp->roce_v2_gsi) + ib_destroy_qp(sqp->roce_v2_gsi); + } + + return _mlx4_ib_destroy_qp(qp); +} + +static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) +{ + switch (type) { + case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC; + case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC; + case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD; + case MLX4_IB_QPT_XRC_INI: + case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC; + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX; + + case MLX4_IB_QPT_PROXY_SMI_OWNER: + case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_MLX : -1); + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_PROXY_GSI: + case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ? 
+ MLX4_QP_ST_UD : -1); + default: return -1; + } +} + +static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + u32 hw_access_flags = 0; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX4_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= MLX4_QP_BIT_RAE; + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX4_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr, + int attr_mask) +{ + if (attr_mask & IB_QP_PKEY_INDEX) + sqp->pkey_index = attr->pkey_index; + if (attr_mask & IB_QP_QKEY) + sqp->qkey = attr->qkey; + if (attr_mask & IB_QP_SQ_PSN) + sqp->send_psn = attr->sq_psn; +} + +static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) +{ + path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6); +} + +static int _mlx4_set_path(struct mlx4_ib_dev *dev, + const struct rdma_ah_attr *ah, + u64 smac, u16 vlan_tag, struct mlx4_qp_path *path, + struct mlx4_roce_smac_vlan_info *smac_info, u8 port) +{ + int vidx; + int smac_index; + int err; + + path->grh_mylmc = rdma_ah_get_path_bits(ah) & 0x7f; + path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah)); + if (rdma_ah_get_static_rate(ah)) { + path->static_rate = rdma_ah_get_static_rate(ah) + + MLX4_STAT_RATE_OFFSET; + while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && + !(1 << path->static_rate & dev->dev->caps.stat_rate_support)) + --path->static_rate; + } else + path->static_rate = 0; + + if (rdma_ah_get_ah_flags(ah) & IB_AH_GRH) { + const struct ib_global_route *grh = rdma_ah_read_grh(ah); + int real_sgid_index = + mlx4_ib_gid_index_to_real_index(dev, port, + grh->sgid_index); + + if (real_sgid_index < 0) + return real_sgid_index; + if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) { + pr_err("sgid_index (%u) too large. max is %d\n", + real_sgid_index, dev->dev->caps.gid_table_len[port] - 1); + return -1; + } + + path->grh_mylmc |= 1 << 7; + path->mgid_index = real_sgid_index; + path->hop_limit = grh->hop_limit; + path->tclass_flowlabel = + cpu_to_be32((grh->traffic_class << 20) | + (grh->flow_label)); + memcpy(path->rgid, grh->dgid.raw, 16); + } + + if (ah->type == RDMA_AH_ATTR_TYPE_ROCE) { + if (!(rdma_ah_get_ah_flags(ah) & IB_AH_GRH)) + return -1; + + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((rdma_ah_get_sl(ah) & 7) << 3); + + path->feup |= MLX4_FEUP_FORCE_ETH_UP; + if (vlan_tag < 0x1000) { + if (smac_info->vid < 0x1000) { + /* both valid vlan ids */ + if (smac_info->vid != vlan_tag) { + /* different VIDs. 
unreg old and reg new */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } else { + path->vlan_index = smac_info->vlan_index; + } + } else { + /* no current vlan tag in qp */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } + path->feup |= MLX4_FVL_FORCE_ETH_VLAN; + path->fl = 1 << 6; + } else { + /* have current vlan tag. unregister it at modify-qp success */ + if (smac_info->vid < 0x1000) { + smac_info->candidate_vid = 0xFFFF; + smac_info->update_vid = 1; + } + } + + /* get smac_index for RoCE use. + * If no smac was yet assigned, register one. + * If one was already assigned, but the new mac differs, + * unregister the old one and register the new one. + */ + if ((!smac_info->smac && !smac_info->smac_port) || + smac_info->smac != smac) { + /* register candidate now, unreg if needed, after success */ + smac_index = mlx4_register_mac(dev->dev, port, smac); + if (smac_index >= 0) { + smac_info->candidate_smac_index = smac_index; + smac_info->candidate_smac = smac; + smac_info->candidate_smac_port = port; + } else { + return -EINVAL; + } + } else { + smac_index = smac_info->smac_index; + } + memcpy(path->dmac, ah->roce.dmac, 6); + path->ackto = MLX4_IB_LINK_TYPE_ETH; + /* put MAC table smac index for IBoE */ + path->grh_mylmc = (u8) (smac_index) | 0x80; + } else { + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((rdma_ah_get_sl(ah) & 0xf) << 2); + } + + return 0; +} + +static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port, + u16 vlan_id, u8 *smac) +{ + return _mlx4_set_path(dev, &qp->ah_attr, + mlx4_mac_to_u64(smac), + vlan_id, + path, &mqp->pri, port); +} + +static int mlx4_set_alt_path(struct mlx4_ib_dev *dev, + const struct ib_qp_attr *qp, + enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, + struct mlx4_qp_path *path, u8 port) +{ + return _mlx4_set_path(dev, &qp->alt_ah_attr, + 0, + 0xffff, + path, &mqp->alt, port); +} + +static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) { + ge->added = 1; + ge->port = qp->port; + } + } +} + +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, + struct mlx4_ib_qp *qp, + struct mlx4_qp_context *context) +{ + u64 u64_mac; + int smac_index; + + u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]); + + context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); + if (!qp->pri.smac && !qp->pri.smac_port) { + smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); + if (smac_index >= 0) { + qp->pri.candidate_smac_index = smac_index; + qp->pri.candidate_smac = u64_mac; + qp->pri.candidate_smac_port = qp->port; + context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; + } else { + return -ENOENT; + } + } + return 0; +} + +static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + struct counter_index *new_counter_index; + int err; 
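+ /* A counter is allocated here only for Ethernet ports on QPs that block multicast loopback, and only if the device supports loopback source checking; otherwise there is nothing to do. */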
+ u32 tmp_idx;
+
+ if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) !=
+ IB_LINK_LAYER_ETHERNET ||
+ !(qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) ||
+ !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK))
+ return 0;
+
+ err = mlx4_counter_alloc(dev->dev, &tmp_idx, MLX4_RES_USAGE_DRIVER);
+ if (err)
+ return err;
+
+ new_counter_index = kmalloc(sizeof(*new_counter_index), GFP_KERNEL);
+ if (!new_counter_index) {
+ mlx4_counter_free(dev->dev, tmp_idx);
+ return -ENOMEM;
+ }
+
+ new_counter_index->index = tmp_idx;
+ new_counter_index->allocated = 1;
+ qp->counter_index = new_counter_index;
+
+ mutex_lock(&dev->counters_table[qp->port - 1].mutex);
+ list_add_tail(&new_counter_index->list,
+ &dev->counters_table[qp->port - 1].counters_list);
+ mutex_unlock(&dev->counters_table[qp->port - 1].mutex);
+
+ return 0;
+}
+
+enum {
+ MLX4_QPC_ROCE_MODE_1 = 0,
+ MLX4_QPC_ROCE_MODE_2 = 2,
+ MLX4_QPC_ROCE_MODE_UNDEFINED = 0xff
+};
+
+static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
+{
+ switch (gid_type) {
+ case IB_GID_TYPE_ROCE:
+ return MLX4_QPC_ROCE_MODE_1;
+ case IB_GID_TYPE_ROCE_UDP_ENCAP:
+ return MLX4_QPC_ROCE_MODE_2;
+ default:
+ return MLX4_QPC_ROCE_MODE_UNDEFINED;
+ }
+}
+
+/*
+ * Go over all of the RSS QP's child WQs and apply their HW state according
+ * to their logical state, if the RSS QP is the first RSS QP associated with
+ * the WQ.
+ */
+static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
+{
+ int err = 0;
+ int i;
+
+ for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
+ struct ib_wq *ibwq = ind_tbl->ind_tbl[i];
+ struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
+
+ mutex_lock(&wq->mutex);
+
+ /* mlx4_ib restriction:
+ * A WQ is associated with a port according to the RSS QP it is
+ * associated with.
+ * If the WQ is already associated with a different port by another
+ * RSS QP, return a failure.
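+ * The WQ is moved to HW RDY only by the first RSS QP that references
+ * it (rss_usecnt == 0) and only if its logical state is already RDY;
+ * later RSS QPs just increment rss_usecnt.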
+ */ + if ((wq->rss_usecnt > 0) && (wq->port != port_num)) { + err = -EINVAL; + mutex_unlock(&wq->mutex); + break; + } + wq->port = port_num; + if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) { + err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY); + if (err) { + mutex_unlock(&wq->mutex); + break; + } + } + wq->rss_usecnt++; + + mutex_unlock(&wq->mutex); + } + + if (i && err) { + int j; + + for (j = (i - 1); j >= 0; j--) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[j]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + if ((wq->rss_usecnt == 1) && + (ibwq->state == IB_WQS_RDY)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + pr_warn("failed to reverse WQN=0x%06x\n", + ibwq->wq_num); + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } + } + + return err; +} + +static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl) +{ + int i; + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + pr_warn("failed to reverse WQN=%x\n", + ibwq->wq_num); + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } +} + +static void fill_qp_rss_context(struct mlx4_qp_context *context, + struct mlx4_ib_qp *qp) +{ + struct mlx4_rss_context *rss_context; + + rss_context = (void *)context + offsetof(struct mlx4_qp_context, + pri_path) + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH; + + rss_context->base_qpn = cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz); + rss_context->default_qpn = + cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz & 0xffffff); + if (qp->rss_ctx->flags & (MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6)) + rss_context->base_qpn_udp = rss_context->default_qpn; + rss_context->flags = qp->rss_ctx->flags; + /* Currently support just toeplitz */ + rss_context->hash_fn = MLX4_RSS_HASH_TOP; + + memcpy(rss_context->rss_key, qp->rss_ctx->rss_key, + MLX4_EN_RSS_KEY_SIZE); +} + +static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct ib_uobject *ibuobject; + struct ib_srq *ibsrq; + struct ib_rwq_ind_table *rwq_ind_tbl; + enum ib_qp_type qp_type; + struct mlx4_ib_dev *dev; + struct mlx4_ib_qp *qp; + struct mlx4_ib_pd *pd; + struct mlx4_ib_cq *send_cq, *recv_cq; + struct mlx4_qp_context *context; + enum mlx4_qp_optpar optpar = 0; + int sqd_event; + int steer_qp = 0; + int err = -EINVAL; + int counter_index; + + if (src_type == MLX4_IB_RWQ_SRC) { + struct ib_wq *ibwq; + + ibwq = (struct ib_wq *)src; + ibuobject = ibwq->uobject; + ibsrq = NULL; + rwq_ind_tbl = NULL; + qp_type = IB_QPT_RAW_PACKET; + qp = to_mqp((struct ib_qp *)ibwq); + dev = to_mdev(ibwq->device); + pd = to_mpd(ibwq->pd); + } else { + struct ib_qp *ibqp; + + ibqp = (struct ib_qp *)src; + ibuobject = ibqp->uobject; + ibsrq = ibqp->srq; + rwq_ind_tbl = ibqp->rwq_ind_tbl; + qp_type = ibqp->qp_type; + qp = to_mqp(ibqp); + dev = to_mdev(ibqp->device); + pd = get_pd(qp); + } + + /* APM is not supported under RoCE */ + if (attr_mask & IB_QP_ALT_PATH && + rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) + return -ENOTSUPP; + + context = kzalloc(sizeof *context, GFP_KERNEL); + if (!context) + return -ENOMEM; + + context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | + (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); + + if (!(attr_mask & 
IB_QP_PATH_MIG_STATE)) + context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); + else { + optpar |= MLX4_QP_OPTPAR_PM_STATE; + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11); + break; + } + } + + if (qp->inl_recv_sz) + context->param3 |= cpu_to_be32(1 << 25); + + if (qp->flags & MLX4_IB_QP_SCATTER_FCS) + context->param3 |= cpu_to_be32(1 << 29); + + if (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI) + context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; + else if (qp_type == IB_QPT_RAW_PACKET) + context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX; + else if (qp_type == IB_QPT_UD) { + if (qp->flags & MLX4_IB_QP_LSO) + context->mtu_msgmax = (IB_MTU_4096 << 5) | + ilog2(dev->dev->caps.max_gso_sz); + else + context->mtu_msgmax = (IB_MTU_4096 << 5) | 13; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { + pr_err("path MTU (%u) is invalid\n", + attr->path_mtu); + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | + ilog2(dev->dev->caps.max_msg_sz); + } + + if (!rwq_ind_tbl) { /* PRM RSS receive side should be left zeros */ + if (qp->rq.wqe_cnt) + context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; + context->rq_size_stride |= qp->rq.wqe_shift - 4; + } + + if (qp->sq.wqe_cnt) + context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; + context->sq_size_stride |= qp->sq.wqe_shift - 4; + + if (new_state == IB_QPS_RESET && qp->counter_index) + mlx4_ib_free_qp_counter(dev, qp); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + context->sq_size_stride |= !!qp->sq_no_prefetch << 7; + context->xrcd = cpu_to_be32((u32) qp->xrcdn); + if (qp_type == IB_QPT_RAW_PACKET) + context->param3 |= cpu_to_be32(1 << 30); + } + + if (ibuobject) + context->usr_page = cpu_to_be32( + mlx4_to_hw_uar_index(dev->dev, + to_mucontext(ibuobject->context) + ->uar.index)); + else + context->usr_page = cpu_to_be32( + mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index)); + + if (attr_mask & IB_QP_DEST_QPN) + context->remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PORT) { + if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD && + !(attr_mask & IB_QP_AV)) { + mlx4_set_sched(&context->pri_path, attr->port_num); + optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE; + } + } + + if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + err = create_qp_lb_counter(dev, qp); + if (err) + goto out; + + counter_index = + dev->counters_table[qp->port - 1].default_counter; + if (qp->counter_index) + counter_index = qp->counter_index->index; + + if (counter_index != -1) { + context->pri_path.counter_index = counter_index; + optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; + if (qp->counter_index) { + context->pri_path.fl |= + MLX4_FL_ETH_SRC_CHECK_MC_LB; + context->pri_path.vlan_control |= + MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER; + } + } else + context->pri_path.counter_index = + MLX4_SINK_COUNTER_INDEX(dev->dev); + + if (qp->flags & MLX4_IB_QP_NETIF) { + mlx4_ib_steer_qp_reg(dev, qp, 1); + steer_qp = 1; + } + + if (qp_type == IB_QPT_GSI) { + enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ? 
+ IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE; + u8 qpc_roce_mode = gid_type_to_qpc(gid_type); + + context->rlkey_roce_mode |= (qpc_roce_mode << 6); + } + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.disable_pkey_check = 0x40; + context->pri_path.pkey_index = attr->pkey_index; + optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; + } + + if (attr_mask & IB_QP_AV) { + u8 port_num = mlx4_is_bonded(dev->dev) ? 1 : + attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + union ib_gid gid; + struct ib_gid_attr gid_attr = {.gid_type = IB_GID_TYPE_IB}; + u16 vlan = 0xffff; + u8 smac[ETH_ALEN]; + int status = 0; + int is_eth = + rdma_cap_eth_ah(&dev->ib_dev, port_num) && + rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH; + + if (is_eth) { + int index = + rdma_ah_read_grh(&attr->ah_attr)->sgid_index; + + status = ib_get_cached_gid(&dev->ib_dev, port_num, + index, &gid, &gid_attr); + if (!status) { + vlan = rdma_vlan_dev_vlan_id(gid_attr.ndev); + memcpy(smac, gid_attr.ndev->dev_addr, ETH_ALEN); + dev_put(gid_attr.ndev); + } + } + if (status) + goto out; + + if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path, + port_num, vlan, smac)) + goto out; + + optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX4_QP_OPTPAR_SCHED_QUEUE); + + if (is_eth && + (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) { + u8 qpc_roce_mode = gid_type_to_qpc(gid_attr.gid_type); + + if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) { + err = -EINVAL; + goto out; + } + context->rlkey_roce_mode |= (qpc_roce_mode << 6); + } + + } + + if (attr_mask & IB_QP_TIMEOUT) { + context->pri_path.ackto |= attr->timeout << 3; + optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT; + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_port_num == 0 || + attr->alt_port_num > dev->dev->caps.num_ports) + goto out; + + if (attr->alt_pkey_index >= + dev->dev->caps.pkey_table_len[attr->alt_port_num]) + goto out; + + if (mlx4_set_alt_path(dev, attr, attr_mask, qp, + &context->alt_path, + attr->alt_port_num)) + goto out; + + context->alt_path.pkey_index = attr->alt_pkey_index; + context->alt_path.ackto = attr->alt_timeout << 3; + optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; + } + + context->pd = cpu_to_be32(pd->pdn); + + if (!rwq_ind_tbl) { + context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); + get_cqs(qp, src_type, &send_cq, &recv_cq); + } else { /* Set dummy CQs to be compatible with HV and PRM */ + send_cq = to_mcq(rwq_ind_tbl->ind_tbl[0]->cq); + recv_cq = send_cq; + } + context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); + context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); + + /* Set "fast registration enabled" for all kernel QPs */ + if (!ibuobject) + context->params1 |= cpu_to_be32(1 << 11); + + if (attr_mask & IB_QP_RNR_RETRY) { + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + optpar |= MLX4_QP_OPTPAR_RNR_RETRY; + } + + if (attr_mask & IB_QP_RETRY_CNT) { + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + optpar |= MLX4_QP_OPTPAR_RETRY_COUNT; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + optpar |= MLX4_QP_OPTPAR_SRA_MAX; + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + optpar |= MLX4_QP_OPTPAR_RRA_MAX; + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | 
IB_QP_MAX_DEST_RD_ATOMIC)) { + context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask); + optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; + } + + if (ibsrq) + context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT; + } + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */ + if (attr_mask & IB_QP_QKEY) { + if (qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) + context->qkey = cpu_to_be32(IB_QP_SET_QKEY); + else { + if (mlx4_is_mfunc(dev->dev) && + !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) && + (attr->qkey & MLX4_RESERVED_QKEY_MASK) == + MLX4_RESERVED_QKEY_BASE) { + pr_err("Cannot use reserved QKEY" + " 0x%x (range 0xffff0000..0xffffffff" + " is reserved)\n", attr->qkey); + err = -EINVAL; + goto out; + } + context->qkey = cpu_to_be32(attr->qkey); + } + optpar |= MLX4_QP_OPTPAR_Q_KEY; + } + + if (ibsrq) + context->srqn = cpu_to_be32(1 << 24 | + to_msrq(ibsrq)->msrq.srqn); + + if (qp->rq.wqe_cnt && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_INIT && + new_state == IB_QPS_RTR && + (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI || + qp_type == IB_QPT_UD || qp_type == IB_QPT_RAW_PACKET)) { + context->pri_path.sched_queue = (qp->port - 1) << 6; + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) { + context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE; + if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI) + context->pri_path.fl = 0x80; + } else { + if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) + context->pri_path.fl = 0x80; + context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; + } + if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) + context->pri_path.feup = 1 << 7; /* don't fsm */ + /* handle smac_index */ + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { + err = handle_eth_ud_smac_index(dev, qp, context); + if (err) { + err = -EINVAL; + goto out; + } + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + dev->qp1_proxy[qp->port - 1] = qp; + } + } + } + + if (qp_type == IB_QPT_RAW_PACKET) { + context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | + MLX4_IB_LINK_TYPE_ETH; + if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + /* set QP to receive both tunneled & non-tunneled packets */ + if (!rwq_ind_tbl) + context->srqn = cpu_to_be32(7 << 28); + } + } + + if (qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { + int is_eth = rdma_port_get_link_layer( + &dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) { + context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH; + optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH; + } + } + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) + sqd_event = 1; + else + sqd_event = 0; + + if (!ibuobject && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) + context->rlkey_roce_mode |= (1 << 4); + + /* + * Before passing a 
kernel QP to the HW, make sure that the + * ownership bits of the send queue are set and the SQ + * headroom is stamped so that the hardware doesn't start + * processing stale work requests. + */ + if (!ibuobject && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) { + struct mlx4_wqe_ctrl_seg *ctrl; + int i; + + for (i = 0; i < qp->sq.wqe_cnt; ++i) { + ctrl = get_send_wqe(qp, i); + ctrl->owner_opcode = cpu_to_be32(1 << 31); + if (qp->sq_max_wqes_per_wr == 1) + ctrl->qpn_vlan.fence_size = + 1 << (qp->sq.wqe_shift - 4); + + stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift); + } + } + + if (rwq_ind_tbl && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) { + fill_qp_rss_context(context, qp); + context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET); + } + + err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state), + to_mlx4_state(new_state), context, optpar, + sqd_event, &qp->mqp); + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) { + qp->port = attr->port_num; + update_mcg_macs(dev, qp); + } + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + if (is_sqp(dev, qp)) + store_sqp_attrs(to_msqp(qp), attr, attr_mask); + + /* + * If we moved QP0 to RTR, bring the IB link up; if we moved + * QP0 to RESET or ERROR, bring the link back down. + */ + if (is_qp0(dev, qp)) { + if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR) + if (mlx4_INIT_PORT(dev->dev, qp->port)) + pr_warn("INIT_PORT failed for port %d\n", + qp->port); + + if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + mlx4_CLOSE_PORT(dev->dev, qp->port); + } + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET) { + if (!ibuobject) { + mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibsrq ? 
to_msrq(ibsrq) : NULL); + if (send_cq != recv_cq) + mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq_next_wqe = 0; + if (qp->rq.wqe_cnt) + *qp->db.db = 0; + + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_reg(dev, qp, 0); + } + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + qp->pri.smac_port = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } +out: + if (err && qp->counter_index) + mlx4_ib_free_qp_counter(dev, qp); + if (err && steer_qp) + mlx4_ib_steer_qp_reg(dev, qp, 0); + kfree(context); + if (qp->pri.candidate_smac || + (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); + } else { + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = qp->pri.candidate_smac; + qp->pri.smac_index = qp->pri.candidate_smac_index; + qp->pri.smac_port = qp->pri.candidate_smac_port; + } + qp->pri.candidate_smac = 0; + qp->pri.candidate_smac_index = 0; + qp->pri.candidate_smac_port = 0; + } + if (qp->alt.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac); + } else { + if (qp->alt.smac) + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = qp->alt.candidate_smac; + qp->alt.smac_index = qp->alt.candidate_smac_index; + qp->alt.smac_port = qp->alt.candidate_smac_port; + } + qp->alt.candidate_smac = 0; + qp->alt.candidate_smac_index = 0; + qp->alt.candidate_smac_port = 0; + } + + if (qp->pri.update_vid) { + if (err) { + if (qp->pri.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, + qp->pri.candidate_vid); + } else { + if (qp->pri.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, + qp->pri.vid); + qp->pri.vid = qp->pri.candidate_vid; + qp->pri.vlan_port = qp->pri.candidate_vlan_port; + qp->pri.vlan_index = qp->pri.candidate_vlan_index; + } + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.update_vid) { + if (err) { + if (qp->alt.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, + qp->alt.candidate_vid); + } else { + if (qp->alt.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, + qp->alt.vid); + qp->alt.vid = qp->alt.candidate_vid; + qp->alt.vlan_port = qp->alt.candidate_vlan_port; + qp->alt.vlan_index = qp->alt.candidate_vlan_index; + } + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + + return err; +} + +enum { + MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK = (IB_QP_STATE | + IB_QP_PORT), +}; + +static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + enum 
ib_qp_state cur_state, new_state; + int err = -EINVAL; + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (cur_state != new_state || cur_state != IB_QPS_RESET) { + int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + ll = rdma_port_get_link_layer(&dev->ib_dev, port); + } + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, ll)) { + pr_debug("qpn 0x%x: invalid attribute mask specified " + "for transition %d to %d. qp_type %d," + " attr_mask 0x%x\n", + ibqp->qp_num, cur_state, new_state, + ibqp->qp_type, attr_mask); + goto out; + } + + if (ibqp->rwq_ind_tbl) { + if (!(((cur_state == IB_QPS_RESET) && + (new_state == IB_QPS_INIT)) || + ((cur_state == IB_QPS_INIT) && + (new_state == IB_QPS_RTR)))) { + pr_debug("qpn 0x%x: RSS QP unsupported transition %d to %d\n", + ibqp->qp_num, cur_state, new_state); + + err = -EOPNOTSUPP; + goto out; + } + + if (attr_mask & ~MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK) { + pr_debug("qpn 0x%x: RSS QP unsupported attribute mask 0x%x for transition %d to %d\n", + ibqp->qp_num, attr_mask, cur_state, new_state); + + err = -EOPNOTSUPP; + goto out; + } + } + + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) { + if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { + if ((ibqp->qp_type == IB_QPT_RC) || + (ibqp->qp_type == IB_QPT_UD) || + (ibqp->qp_type == IB_QPT_UC) || + (ibqp->qp_type == IB_QPT_RAW_PACKET) || + (ibqp->qp_type == IB_QPT_XRC_INI)) { + attr->port_num = mlx4_ib_bond_next_port(dev); + } + } else { + /* no sense in changing port_num + * when ports are bonded */ + attr_mask &= ~IB_QP_PORT; + } + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->num_ports)) { + pr_debug("qpn 0x%x: invalid port number (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->port_num, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) && + (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) != + IB_LINK_LAYER_ETHERNET)) + goto out; + + if (attr_mask & IB_QP_PKEY_INDEX) { + int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { + pr_debug("qpn 0x%x: invalid pkey index (%d) specified " + "for transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->pkey_index, cur_state, + new_state, ibqp->qp_type); + goto out; + } + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { + pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d\n", + ibqp->qp_num, attr->max_rd_atomic, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { + pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. " + "Transition %d to %d. 
qp_type %d\n", + ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, + new_state, ibqp->qp_type); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) { + err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num); + if (err) + goto out; + } + + err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask, + cur_state, new_state); + + if (ibqp->rwq_ind_tbl && err) + bring_down_rss_rwqs(ibqp->rwq_ind_tbl); + + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) + attr->port_num = 1; + +out: + mutex_unlock(&qp->mutex); + return err; +} + +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + int ret; + + ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata); + + if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(mqp); + int err = 0; + + if (sqp->roce_v2_gsi) + err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask); + if (err) + pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n", + err); + } + return ret; +} + +static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) +{ + int i; + for (i = 0; i < dev->caps.num_ports; i++) { + if (qpn == dev->caps.spec_qps[i].qp0_proxy || + qpn == dev->caps.spec_qps[i].qp0_tunnel) { + *qkey = dev->caps.spec_qps[i].qp0_qkey; + return 0; + } + } + return -EINVAL; +} + +static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, + struct ib_ud_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device); + struct ib_device *ib_dev = &mdev->ib_dev; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->ah); + u16 pkey; + u32 qkey; + int send_size; + int header_size; + int spc; + int i; + + if (wr->wr.opcode != IB_WR_SEND) + return -EINVAL; + + send_size = 0; + + for (i = 0; i < wr->wr.num_sge; ++i) + send_size += wr->wr.sg_list[i].length; + + /* for proxy-qp0 sends, need to add in size of tunnel header */ + /* for tunnel-qp0 sends, tunnel header is already in s/g list */ + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) + send_size += sizeof (struct mlx4_ib_tunnel_header); + + ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header); + + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + sqp->ud_header.lrh.source_lid = + cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + /* force loopback */ + mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + + sqp->ud_header.lrh.virtual_lane = 0; + sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED); + ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER) + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn); + else + sqp->ud_header.bth.destination_qpn = + cpu_to_be32(mdev->dev->caps.spec_qps[sqp->qp.port - 1].qp0_tunnel); + + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + if (mlx4_is_master(mdev->dev)) { + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) 
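+ /* no paravirtualized qkey is defined for this QPN */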
+ return -EINVAL; + } else { + if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + } + sqp->ud_header.deth.qkey = cpu_to_be32(qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); + + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + +static u8 sl_to_vl(struct mlx4_ib_dev *dev, u8 sl, int port_num) +{ + union sl2vl_tbl_to_u64 tmp_vltab; + u8 vl; + + if (sl > 15) + return 0xf; + tmp_vltab.sl64 = atomic64_read(&dev->sl2vl[port_num - 1]); + vl = tmp_vltab.sl8[sl >> 1]; + if (sl & 1) + vl &= 0x0f; + else + vl >>= 4; + return vl; +} + +static int fill_gid_by_hw_index(struct mlx4_ib_dev *ibdev, u8 port_num, + int index, union ib_gid *gid, + enum ib_gid_type *gid_type) +{ + struct mlx4_ib_iboe *iboe = &ibdev->iboe; + struct mlx4_port_gid_table *port_gid_table; + unsigned long flags; + + port_gid_table = &iboe->gids[port_num - 1]; + spin_lock_irqsave(&iboe->lock, flags); + memcpy(gid, &port_gid_table->gids[index].gid, sizeof(*gid)); + *gid_type = port_gid_table->gids[index].gid_type; + spin_unlock_irqrestore(&iboe->lock, flags); + if (!memcmp(gid, &zgid, sizeof(*gid))) + return -ENOENT; + + return 0; +} + +#define MLX4_ROCEV2_QP1_SPORT 0xC000 +static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct ib_device *ib_dev = sqp->qp.ibqp.device; + struct mlx4_ib_dev *ibdev = to_mdev(ib_dev); + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_ctrl_seg *ctrl = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->ah); + union ib_gid sgid; + u16 pkey; + int send_size; + int header_size; + int spc; + int i; + int err = 0; + u16 vlan = 0xffff; + bool is_eth; + bool is_vlan = false; + bool is_grh; + bool is_udp = false; + int ip_version = 0; + + send_size = 0; + for (i = 0; i < wr->wr.num_sge; ++i) + send_size += wr->wr.sg_list[i].length; + + is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; + is_grh = mlx4_ib_ah_grh_present(ah); + if (is_eth) { + enum ib_gid_type gid_type; + if 
(mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid.raw[0]); + if (err) + return err; + } else { + err = fill_gid_by_hw_index(ibdev, sqp->qp.port, + ah->av.ib.gid_index, + &sgid, &gid_type); + if (!err) { + is_udp = gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP; + if (is_udp) { + if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) + ip_version = 4; + else + ip_version = 6; + is_grh = false; + } + } else { + return err; + } + } + if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { + vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; + is_vlan = 1; + } + } + err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, + ip_version, is_udp, 0, &sqp->ud_header); + if (err) + return err; + + if (!is_eth) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + if (is_grh || (ip_version == 6)) { + sqp->ud_header.grh.traffic_class = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.grh.flow_label = + ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); + sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + if (is_eth) { + memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); + } else { + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache + */ + sqp->ud_header.grh.source_gid.global.subnet_prefix = + cpu_to_be64(atomic64_read(&(to_mdev(ib_dev)->sriov. + demux[sqp->qp.port - 1]. + subnet_prefix))); + sqp->ud_header.grh.source_gid.global.interface_id = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + guid_cache[ah->av.ib.gid_index]; + } else { + ib_get_cached_gid(ib_dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, + &sqp->ud_header.grh.source_gid, NULL); + } + } + memcpy(sqp->ud_header.grh.destination_gid.raw, + ah->av.ib.dgid, 16); + } + + if (ip_version == 4) { + sqp->ud_header.ip4.tos = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.ip4.id = 0; + sqp->ud_header.ip4.frag_off = htons(IP_DF); + sqp->ud_header.ip4.ttl = ah->av.eth.hop_limit; + + memcpy(&sqp->ud_header.ip4.saddr, + sgid.raw + 12, 4); + memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4); + sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header); + } + + if (is_udp) { + sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT); + sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT); + sqp->ud_header.udp.csum = 0; + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + if (!is_eth) { + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? 
MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + if (ah->av.ib.port_pd & cpu_to_be32(0x80000000)) + mlx->flags |= cpu_to_be32(0x1); /* force loopback */ + mlx->rlid = sqp->ud_header.lrh.destination_lid; + } + + switch (wr->wr.opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->wr.ex.imm_data; + break; + default: + return -EINVAL; + } + + if (is_eth) { + struct in6_addr in6; + u16 ether_type; + u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; + + ether_type = (!is_udp) ? ETH_P_IBOE: + (ip_version == 4 ? ETH_P_IP : ETH_P_IPV6); + + mlx->sched_prio = cpu_to_be16(pcp); + + ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac); + memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); + memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); + memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); + memcpy(&in6, sgid.raw, sizeof(in6)); + + + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) + mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + if (!is_vlan) { + sqp->ud_header.eth.type = cpu_to_be16(ether_type); + } else { + sqp->ud_header.vlan.type = cpu_to_be16(ether_type); + sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); + } + } else { + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : + sl_to_vl(to_mdev(ib_dev), + sqp->ud_header.lrh.service_level, + sqp->qp.port); + if (sqp->qp.ibqp.qp_num && sqp->ud_header.lrh.virtual_lane == 15) + return -EINVAL; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + } + sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->remote_qkey & 0x80000000 ? + sqp->qkey : wr->remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + if (0) { + pr_err("built UD header of size %d:\n", header_size); + for (i = 0; i < header_size / 4; ++i) { + if (i % 8 == 0) + pr_err(" [%02x] ", i * 4); + pr_cont(" %08x", + be32_to_cpu(((__be32 *) sqp->header_buf)[i])); + if ((i + 1) % 8 == 0) + pr_cont("\n"); + } + pr_err("\n"); + } + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. 
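+ *
+ * For example, if the inline data starts 16 bytes into a 64-byte
+ * chunk, spc is 48, and an 86-byte UD header is written as one
+ * 48-byte inline segment followed by one 38-byte inline segment.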
+ */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + +static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + unsigned cur; + struct mlx4_ib_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __be32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) | + (acc & IB_ACCESS_REMOTE_READ ? + cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | + cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ); +} + +static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg, + struct ib_reg_wr *wr) +{ + struct mlx4_ib_mr *mr = to_mmr(wr->mr); + + fseg->flags = convert_access(wr->access); + fseg->mem_key = cpu_to_be32(wr->key); + fseg->buf_list = cpu_to_be64(mr->page_map); + fseg->start_addr = cpu_to_be64(mr->ibmr.iova); + fseg->reg_len = cpu_to_be64(mr->ibmr.length); + fseg->offset = 0; /* XXX -- is this just for ZBVA? 
*/ + fseg->page_size = cpu_to_be32(ilog2(mr->ibmr.page_size)); + fseg->reserved[0] = 0; + fseg->reserved[1] = 0; +} + +static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) +{ + memset(iseg, 0, sizeof(*iseg)); + iseg->mem_key = cpu_to_be32(rkey); +} + +static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, + struct ib_atomic_wr *wr) +{ + if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = cpu_to_be64(wr->swap); + aseg->compare = cpu_to_be64(wr->compare_add); + } else if (wr->wr.opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { + aseg->swap_add = cpu_to_be64(wr->compare_add); + aseg->compare = cpu_to_be64(wr->compare_add_mask); + } else { + aseg->swap_add = cpu_to_be64(wr->compare_add); + aseg->compare = 0; + } + +} + +static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, + struct ib_atomic_wr *wr) +{ + aseg->swap_add = cpu_to_be64(wr->swap); + aseg->swap_add_mask = cpu_to_be64(wr->swap_mask); + aseg->compare = cpu_to_be64(wr->compare_add); + aseg->compare_mask = cpu_to_be64(wr->compare_add_mask); +} + +static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, + struct ib_ud_wr *wr) +{ + memcpy(dseg->av, &to_mah(wr->ah)->av, sizeof (struct mlx4_av)); + dseg->dqpn = cpu_to_be32(wr->remote_qpn); + dseg->qkey = cpu_to_be32(wr->remote_qkey); + dseg->vlan = to_mah(wr->ah)->av.eth.vlan; + memcpy(dseg->mac, to_mah(wr->ah)->av.eth.mac, 6); +} + +static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, + struct mlx4_wqe_datagram_seg *dseg, + struct ib_ud_wr *wr, + enum mlx4_ib_qp_type qpt) +{ + union mlx4_ext_av *av = &to_mah(wr->ah)->av; + struct mlx4_av sqp_av = {0}; + int port = *((u8 *) &av->ib.port_pd) & 0x3; + + /* force loopback */ + sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000); + sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */ + sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel & + cpu_to_be32(0xf0000000); + + memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); + if (qpt == MLX4_IB_QPT_PROXY_GSI) + dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp1_tunnel); + else + dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp0_tunnel); + /* Use QKEY from the QP context, which is set by master */ + dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); +} + +static void build_tunnel_header(struct ib_ud_wr *wr, void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + struct mlx4_ib_tunnel_header hdr; + struct mlx4_ib_ah *ah = to_mah(wr->ah); + int spc; + int i; + + memcpy(&hdr.av, &ah->av, sizeof hdr.av); + hdr.remote_qpn = cpu_to_be32(wr->remote_qpn); + hdr.pkey_index = cpu_to_be16(wr->pkey_index); + hdr.qkey = cpu_to_be32(wr->remote_qkey); + memcpy(hdr.mac, ah->av.eth.mac, 6); + hdr.vlan = ah->av.eth.vlan; + + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (sizeof (hdr) <= spc) { + memcpy(inl + 1, &hdr, sizeof (hdr)); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr)); + i = 1; + } else { + memcpy(inl + 1, &hdr, spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct 
mlx4_wqe_inline_seg) + sizeof (hdr), 16); +} + +static void set_mlx_icrc_seg(void *dseg) +{ + u32 *t = dseg; + struct mlx4_wqe_inline_seg *iseg = dseg; + + t[1] = 0; + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + iseg->byte_count = cpu_to_be32((1 << 31) | 4); +} + +static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + dseg->byte_count = cpu_to_be32(sg->length); +} + +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_ud_wr *wr, + struct mlx4_ib_qp *qp, unsigned *lso_seg_len, + __be32 *lso_hdr_sz, __be32 *blh) +{ + unsigned halign = ALIGN(sizeof *wqe + wr->hlen, 16); + + if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) + *blh = cpu_to_be32(1 << 6); + + if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && + wr->wr.num_sge > qp->sq.max_gs - (halign >> 4))) + return -EINVAL; + + memcpy(wqe->header, wr->header, wr->hlen); + + *lso_hdr_sz = cpu_to_be32(wr->mss << 16 | wr->hlen); + *lso_seg_len = halign; + return 0; +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static void add_zero_len_inline(void *wqe) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + memset(wqe, 0, 16); + inl->byte_count = cpu_to_be32(1 << 31); +} + +int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + void *wqe; + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_data_seg *dseg; + unsigned long flags; + int nreq; + int err = 0; + unsigned ind; + int uninitialized_var(stamp); + int uninitialized_var(size); + unsigned uninitialized_var(seglen); + __be32 dummy; + __be32 *lso_wqe; + __be32 uninitialized_var(lso_hdr_sz); + __be32 blh; + int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(qp); + + if (sqp->roce_v2_gsi) { + struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah); + enum ib_gid_type gid_type; + union ib_gid gid; + + if (!fill_gid_by_hw_index(mdev, sqp->qp.port, + ah->av.ib.gid_index, + &gid, &gid_type)) + qp = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? + to_mqp(sqp->roce_v2_gsi) : qp; + else + pr_err("Failed to get gid at index %d. 
RoCEv2 will not work properly\n", + ah->av.ib.gid_index); + } + } + + spin_lock_irqsave(&qp->sq.lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + ind = qp->sq_next_wqe; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + lso_wqe = &dummy; + blh = 0; + + if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ctrl->srcrb_flags = + (wr->send_flags & IB_SEND_SIGNALED ? + cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) | + ((wr->send_flags & IB_SEND_IP_CSUM) ? + cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | + MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) | + qp->sq_signal_bits; + + ctrl->imm = send_ieth(wr); + + wqe += sizeof *ctrl; + size = sizeof *ctrl / 16; + + switch (qp->mlx4_ib_qp_type) { + case MLX4_IB_QPT_RC: + case MLX4_IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, atomic_wr(wr)->remote_addr, + atomic_wr(wr)->rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_atomic_seg(wqe, atomic_wr(wr)); + wqe += sizeof (struct mlx4_wqe_atomic_seg); + + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_atomic_seg)) / 16; + + break; + + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + set_raddr_seg(wqe, atomic_wr(wr)->remote_addr, + atomic_wr(wr)->rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_masked_atomic_seg(wqe, atomic_wr(wr)); + wqe += sizeof (struct mlx4_wqe_masked_atomic_seg); + + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16; + + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + size += sizeof (struct mlx4_wqe_raddr_seg) / 16; + break; + + case IB_WR_LOCAL_INV: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_local_inv_seg(wqe, wr->ex.invalidate_rkey); + wqe += sizeof (struct mlx4_wqe_local_inval_seg); + size += sizeof (struct mlx4_wqe_local_inval_seg) / 16; + break; + + case IB_WR_REG_MR: + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); + set_reg_seg(wqe, reg_wr(wr)); + wqe += sizeof(struct mlx4_wqe_fmr_seg); + size += sizeof(struct mlx4_wqe_fmr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + break; + + case MLX4_IB_QPT_TUN_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), ud_wr(wr), + ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_TUN_GSI: + /* this is a UD qp used in MAD responses to slaves. 
*/ + set_datagram_seg(wqe, ud_wr(wr)); + /* set the forced-loopback bit in the data seg av */ + *(__be32 *) wqe |= cpu_to_be32(0x80000000); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; + case MLX4_IB_QPT_UD: + set_datagram_seg(wqe, ud_wr(wr)); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + err = build_lso_seg(wqe, ud_wr(wr), qp, &seglen, + &lso_hdr_sz, &blh); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + lso_wqe = (__be32 *) wqe; + wqe += seglen; + size += seglen / 16; + } + break; + + case MLX4_IB_QPT_PROXY_SMI_OWNER: + err = build_sriov_qp0_header(to_msqp(qp), ud_wr(wr), + ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + /* to start tunnel header on a cache-line boundary */ + add_zero_len_inline(wqe); + wqe += 16; + size++; + build_tunnel_header(ud_wr(wr), wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + /* If we are tunneling special qps, this is a UD qp. + * In this case we first add a UD segment targeting + * the tunnel qp, and then add a header with address + * information */ + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, + ud_wr(wr), + qp->mlx4_ib_qp_type); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + build_tunnel_header(ud_wr(wr), wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: + err = build_mlx_header(to_msqp(qp), ud_wr(wr), ctrl, + &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + + default: + break; + } + + /* + * Write data segments in reverse order, so as to + * overwrite cacheline stamp last within each + * cacheline. This avoids issues with WQE + * prefetching. + */ + + dseg = wqe; + dseg += wr->num_sge - 1; + size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16); + + /* Add one more inline data segment for ICRC for MLX sends */ + if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI || + qp->mlx4_ib_qp_type & + (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) { + set_mlx_icrc_seg(dseg + 1); + size += sizeof (struct mlx4_wqe_data_seg) / 16; + } + + for (i = wr->num_sge - 1; i >= 0; --i, --dseg) + set_data_seg(dseg, wr->sg_list + i); + + /* + * Possibly overwrite stamping in cacheline with LSO + * segment only after making sure all data segments + * are written. + */ + wmb(); + *lso_wqe = lso_hdr_sz; + + ctrl->qpn_vlan.fence_size = (wr->send_flags & IB_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + + if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) { + *bad_wr = wr; + err = -EINVAL; + goto out; + } + + ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | + (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh; + + stamp = ind + qp->sq_spare_wqes; + ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); + + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. 
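+ * The last WQE of the batch is stamped after the doorbell is rung,
+ * at the end of this function.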
+ * + * Same optimization applies to padding with NOP wqe + * in case of WQE shrinking (used to prevent wrap-around + * in the middle of WR). + */ + if (wr->next) { + stamp_send_wqe(qp, stamp, size * 16); + ind = pad_wraparound(qp, ind); + } + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + writel_relaxed(qp->doorbell_qpn, + to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); + + /* + * Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); + + stamp_send_wqe(qp, stamp, size * 16); + + ind = pad_wraparound(qp, ind); + qp->sq_next_wqe = ind; + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int max_gs; + int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + + max_gs = qp->rq.max_gs; + spin_lock_irqsave(&qp->rq.lock, flags); + + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | + MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) { + ib_dma_sync_single_for_device(ibqp->device, + qp->sqp_proxy_rcv[ind].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + scat->byte_count = + cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr)); + /* use dma lkey from upper layer entry */ + scat->lkey = cpu_to_be32(wr->sg_list->lkey); + scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map); + scat++; + max_gs--; + } + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. 
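+ * The HCA reads this doorbell record from memory, so the barrier
+ * keeps it from observing the new head before the WQEs themselves.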
+ */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) +{ + switch (mlx4_state) { + case MLX4_QP_STATE_RST: return IB_QPS_RESET; + case MLX4_QP_STATE_INIT: return IB_QPS_INIT; + case MLX4_QP_STATE_RTR: return IB_QPS_RTR; + case MLX4_QP_STATE_RTS: return IB_QPS_RTS; + case MLX4_QP_STATE_SQ_DRAINING: + case MLX4_QP_STATE_SQD: return IB_QPS_SQD; + case MLX4_QP_STATE_SQER: return IB_QPS_SQE; + case MLX4_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state) +{ + switch (mlx4_mig_state) { + case MLX4_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX4_QP_PM_REARM: return IB_MIG_REARM; + case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx4_flags) +{ + int ib_flags = 0; + + if (mlx4_flags & MLX4_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx4_flags & MLX4_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx4_flags & MLX4_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_rdma_ah_attr(struct mlx4_ib_dev *ibdev, + struct rdma_ah_attr *ah_attr, + struct mlx4_qp_path *path) +{ + struct mlx4_dev *dev = ibdev->dev; + u8 port_num = path->sched_queue & 0x40 ? 2 : 1; + + memset(ah_attr, 0, sizeof(*ah_attr)); + ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, port_num); + if (port_num == 0 || port_num > dev->caps.num_ports) + return; + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) + rdma_ah_set_sl(ah_attr, ((path->sched_queue >> 3) & 0x7) | + ((path->sched_queue & 4) << 1)); + else + rdma_ah_set_sl(ah_attr, (path->sched_queue >> 2) & 0xf); + rdma_ah_set_port_num(ah_attr, port_num); + + rdma_ah_set_dlid(ah_attr, be16_to_cpu(path->rlid)); + rdma_ah_set_path_bits(ah_attr, path->grh_mylmc & 0x7f); + rdma_ah_set_static_rate(ah_attr, + path->static_rate ? 
path->static_rate - 5 : 0); + if (path->grh_mylmc & (1 << 7)) { + rdma_ah_set_grh(ah_attr, NULL, + be32_to_cpu(path->tclass_flowlabel) & 0xfffff, + path->mgid_index, + path->hop_limit, + (be32_to_cpu(path->tclass_flowlabel) + >> 20) & 0xff); + rdma_ah_set_dgid_raw(ah_attr, path->rgid); + } +} + +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_qp_context context; + int mlx4_state; + int err = 0; + + if (ibqp->rwq_ind_tbl) + return -EOPNOTSUPP; + + mutex_lock(&qp->mutex); + + if (qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + err = mlx4_qp_query(dev->dev, &qp->mqp, &context); + if (err) { + err = -EINVAL; + goto out; + } + + mlx4_state = be32_to_cpu(context.flags) >> 28; + + qp->state = to_ib_qp_state(mlx4_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context.mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context.qkey); + qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context.params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_rdma_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); + to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); + qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = + rdma_ah_get_port_num(&qp_attr->alt_ah_attr); + } + + qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; + if (qp_attr->qp_state == IB_QPS_INIT) + qp_attr->port_num = qp->port; + else + qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context.pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; + qp_attr->alt_timeout = context.alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* + * We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. 
+ */ + qp_attr->cap.max_inline_data = 0; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (qp->flags & MLX4_IB_QP_LSO) + qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (qp->flags & MLX4_IB_QP_NETIF) + qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP; + + qp_init_attr->sq_sig_type = + qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ? + IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out: + mutex_unlock(&qp->mutex); + return err; +} + +struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev; + struct ib_qp_init_attr ib_qp_init_attr; + struct mlx4_ib_qp *qp; + struct mlx4_ib_create_wq ucmd; + int err, required_cmd_sz; + + if (!(udata && pd->uobject)) + return ERR_PTR(-EINVAL); + + required_cmd_sz = offsetof(typeof(ucmd), comp_mask) + + sizeof(ucmd.comp_mask); + if (udata->inlen < required_cmd_sz) { + pr_debug("invalid inlen\n"); + return ERR_PTR(-EINVAL); + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + pr_debug("inlen is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (udata->outlen) + return ERR_PTR(-EOPNOTSUPP); + + dev = to_mdev(pd->device); + + if (init_attr->wq_type != IB_WQT_RQ) { + pr_debug("unsupported wq type %d\n", init_attr->wq_type); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->create_flags & ~IB_WQ_FLAGS_SCATTER_FCS) { + pr_debug("unsupported create_flags %u\n", + init_attr->create_flags); + return ERR_PTR(-EOPNOTSUPP); + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + + memset(&ib_qp_init_attr, 0, sizeof(ib_qp_init_attr)); + ib_qp_init_attr.qp_context = init_attr->wq_context; + ib_qp_init_attr.qp_type = IB_QPT_RAW_PACKET; + ib_qp_init_attr.cap.max_recv_wr = init_attr->max_wr; + ib_qp_init_attr.cap.max_recv_sge = init_attr->max_sge; + ib_qp_init_attr.recv_cq = init_attr->cq; + ib_qp_init_attr.send_cq = ib_qp_init_attr.recv_cq; /* Dummy CQ */ + + if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS) + ib_qp_init_attr.create_flags |= IB_QP_CREATE_SCATTER_FCS; + + err = create_qp_common(dev, pd, MLX4_IB_RWQ_SRC, &ib_qp_init_attr, + udata, 0, &qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibwq.event_handler = init_attr->event_handler; + qp->ibwq.wq_num = qp->mqp.qpn; + qp->ibwq.state = IB_WQS_RESET; + + return &qp->ibwq; +} + +static int ib_wq2qp_state(enum ib_wq_state state) +{ + switch (state) { + case IB_WQS_RESET: + return IB_QPS_RESET; + case IB_WQS_RDY: + return IB_QPS_RTR; + default: + return IB_QPS_ERR; + } +} + +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state) +{ + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + enum ib_qp_state qp_cur_state; + enum ib_qp_state qp_new_state; + int attr_mask; + int err; + + /* ib_qp.state represents the WQ HW state while ib_wq.state represents + * the WQ logic state. 
+ */ + qp_cur_state = qp->state; + qp_new_state = ib_wq2qp_state(new_state); + + if (ib_wq2qp_state(new_state) == qp_cur_state) + return 0; + + if (new_state == IB_WQS_RDY) { + struct ib_qp_attr attr = {}; + + attr.port_num = qp->port; + attr_mask = IB_QP_PORT; + + err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr, + attr_mask, IB_QPS_RESET, IB_QPS_INIT); + if (err) { + pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n", + ibwq->wq_num); + return err; + } + + qp_cur_state = IB_QPS_INIT; + } + + attr_mask = 0; + err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask, + qp_cur_state, qp_new_state); + + if (err && (qp_cur_state == IB_QPS_INIT)) { + qp_new_state = IB_QPS_RESET; + if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, + attr_mask, IB_QPS_INIT, IB_QPS_RESET)) { + pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n", + ibwq->wq_num); + qp_new_state = IB_QPS_INIT; + } + } + + qp->state = qp_new_state; + + return err; +} + +int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + struct mlx4_ib_modify_wq ucmd = {}; + size_t required_cmd_sz; + enum ib_wq_state cur_state, new_state; + int err = 0; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) + return -EFAULT; + + if (ucmd.comp_mask || ucmd.reserved) + return -EOPNOTSUPP; + + if (wq_attr_mask & IB_WQ_FLAGS) + return -EOPNOTSUPP; + + cur_state = wq_attr_mask & IB_WQ_CUR_STATE ? wq_attr->curr_wq_state : + ibwq->state; + new_state = wq_attr_mask & IB_WQ_STATE ? wq_attr->wq_state : cur_state; + + if (cur_state < IB_WQS_RESET || cur_state > IB_WQS_ERR || + new_state < IB_WQS_RESET || new_state > IB_WQS_ERR) + return -EINVAL; + + if ((new_state == IB_WQS_RDY) && (cur_state == IB_WQS_ERR)) + return -EINVAL; + + if ((new_state == IB_WQS_ERR) && (cur_state == IB_WQS_RESET)) + return -EINVAL; + + /* Need to protect against the parent RSS which also may modify WQ + * state. + */ + mutex_lock(&qp->mutex); + + /* Can update HW state only if a RSS QP has already associated to this + * WQ, so we can apply its port on the WQ. 
+ */ + if (qp->rss_usecnt) + err = _mlx4_ib_modify_wq(ibwq, new_state); + + if (!err) + ibwq->state = new_state; + + mutex_unlock(&qp->mutex); + + return err; +} + +int mlx4_ib_destroy_wq(struct ib_wq *ibwq) +{ + struct mlx4_ib_dev *dev = to_mdev(ibwq->device); + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + + if (qp->counter_index) + mlx4_ib_free_qp_counter(dev, qp); + + destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, 1); + + kfree(qp); + + return 0; +} + +struct ib_rwq_ind_table +*mlx4_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ib_rwq_ind_table *rwq_ind_table; + struct mlx4_ib_create_rwq_ind_tbl_resp resp = {}; + unsigned int ind_tbl_size = 1 << init_attr->log_ind_tbl_size; + unsigned int base_wqn; + size_t min_resp_len; + int i; + int err; + + if (udata->inlen > 0 && + !ib_is_udata_cleared(udata, 0, + udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + if (ind_tbl_size > + device->attrs.rss_caps.max_rwq_indirection_table_size) { + pr_debug("log_ind_tbl_size = %d is bigger than supported = %d\n", + ind_tbl_size, + device->attrs.rss_caps.max_rwq_indirection_table_size); + return ERR_PTR(-EINVAL); + } + + base_wqn = init_attr->ind_tbl[0]->wq_num; + + if (base_wqn % ind_tbl_size) { + pr_debug("WQN=0x%x isn't aligned with indirection table size\n", + base_wqn); + return ERR_PTR(-EINVAL); + } + + for (i = 1; i < ind_tbl_size; i++) { + if (++base_wqn != init_attr->ind_tbl[i]->wq_num) { + pr_debug("indirection table's WQNs aren't consecutive\n"); + return ERR_PTR(-EINVAL); + } + } + + rwq_ind_table = kzalloc(sizeof(*rwq_ind_table), GFP_KERNEL); + if (!rwq_ind_table) + return ERR_PTR(-ENOMEM); + + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err; + } + + return rwq_ind_table; + +err: + kfree(rwq_ind_table); + return ERR_PTR(err); +} + +int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + kfree(ib_rwq_ind_tbl); + return 0; +} diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c new file mode 100644 index 0000000..ebee56c --- /dev/null +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "mlx4_ib.h" +#include + +static void *get_wqe(struct mlx4_ib_srq *srq, int n) +{ + return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX4_EVENT_TYPE_SRQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("Unexpected event type %d " + "on SRQ %06x\n", type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_srq *srq; + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scatter; + u32 cqn; + u16 xrcdn; + int desc_size; + int buf_size; + int err; + int i; + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes || + init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) + return ERR_PTR(-EINVAL); + + srq = kmalloc(sizeof *srq, GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = max(32UL, + roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) + + srq->msrq.max_gs * + sizeof (struct mlx4_wqe_data_seg))); + srq->msrq.wqe_shift = ilog2(desc_size); + + buf_size = srq->msrq.max * desc_size; + + if (pd->uobject) { + struct mlx4_ib_create_srq ucmd; + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_srq; + } + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, + buf_size, 0, 0); + if (IS_ERR(srq->umem)) { + err = PTR_ERR(srq->umem); + goto err_srq; + } + + err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem), + srq->umem->page_shift, &srq->mtt); + if (err) + goto err_buf; + + err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem); + if (err) + goto err_mtt; + + err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) + goto err_mtt; + } else { + err = mlx4_db_alloc(dev->dev, &srq->db, 0); + if (err) + goto err_srq; + + *srq->db.db = 0; + + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, + &srq->buf)) { + err = -ENOMEM; + goto err_db; + } + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + desc_size; + ++scatter) + scatter->lkey = cpu_to_be32(MLX4_INVALID_LKEY); + } + + 
err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift, + &srq->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); + if (err) + goto err_mtt; + + srq->wrid = kvmalloc_array(srq->msrq.max, + sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_mtt; + } + } + + cqn = ib_srq_has_cq(init_attr->srq_type) ? + to_mcq(init_attr->ext.cq)->mcq.cqn : 0; + xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ? + to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn : + (u16) dev->dev->caps.reserved_xrcds; + err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt, + srq->db.dma, &srq->msrq); + if (err) + goto err_wrid; + + srq->msrq.event = mlx4_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { + err = -EFAULT; + goto err_wrid; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_wrid: + if (pd->uobject) + mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + else + kvfree(srq->wrid); + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &srq->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(srq->umem); + else + mlx4_buf_free(dev->dev, buf_size, &srq->buf); + +err_db: + if (!pd->uobject) + mlx4_db_free(dev->dev, &srq->db); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs (yet?) */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + int ret; + int limit_watermark; + + ret = mlx4_srq_query(dev->dev, &srq->msrq, &limit_watermark); + if (ret) + return ret; + + srq_attr->srq_limit = limit_watermark; + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + + return 0; +} + +int mlx4_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx4_ib_dev *dev = to_mdev(srq->device); + struct mlx4_ib_srq *msrq = to_msrq(srq); + + mlx4_srq_free(dev->dev, &msrq->msrq); + mlx4_mtt_cleanup(dev->dev, &msrq->mtt); + + if (srq->uobject) { + mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + kvfree(msrq->wrid); + mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift, + &msrq->buf); + mlx4_db_free(dev->dev, &msrq->db); + } + + kfree(msrq); + + return 0; +} + +void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index) +{ + struct mlx4_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. 
*/ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_srq *srq = to_msrq(ibsrq); + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int i; + struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device); + + spin_lock_irqsave(&srq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx4_wqe_data_seg *) (next + 1); + + for (i = 0; i < wr->num_sge; ++i) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } +out: + + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c new file mode 100644 index 0000000..e219093 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -0,0 +1,890 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*#include "core_priv.h"*/ +#include "mlx4_ib.h" +#include +#include +#include + +#include +/*show_admin_alias_guid returns the administratively assigned value of that GUID. 
+ * Values returned in buf parameter string: + * 0 - requests opensm to assign a value. + * ffffffffffffffff - delete this entry. + * other - value assigned by administrator. + */ +static ssize_t show_admin_alias_guid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + __be64 sysadmin_ag_val; + + sysadmin_ag_val = mlx4_get_admin_guid(mdev->dev, + mlx4_ib_iov_dentry->entry_num, + port->num); + + return sprintf(buf, "%llx\n", be64_to_cpu(sysadmin_ag_val)); +} + +/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID. + * Values in buf parameter string: + * 0 - requests opensm to assign a value. + * 0xffffffffffffffff - delete this entry. + * other - guid value assigned by the administrator. + */ +static ssize_t store_admin_alias_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u64 sysadmin_ag_val; + unsigned long flags; + + record_num = mlx4_ib_iov_dentry->entry_num / 8; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8; + if (0 == record_num && 0 == guid_index_in_rec) { + pr_err("GUID 0 block 0 is RO\n"); + return count; + } + spin_lock_irqsave(&mdev->sriov.alias_guid.ag_work_lock, flags); + sscanf(buf, "%llx", &sysadmin_ag_val); + *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1]. + all_rec_per_port[record_num]. 
+ all_recs[GUID_REC_SIZE * guid_index_in_rec] = + cpu_to_be64(sysadmin_ag_val); + + /* Change the state to be pending for update */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE ; + mlx4_set_admin_guid(mdev->dev, cpu_to_be64(sysadmin_ag_val), + mlx4_ib_iov_dentry->entry_num, + port->num); + + /* set the record index */ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes + |= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec); + + spin_unlock_irqrestore(&mdev->sriov.alias_guid.ag_work_lock, flags); + mlx4_ib_init_alias_guid_work(mdev, port->num - 1); + + return count; +} + +static ssize_t show_port_gid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + union ib_gid gid; + ssize_t ret; + + ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &gid, 1); + if (ret) + return ret; + ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) gid.raw)[0]), + be16_to_cpu(((__be16 *) gid.raw)[1]), + be16_to_cpu(((__be16 *) gid.raw)[2]), + be16_to_cpu(((__be16 *) gid.raw)[3]), + be16_to_cpu(((__be16 *) gid.raw)[4]), + be16_to_cpu(((__be16 *) gid.raw)[5]), + be16_to_cpu(((__be16 *) gid.raw)[6]), + be16_to_cpu(((__be16 *) gid.raw)[7])); + return ret; +} + +static ssize_t show_phys_port_pkey(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u16 pkey; + ssize_t ret; + + ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &pkey, 1); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define DENTRY_REMOVE(_dentry) \ +do { \ + sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \ +} while (0); + +static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry, + char *_name, struct kobject *_kobj, + ssize_t (*show)(struct device *dev, + struct device_attribute *attr, + char *buf), + ssize_t (*store)(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) + ) +{ + int ret = 0; + struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry; + + vdentry->ctx = _ctx; + vdentry->dentry.show = show; + vdentry->dentry.store = store; + sysfs_attr_init(&vdentry->dentry.attr); + vdentry->dentry.attr.name = vdentry->name; + vdentry->dentry.attr.mode = 0; + vdentry->kobj = _kobj; + snprintf(vdentry->name, 15, "%s", _name); + + if (vdentry->dentry.store) + vdentry->dentry.attr.mode |= S_IWUSR; + + if (vdentry->dentry.show) + vdentry->dentry.attr.mode |= S_IRUGO; + + ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr); + if (ret) { + pr_err("failed to create %s\n", vdentry->dentry.attr.name); + vdentry->ctx = NULL; + return ret; + } + + return ret; +} + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + int ret; + + ret = sysfs_create_file(port->mcgs_parent, attr); + if (ret) + pr_err("failed to create %s\n", attr->name); + + return ret; +} + +void 
del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + + sysfs_remove_file(port->mcgs_parent, attr); +} + +static int add_port_entries(struct mlx4_ib_dev *device, int port_num) +{ + int i; + char buff[11]; + struct mlx4_ib_iov_port *port = NULL; + int ret = 0 ; + struct ib_port_attr attr; + + memset(&attr, 0, sizeof(attr)); + /* get the physical gid and pkey table sizes.*/ + ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1); + if (ret) + goto err; + + port = &device->iov_ports[port_num - 1]; + port->dev = device; + port->num = port_num; + /* Directory structure: + * iov - + * port num - + * admin_guids + * gids (operational) + * mcg_table + */ + port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar), + GFP_KERNEL); + if (!port->dentr_ar) { + ret = -ENOMEM; + goto err; + } + sprintf(buff, "%d", port_num); + port->cur_port = kobject_create_and_add(buff, + kobject_get(device->ports_parent)); + if (!port->cur_port) { + ret = -ENOMEM; + goto kobj_create_err; + } + /* admin GUIDs */ + port->admin_alias_parent = kobject_create_and_add("admin_guids", + kobject_get(port->cur_port)); + if (!port->admin_alias_parent) { + ret = -ENOMEM; + goto err_admin_guids; + } + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[i].entry_num = i; + ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i], + buff, port->admin_alias_parent, + show_admin_alias_guid, store_admin_alias_guid); + if (ret) + goto err_admin_alias_parent; + } + + /* gids subdirectory (operational gids) */ + port->gids_parent = kobject_create_and_add("gids", + kobject_get(port->cur_port)); + if (!port->gids_parent) { + ret = -ENOMEM; + goto err_gids; + } + + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[attr.gid_tbl_len + i], + buff, + port->gids_parent, show_port_gid, NULL); + if (ret) + goto err_gids_parent; + } + + /* physical port pkey table */ + port->pkeys_parent = + kobject_create_and_add("pkeys", kobject_get(port->cur_port)); + if (!port->pkeys_parent) { + ret = -ENOMEM; + goto err_pkeys; + } + + for (i = 0 ; i < attr.pkey_tbl_len; i++) { + sprintf(buff, "%d", i); + port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i], + buff, port->pkeys_parent, + show_phys_port_pkey, NULL); + if (ret) + goto err_pkeys_parent; + } + + /* MCGs table */ + port->mcgs_parent = + kobject_create_and_add("mcgs", kobject_get(port->cur_port)); + if (!port->mcgs_parent) { + ret = -ENOMEM; + goto err_mcgs; + } + return 0; + +err_mcgs: + kobject_put(port->cur_port); + +err_pkeys_parent: + kobject_put(port->pkeys_parent); + +err_pkeys: + kobject_put(port->cur_port); + +err_gids_parent: + kobject_put(port->gids_parent); + +err_gids: + kobject_put(port->cur_port); + +err_admin_alias_parent: + kobject_put(port->admin_alias_parent); + +err_admin_guids: + kobject_put(port->cur_port); + kobject_put(port->cur_port); /* once more for create_and_add buff */ + +kobj_create_err: + kobject_put(device->ports_parent); + kfree(port->dentr_ar); + +err: + pr_err("add_port_entries FAILED: for port:%d, error: %d\n", + port_num, ret); + return ret; +} + +static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max) +{ + char base_name[9]; + + /* 
pci_name format is: bus:dev:func -> xxxx:yy:zz.n */ + strlcpy(name, pci_name(dev->dev->persist->pdev), max); + strncpy(base_name, name, 8); /*till xxxx:yy:*/ + base_name[8] = '\0'; + /* with no ARI only 3 last bits are used so when the fn is higher than 8 + * need to add it to the dev num, so count in the last number will be + * modulo 8 */ + sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8)); +} + +struct mlx4_port { + struct kobject kobj; + struct mlx4_ib_dev *dev; + struct attribute_group pkey_group; + struct attribute_group gid_group; + struct device_attribute enable_smi_admin; + struct device_attribute smi_enabled; + int slave; + u8 port_num; +}; + + +static void mlx4_port_release(struct kobject *kobj) +{ + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + struct attribute *a; + int i; + + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + kfree(p->pkey_group.attrs); + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + kfree(p->gid_group.attrs); + kfree(p); +} + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count); +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->show) + return -EIO; + return port_attr->show(p, port_attr, buf); +} + +static ssize_t port_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->store) + return -EIO; + return port_attr->store(p, port_attr, buf, size); +} + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show, + .store = port_attr_store, +}; + +static struct kobj_type port_type = { + .release = mlx4_port_release, + .sysfs_ops = &port_sysfs_ops, +}; + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; +}; + +static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + ssize_t ret = -ENODEV; + + if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >= + (p->dev->dev->caps.pkey_table_len[p->port_num])) + ret = sprintf(buf, "none\n"); + else + ret = sprintf(buf, "%d\n", + p->dev->pkeys.virt2phys_pkey[p->slave] + [p->port_num - 1][tab_attr->index]); + return ret; +} + +static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + const char *buf, size_t count) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int idx; + int err; + + /* do not allow remapping Dom0 virtual pkey table */ + if (p->slave == mlx4_master_func_num(p->dev->dev)) + return -EINVAL; + + if (!strncasecmp(buf, "no", 2)) + idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1; + else if (sscanf(buf, "%i", &idx) != 1 || + idx >= p->dev->dev->caps.pkey_table_len[p->port_num] || + idx < 0) + return -EINVAL; + + p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1] + [tab_attr->index] = idx; + mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num, + 
tab_attr->index, idx); + err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num); + if (err) { + pr_err("mlx4_gen_pkey_eqe failed for slave %d," + " port %d, index %d\n", p->slave, p->port_num, idx); + return err; + } + return count; +} + +static ssize_t show_port_gid_idx(struct mlx4_port *p, + struct port_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", p->slave); +} + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct mlx4_port *, + struct port_attribute *, char *buf), + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof (struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + if (snprintf(element->name, sizeof (element->name), + "%d", i) >= sizeof (element->name)) { + kfree(element); + goto err; + } + sysfs_attr_init(&element->attr.attr); + element->attr.attr.name = element->name; + if (store) { + element->attr.attr.mode = S_IWUSR | S_IRUGO; + element->attr.store = store; + } else + element->attr.attr.mode = S_IRUGO; + + element->attr.show = show; + element->index = i; + tab_attr[i] = &element->attr.attr; + } + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +static ssize_t sysfs_show_smi_enabled(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, smi_enabled); + ssize_t len = 0; + + if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_show_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + ssize_t len = 0; + + if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num)) + len = sprintf(buf, "%d\n", 1); + else + len = sprintf(buf, "%d\n", 0); + + return len; +} + +static ssize_t sysfs_store_enable_smi_admin(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port *p = + container_of(attr, struct mlx4_port, enable_smi_admin); + int enable; + + if (sscanf(buf, "%i", &enable) != 1 || + enable < 0 || enable > 1) + return -EINVAL; + + if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable)) + return -EINVAL; + return count; +} + +static int add_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + int ret; + + /* do not display entries if eth transport, or if master */ + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return 0; + + sysfs_attr_init(&p->smi_enabled.attr); + p->smi_enabled.show = sysfs_show_smi_enabled; + p->smi_enabled.store = NULL; + p->smi_enabled.attr.name = "smi_enabled"; + p->smi_enabled.attr.mode = 0444; + ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr); + if (ret) { + pr_err("failed to create smi_enabled\n"); + return ret; + } + + sysfs_attr_init(&p->enable_smi_admin.attr); + p->enable_smi_admin.show = sysfs_show_enable_smi_admin; + p->enable_smi_admin.store = sysfs_store_enable_smi_admin; + p->enable_smi_admin.attr.name = "enable_smi_admin"; + 
p->enable_smi_admin.attr.mode = 0644; + ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr); + if (ret) { + pr_err("failed to create enable_smi_admin\n"); + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + return ret; + } + return 0; +} + +static void remove_vf_smi_entries(struct mlx4_port *p) +{ + int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) == + IB_LINK_LAYER_ETHERNET; + + if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev)) + return; + + sysfs_remove_file(&p->kobj, &p->smi_enabled.attr); + sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr); +} + +static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) +{ + struct mlx4_port *p; + int i; + int ret; + int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) == + IB_LINK_LAYER_ETHERNET; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->dev = dev; + p->port_num = port_num; + p->slave = slave; + + ret = kobject_init_and_add(&p->kobj, &port_type, + kobject_get(dev->dev_ports_parent[slave]), + "%d", port_num); + if (ret) + goto err_alloc; + + p->pkey_group.name = "pkey_idx"; + p->pkey_group.attrs = + alloc_group_attrs(show_port_pkey, + is_eth ? NULL : store_port_pkey, + dev->dev->caps.pkey_table_len[port_num]); + if (!p->pkey_group.attrs) { + ret = -ENOMEM; + goto err_alloc; + } + + ret = sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + p->gid_group.name = "gid_idx"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1); + if (!p->gid_group.attrs) { + ret = -ENOMEM; + goto err_free_pkey; + } + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + ret = add_vf_smi_entries(p); + if (ret) + goto err_free_gid; + + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); + return 0; + +err_free_gid: + kfree(p->gid_group.attrs[0]); + kfree(p->gid_group.attrs); + +err_free_pkey: + for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i) + kfree(p->pkey_group.attrs[i]); + kfree(p->pkey_group.attrs); + +err_alloc: + kobject_put(dev->dev_ports_parent[slave]); + kfree(p); + return ret; +} + +static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave) +{ + char name[32]; + int err; + int port; + struct kobject *p, *t; + struct mlx4_port *mport; + struct mlx4_active_ports actv_ports; + + get_name(dev, name, slave, sizeof name); + + dev->pkeys.device_parent[slave] = + kobject_create_and_add(name, kobject_get(dev->iov_parent)); + + if (!dev->pkeys.device_parent[slave]) { + err = -ENOMEM; + goto fail_dev; + } + + INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]); + + dev->dev_ports_parent[slave] = + kobject_create_and_add("ports", + kobject_get(dev->pkeys.device_parent[slave])); + + if (!dev->dev_ports_parent[slave]) { + err = -ENOMEM; + goto err_ports; + } + + actv_ports = mlx4_get_active_ports(dev->dev, slave); + + for (port = 1; port <= dev->dev->caps.num_ports; ++port) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + err = add_port(dev, port, slave); + if (err) + goto err_add; + } + return 0; + +err_add: + list_for_each_entry_safe(p, t, + &dev->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + mport = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &mport->pkey_group); + sysfs_remove_group(p, &mport->gid_group); + remove_vf_smi_entries(mport); + kobject_put(p); + } + kobject_put(dev->dev_ports_parent[slave]); + +err_ports: + kobject_put(dev->pkeys.device_parent[slave]); + /* extra put for the device_parent 
create_and_add */ + kobject_put(dev->pkeys.device_parent[slave]); + +fail_dev: + kobject_put(dev->iov_parent); + return err; +} + +static int register_pkey_tree(struct mlx4_ib_dev *device) +{ + int i; + + if (!mlx4_is_master(device->dev)) + return 0; + + for (i = 0; i <= device->dev->persist->num_vfs; ++i) + register_one_pkey_tree(device, i); + + return 0; +} + +static void unregister_pkey_tree(struct mlx4_ib_dev *device) +{ + int slave; + struct kobject *p, *t; + struct mlx4_port *port; + + if (!mlx4_is_master(device->dev)) + return; + + for (slave = device->dev->persist->num_vfs; slave >= 0; --slave) { + list_for_each_entry_safe(p, t, + &device->pkeys.pkey_port_list[slave], + entry) { + list_del(&p->entry); + port = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + remove_vf_smi_entries(port); + kobject_put(p); + kobject_put(device->dev_ports_parent[slave]); + } + kobject_put(device->dev_ports_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->iov_parent); + } +} + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev) +{ + int i; + int ret = 0; + + if (!mlx4_is_master(dev->dev)) + return 0; + + dev->iov_parent = + kobject_create_and_add("iov", + kobject_get(dev->ib_dev.ports_parent->parent)); + if (!dev->iov_parent) { + ret = -ENOMEM; + goto err; + } + dev->ports_parent = + kobject_create_and_add("ports", + kobject_get(dev->iov_parent)); + if (!dev->ports_parent) { + ret = -ENOMEM; + goto err_ports; + } + + for (i = 1; i <= dev->ib_dev.phys_port_cnt; ++i) { + ret = add_port_entries(dev, i); + if (ret) + goto err_add_entries; + } + + ret = register_pkey_tree(dev); + if (ret) + goto err_add_entries; + return 0; + +err_add_entries: + kobject_put(dev->ports_parent); + +err_ports: + kobject_put(dev->iov_parent); +err: + kobject_put(dev->ib_dev.ports_parent->parent); + pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret); + return ret; +} + +static void unregister_alias_guid_tree(struct mlx4_ib_dev *device) +{ + struct mlx4_ib_iov_port *p; + int i; + + if (!mlx4_is_master(device->dev)) + return; + + for (i = 0; i < device->dev->caps.num_ports; i++) { + p = &device->iov_ports[i]; + kobject_put(p->admin_alias_parent); + kobject_put(p->gids_parent); + kobject_put(p->pkeys_parent); + kobject_put(p->mcgs_parent); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->dev->ports_parent); + kfree(p->dentr_ar); + } +} + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device) +{ + unregister_alias_guid_tree(device); + unregister_pkey_tree(device); + kobject_put(device->ports_parent); + kobject_put(device->iov_parent); + kobject_put(device->iov_parent); + kobject_put(device->ib_dev.ports_parent->parent); +} diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig new file mode 100644 index 0000000..fb4d77b --- /dev/null +++ b/drivers/infiniband/hw/mlx5/Kconfig @@ -0,0 +1,9 @@ +config MLX5_INFINIBAND + tristate "Mellanox Connect-IB HCA support" + depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE + depends on INFINIBAND_USER_ACCESS || INFINIBAND_USER_ACCESS=n + ---help--- + This driver provides low-level InfiniBand support for + Mellanox Connect-IB PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. 
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile new file mode 100644 index 0000000..d42b922 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o + +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o cong.o +mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o +mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c new file mode 100644 index 0000000..e6bde32 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "mlx5_ib.h" + +static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, + struct mlx5_ib_ah *ah, + struct rdma_ah_attr *ah_attr) +{ + enum ib_gid_type gid_type; + int err; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + + memcpy(ah->av.rgid, &grh->dgid, 16); + ah->av.grh_gid_fl = cpu_to_be32(grh->flow_label | + (1 << 30) | + grh->sgid_index << 20); + ah->av.hop_limit = grh->hop_limit; + ah->av.tclass = grh->traffic_class; + } + + ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + err = mlx5_get_roce_gid_type(dev, ah_attr->port_num, + ah_attr->grh.sgid_index, + &gid_type); + if (err) + return ERR_PTR(err); + + memcpy(ah->av.rmac, ah_attr->roce.dmac, + sizeof(ah_attr->roce.dmac)); + ah->av.udp_sport = + mlx5_get_roce_udp_sport(dev, + rdma_ah_get_port_num(ah_attr), + rdma_ah_read_grh(ah_attr)->sgid_index); + ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0x7) << 1; + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) +#define MLX5_ECN_ENABLED BIT(1) + ah->av.tclass |= MLX5_ECN_ENABLED; + } else { + ah->av.rlid = cpu_to_be16(rdma_ah_get_dlid(ah_attr)); + ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f; + ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0xf); + } + + return &ah->ibah; +} + +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) + +{ + struct mlx5_ib_ah *ah; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + enum rdma_ah_attr_type ah_type = ah_attr->type; + + if ((ah_type == RDMA_AH_ATTR_TYPE_ROCE) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return ERR_PTR(-EINVAL); + + if (ah_type == RDMA_AH_ATTR_TYPE_ROCE && udata) { + int err; + struct mlx5_ib_create_ah_resp resp = {}; + u32 min_resp_len = offsetof(typeof(resp), dmac) + + sizeof(resp.dmac); + + if (udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + resp.response_length = min_resp_len; + + memcpy(resp.dmac, ah_attr->roce.dmac, ETH_ALEN); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + return ERR_PTR(err); + } + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + return create_ib_ah(dev, ah, ah_attr); /* never fails */ +} + +int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah = to_mah(ibah); + u32 tmp; + + memset(ah_attr, 0, sizeof(*ah_attr)); + ah_attr->type = ibah->type; + + tmp = be32_to_cpu(ah->av.grh_gid_fl); + if (tmp & (1 << 30)) { + rdma_ah_set_grh(ah_attr, NULL, + tmp & 0xfffff, + (tmp >> 20) & 0xff, + ah->av.hop_limit, + ah->av.tclass); + rdma_ah_set_dgid_raw(ah_attr, ah->av.rgid); + } + rdma_ah_set_dlid(ah_attr, be16_to_cpu(ah->av.rlid)); + rdma_ah_set_static_rate(ah_attr, ah->av.stat_rate_sl >> 4); + rdma_ah_set_sl(ah_attr, ah->av.stat_rate_sl & 0xf); + + return 0; +} + +int mlx5_ib_destroy_ah(struct ib_ah *ah) +{ + kfree(to_mah(ah)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c new file mode 100644 index 0000000..188512b --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "cmd.h" + +int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + int err; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *null_mkey = MLX5_GET(query_special_contexts_out, out, + null_mkey); + return err; +} + +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out, int out_size) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_params_in)] = { }; + + MLX5_SET(query_cong_params_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_PARAMS); + MLX5_SET(query_cong_params_in, in, cong_protocol, cong_point); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); +} + +int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev, + void *in, int in_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_cong_params_out)] = { }; + + return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out)); +} + +int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, + u64 length, u32 alignment) +{ + struct mlx5_core_dev *dev = memic->dev; + u64 num_memic_hw_pages = MLX5_CAP_DEV_MEM(dev, memic_bar_size) + >> PAGE_SHIFT; + u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr); + u32 max_alignment = MLX5_CAP_DEV_MEM(dev, log_max_memic_addr_alignment); + u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE); + u32 out[MLX5_ST_SZ_DW(alloc_memic_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_memic_in)] = {}; + u32 mlx5_alignment; + u64 page_idx = 0; + int ret = 0; + + if (!length || (length & MLX5_MEMIC_ALLOC_SIZE_MASK)) + return -EINVAL; + + /* mlx5 device sets alignment as 64*2^driver_value + * so normalizing is needed. + */ + mlx5_alignment = (alignment < MLX5_MEMIC_BASE_ALIGN) ? 
0 : + alignment - MLX5_MEMIC_BASE_ALIGN; + if (mlx5_alignment > max_alignment) + return -EINVAL; + + MLX5_SET(alloc_memic_in, in, opcode, MLX5_CMD_OP_ALLOC_MEMIC); + MLX5_SET(alloc_memic_in, in, range_size, num_pages * PAGE_SIZE); + MLX5_SET(alloc_memic_in, in, memic_size, length); + MLX5_SET(alloc_memic_in, in, log_memic_addr_alignment, + mlx5_alignment); + + while (page_idx < num_memic_hw_pages) { + spin_lock(&memic->memic_lock); + page_idx = bitmap_find_next_zero_area(memic->memic_alloc_pages, + num_memic_hw_pages, + page_idx, + num_pages, 0); + + if (page_idx < num_memic_hw_pages) + bitmap_set(memic->memic_alloc_pages, + page_idx, num_pages); + + spin_unlock(&memic->memic_lock); + + if (page_idx >= num_memic_hw_pages) + break; + + MLX5_SET64(alloc_memic_in, in, range_start_addr, + hw_start_addr + (page_idx * PAGE_SIZE)); + + ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (ret) { + spin_lock(&memic->memic_lock); + bitmap_clear(memic->memic_alloc_pages, + page_idx, num_pages); + spin_unlock(&memic->memic_lock); + + if (ret == -EAGAIN) { + page_idx++; + continue; + } + + return ret; + } + + *addr = pci_resource_start(dev->pdev, 0) + + MLX5_GET64(alloc_memic_out, out, memic_start_addr); + + return 0; + } + + return -ENOMEM; +} + +int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length) +{ + struct mlx5_core_dev *dev = memic->dev; + u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr); + u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE); + u32 out[MLX5_ST_SZ_DW(dealloc_memic_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_memic_in)] = {0}; + u64 start_page_idx; + int err; + + addr -= pci_resource_start(dev->pdev, 0); + start_page_idx = (addr - hw_start_addr) >> PAGE_SHIFT; + + MLX5_SET(dealloc_memic_in, in, opcode, MLX5_CMD_OP_DEALLOC_MEMIC); + MLX5_SET64(dealloc_memic_in, in, memic_start_addr, addr); + MLX5_SET(dealloc_memic_in, in, memic_size, length); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + + if (!err) { + spin_lock(&memic->memic_lock); + bitmap_clear(memic->memic_alloc_pages, + start_page_idx, num_pages); + spin_unlock(&memic->memic_lock); + } + + return err; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h new file mode 100644 index 0000000..e7206c8 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_IB_CMD_H
+#define MLX5_IB_CMD_H
+
+#include "mlx5_ib.h"
+#include <linux/kernel.h>
+#include <linux/mlx5/driver.h>
+
+int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey);
+int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
+                               void *out, int out_size);
+int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev,
+                                void *in, int in_size);
+int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr,
+                         u64 length, u32 alignment);
+int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length);
+#endif /* MLX5_IB_CMD_H */
diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c
new file mode 100644
index 0000000..985fa26
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/cong.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2013-2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include + +#include "mlx5_ib.h" +#include "cmd.h" + +enum mlx5_ib_cong_node_type { + MLX5_IB_RROCE_ECN_RP = 1, + MLX5_IB_RROCE_ECN_NP = 2, +}; + +static const char * const mlx5_ib_dbg_cc_name[] = { + "rp_clamp_tgt_rate", + "rp_clamp_tgt_rate_ati", + "rp_time_reset", + "rp_byte_reset", + "rp_threshold", + "rp_ai_rate", + "rp_hai_rate", + "rp_min_dec_fac", + "rp_min_rate", + "rp_rate_to_set_on_first_cnp", + "rp_dce_tcp_g", + "rp_dce_tcp_rtt", + "rp_rate_reduce_monitor_period", + "rp_initial_alpha_value", + "rp_gd", + "np_cnp_dscp", + "np_cnp_prio_mode", + "np_cnp_prio", +}; + +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATTR BIT(1) +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR BIT(2) +#define MLX5_IB_RP_TIME_RESET_ATTR BIT(3) +#define MLX5_IB_RP_BYTE_RESET_ATTR BIT(4) +#define MLX5_IB_RP_THRESHOLD_ATTR BIT(5) +#define MLX5_IB_RP_AI_RATE_ATTR BIT(7) +#define MLX5_IB_RP_HAI_RATE_ATTR BIT(8) +#define MLX5_IB_RP_MIN_DEC_FAC_ATTR BIT(9) +#define MLX5_IB_RP_MIN_RATE_ATTR BIT(10) +#define MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR BIT(11) +#define MLX5_IB_RP_DCE_TCP_G_ATTR BIT(12) +#define MLX5_IB_RP_DCE_TCP_RTT_ATTR BIT(13) +#define MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR BIT(14) +#define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR BIT(15) +#define MLX5_IB_RP_GD_ATTR BIT(16) + +#define MLX5_IB_NP_CNP_DSCP_ATTR BIT(3) +#define MLX5_IB_NP_CNP_PRIO_MODE_ATTR BIT(4) + +static enum mlx5_ib_cong_node_type +mlx5_ib_param_to_node(enum mlx5_ib_dbg_cc_types param_offset) +{ + if (param_offset >= MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE && + param_offset <= MLX5_IB_DBG_CC_RP_GD) + return MLX5_IB_RROCE_ECN_RP; + else + return MLX5_IB_RROCE_ECN_NP; +} + +static u32 mlx5_get_cc_param_val(void *field, int offset) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate); + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc); + case MLX5_IB_DBG_CC_RP_TIME_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset); + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset); + case MLX5_IB_DBG_CC_RP_THRESHOLD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_threshold); + case MLX5_IB_DBG_CC_RP_AI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate); + case MLX5_IB_DBG_CC_RP_HAI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate); + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac); + case MLX5_IB_DBG_CC_RP_MIN_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate); + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp); + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g); + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt); + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period); + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value); + case MLX5_IB_DBG_CC_RP_GD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_gd); + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_dscp); + case 
MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_prio_mode); + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_802p_prio); + default: + return 0; + } +} + +static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, + u32 var, u32 *attr_mask) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate, var); + break; + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc, var); + break; + case MLX5_IB_DBG_CC_RP_TIME_RESET: + *attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset, var); + break; + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + *attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset, var); + break; + case MLX5_IB_DBG_CC_RP_THRESHOLD: + *attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_threshold, var); + break; + case MLX5_IB_DBG_CC_RP_AI_RATE: + *attr_mask |= MLX5_IB_RP_AI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_HAI_RATE: + *attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + *attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_RATE: + *attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + *attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + *attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + *attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + *attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period, var); + break; + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + *attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value, var); + break; + case MLX5_IB_DBG_CC_RP_GD: + *attr_mask |= MLX5_IB_RP_GD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_gd, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + *attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_dscp, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0); + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var); + break; + } +} + +static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, u8 port_num, + int offset, u32 *var) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out); + 
void *out; + void *field; + int err; + enum mlx5_ib_cong_node_type node; + struct mlx5_core_dev *mdev; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return -ENODEV; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) { + err = -ENOMEM; + goto alloc_err; + } + + node = mlx5_ib_param_to_node(offset); + + err = mlx5_cmd_query_cong_params(mdev, node, out, outlen); + if (err) + goto free; + + field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters); + *var = mlx5_get_cc_param_val(field, offset); + +free: + kvfree(out); +alloc_err: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + return err; +} + +static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u8 port_num, + int offset, u32 var) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in); + void *in; + void *field; + enum mlx5_ib_cong_node_type node; + struct mlx5_core_dev *mdev; + u32 attr_mask = 0; + int err; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return -ENODEV; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto alloc_err; + } + + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + + node = mlx5_ib_param_to_node(offset); + MLX5_SET(modify_cong_params_in, in, cong_protocol, node); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters); + mlx5_ib_set_cc_param_mask_val(field, offset, var, &attr_mask); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp, + attr_mask); + + err = mlx5_cmd_modify_cong_params(mdev, in, inlen); + kvfree(in); +alloc_err: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + return err; +} + +static ssize_t set_param(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + char lbuf[11] = { }; + u32 var; + int ret; + + if (count > sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = '\0'; + + if (kstrtou32(lbuf, 0, &var)) + return -EINVAL; + + ret = mlx5_ib_set_cc_params(param->dev, param->port_num, offset, var); + return ret ? 
ret : count; +} + +static ssize_t get_param(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + u32 var = 0; + int ret; + char lbuf[11]; + + if (*pos) + return 0; + + ret = mlx5_ib_get_cc_params(param->dev, param->port_num, offset, &var); + if (ret) + return ret; + + ret = snprintf(lbuf, sizeof(lbuf), "%d\n", var); + if (ret < 0) + return ret; + + if (copy_to_user(buf, lbuf, ret)) + return -EFAULT; + + *pos += ret; + return ret; +} + +static const struct file_operations dbg_cc_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = set_param, + .read = get_param, +}; + +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num) +{ + if (!mlx5_debugfs_root || + !dev->port[port_num].dbg_cc_params || + !dev->port[port_num].dbg_cc_params->root) + return; + + debugfs_remove_recursive(dev->port[port_num].dbg_cc_params->root); + kfree(dev->port[port_num].dbg_cc_params); + dev->port[port_num].dbg_cc_params = NULL; +} + +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num) +{ + struct mlx5_ib_dbg_cc_params *dbg_cc_params; + struct mlx5_core_dev *mdev; + int i; + + if (!mlx5_debugfs_root) + goto out; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + goto out; + + if (!MLX5_CAP_GEN(mdev, cc_query_allowed) || + !MLX5_CAP_GEN(mdev, cc_modify_allowed)) + goto put_mdev; + + dbg_cc_params = kzalloc(sizeof(*dbg_cc_params), GFP_KERNEL); + if (!dbg_cc_params) + goto err; + + dev->port[port_num].dbg_cc_params = dbg_cc_params; + + dbg_cc_params->root = debugfs_create_dir("cc_params", + mdev->priv.dbg_root); + if (!dbg_cc_params->root) + goto err; + + for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) { + dbg_cc_params->params[i].offset = i; + dbg_cc_params->params[i].dev = dev; + dbg_cc_params->params[i].port_num = port_num; + dbg_cc_params->params[i].dentry = + debugfs_create_file(mlx5_ib_dbg_cc_name[i], + 0600, dbg_cc_params->root, + &dbg_cc_params->params[i], + &dbg_cc_fops); + if (!dbg_cc_params->params[i].dentry) + goto err; + } + +put_mdev: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); +out: + return 0; + +err: + mlx5_ib_warn(dev, "cong debugfs failure\n"); + mlx5_ib_cleanup_cong_debugfs(dev, port_num); + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + + /* + * We don't want to fail driver if debugfs failed to initialize, + * so we are not forwarding error to the user. + */ + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c new file mode 100644 index 0000000..77d257e --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -0,0 +1,1464 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kref.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_cache.h>
+#include "mlx5_ib.h"
+
+static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq)
+{
+    struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+
+    ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
+{
+    struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq);
+    struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+    struct ib_cq *ibcq = &cq->ibcq;
+    struct ib_event event;
+
+    if (type != MLX5_EVENT_TYPE_CQ_ERROR) {
+        mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n",
+                     type, mcq->cqn);
+        return;
+    }
+
+    if (ibcq->event_handler) {
+        event.device = &dev->ib_dev;
+        event.event = IB_EVENT_CQ_ERR;
+        event.element.cq = ibcq;
+        ibcq->event_handler(&event, ibcq->cq_context);
+    }
+}
+
+static void *get_cqe(struct mlx5_ib_cq *cq, int n)
+{
+    return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
+}
+
+static u8 sw_ownership_bit(int n, int nent)
+{
+    return (n & nent) ? 1 : 0;
+}
+
+static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n)
+{
+    void *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+    struct mlx5_cqe64 *cqe64;
+
+    cqe64 = (cq->mcq.cqe_sz == 64) ?
cqe : cqe + 64; + + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } +} + +static void *next_cqe_sw(struct mlx5_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx) +{ + switch (wq->wr_data[idx]) { + case MLX5_IB_WR_UMR: + return 0; + + case IB_WR_LOCAL_INV: + return IB_WC_LOCAL_INV; + + case IB_WR_REG_MR: + return IB_WC_REG_MR; + + default: + pr_warn("unknown completion status\n"); + return 0; + } +} + +static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_wq *wq, int idx) +{ + wc->wc_flags = 0; + switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + /* fall through */ + case MLX5_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX5_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + /* fall through */ + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX5_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX5_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_UMR: + wc->opcode = get_umr_comp(wq, idx); + break; + } +} + +enum { + MLX5_GRH_IN_BUFFER = 1, + MLX5_GRH_IN_CQE = 2, +}; + +static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_qp *qp) +{ + enum rdma_link_layer ll = rdma_port_get_link_layer(qp->ibqp.device, 1); + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_ib_srq *srq; + struct mlx5_ib_wq *wq; + u16 wqe_ctr; + u8 roce_packet_type; + bool vlan_present; + u8 g; + + if (qp->ibqp.srq || qp->ibqp.xrcd) { + struct mlx5_core_srq *msrq = NULL; + + if (qp->ibqp.xrcd) { + msrq = mlx5_core_get_srq(dev->mdev, + be32_to_cpu(cqe->srqn)); + srq = to_mibsrq(msrq); + } else { + srq = to_msrq(qp->ibqp.srq); + } + if (srq) { + wqe_ctr = be16_to_cpu(cqe->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + if (msrq && atomic_dec_and_test(&msrq->refcount)) + complete(&msrq->free); + } + } else { + wq = &qp->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (cqe->op_own >> 4) { + case MLX5_CQE_RESP_WR_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_IP_CSUM_OK; + if (unlikely(!((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK)))) + wc->wc_flags = 0; + break; + case MLX5_CQE_RESP_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND_INV: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey); + break; + } + wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff; + wc->dlid_path_bits = cqe->ml_path; + g = (be32_to_cpu(cqe->flags_rqpn) >> 
28) & 3; + wc->wc_flags |= g ? IB_WC_GRH : 0; + if (unlikely(is_qp1(qp->ibqp.qp_type))) { + u16 pkey = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; + + ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey, + &wc->pkey_index); + } else { + wc->pkey_index = 0; + } + + if (ll != IB_LINK_LAYER_ETHERNET) { + wc->slid = be16_to_cpu(cqe->slid); + wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf; + return; + } + + wc->slid = 0; + vlan_present = cqe->l4_l3_hdr_type & 0x1; + roce_packet_type = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0x3; + if (vlan_present) { + wc->vlan_id = (be16_to_cpu(cqe->vlan_info)) & 0xfff; + wc->sl = (be16_to_cpu(cqe->vlan_info) >> 13) & 0x7; + wc->wc_flags |= IB_WC_WITH_VLAN; + } else { + wc->sl = 0; + } + + switch (roce_packet_type) { + case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH: + wc->network_hdr_type = RDMA_NETWORK_IB; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6: + wc->network_hdr_type = RDMA_NETWORK_IPV6; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4: + wc->network_hdr_type = RDMA_NETWORK_IPV4; + break; + } + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; +} + +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +{ + mlx5_ib_warn(dev, "dump error cqe\n"); + mlx5_dump_err_cqe(dev->mdev, cqe); +} + +static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, + struct mlx5_err_cqe *cqe, + struct ib_wc *wc) +{ + int dump = 1; + + switch (cqe->syndrome) { + case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: + dump = 0; + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX5_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX5_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IB_WC_REM_INV_REQ_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_synd; + if (dump) + dump_cqe(dev, cqe); +} + +static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx) +{ + /* TBD: waiting decision + */ + return 0; +} + +static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx) +{ + struct mlx5_wqe_data_seg *dpseg; + void *addr; + + dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr); + return addr; +} + +static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + uint16_t idx) +{ + void *addr; + int byte_count; + int i; + + if (!is_atomic_response(qp, idx)) + return; + + byte_count = be32_to_cpu(cqe64->byte_cnt); + addr = mlx5_get_atomic_laddr(qp, idx); + + if (byte_count == 4) { + *(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr)); + 
} else { + for (i = 0; i < byte_count; i += 8) { + *(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr)); + addr += 8; + } + } + + return; +} + +static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 tail, u16 head) +{ + u16 idx; + + do { + idx = tail & (qp->sq.wqe_cnt - 1); + handle_atomic(qp, cqe64, idx); + if (idx == head) + break; + + tail = qp->sq.w_list[idx].next; + } while (1); + tail = qp->sq.w_list[idx].next; + qp->sq.last_poll = tail; +} + +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_frag_buf_free(dev->mdev, &buf->fbc.frag_buf); +} + +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, + struct ib_sig_err *item) +{ + u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR (1 << 13) +#define APPTAG_ERR (1 << 12) +#define REFTAG_ERR (1 << 11) + + if (syndrome & GUARD_ERR) { + item->err_type = IB_SIG_BAD_GUARD; + item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16; + item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; + } else + if (syndrome & REFTAG_ERR) { + item->err_type = IB_SIG_BAD_REFTAG; + item->expected = be32_to_cpu(cqe->expected_reftag); + item->actual = be32_to_cpu(cqe->actual_reftag); + } else + if (syndrome & APPTAG_ERR) { + item->err_type = IB_SIG_BAD_APPTAG; + item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; + item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; + } else { + pr_err("Got signature completion error with bad syndrome %04x\n", + syndrome); + } + + item->sig_err_offset = be64_to_cpu(cqe->err_offset); + item->key = be32_to_cpu(cqe->mkey); +} + +static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_wq *wq; + unsigned int cur; + unsigned int idx; + int np; + int i; + + wq = &qp->sq; + cur = wq->head - wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + idx = wq->last_poll & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[idx]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + np++; + wc->qp = &qp->ibqp; + wc++; + wq->last_poll = wq->w_list[idx].next; + } + *npolled = np; +} + +static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_wq *wq; + unsigned int cur; + int np; + int i; + + wq = &qp->rq; + cur = wq->head - wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + np++; + wc->qp = &qp->ibqp; + wc++; + } + *npolled = np; +} + +static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx5_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and return mmics ones */ + list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) { + sw_send_comp(qp, num_entries, wc + *npolled, npolled); + if (*npolled >= num_entries) + return; + } + + list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) { + sw_recv_comp(qp, num_entries, wc + *npolled, npolled); + if (*npolled >= num_entries) + return; + } +} + +static int mlx5_poll_one(struct mlx5_ib_cq *cq, + struct mlx5_ib_qp **cur_qp, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_err_cqe *err_cqe; + struct mlx5_cqe64 *cqe64; + struct mlx5_core_qp 
*mqp; + struct mlx5_ib_wq *wq; + struct mlx5_sig_err_cqe *sig_err_cqe; + struct mlx5_core_mkey *mmkey; + struct mlx5_ib_mr *mr; + uint8_t opcode; + uint32_t qpn; + u16 wqe_ctr; + void *cqe; + int idx; + +repoll: + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + ++cq->mcq.cons_index; + + /* Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + opcode = cqe64->op_own >> 4; + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { + if (likely(cq->resize_buf)) { + free_cq_buf(dev, &cq->buf); + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + goto repoll; + } else { + mlx5_ib_warn(dev, "unexpected resize cqe\n"); + } + } + + qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; + if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { + /* We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + mqp = __mlx5_qp_lookup(dev->mdev, qpn); + *cur_qp = to_mibqp(mqp); + } + + wc->qp = &(*cur_qp)->ibqp; + switch (opcode) { + case MLX5_CQE_REQ: + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + handle_good_req(wc, cqe64, wq, idx); + handle_atomics(*cur_qp, cqe64, wq->last_poll, idx); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESP_WR_IMM: + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + handle_responder(wc, cqe64, *cur_qp); + wc->status = IB_WC_SUCCESS; + break; + case MLX5_CQE_RESIZE_CQ: + break; + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + err_cqe = (struct mlx5_err_cqe *)cqe64; + mlx5_handle_error_cqe(dev, err_cqe, wc); + mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n", + opcode == MLX5_CQE_REQ_ERR ? 
+ "Requestor" : "Responder", cq->mcq.cqn); + mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", + err_cqe->syndrome, err_cqe->vendor_err_synd); + if (opcode == MLX5_CQE_REQ_ERR) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + } else { + struct mlx5_ib_srq *srq; + + if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + } + break; + case MLX5_CQE_SIG_ERR: + sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + + read_lock(&dev->mdev->priv.mkey_table.lock); + mmkey = __mlx5_mr_lookup(dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + mr = to_mibmr(mmkey); + get_sig_err_item(sig_err_cqe, &mr->sig->err_item); + mr->sig->sig_err_exists = true; + mr->sig->sigerr_count++; + + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", + cq->mcq.cqn, mr->sig->err_item.key, + mr->sig->err_item.err_type, + mr->sig->err_item.sig_err_offset, + mr->sig->err_item.expected, + mr->sig->err_item.actual); + + read_unlock(&dev->mdev->priv.mkey_table.lock); + goto repoll; + } + + return 0; +} + +static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_ib_wc *soft_wc, *next; + int npolled = 0; + + list_for_each_entry_safe(soft_wc, next, &cq->wc_list, list) { + if (npolled >= num_entries) + break; + + mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n", + cq->mcq.cqn); + + wc[npolled++] = soft_wc->wc; + list_del(&soft_wc->list); + kfree(soft_wc); + } + + return npolled; +} + +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_ib_qp *cur_qp = NULL; + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int soft_polled = 0; + int npolled; + + spin_lock_irqsave(&cq->lock, flags); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + goto out; + } + + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc); + + for (npolled = 0; npolled < num_entries - soft_polled; npolled++) { + if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled)) + break; + } + + if (npolled) + mlx5_cq_set_ci(&cq->mcq); +out: + spin_unlock_irqrestore(&cq->lock, flags); + + return soft_polled + npolled; +} + +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + struct mlx5_ib_cq *cq = to_mcq(ibcq); + void __iomem *uar_page = mdev->priv.uar->map; + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, irq_flags); + if (cq->notify_flags != IB_CQ_NEXT_COMP) + cq->notify_flags = flags & IB_CQ_SOLICITED_MASK; + + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !list_empty(&cq->wc_list)) + ret = 1; + spin_unlock_irqrestore(&cq->lock, irq_flags); + + mlx5_cq_arm(&cq->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? 
+ MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, + uar_page, to_mcq(ibcq)->mcq.cons_index); + + return ret; +} + +static int alloc_cq_frag_buf(struct mlx5_ib_dev *dev, + struct mlx5_ib_cq_buf *buf, + int nent, + int cqe_size) +{ + struct mlx5_frag_buf_ctrl *c = &buf->fbc; + struct mlx5_frag_buf *frag_buf = &c->frag_buf; + u32 cqc_buff[MLX5_ST_SZ_DW(cqc)] = {0}; + int err; + + MLX5_SET(cqc, cqc_buff, log_cq_size, ilog2(cqe_size)); + MLX5_SET(cqc, cqc_buff, cqe_sz, (cqe_size == 128) ? 1 : 0); + + mlx5_core_init_cq_frag_buf(&buf->fbc, cqc_buff); + + err = mlx5_frag_buf_alloc_node(dev->mdev, + nent * cqe_size, + frag_buf, + dev->mdev->priv.numa_node); + if (err) + return err; + + buf->cqe_size = cqe_size; + buf->nent = nent; + + return 0; +} + +static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, + struct ib_ucontext *context, struct mlx5_ib_cq *cq, + int entries, u32 **cqb, + int *cqe_size, int *index, int *inlen) +{ + struct mlx5_ib_create_cq ucmd = {}; + size_t ucmdlen; + int page_shift; + __be64 *pas; + int npages; + int ncont; + void *cqc; + int err; + + ucmdlen = udata->inlen < sizeof(ucmd) ? + (sizeof(ucmd) - sizeof(ucmd.flags)) : sizeof(ucmd); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) + return -EFAULT; + + if (ucmdlen == sizeof(ucmd) && + (ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD))) + return -EINVAL; + + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) + return -EINVAL; + + *cqe_size = ucmd.cqe_size; + + cq->buf.umem = ib_umem_get(context, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->buf.umem)) { + err = PTR_ERR(cq->buf.umem); + return err; + } + + err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + &cq->db); + if (err) + goto err_umem; + + mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, 0, &npages, &page_shift, + &ncont, NULL); + mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n", + ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont); + + *inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * ncont; + *cqb = kvzalloc(*inlen, GFP_KERNEL); + if (!*cqb) { + err = -ENOMEM; + goto err_db; + } + + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas); + mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, pas, 0); + + cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context); + MLX5_SET(cqc, cqc, log_page_size, + page_shift - MLX5_ADAPTER_PAGE_SHIFT); + + *index = to_mucontext(context)->bfregi.sys_pages[0]; + + if (ucmd.cqe_comp_en == 1) { + if (!((*cqe_size == 128 && + MLX5_CAP_GEN(dev->mdev, cqe_compression_128)) || + (*cqe_size == 64 && + MLX5_CAP_GEN(dev->mdev, cqe_compression)))) { + err = -EOPNOTSUPP; + mlx5_ib_warn(dev, "CQE compression is not supported for size %d!\n", + *cqe_size); + goto err_cqb; + } + + if (unlikely(!ucmd.cqe_comp_res_format || + !(ucmd.cqe_comp_res_format < + MLX5_IB_CQE_RES_RESERVED) || + (ucmd.cqe_comp_res_format & + (ucmd.cqe_comp_res_format - 1)))) { + err = -EOPNOTSUPP; + mlx5_ib_warn(dev, "CQE compression res format %d is not supported!\n", + ucmd.cqe_comp_res_format); + goto err_cqb; + } + + MLX5_SET(cqc, cqc, cqe_comp_en, 1); + MLX5_SET(cqc, cqc, mini_cqe_res_format, + ilog2(ucmd.cqe_comp_res_format)); + } + + if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD) { + if (*cqe_size != 128 || + !MLX5_CAP_GEN(dev->mdev, cqe_128_always)) { + err = -EOPNOTSUPP; + mlx5_ib_warn(dev, + "CQE padding is not supported for CQE size of %dB!\n", + *cqe_size); + goto err_cqb; + } + + cq->private_flags |= 
MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD; + } + + return 0; + +err_cqb: + kfree(*cqb); + +err_db: + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + +err_umem: + ib_umem_release(cq->buf.umem); + return err; +} + +static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) +{ + mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); + ib_umem_release(cq->buf.umem); +} + +static void init_cq_frag_buf(struct mlx5_ib_cq *cq, + struct mlx5_ib_cq_buf *buf) +{ + int i; + void *cqe; + struct mlx5_cqe64 *cqe64; + + for (i = 0; i < buf->nent; i++) { + cqe = get_cqe(cq, i); + cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size, + u32 **cqb, int *index, int *inlen) +{ + __be64 *pas; + void *cqc; + int err; + + err = mlx5_db_alloc(dev->mdev, &cq->db); + if (err) + return err; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + cq->mcq.cqe_sz = cqe_size; + + err = alloc_cq_frag_buf(dev, &cq->buf, entries, cqe_size); + if (err) + goto err_db; + + init_cq_frag_buf(cq, &cq->buf); + + *inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * + cq->buf.fbc.frag_buf.npages; + *cqb = kvzalloc(*inlen, GFP_KERNEL); + if (!*cqb) { + err = -ENOMEM; + goto err_buf; + } + + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas); + mlx5_fill_page_frag_array(&cq->buf.fbc.frag_buf, pas); + + cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context); + MLX5_SET(cqc, cqc, log_page_size, + cq->buf.fbc.frag_buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + + *index = dev->mdev->priv.uar->index; + + return 0; + +err_buf: + free_cq_buf(dev, &cq->buf); + +err_db: + mlx5_db_free(dev->mdev, &cq->db); + return err; +} + +static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, &cq->buf); + mlx5_db_free(dev->mdev, &cq->db); +} + +static void notify_soft_wc_handler(struct work_struct *work) +{ + struct mlx5_ib_cq *cq = container_of(work, struct mlx5_ib_cq, + notify_work); + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + int vector = attr->comp_vector; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_cq *cq; + int uninitialized_var(index); + int uninitialized_var(inlen); + u32 *cqb = NULL; + void *cqc; + int cqe_size; + unsigned int irqn; + int eqn; + int err; + + if (entries < 0 || + (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))) + return ERR_PTR(-EINVAL); + + if (check_cq_create_flags(attr->flags)) + return ERR_PTR(-EOPNOTSUPP); + + entries = roundup_pow_of_two(entries + 1); + if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) + return ERR_PTR(-EINVAL); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + cq->create_flags = attr->flags; + INIT_LIST_HEAD(&cq->list_send_qp); + INIT_LIST_HEAD(&cq->list_recv_qp); + + if (context) { + err = create_cq_user(dev, udata, context, cq, entries, + &cqb, &cqe_size, &index, &inlen); + if (err) + goto err_create; + } else { + cqe_size = cache_line_size() == 128 ? 
128 : 64; + err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, + &index, &inlen); + if (err) + goto err_create; + + INIT_WORK(&cq->notify_work, notify_soft_wc_handler); + } + + err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); + if (err) + goto err_cqb; + + cq->cqe_size = cqe_size; + + cqc = MLX5_ADDR_OF(create_cq_in, cqb, cq_context); + MLX5_SET(cqc, cqc, cqe_sz, + cqe_sz_to_mlx_sz(cqe_size, + cq->private_flags & + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD)); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); + MLX5_SET(cqc, cqc, uar_page, index); + MLX5_SET(cqc, cqc, c_eqn, eqn); + MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); + if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN) + MLX5_SET(cqc, cqc, oi, 1); + + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); + if (err) + goto err_cqb; + + mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); + cq->mcq.irqn = irqn; + if (context) + cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; + else + cq->mcq.comp = mlx5_ib_cq_comp; + cq->mcq.event = mlx5_ib_cq_event; + + INIT_LIST_HEAD(&cq->wc_list); + + if (context) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { + err = -EFAULT; + goto err_cmd; + } + + + kvfree(cqb); + return &cq->ibcq; + +err_cmd: + mlx5_core_destroy_cq(dev->mdev, &cq->mcq); + +err_cqb: + kvfree(cqb); + if (context) + destroy_cq_user(cq, context); + else + destroy_cq_kernel(dev, cq); + +err_create: + kfree(cq); + + return ERR_PTR(err); +} + + +int mlx5_ib_destroy_cq(struct ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + struct ib_ucontext *context = NULL; + + if (cq->uobject) + context = cq->uobject->context; + + mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); + if (context) + destroy_cq_user(mcq, context); + else + destroy_cq_kernel(dev, mcq); + + kfree(mcq); + + return 0; +} + +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) +{ + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); +} + +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) +{ + struct mlx5_cqe64 *cqe64, *dest64; + void *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + if (!cq) + return; + + /* First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) + mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64; + owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; + memcpy(dest, cqe, cq->mcq.cqe_sz); + dest64->op_own = owner_bit | + (dest64->op_own & ~MLX5_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* Make sure update of buffer contents is done before + * updating consumer index. 
+ */ + wmb(); + mlx5_cq_set_ci(&cq->mcq); + } +} + +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) +{ + if (!cq) + return; + + spin_lock_irq(&cq->lock); + __mlx5_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} + +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + int err; + + if (!MLX5_CAP_GEN(dev->mdev, cq_moderation)) + return -ENOSYS; + + if (cq_period > MLX5_MAX_CQ_PERIOD) + return -EINVAL; + + err = mlx5_core_modify_cq_moderation(dev->mdev, &mcq->mcq, + cq_period, cq_count); + if (err) + mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + + return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, int *npas, + int *page_shift, int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + int npages; + struct ib_ucontext *context = cq->buf.umem->context; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + /* check multiplication overflow */ + if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1) + return -EINVAL; + + umem = ib_umem_get(context, ucmd.buf_addr, + (size_t)ucmd.cqe_size * entries, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + mlx5_ib_cont_pages(umem, ucmd.buf_addr, 0, &npages, page_shift, + npas, NULL); + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + +static void un_resize_user(struct mlx5_ib_cq *cq) +{ + ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_frag_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_frag_buf(cq, cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = mlx5_frag_buf_get_wqe(&cq->resize_buf->fbc, + (i + 1) & cq->resize_buf->nent); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? 
scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", + cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; +} + +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + void *cqc; + u32 *in; + int err; + int npas; + __be64 *pas; + int page_shift; + int inlen; + int uninitialized_var(cqe_size); + unsigned long flags; + + if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) { + pr_info("Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1 || + entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { + mlx5_ib_warn(dev, "wrong entries number %d, max %d\n", + entries, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); + return -EINVAL; + } + + entries = roundup_pow_of_two(entries + 1); + if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1) + return -EINVAL; + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, + &cqe_size); + } else { + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (!err) { + struct mlx5_frag_buf_ctrl *c; + + c = &cq->resize_buf->fbc; + npas = c->frag_buf.npages; + page_shift = c->frag_buf.page_shift; + } + } + + if (err) + goto ex; + + inlen = MLX5_ST_SZ_BYTES(modify_cq_in) + + MLX5_FLD_SZ_BYTES(modify_cq_in, pas[0]) * npas; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + pas = (__be64 *)MLX5_ADDR_OF(modify_cq_in, in, pas); + if (udata) + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, + pas, 0); + else + mlx5_fill_page_frag_array(&cq->resize_buf->fbc.frag_buf, + pas); + + MLX5_SET(modify_cq_in, in, + modify_field_select_resize_field_select.resize_field_select.resize_field_select, + MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + + cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context); + + MLX5_SET(cqc, cqc, log_page_size, + page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(cqc, cqc, cqe_sz, + cqe_sz_to_mlx_sz(cqe_size, + cq->private_flags & + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD)); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); + + MLX5_SET(modify_cq_in, in, op_mod, MLX5_CQ_OPMOD_RESIZE); + MLX5_SET(modify_cq_in, in, cqn, cq->mcq.cqn); + + err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen); + if (err) + goto ex_alloc; + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + spin_unlock_irqrestore(&cq->lock, flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + kvfree(in); + return 0; + +ex_alloc: + kvfree(in); + +ex_resize: + if (udata) + un_resize_user(cq); + else + un_resize_kernel(dev, cq); +ex: + mutex_unlock(&cq->resize_mutex); + return err; +} + +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) +{ + struct mlx5_ib_cq *cq; + + if (!ibcq) + return 128; + + cq = 
to_mcq(ibcq); + return cq->cqe_size; +} + +/* Called from atomic context */ +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc) +{ + struct mlx5_ib_wc *soft_wc; + struct mlx5_ib_cq *cq = to_mcq(ibcq); + unsigned long flags; + + soft_wc = kmalloc(sizeof(*soft_wc), GFP_ATOMIC); + if (!soft_wc) + return -ENOMEM; + + soft_wc->wc = *wc; + spin_lock_irqsave(&cq->lock, flags); + list_add_tail(&soft_wc->list, &cq->wc_list); + if (cq->notify_flags == IB_CQ_NEXT_COMP || + wc->status != IB_WC_SUCCESS) { + cq->notify_flags = 0; + schedule_work(&cq->notify_work); + } + spin_unlock_irqrestore(&cq->lock, flags); + + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c new file mode 100644 index 0000000..a0e4e6d --- /dev/null +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <linux/kref.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+
+#include "mlx5_ib.h"
+
+struct mlx5_ib_user_db_page {
+    struct list_head list;
+    struct ib_umem *umem;
+    unsigned long user_virt;
+    int refcnt;
+};
+
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+                        struct mlx5_db *db)
+{
+    struct mlx5_ib_user_db_page *page;
+    int err = 0;
+
+    mutex_lock(&context->db_page_mutex);
+
+    list_for_each_entry(page, &context->db_page_list, list)
+        if (page->user_virt == (virt & PAGE_MASK))
+            goto found;
+
+    page = kmalloc(sizeof(*page), GFP_KERNEL);
+    if (!page) {
+        err = -ENOMEM;
+        goto out;
+    }
+
+    page->user_virt = (virt & PAGE_MASK);
+    page->refcnt = 0;
+    page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
+                             PAGE_SIZE, 0, 0);
+    if (IS_ERR(page->umem)) {
+        err = PTR_ERR(page->umem);
+        kfree(page);
+        goto out;
+    }
+
+    list_add(&page->list, &context->db_page_list);
+
+found:
+    db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK);
+    db->u.user_page = page;
+    ++page->refcnt;
+
+out:
+    mutex_unlock(&context->db_page_mutex);
+
+    return err;
+}
+
+void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db)
+{
+    mutex_lock(&context->db_page_mutex);
+
+    if (!--db->u.user_page->refcnt) {
+        list_del(&db->u.user_page->list);
+        ib_umem_release(db->u.user_page->umem);
+        kfree(db->u.user_page);
+    }
+
+    mutex_unlock(&context->db_page_mutex);
+}
diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c
new file mode 100644
index 0000000..79e6309
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/gsi.c
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx5_ib.h"
+
+struct mlx5_ib_gsi_wr {
+    struct ib_cqe cqe;
+    struct ib_wc wc;
+    int send_flags;
+    bool completed:1;
+};
+
+struct mlx5_ib_gsi_qp {
+    struct ib_qp ibqp;
+    struct ib_qp *rx_qp;
+    u8 port_num;
+    struct ib_qp_cap cap;
+    enum ib_sig_type sq_sig_type;
+    /* Serialize qp state modifications */
+    struct mutex mutex;
+    struct ib_cq *cq;
+    struct mlx5_ib_gsi_wr *outstanding_wrs;
+    u32 outstanding_pi, outstanding_ci;
+    int num_qps;
+    /* Protects access to the tx_qps.
Post send operations synchronize + * with tx_qp creation in setup_qp(). Also protects the + * outstanding_wrs array and indices. + */ + spinlock_t lock; + struct ib_qp **tx_qps; +}; + +static struct mlx5_ib_gsi_qp *gsi_qp(struct ib_qp *qp) +{ + return container_of(qp, struct mlx5_ib_gsi_qp, ibqp); +} + +static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn); +} + +/* Call with gsi->lock locked */ +static void generate_completions(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_cq *gsi_cq = gsi->ibqp.send_cq; + struct mlx5_ib_gsi_wr *wr; + u32 index; + + for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; + index++) { + wr = &gsi->outstanding_wrs[index % gsi->cap.max_send_wr]; + + if (!wr->completed) + break; + + if (gsi->sq_sig_type == IB_SIGNAL_ALL_WR || + wr->send_flags & IB_SEND_SIGNALED) + WARN_ON_ONCE(mlx5_ib_generate_wc(gsi_cq, &wr->wc)); + + wr->completed = false; + } + + gsi->outstanding_ci = index; +} + +static void handle_single_completion(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_gsi_qp *gsi = cq->cq_context; + struct mlx5_ib_gsi_wr *wr = + container_of(wc->wr_cqe, struct mlx5_ib_gsi_wr, cqe); + u64 wr_id; + unsigned long flags; + + spin_lock_irqsave(&gsi->lock, flags); + wr->completed = true; + wr_id = wr->wc.wr_id; + wr->wc = *wc; + wr->wc.wr_id = wr_id; + wr->wc.qp = &gsi->ibqp; + + generate_completions(gsi); + spin_unlock_irqrestore(&gsi->lock, flags); +} + +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_gsi_qp *gsi; + struct ib_qp_init_attr hw_init_attr = *init_attr; + const u8 port_num = init_attr->port_num; + const int num_pkeys = pd->device->attrs.max_pkeys; + const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? num_pkeys : 0; + int ret; + + mlx5_ib_dbg(dev, "creating GSI QP\n"); + + if (port_num > ARRAY_SIZE(dev->devr.ports) || port_num < 1) { + mlx5_ib_warn(dev, + "invalid port number %d during GSI QP creation\n", + port_num); + return ERR_PTR(-EINVAL); + } + + gsi = kzalloc(sizeof(*gsi), GFP_KERNEL); + if (!gsi) + return ERR_PTR(-ENOMEM); + + gsi->tx_qps = kcalloc(num_qps, sizeof(*gsi->tx_qps), GFP_KERNEL); + if (!gsi->tx_qps) { + ret = -ENOMEM; + goto err_free; + } + + gsi->outstanding_wrs = kcalloc(init_attr->cap.max_send_wr, + sizeof(*gsi->outstanding_wrs), + GFP_KERNEL); + if (!gsi->outstanding_wrs) { + ret = -ENOMEM; + goto err_free_tx; + } + + mutex_init(&gsi->mutex); + + mutex_lock(&dev->devr.mutex); + + if (dev->devr.ports[port_num - 1].gsi) { + mlx5_ib_warn(dev, "GSI QP already exists on port %d\n", + port_num); + ret = -EBUSY; + goto err_free_wrs; + } + gsi->num_qps = num_qps; + spin_lock_init(&gsi->lock); + + gsi->cap = init_attr->cap; + gsi->sq_sig_type = init_attr->sq_sig_type; + gsi->ibqp.qp_num = 1; + gsi->port_num = port_num; + + gsi->cq = ib_alloc_cq(pd->device, gsi, init_attr->cap.max_send_wr, 0, + IB_POLL_SOFTIRQ); + if (IS_ERR(gsi->cq)) { + mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", + PTR_ERR(gsi->cq)); + ret = PTR_ERR(gsi->cq); + goto err_free_wrs; + } + + hw_init_attr.qp_type = MLX5_IB_QPT_HW_GSI; + hw_init_attr.send_cq = gsi->cq; + if (num_qps) { + hw_init_attr.cap.max_send_wr = 0; + hw_init_attr.cap.max_send_sge = 0; + hw_init_attr.cap.max_inline_data = 0; + } + gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); + if (IS_ERR(gsi->rx_qp)) { + mlx5_ib_warn(dev, "unable to create hardware GSI QP. 
error %ld\n", + PTR_ERR(gsi->rx_qp)); + ret = PTR_ERR(gsi->rx_qp); + goto err_destroy_cq; + } + + dev->devr.ports[init_attr->port_num - 1].gsi = gsi; + + mutex_unlock(&dev->devr.mutex); + + return &gsi->ibqp; + +err_destroy_cq: + ib_free_cq(gsi->cq); +err_free_wrs: + mutex_unlock(&dev->devr.mutex); + kfree(gsi->outstanding_wrs); +err_free_tx: + kfree(gsi->tx_qps); +err_free: + kfree(gsi); + return ERR_PTR(ret); +} + +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + const int port_num = gsi->port_num; + int qp_index; + int ret; + + mlx5_ib_dbg(dev, "destroying GSI QP\n"); + + mutex_lock(&dev->devr.mutex); + ret = ib_destroy_qp(gsi->rx_qp); + if (ret) { + mlx5_ib_warn(dev, "unable to destroy hardware GSI QP. error %d\n", + ret); + mutex_unlock(&dev->devr.mutex); + return ret; + } + dev->devr.ports[port_num - 1].gsi = NULL; + mutex_unlock(&dev->devr.mutex); + gsi->rx_qp = NULL; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) { + if (!gsi->tx_qps[qp_index]) + continue; + WARN_ON_ONCE(ib_destroy_qp(gsi->tx_qps[qp_index])); + gsi->tx_qps[qp_index] = NULL; + } + + ib_free_cq(gsi->cq); + + kfree(gsi->outstanding_wrs); + kfree(gsi->tx_qps); + kfree(gsi); + + return 0; +} + +static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_pd *pd = gsi->rx_qp->pd; + struct ib_qp_init_attr init_attr = { + .event_handler = gsi->rx_qp->event_handler, + .qp_context = gsi->rx_qp->qp_context, + .send_cq = gsi->cq, + .recv_cq = gsi->rx_qp->recv_cq, + .cap = { + .max_send_wr = gsi->cap.max_send_wr, + .max_send_sge = gsi->cap.max_send_sge, + .max_inline_data = gsi->cap.max_inline_data, + }, + .sq_sig_type = gsi->sq_sig_type, + .qp_type = IB_QPT_UD, + .create_flags = mlx5_ib_create_qp_sqpn_qp1(), + }; + + return ib_create_qp(pd, &init_attr); +} + +static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, + u16 qp_index) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct ib_qp_attr attr; + int mask; + int ret; + + mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; + attr.qp_state = IB_QPS_INIT; + attr.pkey_index = qp_index; + attr.qkey = IB_QP1_QKEY; + attr.port_num = gsi->port_num; + ret = ib_modify_qp(qp, &attr, mask); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to INIT: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTR: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTS: %d\n", + qp->qp_num, ret); + return ret; + } + + return 0; +} + +static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) +{ + struct ib_device *device = gsi->rx_qp->device; + struct mlx5_ib_dev *dev = to_mdev(device); + struct ib_qp *qp; + unsigned long flags; + u16 pkey; + int ret; + + ret = ib_query_pkey(device, gsi->port_num, qp_index, &pkey); + if (ret) { + mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n", + gsi->port_num, qp_index); + return; + } + + if (!pkey) { + mlx5_ib_dbg(dev, "invalid P_Key at port %d, index %d. 
Skipping.\n", + gsi->port_num, qp_index); + return; + } + + spin_lock_irqsave(&gsi->lock, flags); + qp = gsi->tx_qps[qp_index]; + spin_unlock_irqrestore(&gsi->lock, flags); + if (qp) { + mlx5_ib_dbg(dev, "already existing GSI TX QP at port %d, index %d. Skipping\n", + gsi->port_num, qp_index); + return; + } + + qp = create_gsi_ud_qp(gsi); + if (IS_ERR(qp)) { + mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", + PTR_ERR(qp)); + return; + } + + ret = modify_to_rts(gsi, qp, qp_index); + if (ret) + goto err_destroy_qp; + + spin_lock_irqsave(&gsi->lock, flags); + WARN_ON_ONCE(gsi->tx_qps[qp_index]); + gsi->tx_qps[qp_index] = qp; + spin_unlock_irqrestore(&gsi->lock, flags); + + return; + +err_destroy_qp: + WARN_ON_ONCE(qp); +} + +static void setup_qps(struct mlx5_ib_gsi_qp *gsi) +{ + u16 qp_index; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) + setup_qp(gsi, qp_index); +} + +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mlx5_ib_dbg(dev, "modifying GSI QP to state %d\n", attr->qp_state); + + mutex_lock(&gsi->mutex); + ret = ib_modify_qp(gsi->rx_qp, attr, attr_mask); + if (ret) { + mlx5_ib_warn(dev, "unable to modify GSI rx QP: %d\n", ret); + goto unlock; + } + + if (to_mqp(gsi->rx_qp)->state == IB_QPS_RTS) + setup_qps(gsi); + +unlock: + mutex_unlock(&gsi->mutex); + + return ret; +} + +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mutex_lock(&gsi->mutex); + ret = ib_query_qp(gsi->rx_qp, qp_attr, qp_attr_mask, qp_init_attr); + qp_init_attr->cap = gsi->cap; + mutex_unlock(&gsi->mutex); + + return ret; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr, struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + struct mlx5_ib_gsi_wr *gsi_wr; + + if (gsi->outstanding_pi == gsi->outstanding_ci + gsi->cap.max_send_wr) { + mlx5_ib_warn(dev, "no available GSI work request.\n"); + return -ENOMEM; + } + + gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi % + gsi->cap.max_send_wr]; + gsi->outstanding_pi++; + + if (!wc) { + memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc)); + gsi_wr->wc.pkey_index = wr->pkey_index; + gsi_wr->wc.wr_id = wr->wr.wr_id; + } else { + gsi_wr->wc = *wc; + gsi_wr->completed = true; + } + + gsi_wr->cqe.done = &handle_single_completion; + wr->wr.wr_cqe = &gsi_wr->cqe; + + return 0; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr) +{ + struct ib_wc wc = { + { .wr_id = wr->wr.wr_id }, + .status = IB_WC_SUCCESS, + .opcode = IB_WC_SEND, + .qp = &gsi->ibqp, + }; + int ret; + + ret = mlx5_ib_add_outstanding_wr(gsi, wr, &wc); + if (ret) + return ret; + + generate_completions(gsi); + + return 0; +} + +/* Call with gsi->lock locked */ +static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + int qp_index = wr->pkey_index; + + if (!mlx5_ib_deth_sqpn_cap(dev)) + return gsi->rx_qp; + + if (qp_index >= gsi->num_qps) + return NULL; + + return gsi->tx_qps[qp_index]; +} + +int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + struct ib_qp 
*tx_qp; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ib_ud_wr cur_wr = *ud_wr(wr); + + cur_wr.wr.next = NULL; + + spin_lock_irqsave(&gsi->lock, flags); + tx_qp = get_tx_qp(gsi, &cur_wr); + if (!tx_qp) { + ret = mlx5_ib_gsi_silent_drop(gsi, &cur_wr); + if (ret) + goto err; + spin_unlock_irqrestore(&gsi->lock, flags); + continue; + } + + ret = mlx5_ib_add_outstanding_wr(gsi, &cur_wr, NULL); + if (ret) + goto err; + + ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr); + if (ret) { + /* Undo the effect of adding the outstanding wr */ + gsi->outstanding_pi = (gsi->outstanding_pi - 1) % + gsi->cap.max_send_wr; + goto err; + } + spin_unlock_irqrestore(&gsi->lock, flags); + } + + return 0; + +err: + spin_unlock_irqrestore(&gsi->lock, flags); + *bad_wr = wr; + return ret; +} + +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + + return ib_post_recv(gsi->rx_qp, wr, bad_wr); +} + +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi) +{ + if (!gsi) + return; + + mutex_lock(&gsi->mutex); + setup_qps(gsi); + mutex_unlock(&gsi->mutex); +} diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c new file mode 100644 index 0000000..0e04fdd --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -0,0 +1,192 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + */ + +#include "ib_rep.h" + +static const struct mlx5_ib_profile rep_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB, + mlx5_ib_stage_rep_flow_db_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_rep_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_ROCE, + mlx5_ib_stage_rep_roce_init, + mlx5_ib_stage_rep_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_stage_dev_res_init, + mlx5_ib_stage_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, + mlx5_ib_stage_counters_init, + mlx5_ib_stage_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, + NULL, + mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, + mlx5_ib_stage_post_ib_reg_umr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, + mlx5_ib_stage_class_attr_init, + NULL), +}; + +static int +mlx5_ib_nic_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + return 0; +} + +static void +mlx5_ib_nic_rep_unload(struct mlx5_eswitch_rep *rep) +{ + rep->rep_if[REP_IB].priv = NULL; +} + +static int +mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + struct mlx5_ib_dev *ibdev; + + ibdev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*ibdev)); + if (!ibdev) + return -ENOMEM; + + ibdev->rep = rep; + ibdev->mdev = dev; + ibdev->num_ports = max(MLX5_CAP_GEN(dev, num_ports), + MLX5_CAP_GEN(dev, num_vhca_ports)); + if (!__mlx5_ib_add(ibdev, &rep_profile)) + return -EINVAL; + + rep->rep_if[REP_IB].priv = ibdev; + + return 0; +} + +static void +mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) +{ + struct mlx5_ib_dev *dev; + + if (!rep->rep_if[REP_IB].priv) + return; + + 
dev = mlx5_ib_rep_to_dev(rep); + __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); + rep->rep_if[REP_IB].priv = NULL; +} + +static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep) +{ + return mlx5_ib_rep_to_dev(rep); +} + +static void mlx5_ib_rep_register_vf_vports(struct mlx5_ib_dev *dev) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + int total_vfs = MLX5_TOTAL_VPORTS(dev->mdev); + int vport; + + for (vport = 1; vport < total_vfs; vport++) { + struct mlx5_eswitch_rep_if rep_if = {}; + + rep_if.load = mlx5_ib_vport_rep_load; + rep_if.unload = mlx5_ib_vport_rep_unload; + rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev; + mlx5_eswitch_register_vport_rep(esw, vport, &rep_if, REP_IB); + } +} + +static void mlx5_ib_rep_unregister_vf_vports(struct mlx5_ib_dev *dev) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + int total_vfs = MLX5_TOTAL_VPORTS(dev->mdev); + int vport; + + for (vport = 1; vport < total_vfs; vport++) + mlx5_eswitch_unregister_vport_rep(esw, vport, REP_IB); +} + +void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + struct mlx5_eswitch_rep_if rep_if = {}; + + rep_if.load = mlx5_ib_nic_rep_load; + rep_if.unload = mlx5_ib_nic_rep_unload; + rep_if.get_proto_dev = mlx5_ib_vport_get_proto_dev; + rep_if.priv = dev; + + mlx5_eswitch_register_vport_rep(esw, 0, &rep_if, REP_IB); + + mlx5_ib_rep_register_vf_vports(dev); +} + +void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + + mlx5_ib_rep_unregister_vf_vports(dev); /* VFs vports */ + mlx5_eswitch_unregister_vport_rep(esw, 0, REP_IB); /* UPLINK PF*/ +} + +u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw) +{ + return mlx5_eswitch_mode(esw); +} + +struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw, + int vport_index) +{ + return mlx5_eswitch_get_proto_dev(esw, vport_index, REP_IB); +} + +struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, + int vport_index) +{ + return mlx5_eswitch_get_proto_dev(esw, vport_index, REP_ETH); +} + +struct mlx5_ib_dev *mlx5_ib_get_uplink_ibdev(struct mlx5_eswitch *esw) +{ + return mlx5_eswitch_uplink_get_proto_dev(esw, REP_IB); +} + +struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw, int vport) +{ + return mlx5_eswitch_vport_rep(esw, vport); +} + +int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + struct mlx5_flow_handle *flow_rule; + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + + if (!dev->rep) + return 0; + + flow_rule = + mlx5_eswitch_add_send_to_vport_rule(esw, + dev->rep->vport, + sq->base.mqp.qpn); + if (IS_ERR(flow_rule)) + return PTR_ERR(flow_rule); + sq->flow_rule = flow_rule; + + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h new file mode 100644 index 0000000..046fd94 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ib_rep.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. 
+ */ + +#ifndef __MLX5_IB_REP_H__ +#define __MLX5_IB_REP_H__ + +#include +#include "mlx5_ib.h" + +#ifdef CONFIG_MLX5_ESWITCH +u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw); +struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw, + int vport_index); +struct mlx5_ib_dev *mlx5_ib_get_uplink_ibdev(struct mlx5_eswitch *esw); +struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw, + int vport_index); +void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev); +void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev); +int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq); +struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, + int vport_index); +#else /* CONFIG_MLX5_ESWITCH */ +static inline u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw) +{ + return SRIOV_NONE; +} + +static inline +struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw, + int vport_index) +{ + return NULL; +} + +static inline +struct mlx5_ib_dev *mlx5_ib_get_uplink_ibdev(struct mlx5_eswitch *esw) +{ + return NULL; +} + +static inline +struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw, + int vport_index) +{ + return NULL; +} + +static inline void mlx5_ib_register_vport_reps(struct mlx5_ib_dev *dev) {} +static inline void mlx5_ib_unregister_vport_reps(struct mlx5_ib_dev *dev) {} +static inline int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + return 0; +} + +static inline +struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, + int vport_index) +{ + return NULL; +} +#endif + +static inline +struct mlx5_ib_dev *mlx5_ib_rep_to_dev(struct mlx5_eswitch_rep *rep) +{ + return (struct mlx5_ib_dev *)rep->rep_if[REP_IB].priv; +} +#endif /* __MLX5_IB_REP_H__ */ diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c new file mode 100644 index 0000000..649a336 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ib_virt.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include "mlx5_ib.h" + +static inline u32 mlx_to_net_policy(enum port_state_policy mlx_policy) +{ + switch (mlx_policy) { + case MLX5_POLICY_DOWN: + return IFLA_VF_LINK_STATE_DISABLE; + case MLX5_POLICY_UP: + return IFLA_VF_LINK_STATE_ENABLE; + case MLX5_POLICY_FOLLOW: + return IFLA_VF_LINK_STATE_AUTO; + default: + return __IFLA_VF_LINK_STATE_MAX; + } +} + +int mlx5_ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *rep; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + err = mlx5_query_hca_vport_context(mdev, 1, 1, vf + 1, rep); + if (err) { + mlx5_ib_warn(dev, "failed to query port policy for vf %d (%d)\n", + vf, err); + goto free; + } + memset(info, 0, sizeof(*info)); + info->linkstate = mlx_to_net_policy(rep->policy); + if (info->linkstate == __IFLA_VF_LINK_STATE_MAX) + err = -EINVAL; + +free: + kfree(rep); + return err; +} + +static inline enum port_state_policy net_to_mlx_policy(int policy) +{ + switch (policy) { + case IFLA_VF_LINK_STATE_DISABLE: + return MLX5_POLICY_DOWN; + case IFLA_VF_LINK_STATE_ENABLE: + return MLX5_POLICY_UP; + case IFLA_VF_LINK_STATE_AUTO: + return MLX5_POLICY_FOLLOW; + default: + return MLX5_POLICY_INVALID; + } +} + +int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, + u8 port, int state) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->policy = net_to_mlx_policy(state); + if (in->policy == MLX5_POLICY_INVALID) { + err = -EINVAL; + goto out; + } + in->field_select = MLX5_HCA_VPORT_SEL_STATE_POLICY; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].policy = in->policy; + +out: + kfree(in); + return err; +} + +int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, + u8 port, struct ifla_vf_stats *stats) +{ + int out_sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *dev; + void *out; + int err; + + dev = to_mdev(device); + mdev = dev->mdev; + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_vport_counter(mdev, true, vf, port, out, out_sz); + if (err) + goto ex; + + stats->rx_packets = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_unicast.packets); + stats->tx_packets = MLX5_GET64_PR(query_vport_counter_out, out, transmitted_ib_unicast.packets); + stats->rx_bytes = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_unicast.octets); + stats->tx_bytes = MLX5_GET64_PR(query_vport_counter_out, out, transmitted_ib_unicast.octets); + stats->multicast = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_multicast.packets); + +ex: + kfree(out); + return err; +} + +static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID; + in->node_guid = guid; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, 
in); + if (!err) + vfs_ctx[vf].node_guid = guid; + kfree(in); + return err; +} + +static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID; + in->port_guid = guid; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].port_guid = guid; + kfree(in); + return err; +} + +int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, + u64 guid, int type) +{ + if (type == IFLA_VF_IB_NODE_GUID) + return set_vf_node_guid(device, vf, port, guid); + else if (type == IFLA_VF_IB_PORT_GUID) + return set_vf_port_guid(device, vf, port, guid); + + return -EINVAL; +} diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c new file mode 100644 index 0000000..32a9e92 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -0,0 +1,609 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_ib.h" + +enum { + MLX5_IB_VENDOR_CLASS1 = 0x9, + MLX5_IB_VENDOR_CLASS2 = 0xa +}; + +static bool can_do_mad_ifc(struct mlx5_ib_dev *dev, u8 port_num, + struct ib_mad *in_mad) +{ + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED && + in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return true; + return dev->mdev->port_caps[port_num - 1].has_smi; +} + +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const void *in_mad, void *response_mad) +{ + u8 op_modifier = 0; + + if (!can_do_mad_ifc(dev, port, (struct ib_mad *)in_mad)) + return -EPERM; + + /* Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. 
+ */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port); +} + +static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + u16 slid; + int err; + + slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2 || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else { + return IB_MAD_RESULT_SUCCESS; + } + + err = mlx5_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err) + return IB_MAD_RESULT_FAILURE; + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext, + void *out) +{ +#define MLX5_SUM_CNT(p, cntr1, cntr2) \ + (MLX5_GET64(query_vport_counter_out, p, cntr1) + \ + MLX5_GET64(query_vport_counter_out, p, cntr2)) + + pma_cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.octets, + transmitted_ib_multicast.octets) >> 2); + pma_cnt_ext->port_rcv_data = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.octets, + received_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_packets = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.packets, + transmitted_ib_multicast.packets)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.packets, + received_ib_multicast.packets)); + pma_cnt_ext->port_unicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_unicast.packets); + pma_cnt_ext->port_unicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_unicast.packets); + pma_cnt_ext->port_multicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_multicast.packets); + pma_cnt_ext->port_multicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_multicast.packets); +} + +static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, + void *out) +{ + /* Traffic counters will be reported in + * their 64bit form via 
ib_pma_portcounters_ext by default. + */ + void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out, + counter_set); + +#define MLX5_ASSIGN_PMA_CNTR(counter_var, counter_name) { \ + counter_var = MLX5_GET_BE(typeof(counter_var), \ + ib_port_cntrs_grp_data_layout, \ + out_pma, counter_name); \ + } + + MLX5_ASSIGN_PMA_CNTR(pma_cnt->symbol_error_counter, + symbol_error_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_error_recovery_counter, + link_error_recovery_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_downed_counter, + link_downed_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_errors, + port_rcv_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_remphys_errors, + port_rcv_remote_physical_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_switch_relay_errors, + port_rcv_switch_relay_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_discards, + port_xmit_discards); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_constraint_errors, + port_xmit_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_wait, + port_xmit_wait); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_constraint_errors, + port_rcv_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_overrun_errors, + link_overrun_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->vl15_dropped, + vl_15_dropped); +} + +static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + int err; + void *out_cnt; + + /* Declaring support of extended counters */ + if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { + struct ib_class_port_info cpi = {}; + + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + } + + if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + + out_cnt = kvzalloc(sz, GFP_KERNEL); + if (!out_cnt) + return IB_MAD_RESULT_FAILURE; + + err = mlx5_core_query_vport_counter(mdev, 0, 0, + port_num, out_cnt, sz); + if (!err) + pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + } else { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + out_cnt = kvzalloc(sz, GFP_KERNEL); + if (!out_cnt) + return IB_MAD_RESULT_FAILURE; + + err = mlx5_core_query_ib_ppcnt(mdev, port_num, + out_cnt, sz); + if (!err) + pma_cnt_assign(pma_cnt, out_cnt); + } + + kvfree(out_cnt); + if (err) + return IB_MAD_RESULT_FAILURE; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + struct mlx5_core_dev *mdev; + u8 mdev_port_num; + int ret; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + memset(out_mad->data, 0, sizeof(out_mad->data)); + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) + return IB_MAD_RESULT_FAILURE; + + if (MLX5_CAP_GEN(mdev, vport_counters) && + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + in_mad->mad_hdr.method 
== IB_MGMT_METHOD_GET) { + ret = process_pma_cmd(mdev, mdev_port_num, in_mad, out_mad); + } else { + ret = process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, + in_mad, out_mad); + } + mlx5_ib_put_native_port_mdev(dev, port_num); + return ret; +} + +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + u16 packet_error; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + + packet_error = be16_to_cpu(out_mad->status); + + dev->mdev->port_caps[port - 1].ext_port_cap = (!err && !packet_error) ? + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev, + struct ib_smp *out_mad) +{ + struct ib_smp *in_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + if (!in_mad) + return -ENOMEM; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, + out_mad); + + kfree(in_mad); + return err; +} + +int mlx5_query_mad_ifc_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); + if (err) + goto out; + + memcpy(sys_image_guid, out_mad->data + 4, 8); + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_mad_ifc_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); + if (err) + goto out; + + *max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev, + u32 *vendor_id) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); + if (err) + goto out; + + *vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) & 0xffff; + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(node_desc, out_mad->data, IB_DEVICE_NODE_DESC_MAX); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || 
!out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(node_guid, out_mad->data + 12, 8); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int err = -ENOMEM; + + if (port < 1 || port > dev->num_ports) { + mlx5_ib_warn(dev, "invalid port number %d\n", port); + return -EINVAL; + } + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + /* props being zeroed by the caller, avoid zeroing it here */ + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto out; + } + + props->lid = be16_to_cpup((__be16 *)(out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *)(out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); + props->gid_tbl_len = out_mad->data[50]; + props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); + props->pkey_tbl_len = mdev->port_caps[port - 1].pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = 
out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + /* Check if extended speeds (EDR/FDR/...) are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = 16; /* FDR */ + break; + case 2: + props->active_speed = 32; /* EDR */ + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == 4) { + if (mdev->port_caps[port - 1].ext_port_cap & + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { + init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = 8; + } + } + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c new file mode 100644 index 0000000..69716a7 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/main.c @@ -0,0 +1,5821 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_X86) +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "ib_rep.h" +#include "cmd.h" +#include +#include +#include +#include +#include + +#define UVERBS_MODULE_NAME mlx5_ib +#include + +#define DRIVER_NAME "mlx5_ib" +#define DRIVER_VERSION "5.0-0" + +MODULE_AUTHOR("Eli Cohen "); +MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +static char mlx5_version[] = + DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" + DRIVER_VERSION "\n"; + +struct mlx5_ib_event_work { + struct work_struct work; + struct mlx5_core_dev *dev; + void *context; + enum mlx5_dev_event event; + unsigned long param; +}; + +enum { + MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, +}; + +static struct workqueue_struct *mlx5_ib_event_wq; +static LIST_HEAD(mlx5_ib_unaffiliated_port_list); +static LIST_HEAD(mlx5_ib_dev_list); +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_ib_multiport_mutex); + +/* We can't use an array for xlt_emergency_page because dma_map_single + * doesn't work on kernel modules memory + */ +static unsigned long xlt_emergency_page; +static struct mutex xlt_emergency_page_mutex; + +struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi) +{ + struct mlx5_ib_dev *dev; + + mutex_lock(&mlx5_ib_multiport_mutex); + dev = mpi->ibdev; + mutex_unlock(&mlx5_ib_multiport_mutex); + return dev; +} + +static enum rdma_link_layer +mlx5_port_type_cap_to_rdma_ll(int port_type_cap) +{ + switch (port_type_cap) { + case MLX5_CAP_PORT_TYPE_IB: + return IB_LINK_LAYER_INFINIBAND; + case MLX5_CAP_PORT_TYPE_ETH: + return IB_LINK_LAYER_ETHERNET; + default: + return IB_LINK_LAYER_UNSPECIFIED; + } +} + +static enum rdma_link_layer +mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); + + return mlx5_port_type_cap_to_rdma_ll(port_type_cap); +} + +static int get_port_state(struct ib_device *ibdev, + u8 port_num, + enum ib_port_state *state) +{ + struct ib_port_attr attr; + int ret; + + memset(&attr, 0, sizeof(attr)); + ret = ibdev->query_port(ibdev, port_num, &attr); + if (!ret) + *state = attr.state; + return ret; +} + +static int mlx5_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + u8 port_num = roce->native_port_num; + struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *ibdev; + + ibdev = roce->dev; + mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); + if (!mdev) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + write_lock(&roce->netdev_lock); + if (ibdev->rep) { + struct mlx5_eswitch *esw = ibdev->mdev->priv.eswitch; + struct net_device *rep_ndev; + + rep_ndev = mlx5_ib_get_rep_netdev(esw, + ibdev->rep->vport); + if (rep_ndev == ndev) + roce->netdev = (event == NETDEV_UNREGISTER) ? + NULL : ndev; + } else if (ndev->dev.parent == &mdev->pdev->dev) { + roce->netdev = (event == NETDEV_UNREGISTER) ? 
+ NULL : ndev; + } + write_unlock(&roce->netdev_lock); + break; + + case NETDEV_CHANGE: + case NETDEV_UP: + case NETDEV_DOWN: { + struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); + struct net_device *upper = NULL; + + if (lag_ndev) { + upper = netdev_master_upper_dev_get(lag_ndev); + dev_put(lag_ndev); + } + + if ((upper == ndev || (!upper && ndev == roce->netdev)) + && ibdev->ib_active) { + struct ib_event ibev = { }; + enum ib_port_state port_state; + + if (get_port_state(&ibdev->ib_dev, port_num, + &port_state)) + goto done; + + if (roce->last_port_state == port_state) + goto done; + + roce->last_port_state = port_state; + ibev.device = &ibdev->ib_dev; + if (port_state == IB_PORT_DOWN) + ibev.event = IB_EVENT_PORT_ERR; + else if (port_state == IB_PORT_ACTIVE) + ibev.event = IB_EVENT_PORT_ACTIVE; + else + goto done; + + ibev.element.port_num = port_num; + ib_dispatch_event(&ibev); + } + break; + } + + default: + break; + } +done: + mlx5_ib_put_native_port_mdev(ibdev, port_num); + return NOTIFY_DONE; +} + +static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, + u8 port_num) +{ + struct mlx5_ib_dev *ibdev = to_mdev(device); + struct net_device *ndev; + struct mlx5_core_dev *mdev; + + mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); + if (!mdev) + return NULL; + + ndev = mlx5_lag_get_roce_netdev(mdev); + if (ndev) + goto out; + + /* Ensure ndev does not disappear before we invoke dev_hold() + */ + read_lock(&ibdev->roce[port_num - 1].netdev_lock); + ndev = ibdev->roce[port_num - 1].netdev; + if (ndev) + dev_hold(ndev); + read_unlock(&ibdev->roce[port_num - 1].netdev_lock); + +out: + mlx5_ib_put_native_port_mdev(ibdev, port_num); + return ndev; +} + +struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, + u8 ib_port_num, + u8 *native_port_num) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, + ib_port_num); + struct mlx5_core_dev *mdev = NULL; + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_port *port; + + if (!mlx5_core_mp_enabled(ibdev->mdev) || + ll != IB_LINK_LAYER_ETHERNET) { + if (native_port_num) + *native_port_num = ib_port_num; + return ibdev->mdev; + } + + if (native_port_num) + *native_port_num = 1; + + port = &ibdev->port[ib_port_num - 1]; + if (!port) + return NULL; + + spin_lock(&port->mp.mpi_lock); + mpi = ibdev->port[ib_port_num - 1].mp.mpi; + if (mpi && !mpi->unaffiliate) { + mdev = mpi->mdev; + /* If it's the master no need to refcount, it'll exist + * as long as the ib_dev exists. 
+ */ + if (!mpi->is_master) + mpi->mdev_refcnt++; + } + spin_unlock(&port->mp.mpi_lock); + + return mdev; +} + +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, + port_num); + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_port *port; + + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return; + + port = &ibdev->port[port_num - 1]; + + spin_lock(&port->mp.mpi_lock); + mpi = ibdev->port[port_num - 1].mp.mpi; + if (mpi->is_master) + goto out; + + mpi->mdev_refcnt--; + if (mpi->unaffiliate) + complete(&mpi->unref_comp); +out: + spin_unlock(&port->mp.mpi_lock); +} + +static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, + u8 *active_width) +{ + switch (eth_proto_oper) { + case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII): + case MLX5E_PROT_MASK(MLX5E_1000BASE_KX): + case MLX5E_PROT_MASK(MLX5E_100BASE_TX): + case MLX5E_PROT_MASK(MLX5E_1000BASE_T): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_SDR; + break; + case MLX5E_PROT_MASK(MLX5E_10GBASE_T): + case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4): + case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4): + case MLX5E_PROT_MASK(MLX5E_10GBASE_KR): + case MLX5E_PROT_MASK(MLX5E_10GBASE_CR): + case MLX5E_PROT_MASK(MLX5E_10GBASE_SR): + case MLX5E_PROT_MASK(MLX5E_10GBASE_ER): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5E_PROT_MASK(MLX5E_25GBASE_CR): + case MLX5E_PROT_MASK(MLX5E_25GBASE_KR): + case MLX5E_PROT_MASK(MLX5E_25GBASE_SR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4): + case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4): + case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4): + case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2): + case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2): + case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_56GBASE_R4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR; + break; + case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4): + case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4): + case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4): + case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev; + struct net_device *ndev, *upper; + enum ib_mtu ndev_ib_mtu; + bool put_mdev = true; + u16 qkey_viol_cntr; + u32 eth_prot_oper; + u8 mdev_port_num; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) { + /* This means the port isn't affiliated yet. Get the + * info for the master port instead. + */ + put_mdev = false; + mdev = dev->mdev; + mdev_port_num = 1; + port_num = 1; + } + + /* Possible bad flows are checked before filling out props so in case + * of an error it will still be zeroed out. 
+ */ + err = mlx5_query_port_eth_proto_oper(mdev, &eth_prot_oper, + mdev_port_num); + if (err) + goto out; + + props->active_width = IB_WIDTH_4X; + props->active_speed = IB_SPEED_QDR; + + translate_eth_proto_oper(eth_prot_oper, &props->active_speed, + &props->active_width); + + props->port_cap_flags |= IB_PORT_CM_SUP; + props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; + + props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, + roce_address_table_size); + props->max_mtu = IB_MTU_4096; + props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); + props->pkey_tbl_len = 1; + props->state = IB_PORT_DOWN; + props->phys_state = 3; + + mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr); + props->qkey_viol_cntr = qkey_viol_cntr; + + /* If this is a stub query for an unaffiliated port stop here */ + if (!put_mdev) + goto out; + + ndev = mlx5_ib_get_netdev(device, port_num); + if (!ndev) + goto out; + + if (mlx5_lag_is_active(dev->mdev)) { + rcu_read_lock(); + upper = netdev_master_upper_dev_get_rcu(ndev); + if (upper) { + dev_put(ndev); + ndev = upper; + dev_hold(ndev); + } + rcu_read_unlock(); + } + + if (netif_running(ndev) && netif_carrier_ok(ndev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + ndev_ib_mtu = iboe_get_mtu(ndev->mtu); + + dev_put(ndev); + + props->active_mtu = min(props->max_mtu, ndev_ib_mtu); +out: + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port_num); + return err; +} + +static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + u8 roce_version = 0; + u8 roce_l3_type = 0; + bool vlan = false; + u8 mac[ETH_ALEN]; + u16 vlan_id = 0; + + if (gid) { + gid_type = attr->gid_type; + ether_addr_copy(mac, attr->ndev->dev_addr); + + if (is_vlan_dev(attr->ndev)) { + vlan = true; + vlan_id = vlan_dev_vlan_id(attr->ndev); + } + } + + switch (gid_type) { + case IB_GID_TYPE_IB: + roce_version = MLX5_ROCE_VERSION_1; + break; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + roce_version = MLX5_ROCE_VERSION_2; + if (ipv6_addr_v4mapped((void *)gid)) + roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4; + else + roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6; + break; + + default: + mlx5_ib_warn(dev, "Unexpected GID type %u\n", gid_type); + } + + return mlx5_core_roce_gid_set(dev->mdev, index, roce_version, + roce_l3_type, gid->raw, mac, vlan, + vlan_id, port_num); +} + +static int mlx5_ib_add_gid(const union ib_gid *gid, + const struct ib_gid_attr *attr, + __always_unused void **context) +{ + return set_roce_addr(to_mdev(attr->device), attr->port_num, + attr->index, gid, attr); +} + +static int mlx5_ib_del_gid(const struct ib_gid_attr *attr, + __always_unused void **context) +{ + return set_roce_addr(to_mdev(attr->device), attr->port_num, + attr->index, NULL, NULL); +} + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, + int index) +{ + struct ib_gid_attr attr; + union ib_gid gid; + + if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr)) + return 0; + + dev_put(attr.ndev); + + if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return 0; + + return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); +} + +int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, + int index, enum ib_gid_type *gid_type) +{ + struct ib_gid_attr attr; + union ib_gid gid; + int ret; + + ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr); + if (ret) + return ret; + + dev_put(attr.ndev); + + *gid_type = attr.gid_type; + + 
return 0; +} + +static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) + return !MLX5_CAP_GEN(dev->mdev, ib_virt); + return 0; +} + +enum { + MLX5_VPORT_ACCESS_METHOD_MAD, + MLX5_VPORT_ACCESS_METHOD_HCA, + MLX5_VPORT_ACCESS_METHOD_NIC, +}; + +static int mlx5_get_vport_access_method(struct ib_device *ibdev) +{ + if (mlx5_use_mad_ifc(to_mdev(ibdev))) + return MLX5_VPORT_ACCESS_METHOD_MAD; + + if (mlx5_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET) + return MLX5_VPORT_ACCESS_METHOD_NIC; + + return MLX5_VPORT_ACCESS_METHOD_HCA; +} + +static void get_atomic_caps(struct mlx5_ib_dev *dev, + u8 atomic_size_qp, + struct ib_device_attr *props) +{ + u8 tmp; + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic_req_8B_endianness_mode = + MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode); + + /* Check if HW supports 8 bytes standard atomic operations and capable + * of host endianness respond + */ + tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) && + (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) && + (atomic_req_8B_endianness_mode)) { + props->atomic_cap = IB_ATOMIC_HCA; + } else { + props->atomic_cap = IB_ATOMIC_NONE; + } +} + +static void get_atomic_caps_qp(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + + get_atomic_caps(dev, atomic_size_qp, props); +} + +static void get_atomic_caps_dc(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + + get_atomic_caps(dev, atomic_size_qp, props); +} + +bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev) +{ + struct ib_device_attr props = {}; + + get_atomic_caps_dc(dev, &props); + return (props.atomic_cap == IB_ATOMIC_HCA) ? 
true : false; +} +static int mlx5_query_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + u64 tmp; + int err; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_system_image_guid(ibdev, + sys_image_guid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); + break; + + default: + return -EINVAL; + } + + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + + return err; + +} + +static int mlx5_query_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, + pkey_table_size)); + return 0; + + default: + return -EINVAL; + } +} + +static int mlx5_query_vendor_id(struct ib_device *ibdev, + u32 *vendor_id) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_core_query_vendor_id(dev->mdev, vendor_id); + + default: + return -EINVAL; + } +} + +static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, + __be64 *node_guid) +{ + u64 tmp; + int err; + + switch (mlx5_get_vport_access_method(&dev->ib_dev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_node_guid(dev, node_guid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + break; + + default: + return -EINVAL; + } + + if (!err) + *node_guid = cpu_to_be64(tmp); + + return err; +} + +struct mlx5_reg_node_desc { + u8 desc[IB_DEVICE_NODE_DESC_MAX]; +}; + +static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) +{ + struct mlx5_reg_node_desc in; + + if (mlx5_use_mad_ifc(dev)) + return mlx5_query_mad_ifc_node_desc(dev, node_desc); + + memset(&in, 0, sizeof(in)); + + return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc, + sizeof(struct mlx5_reg_node_desc), + MLX5_REG_NODE_DESC, 0, 0); +} + +static int mlx5_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + int err = -ENOMEM; + int max_sq_desc; + int max_rq_sg; + int max_sq_sg; + u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); + bool raw_support = !mlx5_core_mp_enabled(mdev); + struct mlx5_ib_query_device_resp resp = {}; + size_t resp_len; + u64 max_tso; + + resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); + if (uhw->outlen && uhw->outlen < resp_len) + return -EINVAL; + else + resp.response_length = resp_len; + + if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) + return -EINVAL; + + memset(props, 0, sizeof(*props)); + err = mlx5_query_system_image_guid(ibdev, + &props->sys_image_guid); + if (err) + return err; + + err = 
mlx5_query_max_pkeys(ibdev, &props->max_pkeys); + if (err) + return err; + + err = mlx5_query_vendor_id(ibdev, &props->vendor_id); + if (err) + return err; + + props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | + (fw_rev_min(dev->mdev) << 16) | + fw_rev_sub(dev->mdev); + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + + if (MLX5_CAP_GEN(mdev, pkv)) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (MLX5_CAP_GEN(mdev, qkv)) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (MLX5_CAP_GEN(mdev, apm)) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + if (MLX5_CAP_GEN(mdev, xrc)) + props->device_cap_flags |= IB_DEVICE_XRC; + if (MLX5_CAP_GEN(mdev, imaicl)) { + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_WINDOW_TYPE_2B; + props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + /* We support 'Gappy' memory registration too */ + props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; + } + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (MLX5_CAP_GEN(mdev, sho)) { + props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + /* At this stage no support for signature handover */ + props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | + IB_PROT_T10DIF_TYPE_2 | + IB_PROT_T10DIF_TYPE_3; + props->sig_guard_cap = IB_GUARD_T10DIF_CRC | + IB_GUARD_T10DIF_CSUM; + } + if (MLX5_CAP_GEN(mdev, block_lb_mc)) + props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) { + if (MLX5_CAP_ETH(mdev, csum_cap)) { + /* Legacy bit to support old userspace libraries */ + props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM; + } + + if (MLX5_CAP_ETH(dev->mdev, vlan_cap)) + props->raw_packet_caps |= + IB_RAW_PACKET_CAP_CVLAN_STRIPPING; + + if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { + max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); + if (max_tso) { + resp.tso_caps.max_tso = 1 << max_tso; + resp.tso_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + resp.response_length += sizeof(resp.tso_caps); + } + } + + if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { + resp.rss_caps.rx_hash_function = + MLX5_RX_HASH_FUNC_TOEPLITZ; + resp.rss_caps.rx_hash_fields_mask = + MLX5_RX_HASH_SRC_IPV4 | + MLX5_RX_HASH_DST_IPV4 | + MLX5_RX_HASH_SRC_IPV6 | + MLX5_RX_HASH_DST_IPV6 | + MLX5_RX_HASH_SRC_PORT_TCP | + MLX5_RX_HASH_DST_PORT_TCP | + MLX5_RX_HASH_SRC_PORT_UDP | + MLX5_RX_HASH_DST_PORT_UDP | + MLX5_RX_HASH_INNER; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & + MLX5_ACCEL_IPSEC_CAP_DEVICE) + resp.rss_caps.rx_hash_fields_mask |= + MLX5_RX_HASH_IPSEC_SPI; + resp.response_length += sizeof(resp.rss_caps); + } + } else { + if (field_avail(typeof(resp), tso_caps, uhw->outlen)) + resp.response_length += sizeof(resp.tso_caps); + if (field_avail(typeof(resp), rss_caps, uhw->outlen)) + resp.response_length += sizeof(resp.rss_caps); + } + + if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + props->device_cap_flags |= IB_DEVICE_UD_TSO; + } + + if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) && + MLX5_CAP_GEN(dev->mdev, general_notification_event) && + raw_support) + props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP; + + if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) && + MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap)) + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + 
MLX5_CAP_ETH(dev->mdev, scatter_fcs) && + raw_support) { + /* Legacy bit to support old userspace libraries */ + props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; + props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS; + } + + if (MLX5_CAP_DEV_MEM(mdev, memic)) { + props->max_dm_size = + MLX5_CAP_DEV_MEM(mdev, max_memic_size); + } + + if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + + if (MLX5_CAP_GEN(mdev, end_pad)) + props->device_cap_flags |= IB_DEVICE_PCI_WRITE_END_PADDING; + + props->vendor_part_id = mdev->pdev->device; + props->hw_ver = mdev->pdev->revision; + + props->max_mr_size = ~0ull; + props->page_size_cap = ~(min_page_size - 1); + props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); + props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / + sizeof(struct mlx5_wqe_data_seg); + max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); + max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + props->max_sge = min(max_rq_sg, max_sq_sg); + props->max_sge_rd = MLX5_MAX_SGE_RD; + props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); + props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; + props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd); + props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); + props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp); + props->max_srq = 1 << MLX5_CAP_GEN(mdev, log_max_srq); + props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1; + props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq_sge = max_rq_sg - 1; + props->max_fast_reg_page_list_len = + 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); + get_atomic_caps_qp(dev, props); + props->masked_atomic_cap = IB_ATOMIC_NONE; + props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); + props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + props->max_ah = INT_MAX; + props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); + props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (MLX5_CAP_GEN(mdev, pg)) + props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; + props->odp_caps = dev->odp_caps; +#endif + + if (MLX5_CAP_GEN(mdev, cd)) + props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + + if (!mlx5_core_is_pf(mdev)) + props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; + + if (mlx5_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET && raw_support) { + props->rss_caps.max_rwq_indirection_tables = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt); + props->rss_caps.max_rwq_indirection_table_size = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size); + props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; + props->max_wq_type_rq = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); + } + + if (MLX5_CAP_GEN(mdev, tag_matching)) { + props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + props->tm_caps.max_num_tags = + (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; + props->tm_caps.flags = IB_TM_CAP_RC; + props->tm_caps.max_ops = + 1 << 
MLX5_CAP_GEN(mdev, log_max_qp_sz); + props->tm_caps.max_sge = MLX5_TM_MAX_SGE; + } + + if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) { + props->cq_caps.max_cq_moderation_count = + MLX5_MAX_CQ_COUNT; + props->cq_caps.max_cq_moderation_period = + MLX5_MAX_CQ_PERIOD; + } + + if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { + resp.cqe_comp_caps.max_num = + MLX5_CAP_GEN(dev->mdev, cqe_compression) ? + MLX5_CAP_GEN(dev->mdev, cqe_compression_max_num) : 0; + resp.cqe_comp_caps.supported_format = + MLX5_IB_CQE_RES_FORMAT_HASH | + MLX5_IB_CQE_RES_FORMAT_CSUM; + resp.response_length += sizeof(resp.cqe_comp_caps); + } + + if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) && + raw_support) { + if (MLX5_CAP_QOS(mdev, packet_pacing) && + MLX5_CAP_GEN(mdev, qos)) { + resp.packet_pacing_caps.qp_rate_limit_max = + MLX5_CAP_QOS(mdev, packet_pacing_max_rate); + resp.packet_pacing_caps.qp_rate_limit_min = + MLX5_CAP_QOS(mdev, packet_pacing_min_rate); + resp.packet_pacing_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) && + MLX5_CAP_QOS(mdev, packet_pacing_typical_size)) + resp.packet_pacing_caps.cap_flags |= + MLX5_IB_PP_SUPPORT_BURST; + } + resp.response_length += sizeof(resp.packet_pacing_caps); + } + + if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, + uhw->outlen)) { + if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes = + MLX5_IB_ALLOW_MPW; + + if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes |= + MLX5_IB_SUPPORT_EMPW; + + resp.response_length += + sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); + } + + if (field_avail(typeof(resp), flags, uhw->outlen)) { + resp.response_length += sizeof(resp.flags); + + if (MLX5_CAP_GEN(mdev, cqe_compression_128)) + resp.flags |= + MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP; + + if (MLX5_CAP_GEN(mdev, cqe_128_always)) + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD; + } + + if (field_avail(typeof(resp), sw_parsing_caps, + uhw->outlen)) { + resp.response_length += sizeof(resp.sw_parsing_caps); + if (MLX5_CAP_ETH(mdev, swp)) { + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING; + + if (MLX5_CAP_ETH(mdev, swp_csum)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_CSUM; + + if (MLX5_CAP_ETH(mdev, swp_lso)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_LSO; + + if (resp.sw_parsing_caps.sw_parsing_offloads) + resp.sw_parsing_caps.supported_qpts = + BIT(IB_QPT_RAW_PACKET); + } + } + + if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) && + raw_support) { + resp.response_length += sizeof(resp.striding_rq_caps); + if (MLX5_CAP_GEN(mdev, striding_rq)) { + resp.striding_rq_caps.min_single_stride_log_num_of_bytes = + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES; + resp.striding_rq_caps.max_single_stride_log_num_of_bytes = + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES; + resp.striding_rq_caps.min_single_wqe_log_num_of_strides = + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES; + resp.striding_rq_caps.max_single_wqe_log_num_of_strides = + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES; + resp.striding_rq_caps.supported_qpts = + BIT(IB_QPT_RAW_PACKET); + } + } + + if (field_avail(typeof(resp), tunnel_offloads_caps, + uhw->outlen)) { + resp.response_length += sizeof(resp.tunnel_offloads_caps); + if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_VXLAN; + if (MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx)) + 
resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_GENEVE; + if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_GRE; + } + + if (uhw->outlen) { + err = ib_copy_to_udata(uhw, &resp, resp.response_length); + + if (err) + return err; + } + + return 0; +} + +enum mlx5_ib_width { + MLX5_IB_WIDTH_1X = 1 << 0, + MLX5_IB_WIDTH_2X = 1 << 1, + MLX5_IB_WIDTH_4X = 1 << 2, + MLX5_IB_WIDTH_8X = 1 << 3, + MLX5_IB_WIDTH_12X = 1 << 4 +}; + +static int translate_active_width(struct ib_device *ibdev, u8 active_width, + u8 *ib_width) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + int err = 0; + + if (active_width & MLX5_IB_WIDTH_1X) { + *ib_width = IB_WIDTH_1X; + } else if (active_width & MLX5_IB_WIDTH_2X) { + mlx5_ib_dbg(dev, "active_width %d is not supported by IB spec\n", + (int)active_width); + err = -EINVAL; + } else if (active_width & MLX5_IB_WIDTH_4X) { + *ib_width = IB_WIDTH_4X; + } else if (active_width & MLX5_IB_WIDTH_8X) { + *ib_width = IB_WIDTH_8X; + } else if (active_width & MLX5_IB_WIDTH_12X) { + *ib_width = IB_WIDTH_12X; + } else { + mlx5_ib_dbg(dev, "Invalid active_width %d\n", + (int)active_width); + err = -EINVAL; + } + + return err; +} + +static int mlx5_mtu_to_ib_mtu(int mtu) +{ + switch (mtu) { + case 256: return 1; + case 512: return 2; + case 1024: return 3; + case 2048: return 4; + case 4096: return 5; + default: + pr_warn("invalid mtu\n"); + return -1; + } +} + +enum ib_max_vl_num { + __IB_MAX_VL_0 = 1, + __IB_MAX_VL_0_1 = 2, + __IB_MAX_VL_0_3 = 3, + __IB_MAX_VL_0_7 = 4, + __IB_MAX_VL_0_14 = 5, +}; + +enum mlx5_vl_hw_cap { + MLX5_VL_HW_0 = 1, + MLX5_VL_HW_0_1 = 2, + MLX5_VL_HW_0_2 = 3, + MLX5_VL_HW_0_3 = 4, + MLX5_VL_HW_0_4 = 5, + MLX5_VL_HW_0_5 = 6, + MLX5_VL_HW_0_6 = 7, + MLX5_VL_HW_0_7 = 8, + MLX5_VL_HW_0_14 = 15 +}; + +static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, + u8 *max_vl_num) +{ + switch (vl_hw_cap) { + case MLX5_VL_HW_0: + *max_vl_num = __IB_MAX_VL_0; + break; + case MLX5_VL_HW_0_1: + *max_vl_num = __IB_MAX_VL_0_1; + break; + case MLX5_VL_HW_0_3: + *max_vl_num = __IB_MAX_VL_0_3; + break; + case MLX5_VL_HW_0_7: + *max_vl_num = __IB_MAX_VL_0_7; + break; + case MLX5_VL_HW_0_14: + *max_vl_num = __IB_MAX_VL_0_14; + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *rep; + u16 max_mtu; + u16 oper_mtu; + int err; + u8 ib_link_width_oper; + u8 vl_hw_cap; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) { + err = -ENOMEM; + goto out; + } + + /* props being zeroed by the caller, avoid zeroing it here */ + + err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep); + if (err) + goto out; + + props->lid = rep->lid; + props->lmc = rep->lmc; + props->sm_lid = rep->sm_lid; + props->sm_sl = rep->sm_sl; + props->state = rep->vport_state; + props->phys_state = rep->port_physical_state; + props->port_cap_flags = rep->cap_mask1; + props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); + props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); + props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); + props->bad_pkey_cntr = rep->pkey_violation_counter; + props->qkey_viol_cntr = rep->qkey_violation_counter; + props->subnet_timeout = rep->subnet_timeout; + props->init_type_reply = rep->init_type_reply; + props->grh_required = 
rep->grh_required; + + err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port); + if (err) + goto out; + + err = translate_active_width(ibdev, ib_link_width_oper, + &props->active_width); + if (err) + goto out; + err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port); + if (err) + goto out; + + mlx5_query_port_max_mtu(mdev, &max_mtu, port); + + props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu); + + mlx5_query_port_oper_mtu(mdev, &oper_mtu, port); + + props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu); + + err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port); + if (err) + goto out; + + err = translate_max_vl_num(ibdev, vl_hw_cap, + &props->max_vl_num); +out: + kfree(rep); + return err; +} + +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + unsigned int count; + int ret; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + ret = mlx5_query_mad_ifc_port(ibdev, port, props); + break; + + case MLX5_VPORT_ACCESS_METHOD_HCA: + ret = mlx5_query_hca_port(ibdev, port, props); + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + ret = mlx5_query_port_roce(ibdev, port, props); + break; + + default: + ret = -EINVAL; + } + + if (!ret && props) { + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev; + bool put_mdev = true; + + mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL); + if (!mdev) { + /* If the port isn't affiliated yet query the master. + * The master and slave will have the same values. + */ + mdev = dev->mdev; + port = 1; + put_mdev = false; + } + count = mlx5_core_reserved_gids_count(mdev); + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port); + props->gid_tbl_len -= count; + } + return ret; +} + +static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + int ret; + + /* Only link layer == ethernet is valid for representors */ + ret = mlx5_query_port_roce(ibdev, port, props); + if (ret || !props) + return ret; + + /* We don't support GIDS */ + props->gid_tbl_len = 0; + + return ret; +} + +static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_gids(ibdev, port, index, gid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid); + + default: + return -EINVAL; + } + +} + +static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port, + u16 index, u16 *pkey) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev; + bool put_mdev = true; + u8 mdev_port_num; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num); + if (!mdev) { + /* The port isn't affiliated yet, get the PKey from the master + * port. For RoCE the PKey tables will be the same. 
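+	 *
+	 * (For reference, mlx5_ib_query_port() above uses the same
+	 * get/put fallback when subtracting the reserved GID count:
+	 *
+	 *	mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL);
+	 *	if (!mdev) {
+	 *		mdev = dev->mdev;
+	 *		port = 1;
+	 *		put_mdev = false;
+	 *	}
+	 *
+	 * so an unaffiliated port always resolves to the master device.)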
+ */ + put_mdev = false; + mdev = dev->mdev; + mdev_port_num = 1; + } + + err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0, + index, pkey); + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port); + + return err; +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey); + default: + return -EINVAL; + } +} + +static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_reg_node_desc in; + struct mlx5_reg_node_desc out; + int err; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX); + err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_NODE_DESC, 0, 1); + if (err) + return err; + + memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX); + + return err; +} + +static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask, + u32 value) +{ + struct mlx5_hca_vport_context ctx = {}; + struct mlx5_core_dev *mdev; + u8 mdev_port_num; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) + return -ENODEV; + + err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx); + if (err) + goto out; + + if (~ctx.cap_mask1_perm & mask) { + mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n", + mask, ctx.cap_mask1_perm); + err = -EINVAL; + goto out; + } + + ctx.cap_mask1 = value; + ctx.cap_mask1_perm = mask; + err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num, + 0, &ctx); + +out: + mlx5_ib_put_native_port_mdev(dev, port_num); + + return err; +} + +static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_port_attr attr; + u32 tmp; + int err; + u32 change_mask; + u32 value; + bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) == + IB_LINK_LAYER_INFINIBAND); + + /* CM layer calls ib_modify_port() regardless of the link layer. For + * Ethernet ports, qkey violation and Port capabilities are meaningless. + */ + if (!is_ib) + return 0; + + if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) { + change_mask = props->clr_port_cap_mask | props->set_port_cap_mask; + value = ~props->clr_port_cap_mask | props->set_port_cap_mask; + return set_port_caps_atomic(dev, port, change_mask, value); + } + + mutex_lock(&dev->cap_mask_mutex); + + err = ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + tmp = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx5_set_port_caps(dev->mdev, port, tmp); + +out: + mutex_unlock(&dev->cap_mask_mutex); + return err; +} + +static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps) +{ + mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n", + caps & MLX5_LIB_CAP_4K_UAR ? 
"y" : "n"); +} + +static u16 calc_dynamic_bfregs(int uars_per_sys_page) +{ + /* Large page with non 4k uar support might limit the dynamic size */ + if (uars_per_sys_page == 1 && PAGE_SIZE > 4096) + return MLX5_MIN_DYN_BFREGS; + + return MLX5_MAX_DYN_BFREGS; +} + +static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, + struct mlx5_ib_alloc_ucontext_req_v2 *req, + struct mlx5_bfreg_info *bfregi) +{ + int uars_per_sys_page; + int bfregs_per_sys_page; + int ref_bfregs = req->total_num_bfregs; + + if (req->total_num_bfregs == 0) + return -EINVAL; + + BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE); + BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE); + + if (req->total_num_bfregs > MLX5_MAX_BFREGS) + return -ENOMEM; + + uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k); + bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR; + /* This holds the required static allocation asked by the user */ + req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page); + if (req->num_low_latency_bfregs > req->total_num_bfregs - 1) + return -EINVAL; + + bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page; + bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page); + bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs; + bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page; + + mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n", + MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no", + lib_uar_4k ? "yes" : "no", ref_bfregs, + req->total_num_bfregs, bfregi->total_num_bfregs, + bfregi->num_sys_pages); + + return 0; +} + +static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi; + int err; + int i; + + bfregi = &context->bfregi; + for (i = 0; i < bfregi->num_static_sys_pages; i++) { + err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]); + if (err) + goto error; + + mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]); + } + + for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++) + bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX; + + return 0; + +error: + for (--i; i >= 0; i--) + if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i])) + mlx5_ib_warn(dev, "failed to free uar %d\n", i); + + return err; +} + +static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi; + int err; + int i; + + bfregi = &context->bfregi; + for (i = 0; i < bfregi->num_sys_pages; i++) { + if (i < bfregi->num_static_sys_pages || + bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) { + err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); + if (err) { + mlx5_ib_warn(dev, "failed to free uar %d, err=%d\n", i, err); + return err; + } + } + } + + return 0; +} + +static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) +{ + int err; + + err = mlx5_core_alloc_transport_domain(dev->mdev, tdn); + if (err) + return err; + + if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || + (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && + !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) + return err; + + mutex_lock(&dev->lb_mutex); + dev->user_td++; + + if (dev->user_td == 2) + err = mlx5_nic_vport_update_local_lb(dev->mdev, true); + + mutex_unlock(&dev->lb_mutex); + return err; +} + +static void 
mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) +{ + mlx5_core_dealloc_transport_domain(dev->mdev, tdn); + + if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || + (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && + !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) + return; + + mutex_lock(&dev->lb_mutex); + dev->user_td--; + + if (dev->user_td < 2) + mlx5_nic_vport_update_local_lb(dev->mdev, false); + + mutex_unlock(&dev->lb_mutex); +} + +static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_ucontext_req_v2 req = {}; + struct mlx5_ib_alloc_ucontext_resp resp = {}; + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_ucontext *context; + struct mlx5_bfreg_info *bfregi; + int ver; + int err; + size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, + max_cqe_version); + bool lib_uar_4k; + + if (!dev->ib_active) + return ERR_PTR(-EAGAIN); + + if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (udata->inlen >= min_req_v2) + ver = 2; + else + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); + if (err) + return ERR_PTR(err); + + if (req.flags) + return ERR_PTR(-EINVAL); + + if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + req.total_num_bfregs = ALIGN(req.total_num_bfregs, + MLX5_NON_FP_BFREGS_PER_UAR); + if (req.num_low_latency_bfregs > req.total_num_bfregs - 1) + return ERR_PTR(-EINVAL); + + resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); + if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) + resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); + resp.cache_line_size = cache_line_size(); + resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); + resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); + resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + resp.cqe_version = min_t(__u8, + (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), + req.max_cqe_version); + resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT; + resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? 
+ MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1; + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); + + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) { + if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_EGRESS)) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA; + if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi)) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN) + resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN; + /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */ + } + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; + bfregi = &context->bfregi; + + /* updates req->total_num_bfregs */ + err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); + if (err) + goto out_ctx; + + mutex_init(&bfregi->lock); + bfregi->lib_uar_4k = lib_uar_4k; + bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count), + GFP_KERNEL); + if (!bfregi->count) { + err = -ENOMEM; + goto out_ctx; + } + + bfregi->sys_pages = kcalloc(bfregi->num_sys_pages, + sizeof(*bfregi->sys_pages), + GFP_KERNEL); + if (!bfregi->sys_pages) { + err = -ENOMEM; + goto out_count; + } + + err = allocate_uars(dev, context); + if (err) + goto out_sys_pages; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; +#endif + + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { + err = mlx5_ib_alloc_transport_domain(dev, &context->tdn); + if (err) + goto out_uars; + } + + INIT_LIST_HEAD(&context->vma_private_list); + mutex_init(&context->vma_private_list_mutex); + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + resp.tot_bfregs = req.total_num_bfregs; + resp.num_ports = dev->num_ports; + + if (field_avail(typeof(resp), cqe_version, udata->outlen)) + resp.response_length += sizeof(resp.cqe_version); + + if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) { + resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE | + MLX5_USER_CMDS_SUPP_UHW_CREATE_AH; + resp.response_length += sizeof(resp.cmds_supp_uhw); + } + + if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) { + if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { + mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline); + resp.eth_min_inline++; + } + resp.response_length += sizeof(resp.eth_min_inline); + } + + if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) { + if (mdev->clock_info) + resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1); + resp.response_length += sizeof(resp.clock_info_versions); + } + + /* + * We don't want to expose information from the PCI bar that is located + * after 4096 bytes, so if the arch only supports larger pages, let's + * pretend we don't support reading the HCA's core clock. This is also + * forced by mmap function. 
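+	 *
+	 * Concretely (see the MLX5_IB_MMAP_CORE_CLOCK case in mlx5_ib_mmap()
+	 * below): when PAGE_SIZE <= 4096 we report
+	 *	resp.hca_core_clock_offset =
+	 *		offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
+	 * and userspace may mmap() that single page read-only; otherwise the
+	 * offset is not reported and the core-clock mmap returns -EOPNOTSUPP.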
+ */ + if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { + if (PAGE_SIZE <= 4096) { + resp.comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; + resp.hca_core_clock_offset = + offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; + } + resp.response_length += sizeof(resp.hca_core_clock_offset); + } + + if (field_avail(typeof(resp), log_uar_size, udata->outlen)) + resp.response_length += sizeof(resp.log_uar_size); + + if (field_avail(typeof(resp), num_uars_per_page, udata->outlen)) + resp.response_length += sizeof(resp.num_uars_per_page); + + if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) { + resp.num_dyn_bfregs = bfregi->num_dyn_bfregs; + resp.response_length += sizeof(resp.num_dyn_bfregs); + } + + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto out_td; + + bfregi->ver = ver; + bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs; + context->cqe_version = resp.cqe_version; + context->lib_caps = req.lib_caps; + print_lib_caps(dev, context->lib_caps); + + return &context->ibucontext; + +out_td: + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + mlx5_ib_dealloc_transport_domain(dev, context->tdn); + +out_uars: + deallocate_uars(dev, context); + +out_sys_pages: + kfree(bfregi->sys_pages); + +out_count: + kfree(bfregi->count); + +out_ctx: + kfree(context); + + return ERR_PTR(err); +} + +static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_bfreg_info *bfregi; + + bfregi = &context->bfregi; + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + mlx5_ib_dealloc_transport_domain(dev, context->tdn); + + deallocate_uars(dev, context); + kfree(bfregi->sys_pages); + kfree(bfregi->count); + kfree(context); + + return 0; +} + +static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, + int uar_idx) +{ + int fw_uars_per_page; + + fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; + + return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page; +} + +static int get_command(unsigned long offset) +{ + return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; +} + +static int get_arg(unsigned long offset) +{ + return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); +} + +static int get_index(unsigned long offset) +{ + return get_arg(offset); +} + +/* Index resides in an extra byte to enable larger values than 255 */ +static int get_extended_index(unsigned long offset) +{ + return get_arg(offset) | ((offset >> 16) & 0xff) << 8; +} + +static void mlx5_ib_vma_open(struct vm_area_struct *area) +{ + /* vma_open is called when a new VMA is created on top of our VMA. This + * is done through either mremap flow or split_vma (usually due to + * mlock, madvise, munmap, etc.) We do not support a clone of the VMA, + * as this VMA is strongly hardware related. Therefore we set the + * vm_ops of the newly created/cloned VMA to NULL, to prevent it from + * calling us again and trying to do incorrect actions. We assume that + * the original VMA size is exactly a single page, and therefore all + * "splitting" operation will not happen to it. 
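+	 *
+	 * Example (illustrative): mremap() of the mapped page copies this
+	 * VMA and invokes ->open() on the copy; if the copy kept our
+	 * vm_ops, mlx5_ib_vma_close() could later run twice on the same
+	 * vm_private_data and corrupt vma_private_list. With vm_ops set
+	 * to NULL the copied VMA is simply ignored by the driver.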
+ */ + area->vm_ops = NULL; +} + +static void mlx5_ib_vma_close(struct vm_area_struct *area) +{ + struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data; + + /* It's guaranteed that all VMAs opened on a FD are closed before the + * file itself is closed, therefore no sync is needed with the regular + * closing flow. (e.g. mlx5 ib_dealloc_ucontext) + * However need a sync with accessing the vma as part of + * mlx5_ib_disassociate_ucontext. + * The close operation is usually called under mm->mmap_sem except when + * process is exiting. + * The exiting case is handled explicitly as part of + * mlx5_ib_disassociate_ucontext. + */ + mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data; + + /* setting the vma context pointer to null in the mlx5_ib driver's + * private data, to protect a race condition in + * mlx5_ib_disassociate_ucontext(). + */ + mlx5_ib_vma_priv_data->vma = NULL; + mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex); + list_del(&mlx5_ib_vma_priv_data->list); + mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex); + kfree(mlx5_ib_vma_priv_data); +} + +static const struct vm_operations_struct mlx5_ib_vm_ops = { + .open = mlx5_ib_vma_open, + .close = mlx5_ib_vma_close +}; + +static int mlx5_ib_set_vma_data(struct vm_area_struct *vma, + struct mlx5_ib_ucontext *ctx) +{ + struct mlx5_ib_vma_private_data *vma_prv; + struct list_head *vma_head = &ctx->vma_private_list; + + vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL); + if (!vma_prv) + return -ENOMEM; + + vma_prv->vma = vma; + vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex; + vma->vm_private_data = vma_prv; + vma->vm_ops = &mlx5_ib_vm_ops; + + mutex_lock(&ctx->vma_private_list_mutex); + list_add(&vma_prv->list, vma_head); + mutex_unlock(&ctx->vma_private_list_mutex); + + return 0; +} + +static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + int ret; + struct vm_area_struct *vma; + struct mlx5_ib_vma_private_data *vma_private, *n; + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); + if (!owning_process) + return; + + owning_mm = get_task_mm(owning_process); + if (!owning_mm) { + pr_info("no mm, disassociate ucontext is pending task termination\n"); + while (1) { + put_task_struct(owning_process); + usleep_range(1000, 2000); + owning_process = get_pid_task(ibcontext->tgid, + PIDTYPE_PID); + if (!owning_process || + owning_process->state == TASK_DEAD) { + pr_info("disassociate ucontext done, task was terminated\n"); + /* in case task was dead need to release the + * task struct. + */ + if (owning_process) + put_task_struct(owning_process); + return; + } + } + } + + /* need to protect from a race on closing the vma as part of + * mlx5_ib_vma_close. + */ + down_write(&owning_mm->mmap_sem); + mutex_lock(&context->vma_private_list_mutex); + list_for_each_entry_safe(vma_private, n, &context->vma_private_list, + list) { + vma = vma_private->vma; + ret = zap_vma_ptes(vma, vma->vm_start, + PAGE_SIZE); + WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__); + /* context going to be destroyed, should + * not access ops any more. 
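+	 *
+	 * After this point the VMA is inert: its PTEs were zapped above,
+	 * and clearing vm_ops below ensures a later munmap() or exit_mmap()
+	 * in the owning process will not call mlx5_ib_vma_close() on the
+	 * vma_private_data entry that is freed at the end of this loop.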
+ */ + vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + vma->vm_ops = NULL; + list_del(&vma_private->list); + kfree(vma_private); + } + mutex_unlock(&context->vma_private_list_mutex); + up_write(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); +} + +static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) +{ + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: + return "WC"; + case MLX5_IB_MMAP_REGULAR_PAGE: + return "best effort WC"; + case MLX5_IB_MMAP_NC_PAGE: + return "NC"; + case MLX5_IB_MMAP_DEVICE_MEM: + return "Device Memory"; + default: + return NULL; + } +} + +static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, + struct vm_area_struct *vma, + struct mlx5_ib_ucontext *context) +{ + phys_addr_t pfn; + int err; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1) + return -EOPNOTSUPP; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + + if (!dev->mdev->clock_info_page) + return -EOPNOTSUPP; + + pfn = page_to_pfn(dev->mdev->clock_info_page); + err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, + vma->vm_page_prot); + if (err) + return err; + + mlx5_ib_dbg(dev, "mapped clock info at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + + return mlx5_ib_set_vma_data(vma, context); +} + +static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, + struct vm_area_struct *vma, + struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi = &context->bfregi; + int err; + unsigned long idx; + phys_addr_t pfn, pa; + pgprot_t prot; + u32 bfreg_dyn_idx = 0; + u32 uar_index; + int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC); + int max_valid_idx = dyn_uar ? bfregi->num_sys_pages : + bfregi->num_static_sys_pages; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (dyn_uar) + idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages; + else + idx = get_index(vma->vm_pgoff); + + if (idx >= max_valid_idx) { + mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n", + idx, max_valid_idx); + return -EINVAL; + } + + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: +/* Some architectures don't support WC memory */ +#if defined(CONFIG_X86) + if (!pat_enabled()) + return -EPERM; +#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU))) + return -EPERM; +#endif + /* fall through */ + case MLX5_IB_MMAP_REGULAR_PAGE: + /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ + prot = pgprot_writecombine(vma->vm_page_prot); + break; + case MLX5_IB_MMAP_NC_PAGE: + prot = pgprot_noncached(vma->vm_page_prot); + break; + default: + return -EINVAL; + } + + if (dyn_uar) { + int uars_per_page; + + uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k); + bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR); + if (bfreg_dyn_idx >= bfregi->total_num_bfregs) { + mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n", + bfreg_dyn_idx, bfregi->total_num_bfregs); + return -EINVAL; + } + + mutex_lock(&bfregi->lock); + /* Fail if uar already allocated, first bfreg index of each + * page holds its count. 
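+	 *
+	 * The mapping from a dynamic system page to its first bfreg is
+	 *	bfreg_dyn_idx = idx * uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR
+	 * For example (purely illustrative values), with uars_per_page == 1
+	 * and MLX5_NON_FP_BFREGS_PER_UAR == 2, idx == 3 gives
+	 * bfreg_dyn_idx == 6, i.e. the page owns bfregs 6 and 7 and
+	 * count[6] serves as its allocation marker.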
+ */ + if (bfregi->count[bfreg_dyn_idx]) { + mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx); + mutex_unlock(&bfregi->lock); + return -EINVAL; + } + + bfregi->count[bfreg_dyn_idx]++; + mutex_unlock(&bfregi->lock); + + err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index); + if (err) { + mlx5_ib_warn(dev, "UAR alloc failed\n"); + goto free_bfreg; + } + } else { + uar_index = bfregi->sys_pages[idx]; + } + + pfn = uar_index2pfn(dev, uar_index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); + + vma->vm_page_prot = prot; + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot); + if (err) { + mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n", + err, vma->vm_start, &pfn, mmap_cmd2str(cmd)); + err = -EAGAIN; + goto err; + } + + pa = pfn << PAGE_SHIFT; + mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd), + vma->vm_start, &pa); + + err = mlx5_ib_set_vma_data(vma, context); + if (err) + goto err; + + if (dyn_uar) + bfregi->sys_pages[idx] = uar_index; + return 0; + +err: + if (!dyn_uar) + return err; + + mlx5_cmd_free_uar(dev->mdev, idx); + +free_bfreg: + mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx); + + return err; +} + +static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *mctx = to_mucontext(context); + struct mlx5_ib_dev *dev = to_mdev(context->device); + u16 page_idx = get_extended_index(vma->vm_pgoff); + size_t map_size = vma->vm_end - vma->vm_start; + u32 npages = map_size >> PAGE_SHIFT; + phys_addr_t pfn; + pgprot_t prot; + + if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) != + page_idx + npages) + return -EINVAL; + + pfn = ((pci_resource_start(dev->mdev->pdev, 0) + + MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >> + PAGE_SHIFT) + + page_idx; + prot = pgprot_writecombine(vma->vm_page_prot); + vma->vm_page_prot = prot; + + if (io_remap_pfn_range(vma, vma->vm_start, pfn, map_size, + vma->vm_page_prot)) + return -EAGAIN; + + return mlx5_ib_set_vma_data(vma, mctx); +} + +static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + unsigned long command; + phys_addr_t pfn; + + command = get_command(vma->vm_pgoff); + switch (command) { + case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_NC_PAGE: + case MLX5_IB_MMAP_REGULAR_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: + return uar_mmap(dev, command, vma, context); + + case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: + return -ENOSYS; + + case MLX5_IB_MMAP_CORE_CLOCK: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + + /* Don't expose to user-space information it shouldn't have */ + if (PAGE_SIZE > 4096) + return -EOPNOTSUPP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + pfn = (dev->mdev->iseg_base + + offsetof(struct mlx5_init_seg, internal_timer_h)) >> + PAGE_SHIFT; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + break; + case MLX5_IB_MMAP_CLOCK_INFO: + return mlx5_ib_mmap_clock_info_page(dev, vma, context); + + case MLX5_IB_MMAP_DEVICE_MEM: + return dm_mmap(ibcontext, vma); + + default: + return -EINVAL; + } + + return 0; +} + +struct ib_dm 
*mlx5_ib_alloc_dm(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + u64 act_size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE); + struct mlx5_memic *memic = &to_mdev(ibdev)->memic; + phys_addr_t memic_addr; + struct mlx5_ib_dm *dm; + u64 start_offset; + u32 page_idx; + int err; + + dm = kzalloc(sizeof(*dm), GFP_KERNEL); + if (!dm) + return ERR_PTR(-ENOMEM); + + mlx5_ib_dbg(to_mdev(ibdev), "alloc_memic req: user_length=0x%llx act_length=0x%llx log_alignment=%d\n", + attr->length, act_size, attr->alignment); + + err = mlx5_cmd_alloc_memic(memic, &memic_addr, + act_size, attr->alignment); + if (err) + goto err_free; + + start_offset = memic_addr & ~PAGE_MASK; + page_idx = (memic_addr - pci_resource_start(memic->dev->pdev, 0) - + MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >> + PAGE_SHIFT; + + err = uverbs_copy_to(attrs, + MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + &start_offset, sizeof(start_offset)); + if (err) + goto err_dealloc; + + err = uverbs_copy_to(attrs, + MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + &page_idx, sizeof(page_idx)); + if (err) + goto err_dealloc; + + bitmap_set(to_mucontext(context)->dm_pages, page_idx, + DIV_ROUND_UP(act_size, PAGE_SIZE)); + + dm->dev_addr = memic_addr; + + return &dm->ibdm; + +err_dealloc: + mlx5_cmd_dealloc_memic(memic, memic_addr, + act_size); +err_free: + kfree(dm); + return ERR_PTR(err); +} + +int mlx5_ib_dealloc_dm(struct ib_dm *ibdm) +{ + struct mlx5_memic *memic = &to_mdev(ibdm->device)->memic; + struct mlx5_ib_dm *dm = to_mdm(ibdm); + u64 act_size = roundup(dm->ibdm.length, MLX5_MEMIC_BASE_SIZE); + u32 page_idx; + int ret; + + ret = mlx5_cmd_dealloc_memic(memic, dm->dev_addr, act_size); + if (ret) + return ret; + + page_idx = (dm->dev_addr - pci_resource_start(memic->dev->pdev, 0) - + MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >> + PAGE_SHIFT; + bitmap_clear(to_mucontext(ibdm->uobject->context)->dm_pages, + page_idx, + DIV_ROUND_UP(act_size, PAGE_SIZE)); + + kfree(dm); + + return 0; +} + +static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_alloc_pd_resp resp; + struct mlx5_ib_pd *pd; + int err; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + resp.pdn = pd->pdn; + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } + + return &pd->ibpd; +} + +static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +{ + struct mlx5_ib_dev *mdev = to_mdev(pd->device); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); + kfree(mpd); + + return 0; +} + +enum { + MATCH_CRITERIA_ENABLE_OUTER_BIT, + MATCH_CRITERIA_ENABLE_MISC_BIT, + MATCH_CRITERIA_ENABLE_INNER_BIT +}; + +#define HEADER_IS_ZERO(match_criteria, headers) \ + !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ + 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ + +static u8 get_match_criteria_enable(u32 *match_criteria) +{ + u8 match_criteria_enable; + + match_criteria_enable = + (!HEADER_IS_ZERO(match_criteria, outer_headers)) << + MATCH_CRITERIA_ENABLE_OUTER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << + MATCH_CRITERIA_ENABLE_MISC_BIT; + match_criteria_enable |= + 
(!HEADER_IS_ZERO(match_criteria, inner_headers)) << + MATCH_CRITERIA_ENABLE_INNER_BIT; + + return match_criteria_enable; +} + +static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val) +{ + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val); +} + +static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val, + bool inner) +{ + if (inner) { + MLX5_SET(fte_match_set_misc, + misc_c, inner_ipv6_flow_label, mask); + MLX5_SET(fte_match_set_misc, + misc_v, inner_ipv6_flow_label, val); + } else { + MLX5_SET(fte_match_set_misc, + misc_c, outer_ipv6_flow_label, mask); + MLX5_SET(fte_match_set_misc, + misc_v, outer_ipv6_flow_label, val); + } +} + +static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) +{ + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val); + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2); +} + +#define LAST_ETH_FIELD vlan_tag +#define LAST_IB_FIELD sl +#define LAST_IPV4_FIELD tos +#define LAST_IPV6_FIELD traffic_class +#define LAST_TCP_UDP_FIELD src_port +#define LAST_TUNNEL_FIELD tunnel_id +#define LAST_FLOW_TAG_FIELD tag_id +#define LAST_DROP_FIELD size + +/* Field is the last supported field */ +#define FIELDS_NOT_SUPPORTED(filter, field)\ + memchr_inv((void *)&filter.field +\ + sizeof(filter.field), 0,\ + sizeof(filter) -\ + offsetof(typeof(filter), field) -\ + sizeof(filter.field)) + +static int parse_flow_flow_action(const union ib_flow_spec *ib_spec, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_act *action) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(ib_spec->action.act); + + switch (maction->ib_action.type) { + case IB_FLOW_ACTION_ESP: + /* Currently only AES_GCM keymat is supported by the driver */ + action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx; + action->action |= flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS ? 
+ MLX5_FLOW_CONTEXT_ACTION_ENCRYPT : + MLX5_FLOW_CONTEXT_ACTION_DECRYPT; + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, + u32 *match_v, const union ib_flow_spec *ib_spec, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_act *action) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters); + void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, + misc_parameters); + void *headers_c; + void *headers_v; + int match_ipv; + int ret; + + if (ib_spec->type & IB_FLOW_SPEC_INNER) { + headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + inner_headers); + match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version); + } else { + headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); + } + + switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) { + case IB_FLOW_SPEC_ETH: + if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) + return -EOPNOTSUPP; + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dmac_47_16), + ib_spec->eth.mask.dst_mac); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dmac_47_16), + ib_spec->eth.val.dst_mac); + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + smac_47_16), + ib_spec->eth.mask.src_mac); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + smac_47_16), + ib_spec->eth.val.src_mac); + + if (ib_spec->eth.mask.vlan_tag) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + cvlan_tag, 1); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + first_vid, ntohs(ib_spec->eth.val.vlan_tag)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + first_cfi, + ntohs(ib_spec->eth.mask.vlan_tag) >> 12); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + first_cfi, + ntohs(ib_spec->eth.val.vlan_tag) >> 12); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + first_prio, + ntohs(ib_spec->eth.mask.vlan_tag) >> 13); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + first_prio, + ntohs(ib_spec->eth.val.vlan_tag) >> 13); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, ntohs(ib_spec->eth.mask.ether_type)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ntohs(ib_spec->eth.val.ether_type)); + break; + case IB_FLOW_SPEC_IPV4: + if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) + return -EOPNOTSUPP; + + if (match_ipv) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ip_version, 0xf); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ip_version, MLX5_FS_IPV4_VERSION); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ETH_P_IP); + } + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.mask.src_ip, + sizeof(ib_spec->ipv4.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.val.src_ip, + sizeof(ib_spec->ipv4.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.mask.dst_ip, + 
sizeof(ib_spec->ipv4.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.val.dst_ip, + sizeof(ib_spec->ipv4.val.dst_ip)); + + set_tos(headers_c, headers_v, + ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos); + + set_proto(headers_c, headers_v, + ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto); + break; + case IB_FLOW_SPEC_IPV6: + if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD)) + return -EOPNOTSUPP; + + if (match_ipv) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ip_version, 0xf); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ip_version, MLX5_FS_IPV6_VERSION); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ETH_P_IPV6); + } + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.mask.src_ip, + sizeof(ib_spec->ipv6.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.val.src_ip, + sizeof(ib_spec->ipv6.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.mask.dst_ip, + sizeof(ib_spec->ipv6.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.val.dst_ip, + sizeof(ib_spec->ipv6.val.dst_ip)); + + set_tos(headers_c, headers_v, + ib_spec->ipv6.mask.traffic_class, + ib_spec->ipv6.val.traffic_class); + + set_proto(headers_c, headers_v, + ib_spec->ipv6.mask.next_hdr, + ib_spec->ipv6.val.next_hdr); + + set_flow_label(misc_params_c, misc_params_v, + ntohl(ib_spec->ipv6.mask.flow_label), + ntohl(ib_spec->ipv6.val.flow_label), + ib_spec->type & IB_FLOW_SPEC_INNER); + break; + case IB_FLOW_SPEC_ESP: + if (ib_spec->esp.mask.seq) + return -EOPNOTSUPP; + + MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, + ntohl(ib_spec->esp.mask.spi)); + MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi, + ntohl(ib_spec->esp.val.spi)); + break; + case IB_FLOW_SPEC_TCP: + if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, + LAST_TCP_UDP_FIELD)) + return -EOPNOTSUPP; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + IPPROTO_TCP); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + case IB_FLOW_SPEC_UDP: + if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, + LAST_TCP_UDP_FIELD)) + return -EOPNOTSUPP; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + IPPROTO_UDP); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + case IB_FLOW_SPEC_VXLAN_TUNNEL: + if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask, + LAST_TUNNEL_FIELD)) 
+ return -EOPNOTSUPP; + + MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni, + ntohl(ib_spec->tunnel.mask.tunnel_id)); + MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni, + ntohl(ib_spec->tunnel.val.tunnel_id)); + break; + case IB_FLOW_SPEC_ACTION_TAG: + if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag, + LAST_FLOW_TAG_FIELD)) + return -EOPNOTSUPP; + if (ib_spec->flow_tag.tag_id >= BIT(24)) + return -EINVAL; + + action->flow_tag = ib_spec->flow_tag.tag_id; + action->has_flow_tag = true; + break; + case IB_FLOW_SPEC_ACTION_DROP: + if (FIELDS_NOT_SUPPORTED(ib_spec->drop, + LAST_DROP_FIELD)) + return -EOPNOTSUPP; + action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP; + break; + case IB_FLOW_SPEC_ACTION_HANDLE: + ret = parse_flow_flow_action(ib_spec, flow_attr, action); + if (ret) + return ret; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* If a flow could catch both multicast and unicast packets, + * it won't fall into the multicast flow steering table and this rule + * could steal other multicast packets. + */ +static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr) +{ + union ib_flow_spec *flow_spec; + + if (ib_attr->type != IB_FLOW_ATTR_NORMAL || + ib_attr->num_of_specs < 1) + return false; + + flow_spec = (union ib_flow_spec *)(ib_attr + 1); + if (flow_spec->type == IB_FLOW_SPEC_IPV4) { + struct ib_flow_spec_ipv4 *ipv4_spec; + + ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec; + if (ipv4_is_multicast(ipv4_spec->val.dst_ip)) + return true; + + return false; + } + + if (flow_spec->type == IB_FLOW_SPEC_ETH) { + struct ib_flow_spec_eth *eth_spec; + + eth_spec = (struct ib_flow_spec_eth *)flow_spec; + return is_multicast_ether_addr(eth_spec->mask.dst_mac) && + is_multicast_ether_addr(eth_spec->val.dst_mac); + } + + return false; +} + +enum valid_spec { + VALID_SPEC_INVALID, + VALID_SPEC_VALID, + VALID_SPEC_NA, +}; + +static enum valid_spec +is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev, + const struct mlx5_flow_spec *spec, + const struct mlx5_flow_act *flow_act, + bool egress) +{ + const u32 *match_c = spec->match_criteria; + bool is_crypto = + (flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT)); + bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c); + bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* + * Currently only crypto is supported in egress, when regular egress + * rules would be supported, always return VALID_SPEC_NA. + */ + if (!is_crypto) + return egress ? VALID_SPEC_INVALID : VALID_SPEC_NA; + + return is_crypto && is_ipsec && + (!egress || (!is_drop && !flow_act->has_flow_tag)) ? + VALID_SPEC_VALID : VALID_SPEC_INVALID; +} + +static bool is_valid_spec(struct mlx5_core_dev *mdev, + const struct mlx5_flow_spec *spec, + const struct mlx5_flow_act *flow_act, + bool egress) +{ + /* We curretly only support ipsec egress flow */ + return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID; +} + +static bool is_valid_ethertype(struct mlx5_core_dev *mdev, + const struct ib_flow_attr *flow_attr, + bool check_inner) +{ + union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1); + int match_ipv = check_inner ? + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version) : + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); + int inner_bit = check_inner ? 
IB_FLOW_SPEC_INNER : 0; + bool ipv4_spec_valid, ipv6_spec_valid; + unsigned int ip_spec_type = 0; + bool has_ethertype = false; + unsigned int spec_index; + bool mask_valid = true; + u16 eth_type = 0; + bool type_valid; + + /* Validate that ethertype is correct */ + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) && + ib_spec->eth.mask.ether_type) { + mask_valid = (ib_spec->eth.mask.ether_type == + htons(0xffff)); + has_ethertype = true; + eth_type = ntohs(ib_spec->eth.val.ether_type); + } else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) || + (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) { + ip_spec_type = ib_spec->type; + } + ib_spec = (void *)ib_spec + ib_spec->size; + } + + type_valid = (!has_ethertype) || (!ip_spec_type); + if (!type_valid && mask_valid) { + ipv4_spec_valid = (eth_type == ETH_P_IP) && + (ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit)); + ipv6_spec_valid = (eth_type == ETH_P_IPV6) && + (ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit)); + + type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) || + (((eth_type == ETH_P_MPLS_UC) || + (eth_type == ETH_P_MPLS_MC)) && match_ipv); + } + + return type_valid; +} + +static bool is_valid_attr(struct mlx5_core_dev *mdev, + const struct ib_flow_attr *flow_attr) +{ + return is_valid_ethertype(mdev, flow_attr, false) && + is_valid_ethertype(mdev, flow_attr, true); +} + +static void put_flow_table(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *prio, bool ft_added) +{ + prio->refcount -= !!ft_added; + if (!prio->refcount) { + mlx5_destroy_flow_table(prio->flow_table); + prio->flow_table = NULL; + } +} + +static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) +{ + struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device); + struct mlx5_ib_flow_handler *handler = container_of(flow_id, + struct mlx5_ib_flow_handler, + ibflow); + struct mlx5_ib_flow_handler *iter, *tmp; + + mutex_lock(&dev->flow_db->lock); + + list_for_each_entry_safe(iter, tmp, &handler->list, list) { + mlx5_del_flow_rules(iter->rule); + put_flow_table(dev, iter->prio, true); + list_del(&iter->list); + kfree(iter); + } + + mlx5_del_flow_rules(handler->rule); + put_flow_table(dev, handler->prio, true); + mutex_unlock(&dev->flow_db->lock); + + kfree(handler); + + return 0; +} + +static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) +{ + priority *= 2; + if (!dont_trap) + priority++; + return priority; +} + +enum flow_table_type { + MLX5_IB_FT_RX, + MLX5_IB_FT_TX +}; + +#define MLX5_FS_MAX_TYPES 6 +#define MLX5_FS_MAX_ENTRIES BIT(16) +static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, + struct ib_flow_attr *flow_attr, + enum flow_table_type ft_type) +{ + bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_ib_flow_prio *prio; + struct mlx5_flow_table *ft; + int max_table_size; + int num_entries; + int num_groups; + int priority; + int err = 0; + + max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + log_max_ft_size)); + if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { + if (ft_type == MLX5_IB_FT_TX) + priority = 0; + else if (flow_is_multicast_only(flow_attr) && + !dont_trap) + priority = MLX5_IB_FLOW_MCAST_PRIO; + else + priority = ib_prio_to_core_prio(flow_attr->priority, + dont_trap); + ns = mlx5_get_flow_namespace(dev->mdev, + ft_type == MLX5_IB_FT_TX ? 
+ MLX5_FLOW_NAMESPACE_EGRESS : + MLX5_FLOW_NAMESPACE_BYPASS); + num_entries = MLX5_FS_MAX_ENTRIES; + num_groups = MLX5_FS_MAX_TYPES; + prio = &dev->flow_db->prios[priority]; + } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { + ns = mlx5_get_flow_namespace(dev->mdev, + MLX5_FLOW_NAMESPACE_LEFTOVERS); + build_leftovers_ft_param(&priority, + &num_entries, + &num_groups); + prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO]; + } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { + if (!MLX5_CAP_FLOWTABLE(dev->mdev, + allow_sniffer_and_nic_rx_shared_tir)) + return ERR_PTR(-ENOTSUPP); + + ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ? + MLX5_FLOW_NAMESPACE_SNIFFER_RX : + MLX5_FLOW_NAMESPACE_SNIFFER_TX); + + prio = &dev->flow_db->sniffer[ft_type]; + priority = 0; + num_entries = 1; + num_groups = 1; + } + + if (!ns) + return ERR_PTR(-ENOTSUPP); + + if (num_entries > max_table_size) + return ERR_PTR(-ENOMEM); + + ft = prio->flow_table; + if (!ft) { + ft = mlx5_create_auto_grouped_flow_table(ns, priority, + num_entries, + num_groups, + 0, 0); + + if (!IS_ERR(ft)) { + prio->refcount = 0; + prio->flow_table = ft; + } else { + err = PTR_ERR(ft); + } + } + + return err ? ERR_PTR(err) : prio; +} + +static void set_underlay_qp(struct mlx5_ib_dev *dev, + struct mlx5_flow_spec *spec, + u32 underlay_qpn) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, + spec->match_criteria, + misc_parameters); + void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + if (underlay_qpn && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + ft_field_support.bth_dst_qp)) { + MLX5_SET(fte_match_set_misc, + misc_params_v, bth_dst_qp, underlay_qpn); + MLX5_SET(fte_match_set_misc, + misc_params_c, bth_dst_qp, 0xffffff); + } +} + +static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst, + u32 underlay_qpn) +{ + struct mlx5_flow_table *ft = ft_prio->flow_table; + struct mlx5_ib_flow_handler *handler; + struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; + struct mlx5_flow_spec *spec; + struct mlx5_flow_destination *rule_dst = dst; + const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); + unsigned int spec_index; + int err = 0; + int dest_num = 1; + bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; + + if (!is_valid_attr(dev->mdev, flow_attr)) + return ERR_PTR(-EINVAL); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler || !spec) { + err = -ENOMEM; + goto free; + } + + INIT_LIST_HEAD(&handler->list); + + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + err = parse_flow_attr(dev->mdev, spec->match_criteria, + spec->match_value, + ib_flow, flow_attr, &flow_act); + if (err < 0) + goto free; + + ib_flow += ((union ib_flow_spec *)ib_flow)->size; + } + + if (!flow_is_multicast_only(flow_attr)) + set_underlay_qp(dev, spec, underlay_qpn); + + if (dev->rep) { + void *misc; + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_port, + dev->rep->vport); + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + } + + spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); + + if 
(is_egress && + !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) { + err = -EINVAL; + goto free; + } + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) { + rule_dst = NULL; + dest_num = 0; + } else { + if (is_egress) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; + else + flow_act.action |= + dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + } + + if (flow_act.has_flow_tag && + (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) { + mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n", + flow_act.flow_tag, flow_attr->type); + err = -EINVAL; + goto free; + } + handler->rule = mlx5_add_flow_rules(ft, spec, + &flow_act, + rule_dst, dest_num); + + if (IS_ERR(handler->rule)) { + err = PTR_ERR(handler->rule); + goto free; + } + + ft_prio->refcount++; + handler->prio = ft_prio; + + ft_prio->flow_table = ft; +free: + if (err) + kfree(handler); + kvfree(spec); + return err ? ERR_PTR(err) : handler; +} + +static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0); +} + +static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_dst = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + handler = create_flow_rule(dev, ft_prio, flow_attr, NULL); + if (!IS_ERR(handler)) { + handler_dst = create_flow_rule(dev, ft_prio, + flow_attr, dst); + if (IS_ERR(handler_dst)) { + mlx5_del_flow_rules(handler->rule); + ft_prio->refcount--; + kfree(handler); + handler = handler_dst; + } else { + list_add(&handler_dst->list, &handler->list); + } + } + + return handler; +} +enum { + LEFTOVERS_MC, + LEFTOVERS_UC, +}; + +static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_ucast = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + static struct { + struct ib_flow_attr flow_attr; + struct ib_flow_spec_eth eth_flow; + } leftovers_specs[] = { + [LEFTOVERS_MC] = { + .flow_attr = { + .num_of_specs = 1, + .size = sizeof(leftovers_specs[0]) + }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = {.dst_mac = {0x1} }, + .val = {.dst_mac = {0x1} } + } + }, + [LEFTOVERS_UC] = { + .flow_attr = { + .num_of_specs = 1, + .size = sizeof(leftovers_specs[0]) + }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = {.dst_mac = {0x1} }, + .val = {.dst_mac = {} } + } + } + }; + + handler = create_flow_rule(dev, ft_prio, + &leftovers_specs[LEFTOVERS_MC].flow_attr, + dst); + if (!IS_ERR(handler) && + flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { + handler_ucast = create_flow_rule(dev, ft_prio, + &leftovers_specs[LEFTOVERS_UC].flow_attr, + dst); + if (IS_ERR(handler_ucast)) { + mlx5_del_flow_rules(handler->rule); + ft_prio->refcount--; + kfree(handler); + handler = handler_ucast; + } else { + list_add(&handler_ucast->list, &handler->list); + } + } + + return handler; +} + +static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio 
*ft_rx, + struct mlx5_ib_flow_prio *ft_tx, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_rx; + struct mlx5_ib_flow_handler *handler_tx; + int err; + static const struct ib_flow_attr flow_attr = { + .num_of_specs = 0, + .size = sizeof(flow_attr) + }; + + handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst); + if (IS_ERR(handler_rx)) { + err = PTR_ERR(handler_rx); + goto err; + } + + handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst); + if (IS_ERR(handler_tx)) { + err = PTR_ERR(handler_tx); + goto err_tx; + } + + list_add(&handler_tx->list, &handler_rx->list); + + return handler_rx; + +err_tx: + mlx5_del_flow_rules(handler_rx->rule); + ft_rx->refcount--; + kfree(handler_rx); +err: + return ERR_PTR(err); +} + +static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_ib_flow_handler *handler = NULL; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_ib_flow_prio *ft_prio_tx = NULL; + struct mlx5_ib_flow_prio *ft_prio; + bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; + int err; + int underlay_qpn; + + if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) + return ERR_PTR(-ENOMEM); + + if (domain != IB_FLOW_DOMAIN_USER || + flow_attr->port > dev->num_ports || + (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP | + IB_FLOW_ATTR_FLAGS_EGRESS))) + return ERR_PTR(-EINVAL); + + if (is_egress && + (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) + return ERR_PTR(-EINVAL); + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + mutex_lock(&dev->flow_db->lock); + + ft_prio = get_flow_table(dev, flow_attr, + is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX); + if (IS_ERR(ft_prio)) { + err = PTR_ERR(ft_prio); + goto unlock; + } + if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { + ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX); + if (IS_ERR(ft_prio_tx)) { + err = PTR_ERR(ft_prio_tx); + ft_prio_tx = NULL; + goto destroy_ft; + } + } + + if (is_egress) { + dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT; + } else { + dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; + if (mqp->flags & MLX5_IB_QP_RSS) + dst->tir_num = mqp->rss_qp.tirn; + else + dst->tir_num = mqp->raw_packet_qp.rq.tirn; + } + + if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { + if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { + handler = create_dont_trap_rule(dev, ft_prio, + flow_attr, dst); + } else { + underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ? 
+ mqp->underlay_qpn : 0; + handler = _create_flow_rule(dev, ft_prio, flow_attr, + dst, underlay_qpn); + } + } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { + handler = create_leftovers_rule(dev, ft_prio, flow_attr, + dst); + } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { + handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst); + } else { + err = -EINVAL; + goto destroy_ft; + } + + if (IS_ERR(handler)) { + err = PTR_ERR(handler); + handler = NULL; + goto destroy_ft; + } + + mutex_unlock(&dev->flow_db->lock); + kfree(dst); + + return &handler->ibflow; + +destroy_ft: + put_flow_table(dev, ft_prio, false); + if (ft_prio_tx) + put_flow_table(dev, ft_prio_tx, false); +unlock: + mutex_unlock(&dev->flow_db->lock); + kfree(dst); + kfree(handler); + return ERR_PTR(err); +} + +static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags) +{ + u32 flags = 0; + + if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA) + flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA; + + return flags; +} + +#define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA +static struct ib_flow_action * +mlx5_ib_create_flow_action_esp(struct ib_device *device, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dev *mdev = to_mdev(device); + struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm; + struct mlx5_accel_esp_xfrm_attrs accel_attrs = {}; + struct mlx5_ib_flow_action *action; + u64 action_flags; + u64 flags; + int err = 0; + + if (IS_UVERBS_COPY_ERR(uverbs_copy_from(&action_flags, attrs, + MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS))) + return ERR_PTR(-EFAULT); + + if (action_flags >= (MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1)) + return ERR_PTR(-EOPNOTSUPP); + + flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags); + + /* We currently only support a subset of the standard features: only a + * keymat of type AES_GCM with icv_len == 16, iv_algo == SEQ and esn + * (with overlap) is supported. Full offload mode isn't supported.
+ */ + if (!attr->keymat || attr->replay || attr->encap || + attr->spi || attr->seq || attr->tfc_pad || + attr->hard_limit_pkts || + (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT))) + return ERR_PTR(-EOPNOTSUPP); + + if (attr->keymat->protocol != + IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM) + return ERR_PTR(-EOPNOTSUPP); + + aes_gcm = &attr->keymat->keymat.aes_gcm; + + if (aes_gcm->icv_len != 16 || + aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ) + return ERR_PTR(-EOPNOTSUPP); + + action = kmalloc(sizeof(*action), GFP_KERNEL); + if (!action) + return ERR_PTR(-ENOMEM); + + action->esp_aes_gcm.ib_flags = attr->flags; + memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key, + sizeof(accel_attrs.keymat.aes_gcm.aes_key)); + accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8; + memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt, + sizeof(accel_attrs.keymat.aes_gcm.salt)); + memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv, + sizeof(accel_attrs.keymat.aes_gcm.seq_iv)); + accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8; + accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ; + accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM; + + accel_attrs.esn = attr->esn; + if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED; + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT) + accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT; + + action->esp_aes_gcm.ctx = + mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags); + if (IS_ERR(action->esp_aes_gcm.ctx)) { + err = PTR_ERR(action->esp_aes_gcm.ctx); + goto err_parse; + } + + action->esp_aes_gcm.ib_flags = attr->flags; + + return &action->ib_action; + +err_parse: + kfree(action); + return ERR_PTR(err); +} + +static int +mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(action); + struct mlx5_accel_esp_xfrm_attrs accel_attrs; + int err = 0; + + if (attr->keymat || attr->replay || attr->encap || + attr->spi || attr->seq || attr->tfc_pad || + attr->hard_limit_pkts || + (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))) + return -EOPNOTSUPP; + + /* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can + * be modified. 
+ */ + if (!(maction->esp_aes_gcm.ib_flags & + IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) && + attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)) + return -EINVAL; + + memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs, + sizeof(accel_attrs)); + + accel_attrs.esn = attr->esn; + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + else + accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + + err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx, + &accel_attrs); + if (err) + return err; + + maction->esp_aes_gcm.ib_flags &= + ~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW; + maction->esp_aes_gcm.ib_flags |= + attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW; + + return 0; +} + +static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(action); + + switch (action->type) { + case IB_FLOW_ACTION_ESP: + /* + * We only support aes_gcm for now, so we implicitly know this is + * the underlying crypto. + */ + mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx); + break; + default: + WARN_ON(true); + break; + } + + kfree(maction); + return 0; +} + +static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *mqp = to_mqp(ibqp); + int err; + + if (mqp->flags & MLX5_IB_QP_UNDERLAY) { + mlx5_ib_dbg(dev, "Attaching a multicast group to underlay QP is not supported\n"); + return -EOPNOTSUPP; + } + + err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + + err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); + if (err) + mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int init_node_data(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc); + if (err) + return err; + + dev->mdev->rev_id = dev->mdev->pdev->revision; + + return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); +} + +static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); +} + +static ssize_t show_reg_pages(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + + return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->mdev->rev_id); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5_ib_dev *dev = + container_of(device, struct
mlx5_ib_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, + dev->mdev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); +static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); + +static struct device_attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_fw_pages, + &dev_attr_reg_pages, +}; + +static void pkey_change_handler(struct work_struct *work) +{ + struct mlx5_ib_port_resources *ports = + container_of(work, struct mlx5_ib_port_resources, + pkey_change_work); + + mutex_lock(&ports->devr->mutex); + mlx5_ib_gsi_pkey_change(ports->gsi); + mutex_unlock(&ports->devr->mutex); +} + +static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_qp *mqp; + struct mlx5_ib_cq *send_mcq, *recv_mcq; + struct mlx5_core_cq *mcq; + struct list_head cq_armed_list; + unsigned long flags_qp; + unsigned long flags_cq; + unsigned long flags; + + INIT_LIST_HEAD(&cq_armed_list); + + /* Go over the QP list residing on this ibdev, synced with QP create/destroy. */ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + /* At this point, all in-flight post-send requests have been queued for + * execution thanks to the lock/unlock cycles above; now all the involved + * CQs need to be armed.
+ */ + list_for_each_entry(mcq, &cq_armed_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); +} + +static void delay_drop_handler(struct work_struct *work) +{ + int err; + struct mlx5_ib_delay_drop *delay_drop = + container_of(work, struct mlx5_ib_delay_drop, + delay_drop_work); + + atomic_inc(&delay_drop->events_cnt); + + mutex_lock(&delay_drop->lock); + err = mlx5_core_set_delay_drop(delay_drop->dev->mdev, + delay_drop->timeout); + if (err) { + mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n", + delay_drop->timeout); + delay_drop->activate = false; + } + mutex_unlock(&delay_drop->lock); +} + +static void mlx5_ib_handle_event(struct work_struct *_work) +{ + struct mlx5_ib_event_work *work = + container_of(_work, struct mlx5_ib_event_work, work); + struct mlx5_ib_dev *ibdev; + struct ib_event ibev; + bool fatal = false; + u8 port = (u8)work->param; + + if (mlx5_core_is_mp_slave(work->dev)) { + ibdev = mlx5_ib_get_ibdev_from_mpi(work->context); + if (!ibdev) + goto out; + } else { + ibdev = work->context; + } + + switch (work->event) { + case MLX5_DEV_EVENT_SYS_ERROR: + ibev.event = IB_EVENT_DEVICE_FATAL; + mlx5_ib_handle_internal_error(ibdev); + fatal = true; + break; + + case MLX5_DEV_EVENT_PORT_UP: + case MLX5_DEV_EVENT_PORT_DOWN: + case MLX5_DEV_EVENT_PORT_INITIALIZED: + /* In RoCE, port up/down events are handled in + * mlx5_netdev_event(). + */ + if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == + IB_LINK_LAYER_ETHERNET) + goto out; + + ibev.event = (work->event == MLX5_DEV_EVENT_PORT_UP) ? + IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + break; + + case MLX5_DEV_EVENT_LID_CHANGE: + ibev.event = IB_EVENT_LID_CHANGE; + break; + + case MLX5_DEV_EVENT_PKEY_CHANGE: + ibev.event = IB_EVENT_PKEY_CHANGE; + schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); + break; + + case MLX5_DEV_EVENT_GUID_CHANGE: + ibev.event = IB_EVENT_GID_CHANGE; + break; + + case MLX5_DEV_EVENT_CLIENT_REREG: + ibev.event = IB_EVENT_CLIENT_REREGISTER; + break; + case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT: + schedule_work(&ibdev->delay_drop.delay_drop_work); + goto out; + default: + goto out; + } + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = port; + + if (!rdma_is_port_valid(&ibdev->ib_dev, port)) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); + goto out; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); + + if (fatal) + ibdev->ib_active = false; +out: + kfree(work); +} + +static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5_ib_event_work *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return; + + INIT_WORK(&work->work, mlx5_ib_handle_event); + work->dev = dev; + work->param = param; + work->context = context; + work->event = event; + + queue_work(mlx5_ib_event_wq, &work->work); +} + +static int set_has_smi_cap(struct mlx5_ib_dev *dev) +{ + struct mlx5_hca_vport_context vport_ctx; + int err; + int port; + + for (port = 1; port <= dev->num_ports; port++) { + dev->mdev->port_caps[port - 1].has_smi = false; + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) { + if (MLX5_CAP_GEN(dev->mdev, ib_virt)) { + err = mlx5_query_hca_vport_context(dev->mdev, 0, + port, 0, + &vport_ctx); + if (err) { + mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n", + port, err); + return err; + } + dev->mdev->port_caps[port - 1].has_smi = + vport_ctx.has_smi; + } else { + 
dev->mdev->port_caps[port - 1].has_smi = true; + } + } + } + return 0; +} + +static void get_ext_port_caps(struct mlx5_ib_dev *dev) +{ + int port; + + for (port = 1; port <= dev->num_ports; port++) + mlx5_query_ext_port_caps(dev, port); +} + +static int get_port_caps(struct mlx5_ib_dev *dev, u8 port) +{ + struct ib_device_attr *dprops = NULL; + struct ib_port_attr *pprops = NULL; + int err = -ENOMEM; + struct ib_udata uhw = {.inlen = 0, .outlen = 0}; + + pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); + if (!pprops) + goto out; + + dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); + if (!dprops) + goto out; + + err = set_has_smi_cap(dev); + if (err) + goto out; + + err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); + if (err) { + mlx5_ib_warn(dev, "query_device failed %d\n", err); + goto out; + } + + memset(pprops, 0, sizeof(*pprops)); + err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); + if (err) { + mlx5_ib_warn(dev, "query_port %d failed %d\n", + port, err); + goto out; + } + + dev->mdev->port_caps[port - 1].pkey_table_len = + dprops->max_pkeys; + dev->mdev->port_caps[port - 1].gid_table_len = + pprops->gid_tbl_len; + mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n", + port, dprops->max_pkeys, pprops->gid_tbl_len); + +out: + kfree(pprops); + kfree(dprops); + + return err; +} + +static void destroy_umrc_res(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_mr_cache_cleanup(dev); + if (err) + mlx5_ib_warn(dev, "mr cache cleanup failed\n"); + + if (dev->umrc.qp) + mlx5_ib_destroy_qp(dev->umrc.qp); + if (dev->umrc.cq) + ib_free_cq(dev->umrc.cq); + if (dev->umrc.pd) + ib_dealloc_pd(dev->umrc.pd); +} + +enum { + MAX_UMR_WR = 128, +}; + +static int create_umr_res(struct mlx5_ib_dev *dev) +{ + struct ib_qp_init_attr *init_attr = NULL; + struct ib_qp_attr *attr = NULL; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + int ret; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto error_0; + } + + pd = ib_alloc_pd(&dev->ib_dev, 0); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + ret = PTR_ERR(pd); + goto error_0; + } + + cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); + if (IS_ERR(cq)) { + mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); + ret = PTR_ERR(cq); + goto error_2; + } + + init_attr->send_cq = cq; + init_attr->recv_cq = cq; + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->cap.max_send_wr = MAX_UMR_WR; + init_attr->cap.max_send_sge = 1; + init_attr->qp_type = MLX5_IB_QPT_REG_UMR; + init_attr->port_num = 1; + qp = mlx5_ib_create_qp(pd, init_attr, NULL); + if (IS_ERR(qp)) { + mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); + ret = PTR_ERR(qp); + goto error_3; + } + qp->device = &dev->ib_dev; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = MLX5_IB_QPT_REG_UMR; + qp->send_cq = init_attr->send_cq; + qp->recv_cq = init_attr->recv_cq; + + attr->qp_state = IB_QPS_INIT; + attr->port_num = 1; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_PORT, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTR; + attr->path_mtu = IB_MTU_256; + + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTS; + ret = 
mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); + goto error_4; + } + + dev->umrc.qp = qp; + dev->umrc.cq = cq; + dev->umrc.pd = pd; + + sema_init(&dev->umrc.sem, MAX_UMR_WR); + ret = mlx5_mr_cache_init(dev); + if (ret) { + mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + goto error_4; + } + + kfree(attr); + kfree(init_attr); + + return 0; + +error_4: + mlx5_ib_destroy_qp(qp); + dev->umrc.qp = NULL; + +error_3: + ib_free_cq(cq); + dev->umrc.cq = NULL; + +error_2: + ib_dealloc_pd(pd); + dev->umrc.pd = NULL; + +error_0: + kfree(attr); + kfree(init_attr); + return ret; +} + +static u8 mlx5_get_umr_fence(u8 umr_fence_cap) +{ + switch (umr_fence_cap) { + case MLX5_CAP_UMR_FENCE_NONE: + return MLX5_FENCE_MODE_NONE; + case MLX5_CAP_UMR_FENCE_SMALL: + return MLX5_FENCE_MODE_INITIATOR_SMALL; + default: + return MLX5_FENCE_MODE_STRONG_ORDERING; + } +} + +static int create_dev_resources(struct mlx5_ib_resources *devr) +{ + struct ib_srq_init_attr attr; + struct mlx5_ib_dev *dev; + struct ib_cq_init_attr cq_attr = {.cqe = 1}; + int port; + int ret = 0; + + dev = container_of(devr, struct mlx5_ib_dev, devr); + + mutex_init(&devr->mutex); + + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->p0)) { + ret = PTR_ERR(devr->p0); + goto error0; + } + devr->p0->device = &dev->ib_dev; + devr->p0->uobject = NULL; + atomic_set(&devr->p0->usecnt, 0); + + devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL); + if (IS_ERR(devr->c0)) { + ret = PTR_ERR(devr->c0); + goto error1; + } + devr->c0->device = &dev->ib_dev; + devr->c0->uobject = NULL; + devr->c0->comp_handler = NULL; + devr->c0->event_handler = NULL; + devr->c0->cq_context = NULL; + atomic_set(&devr->c0->usecnt, 0); + + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x0)) { + ret = PTR_ERR(devr->x0); + goto error2; + } + devr->x0->device = &dev->ib_dev; + devr->x0->inode = NULL; + atomic_set(&devr->x0->usecnt, 0); + mutex_init(&devr->x0->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x0->tgt_qp_list); + + devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); + if (IS_ERR(devr->x1)) { + ret = PTR_ERR(devr->x1); + goto error3; + } + devr->x1->device = &dev->ib_dev; + devr->x1->inode = NULL; + atomic_set(&devr->x1->usecnt, 0); + mutex_init(&devr->x1->tgt_qp_mutex); + INIT_LIST_HEAD(&devr->x1->tgt_qp_list); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_XRC; + attr.ext.cq = devr->c0; + attr.ext.xrc.xrcd = devr->x0; + + devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s0)) { + ret = PTR_ERR(devr->s0); + goto error4; + } + devr->s0->device = &dev->ib_dev; + devr->s0->pd = devr->p0; + devr->s0->uobject = NULL; + devr->s0->event_handler = NULL; + devr->s0->srq_context = NULL; + devr->s0->srq_type = IB_SRQT_XRC; + devr->s0->ext.xrc.xrcd = devr->x0; + devr->s0->ext.cq = devr->c0; + atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); + atomic_inc(&devr->s0->ext.cq->usecnt); + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s0->usecnt, 0); + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_BASIC; + devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL); + if (IS_ERR(devr->s1)) { + ret = PTR_ERR(devr->s1); + goto error5; + } + devr->s1->device = &dev->ib_dev; + devr->s1->pd = devr->p0; + devr->s1->uobject = NULL; + devr->s1->event_handler = NULL; + devr->s1->srq_context = NULL; + devr->s1->srq_type = 
IB_SRQT_BASIC; + devr->s1->ext.cq = devr->c0; + atomic_inc(&devr->p0->usecnt); + atomic_set(&devr->s1->usecnt, 0); + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + devr->ports[port].devr = devr; + } + + return 0; + +error5: + mlx5_ib_destroy_srq(devr->s0); +error4: + mlx5_ib_dealloc_xrcd(devr->x1); +error3: + mlx5_ib_dealloc_xrcd(devr->x0); +error2: + mlx5_ib_destroy_cq(devr->c0); +error1: + mlx5_ib_dealloc_pd(devr->p0); +error0: + return ret; +} + +static void destroy_dev_resources(struct mlx5_ib_resources *devr) +{ + struct mlx5_ib_dev *dev = + container_of(devr, struct mlx5_ib_dev, devr); + int port; + + mlx5_ib_destroy_srq(devr->s1); + mlx5_ib_destroy_srq(devr->s0); + mlx5_ib_dealloc_xrcd(devr->x0); + mlx5_ib_dealloc_xrcd(devr->x1); + mlx5_ib_destroy_cq(devr->c0); + mlx5_ib_dealloc_pd(devr->p0); + + /* Make sure no change P_Key work items are still executing */ + for (port = 0; port < dev->num_ports; ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); +} + +static u32 get_core_cap_flags(struct ib_device *ibdev) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); + u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); + u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); + bool raw_support = !mlx5_core_mp_enabled(dev->mdev); + u32 ret = 0; + + if (ll == IB_LINK_LAYER_INFINIBAND) + return RDMA_CORE_PORT_IBA_IB; + + if (raw_support) + ret = RDMA_CORE_PORT_RAW_PACKET; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) + return ret; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) + return ret; + + if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE; + + if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + return ret; +} + +static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num); + int err; + + immutable->core_cap_flags = get_core_cap_flags(ibdev); + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = get_core_cap_flags(ibdev); + if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce)) + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static int mlx5_port_rep_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + + return 0; +} + +static void get_dev_fw_str(struct ib_device *ibdev, char *str) +{ + struct mlx5_ib_dev *dev = + container_of(ibdev, struct mlx5_ib_dev, ib_dev); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d", + fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), + fw_rev_sub(dev->mdev)); +} + +static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev, + MLX5_FLOW_NAMESPACE_LAG); + struct mlx5_flow_table *ft; + int err; + 
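+ /* Nothing to do unless the LAG flow namespace exists and LAG is active */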
+ if (!ns || !mlx5_lag_is_active(mdev)) + return 0; + + err = mlx5_cmd_create_vport_lag(mdev); + if (err) + return err; + + ft = mlx5_create_lag_demux_flow_table(ns, 0, 0); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto err_destroy_vport_lag; + } + + dev->flow_db->lag_demux_ft = ft; + return 0; + +err_destroy_vport_lag: + mlx5_cmd_destroy_vport_lag(mdev); + return err; +} + +static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + + if (dev->flow_db->lag_demux_ft) { + mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); + dev->flow_db->lag_demux_ft = NULL; + + mlx5_cmd_destroy_vport_lag(mdev); + } +} + +static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) +{ + int err; + + dev->roce[port_num].nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier(&dev->roce[port_num].nb); + if (err) { + dev->roce[port_num].nb.notifier_call = NULL; + return err; + } + + return 0; +} + +static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) +{ + if (dev->roce[port_num].nb.notifier_call) { + unregister_netdevice_notifier(&dev->roce[port_num].nb); + dev->roce[port_num].nb.notifier_call = NULL; + } +} + +static int mlx5_enable_eth(struct mlx5_ib_dev *dev, u8 port_num) +{ + int err; + + if (MLX5_CAP_GEN(dev->mdev, roce)) { + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + return err; + } + + err = mlx5_eth_lag_init(dev); + if (err) + goto err_disable_roce; + + return 0; + +err_disable_roce: + if (MLX5_CAP_GEN(dev->mdev, roce)) + mlx5_nic_vport_disable_roce(dev->mdev); + + return err; +} + +static void mlx5_disable_eth(struct mlx5_ib_dev *dev) +{ + mlx5_eth_lag_cleanup(dev); + if (MLX5_CAP_GEN(dev->mdev, roce)) + mlx5_nic_vport_disable_roce(dev->mdev); +} + +struct mlx5_ib_counter { + const char *name; + size_t offset; +}; + +#define INIT_Q_COUNTER(_name) \ + { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)} + +static const struct mlx5_ib_counter basic_q_cnts[] = { + INIT_Q_COUNTER(rx_write_requests), + INIT_Q_COUNTER(rx_read_requests), + INIT_Q_COUNTER(rx_atomic_requests), + INIT_Q_COUNTER(out_of_buffer), +}; + +static const struct mlx5_ib_counter out_of_seq_q_cnts[] = { + INIT_Q_COUNTER(out_of_sequence), +}; + +static const struct mlx5_ib_counter retrans_q_cnts[] = { + INIT_Q_COUNTER(duplicate_request), + INIT_Q_COUNTER(rnr_nak_retry_err), + INIT_Q_COUNTER(packet_seq_err), + INIT_Q_COUNTER(implied_nak_seq_err), + INIT_Q_COUNTER(local_ack_timeout_err), +}; + +#define INIT_CONG_COUNTER(_name) \ + { .name = #_name, .offset = \ + MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)} + +static const struct mlx5_ib_counter cong_cnts[] = { + INIT_CONG_COUNTER(rp_cnp_ignored), + INIT_CONG_COUNTER(rp_cnp_handled), + INIT_CONG_COUNTER(np_ecn_marked_roce_packets), + INIT_CONG_COUNTER(np_cnp_sent), +}; + +static const struct mlx5_ib_counter extended_err_cnts[] = { + INIT_Q_COUNTER(resp_local_length_error), + INIT_Q_COUNTER(resp_cqe_error), + INIT_Q_COUNTER(req_cqe_error), + INIT_Q_COUNTER(req_remote_invalid_request), + INIT_Q_COUNTER(req_remote_access_errors), + INIT_Q_COUNTER(resp_remote_access_errors), + INIT_Q_COUNTER(resp_cqe_flush_error), + INIT_Q_COUNTER(req_cqe_flush_error), +}; + +static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) +{ + int i; + + for (i = 0; i < dev->num_ports; i++) { + if (dev->port[i].cnts.set_id) + mlx5_core_dealloc_q_counter(dev->mdev, + dev->port[i].cnts.set_id); + kfree(dev->port[i].cnts.names); + kfree(dev->port[i].cnts.offsets); + } +} 
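+ +/* + * Per-port counters: the basic Q counters are always exposed, while the + * out-of-sequence, retransmission, extended-error and congestion counters + * are only added when the corresponding device capabilities are reported. + */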
+ +static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, + struct mlx5_ib_counters *cnts) +{ + u32 num_counters; + + num_counters = ARRAY_SIZE(basic_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) + num_counters += ARRAY_SIZE(out_of_seq_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) + num_counters += ARRAY_SIZE(retrans_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) + num_counters += ARRAY_SIZE(extended_err_cnts); + + cnts->num_q_counters = num_counters; + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + cnts->num_cong_counters = ARRAY_SIZE(cong_cnts); + num_counters += ARRAY_SIZE(cong_cnts); + } + + cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL); + if (!cnts->names) + return -ENOMEM; + + cnts->offsets = kcalloc(num_counters, + sizeof(cnts->offsets), GFP_KERNEL); + if (!cnts->offsets) + goto err_names; + + return 0; + +err_names: + kfree(cnts->names); + cnts->names = NULL; + return -ENOMEM; +} + +static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, + const char **names, + size_t *offsets) +{ + int i; + int j = 0; + + for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) { + names[j] = basic_q_cnts[i].name; + offsets[j] = basic_q_cnts[i].offset; + } + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) { + for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) { + names[j] = out_of_seq_q_cnts[i].name; + offsets[j] = out_of_seq_q_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { + for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) { + names[j] = retrans_q_cnts[i].name; + offsets[j] = retrans_q_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) { + for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) { + names[j] = extended_err_cnts[i].name; + offsets[j] = extended_err_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) { + names[j] = cong_cnts[i].name; + offsets[j] = cong_cnts[i].offset; + } + } +} + +static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) +{ + int err = 0; + int i; + + for (i = 0; i < dev->num_ports; i++) { + err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts); + if (err) + goto err_alloc; + + mlx5_ib_fill_counters(dev, dev->port[i].cnts.names, + dev->port[i].cnts.offsets); + + err = mlx5_core_alloc_q_counter(dev->mdev, + &dev->port[i].cnts.set_id); + if (err) { + mlx5_ib_warn(dev, + "couldn't allocate queue counter for port %d, err %d\n", + i + 1, err); + goto err_alloc; + } + dev->port[i].cnts.set_id_valid = true; + } + + return 0; + +err_alloc: + mlx5_ib_dealloc_counters(dev); + return err; +} + +static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_port *port = &dev->port[port_num - 1]; + + /* We support only per port stats */ + if (port_num == 0) + return NULL; + + return rdma_alloc_hw_stats_struct(port->cnts.names, + port->cnts.num_q_counters + + port->cnts.num_cong_counters, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, + struct mlx5_ib_port *port, + struct rdma_hw_stats *stats) +{ + int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); + void *out; + __be32 val; + int ret, i; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_q_counter(mdev, + port->cnts.set_id, 0, + out, outlen); + if (ret) + goto free; + + for (i = 0; i < 
port->cnts.num_q_counters; i++) { + val = *(__be32 *)(out + port->cnts.offsets[i]); + stats->value[i] = (u64)be32_to_cpu(val); + } + +free: + kvfree(out); + return ret; +} + +static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_port *port = &dev->port[port_num - 1]; + struct mlx5_core_dev *mdev; + int ret, num_counters; + u8 mdev_port_num; + + if (!stats) + return -EINVAL; + + num_counters = port->cnts.num_q_counters + port->cnts.num_cong_counters; + + /* q_counters are per IB device, query the master mdev */ + ret = mlx5_ib_query_q_counters(dev->mdev, port, stats); + if (ret) + return ret; + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, + &mdev_port_num); + if (!mdev) { + /* If the port is not affiliated yet, it's in the down + * state, which doesn't have any counters yet, so they + * would read as zero. No need to read from the HCA. + */ + goto done; + } + ret = mlx5_lag_query_cong_counters(dev->mdev, + stats->value + + port->cnts.num_q_counters, + port->cnts.num_cong_counters, + port->cnts.offsets + + port->cnts.num_q_counters); + + mlx5_ib_put_native_port_mdev(dev, port_num); + if (ret) + return ret; + } + +done: + return num_counters; +} + +static void mlx5_ib_free_rdma_netdev(struct net_device *netdev) +{ + return mlx5_rdma_netdev_free(netdev); +} + +static struct net_device* +mlx5_ib_alloc_rdma_netdev(struct ib_device *hca, + u8 port_num, + enum rdma_netdev_t type, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)) +{ + struct net_device *netdev; + struct rdma_netdev *rn; + + if (type != RDMA_NETDEV_IPOIB) + return ERR_PTR(-EOPNOTSUPP); + + netdev = mlx5_rdma_netdev_alloc(to_mdev(hca)->mdev, hca, + name, setup); + if (likely(!IS_ERR_OR_NULL(netdev))) { + rn = netdev_priv(netdev); + rn->free_rdma_netdev = mlx5_ib_free_rdma_netdev; + } + return netdev; +} + +static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + if (!dev->delay_drop.dbg) + return; + debugfs_remove_recursive(dev->delay_drop.dbg->dir_debugfs); + kfree(dev->delay_drop.dbg); + dev->delay_drop.dbg = NULL; +} + +static void cancel_delay_drop(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + cancel_work_sync(&dev->delay_drop.delay_drop_work); + delay_drop_debugfs_cleanup(dev); +} + +static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_delay_drop *delay_drop = filp->private_data; + char lbuf[20]; + int len; + + len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout); + return simple_read_from_buffer(buf, count, pos, lbuf, len); +} + +static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_delay_drop *delay_drop = filp->private_data; + u32 timeout; + u32 var; + + if (kstrtouint_from_user(buf, count, 0, &var)) + return -EFAULT; + + timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS * + 1000); + if (timeout != var) + mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n", + timeout); + + delay_drop->timeout = timeout; + + return count; +} + +static const struct file_operations fops_delay_drop_timeout = { + .owner = THIS_MODULE, + .open = simple_open, + .write = delay_drop_timeout_write, + .read = delay_drop_timeout_read, +}; + +static int
delay_drop_debugfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_dbg_delay_drop *dbg; + + if (!mlx5_debugfs_root) + return 0; + + dbg = kzalloc(sizeof(*dbg), GFP_KERNEL); + if (!dbg) + return -ENOMEM; + + dev->delay_drop.dbg = dbg; + + dbg->dir_debugfs = + debugfs_create_dir("delay_drop", + dev->mdev->priv.dbg_root); + if (!dbg->dir_debugfs) + goto out_debugfs; + + dbg->events_cnt_debugfs = + debugfs_create_atomic_t("num_timeout_events", 0400, + dbg->dir_debugfs, + &dev->delay_drop.events_cnt); + if (!dbg->events_cnt_debugfs) + goto out_debugfs; + + dbg->rqs_cnt_debugfs = + debugfs_create_atomic_t("num_rqs", 0400, + dbg->dir_debugfs, + &dev->delay_drop.rqs_cnt); + if (!dbg->rqs_cnt_debugfs) + goto out_debugfs; + + dbg->timeout_debugfs = + debugfs_create_file("timeout", 0600, + dbg->dir_debugfs, + &dev->delay_drop, + &fops_delay_drop_timeout); + if (!dbg->timeout_debugfs) + goto out_debugfs; + + return 0; + +out_debugfs: + delay_drop_debugfs_cleanup(dev); + return -ENOMEM; +} + +static void init_delay_drop(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + mutex_init(&dev->delay_drop.lock); + dev->delay_drop.dev = dev; + dev->delay_drop.activate = false; + dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000; + INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler); + atomic_set(&dev->delay_drop.rqs_cnt, 0); + atomic_set(&dev->delay_drop.events_cnt, 0); + + if (delay_drop_debugfs_init(dev)) + mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n"); +} + +static const struct cpumask * +mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + return mlx5_get_vector_affinity_hint(dev->mdev, comp_vector); +} + +/* The mlx5_ib_multiport_mutex should be held when calling this function */ +static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, + struct mlx5_ib_multiport_info *mpi) +{ + u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + struct mlx5_ib_port *port = &ibdev->port[port_num]; + int comps; + int err; + int i; + + mlx5_ib_cleanup_cong_debugfs(ibdev, port_num); + + spin_lock(&port->mp.mpi_lock); + if (!mpi->ibdev) { + spin_unlock(&port->mp.mpi_lock); + return; + } + mpi->ibdev = NULL; + + spin_unlock(&port->mp.mpi_lock); + mlx5_remove_netdev_notifier(ibdev, port_num); + spin_lock(&port->mp.mpi_lock); + + comps = mpi->mdev_refcnt; + if (comps) { + mpi->unaffiliate = true; + init_completion(&mpi->unref_comp); + spin_unlock(&port->mp.mpi_lock); + + for (i = 0; i < comps; i++) + wait_for_completion(&mpi->unref_comp); + + spin_lock(&port->mp.mpi_lock); + mpi->unaffiliate = false; + } + + port->mp.mpi = NULL; + + list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); + + spin_unlock(&port->mp.mpi_lock); + + err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev); + + mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1); + /* Log an error, still needed to cleanup the pointers and add + * it back to the list. 
+ */ + if (err) + mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n", + port_num + 1); + + ibdev->roce[port_num].last_port_state = IB_PORT_DOWN; +} + +/* The mlx5_ib_multiport_mutex should be held when calling this function */ +static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, + struct mlx5_ib_multiport_info *mpi) +{ + u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + int err; + + spin_lock(&ibdev->port[port_num].mp.mpi_lock); + if (ibdev->port[port_num].mp.mpi) { + mlx5_ib_warn(ibdev, "port %d already affiliated.\n", + port_num + 1); + spin_unlock(&ibdev->port[port_num].mp.mpi_lock); + return false; + } + + ibdev->port[port_num].mp.mpi = mpi; + mpi->ibdev = ibdev; + spin_unlock(&ibdev->port[port_num].mp.mpi_lock); + + err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev); + if (err) + goto unbind; + + err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev)); + if (err) + goto unbind; + + err = mlx5_add_netdev_notifier(ibdev, port_num); + if (err) { + mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n", + port_num + 1); + goto unbind; + } + + err = mlx5_ib_init_cong_debugfs(ibdev, port_num); + if (err) + goto unbind; + + return true; + +unbind: + mlx5_ib_unbind_slave_port(ibdev, mpi); + return false; +} + +static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) +{ + int port_num = mlx5_core_native_port_num(dev->mdev) - 1; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, + port_num + 1); + struct mlx5_ib_multiport_info *mpi; + int err; + int i; + + if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return 0; + + err = mlx5_query_nic_vport_system_image_guid(dev->mdev, + &dev->sys_image_guid); + if (err) + return err; + + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + return err; + + mutex_lock(&mlx5_ib_multiport_mutex); + for (i = 0; i < dev->num_ports; i++) { + bool bound = false; + + /* build a stub multiport info struct for the native port. 
*/ + if (i == port_num) { + mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); + if (!mpi) { + mutex_unlock(&mlx5_ib_multiport_mutex); + mlx5_nic_vport_disable_roce(dev->mdev); + return -ENOMEM; + } + + mpi->is_master = true; + mpi->mdev = dev->mdev; + mpi->sys_image_guid = dev->sys_image_guid; + dev->port[i].mp.mpi = mpi; + mpi->ibdev = dev; + mpi = NULL; + continue; + } + + list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, + list) { + if (dev->sys_image_guid == mpi->sys_image_guid && + (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + bound = mlx5_ib_bind_slave_port(dev, mpi); + } + + if (bound) { + dev_dbg(&mpi->mdev->pdev->dev, "removing port from unaffiliated list.\n"); + mlx5_ib_dbg(dev, "port %d bound\n", i + 1); + list_del(&mpi->list); + break; + } + } + if (!bound) { + get_port_caps(dev, i + 1); + mlx5_ib_dbg(dev, "no free port found for port %d\n", + i + 1); + } + } + + list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list); + mutex_unlock(&mlx5_ib_multiport_mutex); + return err; +} + +static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) +{ + int port_num = mlx5_core_native_port_num(dev->mdev) - 1; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, + port_num + 1); + int i; + + if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return; + + mutex_lock(&mlx5_ib_multiport_mutex); + for (i = 0; i < dev->num_ports; i++) { + if (dev->port[i].mp.mpi) { + /* Destroy the native port stub */ + if (i == port_num) { + kfree(dev->port[i].mp.mpi); + dev->port[i].mp.mpi = NULL; + } else { + mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1); + mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi); + } + } + } + + mlx5_ib_dbg(dev, "removing from devlist\n"); + list_del(&dev->ib_dev_list); + mutex_unlock(&mlx5_ib_multiport_mutex); + + mlx5_nic_vport_disable_roce(dev->mdev); +} + +ADD_UVERBS_ATTRIBUTES_SIMPLE(mlx5_ib_dm, UVERBS_OBJECT_DM, + UVERBS_METHOD_DM_ALLOC, + &UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + UVERBS_ATTR_TYPE(u16), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +ADD_UVERBS_ATTRIBUTES_SIMPLE(mlx5_ib_flow_action, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + &UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, + UVERBS_ATTR_TYPE(u64), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +#define NUM_TREES 2 +static int populate_specs_root(struct mlx5_ib_dev *dev) +{ + const struct uverbs_object_tree_def *default_root[NUM_TREES + 1] = { + uverbs_default_get_objects()}; + size_t num_trees = 1; + + if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE && + !WARN_ON(num_trees >= ARRAY_SIZE(default_root))) + default_root[num_trees++] = &mlx5_ib_flow_action; + + if (MLX5_CAP_DEV_MEM(dev->mdev, memic) && + !WARN_ON(num_trees >= ARRAY_SIZE(default_root))) + default_root[num_trees++] = &mlx5_ib_dm; + + dev->ib_dev.specs_root = + uverbs_alloc_spec_tree(num_trees, default_root); + + return PTR_ERR_OR_ZERO(dev->ib_dev.specs_root); +} + +static void depopulate_specs_root(struct mlx5_ib_dev *dev) +{ + uverbs_free_spec_tree(dev->ib_dev.specs_root); +} + +void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_cleanup_multiport_master(dev); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + cleanup_srcu_struct(&dev->mr_srcu); +#endif + kfree(dev->port); +} + +int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = 
dev->mdev; + const char *name; + int err; + int i; + + dev->port = kcalloc(dev->num_ports, sizeof(*dev->port), + GFP_KERNEL); + if (!dev->port) + return -ENOMEM; + + for (i = 0; i < dev->num_ports; i++) { + spin_lock_init(&dev->port[i].mp.mpi_lock); + rwlock_init(&dev->roce[i].netdev_lock); + } + + err = mlx5_ib_init_multiport_master(dev); + if (err) + goto err_free_port; + + if (!mlx5_core_mp_enabled(mdev)) { + for (i = 1; i <= dev->num_ports; i++) { + err = get_port_caps(dev, i); + if (err) + break; + } + } else { + err = get_port_caps(dev, mlx5_core_native_port_num(mdev)); + } + if (err) + goto err_mp; + + if (mlx5_use_mad_ifc(dev)) + get_ext_port_caps(dev); + + if (!mlx5_lag_is_active(mdev)) + name = "mlx5_%d"; + else + name = "mlx5_bond_%d"; + + strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; + dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.num_comp_vectors = + dev->mdev->priv.eq_table.num_comp_vectors; + dev->ib_dev.dev.parent = &mdev->pdev->dev; + + mutex_init(&dev->cap_mask_mutex); + INIT_LIST_HEAD(&dev->qp_list); + spin_lock_init(&dev->reset_flow_resource_lock); + + spin_lock_init(&dev->memic.memic_lock); + dev->memic.dev = mdev; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + err = init_srcu_struct(&dev->mr_srcu); + if (err) + goto err_free_port; +#endif + + return 0; +err_mp: + mlx5_ib_cleanup_multiport_master(dev); + +err_free_port: + kfree(dev->port); + + return -ENOMEM; +} + +static int mlx5_ib_stage_flow_db_init(struct mlx5_ib_dev *dev) +{ + dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL); + + if (!dev->flow_db) + return -ENOMEM; + + mutex_init(&dev->flow_db->lock); + + return 0; +} + +int mlx5_ib_stage_rep_flow_db_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_dev *nic_dev; + + nic_dev = mlx5_ib_get_uplink_ibdev(dev->mdev->priv.eswitch); + + if (!nic_dev) + return -EINVAL; + + dev->flow_db = nic_dev->flow_db; + + return 0; +} + +static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev) +{ + kfree(dev->flow_db); +} + +int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + int err; + + dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + dev->ib_dev.uverbs_ex_cmd_mask = + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | + (1ull << 
IB_USER_VERBS_EX_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ); + + dev->ib_dev.query_device = mlx5_ib_query_device; + dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; + dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.add_gid = mlx5_ib_add_gid; + dev->ib_dev.del_gid = mlx5_ib_del_gid; + dev->ib_dev.query_pkey = mlx5_ib_query_pkey; + dev->ib_dev.modify_device = mlx5_ib_modify_device; + dev->ib_dev.modify_port = mlx5_ib_modify_port; + dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; + dev->ib_dev.mmap = mlx5_ib_mmap; + dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; + dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; + dev->ib_dev.create_ah = mlx5_ib_create_ah; + dev->ib_dev.query_ah = mlx5_ib_query_ah; + dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; + dev->ib_dev.create_srq = mlx5_ib_create_srq; + dev->ib_dev.modify_srq = mlx5_ib_modify_srq; + dev->ib_dev.query_srq = mlx5_ib_query_srq; + dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; + dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; + dev->ib_dev.create_qp = mlx5_ib_create_qp; + dev->ib_dev.modify_qp = mlx5_ib_modify_qp; + dev->ib_dev.query_qp = mlx5_ib_query_qp; + dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; + dev->ib_dev.post_send = mlx5_ib_post_send; + dev->ib_dev.post_recv = mlx5_ib_post_recv; + dev->ib_dev.create_cq = mlx5_ib_create_cq; + dev->ib_dev.modify_cq = mlx5_ib_modify_cq; + dev->ib_dev.resize_cq = mlx5_ib_resize_cq; + dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; + dev->ib_dev.poll_cq = mlx5_ib_poll_cq; + dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; + dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; + dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.rereg_user_mr = mlx5_ib_rereg_user_mr; + dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; + dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; + dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; + dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.alloc_mr = mlx5_ib_alloc_mr; + dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg; + dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; + dev->ib_dev.get_dev_fw_str = get_dev_fw_str; + dev->ib_dev.get_vector_affinity = mlx5_ib_get_vector_affinity; + if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) + dev->ib_dev.alloc_rdma_netdev = mlx5_ib_alloc_rdma_netdev; + + if (mlx5_core_is_pf(mdev)) { + dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config; + dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state; + dev->ib_dev.get_vf_stats = mlx5_ib_get_vf_stats; + dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid; + } + + dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext; + + dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence)); + + if (MLX5_CAP_GEN(mdev, imaicl)) { + dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw; + dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + + if (MLX5_CAP_GEN(mdev, xrc)) { + dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; + dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); + } + + if (MLX5_CAP_DEV_MEM(mdev, memic)) { + dev->ib_dev.alloc_dm = mlx5_ib_alloc_dm; + dev->ib_dev.dealloc_dm = mlx5_ib_dealloc_dm; + dev->ib_dev.reg_dm_mr = mlx5_ib_reg_dm_mr; + } + + dev->ib_dev.create_flow = mlx5_ib_create_flow; + 
dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; + dev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + dev->ib_dev.create_flow_action_esp = mlx5_ib_create_flow_action_esp; + dev->ib_dev.destroy_flow_action = mlx5_ib_destroy_flow_action; + dev->ib_dev.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp; + dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; + + err = init_node_data(dev); + if (err) + return err; + + if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && + (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) || + MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) + mutex_init(&dev->lb_mutex); + + return 0; +} + +static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev) +{ + dev->ib_dev.get_port_immutable = mlx5_port_immutable; + dev->ib_dev.query_port = mlx5_ib_query_port; + + return 0; +} + +int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev) +{ + dev->ib_dev.get_port_immutable = mlx5_port_rep_immutable; + dev->ib_dev.query_port = mlx5_ib_rep_query_port; + + return 0; +} + +static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev, + u8 port_num) +{ + int i; + + for (i = 0; i < dev->num_ports; i++) { + dev->roce[i].dev = dev; + dev->roce[i].native_port_num = i + 1; + dev->roce[i].last_port_state = IB_PORT_DOWN; + } + + dev->ib_dev.get_netdev = mlx5_ib_get_netdev; + dev->ib_dev.create_wq = mlx5_ib_create_wq; + dev->ib_dev.modify_wq = mlx5_ib_modify_wq; + dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; + dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; + dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; + + dev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); + + return mlx5_add_netdev_notifier(dev, port_num); +} + +static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev) +{ + u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1; + + mlx5_remove_netdev_notifier(dev, port_num); +} + +int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + int err = 0; + u8 port_num; + + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (ll == IB_LINK_LAYER_ETHERNET) + err = mlx5_ib_stage_common_roce_init(dev, port_num); + + return err; +} + +void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_stage_common_roce_cleanup(dev); +} + +static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + u8 port_num; + int err; + + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (ll == IB_LINK_LAYER_ETHERNET) { + err = mlx5_ib_stage_common_roce_init(dev, port_num); + if (err) + return err; + + err = mlx5_enable_eth(dev, port_num); + if (err) + goto cleanup; + } + + return 0; +cleanup: + mlx5_ib_stage_common_roce_cleanup(dev); + + return err; +} + +static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + u8 
port_num; + + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (ll == IB_LINK_LAYER_ETHERNET) { + mlx5_disable_eth(dev); + mlx5_ib_stage_common_roce_cleanup(dev); + } +} + +int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev) +{ + return create_dev_resources(&dev->devr); +} + +void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev) +{ + destroy_dev_resources(&dev->devr); +} + +static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev) +{ + mlx5_ib_internal_fill_odp_caps(dev); + + return mlx5_ib_odp_init_one(dev); +} + +int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { + dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats; + dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats; + + return mlx5_ib_alloc_counters(dev); + } + + return 0; +} + +void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) + mlx5_ib_dealloc_counters(dev); +} + +static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev) +{ + return mlx5_ib_init_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); +} + +static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_cleanup_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); +} + +static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev) +{ + dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); + return PTR_ERR_OR_ZERO(dev->mdev->priv.uar); +} + +static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); +} + +int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); + if (err) + return err; + + err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true); + if (err) + mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + + return err; +} + +void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + mlx5_free_bfreg(dev->mdev, &dev->bfreg); +} + +static int mlx5_ib_stage_populate_specs(struct mlx5_ib_dev *dev) +{ + return populate_specs_root(dev); +} + +int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) +{ + return ib_register_device(&dev->ib_dev, NULL); +} + +static void mlx5_ib_stage_depopulate_specs(struct mlx5_ib_dev *dev) +{ + depopulate_specs_root(dev); +} + +void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) +{ + destroy_umrc_res(dev); +} + +void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) +{ + ib_unregister_device(&dev->ib_dev); +} + +int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) +{ + return create_umr_res(dev); +} + +static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev) +{ + init_delay_drop(dev); + + return 0; +} + +static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) +{ + cancel_delay_drop(dev); +} + +int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev) +{ + int err; + int i; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + err = device_create_file(&dev->ib_dev.dev, + mlx5_class_attributes[i]); + if (err) + return err; + } + + return 0; +} + +static int mlx5_ib_stage_rep_reg_init(struct mlx5_ib_dev *dev) +{ + mlx5_ib_register_vport_reps(dev); + + return 0; +} + +static void mlx5_ib_stage_rep_reg_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_unregister_vport_reps(dev); +} + +void __mlx5_ib_remove(struct mlx5_ib_dev *dev, 
+ const struct mlx5_ib_profile *profile, + int stage) +{ + /* Number of stages to cleanup */ + while (stage) { + stage--; + if (profile->stage[stage].cleanup) + profile->stage[stage].cleanup(dev); + } + + ib_dealloc_device((struct ib_device *)dev); +} + +static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num); + +void *__mlx5_ib_add(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile) +{ + int err; + int i; + + printk_once(KERN_INFO "%s", mlx5_version); + + for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { + if (profile->stage[i].init) { + err = profile->stage[i].init(dev); + if (err) + goto err_out; + } + } + + dev->profile = profile; + dev->ib_active = true; + + return dev; + +err_out: + __mlx5_ib_remove(dev, profile, i); + + return NULL; +} + +static const struct mlx5_ib_profile pf_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB, + mlx5_ib_stage_flow_db_init, + mlx5_ib_stage_flow_db_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_ROCE, + mlx5_ib_stage_roce_init, + mlx5_ib_stage_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_stage_dev_res_init, + mlx5_ib_stage_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_ODP, + mlx5_ib_stage_odp_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, + mlx5_ib_stage_counters_init, + mlx5_ib_stage_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, + mlx5_ib_stage_cong_debugfs_init, + mlx5_ib_stage_cong_debugfs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_UAR, + mlx5_ib_stage_uar_init, + mlx5_ib_stage_uar_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, + NULL, + mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_SPECS, + mlx5_ib_stage_populate_specs, + mlx5_ib_stage_depopulate_specs), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, + mlx5_ib_stage_post_ib_reg_umr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, + mlx5_ib_stage_delay_drop_init, + mlx5_ib_stage_delay_drop_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, + mlx5_ib_stage_class_attr_init, + NULL), +}; + +static const struct mlx5_ib_profile nic_rep_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB, + mlx5_ib_stage_flow_db_init, + mlx5_ib_stage_flow_db_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_rep_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_ROCE, + mlx5_ib_stage_rep_roce_init, + mlx5_ib_stage_rep_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_stage_dev_res_init, + mlx5_ib_stage_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, + mlx5_ib_stage_counters_init, + mlx5_ib_stage_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_UAR, + mlx5_ib_stage_uar_init, + mlx5_ib_stage_uar_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, + NULL, + mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_SPECS, + mlx5_ib_stage_populate_specs, + 
mlx5_ib_stage_depopulate_specs), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, + mlx5_ib_stage_post_ib_reg_umr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, + mlx5_ib_stage_class_attr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_REP_REG, + mlx5_ib_stage_rep_reg_init, + mlx5_ib_stage_rep_reg_cleanup), +}; + +static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num) +{ + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_dev *dev; + bool bound = false; + int err; + + mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); + if (!mpi) + return NULL; + + mpi->mdev = mdev; + + err = mlx5_query_nic_vport_system_image_guid(mdev, + &mpi->sys_image_guid); + if (err) { + kfree(mpi); + return NULL; + } + + mutex_lock(&mlx5_ib_multiport_mutex); + list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { + if (dev->sys_image_guid == mpi->sys_image_guid) + bound = mlx5_ib_bind_slave_port(dev, mpi); + + if (bound) { + rdma_roce_rescan_device(&dev->ib_dev); + break; + } + } + + if (!bound) { + list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); + dev_dbg(&mdev->pdev->dev, "no suitable IB device found to bind to, added to unaffiliated list.\n"); + } else { + mlx5_ib_dbg(dev, "bound port %u\n", port_num + 1); + } + mutex_unlock(&mlx5_ib_multiport_mutex); + + return mpi; +} + +static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +{ + enum rdma_link_layer ll; + struct mlx5_ib_dev *dev; + int port_type_cap; + + printk_once(KERN_INFO "%s", mlx5_version); + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET) { + u8 port_num = mlx5_core_native_port_num(mdev) - 1; + + return mlx5_ib_add_slave_port(mdev, port_num); + } + + dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) + return NULL; + + dev->mdev = mdev; + dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports), + MLX5_CAP_GEN(mdev, num_vhca_ports)); + + if (MLX5_VPORT_MANAGER(mdev) && + mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) { + dev->rep = mlx5_ib_vport_rep(mdev->priv.eswitch, 0); + + return __mlx5_ib_add(dev, &nic_rep_profile); + } + + return __mlx5_ib_add(dev, &pf_profile); +} + +static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) +{ + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_dev *dev; + + if (mlx5_core_is_mp_slave(mdev)) { + mpi = context; + mutex_lock(&mlx5_ib_multiport_mutex); + if (mpi->ibdev) + mlx5_ib_unbind_slave_port(mpi->ibdev, mpi); + list_del(&mpi->list); + mutex_unlock(&mlx5_ib_multiport_mutex); + return; + } + + dev = context; + __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); +} + +static struct mlx5_interface mlx5_ib_interface = { + .add = mlx5_ib_add, + .remove = mlx5_ib_remove, + .event = mlx5_ib_event, +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + .pfault = mlx5_ib_pfault, +#endif + .protocol = MLX5_INTERFACE_PROTOCOL_IB, +}; + +unsigned long mlx5_ib_get_xlt_emergency_page(void) +{ + mutex_lock(&xlt_emergency_page_mutex); + return xlt_emergency_page; +} + +void mlx5_ib_put_xlt_emergency_page(void) +{ + mutex_unlock(&xlt_emergency_page_mutex); +} + +static int __init mlx5_ib_init(void) +{ + int err; + + xlt_emergency_page = __get_free_page(GFP_KERNEL); + if (!xlt_emergency_page) + return -ENOMEM; + + mutex_init(&xlt_emergency_page_mutex); + + mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0); + if (!mlx5_ib_event_wq) 
{ + free_page(xlt_emergency_page); + return -ENOMEM; + } + + mlx5_ib_odp_init(); + + err = mlx5_register_interface(&mlx5_ib_interface); + + return err; +} + +static void __exit mlx5_ib_cleanup(void) +{ + mlx5_unregister_interface(&mlx5_ib_interface); + destroy_workqueue(mlx5_ib_event_wq); + mutex_destroy(&xlt_emergency_page_mutex); + free_page(xlt_emergency_page); +} + +module_init(mlx5_ib_init); +module_exit(mlx5_ib_cleanup); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c new file mode 100644 index 0000000..f3dbd75 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx5_ib.h" + +/* @umem: umem object to scan + * @addr: ib virtual address requested by the user + * @max_page_shift: high limit for page_shift - 0 means no limit + * @count: number of PAGE_SIZE pages covered by umem + * @shift: page shift for the compound pages found in the region + * @ncont: number of compund pages + * @order: log2 of the number of compound pages + */ +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, + unsigned long max_page_shift, + int *count, int *shift, + int *ncont, int *order) +{ + unsigned long tmp; + unsigned long m; + u64 base = ~0, p = 0; + u64 len, pfn; + int i = 0; + struct scatterlist *sg; + int entry; + unsigned long page_shift = umem->page_shift; + + if (umem->odp_data) { + *ncont = ib_umem_page_count(umem); + *count = *ncont << (page_shift - PAGE_SHIFT); + *shift = page_shift; + if (order) + *order = ilog2(roundup_pow_of_two(*ncont)); + + return; + } + + addr = addr >> page_shift; + tmp = (unsigned long)addr; + m = find_first_bit(&tmp, BITS_PER_LONG); + if (max_page_shift) + m = min_t(unsigned long, max_page_shift - page_shift, m); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> page_shift; + pfn = sg_dma_address(sg) >> page_shift; + if (base + p != pfn) { + /* If either the offset or the new + * base are unaligned update m + */ + tmp = (unsigned long)(pfn | p); + if (!IS_ALIGNED(tmp, 1 << m)) + m = find_first_bit(&tmp, BITS_PER_LONG); + + base = pfn; + p = 0; + } + + p += len; + i += len; + } + + if (i) { + m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m); + + if (order) + *order = ilog2(roundup_pow_of_two(i) >> m); + + *ncont = DIV_ROUND_UP(i, (1 << m)); + } else { + m = 0; + + if (order) + *order = 0; + + *ncont = 0; + } + *shift = page_shift + m; + *count = i; +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static u64 umem_dma_to_mtt(dma_addr_t umem_dma) +{ + u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; + + if (umem_dma & ODP_READ_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_READ; + if (umem_dma & ODP_WRITE_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_WRITE; + + return mtt_entry; +} +#endif + +/* + * Populate the given array with bus addresses from the umem. + * + * dev - mlx5_ib device + * umem - umem to use to fill the pages + * page_shift - determines the page size used in the resulting array + * offset - offset into the umem to start from, + * only implemented for ODP umems + * num_pages - total number of pages to fill + * pas - bus addresses array to fill + * access_flags - access flags to set on all present pages. + use enum mlx5_ib_mtt_access_flags for this. 
+ */ +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags) +{ + unsigned long umem_page_shift = umem->page_shift; + int shift = page_shift - umem_page_shift; + int mask = (1 << shift) - 1; + int i, k, idx; + u64 cur = 0; + u64 base; + int len; + struct scatterlist *sg; + int entry; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + const bool odp = umem->odp_data != NULL; + + if (odp) { + WARN_ON(shift != 0); + WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); + + for (i = 0; i < num_pages; ++i) { + dma_addr_t pa = umem->odp_data->dma_list[offset + i]; + + pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + } + return; + } +#endif + + i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> umem_page_shift; + base = sg_dma_address(sg); + + /* Skip elements below offset */ + if (i + len < offset << shift) { + i += len; + continue; + } + + /* Skip pages below offset */ + if (i < offset << shift) { + k = (offset << shift) - i; + i = offset << shift; + } else { + k = 0; + } + + for (; k < len; k++) { + if (!(i & mask)) { + cur = base + (k << umem_page_shift); + cur |= access_flags; + idx = (i >> shift) - offset; + + pas[idx] = cpu_to_be64(cur); + mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", + i >> shift, be64_to_cpu(pas[idx])); + } + i++; + + /* Stop after num_pages reached */ + if (i >> shift >= offset + num_pages) + return; + } + } +} + +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags) +{ + return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, + ib_umem_num_pages(umem), pas, + access_flags); +} +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) +{ + u64 page_size; + u64 page_mask; + u64 off_size; + u64 off_mask; + u64 buf_off; + + page_size = (u64)1 << page_shift; + page_mask = page_size - 1; + buf_off = addr & page_mask; + off_size = page_size >> 6; + off_mask = off_size - 1; + + if (buf_off & off_mask) + return -EINVAL; + + *offset = buf_off >> ilog2(off_size); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h new file mode 100644 index 0000000..49a1aa0 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -0,0 +1,1285 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_H +#define MLX5_IB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define mlx5_ib_dbg(dev, format, arg...) \ +pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_err(dev, format, arg...) \ +pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_warn(dev, format, arg...) \ +pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) + +#define field_avail(type, fld, sz) (offsetof(type, fld) + \ + sizeof(((type *)0)->fld) <= (sz)) +#define MLX5_IB_DEFAULT_UIDX 0xffffff +#define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) + +#define MLX5_MKEY_PAGE_SHIFT_MASK __mlx5_mask(mkc, log_page_size) + +enum { + MLX5_IB_MMAP_CMD_SHIFT = 8, + MLX5_IB_MMAP_CMD_MASK = 0xff, +}; + +enum { + MLX5_RES_SCAT_DATA32_CQE = 0x1, + MLX5_RES_SCAT_DATA64_CQE = 0x2, + MLX5_REQ_SCAT_DATA32_CQE = 0x11, + MLX5_REQ_SCAT_DATA64_CQE = 0x22, +}; + +enum mlx5_ib_latency_class { + MLX5_IB_LATENCY_CLASS_LOW, + MLX5_IB_LATENCY_CLASS_MEDIUM, + MLX5_IB_LATENCY_CLASS_HIGH, +}; + +enum mlx5_ib_mad_ifc_flags { + MLX5_MAD_IFC_IGNORE_MKEY = 1, + MLX5_MAD_IFC_IGNORE_BKEY = 2, + MLX5_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX5_CROSS_CHANNEL_BFREG = 0, +}; + +enum { + MLX5_CQE_VERSION_V0, + MLX5_CQE_VERSION_V1, +}; + +enum { + MLX5_TM_MAX_RNDV_MSG_SIZE = 64, + MLX5_TM_MAX_SGE = 1, +}; + +enum { + MLX5_IB_INVALID_UAR_INDEX = BIT(31), + MLX5_IB_INVALID_BFREG = BIT(31), +}; + +enum { + MLX5_MAX_MEMIC_PAGES = 0x100, + MLX5_MEMIC_ALLOC_SIZE_MASK = 0x3f, +}; + +enum { + MLX5_MEMIC_BASE_ALIGN = 6, + MLX5_MEMIC_BASE_SIZE = 1 << MLX5_MEMIC_BASE_ALIGN, +}; + +struct mlx5_ib_vma_private_data { + struct list_head list; + struct vm_area_struct *vma; + /* protect vma_private_list add/del */ + struct mutex *vma_private_list_mutex; +}; + +struct mlx5_ib_ucontext { + struct ib_ucontext ibucontext; + struct list_head db_page_list; + + /* protect doorbell record alloc/free + */ + struct mutex db_page_mutex; + struct mlx5_bfreg_info bfregi; + u8 cqe_version; + /* Transport Domain number */ + u32 tdn; + struct list_head vma_private_list; + /* protect vma_private_list add/del */ + struct mutex vma_private_list_mutex; + + u64 lib_caps; + DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES); +}; + +static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); +} + +struct mlx5_ib_pd { + struct ib_pd ibpd; + u32 pdn; +}; + +#define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) +#define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) +#if (MLX5_IB_FLOW_LAST_PRIO <= 0) +#error "Invalid number of bypass priorities" +#endif +#define MLX5_IB_FLOW_LEFTOVERS_PRIO (MLX5_IB_FLOW_MCAST_PRIO + 1) + +#define MLX5_IB_NUM_FLOW_FT (MLX5_IB_FLOW_LEFTOVERS_PRIO + 1) +#define MLX5_IB_NUM_SNIFFER_FTS 2 +#define MLX5_IB_NUM_EGRESS_FTS 1 +struct mlx5_ib_flow_prio { + struct mlx5_flow_table *flow_table; + unsigned int refcount; +}; + +struct mlx5_ib_flow_handler { + struct list_head list; + struct ib_flow 
ibflow; + struct mlx5_ib_flow_prio *prio; + struct mlx5_flow_handle *rule; +}; + +struct mlx5_ib_flow_db { + struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; + struct mlx5_ib_flow_prio sniffer[MLX5_IB_NUM_SNIFFER_FTS]; + struct mlx5_ib_flow_prio egress[MLX5_IB_NUM_EGRESS_FTS]; + struct mlx5_flow_table *lag_demux_ft; + /* Protect flow steering bypass flow tables + * when add/del flow rules. + * only single add/removal of flow steering rule could be done + * simultaneously. + */ + struct mutex lock; +}; + +/* Use macros here so that don't have to duplicate + * enum ib_send_flags and enum ib_qp_type for low-level driver + */ + +#define MLX5_IB_SEND_UMR_ENABLE_MR (IB_SEND_RESERVED_START << 0) +#define MLX5_IB_SEND_UMR_DISABLE_MR (IB_SEND_RESERVED_START << 1) +#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 2) +#define MLX5_IB_SEND_UMR_UPDATE_XLT (IB_SEND_RESERVED_START << 3) +#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 4) +#define MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS IB_SEND_RESERVED_END + +#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +/* + * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI + * creates the actual hardware QP. + */ +#define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 +#define MLX5_IB_QPT_DCI IB_QPT_RESERVED3 +#define MLX5_IB_QPT_DCT IB_QPT_RESERVED4 +#define MLX5_IB_WR_UMR IB_WR_RESERVED1 + +#define MLX5_IB_UMR_OCTOWORD 16 +#define MLX5_IB_UMR_XLT_ALIGNMENT 64 + +#define MLX5_IB_UPD_XLT_ZAP BIT(0) +#define MLX5_IB_UPD_XLT_ENABLE BIT(1) +#define MLX5_IB_UPD_XLT_ATOMIC BIT(2) +#define MLX5_IB_UPD_XLT_ADDR BIT(3) +#define MLX5_IB_UPD_XLT_PD BIT(4) +#define MLX5_IB_UPD_XLT_ACCESS BIT(5) +#define MLX5_IB_UPD_XLT_INDIRECT BIT(6) + +/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. + * + * These flags are intended for internal use by the mlx5_ib driver, and they + * rely on the range reserved for that use in the ib_qp_create_flags enum. 
+ */ + +/* Create a UD QP whose source QP number is 1 */ +static inline enum ib_qp_create_flags mlx5_ib_create_qp_sqpn_qp1(void) +{ + return IB_QP_CREATE_RESERVED_START; +} + +struct wr_list { + u16 opcode; + u16 next; +}; + +enum mlx5_ib_rq_flags { + MLX5_IB_RQ_CVLAN_STRIPPING = 1 << 0, + MLX5_IB_RQ_PCI_WRITE_END_PADDING = 1 << 1, +}; + +struct mlx5_ib_wq { + u64 *wrid; + u32 *wr_data; + struct wr_list *w_list; + unsigned *wqe_head; + u16 unsig_count; + + /* serialize post to the work queue + */ + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; + u16 cur_post; + u16 last_poll; + void *qend; +}; + +enum mlx5_ib_wq_flags { + MLX5_IB_WQ_FLAGS_DELAY_DROP = 0x1, + MLX5_IB_WQ_FLAGS_STRIDING_RQ = 0x2, +}; + +#define MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES 9 +#define MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES 16 +#define MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES 6 +#define MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES 13 + +struct mlx5_ib_rwq { + struct ib_wq ibwq; + struct mlx5_core_qp core_qp; + u32 rq_num_pas; + u32 log_rq_stride; + u32 log_rq_size; + u32 rq_page_offset; + u32 log_page_size; + u32 log_num_strides; + u32 two_byte_shift_en; + u32 single_stride_log_num_of_bytes; + struct ib_umem *umem; + size_t buf_size; + unsigned int page_shift; + int create_type; + struct mlx5_db db; + u32 user_index; + u32 wqe_count; + u32 wqe_shift; + int wq_sig; + u32 create_flags; /* Use enum mlx5_ib_wq_flags */ +}; + +enum { + MLX5_QP_USER, + MLX5_QP_KERNEL, + MLX5_QP_EMPTY +}; + +enum { + MLX5_WQ_USER, + MLX5_WQ_KERNEL +}; + +struct mlx5_ib_rwq_ind_table { + struct ib_rwq_ind_table ib_rwq_ind_tbl; + u32 rqtn; +}; + +struct mlx5_ib_ubuffer { + struct ib_umem *umem; + int buf_size; + u64 buf_addr; +}; + +struct mlx5_ib_qp_base { + struct mlx5_ib_qp *container_mibqp; + struct mlx5_core_qp mqp; + struct mlx5_ib_ubuffer ubuffer; +}; + +struct mlx5_ib_qp_trans { + struct mlx5_ib_qp_base base; + u16 xrcdn; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; +}; + +struct mlx5_ib_rss_qp { + u32 tirn; +}; + +struct mlx5_ib_rq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *rq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + u32 tirn; + u8 state; + u32 flags; +}; + +struct mlx5_ib_sq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *sq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + struct mlx5_flow_handle *flow_rule; + u32 tisn; + u8 state; +}; + +struct mlx5_ib_raw_packet_qp { + struct mlx5_ib_sq sq; + struct mlx5_ib_rq rq; +}; + +struct mlx5_bf { + int buf_size; + unsigned long offset; + struct mlx5_sq_bfreg *bfreg; +}; + +struct mlx5_ib_dct { + struct mlx5_core_dct mdct; + u32 *in; +}; + +struct mlx5_ib_qp { + struct ib_qp ibqp; + union { + struct mlx5_ib_qp_trans trans_qp; + struct mlx5_ib_raw_packet_qp raw_packet_qp; + struct mlx5_ib_rss_qp rss_qp; + struct mlx5_ib_dct dct; + }; + struct mlx5_frag_buf buf; + + struct mlx5_db db; + struct mlx5_ib_wq rq; + + u8 sq_signal_bits; + u8 next_fence; + struct mlx5_ib_wq sq; + + /* serialize qp state modifications + */ + struct mutex mutex; + u32 flags; + u8 port; + u8 state; + int wq_sig; + int scat_cqe; + int max_inline_data; + struct mlx5_bf bf; + int has_rq; + + /* only for user space QPs. 
For kernel + * we have it from the bf object + */ + int bfregn; + + int create_type; + + /* Store signature errors */ + bool signature_en; + + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; + struct mlx5_rate_limit rl; + u32 underlay_qpn; + bool tunnel_offload_en; + /* storage for qp sub type when core qp type is IB_QPT_DRIVER */ + enum ib_qp_type qp_sub_type; +}; + +struct mlx5_ib_cq_buf { + struct mlx5_frag_buf_ctrl fbc; + struct ib_umem *umem; + int cqe_size; + int nent; +}; + +enum mlx5_ib_qp_flags { + MLX5_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX5_IB_QP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, + MLX5_IB_QP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, + MLX5_IB_QP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, + /* QP uses 1 as its source QP number */ + MLX5_IB_QP_SQPN_QP1 = 1 << 6, + MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, + MLX5_IB_QP_RSS = 1 << 8, + MLX5_IB_QP_CVLAN_STRIPPING = 1 << 9, + MLX5_IB_QP_UNDERLAY = 1 << 10, + MLX5_IB_QP_PCI_WRITE_END_PADDING = 1 << 11, + MLX5_IB_QP_TUNNEL_OFFLOAD = 1 << 12, +}; + +struct mlx5_umr_wr { + struct ib_send_wr wr; + u64 virt_addr; + u64 offset; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int xlt_size; + u64 length; + int access_flags; + u32 mkey; +}; + +static inline struct mlx5_umr_wr *umr_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct mlx5_umr_wr, wr); +} + +struct mlx5_shared_mr_info { + int mr_id; + struct ib_umem *umem; +}; + +enum mlx5_ib_cq_pr_flags { + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD = 1 << 0, +}; + +struct mlx5_ib_cq { + struct ib_cq ibcq; + struct mlx5_core_cq mcq; + struct mlx5_ib_cq_buf buf; + struct mlx5_db db; + + /* serialize access to the CQ + */ + spinlock_t lock; + + /* protect resize cq + */ + struct mutex resize_mutex; + struct mlx5_ib_cq_buf *resize_buf; + struct ib_umem *resize_umem; + int cqe_size; + struct list_head list_send_qp; + struct list_head list_recv_qp; + u32 create_flags; + struct list_head wc_list; + enum ib_cq_notify_flags notify_flags; + struct work_struct notify_work; + u16 private_flags; /* Use mlx5_ib_cq_pr_flags */ +}; + +struct mlx5_ib_wc { + struct ib_wc wc; + struct list_head list; +}; + +struct mlx5_ib_srq { + struct ib_srq ibsrq; + struct mlx5_core_srq msrq; + struct mlx5_frag_buf buf; + struct mlx5_db db; + u64 *wrid; + /* protect SRQ hanlding + */ + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + /* serialize arming a SRQ + */ + struct mutex mutex; + int wq_sig; +}; + +struct mlx5_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; +}; + +enum mlx5_ib_mtt_access_flags { + MLX5_IB_MTT_READ = (1 << 0), + MLX5_IB_MTT_WRITE = (1 << 1), +}; + +struct mlx5_ib_dm { + struct ib_dm ibdm; + phys_addr_t dev_addr; +}; + +#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) + +#define MLX5_IB_DM_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE |\ + IB_ACCESS_REMOTE_WRITE |\ + IB_ACCESS_REMOTE_READ |\ + IB_ACCESS_REMOTE_ATOMIC |\ + IB_ZERO_BASED) + +struct mlx5_ib_mr { + struct ib_mr ibmr; + void *descs; + dma_addr_t desc_map; + int ndescs; + int max_descs; + int desc_size; + int access_mode; + struct mlx5_core_mkey mmkey; + struct ib_umem *umem; + struct mlx5_shared_mr_info *smr_info; + struct list_head list; + int order; + bool allocated_from_cache; + int npages; + struct mlx5_ib_dev *dev; + u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; + struct mlx5_core_sig_ctx *sig; + int live; + void 
*descs_alloc; + int access_flags; /* Needed for rereg MR */ + + struct mlx5_ib_mr *parent; + atomic_t num_leaf_free; + wait_queue_head_t q_leaf_free; +}; + +struct mlx5_ib_mw { + struct ib_mw ibmw; + struct mlx5_core_mkey mmkey; + int ndescs; +}; + +struct mlx5_ib_umr_context { + struct ib_cqe cqe; + enum ib_wc_status status; + struct completion done; +}; + +struct umr_common { + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + /* control access to UMR QP + */ + struct semaphore sem; +}; + +enum { + MLX5_FMR_INVALID, + MLX5_FMR_VALID, + MLX5_FMR_BUSY, +}; + +struct mlx5_cache_ent { + struct list_head head; + /* sync access to the cahce entry + */ + spinlock_t lock; + + + struct dentry *dir; + char name[4]; + u32 order; + u32 xlt; + u32 access_mode; + u32 page; + + u32 size; + u32 cur; + u32 miss; + u32 limit; + + struct dentry *fsize; + struct dentry *fcur; + struct dentry *fmiss; + struct dentry *flimit; + + struct mlx5_ib_dev *dev; + struct work_struct work; + struct delayed_work dwork; + int pending; + struct completion compl; +}; + +struct mlx5_mr_cache { + struct workqueue_struct *wq; + struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + int stopped; + struct dentry *root; + unsigned long last_add; +}; + +struct mlx5_ib_gsi_qp; + +struct mlx5_ib_port_resources { + struct mlx5_ib_resources *devr; + struct mlx5_ib_gsi_qp *gsi; + struct work_struct pkey_change_work; +}; + +struct mlx5_ib_resources { + struct ib_cq *c0; + struct ib_xrcd *x0; + struct ib_xrcd *x1; + struct ib_pd *p0; + struct ib_srq *s0; + struct ib_srq *s1; + struct mlx5_ib_port_resources ports[2]; + /* Protects changes to the port resources */ + struct mutex mutex; +}; + +struct mlx5_ib_counters { + const char **names; + size_t *offsets; + u32 num_q_counters; + u32 num_cong_counters; + u16 set_id; + bool set_id_valid; +}; + +struct mlx5_ib_multiport_info; + +struct mlx5_ib_multiport { + struct mlx5_ib_multiport_info *mpi; + /* To be held when accessing the multiport info */ + spinlock_t mpi_lock; +}; + +struct mlx5_ib_port { + struct mlx5_ib_counters cnts; + struct mlx5_ib_multiport mp; + struct mlx5_ib_dbg_cc_params *dbg_cc_params; +}; + +struct mlx5_roce { + /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL + * netdev pointer + */ + rwlock_t netdev_lock; + struct net_device *netdev; + struct notifier_block nb; + atomic_t next_port; + enum ib_port_state last_port_state; + struct mlx5_ib_dev *dev; + u8 native_port_num; +}; + +struct mlx5_ib_dbg_param { + int offset; + struct mlx5_ib_dev *dev; + struct dentry *dentry; + u8 port_num; +}; + +enum mlx5_ib_dbg_cc_types { + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE, + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI, + MLX5_IB_DBG_CC_RP_TIME_RESET, + MLX5_IB_DBG_CC_RP_BYTE_RESET, + MLX5_IB_DBG_CC_RP_THRESHOLD, + MLX5_IB_DBG_CC_RP_AI_RATE, + MLX5_IB_DBG_CC_RP_HAI_RATE, + MLX5_IB_DBG_CC_RP_MIN_DEC_FAC, + MLX5_IB_DBG_CC_RP_MIN_RATE, + MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP, + MLX5_IB_DBG_CC_RP_DCE_TCP_G, + MLX5_IB_DBG_CC_RP_DCE_TCP_RTT, + MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD, + MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE, + MLX5_IB_DBG_CC_RP_GD, + MLX5_IB_DBG_CC_NP_CNP_DSCP, + MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE, + MLX5_IB_DBG_CC_NP_CNP_PRIO, + MLX5_IB_DBG_CC_MAX, +}; + +struct mlx5_ib_dbg_cc_params { + struct dentry *root; + struct mlx5_ib_dbg_param params[MLX5_IB_DBG_CC_MAX]; +}; + +enum { + MLX5_MAX_DELAY_DROP_TIMEOUT_MS = 100, +}; + +struct mlx5_ib_dbg_delay_drop { + struct dentry *dir_debugfs; + struct dentry *rqs_cnt_debugfs; + struct dentry *events_cnt_debugfs; + 
struct dentry *timeout_debugfs; +}; + +struct mlx5_ib_delay_drop { + struct mlx5_ib_dev *dev; + struct work_struct delay_drop_work; + /* serialize setting of delay drop */ + struct mutex lock; + u32 timeout; + bool activate; + atomic_t events_cnt; + atomic_t rqs_cnt; + struct mlx5_ib_dbg_delay_drop *dbg; +}; + +enum mlx5_ib_stages { + MLX5_IB_STAGE_INIT, + MLX5_IB_STAGE_FLOW_DB, + MLX5_IB_STAGE_CAPS, + MLX5_IB_STAGE_NON_DEFAULT_CB, + MLX5_IB_STAGE_ROCE, + MLX5_IB_STAGE_DEVICE_RESOURCES, + MLX5_IB_STAGE_ODP, + MLX5_IB_STAGE_COUNTERS, + MLX5_IB_STAGE_CONG_DEBUGFS, + MLX5_IB_STAGE_UAR, + MLX5_IB_STAGE_BFREG, + MLX5_IB_STAGE_PRE_IB_REG_UMR, + MLX5_IB_STAGE_SPECS, + MLX5_IB_STAGE_IB_REG, + MLX5_IB_STAGE_POST_IB_REG_UMR, + MLX5_IB_STAGE_DELAY_DROP, + MLX5_IB_STAGE_CLASS_ATTR, + MLX5_IB_STAGE_REP_REG, + MLX5_IB_STAGE_MAX, +}; + +struct mlx5_ib_stage { + int (*init)(struct mlx5_ib_dev *dev); + void (*cleanup)(struct mlx5_ib_dev *dev); +}; + +#define STAGE_CREATE(_stage, _init, _cleanup) \ + .stage[_stage] = {.init = _init, .cleanup = _cleanup} + +struct mlx5_ib_profile { + struct mlx5_ib_stage stage[MLX5_IB_STAGE_MAX]; +}; + +struct mlx5_ib_multiport_info { + struct list_head list; + struct mlx5_ib_dev *ibdev; + struct mlx5_core_dev *mdev; + struct completion unref_comp; + u64 sys_image_guid; + u32 mdev_refcnt; + bool is_master; + bool unaffiliate; +}; + +struct mlx5_ib_flow_action { + struct ib_flow_action ib_action; + union { + struct { + u64 ib_flags; + struct mlx5_accel_esp_xfrm *ctx; + } esp_aes_gcm; + }; +}; + +struct mlx5_memic { + struct mlx5_core_dev *dev; + spinlock_t memic_lock; + DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES); +}; + +struct mlx5_ib_dev { + struct ib_device ib_dev; + struct mlx5_core_dev *mdev; + struct mlx5_roce roce[MLX5_MAX_PORTS]; + int num_ports; + /* serialize update of capability mask + */ + struct mutex cap_mask_mutex; + bool ib_active; + struct umr_common umrc; + /* sync used page count stats + */ + struct mlx5_ib_resources devr; + struct mlx5_mr_cache cache; + struct timer_list delay_timer; + /* Prevents soft lock on massive reg MRs */ + struct mutex slow_path_mutex; + int fill_delay; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct ib_odp_caps odp_caps; + u64 odp_max_size; + /* + * Sleepable RCU that prevents destruction of MRs while they are still + * being used by a page fault handler. 
+ */ + struct srcu_struct mr_srcu; + u32 null_mkey; +#endif + struct mlx5_ib_flow_db *flow_db; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; + /* Array with num_ports elements */ + struct mlx5_ib_port *port; + struct mlx5_sq_bfreg bfreg; + struct mlx5_sq_bfreg fp_bfreg; + struct mlx5_ib_delay_drop delay_drop; + const struct mlx5_ib_profile *profile; + struct mlx5_eswitch_rep *rep; + + /* protect the user_td */ + struct mutex lb_mutex; + u32 user_td; + u8 umr_fence; + struct list_head ib_dev_list; + u64 sys_image_guid; + struct mlx5_memic memic; +}; + +static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) +{ + return container_of(mcq, struct mlx5_ib_cq, mcq); +} + +static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); +} + +static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx5_ib_dev, ib_dev); +} + +static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx5_ib_cq, ibcq); +} + +static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) +{ + return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; +} + +static inline struct mlx5_ib_rwq *to_mibrwq(struct mlx5_core_qp *core_qp) +{ + return container_of(core_qp, struct mlx5_ib_rwq, core_qp); +} + +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mkey *mmkey) +{ + return container_of(mmkey, struct mlx5_ib_mr, mmkey); +} + +static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx5_ib_pd, ibpd); +} + +static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx5_ib_srq, ibsrq); +} + +static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx5_ib_qp, ibqp); +} + +static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq) +{ + return container_of(ibwq, struct mlx5_ib_rwq, ibwq); +} + +static inline struct mlx5_ib_rwq_ind_table *to_mrwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table, ib_rwq_ind_tbl); +} + +static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) +{ + return container_of(msrq, struct mlx5_ib_srq, msrq); +} + +static inline struct mlx5_ib_dm *to_mdm(struct ib_dm *ibdm) +{ + return container_of(ibdm, struct mlx5_ib_dm, ibdm); +} + +static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx5_ib_mr, ibmr); +} + +static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx5_ib_mw, ibmw); +} + +static inline struct mlx5_ib_flow_action * +to_mflow_act(struct ib_flow_action *ibact) +{ + return container_of(ibact, struct mlx5_ib_flow_action, ib_action); +} + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db); +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); +int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, + u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const void *in_mad, void 
*response_mad); +struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); +int mlx5_ib_destroy_ah(struct ib_ah *ah); +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int mlx5_ib_destroy_srq(struct ib_srq *srq); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_destroy_qp(struct ib_qp *qp); +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length, + struct mlx5_ib_qp_base *base); +struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_destroy_cq(struct ib_cq *cq); +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); +int mlx5_ib_dealloc_mw(struct ib_mw *mw); +int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags); +struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + int access_flags); +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int access_flags, + struct ib_pd *pd, struct ib_udata *udata); +int mlx5_ib_dereg_mr(struct ib_mr *ibmr); +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd); +int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); 
+int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev, + struct ib_smp *out_mad); +int mlx5_query_mad_ifc_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid); +int mlx5_query_mad_ifc_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys); +int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev, + u32 *vendor_id); +int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc); +int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid); +int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); +int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); +int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); +void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, + unsigned long max_page_shift, + int *count, int *shift, + int *ncont, int *order); +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags); +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags); +void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); +int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); + +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry); +void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status); +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_wq(struct ib_wq *wq); +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); +struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); +bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev); +struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs); +int mlx5_ib_dealloc_dm(struct ib_dm *ibdm); +struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, + struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); +void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, + struct mlx5_pagefault *pfault); +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); +int __init mlx5_ib_odp_init(void); +void mlx5_ib_odp_cleanup(void); +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end); +void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, int flags); +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) +{ + return; +} + +static 
inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } +static inline int mlx5_ib_odp_init(void) { return 0; } +static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} +static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, + int flags) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +/* Needed for rep profile */ +int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_rep_flow_db_init(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev); +void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev); +int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev); +void __mlx5_ib_remove(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile, + int stage); +void *__mlx5_ib_add(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile); + +int mlx5_ib_get_vf_config(struct ib_device *device, int vf, + u8 port, struct ifla_vf_info *info); +int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, + u8 port, int state); +int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, + u8 port, struct ifla_vf_stats *stats); +int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, + u64 guid, int type); + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, + int index); +int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, + int index, enum ib_gid_type *gid_type); + +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); + +/* GSI QP helper functions */ +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr); +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp); +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask); +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); + +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); + +void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, + int bfregn); +struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi); +struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *dev, + u8 ib_port_num, + u8 *native_port_num); +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev 
*dev, + u8 port_num); + +static inline void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static inline u8 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ; +} + +static inline int is_qp1(enum ib_qp_type qp_type) +{ + return qp_type == MLX5_IB_QPT_HW_GSI; +} + +#define MLX5_MAX_UMR_SHIFT 16 +#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) + +static inline u32 check_cq_create_flags(u32 flags) +{ + /* + * It returns non-zero value for unsupported CQ + * create flags, otherwise it returns zero. + */ + return (flags & ~(IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN | + IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)); +} + +static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, + u32 *user_index) +{ + if (cqe_version) { + if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) || + (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK)) + return -EINVAL; + *user_index = cmd_uidx; + } else { + *user_index = MLX5_IB_DEFAULT_UIDX; + } + + return 0; +} + +static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_qp *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) && + !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} + +static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_srq *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) && + !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} + +static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_support) +{ + return lib_support && MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_UARS_IN_PAGE : 1; +} + +static inline int get_num_static_uars(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages; +} + +unsigned long mlx5_ib_get_xlt_emergency_page(void); +void mlx5_ib_put_xlt_emergency_page(void); + +#endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c new file mode 100644 index 0000000..90a9c46 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -0,0 +1,2000 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" + +enum { + MAX_PENDING_REG_MR = 8, +}; + +#define MLX5_UMR_ALIGN 2048 + +static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static int mr_cache_max_order(struct mlx5_ib_dev *dev); +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static bool umr_can_modify_entity_size(struct mlx5_ib_dev *dev) +{ + return !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled); +} + +static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) +{ + return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); +} + +static bool use_umr(struct mlx5_ib_dev *dev, int order) +{ + return order <= mr_cache_max_order(dev) && + umr_can_modify_entity_size(dev); +} + +static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* Wait until all page fault handlers using the mr complete. */ + synchronize_srcu(&dev->mr_srcu); +#endif + + return err; +} + +static int order2idx(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + + if (order < cache->ent[0].order) + return 0; + else + return order - cache->ent[0].order; +} + +static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) +{ + return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >= + length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1)); +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static void update_odp_mr(struct mlx5_ib_mr *mr) +{ + if (mr->umem->odp_data) { + /* + * This barrier prevents the compiler from moving the + * setting of umem->odp_data->private to point to our + * MR, before reg_umr finished, to ensure that the MR + * initialization have finished before starting to + * handle invalidations. + */ + smp_wmb(); + mr->umem->odp_data->private = mr; + /* + * Make sure we will see the new + * umem->odp_data->private value in the invalidation + * routines, before we can get page faults on the + * MR. Page faults can happen once we put the MR in + * the tree, below this line. 
Without the barrier, + * there can be a fault handling and an invalidation + * before umem->odp_data->private == mr is visible to + * the invalidation handler. + */ + smp_wmb(); + } +} +#endif + +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + u8 key; + unsigned long flags; + struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table; + int err; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + mr->mmkey.type = MLX5_MKEY_MR; + spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); + key = dev->mdev->priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); + mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key; + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); + + write_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey); + if (err) + pr_err("Error inserting to mkey tree. 0x%x\n", -err); + write_unlock_irqrestore(&table->lock, flags); + + if (!completion_done(&ent->compl)) + complete(&ent->compl); +} + +static int add_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err = 0; + int i; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + break; + } + mr->order = ent->order; + mr->allocated_from_cache = 1; + mr->dev = dev; + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, + (ent->access_mode >> 2) & 0x7); + + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt); + MLX5_SET(mkc, mkc, log_page_size, ent->page); + + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); + err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey, + in, inlen, + mr->out, sizeof(mr->out), + reg_mr_callback, mr); + if (err) { + spin_lock_irq(&ent->lock); + ent->pending--; + spin_unlock_irq(&ent->lock); + mlx5_ib_warn(dev, "create mkey failed %d\n", err); + kfree(mr); + break; + } + } + + kfree(in); + return err; +} + +static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *tmp_mr; + struct mlx5_ib_mr *mr; + LIST_HEAD(del_list); + int i; + + for (i = 0; i < num; i++) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + break; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_move(&mr->list, &del_list); + ent->cur--; + 
ent->size--; + spin_unlock_irq(&ent->lock); + mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + } + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + synchronize_srcu(&dev->mr_srcu); +#endif + + list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { + list_del(&mr->list); + kfree(mr); + } +} + +static ssize_t size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20]; + u32 var; + int err; + int c; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + c = order2idx(dev, ent->order); + lbuf[sizeof(lbuf) - 1] = 0; + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var < ent->limit) + return -EINVAL; + + if (var > ent->size) { + do { + err = add_keys(dev, c, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); + } else if (var < ent->size) { + remove_keys(dev, c, ent->size - var); + } + + return count; +} + +static ssize_t size_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + if (*pos) + return 0; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size); + if (err < 0) + return err; + + if (copy_to_user(buf, lbuf, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static const struct file_operations size_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = size_write, + .read = size_read, +}; + +static ssize_t limit_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + struct mlx5_ib_dev *dev = ent->dev; + char lbuf[20]; + u32 var; + int err; + int c; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + c = order2idx(dev, ent->order); + lbuf[sizeof(lbuf) - 1] = 0; + + if (sscanf(lbuf, "%u", &var) != 1) + return -EINVAL; + + if (var > ent->size) + return -EINVAL; + + ent->limit = var; + + if (ent->cur < ent->limit) { + err = add_keys(dev, c, 2 * ent->limit - ent->cur); + if (err) + return err; + } + + return count; +} + +static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + char lbuf[20]; + int err; + + if (*pos) + return 0; + + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); + if (err < 0) + return err; + + if (copy_to_user(buf, lbuf, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static const struct file_operations limit_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = limit_write, + .read = limit_read, +}; + +static int someone_adding(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].cur < cache->ent[i].limit) + return 1; + } + + return 0; +} + +static void __cache_work_func(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_dev *dev = ent->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int i = order2idx(dev, ent->order); + int err; + + if (cache->stopped) + return; + + ent = &dev->cache.ent[i]; + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3)); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + 
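/* hard failure: back off for a second before retrying this cache entry */ +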
queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } else { + queue_work(cache->wq, &ent->work); + } + } + } else if (ent->cur > 2 * ent->limit) { + /* + * The remove_keys() logic is performed as garbage collection + * task. Such task is intended to be run when no other active + * processes are running. + * + * The need_resched() will return TRUE if there are user tasks + * to be activated in near future. + * + * In such case, we don't execute remove_keys() and postpone + * the garbage collection work to try to run in next cycle, + * in order to free CPU resources to other tasks. + */ + if (!need_resched() && !someone_adding(cache) && + time_after(jiffies, cache->last_add + 300 * HZ)) { + remove_keys(dev, i, 1); + if (ent->cur > ent->limit) + queue_work(cache->wq, &ent->work); + } else { + queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); + } + } +} + +static void delayed_cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, dwork.work); + __cache_work_func(ent); +} + +static void cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, work); + __cache_work_func(ent); +} + +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + struct mlx5_ib_mr *mr; + int err; + + if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_err(dev, "cache entry %d is out of range\n", entry); + return NULL; + } + + ent = &cache->ent[entry]; + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + + err = add_keys(dev, entry, 1); + if (err && err != -EAGAIN) + return ERR_PTR(err); + + wait_for_completion(&ent->compl); + } else { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + return mr; + } + } +} + +static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_ib_mr *mr = NULL; + struct mlx5_cache_ent *ent; + int last_umr_cache_entry; + int c; + int i; + + c = order2idx(dev, order); + last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev)); + if (c < 0 || c > last_umr_cache_entry) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); + return NULL; + } + + for (i = c; i <= last_umr_cache_entry; i++) { + ent = &cache->ent[i]; + + mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); + + spin_lock_irq(&ent->lock); + if (!list_empty(&ent->head)) { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + break; + } + spin_unlock_irq(&ent->lock); + + queue_work(cache->wq, &ent->work); + } + + if (!mr) + cache->ent[c].miss++; + + return mr; +} + +void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int shrink = 0; + int c; + + c = order2idx(dev, mr->order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); + return; + } + + if (unreg_umr(dev, mr)) + return; + + ent = &cache->ent[c]; + spin_lock_irq(&ent->lock); + list_add_tail(&mr->list, &ent->head); + 
ent->cur++; + if (ent->cur > 2 * ent->limit) + shrink = 1; + spin_unlock_irq(&ent->lock); + + if (shrink) + queue_work(cache->wq, &ent->work); +} + +static void clean_keys(struct mlx5_ib_dev *dev, int c) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *tmp_mr; + struct mlx5_ib_mr *mr; + LIST_HEAD(del_list); + + cancel_delayed_work(&ent->dwork); + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + break; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_move(&mr->list, &del_list); + ent->cur--; + ent->size--; + spin_unlock_irq(&ent->lock); + mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + } + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + synchronize_srcu(&dev->mr_srcu); +#endif + + list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { + list_del(&mr->list); + kfree(mr); + } +} + +static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + if (!mlx5_debugfs_root || dev->rep) + return; + + debugfs_remove_recursive(dev->cache.root); + dev->cache.root = NULL; +} + +static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int i; + + if (!mlx5_debugfs_root || dev->rep) + return 0; + + cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root); + if (!cache->root) + return -ENOMEM; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + sprintf(ent->name, "%d", ent->order); + ent->dir = debugfs_create_dir(ent->name, cache->root); + if (!ent->dir) + goto err; + + ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent, + &size_fops); + if (!ent->fsize) + goto err; + + ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent, + &limit_fops); + if (!ent->flimit) + goto err; + + ent->fcur = debugfs_create_u32("cur", 0400, ent->dir, + &ent->cur); + if (!ent->fcur) + goto err; + + ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir, + &ent->miss); + if (!ent->fmiss) + goto err; + } + + return 0; +err: + mlx5_mr_cache_debugfs_cleanup(dev); + + return -ENOMEM; +} + +static void delay_time_func(struct timer_list *t) +{ + struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); + + dev->fill_delay = 0; +} + +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int err; + int i; + + mutex_init(&dev->slow_path_mutex); + cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); + return -ENOMEM; + } + + timer_setup(&dev->delay_timer, delay_time_func, 0); + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + INIT_LIST_HEAD(&ent->head); + spin_lock_init(&ent->lock); + ent->order = i + 2; + ent->dev = dev; + ent->limit = 0; + + init_completion(&ent->compl); + INIT_WORK(&ent->work, cache_work_func); + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + queue_work(cache->wq, &ent->work); + + if (i > MR_CACHE_LAST_STD_ENTRY) { + mlx5_odp_init_mr_cache_entry(ent); + continue; + } + + if (ent->order > mr_cache_max_order(dev)) + continue; + + ent->page = PAGE_SHIFT; + ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) && + !dev->rep && + mlx5_core_is_pf(dev->mdev)) + ent->limit = dev->mdev->profile->mr_cache[i].limit; 
+ else + ent->limit = 0; + } + + err = mlx5_mr_cache_debugfs_init(dev); + if (err) + mlx5_ib_warn(dev, "cache debugfs failure\n"); + + /* + * We don't want to fail driver if debugfs failed to initialize, + * so we are not forwarding error to the user. + */ + + return 0; +} + +static void wait_for_async_commands(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int total = 0; + int i; + int j; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + for (j = 0 ; j < 1000; j++) { + if (!ent->pending) + break; + msleep(50); + } + } + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + total += ent->pending; + } + + if (total) + mlx5_ib_warn(dev, "aborted while there are %d pending mr requests\n", total); + else + mlx5_ib_warn(dev, "done with all pending requests\n"); +} + +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +{ + int i; + + if (!dev->cache.wq) + return 0; + + dev->cache.stopped = 1; + flush_workqueue(dev->cache.wq); + + mlx5_mr_cache_debugfs_cleanup(dev); + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + clean_keys(dev, i); + + destroy_workqueue(dev->cache.wq); + wait_for_async_commands(dev); + del_timer_sync(&dev->delay_timer); + + return 0; +} + +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET64(mkc, mkc, start_addr, 0); + + err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen); + if (err) + goto err_in; + + kfree(in); + mr->mmkey.type = MLX5_MKEY_MR; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +static int get_octo_len(u64 addr, u64 len, int page_shift) +{ + u64 page_size = 1ULL << page_shift; + u64 offset; + int npages; + + offset = addr & (page_size - 1); + npages = ALIGN(len + offset, page_size) >> page_shift; + return (npages + 1) / 2; +} + +static int mr_cache_max_order(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + return MR_CACHE_LAST_STD_ENTRY + 2; + return MLX5_MAX_UMR_SHIFT; +} + +static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length, + int access_flags, struct ib_umem **umem, + int *npages, int *page_shift, int *ncont, + int *order) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem *u; + int err; + + *umem = NULL; + + u = ib_umem_get(pd->uobject->context, start, length, access_flags, 0); + err = PTR_ERR_OR_ZERO(u); + if (err) { + mlx5_ib_dbg(dev, "umem get failed (%d)\n", err); + return err; + } + + mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, + 
page_shift, ncont, order); + if (!*npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + ib_umem_release(u); + return -EINVAL; + } + + *umem = u; + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + *npages, *ncont, *order, *page_shift); + + return 0; +} + +static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_umr_context *context = + container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); + + context->status = wc->status; + complete(&context->done); +} + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->cqe.done = mlx5_ib_umr_done; + context->status = -1; + init_completion(&context->done); +} + +static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, + struct mlx5_umr_wr *umrwr) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_send_wr *bad; + int err; + struct mlx5_ib_umr_context umr_context; + + mlx5_ib_init_umr_context(&umr_context); + umrwr->wr.wr_cqe = &umr_context.cqe; + + down(&umrc->sem); + err = ib_post_send(umrc->qp, &umrwr->wr, &bad); + if (err) { + mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed (%u)\n", + umr_context.status); + err = -EFAULT; + } + } + up(&umrc->sem); + return err; +} + +static struct mlx5_ib_mr *alloc_mr_from_cache( + struct ib_pd *pd, struct ib_umem *umem, + u64 virt_addr, u64 len, int npages, + int page_shift, int order, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr; + int err = 0; + int i; + + for (i = 0; i < 1; i++) { + mr = alloc_cached_mr(dev, order); + if (mr) + break; + + err = add_keys(dev, order2idx(dev, order), 1); + if (err && err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); + break; + } + } + + if (!mr) + return ERR_PTR(-EAGAIN); + + mr->ibmr.pd = pd; + mr->umem = umem; + mr->access_flags = access_flags; + mr->desc_size = sizeof(struct mlx5_mtt); + mr->mmkey.iova = virt_addr; + mr->mmkey.size = len; + mr->mmkey.pd = to_mpd(pd)->pdn; + + return mr; +} + +static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages, + void *xlt, int page_shift, size_t size, + int flags) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct ib_umem *umem = mr->umem; + + if (flags & MLX5_IB_UPD_XLT_INDIRECT) { + if (!umr_can_use_indirect_mkey(dev)) + return -EPERM; + mlx5_odp_populate_klm(xlt, idx, npages, mr, flags); + return npages; + } + + npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx); + + if (!(flags & MLX5_IB_UPD_XLT_ZAP)) { + __mlx5_ib_populate_pas(dev, umem, page_shift, + idx, npages, xlt, + MLX5_IB_MTT_PRESENT); + /* Clear padding after the pages + * brought from the umem. + */ + memset(xlt + (npages * sizeof(struct mlx5_mtt)), 0, + size - npages * sizeof(struct mlx5_mtt)); + } + + return npages; +} + +#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \ + MLX5_UMR_MTT_ALIGNMENT) +#define MLX5_SPARE_UMR_CHUNK 0x10000 + +int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct device *ddev = dev->ib_dev.dev.parent; + int size; + void *xlt; + dma_addr_t dma; + struct mlx5_umr_wr wr; + struct ib_sge sg; + int err = 0; + int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) + ? 
sizeof(struct mlx5_klm) + : sizeof(struct mlx5_mtt); + const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; + const int page_mask = page_align - 1; + size_t pages_mapped = 0; + size_t pages_to_map = 0; + size_t pages_iter = 0; + gfp_t gfp; + bool use_emergency_page = false; + + if ((flags & MLX5_IB_UPD_XLT_INDIRECT) && + !umr_can_use_indirect_mkey(dev)) + return -EPERM; + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, + * so we need to align the offset and length accordingly + */ + if (idx & page_mask) { + npages += idx & page_mask; + idx &= ~page_mask; + } + + gfp = flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : GFP_KERNEL; + gfp |= __GFP_ZERO | __GFP_NOWARN; + + pages_to_map = ALIGN(npages, page_align); + size = desc_size * pages_to_map; + size = min_t(int, size, MLX5_MAX_UMR_CHUNK); + + xlt = (void *)__get_free_pages(gfp, get_order(size)); + if (!xlt && size > MLX5_SPARE_UMR_CHUNK) { + mlx5_ib_dbg(dev, "Failed to allocate %d bytes of order %d. fallback to spare UMR allocation od %d bytes\n", + size, get_order(size), MLX5_SPARE_UMR_CHUNK); + + size = MLX5_SPARE_UMR_CHUNK; + xlt = (void *)__get_free_pages(gfp, get_order(size)); + } + + if (!xlt) { + mlx5_ib_warn(dev, "Using XLT emergency buffer\n"); + xlt = (void *)mlx5_ib_get_xlt_emergency_page(); + size = PAGE_SIZE; + memset(xlt, 0, size); + use_emergency_page = true; + } + pages_iter = size / desc_size; + dma = dma_map_single(ddev, xlt, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); + err = -ENOMEM; + goto free_xlt; + } + + sg.addr = dma; + sg.lkey = dev->umrc.pd->local_dma_lkey; + + memset(&wr, 0, sizeof(wr)); + wr.wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT; + if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) + wr.wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE; + wr.wr.sg_list = &sg; + wr.wr.num_sge = 1; + wr.wr.opcode = MLX5_IB_WR_UMR; + + wr.pd = mr->ibmr.pd; + wr.mkey = mr->mmkey.key; + wr.length = mr->mmkey.size; + wr.virt_addr = mr->mmkey.iova; + wr.access_flags = mr->access_flags; + wr.page_shift = page_shift; + + for (pages_mapped = 0; + pages_mapped < pages_to_map && !err; + pages_mapped += pages_iter, idx += pages_iter) { + npages = min_t(int, pages_iter, pages_to_map - pages_mapped); + dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); + npages = populate_xlt(mr, idx, npages, xlt, + page_shift, size, flags); + + dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); + + sg.length = ALIGN(npages * desc_size, + MLX5_UMR_MTT_ALIGNMENT); + + if (pages_mapped + pages_iter >= pages_to_map) { + if (flags & MLX5_IB_UPD_XLT_ENABLE) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_ENABLE_MR | + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS | + MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + if (flags & MLX5_IB_UPD_XLT_PD || + flags & MLX5_IB_UPD_XLT_ACCESS) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; + if (flags & MLX5_IB_UPD_XLT_ADDR) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + } + + wr.offset = idx * desc_size; + wr.xlt_size = sg.length; + + err = mlx5_ib_post_send_wait(dev, &wr); + } + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + +free_xlt: + if (use_emergency_page) + mlx5_ib_put_xlt_emergency_page(); + else + free_pages((unsigned long)xlt, get_order(size)); + + return err; +} + +/* + * If ibmr is NULL it will be allocated by reg_create. + * Else, the given ibmr will be used. 
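+ * (In this file, mlx5_ib_reg_user_mr() passes a NULL ibmr on the non-UMR path, + * while mlx5_ib_rereg_user_mr() passes the existing ibmr so the same ib_mr + * object is reused with a freshly created mkey.)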
+ */ +static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, + u64 virt_addr, u64 length, + struct ib_umem *umem, int npages, + int page_shift, int access_flags, + bool populate) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr; + __be64 *pas; + void *mkc; + int inlen; + u32 *in; + int err; + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + + mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->ibmr.pd = pd; + mr->access_flags = access_flags; + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + if (populate) + inlen += sizeof(*pas) * roundup(npages, 2); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_1; + } + pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + if (populate && !(access_flags & IB_ACCESS_ON_DEMAND)) + mlx5_ib_populate_pas(dev, umem, page_shift, pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); + + /* The pg_access bit allows setting the access flags + * in the page list submitted with the command. */ + MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, free, !populate); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + + MLX5_SET64(mkc, mkc, start_addr, virt_addr); + MLX5_SET64(mkc, mkc, len, length); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, bsf_octword_size, 0); + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(virt_addr, length, page_shift)); + MLX5_SET(mkc, mkc, log_page_size, page_shift); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + if (populate) { + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, + get_octo_len(virt_addr, length, page_shift)); + } + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); + if (err) { + mlx5_ib_warn(dev, "create mkey failed\n"); + goto err_2; + } + mr->mmkey.type = MLX5_MKEY_MR; + mr->desc_size = sizeof(struct mlx5_mtt); + mr->dev = dev; + kvfree(in); + + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); + + return mr; + +err_2: + kvfree(in); + +err_1: + if (!ibmr) + kfree(mr); + + return ERR_PTR(err); +} + +static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, + int npages, u64 length, int access_flags) +{ + mr->npages = npages; + atomic_add(npages, &dev->mdev->priv.reg_pages); + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->ibmr.length = length; + mr->access_flags = access_flags; +} + +static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr, + u64 length, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MEMIC & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, + (MLX5_MKC_ACCESS_MODE_MEMIC >> 2) & 0x7); + 
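/* the 5-bit MEMIC access mode is split across the two access_mode fields above */ +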
MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET64(mkc, mkc, len, length); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET64(mkc, mkc, start_addr, + memic_addr - pci_resource_start(dev->mdev->pdev, 0)); + + err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen); + if (err) + goto err_in; + + kfree(in); + + mr->umem = NULL; + set_mr_fileds(dev, mr, 0, length, acc); + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, + struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dm *mdm = to_mdm(dm); + u64 memic_addr; + + if (attr->access_flags & ~MLX5_IB_DM_ALLOWED_ACCESS) + return ERR_PTR(-EINVAL); + + memic_addr = mdm->dev_addr + attr->offset; + + return mlx5_ib_get_memic_mr(pd, memic_addr, attr->length, + attr->access_flags); +} + +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + bool populate_mtts = false; + struct ib_umem *umem; + int page_shift; + int npages; + int ncont; + int order; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + return ERR_PTR(-EOPNOTSUPP); + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (!start && length == U64_MAX) { + if (!(access_flags & IB_ACCESS_ON_DEMAND) || + !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return ERR_PTR(-EINVAL); + + mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); + if (IS_ERR(mr)) + return ERR_CAST(mr); + return &mr->ibmr; + } +#endif + + err = mr_umem_get(pd, start, length, access_flags, &umem, &npages, + &page_shift, &ncont, &order); + + if (err < 0) + return ERR_PTR(err); + + if (use_umr(dev, order)) { + mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont, + page_shift, order, access_flags); + if (PTR_ERR(mr) == -EAGAIN) { + mlx5_ib_dbg(dev, "cache empty for order %d\n", order); + mr = NULL; + } + populate_mtts = false; + } else if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) { + if (access_flags & IB_ACCESS_ON_DEMAND) { + err = -EINVAL; + pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n"); + goto error; + } + populate_mtts = true; + } + + if (!mr) { + if (!umr_can_modify_entity_size(dev)) + populate_mtts = true; + mutex_lock(&dev->slow_path_mutex); + mr = reg_create(NULL, pd, virt_addr, length, umem, ncont, + page_shift, access_flags, populate_mtts); + mutex_unlock(&dev->slow_path_mutex); + } + + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto error; + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); + + mr->umem = umem; + set_mr_fileds(dev, mr, npages, length, access_flags); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + update_odp_mr(mr); +#endif + + if (!populate_mtts) { + int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE; + + if (access_flags & IB_ACCESS_ON_DEMAND) + update_xlt_flags |= MLX5_IB_UPD_XLT_ZAP; + + err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift, + update_xlt_flags); + + if (err) { + dereg_mr(dev, mr); + return 
ERR_PTR(err); + } + } + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + mr->live = 1; +#endif + return &mr->ibmr; +error: + ib_umem_release(umem); + return ERR_PTR(err); +} + +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_umr_wr umrwr = {}; + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + return 0; + + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | + MLX5_IB_SEND_UMR_FAIL_IF_FREE; + umrwr.wr.opcode = MLX5_IB_WR_UMR; + umrwr.mkey = mr->mmkey.key; + + return mlx5_ib_post_send_wait(dev, &umrwr); +} + +static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int access_flags, int flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_umr_wr umrwr = {}; + int err; + + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE; + + umrwr.wr.opcode = MLX5_IB_WR_UMR; + umrwr.mkey = mr->mmkey.key; + + if (flags & IB_MR_REREG_PD || flags & IB_MR_REREG_ACCESS) { + umrwr.pd = pd; + umrwr.access_flags = access_flags; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; + } + + err = mlx5_ib_post_send_wait(dev, &umrwr); + + return err; +} + +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int new_access_flags, + struct ib_pd *new_pd, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); + struct mlx5_ib_mr *mr = to_mmr(ib_mr); + struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd; + int access_flags = flags & IB_MR_REREG_ACCESS ? + new_access_flags : + mr->access_flags; + int page_shift = 0; + int upd_flags = 0; + int npages = 0; + int ncont = 0; + int order = 0; + u64 addr, len; + int err; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); + + atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); + + if (!mr->umem) + return -EINVAL; + + if (flags & IB_MR_REREG_TRANS) { + addr = virt_addr; + len = length; + } else { + addr = mr->umem->address; + len = mr->umem->length; + } + + if (flags != IB_MR_REREG_PD) { + /* + * Replace umem. This needs to be done whether or not UMR is + * used. + */ + flags |= IB_MR_REREG_TRANS; + ib_umem_release(mr->umem); + mr->umem = NULL; + err = mr_umem_get(pd, addr, len, access_flags, &mr->umem, + &npages, &page_shift, &ncont, &order); + if (err) + goto err; + } + + if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) { + /* + * UMR can't be used - MKey needs to be replaced. 
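+ * (The new mapping no longer fits the translation size the mkey was created + * with, i.e. use_umr_mtt_update() failed, so the old mkey is revoked below via + * unreg_umr() or destroy_mkey() and reg_create() builds a new one on the same + * ib_mr.)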
+ */ + if (mr->allocated_from_cache) + err = unreg_umr(dev, mr); + else + err = destroy_mkey(dev, mr); + if (err) + goto err; + + mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont, + page_shift, access_flags, true); + + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + mr = to_mmr(ib_mr); + goto err; + } + + mr->allocated_from_cache = 0; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + mr->live = 1; +#endif + } else { + /* + * Send a UMR WQE + */ + mr->ibmr.pd = pd; + mr->access_flags = access_flags; + mr->mmkey.iova = addr; + mr->mmkey.size = len; + mr->mmkey.pd = to_mpd(pd)->pdn; + + if (flags & IB_MR_REREG_TRANS) { + upd_flags = MLX5_IB_UPD_XLT_ADDR; + if (flags & IB_MR_REREG_PD) + upd_flags |= MLX5_IB_UPD_XLT_PD; + if (flags & IB_MR_REREG_ACCESS) + upd_flags |= MLX5_IB_UPD_XLT_ACCESS; + err = mlx5_ib_update_xlt(mr, 0, npages, page_shift, + upd_flags); + } else { + err = rereg_umr(pd, mr, access_flags, flags); + } + + if (err) + goto err; + } + + set_mr_fileds(dev, mr, npages, len, access_flags); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + update_odp_mr(mr); +#endif + return 0; + +err: + if (mr->umem) { + ib_umem_release(mr->umem); + mr->umem = NULL; + } + clean_mr(dev, mr); + return err; +} + +static int +mlx5_alloc_priv_descs(struct ib_device *device, + struct mlx5_ib_mr *mr, + int ndescs, + int desc_size) +{ + int size = ndescs * desc_size; + int add_size; + int ret; + + add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); + + mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); + if (!mr->descs_alloc) + return -ENOMEM; + + mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); + + mr->desc_map = dma_map_single(device->dev.parent, mr->descs, + size, DMA_TO_DEVICE); + if (dma_mapping_error(device->dev.parent, mr->desc_map)) { + ret = -ENOMEM; + goto err; + } + + return 0; +err: + kfree(mr->descs_alloc); + + return ret; +} + +static void +mlx5_free_priv_descs(struct mlx5_ib_mr *mr) +{ + if (mr->descs) { + struct ib_device *device = mr->ibmr.device; + int size = mr->max_descs * mr->desc_size; + + dma_unmap_single(device->dev.parent, mr->desc_map, + size, DMA_TO_DEVICE); + kfree(mr->descs_alloc); + mr->descs = NULL; + } +} + +static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + int allocated_from_cache = mr->allocated_from_cache; + + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + kfree(mr->sig); + mr->sig = NULL; + } + + mlx5_free_priv_descs(mr); + + if (!allocated_from_cache) + destroy_mkey(dev, mr); +} + +static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + int npages = mr->npages; + struct ib_umem *umem = mr->umem; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem && umem->odp_data) { + /* Prevent new page faults from succeeding */ + mr->live = 0; + /* Wait for all running page-fault handlers to finish. */ + synchronize_srcu(&dev->mr_srcu); + /* Destroy all page mappings */ + if (umem->odp_data->page_list) + mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + ib_umem_end(umem)); + else + mlx5_ib_free_implicit_mr(mr); + /* + * We kill the umem before the MR for ODP, + * so that there will not be any invalidations in + * flight, looking at the *mr struct. 
+ */ + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + + /* Avoid double-freeing the umem. */ + umem = NULL; + } +#endif + + clean_mr(dev, mr); + + if (umem) { + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + } + + if (!mr->allocated_from_cache) + kfree(mr); + else + mlx5_mr_cache_free(dev, mr); +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr)); + return 0; +} + +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + int ndescs = ALIGN(max_num_sg, 4); + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + + if (mr_type == IB_MR_TYPE_MEM_REG) { + mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); + err = mlx5_alloc_priv_descs(pd->device, mr, + ndescs, sizeof(struct mlx5_mtt)); + if (err) + goto err_free_in; + + mr->desc_size = sizeof(struct mlx5_mtt); + mr->max_descs = ndescs; + } else if (mr_type == IB_MR_TYPE_SG_GAPS) { + mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; + + err = mlx5_alloc_priv_descs(pd->device, mr, + ndescs, sizeof(struct mlx5_klm)); + if (err) + goto err_free_in; + mr->desc_size = sizeof(struct mlx5_klm); + mr->max_descs = ndescs; + } else if (mr_type == IB_MR_TYPE_SIGNATURE) { + u32 psv_index[2]; + + MLX5_SET(mkc, mkc, bsf_en, 1); + MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) { + err = -ENOMEM; + goto err_free_in; + } + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, + 2, psv_index); + if (err) + goto err_free_sig; + + mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + } else { + mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); + err = -EINVAL; + goto err_free_in; + } + + MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, umr_en, 1); + + mr->ibmr.device = pd->device; + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); + if (err) + goto err_destroy_psv; + + mr->mmkey.type = MLX5_MKEY_MR; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->umem = NULL; + kfree(in); + + return &mr->ibmr; + +err_destroy_psv: + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + } + mlx5_free_priv_descs(mr); +err_free_sig: + kfree(mr->sig); +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, 
enum ib_mw_type type, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_ib_mw *mw = NULL; + u32 *in = NULL; + void *mkc; + int ndescs; + int err; + struct mlx5_ib_alloc_mw req = {}; + struct { + __u32 comp_mask; + __u32 response_length; + } resp = {}; + + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); + if (err) + return ERR_PTR(err); + + if (req.comp_mask || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(req) && + !ib_is_udata_cleared(udata, sizeof(req), + udata->inlen - sizeof(req))) + return ERR_PTR(-EOPNOTSUPP); + + ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); + + mw = kzalloc(sizeof(*mw), GFP_KERNEL); + in = kzalloc(inlen, GFP_KERNEL); + if (!mw || !in) { + err = -ENOMEM; + goto free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); + MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2))); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen); + if (err) + goto free; + + mw->mmkey.type = MLX5_MKEY_MW; + mw->ibmw.rkey = mw->mmkey.key; + mw->ndescs = ndescs; + + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); + if (resp.response_length) { + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey); + goto free; + } + } + + kfree(in); + return &mw->ibmw; + +free: + kfree(mw); + kfree(in); + return ERR_PTR(err); +} + +int mlx5_ib_dealloc_mw(struct ib_mw *mw) +{ + struct mlx5_ib_mw *mmw = to_mmw(mw); + int err; + + err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev, + &mmw->mmkey); + if (!err) + kfree(mmw); + return err; +} + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + int ret = 0; + + if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { + pr_err("Invalid status check mask\n"); + ret = -EINVAL; + goto done; + } + + mr_status->fail_status = 0; + if (check_mask & IB_MR_CHECK_SIG_STATUS) { + if (!mmr->sig) { + ret = -EINVAL; + pr_err("signature status check requested on a non-signature enabled MR\n"); + goto done; + } + + mmr->sig->sig_status_checked = true; + if (!mmr->sig->sig_err_exists) + goto done; + + if (ibmr->lkey == mmr->sig->err_item.key) + memcpy(&mr_status->sig_err, &mmr->sig->err_item, + sizeof(mr_status->sig_err)); + else { + mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; + mr_status->sig_err.sig_err_offset = 0; + mr_status->sig_err.key = mmr->sig->err_item.key; + } + + mmr->sig->sig_err_exists = false; + mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; + } + +done: + return ret; +} + +static int +mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, + struct scatterlist *sgl, + unsigned short sg_nents, + unsigned int *sg_offset_p) +{ + struct scatterlist *sg = sgl; + struct mlx5_klm *klms = mr->descs; + unsigned int sg_offset = sg_offset_p ? 
*sg_offset_p : 0; + u32 lkey = mr->ibmr.pd->local_dma_lkey; + int i; + + mr->ibmr.iova = sg_dma_address(sg) + sg_offset; + mr->ibmr.length = 0; + + for_each_sg(sgl, sg, sg_nents, i) { + if (unlikely(i >= mr->max_descs)) + break; + klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); + klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); + klms[i].key = cpu_to_be32(lkey); + mr->ibmr.length += sg_dma_len(sg) - sg_offset; + + sg_offset = 0; + } + mr->ndescs = i; + + if (sg_offset_p) + *sg_offset_p = sg_offset; + + return i; +} + +static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + __be64 *descs; + + if (unlikely(mr->ndescs == mr->max_descs)) + return -ENOMEM; + + descs = mr->descs; + descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); + + return 0; +} + +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int n; + + mr->ndescs = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map, + mr->desc_size * mr->max_descs, + DMA_TO_DEVICE); + + if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset); + else + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, + mlx5_set_page); + + ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, + mr->desc_size * mr->max_descs, + DMA_TO_DEVICE); + + return n; +} diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c new file mode 100644 index 0000000..f1a87a6 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -0,0 +1,1228 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "mlx5_ib.h" +#include "cmd.h" + +#define MAX_PREFETCH_LEN (4*1024*1024U) + +/* Timeout in ms to wait for an active mmu notifier to complete when handling + * a pagefault. 
*/ +#define MMU_NOTIFIER_TIMEOUT 1000 + +#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) +#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) +#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) +#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) +#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) + +#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT + +static u64 mlx5_imr_ksm_entries; + +static int check_parent(struct ib_umem_odp *odp, + struct mlx5_ib_mr *parent) +{ + struct mlx5_ib_mr *mr = odp->private; + + return mr && mr->parent == parent && !odp->dying; +} + +static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) +{ + struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; + struct ib_ucontext *ctx = odp->umem->context; + struct rb_node *rb; + + down_read(&ctx->umem_rwsem); + while (1) { + rb = rb_next(&odp->interval_tree.rb); + if (!rb) + goto not_found; + odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); + if (check_parent(odp, parent)) + goto end; + } +not_found: + odp = NULL; +end: + up_read(&ctx->umem_rwsem); + return odp; +} + +static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, + u64 start, u64 length, + struct mlx5_ib_mr *parent) +{ + struct ib_umem_odp *odp; + struct rb_node *rb; + + down_read(&ctx->umem_rwsem); + odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length); + if (!odp) + goto end; + + while (1) { + if (check_parent(odp, parent)) + goto end; + rb = rb_next(&odp->interval_tree.rb); + if (!rb) + goto not_found; + odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); + if (ib_umem_start(odp->umem) > start + length) + goto not_found; + } +not_found: + odp = NULL; +end: + up_read(&ctx->umem_rwsem); + return odp; +} + +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, int flags) +{ + struct ib_pd *pd = mr->ibmr.pd; + struct ib_ucontext *ctx = pd->uobject->context; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem_odp *odp; + unsigned long va; + int i; + + if (flags & MLX5_IB_UPD_XLT_ZAP) { + for (i = 0; i < nentries; i++, pklm++) { + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + pklm->key = cpu_to_be32(dev->null_mkey); + pklm->va = 0; + } + return; + } + + odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE, + nentries * MLX5_IMR_MTT_SIZE, mr); + + for (i = 0; i < nentries; i++, pklm++) { + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + va = (offset + i) * MLX5_IMR_MTT_SIZE; + if (odp && odp->umem->address == va) { + struct mlx5_ib_mr *mtt = odp->private; + + pklm->key = cpu_to_be32(mtt->ibmr.lkey); + odp = odp_next(odp); + } else { + pklm->key = cpu_to_be32(dev->null_mkey); + } + mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", + i, va, be32_to_cpu(pklm->key)); + } +} + +static void mr_leaf_free_action(struct work_struct *work) +{ + struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); + int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT; + struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; + + mr->parent = NULL; + synchronize_srcu(&mr->dev->mr_srcu); + + ib_umem_release(odp->umem); + if (imr->live) + mlx5_ib_update_xlt(imr, idx, 1, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + mlx5_mr_cache_free(mr->dev, mr); + + if (atomic_dec_and_test(&imr->num_leaf_free)) + wake_up(&imr->q_leaf_free); +} + +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end) +{ + struct mlx5_ib_mr *mr; + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / + sizeof(struct mlx5_mtt)) - 1; + u64 idx = 0, 
blk_start_idx = 0; + int in_block = 0; + u64 addr; + + if (!umem || !umem->odp_data) { + pr_err("invalidation called on NULL umem or non-ODP umem\n"); + return; + } + + mr = umem->odp_data->private; + + if (!mr || !mr->ibmr.pd) + return; + + start = max_t(u64, ib_umem_start(umem), start); + end = min_t(u64, ib_umem_end(umem), end); + + /* + * Iteration one - zap the HW's MTTs. The notifiers_count ensures that + * while we are doing the invalidation, no page fault will attempt to + * overwrite the same MTTs. Concurent invalidations might race us, + * but they will write 0s as well, so no difference in the end result. + */ + + for (addr = start; addr < end; addr += BIT(umem->page_shift)) { + idx = (addr - ib_umem_start(umem)) >> umem->page_shift; + /* + * Strive to write the MTTs in chunks, but avoid overwriting + * non-existing MTTs. The huristic here can be improved to + * estimate the cost of another UMR vs. the cost of bigger + * UMR. + */ + if (umem->odp_data->dma_list[idx] & + (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (!in_block) { + blk_start_idx = idx; + in_block = 1; + } + } else { + u64 umr_offset = idx & umr_block_mask; + + if (in_block && umr_offset == 0) { + mlx5_ib_update_xlt(mr, blk_start_idx, + idx - blk_start_idx, 0, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); + in_block = 0; + } + } + } + if (in_block) + mlx5_ib_update_xlt(mr, blk_start_idx, + idx - blk_start_idx + 1, 0, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); + /* + * We are now sure that the device will not access the + * memory. We can safely unmap it, and mark it as dirty if + * needed. + */ + + ib_umem_odp_unmap_dma_pages(umem, start, end); + + if (unlikely(!umem->npages && mr->parent && + !umem->odp_data->dying)) { + WRITE_ONCE(umem->odp_data->dying, 1); + atomic_inc(&mr->parent->num_leaf_free); + schedule_work(&umem->odp_data->work); + } +} + +void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) +{ + struct ib_odp_caps *caps = &dev->odp_caps; + + memset(caps, 0, sizeof(*caps)); + + if (!MLX5_CAP_GEN(dev->mdev, pg)) + return; + + caps->general_caps = IB_ODP_SUPPORT; + + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + dev->odp_max_size = U64_MAX; + else + dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); + + if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) + caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; + + if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && + MLX5_CAP_GEN(dev->mdev, null_mkey) && + MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; + + return; +} + +static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault, + int error) +{ + int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? 
+ pfault->wqe.wq_num : pfault->token; + int ret = mlx5_core_page_fault_resume(dev->mdev, + pfault->token, + wq_num, + pfault->type, + error); + if (ret) + mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", + wq_num); +} + +static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, + struct ib_umem *umem, + bool ksm, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr; + int err; + + mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY : + MLX5_IMR_MTT_CACHE_ENTRY); + + if (IS_ERR(mr)) + return mr; + + mr->ibmr.pd = pd; + + mr->dev = dev; + mr->access_flags = access_flags; + mr->mmkey.iova = 0; + mr->umem = umem; + + if (ksm) { + err = mlx5_ib_update_xlt(mr, 0, + mlx5_imr_ksm_entries, + MLX5_KSM_PAGE_SHIFT, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + + } else { + err = mlx5_ib_update_xlt(mr, 0, + MLX5_IMR_MTT_ENTRIES, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE | + MLX5_IB_UPD_XLT_ATOMIC); + } + + if (err) + goto fail; + + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + + mr->live = 1; + + mlx5_ib_dbg(dev, "key %x dev %p mr %p\n", + mr->mmkey.key, dev->mdev, mr); + + return mr; + +fail: + mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); + mlx5_mr_cache_free(dev, mr); + + return ERR_PTR(err); +} + +static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, + u64 io_virt, size_t bcnt) +{ + struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); + struct ib_umem_odp *odp, *result = NULL; + u64 addr = io_virt & MLX5_IMR_MTT_MASK; + int nentries = 0, start_idx = 0, ret; + struct mlx5_ib_mr *mtt; + struct ib_umem *umem; + + mutex_lock(&mr->umem->odp_data->umem_mutex); + odp = odp_lookup(ctx, addr, 1, mr); + + mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", + io_virt, bcnt, addr, odp); + +next_mr: + if (likely(odp)) { + if (nentries) + nentries++; + } else { + umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); + if (IS_ERR(umem)) { + mutex_unlock(&mr->umem->odp_data->umem_mutex); + return ERR_CAST(umem); + } + + mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags); + if (IS_ERR(mtt)) { + mutex_unlock(&mr->umem->odp_data->umem_mutex); + ib_umem_release(umem); + return ERR_CAST(mtt); + } + + odp = umem->odp_data; + odp->private = mtt; + mtt->umem = umem; + mtt->mmkey.iova = addr; + mtt->parent = mr; + INIT_WORK(&odp->work, mr_leaf_free_action); + + if (!nentries) + start_idx = addr >> MLX5_IMR_MTT_SHIFT; + nentries++; + } + + /* Return first odp if region not covered by single one */ + if (likely(!result)) + result = odp; + + addr += MLX5_IMR_MTT_SIZE; + if (unlikely(addr < io_virt + bcnt)) { + odp = odp_next(odp); + if (odp && odp->umem->address != addr) + odp = NULL; + goto next_mr; + } + + if (unlikely(nentries)) { + ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + if (ret) { + mlx5_ib_err(dev, "Failed to update PAS\n"); + result = ERR_PTR(ret); + } + } + + mutex_unlock(&mr->umem->odp_data->umem_mutex); + return result; +} + +struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + int access_flags) +{ + struct ib_ucontext *ctx = pd->ibpd.uobject->context; + struct mlx5_ib_mr *imr; + struct ib_umem *umem; + + umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0); + if (IS_ERR(umem)) + return ERR_CAST(umem); + + imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); + if 
(IS_ERR(imr)) { + ib_umem_release(umem); + return ERR_CAST(imr); + } + + imr->umem = umem; + init_waitqueue_head(&imr->q_leaf_free); + atomic_set(&imr->num_leaf_free, 0); + + return imr; +} + +static int mr_leaf_free(struct ib_umem *umem, u64 start, + u64 end, void *cookie) +{ + struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie; + + if (mr->parent != imr) + return 0; + + ib_umem_odp_unmap_dma_pages(umem, + ib_umem_start(umem), + ib_umem_end(umem)); + + if (umem->odp_data->dying) + return 0; + + WRITE_ONCE(umem->odp_data->dying, 1); + atomic_inc(&imr->num_leaf_free); + schedule_work(&umem->odp_data->work); + + return 0; +} + +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) +{ + struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context; + + down_read(&ctx->umem_rwsem); + rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, + mr_leaf_free, imr); + up_read(&ctx->umem_rwsem); + + wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); +} + +static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, + u64 io_virt, size_t bcnt, u32 *bytes_mapped) +{ + u64 access_mask = ODP_READ_ALLOWED_BIT; + int npages = 0, page_shift, np; + u64 start_idx, page_mask; + struct ib_umem_odp *odp; + int current_seq; + size_t size; + int ret; + + if (!mr->umem->odp_data->page_list) { + odp = implicit_mr_get_data(mr, io_virt, bcnt); + + if (IS_ERR(odp)) + return PTR_ERR(odp); + mr = odp->private; + + } else { + odp = mr->umem->odp_data; + } + +next_mr: + size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt); + + page_shift = mr->umem->page_shift; + page_mask = ~(BIT(page_shift) - 1); + start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; + + if (mr->umem->writable) + access_mask |= ODP_WRITE_ALLOWED_BIT; + + current_seq = READ_ONCE(odp->notifiers_seq); + /* + * Ensure the sequence number is valid for some time before we call + * gup. + */ + smp_rmb(); + + ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, + access_mask, current_seq); + + if (ret < 0) + goto out; + + np = ret; + + mutex_lock(&odp->umem_mutex); + if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { + /* + * No need to check whether the MTTs really belong to + * this MR, since ib_umem_odp_map_dma_pages already + * checks this. + */ + ret = mlx5_ib_update_xlt(mr, start_idx, np, + page_shift, MLX5_IB_UPD_XLT_ATOMIC); + } else { + ret = -EAGAIN; + } + mutex_unlock(&odp->umem_mutex); + + if (ret < 0) { + if (ret != -EAGAIN) + mlx5_ib_err(dev, "Failed to update mkey page tables\n"); + goto out; + } + + if (bytes_mapped) { + u32 new_mappings = (np << page_shift) - + (io_virt - round_down(io_virt, 1 << page_shift)); + *bytes_mapped += min_t(u32, new_mappings, size); + } + + npages += np << (page_shift - PAGE_SHIFT); + bcnt -= size; + + if (unlikely(bcnt)) { + struct ib_umem_odp *next; + + io_virt += size; + next = odp_next(odp); + if (unlikely(!next || next->umem->address != io_virt)) { + mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", + io_virt, next); + return -EAGAIN; + } + odp = next; + mr = odp->private; + goto next_mr; + } + + return npages; + +out: + if (ret == -EAGAIN) { + if (mr->parent || !odp->dying) { + unsigned long timeout = + msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); + + if (!wait_for_completion_timeout( + &odp->notifier_completion, + timeout)) { + mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n", + current_seq, odp->notifiers_seq); + } + } else { + /* The MR is being killed, kill the QP as well. 
*/ + ret = -EFAULT; + } + } + + return ret; +} + +struct pf_frame { + struct pf_frame *next; + u32 key; + u64 io_virt; + size_t bcnt; + int depth; +}; + +/* + * Handle a single data segment in a page-fault WQE or RDMA region. + * + * Returns number of OS pages retrieved on success. The caller may continue to + * the next data segment. + * Can return the following error codes: + * -EAGAIN to designate a temporary error. The caller will abort handling the + * page fault and resolve it. + * -EFAULT when there's an error mapping the requested pages. The caller will + * abort the page fault handling. + */ +static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, + u32 key, u64 io_virt, size_t bcnt, + u32 *bytes_committed, + u32 *bytes_mapped) +{ + int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0; + struct pf_frame *head = NULL, *frame; + struct mlx5_core_mkey *mmkey; + struct mlx5_ib_mw *mw; + struct mlx5_ib_mr *mr; + struct mlx5_klm *pklm; + u32 *out = NULL; + size_t offset; + + srcu_key = srcu_read_lock(&dev->mr_srcu); + + io_virt += *bytes_committed; + bcnt -= *bytes_committed; + +next_mr: + mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key)); + if (!mmkey || mmkey->key != key) { + mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); + ret = -EFAULT; + goto srcu_unlock; + } + + switch (mmkey->type) { + case MLX5_MKEY_MR: + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + if (!mr->live || !mr->ibmr.pd) { + mlx5_ib_dbg(dev, "got dead MR\n"); + ret = -EFAULT; + goto srcu_unlock; + } + + ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped); + if (ret < 0) + goto srcu_unlock; + + npages += ret; + ret = 0; + break; + + case MLX5_MKEY_MW: + mw = container_of(mmkey, struct mlx5_ib_mw, mmkey); + + if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) { + mlx5_ib_dbg(dev, "indirection level exceeded\n"); + ret = -EFAULT; + goto srcu_unlock; + } + + outlen = MLX5_ST_SZ_BYTES(query_mkey_out) + + sizeof(*pklm) * (mw->ndescs - 2); + + if (outlen > cur_outlen) { + kfree(out); + out = kzalloc(outlen, GFP_KERNEL); + if (!out) { + ret = -ENOMEM; + goto srcu_unlock; + } + cur_outlen = outlen; + } + + pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out, + bsf0_klm0_pas_mtt0_1); + + ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen); + if (ret) + goto srcu_unlock; + + offset = io_virt - MLX5_GET64(query_mkey_out, out, + memory_key_mkey_entry.start_addr); + + for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) { + if (offset >= be32_to_cpu(pklm->bcount)) { + offset -= be32_to_cpu(pklm->bcount); + continue; + } + + frame = kzalloc(sizeof(*frame), GFP_KERNEL); + if (!frame) { + ret = -ENOMEM; + goto srcu_unlock; + } + + frame->key = be32_to_cpu(pklm->key); + frame->io_virt = be64_to_cpu(pklm->va) + offset; + frame->bcnt = min_t(size_t, bcnt, + be32_to_cpu(pklm->bcount) - offset); + frame->depth = depth + 1; + frame->next = head; + head = frame; + + bcnt -= frame->bcnt; + } + break; + + default: + mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type); + ret = -EFAULT; + goto srcu_unlock; + } + + if (head) { + frame = head; + head = frame->next; + + key = frame->key; + io_virt = frame->io_virt; + bcnt = frame->bcnt; + depth = frame->depth; + kfree(frame); + + goto next_mr; + } + +srcu_unlock: + while (head) { + frame = head; + head = frame->next; + kfree(frame); + } + kfree(out); + + srcu_read_unlock(&dev->mr_srcu, srcu_key); + *bytes_committed = 0; + return ret ? ret : npages; +} + +/** + * Parse a series of data segments for page fault handling. 
+ * + * @qp the QP on which the fault occurred. + * @pfault contains page fault information. + * @wqe points at the first data segment in the WQE. + * @wqe_end points after the end of the WQE. + * @bytes_mapped receives the number of bytes that the function was able to + * map. This allows the caller to decide intelligently whether + * enough memory was mapped to resolve the page fault + * successfully (e.g. enough for the next MTU, or the entire + * WQE). + * @total_wqe_bytes receives the total data size of this WQE in bytes (minus + * the committed bytes). + * + * Returns the number of pages loaded if positive, zero for an empty WQE, or a + * negative error code. + */ +static int pagefault_data_segments(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void *wqe, + void *wqe_end, u32 *bytes_mapped, + u32 *total_wqe_bytes, int receive_queue) +{ + int ret = 0, npages = 0; + u64 io_virt; + u32 key; + u32 byte_count; + size_t bcnt; + int inline_segment; + + /* Skip SRQ next-WQE segment. */ + if (receive_queue && qp->ibqp.srq) + wqe += sizeof(struct mlx5_wqe_srq_next_seg); + + if (bytes_mapped) + *bytes_mapped = 0; + if (total_wqe_bytes) + *total_wqe_bytes = 0; + + while (wqe < wqe_end) { + struct mlx5_wqe_data_seg *dseg = wqe; + + io_virt = be64_to_cpu(dseg->addr); + key = be32_to_cpu(dseg->lkey); + byte_count = be32_to_cpu(dseg->byte_count); + inline_segment = !!(byte_count & MLX5_INLINE_SEG); + bcnt = byte_count & ~MLX5_INLINE_SEG; + + if (inline_segment) { + bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; + wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, + 16); + } else { + wqe += sizeof(*dseg); + } + + /* receive WQE end of sg list. */ + if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && + io_virt == 0) + break; + + if (!inline_segment && total_wqe_bytes) { + *total_wqe_bytes += bcnt - min_t(size_t, bcnt, + pfault->bytes_committed); + } + + /* A zero length data segment designates a length of 2GB. */ + if (bcnt == 0) + bcnt = 1U << 31; + + if (inline_segment || bcnt <= pfault->bytes_committed) { + pfault->bytes_committed -= + min_t(size_t, bcnt, + pfault->bytes_committed); + continue; + } + + ret = pagefault_single_data_segment(dev, key, io_virt, bcnt, + &pfault->bytes_committed, + bytes_mapped); + if (ret < 0) + break; + npages += ret; + } + + return ret < 0 ? ret : npages; +} + +static const u32 mlx5_ib_odp_opcode_cap[] = { + [MLX5_OPCODE_SEND] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_SEND_IMM] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_SEND_INVAL] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_RDMA_WRITE] = IB_ODP_SUPPORT_WRITE, + [MLX5_OPCODE_RDMA_WRITE_IMM] = IB_ODP_SUPPORT_WRITE, + [MLX5_OPCODE_RDMA_READ] = IB_ODP_SUPPORT_READ, + [MLX5_OPCODE_ATOMIC_CS] = IB_ODP_SUPPORT_ATOMIC, + [MLX5_OPCODE_ATOMIC_FA] = IB_ODP_SUPPORT_ATOMIC, +}; + +/* + * Parse initiator WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. 
+ */ +static int mlx5_ib_mr_initiator_pfault_handler( + struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_wqe_ctrl_seg *ctrl = *wqe; + u16 wqe_index = pfault->wqe.wqe_index; + u32 transport_caps; + struct mlx5_base_av *av; + unsigned ds, opcode; +#if defined(DEBUG) + u32 ctrl_wqe_index, ctrl_qpn; +#endif + u32 qpn = qp->trans_qp.base.mqp.qpn; + + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + if (ds * MLX5_WQE_DS_UNITS > wqe_length) { + mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", + ds, wqe_length); + return -EFAULT; + } + + if (ds == 0) { + mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", + wqe_index, qpn); + return -EFAULT; + } + +#if defined(DEBUG) + ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_WQE_INDEX_MASK) >> + MLX5_WQE_CTRL_WQE_INDEX_SHIFT; + if (wqe_index != ctrl_wqe_index) { + mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", + wqe_index, qpn, + ctrl_wqe_index); + return -EFAULT; + } + + ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> + MLX5_WQE_CTRL_QPN_SHIFT; + if (qpn != ctrl_qpn) { + mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", + wqe_index, qpn, + ctrl_qpn); + return -EFAULT; + } +#endif /* DEBUG */ + + *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; + *wqe += sizeof(*ctrl); + + opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_OPCODE_MASK; + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps; + break; + case IB_QPT_UD: + transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps; + break; + default: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n", + qp->ibqp.qp_type); + return -EFAULT; + } + + if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) || + !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) { + mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n", + opcode); + return -EFAULT; + } + + if (qp->ibqp.qp_type != IB_QPT_RC) { + av = *wqe; + if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) + *wqe += sizeof(struct mlx5_av); + else + *wqe += sizeof(struct mlx5_base_av); + } + + switch (opcode) { + case MLX5_OPCODE_RDMA_WRITE: + case MLX5_OPCODE_RDMA_WRITE_IMM: + case MLX5_OPCODE_RDMA_READ: + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + case MLX5_OPCODE_ATOMIC_CS: + case MLX5_OPCODE_ATOMIC_FA: + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + *wqe += sizeof(struct mlx5_wqe_atomic_seg); + break; + } + + return 0; +} + +/* + * Parse responder WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. 
+ */ +static int mlx5_ib_mr_responder_pfault_handler( + struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_ib_wq *wq = &qp->rq; + int wqe_size = 1 << wq->wqe_shift; + + if (qp->ibqp.srq) { + mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); + return -EFAULT; + } + + if (qp->wq_sig) { + mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); + return -EFAULT; + } + + if (wqe_size > wqe_length) { + mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); + return -EFAULT; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_RECV)) + goto invalid_transport_or_opcode; + break; + default: +invalid_transport_or_opcode: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n", + qp->ibqp.qp_type); + return -EFAULT; + } + + *wqe_end = *wqe + wqe_size; + + return 0; +} + +static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev, + u32 wq_num) +{ + struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num); + + if (!mqp) { + mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num); + return NULL; + } + + return to_mibqp(mqp); +} + +static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) +{ + int ret; + void *wqe, *wqe_end; + u32 bytes_mapped, total_wqe_bytes; + char *buffer = NULL; + int resume_with_error = 1; + u16 wqe_index = pfault->wqe.wqe_index; + int requestor = pfault->type & MLX5_PFAULT_REQUESTOR; + struct mlx5_ib_qp *qp; + + buffer = (char *)__get_free_page(GFP_KERNEL); + if (!buffer) { + mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); + goto resolve_page_fault; + } + + qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num); + if (!qp) + goto resolve_page_fault; + + ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, + PAGE_SIZE, &qp->trans_qp.base); + if (ret < 0) { + mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n", + ret, wqe_index, pfault->token); + goto resolve_page_fault; + } + + wqe = buffer; + if (requestor) + ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe, + &wqe_end, ret); + else + ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe, + &wqe_end, ret); + if (ret < 0) + goto resolve_page_fault; + + if (wqe >= wqe_end) { + mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); + goto resolve_page_fault; + } + + ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end, + &bytes_mapped, &total_wqe_bytes, + !requestor); + if (ret == -EAGAIN) { + resume_with_error = 0; + goto resolve_page_fault; + } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { + goto resolve_page_fault; + } + + resume_with_error = 0; +resolve_page_fault: + mlx5_ib_page_fault_resume(dev, pfault, resume_with_error); + mlx5_ib_dbg(dev, "PAGE FAULT completed. 
QP 0x%x resume_with_error=%d, type: 0x%x\n", + pfault->wqe.wq_num, resume_with_error, + pfault->type); + free_page((unsigned long)buffer); +} + +static int pages_in_range(u64 address, u32 length) +{ + return (ALIGN(address + length, PAGE_SIZE) - + (address & PAGE_MASK)) >> PAGE_SHIFT; +} + +static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) +{ + u64 address; + u32 length; + u32 prefetch_len = pfault->bytes_committed; + int prefetch_activated = 0; + u32 rkey = pfault->rdma.r_key; + int ret; + + /* The RDMA responder handler handles the page fault in two parts. + * First it brings the necessary pages for the current packet + * (and uses the pfault context), and then (after resuming the QP) + * prefetches more pages. The second operation cannot use the pfault + * context and therefore uses the dummy_pfault context allocated on + * the stack */ + pfault->rdma.rdma_va += pfault->bytes_committed; + pfault->rdma.rdma_op_len -= min(pfault->bytes_committed, + pfault->rdma.rdma_op_len); + pfault->bytes_committed = 0; + + address = pfault->rdma.rdma_va; + length = pfault->rdma.rdma_op_len; + + /* For some operations, the hardware cannot tell the exact message + * length, and in those cases it reports zero. Use prefetch + * logic. */ + if (length == 0) { + prefetch_activated = 1; + length = pfault->rdma.packet_size; + prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); + } + + ret = pagefault_single_data_segment(dev, rkey, address, length, + &pfault->bytes_committed, NULL); + if (ret == -EAGAIN) { + /* We're racing with an invalidation, don't prefetch */ + prefetch_activated = 0; + } else if (ret < 0 || pages_in_range(address, length) > ret) { + mlx5_ib_page_fault_resume(dev, pfault, 1); + if (ret != -ENOENT) + mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", + ret, pfault->token, pfault->type); + return; + } + + mlx5_ib_page_fault_resume(dev, pfault, 0); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", + pfault->token, pfault->type, + prefetch_activated); + + /* At this point, there might be a new pagefault already arriving in + * the eq, switch to the dummy pagefault for the rest of the + * processing. We're still OK with the objects being alive as the + * work-queue is being fenced. */ + + if (prefetch_activated) { + u32 bytes_committed = 0; + + ret = pagefault_single_data_segment(dev, rkey, address, + prefetch_len, + &bytes_committed, NULL); + if (ret < 0 && ret != -EAGAIN) { + mlx5_ib_dbg(dev, "Prefetch failed. 
ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", + ret, pfault->token, address, prefetch_len); + } + } +} + +void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, + struct mlx5_pagefault *pfault) +{ + struct mlx5_ib_dev *dev = context; + u8 event_subtype = pfault->event_subtype; + + switch (event_subtype) { + case MLX5_PFAULT_SUBTYPE_WQE: + mlx5_ib_mr_wqe_pfault_handler(dev, pfault); + break; + case MLX5_PFAULT_SUBTYPE_RDMA: + mlx5_ib_mr_rdma_pfault_handler(dev, pfault); + break; + default: + mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", + event_subtype); + mlx5_ib_page_fault_resume(dev, pfault, 1); + } +} + +void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) +{ + if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return; + + switch (ent->order - 2) { + case MLX5_IMR_MTT_CACHE_ENTRY: + ent->page = PAGE_SHIFT; + ent->xlt = MLX5_IMR_MTT_ENTRIES * + sizeof(struct mlx5_mtt) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + ent->limit = 0; + break; + + case MLX5_IMR_KSM_CACHE_ENTRY: + ent->page = MLX5_KSM_PAGE_SHIFT; + ent->xlt = mlx5_imr_ksm_entries * + sizeof(struct mlx5_klm) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; + ent->limit = 0; + break; + } +} + +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) +{ + int ret; + + if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { + ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); + if (ret) { + mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret); + return ret; + } + } + + return 0; +} + +int mlx5_ib_odp_init(void) +{ + mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - + MLX5_IMR_MTT_BITS); + + return 0; +} + diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c new file mode 100644 index 0000000..2193dc1 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -0,0 +1,5670 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "ib_rep.h" + +/* not supported currently */ +static int wq_signature; + +enum { + MLX5_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX5_IB_LINK_TYPE_IB = 0, + MLX5_IB_LINK_TYPE_ETH = 1 +}; + +enum { + MLX5_IB_SQ_STRIDE = 6, +}; + +static const u32 mlx5_ib_opcode[] = { + [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_LSO] = MLX5_OPCODE_LSO, + [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, + [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, + [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, + [IB_WR_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, + [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, +}; + +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; +}; + +enum raw_qp_set_mask_map { + MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID = 1UL << 0, + MLX5_RAW_QP_RATE_LIMIT = 1UL << 1, +}; + +struct mlx5_modify_raw_qp_param { + u16 operation; + + u32 set_mask; /* raw_qp_set_mask_map */ + + struct mlx5_rate_limit rl; + + u8 rq_q_ctr_id; +}; + +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq); + +static int is_qp0(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_SMI; +} + +static int is_sqp(enum ib_qp_type qp_type) +{ + return is_qp0(qp_type) || is_qp1(qp_type); +} + +static void *get_wqe(struct mlx5_ib_qp *qp, int offset) +{ + return mlx5_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); +} + +/** + * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space. + * + * @qp: QP to copy from. + * @send: copy from the send queue when non-zero, use the receive queue + * otherwise. + * @wqe_index: index to start copying from. For send work queues, the + * wqe_index is in units of MLX5_SEND_WQE_BB. + * For receive work queue, it is the number of work queue + * element in the queue. + * @buffer: destination buffer. + * @length: maximum number of bytes to copy. + * + * Copies at least a single WQE, but may copy more data. + * + * Return: the number of bytes copied, or an error code. + */ +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length, + struct mlx5_ib_qp_base *base) +{ + struct ib_device *ibdev = qp->ibqp.device; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; + size_t offset; + size_t wq_end; + struct ib_umem *umem = base->ubuffer.umem; + u32 first_copy_length; + int wqe_length; + int ret; + + if (wq->wqe_cnt == 0) { + mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. 
qp_type: 0x%x\n", + qp->ibqp.qp_type); + return -EINVAL; + } + + offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift); + wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift); + + if (send && length < sizeof(struct mlx5_wqe_ctrl_seg)) + return -EINVAL; + + if (offset > umem->length || + (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length)) + return -EINVAL; + + first_copy_length = min_t(u32, offset + length, wq_end) - offset; + ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length); + if (ret) + return ret; + + if (send) { + struct mlx5_wqe_ctrl_seg *ctrl = buffer; + int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + + wqe_length = ds * MLX5_WQE_DS_UNITS; + } else { + wqe_length = 1 << wq->wqe_shift; + } + + if (wqe_length <= first_copy_length) + return first_copy_length; + + ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset, + wqe_length - first_copy_length); + if (ret) + return ret; + + return wqe_length; +} + +static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) +{ + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct ib_event event; + + if (type == MLX5_EVENT_TYPE_PATH_MIG) { + /* This event is only valid for trans_qps */ + to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port; + } + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX5_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX5_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, + int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) +{ + int wqe_size; + int wq_size; + + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) + return -EINVAL; + + if (!has_rq) { + qp->rq.max_gs = 0; + qp->rq.wqe_cnt = 0; + qp->rq.wqe_shift = 0; + cap->max_recv_wr = 0; + cap->max_recv_sge = 0; + } else { + if (ucmd) { + qp->rq.wqe_cnt = ucmd->rq_wqe_count; + if (ucmd->rq_wqe_shift > BITS_PER_BYTE * sizeof(ucmd->rq_wqe_shift)) + return -EINVAL; + qp->rq.wqe_shift = ucmd->rq_wqe_shift; + if ((1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) < qp->wq_sig) + return -EINVAL; + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } else { + wqe_size = qp->wq_sig ? 
sizeof(struct mlx5_wqe_signature_seg) : 0; + wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); + wqe_size = roundup_pow_of_two(wqe_size); + wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; + wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); + qp->rq.wqe_cnt = wq_size / wqe_size; + if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)) { + mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", + wqe_size, + MLX5_CAP_GEN(dev->mdev, + max_wqe_sz_rq)); + return -EINVAL; + } + qp->rq.wqe_shift = ilog2(wqe_size); + qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } + } + + return 0; +} + +static int sq_overhead(struct ib_qp_init_attr *attr) +{ + int size = 0; + + switch (attr->qp_type) { + case IB_QPT_XRC_INI: + size += sizeof(struct mlx5_wqe_xrc_seg); + /* fall through */ + case IB_QPT_RC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + max(sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg), + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg)); + break; + + case IB_QPT_XRC_TGT: + return 0; + + case IB_QPT_UC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + max(sizeof(struct mlx5_wqe_raddr_seg), + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg)); + break; + + case IB_QPT_UD: + if (attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + size += sizeof(struct mlx5_wqe_eth_pad) + + sizeof(struct mlx5_wqe_eth_seg); + /* fall through */ + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_datagram_seg); + break; + + case MLX5_IB_QPT_REG_UMR: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + default: + return -EINVAL; + } + + return size; +} + +static int calc_send_wqe(struct ib_qp_init_attr *attr) +{ + int inl_size = 0; + int size; + + size = sq_overhead(attr); + if (size < 0) + return size; + + if (attr->cap.max_inline_data) { + inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + + attr->cap.max_inline_data; + } + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) + return MLX5_SIG_WQE_SIZE; + else + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); +} + +static int get_send_sge(struct ib_qp_init_attr *attr, int wqe_size) +{ + int max_sge; + + if (attr->qp_type == IB_QPT_RC) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else if (attr->qp_type == IB_QPT_XRC_INI) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_xrc_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else + max_sge = (wqe_size - sq_overhead(attr)) / + sizeof(struct mlx5_wqe_data_seg); + + return min_t(int, max_sge, wqe_size - sq_overhead(attr) / + sizeof(struct mlx5_wqe_data_seg)); +} + +static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, + struct mlx5_ib_qp *qp) +{ + int wqe_size; + int wq_size; + + if (!attr->cap.max_send_wr) + return 0; + + wqe_size = calc_send_wqe(attr); + mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size); + if (wqe_size < 0) + return wqe_size; + + if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { + mlx5_ib_dbg(dev, "wqe_size(%d) > 
max_sq_desc_sz(%d)\n", + wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + return -EINVAL; + } + + qp->max_inline_data = wqe_size - sq_overhead(attr) - + sizeof(struct mlx5_wqe_inline_seg); + attr->cap.max_inline_data = qp->max_inline_data; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + qp->signature_en = true; + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); + qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { + mlx5_ib_dbg(dev, "send queue size (%d * %d / %d -> %d) exceeds limits(%d)\n", + attr->cap.max_send_wr, wqe_size, MLX5_SEND_WQE_BB, + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + return -ENOMEM; + } + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.max_gs = get_send_sge(attr, wqe_size); + if (qp->sq.max_gs < attr->cap.max_send_sge) + return -ENOMEM; + + attr->cap.max_send_sge = qp->sq.max_gs; + qp->sq.max_post = wq_size / wqe_size; + attr->cap.max_send_wr = qp->sq.max_post; + + return wq_size; +} + +static int set_user_buf_size(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_create_qp *ucmd, + struct mlx5_ib_qp_base *base, + struct ib_qp_init_attr *attr) +{ + int desc_sz = 1 << qp->sq.wqe_shift; + + if (desc_sz > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { + mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", + desc_sz, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + return -EINVAL; + } + + if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) { + mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n", + ucmd->sq_wqe_count, ucmd->sq_wqe_count); + return -EINVAL; + } + + qp->sq.wqe_cnt = ucmd->sq_wqe_count; + + if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { + mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + return -EINVAL; + } + + if (attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { + base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6; + } else { + base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + } + + return 0; +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq || + attr->qp_type == MLX5_IB_QPT_REG_UMR || + !attr->cap.max_recv_wr) + return 0; + + return 1; +} + +enum { + /* this is the first blue flame register in the array of bfregs assigned + * to a processes. Since we do not use it for blue flame but rather + * regular 64 bit doorbells, we do not need a lock for maintaiing + * "odd/even" order + */ + NUM_NON_BLUE_FLAME_BFREGS = 1, +}; + +static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) +{ + return get_num_static_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR; +} + +static int num_med_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int n; + + n = max_bfregs(dev, bfregi) - bfregi->num_low_latency_bfregs - + NUM_NON_BLUE_FLAME_BFREGS; + + return n >= 0 ? n : 0; +} + +static int first_med_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + return num_med_bfreg(dev, bfregi) ? 
1 : -ENOMEM; +} + +static int first_hi_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int med; + + med = num_med_bfreg(dev, bfregi); + return ++med; +} + +static int alloc_high_class_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int i; + + for (i = first_hi_bfreg(dev, bfregi); i < max_bfregs(dev, bfregi); i++) { + if (!bfregi->count[i]) { + bfregi->count[i]++; + return i; + } + } + + return -ENOMEM; +} + +static int alloc_med_class_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int minidx = first_med_bfreg(dev, bfregi); + int i; + + if (minidx < 0) + return minidx; + + for (i = minidx; i < first_hi_bfreg(dev, bfregi); i++) { + if (bfregi->count[i] < bfregi->count[minidx]) + minidx = i; + if (!bfregi->count[minidx]) + break; + } + + bfregi->count[minidx]++; + return minidx; +} + +static int alloc_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, + enum mlx5_ib_latency_class lat) +{ + int bfregn = -EINVAL; + + mutex_lock(&bfregi->lock); + switch (lat) { + case MLX5_IB_LATENCY_CLASS_LOW: + BUILD_BUG_ON(NUM_NON_BLUE_FLAME_BFREGS != 1); + bfregn = 0; + bfregi->count[bfregn]++; + break; + + case MLX5_IB_LATENCY_CLASS_MEDIUM: + if (bfregi->ver < 2) + bfregn = -ENOMEM; + else + bfregn = alloc_med_class_bfreg(dev, bfregi); + break; + + case MLX5_IB_LATENCY_CLASS_HIGH: + if (bfregi->ver < 2) + bfregn = -ENOMEM; + else + bfregn = alloc_high_class_bfreg(dev, bfregi); + break; + } + mutex_unlock(&bfregi->lock); + + return bfregn; +} + +void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn) +{ + mutex_lock(&bfregi->lock); + bfregi->count[bfregn]--; + mutex_unlock(&bfregi->lock); +} + +static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX5_QP_STATE_RST; + case IB_QPS_INIT: return MLX5_QP_STATE_INIT; + case IB_QPS_RTR: return MLX5_QP_STATE_RTR; + case IB_QPS_RTS: return MLX5_QP_STATE_RTS; + case IB_QPS_SQD: return MLX5_QP_STATE_SQD; + case IB_QPS_SQE: return MLX5_QP_STATE_SQER; + case IB_QPS_ERR: return MLX5_QP_STATE_ERR; + default: return -1; + } +} + +static int to_mlx5_st(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: return MLX5_QP_ST_RC; + case IB_QPT_UC: return MLX5_QP_ST_UC; + case IB_QPT_UD: return MLX5_QP_ST_UD; + case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; + case IB_QPT_SMI: return MLX5_QP_ST_QP0; + case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; + case MLX5_IB_QPT_DCI: return MLX5_QP_ST_DCI; + case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; + case IB_QPT_RAW_PACKET: + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; + case IB_QPT_MAX: + default: return -EINVAL; + } +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); + +static int bfregn_to_uar_index(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, int bfregn, + bool dyn_bfreg) +{ + int bfregs_per_sys_page; + int index_of_sys_page; + int offset; + + bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * + MLX5_NON_FP_BFREGS_PER_UAR; + index_of_sys_page = bfregn / bfregs_per_sys_page; + + if (dyn_bfreg) { + index_of_sys_page += bfregi->num_static_sys_pages; + if (bfregn > bfregi->num_dyn_bfregs || + bfregi->sys_pages[index_of_sys_page] == MLX5_IB_INVALID_UAR_INDEX) { + mlx5_ib_dbg(dev, "Invalid dynamic 
uar index\n"); + return -EINVAL; + } + } + + offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR; + return bfregi->sys_pages[index_of_sys_page] + offset; +} + +static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, + struct ib_pd *pd, + unsigned long addr, size_t size, + struct ib_umem **umem, + int *npages, int *page_shift, int *ncont, + u32 *offset) +{ + int err; + + *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0); + if (IS_ERR(*umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + return PTR_ERR(*umem); + } + + mlx5_ib_cont_pages(*umem, addr, 0, npages, page_shift, ncont, NULL); + + err = mlx5_ib_get_buf_offset(addr, *page_shift, offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %d, page_shift %d, ncont %d, offset %d\n", + addr, size, *npages, *page_shift, *ncont, *offset); + + return 0; + +err_umem: + ib_umem_release(*umem); + *umem = NULL; + + return err; +} + +static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_ucontext *context; + + if (rwq->create_flags & MLX5_IB_WQ_FLAGS_DELAY_DROP) + atomic_dec(&dev->delay_drop.rqs_cnt); + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &rwq->db); + if (rwq->umem) + ib_umem_release(rwq->umem); +} + +static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_rwq *rwq, + struct mlx5_ib_create_wq *ucmd) +{ + struct mlx5_ib_ucontext *context; + int page_shift = 0; + int npages; + u32 offset = 0; + int ncont = 0; + int err; + + if (!ucmd->buf_addr) + return -EINVAL; + + context = to_mucontext(pd->uobject->context); + rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, + rwq->buf_size, 0, 0); + if (IS_ERR(rwq->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(rwq->umem); + return err; + } + + mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, 0, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, + &rwq->rq_page_offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + rwq->rq_num_pas = ncont; + rwq->page_shift = page_shift; + rwq->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE); + + mlx5_ib_dbg(dev, "addr 0x%llx, size %zd, npages %d, page_shift %d, ncont %d, offset %d\n", + (unsigned long long)ucmd->buf_addr, rwq->buf_size, + npages, page_shift, ncont, offset); + + err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_umem; + } + + rwq->create_type = MLX5_WQ_USER; + return 0; + +err_umem: + ib_umem_release(rwq->umem); + return err; +} + +static int adjust_bfregn(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, int bfregn) +{ + return bfregn / MLX5_NON_FP_BFREGS_PER_UAR * MLX5_BFREGS_PER_UAR + + bfregn % MLX5_NON_FP_BFREGS_PER_UAR; +} + +static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct ib_qp_init_attr *attr, + u32 **in, + struct mlx5_ib_create_qp_resp *resp, int *inlen, + struct mlx5_ib_qp_base *base) +{ + struct mlx5_ib_ucontext *context; + struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; + int page_shift = 0; + int uar_index = 0; + int npages; + u32 offset = 0; + int bfregn; + int ncont = 0; + __be64 *pas; + void *qpc; + int err; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); 
+ if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return err; + } + + context = to_mucontext(pd->uobject->context); + if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) { + uar_index = bfregn_to_uar_index(dev, &context->bfregi, + ucmd.bfreg_index, true); + if (uar_index < 0) + return uar_index; + + bfregn = MLX5_IB_INVALID_BFREG; + } else if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) { + /* + * TBD: should come from the verbs when we have the API + */ + /* In CROSS_CHANNEL CQ and QP must use the same UAR */ + bfregn = MLX5_CROSS_CHANNEL_BFREG; + } + else { + bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_HIGH); + if (bfregn < 0) { + mlx5_ib_dbg(dev, "failed to allocate low latency BFREG\n"); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (bfregn < 0) { + mlx5_ib_dbg(dev, "failed to allocate medium latency BFREG\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_LOW); + if (bfregn < 0) { + mlx5_ib_warn(dev, "bfreg allocation failed\n"); + return bfregn; + } + } + } + } + + mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index); + if (bfregn != MLX5_IB_INVALID_BFREG) + uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn, + false); + + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + + err = set_user_buf_size(dev, qp, &ucmd, base, attr); + if (err) + goto err_bfreg; + + if (ucmd.buf_addr && ubuffer->buf_size) { + ubuffer->buf_addr = ucmd.buf_addr; + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, + ubuffer->buf_size, + &ubuffer->umem, &npages, &page_shift, + &ncont, &offset); + if (err) + goto err_bfreg; + } else { + ubuffer->umem = NULL; + } + + *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * ncont; + *in = kvzalloc(*inlen, GFP_KERNEL); + if (!*in) { + err = -ENOMEM; + goto err_umem; + } + + pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas); + if (ubuffer->umem) + mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0); + + qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); + + MLX5_SET(qpc, qpc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, page_offset, offset); + + MLX5_SET(qpc, qpc, uar_page, uar_index); + if (bfregn != MLX5_IB_INVALID_BFREG) + resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn); + else + resp->bfreg_index = MLX5_IB_INVALID_BFREG; + qp->bfregn = bfregn; + + err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_free; + } + + err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp))); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + goto err_unmap; + } + qp->create_type = MLX5_QP_USER; + + return 0; + +err_unmap: + mlx5_ib_db_unmap_user(context, &qp->db); + +err_free: + kvfree(*in); + +err_umem: + if (ubuffer->umem) + ib_umem_release(ubuffer->umem); + +err_bfreg: + if (bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, bfregn); + return err; +} + +static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &qp->db); + if (base->ubuffer.umem) + ib_umem_release(base->ubuffer.umem); + + /* + * Free only the BFREGs which are handled by the 
kernel. + * BFREGs of UARs allocated dynamically are handled by user. + */ + if (qp->bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn); +} + +static int create_kernel_qp(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_qp *qp, + u32 **in, int *inlen, + struct mlx5_ib_qp_base *base) +{ + int uar_index; + void *qpc; + int err; + + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_IPOIB_UD_LSO | + IB_QP_CREATE_NETIF_QP | + mlx5_ib_create_qp_sqpn_qp1())) + return -EINVAL; + + if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) + qp->bf.bfreg = &dev->fp_bfreg; + else + qp->bf.bfreg = &dev->bfreg; + + /* We need to divide by two since each register is comprised of + * two buffers of identical size, namely odd and even + */ + qp->bf.buf_size = (1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size)) / 2; + uar_index = qp->bf.bfreg->index; + + err = calc_sq_size(dev, init_attr, qp); + if (err < 0) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); + *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * qp->buf.npages; + *in = kvzalloc(*inlen, GFP_KERNEL); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + + qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); + MLX5_SET(qpc, qpc, uar_page, uar_index); + MLX5_SET(qpc, qpc, log_page_size, qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + + /* Set "fast registration enabled" for all kernel QPs */ + MLX5_SET(qpc, qpc, fre, 1); + MLX5_SET(qpc, qpc, rlky, 1); + + if (init_attr->create_flags & mlx5_ib_create_qp_sqpn_qp1()) { + MLX5_SET(qpc, qpc, deth_sqpn, 1); + qp->flags |= MLX5_IB_QP_SQPN_QP1; + } + + mlx5_fill_page_array(&qp->buf, + (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas)); + + err = mlx5_db_alloc(dev->mdev, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_free; + } + + qp->sq.wrid = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wrid), GFP_KERNEL); + qp->sq.wr_data = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wr_data), GFP_KERNEL); + qp->rq.wrid = kvmalloc_array(qp->rq.wqe_cnt, + sizeof(*qp->rq.wrid), GFP_KERNEL); + qp->sq.w_list = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.w_list), GFP_KERNEL); + qp->sq.wqe_head = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wqe_head), GFP_KERNEL); + + if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid || + !qp->sq.w_list || !qp->sq.wqe_head) { + err = -ENOMEM; + goto err_wrid; + } + qp->create_type = MLX5_QP_KERNEL; + + return 0; + +err_wrid: + kvfree(qp->sq.wqe_head); + kvfree(qp->sq.w_list); + kvfree(qp->sq.wrid); + kvfree(qp->sq.wr_data); + kvfree(qp->rq.wrid); + mlx5_db_free(dev->mdev, &qp->db); + +err_free: + kvfree(*in); + +err_buf: + mlx5_buf_free(dev->mdev, &qp->buf); + return err; +} + +static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + kvfree(qp->sq.wqe_head); + kvfree(qp->sq.w_list); + kvfree(qp->sq.wrid); + kvfree(qp->sq.wr_data); + kvfree(qp->rq.wrid); + mlx5_db_free(dev->mdev, &qp->db); + mlx5_buf_free(dev->mdev, &qp->buf); +} + +static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) +{ + if (attr->srq || (attr->qp_type 
== IB_QPT_XRC_TGT) || + (attr->qp_type == MLX5_IB_QPT_DCI) || + (attr->qp_type == IB_QPT_XRC_INI)) + return MLX5_SRQ_RQ; + else if (!qp->has_rq) + return MLX5_ZERO_LEN_RQ; + else + return MLX5_NON_ZERO_RQ; +} + +static int is_connected(enum ib_qp_type qp_type) +{ + if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) + return 1; + + return 0; +} + +static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_sq *sq, u32 tdn) +{ + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + MLX5_SET(tisc, tisc, transport_domain, tdn); + if (qp->flags & MLX5_IB_QP_UNDERLAY) + MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); + + return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); +} + +static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + mlx5_core_destroy_tis(dev->mdev, sq->tisn); +} + +static void destroy_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + if (sq->flow_rule) + mlx5_del_flow_rules(sq->flow_rule); +} + +static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, void *qpin, + struct ib_pd *pd) +{ + struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer; + __be64 *pas; + void *in; + void *sqc; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + void *wq; + int inlen; + int err; + int page_shift = 0; + int npages; + int ncont = 0; + u32 offset = 0; + + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size, + &sq->ubuffer.umem, &npages, &page_shift, + &ncont, &offset); + if (err) + return err; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_umem; + } + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + if (MLX5_CAP_ETH(dev->mdev, multi_pkt_send_wqe)) + MLX5_SET(sqc, sqc, allow_multi_pkt_send_wqe, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); + MLX5_SET(sqc, sqc, tis_lst_sz, 1); + MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, swp)) + MLX5_SET(sqc, sqc, allow_swp, 1); + + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size)); + MLX5_SET(wq, wq, log_wq_pg_sz, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(wq, wq, page_offset, offset); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0); + + err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp); + + kvfree(in); + + if (err) + goto err_umem; + + err = create_flow_rule_vport_sq(dev, sq); + if (err) + goto err_flow; + + return 0; + +err_flow: + mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp); + +err_umem: + ib_umem_release(sq->ubuffer.umem); + sq->ubuffer.umem = NULL; + + return err; +} + +static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + destroy_flow_rule_vport_sq(dev, sq); + mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp); + ib_umem_release(sq->ubuffer.umem); 
+} + +static size_t get_rq_pas_size(void *qpc) +{ + u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12; + u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride); + u32 log_rq_size = MLX5_GET(qpc, qpc, log_rq_size); + u32 page_offset = MLX5_GET(qpc, qpc, page_offset); + u32 po_quanta = 1 << (log_page_size - 6); + u32 rq_sz = 1 << (log_rq_size + 4 + log_rq_stride); + u32 page_size = 1 << log_page_size; + u32 rq_sz_po = rq_sz + (page_offset * po_quanta); + u32 rq_num_pas = (rq_sz_po + page_size - 1) / page_size; + + return rq_num_pas * sizeof(u64); +} + +static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, void *qpin, + size_t qpinlen) +{ + struct mlx5_ib_qp *mqp = rq->base.container_mibqp; + __be64 *pas; + __be64 *qp_pas; + void *in; + void *rqc; + void *wq; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + size_t rq_pas_size = get_rq_pas_size(qpc); + size_t inlen; + int err; + + if (qpinlen < rq_pas_size + MLX5_BYTE_OFF(create_qp_in, pas)) + return -EINVAL; + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + if (!(rq->flags & MLX5_IB_RQ_CVLAN_STRIPPING)) + MLX5_SET(rqc, rqc, vsd, 1); + MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + + if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS) + MLX5_SET(rqc, rqc, scatter_fcs, 1); + + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + if (rq->flags & MLX5_IB_RQ_PCI_WRITE_END_PADDING) + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset)); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4); + MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size)); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas); + memcpy(pas, qp_pas, rq_pas_size); + + err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp); +} + +static bool tunnel_offload_supported(struct mlx5_core_dev *dev) +{ + return (MLX5_CAP_ETH(dev, tunnel_stateless_vxlan) || + MLX5_CAP_ETH(dev, tunnel_stateless_gre) || + MLX5_CAP_ETH(dev, tunnel_stateless_geneve_rx)); +} + +static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, u32 tdn, + bool tunnel_offload_en) +{ + u32 *in; + void *tirc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn); + MLX5_SET(tirc, tirc, transport_domain, tdn); + if (tunnel_offload_en) + MLX5_SET(tirc, tirc, tunneled_offload_en, 1); + + if (dev->rep) + MLX5_SET(tirc, tirc, self_lb_block, + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_); + + err = 
mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_tir(dev->mdev, rq->tirn); +} + +static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u32 *in, size_t inlen, + struct ib_pd *pd) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + int err; + u32 tdn = mucontext->tdn; + + if (qp->sq.wqe_cnt) { + err = create_raw_packet_qp_tis(dev, qp, sq, tdn); + if (err) + return err; + + err = create_raw_packet_qp_sq(dev, sq, in, pd); + if (err) + goto err_destroy_tis; + + sq->base.container_mibqp = qp; + sq->base.mqp.event = mlx5_ib_qp_event; + } + + if (qp->rq.wqe_cnt) { + rq->base.container_mibqp = qp; + + if (qp->flags & MLX5_IB_QP_CVLAN_STRIPPING) + rq->flags |= MLX5_IB_RQ_CVLAN_STRIPPING; + if (qp->flags & MLX5_IB_QP_PCI_WRITE_END_PADDING) + rq->flags |= MLX5_IB_RQ_PCI_WRITE_END_PADDING; + err = create_raw_packet_qp_rq(dev, rq, in, inlen); + if (err) + goto err_destroy_sq; + + + err = create_raw_packet_qp_tir(dev, rq, tdn, + qp->tunnel_offload_en); + if (err) + goto err_destroy_rq; + } + + qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn : + rq->base.mqp.qpn; + + return 0; + +err_destroy_rq: + destroy_raw_packet_qp_rq(dev, rq); +err_destroy_sq: + if (!qp->sq.wqe_cnt) + return err; + destroy_raw_packet_qp_sq(dev, sq); +err_destroy_tis: + destroy_raw_packet_qp_tis(dev, sq); + + return err; +} + +static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + if (qp->rq.wqe_cnt) { + destroy_raw_packet_qp_tir(dev, rq); + destroy_raw_packet_qp_rq(dev, rq); + } + + if (qp->sq.wqe_cnt) { + destroy_raw_packet_qp_sq(dev, sq); + destroy_raw_packet_qp_tis(dev, sq); + } +} + +static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp, + struct mlx5_ib_raw_packet_qp *raw_packet_qp) +{ + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + sq->sq = &qp->sq; + rq->rq = &qp->rq; + sq->doorbell = &qp->db; + rq->doorbell = &qp->db; +} + +static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn); +} + +static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + struct mlx5_ib_create_qp_resp resp = {}; + int inlen; + int err; + u32 *in; + void *tirc; + void *hfso; + u32 selected_fields = 0; + u32 outer_l4; + size_t min_resp_len; + u32 tdn = mucontext->tdn; + struct mlx5_ib_create_qp_rss ucmd = {}; + size_t required_cmd_sz; + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) + return -EOPNOTSUPP; + + if (init_attr->create_flags || init_attr->send_cq) + return -EINVAL; + + min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index); + if (udata->outlen < min_resp_len) + return -EINVAL; + + 
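/* Require at least the bytes up to and including 'flags'; larger (newer) command layouts are accepted only when the extra area is zeroed, as checked below. */ +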
required_cmd_sz = offsetof(typeof(ucmd), flags) + sizeof(ucmd.flags); + if (udata->inlen < required_cmd_sz) { + mlx5_ib_dbg(dev, "invalid inlen\n"); + return -EINVAL; + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + mlx5_ib_dbg(dev, "inlen is not supported\n"); + return -EOPNOTSUPP; + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmd.comp_mask) { + mlx5_ib_dbg(dev, "invalid comp mask\n"); + return -EOPNOTSUPP; + } + + if (ucmd.flags & ~MLX5_QP_FLAG_TUNNEL_OFFLOADS) { + mlx5_ib_dbg(dev, "invalid flags\n"); + return -EOPNOTSUPP; + } + + if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS && + !tunnel_offload_supported(dev->mdev)) { + mlx5_ib_dbg(dev, "tunnel offloads isn't supported\n"); + return -EOPNOTSUPP; + } + + if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER && + !(ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)) { + mlx5_ib_dbg(dev, "Tunnel offloads must be set for inner RSS\n"); + return -EOPNOTSUPP; + } + + err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EINVAL; + } + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, + MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, + init_attr->rwq_ind_tbl->ind_tbl_num); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + + if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) + MLX5_SET(tirc, tirc, tunneled_offload_en, 1); + + if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER) + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner); + else + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + + switch (ucmd.rx_hash_function) { + case MLX5_RX_HASH_FUNC_TOEPLITZ: + { + void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); + size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key); + + if (len != ucmd.rx_key_len) { + err = -EINVAL; + goto err; + } + + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ); + MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + memcpy(rss_key, ucmd.rx_hash_key, len); + break; + } + default: + err = -EOPNOTSUPP; + goto err; + } + + if (!ucmd.rx_hash_fields_mask) { + /* special case when this TIR serves as steering entry without hashing */ + if (!init_attr->rwq_ind_tbl->log_ind_tbl_size) + goto create_tir; + err = -EINVAL; + goto err; + } + + if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) && + ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) { + err = -EINVAL; + goto err; + } + + /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */ + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + + outer_l4 = ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) << 0 | + ((ucmd.rx_hash_fields_mask & 
MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) << 1 | + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) << 2; + + /* Check that only one l4 protocol is set */ + if (outer_l4 & (outer_l4 - 1)) { + err = -EINVAL; + goto err; + } + + /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */ + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_TCP); + else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_UDP); + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT; + + if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) + selected_fields |= MLX5_HASH_FIELD_SEL_IPSEC_SPI; + + MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); + +create_tir: + if (dev->rep) + MLX5_SET(tirc, tirc, self_lb_block, + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_); + + err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn); + + if (err) + goto err; + + kvfree(in); + /* qpn is reserved for that QP */ + qp->trans_qp.base.mqp.qpn = 0; + qp->flags |= MLX5_IB_QP_RSS; + return 0; + +err: + kvfree(in); + return err; +} + +static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_resources *devr = &dev->devr; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_create_qp_resp resp; + struct mlx5_ib_cq *send_cq; + struct mlx5_ib_cq *recv_cq; + unsigned long flags; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_qp_base *base; + int mlx5_st; + void *qpc; + u32 *in; + int err; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + mlx5_st = to_mlx5_st(init_attr->qp_type); + if (mlx5_st < 0) + return -EINVAL; + + if (init_attr->rwq_ind_tbl) { + if (!udata) + return -ENOSYS; + + err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata); + return err; + } + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { + mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); + return -EINVAL; + } else { + qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + } + } + + if (init_attr->create_flags & + (IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) { + if (!MLX5_CAP_GEN(mdev, cd)) { + mlx5_ib_dbg(dev, "cross-channel isn't supported\n"); + return -EINVAL; + } + if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) + qp->flags |= MLX5_IB_QP_CROSS_CHANNEL; + if (init_attr->create_flags & 
IB_QP_CREATE_MANAGED_SEND) + qp->flags |= MLX5_IB_QP_MANAGED_SEND; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) + qp->flags |= MLX5_IB_QP_MANAGED_RECV; + } + + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) + if (!MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + mlx5_ib_dbg(dev, "ipoib UD lso qp isn't supported\n"); + return -EOPNOTSUPP; + } + + if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs"); + return -EOPNOTSUPP; + } + if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) || + !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { + mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n"); + return -EOPNOTSUPP; + } + qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS; + } + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + if (init_attr->create_flags & IB_QP_CREATE_CVLAN_STRIPPING) { + if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, vlan_cap)) || + (init_attr->qp_type != IB_QPT_RAW_PACKET)) + return -EOPNOTSUPP; + qp->flags |= MLX5_IB_QP_CVLAN_STRIPPING; + } + + if (pd && pd->uobject) { + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + err = get_qp_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); + qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); + if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET || + !tunnel_offload_supported(mdev)) { + mlx5_ib_dbg(dev, "Tunnel offload isn't supported\n"); + return -EOPNOTSUPP; + } + qp->tunnel_offload_en = true; + } + + if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) { + if (init_attr->qp_type != IB_QPT_UD || + (MLX5_CAP_GEN(dev->mdev, port_type) != + MLX5_CAP_PORT_TYPE_IB) || + !mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) { + mlx5_ib_dbg(dev, "Source QP option isn't supported\n"); + return -EOPNOTSUPP; + } + + qp->flags |= MLX5_IB_QP_UNDERLAY; + qp->underlay_qpn = init_attr->source_qpn; + } + } else { + qp->wq_sig = !!wq_signature; + } + + base = (init_attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + + qp->has_rq = qp_has_rq(init_attr); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, + qp, (pd && pd->uobject) ? 
&ucmd : NULL); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + if (pd) { + if (pd->uobject) { + __u32 max_wqes = + 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); + if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || + ucmd.rq_wqe_count != qp->rq.wqe_cnt) { + mlx5_ib_dbg(dev, "invalid rq params\n"); + return -EINVAL; + } + if (ucmd.sq_wqe_count > max_wqes) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", + ucmd.sq_wqe_count, max_wqes); + return -EINVAL; + } + if (init_attr->create_flags & + mlx5_ib_create_qp_sqpn_qp1()) { + mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n"); + return -EINVAL; + } + err = create_user_qp(dev, pd, qp, udata, init_attr, &in, + &resp, &inlen, base); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + } else { + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen, + base); + if (err) + mlx5_ib_dbg(dev, "err %d\n", err); + } + + if (err) + return err; + } else { + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + qp->create_type = MLX5_QP_EMPTY; + } + + if (is_sqp(init_attr->qp_type)) + qp->port = init_attr->port_num; + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, mlx5_st); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + + if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) + MLX5_SET(qpc, qpc, pd, to_mpd(pd ? pd : devr->p0)->pdn); + else + MLX5_SET(qpc, qpc, latency_sensitive, 1); + + + if (qp->wq_sig) + MLX5_SET(qpc, qpc, wq_signature, 1); + + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + MLX5_SET(qpc, qpc, block_lb_mc, 1); + + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + MLX5_SET(qpc, qpc, cd_master, 1); + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + MLX5_SET(qpc, qpc, cd_slave_send, 1); + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + MLX5_SET(qpc, qpc, cd_slave_receive, 1); + + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { + int rcqe_sz; + int scqe_sz; + + rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); + scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); + + if (rcqe_sz == 128) + MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); + else + MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE); + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) { + if (scqe_sz == 128) + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE); + else + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE); + } + } + + if (qp->rq.wqe_cnt) { + MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt)); + } + + MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, init_attr)); + + if (qp->sq.wqe_cnt) { + MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt)); + } else { + MLX5_SET(qpc, qpc, no_sq, 1); + if (init_attr->srq && + init_attr->srq->srq_type == IB_SRQT_TM) + MLX5_SET(qpc, qpc, offload_type, + MLX5_QPC_OFFLOAD_TYPE_RNDV); + } + + /* Set default resources */ + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(init_attr->xrcd)->xrcdn); + break; + case IB_QPT_XRC_INI: + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn); + break; + default: + if (init_attr->srq) { + MLX5_SET(qpc, 
qpc, xrcd, to_mxrcd(devr->x0)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(init_attr->srq)->msrq.srqn); + } else { + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s1)->msrq.srqn); + } + } + + if (init_attr->send_cq) + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(init_attr->send_cq)->mcq.cqn); + + if (init_attr->recv_cq) + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(init_attr->recv_cq)->mcq.cqn); + + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + + /* 0xffffff means we ask to work with cqe version 0 */ + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) + MLX5_SET(qpc, qpc, user_index, uidx); + + /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) { + MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); + qp->flags |= MLX5_IB_QP_LSO; + } + + if (init_attr->create_flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) { + if (!MLX5_CAP_GEN(dev->mdev, end_pad)) { + mlx5_ib_dbg(dev, "scatter end padding is not supported\n"); + err = -EOPNOTSUPP; + goto err; + } else if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + MLX5_SET(qpc, qpc, end_padding_mode, + MLX5_WQ_END_PAD_MODE_ALIGN); + } else { + qp->flags |= MLX5_IB_QP_PCI_WRITE_END_PADDING; + } + } + + if (inlen < 0) { + err = -EINVAL; + goto err; + } + + if (init_attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { + qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; + raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); + err = create_raw_packet_qp(dev, qp, in, inlen, pd); + } else { + err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen); + } + + if (err) { + mlx5_ib_dbg(dev, "create qp failed\n"); + goto err_create; + } + + kvfree(in); + + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; + + get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq, + &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* Maintain device to QPs access, needed for further handling via reset + * flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling via reset flow + */ + if (send_cq) + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + if (recv_cq) + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + return 0; + +err_create: + if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(dev, pd, qp, base); + else if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + +err: + kvfree(in); + return err; +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, + SINGLE_DEPTH_NESTING); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } else { + spin_lock(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, + SINGLE_DEPTH_NESTING); + } + } else { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } + } else if (recv_cq) { + spin_lock(&recv_cq->lock); + __acquire(&send_cq->lock); + } else { + __acquire(&send_cq->lock); + __acquire(&recv_cq->lock); + } +} + +static void mlx5_ib_unlock_cqs(struct 
mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } + } else { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } + } else if (recv_cq) { + __release(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } else { + __release(&recv_cq->lock); + __release(&send_cq->lock); + } +} + +static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) +{ + return to_mpd(qp->ibqp.pd); +} + +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +{ + switch (qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = NULL; + *recv_cq = NULL; + break; + case MLX5_IB_QPT_REG_UMR: + case IB_QPT_XRC_INI: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = NULL; + break; + + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL; + break; + + case IB_QPT_MAX: + default: + *send_cq = NULL; + *recv_cq = NULL; + break; + } +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct mlx5_modify_raw_qp_param *raw_qp_param, + u8 lag_tx_affinity); + +static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_ib_qp_base *base; + unsigned long flags; + int err; + + if (qp->ibqp.rwq_ind_tbl) { + destroy_rss_raw_qp_tir(dev, qp); + return; + } + + base = (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + + if (qp->state != IB_QPS_RESET) { + if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET && + !(qp->flags & MLX5_IB_QP_UNDERLAY)) { + err = mlx5_core_qp_modify(dev->mdev, + MLX5_CMD_OP_2RST_QP, 0, + NULL, &base->mqp); + } else { + struct mlx5_modify_raw_qp_param raw_qp_param = { + .operation = MLX5_CMD_OP_2RST_QP + }; + + err = modify_raw_packet_qp(dev, qp, &raw_qp_param, 0); + } + if (err) + mlx5_ib_warn(dev, "mlx5_ib: modify QP 0x%06x to RESET failed\n", + base->mqp.qpn); + } + + get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + if (send_cq) + list_del(&qp->cq_send_list); + + if (recv_cq) + list_del(&qp->cq_recv_list); + + if (qp->create_type == MLX5_QP_KERNEL) { + __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, + qp->ibqp.srq ? 
to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __mlx5_ib_cq_clean(send_cq, base->mqp.qpn, + NULL); + } + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { + destroy_raw_packet_qp(dev, qp); + } else { + err = mlx5_core_destroy_qp(dev->mdev, &base->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", + base->mqp.qpn); + } + + if (qp->create_type == MLX5_QP_KERNEL) + destroy_qp_kernel(dev, qp); + else if (qp->create_type == MLX5_QP_USER) + destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base); +} + +static const char *ib_qp_type_str(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_SMI: + return "IB_QPT_SMI"; + case IB_QPT_GSI: + return "IB_QPT_GSI"; + case IB_QPT_RC: + return "IB_QPT_RC"; + case IB_QPT_UC: + return "IB_QPT_UC"; + case IB_QPT_UD: + return "IB_QPT_UD"; + case IB_QPT_RAW_IPV6: + return "IB_QPT_RAW_IPV6"; + case IB_QPT_RAW_ETHERTYPE: + return "IB_QPT_RAW_ETHERTYPE"; + case IB_QPT_XRC_INI: + return "IB_QPT_XRC_INI"; + case IB_QPT_XRC_TGT: + return "IB_QPT_XRC_TGT"; + case IB_QPT_RAW_PACKET: + return "IB_QPT_RAW_PACKET"; + case MLX5_IB_QPT_REG_UMR: + return "MLX5_IB_QPT_REG_UMR"; + case IB_QPT_DRIVER: + return "IB_QPT_DRIVER"; + case IB_QPT_MAX: + default: + return "Invalid QP type"; + } +} + +static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_ib_qp *qp; + int err = 0; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + void *dctc; + + if (!attr->srq || !attr->recv_cq) + return ERR_PTR(-EINVAL); + + err = get_qp_user_index(to_mucontext(pd->uobject->context), + ucmd, sizeof(*ucmd), &uidx); + if (err) + return ERR_PTR(err); + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL); + if (!qp->dct.in) { + err = -ENOMEM; + goto err_free; + } + + dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + qp->qp_sub_type = MLX5_IB_QPT_DCT; + MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn); + MLX5_SET(dctc, dctc, srqn_xrqn, to_msrq(attr->srq)->msrq.srqn); + MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn); + MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key); + MLX5_SET(dctc, dctc, user_index, uidx); + + qp->state = IB_QPS_RESET; + + return &qp->ibqp; +err_free: + kfree(qp); + return ERR_PTR(err); +} + +static int set_mlx_qp_type(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_create_qp *ucmd, + struct ib_udata *udata) +{ + enum { MLX_QP_FLAGS = MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI }; + int err; + + if (!udata) + return -EINVAL; + + if (udata->inlen < sizeof(*ucmd)) { + mlx5_ib_dbg(dev, "create_qp user command is smaller than expected\n"); + return -EINVAL; + } + err = ib_copy_from_udata(ucmd, udata, sizeof(*ucmd)); + if (err) + return err; + + if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCI) { + init_attr->qp_type = MLX5_IB_QPT_DCI; + } else { + if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCT) { + init_attr->qp_type = MLX5_IB_QPT_DCT; + } else { + mlx5_ib_dbg(dev, "Invalid QP flags\n"); + return -EINVAL; + } + } + + if (!MLX5_CAP_GEN(dev->mdev, dct)) { + mlx5_ib_dbg(dev, "DC transport is not supported\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *verbs_init_attr, + struct ib_udata *udata) +{ + struct 
mlx5_ib_dev *dev; + struct mlx5_ib_qp *qp; + u16 xrcdn = 0; + int err; + struct ib_qp_init_attr mlx_init_attr; + struct ib_qp_init_attr *init_attr = verbs_init_attr; + + if (pd) { + dev = to_mdev(pd->device); + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + if (!pd->uobject) { + mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n"); + return ERR_PTR(-EINVAL); + } else if (!to_mucontext(pd->uobject->context)->cqe_version) { + mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n"); + return ERR_PTR(-EINVAL); + } + } + } else { + /* being cautious here */ + if (init_attr->qp_type != IB_QPT_XRC_TGT && + init_attr->qp_type != MLX5_IB_QPT_REG_UMR) { + pr_warn("%s: no PD for transport %s\n", __func__, + ib_qp_type_str(init_attr->qp_type)); + return ERR_PTR(-EINVAL); + } + dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + } + + if (init_attr->qp_type == IB_QPT_DRIVER) { + struct mlx5_ib_create_qp ucmd; + + init_attr = &mlx_init_attr; + memcpy(init_attr, verbs_init_attr, sizeof(*verbs_init_attr)); + err = set_mlx_qp_type(dev, init_attr, &ucmd, udata); + if (err) + return ERR_PTR(err); + + if (init_attr->qp_type == MLX5_IB_QPT_DCI) { + if (init_attr->cap.max_recv_wr || + init_attr->cap.max_recv_sge) { + mlx5_ib_dbg(dev, "DCI QP requires zero size receive queue\n"); + return ERR_PTR(-EINVAL); + } + } else { + return mlx5_ib_create_dct(pd, init_attr, &ucmd); + } + } + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: + case IB_QPT_XRC_INI: + if (!MLX5_CAP_GEN(dev->mdev, xrc)) { + mlx5_ib_dbg(dev, "XRC not supported\n"); + return ERR_PTR(-ENOSYS); + } + init_attr->recv_cq = NULL; + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = NULL; + } + + /* fall through */ + case IB_QPT_RAW_PACKET: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + case MLX5_IB_QPT_REG_UMR: + case MLX5_IB_QPT_DCI: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + err = create_qp_common(dev, pd, init_attr, udata, qp); + if (err) { + mlx5_ib_dbg(dev, "create_qp_common failed\n"); + kfree(qp); + return ERR_PTR(err); + } + + if (is_qp0(init_attr->qp_type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(init_attr->qp_type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; + + mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, + init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1, + init_attr->send_cq ? 
to_mcq(init_attr->send_cq)->mcq.cqn : -1); + + qp->trans_qp.xrcdn = xrcdn; + + break; + + case IB_QPT_GSI: + return mlx5_ib_gsi_create_qp(pd, init_attr); + + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_MAX: + default: + mlx5_ib_dbg(dev, "unsupported qp type %d\n", + init_attr->qp_type); + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + if (verbs_init_attr->qp_type == IB_QPT_DRIVER) + qp->qp_sub_type = init_attr->qp_type; + + return &qp->ibqp; +} + +static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) +{ + struct mlx5_ib_dev *dev = to_mdev(mqp->ibqp.device); + + if (mqp->state == IB_QPS_RTR) { + int err; + + err = mlx5_core_destroy_dct(dev->mdev, &mqp->dct.mdct); + if (err) { + mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err); + return err; + } + } + + kfree(mqp->dct.in); + kfree(mqp); + return 0; +} + +int mlx5_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + + if (unlikely(qp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_destroy_qp(qp); + + if (mqp->qp_sub_type == MLX5_IB_QPT_DCT) + return mlx5_ib_destroy_dct(mqp); + + destroy_qp_common(dev, mqp); + + kfree(mqp); + + return 0; +} + +static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, + int attr_mask) +{ + u32 hw_access_flags = 0; + u8 dest_rd_atomic; + u32 access_flags; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->trans_qp.resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->trans_qp.atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + hw_access_flags |= MLX5_QP_BIT_RRE; + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) + hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX); + if (access_flags & IB_ACCESS_REMOTE_WRITE) + hw_access_flags |= MLX5_QP_BIT_RWE; + + return cpu_to_be32(hw_access_flags); +} + +enum { + MLX5_PATH_FLAG_FL = 1 << 0, + MLX5_PATH_FLAG_FREE_AR = 1 << 1, + MLX5_PATH_FLAG_COUNTER = 1 << 2, +}; + +static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +{ + if (rate == IB_RATE_PORT_CURRENT) + return 0; + + if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) + return -EINVAL; + + while (rate != IB_RATE_PORT_CURRENT && + !(1 << (rate + MLX5_STAT_RATE_OFFSET) & + MLX5_CAP_GEN(dev->mdev, stat_rate_support))) + --rate; + + return rate ? 
rate + MLX5_STAT_RATE_OFFSET : rate; +} + +static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 sl) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.prio, 1); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); + + err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + + kvfree(in); + + return err; +} + +static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 tx_affinity) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity); + + err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + + kvfree(in); + + return err; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct rdma_ah_attr *ah, + struct mlx5_qp_path *path, u8 port, int attr_mask, + u32 path_flags, const struct ib_qp_attr *attr, + bool alt) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(ah); + int err; + enum ib_gid_type gid_type; + u8 ah_flags = rdma_ah_get_ah_flags(ah); + u8 sl = rdma_ah_get_sl(ah); + + if (attr_mask & IB_QP_PKEY_INDEX) + path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index : + attr->pkey_index); + + if (ah_flags & IB_AH_GRH) { + if (grh->sgid_index >= + dev->mdev->port_caps[port - 1].gid_table_len) { + pr_err("sgid_index (%u) too large. max is %d\n", + grh->sgid_index, + dev->mdev->port_caps[port - 1].gid_table_len); + return -EINVAL; + } + } + + if (ah->type == RDMA_AH_ATTR_TYPE_ROCE) { + if (!(ah_flags & IB_AH_GRH)) + return -EINVAL; + err = mlx5_get_roce_gid_type(dev, port, grh->sgid_index, + &gid_type); + if (err) + return err; + memcpy(path->rmac, ah->roce.dmac, sizeof(ah->roce.dmac)); + if (qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_XRC_INI || + qp->ibqp.qp_type == IB_QPT_XRC_TGT) + path->udp_sport = mlx5_get_roce_udp_sport(dev, port, + grh->sgid_index); + path->dci_cfi_prio_sl = (sl & 0x7) << 4; + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + path->ecn_dscp = (grh->traffic_class >> 2) & 0x3f; + } else { + path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->fl_free_ar |= + (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0; + path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah)); + path->grh_mlid = rdma_ah_get_path_bits(ah) & 0x7f; + if (ah_flags & IB_AH_GRH) + path->grh_mlid |= 1 << 7; + path->dci_cfi_prio_sl = sl & 0xf; + } + + if (ah_flags & IB_AH_GRH) { + path->mgid_index = grh->sgid_index; + path->hop_limit = grh->hop_limit; + path->tclass_flowlabel = + cpu_to_be32((grh->traffic_class << 20) | + (grh->flow_label)); + memcpy(path->rgid, grh->dgid.raw, 16); + } + + err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah)); + if (err < 0) + return err; + path->static_rate = err; + path->port = port; + + if (attr_mask & IB_QP_TIMEOUT) + path->ackto_lt = (alt ? 
attr->alt_timeout : attr->timeout) << 3; + + if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) + return modify_raw_packet_eth_prio(dev->mdev, + &qp->raw_packet_qp.sq, + sl & 0xf); + + return 0; +} + +static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_PRI_PORT, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX, + }, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + }, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_SRQN | + MLX5_QP_OPTPAR_CQN_RCV, + }, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + }, + }, +}; + +static int ib_nr_to_mlx5_nr(int ib_mask) +{ + switch (ib_mask) { + case IB_QP_STATE: + return 0; + case IB_QP_CUR_STATE: + return 0; + case IB_QP_EN_SQD_ASYNC_NOTIFY: + return 0; + case IB_QP_ACCESS_FLAGS: + return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE; + case IB_QP_PKEY_INDEX: + return MLX5_QP_OPTPAR_PKEY_INDEX; + case IB_QP_PORT: + return MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_QKEY: + return MLX5_QP_OPTPAR_Q_KEY; + case IB_QP_AV: + return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_PATH_MTU: + return 0; + case IB_QP_TIMEOUT: + return MLX5_QP_OPTPAR_ACK_TIMEOUT; + case IB_QP_RETRY_CNT: + return MLX5_QP_OPTPAR_RETRY_COUNT; + case IB_QP_RNR_RETRY: + return MLX5_QP_OPTPAR_RNR_RETRY; + case IB_QP_RQ_PSN: + return 0; + case IB_QP_MAX_QP_RD_ATOMIC: + return MLX5_QP_OPTPAR_SRA_MAX; + case IB_QP_ALT_PATH: + return MLX5_QP_OPTPAR_ALT_ADDR_PATH; + case IB_QP_MIN_RNR_TIMER: + return MLX5_QP_OPTPAR_RNR_TIMEOUT; + case IB_QP_SQ_PSN: + return 0; + case IB_QP_MAX_DEST_RD_ATOMIC: + return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; + 
case IB_QP_PATH_MIG_STATE: + return MLX5_QP_OPTPAR_PM_STATE; + case IB_QP_CAP: + return 0; + case IB_QP_DEST_QPN: + return 0; + } + return 0; +} + +static int ib_mask_to_mlx5_opt(int ib_mask) +{ + int result = 0; + int i; + + for (i = 0; i < 8 * sizeof(int); i++) { + if ((1 << i) & ib_mask) + result |= ib_nr_to_mlx5_nr(1 << i); + } + + return result; +} + +static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param) +{ + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_rq_in, in, rq_state, rq->state); + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + MLX5_SET(rqc, rqc, state, new_state); + + if (raw_qp_param->set_mask & MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID) { + if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); + MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id); + } else + pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n", + dev->ib_dev.name); + } + + err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in, inlen); + if (err) + goto out; + + rq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, + int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param) +{ + struct mlx5_ib_qp *ibqp = sq->base.container_mibqp; + struct mlx5_rate_limit old_rl = ibqp->rl; + struct mlx5_rate_limit new_rl = old_rl; + bool new_rate_added = false; + u16 rl_index = 0; + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_sq_in, in, sq_state, sq->state); + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + MLX5_SET(sqc, sqc, state, new_state); + + if (raw_qp_param->set_mask & MLX5_RAW_QP_RATE_LIMIT) { + if (new_state != MLX5_SQC_STATE_RDY) + pr_warn("%s: Rate limit can only be changed when SQ is moving to RDY\n", + __func__); + else + new_rl = raw_qp_param->rl; + } + + if (!mlx5_rl_are_equal(&old_rl, &new_rl)) { + if (new_rl.rate) { + err = mlx5_rl_add_rate(dev, &rl_index, &new_rl); + if (err) { + pr_err("Failed configuring rate limit(err %d): \ + rate %u, max_burst_sz %u, typical_pkt_sz %u\n", + err, new_rl.rate, new_rl.max_burst_sz, + new_rl.typical_pkt_sz); + + goto out; + } + new_rate_added = true; + } + + MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); + /* index 0 means no limit */ + MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); + } + + err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen); + if (err) { + /* Remove new rate from table if failed */ + if (new_rate_added) + mlx5_rl_remove_rate(dev, &new_rl); + goto out; + } + + /* Only remove the old rate after new rate was set */ + if ((old_rl.rate && + !mlx5_rl_are_equal(&old_rl, &new_rl)) || + (new_state != MLX5_SQC_STATE_RDY)) + mlx5_rl_remove_rate(dev, &old_rl); + + ibqp->rl = new_rl; + sq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct mlx5_modify_raw_qp_param *raw_qp_param, + u8 tx_affinity) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct mlx5_ib_sq *sq = 
&raw_packet_qp->sq; + int modify_rq = !!qp->rq.wqe_cnt; + int modify_sq = !!qp->sq.wqe_cnt; + int rq_state; + int sq_state; + int err; + + switch (raw_qp_param->operation) { + case MLX5_CMD_OP_RST2INIT_QP: + rq_state = MLX5_RQC_STATE_RDY; + sq_state = MLX5_SQC_STATE_RDY; + break; + case MLX5_CMD_OP_2ERR_QP: + rq_state = MLX5_RQC_STATE_ERR; + sq_state = MLX5_SQC_STATE_ERR; + break; + case MLX5_CMD_OP_2RST_QP: + rq_state = MLX5_RQC_STATE_RST; + sq_state = MLX5_SQC_STATE_RST; + break; + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + if (raw_qp_param->set_mask == + MLX5_RAW_QP_RATE_LIMIT) { + modify_rq = 0; + sq_state = sq->state; + } else { + return raw_qp_param->set_mask ? -EINVAL : 0; + } + break; + case MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + if (raw_qp_param->set_mask) + return -EINVAL; + else + return 0; + default: + WARN_ON(1); + return -EINVAL; + } + + if (modify_rq) { + err = modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param); + if (err) + return err; + } + + if (modify_sq) { + if (tx_affinity) { + err = modify_raw_packet_tx_affinity(dev->mdev, sq, + tx_affinity); + if (err) + return err; + } + + return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state, raw_qp_param); + } + + return 0; +} + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state, + const struct mlx5_ib_modify_qp *ucmd) +{ + static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { + [MLX5_QP_STATE_RST] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, + }, + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, + [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, + }, + [MLX5_QP_STATE_ERR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + } + }; + + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_qp_context *context; + struct mlx5_ib_pd *pd; + struct mlx5_ib_port *mibport = NULL; + enum mlx5_qp_state mlx5_cur, mlx5_new; + enum mlx5_qp_optpar optpar; + int mlx5_st; + int err; + u16 op; + u8 tx_affinity = 0; + + mlx5_st = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ? 
+ qp->qp_sub_type : ibqp->qp_type); + if (mlx5_st < 0) + return -EINVAL; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + context->flags = cpu_to_be32(mlx5_st << 16); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + } else { + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); + break; + } + } + + if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { + if ((ibqp->qp_type == IB_QPT_RC) || + (ibqp->qp_type == IB_QPT_UD && + !(qp->flags & MLX5_IB_QP_SQPN_QP1)) || + (ibqp->qp_type == IB_QPT_UC) || + (ibqp->qp_type == IB_QPT_RAW_PACKET) || + (ibqp->qp_type == IB_QPT_XRC_INI) || + (ibqp->qp_type == IB_QPT_XRC_TGT)) { + if (mlx5_lag_is_active(dev->mdev)) { + u8 p = mlx5_core_native_port_num(dev->mdev); + tx_affinity = (unsigned int)atomic_add_return(1, + &dev->roce[p].next_port) % + MLX5_MAX_PORTS + 1; + context->flags |= cpu_to_be32(tx_affinity << 24); + } + } + } + + if (is_sqp(ibqp->qp_type)) { + context->mtu_msgmax = (IB_MTU_256 << 5) | 8; + } else if ((ibqp->qp_type == IB_QPT_UD && + !(qp->flags & MLX5_IB_QP_UNDERLAY)) || + ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { + context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu); + err = -EINVAL; + goto out; + } + context->mtu_msgmax = (attr->path_mtu << 5) | + (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg); + } + + if (attr_mask & IB_QP_DEST_QPN) + context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); + + if (attr_mask & IB_QP_PKEY_INDEX) + context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index); + + /* todo implement counter_index functionality */ + + if (is_sqp(ibqp->qp_type)) + context->pri_path.port = qp->port; + + if (attr_mask & IB_QP_PORT) + context->pri_path.port = attr->port_num; + + if (attr_mask & IB_QP_AV) { + err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port, + attr_mask, 0, attr, false); + if (err) + goto out; + } + + if (attr_mask & IB_QP_TIMEOUT) + context->pri_path.ackto_lt |= attr->timeout << 3; + + if (attr_mask & IB_QP_ALT_PATH) { + err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, + &context->alt_path, + attr->alt_port_num, + attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, + 0, attr, true); + if (err) + goto out; + } + + pd = get_pd(qp); + get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); + + context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; + context->cqn_recv = recv_cq ? 
cpu_to_be32(recv_cq->mcq.cqn) : 0; + context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); + + if (attr_mask & IB_QP_RNR_RETRY) + context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + + if (attr_mask & IB_QP_RETRY_CNT) + context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) + context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + + if (attr_mask & IB_QP_SQ_PSN) + context->next_send_psn = cpu_to_be32(attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + + if (attr_mask & IB_QP_RQ_PSN) + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + if (attr_mask & IB_QP_QKEY) + context->qkey = cpu_to_be32(attr->qkey); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->db_rec_addr = cpu_to_be64(qp->db.dma); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num : + qp->port) - 1; + + /* Underlay port should be used - index 0 function per port */ + if (qp->flags & MLX5_IB_QP_UNDERLAY) + port_num = 0; + + mibport = &dev->port[port_num]; + context->qp_counter_set_usr_page |= + cpu_to_be32((u32)(mibport->cnts.set_id) << 24); + } + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->sq_crq_size |= cpu_to_be16(1 << 4); + + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + context->deth_sqpn = cpu_to_be32(1); + + mlx5_cur = to_mlx5_state(cur_state); + mlx5_new = to_mlx5_state(new_state); + + if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || + !optab[mlx5_cur][mlx5_new]) { + err = -EINVAL; + goto out; + } + + op = optab[mlx5_cur][mlx5_new]; + optpar = ib_mask_to_mlx5_opt(attr_mask); + optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { + struct mlx5_modify_raw_qp_param raw_qp_param = {}; + + raw_qp_param.operation = op; + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + raw_qp_param.rq_q_ctr_id = mibport->cnts.set_id; + raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID; + } + + if (attr_mask & IB_QP_RATE_LIMIT) { + raw_qp_param.rl.rate = attr->rate_limit; + + if (ucmd->burst_info.max_burst_sz) { + if (attr->rate_limit && + MLX5_CAP_QOS(dev->mdev, packet_pacing_burst_bound)) { + raw_qp_param.rl.max_burst_sz = + ucmd->burst_info.max_burst_sz; + } else { + err = -EINVAL; + goto out; + } + } + + if (ucmd->burst_info.typical_pkt_sz) { + if (attr->rate_limit && + MLX5_CAP_QOS(dev->mdev, packet_pacing_typical_size)) { + raw_qp_param.rl.typical_pkt_sz = + ucmd->burst_info.typical_pkt_sz; + } else { + err = -EINVAL; + goto out; + } + } + + raw_qp_param.set_mask |= MLX5_RAW_QP_RATE_LIMIT; + } + + err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); + } else { + err = mlx5_core_qp_modify(dev->mdev, op, optpar, context, + &base->mqp); + } + + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->trans_qp.atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->trans_qp.resp_depth = attr->max_dest_rd_atomic; + if 
(attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->trans_qp.alt_port = attr->alt_port_num; + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && + !ibqp->uobject && ibqp->qp_type != IB_QPT_XRC_TGT) { + mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq.cur_post = 0; + qp->sq.last_poll = 0; + qp->db.db[MLX5_RCV_DBR] = 0; + qp->db.db[MLX5_SND_DBR] = 0; + } + +out: + kfree(context); + return err; +} + +static inline bool is_valid_mask(int mask, int req, int opt) +{ + if ((mask & req) != req) + return false; + + if (mask & ~(req | opt)) + return false; + + return true; +} + +/* check valid transition for driver QP types + * for now the only QP type that this function supports is DCI + */ +static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new_state, + enum ib_qp_attr_mask attr_mask) +{ + int req = IB_QP_STATE; + int opt = 0; + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + req |= IB_QP_PKEY_INDEX | IB_QP_PORT; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { + opt = IB_QP_PKEY_INDEX | IB_QP_PORT; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + req |= IB_QP_PATH_MTU; + opt = IB_QP_PKEY_INDEX; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { + req |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_SQ_PSN; + opt = IB_QP_MIN_RNR_TIMER; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RTS && new_state == IB_QPS_RTS) { + opt = IB_QP_MIN_RNR_TIMER; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state != IB_QPS_RESET && new_state == IB_QPS_ERR) { + return is_valid_mask(attr_mask, req, opt); + } + return false; +} + +/* mlx5_ib_modify_dct: modify a DCT QP + * valid transitions are: + * RESET to INIT: must set access_flags, pkey_index and port + * INIT to RTR : must set min_rnr_timer, tclass, flow_label, + * mtu, gid_index and hop_limit + * Other transitions and attributes are illegal + */ +static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + enum ib_qp_state cur_state, new_state; + int err = 0; + int required = IB_QP_STATE; + void *dctc; + + if (!(attr_mask & IB_QP_STATE)) + return -EINVAL; + + cur_state = qp->state; + new_state = attr->qp_state; + + dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + required |= IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + if (!is_valid_mask(attr_mask, required, 0)) + return -EINVAL; + + if (attr->port_num == 0 || + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports)) { + mlx5_ib_dbg(dev, "invalid port number %d. 
number of ports is %d\n", + attr->port_num, dev->num_ports); + return -EINVAL; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + MLX5_SET(dctc, dctc, rre, 1); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + MLX5_SET(dctc, dctc, rwe, 1); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { + if (!mlx5_ib_dc_atomic_is_supported(dev)) + return -EOPNOTSUPP; + MLX5_SET(dctc, dctc, rae, 1); + MLX5_SET(dctc, dctc, atomic_mode, MLX5_ATOMIC_MODE_DCT_CX); + } + MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index); + MLX5_SET(dctc, dctc, port, attr->port_num); + MLX5_SET(dctc, dctc, counter_set_id, dev->port[attr->port_num - 1].cnts.set_id); + + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + struct mlx5_ib_modify_qp_resp resp = {}; + u32 min_resp_len = offsetof(typeof(resp), dctn) + + sizeof(resp.dctn); + + if (udata->outlen < min_resp_len) + return -EINVAL; + resp.response_length = min_resp_len; + + required |= IB_QP_MIN_RNR_TIMER | IB_QP_AV | IB_QP_PATH_MTU; + if (!is_valid_mask(attr_mask, required, 0)) + return -EINVAL; + MLX5_SET(dctc, dctc, min_rnr_nak, attr->min_rnr_timer); + MLX5_SET(dctc, dctc, tclass, attr->ah_attr.grh.traffic_class); + MLX5_SET(dctc, dctc, flow_label, attr->ah_attr.grh.flow_label); + MLX5_SET(dctc, dctc, mtu, attr->path_mtu); + MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index); + MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); + + err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in, + MLX5_ST_SZ_BYTES(create_dct_in)); + if (err) + return err; + resp.dctn = qp->dct.mdct.mqp.qpn; + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_dct(dev->mdev, &qp->dct.mdct); + return err; + } + } else { + mlx5_ib_warn(dev, "Modify DCT: Invalid transition from %d to %d\n", cur_state, new_state); + return -EINVAL; + } + if (err) + qp->state = IB_QPS_ERR; + else + qp->state = new_state; + return err; +} + +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_modify_qp ucmd = {}; + enum ib_qp_type qp_type; + enum ib_qp_state cur_state, new_state; + size_t required_cmd_sz; + int err = -EINVAL; + int port; + enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; + + if (ibqp->rwq_ind_tbl) + return -ENOSYS; + + if (udata && udata->inlen) { + required_cmd_sz = offsetof(typeof(ucmd), reserved) + + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, + min(udata->inlen, sizeof(ucmd)))) + return -EFAULT; + + if (ucmd.comp_mask || + memchr_inv(&ucmd.reserved, 0, sizeof(ucmd.reserved)) || + memchr_inv(&ucmd.burst_info.reserved, 0, + sizeof(ucmd.burst_info.reserved))) + return -EOPNOTSUPP; + } + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); + + if (ibqp->qp_type == IB_QPT_DRIVER) + qp_type = qp->qp_sub_type; + else + qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? + IB_QPT_GSI : ibqp->qp_type; + + if (qp_type == MLX5_IB_QPT_DCT) + return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata); + + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; + + if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { + port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); + } + + if (qp->flags & MLX5_IB_QP_UNDERLAY) { + if (attr_mask & ~(IB_QP_STATE | IB_QP_CUR_STATE)) { + mlx5_ib_dbg(dev, "invalid attr_mask 0x%x when underlay QP is used\n", + attr_mask); + goto out; + } + } else if (qp_type != MLX5_IB_QPT_REG_UMR && + qp_type != MLX5_IB_QPT_DCI && + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, ibqp->qp_type, attr_mask); + goto out; + } else if (qp_type == MLX5_IB_QPT_DCI && + !modify_dci_qp_is_ok(cur_state, new_state, attr_mask)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, qp_type, attr_mask); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || + attr->port_num > dev->num_ports)) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); + goto out; + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + if (attr->pkey_index >= + dev->mdev->port_caps[port - 1].pkey_table_len) { + mlx5_ib_dbg(dev, "invalid pkey index %d\n", + attr->pkey_index); + goto out; + } + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) { + mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", + attr->max_rd_atomic); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) { + mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", + attr->max_dest_rd_atomic); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, + new_state, &ucmd); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + struct mlx5_ib_cq *cq; + unsigned cur; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg, + struct ib_send_wr *wr, void *qend, + struct mlx5_ib_qp *qp, int *size) +{ + void *seg = eseg; + + memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); + + if (wr->send_flags & IB_SEND_IP_CSUM) + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + + seg += sizeof(struct mlx5_wqe_eth_seg); + *size += sizeof(struct mlx5_wqe_eth_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); + int size_of_inl_hdr_start = sizeof(eseg->inline_hdr.start); + u64 left, leftlen, copysz; + void *pdata = ud_wr->header; + + left = ud_wr->hlen; + eseg->mss = cpu_to_be16(ud_wr->mss); + eseg->inline_hdr.sz = cpu_to_be16(left); + + /* + * check if there is space till the end of queue, if yes, + * copy all in one shot, 
otherwise copy till the end of queue, + * rollback and than the copy the left + */ + leftlen = qend - (void *)eseg->inline_hdr.start; + copysz = min_t(u64, leftlen, left); + + memcpy(seg - size_of_inl_hdr_start, pdata, copysz); + + if (likely(copysz > size_of_inl_hdr_start)) { + seg += ALIGN(copysz - size_of_inl_hdr_start, 16); + *size += ALIGN(copysz - size_of_inl_hdr_start, 16) / 16; + } + + if (unlikely(copysz < left)) { /* the last wqe in the queue */ + seg = mlx5_get_send_wqe(qp, 0); + left -= copysz; + pdata += copysz; + memcpy(seg, pdata, left); + seg += ALIGN(left, 16); + *size += ALIGN(left, 16) / 16; + } + } + + return seg; +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + struct ib_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(ud_wr(wr)->remote_qkey); +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static u64 get_xlt_octo(u64 bytes) +{ + return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) / + MLX5_IB_UMR_OCTOWORD; +} + +static __be64 frwr_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 sig_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + +static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, + struct mlx5_ib_mr *mr) +{ + int size = mr->ndescs * mr->desc_size; + + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_UMR_CHECK_NOT_FREE; + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); + umr->mkey_mask = frwr_mkey_mask(); +} + +static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) +{ + memset(umr, 0, sizeof(*umr)); + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = MLX5_UMR_INLINE; +} + +static __be64 get_umr_enable_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_disable_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_translation_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_access_mask(int atomic) +{ + u64 result; + + result = MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW; + + if (atomic) + result |= MLX5_MKEY_MASK_A; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_pd_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_PD; + + return cpu_to_be64(result); +} + +static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask) 
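The comment above describes how set_eth_seg() copies the LSO inline header when it would run past the end of the send-queue buffer: copy what fits up to the end, wrap to the start of the queue, then copy the remainder. A minimal standalone sketch of that split-copy idea follows; ring_copy(), ring, ring_size and offset are hypothetical names, and the driver wraps via mlx5_get_send_wqe(qp, 0) rather than an explicit offset.

#include <stddef.h>
#include <string.h>

/* Copy len bytes into a circular buffer starting at offset (offset < ring_size). */
static void ring_copy(void *ring, size_t ring_size, size_t offset,
                      const void *src, size_t len)
{
        size_t room = ring_size - offset;          /* bytes left before the end */
        size_t first = len < room ? len : room;

        /* copy what fits before the end of the buffer ... */
        memcpy((char *)ring + offset, src, first);
        /* ... then wrap to the start and copy whatever is left */
        if (len > first)
                memcpy(ring, (const char *)src + first, len - first);
}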
+{ + if ((mask & MLX5_MKEY_MASK_PAGE_SIZE && + MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) || + (mask & MLX5_MKEY_MASK_A && + MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))) + return -EPERM; + return 0; +} + +static int set_reg_umr_segment(struct mlx5_ib_dev *dev, + struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr, int atomic) +{ + struct mlx5_umr_wr *umrwr = umr_wr(wr); + + memset(umr, 0, sizeof(*umr)); + + if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) + umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */ + else + umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ + + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) { + u64 offset = get_xlt_octo(umrwr->offset); + + umr->xlt_offset = cpu_to_be16(offset & 0xffff); + umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16); + umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; + } + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) + umr->mkey_mask |= get_umr_update_translation_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) { + umr->mkey_mask |= get_umr_update_access_mask(atomic); + umr->mkey_mask |= get_umr_update_pd_mask(); + } + if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR) + umr->mkey_mask |= get_umr_enable_mr_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) + umr->mkey_mask |= get_umr_disable_mr_mask(); + + if (!wr->num_sge) + umr->flags |= MLX5_UMR_INLINE; + + return umr_check_mkey_mask(dev, be64_to_cpu(umr->mkey_mask)); +} + +static u8 get_umr_flags(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? 
MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; +} + +static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, + struct mlx5_ib_mr *mr, + u32 key, int access) +{ + int ndescs = ALIGN(mr->ndescs, 8) >> 1; + + memset(seg, 0, sizeof(*seg)); + + if (mr->access_mode == MLX5_MKC_ACCESS_MODE_MTT) + seg->log2_page_size = ilog2(mr->ibmr.page_size); + else if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + + seg->flags = get_umr_flags(access) | mr->access_mode; + seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(mr->ibmr.iova); + seg->len = cpu_to_be64(mr->ibmr.length); + seg->xlt_oct_size = cpu_to_be32(ndescs); +} + +static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) +{ + memset(seg, 0, sizeof(*seg)); + seg->status = MLX5_MKEY_STATUS_FREE; +} + +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) +{ + struct mlx5_umr_wr *umrwr = umr_wr(wr); + + memset(seg, 0, sizeof(*seg)); + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) + seg->status = MLX5_MKEY_STATUS_FREE; + + seg->flags = convert_access(umrwr->access_flags); + if (umrwr->pd) + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION && + !umrwr->length) + seg->flags_pd |= cpu_to_be32(MLX5_MKEY_LEN64); + + seg->start_addr = cpu_to_be64(umrwr->virt_addr); + seg->len = cpu_to_be64(umrwr->length); + seg->log2_page_size = umrwr->page_shift; + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(umrwr->mkey)); +} + +static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, + struct mlx5_ib_mr *mr, + struct mlx5_ib_pd *pd) +{ + int bcount = mr->desc_size * mr->ndescs; + + dseg->addr = cpu_to_be64(mr->desc_map); + dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); + dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey); +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static u8 calc_sig(void *wqe, int size) +{ + u8 *p = wqe; + u8 res = 0; + int i; + + for (i = 0; i < size; i++) + res ^= p[i]; + + return ~res; +} + +static u8 wq_sig(void *wqe) +{ + return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); +} + +static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx5_wqe_inline_seg *seg; + void *qend = qp->sq.qend; + void *addr; + int inl = 0; + int copy; + int len; + int i; + + seg = wqe; + wqe += sizeof(*seg); + for (i = 0; i < wr->num_sge; i++) { + addr = (void *)(unsigned long)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return -ENOMEM; + + if (unlikely(wqe + len > qend)) { + copy = qend - wqe; + memcpy(wqe, addr, copy); + addr += copy; + len -= copy; + wqe = mlx5_get_send_wqe(qp, 0); + } + memcpy(wqe, addr, len); + wqe += len; + } + + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + + *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16; + + return 0; +} + +static u16 prot_field_size(enum ib_signature_type type) +{ + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} + +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: 
return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, + struct mlx5_bsf_inl *inl) +{ + /* Valid inline section and allow BSF refresh */ + inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | + MLX5_BSF_REFRESH_DIF); + inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); + inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); + /* repeating block */ + inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; + inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? + MLX5_DIF_CRC : MLX5_DIF_IPCS; + + if (domain->sig.dif.ref_remap) + inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; + + if (domain->sig.dif.app_escape) { + if (domain->sig.dif.ref_escape) + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; + else + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; + } + + inl->dif_app_bitmask_check = + cpu_to_be16(domain->sig.dif.apptag_check_mask); +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + + memset(bsf, 0, sizeof(*bsf)); + + /* Basic + Extended + Inline */ + basic->bsf_size_sbs = 1 << 7; + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + basic->raw_data_size = cpu_to_be32(data_size); + + /* Memory domain */ + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); + mlx5_fill_inl_bsf(mem, &bsf->m_inl); + break; + default: + return -EINVAL; + } + + /* Wire domain */ + switch (sig_attrs->wire.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig_type == wire->sig_type) { + /* Same block structure */ + basic->bsf_size_sbs |= 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; + } else + basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); + + basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); + mlx5_fill_inl_bsf(wire, &bsf->w_inl); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int set_sig_data_segment(struct ib_sig_handover_wr *wr, + struct mlx5_ib_qp *qp, void **seg, int *size) +{ + struct ib_sig_attrs *sig_attrs = wr->sig_attrs; + struct ib_mr *sig_mr = wr->sig_mr; + struct mlx5_bsf *bsf; + u32 data_len = wr->wr.sg_list->length; + u32 data_key = wr->wr.sg_list->lkey; + u64 data_va = wr->wr.sg_list->addr; + int ret; + int wqe_size; + + if (!wr->prot || + (data_key == wr->prot->lkey && + data_va == wr->prot->addr && + data_len == wr->prot->length)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. 
+ * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u32 prot_key = wr->prot->lkey; + u64 prot_va = wr->prot->addr; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + pr_err("Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) + return -EINVAL; + + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + struct ib_sig_handover_wr *wr, u32 size, + u32 length, u32 pdn) +{ + struct ib_mr *sig_mr = wr->sig_mr; + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(wr->access_flags) | + MLX5_MKC_ACCESS_MODE_KLMS; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size)); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + u32 size) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + + +static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); + struct mlx5_ib_mr *sig_mr = 
to_mmr(wr->sig_mr); + u32 pdn = get_pd(qp)->pdn; + u32 xlt_size; + int region_len, ret; + + if (unlikely(wr->wr.num_sge != 1) || + unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = wr->wr.sg_list->length; + if (wr->prot && + (wr->prot->lkey != wr->wr.sg_list->lkey || + wr->prot->addr != wr->wr.sg_list->addr || + wr->prot->length != wr->wr.sg_list->length)) + region_len += wr->prot->length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm); + + set_sig_umr_segment(*seg, xlt_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + ret = set_sig_data_segment(wr, qp, seg, size); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + break; + default: + pr_err("Bad signature type (%d) is given.\n", + domain->sig_type); + return -EINVAL; + } + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + + return 0; +} + +static int set_reg_wr(struct mlx5_ib_qp *qp, + struct ib_reg_wr *wr, + void **seg, int *size) +{ + struct mlx5_ib_mr *mr = to_mmr(wr->mr); + struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); + + if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { + mlx5_ib_warn(to_mdev(qp->ibqp.device), + "Invalid IB_SEND_INLINE send flag\n"); + return -EINVAL; + } + + set_reg_umr_seg(*seg, mr); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_reg_mkey_seg(*seg, mr, wr->key, wr->access); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_reg_data_seg(*seg, mr, pd); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + + return 0; +} + +static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size) +{ + set_linv_umr_seg(*seg); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + set_linv_mkey_seg(*seg); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); +} + +static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) +{ + __be32 *p = NULL; + int tidx = idx; + int i, j; 
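set_reg_wr(), set_linv_wr() and set_sig_umr_wr() above all repeat the same bookkeeping by hand: advance the segment pointer, add the segment size in 16-byte units, and wrap back to the start of the send queue when the pointer reaches qp->sq.qend. A small sketch of that cursor pattern, with hypothetical wq_cursor/advance_seg names (not driver helpers):

#include <stddef.h>

struct wq_cursor {
        void *seg;      /* current write position inside the WQE */
        void *qstart;   /* start of the SQ buffer (wrap target) */
        void *qend;     /* one past the last byte of the SQ buffer */
        int   size;     /* accumulated WQE size, in 16-byte units */
};

static void advance_seg(struct wq_cursor *c, size_t seg_bytes)
{
        c->seg = (char *)c->seg + seg_bytes;
        c->size += seg_bytes / 16;
        /*
         * The segments written here are 16-byte multiples that never straddle
         * the end of the buffer, so an equality check is enough to detect the
         * wrap point, just as the driver checks *seg == qp->sq.qend.
         */
        if (c->seg == c->qend)
                c->seg = c->qstart;
}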
+ + pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx)); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + void *buf = mlx5_get_send_wqe(qp, tidx); + tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1); + p = buf; + j = 0; + } + pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), + be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), + be32_to_cpu(p[j + 3])); + } +} + +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq) +{ + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) + return -ENOMEM; + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_get_send_wqe(qp, *idx); + *ctrl = *seg; + *(uint32_t *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (wr->send_flags & IB_SEND_SIGNALED ? + MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? + MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + + return 0; +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + u8 size, unsigned idx, u64 wr_id, + int nreq, u8 fence, u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + if (unlikely(qp->wq_sig)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = mlx5_opcode; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.w_list[idx].next = qp->sq.cur_post; +} + + +int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp *qp; + struct mlx5_ib_mr *mr; + struct mlx5_wqe_data_seg *dpseg; + struct mlx5_wqe_xrc_seg *xrc; + struct mlx5_bf *bf; + int uninitialized_var(size); + void *qend; + unsigned long flags; + unsigned idx; + int err = 0; + int num_sge; + void *seg; + int nreq; + int i; + u8 next_fence = 0; + u8 fence; + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); + + qp = to_mqp(ibqp); + bf = &qp->bf; + qend = qp->sq.qend; + + spin_lock_irqsave(&qp->sq.lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + num_sge = wr->num_sge; + if (unlikely(num_sge > qp->sq.max_gs)) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->opcode == IB_WR_LOCAL_INV || + wr->opcode == IB_WR_REG_MR) { + fence = dev->umr_fence; + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + } else if (wr->send_flags & IB_SEND_FENCE) { + if (qp->next_fence) + fence = MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + fence = MLX5_FENCE_MODE_FENCE; + } else { + fence = qp->next_fence; + } + + switch (ibqp->qp_type) { + case IB_QPT_XRC_INI: + xrc = seg; + seg += sizeof(*xrc); + 
size += sizeof(*xrc) / 16; + /* fall through */ + case IB_QPT_RC: + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -ENOSYS; + *bad_wr = wr; + goto out; + + case IB_WR_LOCAL_INV: + qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; + ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); + set_linv_wr(qp, &seg, &size); + num_sge = 0; + break; + + case IB_WR_REG_MR: + qp->sq.wr_data[idx] = IB_WR_REG_MR; + ctrl->imm = cpu_to_be32(reg_wr(wr)->key); + err = set_reg_wr(qp, reg_wr(wr), &seg, &size); + if (err) { + *bad_wr = wr; + goto out; + } + num_sge = 0; + break; + + case IB_WR_REG_SIG_MR: + qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; + mr = to_mmr(sig_handover_wr(wr)->sig_mr); + + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + err = set_sig_umr_wr(wr, qp, &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_UMR); + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + wr->send_flags &= ~IB_SEND_SIGNALED; + wr->send_flags |= IB_SEND_SOLICITED; + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->mem, + mr->sig->psv_memory.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_SET_PSV); + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire, + mr->sig->psv_wire.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_SET_PSV); + qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + num_sge = 0; + goto skip_psv; + + default: + break; + } + break; + + case IB_QPT_UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + default: + break; + } + break; + + case IB_QPT_SMI: + if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) { + mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n"); + err = -EPERM; + *bad_wr = wr; + goto out; + } + /* fall through */ + case MLX5_IB_QPT_HW_GSI: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + case IB_QPT_UD: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + + /* handle qp that supports ud offload */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { + struct mlx5_wqe_eth_pad *pad; + + pad = seg; + memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); 
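A worked example of the arithmetic that begin_wqe() and finish_wqe() rely on may help here: the WQE size is accumulated in 16-byte units and written into qpn_ds, the slot index is the running cur_post counter masked by the power-of-two wqe_cnt, and cur_post then advances by the number of 64-byte basic blocks the WQE occupies. This is a standalone sketch, not driver code; WQE_BB stands in for MLX5_SEND_WQE_BB.

#include <stdio.h>

#define WQE_BB 64       /* stand-in for MLX5_SEND_WQE_BB (64-byte basic block) */

int main(void)
{
        unsigned int wqe_cnt = 128;     /* send queue depth, power of two */
        unsigned int cur_post = 130;    /* running post counter, never masked */
        unsigned int size16 = 3;        /* ctrl + raddr + one data seg = 48 bytes */

        unsigned int idx = cur_post & (wqe_cnt - 1);              /* slot 2 */
        unsigned int bb = (size16 * 16 + WQE_BB - 1) / WQE_BB;    /* DIV_ROUND_UP -> 1 */

        printf("idx=%u, WQE occupies %u basic block(s), next cur_post=%u\n",
               idx, bb, cur_post + bb);
        return 0;
}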
+ seg += sizeof(struct mlx5_wqe_eth_pad); + size += sizeof(struct mlx5_wqe_eth_pad) / 16; + + seg = set_eth_seg(seg, wr, qend, qp, &size); + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + } + break; + case MLX5_IB_QPT_REG_UMR: + if (wr->opcode != MLX5_IB_WR_UMR) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode\n"); + goto out; + } + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + ctrl->imm = cpu_to_be32(umr_wr(wr)->mkey); + err = set_reg_umr_segment(dev, seg, wr, !!(MLX5_CAP_GEN(mdev, atomic))); + if (unlikely(err)) + goto out; + seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + set_reg_mkey_segment(seg, wr); + seg += sizeof(struct mlx5_mkey_seg); + size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + + default: + break; + } + + if (wr->send_flags & IB_SEND_INLINE && num_sge) { + int uninitialized_var(sz); + + err = set_data_inl_seg(qp, wr, seg, &sz); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + size += sz; + } else { + dpseg = seg; + for (i = 0; i < num_sge; i++) { + if (unlikely(dpseg == qend)) { + seg = mlx5_get_send_wqe(qp, 0); + dpseg = seg; + } + if (likely(wr->sg_list[i].length)) { + set_data_ptr_seg(dpseg, wr->sg_list + i); + size += sizeof(struct mlx5_wqe_data_seg) / 16; + dpseg++; + } + } + } + + qp->next_fence = next_fence; + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, fence, + mlx5_ib_opcode[wr->opcode]); +skip_psv: + if (0) + dump_wqe(qp, idx, size); + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell */ + wmb(); + + /* currently we support only regular doorbells */ + mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset, NULL); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. 
+ */ + mmiowb(); + bf->offset ^= bf->buf_size; + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) +{ + sig->signature = calc_sig(sig, size); +} + +int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + struct mlx5_rwqe_sig *sig; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); + + spin_lock_irqsave(&qp->rq.lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + if (qp->wq_sig) + scat++; + + for (i = 0; i < wr->num_sge; i++) + set_data_ptr_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + + if (qp->wq_sig) { + sig = (struct mlx5_rwqe_sig *)scat; + set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) +{ + switch (mlx5_state) { + case MLX5_QP_STATE_RST: return IB_QPS_RESET; + case MLX5_QP_STATE_INIT: return IB_QPS_INIT; + case MLX5_QP_STATE_RTR: return IB_QPS_RTR; + case MLX5_QP_STATE_RTS: return IB_QPS_RTS; + case MLX5_QP_STATE_SQ_DRAINING: + case MLX5_QP_STATE_SQD: return IB_QPS_SQD; + case MLX5_QP_STATE_SQER: return IB_QPS_SQE; + case MLX5_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) +{ + switch (mlx5_mig_state) { + case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX5_QP_PM_REARM: return IB_MIG_REARM; + case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx5_flags) +{ + int ib_flags = 0; + + if (mlx5_flags & MLX5_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx5_flags & MLX5_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx5_flags & MLX5_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev, + struct rdma_ah_attr *ah_attr, + struct mlx5_qp_path *path) +{ + + memset(ah_attr, 0, sizeof(*ah_attr)); + + if (!path->port || path->port > ibdev->num_ports) + return; + + ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, path->port); + + rdma_ah_set_port_num(ah_attr, path->port); + rdma_ah_set_sl(ah_attr, path->dci_cfi_prio_sl & 0xf); + + rdma_ah_set_dlid(ah_attr, be16_to_cpu(path->rlid)); + rdma_ah_set_path_bits(ah_attr, path->grh_mlid & 0x7f); + rdma_ah_set_static_rate(ah_attr, + path->static_rate ? 
path->static_rate - 5 : 0); + if (path->grh_mlid & (1 << 7)) { + u32 tc_fl = be32_to_cpu(path->tclass_flowlabel); + + rdma_ah_set_grh(ah_attr, NULL, + tc_fl & 0xfffff, + path->mgid_index, + path->hop_limit, + (tc_fl >> 20) & 0xff); + rdma_ah_set_dgid_raw(ah_attr, path->rgid); + } +} + +static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, + u8 *sq_state) +{ + int err; + + err = mlx5_core_query_sq_state(dev->mdev, sq->base.mqp.qpn, sq_state); + if (err) + goto out; + sq->state = *sq_state; + +out: + return err; +} + +static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, + u8 *rq_state) +{ + void *out; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_rq_out); + out = kvzalloc(inlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out); + if (err) + goto out; + + rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); + *rq_state = MLX5_GET(rqc, rqc, state); + rq->state = *rq_state; + +out: + kvfree(out); + return err; +} + +static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state, + struct mlx5_ib_qp *qp, u8 *qp_state) +{ + static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = { + [MLX5_RQC_STATE_RST] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE_BAD, + [MLX5_SQ_STATE_NA] = IB_QPS_RESET, + }, + [MLX5_RQC_STATE_RDY] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = IB_QPS_SQE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE, + }, + [MLX5_RQC_STATE_ERR] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = IB_QPS_ERR, + [MLX5_SQ_STATE_NA] = IB_QPS_ERR, + }, + [MLX5_RQ_STATE_NA] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE_BAD, + }, + }; + + *qp_state = sqrq_trans[rq_state][sq_state]; + + if (*qp_state == MLX5_QP_STATE_BAD) { + WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x", + qp->raw_packet_qp.sq.base.mqp.qpn, sq_state, + qp->raw_packet_qp.rq.base.mqp.qpn, rq_state); + return -EINVAL; + } + + if (*qp_state == MLX5_QP_STATE) + *qp_state = qp->state; + + return 0; +} + +static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + u8 *raw_packet_qp_state) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + int err; + u8 sq_state = MLX5_SQ_STATE_NA; + u8 rq_state = MLX5_RQ_STATE_NA; + + if (qp->sq.wqe_cnt) { + err = query_raw_packet_qp_sq_state(dev, sq, &sq_state); + if (err) + return err; + } + + if (qp->rq.wqe_cnt) { + err = query_raw_packet_qp_rq_state(dev, rq, &rq_state); + if (err) + return err; + } + + return sqrq_state_to_qp_state(sq_state, rq_state, qp, + raw_packet_qp_state); +} + +static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_attr *qp_attr) +{ + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); + struct mlx5_qp_context *context; + int mlx5_state; + u32 *outb; + int err = 0; + + outb = kzalloc(outlen, GFP_KERNEL); + if (!outb) + return -ENOMEM; + + err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb, + outlen); + if (err) + goto out; + + /* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */ + 
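sqrq_state_to_qp_state() above derives the Raw Packet QP state from the separate SQ and RQ states with a two-dimensional lookup table whose entries are either a concrete state, a "keep the current state" marker, or a "bad combination" sentinel that triggers -EINVAL. The sketch below shows the same table-driven technique with illustrative enum names and values that are not the driver's:

enum sub_state { SUB_RST, SUB_RDY, SUB_ERR, SUB_NA, SUB_NUM };
enum combined  { QP_RESET, QP_READY, QP_ERROR, QP_KEEP, QP_BAD };

static const unsigned char combine[SUB_NUM][SUB_NUM] = {
        /*            sq: RST      RDY      ERR       NA      */
        [SUB_RST] = { QP_RESET, QP_BAD,  QP_BAD,   QP_RESET },
        [SUB_RDY] = { QP_BAD,   QP_KEEP, QP_ERROR, QP_KEEP  },
        [SUB_ERR] = { QP_BAD,   QP_BAD,  QP_ERROR, QP_ERROR },
        [SUB_NA]  = { QP_RESET, QP_KEEP, QP_KEEP,  QP_BAD   },
};

/* Returns -1 for an impossible SQ/RQ pairing, mirroring the -EINVAL path above. */
static int combine_states(enum sub_state rq, enum sub_state sq, int cur)
{
        unsigned char v = combine[rq][sq];

        if (v == QP_BAD)
                return -1;
        return v == QP_KEEP ? cur : v;
}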
context = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, outb, qpc); + + mlx5_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mlx5_state); + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_rdma_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = + be16_to_cpu(context->alt_path.pkey_index); + qp_attr->alt_port_num = + rdma_ah_get_port_num(&qp_attr->alt_ah_attr); + } + + qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); + qp_attr->port_num = context->pri_path.port; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto_lt >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; + qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + +out: + kfree(outb); + return err; +} + +static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_core_dct *dct = &mqp->dct.mdct; + u32 *out; + u32 access_flags = 0; + int outlen = MLX5_ST_SZ_BYTES(query_dct_out); + void *dctc; + int err; + int supported_mask = IB_QP_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_PORT | + IB_QP_MIN_RNR_TIMER | + IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_PKEY_INDEX; + + if (qp_attr_mask & ~supported_mask) + return -EINVAL; + if (mqp->state != IB_QPS_RTR) + return -EINVAL; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_dct_query(dev->mdev, dct, out, outlen); + if (err) + goto out; + + dctc = MLX5_ADDR_OF(query_dct_out, out, dct_context_entry); + + if (qp_attr_mask & IB_QP_STATE) + qp_attr->qp_state = IB_QPS_RTR; + + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + if (MLX5_GET(dctc, dctc, rre)) + access_flags |= IB_ACCESS_REMOTE_READ; + if (MLX5_GET(dctc, dctc, rwe)) + access_flags |= IB_ACCESS_REMOTE_WRITE; + if (MLX5_GET(dctc, dctc, rae)) + access_flags |= IB_ACCESS_REMOTE_ATOMIC; + qp_attr->qp_access_flags = access_flags; + } + + if (qp_attr_mask & IB_QP_PORT) + qp_attr->port_num = MLX5_GET(dctc, dctc, port); + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) + qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak); + if (qp_attr_mask & IB_QP_AV) { + qp_attr->ah_attr.grh.traffic_class = MLX5_GET(dctc, dctc, tclass); + qp_attr->ah_attr.grh.flow_label = MLX5_GET(dctc, dctc, flow_label); + qp_attr->ah_attr.grh.sgid_index = MLX5_GET(dctc, dctc, my_addr_index); + qp_attr->ah_attr.grh.hop_limit = MLX5_GET(dctc, dctc, hop_limit); + } + if (qp_attr_mask & IB_QP_PATH_MTU) + 
qp_attr->path_mtu = MLX5_GET(dctc, dctc, mtu); + if (qp_attr_mask & IB_QP_PKEY_INDEX) + qp_attr->pkey_index = MLX5_GET(dctc, dctc, pkey_index); +out: + kfree(out); + return err; +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + int err = 0; + u8 raw_packet_qp_state; + + if (ibqp->rwq_ind_tbl) + return -ENOSYS; + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, + qp_init_attr); + + /* Not all of output fields are applicable, make sure to zero them */ + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + memset(qp_attr, 0, sizeof(*qp_attr)); + + if (unlikely(qp->qp_sub_type == MLX5_IB_QPT_DCT)) + return mlx5_ib_dct_query_qp(dev, qp, qp_attr, + qp_attr_mask, qp_init_attr); + + mutex_lock(&qp->mutex); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + qp->flags & MLX5_IB_QP_UNDERLAY) { + err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state); + if (err) + goto out; + qp->state = raw_packet_qp_state; + qp_attr->port_num = 1; + } else { + err = query_qp_attr(dev, qp, qp_attr); + if (err) + goto out; + } + + qp_attr->qp_state = qp->state; + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.max_post; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + qp_init_attr->qp_context = ibqp->qp_context; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->srq = ibqp->srq; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + qp_init_attr->create_flags |= mlx5_ib_create_qp_sqpn_qp1(); + + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? 
+ IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out: + mutex_unlock(&qp->mutex); + return err; +} + +struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_xrcd *xrcd; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); + if (err) { + kfree(xrcd); + return ERR_PTR(-ENOMEM); + } + + return &xrcd->ibxrcd; +} + +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct mlx5_ib_dev *dev = to_mdev(xrcd->device); + u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + int err; + + err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); + if (err) + mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); + + kfree(xrcd); + return 0; +} + +static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type) +{ + struct mlx5_ib_rwq *rwq = to_mibrwq(core_qp); + struct mlx5_ib_dev *dev = to_mdev(rwq->ibwq.device); + struct ib_event event; + + if (rwq->ibwq.event_handler) { + event.device = rwq->ibwq.device; + event.element.wq = &rwq->ibwq; + switch (type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_WQ_FATAL; + break; + default: + mlx5_ib_warn(dev, "Unexpected event type %d on WQ %06x\n", type, core_qp->qpn); + return; + } + + rwq->ibwq.event_handler(&event, rwq->ibwq.wq_context); + } +} + +static int set_delay_drop(struct mlx5_ib_dev *dev) +{ + int err = 0; + + mutex_lock(&dev->delay_drop.lock); + if (dev->delay_drop.activate) + goto out; + + err = mlx5_core_set_delay_drop(dev->mdev, dev->delay_drop.timeout); + if (err) + goto out; + + dev->delay_drop.activate = true; +out: + mutex_unlock(&dev->delay_drop.lock); + + if (!err) + atomic_inc(&dev->delay_drop.rqs_cnt); + return err; +} + +static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, + struct ib_wq_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev; + int has_net_offloads; + __be64 *rq_pas0; + void *in; + void *rqc; + void *wq; + int inlen; + int err; + + dev = to_mdev(pd->device); + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + MLX5_SET(rqc, rqc, mem_rq_type, + MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, user_index, rwq->user_index); + MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, + rwq->create_flags & MLX5_IB_WQ_FLAGS_STRIDING_RQ ? 
+ MLX5_WQ_TYPE_CYCLIC_STRIDING_RQ : MLX5_WQ_TYPE_CYCLIC); + if (init_attr->create_flags & IB_WQ_FLAGS_PCI_WRITE_END_PADDING) { + if (!MLX5_CAP_GEN(dev->mdev, end_pad)) { + mlx5_ib_dbg(dev, "Scatter end padding is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } else { + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + } + } + MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride); + if (rwq->create_flags & MLX5_IB_WQ_FLAGS_STRIDING_RQ) { + MLX5_SET(wq, wq, two_byte_shift_en, rwq->two_byte_shift_en); + MLX5_SET(wq, wq, log_wqe_stride_size, + rwq->single_stride_log_num_of_bytes - + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES); + MLX5_SET(wq, wq, log_wqe_num_of_strides, rwq->log_num_strides - + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES); + } + MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size); + MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn); + MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset); + MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size); + MLX5_SET(wq, wq, wq_signature, rwq->wq_sig); + MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma); + has_net_offloads = MLX5_CAP_GEN(dev->mdev, eth_net_offloads); + if (init_attr->create_flags & IB_WQ_FLAGS_CVLAN_STRIPPING) { + if (!(has_net_offloads && MLX5_CAP_ETH(dev->mdev, vlan_cap))) { + mlx5_ib_dbg(dev, "VLAN offloads are not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + } else { + MLX5_SET(rqc, rqc, vsd, 1); + } + if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS) { + if (!(has_net_offloads && MLX5_CAP_ETH(dev->mdev, scatter_fcs))) { + mlx5_ib_dbg(dev, "Scatter FCS is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET(rqc, rqc, scatter_fcs, 1); + } + if (init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + if (!(dev->ib_dev.attrs.raw_packet_caps & + IB_RAW_PACKET_CAP_DELAY_DROP)) { + mlx5_ib_dbg(dev, "Delay drop is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET(rqc, rqc, delay_drop_en, 1); + } + rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); + err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp); + if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + err = set_delay_drop(dev); + if (err) { + mlx5_ib_warn(dev, "Failed to enable delay drop err=%d\n", + err); + mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + } else { + rwq->create_flags |= MLX5_IB_WQ_FLAGS_DELAY_DROP; + } + } +out: + kvfree(in); + return err; +} + +static int set_user_rq_size(struct mlx5_ib_dev *dev, + struct ib_wq_init_attr *wq_init_attr, + struct mlx5_ib_create_wq *ucmd, + struct mlx5_ib_rwq *rwq) +{ + /* Sanity check RQ size before proceeding */ + if (wq_init_attr->max_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_wq_sz))) + return -EINVAL; + + if (!ucmd->rq_wqe_count) + return -EINVAL; + + rwq->wqe_count = ucmd->rq_wqe_count; + rwq->wqe_shift = ucmd->rq_wqe_shift; + rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift); + rwq->log_rq_stride = rwq->wqe_shift; + rwq->log_rq_size = ilog2(rwq->wqe_count); + return 0; +} + +static int prepare_user_rq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata, + struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_wq ucmd = {}; + int err; + size_t required_cmd_sz; + + required_cmd_sz = offsetof(typeof(ucmd), single_stride_log_num_of_bytes) + + sizeof(ucmd.single_stride_log_num_of_bytes); + if (udata->inlen < required_cmd_sz) { + mlx5_ib_dbg(dev, "invalid inlen\n"); + return -EINVAL; + } + + if (udata->inlen > sizeof(ucmd) 
&& + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + mlx5_ib_dbg(dev, "inlen is not supported\n"); + return -EOPNOTSUPP; + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmd.comp_mask & (~MLX5_IB_CREATE_WQ_STRIDING_RQ)) { + mlx5_ib_dbg(dev, "invalid comp mask\n"); + return -EOPNOTSUPP; + } else if (ucmd.comp_mask & MLX5_IB_CREATE_WQ_STRIDING_RQ) { + if (!MLX5_CAP_GEN(dev->mdev, striding_rq)) { + mlx5_ib_dbg(dev, "Striding RQ is not supported\n"); + return -EOPNOTSUPP; + } + if ((ucmd.single_stride_log_num_of_bytes < + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES) || + (ucmd.single_stride_log_num_of_bytes > + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES)) { + mlx5_ib_dbg(dev, "Invalid log stride size (%u. Range is %u - %u)\n", + ucmd.single_stride_log_num_of_bytes, + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES, + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES); + return -EINVAL; + } + if ((ucmd.single_wqe_log_num_of_strides > + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES) || + (ucmd.single_wqe_log_num_of_strides < + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES)) { + mlx5_ib_dbg(dev, "Invalid log num strides (%u. Range is %u - %u)\n", + ucmd.single_wqe_log_num_of_strides, + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES, + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES); + return -EINVAL; + } + rwq->single_stride_log_num_of_bytes = + ucmd.single_stride_log_num_of_bytes; + rwq->log_num_strides = ucmd.single_wqe_log_num_of_strides; + rwq->two_byte_shift_en = !!ucmd.two_byte_shift_en; + rwq->create_flags |= MLX5_IB_WQ_FLAGS_STRIDING_RQ; + } + + err = set_user_rq_size(dev, init_attr, &ucmd, rwq); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + err = create_user_rq(dev, pd, rwq, &ucmd); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + if (err) + return err; + } + + rwq->user_index = ucmd.user_index; + return 0; +} + +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_rwq *rwq; + struct mlx5_ib_create_wq_resp resp = {}; + size_t min_resp_len; + int err; + + if (!udata) + return ERR_PTR(-ENOSYS); + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + dev = to_mdev(pd->device); + switch (init_attr->wq_type) { + case IB_WQT_RQ: + rwq = kzalloc(sizeof(*rwq), GFP_KERNEL); + if (!rwq) + return ERR_PTR(-ENOMEM); + err = prepare_user_rq(pd, init_attr, udata, rwq); + if (err) + goto err; + err = create_rq(rwq, pd, init_attr); + if (err) + goto err_user_rq; + break; + default: + mlx5_ib_dbg(dev, "unsupported wq type %d\n", + init_attr->wq_type); + return ERR_PTR(-EINVAL); + } + + rwq->ibwq.wq_num = rwq->core_qp.qpn; + rwq->ibwq.state = IB_WQS_RESET; + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err_copy; + } + + rwq->core_qp.event = mlx5_ib_wq_event; + rwq->ibwq.event_handler = init_attr->event_handler; + return &rwq->ibwq; + +err_copy: + mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); +err_user_rq: + destroy_user_rq(dev, pd, rwq); +err: + kfree(rwq); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_wq(struct ib_wq *wq) +{ + struct mlx5_ib_dev *dev = to_mdev(wq->device); + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + + 
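/* Destroy the HW RQ object first, then release the user buffers that back it. */ +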
mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + destroy_user_rq(dev, wq->pd, rwq); + kfree(rwq); + + return 0; +} + +struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl; + int sz = 1 << init_attr->log_ind_tbl_size; + struct mlx5_ib_create_rwq_ind_tbl_resp resp = {}; + size_t min_resp_len; + int inlen; + int err; + int i; + u32 *in; + void *rqtc; + + if (udata->inlen > 0 && + !ib_is_udata_cleared(udata, 0, + udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + + if (init_attr->log_ind_tbl_size > + MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)) { + mlx5_ib_dbg(dev, "log_ind_tbl_size = %d is bigger than supported = %d\n", + init_attr->log_ind_tbl_size, + MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)); + return ERR_PTR(-EINVAL); + } + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL); + if (!rwq_ind_tbl) + return ERR_PTR(-ENOMEM); + + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err; + } + + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + MLX5_SET(rqtc, rqtc, rqt_max_size, sz); + + for (i = 0; i < sz; i++) + MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num); + + err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn); + kvfree(in); + + if (err) + goto err; + + rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn; + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err_copy; + } + + return &rwq_ind_tbl->ib_rwq_ind_tbl; + +err_copy: + mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); +err: + kfree(rwq_ind_tbl); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl); + struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device); + + mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); + + kfree(rwq_ind_tbl); + return 0; +} + +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(wq->device); + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + struct mlx5_ib_modify_wq ucmd = {}; + size_t required_cmd_sz; + int curr_wq_state; + int wq_state; + int inlen; + int err; + void *rqc; + void *in; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) + return -EFAULT; + + if (ucmd.comp_mask || ucmd.reserved) + return -EOPNOTSUPP; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + curr_wq_state = (wq_attr_mask & IB_WQ_CUR_STATE) ? + wq_attr->curr_wq_state : wq->state; + wq_state = (wq_attr_mask & IB_WQ_STATE) ? 
+ wq_attr->wq_state : curr_wq_state; + if (curr_wq_state == IB_WQS_ERR) + curr_wq_state = MLX5_RQC_STATE_ERR; + if (wq_state == IB_WQS_ERR) + wq_state = MLX5_RQC_STATE_ERR; + MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); + MLX5_SET(rqc, rqc, state, wq_state); + + if (wq_attr_mask & IB_WQ_FLAGS) { + if (wq_attr->flags_mask & IB_WQ_FLAGS_CVLAN_STRIPPING) { + if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, vlan_cap))) { + mlx5_ib_dbg(dev, "VLAN offloads are not " + "supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD); + MLX5_SET(rqc, rqc, vsd, + (wq_attr->flags & IB_WQ_FLAGS_CVLAN_STRIPPING) ? 0 : 1); + } + + if (wq_attr->flags_mask & IB_WQ_FLAGS_PCI_WRITE_END_PADDING) { + mlx5_ib_dbg(dev, "Modifying scatter end padding is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + } + + if (curr_wq_state == IB_WQS_RESET && wq_state == IB_WQS_RDY) { + if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); + MLX5_SET(rqc, rqc, counter_set_id, + dev->port->cnts.set_id); + } else + pr_info_once("%s: Receive WQ counters are not supported on current FW\n", + dev->ib_dev.name); + } + + err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen); + if (!err) + rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state; + +out: + kvfree(in); + return err; +} diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c new file mode 100644 index 0000000..3c7522d --- /dev/null +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "mlx5_ib.h" + +/* not supported currently */ +static int srq_signature; + +static void *get_wqe(struct mlx5_ib_srq *srq, int n) +{ + return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift); +} + +static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n", + type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, + struct mlx5_srq_attr *in, + struct ib_udata *udata, int buf_size) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_srq ucmd = {}; + size_t ucmdlen; + int err; + int npages; + int page_shift; + int ncont; + u32 offset; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + + ucmdlen = min(udata->inlen, sizeof(ucmd)); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_dbg(dev, "failed copy udata\n"); + return -EFAULT; + } + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EINVAL; + + if (in->type != IB_SRQT_BASIC) { + err = get_srq_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + } + + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); + + srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, + 0, 0); + if (IS_ERR(srq->umem)) { + mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); + err = PTR_ERR(srq->umem); + return err; + } + + mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, 0, &npages, + &page_shift, &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, + &offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + in->pas = kvzalloc(sizeof(*in->pas) * ncont, GFP_KERNEL); + if (!in->pas) { + err = -ENOMEM; + goto err_umem; + } + + mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0); + + err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), + ucmd.db_addr, &srq->db); + if (err) { + mlx5_ib_dbg(dev, "map doorbell failed\n"); + goto err_in; + } + + in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->page_offset = offset; + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && + in->type != IB_SRQT_BASIC) + in->user_index = uidx; + + return 0; + +err_in: + kvfree(in->pas); + +err_umem: + ib_umem_release(srq->umem); + + return err; +} + +static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, + struct mlx5_srq_attr *in, int buf_size) +{ + int err; + int i; + struct mlx5_wqe_srq_next_seg *next; + + err = mlx5_db_alloc(dev->mdev, &srq->db); + if (err) { + mlx5_ib_warn(dev, "alloc dbell rec failed\n"); + return err; + } + + if (mlx5_buf_alloc(dev->mdev, buf_size, &srq->buf)) { + mlx5_ib_dbg(dev, "buf alloc failed\n"); + err = -ENOMEM; + goto err_db; + } + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; i++) { + next = get_wqe(srq, i); + next->next_wqe_index = + 
cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + } + + mlx5_ib_dbg(dev, "srq->buf.page_shift = %d\n", srq->buf.page_shift); + in->pas = kvzalloc(sizeof(*in->pas) * srq->buf.npages, GFP_KERNEL); + if (!in->pas) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_array(&srq->buf, in->pas); + + srq->wrid = kvmalloc_array(srq->msrq.max, sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_in; + } + srq->wq_sig = !!srq_signature; + + in->log_page_size = srq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && + in->type != IB_SRQT_BASIC) + in->user_index = MLX5_IB_DEFAULT_UIDX; + + return 0; + +err_in: + kvfree(in->pas); + +err_buf: + mlx5_buf_free(dev->mdev, &srq->buf); + +err_db: + mlx5_db_free(dev->mdev, &srq->db); + return err; +} + +static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq) +{ + mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + ib_umem_release(srq->umem); +} + + +static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq) +{ + kvfree(srq->wrid); + mlx5_buf_free(dev->mdev, &srq->buf); + mlx5_db_free(dev->mdev, &srq->db); +} + +struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_srq *srq; + size_t desc_size; + size_t buf_size; + int err; + struct mlx5_srq_attr in = {0}; + __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= max_srq_wqes) { + mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", + init_attr->attr.max_wr, + max_srq_wqes); + return ERR_PTR(-EINVAL); + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + if (desc_size == 0 || srq->msrq.max_gs > desc_size) + return ERR_PTR(-EINVAL); + desc_size = roundup_pow_of_two(desc_size); + desc_size = max_t(size_t, 32, desc_size); + if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg)) + return ERR_PTR(-EINVAL); + srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) / + sizeof(struct mlx5_wqe_data_seg); + srq->msrq.wqe_shift = ilog2(desc_size); + buf_size = srq->msrq.max * desc_size; + if (buf_size < desc_size) + return ERR_PTR(-EINVAL); + in.type = init_attr->srq_type; + + if (pd->uobject) + err = create_srq_user(pd, srq, &in, udata, buf_size); + else + err = create_srq_kernel(dev, srq, &in, buf_size); + + if (err) { + mlx5_ib_warn(dev, "create srq %s failed, err %d\n", + pd->uobject ? 
"user" : "kernel", err); + goto err_srq; + } + + in.log_size = ilog2(srq->msrq.max); + in.wqe_shift = srq->msrq.wqe_shift - 4; + if (srq->wq_sig) + in.flags |= MLX5_SRQ_FLAG_WQ_SIG; + + if (init_attr->srq_type == IB_SRQT_XRC) + in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; + else + in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn; + + if (init_attr->srq_type == IB_SRQT_TM) { + in.tm_log_list_size = + ilog2(init_attr->ext.tag_matching.max_num_tags) + 1; + if (in.tm_log_list_size > + MLX5_CAP_GEN(dev->mdev, log_tag_matching_list_sz)) { + mlx5_ib_dbg(dev, "TM SRQ max_num_tags exceeding limit\n"); + err = -EINVAL; + goto err_usr_kern_srq; + } + in.flags |= MLX5_SRQ_FLAG_RNDV; + } + + if (ib_srq_has_cq(init_attr->srq_type)) + in.cqn = to_mcq(init_attr->ext.cq)->mcq.cqn; + else + in.cqn = to_mcq(dev->devr.c0)->mcq.cqn; + + in.pd = to_mpd(pd)->pdn; + in.db_record = srq->db.dma; + err = mlx5_core_create_srq(dev->mdev, &srq->msrq, &in); + kvfree(in.pas); + if (err) { + mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); + goto err_usr_kern_srq; + } + + mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); + + srq->msrq.event = mlx5_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (pd->uobject) + if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) { + mlx5_ib_dbg(dev, "copy to user failed\n"); + err = -EFAULT; + goto err_core; + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return &srq->ibsrq; + +err_core: + mlx5_core_destroy_srq(dev->mdev, &srq->msrq); + +err_usr_kern_srq: + if (pd->uobject) + destroy_srq_user(pd, srq); + else + destroy_srq_kernel(dev, srq); + +err_srq: + kfree(srq); + + return ERR_PTR(err); +} + +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs yet */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + struct mlx5_srq_attr *out; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out); + if (ret) + goto out_box; + + srq_attr->srq_limit = out->lwm; + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + +out_box: + kfree(out); + return ret; +} + +int mlx5_ib_destroy_srq(struct ib_srq *srq) +{ + struct mlx5_ib_dev *dev = to_mdev(srq->device); + struct mlx5_ib_srq *msrq = to_msrq(srq); + + mlx5_core_destroy_srq(dev->mdev, &msrq->msrq); + + if (srq->uobject) { + mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); + ib_umem_release(msrq->umem); + } else { + destroy_srq_kernel(dev, msrq); + } + + kfree(srq); + return 0; +} + +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index) +{ + struct mlx5_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. 
*/ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + goto out; + } + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx5_wqe_data_seg *)(next + 1); + + for (i = 0; i < wr->num_sge; i++) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_avail_gather) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } +out: + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/drivers/infiniband/hw/mthca/Kconfig b/drivers/infiniband/hw/mthca/Kconfig new file mode 100644 index 0000000..da314c3 --- /dev/null +++ b/drivers/infiniband/hw/mthca/Kconfig @@ -0,0 +1,17 @@ +config INFINIBAND_MTHCA + tristate "Mellanox HCA support" + depends on PCI + ---help--- + This is a low-level driver for Mellanox InfiniHost host + channel adapters (HCAs), including the MT23108 PCI-X HCA + ("Tavor") and the MT25208 PCI Express HCA ("Arbel"). + +config INFINIBAND_MTHCA_DEBUG + bool "Verbose debugging output" if EXPERT + depends on INFINIBAND_MTHCA + default y + ---help--- + This option causes debugging code to be compiled into the + mthca driver. The output can be turned on via the + debug_level module parameter (which can also be set after + the driver is loaded through sysfs). diff --git a/drivers/infiniband/hw/mthca/Makefile b/drivers/infiniband/hw/mthca/Makefile new file mode 100644 index 0000000..3a09e9f --- /dev/null +++ b/drivers/infiniband/hw/mthca/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o + +ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \ + mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \ + mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \ + mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o \ + mthca_catas.o diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c new file mode 100644 index 0000000..b4e0cf4 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_allocator.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "mthca_dev.h" + +/* Trivial bitmap-based allocator */ +u32 mthca_alloc(struct mthca_alloc *alloc) +{ + unsigned long flags; + u32 obj; + + spin_lock_irqsave(&alloc->lock, flags); + + obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last); + if (obj >= alloc->max) { + alloc->top = (alloc->top + alloc->max) & alloc->mask; + obj = find_first_zero_bit(alloc->table, alloc->max); + } + + if (obj < alloc->max) { + set_bit(obj, alloc->table); + obj |= alloc->top; + } else + obj = -1; + + spin_unlock_irqrestore(&alloc->lock, flags); + + return obj; +} + +void mthca_free(struct mthca_alloc *alloc, u32 obj) +{ + unsigned long flags; + + obj &= alloc->max - 1; + + spin_lock_irqsave(&alloc->lock, flags); + + clear_bit(obj, alloc->table); + alloc->last = min(alloc->last, obj); + alloc->top = (alloc->top + alloc->max) & alloc->mask; + + spin_unlock_irqrestore(&alloc->lock, flags); +} + +int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask, + u32 reserved) +{ + int i; + + /* num must be a power of 2 */ + if (num != 1 << (ffs(num) - 1)) + return -EINVAL; + + alloc->last = 0; + alloc->top = 0; + alloc->max = num; + alloc->mask = mask; + spin_lock_init(&alloc->lock); + alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof (long), + GFP_KERNEL); + if (!alloc->table) + return -ENOMEM; + + bitmap_zero(alloc->table, num); + for (i = 0; i < reserved; ++i) + set_bit(i, alloc->table); + + return 0; +} + +void mthca_alloc_cleanup(struct mthca_alloc *alloc) +{ + kfree(alloc->table); +} + +/* + * Array of pointers with lazy allocation of leaf pages. Callers of + * _get, _set and _clear methods must use a lock or otherwise + * serialize access to the array. + */ + +#define MTHCA_ARRAY_MASK (PAGE_SIZE / sizeof (void *) - 1) + +void *mthca_array_get(struct mthca_array *array, int index) +{ + int p = (index * sizeof (void *)) >> PAGE_SHIFT; + + if (array->page_list[p].page) + return array->page_list[p].page[index & MTHCA_ARRAY_MASK]; + else + return NULL; +} + +int mthca_array_set(struct mthca_array *array, int index, void *value) +{ + int p = (index * sizeof (void *)) >> PAGE_SHIFT; + + /* Allocate with GFP_ATOMIC because we'll be called with locks held. 
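The leaf page is freed again in mthca_array_clear() once its last entry is removed.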
*/ + if (!array->page_list[p].page) + array->page_list[p].page = (void **) get_zeroed_page(GFP_ATOMIC); + + if (!array->page_list[p].page) + return -ENOMEM; + + array->page_list[p].page[index & MTHCA_ARRAY_MASK] = value; + ++array->page_list[p].used; + + return 0; +} + +void mthca_array_clear(struct mthca_array *array, int index) +{ + int p = (index * sizeof (void *)) >> PAGE_SHIFT; + + if (--array->page_list[p].used == 0) { + free_page((unsigned long) array->page_list[p].page); + array->page_list[p].page = NULL; + } else + array->page_list[p].page[index & MTHCA_ARRAY_MASK] = NULL; + + if (array->page_list[p].used < 0) + pr_debug("Array %p index %d page %d with ref count %d < 0\n", + array, index, p, array->page_list[p].used); +} + +int mthca_array_init(struct mthca_array *array, int nent) +{ + int npage = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; + int i; + + array->page_list = kmalloc(npage * sizeof *array->page_list, GFP_KERNEL); + if (!array->page_list) + return -ENOMEM; + + for (i = 0; i < npage; ++i) { + array->page_list[i].page = NULL; + array->page_list[i].used = 0; + } + + return 0; +} + +void mthca_array_cleanup(struct mthca_array *array, int nent) +{ + int i; + + for (i = 0; i < (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; ++i) + free_page((unsigned long) array->page_list[i].page); + + kfree(array->page_list); +} + +/* + * Handling for queue buffers -- we allocate a bunch of memory and + * register it in a memory region at HCA virtual address 0. If the + * requested size is > max_direct, we split the allocation into + * multiple pages, so we don't require too much contiguous memory. + */ + +int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct, + union mthca_buf *buf, int *is_direct, struct mthca_pd *pd, + int hca_write, struct mthca_mr *mr) +{ + int err = -ENOMEM; + int npages, shift; + u64 *dma_list = NULL; + dma_addr_t t; + int i; + + if (size <= max_direct) { + *is_direct = 1; + npages = 1; + shift = get_order(size) + PAGE_SHIFT; + + buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev, + size, &t, GFP_KERNEL); + if (!buf->direct.buf) + return -ENOMEM; + + dma_unmap_addr_set(&buf->direct, mapping, t); + + memset(buf->direct.buf, 0, size); + + while (t & ((1 << shift) - 1)) { + --shift; + npages *= 2; + } + + dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL); + if (!dma_list) + goto err_free; + + for (i = 0; i < npages; ++i) + dma_list[i] = t + i * (1 << shift); + } else { + *is_direct = 0; + npages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + shift = PAGE_SHIFT; + + dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL); + if (!dma_list) + return -ENOMEM; + + buf->page_list = kmalloc(npages * sizeof *buf->page_list, + GFP_KERNEL); + if (!buf->page_list) + goto err_out; + + for (i = 0; i < npages; ++i) + buf->page_list[i].buf = NULL; + + for (i = 0; i < npages; ++i) { + buf->page_list[i].buf = + dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, + &t, GFP_KERNEL); + if (!buf->page_list[i].buf) + goto err_free; + + dma_list[i] = t; + dma_unmap_addr_set(&buf->page_list[i], mapping, t); + + clear_page(buf->page_list[i].buf); + } + } + + err = mthca_mr_alloc_phys(dev, pd->pd_num, + dma_list, shift, npages, + 0, size, + MTHCA_MPT_FLAG_LOCAL_READ | + (hca_write ? 
MTHCA_MPT_FLAG_LOCAL_WRITE : 0), + mr); + if (err) + goto err_free; + + kfree(dma_list); + + return 0; + +err_free: + mthca_buf_free(dev, size, buf, *is_direct, NULL); + +err_out: + kfree(dma_list); + + return err; +} + +void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf, + int is_direct, struct mthca_mr *mr) +{ + int i; + + if (mr) + mthca_free_mr(dev, mr); + + if (is_direct) + dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf, + dma_unmap_addr(&buf->direct, mapping)); + else { + for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + buf->page_list[i].buf, + dma_unmap_addr(&buf->page_list[i], + mapping)); + kfree(buf->page_list); + } +} diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c new file mode 100644 index 0000000..e7f6223 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_av.c @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include +#include + +#include "mthca_dev.h" + +enum { + MTHCA_RATE_TAVOR_FULL = 0, + MTHCA_RATE_TAVOR_1X = 1, + MTHCA_RATE_TAVOR_4X = 2, + MTHCA_RATE_TAVOR_1X_DDR = 3 +}; + +enum { + MTHCA_RATE_MEMFREE_FULL = 0, + MTHCA_RATE_MEMFREE_QUARTER = 1, + MTHCA_RATE_MEMFREE_EIGHTH = 2, + MTHCA_RATE_MEMFREE_HALF = 3 +}; + +struct mthca_av { + __be32 port_pd; + u8 reserved1; + u8 g_slid; + __be16 dlid; + u8 reserved2; + u8 gid_index; + u8 msg_sr; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + __be32 dgid[4]; +}; + +static enum ib_rate memfree_rate_to_ib(u8 mthca_rate, u8 port_rate) +{ + switch (mthca_rate) { + case MTHCA_RATE_MEMFREE_EIGHTH: + return mult_to_ib_rate(port_rate >> 3); + case MTHCA_RATE_MEMFREE_QUARTER: + return mult_to_ib_rate(port_rate >> 2); + case MTHCA_RATE_MEMFREE_HALF: + return mult_to_ib_rate(port_rate >> 1); + case MTHCA_RATE_MEMFREE_FULL: + default: + return mult_to_ib_rate(port_rate); + } +} + +static enum ib_rate tavor_rate_to_ib(u8 mthca_rate, u8 port_rate) +{ + switch (mthca_rate) { + case MTHCA_RATE_TAVOR_1X: return IB_RATE_2_5_GBPS; + case MTHCA_RATE_TAVOR_1X_DDR: return IB_RATE_5_GBPS; + case MTHCA_RATE_TAVOR_4X: return IB_RATE_10_GBPS; + default: return mult_to_ib_rate(port_rate); + } +} + +enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port) +{ + if (mthca_is_memfree(dev)) { + /* Handle old Arbel FW */ + if (dev->limits.stat_rate_support == 0x3 && mthca_rate) + return IB_RATE_2_5_GBPS; + + return memfree_rate_to_ib(mthca_rate, dev->rate[port - 1]); + } else + return tavor_rate_to_ib(mthca_rate, dev->rate[port - 1]); +} + +static u8 ib_rate_to_memfree(u8 req_rate, u8 cur_rate) +{ + if (cur_rate <= req_rate) + return 0; + + /* + * Inter-packet delay (IPD) to get from rate X down to a rate + * no more than Y is (X - 1) / Y. 
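+ * For example, throttling a 4X link down to 1X gives (4 - 1) / 1 = 3, which the switch below maps to MTHCA_RATE_MEMFREE_QUARTER.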
+ */ + switch ((cur_rate - 1) / req_rate) { + case 0: return MTHCA_RATE_MEMFREE_FULL; + case 1: return MTHCA_RATE_MEMFREE_HALF; + case 2: /* fall through */ + case 3: return MTHCA_RATE_MEMFREE_QUARTER; + default: return MTHCA_RATE_MEMFREE_EIGHTH; + } +} + +static u8 ib_rate_to_tavor(u8 static_rate) +{ + switch (static_rate) { + case IB_RATE_2_5_GBPS: return MTHCA_RATE_TAVOR_1X; + case IB_RATE_5_GBPS: return MTHCA_RATE_TAVOR_1X_DDR; + case IB_RATE_10_GBPS: return MTHCA_RATE_TAVOR_4X; + default: return MTHCA_RATE_TAVOR_FULL; + } +} + +u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port) +{ + u8 rate; + + if (!static_rate || ib_rate_to_mult(static_rate) >= dev->rate[port - 1]) + return 0; + + if (mthca_is_memfree(dev)) + rate = ib_rate_to_memfree(ib_rate_to_mult(static_rate), + dev->rate[port - 1]); + else + rate = ib_rate_to_tavor(static_rate); + + if (!(dev->limits.stat_rate_support & (1 << rate))) + rate = 1; + + return rate; +} + +int mthca_create_ah(struct mthca_dev *dev, + struct mthca_pd *pd, + struct rdma_ah_attr *ah_attr, + struct mthca_ah *ah) +{ + u32 index = -1; + struct mthca_av *av = NULL; + + ah->type = MTHCA_AH_PCI_POOL; + + if (mthca_is_memfree(dev)) { + ah->av = kmalloc(sizeof *ah->av, GFP_ATOMIC); + if (!ah->av) + return -ENOMEM; + + ah->type = MTHCA_AH_KMALLOC; + av = ah->av; + } else if (!atomic_read(&pd->sqp_count) && + !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + index = mthca_alloc(&dev->av_table.alloc); + + /* fall back to allocate in host memory */ + if (index == -1) + goto on_hca_fail; + + av = kmalloc(sizeof *av, GFP_ATOMIC); + if (!av) + goto on_hca_fail; + + ah->type = MTHCA_AH_ON_HCA; + ah->avdma = dev->av_table.ddr_av_base + + index * MTHCA_AV_SIZE; + } + +on_hca_fail: + if (ah->type == MTHCA_AH_PCI_POOL) { + ah->av = dma_pool_zalloc(dev->av_table.pool, + GFP_ATOMIC, &ah->avdma); + if (!ah->av) + return -ENOMEM; + + av = ah->av; + } + + ah->key = pd->ntmr.ibmr.lkey; + + av->port_pd = cpu_to_be32(pd->pd_num | + (rdma_ah_get_port_num(ah_attr) << 24)); + av->g_slid = rdma_ah_get_path_bits(ah_attr); + av->dlid = cpu_to_be16(rdma_ah_get_dlid(ah_attr)); + av->msg_sr = (3 << 4) | /* 2K message */ + mthca_get_rate(dev, rdma_ah_get_static_rate(ah_attr), + rdma_ah_get_port_num(ah_attr)); + av->sl_tclass_flowlabel = cpu_to_be32(rdma_ah_get_sl(ah_attr) << 28); + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + + av->g_slid |= 0x80; + av->gid_index = (rdma_ah_get_port_num(ah_attr) - 1) * + dev->limits.gid_table_len + + grh->sgid_index; + av->hop_limit = grh->hop_limit; + av->sl_tclass_flowlabel |= + cpu_to_be32((grh->traffic_class << 20) | + grh->flow_label); + memcpy(av->dgid, grh->dgid.raw, 16); + } else { + /* Arbel workaround -- low byte of GID must be 2 */ + av->dgid[3] = cpu_to_be32(2); + } + + if (0) { + int j; + + mthca_dbg(dev, "Created UDAV at %p/%08lx:\n", + av, (unsigned long) ah->avdma); + for (j = 0; j < 8; ++j) + printk(KERN_DEBUG " [%2x] %08x\n", + j * 4, be32_to_cpu(((__be32 *) av)[j])); + } + + if (ah->type == MTHCA_AH_ON_HCA) { + memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE, + av, MTHCA_AV_SIZE); + kfree(av); + } + + return 0; +} + +int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah) +{ + switch (ah->type) { + case MTHCA_AH_ON_HCA: + mthca_free(&dev->av_table.alloc, + (ah->avdma - dev->av_table.ddr_av_base) / + MTHCA_AV_SIZE); + break; + + case MTHCA_AH_PCI_POOL: + dma_pool_free(dev->av_table.pool, ah->av, ah->avdma); + break; + + case 
MTHCA_AH_KMALLOC: + kfree(ah->av); + break; + } + + return 0; +} + +int mthca_ah_grh_present(struct mthca_ah *ah) +{ + return !!(ah->av->g_slid & 0x80); +} + +int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, + struct ib_ud_header *header) +{ + if (ah->type == MTHCA_AH_ON_HCA) + return -EINVAL; + + header->lrh.service_level = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28; + header->lrh.destination_lid = ah->av->dlid; + header->lrh.source_lid = cpu_to_be16(ah->av->g_slid & 0x7f); + if (mthca_ah_grh_present(ah)) { + header->grh.traffic_class = + (be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff; + header->grh.flow_label = + ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff); + header->grh.hop_limit = ah->av->hop_limit; + ib_get_cached_gid(&dev->ib_dev, + be32_to_cpu(ah->av->port_pd) >> 24, + ah->av->gid_index % dev->limits.gid_table_len, + &header->grh.source_gid, NULL); + memcpy(header->grh.destination_gid.raw, + ah->av->dgid, 16); + } + + return 0; +} + +int mthca_ah_query(struct ib_ah *ibah, struct rdma_ah_attr *attr) +{ + struct mthca_ah *ah = to_mah(ibah); + struct mthca_dev *dev = to_mdev(ibah->device); + u8 port_num = be32_to_cpu(ah->av->port_pd) >> 24; + + /* Only implement for MAD and memfree ah for now. */ + if (ah->type == MTHCA_AH_ON_HCA) + return -ENOSYS; + + memset(attr, 0, sizeof *attr); + attr->type = ibah->type; + rdma_ah_set_dlid(attr, be16_to_cpu(ah->av->dlid)); + rdma_ah_set_sl(attr, be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28); + rdma_ah_set_port_num(attr, port_num); + rdma_ah_set_static_rate(attr, + mthca_rate_to_ib(dev, ah->av->msg_sr & 0x7, + port_num)); + rdma_ah_set_path_bits(attr, ah->av->g_slid & 0x7F); + if (mthca_ah_grh_present(ah)) { + u32 tc_fl = be32_to_cpu(ah->av->sl_tclass_flowlabel); + + rdma_ah_set_grh(attr, NULL, + tc_fl & 0xfffff, + ah->av->gid_index & + (dev->limits.gid_table_len - 1), + ah->av->hop_limit, + (tc_fl >> 20) & 0xff); + rdma_ah_set_dgid_raw(attr, ah->av->dgid); + } + + return 0; +} + +int mthca_init_av_table(struct mthca_dev *dev) +{ + int err; + + if (mthca_is_memfree(dev)) + return 0; + + err = mthca_alloc_init(&dev->av_table.alloc, + dev->av_table.num_ddr_avs, + dev->av_table.num_ddr_avs - 1, + 0); + if (err) + return err; + + dev->av_table.pool = dma_pool_create("mthca_av", &dev->pdev->dev, + MTHCA_AV_SIZE, + MTHCA_AV_SIZE, 0); + if (!dev->av_table.pool) + goto out_free_alloc; + + if (!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + dev->av_table.av_map = ioremap(pci_resource_start(dev->pdev, 4) + + dev->av_table.ddr_av_base - + dev->ddr_start, + dev->av_table.num_ddr_avs * + MTHCA_AV_SIZE); + if (!dev->av_table.av_map) + goto out_free_pool; + } else + dev->av_table.av_map = NULL; + + return 0; + + out_free_pool: + dma_pool_destroy(dev->av_table.pool); + + out_free_alloc: + mthca_alloc_cleanup(&dev->av_table.alloc); + return -ENOMEM; +} + +void mthca_cleanup_av_table(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) + return; + + if (dev->av_table.av_map) + iounmap(dev->av_table.av_map); + dma_pool_destroy(dev->av_table.pool); + mthca_alloc_cleanup(&dev->av_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c new file mode 100644 index 0000000..ffb98ea --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_catas.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_dev.h" + +enum { + MTHCA_CATAS_POLL_INTERVAL = 5 * HZ, + + MTHCA_CATAS_TYPE_INTERNAL = 0, + MTHCA_CATAS_TYPE_UPLINK = 3, + MTHCA_CATAS_TYPE_DDR = 4, + MTHCA_CATAS_TYPE_PARITY = 5, +}; + +static DEFINE_SPINLOCK(catas_lock); + +static LIST_HEAD(catas_list); +static struct workqueue_struct *catas_wq; +static struct work_struct catas_work; + +static int catas_reset_disable; +module_param_named(catas_reset_disable, catas_reset_disable, int, 0644); +MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero"); + +static void catas_reset(struct work_struct *work) +{ + struct mthca_dev *dev, *tmpdev; + LIST_HEAD(tlist); + int ret; + + mutex_lock(&mthca_device_mutex); + + spin_lock_irq(&catas_lock); + list_splice_init(&catas_list, &tlist); + spin_unlock_irq(&catas_lock); + + list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) { + struct pci_dev *pdev = dev->pdev; + ret = __mthca_restart_one(dev->pdev); + /* 'dev' now is not valid */ + if (ret) + printk(KERN_ERR "mthca %s: Reset failed (%d)\n", + pci_name(pdev), ret); + else { + struct mthca_dev *d = pci_get_drvdata(pdev); + mthca_dbg(d, "Reset succeeded\n"); + } + } + + mutex_unlock(&mthca_device_mutex); +} + +static void handle_catas(struct mthca_dev *dev) +{ + struct ib_event event; + unsigned long flags; + const char *type; + int i; + + event.device = &dev->ib_dev; + event.event = IB_EVENT_DEVICE_FATAL; + event.element.port_num = 0; + dev->active = false; + + ib_dispatch_event(&event); + + switch (swab32(readl(dev->catas_err.map)) >> 24) { + case MTHCA_CATAS_TYPE_INTERNAL: + type = "internal error"; + break; + case MTHCA_CATAS_TYPE_UPLINK: + type = "uplink bus error"; + break; + case MTHCA_CATAS_TYPE_DDR: + type = "DDR data error"; + break; + case MTHCA_CATAS_TYPE_PARITY: + type = "internal parity error"; + break; + default: + type = "unknown error"; + break; + } + + mthca_err(dev, "Catastrophic error detected: %s\n", type); + for (i = 0; i < dev->catas_err.size; ++i) + mthca_err(dev, " buf[%02x]: %08x\n", + i, swab32(readl(dev->catas_err.map + i))); + + if (catas_reset_disable) + return; + + spin_lock_irqsave(&catas_lock, flags); + list_add(&dev->catas_err.list, &catas_list); + queue_work(catas_wq, &catas_work); + 
spin_unlock_irqrestore(&catas_lock, flags); +} + +static void poll_catas(struct timer_list *t) +{ + struct mthca_dev *dev = from_timer(dev, t, catas_err.timer); + int i; + + for (i = 0; i < dev->catas_err.size; ++i) + if (readl(dev->catas_err.map + i)) { + handle_catas(dev); + return; + } + + mod_timer(&dev->catas_err.timer, + round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL)); +} + +void mthca_start_catas_poll(struct mthca_dev *dev) +{ + phys_addr_t addr; + + timer_setup(&dev->catas_err.timer, poll_catas, 0); + dev->catas_err.map = NULL; + + addr = pci_resource_start(dev->pdev, 0) + + ((pci_resource_len(dev->pdev, 0) - 1) & + dev->catas_err.addr); + + dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4); + if (!dev->catas_err.map) { + mthca_warn(dev, "couldn't map catastrophic error region " + "at 0x%llx/0x%x\n", (unsigned long long) addr, + dev->catas_err.size * 4); + return; + } + + dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; + INIT_LIST_HEAD(&dev->catas_err.list); + add_timer(&dev->catas_err.timer); +} + +void mthca_stop_catas_poll(struct mthca_dev *dev) +{ + del_timer_sync(&dev->catas_err.timer); + + if (dev->catas_err.map) + iounmap(dev->catas_err.map); + + spin_lock_irq(&catas_lock); + list_del(&dev->catas_err.list); + spin_unlock_irq(&catas_lock); +} + +int __init mthca_catas_init(void) +{ + INIT_WORK(&catas_work, catas_reset); + + catas_wq = alloc_ordered_workqueue("mthca_catas", WQ_MEM_RECLAIM); + if (!catas_wq) + return -ENOMEM; + + return 0; +} + +void mthca_catas_cleanup(void) +{ + destroy_workqueue(catas_wq); +} diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c new file mode 100644 index 0000000..419a2a2 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -0,0 +1,1977 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_config_reg.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" + +#define CMD_POLL_TOKEN 0xffff + +enum { + HCR_IN_PARAM_OFFSET = 0x00, + HCR_IN_MODIFIER_OFFSET = 0x08, + HCR_OUT_PARAM_OFFSET = 0x0c, + HCR_TOKEN_OFFSET = 0x14, + HCR_STATUS_OFFSET = 0x18, + + HCR_OPMOD_SHIFT = 12, + HCA_E_BIT = 22, + HCR_GO_BIT = 23 +}; + +enum { + /* initialization and general commands */ + CMD_SYS_EN = 0x1, + CMD_SYS_DIS = 0x2, + CMD_MAP_FA = 0xfff, + CMD_UNMAP_FA = 0xffe, + CMD_RUN_FW = 0xff6, + CMD_MOD_STAT_CFG = 0x34, + CMD_QUERY_DEV_LIM = 0x3, + CMD_QUERY_FW = 0x4, + CMD_ENABLE_LAM = 0xff8, + CMD_DISABLE_LAM = 0xff7, + CMD_QUERY_DDR = 0x5, + CMD_QUERY_ADAPTER = 0x6, + CMD_INIT_HCA = 0x7, + CMD_CLOSE_HCA = 0x8, + CMD_INIT_IB = 0x9, + CMD_CLOSE_IB = 0xa, + CMD_QUERY_HCA = 0xb, + CMD_SET_IB = 0xc, + CMD_ACCESS_DDR = 0x2e, + CMD_MAP_ICM = 0xffa, + CMD_UNMAP_ICM = 0xff9, + CMD_MAP_ICM_AUX = 0xffc, + CMD_UNMAP_ICM_AUX = 0xffb, + CMD_SET_ICM_SIZE = 0xffd, + + /* TPT commands */ + CMD_SW2HW_MPT = 0xd, + CMD_QUERY_MPT = 0xe, + CMD_HW2SW_MPT = 0xf, + CMD_READ_MTT = 0x10, + CMD_WRITE_MTT = 0x11, + CMD_SYNC_TPT = 0x2f, + + /* EQ commands */ + CMD_MAP_EQ = 0x12, + CMD_SW2HW_EQ = 0x13, + CMD_HW2SW_EQ = 0x14, + CMD_QUERY_EQ = 0x15, + + /* CQ commands */ + CMD_SW2HW_CQ = 0x16, + CMD_HW2SW_CQ = 0x17, + CMD_QUERY_CQ = 0x18, + CMD_RESIZE_CQ = 0x2c, + + /* SRQ commands */ + CMD_SW2HW_SRQ = 0x35, + CMD_HW2SW_SRQ = 0x36, + CMD_QUERY_SRQ = 0x37, + CMD_ARM_SRQ = 0x40, + + /* QP/EE commands */ + CMD_RST2INIT_QPEE = 0x19, + CMD_INIT2RTR_QPEE = 0x1a, + CMD_RTR2RTS_QPEE = 0x1b, + CMD_RTS2RTS_QPEE = 0x1c, + CMD_SQERR2RTS_QPEE = 0x1d, + CMD_2ERR_QPEE = 0x1e, + CMD_RTS2SQD_QPEE = 0x1f, + CMD_SQD2SQD_QPEE = 0x38, + CMD_SQD2RTS_QPEE = 0x20, + CMD_ERR2RST_QPEE = 0x21, + CMD_QUERY_QPEE = 0x22, + CMD_INIT2INIT_QPEE = 0x2d, + CMD_SUSPEND_QPEE = 0x32, + CMD_UNSUSPEND_QPEE = 0x33, + /* special QPs and management commands */ + CMD_CONF_SPECIAL_QP = 0x23, + CMD_MAD_IFC = 0x24, + + /* multicast commands */ + CMD_READ_MGM = 0x25, + CMD_WRITE_MGM = 0x26, + CMD_MGID_HASH = 0x27, + + /* miscellaneous commands */ + CMD_DIAG_RPRT = 0x30, + CMD_NOP = 0x31, + + /* debug commands */ + CMD_QUERY_DEBUG_MSG = 0x2a, + CMD_SET_DEBUG_MSG = 0x2b, +}; + +/* + * According to Mellanox code, FW may be starved and never complete + * commands. So we can't use strict timeouts described in PRM -- we + * just arbitrarily select 60 seconds for now. 
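+ * The stricter per-class timeouts are kept under #if 0 below for reference.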
+ */ +#if 0 +/* + * Round up and add 1 to make sure we get the full wait time (since we + * will be starting in the middle of a jiffy) + */ +enum { + CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1, + CMD_TIME_CLASS_B = (HZ + 99) / 100 + 1, + CMD_TIME_CLASS_C = (HZ + 9) / 10 + 1, + CMD_TIME_CLASS_D = 60 * HZ +}; +#else +enum { + CMD_TIME_CLASS_A = 60 * HZ, + CMD_TIME_CLASS_B = 60 * HZ, + CMD_TIME_CLASS_C = 60 * HZ, + CMD_TIME_CLASS_D = 60 * HZ +}; +#endif + +enum { + GO_BIT_TIMEOUT = HZ * 10 +}; + +struct mthca_cmd_context { + struct completion done; + int result; + int next; + u64 out_param; + u16 token; + u8 status; +}; + +static int fw_cmd_doorbell = 0; +module_param(fw_cmd_doorbell, int, 0644); +MODULE_PARM_DESC(fw_cmd_doorbell, "post FW commands through doorbell page if nonzero " + "(and supported by FW)"); + +static inline int go_bit(struct mthca_dev *dev) +{ + return readl(dev->hcr + HCR_STATUS_OFFSET) & + swab32(1 << HCR_GO_BIT); +} + +static void mthca_cmd_post_dbell(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + u16 token) +{ + void __iomem *ptr = dev->cmd.dbell_map; + u16 *offs = dev->cmd.dbell_offsets; + + __raw_writel((__force u32) cpu_to_be32(in_param >> 32), ptr + offs[0]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), ptr + offs[1]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(in_modifier), ptr + offs[2]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(out_param >> 32), ptr + offs[3]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), ptr + offs[4]); + wmb(); + __raw_writel((__force u32) cpu_to_be32(token << 16), ptr + offs[5]); + wmb(); + __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) | + (1 << HCA_E_BIT) | + (op_modifier << HCR_OPMOD_SHIFT) | + op), ptr + offs[6]); + wmb(); + __raw_writel((__force u32) 0, ptr + offs[7]); + wmb(); +} + +static int mthca_cmd_post_hcr(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + u16 token, + int event) +{ + if (event) { + unsigned long end = jiffies + GO_BIT_TIMEOUT; + + while (go_bit(dev) && time_before(jiffies, end)) { + set_current_state(TASK_RUNNING); + schedule(); + } + } + + if (go_bit(dev)) + return -EAGAIN; + + /* + * We use writel (instead of something like memcpy_toio) + * because writes of less than 32 bits to the HCR don't work + * (and some architectures such as ia64 implement memcpy_toio + * in terms of writeb). + */ + __raw_writel((__force u32) cpu_to_be32(in_param >> 32), dev->hcr + 0 * 4); + __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), dev->hcr + 1 * 4); + __raw_writel((__force u32) cpu_to_be32(in_modifier), dev->hcr + 2 * 4); + __raw_writel((__force u32) cpu_to_be32(out_param >> 32), dev->hcr + 3 * 4); + __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4); + __raw_writel((__force u32) cpu_to_be32(token << 16), dev->hcr + 5 * 4); + + /* __raw_writel may not order writes. */ + wmb(); + + __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) | + (event ? 
(1 << HCA_E_BIT) : 0) | + (op_modifier << HCR_OPMOD_SHIFT) | + op), dev->hcr + 6 * 4); + + return 0; +} + +static int mthca_cmd_post(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + u16 token, + int event) +{ + int err = 0; + + mutex_lock(&dev->cmd.hcr_mutex); + + if (event && dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS && fw_cmd_doorbell) + mthca_cmd_post_dbell(dev, in_param, out_param, in_modifier, + op_modifier, op, token); + else + err = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier, + op_modifier, op, token, event); + + /* + * Make sure that our HCR writes don't get mixed in with + * writes from another CPU starting a FW command. + */ + mmiowb(); + + mutex_unlock(&dev->cmd.hcr_mutex); + return err; +} + + +static int mthca_status_to_errno(u8 status) +{ + static const int trans_table[] = { + [MTHCA_CMD_STAT_INTERNAL_ERR] = -EIO, + [MTHCA_CMD_STAT_BAD_OP] = -EPERM, + [MTHCA_CMD_STAT_BAD_PARAM] = -EINVAL, + [MTHCA_CMD_STAT_BAD_SYS_STATE] = -ENXIO, + [MTHCA_CMD_STAT_BAD_RESOURCE] = -EBADF, + [MTHCA_CMD_STAT_RESOURCE_BUSY] = -EBUSY, + [MTHCA_CMD_STAT_DDR_MEM_ERR] = -ENOMEM, + [MTHCA_CMD_STAT_EXCEED_LIM] = -ENOMEM, + [MTHCA_CMD_STAT_BAD_RES_STATE] = -EBADF, + [MTHCA_CMD_STAT_BAD_INDEX] = -EBADF, + [MTHCA_CMD_STAT_BAD_NVMEM] = -EFAULT, + [MTHCA_CMD_STAT_BAD_QPEE_STATE] = -EINVAL, + [MTHCA_CMD_STAT_BAD_SEG_PARAM] = -EFAULT, + [MTHCA_CMD_STAT_REG_BOUND] = -EBUSY, + [MTHCA_CMD_STAT_LAM_NOT_PRE] = -EAGAIN, + [MTHCA_CMD_STAT_BAD_PKT] = -EBADMSG, + [MTHCA_CMD_STAT_BAD_SIZE] = -ENOMEM, + }; + + if (status >= ARRAY_SIZE(trans_table) || + (status != MTHCA_CMD_STAT_OK + && trans_table[status] == 0)) + return -EINVAL; + + return trans_table[status]; +} + + +static int mthca_cmd_poll(struct mthca_dev *dev, + u64 in_param, + u64 *out_param, + int out_is_imm, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + int err = 0; + unsigned long end; + u8 status; + + down(&dev->cmd.poll_sem); + + err = mthca_cmd_post(dev, in_param, + out_param ? 
*out_param : 0, + in_modifier, op_modifier, + op, CMD_POLL_TOKEN, 0); + if (err) + goto out; + + end = timeout + jiffies; + while (go_bit(dev) && time_before(jiffies, end)) { + set_current_state(TASK_RUNNING); + schedule(); + } + + if (go_bit(dev)) { + err = -EBUSY; + goto out; + } + + if (out_is_imm && out_param) { + *out_param = + (u64) be32_to_cpu((__force __be32) + __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET)) << 32 | + (u64) be32_to_cpu((__force __be32) + __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET + 4)); + } else if (out_is_imm) { + err = -EINVAL; + goto out; + } + + status = be32_to_cpu((__force __be32) __raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24; + if (status) { + mthca_dbg(dev, "Command %02x completed with status %02x\n", + op, status); + err = mthca_status_to_errno(status); + } + +out: + up(&dev->cmd.poll_sem); + return err; +} + +void mthca_cmd_event(struct mthca_dev *dev, + u16 token, + u8 status, + u64 out_param) +{ + struct mthca_cmd_context *context = + &dev->cmd.context[token & dev->cmd.token_mask]; + + /* previously timed out command completing at long last */ + if (token != context->token) + return; + + context->result = 0; + context->status = status; + context->out_param = out_param; + + complete(&context->done); +} + +static int mthca_cmd_wait(struct mthca_dev *dev, + u64 in_param, + u64 *out_param, + int out_is_imm, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + int err = 0; + struct mthca_cmd_context *context; + + down(&dev->cmd.event_sem); + + spin_lock(&dev->cmd.context_lock); + BUG_ON(dev->cmd.free_head < 0); + context = &dev->cmd.context[dev->cmd.free_head]; + context->token += dev->cmd.token_mask + 1; + dev->cmd.free_head = context->next; + spin_unlock(&dev->cmd.context_lock); + + init_completion(&context->done); + + err = mthca_cmd_post(dev, in_param, + out_param ? *out_param : 0, + in_modifier, op_modifier, + op, context->token, 1); + if (err) + goto out; + + if (!wait_for_completion_timeout(&context->done, timeout)) { + err = -EBUSY; + goto out; + } + + err = context->result; + if (err) + goto out; + + if (context->status) { + mthca_dbg(dev, "Command %02x completed with status %02x\n", + op, context->status); + err = mthca_status_to_errno(context->status); + } + + if (out_is_imm && out_param) { + *out_param = context->out_param; + } else if (out_is_imm) { + err = -EINVAL; + goto out; + } + +out: + spin_lock(&dev->cmd.context_lock); + context->next = dev->cmd.free_head; + dev->cmd.free_head = context - dev->cmd.context; + spin_unlock(&dev->cmd.context_lock); + + up(&dev->cmd.event_sem); + return err; +} + +/* Invoke a command with an output mailbox */ +static int mthca_cmd_box(struct mthca_dev *dev, + u64 in_param, + u64 out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS) + return mthca_cmd_wait(dev, in_param, &out_param, 0, + in_modifier, op_modifier, op, + timeout); + else + return mthca_cmd_poll(dev, in_param, &out_param, 0, + in_modifier, op_modifier, op, + timeout); +} + +/* Invoke a command with no output parameter */ +static int mthca_cmd(struct mthca_dev *dev, + u64 in_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + return mthca_cmd_box(dev, in_param, 0, in_modifier, + op_modifier, op, timeout); +} + +/* + * Invoke a command with an immediate output parameter (and copy the + * output into the caller's out_param pointer after the command + * executes). 
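+ * SYS_EN below, for example, uses the immediate output to report DDR error details when the command fails.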
+ */ +static int mthca_cmd_imm(struct mthca_dev *dev, + u64 in_param, + u64 *out_param, + u32 in_modifier, + u8 op_modifier, + u16 op, + unsigned long timeout) +{ + if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS) + return mthca_cmd_wait(dev, in_param, out_param, 1, + in_modifier, op_modifier, op, + timeout); + else + return mthca_cmd_poll(dev, in_param, out_param, 1, + in_modifier, op_modifier, op, + timeout); +} + +int mthca_cmd_init(struct mthca_dev *dev) +{ + mutex_init(&dev->cmd.hcr_mutex); + sema_init(&dev->cmd.poll_sem, 1); + dev->cmd.flags = 0; + + dev->hcr = ioremap(pci_resource_start(dev->pdev, 0) + MTHCA_HCR_BASE, + MTHCA_HCR_SIZE); + if (!dev->hcr) { + mthca_err(dev, "Couldn't map command register."); + return -ENOMEM; + } + + dev->cmd.pool = dma_pool_create("mthca_cmd", &dev->pdev->dev, + MTHCA_MAILBOX_SIZE, + MTHCA_MAILBOX_SIZE, 0); + if (!dev->cmd.pool) { + iounmap(dev->hcr); + return -ENOMEM; + } + + return 0; +} + +void mthca_cmd_cleanup(struct mthca_dev *dev) +{ + dma_pool_destroy(dev->cmd.pool); + iounmap(dev->hcr); + if (dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS) + iounmap(dev->cmd.dbell_map); +} + +/* + * Switch to using events to issue FW commands (should be called after + * event queue to command events has been initialized). + */ +int mthca_cmd_use_events(struct mthca_dev *dev) +{ + int i; + + dev->cmd.context = kmalloc(dev->cmd.max_cmds * + sizeof (struct mthca_cmd_context), + GFP_KERNEL); + if (!dev->cmd.context) + return -ENOMEM; + + for (i = 0; i < dev->cmd.max_cmds; ++i) { + dev->cmd.context[i].token = i; + dev->cmd.context[i].next = i + 1; + } + + dev->cmd.context[dev->cmd.max_cmds - 1].next = -1; + dev->cmd.free_head = 0; + + sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds); + spin_lock_init(&dev->cmd.context_lock); + + for (dev->cmd.token_mask = 1; + dev->cmd.token_mask < dev->cmd.max_cmds; + dev->cmd.token_mask <<= 1) + ; /* nothing */ + --dev->cmd.token_mask; + + dev->cmd.flags |= MTHCA_CMD_USE_EVENTS; + + down(&dev->cmd.poll_sem); + + return 0; +} + +/* + * Switch back to polling (used when shutting down the device) + */ +void mthca_cmd_use_polling(struct mthca_dev *dev) +{ + int i; + + dev->cmd.flags &= ~MTHCA_CMD_USE_EVENTS; + + for (i = 0; i < dev->cmd.max_cmds; ++i) + down(&dev->cmd.event_sem); + + kfree(dev->cmd.context); + + up(&dev->cmd.poll_sem); +} + +struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev, + gfp_t gfp_mask) +{ + struct mthca_mailbox *mailbox; + + mailbox = kmalloc(sizeof *mailbox, gfp_mask); + if (!mailbox) + return ERR_PTR(-ENOMEM); + + mailbox->buf = dma_pool_alloc(dev->cmd.pool, gfp_mask, &mailbox->dma); + if (!mailbox->buf) { + kfree(mailbox); + return ERR_PTR(-ENOMEM); + } + + return mailbox; +} + +void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox) +{ + if (!mailbox) + return; + + dma_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); + kfree(mailbox); +} + +int mthca_SYS_EN(struct mthca_dev *dev) +{ + u64 out; + int ret; + + ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, CMD_TIME_CLASS_D); + + if (ret == -ENOMEM) + mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, " + "sladdr=%d, SPD source=%s\n", + (int) (out >> 6) & 0xf, (int) (out >> 4) & 3, + (int) (out >> 1) & 7, (int) out & 1 ? 
"NVMEM" : "DIMM"); + + return ret; +} + +int mthca_SYS_DIS(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C); +} + +static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm, + u64 virt) +{ + struct mthca_mailbox *mailbox; + struct mthca_icm_iter iter; + __be64 *pages; + int lg; + int nent = 0; + int i; + int err = 0; + int ts = 0, tc = 0; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + memset(mailbox->buf, 0, MTHCA_MAILBOX_SIZE); + pages = mailbox->buf; + + for (mthca_icm_first(icm, &iter); + !mthca_icm_last(&iter); + mthca_icm_next(&iter)) { + /* + * We have to pass pages that are aligned to their + * size, so find the least significant 1 in the + * address or size and use that as our log2 size. + */ + lg = ffs(mthca_icm_addr(&iter) | mthca_icm_size(&iter)) - 1; + if (lg < MTHCA_ICM_PAGE_SHIFT) { + mthca_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n", + MTHCA_ICM_PAGE_SIZE, + (unsigned long long) mthca_icm_addr(&iter), + mthca_icm_size(&iter)); + err = -EINVAL; + goto out; + } + for (i = 0; i < mthca_icm_size(&iter) >> lg; ++i) { + if (virt != -1) { + pages[nent * 2] = cpu_to_be64(virt); + virt += 1ULL << lg; + } + + pages[nent * 2 + 1] = + cpu_to_be64((mthca_icm_addr(&iter) + (i << lg)) | + (lg - MTHCA_ICM_PAGE_SHIFT)); + ts += 1 << (lg - 10); + ++tc; + + if (++nent == MTHCA_MAILBOX_SIZE / 16) { + err = mthca_cmd(dev, mailbox->dma, nent, 0, op, + CMD_TIME_CLASS_B); + if (err) + goto out; + nent = 0; + } + } + } + + if (nent) + err = mthca_cmd(dev, mailbox->dma, nent, 0, op, + CMD_TIME_CLASS_B); + + switch (op) { + case CMD_MAP_FA: + mthca_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts); + break; + case CMD_MAP_ICM_AUX: + mthca_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts); + break; + case CMD_MAP_ICM: + mthca_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n", + tc, ts, (unsigned long long) virt - (ts << 10)); + break; + } + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm) +{ + return mthca_map_cmd(dev, CMD_MAP_FA, icm, -1); +} + +int mthca_UNMAP_FA(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B); +} + +int mthca_RUN_FW(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A); +} + +static void mthca_setup_cmd_doorbells(struct mthca_dev *dev, u64 base) +{ + phys_addr_t addr; + u16 max_off = 0; + int i; + + for (i = 0; i < 8; ++i) + max_off = max(max_off, dev->cmd.dbell_offsets[i]); + + if ((base & PAGE_MASK) != ((base + max_off) & PAGE_MASK)) { + mthca_warn(dev, "Firmware doorbell region at 0x%016llx, " + "length 0x%x crosses a page boundary\n", + (unsigned long long) base, max_off); + return; + } + + addr = pci_resource_start(dev->pdev, 2) + + ((pci_resource_len(dev->pdev, 2) - 1) & base); + dev->cmd.dbell_map = ioremap(addr, max_off + sizeof(u32)); + if (!dev->cmd.dbell_map) + return; + + dev->cmd.flags |= MTHCA_CMD_POST_DOORBELLS; + mthca_dbg(dev, "Mapped doorbell page for posting FW commands\n"); +} + +int mthca_QUERY_FW(struct mthca_dev *dev) +{ + struct mthca_mailbox *mailbox; + u32 *outbox; + u64 base; + u32 tmp; + int err = 0; + u8 lg; + int i; + +#define QUERY_FW_OUT_SIZE 0x100 +#define QUERY_FW_VER_OFFSET 0x00 +#define QUERY_FW_MAX_CMD_OFFSET 0x0f +#define QUERY_FW_ERR_START_OFFSET 0x30 +#define QUERY_FW_ERR_SIZE_OFFSET 0x38 + +#define QUERY_FW_CMD_DB_EN_OFFSET 0x10 +#define 
QUERY_FW_CMD_DB_OFFSET 0x50 +#define QUERY_FW_CMD_DB_BASE 0x60 + +#define QUERY_FW_START_OFFSET 0x20 +#define QUERY_FW_END_OFFSET 0x28 + +#define QUERY_FW_SIZE_OFFSET 0x00 +#define QUERY_FW_CLR_INT_BASE_OFFSET 0x20 +#define QUERY_FW_EQ_ARM_BASE_OFFSET 0x40 +#define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_FW, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + MTHCA_GET(dev->fw_ver, outbox, QUERY_FW_VER_OFFSET); + /* + * FW subminor version is at more significant bits than minor + * version, so swap here. + */ + dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) | + ((dev->fw_ver & 0xffff0000ull) >> 16) | + ((dev->fw_ver & 0x0000ffffull) << 16); + + MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); + dev->cmd.max_cmds = 1 << lg; + + mthca_dbg(dev, "FW version %012llx, max commands %d\n", + (unsigned long long) dev->fw_ver, dev->cmd.max_cmds); + + MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET); + MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET); + + mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n", + (unsigned long long) dev->catas_err.addr, dev->catas_err.size); + + MTHCA_GET(tmp, outbox, QUERY_FW_CMD_DB_EN_OFFSET); + if (tmp & 0x1) { + mthca_dbg(dev, "FW supports commands through doorbells\n"); + + MTHCA_GET(base, outbox, QUERY_FW_CMD_DB_BASE); + for (i = 0; i < MTHCA_CMD_NUM_DBELL_DWORDS; ++i) + MTHCA_GET(dev->cmd.dbell_offsets[i], outbox, + QUERY_FW_CMD_DB_OFFSET + (i << 1)); + + mthca_setup_cmd_doorbells(dev, base); + } + + if (mthca_is_memfree(dev)) { + MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET); + MTHCA_GET(dev->fw.arbel.clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET); + MTHCA_GET(dev->fw.arbel.eq_arm_base, outbox, QUERY_FW_EQ_ARM_BASE_OFFSET); + MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET); + mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2); + + /* + * Round up number of system pages needed in case + * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE. 
+ */ + dev->fw.arbel.fw_pages = + ALIGN(dev->fw.arbel.fw_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >> + (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT); + + mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n", + (unsigned long long) dev->fw.arbel.clr_int_base, + (unsigned long long) dev->fw.arbel.eq_arm_base, + (unsigned long long) dev->fw.arbel.eq_set_ci_base); + } else { + MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET); + MTHCA_GET(dev->fw.tavor.fw_end, outbox, QUERY_FW_END_OFFSET); + + mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n", + (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10), + (unsigned long long) dev->fw.tavor.fw_start, + (unsigned long long) dev->fw.tavor.fw_end); + } + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_ENABLE_LAM(struct mthca_dev *dev) +{ + struct mthca_mailbox *mailbox; + u8 info; + u32 *outbox; + int err = 0; + +#define ENABLE_LAM_OUT_SIZE 0x100 +#define ENABLE_LAM_START_OFFSET 0x00 +#define ENABLE_LAM_END_OFFSET 0x08 +#define ENABLE_LAM_INFO_OFFSET 0x13 + +#define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4) +#define ENABLE_LAM_INFO_ECC_MASK 0x3 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_ENABLE_LAM, + CMD_TIME_CLASS_C); + + if (err) + goto out; + + MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET); + MTHCA_GET(dev->ddr_end, outbox, ENABLE_LAM_END_OFFSET); + MTHCA_GET(info, outbox, ENABLE_LAM_INFO_OFFSET); + + if (!!(info & ENABLE_LAM_INFO_HIDDEN_FLAG) != + !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + mthca_info(dev, "FW reports that HCA-attached memory " + "is %s hidden; does not match PCI config\n", + (info & ENABLE_LAM_INFO_HIDDEN_FLAG) ? + "" : "not"); + } + if (info & ENABLE_LAM_INFO_HIDDEN_FLAG) + mthca_dbg(dev, "HCA-attached memory is hidden.\n"); + + mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n", + (int) ((dev->ddr_end - dev->ddr_start) >> 10), + (unsigned long long) dev->ddr_start, + (unsigned long long) dev->ddr_end); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_DISABLE_LAM(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C); +} + +int mthca_QUERY_DDR(struct mthca_dev *dev) +{ + struct mthca_mailbox *mailbox; + u8 info; + u32 *outbox; + int err = 0; + +#define QUERY_DDR_OUT_SIZE 0x100 +#define QUERY_DDR_START_OFFSET 0x00 +#define QUERY_DDR_END_OFFSET 0x08 +#define QUERY_DDR_INFO_OFFSET 0x13 + +#define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4) +#define QUERY_DDR_INFO_ECC_MASK 0x3 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DDR, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET); + MTHCA_GET(dev->ddr_end, outbox, QUERY_DDR_END_OFFSET); + MTHCA_GET(info, outbox, QUERY_DDR_INFO_OFFSET); + + if (!!(info & QUERY_DDR_INFO_HIDDEN_FLAG) != + !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) { + mthca_info(dev, "FW reports that HCA-attached memory " + "is %s hidden; does not match PCI config\n", + (info & QUERY_DDR_INFO_HIDDEN_FLAG) ? 
+ "" : "not"); + } + if (info & QUERY_DDR_INFO_HIDDEN_FLAG) + mthca_dbg(dev, "HCA-attached memory is hidden.\n"); + + mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n", + (int) ((dev->ddr_end - dev->ddr_start) >> 10), + (unsigned long long) dev->ddr_start, + (unsigned long long) dev->ddr_end); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, + struct mthca_dev_lim *dev_lim) +{ + struct mthca_mailbox *mailbox; + u32 *outbox; + u8 field; + u16 size; + u16 stat_rate; + int err; + +#define QUERY_DEV_LIM_OUT_SIZE 0x100 +#define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET 0x10 +#define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET 0x11 +#define QUERY_DEV_LIM_RSVD_QP_OFFSET 0x12 +#define QUERY_DEV_LIM_MAX_QP_OFFSET 0x13 +#define QUERY_DEV_LIM_RSVD_SRQ_OFFSET 0x14 +#define QUERY_DEV_LIM_MAX_SRQ_OFFSET 0x15 +#define QUERY_DEV_LIM_RSVD_EEC_OFFSET 0x16 +#define QUERY_DEV_LIM_MAX_EEC_OFFSET 0x17 +#define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET 0x19 +#define QUERY_DEV_LIM_RSVD_CQ_OFFSET 0x1a +#define QUERY_DEV_LIM_MAX_CQ_OFFSET 0x1b +#define QUERY_DEV_LIM_MAX_MPT_OFFSET 0x1d +#define QUERY_DEV_LIM_RSVD_EQ_OFFSET 0x1e +#define QUERY_DEV_LIM_MAX_EQ_OFFSET 0x1f +#define QUERY_DEV_LIM_RSVD_MTT_OFFSET 0x20 +#define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET 0x21 +#define QUERY_DEV_LIM_RSVD_MRW_OFFSET 0x22 +#define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET 0x23 +#define QUERY_DEV_LIM_MAX_AV_OFFSET 0x27 +#define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET 0x29 +#define QUERY_DEV_LIM_MAX_RES_QP_OFFSET 0x2b +#define QUERY_DEV_LIM_MAX_RDMA_OFFSET 0x2f +#define QUERY_DEV_LIM_RSZ_SRQ_OFFSET 0x33 +#define QUERY_DEV_LIM_ACK_DELAY_OFFSET 0x35 +#define QUERY_DEV_LIM_MTU_WIDTH_OFFSET 0x36 +#define QUERY_DEV_LIM_VL_PORT_OFFSET 0x37 +#define QUERY_DEV_LIM_MAX_GID_OFFSET 0x3b +#define QUERY_DEV_LIM_RATE_SUPPORT_OFFSET 0x3c +#define QUERY_DEV_LIM_MAX_PKEY_OFFSET 0x3f +#define QUERY_DEV_LIM_FLAGS_OFFSET 0x44 +#define QUERY_DEV_LIM_RSVD_UAR_OFFSET 0x48 +#define QUERY_DEV_LIM_UAR_SZ_OFFSET 0x49 +#define QUERY_DEV_LIM_PAGE_SZ_OFFSET 0x4b +#define QUERY_DEV_LIM_MAX_SG_OFFSET 0x51 +#define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET 0x52 +#define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET 0x55 +#define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56 +#define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET 0x61 +#define QUERY_DEV_LIM_RSVD_MCG_OFFSET 0x62 +#define QUERY_DEV_LIM_MAX_MCG_OFFSET 0x63 +#define QUERY_DEV_LIM_RSVD_PD_OFFSET 0x64 +#define QUERY_DEV_LIM_MAX_PD_OFFSET 0x65 +#define QUERY_DEV_LIM_RSVD_RDD_OFFSET 0x66 +#define QUERY_DEV_LIM_MAX_RDD_OFFSET 0x67 +#define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET 0x80 +#define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET 0x82 +#define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET 0x84 +#define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET 0x86 +#define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET 0x88 +#define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET 0x8a +#define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET 0x8c +#define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET 0x8e +#define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET 0x90 +#define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET 0x92 +#define QUERY_DEV_LIM_PBL_SZ_OFFSET 0x96 +#define QUERY_DEV_LIM_BMME_FLAGS_OFFSET 0x97 +#define QUERY_DEV_LIM_RSVD_LKEY_OFFSET 0x98 +#define QUERY_DEV_LIM_LAMR_OFFSET 0x9f +#define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET 0xa0 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DEV_LIM, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET); + 
dev_lim->reserved_qps = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET); + dev_lim->max_qps = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET); + dev_lim->reserved_srqs = 1 << (field >> 4); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET); + dev_lim->max_srqs = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET); + dev_lim->reserved_eecs = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET); + dev_lim->max_eecs = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET); + dev_lim->max_cq_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET); + dev_lim->reserved_cqs = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET); + dev_lim->max_cqs = 1 << (field & 0x1f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET); + dev_lim->max_mpts = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET); + dev_lim->reserved_eqs = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET); + dev_lim->max_eqs = 1 << (field & 0x7); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET); + if (mthca_is_memfree(dev)) + dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64), + dev->limits.mtt_seg_size) / dev->limits.mtt_seg_size; + else + dev_lim->reserved_mtts = 1 << (field >> 4); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET); + dev_lim->max_mrw_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET); + dev_lim->reserved_mrws = 1 << (field & 0xf); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET); + dev_lim->max_mtt_seg = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET); + dev_lim->max_requester_per_qp = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET); + dev_lim->max_responder_per_qp = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET); + dev_lim->max_rdma_global = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET); + dev_lim->local_ca_ack_delay = field & 0x1f; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET); + dev_lim->max_mtu = field >> 4; + dev_lim->max_port_width = field & 0xf; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET); + dev_lim->max_vl = field >> 4; + dev_lim->num_ports = field & 0xf; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET); + dev_lim->max_gids = 1 << (field & 0xf); + MTHCA_GET(stat_rate, outbox, QUERY_DEV_LIM_RATE_SUPPORT_OFFSET); + dev_lim->stat_rate_support = stat_rate; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET); + dev_lim->max_pkeys = 1 << (field & 0xf); + MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET); + dev_lim->reserved_uars = field >> 4; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET); + dev_lim->uar_size = 1 << ((field & 0x3f) + 20); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET); + dev_lim->min_page_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET); + dev_lim->max_sg = field; + + MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET); + dev_lim->max_desc_sz = size; + + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET); + dev_lim->max_qp_per_mcg = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET); + dev_lim->reserved_mgms = field & 0xf; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET); 
+ dev_lim->max_mcgs = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET); + dev_lim->reserved_pds = field >> 4; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET); + dev_lim->max_pds = 1 << (field & 0x3f); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_RDD_OFFSET); + dev_lim->reserved_rdds = field >> 4; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET); + dev_lim->max_rdds = 1 << (field & 0x3f); + + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET); + dev_lim->eec_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET); + dev_lim->qpc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET); + dev_lim->eeec_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET); + dev_lim->eqpc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET); + dev_lim->eqc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET); + dev_lim->cqc_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET); + dev_lim->srq_entry_sz = size; + MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET); + dev_lim->uar_scratch_entry_sz = size; + + if (mthca_is_memfree(dev)) { + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); + dev_lim->max_srq_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); + dev_lim->max_qp_sz = 1 << field; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET); + dev_lim->hca.arbel.resize_srq = field & 1; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_RQ_OFFSET); + dev_lim->max_sg = min_t(int, field, dev_lim->max_sg); + MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET); + dev_lim->max_desc_sz = min_t(int, size, dev_lim->max_desc_sz); + MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET); + dev_lim->mpt_entry_sz = size; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET); + dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f); + MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox, + QUERY_DEV_LIM_BMME_FLAGS_OFFSET); + MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox, + QUERY_DEV_LIM_RSVD_LKEY_OFFSET); + MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET); + dev_lim->hca.arbel.lam_required = field & 1; + MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox, + QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET); + + if (dev_lim->hca.arbel.bmme_flags & 1) + mthca_dbg(dev, "Base MM extensions: yes " + "(flags %d, max PBL %d, rsvd L_Key %08x)\n", + dev_lim->hca.arbel.bmme_flags, + dev_lim->hca.arbel.max_pbl_sz, + dev_lim->hca.arbel.reserved_lkey); + else + mthca_dbg(dev, "Base MM extensions: no\n"); + + mthca_dbg(dev, "Max ICM size %lld MB\n", + (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20); + } else { + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET); + dev_lim->max_srq_sz = (1 << field) - 1; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET); + dev_lim->max_qp_sz = (1 << field) - 1; + MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET); + dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f); + dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE; + } + + mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n", + dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz); + mthca_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n", + dev_lim->max_srqs, dev_lim->reserved_srqs, dev_lim->srq_entry_sz); + mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n", + dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz); + mthca_dbg(dev, "Max EQs: %d, 
reserved EQs: %d, entry size: %d\n", + dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz); + mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n", + dev_lim->reserved_mrws, dev_lim->reserved_mtts); + mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n", + dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars); + mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n", + dev_lim->max_pds, dev_lim->reserved_mgms); + mthca_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n", + dev_lim->max_cq_sz, dev_lim->max_qp_sz, dev_lim->max_srq_sz); + + mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +static void get_board_id(void *vsd, char *board_id) +{ + int i; + +#define VSD_OFFSET_SIG1 0x00 +#define VSD_OFFSET_SIG2 0xde +#define VSD_OFFSET_MLX_BOARD_ID 0xd0 +#define VSD_OFFSET_TS_BOARD_ID 0x20 + +#define VSD_SIGNATURE_TOPSPIN 0x5ad + + memset(board_id, 0, MTHCA_BOARD_ID_LEN); + + if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN && + be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) { + strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MTHCA_BOARD_ID_LEN); + } else { + /* + * The board ID is a string but the firmware byte + * swaps each 4-byte word before passing it back to + * us. Therefore we need to swab it before printing. + */ + for (i = 0; i < 4; ++i) + ((u32 *) board_id)[i] = + swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4)); + } +} + +int mthca_QUERY_ADAPTER(struct mthca_dev *dev, + struct mthca_adapter *adapter) +{ + struct mthca_mailbox *mailbox; + u32 *outbox; + int err; + +#define QUERY_ADAPTER_OUT_SIZE 0x100 +#define QUERY_ADAPTER_VENDOR_ID_OFFSET 0x00 +#define QUERY_ADAPTER_DEVICE_ID_OFFSET 0x04 +#define QUERY_ADAPTER_REVISION_ID_OFFSET 0x08 +#define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10 +#define QUERY_ADAPTER_VSD_OFFSET 0x20 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_ADAPTER, + CMD_TIME_CLASS_A); + + if (err) + goto out; + + if (!mthca_is_memfree(dev)) { + MTHCA_GET(adapter->vendor_id, outbox, + QUERY_ADAPTER_VENDOR_ID_OFFSET); + MTHCA_GET(adapter->device_id, outbox, + QUERY_ADAPTER_DEVICE_ID_OFFSET); + MTHCA_GET(adapter->revision_id, outbox, + QUERY_ADAPTER_REVISION_ID_OFFSET); + } + MTHCA_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET); + + get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4, + adapter->board_id); + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_INIT_HCA(struct mthca_dev *dev, + struct mthca_init_hca_param *param) +{ + struct mthca_mailbox *mailbox; + __be32 *inbox; + int err; + +#define INIT_HCA_IN_SIZE 0x200 +#define INIT_HCA_FLAGS1_OFFSET 0x00c +#define INIT_HCA_FLAGS2_OFFSET 0x014 +#define INIT_HCA_QPC_OFFSET 0x020 +#define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) +#define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17) +#define INIT_HCA_EEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x20) +#define INIT_HCA_LOG_EEC_OFFSET (INIT_HCA_QPC_OFFSET + 0x27) +#define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28) +#define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f) +#define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) +#define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) +#define INIT_HCA_EQPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) +#define INIT_HCA_EEEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) +#define 
INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) +#define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67) +#define INIT_HCA_RDB_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70) +#define INIT_HCA_UDAV_OFFSET 0x0b0 +#define INIT_HCA_UDAV_LKEY_OFFSET (INIT_HCA_UDAV_OFFSET + 0x0) +#define INIT_HCA_UDAV_PD_OFFSET (INIT_HCA_UDAV_OFFSET + 0x4) +#define INIT_HCA_MCAST_OFFSET 0x0c0 +#define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00) +#define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12) +#define INIT_HCA_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16) +#define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) +#define INIT_HCA_TPT_OFFSET 0x0f0 +#define INIT_HCA_MPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) +#define INIT_HCA_MTT_SEG_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x09) +#define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b) +#define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10) +#define INIT_HCA_UAR_OFFSET 0x120 +#define INIT_HCA_UAR_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x00) +#define INIT_HCA_UARC_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x09) +#define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a) +#define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b) +#define INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10) +#define INIT_HCA_UAR_CTX_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x18) + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, INIT_HCA_IN_SIZE); + + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + MTHCA_PUT(inbox, 0x1, INIT_HCA_FLAGS1_OFFSET); + +#if defined(__LITTLE_ENDIAN) + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) &= ~cpu_to_be32(1 << 1); +#elif defined(__BIG_ENDIAN) + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1 << 1); +#else +#error Host endianness not defined +#endif + /* Check port for UD address vector: */ + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1); + + /* Enable IPoIB checksumming if we can: */ + if (dev->device_cap_flags & IB_DEVICE_UD_IP_CSUM) + *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(7 << 3); + + /* We leave wqe_quota, responder_exu, etc as 0 (default) */ + + /* QPC/EEC/CQC/EQC/RDB attributes */ + + MTHCA_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET); + MTHCA_PUT(inbox, param->eec_base, INIT_HCA_EEC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET); + MTHCA_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET); + MTHCA_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET); + MTHCA_PUT(inbox, param->eqpc_base, INIT_HCA_EQPC_BASE_OFFSET); + MTHCA_PUT(inbox, param->eeec_base, INIT_HCA_EEEC_BASE_OFFSET); + MTHCA_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET); + MTHCA_PUT(inbox, param->rdb_base, INIT_HCA_RDB_BASE_OFFSET); + + /* UD AV attributes */ + + /* multicast attributes */ + + MTHCA_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); + MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); + MTHCA_PUT(inbox, param->mc_hash_sz, INIT_HCA_MC_HASH_SZ_OFFSET); + MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); + + /* TPT attributes */ + + MTHCA_PUT(inbox, param->mpt_base, INIT_HCA_MPT_BASE_OFFSET); + if (!mthca_is_memfree(dev)) + 
MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET); + MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET); + MTHCA_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET); + + /* UAR attributes */ + { + u8 uar_page_sz = PAGE_SHIFT - 12; + MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET); + } + + MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET); + + if (mthca_is_memfree(dev)) { + MTHCA_PUT(inbox, param->log_uarc_sz, INIT_HCA_UARC_SZ_OFFSET); + MTHCA_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); + MTHCA_PUT(inbox, param->uarc_base, INIT_HCA_UAR_CTX_BASE_OFFSET); + } + + err = mthca_cmd(dev, mailbox->dma, 0, 0, + CMD_INIT_HCA, CMD_TIME_CLASS_D); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_INIT_IB(struct mthca_dev *dev, + struct mthca_init_ib_param *param, + int port) +{ + struct mthca_mailbox *mailbox; + u32 *inbox; + int err; + u32 flags; + +#define INIT_IB_IN_SIZE 56 +#define INIT_IB_FLAGS_OFFSET 0x00 +#define INIT_IB_FLAG_SIG (1 << 18) +#define INIT_IB_FLAG_NG (1 << 17) +#define INIT_IB_FLAG_G0 (1 << 16) +#define INIT_IB_VL_SHIFT 4 +#define INIT_IB_PORT_WIDTH_SHIFT 8 +#define INIT_IB_MTU_SHIFT 12 +#define INIT_IB_MAX_GID_OFFSET 0x06 +#define INIT_IB_MAX_PKEY_OFFSET 0x0a +#define INIT_IB_GUID0_OFFSET 0x10 +#define INIT_IB_NODE_GUID_OFFSET 0x18 +#define INIT_IB_SI_GUID_OFFSET 0x20 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, INIT_IB_IN_SIZE); + + flags = 0; + flags |= param->set_guid0 ? INIT_IB_FLAG_G0 : 0; + flags |= param->set_node_guid ? INIT_IB_FLAG_NG : 0; + flags |= param->set_si_guid ? INIT_IB_FLAG_SIG : 0; + flags |= param->vl_cap << INIT_IB_VL_SHIFT; + flags |= param->port_width << INIT_IB_PORT_WIDTH_SHIFT; + flags |= param->mtu_cap << INIT_IB_MTU_SHIFT; + MTHCA_PUT(inbox, flags, INIT_IB_FLAGS_OFFSET); + + MTHCA_PUT(inbox, param->gid_cap, INIT_IB_MAX_GID_OFFSET); + MTHCA_PUT(inbox, param->pkey_cap, INIT_IB_MAX_PKEY_OFFSET); + MTHCA_PUT(inbox, param->guid0, INIT_IB_GUID0_OFFSET); + MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET); + MTHCA_PUT(inbox, param->si_guid, INIT_IB_SI_GUID_OFFSET); + + err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_INIT_IB, + CMD_TIME_CLASS_A); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_CLOSE_IB(struct mthca_dev *dev, int port) +{ + return mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, CMD_TIME_CLASS_A); +} + +int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic) +{ + return mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, CMD_TIME_CLASS_C); +} + +int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param, + int port) +{ + struct mthca_mailbox *mailbox; + u32 *inbox; + int err; + u32 flags = 0; + +#define SET_IB_IN_SIZE 0x40 +#define SET_IB_FLAGS_OFFSET 0x00 +#define SET_IB_FLAG_SIG (1 << 18) +#define SET_IB_FLAG_RQK (1 << 0) +#define SET_IB_CAP_MASK_OFFSET 0x04 +#define SET_IB_SI_GUID_OFFSET 0x08 + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, SET_IB_IN_SIZE); + + flags |= param->set_si_guid ? SET_IB_FLAG_SIG : 0; + flags |= param->reset_qkey_viol ? 
SET_IB_FLAG_RQK : 0; + MTHCA_PUT(inbox, flags, SET_IB_FLAGS_OFFSET); + + MTHCA_PUT(inbox, param->cap_mask, SET_IB_CAP_MASK_OFFSET); + MTHCA_PUT(inbox, param->si_guid, SET_IB_SI_GUID_OFFSET); + + err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_SET_IB, + CMD_TIME_CLASS_B); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt) +{ + return mthca_map_cmd(dev, CMD_MAP_ICM, icm, virt); +} + +int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt) +{ + struct mthca_mailbox *mailbox; + __be64 *inbox; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + inbox[0] = cpu_to_be64(virt); + inbox[1] = cpu_to_be64(dma_addr); + + err = mthca_cmd(dev, mailbox->dma, 1, 0, CMD_MAP_ICM, + CMD_TIME_CLASS_B); + + mthca_free_mailbox(dev, mailbox); + + if (!err) + mthca_dbg(dev, "Mapped page at %llx to %llx for ICM.\n", + (unsigned long long) dma_addr, (unsigned long long) virt); + + return err; +} + +int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count) +{ + mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n", + page_count, (unsigned long long) virt); + + return mthca_cmd(dev, virt, page_count, 0, + CMD_UNMAP_ICM, CMD_TIME_CLASS_B); +} + +int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm) +{ + return mthca_map_cmd(dev, CMD_MAP_ICM_AUX, icm, -1); +} + +int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_ICM_AUX, CMD_TIME_CLASS_B); +} + +int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages) +{ + int ret = mthca_cmd_imm(dev, icm_size, aux_pages, 0, + 0, CMD_SET_ICM_SIZE, CMD_TIME_CLASS_A); + + if (ret) + return ret; + + /* + * Round up number of system pages needed in case + * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE. + */ + *aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >> + (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT); + + return 0; +} + +int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index) +{ + return mthca_cmd(dev, mailbox->dma, mpt_index, 0, CMD_SW2HW_MPT, + CMD_TIME_CLASS_B); +} + +int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index) +{ + return mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index, + !mailbox, CMD_HW2SW_MPT, + CMD_TIME_CLASS_B); +} + +int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int num_mtt) +{ + return mthca_cmd(dev, mailbox->dma, num_mtt, 0, CMD_WRITE_MTT, + CMD_TIME_CLASS_B); +} + +int mthca_SYNC_TPT(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B); +} + +int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap, + int eq_num) +{ + mthca_dbg(dev, "%s mask %016llx for eqn %d\n", + unmap ? 
"Clearing" : "Setting", + (unsigned long long) event_mask, eq_num); + return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num, + 0, CMD_MAP_EQ, CMD_TIME_CLASS_B); +} + +int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num) +{ + return mthca_cmd(dev, mailbox->dma, eq_num, 0, CMD_SW2HW_EQ, + CMD_TIME_CLASS_A); +} + +int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, eq_num, 0, + CMD_HW2SW_EQ, + CMD_TIME_CLASS_A); +} + +int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num) +{ + return mthca_cmd(dev, mailbox->dma, cq_num, 0, CMD_SW2HW_CQ, + CMD_TIME_CLASS_A); +} + +int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, cq_num, 0, + CMD_HW2SW_CQ, + CMD_TIME_CLASS_A); +} + +int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size) +{ + struct mthca_mailbox *mailbox; + __be32 *inbox; + int err; + +#define RESIZE_CQ_IN_SIZE 0x40 +#define RESIZE_CQ_LOG_SIZE_OFFSET 0x0c +#define RESIZE_CQ_LKEY_OFFSET 0x1c + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + memset(inbox, 0, RESIZE_CQ_IN_SIZE); + /* + * Leave start address fields zeroed out -- mthca assumes that + * MRs for CQs always start at virtual address 0. + */ + MTHCA_PUT(inbox, log_size, RESIZE_CQ_LOG_SIZE_OFFSET); + MTHCA_PUT(inbox, lkey, RESIZE_CQ_LKEY_OFFSET); + + err = mthca_cmd(dev, mailbox->dma, cq_num, 1, CMD_RESIZE_CQ, + CMD_TIME_CLASS_B); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num) +{ + return mthca_cmd(dev, mailbox->dma, srq_num, 0, CMD_SW2HW_SRQ, + CMD_TIME_CLASS_A); +} + +int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, srq_num, 0, + CMD_HW2SW_SRQ, + CMD_TIME_CLASS_A); +} + +int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, num, 0, + CMD_QUERY_SRQ, CMD_TIME_CLASS_A); +} + +int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit) +{ + return mthca_cmd(dev, limit, srq_num, 0, CMD_ARM_SRQ, + CMD_TIME_CLASS_B); +} + +int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur, + enum ib_qp_state next, u32 num, int is_ee, + struct mthca_mailbox *mailbox, u32 optmask) +{ + static const u16 op[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_INIT] = CMD_RST2INIT_QPEE, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_INIT] = CMD_INIT2INIT_QPEE, + [IB_QPS_RTR] = CMD_INIT2RTR_QPEE, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_RTR2RTS_QPEE, + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_RTS2RTS_QPEE, + [IB_QPS_SQD] = CMD_RTS2SQD_QPEE, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_SQD2RTS_QPEE, + [IB_QPS_SQD] = CMD_SQD2SQD_QPEE, + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + [IB_QPS_RTS] = CMD_SQERR2RTS_QPEE, + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = 
CMD_ERR2RST_QPEE, + [IB_QPS_ERR] = CMD_2ERR_QPEE, + } + }; + + u8 op_mod = 0; + int my_mailbox = 0; + int err; + + if (op[cur][next] == CMD_ERR2RST_QPEE) { + op_mod = 3; /* don't write outbox, any->reset */ + + /* For debugging */ + if (!mailbox) { + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (!IS_ERR(mailbox)) { + my_mailbox = 1; + op_mod = 2; /* write outbox, any->reset */ + } else + mailbox = NULL; + } + + err = mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, + (!!is_ee << 24) | num, op_mod, + op[cur][next], CMD_TIME_CLASS_C); + + if (0 && mailbox) { + int i; + mthca_dbg(dev, "Dumping QP context:\n"); + printk(" %08x\n", be32_to_cpup(mailbox->buf)); + for (i = 0; i < 0x100 / 4; ++i) { + if (i % 8 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) mailbox->buf)[i + 2])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + } + + if (my_mailbox) + mthca_free_mailbox(dev, mailbox); + } else { + if (0) { + int i; + mthca_dbg(dev, "Dumping QP context:\n"); + printk(" opt param mask: %08x\n", be32_to_cpup(mailbox->buf)); + for (i = 0; i < 0x100 / 4; ++i) { + if (i % 8 == 0) + printk(" [%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) mailbox->buf)[i + 2])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + } + + err = mthca_cmd(dev, mailbox->dma, optmask | (!!is_ee << 24) | num, + op_mod, op[cur][next], CMD_TIME_CLASS_C); + } + + return err; +} + +int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, (!!is_ee << 24) | num, 0, + CMD_QUERY_QPEE, CMD_TIME_CLASS_A); +} + +int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn) +{ + u8 op_mod; + + switch (type) { + case IB_QPT_SMI: + op_mod = 0; + break; + case IB_QPT_GSI: + op_mod = 1; + break; + case IB_QPT_RAW_IPV6: + op_mod = 2; + break; + case IB_QPT_RAW_ETHERTYPE: + op_mod = 3; + break; + default: + return -EINVAL; + } + + return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP, + CMD_TIME_CLASS_B); +} + +int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, + int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const void *in_mad, void *response_mad) +{ + struct mthca_mailbox *inmailbox, *outmailbox; + void *inbox; + int err; + u32 in_modifier = port; + u8 op_modifier = 0; + +#define MAD_IFC_BOX_SIZE 0x400 +#define MAD_IFC_MY_QPN_OFFSET 0x100 +#define MAD_IFC_RQPN_OFFSET 0x108 +#define MAD_IFC_SL_OFFSET 0x10c +#define MAD_IFC_G_PATH_OFFSET 0x10d +#define MAD_IFC_RLID_OFFSET 0x10e +#define MAD_IFC_PKEY_OFFSET 0x112 +#define MAD_IFC_GRH_OFFSET 0x140 + + inmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + inbox = inmailbox->buf; + + outmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(outmailbox)) { + mthca_free_mailbox(dev, inmailbox); + return PTR_ERR(outmailbox); + } + + memcpy(inbox, in_mad, 256); + + /* + * Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + if (in_wc) { + u8 val; + + memset(inbox + 256, 0, 256); + + MTHCA_PUT(inbox, in_wc->qp->qp_num, MAD_IFC_MY_QPN_OFFSET); + MTHCA_PUT(inbox, in_wc->src_qp, MAD_IFC_RQPN_OFFSET); + + val = in_wc->sl << 4; + MTHCA_PUT(inbox, val, MAD_IFC_SL_OFFSET); + + val = in_wc->dlid_path_bits | + (in_wc->wc_flags & IB_WC_GRH ? 
0x80 : 0); + MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET); + + MTHCA_PUT(inbox, ib_lid_cpu16(in_wc->slid), MAD_IFC_RLID_OFFSET); + MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET); + + if (in_grh) + memcpy(inbox + MAD_IFC_GRH_OFFSET, in_grh, 40); + + op_modifier |= 0x4; + + in_modifier |= ib_lid_cpu16(in_wc->slid) << 16; + } + + err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma, + in_modifier, op_modifier, + CMD_MAD_IFC, CMD_TIME_CLASS_C); + + if (!err) + memcpy(response_mad, outmailbox->buf, 256); + + mthca_free_mailbox(dev, inmailbox); + mthca_free_mailbox(dev, outmailbox); + return err; +} + +int mthca_READ_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd_box(dev, 0, mailbox->dma, index, 0, + CMD_READ_MGM, CMD_TIME_CLASS_A); +} + +int mthca_WRITE_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox) +{ + return mthca_cmd(dev, mailbox->dma, index, 0, CMD_WRITE_MGM, + CMD_TIME_CLASS_A); +} + +int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + u16 *hash) +{ + u64 imm; + int err; + + err = mthca_cmd_imm(dev, mailbox->dma, &imm, 0, 0, CMD_MGID_HASH, + CMD_TIME_CLASS_A); + + *hash = imm; + return err; +} + +int mthca_NOP(struct mthca_dev *dev) +{ + return mthca_cmd(dev, 0, 0x1f, 0, CMD_NOP, msecs_to_jiffies(100)); +} diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h new file mode 100644 index 0000000..d2e5b19 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_cmd.h @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MTHCA_CMD_H +#define MTHCA_CMD_H + +#include <rdma/ib_verbs.h> + +#define MTHCA_MAILBOX_SIZE 4096 + +enum { + /* command completed successfully: */ + MTHCA_CMD_STAT_OK = 0x00, + /* Internal error (such as a bus error) occurred while processing command: */ + MTHCA_CMD_STAT_INTERNAL_ERR = 0x01, + /* Operation/command not supported or opcode modifier not supported: */ + MTHCA_CMD_STAT_BAD_OP = 0x02, + /* Parameter not supported or parameter out of range: */ + MTHCA_CMD_STAT_BAD_PARAM = 0x03, + /* System not enabled or bad system state: */ + MTHCA_CMD_STAT_BAD_SYS_STATE = 0x04, + /* Attempt to access reserved or unallocated resource: */ + MTHCA_CMD_STAT_BAD_RESOURCE = 0x05, + /* Requested resource is currently executing a command, or is otherwise busy: */ + MTHCA_CMD_STAT_RESOURCE_BUSY = 0x06, + /* memory error: */ + MTHCA_CMD_STAT_DDR_MEM_ERR = 0x07, + /* Required capability exceeds device limits: */ + MTHCA_CMD_STAT_EXCEED_LIM = 0x08, + /* Resource is not in the appropriate state or ownership: */ + MTHCA_CMD_STAT_BAD_RES_STATE = 0x09, + /* Index out of range: */ + MTHCA_CMD_STAT_BAD_INDEX = 0x0a, + /* FW image corrupted: */ + MTHCA_CMD_STAT_BAD_NVMEM = 0x0b, + /* Attempt to modify a QP/EE which is not in the presumed state: */ + MTHCA_CMD_STAT_BAD_QPEE_STATE = 0x10, + /* Bad segment parameters (Address/Size): */ + MTHCA_CMD_STAT_BAD_SEG_PARAM = 0x20, + /* Memory Region has Memory Windows bound to it: */ + MTHCA_CMD_STAT_REG_BOUND = 0x21, + /* HCA local attached memory not present: */ + MTHCA_CMD_STAT_LAM_NOT_PRE = 0x22, + /* Bad management packet (silently discarded): */ + MTHCA_CMD_STAT_BAD_PKT = 0x30, + /* More outstanding CQEs in CQ than new CQ size: */ + MTHCA_CMD_STAT_BAD_SIZE = 0x40 +}; + +enum { + MTHCA_TRANS_INVALID = 0, + MTHCA_TRANS_RST2INIT, + MTHCA_TRANS_INIT2INIT, + MTHCA_TRANS_INIT2RTR, + MTHCA_TRANS_RTR2RTS, + MTHCA_TRANS_RTS2RTS, + MTHCA_TRANS_SQERR2RTS, + MTHCA_TRANS_ANY2ERR, + MTHCA_TRANS_RTS2SQD, + MTHCA_TRANS_SQD2SQD, + MTHCA_TRANS_SQD2RTS, + MTHCA_TRANS_ANY2RST, +}; + +enum { + DEV_LIM_FLAG_RC = 1 << 0, + DEV_LIM_FLAG_UC = 1 << 1, + DEV_LIM_FLAG_UD = 1 << 2, + DEV_LIM_FLAG_RD = 1 << 3, + DEV_LIM_FLAG_RAW_IPV6 = 1 << 4, + DEV_LIM_FLAG_RAW_ETHER = 1 << 5, + DEV_LIM_FLAG_SRQ = 1 << 6, + DEV_LIM_FLAG_IPOIB_CSUM = 1 << 7, + DEV_LIM_FLAG_BAD_PKEY_CNTR = 1 << 8, + DEV_LIM_FLAG_BAD_QKEY_CNTR = 1 << 9, + DEV_LIM_FLAG_MW = 1 << 16, + DEV_LIM_FLAG_AUTO_PATH_MIG = 1 << 17, + DEV_LIM_FLAG_ATOMIC = 1 << 18, + DEV_LIM_FLAG_RAW_MULTI = 1 << 19, + DEV_LIM_FLAG_UD_AV_PORT_ENFORCE = 1 << 20, + DEV_LIM_FLAG_UD_MULTI = 1 << 21, +}; + +struct mthca_mailbox { + dma_addr_t dma; + void *buf; +}; + +struct mthca_dev_lim { + int max_srq_sz; + int max_qp_sz; + int reserved_qps; + int max_qps; + int reserved_srqs; + int max_srqs; + int reserved_eecs; + int max_eecs; + int max_cq_sz; + int reserved_cqs; + int max_cqs; + int max_mpts; + int reserved_eqs; + int max_eqs; + int reserved_mtts; + int max_mrw_sz; + int reserved_mrws; + int max_mtt_seg; + int max_requester_per_qp; + int max_responder_per_qp; + int max_rdma_global; + int local_ca_ack_delay; + int max_mtu; + int max_port_width; + int max_vl; + int num_ports; + int max_gids; + u16 stat_rate_support; + int max_pkeys; + u32 flags; + int reserved_uars; + int uar_size; + int min_page_sz; + int max_sg; + int max_desc_sz; + int max_qp_per_mcg; + int reserved_mgms; + int max_mcgs; + int reserved_pds; + int max_pds; + int reserved_rdds; + int max_rdds; + int eec_entry_sz; + int qpc_entry_sz; + int eeec_entry_sz; + int eqpc_entry_sz; + int eqc_entry_sz; +
int cqc_entry_sz; + int srq_entry_sz; + int uar_scratch_entry_sz; + int mpt_entry_sz; + union { + struct { + int max_avs; + } tavor; + struct { + int resize_srq; + int max_pbl_sz; + u8 bmme_flags; + u32 reserved_lkey; + int lam_required; + u64 max_icm_sz; + } arbel; + } hca; +}; + +struct mthca_adapter { + u32 vendor_id; + u32 device_id; + u32 revision_id; + char board_id[MTHCA_BOARD_ID_LEN]; + u8 inta_pin; +}; + +struct mthca_init_hca_param { + u64 qpc_base; + u64 eec_base; + u64 srqc_base; + u64 cqc_base; + u64 eqpc_base; + u64 eeec_base; + u64 eqc_base; + u64 rdb_base; + u64 mc_base; + u64 mpt_base; + u64 mtt_base; + u64 uar_scratch_base; + u64 uarc_base; + u16 log_mc_entry_sz; + u16 mc_hash_sz; + u8 log_num_qps; + u8 log_num_eecs; + u8 log_num_srqs; + u8 log_num_cqs; + u8 log_num_eqs; + u8 log_mc_table_sz; + u8 mtt_seg_sz; + u8 log_mpt_sz; + u8 log_uar_sz; + u8 log_uarc_sz; +}; + +struct mthca_init_ib_param { + int port_width; + int vl_cap; + int mtu_cap; + u16 gid_cap; + u16 pkey_cap; + int set_guid0; + u64 guid0; + int set_node_guid; + u64 node_guid; + int set_si_guid; + u64 si_guid; +}; + +struct mthca_set_ib_param { + int set_si_guid; + int reset_qkey_viol; + u64 si_guid; + u32 cap_mask; +}; + +int mthca_cmd_init(struct mthca_dev *dev); +void mthca_cmd_cleanup(struct mthca_dev *dev); +int mthca_cmd_use_events(struct mthca_dev *dev); +void mthca_cmd_use_polling(struct mthca_dev *dev); +void mthca_cmd_event(struct mthca_dev *dev, u16 token, + u8 status, u64 out_param); + +struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev, + gfp_t gfp_mask); +void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox); + +int mthca_SYS_EN(struct mthca_dev *dev); +int mthca_SYS_DIS(struct mthca_dev *dev); +int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm); +int mthca_UNMAP_FA(struct mthca_dev *dev); +int mthca_RUN_FW(struct mthca_dev *dev); +int mthca_QUERY_FW(struct mthca_dev *dev); +int mthca_ENABLE_LAM(struct mthca_dev *dev); +int mthca_DISABLE_LAM(struct mthca_dev *dev); +int mthca_QUERY_DDR(struct mthca_dev *dev); +int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, + struct mthca_dev_lim *dev_lim); +int mthca_QUERY_ADAPTER(struct mthca_dev *dev, + struct mthca_adapter *adapter); +int mthca_INIT_HCA(struct mthca_dev *dev, + struct mthca_init_hca_param *param); +int mthca_INIT_IB(struct mthca_dev *dev, + struct mthca_init_ib_param *param, + int port); +int mthca_CLOSE_IB(struct mthca_dev *dev, int port); +int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic); +int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param, + int port); +int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt); +int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt); +int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count); +int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm); +int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev); +int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages); +int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index); +int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int mpt_index); +int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int num_mtt); +int mthca_SYNC_TPT(struct mthca_dev *dev); +int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap, + int eq_num); +int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num); +int 
mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int eq_num); +int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num); +int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int cq_num); +int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size); +int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num); +int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + int srq_num); +int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num, + struct mthca_mailbox *mailbox); +int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit); +int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur, + enum ib_qp_state next, u32 num, int is_ee, + struct mthca_mailbox *mailbox, u32 optmask); +int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee, + struct mthca_mailbox *mailbox); +int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn); +int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, + int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const void *in_mad, void *response_mad); +int mthca_READ_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox); +int mthca_WRITE_MGM(struct mthca_dev *dev, int index, + struct mthca_mailbox *mailbox); +int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox, + u16 *hash); +int mthca_NOP(struct mthca_dev *dev); + +#endif /* MTHCA_CMD_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_config_reg.h b/drivers/infiniband/hw/mthca/mthca_config_reg.h new file mode 100644 index 0000000..155bc66 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_config_reg.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MTHCA_CONFIG_REG_H +#define MTHCA_CONFIG_REG_H + +#define MTHCA_HCR_BASE 0x80680 +#define MTHCA_HCR_SIZE 0x0001c +#define MTHCA_ECR_BASE 0x80700 +#define MTHCA_ECR_SIZE 0x00008 +#define MTHCA_ECR_CLR_BASE 0x80708 +#define MTHCA_ECR_CLR_SIZE 0x00008 +#define MTHCA_MAP_ECR_SIZE (MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE) +#define MTHCA_CLR_INT_BASE 0xf00d8 +#define MTHCA_CLR_INT_SIZE 0x00008 +#define MTHCA_EQ_SET_CI_SIZE (8 * 32) + +#endif /* MTHCA_CONFIG_REG_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c new file mode 100644 index 0000000..a6531ff --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -0,0 +1,981 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" + +enum { + MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE +}; + +enum { + MTHCA_CQ_ENTRY_SIZE = 0x20 +}; + +enum { + MTHCA_ATOMIC_BYTE_LEN = 8 +}; + +/* + * Must be packed because start is 64 bits but only aligned to 32 bits. 
+ */ +struct mthca_cq_context { + __be32 flags; + __be64 start; + __be32 logsize_usrpage; + __be32 error_eqn; /* Tavor only */ + __be32 comp_eqn; + __be32 pd; + __be32 lkey; + __be32 last_notified_index; + __be32 solicit_producer_index; + __be32 consumer_index; + __be32 producer_index; + __be32 cqn; + __be32 ci_db; /* Arbel only */ + __be32 state_db; /* Arbel only */ + u32 reserved; +} __attribute__((packed)); + +#define MTHCA_CQ_STATUS_OK ( 0 << 28) +#define MTHCA_CQ_STATUS_OVERFLOW ( 9 << 28) +#define MTHCA_CQ_STATUS_WRITE_FAIL (10 << 28) +#define MTHCA_CQ_FLAG_TR ( 1 << 18) +#define MTHCA_CQ_FLAG_OI ( 1 << 17) +#define MTHCA_CQ_STATE_DISARMED ( 0 << 8) +#define MTHCA_CQ_STATE_ARMED ( 1 << 8) +#define MTHCA_CQ_STATE_ARMED_SOL ( 4 << 8) +#define MTHCA_EQ_STATE_FIRED (10 << 8) + +enum { + MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe +}; + +enum { + SYNDROME_LOCAL_LENGTH_ERR = 0x01, + SYNDROME_LOCAL_QP_OP_ERR = 0x02, + SYNDROME_LOCAL_EEC_OP_ERR = 0x03, + SYNDROME_LOCAL_PROT_ERR = 0x04, + SYNDROME_WR_FLUSH_ERR = 0x05, + SYNDROME_MW_BIND_ERR = 0x06, + SYNDROME_BAD_RESP_ERR = 0x10, + SYNDROME_LOCAL_ACCESS_ERR = 0x11, + SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + SYNDROME_REMOTE_ACCESS_ERR = 0x13, + SYNDROME_REMOTE_OP_ERR = 0x14, + SYNDROME_RETRY_EXC_ERR = 0x15, + SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + SYNDROME_LOCAL_RDD_VIOL_ERR = 0x20, + SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21, + SYNDROME_REMOTE_ABORTED_ERR = 0x22, + SYNDROME_INVAL_EECN_ERR = 0x23, + SYNDROME_INVAL_EEC_STATE_ERR = 0x24 +}; + +struct mthca_cqe { + __be32 my_qpn; + __be32 my_ee; + __be32 rqpn; + u8 sl_ipok; + u8 g_mlpath; + __be16 rlid; + __be32 imm_etype_pkey_eec; + __be32 byte_cnt; + __be32 wqe; + u8 opcode; + u8 is_send; + u8 reserved; + u8 owner; +}; + +struct mthca_err_cqe { + __be32 my_qpn; + u32 reserved1[3]; + u8 syndrome; + u8 vendor_err; + __be16 db_cnt; + u32 reserved2; + __be32 wqe; + u8 opcode; + u8 reserved3[2]; + u8 owner; +}; + +#define MTHCA_CQ_ENTRY_OWNER_SW (0 << 7) +#define MTHCA_CQ_ENTRY_OWNER_HW (1 << 7) + +#define MTHCA_TAVOR_CQ_DB_INC_CI (1 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT (2 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL (3 << 24) +#define MTHCA_TAVOR_CQ_DB_SET_CI (4 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24) + +#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL (1 << 24) +#define MTHCA_ARBEL_CQ_DB_REQ_NOT (2 << 24) +#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24) + +static inline struct mthca_cqe *get_cqe_from_buf(struct mthca_cq_buf *buf, + int entry) +{ + if (buf->is_direct) + return buf->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE); + else + return buf->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf + + (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE; +} + +static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry) +{ + return get_cqe_from_buf(&cq->buf, entry); +} + +static inline struct mthca_cqe *cqe_sw(struct mthca_cqe *cqe) +{ + return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe; +} + +static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq) +{ + return cqe_sw(get_cqe(cq, cq->cons_index & cq->ibcq.cqe)); +} + +static inline void set_cqe_hw(struct mthca_cqe *cqe) +{ + cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW; +} + +static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr) +{ + __be32 *cqe = cqe_ptr; + + (void) cqe; /* avoid warning if mthca_dbg compiled away... 
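+ * (when CONFIG_INFINIBAND_MTHCA_DEBUG is not set, mthca_dbg() below
+ * compiles down to a no-op that only touches 'mdev', so without this
+ * cast 'cqe' would be set but never read)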
*/ + mthca_dbg(dev, "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n", + be32_to_cpu(cqe[0]), be32_to_cpu(cqe[1]), be32_to_cpu(cqe[2]), + be32_to_cpu(cqe[3]), be32_to_cpu(cqe[4]), be32_to_cpu(cqe[5]), + be32_to_cpu(cqe[6]), be32_to_cpu(cqe[7])); +} + +/* + * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index + * should be correct before calling update_cons_index(). + */ +static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq, + int incr) +{ + if (mthca_is_memfree(dev)) { + *cq->set_ci_db = cpu_to_be32(cq->cons_index); + wmb(); + } else { + mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1, + dev->kar + MTHCA_CQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + /* + * Make sure doorbells don't leak out of CQ spinlock + * and reach the HCA out of order: + */ + mmiowb(); + } +} + +void mthca_cq_completion(struct mthca_dev *dev, u32 cqn) +{ + struct mthca_cq *cq; + + cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1)); + + if (!cq) { + mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn); + return; + } + + ++cq->arm_sn; + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +void mthca_cq_event(struct mthca_dev *dev, u32 cqn, + enum ib_event_type event_type) +{ + struct mthca_cq *cq; + struct ib_event event; + + spin_lock(&dev->cq_table.lock); + + cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1)); + if (cq) + ++cq->refcount; + + spin_unlock(&dev->cq_table.lock); + + if (!cq) { + mthca_warn(dev, "Async event for bogus CQ %08x\n", cqn); + return; + } + + event.device = &dev->ib_dev; + event.event = event_type; + event.element.cq = &cq->ibcq; + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&event, cq->ibcq.cq_context); + + spin_lock(&dev->cq_table.lock); + if (!--cq->refcount) + wake_up(&cq->wait); + spin_unlock(&dev->cq_table.lock); +} + +static inline int is_recv_cqe(struct mthca_cqe *cqe) +{ + if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) == + MTHCA_ERROR_CQE_OPCODE_MASK) + return !(cqe->opcode & 0x01); + else + return !(cqe->is_send & 0x80); +} + +void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, + struct mthca_srq *srq) +{ + struct mthca_cqe *cqe; + u32 prod_index; + int i, nfreed = 0; + + spin_lock_irq(&cq->lock); + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->cons_index; + cqe_sw(get_cqe(cq, prod_index & cq->ibcq.cqe)); + ++prod_index) + if (prod_index == cq->cons_index + cq->ibcq.cqe) + break; + + if (0) + mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n", + qpn, cq->cqn, cq->cons_index, prod_index); + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. 
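+ * For example, with cons_index == 2 and prod_index == 6, entries 5, 4,
+ * 3 and 2 are examined in that order; each CQE whose my_qpn matches is
+ * dropped, every surviving CQE is shifted 'nfreed' slots towards the
+ * producer end, and the freed slots collect next to the consumer index,
+ * where they are handed back to hardware below.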
+ */ + while ((int) --prod_index - (int) cq->cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + if (cqe->my_qpn == cpu_to_be32(qpn)) { + if (srq && is_recv_cqe(cqe)) + mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe)); + ++nfreed; + } else if (nfreed) + memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe), + cqe, MTHCA_CQ_ENTRY_SIZE); + } + + if (nfreed) { + for (i = 0; i < nfreed; ++i) + set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe)); + wmb(); + cq->cons_index += nfreed; + update_cons_index(dev, cq, nfreed); + } + + spin_unlock_irq(&cq->lock); +} + +void mthca_cq_resize_copy_cqes(struct mthca_cq *cq) +{ + int i; + + /* + * In Tavor mode, the hardware keeps the consumer and producer + * indices mod the CQ size. Since we might be making the CQ + * bigger, we need to deal with the case where the producer + * index wrapped around before the CQ was resized. + */ + if (!mthca_is_memfree(to_mdev(cq->ibcq.device)) && + cq->ibcq.cqe < cq->resize_buf->cqe) { + cq->cons_index &= cq->ibcq.cqe; + if (cqe_sw(get_cqe(cq, cq->ibcq.cqe))) + cq->cons_index -= cq->ibcq.cqe + 1; + } + + for (i = cq->cons_index; cqe_sw(get_cqe(cq, i & cq->ibcq.cqe)); ++i) + memcpy(get_cqe_from_buf(&cq->resize_buf->buf, + i & cq->resize_buf->cqe), + get_cqe(cq, i & cq->ibcq.cqe), MTHCA_CQ_ENTRY_SIZE); +} + +int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent) +{ + int ret; + int i; + + ret = mthca_buf_alloc(dev, nent * MTHCA_CQ_ENTRY_SIZE, + MTHCA_MAX_DIRECT_CQ_SIZE, + &buf->queue, &buf->is_direct, + &dev->driver_pd, 1, &buf->mr); + if (ret) + return ret; + + for (i = 0; i < nent; ++i) + set_cqe_hw(get_cqe_from_buf(buf, i)); + + return 0; +} + +void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe) +{ + mthca_buf_free(dev, (cqe + 1) * MTHCA_CQ_ENTRY_SIZE, &buf->queue, + buf->is_direct, &buf->mr); +} + +static void handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq, + struct mthca_qp *qp, int wqe_index, int is_send, + struct mthca_err_cqe *cqe, + struct ib_wc *entry, int *free_cqe) +{ + int dbd; + __be32 new_wqe; + + if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) { + mthca_dbg(dev, "local QP operation err " + "(QPN %06x, WQE @ %08x, CQN %06x, index %d)\n", + be32_to_cpu(cqe->my_qpn), be32_to_cpu(cqe->wqe), + cq->cqn, cq->cons_index); + dump_cqe(dev, cqe); + } + + /* + * For completions in error, only work request ID, status, vendor error + * (and freed resource count for RD) have to be set. 
+ */ + switch (cqe->syndrome) { + case SYNDROME_LOCAL_LENGTH_ERR: + entry->status = IB_WC_LOC_LEN_ERR; + break; + case SYNDROME_LOCAL_QP_OP_ERR: + entry->status = IB_WC_LOC_QP_OP_ERR; + break; + case SYNDROME_LOCAL_EEC_OP_ERR: + entry->status = IB_WC_LOC_EEC_OP_ERR; + break; + case SYNDROME_LOCAL_PROT_ERR: + entry->status = IB_WC_LOC_PROT_ERR; + break; + case SYNDROME_WR_FLUSH_ERR: + entry->status = IB_WC_WR_FLUSH_ERR; + break; + case SYNDROME_MW_BIND_ERR: + entry->status = IB_WC_MW_BIND_ERR; + break; + case SYNDROME_BAD_RESP_ERR: + entry->status = IB_WC_BAD_RESP_ERR; + break; + case SYNDROME_LOCAL_ACCESS_ERR: + entry->status = IB_WC_LOC_ACCESS_ERR; + break; + case SYNDROME_REMOTE_INVAL_REQ_ERR: + entry->status = IB_WC_REM_INV_REQ_ERR; + break; + case SYNDROME_REMOTE_ACCESS_ERR: + entry->status = IB_WC_REM_ACCESS_ERR; + break; + case SYNDROME_REMOTE_OP_ERR: + entry->status = IB_WC_REM_OP_ERR; + break; + case SYNDROME_RETRY_EXC_ERR: + entry->status = IB_WC_RETRY_EXC_ERR; + break; + case SYNDROME_RNR_RETRY_EXC_ERR: + entry->status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case SYNDROME_LOCAL_RDD_VIOL_ERR: + entry->status = IB_WC_LOC_RDD_VIOL_ERR; + break; + case SYNDROME_REMOTE_INVAL_RD_REQ_ERR: + entry->status = IB_WC_REM_INV_RD_REQ_ERR; + break; + case SYNDROME_REMOTE_ABORTED_ERR: + entry->status = IB_WC_REM_ABORT_ERR; + break; + case SYNDROME_INVAL_EECN_ERR: + entry->status = IB_WC_INV_EECN_ERR; + break; + case SYNDROME_INVAL_EEC_STATE_ERR: + entry->status = IB_WC_INV_EEC_STATE_ERR; + break; + default: + entry->status = IB_WC_GENERAL_ERR; + break; + } + + entry->vendor_err = cqe->vendor_err; + + /* + * Mem-free HCAs always generate one CQE per WQE, even in the + * error case, so we don't have to check the doorbell count, etc. + */ + if (mthca_is_memfree(dev)) + return; + + mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe); + + /* + * If we're at the end of the WQE chain, or we've used up our + * doorbell count, free the CQE. Otherwise just update it for + * the next poll operation. + */ + if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd)) + return; + + be16_add_cpu(&cqe->db_cnt, -dbd); + cqe->wqe = new_wqe; + cqe->syndrome = SYNDROME_WR_FLUSH_ERR; + + *free_cqe = 0; +} + +static inline int mthca_poll_one(struct mthca_dev *dev, + struct mthca_cq *cq, + struct mthca_qp **cur_qp, + int *freed, + struct ib_wc *entry) +{ + struct mthca_wq *wq; + struct mthca_cqe *cqe; + int wqe_index; + int is_error; + int is_send; + int free_cqe = 1; + int err = 0; + u16 checksum; + + cqe = next_cqe_sw(cq); + if (!cqe) + return -EAGAIN; + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + if (0) { + mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n", + cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn), + be32_to_cpu(cqe->wqe)); + dump_cqe(dev, cqe); + } + + is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) == + MTHCA_ERROR_CQE_OPCODE_MASK; + is_send = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80; + + if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. 
+ */ + *cur_qp = mthca_array_get(&dev->qp_table.qp, + be32_to_cpu(cqe->my_qpn) & + (dev->limits.num_qps - 1)); + if (!*cur_qp) { + mthca_warn(dev, "CQ entry for unknown QP %06x\n", + be32_to_cpu(cqe->my_qpn) & 0xffffff); + err = -EINVAL; + goto out; + } + } + + entry->qp = &(*cur_qp)->ibqp; + + if (is_send) { + wq = &(*cur_qp)->sq; + wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset) + >> wq->wqe_shift); + entry->wr_id = (*cur_qp)->wrid[wqe_index + + (*cur_qp)->rq.max]; + } else if ((*cur_qp)->ibqp.srq) { + struct mthca_srq *srq = to_msrq((*cur_qp)->ibqp.srq); + u32 wqe = be32_to_cpu(cqe->wqe); + wq = NULL; + wqe_index = wqe >> srq->wqe_shift; + entry->wr_id = srq->wrid[wqe_index]; + mthca_free_srq_wqe(srq, wqe); + } else { + s32 wqe; + wq = &(*cur_qp)->rq; + wqe = be32_to_cpu(cqe->wqe); + wqe_index = wqe >> wq->wqe_shift; + /* + * WQE addr == base - 1 might be reported in receive completion + * with error instead of (rq size - 1) by Sinai FW 1.0.800 and + * Arbel FW 5.1.400. This bug should be fixed in later FW revs. + */ + if (unlikely(wqe_index < 0)) + wqe_index = wq->max - 1; + entry->wr_id = (*cur_qp)->wrid[wqe_index]; + } + + if (wq) { + if (wq->last_comp < wqe_index) + wq->tail += wqe_index - wq->last_comp; + else + wq->tail += wqe_index + wq->max - wq->last_comp; + + wq->last_comp = wqe_index; + } + + if (is_error) { + handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send, + (struct mthca_err_cqe *) cqe, + entry, &free_cqe); + goto out; + } + + if (is_send) { + entry->wc_flags = 0; + switch (cqe->opcode) { + case MTHCA_OPCODE_RDMA_WRITE: + entry->opcode = IB_WC_RDMA_WRITE; + break; + case MTHCA_OPCODE_RDMA_WRITE_IMM: + entry->opcode = IB_WC_RDMA_WRITE; + entry->wc_flags |= IB_WC_WITH_IMM; + break; + case MTHCA_OPCODE_SEND: + entry->opcode = IB_WC_SEND; + break; + case MTHCA_OPCODE_SEND_IMM: + entry->opcode = IB_WC_SEND; + entry->wc_flags |= IB_WC_WITH_IMM; + break; + case MTHCA_OPCODE_RDMA_READ: + entry->opcode = IB_WC_RDMA_READ; + entry->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MTHCA_OPCODE_ATOMIC_CS: + entry->opcode = IB_WC_COMP_SWAP; + entry->byte_len = MTHCA_ATOMIC_BYTE_LEN; + break; + case MTHCA_OPCODE_ATOMIC_FA: + entry->opcode = IB_WC_FETCH_ADD; + entry->byte_len = MTHCA_ATOMIC_BYTE_LEN; + break; + default: + entry->opcode = MTHCA_OPCODE_INVALID; + break; + } + } else { + entry->byte_len = be32_to_cpu(cqe->byte_cnt); + switch (cqe->opcode & 0x1f) { + case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE: + entry->wc_flags = IB_WC_WITH_IMM; + entry->ex.imm_data = cqe->imm_etype_pkey_eec; + entry->opcode = IB_WC_RECV; + break; + case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE: + entry->wc_flags = IB_WC_WITH_IMM; + entry->ex.imm_data = cqe->imm_etype_pkey_eec; + entry->opcode = IB_WC_RECV_RDMA_WITH_IMM; + break; + default: + entry->wc_flags = 0; + entry->opcode = IB_WC_RECV; + break; + } + entry->slid = be16_to_cpu(cqe->rlid); + entry->sl = cqe->sl_ipok >> 4; + entry->src_qp = be32_to_cpu(cqe->rqpn) & 0xffffff; + entry->dlid_path_bits = cqe->g_mlpath & 0x7f; + entry->pkey_index = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16; + entry->wc_flags |= cqe->g_mlpath & 0x80 ? IB_WC_GRH : 0; + checksum = (be32_to_cpu(cqe->rqpn) >> 24) | + ((be32_to_cpu(cqe->my_ee) >> 16) & 0xff00); + entry->wc_flags |= (cqe->sl_ipok & 1 && checksum == 0xffff) ? 
+ IB_WC_IP_CSUM_OK : 0; + } + + entry->status = IB_WC_SUCCESS; + + out: + if (likely(free_cqe)) { + set_cqe_hw(cqe); + ++(*freed); + ++cq->cons_index; + } + + return err; +} + +int mthca_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *entry) +{ + struct mthca_dev *dev = to_mdev(ibcq->device); + struct mthca_cq *cq = to_mcq(ibcq); + struct mthca_qp *qp = NULL; + unsigned long flags; + int err = 0; + int freed = 0; + int npolled; + + spin_lock_irqsave(&cq->lock, flags); + + npolled = 0; +repoll: + while (npolled < num_entries) { + err = mthca_poll_one(dev, cq, &qp, + &freed, entry + npolled); + if (err) + break; + ++npolled; + } + + if (freed) { + wmb(); + update_cons_index(dev, cq, freed); + } + + /* + * If a CQ resize is in progress and we discovered that the + * old buffer is empty, then peek in the new buffer, and if + * it's not empty, switch to the new buffer and continue + * polling there. + */ + if (unlikely(err == -EAGAIN && cq->resize_buf && + cq->resize_buf->state == CQ_RESIZE_READY)) { + /* + * In Tavor mode, the hardware keeps the producer + * index modulo the CQ size. Since we might be making + * the CQ bigger, we need to mask our consumer index + * using the size of the old CQ buffer before looking + * in the new CQ buffer. + */ + if (!mthca_is_memfree(dev)) + cq->cons_index &= cq->ibcq.cqe; + + if (cqe_sw(get_cqe_from_buf(&cq->resize_buf->buf, + cq->cons_index & cq->resize_buf->cqe))) { + struct mthca_cq_buf tbuf; + int tcqe; + + tbuf = cq->buf; + tcqe = cq->ibcq.cqe; + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + + cq->resize_buf->buf = tbuf; + cq->resize_buf->cqe = tcqe; + cq->resize_buf->state = CQ_RESIZE_SWAPPED; + + goto repoll; + } + } + + spin_unlock_irqrestore(&cq->lock, flags); + + return err == 0 || err == -EAGAIN ? npolled : err; +} + +int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) +{ + u32 dbhi = ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL : + MTHCA_TAVOR_CQ_DB_REQ_NOT) | + to_mcq(cq)->cqn; + + mthca_write64(dbhi, 0xffffffff, to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock)); + + return 0; +} + +int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mthca_cq *cq = to_mcq(ibcq); + __be32 db_rec[2]; + u32 dbhi; + u32 sn = cq->arm_sn & 3; + + db_rec[0] = cpu_to_be32(cq->cons_index); + db_rec[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) | + ((flags & IB_CQ_SOLICITED_MASK) == + IB_CQ_SOLICITED ? 1 : 2)); + + mthca_write_db_rec(db_rec, cq->arm_db); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + wmb(); + + dbhi = (sn << 28) | + ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? 
+ MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL : + MTHCA_ARBEL_CQ_DB_REQ_NOT) | cq->cqn; + + mthca_write64(dbhi, cq->cons_index, + to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock)); + + return 0; +} + +int mthca_init_cq(struct mthca_dev *dev, int nent, + struct mthca_ucontext *ctx, u32 pdn, + struct mthca_cq *cq) +{ + struct mthca_mailbox *mailbox; + struct mthca_cq_context *cq_context; + int err = -ENOMEM; + + cq->ibcq.cqe = nent - 1; + cq->is_kernel = !ctx; + + cq->cqn = mthca_alloc(&dev->cq_table.alloc); + if (cq->cqn == -1) + return -ENOMEM; + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->cq_table.table, cq->cqn); + if (err) + goto err_out; + + if (cq->is_kernel) { + cq->arm_sn = 1; + + err = -ENOMEM; + + cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, + cq->cqn, &cq->set_ci_db); + if (cq->set_ci_db_index < 0) + goto err_out_icm; + + cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM, + cq->cqn, &cq->arm_db); + if (cq->arm_db_index < 0) + goto err_out_ci; + } + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + goto err_out_arm; + + cq_context = mailbox->buf; + + if (cq->is_kernel) { + err = mthca_alloc_cq_buf(dev, &cq->buf, nent); + if (err) + goto err_out_mailbox; + } + + spin_lock_init(&cq->lock); + cq->refcount = 1; + init_waitqueue_head(&cq->wait); + mutex_init(&cq->mutex); + + memset(cq_context, 0, sizeof *cq_context); + cq_context->flags = cpu_to_be32(MTHCA_CQ_STATUS_OK | + MTHCA_CQ_STATE_DISARMED | + MTHCA_CQ_FLAG_TR); + cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24); + if (ctx) + cq_context->logsize_usrpage |= cpu_to_be32(ctx->uar.index); + else + cq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index); + cq_context->error_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn); + cq_context->comp_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn); + cq_context->pd = cpu_to_be32(pdn); + cq_context->lkey = cpu_to_be32(cq->buf.mr.ibmr.lkey); + cq_context->cqn = cpu_to_be32(cq->cqn); + + if (mthca_is_memfree(dev)) { + cq_context->ci_db = cpu_to_be32(cq->set_ci_db_index); + cq_context->state_db = cpu_to_be32(cq->arm_db_index); + } + + err = mthca_SW2HW_CQ(dev, mailbox, cq->cqn); + if (err) { + mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err); + goto err_out_free_mr; + } + + spin_lock_irq(&dev->cq_table.lock); + if (mthca_array_set(&dev->cq_table.cq, + cq->cqn & (dev->limits.num_cqs - 1), + cq)) { + spin_unlock_irq(&dev->cq_table.lock); + goto err_out_free_mr; + } + spin_unlock_irq(&dev->cq_table.lock); + + cq->cons_index = 0; + + mthca_free_mailbox(dev, mailbox); + + return 0; + +err_out_free_mr: + if (cq->is_kernel) + mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + +err_out_mailbox: + mthca_free_mailbox(dev, mailbox); + +err_out_arm: + if (cq->is_kernel && mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index); + +err_out_ci: + if (cq->is_kernel && mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index); + +err_out_icm: + mthca_table_put(dev, dev->cq_table.table, cq->cqn); + +err_out: + mthca_free(&dev->cq_table.alloc, cq->cqn); + + return err; +} + +static inline int get_cq_refcount(struct mthca_dev *dev, struct mthca_cq *cq) +{ + int c; + + spin_lock_irq(&dev->cq_table.lock); + c = cq->refcount; + spin_unlock_irq(&dev->cq_table.lock); + + return c; +} + +void mthca_free_cq(struct mthca_dev *dev, + struct mthca_cq *cq) +{ + struct mthca_mailbox *mailbox; + int 
err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + mthca_warn(dev, "No memory for mailbox to free CQ.\n"); + return; + } + + err = mthca_HW2SW_CQ(dev, mailbox, cq->cqn); + if (err) + mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err); + + if (0) { + __be32 *ctx = mailbox->buf; + int j; + + printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n", + cq->cqn, cq->cons_index, + cq->is_kernel ? !!next_cqe_sw(cq) : 0); + for (j = 0; j < 16; ++j) + printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j])); + } + + spin_lock_irq(&dev->cq_table.lock); + mthca_array_clear(&dev->cq_table.cq, + cq->cqn & (dev->limits.num_cqs - 1)); + --cq->refcount; + spin_unlock_irq(&dev->cq_table.lock); + + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) + synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector); + else + synchronize_irq(dev->pdev->irq); + + wait_event(cq->wait, !get_cq_refcount(dev, cq)); + + if (cq->is_kernel) { + mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); + if (mthca_is_memfree(dev)) { + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index); + mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index); + } + } + + mthca_table_put(dev, dev->cq_table.table, cq->cqn); + mthca_free(&dev->cq_table.alloc, cq->cqn); + mthca_free_mailbox(dev, mailbox); +} + +int mthca_init_cq_table(struct mthca_dev *dev) +{ + int err; + + spin_lock_init(&dev->cq_table.lock); + + err = mthca_alloc_init(&dev->cq_table.alloc, + dev->limits.num_cqs, + (1 << 24) - 1, + dev->limits.reserved_cqs); + if (err) + return err; + + err = mthca_array_init(&dev->cq_table.cq, + dev->limits.num_cqs); + if (err) + mthca_alloc_cleanup(&dev->cq_table.alloc); + + return err; +} + +void mthca_cleanup_cq_table(struct mthca_dev *dev) +{ + mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs); + mthca_alloc_cleanup(&dev->cq_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h new file mode 100644 index 0000000..5508afb --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_DEV_H +#define MTHCA_DEV_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mthca_provider.h" +#include "mthca_doorbell.h" + +#define DRV_NAME "ib_mthca" +#define PFX DRV_NAME ": " +#define DRV_VERSION "1.0" +#define DRV_RELDATE "April 4, 2008" + +enum { + MTHCA_FLAG_DDR_HIDDEN = 1 << 1, + MTHCA_FLAG_SRQ = 1 << 2, + MTHCA_FLAG_MSI_X = 1 << 3, + MTHCA_FLAG_NO_LAM = 1 << 4, + MTHCA_FLAG_FMR = 1 << 5, + MTHCA_FLAG_MEMFREE = 1 << 6, + MTHCA_FLAG_PCIE = 1 << 7, + MTHCA_FLAG_SINAI_OPT = 1 << 8 +}; + +enum { + MTHCA_MAX_PORTS = 2 +}; + +enum { + MTHCA_BOARD_ID_LEN = 64 +}; + +enum { + MTHCA_EQ_CONTEXT_SIZE = 0x40, + MTHCA_CQ_CONTEXT_SIZE = 0x40, + MTHCA_QP_CONTEXT_SIZE = 0x200, + MTHCA_RDB_ENTRY_SIZE = 0x20, + MTHCA_AV_SIZE = 0x20, + MTHCA_MGM_ENTRY_SIZE = 0x100, + + /* Arbel FW gives us these, but we need them for Tavor */ + MTHCA_MPT_ENTRY_SIZE = 0x40, + MTHCA_MTT_SEG_SIZE = 0x40, + + MTHCA_QP_PER_MGM = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2) +}; + +enum { + MTHCA_EQ_CMD, + MTHCA_EQ_ASYNC, + MTHCA_EQ_COMP, + MTHCA_NUM_EQ +}; + +enum { + MTHCA_OPCODE_NOP = 0x00, + MTHCA_OPCODE_RDMA_WRITE = 0x08, + MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09, + MTHCA_OPCODE_SEND = 0x0a, + MTHCA_OPCODE_SEND_IMM = 0x0b, + MTHCA_OPCODE_RDMA_READ = 0x10, + MTHCA_OPCODE_ATOMIC_CS = 0x11, + MTHCA_OPCODE_ATOMIC_FA = 0x12, + MTHCA_OPCODE_BIND_MW = 0x18, + MTHCA_OPCODE_INVALID = 0xff +}; + +enum { + MTHCA_CMD_USE_EVENTS = 1 << 0, + MTHCA_CMD_POST_DOORBELLS = 1 << 1 +}; + +enum { + MTHCA_CMD_NUM_DBELL_DWORDS = 8 +}; + +struct mthca_cmd { + struct dma_pool *pool; + struct mutex hcr_mutex; + struct semaphore poll_sem; + struct semaphore event_sem; + int max_cmds; + spinlock_t context_lock; + int free_head; + struct mthca_cmd_context *context; + u16 token_mask; + u32 flags; + void __iomem *dbell_map; + u16 dbell_offsets[MTHCA_CMD_NUM_DBELL_DWORDS]; +}; + +struct mthca_limits { + int num_ports; + int vl_cap; + int mtu_cap; + int gid_table_len; + int pkey_table_len; + int local_ca_ack_delay; + int num_uars; + int max_sg; + int num_qps; + int max_wqes; + int max_desc_sz; + int max_qp_init_rdma; + int reserved_qps; + int num_srqs; + int max_srq_wqes; + int max_srq_sge; + int reserved_srqs; + int num_eecs; + int reserved_eecs; + int num_cqs; + int max_cqes; + int reserved_cqs; + int num_eqs; + int reserved_eqs; + int num_mpts; + int num_mtt_segs; + int mtt_seg_size; + int fmr_reserved_mtts; + int reserved_mtts; + int reserved_mrws; + int reserved_uars; + int num_mgms; + int num_amgms; + int reserved_mcgs; + int num_pds; + int reserved_pds; + u32 page_size_cap; + u32 flags; + u16 stat_rate_support; + u8 port_width_cap; +}; + +struct mthca_alloc { + u32 last; + u32 top; + u32 max; + u32 mask; + spinlock_t lock; + unsigned long *table; +}; + +struct mthca_array { + struct { + void **page; + int used; + } *page_list; +}; + +struct mthca_uar_table { + struct mthca_alloc alloc; + u64 uarc_base; + int uarc_size; +}; + +struct mthca_pd_table { + struct mthca_alloc alloc; +}; + +struct mthca_buddy { + unsigned long **bits; + int *num_free; + int max_order; + spinlock_t lock; +}; + +struct mthca_mr_table { + struct mthca_alloc mpt_alloc; + struct mthca_buddy mtt_buddy; + struct mthca_buddy *fmr_mtt_buddy; + u64 
mtt_base; + u64 mpt_base; + struct mthca_icm_table *mtt_table; + struct mthca_icm_table *mpt_table; + struct { + void __iomem *mpt_base; + void __iomem *mtt_base; + struct mthca_buddy mtt_buddy; + } tavor_fmr; +}; + +struct mthca_eq_table { + struct mthca_alloc alloc; + void __iomem *clr_int; + u32 clr_mask; + u32 arm_mask; + struct mthca_eq eq[MTHCA_NUM_EQ]; + u64 icm_virt; + struct page *icm_page; + dma_addr_t icm_dma; + int have_irq; + u8 inta_pin; +}; + +struct mthca_cq_table { + struct mthca_alloc alloc; + spinlock_t lock; + struct mthca_array cq; + struct mthca_icm_table *table; +}; + +struct mthca_srq_table { + struct mthca_alloc alloc; + spinlock_t lock; + struct mthca_array srq; + struct mthca_icm_table *table; +}; + +struct mthca_qp_table { + struct mthca_alloc alloc; + u32 rdb_base; + int rdb_shift; + int sqp_start; + spinlock_t lock; + struct mthca_array qp; + struct mthca_icm_table *qp_table; + struct mthca_icm_table *eqp_table; + struct mthca_icm_table *rdb_table; +}; + +struct mthca_av_table { + struct dma_pool *pool; + int num_ddr_avs; + u64 ddr_av_base; + void __iomem *av_map; + struct mthca_alloc alloc; +}; + +struct mthca_mcg_table { + struct mutex mutex; + struct mthca_alloc alloc; + struct mthca_icm_table *table; +}; + +struct mthca_catas_err { + u64 addr; + u32 __iomem *map; + u32 size; + struct timer_list timer; + struct list_head list; +}; + +extern struct mutex mthca_device_mutex; + +struct mthca_dev { + struct ib_device ib_dev; + struct pci_dev *pdev; + + int hca_type; + unsigned long mthca_flags; + unsigned long device_cap_flags; + + u32 rev_id; + char board_id[MTHCA_BOARD_ID_LEN]; + + /* firmware info */ + u64 fw_ver; + union { + struct { + u64 fw_start; + u64 fw_end; + } tavor; + struct { + u64 clr_int_base; + u64 eq_arm_base; + u64 eq_set_ci_base; + struct mthca_icm *fw_icm; + struct mthca_icm *aux_icm; + u16 fw_pages; + } arbel; + } fw; + + u64 ddr_start; + u64 ddr_end; + + MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock) + struct mutex cap_mask_mutex; + + void __iomem *hcr; + void __iomem *kar; + void __iomem *clr_base; + union { + struct { + void __iomem *ecr_base; + } tavor; + struct { + void __iomem *eq_arm; + void __iomem *eq_set_ci_base; + } arbel; + } eq_regs; + + struct mthca_cmd cmd; + struct mthca_limits limits; + + struct mthca_uar_table uar_table; + struct mthca_pd_table pd_table; + struct mthca_mr_table mr_table; + struct mthca_eq_table eq_table; + struct mthca_cq_table cq_table; + struct mthca_srq_table srq_table; + struct mthca_qp_table qp_table; + struct mthca_av_table av_table; + struct mthca_mcg_table mcg_table; + + struct mthca_catas_err catas_err; + + struct mthca_uar driver_uar; + struct mthca_db_table *db_tab; + struct mthca_pd driver_pd; + struct mthca_mr driver_mr; + + struct ib_mad_agent *send_agent[MTHCA_MAX_PORTS][2]; + struct ib_ah *sm_ah[MTHCA_MAX_PORTS]; + spinlock_t sm_lock; + u8 rate[MTHCA_MAX_PORTS]; + bool active; +}; + +#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG +extern int mthca_debug_level; + +#define mthca_dbg(mdev, format, arg...) \ + do { \ + if (mthca_debug_level) \ + dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \ + } while (0) + +#else /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#define mthca_dbg(mdev, format, arg...) do { (void) mdev; } while (0) + +#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#define mthca_err(mdev, format, arg...) \ + dev_err(&mdev->pdev->dev, format, ## arg) +#define mthca_info(mdev, format, arg...) \ + dev_info(&mdev->pdev->dev, format, ## arg) +#define mthca_warn(mdev, format, arg...) 
\ + dev_warn(&mdev->pdev->dev, format, ## arg) + +extern void __buggy_use_of_MTHCA_GET(void); +extern void __buggy_use_of_MTHCA_PUT(void); + +#define MTHCA_GET(dest, source, offset) \ + do { \ + void *__p = (char *) (source) + (offset); \ + switch (sizeof (dest)) { \ + case 1: (dest) = *(u8 *) __p; break; \ + case 2: (dest) = be16_to_cpup(__p); break; \ + case 4: (dest) = be32_to_cpup(__p); break; \ + case 8: (dest) = be64_to_cpup(__p); break; \ + default: __buggy_use_of_MTHCA_GET(); \ + } \ + } while (0) + +#define MTHCA_PUT(dest, source, offset) \ + do { \ + void *__d = ((char *) (dest) + (offset)); \ + switch (sizeof(source)) { \ + case 1: *(u8 *) __d = (source); break; \ + case 2: *(__be16 *) __d = cpu_to_be16(source); break; \ + case 4: *(__be32 *) __d = cpu_to_be32(source); break; \ + case 8: *(__be64 *) __d = cpu_to_be64(source); break; \ + default: __buggy_use_of_MTHCA_PUT(); \ + } \ + } while (0) + +int mthca_reset(struct mthca_dev *mdev); + +u32 mthca_alloc(struct mthca_alloc *alloc); +void mthca_free(struct mthca_alloc *alloc, u32 obj); +int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask, + u32 reserved); +void mthca_alloc_cleanup(struct mthca_alloc *alloc); +void *mthca_array_get(struct mthca_array *array, int index); +int mthca_array_set(struct mthca_array *array, int index, void *value); +void mthca_array_clear(struct mthca_array *array, int index); +int mthca_array_init(struct mthca_array *array, int nent); +void mthca_array_cleanup(struct mthca_array *array, int nent); +int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct, + union mthca_buf *buf, int *is_direct, struct mthca_pd *pd, + int hca_write, struct mthca_mr *mr); +void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf, + int is_direct, struct mthca_mr *mr); + +int mthca_init_uar_table(struct mthca_dev *dev); +int mthca_init_pd_table(struct mthca_dev *dev); +int mthca_init_mr_table(struct mthca_dev *dev); +int mthca_init_eq_table(struct mthca_dev *dev); +int mthca_init_cq_table(struct mthca_dev *dev); +int mthca_init_srq_table(struct mthca_dev *dev); +int mthca_init_qp_table(struct mthca_dev *dev); +int mthca_init_av_table(struct mthca_dev *dev); +int mthca_init_mcg_table(struct mthca_dev *dev); + +void mthca_cleanup_uar_table(struct mthca_dev *dev); +void mthca_cleanup_pd_table(struct mthca_dev *dev); +void mthca_cleanup_mr_table(struct mthca_dev *dev); +void mthca_cleanup_eq_table(struct mthca_dev *dev); +void mthca_cleanup_cq_table(struct mthca_dev *dev); +void mthca_cleanup_srq_table(struct mthca_dev *dev); +void mthca_cleanup_qp_table(struct mthca_dev *dev); +void mthca_cleanup_av_table(struct mthca_dev *dev); +void mthca_cleanup_mcg_table(struct mthca_dev *dev); + +int mthca_register_device(struct mthca_dev *dev); +void mthca_unregister_device(struct mthca_dev *dev); + +void mthca_start_catas_poll(struct mthca_dev *dev); +void mthca_stop_catas_poll(struct mthca_dev *dev); +int __mthca_restart_one(struct pci_dev *pdev); +int mthca_catas_init(void); +void mthca_catas_cleanup(void); + +int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); +void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); + +int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd); +void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd); + +int mthca_write_mtt_size(struct mthca_dev *dev); + +struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size); +void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt); +int 
mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len); +int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift, + u64 iova, u64 total_size, u32 access, struct mthca_mr *mr); +int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_mr *mr); +int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd, + u64 *buffer_list, int buffer_size_shift, + int list_len, u64 iova, u64 total_size, + u32 access, struct mthca_mr *mr); +void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr); + +int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_fmr *fmr); +int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); +void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr); +int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); +void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr); +int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr); + +int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt); +void mthca_unmap_eq_icm(struct mthca_dev *dev); + +int mthca_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *entry); +int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +int mthca_init_cq(struct mthca_dev *dev, int nent, + struct mthca_ucontext *ctx, u32 pdn, + struct mthca_cq *cq); +void mthca_free_cq(struct mthca_dev *dev, + struct mthca_cq *cq); +void mthca_cq_completion(struct mthca_dev *dev, u32 cqn); +void mthca_cq_event(struct mthca_dev *dev, u32 cqn, + enum ib_event_type event_type); +void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, + struct mthca_srq *srq); +void mthca_cq_resize_copy_cqes(struct mthca_cq *cq); +int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent); +void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe); + +int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, + struct ib_srq_attr *attr, struct mthca_srq *srq); +void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq); +int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int mthca_max_srq_sge(struct mthca_dev *dev); +void mthca_srq_event(struct mthca_dev *dev, u32 srqn, + enum ib_event_type event_type); +void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr); +int mthca_tavor_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int mthca_arbel_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +void mthca_qp_event(struct mthca_dev *dev, u32 qpn, + enum ib_event_type event_type); +int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata); +int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mthca_arbel_post_receive(struct ib_qp *ibqp, 
struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send, + int index, int *dbd, __be32 *new_wqe); +int mthca_alloc_qp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_qp_type type, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + struct mthca_qp *qp); +int mthca_alloc_sqp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + int qpn, + int port, + struct mthca_sqp *sqp); +void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp); +int mthca_create_ah(struct mthca_dev *dev, + struct mthca_pd *pd, + struct rdma_ah_attr *ah_attr, + struct mthca_ah *ah); +int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah); +int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, + struct ib_ud_header *header); +int mthca_ah_query(struct ib_ah *ibah, struct rdma_ah_attr *attr); +int mthca_ah_grh_present(struct mthca_ah *ah); +u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port); +enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port); + +int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); +int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int mthca_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); +int mthca_create_agents(struct mthca_dev *dev); +void mthca_free_agents(struct mthca_dev *dev); + +static inline struct mthca_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mthca_dev, ib_dev); +} + +static inline int mthca_is_memfree(struct mthca_dev *dev) +{ + return dev->mthca_flags & MTHCA_FLAG_MEMFREE; +} + +#endif /* MTHCA_DEV_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_doorbell.h b/drivers/infiniband/hw/mthca/mthca_doorbell.h new file mode 100644 index 0000000..14f51ef --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_doorbell.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#define MTHCA_RD_DOORBELL 0x00 +#define MTHCA_SEND_DOORBELL 0x10 +#define MTHCA_RECEIVE_DOORBELL 0x18 +#define MTHCA_CQ_DOORBELL 0x20 +#define MTHCA_EQ_DOORBELL 0x28 + +#if BITS_PER_LONG == 64 +/* + * Assume that we can just write a 64-bit doorbell atomically. s390 + * actually doesn't have writeq() but S/390 systems don't even have + * PCI so we won't worry about it. + */ + +#define MTHCA_DECLARE_DOORBELL_LOCK(name) +#define MTHCA_INIT_DOORBELL_LOCK(ptr) do { } while (0) +#define MTHCA_GET_DOORBELL_LOCK(ptr) (NULL) + +static inline void mthca_write64_raw(__be64 val, void __iomem *dest) +{ + __raw_writeq((__force u64) val, dest); +} + +static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest, + spinlock_t *doorbell_lock) +{ + __raw_writeq((__force u64) cpu_to_be64((u64) hi << 32 | lo), dest); +} + +static inline void mthca_write_db_rec(__be32 val[2], __be32 *db) +{ + *(u64 *) db = *(u64 *) val; +} + +#else + +/* + * Just fall back to a spinlock to protect the doorbell if + * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit + * MMIO writes. + */ + +#define MTHCA_DECLARE_DOORBELL_LOCK(name) spinlock_t name; +#define MTHCA_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr) +#define MTHCA_GET_DOORBELL_LOCK(ptr) (ptr) + +static inline void mthca_write64_raw(__be64 val, void __iomem *dest) +{ + __raw_writel(((__force u32 *) &val)[0], dest); + __raw_writel(((__force u32 *) &val)[1], dest + 4); +} + +static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest, + spinlock_t *doorbell_lock) +{ + unsigned long flags; + + hi = (__force u32) cpu_to_be32(hi); + lo = (__force u32) cpu_to_be32(lo); + + spin_lock_irqsave(doorbell_lock, flags); + __raw_writel(hi, dest); + __raw_writel(lo, dest + 4); + spin_unlock_irqrestore(doorbell_lock, flags); +} + +static inline void mthca_write_db_rec(__be32 val[2], __be32 *db) +{ + db[0] = val[0]; + wmb(); + db[1] = val[1]; +} + +#endif diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c new file mode 100644 index 0000000..6902017 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -0,0 +1,905 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_config_reg.h" + +enum { + MTHCA_NUM_ASYNC_EQE = 0x80, + MTHCA_NUM_CMD_EQE = 0x80, + MTHCA_NUM_SPARE_EQE = 0x80, + MTHCA_EQ_ENTRY_SIZE = 0x20 +}; + +/* + * Must be packed because start is 64 bits but only aligned to 32 bits. + */ +struct mthca_eq_context { + __be32 flags; + __be64 start; + __be32 logsize_usrpage; + __be32 tavor_pd; /* reserved for Arbel */ + u8 reserved1[3]; + u8 intr; + __be32 arbel_pd; /* lost_count for Tavor */ + __be32 lkey; + u32 reserved2[2]; + __be32 consumer_index; + __be32 producer_index; + u32 reserved3[4]; +} __attribute__((packed)); + +#define MTHCA_EQ_STATUS_OK ( 0 << 28) +#define MTHCA_EQ_STATUS_OVERFLOW ( 9 << 28) +#define MTHCA_EQ_STATUS_WRITE_FAIL (10 << 28) +#define MTHCA_EQ_OWNER_SW ( 0 << 24) +#define MTHCA_EQ_OWNER_HW ( 1 << 24) +#define MTHCA_EQ_FLAG_TR ( 1 << 18) +#define MTHCA_EQ_FLAG_OI ( 1 << 17) +#define MTHCA_EQ_STATE_ARMED ( 1 << 8) +#define MTHCA_EQ_STATE_FIRED ( 2 << 8) +#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 << 8) +#define MTHCA_EQ_STATE_ARBEL ( 8 << 8) + +enum { + MTHCA_EVENT_TYPE_COMP = 0x00, + MTHCA_EVENT_TYPE_PATH_MIG = 0x01, + MTHCA_EVENT_TYPE_COMM_EST = 0x02, + MTHCA_EVENT_TYPE_SQ_DRAINED = 0x03, + MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE = 0x13, + MTHCA_EVENT_TYPE_SRQ_LIMIT = 0x14, + MTHCA_EVENT_TYPE_CQ_ERROR = 0x04, + MTHCA_EVENT_TYPE_WQ_CATAS_ERROR = 0x05, + MTHCA_EVENT_TYPE_EEC_CATAS_ERROR = 0x06, + MTHCA_EVENT_TYPE_PATH_MIG_FAILED = 0x07, + MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, + MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, + MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR = 0x08, + MTHCA_EVENT_TYPE_PORT_CHANGE = 0x09, + MTHCA_EVENT_TYPE_EQ_OVERFLOW = 0x0f, + MTHCA_EVENT_TYPE_ECC_DETECT = 0x0e, + MTHCA_EVENT_TYPE_CMD = 0x0a +}; + +#define MTHCA_ASYNC_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_PATH_MIG) | \ + (1ULL << MTHCA_EVENT_TYPE_COMM_EST) | \ + (1ULL << MTHCA_EVENT_TYPE_SQ_DRAINED) | \ + (1ULL << MTHCA_EVENT_TYPE_CQ_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_WQ_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_EEC_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_PATH_MIG_FAILED) | \ + (1ULL << MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_PORT_CHANGE) | \ + (1ULL << MTHCA_EVENT_TYPE_ECC_DETECT)) +#define MTHCA_SRQ_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR) | \ + (1ULL << MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE) | \ + (1ULL << MTHCA_EVENT_TYPE_SRQ_LIMIT)) +#define MTHCA_CMD_EVENT_MASK (1ULL << MTHCA_EVENT_TYPE_CMD) + +#define MTHCA_EQ_DB_INC_CI (1 << 24) +#define MTHCA_EQ_DB_REQ_NOT (2 << 24) +#define MTHCA_EQ_DB_DISARM_CQ (3 << 24) +#define MTHCA_EQ_DB_SET_CI (4 << 24) +#define MTHCA_EQ_DB_ALWAYS_ARM (5 << 24) + +struct mthca_eqe { + u8 reserved1; + u8 type; + u8 reserved2; + u8 subtype; + union { + u32 raw[6]; + struct { + __be32 cqn; + } __attribute__((packed)) comp; + struct { + u16 
reserved1; + __be16 token; + u32 reserved2; + u8 reserved3[3]; + u8 status; + __be64 out_param; + } __attribute__((packed)) cmd; + struct { + __be32 qpn; + } __attribute__((packed)) qp; + struct { + __be32 srqn; + } __attribute__((packed)) srq; + struct { + __be32 cqn; + u32 reserved1; + u8 reserved2[3]; + u8 syndrome; + } __attribute__((packed)) cq_err; + struct { + u32 reserved1[2]; + __be32 port; + } __attribute__((packed)) port_change; + } event; + u8 reserved3[3]; + u8 owner; +} __attribute__((packed)); + +#define MTHCA_EQ_ENTRY_OWNER_SW (0 << 7) +#define MTHCA_EQ_ENTRY_OWNER_HW (1 << 7) + +static inline u64 async_mask(struct mthca_dev *dev) +{ + return dev->mthca_flags & MTHCA_FLAG_SRQ ? + MTHCA_ASYNC_EVENT_MASK | MTHCA_SRQ_EVENT_MASK : + MTHCA_ASYNC_EVENT_MASK; +} + +static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) +{ + /* + * This barrier makes sure that all updates to ownership bits + * done by set_eqe_hw() hit memory before the consumer index + * is updated. set_eq_ci() allows the HCA to possibly write + * more EQ entries, and we want to avoid the exceedingly + * unlikely possibility of the HCA writing an entry and then + * having set_eqe_hw() overwrite the owner field. + */ + wmb(); + mthca_write64(MTHCA_EQ_DB_SET_CI | eq->eqn, ci & (eq->nent - 1), + dev->kar + MTHCA_EQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); +} + +static inline void arbel_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) +{ + /* See comment in tavor_set_eq_ci() above. */ + wmb(); + __raw_writel((__force u32) cpu_to_be32(ci), + dev->eq_regs.arbel.eq_set_ci_base + eq->eqn * 8); + /* We still want ordering, just not swabbing, so add a barrier */ + mb(); +} + +static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) +{ + if (mthca_is_memfree(dev)) + arbel_set_eq_ci(dev, eq, ci); + else + tavor_set_eq_ci(dev, eq, ci); +} + +static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn) +{ + mthca_write64(MTHCA_EQ_DB_REQ_NOT | eqn, 0, + dev->kar + MTHCA_EQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); +} + +static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask) +{ + writel(eqn_mask, dev->eq_regs.arbel.eq_arm); +} + +static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn) +{ + if (!mthca_is_memfree(dev)) { + mthca_write64(MTHCA_EQ_DB_DISARM_CQ | eqn, cqn, + dev->kar + MTHCA_EQ_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } +} + +static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, u32 entry) +{ + unsigned long off = (entry & (eq->nent - 1)) * MTHCA_EQ_ENTRY_SIZE; + return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE; +} + +static inline struct mthca_eqe *next_eqe_sw(struct mthca_eq *eq) +{ + struct mthca_eqe *eqe; + eqe = get_eqe(eq, eq->cons_index); + return (MTHCA_EQ_ENTRY_OWNER_HW & eqe->owner) ? NULL : eqe; +} + +static inline void set_eqe_hw(struct mthca_eqe *eqe) +{ + eqe->owner = MTHCA_EQ_ENTRY_OWNER_HW; +} + +static void port_change(struct mthca_dev *dev, int port, int active) +{ + struct ib_event record; + + mthca_dbg(dev, "Port change to %s for port %d\n", + active ? "active" : "down", port); + + record.device = &dev->ib_dev; + record.event = active ? 
IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + record.element.port_num = port; + + ib_dispatch_event(&record); +} + +static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq) +{ + struct mthca_eqe *eqe; + int disarm_cqn; + int eqes_found = 0; + int set_ci = 0; + + while ((eqe = next_eqe_sw(eq))) { + /* + * Make sure we read EQ entry contents after we've + * checked the ownership bit. + */ + rmb(); + + switch (eqe->type) { + case MTHCA_EVENT_TYPE_COMP: + disarm_cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff; + disarm_cq(dev, eq->eqn, disarm_cqn); + mthca_cq_completion(dev, disarm_cqn); + break; + + case MTHCA_EVENT_TYPE_PATH_MIG: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_PATH_MIG); + break; + + case MTHCA_EVENT_TYPE_COMM_EST: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_COMM_EST); + break; + + case MTHCA_EVENT_TYPE_SQ_DRAINED: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_SQ_DRAINED); + break; + + case MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_LAST_WQE_REACHED); + break; + + case MTHCA_EVENT_TYPE_SRQ_LIMIT: + mthca_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff, + IB_EVENT_SRQ_LIMIT_REACHED); + break; + + case MTHCA_EVENT_TYPE_WQ_CATAS_ERROR: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_FATAL); + break; + + case MTHCA_EVENT_TYPE_PATH_MIG_FAILED: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_PATH_MIG_ERR); + break; + + case MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_REQ_ERR); + break; + + case MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR: + mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, + IB_EVENT_QP_ACCESS_ERR); + break; + + case MTHCA_EVENT_TYPE_CMD: + mthca_cmd_event(dev, + be16_to_cpu(eqe->event.cmd.token), + eqe->event.cmd.status, + be64_to_cpu(eqe->event.cmd.out_param)); + break; + + case MTHCA_EVENT_TYPE_PORT_CHANGE: + port_change(dev, + (be32_to_cpu(eqe->event.port_change.port) >> 28) & 3, + eqe->subtype == 0x4); + break; + + case MTHCA_EVENT_TYPE_CQ_ERROR: + mthca_warn(dev, "CQ %s on CQN %06x\n", + eqe->event.cq_err.syndrome == 1 ? + "overrun" : "access violation", + be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff); + mthca_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn), + IB_EVENT_CQ_ERR); + break; + + case MTHCA_EVENT_TYPE_EQ_OVERFLOW: + mthca_warn(dev, "EQ overrun on EQN %d\n", eq->eqn); + break; + + case MTHCA_EVENT_TYPE_EEC_CATAS_ERROR: + case MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR: + case MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR: + case MTHCA_EVENT_TYPE_ECC_DETECT: + default: + mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n", + eqe->type, eqe->subtype, eq->eqn); + break; + } + + set_eqe_hw(eqe); + ++eq->cons_index; + eqes_found = 1; + ++set_ci; + + /* + * The HCA will think the queue has overflowed if we + * don't tell it we've been processing events. We + * create our EQs with MTHCA_NUM_SPARE_EQE extra + * entries, so we must update our consumer index at + * least that often. + */ + if (unlikely(set_ci >= MTHCA_NUM_SPARE_EQE)) { + /* + * Conditional on hca_type is OK here because + * this is a rare case, not the fast path. + */ + set_eq_ci(dev, eq, eq->cons_index); + set_ci = 0; + } + } + + /* + * Rely on caller to set consumer index so that we don't have + * to test hca_type in our interrupt handling fast path. 
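+ * (the Tavor and Arbel interrupt handlers below call tavor_set_eq_ci()
+ * or arbel_set_eq_ci() themselves once mthca_eq_int() returns)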
+ */ + return eqes_found; +} + +static irqreturn_t mthca_tavor_interrupt(int irq, void *dev_ptr) +{ + struct mthca_dev *dev = dev_ptr; + u32 ecr; + int i; + + if (dev->eq_table.clr_mask) + writel(dev->eq_table.clr_mask, dev->eq_table.clr_int); + + ecr = readl(dev->eq_regs.tavor.ecr_base + 4); + if (!ecr) + return IRQ_NONE; + + writel(ecr, dev->eq_regs.tavor.ecr_base + + MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (ecr & dev->eq_table.eq[i].eqn_mask) { + if (mthca_eq_int(dev, &dev->eq_table.eq[i])) + tavor_set_eq_ci(dev, &dev->eq_table.eq[i], + dev->eq_table.eq[i].cons_index); + tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn); + } + + return IRQ_HANDLED; +} + +static irqreturn_t mthca_tavor_msi_x_interrupt(int irq, void *eq_ptr) +{ + struct mthca_eq *eq = eq_ptr; + struct mthca_dev *dev = eq->dev; + + mthca_eq_int(dev, eq); + tavor_set_eq_ci(dev, eq, eq->cons_index); + tavor_eq_req_not(dev, eq->eqn); + + /* MSI-X vectors always belong to us */ + return IRQ_HANDLED; +} + +static irqreturn_t mthca_arbel_interrupt(int irq, void *dev_ptr) +{ + struct mthca_dev *dev = dev_ptr; + int work = 0; + int i; + + if (dev->eq_table.clr_mask) + writel(dev->eq_table.clr_mask, dev->eq_table.clr_int); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (mthca_eq_int(dev, &dev->eq_table.eq[i])) { + work = 1; + arbel_set_eq_ci(dev, &dev->eq_table.eq[i], + dev->eq_table.eq[i].cons_index); + } + + arbel_eq_req_not(dev, dev->eq_table.arm_mask); + + return IRQ_RETVAL(work); +} + +static irqreturn_t mthca_arbel_msi_x_interrupt(int irq, void *eq_ptr) +{ + struct mthca_eq *eq = eq_ptr; + struct mthca_dev *dev = eq->dev; + + mthca_eq_int(dev, eq); + arbel_set_eq_ci(dev, eq, eq->cons_index); + arbel_eq_req_not(dev, eq->eqn_mask); + + /* MSI-X vectors always belong to us */ + return IRQ_HANDLED; +} + +static int mthca_create_eq(struct mthca_dev *dev, + int nent, + u8 intr, + struct mthca_eq *eq) +{ + int npages; + u64 *dma_list = NULL; + dma_addr_t t; + struct mthca_mailbox *mailbox; + struct mthca_eq_context *eq_context; + int err = -ENOMEM; + int i; + + eq->dev = dev; + eq->nent = roundup_pow_of_two(max(nent, 2)); + npages = ALIGN(eq->nent * MTHCA_EQ_ENTRY_SIZE, PAGE_SIZE) / PAGE_SIZE; + + eq->page_list = kmalloc(npages * sizeof *eq->page_list, + GFP_KERNEL); + if (!eq->page_list) + goto err_out; + + for (i = 0; i < npages; ++i) + eq->page_list[i].buf = NULL; + + dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL); + if (!dma_list) + goto err_out_free; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + goto err_out_free; + eq_context = mailbox->buf; + + for (i = 0; i < npages; ++i) { + eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev, + PAGE_SIZE, &t, GFP_KERNEL); + if (!eq->page_list[i].buf) + goto err_out_free_pages; + + dma_list[i] = t; + dma_unmap_addr_set(&eq->page_list[i], mapping, t); + + clear_page(eq->page_list[i].buf); + } + + for (i = 0; i < eq->nent; ++i) + set_eqe_hw(get_eqe(eq, i)); + + eq->eqn = mthca_alloc(&dev->eq_table.alloc); + if (eq->eqn == -1) + goto err_out_free_pages; + + err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num, + dma_list, PAGE_SHIFT, npages, + 0, npages * PAGE_SIZE, + MTHCA_MPT_FLAG_LOCAL_WRITE | + MTHCA_MPT_FLAG_LOCAL_READ, + &eq->mr); + if (err) + goto err_out_free_eq; + + memset(eq_context, 0, sizeof *eq_context); + eq_context->flags = cpu_to_be32(MTHCA_EQ_STATUS_OK | + MTHCA_EQ_OWNER_HW | + MTHCA_EQ_STATE_ARMED | + MTHCA_EQ_FLAG_TR); + if (mthca_is_memfree(dev)) + eq_context->flags |= 
cpu_to_be32(MTHCA_EQ_STATE_ARBEL); + + eq_context->logsize_usrpage = cpu_to_be32((ffs(eq->nent) - 1) << 24); + if (mthca_is_memfree(dev)) { + eq_context->arbel_pd = cpu_to_be32(dev->driver_pd.pd_num); + } else { + eq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index); + eq_context->tavor_pd = cpu_to_be32(dev->driver_pd.pd_num); + } + eq_context->intr = intr; + eq_context->lkey = cpu_to_be32(eq->mr.ibmr.lkey); + + err = mthca_SW2HW_EQ(dev, mailbox, eq->eqn); + if (err) { + mthca_warn(dev, "SW2HW_EQ returned %d\n", err); + goto err_out_free_mr; + } + + kfree(dma_list); + mthca_free_mailbox(dev, mailbox); + + eq->eqn_mask = swab32(1 << eq->eqn); + eq->cons_index = 0; + + dev->eq_table.arm_mask |= eq->eqn_mask; + + mthca_dbg(dev, "Allocated EQ %d with %d entries\n", + eq->eqn, eq->nent); + + return err; + + err_out_free_mr: + mthca_free_mr(dev, &eq->mr); + + err_out_free_eq: + mthca_free(&dev->eq_table.alloc, eq->eqn); + + err_out_free_pages: + for (i = 0; i < npages; ++i) + if (eq->page_list[i].buf) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + eq->page_list[i].buf, + dma_unmap_addr(&eq->page_list[i], + mapping)); + + mthca_free_mailbox(dev, mailbox); + + err_out_free: + kfree(eq->page_list); + kfree(dma_list); + + err_out: + return err; +} + +static void mthca_free_eq(struct mthca_dev *dev, + struct mthca_eq *eq) +{ + struct mthca_mailbox *mailbox; + int err; + int npages = (eq->nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) / + PAGE_SIZE; + int i; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return; + + err = mthca_HW2SW_EQ(dev, mailbox, eq->eqn); + if (err) + mthca_warn(dev, "HW2SW_EQ returned %d\n", err); + + dev->eq_table.arm_mask &= ~eq->eqn_mask; + + if (0) { + mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn); + for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) { + if (i % 4 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", be32_to_cpup(mailbox->buf + i * 4)); + if ((i + 1) % 4 == 0) + printk("\n"); + } + } + + mthca_free_mr(dev, &eq->mr); + for (i = 0; i < npages; ++i) + pci_free_consistent(dev->pdev, PAGE_SIZE, + eq->page_list[i].buf, + dma_unmap_addr(&eq->page_list[i], mapping)); + + kfree(eq->page_list); + mthca_free_mailbox(dev, mailbox); +} + +static void mthca_free_irqs(struct mthca_dev *dev) +{ + int i; + + if (dev->eq_table.have_irq) + free_irq(dev->pdev->irq, dev); + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (dev->eq_table.eq[i].have_irq) { + free_irq(dev->eq_table.eq[i].msi_x_vector, + dev->eq_table.eq + i); + dev->eq_table.eq[i].have_irq = 0; + } +} + +static int mthca_map_reg(struct mthca_dev *dev, + unsigned long offset, unsigned long size, + void __iomem **map) +{ + phys_addr_t base = pci_resource_start(dev->pdev, 0); + + *map = ioremap(base + offset, size); + if (!*map) + return -ENOMEM; + + return 0; +} + +static int mthca_map_eq_regs(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) { + /* + * We assume that the EQ arm and EQ set CI registers + * fall within the first BAR. We can't trust the + * values firmware gives us, since those addresses are + * valid on the HCA's side of the PCI bus but not + * necessarily the host side. + */ + if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) & + dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE, + &dev->clr_base)) { + mthca_err(dev, "Couldn't map interrupt clear register, " + "aborting.\n"); + return -ENOMEM; + } + + /* + * Add 4 because we limit ourselves to EQs 0 ... 31, + * so we only need the low word of the register. 
+ */ + if (mthca_map_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) & + dev->fw.arbel.eq_arm_base) + 4, 4, + &dev->eq_regs.arbel.eq_arm)) { + mthca_err(dev, "Couldn't map EQ arm register, aborting.\n"); + iounmap(dev->clr_base); + return -ENOMEM; + } + + if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) & + dev->fw.arbel.eq_set_ci_base, + MTHCA_EQ_SET_CI_SIZE, + &dev->eq_regs.arbel.eq_set_ci_base)) { + mthca_err(dev, "Couldn't map EQ CI register, aborting.\n"); + iounmap(dev->eq_regs.arbel.eq_arm); + iounmap(dev->clr_base); + return -ENOMEM; + } + } else { + if (mthca_map_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE, + &dev->clr_base)) { + mthca_err(dev, "Couldn't map interrupt clear register, " + "aborting.\n"); + return -ENOMEM; + } + + if (mthca_map_reg(dev, MTHCA_ECR_BASE, + MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE, + &dev->eq_regs.tavor.ecr_base)) { + mthca_err(dev, "Couldn't map ecr register, " + "aborting.\n"); + iounmap(dev->clr_base); + return -ENOMEM; + } + } + + return 0; + +} + +static void mthca_unmap_eq_regs(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) { + iounmap(dev->eq_regs.arbel.eq_set_ci_base); + iounmap(dev->eq_regs.arbel.eq_arm); + iounmap(dev->clr_base); + } else { + iounmap(dev->eq_regs.tavor.ecr_base); + iounmap(dev->clr_base); + } +} + +int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt) +{ + int ret; + + /* + * We assume that mapping one page is enough for the whole EQ + * context table. This is fine with all current HCAs, because + * we only use 32 EQs and each EQ uses 32 bytes of context + * memory, or 1 KB total. + */ + dev->eq_table.icm_virt = icm_virt; + dev->eq_table.icm_page = alloc_page(GFP_HIGHUSER); + if (!dev->eq_table.icm_page) + return -ENOMEM; + dev->eq_table.icm_dma = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0, + PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + if (pci_dma_mapping_error(dev->pdev, dev->eq_table.icm_dma)) { + __free_page(dev->eq_table.icm_page); + return -ENOMEM; + } + + ret = mthca_MAP_ICM_page(dev, dev->eq_table.icm_dma, icm_virt); + if (ret) { + pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE, + PCI_DMA_BIDIRECTIONAL); + __free_page(dev->eq_table.icm_page); + } + + return ret; +} + +void mthca_unmap_eq_icm(struct mthca_dev *dev) +{ + mthca_UNMAP_ICM(dev, dev->eq_table.icm_virt, 1); + pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE, + PCI_DMA_BIDIRECTIONAL); + __free_page(dev->eq_table.icm_page); +} + +int mthca_init_eq_table(struct mthca_dev *dev) +{ + int err; + u8 intr; + int i; + + err = mthca_alloc_init(&dev->eq_table.alloc, + dev->limits.num_eqs, + dev->limits.num_eqs - 1, + dev->limits.reserved_eqs); + if (err) + return err; + + err = mthca_map_eq_regs(dev); + if (err) + goto err_out_free; + + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { + dev->eq_table.clr_mask = 0; + } else { + dev->eq_table.clr_mask = + swab32(1 << (dev->eq_table.inta_pin & 31)); + dev->eq_table.clr_int = dev->clr_base + + (dev->eq_table.inta_pin < 32 ? 4 : 0); + } + + dev->eq_table.arm_mask = 0; + + intr = dev->eq_table.inta_pin; + + err = mthca_create_eq(dev, dev->limits.num_cqs + MTHCA_NUM_SPARE_EQE, + (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr, + &dev->eq_table.eq[MTHCA_EQ_COMP]); + if (err) + goto err_out_unmap; + + err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE + MTHCA_NUM_SPARE_EQE, + (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 
129 : intr, + &dev->eq_table.eq[MTHCA_EQ_ASYNC]); + if (err) + goto err_out_comp; + + err = mthca_create_eq(dev, MTHCA_NUM_CMD_EQE + MTHCA_NUM_SPARE_EQE, + (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 130 : intr, + &dev->eq_table.eq[MTHCA_EQ_CMD]); + if (err) + goto err_out_async; + + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { + static const char *eq_name[] = { + [MTHCA_EQ_COMP] = DRV_NAME "-comp", + [MTHCA_EQ_ASYNC] = DRV_NAME "-async", + [MTHCA_EQ_CMD] = DRV_NAME "-cmd" + }; + + for (i = 0; i < MTHCA_NUM_EQ; ++i) { + snprintf(dev->eq_table.eq[i].irq_name, + IB_DEVICE_NAME_MAX, + "%s@pci:%s", eq_name[i], + pci_name(dev->pdev)); + err = request_irq(dev->eq_table.eq[i].msi_x_vector, + mthca_is_memfree(dev) ? + mthca_arbel_msi_x_interrupt : + mthca_tavor_msi_x_interrupt, + 0, dev->eq_table.eq[i].irq_name, + dev->eq_table.eq + i); + if (err) + goto err_out_cmd; + dev->eq_table.eq[i].have_irq = 1; + } + } else { + snprintf(dev->eq_table.eq[0].irq_name, IB_DEVICE_NAME_MAX, + DRV_NAME "@pci:%s", pci_name(dev->pdev)); + err = request_irq(dev->pdev->irq, + mthca_is_memfree(dev) ? + mthca_arbel_interrupt : + mthca_tavor_interrupt, + IRQF_SHARED, dev->eq_table.eq[0].irq_name, dev); + if (err) + goto err_out_cmd; + dev->eq_table.have_irq = 1; + } + + err = mthca_MAP_EQ(dev, async_mask(dev), + 0, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn); + if (err) + mthca_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n", + dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, err); + + err = mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK, + 0, dev->eq_table.eq[MTHCA_EQ_CMD].eqn); + if (err) + mthca_warn(dev, "MAP_EQ for cmd EQ %d failed (%d)\n", + dev->eq_table.eq[MTHCA_EQ_CMD].eqn, err); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + if (mthca_is_memfree(dev)) + arbel_eq_req_not(dev, dev->eq_table.eq[i].eqn_mask); + else + tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn); + + return 0; + +err_out_cmd: + mthca_free_irqs(dev); + mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_CMD]); + +err_out_async: + mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_ASYNC]); + +err_out_comp: + mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]); + +err_out_unmap: + mthca_unmap_eq_regs(dev); + +err_out_free: + mthca_alloc_cleanup(&dev->eq_table.alloc); + return err; +} + +void mthca_cleanup_eq_table(struct mthca_dev *dev) +{ + int i; + + mthca_free_irqs(dev); + + mthca_MAP_EQ(dev, async_mask(dev), + 1, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn); + mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK, + 1, dev->eq_table.eq[MTHCA_EQ_CMD].eqn); + + for (i = 0; i < MTHCA_NUM_EQ; ++i) + mthca_free_eq(dev, &dev->eq_table.eq[i]); + + mthca_unmap_eq_regs(dev); + + mthca_alloc_cleanup(&dev->eq_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c new file mode 100644 index 0000000..093f775 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/string.h> +#include <linux/slab.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_mad.h> +#include <rdma/ib_smi.h> + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +enum { + MTHCA_VENDOR_CLASS1 = 0x9, + MTHCA_VENDOR_CLASS2 = 0xa +}; + +static int mthca_update_rate(struct mthca_dev *dev, u8 port_num) +{ + struct ib_port_attr *tprops = NULL; + int ret; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + return -ENOMEM; + + ret = ib_query_port(&dev->ib_dev, port_num, tprops); + if (ret) { + printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n", + ret, dev->ib_dev.name, port_num); + goto out; + } + + dev->rate[port_num - 1] = tprops->active_speed * + ib_width_enum_to_int(tprops->active_width); + +out: + kfree(tprops); + return ret; +} + +static void update_sm_ah(struct mthca_dev *dev, + u8 port_num, u16 lid, u8 sl) +{ + struct ib_ah *new_ah; + struct rdma_ah_attr ah_attr; + unsigned long flags; + + if (!dev->send_agent[port_num - 1][0]) + return; + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.type = rdma_ah_find_type(&dev->ib_dev, port_num); + rdma_ah_set_dlid(&ah_attr, lid); + rdma_ah_set_sl(&ah_attr, sl); + rdma_ah_set_port_num(&ah_attr, port_num); + + new_ah = rdma_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, + &ah_attr); + if (IS_ERR(new_ah)) + return; + + spin_lock_irqsave(&dev->sm_lock, flags); + if (dev->sm_ah[port_num - 1]) + rdma_destroy_ah(dev->sm_ah[port_num - 1]); + dev->sm_ah[port_num - 1] = new_ah; + spin_unlock_irqrestore(&dev->sm_lock, flags); +} + +/* + * Snoop SM MADs for port info and P_Key table sets, so we can + * synthesize LID change and P_Key change events.
+ */ +static void smp_snoop(struct ib_device *ibdev, + u8 port_num, + const struct ib_mad *mad, + u16 prev_lid) +{ + struct ib_event event; + + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_SET) { + if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) { + struct ib_port_info *pinfo = + (struct ib_port_info *) ((struct ib_smp *) mad)->data; + u16 lid = be16_to_cpu(pinfo->lid); + + mthca_update_rate(to_mdev(ibdev), port_num); + update_sm_ah(to_mdev(ibdev), port_num, + be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); + + event.device = ibdev; + event.element.port_num = port_num; + + if (pinfo->clientrereg_resv_subnetto & 0x80) { + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + if (prev_lid != lid) { + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + } + + if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) { + event.device = ibdev; + event.event = IB_EVENT_PKEY_CHANGE; + event.element.port_num = port_num; + ib_dispatch_event(&event); + } + } +} + +static void node_desc_override(struct ib_device *dev, + struct ib_mad *mad) +{ + if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP && + mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) { + mutex_lock(&to_mdev(dev)->cap_mask_mutex); + memcpy(((struct ib_smp *) mad)->data, dev->node_desc, + IB_DEVICE_NODE_DESC_MAX); + mutex_unlock(&to_mdev(dev)->cap_mask_mutex); + } +} + +static void forward_trap(struct mthca_dev *dev, + u8 port_num, + const struct ib_mad *mad) +{ + int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn]; + int ret; + unsigned long flags; + + if (agent) { + send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC, + IB_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) + return; + /* + * We rely here on the fact that MLX QPs don't use the + * address handle after the send is posted (this is + * wrong following the IB spec strictly, but we know + * it's OK for our devices). + */ + spin_lock_irqsave(&dev->sm_lock, flags); + memcpy(send_buf->mad, mad, sizeof *mad); + if ((send_buf->ah = dev->sm_ah[port_num - 1])) + ret = ib_post_send_mad(send_buf, NULL); + else + ret = -EINVAL; + spin_unlock_irqrestore(&dev->sm_lock, flags); + + if (ret) + ib_free_send_mad(send_buf); + } +} + +int mthca_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + int err; + u16 slid = in_wc ? 
ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + u16 prev_lid = 0; + struct ib_port_attr pattr; + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + /* Forward locally generated traps to the SM */ + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && + slid == 0) { + forward_trap(to_mdev(ibdev), port_num, in_mad); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } + + /* + * Only handle SM gets, sets and trap represses for SM class + * + * Only handle PMA and Mellanox vendor-specific class gets and + * sets for other classes. + */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* + * Don't process SMInfo queries or vendor-specific + * MADs -- the SMA can't handle them. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO || + ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) == + IB_SMP_ATTR_VENDOR_MASK)) + return IB_MAD_RESULT_SUCCESS; + } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT || + in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1 || + in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) { + if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET && + in_mad->mad_hdr.method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } else + return IB_MAD_RESULT_SUCCESS; + if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && + in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && + in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + !ib_query_port(ibdev, port_num, &pattr)) + prev_lid = ib_lid_cpu16(pattr.lid); + + err = mthca_MAD_IFC(to_mdev(ibdev), + mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, + port_num, in_wc, in_grh, in_mad, out_mad); + if (err == -EBADMSG) + return IB_MAD_RESULT_SUCCESS; + else if (err) { + mthca_err(to_mdev(ibdev), "MAD_IFC returned %d\n", err); + return IB_MAD_RESULT_FAILURE; + } + + if (!out_mad->mad_hdr.status) { + smp_snoop(ibdev, port_num, in_mad, prev_lid); + node_desc_override(ibdev, out_mad); + } + + /* set return bit in status of directed route responses */ + if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out_mad->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + ib_free_send_mad(mad_send_wc->send_buf); +} + +int mthca_create_agents(struct mthca_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + int ret; + + spin_lock_init(&dev->sm_lock); + + for (p = 0; p < dev->limits.num_ports; ++p) + for (q = 0; q <= 1; ++q) { + agent = ib_register_mad_agent(&dev->ib_dev, p + 1, + q ? 
IB_QPT_GSI : IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + dev->send_agent[p][q] = agent; + } + + + for (p = 1; p <= dev->limits.num_ports; ++p) { + ret = mthca_update_rate(dev, p); + if (ret) { + mthca_err(dev, "Failed to obtain port %d rate." + " aborting.\n", p); + goto err; + } + } + + return 0; + +err: + for (p = 0; p < dev->limits.num_ports; ++p) + for (q = 0; q <= 1; ++q) + if (dev->send_agent[p][q]) + ib_unregister_mad_agent(dev->send_agent[p][q]); + + return ret; +} + +void mthca_free_agents(struct mthca_dev *dev) +{ + struct ib_mad_agent *agent; + int p, q; + + for (p = 0; p < dev->limits.num_ports; ++p) { + for (q = 0; q <= 1; ++q) { + agent = dev->send_agent[p][q]; + dev->send_agent[p][q] = NULL; + ib_unregister_mad_agent(agent); + } + + if (dev->sm_ah[p]) + rdma_destroy_ah(dev->sm_ah[p]); + } +} diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c new file mode 100644 index 0000000..f3e80de --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -0,0 +1,1272 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/gfp.h> + +#include "mthca_dev.h" +#include "mthca_config_reg.h" +#include "mthca_cmd.h" +#include "mthca_profile.h" +#include "mthca_memfree.h" +#include "mthca_wqe.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG + +int mthca_debug_level = 0; +module_param_named(debug_level, mthca_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); + +#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */ + +#ifdef CONFIG_PCI_MSI + +static int msi_x = 1; +module_param(msi_x, int, 0444); +MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); + +#else /* CONFIG_PCI_MSI */ + +#define msi_x (0) + +#endif /* CONFIG_PCI_MSI */ + +static int tune_pci = 0; +module_param(tune_pci, int, 0444); +MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); + +DEFINE_MUTEX(mthca_device_mutex); + +#define MTHCA_DEFAULT_NUM_QP (1 << 16) +#define MTHCA_DEFAULT_RDB_PER_QP (1 << 2) +#define MTHCA_DEFAULT_NUM_CQ (1 << 16) +#define MTHCA_DEFAULT_NUM_MCG (1 << 13) +#define MTHCA_DEFAULT_NUM_MPT (1 << 17) +#define MTHCA_DEFAULT_NUM_MTT (1 << 20) +#define MTHCA_DEFAULT_NUM_UDAV (1 << 15) +#define MTHCA_DEFAULT_NUM_RESERVED_MTTS (1 << 18) +#define MTHCA_DEFAULT_NUM_UARC_SIZE (1 << 18) + +static struct mthca_profile hca_profile = { + .num_qp = MTHCA_DEFAULT_NUM_QP, + .rdb_per_qp = MTHCA_DEFAULT_RDB_PER_QP, + .num_cq = MTHCA_DEFAULT_NUM_CQ, + .num_mcg = MTHCA_DEFAULT_NUM_MCG, + .num_mpt = MTHCA_DEFAULT_NUM_MPT, + .num_mtt = MTHCA_DEFAULT_NUM_MTT, + .num_udav = MTHCA_DEFAULT_NUM_UDAV, /* Tavor only */ + .fmr_reserved_mtts = MTHCA_DEFAULT_NUM_RESERVED_MTTS, /* Tavor only */ + .uarc_size = MTHCA_DEFAULT_NUM_UARC_SIZE, /* Arbel only */ +}; + +module_param_named(num_qp, hca_profile.num_qp, int, 0444); +MODULE_PARM_DESC(num_qp, "maximum number of QPs per HCA"); + +module_param_named(rdb_per_qp, hca_profile.rdb_per_qp, int, 0444); +MODULE_PARM_DESC(rdb_per_qp, "number of RDB buffers per QP"); + +module_param_named(num_cq, hca_profile.num_cq, int, 0444); +MODULE_PARM_DESC(num_cq, "maximum number of CQs per HCA"); + +module_param_named(num_mcg, hca_profile.num_mcg, int, 0444); +MODULE_PARM_DESC(num_mcg, "maximum number of multicast groups per HCA"); + +module_param_named(num_mpt, hca_profile.num_mpt, int, 0444); +MODULE_PARM_DESC(num_mpt, + "maximum number of memory protection table entries per HCA"); + +module_param_named(num_mtt, hca_profile.num_mtt, int, 0444); +MODULE_PARM_DESC(num_mtt, + "maximum number of memory translation table segments per HCA"); + +module_param_named(num_udav, hca_profile.num_udav, int, 0444); +MODULE_PARM_DESC(num_udav, "maximum number of UD address vectors per HCA"); + +module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444); +MODULE_PARM_DESC(fmr_reserved_mtts, + "number of memory translation table segments reserved for FMR"); + +static int log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); +module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); +MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)"); + +static char mthca_version[] = + DRV_NAME ": Mellanox InfiniBand HCA driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static int mthca_tune_pci(struct mthca_dev *mdev) +{ + if (!tune_pci) + return 0; + + /* First try to max out Read Byte Count */ + if (pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) { +
if (pcix_set_mmrbc(mdev->pdev, pcix_get_max_mmrbc(mdev->pdev))) { + mthca_err(mdev, "Couldn't set PCI-X max read count, " + "aborting.\n"); + return -ENODEV; + } + } else if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) + mthca_info(mdev, "No PCI-X capability, not setting RBC.\n"); + + if (pci_is_pcie(mdev->pdev)) { + if (pcie_set_readrq(mdev->pdev, 4096)) { + mthca_err(mdev, "Couldn't write PCI Express read request, " + "aborting.\n"); + return -ENODEV; + } + } else if (mdev->mthca_flags & MTHCA_FLAG_PCIE) + mthca_info(mdev, "No PCI Express capability, " + "not setting Max Read Request Size.\n"); + + return 0; +} + +static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim) +{ + int err; + + mdev->limits.mtt_seg_size = (1 << log_mtts_per_seg) * 8; + err = mthca_QUERY_DEV_LIM(mdev, dev_lim); + if (err) { + mthca_err(mdev, "QUERY_DEV_LIM command returned %d" + ", aborting.\n", err); + return err; + } + if (dev_lim->min_page_sz > PAGE_SIZE) { + mthca_err(mdev, "HCA minimum page size of %d bigger than " + "kernel PAGE_SIZE of %ld, aborting.\n", + dev_lim->min_page_sz, PAGE_SIZE); + return -ENODEV; + } + if (dev_lim->num_ports > MTHCA_MAX_PORTS) { + mthca_err(mdev, "HCA has %d ports, but we only support %d, " + "aborting.\n", + dev_lim->num_ports, MTHCA_MAX_PORTS); + return -ENODEV; + } + + if (dev_lim->uar_size > pci_resource_len(mdev->pdev, 2)) { + mthca_err(mdev, "HCA reported UAR size of 0x%x bigger than " + "PCI resource 2 size of 0x%llx, aborting.\n", + dev_lim->uar_size, + (unsigned long long)pci_resource_len(mdev->pdev, 2)); + return -ENODEV; + } + + mdev->limits.num_ports = dev_lim->num_ports; + mdev->limits.vl_cap = dev_lim->max_vl; + mdev->limits.mtu_cap = dev_lim->max_mtu; + mdev->limits.gid_table_len = dev_lim->max_gids; + mdev->limits.pkey_table_len = dev_lim->max_pkeys; + mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay; + /* + * Need to allow for worst case send WQE overhead and check + * whether max_desc_sz imposes a lower limit than max_sg; UD + * send has the biggest overhead. + */ + mdev->limits.max_sg = min_t(int, dev_lim->max_sg, + (dev_lim->max_desc_sz - + sizeof (struct mthca_next_seg) - + (mthca_is_memfree(mdev) ? + sizeof (struct mthca_arbel_ud_seg) : + sizeof (struct mthca_tavor_ud_seg))) / + sizeof (struct mthca_data_seg)); + mdev->limits.max_wqes = dev_lim->max_qp_sz; + mdev->limits.max_qp_init_rdma = dev_lim->max_requester_per_qp; + mdev->limits.reserved_qps = dev_lim->reserved_qps; + mdev->limits.max_srq_wqes = dev_lim->max_srq_sz; + mdev->limits.reserved_srqs = dev_lim->reserved_srqs; + mdev->limits.reserved_eecs = dev_lim->reserved_eecs; + mdev->limits.max_desc_sz = dev_lim->max_desc_sz; + mdev->limits.max_srq_sge = mthca_max_srq_sge(mdev); + /* + * Subtract 1 from the limit because we need to allocate a + * spare CQE so the HCA HW can tell the difference between an + * empty CQ and a full CQ. 
+ */ + mdev->limits.max_cqes = dev_lim->max_cq_sz - 1; + mdev->limits.reserved_cqs = dev_lim->reserved_cqs; + mdev->limits.reserved_eqs = dev_lim->reserved_eqs; + mdev->limits.reserved_mtts = dev_lim->reserved_mtts; + mdev->limits.reserved_mrws = dev_lim->reserved_mrws; + mdev->limits.reserved_uars = dev_lim->reserved_uars; + mdev->limits.reserved_pds = dev_lim->reserved_pds; + mdev->limits.port_width_cap = dev_lim->max_port_width; + mdev->limits.page_size_cap = ~(u32) (dev_lim->min_page_sz - 1); + mdev->limits.flags = dev_lim->flags; + /* + * For old FW that doesn't return static rate support, use a + * value of 0x3 (only static rate values of 0 or 1 are handled), + * except on Sinai, where even old FW can handle static rate + * values of 2 and 3. + */ + if (dev_lim->stat_rate_support) + mdev->limits.stat_rate_support = dev_lim->stat_rate_support; + else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + mdev->limits.stat_rate_support = 0xf; + else + mdev->limits.stat_rate_support = 0x3; + + /* IB_DEVICE_RESIZE_MAX_WR not supported by driver. + May be doable since hardware supports it for SRQ. + + IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver. + + IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not + supported by driver. */ + mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + + if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR) + mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + + if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR) + mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + + if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI) + mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI; + + if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG) + mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + + if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE) + mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; + + if (dev_lim->flags & DEV_LIM_FLAG_SRQ) + mdev->mthca_flags |= MTHCA_FLAG_SRQ; + + if (mthca_is_memfree(mdev)) + if (dev_lim->flags & DEV_LIM_FLAG_IPOIB_CSUM) + mdev->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + + return 0; +} + +static int mthca_init_tavor(struct mthca_dev *mdev) +{ + s64 size; + int err; + struct mthca_dev_lim dev_lim; + struct mthca_profile profile; + struct mthca_init_hca_param init_hca; + + err = mthca_SYS_EN(mdev); + if (err) { + mthca_err(mdev, "SYS_EN command returned %d, aborting.\n", err); + return err; + } + + err = mthca_QUERY_FW(mdev); + if (err) { + mthca_err(mdev, "QUERY_FW command returned %d," + " aborting.\n", err); + goto err_disable; + } + err = mthca_QUERY_DDR(mdev); + if (err) { + mthca_err(mdev, "QUERY_DDR command returned %d, aborting.\n", err); + goto err_disable; + } + + err = mthca_dev_lim(mdev, &dev_lim); + if (err) { + mthca_err(mdev, "QUERY_DEV_LIM command returned %d, aborting.\n", err); + goto err_disable; + } + + profile = hca_profile; + profile.num_uar = dev_lim.uar_size / PAGE_SIZE; + profile.uarc_size = 0; + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + profile.num_srq = dev_lim.max_srqs; + + size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca); + if (size < 0) { + err = size; + goto err_disable; + } + + err = mthca_INIT_HCA(mdev, &init_hca); + if (err) { + mthca_err(mdev, "INIT_HCA command returned %d, aborting.\n", err); + goto err_disable; + } + + return 0; + +err_disable: + mthca_SYS_DIS(mdev); + + return err; +} + +static int mthca_load_fw(struct mthca_dev *mdev) +{ + int err; + + /* FIXME: use HCA-attached memory 
for FW if present */ + + mdev->fw.arbel.fw_icm = + mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages, + GFP_HIGHUSER | __GFP_NOWARN, 0); + if (!mdev->fw.arbel.fw_icm) { + mthca_err(mdev, "Couldn't allocate FW area, aborting.\n"); + return -ENOMEM; + } + + err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm); + if (err) { + mthca_err(mdev, "MAP_FA command returned %d, aborting.\n", err); + goto err_free; + } + err = mthca_RUN_FW(mdev); + if (err) { + mthca_err(mdev, "RUN_FW command returned %d, aborting.\n", err); + goto err_unmap_fa; + } + + return 0; + +err_unmap_fa: + mthca_UNMAP_FA(mdev); + +err_free: + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); + return err; +} + +static int mthca_init_icm(struct mthca_dev *mdev, + struct mthca_dev_lim *dev_lim, + struct mthca_init_hca_param *init_hca, + u64 icm_size) +{ + u64 aux_pages; + int err; + + err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages); + if (err) { + mthca_err(mdev, "SET_ICM_SIZE command returned %d, aborting.\n", err); + return err; + } + + mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n", + (unsigned long long) icm_size >> 10, + (unsigned long long) aux_pages << 2); + + mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages, + GFP_HIGHUSER | __GFP_NOWARN, 0); + if (!mdev->fw.arbel.aux_icm) { + mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n"); + return -ENOMEM; + } + + err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm); + if (err) { + mthca_err(mdev, "MAP_ICM_AUX returned %d, aborting.\n", err); + goto err_free_aux; + } + + err = mthca_map_eq_icm(mdev, init_hca->eqc_base); + if (err) { + mthca_err(mdev, "Failed to map EQ context memory, aborting.\n"); + goto err_unmap_aux; + } + + /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */ + mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * mdev->limits.mtt_seg_size, + dma_get_cache_alignment()) / mdev->limits.mtt_seg_size; + + mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base, + mdev->limits.mtt_seg_size, + mdev->limits.num_mtt_segs, + mdev->limits.reserved_mtts, + 1, 0); + if (!mdev->mr_table.mtt_table) { + mthca_err(mdev, "Failed to map MTT context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_eq; + } + + mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base, + dev_lim->mpt_entry_sz, + mdev->limits.num_mpts, + mdev->limits.reserved_mrws, + 1, 1); + if (!mdev->mr_table.mpt_table) { + mthca_err(mdev, "Failed to map MPT context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_mtt; + } + + mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base, + dev_lim->qpc_entry_sz, + mdev->limits.num_qps, + mdev->limits.reserved_qps, + 0, 0); + if (!mdev->qp_table.qp_table) { + mthca_err(mdev, "Failed to map QP context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_mpt; + } + + mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base, + dev_lim->eqpc_entry_sz, + mdev->limits.num_qps, + mdev->limits.reserved_qps, + 0, 0); + if (!mdev->qp_table.eqp_table) { + mthca_err(mdev, "Failed to map EQP context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_qp; + } + + mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base, + MTHCA_RDB_ENTRY_SIZE, + mdev->limits.num_qps << + mdev->qp_table.rdb_shift, 0, + 0, 0); + if (!mdev->qp_table.rdb_table) { + mthca_err(mdev, "Failed to map RDB context memory, aborting\n"); + err = -ENOMEM; + goto err_unmap_eqp; + } + + mdev->cq_table.table = 
mthca_alloc_icm_table(mdev, init_hca->cqc_base, + dev_lim->cqc_entry_sz, + mdev->limits.num_cqs, + mdev->limits.reserved_cqs, + 0, 0); + if (!mdev->cq_table.table) { + mthca_err(mdev, "Failed to map CQ context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_rdb; + } + + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) { + mdev->srq_table.table = + mthca_alloc_icm_table(mdev, init_hca->srqc_base, + dev_lim->srq_entry_sz, + mdev->limits.num_srqs, + mdev->limits.reserved_srqs, + 0, 0); + if (!mdev->srq_table.table) { + mthca_err(mdev, "Failed to map SRQ context memory, " + "aborting.\n"); + err = -ENOMEM; + goto err_unmap_cq; + } + } + + /* + * It's not strictly required, but for simplicity just map the + * whole multicast group table now. The table isn't very big + * and it's a lot easier than trying to track ref counts. + */ + mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base, + MTHCA_MGM_ENTRY_SIZE, + mdev->limits.num_mgms + + mdev->limits.num_amgms, + mdev->limits.num_mgms + + mdev->limits.num_amgms, + 0, 0); + if (!mdev->mcg_table.table) { + mthca_err(mdev, "Failed to map MCG context memory, aborting.\n"); + err = -ENOMEM; + goto err_unmap_srq; + } + + return 0; + +err_unmap_srq: + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + mthca_free_icm_table(mdev, mdev->srq_table.table); + +err_unmap_cq: + mthca_free_icm_table(mdev, mdev->cq_table.table); + +err_unmap_rdb: + mthca_free_icm_table(mdev, mdev->qp_table.rdb_table); + +err_unmap_eqp: + mthca_free_icm_table(mdev, mdev->qp_table.eqp_table); + +err_unmap_qp: + mthca_free_icm_table(mdev, mdev->qp_table.qp_table); + +err_unmap_mpt: + mthca_free_icm_table(mdev, mdev->mr_table.mpt_table); + +err_unmap_mtt: + mthca_free_icm_table(mdev, mdev->mr_table.mtt_table); + +err_unmap_eq: + mthca_unmap_eq_icm(mdev); + +err_unmap_aux: + mthca_UNMAP_ICM_AUX(mdev); + +err_free_aux: + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); + + return err; +} + +static void mthca_free_icms(struct mthca_dev *mdev) +{ + + mthca_free_icm_table(mdev, mdev->mcg_table.table); + if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + mthca_free_icm_table(mdev, mdev->srq_table.table); + mthca_free_icm_table(mdev, mdev->cq_table.table); + mthca_free_icm_table(mdev, mdev->qp_table.rdb_table); + mthca_free_icm_table(mdev, mdev->qp_table.eqp_table); + mthca_free_icm_table(mdev, mdev->qp_table.qp_table); + mthca_free_icm_table(mdev, mdev->mr_table.mpt_table); + mthca_free_icm_table(mdev, mdev->mr_table.mtt_table); + mthca_unmap_eq_icm(mdev); + + mthca_UNMAP_ICM_AUX(mdev); + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); +} + +static int mthca_init_arbel(struct mthca_dev *mdev) +{ + struct mthca_dev_lim dev_lim; + struct mthca_profile profile; + struct mthca_init_hca_param init_hca; + s64 icm_size; + int err; + + err = mthca_QUERY_FW(mdev); + if (err) { + mthca_err(mdev, "QUERY_FW command failed %d, aborting.\n", err); + return err; + } + + err = mthca_ENABLE_LAM(mdev); + if (err == -EAGAIN) { + mthca_dbg(mdev, "No HCA-attached memory (running in MemFree mode)\n"); + mdev->mthca_flags |= MTHCA_FLAG_NO_LAM; + } else if (err) { + mthca_err(mdev, "ENABLE_LAM returned %d, aborting.\n", err); + return err; + } + + err = mthca_load_fw(mdev); + if (err) { + mthca_err(mdev, "Loading FW returned %d, aborting.\n", err); + goto err_disable; + } + + err = mthca_dev_lim(mdev, &dev_lim); + if (err) { + mthca_err(mdev, "QUERY_DEV_LIM returned %d, aborting.\n", err); + goto err_stop_fw; + } + + profile = hca_profile; + profile.num_uar = dev_lim.uar_size / PAGE_SIZE; + profile.num_udav = 0; + 
if (mdev->mthca_flags & MTHCA_FLAG_SRQ) + profile.num_srq = dev_lim.max_srqs; + + icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca); + if (icm_size < 0) { + err = icm_size; + goto err_stop_fw; + } + + err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size); + if (err) + goto err_stop_fw; + + err = mthca_INIT_HCA(mdev, &init_hca); + if (err) { + mthca_err(mdev, "INIT_HCA command returned %d, aborting.\n", err); + goto err_free_icm; + } + + return 0; + +err_free_icm: + mthca_free_icms(mdev); + +err_stop_fw: + mthca_UNMAP_FA(mdev); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); + +err_disable: + if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) + mthca_DISABLE_LAM(mdev); + + return err; +} + +static void mthca_close_hca(struct mthca_dev *mdev) +{ + mthca_CLOSE_HCA(mdev, 0); + + if (mthca_is_memfree(mdev)) { + mthca_free_icms(mdev); + + mthca_UNMAP_FA(mdev); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); + + if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) + mthca_DISABLE_LAM(mdev); + } else + mthca_SYS_DIS(mdev); +} + +static int mthca_init_hca(struct mthca_dev *mdev) +{ + int err; + struct mthca_adapter adapter; + + if (mthca_is_memfree(mdev)) + err = mthca_init_arbel(mdev); + else + err = mthca_init_tavor(mdev); + + if (err) + return err; + + err = mthca_QUERY_ADAPTER(mdev, &adapter); + if (err) { + mthca_err(mdev, "QUERY_ADAPTER command returned %d, aborting.\n", err); + goto err_close; + } + + mdev->eq_table.inta_pin = adapter.inta_pin; + if (!mthca_is_memfree(mdev)) + mdev->rev_id = adapter.revision_id; + memcpy(mdev->board_id, adapter.board_id, sizeof mdev->board_id); + + return 0; + +err_close: + mthca_close_hca(mdev); + return err; +} + +static int mthca_setup_hca(struct mthca_dev *dev) +{ + int err; + + MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock); + + err = mthca_init_uar_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "user access region table, aborting.\n"); + return err; + } + + err = mthca_uar_alloc(dev, &dev->driver_uar); + if (err) { + mthca_err(dev, "Failed to allocate driver access region, " + "aborting.\n"); + goto err_uar_table_free; + } + + dev->kar = ioremap((phys_addr_t) dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + if (!dev->kar) { + mthca_err(dev, "Couldn't map kernel access region, " + "aborting.\n"); + err = -ENOMEM; + goto err_uar_free; + } + + err = mthca_init_pd_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "protection domain table, aborting.\n"); + goto err_kar_unmap; + } + + err = mthca_init_mr_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "memory region table, aborting.\n"); + goto err_pd_table_free; + } + + err = mthca_pd_alloc(dev, 1, &dev->driver_pd); + if (err) { + mthca_err(dev, "Failed to create driver PD, " + "aborting.\n"); + goto err_mr_table_free; + } + + err = mthca_init_eq_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "event queue table, aborting.\n"); + goto err_pd_free; + } + + err = mthca_cmd_use_events(dev); + if (err) { + mthca_err(dev, "Failed to switch to event-driven " + "firmware commands, aborting.\n"); + goto err_eq_table_free; + } + + err = mthca_NOP(dev); + if (err) { + if (dev->mthca_flags & MTHCA_FLAG_MSI_X) { + mthca_warn(dev, "NOP command failed to generate interrupt " + "(IRQ %d).\n", + dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector); + mthca_warn(dev, "Trying again with MSI-X disabled.\n"); + } else { + mthca_err(dev, "NOP command failed to generate interrupt " + "(IRQ %d), aborting.\n", + dev->pdev->irq); + mthca_err(dev, 
"BIOS or ACPI interrupt routing problem?\n"); + } + + goto err_cmd_poll; + } + + mthca_dbg(dev, "NOP command IRQ test passed\n"); + + err = mthca_init_cq_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "completion queue table, aborting.\n"); + goto err_cmd_poll; + } + + err = mthca_init_srq_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "shared receive queue table, aborting.\n"); + goto err_cq_table_free; + } + + err = mthca_init_qp_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "queue pair table, aborting.\n"); + goto err_srq_table_free; + } + + err = mthca_init_av_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "address vector table, aborting.\n"); + goto err_qp_table_free; + } + + err = mthca_init_mcg_table(dev); + if (err) { + mthca_err(dev, "Failed to initialize " + "multicast group table, aborting.\n"); + goto err_av_table_free; + } + + return 0; + +err_av_table_free: + mthca_cleanup_av_table(dev); + +err_qp_table_free: + mthca_cleanup_qp_table(dev); + +err_srq_table_free: + mthca_cleanup_srq_table(dev); + +err_cq_table_free: + mthca_cleanup_cq_table(dev); + +err_cmd_poll: + mthca_cmd_use_polling(dev); + +err_eq_table_free: + mthca_cleanup_eq_table(dev); + +err_pd_free: + mthca_pd_free(dev, &dev->driver_pd); + +err_mr_table_free: + mthca_cleanup_mr_table(dev); + +err_pd_table_free: + mthca_cleanup_pd_table(dev); + +err_kar_unmap: + iounmap(dev->kar); + +err_uar_free: + mthca_uar_free(dev, &dev->driver_uar); + +err_uar_table_free: + mthca_cleanup_uar_table(dev); + return err; +} + +static int mthca_enable_msi_x(struct mthca_dev *mdev) +{ + int err; + + err = pci_alloc_irq_vectors(mdev->pdev, 3, 3, PCI_IRQ_MSIX); + if (err < 0) + return err; + + mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = + pci_irq_vector(mdev->pdev, 0); + mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = + pci_irq_vector(mdev->pdev, 1); + mdev->eq_table.eq[MTHCA_EQ_CMD ].msi_x_vector = + pci_irq_vector(mdev->pdev, 2); + + return 0; +} + +/* Types of supported HCA */ +enum { + TAVOR, /* MT23108 */ + ARBEL_COMPAT, /* MT25208 in Tavor compat mode */ + ARBEL_NATIVE, /* MT25208 with extended features */ + SINAI /* MT25204 */ +}; + +#define MTHCA_FW_VER(major, minor, subminor) \ + (((u64) (major) << 32) | ((u64) (minor) << 16) | (u64) (subminor)) + +static struct { + u64 latest_fw; + u32 flags; +} mthca_hca_table[] = { + [TAVOR] = { .latest_fw = MTHCA_FW_VER(3, 5, 0), + .flags = 0 }, + [ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200), + .flags = MTHCA_FLAG_PCIE }, + [ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 3, 0), + .flags = MTHCA_FLAG_MEMFREE | + MTHCA_FLAG_PCIE }, + [SINAI] = { .latest_fw = MTHCA_FW_VER(1, 2, 0), + .flags = MTHCA_FLAG_MEMFREE | + MTHCA_FLAG_PCIE | + MTHCA_FLAG_SINAI_OPT } +}; + +static int __mthca_init_one(struct pci_dev *pdev, int hca_type) +{ + int ddr_hidden = 0; + int err; + struct mthca_dev *mdev; + + printk(KERN_INFO PFX "Initializing %s\n", + pci_name(pdev)); + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, " + "aborting.\n"); + return err; + } + + /* + * Check for BARs. 
We expect 0: 1MB, 2: 8MB, 4: DDR (may not + * be present) + */ + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) || + pci_resource_len(pdev, 0) != 1 << 20) { + dev_err(&pdev->dev, "Missing DCS, aborting.\n"); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing UAR, aborting.\n"); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM)) + ddr_hidden = 1; + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Cannot obtain PCI resources, " + "aborting.\n"); + goto err_disable_pdev; + } + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); + goto err_free_res; + } + } + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit " + "consistent PCI DMA mask.\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " + "aborting.\n"); + goto err_free_res; + } + } + + /* We can handle large RDMA requests, so allow larger segments. */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + + mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev); + if (!mdev) { + dev_err(&pdev->dev, "Device struct alloc failed, " + "aborting.\n"); + err = -ENOMEM; + goto err_free_res; + } + + mdev->pdev = pdev; + + mdev->mthca_flags = mthca_hca_table[hca_type].flags; + if (ddr_hidden) + mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; + + /* + * Now reset the HCA before we touch the PCI capabilities or + * attempt a firmware command, since a boot ROM may have left + * the HCA in an undefined state. 
+ */ + err = mthca_reset(mdev); + if (err) { + mthca_err(mdev, "Failed to reset HCA, aborting.\n"); + goto err_free_dev; + } + + if (mthca_cmd_init(mdev)) { + mthca_err(mdev, "Failed to init command interface, aborting.\n"); + goto err_free_dev; + } + + err = mthca_tune_pci(mdev); + if (err) + goto err_cmd; + + err = mthca_init_hca(mdev); + if (err) + goto err_cmd; + + if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) { + mthca_warn(mdev, "HCA FW version %d.%d.%03d is old (%d.%d.%03d is current).\n", + (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, + (int) (mdev->fw_ver & 0xffff), + (int) (mthca_hca_table[hca_type].latest_fw >> 32), + (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff, + (int) (mthca_hca_table[hca_type].latest_fw & 0xffff)); + mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); + } + + if (msi_x && !mthca_enable_msi_x(mdev)) + mdev->mthca_flags |= MTHCA_FLAG_MSI_X; + + err = mthca_setup_hca(mdev); + if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) { + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_free_irq_vectors(pdev); + mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X; + + err = mthca_setup_hca(mdev); + } + + if (err) + goto err_close; + + err = mthca_register_device(mdev); + if (err) + goto err_cleanup; + + err = mthca_create_agents(mdev); + if (err) + goto err_unregister; + + pci_set_drvdata(pdev, mdev); + mdev->hca_type = hca_type; + + mdev->active = true; + + return 0; + +err_unregister: + mthca_unregister_device(mdev); + +err_cleanup: + mthca_cleanup_mcg_table(mdev); + mthca_cleanup_av_table(mdev); + mthca_cleanup_qp_table(mdev); + mthca_cleanup_srq_table(mdev); + mthca_cleanup_cq_table(mdev); + mthca_cmd_use_polling(mdev); + mthca_cleanup_eq_table(mdev); + + mthca_pd_free(mdev, &mdev->driver_pd); + + mthca_cleanup_mr_table(mdev); + mthca_cleanup_pd_table(mdev); + mthca_cleanup_uar_table(mdev); + +err_close: + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_free_irq_vectors(pdev); + + mthca_close_hca(mdev); + +err_cmd: + mthca_cmd_cleanup(mdev); + +err_free_dev: + ib_dealloc_device(&mdev->ib_dev); + +err_free_res: + pci_release_regions(pdev); + +err_disable_pdev: + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + return err; +} + +static void __mthca_remove_one(struct pci_dev *pdev) +{ + struct mthca_dev *mdev = pci_get_drvdata(pdev); + int p; + + if (mdev) { + mthca_free_agents(mdev); + mthca_unregister_device(mdev); + + for (p = 1; p <= mdev->limits.num_ports; ++p) + mthca_CLOSE_IB(mdev, p); + + mthca_cleanup_mcg_table(mdev); + mthca_cleanup_av_table(mdev); + mthca_cleanup_qp_table(mdev); + mthca_cleanup_srq_table(mdev); + mthca_cleanup_cq_table(mdev); + mthca_cmd_use_polling(mdev); + mthca_cleanup_eq_table(mdev); + + mthca_pd_free(mdev, &mdev->driver_pd); + + mthca_cleanup_mr_table(mdev); + mthca_cleanup_pd_table(mdev); + + iounmap(mdev->kar); + mthca_uar_free(mdev, &mdev->driver_uar); + mthca_cleanup_uar_table(mdev); + mthca_close_hca(mdev); + mthca_cmd_cleanup(mdev); + + if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) + pci_free_irq_vectors(pdev); + + ib_dealloc_device(&mdev->ib_dev); + pci_release_regions(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + } +} + +int __mthca_restart_one(struct pci_dev *pdev) +{ + struct mthca_dev *mdev; + int hca_type; + + mdev = pci_get_drvdata(pdev); + if (!mdev) + return -ENODEV; + hca_type = mdev->hca_type; + __mthca_remove_one(pdev); + return __mthca_init_one(pdev, hca_type); +} + +static int mthca_init_one(struct pci_dev *pdev, const struct 
pci_device_id *id) +{ + int ret; + + mutex_lock(&mthca_device_mutex); + + printk_once(KERN_INFO "%s", mthca_version); + + if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { + printk(KERN_ERR PFX "%s has invalid driver data %lx\n", + pci_name(pdev), id->driver_data); + mutex_unlock(&mthca_device_mutex); + return -ENODEV; + } + + ret = __mthca_init_one(pdev, id->driver_data); + + mutex_unlock(&mthca_device_mutex); + + return ret; +} + +static void mthca_remove_one(struct pci_dev *pdev) +{ + mutex_lock(&mthca_device_mutex); + __mthca_remove_one(pdev); + mutex_unlock(&mthca_device_mutex); +} + +static const struct pci_device_id mthca_pci_table[] = { + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), + .driver_data = TAVOR }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR), + .driver_data = TAVOR }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT), + .driver_data = ARBEL_COMPAT }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT), + .driver_data = ARBEL_COMPAT }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL), + .driver_data = ARBEL_NATIVE }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL), + .driver_data = ARBEL_NATIVE }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI), + .driver_data = SINAI }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI), + .driver_data = SINAI }, + { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI_OLD), + .driver_data = SINAI }, + { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI_OLD), + .driver_data = SINAI }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mthca_pci_table); + +static struct pci_driver mthca_driver = { + .name = DRV_NAME, + .id_table = mthca_pci_table, + .probe = mthca_init_one, + .remove = mthca_remove_one, +}; + +static void __init __mthca_check_profile_val(const char *name, int *pval, + int pval_default) +{ + /* value must be positive and power of 2 */ + int old_pval = *pval; + + if (old_pval <= 0) + *pval = pval_default; + else + *pval = roundup_pow_of_two(old_pval); + + if (old_pval != *pval) { + printk(KERN_WARNING PFX "Invalid value %d for %s in module parameter.\n", + old_pval, name); + printk(KERN_WARNING PFX "Corrected %s to %d.\n", name, *pval); + } +} + +#define mthca_check_profile_val(name, default) \ + __mthca_check_profile_val(#name, &hca_profile.name, default) + +static void __init mthca_validate_profile(void) +{ + mthca_check_profile_val(num_qp, MTHCA_DEFAULT_NUM_QP); + mthca_check_profile_val(rdb_per_qp, MTHCA_DEFAULT_RDB_PER_QP); + mthca_check_profile_val(num_cq, MTHCA_DEFAULT_NUM_CQ); + mthca_check_profile_val(num_mcg, MTHCA_DEFAULT_NUM_MCG); + mthca_check_profile_val(num_mpt, MTHCA_DEFAULT_NUM_MPT); + mthca_check_profile_val(num_mtt, MTHCA_DEFAULT_NUM_MTT); + mthca_check_profile_val(num_udav, MTHCA_DEFAULT_NUM_UDAV); + mthca_check_profile_val(fmr_reserved_mtts, MTHCA_DEFAULT_NUM_RESERVED_MTTS); + + if (hca_profile.fmr_reserved_mtts >= hca_profile.num_mtt) { + printk(KERN_WARNING PFX "Invalid fmr_reserved_mtts module parameter %d.\n", + hca_profile.fmr_reserved_mtts); + printk(KERN_WARNING PFX "(Must be smaller than num_mtt %d)\n", + hca_profile.num_mtt); + hca_profile.fmr_reserved_mtts = hca_profile.num_mtt / 2; + printk(KERN_WARNING PFX "Corrected fmr_reserved_mtts to %d.\n", + hca_profile.fmr_reserved_mtts); + } + + if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) { + printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). 
Using default - %d\n", + log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8)); + log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8); + } +} + +static int __init mthca_init(void) +{ + int ret; + + mthca_validate_profile(); + + ret = mthca_catas_init(); + if (ret) + return ret; + + ret = pci_register_driver(&mthca_driver); + if (ret < 0) { + mthca_catas_cleanup(); + return ret; + } + + return 0; +} + +static void __exit mthca_cleanup(void) +{ + pci_unregister_driver(&mthca_driver); + mthca_catas_cleanup(); +} + +module_init(mthca_init); +module_exit(mthca_cleanup); diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c new file mode 100644 index 0000000..6304ae8 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_mcg.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +struct mthca_mgm { + __be32 next_gid_index; + u32 reserved[3]; + u8 gid[16]; + __be32 qp[MTHCA_QP_PER_MGM]; +}; + +static const u8 zero_gid[16]; /* automatically initialized to 0 */ + +/* + * Caller must hold MCG table semaphore. gid and mgm parameters must + * be properly aligned for command interface. + * + * Returns 0 unless a firmware command error occurs. + * + * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1 + * and *mgm holds MGM entry. + * + * if GID is found in AMGM, *index = index in AMGM, *prev = index of + * previous entry in hash chain and *mgm holds AMGM entry. + * + * If no AMGM exists for given gid, *index = -1, *prev = index of last + * entry in hash chain and *mgm holds end of hash chain. 
+ */ +static int find_mgm(struct mthca_dev *dev, + u8 *gid, struct mthca_mailbox *mgm_mailbox, + u16 *hash, int *prev, int *index) +{ + struct mthca_mailbox *mailbox; + struct mthca_mgm *mgm = mgm_mailbox->buf; + u8 *mgid; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return -ENOMEM; + mgid = mailbox->buf; + + memcpy(mgid, gid, 16); + + err = mthca_MGID_HASH(dev, mailbox, hash); + if (err) { + mthca_err(dev, "MGID_HASH failed (%d)\n", err); + goto out; + } + + if (0) + mthca_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash); + + *index = *hash; + *prev = -1; + + do { + err = mthca_READ_MGM(dev, *index, mgm_mailbox); + if (err) { + mthca_err(dev, "READ_MGM failed (%d)\n", err); + goto out; + } + + if (!memcmp(mgm->gid, zero_gid, 16)) { + if (*index != *hash) { + mthca_err(dev, "Found zero MGID in AMGM.\n"); + err = -EINVAL; + } + goto out; + } + + if (!memcmp(mgm->gid, gid, 16)) + goto out; + + *prev = *index; + *index = be32_to_cpu(mgm->next_gid_index) >> 6; + } while (*index); + + *index = -1; + + out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_mailbox *mailbox; + struct mthca_mgm *mgm; + u16 hash; + int index, prev; + int link = 0; + int i; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mgm = mailbox->buf; + + mutex_lock(&dev->mcg_table.mutex); + + err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index); + if (err) + goto out; + + if (index != -1) { + if (!memcmp(mgm->gid, zero_gid, 16)) + memcpy(mgm->gid, gid->raw, 16); + } else { + link = 1; + + index = mthca_alloc(&dev->mcg_table.alloc); + if (index == -1) { + mthca_err(dev, "No AMGM entries left\n"); + err = -ENOMEM; + goto out; + } + + err = mthca_READ_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "READ_MGM failed (%d)\n", err); + goto out; + } + memset(mgm, 0, sizeof *mgm); + memcpy(mgm->gid, gid->raw, 16); + } + + for (i = 0; i < MTHCA_QP_PER_MGM; ++i) + if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31))) { + mthca_dbg(dev, "QP %06x already a member of MGM\n", + ibqp->qp_num); + err = 0; + goto out; + } else if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) { + mgm->qp[i] = cpu_to_be32(ibqp->qp_num | (1 << 31)); + break; + } + + if (i == MTHCA_QP_PER_MGM) { + mthca_err(dev, "MGM at index %x is full.\n", index); + err = -ENOMEM; + goto out; + } + + err = mthca_WRITE_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM failed %d\n", err); + err = -EINVAL; + goto out; + } + + if (!link) + goto out; + + err = mthca_READ_MGM(dev, prev, mailbox); + if (err) { + mthca_err(dev, "READ_MGM failed %d\n", err); + goto out; + } + + mgm->next_gid_index = cpu_to_be32(index << 6); + + err = mthca_WRITE_MGM(dev, prev, mailbox); + if (err) + mthca_err(dev, "WRITE_MGM returned %d\n", err); + + out: + if (err && link && index != -1) { + BUG_ON(index < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, index); + } + mutex_unlock(&dev->mcg_table.mutex); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_mailbox *mailbox; + struct mthca_mgm *mgm; + u16 hash; + int prev, index; + int i, loc; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mgm = 
mailbox->buf; + + mutex_lock(&dev->mcg_table.mutex); + + err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index); + if (err) + goto out; + + if (index == -1) { + mthca_err(dev, "MGID %pI6 not found\n", gid->raw); + err = -EINVAL; + goto out; + } + + for (loc = -1, i = 0; i < MTHCA_QP_PER_MGM; ++i) { + if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31))) + loc = i; + if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) + break; + } + + if (loc == -1) { + mthca_err(dev, "QP %06x not found in MGM\n", ibqp->qp_num); + err = -EINVAL; + goto out; + } + + mgm->qp[loc] = mgm->qp[i - 1]; + mgm->qp[i - 1] = 0; + + err = mthca_WRITE_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM returned %d\n", err); + goto out; + } + + if (i != 1) + goto out; + + if (prev == -1) { + /* Remove entry from MGM */ + int amgm_index_to_free = be32_to_cpu(mgm->next_gid_index) >> 6; + if (amgm_index_to_free) { + err = mthca_READ_MGM(dev, amgm_index_to_free, + mailbox); + if (err) { + mthca_err(dev, "READ_MGM returned %d\n", err); + goto out; + } + } else + memset(mgm->gid, 0, 16); + + err = mthca_WRITE_MGM(dev, index, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM returned %d\n", err); + goto out; + } + if (amgm_index_to_free) { + BUG_ON(amgm_index_to_free < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, amgm_index_to_free); + } + } else { + /* Remove entry from AMGM */ + int curr_next_index = be32_to_cpu(mgm->next_gid_index) >> 6; + err = mthca_READ_MGM(dev, prev, mailbox); + if (err) { + mthca_err(dev, "READ_MGM returned %d\n", err); + goto out; + } + + mgm->next_gid_index = cpu_to_be32(curr_next_index << 6); + + err = mthca_WRITE_MGM(dev, prev, mailbox); + if (err) { + mthca_err(dev, "WRITE_MGM returned %d\n", err); + goto out; + } + BUG_ON(index < dev->limits.num_mgms); + mthca_free(&dev->mcg_table.alloc, index); + } + + out: + mutex_unlock(&dev->mcg_table.mutex); + + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_init_mcg_table(struct mthca_dev *dev) +{ + int err; + int table_size = dev->limits.num_mgms + dev->limits.num_amgms; + + err = mthca_alloc_init(&dev->mcg_table.alloc, + table_size, + table_size - 1, + dev->limits.num_mgms); + if (err) + return err; + + mutex_init(&dev->mcg_table.mutex); + + return 0; +} + +void mthca_cleanup_mcg_table(struct mthca_dev *dev) +{ + mthca_alloc_cleanup(&dev->mcg_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c new file mode 100644 index 0000000..2fe503e --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -0,0 +1,758 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +#include "mthca_memfree.h" +#include "mthca_dev.h" +#include "mthca_cmd.h" + +/* + * We allocate in as big chunks as we can, up to a maximum of 256 KB + * per chunk. + */ +enum { + MTHCA_ICM_ALLOC_SIZE = 1 << 18, + MTHCA_TABLE_CHUNK_SIZE = 1 << 18 +}; + +struct mthca_user_db_table { + struct mutex mutex; + struct { + u64 uvirt; + struct scatterlist mem; + int refcount; + } page[0]; +}; + +static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) +{ + int i; + + if (chunk->nsg > 0) + pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + for (i = 0; i < chunk->npages; ++i) + __free_pages(sg_page(&chunk->mem[i]), + get_order(chunk->mem[i].length)); +} + +static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) +{ + int i; + + for (i = 0; i < chunk->npages; ++i) { + dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, + lowmem_page_address(sg_page(&chunk->mem[i])), + sg_dma_address(&chunk->mem[i])); + } +} + +void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent) +{ + struct mthca_icm_chunk *chunk, *tmp; + + if (!icm) + return; + + list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { + if (coherent) + mthca_free_icm_coherent(dev, chunk); + else + mthca_free_icm_pages(dev, chunk); + + kfree(chunk); + } + + kfree(icm); +} + +static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask) +{ + struct page *page; + + /* + * Use __GFP_ZERO because buggy firmware assumes ICM pages are + * cleared, and subtle failures are seen if they aren't. 
+ */ + page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (!page) + return -ENOMEM; + + sg_set_page(mem, page, PAGE_SIZE << order, 0); + return 0; +} + +static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem, + int order, gfp_t gfp_mask) +{ + void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem), + gfp_mask); + if (!buf) + return -ENOMEM; + + sg_set_buf(mem, buf, PAGE_SIZE << order); + BUG_ON(mem->offset); + sg_dma_len(mem) = PAGE_SIZE << order; + return 0; +} + +struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, + gfp_t gfp_mask, int coherent) +{ + struct mthca_icm *icm; + struct mthca_icm_chunk *chunk = NULL; + int cur_order; + int ret; + + /* We use sg_set_buf for coherent allocs, which assumes low memory */ + BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); + + icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!icm) + return icm; + + icm->refcount = 0; + INIT_LIST_HEAD(&icm->chunk_list); + + cur_order = get_order(MTHCA_ICM_ALLOC_SIZE); + + while (npages > 0) { + if (!chunk) { + chunk = kmalloc(sizeof *chunk, + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!chunk) + goto fail; + + sg_init_table(chunk->mem, MTHCA_ICM_CHUNK_LEN); + chunk->npages = 0; + chunk->nsg = 0; + list_add_tail(&chunk->list, &icm->chunk_list); + } + + while (1 << cur_order > npages) + --cur_order; + + if (coherent) + ret = mthca_alloc_icm_coherent(&dev->pdev->dev, + &chunk->mem[chunk->npages], + cur_order, gfp_mask); + else + ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages], + cur_order, gfp_mask); + + if (!ret) { + ++chunk->npages; + + if (coherent) + ++chunk->nsg; + else if (chunk->npages == MTHCA_ICM_CHUNK_LEN) { + chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + if (chunk->nsg <= 0) + goto fail; + } + + if (chunk->npages == MTHCA_ICM_CHUNK_LEN) + chunk = NULL; + + npages -= 1 << cur_order; + } else { + --cur_order; + if (cur_order < 0) + goto fail; + } + } + + if (!coherent && chunk) { + chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + if (chunk->nsg <= 0) + goto fail; + } + + return icm; + +fail: + mthca_free_icm(dev, icm, coherent); + return NULL; +} + +int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj) +{ + int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE; + int ret = 0; + + mutex_lock(&table->mutex); + + if (table->icm[i]) { + ++table->icm[i]->refcount; + goto out; + } + + table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT, + (table->lowmem ? 
GFP_KERNEL : GFP_HIGHUSER) | + __GFP_NOWARN, table->coherent); + if (!table->icm[i]) { + ret = -ENOMEM; + goto out; + } + + if (mthca_MAP_ICM(dev, table->icm[i], + table->virt + i * MTHCA_TABLE_CHUNK_SIZE)) { + mthca_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + ret = -ENOMEM; + goto out; + } + + ++table->icm[i]->refcount; + +out: + mutex_unlock(&table->mutex); + return ret; +} + +void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE; + + mutex_lock(&table->mutex); + + if (--table->icm[i]->refcount == 0) { + mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE, + MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE); + mthca_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + } + + mutex_unlock(&table->mutex); +} + +void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle) +{ + int idx, offset, dma_offset, i; + struct mthca_icm_chunk *chunk; + struct mthca_icm *icm; + struct page *page = NULL; + + if (!table->lowmem) + return NULL; + + mutex_lock(&table->mutex); + + idx = (obj & (table->num_obj - 1)) * table->obj_size; + icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE]; + dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE; + + if (!icm) + goto out; + + list_for_each_entry(chunk, &icm->chunk_list, list) { + for (i = 0; i < chunk->npages; ++i) { + if (dma_handle && dma_offset >= 0) { + if (sg_dma_len(&chunk->mem[i]) > dma_offset) + *dma_handle = sg_dma_address(&chunk->mem[i]) + + dma_offset; + dma_offset -= sg_dma_len(&chunk->mem[i]); + } + /* DMA mapping can merge pages but not split them, + * so if we found the page, dma_handle has already + * been assigned to. */ + if (chunk->mem[i].length > offset) { + page = sg_page(&chunk->mem[i]); + goto out; + } + offset -= chunk->mem[i].length; + } + } + +out: + mutex_unlock(&table->mutex); + return page ? 
lowmem_page_address(page) + offset : NULL; +} + +int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end) +{ + int inc = MTHCA_TABLE_CHUNK_SIZE / table->obj_size; + int i, err; + + for (i = start; i <= end; i += inc) { + err = mthca_table_get(dev, table, i); + if (err) + goto fail; + } + + return 0; + +fail: + while (i > start) { + i -= inc; + mthca_table_put(dev, table, i); + } + + return err; +} + +void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + for (i = start; i <= end; i += MTHCA_TABLE_CHUNK_SIZE / table->obj_size) + mthca_table_put(dev, table, i); +} + +struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, + u64 virt, int obj_size, + int nobj, int reserved, + int use_lowmem, int use_coherent) +{ + struct mthca_icm_table *table; + int obj_per_chunk; + int num_icm; + unsigned chunk_size; + int i; + + obj_per_chunk = MTHCA_TABLE_CHUNK_SIZE / obj_size; + num_icm = DIV_ROUND_UP(nobj, obj_per_chunk); + + table = kmalloc(sizeof *table + num_icm * sizeof *table->icm, GFP_KERNEL); + if (!table) + return NULL; + + table->virt = virt; + table->num_icm = num_icm; + table->num_obj = nobj; + table->obj_size = obj_size; + table->lowmem = use_lowmem; + table->coherent = use_coherent; + mutex_init(&table->mutex); + + for (i = 0; i < num_icm; ++i) + table->icm[i] = NULL; + + for (i = 0; i * MTHCA_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) { + chunk_size = MTHCA_TABLE_CHUNK_SIZE; + if ((i + 1) * MTHCA_TABLE_CHUNK_SIZE > nobj * obj_size) + chunk_size = nobj * obj_size - i * MTHCA_TABLE_CHUNK_SIZE; + + table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT, + (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) | + __GFP_NOWARN, use_coherent); + if (!table->icm[i]) + goto err; + if (mthca_MAP_ICM(dev, table->icm[i], + virt + i * MTHCA_TABLE_CHUNK_SIZE)) { + mthca_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + goto err; + } + + /* + * Add a reference to this ICM chunk so that it never + * gets freed (since it contains reserved firmware objects). 
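+ * Such reserved chunks are therefore only released when the
+ * whole table is destroyed in mthca_free_icm_table().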
+ */ + ++table->icm[i]->refcount; + } + + return table; + +err: + for (i = 0; i < num_icm; ++i) + if (table->icm[i]) { + mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE, + MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE); + mthca_free_icm(dev, table->icm[i], table->coherent); + } + + kfree(table); + + return NULL; +} + +void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table) +{ + int i; + + for (i = 0; i < table->num_icm; ++i) + if (table->icm[i]) { + mthca_UNMAP_ICM(dev, + table->virt + i * MTHCA_TABLE_CHUNK_SIZE, + MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE); + mthca_free_icm(dev, table->icm[i], table->coherent); + } + + kfree(table); +} + +static u64 mthca_uarc_virt(struct mthca_dev *dev, struct mthca_uar *uar, int page) +{ + return dev->uar_table.uarc_base + + uar->index * dev->uar_table.uarc_size + + page * MTHCA_ICM_PAGE_SIZE; +} + +int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index, u64 uaddr) +{ + struct page *pages[1]; + int ret = 0; + int i; + + if (!mthca_is_memfree(dev)) + return 0; + + if (index < 0 || index > dev->uar_table.uarc_size / 8) + return -EINVAL; + + mutex_lock(&db_tab->mutex); + + i = index / MTHCA_DB_REC_PER_PAGE; + + if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE) || + (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) || + (uaddr & 4095)) { + ret = -EINVAL; + goto out; + } + + if (db_tab->page[i].refcount) { + ++db_tab->page[i].refcount; + goto out; + } + + ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages); + if (ret < 0) + goto out; + + sg_set_page(&db_tab->page[i].mem, pages[0], MTHCA_ICM_PAGE_SIZE, + uaddr & ~PAGE_MASK); + + ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + if (ret < 0) { + put_page(pages[0]); + goto out; + } + + ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem), + mthca_uarc_virt(dev, uar, i)); + if (ret) { + pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + put_page(sg_page(&db_tab->page[i].mem)); + goto out; + } + + db_tab->page[i].uvirt = uaddr; + db_tab->page[i].refcount = 1; + +out: + mutex_unlock(&db_tab->mutex); + return ret; +} + +void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index) +{ + if (!mthca_is_memfree(dev)) + return; + + /* + * To make our bookkeeping simpler, we don't unmap DB + * pages until we clean up the whole db table. 
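+ * Here we only drop the page's refcount; the actual UNMAP_ICM,
+ * pci_unmap_sg() and put_page() happen in mthca_cleanup_user_db_tab().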
+ */ + + mutex_lock(&db_tab->mutex); + + --db_tab->page[index / MTHCA_DB_REC_PER_PAGE].refcount; + + mutex_unlock(&db_tab->mutex); +} + +struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev) +{ + struct mthca_user_db_table *db_tab; + int npages; + int i; + + if (!mthca_is_memfree(dev)) + return NULL; + + npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; + db_tab = kmalloc(sizeof *db_tab + npages * sizeof *db_tab->page, GFP_KERNEL); + if (!db_tab) + return ERR_PTR(-ENOMEM); + + mutex_init(&db_tab->mutex); + for (i = 0; i < npages; ++i) { + db_tab->page[i].refcount = 0; + db_tab->page[i].uvirt = 0; + sg_init_table(&db_tab->page[i].mem, 1); + } + + return db_tab; +} + +void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + for (i = 0; i < dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; ++i) { + if (db_tab->page[i].uvirt) { + mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1); + pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + put_page(sg_page(&db_tab->page[i].mem)); + } + } + + kfree(db_tab); +} + +int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type, + u32 qn, __be32 **db) +{ + int group; + int start, end, dir; + int i, j; + struct mthca_db_page *page; + int ret = 0; + + mutex_lock(&dev->db_tab->mutex); + + switch (type) { + case MTHCA_DB_TYPE_CQ_ARM: + case MTHCA_DB_TYPE_SQ: + group = 0; + start = 0; + end = dev->db_tab->max_group1; + dir = 1; + break; + + case MTHCA_DB_TYPE_CQ_SET_CI: + case MTHCA_DB_TYPE_RQ: + case MTHCA_DB_TYPE_SRQ: + group = 1; + start = dev->db_tab->npages - 1; + end = dev->db_tab->min_group2; + dir = -1; + break; + + default: + ret = -EINVAL; + goto out; + } + + for (i = start; i != end; i += dir) + if (dev->db_tab->page[i].db_rec && + !bitmap_full(dev->db_tab->page[i].used, + MTHCA_DB_REC_PER_PAGE)) { + page = dev->db_tab->page + i; + goto found; + } + + for (i = start; i != end; i += dir) + if (!dev->db_tab->page[i].db_rec) { + page = dev->db_tab->page + i; + goto alloc; + } + + if (dev->db_tab->max_group1 >= dev->db_tab->min_group2 - 1) { + ret = -ENOMEM; + goto out; + } + + if (group == 0) + ++dev->db_tab->max_group1; + else + --dev->db_tab->min_group2; + + page = dev->db_tab->page + end; + +alloc: + page->db_rec = dma_zalloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + &page->mapping, GFP_KERNEL); + if (!page->db_rec) { + ret = -ENOMEM; + goto out; + } + + ret = mthca_MAP_ICM_page(dev, page->mapping, + mthca_uarc_virt(dev, &dev->driver_uar, i)); + if (ret) { + dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + page->db_rec, page->mapping); + goto out; + } + + bitmap_zero(page->used, MTHCA_DB_REC_PER_PAGE); + +found: + j = find_first_zero_bit(page->used, MTHCA_DB_REC_PER_PAGE); + set_bit(j, page->used); + + if (group == 1) + j = MTHCA_DB_REC_PER_PAGE - 1 - j; + + ret = i * MTHCA_DB_REC_PER_PAGE + j; + + page->db_rec[j] = cpu_to_be64((qn << 8) | (type << 5)); + + *db = (__be32 *) &page->db_rec[j]; + +out: + mutex_unlock(&dev->db_tab->mutex); + + return ret; +} + +void mthca_free_db(struct mthca_dev *dev, int type, int db_index) +{ + int i, j; + struct mthca_db_page *page; + + i = db_index / MTHCA_DB_REC_PER_PAGE; + j = db_index % MTHCA_DB_REC_PER_PAGE; + + page = dev->db_tab->page + i; + + mutex_lock(&dev->db_tab->mutex); + + page->db_rec[j] = 0; + if (i >= dev->db_tab->min_group2) + j = MTHCA_DB_REC_PER_PAGE - 1 - j; + clear_bit(j, page->used); + + if 
(bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) && + i >= dev->db_tab->max_group1 - 1) { + mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1); + + dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + page->db_rec, page->mapping); + page->db_rec = NULL; + + if (i == dev->db_tab->max_group1) { + --dev->db_tab->max_group1; + /* XXX may be able to unmap more pages now */ + } + if (i == dev->db_tab->min_group2) + ++dev->db_tab->min_group2; + } + + mutex_unlock(&dev->db_tab->mutex); +} + +int mthca_init_db_tab(struct mthca_dev *dev) +{ + int i; + + if (!mthca_is_memfree(dev)) + return 0; + + dev->db_tab = kmalloc(sizeof *dev->db_tab, GFP_KERNEL); + if (!dev->db_tab) + return -ENOMEM; + + mutex_init(&dev->db_tab->mutex); + + dev->db_tab->npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; + dev->db_tab->max_group1 = 0; + dev->db_tab->min_group2 = dev->db_tab->npages - 1; + + dev->db_tab->page = kmalloc(dev->db_tab->npages * + sizeof *dev->db_tab->page, + GFP_KERNEL); + if (!dev->db_tab->page) { + kfree(dev->db_tab); + return -ENOMEM; + } + + for (i = 0; i < dev->db_tab->npages; ++i) + dev->db_tab->page[i].db_rec = NULL; + + return 0; +} + +void mthca_cleanup_db_tab(struct mthca_dev *dev) +{ + int i; + + if (!mthca_is_memfree(dev)) + return; + + /* + * Because we don't always free our UARC pages when they + * become empty to make mthca_free_db() simpler we need to + * make a sweep through the doorbell pages and free any + * leftover pages now. + */ + for (i = 0; i < dev->db_tab->npages; ++i) { + if (!dev->db_tab->page[i].db_rec) + continue; + + if (!bitmap_empty(dev->db_tab->page[i].used, MTHCA_DB_REC_PER_PAGE)) + mthca_warn(dev, "Kernel UARC page %d not empty\n", i); + + mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1); + + dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + dev->db_tab->page[i].db_rec, + dev->db_tab->page[i].mapping); + } + + kfree(dev->db_tab->page); + kfree(dev->db_tab); +} diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h new file mode 100644 index 0000000..da9b8f9 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_memfree.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_MEMFREE_H +#define MTHCA_MEMFREE_H + +#include +#include + +#define MTHCA_ICM_CHUNK_LEN \ + ((256 - sizeof (struct list_head) - 2 * sizeof (int)) / \ + (sizeof (struct scatterlist))) + +enum { + MTHCA_ICM_PAGE_SHIFT = 12, + MTHCA_ICM_PAGE_SIZE = 1 << MTHCA_ICM_PAGE_SHIFT, + MTHCA_DB_REC_PER_PAGE = MTHCA_ICM_PAGE_SIZE / 8 +}; + +struct mthca_icm_chunk { + struct list_head list; + int npages; + int nsg; + struct scatterlist mem[MTHCA_ICM_CHUNK_LEN]; +}; + +struct mthca_icm { + struct list_head chunk_list; + int refcount; +}; + +struct mthca_icm_table { + u64 virt; + int num_icm; + int num_obj; + int obj_size; + int lowmem; + int coherent; + struct mutex mutex; + struct mthca_icm *icm[0]; +}; + +struct mthca_icm_iter { + struct mthca_icm *icm; + struct mthca_icm_chunk *chunk; + int page_idx; +}; + +struct mthca_dev; + +struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, + gfp_t gfp_mask, int coherent); +void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent); + +struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, + u64 virt, int obj_size, + int nobj, int reserved, + int use_lowmem, int use_coherent); +void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table); +int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); +void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); +void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle); +int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end); +void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table, + int start, int end); + +static inline void mthca_icm_first(struct mthca_icm *icm, + struct mthca_icm_iter *iter) +{ + iter->icm = icm; + iter->chunk = list_empty(&icm->chunk_list) ? 
+ NULL : list_entry(icm->chunk_list.next, + struct mthca_icm_chunk, list); + iter->page_idx = 0; +} + +static inline int mthca_icm_last(struct mthca_icm_iter *iter) +{ + return !iter->chunk; +} + +static inline void mthca_icm_next(struct mthca_icm_iter *iter) +{ + if (++iter->page_idx >= iter->chunk->nsg) { + if (iter->chunk->list.next == &iter->icm->chunk_list) { + iter->chunk = NULL; + return; + } + + iter->chunk = list_entry(iter->chunk->list.next, + struct mthca_icm_chunk, list); + iter->page_idx = 0; + } +} + +static inline dma_addr_t mthca_icm_addr(struct mthca_icm_iter *iter) +{ + return sg_dma_address(&iter->chunk->mem[iter->page_idx]); +} + +static inline unsigned long mthca_icm_size(struct mthca_icm_iter *iter) +{ + return sg_dma_len(&iter->chunk->mem[iter->page_idx]); +} + +struct mthca_db_page { + DECLARE_BITMAP(used, MTHCA_DB_REC_PER_PAGE); + __be64 *db_rec; + dma_addr_t mapping; +}; + +struct mthca_db_table { + int npages; + int max_group1; + int min_group2; + struct mthca_db_page *page; + struct mutex mutex; +}; + +enum mthca_db_type { + MTHCA_DB_TYPE_INVALID = 0x0, + MTHCA_DB_TYPE_CQ_SET_CI = 0x1, + MTHCA_DB_TYPE_CQ_ARM = 0x2, + MTHCA_DB_TYPE_SQ = 0x3, + MTHCA_DB_TYPE_RQ = 0x4, + MTHCA_DB_TYPE_SRQ = 0x5, + MTHCA_DB_TYPE_GROUP_SEP = 0x7 +}; + +struct mthca_user_db_table; +struct mthca_uar; + +int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index, u64 uaddr); +void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index); +struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev); +void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab); + +int mthca_init_db_tab(struct mthca_dev *dev); +void mthca_cleanup_db_tab(struct mthca_dev *dev); +int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type, + u32 qn, __be32 **db); +void mthca_free_db(struct mthca_dev *dev, int type, int db_index); + +#endif /* MTHCA_MEMFREE_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c new file mode 100644 index 0000000..ed9a989 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -0,0 +1,965 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" + +struct mthca_mtt { + struct mthca_buddy *buddy; + int order; + u32 first_seg; +}; + +/* + * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. + */ +struct mthca_mpt_entry { + __be32 flags; + __be32 page_size; + __be32 key; + __be32 pd; + __be64 start; + __be64 length; + __be32 lkey; + __be32 window_count; + __be32 window_count_limit; + __be64 mtt_seg; + __be32 mtt_sz; /* Arbel only */ + u32 reserved[2]; +} __attribute__((packed)); + +#define MTHCA_MPT_FLAG_SW_OWNS (0xfUL << 28) +#define MTHCA_MPT_FLAG_MIO (1 << 17) +#define MTHCA_MPT_FLAG_BIND_ENABLE (1 << 15) +#define MTHCA_MPT_FLAG_PHYSICAL (1 << 9) +#define MTHCA_MPT_FLAG_REGION (1 << 8) + +#define MTHCA_MTT_FLAG_PRESENT 1 + +#define MTHCA_MPT_STATUS_SW 0xF0 +#define MTHCA_MPT_STATUS_HW 0x00 + +#define SINAI_FMR_KEY_INC 0x1000000 + +/* + * Buddy allocator for MTT segments (currently not very efficient + * since it doesn't keep a free list and just searches linearly + * through the bitmaps) + */ + +static u32 mthca_buddy_alloc(struct mthca_buddy *buddy, int order) +{ + int o; + int m; + u32 seg; + + spin_lock(&buddy->lock); + + for (o = order; o <= buddy->max_order; ++o) + if (buddy->num_free[o]) { + m = 1 << (buddy->max_order - o); + seg = find_first_bit(buddy->bits[o], m); + if (seg < m) + goto found; + } + + spin_unlock(&buddy->lock); + return -1; + + found: + clear_bit(seg, buddy->bits[o]); + --buddy->num_free[o]; + + while (o > order) { + --o; + seg <<= 1; + set_bit(seg ^ 1, buddy->bits[o]); + ++buddy->num_free[o]; + } + + spin_unlock(&buddy->lock); + + seg <<= order; + + return seg; +} + +static void mthca_buddy_free(struct mthca_buddy *buddy, u32 seg, int order) +{ + seg >>= order; + + spin_lock(&buddy->lock); + + while (test_bit(seg ^ 1, buddy->bits[order])) { + clear_bit(seg ^ 1, buddy->bits[order]); + --buddy->num_free[order]; + seg >>= 1; + ++order; + } + + set_bit(seg, buddy->bits[order]); + ++buddy->num_free[order]; + + spin_unlock(&buddy->lock); +} + +static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order) +{ + int i, s; + + buddy->max_order = max_order; + spin_lock_init(&buddy->lock); + + buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *), + GFP_KERNEL); + buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free, + GFP_KERNEL); + if (!buddy->bits || !buddy->num_free) + goto err_out; + + for (i = 0; i <= buddy->max_order; ++i) { + s = BITS_TO_LONGS(1 << (buddy->max_order - i)); + buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL); + if (!buddy->bits[i]) + goto err_out_free; + bitmap_zero(buddy->bits[i], + 1 << (buddy->max_order - i)); + } + + set_bit(0, buddy->bits[buddy->max_order]); + buddy->num_free[buddy->max_order] = 1; + + return 0; + +err_out_free: + for (i = 0; i <= buddy->max_order; ++i) + kfree(buddy->bits[i]); + +err_out: + kfree(buddy->bits); + kfree(buddy->num_free); + + return -ENOMEM; +} + +static void mthca_buddy_cleanup(struct mthca_buddy *buddy) +{ + int i; + + for (i = 0; i <= buddy->max_order; ++i) + kfree(buddy->bits[i]); + + kfree(buddy->bits); + kfree(buddy->num_free); +} + +static u32 mthca_alloc_mtt_range(struct mthca_dev *dev, int order, + struct 
mthca_buddy *buddy) +{ + u32 seg = mthca_buddy_alloc(buddy, order); + + if (seg == -1) + return -1; + + if (mthca_is_memfree(dev)) + if (mthca_table_get_range(dev, dev->mr_table.mtt_table, seg, + seg + (1 << order) - 1)) { + mthca_buddy_free(buddy, seg, order); + seg = -1; + } + + return seg; +} + +static struct mthca_mtt *__mthca_alloc_mtt(struct mthca_dev *dev, int size, + struct mthca_buddy *buddy) +{ + struct mthca_mtt *mtt; + int i; + + if (size <= 0) + return ERR_PTR(-EINVAL); + + mtt = kmalloc(sizeof *mtt, GFP_KERNEL); + if (!mtt) + return ERR_PTR(-ENOMEM); + + mtt->buddy = buddy; + mtt->order = 0; + for (i = dev->limits.mtt_seg_size / 8; i < size; i <<= 1) + ++mtt->order; + + mtt->first_seg = mthca_alloc_mtt_range(dev, mtt->order, buddy); + if (mtt->first_seg == -1) { + kfree(mtt); + return ERR_PTR(-ENOMEM); + } + + return mtt; +} + +struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size) +{ + return __mthca_alloc_mtt(dev, size, &dev->mr_table.mtt_buddy); +} + +void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt) +{ + if (!mtt) + return; + + mthca_buddy_free(mtt->buddy, mtt->first_seg, mtt->order); + + mthca_table_put_range(dev, dev->mr_table.mtt_table, + mtt->first_seg, + mtt->first_seg + (1 << mtt->order) - 1); + + kfree(mtt); +} + +static int __mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + struct mthca_mailbox *mailbox; + __be64 *mtt_entry; + int err = 0; + int i; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mtt_entry = mailbox->buf; + + while (list_len > 0) { + mtt_entry[0] = cpu_to_be64(dev->mr_table.mtt_base + + mtt->first_seg * dev->limits.mtt_seg_size + + start_index * 8); + mtt_entry[1] = 0; + for (i = 0; i < list_len && i < MTHCA_MAILBOX_SIZE / 8 - 2; ++i) + mtt_entry[i + 2] = cpu_to_be64(buffer_list[i] | + MTHCA_MTT_FLAG_PRESENT); + + /* + * If we have an odd number of entries to write, add + * one more dummy entry for firmware efficiency. + */ + if (i & 1) + mtt_entry[i + 2] = 0; + + err = mthca_WRITE_MTT(dev, mailbox, (i + 1) & ~1); + if (err) { + mthca_warn(dev, "WRITE_MTT failed (%d)\n", err); + goto out; + } + + list_len -= i; + start_index += i; + buffer_list += i; + } + +out: + mthca_free_mailbox(dev, mailbox); + return err; +} + +int mthca_write_mtt_size(struct mthca_dev *dev) +{ + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy || + !(dev->mthca_flags & MTHCA_FLAG_FMR)) + /* + * Be friendly to WRITE_MTT command + * and leave two empty slots for the + * index and reserved fields of the + * mailbox. + */ + return PAGE_SIZE / sizeof (u64) - 2; + + /* For Arbel, all MTTs must fit in the same page. */ + return mthca_is_memfree(dev) ? (PAGE_SIZE / sizeof (u64)) : 0x7ffffff; +} + +static void mthca_tavor_write_mtt_seg(struct mthca_dev *dev, + struct mthca_mtt *mtt, int start_index, + u64 *buffer_list, int list_len) +{ + u64 __iomem *mtts; + int i; + + mtts = dev->mr_table.tavor_fmr.mtt_base + mtt->first_seg * dev->limits.mtt_seg_size + + start_index * sizeof (u64); + for (i = 0; i < list_len; ++i) + mthca_write64_raw(cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT), + mtts + i); +} + +static void mthca_arbel_write_mtt_seg(struct mthca_dev *dev, + struct mthca_mtt *mtt, int start_index, + u64 *buffer_list, int list_len) +{ + __be64 *mtts; + dma_addr_t dma_handle; + int i; + int s = start_index * sizeof (u64); + + /* For Arbel, all MTTs must fit in the same page. 
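+ * The BUG_ONs below assert this: the chunk must not cross a page
+ * boundary and must start on an MTT segment boundary.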
*/ + BUG_ON(s / PAGE_SIZE != (s + list_len * sizeof(u64) - 1) / PAGE_SIZE); + /* Require full segments */ + BUG_ON(s % dev->limits.mtt_seg_size); + + mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg + + s / dev->limits.mtt_seg_size, &dma_handle); + + BUG_ON(!mtts); + + dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle, + list_len * sizeof (u64), DMA_TO_DEVICE); + + for (i = 0; i < list_len; ++i) + mtts[i] = cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT); + + dma_sync_single_for_device(&dev->pdev->dev, dma_handle, + list_len * sizeof (u64), DMA_TO_DEVICE); +} + +int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + int size = mthca_write_mtt_size(dev); + int chunk; + + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy || + !(dev->mthca_flags & MTHCA_FLAG_FMR)) + return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len); + + while (list_len > 0) { + chunk = min(size, list_len); + if (mthca_is_memfree(dev)) + mthca_arbel_write_mtt_seg(dev, mtt, start_index, + buffer_list, chunk); + else + mthca_tavor_write_mtt_seg(dev, mtt, start_index, + buffer_list, chunk); + + list_len -= chunk; + start_index += chunk; + buffer_list += chunk; + } + + return 0; +} + +static inline u32 tavor_hw_index_to_key(u32 ind) +{ + return ind; +} + +static inline u32 tavor_key_to_hw_index(u32 key) +{ + return key; +} + +static inline u32 arbel_hw_index_to_key(u32 ind) +{ + return (ind >> 24) | (ind << 8); +} + +static inline u32 arbel_key_to_hw_index(u32 key) +{ + return (key << 24) | (key >> 8); +} + +static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind) +{ + if (mthca_is_memfree(dev)) + return arbel_hw_index_to_key(ind); + else + return tavor_hw_index_to_key(ind); +} + +static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key) +{ + if (mthca_is_memfree(dev)) + return arbel_key_to_hw_index(key); + else + return tavor_key_to_hw_index(key); +} + +static inline u32 adjust_key(struct mthca_dev *dev, u32 key) +{ + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + return ((key << 20) & 0x800000) | (key & 0x7fffff); + else + return key; +} + +int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift, + u64 iova, u64 total_size, u32 access, struct mthca_mr *mr) +{ + struct mthca_mailbox *mailbox; + struct mthca_mpt_entry *mpt_entry; + u32 key; + int i; + int err; + + WARN_ON(buffer_size_shift >= 32); + + key = mthca_alloc(&dev->mr_table.mpt_alloc); + if (key == -1) + return -ENOMEM; + key = adjust_key(dev, key); + mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key); + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->mr_table.mpt_table, key); + if (err) + goto err_out_mpt_free; + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_out_table; + } + mpt_entry = mailbox->buf; + + mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS | + MTHCA_MPT_FLAG_MIO | + MTHCA_MPT_FLAG_REGION | + access); + if (!mr->mtt) + mpt_entry->flags |= cpu_to_be32(MTHCA_MPT_FLAG_PHYSICAL); + + mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12); + mpt_entry->key = cpu_to_be32(key); + mpt_entry->pd = cpu_to_be32(pd); + mpt_entry->start = cpu_to_be64(iova); + mpt_entry->length = cpu_to_be64(total_size); + + memset(&mpt_entry->lkey, 0, + sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey)); + + if (mr->mtt) + mpt_entry->mtt_seg = + cpu_to_be64(dev->mr_table.mtt_base + + mr->mtt->first_seg * 
dev->limits.mtt_seg_size); + + if (0) { + mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey); + for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) { + if (i % 4 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i])); + if ((i + 1) % 4 == 0) + printk("\n"); + } + } + + err = mthca_SW2HW_MPT(dev, mailbox, + key & (dev->limits.num_mpts - 1)); + if (err) { + mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err); + goto err_out_mailbox; + } + + mthca_free_mailbox(dev, mailbox); + return err; + +err_out_mailbox: + mthca_free_mailbox(dev, mailbox); + +err_out_table: + mthca_table_put(dev, dev->mr_table.mpt_table, key); + +err_out_mpt_free: + mthca_free(&dev->mr_table.mpt_alloc, key); + return err; +} + +int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_mr *mr) +{ + mr->mtt = NULL; + return mthca_mr_alloc(dev, pd, 12, 0, ~0ULL, access, mr); +} + +int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd, + u64 *buffer_list, int buffer_size_shift, + int list_len, u64 iova, u64 total_size, + u32 access, struct mthca_mr *mr) +{ + int err; + + mr->mtt = mthca_alloc_mtt(dev, list_len); + if (IS_ERR(mr->mtt)) + return PTR_ERR(mr->mtt); + + err = mthca_write_mtt(dev, mr->mtt, 0, buffer_list, list_len); + if (err) { + mthca_free_mtt(dev, mr->mtt); + return err; + } + + err = mthca_mr_alloc(dev, pd, buffer_size_shift, iova, + total_size, access, mr); + if (err) + mthca_free_mtt(dev, mr->mtt); + + return err; +} + +/* Free mr or fmr */ +static void mthca_free_region(struct mthca_dev *dev, u32 lkey) +{ + mthca_table_put(dev, dev->mr_table.mpt_table, + key_to_hw_index(dev, lkey)); + + mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, lkey)); +} + +void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr) +{ + int err; + + err = mthca_HW2SW_MPT(dev, NULL, + key_to_hw_index(dev, mr->ibmr.lkey) & + (dev->limits.num_mpts - 1)); + if (err) + mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err); + + mthca_free_region(dev, mr->ibmr.lkey); + mthca_free_mtt(dev, mr->mtt); +} + +int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, + u32 access, struct mthca_fmr *mr) +{ + struct mthca_mpt_entry *mpt_entry; + struct mthca_mailbox *mailbox; + u64 mtt_seg; + u32 key, idx; + int list_len = mr->attr.max_pages; + int err = -ENOMEM; + int i; + + if (mr->attr.page_shift < 12 || mr->attr.page_shift >= 32) + return -EINVAL; + + /* For Arbel, all MTTs must fit in the same page. 
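+ * That is, max_pages * sizeof(__be64) may not exceed PAGE_SIZE
+ * (at most 512 FMR pages with 4 KB pages), as checked below.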
*/ + if (mthca_is_memfree(dev) && + mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE) + return -EINVAL; + + mr->maps = 0; + + key = mthca_alloc(&dev->mr_table.mpt_alloc); + if (key == -1) + return -ENOMEM; + key = adjust_key(dev, key); + + idx = key & (dev->limits.num_mpts - 1); + mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key); + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->mr_table.mpt_table, key); + if (err) + goto err_out_mpt_free; + + mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL); + BUG_ON(!mr->mem.arbel.mpt); + } else + mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base + + sizeof *(mr->mem.tavor.mpt) * idx; + + mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy); + if (IS_ERR(mr->mtt)) { + err = PTR_ERR(mr->mtt); + goto err_out_table; + } + + mtt_seg = mr->mtt->first_seg * dev->limits.mtt_seg_size; + + if (mthca_is_memfree(dev)) { + mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table, + mr->mtt->first_seg, + &mr->mem.arbel.dma_handle); + BUG_ON(!mr->mem.arbel.mtts); + } else + mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_out_free_mtt; + } + + mpt_entry = mailbox->buf; + + mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS | + MTHCA_MPT_FLAG_MIO | + MTHCA_MPT_FLAG_REGION | + access); + + mpt_entry->page_size = cpu_to_be32(mr->attr.page_shift - 12); + mpt_entry->key = cpu_to_be32(key); + mpt_entry->pd = cpu_to_be32(pd); + memset(&mpt_entry->start, 0, + sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start)); + mpt_entry->mtt_seg = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg); + + if (0) { + mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey); + for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) { + if (i % 4 == 0) + printk("[%02x] ", i * 4); + printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i])); + if ((i + 1) % 4 == 0) + printk("\n"); + } + } + + err = mthca_SW2HW_MPT(dev, mailbox, + key & (dev->limits.num_mpts - 1)); + if (err) { + mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err); + goto err_out_mailbox_free; + } + + mthca_free_mailbox(dev, mailbox); + return 0; + +err_out_mailbox_free: + mthca_free_mailbox(dev, mailbox); + +err_out_free_mtt: + mthca_free_mtt(dev, mr->mtt); + +err_out_table: + mthca_table_put(dev, dev->mr_table.mpt_table, key); + +err_out_mpt_free: + mthca_free(&dev->mr_table.mpt_alloc, key); + return err; +} + +int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr) +{ + if (fmr->maps) + return -EBUSY; + + mthca_free_region(dev, fmr->ibmr.lkey); + mthca_free_mtt(dev, fmr->mtt); + + return 0; +} + +static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list, + int list_len, u64 iova) +{ + int i, page_mask; + + if (list_len > fmr->attr.max_pages) + return -EINVAL; + + page_mask = (1 << fmr->attr.page_shift) - 1; + + /* We are getting page lists, so va must be page aligned. 
*/ + if (iova & page_mask) + return -EINVAL; + + /* Trust the user not to pass misaligned data in page_list */ + if (0) + for (i = 0; i < list_len; ++i) { + if (page_list[i] & ~page_mask) + return -EINVAL; + } + + if (fmr->maps >= fmr->attr.max_maps) + return -EINVAL; + + return 0; +} + + +int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct mthca_fmr *fmr = to_mfmr(ibfmr); + struct mthca_dev *dev = to_mdev(ibfmr->device); + struct mthca_mpt_entry mpt_entry; + u32 key; + int i, err; + + err = mthca_check_fmr(fmr, page_list, list_len, iova); + if (err) + return err; + + ++fmr->maps; + + key = tavor_key_to_hw_index(fmr->ibmr.lkey); + key += dev->limits.num_mpts; + fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key); + + writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt); + + for (i = 0; i < list_len; ++i) { + __be64 mtt_entry = cpu_to_be64(page_list[i] | + MTHCA_MTT_FLAG_PRESENT); + mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i); + } + + mpt_entry.lkey = cpu_to_be32(key); + mpt_entry.length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift)); + mpt_entry.start = cpu_to_be64(iova); + + __raw_writel((__force u32) mpt_entry.lkey, &fmr->mem.tavor.mpt->key); + memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start, + offsetof(struct mthca_mpt_entry, window_count) - + offsetof(struct mthca_mpt_entry, start)); + + writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt); + + return 0; +} + +int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct mthca_fmr *fmr = to_mfmr(ibfmr); + struct mthca_dev *dev = to_mdev(ibfmr->device); + u32 key; + int i, err; + + err = mthca_check_fmr(fmr, page_list, list_len, iova); + if (err) + return err; + + ++fmr->maps; + + key = arbel_key_to_hw_index(fmr->ibmr.lkey); + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + key += SINAI_FMR_KEY_INC; + else + key += dev->limits.num_mpts; + fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key); + + *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW; + + wmb(); + + dma_sync_single_for_cpu(&dev->pdev->dev, fmr->mem.arbel.dma_handle, + list_len * sizeof(u64), DMA_TO_DEVICE); + + for (i = 0; i < list_len; ++i) + fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] | + MTHCA_MTT_FLAG_PRESENT); + + dma_sync_single_for_device(&dev->pdev->dev, fmr->mem.arbel.dma_handle, + list_len * sizeof(u64), DMA_TO_DEVICE); + + fmr->mem.arbel.mpt->key = cpu_to_be32(key); + fmr->mem.arbel.mpt->lkey = cpu_to_be32(key); + fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift)); + fmr->mem.arbel.mpt->start = cpu_to_be64(iova); + + wmb(); + + *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_HW; + + wmb(); + + return 0; +} + +void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) +{ + if (!fmr->maps) + return; + + fmr->maps = 0; + + writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt); +} + +void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) +{ + if (!fmr->maps) + return; + + fmr->maps = 0; + + *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW; +} + +int mthca_init_mr_table(struct mthca_dev *dev) +{ + phys_addr_t addr; + int mpts, mtts, err, i; + + err = mthca_alloc_init(&dev->mr_table.mpt_alloc, + dev->limits.num_mpts, + ~0, dev->limits.reserved_mrws); + if (err) + return err; + + if (!mthca_is_memfree(dev) && + (dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) + dev->limits.fmr_reserved_mtts = 0; + else + dev->mthca_flags |= MTHCA_FLAG_FMR; + + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) 
+ mthca_dbg(dev, "Memory key throughput optimization activated.\n"); + + err = mthca_buddy_init(&dev->mr_table.mtt_buddy, + fls(dev->limits.num_mtt_segs - 1)); + + if (err) + goto err_mtt_buddy; + + dev->mr_table.tavor_fmr.mpt_base = NULL; + dev->mr_table.tavor_fmr.mtt_base = NULL; + + if (dev->limits.fmr_reserved_mtts) { + i = fls(dev->limits.fmr_reserved_mtts - 1); + + if (i >= 31) { + mthca_warn(dev, "Unable to reserve 2^31 FMR MTTs.\n"); + err = -EINVAL; + goto err_fmr_mpt; + } + mpts = mtts = 1 << i; + } else { + mtts = dev->limits.num_mtt_segs; + mpts = dev->limits.num_mpts; + } + + if (!mthca_is_memfree(dev) && + (dev->mthca_flags & MTHCA_FLAG_FMR)) { + + addr = pci_resource_start(dev->pdev, 4) + + ((pci_resource_len(dev->pdev, 4) - 1) & + dev->mr_table.mpt_base); + + dev->mr_table.tavor_fmr.mpt_base = + ioremap(addr, mpts * sizeof(struct mthca_mpt_entry)); + + if (!dev->mr_table.tavor_fmr.mpt_base) { + mthca_warn(dev, "MPT ioremap for FMR failed.\n"); + err = -ENOMEM; + goto err_fmr_mpt; + } + + addr = pci_resource_start(dev->pdev, 4) + + ((pci_resource_len(dev->pdev, 4) - 1) & + dev->mr_table.mtt_base); + + dev->mr_table.tavor_fmr.mtt_base = + ioremap(addr, mtts * dev->limits.mtt_seg_size); + if (!dev->mr_table.tavor_fmr.mtt_base) { + mthca_warn(dev, "MTT ioremap for FMR failed.\n"); + err = -ENOMEM; + goto err_fmr_mtt; + } + } + + if (dev->limits.fmr_reserved_mtts) { + err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, fls(mtts - 1)); + if (err) + goto err_fmr_mtt_buddy; + + /* Prevent regular MRs from using FMR keys */ + err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, fls(mtts - 1)); + if (err) + goto err_reserve_fmr; + + dev->mr_table.fmr_mtt_buddy = + &dev->mr_table.tavor_fmr.mtt_buddy; + } else + dev->mr_table.fmr_mtt_buddy = &dev->mr_table.mtt_buddy; + + /* FMR table is always the first, take reserved MTTs out of there */ + if (dev->limits.reserved_mtts) { + i = fls(dev->limits.reserved_mtts - 1); + + if (mthca_alloc_mtt_range(dev, i, + dev->mr_table.fmr_mtt_buddy) == -1) { + mthca_warn(dev, "MTT table of order %d is too small.\n", + dev->mr_table.fmr_mtt_buddy->max_order); + err = -ENOMEM; + goto err_reserve_mtts; + } + } + + return 0; + +err_reserve_mtts: +err_reserve_fmr: + if (dev->limits.fmr_reserved_mtts) + mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy); + +err_fmr_mtt_buddy: + if (dev->mr_table.tavor_fmr.mtt_base) + iounmap(dev->mr_table.tavor_fmr.mtt_base); + +err_fmr_mtt: + if (dev->mr_table.tavor_fmr.mpt_base) + iounmap(dev->mr_table.tavor_fmr.mpt_base); + +err_fmr_mpt: + mthca_buddy_cleanup(&dev->mr_table.mtt_buddy); + +err_mtt_buddy: + mthca_alloc_cleanup(&dev->mr_table.mpt_alloc); + + return err; +} + +void mthca_cleanup_mr_table(struct mthca_dev *dev) +{ + /* XXX check if any MRs are still allocated? */ + if (dev->limits.fmr_reserved_mtts) + mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy); + + mthca_buddy_cleanup(&dev->mr_table.mtt_buddy); + + if (dev->mr_table.tavor_fmr.mtt_base) + iounmap(dev->mr_table.tavor_fmr.mtt_base); + if (dev->mr_table.tavor_fmr.mpt_base) + iounmap(dev->mr_table.tavor_fmr.mpt_base); + + mthca_alloc_cleanup(&dev->mr_table.mpt_alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_pd.c b/drivers/infiniband/hw/mthca/mthca_pd.c new file mode 100644 index 0000000..266f14e --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_pd.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. 
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mthca_dev.h" + +int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd) +{ + int err = 0; + + pd->privileged = privileged; + + atomic_set(&pd->sqp_count, 0); + pd->pd_num = mthca_alloc(&dev->pd_table.alloc); + if (pd->pd_num == -1) + return -ENOMEM; + + if (privileged) { + err = mthca_mr_alloc_notrans(dev, pd->pd_num, + MTHCA_MPT_FLAG_LOCAL_READ | + MTHCA_MPT_FLAG_LOCAL_WRITE, + &pd->ntmr); + if (err) + mthca_free(&dev->pd_table.alloc, pd->pd_num); + } + + return err; +} + +void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd) +{ + if (pd->privileged) + mthca_free_mr(dev, &pd->ntmr); + mthca_free(&dev->pd_table.alloc, pd->pd_num); +} + +int mthca_init_pd_table(struct mthca_dev *dev) +{ + return mthca_alloc_init(&dev->pd_table.alloc, + dev->limits.num_pds, + (1 << 24) - 1, + dev->limits.reserved_pds); +} + +void mthca_cleanup_pd_table(struct mthca_dev *dev) +{ + /* XXX check if any PDs are still allocated? */ + mthca_alloc_cleanup(&dev->pd_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c new file mode 100644 index 0000000..15d0644 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_profile.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_profile.h" + +enum { + MTHCA_RES_QP, + MTHCA_RES_EEC, + MTHCA_RES_SRQ, + MTHCA_RES_CQ, + MTHCA_RES_EQP, + MTHCA_RES_EEEC, + MTHCA_RES_EQ, + MTHCA_RES_RDB, + MTHCA_RES_MCG, + MTHCA_RES_MPT, + MTHCA_RES_MTT, + MTHCA_RES_UAR, + MTHCA_RES_UDAV, + MTHCA_RES_UARC, + MTHCA_RES_NUM +}; + +enum { + MTHCA_NUM_EQS = 32, + MTHCA_NUM_PDS = 1 << 15 +}; + +s64 mthca_make_profile(struct mthca_dev *dev, + struct mthca_profile *request, + struct mthca_dev_lim *dev_lim, + struct mthca_init_hca_param *init_hca) +{ + struct mthca_resource { + u64 size; + u64 start; + int type; + int num; + int log_num; + }; + + u64 mem_base, mem_avail; + s64 total_size = 0; + struct mthca_resource *profile; + int i, j; + + profile = kzalloc(MTHCA_RES_NUM * sizeof *profile, GFP_KERNEL); + if (!profile) + return -ENOMEM; + + profile[MTHCA_RES_QP].size = dev_lim->qpc_entry_sz; + profile[MTHCA_RES_EEC].size = dev_lim->eec_entry_sz; + profile[MTHCA_RES_SRQ].size = dev_lim->srq_entry_sz; + profile[MTHCA_RES_CQ].size = dev_lim->cqc_entry_sz; + profile[MTHCA_RES_EQP].size = dev_lim->eqpc_entry_sz; + profile[MTHCA_RES_EEEC].size = dev_lim->eeec_entry_sz; + profile[MTHCA_RES_EQ].size = dev_lim->eqc_entry_sz; + profile[MTHCA_RES_RDB].size = MTHCA_RDB_ENTRY_SIZE; + profile[MTHCA_RES_MCG].size = MTHCA_MGM_ENTRY_SIZE; + profile[MTHCA_RES_MPT].size = dev_lim->mpt_entry_sz; + profile[MTHCA_RES_MTT].size = dev->limits.mtt_seg_size; + profile[MTHCA_RES_UAR].size = dev_lim->uar_scratch_entry_sz; + profile[MTHCA_RES_UDAV].size = MTHCA_AV_SIZE; + profile[MTHCA_RES_UARC].size = request->uarc_size; + + profile[MTHCA_RES_QP].num = request->num_qp; + profile[MTHCA_RES_SRQ].num = request->num_srq; + profile[MTHCA_RES_EQP].num = request->num_qp; + profile[MTHCA_RES_RDB].num = request->num_qp * request->rdb_per_qp; + profile[MTHCA_RES_CQ].num = request->num_cq; + profile[MTHCA_RES_EQ].num = MTHCA_NUM_EQS; + profile[MTHCA_RES_MCG].num = request->num_mcg; + profile[MTHCA_RES_MPT].num = request->num_mpt; + profile[MTHCA_RES_MTT].num = request->num_mtt; + profile[MTHCA_RES_UAR].num = request->num_uar; + profile[MTHCA_RES_UARC].num = request->num_uar; + profile[MTHCA_RES_UDAV].num = request->num_udav; + + for (i = 0; i < MTHCA_RES_NUM; ++i) { + profile[i].type = i; + profile[i].log_num = max(ffs(profile[i].num) - 1, 0); + profile[i].size *= profile[i].num; + if (mthca_is_memfree(dev)) + profile[i].size = max(profile[i].size, (u64) PAGE_SIZE); + } + + if (mthca_is_memfree(dev)) { + mem_base = 0; + mem_avail = dev_lim->hca.arbel.max_icm_sz; + } else { + mem_base = dev->ddr_start; + mem_avail = dev->fw.tavor.fw_start - dev->ddr_start; + } + + /* + * Sort the resources in decreasing order of size. 
Since they + * all have sizes that are powers of 2, we'll be able to keep + * resources aligned to their size and pack them without gaps + * using the sorted order. + */ + for (i = MTHCA_RES_NUM; i > 0; --i) + for (j = 1; j < i; ++j) { + if (profile[j].size > profile[j - 1].size) + swap(profile[j], profile[j - 1]); + } + + for (i = 0; i < MTHCA_RES_NUM; ++i) { + if (profile[i].size) { + profile[i].start = mem_base + total_size; + total_size += profile[i].size; + } + if (total_size > mem_avail) { + mthca_err(dev, "Profile requires 0x%llx bytes; " + "won't fit in 0x%llx bytes of context memory.\n", + (unsigned long long) total_size, + (unsigned long long) mem_avail); + kfree(profile); + return -ENOMEM; + } + + if (profile[i].size) + mthca_dbg(dev, "profile[%2d]--%2d/%2d @ 0x%16llx " + "(size 0x%8llx)\n", + i, profile[i].type, profile[i].log_num, + (unsigned long long) profile[i].start, + (unsigned long long) profile[i].size); + } + + if (mthca_is_memfree(dev)) + mthca_dbg(dev, "HCA context memory: reserving %d KB\n", + (int) (total_size >> 10)); + else + mthca_dbg(dev, "HCA memory: allocated %d KB/%d KB (%d KB free)\n", + (int) (total_size >> 10), (int) (mem_avail >> 10), + (int) ((mem_avail - total_size) >> 10)); + + for (i = 0; i < MTHCA_RES_NUM; ++i) { + switch (profile[i].type) { + case MTHCA_RES_QP: + dev->limits.num_qps = profile[i].num; + init_hca->qpc_base = profile[i].start; + init_hca->log_num_qps = profile[i].log_num; + break; + case MTHCA_RES_EEC: + dev->limits.num_eecs = profile[i].num; + init_hca->eec_base = profile[i].start; + init_hca->log_num_eecs = profile[i].log_num; + break; + case MTHCA_RES_SRQ: + dev->limits.num_srqs = profile[i].num; + init_hca->srqc_base = profile[i].start; + init_hca->log_num_srqs = profile[i].log_num; + break; + case MTHCA_RES_CQ: + dev->limits.num_cqs = profile[i].num; + init_hca->cqc_base = profile[i].start; + init_hca->log_num_cqs = profile[i].log_num; + break; + case MTHCA_RES_EQP: + init_hca->eqpc_base = profile[i].start; + break; + case MTHCA_RES_EEEC: + init_hca->eeec_base = profile[i].start; + break; + case MTHCA_RES_EQ: + dev->limits.num_eqs = profile[i].num; + init_hca->eqc_base = profile[i].start; + init_hca->log_num_eqs = profile[i].log_num; + break; + case MTHCA_RES_RDB: + for (dev->qp_table.rdb_shift = 0; + request->num_qp << dev->qp_table.rdb_shift < profile[i].num; + ++dev->qp_table.rdb_shift) + ; /* nothing */ + dev->qp_table.rdb_base = (u32) profile[i].start; + init_hca->rdb_base = profile[i].start; + break; + case MTHCA_RES_MCG: + dev->limits.num_mgms = profile[i].num >> 1; + dev->limits.num_amgms = profile[i].num >> 1; + init_hca->mc_base = profile[i].start; + init_hca->log_mc_entry_sz = ffs(MTHCA_MGM_ENTRY_SIZE) - 1; + init_hca->log_mc_table_sz = profile[i].log_num; + init_hca->mc_hash_sz = 1 << (profile[i].log_num - 1); + break; + case MTHCA_RES_MPT: + dev->limits.num_mpts = profile[i].num; + dev->mr_table.mpt_base = profile[i].start; + init_hca->mpt_base = profile[i].start; + init_hca->log_mpt_sz = profile[i].log_num; + break; + case MTHCA_RES_MTT: + dev->limits.num_mtt_segs = profile[i].num; + dev->mr_table.mtt_base = profile[i].start; + init_hca->mtt_base = profile[i].start; + init_hca->mtt_seg_sz = ffs(dev->limits.mtt_seg_size) - 7; + break; + case MTHCA_RES_UAR: + dev->limits.num_uars = profile[i].num; + init_hca->uar_scratch_base = profile[i].start; + break; + case MTHCA_RES_UDAV: + dev->av_table.ddr_av_base = profile[i].start; + dev->av_table.num_ddr_avs = profile[i].num; + break; + case MTHCA_RES_UARC: + 
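+ /*
+  * Arithmetic note (assuming uarc_size and num_uar are powers of
+  * two, as elsewhere in this profile): ffs(1 << n) == n + 1, so
+  * log_uarc_sz below works out to log2(uarc_size / 4096), i.e.
+  * presumably the per-UAR context size in 4 KB units, and
+  * log_uar_sz is simply log2(num_uar).
+  */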
dev->uar_table.uarc_size = request->uarc_size; + dev->uar_table.uarc_base = profile[i].start; + init_hca->uarc_base = profile[i].start; + init_hca->log_uarc_sz = ffs(request->uarc_size) - 13; + init_hca->log_uar_sz = ffs(request->num_uar) - 1; + break; + default: + break; + } + } + + /* + * PDs don't take any HCA memory, but we assign them as part + * of the HCA profile anyway. + */ + dev->limits.num_pds = MTHCA_NUM_PDS; + + if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT && + init_hca->log_mpt_sz > 23) { + mthca_warn(dev, "MPT table too large (requested size 2^%d >= 2^24)\n", + init_hca->log_mpt_sz); + mthca_warn(dev, "Disabling memory key throughput optimization.\n"); + dev->mthca_flags &= ~MTHCA_FLAG_SINAI_OPT; + } + + /* + * For Tavor, FMRs use ioremapped PCI memory. For 32 bit + * systems it may use too much vmalloc space to map all MTT + * memory, so we reserve some MTTs for FMR access, taking them + * out of the MR pool. They don't use additional memory, but + * we assign them as part of the HCA profile anyway. + */ + if (mthca_is_memfree(dev) || BITS_PER_LONG == 64) + dev->limits.fmr_reserved_mtts = 0; + else + dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts; + + kfree(profile); + return total_size; +} diff --git a/drivers/infiniband/hw/mthca/mthca_profile.h b/drivers/infiniband/hw/mthca/mthca_profile.h new file mode 100644 index 0000000..62b009c --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_profile.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MTHCA_PROFILE_H +#define MTHCA_PROFILE_H + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +struct mthca_profile { + int num_qp; + int rdb_per_qp; + int num_srq; + int num_cq; + int num_mcg; + int num_mpt; + int num_mtt; + int num_udav; + int num_uar; + int uarc_size; + int fmr_reserved_mtts; +}; + +s64 mthca_make_profile(struct mthca_dev *mdev, + struct mthca_profile *request, + struct mthca_dev_lim *dev_lim, + struct mthca_init_hca_param *init_hca); + +#endif /* MTHCA_PROFILE_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c new file mode 100644 index 0000000..541f237 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -0,0 +1,1321 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include +#include "mthca_memfree.h" + +static void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +static int mthca_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + struct mthca_dev *mdev = to_mdev(ibdev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + memset(props, 0, sizeof *props); + + props->fw_ver = mdev->fw_ver; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mthca_MAD_IFC(mdev, 1, 1, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + props->device_cap_flags = mdev->device_cap_flags; + props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & + 0xffffff; + props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30)); + props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&props->sys_image_guid, out_mad->data + 4, 8); + + props->max_mr_size = ~0ull; + props->page_size_cap = mdev->limits.page_size_cap; + props->max_qp = mdev->limits.num_qps - mdev->limits.reserved_qps; + props->max_qp_wr = mdev->limits.max_wqes; + props->max_sge = mdev->limits.max_sg; + props->max_sge_rd = props->max_sge; + props->max_cq = mdev->limits.num_cqs - mdev->limits.reserved_cqs; + props->max_cqe = mdev->limits.max_cqes; + props->max_mr = mdev->limits.num_mpts - mdev->limits.reserved_mrws; + props->max_pd = mdev->limits.num_pds - mdev->limits.reserved_pds; + props->max_qp_rd_atom = 1 << mdev->qp_table.rdb_shift; + props->max_qp_init_rd_atom = mdev->limits.max_qp_init_rdma; + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq = mdev->limits.num_srqs - mdev->limits.reserved_srqs; + props->max_srq_wr = mdev->limits.max_srq_wqes; + props->max_srq_sge = mdev->limits.max_srq_sge; + props->local_ca_ack_delay = mdev->limits.local_ca_ack_delay; + props->atomic_cap = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ? + IB_ATOMIC_HCA : IB_ATOMIC_NONE; + props->max_pkeys = mdev->limits.pkey_table_len; + props->max_mcast_grp = mdev->limits.num_mgms + mdev->limits.num_amgms; + props->max_mcast_qp_attach = MTHCA_QP_PER_MGM; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + /* + * If Sinai memory key optimization is being used, then only + * the 8-bit key portion will change. For other HCAs, the + * unused index bits will also be used for FMR remapping. 
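+ * As a worked example (illustrative numbers, not taken from the
+ * source): with the optimization enabled only the low 8 key bits
+ * cycle, so at most 2^8 - 1 = 255 remaps are possible; without it,
+ * e.g. 2^17 MPTs leave 32 - ilog2(num_mpts) = 15 spare key bits,
+ * and the else branch below yields (1 << 15) - 1 = 32767 maps per FMR.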
+ */ + if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + props->max_map_per_fmr = 255; + else + props->max_map_per_fmr = + (1 << (32 - ilog2(mdev->limits.num_mpts))) - 1; + + err = 0; + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mthca_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + /* props being zeroed by the caller, avoid zeroing it here */ + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); + props->gid_tbl_len = to_mdev(ibdev)->limits.gid_table_len; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = to_mdev(ibdev)->limits.pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mthca_modify_device(struct ib_device *ibdev, + int mask, + struct ib_device_modify *props) +{ + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex)) + return -ERESTARTSYS; + memcpy(ibdev->node_desc, props->node_desc, + IB_DEVICE_NODE_DESC_MAX); + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + } + + return 0; +} + +static int mthca_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + struct mthca_set_ib_param set_ib; + struct ib_port_attr attr; + int err; + + if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex)) + return -ERESTARTSYS; + + err = ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + set_ib.set_si_guid = 0; + set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR); + + set_ib.cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mthca_SET_IB(to_mdev(ibdev), &set_ib, port); + if (err) + goto out; +out: + mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); + return err; +} + +static int mthca_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = 
mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); + + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mthca_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, + port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + + out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct mthca_alloc_ucontext_resp uresp; + struct mthca_ucontext *context; + int err; + + if (!(to_mdev(ibdev)->active)) + return ERR_PTR(-EAGAIN); + + memset(&uresp, 0, sizeof uresp); + + uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps; + if (mthca_is_memfree(to_mdev(ibdev))) + uresp.uarc_size = to_mdev(ibdev)->uar_table.uarc_size; + else + uresp.uarc_size = 0; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + err = mthca_uar_alloc(to_mdev(ibdev), &context->uar); + if (err) { + kfree(context); + return ERR_PTR(err); + } + + context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev)); + if (IS_ERR(context->db_tab)) { + err = PTR_ERR(context->db_tab); + mthca_uar_free(to_mdev(ibdev), &context->uar); + kfree(context); + return ERR_PTR(err); + } + + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab); + mthca_uar_free(to_mdev(ibdev), &context->uar); + kfree(context); + return ERR_PTR(-EFAULT); + } + + context->reg_mr_warned = 0; + + return &context->ibucontext; +} + +static int mthca_dealloc_ucontext(struct ib_ucontext *context) +{ + mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab); + mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar); + kfree(to_mucontext(context)); + + return 0; +} + +static int mthca_mmap_uar(struct ib_ucontext *context, + struct vm_area_struct *vma) +{ + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (io_remap_pfn_range(vma, vma->vm_start, + to_mucontext(context)->uar.pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mthca_pd *pd; + int err; + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = mthca_pd_alloc(to_mdev(ibdev), !context, pd); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) { + mthca_pd_free(to_mdev(ibdev), pd); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } + + 
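+ /*
+  * Note: kernel consumers call this with context == NULL, so their
+  * PDs are created privileged and mthca_pd_alloc() also sets up the
+  * PD's no-translation MR (ntmr) via mthca_mr_alloc_notrans();
+  * userspace PDs only get pd_num copied back above.
+  */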
return &pd->ibpd; +} + +static int mthca_dealloc_pd(struct ib_pd *pd) +{ + mthca_pd_free(to_mdev(pd->device), to_mpd(pd)); + kfree(pd); + + return 0; +} + +static struct ib_ah *mthca_ah_create(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) + +{ + int err; + struct mthca_ah *ah; + + ah = kmalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah); + if (err) { + kfree(ah); + return ERR_PTR(err); + } + + return &ah->ibah; +} + +static int mthca_ah_destroy(struct ib_ah *ah) +{ + mthca_destroy_ah(to_mdev(ah->device), to_mah(ah)); + kfree(ah); + + return 0; +} + +static struct ib_srq *mthca_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mthca_create_srq ucmd; + struct mthca_ucontext *context = NULL; + struct mthca_srq *srq; + int err; + + if (init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); + + srq = kmalloc(sizeof *srq, GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + if (pd->uobject) { + context = to_mucontext(pd->uobject->context); + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err_free; + } + + err = mthca_map_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, ucmd.db_index, + ucmd.db_page); + + if (err) + goto err_free; + + srq->mr.ibmr.lkey = ucmd.lkey; + srq->db_index = ucmd.db_index; + } + + err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd), + &init_attr->attr, srq); + + if (err && pd->uobject) + mthca_unmap_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, ucmd.db_index); + + if (err) + goto err_free; + + if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof (__u32))) { + mthca_free_srq(to_mdev(pd->device), srq); + err = -EFAULT; + goto err_free; + } + + return &srq->ibsrq; + +err_free: + kfree(srq); + + return ERR_PTR(err); +} + +static int mthca_destroy_srq(struct ib_srq *srq) +{ + struct mthca_ucontext *context; + + if (srq->uobject) { + context = to_mucontext(srq->uobject->context); + + mthca_unmap_user_db(to_mdev(srq->device), &context->uar, + context->db_tab, to_msrq(srq)->db_index); + } + + mthca_free_srq(to_mdev(srq->device), to_msrq(srq)); + kfree(srq); + + return 0; +} + +static struct ib_qp *mthca_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mthca_create_qp ucmd; + struct mthca_qp *qp; + int err; + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + switch (init_attr->qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + { + struct mthca_ucontext *context; + + qp = kmalloc(sizeof *qp, GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + if (pd->uobject) { + context = to_mucontext(pd->uobject->context); + + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + kfree(qp); + return ERR_PTR(-EFAULT); + } + + err = mthca_map_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, + ucmd.sq_db_index, ucmd.sq_db_page); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + err = mthca_map_user_db(to_mdev(pd->device), &context->uar, + context->db_tab, + ucmd.rq_db_index, ucmd.rq_db_page); + if (err) { + mthca_unmap_user_db(to_mdev(pd->device), + &context->uar, + context->db_tab, + ucmd.sq_db_index); + kfree(qp); + return ERR_PTR(err); + } + + qp->mr.ibmr.lkey = ucmd.lkey; + qp->sq.db_index = ucmd.sq_db_index; + qp->rq.db_index = ucmd.rq_db_index; + } + + err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd), + to_mcq(init_attr->send_cq), 
+ to_mcq(init_attr->recv_cq), + init_attr->qp_type, init_attr->sq_sig_type, + &init_attr->cap, qp); + + if (err && pd->uobject) { + context = to_mucontext(pd->uobject->context); + + mthca_unmap_user_db(to_mdev(pd->device), + &context->uar, + context->db_tab, + ucmd.sq_db_index); + mthca_unmap_user_db(to_mdev(pd->device), + &context->uar, + context->db_tab, + ucmd.rq_db_index); + } + + qp->ibqp.qp_num = qp->qpn; + break; + } + case IB_QPT_SMI: + case IB_QPT_GSI: + { + /* Don't allow userspace to create special QPs */ + if (pd->uobject) + return ERR_PTR(-EINVAL); + + qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; + + err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd), + to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq), + init_attr->sq_sig_type, &init_attr->cap, + qp->ibqp.qp_num, init_attr->port_num, + to_msqp(qp)); + break; + } + default: + /* Don't support raw QPs */ + return ERR_PTR(-ENOSYS); + } + + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + init_attr->cap.max_send_wr = qp->sq.max; + init_attr->cap.max_recv_wr = qp->rq.max; + init_attr->cap.max_send_sge = qp->sq.max_gs; + init_attr->cap.max_recv_sge = qp->rq.max_gs; + init_attr->cap.max_inline_data = qp->max_inline_data; + + return &qp->ibqp; +} + +static int mthca_destroy_qp(struct ib_qp *qp) +{ + if (qp->uobject) { + mthca_unmap_user_db(to_mdev(qp->device), + &to_mucontext(qp->uobject->context)->uar, + to_mucontext(qp->uobject->context)->db_tab, + to_mqp(qp)->sq.db_index); + mthca_unmap_user_db(to_mdev(qp->device), + &to_mucontext(qp->uobject->context)->uar, + to_mucontext(qp->uobject->context)->db_tab, + to_mqp(qp)->rq.db_index); + } + mthca_free_qp(to_mdev(qp->device), to_mqp(qp)); + kfree(qp); + return 0; +} + +static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + struct mthca_create_cq ucmd; + struct mthca_cq *cq; + int nent; + int err; + + if (attr->flags) + return ERR_PTR(-EINVAL); + + if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes) + return ERR_PTR(-EINVAL); + + if (context) { + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return ERR_PTR(-EFAULT); + + err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, + ucmd.set_db_index, ucmd.set_db_page); + if (err) + return ERR_PTR(err); + + err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, + ucmd.arm_db_index, ucmd.arm_db_page); + if (err) + goto err_unmap_set; + } + + cq = kmalloc(sizeof *cq, GFP_KERNEL); + if (!cq) { + err = -ENOMEM; + goto err_unmap_arm; + } + + if (context) { + cq->buf.mr.ibmr.lkey = ucmd.lkey; + cq->set_ci_db_index = ucmd.set_db_index; + cq->arm_db_index = ucmd.arm_db_index; + } + + for (nent = 1; nent <= entries; nent <<= 1) + ; /* nothing */ + + err = mthca_init_cq(to_mdev(ibdev), nent, + context ? to_mucontext(context) : NULL, + context ? 
ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num, + cq); + if (err) + goto err_free; + + if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) { + mthca_free_cq(to_mdev(ibdev), cq); + err = -EFAULT; + goto err_free; + } + + cq->resize_buf = NULL; + + return &cq->ibcq; + +err_free: + kfree(cq); + +err_unmap_arm: + if (context) + mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, ucmd.arm_db_index); + +err_unmap_set: + if (context) + mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar, + to_mucontext(context)->db_tab, ucmd.set_db_index); + + return ERR_PTR(err); +} + +static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq, + int entries) +{ + int ret; + + spin_lock_irq(&cq->lock); + if (cq->resize_buf) { + ret = -EBUSY; + goto unlock; + } + + cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC); + if (!cq->resize_buf) { + ret = -ENOMEM; + goto unlock; + } + + cq->resize_buf->state = CQ_RESIZE_ALLOC; + + ret = 0; + +unlock: + spin_unlock_irq(&cq->lock); + + if (ret) + return ret; + + ret = mthca_alloc_cq_buf(dev, &cq->resize_buf->buf, entries); + if (ret) { + spin_lock_irq(&cq->lock); + kfree(cq->resize_buf); + cq->resize_buf = NULL; + spin_unlock_irq(&cq->lock); + return ret; + } + + cq->resize_buf->cqe = entries - 1; + + spin_lock_irq(&cq->lock); + cq->resize_buf->state = CQ_RESIZE_READY; + spin_unlock_irq(&cq->lock); + + return 0; +} + +static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(ibcq->device); + struct mthca_cq *cq = to_mcq(ibcq); + struct mthca_resize_cq ucmd; + u32 lkey; + int ret; + + if (entries < 1 || entries > dev->limits.max_cqes) + return -EINVAL; + + mutex_lock(&cq->mutex); + + entries = roundup_pow_of_two(entries + 1); + if (entries == ibcq->cqe + 1) { + ret = 0; + goto out; + } + + if (cq->is_kernel) { + ret = mthca_alloc_resize_buf(dev, cq, entries); + if (ret) + goto out; + lkey = cq->resize_buf->buf.mr.ibmr.lkey; + } else { + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + ret = -EFAULT; + goto out; + } + lkey = ucmd.lkey; + } + + ret = mthca_RESIZE_CQ(dev, cq->cqn, lkey, ilog2(entries)); + + if (ret) { + if (cq->resize_buf) { + mthca_free_cq_buf(dev, &cq->resize_buf->buf, + cq->resize_buf->cqe); + kfree(cq->resize_buf); + spin_lock_irq(&cq->lock); + cq->resize_buf = NULL; + spin_unlock_irq(&cq->lock); + } + goto out; + } + + if (cq->is_kernel) { + struct mthca_cq_buf tbuf; + int tcqe; + + spin_lock_irq(&cq->lock); + if (cq->resize_buf->state == CQ_RESIZE_READY) { + mthca_cq_resize_copy_cqes(cq); + tbuf = cq->buf; + tcqe = cq->ibcq.cqe; + cq->buf = cq->resize_buf->buf; + cq->ibcq.cqe = cq->resize_buf->cqe; + } else { + tbuf = cq->resize_buf->buf; + tcqe = cq->resize_buf->cqe; + } + + kfree(cq->resize_buf); + cq->resize_buf = NULL; + spin_unlock_irq(&cq->lock); + + mthca_free_cq_buf(dev, &tbuf, tcqe); + } else + ibcq->cqe = entries - 1; + +out: + mutex_unlock(&cq->mutex); + + return ret; +} + +static int mthca_destroy_cq(struct ib_cq *cq) +{ + if (cq->uobject) { + mthca_unmap_user_db(to_mdev(cq->device), + &to_mucontext(cq->uobject->context)->uar, + to_mucontext(cq->uobject->context)->db_tab, + to_mcq(cq)->arm_db_index); + mthca_unmap_user_db(to_mdev(cq->device), + &to_mucontext(cq->uobject->context)->uar, + to_mucontext(cq->uobject->context)->db_tab, + to_mcq(cq)->set_ci_db_index); + } + mthca_free_cq(to_mdev(cq->device), to_mcq(cq)); + kfree(cq); + + return 0; +} + +static inline u32 
convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MTHCA_MPT_FLAG_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MTHCA_MPT_FLAG_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0) | + MTHCA_MPT_FLAG_LOCAL_READ; +} + +static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mthca_mr *mr; + int err; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mthca_mr_alloc_notrans(to_mdev(pd->device), + to_mpd(pd)->pd_num, + convert_access(acc), mr); + + if (err) { + kfree(mr); + return ERR_PTR(err); + } + + mr->umem = NULL; + + return &mr->ibmr; +} + +static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(pd->device); + struct scatterlist *sg; + struct mthca_mr *mr; + struct mthca_reg_mr ucmd; + u64 *pages; + int shift, n, len; + int i, k, entry; + int err = 0; + int write_mtt_size; + + if (udata->inlen < sizeof ucmd) { + if (!to_mucontext(pd->uobject->context)->reg_mr_warned) { + mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n", + current->comm); + mthca_warn(dev, " Update libmthca to fix this.\n"); + } + ++to_mucontext(pd->uobject->context)->reg_mr_warned; + ucmd.mr_attrs = 0; + } else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return ERR_PTR(-EFAULT); + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, + ucmd.mr_attrs & MTHCA_MR_DMASYNC); + + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + goto err; + } + + shift = mr->umem->page_shift; + n = mr->umem->nmap; + + mr->mtt = mthca_alloc_mtt(dev, n); + if (IS_ERR(mr->mtt)) { + err = PTR_ERR(mr->mtt); + goto err_umem; + } + + pages = (u64 *) __get_free_page(GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err_mtt; + } + + i = n = 0; + + write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); + + for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = sg_dma_address(sg) + (k << shift); + /* + * Be friendly to write_mtt and pass it chunks + * of appropriate size. 
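+ * For instance, with 4 KB pages the scratch page allocated above
+ * holds at most PAGE_SIZE / sizeof(u64) = 512 addresses, so each
+ * mthca_write_mtt() call below is handed no more than 512 entries.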
+ */ + if (i == write_mtt_size) { + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); + if (err) + goto mtt_done; + n += i; + i = 0; + } + } + } + + if (i) + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); +mtt_done: + free_page((unsigned long) pages); + if (err) + goto err_mtt; + + err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length, + convert_access(acc), mr); + + if (err) + goto err_mtt; + + return &mr->ibmr; + +err_mtt: + mthca_free_mtt(dev, mr->mtt); + +err_umem: + ib_umem_release(mr->umem); + +err: + kfree(mr); + return ERR_PTR(err); +} + +static int mthca_dereg_mr(struct ib_mr *mr) +{ + struct mthca_mr *mmr = to_mmr(mr); + + mthca_free_mr(to_mdev(mr->device), mmr); + if (mmr->umem) + ib_umem_release(mmr->umem); + kfree(mmr); + + return 0; +} + +static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct mthca_fmr *fmr; + int err; + + fmr = kmalloc(sizeof *fmr, GFP_KERNEL); + if (!fmr) + return ERR_PTR(-ENOMEM); + + memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr); + err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num, + convert_access(mr_access_flags), fmr); + + if (err) { + kfree(fmr); + return ERR_PTR(err); + } + + return &fmr->ibmr; +} + +static int mthca_dealloc_fmr(struct ib_fmr *fmr) +{ + struct mthca_fmr *mfmr = to_mfmr(fmr); + int err; + + err = mthca_free_fmr(to_mdev(fmr->device), mfmr); + if (err) + return err; + + kfree(mfmr); + return 0; +} + +static int mthca_unmap_fmr(struct list_head *fmr_list) +{ + struct ib_fmr *fmr; + int err; + struct mthca_dev *mdev = NULL; + + list_for_each_entry(fmr, fmr_list, list) { + if (mdev && to_mdev(fmr->device) != mdev) + return -EINVAL; + mdev = to_mdev(fmr->device); + } + + if (!mdev) + return 0; + + if (mthca_is_memfree(mdev)) { + list_for_each_entry(fmr, fmr_list, list) + mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr)); + + wmb(); + } else + list_for_each_entry(fmr, fmr_list, list) + mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr)); + + err = mthca_SYNC_TPT(mdev); + return err; +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + return sprintf(buf, "%x\n", dev->rev_id); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + switch (dev->pdev->device) { + case PCI_DEVICE_ID_MELLANOX_TAVOR: + return sprintf(buf, "MT23108\n"); + case PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT: + return sprintf(buf, "MT25208 (MT23108 compat mode)\n"); + case PCI_DEVICE_ID_MELLANOX_ARBEL: + return sprintf(buf, "MT25208\n"); + case PCI_DEVICE_ID_MELLANOX_SINAI: + case PCI_DEVICE_ID_MELLANOX_SINAI_OLD: + return sprintf(buf, "MT25204\n"); + default: + return sprintf(buf, "unknown\n"); + } +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev.dev); + return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *mthca_dev_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +static int mthca_init_node_data(struct mthca_dev *dev) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp 
*out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mthca_MAD_IFC(dev, 1, 1, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(dev->ib_dev.node_desc, out_mad->data, IB_DEVICE_NODE_DESC_MAX); + + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mthca_MAD_IFC(dev, 1, 1, + 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + if (mthca_is_memfree(dev)) + dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); + memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static void get_dev_fw_str(struct ib_device *device, char *str) +{ + struct mthca_dev *dev = + container_of(device, struct mthca_dev, ib_dev); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", + (int) (dev->fw_ver >> 32), + (int) (dev->fw_ver >> 16) & 0xffff, + (int) dev->fw_ver & 0xffff); +} + +int mthca_register_device(struct mthca_dev *dev) +{ + int ret; + int i; + + ret = mthca_init_node_data(dev); + if (ret) + return ret; + + strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX); + dev->ib_dev.owner = THIS_MODULE; + + dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST); + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.phys_port_cnt = dev->limits.num_ports; + dev->ib_dev.num_comp_vectors = 1; + dev->ib_dev.dev.parent = &dev->pdev->dev; + dev->ib_dev.query_device = mthca_query_device; + dev->ib_dev.query_port = mthca_query_port; + dev->ib_dev.modify_device = mthca_modify_device; + dev->ib_dev.modify_port = mthca_modify_port; + dev->ib_dev.query_pkey = mthca_query_pkey; + dev->ib_dev.query_gid = mthca_query_gid; + dev->ib_dev.alloc_ucontext = mthca_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = mthca_dealloc_ucontext; + dev->ib_dev.mmap = mthca_mmap_uar; + dev->ib_dev.alloc_pd = mthca_alloc_pd; + dev->ib_dev.dealloc_pd = mthca_dealloc_pd; + dev->ib_dev.create_ah = mthca_ah_create; + dev->ib_dev.query_ah = mthca_ah_query; + dev->ib_dev.destroy_ah = mthca_ah_destroy; + + if (dev->mthca_flags & MTHCA_FLAG_SRQ) { + dev->ib_dev.create_srq = mthca_create_srq; + dev->ib_dev.modify_srq = mthca_modify_srq; + dev->ib_dev.query_srq = 
mthca_query_srq; + dev->ib_dev.destroy_srq = mthca_destroy_srq; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + + if (mthca_is_memfree(dev)) + dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv; + else + dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv; + } + + dev->ib_dev.create_qp = mthca_create_qp; + dev->ib_dev.modify_qp = mthca_modify_qp; + dev->ib_dev.query_qp = mthca_query_qp; + dev->ib_dev.destroy_qp = mthca_destroy_qp; + dev->ib_dev.create_cq = mthca_create_cq; + dev->ib_dev.resize_cq = mthca_resize_cq; + dev->ib_dev.destroy_cq = mthca_destroy_cq; + dev->ib_dev.poll_cq = mthca_poll_cq; + dev->ib_dev.get_dma_mr = mthca_get_dma_mr; + dev->ib_dev.reg_user_mr = mthca_reg_user_mr; + dev->ib_dev.dereg_mr = mthca_dereg_mr; + dev->ib_dev.get_port_immutable = mthca_port_immutable; + dev->ib_dev.get_dev_fw_str = get_dev_fw_str; + + if (dev->mthca_flags & MTHCA_FLAG_FMR) { + dev->ib_dev.alloc_fmr = mthca_alloc_fmr; + dev->ib_dev.unmap_fmr = mthca_unmap_fmr; + dev->ib_dev.dealloc_fmr = mthca_dealloc_fmr; + if (mthca_is_memfree(dev)) + dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr; + else + dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr; + } + + dev->ib_dev.attach_mcast = mthca_multicast_attach; + dev->ib_dev.detach_mcast = mthca_multicast_detach; + dev->ib_dev.process_mad = mthca_process_mad; + + if (mthca_is_memfree(dev)) { + dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq; + dev->ib_dev.post_send = mthca_arbel_post_send; + dev->ib_dev.post_recv = mthca_arbel_post_receive; + } else { + dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq; + dev->ib_dev.post_send = mthca_tavor_post_send; + dev->ib_dev.post_recv = mthca_tavor_post_receive; + } + + mutex_init(&dev->cap_mask_mutex); + + dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA; + ret = ib_register_device(&dev->ib_dev, NULL); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) { + ret = device_create_file(&dev->ib_dev.dev, + mthca_dev_attributes[i]); + if (ret) { + ib_unregister_device(&dev->ib_dev); + return ret; + } + } + + mthca_start_catas_poll(dev); + + return 0; +} + +void mthca_unregister_device(struct mthca_dev *dev) +{ + mthca_stop_catas_poll(dev); + ib_unregister_device(&dev->ib_dev); +} diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h new file mode 100644 index 0000000..596acc4 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_PROVIDER_H +#define MTHCA_PROVIDER_H + +#include +#include + +#define MTHCA_MPT_FLAG_ATOMIC (1 << 14) +#define MTHCA_MPT_FLAG_REMOTE_WRITE (1 << 13) +#define MTHCA_MPT_FLAG_REMOTE_READ (1 << 12) +#define MTHCA_MPT_FLAG_LOCAL_WRITE (1 << 11) +#define MTHCA_MPT_FLAG_LOCAL_READ (1 << 10) + +struct mthca_buf_list { + void *buf; + DEFINE_DMA_UNMAP_ADDR(mapping); +}; + +union mthca_buf { + struct mthca_buf_list direct; + struct mthca_buf_list *page_list; +}; + +struct mthca_uar { + unsigned long pfn; + int index; +}; + +struct mthca_user_db_table; + +struct mthca_ucontext { + struct ib_ucontext ibucontext; + struct mthca_uar uar; + struct mthca_user_db_table *db_tab; + int reg_mr_warned; +}; + +struct mthca_mtt; + +struct mthca_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct mthca_mtt *mtt; +}; + +struct mthca_fmr { + struct ib_fmr ibmr; + struct ib_fmr_attr attr; + struct mthca_mtt *mtt; + int maps; + union { + struct { + struct mthca_mpt_entry __iomem *mpt; + u64 __iomem *mtts; + } tavor; + struct { + struct mthca_mpt_entry *mpt; + __be64 *mtts; + dma_addr_t dma_handle; + } arbel; + } mem; +}; + +struct mthca_pd { + struct ib_pd ibpd; + u32 pd_num; + atomic_t sqp_count; + struct mthca_mr ntmr; + int privileged; +}; + +struct mthca_eq { + struct mthca_dev *dev; + int eqn; + u32 eqn_mask; + u32 cons_index; + u16 msi_x_vector; + u16 msi_x_entry; + int have_irq; + int nent; + struct mthca_buf_list *page_list; + struct mthca_mr mr; + char irq_name[IB_DEVICE_NAME_MAX]; +}; + +struct mthca_av; + +enum mthca_ah_type { + MTHCA_AH_ON_HCA, + MTHCA_AH_PCI_POOL, + MTHCA_AH_KMALLOC +}; + +struct mthca_ah { + struct ib_ah ibah; + enum mthca_ah_type type; + u32 key; + struct mthca_av *av; + dma_addr_t avdma; +}; + +/* + * Quick description of our CQ/QP locking scheme: + * + * We have one global lock that protects dev->cq/qp_table. Each + * struct mthca_cq/qp also has its own lock. An individual qp lock + * may be taken inside of an individual cq lock. Both cqs attached to + * a qp may be locked, with the cq with the lower cqn locked first. + * No other nesting should be done. + * + * Each struct mthca_cq/qp also has an ref count, protected by the + * corresponding table lock. The pointer from the cq/qp_table to the + * struct counts as one reference. This reference also is good for + * access through the consumer API, so modifying the CQ/QP etc doesn't + * need to take another reference. Access to a QP because of a + * completion being polled does not need a reference either. + * + * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the + * destroy function to sleep on. + * + * This means that access from the consumer API requires nothing but + * taking the struct's lock. 
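+ *
+ * As a rough sketch (this is essentially what mthca_qp_event() in
+ * mthca_qp.c does, error handling omitted), the event-side pattern
+ * in code is:
+ *
+ *     spin_lock(&dev->qp_table.lock);
+ *     qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1));
+ *     if (qp)
+ *             ++qp->refcount;
+ *     spin_unlock(&dev->qp_table.lock);
+ *     ...deliver the event to qp->ibqp.event_handler()...
+ *     spin_lock(&dev->qp_table.lock);
+ *     if (!--qp->refcount)
+ *             wake_up(&qp->wait);
+ *     spin_unlock(&dev->qp_table.lock);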
+ * + * Access because of a completion event should go as follows: + * - lock cq/qp_table and look up struct + * - increment ref count in struct + * - drop cq/qp_table lock + * - lock struct, do your thing, and unlock struct + * - decrement ref count; if zero, wake up waiters + * + * To destroy a CQ/QP, we can do the following: + * - lock cq/qp_table + * - remove pointer and decrement ref count + * - unlock cq/qp_table lock + * - wait_event until ref count is zero + * + * It is the consumer's responsibilty to make sure that no QP + * operations (WQE posting or state modification) are pending when a + * QP is destroyed. Also, the consumer must make sure that calls to + * qp_modify are serialized. Similarly, the consumer is responsible + * for ensuring that no CQ resize operations are pending when a CQ + * is destroyed. + * + * Possible optimizations (wait for profile data to see if/where we + * have locks bouncing between CPUs): + * - split cq/qp table lock into n separate (cache-aligned) locks, + * indexed (say) by the page in the table + * - split QP struct lock into three (one for common info, one for the + * send queue and one for the receive queue) + */ + +struct mthca_cq_buf { + union mthca_buf queue; + struct mthca_mr mr; + int is_direct; +}; + +struct mthca_cq_resize { + struct mthca_cq_buf buf; + int cqe; + enum { + CQ_RESIZE_ALLOC, + CQ_RESIZE_READY, + CQ_RESIZE_SWAPPED + } state; +}; + +struct mthca_cq { + struct ib_cq ibcq; + spinlock_t lock; + int refcount; + int cqn; + u32 cons_index; + struct mthca_cq_buf buf; + struct mthca_cq_resize *resize_buf; + int is_kernel; + + /* Next fields are Arbel only */ + int set_ci_db_index; + __be32 *set_ci_db; + int arm_db_index; + __be32 *arm_db; + int arm_sn; + + wait_queue_head_t wait; + struct mutex mutex; +}; + +struct mthca_srq { + struct ib_srq ibsrq; + spinlock_t lock; + int refcount; + int srqn; + int max; + int max_gs; + int wqe_shift; + int first_free; + int last_free; + u16 counter; /* Arbel only */ + int db_index; /* Arbel only */ + __be32 *db; /* Arbel only */ + void *last; + + int is_direct; + u64 *wrid; + union mthca_buf queue; + struct mthca_mr mr; + + wait_queue_head_t wait; + struct mutex mutex; +}; + +struct mthca_wq { + spinlock_t lock; + int max; + unsigned next_ind; + unsigned last_comp; + unsigned head; + unsigned tail; + void *last; + int max_gs; + int wqe_shift; + + int db_index; /* Arbel only */ + __be32 *db; +}; + +struct mthca_qp { + struct ib_qp ibqp; + int refcount; + u32 qpn; + int is_direct; + u8 port; /* for SQP and memfree use only */ + u8 alt_port; /* for memfree use only */ + u8 transport; + u8 state; + u8 atomic_rd_en; + u8 resp_depth; + + struct mthca_mr mr; + + struct mthca_wq rq; + struct mthca_wq sq; + enum ib_sig_type sq_policy; + int send_wqe_offset; + int max_inline_data; + + u64 *wrid; + union mthca_buf queue; + + wait_queue_head_t wait; + struct mutex mutex; +}; + +struct mthca_sqp { + struct mthca_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + int header_buf_size; + void *header_buf; + dma_addr_t header_dma; +}; + +static inline struct mthca_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mthca_ucontext, ibucontext); +} + +static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr) +{ + return container_of(ibmr, struct mthca_fmr, ibmr); +} + +static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mthca_mr, ibmr); +} + +static inline struct mthca_pd *to_mpd(struct 
ib_pd *ibpd) +{ + return container_of(ibpd, struct mthca_pd, ibpd); +} + +static inline struct mthca_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mthca_ah, ibah); +} + +static inline struct mthca_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mthca_cq, ibcq); +} + +static inline struct mthca_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct mthca_srq, ibsrq); +} + +static inline struct mthca_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mthca_qp, ibqp); +} + +static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp) +{ + return container_of(qp, struct mthca_sqp, qp); +} + +#endif /* MTHCA_PROVIDER_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c new file mode 100644 index 0000000..d21960c --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -0,0 +1,2323 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include + +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" +#include "mthca_wqe.h" + +enum { + MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE, + MTHCA_ACK_REQ_FREQ = 10, + MTHCA_FLIGHT_LIMIT = 9, + MTHCA_UD_HEADER_SIZE = 72, /* largest UD header possible */ + MTHCA_INLINE_HEADER_SIZE = 4, /* data segment overhead for inline */ + MTHCA_INLINE_CHUNK_SIZE = 16 /* inline data segment chunk */ +}; + +enum { + MTHCA_QP_STATE_RST = 0, + MTHCA_QP_STATE_INIT = 1, + MTHCA_QP_STATE_RTR = 2, + MTHCA_QP_STATE_RTS = 3, + MTHCA_QP_STATE_SQE = 4, + MTHCA_QP_STATE_SQD = 5, + MTHCA_QP_STATE_ERR = 6, + MTHCA_QP_STATE_DRAINING = 7 +}; + +enum { + MTHCA_QP_ST_RC = 0x0, + MTHCA_QP_ST_UC = 0x1, + MTHCA_QP_ST_RD = 0x2, + MTHCA_QP_ST_UD = 0x3, + MTHCA_QP_ST_MLX = 0x7 +}; + +enum { + MTHCA_QP_PM_MIGRATED = 0x3, + MTHCA_QP_PM_ARMED = 0x0, + MTHCA_QP_PM_REARM = 0x1 +}; + +enum { + /* qp_context flags */ + MTHCA_QP_BIT_DE = 1 << 8, + /* params1 */ + MTHCA_QP_BIT_SRE = 1 << 15, + MTHCA_QP_BIT_SWE = 1 << 14, + MTHCA_QP_BIT_SAE = 1 << 13, + MTHCA_QP_BIT_SIC = 1 << 4, + MTHCA_QP_BIT_SSC = 1 << 3, + /* params2 */ + MTHCA_QP_BIT_RRE = 1 << 15, + MTHCA_QP_BIT_RWE = 1 << 14, + MTHCA_QP_BIT_RAE = 1 << 13, + MTHCA_QP_BIT_RIC = 1 << 4, + MTHCA_QP_BIT_RSC = 1 << 3 +}; + +enum { + MTHCA_SEND_DOORBELL_FENCE = 1 << 5 +}; + +struct mthca_qp_path { + __be32 port_pkey; + u8 rnr_retry; + u8 g_mylmc; + __be16 rlid; + u8 ackto; + u8 mgid_index; + u8 static_rate; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + u8 rgid[16]; +} __attribute__((packed)); + +struct mthca_qp_context { + __be32 flags; + __be32 tavor_sched_queue; /* Reserved on Arbel */ + u8 mtu_msgmax; + u8 rq_size_stride; /* Reserved on Tavor */ + u8 sq_size_stride; /* Reserved on Tavor */ + u8 rlkey_arbel_sched_queue; /* Reserved on Tavor */ + __be32 usr_page; + __be32 local_qpn; + __be32 remote_qpn; + u32 reserved1[2]; + struct mthca_qp_path pri_path; + struct mthca_qp_path alt_path; + __be32 rdd; + __be32 pd; + __be32 wqe_base; + __be32 wqe_lkey; + __be32 params1; + __be32 reserved2; + __be32 next_send_psn; + __be32 cqn_snd; + __be32 snd_wqe_base_l; /* Next send WQE on Tavor */ + __be32 snd_db_index; /* (debugging only entries) */ + __be32 last_acked_psn; + __be32 ssn; + __be32 params2; + __be32 rnr_nextrecvpsn; + __be32 ra_buff_indx; + __be32 cqn_rcv; + __be32 rcv_wqe_base_l; /* Next recv WQE on Tavor */ + __be32 rcv_db_index; /* (debugging only entries) */ + __be32 qkey; + __be32 srqn; + __be32 rmsn; + __be16 rq_wqe_counter; /* reserved on Tavor */ + __be16 sq_wqe_counter; /* reserved on Tavor */ + u32 reserved3[18]; +} __attribute__((packed)); + +struct mthca_qp_param { + __be32 opt_param_mask; + u32 reserved1; + struct mthca_qp_context context; + u32 reserved2[62]; +} __attribute__((packed)); + +enum { + MTHCA_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, + MTHCA_QP_OPTPAR_RRE = 1 << 1, + MTHCA_QP_OPTPAR_RAE = 1 << 2, + MTHCA_QP_OPTPAR_RWE = 1 << 3, + MTHCA_QP_OPTPAR_PKEY_INDEX = 1 << 4, + MTHCA_QP_OPTPAR_Q_KEY = 1 << 5, + MTHCA_QP_OPTPAR_RNR_TIMEOUT = 1 << 6, + MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7, + MTHCA_QP_OPTPAR_SRA_MAX = 1 << 8, + MTHCA_QP_OPTPAR_RRA_MAX = 1 << 9, + MTHCA_QP_OPTPAR_PM_STATE = 1 << 10, + MTHCA_QP_OPTPAR_PORT_NUM = 1 << 11, + MTHCA_QP_OPTPAR_RETRY_COUNT = 1 << 12, + MTHCA_QP_OPTPAR_ALT_RNR_RETRY = 1 << 13, + MTHCA_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, + MTHCA_QP_OPTPAR_RNR_RETRY = 1 << 15, + MTHCA_QP_OPTPAR_SCHED_QUEUE = 1 << 16 +}; + +static const u8 mthca_opcode[] = { 
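+ /*
+  * IB verbs send opcode -> hardware opcode translation table; any
+  * IB_WR_* value not listed here is left zero-initialized.
+  */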
+	[IB_WR_SEND] = MTHCA_OPCODE_SEND,
+	[IB_WR_SEND_WITH_IMM] = MTHCA_OPCODE_SEND_IMM,
+	[IB_WR_RDMA_WRITE] = MTHCA_OPCODE_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM] = MTHCA_OPCODE_RDMA_WRITE_IMM,
+	[IB_WR_RDMA_READ] = MTHCA_OPCODE_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP] = MTHCA_OPCODE_ATOMIC_CS,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
+};
+
+static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	return qp->qpn >= dev->qp_table.sqp_start &&
+		qp->qpn <= dev->qp_table.sqp_start + 3;
+}
+
+static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	return qp->qpn >= dev->qp_table.sqp_start &&
+		qp->qpn <= dev->qp_table.sqp_start + 1;
+}
+
+static void *get_recv_wqe(struct mthca_qp *qp, int n)
+{
+	if (qp->is_direct)
+		return qp->queue.direct.buf + (n << qp->rq.wqe_shift);
+	else
+		return qp->queue.page_list[(n << qp->rq.wqe_shift) >> PAGE_SHIFT].buf +
+			((n << qp->rq.wqe_shift) & (PAGE_SIZE - 1));
+}
+
+static void *get_send_wqe(struct mthca_qp *qp, int n)
+{
+	if (qp->is_direct)
+		return qp->queue.direct.buf + qp->send_wqe_offset +
+			(n << qp->sq.wqe_shift);
+	else
+		return qp->queue.page_list[(qp->send_wqe_offset +
+					    (n << qp->sq.wqe_shift)) >>
+					   PAGE_SHIFT].buf +
+			((qp->send_wqe_offset + (n << qp->sq.wqe_shift)) &
+			 (PAGE_SIZE - 1));
+}
+
+static void mthca_wq_reset(struct mthca_wq *wq)
+{
+	wq->next_ind = 0;
+	wq->last_comp = wq->max - 1;
+	wq->head = 0;
+	wq->tail = 0;
+}
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+		    enum ib_event_type event_type)
+{
+	struct mthca_qp *qp;
+	struct ib_event event;
+
+	spin_lock(&dev->qp_table.lock);
+	qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1));
+	if (qp)
+		++qp->refcount;
+	spin_unlock(&dev->qp_table.lock);
+
+	if (!qp) {
+		mthca_warn(dev, "Async event %d for bogus QP %08x\n",
+			   event_type, qpn);
+		return;
+	}
+
+	if (event_type == IB_EVENT_PATH_MIG)
+		qp->port = qp->alt_port;
+
+	event.device = &dev->ib_dev;
+	event.event = event_type;
+	event.element.qp = &qp->ibqp;
+	if (qp->ibqp.event_handler)
+		qp->ibqp.event_handler(&event, qp->ibqp.qp_context);
+
+	spin_lock(&dev->qp_table.lock);
+	if (!--qp->refcount)
+		wake_up(&qp->wait);
+	spin_unlock(&dev->qp_table.lock);
+}
+
+static int to_mthca_state(enum ib_qp_state ib_state)
+{
+	switch (ib_state) {
+	case IB_QPS_RESET: return MTHCA_QP_STATE_RST;
+	case IB_QPS_INIT:  return MTHCA_QP_STATE_INIT;
+	case IB_QPS_RTR:   return MTHCA_QP_STATE_RTR;
+	case IB_QPS_RTS:   return MTHCA_QP_STATE_RTS;
+	case IB_QPS_SQD:   return MTHCA_QP_STATE_SQD;
+	case IB_QPS_SQE:   return MTHCA_QP_STATE_SQE;
+	case IB_QPS_ERR:   return MTHCA_QP_STATE_ERR;
+	default:           return -1;
+	}
+}
+
+enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS };
+
+static int to_mthca_st(int transport)
+{
+	switch (transport) {
+	case RC:  return MTHCA_QP_ST_RC;
+	case UC:  return MTHCA_QP_ST_UC;
+	case UD:  return MTHCA_QP_ST_UD;
+	case RD:  return MTHCA_QP_ST_RD;
+	case MLX: return MTHCA_QP_ST_MLX;
+	default:  return -1;
+	}
+}
+
+static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr,
+			int attr_mask)
+{
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		sqp->pkey_index = attr->pkey_index;
+	if (attr_mask & IB_QP_QKEY)
+		sqp->qkey = attr->qkey;
+	if (attr_mask & IB_QP_SQ_PSN)
+		sqp->send_psn = attr->sq_psn;
+}
+
+static void init_port(struct mthca_dev *dev, int port)
+{
+	int err;
+	struct mthca_init_ib_param param;
+
+	memset(&param, 0, sizeof param);
+
+	param.port_width = dev->limits.port_width_cap;
+	param.vl_cap     = dev->limits.vl_cap;
+	param.mtu_cap    = dev->limits.mtu_cap;
+	param.gid_cap    = dev->limits.gid_table_len;
+	param.pkey_cap   = dev->limits.pkey_table_len;
+
+	err = mthca_INIT_IB(dev, &param, port);
+	if (err)
+		mthca_warn(dev, "INIT_IB failed, return code %d.\n", err);
+}
+
+static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr,
+				  int attr_mask)
+{
+	u8 dest_rd_atomic;
+	u32 access_flags;
+	u32 hw_access_flags = 0;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		dest_rd_atomic = attr->max_dest_rd_atomic;
+	else
+		dest_rd_atomic = qp->resp_depth;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		access_flags = attr->qp_access_flags;
+	else
+		access_flags = qp->atomic_rd_en;
+
+	if (!dest_rd_atomic)
+		access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+	if (access_flags & IB_ACCESS_REMOTE_READ)
+		hw_access_flags |= MTHCA_QP_BIT_RRE;
+	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+		hw_access_flags |= MTHCA_QP_BIT_RAE;
+	if (access_flags & IB_ACCESS_REMOTE_WRITE)
+		hw_access_flags |= MTHCA_QP_BIT_RWE;
+
+	return cpu_to_be32(hw_access_flags);
+}
+
+static inline enum ib_qp_state to_ib_qp_state(int mthca_state)
+{
+	switch (mthca_state) {
+	case MTHCA_QP_STATE_RST:      return IB_QPS_RESET;
+	case MTHCA_QP_STATE_INIT:     return IB_QPS_INIT;
+	case MTHCA_QP_STATE_RTR:      return IB_QPS_RTR;
+	case MTHCA_QP_STATE_RTS:      return IB_QPS_RTS;
+	case MTHCA_QP_STATE_DRAINING:
+	case MTHCA_QP_STATE_SQD:      return IB_QPS_SQD;
+	case MTHCA_QP_STATE_SQE:      return IB_QPS_SQE;
+	case MTHCA_QP_STATE_ERR:      return IB_QPS_ERR;
+	default:                      return -1;
+	}
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mthca_mig_state)
+{
+	switch (mthca_mig_state) {
+	case 0:  return IB_MIG_ARMED;
+	case 1:  return IB_MIG_REARM;
+	case 3:  return IB_MIG_MIGRATED;
+	default: return -1;
+	}
+}
+
+static int to_ib_qp_access_flags(int mthca_flags)
+{
+	int ib_flags = 0;
+
+	if (mthca_flags & MTHCA_QP_BIT_RRE)
+		ib_flags |= IB_ACCESS_REMOTE_READ;
+	if (mthca_flags & MTHCA_QP_BIT_RWE)
+		ib_flags |= IB_ACCESS_REMOTE_WRITE;
+	if (mthca_flags & MTHCA_QP_BIT_RAE)
+		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+	return ib_flags;
+}
+
+static void to_rdma_ah_attr(struct mthca_dev *dev,
+			    struct rdma_ah_attr *ah_attr,
+			    struct mthca_qp_path *path)
+{
+	u8 port_num = (be32_to_cpu(path->port_pkey) >> 24) & 0x3;
+
+	memset(ah_attr, 0, sizeof(*ah_attr));
+
+	if (port_num == 0 || port_num > dev->limits.num_ports)
+		return;
+	ah_attr->type = rdma_ah_find_type(&dev->ib_dev, port_num);
+	rdma_ah_set_port_num(ah_attr, port_num);
+
+	rdma_ah_set_dlid(ah_attr, be16_to_cpu(path->rlid));
+	rdma_ah_set_sl(ah_attr, be32_to_cpu(path->sl_tclass_flowlabel) >> 28);
+	rdma_ah_set_path_bits(ah_attr, path->g_mylmc & 0x7f);
+	rdma_ah_set_static_rate(ah_attr,
+				mthca_rate_to_ib(dev,
+						 path->static_rate & 0xf,
+						 port_num));
+	if (path->g_mylmc & (1 << 7)) {
+		u32 tc_fl = be32_to_cpu(path->sl_tclass_flowlabel);
+
+		rdma_ah_set_grh(ah_attr, NULL,
+				tc_fl & 0xfffff,
+				path->mgid_index &
+				(dev->limits.gid_table_len - 1),
+				path->hop_limit,
+				(tc_fl >> 20) & 0xff);
+		rdma_ah_set_dgid_raw(ah_attr, path->rgid);
+	}
+}
+
+int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	int err = 0;
+	struct mthca_mailbox *mailbox = NULL;
+	struct mthca_qp_param *qp_param;
+	struct mthca_qp_context *context;
+	int mthca_state;
+
+	mutex_lock(&qp->mutex);
+
+	if (qp->state == IB_QPS_RESET) {
+		qp_attr->qp_state = IB_QPS_RESET;
+		goto done;
+	}
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if
(IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto out; + } + + err = mthca_QUERY_QP(dev, qp->qpn, 0, mailbox); + if (err) { + mthca_warn(dev, "QUERY_QP failed (%d)\n", err); + goto out_mailbox; + } + + qp_param = mailbox->buf; + context = &qp_param->context; + mthca_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mthca_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->transport == RC || qp->transport == UC) { + to_rdma_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = + be32_to_cpu(context->alt_path.port_pkey) & 0x7f; + qp_attr->alt_port_num = + rdma_ah_get_port_num(&qp_attr->alt_ah_attr); + } + + qp_attr->pkey_index = be32_to_cpu(context->pri_path.port_pkey) & 0x7f; + qp_attr->port_num = + (be32_to_cpu(context->pri_path.port_pkey) >> 24) & 0x3; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mthca_state == MTHCA_QP_STATE_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = context->pri_path.rnr_retry >> 5; + qp_attr->alt_timeout = context->alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_send_wr = qp->sq.max; + qp_attr->cap.max_recv_wr = qp->rq.max; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->cap = qp_attr->cap; + qp_init_attr->sq_sig_type = qp->sq_policy; + +out_mailbox: + mthca_free_mailbox(dev, mailbox); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mthca_path_set(struct mthca_dev *dev, const struct rdma_ah_attr *ah, + struct mthca_qp_path *path, u8 port) +{ + path->g_mylmc = rdma_ah_get_path_bits(ah) & 0x7f; + path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah)); + path->static_rate = mthca_get_rate(dev, rdma_ah_get_static_rate(ah), + port); + + if (rdma_ah_get_ah_flags(ah) & IB_AH_GRH) { + const struct ib_global_route *grh = rdma_ah_read_grh(ah); + + if (grh->sgid_index >= dev->limits.gid_table_len) { + mthca_dbg(dev, "sgid_index (%u) too large. 
max is %d\n", + grh->sgid_index, + dev->limits.gid_table_len - 1); + return -1; + } + + path->g_mylmc |= 1 << 7; + path->mgid_index = grh->sgid_index; + path->hop_limit = grh->hop_limit; + path->sl_tclass_flowlabel = + cpu_to_be32((rdma_ah_get_sl(ah) << 28) | + (grh->traffic_class << 20) | + (grh->flow_label)); + memcpy(path->rgid, grh->dgid.raw, 16); + } else { + path->sl_tclass_flowlabel = cpu_to_be32(rdma_ah_get_sl(ah) << + 28); + } + + return 0; +} + +static int __mthca_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, enum ib_qp_state new_state) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + struct mthca_mailbox *mailbox; + struct mthca_qp_param *qp_param; + struct mthca_qp_context *qp_context; + u32 sqd_event = 0; + int err = -EINVAL; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto out; + } + qp_param = mailbox->buf; + qp_context = &qp_param->context; + memset(qp_param, 0, sizeof *qp_param); + + qp_context->flags = cpu_to_be32((to_mthca_state(new_state) << 28) | + (to_mthca_st(qp->transport) << 16)); + qp_context->flags |= cpu_to_be32(MTHCA_QP_BIT_DE); + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11); + else { + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE); + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11); + break; + case IB_MIG_REARM: + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11); + break; + case IB_MIG_ARMED: + qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11); + break; + } + } + + /* leave tavor_sched_queue as 0 */ + + if (qp->transport == MLX || qp->transport == UD) + qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11; + else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_2048) { + mthca_dbg(dev, "path MTU (%u) is invalid\n", + attr->path_mtu); + goto out_mailbox; + } + qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31; + } + + if (mthca_is_memfree(dev)) { + if (qp->rq.max) + qp_context->rq_size_stride = ilog2(qp->rq.max) << 3; + qp_context->rq_size_stride |= qp->rq.wqe_shift - 4; + + if (qp->sq.max) + qp_context->sq_size_stride = ilog2(qp->sq.max) << 3; + qp_context->sq_size_stride |= qp->sq.wqe_shift - 4; + } + + /* leave arbel_sched_queue as 0 */ + + if (qp->ibqp.uobject) + qp_context->usr_page = + cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index); + else + qp_context->usr_page = cpu_to_be32(dev->driver_uar.index); + qp_context->local_qpn = cpu_to_be32(qp->qpn); + if (attr_mask & IB_QP_DEST_QPN) { + qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num); + } + + if (qp->transport == MLX) + qp_context->pri_path.port_pkey |= + cpu_to_be32(qp->port << 24); + else { + if (attr_mask & IB_QP_PORT) { + qp_context->pri_path.port_pkey |= + cpu_to_be32(attr->port_num << 24); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM); + } + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + qp_context->pri_path.port_pkey |= + cpu_to_be32(attr->pkey_index); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX); + } + + if (attr_mask & IB_QP_RNR_RETRY) { + qp_context->alt_path.rnr_retry = qp_context->pri_path.rnr_retry = + attr->rnr_retry << 5; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY | + MTHCA_QP_OPTPAR_ALT_RNR_RETRY); + } + + if (attr_mask & IB_QP_AV) 
{ + if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) + goto out_mailbox; + + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH); + } + + if (ibqp->qp_type == IB_QPT_RC && + cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + u8 sched_queue = ibqp->uobject ? 0x2 : 0x1; + + if (mthca_is_memfree(dev)) + qp_context->rlkey_arbel_sched_queue |= sched_queue; + else + qp_context->tavor_sched_queue |= cpu_to_be32(sched_queue); + + qp_param->opt_param_mask |= + cpu_to_be32(MTHCA_QP_OPTPAR_SCHED_QUEUE); + } + + if (attr_mask & IB_QP_TIMEOUT) { + qp_context->pri_path.ackto = attr->timeout << 3; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT); + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_pkey_index >= dev->limits.pkey_table_len) { + mthca_dbg(dev, "Alternate P_Key index (%u) too large. max is %d\n", + attr->alt_pkey_index, dev->limits.pkey_table_len-1); + goto out_mailbox; + } + + if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) { + mthca_dbg(dev, "Alternate port number (%u) is invalid\n", + attr->alt_port_num); + goto out_mailbox; + } + + if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path, + rdma_ah_get_port_num(&attr->alt_ah_attr))) + goto out_mailbox; + + qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index | + attr->alt_port_num << 24); + qp_context->alt_path.ackto = attr->alt_timeout << 3; + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ALT_ADDR_PATH); + } + + /* leave rdd as 0 */ + qp_context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pd_num); + /* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */ + qp_context->wqe_lkey = cpu_to_be32(qp->mr.ibmr.lkey); + qp_context->params1 = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) | + (MTHCA_FLIGHT_LIMIT << 24) | + MTHCA_QP_BIT_SWE); + if (qp->sq_policy == IB_SIGNAL_ALL_WR) + qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC); + if (attr_mask & IB_QP_RETRY_CNT) { + qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT); + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic) { + qp_context->params1 |= + cpu_to_be32(MTHCA_QP_BIT_SRE | + MTHCA_QP_BIT_SAE); + qp_context->params1 |= + cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); + } + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX); + } + + if (attr_mask & IB_QP_SQ_PSN) + qp_context->next_send_psn = cpu_to_be32(attr->sq_psn); + qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn); + + if (mthca_is_memfree(dev)) { + qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset); + qp_context->snd_db_index = cpu_to_be32(qp->sq.db_index); + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic) + qp_context->params2 |= + cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); + + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX); + } + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + qp_context->params2 |= get_hw_access_flags(qp, attr, attr_mask); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE | + MTHCA_QP_OPTPAR_RRE | + MTHCA_QP_OPTPAR_RAE); + } + + qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC); + + if (ibqp->srq) + qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RIC); + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + 
qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT); + } + if (attr_mask & IB_QP_RQ_PSN) + qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + + qp_context->ra_buff_indx = + cpu_to_be32(dev->qp_table.rdb_base + + ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE << + dev->qp_table.rdb_shift)); + + qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn); + + if (mthca_is_memfree(dev)) + qp_context->rcv_db_index = cpu_to_be32(qp->rq.db_index); + + if (attr_mask & IB_QP_QKEY) { + qp_context->qkey = cpu_to_be32(attr->qkey); + qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY); + } + + if (ibqp->srq) + qp_context->srqn = cpu_to_be32(1 << 24 | + to_msrq(ibqp->srq)->srqn); + + if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && + attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && + attr->en_sqd_async_notify) + sqd_event = 1 << 31; + + err = mthca_MODIFY_QP(dev, cur_state, new_state, qp->qpn, 0, + mailbox, sqd_event); + if (err) { + mthca_warn(dev, "modify QP %d->%d returned %d.\n", + cur_state, new_state, err); + goto out_mailbox; + } + + qp->state = new_state; + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_port = attr->alt_port_num; + + if (is_sqp(dev, qp)) + store_attrs(to_msqp(qp), attr, attr_mask); + + /* + * If we moved QP0 to RTR, bring the IB link up; if we moved + * QP0 to RESET or ERROR, bring the link back down. + */ + if (is_qp0(dev, qp)) { + if (cur_state != IB_QPS_RTR && + new_state == IB_QPS_RTR) + init_port(dev, qp->port); + + if (cur_state != IB_QPS_RESET && + cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || + new_state == IB_QPS_ERR)) + mthca_CLOSE_IB(dev, qp->port); + } + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && !qp->ibqp.uobject) { + mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (qp->ibqp.send_cq != qp->ibqp.recv_cq) + mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, NULL); + + mthca_wq_reset(&qp->sq); + qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); + + mthca_wq_reset(&qp->rq); + qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1); + + if (mthca_is_memfree(dev)) { + *qp->sq.db = 0; + *qp->rq.db = 0; + } + } + +out_mailbox: + mthca_free_mailbox(dev, mailbox); +out: + return err; +} + +int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + + mutex_lock(&qp->mutex); + if (attr_mask & IB_QP_CUR_STATE) { + cur_state = attr->cur_qp_state; + } else { + spin_lock_irq(&qp->sq.lock); + spin_lock(&qp->rq.lock); + cur_state = qp->state; + spin_unlock(&qp->rq.lock); + spin_unlock_irq(&qp->sq.lock); + } + + new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, + IB_LINK_LAYER_UNSPECIFIED)) { + mthca_dbg(dev, "Bad QP transition (transport %d) " + "%d->%d with attr 0x%08x\n", + qp->transport, cur_state, new_state, + attr_mask); + goto out; + } + + if ((attr_mask & IB_QP_PKEY_INDEX) && + attr->pkey_index >= dev->limits.pkey_table_len) { + mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n", + attr->pkey_index, dev->limits.pkey_table_len-1); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) { + mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num); + goto out; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > dev->limits.max_qp_init_rdma) { + mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n", + attr->max_rd_atomic, dev->limits.max_qp_init_rdma); + goto out; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) { + mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n", + attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift); + goto out; + } + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static int mthca_max_data_size(struct mthca_dev *dev, struct mthca_qp *qp, int desc_sz) +{ + /* + * Calculate the maximum size of WQE s/g segments, excluding + * the next segment and other non-data segments. + */ + int max_data_size = desc_sz - sizeof (struct mthca_next_seg); + + switch (qp->transport) { + case MLX: + max_data_size -= 2 * sizeof (struct mthca_data_seg); + break; + + case UD: + if (mthca_is_memfree(dev)) + max_data_size -= sizeof (struct mthca_arbel_ud_seg); + else + max_data_size -= sizeof (struct mthca_tavor_ud_seg); + break; + + default: + max_data_size -= sizeof (struct mthca_raddr_seg); + break; + } + + return max_data_size; +} + +static inline int mthca_max_inline_data(struct mthca_pd *pd, int max_data_size) +{ + /* We don't support inline data for kernel QPs (yet). */ + return pd->ibpd.uobject ? max_data_size - MTHCA_INLINE_HEADER_SIZE : 0; +} + +static void mthca_adjust_qp_caps(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_qp *qp) +{ + int max_data_size = mthca_max_data_size(dev, qp, + min(dev->limits.max_desc_sz, + 1 << qp->sq.wqe_shift)); + + qp->max_inline_data = mthca_max_inline_data(pd, max_data_size); + + qp->sq.max_gs = min_t(int, dev->limits.max_sg, + max_data_size / sizeof (struct mthca_data_seg)); + qp->rq.max_gs = min_t(int, dev->limits.max_sg, + (min(dev->limits.max_desc_sz, 1 << qp->rq.wqe_shift) - + sizeof (struct mthca_next_seg)) / + sizeof (struct mthca_data_seg)); +} + +/* + * Allocate and register buffer for WQEs. qp->rq.max, sq.max, + * rq.max_gs and sq.max_gs must all be assigned. 
+ * mthca_alloc_wqe_buf will calculate rq.wqe_shift and + * sq.wqe_shift (as well as send_wqe_offset, is_direct, and + * queue) + */ +static int mthca_alloc_wqe_buf(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_qp *qp) +{ + int size; + int err = -ENOMEM; + + size = sizeof (struct mthca_next_seg) + + qp->rq.max_gs * sizeof (struct mthca_data_seg); + + if (size > dev->limits.max_desc_sz) + return -EINVAL; + + for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size; + qp->rq.wqe_shift++) + ; /* nothing */ + + size = qp->sq.max_gs * sizeof (struct mthca_data_seg); + switch (qp->transport) { + case MLX: + size += 2 * sizeof (struct mthca_data_seg); + break; + + case UD: + size += mthca_is_memfree(dev) ? + sizeof (struct mthca_arbel_ud_seg) : + sizeof (struct mthca_tavor_ud_seg); + break; + + case UC: + size += sizeof (struct mthca_raddr_seg); + break; + + case RC: + size += sizeof (struct mthca_raddr_seg); + /* + * An atomic op will require an atomic segment, a + * remote address segment and one scatter entry. + */ + size = max_t(int, size, + sizeof (struct mthca_atomic_seg) + + sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_data_seg)); + break; + + default: + break; + } + + /* Make sure that we have enough space for a bind request */ + size = max_t(int, size, sizeof (struct mthca_bind_seg)); + + size += sizeof (struct mthca_next_seg); + + if (size > dev->limits.max_desc_sz) + return -EINVAL; + + for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size; + qp->sq.wqe_shift++) + ; /* nothing */ + + qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift, + 1 << qp->sq.wqe_shift); + + /* + * If this is a userspace QP, we don't actually have to + * allocate anything. All we need is to calculate the WQE + * sizes and the send_wqe_offset, so we're done now. 
+ */ + if (pd->ibpd.uobject) + return 0; + + size = PAGE_ALIGN(qp->send_wqe_offset + + (qp->sq.max << qp->sq.wqe_shift)); + + qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64), + GFP_KERNEL); + if (!qp->wrid) + goto err_out; + + err = mthca_buf_alloc(dev, size, MTHCA_MAX_DIRECT_QP_SIZE, + &qp->queue, &qp->is_direct, pd, 0, &qp->mr); + if (err) + goto err_out; + + return 0; + +err_out: + kfree(qp->wrid); + return err; +} + +static void mthca_free_wqe_buf(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + mthca_buf_free(dev, PAGE_ALIGN(qp->send_wqe_offset + + (qp->sq.max << qp->sq.wqe_shift)), + &qp->queue, qp->is_direct, &qp->mr); + kfree(qp->wrid); +} + +static int mthca_map_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + int ret; + + if (mthca_is_memfree(dev)) { + ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn); + if (ret) + return ret; + + ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn); + if (ret) + goto err_qpc; + + ret = mthca_table_get(dev, dev->qp_table.rdb_table, + qp->qpn << dev->qp_table.rdb_shift); + if (ret) + goto err_eqpc; + + } + + return 0; + +err_eqpc: + mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn); + +err_qpc: + mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn); + + return ret; +} + +static void mthca_unmap_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + mthca_table_put(dev, dev->qp_table.rdb_table, + qp->qpn << dev->qp_table.rdb_shift); + mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn); + mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn); +} + +static int mthca_alloc_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + if (mthca_is_memfree(dev)) { + qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ, + qp->qpn, &qp->rq.db); + if (qp->rq.db_index < 0) + return -ENOMEM; + + qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ, + qp->qpn, &qp->sq.db); + if (qp->sq.db_index < 0) { + mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index); + return -ENOMEM; + } + } + + return 0; +} + +static void mthca_free_memfree(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + if (mthca_is_memfree(dev)) { + mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index); + mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index); + } +} + +static int mthca_alloc_qp_common(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_sig_type send_policy, + struct mthca_qp *qp) +{ + int ret; + int i; + struct mthca_next_seg *next; + + qp->refcount = 1; + init_waitqueue_head(&qp->wait); + mutex_init(&qp->mutex); + qp->state = IB_QPS_RESET; + qp->atomic_rd_en = 0; + qp->resp_depth = 0; + qp->sq_policy = send_policy; + mthca_wq_reset(&qp->sq); + mthca_wq_reset(&qp->rq); + + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + ret = mthca_map_memfree(dev, qp); + if (ret) + return ret; + + ret = mthca_alloc_wqe_buf(dev, pd, qp); + if (ret) { + mthca_unmap_memfree(dev, qp); + return ret; + } + + mthca_adjust_qp_caps(dev, pd, qp); + + /* + * If this is a userspace QP, we're done now. The doorbells + * will be allocated and buffers will be initialized in + * userspace. 
+ */ + if (pd->ibpd.uobject) + return 0; + + ret = mthca_alloc_memfree(dev, qp); + if (ret) { + mthca_free_wqe_buf(dev, qp); + mthca_unmap_memfree(dev, qp); + return ret; + } + + if (mthca_is_memfree(dev)) { + struct mthca_data_seg *scatter; + int size = (sizeof (struct mthca_next_seg) + + qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16; + + for (i = 0; i < qp->rq.max; ++i) { + next = get_recv_wqe(qp, i); + next->nda_op = cpu_to_be32(((i + 1) & (qp->rq.max - 1)) << + qp->rq.wqe_shift); + next->ee_nds = cpu_to_be32(size); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift); + ++scatter) + scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); + } + + for (i = 0; i < qp->sq.max; ++i) { + next = get_send_wqe(qp, i); + next->nda_op = cpu_to_be32((((i + 1) & (qp->sq.max - 1)) << + qp->sq.wqe_shift) + + qp->send_wqe_offset); + } + } else { + for (i = 0; i < qp->rq.max; ++i) { + next = get_recv_wqe(qp, i); + next->nda_op = htonl((((i + 1) % qp->rq.max) << + qp->rq.wqe_shift) | 1); + } + + } + + qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); + qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1); + + return 0; +} + +static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap, + struct mthca_pd *pd, struct mthca_qp *qp) +{ + int max_data_size = mthca_max_data_size(dev, qp, dev->limits.max_desc_sz); + + /* Sanity check QP size before proceeding */ + if (cap->max_send_wr > dev->limits.max_wqes || + cap->max_recv_wr > dev->limits.max_wqes || + cap->max_send_sge > dev->limits.max_sg || + cap->max_recv_sge > dev->limits.max_sg || + cap->max_inline_data > mthca_max_inline_data(pd, max_data_size)) + return -EINVAL; + + /* + * For MLX transport we need 2 extra send gather entries: + * one for the header and one for the checksum at the end + */ + if (qp->transport == MLX && cap->max_send_sge + 2 > dev->limits.max_sg) + return -EINVAL; + + if (mthca_is_memfree(dev)) { + qp->rq.max = cap->max_recv_wr ? + roundup_pow_of_two(cap->max_recv_wr) : 0; + qp->sq.max = cap->max_send_wr ? + roundup_pow_of_two(cap->max_send_wr) : 0; + } else { + qp->rq.max = cap->max_recv_wr; + qp->sq.max = cap->max_send_wr; + } + + qp->rq.max_gs = cap->max_recv_sge; + qp->sq.max_gs = max_t(int, cap->max_send_sge, + ALIGN(cap->max_inline_data + MTHCA_INLINE_HEADER_SIZE, + MTHCA_INLINE_CHUNK_SIZE) / + sizeof (struct mthca_data_seg)); + + return 0; +} + +int mthca_alloc_qp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_qp_type type, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + struct mthca_qp *qp) +{ + int err; + + switch (type) { + case IB_QPT_RC: qp->transport = RC; break; + case IB_QPT_UC: qp->transport = UC; break; + case IB_QPT_UD: qp->transport = UD; break; + default: return -EINVAL; + } + + err = mthca_set_qp_size(dev, cap, pd, qp); + if (err) + return err; + + qp->qpn = mthca_alloc(&dev->qp_table.alloc); + if (qp->qpn == -1) + return -ENOMEM; + + /* initialize port to zero for error-catching. 
*/ + qp->port = 0; + + err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq, + send_policy, qp); + if (err) { + mthca_free(&dev->qp_table.alloc, qp->qpn); + return err; + } + + spin_lock_irq(&dev->qp_table.lock); + mthca_array_set(&dev->qp_table.qp, + qp->qpn & (dev->limits.num_qps - 1), qp); + spin_unlock_irq(&dev->qp_table.lock); + + return 0; +} + +static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + spin_lock_irq(&send_cq->lock); + __acquire(&recv_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static void mthca_unlock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq == recv_cq) { + __release(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } +} + +int mthca_alloc_sqp(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_cq *send_cq, + struct mthca_cq *recv_cq, + enum ib_sig_type send_policy, + struct ib_qp_cap *cap, + int qpn, + int port, + struct mthca_sqp *sqp) +{ + u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1; + int err; + + sqp->qp.transport = MLX; + err = mthca_set_qp_size(dev, cap, pd, &sqp->qp); + if (err) + return err; + + sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE; + sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size, + &sqp->header_dma, GFP_KERNEL); + if (!sqp->header_buf) + return -ENOMEM; + + spin_lock_irq(&dev->qp_table.lock); + if (mthca_array_get(&dev->qp_table.qp, mqpn)) + err = -EBUSY; + else + mthca_array_set(&dev->qp_table.qp, mqpn, sqp); + spin_unlock_irq(&dev->qp_table.lock); + + if (err) + goto err_out; + + sqp->qp.port = port; + sqp->qp.qpn = mqpn; + sqp->qp.transport = MLX; + + err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq, + send_policy, &sqp->qp); + if (err) + goto err_out_free; + + atomic_inc(&pd->sqp_count); + + return 0; + + err_out_free: + /* + * Lock CQs here, so that CQ polling code can do QP lookup + * without taking a lock. + */ + mthca_lock_cqs(send_cq, recv_cq); + + spin_lock(&dev->qp_table.lock); + mthca_array_clear(&dev->qp_table.qp, mqpn); + spin_unlock(&dev->qp_table.lock); + + mthca_unlock_cqs(send_cq, recv_cq); + + err_out: + dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size, + sqp->header_buf, sqp->header_dma); + + return err; +} + +static inline int get_qp_refcount(struct mthca_dev *dev, struct mthca_qp *qp) +{ + int c; + + spin_lock_irq(&dev->qp_table.lock); + c = qp->refcount; + spin_unlock_irq(&dev->qp_table.lock); + + return c; +} + +void mthca_free_qp(struct mthca_dev *dev, + struct mthca_qp *qp) +{ + struct mthca_cq *send_cq; + struct mthca_cq *recv_cq; + + send_cq = to_mcq(qp->ibqp.send_cq); + recv_cq = to_mcq(qp->ibqp.recv_cq); + + /* + * Lock CQs here, so that CQ polling code can do QP lookup + * without taking a lock. 
+ */ + mthca_lock_cqs(send_cq, recv_cq); + + spin_lock(&dev->qp_table.lock); + mthca_array_clear(&dev->qp_table.qp, + qp->qpn & (dev->limits.num_qps - 1)); + --qp->refcount; + spin_unlock(&dev->qp_table.lock); + + mthca_unlock_cqs(send_cq, recv_cq); + + wait_event(qp->wait, !get_qp_refcount(dev, qp)); + + if (qp->state != IB_QPS_RESET) + mthca_MODIFY_QP(dev, qp->state, IB_QPS_RESET, qp->qpn, 0, + NULL, 0); + + /* + * If this is a userspace QP, the buffers, MR, CQs and so on + * will be cleaned up in userspace, so all we have to do is + * unref the mem-free tables and free the QPN in our table. + */ + if (!qp->ibqp.uobject) { + mthca_cq_clean(dev, recv_cq, qp->qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + mthca_cq_clean(dev, send_cq, qp->qpn, NULL); + + mthca_free_memfree(dev, qp); + mthca_free_wqe_buf(dev, qp); + } + + mthca_unmap_memfree(dev, qp); + + if (is_sqp(dev, qp)) { + atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count)); + dma_free_coherent(&dev->pdev->dev, + to_msqp(qp)->header_buf_size, + to_msqp(qp)->header_buf, + to_msqp(qp)->header_dma); + } else + mthca_free(&dev->qp_table.alloc, qp->qpn); +} + +/* Create UD header for an MLX send and build a data segment for it */ +static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, + int ind, struct ib_ud_wr *wr, + struct mthca_mlx_seg *mlx, + struct mthca_data_seg *data) +{ + int header_size; + int err; + u16 pkey; + + ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0, + mthca_ah_grh_present(to_mah(wr->ah)), 0, 0, 0, + &sqp->ud_header); + + err = mthca_read_ah(dev, to_mah(wr->ah), &sqp->ud_header); + if (err) + return err; + mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1); + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? MTHCA_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + mlx->vcrc = 0; + + switch (wr->wr.opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->wr.ex.imm_data; + break; + default: + return -EINVAL; + } + + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, + sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port, + wr->pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->remote_qkey & 0x80000000 ? 
+ sqp->qkey : wr->remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, + sqp->header_buf + + ind * MTHCA_UD_HEADER_SIZE); + + data->byte_count = cpu_to_be32(header_size); + data->lkey = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey); + data->addr = cpu_to_be64(sqp->header_dma + + ind * MTHCA_UD_HEADER_SIZE); + + return 0; +} + +static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq, + struct ib_cq *ib_cq) +{ + unsigned cur; + struct mthca_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max; +} + +static __always_inline void set_raddr_seg(struct mthca_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static __always_inline void set_atomic_seg(struct mthca_atomic_seg *aseg, + struct ib_atomic_wr *wr) +{ + if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = cpu_to_be64(wr->swap); + aseg->compare = cpu_to_be64(wr->compare_add); + } else { + aseg->swap_add = cpu_to_be64(wr->compare_add); + aseg->compare = 0; + } + +} + +static void set_tavor_ud_seg(struct mthca_tavor_ud_seg *useg, + struct ib_ud_wr *wr) +{ + useg->lkey = cpu_to_be32(to_mah(wr->ah)->key); + useg->av_addr = cpu_to_be64(to_mah(wr->ah)->avdma); + useg->dqpn = cpu_to_be32(wr->remote_qpn); + useg->qkey = cpu_to_be32(wr->remote_qkey); + +} + +static void set_arbel_ud_seg(struct mthca_arbel_ud_seg *useg, + struct ib_ud_wr *wr) +{ + memcpy(useg->av, to_mah(wr->ah)->av, MTHCA_AV_SIZE); + useg->dqpn = cpu_to_be32(wr->remote_qpn); + useg->qkey = cpu_to_be32(wr->remote_qkey); +} + +int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + void *wqe; + void *prev_wqe; + unsigned long flags; + int err = 0; + int nreq; + int i; + int size; + /* + * f0 and size0 are only used if nreq != 0, and they will + * always be initialized the first time through the main loop + * before nreq is incremented. So nreq cannot become non-zero + * without initializing f0 and size0, and they are in fact + * never used uninitialized. + */ + int uninitialized_var(size0); + u32 uninitialized_var(f0); + int ind; + u8 op0 = 0; + + spin_lock_irqsave(&qp->sq.lock, flags); + + /* XXX check that state is OK to post send */ + + ind = qp->sq.next_ind; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + mthca_err(dev, "SQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->sq.head, qp->sq.tail, + qp->sq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_send_wqe(qp, ind); + prev_wqe = qp->sq.last; + qp->sq.last = wqe; + + ((struct mthca_next_seg *) wqe)->nda_op = 0; + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + ((struct mthca_next_seg *) wqe)->flags = + ((wr->send_flags & IB_SEND_SIGNALED) ? + cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) | + ((wr->send_flags & IB_SEND_SOLICITED) ? 
+ cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) | + cpu_to_be32(1); + if (wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) + ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + switch (qp->transport) { + case RC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, atomic_wr(wr)->remote_addr, + atomic_wr(wr)->rkey); + wqe += sizeof (struct mthca_raddr_seg); + + set_atomic_seg(wqe, atomic_wr(wr)); + wqe += sizeof (struct mthca_atomic_seg); + size += (sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_atomic_seg)) / 16; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + case IB_WR_RDMA_READ: + set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UD: + set_tavor_ud_seg(wqe, ud_wr(wr)); + wqe += sizeof (struct mthca_tavor_ud_seg); + size += sizeof (struct mthca_tavor_ud_seg) / 16; + break; + + case MLX: + err = build_mlx_header(dev, to_msqp(qp), ind, ud_wr(wr), + wqe - sizeof (struct mthca_next_seg), + wqe); + if (err) { + *bad_wr = wr; + goto out; + } + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + break; + } + + if (wr->num_sge > qp->sq.max_gs) { + mthca_err(dev, "too many gathers\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + /* Add one more inline data segment for ICRC */ + if (qp->transport == MLX) { + ((struct mthca_data_seg *) wqe)->byte_count = + cpu_to_be32((1 << 31) | 4); + ((u32 *) wqe)[1] = 0; + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + qp->wrid[ind + qp->rq.max] = wr->wr_id; + + if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) { + mthca_err(dev, "opcode invalid\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ((struct mthca_next_seg *) prev_wqe)->nda_op = + cpu_to_be32(((ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | + mthca_opcode[wr->opcode]); + wmb(); + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32((nreq ? 0 : MTHCA_NEXT_DBD) | size | + ((wr->send_flags & IB_SEND_FENCE) ? + MTHCA_NEXT_FENCE : 0)); + + if (!nreq) { + size0 = size; + op0 = mthca_opcode[wr->opcode]; + f0 = wr->send_flags & IB_SEND_FENCE ? 
+ MTHCA_SEND_DOORBELL_FENCE : 0; + } + + ++ind; + if (unlikely(ind >= qp->sq.max)) + ind -= qp->sq.max; + } + +out: + if (likely(nreq)) { + wmb(); + + mthca_write64(((qp->sq.next_ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | f0 | op0, + (qp->qpn << 8) | size0, + dev->kar + MTHCA_SEND_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + /* + * Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order: + */ + mmiowb(); + } + + qp->sq.next_ind = ind; + qp->sq.head += nreq; + + spin_unlock_irqrestore(&qp->sq.lock, flags); + return err; +} + +int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + unsigned long flags; + int err = 0; + int nreq; + int i; + int size; + /* + * size0 is only used if nreq != 0, and it will always be + * initialized the first time through the main loop before + * nreq is incremented. So nreq cannot become non-zero + * without initializing size0, and it is in fact never used + * uninitialized. + */ + int uninitialized_var(size0); + int ind; + void *wqe; + void *prev_wqe; + + spin_lock_irqsave(&qp->rq.lock, flags); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.next_ind; + + for (nreq = 0; wr; wr = wr->next) { + if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + mthca_err(dev, "RQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->rq.head, qp->rq.tail, + qp->rq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_recv_wqe(qp, ind); + prev_wqe = qp->rq.last; + qp->rq.last = wqe; + + ((struct mthca_next_seg *) wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD); + ((struct mthca_next_seg *) wqe)->flags = 0; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + qp->wrid[ind] = wr->wr_id; + + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD | size); + + if (!nreq) + size0 = size; + + ++ind; + if (unlikely(ind >= qp->rq.max)) + ind -= qp->rq.max; + + ++nreq; + if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { + nreq = 0; + + wmb(); + + mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0, + qp->qpn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + + qp->rq.next_ind = ind; + qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB; + } + } + +out: + if (likely(nreq)) { + wmb(); + + mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0, + qp->qpn << 8 | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + qp->rq.next_ind = ind; + qp->rq.head += nreq; + + /* + * Make sure doorbells don't leak out of RQ spinlock and reach + * the HCA out of order: + */ + mmiowb(); + + spin_unlock_irqrestore(&qp->rq.lock, flags); + return err; +} + +int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + u32 dbhi; + void *wqe; + void *prev_wqe; + unsigned long flags; + int err = 0; + int nreq; + int i; + int size; + /* + * f0 and size0 are only used if nreq != 0, and they will + * always be 
initialized the first time through the main loop + * before nreq is incremented. So nreq cannot become non-zero + * without initializing f0 and size0, and they are in fact + * never used uninitialized. + */ + int uninitialized_var(size0); + u32 uninitialized_var(f0); + int ind; + u8 op0 = 0; + + spin_lock_irqsave(&qp->sq.lock, flags); + + /* XXX check that state is OK to post send */ + + ind = qp->sq.head & (qp->sq.max - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) { + nreq = 0; + + dbhi = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) | + ((qp->sq.head & 0xffff) << 8) | f0 | op0; + + qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff); + + /* + * Make sure doorbell record is written before we + * write MMIO send doorbell. + */ + wmb(); + + mthca_write64(dbhi, (qp->qpn << 8) | size0, + dev->kar + MTHCA_SEND_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + mthca_err(dev, "SQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->sq.head, qp->sq.tail, + qp->sq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_send_wqe(qp, ind); + prev_wqe = qp->sq.last; + qp->sq.last = wqe; + + ((struct mthca_next_seg *) wqe)->flags = + ((wr->send_flags & IB_SEND_SIGNALED) ? + cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) | + ((wr->send_flags & IB_SEND_SOLICITED) ? + cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) | + ((wr->send_flags & IB_SEND_IP_CSUM) ? + cpu_to_be32(MTHCA_NEXT_IP_CSUM | MTHCA_NEXT_TCP_UDP_CSUM) : 0) | + cpu_to_be32(1); + if (wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) + ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + switch (qp->transport) { + case RC: + switch (wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, atomic_wr(wr)->remote_addr, + atomic_wr(wr)->rkey); + wqe += sizeof (struct mthca_raddr_seg); + + set_atomic_seg(wqe, atomic_wr(wr)); + wqe += sizeof (struct mthca_atomic_seg); + size += (sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_atomic_seg)) / 16; + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UC: + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(wqe, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case UD: + set_arbel_ud_seg(wqe, ud_wr(wr)); + wqe += sizeof (struct mthca_arbel_ud_seg); + size += sizeof (struct mthca_arbel_ud_seg) / 16; + break; + + case MLX: + err = build_mlx_header(dev, to_msqp(qp), ind, ud_wr(wr), + wqe - sizeof (struct mthca_next_seg), + wqe); + if (err) { + *bad_wr = wr; + goto out; + } + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + break; + } + + if (wr->num_sge > qp->sq.max_gs) { + 
mthca_err(dev, "too many gathers\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + /* Add one more inline data segment for ICRC */ + if (qp->transport == MLX) { + ((struct mthca_data_seg *) wqe)->byte_count = + cpu_to_be32((1 << 31) | 4); + ((u32 *) wqe)[1] = 0; + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + qp->wrid[ind + qp->rq.max] = wr->wr_id; + + if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) { + mthca_err(dev, "opcode invalid\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ((struct mthca_next_seg *) prev_wqe)->nda_op = + cpu_to_be32(((ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | + mthca_opcode[wr->opcode]); + wmb(); + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD | size | + ((wr->send_flags & IB_SEND_FENCE) ? + MTHCA_NEXT_FENCE : 0)); + + if (!nreq) { + size0 = size; + op0 = mthca_opcode[wr->opcode]; + f0 = wr->send_flags & IB_SEND_FENCE ? + MTHCA_SEND_DOORBELL_FENCE : 0; + } + + ++ind; + if (unlikely(ind >= qp->sq.max)) + ind -= qp->sq.max; + } + +out: + if (likely(nreq)) { + dbhi = (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0; + + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff); + + /* + * Make sure doorbell record is written before we + * write MMIO send doorbell. + */ + wmb(); + + mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + /* + * Make sure doorbells don't leak out of SQ spinlock and reach + * the HCA out of order: + */ + mmiowb(); + + spin_unlock_irqrestore(&qp->sq.lock, flags); + return err; +} + +int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibqp->device); + struct mthca_qp *qp = to_mqp(ibqp); + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + void *wqe; + + spin_lock_irqsave(&qp->rq.lock, flags); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.head & (qp->rq.max - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + mthca_err(dev, "RQ %06x full (%u head, %u tail," + " %d max, %d nreq)\n", qp->qpn, + qp->rq.head, qp->rq.tail, + qp->rq.max, nreq); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe = get_recv_wqe(qp, ind); + + ((struct mthca_next_seg *) wqe)->flags = 0; + + wqe += sizeof (struct mthca_next_seg); + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < qp->rq.max_gs) + mthca_set_data_seg_inval(wqe); + + qp->wrid[ind] = wr->wr_id; + + ++ind; + if (unlikely(ind >= qp->rq.max)) + ind -= qp->rq.max; + } +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. 
+ */ + wmb(); + *qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + return err; +} + +void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send, + int index, int *dbd, __be32 *new_wqe) +{ + struct mthca_next_seg *next; + + /* + * For SRQs, all receive WQEs generate a CQE, so we're always + * at the end of the doorbell chain. + */ + if (qp->ibqp.srq && !is_send) { + *new_wqe = 0; + return; + } + + if (is_send) + next = get_send_wqe(qp, index); + else + next = get_recv_wqe(qp, index); + + *dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD)); + if (next->ee_nds & cpu_to_be32(0x3f)) + *new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) | + (next->ee_nds & cpu_to_be32(0x3f)); + else + *new_wqe = 0; +} + +int mthca_init_qp_table(struct mthca_dev *dev) +{ + int err; + int i; + + spin_lock_init(&dev->qp_table.lock); + + /* + * We reserve 2 extra QPs per port for the special QPs. The + * special QP for port 1 has to be even, so round up. + */ + dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL; + err = mthca_alloc_init(&dev->qp_table.alloc, + dev->limits.num_qps, + (1 << 24) - 1, + dev->qp_table.sqp_start + + MTHCA_MAX_PORTS * 2); + if (err) + return err; + + err = mthca_array_init(&dev->qp_table.qp, + dev->limits.num_qps); + if (err) { + mthca_alloc_cleanup(&dev->qp_table.alloc); + return err; + } + + for (i = 0; i < 2; ++i) { + err = mthca_CONF_SPECIAL_QP(dev, i ? IB_QPT_GSI : IB_QPT_SMI, + dev->qp_table.sqp_start + i * 2); + if (err) { + mthca_warn(dev, "CONF_SPECIAL_QP returned " + "%d, aborting.\n", err); + goto err_out; + } + } + return 0; + + err_out: + for (i = 0; i < 2; ++i) + mthca_CONF_SPECIAL_QP(dev, i, 0); + + mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps); + mthca_alloc_cleanup(&dev->qp_table.alloc); + + return err; +} + +void mthca_cleanup_qp_table(struct mthca_dev *dev) +{ + int i; + + for (i = 0; i < 2; ++i) + mthca_CONF_SPECIAL_QP(dev, i, 0); + + mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps); + mthca_alloc_cleanup(&dev->qp_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c new file mode 100644 index 0000000..2a6979e --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_reset.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" + +int mthca_reset(struct mthca_dev *mdev) +{ + int i; + int err = 0; + u32 *hca_header = NULL; + u32 *bridge_header = NULL; + struct pci_dev *bridge = NULL; + int bridge_pcix_cap = 0; + int hca_pcie_cap = 0; + int hca_pcix_cap = 0; + + u16 devctl; + u16 linkctl; + +#define MTHCA_RESET_OFFSET 0xf0010 +#define MTHCA_RESET_VALUE swab32(1) + + /* + * Reset the chip. This is somewhat ugly because we have to + * save off the PCI header before reset and then restore it + * after the chip reboots. We skip config space offsets 22 + * and 23 since those have a special meaning. + * + * To make matters worse, for Tavor (PCI-X HCA) we have to + * find the associated bridge device and save off its PCI + * header as well. + */ + + if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) { + /* Look for the bridge -- its device ID will be 2 more + than HCA's device ID. */ + while ((bridge = pci_get_device(mdev->pdev->vendor, + mdev->pdev->device + 2, + bridge)) != NULL) { + if (bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE && + bridge->subordinate == mdev->pdev->bus) { + mthca_dbg(mdev, "Found bridge: %s\n", + pci_name(bridge)); + break; + } + } + + if (!bridge) { + /* + * Didn't find a bridge for a Tavor device -- + * assume we're in no-bridge mode and hope for + * the best. + */ + mthca_warn(mdev, "No bridge found for %s\n", + pci_name(mdev->pdev)); + } + + } + + /* For Arbel do we need to save off the full 4K PCI Express header?? */ + hca_header = kmalloc(256, GFP_KERNEL); + if (!hca_header) { + err = -ENOMEM; + goto put_dev; + } + + for (i = 0; i < 64; ++i) { + if (i == 22 || i == 23) + continue; + if (pci_read_config_dword(mdev->pdev, i * 4, hca_header + i)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't save HCA " + "PCI header, aborting.\n"); + goto free_hca; + } + } + + hca_pcix_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX); + hca_pcie_cap = pci_pcie_cap(mdev->pdev); + + if (bridge) { + bridge_header = kmalloc(256, GFP_KERNEL); + if (!bridge_header) { + err = -ENOMEM; + goto free_hca; + } + + for (i = 0; i < 64; ++i) { + if (i == 22 || i == 23) + continue; + if (pci_read_config_dword(bridge, i * 4, bridge_header + i)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't save HCA bridge " + "PCI header, aborting.\n"); + goto free_bh; + } + } + bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX); + if (!bridge_pcix_cap) { + err = -ENODEV; + mthca_err(mdev, "Couldn't locate HCA bridge " + "PCI-X capability, aborting.\n"); + goto free_bh; + } + } + + /* actually hit reset */ + { + void __iomem *reset = ioremap(pci_resource_start(mdev->pdev, 0) + + MTHCA_RESET_OFFSET, 4); + + if (!reset) { + err = -ENOMEM; + mthca_err(mdev, "Couldn't map HCA reset register, " + "aborting.\n"); + goto free_bh; + } + + writel(MTHCA_RESET_VALUE, reset); + iounmap(reset); + } + + /* Docs say to wait one second before accessing device */ + msleep(1000); + + /* Now wait for PCI device to start responding again */ + { + u32 v; + int c = 0; + + for (c = 0; c < 100; ++c) { + if (pci_read_config_dword(bridge ? 
bridge : mdev->pdev, 0, &v)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't access HCA after reset, " + "aborting.\n"); + goto free_bh; + } + + if (v != 0xffffffff) + goto good; + + msleep(100); + } + + err = -ENODEV; + mthca_err(mdev, "PCI device did not come back after reset, " + "aborting.\n"); + goto free_bh; + } + +good: + /* Now restore the PCI headers */ + if (bridge) { + if (pci_write_config_dword(bridge, bridge_pcix_cap + 0x8, + bridge_header[(bridge_pcix_cap + 0x8) / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge Upstream " + "split transaction control, aborting.\n"); + goto free_bh; + } + if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc, + bridge_header[(bridge_pcix_cap + 0xc) / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge Downstream " + "split transaction control, aborting.\n"); + goto free_bh; + } + /* + * Bridge control register is at 0x3e, so we'll + * naturally restore it last in this loop. + */ + for (i = 0; i < 16; ++i) { + if (i * 4 == PCI_COMMAND) + continue; + + if (pci_write_config_dword(bridge, i * 4, bridge_header[i])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge reg %x, " + "aborting.\n", i); + goto free_bh; + } + } + + if (pci_write_config_dword(bridge, PCI_COMMAND, + bridge_header[PCI_COMMAND / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, " + "aborting.\n"); + goto free_bh; + } + } + + if (hca_pcix_cap) { + if (pci_write_config_dword(mdev->pdev, hca_pcix_cap, + hca_header[hca_pcix_cap / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI-X " + "command register, aborting.\n"); + goto free_bh; + } + } + + if (hca_pcie_cap) { + devctl = hca_header[(hca_pcie_cap + PCI_EXP_DEVCTL) / 4]; + if (pcie_capability_write_word(mdev->pdev, PCI_EXP_DEVCTL, + devctl)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI Express " + "Device Control register, aborting.\n"); + goto free_bh; + } + linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4]; + if (pcie_capability_write_word(mdev->pdev, PCI_EXP_LNKCTL, + linkctl)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI Express " + "Link control register, aborting.\n"); + goto free_bh; + } + } + + for (i = 0; i < 16; ++i) { + if (i * 4 == PCI_COMMAND) + continue; + + if (pci_write_config_dword(mdev->pdev, i * 4, hca_header[i])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA reg %x, " + "aborting.\n", i); + goto free_bh; + } + } + + if (pci_write_config_dword(mdev->pdev, PCI_COMMAND, + hca_header[PCI_COMMAND / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA COMMAND, " + "aborting.\n"); + } +free_bh: + kfree(bridge_header); +free_hca: + kfree(hca_header); +put_dev: + pci_dev_put(bridge); + return err; +} diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c new file mode 100644 index 0000000..d22f970 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#include "mthca_dev.h" +#include "mthca_cmd.h" +#include "mthca_memfree.h" +#include "mthca_wqe.h" + +enum { + MTHCA_MAX_DIRECT_SRQ_SIZE = 4 * PAGE_SIZE +}; + +struct mthca_tavor_srq_context { + __be64 wqe_base_ds; /* low 6 bits is descriptor size */ + __be32 state_pd; + __be32 lkey; + __be32 uar; + __be16 limit_watermark; + __be16 wqe_cnt; + u32 reserved[2]; +}; + +struct mthca_arbel_srq_context { + __be32 state_logsize_srqn; + __be32 lkey; + __be32 db_index; + __be32 logstride_usrpage; + __be64 wqe_base; + __be32 eq_pd; + __be16 limit_watermark; + __be16 wqe_cnt; + u16 reserved1; + __be16 wqe_counter; + u32 reserved2[3]; +}; + +static void *get_wqe(struct mthca_srq *srq, int n) +{ + if (srq->is_direct) + return srq->queue.direct.buf + (n << srq->wqe_shift); + else + return srq->queue.page_list[(n << srq->wqe_shift) >> PAGE_SHIFT].buf + + ((n << srq->wqe_shift) & (PAGE_SIZE - 1)); +} + +/* + * Return a pointer to the location within a WQE that we're using as a + * link when the WQE is in the free list. We use the imm field + * because in the Tavor case, posting a WQE may overwrite the next + * segment of the previous WQE, but a receive WQE will never touch the + * imm field. This avoids corrupting our free list if the previous + * WQE has already completed and been put on the free list when we + * post the next WQE. 
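+ *
+ * The stored value is simply the index of the next free WQE, with -1
+ * terminating the list.  After mthca_alloc_srq_buf() the chain looks
+ * like
+ *
+ *	0 -> 1 -> 2 -> ... -> max-1 -> (-1)
+ *
+ * Posting consumes entries from srq->first_free, and
+ * mthca_free_srq_wqe() appends freed entries at srq->last_free.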
+ */ +static inline int *wqe_to_link(void *wqe) +{ + return (int *) (wqe + offsetof(struct mthca_next_seg, imm)); +} + +static void mthca_tavor_init_srq_context(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_srq *srq, + struct mthca_tavor_srq_context *context) +{ + memset(context, 0, sizeof *context); + + context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4)); + context->state_pd = cpu_to_be32(pd->pd_num); + context->lkey = cpu_to_be32(srq->mr.ibmr.lkey); + + if (pd->ibpd.uobject) + context->uar = + cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index); + else + context->uar = cpu_to_be32(dev->driver_uar.index); +} + +static void mthca_arbel_init_srq_context(struct mthca_dev *dev, + struct mthca_pd *pd, + struct mthca_srq *srq, + struct mthca_arbel_srq_context *context) +{ + int logsize, max; + + memset(context, 0, sizeof *context); + + /* + * Put max in a temporary variable to work around gcc bug + * triggered by ilog2() on sparc64. + */ + max = srq->max; + logsize = ilog2(max); + context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn); + context->lkey = cpu_to_be32(srq->mr.ibmr.lkey); + context->db_index = cpu_to_be32(srq->db_index); + context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29); + if (pd->ibpd.uobject) + context->logstride_usrpage |= + cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index); + else + context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index); + context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num); +} + +static void mthca_free_srq_buf(struct mthca_dev *dev, struct mthca_srq *srq) +{ + mthca_buf_free(dev, srq->max << srq->wqe_shift, &srq->queue, + srq->is_direct, &srq->mr); + kfree(srq->wrid); +} + +static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd, + struct mthca_srq *srq) +{ + struct mthca_data_seg *scatter; + void *wqe; + int err; + int i; + + if (pd->ibpd.uobject) + return 0; + + srq->wrid = kmalloc(srq->max * sizeof (u64), GFP_KERNEL); + if (!srq->wrid) + return -ENOMEM; + + err = mthca_buf_alloc(dev, srq->max << srq->wqe_shift, + MTHCA_MAX_DIRECT_SRQ_SIZE, + &srq->queue, &srq->is_direct, pd, 1, &srq->mr); + if (err) { + kfree(srq->wrid); + return err; + } + + /* + * Now initialize the SRQ buffer so that all of the WQEs are + * linked into the list of free WQEs. In addition, set the + * scatter list L_Keys to the sentry value of 0x100. 
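+	 *
+	 * (0x100 is MTHCA_INVAL_LKEY from mthca_wqe.h; the posting
+	 * paths rely on the same sentinel, stamping it into the first
+	 * unused entry via mthca_set_data_seg_inval() whenever fewer
+	 * than max_gs segments are posted.)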
+ */ + for (i = 0; i < srq->max; ++i) { + struct mthca_next_seg *next; + + next = wqe = get_wqe(srq, i); + + if (i < srq->max - 1) { + *wqe_to_link(wqe) = i + 1; + next->nda_op = htonl(((i + 1) << srq->wqe_shift) | 1); + } else { + *wqe_to_link(wqe) = -1; + next->nda_op = 0; + } + + for (scatter = wqe + sizeof (struct mthca_next_seg); + (void *) scatter < wqe + (1 << srq->wqe_shift); + ++scatter) + scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); + } + + srq->last = get_wqe(srq, srq->max - 1); + + return 0; +} + +int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, + struct ib_srq_attr *attr, struct mthca_srq *srq) +{ + struct mthca_mailbox *mailbox; + int ds; + int err; + + /* Sanity check SRQ size before proceeding */ + if (attr->max_wr > dev->limits.max_srq_wqes || + attr->max_sge > dev->limits.max_srq_sge) + return -EINVAL; + + srq->max = attr->max_wr; + srq->max_gs = attr->max_sge; + srq->counter = 0; + + if (mthca_is_memfree(dev)) + srq->max = roundup_pow_of_two(srq->max + 1); + else + srq->max = srq->max + 1; + + ds = max(64UL, + roundup_pow_of_two(sizeof (struct mthca_next_seg) + + srq->max_gs * sizeof (struct mthca_data_seg))); + + if (!mthca_is_memfree(dev) && (ds > dev->limits.max_desc_sz)) + return -EINVAL; + + srq->wqe_shift = ilog2(ds); + + srq->srqn = mthca_alloc(&dev->srq_table.alloc); + if (srq->srqn == -1) + return -ENOMEM; + + if (mthca_is_memfree(dev)) { + err = mthca_table_get(dev, dev->srq_table.table, srq->srqn); + if (err) + goto err_out; + + if (!pd->ibpd.uobject) { + srq->db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SRQ, + srq->srqn, &srq->db); + if (srq->db_index < 0) { + err = -ENOMEM; + goto err_out_icm; + } + } + } + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_out_db; + } + + err = mthca_alloc_srq_buf(dev, pd, srq); + if (err) + goto err_out_mailbox; + + spin_lock_init(&srq->lock); + srq->refcount = 1; + init_waitqueue_head(&srq->wait); + mutex_init(&srq->mutex); + + if (mthca_is_memfree(dev)) + mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf); + else + mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf); + + err = mthca_SW2HW_SRQ(dev, mailbox, srq->srqn); + + if (err) { + mthca_warn(dev, "SW2HW_SRQ failed (%d)\n", err); + goto err_out_free_buf; + } + + spin_lock_irq(&dev->srq_table.lock); + if (mthca_array_set(&dev->srq_table.srq, + srq->srqn & (dev->limits.num_srqs - 1), + srq)) { + spin_unlock_irq(&dev->srq_table.lock); + goto err_out_free_srq; + } + spin_unlock_irq(&dev->srq_table.lock); + + mthca_free_mailbox(dev, mailbox); + + srq->first_free = 0; + srq->last_free = srq->max - 1; + + attr->max_wr = srq->max - 1; + attr->max_sge = srq->max_gs; + + return 0; + +err_out_free_srq: + err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn); + if (err) + mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err); + +err_out_free_buf: + if (!pd->ibpd.uobject) + mthca_free_srq_buf(dev, srq); + +err_out_mailbox: + mthca_free_mailbox(dev, mailbox); + +err_out_db: + if (!pd->ibpd.uobject && mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index); + +err_out_icm: + mthca_table_put(dev, dev->srq_table.table, srq->srqn); + +err_out: + mthca_free(&dev->srq_table.alloc, srq->srqn); + + return err; +} + +static inline int get_srq_refcount(struct mthca_dev *dev, struct mthca_srq *srq) +{ + int c; + + spin_lock_irq(&dev->srq_table.lock); + c = srq->refcount; + spin_unlock_irq(&dev->srq_table.lock); + + return c; +} + +void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq) 
+{ + struct mthca_mailbox *mailbox; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) { + mthca_warn(dev, "No memory for mailbox to free SRQ.\n"); + return; + } + + err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn); + if (err) + mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err); + + spin_lock_irq(&dev->srq_table.lock); + mthca_array_clear(&dev->srq_table.srq, + srq->srqn & (dev->limits.num_srqs - 1)); + --srq->refcount; + spin_unlock_irq(&dev->srq_table.lock); + + wait_event(srq->wait, !get_srq_refcount(dev, srq)); + + if (!srq->ibsrq.uobject) { + mthca_free_srq_buf(dev, srq); + if (mthca_is_memfree(dev)) + mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index); + } + + mthca_table_put(dev, dev->srq_table.table, srq->srqn); + mthca_free(&dev->srq_table.alloc, srq->srqn); + mthca_free_mailbox(dev, mailbox); +} + +int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + int ret = 0; + + /* We don't support resizing SRQs (yet?) */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + u32 max_wr = mthca_is_memfree(dev) ? srq->max - 1 : srq->max; + if (attr->srq_limit > max_wr) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mthca_ARM_SRQ(dev, srq->srqn, attr->srq_limit); + mutex_unlock(&srq->mutex); + } + + return ret; +} + +int mthca_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + struct mthca_mailbox *mailbox; + struct mthca_arbel_srq_context *arbel_ctx; + struct mthca_tavor_srq_context *tavor_ctx; + int err; + + mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mthca_QUERY_SRQ(dev, srq->srqn, mailbox); + if (err) + goto out; + + if (mthca_is_memfree(dev)) { + arbel_ctx = mailbox->buf; + srq_attr->srq_limit = be16_to_cpu(arbel_ctx->limit_watermark); + } else { + tavor_ctx = mailbox->buf; + srq_attr->srq_limit = be16_to_cpu(tavor_ctx->limit_watermark); + } + + srq_attr->max_wr = srq->max - 1; + srq_attr->max_sge = srq->max_gs; + +out: + mthca_free_mailbox(dev, mailbox); + + return err; +} + +void mthca_srq_event(struct mthca_dev *dev, u32 srqn, + enum ib_event_type event_type) +{ + struct mthca_srq *srq; + struct ib_event event; + + spin_lock(&dev->srq_table.lock); + srq = mthca_array_get(&dev->srq_table.srq, srqn & (dev->limits.num_srqs - 1)); + if (srq) + ++srq->refcount; + spin_unlock(&dev->srq_table.lock); + + if (!srq) { + mthca_warn(dev, "Async event for bogus SRQ %08x\n", srqn); + return; + } + + if (!srq->ibsrq.event_handler) + goto out; + + event.device = &dev->ib_dev; + event.event = event_type; + event.element.srq = &srq->ibsrq; + srq->ibsrq.event_handler(&event, srq->ibsrq.srq_context); + +out: + spin_lock(&dev->srq_table.lock); + if (!--srq->refcount) + wake_up(&srq->wait); + spin_unlock(&dev->srq_table.lock); +} + +/* + * This function must be called with IRQs disabled. 
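+ * It takes srq->lock with a plain spin_lock() while the posting
+ * paths below take the same lock with spin_lock_irqsave(), so
+ * calling it with interrupts enabled could deadlock against an
+ * interrupt on the same CPU that ends up taking srq->lock.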
+ */ +void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr) +{ + int ind; + struct mthca_next_seg *last_free; + + ind = wqe_addr >> srq->wqe_shift; + + spin_lock(&srq->lock); + + last_free = get_wqe(srq, srq->last_free); + *wqe_to_link(last_free) = ind; + last_free->nda_op = htonl((ind << srq->wqe_shift) | 1); + *wqe_to_link(get_wqe(srq, ind)) = -1; + srq->last_free = ind; + + spin_unlock(&srq->lock); +} + +int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + unsigned long flags; + int err = 0; + int first_ind; + int ind; + int next_ind; + int nreq; + int i; + void *wqe; + void *prev_wqe; + + spin_lock_irqsave(&srq->lock, flags); + + first_ind = srq->first_free; + + for (nreq = 0; wr; wr = wr->next) { + ind = srq->first_free; + wqe = get_wqe(srq, ind); + next_ind = *wqe_to_link(wqe); + + if (unlikely(next_ind < 0)) { + mthca_err(dev, "SRQ %06x full\n", srq->srqn); + err = -ENOMEM; + *bad_wr = wr; + break; + } + + prev_wqe = srq->last; + srq->last = wqe; + + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + /* flags field will always remain 0 */ + + wqe += sizeof (struct mthca_next_seg); + + if (unlikely(wr->num_sge > srq->max_gs)) { + err = -EINVAL; + *bad_wr = wr; + srq->last = prev_wqe; + break; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < srq->max_gs) + mthca_set_data_seg_inval(wqe); + + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + cpu_to_be32(MTHCA_NEXT_DBD); + + srq->wrid[ind] = wr->wr_id; + srq->first_free = next_ind; + + ++nreq; + if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { + nreq = 0; + + /* + * Make sure that descriptors are written + * before doorbell is rung. + */ + wmb(); + + mthca_write64(first_ind << srq->wqe_shift, srq->srqn << 8, + dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + + first_ind = srq->first_free; + } + } + + if (likely(nreq)) { + /* + * Make sure that descriptors are written before + * doorbell is rung. 
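+		 *
+		 * (The first doorbell word below is the byte offset of
+		 * the first newly linked WQE; the second carries the
+		 * SRQ number in the upper bits and the count of new
+		 * WQEs in the low byte.)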
+ */ + wmb(); + + mthca_write64(first_ind << srq->wqe_shift, (srq->srqn << 8) | nreq, + dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + } + + /* + * Make sure doorbells don't leak out of SRQ spinlock and + * reach the HCA out of order: + */ + mmiowb(); + + spin_unlock_irqrestore(&srq->lock, flags); + return err; +} + +int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mthca_dev *dev = to_mdev(ibsrq->device); + struct mthca_srq *srq = to_msrq(ibsrq); + unsigned long flags; + int err = 0; + int ind; + int next_ind; + int nreq; + int i; + void *wqe; + + spin_lock_irqsave(&srq->lock, flags); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + ind = srq->first_free; + wqe = get_wqe(srq, ind); + next_ind = *wqe_to_link(wqe); + + if (unlikely(next_ind < 0)) { + mthca_err(dev, "SRQ %06x full\n", srq->srqn); + err = -ENOMEM; + *bad_wr = wr; + break; + } + + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + /* flags field will always remain 0 */ + + wqe += sizeof (struct mthca_next_seg); + + if (unlikely(wr->num_sge > srq->max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + for (i = 0; i < wr->num_sge; ++i) { + mthca_set_data_seg(wqe, wr->sg_list + i); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < srq->max_gs) + mthca_set_data_seg_inval(wqe); + + srq->wrid[ind] = wr->wr_id; + srq->first_free = next_ind; + } + + if (likely(nreq)) { + srq->counter += nreq; + + /* + * Make sure that descriptors are written before + * we write doorbell record. + */ + wmb(); + *srq->db = cpu_to_be32(srq->counter); + } + + spin_unlock_irqrestore(&srq->lock, flags); + return err; +} + +int mthca_max_srq_sge(struct mthca_dev *dev) +{ + if (mthca_is_memfree(dev)) + return dev->limits.max_sg; + + /* + * SRQ allocations are based on powers of 2 for Tavor, + * (although they only need to be multiples of 16 bytes). + * + * Therefore, we need to base the max number of sg entries on + * the largest power of 2 descriptor size that is <= to the + * actual max WQE descriptor size, rather than return the + * max_sg value given by the firmware (which is based on WQE + * sizes as multiples of 16, not powers of 2). + * + * If SRQ implementation is changed for Tavor to be based on + * multiples of 16, the calculation below can be deleted and + * the FW max_sg value returned. + */ + return min_t(int, dev->limits.max_sg, + ((1 << (fls(dev->limits.max_desc_sz) - 1)) - + sizeof (struct mthca_next_seg)) / + sizeof (struct mthca_data_seg)); +} + +int mthca_init_srq_table(struct mthca_dev *dev) +{ + int err; + + if (!(dev->mthca_flags & MTHCA_FLAG_SRQ)) + return 0; + + spin_lock_init(&dev->srq_table.lock); + + err = mthca_alloc_init(&dev->srq_table.alloc, + dev->limits.num_srqs, + dev->limits.num_srqs - 1, + dev->limits.reserved_srqs); + if (err) + return err; + + err = mthca_array_init(&dev->srq_table.srq, + dev->limits.num_srqs); + if (err) + mthca_alloc_cleanup(&dev->srq_table.alloc); + + return err; +} + +void mthca_cleanup_srq_table(struct mthca_dev *dev) +{ + if (!(dev->mthca_flags & MTHCA_FLAG_SRQ)) + return; + + mthca_array_cleanup(&dev->srq_table.srq, dev->limits.num_srqs); + mthca_alloc_cleanup(&dev->srq_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_uar.c b/drivers/infiniband/hw/mthca/mthca_uar.c new file mode 100644 index 0000000..ca5900c --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_uar.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include /* PAGE_SHIFT */ + +#include "mthca_dev.h" +#include "mthca_memfree.h" + +int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar) +{ + uar->index = mthca_alloc(&dev->uar_table.alloc); + if (uar->index == -1) + return -ENOMEM; + + uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index; + + return 0; +} + +void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar) +{ + mthca_free(&dev->uar_table.alloc, uar->index); +} + +int mthca_init_uar_table(struct mthca_dev *dev) +{ + int ret; + + ret = mthca_alloc_init(&dev->uar_table.alloc, + dev->limits.num_uars, + dev->limits.num_uars - 1, + dev->limits.reserved_uars + 1); + if (ret) + return ret; + + ret = mthca_init_db_tab(dev); + if (ret) + mthca_alloc_cleanup(&dev->uar_table.alloc); + + return ret; +} + +void mthca_cleanup_uar_table(struct mthca_dev *dev) +{ + mthca_cleanup_db_tab(dev); + + /* XXX check if any UARs are still allocated? */ + mthca_alloc_cleanup(&dev->uar_table.alloc); +} diff --git a/drivers/infiniband/hw/mthca/mthca_wqe.h b/drivers/infiniband/hw/mthca/mthca_wqe.h new file mode 100644 index 0000000..341a5ae --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_wqe.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_WQE_H +#define MTHCA_WQE_H + +#include + +enum { + MTHCA_NEXT_DBD = 1 << 7, + MTHCA_NEXT_FENCE = 1 << 6, + MTHCA_NEXT_CQ_UPDATE = 1 << 3, + MTHCA_NEXT_EVENT_GEN = 1 << 2, + MTHCA_NEXT_SOLICIT = 1 << 1, + MTHCA_NEXT_IP_CSUM = 1 << 4, + MTHCA_NEXT_TCP_UDP_CSUM = 1 << 5, + + MTHCA_MLX_VL15 = 1 << 17, + MTHCA_MLX_SLR = 1 << 16 +}; + +enum { + MTHCA_INVAL_LKEY = 0x100, + MTHCA_TAVOR_MAX_WQES_PER_RECV_DB = 256, + MTHCA_ARBEL_MAX_WQES_PER_SEND_DB = 255 +}; + +struct mthca_next_seg { + __be32 nda_op; /* [31:6] next WQE [4:0] next opcode */ + __be32 ee_nds; /* [31:8] next EE [7] DBD [6] F [5:0] next WQE size */ + __be32 flags; /* [3] CQ [2] Event [1] Solicit */ + __be32 imm; /* immediate data */ +}; + +struct mthca_tavor_ud_seg { + u32 reserved1; + __be32 lkey; + __be64 av_addr; + u32 reserved2[4]; + __be32 dqpn; + __be32 qkey; + u32 reserved3[2]; +}; + +struct mthca_arbel_ud_seg { + __be32 av[8]; + __be32 dqpn; + __be32 qkey; + u32 reserved[2]; +}; + +struct mthca_bind_seg { + __be32 flags; /* [31] Atomic [30] rem write [29] rem read */ + u32 reserved; + __be32 new_rkey; + __be32 lkey; + __be64 addr; + __be64 length; +}; + +struct mthca_raddr_seg { + __be64 raddr; + __be32 rkey; + u32 reserved; +}; + +struct mthca_atomic_seg { + __be64 swap_add; + __be64 compare; +}; + +struct mthca_data_seg { + __be32 byte_count; + __be32 lkey; + __be64 addr; +}; + +struct mthca_mlx_seg { + __be32 nda_op; + __be32 nds; + __be32 flags; /* [17] VL15 [16] SLR [14:12] static rate + [11:8] SL [3] C [2] E */ + __be16 rlid; + __be16 vcrc; +}; + +static __always_inline void mthca_set_data_seg(struct mthca_data_seg *dseg, + struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static __always_inline void mthca_set_data_seg_inval(struct mthca_data_seg *dseg) +{ + dseg->byte_count = 0; + dseg->lkey = cpu_to_be32(MTHCA_INVAL_LKEY); + dseg->addr = 0; +} + +#endif /* MTHCA_WQE_H */ diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig new file mode 100644 index 0000000..7964eba --- /dev/null +++ b/drivers/infiniband/hw/nes/Kconfig @@ -0,0 +1,15 @@ +config INFINIBAND_NES + tristate "NetEffect RNIC Driver" + depends on PCI && INET && INFINIBAND + select LIBCRC32C + ---help--- + This is the RDMA Network Interface Card (RNIC) driver for + NetEffect Ethernet Cluster Server Adapters. + +config INFINIBAND_NES_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_NES + default n + ---help--- + This option enables debug messages from the NetEffect RNIC + driver. Select this if you are diagnosing a problem. 
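+
+# Note: even with INFINIBAND_NES_DEBUG enabled, the amount of output
+# is gated at runtime by the iw_nes "debug_level" module parameter
+# (a mask of the NES_DBG_* areas used in nes.c), for example:
+#	modprobe iw_nes debug_level=<mask>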
diff --git a/drivers/infiniband/hw/nes/Makefile b/drivers/infiniband/hw/nes/Makefile new file mode 100644 index 0000000..97820c2 --- /dev/null +++ b/drivers/infiniband/hw/nes/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o + +iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o nes_mgt.o diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c new file mode 100644 index 0000000..42b68aa --- /dev/null +++ b/drivers/infiniband/hw/nes/nes.c @@ -0,0 +1,1208 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nes.h" + +#include +#include +#include +#include + +MODULE_AUTHOR("NetEffect"); +MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +int interrupt_mod_interval = 0; + +/* Interoperability */ +int mpa_version = 1; +module_param(mpa_version, int, 0644); +MODULE_PARM_DESC(mpa_version, "MPA version to be used int MPA Req/Resp (0 or 1)"); + +/* Interoperability */ +int disable_mpa_crc = 0; +module_param(disable_mpa_crc, int, 0644); +MODULE_PARM_DESC(disable_mpa_crc, "Disable checking of MPA CRC"); + +unsigned int nes_drv_opt = NES_DRV_OPT_DISABLE_INT_MOD | NES_DRV_OPT_ENABLE_PAU; +module_param(nes_drv_opt, int, 0644); +MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters"); + +unsigned int nes_debug_level = 0; +module_param_named(debug_level, nes_debug_level, uint, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug output level"); + +unsigned int wqm_quanta = 0x10000; +module_param(wqm_quanta, int, 0644); +MODULE_PARM_DESC(wqm_quanta, "WQM quanta"); + +static bool limit_maxrdreqsz; +module_param(limit_maxrdreqsz, bool, 0644); +MODULE_PARM_DESC(limit_maxrdreqsz, "Limit max read request size to 256 Bytes"); + +LIST_HEAD(nes_adapter_list); +static LIST_HEAD(nes_dev_list); + +atomic_t qps_destroyed; + +static unsigned int ee_flsh_adapter; +static unsigned int sysfs_nonidx_addr; +static unsigned int sysfs_idx_addr; + +static const struct pci_device_id nes_pci_table[] = { + { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020), }, + { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR), }, + {0} +}; + +MODULE_DEVICE_TABLE(pci, nes_pci_table); + +static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *); +static int nes_net_event(struct notifier_block *, unsigned long, void *); +static int nes_notifiers_registered; + + +static struct notifier_block nes_inetaddr_notifier = { + .notifier_call = nes_inetaddr_event +}; + +static struct notifier_block nes_net_notifier = { + .notifier_call = nes_net_event +}; + +/** + * nes_inetaddr_event + */ +static int nes_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct net_device *event_netdev = ifa->ifa_dev->dev; + struct nes_device *nesdev; + struct net_device *netdev; + struct net_device *upper_dev; + struct nes_vnic *nesvnic; + unsigned int is_bonded; + + nes_debug(NES_DBG_NETDEV, "nes_inetaddr_event: ip address %pI4, netmask %pI4.\n", + &ifa->ifa_address, &ifa->ifa_mask); + list_for_each_entry(nesdev, &nes_dev_list, list) { + nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p. 
(%s)\n", + nesdev, nesdev->netdev[0]->name); + netdev = nesdev->netdev[0]; + nesvnic = netdev_priv(netdev); + upper_dev = netdev_master_upper_dev_get(netdev); + is_bonded = netif_is_bond_slave(netdev) && + (upper_dev == event_netdev); + if ((netdev == event_netdev) || is_bonded) { + if (nesvnic->rdma_enabled == 0) { + nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since" + " RDMA is not enabled.\n", + netdev->name); + return NOTIFY_OK; + } + /* we have ifa->ifa_address/mask here if we need it */ + switch (event) { + case NETDEV_DOWN: + nes_debug(NES_DBG_NETDEV, "event:DOWN\n"); + nes_write_indexed(nesdev, + NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), 0); + + nes_manage_arp_cache(netdev, netdev->dev_addr, + ntohl(nesvnic->local_ipaddr), NES_ARP_DELETE); + nesvnic->local_ipaddr = 0; + if (is_bonded) + continue; + else + return NOTIFY_OK; + break; + case NETDEV_UP: + nes_debug(NES_DBG_NETDEV, "event:UP\n"); + + if (nesvnic->local_ipaddr != 0) { + nes_debug(NES_DBG_NETDEV, "Interface already has local_ipaddr\n"); + return NOTIFY_OK; + } + /* fall through */ + case NETDEV_CHANGEADDR: + /* Add the address to the IP table */ + if (upper_dev) { + struct in_device *in; + + rcu_read_lock(); + in = __in_dev_get_rcu(upper_dev); + nesvnic->local_ipaddr = in->ifa_list->ifa_address; + rcu_read_unlock(); + } else { + nesvnic->local_ipaddr = ifa->ifa_address; + } + + nes_write_indexed(nesdev, + NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), + ntohl(nesvnic->local_ipaddr)); + nes_manage_arp_cache(netdev, netdev->dev_addr, + ntohl(nesvnic->local_ipaddr), NES_ARP_ADD); + if (is_bonded) + continue; + else + return NOTIFY_OK; + break; + default: + break; + } + } + } + + return NOTIFY_DONE; +} + + +/** + * nes_net_event + */ +static int nes_net_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct neighbour *neigh = ptr; + struct nes_device *nesdev; + struct net_device *netdev; + struct nes_vnic *nesvnic; + + switch (event) { + case NETEVENT_NEIGH_UPDATE: + list_for_each_entry(nesdev, &nes_dev_list, list) { + /* nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p.\n", nesdev); */ + netdev = nesdev->netdev[0]; + nesvnic = netdev_priv(netdev); + if (netdev == neigh->dev) { + if (nesvnic->rdma_enabled == 0) { + nes_debug(NES_DBG_NETDEV, "Skipping device %s since no RDMA\n", + netdev->name); + } else { + if (neigh->nud_state & NUD_VALID) { + nes_manage_arp_cache(neigh->dev, neigh->ha, + ntohl(*(__be32 *)neigh->primary_key), NES_ARP_ADD); + } else { + nes_manage_arp_cache(neigh->dev, neigh->ha, + ntohl(*(__be32 *)neigh->primary_key), NES_ARP_DELETE); + } + } + return NOTIFY_OK; + } + } + break; + default: + nes_debug(NES_DBG_NETDEV, "NETEVENT_ %lu undefined\n", event); + break; + } + + return NOTIFY_DONE; +} + + +/** + * nes_add_ref + */ +void nes_add_ref(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp; + + nesqp = to_nesqp(ibqp); + nes_debug(NES_DBG_QP, "Bumping refcount for QP%u. 
Pre-inc value = %u\n", + ibqp->qp_num, atomic_read(&nesqp->refcount)); + atomic_inc(&nesqp->refcount); +} + +static void nes_cqp_rem_ref_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + unsigned long flags; + struct nes_qp *nesqp = cqp_request->cqp_callback_pointer; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + atomic_inc(&qps_destroyed); + + /* Free the control structures */ + + if (nesqp->pbl_vbase) { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase); + nesqp->pbl_vbase = NULL; + + } else { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase); + } + nes_free_resource(nesadapter, nesadapter->allocated_qps, nesqp->hwqp.qp_id); + + nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = NULL; + kfree(nesqp->allocated_buffer); + +} + +/** + * nes_rem_ref + */ +void nes_rem_ref(struct ib_qp *ibqp) +{ + u64 u64temp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + u32 opcode; + + nesqp = to_nesqp(ibqp); + + if (atomic_read(&nesqp->refcount) == 0) { + printk(KERN_INFO PFX "%s: Reference count already 0 for QP%d, last aeq = 0x%04X.\n", + __func__, ibqp->qp_num, nesqp->last_aeq); + BUG(); + } + + if (atomic_dec_and_test(&nesqp->refcount)) { + if (nesqp->pau_mode) + nes_destroy_pau_qp(nesdev, nesqp); + + /* Destroy the QP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return; + } + cqp_request->waiting = 0; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_cqp_rem_ref_callback; + cqp_request->cqp_callback_pointer = nesqp; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + opcode = NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_IWARP; + + if (nesqp->hte_added) { + opcode |= NES_CQP_QP_DEL_HTE; + nesqp->hte_added = 0; + } + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + nes_post_cqp_request(nesdev, cqp_request); + } +} + + +/** + * nes_get_qp + */ +struct ib_qp *nes_get_qp(struct ib_device *device, int qpn) +{ + struct nes_vnic *nesvnic = to_nesvnic(device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((qpn < NES_FIRST_QPN) || (qpn >= (NES_FIRST_QPN + nesadapter->max_qp))) + return NULL; + + return &nesadapter->qp_table[qpn - NES_FIRST_QPN]->ibqp; +} + + +/** + * nes_print_macaddr + */ +static void nes_print_macaddr(struct net_device *netdev) +{ + nes_debug(NES_DBG_INIT, "%s: %pM, IRQ %u\n", + netdev->name, netdev->dev_addr, netdev->irq); +} + +/** + * nes_interrupt - handle interrupts + */ +static irqreturn_t nes_interrupt(int irq, void *dev_id) +{ + struct nes_device *nesdev = (struct nes_device *)dev_id; + int handled = 0; + u32 int_mask; + u32 int_req; + u32 int_stat; + u32 intf_int_stat; + u32 timer_stat; + + if (nesdev->msi_enabled) { + /* No need to read the interrupt 
pending register if msi is enabled */ + handled = 1; + } else { + if (unlikely(nesdev->nesadapter->hw_rev == NE020_REV)) { + /* Master interrupt enable provides synchronization for kicking off bottom half + when interrupt sharing is going on */ + int_mask = nes_read32(nesdev->regs + NES_INT_MASK); + if (int_mask & 0x80000000) { + /* Check interrupt status to see if this might be ours */ + int_stat = nes_read32(nesdev->regs + NES_INT_STAT); + int_req = nesdev->int_req; + if (int_stat&int_req) { + /* if interesting CEQ or AEQ is pending, claim the interrupt */ + if ((int_stat&int_req) & (~(NES_INT_TIMER|NES_INT_INTF))) { + handled = 1; + } else { + if (((int_stat & int_req) & NES_INT_TIMER) == NES_INT_TIMER) { + /* Timer might be running but might be for another function */ + timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT); + if ((timer_stat & nesdev->timer_int_req) != 0) { + handled = 1; + } + } + if ((((int_stat & int_req) & NES_INT_INTF) == NES_INT_INTF) && + (handled == 0)) { + intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT); + if ((intf_int_stat & nesdev->intf_int_req) != 0) { + handled = 1; + } + } + } + if (handled) { + nes_write32(nesdev->regs+NES_INT_MASK, int_mask & (~0x80000000)); + int_mask = nes_read32(nesdev->regs+NES_INT_MASK); + /* Save off the status to save an additional read */ + nesdev->int_stat = int_stat; + nesdev->napi_isr_ran = 1; + } + } + } + } else { + handled = nes_read32(nesdev->regs+NES_INT_PENDING); + } + } + + if (handled) { + + if (nes_napi_isr(nesdev) == 0) { + tasklet_schedule(&nesdev->dpc_tasklet); + + } + return IRQ_HANDLED; + } else { + return IRQ_NONE; + } +} + + +/** + * nes_probe - Device initialization + */ +static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) +{ + struct net_device *netdev = NULL; + struct nes_device *nesdev = NULL; + int ret = 0; + void __iomem *mmio_regs = NULL; + u8 hw_rev; + + assert(pcidev != NULL); + assert(ent != NULL); + + printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n", + DRV_VERSION, pci_name(pcidev)); + + ret = pci_enable_device(pcidev); + if (ret) { + printk(KERN_ERR PFX "Unable to enable PCI device. (%s)\n", pci_name(pcidev)); + goto bail0; + } + + nes_debug(NES_DBG_INIT, "BAR0 (@0x%08lX) size = 0x%lX bytes\n", + (long unsigned int)pci_resource_start(pcidev, BAR_0), + (long unsigned int)pci_resource_len(pcidev, BAR_0)); + nes_debug(NES_DBG_INIT, "BAR1 (@0x%08lX) size = 0x%lX bytes\n", + (long unsigned int)pci_resource_start(pcidev, BAR_1), + (long unsigned int)pci_resource_len(pcidev, BAR_1)); + + /* Make sure PCI base addr are MMIO */ + if (!(pci_resource_flags(pcidev, BAR_0) & IORESOURCE_MEM) || + !(pci_resource_flags(pcidev, BAR_1) & IORESOURCE_MEM)) { + printk(KERN_ERR PFX "PCI regions not an MMIO resource\n"); + ret = -ENODEV; + goto bail1; + } + + /* Reserve PCI I/O and memory resources */ + ret = pci_request_regions(pcidev, DRV_NAME); + if (ret) { + printk(KERN_ERR PFX "Unable to request regions. 
(%s)\n", pci_name(pcidev)); + goto bail1; + } + + if ((sizeof(dma_addr_t) > 4)) { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64)); + if (ret < 0) { + printk(KERN_ERR PFX "64b DMA mask configuration failed\n"); + goto bail2; + } + ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64)); + if (ret) { + printk(KERN_ERR PFX "64b DMA consistent mask configuration failed\n"); + goto bail2; + } + } else { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32)); + if (ret < 0) { + printk(KERN_ERR PFX "32b DMA mask configuration failed\n"); + goto bail2; + } + ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32)); + if (ret) { + printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n"); + goto bail2; + } + } + + pci_set_master(pcidev); + + /* Allocate hardware structure */ + nesdev = kzalloc(sizeof(struct nes_device), GFP_KERNEL); + if (!nesdev) { + ret = -ENOMEM; + goto bail2; + } + + nes_debug(NES_DBG_INIT, "Allocated nes device at %p\n", nesdev); + nesdev->pcidev = pcidev; + pci_set_drvdata(pcidev, nesdev); + + pci_read_config_byte(pcidev, 0x0008, &hw_rev); + nes_debug(NES_DBG_INIT, "hw_rev=%u\n", hw_rev); + + spin_lock_init(&nesdev->indexed_regs_lock); + + /* Remap the PCI registers in adapter BAR0 to kernel VA space */ + mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0), + pci_resource_len(pcidev, BAR_0)); + if (mmio_regs == NULL) { + printk(KERN_ERR PFX "Unable to remap BAR0\n"); + ret = -EIO; + goto bail3; + } + nesdev->regs = mmio_regs; + nesdev->index_reg = 0x50 + (PCI_FUNC(pcidev->devfn)*8) + mmio_regs; + + /* Ensure interrupts are disabled */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff); + + if (nes_drv_opt & NES_DRV_OPT_ENABLE_MSI) { + if (!pci_enable_msi(nesdev->pcidev)) { + nesdev->msi_enabled = 1; + nes_debug(NES_DBG_INIT, "MSI is enabled for device %s\n", + pci_name(pcidev)); + } else { + nes_debug(NES_DBG_INIT, "MSI is disabled by linux for device %s\n", + pci_name(pcidev)); + } + } else { + nes_debug(NES_DBG_INIT, "MSI not requested due to driver options for device %s\n", + pci_name(pcidev)); + } + + nesdev->csr_start = pci_resource_start(nesdev->pcidev, BAR_0); + nesdev->doorbell_region = pci_resource_start(nesdev->pcidev, BAR_1); + + /* Init the adapter */ + nesdev->nesadapter = nes_init_adapter(nesdev, hw_rev); + if (!nesdev->nesadapter) { + printk(KERN_ERR PFX "Unable to initialize adapter.\n"); + ret = -ENOMEM; + goto bail5; + } + nesdev->nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval; + nesdev->nesadapter->wqm_quanta = wqm_quanta; + + /* nesdev->base_doorbell_index = + nesdev->nesadapter->pd_config_base[PCI_FUNC(nesdev->pcidev->devfn)]; */ + nesdev->base_doorbell_index = 1; + nesdev->doorbell_start = nesdev->nesadapter->doorbell_start; + if (nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { + switch (PCI_FUNC(nesdev->pcidev->devfn) % + nesdev->nesadapter->port_count) { + case 1: + nesdev->mac_index = 2; + break; + case 2: + nesdev->mac_index = 1; + break; + case 3: + nesdev->mac_index = 3; + break; + case 0: + default: + nesdev->mac_index = 0; + } + } else { + nesdev->mac_index = PCI_FUNC(nesdev->pcidev->devfn) % + nesdev->nesadapter->port_count; + } + + if ((limit_maxrdreqsz || + ((nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_GLADIUS) && + (hw_rev == NE020_REV1))) && + (pcie_get_readrq(pcidev) > 256)) { + if (pcie_set_readrq(pcidev, 256)) + printk(KERN_ERR PFX "Unable to set max read request" + " to 256 bytes\n"); + else + nes_debug(NES_DBG_INIT, "Max read request size set" + " to 256 bytes\n"); + } + + 
tasklet_init(&nesdev->dpc_tasklet, nes_dpc, (unsigned long)nesdev); + + /* bring up the Control QP */ + if (nes_init_cqp(nesdev)) { + ret = -ENODEV; + goto bail6; + } + + /* Arm the CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + PCI_FUNC(nesdev->pcidev->devfn)); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + + /* Enable the interrupts */ + nesdev->int_req = (0x101 << PCI_FUNC(nesdev->pcidev->devfn)) | + (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16)); + if (PCI_FUNC(nesdev->pcidev->devfn) < 4) { + nesdev->int_req |= (1 << (PCI_FUNC(nesdev->mac_index)+24)); + } + + /* TODO: This really should be the first driver to load, not function 0 */ + if (PCI_FUNC(nesdev->pcidev->devfn) == 0) { + /* pick up PCI and critical errors if the first driver to load */ + nesdev->intf_int_req = NES_INTF_INT_PCIERR | NES_INTF_INT_CRITERR; + nesdev->int_req |= NES_INT_INTF; + } else { + nesdev->intf_int_req = 0; + } + nesdev->intf_int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16)); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, 0); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 0); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS2, 0x00001265); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS4, 0x18021804); + + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS3, 0x17801790); + + /* deal with both periodic and one_shot */ + nesdev->timer_int_req = 0x101 << PCI_FUNC(nesdev->pcidev->devfn); + nesdev->nesadapter->timer_int_req |= nesdev->timer_int_req; + nes_debug(NES_DBG_INIT, "setting int_req for function %u, nesdev = 0x%04X, adapter = 0x%04X\n", + PCI_FUNC(nesdev->pcidev->devfn), + nesdev->timer_int_req, nesdev->nesadapter->timer_int_req); + + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + + list_add_tail(&nesdev->list, &nes_dev_list); + + /* Request an interrupt line for the driver */ + ret = request_irq(pcidev->irq, nes_interrupt, IRQF_SHARED, DRV_NAME, nesdev); + if (ret) { + printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n", + pci_name(pcidev), pcidev->irq); + goto bail65; + } + + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + + if (nes_notifiers_registered == 0) { + register_inetaddr_notifier(&nes_inetaddr_notifier); + register_netevent_notifier(&nes_net_notifier); + } + nes_notifiers_registered++; + + INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status); + + /* Initialize network devices */ + netdev = nes_netdev_init(nesdev, mmio_regs); + if (netdev == NULL) { + ret = -ENOMEM; + goto bail7; + } + + /* Register network device */ + ret = register_netdev(netdev); + if (ret) { + printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", ret); + nes_netdev_destroy(netdev); + goto bail7; + } + + nes_print_macaddr(netdev); + + nesdev->netdev_count++; + nesdev->nesadapter->netdev_count++; + + printk(KERN_INFO PFX "%s: NetEffect RNIC driver successfully loaded.\n", + pci_name(pcidev)); + return 0; + + bail7: + printk(KERN_ERR PFX "bail7\n"); + while (nesdev->netdev_count > 0) { + nesdev->netdev_count--; + nesdev->nesadapter->netdev_count--; + + unregister_netdev(nesdev->netdev[nesdev->netdev_count]); + nes_netdev_destroy(nesdev->netdev[nesdev->netdev_count]); + } + + nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n", + nesdev->netdev_count, nesdev->nesadapter->netdev_count); + + nes_notifiers_registered--; + if (nes_notifiers_registered == 0) { + unregister_netevent_notifier(&nes_net_notifier); + unregister_inetaddr_notifier(&nes_inetaddr_notifier); + } + + list_del(&nesdev->list); + 
nes_destroy_cqp(nesdev); + + bail65: + printk(KERN_ERR PFX "bail65\n"); + free_irq(pcidev->irq, nesdev); + if (nesdev->msi_enabled) { + pci_disable_msi(pcidev); + } + bail6: + printk(KERN_ERR PFX "bail6\n"); + tasklet_kill(&nesdev->dpc_tasklet); + /* Deallocate the Adapter Structure */ + nes_destroy_adapter(nesdev->nesadapter); + + bail5: + printk(KERN_ERR PFX "bail5\n"); + iounmap(nesdev->regs); + + bail3: + printk(KERN_ERR PFX "bail3\n"); + kfree(nesdev); + + bail2: + pci_release_regions(pcidev); + + bail1: + pci_disable_device(pcidev); + + bail0: + return ret; +} + + +/** + * nes_remove - unload from kernel + */ +static void nes_remove(struct pci_dev *pcidev) +{ + struct nes_device *nesdev = pci_get_drvdata(pcidev); + struct net_device *netdev; + int netdev_index = 0; + unsigned long flags; + + if (nesdev->netdev_count) { + netdev = nesdev->netdev[netdev_index]; + if (netdev) { + netif_stop_queue(netdev); + unregister_netdev(netdev); + nes_netdev_destroy(netdev); + + nesdev->netdev[netdev_index] = NULL; + nesdev->netdev_count--; + nesdev->nesadapter->netdev_count--; + } + } + + nes_notifiers_registered--; + if (nes_notifiers_registered == 0) { + unregister_netevent_notifier(&nes_net_notifier); + unregister_inetaddr_notifier(&nes_inetaddr_notifier); + } + + list_del(&nesdev->list); + nes_destroy_cqp(nesdev); + + free_irq(pcidev->irq, nesdev); + tasklet_kill(&nesdev->dpc_tasklet); + + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + if (nesdev->link_recheck) { + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + cancel_delayed_work_sync(&nesdev->work); + } else { + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + } + + /* Deallocate the Adapter Structure */ + nes_destroy_adapter(nesdev->nesadapter); + + if (nesdev->msi_enabled) { + pci_disable_msi(pcidev); + } + + iounmap(nesdev->regs); + kfree(nesdev); + + /* nes_debug(NES_DBG_SHUTDOWN, "calling pci_release_regions.\n"); */ + pci_release_regions(pcidev); + pci_disable_device(pcidev); + pci_set_drvdata(pcidev, NULL); +} + + +static ssize_t adapter_show(struct device_driver *ddp, char *buf) +{ + unsigned int devfn = 0xffffffff; + unsigned char bus_number = 0xff; + unsigned int i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + devfn = nesdev->pcidev->devfn; + bus_number = nesdev->pcidev->bus->number; + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "%x:%x\n", bus_number, devfn); +} + +static ssize_t adapter_store(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + ee_flsh_adapter = simple_strtoul(p, &p, 10); + return strnlen(buf, count); +} + +static ssize_t eeprom_cmd_show(struct device_driver *ddp, char *buf) +{ + u32 eeprom_cmd = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + eeprom_cmd = nes_read32(nesdev->regs + NES_EEPROM_COMMAND); + break; + } + i++; + } + return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_cmd); +} + +static ssize_t eeprom_cmd_store(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_EEPROM_COMMAND, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + 
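+/*
+ * The adapter/eeprom/flash/register attributes defined around here
+ * form a small debugging interface; registered through the attribute
+ * group attached to nes_pci_driver below, they normally show up under
+ * /sys/bus/pci/drivers/iw_nes/.  The "adapter" file selects which
+ * device in nes_dev_list the remaining files act on, e.g.
+ * (illustrative):
+ *
+ *	echo 0 > /sys/bus/pci/drivers/iw_nes/adapter
+ *	cat /sys/bus/pci/drivers/iw_nes/eeprom_cmd
+ */
+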
+static ssize_t eeprom_data_show(struct device_driver *ddp, char *buf) +{ + u32 eeprom_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + eeprom_data = nes_read32(nesdev->regs + NES_EEPROM_DATA); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_data); +} + +static ssize_t eeprom_data_store(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_EEPROM_DATA, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t flash_cmd_show(struct device_driver *ddp, char *buf) +{ + u32 flash_cmd = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + flash_cmd = nes_read32(nesdev->regs + NES_FLASH_COMMAND); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_cmd); +} + +static ssize_t flash_cmd_store(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_FLASH_COMMAND, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t flash_data_show(struct device_driver *ddp, char *buf) +{ + u32 flash_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + flash_data = nes_read32(nesdev->regs + NES_FLASH_DATA); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_data); +} + +static ssize_t flash_data_store(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + NES_FLASH_DATA, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t nonidx_addr_show(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_nonidx_addr); +} + +static ssize_t nonidx_addr_store(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') + sysfs_nonidx_addr = simple_strtoul(p, &p, 16); + + return strnlen(buf, count); +} + +static ssize_t nonidx_data_show(struct device_driver *ddp, char *buf) +{ + u32 nonidx_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nonidx_data = nes_read32(nesdev->regs + sysfs_nonidx_addr); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", nonidx_data); +} + +static ssize_t nonidx_data_store(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 
'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write32(nesdev->regs + sysfs_nonidx_addr, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t idx_addr_show(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_idx_addr); +} + +static ssize_t idx_addr_store(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') + sysfs_idx_addr = simple_strtoul(p, &p, 16); + + return strnlen(buf, count); +} + +static ssize_t idx_data_show(struct device_driver *ddp, char *buf) +{ + u32 idx_data = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + idx_data = nes_read_indexed(nesdev, sysfs_idx_addr); + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%x\n", idx_data); +} + +static ssize_t idx_data_store(struct device_driver *ddp, + const char *buf, size_t count) +{ + char *p = (char *)buf; + u32 val; + u32 i = 0; + struct nes_device *nesdev; + + if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') { + val = simple_strtoul(p, &p, 16); + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nes_write_indexed(nesdev, sysfs_idx_addr, val); + break; + } + i++; + } + } + return strnlen(buf, count); +} + +static ssize_t wqm_quanta_show(struct device_driver *ddp, char *buf) +{ + u32 wqm_quanta_value = 0xdead; + u32 i = 0; + struct nes_device *nesdev; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + wqm_quanta_value = nesdev->nesadapter->wqm_quanta; + break; + } + i++; + } + + return snprintf(buf, PAGE_SIZE, "0x%X\n", wqm_quanta_value); +} + +static ssize_t wqm_quanta_store(struct device_driver *ddp, const char *buf, + size_t count) +{ + unsigned long wqm_quanta_value; + u32 wqm_config1; + u32 i = 0; + struct nes_device *nesdev; + + if (kstrtoul(buf, 0, &wqm_quanta_value) < 0) + return -EINVAL; + + list_for_each_entry(nesdev, &nes_dev_list, list) { + if (i == ee_flsh_adapter) { + nesdev->nesadapter->wqm_quanta = wqm_quanta_value; + wqm_config1 = nes_read_indexed(nesdev, + NES_IDX_WQM_CONFIG1); + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, + ((wqm_quanta_value << 1) | + (wqm_config1 & 0x00000001))); + break; + } + i++; + } + return strnlen(buf, count); +} + +static DRIVER_ATTR_RW(adapter); +static DRIVER_ATTR_RW(eeprom_cmd); +static DRIVER_ATTR_RW(eeprom_data); +static DRIVER_ATTR_RW(flash_cmd); +static DRIVER_ATTR_RW(flash_data); +static DRIVER_ATTR_RW(nonidx_addr); +static DRIVER_ATTR_RW(nonidx_data); +static DRIVER_ATTR_RW(idx_addr); +static DRIVER_ATTR_RW(idx_data); +static DRIVER_ATTR_RW(wqm_quanta); + +static struct attribute *nes_attrs[] = { + &driver_attr_adapter.attr, + &driver_attr_eeprom_cmd.attr, + &driver_attr_eeprom_data.attr, + &driver_attr_flash_cmd.attr, + &driver_attr_flash_data.attr, + &driver_attr_nonidx_addr.attr, + &driver_attr_nonidx_data.attr, + &driver_attr_idx_addr.attr, + &driver_attr_idx_data.attr, + &driver_attr_wqm_quanta.attr, + NULL, +}; +ATTRIBUTE_GROUPS(nes); + +static struct pci_driver nes_pci_driver = { + .name = DRV_NAME, + .id_table = nes_pci_table, + .probe = nes_probe, + .remove = nes_remove, + .groups = nes_groups, +}; + + +/** + * nes_init_module - module initialization entry point + */ +static int __init nes_init_module(void) 
+{ + int retval; + + retval = nes_cm_start(); + if (retval) { + printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n"); + return retval; + } + return pci_register_driver(&nes_pci_driver); +} + + +/** + * nes_exit_module - module unload entry point + */ +static void __exit nes_exit_module(void) +{ + nes_cm_stop(); + + pci_unregister_driver(&nes_pci_driver); +} + + +module_init(nes_init_module); +module_exit(nes_exit_module); diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h new file mode 100644 index 0000000..00c2729 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes.h @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __NES_H +#define __NES_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define NES_SEND_FIRST_WRITE + +#define QUEUE_DISCONNECTS + +#define DRV_NAME "iw_nes" +#define DRV_VERSION "1.5.0.1" +#define PFX DRV_NAME ": " + +/* + * NetEffect PCI vendor id and NE010 PCI device id. 
+ */ +#ifndef PCI_VENDOR_ID_NETEFFECT /* not in pci.ids yet */ +#define PCI_VENDOR_ID_NETEFFECT 0x1678 +#define PCI_DEVICE_ID_NETEFFECT_NE020 0x0100 +#define PCI_DEVICE_ID_NETEFFECT_NE020_KR 0x0110 +#endif + +#define NE020_REV 4 +#define NE020_REV1 5 + +#define BAR_0 0 +#define BAR_1 2 + +#define RX_BUF_SIZE (1536 + 8) +#define NES_REG0_SIZE (4 * 1024) +#define NES_TX_TIMEOUT (6*HZ) +#define NES_FIRST_QPN 64 +#define NES_SW_CONTEXT_ALIGN 1024 + +#define NES_MAX_MTU 9000 + +#define NES_NIC_MAX_NICS 16 +#define NES_MAX_ARP_TABLE_SIZE 4096 + +#define NES_NIC_CEQ_SIZE 8 +/* NICs will be on a separate CQ */ +#define NES_CCEQ_SIZE ((nesadapter->max_cq / nesadapter->port_count) - 32) + +#define NES_MAX_PORT_COUNT 4 + +#define MAX_DPC_ITERATIONS 128 + +#define NES_DRV_OPT_ENABLE_MPA_VER_0 0x00000001 +#define NES_DRV_OPT_DISABLE_MPA_CRC 0x00000002 +#define NES_DRV_OPT_DISABLE_FIRST_WRITE 0x00000004 +#define NES_DRV_OPT_DISABLE_INTF 0x00000008 +#define NES_DRV_OPT_ENABLE_MSI 0x00000010 +#define NES_DRV_OPT_DUAL_LOGICAL_PORT 0x00000020 +#define NES_DRV_OPT_SUPRESS_OPTION_BC 0x00000040 +#define NES_DRV_OPT_NO_INLINE_DATA 0x00000080 +#define NES_DRV_OPT_DISABLE_INT_MOD 0x00000100 +#define NES_DRV_OPT_DISABLE_VIRT_WQ 0x00000200 +#define NES_DRV_OPT_ENABLE_PAU 0x00000400 + +#define NES_AEQ_EVENT_TIMEOUT 2500 +#define NES_DISCONNECT_EVENT_TIMEOUT 2000 + +/* debug levels */ +/* must match userspace */ +#define NES_DBG_HW 0x00000001 +#define NES_DBG_INIT 0x00000002 +#define NES_DBG_ISR 0x00000004 +#define NES_DBG_PHY 0x00000008 +#define NES_DBG_NETDEV 0x00000010 +#define NES_DBG_CM 0x00000020 +#define NES_DBG_CM1 0x00000040 +#define NES_DBG_NIC_RX 0x00000080 +#define NES_DBG_NIC_TX 0x00000100 +#define NES_DBG_CQP 0x00000200 +#define NES_DBG_MMAP 0x00000400 +#define NES_DBG_MR 0x00000800 +#define NES_DBG_PD 0x00001000 +#define NES_DBG_CQ 0x00002000 +#define NES_DBG_QP 0x00004000 +#define NES_DBG_MOD_QP 0x00008000 +#define NES_DBG_AEQ 0x00010000 +#define NES_DBG_IW_RX 0x00020000 +#define NES_DBG_IW_TX 0x00040000 +#define NES_DBG_SHUTDOWN 0x00080000 +#define NES_DBG_PAU 0x00100000 +#define NES_DBG_NLMSG 0x00200000 +#define NES_DBG_RSVD1 0x10000000 +#define NES_DBG_RSVD2 0x20000000 +#define NES_DBG_RSVD3 0x40000000 +#define NES_DBG_RSVD4 0x80000000 +#define NES_DBG_ALL 0xffffffff + +#ifdef CONFIG_INFINIBAND_NES_DEBUG +#define nes_debug(level, fmt, args...) \ +do { \ + if (level & nes_debug_level) \ + printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \ +} while (0) + +#define assert(expr) \ +do { \ + if (!(expr)) { \ + printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } \ +} while (0) + +#define NES_EVENT_TIMEOUT 1200000 +#else +#define nes_debug(level, fmt, args...) 
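/*
 * Editor's note (illustrative, not part of this patch): nes_debug() is a
 * bitmask filter -- a message is emitted only when its level bit is set in
 * the module-wide nes_debug_level mask, e.g.
 *
 *	nes_debug(NES_DBG_INIT, "probing %s\n", pci_name(pcidev));
 *
 * In this !CONFIG_INFINIBAND_NES_DEBUG branch the macro expands to nothing,
 * so its arguments get no compile-time format checking; defining it as
 * no_printk(fmt, ##args) would keep that checking without producing output.
 */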
+#define assert(expr) do {} while (0) + +#define NES_EVENT_TIMEOUT 100000 +#endif + +#include "nes_hw.h" +#include "nes_verbs.h" +#include "nes_context.h" +#include +#include "nes_cm.h" +#include "nes_mgt.h" + +extern int interrupt_mod_interval; +extern int nes_if_count; +extern int mpa_version; +extern int disable_mpa_crc; +extern unsigned int nes_drv_opt; +extern unsigned int nes_debug_level; +extern unsigned int wqm_quanta; +extern struct list_head nes_adapter_list; + +extern atomic_t cm_connects; +extern atomic_t cm_accepts; +extern atomic_t cm_disconnects; +extern atomic_t cm_closes; +extern atomic_t cm_connecteds; +extern atomic_t cm_connect_reqs; +extern atomic_t cm_rejects; +extern atomic_t mod_qp_timouts; +extern atomic_t qps_created; +extern atomic_t qps_destroyed; +extern atomic_t sw_qps_destroyed; +extern u32 mh_detected; +extern u32 mh_pauses_sent; +extern u32 cm_packets_sent; +extern u32 cm_packets_bounced; +extern u32 cm_packets_created; +extern u32 cm_packets_received; +extern u32 cm_packets_dropped; +extern u32 cm_packets_retrans; +extern atomic_t cm_listens_created; +extern atomic_t cm_listens_destroyed; +extern u32 cm_backlog_drops; +extern atomic_t cm_loopbacks; +extern atomic_t cm_nodes_created; +extern atomic_t cm_nodes_destroyed; +extern atomic_t cm_accel_dropped_pkts; +extern atomic_t cm_resets_recvd; +extern atomic_t pau_qps_created; +extern atomic_t pau_qps_destroyed; + +extern u32 int_mod_timer_init; +extern u32 int_mod_cq_depth_256; +extern u32 int_mod_cq_depth_128; +extern u32 int_mod_cq_depth_32; +extern u32 int_mod_cq_depth_24; +extern u32 int_mod_cq_depth_16; +extern u32 int_mod_cq_depth_4; +extern u32 int_mod_cq_depth_1; + +struct nes_device { + struct nes_adapter *nesadapter; + void __iomem *regs; + void __iomem *index_reg; + struct pci_dev *pcidev; + struct net_device *netdev[NES_NIC_MAX_NICS]; + u64 link_status_interrupts; + struct tasklet_struct dpc_tasklet; + spinlock_t indexed_regs_lock; + unsigned long csr_start; + unsigned long doorbell_region; + unsigned long doorbell_start; + unsigned long mac_tx_errors; + unsigned long mac_pause_frames_sent; + unsigned long mac_pause_frames_received; + unsigned long mac_rx_errors; + unsigned long mac_rx_crc_errors; + unsigned long mac_rx_symbol_err_frames; + unsigned long mac_rx_jabber_frames; + unsigned long mac_rx_oversized_frames; + unsigned long mac_rx_short_frames; + unsigned long port_rx_discards; + unsigned long port_tx_discards; + unsigned int mac_index; + unsigned int nes_stack_start; + + /* Control Structures */ + void *cqp_vbase; + dma_addr_t cqp_pbase; + u32 cqp_mem_size; + u8 ceq_index; + u8 nic_ceq_index; + struct nes_hw_cqp cqp; + struct nes_hw_cq ccq; + struct list_head cqp_avail_reqs; + struct list_head cqp_pending_reqs; + struct nes_cqp_request *nes_cqp_requests; + + u32 int_req; + u32 int_stat; + u32 timer_int_req; + u32 timer_only_int_count; + u32 intf_int_req; + u32 last_mac_tx_pauses; + u32 last_used_chunks_tx; + struct list_head list; + + u16 base_doorbell_index; + u16 currcq_count; + u16 deepcq_count; + u8 iw_status; + u8 msi_enabled; + u8 netdev_count; + u8 napi_isr_ran; + u8 disable_rx_flow_control; + u8 disable_tx_flow_control; + + struct delayed_work work; + u8 link_recheck; +}; + +/* Receive skb private area - must fit in skb->cb area */ +struct nes_rskb_cb { + u64 busaddr; + u32 maplen; + u32 seqnum; + u8 *data_start; + struct nes_qp *nesqp; +}; + +static inline __le32 get_crc_value(struct nes_v4_quad *nes_quad) +{ + u32 crc_value; + crc_value = crc32c(~0, (void *)nes_quad, sizeof 
(struct nes_v4_quad)); + + /* + * With commit ef19454b ("[LIB] crc32c: Keep intermediate crc + * state in cpu order"), behavior of crc32c changes on + * big-endian platforms. Our algorithm expects the previous + * behavior; otherwise we have RDMA connection establishment + * issue on big-endian. + */ + return cpu_to_le32(crc_value); +} + +static inline void +set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value) +{ + wqe_words[index] = cpu_to_le32((u32) value); + wqe_words[index + 1] = cpu_to_le32(upper_32_bits(value)); +} + +static inline void +set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value) +{ + wqe_words[index] = cpu_to_le32(value); +} + +static inline void +nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev) +{ + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_LEN_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_HIGH_IDX] = 0; +} + +static inline void +nes_fill_init_qp_wqe(struct nes_hw_qp_wqe *wqe, struct nes_qp *nesqp, u32 head) +{ + u32 value; + value = ((u32)((unsigned long) nesqp)) | head; + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX, + (u32)(upper_32_bits((unsigned long)(nesqp)))); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, value); +} + +/* Read from memory-mapped device */ +static inline u32 nes_read_indexed(struct nes_device *nesdev, u32 reg_index) +{ + unsigned long flags; + void __iomem *addr = nesdev->index_reg; + u32 value; + + spin_lock_irqsave(&nesdev->indexed_regs_lock, flags); + + writel(reg_index, addr); + value = readl((void __iomem *)addr + 4); + + spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags); + return value; +} + +static inline u32 nes_read32(const void __iomem *addr) +{ + return readl(addr); +} + +static inline u16 nes_read16(const void __iomem *addr) +{ + return readw(addr); +} + +static inline u8 nes_read8(const void __iomem *addr) +{ + return readb(addr); +} + +/* Write to memory-mapped device */ +static inline void nes_write_indexed(struct nes_device *nesdev, u32 reg_index, u32 val) +{ + unsigned long flags; + void __iomem *addr = nesdev->index_reg; + + spin_lock_irqsave(&nesdev->indexed_regs_lock, flags); + + writel(reg_index, addr); + writel(val, (void __iomem *)addr + 4); + + spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags); +} + +static inline void nes_write32(void __iomem *addr, u32 val) +{ + writel(val, addr); +} + +static inline void nes_write16(void __iomem *addr, u16 val) +{ + writew(val, addr); +} + +static inline void nes_write8(void __iomem *addr, u8 val) +{ + writeb(val, addr); +} + +enum nes_resource { + NES_RESOURCE_MW = 1, + NES_RESOURCE_FAST_MR, + NES_RESOURCE_PHYS_MR, + NES_RESOURCE_USER_MR, + NES_RESOURCE_PD, + NES_RESOURCE_QP, + NES_RESOURCE_CQ, + NES_RESOURCE_ARP +}; + +static inline int nes_alloc_resource(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 max_resources, + u32 *req_resource_num, u32 *next, enum nes_resource resource_type) +{ + unsigned long flags; + u32 resource_num; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + + resource_num = find_next_zero_bit(resource_array, 
max_resources, *next); + if (resource_num >= max_resources) { + resource_num = find_first_zero_bit(resource_array, max_resources); + if (resource_num >= max_resources) { + printk(KERN_ERR PFX "%s: No available resources [type=%u].\n", __func__, resource_type); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + return -EMFILE; + } + } + set_bit(resource_num, resource_array); + *next = resource_num+1; + if (*next == max_resources) { + *next = 0; + } + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + *req_resource_num = resource_num; + + return 0; +} + +static inline int nes_is_resource_allocated(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 resource_num) +{ + unsigned long flags; + int bit_is_set; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + + bit_is_set = test_bit(resource_num, resource_array); + nes_debug(NES_DBG_HW, "resource_num %u is%s allocated.\n", + resource_num, (bit_is_set ? "": " not")); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); + + return bit_is_set; +} + +static inline void nes_free_resource(struct nes_adapter *nesadapter, + unsigned long *resource_array, u32 resource_num) +{ + unsigned long flags; + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + clear_bit(resource_num, resource_array); + spin_unlock_irqrestore(&nesadapter->resource_lock, flags); +} + +static inline struct nes_vnic *to_nesvnic(struct ib_device *ibdev) +{ + return container_of(ibdev, struct nes_ib_device, ibdev)->nesvnic; +} + +static inline struct nes_pd *to_nespd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct nes_pd, ibpd); +} + +static inline struct nes_ucontext *to_nesucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct nes_ucontext, ibucontext); +} + +static inline struct nes_mr *to_nesmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct nes_mr, ibmr); +} + +static inline struct nes_mr *to_nesmr_from_ibfmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct nes_mr, ibfmr); +} + +static inline struct nes_mr *to_nesmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct nes_mr, ibmw); +} + +static inline struct nes_fmr *to_nesfmr(struct nes_mr *nesmr) +{ + return container_of(nesmr, struct nes_fmr, nesmr); +} + +static inline struct nes_cq *to_nescq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct nes_cq, ibcq); +} + +static inline struct nes_qp *to_nesqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct nes_qp, ibqp); +} + + + +/* nes.c */ +void nes_add_ref(struct ib_qp *); +void nes_rem_ref(struct ib_qp *); +struct ib_qp *nes_get_qp(struct ib_device *, int); + + +/* nes_hw.c */ +struct nes_adapter *nes_init_adapter(struct nes_device *, u8); +void nes_nic_init_timer_defaults(struct nes_device *, u8); +void nes_destroy_adapter(struct nes_adapter *); +int nes_init_cqp(struct nes_device *); +int nes_init_phy(struct nes_device *); +int nes_init_nic_qp(struct nes_device *, struct net_device *); +void nes_destroy_nic_qp(struct nes_vnic *); +int nes_napi_isr(struct nes_device *); +void nes_dpc(unsigned long); +void nes_nic_ce_handler(struct nes_device *, struct nes_hw_nic_cq *); +void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *); +int nes_destroy_cqp(struct nes_device *); +int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); +void nes_recheck_link_status(struct work_struct *work); +void nes_terminate_timeout(struct timer_list *t); + +/* nes_nic.c */ +struct net_device *nes_netdev_init(struct nes_device *, void __iomem *); 
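/*
 * Editor's note (illustrative, not part of this patch): nes_alloc_resource()
 * and nes_free_resource() above form a simple spinlock-protected bitmap
 * allocator with a rotating search hint (*next).  A typical caller looks
 * roughly like the sketch below; the allocated_qps/max_qp/next_qp fields are
 * the ones used elsewhere in this driver and should be treated as
 * assumptions here:
 *
 *	u32 qp_num;
 *	int ret;
 *
 *	ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps,
 *				 nesadapter->max_qp, &qp_num,
 *				 &nesadapter->next_qp, NES_RESOURCE_QP);
 *	if (ret)
 *		return ret;
 *	...
 *	nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
 */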
+void nes_netdev_destroy(struct net_device *); +int nes_nic_cm_xmit(struct sk_buff *, struct net_device *); + +/* nes_cm.c */ +void *nes_cm_create(struct net_device *); +int nes_cm_recv(struct sk_buff *, struct net_device *); +void nes_update_arp(unsigned char *, u32, u32, u16, u16); +void nes_manage_arp_cache(struct net_device *, unsigned char *, u32, u32); +void nes_sock_release(struct nes_qp *, unsigned long *); +void flush_wqes(struct nes_device *nesdev, struct nes_qp *, u32, u32); +int nes_manage_apbvt(struct nes_vnic *, u32, u32, u32); +int nes_cm_disconn(struct nes_qp *); +void nes_cm_disconn_worker(void *); + +/* nes_verbs.c */ +int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32, u32); +int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *); +struct nes_ib_device *nes_init_ofa_device(struct net_device *); +void nes_port_ibevent(struct nes_vnic *nesvnic); +void nes_destroy_ofa_device(struct nes_ib_device *); +int nes_register_ofa_device(struct nes_ib_device *); + +/* nes_util.c */ +int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *); +void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16); +void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *); +void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16); +void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16); +struct nes_cqp_request *nes_get_cqp_request(struct nes_device *); +void nes_free_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request); +void nes_put_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request); +void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *); +int nes_arp_table(struct nes_device *, u32, u8 *, u32); +void nes_mh_fix(struct timer_list *t); +void nes_clc(struct timer_list *t); +void nes_dump_mem(unsigned int, void *, int); +u32 nes_crc32(u32, u32, u32, u32, u8 *, u32, u32, u32); + +#endif /* __NES_H */ diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c new file mode 100644 index 0000000..6cdfbf8 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -0,0 +1,3989 @@ +/* + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#define TCPOPT_TIMESTAMP 8 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nes.h" + +u32 cm_packets_sent; +u32 cm_packets_bounced; +u32 cm_packets_dropped; +u32 cm_packets_retrans; +u32 cm_packets_created; +u32 cm_packets_received; +atomic_t cm_listens_created; +atomic_t cm_listens_destroyed; +u32 cm_backlog_drops; +atomic_t cm_loopbacks; +atomic_t cm_nodes_created; +atomic_t cm_nodes_destroyed; +atomic_t cm_accel_dropped_pkts; +atomic_t cm_resets_recvd; + +static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *); +static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, struct nes_vnic *, struct nes_cm_info *); +static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *); +static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *, struct nes_vnic *, u16, void *, struct nes_cm_info *); +static int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_accept(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_reject(struct nes_cm_core *, struct nes_cm_node *); +static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *); +static int mini_cm_dealloc_core(struct nes_cm_core *); +static int mini_cm_get(struct nes_cm_core *); +static int mini_cm_set(struct nes_cm_core *, u32, u32); + +static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8); +static int add_ref_cm_node(struct nes_cm_node *); +static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *); + +static int nes_cm_disconn_true(struct nes_qp *); +static int nes_cm_post_event(struct nes_cm_event *event); +static int nes_disconnect(struct nes_qp *nesqp, int abrupt); +static void nes_disconnect_worker(struct work_struct *work); + +static int send_mpa_request(struct nes_cm_node *, struct sk_buff *); +static int send_mpa_reject(struct nes_cm_node *); +static int send_syn(struct nes_cm_node *, u32, struct sk_buff *); +static int send_reset(struct nes_cm_node *, struct sk_buff *); +static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb); +static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb); +static void process_packet(struct nes_cm_node *, struct sk_buff *, struct nes_cm_core *); + +static void active_open_err(struct nes_cm_node *, struct sk_buff *, int); +static void passive_open_err(struct nes_cm_node *, struct sk_buff *, int); +static void cleanup_retrans_entry(struct nes_cm_node *); +static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *); +static void free_retrans_entry(struct nes_cm_node *cm_node); +static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, struct sk_buff *skb, int optionsize, int passive); + +/* CM event handler functions */ +static void cm_event_connected(struct nes_cm_event *); +static void cm_event_connect_error(struct nes_cm_event *); +static void cm_event_reset(struct nes_cm_event *); +static void cm_event_mpa_req(struct nes_cm_event *); +static void 
cm_event_mpa_reject(struct nes_cm_event *); +static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node); + +/* MPA build functions */ +static int cm_build_mpa_frame(struct nes_cm_node *, u8 **, u16 *, u8 *, u8); +static void build_mpa_v2(struct nes_cm_node *, void *, u8); +static void build_mpa_v1(struct nes_cm_node *, void *, u8); +static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **); + +static void print_core(struct nes_cm_core *core); +static void record_ird_ord(struct nes_cm_node *, u16, u16); + +/* External CM API Interface */ +/* instance of function pointers for client API */ +/* set address of this instance to cm_core->cm_ops at cm_core alloc */ +static const struct nes_cm_ops nes_cm_api = { + .accelerated = mini_cm_accelerated, + .listen = mini_cm_listen, + .stop_listener = mini_cm_del_listen, + .connect = mini_cm_connect, + .close = mini_cm_close, + .accept = mini_cm_accept, + .reject = mini_cm_reject, + .recv_pkt = mini_cm_recv_pkt, + .destroy_cm_core = mini_cm_dealloc_core, + .get = mini_cm_get, + .set = mini_cm_set +}; + +static struct nes_cm_core *g_cm_core; + +atomic_t cm_connects; +atomic_t cm_accepts; +atomic_t cm_disconnects; +atomic_t cm_closes; +atomic_t cm_connecteds; +atomic_t cm_connect_reqs; +atomic_t cm_rejects; + +int nes_add_ref_cm_node(struct nes_cm_node *cm_node) +{ + return add_ref_cm_node(cm_node); +} + +int nes_rem_ref_cm_node(struct nes_cm_node *cm_node) +{ + return rem_ref_cm_node(cm_node->cm_core, cm_node); +} +/** + * create_event + */ +static struct nes_cm_event *create_event(struct nes_cm_node * cm_node, + enum nes_cm_event_type type) +{ + struct nes_cm_event *event; + + if (!cm_node->cm_id) + return NULL; + + /* allocate an empty event */ + event = kzalloc(sizeof(*event), GFP_ATOMIC); + + if (!event) + return NULL; + + event->type = type; + event->cm_node = cm_node; + event->cm_info.rem_addr = cm_node->rem_addr; + event->cm_info.loc_addr = cm_node->loc_addr; + event->cm_info.rem_port = cm_node->rem_port; + event->cm_info.loc_port = cm_node->loc_port; + event->cm_info.cm_id = cm_node->cm_id; + + nes_debug(NES_DBG_CM, "cm_node=%p Created event=%p, type=%u, " + "dst_addr=%08x[%x], src_addr=%08x[%x]\n", + cm_node, event, type, event->cm_info.loc_addr, + event->cm_info.loc_port, event->cm_info.rem_addr, + event->cm_info.rem_port); + + nes_cm_post_event(event); + return event; +} + + +/** + * send_mpa_request + */ +static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; + + if (!skb) { + nes_debug(NES_DBG_CM, "skb set to NULL\n"); + return -1; + } + + /* send an MPA Request frame */ + cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REQUEST); + form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK); + + return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); +} + + + +static int send_mpa_reject(struct nes_cm_node *cm_node) +{ + struct sk_buff *skb = NULL; + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; + struct ietf_mpa_v1 *mpa_frame; + + skb = dev_alloc_skb(MAX_CM_BUFFER); + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -ENOMEM; + } + + /* send an MPA reject frame */ + cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REPLY); + mpa_frame = (struct ietf_mpa_v1 *)*start_buff; + mpa_frame->flags |= IETF_MPA_FLAGS_REJECT; + form_cm_frame(skb, cm_node, NULL, 0, 
*start_buff, buff_len, SET_ACK | SET_FIN); + + cm_node->state = NES_CM_STATE_FIN_WAIT1; + return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); +} + + +/** + * recv_mpa - process a received TCP pkt, we are expecting an + * IETF MPA frame + */ +static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type, + u32 len) +{ + struct ietf_mpa_v1 *mpa_frame; + struct ietf_mpa_v2 *mpa_v2_frame; + struct ietf_rtr_msg *rtr_msg; + int mpa_hdr_len; + int priv_data_len; + + *type = NES_MPA_REQUEST_ACCEPT; + + /* assume req frame is in tcp data payload */ + if (len < sizeof(struct ietf_mpa_v1)) { + nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len); + return -EINVAL; + } + + /* points to the beginning of the frame, which could be MPA V1 or V2 */ + mpa_frame = (struct ietf_mpa_v1 *)buffer; + mpa_hdr_len = sizeof(struct ietf_mpa_v1); + priv_data_len = ntohs(mpa_frame->priv_data_len); + + /* make sure mpa private data len is less than 512 bytes */ + if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) { + nes_debug(NES_DBG_CM, "The received Length of Private" + " Data field exceeds 512 octets\n"); + return -EINVAL; + } + /* + * make sure MPA receiver interoperate with the + * received MPA version and MPA key information + * + */ + if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) { + nes_debug(NES_DBG_CM, "The received mpa version" + " is not supported\n"); + return -EINVAL; + } + /* + * backwards compatibility only + */ + if (mpa_frame->rev > cm_node->mpa_frame_rev) { + nes_debug(NES_DBG_CM, "The received mpa version" + " can not be interoperated\n"); + return -EINVAL; + } else { + cm_node->mpa_frame_rev = mpa_frame->rev; + } + + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) { + nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); + return -EINVAL; + } + } else { + if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) { + nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n"); + return -EINVAL; + } + } + + if (priv_data_len + mpa_hdr_len != len) { + nes_debug(NES_DBG_CM, "The received ietf buffer was not right" + " complete (%x + %x != %x)\n", + priv_data_len, mpa_hdr_len, len); + return -EINVAL; + } + /* make sure it does not exceed the max size */ + if (len > MAX_CM_BUFFER) { + nes_debug(NES_DBG_CM, "The received ietf buffer was too large" + " (%x + %x != %x)\n", + priv_data_len, mpa_hdr_len, len); + return -EINVAL; + } + + cm_node->mpa_frame_size = priv_data_len; + + switch (mpa_frame->rev) { + case IETF_MPA_V2: { + u16 ird_size; + u16 ord_size; + u16 rtr_ctrl_ird; + u16 rtr_ctrl_ord; + + mpa_v2_frame = (struct ietf_mpa_v2 *)buffer; + mpa_hdr_len += IETF_RTR_MSG_SIZE; + cm_node->mpa_frame_size -= IETF_RTR_MSG_SIZE; + rtr_msg = &mpa_v2_frame->rtr_msg; + + /* parse rtr message */ + rtr_ctrl_ird = ntohs(rtr_msg->ctrl_ird); + rtr_ctrl_ord = ntohs(rtr_msg->ctrl_ord); + ird_size = rtr_ctrl_ird & IETF_NO_IRD_ORD; + ord_size = rtr_ctrl_ord & IETF_NO_IRD_ORD; + + if (!(rtr_ctrl_ird & IETF_PEER_TO_PEER)) { + /* send reset */ + return -EINVAL; + } + if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD) + cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD; + + if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) { + /* responder */ + if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) { + /* we are still negotiating */ + if (ord_size > NES_MAX_IRD) { + cm_node->ird_size = NES_MAX_IRD; + } else { + cm_node->ird_size = ord_size; + if (ord_size == 0 && + (rtr_ctrl_ord & 
IETF_RDMA0_READ)) { + cm_node->ird_size = 1; + nes_debug(NES_DBG_CM, + "%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n", + __func__, ord_size); + } + } + if (ird_size > NES_MAX_ORD) + cm_node->ord_size = NES_MAX_ORD; + else + cm_node->ord_size = ird_size; + } else { /* initiator */ + if (ord_size > NES_MAX_IRD) { + nes_debug(NES_DBG_CM, + "%s: Unable to support the requested (ord =%u)\n", + __func__, ord_size); + return -EINVAL; + } + cm_node->ird_size = ord_size; + + if (ird_size > NES_MAX_ORD) { + cm_node->ord_size = NES_MAX_ORD; + } else { + if (ird_size == 0 && + (rtr_ctrl_ord & IETF_RDMA0_READ)) { + nes_debug(NES_DBG_CM, + "%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n", + __func__, ird_size); + return -EINVAL; + } else { + cm_node->ord_size = ird_size; + } + } + } + } + + if (rtr_ctrl_ord & IETF_RDMA0_READ) { + cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + + } else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) { + cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO; + } else { /* Not supported RDMA0 operation */ + return -EINVAL; + } + break; + } + case IETF_MPA_V1: + default: + break; + } + + /* copy entire MPA frame to our cm_node's frame */ + memcpy(cm_node->mpa_frame_buf, buffer + mpa_hdr_len, cm_node->mpa_frame_size); + + if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT) + *type = NES_MPA_REQUEST_REJECT; + return 0; +} + + +/** + * form_cm_frame - get a free packet and build empty frame Use + * node info to build. + */ +static void form_cm_frame(struct sk_buff *skb, + struct nes_cm_node *cm_node, void *options, u32 optionsize, + void *data, u32 datasize, u8 flags) +{ + struct tcphdr *tcph; + struct iphdr *iph; + struct ethhdr *ethh; + u8 *buf; + u16 packetsize = sizeof(*iph); + + packetsize += sizeof(*tcph); + packetsize += optionsize + datasize; + + skb_trim(skb, 0); + memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph)); + + buf = skb_put(skb, packetsize + ETH_HLEN); + + ethh = (struct ethhdr *)buf; + buf += ETH_HLEN; + + iph = (struct iphdr *)buf; + buf += sizeof(*iph); + tcph = (struct tcphdr *)buf; + skb_reset_mac_header(skb); + skb_set_network_header(skb, ETH_HLEN); + skb_set_transport_header(skb, ETH_HLEN + sizeof(*iph)); + buf += sizeof(*tcph); + + skb->ip_summed = CHECKSUM_PARTIAL; + if (!(cm_node->netdev->features & NETIF_F_IP_CSUM)) + skb->ip_summed = CHECKSUM_NONE; + skb->protocol = htons(0x800); + skb->data_len = 0; + skb->mac_len = ETH_HLEN; + + memcpy(ethh->h_dest, cm_node->rem_mac, ETH_ALEN); + memcpy(ethh->h_source, cm_node->loc_mac, ETH_ALEN); + ethh->h_proto = htons(0x0800); + + iph->version = IPVERSION; + iph->ihl = 5; /* 5 * 4Byte words, IP headr len */ + iph->tos = 0; + iph->tot_len = htons(packetsize); + iph->id = htons(++cm_node->tcp_cntxt.loc_id); + + iph->frag_off = htons(0x4000); + iph->ttl = 0x40; + iph->protocol = 0x06; /* IPPROTO_TCP */ + + iph->saddr = htonl(cm_node->loc_addr); + iph->daddr = htonl(cm_node->rem_addr); + + tcph->source = htons(cm_node->loc_port); + tcph->dest = htons(cm_node->rem_port); + tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num); + + if (flags & SET_ACK) { + cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt; + tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num); + tcph->ack = 1; + } else { + tcph->ack_seq = 0; + } + + if (flags & SET_SYN) { + cm_node->tcp_cntxt.loc_seq_num++; + tcph->syn = 1; + } else { + cm_node->tcp_cntxt.loc_seq_num += datasize; + } + + if (flags & SET_FIN) { + cm_node->tcp_cntxt.loc_seq_num++; + tcph->fin = 1; + } + + if (flags & SET_RST) + tcph->rst = 1; + + tcph->doff = 
(u16)((sizeof(*tcph) + optionsize + 3) >> 2); + tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd); + tcph->urg_ptr = 0; + if (optionsize) + memcpy(buf, options, optionsize); + buf += optionsize; + if (datasize) + memcpy(buf, data, datasize); + + skb_shinfo(skb)->nr_frags = 0; + cm_packets_created++; +} + +/** + * print_core - dump a cm core + */ +static void print_core(struct nes_cm_core *core) +{ + nes_debug(NES_DBG_CM, "---------------------------------------------\n"); + nes_debug(NES_DBG_CM, "CM Core -- (core = %p )\n", core); + if (!core) + return; + nes_debug(NES_DBG_CM, "---------------------------------------------\n"); + + nes_debug(NES_DBG_CM, "State : %u \n", core->state); + + nes_debug(NES_DBG_CM, "Listen Nodes : %u \n", atomic_read(&core->listen_node_cnt)); + nes_debug(NES_DBG_CM, "Active Nodes : %u \n", atomic_read(&core->node_cnt)); + + nes_debug(NES_DBG_CM, "core : %p \n", core); + + nes_debug(NES_DBG_CM, "-------------- end core ---------------\n"); +} + +static void record_ird_ord(struct nes_cm_node *cm_node, + u16 conn_ird, u16 conn_ord) +{ + if (conn_ird > NES_MAX_IRD) + conn_ird = NES_MAX_IRD; + + if (conn_ord > NES_MAX_ORD) + conn_ord = NES_MAX_ORD; + + cm_node->ird_size = conn_ird; + cm_node->ord_size = conn_ord; +} + +/** + * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame + */ +static int cm_build_mpa_frame(struct nes_cm_node *cm_node, u8 **start_buff, + u16 *buff_len, u8 *pci_mem, u8 mpa_key) +{ + int ret = 0; + + *start_buff = (pci_mem) ? pci_mem : &cm_node->mpa_frame_buf[0]; + + switch (cm_node->mpa_frame_rev) { + case IETF_MPA_V1: + *start_buff = (u8 *)*start_buff + sizeof(struct ietf_rtr_msg); + *buff_len = sizeof(struct ietf_mpa_v1) + cm_node->mpa_frame_size; + build_mpa_v1(cm_node, *start_buff, mpa_key); + break; + case IETF_MPA_V2: + *buff_len = sizeof(struct ietf_mpa_v2) + cm_node->mpa_frame_size; + build_mpa_v2(cm_node, *start_buff, mpa_key); + break; + default: + ret = -EINVAL; + } + return ret; +} + +/** + * build_mpa_v2 - build a MPA V2 frame + */ +static void build_mpa_v2(struct nes_cm_node *cm_node, + void *start_addr, u8 mpa_key) +{ + struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr; + struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg; + u16 ctrl_ird; + u16 ctrl_ord; + + /* initialize the upper 5 bytes of the frame */ + build_mpa_v1(cm_node, start_addr, mpa_key); + mpa_frame->flags |= IETF_MPA_V2_FLAG; /* set a bit to indicate MPA V2 */ + mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE); + + /* initialize RTR msg */ + if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + ctrl_ird = IETF_NO_IRD_ORD; + ctrl_ord = IETF_NO_IRD_ORD; + } else { + ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD; + ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD; + } + ctrl_ird |= IETF_PEER_TO_PEER; + + switch (mpa_key) { + case MPA_KEY_REQUEST: + ctrl_ord |= IETF_RDMA0_WRITE; + ctrl_ord |= IETF_RDMA0_READ; + break; + case MPA_KEY_REPLY: + switch (cm_node->send_rdma0_op) { + case SEND_RDMA_WRITE_ZERO: + ctrl_ord |= IETF_RDMA0_WRITE; + break; + case SEND_RDMA_READ_ZERO: + ctrl_ord |= IETF_RDMA0_READ; + break; + } + } + rtr_msg->ctrl_ird = htons(ctrl_ird); + rtr_msg->ctrl_ord = htons(ctrl_ord); +} + +/** + * build_mpa_v1 - build a MPA V1 frame + */ +static void build_mpa_v1(struct nes_cm_node *cm_node, void *start_addr, u8 mpa_key) +{ + struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr; + + switch (mpa_key) { + case MPA_KEY_REQUEST: + memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE); + break; + case MPA_KEY_REPLY: + 
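/* reply direction: carry the "Rep" key, which parse_mpa() on the
		 * initiator side (state MPAREQ_SENT) checks for */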
memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE); + break; + } + mpa_frame->flags = IETF_MPA_FLAGS_CRC; + mpa_frame->rev = cm_node->mpa_frame_rev; + mpa_frame->priv_data_len = htons(cm_node->mpa_frame_size); +} + +static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_addr) +{ + u64 u64temp; + struct nes_qp *nesqp = *nesqp_addr; + struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0]; + + u64temp = (unsigned long)nesqp->nesuqp_addr; + u64temp |= NES_SW_CONTEXT_ALIGN >> 1; + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp); + + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0; + + switch (cm_node->send_rdma0_op) { + case SEND_RDMA_WRITE_ZERO: + nes_debug(NES_DBG_CM, "Sending first write.\n"); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_OP_RDMAW); + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0; + break; + + case SEND_RDMA_READ_ZERO: + default: + if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO) + WARN(1, "Unsupported RDMA0 len operation=%u\n", + cm_node->send_rdma0_op); + nes_debug(NES_DBG_CM, "Sending first rdma operation.\n"); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_OP_RDMAR); + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX] = 1; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = 0; + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_STAG_IDX] = 1; + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 1; + break; + } + + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + + /*use the reserved spot on the WQ for the extra first WQE*/ + nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | + NES_QPCONTEXT_ORDIRD_WRPDU | + NES_QPCONTEXT_ORDIRD_ALSMM)); + nesqp->skip_lsmm = 1; + nesqp->hwqp.sq_tail = 0; +} + +/** + * schedule_nes_timer + * note - cm_node needs to be protected before calling this. 
Encase in: + * rem_ref_cm_node(cm_core, cm_node);add_ref_cm_node(cm_node); + */ +int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, + enum nes_timer_type type, int send_retrans, + int close_when_complete) +{ + unsigned long flags; + struct nes_cm_core *cm_core = cm_node->cm_core; + struct nes_timer_entry *new_send; + int ret = 0; + + new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC); + if (!new_send) + return -ENOMEM; + + /* new_send->timetosend = currenttime */ + new_send->retrycount = NES_DEFAULT_RETRYS; + new_send->retranscount = NES_DEFAULT_RETRANS; + new_send->skb = skb; + new_send->timetosend = jiffies; + new_send->type = type; + new_send->netdev = cm_node->netdev; + new_send->send_retrans = send_retrans; + new_send->close_when_complete = close_when_complete; + + if (type == NES_TIMER_TYPE_CLOSE) { + new_send->timetosend += (HZ / 10); + if (cm_node->recv_entry) { + kfree(new_send); + WARN_ON(1); + return -EINVAL; + } + cm_node->recv_entry = new_send; + } + + if (type == NES_TIMER_TYPE_SEND) { + new_send->seq_num = ntohl(tcp_hdr(skb)->seq); + refcount_inc(&new_send->skb->users); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + cm_node->send_entry = new_send; + add_ref_cm_node(cm_node); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + new_send->timetosend = jiffies + NES_RETRY_TIMEOUT; + + ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev); + if (ret != NETDEV_TX_OK) { + nes_debug(NES_DBG_CM, "Error sending packet %p " + "(jiffies = %lu)\n", new_send, jiffies); + new_send->timetosend = jiffies; + ret = NETDEV_TX_OK; + } else { + cm_packets_sent++; + if (!send_retrans) { + cleanup_retrans_entry(cm_node); + if (close_when_complete) + rem_ref_cm_node(cm_core, cm_node); + return ret; + } + } + } + + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, new_send->timetosend); + + return ret; +} + +static void nes_retrans_expired(struct nes_cm_node *cm_node) +{ + struct iw_cm_id *cm_id = cm_node->cm_id; + enum nes_cm_node_state state = cm_node->state; + cm_node->state = NES_CM_STATE_CLOSED; + + switch (state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_CLOSING: + rem_ref_cm_node(cm_node->cm_core, cm_node); + break; + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_FIN_WAIT1: + if (cm_node->cm_id) + cm_id->rem_ref(cm_id); + send_reset(cm_node, NULL); + break; + default: + add_ref_cm_node(cm_node); + send_reset(cm_node, NULL); + create_event(cm_node, NES_CM_EVENT_ABORTED); + } +} + +static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node) +{ + struct nes_timer_entry *recv_entry = cm_node->recv_entry; + struct iw_cm_id *cm_id = cm_node->cm_id; + struct nes_qp *nesqp; + unsigned long qplockflags; + + if (!recv_entry) + return; + nesqp = (struct nes_qp *)recv_entry->skb; + if (nesqp) { + spin_lock_irqsave(&nesqp->lock, qplockflags); + if (nesqp->cm_id) { + nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, " + "refcount = %d: HIT A " + "NES_TIMER_TYPE_CLOSE with something " + "to do!!!\n", nesqp->hwqp.qp_id, cm_id, + atomic_read(&nesqp->refcount)); + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; + nesqp->ibqp_state = IB_QPS_ERR; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_cm_disconn(nesqp); + } else { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, " + "refcount = %d: HIT A " + "NES_TIMER_TYPE_CLOSE with nothing " + "to do!!!\n", nesqp->hwqp.qp_id, cm_id, + atomic_read(&nesqp->refcount)); + } + } else if 
(rem_node) { + /* TIME_WAIT state */ + rem_ref_cm_node(cm_node->cm_core, cm_node); + } + if (cm_node->cm_id) + cm_id->rem_ref(cm_id); + kfree(recv_entry); + cm_node->recv_entry = NULL; +} + +/** + * nes_cm_timer_tick + */ +static void nes_cm_timer_tick(struct timer_list *unused) +{ + unsigned long flags; + unsigned long nexttimeout = jiffies + NES_LONG_TIME; + struct nes_cm_node *cm_node; + struct nes_timer_entry *send_entry, *recv_entry; + struct list_head *list_core_temp; + struct list_head *list_node; + struct nes_cm_core *cm_core = g_cm_core; + u32 settimer = 0; + unsigned long timetosend; + int ret = NETDEV_TX_OK; + + struct list_head timer_list; + + INIT_LIST_HEAD(&timer_list); + spin_lock_irqsave(&cm_core->ht_lock, flags); + + list_for_each_safe(list_node, list_core_temp, + &cm_core->connected_nodes) { + cm_node = container_of(list_node, struct nes_cm_node, list); + if ((cm_node->recv_entry) || (cm_node->send_entry)) { + add_ref_cm_node(cm_node); + list_add(&cm_node->timer_entry, &timer_list); + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + list_for_each_safe(list_node, list_core_temp, &timer_list) { + cm_node = container_of(list_node, struct nes_cm_node, + timer_entry); + recv_entry = cm_node->recv_entry; + + if (recv_entry) { + if (time_after(recv_entry->timetosend, jiffies)) { + if (nexttimeout > recv_entry->timetosend || + !settimer) { + nexttimeout = recv_entry->timetosend; + settimer = 1; + } + } else { + handle_recv_entry(cm_node, 1); + } + } + + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + do { + send_entry = cm_node->send_entry; + if (!send_entry) + break; + if (time_after(send_entry->timetosend, jiffies)) { + if (cm_node->state != NES_CM_STATE_TSA) { + if ((nexttimeout > + send_entry->timetosend) || + !settimer) { + nexttimeout = + send_entry->timetosend; + settimer = 1; + } + } else { + free_retrans_entry(cm_node); + } + break; + } + + if ((cm_node->state == NES_CM_STATE_TSA) || + (cm_node->state == NES_CM_STATE_CLOSED)) { + free_retrans_entry(cm_node); + break; + } + + if (!send_entry->retranscount || + !send_entry->retrycount) { + cm_packets_dropped++; + free_retrans_entry(cm_node); + + spin_unlock_irqrestore( + &cm_node->retrans_list_lock, flags); + nes_retrans_expired(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + spin_lock_irqsave(&cm_node->retrans_list_lock, + flags); + break; + } + refcount_inc(&send_entry->skb->users); + cm_packets_retrans++; + nes_debug(NES_DBG_CM, "Retransmitting send_entry %p " + "for node %p, jiffies = %lu, time to send = " + "%lu, retranscount = %u, send_entry->seq_num = " + "0x%08X, cm_node->tcp_cntxt.rem_ack_num = " + "0x%08X\n", send_entry, cm_node, jiffies, + send_entry->timetosend, + send_entry->retranscount, + send_entry->seq_num, + cm_node->tcp_cntxt.rem_ack_num); + + spin_unlock_irqrestore(&cm_node->retrans_list_lock, + flags); + ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev); + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + if (ret != NETDEV_TX_OK) { + nes_debug(NES_DBG_CM, "rexmit failed for " + "node=%p\n", cm_node); + cm_packets_bounced++; + send_entry->retrycount--; + nexttimeout = jiffies + NES_SHORT_TIME; + settimer = 1; + break; + } else { + cm_packets_sent++; + } + nes_debug(NES_DBG_CM, "Packet Sent: retrans count = " + "%u, retry count = %u.\n", + send_entry->retranscount, + send_entry->retrycount); + if (send_entry->send_retrans) { + send_entry->retranscount--; + timetosend = (NES_RETRY_TIMEOUT << + (NES_DEFAULT_RETRANS - send_entry->retranscount)); + + 
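/* Exponential backoff: each retransmission already consumed doubles
			 * the retry interval; the min() below caps it at NES_MAX_TIMEOUT. */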
send_entry->timetosend = jiffies + + min(timetosend, NES_MAX_TIMEOUT); + if (nexttimeout > send_entry->timetosend || + !settimer) { + nexttimeout = send_entry->timetosend; + settimer = 1; + } + } else { + int close_when_complete; + close_when_complete = + send_entry->close_when_complete; + nes_debug(NES_DBG_CM, "cm_node=%p state=%d\n", + cm_node, cm_node->state); + free_retrans_entry(cm_node); + if (close_when_complete) + rem_ref_cm_node(cm_node->cm_core, + cm_node); + } + } while (0); + + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + rem_ref_cm_node(cm_node->cm_core, cm_node); + } + + if (settimer) { + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, nexttimeout); + } +} + + +/** + * send_syn + */ +static int send_syn(struct nes_cm_node *cm_node, u32 sendack, + struct sk_buff *skb) +{ + int ret; + int flags = SET_SYN; + char optionsbuffer[sizeof(struct option_mss) + + sizeof(struct option_windowscale) + sizeof(struct option_base) + + TCP_OPTIONS_PADDING]; + + int optionssize = 0; + /* Sending MSS option */ + union all_known_options *options; + + if (!cm_node) + return -EINVAL; + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_mss.optionnum = OPTION_NUMBER_MSS; + options->as_mss.length = sizeof(struct option_mss); + options->as_mss.mss = htons(cm_node->tcp_cntxt.mss); + optionssize += sizeof(struct option_mss); + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE; + options->as_windowscale.length = sizeof(struct option_windowscale); + options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale; + optionssize += sizeof(struct option_windowscale); + + if (sendack && !(NES_DRV_OPT_SUPRESS_OPTION_BC & nes_drv_opt)) { + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_base.optionnum = OPTION_NUMBER_WRITE0; + options->as_base.length = sizeof(struct option_base); + optionssize += sizeof(struct option_base); + /* we need the size to be a multiple of 4 */ + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_end = 1; + optionssize += 1; + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_end = 1; + optionssize += 1; + } + + options = (union all_known_options *)&optionsbuffer[optionssize]; + options->as_end = OPTION_NUMBER_END; + optionssize += 1; + + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + if (sendack) + flags |= SET_ACK; + + form_cm_frame(skb, cm_node, optionsbuffer, optionssize, NULL, 0, flags); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); + + return ret; +} + + +/** + * send_reset + */ +static int send_reset(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret; + int flags = SET_RST | SET_ACK; + + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -ENOMEM; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 1); + + return ret; +} + + +/** + * send_ack + */ +static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret; + + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK); + ret = schedule_nes_timer(cm_node, 
skb, NES_TIMER_TYPE_SEND, 0, 0); + + return ret; +} + + +/** + * send_fin + */ +static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret; + + /* if we didn't get a frame get one */ + if (!skb) + skb = dev_alloc_skb(MAX_CM_BUFFER); + + if (!skb) { + nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n"); + return -1; + } + + form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK | SET_FIN); + ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0); + + return ret; +} + + +/** + * find_node - find a cm node that matches the reference cm node + */ +static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, + u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr) +{ + unsigned long flags; + struct list_head *hte; + struct nes_cm_node *cm_node; + + /* get a handle on the hte */ + hte = &cm_core->connected_nodes; + + /* walk list and find cm_node associated with this session ID */ + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_entry(cm_node, hte, list) { + /* compare quad, return node handle if a match */ + nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n", + cm_node->loc_addr, cm_node->loc_port, + loc_addr, loc_port, + cm_node->rem_addr, cm_node->rem_port, + rem_addr, rem_port); + if ((cm_node->loc_addr == loc_addr) && + (cm_node->loc_port == loc_port) && + (cm_node->rem_addr == rem_addr) && + (cm_node->rem_port == rem_port)) { + add_ref_cm_node(cm_node); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + return cm_node; + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + /* no owner node */ + return NULL; +} + + +/** + * find_listener - find a cm node listening on this addr-port pair + */ +static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, + nes_addr_t dst_addr, u16 dst_port, + enum nes_cm_listener_state listener_state) +{ + unsigned long flags; + struct nes_cm_listener *listen_node; + nes_addr_t listen_addr; + u16 listen_port; + + /* walk list and find cm_node associated with this session ID */ + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each_entry(listen_node, &cm_core->listen_list.list, list) { + listen_addr = listen_node->loc_addr; + listen_port = listen_node->loc_port; + + /* compare node pair, return node handle if a match */ + if (((listen_addr == dst_addr) || + listen_addr == 0x00000000) && + (listen_port == dst_port) && + (listener_state & listen_node->listener_state)) { + atomic_inc(&listen_node->ref_count); + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + return listen_node; + } + } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + + /* no listener */ + return NULL; +} + +/** + * add_hte_node - add a cm node to the hash table + */ +static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + unsigned long flags; + struct list_head *hte; + + if (!cm_node || !cm_core) + return -EINVAL; + + nes_debug(NES_DBG_CM, "Adding Node %p to Active Connection HT\n", + cm_node); + + spin_lock_irqsave(&cm_core->ht_lock, flags); + + /* get a handle on the hash table element (list head for this slot) */ + hte = &cm_core->connected_nodes; + list_add_tail(&cm_node->list, hte); + atomic_inc(&cm_core->ht_node_cnt); + + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + return 0; +} + + +/** + * mini_cm_dec_refcnt_listen + */ +static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, + struct nes_cm_listener *listener, int free_hanging_nodes) +{ + int ret = -EINVAL; + int err = 0; + unsigned 
long flags; + struct list_head *list_pos = NULL; + struct list_head *list_temp = NULL; + struct nes_cm_node *cm_node = NULL; + struct list_head reset_list; + + nes_debug(NES_DBG_CM, "attempting listener= %p free_nodes= %d, " + "refcnt=%d\n", listener, free_hanging_nodes, + atomic_read(&listener->ref_count)); + /* free non-accelerated child nodes for this listener */ + INIT_LIST_HEAD(&reset_list); + if (free_hanging_nodes) { + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_safe(list_pos, list_temp, + &g_cm_core->connected_nodes) { + cm_node = container_of(list_pos, struct nes_cm_node, + list); + if ((cm_node->listener == listener) && + (!cm_node->accelerated)) { + add_ref_cm_node(cm_node); + list_add(&cm_node->reset_entry, &reset_list); + } + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + } + + list_for_each_safe(list_pos, list_temp, &reset_list) { + cm_node = container_of(list_pos, struct nes_cm_node, + reset_entry); + { + struct nes_cm_node *loopback = cm_node->loopbackpartner; + enum nes_cm_node_state old_state; + if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) { + rem_ref_cm_node(cm_node->cm_core, cm_node); + } else { + if (!loopback) { + cleanup_retrans_entry(cm_node); + err = send_reset(cm_node, NULL); + if (err) { + cm_node->state = + NES_CM_STATE_CLOSED; + WARN_ON(1); + } else { + old_state = cm_node->state; + cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; + if (old_state != NES_CM_STATE_MPAREQ_RCVD) + rem_ref_cm_node( + cm_node->cm_core, + cm_node); + } + } else { + struct nes_cm_event event; + + event.cm_node = loopback; + event.cm_info.rem_addr = + loopback->rem_addr; + event.cm_info.loc_addr = + loopback->loc_addr; + event.cm_info.rem_port = + loopback->rem_port; + event.cm_info.loc_port = + loopback->loc_port; + event.cm_info.cm_id = loopback->cm_id; + add_ref_cm_node(loopback); + loopback->state = NES_CM_STATE_CLOSED; + cm_event_connect_error(&event); + cm_node->state = NES_CM_STATE_LISTENER_DESTROYED; + + rem_ref_cm_node(cm_node->cm_core, + cm_node); + + } + } + } + } + + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + if (!atomic_dec_return(&listener->ref_count)) { + list_del(&listener->list); + + /* decrement our listen node count */ + atomic_dec(&cm_core->listen_node_cnt); + + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + + if (listener->nesvnic) { + nes_manage_apbvt(listener->nesvnic, + listener->loc_port, + PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + + nes_debug(NES_DBG_NLMSG, + "Delete APBVT loc_port = %04X\n", + listener->loc_port); + } + + nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); + + kfree(listener); + listener = NULL; + ret = 0; + atomic_inc(&cm_listens_destroyed); + } else { + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + } + if (listener) { + if (atomic_read(&listener->pend_accepts_cnt) > 0) + nes_debug(NES_DBG_CM, "destroying listener (%p)" + " with non-zero pending accepts=%u\n", + listener, atomic_read(&listener->pend_accepts_cnt)); + } + + return ret; +} + + +/** + * mini_cm_del_listen + */ +static int mini_cm_del_listen(struct nes_cm_core *cm_core, + struct nes_cm_listener *listener) +{ + listener->listener_state = NES_CM_LISTENER_PASSIVE_STATE; + listener->cm_id = NULL; /* going to be destroyed pretty soon */ + return mini_cm_dec_refcnt_listen(cm_core, listener, 1); +} + + +/** + * mini_cm_accelerated + */ +static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, + struct nes_cm_node *cm_node) +{ + cm_node->accelerated = true; + + 
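/* Connection is now handed off to the hardware; accelerated nodes are
	 * skipped when a listener tears down its hanging children (see
	 * mini_cm_dec_refcnt_listen() above). */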
if (cm_node->accept_pend) { + BUG_ON(!cm_node->listener); + atomic_dec(&cm_node->listener->pend_accepts_cnt); + cm_node->accept_pend = 0; + BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); + } + + if (!timer_pending(&cm_core->tcp_timer)) + mod_timer(&cm_core->tcp_timer, (jiffies + NES_SHORT_TIME)); + + return 0; +} + + +/** + * nes_addr_resolve_neigh + */ +static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpindex) +{ + struct rtable *rt; + struct neighbour *neigh; + int rc = arpindex; + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; + __be32 dst_ipaddr = htonl(dst_ip); + + rt = ip_route_output(&init_net, dst_ipaddr, nesvnic->local_ipaddr, 0, 0); + if (IS_ERR(rt)) { + printk(KERN_ERR "%s: ip_route_output_key failed for 0x%08X\n", + __func__, dst_ip); + return rc; + } + + neigh = dst_neigh_lookup(&rt->dst, &dst_ipaddr); + + rcu_read_lock(); + if (neigh) { + if (neigh->nud_state & NUD_VALID) { + nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X" + " is %pM, Gateway is 0x%08X \n", dst_ip, + neigh->ha, ntohl(rt->rt_gateway)); + + if (arpindex >= 0) { + if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) { + /* Mac address same as in nes_arp_table */ + goto out; + } + + nes_manage_arp_cache(nesvnic->netdev, + nesadapter->arp_table[arpindex].mac_addr, + dst_ip, NES_ARP_DELETE); + } + + nes_manage_arp_cache(nesvnic->netdev, neigh->ha, + dst_ip, NES_ARP_ADD); + rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL, + NES_ARP_RESOLVE); + } else { + neigh_event_send(neigh, NULL); + } + } +out: + rcu_read_unlock(); + + if (neigh) + neigh_release(neigh); + + ip_rt_put(rt); + return rc; +} + +/** + * make_cm_node - create a new instance of a cm node + */ +static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info, + struct nes_cm_listener *listener) +{ + struct nes_cm_node *cm_node; + struct timespec ts; + int oldarpindex = 0; + int arpindex = 0; + struct nes_device *nesdev; + struct nes_adapter *nesadapter; + + /* create an hte and cm_node for this instance */ + cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC); + if (!cm_node) + return NULL; + + /* set our node specific transport info */ + if (listener) { + cm_node->loc_addr = listener->loc_addr; + cm_node->loc_port = listener->loc_port; + } else { + cm_node->loc_addr = cm_info->loc_addr; + cm_node->loc_port = cm_info->loc_port; + } + cm_node->rem_addr = cm_info->rem_addr; + cm_node->rem_port = cm_info->rem_port; + + cm_node->mpa_frame_rev = mpa_version; + cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; + cm_node->mpav2_ird_ord = 0; + cm_node->ird_size = 0; + cm_node->ord_size = 0; + + nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n", + &cm_node->loc_addr, cm_node->loc_port, + &cm_node->rem_addr, cm_node->rem_port); + cm_node->listener = listener; + if (listener) + cm_node->tos = listener->tos; + cm_node->netdev = nesvnic->netdev; + cm_node->cm_id = cm_info->cm_id; + memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN); + + nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n", cm_node->listener, + cm_node->cm_id); + + spin_lock_init(&cm_node->retrans_list_lock); + + cm_node->loopbackpartner = NULL; + atomic_set(&cm_node->ref_count, 1); + /* associate our parent CM core */ + cm_node->cm_core = cm_core; + cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID; + cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; + cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >> + 
NES_CM_DEFAULT_RCV_WND_SCALE; + ts = current_kernel_time(); + cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec); + cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) - + sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN; + cm_node->tcp_cntxt.rcv_nxt = 0; + /* get a unique session ID , add thread_id to an upcounter to handle race */ + atomic_inc(&cm_core->node_cnt); + cm_node->conn_type = cm_info->conn_type; + cm_node->apbvt_set = 0; + cm_node->accept_pend = 0; + + cm_node->nesvnic = nesvnic; + /* get some device handles, for arp lookup */ + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + + cm_node->loopbackpartner = NULL; + + /* get the mac addr for the remote node */ + oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr, + NULL, NES_ARP_RESOLVE); + arpindex = nes_addr_resolve_neigh(nesvnic, cm_node->rem_addr, + oldarpindex); + if (arpindex < 0) { + kfree(cm_node); + return NULL; + } + + /* copy the mac addr to node context */ + memcpy(cm_node->rem_mac, nesadapter->arp_table[arpindex].mac_addr, ETH_ALEN); + nes_debug(NES_DBG_CM, "Remote mac addr from arp table: %pM\n", + cm_node->rem_mac); + + add_hte_node(cm_core, cm_node); + atomic_inc(&cm_nodes_created); + + return cm_node; +} + + +/** + * add_ref_cm_node - destroy an instance of a cm node + */ +static int add_ref_cm_node(struct nes_cm_node *cm_node) +{ + atomic_inc(&cm_node->ref_count); + return 0; +} + + +/** + * rem_ref_cm_node - destroy an instance of a cm node + */ +static int rem_ref_cm_node(struct nes_cm_core *cm_core, + struct nes_cm_node *cm_node) +{ + unsigned long flags; + struct nes_qp *nesqp; + + if (!cm_node) + return -EINVAL; + + spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags); + if (atomic_dec_return(&cm_node->ref_count)) { + spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); + return 0; + } + list_del(&cm_node->list); + atomic_dec(&cm_core->ht_node_cnt); + spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags); + + /* if the node is destroyed before connection was accelerated */ + if (!cm_node->accelerated && cm_node->accept_pend) { + BUG_ON(!cm_node->listener); + atomic_dec(&cm_node->listener->pend_accepts_cnt); + BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0); + } + WARN_ON(cm_node->send_entry); + if (cm_node->recv_entry) + handle_recv_entry(cm_node, 0); + if (cm_node->listener) { + mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0); + } else { + if (cm_node->apbvt_set && cm_node->nesvnic) { + nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port, + PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + } + nes_debug(NES_DBG_NLMSG, "Delete APBVT loc_port = %04X\n", + cm_node->loc_port); + } + + atomic_dec(&cm_core->node_cnt); + atomic_inc(&cm_nodes_destroyed); + nesqp = cm_node->nesqp; + if (nesqp) { + nesqp->cm_node = NULL; + nes_rem_ref(&nesqp->ibqp); + cm_node->nesqp = NULL; + } + + kfree(cm_node); + return 0; +} + +/** + * process_options + */ +static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc, + u32 optionsize, u32 syn_packet) +{ + u32 tmp; + u32 offset = 0; + union all_known_options *all_options; + char got_mss_option = 0; + + while (offset < optionsize) { + all_options = (union all_known_options *)(optionsloc + offset); + switch (all_options->as_base.optionnum) { + case OPTION_NUMBER_END: + offset = optionsize; + break; + case OPTION_NUMBER_NONE: + offset += 1; + continue; + case OPTION_NUMBER_MSS: + nes_debug(NES_DBG_CM, "%s: MSS Length: %d Offset: %d " + "Size: %d\n", __func__, + 
all_options->as_mss.length, offset, optionsize); + got_mss_option = 1; + if (all_options->as_mss.length != 4) { + return 1; + } else { + tmp = ntohs(all_options->as_mss.mss); + if (tmp > 0 && tmp < + cm_node->tcp_cntxt.mss) + cm_node->tcp_cntxt.mss = tmp; + } + break; + case OPTION_NUMBER_WINDOW_SCALE: + cm_node->tcp_cntxt.snd_wscale = + all_options->as_windowscale.shiftcount; + break; + default: + nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n", + all_options->as_base.optionnum); + break; + } + offset += all_options->as_base.length; + } + if ((!got_mss_option) && (syn_packet)) + cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS; + return 0; +} + +static void drop_packet(struct sk_buff *skb) +{ + atomic_inc(&cm_accel_dropped_pkts); + dev_kfree_skb_any(skb); +} + +static void handle_fin_pkt(struct nes_cm_node *cm_node) +{ + nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. " + "refcnt=%d\n", cm_node, cm_node->state, + atomic_read(&cm_node->ref_count)); + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_MPAREJ_RCVD: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_LAST_ACK; + send_fin(cm_node, NULL); + break; + case NES_CM_STATE_MPAREQ_SENT: + create_event(cm_node, NES_CM_EVENT_ABORTED); + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + add_ref_cm_node(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_FIN_WAIT1: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSING; + send_ack(cm_node, NULL); + /* Wait for ACK as this is simultaneous close.. + * After we receive ACK, do not send anything.. + * Just rm the node.. Done.. */ + break; + case NES_CM_STATE_FIN_WAIT2: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_TIME_WAIT; + send_ack(cm_node, NULL); + schedule_nes_timer(cm_node, NULL, NES_TIMER_TYPE_CLOSE, 1, 0); + break; + case NES_CM_STATE_TIME_WAIT: + cm_node->tcp_cntxt.rcv_nxt++; + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + rem_ref_cm_node(cm_node->cm_core, cm_node); + break; + case NES_CM_STATE_TSA: + default: + nes_debug(NES_DBG_CM, "Error Rcvd FIN for node-%p state = %d\n", + cm_node, cm_node->state); + break; + } +} + + +static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + + int reset = 0; /* whether to send reset in case of err.. */ + atomic_inc(&cm_resets_recvd); + nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u." 
+ " refcnt=%d\n", cm_node, cm_node->state, + atomic_read(&cm_node->ref_count)); + cleanup_retrans_entry(cm_node); + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_MPAREQ_SENT: + nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); + switch (cm_node->mpa_frame_rev) { + case IETF_MPA_V2: + cm_node->mpa_frame_rev = IETF_MPA_V1; + /* send a syn and goto syn sent state */ + cm_node->state = NES_CM_STATE_SYN_SENT; + if (send_syn(cm_node, 0, NULL)) { + active_open_err(cm_node, skb, reset); + } + break; + case IETF_MPA_V1: + default: + active_open_err(cm_node, skb, reset); + break; + } + break; + case NES_CM_STATE_MPAREQ_RCVD: + atomic_inc(&cm_node->passive_state); + dev_kfree_skb_any(skb); + break; + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_LISTENING: + nes_debug(NES_DBG_CM, "Bad state %s[%u]\n", __func__, __LINE__); + passive_open_err(cm_node, skb, reset); + break; + case NES_CM_STATE_TSA: + active_open_err(cm_node, skb, reset); + break; + case NES_CM_STATE_CLOSED: + drop_packet(skb); + break; + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_LAST_ACK: + cm_node->cm_id->rem_ref(cm_node->cm_id); + /* fall through */ + case NES_CM_STATE_TIME_WAIT: + cm_node->state = NES_CM_STATE_CLOSED; + rem_ref_cm_node(cm_node->cm_core, cm_node); + drop_packet(skb); + break; + default: + drop_packet(skb); + break; + } +} + + +static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + int ret = 0; + int datasize = skb->len; + u8 *dataloc = skb->data; + + enum nes_cm_event_type type = NES_CM_EVENT_UNKNOWN; + u32 res_type; + + ret = parse_mpa(cm_node, dataloc, &res_type, datasize); + if (ret) { + nes_debug(NES_DBG_CM, "didn't like MPA Request\n"); + if (cm_node->state == NES_CM_STATE_MPAREQ_SENT) { + nes_debug(NES_DBG_CM, "%s[%u] create abort for " + "cm_node=%p listener=%p state=%d\n", __func__, + __LINE__, cm_node, cm_node->listener, + cm_node->state); + active_open_err(cm_node, skb, 1); + } else { + passive_open_err(cm_node, skb, 1); + } + return; + } + + switch (cm_node->state) { + case NES_CM_STATE_ESTABLISHED: + if (res_type == NES_MPA_REQUEST_REJECT) + /*BIG problem as we are receiving the MPA.. So should + * not be REJECT.. This is Passive Open.. 
We can + * only receive it Reject for Active Open...*/ + WARN_ON(1); + cm_node->state = NES_CM_STATE_MPAREQ_RCVD; + type = NES_CM_EVENT_MPA_REQ; + atomic_set(&cm_node->passive_state, + NES_PASSIVE_STATE_INDICATED); + break; + case NES_CM_STATE_MPAREQ_SENT: + cleanup_retrans_entry(cm_node); + if (res_type == NES_MPA_REQUEST_REJECT) { + type = NES_CM_EVENT_MPA_REJECT; + cm_node->state = NES_CM_STATE_MPAREJ_RCVD; + } else { + type = NES_CM_EVENT_CONNECTED; + cm_node->state = NES_CM_STATE_TSA; + } + send_ack(cm_node, NULL); + break; + default: + WARN_ON(1); + break; + } + dev_kfree_skb_any(skb); + create_event(cm_node, type); +} + +static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb) +{ + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_MPAREQ_SENT: + nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); + active_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_SYN_RCVD: + passive_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_TSA: + default: + drop_packet(skb); + } +} + +static int check_syn(struct nes_cm_node *cm_node, struct tcphdr *tcph, + struct sk_buff *skb) +{ + int err; + + err = ((ntohl(tcph->ack_seq) == cm_node->tcp_cntxt.loc_seq_num)) ? 0 : 1; + if (err) + active_open_err(cm_node, skb, 1); + + return err; +} + +static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph, + struct sk_buff *skb) +{ + int err = 0; + u32 seq; + u32 ack_seq; + u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num; + u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt; + u32 rcv_wnd; + + seq = ntohl(tcph->seq); + ack_seq = ntohl(tcph->ack_seq); + rcv_wnd = cm_node->tcp_cntxt.rcv_wnd; + if (ack_seq != loc_seq_num) + err = 1; + else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd))) + err = 1; + if (err) { + nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p " + "listener=%p state=%d\n", __func__, __LINE__, cm_node, + cm_node->listener, cm_node->state); + indicate_pkt_err(cm_node, skb); + nes_debug(NES_DBG_CM, "seq ERROR cm_node =%p seq=0x%08X " + "rcv_nxt=0x%08X rcv_wnd=0x%x\n", cm_node, seq, rcv_nxt, + rcv_wnd); + } + return err; +} + +/* + * handle_syn_pkt() is for Passive node. The syn packet is received when a node + * is created with a listener or it may comein as rexmitted packet which in + * that case will be just dropped. 
+ */ +static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + int ret; + u32 inc_sequence; + int optionsize; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + skb_trim(skb, 0); + inc_sequence = ntohl(tcph->seq); + + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_MPAREQ_SENT: + /* Rcvd syn on active open connection*/ + active_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_LISTENING: + /* Passive OPEN */ + if (atomic_read(&cm_node->listener->pend_accepts_cnt) > + cm_node->listener->backlog) { + nes_debug(NES_DBG_CM, "drop syn due to backlog " + "pressure \n"); + cm_backlog_drops++; + passive_open_err(cm_node, skb, 0); + break; + } + ret = handle_tcp_options(cm_node, tcph, skb, optionsize, + 1); + if (ret) { + passive_open_err(cm_node, skb, 0); + /* drop pkt */ + break; + } + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1; + BUG_ON(cm_node->send_entry); + cm_node->accept_pend = 1; + atomic_inc(&cm_node->listener->pend_accepts_cnt); + + cm_node->state = NES_CM_STATE_SYN_RCVD; + send_syn(cm_node, 1, skb); + break; + case NES_CM_STATE_CLOSED: + cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + break; + case NES_CM_STATE_TSA: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_MPAREQ_RCVD: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_CLOSING: + case NES_CM_STATE_UNKNOWN: + default: + drop_packet(skb); + break; + } +} + +static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + int ret; + u32 inc_sequence; + int optionsize; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + skb_trim(skb, 0); + inc_sequence = ntohl(tcph->seq); + switch (cm_node->state) { + case NES_CM_STATE_SYN_SENT: + cleanup_retrans_entry(cm_node); + /* active open */ + if (check_syn(cm_node, tcph, skb)) + return; + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + /* setup options */ + ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 0); + if (ret) { + nes_debug(NES_DBG_CM, "cm_node=%p tcp_options failed\n", + cm_node); + break; + } + cleanup_retrans_entry(cm_node); + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1; + send_mpa_request(cm_node, skb); + cm_node->state = NES_CM_STATE_MPAREQ_SENT; + break; + case NES_CM_STATE_MPAREQ_RCVD: + /* passive open, so should not be here */ + passive_open_err(cm_node, skb, 1); + break; + case NES_CM_STATE_LISTENING: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + send_reset(cm_node, skb); + break; + case NES_CM_STATE_CLOSED: + cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq); + cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + break; + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_TSA: + case NES_CM_STATE_CLOSING: + case NES_CM_STATE_UNKNOWN: + case NES_CM_STATE_MPAREQ_SENT: + default: + drop_packet(skb); + break; + } +} + +static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct tcphdr *tcph) +{ + int datasize = 0; + u32 inc_sequence; + int ret = 0; + int optionsize; + + optionsize = (tcph->doff << 2) - sizeof(struct tcphdr); + + if (check_seq(cm_node, tcph, skb)) + return -EINVAL; + + skb_pull(skb, tcph->doff << 2); + inc_sequence = ntohl(tcph->seq); + datasize = 
skb->len; + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + /* Passive OPEN */ + cleanup_retrans_entry(cm_node); + ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 1); + if (ret) + break; + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + cm_node->state = NES_CM_STATE_ESTABLISHED; + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + handle_rcv_mpa(cm_node, skb); + } else { /* rcvd ACK only */ + dev_kfree_skb_any(skb); + } + break; + case NES_CM_STATE_ESTABLISHED: + /* Passive OPEN */ + cleanup_retrans_entry(cm_node); + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + handle_rcv_mpa(cm_node, skb); + } else { + drop_packet(skb); + } + break; + case NES_CM_STATE_MPAREQ_SENT: + cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq); + if (datasize) { + cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; + handle_rcv_mpa(cm_node, skb); + } else { /* Could be just an ack pkt.. */ + dev_kfree_skb_any(skb); + } + break; + case NES_CM_STATE_LISTENING: + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + send_reset(cm_node, skb); + break; + case NES_CM_STATE_CLOSED: + cleanup_retrans_entry(cm_node); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + break; + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_CLOSING: + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + cm_node->cm_id->rem_ref(cm_node->cm_id); + rem_ref_cm_node(cm_node->cm_core, cm_node); + drop_packet(skb); + break; + case NES_CM_STATE_FIN_WAIT1: + cleanup_retrans_entry(cm_node); + drop_packet(skb); + cm_node->state = NES_CM_STATE_FIN_WAIT2; + break; + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_TSA: + case NES_CM_STATE_MPAREQ_RCVD: + case NES_CM_STATE_UNKNOWN: + default: + cleanup_retrans_entry(cm_node); + drop_packet(skb); + break; + } + return ret; +} + + + +static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, + struct sk_buff *skb, int optionsize, int passive) +{ + u8 *optionsloc = (u8 *)&tcph[1]; + + if (optionsize) { + if (process_options(cm_node, optionsloc, optionsize, + (u32)tcph->syn)) { + nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n", + __func__, cm_node); + if (passive) + passive_open_err(cm_node, skb, 1); + else + active_open_err(cm_node, skb, 1); + return 1; + } + } + + cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) << + cm_node->tcp_cntxt.snd_wscale; + + if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd) + cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd; + return 0; +} + +/* + * active_open_err() will send reset() if flag set.. + * It will also send ABORT event. + */ +static void active_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, + int reset) +{ + cleanup_retrans_entry(cm_node); + if (reset) { + nes_debug(NES_DBG_CM, "ERROR active err called for cm_node=%p, " + "state=%d\n", cm_node, cm_node->state); + add_ref_cm_node(cm_node); + send_reset(cm_node, skb); + } else { + dev_kfree_skb_any(skb); + } + + cm_node->state = NES_CM_STATE_CLOSED; + create_event(cm_node, NES_CM_EVENT_ABORTED); +} + +/* + * passive_open_err() will either do a reset() or will free up the skb and + * remove the cm_node. 
+ */ +static void passive_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb, + int reset) +{ + cleanup_retrans_entry(cm_node); + cm_node->state = NES_CM_STATE_CLOSED; + if (reset) { + nes_debug(NES_DBG_CM, "passive_open_err sending RST for " + "cm_node=%p state =%d\n", cm_node, cm_node->state); + send_reset(cm_node, skb); + } else { + dev_kfree_skb_any(skb); + rem_ref_cm_node(cm_node->cm_core, cm_node); + } +} + +/* + * free_retrans_entry() routines assumes that the retrans_list_lock has + * been acquired before calling. + */ +static void free_retrans_entry(struct nes_cm_node *cm_node) +{ + struct nes_timer_entry *send_entry; + + send_entry = cm_node->send_entry; + if (send_entry) { + cm_node->send_entry = NULL; + dev_kfree_skb_any(send_entry->skb); + kfree(send_entry); + rem_ref_cm_node(cm_node->cm_core, cm_node); + } +} + +static void cleanup_retrans_entry(struct nes_cm_node *cm_node) +{ + unsigned long flags; + + spin_lock_irqsave(&cm_node->retrans_list_lock, flags); + free_retrans_entry(cm_node); + spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); +} + +/** + * process_packet + * Returns skb if to be freed, else it will return NULL if already used.. + */ +static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, + struct nes_cm_core *cm_core) +{ + enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN; + struct tcphdr *tcph = tcp_hdr(skb); + u32 fin_set = 0; + int ret = 0; + + skb_pull(skb, ip_hdr(skb)->ihl << 2); + + nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d " + "ack=%d rst=%d fin=%d\n", cm_node, cm_node->state, tcph->syn, + tcph->ack, tcph->rst, tcph->fin); + + if (tcph->rst) { + pkt_type = NES_PKT_TYPE_RST; + } else if (tcph->syn) { + pkt_type = NES_PKT_TYPE_SYN; + if (tcph->ack) + pkt_type = NES_PKT_TYPE_SYNACK; + } else if (tcph->ack) { + pkt_type = NES_PKT_TYPE_ACK; + } + if (tcph->fin) + fin_set = 1; + + switch (pkt_type) { + case NES_PKT_TYPE_SYN: + handle_syn_pkt(cm_node, skb, tcph); + break; + case NES_PKT_TYPE_SYNACK: + handle_synack_pkt(cm_node, skb, tcph); + break; + case NES_PKT_TYPE_ACK: + ret = handle_ack_pkt(cm_node, skb, tcph); + if (fin_set && !ret) + handle_fin_pkt(cm_node); + break; + case NES_PKT_TYPE_RST: + handle_rst_pkt(cm_node, skb, tcph); + break; + default: + if ((fin_set) && (!check_seq(cm_node, tcph, skb))) + handle_fin_pkt(cm_node); + drop_packet(skb); + break; + } +} + +/** + * mini_cm_listen - create a listen node with params + */ +static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) +{ + struct nes_cm_listener *listener; + unsigned long flags; + + nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n", + cm_info->loc_addr, cm_info->loc_port); + + /* cannot have multiple matching listeners */ + listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port, + NES_CM_LISTENER_EITHER_STATE); + + if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) { + /* find automatically incs ref count ??? 
*/ + atomic_dec(&listener->ref_count); + nes_debug(NES_DBG_CM, "Not creating listener since it already exists\n"); + return NULL; + } + + if (!listener) { + /* create a CM listen node (1/2 node to compare incoming traffic to) */ + listener = kzalloc(sizeof(*listener), GFP_ATOMIC); + if (!listener) + return NULL; + + listener->loc_addr = cm_info->loc_addr; + listener->loc_port = cm_info->loc_port; + listener->reused_node = 0; + + atomic_set(&listener->ref_count, 1); + } + /* pasive case */ + /* find already inc'ed the ref count */ + else { + listener->reused_node = 1; + } + + listener->cm_id = cm_info->cm_id; + atomic_set(&listener->pend_accepts_cnt, 0); + listener->cm_core = cm_core; + listener->nesvnic = nesvnic; + atomic_inc(&cm_core->node_cnt); + + listener->conn_type = cm_info->conn_type; + listener->backlog = cm_info->backlog; + listener->listener_state = NES_CM_LISTENER_ACTIVE_STATE; + + if (!listener->reused_node) { + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_add(&listener->list, &cm_core->listen_list.list); + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + atomic_inc(&cm_core->listen_node_cnt); + } + + nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x," + " listener = %p, backlog = %d, cm_id = %p.\n", + cm_info->loc_addr, cm_info->loc_port, + listener, listener->backlog, listener->cm_id); + + return listener; +} + + +/** + * mini_cm_connect - make a connection node with params + */ +static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, u16 private_data_len, + void *private_data, struct nes_cm_info *cm_info) +{ + int ret = 0; + struct nes_cm_node *cm_node; + struct nes_cm_listener *loopbackremotelistener; + struct nes_cm_node *loopbackremotenode; + struct nes_cm_info loopback_cm_info; + u8 *start_buff; + + /* create a CM connection node */ + cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL); + if (!cm_node) + return NULL; + + /* set our node side to client (active) side */ + cm_node->tcp_cntxt.client = 1; + cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; + + if (cm_info->loc_addr == cm_info->rem_addr) { + loopbackremotelistener = find_listener(cm_core, + cm_node->loc_addr, cm_node->rem_port, + NES_CM_LISTENER_ACTIVE_STATE); + if (loopbackremotelistener == NULL) { + create_event(cm_node, NES_CM_EVENT_ABORTED); + } else { + loopback_cm_info = *cm_info; + loopback_cm_info.loc_port = cm_info->rem_port; + loopback_cm_info.rem_port = cm_info->loc_port; + loopback_cm_info.loc_port = + cm_info->rem_port; + loopback_cm_info.rem_port = + cm_info->loc_port; + loopback_cm_info.cm_id = loopbackremotelistener->cm_id; + loopbackremotenode = make_cm_node(cm_core, nesvnic, + &loopback_cm_info, loopbackremotelistener); + if (!loopbackremotenode) { + rem_ref_cm_node(cm_node->cm_core, cm_node); + return NULL; + } + atomic_inc(&cm_loopbacks); + loopbackremotenode->loopbackpartner = cm_node; + loopbackremotenode->tcp_cntxt.rcv_wscale = + NES_CM_DEFAULT_RCV_WND_SCALE; + cm_node->loopbackpartner = loopbackremotenode; + memcpy(loopbackremotenode->mpa_frame_buf, private_data, + private_data_len); + loopbackremotenode->mpa_frame_size = private_data_len; + + /* we are done handling this state. 
*/ + /* set node to a TSA state */ + cm_node->state = NES_CM_STATE_TSA; + cm_node->tcp_cntxt.rcv_nxt = + loopbackremotenode->tcp_cntxt.loc_seq_num; + loopbackremotenode->tcp_cntxt.rcv_nxt = + cm_node->tcp_cntxt.loc_seq_num; + cm_node->tcp_cntxt.max_snd_wnd = + loopbackremotenode->tcp_cntxt.rcv_wnd; + loopbackremotenode->tcp_cntxt.max_snd_wnd = + cm_node->tcp_cntxt.rcv_wnd; + cm_node->tcp_cntxt.snd_wnd = + loopbackremotenode->tcp_cntxt.rcv_wnd; + loopbackremotenode->tcp_cntxt.snd_wnd = + cm_node->tcp_cntxt.rcv_wnd; + cm_node->tcp_cntxt.snd_wscale = + loopbackremotenode->tcp_cntxt.rcv_wscale; + loopbackremotenode->tcp_cntxt.snd_wscale = + cm_node->tcp_cntxt.rcv_wscale; + loopbackremotenode->state = NES_CM_STATE_MPAREQ_RCVD; + create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ); + } + return cm_node; + } + + start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2); + cm_node->mpa_frame_size = private_data_len; + + memcpy(start_buff, private_data, private_data_len); + + /* send a syn and goto syn sent state */ + cm_node->state = NES_CM_STATE_SYN_SENT; + ret = send_syn(cm_node, 0, NULL); + + if (ret) { + /* error in sending the syn free up the cm_node struct */ + nes_debug(NES_DBG_CM, "Api - connect() FAILED: dest " + "addr=0x%08X, port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_addr, cm_node->rem_port, cm_node, + cm_node->cm_id); + rem_ref_cm_node(cm_node->cm_core, cm_node); + cm_node = NULL; + } + + if (cm_node) { + nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X," + "port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_addr, cm_node->rem_port, cm_node, + cm_node->cm_id); + } + + return cm_node; +} + + +/** + * mini_cm_accept - accept a connection + * This function is never called + */ +static int mini_cm_accept(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + return 0; +} + + +/** + * mini_cm_reject - reject and teardown a connection + */ +static int mini_cm_reject(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + int ret = 0; + int err = 0; + int passive_state; + struct nes_cm_event event; + struct iw_cm_id *cm_id = cm_node->cm_id; + struct nes_cm_node *loopback = cm_node->loopbackpartner; + + nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n", + __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state); + + if (cm_node->tcp_cntxt.client) + return ret; + cleanup_retrans_entry(cm_node); + + if (!loopback) { + passive_state = atomic_add_return(1, &cm_node->passive_state); + if (passive_state == NES_SEND_RESET_EVENT) { + cm_node->state = NES_CM_STATE_CLOSED; + rem_ref_cm_node(cm_core, cm_node); + } else { + if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { + rem_ref_cm_node(cm_core, cm_node); + } else { + ret = send_mpa_reject(cm_node); + if (ret) { + cm_node->state = NES_CM_STATE_CLOSED; + err = send_reset(cm_node, NULL); + if (err) + WARN_ON(1); + } else { + cm_id->add_ref(cm_id); + } + } + } + } else { + cm_node->cm_id = NULL; + if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) { + rem_ref_cm_node(cm_core, cm_node); + rem_ref_cm_node(cm_core, loopback); + } else { + event.cm_node = loopback; + event.cm_info.rem_addr = loopback->rem_addr; + event.cm_info.loc_addr = loopback->loc_addr; + event.cm_info.rem_port = loopback->rem_port; + event.cm_info.loc_port = loopback->loc_port; + event.cm_info.cm_id = loopback->cm_id; + cm_event_mpa_reject(&event); + rem_ref_cm_node(cm_core, cm_node); + loopback->state = NES_CM_STATE_CLOSING; + + cm_id = loopback->cm_id; + rem_ref_cm_node(cm_core, loopback); + 
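[Illustrative aside, not part of the patch.] In the loopback case handled by mini_cm_connect() above, no packets ever reach the wire, so the two cm_nodes simply mirror each other's TCP context: each side's rcv_nxt becomes the partner's loc_seq_num, and the send windows and scale factors are cross-copied from the partner's receive side. A standalone sketch of that cross-linking, with simplified field names standing in for the driver's tcp_cntxt members and arbitrary example values:

#include <stdint.h>
#include <stdio.h>

struct tcp_ctx {
	uint32_t loc_seq_num;	/* next sequence number this side will send */
	uint32_t rcv_nxt;	/* next sequence number this side expects */
	uint32_t rcv_wnd;	/* receive window this side advertises */
	uint32_t snd_wnd;	/* send window, i.e. the peer's receive window */
	uint8_t  rcv_wscale;
	uint8_t  snd_wscale;
};

/* Mirror two loopback endpoints: A receives exactly what B sends and may
 * send exactly as much as B is willing to receive, and vice versa. */
static void crosslink_loopback(struct tcp_ctx *a, struct tcp_ctx *b)
{
	a->rcv_nxt    = b->loc_seq_num;
	b->rcv_nxt    = a->loc_seq_num;
	a->snd_wnd    = b->rcv_wnd;
	b->snd_wnd    = a->rcv_wnd;
	a->snd_wscale = b->rcv_wscale;
	b->snd_wscale = a->rcv_wscale;
}

int main(void)
{
	struct tcp_ctx active  = { .loc_seq_num = 1000, .rcv_wnd = 65535, .rcv_wscale = 2 };
	struct tcp_ctx passive = { .loc_seq_num = 9000, .rcv_wnd = 32768, .rcv_wscale = 1 };

	crosslink_loopback(&active, &passive);
	printf("active : rcv_nxt=%u snd_wnd=%u\n",
	       (unsigned)active.rcv_nxt, (unsigned)active.snd_wnd);
	printf("passive: rcv_nxt=%u snd_wnd=%u\n",
	       (unsigned)passive.rcv_nxt, (unsigned)passive.snd_wnd);
	return 0;
}

Once the contexts are cross-linked, the active node can be moved straight to the TSA state and the passive node handed an MPA request event, which is what the driver does next.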
cm_id->rem_ref(cm_id); + } + } + + return ret; +} + + +/** + * mini_cm_close + */ +static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) +{ + int ret = 0; + + if (!cm_core || !cm_node) + return -EINVAL; + + switch (cm_node->state) { + case NES_CM_STATE_SYN_RCVD: + case NES_CM_STATE_SYN_SENT: + case NES_CM_STATE_ONE_SIDE_ESTABLISHED: + case NES_CM_STATE_ESTABLISHED: + case NES_CM_STATE_ACCEPTING: + case NES_CM_STATE_MPAREQ_SENT: + case NES_CM_STATE_MPAREQ_RCVD: + cleanup_retrans_entry(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_CLOSE_WAIT: + cm_node->state = NES_CM_STATE_LAST_ACK; + send_fin(cm_node, NULL); + break; + case NES_CM_STATE_FIN_WAIT1: + case NES_CM_STATE_FIN_WAIT2: + case NES_CM_STATE_LAST_ACK: + case NES_CM_STATE_TIME_WAIT: + case NES_CM_STATE_CLOSING: + ret = -1; + break; + case NES_CM_STATE_LISTENING: + cleanup_retrans_entry(cm_node); + send_reset(cm_node, NULL); + break; + case NES_CM_STATE_MPAREJ_RCVD: + case NES_CM_STATE_UNKNOWN: + case NES_CM_STATE_INITED: + case NES_CM_STATE_CLOSED: + case NES_CM_STATE_LISTENER_DESTROYED: + ret = rem_ref_cm_node(cm_core, cm_node); + break; + case NES_CM_STATE_TSA: + if (cm_node->send_entry) + printk(KERN_ERR "ERROR Close got called from STATE_TSA " + "send_entry=%p\n", cm_node->send_entry); + ret = rem_ref_cm_node(cm_core, cm_node); + break; + } + return ret; +} + + +/** + * recv_pkt - recv an ETHERNET packet, and process it through CM + * node state machine + */ +static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, + struct nes_vnic *nesvnic, struct sk_buff *skb) +{ + struct nes_cm_node *cm_node = NULL; + struct nes_cm_listener *listener = NULL; + struct iphdr *iph; + struct tcphdr *tcph; + struct nes_cm_info nfo; + int skb_handled = 1; + __be32 tmp_daddr, tmp_saddr; + + if (!skb) + return 0; + if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr)) + return 0; + + iph = (struct iphdr *)skb->data; + tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr)); + + nfo.loc_addr = ntohl(iph->daddr); + nfo.loc_port = ntohs(tcph->dest); + nfo.rem_addr = ntohl(iph->saddr); + nfo.rem_port = ntohs(tcph->source); + + tmp_daddr = cpu_to_be32(iph->daddr); + tmp_saddr = cpu_to_be32(iph->saddr); + + nes_debug(NES_DBG_CM, "Received packet: dest=%pI4:0x%04X src=%pI4:0x%04X\n", + &tmp_daddr, tcph->dest, &tmp_saddr, tcph->source); + + do { + cm_node = find_node(cm_core, + nfo.rem_port, nfo.rem_addr, + nfo.loc_port, nfo.loc_addr); + + if (!cm_node) { + /* Only type of packet accepted are for */ + /* the PASSIVE open (syn only) */ + if ((!tcph->syn) || (tcph->ack)) { + skb_handled = 0; + break; + } + listener = find_listener(cm_core, nfo.loc_addr, + nfo.loc_port, + NES_CM_LISTENER_ACTIVE_STATE); + if (!listener) { + nfo.cm_id = NULL; + nfo.conn_type = 0; + nes_debug(NES_DBG_CM, "Unable to find listener for the pkt\n"); + skb_handled = 0; + break; + } + nfo.cm_id = listener->cm_id; + nfo.conn_type = listener->conn_type; + cm_node = make_cm_node(cm_core, nesvnic, &nfo, + listener); + if (!cm_node) { + nes_debug(NES_DBG_CM, "Unable to allocate " + "node\n"); + cm_packets_dropped++; + atomic_dec(&listener->ref_count); + dev_kfree_skb_any(skb); + break; + } + if (!tcph->rst && !tcph->fin) { + cm_node->state = NES_CM_STATE_LISTENING; + } else { + cm_packets_dropped++; + rem_ref_cm_node(cm_core, cm_node); + dev_kfree_skb_any(skb); + break; + } + add_ref_cm_node(cm_node); + } else if (cm_node->state == NES_CM_STATE_TSA) { + if (cm_node->nesqp->pau_mode) + nes_queue_mgt_skbs(skb, nesvnic, 
cm_node->nesqp); + else { + rem_ref_cm_node(cm_core, cm_node); + atomic_inc(&cm_accel_dropped_pkts); + dev_kfree_skb_any(skb); + } + break; + } + skb_reset_network_header(skb); + skb_set_transport_header(skb, sizeof(*tcph)); + skb->len = ntohs(iph->tot_len); + process_packet(cm_node, skb, cm_core); + rem_ref_cm_node(cm_core, cm_node); + } while (0); + return skb_handled; +} + + +/** + * nes_cm_alloc_core - allocate a top level instance of a cm core + */ +static struct nes_cm_core *nes_cm_alloc_core(void) +{ + struct nes_cm_core *cm_core; + + /* setup the CM core */ + /* alloc top level core control structure */ + cm_core = kzalloc(sizeof(*cm_core), GFP_KERNEL); + if (!cm_core) + return NULL; + + INIT_LIST_HEAD(&cm_core->connected_nodes); + timer_setup(&cm_core->tcp_timer, nes_cm_timer_tick, 0); + + cm_core->mtu = NES_CM_DEFAULT_MTU; + cm_core->state = NES_CM_STATE_INITED; + cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS; + + atomic_set(&cm_core->events_posted, 0); + + cm_core->api = &nes_cm_api; + + spin_lock_init(&cm_core->ht_lock); + spin_lock_init(&cm_core->listen_list_lock); + + INIT_LIST_HEAD(&cm_core->listen_list.list); + + nes_debug(NES_DBG_CM, "Init CM Core completed -- cm_core=%p\n", cm_core); + + nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n"); + cm_core->event_wq = alloc_ordered_workqueue("nesewq", 0); + if (!cm_core->event_wq) + goto out_free_cmcore; + cm_core->post_event = nes_cm_post_event; + nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n"); + cm_core->disconn_wq = alloc_ordered_workqueue("nesdwq", 0); + if (!cm_core->disconn_wq) + goto out_free_wq; + + print_core(cm_core); + return cm_core; + +out_free_wq: + destroy_workqueue(cm_core->event_wq); +out_free_cmcore: + kfree(cm_core); + return NULL; +} + + +/** + * mini_cm_dealloc_core - deallocate a top level instance of a cm core + */ +static int mini_cm_dealloc_core(struct nes_cm_core *cm_core) +{ + nes_debug(NES_DBG_CM, "De-Alloc CM Core (%p)\n", cm_core); + + if (!cm_core) + return -EINVAL; + + barrier(); + + if (timer_pending(&cm_core->tcp_timer)) + del_timer(&cm_core->tcp_timer); + + destroy_workqueue(cm_core->event_wq); + destroy_workqueue(cm_core->disconn_wq); + nes_debug(NES_DBG_CM, "\n"); + kfree(cm_core); + + return 0; +} + + +/** + * mini_cm_get + */ +static int mini_cm_get(struct nes_cm_core *cm_core) +{ + return cm_core->state; +} + + +/** + * mini_cm_set + */ +static int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value) +{ + int ret = 0; + + switch (type) { + case NES_CM_SET_PKT_SIZE: + cm_core->mtu = value; + break; + case NES_CM_SET_FREE_PKT_Q_SIZE: + cm_core->free_tx_pkt_max = value; + break; + default: + /* unknown set option */ + ret = -EINVAL; + } + + return ret; +} + + +/** + * nes_cm_init_tsa_conn setup HW; MPA frames must be + * successfully exchanged when this is called + */ +static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_node) +{ + int ret = 0; + + if (!nesqp) + return -EINVAL; + + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 | + NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG | + NES_QPCONTEXT_MISC_DROS); + + if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale) + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE); + + nesqp->nesqp_context->misc2 |= cpu_to_le32(64 << NES_QPCONTEXT_MISC2_TTL_SHIFT); + + nesqp->nesqp_context->misc2 |= cpu_to_le32( + cm_node->tos << NES_QPCONTEXT_MISC2_TOS_SHIFT); + + nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16); + + 
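[Illustrative aside, not part of the patch.] nes_cm_init_tsa_conn() loads the negotiated TCP state into the hardware QP context by shifting each value into its bit field, masking it, and byte-swapping the word with cpu_to_le32(); the pd_index_wscale updates just below pack the send and receive window-scale factors exactly this way. A small sketch of that shift-and-mask packing; the shift and mask constants here are invented for the example and do not reproduce the driver's NES_QPCONTEXT_PDWSCALE_* definitions:

#include <stdint.h>
#include <stdio.h>

/* Example-only field layout, not the driver's real one. */
#define EX_SND_WSCALE_SHIFT	8
#define EX_SND_WSCALE_MASK	0x00000f00u
#define EX_RCV_WSCALE_SHIFT	12
#define EX_RCV_WSCALE_MASK	0x0000f000u

static uint32_t pack_wscales(uint32_t word, uint8_t snd_wscale, uint8_t rcv_wscale)
{
	word |= ((uint32_t)snd_wscale << EX_SND_WSCALE_SHIFT) & EX_SND_WSCALE_MASK;
	word |= ((uint32_t)rcv_wscale << EX_RCV_WSCALE_SHIFT) & EX_RCV_WSCALE_MASK;
	return word;	/* the driver additionally converts the result with cpu_to_le32() */
}

int main(void)
{
	uint32_t ctx_word = pack_wscales(0, 2, 14);

	printf("packed context word = 0x%08x\n", (unsigned)ctx_word);
	return 0;
}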
nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32( + (u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( + (cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) & + NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32( + (cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) & + NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK); + + nesqp->nesqp_context->keepalive = cpu_to_le32(0x80); + nesqp->nesqp_context->ts_recent = 0; + nesqp->nesqp_context->ts_age = 0; + nesqp->nesqp_context->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd); + nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); + nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd << + cm_node->tcp_cntxt.rcv_wscale); + nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->srtt = 0; + nesqp->nesqp_context->rttvar = cpu_to_le32(0x6); + nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000); + nesqp->nesqp_context->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss); + nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt); + nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num); + nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd); + + nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X," + " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n", + nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt), + le32_to_cpu(nesqp->nesqp_context->snd_nxt), + cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale), + le32_to_cpu(nesqp->nesqp_context->rcv_wnd), + le32_to_cpu(nesqp->nesqp_context->misc)); + nes_debug(NES_DBG_CM, " snd_wnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd)); + nes_debug(NES_DBG_CM, " snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd)); + nes_debug(NES_DBG_CM, " max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd)); + + nes_debug(NES_DBG_CM, "Change cm_node state to TSA\n"); + cm_node->state = NES_CM_STATE_TSA; + + return ret; +} + + +/** + * nes_cm_disconn + */ +int nes_cm_disconn(struct nes_qp *nesqp) +{ + struct disconn_work *work; + + work = kzalloc(sizeof *work, GFP_ATOMIC); + if (!work) + return -ENOMEM; /* Timer will clean up */ + + nes_add_ref(&nesqp->ibqp); + work->nesqp = nesqp; + INIT_WORK(&work->work, nes_disconnect_worker); + queue_work(g_cm_core->disconn_wq, &work->work); + return 0; +} + + +/** + * nes_disconnect_worker + */ +static void nes_disconnect_worker(struct work_struct *work) +{ + struct disconn_work *dwork = container_of(work, struct disconn_work, work); + struct nes_qp *nesqp = dwork->nesqp; + + kfree(dwork); + nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n", + nesqp->last_aeq, nesqp->hwqp.qp_id); + nes_cm_disconn_true(nesqp); + nes_rem_ref(&nesqp->ibqp); +} + + +/** + * nes_cm_disconn_true + */ +static int nes_cm_disconn_true(struct nes_qp *nesqp) +{ + unsigned long flags; + int ret = 0; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_vnic *nesvnic; + u16 last_ae; + u8 original_hw_tcp_state; + u8 original_ibqp_state; + int disconn_status = 0; + int issue_disconn = 0; + int issue_close = 0; + 
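[Illustrative aside, not part of the patch.] The state inspection that follows in nes_cm_disconn_true() boils down to three independent decisions: whether to surface a disconnect event, whether to surface the final close event, and whether the QP's work queues still need to be flushed. A simplified pure-function sketch of that derivation; the enum and field names are placeholders, and the terminate-timer path and the IB_QPS_RTS check are deliberately left out:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder hardware TCP states; the driver checks NES_AEQE_TCP_STATE_*
 * values reported through the asynchronous event queue. */
enum hw_tcp_state { HW_TCP_EST, HW_TCP_CLOSE_WAIT, HW_TCP_CLOSED, HW_TCP_TIME_WAIT };

struct disconn_decision {
	bool issue_disconn;	/* report IW_CM_EVENT_DISCONNECT to the consumer */
	bool issue_close;	/* report IW_CM_EVENT_CLOSE and drop the cm_id */
	bool issue_flush;	/* flush the QP's work queues, at most once */
};

static struct disconn_decision decide(enum hw_tcp_state state, bool conn_reset,
				      bool flush_already_issued)
{
	struct disconn_decision d = { false, false, false };

	if (state == HW_TCP_CLOSE_WAIT || conn_reset)
		d.issue_disconn = true;
	if (state == HW_TCP_CLOSED || state == HW_TCP_TIME_WAIT || conn_reset) {
		d.issue_close = true;
		d.issue_flush = !flush_already_issued;
	}
	return d;
}

int main(void)
{
	struct disconn_decision d = decide(HW_TCP_CLOSED, false, false);

	printf("disconn=%d close=%d flush=%d\n",
	       d.issue_disconn, d.issue_close, d.issue_flush);
	return 0;
}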
int issue_flush = 0; + u32 flush_q = NES_CQP_FLUSH_RQ; + struct ib_event ibevent; + + if (!nesqp) { + nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n"); + return -1; + } + + spin_lock_irqsave(&nesqp->lock, flags); + cm_id = nesqp->cm_id; + /* make sure we havent already closed this connection */ + if (!cm_id) { + nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n", + nesqp->hwqp.qp_id); + spin_unlock_irqrestore(&nesqp->lock, flags); + return -1; + } + + nesvnic = to_nesvnic(nesqp->ibqp.device); + nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id); + + original_hw_tcp_state = nesqp->hw_tcp_state; + original_ibqp_state = nesqp->ibqp_state; + last_ae = nesqp->last_aeq; + + if (nesqp->term_flags) { + issue_disconn = 1; + issue_close = 1; + nesqp->cm_id = NULL; + del_timer(&nesqp->terminate_timer); + if (nesqp->flush_issued == 0) { + nesqp->flush_issued = 1; + issue_flush = 1; + } + } else if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || + ((original_ibqp_state == IB_QPS_RTS) && + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + issue_disconn = 1; + if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) + disconn_status = -ECONNRESET; + } + + if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) || + (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) || + (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) || + (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { + issue_close = 1; + nesqp->cm_id = NULL; + if (nesqp->flush_issued == 0) { + nesqp->flush_issued = 1; + issue_flush = 1; + } + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + + if ((issue_flush) && (nesqp->destroyed == 0)) { + /* Flush the queue(s) */ + if (nesqp->hw_iwarp_state >= NES_AEQE_IWARP_STATE_TERMINATE) + flush_q |= NES_CQP_FLUSH_SQ; + flush_wqes(nesvnic->nesdev, nesqp, flush_q, 1); + + if (nesqp->term_flags) { + ibevent.device = nesqp->ibqp.device; + ibevent.event = nesqp->terminate_eventtype; + ibevent.element.qp = &nesqp->ibqp; + if (nesqp->ibqp.event_handler) + nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context); + } + } + + if ((cm_id) && (cm_id->event_handler)) { + if (issue_disconn) { + atomic_inc(&cm_disconnects); + cm_event.event = IW_CM_EVENT_DISCONNECT; + cm_event.status = disconn_status; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event" + " for QP%u, SQ Head = %u, SQ Tail = %u. 
" + "cm_id = %p, refcount = %u.\n", + nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, + nesqp->hwqp.sq_tail, cm_id, + atomic_read(&nesqp->refcount)); + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_CM, "OFA CM event_handler " + "returned, ret=%d\n", ret); + } + + if (issue_close) { + atomic_inc(&cm_closes); + nes_disconnect(nesqp, 1); + + cm_id->provider_data = nesqp; + /* Send up the close complete event */ + cm_event.event = IW_CM_EVENT_CLOSE; + cm_event.status = 0; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + cm_id->rem_ref(cm_id); + } + } + + return 0; +} + + +/** + * nes_disconnect + */ +static int nes_disconnect(struct nes_qp *nesqp, int abrupt) +{ + int ret = 0; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_ib_device *nesibdev; + + nesvnic = to_nesvnic(nesqp->ibqp.device); + if (!nesvnic) + return -EINVAL; + + nesdev = nesvnic->nesdev; + nesibdev = nesvnic->nesibdev; + + nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", + netdev_refcnt_read(nesvnic->netdev)); + + if (nesqp->active_conn) { + + /* indicate this connection is NOT active */ + nesqp->active_conn = 0; + } else { + /* Need to free the Last Streaming Mode Message */ + if (nesqp->ietf_frame) { + if (nesqp->lsmm_mr) + nesibdev->ibdev.dereg_mr(nesqp->lsmm_mr); + pci_free_consistent(nesdev->pcidev, + nesqp->private_data_len + nesqp->ietf_frame_size, + nesqp->ietf_frame, nesqp->ietf_frame_pbase); + } + } + + /* close the CM node down if it is still active */ + if (nesqp->cm_node) { + nes_debug(NES_DBG_CM, "Call close API\n"); + + g_cm_core->api->close(g_cm_core, nesqp->cm_node); + } + + return ret; +} + + +/** + * nes_accept + */ +int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + u64 u64temp; + struct ib_qp *ibqp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_adapter *adapter; + struct ib_qp_attr attr; + struct iw_cm_event cm_event; + struct nes_hw_qp_wqe *wqe; + struct nes_v4_quad nes_quad; + u32 crc_value; + int ret; + int passive_state; + struct ib_mr *ibmr = NULL; + struct nes_pd *nespd; + u64 tagged_offset; + u8 mpa_frame_offset = 0; + struct ietf_mpa_v2 *mpa_v2_frame; + u8 start_addr = 0; + u8 *start_ptr = &start_addr; + u8 **start_buff = &start_ptr; + u16 buff_len = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; + + ibqp = nes_get_qp(cm_id->device, conn_param->qpn); + if (!ibqp) + return -EINVAL; + + /* get all our handles */ + nesqp = to_nesqp(ibqp); + nesvnic = to_nesvnic(nesqp->ibqp.device); + nesdev = nesvnic->nesdev; + adapter = nesdev->nesadapter; + + cm_node = (struct nes_cm_node *)cm_id->provider_data; + nes_debug(NES_DBG_CM, "nes_accept: cm_node= %p nesvnic=%p, netdev=%p," + "%s\n", cm_node, nesvnic, nesvnic->netdev, + nesvnic->netdev->name); + + if (NES_CM_STATE_LISTENER_DESTROYED == cm_node->state) { + if (cm_node->loopbackpartner) + rem_ref_cm_node(cm_node->cm_core, cm_node->loopbackpartner); + rem_ref_cm_node(cm_node->cm_core, cm_node); + return -EINVAL; + } + + passive_state = atomic_add_return(1, &cm_node->passive_state); + if (passive_state == 
NES_SEND_RESET_EVENT) { + rem_ref_cm_node(cm_node->cm_core, cm_node); + return -ECONNRESET; + } + /* associate the node with the QP */ + nesqp->cm_node = (void *)cm_node; + cm_node->nesqp = nesqp; + + + nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n", + nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener); + atomic_inc(&cm_accepts); + + nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", + netdev_refcnt_read(nesvnic->netdev)); + + nesqp->ietf_frame_size = sizeof(struct ietf_mpa_v2); + /* allocate the ietf frame and space for private data */ + nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, + nesqp->ietf_frame_size + conn_param->private_data_len, + &nesqp->ietf_frame_pbase); + + if (!nesqp->ietf_frame) { + nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n"); + return -ENOMEM; + } + mpa_v2_frame = (struct ietf_mpa_v2 *)nesqp->ietf_frame; + + if (cm_node->mpa_frame_rev == IETF_MPA_V1) + mpa_frame_offset = 4; + + if (cm_node->mpa_frame_rev == IETF_MPA_V1 || + cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { + record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + } + + memcpy(mpa_v2_frame->priv_data, conn_param->private_data, + conn_param->private_data_len); + + cm_build_mpa_frame(cm_node, start_buff, &buff_len, nesqp->ietf_frame, MPA_KEY_REPLY); + nesqp->private_data_len = conn_param->private_data_len; + + /* setup our first outgoing iWarp send WQE (the IETF frame response) */ + wqe = &nesqp->hwqp.sq_vbase[0]; + + if (raddr->sin_addr.s_addr != laddr->sin_addr.s_addr) { + u64temp = (unsigned long)nesqp; + nespd = nesqp->nespd; + tagged_offset = (u64)(unsigned long)*start_buff; + ibmr = nes_reg_phys_mr(&nespd->ibpd, + nesqp->ietf_frame_pbase + mpa_frame_offset, + buff_len, IB_ACCESS_LOCAL_WRITE, + &tagged_offset); + if (IS_ERR(ibmr)) { + nes_debug(NES_DBG_CM, "Unable to register memory region" + "for lSMM for cm_node = %p \n", + cm_node); + pci_free_consistent(nesdev->pcidev, + nesqp->private_data_len + nesqp->ietf_frame_size, + nesqp->ietf_frame, nesqp->ietf_frame_pbase); + return PTR_ERR(ibmr); + } + + ibmr->pd = &nespd->ibpd; + ibmr->device = nespd->ibpd.device; + nesqp->lsmm_mr = ibmr; + + u64temp |= NES_SW_CONTEXT_ALIGN >> 1; + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, + u64temp); + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING | + NES_IWARP_SQ_WQE_WRPDU); + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = + cpu_to_le32(buff_len); + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, + (u64)(unsigned long)(*start_buff)); + wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = + cpu_to_le32(buff_len); + wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey; + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT | + NES_QPCONTEXT_ORDIRD_WRPDU); + } else { + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32(NES_QPCONTEXT_ORDIRD_WRPDU); + } + nesqp->skip_lsmm = 1; + + /* Cache the cm_id in the qp */ + nesqp->cm_id = cm_id; + cm_node->cm_id = cm_id; + + /* nesqp->cm_node = (void *)cm_id->provider_data; */ + cm_id->provider_data = nesqp; + nesqp->active_conn = 0; + + if (cm_node->state == NES_CM_STATE_TSA) + nes_debug(NES_DBG_CM, "Already state = TSA for cm_node=%p\n", + cm_node); + + nes_cm_init_tsa_conn(nesqp, cm_node); + + nesqp->nesqp_context->tcpPorts[0] = + cpu_to_le16(cm_node->loc_port); + nesqp->nesqp_context->tcpPorts[1] = + 
cpu_to_le16(cm_node->rem_port); + + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->rem_addr); + + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); + + nesqp->nesqp_context->arp_index_vlan |= + cpu_to_le32(nes_arp_table(nesdev, + le32_to_cpu(nesqp->nesqp_context->ip0), NULL, + NES_ARP_RESOLVE) << 16); + + nesqp->nesqp_context->ts_val_delta = cpu_to_le32( + jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); + + nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id); + + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32( + ((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT)); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)cm_node->ord_size); + + memset(&nes_quad, 0, sizeof(nes_quad)); + nes_quad.DstIpAdrIndex = + cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = htonl(cm_node->rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->loc_port); + + /* Produce hash key */ + crc_value = get_crc_value(&nes_quad); + nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); + nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask); + + nesqp->hte_index &= adapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + + cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); + + nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = " + "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + " + "private data length=%u.\n", nesqp->hwqp.qp_id, + ntohl(raddr->sin_addr.s_addr), ntohs(raddr->sin_port), + ntohl(laddr->sin_addr.s_addr), ntohs(laddr->sin_port), + le32_to_cpu(nesqp->nesqp_context->rcv_nxt), + le32_to_cpu(nesqp->nesqp_context->snd_nxt), + buff_len); + + /* notify OF layer that accept event was successful */ + cm_id->add_ref(cm_id); + nes_add_ref(&nesqp->ibqp); + + cm_event.event = IW_CM_EVENT_ESTABLISHED; + cm_event.status = 0; + cm_event.provider_data = (void *)nesqp; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + + ret = cm_id->event_handler(cm_id, &cm_event); + attr.qp_state = IB_QPS_RTS; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + if (cm_node->loopbackpartner) { + cm_node->loopbackpartner->mpa_frame_size = + nesqp->private_data_len; + /* copy entire MPA frame to our cm_node's frame */ + memcpy(cm_node->loopbackpartner->mpa_frame_buf, + conn_param->private_data, conn_param->private_data_len); + create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED); + } + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " + "ret=%d\n", __func__, __LINE__, ret); + + return 0; +} + + +/** + * nes_reject + */ +int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + struct nes_cm_node *cm_node; + struct nes_cm_node *loopback; + struct nes_cm_core *cm_core; + u8 *start_buff; + + atomic_inc(&cm_rejects); + cm_node = (struct nes_cm_node *)cm_id->provider_data; + loopback = cm_node->loopbackpartner; + cm_core = cm_node->cm_core; + cm_node->cm_id = cm_id; + + if (pdata_len + sizeof(struct ietf_mpa_v2) > MAX_CM_BUFFER) + return -EINVAL; + + if (loopback) { + memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len); + loopback->mpa_frame.priv_data_len = pdata_len; + loopback->mpa_frame_size 
= pdata_len; + } else { + start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2); + cm_node->mpa_frame_size = pdata_len; + memcpy(start_buff, pdata, pdata_len); + } + return cm_core->api->reject(cm_core, cm_node); +} + + +/** + * nes_connect + * setup and launch cm connect node + */ +int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct ib_qp *ibqp; + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_cm_info cm_info; + int apbvt_set = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; + + if (cm_id->remote_addr.ss_family != AF_INET) + return -ENOSYS; + ibqp = nes_get_qp(cm_id->device, conn_param->qpn); + if (!ibqp) + return -EINVAL; + nesqp = to_nesqp(ibqp); + if (!nesqp) + return -EINVAL; + nesvnic = to_nesvnic(nesqp->ibqp.device); + if (!nesvnic) + return -EINVAL; + nesdev = nesvnic->nesdev; + if (!nesdev) + return -EINVAL; + + if (!laddr->sin_port || !raddr->sin_port) + return -EINVAL; + + nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = " + "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id, + ntohl(nesvnic->local_ipaddr), ntohl(raddr->sin_addr.s_addr), + ntohs(raddr->sin_port), ntohl(laddr->sin_addr.s_addr), + ntohs(laddr->sin_port)); + + atomic_inc(&cm_connects); + nesqp->active_conn = 1; + + /* cache the cm_id in the qp */ + nesqp->cm_id = cm_id; + cm_id->provider_data = nesqp; + nesqp->private_data_len = conn_param->private_data_len; + + nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord); + nes_debug(NES_DBG_CM, "mpa private data len =%u\n", + conn_param->private_data_len); + + /* set up the connection params for the node */ + cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr); + cm_info.loc_port = ntohs(laddr->sin_port); + cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr); + cm_info.rem_port = ntohs(raddr->sin_port); + cm_info.cm_id = cm_id; + cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + + if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) { + nes_manage_apbvt(nesvnic, cm_info.loc_port, + PCI_FUNC(nesdev->pcidev->devfn), + NES_MANAGE_APBVT_ADD); + apbvt_set = 1; + } + + cm_id->add_ref(cm_id); + + /* create a connect CM node connection */ + cm_node = g_cm_core->api->connect(g_cm_core, nesvnic, + conn_param->private_data_len, (void *)conn_param->private_data, + &cm_info); + if (!cm_node) { + if (apbvt_set) + nes_manage_apbvt(nesvnic, cm_info.loc_port, + PCI_FUNC(nesdev->pcidev->devfn), + NES_MANAGE_APBVT_DEL); + + nes_debug(NES_DBG_NLMSG, "Delete loc_port = %04X\n", + cm_info.loc_port); + cm_id->rem_ref(cm_id); + return -ENOMEM; + } + + record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && + cm_node->ord_size == 0) + cm_node->ord_size = 1; + + cm_node->apbvt_set = apbvt_set; + cm_node->tos = cm_id->tos; + nesqp->cm_node = cm_node; + cm_node->nesqp = nesqp; + nes_add_ref(&nesqp->ibqp); + + return 0; +} + + +/** + * nes_create_listen + */ +int nes_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + struct nes_vnic *nesvnic; + struct nes_cm_listener *cm_node; + struct nes_cm_info cm_info; + int err; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + + nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n", + cm_id, ntohs(laddr->sin_port)); + + if (cm_id->m_local_addr.ss_family != AF_INET) + return -ENOSYS; + nesvnic 
= to_nesvnic(cm_id->device); + if (!nesvnic) + return -EINVAL; + + nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n", + nesvnic, nesvnic->netdev, nesvnic->netdev->name); + + nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n", + nesvnic->local_ipaddr, laddr->sin_addr.s_addr); + + /* setup listen params in our api call struct */ + cm_info.loc_addr = ntohl(nesvnic->local_ipaddr); + cm_info.loc_port = ntohs(laddr->sin_port); + cm_info.backlog = backlog; + cm_info.cm_id = cm_id; + + cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; + + cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info); + if (!cm_node) { + printk(KERN_ERR "%s[%u] Error returned from listen API call\n", + __func__, __LINE__); + return -ENOMEM; + } + + cm_id->provider_data = cm_node; + cm_node->tos = cm_id->tos; + + if (!cm_node->reused_node) { + err = nes_manage_apbvt(nesvnic, cm_node->loc_port, + PCI_FUNC(nesvnic->nesdev->pcidev->devfn), + NES_MANAGE_APBVT_ADD); + if (err) { + printk(KERN_ERR "nes_manage_apbvt call returned %d.\n", + err); + g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node); + return err; + } + atomic_inc(&cm_listens_created); + } + + cm_id->add_ref(cm_id); + cm_id->provider_data = (void *)cm_node; + + + return 0; +} + + +/** + * nes_destroy_listen + */ +int nes_destroy_listen(struct iw_cm_id *cm_id) +{ + if (cm_id->provider_data) + g_cm_core->api->stop_listener(g_cm_core, cm_id->provider_data); + else + nes_debug(NES_DBG_CM, "cm_id->provider_data was NULL\n"); + + cm_id->rem_ref(cm_id); + + return 0; +} + + +/** + * nes_cm_recv + */ +int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice) +{ + int rc = 0; + + cm_packets_received++; + if ((g_cm_core) && (g_cm_core->api)) + rc = g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb); + else + nes_debug(NES_DBG_CM, "Unable to process packet for CM," + " cm is not setup properly.\n"); + + return rc; +} + + +/** + * nes_cm_start + * Start and init a cm core module + */ +int nes_cm_start(void) +{ + nes_debug(NES_DBG_CM, "\n"); + /* create the primary CM core, pass this handle to subsequent core inits */ + g_cm_core = nes_cm_alloc_core(); + if (g_cm_core) + return 0; + else + return -ENOMEM; +} + + +/** + * nes_cm_stop + * stop and dealloc all cm core instances + */ +int nes_cm_stop(void) +{ + g_cm_core->api->destroy_cm_core(g_cm_core); + return 0; +} + + +/** + * cm_event_connected + * handle a connected event, setup QPs and HW + */ +static void cm_event_connected(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct nes_vnic *nesvnic; + struct nes_device *nesdev; + struct nes_cm_node *cm_node; + struct nes_adapter *nesadapter; + struct ib_qp_attr attr; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_v4_quad nes_quad; + u32 crc_value; + int ret; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct sockaddr_in *cm_event_laddr; + + /* get all our handles */ + cm_node = event->cm_node; + cm_id = cm_node->cm_id; + nes_debug(NES_DBG_CM, "cm_event_connected - %p - cm_id = %p\n", cm_node, cm_id); + nesqp = (struct nes_qp *)cm_id->provider_data; + nesvnic = to_nesvnic(nesqp->ibqp.device); + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; + cm_event_laddr = (struct sockaddr_in *)&cm_event.local_addr; + + if (nesqp->destroyed) + return; + atomic_inc(&cm_connecteds); + nes_debug(NES_DBG_CM, "QP%u attempting to connect to 0x%08X:0x%04X on" + " local 
port 0x%04X. jiffies = %lu.\n", + nesqp->hwqp.qp_id, ntohl(raddr->sin_addr.s_addr), + ntohs(raddr->sin_port), ntohs(laddr->sin_port), jiffies); + + nes_cm_init_tsa_conn(nesqp, cm_node); + + /* set the QP tsa context */ + nesqp->nesqp_context->tcpPorts[0] = + cpu_to_le16(cm_node->loc_port); + nesqp->nesqp_context->tcpPorts[1] = + cpu_to_le16(cm_node->rem_port); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->rem_addr); + + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC2_SRC_IP_SHIFT); + nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32( + nes_arp_table(nesdev, + le32_to_cpu(nesqp->nesqp_context->ip0), + NULL, NES_ARP_RESOLVE) << 16); + nesqp->nesqp_context->ts_val_delta = cpu_to_le32( + jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW)); + nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)1 << + NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT); + nesqp->nesqp_context->ird_ord_sizes |= + cpu_to_le32((u32)cm_node->ord_size); + + /* Adjust tail for not having a LSMM */ + /*nesqp->hwqp.sq_tail = 1;*/ + + build_rdma0_msg(cm_node, &nesqp); + + nes_write32(nesdev->regs + NES_WQE_ALLOC, + (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); + + memset(&nes_quad, 0, sizeof(nes_quad)); + + nes_quad.DstIpAdrIndex = + cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = htonl(cm_node->rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->loc_port); + + /* Produce hash key */ + crc_value = get_crc_value(&nes_quad); + nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); + nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); + + nesqp->hte_index &= nesadapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + + nesqp->ietf_frame = &cm_node->mpa_frame; + nesqp->private_data_len = (u8)cm_node->mpa_frame_size; + cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node); + + /* notify OF layer we successfully created the requested connection */ + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = 0; + cm_event.provider_data = cm_id->provider_data; + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = laddr->sin_port; + cm_event.remote_addr = cm_id->m_remote_addr; + + cm_event.private_data = (void *)event->cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8)event->cm_node->mpa_frame_size; + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); + ret = cm_id->event_handler(cm_id, &cm_event); + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " + "ret=%d\n", __func__, __LINE__, ret); + attr.qp_state = IB_QPS_RTS; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + + nes_debug(NES_DBG_CM, "Exiting connect thread for QP%u. 
jiffies = " + "%lu\n", nesqp->hwqp.qp_id, jiffies); + + return; +} + + +/** + * cm_event_connect_error + */ +static void cm_event_connect_error(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + /* struct nes_cm_info cm_info; */ + int ret; + + if (!event->cm_node) + return; + + cm_id = event->cm_node->cm_id; + if (!cm_id) + return; + + nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id); + nesqp = cm_id->provider_data; + + if (!nesqp) + return; + + /* notify OF layer about this connection error event */ + /* cm_id->rem_ref(cm_id); */ + nesqp->cm_id = NULL; + cm_id->provider_data = NULL; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = -ECONNRESET; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + +#ifdef CONFIG_INFINIBAND_NES_DEBUG + { + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; + nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remote_addr=%08x\n", + cm_event_laddr->sin_addr.s_addr, cm_event_raddr->sin_addr.s_addr); + } +#endif + + ret = cm_id->event_handler(cm_id, &cm_event); + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " + "ret=%d\n", __func__, __LINE__, ret); + cm_id->rem_ref(cm_id); + + rem_ref_cm_node(event->cm_node->cm_core, event->cm_node); + return; +} + + +/** + * cm_event_reset + */ +static void cm_event_reset(struct nes_cm_event *event) +{ + struct nes_qp *nesqp; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + /* struct nes_cm_info cm_info; */ + int ret; + + if (!event->cm_node) + return; + + if (!event->cm_node->cm_id) + return; + + cm_id = event->cm_node->cm_id; + + nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id); + nesqp = cm_id->provider_data; + if (!nesqp) + return; + + nesqp->cm_id = NULL; + /* cm_id->provider_data = NULL; */ + cm_event.event = IW_CM_EVENT_DISCONNECT; + cm_event.status = -ECONNRESET; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + cm_id->add_ref(cm_id); + ret = cm_id->event_handler(cm_id, &cm_event); + atomic_inc(&cm_closes); + cm_event.event = IW_CM_EVENT_CLOSE; + cm_event.status = 0; + cm_event.provider_data = cm_id->provider_data; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + nes_debug(NES_DBG_CM, "NODE %p Generating CLOSE\n", event->cm_node); + ret = cm_id->event_handler(cm_id, &cm_event); + + nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); + + + /* notify OF layer about this connection error event */ + cm_id->rem_ref(cm_id); + + return; +} + + +/** + * cm_event_mpa_req + */ +static void cm_event_mpa_req(struct nes_cm_event *event) +{ + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + int ret; + struct nes_cm_node *cm_node; + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; + + cm_node = event->cm_node; + if 
(!cm_node) + return; + cm_id = cm_node->cm_id; + + atomic_inc(&cm_connect_reqs); + nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", + cm_node, cm_id, jiffies); + + cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; + cm_event.status = 0; + cm_event.provider_data = (void *)cm_node; + + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = htons(event->cm_info.loc_port); + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); + + cm_event_raddr->sin_family = AF_INET; + cm_event_raddr->sin_port = htons(event->cm_info.rem_port); + cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); + cm_event.private_data = cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8)cm_node->mpa_frame_size; + if (cm_node->mpa_frame_rev == IETF_MPA_V1) { + cm_event.ird = NES_MAX_IRD; + cm_event.ord = NES_MAX_ORD; + } else { + cm_event.ird = cm_node->ird_size; + cm_event.ord = cm_node->ord_size; + } + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n", + __func__, __LINE__, ret); + return; +} + + +static void cm_event_mpa_reject(struct nes_cm_event *event) +{ + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + struct nes_cm_node *cm_node; + int ret; + struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *) + &cm_event.local_addr; + struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *) + &cm_event.remote_addr; + + cm_node = event->cm_node; + if (!cm_node) + return; + cm_id = cm_node->cm_id; + + atomic_inc(&cm_connect_reqs); + nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n", + cm_node, cm_id, jiffies); + + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = -ECONNREFUSED; + cm_event.provider_data = cm_id->provider_data; + + cm_event_laddr->sin_family = AF_INET; + cm_event_laddr->sin_port = htons(event->cm_info.loc_port); + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); + + cm_event_raddr->sin_family = AF_INET; + cm_event_raddr->sin_port = htons(event->cm_info.rem_port); + cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); + + cm_event.private_data = cm_node->mpa_frame_buf; + cm_event.private_data_len = (u8)cm_node->mpa_frame_size; + + nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, " + "remove_addr=%08x\n", + cm_event_laddr->sin_addr.s_addr, + cm_event_raddr->sin_addr.s_addr); + + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n", + __func__, __LINE__, ret); + + return; +} + + +static void nes_cm_event_handler(struct work_struct *); + +/** + * nes_cm_post_event + * post an event to the cm event handler + */ +static int nes_cm_post_event(struct nes_cm_event *event) +{ + atomic_inc(&event->cm_node->cm_core->events_posted); + add_ref_cm_node(event->cm_node); + event->cm_info.cm_id->add_ref(event->cm_info.cm_id); + INIT_WORK(&event->event_work, nes_cm_event_handler); + nes_debug(NES_DBG_CM, "cm_node=%p queue_work, event=%p\n", + event->cm_node, event); + + queue_work(event->cm_node->cm_core->event_wq, &event->event_work); + + nes_debug(NES_DBG_CM, "Exit\n"); + return 0; +} + + +/** + * nes_cm_event_handler + * worker function to handle cm events + * will free instance of nes_cm_event + */ +static void nes_cm_event_handler(struct work_struct *work) +{ + struct nes_cm_event *event = container_of(work, struct nes_cm_event, + event_work); + struct nes_cm_core *cm_core; + + if ((!event) || (!event->cm_node) || 
(!event->cm_node->cm_core)) + return; + + cm_core = event->cm_node->cm_core; + nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n", + event, event->type, atomic_read(&cm_core->events_posted)); + + switch (event->type) { + case NES_CM_EVENT_MPA_REQ: + cm_event_mpa_req(event); + nes_debug(NES_DBG_CM, "cm_node=%p CM Event: MPA REQUEST\n", + event->cm_node); + break; + case NES_CM_EVENT_RESET: + nes_debug(NES_DBG_CM, "cm_node = %p CM Event: RESET\n", + event->cm_node); + cm_event_reset(event); + break; + case NES_CM_EVENT_CONNECTED: + if ((!event->cm_node->cm_id) || + (event->cm_node->state != NES_CM_STATE_TSA)) + break; + cm_event_connected(event); + nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n"); + break; + case NES_CM_EVENT_MPA_REJECT: + if ((!event->cm_node->cm_id) || + (event->cm_node->state == NES_CM_STATE_TSA)) + break; + cm_event_mpa_reject(event); + nes_debug(NES_DBG_CM, "CM Event: REJECT\n"); + break; + + case NES_CM_EVENT_ABORTED: + if ((!event->cm_node->cm_id) || + (event->cm_node->state == NES_CM_STATE_TSA)) + break; + cm_event_connect_error(event); + nes_debug(NES_DBG_CM, "CM Event: ABORTED\n"); + break; + case NES_CM_EVENT_DROPPED_PKT: + nes_debug(NES_DBG_CM, "CM Event: DROPPED PKT\n"); + break; + default: + nes_debug(NES_DBG_CM, "CM Event: UNKNOWN EVENT TYPE\n"); + break; + } + + atomic_dec(&cm_core->events_posted); + event->cm_info.cm_id->rem_ref(event->cm_info.cm_id); + rem_ref_cm_node(cm_core, event->cm_node); + kfree(event); + + return; +} diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h new file mode 100644 index 0000000..b9cc02b --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef NES_CM_H +#define NES_CM_H + +#define QUEUE_EVENTS + +#define NES_MANAGE_APBVT_DEL 0 +#define NES_MANAGE_APBVT_ADD 1 + +#define NES_MPA_REQUEST_ACCEPT 1 +#define NES_MPA_REQUEST_REJECT 2 + +/* IETF MPA -- defines, enums, structs */ +#define IEFT_MPA_KEY_REQ "MPA ID Req Frame" +#define IEFT_MPA_KEY_REP "MPA ID Rep Frame" +#define IETF_MPA_KEY_SIZE 16 +#define IETF_MPA_VERSION 1 +#define IETF_MAX_PRIV_DATA_LEN 512 +#define IETF_MPA_FRAME_SIZE 20 +#define IETF_RTR_MSG_SIZE 4 +#define IETF_MPA_V2_FLAG 0x10 + +/* IETF RTR MSG Fields */ +#define IETF_PEER_TO_PEER 0x8000 +#define IETF_FLPDU_ZERO_LEN 0x4000 +#define IETF_RDMA0_WRITE 0x8000 +#define IETF_RDMA0_READ 0x4000 +#define IETF_NO_IRD_ORD 0x3FFF +#define NES_MAX_IRD 0x40 +#define NES_MAX_ORD 0x7F + +enum ietf_mpa_flags { + IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ + IETF_MPA_FLAGS_CRC = 0x40, /* receive Markers */ + IETF_MPA_FLAGS_REJECT = 0x20, /* Reject */ +}; + +struct ietf_mpa_v1 { + u8 key[IETF_MPA_KEY_SIZE]; + u8 flags; + u8 rev; + __be16 priv_data_len; + u8 priv_data[0]; +}; + +#define ietf_mpa_req_resp_frame ietf_mpa_frame + +struct ietf_rtr_msg { + __be16 ctrl_ird; + __be16 ctrl_ord; +}; + +struct ietf_mpa_v2 { + u8 key[IETF_MPA_KEY_SIZE]; + u8 flags; + u8 rev; + __be16 priv_data_len; + struct ietf_rtr_msg rtr_msg; + u8 priv_data[0]; +}; + +struct nes_v4_quad { + u32 rsvd0; + __le32 DstIpAdrIndex; /* Only most significant 5 bits are valid */ + __be32 SrcIpadr; + __be16 TcpPorts[2]; /* src is low, dest is high */ +}; + +struct nes_cm_node; +enum nes_timer_type { + NES_TIMER_TYPE_SEND, + NES_TIMER_TYPE_RECV, + NES_TIMER_NODE_CLEANUP, + NES_TIMER_TYPE_CLOSE, +}; + +#define NES_PASSIVE_STATE_INDICATED 0 +#define NES_DO_NOT_SEND_RESET_EVENT 1 +#define NES_SEND_RESET_EVENT 2 + +#define MAX_NES_IFS 4 + +#define SET_ACK 1 +#define SET_SYN 2 +#define SET_FIN 4 +#define SET_RST 8 + +#define TCP_OPTIONS_PADDING 3 + +struct option_base { + u8 optionnum; + u8 length; +}; + +enum option_numbers { + OPTION_NUMBER_END, + OPTION_NUMBER_NONE, + OPTION_NUMBER_MSS, + OPTION_NUMBER_WINDOW_SCALE, + OPTION_NUMBER_SACK_PERM, + OPTION_NUMBER_SACK, + OPTION_NUMBER_WRITE0 = 0xbc +}; + +struct option_mss { + u8 optionnum; + u8 length; + __be16 mss; +}; + +struct option_windowscale { + u8 optionnum; + u8 length; + u8 shiftcount; +}; + +union all_known_options { + char as_end; + struct option_base as_base; + struct option_mss as_mss; + struct option_windowscale as_windowscale; +}; + +struct nes_timer_entry { + struct list_head list; + unsigned long timetosend; /* jiffies */ + struct sk_buff *skb; + u32 type; + u32 retrycount; + u32 retranscount; + u32 context; + u32 seq_num; + u32 send_retrans; + int close_when_complete; + struct net_device *netdev; +}; + +#define NES_DEFAULT_RETRYS 64 +#define NES_DEFAULT_RETRANS 8 +#ifdef CONFIG_INFINIBAND_NES_DEBUG +#define NES_RETRY_TIMEOUT (1000*HZ/1000) +#else +#define NES_RETRY_TIMEOUT (3000*HZ/1000) +#endif +#define NES_SHORT_TIME (10) +#define NES_LONG_TIME (2000*HZ/1000) +#define NES_MAX_TIMEOUT ((unsigned long) (12*HZ)) + +#define NES_CM_HASHTABLE_SIZE 1024 +#define NES_CM_TCP_TIMER_INTERVAL 3000 +#define NES_CM_DEFAULT_MTU 1540 +#define NES_CM_DEFAULT_FRAME_CNT 10 +#define NES_CM_THREAD_STACK_SIZE 256 +#define NES_CM_DEFAULT_RCV_WND 64240 // before we know that window scaling is allowed +#define NES_CM_DEFAULT_RCV_WND_SCALED 256960 // after we know that window scaling is allowed +#define NES_CM_DEFAULT_RCV_WND_SCALE 2 +#define NES_CM_DEFAULT_FREE_PKTS 0x000A +#define 
NES_CM_FREE_PKT_LO_WATERMARK 2 + +#define NES_CM_DEFAULT_MSS 536 + +#define NES_CM_DEF_SEQ 0x159bf75f +#define NES_CM_DEF_LOCAL_ID 0x3b47 + +#define NES_CM_DEF_SEQ2 0x18ed5740 +#define NES_CM_DEF_LOCAL_ID2 0xb807 +#define MAX_CM_BUFFER (IETF_MPA_FRAME_SIZE + IETF_RTR_MSG_SIZE + IETF_MAX_PRIV_DATA_LEN) + +typedef u32 nes_addr_t; + +#define nes_cm_tsa_context nes_qp_context + +struct nes_qp; + +/* cm node transition states */ +enum nes_cm_node_state { + NES_CM_STATE_UNKNOWN, + NES_CM_STATE_INITED, + NES_CM_STATE_LISTENING, + NES_CM_STATE_SYN_RCVD, + NES_CM_STATE_SYN_SENT, + NES_CM_STATE_ONE_SIDE_ESTABLISHED, + NES_CM_STATE_ESTABLISHED, + NES_CM_STATE_ACCEPTING, + NES_CM_STATE_MPAREQ_SENT, + NES_CM_STATE_MPAREQ_RCVD, + NES_CM_STATE_MPAREJ_RCVD, + NES_CM_STATE_TSA, + NES_CM_STATE_FIN_WAIT1, + NES_CM_STATE_FIN_WAIT2, + NES_CM_STATE_CLOSE_WAIT, + NES_CM_STATE_TIME_WAIT, + NES_CM_STATE_LAST_ACK, + NES_CM_STATE_CLOSING, + NES_CM_STATE_LISTENER_DESTROYED, + NES_CM_STATE_CLOSED +}; + +enum mpa_frame_version { + IETF_MPA_V1 = 1, + IETF_MPA_V2 = 2 +}; + +enum mpa_frame_key { + MPA_KEY_REQUEST, + MPA_KEY_REPLY +}; + +enum send_rdma0 { + SEND_RDMA_READ_ZERO = 1, + SEND_RDMA_WRITE_ZERO = 2 +}; + +enum nes_tcpip_pkt_type { + NES_PKT_TYPE_UNKNOWN, + NES_PKT_TYPE_SYN, + NES_PKT_TYPE_SYNACK, + NES_PKT_TYPE_ACK, + NES_PKT_TYPE_FIN, + NES_PKT_TYPE_RST +}; + + +/* type of nes connection */ +enum nes_cm_conn_type { + NES_CM_IWARP_CONN_TYPE, +}; + +/* CM context params */ +struct nes_cm_tcp_context { + u8 client; + + u32 loc_seq_num; + u32 loc_ack_num; + u32 rem_ack_num; + u32 rcv_nxt; + + u32 loc_id; + u32 rem_id; + + u32 snd_wnd; + u32 max_snd_wnd; + + u32 rcv_wnd; + u32 mss; + u8 snd_wscale; + u8 rcv_wscale; + + struct nes_cm_tsa_context tsa_cntxt; +}; + + +enum nes_cm_listener_state { + NES_CM_LISTENER_PASSIVE_STATE = 1, + NES_CM_LISTENER_ACTIVE_STATE = 2, + NES_CM_LISTENER_EITHER_STATE = 3 +}; + +struct nes_cm_listener { + struct list_head list; + struct nes_cm_core *cm_core; + u8 loc_mac[ETH_ALEN]; + nes_addr_t loc_addr; + u16 loc_port; + struct iw_cm_id *cm_id; + enum nes_cm_conn_type conn_type; + atomic_t ref_count; + struct nes_vnic *nesvnic; + atomic_t pend_accepts_cnt; + int backlog; + enum nes_cm_listener_state listener_state; + u32 reused_node; + u8 tos; +}; + +/* per connection node and node state information */ +struct nes_cm_node { + nes_addr_t loc_addr, rem_addr; + u16 loc_port, rem_port; + + u8 loc_mac[ETH_ALEN]; + u8 rem_mac[ETH_ALEN]; + + enum nes_cm_node_state state; + struct nes_cm_tcp_context tcp_cntxt; + struct nes_cm_core *cm_core; + struct sk_buff_head resend_list; + atomic_t ref_count; + struct net_device *netdev; + + struct nes_cm_node *loopbackpartner; + + struct nes_timer_entry *send_entry; + struct nes_timer_entry *recv_entry; + spinlock_t retrans_list_lock; + enum send_rdma0 send_rdma0_op; + + union { + struct ietf_mpa_v1 mpa_frame; + struct ietf_mpa_v2 mpa_v2_frame; + u8 mpa_frame_buf[MAX_CM_BUFFER]; + }; + enum mpa_frame_version mpa_frame_rev; + u16 ird_size; + u16 ord_size; + u16 mpav2_ird_ord; + + u16 mpa_frame_size; + struct iw_cm_id *cm_id; + struct list_head list; + bool accelerated; + struct nes_cm_listener *listener; + enum nes_cm_conn_type conn_type; + struct nes_vnic *nesvnic; + int apbvt_set; + int accept_pend; + struct list_head timer_entry; + struct list_head reset_entry; + struct nes_qp *nesqp; + atomic_t passive_state; + u8 tos; +}; + +/* structure for client or CM to fill when making CM api calls. */ +/* - only need to set relevant data, based on op. 
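(e.g. nes_connect() fills loc/rem addr and port plus cm_id, while nes_create_listen() fills loc_addr, loc_port, backlog and cm_id)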
*/ +struct nes_cm_info { + union { + struct iw_cm_id *cm_id; + struct net_device *netdev; + }; + + u16 loc_port; + u16 rem_port; + nes_addr_t loc_addr; + nes_addr_t rem_addr; + enum nes_cm_conn_type conn_type; + int backlog; +}; + +/* CM event codes */ +enum nes_cm_event_type { + NES_CM_EVENT_UNKNOWN, + NES_CM_EVENT_ESTABLISHED, + NES_CM_EVENT_MPA_REQ, + NES_CM_EVENT_MPA_CONNECT, + NES_CM_EVENT_MPA_ACCEPT, + NES_CM_EVENT_MPA_REJECT, + NES_CM_EVENT_MPA_ESTABLISHED, + NES_CM_EVENT_CONNECTED, + NES_CM_EVENT_CLOSED, + NES_CM_EVENT_RESET, + NES_CM_EVENT_DROPPED_PKT, + NES_CM_EVENT_CLOSE_IMMED, + NES_CM_EVENT_CLOSE_HARD, + NES_CM_EVENT_CLOSE_CLEAN, + NES_CM_EVENT_ABORTED, + NES_CM_EVENT_SEND_FIRST +}; + +/* event to post to CM event handler */ +struct nes_cm_event { + enum nes_cm_event_type type; + + struct nes_cm_info cm_info; + struct work_struct event_work; + struct nes_cm_node *cm_node; +}; + +struct nes_cm_core { + enum nes_cm_node_state state; + + atomic_t listen_node_cnt; + struct nes_cm_node listen_list; + spinlock_t listen_list_lock; + + u32 mtu; + u32 free_tx_pkt_max; + u32 rx_pkt_posted; + atomic_t ht_node_cnt; + struct list_head connected_nodes; + /* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */ + spinlock_t ht_lock; + + struct timer_list tcp_timer; + + const struct nes_cm_ops *api; + + int (*post_event)(struct nes_cm_event *event); + atomic_t events_posted; + struct workqueue_struct *event_wq; + struct workqueue_struct *disconn_wq; + + atomic_t node_cnt; + u64 aborted_connects; + u32 options; + + struct nes_cm_node *current_listen_node; +}; + + +#define NES_CM_SET_PKT_SIZE (1 << 1) +#define NES_CM_SET_FREE_PKT_Q_SIZE (1 << 2) + +/* CM ops/API for client interface */ +struct nes_cm_ops { + int (*accelerated)(struct nes_cm_core *, struct nes_cm_node *); + struct nes_cm_listener * (*listen)(struct nes_cm_core *, struct nes_vnic *, + struct nes_cm_info *); + int (*stop_listener)(struct nes_cm_core *, struct nes_cm_listener *); + struct nes_cm_node * (*connect)(struct nes_cm_core *, + struct nes_vnic *, u16, void *, + struct nes_cm_info *); + int (*close)(struct nes_cm_core *, struct nes_cm_node *); + int (*accept)(struct nes_cm_core *, struct nes_cm_node *); + int (*reject)(struct nes_cm_core *, struct nes_cm_node *); + int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *, + struct sk_buff *); + int (*destroy_cm_core)(struct nes_cm_core *); + int (*get)(struct nes_cm_core *); + int (*set)(struct nes_cm_core *, u32, u32); +}; + +int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *, + enum nes_timer_type, int, int); + +int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *); +int nes_reject(struct iw_cm_id *, const void *, u8); +int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *); +int nes_create_listen(struct iw_cm_id *, int); +int nes_destroy_listen(struct iw_cm_id *); + +int nes_cm_recv(struct sk_buff *, struct net_device *); +int nes_cm_start(void); +int nes_cm_stop(void); +int nes_add_ref_cm_node(struct nes_cm_node *cm_node); +int nes_rem_ref_cm_node(struct nes_cm_node *cm_node); + +#endif /* NES_CM_H */ diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h new file mode 100644 index 0000000..a69eef1 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_context.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef NES_CONTEXT_H +#define NES_CONTEXT_H + +struct nes_qp_context { + __le32 misc; + __le32 cqs; + __le32 sq_addr_low; + __le32 sq_addr_high; + __le32 rq_addr_low; + __le32 rq_addr_high; + __le32 misc2; + __le16 tcpPorts[2]; + __le32 ip0; + __le32 ip1; + __le32 ip2; + __le32 ip3; + __le32 mss; + __le32 arp_index_vlan; + __le32 tcp_state_flow_label; + __le32 pd_index_wscale; + __le32 keepalive; + u32 ts_recent; + u32 ts_age; + __le32 snd_nxt; + __le32 snd_wnd; + __le32 rcv_nxt; + __le32 rcv_wnd; + __le32 snd_max; + __le32 snd_una; + u32 srtt; + __le32 rttvar; + __le32 ssthresh; + __le32 cwnd; + __le32 snd_wl1; + __le32 snd_wl2; + __le32 max_snd_wnd; + __le32 ts_val_delta; + u32 retransmit; + u32 probe_cnt; + u32 hte_index; + __le32 q2_addr_low; + __le32 q2_addr_high; + __le32 ird_index; + u32 Rsvd3; + __le32 ird_ord_sizes; + u32 mrkr_offset; + __le32 aeq_token_low; + __le32 aeq_token_high; +}; + +/* QP Context Misc Field */ + +#define NES_QPCONTEXT_MISC_IWARP_VER_MASK 0x00000003 +#define NES_QPCONTEXT_MISC_IWARP_VER_SHIFT 0 +#define NES_QPCONTEXT_MISC_EFB_SIZE_MASK 0x000000C0 +#define NES_QPCONTEXT_MISC_EFB_SIZE_SHIFT 6 +#define NES_QPCONTEXT_MISC_RQ_SIZE_MASK 0x00000300 +#define NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT 8 +#define NES_QPCONTEXT_MISC_SQ_SIZE_MASK 0x00000c00 +#define NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT 10 +#define NES_QPCONTEXT_MISC_PCI_FCN_MASK 0x00007000 +#define NES_QPCONTEXT_MISC_PCI_FCN_SHIFT 12 +#define NES_QPCONTEXT_MISC_DUP_ACKS_MASK 0x00070000 +#define NES_QPCONTEXT_MISC_DUP_ACKS_SHIFT 16 + +enum nes_qp_context_misc_bits { + NES_QPCONTEXT_MISC_RX_WQE_SIZE = 0x00000004, + NES_QPCONTEXT_MISC_IPV4 = 0x00000008, + NES_QPCONTEXT_MISC_DO_NOT_FRAG = 0x00000010, + NES_QPCONTEXT_MISC_INSERT_VLAN = 0x00000020, + NES_QPCONTEXT_MISC_DROS = 0x00008000, + NES_QPCONTEXT_MISC_WSCALE = 0x00080000, + NES_QPCONTEXT_MISC_KEEPALIVE = 0x00100000, + NES_QPCONTEXT_MISC_TIMESTAMP = 0x00200000, + NES_QPCONTEXT_MISC_SACK = 0x00400000, + NES_QPCONTEXT_MISC_RDMA_WRITE_EN = 0x00800000, + NES_QPCONTEXT_MISC_RDMA_READ_EN = 0x01000000, + NES_QPCONTEXT_MISC_WBIND_EN = 0x10000000, + NES_QPCONTEXT_MISC_FAST_REGISTER_EN = 0x20000000, + NES_QPCONTEXT_MISC_PRIV_EN = 0x40000000, + NES_QPCONTEXT_MISC_NO_NAGLE = 0x80000000 +}; + +enum nes_qp_acc_wq_sizes { + HCONTEXT_TSA_WQ_SIZE_4 = 0, 
+ HCONTEXT_TSA_WQ_SIZE_32 = 1, + HCONTEXT_TSA_WQ_SIZE_128 = 2, + HCONTEXT_TSA_WQ_SIZE_512 = 3 +}; + +/* QP Context Misc2 Fields */ +#define NES_QPCONTEXT_MISC2_TTL_MASK 0x000000ff +#define NES_QPCONTEXT_MISC2_TTL_SHIFT 0 +#define NES_QPCONTEXT_MISC2_HOP_LIMIT_MASK 0x000000ff +#define NES_QPCONTEXT_MISC2_HOP_LIMIT_SHIFT 0 +#define NES_QPCONTEXT_MISC2_LIMIT_MASK 0x00000300 +#define NES_QPCONTEXT_MISC2_LIMIT_SHIFT 8 +#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK 0x0000fc00 +#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT 10 +#define NES_QPCONTEXT_MISC2_SRC_IP_MASK 0x001f0000 +#define NES_QPCONTEXT_MISC2_SRC_IP_SHIFT 16 +#define NES_QPCONTEXT_MISC2_TOS_MASK 0xff000000 +#define NES_QPCONTEXT_MISC2_TOS_SHIFT 24 +#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_MASK 0xff000000 +#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_SHIFT 24 + +/* QP Context Tcp State/Flow Label Fields */ +#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_MASK 0x000fffff +#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_SHIFT 0 +#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_MASK 0xf0000000 +#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT 28 + +enum nes_qp_tcp_state { + NES_QPCONTEXT_TCPSTATE_CLOSED = 1, + NES_QPCONTEXT_TCPSTATE_EST = 5, + NES_QPCONTEXT_TCPSTATE_TIME_WAIT = 11, +}; + +/* QP Context PD Index/wscale Fields */ +#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK 0x0000000f +#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT 0 +#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK 0x00000f00 +#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT 8 +#define NES_QPCONTEXT_PDWSCALE_PDINDEX_MASK 0xffff0000 +#define NES_QPCONTEXT_PDWSCALE_PDINDEX_SHIFT 16 + +/* QP Context Keepalive Fields */ +#define NES_QPCONTEXT_KEEPALIVE_DELTA_MASK 0x0000ffff +#define NES_QPCONTEXT_KEEPALIVE_DELTA_SHIFT 0 +#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_MASK 0x00ff0000 +#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_SHIFT 16 +#define NES_QPCONTEXT_KEEPALIVE_INTV_MASK 0xff000000 +#define NES_QPCONTEXT_KEEPALIVE_INTV_SHIFT 24 + +/* QP Context ORD/IRD Fields */ +#define NES_QPCONTEXT_ORDIRD_ORDSIZE_MASK 0x0000007f +#define NES_QPCONTEXT_ORDIRD_ORDSIZE_SHIFT 0 +#define NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK 0x00030000 +#define NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT 16 +#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_MASK 0x30000000 +#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT 28 + +enum nes_ord_ird_bits { + NES_QPCONTEXT_ORDIRD_WRPDU = 0x02000000, + NES_QPCONTEXT_ORDIRD_LSMM_PRESENT = 0x04000000, + NES_QPCONTEXT_ORDIRD_ALSMM = 0x08000000, + NES_QPCONTEXT_ORDIRD_AAH = 0x40000000, + NES_QPCONTEXT_ORDIRD_RNMC = 0x80000000 +}; + +enum nes_iwarp_qp_state { + NES_QPCONTEXT_IWARP_STATE_NONEXIST = 0, + NES_QPCONTEXT_IWARP_STATE_IDLE = 1, + NES_QPCONTEXT_IWARP_STATE_RTS = 2, + NES_QPCONTEXT_IWARP_STATE_CLOSING = 3, + NES_QPCONTEXT_IWARP_STATE_TERMINATE = 5, + NES_QPCONTEXT_IWARP_STATE_ERROR = 6 +}; + + +#endif /* NES_CONTEXT_H */ diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c new file mode 100644 index 0000000..18a7de1 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -0,0 +1,3889 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "nes.h" + +static int wide_ppm_offset; +module_param(wide_ppm_offset, int, 0644); +MODULE_PARM_DESC(wide_ppm_offset, "Increase CX4 interface clock ppm offset, 0=100ppm (default), 1=300ppm"); + +static u32 crit_err_count; +u32 int_mod_timer_init; +u32 int_mod_cq_depth_256; +u32 int_mod_cq_depth_128; +u32 int_mod_cq_depth_32; +u32 int_mod_cq_depth_24; +u32 int_mod_cq_depth_16; +u32 int_mod_cq_depth_4; +u32 int_mod_cq_depth_1; +static const u8 nes_max_critical_error_count = 100; +#include "nes_cm.h" + +static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq); +static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count); +static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, + struct nes_adapter *nesadapter, u8 OneG_Mode); +static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq); +static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq); +static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq); +static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + struct nes_hw_aeqe *aeqe); +static void process_critical_error(struct nes_device *nesdev); +static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number); +static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode); +static void nes_terminate_start_timer(struct nes_qp *nesqp); + +#ifdef CONFIG_INFINIBAND_NES_DEBUG +static unsigned char *nes_iwarp_state_str[] = { + "Non-Existent", + "Idle", + "RTS", + "Closing", + "RSVD1", + "Terminate", + "Error", + "RSVD2", +}; + +static unsigned char *nes_tcp_state_str[] = { + "Non-Existent", + "Closed", + "Listen", + "SYN Sent", + "SYN Rcvd", + "Established", + "Close Wait", + "FIN Wait 1", + "Closing", + "Last Ack", + "FIN Wait 2", + "Time Wait", + "RSVD1", + "RSVD2", + "RSVD3", + "RSVD4", +}; +#endif + +static inline void print_ip(struct nes_cm_node *cm_node) +{ + unsigned char *rem_addr; + if (cm_node) { + rem_addr = (unsigned char *)&cm_node->rem_addr; + printk(KERN_ERR PFX "Remote IP addr: %pI4\n", rem_addr); + } +} + +/** + * nes_nic_init_timer_defaults + */ +void nes_nic_init_timer_defaults(struct 
nes_device *nesdev, u8 jumbomode) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + + shared_timer->timer_in_use_min = NES_NIC_FAST_TIMER_LOW; + shared_timer->timer_in_use_max = NES_NIC_FAST_TIMER_HIGH; + if (jumbomode) { + shared_timer->threshold_low = DEFAULT_JUMBO_NES_QL_LOW; + shared_timer->threshold_target = DEFAULT_JUMBO_NES_QL_TARGET; + shared_timer->threshold_high = DEFAULT_JUMBO_NES_QL_HIGH; + } else { + shared_timer->threshold_low = DEFAULT_NES_QL_LOW; + shared_timer->threshold_target = DEFAULT_NES_QL_TARGET; + shared_timer->threshold_high = DEFAULT_NES_QL_HIGH; + } + + /* todo use netdev->mtu to set thresholds */ + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); +} + + +/** + * nes_nic_init_timer + */ +static void nes_nic_init_timer(struct nes_device *nesdev) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + + if (shared_timer->timer_in_use_old == 0) { + nesdev->deepcq_count = 0; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward = 0; + shared_timer->timer_in_use = NES_NIC_FAST_TIMER; + shared_timer->timer_in_use_old = 0; + + } + if (shared_timer->timer_in_use != shared_timer->timer_in_use_old) { + shared_timer->timer_in_use_old = shared_timer->timer_in_use; + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, + 0x80000000 | ((u32)(shared_timer->timer_in_use*8))); + } + /* todo use netdev->mtu to set thresholds */ + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); +} + + +/** + * nes_nic_tune_timer + */ +static void nes_nic_tune_timer(struct nes_device *nesdev) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + u16 cq_count = nesdev->currcq_count; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + + if (shared_timer->cq_count_old <= cq_count) + shared_timer->cq_direction_downward = 0; + else + shared_timer->cq_direction_downward++; + shared_timer->cq_count_old = cq_count; + if (shared_timer->cq_direction_downward > NES_NIC_CQ_DOWNWARD_TREND) { + if (cq_count <= shared_timer->threshold_low && + shared_timer->threshold_low > 4) { + shared_timer->threshold_low = shared_timer->threshold_low/2; + shared_timer->cq_direction_downward=0; + nesdev->currcq_count = 0; + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + return; + } + } + + if (cq_count > 1) { + nesdev->deepcq_count += cq_count; + if (cq_count <= shared_timer->threshold_low) { /* increase timer gently */ + shared_timer->timer_direction_upward++; + shared_timer->timer_direction_downward = 0; + } else if (cq_count <= shared_timer->threshold_target) { /* balanced */ + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward = 0; + } else if (cq_count <= shared_timer->threshold_high) { /* decrease timer gently */ + shared_timer->timer_direction_downward++; + shared_timer->timer_direction_upward = 0; + } else if (cq_count <= (shared_timer->threshold_high) * 2) { + shared_timer->timer_in_use -= 2; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward++; + } else { + shared_timer->timer_in_use -= 4; + shared_timer->timer_direction_upward = 0; + 
shared_timer->timer_direction_downward++; + } + + if (shared_timer->timer_direction_upward > 3 ) { /* using history */ + shared_timer->timer_in_use += 3; + shared_timer->timer_direction_upward = 0; + shared_timer->timer_direction_downward = 0; + } + if (shared_timer->timer_direction_downward > 5) { /* using history */ + shared_timer->timer_in_use -= 4 ; + shared_timer->timer_direction_downward = 0; + shared_timer->timer_direction_upward = 0; + } + } + + /* boundary checking */ + if (shared_timer->timer_in_use > shared_timer->threshold_high) + shared_timer->timer_in_use = shared_timer->threshold_high; + else if (shared_timer->timer_in_use < shared_timer->threshold_low) + shared_timer->timer_in_use = shared_timer->threshold_low; + + nesdev->currcq_count = 0; + + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); +} + + +/** + * nes_init_adapter - initialize adapter + */ +struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) { + struct nes_adapter *nesadapter = NULL; + unsigned long num_pds; + u32 u32temp; + u32 port_count; + u16 max_rq_wrs; + u16 max_sq_wrs; + u32 max_mr; + u32 max_256pbl; + u32 max_4kpbl; + u32 max_qp; + u32 max_irrq; + u32 max_cq; + u32 hte_index_mask; + u32 adapter_size; + u32 arp_table_size; + u16 vendor_id; + u16 device_id; + u8 OneG_Mode; + u8 func_index; + + /* search the list of existing adapters */ + list_for_each_entry(nesadapter, &nes_adapter_list, list) { + nes_debug(NES_DBG_INIT, "Searching Adapter list for PCI devfn = 0x%X," + " adapter PCI slot/bus = %u/%u, pci devices PCI slot/bus = %u/%u, .\n", + nesdev->pcidev->devfn, + PCI_SLOT(nesadapter->devfn), + nesadapter->bus_number, + PCI_SLOT(nesdev->pcidev->devfn), + nesdev->pcidev->bus->number ); + if ((PCI_SLOT(nesadapter->devfn) == PCI_SLOT(nesdev->pcidev->devfn)) && + (nesadapter->bus_number == nesdev->pcidev->bus->number)) { + nesadapter->ref_count++; + return nesadapter; + } + } + + /* no adapter found */ + num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT; + if ((hw_rev != NE020_REV) && (hw_rev != NE020_REV1)) { + nes_debug(NES_DBG_INIT, "NE020 driver detected unknown hardware revision 0x%x\n", + hw_rev); + return NULL; + } + + nes_debug(NES_DBG_INIT, "Determine Soft Reset, QP_control=0x%x, CPU0=0x%x, CPU1=0x%x, CPU2=0x%x\n", + nes_read_indexed(nesdev, NES_IDX_QP_CONTROL + PCI_FUNC(nesdev->pcidev->devfn) * 8), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 4), + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 8)); + + nes_debug(NES_DBG_INIT, "Reset and init NE020\n"); + + + if ((port_count = nes_reset_adapter_ne020(nesdev, &OneG_Mode)) == 0) + return NULL; + + max_qp = nes_read_indexed(nesdev, NES_IDX_QP_CTX_SIZE); + nes_debug(NES_DBG_INIT, "QP_CTX_SIZE=%u\n", max_qp); + + u32temp = nes_read_indexed(nesdev, NES_IDX_QUAD_HASH_TABLE_SIZE); + if (max_qp > ((u32)1 << (u32temp & 0x001f))) { + nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to hash table size = 0x%08X\n", + max_qp, u32temp); + max_qp = (u32)1 << (u32temp & 0x001f); + } + + hte_index_mask = ((u32)1 << ((u32temp & 0x001f)+1))-1; + nes_debug(NES_DBG_INIT, "Max QP = %u, hte_index_mask = 0x%08X.\n", + max_qp, hte_index_mask); + + u32temp = nes_read_indexed(nesdev, NES_IDX_IRRQ_COUNT); + + max_irrq = 1 << (u32temp & 0x001f); + + if (max_qp > max_irrq) { + max_qp = max_irrq; + nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to Available Q1s.\n", + max_qp); + } + + /* there should be no reason to allocate more pds than qps */ + if (num_pds 
> max_qp) + num_pds = max_qp; + + u32temp = nes_read_indexed(nesdev, NES_IDX_MRT_SIZE); + max_mr = (u32)8192 << (u32temp & 0x7); + + u32temp = nes_read_indexed(nesdev, NES_IDX_PBL_REGION_SIZE); + max_256pbl = (u32)1 << (u32temp & 0x0000001f); + max_4kpbl = (u32)1 << ((u32temp >> 16) & 0x0000001f); + max_cq = nes_read_indexed(nesdev, NES_IDX_CQ_CTX_SIZE); + + u32temp = nes_read_indexed(nesdev, NES_IDX_ARP_CACHE_SIZE); + arp_table_size = 1 << u32temp; + + adapter_size = (sizeof(struct nes_adapter) + + (sizeof(unsigned long)-1)) & (~(sizeof(unsigned long)-1)); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds); + adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size); + adapter_size += sizeof(struct nes_qp **) * max_qp; + + /* allocate a new adapter struct */ + nesadapter = kzalloc(adapter_size, GFP_KERNEL); + if (!nesadapter) + return NULL; + + nes_debug(NES_DBG_INIT, "Allocating new nesadapter @ %p, size = %u (actual size = %u).\n", + nesadapter, (u32)sizeof(struct nes_adapter), adapter_size); + + if (nes_read_eeprom_values(nesdev, nesadapter)) { + printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); + kfree(nesadapter); + return NULL; + } + + nesadapter->vendor_id = (((u32) nesadapter->mac_addr_high) << 8) | + (nesadapter->mac_addr_low >> 24); + + pci_bus_read_config_word(nesdev->pcidev->bus, nesdev->pcidev->devfn, + PCI_DEVICE_ID, &device_id); + nesadapter->vendor_part_id = device_id; + + if (nes_init_serdes(nesdev, hw_rev, port_count, nesadapter, + OneG_Mode)) { + kfree(nesadapter); + return NULL; + } + nes_init_csr_ne020(nesdev, hw_rev, port_count); + + memset(nesadapter->pft_mcast_map, 255, + sizeof nesadapter->pft_mcast_map); + + /* populate the new nesadapter */ + nesadapter->nesdev = nesdev; + nesadapter->devfn = nesdev->pcidev->devfn; + nesadapter->bus_number = nesdev->pcidev->bus->number; + nesadapter->ref_count = 1; + nesadapter->timer_int_req = 0xffff0000; + nesadapter->OneG_Mode = OneG_Mode; + nesadapter->doorbell_start = nesdev->doorbell_region; + + /* nesadapter->tick_delta = clk_divisor; */ + nesadapter->hw_rev = hw_rev; + nesadapter->port_count = port_count; + + nesadapter->max_qp = max_qp; + nesadapter->hte_index_mask = hte_index_mask; + nesadapter->max_irrq = max_irrq; + nesadapter->max_mr = max_mr; + nesadapter->max_256pbl = max_256pbl - 1; + nesadapter->max_4kpbl = max_4kpbl - 1; + nesadapter->max_cq = max_cq; + nesadapter->free_256pbl = max_256pbl - 1; + nesadapter->free_4kpbl = max_4kpbl - 1; + nesadapter->max_pd = num_pds; + nesadapter->arp_table_size = arp_table_size; + + nesadapter->et_pkt_rate_low = NES_TIMER_ENABLE_LIMIT; + if (nes_drv_opt & NES_DRV_OPT_DISABLE_INT_MOD) { + nesadapter->et_use_adaptive_rx_coalesce = 0; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; + nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval; + } else { + nesadapter->et_use_adaptive_rx_coalesce = 1; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; + nesadapter->et_rx_coalesce_usecs_irq = 0; + printk(PFX "%s: Using Adaptive Interrupt Moderation\n", __func__); + } + /* Setup and enable the periodic timer */ + if (nesadapter->et_rx_coalesce_usecs_irq) + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 | + ((u32)(nesadapter->et_rx_coalesce_usecs_irq * 8))); + else + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x00000000); + + 
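/* the allocated_* bitmaps and qp_table assigned below are carved out of the tail of the single adapter_size allocation sized with BITS_TO_LONGS() above */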
nesadapter->base_pd = 1; + + nesadapter->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS; + + nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter) + [(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]); + nesadapter->allocated_cqs = &nesadapter->allocated_qps[BITS_TO_LONGS(max_qp)]; + nesadapter->allocated_mrs = &nesadapter->allocated_cqs[BITS_TO_LONGS(max_cq)]; + nesadapter->allocated_pds = &nesadapter->allocated_mrs[BITS_TO_LONGS(max_mr)]; + nesadapter->allocated_arps = &nesadapter->allocated_pds[BITS_TO_LONGS(num_pds)]; + nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]); + + + /* mark the usual suspect QPs, MR and CQs as in use */ + for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) { + set_bit(u32temp, nesadapter->allocated_qps); + set_bit(u32temp, nesadapter->allocated_cqs); + } + set_bit(0, nesadapter->allocated_mrs); + + for (u32temp = 0; u32temp < 20; u32temp++) + set_bit(u32temp, nesadapter->allocated_pds); + u32temp = nes_read_indexed(nesdev, NES_IDX_QP_MAX_CFG_SIZES); + + max_rq_wrs = ((u32temp >> 8) & 3); + switch (max_rq_wrs) { + case 0: + max_rq_wrs = 4; + break; + case 1: + max_rq_wrs = 16; + break; + case 2: + max_rq_wrs = 32; + break; + case 3: + max_rq_wrs = 512; + break; + } + + max_sq_wrs = (u32temp & 3); + switch (max_sq_wrs) { + case 0: + max_sq_wrs = 4; + break; + case 1: + max_sq_wrs = 16; + break; + case 2: + max_sq_wrs = 32; + break; + case 3: + max_sq_wrs = 512; + break; + } + nesadapter->max_qp_wr = min(max_rq_wrs, max_sq_wrs); + nesadapter->max_irrq_wr = (u32temp >> 16) & 3; + + nesadapter->max_sge = 4; + nesadapter->max_cqe = 32766; + + if (nes_read_eeprom_values(nesdev, nesadapter)) { + printk(KERN_ERR PFX "Unable to read EEPROM data.\n"); + kfree(nesadapter); + return NULL; + } + + u32temp = nes_read_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG); + nes_write_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG, + (u32temp & 0xff000000) | (nesadapter->tcp_timer_core_clk_divisor & 0x00ffffff)); + + /* setup port configuration */ + if (nesadapter->port_count == 1) { + nesadapter->log_port = 0x00000000; + if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000002); + else + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003); + } else { + if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) { + nesadapter->log_port = 0x000000D8; + } else { + if (nesadapter->port_count == 2) + nesadapter->log_port = 0x00000044; + else + nesadapter->log_port = 0x000000e4; + } + nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003); + } + + nes_write_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT, + nesadapter->log_port); + nes_debug(NES_DBG_INIT, "Probe time, LOG2PHY=%u\n", + nes_read_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT)); + + spin_lock_init(&nesadapter->resource_lock); + spin_lock_init(&nesadapter->phy_lock); + spin_lock_init(&nesadapter->pbl_lock); + spin_lock_init(&nesadapter->periodic_timer_lock); + + INIT_LIST_HEAD(&nesadapter->nesvnic_list[0]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[1]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[2]); + INIT_LIST_HEAD(&nesadapter->nesvnic_list[3]); + + if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) { + u32 pcs_control_status0, pcs_control_status1; + u32 reset_value; + u32 i = 0; + u32 int_cnt = 0; + u32 ext_cnt = 0; + unsigned long flags; + u32 j = 0; + + pcs_control_status0 = 
nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + + for (i = 0; i < NES_MAX_LINK_CHECK; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if ((0x0F000100 == (pcs_control_status0 & 0x0F000100)) + || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) + int_cnt++; + usleep_range(1000, 2000); + } + if (int_cnt > 1) { + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + mh_detected++; + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + reset_value |= 0x0000003d; + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (j++ < 5000)); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + + for (i = 0; i < NES_MAX_LINK_CHECK; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if ((0x0F000100 == (pcs_control_status0 & 0x0F000100)) + || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) { + if (++ext_cnt > int_cnt) { + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, + 0x0000F088); + mh_detected++; + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + reset_value |= 0x0000003d; + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (j++ < 5000)); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + break; + } + } + usleep_range(1000, 2000); + } + } + } + + if (nesadapter->hw_rev == NE020_REV) { + timer_setup(&nesadapter->mh_timer, nes_mh_fix, 0); + nesadapter->mh_timer.expires = jiffies + (HZ/5); /* 1 second */ + add_timer(&nesadapter->mh_timer); + } else { + nes_write32(nesdev->regs+NES_INTF_INT_STAT, 0x0f000000); + } + + timer_setup(&nesadapter->lc_timer, nes_clc, 0); + nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */ + add_timer(&nesadapter->lc_timer); + + list_add_tail(&nesadapter->list, &nes_adapter_list); + + for (func_index = 0; func_index < 8; func_index++) { + pci_bus_read_config_word(nesdev->pcidev->bus, + PCI_DEVFN(PCI_SLOT(nesdev->pcidev->devfn), + func_index), 0, &vendor_id); + if (vendor_id == 0xffff) + break; + } + nes_debug(NES_DBG_INIT, "%s %d functions found for %s.\n", __func__, + func_index, pci_name(nesdev->pcidev)); + nesadapter->adapter_fcn_count = func_index; + + return nesadapter; +} + + +/** + * nes_reset_adapter_ne020 + */ +static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode) +{ + u32 port_count; + u32 u32temp; + u32 i; + + u32temp = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + port_count = ((u32temp & 0x00000300) >> 8) + 1; + /* TODO: assuming that both SERDES are set the same for now */ + *OneG_Mode = (u32temp & 0x00003c00) ? 
0 : 1; + nes_debug(NES_DBG_INIT, "Initial Software Reset = 0x%08X, port_count=%u\n", + u32temp, port_count); + if (*OneG_Mode) + nes_debug(NES_DBG_INIT, "Running in 1G mode.\n"); + u32temp &= 0xff00ffc0; + switch (port_count) { + case 1: + u32temp |= 0x00ee0000; + break; + case 2: + u32temp |= 0x00cc0000; + break; + case 4: + u32temp |= 0x00000000; + break; + default: + return 0; + break; + } + + /* check and do full reset if needed */ + if (nes_read_indexed(nesdev, NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))) { + nes_debug(NES_DBG_INIT, "Issuing Full Soft reset = 0x%08X\n", u32temp | 0xd); + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd); + + i = 0; + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000) + mdelay(1); + if (i > 10000) { + nes_debug(NES_DBG_INIT, "Did not see full soft reset done.\n"); + return 0; + } + + i = 0; + while ((nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS) != 0x80) && i++ < 10000) + mdelay(1); + if (i > 10000) { + printk(KERN_ERR PFX "Internal CPU not ready, status = %02X\n", + nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS)); + return 0; + } + } + + /* port reset */ + switch (port_count) { + case 1: + u32temp |= 0x00ee0010; + break; + case 2: + u32temp |= 0x00cc0030; + break; + case 4: + u32temp |= 0x00000030; + break; + } + + nes_debug(NES_DBG_INIT, "Issuing Port Soft reset = 0x%08X\n", u32temp | 0xd); + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd); + + i = 0; + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000) + mdelay(1); + if (i > 10000) { + nes_debug(NES_DBG_INIT, "Did not see port soft reset done.\n"); + return 0; + } + + /* serdes 0 */ + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i > 5000) { + nes_debug(NES_DBG_INIT, "Serdes 0 not ready, status=%x\n", u32temp); + return 0; + } + + /* serdes 1 */ + if (port_count > 1) { + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i > 5000) { + nes_debug(NES_DBG_INIT, "Serdes 1 not ready, status=%x\n", u32temp); + return 0; + } + } + + return port_count; +} + + +/** + * nes_init_serdes + */ +static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count, + struct nes_adapter *nesadapter, u8 OneG_Mode) +{ + int i; + u32 u32temp; + u32 sds; + + if (hw_rev != NE020_REV) { + /* init serdes 0 */ + switch (nesadapter->phy_type[0]) { + case NES_PHY_TYPE_CX4: + if (wide_ppm_offset) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + break; + case NES_PHY_TYPE_KR: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); + break; + case NES_PHY_TYPE_PUMA_1G: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0); + sds |= 0x00000100; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds); + break; + default: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF); + break; + } + + if (!OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000); + + if (port_count < 2) + return 0; + + /* init serdes 1 */ + if (!(OneG_Mode && 
(nesadapter->phy_type[1] != NES_PHY_TYPE_PUMA_1G))) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF); + + switch (nesadapter->phy_type[1]) { + case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); + break; + case NES_PHY_TYPE_CX4: + if (wide_ppm_offset) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA); + break; + case NES_PHY_TYPE_KR: + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000); + break; + case NES_PHY_TYPE_PUMA_1G: + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + sds |= 0x000000100; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); + } + if (!OneG_Mode) { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000); + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + sds &= 0xFFFFFFBF; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds); + } + } else { + /* init serdes 0 */ + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0) + & 0x0000000f)) != 0x0000000f) && i++ < 5000) + mdelay(1); + if (i > 5000) { + nes_debug(NES_DBG_PHY, "Init: serdes 0 not ready, status=%x\n", u32temp); + return 1; + } + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000); + if (OneG_Mode) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222); + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff); + if (port_count > 1) { + /* init serdes 1 */ + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x00000048); + i = 0; + while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1) + & 0x0000000f)) != 0x0000000f) && (i++ < 5000)) + mdelay(1); + if (i > 5000) { + printk("%s: Init: serdes 1 not ready, status=%x\n", __func__, u32temp); + /* return 1; */ + } + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE1, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE1, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL1, 0xf0002222); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000ff); + } + } + return 0; +} + + +/** + * nes_init_csr_ne020 + * Initialize registers for ne020 hardware + */ +static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count) +{ + u32 u32temp; + + nes_debug(NES_DBG_INIT, "port_count=%d\n", port_count); + + nes_write_indexed(nesdev, 0x000001E4, 0x00000007); + /* nes_write_indexed(nesdev, 0x000001E8, 0x000208C4); */ + nes_write_indexed(nesdev, 0x000001E8, 0x00020874); + nes_write_indexed(nesdev, 
0x000001D8, 0x00048002); + /* nes_write_indexed(nesdev, 0x000001D8, 0x0004B002); */ + nes_write_indexed(nesdev, 0x000001FC, 0x00050005); + nes_write_indexed(nesdev, 0x00000600, 0x55555555); + nes_write_indexed(nesdev, 0x00000604, 0x55555555); + + /* TODO: move these MAC register settings to NIC bringup */ + nes_write_indexed(nesdev, 0x00002000, 0x00000001); + nes_write_indexed(nesdev, 0x00002004, 0x00000001); + nes_write_indexed(nesdev, 0x00002008, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000200C, 0x00000001); + nes_write_indexed(nesdev, 0x00002010, 0x000003c1); + nes_write_indexed(nesdev, 0x0000201C, 0x75345678); + if (port_count > 1) { + nes_write_indexed(nesdev, 0x00002200, 0x00000001); + nes_write_indexed(nesdev, 0x00002204, 0x00000001); + nes_write_indexed(nesdev, 0x00002208, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000220C, 0x00000001); + nes_write_indexed(nesdev, 0x00002210, 0x000003c1); + nes_write_indexed(nesdev, 0x0000221C, 0x75345678); + nes_write_indexed(nesdev, 0x00000908, 0x20000001); + } + if (port_count > 2) { + nes_write_indexed(nesdev, 0x00002400, 0x00000001); + nes_write_indexed(nesdev, 0x00002404, 0x00000001); + nes_write_indexed(nesdev, 0x00002408, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000240C, 0x00000001); + nes_write_indexed(nesdev, 0x00002410, 0x000003c1); + nes_write_indexed(nesdev, 0x0000241C, 0x75345678); + nes_write_indexed(nesdev, 0x00000910, 0x20000001); + + nes_write_indexed(nesdev, 0x00002600, 0x00000001); + nes_write_indexed(nesdev, 0x00002604, 0x00000001); + nes_write_indexed(nesdev, 0x00002608, 0x0000FFFF); + nes_write_indexed(nesdev, 0x0000260C, 0x00000001); + nes_write_indexed(nesdev, 0x00002610, 0x000003c1); + nes_write_indexed(nesdev, 0x0000261C, 0x75345678); + nes_write_indexed(nesdev, 0x00000918, 0x20000001); + } + + nes_write_indexed(nesdev, 0x00005000, 0x00018000); + /* nes_write_indexed(nesdev, 0x00005000, 0x00010000); */ + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, (wqm_quanta << 1) | + 0x00000001); + nes_write_indexed(nesdev, 0x00005008, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005010, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005018, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00005020, 0x1F1F1F1F); + nes_write_indexed(nesdev, 0x00006090, 0xFFFFFFFF); + + /* TODO: move this to code, get from EEPROM */ + nes_write_indexed(nesdev, 0x00000900, 0x20000001); + nes_write_indexed(nesdev, 0x000060C0, 0x0000028e); + nes_write_indexed(nesdev, 0x000060C8, 0x00000020); + + nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0); + /* nes_write_indexed(nesdev, 0x000001EC, 0x5f2625a0); */ + + if (hw_rev != NE020_REV) { + u32temp = nes_read_indexed(nesdev, 0x000008e8); + u32temp |= 0x80000000; + nes_write_indexed(nesdev, 0x000008e8, u32temp); + u32temp = nes_read_indexed(nesdev, 0x000021f8); + u32temp &= 0x7fffffff; + u32temp |= 0x7fff0010; + nes_write_indexed(nesdev, 0x000021f8, u32temp); + if (port_count > 1) { + u32temp = nes_read_indexed(nesdev, 0x000023f8); + u32temp &= 0x7fffffff; + u32temp |= 0x7fff0010; + nes_write_indexed(nesdev, 0x000023f8, u32temp); + } + } +} + + +/** + * nes_destroy_adapter - destroy the adapter structure + */ +void nes_destroy_adapter(struct nes_adapter *nesadapter) +{ + struct nes_adapter *tmp_adapter; + + list_for_each_entry(tmp_adapter, &nes_adapter_list, list) { + nes_debug(NES_DBG_SHUTDOWN, "Nes Adapter list entry = 0x%p.\n", + tmp_adapter); + } + + nesadapter->ref_count--; + if (!nesadapter->ref_count) { + if (nesadapter->hw_rev == NE020_REV) { + del_timer(&nesadapter->mh_timer); + } + 
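+		/* Note: lc_timer was armed unconditionally at init time (mh_timer only on NE020_REV hardware), so it is deleted for every revision. */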
del_timer(&nesadapter->lc_timer); + + list_del(&nesadapter->list); + kfree(nesadapter); + } +} + + +/** + * nes_init_cqp + */ +int nes_init_cqp(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_qp_context *cqp_qp_context; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_ceq *ceq; + struct nes_hw_ceq *nic_ceq; + struct nes_hw_aeq *aeq; + void *vmem; + dma_addr_t pmem; + u32 count=0; + u32 cqp_head; + u64 u64temp; + u32 u32temp; + + /* allocate CQP memory */ + /* Need to add max_cq to the aeq size once cq overflow checking is added back */ + /* SQ is 512 byte aligned, others are 256 byte aligned */ + nesdev->cqp_mem_size = 512 + + (sizeof(struct nes_hw_cqp_wqe) * NES_CQP_SQ_SIZE) + + (sizeof(struct nes_hw_cqe) * NES_CCQ_SIZE) + + max(((u32)sizeof(struct nes_hw_ceqe) * NES_CCEQ_SIZE), (u32)256) + + max(((u32)sizeof(struct nes_hw_ceqe) * NES_NIC_CEQ_SIZE), (u32)256) + + (sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) + + sizeof(struct nes_hw_cqp_qp_context); + + nesdev->cqp_vbase = pci_zalloc_consistent(nesdev->pcidev, + nesdev->cqp_mem_size, + &nesdev->cqp_pbase); + if (!nesdev->cqp_vbase) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory for host descriptor rings\n"); + return -ENOMEM; + } + + /* Allocate a twice the number of CQP requests as the SQ size */ + nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) * + 2 * NES_CQP_SQ_SIZE, GFP_KERNEL); + if (!nesdev->nes_cqp_requests) { + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase, + nesdev->cqp.sq_pbase); + return -ENOMEM; + } + + nes_debug(NES_DBG_INIT, "Allocated CQP structures at %p (phys = %016lX), size = %u.\n", + nesdev->cqp_vbase, (unsigned long)nesdev->cqp_pbase, nesdev->cqp_mem_size); + + spin_lock_init(&nesdev->cqp.lock); + init_waitqueue_head(&nesdev->cqp.waitq); + + /* Setup Various Structures */ + vmem = (void *)(((unsigned long)nesdev->cqp_vbase + (512 - 1)) & + ~(unsigned long)(512 - 1)); + pmem = (dma_addr_t)(((unsigned long long)nesdev->cqp_pbase + (512 - 1)) & + ~(unsigned long long)(512 - 1)); + + nesdev->cqp.sq_vbase = vmem; + nesdev->cqp.sq_pbase = pmem; + nesdev->cqp.sq_size = NES_CQP_SQ_SIZE; + nesdev->cqp.sq_head = 0; + nesdev->cqp.sq_tail = 0; + nesdev->cqp.qp_id = PCI_FUNC(nesdev->pcidev->devfn); + + vmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size); + pmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size); + + nesdev->ccq.cq_vbase = vmem; + nesdev->ccq.cq_pbase = pmem; + nesdev->ccq.cq_size = NES_CCQ_SIZE; + nesdev->ccq.cq_head = 0; + nesdev->ccq.ce_handler = nes_cqp_ce_handler; + nesdev->ccq.cq_number = PCI_FUNC(nesdev->pcidev->devfn); + + vmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size); + pmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size); + + nesdev->ceq_index = PCI_FUNC(nesdev->pcidev->devfn); + ceq = &nesadapter->ceq[nesdev->ceq_index]; + ceq->ceq_vbase = vmem; + ceq->ceq_pbase = pmem; + ceq->ceq_size = NES_CCEQ_SIZE; + ceq->ceq_head = 0; + + vmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256); + pmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256); + + nesdev->nic_ceq_index = PCI_FUNC(nesdev->pcidev->devfn) + 8; + nic_ceq = &nesadapter->ceq[nesdev->nic_ceq_index]; + nic_ceq->ceq_vbase = vmem; + nic_ceq->ceq_pbase = pmem; + nic_ceq->ceq_size = NES_NIC_CEQ_SIZE; + nic_ceq->ceq_head = 0; + + vmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256); + pmem += max(((u32)sizeof(struct nes_hw_ceqe) * 
nic_ceq->ceq_size), (u32)256); + + aeq = &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]; + aeq->aeq_vbase = vmem; + aeq->aeq_pbase = pmem; + aeq->aeq_size = nesadapter->max_qp; + aeq->aeq_head = 0; + + /* Setup QP Context */ + vmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size); + pmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size); + + cqp_qp_context = vmem; + cqp_qp_context->context_words[0] = + cpu_to_le32((PCI_FUNC(nesdev->pcidev->devfn) << 12) + (2 << 10)); + cqp_qp_context->context_words[1] = 0; + cqp_qp_context->context_words[2] = cpu_to_le32((u32)nesdev->cqp.sq_pbase); + cqp_qp_context->context_words[3] = cpu_to_le32(((u64)nesdev->cqp.sq_pbase) >> 32); + + + /* Write the address to Create CQP */ + if ((sizeof(dma_addr_t) > 4)) { + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), + ((u64)pmem) >> 32); + } else { + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), 0); + } + nes_write_indexed(nesdev, + NES_IDX_CREATE_CQP_LOW + (PCI_FUNC(nesdev->pcidev->devfn) * 8), + (u32)pmem); + + INIT_LIST_HEAD(&nesdev->cqp_avail_reqs); + INIT_LIST_HEAD(&nesdev->cqp_pending_reqs); + + for (count = 0; count < 2*NES_CQP_SQ_SIZE; count++) { + init_waitqueue_head(&nesdev->nes_cqp_requests[count].waitq); + list_add_tail(&nesdev->nes_cqp_requests[count].list, &nesdev->cqp_avail_reqs); + } + + /* Write Create CCQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + NES_CQP_CQ_CHK_OVERFLOW | ((u32)nesdev->ccq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nesdev->ccq.cq_number | + ((u32)nesdev->ceq_index << 16))); + u64temp = (u64)nesdev->ccq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&nesdev->ccq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = + cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + + /* Write Create CEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_CEQ + ((u32)nesdev->ceq_index << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, ceq->ceq_size); + u64temp = (u64)ceq->ceq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Write Create AEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_CREATE_AEQ + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX, aeq->aeq_size); + u64temp = (u64)aeq->aeq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Write Create NIC CEQ WQE */ + cqp_head = nesdev->cqp.sq_head++; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + 
(NES_CQP_CREATE_CEQ + ((u32)nesdev->nic_ceq_index << 8))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, nic_ceq->ceq_size); + u64temp = (u64)nic_ceq->ceq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + + /* Poll until CCQP done */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Error creating CQP\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, + nesdev->cqp_vbase, nesdev->cqp_pbase); + return -1; + } + udelay(10); + } while (!(nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn) * 8)) & (1 << 8))); + + nes_debug(NES_DBG_INIT, "CQP Status = 0x%08X\n", nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + u32temp = 0x04800000; + nes_write32(nesdev->regs+NES_WQE_ALLOC, u32temp | nesdev->cqp.qp_id); + + /* wait for the CCQ, CEQ, and AEQ to get created */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Error creating CCQ, CEQ, and AEQ\n"); + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, + nesdev->cqp_vbase, nesdev->cqp_pbase); + return -1; + } + udelay(10); + } while (((nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15<<8)) != (15<<8))); + + /* dump the QP status value */ + nes_debug(NES_DBG_INIT, "QP Status = 0x%08X\n", nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + nesdev->cqp.sq_tail++; + + return 0; +} + + +/** + * nes_destroy_cqp + */ +int nes_destroy_cqp(struct nes_device *nesdev) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + u32 count = 0; + u32 cqp_head; + unsigned long flags; + + do { + if (count++ > 1000) + break; + udelay(10); + } while (!(nesdev->cqp.sq_head == nesdev->cqp.sq_tail)); + + /* Reset CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_RESET | + nesdev->ccq.cq_number); + + /* Disable device interrupts */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff); + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy the AEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_AEQ | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8)); + cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0; + + /* Destroy the NIC CEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ | + ((u32)nesdev->nic_ceq_index << 8)); + + /* Destroy the CEQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ | + (nesdev->ceq_index << 8)); + + /* Destroy the CCQ */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CQ); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->ccq.cq_number | + ((u32)nesdev->ceq_index << 16)); + + /* Destroy CQP */ + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_QP | + NES_CQP_QP_TYPE_CQP); + 
cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->cqp.qp_id); + + barrier(); + /* Ring doorbell (5 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x05800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + /* wait for the CCQ, CEQ, and AEQ to get destroyed */ + count = 0; + do { + if (count++ > 1000) { + printk(KERN_ERR PFX "Function%d: Error destroying CCQ, CEQ, and AEQ\n", + PCI_FUNC(nesdev->pcidev->devfn)); + break; + } + udelay(10); + } while (((nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15 << 8)) != 0)); + + /* dump the QP status value */ + nes_debug(NES_DBG_SHUTDOWN, "Function%d: QP Status = 0x%08X\n", + PCI_FUNC(nesdev->pcidev->devfn), + nes_read_indexed(nesdev, + NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))); + + kfree(nesdev->nes_cqp_requests); + + /* Free the control structures */ + pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase, + nesdev->cqp.sq_pbase); + + return 0; +} + + +/** + * nes_init_1g_phy + */ +static int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index) +{ + u32 counter = 0; + u16 phy_data; + int ret = 0; + + nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000); + + /* Reset the PHY */ + nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000); + udelay(100); + counter = 0; + do { + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + if (counter++ > 100) { + ret = -1; + break; + } + } while (phy_data & 0x8000); + + /* Setting no phy loopback */ + phy_data &= 0xbfff; + phy_data |= 0x1140; + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data); + nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data); + + /* Setting the interrupt mask */ + nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee); + nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data); + + /* turning on flow control */ + nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00); + nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data); + + /* Clear Half duplex */ + nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100)); + nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data); + + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300); + + return ret; +} + + +/** + * nes_init_2025_phy + */ +static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index) +{ + u32 temp_phy_data = 0; + u32 temp_phy_data2 = 0; + u32 counter = 0; + u32 sds; + u32 mac_index = nesdev->mac_index; + int ret = 0; + unsigned int first_attempt = 1; + + /* Check firmware heartbeat */ + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + udelay(1500); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + if (temp_phy_data != temp_phy_data2) { + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if ((temp_phy_data & 0xff) > 0x20) + return 0; + printk(PFX "Reinitialize external PHY\n"); + } + + /* no heartbeat, 
configure the PHY */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + + switch (phy_type) { + case NES_PHY_TYPE_ARGUS: + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); + + /* setup LEDs */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009); + break; + + case NES_PHY_TYPE_SFP_D: + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); + + /* setup LEDs */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009); + break; + + case NES_PHY_TYPE_KR: + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0010); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0080); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00); + + /* setup LEDs */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x000B); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x0003); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0004); + + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0022, 0x406D); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0023, 0x0020); + break; + } + + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528); + + /* Bring PHY out of reset */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002); + + /* Check for heartbeat */ + counter = 0; + mdelay(690); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + do { + if (counter++ > 150) { + printk(PFX "No PHY heartbeat\n"); + break; + } + mdelay(1); + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); + temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + } while ((temp_phy_data2 == temp_phy_data)); + + /* wait for tracking */ + counter = 0; + do { + nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if (counter++ > 300) { + if (((temp_phy_data & 0xff) == 0x0) && first_attempt) { + first_attempt = 0; + counter = 0; + /* reset AMCC PHY and try again */ + 
nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0); + nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040); + continue; + } else { + ret = 1; + break; + } + } + mdelay(10); + } while ((temp_phy_data & 0xff) < 0x30); + + /* setup signal integrity */ + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032); + if (phy_type == NES_PHY_TYPE_KR) { + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x000C); + } else { + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002); + nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063); + } + + /* reset serdes */ + sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200); + sds |= 0x1; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds); + sds &= 0xfffffffe; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds); + + counter = 0; + while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040) + && (counter++ < 5000)) + ; + + return ret; +} + + +/** + * nes_init_phy + */ +int nes_init_phy(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 mac_index = nesdev->mac_index; + u32 tx_config = 0; + unsigned long flags; + u8 phy_type = nesadapter->phy_type[mac_index]; + u8 phy_index = nesadapter->phy_index[mac_index]; + int ret = 0; + + tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); + if (phy_type == NES_PHY_TYPE_1G) { + /* setup 1G MDIO operation */ + tx_config &= 0xFFFFFFE3; + tx_config |= 0x04; + } else { + /* setup 10G MDIO operation */ + tx_config &= 0xFFFFFFE3; + tx_config |= 0x1D; + } + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); + + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + + switch (phy_type) { + case NES_PHY_TYPE_1G: + ret = nes_init_1g_phy(nesdev, phy_type, phy_index); + break; + case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: + case NES_PHY_TYPE_KR: + ret = nes_init_2025_phy(nesdev, phy_type, phy_index); + break; + } + + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + + return ret; +} + + +/** + * nes_replenish_nic_rq + */ +static void nes_replenish_nic_rq(struct nes_vnic *nesvnic) +{ + unsigned long flags; + dma_addr_t bus_address; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_nic *nesnic; + struct nes_device *nesdev; + struct nes_rskb_cb *cb; + u32 rx_wqes_posted = 0; + + nesnic = &nesvnic->nic; + nesdev = nesvnic->nesdev; + spin_lock_irqsave(&nesnic->rq_lock, flags); + if (nesnic->replenishing_rq !=0) { + if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) && + (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) { + atomic_set(&nesvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */ + add_timer(&nesvnic->rq_wqes_timer); + } else + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + return; + } + nesnic->replenishing_rq = 1; + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + do { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (skb) { + skb->dev = nesvnic->netdev; + + bus_address = pci_map_single(nesdev->pcidev, + skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = bus_address; + cb->maplen = nesvnic->max_frame_size; + + nic_rqe = 
&nesnic->rq_vbase[nesvnic->nic.rq_head]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = + cpu_to_le32(nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)bus_address); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)bus_address >> 32)); + nesnic->rx_skb[nesnic->rq_head] = skb; + nesnic->rq_head++; + nesnic->rq_head &= nesnic->rq_size - 1; + atomic_dec(&nesvnic->rx_skbs_needed); + barrier(); + if (++rx_wqes_posted == 255) { + nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id); + rx_wqes_posted = 0; + } + } else { + spin_lock_irqsave(&nesnic->rq_lock, flags); + if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) && + (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) { + atomic_set(&nesvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */ + add_timer(&nesvnic->rq_wqes_timer); + } else + spin_unlock_irqrestore(&nesnic->rq_lock, flags); + break; + } + } while (atomic_read(&nesvnic->rx_skbs_needed)); + barrier(); + if (rx_wqes_posted) + nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id); + nesnic->replenishing_rq = 0; +} + + +/** + * nes_rq_wqes_timeout + */ +static void nes_rq_wqes_timeout(struct timer_list *t) +{ + struct nes_vnic *nesvnic = from_timer(nesvnic, t, rq_wqes_timer); + printk("%s: Timer fired.\n", __func__); + atomic_set(&nesvnic->rx_skb_timer_running, 0); + if (atomic_read(&nesvnic->rx_skbs_needed)) + nes_replenish_nic_rq(nesvnic); +} + + +/** + * nes_init_nic_qp + */ +int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct nes_hw_nic_qp_context *nic_context; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_vnic *nesvnic = netdev_priv(netdev); + unsigned long flags; + void *vmem; + dma_addr_t pmem; + u64 u64temp; + int ret; + u32 cqp_head; + u32 counter; + u32 wqe_count; + struct nes_rskb_cb *cb; + u8 jumbomode=0; + + /* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */ + nesvnic->nic_mem_size = 256 + + (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)) + + (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)) + + (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)) + + (NES_NIC_WQ_SIZE * 2 * sizeof(struct nes_hw_nic_cqe)) + + sizeof(struct nes_hw_nic_qp_context); + + nesvnic->nic_vbase = pci_zalloc_consistent(nesdev->pcidev, + nesvnic->nic_mem_size, + &nesvnic->nic_pbase); + if (!nesvnic->nic_vbase) { + nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n"); + return -ENOMEM; + } + nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n", + nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size); + + vmem = (void *)(((unsigned long)nesvnic->nic_vbase + (256 - 1)) & + ~(unsigned long)(256 - 1)); + pmem = (dma_addr_t)(((unsigned long long)nesvnic->nic_pbase + (256 - 1)) & + ~(unsigned long long)(256 - 1)); + + /* Setup the first Fragment buffers */ + nesvnic->nic.first_frag_vbase = vmem; + + for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) { + nesvnic->nic.frag_paddr[counter] = pmem; + pmem += sizeof(struct nes_first_frag); + } + + /* setup the SQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)); + + nesvnic->nic.sq_vbase = 
(void *)vmem; + nesvnic->nic.sq_pbase = pmem; + nesvnic->nic.sq_head = 0; + nesvnic->nic.sq_tail = 0; + nesvnic->nic.sq_size = NES_NIC_WQ_SIZE; + for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) { + nic_sqe = &nesvnic->nic.sq_vbase[counter]; + nic_sqe->wqe_words[NES_NIC_SQ_WQE_MISC_IDX] = + cpu_to_le32(NES_NIC_SQ_WQE_DISABLE_CHKSUM | + NES_NIC_SQ_WQE_COMPLETION); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX] = + cpu_to_le32((u32)NES_FIRST_FRAG_SIZE << 16); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)nesvnic->nic.frag_paddr[counter]); + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)nesvnic->nic.frag_paddr[counter] >> 32)); + } + + nesvnic->get_cqp_request = nes_get_cqp_request; + nesvnic->post_cqp_request = nes_post_cqp_request; + nesvnic->mcrq_mcast_filter = NULL; + + spin_lock_init(&nesvnic->nic.rq_lock); + + /* setup the RQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)); + pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)); + + + nesvnic->nic.rq_vbase = vmem; + nesvnic->nic.rq_pbase = pmem; + nesvnic->nic.rq_head = 0; + nesvnic->nic.rq_tail = 0; + nesvnic->nic.rq_size = NES_NIC_WQ_SIZE; + + /* setup the CQ */ + vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)); + pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)); + + if (nesdev->nesadapter->netdev_count > 2) + nesvnic->mcrq_qp_id = nesvnic->nic_index + 32; + else + nesvnic->mcrq_qp_id = nesvnic->nic.qp_id + 4; + + nesvnic->nic_cq.cq_vbase = vmem; + nesvnic->nic_cq.cq_pbase = pmem; + nesvnic->nic_cq.cq_head = 0; + nesvnic->nic_cq.cq_size = NES_NIC_WQ_SIZE * 2; + + nesvnic->nic_cq.ce_handler = nes_nic_napi_ce_handler; + + /* Send CreateCQ request to CQP */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + cqp_head = nesdev->cqp.sq_head; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + ((u32)nesvnic->nic_cq.cq_size << 16)); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32( + nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16)); + u64temp = (u64)nesvnic->nic_cq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&nesvnic->nic_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* Send CreateQP request to CQP */ + nic_context = (void *)(&nesvnic->nic_cq.cq_vbase[nesvnic->nic_cq.cq_size]); + nic_context->context_words[NES_NIC_CTX_MISC_IDX] = + cpu_to_le32((u32)NES_NIC_CTX_SIZE | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12)); + nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE), + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE)); + if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) { + nic_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE); + } + + u64temp = (u64)nesvnic->nic.sq_pbase; + 
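+	/* Store the SQ and RQ DMA base addresses into the NIC QP context as low/high 32-bit halves. */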
nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)nesvnic->nic.rq_pbase; + nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | + NES_CQP_QP_TYPE_NIC); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesvnic->nic.qp_id); + u64temp = (u64)nesvnic->nic_cq.cq_pbase + + (nesvnic->nic_cq.cq_size * sizeof(struct nes_hw_nic_cqe)); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + nesdev->cqp.sq_head = cqp_head; + + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_INIT, "Waiting for create NIC QP%u to complete.\n", + nesvnic->nic.qp_id); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_INIT, "Create NIC QP%u completed, wait_event_timeout ret = %u.\n", + nesvnic->nic.qp_id, ret); + if (!ret) { + nes_debug(NES_DBG_INIT, "NIC QP%u create timeout expired\n", nesvnic->nic.qp_id); + pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase, + nesvnic->nic_pbase); + return -EIO; + } + + /* Populate the RQ */ + for (counter = 0; counter < (NES_NIC_WQ_SIZE - 1); counter++) { + skb = dev_alloc_skb(nesvnic->max_frame_size); + if (!skb) { + nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name); + + nes_destroy_nic_qp(nesvnic); + return -ENOMEM; + } + + skb->dev = netdev; + + pmem = pci_map_single(nesdev->pcidev, skb->data, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = pmem; + cb->maplen = nesvnic->max_frame_size; + + nic_rqe = &nesvnic->nic.rq_vbase[counter]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); + nesvnic->nic.rx_skb[counter] = skb; + } + + wqe_count = NES_NIC_WQ_SIZE - 1; + nesvnic->nic.rq_head = wqe_count; + barrier(); + do { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter << 24) | nesvnic->nic.qp_id); + } while (wqe_count); + timer_setup(&nesvnic->rq_wqes_timer, nes_rq_wqes_timeout, 0); + nes_debug(NES_DBG_INIT, "NAPI support Enabled\n"); + if (nesdev->nesadapter->et_use_adaptive_rx_coalesce) + { + nes_nic_init_timer(nesdev); + if (netdev->mtu > 1500) + jumbomode = 1; + nes_nic_init_timer_defaults(nesdev, jumbomode); + } + if ((nesdev->nesadapter->allow_unaligned_fpdus) && + (nes_init_mgt_qp(nesdev, netdev, nesvnic))) { + nes_debug(NES_DBG_INIT, "%s: Out of memory for pau nic\n", + netdev->name); + nes_destroy_nic_qp(nesvnic); + return -ENOMEM; + } + + return 0; +} + + +/** + * nes_destroy_nic_qp + */ +void nes_destroy_nic_qp(struct nes_vnic *nesvnic) +{ + u64 u64temp; + dma_addr_t bus_address; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + __le16 
*wqe_fragment_length; + u16 wqe_fragment_index; + u32 cqp_head; + u32 wqm_cfg0; + unsigned long flags; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; + int ret; + + if (nesdev->nesadapter->allow_unaligned_fpdus) + nes_destroy_mgt(nesvnic); + + /* clear wqe stall before destroying NIC QP */ + wqm_cfg0 = nes_read_indexed(nesdev, NES_IDX_WQM_CONFIG0); + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0 & 0xFFFF7FFF); + + /* Free remaining NIC receive buffers */ + while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) { + rx_skb = nesvnic->nic.rx_skb[nesvnic->nic.rq_tail]; + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, + PCI_DMA_FROMDEVICE); + + dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]); + nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1); + } + + /* Free remaining NIC transmit buffers */ + while (nesvnic->nic.sq_head != nesvnic->nic.sq_tail) { + nic_sqe = &nesvnic->nic.sq_vbase[nesvnic->nic.sq_tail]; + wqe_fragment_index = 1; + wqe_fragment_length = (__le16 *) + &nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* bump past the vlan tag */ + wqe_fragment_length++; + if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) { + u64temp = (u64)le32_to_cpu( + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+ + wqe_fragment_index*2]); + u64temp += ((u64)le32_to_cpu( + nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index*2]))<<32; + bus_address = (dma_addr_t)u64temp; + if (test_and_clear_bit(nesvnic->nic.sq_tail, + nesvnic->nic.first_frag_overflow)) { + pci_unmap_single(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[ + wqe_fragment_index++]), + PCI_DMA_TODEVICE); + } + for (; wqe_fragment_index < 5; wqe_fragment_index++) { + if (wqe_fragment_length[wqe_fragment_index]) { + u64temp = le32_to_cpu( + nic_sqe->wqe_words[ + NES_NIC_SQ_WQE_FRAG0_LOW_IDX+ + wqe_fragment_index*2]); + u64temp += ((u64)le32_to_cpu( + nic_sqe->wqe_words[ + NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+ + wqe_fragment_index*2]))<<32; + bus_address = (dma_addr_t)u64temp; + pci_unmap_page(nesdev->pcidev, + bus_address, + le16_to_cpu( + wqe_fragment_length[ + wqe_fragment_index]), + PCI_DMA_TODEVICE); + } else + break; + } + } + if (nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]) + dev_kfree_skb( + nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]); + + nesvnic->nic.sq_tail = (nesvnic->nic.sq_tail + 1) + & (nesvnic->nic.sq_size - 1); + } + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy NIC QP */ + cqp_head = nesdev->cqp.sq_head; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + nesvnic->nic.qp_id); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + + /* Destroy NIC CQ */ + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_CQ | ((u32)nesvnic->nic_cq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16))); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + nesdev->cqp.sq_head = cqp_head; + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + 
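+	/* The two destroy WQEs (NIC QP and NIC CQ) have been posted; wait for the CQP to consume them (sq_tail catching up to cqp_head) before freeing the queue memory. */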
nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u," + " cqp.sq_tail=%u, cqp.sq_size=%u\n", + cqp_head, nesdev->cqp.sq_head, + nesdev->cqp.sq_tail, nesdev->cqp.sq_size); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_SHUTDOWN, "Destroy NIC QP returned, wait_event_timeout ret = %u, cqp_head=%u," + " cqp.sq_head=%u, cqp.sq_tail=%u\n", + ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + if (!ret) { + nes_debug(NES_DBG_SHUTDOWN, "NIC QP%u destroy timeout expired\n", + nesvnic->nic.qp_id); + } + + pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase, + nesvnic->nic_pbase); + + /* restore old wqm_cfg0 value */ + nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0); +} + +/** + * nes_napi_isr + */ +int nes_napi_isr(struct nes_device *nesdev) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 int_stat; + + if (nesdev->napi_isr_ran) { + /* interrupt status has already been read in ISR */ + int_stat = nesdev->int_stat; + } else { + int_stat = nes_read32(nesdev->regs + NES_INT_STAT); + nesdev->int_stat = int_stat; + nesdev->napi_isr_ran = 1; + } + + int_stat &= nesdev->int_req; + /* iff NIC, process here, else wait for DPC */ + if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) { + nesdev->napi_isr_ran = 0; + nes_write32(nesdev->regs + NES_INT_STAT, + (int_stat & + ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3))); + + /* Process the CEQs */ + nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]); + + if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) && + (!nesadapter->et_use_adaptive_rx_coalesce)) || + ((nesadapter->et_use_adaptive_rx_coalesce) && + (nesdev->deepcq_count > nesadapter->et_pkt_rate_low))))) { + if ((nesdev->int_req & NES_INT_TIMER) == 0) { + /* Enable Periodic timer interrupts */ + nesdev->int_req |= NES_INT_TIMER; + /* ack any pending periodic timer interrupts so we don't get an immediate interrupt */ + /* TODO: need to also ack other unused periodic timer values, get from nesadapter */ + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INTF_INT_MASK, + ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER)); + } + + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + nes_nic_init_timer(nesdev); + } + /* Enable interrupts, except CEQs */ + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } else { + /* Enable interrupts, make sure timer is off */ + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + nesdev->deepcq_count = 0; + return 1; + } else { + return 0; + } +} + +static void process_critical_error(struct nes_device *nesdev) +{ + u32 debug_error; + u32 nes_idx_debug_error_masks0 = 0; + u16 error_module = 0; + + debug_error = nes_read_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS); + printk(KERN_ERR PFX "Critical Error reported by device!!! 
0x%02X\n", + (u16)debug_error); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS, + 0x01010000 | (debug_error & 0x0000ffff)); + if (crit_err_count++ > 10) + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 1 << 0x17); + error_module = (u16) (debug_error & 0x1F00) >> 8; + if (++nesdev->nesadapter->crit_error_count[error_module-1] >= + nes_max_critical_error_count) { + printk(KERN_ERR PFX "Masking off critical error for module " + "0x%02X\n", (u16)error_module); + nes_idx_debug_error_masks0 = nes_read_indexed(nesdev, + NES_IDX_DEBUG_ERROR_MASKS0); + nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, + nes_idx_debug_error_masks0 | (1 << error_module)); + } +} +/** + * nes_dpc + */ +void nes_dpc(unsigned long param) +{ + struct nes_device *nesdev = (struct nes_device *)param; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 counter; + u32 loop_counter = 0; + u32 int_status_bit; + u32 int_stat; + u32 timer_stat; + u32 temp_int_stat; + u32 intf_int_stat; + u32 processed_intf_int = 0; + u16 processed_timer_int = 0; + u16 completion_ints = 0; + u16 timer_ints = 0; + + /* nes_debug(NES_DBG_ISR, "\n"); */ + + do { + timer_stat = 0; + if (nesdev->napi_isr_ran) { + nesdev->napi_isr_ran = 0; + int_stat = nesdev->int_stat; + } else + int_stat = nes_read32(nesdev->regs+NES_INT_STAT); + if (processed_intf_int != 0) + int_stat &= nesdev->int_req & ~NES_INT_INTF; + else + int_stat &= nesdev->int_req; + if (processed_timer_int == 0) { + processed_timer_int = 1; + if (int_stat & NES_INT_TIMER) { + timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT); + if ((timer_stat & nesdev->timer_int_req) == 0) { + int_stat &= ~NES_INT_TIMER; + } + } + } else { + int_stat &= ~NES_INT_TIMER; + } + + if (int_stat) { + if (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0| + NES_INT_MAC1|NES_INT_MAC2 | NES_INT_MAC3)) { + /* Ack the interrupts */ + nes_write32(nesdev->regs+NES_INT_STAT, + (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0| + NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3))); + } + + temp_int_stat = int_stat; + for (counter = 0, int_status_bit = 1; counter < 16; counter++) { + if (int_stat & int_status_bit) { + nes_process_ceq(nesdev, &nesadapter->ceq[counter]); + temp_int_stat &= ~int_status_bit; + completion_ints = 1; + } + if (!(temp_int_stat & 0x0000ffff)) + break; + int_status_bit <<= 1; + } + + /* Process the AEQ for this pci function */ + int_status_bit = 1 << (16 + PCI_FUNC(nesdev->pcidev->devfn)); + if (int_stat & int_status_bit) { + nes_process_aeq(nesdev, &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]); + } + + /* Process the MAC interrupt for this pci function */ + int_status_bit = 1 << (24 + nesdev->mac_index); + if (int_stat & int_status_bit) { + nes_process_mac_intr(nesdev, nesdev->mac_index); + } + + if (int_stat & NES_INT_TIMER) { + if (timer_stat & nesdev->timer_int_req) { + nes_write32(nesdev->regs + NES_TIMER_STAT, + (timer_stat & nesdev->timer_int_req) | + ~(nesdev->nesadapter->timer_int_req)); + timer_ints = 1; + } + } + + if (int_stat & NES_INT_INTF) { + processed_intf_int = 1; + intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT); + intf_int_stat &= nesdev->intf_int_req; + if (NES_INTF_INT_CRITERR & intf_int_stat) { + process_critical_error(nesdev); + } + if (NES_INTF_INT_PCIERR & intf_int_stat) { + printk(KERN_ERR PFX "PCI Error reported by device!!!\n"); + BUG(); + } + if (NES_INTF_INT_AEQ_OFLOW & intf_int_stat) { + printk(KERN_ERR PFX "AEQ Overflow reported by device!!!\n"); + BUG(); + } + 
nes_write32(nesdev->regs+NES_INTF_INT_STAT, intf_int_stat); + } + + if (int_stat & NES_INT_TSW) { + } + } + /* Don't use the interface interrupt bit stay in loop */ + int_stat &= ~NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | + NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3; + } while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS)); + + if (timer_ints == 1) { + if ((nesadapter->et_rx_coalesce_usecs_irq) || (nesadapter->et_use_adaptive_rx_coalesce)) { + if (completion_ints == 0) { + nesdev->timer_only_int_count++; + if (nesdev->timer_only_int_count>=nesadapter->timer_int_limit) { + nesdev->timer_only_int_count = 0; + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs + NES_INT_MASK, ~nesdev->int_req); + } else { + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } + } else { + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + nes_nic_init_timer(nesdev); + } + nesdev->timer_only_int_count = 0; + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } + } else { + nesdev->timer_only_int_count = 0; + nesdev->int_req &= ~NES_INT_TIMER; + nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + } else { + if ( (completion_ints == 1) && + (((nesadapter->et_rx_coalesce_usecs_irq) && + (!nesadapter->et_use_adaptive_rx_coalesce)) || + ((nesdev->deepcq_count > nesadapter->et_pkt_rate_low) && + (nesadapter->et_use_adaptive_rx_coalesce) )) ) { + /* nes_debug(NES_DBG_ISR, "Enabling periodic timer interrupt.\n" ); */ + nesdev->timer_only_int_count = 0; + nesdev->int_req |= NES_INT_TIMER; + nes_write32(nesdev->regs+NES_TIMER_STAT, + nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req)); + nes_write32(nesdev->regs+NES_INTF_INT_MASK, + ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER)); + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); + } else { + nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + } + } + nesdev->deepcq_count = 0; +} + + +/** + * nes_process_ceq + */ +static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq) +{ + u64 u64temp; + struct nes_hw_cq *cq; + u32 head; + u32 ceq_size; + + /* nes_debug(NES_DBG_CQ, "\n"); */ + head = ceq->ceq_head; + ceq_size = ceq->ceq_size; + + do { + if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) & + NES_CEQE_VALID) { + u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]))) << 32) | + ((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX]))); + u64temp <<= 1; + cq = *((struct nes_hw_cq **)&u64temp); + /* nes_debug(NES_DBG_CQ, "pCQ = %p\n", cq); */ + barrier(); + ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX] = 0; + + /* call the event handler */ + cq->ce_handler(nesdev, cq); + + if (++head >= ceq_size) + head = 0; + } else { + break; + } + + } while (1); + + ceq->ceq_head = head; +} + + +/** + * nes_process_aeq + */ +static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq) +{ + /* u64 u64temp; */ + u32 head; + u32 aeq_size; + u32 aeqe_misc; + u32 aeqe_cq_id; + struct nes_hw_aeqe volatile *aeqe; + + head = aeq->aeq_head; + aeq_size = aeq->aeq_size; + + do { + aeqe = &aeq->aeq_vbase[head]; + if ((le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]) & 
NES_AEQE_VALID) == 0) + break; + aeqe_misc = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); + if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) { + if (aeqe_cq_id >= NES_FIRST_QPN) { + /* dealing with an accelerated QP related AE */ + /* + * u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX]))) << 32) | + * ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]))); + */ + nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe); + } else { + /* TODO: dealing with a CQP related AE */ + nes_debug(NES_DBG_AEQ, "Processing CQP related AE, misc = 0x%04X\n", + (u16)(aeqe_misc >> 16)); + } + } + + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = 0; + + if (++head >= aeq_size) + head = 0; + + nes_write32(nesdev->regs + NES_AEQ_ALLOC, 1 << 16); + } + while (1); + aeq->aeq_head = head; +} + +static void nes_reset_link(struct nes_device *nesdev, u32 mac_index) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 reset_value; + u32 i=0; + u32 u32temp; + + if (nesadapter->hw_rev == NE020_REV) { + return; + } + mh_detected++; + + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + + if ((mac_index == 0) || ((mac_index == 1) && (nesadapter->OneG_Mode))) + reset_value |= 0x0000001d; + else + reset_value |= 0x0000002d; + + if (4 <= (nesadapter->link_interrupt_count[mac_index] / ((u16)NES_MAX_LINK_INTERRUPTS))) { + if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) { + nesadapter->link_interrupt_count[0] = 0; + nesadapter->link_interrupt_count[1] = 0; + u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + if (0x00000040 & u32temp) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + + reset_value |= 0x0000003d; + } + nesadapter->link_interrupt_count[mac_index] = 0; + } + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)); + + if (0x0000003d == (reset_value & 0x0000003d)) { + u32 pcs_control_status0, pcs_control_status1; + + for (i = 0; i < 10; i++) { + pcs_control_status0 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0); + pcs_control_status1 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + if (((0x0F000000 == (pcs_control_status0 & 0x0F000000)) + && (pcs_control_status0 & 0x00100000)) + || ((0x0F000000 == (pcs_control_status1 & 0x0F000000)) + && (pcs_control_status1 & 0x00100000))) + continue; + else + break; + } + if (10 == i) { + u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1); + if (0x00000040 & u32temp) + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088); + else + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8); + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value); + + while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)); + } + } +} + +/** + * nes_process_mac_intr + */ +static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) +{ + unsigned long flags; + u32 pcs_control_status; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_vnic *nesvnic; + u32 mac_status; + u32 mac_index = nesdev->mac_index; + u32 u32temp; + u16 phy_data; + u16 temp_phy_data; + u32 pcs_val = 0x0f0f0000; + u32 pcs_mask = 0x0f1f0000; + u32 cdr_ctrl; + + 
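+	/* Serialize PHY access; if another context is already servicing this MAC, leave the work to that context. */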
spin_lock_irqsave(&nesadapter->phy_lock, flags); + if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) { + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + return; + } + nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT; + + /* ack the MAC interrupt */ + mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200)); + /* Clear the interrupt */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200), mac_status); + + nes_debug(NES_DBG_PHY, "MAC%u interrupt status = 0x%X.\n", mac_number, mac_status); + + if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) { + nesdev->link_status_interrupts++; + if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS))) + nes_reset_link(nesdev, mac_index); + + /* read the PHY interrupt status register */ + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { + do { + nes_read_1G_phy_reg(nesdev, 0x1a, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1a = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + } while (phy_data&0x8000); + + temp_phy_data = 0; + do { + nes_read_1G_phy_reg(nesdev, 0x11, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x11 = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + if (temp_phy_data == phy_data) + break; + temp_phy_data = phy_data; + } while (1); + + nes_read_1G_phy_reg(nesdev, 0x1e, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1e = 0x%X.\n", + nesadapter->phy_index[mac_index], phy_data); + + nes_read_1G_phy_reg(nesdev, 1, + nesadapter->phy_index[mac_index], &phy_data); + nes_debug(NES_DBG_PHY, "1G phy%u data from register 1 = 0x%X\n", + nesadapter->phy_index[mac_index], phy_data); + + if (temp_phy_data & 0x1000) { + nes_debug(NES_DBG_PHY, "The Link is up according to the PHY\n"); + phy_data = 4; + } else { + nes_debug(NES_DBG_PHY, "The Link is down according to the PHY\n"); + } + } + nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0), + nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200)); + + if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_PUMA_1G) { + switch (mac_index) { + case 1: + case 3: + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + break; + default: + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + break; + } + } else { + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200)); + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200)); + } + + nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n", + mac_index, pcs_control_status); + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { + u32temp = 0x01010000; + if (nesadapter->port_count > 2) { + u32temp |= 0x02020000; + } + if ((pcs_control_status & u32temp)!= u32temp) { + phy_data = 0; + nes_debug(NES_DBG_PHY, "PCS says the link is down\n"); + } + } else { + switch (nesadapter->phy_type[mac_index]) { + case NES_PHY_TYPE_ARGUS: + case NES_PHY_TYPE_SFP_D: + case NES_PHY_TYPE_KR: + /* clear the alarms */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008); + nes_read_10G_phy_reg(nesdev, 
nesadapter->phy_index[mac_index], 4, 0xc001); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc002); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc005); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc006); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9004); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9005); + /* check link status */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; + + nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", + __func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP"); + break; + + case NES_PHY_TYPE_PUMA_1G: + if (mac_index < 2) + pcs_val = pcs_mask = 0x01010000; + else + pcs_val = pcs_mask = 0x02020000; + /* fall through */ + default: + phy_data = (pcs_val == (pcs_control_status & pcs_mask)) ? 0x4 : 0x0; + break; + } + } + + if (phy_data & 0x0004) { + if (wide_ppm_offset && + (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) && + (nesadapter->hw_rev != NE020_REV)) { + cdr_ctrl = nes_read_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200); + nes_write_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200, + cdr_ctrl | 0x000F0000); + } + nesadapter->mac_link_down[mac_index] = 0; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + nes_debug(NES_DBG_PHY, "The Link is UP!!. linkup was %d\n", + nesvnic->linkup); + if (nesvnic->linkup == 0) { + printk(PFX "The Link is now up for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (netif_queue_stopped(nesvnic->netdev)) + netif_start_queue(nesvnic->netdev); + nesvnic->linkup = 1; + netif_carrier_on(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 0) { + nesdev->iw_status = 1; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + } else { + if (wide_ppm_offset && + (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) && + (nesadapter->hw_rev != NE020_REV)) { + cdr_ctrl = nes_read_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200); + nes_write_indexed(nesdev, + NES_IDX_ETH_SERDES_CDR_CONTROL0 + + mac_index * 0x200, + cdr_ctrl & 0xFFF0FFFF); + } + nesadapter->mac_link_down[mac_index] = 1; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + nes_debug(NES_DBG_PHY, "The Link is Down!!. 
linkup was %d\n", + nesvnic->linkup); + if (nesvnic->linkup == 1) { + printk(PFX "The Link is now down for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (!(netif_queue_stopped(nesvnic->netdev))) + netif_stop_queue(nesvnic->netdev); + nesvnic->linkup = 0; + netif_carrier_off(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 1) { + nesdev->iw_status = 0; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + } + if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_SFP_D) { + nesdev->link_recheck = 1; + mod_delayed_work(system_wq, &nesdev->work, + NES_LINK_RECHECK_DELAY); + } + } + + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_IDLE; +} + +void nes_recheck_link_status(struct work_struct *work) +{ + unsigned long flags; + struct nes_device *nesdev = container_of(work, struct nes_device, work.work); + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_vnic *nesvnic; + u32 mac_index = nesdev->mac_index; + u16 phy_data; + u16 temp_phy_data; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + + /* check link status */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; + + nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", + __func__, phy_data, + nesadapter->mac_link_down[mac_index] ? 
"DOWN" : "UP"); + + if (phy_data & 0x0004) { + nesadapter->mac_link_down[mac_index] = 0; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + if (nesvnic->linkup == 0) { + printk(PFX "The Link is now up for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (netif_queue_stopped(nesvnic->netdev)) + netif_start_queue(nesvnic->netdev); + nesvnic->linkup = 1; + netif_carrier_on(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 0) { + nesdev->iw_status = 1; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + + } else { + nesadapter->mac_link_down[mac_index] = 1; + list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) { + if (nesvnic->linkup == 1) { + printk(PFX "The Link is now down for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); + if (!(netif_queue_stopped(nesvnic->netdev))) + netif_stop_queue(nesvnic->netdev); + nesvnic->linkup = 0; + netif_carrier_off(nesvnic->netdev); + + spin_lock(&nesvnic->port_ibevent_lock); + if (nesvnic->of_device_registered) { + if (nesdev->iw_status == 1) { + nesdev->iw_status = 0; + nes_port_ibevent(nesvnic); + } + } + spin_unlock(&nesvnic->port_ibevent_lock); + } + } + } + if (nesdev->link_recheck++ < NES_LINK_RECHECK_MAX) + schedule_delayed_work(&nesdev->work, NES_LINK_RECHECK_DELAY); + else + nesdev->link_recheck = 0; + + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); +} + + +static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); + + napi_schedule(&nesvnic->napi); +} + + +/* The MAX_RQES_TO_PROCESS defines how many max read requests to complete before +* getting out of nic_ce_handler +*/ +#define MAX_RQES_TO_PROCESS 384 + +/** + * nes_nic_ce_handler + */ +void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + u64 u64temp; + dma_addr_t bus_address; + struct nes_hw_nic *nesnic; + struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq); + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct sk_buff *skb; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; + __le16 *wqe_fragment_length; + u32 head; + u32 cq_size; + u32 rx_pkt_size; + u32 cqe_count=0; + u32 cqe_errv; + u32 cqe_misc; + u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */ + u16 vlan_tag; + u16 pkt_type; + u16 rqes_processed = 0; + u8 sq_cqes = 0; + + head = cq->cq_head; + cq_size = cq->cq_size; + cq->cqes_pending = 1; + do { + if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) & + NES_NIC_CQE_VALID) { + nesnic = &nesvnic->nic; + cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]); + if (cqe_misc & NES_NIC_CQE_SQ) { + sq_cqes++; + wqe_fragment_index = 1; + nic_sqe = &nesnic->sq_vbase[nesnic->sq_tail]; + skb = nesnic->tx_skb[nesnic->sq_tail]; + wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* bump past the vlan tag */ + wqe_fragment_length++; + if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) { + u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX + + wqe_fragment_index * 2]); + u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index * 2])) << 32; + bus_address = (dma_addr_t)u64temp; 
+ if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) { + pci_unmap_single(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[wqe_fragment_index++]), + PCI_DMA_TODEVICE); + } + for (; wqe_fragment_index < 5; wqe_fragment_index++) { + if (wqe_fragment_length[wqe_fragment_index]) { + u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX + + wqe_fragment_index * 2]); + u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index * 2])) <<32; + bus_address = (dma_addr_t)u64temp; + pci_unmap_page(nesdev->pcidev, + bus_address, + le16_to_cpu(wqe_fragment_length[wqe_fragment_index]), + PCI_DMA_TODEVICE); + } else + break; + } + } + if (skb) + dev_kfree_skb_any(skb); + nesnic->sq_tail++; + nesnic->sq_tail &= nesnic->sq_size-1; + if (sq_cqes > 128) { + barrier(); + /* restart the queue if it had been stopped */ + if (netif_queue_stopped(nesvnic->netdev)) + netif_wake_queue(nesvnic->netdev); + sq_cqes = 0; + } + } else { + rqes_processed ++; + + cq->rx_cqes_completed++; + cq->rx_pkts_indicated++; + rx_pkt_size = cqe_misc & 0x0000ffff; + nic_rqe = &nesnic->rq_vbase[nesnic->rq_tail]; + /* Get the skb */ + rx_skb = nesnic->rx_skb[nesnic->rq_tail]; + nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_tail]; + bus_address = (dma_addr_t)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]); + bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32; + pci_unmap_single(nesdev->pcidev, bus_address, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + cb->busaddr = 0; + /* rx_skb->tail = rx_skb->data + rx_pkt_size; */ + /* rx_skb->len = rx_pkt_size; */ + rx_skb->len = 0; /* TODO: see if this is necessary */ + skb_put(rx_skb, rx_pkt_size); + rx_skb->protocol = eth_type_trans(rx_skb, nesvnic->netdev); + nesnic->rq_tail++; + nesnic->rq_tail &= nesnic->rq_size - 1; + + atomic_inc(&nesvnic->rx_skbs_needed); + if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) { + nes_write32(nesdev->regs+NES_CQE_ALLOC, + cq->cq_number | (cqe_count << 16)); + /* nesadapter->tune_timer.cq_count += cqe_count; */ + nesdev->currcq_count += cqe_count; + cqe_count = 0; + nes_replenish_nic_rq(nesvnic); + } + pkt_type = (u16)(le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX])); + cqe_errv = (cqe_misc & NES_NIC_CQE_ERRV_MASK) >> NES_NIC_CQE_ERRV_SHIFT; + rx_skb->ip_summed = CHECKSUM_NONE; + + if ((NES_PKT_TYPE_TCPV4_BITS == (pkt_type & NES_PKT_TYPE_TCPV4_MASK)) || + (NES_PKT_TYPE_UDPV4_BITS == (pkt_type & NES_PKT_TYPE_UDPV4_MASK))) { + if ((cqe_errv & + (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR | + NES_NIC_ERRV_BITS_IPH_ERR | NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) { + if (nesvnic->netdev->features & NETIF_F_RXCSUM) + rx_skb->ip_summed = CHECKSUM_UNNECESSARY; + } else + nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet." 
+ " errv = 0x%X, pkt_type = 0x%X.\n", + nesvnic->netdev->name, cqe_errv, pkt_type); + + } else if ((pkt_type & NES_PKT_TYPE_IPV4_MASK) == NES_PKT_TYPE_IPV4_BITS) { + if ((cqe_errv & + (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_IPH_ERR | + NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) { + if (nesvnic->netdev->features & NETIF_F_RXCSUM) { + rx_skb->ip_summed = CHECKSUM_UNNECESSARY; + /* nes_debug(NES_DBG_CQ, "%s: Reporting successfully checksummed IPv4 packet.\n", + nesvnic->netdev->name); */ + } + } else + nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet." + " errv = 0x%X, pkt_type = 0x%X.\n", + nesvnic->netdev->name, cqe_errv, pkt_type); + } + /* nes_debug(NES_DBG_CQ, "pkt_type=%x, APBVT_MASK=%x\n", + pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */ + + if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) { + if (nes_cm_recv(rx_skb, nesvnic->netdev)) + rx_skb = NULL; + } + if (rx_skb == NULL) + goto skip_rx_indicate0; + + + if (cqe_misc & NES_NIC_CQE_TAG_VALID) { + vlan_tag = (u16)(le32_to_cpu( + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]) + >> 16); + nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n", + nesvnic->netdev->name, vlan_tag); + + __vlan_hwaccel_put_tag(rx_skb, htons(ETH_P_8021Q), vlan_tag); + } + napi_gro_receive(&nesvnic->napi, rx_skb); + +skip_rx_indicate0: + ; + /* nesvnic->netstats.rx_packets++; */ + /* nesvnic->netstats.rx_bytes += rx_pkt_size; */ + } + + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0; + /* Accounting... */ + cqe_count++; + if (++head >= cq_size) + head = 0; + if (cqe_count == 255) { + /* Replenish Nic CQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, + cq->cq_number | (cqe_count << 16)); + /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */ + nesdev->currcq_count += cqe_count; + cqe_count = 0; + } + + if (cq->rx_cqes_completed >= nesvnic->budget) + break; + } else { + cq->cqes_pending = 0; + break; + } + + } while (1); + + if (sq_cqes) { + barrier(); + /* restart the queue if it had been stopped */ + if (netif_queue_stopped(nesvnic->netdev)) + netif_wake_queue(nesvnic->netdev); + } + cq->cq_head = head; + /* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n", + cq->cq_number, cqe_count, cq->cq_head); */ + cq->cqe_allocs_pending = cqe_count; + if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) + { + /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */ + nesdev->currcq_count += cqe_count; + nes_nic_tune_timer(nesdev); + } + if (atomic_read(&nesvnic->rx_skbs_needed)) + nes_replenish_nic_rq(nesvnic); +} + + + +/** + * nes_cqp_ce_handler + */ +static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) +{ + u64 u64temp; + unsigned long flags; + struct nes_hw_cqp *cqp = NULL; + struct nes_cqp_request *cqp_request; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 head; + u32 cq_size; + u32 cqe_count=0; + u32 error_code; + u32 opcode; + u32 ctx_index; + /* u32 counter; */ + + head = cq->cq_head; + cq_size = cq->cq_size; + + do { + /* process the CQE */ + /* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head, + le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */ + + opcode = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]); + if (opcode & NES_CQE_VALID) { + cqp = &nesdev->cqp; + + error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]); + if (error_code) { + nes_debug(NES_DBG_CQP, "Bad Completion code for opcode 0x%02X from CQP," + " Major/Minor codes = 0x%04X:%04X.\n", + 
le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f, + (u16)(error_code >> 16), + (u16)error_code); + } + + u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head]. + cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) | + ((u64)(le32_to_cpu(cq->cq_vbase[head]. + cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]))); + + cqp_request = (struct nes_cqp_request *)(unsigned long)u64temp; + if (cqp_request) { + if (cqp_request->waiting) { + /* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */ + cqp_request->major_code = (u16)(error_code >> 16); + cqp_request->minor_code = (u16)error_code; + barrier(); + cqp_request->request_done = 1; + wake_up(&cqp_request->waitq); + nes_put_cqp_request(nesdev, cqp_request); + } else { + if (cqp_request->callback) + cqp_request->cqp_callback(nesdev, cqp_request); + nes_free_cqp_request(nesdev, cqp_request); + } + } else { + wake_up(&nesdev->cqp.waitq); + } + + cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; + nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (1 << 16)); + if (++cqp->sq_tail >= cqp->sq_size) + cqp->sq_tail = 0; + + /* Accounting... */ + cqe_count++; + if (++head >= cq_size) + head = 0; + } else { + break; + } + } while (1); + cq->cq_head = head; + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + while ((!list_empty(&nesdev->cqp_pending_reqs)) && + ((((nesdev->cqp.sq_tail+nesdev->cqp.sq_size)-nesdev->cqp.sq_head) & + (nesdev->cqp.sq_size - 1)) != 1)) { + cqp_request = list_entry(nesdev->cqp_pending_reqs.next, + struct nes_cqp_request, list); + list_del_init(&cqp_request->list); + head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe = &nesdev->cqp.sq_vbase[head]; + memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); + barrier(); + + opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]); + if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT) + ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX; + else + ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX; + cqp_wqe->wqe_words[ctx_index] = + cpu_to_le32((u32)((unsigned long)cqp_request)); + cqp_wqe->wqe_words[ctx_index + 1] = + cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request))); + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n", + cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head); + /* Ring doorbell (1 WQEs) */ + barrier(); + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id); + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + /* Arm the CCQ */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + cq->cq_number); + nes_read32(nesdev->regs+NES_CQE_ALLOC); +} + +static u8 *locate_mpa(u8 *pkt, u32 aeq_info) +{ + if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) { + /* skip over ethernet header */ + pkt += ETH_HLEN; + + /* Skip over IP and TCP headers */ + pkt += 4 * (pkt[0] & 0x0f); + pkt += 4 * ((pkt[12] >> 4) & 0x0f); + } + return pkt; +} + +/* Determine if incoming error pkt is rdma layer */ +static u32 iwarp_opcode(struct nes_qp *nesqp, u32 aeq_info) +{ + u8 *pkt; + u16 *mpa; + u32 opcode = 0xffffffff; + + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + mpa = (u16 *)locate_mpa(pkt, aeq_info); + opcode = be16_to_cpu(mpa[1]) & 0xf; + } + + return opcode; +} + +/* Build iWARP terminate header */ +static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 aeq_info) +{ + u8 *pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + u16 ddp_seg_len; + int copy_len = 0; + u8 
is_tagged = 0; + u8 flush_code = 0; + struct nes_terminate_hdr *termhdr; + + termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase; + memset(termhdr, 0, 64); + + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + + /* Use data from offending packet to fill in ddp & rdma hdrs */ + pkt = locate_mpa(pkt, aeq_info); + ddp_seg_len = be16_to_cpu(*(u16 *)pkt); + if (ddp_seg_len) { + copy_len = 2; + termhdr->hdrct = DDP_LEN_FLAG; + if (pkt[2] & 0x80) { + is_tagged = 1; + if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) { + copy_len += TERM_DDP_LEN_TAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + } else { + if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) { + copy_len += TERM_DDP_LEN_UNTAGGED; + termhdr->hdrct |= DDP_HDR_FLAG; + } + + if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) { + if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) { + copy_len += TERM_RDMA_LEN; + termhdr->hdrct |= RDMA_HDR_FLAG; + } + } + } + } + } + + switch (async_event_id) { + case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_WRITE: + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + } + break; + case NES_AEQE_AEID_AMP_INVALID_STAG: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + break; + case NES_AEQE_AEID_AMP_BAD_QP: + flush_code = IB_WC_LOC_QP_OP_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_QN; + break; + case NES_AEQE_AEID_AMP_BAD_STAG_KEY: + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_SEND_INV: + case IWARP_OPCODE_SEND_SE_INV: + flush_code = IB_WC_REM_OP_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_CANT_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_STAG; + } + break; + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) { + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_BOUNDS; + } else { + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_INV_BOUNDS; + } + break; + case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: + case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case NES_AEQE_AEID_PRIV_OPERATION_DENIED: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_ACCESS; + break; + case NES_AEQE_AEID_AMP_TO_WRAP: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_TO_WRAP; + break; + case NES_AEQE_AEID_AMP_BAD_PD: + switch (iwarp_opcode(nesqp, aeq_info)) { + case IWARP_OPCODE_WRITE: + flush_code = IB_WC_LOC_PROT_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_UNASSOC_STAG; + break; + case IWARP_OPCODE_SEND_INV: + case IWARP_OPCODE_SEND_SE_INV: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = 
(LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_CANT_INV_STAG; + break; + default: + flush_code = IB_WC_REM_ACCESS_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT; + termhdr->error_code = RDMAP_UNASSOC_STAG; + } + break; + case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; + termhdr->error_code = MPA_MARKER; + break; + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP; + termhdr->error_code = MPA_CRC; + break; + case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; + termhdr->error_code = DDP_CATASTROPHIC_LOCAL; + break; + case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: + case NES_AEQE_AEID_DDP_NO_L_BIT: + flush_code = IB_WC_FATAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC; + termhdr->error_code = DDP_CATASTROPHIC_LOCAL; + break; + case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: + case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE; + break; + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + flush_code = IB_WC_LOC_LEN_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + flush_code = IB_WC_GENERAL_ERR; + if (is_tagged) { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER; + termhdr->error_code = DDP_TAGGED_INV_DDP_VER; + } else { + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_DDP_VER; + } + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MO; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + flush_code = IB_WC_REM_OP_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF; + break; + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER; + termhdr->error_code = DDP_UNTAGGED_INV_QN; + break; + case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + flush_code = IB_WC_GENERAL_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_INV_RDMAP_VER; + break; + case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + flush_code = IB_WC_LOC_QP_OP_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_UNEXPECTED_OP; + break; + default: + flush_code = IB_WC_FATAL_ERR; + termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP; + termhdr->error_code = RDMAP_UNSPECIFIED; + break; + } + + if (copy_len) + memcpy(termhdr + 1, pkt, copy_len); + + if ((flush_code) && ((NES_AEQE_INBOUND_RDMA & aeq_info) == 0)) { + if (aeq_info & NES_AEQE_SQ) + nesqp->term_sq_flush_code = flush_code; + else + nesqp->term_rq_flush_code = flush_code; + } + + return sizeof(struct nes_terminate_hdr) + copy_len; +} + +static void nes_terminate_connection(struct nes_device *nesdev, struct nes_qp *nesqp, + struct 
nes_hw_aeqe *aeqe, enum ib_event_type eventtype) +{ + u64 context; + unsigned long flags; + u32 aeq_info; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + u32 termlen = 0; + u32 mod_qp_flags = NES_CQP_QP_IWARP_STATE_TERMINATE | + NES_CQP_QP_TERM_DONT_SEND_FIN; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if (nesqp->term_flags & NES_TERM_SENT) + return; /* Sanity check */ + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + async_event_id = (u16)aeq_info; + + context = (unsigned long)nesadapter->qp_table[le32_to_cpu( + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; + if (!context) { + WARN_ON(!context); + return; + } + + nesqp = (struct nes_qp *)(unsigned long)context; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + nesqp->terminate_eventtype = eventtype; + spin_unlock_irqrestore(&nesqp->lock, flags); + + if (nesadapter->send_term_ok) + termlen = nes_bld_terminate_hdr(nesqp, async_event_id, aeq_info); + else + mod_qp_flags |= NES_CQP_QP_TERM_DONT_SEND_TERM_MSG; + + if (!nesdev->iw_status) { + nesqp->term_flags = NES_TERM_DONE; + nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_ERROR, 0, 0); + nes_cm_disconn(nesqp); + } else { + nes_terminate_start_timer(nesqp); + nesqp->term_flags |= NES_TERM_SENT; + nes_hw_modify_qp(nesdev, nesqp, mod_qp_flags, termlen, 0); + } +} + +static void nes_terminate_send_fin(struct nes_device *nesdev, + struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) +{ + u32 aeq_info; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + unsigned long flags; + + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + async_event_id = (u16)aeq_info; + + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + + /* Send the fin only */ + nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_TERMINATE | + NES_CQP_QP_TERM_DONT_SEND_TERM_MSG, 0, 0); +} + +/* Cleanup after a terminate sent or received */ +static void nes_terminate_done(struct nes_qp *nesqp, int timeout_occurred) +{ + u32 next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + unsigned long flags; + struct nes_vnic *nesvnic = to_nesvnic(nesqp->ibqp.device); + struct nes_device *nesdev = nesvnic->nesdev; + u8 first_time = 0; + + spin_lock_irqsave(&nesqp->lock, flags); + if (nesqp->hte_added) { + nesqp->hte_added = 0; + next_iwarp_state |= NES_CQP_QP_DEL_HTE; + } + + first_time = (nesqp->term_flags & NES_TERM_DONE) == 0; + nesqp->term_flags |= NES_TERM_DONE; + spin_unlock_irqrestore(&nesqp->lock, flags); + + /* Make sure we go through this only once */ + if (first_time) { + if (timeout_occurred == 0) + del_timer(&nesqp->terminate_timer); + else + next_iwarp_state |= NES_CQP_QP_RESET; + + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + } +} + +static void nes_terminate_received(struct nes_device *nesdev, + struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe) +{ + u32 aeq_info; + u8 *pkt; + u32 *mpa; + u8 ddp_ctl; + u8 rdma_ctl; + u16 aeq_id = 0; + + 
aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) { + /* Terminate is not a performance path so the silicon */ + /* did not validate the frame - do it now */ + pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET; + mpa = (u32 *)locate_mpa(pkt, aeq_info); + ddp_ctl = (be32_to_cpu(mpa[0]) >> 8) & 0xff; + rdma_ctl = be32_to_cpu(mpa[0]) & 0xff; + if ((ddp_ctl & 0xc0) != 0x40) + aeq_id = NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC; + else if ((ddp_ctl & 0x03) != 1) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION; + else if (be32_to_cpu(mpa[2]) != 2) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_QN; + else if (be32_to_cpu(mpa[3]) != 1) + aeq_id = NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN; + else if (be32_to_cpu(mpa[4]) != 0) + aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_MO; + else if ((rdma_ctl & 0xc0) != 0x40) + aeq_id = NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION; + + if (aeq_id) { + /* Bad terminate recvd - send back a terminate */ + aeq_info = (aeq_info & 0xffff0000) | aeq_id; + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + return; + } + } + + nesqp->term_flags |= NES_TERM_RCVD; + nesqp->terminate_eventtype = IB_EVENT_QP_FATAL; + nes_terminate_start_timer(nesqp); + nes_terminate_send_fin(nesdev, nesqp, aeqe); +} + +/* Timeout routine in case terminate fails to complete */ +void nes_terminate_timeout(struct timer_list *t) +{ + struct nes_qp *nesqp = from_timer(nesqp, t, terminate_timer); + + nes_terminate_done(nesqp, 1); +} + +/* Set a timer in case hw cannot complete the terminate sequence */ +static void nes_terminate_start_timer(struct nes_qp *nesqp) +{ + mod_timer(&nesqp->terminate_timer, (jiffies + HZ)); +} + +/** + * nes_process_iwarp_aeqe + */ +static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + struct nes_hw_aeqe *aeqe) +{ + u64 context; + unsigned long flags; + struct nes_qp *nesqp; + struct nes_hw_cq *hw_cq; + struct nes_cq *nescq; + int resource_allocated; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 aeq_info; + u32 next_iwarp_state = 0; + u32 aeqe_cq_id; + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; + struct ib_event ibevent; + + nes_debug(NES_DBG_AEQ, "\n"); + aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); + if ((NES_AEQE_INBOUND_RDMA & aeq_info) || (!(NES_AEQE_QP & aeq_info))) { + context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); + context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; + } else { + context = (unsigned long)nesadapter->qp_table[le32_to_cpu( + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; + BUG_ON(!context); + } + + /* context is nesqp unless async_event_id == CQ ERROR */ + nesqp = (struct nes_qp *)(unsigned long)context; + async_event_id = (u16)aeq_info; + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; + nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p," + " Tcp state = %s, iWARP state = %s\n", + async_event_id, + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe, + nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]); + + aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]); + if (aeq_info & NES_AEQE_QP) { + if (!nes_is_resource_allocated(nesadapter, + nesadapter->allocated_qps, + aeqe_cq_id)) + return; + } + + switch (async_event_id) { + case 
NES_AEQE_AEID_LLP_FIN_RECEIVED: + if (nesqp->term_flags) + return; /* Ignore it, wait for close complete */ + + if (atomic_inc_return(&nesqp->close_timer_started) == 1) { + if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) && + (nesqp->ibqp_state == IB_QPS_RTS)) { + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + } + nesqp->cm_id->add_ref(nesqp->cm_id); + schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp, + NES_TIMER_TYPE_CLOSE, 1, 0); + nes_debug(NES_DBG_AEQ, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." + " last_aeq = 0x%04X, scheduling timer. TCP state = %d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + async_event_id, nesqp->last_aeq, tcp_state); + } + break; + case NES_AEQE_AEID_LLP_CLOSE_COMPLETE: + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_cm_disconn(nesqp); + break; + + case NES_AEQE_AEID_RESET_SENT: + tcp_state = NES_AEQE_TCP_STATE_CLOSED; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + nesqp->hte_added = 0; + spin_unlock_irqrestore(&nesqp->lock, flags); + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE; + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + break; + + case NES_AEQE_AEID_LLP_CONNECTION_RESET: + if (atomic_read(&nesqp->close_timer_started)) + return; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_cm_disconn(nesqp); + break; + + case NES_AEQE_AEID_TERMINATE_SENT: + nes_terminate_send_fin(nesdev, nesqp, aeqe); + break; + + case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED: + nes_terminate_received(nesdev, nesqp, aeqe); + break; + + case NES_AEQE_AEID_AMP_BAD_STAG_KEY: + case NES_AEQE_AEID_AMP_BAD_STAG_INDEX: + case NES_AEQE_AEID_AMP_UNALLOCATED_STAG: + case NES_AEQE_AEID_AMP_INVALID_STAG: + case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION: + case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case NES_AEQE_AEID_PRIV_OPERATION_DENIED: + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + case NES_AEQE_AEID_AMP_TO_WRAP: + printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n", + nesqp->hwqp.qp_id, async_event_id); + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR); + break; + + case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE: + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: + if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) { + aeq_info &= 0xffff0000; + aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE; + aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info); + } + /* fall through */ + case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE: + case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES: + case 
NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE: + case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR: + case NES_AEQE_AEID_AMP_BAD_QP: + case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH: + case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC: + case NES_AEQE_AEID_DDP_NO_L_BIT: + case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN: + case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID: + case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION: + case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION: + case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE: + case NES_AEQE_AEID_AMP_BAD_PD: + case NES_AEQE_AEID_AMP_FASTREG_SHARED: + case NES_AEQE_AEID_AMP_FASTREG_VALID_STAG: + case NES_AEQE_AEID_AMP_FASTREG_MW_STAG: + case NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS: + case NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW: + case NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH: + case NES_AEQE_AEID_AMP_INVALIDATE_SHARED: + case NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS: + case NES_AEQE_AEID_AMP_MWBIND_VALID_STAG: + case NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG: + case NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG: + case NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG: + case NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS: + case NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS: + case NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT: + case NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED: + case NES_AEQE_AEID_BAD_CLOSE: + case NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO: + case NES_AEQE_AEID_STAG_ZERO_INVALID: + case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST: + case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: + printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n", + nesqp->hwqp.qp_id, async_event_id); + print_ip(nesqp->cm_node); + if (!atomic_read(&nesqp->close_timer_started)) + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + break; + + case NES_AEQE_AEID_CQ_OPERATION_ERROR: + context <<= 1; + nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n", + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), (void *)(unsigned long)context); + resource_allocated = nes_is_resource_allocated(nesadapter, nesadapter->allocated_cqs, + le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); + if (resource_allocated) { + printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n", + __func__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])); + hw_cq = (struct nes_hw_cq *)(unsigned long)context; + if (hw_cq) { + nescq = container_of(hw_cq, struct nes_cq, hw_cq); + if (nescq->ibcq.event_handler) { + ibevent.device = nescq->ibcq.device; + ibevent.event = IB_EVENT_CQ_ERR; + ibevent.element.cq = &nescq->ibcq; + nescq->ibcq.event_handler(&ibevent, nescq->ibcq.cq_context); + } + } + } + break; + + default: + nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n", + async_event_id); + break; + } + +} + +/** + * nes_iwarp_ce_handler + */ +void nes_iwarp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *hw_cq) +{ + struct nes_cq *nescq = container_of(hw_cq, struct nes_cq, hw_cq); + + /* nes_debug(NES_DBG_CQ, "Processing completion event for iWARP CQ%u.\n", + nescq->hw_cq.cq_number); */ + nes_write32(nesdev->regs+NES_CQ_ACK, nescq->hw_cq.cq_number); + + if (nescq->ibcq.comp_handler) + nescq->ibcq.comp_handler(&nescq->ibcq, nescq->ibcq.cq_context); + + return; +} + + +/** + * nes_manage_apbvt() + */ +int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port, + u32 nic_index, u32 add_port) +{ + struct nes_device 
*nesdev = nesvnic->nesdev; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + int ret = 0; + u16 major_code; + + /* Send manage APBVT request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_debug(NES_DBG_QP, "%s APBV for local port=%u(0x%04x), nic_index=%u\n", + (add_port == NES_MANAGE_APBVT_ADD) ? "ADD" : "DEL", + accel_local_port, accel_local_port, nic_index); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, (NES_CQP_MANAGE_APBVT | + ((add_port == NES_MANAGE_APBVT_ADD) ? NES_CQP_APBVT_ADD : 0))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + ((nic_index << NES_CQP_APBVT_NIC_SHIFT) | accel_local_port)); + + nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n"); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + if (add_port == NES_MANAGE_APBVT_ADD) + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Completed, ret=%u, CQP Major:Minor codes = 0x%04X:0x%04X\n", + ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + else + return 0; +} + + +/** + * nes_manage_arp_cache + */ +void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr, + u32 ip_addr, u32 action) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev; + struct nes_cqp_request *cqp_request; + int arp_index; + + nesdev = nesvnic->nesdev; + arp_index = nes_arp_table(nesdev, ip_addr, mac_addr, action); + if (arp_index == -1) { + return; + } + + /* update the ARP entry */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_NETDEV, "Failed to get a cqp_request.\n"); + return; + } + cqp_request->waiting = 0; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_MANAGE_ARP_CACHE | NES_CQP_ARP_PERM); + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32( + (u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_CQP_ARP_AEQ_INDEX_SHIFT); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(arp_index); + + if (action == NES_ARP_ADD) { + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID); + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32( + (((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) | + (((u32)mac_addr[4]) << 8) | (u32)mac_addr[5]); + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32( + (((u32)mac_addr[0]) << 8) | (u32)mac_addr[1]); + } else { + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0; + cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0; + } + + nes_debug(NES_DBG_NETDEV, "Not waiting for CQP, cqp.sq_head=%u, cqp.sq_tail=%u\n", + nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); +} + + +/** + * flush_wqes + */ +void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp, + u32 which_wq, u32 wait_completion) +{ + struct nes_cqp_request *cqp_request; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 
sq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; + u32 rq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH; + int ret; + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n"); + return; + } + if (wait_completion) { + cqp_request->waiting = 1; + atomic_set(&cqp_request->refcount, 2); + } else { + cqp_request->waiting = 0; + } + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* If wqe in error was identified, set code to be put into cqe */ + if ((nesqp->term_sq_flush_code) && (which_wq & NES_CQP_FLUSH_SQ)) { + which_wq |= NES_CQP_FLUSH_MAJ_MIN; + sq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_sq_flush_code; + nesqp->term_sq_flush_code = 0; + } + + if ((nesqp->term_rq_flush_code) && (which_wq & NES_CQP_FLUSH_RQ)) { + which_wq |= NES_CQP_FLUSH_MAJ_MIN; + rq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_rq_flush_code; + nesqp->term_rq_flush_code = 0; + } + + if (which_wq & NES_CQP_FLUSH_MAJ_MIN) { + cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_SQ_CODE] = cpu_to_le32(sq_code); + cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_RQ_CODE] = cpu_to_le32(rq_code); + } + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = + cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id); + + nes_post_cqp_request(nesdev, cqp_request); + + if (wait_completion) { + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u," + " CQP Major:Minor codes = 0x%04X:0x%04X\n", + ret, cqp_request->major_code, cqp_request->minor_code); + nes_put_cqp_request(nesdev, cqp_request); + } +} diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h new file mode 100644 index 0000000..3c56470 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -0,0 +1,1380 @@ +/* +* Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenIB.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ + +#ifndef __NES_HW_H +#define __NES_HW_H + +#define NES_PHY_TYPE_CX4 1 +#define NES_PHY_TYPE_1G 2 +#define NES_PHY_TYPE_ARGUS 4 +#define NES_PHY_TYPE_PUMA_1G 5 +#define NES_PHY_TYPE_PUMA_10G 6 +#define NES_PHY_TYPE_GLADIUS 7 +#define NES_PHY_TYPE_SFP_D 8 +#define NES_PHY_TYPE_KR 9 + +#define NES_MULTICAST_PF_MAX 8 +#define NES_A0 3 + +#define NES_ENABLE_PAU 0x07000001 +#define NES_DISABLE_PAU 0x07000000 +#define NES_PAU_COUNTER 10 +#define NES_CQP_OPCODE_MASK 0x3f + +enum pci_regs { + NES_INT_STAT = 0x0000, + NES_INT_MASK = 0x0004, + NES_INT_PENDING = 0x0008, + NES_INTF_INT_STAT = 0x000C, + NES_INTF_INT_MASK = 0x0010, + NES_TIMER_STAT = 0x0014, + NES_PERIODIC_CONTROL = 0x0018, + NES_ONE_SHOT_CONTROL = 0x001C, + NES_EEPROM_COMMAND = 0x0020, + NES_EEPROM_DATA = 0x0024, + NES_FLASH_COMMAND = 0x0028, + NES_FLASH_DATA = 0x002C, + NES_SOFTWARE_RESET = 0x0030, + NES_CQ_ACK = 0x0034, + NES_WQE_ALLOC = 0x0040, + NES_CQE_ALLOC = 0x0044, + NES_AEQ_ALLOC = 0x0048 +}; + +enum indexed_regs { + NES_IDX_CREATE_CQP_LOW = 0x0000, + NES_IDX_CREATE_CQP_HIGH = 0x0004, + NES_IDX_QP_CONTROL = 0x0040, + NES_IDX_FLM_CONTROL = 0x0080, + NES_IDX_INT_CPU_STATUS = 0x00a0, + NES_IDX_GPR_TRIGGER = 0x00bc, + NES_IDX_GPIO_CONTROL = 0x00f0, + NES_IDX_GPIO_DATA = 0x00f4, + NES_IDX_GPR2 = 0x010c, + NES_IDX_TCP_CONFIG0 = 0x01e4, + NES_IDX_TCP_TIMER_CONFIG = 0x01ec, + NES_IDX_TCP_NOW = 0x01f0, + NES_IDX_QP_MAX_CFG_SIZES = 0x0200, + NES_IDX_QP_CTX_SIZE = 0x0218, + NES_IDX_TCP_TIMER_SIZE0 = 0x0238, + NES_IDX_TCP_TIMER_SIZE1 = 0x0240, + NES_IDX_ARP_CACHE_SIZE = 0x0258, + NES_IDX_CQ_CTX_SIZE = 0x0260, + NES_IDX_MRT_SIZE = 0x0278, + NES_IDX_PBL_REGION_SIZE = 0x0280, + NES_IDX_IRRQ_COUNT = 0x02b0, + NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x02f0, + NES_IDX_RX_WINDOW_BUFFER_SIZE = 0x0300, + NES_IDX_DST_IP_ADDR = 0x0400, + NES_IDX_PCIX_DIAG = 0x08e8, + NES_IDX_MPP_DEBUG = 0x0a00, + NES_IDX_PORT_RX_DISCARDS = 0x0a30, + NES_IDX_PORT_TX_DISCARDS = 0x0a34, + NES_IDX_MPP_LB_DEBUG = 0x0b00, + NES_IDX_DENALI_CTL_22 = 0x1058, + NES_IDX_MAC_TX_CONTROL = 0x2000, + NES_IDX_MAC_TX_CONFIG = 0x2004, + NES_IDX_MAC_TX_PAUSE_QUANTA = 0x2008, + NES_IDX_MAC_RX_CONTROL = 0x200c, + NES_IDX_MAC_RX_CONFIG = 0x2010, + NES_IDX_MAC_EXACT_MATCH_BOTTOM = 0x201c, + NES_IDX_MAC_MDIO_CONTROL = 0x2084, + NES_IDX_MAC_TX_OCTETS_LOW = 0x2100, + NES_IDX_MAC_TX_OCTETS_HIGH = 0x2104, + NES_IDX_MAC_TX_FRAMES_LOW = 0x2108, + NES_IDX_MAC_TX_FRAMES_HIGH = 0x210c, + NES_IDX_MAC_TX_PAUSE_FRAMES = 0x2118, + NES_IDX_MAC_TX_ERRORS = 0x2138, + NES_IDX_MAC_RX_OCTETS_LOW = 0x213c, + NES_IDX_MAC_RX_OCTETS_HIGH = 0x2140, + NES_IDX_MAC_RX_FRAMES_LOW = 0x2144, + NES_IDX_MAC_RX_FRAMES_HIGH = 0x2148, + NES_IDX_MAC_RX_BC_FRAMES_LOW = 0x214c, + NES_IDX_MAC_RX_MC_FRAMES_HIGH = 0x2150, + NES_IDX_MAC_RX_PAUSE_FRAMES = 0x2154, + NES_IDX_MAC_RX_SHORT_FRAMES = 0x2174, + NES_IDX_MAC_RX_OVERSIZED_FRAMES = 0x2178, + NES_IDX_MAC_RX_JABBER_FRAMES = 0x217c, + NES_IDX_MAC_RX_CRC_ERR_FRAMES = 0x2180, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES = 0x2184, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES = 0x2188, + NES_IDX_MAC_INT_STATUS = 0x21f0, + NES_IDX_MAC_INT_MASK = 0x21f4, + NES_IDX_PHY_PCS_CONTROL_STATUS0 = 0x2800, + NES_IDX_PHY_PCS_CONTROL_STATUS1 = 0x2a00, + NES_IDX_ETH_SERDES_COMMON_CONTROL0 = 0x2808, + NES_IDX_ETH_SERDES_COMMON_CONTROL1 = 0x2a08, + NES_IDX_ETH_SERDES_COMMON_STATUS0 = 0x280c, + NES_IDX_ETH_SERDES_COMMON_STATUS1 = 0x2a0c, + NES_IDX_ETH_SERDES_TX_EMP0 = 0x2810, + NES_IDX_ETH_SERDES_TX_EMP1 = 0x2a10, + NES_IDX_ETH_SERDES_TX_DRIVE0 = 0x2814, + NES_IDX_ETH_SERDES_TX_DRIVE1 = 0x2a14, + 
NES_IDX_ETH_SERDES_RX_MODE0 = 0x2818, + NES_IDX_ETH_SERDES_RX_MODE1 = 0x2a18, + NES_IDX_ETH_SERDES_RX_SIGDET0 = 0x281c, + NES_IDX_ETH_SERDES_RX_SIGDET1 = 0x2a1c, + NES_IDX_ETH_SERDES_BYPASS0 = 0x2820, + NES_IDX_ETH_SERDES_BYPASS1 = 0x2a20, + NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0 = 0x2824, + NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1 = 0x2a24, + NES_IDX_ETH_SERDES_RX_EQ_CONTROL0 = 0x2828, + NES_IDX_ETH_SERDES_RX_EQ_CONTROL1 = 0x2a28, + NES_IDX_ETH_SERDES_RX_EQ_STATUS0 = 0x282c, + NES_IDX_ETH_SERDES_RX_EQ_STATUS1 = 0x2a2c, + NES_IDX_ETH_SERDES_CDR_RESET0 = 0x2830, + NES_IDX_ETH_SERDES_CDR_RESET1 = 0x2a30, + NES_IDX_ETH_SERDES_CDR_CONTROL0 = 0x2834, + NES_IDX_ETH_SERDES_CDR_CONTROL1 = 0x2a34, + NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0 = 0x2838, + NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1 = 0x2a38, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD = 0x3080, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO = 0x3000, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI = 0x3004, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO = 0x3008, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI = 0x300c, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO = 0x7000, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI = 0x7004, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO = 0x7008, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI = 0x700c, + NES_IDX_WQM_CONFIG0 = 0x5000, + NES_IDX_WQM_CONFIG1 = 0x5004, + NES_IDX_CM_CONFIG = 0x5100, + NES_IDX_NIC_LOGPORT_TO_PHYPORT = 0x6000, + NES_IDX_NIC_PHYPORT_TO_USW = 0x6008, + NES_IDX_NIC_ACTIVE = 0x6010, + NES_IDX_NIC_UNICAST_ALL = 0x6018, + NES_IDX_NIC_MULTICAST_ALL = 0x6020, + NES_IDX_NIC_MULTICAST_ENABLE = 0x6028, + NES_IDX_NIC_BROADCAST_ON = 0x6030, + NES_IDX_USED_CHUNKS_TX = 0x60b0, + NES_IDX_TX_POOL_SIZE = 0x60b8, + NES_IDX_QUAD_HASH_TABLE_SIZE = 0x6148, + NES_IDX_PERFECT_FILTER_LOW = 0x6200, + NES_IDX_PERFECT_FILTER_HIGH = 0x6204, + NES_IDX_IPV4_TCP_REXMITS = 0x7080, + NES_IDX_DEBUG_ERROR_CONTROL_STATUS = 0x913c, + NES_IDX_DEBUG_ERROR_MASKS0 = 0x9140, + NES_IDX_DEBUG_ERROR_MASKS1 = 0x9144, + NES_IDX_DEBUG_ERROR_MASKS2 = 0x9148, + NES_IDX_DEBUG_ERROR_MASKS3 = 0x914c, + NES_IDX_DEBUG_ERROR_MASKS4 = 0x9150, + NES_IDX_DEBUG_ERROR_MASKS5 = 0x9154, +}; + +#define NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE 1 +#define NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE (1 << 17) + +enum nes_cqp_opcodes { + NES_CQP_CREATE_QP = 0x00, + NES_CQP_MODIFY_QP = 0x01, + NES_CQP_DESTROY_QP = 0x02, + NES_CQP_CREATE_CQ = 0x03, + NES_CQP_MODIFY_CQ = 0x04, + NES_CQP_DESTROY_CQ = 0x05, + NES_CQP_ALLOCATE_STAG = 0x09, + NES_CQP_REGISTER_STAG = 0x0a, + NES_CQP_QUERY_STAG = 0x0b, + NES_CQP_REGISTER_SHARED_STAG = 0x0c, + NES_CQP_DEALLOCATE_STAG = 0x0d, + NES_CQP_MANAGE_ARP_CACHE = 0x0f, + NES_CQP_DOWNLOAD_SEGMENT = 0x10, + NES_CQP_SUSPEND_QPS = 0x11, + NES_CQP_UPLOAD_CONTEXT = 0x13, + NES_CQP_CREATE_CEQ = 0x16, + NES_CQP_DESTROY_CEQ = 0x18, + NES_CQP_CREATE_AEQ = 0x19, + NES_CQP_DESTROY_AEQ = 0x1b, + NES_CQP_LMI_ACCESS = 0x20, + NES_CQP_FLUSH_WQES = 0x22, + NES_CQP_MANAGE_APBVT = 0x23, + NES_CQP_MANAGE_QUAD_HASH = 0x25 +}; + +enum nes_cqp_wqe_word_idx { + NES_CQP_WQE_OPCODE_IDX = 0, + NES_CQP_WQE_ID_IDX = 1, + NES_CQP_WQE_COMP_CTX_LOW_IDX = 2, + NES_CQP_WQE_COMP_CTX_HIGH_IDX = 3, + NES_CQP_WQE_COMP_SCRATCH_LOW_IDX = 4, + NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5, +}; + +enum nes_cqp_wqe_word_download_idx { /* format differs from other cqp ops */ + NES_CQP_WQE_DL_OPCODE_IDX = 0, + NES_CQP_WQE_DL_COMP_CTX_LOW_IDX = 1, + NES_CQP_WQE_DL_COMP_CTX_HIGH_IDX = 2, + NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX = 3 + /* For index values 4-15 use NES_NIC_SQ_WQE_ values */ +}; + +enum nes_cqp_cq_wqeword_idx { + NES_CQP_CQ_WQE_PBL_LOW_IDX = 6, + 
NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7, + NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX = 8, + NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX = 9, + NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX = 10, +}; + +enum nes_cqp_stag_wqeword_idx { + NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX = 1, + NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX = 6, + NES_CQP_STAG_WQE_LEN_LOW_IDX = 7, + NES_CQP_STAG_WQE_STAG_IDX = 8, + NES_CQP_STAG_WQE_VA_LOW_IDX = 10, + NES_CQP_STAG_WQE_VA_HIGH_IDX = 11, + NES_CQP_STAG_WQE_PA_LOW_IDX = 12, + NES_CQP_STAG_WQE_PA_HIGH_IDX = 13, + NES_CQP_STAG_WQE_PBL_LEN_IDX = 14 +}; + +#define NES_CQP_OP_LOGICAL_PORT_SHIFT 26 +#define NES_CQP_OP_IWARP_STATE_SHIFT 28 +#define NES_CQP_OP_TERMLEN_SHIFT 28 + +enum nes_cqp_qp_bits { + NES_CQP_QP_ARP_VALID = (1<<8), + NES_CQP_QP_WINBUF_VALID = (1<<9), + NES_CQP_QP_CONTEXT_VALID = (1<<10), + NES_CQP_QP_ORD_VALID = (1<<11), + NES_CQP_QP_WINBUF_DATAIND_EN = (1<<12), + NES_CQP_QP_VIRT_WQS = (1<<13), + NES_CQP_QP_DEL_HTE = (1<<14), + NES_CQP_QP_CQS_VALID = (1<<15), + NES_CQP_QP_TYPE_TSA = 0, + NES_CQP_QP_TYPE_IWARP = (1<<16), + NES_CQP_QP_TYPE_CQP = (4<<16), + NES_CQP_QP_TYPE_NIC = (5<<16), + NES_CQP_QP_MSS_CHG = (1<<20), + NES_CQP_QP_STATIC_RESOURCES = (1<<21), + NES_CQP_QP_IGNORE_MW_BOUND = (1<<22), + NES_CQP_QP_VWQ_USE_LMI = (1<<23), + NES_CQP_QP_IWARP_STATE_IDLE = (1<netdev */ + u8 perfect_filter_index; + u8 nic_index; + u8 qp_nic_index[4]; + u8 next_qp_nic_index; + u8 of_device_registered; + u8 rdma_enabled; + struct timer_list event_timer; + enum ib_event_type delayed_event; + enum ib_event_type last_dispatched_event; + spinlock_t port_ibevent_lock; + u32 mgt_mem_size; + void *mgt_vbase; + dma_addr_t mgt_pbase; + struct nes_vnic_mgt *mgtvnic[NES_MGT_QP_COUNT]; + struct task_struct *mgt_thread; + wait_queue_head_t mgt_wait_queue; + struct sk_buff_head mgt_skb_list; + +}; + +struct nes_ib_device { + struct ib_device ibdev; + struct nes_vnic *nesvnic; + + /* Virtual RNIC Limits */ + u32 max_mr; + u32 max_qp; + u32 max_cq; + u32 max_pd; + u32 num_mr; + u32 num_qp; + u32 num_cq; + u32 num_pd; +}; + +enum nes_hdrct_flags { + DDP_LEN_FLAG = 0x80, + DDP_HDR_FLAG = 0x40, + RDMA_HDR_FLAG = 0x20 +}; + +enum nes_term_layers { + LAYER_RDMA = 0, + LAYER_DDP = 1, + LAYER_MPA = 2 +}; + +enum nes_term_error_types { + RDMAP_CATASTROPHIC = 0, + RDMAP_REMOTE_PROT = 1, + RDMAP_REMOTE_OP = 2, + DDP_CATASTROPHIC = 0, + DDP_TAGGED_BUFFER = 1, + DDP_UNTAGGED_BUFFER = 2, + DDP_LLP = 3 +}; + +enum nes_term_rdma_errors { + RDMAP_INV_STAG = 0x00, + RDMAP_INV_BOUNDS = 0x01, + RDMAP_ACCESS = 0x02, + RDMAP_UNASSOC_STAG = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_RDMAP_VER = 0x05, + RDMAP_UNEXPECTED_OP = 0x06, + RDMAP_CATASTROPHIC_LOCAL = 0x07, + RDMAP_CATASTROPHIC_GLOBAL = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum nes_term_ddp_errors { + DDP_CATASTROPHIC_LOCAL = 0x00, + DDP_TAGGED_INV_STAG = 0x00, + DDP_TAGGED_BOUNDS = 0x01, + DDP_TAGGED_UNASSOC_STAG = 0x02, + DDP_TAGGED_TO_WRAP = 0x03, + DDP_TAGGED_INV_DDP_VER = 0x04, + DDP_UNTAGGED_INV_QN = 0x01, + DDP_UNTAGGED_INV_MSN_NO_BUF = 0x02, + DDP_UNTAGGED_INV_MSN_RANGE = 0x03, + DDP_UNTAGGED_INV_MO = 0x04, + DDP_UNTAGGED_INV_TOO_LONG = 0x05, + DDP_UNTAGGED_INV_DDP_VER = 0x06 +}; + +enum nes_term_mpa_errors { + MPA_CLOSED = 0x01, + MPA_CRC = 0x02, + MPA_MARKER = 0x03, + MPA_REQ_RSP = 0x04, +}; + +struct nes_terminate_hdr { + u8 layer_etype; + u8 error_code; + u8 hdrct; + u8 rsvd; +}; + +/* Used to determine how to fill in terminate error codes */ +#define IWARP_OPCODE_WRITE 0 +#define IWARP_OPCODE_READREQ 1 +#define IWARP_OPCODE_READRSP 2 +#define 
IWARP_OPCODE_SEND 3 +#define IWARP_OPCODE_SEND_INV 4 +#define IWARP_OPCODE_SEND_SE 5 +#define IWARP_OPCODE_SEND_SE_INV 6 +#define IWARP_OPCODE_TERM 7 + +/* These values are used only during terminate processing */ +#define TERM_DDP_LEN_TAGGED 14 +#define TERM_DDP_LEN_UNTAGGED 18 +#define TERM_RDMA_LEN 28 +#define RDMA_OPCODE_MASK 0x0f +#define RDMA_READ_REQ_OPCODE 1 +#define BAD_FRAME_OFFSET 64 +#define CQE_MAJOR_DRV 0x8000 + +/* Used for link status recheck after interrupt processing */ +#define NES_LINK_RECHECK_DELAY msecs_to_jiffies(50) +#define NES_LINK_RECHECK_MAX 60 + +#endif /* __NES_HW_H */ diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c new file mode 100644 index 0000000..21e0ebd --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_mgt.c @@ -0,0 +1,1156 @@ +/* + * Copyright (c) 2006 - 2011 Intel-NE, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include "nes.h" +#include "nes_mgt.h" + +atomic_t pau_qps_created; +atomic_t pau_qps_destroyed; + +static void nes_replenish_mgt_rq(struct nes_vnic_mgt *mgtvnic) +{ + unsigned long flags; + dma_addr_t bus_address; + struct sk_buff *skb; + struct nes_hw_nic_rq_wqe *nic_rqe; + struct nes_hw_mgt *nesmgt; + struct nes_device *nesdev; + struct nes_rskb_cb *cb; + u32 rx_wqes_posted = 0; + + nesmgt = &mgtvnic->mgt; + nesdev = mgtvnic->nesvnic->nesdev; + spin_lock_irqsave(&nesmgt->rq_lock, flags); + if (nesmgt->replenishing_rq != 0) { + if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) && + (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) { + atomic_set(&mgtvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */ + add_timer(&mgtvnic->rq_wqes_timer); + } else { + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + } + return; + } + nesmgt->replenishing_rq = 1; + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + do { + skb = dev_alloc_skb(mgtvnic->nesvnic->max_frame_size); + if (skb) { + skb->dev = mgtvnic->nesvnic->netdev; + + bus_address = pci_map_single(nesdev->pcidev, + skb->data, mgtvnic->nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = bus_address; + cb->maplen = mgtvnic->nesvnic->max_frame_size; + + nic_rqe = &nesmgt->rq_vbase[mgtvnic->mgt.rq_head]; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = + cpu_to_le32(mgtvnic->nesvnic->max_frame_size); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = + cpu_to_le32((u32)bus_address); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = + cpu_to_le32((u32)((u64)bus_address >> 32)); + nesmgt->rx_skb[nesmgt->rq_head] = skb; + nesmgt->rq_head++; + nesmgt->rq_head &= nesmgt->rq_size - 1; + atomic_dec(&mgtvnic->rx_skbs_needed); + barrier(); + if (++rx_wqes_posted == 255) { + nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id); + rx_wqes_posted = 0; + } + } else { + spin_lock_irqsave(&nesmgt->rq_lock, flags); + if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) && + (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) { + atomic_set(&mgtvnic->rx_skb_timer_running, 1); + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */ + add_timer(&mgtvnic->rq_wqes_timer); + } else { + spin_unlock_irqrestore(&nesmgt->rq_lock, flags); + } + break; + } + } while (atomic_read(&mgtvnic->rx_skbs_needed)); + barrier(); + if (rx_wqes_posted) + nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id); + nesmgt->replenishing_rq = 0; +} + +/** + * nes_mgt_rq_wqes_timeout + */ +static void nes_mgt_rq_wqes_timeout(struct timer_list *t) +{ + struct nes_vnic_mgt *mgtvnic = from_timer(mgtvnic, t, + rq_wqes_timer); + + atomic_set(&mgtvnic->rx_skb_timer_running, 0); + if (atomic_read(&mgtvnic->rx_skbs_needed)) + nes_replenish_mgt_rq(mgtvnic); +} + +/** + * nes_mgt_free_skb - unmap and free skb + */ +static void nes_mgt_free_skb(struct nes_device *nesdev, struct sk_buff *skb, u32 dir) +{ + struct nes_rskb_cb *cb; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, dir); + cb->busaddr = 0; + dev_kfree_skb_any(skb); +} + +/** + * nes_download_callback - handle download completions + */ 
+static void nes_download_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + struct pau_fpdu_info *fpdu_info = cqp_request->cqp_callback_pointer; + struct nes_qp *nesqp = fpdu_info->nesqp; + struct sk_buff *skb; + int i; + + for (i = 0; i < fpdu_info->frag_cnt; i++) { + skb = fpdu_info->frags[i].skb; + if (fpdu_info->frags[i].cmplt) { + nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + } + } + + if (fpdu_info->hdr_vbase) + pci_free_consistent(nesdev->pcidev, fpdu_info->hdr_len, + fpdu_info->hdr_vbase, fpdu_info->hdr_pbase); + kfree(fpdu_info); +} + +/** + * nes_get_seq - Get the seq, ack_seq and window from the packet + */ +static u32 nes_get_seq(struct sk_buff *skb, u32 *ack, u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd) +{ + struct nes_rskb_cb *cb = (struct nes_rskb_cb *)&skb->cb[0]; + struct iphdr *iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + struct tcphdr *tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + + *ack = be32_to_cpu(tcph->ack_seq); + *wnd = be16_to_cpu(tcph->window); + *fin_rcvd = tcph->fin; + *rst_rcvd = tcph->rst; + return be32_to_cpu(tcph->seq); +} + +/** + * nes_get_next_skb - Get the next skb based on where current skb is in the queue + */ +static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp *nesqp, + struct sk_buff *skb, u32 nextseq, u32 *ack, + u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd) +{ + u32 seq; + bool processacks; + struct sk_buff *old_skb; + + if (skb) { + /* Continue processing fpdu */ + if (skb->next == (struct sk_buff *)&nesqp->pau_list) + goto out; + skb = skb->next; + processacks = false; + } else { + /* Starting a new one */ + if (skb_queue_empty(&nesqp->pau_list)) + goto out; + skb = skb_peek(&nesqp->pau_list); + processacks = true; + } + + while (1) { + if (skb_queue_empty(&nesqp->pau_list)) + goto out; + + seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd); + if (seq == nextseq) { + if (skb->len || processacks) + break; + } else if (after(seq, nextseq)) { + goto out; + } + + old_skb = skb; + skb = skb->next; + skb_unlink(old_skb, &nesqp->pau_list); + nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + if (skb == (struct sk_buff *)&nesqp->pau_list) + goto out; + } + return skb; + +out: + return NULL; +} + +/** + * get_fpdu_info - Find the next complete fpdu and return its fragments. 
+ */ +static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp, + struct pau_fpdu_info **pau_fpdu_info) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *tcph; + struct nes_rskb_cb *cb; + struct pau_fpdu_info *fpdu_info = NULL; + struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; + u32 fpdu_len = 0; + u32 tmp_len; + int frag_cnt = 0; + u32 tot_len; + u32 frag_tot; + u32 ack; + u32 fin_rcvd; + u32 rst_rcvd; + u16 wnd; + int i; + int rc = 0; + + *pau_fpdu_info = NULL; + + skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd); + if (!skb) + goto out; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + if (skb->len) { + fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING; + fpdu_len = (fpdu_len + 3) & 0xfffffffc; + tmp_len = fpdu_len; + + /* See if we have all of the fpdu */ + frag_tot = 0; + memset(&frags, 0, sizeof frags); + for (i = 0; i < MAX_FPDU_FRAGS; i++) { + frags[i].physaddr = cb->busaddr; + frags[i].physaddr += skb->data - cb->data_start; + frags[i].frag_len = min(tmp_len, skb->len); + frags[i].skb = skb; + frags[i].cmplt = (skb->len == frags[i].frag_len); + frag_tot += frags[i].frag_len; + frag_cnt++; + + tmp_len -= frags[i].frag_len; + if (tmp_len == 0) + break; + + skb = nes_get_next_skb(nesdev, nesqp, skb, + nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd); + if (!skb) + goto out; + if (rst_rcvd) { + /* rst received in the middle of fpdu */ + for (; i >= 0; i--) { + skb_unlink(frags[i].skb, &nesqp->pau_list); + nes_mgt_free_skb(nesdev, frags[i].skb, PCI_DMA_TODEVICE); + } + cb = (struct nes_rskb_cb *)&skb->cb[0]; + frags[0].physaddr = cb->busaddr; + frags[0].physaddr += skb->data - cb->data_start; + frags[0].frag_len = skb->len; + frags[0].skb = skb; + frags[0].cmplt = true; + frag_cnt = 1; + break; + } + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + } + } else { + /* no data */ + frags[0].physaddr = cb->busaddr; + frags[0].frag_len = 0; + frags[0].skb = skb; + frags[0].cmplt = true; + frag_cnt = 1; + } + + /* Found one */ + fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC); + if (!fpdu_info) { + rc = -ENOMEM; + goto out; + } + + fpdu_info->cqp_request = nes_get_cqp_request(nesdev); + if (fpdu_info->cqp_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + rc = -ENOMEM; + goto out; + } + + cb = (struct nes_rskb_cb *)&frags[0].skb->cb[0]; + iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + fpdu_info->hdr_len = (((unsigned char *)tcph) + 4 * (tcph->doff)) - cb->data_start; + fpdu_info->data_len = fpdu_len; + tot_len = fpdu_info->hdr_len + fpdu_len - ETH_HLEN; + + if (frags[0].cmplt) { + fpdu_info->hdr_pbase = cb->busaddr; + fpdu_info->hdr_vbase = NULL; + } else { + fpdu_info->hdr_vbase = pci_alloc_consistent(nesdev->pcidev, + fpdu_info->hdr_len, &fpdu_info->hdr_pbase); + if (!fpdu_info->hdr_vbase) { + nes_debug(NES_DBG_PAU, "Unable to allocate memory for pau first frag\n"); + rc = -ENOMEM; + goto out; + } + + /* Copy hdrs, adjusting len and seqnum */ + memcpy(fpdu_info->hdr_vbase, cb->data_start, fpdu_info->hdr_len); + iph = (struct iphdr *)(fpdu_info->hdr_vbase + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + } + + iph->tot_len = cpu_to_be16(tot_len); + iph->saddr = cpu_to_be32(0x7f000001); + + tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt); + tcph->ack_seq = cpu_to_be32(ack); + tcph->window = cpu_to_be16(wnd); + + nesqp->pau_rcv_nxt += fpdu_len + fin_rcvd; + + memcpy(fpdu_info->frags, 
frags, sizeof(fpdu_info->frags)); + fpdu_info->frag_cnt = frag_cnt; + fpdu_info->nesqp = nesqp; + *pau_fpdu_info = fpdu_info; + + /* Update skb's for next pass */ + for (i = 0; i < frag_cnt; i++) { + cb = (struct nes_rskb_cb *)&frags[i].skb->cb[0]; + skb_pull(frags[i].skb, frags[i].frag_len); + + if (frags[i].skb->len == 0) { + /* Pull skb off the list - it will be freed in the callback */ + if (!skb_queue_empty(&nesqp->pau_list)) + skb_unlink(frags[i].skb, &nesqp->pau_list); + } else { + /* Last skb still has data so update the seq */ + iph = (struct iphdr *)(cb->data_start + ETH_HLEN); + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt); + } + } + +out: + if (rc) { + if (fpdu_info) { + if (fpdu_info->cqp_request) + nes_put_cqp_request(nesdev, fpdu_info->cqp_request); + kfree(fpdu_info); + } + } + return rc; +} + +/** + * forward_fpdu - send complete fpdus, one at a time + */ +static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct pau_fpdu_info *fpdu_info; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + u64 u64tmp; + u32 u32tmp; + int rc; + + while (1) { + spin_lock_irqsave(&nesqp->pau_lock, flags); + rc = get_fpdu_info(nesdev, nesqp, &fpdu_info); + if (rc || (fpdu_info == NULL)) { + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + return rc; + } + + cqp_request = fpdu_info->cqp_request; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_OPCODE_IDX, + NES_CQP_DOWNLOAD_SEGMENT | + (((u32)nesvnic->logical_port) << NES_CQP_OP_LOGICAL_PORT_SHIFT)); + + u32tmp = fpdu_info->hdr_len << 16; + u32tmp |= fpdu_info->hdr_len + (u32)fpdu_info->data_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX, + u32tmp); + + u32tmp = (fpdu_info->frags[1].frag_len << 16) | fpdu_info->frags[0].frag_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_2_1_IDX, + u32tmp); + + u32tmp = (fpdu_info->frags[3].frag_len << 16) | fpdu_info->frags[2].frag_len; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_4_3_IDX, + u32tmp); + + u64tmp = (u64)fpdu_info->hdr_pbase; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX, + lower_32_bits(u64tmp)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX, + upper_32_bits(u64tmp)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + lower_32_bits(fpdu_info->frags[0].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_HIGH_IDX, + upper_32_bits(fpdu_info->frags[0].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_LOW_IDX, + lower_32_bits(fpdu_info->frags[1].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_HIGH_IDX, + upper_32_bits(fpdu_info->frags[1].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_LOW_IDX, + lower_32_bits(fpdu_info->frags[2].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_HIGH_IDX, + upper_32_bits(fpdu_info->frags[2].physaddr)); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_LOW_IDX, + lower_32_bits(fpdu_info->frags[3].physaddr)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_HIGH_IDX, + upper_32_bits(fpdu_info->frags[3].physaddr)); + + cqp_request->cqp_callback_pointer = fpdu_info; + cqp_request->callback = 1; + 
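+		/* nes_download_callback() (above) runs on CQP completion: it unmaps and frees the fully consumed skbs, drops their cm_node references and releases fpdu_info */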
cqp_request->cqp_callback = nes_download_callback; + + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + } + + return 0; +} + +static void process_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + int again = 1; + unsigned long flags; + + do { + /* Ignore rc - if it failed, tcp retries will cause it to try again */ + forward_fpdus(nesvnic, nesqp); + + spin_lock_irqsave(&nesqp->pau_lock, flags); + if (nesqp->pau_pending) { + nesqp->pau_pending = 0; + } else { + nesqp->pau_busy = 0; + again = 0; + } + + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + } while (again); +} + +/** + * queue_fpdus - Handle fpdu's that hw passed up to sw + */ +static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct sk_buff *tmpskb; + struct nes_rskb_cb *cb; + struct iphdr *iph; + struct tcphdr *tcph; + unsigned char *tcph_end; + u32 rcv_nxt; + u32 rcv_wnd; + u32 seqnum; + u32 len; + bool process_it = false; + unsigned long flags; + + /* Move data ptr to after tcp header */ + iph = (struct iphdr *)skb->data; + tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl)); + seqnum = be32_to_cpu(tcph->seq); + tcph_end = (((char *)tcph) + (4 * tcph->doff)); + + len = be16_to_cpu(iph->tot_len); + if (skb->len > len) + skb_trim(skb, len); + skb_pull(skb, tcph_end - skb->data); + + /* Initialize tracking values */ + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->seqnum = seqnum; + + /* Make sure data is in the receive window */ + rcv_nxt = nesqp->pau_rcv_nxt; + rcv_wnd = le32_to_cpu(nesqp->nesqp_context->rcv_wnd); + if (!between(seqnum, rcv_nxt, (rcv_nxt + rcv_wnd))) { + nes_mgt_free_skb(nesvnic->nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + return; + } + + spin_lock_irqsave(&nesqp->pau_lock, flags); + + if (nesqp->pau_busy) + nesqp->pau_pending = 1; + else + nesqp->pau_busy = 1; + + /* Queue skb by sequence number */ + if (skb_queue_len(&nesqp->pau_list) == 0) { + skb_queue_head(&nesqp->pau_list, skb); + } else { + tmpskb = nesqp->pau_list.next; + while (tmpskb != (struct sk_buff *)&nesqp->pau_list) { + cb = (struct nes_rskb_cb *)&tmpskb->cb[0]; + if (before(seqnum, cb->seqnum)) + break; + tmpskb = tmpskb->next; + } + skb_insert(tmpskb, skb, &nesqp->pau_list); + } + if (nesqp->pau_state == PAU_READY) + process_it = true; + spin_unlock_irqrestore(&nesqp->pau_lock, flags); + + if (process_it) + process_fpdus(nesvnic, nesqp); + + return; +} + +/** + * mgt_thread - Handle mgt skbs in a safe context + */ +static int mgt_thread(void *context) +{ + struct nes_vnic *nesvnic = context; + struct sk_buff *skb; + struct nes_rskb_cb *cb; + + while (!kthread_should_stop()) { + wait_event_interruptible(nesvnic->mgt_wait_queue, + skb_queue_len(&nesvnic->mgt_skb_list) || kthread_should_stop()); + while ((skb_queue_len(&nesvnic->mgt_skb_list)) && !kthread_should_stop()) { + skb = skb_dequeue(&nesvnic->mgt_skb_list); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->data_start = skb->data - ETH_HLEN; + cb->busaddr = pci_map_single(nesvnic->nesdev->pcidev, cb->data_start, + nesvnic->max_frame_size, PCI_DMA_TODEVICE); + queue_fpdus(skb, nesvnic, cb->nesqp); + } + } + + /* Closing down so delete any entries on the queue */ + while (skb_queue_len(&nesvnic->mgt_skb_list)) { + skb = skb_dequeue(&nesvnic->mgt_skb_list); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + nes_rem_ref_cm_node(cb->nesqp->cm_node); + dev_kfree_skb_any(skb); + } + return 0; +} + +/** + * nes_queue_skbs - Queue 
skb so it can be handled in a thread context + */ +void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_rskb_cb *cb; + + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->nesqp = nesqp; + skb_queue_tail(&nesvnic->mgt_skb_list, skb); + wake_up_interruptible(&nesvnic->mgt_wait_queue); +} + +void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp) +{ + struct sk_buff *skb; + unsigned long flags; + atomic_inc(&pau_qps_destroyed); + + /* Free packets that have not yet been forwarded */ + /* Lock is acquired by skb_dequeue when removing the skb */ + spin_lock_irqsave(&nesqp->pau_lock, flags); + while (skb_queue_len(&nesqp->pau_list)) { + skb = skb_dequeue(&nesqp->pau_list); + nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE); + nes_rem_ref_cm_node(nesqp->cm_node); + } + spin_unlock_irqrestore(&nesqp->pau_lock, flags); +} + +static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request *cqp_request) +{ + struct pau_qh_chg *qh_chg = cqp_request->cqp_callback_pointer; + struct nes_cqp_request *new_request; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_adapter *nesadapter; + struct nes_qp *nesqp; + struct nes_v4_quad nes_quad; + u32 crc_value; + u64 u64temp; + + nesadapter = nesdev->nesadapter; + nesqp = qh_chg->nesqp; + + /* Should we handle the bad completion */ + if (cqp_request->major_code) + WARN(1, PFX "Invalid cqp_request major_code=0x%x\n", + cqp_request->major_code); + + switch (nesqp->pau_state) { + case PAU_DEL_QH: + /* Old hash code deleted, now set the new one */ + nesqp->pau_state = PAU_ADD_LB_QH; + new_request = nes_get_cqp_request(nesdev); + if (new_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a new_request.\n"); + WARN_ON(1); + return; + } + + memset(&nes_quad, 0, sizeof(nes_quad)); + nes_quad.DstIpAdrIndex = + cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); + nes_quad.SrcIpadr = cpu_to_be32(0x7f000001); + nes_quad.TcpPorts[0] = swab16(nesqp->nesqp_context->tcpPorts[1]); + nes_quad.TcpPorts[1] = swab16(nesqp->nesqp_context->tcpPorts[0]); + + /* Produce hash key */ + crc_value = get_crc_value(&nes_quad); + nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff); + nes_debug(NES_DBG_PAU, "new HTE Index = 0x%08X, CRC = 0x%08X\n", + nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask); + + nesqp->hte_index &= nesadapter->hte_index_mask; + nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index); + nesqp->nesqp_context->ip0 = cpu_to_le32(0x7f000001); + nesqp->nesqp_context->rcv_nxt = cpu_to_le32(nesqp->pau_rcv_nxt); + + cqp_wqe = &new_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, + NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | + NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + nes_debug(NES_DBG_PAU, "Waiting for CQP completion for adding the quad hash.\n"); + + new_request->cqp_callback_pointer = qh_chg; + new_request->callback = 1; + new_request->cqp_callback = nes_chg_qh_handler; + atomic_set(&new_request->refcount, 1); + nes_post_cqp_request(nesdev, new_request); + break; + + case PAU_ADD_LB_QH: + /* Start processing the queued fpdu's */ + nesqp->pau_state = PAU_READY; + process_fpdus(qh_chg->nesvnic, qh_chg->nesqp); + kfree(qh_chg); + break; + } +} + +/** + 
* nes_change_quad_hash + */ +static int nes_change_quad_hash(struct nes_device *nesdev, + struct nes_vnic *nesvnic, struct nes_qp *nesqp) +{ + struct nes_cqp_request *cqp_request = NULL; + struct pau_qh_chg *qh_chg = NULL; + u64 u64temp; + struct nes_hw_cqp_wqe *cqp_wqe; + int ret = 0; + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n"); + ret = -ENOMEM; + goto chg_qh_err; + } + + qh_chg = kmalloc(sizeof *qh_chg, GFP_ATOMIC); + if (!qh_chg) { + ret = -ENOMEM; + goto chg_qh_err; + } + qh_chg->nesdev = nesdev; + qh_chg->nesvnic = nesvnic; + qh_chg->nesqp = nesqp; + nesqp->pau_state = PAU_DEL_QH; + + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, + NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | NES_CQP_QP_DEL_HTE | + NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + nes_debug(NES_DBG_PAU, "Waiting for CQP completion for deleting the quad hash.\n"); + + cqp_request->cqp_callback_pointer = qh_chg; + cqp_request->callback = 1; + cqp_request->cqp_callback = nes_chg_qh_handler; + atomic_set(&cqp_request->refcount, 1); + nes_post_cqp_request(nesdev, cqp_request); + + return ret; + +chg_qh_err: + kfree(qh_chg); + if (cqp_request) + nes_put_cqp_request(nesdev, cqp_request); + return ret; +} + +/** + * nes_mgt_ce_handler + * This management code deals with any packed and unaligned (pau) fpdu's + * that the hardware cannot handle. + */ +static void nes_mgt_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) +{ + struct nes_vnic_mgt *mgtvnic = container_of(cq, struct nes_vnic_mgt, mgt_cq); + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 head; + u32 cq_size; + u32 cqe_count = 0; + u32 cqe_misc; + u32 qp_id = 0; + u32 skbs_needed; + unsigned long context; + struct nes_qp *nesqp; + struct sk_buff *rx_skb; + struct nes_rskb_cb *cb; + + head = cq->cq_head; + cq_size = cq->cq_size; + + while (1) { + cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]); + if (!(cqe_misc & NES_NIC_CQE_VALID)) + break; + + nesqp = NULL; + if (cqe_misc & NES_NIC_CQE_ACCQP_VALID) { + qp_id = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_ACCQP_ID_IDX]); + qp_id &= 0x001fffff; + if (qp_id < nesadapter->max_qp) { + context = (unsigned long)nesadapter->qp_table[qp_id - NES_FIRST_QPN]; + nesqp = (struct nes_qp *)context; + } + } + + if (nesqp) { + if (nesqp->pau_mode == false) { + nesqp->pau_mode = true; /* First time for this qp */ + nesqp->pau_rcv_nxt = le32_to_cpu( + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_HASH_RCVNXT]); + skb_queue_head_init(&nesqp->pau_list); + spin_lock_init(&nesqp->pau_lock); + atomic_inc(&pau_qps_created); + nes_change_quad_hash(nesdev, mgtvnic->nesvnic, nesqp); + } + + rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail]; + rx_skb->len = 0; + skb_put(rx_skb, cqe_misc & 0x0000ffff); + rx_skb->protocol = eth_type_trans(rx_skb, mgtvnic->nesvnic->netdev); + cb = (struct nes_rskb_cb *)&rx_skb->cb[0]; + pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, PCI_DMA_FROMDEVICE); + cb->busaddr = 0; + mgtvnic->mgt.rq_tail++; + mgtvnic->mgt.rq_tail &= mgtvnic->mgt.rq_size - 1; + + nes_add_ref_cm_node(nesqp->cm_node); + nes_queue_mgt_skbs(rx_skb, mgtvnic->nesvnic, nesqp); 
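+				/* The skb now belongs to the mgt kthread, which maps it and reassembles FPDUs via queue_fpdus() in process context */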
+ } else { + printk(KERN_ERR PFX "Invalid QP %d for packed/unaligned handling\n", qp_id); + } + + cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0; + cqe_count++; + if (++head >= cq_size) + head = 0; + + if (cqe_count == 255) { + /* Replenish mgt CQ */ + nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16)); + nesdev->currcq_count += cqe_count; + cqe_count = 0; + } + + skbs_needed = atomic_inc_return(&mgtvnic->rx_skbs_needed); + if (skbs_needed > (mgtvnic->mgt.rq_size >> 1)) + nes_replenish_mgt_rq(mgtvnic); + } + + cq->cq_head = head; + nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + cq->cq_number | (cqe_count << 16)); + nes_read32(nesdev->regs + NES_CQE_ALLOC); + nesdev->currcq_count += cqe_count; +} + +/** + * nes_init_mgt_qp + */ +int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic) +{ + struct nes_vnic_mgt *mgtvnic; + u32 counter; + void *vmem; + dma_addr_t pmem; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 cqp_head; + unsigned long flags; + struct nes_hw_nic_qp_context *mgt_context; + u64 u64temp; + struct nes_hw_nic_rq_wqe *mgt_rqe; + struct sk_buff *skb; + u32 wqe_count; + struct nes_rskb_cb *cb; + u32 mgt_mem_size; + void *mgt_vbase; + dma_addr_t mgt_pbase; + int i; + int ret; + + /* Allocate space the all mgt QPs once */ + mgtvnic = kzalloc(NES_MGT_QP_COUNT * sizeof(struct nes_vnic_mgt), GFP_KERNEL); + if (!mgtvnic) + return -ENOMEM; + + /* Allocate fragment, RQ, and CQ; Reuse CEQ based on the PCI function */ + /* We are not sending from this NIC so sq is not allocated */ + mgt_mem_size = 256 + + (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)) + + (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_cqe)) + + sizeof(struct nes_hw_nic_qp_context); + mgt_mem_size = (mgt_mem_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + mgt_vbase = pci_alloc_consistent(nesdev->pcidev, NES_MGT_QP_COUNT * mgt_mem_size, &mgt_pbase); + if (!mgt_vbase) { + kfree(mgtvnic); + nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt host descriptor rings\n"); + return -ENOMEM; + } + + nesvnic->mgt_mem_size = NES_MGT_QP_COUNT * mgt_mem_size; + nesvnic->mgt_vbase = mgt_vbase; + nesvnic->mgt_pbase = mgt_pbase; + + skb_queue_head_init(&nesvnic->mgt_skb_list); + init_waitqueue_head(&nesvnic->mgt_wait_queue); + nesvnic->mgt_thread = kthread_run(mgt_thread, nesvnic, "nes_mgt_thread"); + + for (i = 0; i < NES_MGT_QP_COUNT; i++) { + mgtvnic->nesvnic = nesvnic; + mgtvnic->mgt.qp_id = nesdev->mac_index + NES_MGT_QP_OFFSET + i; + memset(mgt_vbase, 0, mgt_mem_size); + nes_debug(NES_DBG_INIT, "Allocated mgt QP structures at %p (phys = %016lX), size = %u.\n", + mgt_vbase, (unsigned long)mgt_pbase, mgt_mem_size); + + vmem = (void *)(((unsigned long)mgt_vbase + (256 - 1)) & + ~(unsigned long)(256 - 1)); + pmem = (dma_addr_t)(((unsigned long long)mgt_pbase + (256 - 1)) & + ~(unsigned long long)(256 - 1)); + + spin_lock_init(&mgtvnic->mgt.rq_lock); + + /* setup the RQ */ + mgtvnic->mgt.rq_vbase = vmem; + mgtvnic->mgt.rq_pbase = pmem; + mgtvnic->mgt.rq_head = 0; + mgtvnic->mgt.rq_tail = 0; + mgtvnic->mgt.rq_size = NES_MGT_WQ_COUNT; + + /* setup the CQ */ + vmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)); + pmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)); + + mgtvnic->mgt_cq.cq_number = mgtvnic->mgt.qp_id; + mgtvnic->mgt_cq.cq_vbase = vmem; + mgtvnic->mgt_cq.cq_pbase = pmem; + mgtvnic->mgt_cq.cq_head = 0; + mgtvnic->mgt_cq.cq_size = NES_MGT_WQ_COUNT; + + mgtvnic->mgt_cq.ce_handler = nes_mgt_ce_handler; + + /* 
Send CreateCQ request to CQP */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + cqp_head = nesdev->cqp.sq_head; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32( + NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + ((u32)mgtvnic->mgt_cq.cq_size << 16)); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32( + mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16)); + u64temp = (u64)mgtvnic->mgt_cq.cq_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (unsigned long)&mgtvnic->mgt_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + /* Send CreateQP request to CQP */ + mgt_context = (void *)(&mgtvnic->mgt_cq.cq_vbase[mgtvnic->mgt_cq.cq_size]); + mgt_context->context_words[NES_NIC_CTX_MISC_IDX] = + cpu_to_le32((u32)NES_MGT_CTX_SIZE | + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12)); + nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n", + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE), + nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE)); + if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) + mgt_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE); + + u64temp = (u64)mgtvnic->mgt.rq_pbase; + mgt_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + mgt_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)mgtvnic->mgt.rq_pbase; + mgt_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + mgt_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | + NES_CQP_QP_TYPE_NIC); + cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(mgtvnic->mgt.qp_id); + u64temp = (u64)mgtvnic->mgt_cq.cq_pbase + + (mgtvnic->mgt_cq.cq_size * sizeof(struct nes_hw_nic_cqe)); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + nesdev->cqp.sq_head = cqp_head; + + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_INIT, "Waiting for create MGT QP%u to complete.\n", + mgtvnic->mgt.qp_id); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_INIT, "Create MGT QP%u completed, wait_event_timeout ret = %u.\n", + mgtvnic->mgt.qp_id, ret); + if (!ret) { + nes_debug(NES_DBG_INIT, "MGT QP%u create timeout expired\n", mgtvnic->mgt.qp_id); + if (i == 0) { + pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase, + nesvnic->mgt_pbase); + kfree(mgtvnic); + } else { + nes_destroy_mgt(nesvnic); + } + return -EIO; + } + + /* Populate the RQ */ + for (counter = 0; counter < (NES_MGT_WQ_COUNT - 1); counter++) { + skb = 
dev_alloc_skb(nesvnic->max_frame_size); + if (!skb) { + nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name); + return -ENOMEM; + } + + skb->dev = netdev; + + pmem = pci_map_single(nesdev->pcidev, skb->data, + nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); + cb = (struct nes_rskb_cb *)&skb->cb[0]; + cb->busaddr = pmem; + cb->maplen = nesvnic->max_frame_size; + + mgt_rqe = &mgtvnic->mgt.rq_vbase[counter]; + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32((u32)nesvnic->max_frame_size); + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); + mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); + mgtvnic->mgt.rx_skb[counter] = skb; + } + + timer_setup(&mgtvnic->rq_wqes_timer, nes_mgt_rq_wqes_timeout, + 0); + + wqe_count = NES_MGT_WQ_COUNT - 1; + mgtvnic->mgt.rq_head = wqe_count; + barrier(); + do { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs + NES_WQE_ALLOC, (counter << 24) | mgtvnic->mgt.qp_id); + } while (wqe_count); + + nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + mgtvnic->mgt_cq.cq_number); + nes_read32(nesdev->regs + NES_CQE_ALLOC); + + mgt_vbase += mgt_mem_size; + mgt_pbase += mgt_mem_size; + nesvnic->mgtvnic[i] = mgtvnic++; + } + return 0; +} + + +void nes_destroy_mgt(struct nes_vnic *nesvnic) +{ + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_vnic_mgt *mgtvnic; + struct nes_vnic_mgt *first_mgtvnic; + unsigned long flags; + struct nes_hw_cqp_wqe *cqp_wqe; + u32 cqp_head; + struct sk_buff *rx_skb; + int i; + int ret; + + kthread_stop(nesvnic->mgt_thread); + + /* Free remaining NIC receive buffers */ + first_mgtvnic = nesvnic->mgtvnic[0]; + for (i = 0; i < NES_MGT_QP_COUNT; i++) { + mgtvnic = nesvnic->mgtvnic[i]; + if (mgtvnic == NULL) + continue; + + while (mgtvnic->mgt.rq_head != mgtvnic->mgt.rq_tail) { + rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail]; + nes_mgt_free_skb(nesdev, rx_skb, PCI_DMA_FROMDEVICE); + mgtvnic->mgt.rq_tail++; + mgtvnic->mgt.rq_tail &= (mgtvnic->mgt.rq_size - 1); + } + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + /* Destroy NIC QP */ + cqp_head = nesdev->cqp.sq_head; + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + mgtvnic->mgt.qp_id); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head]; + + /* Destroy NIC CQ */ + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + (NES_CQP_DESTROY_CQ | ((u32)mgtvnic->mgt_cq.cq_size << 16))); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16))); + + if (++cqp_head >= nesdev->cqp.sq_size) + cqp_head = 0; + + nesdev->cqp.sq_head = cqp_head; + barrier(); + + /* Ring doorbell (2 WQEs) */ + nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id); + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u," + " cqp.sq_tail=%u, cqp.sq_size=%u\n", + cqp_head, nesdev->cqp.sq_head, + nesdev->cqp.sq_tail, nesdev->cqp.sq_size); + + ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head), + 
NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_SHUTDOWN, "Destroy MGT QP returned, wait_event_timeout ret = %u, cqp_head=%u," + " cqp.sq_head=%u, cqp.sq_tail=%u\n", + ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail); + if (!ret) + nes_debug(NES_DBG_SHUTDOWN, "MGT QP%u destroy timeout expired\n", + mgtvnic->mgt.qp_id); + + nesvnic->mgtvnic[i] = NULL; + } + + if (nesvnic->mgt_vbase) { + pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase, + nesvnic->mgt_pbase); + nesvnic->mgt_vbase = NULL; + nesvnic->mgt_pbase = 0; + } + + kfree(first_mgtvnic); +} diff --git a/drivers/infiniband/hw/nes/nes_mgt.h b/drivers/infiniband/hw/nes/nes_mgt.h new file mode 100644 index 0000000..4f7f701 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_mgt.h @@ -0,0 +1,97 @@ +/* +* Copyright (c) 2006 - 2011 Intel-NE, Inc. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenIB.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ + +#ifndef __NES_MGT_H +#define __NES_MGT_H + +#define MPA_FRAMING 6 /* length is 2 bytes, crc is 4 bytes */ + +int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic); +void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp); +void nes_destroy_mgt(struct nes_vnic *nesvnic); +void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp); + +struct nes_hw_mgt { + struct nes_hw_nic_rq_wqe *rq_vbase; /* virtual address of rq */ + dma_addr_t rq_pbase; /* PCI memory for host rings */ + struct sk_buff *rx_skb[NES_NIC_WQ_SIZE]; + u16 qp_id; + u16 sq_head; + u16 rq_head; + u16 rq_tail; + u16 rq_size; + u8 replenishing_rq; + u8 reserved; + spinlock_t rq_lock; +}; + +struct nes_vnic_mgt { + struct nes_vnic *nesvnic; + struct nes_hw_mgt mgt; + struct nes_hw_nic_cq mgt_cq; + atomic_t rx_skbs_needed; + struct timer_list rq_wqes_timer; + atomic_t rx_skb_timer_running; +}; + +#define MAX_FPDU_FRAGS 4 +struct pau_fpdu_frag { + struct sk_buff *skb; + u64 physaddr; + u32 frag_len; + bool cmplt; +}; + +struct pau_fpdu_info { + struct nes_qp *nesqp; + struct nes_cqp_request *cqp_request; + void *hdr_vbase; + dma_addr_t hdr_pbase; + int hdr_len; + u16 data_len; + u16 frag_cnt; + struct pau_fpdu_frag frags[MAX_FPDU_FRAGS]; +}; + +enum pau_qh_state { + PAU_DEL_QH, + PAU_ADD_LB_QH, + PAU_READY +}; + +struct pau_qh_chg { + struct nes_device *nesdev; + struct nes_vnic *nesvnic; + struct nes_qp *nesqp; +}; + +#endif /* __NES_MGT_H */ diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c new file mode 100644 index 0000000..007d5e8 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -0,0 +1,1872 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "nes.h" + +static struct nic_qp_map nic_qp_mapping_0[] = { + {16,0,0,1},{24,4,0,0},{28,8,0,0},{32,12,0,0}, + {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0}, + {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0}, + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_1[] = { + {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0}, + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_2[] = { + {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0} +}; + +static struct nic_qp_map nic_qp_mapping_3[] = { + {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map nic_qp_mapping_4[] = { + {28,8,0,0},{32,12,0,0} +}; + +static struct nic_qp_map nic_qp_mapping_5[] = { + {29,9,1,0},{33,13,1,0} +}; + +static struct nic_qp_map nic_qp_mapping_6[] = { + {30,10,2,0},{34,14,2,0} +}; + +static struct nic_qp_map nic_qp_mapping_7[] = { + {31,11,3,0},{35,15,3,0} +}; + +static struct nic_qp_map *nic_qp_mapping_per_function[] = { + nic_qp_mapping_0, nic_qp_mapping_1, nic_qp_mapping_2, nic_qp_mapping_3, + nic_qp_mapping_4, nic_qp_mapping_5, nic_qp_mapping_6, nic_qp_mapping_7 +}; + +static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK + | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN; +static int debug = -1; +static int nics_per_function = 1; + +/** + * nes_netdev_poll + */ +static int nes_netdev_poll(struct napi_struct *napi, int budget) +{ + struct nes_vnic *nesvnic = container_of(napi, struct nes_vnic, napi); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic_cq *nescq = &nesvnic->nic_cq; + + nesvnic->budget = budget; + nescq->cqes_pending = 0; + nescq->rx_cqes_completed = 0; + nescq->cqe_allocs_pending = 0; + nescq->rx_pkts_indicated = 0; + + nes_nic_ce_handler(nesdev, nescq); + + if (nescq->cqes_pending == 0) { + napi_complete(napi); + /* clear out completed cqes and arm */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + nescq->cq_number | (nescq->cqe_allocs_pending << 16)); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + } else { + /* clear out completed cqes but don't arm */ + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->cq_number | (nescq->cqe_allocs_pending << 16)); + nes_debug(NES_DBG_NETDEV, "%s: exiting with work pending\n", + nesvnic->netdev->name); + } + return nescq->rx_pkts_indicated; +} + + +/** + * nes_netdev_open - Activate the network interface; ifconfig + * ethx up. 
+ */ +static int nes_netdev_open(struct net_device *netdev) +{ + u32 macaddr_low; + u16 macaddr_high; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + int ret; + int i; + struct nes_vnic *first_nesvnic = NULL; + u32 nic_active_bit; + u32 nic_active; + struct list_head *list_pos, *list_temp; + unsigned long flags; + + assert(nesdev != NULL); + + if (nesvnic->netdev_open == 1) + return 0; + + if (netif_msg_ifup(nesvnic)) + printk(KERN_INFO PFX "%s: enabling interface\n", netdev->name); + + ret = nes_init_nic_qp(nesdev, netdev); + if (ret) { + return ret; + } + + netif_carrier_off(netdev); + netif_stop_queue(netdev); + + if ((!nesvnic->of_device_registered) && (nesvnic->rdma_enabled)) { + nesvnic->nesibdev = nes_init_ofa_device(netdev); + if (nesvnic->nesibdev == NULL) { + printk(KERN_ERR PFX "%s: nesvnic->nesibdev alloc failed", netdev->name); + } else { + nesvnic->nesibdev->nesvnic = nesvnic; + ret = nes_register_ofa_device(nesvnic->nesibdev); + if (ret) { + printk(KERN_ERR PFX "%s: Unable to register RDMA device, ret = %d\n", + netdev->name, ret); + } + } + } + /* Set packet filters */ + nic_active_bit = 1 << nesvnic->nic_index; + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); + + macaddr_high = ((u16)netdev->dev_addr[0]) << 8; + macaddr_high += (u16)netdev->dev_addr[1]; + + macaddr_low = ((u32)netdev->dev_addr[2]) << 24; + macaddr_low += ((u32)netdev->dev_addr[3]) << 16; + macaddr_low += ((u32)netdev->dev_addr[4]) << 8; + macaddr_low += (u32)netdev->dev_addr[5]; + + /* Program the various MAC regs */ + for (i = 0; i < NES_MAX_PORT_COUNT; i++) { + if (nesvnic->qp_nic_index[i] == 0xf) { + break; + } + nes_debug(NES_DBG_NETDEV, "i=%d, perfect filter table index= %d, PERF FILTER LOW" + " (Addr:%08X) = %08X, HIGH = %08X.\n", + i, nesvnic->qp_nic_index[i], + NES_IDX_PERFECT_FILTER_LOW+ + (nesvnic->qp_nic_index[i] * 8), + macaddr_low, + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8), + macaddr_low); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + } + + + nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT | + nesvnic->nic_cq.cq_number); + nes_read32(nesdev->regs+NES_CQE_ALLOC); + list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) { + first_nesvnic = container_of(list_pos, struct nes_vnic, list); + if (first_nesvnic->netdev_open == 1) + break; + } + if (first_nesvnic->netdev_open == 0) { + nes_debug(NES_DBG_INIT, "Setting up MAC interrupt mask.\n"); + nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK + (0x200 * nesdev->mac_index), + ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT | + NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR)); + first_nesvnic = nesvnic; + } + + if (first_nesvnic->linkup) { + /* Enable network packets */ + nesvnic->linkup = 1; + netif_start_queue(netdev); + netif_carrier_on(netdev); 
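+		/* Carrier and transmit queue are enabled at open time only when link is already reported up on this MAC */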
+ } + + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_SFP_D) { + nesdev->link_recheck = 1; + mod_delayed_work(system_wq, &nesdev->work, + NES_LINK_RECHECK_DELAY); + } + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + + spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags); + if (nesvnic->of_device_registered) { + nesdev->nesadapter->send_term_ok = 1; + if (nesvnic->linkup == 1) { + if (nesdev->iw_status == 0) { + nesdev->iw_status = 1; + nes_port_ibevent(nesvnic); + } + } else { + nesdev->iw_status = 0; + } + } + spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags); + + napi_enable(&nesvnic->napi); + nesvnic->netdev_open = 1; + + return 0; +} + + +/** + * nes_netdev_stop + */ +static int nes_netdev_stop(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 nic_active_mask; + u32 nic_active; + struct nes_vnic *first_nesvnic = NULL; + struct list_head *list_pos, *list_temp; + unsigned long flags; + + nes_debug(NES_DBG_SHUTDOWN, "nesvnic=%p, nesdev=%p, netdev=%p %s\n", + nesvnic, nesdev, netdev, netdev->name); + if (nesvnic->netdev_open == 0) + return 0; + + if (netif_msg_ifdown(nesvnic)) + printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name); + netif_carrier_off(netdev); + + /* Disable network packets */ + napi_disable(&nesvnic->napi); + netif_stop_queue(netdev); + list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) { + first_nesvnic = container_of(list_pos, struct nes_vnic, list); + if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic)) + break; + } + + if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic) && + (PCI_FUNC(first_nesvnic->nesdev->pcidev->devfn) != + PCI_FUNC(nesvnic->nesdev->pcidev->devfn))) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+ + (0x200*nesdev->mac_index), 0xffffffff); + nes_write_indexed(first_nesvnic->nesdev, + NES_IDX_MAC_INT_MASK+ + (0x200*first_nesvnic->nesdev->mac_index), + ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT | + NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR)); + } else { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+(0x200*nesdev->mac_index), 0xffffffff); + } + + nic_active_mask = ~((u32)(1 << nesvnic->nic_index)); + nes_write_indexed(nesdev, NES_IDX_PERFECT_FILTER_HIGH+ + (nesvnic->perfect_filter_index*8), 0); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON); + nic_active &= nic_active_mask; + nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); + + spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags); + if (nesvnic->of_device_registered) { + nesdev->nesadapter->send_term_ok = 0; + nesdev->iw_status = 0; + if (nesvnic->linkup == 1) + nes_port_ibevent(nesvnic); + } + 
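+	/* Quiesce the port event timer while port_ibevent_lock is held, before the NIC QP is destroyed below */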
del_timer_sync(&nesvnic->event_timer); + nesvnic->event_timer.function = NULL; + spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags); + + nes_destroy_nic_qp(nesvnic); + + nesvnic->netdev_open = 0; + + return 0; +} + + +/** + * nes_nic_send + */ +static bool nes_nic_send(struct sk_buff *skb, struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic *nesnic = &nesvnic->nic; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct tcphdr *tcph; + __le16 *wqe_fragment_length; + u32 wqe_misc; + u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */ + u16 skb_fragment_index; + dma_addr_t bus_address; + + nic_sqe = &nesnic->sq_vbase[nesnic->sq_head]; + wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + + /* setup the VLAN tag if present */ + if (skb_vlan_tag_present(skb)) { + nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", + netdev->name, skb_vlan_tag_get(skb)); + wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; + wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb); + } else + wqe_misc = 0; + + /* bump past the vlan tag */ + wqe_fragment_length++; + /* wqe_fragment_address = (u64 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX]; */ + wqe_misc |= NES_NIC_SQ_WQE_COMPLETION; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb_is_gso(skb)) { + tcph = tcp_hdr(skb); + /* nes_debug(NES_DBG_NIC_TX, "%s: TSO request... is_gso = %u seg size = %u\n", + netdev->name, skb_is_gso(skb), skb_shinfo(skb)->gso_size); */ + wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE | (u16)skb_shinfo(skb)->gso_size; + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX, + ((u32)tcph->doff) | + (((u32)(((unsigned char *)tcph) - skb->data)) << 4)); + } + } else { /* CHECKSUM_HW */ + wqe_misc |= NES_NIC_SQ_WQE_DISABLE_CHKSUM; + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX, + skb->len); + memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer, + skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), skb_headlen(skb))); + wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE), + skb_headlen(skb))); + wqe_fragment_length[1] = 0; + if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) { + if ((skb_shinfo(skb)->nr_frags + 1) > 4) { + nes_debug(NES_DBG_NIC_TX, "%s: Packet with %u fragments not sent, skb_headlen=%u\n", + netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb)); + kfree_skb(skb); + nesvnic->tx_sw_dropped++; + return false; + } + set_bit(nesnic->sq_head, nesnic->first_frag_overflow); + bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE, + skb_headlen(skb) - NES_FIRST_FRAG_SIZE, PCI_DMA_TODEVICE); + wqe_fragment_length[wqe_fragment_index++] = + cpu_to_le16(skb_headlen(skb) - NES_FIRST_FRAG_SIZE); + wqe_fragment_length[wqe_fragment_index] = 0; + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + ((u64)(bus_address))); + nesnic->tx_skb[nesnic->sq_head] = skb; + } + + if (skb_headlen(skb) == skb->len) { + if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) { + nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0; + nesnic->tx_skb[nesnic->sq_head] = skb; + } + } else { + /* Deal with Fragments */ + nesnic->tx_skb[nesnic->sq_head] = skb; + for (skb_fragment_index = 0; skb_fragment_index < skb_shinfo(skb)->nr_frags; + skb_fragment_index++) { + skb_frag_t *frag = + &skb_shinfo(skb)->frags[skb_fragment_index]; + bus_address = 
skb_frag_dma_map(&nesdev->pcidev->dev, + frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + wqe_fragment_length[wqe_fragment_index] = + cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[skb_fragment_index])); + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index), + bus_address); + wqe_fragment_index++; + if (wqe_fragment_index < 5) + wqe_fragment_length[wqe_fragment_index] = 0; + } + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc); + nesnic->sq_head++; + nesnic->sq_head &= nesnic->sq_size - 1; + return true; +} + + +/** + * nes_netdev_start_xmit + */ +static netdev_tx_t nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_hw_nic *nesnic = &nesvnic->nic; + struct nes_hw_nic_sq_wqe *nic_sqe; + struct tcphdr *tcph; + /* struct udphdr *udph; */ +#define NES_MAX_TSO_FRAGS MAX_SKB_FRAGS + /* 64K segment plus overflow on each side */ + dma_addr_t tso_bus_address[NES_MAX_TSO_FRAGS]; + dma_addr_t bus_address; + u32 tso_frag_index; + u32 tso_frag_count; + u32 tso_wqe_length; + u32 curr_tcp_seq; + u32 wqe_count=1; + struct iphdr *iph; + __le16 *wqe_fragment_length; + u32 nr_frags; + u32 original_first_length; + /* u64 *wqe_fragment_address; */ + /* first fragment (0) is used by copy buffer */ + u16 wqe_fragment_index=1; + u16 hoffset; + u16 nhoffset; + u16 wqes_needed; + u16 wqes_available; + u32 wqe_misc; + + /* + * nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," + * " (%u frags), tso_size=%u\n", + * netdev->name, skb->len, skb_headlen(skb), + * skb_shinfo(skb)->nr_frags, skb_is_gso(skb)); + */ + + if (netif_queue_stopped(netdev)) + return NETDEV_TX_BUSY; + + /* Check if SQ is full */ + if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) { + if (!netif_queue_stopped(netdev)) { + netif_stop_queue(netdev); + barrier(); + if ((((((volatile u16)nesnic->sq_tail)+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) != 1) { + netif_start_queue(netdev); + goto sq_no_longer_full; + } + } + nesvnic->sq_full++; + return NETDEV_TX_BUSY; + } + +sq_no_longer_full: + nr_frags = skb_shinfo(skb)->nr_frags; + if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) { + nr_frags++; + } + /* Check if too many fragments */ + if (unlikely((nr_frags > 4))) { + if (skb_is_gso(skb)) { + nesvnic->segmented_tso_requests++; + nesvnic->tso_requests++; + /* Basically 4 fragments available per WQE with extended fragments */ + wqes_needed = nr_frags >> 2; + wqes_needed += (nr_frags&3)?1:0; + wqes_available = (((nesnic->sq_tail+nesnic->sq_size)-nesnic->sq_head) - 1) & + (nesnic->sq_size - 1); + + if (unlikely(wqes_needed > wqes_available)) { + if (!netif_queue_stopped(netdev)) { + netif_stop_queue(netdev); + barrier(); + wqes_available = (((((volatile u16)nesnic->sq_tail)+nesnic->sq_size)-nesnic->sq_head) - 1) & + (nesnic->sq_size - 1); + if (wqes_needed <= wqes_available) { + netif_start_queue(netdev); + goto tso_sq_no_longer_full; + } + } + nesvnic->sq_full++; + nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n", + netdev->name); + return NETDEV_TX_BUSY; + } +tso_sq_no_longer_full: + /* Map all the buffers */ + for (tso_frag_count=0; tso_frag_count < skb_shinfo(skb)->nr_frags; + tso_frag_count++) { + skb_frag_t *frag = + &skb_shinfo(skb)->frags[tso_frag_count]; + tso_bus_address[tso_frag_count] = + skb_frag_dma_map(&nesdev->pcidev->dev, 
+ frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + } + + tso_frag_index = 0; + curr_tcp_seq = ntohl(tcp_hdr(skb)->seq); + hoffset = skb_transport_header(skb) - skb->data; + nhoffset = skb_network_header(skb) - skb->data; + original_first_length = hoffset + ((((struct tcphdr *)skb_transport_header(skb))->doff)<<2); + + for (wqe_count=0; wqe_count<((u32)wqes_needed); wqe_count++) { + tso_wqe_length = 0; + nic_sqe = &nesnic->sq_vbase[nesnic->sq_head]; + wqe_fragment_length = + (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; + /* setup the VLAN tag if present */ + if (skb_vlan_tag_present(skb)) { + nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", + netdev->name, + skb_vlan_tag_get(skb)); + wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; + wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb); + } else + wqe_misc = 0; + + /* bump past the vlan tag */ + wqe_fragment_length++; + + /* Assumes header totally fits in allocated buffer and is in first fragment */ + if (original_first_length > NES_FIRST_FRAG_SIZE) { + nes_debug(NES_DBG_NIC_TX, "ERROR: SKB header too big, headlen=%u, FIRST_FRAG_SIZE=%u\n", + original_first_length, NES_FIRST_FRAG_SIZE); + nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," + " (%u frags), is_gso = %u tso_size=%u\n", + netdev->name, + skb->len, skb_headlen(skb), + skb_shinfo(skb)->nr_frags, skb_is_gso(skb), skb_shinfo(skb)->gso_size); + } + memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer, + skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), + original_first_length)); + iph = (struct iphdr *) + (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[nhoffset]); + tcph = (struct tcphdr *) + (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[hoffset]); + if ((wqe_count+1)!=(u32)wqes_needed) { + tcph->fin = 0; + tcph->psh = 0; + tcph->rst = 0; + tcph->urg = 0; + } + if (wqe_count) { + tcph->syn = 0; + } + tcph->seq = htonl(curr_tcp_seq); + wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE), + original_first_length)); + + wqe_fragment_index = 1; + if ((wqe_count==0) && (skb_headlen(skb) > original_first_length)) { + set_bit(nesnic->sq_head, nesnic->first_frag_overflow); + bus_address = pci_map_single(nesdev->pcidev, skb->data + original_first_length, + skb_headlen(skb) - original_first_length, PCI_DMA_TODEVICE); + wqe_fragment_length[wqe_fragment_index++] = + cpu_to_le16(skb_headlen(skb) - original_first_length); + wqe_fragment_length[wqe_fragment_index] = 0; + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX, + bus_address); + tso_wqe_length += skb_headlen(skb) - + original_first_length; + } + while (wqe_fragment_index < 5) { + wqe_fragment_length[wqe_fragment_index] = + cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index])); + set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index), + (u64)tso_bus_address[tso_frag_index]); + wqe_fragment_index++; + tso_wqe_length += skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index++]); + if (wqe_fragment_index < 5) + wqe_fragment_length[wqe_fragment_index] = 0; + if (tso_frag_index == tso_frag_count) + break; + } + if ((wqe_count+1) == (u32)wqes_needed) { + nesnic->tx_skb[nesnic->sq_head] = skb; + } else { + nesnic->tx_skb[nesnic->sq_head] = NULL; + } + wqe_misc |= NES_NIC_SQ_WQE_COMPLETION | (u16)skb_shinfo(skb)->gso_size; + if ((tso_wqe_length + original_first_length) > skb_shinfo(skb)->gso_size) { + wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE; + } else { + iph->tot_len = 
htons(tso_wqe_length + original_first_length - nhoffset); + } + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, + wqe_misc); + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX, + ((u32)tcph->doff) | (((u32)hoffset) << 4)); + + set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX, + tso_wqe_length + original_first_length); + curr_tcp_seq += tso_wqe_length; + nesnic->sq_head++; + nesnic->sq_head &= nesnic->sq_size-1; + } + } else { + hoffset = skb_transport_header(skb) - skb->data; + nhoffset = skb_network_header(skb) - skb->data; + if (skb_linearize(skb)) { + nesvnic->tx_sw_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; + } + nesvnic->linearized_skbs++; + skb_set_transport_header(skb, hoffset); + skb_set_network_header(skb, nhoffset); + if (!nes_nic_send(skb, netdev)) + return NETDEV_TX_OK; + } + } else { + if (!nes_nic_send(skb, netdev)) + return NETDEV_TX_OK; + } + + barrier(); + + if (wqe_count) + nes_write32(nesdev->regs+NES_WQE_ALLOC, + (wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id); + + netif_trans_update(netdev); + + return NETDEV_TX_OK; +} + + +/** + * nes_netdev_get_stats + */ +static struct net_device_stats *nes_netdev_get_stats(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u64 u64temp; + u32 u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + (nesvnic->nic_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->endnode_nstat_rx_discard += u32temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_rx_octets += u64temp; + nesvnic->netstats.rx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_rx_frames += u64temp; + nesvnic->netstats.rx_packets += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_tx_octets += u64temp; + nesvnic->netstats.tx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + (nesvnic->nic_index*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32; + + nesvnic->endnode_nstat_tx_frames += u64temp; + nesvnic->netstats.tx_packets += u64temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_short_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_oversized_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + 
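[Editorial sketch] The SQ-full test at the top of nes_netdev_start_xmit is the usual power-of-two ring trick and is easy to misread in the flattened patch above. Below is a minimal standalone sketch of the same mask arithmetic, assuming the conventional producer-head/consumer-tail ring layout; the helper and names are hypothetical and not part of the patch.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* (((tail + 2*size) - head) & (size - 1)) is, in effect, the free-slot
 * count modulo the ring size when size is a power of two; the driver
 * treats a result of 1 (a single free entry left) as "queue full" and
 * stops the netdev queue, mirroring the check in nes_netdev_start_xmit. */
static bool nic_sq_full(uint16_t head, uint16_t tail, uint16_t size)
{
	return ((((uint32_t)tail + 2u * size) - head) & (size - 1u)) == 1;
}

int main(void)
{
	assert(nic_sq_full(255, 0, 256));   /* one free slot left -> full */
	assert(!nic_sq_full(0, 0, 256));    /* empty ring -> not full */
	return 0;
}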
nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_jabber_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_length_errors += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_crc_errors += u32temp; + nesvnic->netstats.rx_crc_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_tx_errors += u32temp; + nesvnic->netstats.tx_errors += u32temp; + + return &nesvnic->netstats; +} + + +/** + * nes_netdev_tx_timeout + */ +static void nes_netdev_tx_timeout(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + if (netif_msg_timer(nesvnic)) + nes_debug(NES_DBG_NIC_TX, "%s: tx timeout\n", netdev->name); +} + + +/** + * nes_netdev_set_mac_address + */ +static int nes_netdev_set_mac_address(struct net_device *netdev, void *p) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct sockaddr *mac_addr = p; + int i; + u32 macaddr_low; + u16 macaddr_high; + + if (!is_valid_ether_addr(mac_addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len); + printk(PFX "%s: Address length = %d, Address = %pM\n", + __func__, netdev->addr_len, mac_addr->sa_data); + macaddr_high = ((u16)netdev->dev_addr[0]) << 8; + macaddr_high += (u16)netdev->dev_addr[1]; + macaddr_low = ((u32)netdev->dev_addr[2]) << 24; + macaddr_low += ((u32)netdev->dev_addr[3]) << 16; + macaddr_low += ((u32)netdev->dev_addr[4]) << 8; + macaddr_low += (u32)netdev->dev_addr[5]; + + for (i = 0; i < NES_MAX_PORT_COUNT; i++) { + if (nesvnic->qp_nic_index[i] == 0xf) { + break; + } + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8), + macaddr_low); + nes_write_indexed(nesdev, + NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)nesvnic->nic_index) << 16))); + } + return 0; +} + + +static void set_allmulti(struct nes_device *nesdev, u32 nic_active_bit) +{ + u32 nic_active; + + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); +} + +#define get_addr(addrs, index) ((addrs) + (index) * ETH_ALEN) + +/** + * nes_netdev_set_multicast_list + */ +static void nes_netdev_set_multicast_list(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; + u32 nic_active_bit; + u32 nic_active; + u32 perfect_filter_register_address; + u32 macaddr_low; + u16 macaddr_high; + u8 mc_all_on = 0; + u8 mc_index; + int mc_nic_index = -1; + u8 pft_entries_preallocated = max(nesadapter->adapter_fcn_count * + 
nics_per_function, 4); + u8 max_pft_entries_avaiable = NES_PFT_SIZE - pft_entries_preallocated; + unsigned long flags; + int mc_count = netdev_mc_count(netdev); + + spin_lock_irqsave(&nesadapter->resource_lock, flags); + nic_active_bit = 1 << nesvnic->nic_index; + + if (netdev->flags & IFF_PROMISC) { + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active |= nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + mc_all_on = 1; + } else if ((netdev->flags & IFF_ALLMULTI) || + (nesvnic->nic_index > 3)) { + set_allmulti(nesdev, nic_active_bit); + mc_all_on = 1; + } else { + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active); + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active &= ~nic_active_bit; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + } + + nes_debug(NES_DBG_NIC_RX, "Number of MC entries = %d, Promiscuous = %d, All Multicast = %d.\n", + mc_count, !!(netdev->flags & IFF_PROMISC), + !!(netdev->flags & IFF_ALLMULTI)); + if (!mc_all_on) { + char *addrs; + int i; + struct netdev_hw_addr *ha; + + addrs = kmalloc(ETH_ALEN * mc_count, GFP_ATOMIC); + if (!addrs) { + set_allmulti(nesdev, nic_active_bit); + goto unlock; + } + i = 0; + netdev_for_each_mc_addr(ha, netdev) + memcpy(get_addr(addrs, i++), ha->addr, ETH_ALEN); + + perfect_filter_register_address = NES_IDX_PERFECT_FILTER_LOW + + pft_entries_preallocated * 0x8; + for (i = 0, mc_index = 0; mc_index < max_pft_entries_avaiable; + mc_index++) { + while (i < mc_count && nesvnic->mcrq_mcast_filter && + ((mc_nic_index = nesvnic->mcrq_mcast_filter(nesvnic, + get_addr(addrs, i++))) == 0)); + if (mc_nic_index < 0) + mc_nic_index = nesvnic->nic_index; + while (nesadapter->pft_mcast_map[mc_index] < 16 && + nesadapter->pft_mcast_map[mc_index] != + nesvnic->nic_index && + mc_index < max_pft_entries_avaiable) { + nes_debug(NES_DBG_NIC_RX, + "mc_index=%d skipping nic_index=%d, used for=%d\n", + mc_index, nesvnic->nic_index, + nesadapter->pft_mcast_map[mc_index]); + mc_index++; + } + if (mc_index >= max_pft_entries_avaiable) + break; + if (i < mc_count) { + char *addr = get_addr(addrs, i++); + + nes_debug(NES_DBG_NIC_RX, "Assigning MC Address %pM to register 0x%04X nic_idx=%d\n", + addr, + perfect_filter_register_address+(mc_index * 8), + mc_nic_index); + macaddr_high = ((u8) addr[0]) << 8; + macaddr_high += (u8) addr[1]; + macaddr_low = ((u8) addr[2]) << 24; + macaddr_low += ((u8) addr[3]) << 16; + macaddr_low += ((u8) addr[4]) << 8; + macaddr_low += (u8) addr[5]; + + nes_write_indexed(nesdev, + perfect_filter_register_address+(mc_index * 8), + macaddr_low); + nes_write_indexed(nesdev, + perfect_filter_register_address+4+(mc_index * 8), + (u32)macaddr_high | NES_MAC_ADDR_VALID | + ((((u32)(1<<mc_nic_index)) << 16))); + nesadapter->pft_mcast_map[mc_index] = + nesvnic->nic_index; + } else { + nes_debug(NES_DBG_NIC_RX, "Clearing MC Address at register 0x%04X\n", + perfect_filter_register_address+(mc_index * 8)); + nes_write_indexed(nesdev, + perfect_filter_register_address+4+(mc_index * 8), + 0); + nesadapter->pft_mcast_map[mc_index] = 255; + } + } + kfree(addrs); + /* PFT is not large enough */ + if (i < mc_count) + set_allmulti(nesdev, nic_active_bit); + } + +unlock: + 
spin_unlock_irqrestore(&nesadapter->resource_lock, flags); +} + + +/** + * nes_netdev_change_mtu + */ +static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u8 jumbomode = 0; + u32 nic_active; + u32 nic_active_bit; + u32 uc_all_active; + u32 mc_all_active; + + netdev->mtu = new_mtu; + nesvnic->max_frame_size = new_mtu + VLAN_ETH_HLEN; + + if (netdev->mtu > ETH_DATA_LEN) { + jumbomode=1; + } + nes_nic_init_timer_defaults(nesdev, jumbomode); + + if (netif_running(netdev)) { + nic_active_bit = 1 << nesvnic->nic_index; + mc_all_active = nes_read_indexed(nesdev, + NES_IDX_NIC_MULTICAST_ALL) & nic_active_bit; + uc_all_active = nes_read_indexed(nesdev, + NES_IDX_NIC_UNICAST_ALL) & nic_active_bit; + + nes_netdev_stop(netdev); + nes_netdev_open(netdev); + + nic_active = nes_read_indexed(nesdev, + NES_IDX_NIC_MULTICAST_ALL); + nic_active |= mc_all_active; + nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, + nic_active); + + nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL); + nic_active |= uc_all_active; + nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active); + } + + return 0; +} + + +static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = { + "Link Change Interrupts", + "Linearized SKBs", + "T/GSO Requests", + "Pause Frames Sent", + "Pause Frames Received", + "Internal Routing Errors", + "SQ SW Dropped SKBs", + "SQ Full", + "Segmented TSO Requests", + "Rx Symbol Errors", + "Rx Jabber Errors", + "Rx Oversized Frames", + "Rx Short Frames", + "Rx Length Errors", + "Rx CRC Errors", + "Rx Port Discard", + "Endnode Rx Discards", + "Endnode Rx Octets", + "Endnode Rx Frames", + "Endnode Tx Octets", + "Endnode Tx Frames", + "Tx Errors", + "mh detected", + "mh pauses", + "Retransmission Count", + "CM Connects", + "CM Accepts", + "Disconnects", + "Connected Events", + "Connect Requests", + "CM Rejects", + "ModifyQP Timeouts", + "CreateQPs", + "SW DestroyQPs", + "DestroyQPs", + "CM Closes", + "CM Packets Sent", + "CM Packets Bounced", + "CM Packets Created", + "CM Packets Rcvd", + "CM Packets Dropped", + "CM Packets Retrans", + "CM Listens Created", + "CM Listens Destroyed", + "CM Backlog Drops", + "CM Loopbacks", + "CM Nodes Created", + "CM Nodes Destroyed", + "CM Accel Drops", + "CM Resets Received", + "Free 4Kpbls", + "Free 256pbls", + "Timer Inits", + "PAU CreateQPs", + "PAU DestroyQPs", +}; +#define NES_ETHTOOL_STAT_COUNT ARRAY_SIZE(nes_ethtool_stringset) + + +/** + * nes_netdev_get_sset_count + */ +static int nes_netdev_get_sset_count(struct net_device *netdev, int stringset) +{ + if (stringset == ETH_SS_STATS) + return NES_ETHTOOL_STAT_COUNT; + else + return -EINVAL; +} + + +/** + * nes_netdev_get_strings + */ +static void nes_netdev_get_strings(struct net_device *netdev, u32 stringset, + u8 *ethtool_strings) +{ + if (stringset == ETH_SS_STATS) + memcpy(ethtool_strings, + &nes_ethtool_stringset, + sizeof(nes_ethtool_stringset)); +} + + +/** + * nes_netdev_get_ethtool_stats + */ + +static void nes_netdev_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *target_ethtool_stats, u64 *target_stat_values) +{ + u64 u64temp; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 nic_count; + u32 u32temp; + u32 index = 0; + + target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT; + target_stat_values[index] = 
nesvnic->nesdev->link_status_interrupts; + target_stat_values[++index] = nesvnic->linearized_skbs; + target_stat_values[++index] = nesvnic->tso_requests; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_pause_frames_sent += u32temp; + target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_sent; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_pause_frames_received += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PORT_RX_DISCARDS + (nesvnic->nesdev->mac_index*0x40)); + nesvnic->nesdev->port_rx_discards += u32temp; + nesvnic->netstats.rx_dropped += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PORT_TX_DISCARDS + (nesvnic->nesdev->mac_index*0x40)); + nesvnic->nesdev->port_tx_discards += u32temp; + nesvnic->netstats.tx_dropped += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_short_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_oversized_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_jabber_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->netstats.rx_length_errors += u32temp; + nesvnic->nesdev->mac_rx_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_rx_errors += u32temp; + nesvnic->nesdev->mac_rx_crc_errors += u32temp; + nesvnic->netstats.rx_crc_errors += u32temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200)); + nesvnic->nesdev->mac_tx_errors += u32temp; + nesvnic->netstats.tx_errors += u32temp; + + for (nic_count = 0; nic_count < NES_MAX_PORT_COUNT; nic_count++) { + if (nesvnic->qp_nic_index[nic_count] == 0xf) + break; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + + (nesvnic->qp_nic_index[nic_count]*0x200)); + nesvnic->netstats.rx_dropped += u32temp; + nesvnic->endnode_nstat_rx_discard += u32temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_rx_octets += u64temp; + nesvnic->netstats.rx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 
32; + + nesvnic->endnode_nstat_rx_frames += u64temp; + nesvnic->netstats.rx_packets += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_tx_octets += u64temp; + nesvnic->netstats.tx_bytes += u64temp; + + u64temp = (u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + + (nesvnic->qp_nic_index[nic_count]*0x200)); + u64temp += ((u64)nes_read_indexed(nesdev, + NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + + (nesvnic->qp_nic_index[nic_count]*0x200))) << 32; + + nesvnic->endnode_nstat_tx_frames += u64temp; + nesvnic->netstats.tx_packets += u64temp; + + u32temp = nes_read_indexed(nesdev, + NES_IDX_IPV4_TCP_REXMITS + (nesvnic->qp_nic_index[nic_count]*0x200)); + nesvnic->endnode_ipv4_tcp_retransmits += u32temp; + } + + target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_received; + target_stat_values[++index] = nesdev->nesadapter->nic_rx_eth_route_err; + target_stat_values[++index] = nesvnic->tx_sw_dropped; + target_stat_values[++index] = nesvnic->sq_full; + target_stat_values[++index] = nesvnic->segmented_tso_requests; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_symbol_err_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_jabber_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_oversized_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_short_frames; + target_stat_values[++index] = nesvnic->netstats.rx_length_errors; + target_stat_values[++index] = nesvnic->nesdev->mac_rx_crc_errors; + target_stat_values[++index] = nesvnic->nesdev->port_rx_discards; + target_stat_values[++index] = nesvnic->endnode_nstat_rx_discard; + target_stat_values[++index] = nesvnic->endnode_nstat_rx_octets; + target_stat_values[++index] = nesvnic->endnode_nstat_rx_frames; + target_stat_values[++index] = nesvnic->endnode_nstat_tx_octets; + target_stat_values[++index] = nesvnic->endnode_nstat_tx_frames; + target_stat_values[++index] = nesvnic->nesdev->mac_tx_errors; + target_stat_values[++index] = mh_detected; + target_stat_values[++index] = mh_pauses_sent; + target_stat_values[++index] = nesvnic->endnode_ipv4_tcp_retransmits; + target_stat_values[++index] = atomic_read(&cm_connects); + target_stat_values[++index] = atomic_read(&cm_accepts); + target_stat_values[++index] = atomic_read(&cm_disconnects); + target_stat_values[++index] = atomic_read(&cm_connecteds); + target_stat_values[++index] = atomic_read(&cm_connect_reqs); + target_stat_values[++index] = atomic_read(&cm_rejects); + target_stat_values[++index] = atomic_read(&mod_qp_timouts); + target_stat_values[++index] = atomic_read(&qps_created); + target_stat_values[++index] = atomic_read(&sw_qps_destroyed); + target_stat_values[++index] = atomic_read(&qps_destroyed); + target_stat_values[++index] = atomic_read(&cm_closes); + target_stat_values[++index] = cm_packets_sent; + target_stat_values[++index] = cm_packets_bounced; + target_stat_values[++index] = cm_packets_created; + target_stat_values[++index] = cm_packets_received; + target_stat_values[++index] = cm_packets_dropped; + target_stat_values[++index] = cm_packets_retrans; + target_stat_values[++index] = atomic_read(&cm_listens_created); + target_stat_values[++index] = atomic_read(&cm_listens_destroyed); + target_stat_values[++index] = cm_backlog_drops; + target_stat_values[++index] = 
atomic_read(&cm_loopbacks); + target_stat_values[++index] = atomic_read(&cm_nodes_created); + target_stat_values[++index] = atomic_read(&cm_nodes_destroyed); + target_stat_values[++index] = atomic_read(&cm_accel_dropped_pkts); + target_stat_values[++index] = atomic_read(&cm_resets_recvd); + target_stat_values[++index] = nesadapter->free_4kpbl; + target_stat_values[++index] = nesadapter->free_256pbl; + target_stat_values[++index] = int_mod_timer_init; + target_stat_values[++index] = atomic_read(&pau_qps_created); + target_stat_values[++index] = atomic_read(&pau_qps_destroyed); +} + +/** + * nes_netdev_get_drvinfo + */ +static void nes_netdev_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter; + + strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev), + sizeof(drvinfo->bus_info)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%u.%u", nesadapter->firmware_version >> 16, + nesadapter->firmware_version & 0x000000ff); + strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version)); +} + + +/** + * nes_netdev_set_coalesce + */ +static int nes_netdev_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et_coalesce) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + unsigned long flags; + + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + if (et_coalesce->rx_max_coalesced_frames_low) { + shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low; + } + if (et_coalesce->rx_max_coalesced_frames_irq) { + shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq; + } + if (et_coalesce->rx_max_coalesced_frames_high) { + shared_timer->threshold_high = et_coalesce->rx_max_coalesced_frames_high; + } + if (et_coalesce->rx_coalesce_usecs_low) { + shared_timer->timer_in_use_min = et_coalesce->rx_coalesce_usecs_low; + } + if (et_coalesce->rx_coalesce_usecs_high) { + shared_timer->timer_in_use_max = et_coalesce->rx_coalesce_usecs_high; + } + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + + /* using this to drive total interrupt moderation */ + nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq; + if (et_coalesce->use_adaptive_rx_coalesce) { + nesadapter->et_use_adaptive_rx_coalesce = 1; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; + nesadapter->et_rx_coalesce_usecs_irq = 0; + if (et_coalesce->pkt_rate_low) { + nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low; + } + } else { + nesadapter->et_use_adaptive_rx_coalesce = 0; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; + if (nesadapter->et_rx_coalesce_usecs_irq) { + nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, + 0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8))); + } + } + return 0; +} + + +/** + * nes_netdev_get_coalesce + */ +static int nes_netdev_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et_coalesce) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct ethtool_coalesce temp_et_coalesce; + struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; + unsigned long flags; + + 
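[Editorial sketch] The ethtool statistics path above relies on a purely positional contract: the i-th value filled in by .get_ethtool_stats is reported under the i-th name returned by .get_strings, which is why the long ++index chain in nes_netdev_get_ethtool_stats must follow nes_ethtool_stringset order exactly. The user-space sketch below illustrates that pairing; the first three names are copied from the string table, and the values are made up for the demonstration.

#include <stdio.h>

#define ETH_GSTRING_LEN 32	/* same per-name length ethtool uses */

/* first three entries copied from nes_ethtool_stringset */
static const char demo_strings[][ETH_GSTRING_LEN] = {
	"Link Change Interrupts",
	"Linearized SKBs",
	"T/GSO Requests",
};

int main(void)
{
	/* purely illustrative values; position i pairs with name i */
	unsigned long long demo_values[] = { 3, 17, 42 };
	size_t n = sizeof(demo_strings) / sizeof(demo_strings[0]);

	for (size_t i = 0; i < n; i++)
		printf("%-*s %llu\n", ETH_GSTRING_LEN, demo_strings[i], demo_values[i]);
	return 0;
}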
memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce)); + temp_et_coalesce.rx_coalesce_usecs_irq = nesadapter->et_rx_coalesce_usecs_irq; + temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce; + temp_et_coalesce.rate_sample_interval = nesadapter->et_rate_sample_interval; + temp_et_coalesce.pkt_rate_low = nesadapter->et_pkt_rate_low; + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + temp_et_coalesce.rx_max_coalesced_frames_low = shared_timer->threshold_low; + temp_et_coalesce.rx_max_coalesced_frames_irq = shared_timer->threshold_target; + temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high; + temp_et_coalesce.rx_coalesce_usecs_low = shared_timer->timer_in_use_min; + temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max; + if (nesadapter->et_use_adaptive_rx_coalesce) { + temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use; + } + spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); + memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce)); + return 0; +} + + +/** + * nes_netdev_get_pauseparam + */ +static void nes_netdev_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *et_pauseparam) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + et_pauseparam->autoneg = 0; + et_pauseparam->rx_pause = (nesvnic->nesdev->disable_rx_flow_control == 0) ? 1:0; + et_pauseparam->tx_pause = (nesvnic->nesdev->disable_tx_flow_control == 0) ? 1:0; +} + + +/** + * nes_netdev_set_pauseparam + */ +static int nes_netdev_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *et_pauseparam) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 u32temp; + + if (et_pauseparam->autoneg) { + /* TODO: should return unsupported */ + return 0; + } + if ((et_pauseparam->tx_pause == 1) && (nesdev->disable_tx_flow_control == 1)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); + u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); + nesdev->disable_tx_flow_control = 0; + } else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); + u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); + nesdev->disable_tx_flow_control = 1; + } + if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40)); + u32temp &= ~NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp); + nesdev->disable_rx_flow_control = 0; + } else if ((et_pauseparam->rx_pause == 0) && (nesdev->disable_rx_flow_control == 0)) { + u32temp = nes_read_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40)); + u32temp |= NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE; + nes_write_indexed(nesdev, + NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp); + nesdev->disable_rx_flow_control = 1; + } + + return 0; +} + + +/** + * nes_netdev_get_settings + */ +static int nes_netdev_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct 
nes_adapter *nesadapter = nesdev->nesadapter; + u32 mac_index = nesdev->mac_index; + u8 phy_type = nesadapter->phy_type[mac_index]; + u8 phy_index = nesadapter->phy_index[mac_index]; + u16 phy_data; + u32 supported, advertising; + + cmd->base.duplex = DUPLEX_FULL; + cmd->base.port = PORT_MII; + + if (nesadapter->OneG_Mode) { + cmd->base.speed = SPEED_1000; + if (phy_type == NES_PHY_TYPE_PUMA_1G) { + supported = SUPPORTED_1000baseT_Full; + advertising = ADVERTISED_1000baseT_Full; + cmd->base.autoneg = AUTONEG_DISABLE; + cmd->base.phy_address = mac_index; + } else { + unsigned long flags; + + supported = SUPPORTED_1000baseT_Full + | SUPPORTED_Autoneg; + advertising = ADVERTISED_1000baseT_Full + | ADVERTISED_Autoneg; + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + if (phy_data & 0x1000) + cmd->base.autoneg = AUTONEG_ENABLE; + else + cmd->base.autoneg = AUTONEG_DISABLE; + cmd->base.phy_address = phy_index; + } + ethtool_convert_legacy_u32_to_link_mode( + cmd->link_modes.supported, supported); + ethtool_convert_legacy_u32_to_link_mode( + cmd->link_modes.advertising, advertising); + return 0; + } + if ((phy_type == NES_PHY_TYPE_ARGUS) || + (phy_type == NES_PHY_TYPE_SFP_D) || + (phy_type == NES_PHY_TYPE_KR)) { + cmd->base.port = PORT_FIBRE; + supported = SUPPORTED_FIBRE; + advertising = ADVERTISED_FIBRE; + cmd->base.phy_address = phy_index; + } else { + supported = SUPPORTED_10000baseT_Full; + advertising = ADVERTISED_10000baseT_Full; + cmd->base.phy_address = mac_index; + } + cmd->base.speed = SPEED_10000; + cmd->base.autoneg = AUTONEG_DISABLE; + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, + supported); + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, + advertising); + + return 0; +} + + +/** + * nes_netdev_set_settings + */ +static int +nes_netdev_set_link_ksettings(struct net_device *netdev, + const struct ethtool_link_ksettings *cmd) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G)) { + unsigned long flags; + u16 phy_data; + u8 phy_index = nesadapter->phy_index[nesdev->mac_index]; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data); + if (cmd->base.autoneg) { + /* Turn on Full duplex, Autoneg, and restart autonegotiation */ + phy_data |= 0x1300; + } else { + /* Turn off autoneg */ + phy_data &= ~0x1000; + } + nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + } + + return 0; +} + + +static const struct ethtool_ops nes_ethtool_ops = { + .get_link = ethtool_op_get_link, + .get_strings = nes_netdev_get_strings, + .get_sset_count = nes_netdev_get_sset_count, + .get_ethtool_stats = nes_netdev_get_ethtool_stats, + .get_drvinfo = nes_netdev_get_drvinfo, + .get_coalesce = nes_netdev_get_coalesce, + .set_coalesce = nes_netdev_set_coalesce, + .get_pauseparam = nes_netdev_get_pauseparam, + .set_pauseparam = nes_netdev_set_pauseparam, + .get_link_ksettings = nes_netdev_get_link_ksettings, + .set_link_ksettings = nes_netdev_set_link_ksettings, +}; + +static void nes_vlan_mode(struct net_device *netdev, struct nes_device *nesdev, netdev_features_t features) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 
u32temp; + unsigned long flags; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + + nes_debug(NES_DBG_NETDEV, "%s: %s\n", __func__, netdev->name); + + /* Enable/Disable VLAN Stripping */ + u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG); + if (features & NETIF_F_HW_VLAN_CTAG_RX) + u32temp &= 0xfdffffff; + else + u32temp |= 0x02000000; + + nes_write_indexed(nesdev, NES_IDX_PCIX_DIAG, u32temp); + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); +} + +static netdev_features_t nes_fix_features(struct net_device *netdev, netdev_features_t features) +{ + /* + * Since there is no support for separate rx/tx vlan accel + * enable/disable make sure tx flag is always in same state as rx. + */ + if (features & NETIF_F_HW_VLAN_CTAG_RX) + features |= NETIF_F_HW_VLAN_CTAG_TX; + else + features &= ~NETIF_F_HW_VLAN_CTAG_TX; + + return features; +} + +static int nes_set_features(struct net_device *netdev, netdev_features_t features) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + u32 changed = netdev->features ^ features; + + if (changed & NETIF_F_HW_VLAN_CTAG_RX) + nes_vlan_mode(netdev, nesdev, features); + + return 0; +} + +static const struct net_device_ops nes_netdev_ops = { + .ndo_open = nes_netdev_open, + .ndo_stop = nes_netdev_stop, + .ndo_start_xmit = nes_netdev_start_xmit, + .ndo_get_stats = nes_netdev_get_stats, + .ndo_tx_timeout = nes_netdev_tx_timeout, + .ndo_set_mac_address = nes_netdev_set_mac_address, + .ndo_set_rx_mode = nes_netdev_set_multicast_list, + .ndo_change_mtu = nes_netdev_change_mtu, + .ndo_validate_addr = eth_validate_addr, + .ndo_fix_features = nes_fix_features, + .ndo_set_features = nes_set_features, +}; + +/** + * nes_netdev_init - initialize network device + */ +struct net_device *nes_netdev_init(struct nes_device *nesdev, + void __iomem *mmio_addr) +{ + u64 u64temp; + struct nes_vnic *nesvnic; + struct net_device *netdev; + struct nic_qp_map *curr_qp_map; + u8 phy_type = nesdev->nesadapter->phy_type[nesdev->mac_index]; + + netdev = alloc_etherdev(sizeof(struct nes_vnic)); + if (!netdev) { + printk(KERN_ERR PFX "nesvnic etherdev alloc failed"); + return NULL; + } + nesvnic = netdev_priv(netdev); + + nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name); + + SET_NETDEV_DEV(netdev, &nesdev->pcidev->dev); + + netdev->watchdog_timeo = NES_TX_TIMEOUT; + netdev->irq = nesdev->pcidev->irq; + netdev->max_mtu = NES_MAX_MTU; + netdev->hard_header_len = ETH_HLEN; + netdev->addr_len = ETH_ALEN; + netdev->type = ARPHRD_ETHER; + netdev->netdev_ops = &nes_netdev_ops; + netdev->ethtool_ops = &nes_ethtool_ops; + netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128); + nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n"); + + /* Fill in the port structure */ + nesvnic->netdev = netdev; + nesvnic->nesdev = nesdev; + nesvnic->msg_enable = netif_msg_init(debug, default_msg); + nesvnic->netdev_index = nesdev->netdev_count; + nesvnic->perfect_filter_index = nesdev->nesadapter->netdev_count; + nesvnic->max_frame_size = netdev->mtu + netdev->hard_header_len + VLAN_HLEN; + + curr_qp_map = nic_qp_mapping_per_function[PCI_FUNC(nesdev->pcidev->devfn)]; + nesvnic->nic.qp_id = curr_qp_map[nesdev->netdev_count].qpid; + nesvnic->nic_index = curr_qp_map[nesdev->netdev_count].nic_index; + nesvnic->logical_port = curr_qp_map[nesdev->netdev_count].logical_port; + + /* Setup the burned in MAC address */ + u64temp = (u64)nesdev->nesadapter->mac_addr_low; + u64temp += ((u64)nesdev->nesadapter->mac_addr_high) << 32; + 
u64temp += nesvnic->nic_index; + netdev->dev_addr[0] = (u8)(u64temp>>40); + netdev->dev_addr[1] = (u8)(u64temp>>32); + netdev->dev_addr[2] = (u8)(u64temp>>24); + netdev->dev_addr[3] = (u8)(u64temp>>16); + netdev->dev_addr[4] = (u8)(u64temp>>8); + netdev->dev_addr[5] = (u8)u64temp; + + netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX; + if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV)) + netdev->hw_features |= NETIF_F_TSO; + + netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX; + + nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d," + " nic_index = %d, logical_port = %d, mac_index = %d.\n", + nesvnic, (unsigned long)netdev->features, nesvnic->nic.qp_id, + nesvnic->nic_index, nesvnic->logical_port, nesdev->mac_index); + + if (nesvnic->nesdev->nesadapter->port_count == 1 && + nesvnic->nesdev->nesadapter->adapter_fcn_count == 1) { + + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = nesvnic->nic_index + 1; + if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) { + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } else { + nesvnic->qp_nic_index[2] = nesvnic->nic_index + 2; + nesvnic->qp_nic_index[3] = nesvnic->nic_index + 3; + } + } else { + if (nesvnic->nesdev->nesadapter->port_count == 2 || + (nesvnic->nesdev->nesadapter->port_count == 1 && + nesvnic->nesdev->nesadapter->adapter_fcn_count == 2)) { + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = nesvnic->nic_index + + 2; + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } else { + nesvnic->qp_nic_index[0] = nesvnic->nic_index; + nesvnic->qp_nic_index[1] = 0xf; + nesvnic->qp_nic_index[2] = 0xf; + nesvnic->qp_nic_index[3] = 0xf; + } + } + nesvnic->next_qp_nic_index = 0; + + if (nesdev->netdev_count == 0) { + nesvnic->rdma_enabled = 1; + } else { + nesvnic->rdma_enabled = 0; + } + nesvnic->nic_cq.cq_number = nesvnic->nic.qp_id; + timer_setup(&nesvnic->event_timer, NULL, 0); + spin_lock_init(&nesvnic->tx_lock); + spin_lock_init(&nesvnic->port_ibevent_lock); + nesdev->netdev[nesdev->netdev_count] = netdev; + + nes_debug(NES_DBG_INIT, "Adding nesvnic (%p) to the adapters nesvnic_list for MAC%d.\n", + nesvnic, nesdev->mac_index); + list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]); + + if ((nesdev->netdev_count == 0) && + ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) || + ((phy_type == NES_PHY_TYPE_PUMA_1G) && + (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) || + ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) { + u32 u32temp; + u32 link_mask = 0; + u32 link_val = 0; + u16 temp_phy_data; + u16 phy_data = 0; + unsigned long flags; + + u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1))); + if (phy_type != NES_PHY_TYPE_PUMA_1G) { + u32temp |= 0x00200000; + nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1)), u32temp); + } + + /* Check and set linkup here. 
This is for back to back */ + /* configuration where second port won't get link interrupt */ + switch (phy_type) { + case NES_PHY_TYPE_PUMA_1G: + if (nesdev->mac_index < 2) { + link_mask = 0x01010000; + link_val = 0x01010000; + } else { + link_mask = 0x02020000; + link_val = 0x02020000; + } + break; + case NES_PHY_TYPE_SFP_D: + spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags); + nes_read_10G_phy_reg(nesdev, + nesdev->nesadapter->phy_index[nesdev->mac_index], + 1, 0x9003); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, + nesdev->nesadapter->phy_index[nesdev->mac_index], + 3, 0x0021); + nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + nes_read_10G_phy_reg(nesdev, + nesdev->nesadapter->phy_index[nesdev->mac_index], + 3, 0x0021); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags); + phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0; + break; + default: + link_mask = 0x0f1f0000; + link_val = 0x0f0f0000; + break; + } + + u32temp = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1))); + + if (phy_type == NES_PHY_TYPE_SFP_D) { + if (phy_data & 0x0004) + nesvnic->linkup = 1; + } else { + if ((u32temp & link_mask) == link_val) + nesvnic->linkup = 1; + } + + /* clear the MAC interrupt status, assumes direct logical to physical mapping */ + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index)); + nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp); + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index), u32temp); + + nes_init_phy(nesdev); + } + + nes_vlan_mode(netdev, nesdev, netdev->features); + + return netdev; +} + + +/** + * nes_netdev_destroy - destroy network device structure + */ +void nes_netdev_destroy(struct net_device *netdev) +{ + struct nes_vnic *nesvnic = netdev_priv(netdev); + + /* make sure 'stop' method is called by Linux stack */ + /* nes_netdev_stop(netdev); */ + + list_del(&nesvnic->list); + + if (nesvnic->of_device_registered) { + nes_destroy_ofa_device(nesvnic->nesibdev); + } + + free_netdev(netdev); +} + + +/** + * nes_nic_cm_xmit -- CM calls this to send out pkts + */ +int nes_nic_cm_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + int ret; + + skb->dev = netdev; + ret = dev_queue_xmit(skb); + if (ret) { + nes_debug(NES_DBG_CM, "Bad return code from dev_queue_xmit %d\n", ret); + } + + return ret; +} diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c new file mode 100644 index 0000000..21b4a83 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -0,0 +1,916 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nes.h" + +static u16 nes_read16_eeprom(void __iomem *addr, u16 offset); + +u32 mh_detected; +u32 mh_pauses_sent; + +static u32 nes_set_pau(struct nes_device *nesdev) +{ + u32 ret = 0; + u32 counter; + + nes_write_indexed(nesdev, NES_IDX_GPR2, NES_ENABLE_PAU); + nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1); + + for (counter = 0; counter < NES_PAU_COUNTER; counter++) { + udelay(30); + if (!nes_read_indexed(nesdev, NES_IDX_GPR2)) { + printk(KERN_INFO PFX "PAU is supported.\n"); + break; + } + nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1); + } + if (counter == NES_PAU_COUNTER) { + printk(KERN_INFO PFX "PAU is not supported.\n"); + return -EPERM; + } + return ret; +} + +/** + * nes_read_eeprom_values - + */ +int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesadapter) +{ + u32 mac_addr_low; + u16 mac_addr_high; + u16 eeprom_data; + u16 eeprom_offset; + u16 next_section_address; + u16 sw_section_ver; + u8 major_ver = 0; + u8 minor_ver = 0; + + /* TODO: deal with EEPROM endian issues */ + if (nesadapter->firmware_eeprom_offset == 0) { + /* Read the EEPROM Parameters */ + eeprom_data = nes_read16_eeprom(nesdev->regs, 0); + nes_debug(NES_DBG_HW, "EEPROM Offset 0 = 0x%04X\n", eeprom_data); + eeprom_offset = 2 + (((eeprom_data & 0x007f) << 3) << + ((eeprom_data & 0x0080) >> 7)); + nes_debug(NES_DBG_HW, "Firmware Offset = 0x%04X\n", eeprom_offset); + nesadapter->firmware_eeprom_offset = eeprom_offset; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4); + if (eeprom_data != 0x5746) { + nes_debug(NES_DBG_HW, "Not a valid Firmware Image = 0x%04X\n", eeprom_data); + return -1; + } + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + eeprom_offset += ((eeprom_data & 0x00ff) << 3) << ((eeprom_data & 0x0100) >> 8); + nes_debug(NES_DBG_HW, "Software Offset = 0x%04X\n", eeprom_offset); + nesadapter->software_eeprom_offset = eeprom_offset; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4); + if (eeprom_data != 0x5753) { + printk("Not a valid Software Image = 0x%04X\n", eeprom_data); + return -1; + } + sw_section_ver = nes_read16_eeprom(nesdev->regs, nesadapter->software_eeprom_offset + 6); + nes_debug(NES_DBG_HW, "Software section version number = 0x%04X\n", + sw_section_ver); + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) << + ((eeprom_data 
& 0x0100) >> 8)); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x414d) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) << + ((eeprom_data & 0x0100) >> 8)); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x4f52) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x4f52 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x5746) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5746 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x5753) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5753 but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x414d) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_offset = next_section_address; + + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2); + nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n", + eeprom_offset + 2, eeprom_data); + next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3); + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4); + if (eeprom_data != 0x464e) { + nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x464e but was 0x%04X\n", + eeprom_data); + goto no_fw_rev; + } + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 8); + printk(PFX "Firmware version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data); + major_ver = (u8)(eeprom_data >> 8); + minor_ver = (u8)(eeprom_data); + + if (nes_drv_opt & NES_DRV_OPT_DISABLE_VIRT_WQ) { + nes_debug(NES_DBG_HW, "Virtual WQs have been disabled\n"); + } else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) { + nesadapter->virtwq = 1; + } + if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3)) + nesadapter->send_term_ok = 1; + + if (nes_drv_opt & NES_DRV_OPT_ENABLE_PAU) { + if (!nes_set_pau(nesdev)) + nesadapter->allow_unaligned_fpdus = 1; + } + + 
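[Editorial sketch] nes_read_eeprom_values walks the EEPROM by repeatedly applying the same offset arithmetic to each section header word and then checking a 16-bit tag (0x5746, 0x5753, 0x414d, ...) before trusting the section. The standalone sketch below mirrors that offset computation; the helper name is hypothetical and the "8-byte units plus doubling bit" reading is inferred from the expression itself rather than stated anywhere in the patch.

#include <stdint.h>
#include <stdio.h>

/* Mirrors (((hdr & 0x00ff) << 3) << ((hdr & 0x0100) >> 8)): the low byte
 * looks like a length in 8-byte units, and bit 8 doubles that length. */
static uint16_t next_section_offset(uint16_t cur, uint16_t hdr_word)
{
	return cur + (((hdr_word & 0x00ffu) << 3) << ((hdr_word & 0x0100u) >> 8));
}

int main(void)
{
	/* hypothetical header word 0x0110: 0x10 * 8 = 128 bytes, doubled to 256 */
	printf("next offset = 0x%04x\n", (unsigned)next_section_offset(0x0040, 0x0110));
	return 0;
}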
nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8)) << 16) + + (u32)((u8)eeprom_data); + + eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 10); + printk(PFX "EEPROM version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data); + nesadapter->eeprom_version = (((u32)(u8)(eeprom_data>>8)) << 16) + + (u32)((u8)eeprom_data); + +no_fw_rev: + /* eeprom is valid */ + eeprom_offset = nesadapter->software_eeprom_offset; + eeprom_offset += 8; + nesadapter->netdev_max = (u8)nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_high = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_low = (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + mac_addr_low <<= 16; + mac_addr_low += (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "Base MAC Address = 0x%04X%08X\n", + mac_addr_high, mac_addr_low); + nes_debug(NES_DBG_HW, "MAC Address count = %u\n", nesadapter->netdev_max); + + nesadapter->mac_addr_low = mac_addr_low; + nesadapter->mac_addr_high = mac_addr_high; + + /* Read the Phy Type array */ + eeprom_offset += 10; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_type[0] = (u8)(eeprom_data >> 8); + nesadapter->phy_type[1] = (u8)eeprom_data; + + /* Read the port array */ + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_type[2] = (u8)(eeprom_data >> 8); + nesadapter->phy_type[3] = (u8)eeprom_data; + /* port_count is set by soft reset reg */ + nes_debug(NES_DBG_HW, "port_count = %u, port 0 -> %u, port 1 -> %u," + " port 2 -> %u, port 3 -> %u\n", + nesadapter->port_count, + nesadapter->phy_type[0], nesadapter->phy_type[1], + nesadapter->phy_type[2], nesadapter->phy_type[3]); + + /* Read PD config array */ + eeprom_offset += 10; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[0] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[0] = eeprom_data; + nes_debug(NES_DBG_HW, "PD0 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[0], nesadapter->pd_config_base[0]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[1] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[1] = eeprom_data; + nes_debug(NES_DBG_HW, "PD1 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[1], nesadapter->pd_config_base[1]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[2] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[2] = eeprom_data; + nes_debug(NES_DBG_HW, "PD2 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[2], nesadapter->pd_config_base[2]); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_size[3] = eeprom_data; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->pd_config_base[3] = eeprom_data; + nes_debug(NES_DBG_HW, "PD3 config, size=0x%04x, base=0x%04x\n", + nesadapter->pd_config_size[3], nesadapter->pd_config_base[3]); + + /* Read Rx Pool Size */ + eeprom_offset += 22; /* 46 */ + eeprom_data = nes_read16_eeprom(nesdev->regs, 
eeprom_offset); + eeprom_offset += 2; + nesadapter->rx_pool_size = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "rx_pool_size = 0x%08X\n", nesadapter->rx_pool_size); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tx_pool_size = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tx_pool_size = 0x%08X\n", nesadapter->tx_pool_size); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->rx_threshold = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "rx_threshold = 0x%08X\n", nesadapter->rx_threshold); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tcp_timer_core_clk_divisor = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tcp_timer_core_clk_divisor = 0x%08X\n", + nesadapter->tcp_timer_core_clk_divisor); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->iwarp_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "iwarp_config = 0x%08X\n", nesadapter->iwarp_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->cm_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "cm_config = 0x%08X\n", nesadapter->cm_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->sws_timer_config = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "sws_timer_config = 0x%08X\n", nesadapter->sws_timer_config); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->tcp_config1 = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "tcp_config1 = 0x%08X\n", nesadapter->tcp_config1); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->wqm_wat = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "wqm_wat = 0x%08X\n", nesadapter->wqm_wat); + + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + eeprom_offset += 2; + nesadapter->core_clock = (((u32)eeprom_data) << 16) + + nes_read16_eeprom(nesdev->regs, eeprom_offset); + nes_debug(NES_DBG_HW, "core_clock = 0x%08X\n", nesadapter->core_clock); + + if ((sw_section_ver) && (nesadapter->hw_rev != NE020_REV)) { + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_index[0] = (eeprom_data & 0xff00)>>8; + nesadapter->phy_index[1] = eeprom_data & 0x00ff; + eeprom_offset += 2; + eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset); + nesadapter->phy_index[2] = (eeprom_data & 0xff00)>>8; + nesadapter->phy_index[3] = eeprom_data & 0x00ff; + } else { + nesadapter->phy_index[0] = 4; + nesadapter->phy_index[1] = 5; + nesadapter->phy_index[2] = 6; + nesadapter->phy_index[3] = 7; + } + nes_debug(NES_DBG_HW, "Phy address map = 0 > 
%u, 1 > %u, 2 > %u, 3 > %u\n", + nesadapter->phy_index[0],nesadapter->phy_index[1], + nesadapter->phy_index[2],nesadapter->phy_index[3]); + } + + return 0; +} + + +/** + * nes_read16_eeprom + */ +static u16 nes_read16_eeprom(void __iomem *addr, u16 offset) +{ + writel(NES_EEPROM_READ_REQUEST + (offset >> 1), + (void __iomem *)addr + NES_EEPROM_COMMAND); + + do { + } while (readl((void __iomem *)addr + NES_EEPROM_COMMAND) & + NES_EEPROM_READ_REQUEST); + + return readw((void __iomem *)addr + NES_EEPROM_DATA); +} + + +/** + * nes_write_1G_phy_reg + */ +void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data) +{ + u32 u32temp; + u32 counter; + + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_read_1G_phy_reg + * This routine only issues the read, the data must be read + * separately. + */ +void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data) +{ + u32 u32temp; + u32 counter; + + /* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n", + phy_addr, nesdev->mac_index); */ + + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */ + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) { + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); + *data = 0xffff; + } else { + *data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + } +} + + +/** + * nes_write_10G_phy_reg + */ +void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_addr, u8 dev_addr, u16 phy_reg, + u16 data) +{ + u32 port_addr; + u32 u32temp; + u32 counter; + + port_addr = phy_addr; + + /* set address */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); + + /* set data */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x10020000 | (u32)data | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_read_10G_phy_reg + * This routine only issues the read, the data must be read + * separately. 
+ */ +void nes_read_10G_phy_reg(struct nes_device *nesdev, u8 phy_addr, u8 dev_addr, u16 phy_reg) +{ + u32 port_addr; + u32 u32temp; + u32 counter; + + port_addr = phy_addr; + + /* set address */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); + + /* issue read */ + nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL, + 0x30020000 | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23)); + for (counter = 0; counter < 100 ; counter++) { + udelay(30); + u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS); + if (u32temp & 1) { + nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1); + break; + } + } + if (!(u32temp & 1)) + nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n", + u32temp); +} + + +/** + * nes_get_cqp_request + */ +struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) +{ + unsigned long flags; + struct nes_cqp_request *cqp_request = NULL; + + if (!list_empty(&nesdev->cqp_avail_reqs)) { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + if (!list_empty(&nesdev->cqp_avail_reqs)) { + cqp_request = list_entry(nesdev->cqp_avail_reqs.next, + struct nes_cqp_request, list); + list_del_init(&cqp_request->list); + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } + if (cqp_request == NULL) { + cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_ATOMIC); + if (cqp_request) { + cqp_request->dynamic = 1; + INIT_LIST_HEAD(&cqp_request->list); + } + } + + if (cqp_request) { + init_waitqueue_head(&cqp_request->waitq); + cqp_request->waiting = 0; + cqp_request->request_done = 0; + cqp_request->callback = 0; + init_waitqueue_head(&cqp_request->waitq); + nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n", + cqp_request); + } else + printk(KERN_ERR PFX "%s: Could not allocated a CQP request.\n", + __func__); + + return cqp_request; +} + +void nes_free_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request) +{ + unsigned long flags; + + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n", + cqp_request, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f); + + if (cqp_request->dynamic) { + kfree(cqp_request); + } else { + spin_lock_irqsave(&nesdev->cqp.lock, flags); + list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs); + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + } +} + +void nes_put_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request) +{ + if (atomic_dec_and_test(&cqp_request->refcount)) + nes_free_cqp_request(nesdev, cqp_request); +} + + +/** + * nes_post_cqp_request + */ +void nes_post_cqp_request(struct nes_device *nesdev, + struct nes_cqp_request *cqp_request) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + unsigned long flags; + u32 cqp_head; + u64 u64temp; + u32 opcode; + int ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX; + + spin_lock_irqsave(&nesdev->cqp.lock, flags); + + if (((((nesdev->cqp.sq_tail+(nesdev->cqp.sq_size*2))-nesdev->cqp.sq_head) & + (nesdev->cqp.sq_size - 1)) != 1) + && (list_empty(&nesdev->cqp_pending_reqs))) { + cqp_head = nesdev->cqp.sq_head++; + nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1; + cqp_wqe 
= &nesdev->cqp.sq_vbase[cqp_head]; + memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe)); + opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]); + if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT) + ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX; + barrier(); + u64temp = (unsigned long)cqp_request; + set_wqe_64bit_value(cqp_wqe->wqe_words, ctx_index, u64temp); + nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ," + " request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u," + " waiting = %d, refcount = %d.\n", + opcode & NES_CQP_OPCODE_MASK, + le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request, + nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size, + cqp_request->waiting, atomic_read(&cqp_request->refcount)); + + barrier(); + + /* Ring doorbell (1 WQEs) */ + nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id); + + barrier(); + } else { + nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X), line 1 = 0x%08X" + " put on the pending queue.\n", + cqp_request, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_ID_IDX])); + list_add_tail(&cqp_request->list, &nesdev->cqp_pending_reqs); + } + + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + return; +} + +/** + * nes_arp_table + */ +int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 action) +{ + struct nes_adapter *nesadapter = nesdev->nesadapter; + int arp_index; + int err = 0; + __be32 tmp_addr; + + for (arp_index = 0; (u32) arp_index < nesadapter->arp_table_size; arp_index++) { + if (nesadapter->arp_table[arp_index].ip_addr == ip_addr) + break; + } + + if (action == NES_ARP_ADD) { + if (arp_index != nesadapter->arp_table_size) { + return -1; + } + + arp_index = 0; + err = nes_alloc_resource(nesadapter, nesadapter->allocated_arps, + nesadapter->arp_table_size, (u32 *)&arp_index, &nesadapter->next_arp_index, NES_RESOURCE_ARP); + if (err) { + nes_debug(NES_DBG_NETDEV, "nes_alloc_resource returned error = %u\n", err); + return err; + } + nes_debug(NES_DBG_NETDEV, "ADD, arp_index=%d\n", arp_index); + + nesadapter->arp_table[arp_index].ip_addr = ip_addr; + memcpy(nesadapter->arp_table[arp_index].mac_addr, mac_addr, ETH_ALEN); + return arp_index; + } + + /* DELETE or RESOLVE */ + if (arp_index == nesadapter->arp_table_size) { + tmp_addr = cpu_to_be32(ip_addr); + nes_debug(NES_DBG_NETDEV, "MAC for %pI4 not in ARP table - cannot %s\n", + &tmp_addr, action == NES_ARP_RESOLVE ? 
"resolve" : "delete"); + return -1; + } + + if (action == NES_ARP_RESOLVE) { + nes_debug(NES_DBG_NETDEV, "RESOLVE, arp_index=%d\n", arp_index); + return arp_index; + } + + if (action == NES_ARP_DELETE) { + nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index); + nesadapter->arp_table[arp_index].ip_addr = 0; + eth_zero_addr(nesadapter->arp_table[arp_index].mac_addr); + nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index); + return arp_index; + } + + return -1; +} + + +/** + * nes_mh_fix + */ +void nes_mh_fix(struct timer_list *t) +{ + struct nes_adapter *nesadapter = from_timer(nesadapter, t, mh_timer); + struct nes_device *nesdev = nesadapter->nesdev; + unsigned long flags; + struct nes_vnic *nesvnic; + u32 used_chunks_tx; + u32 temp_used_chunks_tx; + u32 temp_last_used_chunks_tx; + u32 used_chunks_mask; + u32 mac_tx_frames_low; + u32 mac_tx_frames_high; + u32 mac_tx_pauses; + u32 reset_value; + u32 tx_control; + u32 tx_config; + u32 tx_pause_quanta; + u32 rx_control; + u32 rx_config; + u32 mac_exact_match; + u32 mpp_debug; + u32 i=0; + u32 chunks_tx_progress = 0; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + if ((nesadapter->mac_sw_state[0] != NES_MAC_SW_IDLE) || (nesadapter->mac_link_down[0])) { + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + goto no_mh_work; + } + nesadapter->mac_sw_state[0] = NES_MAC_SW_MH; + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + do { + mac_tx_frames_low = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_LOW); + mac_tx_frames_high = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_HIGH); + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + used_chunks_tx = nes_read_indexed(nesdev, NES_IDX_USED_CHUNKS_TX); + nesdev->mac_pause_frames_sent += mac_tx_pauses; + used_chunks_mask = 0; + temp_used_chunks_tx = used_chunks_tx; + temp_last_used_chunks_tx = nesdev->last_used_chunks_tx; + + if (nesdev->netdev[0]) { + nesvnic = netdev_priv(nesdev->netdev[0]); + } else { + break; + } + + for (i=0; i<4; i++) { + used_chunks_mask <<= 8; + if (nesvnic->qp_nic_index[i] != 0xff) { + used_chunks_mask |= 0xff; + if ((temp_used_chunks_tx&0xff)<(temp_last_used_chunks_tx&0xff)) { + chunks_tx_progress = 1; + } + } + temp_used_chunks_tx >>= 8; + temp_last_used_chunks_tx >>= 8; + } + if ((mac_tx_frames_low) || (mac_tx_frames_high) || + (!(used_chunks_tx&used_chunks_mask)) || + (!(nesdev->last_used_chunks_tx&used_chunks_mask)) || + (chunks_tx_progress) ) { + nesdev->last_used_chunks_tx = used_chunks_tx; + break; + } + nesdev->last_used_chunks_tx = used_chunks_tx; + barrier(); + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000005); + mh_pauses_sent++; + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + if (mac_tx_pauses) { + nesdev->mac_pause_frames_sent += mac_tx_pauses; + break; + } + + tx_control = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONTROL); + tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); + tx_pause_quanta = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA); + rx_control = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONTROL); + rx_config = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONFIG); + mac_exact_match = nes_read_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM); + mpp_debug = nes_read_indexed(nesdev, NES_IDX_MPP_DEBUG); + + /* one last ditch effort to avoid a false positive */ + mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES); + if (mac_tx_pauses) { + nesdev->last_mac_tx_pauses = nesdev->mac_pause_frames_sent; + nes_debug(NES_DBG_HW, 
"failsafe caught slow outbound pause\n"); + break; + } + mh_detected++; + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, 0x00000000); + reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET); + + nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value | 0x0000001d); + + while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) + & 0x00000040) != 0x00000040) && (i++ < 5000)) { + /* mdelay(1); */ + } + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008); + nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0); + + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000); + if (nesadapter->OneG_Mode) { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222); + } else { + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222); + } + nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_STATUS0); + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff); + + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, tx_control); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); + nes_write_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA, tx_pause_quanta); + nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONTROL, rx_control); + nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONFIG, rx_config); + nes_write_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM, mac_exact_match); + nes_write_indexed(nesdev, NES_IDX_MPP_DEBUG, mpp_debug); + + } while (0); + + nesadapter->mac_sw_state[0] = NES_MAC_SW_IDLE; +no_mh_work: + nesdev->nesadapter->mh_timer.expires = jiffies + (HZ/5); + add_timer(&nesdev->nesadapter->mh_timer); +} + +/** + * nes_clc + */ +void nes_clc(struct timer_list *t) +{ + struct nes_adapter *nesadapter = from_timer(nesadapter, t, lc_timer); + unsigned long flags; + + spin_lock_irqsave(&nesadapter->phy_lock, flags); + nesadapter->link_interrupt_count[0] = 0; + nesadapter->link_interrupt_count[1] = 0; + nesadapter->link_interrupt_count[2] = 0; + nesadapter->link_interrupt_count[3] = 0; + spin_unlock_irqrestore(&nesadapter->phy_lock, flags); + + nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */ + add_timer(&nesadapter->lc_timer); +} + + +/** + * nes_dump_mem + */ +void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length) +{ + if (!(nes_debug_level & dump_debug_level)) { + return; + } + + if (length > 0x100) { + nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100); + length = 0x100; + } + nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", addr, length, length); + + print_hex_dump(KERN_ERR, PFX, DUMP_PREFIX_NONE, 16, 1, addr, length, true); +} diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c new file mode 100644 index 0000000..1040a6e --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -0,0 +1,3905 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nes.h" + +#include + +atomic_t mod_qp_timouts; +atomic_t qps_created; +atomic_t sw_qps_destroyed; + +static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev); +static int nes_dereg_mr(struct ib_mr *ib_mr); + +/** + * nes_alloc_mw + */ +static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_cqp_request *cqp_request; + struct nes_mr *nesmr; + struct ib_mw *ibmw; + struct nes_hw_cqp_wqe *cqp_wqe; + int ret; + u32 stag; + u32 stag_index = 0; + u32 next_stag_index = 0; + u32 driver_key = 0; + u8 stag_key = 0; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = 0; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_MW); + if (ret) { + return ERR_PTR(ret); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n", + stag, stag_index); + + /* Register the region with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = + cpu_to_le32( NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_RIGHTS_REMOTE_READ | + NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_REM_ACC_EN); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 
0x00007fff)); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + stag, ret, cqp_request->major_code, cqp_request->minor_code); + if ((!ret) || (cqp_request->major_code)) { + nes_put_cqp_request(nesdev, cqp_request); + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + if (!ret) { + return ERR_PTR(-ETIME); + } else { + return ERR_PTR(-ENOMEM); + } + } + nes_put_cqp_request(nesdev, cqp_request); + + nesmr->ibmw.rkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MW; + ibmw = &nesmr->ibmw; + nesmr->pbl_4k = 0; + nesmr->pbls_used = 0; + + return ibmw; +} + + +/** + * nes_dealloc_mw + */ +static int nes_dealloc_mw(struct ib_mw *ibmw) +{ + struct nes_mr *nesmr = to_nesmw(ibmw); + struct nes_vnic *nesvnic = to_nesvnic(ibmw->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + int err = 0; + int ret; + + /* Deallocate the window with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, NES_CQP_DEALLOCATE_STAG); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n", + ibmw->rkey); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + ret, cqp_request->major_code, cqp_request->minor_code); + if (!ret) + err = -ETIME; + else if (cqp_request->major_code) + err = -EIO; + + nes_put_cqp_request(nesdev, cqp_request); + + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + (ibmw->rkey & 0x0fffff00) >> 8); + kfree(nesmr); + + return err; +} + + +/* + * nes_alloc_fast_mr + */ +static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, + u32 stag, u32 page_count) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 opcode = 0; + u16 major_code; + u64 region_length = page_count * PAGE_SIZE; + + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + nes_debug(NES_DBG_MR, "alloc_fast_reg_mr: page_count = %d, " + "region_length = %llu\n", + page_count, region_length); + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nesadapter->free_4kpbl > 0) { + nesadapter->free_4kpbl--; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } else { + /* No 4kpbl's available: */ + 
spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_debug(NES_DBG_MR, "Out of Pbls\n"); + nes_free_cqp_request(nesdev, cqp_request); + return -ENOMEM; + } + + opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_MR | + NES_CQP_STAG_PBL_BLK_SIZE | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_REM_ACC_EN; + /* + * The current OFED API does not support the zero based TO option. + * If added then need to changed the NES_CQP_STAG_VA* option. Also, + * the API does not support that ability to have the MR set for local + * access only when created and not allow the SQ op to override. Given + * this the remote enable must be set here. + */ + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, 1); + + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = + cpu_to_le32((u32)(region_length >> 8) & 0xff000000); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= + cpu_to_le32(nespd->pd_id & 0x00007fff); + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, 0); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, 0); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 0); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (page_count * 8)); + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); + barrier(); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, + (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + + nes_debug(NES_DBG_MR, "Allocate STag 0x%08X completed, " + "wait_event_timeout ret = %u, CQP Major:Minor codes = " + "0x%04X:0x%04X.\n", stag, ret, cqp_request->major_code, + cqp_request->minor_code); + major_code = cqp_request->major_code; + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret || major_code) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_4kpbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + return 0; +} + +/* + * nes_alloc_mr + */ +static struct ib_mr *nes_alloc_mr(struct ib_pd *ibpd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + u32 next_stag_index; + u8 stag_key = 0; + u32 driver_key = 0; + int err = 0; + u32 stag_index = 0; + struct nes_mr *nesmr; + u32 stag; + int ret; + struct ib_mr *ibmr; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) + return ERR_PTR(-E2BIG); + +/* + * Note: Set to always use a fixed length single page entry PBL. This is to allow + * for the fast_reg_mr operation to always know the size of the PBL. 
+ */ + if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) + return ERR_PTR(-E2BIG); + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, + &next_stag_index, NES_RESOURCE_FAST_MR); + if (err) + return ERR_PTR(err); + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n", + stag, stag_index); + + ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_num_sg); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_FMEM; + ibmr = &nesmr->ibmr; + } else { + kfree(nesmr); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + nesmr->pages = pci_alloc_consistent(nesdev->pcidev, + max_num_sg * sizeof(u64), + &nesmr->paddr); + if (!nesmr->paddr) + goto err; + + nesmr->max_pages = max_num_sg; + + return ibmr; + +err: + nes_dereg_mr(ibmr); + + return ERR_PTR(-ENOMEM); +} + +static int nes_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct nes_mr *nesmr = to_nesmr(ibmr); + + if (unlikely(nesmr->npages == nesmr->max_pages)) + return -ENOMEM; + + nesmr->pages[nesmr->npages++] = cpu_to_le64(addr); + + return 0; +} + +static int nes_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct nes_mr *nesmr = to_nesmr(ibmr); + + nesmr->npages = 0; + + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, nes_set_page); +} + +/** + * nes_query_device + */ +static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + memset(props, 0, sizeof(*props)); + memcpy(&props->sys_image_guid, nesvnic->netdev->dev_addr, 6); + + props->fw_ver = nesdev->nesadapter->firmware_version; + props->device_cap_flags = nesdev->nesadapter->device_cap_flags; + props->vendor_id = nesdev->nesadapter->vendor_id; + props->vendor_part_id = nesdev->nesadapter->vendor_part_id; + props->hw_ver = nesdev->nesadapter->hw_rev; + props->max_mr_size = 0x80000000; + props->max_qp = nesibdev->max_qp; + props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2; + props->max_sge = nesdev->nesadapter->max_sge; + props->max_cq = nesibdev->max_cq; + props->max_cqe = nesdev->nesadapter->max_cqe; + props->max_mr = nesibdev->max_mr; + props->max_mw = nesibdev->max_mr; + props->max_pd = nesibdev->max_pd; + props->max_sge_rd = 1; + switch (nesdev->nesadapter->max_irrq_wr) { + case 0: + props->max_qp_rd_atom = 2; + break; + case 1: + props->max_qp_rd_atom = 8; + break; + case 2: + props->max_qp_rd_atom = 32; + break; + case 3: + props->max_qp_rd_atom = 64; + break; + default: + props->max_qp_rd_atom = 0; + } + props->max_qp_init_rd_atom = props->max_qp_rd_atom; + props->atomic_cap = IB_ATOMIC_NONE; + props->max_map_per_fmr = 1; + + return 0; +} + + +/** + * nes_query_port + */ +static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) +{ + struct nes_vnic *nesvnic = 
to_nesvnic(ibdev); + struct net_device *netdev = nesvnic->netdev; + + /* props being zeroed by the caller, avoid zeroing it here */ + + props->max_mtu = IB_MTU_4096; + props->active_mtu = ib_mtu_int_to_enum(netdev->mtu); + + props->lid = 1; + if (netif_queue_stopped(netdev)) + props->state = IB_PORT_DOWN; + else if (nesvnic->linkup) + props->state = IB_PORT_ACTIVE; + else + props->state = IB_PORT_DOWN; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->active_width = IB_WIDTH_4X; + props->active_speed = IB_SPEED_SDR; + props->max_msg_sz = 0x80000000; + + return 0; +} + +/** + * nes_query_pkey + */ +static int nes_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + *pkey = 0; + return 0; +} + + +/** + * nes_query_gid + */ +static int nes_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), nesvnic->netdev->dev_addr, 6); + + return 0; +} + + +/** + * nes_alloc_ucontext - Allocate the user context data structure. This keeps track + * of all objects associated with a particular user-mode client. + */ +static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_alloc_ucontext_req req; + struct nes_alloc_ucontext_resp uresp; + struct nes_ucontext *nes_ucontext; + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + + + if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) { + printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n"); + return ERR_PTR(-EINVAL); + } + + if (req.userspace_ver != NES_ABI_USERSPACE_VER) { + printk(KERN_ERR PFX "Invalid userspace driver version detected. 
Detected version %d, should be %d\n", + req.userspace_ver, NES_ABI_USERSPACE_VER); + return ERR_PTR(-EINVAL); + } + + + memset(&uresp, 0, sizeof uresp); + + uresp.max_qps = nesibdev->max_qp; + uresp.max_pds = nesibdev->max_pd; + uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2; + uresp.virtwq = nesadapter->virtwq; + uresp.kernel_ver = NES_ABI_KERNEL_VER; + + nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL); + if (!nes_ucontext) + return ERR_PTR(-ENOMEM); + + nes_ucontext->nesdev = nesdev; + nes_ucontext->mmap_wq_offset = uresp.max_pds; + nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset + + ((sizeof(struct nes_hw_qp_wqe) * uresp.max_qps * 2) + PAGE_SIZE-1) / + PAGE_SIZE; + + + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + kfree(nes_ucontext); + return ERR_PTR(-EFAULT); + } + + INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list); + INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list); + atomic_set(&nes_ucontext->usecnt, 1); + return &nes_ucontext->ibucontext; +} + + +/** + * nes_dealloc_ucontext + */ +static int nes_dealloc_ucontext(struct ib_ucontext *context) +{ + /* struct nes_vnic *nesvnic = to_nesvnic(context->device); */ + /* struct nes_device *nesdev = nesvnic->nesdev; */ + struct nes_ucontext *nes_ucontext = to_nesucontext(context); + + if (!atomic_dec_and_test(&nes_ucontext->usecnt)) + return 0; + kfree(nes_ucontext); + return 0; +} + + +/** + * nes_mmap + */ +static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + unsigned long index; + struct nes_vnic *nesvnic = to_nesvnic(context->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* struct nes_adapter *nesadapter = nesdev->nesadapter; */ + struct nes_ucontext *nes_ucontext; + struct nes_qp *nesqp; + + nes_ucontext = to_nesucontext(context); + + + if (vma->vm_pgoff >= nes_ucontext->mmap_wq_offset) { + index = (vma->vm_pgoff - nes_ucontext->mmap_wq_offset) * PAGE_SIZE; + index /= ((sizeof(struct nes_hw_qp_wqe) * nesdev->nesadapter->max_qp_wr * 2) + + PAGE_SIZE-1) & (~(PAGE_SIZE-1)); + if (!test_bit(index, nes_ucontext->allocated_wqs)) { + nes_debug(NES_DBG_MMAP, "wq %lu not allocated\n", index); + return -EFAULT; + } + nesqp = nes_ucontext->mmap_nesqp[index]; + if (nesqp == NULL) { + nes_debug(NES_DBG_MMAP, "wq %lu has a NULL QP base.\n", index); + return -EFAULT; + } + if (remap_pfn_range(vma, vma->vm_start, + virt_to_phys(nesqp->hwqp.sq_vbase) >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot)) { + nes_debug(NES_DBG_MMAP, "remap_pfn_range failed.\n"); + return -EAGAIN; + } + vma->vm_private_data = nesqp; + return 0; + } else { + index = vma->vm_pgoff; + if (!test_bit(index, nes_ucontext->allocated_doorbells)) + return -EFAULT; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, vma->vm_start, + (nesdev->doorbell_start + + ((nes_ucontext->mmap_db_index[index] - nesdev->base_doorbell_index) * 4096)) + >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + vma->vm_private_data = nes_ucontext; + return 0; + } + + return -ENOSYS; +} + + +/** + * nes_alloc_pd + */ +static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct nes_pd *nespd; + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_ucontext *nesucontext; + struct nes_alloc_pd_resp uresp; + u32 pd_num = 0; + int err; + + nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, 
context=%p, netdev refcnt=%u\n", + nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context, + netdev_refcnt_read(nesvnic->netdev)); + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, + nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD); + if (err) { + return ERR_PTR(err); + } + + nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL); + if (!nespd) { + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + return ERR_PTR(-ENOMEM); + } + + nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n", + nespd, nesvnic->nesibdev->ibdev.name); + + nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd; + + if (context) { + nesucontext = to_nesucontext(context); + nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells, + NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db); + nes_debug(NES_DBG_PD, "find_first_zero_biton doorbells returned %u, mapping pd_id %u.\n", + nespd->mmap_db_index, nespd->pd_id); + if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) { + nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n"); + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + kfree(nespd); + return ERR_PTR(-ENOMEM); + } + + uresp.pd_id = nespd->pd_id; + uresp.mmap_db_index = nespd->mmap_db_index; + if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) { + nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); + kfree(nespd); + return ERR_PTR(-EFAULT); + } + + set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); + nesucontext->mmap_db_index[nespd->mmap_db_index] = nespd->pd_id; + nesucontext->first_free_db = nespd->mmap_db_index + 1; + } + + nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd); + return &nespd->ibpd; +} + + +/** + * nes_dealloc_pd + */ +static int nes_dealloc_pd(struct ib_pd *ibpd) +{ + struct nes_ucontext *nesucontext; + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + + if ((ibpd->uobject) && (ibpd->uobject->context)) { + nesucontext = to_nesucontext(ibpd->uobject->context); + nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n", + nespd->mmap_db_index); + clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); + nesucontext->mmap_db_index[nespd->mmap_db_index] = 0; + if (nesucontext->first_free_db > nespd->mmap_db_index) { + nesucontext->first_free_db = nespd->mmap_db_index; + } + } + + nes_debug(NES_DBG_PD, "Deallocating PD%u structure located @%p.\n", + nespd->pd_id, nespd); + nes_free_resource(nesadapter, nesadapter->allocated_pds, + (nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12)); + kfree(nespd); + + return 0; +} + + +/** + * nes_create_ah + */ +static struct ib_ah *nes_create_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) +{ + return ERR_PTR(-ENOSYS); +} + + +/** + * nes_destroy_ah + */ +static int nes_destroy_ah(struct ib_ah *ah) +{ + return -ENOSYS; +} + + +/** + * nes_get_encoded_size + */ +static inline u8 nes_get_encoded_size(int *size) +{ + u8 encoded_size = 0; + if (*size <= 32) { + *size = 32; + encoded_size = 1; + } else if (*size <= 128) { + *size = 128; + encoded_size = 2; + } else if (*size <= 512) { + *size = 512; + encoded_size = 3; + } + return (encoded_size); +} + + + +/** + * nes_setup_virt_qp + */ +static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl, + struct nes_vnic 
*nesvnic, int sq_size, int rq_size) +{ + unsigned long flags; + void *mem; + __le64 *pbl = NULL; + __le64 *tpbl; + __le64 *pblbuffer; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + u32 pbl_entries; + u8 rq_pbl_entries; + u8 sq_pbl_entries; + + pbl_entries = nespbl->pbl_size >> 3; + nes_debug(NES_DBG_QP, "Userspace PBL, pbl_size=%u, pbl_entries = %d pbl_vbase=%p, pbl_pbase=%lx\n", + nespbl->pbl_size, pbl_entries, + (void *)nespbl->pbl_vbase, + (unsigned long) nespbl->pbl_pbase); + pbl = (__le64 *) nespbl->pbl_vbase; /* points to first pbl entry */ + /* now lets set the sq_vbase as well as rq_vbase addrs we will assign */ + /* the first pbl to be fro the rq_vbase... */ + rq_pbl_entries = (rq_size * sizeof(struct nes_hw_qp_wqe)) >> 12; + sq_pbl_entries = (sq_size * sizeof(struct nes_hw_qp_wqe)) >> 12; + nesqp->hwqp.sq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32); + if (!nespbl->page) { + nes_debug(NES_DBG_QP, "QP nespbl->page is NULL \n"); + kfree(nespbl); + return -ENOMEM; + } + + nesqp->hwqp.sq_vbase = kmap(nespbl->page); + nesqp->page = nespbl->page; + if (!nesqp->hwqp.sq_vbase) { + nes_debug(NES_DBG_QP, "QP sq_vbase kmap failed\n"); + kfree(nespbl); + return -ENOMEM; + } + + /* Now to get to sq.. we need to calculate how many */ + /* PBL entries were used by the rq.. */ + pbl += sq_pbl_entries; + nesqp->hwqp.rq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32); + /* nesqp->hwqp.rq_vbase = bus_to_virt(*pbl); */ + /*nesqp->hwqp.rq_vbase = phys_to_virt(*pbl); */ + + nes_debug(NES_DBG_QP, "QP sq_vbase= %p sq_pbase=%lx rq_vbase=%p rq_pbase=%lx\n", + nesqp->hwqp.sq_vbase, (unsigned long) nesqp->hwqp.sq_pbase, + nesqp->hwqp.rq_vbase, (unsigned long) nesqp->hwqp.rq_pbase); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (!nesadapter->free_256pbl) { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + kfree(nespbl); + return -ENOMEM; + } + nesadapter->free_256pbl--; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nesqp->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 256, &nesqp->pbl_pbase); + pblbuffer = nesqp->pbl_vbase; + if (!nesqp->pbl_vbase) { + /* memory allocated during nes_reg_user_mr() */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + return -ENOMEM; + } + memset(nesqp->pbl_vbase, 0, 256); + /* fill in the page address in the pbl buffer.. 
*/ + tpbl = pblbuffer + 16; + pbl = (__le64 *)nespbl->pbl_vbase; + while (sq_pbl_entries--) + *tpbl++ = *pbl++; + tpbl = pblbuffer; + while (rq_pbl_entries--) + *tpbl++ = *pbl++; + + /* done with memory allocated during nes_reg_user_mr() */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + + nesqp->qp_mem_size = + max((u32)sizeof(struct nes_qp_context), ((u32)256)) + 256; /* this is Q2 */ + /* Round up to a multiple of a page */ + nesqp->qp_mem_size += PAGE_SIZE - 1; + nesqp->qp_mem_size &= ~(PAGE_SIZE - 1); + + mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size, + &nesqp->hwqp.q2_pbase); + + if (!mem) { + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase); + nesqp->pbl_vbase = NULL; + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + kunmap(nesqp->page); + return -ENOMEM; + } + nesqp->sq_kmapped = 1; + nesqp->hwqp.q2_vbase = mem; + mem += 256; + memset(nesqp->hwqp.q2_vbase, 0, 256); + nesqp->nesqp_context = mem; + memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context)); + nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256; + + return 0; +} + + +/** + * nes_setup_mmap_qp + */ +static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic, + int sq_size, int rq_size) +{ + void *mem; + struct nes_device *nesdev = nesvnic->nesdev; + + nesqp->qp_mem_size = (sizeof(struct nes_hw_qp_wqe) * sq_size) + + (sizeof(struct nes_hw_qp_wqe) * rq_size) + + max((u32)sizeof(struct nes_qp_context), ((u32)256)) + + 256; /* this is Q2 */ + /* Round up to a multiple of a page */ + nesqp->qp_mem_size += PAGE_SIZE - 1; + nesqp->qp_mem_size &= ~(PAGE_SIZE - 1); + + mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size, + &nesqp->hwqp.sq_pbase); + if (!mem) + return -ENOMEM; + nes_debug(NES_DBG_QP, "PCI consistent memory for " + "host descriptor rings located @ %p (pa = 0x%08lX.) size = %u.\n", + mem, (unsigned long)nesqp->hwqp.sq_pbase, nesqp->qp_mem_size); + + memset(mem, 0, nesqp->qp_mem_size); + + nesqp->hwqp.sq_vbase = mem; + mem += sizeof(struct nes_hw_qp_wqe) * sq_size; + + nesqp->hwqp.rq_vbase = mem; + nesqp->hwqp.rq_pbase = nesqp->hwqp.sq_pbase + + sizeof(struct nes_hw_qp_wqe) * sq_size; + mem += sizeof(struct nes_hw_qp_wqe) * rq_size; + + nesqp->hwqp.q2_vbase = mem; + nesqp->hwqp.q2_pbase = nesqp->hwqp.rq_pbase + + sizeof(struct nes_hw_qp_wqe) * rq_size; + mem += 256; + memset(nesqp->hwqp.q2_vbase, 0, 256); + + nesqp->nesqp_context = mem; + nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256; + memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context)); + return 0; +} + + +/** + * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory. 
+ */ +static void nes_free_qp_mem(struct nes_device *nesdev, + struct nes_qp *nesqp, int virt_wqs) +{ + unsigned long flags; + struct nes_adapter *nesadapter = nesdev->nesadapter; + if (!virt_wqs) { + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, + nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase); + }else { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl++; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase); + pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase ); + nesqp->pbl_vbase = NULL; + if (nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + } +} + + +/** + * nes_create_qp + */ +static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, struct ib_udata *udata) +{ + u64 u64temp= 0; + u64 u64nesqp = 0; + struct nes_pd *nespd = to_nespd(ibpd); + struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_qp *nesqp; + struct nes_cq *nescq; + struct nes_ucontext *nes_ucontext; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + struct nes_create_qp_req req; + struct nes_create_qp_resp uresp; + struct nes_pbl *nespbl = NULL; + u32 qp_num = 0; + u32 opcode = 0; + /* u32 counter = 0; */ + void *mem; + unsigned long flags; + int ret; + int err; + int virt_wqs = 0; + int sq_size; + int rq_size; + u8 sq_encoded_size; + u8 rq_encoded_size; + /* int counter; */ + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + atomic_inc(&qps_created); + switch (init_attr->qp_type) { + case IB_QPT_RC: + if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) { + init_attr->cap.max_inline_data = 0; + } else { + init_attr->cap.max_inline_data = 64; + } + sq_size = init_attr->cap.max_send_wr; + rq_size = init_attr->cap.max_recv_wr; + + /* check if the encoded sizes are OK or not... */ + sq_encoded_size = nes_get_encoded_size(&sq_size); + rq_encoded_size = nes_get_encoded_size(&rq_size); + + if ((!sq_encoded_size) || (!rq_encoded_size)) { + nes_debug(NES_DBG_QP, "ERROR bad rq (%u) or sq (%u) size\n", + rq_size, sq_size); + return ERR_PTR(-EINVAL); + } + + init_attr->cap.max_send_wr = sq_size -2; + init_attr->cap.max_recv_wr = rq_size -1; + nes_debug(NES_DBG_QP, "RQ size=%u, SQ Size=%u\n", rq_size, sq_size); + + ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps, + nesadapter->max_qp, &qp_num, &nesadapter->next_qp, NES_RESOURCE_QP); + if (ret) { + return ERR_PTR(ret); + } + + /* Need 512 (actually now 1024) byte alignment on this structure */ + mem = kzalloc(sizeof(*nesqp)+NES_SW_CONTEXT_ALIGN-1, GFP_KERNEL); + if (!mem) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + return ERR_PTR(-ENOMEM); + } + u64nesqp = (unsigned long)mem; + u64nesqp += ((u64)NES_SW_CONTEXT_ALIGN) - 1; + u64temp = ((u64)NES_SW_CONTEXT_ALIGN) - 1; + u64nesqp &= ~u64temp; + nesqp = (struct nes_qp *)(unsigned long)u64nesqp; + /* nes_debug(NES_DBG_QP, "nesqp=%p, allocated buffer=%p. 
Rounded to closest %u\n", + nesqp, mem, NES_SW_CONTEXT_ALIGN); */ + nesqp->allocated_buffer = mem; + + if (udata) { + if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_qp_req))) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n"); + return ERR_PTR(-EFAULT); + } + if (req.user_wqe_buffers) { + virt_wqs = 1; + } + if (req.user_qp_buffer) + nesqp->nesuqp_addr = req.user_qp_buffer; + if ((ibpd->uobject) && (ibpd->uobject->context)) { + nesqp->user_mode = 1; + nes_ucontext = to_nesucontext(ibpd->uobject->context); + if (virt_wqs) { + err = 1; + list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) { + if (nespbl->user_base == (unsigned long )req.user_wqe_buffers) { + list_del(&nespbl->list); + err = 0; + nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n", + nespbl, nespbl->user_base); + break; + } + } + if (err) { + nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n", + (long long unsigned int)req.user_wqe_buffers); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + + nes_ucontext = to_nesucontext(ibpd->uobject->context); + nesqp->mmap_sq_db_index = + find_next_zero_bit(nes_ucontext->allocated_wqs, + NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq); + /* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n", + nespd->mmap_db_index); */ + if (nesqp->mmap_sq_db_index >= NES_MAX_USER_WQ_REGIONS) { + nes_debug(NES_DBG_QP, + "db index > max user regions, failing create QP\n"); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + if (virt_wqs) { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + } + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs); + nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp; + nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1; + } else { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + err = (!virt_wqs) ? 
nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) : + nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size); + if (err) { + nes_debug(NES_DBG_QP, + "error geting qp mem code = %d\n", err); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + + nesqp->hwqp.sq_size = sq_size; + nesqp->hwqp.sq_encoded_size = sq_encoded_size; + nesqp->hwqp.sq_head = 1; + nesqp->hwqp.rq_size = rq_size; + nesqp->hwqp.rq_encoded_size = rq_encoded_size; + /* nes_debug(NES_DBG_QP, "nesqp->nesqp_context_pbase = %p\n", + (void *)nesqp->nesqp_context_pbase); + */ + nesqp->hwqp.qp_id = qp_num; + nesqp->ibqp.qp_num = nesqp->hwqp.qp_id; + nesqp->nespd = nespd; + + nescq = to_nescq(init_attr->send_cq); + nesqp->nesscq = nescq; + nescq = to_nescq(init_attr->recv_cq); + nesqp->nesrcq = nescq; + + nesqp->nesqp_context->misc |= cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << + NES_QPCONTEXT_MISC_PCI_FCN_SHIFT); + nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.rq_encoded_size << + NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT); + nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.sq_encoded_size << + NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT); + if (!udata) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_PRIV_EN); + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_FAST_REGISTER_EN); + } + nesqp->nesqp_context->cqs = cpu_to_le32(nesqp->nesscq->hw_cq.cq_number + + ((u32)nesqp->nesrcq->hw_cq.cq_number << 16)); + u64temp = (u64)nesqp->hwqp.sq_pbase; + nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + + + if (!virt_wqs) { + u64temp = (u64)nesqp->hwqp.sq_pbase; + nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + u64temp = (u64)nesqp->hwqp.rq_pbase; + nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + } else { + u64temp = (u64)nesqp->pbl_pbase; + nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + } + + /* nes_debug(NES_DBG_QP, "next_qp_nic_index=%u, using nic_index=%d\n", + nesvnic->next_qp_nic_index, + nesvnic->qp_nic_index[nesvnic->next_qp_nic_index]); */ + spin_lock_irqsave(&nesdev->cqp.lock, flags); + nesqp->nesqp_context->misc2 |= cpu_to_le32( + (u32)nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] << + NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT); + nesvnic->next_qp_nic_index++; + if ((nesvnic->next_qp_nic_index > 3) || + (nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] == 0xf)) { + nesvnic->next_qp_nic_index = 0; + } + spin_unlock_irqrestore(&nesdev->cqp.lock, flags); + + nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32((u32)nesqp->nespd->pd_id << 16); + u64temp = (u64)nesqp->hwqp.q2_pbase; + nesqp->nesqp_context->q2_addr_low = cpu_to_le32((u32)u64temp); + nesqp->nesqp_context->q2_addr_high = cpu_to_le32((u32)(u64temp >> 32)); + nesqp->nesqp_context->aeq_token_low = cpu_to_le32((u32)((unsigned long)(nesqp))); + nesqp->nesqp_context->aeq_token_high = cpu_to_le32((u32)(upper_32_bits((unsigned long)(nesqp)))); + nesqp->nesqp_context->ird_ord_sizes = cpu_to_le32(NES_QPCONTEXT_ORDIRD_ALSMM | + NES_QPCONTEXT_ORDIRD_AAH | + ((((u32)nesadapter->max_irrq_wr) << + NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT) & NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK)); + if (disable_mpa_crc) { + 
nes_debug(NES_DBG_QP, "Disabling MPA crc checking due to module option.\n"); + nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_RNMC); + } + + + /* Create the QP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_QP, "Failed to get a cqp_request\n"); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-ENOMEM); + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + if (!virt_wqs) { + opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | + NES_CQP_QP_IWARP_STATE_IDLE; + } else { + opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_VIRT_WQS | + NES_CQP_QP_IWARP_STATE_IDLE; + } + opcode |= NES_CQP_QP_CQS_VALID; + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + + u64temp = (u64)nesqp->nesqp_context_pbase; + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n", + nesqp->hwqp.qp_id); + ret = wait_event_timeout(cqp_request->waitq, + (cqp_request->request_done != 0), NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_QP, "Create iwarp QP%u completed, wait_event_timeout ret=%u," + " nesdev->cqp_head = %u, nesdev->cqp.sq_tail = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail, + cqp_request->major_code, cqp_request->minor_code); + if ((!ret) || (cqp_request->major_code)) { + nes_put_cqp_request(nesdev, cqp_request); + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + if (!ret) { + return ERR_PTR(-ETIME); + } else { + return ERR_PTR(-EIO); + } + } + + nes_put_cqp_request(nesdev, cqp_request); + + if (ibpd->uobject) { + uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index; + uresp.mmap_rq_db_index = 0; + uresp.actual_sq_size = sq_size; + uresp.actual_rq_size = rq_size; + uresp.qp_id = nesqp->hwqp.qp_id; + uresp.nes_drv_opt = nes_drv_opt; + if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num); + nes_free_qp_mem(nesdev, nesqp,virt_wqs); + kfree(nesqp->allocated_buffer); + return ERR_PTR(-EFAULT); + } + } + + nes_debug(NES_DBG_QP, "QP%u structure located @%p.Size = %u.\n", + nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp)); + spin_lock_init(&nesqp->lock); + nes_add_ref(&nesqp->ibqp); + break; + default: + nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + init_completion(&nesqp->sq_drained); + init_completion(&nesqp->rq_drained); + + nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); + timer_setup(&nesqp->terminate_timer, nes_terminate_timeout, 0); + + /* update the QP table */ + nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; + nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", + netdev_refcnt_read(nesvnic->netdev)); + + return &nesqp->ibqp; +} + +/** + * nes_clean_cq + */ +static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq) +{ + u32 cq_head; + u32 lo; + u32 hi; + u64 u64temp; + unsigned long flags = 0; + + spin_lock_irqsave(&nescq->lock, 
flags); + + cq_head = nescq->hw_cq.cq_head; + while (le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) { + rmb(); + lo = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + hi = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]); + u64temp = (((u64)hi) << 32) | ((u64)lo); + u64temp &= ~(NES_SW_CONTEXT_ALIGN-1); + if (u64temp == (u64)(unsigned long)nesqp) { + /* Zero the context value so cqe will be ignored */ + nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = 0; + nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX] = 0; + } + + if (++cq_head >= nescq->hw_cq.cq_size) + cq_head = 0; + } + + spin_unlock_irqrestore(&nescq->lock, flags); +} + + +/** + * nes_destroy_qp + */ +static int nes_destroy_qp(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_ucontext *nes_ucontext; + struct ib_qp_attr attr; + struct iw_cm_id *cm_id; + struct iw_cm_event cm_event; + int ret = 0; + + atomic_inc(&sw_qps_destroyed); + nesqp->destroyed = 1; + + /* Blow away the connection if it exists. */ + if (nesqp->ibqp_state >= IB_QPS_INIT && nesqp->ibqp_state <= IB_QPS_RTS) { + /* if (nesqp->ibqp_state == IB_QPS_RTS) { */ + attr.qp_state = IB_QPS_ERR; + nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL); + } + + if (((nesqp->ibqp_state == IB_QPS_INIT) || + (nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) { + cm_id = nesqp->cm_id; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + cm_event.status = -ETIMEDOUT; + cm_event.local_addr = cm_id->local_addr; + cm_event.remote_addr = cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + + nes_debug(NES_DBG_QP, "Generating a CM Timeout Event for " + "QP%u. cm_id = %p, refcount = %u. 
\n", + nesqp->hwqp.qp_id, cm_id, atomic_read(&nesqp->refcount)); + + cm_id->rem_ref(cm_id); + ret = cm_id->event_handler(cm_id, &cm_event); + if (ret) + nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret); + } + + if (nesqp->user_mode) { + if ((ibqp->uobject)&&(ibqp->uobject->context)) { + nes_ucontext = to_nesucontext(ibqp->uobject->context); + clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs); + nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL; + if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) { + nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index; + } + } + if (nesqp->pbl_pbase && nesqp->sq_kmapped) { + nesqp->sq_kmapped = 0; + kunmap(nesqp->page); + } + } else { + /* Clean any pending completions from the cq(s) */ + if (nesqp->nesscq) + nes_clean_cq(nesqp, nesqp->nesscq); + + if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq)) + nes_clean_cq(nesqp, nesqp->nesrcq); + } + nes_rem_ref(&nesqp->ibqp); + return 0; +} + + +/** + * nes_create_cq + */ +static struct ib_cq *nes_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + u64 u64temp; + struct nes_vnic *nesvnic = to_nesvnic(ibdev); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_cq *nescq; + struct nes_ucontext *nes_ucontext = NULL; + struct nes_cqp_request *cqp_request; + void *mem = NULL; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_pbl *nespbl = NULL; + struct nes_create_cq_req req; + struct nes_create_cq_resp resp; + u32 cq_num = 0; + u32 opcode = 0; + u32 pbl_entries = 1; + int err; + unsigned long flags; + int ret; + + if (attr->flags) + return ERR_PTR(-EINVAL); + + if (entries > nesadapter->max_cqe) + return ERR_PTR(-EINVAL); + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs, + nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ); + if (err) { + return ERR_PTR(err); + } + + nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL); + if (!nescq) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + return ERR_PTR(-ENOMEM); + } + + nescq->hw_cq.cq_size = max(entries + 1, 5); + nescq->hw_cq.cq_number = cq_num; + nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1; + + + if (context) { + nes_ucontext = to_nesucontext(context); + if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + nesvnic->mcrq_ucontext = nes_ucontext; + nes_ucontext->mcrqf = req.mcrqf; + if (nes_ucontext->mcrqf) { + if (nes_ucontext->mcrqf & 0x80000000) + nescq->hw_cq.cq_number = nesvnic->nic.qp_id + 28 + 2 * ((nes_ucontext->mcrqf & 0xf) - 1); + else if (nes_ucontext->mcrqf & 0x40000000) + nescq->hw_cq.cq_number = nes_ucontext->mcrqf & 0xffff; + else + nescq->hw_cq.cq_number = nesvnic->mcrq_qp_id + nes_ucontext->mcrqf-1; + nescq->mcrqf = nes_ucontext->mcrqf; + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + } + nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n", + (unsigned long)req.user_cq_buffer, entries); + err = 1; + list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) { + if (nespbl->user_base == (unsigned long )req.user_cq_buffer) { + list_del(&nespbl->list); + err = 0; + nes_debug(NES_DBG_CQ, "Found PBL for virtual CQ. 
nespbl=%p.\n", + nespbl); + break; + } + } + if (err) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + + pbl_entries = nespbl->pbl_size >> 3; + nescq->cq_mem_size = 0; + } else { + nescq->cq_mem_size = nescq->hw_cq.cq_size * sizeof(struct nes_hw_cqe); + nes_debug(NES_DBG_CQ, "Attempting to allocate pci memory (%u entries, %u bytes) for CQ%u.\n", + entries, nescq->cq_mem_size, nescq->hw_cq.cq_number); + + /* allocate the physical buffer space */ + mem = pci_zalloc_consistent(nesdev->pcidev, nescq->cq_mem_size, + &nescq->hw_cq.cq_pbase); + if (!mem) { + printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n"); + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } + + nescq->hw_cq.cq_vbase = mem; + nescq->hw_cq.cq_head = 0; + nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n", + nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase, + (u32)nescq->hw_cq.cq_pbase); + } + + nescq->hw_cq.ce_handler = nes_iwarp_ce_handler; + spin_lock_init(&nescq->lock); + + /* send CreateCQ request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n"); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + opcode = NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID | + NES_CQP_CQ_CHK_OVERFLOW | + NES_CQP_CQ_CEQE_MASK | ((u32)nescq->hw_cq.cq_size << 16); + + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + + if (pbl_entries != 1) { + if (pbl_entries > 32) { + /* use 4k pbl */ + nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries); + if (nesadapter->free_4kpbl == 0) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } else { + opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK); + nescq->virtual_cq = 2; + nesadapter->free_4kpbl--; + } + } else { + /* use 256 byte pbl */ + nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries); + if (nesadapter->free_256pbl == 0) { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_free_cqp_request(nesdev, cqp_request); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-ENOMEM); + } else { + opcode |= NES_CQP_CQ_VIRT; + nescq->virtual_cq = 1; + nesadapter->free_256pbl--; + } + } + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, 
NES_CQP_WQE_ID_IDX, + (nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16))); + + if (context) { + if (pbl_entries != 1) + u64temp = (u64)nespbl->pbl_pbase; + else + u64temp = le64_to_cpu(nespbl->pbl_vbase[0]); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX, + nes_ucontext->mmap_db_index[0]); + } else { + u64temp = (u64)nescq->hw_cq.cq_pbase; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0; + } + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0; + u64temp = (u64)(unsigned long)&nescq->hw_cq; + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = + cpu_to_le32((u32)(u64temp >> 1)); + cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = + cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n", + nescq->hw_cq.cq_number); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT * 2); + nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n", + nescq->hw_cq.cq_number, ret); + if ((!ret) || (cqp_request->major_code)) { + nes_put_cqp_request(nesdev, cqp_request); + if (!context) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem, + nescq->hw_cq.cq_pbase); + else { + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, + nespbl->pbl_vbase, nespbl->pbl_pbase); + kfree(nespbl); + } + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EIO); + } + nes_put_cqp_request(nesdev, cqp_request); + + if (context) { + /* free the nespbl */ + pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase, + nespbl->pbl_pbase); + kfree(nespbl); + resp.cq_id = nescq->hw_cq.cq_number; + resp.cq_size = nescq->hw_cq.cq_size; + resp.mmap_db_index = 0; + if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) { + nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); + kfree(nescq); + return ERR_PTR(-EFAULT); + } + } + + return &nescq->ibcq; +} + + +/** + * nes_destroy_cq + */ +static int nes_destroy_cq(struct ib_cq *ib_cq) +{ + struct nes_cq *nescq; + struct nes_device *nesdev; + struct nes_vnic *nesvnic; + struct nes_adapter *nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + u32 opcode = 0; + int ret; + + if (ib_cq == NULL) + return 0; + + nescq = to_nescq(ib_cq); + nesvnic = to_nesvnic(ib_cq->device); + nesdev = nesvnic->nesdev; + nesadapter = nesdev->nesadapter; + + nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number); + + /* Send DestroyCQ request to CQP */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16); + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nescq->virtual_cq == 1) { + nesadapter->free_256pbl++; + if (nesadapter->free_256pbl > nesadapter->max_256pbl) { + printk(KERN_ERR PFX "%s: free 256B PBLs(%u) has exceeded the max(%u)\n", + __func__, nesadapter->free_256pbl, nesadapter->max_256pbl); + } + } else if (nescq->virtual_cq == 2) { + nesadapter->free_4kpbl++; + if (nesadapter->free_4kpbl > 
nesadapter->max_4kpbl) { + printk(KERN_ERR PFX "%s: free 4K PBLs(%u) has exceeded the max(%u)\n", + __func__, nesadapter->free_4kpbl, nesadapter->max_4kpbl); + } + opcode |= NES_CQP_CQ_4KB_CHUNK; + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, + (nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16))); + if (!nescq->mcrqf) + nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n", + nescq->hw_cq.cq_number); + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nescq->hw_cq.cq_number, ret, cqp_request->major_code, + cqp_request->minor_code); + if (!ret) { + nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n", + nescq->hw_cq.cq_number); + ret = -ETIME; + } else if (cqp_request->major_code) { + nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n", + nescq->hw_cq.cq_number); + ret = -EIO; + } else { + ret = 0; + } + nes_put_cqp_request(nesdev, cqp_request); + + if (nescq->cq_mem_size) + pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, + nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase); + kfree(nescq); + + return ret; +} + +/** + * root_256 + */ +static u32 root_256(struct nes_device *nesdev, + struct nes_root_vpbl *root_vpbl, + struct nes_root_vpbl *new_root, + u16 pbl_count_4k) +{ + u64 leaf_pbl; + int i, j, k; + + if (pbl_count_4k == 1) { + new_root->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, + 512, &new_root->pbl_pbase); + + if (new_root->pbl_vbase == NULL) + return 0; + + leaf_pbl = (u64)root_vpbl->pbl_pbase; + for (i = 0; i < 16; i++) { + new_root->pbl_vbase[i].pa_low = + cpu_to_le32((u32)leaf_pbl); + new_root->pbl_vbase[i].pa_high = + cpu_to_le32((u32)((((u64)leaf_pbl) >> 32))); + leaf_pbl += 256; + } + } else { + for (i = 3; i >= 0; i--) { + j = i * 16; + root_vpbl->pbl_vbase[j] = root_vpbl->pbl_vbase[i]; + leaf_pbl = le32_to_cpu(root_vpbl->pbl_vbase[j].pa_low) + + (((u64)le32_to_cpu(root_vpbl->pbl_vbase[j].pa_high)) + << 32); + for (k = 1; k < 16; k++) { + leaf_pbl += 256; + root_vpbl->pbl_vbase[j + k].pa_low = + cpu_to_le32((u32)leaf_pbl); + root_vpbl->pbl_vbase[j + k].pa_high = + cpu_to_le32((u32)((((u64)leaf_pbl) >> 32))); + } + } + } + + return 1; +} + + +/** + * nes_reg_mr + */ +static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, + u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl, + dma_addr_t single_buffer, u16 pbl_count_4k, + u16 residual_page_count_4k, int acc, u64 *iova_start, + u16 *actual_pbl_cnt, u8 *used_4k_pbls) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + struct nes_adapter *nesadapter = nesdev->nesadapter; + uint pg_cnt = 0; + u16 pbl_count_256 = 0; + u16 pbl_count = 0; + u8 use_256_pbls = 0; + u8 use_4k_pbls = 0; + u16 use_two_level = (pbl_count_4k > 1) ? 
1 : 0; + struct nes_root_vpbl new_root = { 0, NULL, NULL }; + u32 opcode = 0; + u16 major_code; + + /* Register the region with the adapter */ + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + if (pbl_count_4k) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + + pg_cnt = ((pbl_count_4k - 1) * 512) + residual_page_count_4k; + pbl_count_256 = (pg_cnt + 31) / 32; + if (pg_cnt <= 32) { + if (pbl_count_256 <= nesadapter->free_256pbl) + use_256_pbls = 1; + else if (pbl_count_4k <= nesadapter->free_4kpbl) + use_4k_pbls = 1; + } else if (pg_cnt <= 2048) { + if (((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) && + (nesadapter->free_4kpbl > (nesadapter->max_4kpbl >> 1))) { + use_4k_pbls = 1; + } else if ((pbl_count_256 + 1) <= nesadapter->free_256pbl) { + use_256_pbls = 1; + use_two_level = 1; + } else if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) { + use_4k_pbls = 1; + } + } else { + if ((pbl_count_4k + 1) <= nesadapter->free_4kpbl) + use_4k_pbls = 1; + } + + if (use_256_pbls) { + pbl_count = pbl_count_256; + nesadapter->free_256pbl -= pbl_count + use_two_level; + } else if (use_4k_pbls) { + pbl_count = pbl_count_4k; + nesadapter->free_4kpbl -= pbl_count + use_two_level; + } else { + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + nes_debug(NES_DBG_MR, "Out of Pbls\n"); + nes_free_cqp_request(nesdev, cqp_request); + return -ENOMEM; + } + + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + + if (use_256_pbls && use_two_level) { + if (root_256(nesdev, root_vpbl, &new_root, pbl_count_4k) == 1) { + if (new_root.pbl_pbase != 0) + root_vpbl = &new_root; + } else { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + nesadapter->free_256pbl += pbl_count_256 + use_two_level; + use_256_pbls = 0; + + if (pbl_count_4k == 1) + use_two_level = 0; + pbl_count = pbl_count_4k; + + if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) { + nesadapter->free_4kpbl -= pbl_count + use_two_level; + use_4k_pbls = 1; + } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + + if (use_4k_pbls == 0) + return -ENOMEM; + } + } + + opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ | + NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR; + if (acc & IB_ACCESS_LOCAL_WRITE) + opcode |= NES_CQP_STAG_RIGHTS_LOCAL_WRITE; + if (acc & IB_ACCESS_REMOTE_WRITE) + opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_REM_ACC_EN; + if (acc & IB_ACCESS_REMOTE_READ) + opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_REM_ACC_EN; + if (acc & IB_ACCESS_MW_BIND) + opcode |= NES_CQP_STAG_RIGHTS_WINDOW_BIND | NES_CQP_STAG_REM_ACC_EN; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, *iova_start); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, region_length); + + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] = + cpu_to_le32((u32)(region_length >> 8) & 0xff000000); + cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |= + cpu_to_le32(nespd->pd_id & 0x00007fff); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag); + + if (pbl_count == 0) { + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, single_buffer); + } else { + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 
root_vpbl->pbl_pbase); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (pg_cnt * 8)); + + if (use_4k_pbls) + cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE); + } + barrier(); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 0x%04X:0x%04X.\n", + stag, ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + nes_put_cqp_request(nesdev, cqp_request); + + if ((!ret || major_code) && pbl_count != 0) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (use_256_pbls) + nesadapter->free_256pbl += pbl_count + use_two_level; + else if (use_4k_pbls) + nesadapter->free_4kpbl += pbl_count + use_two_level; + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + if (new_root.pbl_pbase) + pci_free_consistent(nesdev->pcidev, 512, new_root.pbl_vbase, + new_root.pbl_pbase); + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + + *actual_pbl_cnt = pbl_count + use_two_level; + *used_4k_pbls = use_4k_pbls; + return 0; +} + + +/** + * nes_reg_phys_mr + */ +struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size, + int acc, u64 *iova_start) +{ + u64 region_length; + struct nes_pd *nespd = to_nespd(ib_pd); + struct nes_vnic *nesvnic = to_nesvnic(ib_pd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_mr *nesmr; + struct ib_mr *ibmr; + struct nes_vpbl vpbl; + struct nes_root_vpbl root_vpbl; + u32 stag; + unsigned long mask; + u32 stag_index = 0; + u32 next_stag_index = 0; + u32 driver_key = 0; + int err = 0; + int ret = 0; + u16 pbl_count = 0; + u8 single_page = 1; + u8 stag_key = 0; + + region_length = 0; + vpbl.pbl_vbase = NULL; + root_vpbl.pbl_vbase = NULL; + root_vpbl.pbl_pbase = 0; + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = 0; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + if ((addr ^ *iova_start) & ~PAGE_MASK) + return ERR_PTR(-EINVAL); + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr, + &stag_index, &next_stag_index, NES_RESOURCE_PHYS_MR); + if (err) { + return ERR_PTR(err); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + + /* Allocate a 4K buffer for the PBL */ + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n", + vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_phys_err; + } + + + mask = !size; + + if (mask & ~PAGE_MASK) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_phys_err; + } + + region_length += size; + vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)addr & PAGE_MASK); + 
vpbl.pbl_vbase[0].pa_high = cpu_to_le32((u32)((((u64)addr) >> 32))); + + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%016lX," + " length = 0x%016lX, index = 0x%08X\n", + stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index); + + /* Make the leaf PBL the root if only one PBL */ + root_vpbl.pbl_pbase = vpbl.pbl_pbase; + + if (single_page) { + pbl_count = 0; + } else { + pbl_count = 1; + } + ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl, + addr, pbl_count, 1, acc, iova_start, + &nesmr->pbls_used, &nesmr->pbl_4k); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MEM; + ibmr = &nesmr->ibmr; + } else { + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + } + +reg_phys_err: + /* single PBL case */ + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase); + return ibmr; +} + + +/** + * nes_get_dma_mr + */ +static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc) +{ + u64 kva = 0; + + nes_debug(NES_DBG_MR, "\n"); + + return nes_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva); +} + +/** + * nes_reg_user_mr + */ +static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + u64 iova_start; + __le64 *pbl; + u64 region_length; + dma_addr_t last_dma_addr = 0; + dma_addr_t first_dma_addr = 0; + struct nes_pd *nespd = to_nespd(pd); + struct nes_vnic *nesvnic = to_nesvnic(pd->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct ib_mr *ibmr = ERR_PTR(-EINVAL); + struct scatterlist *sg; + struct nes_ucontext *nes_ucontext; + struct nes_pbl *nespbl; + struct nes_mr *nesmr; + struct ib_umem *region; + struct nes_mem_reg_req req; + struct nes_vpbl vpbl; + struct nes_root_vpbl root_vpbl; + int entry, page_index; + int page_count = 0; + int err, pbl_depth = 0; + int chunk_pages; + int ret; + u32 stag; + u32 stag_index = 0; + u32 next_stag_index; + u32 driver_key; + u32 root_pbl_index = 0; + u32 cur_pbl_index = 0; + u32 skip_pages; + u16 pbl_count; + u8 single_page = 1; + u8 stag_key; + int first_page = 1; + + region = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(region)) { + return (struct ib_mr *)region; + } + + nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," + " offset = %u, page size = %lu.\n", + (unsigned long int)start, (unsigned long int)virt, (u32)length, + ib_umem_offset(region), BIT(region->page_shift)); + + skip_pages = ((u32)ib_umem_offset(region)) >> 12; + + if (ib_copy_from_udata(&req, udata, sizeof(req))) { + ib_umem_release(region); + return ERR_PTR(-EFAULT); + } + nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type); + + switch (req.reg_type) { + case IWNES_MEMREG_TYPE_MEM: + pbl_depth = 0; + region_length = 0; + vpbl.pbl_vbase = NULL; + root_vpbl.pbl_vbase = NULL; + root_vpbl.pbl_pbase = 0; + + get_random_bytes(&next_stag_index, sizeof(next_stag_index)); + stag_key = (u8)next_stag_index; + + driver_key = next_stag_index & 0x70000000; + + next_stag_index >>= 8; + next_stag_index %= nesadapter->max_mr; + + err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, + nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_USER_MR); + if (err) { + ib_umem_release(region); + return ERR_PTR(err); + } + + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + 
ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + return ERR_PTR(-ENOMEM); + } + nesmr->region = region; + + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + if (sg_dma_address(sg) & ~PAGE_MASK) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n", + (unsigned int) sg_dma_address(sg)); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } + + if (!sg_dma_len(sg)) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + nes_debug(NES_DBG_MR, "Invalid Buffer Size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_user_mr_err; + } + + region_length += sg_dma_len(sg); + chunk_pages = sg_dma_len(sg) >> 12; + region_length -= skip_pages << 12; + for (page_index = skip_pages; page_index < chunk_pages; page_index++) { + skip_pages = 0; + if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length) + goto enough_pages; + if ((page_count&0x01FF) == 0) { + if (page_count >= 1024 * 512) { + ib_umem_release(region); + nes_free_resource(nesadapter, + nesadapter->allocated_mrs, stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-E2BIG); + goto reg_user_mr_err; + } + if (root_pbl_index == 1) { + root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, + 8192, &root_vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", + root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); + if (!root_vpbl.pbl_vbase) { + ib_umem_release(region); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + goto reg_user_mr_err; + } + root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, + GFP_KERNEL); + if (!root_vpbl.leaf_vpbl) { + ib_umem_release(region); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + stag_index); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + goto reg_user_mr_err; + } + root_vpbl.pbl_vbase[0].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[0].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); + root_vpbl.leaf_vpbl[0] = vpbl; + } + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n", + vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + ib_umem_release(region); + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_user_mr_err; + } + if (1 <= root_pbl_index) { + root_vpbl.pbl_vbase[root_pbl_index].pa_low = + cpu_to_le32((u32)vpbl.pbl_pbase); + root_vpbl.pbl_vbase[root_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32))); + root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; + } + root_pbl_index++; + cur_pbl_index = 0; + } + if (single_page) { + if (page_count != 0) { + if ((last_dma_addr+4096) != + (sg_dma_address(sg)+ + (page_index*4096))) + single_page = 0; + last_dma_addr = sg_dma_address(sg)+ + (page_index*4096); + } else { + first_dma_addr = sg_dma_address(sg)+ + (page_index*4096); + last_dma_addr = first_dma_addr; + } + } + + 
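+				/*
+				 * Store this page's DMA address in the current leaf PBL
+				 * entry as little-endian low/high 32-bit halves;
+				 * cur_pbl_index walks the 512-entry 4KB leaf while
+				 * page_count counts pages across all leaves.
+				 */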
vpbl.pbl_vbase[cur_pbl_index].pa_low = + cpu_to_le32((u32)(sg_dma_address(sg)+ + (page_index*4096))); + vpbl.pbl_vbase[cur_pbl_index].pa_high = + cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+ + (page_index*4096))) >> 32))); + cur_pbl_index++; + page_count++; + } + } + + enough_pages: + nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x," + " stag_key=0x%08x\n", + stag_index, driver_key, stag_key); + stag = stag_index << 8; + stag |= driver_key; + stag += (u32)stag_key; + + iova_start = virt; + /* Make the leaf PBL the root if only one PBL */ + if (root_pbl_index == 1) { + root_vpbl.pbl_pbase = vpbl.pbl_pbase; + } + + if (single_page) { + pbl_count = 0; + } else { + pbl_count = root_pbl_index; + first_dma_addr = 0; + } + nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%08X, length = 0x%08X," + " index = 0x%08X, region->length=0x%08llx, pbl_count = %u\n", + stag, (unsigned int)iova_start, + (unsigned int)region_length, stag_index, + (unsigned long long)region->length, pbl_count); + ret = nes_reg_mr(nesdev, nespd, stag, region->length, &root_vpbl, + first_dma_addr, pbl_count, (u16)cur_pbl_index, acc, + &iova_start, &nesmr->pbls_used, &nesmr->pbl_4k); + + nes_debug(NES_DBG_MR, "ret=%d\n", ret); + + if (ret == 0) { + nesmr->ibmr.rkey = stag; + nesmr->ibmr.lkey = stag; + nesmr->mode = IWNES_MEMREG_TYPE_MEM; + ibmr = &nesmr->ibmr; + } else { + ib_umem_release(region); + kfree(nesmr); + ibmr = ERR_PTR(-ENOMEM); + } + + reg_user_mr_err: + /* free the resources */ + if (root_pbl_index == 1) { + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, + vpbl.pbl_pbase); + } else { + for (page_index=0; page_index<root_pbl_index; page_index++) { + pci_free_consistent(nesdev->pcidev, 4096, + root_vpbl.leaf_vpbl[page_index].pbl_vbase, + root_vpbl.leaf_vpbl[page_index].pbl_pbase); + } + kfree(root_vpbl.leaf_vpbl); + pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, + root_vpbl.pbl_pbase); + } + + nes_debug(NES_DBG_MR, "Leaving, ibmr=%p", ibmr); + + return ibmr; + case IWNES_MEMREG_TYPE_QP: + case IWNES_MEMREG_TYPE_CQ: + if (!region->length) { + nes_debug(NES_DBG_MR, "Unable to register zero length region for CQ\n"); + ib_umem_release(region); + return ERR_PTR(-EINVAL); + } + nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL); + if (!nespbl) { + ib_umem_release(region); + return ERR_PTR(-ENOMEM); + } + nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL); + if (!nesmr) { + ib_umem_release(region); + kfree(nespbl); + return ERR_PTR(-ENOMEM); + } + nesmr->region = region; + nes_ucontext = to_nesucontext(pd->uobject->context); + pbl_depth = region->length >> 12; + pbl_depth += (region->length & (4096-1)) ?
1 : 0; + nespbl->pbl_size = pbl_depth*sizeof(u64); + if (req.reg_type == IWNES_MEMREG_TYPE_QP) { + nes_debug(NES_DBG_MR, "Attempting to allocate QP PBL memory"); + } else { + nes_debug(NES_DBG_MR, "Attempting to allocate CP PBL memory"); + } + + nes_debug(NES_DBG_MR, " %u bytes, %u entries.\n", + nespbl->pbl_size, pbl_depth); + pbl = pci_alloc_consistent(nesdev->pcidev, nespbl->pbl_size, + &nespbl->pbl_pbase); + if (!pbl) { + ib_umem_release(region); + kfree(nesmr); + kfree(nespbl); + nes_debug(NES_DBG_MR, "Unable to allocate PBL memory\n"); + return ERR_PTR(-ENOMEM); + } + + nespbl->pbl_vbase = (u64 *)pbl; + nespbl->user_base = start; + nes_debug(NES_DBG_MR, "Allocated PBL memory, %u bytes, pbl_pbase=%lx," + " pbl_vbase=%p user_base=0x%lx\n", + nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase, + (void *) nespbl->pbl_vbase, nespbl->user_base); + + for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { + chunk_pages = sg_dma_len(sg) >> 12; + chunk_pages += (sg_dma_len(sg) & (4096-1)) ? 1 : 0; + if (first_page) { + nespbl->page = sg_page(sg); + first_page = 0; + } + + for (page_index = 0; page_index < chunk_pages; page_index++) { + ((__le32 *)pbl)[0] = cpu_to_le32((u32) + (sg_dma_address(sg)+ + (page_index*4096))); + ((__le32 *)pbl)[1] = cpu_to_le32(((u64) + (sg_dma_address(sg)+ + (page_index*4096)))>>32); + nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl, + (unsigned long long)*pbl, + le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0])); + pbl++; + } + } + + if (req.reg_type == IWNES_MEMREG_TYPE_QP) { + list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list); + } else { + list_add_tail(&nespbl->list, &nes_ucontext->cq_reg_mem_list); + } + nesmr->ibmr.rkey = -1; + nesmr->ibmr.lkey = -1; + nesmr->mode = req.reg_type; + return &nesmr->ibmr; + } + + ib_umem_release(region); + return ERR_PTR(-ENOSYS); +} + + +/** + * nes_dereg_mr + */ +static int nes_dereg_mr(struct ib_mr *ib_mr) +{ + struct nes_mr *nesmr = to_nesmr(ib_mr); + struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + struct nes_hw_cqp_wqe *cqp_wqe; + struct nes_cqp_request *cqp_request; + unsigned long flags; + int ret; + u16 major_code; + u16 minor_code; + + + if (nesmr->pages) + pci_free_consistent(nesdev->pcidev, + nesmr->max_pages * sizeof(u64), + nesmr->pages, + nesmr->paddr); + + if (nesmr->region) { + ib_umem_release(nesmr->region); + } + if (nesmr->mode != IWNES_MEMREG_TYPE_MEM) { + kfree(nesmr); + return 0; + } + + /* Deallocate the region with the adapter */ + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + cqp_request->waiting = 1; + cqp_wqe = &cqp_request->cqp_wqe; + + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO | + NES_CQP_STAG_DEALLOC_PBLS | NES_CQP_STAG_MR); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey); + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey); + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MR, "Deallocate STag 0x%08X completed, wait_event_timeout ret = %u," + " CQP Major:Minor codes = 
0x%04X:0x%04X\n", + ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code); + + major_code = cqp_request->major_code; + minor_code = cqp_request->minor_code; + + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret) { + nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag," + " ib_mr=%p, rkey = 0x%08X\n", + ib_mr, ib_mr->rkey); + return -ETIME; + } else if (major_code) { + nes_debug(NES_DBG_MR, "Error (0x%04X:0x%04X) while attempting" + " to destroy STag, ib_mr=%p, rkey = 0x%08X\n", + major_code, minor_code, ib_mr, ib_mr->rkey); + return -EIO; + } + + if (nesmr->pbls_used != 0) { + spin_lock_irqsave(&nesadapter->pbl_lock, flags); + if (nesmr->pbl_4k) { + nesadapter->free_4kpbl += nesmr->pbls_used; + if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) + printk(KERN_ERR PFX "free 4KB PBLs(%u) has " + "exceeded the max(%u)\n", + nesadapter->free_4kpbl, + nesadapter->max_4kpbl); + } else { + nesadapter->free_256pbl += nesmr->pbls_used; + if (nesadapter->free_256pbl > nesadapter->max_256pbl) + printk(KERN_ERR PFX "free 256B PBLs(%u) has " + "exceeded the max(%u)\n", + nesadapter->free_256pbl, + nesadapter->max_256pbl); + } + spin_unlock_irqrestore(&nesadapter->pbl_lock, flags); + } + nes_free_resource(nesadapter, nesadapter->allocated_mrs, + (ib_mr->rkey & 0x0fffff00) >> 8); + + kfree(nesmr); + + return 0; +} + + +/** + * show_rev + */ +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nes_ib_device *nesibdev = + container_of(dev, struct nes_ib_device, ibdev.dev); + struct nes_vnic *nesvnic = nesibdev->nesvnic; + + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev); +} + + +/** + * show_hca + */ +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "NES020\n"); +} + + +/** + * show_board + */ +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + nes_debug(NES_DBG_INIT, "\n"); + return sprintf(buf, "%.*s\n", 32, "NES020 Board ID"); +} + + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *nes_dev_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id +}; + + +/** + * nes_query_qp + */ +static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + nes_debug(NES_DBG_QP, "\n"); + + attr->qp_access_flags = 0; + attr->cap.max_send_wr = nesqp->hwqp.sq_size; + attr->cap.max_recv_wr = nesqp->hwqp.rq_size; + attr->cap.max_recv_sge = 1; + if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) + attr->cap.max_inline_data = 0; + else + attr->cap.max_inline_data = 64; + + init_attr->event_handler = nesqp->ibqp.event_handler; + init_attr->qp_context = nesqp->ibqp.qp_context; + init_attr->send_cq = nesqp->ibqp.send_cq; + init_attr->recv_cq = nesqp->ibqp.recv_cq; + init_attr->srq = nesqp->ibqp.srq; + init_attr->cap = attr->cap; + + return 0; +} + + +/** + * nes_hw_modify_qp + */ +int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp, + u32 next_iwarp_state, u32 termlen, u32 wait_completion) +{ + struct nes_hw_cqp_wqe *cqp_wqe; + /* struct iw_cm_id *cm_id = nesqp->cm_id; */ + /* struct iw_cm_event cm_event; */ + struct nes_cqp_request *cqp_request; + int ret; + u16 major_code; + + 
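+	/*
+	 * Build a MODIFY_QP CQP WQE carrying the requested iWARP state,
+	 * post it to the CQP and, when wait_completion is set, sleep on
+	 * cqp_request->waitq until the firmware returns its major/minor
+	 * completion codes or NES_EVENT_TIMEOUT expires.
+	 */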
nes_debug(NES_DBG_MOD_QP, "QP%u, refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + + cqp_request = nes_get_cqp_request(nesdev); + if (cqp_request == NULL) { + nes_debug(NES_DBG_MOD_QP, "Failed to get a cqp_request.\n"); + return -ENOMEM; + } + if (wait_completion) { + cqp_request->waiting = 1; + } else { + cqp_request->waiting = 0; + } + cqp_wqe = &cqp_request->cqp_wqe; + + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, + NES_CQP_MODIFY_QP | NES_CQP_QP_TYPE_IWARP | next_iwarp_state); + nes_debug(NES_DBG_MOD_QP, "using next_iwarp_state=%08x, wqe_words=%08x\n", + next_iwarp_state, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])); + nes_fill_init_cqp_wqe(cqp_wqe, nesdev); + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id); + set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase); + + /* If sending a terminate message, fill in the length (in words) */ + if (((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == NES_CQP_QP_IWARP_STATE_TERMINATE) && + !(next_iwarp_state & NES_CQP_QP_TERM_DONT_SEND_TERM_MSG)) { + termlen = ((termlen + 3) >> 2) << NES_CQP_OP_TERMLEN_SHIFT; + set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_NEW_MSS_IDX, termlen); + } + + atomic_set(&cqp_request->refcount, 2); + nes_post_cqp_request(nesdev, cqp_request); + + /* Wait for CQP */ + if (wait_completion) { + /* nes_debug(NES_DBG_MOD_QP, "Waiting for modify iWARP QP%u to complete.\n", + nesqp->hwqp.qp_id); */ + ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0), + NES_EVENT_TIMEOUT); + nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u completed, wait_event_timeout ret=%u, " + "CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nesqp->hwqp.qp_id, ret, cqp_request->major_code, cqp_request->minor_code); + major_code = cqp_request->major_code; + if (major_code) { + nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u failed" + "CQP Major:Minor codes = 0x%04X:0x%04X, intended next state = 0x%08X.\n", + nesqp->hwqp.qp_id, cqp_request->major_code, + cqp_request->minor_code, next_iwarp_state); + } + + nes_put_cqp_request(nesdev, cqp_request); + + if (!ret) + return -ETIME; + else if (major_code) + return -EIO; + else + return 0; + } else { + return 0; + } +} + + +/** + * nes_modify_qp + */ +int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + /* u32 cqp_head; */ + /* u32 counter; */ + u32 next_iwarp_state = 0; + int err; + unsigned long qplockflags; + int ret; + u16 original_last_aeq; + u8 issue_modify_qp = 0; + u8 dont_wait = 0; + + nes_debug(NES_DBG_MOD_QP, "QP%u: QP State=%u, cur QP State=%u," + " iwarp_state=0x%X, refcount=%d\n", + nesqp->hwqp.qp_id, attr->qp_state, nesqp->ibqp_state, + nesqp->iwarp_state, atomic_read(&nesqp->refcount)); + + spin_lock_irqsave(&nesqp->lock, qplockflags); + + nes_debug(NES_DBG_MOD_QP, "QP%u: hw_iwarp_state=0x%X, hw_tcp_state=0x%X," + " QP Access Flags=0x%X, attr_mask = 0x%0x\n", + nesqp->hwqp.qp_id, nesqp->hw_iwarp_state, + nesqp->hw_tcp_state, attr->qp_access_flags, attr_mask); + + if (attr_mask & IB_QP_STATE) { + switch (attr->qp_state) { + case IB_QPS_INIT: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = init\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return 
-EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE; + issue_modify_qp = 1; + break; + case IB_QPS_RTR: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rtr\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_IDLE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE; + issue_modify_qp = 1; + break; + case IB_QPS_RTS: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rts\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_RTS) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + if (nesqp->cm_id == NULL) { + nes_debug(NES_DBG_MOD_QP, "QP%u: Failing attempt to move QP to RTS without a CM_ID. \n", + nesqp->hwqp.qp_id ); + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + next_iwarp_state = NES_CQP_QP_IWARP_STATE_RTS; + if (nesqp->iwarp_state != NES_CQP_QP_IWARP_STATE_RTS) + next_iwarp_state |= NES_CQP_QP_CONTEXT_VALID | + NES_CQP_QP_ARP_VALID | NES_CQP_QP_ORD_VALID; + issue_modify_qp = 1; + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_ESTABLISHED; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_RTS; + nesqp->hte_added = 1; + break; + case IB_QPS_SQD: + issue_modify_qp = 1; + nes_debug(NES_DBG_MOD_QP, "QP%u: new state=closing. SQ head=%u, SQ tail=%u\n", + nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail); + if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_CLOSING) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return 0; + } else { + if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_CLOSING) { + nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing" + " ignored due to current iWARP state\n", + nesqp->hwqp.qp_id); + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + if (nesqp->hw_iwarp_state != NES_AEQE_IWARP_STATE_RTS) { + nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing" + " already done based on hw state.\n", + nesqp->hwqp.qp_id); + issue_modify_qp = 0; + } + switch (nesqp->hw_iwarp_state) { + case NES_AEQE_IWARP_STATE_CLOSING: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + break; + case NES_AEQE_IWARP_STATE_TERMINATE: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; + break; + case NES_AEQE_IWARP_STATE_ERROR: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + break; + default: + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; + break; + } + } + break; + case IB_QPS_SQE: + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = terminate\n", + nesqp->hwqp.qp_id); + if (nesqp->iwarp_state>=(u32)NES_CQP_QP_IWARP_STATE_TERMINATE) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ + next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE; + issue_modify_qp = 1; + break; + case IB_QPS_ERR: + case IB_QPS_RESET: + if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_ERROR) { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + } + nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n", + nesqp->hwqp.qp_id); + if (nesqp->term_flags) + del_timer(&nesqp->terminate_timer); + + next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR; + /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */ + if (nesqp->hte_added) { + nes_debug(NES_DBG_MOD_QP, "set CQP_QP_DEL_HTE\n"); + next_iwarp_state |= NES_CQP_QP_DEL_HTE; + 
nesqp->hte_added = 0; + } + if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) && + (nesdev->iw_status) && + (nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) { + next_iwarp_state |= NES_CQP_QP_RESET; + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n", + nesqp->hwqp.qp_id, nesqp->hw_tcp_state); + dont_wait = 1; + } + issue_modify_qp = 1; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR; + break; + default: + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + return -EINVAL; + break; + } + + nesqp->ibqp_state = attr->qp_state; + nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK; + nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n", + nesqp->iwarp_state); + } + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN | + NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WBIND_EN); + issue_modify_qp = 1; + } + + if (nesqp->user_mode) { + nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN | + NES_QPCONTEXT_MISC_RDMA_READ_EN); + issue_modify_qp = 1; + } + } + + original_last_aeq = nesqp->last_aeq; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + + nes_debug(NES_DBG_MOD_QP, "issue_modify_qp=%u\n", issue_modify_qp); + + ret = 0; + + + if (issue_modify_qp) { + nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n"); + ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 1); + if (ret) + nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)" + " failed for QP%u.\n", + next_iwarp_state, nesqp->hwqp.qp_id); + + } + + if ((issue_modify_qp) && (nesqp->ibqp_state > IB_QPS_RTS)) { + nes_debug(NES_DBG_MOD_QP, "QP%u Issued ModifyQP refcount (%d)," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) { + if (dont_wait) { + if (nesqp->cm_id && nesqp->hw_tcp_state != 0) { + nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d)," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + /* this one is for the cm_disconnect thread */ + spin_lock_irqsave(&nesqp->lock, qplockflags); + nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED; + nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT; + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_cm_disconn(nesqp); + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u No fake disconnect, QP refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + } + } else { + spin_lock_irqsave(&nesqp->lock, qplockflags); + if (nesqp->cm_id) { + /* These two are for the timer thread */ + if (atomic_inc_return(&nesqp->close_timer_started) == 1) { + nesqp->cm_id->add_ref(nesqp->cm_id); + nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." 
+ " last_aeq = 0x%04X, scheduling timer.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + schedule_nes_timer(nesqp->cm_node, (struct sk_buff *) nesqp, NES_TIMER_TYPE_CLOSE, 1, 0); + } + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + } else { + spin_unlock_irqrestore(&nesqp->lock, qplockflags); + nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d)," + " need ae to finish up, original_last_aeq = 0x%04X." + " last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + } + } + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + } + } else { + nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up," + " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), + original_last_aeq, nesqp->last_aeq); + } + + err = 0; + + nes_debug(NES_DBG_MOD_QP, "QP%u Leaving, refcount=%d\n", + nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount)); + + return err; +} + + +/** + * nes_muticast_attach + */ +static int nes_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + + +/** + * nes_multicast_detach + */ +static int nes_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + + +/** + * nes_process_mad + */ +static int nes_process_mad(struct ib_device *ibdev, int mad_flags, + u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + nes_debug(NES_DBG_INIT, "\n"); + return -ENOSYS; +} + +static inline void +fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, struct ib_send_wr *ib_wr, u32 uselkey) +{ + int sge_index; + int total_payload_length = 0; + for (sge_index = 0; sge_index < ib_wr->num_sge; sge_index++) { + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_LENGTH0_IDX + (sge_index*4), + ib_wr->sg_list[sge_index].length); + if (uselkey) + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), + (ib_wr->sg_list[sge_index].lkey)); + else + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), 0); + + total_payload_length += ib_wr->sg_list[sge_index].length; + } + nes_debug(NES_DBG_IW_TX, "UC UC UC, sending total_payload_length=%u \n", + total_payload_length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + total_payload_length); +} + +/** + * nes_post_send + */ +static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, + struct ib_send_wr **bad_wr) +{ + u64 u64temp; + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + int err = 0; + u32 qsize = nesqp->hwqp.sq_size; + u32 head; + u32 wqe_misc = 0; + u32 wqe_count = 0; + u32 counter; + + if (nesqp->ibqp_state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.sq_head; + + while (ib_wr) { + /* 
Check for QP error */ + if (nesqp->term_flags) { + err = -EINVAL; + break; + } + + /* Check for SQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { + err = -ENOMEM; + break; + } + + wqe = &nesqp->hwqp.sq_vbase[head]; + /* nes_debug(NES_DBG_IW_TX, "processing sq wqe for QP%u at %p, head = %u.\n", + nesqp->hwqp.qp_id, wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = (u64)(ib_wr->wr_id); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, + u64temp); + switch (ib_wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (IB_WR_SEND == ib_wr->opcode) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + wqe_misc = NES_IWARP_SQ_OP_SENDSE; + else + wqe_misc = NES_IWARP_SQ_OP_SEND; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + wqe_misc = NES_IWARP_SQ_OP_SENDSEINV; + else + wqe_misc = NES_IWARP_SQ_OP_SENDINV; + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, + ib_wr->ex.invalidate_rkey); + } + + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + err = -EINVAL; + break; + } + + if (ib_wr->send_flags & IB_SEND_FENCE) + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + break; + case IB_WR_RDMA_WRITE: + wqe_misc = NES_IWARP_SQ_OP_RDMAW; + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n", + ib_wr->num_sge, nesdev->nesadapter->max_sge); + err = -EINVAL; + break; + } + + if (ib_wr->send_flags & IB_SEND_FENCE) + wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; + + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + rdma_wr(ib_wr)->rkey); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + rdma_wr(ib_wr)->remote_addr); + + if ((ib_wr->send_flags & IB_SEND_INLINE) && + ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) && + (ib_wr->sg_list[0].length <= 64)) { + memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX], + (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX, + ib_wr->sg_list[0].length); + wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA; + } else { + fill_wqe_sg_send(wqe, ib_wr, 1); + } + + wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = + wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]; + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + /* iWARP only supports 1 sge for RDMA reads */ + if (ib_wr->num_sge > 1) { + nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n", + ib_wr->num_sge); + err = -EINVAL; + break; + } + if (ib_wr->opcode == IB_WR_RDMA_READ) { + wqe_misc = NES_IWARP_SQ_OP_RDMAR; + } else { + wqe_misc = NES_IWARP_SQ_OP_RDMAR_LOCINV; + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX, + ib_wr->ex.invalidate_rkey); + } + + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX, + rdma_wr(ib_wr)->remote_addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX, + rdma_wr(ib_wr)->rkey); + set_wqe_32bit_value(wqe->wqe_words, 
NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX, + ib_wr->sg_list->length); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX, + ib_wr->sg_list->addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX, + ib_wr->sg_list->lkey); + break; + case IB_WR_LOCAL_INV: + wqe_misc = NES_IWARP_SQ_OP_LOCINV; + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX, + ib_wr->ex.invalidate_rkey); + break; + case IB_WR_REG_MR: + { + struct nes_mr *mr = to_nesmr(reg_wr(ib_wr)->mr); + int page_shift = ilog2(reg_wr(ib_wr)->mr->page_size); + int flags = reg_wr(ib_wr)->access; + + if (mr->npages > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) { + nes_debug(NES_DBG_IW_TX, "SQ_FMR: bad page_list_len\n"); + err = -EINVAL; + break; + } + wqe_misc = NES_IWARP_SQ_OP_FAST_REG; + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX, + mr->ibmr.iova); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, + lower_32_bits(mr->ibmr.length)); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0); + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX, + reg_wr(ib_wr)->key); + + if (page_shift == 12) { + wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K; + } else if (page_shift == 21) { + wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M; + } else { + nes_debug(NES_DBG_IW_TX, "Invalid page shift," + " ib_wr=%u, max=1\n", ib_wr->num_sge); + err = -EINVAL; + break; + } + + /* Set access_flags */ + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ; + if (flags & IB_ACCESS_LOCAL_WRITE) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE; + + if (flags & IB_ACCESS_REMOTE_WRITE) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE; + + if (flags & IB_ACCESS_REMOTE_READ) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ; + + if (flags & IB_ACCESS_MW_BIND) + wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND; + + /* Fill in PBL info: */ + set_wqe_64bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX, + mr->paddr); + + set_wqe_32bit_value(wqe->wqe_words, + NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX, + mr->npages * 8); + + nes_debug(NES_DBG_IW_TX, "SQ_REG_MR: iova_start: %llx, " + "length: %lld, rkey: %0x, pgl_paddr: %llx, " + "page_list_len: %u, wqe_misc: %x\n", + (unsigned long long) mr->ibmr.iova, + mr->ibmr.length, + reg_wr(ib_wr)->key, + (unsigned long long) mr->paddr, + mr->npages, + wqe_misc); + break; + } + default: + /* error */ + err = -EINVAL; + break; + } + + if (err) + break; + + if ((ib_wr->send_flags & IB_SEND_SIGNALED) || nesqp->sig_all) + wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; + + wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc); + + ib_wr = ib_wr->next; + head++; + wqe_count++; + if (head >= qsize) + head = 0; + + } + + nesqp->hwqp.sq_head = head; + barrier(); + while (wqe_count) { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs + NES_WQE_ALLOC, + (counter << 24) | 0x00800000 | nesqp->hwqp.qp_id); + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + + +/** + * nes_post_recv + */ +static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr) +{ + u64 u64temp; + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_qp *nesqp = to_nesqp(ibqp); + struct nes_hw_qp_wqe *wqe; + int err = 0; + int sge_index; + u32 
qsize = nesqp->hwqp.rq_size; + u32 head; + u32 wqe_count = 0; + u32 counter; + u32 total_payload_length; + + if (nesqp->ibqp_state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + spin_lock_irqsave(&nesqp->lock, flags); + + head = nesqp->hwqp.rq_head; + + while (ib_wr) { + /* Check for QP error */ + if (nesqp->term_flags) { + err = -EINVAL; + break; + } + + if (ib_wr->num_sge > nesdev->nesadapter->max_sge) { + err = -EINVAL; + break; + } + /* Check for RQ overflow */ + if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) { + err = -ENOMEM; + break; + } + + nes_debug(NES_DBG_IW_RX, "ibwr sge count = %u.\n", ib_wr->num_sge); + wqe = &nesqp->hwqp.rq_vbase[head]; + + /* nes_debug(NES_DBG_IW_RX, "QP%u:processing rq wqe at %p, head = %u.\n", + nesqp->hwqp.qp_id, wqe, head); */ + nes_fill_init_qp_wqe(wqe, nesqp, head); + u64temp = (u64)(ib_wr->wr_id); + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, + u64temp); + total_payload_length = 0; + for (sge_index=0; sge_index < ib_wr->num_sge; sge_index++) { + set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_FRAG0_LOW_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].addr); + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_LENGTH0_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].length); + set_wqe_32bit_value(wqe->wqe_words,NES_IWARP_RQ_WQE_STAG0_IDX+(sge_index*4), + ib_wr->sg_list[sge_index].lkey); + + total_payload_length += ib_wr->sg_list[sge_index].length; + } + set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX, + total_payload_length); + + ib_wr = ib_wr->next; + head++; + wqe_count++; + if (head >= qsize) + head = 0; + } + + nesqp->hwqp.rq_head = head; + barrier(); + while (wqe_count) { + counter = min(wqe_count, ((u32)255)); + wqe_count -= counter; + nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter<<24) | nesqp->hwqp.qp_id); + } + + spin_unlock_irqrestore(&nesqp->lock, flags); + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + +/** + * nes_drain_sq - drain sq + * @ibqp: pointer to ibqp + */ +static void nes_drain_sq(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + if (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head) + wait_for_completion(&nesqp->sq_drained); +} + +/** + * nes_drain_rq - drain rq + * @ibqp: pointer to ibqp + */ +static void nes_drain_rq(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + if (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head) + wait_for_completion(&nesqp->rq_drained); +} + +/** + * nes_poll_cq + */ +static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + u64 u64temp; + u64 wrid; + unsigned long flags = 0; + struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_cq *nescq = to_nescq(ibcq); + struct nes_qp *nesqp; + struct nes_hw_cqe cqe; + u32 head; + u32 wq_tail = 0; + u32 cq_size; + u32 cqe_count = 0; + u32 wqe_index; + u32 u32temp; + u32 move_cq_head = 1; + u32 err_code; + + nes_debug(NES_DBG_CQ, "\n"); + + spin_lock_irqsave(&nescq->lock, flags); + + head = nescq->hw_cq.cq_head; + cq_size = nescq->hw_cq.cq_size; + + while (cqe_count < num_entries) { + if ((le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & + NES_CQE_VALID) == 0) + break; + + /* + * Make sure we read CQ entry contents *after* + * we've checked the valid bit. 
+ */ + rmb(); + + cqe = nescq->hw_cq.cq_vbase[head]; + u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1); + u32temp &= ~(NES_SW_CONTEXT_ALIGN-1); + /* parse CQE, get completion context from WQE (either rq or sq) */ + u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | + ((u64)u32temp); + + if (u64temp) { + nesqp = (struct nes_qp *)(unsigned long)u64temp; + memset(entry, 0, sizeof *entry); + if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) { + entry->status = IB_WC_SUCCESS; + } else { + err_code = le32_to_cpu(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX]); + if (NES_IWARP_CQE_MAJOR_DRV == (err_code >> 16)) { + entry->status = err_code & 0x0000ffff; + + /* The rest of the cqe's will be marked as flushed */ + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX] = + cpu_to_le32((NES_IWARP_CQE_MAJOR_FLUSH << 16) | + NES_IWARP_CQE_MINOR_FLUSH); + } else + entry->status = IB_WC_WR_FLUSH_ERR; + } + + entry->qp = &nesqp->ibqp; + entry->src_qp = nesqp->hwqp.qp_id; + + if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) { + if (nesqp->skip_lsmm) { + nesqp->skip_lsmm = 0; + nesqp->hwqp.sq_tail++; + } + + /* Working on a SQ Completion*/ + wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) | + ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX]))); + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]); + + switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. + wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) { + case NES_IWARP_SQ_OP_RDMAW: + nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n"); + entry->opcode = IB_WC_RDMA_WRITE; + break; + case NES_IWARP_SQ_OP_RDMAR: + nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n"); + entry->opcode = IB_WC_RDMA_READ; + entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index]. 
+ wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]); + break; + case NES_IWARP_SQ_OP_SENDINV: + case NES_IWARP_SQ_OP_SENDSEINV: + case NES_IWARP_SQ_OP_SEND: + case NES_IWARP_SQ_OP_SENDSE: + nes_debug(NES_DBG_CQ, "Operation = Send.\n"); + entry->opcode = IB_WC_SEND; + break; + case NES_IWARP_SQ_OP_LOCINV: + entry->opcode = IB_WC_LOCAL_INV; + break; + case NES_IWARP_SQ_OP_FAST_REG: + entry->opcode = IB_WC_REG_MR; + break; + } + + nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1); + if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)) { + move_cq_head = 0; + wq_tail = nesqp->hwqp.sq_tail; + } + } else { + /* Working on a RQ Completion*/ + entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]); + wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) | + ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32); + entry->opcode = IB_WC_RECV; + + nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1); + if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)) { + move_cq_head = 0; + wq_tail = nesqp->hwqp.rq_tail; + } + } + + if (nesqp->iwarp_state > NES_CQP_QP_IWARP_STATE_RTS) { + if (nesqp->hwqp.sq_tail == nesqp->hwqp.sq_head) + complete(&nesqp->sq_drained); + if (nesqp->hwqp.rq_tail == nesqp->hwqp.rq_head) + complete(&nesqp->rq_drained); + } + + entry->wr_id = wrid; + entry++; + cqe_count++; + } + + if (move_cq_head) { + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; + if (++head >= cq_size) + head = 0; + nescq->polled_completions++; + + if ((nescq->polled_completions > (cq_size / 2)) || + (nescq->polled_completions == 255)) { + nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes" + " are pending %u of %u.\n", + nescq->hw_cq.cq_number, nescq->polled_completions, cq_size); + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); + nescq->polled_completions = 0; + } + } else { + /* Update the wqe index and set status to flush */ + wqe_index = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]); + wqe_index = (wqe_index & (~(nesdev->nesadapter->max_qp_wr - 1))) | wq_tail; + nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = + cpu_to_le32(wqe_index); + move_cq_head = 1; /* ready for next pass */ + } + } + + if (nescq->polled_completions) { + nes_write32(nesdev->regs+NES_CQE_ALLOC, + nescq->hw_cq.cq_number | (nescq->polled_completions << 16)); + nescq->polled_completions = 0; + } + + nescq->hw_cq.cq_head = head; + nes_debug(NES_DBG_CQ, "Reporting %u completions for CQ%u.\n", + cqe_count, nescq->hw_cq.cq_number); + + spin_unlock_irqrestore(&nescq->lock, flags); + + return cqe_count; +} + + +/** + * nes_req_notify_cq + */ +static int nes_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) + { + struct nes_vnic *nesvnic = to_nesvnic(ibcq->device); + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_cq *nescq = to_nescq(ibcq); + u32 cq_arm; + + nes_debug(NES_DBG_CQ, "Requesting notification for CQ%u.\n", + nescq->hw_cq.cq_number); + + cq_arm = nescq->hw_cq.cq_number; + if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP) + cq_arm |= NES_CQE_ALLOC_NOTIFY_NEXT; + else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + cq_arm |= NES_CQE_ALLOC_NOTIFY_SE; + else + return -EINVAL; + + nes_write32(nesdev->regs+NES_CQE_ALLOC, cq_arm); + 
nes_read32(nesdev->regs+NES_CQE_ALLOC); + + return 0; +} + +static int nes_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + err = nes_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static void get_dev_fw_str(struct ib_device *dev, char *str) +{ + struct nes_ib_device *nesibdev = + container_of(dev, struct nes_ib_device, ibdev); + struct nes_vnic *nesvnic = nesibdev->nesvnic; + + nes_debug(NES_DBG_INIT, "\n"); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", + (nesvnic->nesdev->nesadapter->firmware_version >> 16), + (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff)); +} + +/** + * nes_init_ofa_device + */ +struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) +{ + struct nes_ib_device *nesibdev; + struct nes_vnic *nesvnic = netdev_priv(netdev); + struct nes_device *nesdev = nesvnic->nesdev; + + nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device)); + if (nesibdev == NULL) { + return NULL; + } + strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX); + nesibdev->ibdev.owner = THIS_MODULE; + + nesibdev->ibdev.node_type = RDMA_NODE_RNIC; + memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid)); + memcpy(&nesibdev->ibdev.node_guid, netdev->dev_addr, 6); + + nesibdev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_BIND_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_POST_SEND); + + nesibdev->ibdev.phys_port_cnt = 1; + nesibdev->ibdev.num_comp_vectors = 1; + nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev; + nesibdev->ibdev.query_device = nes_query_device; + nesibdev->ibdev.query_port = nes_query_port; + nesibdev->ibdev.query_pkey = nes_query_pkey; + nesibdev->ibdev.query_gid = nes_query_gid; + nesibdev->ibdev.alloc_ucontext = nes_alloc_ucontext; + nesibdev->ibdev.dealloc_ucontext = nes_dealloc_ucontext; + nesibdev->ibdev.mmap = nes_mmap; + nesibdev->ibdev.alloc_pd = nes_alloc_pd; + nesibdev->ibdev.dealloc_pd = nes_dealloc_pd; + nesibdev->ibdev.create_ah = nes_create_ah; + nesibdev->ibdev.destroy_ah = nes_destroy_ah; + nesibdev->ibdev.create_qp = nes_create_qp; + nesibdev->ibdev.modify_qp = nes_modify_qp; + nesibdev->ibdev.query_qp = nes_query_qp; + nesibdev->ibdev.destroy_qp = nes_destroy_qp; + nesibdev->ibdev.create_cq = nes_create_cq; + nesibdev->ibdev.destroy_cq = nes_destroy_cq; + nesibdev->ibdev.poll_cq = nes_poll_cq; + nesibdev->ibdev.get_dma_mr = nes_get_dma_mr; + nesibdev->ibdev.reg_user_mr = nes_reg_user_mr; + 
nesibdev->ibdev.dereg_mr = nes_dereg_mr; + nesibdev->ibdev.alloc_mw = nes_alloc_mw; + nesibdev->ibdev.dealloc_mw = nes_dealloc_mw; + + nesibdev->ibdev.alloc_mr = nes_alloc_mr; + nesibdev->ibdev.map_mr_sg = nes_map_mr_sg; + + nesibdev->ibdev.attach_mcast = nes_multicast_attach; + nesibdev->ibdev.detach_mcast = nes_multicast_detach; + nesibdev->ibdev.process_mad = nes_process_mad; + + nesibdev->ibdev.req_notify_cq = nes_req_notify_cq; + nesibdev->ibdev.post_send = nes_post_send; + nesibdev->ibdev.post_recv = nes_post_recv; + nesibdev->ibdev.drain_sq = nes_drain_sq; + nesibdev->ibdev.drain_rq = nes_drain_rq; + + nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL); + if (nesibdev->ibdev.iwcm == NULL) { + ib_dealloc_device(&nesibdev->ibdev); + return NULL; + } + nesibdev->ibdev.iwcm->add_ref = nes_add_ref; + nesibdev->ibdev.iwcm->rem_ref = nes_rem_ref; + nesibdev->ibdev.iwcm->get_qp = nes_get_qp; + nesibdev->ibdev.iwcm->connect = nes_connect; + nesibdev->ibdev.iwcm->accept = nes_accept; + nesibdev->ibdev.iwcm->reject = nes_reject; + nesibdev->ibdev.iwcm->create_listen = nes_create_listen; + nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen; + nesibdev->ibdev.get_port_immutable = nes_port_immutable; + nesibdev->ibdev.get_dev_fw_str = get_dev_fw_str; + memcpy(nesibdev->ibdev.iwcm->ifname, netdev->name, + sizeof(nesibdev->ibdev.iwcm->ifname)); + + return nesibdev; +} + + +/** + * nes_handle_delayed_event + */ +static void nes_handle_delayed_event(struct timer_list *t) +{ + struct nes_vnic *nesvnic = from_timer(nesvnic, t, event_timer); + + if (nesvnic->delayed_event != nesvnic->last_dispatched_event) { + struct ib_event event; + + event.device = &nesvnic->nesibdev->ibdev; + if (!event.device) + goto stop_timer; + event.event = nesvnic->delayed_event; + event.element.port_num = nesvnic->logical_port + 1; + ib_dispatch_event(&event); + } + +stop_timer: + nesvnic->event_timer.function = NULL; +} + + +void nes_port_ibevent(struct nes_vnic *nesvnic) +{ + struct nes_ib_device *nesibdev = nesvnic->nesibdev; + struct nes_device *nesdev = nesvnic->nesdev; + struct ib_event event; + event.device = &nesibdev->ibdev; + event.element.port_num = nesvnic->logical_port + 1; + event.event = nesdev->iw_status ? 
IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + + if (!nesvnic->event_timer.function) { + ib_dispatch_event(&event); + nesvnic->last_dispatched_event = event.event; + nesvnic->event_timer.function = nes_handle_delayed_event; + nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY; + add_timer(&nesvnic->event_timer); + } else { + mod_timer(&nesvnic->event_timer, jiffies + NES_EVENT_DELAY); + } + nesvnic->delayed_event = event.event; +} + + +/** + * nes_destroy_ofa_device + */ +void nes_destroy_ofa_device(struct nes_ib_device *nesibdev) +{ + if (nesibdev == NULL) + return; + + nes_unregister_ofa_device(nesibdev); + + kfree(nesibdev->ibdev.iwcm); + ib_dealloc_device(&nesibdev->ibdev); +} + + +/** + * nes_register_ofa_device + */ +int nes_register_ofa_device(struct nes_ib_device *nesibdev) +{ + struct nes_vnic *nesvnic = nesibdev->nesvnic; + struct nes_device *nesdev = nesvnic->nesdev; + struct nes_adapter *nesadapter = nesdev->nesadapter; + int i, ret; + + nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES; + ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL); + if (ret) { + return ret; + } + + /* Get the resources allocated to this device */ + nesibdev->max_cq = (nesadapter->max_cq-NES_FIRST_QPN) / nesadapter->port_count; + nesibdev->max_mr = nesadapter->max_mr / nesadapter->port_count; + nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count; + nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count; + + for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) { + ret = device_create_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]); + if (ret) { + while (i > 0) { + i--; + device_remove_file(&nesibdev->ibdev.dev, + nes_dev_attributes[i]); + } + ib_unregister_device(&nesibdev->ibdev); + return ret; + } + } + + nesvnic->of_device_registered = 1; + + return 0; +} + + +/** + * nes_unregister_ofa_device + */ +static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev) +{ + struct nes_vnic *nesvnic = nesibdev->nesvnic; + int i; + + for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) { + device_remove_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]); + } + + if (nesvnic->of_device_registered) { + ib_unregister_device(&nesibdev->ibdev); + } + + nesvnic->of_device_registered = 0; +} diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h new file mode 100644 index 0000000..e02a566 --- /dev/null +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef NES_VERBS_H +#define NES_VERBS_H + +struct nes_device; + +#define NES_MAX_USER_DB_REGIONS 4096 +#define NES_MAX_USER_WQ_REGIONS 4096 + +#define NES_TERM_SENT 0x01 +#define NES_TERM_RCVD 0x02 +#define NES_TERM_DONE 0x04 + +struct nes_ucontext { + struct ib_ucontext ibucontext; + struct nes_device *nesdev; + unsigned long mmap_wq_offset; + unsigned long mmap_cq_offset; /* to be removed */ + int index; /* rnic index (minor) */ + unsigned long allocated_doorbells[BITS_TO_LONGS(NES_MAX_USER_DB_REGIONS)]; + u16 mmap_db_index[NES_MAX_USER_DB_REGIONS]; + u16 first_free_db; + unsigned long allocated_wqs[BITS_TO_LONGS(NES_MAX_USER_WQ_REGIONS)]; + struct nes_qp *mmap_nesqp[NES_MAX_USER_WQ_REGIONS]; + u16 first_free_wq; + struct list_head cq_reg_mem_list; + struct list_head qp_reg_mem_list; + u32 mcrqf; + atomic_t usecnt; +}; + +struct nes_pd { + struct ib_pd ibpd; + u16 pd_id; + atomic_t sqp_count; + u16 mmap_db_index; +}; + +struct nes_mr { + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + struct ib_fmr ibfmr; + }; + struct ib_umem *region; + u16 pbls_used; + u8 mode; + u8 pbl_4k; + __le64 *pages; + dma_addr_t paddr; + u32 max_pages; + u32 npages; +}; + +struct nes_hw_pb { + __le32 pa_low; + __le32 pa_high; +}; + +struct nes_vpbl { + dma_addr_t pbl_pbase; + struct nes_hw_pb *pbl_vbase; +}; + +struct nes_root_vpbl { + dma_addr_t pbl_pbase; + struct nes_hw_pb *pbl_vbase; + struct nes_vpbl *leaf_vpbl; +}; + +struct nes_fmr { + struct nes_mr nesmr; + u32 leaf_pbl_cnt; + struct nes_root_vpbl root_vpbl; + struct ib_qp *ib_qp; + int access_rights; + struct ib_fmr_attr attr; +}; + +struct nes_av; + +struct nes_cq { + struct ib_cq ibcq; + struct nes_hw_cq hw_cq; + u32 polled_completions; + u32 cq_mem_size; + spinlock_t lock; + u8 virtual_cq; + u8 pad[3]; + u32 mcrqf; +}; + +struct nes_wq { + spinlock_t lock; +}; + +struct disconn_work { + struct work_struct work; + struct nes_qp *nesqp; +}; + +struct iw_cm_id; +struct ietf_mpa_frame; + +struct nes_qp { + struct ib_qp ibqp; + void *allocated_buffer; + struct iw_cm_id *cm_id; + struct nes_cq *nesscq; + struct nes_cq *nesrcq; + struct nes_pd *nespd; + void *cm_node; /* handle of the node this QP is associated with */ + void *ietf_frame; + u8 ietf_frame_size; + dma_addr_t ietf_frame_pbase; + struct ib_mr *lsmm_mr; + struct nes_hw_qp hwqp; + struct work_struct work; + enum ib_qp_state ibqp_state; + u32 iwarp_state; + u32 hte_index; + u32 last_aeq; + u32 qp_mem_size; + atomic_t refcount; + atomic_t close_timer_started; + u32 mmap_sq_db_index; + u32 mmap_rq_db_index; + spinlock_t lock; + spinlock_t pau_lock; + struct nes_qp_context *nesqp_context; + dma_addr_t nesqp_context_pbase; + void *pbl_vbase; + dma_addr_t pbl_pbase; + struct page *page; + struct timer_list terminate_timer; + enum ib_event_type terminate_eventtype; + struct sk_buff_head pau_list; + u32 pau_rcv_nxt; + u16 active_conn:1; + u16 skip_lsmm:1; + u16 user_mode:1; + u16 hte_added:1; + u16 flush_issued:1; + u16 destroyed:1; + u16 sig_all:1; + u16 pau_mode:1; + u16 rsvd:8; + u16 
private_data_len; + u16 term_sq_flush_code; + u16 term_rq_flush_code; + u8 hw_iwarp_state; + u8 hw_tcp_state; + u8 term_flags; + u8 sq_kmapped; + u8 pau_busy; + u8 pau_pending; + u8 pau_state; + __u64 nesuqp_addr; + struct completion sq_drained; + struct completion rq_drained; +}; + +struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, + u64 addr, u64 size, int acc, u64 *iova_start); + +#endif /* NES_VERBS_H */ diff --git a/drivers/infiniband/hw/ocrdma/Kconfig b/drivers/infiniband/hw/ocrdma/Kconfig new file mode 100644 index 0000000..c0cddc0 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/Kconfig @@ -0,0 +1,8 @@ +config INFINIBAND_OCRDMA + tristate "Emulex One Connect HCA support" + depends on ETHERNET && NETDEVICES && PCI && INET && (IPV6 || IPV6=n) + select NET_VENDOR_EMULEX + select BE2NET + ---help--- + This driver provides low-level InfiniBand over Ethernet + support for Emulex One Connect host channel adapters (HCAs). diff --git a/drivers/infiniband/hw/ocrdma/Makefile b/drivers/infiniband/hw/ocrdma/Makefile new file mode 100644 index 0000000..d1bfd4f --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/Makefile @@ -0,0 +1,5 @@ +ccflags-y := -Idrivers/net/ethernet/emulex/benet + +obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma.o + +ocrdma-y := ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o ocrdma_stats.o diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h new file mode 100644 index 0000000..7baedc7 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -0,0 +1,609 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef __OCRDMA_H__ +#define __OCRDMA_H__ + +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/pci.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> + +#include <be_roce.h> +#include "ocrdma_sli.h" + +#define OCRDMA_ROCE_DRV_VERSION "11.0.0.0" + +#define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver" +#define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA" + +#define OC_NAME_SH OCRDMA_NODE_DESC "(Skyhawk)" +#define OC_NAME_UNKNOWN OCRDMA_NODE_DESC "(Unknown)" + +#define OC_SKH_DEVICE_PF 0x720 +#define OC_SKH_DEVICE_VF 0x728 +#define OCRDMA_MAX_AH 512 + +#define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) + +#define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo) +#define EQ_INTR_PER_SEC_THRSH_HI 150000 +#define EQ_INTR_PER_SEC_THRSH_LOW 100000 +#define EQ_AIC_MAX_EQD 20 +#define EQ_AIC_MIN_EQD 0 + +void ocrdma_eqd_set_task(struct work_struct *work); + +struct ocrdma_dev_attr { + u8 fw_ver[32]; + u32 vendor_id; + u32 device_id; + u16 max_pd; + u16 max_dpp_pds; + u16 max_cq; + u16 max_cqe; + u16 max_qp; + u16 max_wqe; + u16 max_rqe; + u16 max_srq; + u32 max_inline_data; + int max_send_sge; + int max_recv_sge; + int max_srq_sge; + int max_rdma_sge; + int max_mr; + u64 max_mr_size; + u32 max_num_mr_pbl; + int max_mw; + int max_fmr; + int max_map_per_fmr; + int max_pages_per_frmr; + u16 max_ord_per_qp; + u16 max_ird_per_qp; + + int device_cap_flags; + u8 cq_overflow_detect; + u8 srq_supported; + + u32 wqe_size; + u32 rqe_size; + u32 ird_page_size; + u8 local_ca_ack_delay; + u8 ird; + u8 num_ird_pages; + u8 udp_encap; +}; + +struct ocrdma_dma_mem { + void *va; + dma_addr_t pa; + u32 size; +}; + +struct ocrdma_pbl { + void *va; + dma_addr_t pa; +}; + +struct ocrdma_queue_info { + void *va; + dma_addr_t dma; + u32 size; + u16 len; + u16 entry_size; /* Size of an element in the queue */ + u16 id; /* qid, where to ring the doorbell. 
*/ + u16 head, tail; + bool created; +}; + +struct ocrdma_aic_obj { /* Adaptive interrupt coalescing (AIC) info */ + u32 prev_eqd; + u64 eq_intr_cnt; + u64 prev_eq_intr_cnt; +}; + +struct ocrdma_eq { + struct ocrdma_queue_info q; + u32 vector; + int cq_cnt; + struct ocrdma_dev *dev; + char irq_name[32]; + struct ocrdma_aic_obj aic_obj; +}; + +struct ocrdma_mq { + struct ocrdma_queue_info sq; + struct ocrdma_queue_info cq; + bool rearm_cq; +}; + +struct mqe_ctx { + struct mutex lock; /* for serializing mailbox commands on MQ */ + wait_queue_head_t cmd_wait; + u32 tag; + u16 cqe_status; + u16 ext_status; + bool cmd_done; + bool fw_error_state; +}; + +struct ocrdma_hw_mr { + u32 lkey; + u8 fr_mr; + u8 remote_atomic; + u8 remote_rd; + u8 remote_wr; + u8 local_rd; + u8 local_wr; + u8 mw_bind; + u8 rsvd; + u64 len; + struct ocrdma_pbl *pbl_table; + u32 num_pbls; + u32 num_pbes; + u32 pbl_size; + u32 pbe_size; + u64 fbo; + u64 va; +}; + +struct ocrdma_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct ocrdma_hw_mr hwmr; + u64 *pages; + u32 npages; +}; + +struct ocrdma_stats { + u8 type; + struct ocrdma_dev *dev; +}; + +struct ocrdma_pd_resource_mgr { + u32 pd_norm_start; + u16 pd_norm_count; + u16 pd_norm_thrsh; + u16 max_normal_pd; + u32 pd_dpp_start; + u16 pd_dpp_count; + u16 pd_dpp_thrsh; + u16 max_dpp_pd; + u16 dpp_page_index; + unsigned long *pd_norm_bitmap; + unsigned long *pd_dpp_bitmap; + bool pd_prealloc_valid; +}; + +struct stats_mem { + struct ocrdma_mqe mqe; + void *va; + dma_addr_t pa; + u32 size; + char *debugfs_mem; +}; + +struct phy_info { + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + u16 phy_type; + u16 interface_type; +}; + +enum ocrdma_flags { + OCRDMA_FLAGS_LINK_STATUS_INIT = 0x01 +}; + +struct ocrdma_dev { + struct ib_device ibdev; + struct ocrdma_dev_attr attr; + + struct mutex dev_lock; /* provides syncronise access to device data */ + spinlock_t flush_q_lock ____cacheline_aligned; + + struct ocrdma_cq **cq_tbl; + struct ocrdma_qp **qp_tbl; + + struct ocrdma_eq *eq_tbl; + int eq_cnt; + struct delayed_work eqd_work; + u16 base_eqid; + u16 max_eq; + + /* provided synchronization to sgid table for + * updating gid entries triggered by notifier. + */ + spinlock_t sgid_lock; + + int gsi_qp_created; + struct ocrdma_cq *gsi_sqcq; + struct ocrdma_cq *gsi_rqcq; + + struct { + struct ocrdma_av *va; + dma_addr_t pa; + u32 size; + u32 num_ah; + /* provide synchronization for av + * entry allocations. 
+ */ + spinlock_t lock; + u32 ahid; + struct ocrdma_pbl pbl; + } av_tbl; + + void *mbx_cmd; + struct ocrdma_mq mq; + struct mqe_ctx mqe_ctx; + + struct be_dev_info nic_info; + struct phy_info phy; + char model_number[32]; + u32 hba_port_num; + + struct list_head entry; + int id; + u64 *stag_arr; + u8 sl; /* service level */ + bool pfc_state; + atomic_t update_sl; + u16 pvid; + u32 asic_id; + u32 flags; + + ulong last_stats_time; + struct mutex stats_lock; /* provide synch for debugfs operations */ + struct stats_mem stats_mem; + struct ocrdma_stats rsrc_stats; + struct ocrdma_stats rx_stats; + struct ocrdma_stats wqe_stats; + struct ocrdma_stats tx_stats; + struct ocrdma_stats db_err_stats; + struct ocrdma_stats tx_qp_err_stats; + struct ocrdma_stats rx_qp_err_stats; + struct ocrdma_stats tx_dbg_stats; + struct ocrdma_stats rx_dbg_stats; + struct ocrdma_stats driver_stats; + struct ocrdma_stats reset_stats; + struct dentry *dir; + atomic_t async_err_stats[OCRDMA_MAX_ASYNC_ERRORS]; + atomic_t cqe_err_stats[OCRDMA_MAX_CQE_ERR]; + struct ocrdma_pd_resource_mgr *pd_mgr; +}; + +struct ocrdma_cq { + struct ib_cq ibcq; + struct ocrdma_cqe *va; + u32 phase; + u32 getp; /* pointer to pending wrs to + * return to stack, wrap arounds + * at max_hw_cqe + */ + u32 max_hw_cqe; + bool phase_change; + spinlock_t cq_lock ____cacheline_aligned; /* provide synchronization + * to cq polling + */ + /* syncronizes cq completion handler invoked from multiple context */ + spinlock_t comp_handler_lock ____cacheline_aligned; + u16 id; + u16 eqn; + + struct ocrdma_ucontext *ucontext; + dma_addr_t pa; + u32 len; + u32 cqe_cnt; + + /* head of all qp's sq and rq for which cqes need to be flushed + * by the software. + */ + struct list_head sq_head, rq_head; +}; + +struct ocrdma_pd { + struct ib_pd ibpd; + struct ocrdma_ucontext *uctx; + u32 id; + int num_dpp_qp; + u32 dpp_page; + bool dpp_enabled; +}; + +struct ocrdma_ah { + struct ib_ah ibah; + struct ocrdma_av *av; + u16 sgid_index; + u32 id; + u8 hdr_type; +}; + +struct ocrdma_qp_hwq_info { + u8 *va; /* virtual address */ + u32 max_sges; + u32 head, tail; + u32 entry_size; + u32 max_cnt; + u32 max_wqe_idx; + u16 dbid; /* qid, where to ring the doorbell. 
*/ + u32 len; + dma_addr_t pa; +}; + +struct ocrdma_srq { + struct ib_srq ibsrq; + u8 __iomem *db; + struct ocrdma_qp_hwq_info rq; + u64 *rqe_wr_id_tbl; + u32 *idx_bit_fields; + u32 bit_fields_len; + + /* provide synchronization to multiple context(s) posting rqe */ + spinlock_t q_lock ____cacheline_aligned; + + struct ocrdma_pd *pd; + u32 id; +}; + +struct ocrdma_qp { + struct ib_qp ibqp; + + u8 __iomem *sq_db; + struct ocrdma_qp_hwq_info sq; + struct { + uint64_t wrid; + uint16_t dpp_wqe_idx; + uint16_t dpp_wqe; + uint8_t signaled; + uint8_t rsvd[3]; + } *wqe_wr_id_tbl; + u32 max_inline_data; + + /* provide synchronization to multiple context(s) posting wqe, rqe */ + spinlock_t q_lock ____cacheline_aligned; + struct ocrdma_cq *sq_cq; + /* list maintained per CQ to flush SQ errors */ + struct list_head sq_entry; + + u8 __iomem *rq_db; + struct ocrdma_qp_hwq_info rq; + u64 *rqe_wr_id_tbl; + struct ocrdma_cq *rq_cq; + struct ocrdma_srq *srq; + /* list maintained per CQ to flush RQ errors */ + struct list_head rq_entry; + + enum ocrdma_qp_state state; /* QP state */ + int cap_flags; + u32 max_ord, max_ird; + + u32 id; + struct ocrdma_pd *pd; + + enum ib_qp_type qp_type; + + int sgid_idx; + u32 qkey; + bool dpp_enabled; + u8 *ird_q_va; + bool signaled; +}; + +struct ocrdma_ucontext { + struct ib_ucontext ibucontext; + + struct list_head mm_head; + struct mutex mm_list_lock; /* protects list entries of mm type */ + struct ocrdma_pd *cntxt_pd; + int pd_in_use; + + struct { + u32 *va; + dma_addr_t pa; + u32 len; + } ah_tbl; +}; + +struct ocrdma_mm { + struct { + u64 phy_addr; + unsigned long len; + } key; + struct list_head entry; +}; + +static inline struct ocrdma_dev *get_ocrdma_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct ocrdma_dev, ibdev); +} + +static inline struct ocrdma_ucontext *get_ocrdma_ucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct ocrdma_ucontext, ibucontext); +} + +static inline struct ocrdma_pd *get_ocrdma_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct ocrdma_pd, ibpd); +} + +static inline struct ocrdma_cq *get_ocrdma_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct ocrdma_cq, ibcq); +} + +static inline struct ocrdma_qp *get_ocrdma_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct ocrdma_qp, ibqp); +} + +static inline struct ocrdma_mr *get_ocrdma_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct ocrdma_mr, ibmr); +} + +static inline struct ocrdma_ah *get_ocrdma_ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct ocrdma_ah, ibah); +} + +static inline struct ocrdma_srq *get_ocrdma_srq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct ocrdma_srq, ibsrq); +} + +static inline int is_cqe_valid(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe) +{ + int cqe_valid; + cqe_valid = le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_VALID; + return (cqe_valid == cq->phase); +} + +static inline int is_cqe_for_sq(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_QTYPE) ? 0 : 1; +} + +static inline int is_cqe_invalidated(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_INVALIDATE) ? 1 : 0; +} + +static inline int is_cqe_imm(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_IMM) ? 1 : 0; +} + +static inline int is_cqe_wr_imm(struct ocrdma_cqe *cqe) +{ + return (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_WRITE_IMM) ? 
1 : 0; +} + +static inline int ocrdma_resolve_dmac(struct ocrdma_dev *dev, + struct rdma_ah_attr *ah_attr, u8 *mac_addr) +{ + struct in6_addr in6; + + memcpy(&in6, rdma_ah_read_grh(ah_attr)->dgid.raw, sizeof(in6)); + if (rdma_is_multicast_addr(&in6)) + rdma_get_mcast_mac(&in6, mac_addr); + else if (rdma_link_local_addr(&in6)) + rdma_get_ll_mac(&in6, mac_addr); + else + memcpy(mac_addr, ah_attr->roce.dmac, ETH_ALEN); + return 0; +} + +static inline char *hca_name(struct ocrdma_dev *dev) +{ + switch (dev->nic_info.pdev->device) { + case OC_SKH_DEVICE_PF: + case OC_SKH_DEVICE_VF: + return OC_NAME_SH; + default: + return OC_NAME_UNKNOWN; + } +} + +static inline int ocrdma_get_eq_table_index(struct ocrdma_dev *dev, + int eqid) +{ + int indx; + + for (indx = 0; indx < dev->eq_cnt; indx++) { + if (dev->eq_tbl[indx].q.id == eqid) + return indx; + } + + return -EINVAL; +} + +static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev) +{ + if (dev->nic_info.dev_family == 0xF && !dev->asic_id) { + pci_read_config_dword( + dev->nic_info.pdev, + OCRDMA_SLI_ASIC_ID_OFFSET, &dev->asic_id); + } + + return (dev->asic_id & OCRDMA_SLI_ASIC_GEN_NUM_MASK) >> + OCRDMA_SLI_ASIC_GEN_NUM_SHIFT; +} + +static inline u8 ocrdma_get_pfc_prio(u8 *pfc, u8 prio) +{ + return *(pfc + prio); +} + +static inline u8 ocrdma_get_app_prio(u8 *app_prio, u8 prio) +{ + return *(app_prio + prio); +} + +static inline u8 ocrdma_is_enabled_and_synced(u32 state) +{ /* May also be used to interpret TC-state, QCN-state + * Appl-state and Logical-link-state in future. + */ + return (state & OCRDMA_STATE_FLAG_ENABLED) && + (state & OCRDMA_STATE_FLAG_SYNC); +} + +static inline u8 ocrdma_get_ae_link_state(u32 ae_state) +{ + return ((ae_state & OCRDMA_AE_LSC_LS_MASK) >> OCRDMA_AE_LSC_LS_SHIFT); +} + +static inline bool ocrdma_is_udp_encap_supported(struct ocrdma_dev *dev) +{ + return (dev->attr.udp_encap & OCRDMA_L3_TYPE_IPV4) || + (dev->attr.udp_encap & OCRDMA_L3_TYPE_IPV6); +} + +#endif diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c new file mode 100644 index 0000000..3897b64 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -0,0 +1,302 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include +#include + +#include +#include +#include + +#include "ocrdma.h" +#include "ocrdma_verbs.h" +#include "ocrdma_ah.h" +#include "ocrdma_hw.h" +#include "ocrdma_stats.h" + +#define OCRDMA_VID_PCP_SHIFT 0xD + +static u16 ocrdma_hdr_type_to_proto_num(int devid, u8 hdr_type) +{ + switch (hdr_type) { + case OCRDMA_L3_TYPE_IB_GRH: + return (u16)ETH_P_IBOE; + case OCRDMA_L3_TYPE_IPV4: + return (u16)0x0800; + case OCRDMA_L3_TYPE_IPV6: + return (u16)0x86dd; + default: + pr_err("ocrdma%d: Invalid network header\n", devid); + return 0; + } +} + +static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, + struct rdma_ah_attr *attr, union ib_gid *sgid, + int pdid, bool *isvlan, u16 vlan_tag) +{ + int status; + struct ocrdma_eth_vlan eth; + struct ocrdma_grh grh; + int eth_sz; + u16 proto_num = 0; + u8 nxthdr = 0x11; + struct iphdr ipv4; + const struct ib_global_route *ib_grh; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + memset(ð, 0, sizeof(eth)); + memset(&grh, 0, sizeof(grh)); + + /* Protocol Number */ + proto_num = ocrdma_hdr_type_to_proto_num(dev->id, ah->hdr_type); + if (!proto_num) + return -EINVAL; + nxthdr = (proto_num == ETH_P_IBOE) ? 
0x1b : 0x11; + /* VLAN */ + if (!vlan_tag || (vlan_tag > 0xFFF)) + vlan_tag = dev->pvid; + if (vlan_tag || dev->pfc_state) { + if (!vlan_tag) { + pr_err("ocrdma%d:Using VLAN with PFC is recommended\n", + dev->id); + pr_err("ocrdma%d:Using VLAN 0 for this connection\n", + dev->id); + } + eth.eth_type = cpu_to_be16(0x8100); + eth.roce_eth_type = cpu_to_be16(proto_num); + vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT; + eth.vlan_tag = cpu_to_be16(vlan_tag); + eth_sz = sizeof(struct ocrdma_eth_vlan); + *isvlan = true; + } else { + eth.eth_type = cpu_to_be16(proto_num); + eth_sz = sizeof(struct ocrdma_eth_basic); + } + /* MAC */ + memcpy(ð.smac[0], &dev->nic_info.mac_addr[0], ETH_ALEN); + status = ocrdma_resolve_dmac(dev, attr, ð.dmac[0]); + if (status) + return status; + ib_grh = rdma_ah_read_grh(attr); + ah->sgid_index = ib_grh->sgid_index; + /* Eth HDR */ + memcpy(&ah->av->eth_hdr, ð, eth_sz); + if (ah->hdr_type == RDMA_NETWORK_IPV4) { + *((__be16 *)&ipv4) = htons((4 << 12) | (5 << 8) | + ib_grh->traffic_class); + ipv4.id = cpu_to_be16(pdid); + ipv4.frag_off = htons(IP_DF); + ipv4.tot_len = htons(0); + ipv4.ttl = ib_grh->hop_limit; + ipv4.protocol = nxthdr; + rdma_gid2ip(&sgid_addr._sockaddr, sgid); + ipv4.saddr = sgid_addr._sockaddr_in.sin_addr.s_addr; + rdma_gid2ip(&dgid_addr._sockaddr, &ib_grh->dgid); + ipv4.daddr = dgid_addr._sockaddr_in.sin_addr.s_addr; + memcpy((u8 *)ah->av + eth_sz, &ipv4, sizeof(struct iphdr)); + } else { + memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid)); + grh.tclass_flow = cpu_to_be32((6 << 28) | + (ib_grh->traffic_class << 24) | + ib_grh->flow_label); + memcpy(&grh.dgid[0], ib_grh->dgid.raw, + sizeof(ib_grh->dgid.raw)); + grh.pdid_hoplimit = cpu_to_be32((pdid << 16) | + (nxthdr << 8) | + ib_grh->hop_limit); + memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh)); + } + if (*isvlan) + ah->av->valid |= OCRDMA_AV_VLAN_VALID; + ah->av->valid = cpu_to_le32(ah->av->valid); + return status; +} + +struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr, + struct ib_udata *udata) +{ + u32 *ahid_addr; + int status; + struct ocrdma_ah *ah; + bool isvlan = false; + u16 vlan_tag = 0xffff; + struct ib_gid_attr sgid_attr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + const struct ib_global_route *grh; + union ib_gid sgid; + + if ((attr->type != RDMA_AH_ATTR_TYPE_ROCE) || + !(rdma_ah_get_ah_flags(attr) & IB_AH_GRH)) + return ERR_PTR(-EINVAL); + + grh = rdma_ah_read_grh(attr); + if (atomic_cmpxchg(&dev->update_sl, 1, 0)) + ocrdma_init_service_level(dev); + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + status = ocrdma_alloc_av(dev, ah); + if (status) + goto av_err; + + status = ib_get_cached_gid(&dev->ibdev, 1, grh->sgid_index, &sgid, + &sgid_attr); + if (status) { + pr_err("%s(): Failed to query sgid, status = %d\n", + __func__, status); + goto av_conf_err; + } + if (is_vlan_dev(sgid_attr.ndev)) + vlan_tag = vlan_dev_vlan_id(sgid_attr.ndev); + dev_put(sgid_attr.ndev); + /* Get network header type for this GID */ + ah->hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + + status = set_av_attr(dev, ah, attr, &sgid, pd->id, &isvlan, vlan_tag); + if (status) + goto av_conf_err; + + /* if pd is for the user process, pass the ah_id to user space */ + if ((pd->uctx) && (pd->uctx->ah_tbl.va)) { + ahid_addr = pd->uctx->ah_tbl.va + rdma_ah_get_dlid(attr); + *ahid_addr = 0; + *ahid_addr |= ah->id & OCRDMA_AH_ID_MASK; + if 
(ocrdma_is_udp_encap_supported(dev)) { + *ahid_addr |= ((u32)ah->hdr_type & + OCRDMA_AH_L3_TYPE_MASK) << + OCRDMA_AH_L3_TYPE_SHIFT; + } + if (isvlan) + *ahid_addr |= (OCRDMA_AH_VLAN_VALID_MASK << + OCRDMA_AH_VLAN_VALID_SHIFT); + } + + return &ah->ibah; + +av_conf_err: + ocrdma_free_av(dev, ah); +av_err: + kfree(ah); + return ERR_PTR(status); +} + +int ocrdma_destroy_ah(struct ib_ah *ibah) +{ + struct ocrdma_ah *ah = get_ocrdma_ah(ibah); + struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device); + + ocrdma_free_av(dev, ah); + kfree(ah); + return 0; +} + +int ocrdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) +{ + struct ocrdma_ah *ah = get_ocrdma_ah(ibah); + struct ocrdma_av *av = ah->av; + struct ocrdma_grh *grh; + + attr->type = ibah->type; + if (ah->av->valid & OCRDMA_AV_VALID) { + grh = (struct ocrdma_grh *)((u8 *)ah->av + + sizeof(struct ocrdma_eth_vlan)); + rdma_ah_set_sl(attr, be16_to_cpu(av->eth_hdr.vlan_tag) >> 13); + } else { + grh = (struct ocrdma_grh *)((u8 *)ah->av + + sizeof(struct ocrdma_eth_basic)); + rdma_ah_set_sl(attr, 0); + } + rdma_ah_set_grh(attr, NULL, + be32_to_cpu(grh->tclass_flow) & 0xffffffff, + ah->sgid_index, + be32_to_cpu(grh->pdid_hoplimit) & 0xff, + be32_to_cpu(grh->tclass_flow) >> 24); + rdma_ah_set_dgid_raw(attr, &grh->dgid[0]); + return 0; +} + +int ocrdma_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) +{ + /* modify_ah is unsupported */ + return -ENOSYS; +} + +int ocrdma_process_mad(struct ib_device *ibdev, + int process_mad_flags, + u8 port_num, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + int status; + struct ocrdma_dev *dev; + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_PERF_MGMT: + dev = get_ocrdma_dev(ibdev); + if (!ocrdma_pma_counters(dev, out_mad)) + status = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + else + status = IB_MAD_RESULT_SUCCESS; + break; + default: + status = IB_MAD_RESULT_SUCCESS; + break; + } + return status; +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h new file mode 100644 index 0000000..1a65c47 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h @@ -0,0 +1,68 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef __OCRDMA_AH_H__ +#define __OCRDMA_AH_H__ + +enum { + OCRDMA_AH_ID_MASK = 0x3FF, + OCRDMA_AH_VLAN_VALID_MASK = 0x01, + OCRDMA_AH_VLAN_VALID_SHIFT = 0x1F, + OCRDMA_AH_L3_TYPE_MASK = 0x03, + OCRDMA_AH_L3_TYPE_SHIFT = 0x1D /* 29 bits */ +}; + +struct ib_ah *ocrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +int ocrdma_destroy_ah(struct ib_ah *ah); +int ocrdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); +int ocrdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); + +int ocrdma_process_mad(struct ib_device *, + int process_mad_flags, + u8 port_num, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); +#endif /* __OCRDMA_AH_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c new file mode 100644 index 0000000..2c260e1 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -0,0 +1,3254 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "ocrdma.h" +#include "ocrdma_hw.h" +#include "ocrdma_verbs.h" +#include "ocrdma_ah.h" + +enum mbx_status { + OCRDMA_MBX_STATUS_FAILED = 1, + OCRDMA_MBX_STATUS_ILLEGAL_FIELD = 3, + OCRDMA_MBX_STATUS_OOR = 100, + OCRDMA_MBX_STATUS_INVALID_PD = 101, + OCRDMA_MBX_STATUS_PD_INUSE = 102, + OCRDMA_MBX_STATUS_INVALID_CQ = 103, + OCRDMA_MBX_STATUS_INVALID_QP = 104, + OCRDMA_MBX_STATUS_INVALID_LKEY = 105, + OCRDMA_MBX_STATUS_ORD_EXCEEDS = 106, + OCRDMA_MBX_STATUS_IRD_EXCEEDS = 107, + OCRDMA_MBX_STATUS_SENDQ_WQE_EXCEEDS = 108, + OCRDMA_MBX_STATUS_RECVQ_RQE_EXCEEDS = 109, + OCRDMA_MBX_STATUS_SGE_SEND_EXCEEDS = 110, + OCRDMA_MBX_STATUS_SGE_WRITE_EXCEEDS = 111, + OCRDMA_MBX_STATUS_SGE_RECV_EXCEEDS = 112, + OCRDMA_MBX_STATUS_INVALID_STATE_CHANGE = 113, + OCRDMA_MBX_STATUS_MW_BOUND = 114, + OCRDMA_MBX_STATUS_INVALID_VA = 115, + OCRDMA_MBX_STATUS_INVALID_LENGTH = 116, + OCRDMA_MBX_STATUS_INVALID_FBO = 117, + OCRDMA_MBX_STATUS_INVALID_ACC_RIGHTS = 118, + OCRDMA_MBX_STATUS_INVALID_PBE_SIZE = 119, + OCRDMA_MBX_STATUS_INVALID_PBL_ENTRY = 120, + OCRDMA_MBX_STATUS_INVALID_PBL_SHIFT = 121, + OCRDMA_MBX_STATUS_INVALID_SRQ_ID = 129, + OCRDMA_MBX_STATUS_SRQ_ERROR = 133, + OCRDMA_MBX_STATUS_RQE_EXCEEDS = 134, + OCRDMA_MBX_STATUS_MTU_EXCEEDS = 135, + OCRDMA_MBX_STATUS_MAX_QP_EXCEEDS = 136, + OCRDMA_MBX_STATUS_SRQ_LIMIT_EXCEEDS = 137, + OCRDMA_MBX_STATUS_SRQ_SIZE_UNDERUNS = 138, + OCRDMA_MBX_STATUS_QP_BOUND = 130, + OCRDMA_MBX_STATUS_INVALID_CHANGE = 139, + OCRDMA_MBX_STATUS_ATOMIC_OPS_UNSUP = 140, + OCRDMA_MBX_STATUS_INVALID_RNR_NAK_TIMER = 141, + OCRDMA_MBX_STATUS_MW_STILL_BOUND = 142, + OCRDMA_MBX_STATUS_PKEY_INDEX_INVALID = 143, + OCRDMA_MBX_STATUS_PKEY_INDEX_EXCEEDS = 144 +}; + +enum additional_status { + OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES = 22 +}; + +enum cqe_status { + OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_PRIVILEDGES = 1, + OCRDMA_MBX_CQE_STATUS_INVALID_PARAMETER = 2, + OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES = 3, + OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING = 4, + OCRDMA_MBX_CQE_STATUS_DMA_FAILED = 5 +}; + +static inline void *ocrdma_get_eqe(struct ocrdma_eq *eq) +{ + return eq->q.va + (eq->q.tail * sizeof(struct ocrdma_eqe)); +} + +static inline void ocrdma_eq_inc_tail(struct ocrdma_eq *eq) +{ + eq->q.tail = (eq->q.tail + 1) & (OCRDMA_EQ_LEN - 1); +} + +static inline void *ocrdma_get_mcqe(struct ocrdma_dev *dev) +{ + struct ocrdma_mcqe *cqe = (struct ocrdma_mcqe *) + (dev->mq.cq.va + (dev->mq.cq.tail * sizeof(struct ocrdma_mcqe))); + + if (!(le32_to_cpu(cqe->valid_ae_cmpl_cons) & OCRDMA_MCQE_VALID_MASK)) + return NULL; + return cqe; +} + +static inline void ocrdma_mcq_inc_tail(struct ocrdma_dev *dev) +{ + dev->mq.cq.tail = (dev->mq.cq.tail + 1) & (OCRDMA_MQ_CQ_LEN - 1); +} + +static inline struct ocrdma_mqe *ocrdma_get_mqe(struct ocrdma_dev *dev) +{ + 
return dev->mq.sq.va + (dev->mq.sq.head * sizeof(struct ocrdma_mqe)); +} + +static inline void ocrdma_mq_inc_head(struct ocrdma_dev *dev) +{ + dev->mq.sq.head = (dev->mq.sq.head + 1) & (OCRDMA_MQ_LEN - 1); +} + +static inline void *ocrdma_get_mqe_rsp(struct ocrdma_dev *dev) +{ + return dev->mq.sq.va + (dev->mqe_ctx.tag * sizeof(struct ocrdma_mqe)); +} + +enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps) +{ + switch (qps) { + case OCRDMA_QPS_RST: + return IB_QPS_RESET; + case OCRDMA_QPS_INIT: + return IB_QPS_INIT; + case OCRDMA_QPS_RTR: + return IB_QPS_RTR; + case OCRDMA_QPS_RTS: + return IB_QPS_RTS; + case OCRDMA_QPS_SQD: + case OCRDMA_QPS_SQ_DRAINING: + return IB_QPS_SQD; + case OCRDMA_QPS_SQE: + return IB_QPS_SQE; + case OCRDMA_QPS_ERR: + return IB_QPS_ERR; + } + return IB_QPS_ERR; +} + +static enum ocrdma_qp_state get_ocrdma_qp_state(enum ib_qp_state qps) +{ + switch (qps) { + case IB_QPS_RESET: + return OCRDMA_QPS_RST; + case IB_QPS_INIT: + return OCRDMA_QPS_INIT; + case IB_QPS_RTR: + return OCRDMA_QPS_RTR; + case IB_QPS_RTS: + return OCRDMA_QPS_RTS; + case IB_QPS_SQD: + return OCRDMA_QPS_SQD; + case IB_QPS_SQE: + return OCRDMA_QPS_SQE; + case IB_QPS_ERR: + return OCRDMA_QPS_ERR; + } + return OCRDMA_QPS_ERR; +} + +static int ocrdma_get_mbx_errno(u32 status) +{ + int err_num; + u8 mbox_status = (status & OCRDMA_MBX_RSP_STATUS_MASK) >> + OCRDMA_MBX_RSP_STATUS_SHIFT; + u8 add_status = (status & OCRDMA_MBX_RSP_ASTATUS_MASK) >> + OCRDMA_MBX_RSP_ASTATUS_SHIFT; + + switch (mbox_status) { + case OCRDMA_MBX_STATUS_OOR: + case OCRDMA_MBX_STATUS_MAX_QP_EXCEEDS: + err_num = -EAGAIN; + break; + + case OCRDMA_MBX_STATUS_INVALID_PD: + case OCRDMA_MBX_STATUS_INVALID_CQ: + case OCRDMA_MBX_STATUS_INVALID_SRQ_ID: + case OCRDMA_MBX_STATUS_INVALID_QP: + case OCRDMA_MBX_STATUS_INVALID_CHANGE: + case OCRDMA_MBX_STATUS_MTU_EXCEEDS: + case OCRDMA_MBX_STATUS_INVALID_RNR_NAK_TIMER: + case OCRDMA_MBX_STATUS_PKEY_INDEX_INVALID: + case OCRDMA_MBX_STATUS_PKEY_INDEX_EXCEEDS: + case OCRDMA_MBX_STATUS_ILLEGAL_FIELD: + case OCRDMA_MBX_STATUS_INVALID_PBL_ENTRY: + case OCRDMA_MBX_STATUS_INVALID_LKEY: + case OCRDMA_MBX_STATUS_INVALID_VA: + case OCRDMA_MBX_STATUS_INVALID_LENGTH: + case OCRDMA_MBX_STATUS_INVALID_FBO: + case OCRDMA_MBX_STATUS_INVALID_ACC_RIGHTS: + case OCRDMA_MBX_STATUS_INVALID_PBE_SIZE: + case OCRDMA_MBX_STATUS_ATOMIC_OPS_UNSUP: + case OCRDMA_MBX_STATUS_SRQ_ERROR: + case OCRDMA_MBX_STATUS_SRQ_SIZE_UNDERUNS: + err_num = -EINVAL; + break; + + case OCRDMA_MBX_STATUS_PD_INUSE: + case OCRDMA_MBX_STATUS_QP_BOUND: + case OCRDMA_MBX_STATUS_MW_STILL_BOUND: + case OCRDMA_MBX_STATUS_MW_BOUND: + err_num = -EBUSY; + break; + + case OCRDMA_MBX_STATUS_RECVQ_RQE_EXCEEDS: + case OCRDMA_MBX_STATUS_SGE_RECV_EXCEEDS: + case OCRDMA_MBX_STATUS_RQE_EXCEEDS: + case OCRDMA_MBX_STATUS_SRQ_LIMIT_EXCEEDS: + case OCRDMA_MBX_STATUS_ORD_EXCEEDS: + case OCRDMA_MBX_STATUS_IRD_EXCEEDS: + case OCRDMA_MBX_STATUS_SENDQ_WQE_EXCEEDS: + case OCRDMA_MBX_STATUS_SGE_SEND_EXCEEDS: + case OCRDMA_MBX_STATUS_SGE_WRITE_EXCEEDS: + err_num = -ENOBUFS; + break; + + case OCRDMA_MBX_STATUS_FAILED: + switch (add_status) { + case OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES: + err_num = -EAGAIN; + break; + default: + err_num = -EFAULT; + } + break; + default: + err_num = -EFAULT; + } + return err_num; +} + +char *port_speed_string(struct ocrdma_dev *dev) +{ + char *str = ""; + u16 speeds_supported; + + speeds_supported = dev->phy.fixed_speeds_supported | + dev->phy.auto_speeds_supported; + if (speeds_supported & OCRDMA_PHY_SPEED_40GBPS) + str = "40Gbps 
"; + else if (speeds_supported & OCRDMA_PHY_SPEED_10GBPS) + str = "10Gbps "; + else if (speeds_supported & OCRDMA_PHY_SPEED_1GBPS) + str = "1Gbps "; + + return str; +} + +static int ocrdma_get_mbx_cqe_errno(u16 cqe_status) +{ + int err_num = -EINVAL; + + switch (cqe_status) { + case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_PRIVILEDGES: + err_num = -EPERM; + break; + case OCRDMA_MBX_CQE_STATUS_INVALID_PARAMETER: + err_num = -EINVAL; + break; + case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES: + case OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING: + err_num = -EINVAL; + break; + case OCRDMA_MBX_CQE_STATUS_DMA_FAILED: + default: + err_num = -EINVAL; + break; + } + return err_num; +} + +void ocrdma_ring_cq_db(struct ocrdma_dev *dev, u16 cq_id, bool armed, + bool solicited, u16 cqe_popped) +{ + u32 val = cq_id & OCRDMA_DB_CQ_RING_ID_MASK; + + val |= ((cq_id & OCRDMA_DB_CQ_RING_ID_EXT_MASK) << + OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT); + + if (armed) + val |= (1 << OCRDMA_DB_CQ_REARM_SHIFT); + if (solicited) + val |= (1 << OCRDMA_DB_CQ_SOLICIT_SHIFT); + val |= (cqe_popped << OCRDMA_DB_CQ_NUM_POPPED_SHIFT); + iowrite32(val, dev->nic_info.db + OCRDMA_DB_CQ_OFFSET); +} + +static void ocrdma_ring_mq_db(struct ocrdma_dev *dev) +{ + u32 val = 0; + + val |= dev->mq.sq.id & OCRDMA_MQ_ID_MASK; + val |= 1 << OCRDMA_MQ_NUM_MQE_SHIFT; + iowrite32(val, dev->nic_info.db + OCRDMA_DB_MQ_OFFSET); +} + +static void ocrdma_ring_eq_db(struct ocrdma_dev *dev, u16 eq_id, + bool arm, bool clear_int, u16 num_eqe) +{ + u32 val = 0; + + val |= eq_id & OCRDMA_EQ_ID_MASK; + val |= ((eq_id & OCRDMA_EQ_ID_EXT_MASK) << OCRDMA_EQ_ID_EXT_MASK_SHIFT); + if (arm) + val |= (1 << OCRDMA_REARM_SHIFT); + if (clear_int) + val |= (1 << OCRDMA_EQ_CLR_SHIFT); + val |= (1 << OCRDMA_EQ_TYPE_SHIFT); + val |= (num_eqe << OCRDMA_NUM_EQE_SHIFT); + iowrite32(val, dev->nic_info.db + OCRDMA_DB_EQ_OFFSET); +} + +static void ocrdma_init_mch(struct ocrdma_mbx_hdr *cmd_hdr, + u8 opcode, u8 subsys, u32 cmd_len) +{ + cmd_hdr->subsys_op = (opcode | (subsys << OCRDMA_MCH_SUBSYS_SHIFT)); + cmd_hdr->timeout = 20; /* seconds */ + cmd_hdr->cmd_len = cmd_len - sizeof(struct ocrdma_mbx_hdr); +} + +static void *ocrdma_init_emb_mqe(u8 opcode, u32 cmd_len) +{ + struct ocrdma_mqe *mqe; + + mqe = kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL); + if (!mqe) + return NULL; + mqe->hdr.spcl_sge_cnt_emb |= + (OCRDMA_MQE_EMBEDDED << OCRDMA_MQE_HDR_EMB_SHIFT) & + OCRDMA_MQE_HDR_EMB_MASK; + mqe->hdr.pyld_len = cmd_len - sizeof(struct ocrdma_mqe_hdr); + + ocrdma_init_mch(&mqe->u.emb_req.mch, opcode, OCRDMA_SUBSYS_ROCE, + mqe->hdr.pyld_len); + return mqe; +} + +static void ocrdma_free_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q) +{ + dma_free_coherent(&dev->nic_info.pdev->dev, q->size, q->va, q->dma); +} + +static int ocrdma_alloc_q(struct ocrdma_dev *dev, + struct ocrdma_queue_info *q, u16 len, u16 entry_size) +{ + memset(q, 0, sizeof(*q)); + q->len = len; + q->entry_size = entry_size; + q->size = len * entry_size; + q->va = dma_zalloc_coherent(&dev->nic_info.pdev->dev, q->size, + &q->dma, GFP_KERNEL); + if (!q->va) + return -ENOMEM; + return 0; +} + +static void ocrdma_build_q_pages(struct ocrdma_pa *q_pa, int cnt, + dma_addr_t host_pa, int hw_page_size) +{ + int i; + + for (i = 0; i < cnt; i++) { + q_pa[i].lo = (u32) (host_pa & 0xffffffff); + q_pa[i].hi = (u32) upper_32_bits(host_pa); + host_pa += hw_page_size; + } +} + +static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev, + struct ocrdma_queue_info *q, int queue_type) +{ + u8 opcode = 0; + int status; + struct 
ocrdma_delete_q_req *cmd = dev->mbx_cmd; + + switch (queue_type) { + case QTYPE_MCCQ: + opcode = OCRDMA_CMD_DELETE_MQ; + break; + case QTYPE_CQ: + opcode = OCRDMA_CMD_DELETE_CQ; + break; + case QTYPE_EQ: + opcode = OCRDMA_CMD_DELETE_EQ; + break; + default: + BUG(); + } + memset(cmd, 0, sizeof(*cmd)); + ocrdma_init_mch(&cmd->req, opcode, OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + cmd->id = q->id; + + status = be_roce_mcc_cmd(dev->nic_info.netdev, + cmd, sizeof(*cmd), NULL, NULL); + if (!status) + q->created = false; + return status; +} + +static int ocrdma_mbx_create_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + int status; + struct ocrdma_create_eq_req *cmd = dev->mbx_cmd; + struct ocrdma_create_eq_rsp *rsp = dev->mbx_cmd; + + memset(cmd, 0, sizeof(*cmd)); + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_EQ, OCRDMA_SUBSYS_COMMON, + sizeof(*cmd)); + + cmd->req.rsvd_version = 2; + cmd->num_pages = 4; + cmd->valid = OCRDMA_CREATE_EQ_VALID; + cmd->cnt = 4 << OCRDMA_CREATE_EQ_CNT_SHIFT; + + ocrdma_build_q_pages(&cmd->pa[0], cmd->num_pages, eq->q.dma, + PAGE_SIZE_4K); + status = be_roce_mcc_cmd(dev->nic_info.netdev, cmd, sizeof(*cmd), NULL, + NULL); + if (!status) { + eq->q.id = rsp->vector_eqid & 0xffff; + eq->vector = (rsp->vector_eqid >> 16) & 0xffff; + eq->q.created = true; + } + return status; +} + +static int ocrdma_create_eq(struct ocrdma_dev *dev, + struct ocrdma_eq *eq, u16 q_len) +{ + int status; + + status = ocrdma_alloc_q(dev, &eq->q, OCRDMA_EQ_LEN, + sizeof(struct ocrdma_eqe)); + if (status) + return status; + + status = ocrdma_mbx_create_eq(dev, eq); + if (status) + goto mbx_err; + eq->dev = dev; + ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); + + return 0; +mbx_err: + ocrdma_free_q(dev, &eq->q); + return status; +} + +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + int irq; + + if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) + irq = dev->nic_info.pdev->irq; + else + irq = dev->nic_info.msix.vector_list[eq->vector]; + return irq; +} + +static void _ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + if (eq->q.created) { + ocrdma_mbx_delete_q(dev, &eq->q, QTYPE_EQ); + ocrdma_free_q(dev, &eq->q); + } +} + +static void ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq) +{ + int irq; + + /* disarm EQ so that interrupts are not generated + * during freeing and EQ delete is in progress. 
+ */ + ocrdma_ring_eq_db(dev, eq->q.id, false, false, 0); + + irq = ocrdma_get_irq(dev, eq); + free_irq(irq, eq); + _ocrdma_destroy_eq(dev, eq); +} + +static void ocrdma_destroy_eqs(struct ocrdma_dev *dev) +{ + int i; + + for (i = 0; i < dev->eq_cnt; i++) + ocrdma_destroy_eq(dev, &dev->eq_tbl[i]); +} + +static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev, + struct ocrdma_queue_info *cq, + struct ocrdma_queue_info *eq) +{ + struct ocrdma_create_cq_cmd *cmd = dev->mbx_cmd; + struct ocrdma_create_cq_cmd_rsp *rsp = dev->mbx_cmd; + int status; + + memset(cmd, 0, sizeof(*cmd)); + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_CQ, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + cmd->req.rsvd_version = OCRDMA_CREATE_CQ_VER2; + cmd->pgsz_pgcnt = (cq->size / OCRDMA_MIN_Q_PAGE_SIZE) << + OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT; + cmd->pgsz_pgcnt |= PAGES_4K_SPANNED(cq->va, cq->size); + + cmd->ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS; + cmd->eqn = eq->id; + cmd->pdid_cqecnt = cq->size / sizeof(struct ocrdma_mcqe); + + ocrdma_build_q_pages(&cmd->pa[0], cq->size / OCRDMA_MIN_Q_PAGE_SIZE, + cq->dma, PAGE_SIZE_4K); + status = be_roce_mcc_cmd(dev->nic_info.netdev, + cmd, sizeof(*cmd), NULL, NULL); + if (!status) { + cq->id = (u16) (rsp->cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK); + cq->created = true; + } + return status; +} + +static u32 ocrdma_encoded_q_len(int q_len) +{ + u32 len_encoded = fls(q_len); /* log2(len) + 1 */ + + if (len_encoded == 16) + len_encoded = 0; + return len_encoded; +} + +static int ocrdma_mbx_create_mq(struct ocrdma_dev *dev, + struct ocrdma_queue_info *mq, + struct ocrdma_queue_info *cq) +{ + int num_pages, status; + struct ocrdma_create_mq_req *cmd = dev->mbx_cmd; + struct ocrdma_create_mq_rsp *rsp = dev->mbx_cmd; + struct ocrdma_pa *pa; + + memset(cmd, 0, sizeof(*cmd)); + num_pages = PAGES_4K_SPANNED(mq->va, mq->size); + + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_MQ_EXT, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + cmd->req.rsvd_version = 1; + cmd->cqid_pages = num_pages; + cmd->cqid_pages |= (cq->id << OCRDMA_CREATE_MQ_CQ_ID_SHIFT); + cmd->async_cqid_valid = OCRDMA_CREATE_MQ_ASYNC_CQ_VALID; + + cmd->async_event_bitmap = BIT(OCRDMA_ASYNC_GRP5_EVE_CODE); + cmd->async_event_bitmap |= BIT(OCRDMA_ASYNC_RDMA_EVE_CODE); + /* Request link events on this MQ. 
*/ + cmd->async_event_bitmap |= BIT(OCRDMA_ASYNC_LINK_EVE_CODE); + + cmd->async_cqid_ringsize = cq->id; + cmd->async_cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) << + OCRDMA_CREATE_MQ_RING_SIZE_SHIFT); + cmd->valid = OCRDMA_CREATE_MQ_VALID; + pa = &cmd->pa[0]; + + ocrdma_build_q_pages(pa, num_pages, mq->dma, PAGE_SIZE_4K); + status = be_roce_mcc_cmd(dev->nic_info.netdev, + cmd, sizeof(*cmd), NULL, NULL); + if (!status) { + mq->id = rsp->id; + mq->created = true; + } + return status; +} + +static int ocrdma_create_mq(struct ocrdma_dev *dev) +{ + int status; + + /* Alloc completion queue for Mailbox queue */ + status = ocrdma_alloc_q(dev, &dev->mq.cq, OCRDMA_MQ_CQ_LEN, + sizeof(struct ocrdma_mcqe)); + if (status) + goto alloc_err; + + dev->eq_tbl[0].cq_cnt++; + status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->eq_tbl[0].q); + if (status) + goto mbx_cq_free; + + memset(&dev->mqe_ctx, 0, sizeof(dev->mqe_ctx)); + init_waitqueue_head(&dev->mqe_ctx.cmd_wait); + mutex_init(&dev->mqe_ctx.lock); + + /* Alloc Mailbox queue */ + status = ocrdma_alloc_q(dev, &dev->mq.sq, OCRDMA_MQ_LEN, + sizeof(struct ocrdma_mqe)); + if (status) + goto mbx_cq_destroy; + status = ocrdma_mbx_create_mq(dev, &dev->mq.sq, &dev->mq.cq); + if (status) + goto mbx_q_free; + ocrdma_ring_cq_db(dev, dev->mq.cq.id, true, false, 0); + return 0; + +mbx_q_free: + ocrdma_free_q(dev, &dev->mq.sq); +mbx_cq_destroy: + ocrdma_mbx_delete_q(dev, &dev->mq.cq, QTYPE_CQ); +mbx_cq_free: + ocrdma_free_q(dev, &dev->mq.cq); +alloc_err: + return status; +} + +static void ocrdma_destroy_mq(struct ocrdma_dev *dev) +{ + struct ocrdma_queue_info *mbxq, *cq; + + /* mqe_ctx lock synchronizes with any other pending cmds. */ + mutex_lock(&dev->mqe_ctx.lock); + mbxq = &dev->mq.sq; + if (mbxq->created) { + ocrdma_mbx_delete_q(dev, mbxq, QTYPE_MCCQ); + ocrdma_free_q(dev, mbxq); + } + mutex_unlock(&dev->mqe_ctx.lock); + + cq = &dev->mq.cq; + if (cq->created) { + ocrdma_mbx_delete_q(dev, cq, QTYPE_CQ); + ocrdma_free_q(dev, cq); + } +} + +static void ocrdma_process_qpcat_error(struct ocrdma_dev *dev, + struct ocrdma_qp *qp) +{ + enum ib_qp_state new_ib_qps = IB_QPS_ERR; + enum ib_qp_state old_ib_qps; + + if (qp == NULL) + BUG(); + ocrdma_qp_state_change(qp, new_ib_qps, &old_ib_qps); +} + +static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, + struct ocrdma_ae_mcqe *cqe) +{ + struct ocrdma_qp *qp = NULL; + struct ocrdma_cq *cq = NULL; + struct ib_event ib_evt; + int cq_event = 0; + int qp_event = 1; + int srq_event = 0; + int dev_event = 0; + int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >> + OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT; + u16 qpid = cqe->qpvalid_qpid & OCRDMA_AE_MCQE_QPID_MASK; + u16 cqid = cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQID_MASK; + + /* + * Some FW version returns wrong qp or cq ids in CQEs. 
+ * Checking whether the IDs are valid + */ + + if (cqe->qpvalid_qpid & OCRDMA_AE_MCQE_QPVALID) { + if (qpid < dev->attr.max_qp) + qp = dev->qp_tbl[qpid]; + if (qp == NULL) { + pr_err("ocrdma%d:Async event - qpid %u is not valid\n", + dev->id, qpid); + return; + } + } + + if (cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQVALID) { + if (cqid < dev->attr.max_cq) + cq = dev->cq_tbl[cqid]; + if (cq == NULL) { + pr_err("ocrdma%d:Async event - cqid %u is not valid\n", + dev->id, cqid); + return; + } + } + + memset(&ib_evt, 0, sizeof(ib_evt)); + + ib_evt.device = &dev->ibdev; + + switch (type) { + case OCRDMA_CQ_ERROR: + ib_evt.element.cq = &cq->ibcq; + ib_evt.event = IB_EVENT_CQ_ERR; + cq_event = 1; + qp_event = 0; + break; + case OCRDMA_CQ_OVERRUN_ERROR: + ib_evt.element.cq = &cq->ibcq; + ib_evt.event = IB_EVENT_CQ_ERR; + cq_event = 1; + qp_event = 0; + break; + case OCRDMA_CQ_QPCAT_ERROR: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_QP_FATAL; + ocrdma_process_qpcat_error(dev, qp); + break; + case OCRDMA_QP_ACCESS_ERROR: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_QP_ACCESS_ERR; + break; + case OCRDMA_QP_COMM_EST_EVENT: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_COMM_EST; + break; + case OCRDMA_SQ_DRAINED_EVENT: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_SQ_DRAINED; + break; + case OCRDMA_DEVICE_FATAL_EVENT: + ib_evt.element.port_num = 1; + ib_evt.event = IB_EVENT_DEVICE_FATAL; + qp_event = 0; + dev_event = 1; + break; + case OCRDMA_SRQCAT_ERROR: + ib_evt.element.srq = &qp->srq->ibsrq; + ib_evt.event = IB_EVENT_SRQ_ERR; + srq_event = 1; + qp_event = 0; + break; + case OCRDMA_SRQ_LIMIT_EVENT: + ib_evt.element.srq = &qp->srq->ibsrq; + ib_evt.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq_event = 1; + qp_event = 0; + break; + case OCRDMA_QP_LAST_WQE_EVENT: + ib_evt.element.qp = &qp->ibqp; + ib_evt.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + default: + cq_event = 0; + qp_event = 0; + srq_event = 0; + dev_event = 0; + pr_err("%s() unknown type=0x%x\n", __func__, type); + break; + } + + if (type < OCRDMA_MAX_ASYNC_ERRORS) + atomic_inc(&dev->async_err_stats[type]); + + if (qp_event) { + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ib_evt, qp->ibqp.qp_context); + } else if (cq_event) { + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&ib_evt, cq->ibcq.cq_context); + } else if (srq_event) { + if (qp->srq->ibsrq.event_handler) + qp->srq->ibsrq.event_handler(&ib_evt, + qp->srq->ibsrq. + srq_context); + } else if (dev_event) { + pr_err("%s: Fatal event received\n", dev->ibdev.name); + ib_dispatch_event(&ib_evt); + } + +} + +static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev, + struct ocrdma_ae_mcqe *cqe) +{ + struct ocrdma_ae_pvid_mcqe *evt; + int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >> + OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT; + + switch (type) { + case OCRDMA_ASYNC_EVENT_PVID_STATE: + evt = (struct ocrdma_ae_pvid_mcqe *)cqe; + if ((evt->tag_enabled & OCRDMA_AE_PVID_MCQE_ENABLED_MASK) >> + OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT) + dev->pvid = ((evt->tag_enabled & + OCRDMA_AE_PVID_MCQE_TAG_MASK) >> + OCRDMA_AE_PVID_MCQE_TAG_SHIFT); + break; + + case OCRDMA_ASYNC_EVENT_COS_VALUE: + atomic_set(&dev->update_sl, 1); + break; + default: + /* Not interested evts. 
*/ + break; + } +} + +static void ocrdma_process_link_state(struct ocrdma_dev *dev, + struct ocrdma_ae_mcqe *cqe) +{ + struct ocrdma_ae_lnkst_mcqe *evt; + u8 lstate; + + evt = (struct ocrdma_ae_lnkst_mcqe *)cqe; + lstate = ocrdma_get_ae_link_state(evt->speed_state_ptn); + + if (!(lstate & OCRDMA_AE_LSC_LLINK_MASK)) + return; + + if (dev->flags & OCRDMA_FLAGS_LINK_STATUS_INIT) + ocrdma_update_link_state(dev, (lstate & OCRDMA_LINK_ST_MASK)); +} + +static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe) +{ + /* async CQE processing */ + struct ocrdma_ae_mcqe *cqe = ae_cqe; + u32 evt_code = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_CODE_MASK) >> + OCRDMA_AE_MCQE_EVENT_CODE_SHIFT; + switch (evt_code) { + case OCRDMA_ASYNC_LINK_EVE_CODE: + ocrdma_process_link_state(dev, cqe); + break; + case OCRDMA_ASYNC_RDMA_EVE_CODE: + ocrdma_dispatch_ibevent(dev, cqe); + break; + case OCRDMA_ASYNC_GRP5_EVE_CODE: + ocrdma_process_grp5_aync(dev, cqe); + break; + default: + pr_err("%s(%d) invalid evt code=0x%x\n", __func__, + dev->id, evt_code); + } +} + +static void ocrdma_process_mcqe(struct ocrdma_dev *dev, struct ocrdma_mcqe *cqe) +{ + if (dev->mqe_ctx.tag == cqe->tag_lo && dev->mqe_ctx.cmd_done == false) { + dev->mqe_ctx.cqe_status = (cqe->status & + OCRDMA_MCQE_STATUS_MASK) >> OCRDMA_MCQE_STATUS_SHIFT; + dev->mqe_ctx.ext_status = + (cqe->status & OCRDMA_MCQE_ESTATUS_MASK) + >> OCRDMA_MCQE_ESTATUS_SHIFT; + dev->mqe_ctx.cmd_done = true; + wake_up(&dev->mqe_ctx.cmd_wait); + } else + pr_err("%s() cqe for invalid tag0x%x.expected=0x%x\n", + __func__, cqe->tag_lo, dev->mqe_ctx.tag); +} + +static int ocrdma_mq_cq_handler(struct ocrdma_dev *dev, u16 cq_id) +{ + u16 cqe_popped = 0; + struct ocrdma_mcqe *cqe; + + while (1) { + cqe = ocrdma_get_mcqe(dev); + if (cqe == NULL) + break; + ocrdma_le32_to_cpu(cqe, sizeof(*cqe)); + cqe_popped += 1; + if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_AE_MASK) + ocrdma_process_acqe(dev, cqe); + else if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_CMPL_MASK) + ocrdma_process_mcqe(dev, cqe); + memset(cqe, 0, sizeof(struct ocrdma_mcqe)); + ocrdma_mcq_inc_tail(dev); + } + ocrdma_ring_cq_db(dev, dev->mq.cq.id, true, false, cqe_popped); + return 0; +} + +static struct ocrdma_cq *_ocrdma_qp_buddy_cq_handler(struct ocrdma_dev *dev, + struct ocrdma_cq *cq, bool sq) +{ + struct ocrdma_qp *qp; + struct list_head *cur; + struct ocrdma_cq *bcq = NULL; + struct list_head *head = sq?(&cq->sq_head):(&cq->rq_head); + + list_for_each(cur, head) { + if (sq) + qp = list_entry(cur, struct ocrdma_qp, sq_entry); + else + qp = list_entry(cur, struct ocrdma_qp, rq_entry); + + if (qp->srq) + continue; + /* if wq and rq share the same cq, than comp_handler + * is already invoked. + */ + if (qp->sq_cq == qp->rq_cq) + continue; + /* if completion came on sq, rq's cq is buddy cq. + * if completion came on rq, sq's cq is buddy cq. + */ + if (qp->sq_cq == cq) + bcq = qp->rq_cq; + else + bcq = qp->sq_cq; + return bcq; + } + return NULL; +} + +static void ocrdma_qp_buddy_cq_handler(struct ocrdma_dev *dev, + struct ocrdma_cq *cq) +{ + unsigned long flags; + struct ocrdma_cq *bcq = NULL; + + /* Go through list of QPs in error state which are using this CQ + * and invoke its callback handler to trigger CQE processing for + * error/flushed CQE. It is rare to find more than few entries in + * this list as most consumers stops after getting error CQE. + * List is traversed only once when a matching buddy cq found for a QP. + */ + spin_lock_irqsave(&dev->flush_q_lock, flags); + /* Check if buddy CQ is present. 
+ * true - Check for SQ CQ + * false - Check for RQ CQ + */ + bcq = _ocrdma_qp_buddy_cq_handler(dev, cq, true); + if (bcq == NULL) + bcq = _ocrdma_qp_buddy_cq_handler(dev, cq, false); + spin_unlock_irqrestore(&dev->flush_q_lock, flags); + + /* if there is valid buddy cq, look for its completion handler */ + if (bcq && bcq->ibcq.comp_handler) { + spin_lock_irqsave(&bcq->comp_handler_lock, flags); + (*bcq->ibcq.comp_handler) (&bcq->ibcq, bcq->ibcq.cq_context); + spin_unlock_irqrestore(&bcq->comp_handler_lock, flags); + } +} + +static void ocrdma_qp_cq_handler(struct ocrdma_dev *dev, u16 cq_idx) +{ + unsigned long flags; + struct ocrdma_cq *cq; + + if (cq_idx >= OCRDMA_MAX_CQ) + BUG(); + + cq = dev->cq_tbl[cq_idx]; + if (cq == NULL) + return; + + if (cq->ibcq.comp_handler) { + spin_lock_irqsave(&cq->comp_handler_lock, flags); + (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); + spin_unlock_irqrestore(&cq->comp_handler_lock, flags); + } + ocrdma_qp_buddy_cq_handler(dev, cq); +} + +static void ocrdma_cq_handler(struct ocrdma_dev *dev, u16 cq_id) +{ + /* process the MQ-CQE. */ + if (cq_id == dev->mq.cq.id) + ocrdma_mq_cq_handler(dev, cq_id); + else + ocrdma_qp_cq_handler(dev, cq_id); +} + +static irqreturn_t ocrdma_irq_handler(int irq, void *handle) +{ + struct ocrdma_eq *eq = handle; + struct ocrdma_dev *dev = eq->dev; + struct ocrdma_eqe eqe; + struct ocrdma_eqe *ptr; + u16 cq_id; + u8 mcode; + int budget = eq->cq_cnt; + + do { + ptr = ocrdma_get_eqe(eq); + eqe = *ptr; + ocrdma_le32_to_cpu(&eqe, sizeof(eqe)); + mcode = (eqe.id_valid & OCRDMA_EQE_MAJOR_CODE_MASK) + >> OCRDMA_EQE_MAJOR_CODE_SHIFT; + if (mcode == OCRDMA_MAJOR_CODE_SENTINAL) + pr_err("EQ full on eqid = 0x%x, eqe = 0x%x\n", + eq->q.id, eqe.id_valid); + if ((eqe.id_valid & OCRDMA_EQE_VALID_MASK) == 0) + break; + + ptr->id_valid = 0; + /* ring eq doorbell as soon as its consumed. */ + ocrdma_ring_eq_db(dev, eq->q.id, false, true, 1); + /* check whether its CQE or not. */ + if ((eqe.id_valid & OCRDMA_EQE_FOR_CQE_MASK) == 0) { + cq_id = eqe.id_valid >> OCRDMA_EQE_RESOURCE_ID_SHIFT; + ocrdma_cq_handler(dev, cq_id); + } + ocrdma_eq_inc_tail(eq); + + /* There can be a stale EQE after the last bound CQ is + * destroyed. EQE valid and budget == 0 implies this. 
+ */ + if (budget) + budget--; + + } while (budget); + + eq->aic_obj.eq_intr_cnt++; + ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0); + return IRQ_HANDLED; +} + +static void ocrdma_post_mqe(struct ocrdma_dev *dev, struct ocrdma_mqe *cmd) +{ + struct ocrdma_mqe *mqe; + + dev->mqe_ctx.tag = dev->mq.sq.head; + dev->mqe_ctx.cmd_done = false; + mqe = ocrdma_get_mqe(dev); + cmd->hdr.tag_lo = dev->mq.sq.head; + ocrdma_copy_cpu_to_le32(mqe, cmd, sizeof(*mqe)); + /* make sure descriptor is written before ringing doorbell */ + wmb(); + ocrdma_mq_inc_head(dev); + ocrdma_ring_mq_db(dev); +} + +static int ocrdma_wait_mqe_cmpl(struct ocrdma_dev *dev) +{ + long status; + /* 30 sec timeout */ + status = wait_event_timeout(dev->mqe_ctx.cmd_wait, + (dev->mqe_ctx.cmd_done != false), + msecs_to_jiffies(30000)); + if (status) + return 0; + else { + dev->mqe_ctx.fw_error_state = true; + pr_err("%s(%d) mailbox timeout: fw not responding\n", + __func__, dev->id); + return -1; + } +} + +/* issue a mailbox command on the MQ */ +static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe) +{ + int status = 0; + u16 cqe_status, ext_status; + struct ocrdma_mqe *rsp_mqe; + struct ocrdma_mbx_rsp *rsp = NULL; + + mutex_lock(&dev->mqe_ctx.lock); + if (dev->mqe_ctx.fw_error_state) + goto mbx_err; + ocrdma_post_mqe(dev, mqe); + status = ocrdma_wait_mqe_cmpl(dev); + if (status) + goto mbx_err; + cqe_status = dev->mqe_ctx.cqe_status; + ext_status = dev->mqe_ctx.ext_status; + rsp_mqe = ocrdma_get_mqe_rsp(dev); + ocrdma_copy_le32_to_cpu(mqe, rsp_mqe, (sizeof(*mqe))); + if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> + OCRDMA_MQE_HDR_EMB_SHIFT) + rsp = &mqe->u.rsp; + + if (cqe_status || ext_status) { + pr_err("%s() cqe_status=0x%x, ext_status=0x%x,\n", + __func__, cqe_status, ext_status); + if (rsp) { + /* This is for embedded cmds. */ + pr_err("opcode=0x%x, subsystem=0x%x\n", + (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> + OCRDMA_MBX_RSP_OPCODE_SHIFT, + (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> + OCRDMA_MBX_RSP_SUBSYS_SHIFT); + } + status = ocrdma_get_mbx_cqe_errno(cqe_status); + goto mbx_err; + } + /* For non embedded, rsp errors are handled in ocrdma_nonemb_mbx_cmd */ + if (rsp && (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK)) + status = ocrdma_get_mbx_errno(mqe->u.rsp.status); +mbx_err: + mutex_unlock(&dev->mqe_ctx.lock); + return status; +} + +static int ocrdma_nonemb_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe, + void *payload_va) +{ + int status; + struct ocrdma_mbx_rsp *rsp = payload_va; + + if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> + OCRDMA_MQE_HDR_EMB_SHIFT) + BUG(); + + status = ocrdma_mbx_cmd(dev, mqe); + if (!status) + /* For non embedded, only CQE failures are handled in + * ocrdma_mbx_cmd. We need to check for RSP errors. 
+ */ + if (rsp->status & OCRDMA_MBX_RSP_STATUS_MASK) + status = ocrdma_get_mbx_errno(rsp->status); + + if (status) + pr_err("opcode=0x%x, subsystem=0x%x\n", + (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >> + OCRDMA_MBX_RSP_OPCODE_SHIFT, + (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >> + OCRDMA_MBX_RSP_SUBSYS_SHIFT); + return status; +} + +static void ocrdma_get_attr(struct ocrdma_dev *dev, + struct ocrdma_dev_attr *attr, + struct ocrdma_mbx_query_config *rsp) +{ + attr->max_pd = + (rsp->max_pd_ca_ack_delay & OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT; + attr->udp_encap = (rsp->max_pd_ca_ack_delay & + OCRDMA_MBX_QUERY_CFG_L3_TYPE_MASK) >> + OCRDMA_MBX_QUERY_CFG_L3_TYPE_SHIFT; + attr->max_dpp_pds = + (rsp->max_dpp_pds_credits & OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET; + attr->max_qp = + (rsp->qp_srq_cq_ird_ord & OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT; + attr->max_srq = + (rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET; + attr->max_send_sge = ((rsp->max_recv_send_sge & + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT); + attr->max_recv_sge = (rsp->max_recv_send_sge & + OCRDMA_MBX_QUERY_CFG_MAX_RECV_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_RECV_SGE_SHIFT; + attr->max_srq_sge = (rsp->max_srq_rqe_sge & + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET; + attr->max_rdma_sge = (rsp->max_wr_rd_sge & + OCRDMA_MBX_QUERY_CFG_MAX_RD_SGE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_RD_SGE_SHIFT; + attr->max_ord_per_qp = (rsp->max_ird_ord_per_qp & + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT; + attr->max_ird_per_qp = (rsp->max_ird_ord_per_qp & + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT; + attr->cq_overflow_detect = (rsp->qp_srq_cq_ird_ord & + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_MASK) >> + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT; + attr->srq_supported = (rsp->qp_srq_cq_ird_ord & + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_MASK) >> + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_SHIFT; + attr->local_ca_ack_delay = (rsp->max_pd_ca_ack_delay & + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK) >> + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT; + attr->max_mw = rsp->max_mw; + attr->max_mr = rsp->max_mr; + attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) | + rsp->max_mr_size_lo; + attr->max_fmr = 0; + attr->max_pages_per_frmr = rsp->max_pages_per_frmr; + attr->max_num_mr_pbl = rsp->max_num_mr_pbl; + attr->max_cqe = rsp->max_cq_cqes_per_cq & + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK; + attr->max_cq = (rsp->max_cq_cqes_per_cq & + OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET; + attr->wqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs & + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET) * + OCRDMA_WQE_STRIDE; + attr->rqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs & + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_MASK) >> + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET) * + OCRDMA_WQE_STRIDE; + attr->max_inline_data = + attr->wqe_size - (sizeof(struct ocrdma_hdr_wqe) + + sizeof(struct ocrdma_sge)); + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + attr->ird = 1; + attr->ird_page_size = OCRDMA_MIN_Q_PAGE_SIZE; + attr->num_ird_pages = MAX_OCRDMA_IRD_PAGES; + } + dev->attr.max_wqe = rsp->max_wqes_rqes_per_q >> + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET; + dev->attr.max_rqe = 
rsp->max_wqes_rqes_per_q & + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_MASK; +} + +static int ocrdma_check_fw_config(struct ocrdma_dev *dev, + struct ocrdma_fw_conf_rsp *conf) +{ + u32 fn_mode; + + fn_mode = conf->fn_mode & OCRDMA_FN_MODE_RDMA; + if (fn_mode != OCRDMA_FN_MODE_RDMA) + return -EINVAL; + dev->base_eqid = conf->base_eqid; + dev->max_eq = conf->max_eq; + return 0; +} + +/* can be issued only during init time. */ +static int ocrdma_mbx_query_fw_ver(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_fw_ver_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_GET_FW_VER, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_GET_FW_VER, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_fw_ver_rsp *)cmd; + memset(&dev->attr.fw_ver[0], 0, sizeof(dev->attr.fw_ver)); + memcpy(&dev->attr.fw_ver[0], &rsp->running_ver[0], + sizeof(rsp->running_ver)); + ocrdma_le32_to_cpu(dev->attr.fw_ver, sizeof(rsp->running_ver)); +mbx_err: + kfree(cmd); + return status; +} + +/* can be issued only during init time. */ +static int ocrdma_mbx_query_fw_config(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_fw_conf_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_GET_FW_CONFIG, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_GET_FW_CONFIG, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_fw_conf_rsp *)cmd; + status = ocrdma_check_fw_config(dev, rsp); +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *dev, bool reset) +{ + struct ocrdma_rdma_stats_req *req = dev->stats_mem.va; + struct ocrdma_mqe *mqe = &dev->stats_mem.mqe; + struct ocrdma_rdma_stats_resp *old_stats; + int status; + + old_stats = kmalloc(sizeof(*old_stats), GFP_KERNEL); + if (old_stats == NULL) + return -ENOMEM; + + memset(mqe, 0, sizeof(*mqe)); + mqe->hdr.pyld_len = dev->stats_mem.size; + mqe->hdr.spcl_sge_cnt_emb |= + (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dev->stats_mem.pa & 0xffffffff); + mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dev->stats_mem.pa); + mqe->u.nonemb_req.sge[0].len = dev->stats_mem.size; + + /* Cache the old stats */ + memcpy(old_stats, req, sizeof(struct ocrdma_rdma_stats_resp)); + memset(req, 0, dev->stats_mem.size); + + ocrdma_init_mch((struct ocrdma_mbx_hdr *)req, + OCRDMA_CMD_GET_RDMA_STATS, + OCRDMA_SUBSYS_ROCE, + dev->stats_mem.size); + if (reset) + req->reset_stats = reset; + + status = ocrdma_nonemb_mbx_cmd(dev, mqe, dev->stats_mem.va); + if (status) + /* Copy from cache, if mbox fails */ + memcpy(req, old_stats, sizeof(struct ocrdma_rdma_stats_resp)); + else + ocrdma_le32_to_cpu(req, dev->stats_mem.size); + + kfree(old_stats); + return status; +} + +static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_dma_mem dma; + struct ocrdma_mqe *mqe; + struct ocrdma_get_ctrl_attribs_rsp *ctrl_attr_rsp; + struct mgmt_hba_attribs *hba_attribs; + + mqe = kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL); + if (!mqe) + return status; + + dma.size = sizeof(struct ocrdma_get_ctrl_attribs_rsp); + dma.va = dma_alloc_coherent(&dev->nic_info.pdev->dev, + 
dma.size, &dma.pa, GFP_KERNEL); + if (!dma.va) + goto free_mqe; + + mqe->hdr.pyld_len = dma.size; + mqe->hdr.spcl_sge_cnt_emb |= + (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dma.pa & 0xffffffff); + mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa); + mqe->u.nonemb_req.sge[0].len = dma.size; + + memset(dma.va, 0, dma.size); + ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va, + OCRDMA_CMD_GET_CTRL_ATTRIBUTES, + OCRDMA_SUBSYS_COMMON, + dma.size); + + status = ocrdma_nonemb_mbx_cmd(dev, mqe, dma.va); + if (!status) { + ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va; + hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs; + + dev->hba_port_num = (hba_attribs->ptpnum_maxdoms_hbast_cv & + OCRDMA_HBA_ATTRB_PTNUM_MASK) + >> OCRDMA_HBA_ATTRB_PTNUM_SHIFT; + strncpy(dev->model_number, + hba_attribs->controller_model_number, 31); + } + dma_free_coherent(&dev->nic_info.pdev->dev, dma.size, dma.va, dma.pa); +free_mqe: + kfree(mqe); + return status; +} + +static int ocrdma_mbx_query_dev(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mbx_query_config *rsp; + struct ocrdma_mqe *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_CONFIG, sizeof(*cmd)); + if (!cmd) + return status; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_mbx_query_config *)cmd; + ocrdma_get_attr(dev, &dev->attr, rsp); +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed, + u8 *lnk_state) +{ + int status = -ENOMEM; + struct ocrdma_get_link_speed_rsp *rsp; + struct ocrdma_mqe *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1, + sizeof(*cmd)); + if (!cmd) + return status; + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + ((struct ocrdma_mbx_hdr *)cmd->u.cmd)->rsvd_version = 0x1; + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_link_speed_rsp *)cmd; + if (lnk_speed) + *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK) + >> OCRDMA_PHY_PS_SHIFT; + if (lnk_state) + *lnk_state = (rsp->res_lnk_st & OCRDMA_LINK_ST_MASK); + +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + struct ocrdma_mqe *cmd; + struct ocrdma_get_phy_info_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_PHY_DETAILS, sizeof(*cmd)); + if (!cmd) + return status; + + ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0], + OCRDMA_CMD_PHY_DETAILS, OCRDMA_SUBSYS_COMMON, + sizeof(*cmd)); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_phy_info_rsp *)cmd; + dev->phy.phy_type = + (rsp->ityp_ptyp & OCRDMA_PHY_TYPE_MASK); + dev->phy.interface_type = + (rsp->ityp_ptyp & OCRDMA_IF_TYPE_MASK) + >> OCRDMA_IF_TYPE_SHIFT; + dev->phy.auto_speeds_supported = + (rsp->fspeed_aspeed & OCRDMA_ASPEED_SUPP_MASK); + dev->phy.fixed_speeds_supported = + (rsp->fspeed_aspeed & OCRDMA_FSPEED_SUPP_MASK) + >> OCRDMA_FSPEED_SUPP_SHIFT; +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) +{ + int status = -ENOMEM; + struct ocrdma_alloc_pd *cmd; + struct ocrdma_alloc_pd_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD, sizeof(*cmd)); + if (!cmd) + return 
status; + if (pd->dpp_enabled) + cmd->enable_dpp_rsvd |= OCRDMA_ALLOC_PD_ENABLE_DPP; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_alloc_pd_rsp *)cmd; + pd->id = rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_PDID_MASK; + if (rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_DPP) { + pd->dpp_enabled = true; + pd->dpp_page = rsp->dpp_page_pdid >> + OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT; + } else { + pd->dpp_enabled = false; + pd->num_dpp_qp = 0; + } +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) +{ + int status = -ENOMEM; + struct ocrdma_dealloc_pd *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = pd->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + kfree(cmd); + return status; +} + + +static int ocrdma_mbx_alloc_pd_range(struct ocrdma_dev *dev) +{ + int status = -ENOMEM; + size_t pd_bitmap_size; + struct ocrdma_alloc_pd_range *cmd; + struct ocrdma_alloc_pd_range_rsp *rsp; + + /* Pre allocate the DPP PDs */ + if (dev->attr.max_dpp_pds) { + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD_RANGE, + sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->pd_count = dev->attr.max_dpp_pds; + cmd->enable_dpp_rsvd |= OCRDMA_ALLOC_PD_ENABLE_DPP; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + rsp = (struct ocrdma_alloc_pd_range_rsp *)cmd; + + if (!status && (rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_DPP) && + rsp->pd_count) { + dev->pd_mgr->dpp_page_index = rsp->dpp_page_pdid >> + OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT; + dev->pd_mgr->pd_dpp_start = rsp->dpp_page_pdid & + OCRDMA_ALLOC_PD_RNG_RSP_START_PDID_MASK; + dev->pd_mgr->max_dpp_pd = rsp->pd_count; + pd_bitmap_size = + BITS_TO_LONGS(rsp->pd_count) * sizeof(long); + dev->pd_mgr->pd_dpp_bitmap = kzalloc(pd_bitmap_size, + GFP_KERNEL); + } + kfree(cmd); + } + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD_RANGE, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + + cmd->pd_count = dev->attr.max_pd - dev->attr.max_dpp_pds; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + rsp = (struct ocrdma_alloc_pd_range_rsp *)cmd; + if (!status && rsp->pd_count) { + dev->pd_mgr->pd_norm_start = rsp->dpp_page_pdid & + OCRDMA_ALLOC_PD_RNG_RSP_START_PDID_MASK; + dev->pd_mgr->max_normal_pd = rsp->pd_count; + pd_bitmap_size = BITS_TO_LONGS(rsp->pd_count) * sizeof(long); + dev->pd_mgr->pd_norm_bitmap = kzalloc(pd_bitmap_size, + GFP_KERNEL); + } + kfree(cmd); + + if (dev->pd_mgr->pd_norm_bitmap || dev->pd_mgr->pd_dpp_bitmap) { + /* Enable PD resource manager */ + dev->pd_mgr->pd_prealloc_valid = true; + return 0; + } + return status; +} + +static void ocrdma_mbx_dealloc_pd_range(struct ocrdma_dev *dev) +{ + struct ocrdma_dealloc_pd_range *cmd; + + /* return normal PDs to firmware */ + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD_RANGE, sizeof(*cmd)); + if (!cmd) + goto mbx_err; + + if (dev->pd_mgr->max_normal_pd) { + cmd->start_pd_id = dev->pd_mgr->pd_norm_start; + cmd->pd_count = dev->pd_mgr->max_normal_pd; + ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + } + + if (dev->pd_mgr->max_dpp_pd) { + kfree(cmd); + /* return DPP PDs to firmware */ + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD_RANGE, + sizeof(*cmd)); + if (!cmd) + goto mbx_err; + + cmd->start_pd_id = dev->pd_mgr->pd_dpp_start; + cmd->pd_count = dev->pd_mgr->max_dpp_pd; + ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + } +mbx_err: + kfree(cmd); +} + +void ocrdma_alloc_pd_pool(struct ocrdma_dev 
*dev) +{ + int status; + + dev->pd_mgr = kzalloc(sizeof(struct ocrdma_pd_resource_mgr), + GFP_KERNEL); + if (!dev->pd_mgr) + return; + + status = ocrdma_mbx_alloc_pd_range(dev); + if (status) { + pr_err("%s(%d) Unable to initialize PD pool, using default.\n", + __func__, dev->id); + } +} + +static void ocrdma_free_pd_pool(struct ocrdma_dev *dev) +{ + ocrdma_mbx_dealloc_pd_range(dev); + kfree(dev->pd_mgr->pd_norm_bitmap); + kfree(dev->pd_mgr->pd_dpp_bitmap); + kfree(dev->pd_mgr); +} + +static int ocrdma_build_q_conf(u32 *num_entries, int entry_size, + int *num_pages, int *page_size) +{ + int i; + int mem_size; + + *num_entries = roundup_pow_of_two(*num_entries); + mem_size = *num_entries * entry_size; + /* find the possible lowest possible multiplier */ + for (i = 0; i < OCRDMA_MAX_Q_PAGE_SIZE_CNT; i++) { + if (mem_size <= (OCRDMA_Q_PAGE_BASE_SIZE << i)) + break; + } + if (i >= OCRDMA_MAX_Q_PAGE_SIZE_CNT) + return -EINVAL; + mem_size = roundup(mem_size, + ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES)); + *num_pages = + mem_size / ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES); + *page_size = ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES); + *num_entries = mem_size / entry_size; + return 0; +} + +static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev) +{ + int i; + int status = -ENOMEM; + int max_ah; + struct ocrdma_create_ah_tbl *cmd; + struct ocrdma_create_ah_tbl_rsp *rsp; + struct pci_dev *pdev = dev->nic_info.pdev; + dma_addr_t pa; + struct ocrdma_pbe *pbes; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_AH_TBL, sizeof(*cmd)); + if (!cmd) + return status; + + max_ah = OCRDMA_MAX_AH; + dev->av_tbl.size = sizeof(struct ocrdma_av) * max_ah; + + /* number of PBEs in PBL */ + cmd->ah_conf = (OCRDMA_AH_TBL_PAGES << + OCRDMA_CREATE_AH_NUM_PAGES_SHIFT) & + OCRDMA_CREATE_AH_NUM_PAGES_MASK; + + /* page size */ + for (i = 0; i < OCRDMA_MAX_Q_PAGE_SIZE_CNT; i++) { + if (PAGE_SIZE == (OCRDMA_MIN_Q_PAGE_SIZE << i)) + break; + } + cmd->ah_conf |= (i << OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT) & + OCRDMA_CREATE_AH_PAGE_SIZE_MASK; + + /* ah_entry size */ + cmd->ah_conf |= (sizeof(struct ocrdma_av) << + OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT) & + OCRDMA_CREATE_AH_ENTRY_SIZE_MASK; + + dev->av_tbl.pbl.va = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &dev->av_tbl.pbl.pa, + GFP_KERNEL); + if (dev->av_tbl.pbl.va == NULL) + goto mem_err; + + dev->av_tbl.va = dma_alloc_coherent(&pdev->dev, dev->av_tbl.size, + &pa, GFP_KERNEL); + if (dev->av_tbl.va == NULL) + goto mem_err_ah; + dev->av_tbl.pa = pa; + dev->av_tbl.num_ah = max_ah; + memset(dev->av_tbl.va, 0, dev->av_tbl.size); + + pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va; + for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) { + pbes[i].pa_lo = (u32)cpu_to_le32(pa & 0xffffffff); + pbes[i].pa_hi = (u32)cpu_to_le32(upper_32_bits(pa)); + pa += PAGE_SIZE; + } + cmd->tbl_addr[0].lo = (u32)(dev->av_tbl.pbl.pa & 0xFFFFFFFF); + cmd->tbl_addr[0].hi = (u32)upper_32_bits(dev->av_tbl.pbl.pa); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_create_ah_tbl_rsp *)cmd; + dev->av_tbl.ahid = rsp->ahid & 0xFFFF; + kfree(cmd); + return 0; + +mbx_err: + dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va, + dev->av_tbl.pa); + dev->av_tbl.va = NULL; +mem_err_ah: + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va, + dev->av_tbl.pbl.pa); + dev->av_tbl.pbl.va = NULL; + dev->av_tbl.size = 0; +mem_err: + kfree(cmd); + return status; +} + +static void 
ocrdma_mbx_delete_ah_tbl(struct ocrdma_dev *dev) +{ + struct ocrdma_delete_ah_tbl *cmd; + struct pci_dev *pdev = dev->nic_info.pdev; + + if (dev->av_tbl.va == NULL) + return; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_AH_TBL, sizeof(*cmd)); + if (!cmd) + return; + cmd->ahid = dev->av_tbl.ahid; + + ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va, + dev->av_tbl.pa); + dev->av_tbl.va = NULL; + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va, + dev->av_tbl.pbl.pa); + kfree(cmd); +} + +/* Multiple CQs uses the EQ. This routine returns least used + * EQ to associate with CQ. This will distributes the interrupt + * processing and CPU load to associated EQ, vector and so to that CPU. + */ +static u16 ocrdma_bind_eq(struct ocrdma_dev *dev) +{ + int i, selected_eq = 0, cq_cnt = 0; + u16 eq_id; + + mutex_lock(&dev->dev_lock); + cq_cnt = dev->eq_tbl[0].cq_cnt; + eq_id = dev->eq_tbl[0].q.id; + /* find the EQ which is has the least number of + * CQs associated with it. + */ + for (i = 0; i < dev->eq_cnt; i++) { + if (dev->eq_tbl[i].cq_cnt < cq_cnt) { + cq_cnt = dev->eq_tbl[i].cq_cnt; + eq_id = dev->eq_tbl[i].q.id; + selected_eq = i; + } + } + dev->eq_tbl[selected_eq].cq_cnt += 1; + mutex_unlock(&dev->dev_lock); + return eq_id; +} + +static void ocrdma_unbind_eq(struct ocrdma_dev *dev, u16 eq_id) +{ + int i; + + mutex_lock(&dev->dev_lock); + i = ocrdma_get_eq_table_index(dev, eq_id); + if (i == -EINVAL) + BUG(); + dev->eq_tbl[i].cq_cnt -= 1; + mutex_unlock(&dev->dev_lock); +} + +int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, + int entries, int dpp_cq, u16 pd_id) +{ + int status = -ENOMEM; int max_hw_cqe; + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_create_cq *cmd; + struct ocrdma_create_cq_rsp *rsp; + u32 hw_pages, cqe_size, page_size, cqe_count; + + if (entries > dev->attr.max_cqe) { + pr_err("%s(%d) max_cqe=0x%x, requester_cqe=0x%x\n", + __func__, dev->id, dev->attr.max_cqe, entries); + return -EINVAL; + } + if (dpp_cq && (ocrdma_get_asic_type(dev) != OCRDMA_ASIC_GEN_SKH_R)) + return -EINVAL; + + if (dpp_cq) { + cq->max_hw_cqe = 1; + max_hw_cqe = 1; + cqe_size = OCRDMA_DPP_CQE_SIZE; + hw_pages = 1; + } else { + cq->max_hw_cqe = dev->attr.max_cqe; + max_hw_cqe = dev->attr.max_cqe; + cqe_size = sizeof(struct ocrdma_cqe); + hw_pages = OCRDMA_CREATE_CQ_MAX_PAGES; + } + + cq->len = roundup(max_hw_cqe * cqe_size, OCRDMA_MIN_Q_PAGE_SIZE); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_CQ, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_CREATE_CQ, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + cq->va = dma_zalloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL); + if (!cq->va) { + status = -ENOMEM; + goto mem_err; + } + page_size = cq->len / hw_pages; + cmd->cmd.pgsz_pgcnt = (page_size / OCRDMA_MIN_Q_PAGE_SIZE) << + OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT; + cmd->cmd.pgsz_pgcnt |= hw_pages; + cmd->cmd.ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS; + + cq->eqn = ocrdma_bind_eq(dev); + cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER3; + cqe_count = cq->len / cqe_size; + cq->cqe_cnt = cqe_count; + if (cqe_count > 1024) { + /* Set cnt to 3 to indicate more than 1024 cq entries */ + cmd->cmd.ev_cnt_flags |= (0x3 << OCRDMA_CREATE_CQ_CNT_SHIFT); + } else { + u8 count = 0; + switch (cqe_count) { + case 256: + count = 0; + break; + case 512: + count = 1; + break; + case 1024: + count = 2; + break; + default: + goto mbx_err; + } + cmd->cmd.ev_cnt_flags |= (count << 
OCRDMA_CREATE_CQ_CNT_SHIFT); + } + /* shared eq between all the consumer cqs. */ + cmd->cmd.eqn = cq->eqn; + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + if (dpp_cq) + cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP << + OCRDMA_CREATE_CQ_TYPE_SHIFT; + cq->phase_change = false; + cmd->cmd.pdid_cqecnt = (cq->len / cqe_size); + } else { + cmd->cmd.pdid_cqecnt = (cq->len / cqe_size) - 1; + cmd->cmd.ev_cnt_flags |= OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID; + cq->phase_change = true; + } + + /* pd_id valid only for v3 */ + cmd->cmd.pdid_cqecnt |= (pd_id << + OCRDMA_CREATE_CQ_CMD_PDID_SHIFT); + ocrdma_build_q_pages(&cmd->cmd.pa[0], hw_pages, cq->pa, page_size); + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_create_cq_rsp *)cmd; + cq->id = (u16) (rsp->rsp.cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK); + kfree(cmd); + return 0; +mbx_err: + ocrdma_unbind_eq(dev, cq->eqn); + dma_free_coherent(&pdev->dev, cq->len, cq->va, cq->pa); +mem_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) +{ + int status = -ENOMEM; + struct ocrdma_destroy_cq *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_CQ, sizeof(*cmd)); + if (!cmd) + return status; + ocrdma_init_mch(&cmd->req, OCRDMA_CMD_DELETE_CQ, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + cmd->bypass_flush_qid |= + (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) & + OCRDMA_DESTROY_CQ_QID_MASK; + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + ocrdma_unbind_eq(dev, cq->eqn); + dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa); + kfree(cmd); + return status; +} + +int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr, + u32 pdid, int addr_check) +{ + int status = -ENOMEM; + struct ocrdma_alloc_lkey *cmd; + struct ocrdma_alloc_lkey_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_LKEY, sizeof(*cmd)); + if (!cmd) + return status; + cmd->pdid = pdid; + cmd->pbl_sz_flags |= addr_check; + cmd->pbl_sz_flags |= (hwmr->fr_mr << OCRDMA_ALLOC_LKEY_FMR_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->remote_wr << OCRDMA_ALLOC_LKEY_REMOTE_WR_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->remote_rd << OCRDMA_ALLOC_LKEY_REMOTE_RD_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->local_wr << OCRDMA_ALLOC_LKEY_LOCAL_WR_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->remote_atomic << OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_SHIFT); + cmd->pbl_sz_flags |= + (hwmr->num_pbls << OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT); + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_alloc_lkey_rsp *)cmd; + hwmr->lkey = rsp->lrkey; +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *dev, int fr_mr, u32 lkey) +{ + int status; + struct ocrdma_dealloc_lkey *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_LKEY, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->lkey = lkey; + cmd->rsvd_frmr = fr_mr ? 
1 : 0; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + + kfree(cmd); + return status; +} + +static int ocrdma_mbx_reg_mr(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr, + u32 pdid, u32 pbl_cnt, u32 pbe_size, u32 last) +{ + int status = -ENOMEM; + int i; + struct ocrdma_reg_nsmr *cmd; + struct ocrdma_reg_nsmr_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->num_pbl_pdid = + pdid | (hwmr->num_pbls << OCRDMA_REG_NSMR_NUM_PBL_SHIFT); + cmd->fr_mr = hwmr->fr_mr; + + cmd->flags_hpage_pbe_sz |= (hwmr->remote_wr << + OCRDMA_REG_NSMR_REMOTE_WR_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->remote_rd << + OCRDMA_REG_NSMR_REMOTE_RD_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->local_wr << + OCRDMA_REG_NSMR_LOCAL_WR_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->remote_atomic << + OCRDMA_REG_NSMR_REMOTE_ATOMIC_SHIFT); + cmd->flags_hpage_pbe_sz |= (hwmr->mw_bind << + OCRDMA_REG_NSMR_BIND_MEMWIN_SHIFT); + cmd->flags_hpage_pbe_sz |= (last << OCRDMA_REG_NSMR_LAST_SHIFT); + + cmd->flags_hpage_pbe_sz |= (hwmr->pbe_size / OCRDMA_MIN_HPAGE_SIZE); + cmd->flags_hpage_pbe_sz |= (hwmr->pbl_size / OCRDMA_MIN_HPAGE_SIZE) << + OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT; + cmd->totlen_low = hwmr->len; + cmd->totlen_high = upper_32_bits(hwmr->len); + cmd->fbo_low = (u32) (hwmr->fbo & 0xffffffff); + cmd->fbo_high = (u32) upper_32_bits(hwmr->fbo); + cmd->va_loaddr = (u32) hwmr->va; + cmd->va_hiaddr = (u32) upper_32_bits(hwmr->va); + + for (i = 0; i < pbl_cnt; i++) { + cmd->pbl[i].lo = (u32) (hwmr->pbl_table[i].pa & 0xffffffff); + cmd->pbl[i].hi = upper_32_bits(hwmr->pbl_table[i].pa); + } + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_reg_nsmr_rsp *)cmd; + hwmr->lkey = rsp->lrkey; +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_mbx_reg_mr_cont(struct ocrdma_dev *dev, + struct ocrdma_hw_mr *hwmr, u32 pbl_cnt, + u32 pbl_offset, u32 last) +{ + int status; + int i; + struct ocrdma_reg_nsmr_cont *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR_CONT, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + cmd->lrkey = hwmr->lkey; + cmd->num_pbl_offset = (pbl_cnt << OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT) | + (pbl_offset & OCRDMA_REG_NSMR_CONT_PBL_SHIFT_MASK); + cmd->last = last << OCRDMA_REG_NSMR_CONT_LAST_SHIFT; + + for (i = 0; i < pbl_cnt; i++) { + cmd->pbl[i].lo = + (u32) (hwmr->pbl_table[i + pbl_offset].pa & 0xffffffff); + cmd->pbl[i].hi = + upper_32_bits(hwmr->pbl_table[i + pbl_offset].pa); + } + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + + kfree(cmd); + return status; +} + +int ocrdma_reg_mr(struct ocrdma_dev *dev, + struct ocrdma_hw_mr *hwmr, u32 pdid, int acc) +{ + int status; + u32 last = 0; + u32 cur_pbl_cnt, pbl_offset; + u32 pending_pbl_cnt = hwmr->num_pbls; + + pbl_offset = 0; + cur_pbl_cnt = min(pending_pbl_cnt, MAX_OCRDMA_NSMR_PBL); + if (cur_pbl_cnt == pending_pbl_cnt) + last = 1; + + status = ocrdma_mbx_reg_mr(dev, hwmr, pdid, + cur_pbl_cnt, hwmr->pbe_size, last); + if (status) { + pr_err("%s() status=%d\n", __func__, status); + return status; + } + /* if there is no more pbls to register then exit. */ + if (last) + return 0; + + while (!last) { + pbl_offset += cur_pbl_cnt; + pending_pbl_cnt -= cur_pbl_cnt; + cur_pbl_cnt = min(pending_pbl_cnt, MAX_OCRDMA_NSMR_PBL); + /* if we reach the end of the pbls, then need to set the last + * bit, indicating no more pbls to register for this memory key. 
+ */ + if (cur_pbl_cnt == pending_pbl_cnt) + last = 1; + + status = ocrdma_mbx_reg_mr_cont(dev, hwmr, cur_pbl_cnt, + pbl_offset, last); + if (status) + break; + } + if (status) + pr_err("%s() err. status=%d\n", __func__, status); + + return status; +} + +bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *cq, struct ocrdma_qp *qp) +{ + struct ocrdma_qp *tmp; + bool found = false; + list_for_each_entry(tmp, &cq->sq_head, sq_entry) { + if (qp == tmp) { + found = true; + break; + } + } + return found; +} + +bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *cq, struct ocrdma_qp *qp) +{ + struct ocrdma_qp *tmp; + bool found = false; + list_for_each_entry(tmp, &cq->rq_head, rq_entry) { + if (qp == tmp) { + found = true; + break; + } + } + return found; +} + +void ocrdma_flush_qp(struct ocrdma_qp *qp) +{ + bool found; + unsigned long flags; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + + spin_lock_irqsave(&dev->flush_q_lock, flags); + found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp); + if (!found) + list_add_tail(&qp->sq_entry, &qp->sq_cq->sq_head); + if (!qp->srq) { + found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp); + if (!found) + list_add_tail(&qp->rq_entry, &qp->rq_cq->rq_head); + } + spin_unlock_irqrestore(&dev->flush_q_lock, flags); +} + +static void ocrdma_init_hwq_ptr(struct ocrdma_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; +} + +int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state, + enum ib_qp_state *old_ib_state) +{ + unsigned long flags; + enum ocrdma_qp_state new_state; + new_state = get_ocrdma_qp_state(new_ib_state); + + /* sync with wqe and rqe posting */ + spin_lock_irqsave(&qp->q_lock, flags); + + if (old_ib_state) + *old_ib_state = get_ibqp_state(qp->state); + if (new_state == qp->state) { + spin_unlock_irqrestore(&qp->q_lock, flags); + return 1; + } + + + if (new_state == OCRDMA_QPS_INIT) { + ocrdma_init_hwq_ptr(qp); + ocrdma_del_flush_qp(qp); + } else if (new_state == OCRDMA_QPS_ERR) { + ocrdma_flush_qp(qp); + } + + qp->state = new_state; + + spin_unlock_irqrestore(&qp->q_lock, flags); + return 0; +} + +static u32 ocrdma_set_create_qp_mbx_access_flags(struct ocrdma_qp *qp) +{ + u32 flags = 0; + if (qp->cap_flags & OCRDMA_QP_INB_RD) + flags |= OCRDMA_CREATE_QP_REQ_INB_RDEN_MASK; + if (qp->cap_flags & OCRDMA_QP_INB_WR) + flags |= OCRDMA_CREATE_QP_REQ_INB_WREN_MASK; + if (qp->cap_flags & OCRDMA_QP_MW_BIND) + flags |= OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_MASK; + if (qp->cap_flags & OCRDMA_QP_LKEY0) + flags |= OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_MASK; + if (qp->cap_flags & OCRDMA_QP_FAST_REG) + flags |= OCRDMA_CREATE_QP_REQ_FMR_EN_MASK; + return flags; +} + +static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd, + struct ib_qp_init_attr *attrs, + struct ocrdma_qp *qp) +{ + int status; + u32 len, hw_pages, hw_page_size; + dma_addr_t pa; + struct ocrdma_pd *pd = qp->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + struct pci_dev *pdev = dev->nic_info.pdev; + u32 max_wqe_allocated; + u32 max_sges = attrs->cap.max_send_sge; + + /* QP1 may exceed 127 */ + max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1, + dev->attr.max_wqe); + + status = ocrdma_build_q_conf(&max_wqe_allocated, + dev->attr.wqe_size, &hw_pages, &hw_page_size); + if (status) { + pr_err("%s() req. 
max_send_wr=0x%x\n", __func__, + max_wqe_allocated); + return -EINVAL; + } + qp->sq.max_cnt = max_wqe_allocated; + len = (hw_pages * hw_page_size); + + qp->sq.va = dma_zalloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + if (!qp->sq.va) + return -EINVAL; + qp->sq.len = len; + qp->sq.pa = pa; + qp->sq.entry_size = dev->attr.wqe_size; + ocrdma_build_q_pages(&cmd->wq_addr[0], hw_pages, pa, hw_page_size); + + cmd->type_pgsz_pdn |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) + << OCRDMA_CREATE_QP_REQ_SQ_PAGE_SIZE_SHIFT); + cmd->num_wq_rq_pages |= (hw_pages << + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT) & + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_MASK; + cmd->max_sge_send_write |= (max_sges << + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_MASK; + cmd->max_sge_send_write |= (max_sges << + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_MASK; + cmd->max_wqe_rqe |= (ilog2(qp->sq.max_cnt) << + OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_WQE_MASK; + cmd->wqe_rqe_size |= (dev->attr.wqe_size << + OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT) & + OCRDMA_CREATE_QP_REQ_WQE_SIZE_MASK; + return 0; +} + +static int ocrdma_set_create_qp_rq_cmd(struct ocrdma_create_qp_req *cmd, + struct ib_qp_init_attr *attrs, + struct ocrdma_qp *qp) +{ + int status; + u32 len, hw_pages, hw_page_size; + dma_addr_t pa = 0; + struct ocrdma_pd *pd = qp->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + struct pci_dev *pdev = dev->nic_info.pdev; + u32 max_rqe_allocated = attrs->cap.max_recv_wr + 1; + + status = ocrdma_build_q_conf(&max_rqe_allocated, dev->attr.rqe_size, + &hw_pages, &hw_page_size); + if (status) { + pr_err("%s() req. max_recv_wr=0x%x\n", __func__, + attrs->cap.max_recv_wr + 1); + return status; + } + qp->rq.max_cnt = max_rqe_allocated; + len = (hw_pages * hw_page_size); + + qp->rq.va = dma_zalloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + if (!qp->rq.va) + return -ENOMEM; + qp->rq.pa = pa; + qp->rq.len = len; + qp->rq.entry_size = dev->attr.rqe_size; + + ocrdma_build_q_pages(&cmd->rq_addr[0], hw_pages, pa, hw_page_size); + cmd->type_pgsz_pdn |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) << + OCRDMA_CREATE_QP_REQ_RQ_PAGE_SIZE_SHIFT); + cmd->num_wq_rq_pages |= + (hw_pages << OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_SHIFT) & + OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_MASK; + cmd->max_sge_recv_flags |= (attrs->cap.max_recv_sge << + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_MASK; + cmd->max_wqe_rqe |= (ilog2(qp->rq.max_cnt) << + OCRDMA_CREATE_QP_REQ_MAX_RQE_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_RQE_MASK; + cmd->wqe_rqe_size |= (dev->attr.rqe_size << + OCRDMA_CREATE_QP_REQ_RQE_SIZE_SHIFT) & + OCRDMA_CREATE_QP_REQ_RQE_SIZE_MASK; + return 0; +} + +static void ocrdma_set_create_qp_dpp_cmd(struct ocrdma_create_qp_req *cmd, + struct ocrdma_pd *pd, + struct ocrdma_qp *qp, + u8 enable_dpp_cq, u16 dpp_cq_id) +{ + pd->num_dpp_qp--; + qp->dpp_enabled = true; + cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK; + if (!enable_dpp_cq) + return; + cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK; + cmd->dpp_credits_cqid = dpp_cq_id; + cmd->dpp_credits_cqid |= OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT << + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT; +} + +static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd, + struct ocrdma_qp *qp) +{ + struct ocrdma_pd *pd = qp->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + struct pci_dev *pdev = dev->nic_info.pdev; + 
dma_addr_t pa = 0; + int ird_page_size = dev->attr.ird_page_size; + int ird_q_len = dev->attr.num_ird_pages * ird_page_size; + struct ocrdma_hdr_wqe *rqe; + int i = 0; + + if (dev->attr.ird == 0) + return 0; + + qp->ird_q_va = dma_zalloc_coherent(&pdev->dev, ird_q_len, &pa, + GFP_KERNEL); + if (!qp->ird_q_va) + return -ENOMEM; + ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages, + pa, ird_page_size); + for (; i < ird_q_len / dev->attr.rqe_size; i++) { + rqe = (struct ocrdma_hdr_wqe *)(qp->ird_q_va + + (i * dev->attr.rqe_size)); + rqe->cw = 0; + rqe->cw |= 2; + rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + rqe->cw |= (8 << OCRDMA_WQE_SIZE_SHIFT); + rqe->cw |= (8 << OCRDMA_WQE_NXT_WQE_SIZE_SHIFT); + } + return 0; +} + +static void ocrdma_get_create_qp_rsp(struct ocrdma_create_qp_rsp *rsp, + struct ocrdma_qp *qp, + struct ib_qp_init_attr *attrs, + u16 *dpp_offset, u16 *dpp_credit_lmt) +{ + u32 max_wqe_allocated, max_rqe_allocated; + qp->id = rsp->qp_id & OCRDMA_CREATE_QP_RSP_QP_ID_MASK; + qp->rq.dbid = rsp->sq_rq_id & OCRDMA_CREATE_QP_RSP_RQ_ID_MASK; + qp->sq.dbid = rsp->sq_rq_id >> OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT; + qp->max_ird = rsp->max_ord_ird & OCRDMA_CREATE_QP_RSP_MAX_IRD_MASK; + qp->max_ord = (rsp->max_ord_ird >> OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT); + qp->dpp_enabled = false; + if (rsp->dpp_response & OCRDMA_CREATE_QP_RSP_DPP_ENABLED_MASK) { + qp->dpp_enabled = true; + *dpp_credit_lmt = (rsp->dpp_response & + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_MASK) >> + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT; + *dpp_offset = (rsp->dpp_response & + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK) >> + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT; + } + max_wqe_allocated = + rsp->max_wqe_rqe >> OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT; + max_wqe_allocated = 1 << max_wqe_allocated; + max_rqe_allocated = 1 << ((u16)rsp->max_wqe_rqe); + + qp->sq.max_cnt = max_wqe_allocated; + qp->sq.max_wqe_idx = max_wqe_allocated - 1; + + if (!attrs->srq) { + qp->rq.max_cnt = max_rqe_allocated; + qp->rq.max_wqe_idx = max_rqe_allocated - 1; + } +} + +int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs, + u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset, + u16 *dpp_credit_lmt) +{ + int status = -ENOMEM; + u32 flags = 0; + struct ocrdma_pd *pd = qp->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_cq *cq; + struct ocrdma_create_qp_req *cmd; + struct ocrdma_create_qp_rsp *rsp; + int qptype; + + switch (attrs->qp_type) { + case IB_QPT_GSI: + qptype = OCRDMA_QPT_GSI; + break; + case IB_QPT_RC: + qptype = OCRDMA_QPT_RC; + break; + case IB_QPT_UD: + qptype = OCRDMA_QPT_UD; + break; + default: + return -EINVAL; + } + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_QP, sizeof(*cmd)); + if (!cmd) + return status; + cmd->type_pgsz_pdn |= (qptype << OCRDMA_CREATE_QP_REQ_QPT_SHIFT) & + OCRDMA_CREATE_QP_REQ_QPT_MASK; + status = ocrdma_set_create_qp_sq_cmd(cmd, attrs, qp); + if (status) + goto sq_err; + + if (attrs->srq) { + struct ocrdma_srq *srq = get_ocrdma_srq(attrs->srq); + cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_USE_SRQ_MASK; + cmd->rq_addr[0].lo = srq->id; + qp->srq = srq; + } else { + status = ocrdma_set_create_qp_rq_cmd(cmd, attrs, qp); + if (status) + goto rq_err; + } + + status = ocrdma_set_create_qp_ird_cmd(cmd, qp); + if (status) + goto mbx_err; + + cmd->type_pgsz_pdn |= (pd->id << OCRDMA_CREATE_QP_REQ_PD_ID_SHIFT) & + OCRDMA_CREATE_QP_REQ_PD_ID_MASK; + + flags = ocrdma_set_create_qp_mbx_access_flags(qp); + + 
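+ /* Fold the access-capability flags into max_sge_recv_flags; the ORD/IRD + * limits and the send/recv CQ ids are packed into the request below. + */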
cmd->max_sge_recv_flags |= flags; + cmd->max_ord_ird |= (dev->attr.max_ord_per_qp << + OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_ORD_MASK; + cmd->max_ord_ird |= (dev->attr.max_ird_per_qp << + OCRDMA_CREATE_QP_REQ_MAX_IRD_SHIFT) & + OCRDMA_CREATE_QP_REQ_MAX_IRD_MASK; + cq = get_ocrdma_cq(attrs->send_cq); + cmd->wq_rq_cqid |= (cq->id << OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT) & + OCRDMA_CREATE_QP_REQ_WQ_CQID_MASK; + qp->sq_cq = cq; + cq = get_ocrdma_cq(attrs->recv_cq); + cmd->wq_rq_cqid |= (cq->id << OCRDMA_CREATE_QP_REQ_RQ_CQID_SHIFT) & + OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK; + qp->rq_cq = cq; + + if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp && + (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) { + ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq, + dpp_cq_id); + } + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_create_qp_rsp *)cmd; + ocrdma_get_create_qp_rsp(rsp, qp, attrs, dpp_offset, dpp_credit_lmt); + qp->state = OCRDMA_QPS_RST; + kfree(cmd); + return 0; +mbx_err: + if (qp->rq.va) + dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa); +rq_err: + pr_err("%s(%d) rq_err\n", __func__, dev->id); + dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa); +sq_err: + pr_err("%s(%d) sq_err\n", __func__, dev->id); + kfree(cmd); + return status; +} + +int ocrdma_mbx_query_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, + struct ocrdma_qp_params *param) +{ + int status = -ENOMEM; + struct ocrdma_query_qp *cmd; + struct ocrdma_query_qp_rsp *rsp; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_QP, sizeof(*rsp)); + if (!cmd) + return status; + cmd->qp_id = qp->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_query_qp_rsp *)cmd; + memcpy(param, &rsp->params, sizeof(struct ocrdma_qp_params)); +mbx_err: + kfree(cmd); + return status; +} + +static int ocrdma_set_av_params(struct ocrdma_qp *qp, + struct ocrdma_modify_qp *cmd, + struct ib_qp_attr *attrs, + int attr_mask) +{ + int status; + struct rdma_ah_attr *ah_attr = &attrs->ah_attr; + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + u32 vlan_id = 0xFFFF; + u8 mac_addr[6], hdr_type; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + const struct ib_global_route *grh; + + if ((rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) == 0) + return -EINVAL; + grh = rdma_ah_read_grh(ah_attr); + if (atomic_cmpxchg(&dev->update_sl, 1, 0)) + ocrdma_init_service_level(dev); + cmd->params.tclass_sq_psn |= + (grh->traffic_class << OCRDMA_QP_PARAMS_TCLASS_SHIFT); + cmd->params.rnt_rc_sl_fl |= + (grh->flow_label & OCRDMA_QP_PARAMS_FLOW_LABEL_MASK); + cmd->params.rnt_rc_sl_fl |= (rdma_ah_get_sl(ah_attr) << + OCRDMA_QP_PARAMS_SL_SHIFT); + cmd->params.hop_lmt_rq_psn |= + (grh->hop_limit << OCRDMA_QP_PARAMS_HOP_LMT_SHIFT); + cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID; + + /* GIDs */ + memcpy(&cmd->params.dgid[0], &grh->dgid.raw[0], + sizeof(cmd->params.dgid)); + + status = ib_get_cached_gid(&dev->ibdev, 1, grh->sgid_index, + &sgid, &sgid_attr); + if (!status) { + vlan_id = rdma_vlan_dev_vlan_id(sgid_attr.ndev); + memcpy(mac_addr, sgid_attr.ndev->dev_addr, ETH_ALEN); + dev_put(sgid_attr.ndev); + } + + qp->sgid_idx = grh->sgid_index; + memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid)); + status = 
ocrdma_resolve_dmac(dev, ah_attr, &mac_addr[0]); + if (status) + return status; + cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) | + (mac_addr[2] << 16) | (mac_addr[3] << 24); + + hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + if (hdr_type == RDMA_NETWORK_IPV4) { + rdma_gid2ip(&sgid_addr._sockaddr, &sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &grh->dgid); + memcpy(&cmd->params.dgid[0], + &dgid_addr._sockaddr_in.sin_addr.s_addr, 4); + memcpy(&cmd->params.sgid[0], + &sgid_addr._sockaddr_in.sin_addr.s_addr, 4); + } + /* convert them to LE format. */ + ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid)); + ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid)); + cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8); + + if (vlan_id == 0xFFFF) + vlan_id = 0; + if (vlan_id || dev->pfc_state) { + if (!vlan_id) { + pr_err("ocrdma%d:Using VLAN with PFC is recommended\n", + dev->id); + pr_err("ocrdma%d:Using VLAN 0 for this connection\n", + dev->id); + } + cmd->params.vlan_dmac_b4_to_b5 |= + vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT; + cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID; + cmd->params.rnt_rc_sl_fl |= + (dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT; + } + cmd->params.max_sge_recv_flags |= ((hdr_type << + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_SHIFT) & + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_MASK); + return 0; +} + +static int ocrdma_set_qp_params(struct ocrdma_qp *qp, + struct ocrdma_modify_qp *cmd, + struct ib_qp_attr *attrs, int attr_mask) +{ + int status = 0; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + + if (attr_mask & IB_QP_PKEY_INDEX) { + cmd->params.path_mtu_pkey_indx |= (attrs->pkey_index & + OCRDMA_QP_PARAMS_PKEY_INDEX_MASK); + cmd->flags |= OCRDMA_QP_PARA_PKEY_VALID; + } + if (attr_mask & IB_QP_QKEY) { + qp->qkey = attrs->qkey; + cmd->params.qkey = attrs->qkey; + cmd->flags |= OCRDMA_QP_PARA_QKEY_VALID; + } + if (attr_mask & IB_QP_AV) { + status = ocrdma_set_av_params(qp, cmd, attrs, attr_mask); + if (status) + return status; + } else if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_UD) { + /* set the default mac address for UD, GSI QPs */ + cmd->params.dmac_b0_to_b3 = dev->nic_info.mac_addr[0] | + (dev->nic_info.mac_addr[1] << 8) | + (dev->nic_info.mac_addr[2] << 16) | + (dev->nic_info.mac_addr[3] << 24); + cmd->params.vlan_dmac_b4_to_b5 = dev->nic_info.mac_addr[4] | + (dev->nic_info.mac_addr[5] << 8); + } + if ((attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) && + attrs->en_sqd_async_notify) { + cmd->params.max_sge_recv_flags |= + OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC; + cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID; + } + if (attr_mask & IB_QP_DEST_QPN) { + cmd->params.ack_to_rnr_rtc_dest_qpn |= (attrs->dest_qp_num & + OCRDMA_QP_PARAMS_DEST_QPN_MASK); + cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID; + } + if (attr_mask & IB_QP_PATH_MTU) { + if (attrs->path_mtu < IB_MTU_512 || + attrs->path_mtu > IB_MTU_4096) { + pr_err("ocrdma%d: IB MTU %d is not supported\n", + dev->id, ib_mtu_enum_to_int(attrs->path_mtu)); + status = -EINVAL; + goto pmtu_err; + } + cmd->params.path_mtu_pkey_indx |= + (ib_mtu_enum_to_int(attrs->path_mtu) << + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT) & + OCRDMA_QP_PARAMS_PATH_MTU_MASK; + cmd->flags |= OCRDMA_QP_PARA_PMTU_VALID; + } + if (attr_mask & IB_QP_TIMEOUT) { + cmd->params.ack_to_rnr_rtc_dest_qpn |= attrs->timeout << + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT; + cmd->flags |= OCRDMA_QP_PARA_ACK_TO_VALID; + } + if (attr_mask & IB_QP_RETRY_CNT) { + cmd->params.rnt_rc_sl_fl |= (attrs->retry_cnt << + 
OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT) & + OCRDMA_QP_PARAMS_RETRY_CNT_MASK; + cmd->flags |= OCRDMA_QP_PARA_RETRY_CNT_VALID; + } + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + cmd->params.rnt_rc_sl_fl |= (attrs->min_rnr_timer << + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT) & + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_MASK; + cmd->flags |= OCRDMA_QP_PARA_RNT_VALID; + } + if (attr_mask & IB_QP_RNR_RETRY) { + cmd->params.ack_to_rnr_rtc_dest_qpn |= (attrs->rnr_retry << + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT) + & OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK; + cmd->flags |= OCRDMA_QP_PARA_RRC_VALID; + } + if (attr_mask & IB_QP_SQ_PSN) { + cmd->params.tclass_sq_psn |= (attrs->sq_psn & 0x00ffffff); + cmd->flags |= OCRDMA_QP_PARA_SQPSN_VALID; + } + if (attr_mask & IB_QP_RQ_PSN) { + cmd->params.hop_lmt_rq_psn |= (attrs->rq_psn & 0x00ffffff); + cmd->flags |= OCRDMA_QP_PARA_RQPSN_VALID; + } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attrs->max_rd_atomic > dev->attr.max_ord_per_qp) { + status = -EINVAL; + goto pmtu_err; + } + qp->max_ord = attrs->max_rd_atomic; + cmd->flags |= OCRDMA_QP_PARA_MAX_ORD_VALID; + } + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attrs->max_dest_rd_atomic > dev->attr.max_ird_per_qp) { + status = -EINVAL; + goto pmtu_err; + } + qp->max_ird = attrs->max_dest_rd_atomic; + cmd->flags |= OCRDMA_QP_PARA_MAX_IRD_VALID; + } + cmd->params.max_ord_ird = (qp->max_ord << + OCRDMA_QP_PARAMS_MAX_ORD_SHIFT) | + (qp->max_ird & OCRDMA_QP_PARAMS_MAX_IRD_MASK); +pmtu_err: + return status; +} + +int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp, + struct ib_qp_attr *attrs, int attr_mask) +{ + int status = -ENOMEM; + struct ocrdma_modify_qp *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_QP, sizeof(*cmd)); + if (!cmd) + return status; + + cmd->params.id = qp->id; + cmd->flags = 0; + if (attr_mask & IB_QP_STATE) { + cmd->params.max_sge_recv_flags |= + (get_ocrdma_qp_state(attrs->qp_state) << + OCRDMA_QP_PARAMS_STATE_SHIFT) & + OCRDMA_QP_PARAMS_STATE_MASK; + cmd->flags |= OCRDMA_QP_PARA_QPS_VALID; + } else { + cmd->params.max_sge_recv_flags |= + (qp->state << OCRDMA_QP_PARAMS_STATE_SHIFT) & + OCRDMA_QP_PARAMS_STATE_MASK; + } + + status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask); + if (status) + goto mbx_err; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + +mbx_err: + kfree(cmd); + return status; +} + +int ocrdma_mbx_destroy_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp) +{ + int status = -ENOMEM; + struct ocrdma_destroy_qp *cmd; + struct pci_dev *pdev = dev->nic_info.pdev; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_QP, sizeof(*cmd)); + if (!cmd) + return status; + cmd->qp_id = qp->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + +mbx_err: + kfree(cmd); + if (qp->sq.va) + dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa); + if (!qp->srq && qp->rq.va) + dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa); + if (qp->dpp_enabled) + qp->pd->num_dpp_qp++; + return status; +} + +int ocrdma_mbx_create_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq, + struct ib_srq_init_attr *srq_attr, + struct ocrdma_pd *pd) +{ + int status = -ENOMEM; + int hw_pages, hw_page_size; + int len; + struct ocrdma_create_srq_rsp *rsp; + struct ocrdma_create_srq *cmd; + dma_addr_t pa; + struct pci_dev *pdev = dev->nic_info.pdev; + u32 max_rqe_allocated; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + + cmd->pgsz_pdid = pd->id & 
OCRDMA_CREATE_SRQ_PD_ID_MASK; + max_rqe_allocated = srq_attr->attr.max_wr + 1; + status = ocrdma_build_q_conf(&max_rqe_allocated, + dev->attr.rqe_size, + &hw_pages, &hw_page_size); + if (status) { + pr_err("%s() req. max_wr=0x%x\n", __func__, + srq_attr->attr.max_wr); + status = -EINVAL; + goto ret; + } + len = hw_pages * hw_page_size; + srq->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + if (!srq->rq.va) { + status = -ENOMEM; + goto ret; + } + ocrdma_build_q_pages(&cmd->rq_addr[0], hw_pages, pa, hw_page_size); + + srq->rq.entry_size = dev->attr.rqe_size; + srq->rq.pa = pa; + srq->rq.len = len; + srq->rq.max_cnt = max_rqe_allocated; + + cmd->max_sge_rqe = ilog2(max_rqe_allocated); + cmd->max_sge_rqe |= srq_attr->attr.max_sge << + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT; + + cmd->pgsz_pdid |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) + << OCRDMA_CREATE_SRQ_PG_SZ_SHIFT); + cmd->pages_rqe_sz |= (dev->attr.rqe_size + << OCRDMA_CREATE_SRQ_RQE_SIZE_SHIFT) + & OCRDMA_CREATE_SRQ_RQE_SIZE_MASK; + cmd->pages_rqe_sz |= hw_pages << OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT; + + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status) + goto mbx_err; + rsp = (struct ocrdma_create_srq_rsp *)cmd; + srq->id = rsp->id; + srq->rq.dbid = rsp->id; + max_rqe_allocated = ((rsp->max_sge_rqe_allocated & + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_MASK) >> + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_SHIFT); + max_rqe_allocated = (1 << max_rqe_allocated); + srq->rq.max_cnt = max_rqe_allocated; + srq->rq.max_wqe_idx = max_rqe_allocated - 1; + srq->rq.max_sges = (rsp->max_sge_rqe_allocated & + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_MASK) >> + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT; + goto ret; +mbx_err: + dma_free_coherent(&pdev->dev, srq->rq.len, srq->rq.va, pa); +ret: + kfree(cmd); + return status; +} + +int ocrdma_mbx_modify_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr) +{ + int status = -ENOMEM; + struct ocrdma_modify_srq *cmd; + struct ocrdma_pd *pd = srq->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = srq->id; + cmd->limit_max_rqe |= srq_attr->srq_limit << + OCRDMA_MODIFY_SRQ_LIMIT_SHIFT; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + kfree(cmd); + return status; +} + +int ocrdma_mbx_query_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr) +{ + int status = -ENOMEM; + struct ocrdma_query_srq *cmd; + struct ocrdma_dev *dev = get_ocrdma_dev(srq->ibsrq.device); + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = srq->rq.dbid; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (status == 0) { + struct ocrdma_query_srq_rsp *rsp = + (struct ocrdma_query_srq_rsp *)cmd; + srq_attr->max_sge = + rsp->srq_lmt_max_sge & + OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_MASK; + srq_attr->max_wr = + rsp->max_rqe_pdid >> OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT; + srq_attr->srq_limit = rsp->srq_lmt_max_sge >> + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT; + } + kfree(cmd); + return status; +} + +int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq) +{ + int status = -ENOMEM; + struct ocrdma_destroy_srq *cmd; + struct pci_dev *pdev = dev->nic_info.pdev; + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_SRQ, sizeof(*cmd)); + if (!cmd) + return status; + cmd->id = srq->id; + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + if (srq->rq.va) + 
dma_free_coherent(&pdev->dev, srq->rq.len, + srq->rq.va, srq->rq.pa); + kfree(cmd); + return status; +} + +static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype, + struct ocrdma_dcbx_cfg *dcbxcfg) +{ + int status; + dma_addr_t pa; + struct ocrdma_mqe cmd; + + struct ocrdma_get_dcbx_cfg_req *req = NULL; + struct ocrdma_get_dcbx_cfg_rsp *rsp = NULL; + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_mqe_sge *mqe_sge = cmd.u.nonemb_req.sge; + + memset(&cmd, 0, sizeof(struct ocrdma_mqe)); + cmd.hdr.pyld_len = max_t (u32, sizeof(struct ocrdma_get_dcbx_cfg_rsp), + sizeof(struct ocrdma_get_dcbx_cfg_req)); + req = dma_alloc_coherent(&pdev->dev, cmd.hdr.pyld_len, &pa, GFP_KERNEL); + if (!req) { + status = -ENOMEM; + goto mem_err; + } + + cmd.hdr.spcl_sge_cnt_emb |= (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe_sge->pa_lo = (u32) (pa & 0xFFFFFFFFUL); + mqe_sge->pa_hi = (u32) upper_32_bits(pa); + mqe_sge->len = cmd.hdr.pyld_len; + + memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req)); + ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG, + OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len); + req->param_type = ptype; + + status = ocrdma_mbx_cmd(dev, &cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_dcbx_cfg_rsp *)req; + ocrdma_le32_to_cpu(rsp, sizeof(struct ocrdma_get_dcbx_cfg_rsp)); + memcpy(dcbxcfg, &rsp->cfg, sizeof(struct ocrdma_dcbx_cfg)); + +mbx_err: + dma_free_coherent(&pdev->dev, cmd.hdr.pyld_len, req, pa); +mem_err: + return status; +} + +#define OCRDMA_MAX_SERVICE_LEVEL_INDEX 0x08 +#define OCRDMA_DEFAULT_SERVICE_LEVEL 0x05 + +static int ocrdma_parse_dcbxcfg_rsp(struct ocrdma_dev *dev, int ptype, + struct ocrdma_dcbx_cfg *dcbxcfg, + u8 *srvc_lvl) +{ + int status = -EINVAL, indx, slindx; + int ventry_cnt; + struct ocrdma_app_parameter *app_param; + u8 valid, proto_sel; + u8 app_prio, pfc_prio; + u16 proto; + + if (!(dcbxcfg->tcv_aev_opv_st & OCRDMA_DCBX_STATE_MASK)) { + pr_info("%s ocrdma%d DCBX is disabled\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + goto out; + } + + if (!ocrdma_is_enabled_and_synced(dcbxcfg->pfc_state)) { + pr_info("%s ocrdma%d priority flow control(%s) is %s%s\n", + dev_name(&dev->nic_info.pdev->dev), dev->id, + (ptype > 0 ? "operational" : "admin"), + (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_ENABLED) ? + "enabled" : "disabled", + (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_SYNC) ? 
+ "" : ", not sync'ed"); + goto out; + } else { + pr_info("%s ocrdma%d priority flow control is enabled and sync'ed\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + } + + ventry_cnt = (dcbxcfg->tcv_aev_opv_st >> + OCRDMA_DCBX_APP_ENTRY_SHIFT) + & OCRDMA_DCBX_STATE_MASK; + + for (indx = 0; indx < ventry_cnt; indx++) { + app_param = &dcbxcfg->app_param[indx]; + valid = (app_param->valid_proto_app >> + OCRDMA_APP_PARAM_VALID_SHIFT) + & OCRDMA_APP_PARAM_VALID_MASK; + proto_sel = (app_param->valid_proto_app + >> OCRDMA_APP_PARAM_PROTO_SEL_SHIFT) + & OCRDMA_APP_PARAM_PROTO_SEL_MASK; + proto = app_param->valid_proto_app & + OCRDMA_APP_PARAM_APP_PROTO_MASK; + + if ( + valid && proto == ETH_P_IBOE && + proto_sel == OCRDMA_PROTO_SELECT_L2) { + for (slindx = 0; slindx < + OCRDMA_MAX_SERVICE_LEVEL_INDEX; slindx++) { + app_prio = ocrdma_get_app_prio( + (u8 *)app_param->app_prio, + slindx); + pfc_prio = ocrdma_get_pfc_prio( + (u8 *)dcbxcfg->pfc_prio, + slindx); + + if (app_prio && pfc_prio) { + *srvc_lvl = slindx; + status = 0; + goto out; + } + } + if (slindx == OCRDMA_MAX_SERVICE_LEVEL_INDEX) { + pr_info("%s ocrdma%d application priority not set for 0x%x protocol\n", + dev_name(&dev->nic_info.pdev->dev), + dev->id, proto); + } + } + } + +out: + return status; +} + +void ocrdma_init_service_level(struct ocrdma_dev *dev) +{ + int status = 0, indx; + struct ocrdma_dcbx_cfg dcbxcfg; + u8 srvc_lvl = OCRDMA_DEFAULT_SERVICE_LEVEL; + int ptype = OCRDMA_PARAMETER_TYPE_OPER; + + for (indx = 0; indx < 2; indx++) { + status = ocrdma_mbx_get_dcbx_config(dev, ptype, &dcbxcfg); + if (status) { + pr_err("%s(): status=%d\n", __func__, status); + ptype = OCRDMA_PARAMETER_TYPE_ADMIN; + continue; + } + + status = ocrdma_parse_dcbxcfg_rsp(dev, ptype, + &dcbxcfg, &srvc_lvl); + if (status) { + ptype = OCRDMA_PARAMETER_TYPE_ADMIN; + continue; + } + + break; + } + + if (status) + pr_info("%s ocrdma%d service level default\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + else + pr_info("%s ocrdma%d service level %d\n", + dev_name(&dev->nic_info.pdev->dev), dev->id, + srvc_lvl); + + dev->pfc_state = ocrdma_is_enabled_and_synced(dcbxcfg.pfc_state); + dev->sl = srvc_lvl; +} + +int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah) +{ + int i; + int status = -EINVAL; + struct ocrdma_av *av; + unsigned long flags; + + av = dev->av_tbl.va; + spin_lock_irqsave(&dev->av_tbl.lock, flags); + for (i = 0; i < dev->av_tbl.num_ah; i++) { + if (av->valid == 0) { + av->valid = OCRDMA_AV_VALID; + ah->av = av; + ah->id = i; + status = 0; + break; + } + av++; + } + if (i == dev->av_tbl.num_ah) + status = -EAGAIN; + spin_unlock_irqrestore(&dev->av_tbl.lock, flags); + return status; +} + +int ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah) +{ + unsigned long flags; + spin_lock_irqsave(&dev->av_tbl.lock, flags); + ah->av->valid = 0; + spin_unlock_irqrestore(&dev->av_tbl.lock, flags); + return 0; +} + +static int ocrdma_create_eqs(struct ocrdma_dev *dev) +{ + int num_eq, i, status = 0; + int irq; + unsigned long flags = 0; + + num_eq = dev->nic_info.msix.num_vectors - + dev->nic_info.msix.start_vector; + if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) { + num_eq = 1; + flags = IRQF_SHARED; + } else { + num_eq = min_t(u32, num_eq, num_online_cpus()); + } + + if (!num_eq) + return -EINVAL; + + dev->eq_tbl = kzalloc(sizeof(struct ocrdma_eq) * num_eq, GFP_KERNEL); + if (!dev->eq_tbl) + return -ENOMEM; + + for (i = 0; i < num_eq; i++) { + status = ocrdma_create_eq(dev, &dev->eq_tbl[i], + OCRDMA_EQ_LEN); + if 
(status) { + status = -EINVAL; + break; + } + sprintf(dev->eq_tbl[i].irq_name, "ocrdma%d-%d", + dev->id, i); + irq = ocrdma_get_irq(dev, &dev->eq_tbl[i]); + status = request_irq(irq, ocrdma_irq_handler, flags, + dev->eq_tbl[i].irq_name, + &dev->eq_tbl[i]); + if (status) + goto done; + dev->eq_cnt += 1; + } + /* one eq is sufficient for data path to work */ + return 0; +done: + ocrdma_destroy_eqs(dev); + return status; +} + +static int ocrdma_mbx_modify_eqd(struct ocrdma_dev *dev, struct ocrdma_eq *eq, + int num) +{ + int i, status; + struct ocrdma_modify_eqd_req *cmd; + + cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_EQ_DELAY, sizeof(*cmd)); + if (!cmd) + return -ENOMEM; + + ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_MODIFY_EQ_DELAY, + OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); + + cmd->cmd.num_eq = num; + for (i = 0; i < num; i++) { + cmd->cmd.set_eqd[i].eq_id = eq[i].q.id; + cmd->cmd.set_eqd[i].phase = 0; + cmd->cmd.set_eqd[i].delay_multiplier = + (eq[i].aic_obj.prev_eqd * 65)/100; + } + status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + + kfree(cmd); + return status; +} + +static int ocrdma_modify_eqd(struct ocrdma_dev *dev, struct ocrdma_eq *eq, + int num) +{ + int num_eqs, i = 0; + if (num > 8) { + while (num) { + num_eqs = min(num, 8); + ocrdma_mbx_modify_eqd(dev, &eq[i], num_eqs); + i += num_eqs; + num -= num_eqs; + } + } else { + ocrdma_mbx_modify_eqd(dev, eq, num); + } + return 0; +} + +void ocrdma_eqd_set_task(struct work_struct *work) +{ + struct ocrdma_dev *dev = + container_of(work, struct ocrdma_dev, eqd_work.work); + struct ocrdma_eq *eq = NULL; + int i, num = 0; + u64 eq_intr; + + for (i = 0; i < dev->eq_cnt; i++) { + eq = &dev->eq_tbl[i]; + if (eq->aic_obj.eq_intr_cnt > eq->aic_obj.prev_eq_intr_cnt) { + eq_intr = eq->aic_obj.eq_intr_cnt - + eq->aic_obj.prev_eq_intr_cnt; + if ((eq_intr > EQ_INTR_PER_SEC_THRSH_HI) && + (eq->aic_obj.prev_eqd == EQ_AIC_MIN_EQD)) { + eq->aic_obj.prev_eqd = EQ_AIC_MAX_EQD; + num++; + } else if ((eq_intr < EQ_INTR_PER_SEC_THRSH_LOW) && + (eq->aic_obj.prev_eqd == EQ_AIC_MAX_EQD)) { + eq->aic_obj.prev_eqd = EQ_AIC_MIN_EQD; + num++; + } + } + eq->aic_obj.prev_eq_intr_cnt = eq->aic_obj.eq_intr_cnt; + } + + if (num) + ocrdma_modify_eqd(dev, &dev->eq_tbl[0], num); + schedule_delayed_work(&dev->eqd_work, msecs_to_jiffies(1000)); +} + +int ocrdma_init_hw(struct ocrdma_dev *dev) +{ + int status; + + /* create the eqs */ + status = ocrdma_create_eqs(dev); + if (status) + goto qpeq_err; + status = ocrdma_create_mq(dev); + if (status) + goto mq_err; + status = ocrdma_mbx_query_fw_config(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_query_dev(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_query_fw_ver(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_create_ah_tbl(dev); + if (status) + goto conf_err; + status = ocrdma_mbx_get_phy_info(dev); + if (status) + goto info_attrb_err; + status = ocrdma_mbx_get_ctrl_attribs(dev); + if (status) + goto info_attrb_err; + + return 0; + +info_attrb_err: + ocrdma_mbx_delete_ah_tbl(dev); +conf_err: + ocrdma_destroy_mq(dev); +mq_err: + ocrdma_destroy_eqs(dev); +qpeq_err: + pr_err("%s() status=%d\n", __func__, status); + return status; +} + +void ocrdma_cleanup_hw(struct ocrdma_dev *dev) +{ + ocrdma_free_pd_pool(dev); + ocrdma_mbx_delete_ah_tbl(dev); + + /* cleanup the control path */ + ocrdma_destroy_mq(dev); + + /* cleanup the eqs */ + ocrdma_destroy_eqs(dev); +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h new file mode 100644 index 
0000000..ebc1f44 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -0,0 +1,159 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef __OCRDMA_HW_H__ +#define __OCRDMA_HW_H__ + +#include "ocrdma_sli.h" + +static inline void ocrdma_cpu_to_le32(void *dst, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = dst; + u32 *dst_ptr = dst; + for (; i < (len / 4); i++) + *(dst_ptr + i) = cpu_to_le32p(src_ptr + i); +#endif +} + +static inline void ocrdma_le32_to_cpu(void *dst, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = dst; + u32 *dst_ptr = dst; + for (; i < (len / sizeof(u32)); i++) + *(dst_ptr + i) = le32_to_cpu(*(src_ptr + i)); +#endif +} + +static inline void ocrdma_copy_cpu_to_le32(void *dst, void *src, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = src; + u32 *dst_ptr = dst; + for (; i < (len / sizeof(u32)); i++) + *(dst_ptr + i) = cpu_to_le32p(src_ptr + i); +#else + memcpy(dst, src, len); +#endif +} + +static inline void ocrdma_copy_le32_to_cpu(void *dst, void *src, u32 len) +{ +#ifdef __BIG_ENDIAN + int i = 0; + u32 *src_ptr = src; + u32 *dst_ptr = dst; + for (; i < len / sizeof(u32); i++) + *(dst_ptr + i) = le32_to_cpu(*(src_ptr + i)); +#else + memcpy(dst, src, len); +#endif +} + +static inline u64 ocrdma_get_db_addr(struct ocrdma_dev *dev, u32 pdid) +{ + return dev->nic_info.unmapped_db + (pdid * dev->nic_info.db_page_size); +} + +int ocrdma_init_hw(struct ocrdma_dev *); +void ocrdma_cleanup_hw(struct ocrdma_dev *); + +enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps); +void ocrdma_ring_cq_db(struct ocrdma_dev *, u16 cq_id, bool armed, + bool solicited, u16 cqe_popped); + +/* verbs specific mailbox commands */ +int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed, + u8 *lnk_st); +int ocrdma_query_config(struct ocrdma_dev *, + struct ocrdma_mbx_query_config *config); + +int ocrdma_mbx_alloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); +int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *, struct ocrdma_pd *); + +int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr, + u32 pd_id, int addr_check); +int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *, int fmr, u32 lkey); + +int ocrdma_reg_mr(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr, + u32 pd_id, int acc); +int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *, + int entries, int dpp_cq, u16 pd_id); +int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *); + +int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs, + u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset, + u16 *dpp_credit_lmt); +int ocrdma_mbx_modify_qp(struct ocrdma_dev *, struct ocrdma_qp *, + struct ib_qp_attr *attrs, int attr_mask); +int ocrdma_mbx_query_qp(struct ocrdma_dev *, struct ocrdma_qp *, + struct ocrdma_qp_params *param); +int ocrdma_mbx_destroy_qp(struct ocrdma_dev *, struct ocrdma_qp *); +int ocrdma_mbx_create_srq(struct ocrdma_dev *, struct ocrdma_srq *, + struct ib_srq_init_attr *, + struct ocrdma_pd *); +int ocrdma_mbx_modify_srq(struct ocrdma_srq *, struct ib_srq_attr *); +int ocrdma_mbx_query_srq(struct ocrdma_srq *, struct ib_srq_attr *); +int ocrdma_mbx_destroy_srq(struct ocrdma_dev *, struct ocrdma_srq *); + +int ocrdma_alloc_av(struct ocrdma_dev *, struct ocrdma_ah *); +int ocrdma_free_av(struct ocrdma_dev *, struct ocrdma_ah *); + +int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state, + enum ib_qp_state *old_ib_state); +bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *); +bool 
ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *); +void ocrdma_flush_qp(struct ocrdma_qp *); +int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq); + +int ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset); +char *port_speed_string(struct ocrdma_dev *dev); +void ocrdma_init_service_level(struct ocrdma_dev *); +void ocrdma_alloc_pd_pool(struct ocrdma_dev *dev); +void ocrdma_free_pd_range(struct ocrdma_dev *dev); +void ocrdma_update_link_state(struct ocrdma_dev *dev, u8 lstate); + +#endif /* __OCRDMA_HW_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c new file mode 100644 index 0000000..eb8b6a9 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -0,0 +1,478 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ocrdma.h" +#include "ocrdma_verbs.h" +#include "ocrdma_ah.h" +#include "be_roce.h" +#include "ocrdma_hw.h" +#include "ocrdma_stats.h" +#include + +MODULE_DESCRIPTION(OCRDMA_ROCE_DRV_DESC " " OCRDMA_ROCE_DRV_VERSION); +MODULE_AUTHOR("Emulex Corporation"); +MODULE_LICENSE("Dual BSD/GPL"); + +static DEFINE_IDR(ocrdma_dev_id); + +void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid) +{ + u8 mac_addr[6]; + + memcpy(&mac_addr[0], &dev->nic_info.mac_addr[0], ETH_ALEN); + guid[0] = mac_addr[0] ^ 2; + guid[1] = mac_addr[1]; + guid[2] = mac_addr[2]; + guid[3] = 0xff; + guid[4] = 0xfe; + guid[5] = mac_addr[3]; + guid[6] = mac_addr[4]; + guid[7] = mac_addr[5]; +} +static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + struct ocrdma_dev *dev; + int err; + + dev = get_ocrdma_dev(ibdev); + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + if (ocrdma_is_udp_encap_supported(dev)) + immutable->core_cap_flags |= RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static void get_dev_fw_str(struct ib_device *device, char *str) +{ + struct ocrdma_dev *dev = get_ocrdma_dev(device); + + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", &dev->attr.fw_ver[0]); +} + +static int ocrdma_register_device(struct ocrdma_dev *dev) +{ + strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX); + ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid); + BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); + memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, + sizeof(OCRDMA_NODE_DESC)); + dev->ibdev.owner = THIS_MODULE; + dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION; + dev->ibdev.uverbs_cmd_mask = + OCRDMA_UVERBS(GET_CONTEXT) | + OCRDMA_UVERBS(QUERY_DEVICE) | + OCRDMA_UVERBS(QUERY_PORT) | + OCRDMA_UVERBS(ALLOC_PD) | + OCRDMA_UVERBS(DEALLOC_PD) | + OCRDMA_UVERBS(REG_MR) | + OCRDMA_UVERBS(DEREG_MR) | + OCRDMA_UVERBS(CREATE_COMP_CHANNEL) | + OCRDMA_UVERBS(CREATE_CQ) | + OCRDMA_UVERBS(RESIZE_CQ) | + OCRDMA_UVERBS(DESTROY_CQ) | + OCRDMA_UVERBS(REQ_NOTIFY_CQ) | + OCRDMA_UVERBS(CREATE_QP) | + OCRDMA_UVERBS(MODIFY_QP) | + OCRDMA_UVERBS(QUERY_QP) | + OCRDMA_UVERBS(DESTROY_QP) | + OCRDMA_UVERBS(POLL_CQ) | + OCRDMA_UVERBS(POST_SEND) | + OCRDMA_UVERBS(POST_RECV); + + dev->ibdev.uverbs_cmd_mask |= + OCRDMA_UVERBS(CREATE_AH) | + OCRDMA_UVERBS(MODIFY_AH) | + OCRDMA_UVERBS(QUERY_AH) | + OCRDMA_UVERBS(DESTROY_AH); + + dev->ibdev.node_type = RDMA_NODE_IB_CA; + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = dev->eq_cnt; + + /* mandatory verbs. 
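+ * ib_register_device() checks the core's mandatory callbacks and fails + * registration if any of them are missing.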
*/ + dev->ibdev.query_device = ocrdma_query_device; + dev->ibdev.query_port = ocrdma_query_port; + dev->ibdev.modify_port = ocrdma_modify_port; + dev->ibdev.get_netdev = ocrdma_get_netdev; + dev->ibdev.get_link_layer = ocrdma_link_layer; + dev->ibdev.alloc_pd = ocrdma_alloc_pd; + dev->ibdev.dealloc_pd = ocrdma_dealloc_pd; + + dev->ibdev.create_cq = ocrdma_create_cq; + dev->ibdev.destroy_cq = ocrdma_destroy_cq; + dev->ibdev.resize_cq = ocrdma_resize_cq; + + dev->ibdev.create_qp = ocrdma_create_qp; + dev->ibdev.modify_qp = ocrdma_modify_qp; + dev->ibdev.query_qp = ocrdma_query_qp; + dev->ibdev.destroy_qp = ocrdma_destroy_qp; + + dev->ibdev.query_pkey = ocrdma_query_pkey; + dev->ibdev.create_ah = ocrdma_create_ah; + dev->ibdev.destroy_ah = ocrdma_destroy_ah; + dev->ibdev.query_ah = ocrdma_query_ah; + dev->ibdev.modify_ah = ocrdma_modify_ah; + + dev->ibdev.poll_cq = ocrdma_poll_cq; + dev->ibdev.post_send = ocrdma_post_send; + dev->ibdev.post_recv = ocrdma_post_recv; + dev->ibdev.req_notify_cq = ocrdma_arm_cq; + + dev->ibdev.get_dma_mr = ocrdma_get_dma_mr; + dev->ibdev.dereg_mr = ocrdma_dereg_mr; + dev->ibdev.reg_user_mr = ocrdma_reg_user_mr; + + dev->ibdev.alloc_mr = ocrdma_alloc_mr; + dev->ibdev.map_mr_sg = ocrdma_map_mr_sg; + + /* mandatory to support user space verbs consumer. */ + dev->ibdev.alloc_ucontext = ocrdma_alloc_ucontext; + dev->ibdev.dealloc_ucontext = ocrdma_dealloc_ucontext; + dev->ibdev.mmap = ocrdma_mmap; + dev->ibdev.dev.parent = &dev->nic_info.pdev->dev; + + dev->ibdev.process_mad = ocrdma_process_mad; + dev->ibdev.get_port_immutable = ocrdma_port_immutable; + dev->ibdev.get_dev_fw_str = get_dev_fw_str; + + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + dev->ibdev.uverbs_cmd_mask |= + OCRDMA_UVERBS(CREATE_SRQ) | + OCRDMA_UVERBS(MODIFY_SRQ) | + OCRDMA_UVERBS(QUERY_SRQ) | + OCRDMA_UVERBS(DESTROY_SRQ) | + OCRDMA_UVERBS(POST_SRQ_RECV); + + dev->ibdev.create_srq = ocrdma_create_srq; + dev->ibdev.modify_srq = ocrdma_modify_srq; + dev->ibdev.query_srq = ocrdma_query_srq; + dev->ibdev.destroy_srq = ocrdma_destroy_srq; + dev->ibdev.post_srq_recv = ocrdma_post_srq_recv; + } + dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA; + return ib_register_device(&dev->ibdev, NULL); +} + +static int ocrdma_alloc_resources(struct ocrdma_dev *dev) +{ + mutex_init(&dev->dev_lock); + dev->cq_tbl = kzalloc(sizeof(struct ocrdma_cq *) * + OCRDMA_MAX_CQ, GFP_KERNEL); + if (!dev->cq_tbl) + goto alloc_err; + + if (dev->attr.max_qp) { + dev->qp_tbl = kzalloc(sizeof(struct ocrdma_qp *) * + OCRDMA_MAX_QP, GFP_KERNEL); + if (!dev->qp_tbl) + goto alloc_err; + } + + dev->stag_arr = kzalloc(sizeof(u64) * OCRDMA_MAX_STAG, GFP_KERNEL); + if (dev->stag_arr == NULL) + goto alloc_err; + + ocrdma_alloc_pd_pool(dev); + + if (!ocrdma_alloc_stats_resources(dev)) { + pr_err("%s: stats resource allocation failed\n", __func__); + goto alloc_err; + } + + spin_lock_init(&dev->av_tbl.lock); + spin_lock_init(&dev->flush_q_lock); + return 0; +alloc_err: + pr_err("%s(%d) error.\n", __func__, dev->id); + return -ENOMEM; +} + +static void ocrdma_free_resources(struct ocrdma_dev *dev) +{ + ocrdma_release_stats_resources(dev); + kfree(dev->stag_arr); + kfree(dev->qp_tbl); + kfree(dev->cq_tbl); +} + +/* OCRDMA sysfs interface */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); +} + +static ssize_t show_hca_type(struct device *device, + struct 
device_attribute *attr, char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); + +static struct device_attribute *ocrdma_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type +}; + +static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) + device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); +} + +static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) +{ + int status = 0, i; + u8 lstate = 0; + struct ocrdma_dev *dev; + + dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev)); + if (!dev) { + pr_err("Unable to allocate ib device\n"); + return NULL; + } + dev->mbx_cmd = kzalloc(sizeof(struct ocrdma_mqe_emb_cmd), GFP_KERNEL); + if (!dev->mbx_cmd) + goto idr_err; + + memcpy(&dev->nic_info, dev_info, sizeof(*dev_info)); + dev->id = idr_alloc(&ocrdma_dev_id, NULL, 0, 0, GFP_KERNEL); + if (dev->id < 0) + goto idr_err; + + status = ocrdma_init_hw(dev); + if (status) + goto init_err; + + status = ocrdma_alloc_resources(dev); + if (status) + goto alloc_err; + + ocrdma_init_service_level(dev); + status = ocrdma_register_device(dev); + if (status) + goto alloc_err; + + /* Query Link state and update */ + status = ocrdma_mbx_get_link_speed(dev, NULL, &lstate); + if (!status) + ocrdma_update_link_state(dev, lstate); + + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) + if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i])) + goto sysfs_err; + /* Init stats */ + ocrdma_add_port_stats(dev); + /* Interrupt Moderation */ + INIT_DELAYED_WORK(&dev->eqd_work, ocrdma_eqd_set_task); + schedule_delayed_work(&dev->eqd_work, msecs_to_jiffies(1000)); + + pr_info("%s %s: %s \"%s\" port %d\n", + dev_name(&dev->nic_info.pdev->dev), hca_name(dev), + port_speed_string(dev), dev->model_number, + dev->hba_port_num); + pr_info("%s ocrdma%d driver loaded successfully\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + return dev; + +sysfs_err: + ocrdma_remove_sysfiles(dev); +alloc_err: + ocrdma_free_resources(dev); + ocrdma_cleanup_hw(dev); +init_err: + idr_remove(&ocrdma_dev_id, dev->id); +idr_err: + kfree(dev->mbx_cmd); + ib_dealloc_device(&dev->ibdev); + pr_err("%s() leaving. ret=%d\n", __func__, status); + return NULL; +} + +static void ocrdma_remove_free(struct ocrdma_dev *dev) +{ + + idr_remove(&ocrdma_dev_id, dev->id); + kfree(dev->mbx_cmd); + ib_dealloc_device(&dev->ibdev); +} + +static void ocrdma_remove(struct ocrdma_dev *dev) +{ + /* first unregister with stack to stop all the active traffic + * of the registered clients. 
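+ * The delayed EQ-moderation work is cancelled first so that it does not + * issue mailbox commands while the device is being torn down.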
+ */ + cancel_delayed_work_sync(&dev->eqd_work); + ocrdma_remove_sysfiles(dev); + ib_unregister_device(&dev->ibdev); + + ocrdma_rem_port_stats(dev); + ocrdma_free_resources(dev); + ocrdma_cleanup_hw(dev); + ocrdma_remove_free(dev); +} + +static int ocrdma_dispatch_port_active(struct ocrdma_dev *dev) +{ + struct ib_event port_event; + + port_event.event = IB_EVENT_PORT_ACTIVE; + port_event.element.port_num = 1; + port_event.device = &dev->ibdev; + ib_dispatch_event(&port_event); + return 0; +} + +static int ocrdma_dispatch_port_error(struct ocrdma_dev *dev) +{ + struct ib_event err_event; + + err_event.event = IB_EVENT_PORT_ERR; + err_event.element.port_num = 1; + err_event.device = &dev->ibdev; + ib_dispatch_event(&err_event); + return 0; +} + +static void ocrdma_shutdown(struct ocrdma_dev *dev) +{ + ocrdma_dispatch_port_error(dev); + ocrdma_remove(dev); +} + +/* event handling via NIC driver ensures that all the NIC specific + * initialization done before RoCE driver notifies + * event to stack. + */ +static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event) +{ + switch (event) { + case BE_DEV_SHUTDOWN: + ocrdma_shutdown(dev); + break; + default: + break; + } +} + +void ocrdma_update_link_state(struct ocrdma_dev *dev, u8 lstate) +{ + if (!(dev->flags & OCRDMA_FLAGS_LINK_STATUS_INIT)) { + dev->flags |= OCRDMA_FLAGS_LINK_STATUS_INIT; + if (!lstate) + return; + } + + if (!lstate) + ocrdma_dispatch_port_error(dev); + else + ocrdma_dispatch_port_active(dev); +} + +static struct ocrdma_driver ocrdma_drv = { + .name = "ocrdma_driver", + .add = ocrdma_add, + .remove = ocrdma_remove, + .state_change_handler = ocrdma_event_handler, + .be_abi_version = OCRDMA_BE_ROCE_ABI_VERSION, +}; + +static int __init ocrdma_init_module(void) +{ + int status; + + ocrdma_init_debugfs(); + + status = be_roce_register_driver(&ocrdma_drv); + if (status) + goto err_be_reg; + + return 0; + +err_be_reg: + + return status; +} + +static void __exit ocrdma_exit_module(void) +{ + be_roce_unregister_driver(&ocrdma_drv); + ocrdma_rem_debugfs(); + idr_destroy(&ocrdma_dev_id); +} + +module_init(ocrdma_init_module); +module_exit(ocrdma_exit_module); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h new file mode 100644 index 0000000..6ef89c2 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -0,0 +1,2240 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef __OCRDMA_SLI_H__ +#define __OCRDMA_SLI_H__ + +enum { + OCRDMA_ASIC_GEN_SKH_R = 0x04, + OCRDMA_ASIC_GEN_LANCER = 0x0B +}; + +enum { + OCRDMA_ASIC_REV_A0 = 0x00, + OCRDMA_ASIC_REV_B0 = 0x10, + OCRDMA_ASIC_REV_C0 = 0x20 +}; + +#define OCRDMA_SUBSYS_ROCE 10 +enum { + OCRDMA_CMD_QUERY_CONFIG = 1, + OCRDMA_CMD_ALLOC_PD = 2, + OCRDMA_CMD_DEALLOC_PD = 3, + + OCRDMA_CMD_CREATE_AH_TBL = 4, + OCRDMA_CMD_DELETE_AH_TBL = 5, + + OCRDMA_CMD_CREATE_QP = 6, + OCRDMA_CMD_QUERY_QP = 7, + OCRDMA_CMD_MODIFY_QP = 8 , + OCRDMA_CMD_DELETE_QP = 9, + + OCRDMA_CMD_RSVD1 = 10, + OCRDMA_CMD_ALLOC_LKEY = 11, + OCRDMA_CMD_DEALLOC_LKEY = 12, + OCRDMA_CMD_REGISTER_NSMR = 13, + OCRDMA_CMD_REREGISTER_NSMR = 14, + OCRDMA_CMD_REGISTER_NSMR_CONT = 15, + OCRDMA_CMD_QUERY_NSMR = 16, + OCRDMA_CMD_ALLOC_MW = 17, + OCRDMA_CMD_QUERY_MW = 18, + + OCRDMA_CMD_CREATE_SRQ = 19, + OCRDMA_CMD_QUERY_SRQ = 20, + OCRDMA_CMD_MODIFY_SRQ = 21, + OCRDMA_CMD_DELETE_SRQ = 22, + + OCRDMA_CMD_ATTACH_MCAST = 23, + OCRDMA_CMD_DETACH_MCAST = 24, + + OCRDMA_CMD_CREATE_RBQ = 25, + OCRDMA_CMD_DESTROY_RBQ = 26, + + OCRDMA_CMD_GET_RDMA_STATS = 27, + OCRDMA_CMD_ALLOC_PD_RANGE = 28, + OCRDMA_CMD_DEALLOC_PD_RANGE = 29, + + OCRDMA_CMD_MAX +}; + +#define OCRDMA_SUBSYS_COMMON 1 +enum { + OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1 = 5, + OCRDMA_CMD_CREATE_CQ = 12, + OCRDMA_CMD_CREATE_EQ = 13, + OCRDMA_CMD_CREATE_MQ = 21, + OCRDMA_CMD_GET_CTRL_ATTRIBUTES = 32, + OCRDMA_CMD_GET_FW_VER = 35, + OCRDMA_CMD_MODIFY_EQ_DELAY = 41, + OCRDMA_CMD_DELETE_MQ = 53, + OCRDMA_CMD_DELETE_CQ = 54, + OCRDMA_CMD_DELETE_EQ = 55, + OCRDMA_CMD_GET_FW_CONFIG = 58, + OCRDMA_CMD_CREATE_MQ_EXT = 90, + OCRDMA_CMD_PHY_DETAILS = 102 +}; + +enum { + QTYPE_EQ = 1, + QTYPE_CQ = 2, + QTYPE_MCCQ = 3 +}; + +#define OCRDMA_MAX_SGID 16 + +#define OCRDMA_MAX_QP 2048 +#define OCRDMA_MAX_CQ 2048 +#define OCRDMA_MAX_STAG 16384 + +enum { + OCRDMA_DB_RQ_OFFSET = 0xE0, + OCRDMA_DB_GEN2_RQ_OFFSET = 0x100, + OCRDMA_DB_SQ_OFFSET = 0x60, + OCRDMA_DB_GEN2_SQ_OFFSET = 0x1C0, + OCRDMA_DB_SRQ_OFFSET = OCRDMA_DB_RQ_OFFSET, + OCRDMA_DB_GEN2_SRQ_OFFSET = OCRDMA_DB_GEN2_RQ_OFFSET, + OCRDMA_DB_CQ_OFFSET = 0x120, + OCRDMA_DB_EQ_OFFSET = OCRDMA_DB_CQ_OFFSET, + OCRDMA_DB_MQ_OFFSET = 0x140, + + OCRDMA_DB_SQ_SHIFT = 16, + OCRDMA_DB_RQ_SHIFT = 24 +}; + +enum { + OCRDMA_L3_TYPE_IB_GRH = 0x00, + OCRDMA_L3_TYPE_IPV4 = 0x01, + OCRDMA_L3_TYPE_IPV6 = 0x02 +}; + +#define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ +#define OCRDMA_DB_CQ_RING_ID_EXT_MASK 0x0C00 /* bits 10-11 of qid at 12-11 */ +/* qid #2 msbits at 12-11 */ +#define OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT 0x1 +#define OCRDMA_DB_CQ_NUM_POPPED_SHIFT 16 /* 
bits 16 - 28 */ +/* Rearm bit */ +#define OCRDMA_DB_CQ_REARM_SHIFT 29 /* bit 29 */ +/* solicited bit */ +#define OCRDMA_DB_CQ_SOLICIT_SHIFT 31 /* bit 31 */ + +#define OCRDMA_EQ_ID_MASK 0x1FF /* bits 0 - 8 */ +#define OCRDMA_EQ_ID_EXT_MASK 0x3e00 /* bits 9-13 */ +#define OCRDMA_EQ_ID_EXT_MASK_SHIFT 2 /* qid bits 9-13 at 11-15 */ + +/* Clear the interrupt for this eq */ +#define OCRDMA_EQ_CLR_SHIFT 9 /* bit 9 */ +/* Must be 1 */ +#define OCRDMA_EQ_TYPE_SHIFT 10 /* bit 10 */ +/* Number of event entries processed */ +#define OCRDMA_NUM_EQE_SHIFT 16 /* bits 16 - 28 */ +/* Rearm bit */ +#define OCRDMA_REARM_SHIFT 29 /* bit 29 */ + +#define OCRDMA_MQ_ID_MASK 0x7FF /* bits 0 - 10 */ +/* Number of entries posted */ +#define OCRDMA_MQ_NUM_MQE_SHIFT 16 /* bits 16 - 29 */ + +#define OCRDMA_MIN_HPAGE_SIZE 4096 + +#define OCRDMA_MIN_Q_PAGE_SIZE 4096 +#define OCRDMA_MAX_Q_PAGES 8 + +#define OCRDMA_SLI_ASIC_ID_OFFSET 0x9C +#define OCRDMA_SLI_ASIC_REV_MASK 0x000000FF +#define OCRDMA_SLI_ASIC_GEN_NUM_MASK 0x0000FF00 +#define OCRDMA_SLI_ASIC_GEN_NUM_SHIFT 0x08 +/* +# 0: 4K Bytes +# 1: 8K Bytes +# 2: 16K Bytes +# 3: 32K Bytes +# 4: 64K Bytes +# 5: 128K Bytes +# 6: 256K Bytes +# 7: 512K Bytes +*/ +#define OCRDMA_MAX_Q_PAGE_SIZE_CNT 8 +#define OCRDMA_Q_PAGE_BASE_SIZE (OCRDMA_MIN_Q_PAGE_SIZE * OCRDMA_MAX_Q_PAGES) + +#define MAX_OCRDMA_QP_PAGES 8 +#define OCRDMA_MAX_WQE_MEM_SIZE (MAX_OCRDMA_QP_PAGES * OCRDMA_MIN_HQ_PAGE_SIZE) + +#define OCRDMA_CREATE_CQ_MAX_PAGES 4 +#define OCRDMA_DPP_CQE_SIZE 4 + +#define OCRDMA_GEN2_MAX_CQE 1024 +#define OCRDMA_GEN2_CQ_PAGE_SIZE 4096 +#define OCRDMA_GEN2_WQE_SIZE 256 +#define OCRDMA_MAX_CQE 4095 +#define OCRDMA_CQ_PAGE_SIZE 16384 +#define OCRDMA_WQE_SIZE 128 +#define OCRDMA_WQE_STRIDE 8 +#define OCRDMA_WQE_ALIGN_BYTES 16 + +#define MAX_OCRDMA_SRQ_PAGES MAX_OCRDMA_QP_PAGES + +enum { + OCRDMA_MCH_OPCODE_SHIFT = 0, + OCRDMA_MCH_OPCODE_MASK = 0xFF, + OCRDMA_MCH_SUBSYS_SHIFT = 8, + OCRDMA_MCH_SUBSYS_MASK = 0xFF00 +}; + +/* mailbox cmd header */ +struct ocrdma_mbx_hdr { + u32 subsys_op; + u32 timeout; /* in seconds */ + u32 cmd_len; + u32 rsvd_version; +}; + +enum { + OCRDMA_MBX_RSP_OPCODE_SHIFT = 0, + OCRDMA_MBX_RSP_OPCODE_MASK = 0xFF, + OCRDMA_MBX_RSP_SUBSYS_SHIFT = 8, + OCRDMA_MBX_RSP_SUBSYS_MASK = 0xFF << OCRDMA_MBX_RSP_SUBSYS_SHIFT, + + OCRDMA_MBX_RSP_STATUS_SHIFT = 0, + OCRDMA_MBX_RSP_STATUS_MASK = 0xFF, + OCRDMA_MBX_RSP_ASTATUS_SHIFT = 8, + OCRDMA_MBX_RSP_ASTATUS_MASK = 0xFF << OCRDMA_MBX_RSP_ASTATUS_SHIFT +}; + +/* mailbox cmd response */ +struct ocrdma_mbx_rsp { + u32 subsys_op; + u32 status; + u32 rsp_len; + u32 add_rsp_len; +}; + +enum { + OCRDMA_MQE_EMBEDDED = 1, + OCRDMA_MQE_NONEMBEDDED = 0 +}; + +struct ocrdma_mqe_sge { + u32 pa_lo; + u32 pa_hi; + u32 len; +}; + +enum { + OCRDMA_MQE_HDR_EMB_SHIFT = 0, + OCRDMA_MQE_HDR_EMB_MASK = BIT(0), + OCRDMA_MQE_HDR_SGE_CNT_SHIFT = 3, + OCRDMA_MQE_HDR_SGE_CNT_MASK = 0x1F << OCRDMA_MQE_HDR_SGE_CNT_SHIFT, + OCRDMA_MQE_HDR_SPECIAL_SHIFT = 24, + OCRDMA_MQE_HDR_SPECIAL_MASK = 0xFF << OCRDMA_MQE_HDR_SPECIAL_SHIFT +}; + +struct ocrdma_mqe_hdr { + u32 spcl_sge_cnt_emb; + u32 pyld_len; + u32 tag_lo; + u32 tag_hi; + u32 rsvd3; +}; + +struct ocrdma_mqe_emb_cmd { + struct ocrdma_mbx_hdr mch; + u8 pyld[220]; +}; + +struct ocrdma_mqe { + struct ocrdma_mqe_hdr hdr; + union { + struct ocrdma_mqe_emb_cmd emb_req; + struct { + struct ocrdma_mqe_sge sge[19]; + } nonemb_req; + u8 cmd[236]; + struct ocrdma_mbx_rsp rsp; + } u; +}; + +#define OCRDMA_EQ_LEN 4096 +#define OCRDMA_MQ_CQ_LEN 256 +#define OCRDMA_MQ_LEN 128 + +#define PAGE_SHIFT_4K 12 +#define 
PAGE_SIZE_4K (1 << PAGE_SHIFT_4K) + +/* Returns number of pages spanned by the data starting at the given addr */ +#define PAGES_4K_SPANNED(_address, size) \ + ((u32)((((size_t)(_address) & (PAGE_SIZE_4K - 1)) + \ + (size) + (PAGE_SIZE_4K - 1)) >> PAGE_SHIFT_4K)) + +struct ocrdma_delete_q_req { + struct ocrdma_mbx_hdr req; + u32 id; +}; + +struct ocrdma_pa { + u32 lo; + u32 hi; +}; + +#define MAX_OCRDMA_EQ_PAGES 8 +struct ocrdma_create_eq_req { + struct ocrdma_mbx_hdr req; + u32 num_pages; + u32 valid; + u32 cnt; + u32 delay; + u32 rsvd; + struct ocrdma_pa pa[MAX_OCRDMA_EQ_PAGES]; +}; + +enum { + OCRDMA_CREATE_EQ_VALID = BIT(29), + OCRDMA_CREATE_EQ_CNT_SHIFT = 26, + OCRDMA_CREATE_CQ_DELAY_SHIFT = 13, +}; + +struct ocrdma_create_eq_rsp { + struct ocrdma_mbx_rsp rsp; + u32 vector_eqid; +}; + +#define OCRDMA_EQ_MINOR_OTHER 0x1 + +struct ocrmda_set_eqd { + u32 eq_id; + u32 phase; + u32 delay_multiplier; +}; + +struct ocrdma_modify_eqd_cmd { + struct ocrdma_mbx_hdr req; + u32 num_eq; + struct ocrmda_set_eqd set_eqd[8]; +} __packed; + +struct ocrdma_modify_eqd_req { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_modify_eqd_cmd cmd; +}; + + +struct ocrdma_modify_eq_delay_rsp { + struct ocrdma_mbx_rsp hdr; + u32 rsvd0; +} __packed; + +enum { + OCRDMA_MCQE_STATUS_SHIFT = 0, + OCRDMA_MCQE_STATUS_MASK = 0xFFFF, + OCRDMA_MCQE_ESTATUS_SHIFT = 16, + OCRDMA_MCQE_ESTATUS_MASK = 0xFFFF << OCRDMA_MCQE_ESTATUS_SHIFT, + OCRDMA_MCQE_CONS_SHIFT = 27, + OCRDMA_MCQE_CONS_MASK = BIT(27), + OCRDMA_MCQE_CMPL_SHIFT = 28, + OCRDMA_MCQE_CMPL_MASK = BIT(28), + OCRDMA_MCQE_AE_SHIFT = 30, + OCRDMA_MCQE_AE_MASK = BIT(30), + OCRDMA_MCQE_VALID_SHIFT = 31, + OCRDMA_MCQE_VALID_MASK = BIT(31) +}; + +struct ocrdma_mcqe { + u32 status; + u32 tag_lo; + u32 tag_hi; + u32 valid_ae_cmpl_cons; +}; + +enum { + OCRDMA_AE_MCQE_QPVALID = BIT(31), + OCRDMA_AE_MCQE_QPID_MASK = 0xFFFF, + + OCRDMA_AE_MCQE_CQVALID = BIT(31), + OCRDMA_AE_MCQE_CQID_MASK = 0xFFFF, + OCRDMA_AE_MCQE_VALID = BIT(31), + OCRDMA_AE_MCQE_AE = BIT(30), + OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT = 16, + OCRDMA_AE_MCQE_EVENT_TYPE_MASK = + 0xFF << OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT, + OCRDMA_AE_MCQE_EVENT_CODE_SHIFT = 8, + OCRDMA_AE_MCQE_EVENT_CODE_MASK = + 0xFF << OCRDMA_AE_MCQE_EVENT_CODE_SHIFT +}; +struct ocrdma_ae_mcqe { + u32 qpvalid_qpid; + u32 cqvalid_cqid; + u32 evt_tag; + u32 valid_ae_event; +}; + +enum { + OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT = 0, + OCRDMA_AE_PVID_MCQE_ENABLED_MASK = 0xFF, + OCRDMA_AE_PVID_MCQE_TAG_SHIFT = 16, + OCRDMA_AE_PVID_MCQE_TAG_MASK = 0xFFFF << OCRDMA_AE_PVID_MCQE_TAG_SHIFT +}; + +struct ocrdma_ae_pvid_mcqe { + u32 tag_enabled; + u32 event_tag; + u32 rsvd1; + u32 rsvd2; +}; + +enum { + OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT = 16, + OCRDMA_AE_MPA_MCQE_REQ_ID_MASK = 0xFFFF << + OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT, + + OCRDMA_AE_MPA_MCQE_EVENT_CODE_SHIFT = 8, + OCRDMA_AE_MPA_MCQE_EVENT_CODE_MASK = 0xFF << + OCRDMA_AE_MPA_MCQE_EVENT_CODE_SHIFT, + OCRDMA_AE_MPA_MCQE_EVENT_TYPE_SHIFT = 16, + OCRDMA_AE_MPA_MCQE_EVENT_TYPE_MASK = 0xFF << + OCRDMA_AE_MPA_MCQE_EVENT_TYPE_SHIFT, + OCRDMA_AE_MPA_MCQE_EVENT_AE_SHIFT = 30, + OCRDMA_AE_MPA_MCQE_EVENT_AE_MASK = BIT(30), + OCRDMA_AE_MPA_MCQE_EVENT_VALID_SHIFT = 31, + OCRDMA_AE_MPA_MCQE_EVENT_VALID_MASK = BIT(31) +}; + +struct ocrdma_ae_mpa_mcqe { + u32 req_id; + u32 w1; + u32 w2; + u32 valid_ae_event; +}; + +enum { + OCRDMA_AE_QP_MCQE_NEW_QP_STATE_SHIFT = 0, + OCRDMA_AE_QP_MCQE_NEW_QP_STATE_MASK = 0xFFFF, + OCRDMA_AE_QP_MCQE_QP_ID_SHIFT = 16, + OCRDMA_AE_QP_MCQE_QP_ID_MASK = 0xFFFF << + OCRDMA_AE_QP_MCQE_QP_ID_SHIFT, + + 
OCRDMA_AE_QP_MCQE_EVENT_CODE_SHIFT = 8, + OCRDMA_AE_QP_MCQE_EVENT_CODE_MASK = 0xFF << + OCRDMA_AE_QP_MCQE_EVENT_CODE_SHIFT, + OCRDMA_AE_QP_MCQE_EVENT_TYPE_SHIFT = 16, + OCRDMA_AE_QP_MCQE_EVENT_TYPE_MASK = 0xFF << + OCRDMA_AE_QP_MCQE_EVENT_TYPE_SHIFT, + OCRDMA_AE_QP_MCQE_EVENT_AE_SHIFT = 30, + OCRDMA_AE_QP_MCQE_EVENT_AE_MASK = BIT(30), + OCRDMA_AE_QP_MCQE_EVENT_VALID_SHIFT = 31, + OCRDMA_AE_QP_MCQE_EVENT_VALID_MASK = BIT(31) +}; + +struct ocrdma_ae_qp_mcqe { + u32 qp_id_state; + u32 w1; + u32 w2; + u32 valid_ae_event; +}; + +enum ocrdma_async_event_code { + OCRDMA_ASYNC_LINK_EVE_CODE = 0x01, + OCRDMA_ASYNC_GRP5_EVE_CODE = 0x05, + OCRDMA_ASYNC_RDMA_EVE_CODE = 0x14 +}; + +enum ocrdma_async_grp5_events { + OCRDMA_ASYNC_EVENT_QOS_VALUE = 0x01, + OCRDMA_ASYNC_EVENT_COS_VALUE = 0x02, + OCRDMA_ASYNC_EVENT_PVID_STATE = 0x03 +}; + +enum OCRDMA_ASYNC_EVENT_TYPE { + OCRDMA_CQ_ERROR = 0x00, + OCRDMA_CQ_OVERRUN_ERROR = 0x01, + OCRDMA_CQ_QPCAT_ERROR = 0x02, + OCRDMA_QP_ACCESS_ERROR = 0x03, + OCRDMA_QP_COMM_EST_EVENT = 0x04, + OCRDMA_SQ_DRAINED_EVENT = 0x05, + OCRDMA_DEVICE_FATAL_EVENT = 0x08, + OCRDMA_SRQCAT_ERROR = 0x0E, + OCRDMA_SRQ_LIMIT_EVENT = 0x0F, + OCRDMA_QP_LAST_WQE_EVENT = 0x10, + + OCRDMA_MAX_ASYNC_ERRORS +}; + +struct ocrdma_ae_lnkst_mcqe { + u32 speed_state_ptn; + u32 qos_reason_falut; + u32 evt_tag; + u32 valid_ae_event; +}; + +enum { + OCRDMA_AE_LSC_PORT_NUM_MASK = 0x3F, + OCRDMA_AE_LSC_PT_SHIFT = 0x06, + OCRDMA_AE_LSC_PT_MASK = (0x03 << + OCRDMA_AE_LSC_PT_SHIFT), + OCRDMA_AE_LSC_LS_SHIFT = 0x08, + OCRDMA_AE_LSC_LS_MASK = (0xFF << + OCRDMA_AE_LSC_LS_SHIFT), + OCRDMA_AE_LSC_LD_SHIFT = 0x10, + OCRDMA_AE_LSC_LD_MASK = (0xFF << + OCRDMA_AE_LSC_LD_SHIFT), + OCRDMA_AE_LSC_PPS_SHIFT = 0x18, + OCRDMA_AE_LSC_PPS_MASK = (0xFF << + OCRDMA_AE_LSC_PPS_SHIFT), + OCRDMA_AE_LSC_PPF_MASK = 0xFF, + OCRDMA_AE_LSC_ER_SHIFT = 0x08, + OCRDMA_AE_LSC_ER_MASK = (0xFF << + OCRDMA_AE_LSC_ER_SHIFT), + OCRDMA_AE_LSC_QOS_SHIFT = 0x10, + OCRDMA_AE_LSC_QOS_MASK = (0xFFFF << + OCRDMA_AE_LSC_QOS_SHIFT) +}; + +enum { + OCRDMA_AE_LSC_PLINK_DOWN = 0x00, + OCRDMA_AE_LSC_PLINK_UP = 0x01, + OCRDMA_AE_LSC_LLINK_DOWN = 0x02, + OCRDMA_AE_LSC_LLINK_MASK = 0x02, + OCRDMA_AE_LSC_LLINK_UP = 0x03 +}; + +/* mailbox command request and responses */ +enum { + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT = 2, + OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_MASK = BIT(2), + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_SHIFT = 3, + OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_MASK = BIT(3), + OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT = 8, + OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK = 0xFFFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT = 16, + OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT, + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT = 8, + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK = 0xFF << + OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT, + OCRDMA_MBX_QUERY_CFG_L3_TYPE_SHIFT = 3, + OCRDMA_MBX_QUERY_CFG_L3_TYPE_MASK = 0x18, + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT = 0, + OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK = 0xFFFF, + OCRDMA_MBX_QUERY_CFG_MAX_RECV_SGE_SHIFT = 16, + OCRDMA_MBX_QUERY_CFG_MAX_RECV_SGE_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_RECV_SGE_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT = 0, + OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK = 0xFFFF, + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT = 16, + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT, + + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET = 24, + OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK = 0xFF << + 
OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_MASK = 0xFF << + OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_OFFSET, + + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_OFFSET = 16, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET = 0, + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK = 0xFFFF << + OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET, + OCRDMA_MBX_QUERY_CFG_MAX_RD_SGE_SHIFT = 0, + OCRDMA_MBX_QUERY_CFG_MAX_RD_SGE_MASK = 0xFFFF, +}; + +struct ocrdma_mbx_query_config { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 qp_srq_cq_ird_ord; + u32 max_pd_ca_ack_delay; + u32 max_recv_send_sge; + u32 max_ird_ord_per_qp; + u32 max_shared_ird_ord; + u32 max_mr; + u32 max_mr_size_hi; + u32 max_mr_size_lo; + u32 max_num_mr_pbl; + u32 max_mw; + u32 max_fmr; + u32 max_pages_per_frmr; + u32 max_mcast_group; + u32 max_mcast_qp_attach; + u32 max_total_mcast_qp_attach; + u32 wqe_rqe_stride_max_dpp_cqs; + u32 max_srq_rpir_qps; + u32 max_dpp_pds_credits; + u32 max_dpp_credits_pds_per_pd; + u32 max_wqes_rqes_per_q; + u32 max_cq_cqes_per_cq; + u32 max_srq_rqe_sge; + u32 max_wr_rd_sge; + u32 ird_pgsz_num_pages; +}; + +struct ocrdma_fw_ver_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u8 running_ver[32]; +}; + +struct ocrdma_fw_conf_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 config_num; + u32 asic_revision; + u32 phy_port; + u32 fn_mode; + struct { + u32 mode; + u32 nic_wqid_base; + u32 nic_wq_tot; + u32 prot_wqid_base; + u32 prot_wq_tot; + u32 prot_rqid_base; + u32 prot_rqid_tot; + u32 rsvd[6]; + } ulp[2]; + u32 fn_capabilities; + u32 rsvd1; + u32 rsvd2; + u32 base_eqid; + u32 max_eq; + +}; + +enum { + OCRDMA_FN_MODE_RDMA = 0x4 +}; + +enum { + OCRDMA_IF_TYPE_MASK = 0xFFFF0000, + OCRDMA_IF_TYPE_SHIFT = 0x10, + OCRDMA_PHY_TYPE_MASK = 0x0000FFFF, + OCRDMA_FUTURE_DETAILS_MASK = 0xFFFF0000, + OCRDMA_FUTURE_DETAILS_SHIFT = 0x10, + OCRDMA_EX_PHY_DETAILS_MASK = 0x0000FFFF, + 
OCRDMA_FSPEED_SUPP_MASK = 0xFFFF0000, + OCRDMA_FSPEED_SUPP_SHIFT = 0x10, + OCRDMA_ASPEED_SUPP_MASK = 0x0000FFFF +}; + +struct ocrdma_get_phy_info_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 ityp_ptyp; + u32 misc_params; + u32 ftrdtl_exphydtl; + u32 fspeed_aspeed; + u32 future_use[2]; +}; + +enum { + OCRDMA_PHY_SPEED_ZERO = 0x0, + OCRDMA_PHY_SPEED_10MBPS = 0x1, + OCRDMA_PHY_SPEED_100MBPS = 0x2, + OCRDMA_PHY_SPEED_1GBPS = 0x4, + OCRDMA_PHY_SPEED_10GBPS = 0x8, + OCRDMA_PHY_SPEED_40GBPS = 0x20 +}; + +enum { + OCRDMA_PORT_NUM_MASK = 0x3F, + OCRDMA_PT_MASK = 0xC0, + OCRDMA_PT_SHIFT = 0x6, + OCRDMA_LINK_DUP_MASK = 0x0000FF00, + OCRDMA_LINK_DUP_SHIFT = 0x8, + OCRDMA_PHY_PS_MASK = 0x00FF0000, + OCRDMA_PHY_PS_SHIFT = 0x10, + OCRDMA_PHY_PFLT_MASK = 0xFF000000, + OCRDMA_PHY_PFLT_SHIFT = 0x18, + OCRDMA_QOS_LNKSP_MASK = 0xFFFF0000, + OCRDMA_QOS_LNKSP_SHIFT = 0x10, + OCRDMA_LINK_ST_MASK = 0x01, + OCRDMA_PLFC_MASK = 0x00000400, + OCRDMA_PLFC_SHIFT = 0x8, + OCRDMA_PLRFC_MASK = 0x00000200, + OCRDMA_PLRFC_SHIFT = 0x8, + OCRDMA_PLTFC_MASK = 0x00000100, + OCRDMA_PLTFC_SHIFT = 0x8 +}; + +struct ocrdma_get_link_speed_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 pflt_pps_ld_pnum; + u32 qos_lsp; + u32 res_lnk_st; +}; + +enum { + OCRDMA_PHYS_LINK_SPEED_ZERO = 0x0, + OCRDMA_PHYS_LINK_SPEED_10MBPS = 0x1, + OCRDMA_PHYS_LINK_SPEED_100MBPS = 0x2, + OCRDMA_PHYS_LINK_SPEED_1GBPS = 0x3, + OCRDMA_PHYS_LINK_SPEED_10GBPS = 0x4, + OCRDMA_PHYS_LINK_SPEED_20GBPS = 0x5, + OCRDMA_PHYS_LINK_SPEED_25GBPS = 0x6, + OCRDMA_PHYS_LINK_SPEED_40GBPS = 0x7, + OCRDMA_PHYS_LINK_SPEED_100GBPS = 0x8 +}; + +enum { + OCRDMA_CREATE_CQ_VER2 = 2, + OCRDMA_CREATE_CQ_VER3 = 3, + + OCRDMA_CREATE_CQ_PAGE_CNT_MASK = 0xFFFF, + OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT = 16, + OCRDMA_CREATE_CQ_PAGE_SIZE_MASK = 0xFF, + + OCRDMA_CREATE_CQ_COALESCWM_SHIFT = 12, + OCRDMA_CREATE_CQ_COALESCWM_MASK = BIT(13) | BIT(12), + OCRDMA_CREATE_CQ_FLAGS_NODELAY = BIT(14), + OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID = BIT(15), + + OCRDMA_CREATE_CQ_EQ_ID_MASK = 0xFFFF, + OCRDMA_CREATE_CQ_CQE_COUNT_MASK = 0xFFFF +}; + +enum { + OCRDMA_CREATE_CQ_VER0 = 0, + OCRDMA_CREATE_CQ_DPP = 1, + OCRDMA_CREATE_CQ_TYPE_SHIFT = 24, + OCRDMA_CREATE_CQ_EQID_SHIFT = 22, + + OCRDMA_CREATE_CQ_CNT_SHIFT = 27, + OCRDMA_CREATE_CQ_FLAGS_VALID = BIT(29), + OCRDMA_CREATE_CQ_FLAGS_EVENTABLE = BIT(31), + OCRDMA_CREATE_CQ_DEF_FLAGS = OCRDMA_CREATE_CQ_FLAGS_VALID | + OCRDMA_CREATE_CQ_FLAGS_EVENTABLE | + OCRDMA_CREATE_CQ_FLAGS_NODELAY +}; + +struct ocrdma_create_cq_cmd { + struct ocrdma_mbx_hdr req; + u32 pgsz_pgcnt; + u32 ev_cnt_flags; + u32 eqn; + u32 pdid_cqecnt; + u32 rsvd6; + struct ocrdma_pa pa[OCRDMA_CREATE_CQ_MAX_PAGES]; +}; + +struct ocrdma_create_cq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_create_cq_cmd cmd; +}; + +enum { + OCRDMA_CREATE_CQ_CMD_PDID_SHIFT = 0x10 +}; + +enum { + OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK = 0xFFFF +}; + +struct ocrdma_create_cq_cmd_rsp { + struct ocrdma_mbx_rsp rsp; + u32 cq_id; +}; + +struct ocrdma_create_cq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_create_cq_cmd_rsp rsp; +}; + +enum { + OCRDMA_CREATE_MQ_V0_CQ_ID_SHIFT = 22, + OCRDMA_CREATE_MQ_CQ_ID_SHIFT = 16, + OCRDMA_CREATE_MQ_RING_SIZE_SHIFT = 16, + OCRDMA_CREATE_MQ_VALID = BIT(31), + OCRDMA_CREATE_MQ_ASYNC_CQ_VALID = BIT(0) +}; + +struct ocrdma_create_mq_req { + struct ocrdma_mbx_hdr req; + u32 cqid_pages; + u32 async_event_bitmap; + u32 async_cqid_ringsize; + u32 valid; + u32 async_cqid_valid; + u32 rsvd; + struct ocrdma_pa pa[8]; +}; + +struct ocrdma_create_mq_rsp { 
+ struct ocrdma_mbx_rsp rsp; + u32 id; +}; + +enum { + OCRDMA_DESTROY_CQ_QID_SHIFT = 0, + OCRDMA_DESTROY_CQ_QID_MASK = 0xFFFF, + OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_SHIFT = 16, + OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_MASK = 0xFFFF << + OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_SHIFT +}; + +struct ocrdma_destroy_cq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 bypass_flush_qid; +}; + +struct ocrdma_destroy_cq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_QPT_GSI = 1, + OCRDMA_QPT_RC = 2, + OCRDMA_QPT_UD = 4, +}; + +enum { + OCRDMA_CREATE_QP_REQ_PD_ID_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_PD_ID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_SQ_PAGE_SIZE_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_RQ_PAGE_SIZE_SHIFT = 19, + OCRDMA_CREATE_QP_REQ_QPT_SHIFT = 29, + OCRDMA_CREATE_QP_REQ_QPT_MASK = BIT(31) | BIT(30) | BIT(29), + + OCRDMA_CREATE_QP_REQ_MAX_RQE_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_MAX_RQE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_WQE_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT, + + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT, + + OCRDMA_CREATE_QP_REQ_FMR_EN_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_FMR_EN_MASK = BIT(0), + OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_SHIFT = 1, + OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_MASK = BIT(1), + OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_SHIFT = 2, + OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_MASK = BIT(2), + OCRDMA_CREATE_QP_REQ_INB_WREN_SHIFT = 3, + OCRDMA_CREATE_QP_REQ_INB_WREN_MASK = BIT(3), + OCRDMA_CREATE_QP_REQ_INB_RDEN_SHIFT = 4, + OCRDMA_CREATE_QP_REQ_INB_RDEN_MASK = BIT(4), + OCRDMA_CREATE_QP_REQ_USE_SRQ_SHIFT = 5, + OCRDMA_CREATE_QP_REQ_USE_SRQ_MASK = BIT(5), + OCRDMA_CREATE_QP_REQ_ENABLE_RPIR_SHIFT = 6, + OCRDMA_CREATE_QP_REQ_ENABLE_RPIR_MASK = BIT(6), + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_SHIFT = 7, + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK = BIT(7), + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_CQ_SHIFT = 8, + OCRDMA_CREATE_QP_REQ_ENABLE_DPP_CQ_MASK = BIT(8), + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT, + + OCRDMA_CREATE_QP_REQ_MAX_IRD_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_MAX_IRD_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_MAX_ORD_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT, + + OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT, + + OCRDMA_CREATE_QP_REQ_RQE_SIZE_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_RQE_SIZE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_WQE_SIZE_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT, + + OCRDMA_CREATE_QP_REQ_RQ_CQID_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_WQ_CQID_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT, + + OCRDMA_CREATE_QP_REQ_DPP_CQPID_SHIFT = 0, + OCRDMA_CREATE_QP_REQ_DPP_CQPID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT = 16, + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_MASK = 0xFFFF << + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT +}; + +enum { + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT = 16, + OCRDMA_CREATE_QP_RSP_DPP_PAGE_SHIFT = 1 +}; + 
+#define MAX_OCRDMA_IRD_PAGES 4 + +enum ocrdma_qp_flags { + OCRDMA_QP_MW_BIND = 1, + OCRDMA_QP_LKEY0 = (1 << 1), + OCRDMA_QP_FAST_REG = (1 << 2), + OCRDMA_QP_INB_RD = (1 << 6), + OCRDMA_QP_INB_WR = (1 << 7), +}; + +enum ocrdma_qp_state { + OCRDMA_QPS_RST = 0, + OCRDMA_QPS_INIT = 1, + OCRDMA_QPS_RTR = 2, + OCRDMA_QPS_RTS = 3, + OCRDMA_QPS_SQE = 4, + OCRDMA_QPS_SQ_DRAINING = 5, + OCRDMA_QPS_ERR = 6, + OCRDMA_QPS_SQD = 7 +}; + +struct ocrdma_create_qp_req { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 type_pgsz_pdn; + u32 max_wqe_rqe; + u32 max_sge_send_write; + u32 max_sge_recv_flags; + u32 max_ord_ird; + u32 num_wq_rq_pages; + u32 wqe_rqe_size; + u32 wq_rq_cqid; + struct ocrdma_pa wq_addr[MAX_OCRDMA_QP_PAGES]; + struct ocrdma_pa rq_addr[MAX_OCRDMA_QP_PAGES]; + u32 dpp_credits_cqid; + u32 rpir_lkey; + struct ocrdma_pa ird_addr[MAX_OCRDMA_IRD_PAGES]; +}; + +enum { + OCRDMA_CREATE_QP_RSP_QP_ID_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_QP_ID_MASK = 0xFFFF, + + OCRDMA_CREATE_QP_RSP_MAX_RQE_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_MAX_RQE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_WQE_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT, + + OCRDMA_CREATE_QP_RSP_MAX_SGE_WRITE_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_MAX_SGE_WRITE_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_SHIFT, + + OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_SHIFT, + + OCRDMA_CREATE_QP_RSP_MAX_IRD_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_MAX_IRD_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_MAX_ORD_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT, + + OCRDMA_CREATE_QP_RSP_RQ_ID_SHIFT = 0, + OCRDMA_CREATE_QP_RSP_RQ_ID_MASK = 0xFFFF, + OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_SQ_ID_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT, + + OCRDMA_CREATE_QP_RSP_DPP_ENABLED_MASK = BIT(0), + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT = 1, + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK = 0x7FFF << + OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT, + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT = 16, + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_MASK = 0xFFFF << + OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT, +}; + +struct ocrdma_create_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 qp_id; + u32 max_wqe_rqe; + u32 max_sge_send_write; + u32 max_sge_recv; + u32 max_ord_ird; + u32 sq_rq_id; + u32 dpp_response; +}; + +struct ocrdma_destroy_qp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 qp_id; +}; + +struct ocrdma_destroy_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_MODIFY_QP_ID_SHIFT = 0, + OCRDMA_MODIFY_QP_ID_MASK = 0xFFFF, + + OCRDMA_QP_PARA_QPS_VALID = BIT(0), + OCRDMA_QP_PARA_SQD_ASYNC_VALID = BIT(1), + OCRDMA_QP_PARA_PKEY_VALID = BIT(2), + OCRDMA_QP_PARA_QKEY_VALID = BIT(3), + OCRDMA_QP_PARA_PMTU_VALID = BIT(4), + OCRDMA_QP_PARA_ACK_TO_VALID = BIT(5), + OCRDMA_QP_PARA_RETRY_CNT_VALID = BIT(6), + OCRDMA_QP_PARA_RRC_VALID = BIT(7), + OCRDMA_QP_PARA_RQPSN_VALID = BIT(8), + OCRDMA_QP_PARA_MAX_IRD_VALID = BIT(9), + OCRDMA_QP_PARA_MAX_ORD_VALID = BIT(10), + OCRDMA_QP_PARA_RNT_VALID = BIT(11), + OCRDMA_QP_PARA_SQPSN_VALID = BIT(12), + OCRDMA_QP_PARA_DST_QPN_VALID = BIT(13), + OCRDMA_QP_PARA_MAX_WQE_VALID = BIT(14), + OCRDMA_QP_PARA_MAX_RQE_VALID = BIT(15), + OCRDMA_QP_PARA_SGE_SEND_VALID = 
BIT(16), + OCRDMA_QP_PARA_SGE_RECV_VALID = BIT(17), + OCRDMA_QP_PARA_SGE_WR_VALID = BIT(18), + OCRDMA_QP_PARA_INB_RDEN_VALID = BIT(19), + OCRDMA_QP_PARA_INB_WREN_VALID = BIT(20), + OCRDMA_QP_PARA_FLOW_LBL_VALID = BIT(21), + OCRDMA_QP_PARA_BIND_EN_VALID = BIT(22), + OCRDMA_QP_PARA_ZLKEY_EN_VALID = BIT(23), + OCRDMA_QP_PARA_FMR_EN_VALID = BIT(24), + OCRDMA_QP_PARA_INBAT_EN_VALID = BIT(25), + OCRDMA_QP_PARA_VLAN_EN_VALID = BIT(26), + + OCRDMA_MODIFY_QP_FLAGS_RD = BIT(0), + OCRDMA_MODIFY_QP_FLAGS_WR = BIT(1), + OCRDMA_MODIFY_QP_FLAGS_SEND = BIT(2), + OCRDMA_MODIFY_QP_FLAGS_ATOMIC = BIT(3) +}; + +enum { + OCRDMA_QP_PARAMS_SRQ_ID_SHIFT = 0, + OCRDMA_QP_PARAMS_SRQ_ID_MASK = 0xFFFF, + + OCRDMA_QP_PARAMS_MAX_RQE_SHIFT = 0, + OCRDMA_QP_PARAMS_MAX_RQE_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_MAX_WQE_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_WQE_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_WQE_SHIFT, + + OCRDMA_QP_PARAMS_MAX_SGE_WRITE_SHIFT = 0, + OCRDMA_QP_PARAMS_MAX_SGE_WRITE_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_MAX_SGE_SEND_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_SGE_SEND_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_SGE_SEND_SHIFT, + + OCRDMA_QP_PARAMS_FLAGS_FMR_EN = BIT(0), + OCRDMA_QP_PARAMS_FLAGS_LKEY_0_EN = BIT(1), + OCRDMA_QP_PARAMS_FLAGS_BIND_MW_EN = BIT(2), + OCRDMA_QP_PARAMS_FLAGS_INBWR_EN = BIT(3), + OCRDMA_QP_PARAMS_FLAGS_INBRD_EN = BIT(4), + OCRDMA_QP_PARAMS_STATE_SHIFT = 5, + OCRDMA_QP_PARAMS_STATE_MASK = BIT(5) | BIT(6) | BIT(7), + OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC = BIT(8), + OCRDMA_QP_PARAMS_FLAGS_INB_ATEN = BIT(9), + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_SHIFT = 11, + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_MASK = BIT(11) | BIT(12) | BIT(13), + OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT, + + OCRDMA_QP_PARAMS_MAX_IRD_SHIFT = 0, + OCRDMA_QP_PARAMS_MAX_IRD_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_MAX_ORD_SHIFT = 16, + OCRDMA_QP_PARAMS_MAX_ORD_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_MAX_ORD_SHIFT, + + OCRDMA_QP_PARAMS_RQ_CQID_SHIFT = 0, + OCRDMA_QP_PARAMS_RQ_CQID_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_WQ_CQID_SHIFT = 16, + OCRDMA_QP_PARAMS_WQ_CQID_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_WQ_CQID_SHIFT, + + OCRDMA_QP_PARAMS_RQ_PSN_SHIFT = 0, + OCRDMA_QP_PARAMS_RQ_PSN_MASK = 0xFFFFFF, + OCRDMA_QP_PARAMS_HOP_LMT_SHIFT = 24, + OCRDMA_QP_PARAMS_HOP_LMT_MASK = 0xFF << + OCRDMA_QP_PARAMS_HOP_LMT_SHIFT, + + OCRDMA_QP_PARAMS_SQ_PSN_SHIFT = 0, + OCRDMA_QP_PARAMS_SQ_PSN_MASK = 0xFFFFFF, + OCRDMA_QP_PARAMS_TCLASS_SHIFT = 24, + OCRDMA_QP_PARAMS_TCLASS_MASK = 0xFF << + OCRDMA_QP_PARAMS_TCLASS_SHIFT, + + OCRDMA_QP_PARAMS_DEST_QPN_SHIFT = 0, + OCRDMA_QP_PARAMS_DEST_QPN_MASK = 0xFFFFFF, + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT = 24, + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK = 0x7 << + OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT, + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT = 27, + OCRDMA_QP_PARAMS_ACK_TIMEOUT_MASK = 0x1F << + OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT, + + OCRDMA_QP_PARAMS_PKEY_IDNEX_SHIFT = 0, + OCRDMA_QP_PARAMS_PKEY_INDEX_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT = 18, + OCRDMA_QP_PARAMS_PATH_MTU_MASK = 0x3FFF << + OCRDMA_QP_PARAMS_PATH_MTU_SHIFT, + + OCRDMA_QP_PARAMS_FLOW_LABEL_SHIFT = 0, + OCRDMA_QP_PARAMS_FLOW_LABEL_MASK = 0xFFFFF, + OCRDMA_QP_PARAMS_SL_SHIFT = 20, + OCRDMA_QP_PARAMS_SL_MASK = 0xF << + OCRDMA_QP_PARAMS_SL_SHIFT, + OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT = 24, + OCRDMA_QP_PARAMS_RETRY_CNT_MASK = 0x7 << + OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT, + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT = 27, + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_MASK = 0x1F << + OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT, + + 
OCRDMA_QP_PARAMS_DMAC_B4_TO_B5_SHIFT = 0, + OCRDMA_QP_PARAMS_DMAC_B4_TO_B5_MASK = 0xFFFF, + OCRDMA_QP_PARAMS_VLAN_SHIFT = 16, + OCRDMA_QP_PARAMS_VLAN_MASK = 0xFFFF << + OCRDMA_QP_PARAMS_VLAN_SHIFT +}; + +struct ocrdma_qp_params { + u32 id; + u32 max_wqe_rqe; + u32 max_sge_send_write; + u32 max_sge_recv_flags; + u32 max_ord_ird; + u32 wq_rq_cqid; + u32 hop_lmt_rq_psn; + u32 tclass_sq_psn; + u32 ack_to_rnr_rtc_dest_qpn; + u32 path_mtu_pkey_indx; + u32 rnt_rc_sl_fl; + u8 sgid[16]; + u8 dgid[16]; + u32 dmac_b0_to_b3; + u32 vlan_dmac_b4_to_b5; + u32 qkey; +}; + + +struct ocrdma_modify_qp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + struct ocrdma_qp_params params; + u32 flags; + u32 rdma_flags; + u32 num_outstanding_atomic_rd; +}; + +enum { + OCRDMA_MODIFY_QP_RSP_MAX_RQE_SHIFT = 0, + OCRDMA_MODIFY_QP_RSP_MAX_RQE_MASK = 0xFFFF, + OCRDMA_MODIFY_QP_RSP_MAX_WQE_SHIFT = 16, + OCRDMA_MODIFY_QP_RSP_MAX_WQE_MASK = 0xFFFF << + OCRDMA_MODIFY_QP_RSP_MAX_WQE_SHIFT, + + OCRDMA_MODIFY_QP_RSP_MAX_IRD_SHIFT = 0, + OCRDMA_MODIFY_QP_RSP_MAX_IRD_MASK = 0xFFFF, + OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT = 16, + OCRDMA_MODIFY_QP_RSP_MAX_ORD_MASK = 0xFFFF << + OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT +}; + +struct ocrdma_modify_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 max_wqe_rqe; + u32 max_ord_ird; +}; + +struct ocrdma_query_qp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + +#define OCRDMA_QUERY_UP_QP_ID_SHIFT 0 +#define OCRDMA_QUERY_UP_QP_ID_MASK 0xFFFFFF + u32 qp_id; +}; + +struct ocrdma_query_qp_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + struct ocrdma_qp_params params; + u32 dpp_credits_cqid; + u32 rbq_id; +}; + +enum { + OCRDMA_CREATE_SRQ_PD_ID_SHIFT = 0, + OCRDMA_CREATE_SRQ_PD_ID_MASK = 0xFFFF, + OCRDMA_CREATE_SRQ_PG_SZ_SHIFT = 16, + OCRDMA_CREATE_SRQ_PG_SZ_MASK = 0x3 << + OCRDMA_CREATE_SRQ_PG_SZ_SHIFT, + + OCRDMA_CREATE_SRQ_MAX_RQE_SHIFT = 0, + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT = 16, + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_MASK = 0xFFFF << + OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT, + + OCRDMA_CREATE_SRQ_RQE_SIZE_SHIFT = 0, + OCRDMA_CREATE_SRQ_RQE_SIZE_MASK = 0xFFFF, + OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT = 16, + OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_MASK = 0xFFFF << + OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT +}; + +struct ocrdma_create_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 pgsz_pdid; + u32 max_sge_rqe; + u32 pages_rqe_sz; + struct ocrdma_pa rq_addr[MAX_OCRDMA_SRQ_PAGES]; +}; + +enum { + OCRDMA_CREATE_SRQ_RSP_SRQ_ID_SHIFT = 0, + OCRDMA_CREATE_SRQ_RSP_SRQ_ID_MASK = 0xFFFFFF, + + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_SHIFT = 0, + OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_MASK = 0xFFFF, + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT = 16, + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_MASK = 0xFFFF << + OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT +}; + +struct ocrdma_create_srq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 id; + u32 max_sge_rqe_allocated; +}; + +enum { + OCRDMA_MODIFY_SRQ_ID_SHIFT = 0, + OCRDMA_MODIFY_SRQ_ID_MASK = 0xFFFFFF, + + OCRDMA_MODIFY_SRQ_MAX_RQE_SHIFT = 0, + OCRDMA_MODIFY_SRQ_MAX_RQE_MASK = 0xFFFF, + OCRDMA_MODIFY_SRQ_LIMIT_SHIFT = 16, + OCRDMA_MODIFY_SRQ__LIMIT_MASK = 0xFFFF << + OCRDMA_MODIFY_SRQ_LIMIT_SHIFT +}; + +struct ocrdma_modify_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rep; + + u32 id; + u32 limit_max_rqe; +}; + +enum { + OCRDMA_QUERY_SRQ_ID_SHIFT = 0, + OCRDMA_QUERY_SRQ_ID_MASK = 0xFFFFFF +}; + +struct ocrdma_query_srq { + struct 
ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp req; + + u32 id; +}; + +enum { + OCRDMA_QUERY_SRQ_RSP_PD_ID_SHIFT = 0, + OCRDMA_QUERY_SRQ_RSP_PD_ID_MASK = 0xFFFF, + OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT = 16, + OCRDMA_QUERY_SRQ_RSP_MAX_RQE_MASK = 0xFFFF << + OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT, + + OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_SHIFT = 0, + OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_MASK = 0xFFFF, + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT = 16, + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_MASK = 0xFFFF << + OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT +}; + +struct ocrdma_query_srq_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp req; + + u32 max_rqe_pdid; + u32 srq_lmt_max_sge; +}; + +enum { + OCRDMA_DESTROY_SRQ_ID_SHIFT = 0, + OCRDMA_DESTROY_SRQ_ID_MASK = 0xFFFFFF +}; + +struct ocrdma_destroy_srq { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp req; + + u32 id; +}; + +enum { + OCRDMA_ALLOC_PD_ENABLE_DPP = BIT(16), + OCRDMA_DPP_PAGE_SIZE = 4096 +}; + +struct ocrdma_alloc_pd { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 enable_dpp_rsvd; +}; + +enum { + OCRDMA_ALLOC_PD_RSP_DPP = BIT(16), + OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT = 20, + OCRDMA_ALLOC_PD_RSP_PDID_MASK = 0xFFFF, +}; + +struct ocrdma_alloc_pd_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 dpp_page_pdid; +}; + +struct ocrdma_dealloc_pd { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 id; +}; + +struct ocrdma_dealloc_pd_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +struct ocrdma_alloc_pd_range { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 enable_dpp_rsvd; + u32 pd_count; +}; + +struct ocrdma_alloc_pd_range_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 dpp_page_pdid; + u32 pd_count; +}; + +enum { + OCRDMA_ALLOC_PD_RNG_RSP_START_PDID_MASK = 0xFFFF, +}; + +struct ocrdma_dealloc_pd_range { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 start_pd_id; + u32 pd_count; +}; + +struct ocrdma_dealloc_pd_range_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 rsvd; +}; + +enum { + OCRDMA_ADDR_CHECK_ENABLE = 1, + OCRDMA_ADDR_CHECK_DISABLE = 0 +}; + +enum { + OCRDMA_ALLOC_LKEY_PD_ID_SHIFT = 0, + OCRDMA_ALLOC_LKEY_PD_ID_MASK = 0xFFFF, + + OCRDMA_ALLOC_LKEY_ADDR_CHECK_SHIFT = 0, + OCRDMA_ALLOC_LKEY_ADDR_CHECK_MASK = BIT(0), + OCRDMA_ALLOC_LKEY_FMR_SHIFT = 1, + OCRDMA_ALLOC_LKEY_FMR_MASK = BIT(1), + OCRDMA_ALLOC_LKEY_REMOTE_INV_SHIFT = 2, + OCRDMA_ALLOC_LKEY_REMOTE_INV_MASK = BIT(2), + OCRDMA_ALLOC_LKEY_REMOTE_WR_SHIFT = 3, + OCRDMA_ALLOC_LKEY_REMOTE_WR_MASK = BIT(3), + OCRDMA_ALLOC_LKEY_REMOTE_RD_SHIFT = 4, + OCRDMA_ALLOC_LKEY_REMOTE_RD_MASK = BIT(4), + OCRDMA_ALLOC_LKEY_LOCAL_WR_SHIFT = 5, + OCRDMA_ALLOC_LKEY_LOCAL_WR_MASK = BIT(5), + OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_MASK = BIT(6), + OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_SHIFT = 6, + OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT = 16, + OCRDMA_ALLOC_LKEY_PBL_SIZE_MASK = 0xFFFF << + OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT +}; + +struct ocrdma_alloc_lkey { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 pdid; + u32 pbl_sz_flags; +}; + +struct ocrdma_alloc_lkey_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey; + u32 num_pbl_rsvd; +}; + +struct ocrdma_dealloc_lkey { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 lkey; + u32 rsvd_frmr; +}; + +struct ocrdma_dealloc_lkey_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +#define MAX_OCRDMA_NSMR_PBL (u32)22 +#define MAX_OCRDMA_PBL_SIZE 65536 +#define 
MAX_OCRDMA_PBL_PER_LKEY 32767 + +enum { + OCRDMA_REG_NSMR_LRKEY_INDEX_SHIFT = 0, + OCRDMA_REG_NSMR_LRKEY_INDEX_MASK = 0xFFFFFF, + OCRDMA_REG_NSMR_LRKEY_SHIFT = 24, + OCRDMA_REG_NSMR_LRKEY_MASK = 0xFF << + OCRDMA_REG_NSMR_LRKEY_SHIFT, + + OCRDMA_REG_NSMR_PD_ID_SHIFT = 0, + OCRDMA_REG_NSMR_PD_ID_MASK = 0xFFFF, + OCRDMA_REG_NSMR_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_NUM_PBL_MASK = 0xFFFF << + OCRDMA_REG_NSMR_NUM_PBL_SHIFT, + + OCRDMA_REG_NSMR_PBE_SIZE_SHIFT = 0, + OCRDMA_REG_NSMR_PBE_SIZE_MASK = 0xFFFF, + OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT = 16, + OCRDMA_REG_NSMR_HPAGE_SIZE_MASK = 0xFF << + OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT, + OCRDMA_REG_NSMR_BIND_MEMWIN_SHIFT = 24, + OCRDMA_REG_NSMR_BIND_MEMWIN_MASK = BIT(24), + OCRDMA_REG_NSMR_ZB_SHIFT = 25, + OCRDMA_REG_NSMR_ZB_SHIFT_MASK = BIT(25), + OCRDMA_REG_NSMR_REMOTE_INV_SHIFT = 26, + OCRDMA_REG_NSMR_REMOTE_INV_MASK = BIT(26), + OCRDMA_REG_NSMR_REMOTE_WR_SHIFT = 27, + OCRDMA_REG_NSMR_REMOTE_WR_MASK = BIT(27), + OCRDMA_REG_NSMR_REMOTE_RD_SHIFT = 28, + OCRDMA_REG_NSMR_REMOTE_RD_MASK = BIT(28), + OCRDMA_REG_NSMR_LOCAL_WR_SHIFT = 29, + OCRDMA_REG_NSMR_LOCAL_WR_MASK = BIT(29), + OCRDMA_REG_NSMR_REMOTE_ATOMIC_SHIFT = 30, + OCRDMA_REG_NSMR_REMOTE_ATOMIC_MASK = BIT(30), + OCRDMA_REG_NSMR_LAST_SHIFT = 31, + OCRDMA_REG_NSMR_LAST_MASK = BIT(31) +}; + +struct ocrdma_reg_nsmr { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr cmd; + + u32 fr_mr; + u32 num_pbl_pdid; + u32 flags_hpage_pbe_sz; + u32 totlen_low; + u32 totlen_high; + u32 fbo_low; + u32 fbo_high; + u32 va_loaddr; + u32 va_hiaddr; + struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL]; +}; + +enum { + OCRDMA_REG_NSMR_CONT_PBL_SHIFT = 0, + OCRDMA_REG_NSMR_CONT_PBL_SHIFT_MASK = 0xFFFF, + OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_CONT_NUM_PBL_MASK = 0xFFFF << + OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT, + + OCRDMA_REG_NSMR_CONT_LAST_SHIFT = 31, + OCRDMA_REG_NSMR_CONT_LAST_MASK = BIT(31) +}; + +struct ocrdma_reg_nsmr_cont { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr cmd; + + u32 lrkey; + u32 num_pbl_offset; + u32 last; + + struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL]; +}; + +struct ocrdma_pbe { + u32 pa_hi; + u32 pa_lo; +}; + +enum { + OCRDMA_REG_NSMR_RSP_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_RSP_NUM_PBL_MASK = 0xFFFF0000 +}; +struct ocrdma_reg_nsmr_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey; + u32 num_pbl; +}; + +enum { + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_SHIFT = 0, + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_MASK = 0xFFFFFF, + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_SHIFT = 24, + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_MASK = 0xFF << + OCRDMA_REG_NSMR_CONT_RSP_LRKEY_SHIFT, + + OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_SHIFT = 16, + OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_MASK = 0xFFFF << + OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_SHIFT +}; + +struct ocrdma_reg_nsmr_cont_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey_key_index; + u32 num_pbl; +}; + +enum { + OCRDMA_ALLOC_MW_PD_ID_SHIFT = 0, + OCRDMA_ALLOC_MW_PD_ID_MASK = 0xFFFF +}; + +struct ocrdma_alloc_mw { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 pdid; +}; + +enum { + OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_SHIFT = 0, + OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_MASK = 0xFFFFFF +}; + +struct ocrdma_alloc_mw_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + + u32 lrkey_index; +}; + +struct ocrdma_attach_mcast { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 qp_id; + u8 mgid[16]; + u32 mac_b0_to_b3; + u32 vlan_mac_b4_to_b5; +}; + +struct ocrdma_attach_mcast_rsp { + struct ocrdma_mqe_hdr 
hdr; + struct ocrdma_mbx_rsp rsp; +}; + +struct ocrdma_detach_mcast { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 qp_id; + u8 mgid[16]; + u32 mac_b0_to_b3; + u32 vlan_mac_b4_to_b5; +}; + +struct ocrdma_detach_mcast_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_CREATE_AH_NUM_PAGES_SHIFT = 19, + OCRDMA_CREATE_AH_NUM_PAGES_MASK = 0xF << + OCRDMA_CREATE_AH_NUM_PAGES_SHIFT, + + OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT = 16, + OCRDMA_CREATE_AH_PAGE_SIZE_MASK = 0x7 << + OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT, + + OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT = 23, + OCRDMA_CREATE_AH_ENTRY_SIZE_MASK = 0x1FF << + OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT, +}; + +#define OCRDMA_AH_TBL_PAGES 8 + +struct ocrdma_create_ah_tbl { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + + u32 ah_conf; + struct ocrdma_pa tbl_addr[8]; +}; + +struct ocrdma_create_ah_tbl_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; + u32 ahid; +}; + +struct ocrdma_delete_ah_tbl { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_hdr req; + u32 ahid; +}; + +struct ocrdma_delete_ah_tbl_rsp { + struct ocrdma_mqe_hdr hdr; + struct ocrdma_mbx_rsp rsp; +}; + +enum { + OCRDMA_EQE_VALID_SHIFT = 0, + OCRDMA_EQE_VALID_MASK = BIT(0), + OCRDMA_EQE_MAJOR_CODE_MASK = 0x0E, + OCRDMA_EQE_MAJOR_CODE_SHIFT = 0x01, + OCRDMA_EQE_FOR_CQE_MASK = 0xFFFE, + OCRDMA_EQE_RESOURCE_ID_SHIFT = 16, + OCRDMA_EQE_RESOURCE_ID_MASK = 0xFFFF << + OCRDMA_EQE_RESOURCE_ID_SHIFT, +}; + +enum major_code { + OCRDMA_MAJOR_CODE_COMPLETION = 0x00, + OCRDMA_MAJOR_CODE_SENTINAL = 0x01 +}; + +struct ocrdma_eqe { + u32 id_valid; +}; + +enum OCRDMA_CQE_STATUS { + OCRDMA_CQE_SUCCESS = 0, + OCRDMA_CQE_LOC_LEN_ERR, + OCRDMA_CQE_LOC_QP_OP_ERR, + OCRDMA_CQE_LOC_EEC_OP_ERR, + OCRDMA_CQE_LOC_PROT_ERR, + OCRDMA_CQE_WR_FLUSH_ERR, + OCRDMA_CQE_MW_BIND_ERR, + OCRDMA_CQE_BAD_RESP_ERR, + OCRDMA_CQE_LOC_ACCESS_ERR, + OCRDMA_CQE_REM_INV_REQ_ERR, + OCRDMA_CQE_REM_ACCESS_ERR, + OCRDMA_CQE_REM_OP_ERR, + OCRDMA_CQE_RETRY_EXC_ERR, + OCRDMA_CQE_RNR_RETRY_EXC_ERR, + OCRDMA_CQE_LOC_RDD_VIOL_ERR, + OCRDMA_CQE_REM_INV_RD_REQ_ERR, + OCRDMA_CQE_REM_ABORT_ERR, + OCRDMA_CQE_INV_EECN_ERR, + OCRDMA_CQE_INV_EEC_STATE_ERR, + OCRDMA_CQE_FATAL_ERR, + OCRDMA_CQE_RESP_TIMEOUT_ERR, + OCRDMA_CQE_GENERAL_ERR, + + OCRDMA_MAX_CQE_ERR +}; + +enum { + /* w0 */ + OCRDMA_CQE_WQEIDX_SHIFT = 0, + OCRDMA_CQE_WQEIDX_MASK = 0xFFFF, + + /* w1 */ + OCRDMA_CQE_UD_XFER_LEN_SHIFT = 16, + OCRDMA_CQE_UD_XFER_LEN_MASK = 0x1FFF, + OCRDMA_CQE_PKEY_SHIFT = 0, + OCRDMA_CQE_PKEY_MASK = 0xFFFF, + OCRDMA_CQE_UD_L3TYPE_SHIFT = 29, + OCRDMA_CQE_UD_L3TYPE_MASK = 0x07, + + /* w2 */ + OCRDMA_CQE_QPN_SHIFT = 0, + OCRDMA_CQE_QPN_MASK = 0x0000FFFF, + + OCRDMA_CQE_BUFTAG_SHIFT = 16, + OCRDMA_CQE_BUFTAG_MASK = 0xFFFF << OCRDMA_CQE_BUFTAG_SHIFT, + + /* w3 */ + OCRDMA_CQE_UD_STATUS_SHIFT = 24, + OCRDMA_CQE_UD_STATUS_MASK = 0x7 << OCRDMA_CQE_UD_STATUS_SHIFT, + OCRDMA_CQE_STATUS_SHIFT = 16, + OCRDMA_CQE_STATUS_MASK = 0xFF << OCRDMA_CQE_STATUS_SHIFT, + OCRDMA_CQE_VALID = BIT(31), + OCRDMA_CQE_INVALIDATE = BIT(30), + OCRDMA_CQE_QTYPE = BIT(29), + OCRDMA_CQE_IMM = BIT(28), + OCRDMA_CQE_WRITE_IMM = BIT(27), + OCRDMA_CQE_QTYPE_SQ = 0, + OCRDMA_CQE_QTYPE_RQ = 1, + OCRDMA_CQE_SRCQP_MASK = 0xFFFFFF +}; + +struct ocrdma_cqe { + union { + /* w0 to w2 */ + struct { + u32 wqeidx; + u32 bytes_xfered; + u32 qpn; + } wq; + struct { + u32 lkey_immdt; + u32 rxlen; + u32 buftag_qpn; + } rq; + struct { + u32 lkey_immdt; + u32 rxlen_pkey; + u32 buftag_qpn; + } ud; + struct { + u32 word_0; + u32 word_1; + u32 qpn; + } cmn; + 
}; + u32 flags_status_srcqpn; /* w3 */ +}; + +struct ocrdma_sge { + u32 addr_hi; + u32 addr_lo; + u32 lrkey; + u32 len; +}; + +enum { + OCRDMA_FLAG_SIG = 0x1, + OCRDMA_FLAG_INV = 0x2, + OCRDMA_FLAG_FENCE_L = 0x4, + OCRDMA_FLAG_FENCE_R = 0x8, + OCRDMA_FLAG_SOLICIT = 0x10, + OCRDMA_FLAG_IMM = 0x20, + OCRDMA_FLAG_AH_VLAN_PR = 0x40, + + /* Stag flags */ + OCRDMA_LKEY_FLAG_LOCAL_WR = 0x1, + OCRDMA_LKEY_FLAG_REMOTE_RD = 0x2, + OCRDMA_LKEY_FLAG_REMOTE_WR = 0x4, + OCRDMA_LKEY_FLAG_VATO = 0x8, +}; + +enum OCRDMA_WQE_OPCODE { + OCRDMA_WRITE = 0x06, + OCRDMA_READ = 0x0C, + OCRDMA_RESV0 = 0x02, + OCRDMA_SEND = 0x00, + OCRDMA_CMP_SWP = 0x14, + OCRDMA_BIND_MW = 0x10, + OCRDMA_FR_MR = 0x11, + OCRDMA_RESV1 = 0x0A, + OCRDMA_LKEY_INV = 0x15, + OCRDMA_FETCH_ADD = 0x13, + OCRDMA_POST_RQ = 0x12 +}; + +enum { + OCRDMA_TYPE_INLINE = 0x0, + OCRDMA_TYPE_LKEY = 0x1, +}; + +enum { + OCRDMA_WQE_OPCODE_SHIFT = 0, + OCRDMA_WQE_OPCODE_MASK = 0x0000001F, + OCRDMA_WQE_FLAGS_SHIFT = 5, + OCRDMA_WQE_TYPE_SHIFT = 16, + OCRDMA_WQE_TYPE_MASK = 0x00030000, + OCRDMA_WQE_SIZE_SHIFT = 18, + OCRDMA_WQE_SIZE_MASK = 0xFF, + OCRDMA_WQE_NXT_WQE_SIZE_SHIFT = 25, + + OCRDMA_WQE_LKEY_FLAGS_SHIFT = 0, + OCRDMA_WQE_LKEY_FLAGS_MASK = 0xF +}; + +/* header WQE for all the SQ and RQ operations */ +struct ocrdma_hdr_wqe { + u32 cw; + union { + u32 rsvd_tag; + u32 rsvd_lkey_flags; + }; + union { + u32 immdt; + u32 lkey; + }; + u32 total_len; +}; + +struct ocrdma_ewqe_ud_hdr { + u32 rsvd_dest_qpn; + u32 qkey; + u32 rsvd_ahid; + u32 hdr_type; +}; + +/* extended wqe followed by hdr_wqe for Fast Memory register */ +struct ocrdma_ewqe_fr { + u32 va_hi; + u32 va_lo; + u32 fbo_hi; + u32 fbo_lo; + u32 size_sge; + u32 num_sges; + u32 rsvd; + u32 rsvd2; +}; + +struct ocrdma_eth_basic { + u8 dmac[6]; + u8 smac[6]; + __be16 eth_type; +} __packed; + +struct ocrdma_eth_vlan { + u8 dmac[6]; + u8 smac[6]; + __be16 eth_type; + __be16 vlan_tag; + __be16 roce_eth_type; +} __packed; + +struct ocrdma_grh { + __be32 tclass_flow; + __be32 pdid_hoplimit; + u8 sgid[16]; + u8 dgid[16]; + u16 rsvd; +} __packed; + +#define OCRDMA_AV_VALID BIT(7) +#define OCRDMA_AV_VLAN_VALID BIT(1) + +struct ocrdma_av { + struct ocrdma_eth_vlan eth_hdr; + struct ocrdma_grh grh; + u32 valid; +} __packed; + +struct ocrdma_rsrc_stats { + u32 dpp_pds; + u32 non_dpp_pds; + u32 rc_dpp_qps; + u32 uc_dpp_qps; + u32 ud_dpp_qps; + u32 rc_non_dpp_qps; + u32 rsvd; + u32 uc_non_dpp_qps; + u32 ud_non_dpp_qps; + u32 rsvd1; + u32 srqs; + u32 rbqs; + u32 r64K_nsmr; + u32 r64K_to_2M_nsmr; + u32 r2M_to_44M_nsmr; + u32 r44M_to_1G_nsmr; + u32 r1G_to_4G_nsmr; + u32 nsmr_count_4G_to_32G; + u32 r32G_to_64G_nsmr; + u32 r64G_to_128G_nsmr; + u32 r128G_to_higher_nsmr; + u32 embedded_nsmr; + u32 frmr; + u32 prefetch_qps; + u32 ondemand_qps; + u32 phy_mr; + u32 mw; + u32 rsvd2[7]; +}; + +struct ocrdma_db_err_stats { + u32 sq_doorbell_errors; + u32 cq_doorbell_errors; + u32 rq_srq_doorbell_errors; + u32 cq_overflow_errors; + u32 rsvd[4]; +}; + +struct ocrdma_wqe_stats { + u32 large_send_rc_wqes_lo; + u32 large_send_rc_wqes_hi; + u32 large_write_rc_wqes_lo; + u32 large_write_rc_wqes_hi; + u32 rsvd[4]; + u32 read_wqes_lo; + u32 read_wqes_hi; + u32 frmr_wqes_lo; + u32 frmr_wqes_hi; + u32 mw_bind_wqes_lo; + u32 mw_bind_wqes_hi; + u32 invalidate_wqes_lo; + u32 invalidate_wqes_hi; + u32 rsvd1[2]; + u32 dpp_wqe_drops; + u32 rsvd2[5]; +}; + +struct ocrdma_tx_stats { + u32 send_pkts_lo; + u32 send_pkts_hi; + u32 write_pkts_lo; + u32 write_pkts_hi; + u32 read_pkts_lo; + u32 read_pkts_hi; + u32 read_rsp_pkts_lo; + u32 
read_rsp_pkts_hi; + u32 ack_pkts_lo; + u32 ack_pkts_hi; + u32 send_bytes_lo; + u32 send_bytes_hi; + u32 write_bytes_lo; + u32 write_bytes_hi; + u32 read_req_bytes_lo; + u32 read_req_bytes_hi; + u32 read_rsp_bytes_lo; + u32 read_rsp_bytes_hi; + u32 ack_timeouts; + u32 rsvd[5]; +}; + + +struct ocrdma_tx_qp_err_stats { + u32 local_length_errors; + u32 local_protection_errors; + u32 local_qp_operation_errors; + u32 retry_count_exceeded_errors; + u32 rnr_retry_count_exceeded_errors; + u32 rsvd[3]; +}; + +struct ocrdma_rx_stats { + u32 roce_frame_bytes_lo; + u32 roce_frame_bytes_hi; + u32 roce_frame_icrc_drops; + u32 roce_frame_payload_len_drops; + u32 ud_drops; + u32 qp1_drops; + u32 psn_error_request_packets; + u32 psn_error_resp_packets; + u32 rnr_nak_timeouts; + u32 rnr_nak_receives; + u32 roce_frame_rxmt_drops; + u32 nak_count_psn_sequence_errors; + u32 rc_drop_count_lookup_errors; + u32 rq_rnr_naks; + u32 srq_rnr_naks; + u32 roce_frames_lo; + u32 roce_frames_hi; + u32 rsvd; +}; + +struct ocrdma_rx_qp_err_stats { + u32 nak_invalid_requst_errors; + u32 nak_remote_operation_errors; + u32 nak_count_remote_access_errors; + u32 local_length_errors; + u32 local_protection_errors; + u32 local_qp_operation_errors; + u32 rsvd[2]; +}; + +struct ocrdma_tx_dbg_stats { + u32 data[100]; +}; + +struct ocrdma_rx_dbg_stats { + u32 data[200]; +}; + +struct ocrdma_rdma_stats_req { + struct ocrdma_mbx_hdr hdr; + u8 reset_stats; + u8 rsvd[3]; +} __packed; + +struct ocrdma_rdma_stats_resp { + struct ocrdma_mbx_hdr hdr; + struct ocrdma_rsrc_stats act_rsrc_stats; + struct ocrdma_rsrc_stats th_rsrc_stats; + struct ocrdma_db_err_stats db_err_stats; + struct ocrdma_wqe_stats wqe_stats; + struct ocrdma_tx_stats tx_stats; + struct ocrdma_tx_qp_err_stats tx_qp_err_stats; + struct ocrdma_rx_stats rx_stats; + struct ocrdma_rx_qp_err_stats rx_qp_err_stats; + struct ocrdma_tx_dbg_stats tx_dbg_stats; + struct ocrdma_rx_dbg_stats rx_dbg_stats; +} __packed; + +enum { + OCRDMA_HBA_ATTRB_EPROM_VER_LO_MASK = 0xFF, + OCRDMA_HBA_ATTRB_EPROM_VER_HI_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_EPROM_VER_HI_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_CDBLEN_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_ASIC_REV_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_ASIC_REV_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_GUID0_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_GUID0_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_GUID13_MASK = 0xFF, + OCRDMA_HBA_ATTRB_GUID14_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_GUID14_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_GUID15_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_GUID15_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCNT_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_PCNT_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_LDTOUT_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_ISCSI_VER_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_ISCSI_VER_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_MFUNC_DEV_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_MFUNC_DEV_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_CV_MASK = 0xFF, + OCRDMA_HBA_ATTRB_HBA_ST_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_HBA_ST_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_MAX_DOMS_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_MAX_DOMS_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PTNUM_MASK = 0x3F000000, + OCRDMA_HBA_ATTRB_PTNUM_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_PT_MASK = 0xC0000000, + OCRDMA_HBA_ATTRB_PT_SHIFT = 0x1E, + OCRDMA_HBA_ATTRB_ISCSI_FET_MASK = 0xFF, + OCRDMA_HBA_ATTRB_ASIC_GEN_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_ASIC_GEN_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_PCI_VID_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_PCI_DID_MASK = 0xFFFF0000, + OCRDMA_HBA_ATTRB_PCI_DID_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCI_SVID_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_PCI_SSID_MASK = 0xFFFF0000, + 
OCRDMA_HBA_ATTRB_PCI_SSID_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCI_BUSNUM_MASK = 0xFF, + OCRDMA_HBA_ATTRB_PCI_DEVNUM_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_PCI_DEVNUM_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_PCI_FUNCNUM_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_PCI_FUNCNUM_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_IF_TYPE_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_IF_TYPE_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_NETFIL_MASK =0xFF +}; + +struct mgmt_hba_attribs { + u8 flashrom_version_string[32]; + u8 manufacturer_name[32]; + u32 supported_modes; + u32 rsvd_eprom_verhi_verlo; + u32 mbx_ds_ver; + u32 epfw_ds_ver; + u8 ncsi_ver_string[12]; + u32 default_extended_timeout; + u8 controller_model_number[32]; + u8 controller_description[64]; + u8 controller_serial_number[32]; + u8 ip_version_string[32]; + u8 firmware_version_string[32]; + u8 bios_version_string[32]; + u8 redboot_version_string[32]; + u8 driver_version_string[32]; + u8 fw_on_flash_version_string[32]; + u32 functionalities_supported; + u32 guid0_asicrev_cdblen; + u8 generational_guid[12]; + u32 portcnt_guid15; + u32 mfuncdev_iscsi_ldtout; + u32 ptpnum_maxdoms_hbast_cv; + u32 firmware_post_status; + u32 hba_mtu[8]; + u32 res_asicgen_iscsi_feaures; + u32 rsvd1[3]; +}; + +struct mgmt_controller_attrib { + struct mgmt_hba_attribs hba_attribs; + u32 pci_did_vid; + u32 pci_ssid_svid; + u32 ityp_fnum_devnum_bnum; + u32 uid_hi; + u32 uid_lo; + u32 res_nnetfil; + u32 rsvd0[4]; +}; + +struct ocrdma_get_ctrl_attribs_rsp { + struct ocrdma_mbx_hdr hdr; + struct mgmt_controller_attrib ctrl_attribs; +}; + +#define OCRDMA_SUBSYS_DCBX 0x10 + +enum OCRDMA_DCBX_OPCODE { + OCRDMA_CMD_GET_DCBX_CONFIG = 0x01 +}; + +enum OCRDMA_DCBX_PARAM_TYPE { + OCRDMA_PARAMETER_TYPE_ADMIN = 0x00, + OCRDMA_PARAMETER_TYPE_OPER = 0x01, + OCRDMA_PARAMETER_TYPE_PEER = 0x02 +}; + +enum OCRDMA_DCBX_PROTO { + OCRDMA_PROTO_SELECT_L2 = 0x00, + OCRDMA_PROTO_SELECT_L4 = 0x01 +}; + +enum OCRDMA_DCBX_APP_PARAM { + OCRDMA_APP_PARAM_APP_PROTO_MASK = 0xFFFF, + OCRDMA_APP_PARAM_PROTO_SEL_MASK = 0xFF, + OCRDMA_APP_PARAM_PROTO_SEL_SHIFT = 0x10, + OCRDMA_APP_PARAM_VALID_MASK = 0xFF, + OCRDMA_APP_PARAM_VALID_SHIFT = 0x18 +}; + +enum OCRDMA_DCBX_STATE_FLAGS { + OCRDMA_STATE_FLAG_ENABLED = 0x01, + OCRDMA_STATE_FLAG_ADDVERTISED = 0x02, + OCRDMA_STATE_FLAG_WILLING = 0x04, + OCRDMA_STATE_FLAG_SYNC = 0x08, + OCRDMA_STATE_FLAG_UNSUPPORTED = 0x40000000, + OCRDMA_STATE_FLAG_NEG_FAILD = 0x80000000 +}; + +enum OCRDMA_TCV_AEV_OPV_ST { + OCRDMA_DCBX_TC_SUPPORT_MASK = 0xFF, + OCRDMA_DCBX_TC_SUPPORT_SHIFT = 0x18, + OCRDMA_DCBX_APP_ENTRY_SHIFT = 0x10, + OCRDMA_DCBX_OP_PARAM_SHIFT = 0x08, + OCRDMA_DCBX_STATE_MASK = 0xFF +}; + +struct ocrdma_app_parameter { + u32 valid_proto_app; + u32 oui; + u32 app_prio[2]; +}; + +struct ocrdma_dcbx_cfg { + u32 tcv_aev_opv_st; + u32 tc_state; + u32 pfc_state; + u32 qcn_state; + u32 appl_state; + u32 ll_state; + u32 tc_bw[2]; + u32 tc_prio[8]; + u32 pfc_prio[2]; + struct ocrdma_app_parameter app_param[15]; +}; + +struct ocrdma_get_dcbx_cfg_req { + struct ocrdma_mbx_hdr hdr; + u32 param_type; +} __packed; + +struct ocrdma_get_dcbx_cfg_rsp { + struct ocrdma_mbx_rsp hdr; + struct ocrdma_dcbx_cfg cfg; +} __packed; + +#endif /* __OCRDMA_SLI_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c new file mode 100644 index 0000000..24d20a4 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -0,0 +1,862 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. 
All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include <linux/debugfs.h> +#include <linux/pci.h> +#include "ocrdma_stats.h" + +static struct dentry *ocrdma_dbgfs_dir; + +static int ocrdma_add_stat(char *start, char *pcur, + char *name, u64 count) +{ + char buff[128] = {0}; + int cpy_len = 0; + + snprintf(buff, 128, "%s: %llu\n", name, count); + cpy_len = strlen(buff); + + if (pcur + cpy_len > start + OCRDMA_MAX_DBGFS_MEM) { + pr_err("%s: No space in stats buff\n", __func__); + return 0; + } + + memcpy(pcur, buff, cpy_len); + return cpy_len; +} + +bool ocrdma_alloc_stats_resources(struct ocrdma_dev *dev) +{ + struct stats_mem *mem = &dev->stats_mem; + + mutex_init(&dev->stats_lock); + /* Alloc mbox command mem*/ + mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req), + sizeof(struct ocrdma_rdma_stats_resp)); + + mem->va = dma_zalloc_coherent(&dev->nic_info.pdev->dev, mem->size, + &mem->pa, GFP_KERNEL); + if (!mem->va) { + pr_err("%s: stats mbox allocation failed\n", __func__); + return false; + } + + /* Alloc debugfs mem */ + mem->debugfs_mem = kzalloc(OCRDMA_MAX_DBGFS_MEM, GFP_KERNEL); + if (!mem->debugfs_mem) + return false; + + return true; +} + +void ocrdma_release_stats_resources(struct ocrdma_dev *dev) +{ + struct stats_mem *mem = &dev->stats_mem; + + if (mem->va) + dma_free_coherent(&dev->nic_info.pdev->dev, mem->size, + mem->va, mem->pa); + mem->va = NULL; + kfree(mem->debugfs_mem); +} + +static char *ocrdma_resource_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats,
pcur, "active_dpp_pds", + (u64)rsrc_stats->dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "active_non_dpp_pds", + (u64)rsrc_stats->non_dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "active_rc_dpp_qps", + (u64)rsrc_stats->rc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_uc_dpp_qps", + (u64)rsrc_stats->uc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ud_dpp_qps", + (u64)rsrc_stats->ud_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_rc_non_dpp_qps", + (u64)rsrc_stats->rc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_uc_non_dpp_qps", + (u64)rsrc_stats->uc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ud_non_dpp_qps", + (u64)rsrc_stats->ud_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_srqs", + (u64)rsrc_stats->srqs); + pcur += ocrdma_add_stat(stats, pcur, "active_rbqs", + (u64)rsrc_stats->rbqs); + pcur += ocrdma_add_stat(stats, pcur, "active_64K_nsmr", + (u64)rsrc_stats->r64K_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_64K_to_2M_nsmr", + (u64)rsrc_stats->r64K_to_2M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_2M_to_44M_nsmr", + (u64)rsrc_stats->r2M_to_44M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_44M_to_1G_nsmr", + (u64)rsrc_stats->r44M_to_1G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_1G_to_4G_nsmr", + (u64)rsrc_stats->r1G_to_4G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_nsmr_count_4G_to_32G", + (u64)rsrc_stats->nsmr_count_4G_to_32G); + pcur += ocrdma_add_stat(stats, pcur, "active_32G_to_64G_nsmr", + (u64)rsrc_stats->r32G_to_64G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_64G_to_128G_nsmr", + (u64)rsrc_stats->r64G_to_128G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_128G_to_higher_nsmr", + (u64)rsrc_stats->r128G_to_higher_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_embedded_nsmr", + (u64)rsrc_stats->embedded_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "active_frmr", + (u64)rsrc_stats->frmr); + pcur += ocrdma_add_stat(stats, pcur, "active_prefetch_qps", + (u64)rsrc_stats->prefetch_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_ondemand_qps", + (u64)rsrc_stats->ondemand_qps); + pcur += ocrdma_add_stat(stats, pcur, "active_phy_mr", + (u64)rsrc_stats->phy_mr); + pcur += ocrdma_add_stat(stats, pcur, "active_mw", + (u64)rsrc_stats->mw); + + /* Print the threshold stats */ + rsrc_stats = &rdma_stats->th_rsrc_stats; + + pcur += ocrdma_add_stat(stats, pcur, "threshold_dpp_pds", + (u64)rsrc_stats->dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "threshold_non_dpp_pds", + (u64)rsrc_stats->non_dpp_pds); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_dpp_qps", + (u64)rsrc_stats->rc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_dpp_qps", + (u64)rsrc_stats->uc_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_dpp_qps", + (u64)rsrc_stats->ud_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_non_dpp_qps", + (u64)rsrc_stats->rc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_non_dpp_qps", + (u64)rsrc_stats->uc_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_non_dpp_qps", + (u64)rsrc_stats->ud_non_dpp_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_srqs", + (u64)rsrc_stats->srqs); + pcur += ocrdma_add_stat(stats, pcur, "threshold_rbqs", + (u64)rsrc_stats->rbqs); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_nsmr", + (u64)rsrc_stats->r64K_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_to_2M_nsmr", + 
(u64)rsrc_stats->r64K_to_2M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_2M_to_44M_nsmr", + (u64)rsrc_stats->r2M_to_44M_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_44M_to_1G_nsmr", + (u64)rsrc_stats->r44M_to_1G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_1G_to_4G_nsmr", + (u64)rsrc_stats->r1G_to_4G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_nsmr_count_4G_to_32G", + (u64)rsrc_stats->nsmr_count_4G_to_32G); + pcur += ocrdma_add_stat(stats, pcur, "threshold_32G_to_64G_nsmr", + (u64)rsrc_stats->r32G_to_64G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_64G_to_128G_nsmr", + (u64)rsrc_stats->r64G_to_128G_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_128G_to_higher_nsmr", + (u64)rsrc_stats->r128G_to_higher_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_embedded_nsmr", + (u64)rsrc_stats->embedded_nsmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_frmr", + (u64)rsrc_stats->frmr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_prefetch_qps", + (u64)rsrc_stats->prefetch_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_ondemand_qps", + (u64)rsrc_stats->ondemand_qps); + pcur += ocrdma_add_stat(stats, pcur, "threshold_phy_mr", + (u64)rsrc_stats->phy_mr); + pcur += ocrdma_add_stat(stats, pcur, "threshold_mw", + (u64)rsrc_stats->mw); + return stats; +} + +static char *ocrdma_rx_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat + (stats, pcur, "roce_frame_bytes", + convert_to_64bit(rx_stats->roce_frame_bytes_lo, + rx_stats->roce_frame_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_icrc_drops", + (u64)rx_stats->roce_frame_icrc_drops); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_payload_len_drops", + (u64)rx_stats->roce_frame_payload_len_drops); + pcur += ocrdma_add_stat(stats, pcur, "ud_drops", + (u64)rx_stats->ud_drops); + pcur += ocrdma_add_stat(stats, pcur, "qp1_drops", + (u64)rx_stats->qp1_drops); + pcur += ocrdma_add_stat(stats, pcur, "psn_error_request_packets", + (u64)rx_stats->psn_error_request_packets); + pcur += ocrdma_add_stat(stats, pcur, "psn_error_resp_packets", + (u64)rx_stats->psn_error_resp_packets); + pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_timeouts", + (u64)rx_stats->rnr_nak_timeouts); + pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_receives", + (u64)rx_stats->rnr_nak_receives); + pcur += ocrdma_add_stat(stats, pcur, "roce_frame_rxmt_drops", + (u64)rx_stats->roce_frame_rxmt_drops); + pcur += ocrdma_add_stat(stats, pcur, "nak_count_psn_sequence_errors", + (u64)rx_stats->nak_count_psn_sequence_errors); + pcur += ocrdma_add_stat(stats, pcur, "rc_drop_count_lookup_errors", + (u64)rx_stats->rc_drop_count_lookup_errors); + pcur += ocrdma_add_stat(stats, pcur, "rq_rnr_naks", + (u64)rx_stats->rq_rnr_naks); + pcur += ocrdma_add_stat(stats, pcur, "srq_rnr_naks", + (u64)rx_stats->srq_rnr_naks); + pcur += ocrdma_add_stat(stats, pcur, "roce_frames", + convert_to_64bit(rx_stats->roce_frames_lo, + rx_stats->roce_frames_hi)); + + return stats; +} + +static u64 ocrdma_sysfs_rcv_pkts(struct ocrdma_dev *dev) +{ + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats; + + return 
convert_to_64bit(rx_stats->roce_frames_lo, + rx_stats->roce_frames_hi) + (u64)rx_stats->roce_frame_icrc_drops + + (u64)rx_stats->roce_frame_payload_len_drops; +} + +static u64 ocrdma_sysfs_rcv_data(struct ocrdma_dev *dev) +{ + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats; + + return (convert_to_64bit(rx_stats->roce_frame_bytes_lo, + rx_stats->roce_frame_bytes_hi))/4; +} + +static char *ocrdma_tx_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "send_pkts", + convert_to_64bit(tx_stats->send_pkts_lo, + tx_stats->send_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "write_pkts", + convert_to_64bit(tx_stats->write_pkts_lo, + tx_stats->write_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_pkts", + convert_to_64bit(tx_stats->read_pkts_lo, + tx_stats->read_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_rsp_pkts", + convert_to_64bit(tx_stats->read_rsp_pkts_lo, + tx_stats->read_rsp_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "ack_pkts", + convert_to_64bit(tx_stats->ack_pkts_lo, + tx_stats->ack_pkts_hi)); + pcur += ocrdma_add_stat(stats, pcur, "send_bytes", + convert_to_64bit(tx_stats->send_bytes_lo, + tx_stats->send_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "write_bytes", + convert_to_64bit(tx_stats->write_bytes_lo, + tx_stats->write_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_req_bytes", + convert_to_64bit(tx_stats->read_req_bytes_lo, + tx_stats->read_req_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_rsp_bytes", + convert_to_64bit(tx_stats->read_rsp_bytes_lo, + tx_stats->read_rsp_bytes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "ack_timeouts", + (u64)tx_stats->ack_timeouts); + + return stats; +} + +static u64 ocrdma_sysfs_xmit_pkts(struct ocrdma_dev *dev) +{ + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats; + + return (convert_to_64bit(tx_stats->send_pkts_lo, + tx_stats->send_pkts_hi) + + convert_to_64bit(tx_stats->write_pkts_lo, tx_stats->write_pkts_hi) + + convert_to_64bit(tx_stats->read_pkts_lo, tx_stats->read_pkts_hi) + + convert_to_64bit(tx_stats->read_rsp_pkts_lo, + tx_stats->read_rsp_pkts_hi) + + convert_to_64bit(tx_stats->ack_pkts_lo, tx_stats->ack_pkts_hi)); +} + +static u64 ocrdma_sysfs_xmit_data(struct ocrdma_dev *dev) +{ + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats; + + return (convert_to_64bit(tx_stats->send_bytes_lo, + tx_stats->send_bytes_hi) + + convert_to_64bit(tx_stats->write_bytes_lo, + tx_stats->write_bytes_hi) + + convert_to_64bit(tx_stats->read_req_bytes_lo, + tx_stats->read_req_bytes_hi) + + convert_to_64bit(tx_stats->read_rsp_bytes_lo, + tx_stats->read_rsp_bytes_hi))/4; +} + +static char *ocrdma_wqe_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_wqe_stats *wqe_stats = &rdma_stats->wqe_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + 
pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "large_send_rc_wqes", + convert_to_64bit(wqe_stats->large_send_rc_wqes_lo, + wqe_stats->large_send_rc_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "large_write_rc_wqes", + convert_to_64bit(wqe_stats->large_write_rc_wqes_lo, + wqe_stats->large_write_rc_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "read_wqes", + convert_to_64bit(wqe_stats->read_wqes_lo, + wqe_stats->read_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "frmr_wqes", + convert_to_64bit(wqe_stats->frmr_wqes_lo, + wqe_stats->frmr_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "mw_bind_wqes", + convert_to_64bit(wqe_stats->mw_bind_wqes_lo, + wqe_stats->mw_bind_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "invalidate_wqes", + convert_to_64bit(wqe_stats->invalidate_wqes_lo, + wqe_stats->invalidate_wqes_hi)); + pcur += ocrdma_add_stat(stats, pcur, "dpp_wqe_drops", + (u64)wqe_stats->dpp_wqe_drops); + return stats; +} + +static char *ocrdma_db_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_db_err_stats *db_err_stats = &rdma_stats->db_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "sq_doorbell_errors", + (u64)db_err_stats->sq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "cq_doorbell_errors", + (u64)db_err_stats->cq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "rq_srq_doorbell_errors", + (u64)db_err_stats->rq_srq_doorbell_errors); + pcur += ocrdma_add_stat(stats, pcur, "cq_overflow_errors", + (u64)db_err_stats->cq_overflow_errors); + return stats; +} + +static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_qp_err_stats *rx_qp_err_stats = + &rdma_stats->rx_qp_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "nak_invalid_requst_errors", + (u64)rx_qp_err_stats->nak_invalid_requst_errors); + pcur += ocrdma_add_stat(stats, pcur, "nak_remote_operation_errors", + (u64)rx_qp_err_stats->nak_remote_operation_errors); + pcur += ocrdma_add_stat(stats, pcur, "nak_count_remote_access_errors", + (u64)rx_qp_err_stats->nak_count_remote_access_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", + (u64)rx_qp_err_stats->local_length_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", + (u64)rx_qp_err_stats->local_protection_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", + (u64)rx_qp_err_stats->local_qp_operation_errors); + return stats; +} + +static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_qp_err_stats *tx_qp_err_stats = + &rdma_stats->tx_qp_err_stats; + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "local_length_errors", + (u64)tx_qp_err_stats->local_length_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors", + (u64)tx_qp_err_stats->local_protection_errors); + pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors", + (u64)tx_qp_err_stats->local_qp_operation_errors); + 
pcur += ocrdma_add_stat(stats, pcur, "retry_count_exceeded_errors", + (u64)tx_qp_err_stats->retry_count_exceeded_errors); + pcur += ocrdma_add_stat(stats, pcur, "rnr_retry_count_exceeded_errors", + (u64)tx_qp_err_stats->rnr_retry_count_exceeded_errors); + return stats; +} + +static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev) +{ + int i; + char *pstats = dev->stats_mem.debugfs_mem; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_tx_dbg_stats *tx_dbg_stats = + &rdma_stats->tx_dbg_stats; + + memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + for (i = 0; i < 100; i++) + pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, + tx_dbg_stats->data[i]); + + return dev->stats_mem.debugfs_mem; +} + +static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev) +{ + int i; + char *pstats = dev->stats_mem.debugfs_mem; + struct ocrdma_rdma_stats_resp *rdma_stats = + (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; + struct ocrdma_rx_dbg_stats *rx_dbg_stats = + &rdma_stats->rx_dbg_stats; + + memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + for (i = 0; i < 200; i++) + pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i, + rx_dbg_stats->data[i]); + + return dev->stats_mem.debugfs_mem; +} + +static char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev) +{ + char *stats = dev->stats_mem.debugfs_mem, *pcur; + + + memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM)); + + pcur = stats; + pcur += ocrdma_add_stat(stats, pcur, "async_cq_err", + (u64)(dev->async_err_stats + [OCRDMA_CQ_ERROR].counter)); + pcur += ocrdma_add_stat(stats, pcur, "async_cq_overrun_err", + (u64)dev->async_err_stats + [OCRDMA_CQ_OVERRUN_ERROR].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_cq_qpcat_err", + (u64)dev->async_err_stats + [OCRDMA_CQ_QPCAT_ERROR].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_qp_access_err", + (u64)dev->async_err_stats + [OCRDMA_QP_ACCESS_ERROR].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_qp_commm_est_evt", + (u64)dev->async_err_stats + [OCRDMA_QP_COMM_EST_EVENT].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_sq_drained_evt", + (u64)dev->async_err_stats + [OCRDMA_SQ_DRAINED_EVENT].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_dev_fatal_evt", + (u64)dev->async_err_stats + [OCRDMA_DEVICE_FATAL_EVENT].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_srqcat_err", + (u64)dev->async_err_stats + [OCRDMA_SRQCAT_ERROR].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_srq_limit_evt", + (u64)dev->async_err_stats + [OCRDMA_SRQ_LIMIT_EVENT].counter); + pcur += ocrdma_add_stat(stats, pcur, "async_qp_last_wqe_evt", + (u64)dev->async_err_stats + [OCRDMA_QP_LAST_WQE_EVENT].counter); + + pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_len_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_LOC_LEN_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_qp_op_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_LOC_QP_OP_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_eec_op_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_LOC_EEC_OP_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_prot_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_LOC_PROT_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_wr_flush_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_WR_FLUSH_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_mw_bind_err", + (u64)dev->cqe_err_stats + [OCRDMA_CQE_MW_BIND_ERR].counter); + pcur += ocrdma_add_stat(stats, pcur, "cqe_bad_resp_err", + (u64)dev->cqe_err_stats + 
[OCRDMA_CQE_BAD_RESP_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_access_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_LOC_ACCESS_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_inv_req_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_REM_INV_REQ_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_access_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_REM_ACCESS_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_op_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_REM_OP_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_retry_exc_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_RETRY_EXC_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_rnr_retry_exc_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_RNR_RETRY_EXC_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_rdd_viol_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_LOC_RDD_VIOL_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_inv_rd_req_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_REM_INV_RD_REQ_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_abort_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_REM_ABORT_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_inv_eecn_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_INV_EECN_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_inv_eec_state_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_INV_EEC_STATE_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_fatal_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_FATAL_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_resp_timeout_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_RESP_TIMEOUT_ERR].counter);
+	pcur += ocrdma_add_stat(stats, pcur, "cqe_general_err",
+				(u64)dev->cqe_err_stats
+				[OCRDMA_CQE_GENERAL_ERR].counter);
+	return stats;
+}
+
+static void ocrdma_update_stats(struct ocrdma_dev *dev)
+{
+	ulong now = jiffies, secs;
+	int status;
+	struct ocrdma_rdma_stats_resp *rdma_stats =
+		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+	struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats;
+
+	secs = jiffies_to_msecs(now - dev->last_stats_time) / 1000U;
+	if (secs) {
+		/* update */
+		status = ocrdma_mbx_rdma_stats(dev, false);
+		if (status)
+			pr_err("%s: stats mbox failed with status = %d\n",
+			       __func__, status);
+		/* Update PD counters from PD resource manager */
+		if (dev->pd_mgr->pd_prealloc_valid) {
+			rsrc_stats->dpp_pds = dev->pd_mgr->pd_dpp_count;
+			rsrc_stats->non_dpp_pds = dev->pd_mgr->pd_norm_count;
+			/* Threshold stats */
+			rsrc_stats = &rdma_stats->th_rsrc_stats;
+			rsrc_stats->dpp_pds = dev->pd_mgr->pd_dpp_thrsh;
+			rsrc_stats->non_dpp_pds = dev->pd_mgr->pd_norm_thrsh;
+		}
+		dev->last_stats_time = jiffies;
+	}
+}
+
+static ssize_t ocrdma_dbgfs_ops_write(struct file *filp,
+				      const char __user *buffer,
+				      size_t count, loff_t *ppos)
+{
+	char tmp_str[32];
+	long reset;
+	int status;
+	struct ocrdma_stats *pstats = filp->private_data;
+	struct ocrdma_dev *dev = pstats->dev;
+
+	if (*ppos != 0 || count == 0 || count > sizeof(tmp_str))
+		goto err;
+
+	if (copy_from_user(tmp_str, buffer, count))
+		goto err;
+
+	tmp_str[count-1] = '\0';
+	if (kstrtol(tmp_str, 10, &reset))
+		goto err;
+
+	switch (pstats->type) {
+	case OCRDMA_RESET_STATS:
+		if (reset) {
+			status = ocrdma_mbx_rdma_stats(dev, true);
+			if (status) {
+				pr_err("Failed to reset stats = %d\n", status);
+				goto err;
+			}
+		}
+		break;
+	default:
+		goto err;
+	}
+
+	return count;
+err:
+	return -EFAULT;
+}
+
+int ocrdma_pma_counters(struct ocrdma_dev
*dev, + struct ib_mad *out_mad) +{ + struct ib_pma_portcounters *pma_cnt; + + memset(out_mad->data, 0, sizeof out_mad->data); + pma_cnt = (void *)(out_mad->data + 40); + ocrdma_update_stats(dev); + + pma_cnt->port_xmit_data = cpu_to_be32(ocrdma_sysfs_xmit_data(dev)); + pma_cnt->port_rcv_data = cpu_to_be32(ocrdma_sysfs_rcv_data(dev)); + pma_cnt->port_xmit_packets = cpu_to_be32(ocrdma_sysfs_xmit_pkts(dev)); + pma_cnt->port_rcv_packets = cpu_to_be32(ocrdma_sysfs_rcv_pkts(dev)); + return 0; +} + +static ssize_t ocrdma_dbgfs_ops_read(struct file *filp, char __user *buffer, + size_t usr_buf_len, loff_t *ppos) +{ + struct ocrdma_stats *pstats = filp->private_data; + struct ocrdma_dev *dev = pstats->dev; + ssize_t status = 0; + char *data = NULL; + + /* No partial reads */ + if (*ppos != 0) + return 0; + + mutex_lock(&dev->stats_lock); + + ocrdma_update_stats(dev); + + switch (pstats->type) { + case OCRDMA_RSRC_STATS: + data = ocrdma_resource_stats(dev); + break; + case OCRDMA_RXSTATS: + data = ocrdma_rx_stats(dev); + break; + case OCRDMA_WQESTATS: + data = ocrdma_wqe_stats(dev); + break; + case OCRDMA_TXSTATS: + data = ocrdma_tx_stats(dev); + break; + case OCRDMA_DB_ERRSTATS: + data = ocrdma_db_errstats(dev); + break; + case OCRDMA_RXQP_ERRSTATS: + data = ocrdma_rxqp_errstats(dev); + break; + case OCRDMA_TXQP_ERRSTATS: + data = ocrdma_txqp_errstats(dev); + break; + case OCRDMA_TX_DBG_STATS: + data = ocrdma_tx_dbg_stats(dev); + break; + case OCRDMA_RX_DBG_STATS: + data = ocrdma_rx_dbg_stats(dev); + break; + case OCRDMA_DRV_STATS: + data = ocrdma_driver_dbg_stats(dev); + break; + + default: + status = -EFAULT; + goto exit; + } + + if (usr_buf_len < strlen(data)) { + status = -ENOSPC; + goto exit; + } + + status = simple_read_from_buffer(buffer, usr_buf_len, ppos, data, + strlen(data)); +exit: + mutex_unlock(&dev->stats_lock); + return status; +} + +static const struct file_operations ocrdma_dbg_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = ocrdma_dbgfs_ops_read, + .write = ocrdma_dbgfs_ops_write, +}; + +void ocrdma_add_port_stats(struct ocrdma_dev *dev) +{ + if (!ocrdma_dbgfs_dir) + return; + + /* Create post stats base dir */ + dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir); + if (!dev->dir) + goto err; + + dev->rsrc_stats.type = OCRDMA_RSRC_STATS; + dev->rsrc_stats.dev = dev; + if (!debugfs_create_file("resource_stats", S_IRUSR, dev->dir, + &dev->rsrc_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_stats.type = OCRDMA_RXSTATS; + dev->rx_stats.dev = dev; + if (!debugfs_create_file("rx_stats", S_IRUSR, dev->dir, + &dev->rx_stats, &ocrdma_dbg_ops)) + goto err; + + dev->wqe_stats.type = OCRDMA_WQESTATS; + dev->wqe_stats.dev = dev; + if (!debugfs_create_file("wqe_stats", S_IRUSR, dev->dir, + &dev->wqe_stats, &ocrdma_dbg_ops)) + goto err; + + dev->tx_stats.type = OCRDMA_TXSTATS; + dev->tx_stats.dev = dev; + if (!debugfs_create_file("tx_stats", S_IRUSR, dev->dir, + &dev->tx_stats, &ocrdma_dbg_ops)) + goto err; + + dev->db_err_stats.type = OCRDMA_DB_ERRSTATS; + dev->db_err_stats.dev = dev; + if (!debugfs_create_file("db_err_stats", S_IRUSR, dev->dir, + &dev->db_err_stats, &ocrdma_dbg_ops)) + goto err; + + + dev->tx_qp_err_stats.type = OCRDMA_TXQP_ERRSTATS; + dev->tx_qp_err_stats.dev = dev; + if (!debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir, + &dev->tx_qp_err_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_qp_err_stats.type = OCRDMA_RXQP_ERRSTATS; + dev->rx_qp_err_stats.dev = dev; + if (!debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir, + 
&dev->rx_qp_err_stats, &ocrdma_dbg_ops)) + goto err; + + + dev->tx_dbg_stats.type = OCRDMA_TX_DBG_STATS; + dev->tx_dbg_stats.dev = dev; + if (!debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir, + &dev->tx_dbg_stats, &ocrdma_dbg_ops)) + goto err; + + dev->rx_dbg_stats.type = OCRDMA_RX_DBG_STATS; + dev->rx_dbg_stats.dev = dev; + if (!debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir, + &dev->rx_dbg_stats, &ocrdma_dbg_ops)) + goto err; + + dev->driver_stats.type = OCRDMA_DRV_STATS; + dev->driver_stats.dev = dev; + if (!debugfs_create_file("driver_dbg_stats", S_IRUSR, dev->dir, + &dev->driver_stats, &ocrdma_dbg_ops)) + goto err; + + dev->reset_stats.type = OCRDMA_RESET_STATS; + dev->reset_stats.dev = dev; + if (!debugfs_create_file("reset_stats", 0200, dev->dir, + &dev->reset_stats, &ocrdma_dbg_ops)) + goto err; + + + return; +err: + debugfs_remove_recursive(dev->dir); + dev->dir = NULL; +} + +void ocrdma_rem_port_stats(struct ocrdma_dev *dev) +{ + if (!dev->dir) + return; + debugfs_remove_recursive(dev->dir); +} + +void ocrdma_init_debugfs(void) +{ + /* Create base dir in debugfs root dir */ + ocrdma_dbgfs_dir = debugfs_create_dir("ocrdma", NULL); +} + +void ocrdma_rem_debugfs(void) +{ + debugfs_remove_recursive(ocrdma_dbgfs_dir); +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.h b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h new file mode 100644 index 0000000..bba1fec --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h @@ -0,0 +1,75 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef __OCRDMA_STATS_H__ +#define __OCRDMA_STATS_H__ + +#include +#include "ocrdma.h" +#include "ocrdma_hw.h" + +#define OCRDMA_MAX_DBGFS_MEM 4096 + +enum OCRDMA_STATS_TYPE { + OCRDMA_RSRC_STATS, + OCRDMA_RXSTATS, + OCRDMA_WQESTATS, + OCRDMA_TXSTATS, + OCRDMA_DB_ERRSTATS, + OCRDMA_RXQP_ERRSTATS, + OCRDMA_TXQP_ERRSTATS, + OCRDMA_TX_DBG_STATS, + OCRDMA_RX_DBG_STATS, + OCRDMA_DRV_STATS, + OCRDMA_RESET_STATS +}; + +void ocrdma_rem_debugfs(void); +void ocrdma_init_debugfs(void); +bool ocrdma_alloc_stats_resources(struct ocrdma_dev *dev); +void ocrdma_release_stats_resources(struct ocrdma_dev *dev); +void ocrdma_rem_port_stats(struct ocrdma_dev *dev); +void ocrdma_add_port_stats(struct ocrdma_dev *dev); +int ocrdma_pma_counters(struct ocrdma_dev *dev, + struct ib_mad *out_mad); + +#endif /* __OCRDMA_STATS_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c new file mode 100644 index 0000000..784ed6b --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -0,0 +1,3052 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ocrdma.h" +#include "ocrdma_hw.h" +#include "ocrdma_verbs.h" +#include + +int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + if (index > 1) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, + struct ib_udata *uhw) +{ + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + memset(attr, 0, sizeof *attr); + memcpy(&attr->fw_ver, &dev->attr.fw_ver[0], + min(sizeof(dev->attr.fw_ver), sizeof(attr->fw_ver))); + ocrdma_get_guid(dev, (u8 *)&attr->sys_image_guid); + attr->max_mr_size = dev->attr.max_mr_size; + attr->page_size_cap = 0xffff000; + attr->vendor_id = dev->nic_info.pdev->vendor; + attr->vendor_part_id = dev->nic_info.pdev->device; + attr->hw_ver = dev->asic_id; + attr->max_qp = dev->attr.max_qp; + attr->max_ah = OCRDMA_MAX_AH; + attr->max_qp_wr = dev->attr.max_wqe; + + attr->device_cap_flags = IB_DEVICE_CURR_QP_STATE_MOD | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_MGT_EXTENSIONS; + attr->max_sge = min(dev->attr.max_send_sge, dev->attr.max_recv_sge); + attr->max_sge_rd = dev->attr.max_rdma_sge; + attr->max_cq = dev->attr.max_cq; + attr->max_cqe = dev->attr.max_cqe; + attr->max_mr = dev->attr.max_mr; + attr->max_mw = dev->attr.max_mw; + attr->max_pd = dev->attr.max_pd; + attr->atomic_cap = 0; + attr->max_fmr = 0; + attr->max_map_per_fmr = 0; + attr->max_qp_rd_atom = + min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp); + attr->max_qp_init_rd_atom = dev->attr.max_ord_per_qp; + attr->max_srq = dev->attr.max_srq; + attr->max_srq_sge = dev->attr.max_srq_sge; + attr->max_srq_wr = dev->attr.max_rqe; + attr->local_ca_ack_delay = dev->attr.local_ca_ack_delay; + attr->max_fast_reg_page_list_len = dev->attr.max_pages_per_frmr; + attr->max_pkeys = 1; + return 0; +} + +struct net_device *ocrdma_get_netdev(struct ib_device *ibdev, u8 port_num) +{ + struct ocrdma_dev *dev; + struct net_device *ndev = NULL; + + rcu_read_lock(); + + dev = get_ocrdma_dev(ibdev); + if (dev) + ndev = dev->nic_info.netdev; + if (ndev) + dev_hold(ndev); + + rcu_read_unlock(); + + return ndev; +} + +static inline void get_link_speed_and_width(struct ocrdma_dev *dev, + u8 *ib_speed, u8 *ib_width) +{ + int status; + u8 speed; + + status = ocrdma_mbx_get_link_speed(dev, &speed, NULL); + if (status) + speed = OCRDMA_PHYS_LINK_SPEED_ZERO; + + switch (speed) { + case OCRDMA_PHYS_LINK_SPEED_1GBPS: + *ib_speed = IB_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + break; + + case OCRDMA_PHYS_LINK_SPEED_10GBPS: + *ib_speed = IB_SPEED_QDR; + *ib_width = IB_WIDTH_1X; + break; + + case OCRDMA_PHYS_LINK_SPEED_20GBPS: + *ib_speed = IB_SPEED_DDR; + *ib_width = IB_WIDTH_4X; + break; + + case OCRDMA_PHYS_LINK_SPEED_40GBPS: + *ib_speed = IB_SPEED_QDR; + *ib_width = IB_WIDTH_4X; + break; + + default: + /* Unsupported */ + *ib_speed = IB_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + } +} + +int ocrdma_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + enum ib_port_state port_state; + struct ocrdma_dev *dev; + struct net_device *netdev; + + /* props being zeroed by the caller, avoid zeroing it here */ + dev = get_ocrdma_dev(ibdev); + if (port > 1) { + pr_err("%s(%d) 
invalid_port=0x%x\n", __func__, + dev->id, port); + return -EINVAL; + } + netdev = dev->nic_info.netdev; + if (netif_running(netdev) && netif_oper_up(netdev)) { + port_state = IB_PORT_ACTIVE; + props->phys_state = 5; + } else { + port_state = IB_PORT_DOWN; + props->phys_state = 3; + } + props->max_mtu = IB_MTU_4096; + props->active_mtu = iboe_get_mtu(netdev->mtu); + props->lid = 0; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = port_state; + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | + IB_PORT_IP_BASED_GIDS; + props->gid_tbl_len = OCRDMA_MAX_SGID; + props->pkey_tbl_len = 1; + props->bad_pkey_cntr = 0; + props->qkey_viol_cntr = 0; + get_link_speed_and_width(dev, &props->active_speed, + &props->active_width); + props->max_msg_sz = 0x80000000; + props->max_vl_num = 4; + return 0; +} + +int ocrdma_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct ocrdma_dev *dev; + + dev = get_ocrdma_dev(ibdev); + if (port > 1) { + pr_err("%s(%d) invalid_port=0x%x\n", __func__, dev->id, port); + return -EINVAL; + } + return 0; +} + +static int ocrdma_add_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + struct ocrdma_mm *mm; + + mm = kzalloc(sizeof(*mm), GFP_KERNEL); + if (mm == NULL) + return -ENOMEM; + mm->key.phy_addr = phy_addr; + mm->key.len = len; + INIT_LIST_HEAD(&mm->entry); + + mutex_lock(&uctx->mm_list_lock); + list_add_tail(&mm->entry, &uctx->mm_head); + mutex_unlock(&uctx->mm_list_lock); + return 0; +} + +static void ocrdma_del_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + struct ocrdma_mm *mm, *tmp; + + mutex_lock(&uctx->mm_list_lock); + list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { + if (len != mm->key.len && phy_addr != mm->key.phy_addr) + continue; + + list_del(&mm->entry); + kfree(mm); + break; + } + mutex_unlock(&uctx->mm_list_lock); +} + +static bool ocrdma_search_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + bool found = false; + struct ocrdma_mm *mm; + + mutex_lock(&uctx->mm_list_lock); + list_for_each_entry(mm, &uctx->mm_head, entry) { + if (len != mm->key.len && phy_addr != mm->key.phy_addr) + continue; + + found = true; + break; + } + mutex_unlock(&uctx->mm_list_lock); + return found; +} + + +static u16 _ocrdma_pd_mgr_get_bitmap(struct ocrdma_dev *dev, bool dpp_pool) +{ + u16 pd_bitmap_idx = 0; + const unsigned long *pd_bitmap; + + if (dpp_pool) { + pd_bitmap = dev->pd_mgr->pd_dpp_bitmap; + pd_bitmap_idx = find_first_zero_bit(pd_bitmap, + dev->pd_mgr->max_dpp_pd); + __set_bit(pd_bitmap_idx, dev->pd_mgr->pd_dpp_bitmap); + dev->pd_mgr->pd_dpp_count++; + if (dev->pd_mgr->pd_dpp_count > dev->pd_mgr->pd_dpp_thrsh) + dev->pd_mgr->pd_dpp_thrsh = dev->pd_mgr->pd_dpp_count; + } else { + pd_bitmap = dev->pd_mgr->pd_norm_bitmap; + pd_bitmap_idx = find_first_zero_bit(pd_bitmap, + dev->pd_mgr->max_normal_pd); + __set_bit(pd_bitmap_idx, dev->pd_mgr->pd_norm_bitmap); + dev->pd_mgr->pd_norm_count++; + if (dev->pd_mgr->pd_norm_count > dev->pd_mgr->pd_norm_thrsh) + dev->pd_mgr->pd_norm_thrsh = dev->pd_mgr->pd_norm_count; + } + return pd_bitmap_idx; +} + +static int _ocrdma_pd_mgr_put_bitmap(struct ocrdma_dev *dev, u16 pd_id, + bool dpp_pool) +{ + u16 pd_count; + u16 pd_bit_index; + + pd_count = dpp_pool ? 
dev->pd_mgr->pd_dpp_count : + dev->pd_mgr->pd_norm_count; + if (pd_count == 0) + return -EINVAL; + + if (dpp_pool) { + pd_bit_index = pd_id - dev->pd_mgr->pd_dpp_start; + if (pd_bit_index >= dev->pd_mgr->max_dpp_pd) { + return -EINVAL; + } else { + __clear_bit(pd_bit_index, dev->pd_mgr->pd_dpp_bitmap); + dev->pd_mgr->pd_dpp_count--; + } + } else { + pd_bit_index = pd_id - dev->pd_mgr->pd_norm_start; + if (pd_bit_index >= dev->pd_mgr->max_normal_pd) { + return -EINVAL; + } else { + __clear_bit(pd_bit_index, dev->pd_mgr->pd_norm_bitmap); + dev->pd_mgr->pd_norm_count--; + } + } + + return 0; +} + +static int ocrdma_put_pd_num(struct ocrdma_dev *dev, u16 pd_id, + bool dpp_pool) +{ + int status; + + mutex_lock(&dev->dev_lock); + status = _ocrdma_pd_mgr_put_bitmap(dev, pd_id, dpp_pool); + mutex_unlock(&dev->dev_lock); + return status; +} + +static int ocrdma_get_pd_num(struct ocrdma_dev *dev, struct ocrdma_pd *pd) +{ + u16 pd_idx = 0; + int status = 0; + + mutex_lock(&dev->dev_lock); + if (pd->dpp_enabled) { + /* try allocating DPP PD, if not available then normal PD */ + if (dev->pd_mgr->pd_dpp_count < dev->pd_mgr->max_dpp_pd) { + pd_idx = _ocrdma_pd_mgr_get_bitmap(dev, true); + pd->id = dev->pd_mgr->pd_dpp_start + pd_idx; + pd->dpp_page = dev->pd_mgr->dpp_page_index + pd_idx; + } else if (dev->pd_mgr->pd_norm_count < + dev->pd_mgr->max_normal_pd) { + pd_idx = _ocrdma_pd_mgr_get_bitmap(dev, false); + pd->id = dev->pd_mgr->pd_norm_start + pd_idx; + pd->dpp_enabled = false; + } else { + status = -EINVAL; + } + } else { + if (dev->pd_mgr->pd_norm_count < dev->pd_mgr->max_normal_pd) { + pd_idx = _ocrdma_pd_mgr_get_bitmap(dev, false); + pd->id = dev->pd_mgr->pd_norm_start + pd_idx; + } else { + status = -EINVAL; + } + } + mutex_unlock(&dev->dev_lock); + return status; +} + +static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, + struct ocrdma_ucontext *uctx, + struct ib_udata *udata) +{ + struct ocrdma_pd *pd = NULL; + int status; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + if (udata && uctx && dev->attr.max_dpp_pds) { + pd->dpp_enabled = + ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R; + pd->num_dpp_qp = + pd->dpp_enabled ? 
(dev->nic_info.db_page_size / + dev->attr.wqe_size) : 0; + } + + if (dev->pd_mgr->pd_prealloc_valid) { + status = ocrdma_get_pd_num(dev, pd); + if (status == 0) { + return pd; + } else { + kfree(pd); + return ERR_PTR(status); + } + } + +retry: + status = ocrdma_mbx_alloc_pd(dev, pd); + if (status) { + if (pd->dpp_enabled) { + pd->dpp_enabled = false; + pd->num_dpp_qp = 0; + goto retry; + } else { + kfree(pd); + return ERR_PTR(status); + } + } + + return pd; +} + +static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, + struct ocrdma_pd *pd) +{ + return (uctx->cntxt_pd == pd); +} + +static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev, + struct ocrdma_pd *pd) +{ + int status; + + if (dev->pd_mgr->pd_prealloc_valid) + status = ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled); + else + status = ocrdma_mbx_dealloc_pd(dev, pd); + + kfree(pd); + return status; +} + +static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev, + struct ocrdma_ucontext *uctx, + struct ib_udata *udata) +{ + int status = 0; + + uctx->cntxt_pd = _ocrdma_alloc_pd(dev, uctx, udata); + if (IS_ERR(uctx->cntxt_pd)) { + status = PTR_ERR(uctx->cntxt_pd); + uctx->cntxt_pd = NULL; + goto err; + } + + uctx->cntxt_pd->uctx = uctx; + uctx->cntxt_pd->ibpd.device = &dev->ibdev; +err: + return status; +} + +static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + struct ocrdma_pd *pd = uctx->cntxt_pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + + if (uctx->pd_in_use) { + pr_err("%s(%d) Freeing in use pdid=0x%x.\n", + __func__, dev->id, pd->id); + } + uctx->cntxt_pd = NULL; + (void)_ocrdma_dealloc_pd(dev, pd); + return 0; +} + +static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + struct ocrdma_pd *pd = NULL; + + mutex_lock(&uctx->mm_list_lock); + if (!uctx->pd_in_use) { + uctx->pd_in_use = true; + pd = uctx->cntxt_pd; + } + mutex_unlock(&uctx->mm_list_lock); + + return pd; +} + +static void ocrdma_release_ucontext_pd(struct ocrdma_ucontext *uctx) +{ + mutex_lock(&uctx->mm_list_lock); + uctx->pd_in_use = false; + mutex_unlock(&uctx->mm_list_lock); +} + +struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + int status; + struct ocrdma_ucontext *ctx; + struct ocrdma_alloc_ucontext_resp resp; + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + struct pci_dev *pdev = dev->nic_info.pdev; + u32 map_len = roundup(sizeof(u32) * 2048, PAGE_SIZE); + + if (!udata) + return ERR_PTR(-EFAULT); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&ctx->mm_head); + mutex_init(&ctx->mm_list_lock); + + ctx->ah_tbl.va = dma_zalloc_coherent(&pdev->dev, map_len, + &ctx->ah_tbl.pa, GFP_KERNEL); + if (!ctx->ah_tbl.va) { + kfree(ctx); + return ERR_PTR(-ENOMEM); + } + ctx->ah_tbl.len = map_len; + + memset(&resp, 0, sizeof(resp)); + resp.ah_tbl_len = ctx->ah_tbl.len; + resp.ah_tbl_page = virt_to_phys(ctx->ah_tbl.va); + + status = ocrdma_add_mmap(ctx, resp.ah_tbl_page, resp.ah_tbl_len); + if (status) + goto map_err; + + status = ocrdma_alloc_ucontext_pd(dev, ctx, udata); + if (status) + goto pd_err; + + resp.dev_id = dev->id; + resp.max_inline_data = dev->attr.max_inline_data; + resp.wqe_size = dev->attr.wqe_size; + resp.rqe_size = dev->attr.rqe_size; + resp.dpp_wqe_size = dev->attr.wqe_size; + + memcpy(resp.fw_ver, dev->attr.fw_ver, sizeof(resp.fw_ver)); + status = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (status) + goto cpy_err; + return &ctx->ibucontext; + +cpy_err: +pd_err: + 
ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len); +map_err: + dma_free_coherent(&pdev->dev, ctx->ah_tbl.len, ctx->ah_tbl.va, + ctx->ah_tbl.pa); + kfree(ctx); + return ERR_PTR(status); +} + +int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + int status; + struct ocrdma_mm *mm, *tmp; + struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx); + struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device); + struct pci_dev *pdev = dev->nic_info.pdev; + + status = ocrdma_dealloc_ucontext_pd(uctx); + + ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len); + dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va, + uctx->ah_tbl.pa); + + list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { + list_del(&mm->entry); + kfree(mm); + } + kfree(uctx); + return status; +} + +int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct ocrdma_ucontext *ucontext = get_ocrdma_ucontext(context); + struct ocrdma_dev *dev = get_ocrdma_dev(context->device); + unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT; + u64 unmapped_db = (u64) dev->nic_info.unmapped_db; + unsigned long len = (vma->vm_end - vma->vm_start); + int status; + bool found; + + if (vma->vm_start & (PAGE_SIZE - 1)) + return -EINVAL; + found = ocrdma_search_mmap(ucontext, vma->vm_pgoff << PAGE_SHIFT, len); + if (!found) + return -EINVAL; + + if ((vm_page >= unmapped_db) && (vm_page <= (unmapped_db + + dev->nic_info.db_total_size)) && + (len <= dev->nic_info.db_page_size)) { + if (vma->vm_flags & VM_READ) + return -EPERM; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + len, vma->vm_page_prot); + } else if (dev->nic_info.dpp_unmapped_len && + (vm_page >= (u64) dev->nic_info.dpp_unmapped_addr) && + (vm_page <= (u64) (dev->nic_info.dpp_unmapped_addr + + dev->nic_info.dpp_unmapped_len)) && + (len <= dev->nic_info.dpp_unmapped_len)) { + if (vma->vm_flags & VM_READ) + return -EPERM; + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + len, vma->vm_page_prot); + } else { + status = remap_pfn_range(vma, vma->vm_start, + vma->vm_pgoff, len, vma->vm_page_prot); + } + return status; +} + +static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd, + struct ib_ucontext *ib_ctx, + struct ib_udata *udata) +{ + int status; + u64 db_page_addr; + u64 dpp_page_addr = 0; + u32 db_page_size; + struct ocrdma_alloc_pd_uresp rsp; + struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx); + + memset(&rsp, 0, sizeof(rsp)); + rsp.id = pd->id; + rsp.dpp_enabled = pd->dpp_enabled; + db_page_addr = ocrdma_get_db_addr(dev, pd->id); + db_page_size = dev->nic_info.db_page_size; + + status = ocrdma_add_mmap(uctx, db_page_addr, db_page_size); + if (status) + return status; + + if (pd->dpp_enabled) { + dpp_page_addr = dev->nic_info.dpp_unmapped_addr + + (pd->id * PAGE_SIZE); + status = ocrdma_add_mmap(uctx, dpp_page_addr, + PAGE_SIZE); + if (status) + goto dpp_map_err; + rsp.dpp_page_addr_hi = upper_32_bits(dpp_page_addr); + rsp.dpp_page_addr_lo = dpp_page_addr; + } + + status = ib_copy_to_udata(udata, &rsp, sizeof(rsp)); + if (status) + goto ucopy_err; + + pd->uctx = uctx; + return 0; + +ucopy_err: + if (pd->dpp_enabled) + ocrdma_del_mmap(pd->uctx, dpp_page_addr, PAGE_SIZE); +dpp_map_err: + ocrdma_del_mmap(pd->uctx, db_page_addr, db_page_size); + return status; +} + +struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, 
+ struct ib_udata *udata) +{ + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + struct ocrdma_pd *pd; + struct ocrdma_ucontext *uctx = NULL; + int status; + u8 is_uctx_pd = false; + + if (udata && context) { + uctx = get_ocrdma_ucontext(context); + pd = ocrdma_get_ucontext_pd(uctx); + if (pd) { + is_uctx_pd = true; + goto pd_mapping; + } + } + + pd = _ocrdma_alloc_pd(dev, uctx, udata); + if (IS_ERR(pd)) { + status = PTR_ERR(pd); + goto exit; + } + +pd_mapping: + if (udata && context) { + status = ocrdma_copy_pd_uresp(dev, pd, context, udata); + if (status) + goto err; + } + return &pd->ibpd; + +err: + if (is_uctx_pd) { + ocrdma_release_ucontext_pd(uctx); + } else { + if (_ocrdma_dealloc_pd(dev, pd)) + pr_err("%s: _ocrdma_dealloc_pd() failed\n", __func__); + } +exit: + return ERR_PTR(status); +} + +int ocrdma_dealloc_pd(struct ib_pd *ibpd) +{ + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_ucontext *uctx = NULL; + int status = 0; + u64 usr_db; + + uctx = pd->uctx; + if (uctx) { + u64 dpp_db = dev->nic_info.dpp_unmapped_addr + + (pd->id * PAGE_SIZE); + if (pd->dpp_enabled) + ocrdma_del_mmap(pd->uctx, dpp_db, PAGE_SIZE); + usr_db = ocrdma_get_db_addr(dev, pd->id); + ocrdma_del_mmap(pd->uctx, usr_db, dev->nic_info.db_page_size); + + if (is_ucontext_pd(uctx, pd)) { + ocrdma_release_ucontext_pd(uctx); + return status; + } + } + status = _ocrdma_dealloc_pd(dev, pd); + return status; +} + +static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 pdid, int acc, u32 num_pbls, u32 addr_check) +{ + int status; + + mr->hwmr.fr_mr = 0; + mr->hwmr.local_rd = 1; + mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0; + mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 
1 : 0; + mr->hwmr.num_pbls = num_pbls; + + status = ocrdma_mbx_alloc_lkey(dev, &mr->hwmr, pdid, addr_check); + if (status) + return status; + + mr->ibmr.lkey = mr->hwmr.lkey; + if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) + mr->ibmr.rkey = mr->hwmr.lkey; + return 0; +} + +struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *ibpd, int acc) +{ + int status; + struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + + if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) { + pr_err("%s err, invalid access rights\n", __func__); + return ERR_PTR(-EINVAL); + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + status = ocrdma_alloc_lkey(dev, mr, pd->id, acc, 0, + OCRDMA_ADDR_CHECK_DISABLE); + if (status) { + kfree(mr); + return ERR_PTR(status); + } + + return &mr->ibmr; +} + +static void ocrdma_free_mr_pbl_tbl(struct ocrdma_dev *dev, + struct ocrdma_hw_mr *mr) +{ + struct pci_dev *pdev = dev->nic_info.pdev; + int i = 0; + + if (mr->pbl_table) { + for (i = 0; i < mr->num_pbls; i++) { + if (!mr->pbl_table[i].va) + continue; + dma_free_coherent(&pdev->dev, mr->pbl_size, + mr->pbl_table[i].va, + mr->pbl_table[i].pa); + } + kfree(mr->pbl_table); + mr->pbl_table = NULL; + } +} + +static int ocrdma_get_pbl_info(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 num_pbes) +{ + u32 num_pbls = 0; + u32 idx = 0; + int status = 0; + u32 pbl_size; + + do { + pbl_size = OCRDMA_MIN_HPAGE_SIZE * (1 << idx); + if (pbl_size > MAX_OCRDMA_PBL_SIZE) { + status = -EFAULT; + break; + } + num_pbls = roundup(num_pbes, (pbl_size / sizeof(u64))); + num_pbls = num_pbls / (pbl_size / sizeof(u64)); + idx++; + } while (num_pbls >= dev->attr.max_num_mr_pbl); + + mr->hwmr.num_pbes = num_pbes; + mr->hwmr.num_pbls = num_pbls; + mr->hwmr.pbl_size = pbl_size; + return status; +} + +static int ocrdma_build_pbl_tbl(struct ocrdma_dev *dev, struct ocrdma_hw_mr *mr) +{ + int status = 0; + int i; + u32 dma_len = mr->pbl_size; + struct pci_dev *pdev = dev->nic_info.pdev; + void *va; + dma_addr_t pa; + + mr->pbl_table = kzalloc(sizeof(struct ocrdma_pbl) * + mr->num_pbls, GFP_KERNEL); + + if (!mr->pbl_table) + return -ENOMEM; + + for (i = 0; i < mr->num_pbls; i++) { + va = dma_zalloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL); + if (!va) { + ocrdma_free_mr_pbl_tbl(dev, mr); + status = -ENOMEM; + break; + } + mr->pbl_table[i].va = va; + mr->pbl_table[i].pa = pa; + } + return status; +} + +static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr, + u32 num_pbes) +{ + struct ocrdma_pbe *pbe; + struct scatterlist *sg; + struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table; + struct ib_umem *umem = mr->umem; + int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0; + + if (!mr->hwmr.num_pbes) + return; + + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + pbe_cnt = 0; + + shift = umem->page_shift; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> shift; + for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { + /* store the page address in pbe */ + pbe->pa_lo = + cpu_to_le32(sg_dma_address(sg) + + (pg_cnt << shift)); + pbe->pa_hi = + cpu_to_le32(upper_32_bits(sg_dma_address(sg) + + (pg_cnt << shift))); + pbe_cnt += 1; + total_num_pbes += 1; + pbe++; + + /* if done building pbes, issue the mbx cmd. */ + if (total_num_pbes == num_pbes) + return; + + /* if the given pbl is full storing the pbes, + * move to next pbl. 
+ */ + if (pbe_cnt == + (mr->hwmr.pbl_size / sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + pbe_cnt = 0; + } + + } + } +} + +struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, + u64 usr_addr, int acc, struct ib_udata *udata) +{ + int status = -ENOMEM; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_mr *mr; + struct ocrdma_pd *pd; + u32 num_pbes; + + pd = get_ocrdma_pd(ibpd); + + if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(status); + mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0); + if (IS_ERR(mr->umem)) { + status = -EFAULT; + goto umem_err; + } + num_pbes = ib_umem_page_count(mr->umem); + status = ocrdma_get_pbl_info(dev, mr, num_pbes); + if (status) + goto umem_err; + + mr->hwmr.pbe_size = BIT(mr->umem->page_shift); + mr->hwmr.fbo = ib_umem_offset(mr->umem); + mr->hwmr.va = usr_addr; + mr->hwmr.len = len; + mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hwmr.local_rd = 1; + mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); + if (status) + goto umem_err; + build_user_pbes(dev, mr, num_pbes); + status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc); + if (status) + goto mbx_err; + mr->ibmr.lkey = mr->hwmr.lkey; + if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) + mr->ibmr.rkey = mr->hwmr.lkey; + + return &mr->ibmr; + +mbx_err: + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +umem_err: + kfree(mr); + return ERR_PTR(status); +} + +int ocrdma_dereg_mr(struct ib_mr *ib_mr) +{ + struct ocrdma_mr *mr = get_ocrdma_mr(ib_mr); + struct ocrdma_dev *dev = get_ocrdma_dev(ib_mr->device); + + (void) ocrdma_mbx_dealloc_lkey(dev, mr->hwmr.fr_mr, mr->hwmr.lkey); + + kfree(mr->pages); + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); + + /* it could be user registered memory. */ + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + + /* Don't stop cleanup, in case FW is unresponsive */ + if (dev->mqe_ctx.fw_error_state) { + pr_err("%s(%d) fw not responding.\n", + __func__, dev->id); + } + return 0; +} + +static int ocrdma_copy_cq_uresp(struct ocrdma_dev *dev, struct ocrdma_cq *cq, + struct ib_udata *udata, + struct ib_ucontext *ib_ctx) +{ + int status; + struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx); + struct ocrdma_create_cq_uresp uresp; + + memset(&uresp, 0, sizeof(uresp)); + uresp.cq_id = cq->id; + uresp.page_size = PAGE_ALIGN(cq->len); + uresp.num_pages = 1; + uresp.max_hw_cqe = cq->max_hw_cqe; + uresp.page_addr[0] = virt_to_phys(cq->va); + uresp.db_page_addr = ocrdma_get_db_addr(dev, uctx->cntxt_pd->id); + uresp.db_page_size = dev->nic_info.db_page_size; + uresp.phase_change = cq->phase_change ? 
1 : 0; + status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (status) { + pr_err("%s(%d) copy error cqid=0x%x.\n", + __func__, dev->id, cq->id); + goto err; + } + status = ocrdma_add_mmap(uctx, uresp.db_page_addr, uresp.db_page_size); + if (status) + goto err; + status = ocrdma_add_mmap(uctx, uresp.page_addr[0], uresp.page_size); + if (status) { + ocrdma_del_mmap(uctx, uresp.db_page_addr, uresp.db_page_size); + goto err; + } + cq->ucontext = uctx; +err: + return status; +} + +struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ib_ctx, + struct ib_udata *udata) +{ + int entries = attr->cqe; + struct ocrdma_cq *cq; + struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); + struct ocrdma_ucontext *uctx = NULL; + u16 pd_id = 0; + int status; + struct ocrdma_create_cq_ureq ureq; + + if (attr->flags) + return ERR_PTR(-EINVAL); + + if (udata) { + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return ERR_PTR(-EFAULT); + } else + ureq.dpp_cq = 0; + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&cq->cq_lock); + spin_lock_init(&cq->comp_handler_lock); + INIT_LIST_HEAD(&cq->sq_head); + INIT_LIST_HEAD(&cq->rq_head); + + if (ib_ctx) { + uctx = get_ocrdma_ucontext(ib_ctx); + pd_id = uctx->cntxt_pd->id; + } + + status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id); + if (status) { + kfree(cq); + return ERR_PTR(status); + } + if (ib_ctx) { + status = ocrdma_copy_cq_uresp(dev, cq, udata, ib_ctx); + if (status) + goto ctx_err; + } + cq->phase = OCRDMA_CQE_VALID; + dev->cq_tbl[cq->id] = cq; + return &cq->ibcq; + +ctx_err: + ocrdma_mbx_destroy_cq(dev, cq); + kfree(cq); + return ERR_PTR(status); +} + +int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt, + struct ib_udata *udata) +{ + int status = 0; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + + if (new_cnt < 1 || new_cnt > cq->max_hw_cqe) { + status = -EINVAL; + return status; + } + ibcq->cqe = new_cnt; + return status; +} + +static void ocrdma_flush_cq(struct ocrdma_cq *cq) +{ + int cqe_cnt; + int valid_count = 0; + unsigned long flags; + + struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device); + struct ocrdma_cqe *cqe = NULL; + + cqe = cq->va; + cqe_cnt = cq->cqe_cnt; + + /* Last irq might have scheduled a polling thread + * sync-up with it before hard flushing. 
+ */ + spin_lock_irqsave(&cq->cq_lock, flags); + while (cqe_cnt) { + if (is_cqe_valid(cq, cqe)) + valid_count++; + cqe++; + cqe_cnt--; + } + ocrdma_ring_cq_db(dev, cq->id, false, false, valid_count); + spin_unlock_irqrestore(&cq->cq_lock, flags); +} + +int ocrdma_destroy_cq(struct ib_cq *ibcq) +{ + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_eq *eq = NULL; + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); + int pdid = 0; + u32 irq, indx; + + dev->cq_tbl[cq->id] = NULL; + indx = ocrdma_get_eq_table_index(dev, cq->eqn); + BUG_ON(indx == -EINVAL); + + eq = &dev->eq_tbl[indx]; + irq = ocrdma_get_irq(dev, eq); + synchronize_irq(irq); + ocrdma_flush_cq(cq); + + (void)ocrdma_mbx_destroy_cq(dev, cq); + if (cq->ucontext) { + pdid = cq->ucontext->cntxt_pd->id; + ocrdma_del_mmap(cq->ucontext, (u64) cq->pa, + PAGE_ALIGN(cq->len)); + ocrdma_del_mmap(cq->ucontext, + ocrdma_get_db_addr(dev, pdid), + dev->nic_info.db_page_size); + } + + kfree(cq); + return 0; +} + +static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) +{ + int status = -EINVAL; + + if (qp->id < OCRDMA_MAX_QP && dev->qp_tbl[qp->id] == NULL) { + dev->qp_tbl[qp->id] = qp; + status = 0; + } + return status; +} + +static void ocrdma_del_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) +{ + dev->qp_tbl[qp->id] = NULL; +} + +static int ocrdma_check_qp_params(struct ib_pd *ibpd, struct ocrdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if ((attrs->qp_type != IB_QPT_GSI) && + (attrs->qp_type != IB_QPT_RC) && + (attrs->qp_type != IB_QPT_UC) && + (attrs->qp_type != IB_QPT_UD)) { + pr_err("%s(%d) unsupported qp type=0x%x requested\n", + __func__, dev->id, attrs->qp_type); + return -EINVAL; + } + /* Skip the check for QP1 to support CM size of 128 */ + if ((attrs->qp_type != IB_QPT_GSI) && + (attrs->cap.max_send_wr > dev->attr.max_wqe)) { + pr_err("%s(%d) unsupported send_wr=0x%x requested\n", + __func__, dev->id, attrs->cap.max_send_wr); + pr_err("%s(%d) supported send_wr=0x%x\n", + __func__, dev->id, dev->attr.max_wqe); + return -EINVAL; + } + if (!attrs->srq && (attrs->cap.max_recv_wr > dev->attr.max_rqe)) { + pr_err("%s(%d) unsupported recv_wr=0x%x requested\n", + __func__, dev->id, attrs->cap.max_recv_wr); + pr_err("%s(%d) supported recv_wr=0x%x\n", + __func__, dev->id, dev->attr.max_rqe); + return -EINVAL; + } + if (attrs->cap.max_inline_data > dev->attr.max_inline_data) { + pr_err("%s(%d) unsupported inline data size=0x%x requested\n", + __func__, dev->id, attrs->cap.max_inline_data); + pr_err("%s(%d) supported inline data size=0x%x\n", + __func__, dev->id, dev->attr.max_inline_data); + return -EINVAL; + } + if (attrs->cap.max_send_sge > dev->attr.max_send_sge) { + pr_err("%s(%d) unsupported send_sge=0x%x requested\n", + __func__, dev->id, attrs->cap.max_send_sge); + pr_err("%s(%d) supported send_sge=0x%x\n", + __func__, dev->id, dev->attr.max_send_sge); + return -EINVAL; + } + if (attrs->cap.max_recv_sge > dev->attr.max_recv_sge) { + pr_err("%s(%d) unsupported recv_sge=0x%x requested\n", + __func__, dev->id, attrs->cap.max_recv_sge); + pr_err("%s(%d) supported recv_sge=0x%x\n", + __func__, dev->id, dev->attr.max_recv_sge); + return -EINVAL; + } + /* unprivileged user space cannot create special QP */ + if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) { + pr_err + ("%s(%d) Userspace can't create special QPs of type=0x%x\n", + __func__, dev->id, attrs->qp_type); + return -EINVAL; + } + /* allow creating only one GSI type of QP */ + if (attrs->qp_type == IB_QPT_GSI && dev->gsi_qp_created) { 
+ pr_err("%s(%d) GSI special QPs already created.\n", + __func__, dev->id); + return -EINVAL; + } + /* verify consumer QPs are not trying to use GSI QP's CQ */ + if ((attrs->qp_type != IB_QPT_GSI) && (dev->gsi_qp_created)) { + if ((dev->gsi_sqcq == get_ocrdma_cq(attrs->send_cq)) || + (dev->gsi_rqcq == get_ocrdma_cq(attrs->recv_cq))) { + pr_err("%s(%d) Consumer QP cannot use GSI CQs.\n", + __func__, dev->id); + return -EINVAL; + } + } + return 0; +} + +static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp, + struct ib_udata *udata, int dpp_offset, + int dpp_credit_lmt, int srq) +{ + int status; + u64 usr_db; + struct ocrdma_create_qp_uresp uresp; + struct ocrdma_pd *pd = qp->pd; + struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); + + memset(&uresp, 0, sizeof(uresp)); + usr_db = dev->nic_info.unmapped_db + + (pd->id * dev->nic_info.db_page_size); + uresp.qp_id = qp->id; + uresp.sq_dbid = qp->sq.dbid; + uresp.num_sq_pages = 1; + uresp.sq_page_size = PAGE_ALIGN(qp->sq.len); + uresp.sq_page_addr[0] = virt_to_phys(qp->sq.va); + uresp.num_wqe_allocated = qp->sq.max_cnt; + if (!srq) { + uresp.rq_dbid = qp->rq.dbid; + uresp.num_rq_pages = 1; + uresp.rq_page_size = PAGE_ALIGN(qp->rq.len); + uresp.rq_page_addr[0] = virt_to_phys(qp->rq.va); + uresp.num_rqe_allocated = qp->rq.max_cnt; + } + uresp.db_page_addr = usr_db; + uresp.db_page_size = dev->nic_info.db_page_size; + uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET; + uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; + uresp.db_shift = OCRDMA_DB_RQ_SHIFT; + + if (qp->dpp_enabled) { + uresp.dpp_credit = dpp_credit_lmt; + uresp.dpp_offset = dpp_offset; + } + status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (status) { + pr_err("%s(%d) user copy error.\n", __func__, dev->id); + goto err; + } + status = ocrdma_add_mmap(pd->uctx, uresp.sq_page_addr[0], + uresp.sq_page_size); + if (status) + goto err; + + if (!srq) { + status = ocrdma_add_mmap(pd->uctx, uresp.rq_page_addr[0], + uresp.rq_page_size); + if (status) + goto rq_map_err; + } + return status; +rq_map_err: + ocrdma_del_mmap(pd->uctx, uresp.sq_page_addr[0], uresp.sq_page_size); +err: + return status; +} + +static void ocrdma_set_qp_db(struct ocrdma_dev *dev, struct ocrdma_qp *qp, + struct ocrdma_pd *pd) +{ + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + qp->sq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_GEN2_SQ_OFFSET; + qp->rq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_GEN2_RQ_OFFSET; + } else { + qp->sq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_SQ_OFFSET; + qp->rq_db = dev->nic_info.db + + (pd->id * dev->nic_info.db_page_size) + + OCRDMA_DB_RQ_OFFSET; + } +} + +static int ocrdma_alloc_wr_id_tbl(struct ocrdma_qp *qp) +{ + qp->wqe_wr_id_tbl = + kzalloc(sizeof(*(qp->wqe_wr_id_tbl)) * qp->sq.max_cnt, + GFP_KERNEL); + if (qp->wqe_wr_id_tbl == NULL) + return -ENOMEM; + qp->rqe_wr_id_tbl = + kzalloc(sizeof(u64) * qp->rq.max_cnt, GFP_KERNEL); + if (qp->rqe_wr_id_tbl == NULL) + return -ENOMEM; + + return 0; +} + +static void ocrdma_set_qp_init_params(struct ocrdma_qp *qp, + struct ocrdma_pd *pd, + struct ib_qp_init_attr *attrs) +{ + qp->pd = pd; + spin_lock_init(&qp->q_lock); + INIT_LIST_HEAD(&qp->sq_entry); + INIT_LIST_HEAD(&qp->rq_entry); + + qp->qp_type = attrs->qp_type; + qp->cap_flags = OCRDMA_QP_INB_RD | OCRDMA_QP_INB_WR; + qp->max_inline_data = attrs->cap.max_inline_data; + qp->sq.max_sges = attrs->cap.max_send_sge; + qp->rq.max_sges = attrs->cap.max_recv_sge; + 
qp->state = OCRDMA_QPS_RST; + qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? true : false; +} + +static void ocrdma_store_gsi_qp_cq(struct ocrdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if (attrs->qp_type == IB_QPT_GSI) { + dev->gsi_qp_created = 1; + dev->gsi_sqcq = get_ocrdma_cq(attrs->send_cq); + dev->gsi_rqcq = get_ocrdma_cq(attrs->recv_cq); + } +} + +struct ib_qp *ocrdma_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + int status; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_qp *qp; + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_create_qp_ureq ureq; + u16 dpp_credit_lmt, dpp_offset; + + status = ocrdma_check_qp_params(ibpd, dev, attrs); + if (status) + goto gen_err; + + memset(&ureq, 0, sizeof(ureq)); + if (udata) { + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return ERR_PTR(-EFAULT); + } + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + status = -ENOMEM; + goto gen_err; + } + ocrdma_set_qp_init_params(qp, pd, attrs); + if (udata == NULL) + qp->cap_flags |= (OCRDMA_QP_MW_BIND | OCRDMA_QP_LKEY0 | + OCRDMA_QP_FAST_REG); + + mutex_lock(&dev->dev_lock); + status = ocrdma_mbx_create_qp(qp, attrs, ureq.enable_dpp_cq, + ureq.dpp_cq_id, + &dpp_offset, &dpp_credit_lmt); + if (status) + goto mbx_err; + + /* user space QP's wr_id table are managed in library */ + if (udata == NULL) { + status = ocrdma_alloc_wr_id_tbl(qp); + if (status) + goto map_err; + } + + status = ocrdma_add_qpn_map(dev, qp); + if (status) + goto map_err; + ocrdma_set_qp_db(dev, qp, pd); + if (udata) { + status = ocrdma_copy_qp_uresp(qp, udata, dpp_offset, + dpp_credit_lmt, + (attrs->srq != NULL)); + if (status) + goto cpy_err; + } + ocrdma_store_gsi_qp_cq(dev, attrs); + qp->ibqp.qp_num = qp->id; + mutex_unlock(&dev->dev_lock); + return &qp->ibqp; + +cpy_err: + ocrdma_del_qpn_map(dev, qp); +map_err: + ocrdma_mbx_destroy_qp(dev, qp); +mbx_err: + mutex_unlock(&dev->dev_lock); + kfree(qp->wqe_wr_id_tbl); + kfree(qp->rqe_wr_id_tbl); + kfree(qp); + pr_err("%s(%d) error=%d\n", __func__, dev->id, status); +gen_err: + return ERR_PTR(status); +} + +int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask) +{ + int status = 0; + struct ocrdma_qp *qp; + struct ocrdma_dev *dev; + enum ib_qp_state old_qps; + + qp = get_ocrdma_qp(ibqp); + dev = get_ocrdma_dev(ibqp->device); + if (attr_mask & IB_QP_STATE) + status = ocrdma_qp_state_change(qp, attr->qp_state, &old_qps); + /* if new and previous states are same hw doesn't need to + * know about it. 
+ */
+ if (status < 0)
+ return status;
+ return ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask);
+}
+
+int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ unsigned long flags;
+ int status = -EINVAL;
+ struct ocrdma_qp *qp;
+ struct ocrdma_dev *dev;
+ enum ib_qp_state old_qps, new_qps;
+
+ qp = get_ocrdma_qp(ibqp);
+ dev = get_ocrdma_dev(ibqp->device);
+
+ /* synchronize with multiple contexts trying to change or retrieve qps */
+ mutex_lock(&dev->dev_lock);
+ /* synchronize with wqe, rqe posting and cqe processing contexts */
+ spin_lock_irqsave(&qp->q_lock, flags);
+ old_qps = get_ibqp_state(qp->state);
+ if (attr_mask & IB_QP_STATE)
+ new_qps = attr->qp_state;
+ else
+ new_qps = old_qps;
+ spin_unlock_irqrestore(&qp->q_lock, flags);
+
+ if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask,
+ IB_LINK_LAYER_ETHERNET)) {
+ pr_err("%s(%d) invalid attribute mask=0x%x specified for\n"
+ "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n",
+ __func__, dev->id, attr_mask, qp->id, ibqp->qp_type,
+ old_qps, new_qps);
+ goto param_err;
+ }
+
+ status = _ocrdma_modify_qp(ibqp, attr, attr_mask);
+ if (status > 0)
+ status = 0;
+param_err:
+ mutex_unlock(&dev->dev_lock);
+ return status;
+}
+
+static enum ib_mtu ocrdma_mtu_int_to_enum(u16 mtu)
+{
+ switch (mtu) {
+ case 256:
+ return IB_MTU_256;
+ case 512:
+ return IB_MTU_512;
+ case 1024:
+ return IB_MTU_1024;
+ case 2048:
+ return IB_MTU_2048;
+ case 4096:
+ return IB_MTU_4096;
+ default:
+ return IB_MTU_1024;
+ }
+}
+
+static int ocrdma_to_ib_qp_acc_flags(int qp_cap_flags)
+{
+ int ib_qp_acc_flags = 0;
+
+ if (qp_cap_flags & OCRDMA_QP_INB_WR)
+ ib_qp_acc_flags |= IB_ACCESS_REMOTE_WRITE;
+ if (qp_cap_flags & OCRDMA_QP_INB_RD)
+ ib_qp_acc_flags |= IB_ACCESS_LOCAL_WRITE;
+ return ib_qp_acc_flags;
+}
+
+int ocrdma_query_qp(struct ib_qp *ibqp,
+ struct ib_qp_attr *qp_attr,
+ int attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+ int status;
+ u32 qp_state;
+ struct ocrdma_qp_params params;
+ struct ocrdma_qp *qp = get_ocrdma_qp(ibqp);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibqp->device);
+
+ memset(&params, 0, sizeof(params));
+ mutex_lock(&dev->dev_lock);
+ status = ocrdma_mbx_query_qp(dev, qp, &params);
+ mutex_unlock(&dev->dev_lock);
+ if (status)
+ goto mbx_err;
+ if (qp->qp_type == IB_QPT_UD)
+ qp_attr->qkey = params.qkey;
+ qp_attr->path_mtu =
+ ocrdma_mtu_int_to_enum(params.path_mtu_pkey_indx &
+ OCRDMA_QP_PARAMS_PATH_MTU_MASK) >>
+ OCRDMA_QP_PARAMS_PATH_MTU_SHIFT;
+ qp_attr->path_mig_state = IB_MIG_MIGRATED;
+ qp_attr->rq_psn = params.hop_lmt_rq_psn & OCRDMA_QP_PARAMS_RQ_PSN_MASK;
+ qp_attr->sq_psn = params.tclass_sq_psn & OCRDMA_QP_PARAMS_SQ_PSN_MASK;
+ qp_attr->dest_qp_num =
+ params.ack_to_rnr_rtc_dest_qpn & OCRDMA_QP_PARAMS_DEST_QPN_MASK;
+
+ qp_attr->qp_access_flags = ocrdma_to_ib_qp_acc_flags(qp->cap_flags);
+ qp_attr->cap.max_send_wr = qp->sq.max_cnt - 1;
+ qp_attr->cap.max_recv_wr = qp->rq.max_cnt - 1;
+ qp_attr->cap.max_send_sge = qp->sq.max_sges;
+ qp_attr->cap.max_recv_sge = qp->rq.max_sges;
+ qp_attr->cap.max_inline_data = qp->max_inline_data;
+ qp_init_attr->cap = qp_attr->cap;
+ qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
+
+ rdma_ah_set_grh(&qp_attr->ah_attr, NULL,
+ params.rnt_rc_sl_fl &
+ OCRDMA_QP_PARAMS_FLOW_LABEL_MASK,
+ qp->sgid_idx,
+ (params.hop_lmt_rq_psn &
+ OCRDMA_QP_PARAMS_HOP_LMT_MASK) >>
+ OCRDMA_QP_PARAMS_HOP_LMT_SHIFT,
+ (params.tclass_sq_psn &
+ OCRDMA_QP_PARAMS_TCLASS_MASK) >>
+ OCRDMA_QP_PARAMS_TCLASS_SHIFT);
+ rdma_ah_set_dgid_raw(&qp_attr->ah_attr, &params.dgid[0]);
+
+ rdma_ah_set_port_num(&qp_attr->ah_attr, 1);
+ rdma_ah_set_sl(&qp_attr->ah_attr, (params.rnt_rc_sl_fl &
+ OCRDMA_QP_PARAMS_SL_MASK) >>
+ OCRDMA_QP_PARAMS_SL_SHIFT);
+ qp_attr->timeout = (params.ack_to_rnr_rtc_dest_qpn &
+ OCRDMA_QP_PARAMS_ACK_TIMEOUT_MASK) >>
+ OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT;
+ qp_attr->rnr_retry = (params.ack_to_rnr_rtc_dest_qpn &
+ OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK) >>
+ OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT;
+ qp_attr->retry_cnt =
+ (params.rnt_rc_sl_fl & OCRDMA_QP_PARAMS_RETRY_CNT_MASK) >>
+ OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT;
+ qp_attr->min_rnr_timer = 0;
+ qp_attr->pkey_index = 0;
+ qp_attr->port_num = 1;
+ rdma_ah_set_path_bits(&qp_attr->ah_attr, 0);
+ rdma_ah_set_static_rate(&qp_attr->ah_attr, 0);
+ qp_attr->alt_pkey_index = 0;
+ qp_attr->alt_port_num = 0;
+ qp_attr->alt_timeout = 0;
+ memset(&qp_attr->alt_ah_attr, 0, sizeof(qp_attr->alt_ah_attr));
+ qp_state = (params.max_sge_recv_flags & OCRDMA_QP_PARAMS_STATE_MASK) >>
+ OCRDMA_QP_PARAMS_STATE_SHIFT;
+ qp_attr->qp_state = get_ibqp_state(qp_state);
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->sq_draining = (qp_state == OCRDMA_QPS_SQ_DRAINING) ? 1 : 0;
+ qp_attr->max_dest_rd_atomic =
+ params.max_ord_ird >> OCRDMA_QP_PARAMS_MAX_ORD_SHIFT;
+ qp_attr->max_rd_atomic =
+ params.max_ord_ird & OCRDMA_QP_PARAMS_MAX_IRD_MASK;
+ qp_attr->en_sqd_async_notify = (params.max_sge_recv_flags &
+ OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC) ? 1 : 0;
+ /* Sync driver QP state with FW */
+ ocrdma_qp_state_change(qp, qp_attr->qp_state, NULL);
+mbx_err:
+ return status;
+}
+
+static void ocrdma_srq_toggle_bit(struct ocrdma_srq *srq, unsigned int idx)
+{
+ unsigned int i = idx / 32;
+ u32 mask = (1U << (idx % 32));
+
+ srq->idx_bit_fields[i] ^= mask;
+}
+
+static int ocrdma_hwq_free_cnt(struct ocrdma_qp_hwq_info *q)
+{
+ return ((q->max_wqe_idx - q->head) + q->tail) % q->max_cnt;
+}
+
+static int is_hw_sq_empty(struct ocrdma_qp *qp)
+{
+ return (qp->sq.tail == qp->sq.head);
+}
+
+static int is_hw_rq_empty(struct ocrdma_qp *qp)
+{
+ return (qp->rq.tail == qp->rq.head);
+}
+
+static void *ocrdma_hwq_head(struct ocrdma_qp_hwq_info *q)
+{
+ return q->va + (q->head * q->entry_size);
+}
+
+static void *ocrdma_hwq_head_from_idx(struct ocrdma_qp_hwq_info *q,
+ u32 idx)
+{
+ return q->va + (idx * q->entry_size);
+}
+
+static void ocrdma_hwq_inc_head(struct ocrdma_qp_hwq_info *q)
+{
+ q->head = (q->head + 1) & q->max_wqe_idx;
+}
+
+static void ocrdma_hwq_inc_tail(struct ocrdma_qp_hwq_info *q)
+{
+ q->tail = (q->tail + 1) & q->max_wqe_idx;
+}
+
+/* discard the cqe for a given QP */
+static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq)
+{
+ unsigned long cq_flags;
+ unsigned long flags;
+ int discard_cnt = 0;
+ u32 cur_getp, stop_getp;
+ struct ocrdma_cqe *cqe;
+ u32 qpn = 0, wqe_idx = 0;
+
+ spin_lock_irqsave(&cq->cq_lock, cq_flags);
+
+ /* traverse through the CQEs in the hw CQ,
+ * find the matching CQE for a given qp,
+ * mark the matching one discarded by clearing qpn.
+ * ring the doorbell in the poll_cq() as
+ * we don't complete out of order cqe.
+ */
+
+ cur_getp = cq->getp;
+ /* find up to where we reap the cq. */
+ stop_getp = cur_getp;
+ do {
+ if (is_hw_sq_empty(qp) && (!qp->srq && is_hw_rq_empty(qp)))
+ break;
+
+ cqe = cq->va + cur_getp;
+ /* if (a) done reaping whole hw cq, or
+ * (b) qp_xq becomes empty.
+ * then exit
+ */
+ qpn = cqe->cmn.qpn & OCRDMA_CQE_QPN_MASK;
+ /* if previously discarded cqe found, skip that too.
+ */
+ /* check for matching qp */
+ if (qpn == 0 || qpn != qp->id)
+ goto skip_cqe;
+
+ if (is_cqe_for_sq(cqe)) {
+ ocrdma_hwq_inc_tail(&qp->sq);
+ } else {
+ if (qp->srq) {
+ wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >>
+ OCRDMA_CQE_BUFTAG_SHIFT) &
+ qp->srq->rq.max_wqe_idx;
+ BUG_ON(wqe_idx < 1);
+ spin_lock_irqsave(&qp->srq->q_lock, flags);
+ ocrdma_hwq_inc_tail(&qp->srq->rq);
+ ocrdma_srq_toggle_bit(qp->srq, wqe_idx - 1);
+ spin_unlock_irqrestore(&qp->srq->q_lock, flags);
+
+ } else {
+ ocrdma_hwq_inc_tail(&qp->rq);
+ }
+ }
+ /* mark cqe discarded so that it is not picked up later
+ * in the poll_cq().
+ */
+ discard_cnt += 1;
+ cqe->cmn.qpn = 0;
+skip_cqe:
+ cur_getp = (cur_getp + 1) % cq->max_hw_cqe;
+ } while (cur_getp != stop_getp);
+ spin_unlock_irqrestore(&cq->cq_lock, cq_flags);
+}
+
+void ocrdma_del_flush_qp(struct ocrdma_qp *qp)
+{
+ int found = false;
+ unsigned long flags;
+ struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device);
+ /* sync with any active CQ poll */
+
+ spin_lock_irqsave(&dev->flush_q_lock, flags);
+ found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp);
+ if (found)
+ list_del(&qp->sq_entry);
+ if (!qp->srq) {
+ found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp);
+ if (found)
+ list_del(&qp->rq_entry);
+ }
+ spin_unlock_irqrestore(&dev->flush_q_lock, flags);
+}
+
+int ocrdma_destroy_qp(struct ib_qp *ibqp)
+{
+ struct ocrdma_pd *pd;
+ struct ocrdma_qp *qp;
+ struct ocrdma_dev *dev;
+ struct ib_qp_attr attrs;
+ int attr_mask;
+ unsigned long flags;
+
+ qp = get_ocrdma_qp(ibqp);
+ dev = get_ocrdma_dev(ibqp->device);
+
+ pd = qp->pd;
+
+ /* change the QP state to ERROR */
+ if (qp->state != OCRDMA_QPS_RST) {
+ attrs.qp_state = IB_QPS_ERR;
+ attr_mask = IB_QP_STATE;
+ _ocrdma_modify_qp(ibqp, &attrs, attr_mask);
+ }
+ /* ensure that CQEs for a newly created QP (whose id may be the same
+ * as that of a QP just getting destroyed) don't get discarded until
+ * the old QP's CQEs are discarded.
+ */
+ mutex_lock(&dev->dev_lock);
+ (void) ocrdma_mbx_destroy_qp(dev, qp);
+
+ /*
+ * acquire CQ lock while destroy is in progress, in order to
+ * protect against processing in-flight CQEs for this QP.
+ */ + spin_lock_irqsave(&qp->sq_cq->cq_lock, flags); + if (qp->rq_cq && (qp->rq_cq != qp->sq_cq)) + spin_lock(&qp->rq_cq->cq_lock); + + ocrdma_del_qpn_map(dev, qp); + + if (qp->rq_cq && (qp->rq_cq != qp->sq_cq)) + spin_unlock(&qp->rq_cq->cq_lock); + spin_unlock_irqrestore(&qp->sq_cq->cq_lock, flags); + + if (!pd->uctx) { + ocrdma_discard_cqes(qp, qp->sq_cq); + ocrdma_discard_cqes(qp, qp->rq_cq); + } + mutex_unlock(&dev->dev_lock); + + if (pd->uctx) { + ocrdma_del_mmap(pd->uctx, (u64) qp->sq.pa, + PAGE_ALIGN(qp->sq.len)); + if (!qp->srq) + ocrdma_del_mmap(pd->uctx, (u64) qp->rq.pa, + PAGE_ALIGN(qp->rq.len)); + } + + ocrdma_del_flush_qp(qp); + + kfree(qp->wqe_wr_id_tbl); + kfree(qp->rqe_wr_id_tbl); + kfree(qp); + return 0; +} + +static int ocrdma_copy_srq_uresp(struct ocrdma_dev *dev, struct ocrdma_srq *srq, + struct ib_udata *udata) +{ + int status; + struct ocrdma_create_srq_uresp uresp; + + memset(&uresp, 0, sizeof(uresp)); + uresp.rq_dbid = srq->rq.dbid; + uresp.num_rq_pages = 1; + uresp.rq_page_addr[0] = virt_to_phys(srq->rq.va); + uresp.rq_page_size = srq->rq.len; + uresp.db_page_addr = dev->nic_info.unmapped_db + + (srq->pd->id * dev->nic_info.db_page_size); + uresp.db_page_size = dev->nic_info.db_page_size; + uresp.num_rqe_allocated = srq->rq.max_cnt; + if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) { + uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET; + uresp.db_shift = 24; + } else { + uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET; + uresp.db_shift = 16; + } + + status = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (status) + return status; + status = ocrdma_add_mmap(srq->pd->uctx, uresp.rq_page_addr[0], + uresp.rq_page_size); + if (status) + return status; + return status; +} + +struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + int status = -ENOMEM; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + struct ocrdma_srq *srq; + + if (init_attr->attr.max_sge > dev->attr.max_recv_sge) + return ERR_PTR(-EINVAL); + if (init_attr->attr.max_wr > dev->attr.max_rqe) + return ERR_PTR(-EINVAL); + + srq = kzalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(status); + + spin_lock_init(&srq->q_lock); + srq->pd = pd; + srq->db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size); + status = ocrdma_mbx_create_srq(dev, srq, init_attr, pd); + if (status) + goto err; + + if (udata == NULL) { + status = -ENOMEM; + srq->rqe_wr_id_tbl = kzalloc(sizeof(u64) * srq->rq.max_cnt, + GFP_KERNEL); + if (srq->rqe_wr_id_tbl == NULL) + goto arm_err; + + srq->bit_fields_len = (srq->rq.max_cnt / 32) + + (srq->rq.max_cnt % 32 ? 
1 : 0); + srq->idx_bit_fields = + kmalloc(srq->bit_fields_len * sizeof(u32), GFP_KERNEL); + if (srq->idx_bit_fields == NULL) + goto arm_err; + memset(srq->idx_bit_fields, 0xff, + srq->bit_fields_len * sizeof(u32)); + } + + if (init_attr->attr.srq_limit) { + status = ocrdma_mbx_modify_srq(srq, &init_attr->attr); + if (status) + goto arm_err; + } + + if (udata) { + status = ocrdma_copy_srq_uresp(dev, srq, udata); + if (status) + goto arm_err; + } + + return &srq->ibsrq; + +arm_err: + ocrdma_mbx_destroy_srq(dev, srq); +err: + kfree(srq->rqe_wr_id_tbl); + kfree(srq->idx_bit_fields); + kfree(srq); + return ERR_PTR(status); +} + +int ocrdma_modify_srq(struct ib_srq *ibsrq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata) +{ + int status; + struct ocrdma_srq *srq; + + srq = get_ocrdma_srq(ibsrq); + if (srq_attr_mask & IB_SRQ_MAX_WR) + status = -EINVAL; + else + status = ocrdma_mbx_modify_srq(srq, srq_attr); + return status; +} + +int ocrdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + int status; + struct ocrdma_srq *srq; + + srq = get_ocrdma_srq(ibsrq); + status = ocrdma_mbx_query_srq(srq, srq_attr); + return status; +} + +int ocrdma_destroy_srq(struct ib_srq *ibsrq) +{ + int status; + struct ocrdma_srq *srq; + struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device); + + srq = get_ocrdma_srq(ibsrq); + + status = ocrdma_mbx_destroy_srq(dev, srq); + + if (srq->pd->uctx) + ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa, + PAGE_ALIGN(srq->rq.len)); + + kfree(srq->idx_bit_fields); + kfree(srq->rqe_wr_id_tbl); + kfree(srq); + return status; +} + +/* unprivileged verbs and their support functions. */ +static void ocrdma_build_ud_hdr(struct ocrdma_qp *qp, + struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + struct ocrdma_ewqe_ud_hdr *ud_hdr = + (struct ocrdma_ewqe_ud_hdr *)(hdr + 1); + struct ocrdma_ah *ah = get_ocrdma_ah(ud_wr(wr)->ah); + + ud_hdr->rsvd_dest_qpn = ud_wr(wr)->remote_qpn; + if (qp->qp_type == IB_QPT_GSI) + ud_hdr->qkey = qp->qkey; + else + ud_hdr->qkey = ud_wr(wr)->remote_qkey; + ud_hdr->rsvd_ahid = ah->id; + ud_hdr->hdr_type = ah->hdr_type; + if (ah->av->valid & OCRDMA_AV_VLAN_VALID) + hdr->cw |= (OCRDMA_FLAG_AH_VLAN_PR << OCRDMA_WQE_FLAGS_SHIFT); +} + +static void ocrdma_build_sges(struct ocrdma_hdr_wqe *hdr, + struct ocrdma_sge *sge, int num_sge, + struct ib_sge *sg_list) +{ + int i; + + for (i = 0; i < num_sge; i++) { + sge[i].lrkey = sg_list[i].lkey; + sge[i].addr_lo = sg_list[i].addr; + sge[i].addr_hi = upper_32_bits(sg_list[i].addr); + sge[i].len = sg_list[i].length; + hdr->total_len += sg_list[i].length; + } + if (num_sge == 0) + memset(sge, 0, sizeof(*sge)); +} + +static inline uint32_t ocrdma_sglist_len(struct ib_sge *sg_list, int num_sge) +{ + uint32_t total_len = 0, i; + + for (i = 0; i < num_sge; i++) + total_len += sg_list[i].length; + return total_len; +} + + +static int ocrdma_build_inline_sges(struct ocrdma_qp *qp, + struct ocrdma_hdr_wqe *hdr, + struct ocrdma_sge *sge, + struct ib_send_wr *wr, u32 wqe_size) +{ + int i; + char *dpp_addr; + + if (wr->send_flags & IB_SEND_INLINE && qp->qp_type != IB_QPT_UD) { + hdr->total_len = ocrdma_sglist_len(wr->sg_list, wr->num_sge); + if (unlikely(hdr->total_len > qp->max_inline_data)) { + pr_err("%s() supported_len=0x%x,\n" + " unsupported len req=0x%x\n", __func__, + qp->max_inline_data, hdr->total_len); + return -EINVAL; + } + dpp_addr = (char *)sge; + for (i = 0; i < wr->num_sge; i++) { + memcpy(dpp_addr, + (void *)(unsigned long)wr->sg_list[i].addr, + 
wr->sg_list[i].length); + dpp_addr += wr->sg_list[i].length; + } + + wqe_size += roundup(hdr->total_len, OCRDMA_WQE_ALIGN_BYTES); + if (0 == hdr->total_len) + wqe_size += sizeof(struct ocrdma_sge); + hdr->cw |= (OCRDMA_TYPE_INLINE << OCRDMA_WQE_TYPE_SHIFT); + } else { + ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list); + if (wr->num_sge) + wqe_size += (wr->num_sge * sizeof(struct ocrdma_sge)); + else + wqe_size += sizeof(struct ocrdma_sge); + hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + } + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + return 0; +} + +static int ocrdma_build_send(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + int status; + struct ocrdma_sge *sge; + u32 wqe_size = sizeof(*hdr); + + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) { + ocrdma_build_ud_hdr(qp, hdr, wr); + sge = (struct ocrdma_sge *)(hdr + 2); + wqe_size += sizeof(struct ocrdma_ewqe_ud_hdr); + } else { + sge = (struct ocrdma_sge *)(hdr + 1); + } + + status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size); + return status; +} + +static int ocrdma_build_write(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + int status; + struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1); + struct ocrdma_sge *sge = ext_rw + 1; + u32 wqe_size = sizeof(*hdr) + sizeof(*ext_rw); + + status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size); + if (status) + return status; + ext_rw->addr_lo = rdma_wr(wr)->remote_addr; + ext_rw->addr_hi = upper_32_bits(rdma_wr(wr)->remote_addr); + ext_rw->lrkey = rdma_wr(wr)->rkey; + ext_rw->len = hdr->total_len; + return 0; +} + +static void ocrdma_build_read(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ib_send_wr *wr) +{ + struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1); + struct ocrdma_sge *sge = ext_rw + 1; + u32 wqe_size = ((wr->num_sge + 1) * sizeof(struct ocrdma_sge)) + + sizeof(struct ocrdma_hdr_wqe); + + ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list); + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + hdr->cw |= (OCRDMA_READ << OCRDMA_WQE_OPCODE_SHIFT); + hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + + ext_rw->addr_lo = rdma_wr(wr)->remote_addr; + ext_rw->addr_hi = upper_32_bits(rdma_wr(wr)->remote_addr); + ext_rw->lrkey = rdma_wr(wr)->rkey; + ext_rw->len = hdr->total_len; +} + +static int get_encoded_page_size(int pg_sz) +{ + /* Max size is 256M 4096 << 16 */ + int i = 0; + for (; i < 17; i++) + if (pg_sz == (4096 << i)) + break; + return i; +} + +static int ocrdma_build_reg(struct ocrdma_qp *qp, + struct ocrdma_hdr_wqe *hdr, + struct ib_reg_wr *wr) +{ + u64 fbo; + struct ocrdma_ewqe_fr *fast_reg = (struct ocrdma_ewqe_fr *)(hdr + 1); + struct ocrdma_mr *mr = get_ocrdma_mr(wr->mr); + struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table; + struct ocrdma_pbe *pbe; + u32 wqe_size = sizeof(*fast_reg) + sizeof(*hdr); + int num_pbes = 0, i; + + wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES); + + hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT); + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + + if (wr->access & IB_ACCESS_LOCAL_WRITE) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_LOCAL_WR; + if (wr->access & IB_ACCESS_REMOTE_WRITE) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_WR; + if (wr->access & IB_ACCESS_REMOTE_READ) + hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_RD; + hdr->lkey = wr->key; + hdr->total_len = mr->ibmr.length; + + fbo = mr->ibmr.iova - mr->pages[0]; 
+ + fast_reg->va_hi = upper_32_bits(mr->ibmr.iova); + fast_reg->va_lo = (u32) (mr->ibmr.iova & 0xffffffff); + fast_reg->fbo_hi = upper_32_bits(fbo); + fast_reg->fbo_lo = (u32) fbo & 0xffffffff; + fast_reg->num_sges = mr->npages; + fast_reg->size_sge = get_encoded_page_size(mr->ibmr.page_size); + + pbe = pbl_tbl->va; + for (i = 0; i < mr->npages; i++) { + u64 buf_addr = mr->pages[i]; + + pbe->pa_lo = cpu_to_le32((u32) (buf_addr & PAGE_MASK)); + pbe->pa_hi = cpu_to_le32((u32) upper_32_bits(buf_addr)); + num_pbes += 1; + pbe++; + + /* if the pbl is full storing the pbes, + * move to next pbl. + */ + if (num_pbes == (mr->hwmr.pbl_size/sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + } + } + + return 0; +} + +static void ocrdma_ring_sq_db(struct ocrdma_qp *qp) +{ + u32 val = qp->sq.dbid | (1 << OCRDMA_DB_SQ_SHIFT); + + iowrite32(val, qp->sq_db); +} + +int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int status = 0; + struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); + struct ocrdma_hdr_wqe *hdr; + unsigned long flags; + + spin_lock_irqsave(&qp->q_lock, flags); + if (qp->state != OCRDMA_QPS_RTS && qp->state != OCRDMA_QPS_SQD) { + spin_unlock_irqrestore(&qp->q_lock, flags); + *bad_wr = wr; + return -EINVAL; + } + + while (wr) { + if (qp->qp_type == IB_QPT_UD && + (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM)) { + *bad_wr = wr; + status = -EINVAL; + break; + } + if (ocrdma_hwq_free_cnt(&qp->sq) == 0 || + wr->num_sge > qp->sq.max_sges) { + *bad_wr = wr; + status = -ENOMEM; + break; + } + hdr = ocrdma_hwq_head(&qp->sq); + hdr->cw = 0; + if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled) + hdr->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT); + if (wr->send_flags & IB_SEND_FENCE) + hdr->cw |= + (OCRDMA_FLAG_FENCE_L << OCRDMA_WQE_FLAGS_SHIFT); + if (wr->send_flags & IB_SEND_SOLICITED) + hdr->cw |= + (OCRDMA_FLAG_SOLICIT << OCRDMA_WQE_FLAGS_SHIFT); + hdr->total_len = 0; + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT); + hdr->immdt = ntohl(wr->ex.imm_data); + /* fall through */ + case IB_WR_SEND: + hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT); + ocrdma_build_send(qp, hdr, wr); + break; + case IB_WR_SEND_WITH_INV: + hdr->cw |= (OCRDMA_FLAG_INV << OCRDMA_WQE_FLAGS_SHIFT); + hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT); + hdr->lkey = wr->ex.invalidate_rkey; + status = ocrdma_build_send(qp, hdr, wr); + break; + case IB_WR_RDMA_WRITE_WITH_IMM: + hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT); + hdr->immdt = ntohl(wr->ex.imm_data); + /* fall through */ + case IB_WR_RDMA_WRITE: + hdr->cw |= (OCRDMA_WRITE << OCRDMA_WQE_OPCODE_SHIFT); + status = ocrdma_build_write(qp, hdr, wr); + break; + case IB_WR_RDMA_READ: + ocrdma_build_read(qp, hdr, wr); + break; + case IB_WR_LOCAL_INV: + hdr->cw |= + (OCRDMA_LKEY_INV << OCRDMA_WQE_OPCODE_SHIFT); + hdr->cw |= ((sizeof(struct ocrdma_hdr_wqe) + + sizeof(struct ocrdma_sge)) / + OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT; + hdr->lkey = wr->ex.invalidate_rkey; + break; + case IB_WR_REG_MR: + status = ocrdma_build_reg(qp, hdr, reg_wr(wr)); + break; + default: + status = -EINVAL; + break; + } + if (status) { + *bad_wr = wr; + break; + } + if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled) + qp->wqe_wr_id_tbl[qp->sq.head].signaled = 1; + else + qp->wqe_wr_id_tbl[qp->sq.head].signaled = 0; + qp->wqe_wr_id_tbl[qp->sq.head].wrid = wr->wr_id; + ocrdma_cpu_to_le32(hdr, ((hdr->cw >> OCRDMA_WQE_SIZE_SHIFT) & 
+ OCRDMA_WQE_SIZE_MASK) * OCRDMA_WQE_STRIDE); + /* make sure wqe is written before adapter can access it */ + wmb(); + /* inform hw to start processing it */ + ocrdma_ring_sq_db(qp); + + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&qp->sq); + wr = wr->next; + } + spin_unlock_irqrestore(&qp->q_lock, flags); + return status; +} + +static void ocrdma_ring_rq_db(struct ocrdma_qp *qp) +{ + u32 val = qp->rq.dbid | (1 << OCRDMA_DB_RQ_SHIFT); + + iowrite32(val, qp->rq_db); +} + +static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, struct ib_recv_wr *wr, + u16 tag) +{ + u32 wqe_size = 0; + struct ocrdma_sge *sge; + if (wr->num_sge) + wqe_size = (wr->num_sge * sizeof(*sge)) + sizeof(*rqe); + else + wqe_size = sizeof(*sge) + sizeof(*rqe); + + rqe->cw = ((wqe_size / OCRDMA_WQE_STRIDE) << + OCRDMA_WQE_SIZE_SHIFT); + rqe->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT); + rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + rqe->total_len = 0; + rqe->rsvd_tag = tag; + sge = (struct ocrdma_sge *)(rqe + 1); + ocrdma_build_sges(rqe, sge, wr->num_sge, wr->sg_list); + ocrdma_cpu_to_le32(rqe, wqe_size); +} + +int ocrdma_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int status = 0; + unsigned long flags; + struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); + struct ocrdma_hdr_wqe *rqe; + + spin_lock_irqsave(&qp->q_lock, flags); + if (qp->state == OCRDMA_QPS_RST || qp->state == OCRDMA_QPS_ERR) { + spin_unlock_irqrestore(&qp->q_lock, flags); + *bad_wr = wr; + return -EINVAL; + } + while (wr) { + if (ocrdma_hwq_free_cnt(&qp->rq) == 0 || + wr->num_sge > qp->rq.max_sges) { + *bad_wr = wr; + status = -ENOMEM; + break; + } + rqe = ocrdma_hwq_head(&qp->rq); + ocrdma_build_rqe(rqe, wr, 0); + + qp->rqe_wr_id_tbl[qp->rq.head] = wr->wr_id; + /* make sure rqe is written before adapter can access it */ + wmb(); + + /* inform hw to start processing it */ + ocrdma_ring_rq_db(qp); + + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&qp->rq); + wr = wr->next; + } + spin_unlock_irqrestore(&qp->q_lock, flags); + return status; +} + +/* cqe for srq's rqe can potentially arrive out of order. + * index gives the entry in the shadow table where to store + * the wr_id. tag/index is returned in cqe to reference back + * for a given rqe. 
+ */ +static int ocrdma_srq_get_idx(struct ocrdma_srq *srq) +{ + int row = 0; + int indx = 0; + + for (row = 0; row < srq->bit_fields_len; row++) { + if (srq->idx_bit_fields[row]) { + indx = ffs(srq->idx_bit_fields[row]); + indx = (row * 32) + (indx - 1); + BUG_ON(indx >= srq->rq.max_cnt); + ocrdma_srq_toggle_bit(srq, indx); + break; + } + } + + BUG_ON(row == srq->bit_fields_len); + return indx + 1; /* Use from index 1 */ +} + +static void ocrdma_ring_srq_db(struct ocrdma_srq *srq) +{ + u32 val = srq->rq.dbid | (1 << 16); + + iowrite32(val, srq->db + OCRDMA_DB_GEN2_SRQ_OFFSET); +} + +int ocrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int status = 0; + unsigned long flags; + struct ocrdma_srq *srq; + struct ocrdma_hdr_wqe *rqe; + u16 tag; + + srq = get_ocrdma_srq(ibsrq); + + spin_lock_irqsave(&srq->q_lock, flags); + while (wr) { + if (ocrdma_hwq_free_cnt(&srq->rq) == 0 || + wr->num_sge > srq->rq.max_sges) { + status = -ENOMEM; + *bad_wr = wr; + break; + } + tag = ocrdma_srq_get_idx(srq); + rqe = ocrdma_hwq_head(&srq->rq); + ocrdma_build_rqe(rqe, wr, tag); + + srq->rqe_wr_id_tbl[tag] = wr->wr_id; + /* make sure rqe is written before adapter can perform DMA */ + wmb(); + /* inform hw to start processing it */ + ocrdma_ring_srq_db(srq); + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&srq->rq); + wr = wr->next; + } + spin_unlock_irqrestore(&srq->q_lock, flags); + return status; +} + +static enum ib_wc_status ocrdma_to_ibwc_err(u16 status) +{ + enum ib_wc_status ibwc_status; + + switch (status) { + case OCRDMA_CQE_GENERAL_ERR: + ibwc_status = IB_WC_GENERAL_ERR; + break; + case OCRDMA_CQE_LOC_LEN_ERR: + ibwc_status = IB_WC_LOC_LEN_ERR; + break; + case OCRDMA_CQE_LOC_QP_OP_ERR: + ibwc_status = IB_WC_LOC_QP_OP_ERR; + break; + case OCRDMA_CQE_LOC_EEC_OP_ERR: + ibwc_status = IB_WC_LOC_EEC_OP_ERR; + break; + case OCRDMA_CQE_LOC_PROT_ERR: + ibwc_status = IB_WC_LOC_PROT_ERR; + break; + case OCRDMA_CQE_WR_FLUSH_ERR: + ibwc_status = IB_WC_WR_FLUSH_ERR; + break; + case OCRDMA_CQE_MW_BIND_ERR: + ibwc_status = IB_WC_MW_BIND_ERR; + break; + case OCRDMA_CQE_BAD_RESP_ERR: + ibwc_status = IB_WC_BAD_RESP_ERR; + break; + case OCRDMA_CQE_LOC_ACCESS_ERR: + ibwc_status = IB_WC_LOC_ACCESS_ERR; + break; + case OCRDMA_CQE_REM_INV_REQ_ERR: + ibwc_status = IB_WC_REM_INV_REQ_ERR; + break; + case OCRDMA_CQE_REM_ACCESS_ERR: + ibwc_status = IB_WC_REM_ACCESS_ERR; + break; + case OCRDMA_CQE_REM_OP_ERR: + ibwc_status = IB_WC_REM_OP_ERR; + break; + case OCRDMA_CQE_RETRY_EXC_ERR: + ibwc_status = IB_WC_RETRY_EXC_ERR; + break; + case OCRDMA_CQE_RNR_RETRY_EXC_ERR: + ibwc_status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case OCRDMA_CQE_LOC_RDD_VIOL_ERR: + ibwc_status = IB_WC_LOC_RDD_VIOL_ERR; + break; + case OCRDMA_CQE_REM_INV_RD_REQ_ERR: + ibwc_status = IB_WC_REM_INV_RD_REQ_ERR; + break; + case OCRDMA_CQE_REM_ABORT_ERR: + ibwc_status = IB_WC_REM_ABORT_ERR; + break; + case OCRDMA_CQE_INV_EECN_ERR: + ibwc_status = IB_WC_INV_EECN_ERR; + break; + case OCRDMA_CQE_INV_EEC_STATE_ERR: + ibwc_status = IB_WC_INV_EEC_STATE_ERR; + break; + case OCRDMA_CQE_FATAL_ERR: + ibwc_status = IB_WC_FATAL_ERR; + break; + case OCRDMA_CQE_RESP_TIMEOUT_ERR: + ibwc_status = IB_WC_RESP_TIMEOUT_ERR; + break; + default: + ibwc_status = IB_WC_GENERAL_ERR; + break; + } + return ibwc_status; +} + +static void ocrdma_update_wc(struct ocrdma_qp *qp, struct ib_wc *ibwc, + u32 wqe_idx) +{ + struct ocrdma_hdr_wqe *hdr; + struct ocrdma_sge *rw; + int opcode; + + hdr = ocrdma_hwq_head_from_idx(&qp->sq, 
wqe_idx); + + ibwc->wr_id = qp->wqe_wr_id_tbl[wqe_idx].wrid; + /* Undo the hdr->cw swap */ + opcode = le32_to_cpu(hdr->cw) & OCRDMA_WQE_OPCODE_MASK; + switch (opcode) { + case OCRDMA_WRITE: + ibwc->opcode = IB_WC_RDMA_WRITE; + break; + case OCRDMA_READ: + rw = (struct ocrdma_sge *)(hdr + 1); + ibwc->opcode = IB_WC_RDMA_READ; + ibwc->byte_len = rw->len; + break; + case OCRDMA_SEND: + ibwc->opcode = IB_WC_SEND; + break; + case OCRDMA_FR_MR: + ibwc->opcode = IB_WC_REG_MR; + break; + case OCRDMA_LKEY_INV: + ibwc->opcode = IB_WC_LOCAL_INV; + break; + default: + ibwc->status = IB_WC_GENERAL_ERR; + pr_err("%s() invalid opcode received = 0x%x\n", + __func__, hdr->cw & OCRDMA_WQE_OPCODE_MASK); + break; + } +} + +static void ocrdma_set_cqe_status_flushed(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe) +{ + if (is_cqe_for_sq(cqe)) { + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) & + ~OCRDMA_CQE_STATUS_MASK); + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) | + (OCRDMA_CQE_WR_FLUSH_ERR << + OCRDMA_CQE_STATUS_SHIFT)); + } else { + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) { + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) & + ~OCRDMA_CQE_UD_STATUS_MASK); + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) | + (OCRDMA_CQE_WR_FLUSH_ERR << + OCRDMA_CQE_UD_STATUS_SHIFT)); + } else { + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) & + ~OCRDMA_CQE_STATUS_MASK); + cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu( + cqe->flags_status_srcqpn) | + (OCRDMA_CQE_WR_FLUSH_ERR << + OCRDMA_CQE_STATUS_SHIFT)); + } + } +} + +static bool ocrdma_update_err_cqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp, int status) +{ + bool expand = false; + + ibwc->byte_len = 0; + ibwc->qp = &qp->ibqp; + ibwc->status = ocrdma_to_ibwc_err(status); + + ocrdma_flush_qp(qp); + ocrdma_qp_state_change(qp, IB_QPS_ERR, NULL); + + /* if wqe/rqe pending for which cqe needs to be returned, + * trigger inflating it. + */ + if (!is_hw_rq_empty(qp) || !is_hw_sq_empty(qp)) { + expand = true; + ocrdma_set_cqe_status_flushed(qp, cqe); + } + return expand; +} + +static int ocrdma_update_err_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp, int status) +{ + ibwc->opcode = IB_WC_RECV; + ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; + ocrdma_hwq_inc_tail(&qp->rq); + + return ocrdma_update_err_cqe(ibwc, cqe, qp, status); +} + +static int ocrdma_update_err_scqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp, int status) +{ + ocrdma_update_wc(qp, ibwc, qp->sq.tail); + ocrdma_hwq_inc_tail(&qp->sq); + + return ocrdma_update_err_cqe(ibwc, cqe, qp, status); +} + + +static bool ocrdma_poll_err_scqe(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe, struct ib_wc *ibwc, + bool *polled, bool *stop) +{ + bool expand; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + int status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT; + if (status < OCRDMA_MAX_CQE_ERR) + atomic_inc(&dev->cqe_err_stats[status]); + + /* when hw sq is empty, but rq is not empty, so we continue + * to keep the cqe in order to get the cq event again. + */ + if (is_hw_sq_empty(qp) && !is_hw_rq_empty(qp)) { + /* when cq for rq and sq is same, it is safe to return + * flush cqe for RQEs. 
+ */ + if (!qp->srq && (qp->sq_cq == qp->rq_cq)) { + *polled = true; + status = OCRDMA_CQE_WR_FLUSH_ERR; + expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status); + } else { + /* stop processing further cqe as this cqe is used for + * triggering cq event on buddy cq of RQ. + * When QP is destroyed, this cqe will be removed + * from the cq's hardware q. + */ + *polled = false; + *stop = true; + expand = false; + } + } else if (is_hw_sq_empty(qp)) { + /* Do nothing */ + expand = false; + *polled = false; + *stop = false; + } else { + *polled = true; + expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status); + } + return expand; +} + +static bool ocrdma_poll_success_scqe(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled) +{ + bool expand = false; + int tail = qp->sq.tail; + u32 wqe_idx; + + if (!qp->wqe_wr_id_tbl[tail].signaled) { + *polled = false; /* WC cannot be consumed yet */ + } else { + ibwc->status = IB_WC_SUCCESS; + ibwc->wc_flags = 0; + ibwc->qp = &qp->ibqp; + ocrdma_update_wc(qp, ibwc, tail); + *polled = true; + } + wqe_idx = (le32_to_cpu(cqe->wq.wqeidx) & + OCRDMA_CQE_WQEIDX_MASK) & qp->sq.max_wqe_idx; + if (tail != wqe_idx) + expand = true; /* Coalesced CQE can't be consumed yet */ + + ocrdma_hwq_inc_tail(&qp->sq); + return expand; +} + +static bool ocrdma_poll_scqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled, bool *stop) +{ + int status; + bool expand; + + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT; + + if (status == OCRDMA_CQE_SUCCESS) + expand = ocrdma_poll_success_scqe(qp, cqe, ibwc, polled); + else + expand = ocrdma_poll_err_scqe(qp, cqe, ibwc, polled, stop); + return expand; +} + +static int ocrdma_update_ud_rcqe(struct ocrdma_dev *dev, struct ib_wc *ibwc, + struct ocrdma_cqe *cqe) +{ + int status; + u16 hdr_type = 0; + + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT; + ibwc->src_qp = le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_SRCQP_MASK; + ibwc->pkey_index = 0; + ibwc->wc_flags = IB_WC_GRH; + ibwc->byte_len = (le32_to_cpu(cqe->ud.rxlen_pkey) >> + OCRDMA_CQE_UD_XFER_LEN_SHIFT) & + OCRDMA_CQE_UD_XFER_LEN_MASK; + + if (ocrdma_is_udp_encap_supported(dev)) { + hdr_type = (le32_to_cpu(cqe->ud.rxlen_pkey) >> + OCRDMA_CQE_UD_L3TYPE_SHIFT) & + OCRDMA_CQE_UD_L3TYPE_MASK; + ibwc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; + ibwc->network_hdr_type = hdr_type; + } + + return status; +} + +static void ocrdma_update_free_srq_cqe(struct ib_wc *ibwc, + struct ocrdma_cqe *cqe, + struct ocrdma_qp *qp) +{ + unsigned long flags; + struct ocrdma_srq *srq; + u32 wqe_idx; + + srq = get_ocrdma_srq(qp->ibqp.srq); + wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >> + OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx; + BUG_ON(wqe_idx < 1); + + ibwc->wr_id = srq->rqe_wr_id_tbl[wqe_idx]; + spin_lock_irqsave(&srq->q_lock, flags); + ocrdma_srq_toggle_bit(srq, wqe_idx - 1); + spin_unlock_irqrestore(&srq->q_lock, flags); + ocrdma_hwq_inc_tail(&srq->rq); +} + +static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled, bool *stop, + int status) +{ + bool expand; + struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); + + if (status < OCRDMA_MAX_CQE_ERR) + atomic_inc(&dev->cqe_err_stats[status]); + + /* when hw_rq is empty, but wq is not empty, so continue + * to keep the cqe to get the cq event again. 
+ */ + if (is_hw_rq_empty(qp) && !is_hw_sq_empty(qp)) { + if (!qp->srq && (qp->sq_cq == qp->rq_cq)) { + *polled = true; + status = OCRDMA_CQE_WR_FLUSH_ERR; + expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status); + } else { + *polled = false; + *stop = true; + expand = false; + } + } else if (is_hw_rq_empty(qp)) { + /* Do nothing */ + expand = false; + *polled = false; + *stop = false; + } else { + *polled = true; + expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status); + } + return expand; +} + +static void ocrdma_poll_success_rcqe(struct ocrdma_qp *qp, + struct ocrdma_cqe *cqe, struct ib_wc *ibwc) +{ + struct ocrdma_dev *dev; + + dev = get_ocrdma_dev(qp->ibqp.device); + ibwc->opcode = IB_WC_RECV; + ibwc->qp = &qp->ibqp; + ibwc->status = IB_WC_SUCCESS; + + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) + ocrdma_update_ud_rcqe(dev, ibwc, cqe); + else + ibwc->byte_len = le32_to_cpu(cqe->rq.rxlen); + + if (is_cqe_imm(cqe)) { + ibwc->ex.imm_data = htonl(le32_to_cpu(cqe->rq.lkey_immdt)); + ibwc->wc_flags |= IB_WC_WITH_IMM; + } else if (is_cqe_wr_imm(cqe)) { + ibwc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + ibwc->ex.imm_data = htonl(le32_to_cpu(cqe->rq.lkey_immdt)); + ibwc->wc_flags |= IB_WC_WITH_IMM; + } else if (is_cqe_invalidated(cqe)) { + ibwc->ex.invalidate_rkey = le32_to_cpu(cqe->rq.lkey_immdt); + ibwc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + if (qp->ibqp.srq) { + ocrdma_update_free_srq_cqe(ibwc, cqe, qp); + } else { + ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; + ocrdma_hwq_inc_tail(&qp->rq); + } +} + +static bool ocrdma_poll_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, + struct ib_wc *ibwc, bool *polled, bool *stop) +{ + int status; + bool expand = false; + + ibwc->wc_flags = 0; + if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) { + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_UD_STATUS_MASK) >> + OCRDMA_CQE_UD_STATUS_SHIFT; + } else { + status = (le32_to_cpu(cqe->flags_status_srcqpn) & + OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT; + } + + if (status == OCRDMA_CQE_SUCCESS) { + *polled = true; + ocrdma_poll_success_rcqe(qp, cqe, ibwc); + } else { + expand = ocrdma_poll_err_rcqe(qp, cqe, ibwc, polled, stop, + status); + } + return expand; +} + +static void ocrdma_change_cq_phase(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe, + u16 cur_getp) +{ + if (cq->phase_change) { + if (cur_getp == 0) + cq->phase = (~cq->phase & OCRDMA_CQE_VALID); + } else { + /* clear valid bit */ + cqe->flags_status_srcqpn = 0; + } +} + +static int ocrdma_poll_hwcq(struct ocrdma_cq *cq, int num_entries, + struct ib_wc *ibwc) +{ + u16 qpn = 0; + int i = 0; + bool expand = false; + int polled_hw_cqes = 0; + struct ocrdma_qp *qp = NULL; + struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device); + struct ocrdma_cqe *cqe; + u16 cur_getp; bool polled = false; bool stop = false; + + cur_getp = cq->getp; + while (num_entries) { + cqe = cq->va + cur_getp; + /* check whether valid cqe or not */ + if (!is_cqe_valid(cq, cqe)) + break; + qpn = (le32_to_cpu(cqe->cmn.qpn) & OCRDMA_CQE_QPN_MASK); + /* ignore discarded cqe */ + if (qpn == 0) + goto skip_cqe; + qp = dev->qp_tbl[qpn]; + BUG_ON(qp == NULL); + + if (is_cqe_for_sq(cqe)) { + expand = ocrdma_poll_scqe(qp, cqe, ibwc, &polled, + &stop); + } else { + expand = ocrdma_poll_rcqe(qp, cqe, ibwc, &polled, + &stop); + } + if (expand) + goto expand_cqe; + if (stop) + goto stop_cqe; + /* clear qpn to avoid duplicate processing by discard_cqe() */ + cqe->cmn.qpn = 0; +skip_cqe: + polled_hw_cqes += 1; + cur_getp = (cur_getp + 1) % 
cq->max_hw_cqe; + ocrdma_change_cq_phase(cq, cqe, cur_getp); +expand_cqe: + if (polled) { + num_entries -= 1; + i += 1; + ibwc = ibwc + 1; + polled = false; + } + } +stop_cqe: + cq->getp = cur_getp; + + if (polled_hw_cqes) + ocrdma_ring_cq_db(dev, cq->id, false, false, polled_hw_cqes); + + return i; +} + +/* insert error cqe if the QP's SQ or RQ's CQ matches the CQ under poll. */ +static int ocrdma_add_err_cqe(struct ocrdma_cq *cq, int num_entries, + struct ocrdma_qp *qp, struct ib_wc *ibwc) +{ + int err_cqes = 0; + + while (num_entries) { + if (is_hw_sq_empty(qp) && is_hw_rq_empty(qp)) + break; + if (!is_hw_sq_empty(qp) && qp->sq_cq == cq) { + ocrdma_update_wc(qp, ibwc, qp->sq.tail); + ocrdma_hwq_inc_tail(&qp->sq); + } else if (!is_hw_rq_empty(qp) && qp->rq_cq == cq) { + ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; + ocrdma_hwq_inc_tail(&qp->rq); + } else { + return err_cqes; + } + ibwc->byte_len = 0; + ibwc->status = IB_WC_WR_FLUSH_ERR; + ibwc = ibwc + 1; + err_cqes += 1; + num_entries -= 1; + } + return err_cqes; +} + +int ocrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + int cqes_to_poll = num_entries; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); + int num_os_cqe = 0, err_cqes = 0; + struct ocrdma_qp *qp; + unsigned long flags; + + /* poll cqes from adapter CQ */ + spin_lock_irqsave(&cq->cq_lock, flags); + num_os_cqe = ocrdma_poll_hwcq(cq, cqes_to_poll, wc); + spin_unlock_irqrestore(&cq->cq_lock, flags); + cqes_to_poll -= num_os_cqe; + + if (cqes_to_poll) { + wc = wc + num_os_cqe; + /* adapter returns single error cqe when qp moves to + * error state. So insert error cqes with wc_status as + * FLUSHED for pending WQEs and RQEs of QP's SQ and RQ + * respectively which uses this CQ. 
+ */ + spin_lock_irqsave(&dev->flush_q_lock, flags); + list_for_each_entry(qp, &cq->sq_head, sq_entry) { + if (cqes_to_poll == 0) + break; + err_cqes = ocrdma_add_err_cqe(cq, cqes_to_poll, qp, wc); + cqes_to_poll -= err_cqes; + num_os_cqe += err_cqes; + wc = wc + err_cqes; + } + spin_unlock_irqrestore(&dev->flush_q_lock, flags); + } + return num_os_cqe; +} + +int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags) +{ + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); + struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device); + u16 cq_id; + unsigned long flags; + bool arm_needed = false, sol_needed = false; + + cq_id = cq->id; + + spin_lock_irqsave(&cq->cq_lock, flags); + if (cq_flags & IB_CQ_NEXT_COMP || cq_flags & IB_CQ_SOLICITED) + arm_needed = true; + if (cq_flags & IB_CQ_SOLICITED) + sol_needed = true; + + ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0); + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return 0; +} + +struct ib_mr *ocrdma_alloc_mr(struct ib_pd *ibpd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + int status; + struct ocrdma_mr *mr; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + if (max_num_sg > dev->attr.max_pages_per_frmr) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->pages = kcalloc(max_num_sg, sizeof(u64), GFP_KERNEL); + if (!mr->pages) { + status = -ENOMEM; + goto pl_err; + } + + status = ocrdma_get_pbl_info(dev, mr, max_num_sg); + if (status) + goto pbl_err; + mr->hwmr.fr_mr = 1; + mr->hwmr.remote_rd = 0; + mr->hwmr.remote_wr = 0; + mr->hwmr.local_rd = 0; + mr->hwmr.local_wr = 0; + mr->hwmr.mw_bind = 0; + status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); + if (status) + goto pbl_err; + status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, 0); + if (status) + goto mbx_err; + mr->ibmr.rkey = mr->hwmr.lkey; + mr->ibmr.lkey = mr->hwmr.lkey; + dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = + (unsigned long) mr; + return &mr->ibmr; +mbx_err: + ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); +pbl_err: + kfree(mr->pages); +pl_err: + kfree(mr); + return ERR_PTR(-ENOMEM); +} + +static int ocrdma_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct ocrdma_mr *mr = get_ocrdma_mr(ibmr); + + if (unlikely(mr->npages == mr->hwmr.num_pbes)) + return -ENOMEM; + + mr->pages[mr->npages++] = addr; + + return 0; +} + +int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct ocrdma_mr *mr = get_ocrdma_mr(ibmr); + + mr->npages = 0; + + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, ocrdma_set_page); +} diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h new file mode 100644 index 0000000..9a99717 --- /dev/null +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -0,0 +1,116 @@ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef __OCRDMA_VERBS_H__ +#define __OCRDMA_VERBS_H__ + +int ocrdma_post_send(struct ib_qp *, struct ib_send_wr *, + struct ib_send_wr **bad_wr); +int ocrdma_post_recv(struct ib_qp *, struct ib_recv_wr *, + struct ib_recv_wr **bad_wr); + +int ocrdma_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc); +int ocrdma_arm_cq(struct ib_cq *, enum ib_cq_notify_flags flags); + +int ocrdma_query_device(struct ib_device *, struct ib_device_attr *props, + struct ib_udata *uhw); +int ocrdma_query_port(struct ib_device *, u8 port, struct ib_port_attr *props); +int ocrdma_modify_port(struct ib_device *, u8 port, int mask, + struct ib_port_modify *props); + +enum rdma_protocol_type +ocrdma_query_protocol(struct ib_device *device, u8 port_num); + +void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid); +struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num); +int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); + +struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *, + struct ib_udata *); +int ocrdma_dealloc_ucontext(struct ib_ucontext *); + +int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma); + +struct ib_pd *ocrdma_alloc_pd(struct ib_device *, + struct ib_ucontext *, struct ib_udata *); +int ocrdma_dealloc_pd(struct ib_pd *pd); + +struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ib_ctx, + struct ib_udata *udata); +int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); +int ocrdma_destroy_cq(struct ib_cq *); + +struct ib_qp *ocrdma_create_qp(struct ib_pd *, + struct ib_qp_init_attr *attrs, + struct ib_udata *); +int _ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr *attr, + int attr_mask); +int ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int ocrdma_query_qp(struct ib_qp *, 
+ struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *); +int ocrdma_destroy_qp(struct ib_qp *); +void ocrdma_del_flush_qp(struct ocrdma_qp *qp); + +struct ib_srq *ocrdma_create_srq(struct ib_pd *, struct ib_srq_init_attr *, + struct ib_udata *); +int ocrdma_modify_srq(struct ib_srq *, struct ib_srq_attr *, + enum ib_srq_attr_mask, struct ib_udata *); +int ocrdma_query_srq(struct ib_srq *, struct ib_srq_attr *); +int ocrdma_destroy_srq(struct ib_srq *); +int ocrdma_post_srq_recv(struct ib_srq *, struct ib_recv_wr *, + struct ib_recv_wr **bad_recv_wr); + +int ocrdma_dereg_mr(struct ib_mr *); +struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc); +struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *); +struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); +int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); + +#endif /* __OCRDMA_VERBS_H__ */ diff --git a/drivers/infiniband/hw/qedr/Kconfig b/drivers/infiniband/hw/qedr/Kconfig new file mode 100644 index 0000000..9b9e3b1 --- /dev/null +++ b/drivers/infiniband/hw/qedr/Kconfig @@ -0,0 +1,10 @@ +config INFINIBAND_QEDR + tristate "QLogic RoCE driver" + depends on 64BIT && QEDE + depends on PCI + select QED_LL2 + select QED_OOO + select QED_RDMA + ---help--- + This driver provides low-level InfiniBand over Ethernet + support for QLogic QED host channel adapters (HCAs). diff --git a/drivers/infiniband/hw/qedr/Makefile b/drivers/infiniband/hw/qedr/Makefile new file mode 100644 index 0000000..1c0bc4f --- /dev/null +++ b/drivers/infiniband/hw/qedr/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_QEDR) := qedr.o + +qedr-y := main.o verbs.o qedr_roce_cm.o qedr_iw_cm.o diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c new file mode 100644 index 0000000..f4cb60b --- /dev/null +++ b/drivers/infiniband/hw/qedr/main.c @@ -0,0 +1,991 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "qedr.h" +#include "verbs.h" +#include +#include "qedr_iw_cm.h" + +MODULE_DESCRIPTION("QLogic 40G/100G ROCE Driver"); +MODULE_AUTHOR("QLogic Corporation"); +MODULE_LICENSE("Dual BSD/GPL"); + +#define QEDR_WQ_MULTIPLIER_DFT (3) + +static void qedr_ib_dispatch_event(struct qedr_dev *dev, u8 port_num, + enum ib_event_type type) +{ + struct ib_event ibev; + + ibev.device = &dev->ibdev; + ibev.element.port_num = port_num; + ibev.event = type; + + ib_dispatch_event(&ibev); +} + +static enum rdma_link_layer qedr_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str) +{ + struct qedr_dev *qedr = get_qedr_dev(ibdev); + u32 fw_ver = (u32)qedr->attr.fw_ver; + + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d. %d. %d. %d", + (fw_ver >> 24) & 0xFF, (fw_ver >> 16) & 0xFF, + (fw_ver >> 8) & 0xFF, fw_ver & 0xFF); +} + +static struct net_device *qedr_get_netdev(struct ib_device *dev, u8 port_num) +{ + struct qedr_dev *qdev; + + qdev = get_qedr_dev(dev); + dev_hold(qdev->ndev); + + /* The HW vendor's device driver must guarantee + * that this function returns NULL before the net device has finished + * NETDEV_UNREGISTER state. + */ + return qdev->ndev; +} + +static int qedr_roce_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + err = qedr_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE | + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static int qedr_iw_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + err = qedr_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = 1; + immutable->gid_tbl_len = 1; + immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + immutable->max_mad_size = 0; + + return 0; +} + +static int qedr_iw_register_device(struct qedr_dev *dev) +{ + dev->ibdev.node_type = RDMA_NODE_RNIC; + dev->ibdev.query_gid = qedr_iw_query_gid; + + dev->ibdev.get_port_immutable = qedr_iw_port_immutable; + + dev->ibdev.iwcm = kzalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL); + if (!dev->ibdev.iwcm) + return -ENOMEM; + + dev->ibdev.iwcm->connect = qedr_iw_connect; + dev->ibdev.iwcm->accept = qedr_iw_accept; + dev->ibdev.iwcm->reject = qedr_iw_reject; + dev->ibdev.iwcm->create_listen = qedr_iw_create_listen; + dev->ibdev.iwcm->destroy_listen = qedr_iw_destroy_listen; + dev->ibdev.iwcm->add_ref = qedr_iw_qp_add_ref; + dev->ibdev.iwcm->rem_ref = qedr_iw_qp_rem_ref; + dev->ibdev.iwcm->get_qp = qedr_iw_get_qp; + + memcpy(dev->ibdev.iwcm->ifname, + dev->ndev->name, sizeof(dev->ibdev.iwcm->ifname)); + + return 0; +} + +static void qedr_roce_register_device(struct qedr_dev *dev) +{ + dev->ibdev.node_type = RDMA_NODE_IB_CA; + + dev->ibdev.get_port_immutable = qedr_roce_port_immutable; +} + +static int qedr_register_device(struct qedr_dev *dev) +{ + int rc; + + strlcpy(dev->ibdev.name, "qedr%d", IB_DEVICE_NAME_MAX); + + dev->ibdev.node_guid = dev->attr.node_guid; + memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); + dev->ibdev.owner = 
THIS_MODULE; + dev->ibdev.uverbs_abi_ver = QEDR_ABI_VERSION; + + dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) | + QEDR_UVERBS(QUERY_DEVICE) | + QEDR_UVERBS(QUERY_PORT) | + QEDR_UVERBS(ALLOC_PD) | + QEDR_UVERBS(DEALLOC_PD) | + QEDR_UVERBS(CREATE_COMP_CHANNEL) | + QEDR_UVERBS(CREATE_CQ) | + QEDR_UVERBS(RESIZE_CQ) | + QEDR_UVERBS(DESTROY_CQ) | + QEDR_UVERBS(REQ_NOTIFY_CQ) | + QEDR_UVERBS(CREATE_QP) | + QEDR_UVERBS(MODIFY_QP) | + QEDR_UVERBS(QUERY_QP) | + QEDR_UVERBS(DESTROY_QP) | + QEDR_UVERBS(REG_MR) | + QEDR_UVERBS(DEREG_MR) | + QEDR_UVERBS(POLL_CQ) | + QEDR_UVERBS(POST_SEND) | + QEDR_UVERBS(POST_RECV); + + if (IS_IWARP(dev)) { + rc = qedr_iw_register_device(dev); + if (rc) + return rc; + } else { + qedr_roce_register_device(dev); + } + + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = dev->num_cnq; + + dev->ibdev.query_device = qedr_query_device; + dev->ibdev.query_port = qedr_query_port; + dev->ibdev.modify_port = qedr_modify_port; + + dev->ibdev.alloc_ucontext = qedr_alloc_ucontext; + dev->ibdev.dealloc_ucontext = qedr_dealloc_ucontext; + dev->ibdev.mmap = qedr_mmap; + + dev->ibdev.alloc_pd = qedr_alloc_pd; + dev->ibdev.dealloc_pd = qedr_dealloc_pd; + + dev->ibdev.create_cq = qedr_create_cq; + dev->ibdev.destroy_cq = qedr_destroy_cq; + dev->ibdev.resize_cq = qedr_resize_cq; + dev->ibdev.req_notify_cq = qedr_arm_cq; + + dev->ibdev.create_qp = qedr_create_qp; + dev->ibdev.modify_qp = qedr_modify_qp; + dev->ibdev.query_qp = qedr_query_qp; + dev->ibdev.destroy_qp = qedr_destroy_qp; + + dev->ibdev.query_pkey = qedr_query_pkey; + + dev->ibdev.create_ah = qedr_create_ah; + dev->ibdev.destroy_ah = qedr_destroy_ah; + + dev->ibdev.get_dma_mr = qedr_get_dma_mr; + dev->ibdev.dereg_mr = qedr_dereg_mr; + dev->ibdev.reg_user_mr = qedr_reg_user_mr; + dev->ibdev.alloc_mr = qedr_alloc_mr; + dev->ibdev.map_mr_sg = qedr_map_mr_sg; + + dev->ibdev.poll_cq = qedr_poll_cq; + dev->ibdev.post_send = qedr_post_send; + dev->ibdev.post_recv = qedr_post_recv; + + dev->ibdev.process_mad = qedr_process_mad; + + dev->ibdev.get_netdev = qedr_get_netdev; + + dev->ibdev.dev.parent = &dev->pdev->dev; + + dev->ibdev.get_link_layer = qedr_link_layer; + dev->ibdev.get_dev_fw_str = qedr_get_dev_fw_str; + + dev->ibdev.driver_id = RDMA_DRIVER_QEDR; + return ib_register_device(&dev->ibdev, NULL); +} + +/* This function allocates fast-path status block memory */ +static int qedr_alloc_mem_sb(struct qedr_dev *dev, + struct qed_sb_info *sb_info, u16 sb_id) +{ + struct status_block_e4 *sb_virt; + dma_addr_t sb_phys; + int rc; + + sb_virt = dma_alloc_coherent(&dev->pdev->dev, + sizeof(*sb_virt), &sb_phys, GFP_KERNEL); + if (!sb_virt) + return -ENOMEM; + + rc = dev->ops->common->sb_init(dev->cdev, sb_info, + sb_virt, sb_phys, sb_id, + QED_SB_TYPE_CNQ); + if (rc) { + pr_err("Status block initialization failed\n"); + dma_free_coherent(&dev->pdev->dev, sizeof(*sb_virt), + sb_virt, sb_phys); + return rc; + } + + return 0; +} + +static void qedr_free_mem_sb(struct qedr_dev *dev, + struct qed_sb_info *sb_info, int sb_id) +{ + if (sb_info->sb_virt) { + dev->ops->common->sb_release(dev->cdev, sb_info, sb_id); + dma_free_coherent(&dev->pdev->dev, sizeof(*sb_info->sb_virt), + (void *)sb_info->sb_virt, sb_info->sb_phys); + } +} + +static void qedr_free_resources(struct qedr_dev *dev) +{ + int i; + + if (IS_IWARP(dev)) + destroy_workqueue(dev->iwarp_wq); + + for (i = 0; i < dev->num_cnq; i++) { + qedr_free_mem_sb(dev, &dev->sb_array[i], dev->sb_start + i); + dev->ops->common->chain_free(dev->cdev, &dev->cnq_array[i].pbl); + } 
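
[Illustrative note, not part of the patch] qedr_register_device() earlier in this hunk advertises the supported user-verbs commands by OR-ing QEDR_UVERBS(...) bits into uverbs_cmd_mask; QEDR_UVERBS is defined in qedr.h later in this patch as (1ull << IB_USER_VERBS_CMD_##CMD_NAME). A minimal standalone sketch of the same token-pasting bitmask idiom follows; the MY_* command indices are hypothetical stand-ins, not the real IB_USER_VERBS_CMD_* values.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical command indices standing in for IB_USER_VERBS_CMD_*. */
enum { MY_CMD_GET_CONTEXT = 0, MY_CMD_CREATE_CQ = 9, MY_CMD_CREATE_QP = 24 };

/* Same shape as QEDR_UVERBS(): paste the name, shift a 64-bit one. */
#define MY_UVERBS(CMD_NAME) (1ull << MY_CMD_##CMD_NAME)

int main(void)
{
	uint64_t mask = MY_UVERBS(GET_CONTEXT) |
			MY_UVERBS(CREATE_CQ) |
			MY_UVERBS(CREATE_QP);

	/* A consumer tests support by checking the corresponding bit. */
	printf("CREATE_QP supported: %d\n",
	       (mask & MY_UVERBS(CREATE_QP)) != 0);
	return 0;
}
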
+ + kfree(dev->cnq_array); + kfree(dev->sb_array); + kfree(dev->sgid_tbl); +} + +static int qedr_alloc_resources(struct qedr_dev *dev) +{ + struct qedr_cnq *cnq; + __le16 *cons_pi; + u16 n_entries; + int i, rc; + + dev->sgid_tbl = kzalloc(sizeof(union ib_gid) * + QEDR_MAX_SGID, GFP_KERNEL); + if (!dev->sgid_tbl) + return -ENOMEM; + + spin_lock_init(&dev->sgid_lock); + + if (IS_IWARP(dev)) { + spin_lock_init(&dev->idr_lock); + idr_init(&dev->qpidr); + dev->iwarp_wq = create_singlethread_workqueue("qedr_iwarpq"); + } + + /* Allocate Status blocks for CNQ */ + dev->sb_array = kcalloc(dev->num_cnq, sizeof(*dev->sb_array), + GFP_KERNEL); + if (!dev->sb_array) { + rc = -ENOMEM; + goto err1; + } + + dev->cnq_array = kcalloc(dev->num_cnq, + sizeof(*dev->cnq_array), GFP_KERNEL); + if (!dev->cnq_array) { + rc = -ENOMEM; + goto err2; + } + + dev->sb_start = dev->ops->rdma_get_start_sb(dev->cdev); + + /* Allocate CNQ PBLs */ + n_entries = min_t(u32, QED_RDMA_MAX_CNQ_SIZE, QEDR_ROCE_MAX_CNQ_SIZE); + for (i = 0; i < dev->num_cnq; i++) { + cnq = &dev->cnq_array[i]; + + rc = qedr_alloc_mem_sb(dev, &dev->sb_array[i], + dev->sb_start + i); + if (rc) + goto err3; + + rc = dev->ops->common->chain_alloc(dev->cdev, + QED_CHAIN_USE_TO_CONSUME, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + n_entries, + sizeof(struct regpair *), + &cnq->pbl, NULL); + if (rc) + goto err4; + + cnq->dev = dev; + cnq->sb = &dev->sb_array[i]; + cons_pi = dev->sb_array[i].sb_virt->pi_array; + cnq->hw_cons_ptr = &cons_pi[QED_ROCE_PROTOCOL_INDEX]; + cnq->index = i; + sprintf(cnq->name, "qedr%d@pci:%s", i, pci_name(dev->pdev)); + + DP_DEBUG(dev, QEDR_MSG_INIT, "cnq[%d].cons=%d\n", + i, qed_chain_get_cons_idx(&cnq->pbl)); + } + + return 0; +err4: + qedr_free_mem_sb(dev, &dev->sb_array[i], dev->sb_start + i); +err3: + for (--i; i >= 0; i--) { + dev->ops->common->chain_free(dev->cdev, &dev->cnq_array[i].pbl); + qedr_free_mem_sb(dev, &dev->sb_array[i], dev->sb_start + i); + } + kfree(dev->cnq_array); +err2: + kfree(dev->sb_array); +err1: + kfree(dev->sgid_tbl); + return rc; +} + +/* QEDR sysfs interface */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct qedr_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor); +} + +static ssize_t show_hca_type(struct device *device, + struct device_attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%s\n", "HCA_TYPE_TO_SET"); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); + +static struct device_attribute *qedr_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type +}; + +static void qedr_remove_sysfiles(struct qedr_dev *dev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(qedr_attributes); i++) + device_remove_file(&dev->ibdev.dev, qedr_attributes[i]); +} + +static void qedr_pci_set_atomic(struct qedr_dev *dev, struct pci_dev *pdev) +{ + int rc = pci_enable_atomic_ops_to_root(pdev, + PCI_EXP_DEVCAP2_ATOMIC_COMP64); + + if (rc) { + dev->atomic_cap = IB_ATOMIC_NONE; + DP_DEBUG(dev, QEDR_MSG_INIT, "Atomic capability disabled\n"); + } else { + dev->atomic_cap = IB_ATOMIC_GLOB; + DP_DEBUG(dev, QEDR_MSG_INIT, "Atomic capability enabled\n"); + } +} + +static const struct qed_rdma_ops *qed_ops; + +#define HILO_U64(hi, lo) ((((u64)(hi)) << 32) + (lo)) + +static irqreturn_t qedr_irq_handler(int irq, void *handle) +{ + u16 hw_comp_cons, sw_comp_cons; + struct qedr_cnq *cnq = handle; + struct regpair *cq_handle; + struct 
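
[Illustrative note, not part of the patch] qedr_alloc_resources() above uses the usual kernel staged-cleanup pattern: each successful allocation adds one more goto label to unwind on a later failure, and the per-CNQ loop's failure path frees only the iterations that completed. A minimal standalone sketch of that idiom with hypothetical resources:

#include <stdlib.h>

/* Staged cleanup: free exactly what was set up, in reverse order. */
static int setup(void **a, void **b, void **c)
{
	int rc;

	*a = malloc(64);
	if (!*a)
		return -1;

	*b = malloc(64);
	if (!*b) {
		rc = -1;
		goto err1;		/* only 'a' exists so far */
	}

	*c = malloc(64);
	if (!*c) {
		rc = -1;
		goto err2;		/* 'a' and 'b' exist */
	}

	return 0;

err2:
	free(*b);
err1:
	free(*a);
	return rc;
}

int main(void)
{
	void *a, *b, *c;

	if (!setup(&a, &b, &c))
		free(c), free(b), free(a);
	return 0;
}
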
qedr_cq *cq; + + qed_sb_ack(cnq->sb, IGU_INT_DISABLE, 0); + + qed_sb_update_sb_idx(cnq->sb); + + hw_comp_cons = le16_to_cpu(*cnq->hw_cons_ptr); + sw_comp_cons = qed_chain_get_cons_idx(&cnq->pbl); + + /* Align protocol-index and chain reads */ + rmb(); + + while (sw_comp_cons != hw_comp_cons) { + cq_handle = (struct regpair *)qed_chain_consume(&cnq->pbl); + cq = (struct qedr_cq *)(uintptr_t)HILO_U64(cq_handle->hi, + cq_handle->lo); + + if (cq == NULL) { + DP_ERR(cnq->dev, + "Received NULL CQ cq_handle->hi=%d cq_handle->lo=%d sw_comp_cons=%d hw_comp_cons=%d\n", + cq_handle->hi, cq_handle->lo, sw_comp_cons, + hw_comp_cons); + + break; + } + + if (cq->sig != QEDR_CQ_MAGIC_NUMBER) { + DP_ERR(cnq->dev, + "Problem with cq signature, cq_handle->hi=%d ch_handle->lo=%d cq=%p\n", + cq_handle->hi, cq_handle->lo, cq); + break; + } + + cq->arm_flags = 0; + + if (!cq->destroyed && cq->ibcq.comp_handler) + (*cq->ibcq.comp_handler) + (&cq->ibcq, cq->ibcq.cq_context); + + /* The CQ's CNQ notification counter is checked before + * destroying the CQ in a busy-wait loop that waits for all of + * the CQ's CNQ interrupts to be processed. It is increased + * here, only after the completion handler, to ensure that the + * the handler is not running when the CQ is destroyed. + */ + cq->cnq_notif++; + + sw_comp_cons = qed_chain_get_cons_idx(&cnq->pbl); + + cnq->n_comp++; + } + + qed_ops->rdma_cnq_prod_update(cnq->dev->rdma_ctx, cnq->index, + sw_comp_cons); + + qed_sb_ack(cnq->sb, IGU_INT_ENABLE, 1); + + return IRQ_HANDLED; +} + +static void qedr_sync_free_irqs(struct qedr_dev *dev) +{ + u32 vector; + int i; + + for (i = 0; i < dev->int_info.used_cnt; i++) { + if (dev->int_info.msix_cnt) { + vector = dev->int_info.msix[i * dev->num_hwfns].vector; + synchronize_irq(vector); + free_irq(vector, &dev->cnq_array[i]); + } + } + + dev->int_info.used_cnt = 0; +} + +static int qedr_req_msix_irqs(struct qedr_dev *dev) +{ + int i, rc = 0; + + if (dev->num_cnq > dev->int_info.msix_cnt) { + DP_ERR(dev, + "Interrupt mismatch: %d CNQ queues > %d MSI-x vectors\n", + dev->num_cnq, dev->int_info.msix_cnt); + return -EINVAL; + } + + for (i = 0; i < dev->num_cnq; i++) { + rc = request_irq(dev->int_info.msix[i * dev->num_hwfns].vector, + qedr_irq_handler, 0, dev->cnq_array[i].name, + &dev->cnq_array[i]); + if (rc) { + DP_ERR(dev, "Request cnq %d irq failed\n", i); + qedr_sync_free_irqs(dev); + } else { + DP_DEBUG(dev, QEDR_MSG_INIT, + "Requested cnq irq for %s [entry %d]. 
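
[Illustrative note, not part of the patch] The CNQ interrupt handler above rebuilds a 64-bit CQ handle from the two 32-bit halves carried in a firmware regpair, using the HILO_U64() macro defined just before it. A standalone round-trip sketch of the same split-and-recombine arithmetic (the sample handle value is arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as HILO_U64(): high word shifted up, low word added. */
#define HILO_U64(hi, lo) ((((uint64_t)(hi)) << 32) + (lo))

int main(void)
{
	uint64_t handle = 0x12345678abcdef00ULL;	/* e.g. a CQ pointer value */
	uint32_t hi = (uint32_t)(handle >> 32);		/* what lands in regpair.hi */
	uint32_t lo = (uint32_t)handle;			/* ...and in regpair.lo */

	/* Recombining the halves must give back the original handle. */
	printf("match: %d\n", HILO_U64(hi, lo) == handle);
	return 0;
}
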
Cookie is at %p\n", + dev->cnq_array[i].name, i, + &dev->cnq_array[i]); + dev->int_info.used_cnt++; + } + } + + return rc; +} + +static int qedr_setup_irqs(struct qedr_dev *dev) +{ + int rc; + + DP_DEBUG(dev, QEDR_MSG_INIT, "qedr_setup_irqs\n"); + + /* Learn Interrupt configuration */ + rc = dev->ops->rdma_set_rdma_int(dev->cdev, dev->num_cnq); + if (rc < 0) + return rc; + + rc = dev->ops->rdma_get_rdma_int(dev->cdev, &dev->int_info); + if (rc) { + DP_DEBUG(dev, QEDR_MSG_INIT, "get_rdma_int failed\n"); + return rc; + } + + if (dev->int_info.msix_cnt) { + DP_DEBUG(dev, QEDR_MSG_INIT, "rdma msix_cnt = %d\n", + dev->int_info.msix_cnt); + rc = qedr_req_msix_irqs(dev); + if (rc) + return rc; + } + + DP_DEBUG(dev, QEDR_MSG_INIT, "qedr_setup_irqs succeeded\n"); + + return 0; +} + +static int qedr_set_device_attr(struct qedr_dev *dev) +{ + struct qed_rdma_device *qed_attr; + struct qedr_device_attr *attr; + u32 page_size; + + /* Part 1 - query core capabilities */ + qed_attr = dev->ops->rdma_query_device(dev->rdma_ctx); + + /* Part 2 - check capabilities */ + page_size = ~dev->attr.page_size_caps + 1; + if (page_size > PAGE_SIZE) { + DP_ERR(dev, + "Kernel PAGE_SIZE is %ld which is smaller than minimum page size (%d) required by qedr\n", + PAGE_SIZE, page_size); + return -ENODEV; + } + + /* Part 3 - copy and update capabilities */ + attr = &dev->attr; + attr->vendor_id = qed_attr->vendor_id; + attr->vendor_part_id = qed_attr->vendor_part_id; + attr->hw_ver = qed_attr->hw_ver; + attr->fw_ver = qed_attr->fw_ver; + attr->node_guid = qed_attr->node_guid; + attr->sys_image_guid = qed_attr->sys_image_guid; + attr->max_cnq = qed_attr->max_cnq; + attr->max_sge = qed_attr->max_sge; + attr->max_inline = qed_attr->max_inline; + attr->max_sqe = min_t(u32, qed_attr->max_wqe, QEDR_MAX_SQE); + attr->max_rqe = min_t(u32, qed_attr->max_wqe, QEDR_MAX_RQE); + attr->max_qp_resp_rd_atomic_resc = qed_attr->max_qp_resp_rd_atomic_resc; + attr->max_qp_req_rd_atomic_resc = qed_attr->max_qp_req_rd_atomic_resc; + attr->max_dev_resp_rd_atomic_resc = + qed_attr->max_dev_resp_rd_atomic_resc; + attr->max_cq = qed_attr->max_cq; + attr->max_qp = qed_attr->max_qp; + attr->max_mr = qed_attr->max_mr; + attr->max_mr_size = qed_attr->max_mr_size; + attr->max_cqe = min_t(u64, qed_attr->max_cqe, QEDR_MAX_CQES); + attr->max_mw = qed_attr->max_mw; + attr->max_fmr = qed_attr->max_fmr; + attr->max_mr_mw_fmr_pbl = qed_attr->max_mr_mw_fmr_pbl; + attr->max_mr_mw_fmr_size = qed_attr->max_mr_mw_fmr_size; + attr->max_pd = qed_attr->max_pd; + attr->max_ah = qed_attr->max_ah; + attr->max_pkey = qed_attr->max_pkey; + attr->max_srq = qed_attr->max_srq; + attr->max_srq_wr = qed_attr->max_srq_wr; + attr->dev_caps = qed_attr->dev_caps; + attr->page_size_caps = qed_attr->page_size_caps; + attr->dev_ack_delay = qed_attr->dev_ack_delay; + attr->reserved_lkey = qed_attr->reserved_lkey; + attr->bad_pkey_counter = qed_attr->bad_pkey_counter; + attr->max_stats_queues = qed_attr->max_stats_queues; + + return 0; +} + +static void qedr_unaffiliated_event(void *context, u8 event_code) +{ + pr_err("unaffiliated event not implemented yet\n"); +} + +static void qedr_affiliated_event(void *context, u8 e_code, void *fw_handle) +{ +#define EVENT_TYPE_NOT_DEFINED 0 +#define EVENT_TYPE_CQ 1 +#define EVENT_TYPE_QP 2 + struct qedr_dev *dev = (struct qedr_dev *)context; + struct regpair *async_handle = (struct regpair *)fw_handle; + u64 roce_handle64 = ((u64) async_handle->hi << 32) + async_handle->lo; + u8 event_type = EVENT_TYPE_NOT_DEFINED; + struct ib_event event; + struct 
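
[Illustrative note, not part of the patch] qedr_set_device_attr() above rejects kernels whose PAGE_SIZE is below the device minimum, computing that minimum as ~page_size_caps + 1 (two's-complement negation). A standalone sketch of the arithmetic; it assumes, as the driver does, that the capability mask has every bit set from the minimum size up to the top bit:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Capability mask with one bit per supported power-of-two page size.
	 * Here: every size from 4 KiB (bit 12) upward is supported.
	 */
	uint64_t caps = ~0ULL << 12;

	/* ~caps + 1 is -caps; for a mask contiguous up to the top bit this
	 * is exactly the lowest set bit, i.e. the minimum page size.
	 */
	uint64_t min_contig = ~caps + 1;

	/* caps & -caps isolates the lowest set bit for any mask shape. */
	uint64_t min_any = caps & (~caps + 1);

	printf("%llu %llu\n",
	       (unsigned long long)min_contig,
	       (unsigned long long)min_any);	/* both print 4096 */
	return 0;
}
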
ib_cq *ibcq; + struct ib_qp *ibqp; + struct qedr_cq *cq; + struct qedr_qp *qp; + + switch (e_code) { + case ROCE_ASYNC_EVENT_CQ_OVERFLOW_ERR: + event.event = IB_EVENT_CQ_ERR; + event_type = EVENT_TYPE_CQ; + break; + case ROCE_ASYNC_EVENT_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + event_type = EVENT_TYPE_QP; + break; + case ROCE_ASYNC_EVENT_QP_CATASTROPHIC_ERR: + event.event = IB_EVENT_QP_FATAL; + event_type = EVENT_TYPE_QP; + break; + case ROCE_ASYNC_EVENT_LOCAL_INVALID_REQUEST_ERR: + event.event = IB_EVENT_QP_REQ_ERR; + event_type = EVENT_TYPE_QP; + break; + case ROCE_ASYNC_EVENT_LOCAL_ACCESS_ERR: + event.event = IB_EVENT_QP_ACCESS_ERR; + event_type = EVENT_TYPE_QP; + break; + default: + DP_ERR(dev, "unsupported event %d on handle=%llx\n", e_code, + roce_handle64); + } + + switch (event_type) { + case EVENT_TYPE_CQ: + cq = (struct qedr_cq *)(uintptr_t)roce_handle64; + if (cq) { + ibcq = &cq->ibcq; + if (ibcq->event_handler) { + event.device = ibcq->device; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } + } else { + WARN(1, + "Error: CQ event with NULL pointer ibcq. Handle=%llx\n", + roce_handle64); + } + DP_ERR(dev, "CQ event %d on handle %p\n", e_code, cq); + break; + case EVENT_TYPE_QP: + qp = (struct qedr_qp *)(uintptr_t)roce_handle64; + if (qp) { + ibqp = &qp->ibqp; + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + ibqp->event_handler(&event, ibqp->qp_context); + } + } else { + WARN(1, + "Error: QP event with NULL pointer ibqp. Handle=%llx\n", + roce_handle64); + } + DP_ERR(dev, "QP event %d on handle %p\n", e_code, qp); + break; + default: + break; + } +} + +static int qedr_init_hw(struct qedr_dev *dev) +{ + struct qed_rdma_add_user_out_params out_params; + struct qed_rdma_start_in_params *in_params; + struct qed_rdma_cnq_params *cur_pbl; + struct qed_rdma_events events; + dma_addr_t p_phys_table; + u32 page_cnt; + int rc = 0; + int i; + + in_params = kzalloc(sizeof(*in_params), GFP_KERNEL); + if (!in_params) { + rc = -ENOMEM; + goto out; + } + + in_params->desired_cnq = dev->num_cnq; + for (i = 0; i < dev->num_cnq; i++) { + cur_pbl = &in_params->cnq_pbl_list[i]; + + page_cnt = qed_chain_get_page_cnt(&dev->cnq_array[i].pbl); + cur_pbl->num_pbl_pages = page_cnt; + + p_phys_table = qed_chain_get_pbl_phys(&dev->cnq_array[i].pbl); + cur_pbl->pbl_ptr = (u64)p_phys_table; + } + + events.affiliated_event = qedr_affiliated_event; + events.unaffiliated_event = qedr_unaffiliated_event; + events.context = dev; + + in_params->events = &events; + in_params->cq_mode = QED_RDMA_CQ_MODE_32_BITS; + in_params->max_mtu = dev->ndev->mtu; + dev->iwarp_max_mtu = dev->ndev->mtu; + ether_addr_copy(&in_params->mac_addr[0], dev->ndev->dev_addr); + + rc = dev->ops->rdma_init(dev->cdev, in_params); + if (rc) + goto out; + + rc = dev->ops->rdma_add_user(dev->rdma_ctx, &out_params); + if (rc) + goto out; + + dev->db_addr = (void __iomem *)(uintptr_t)out_params.dpi_addr; + dev->db_phys_addr = out_params.dpi_phys_addr; + dev->db_size = out_params.dpi_size; + dev->dpi = out_params.dpi; + + rc = qedr_set_device_attr(dev); +out: + kfree(in_params); + if (rc) + DP_ERR(dev, "Init HW Failed rc = %d\n", rc); + + return rc; +} + +static void qedr_stop_hw(struct qedr_dev *dev) +{ + dev->ops->rdma_remove_user(dev->rdma_ctx, dev->dpi); + dev->ops->rdma_stop(dev->rdma_ctx); +} + +static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, + struct net_device *ndev) +{ + struct qed_dev_rdma_info dev_info; + struct qedr_dev *dev; + 
int rc = 0, i; + + dev = (struct qedr_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) { + pr_err("Unable to allocate ib device\n"); + return NULL; + } + + DP_DEBUG(dev, QEDR_MSG_INIT, "qedr add device called\n"); + + dev->pdev = pdev; + dev->ndev = ndev; + dev->cdev = cdev; + + qed_ops = qed_get_rdma_ops(); + if (!qed_ops) { + DP_ERR(dev, "Failed to get qed roce operations\n"); + goto init_err; + } + + dev->ops = qed_ops; + rc = qed_ops->fill_dev_info(cdev, &dev_info); + if (rc) + goto init_err; + + dev->user_dpm_enabled = dev_info.user_dpm_enabled; + dev->rdma_type = dev_info.rdma_type; + dev->num_hwfns = dev_info.common.num_hwfns; + dev->rdma_ctx = dev->ops->rdma_get_rdma_ctx(cdev); + + dev->num_cnq = dev->ops->rdma_get_min_cnq_msix(cdev); + if (!dev->num_cnq) { + DP_ERR(dev, "Failed. At least one CNQ is required.\n"); + rc = -ENOMEM; + goto init_err; + } + + dev->wq_multiplier = QEDR_WQ_MULTIPLIER_DFT; + + qedr_pci_set_atomic(dev, pdev); + + rc = qedr_alloc_resources(dev); + if (rc) + goto init_err; + + rc = qedr_init_hw(dev); + if (rc) + goto alloc_err; + + rc = qedr_setup_irqs(dev); + if (rc) + goto irq_err; + + rc = qedr_register_device(dev); + if (rc) { + DP_ERR(dev, "Unable to allocate register device\n"); + goto reg_err; + } + + for (i = 0; i < ARRAY_SIZE(qedr_attributes); i++) + if (device_create_file(&dev->ibdev.dev, qedr_attributes[i])) + goto sysfs_err; + + if (!test_and_set_bit(QEDR_ENET_STATE_BIT, &dev->enet_state)) + qedr_ib_dispatch_event(dev, QEDR_PORT, IB_EVENT_PORT_ACTIVE); + + DP_DEBUG(dev, QEDR_MSG_INIT, "qedr driver loaded successfully\n"); + return dev; + +sysfs_err: + ib_unregister_device(&dev->ibdev); +reg_err: + qedr_sync_free_irqs(dev); +irq_err: + qedr_stop_hw(dev); +alloc_err: + qedr_free_resources(dev); +init_err: + ib_dealloc_device(&dev->ibdev); + DP_ERR(dev, "qedr driver load failed rc=%d\n", rc); + + return NULL; +} + +static void qedr_remove(struct qedr_dev *dev) +{ + /* First unregister with stack to stop all the active traffic + * of the registered clients. 
+ */ + qedr_remove_sysfiles(dev); + ib_unregister_device(&dev->ibdev); + + qedr_stop_hw(dev); + qedr_sync_free_irqs(dev); + qedr_free_resources(dev); + ib_dealloc_device(&dev->ibdev); +} + +static void qedr_close(struct qedr_dev *dev) +{ + if (test_and_clear_bit(QEDR_ENET_STATE_BIT, &dev->enet_state)) + qedr_ib_dispatch_event(dev, QEDR_PORT, IB_EVENT_PORT_ERR); +} + +static void qedr_shutdown(struct qedr_dev *dev) +{ + qedr_close(dev); + qedr_remove(dev); +} + +static void qedr_open(struct qedr_dev *dev) +{ + if (!test_and_set_bit(QEDR_ENET_STATE_BIT, &dev->enet_state)) + qedr_ib_dispatch_event(dev, QEDR_PORT, IB_EVENT_PORT_ACTIVE); +} + +static void qedr_mac_address_change(struct qedr_dev *dev) +{ + union ib_gid *sgid = &dev->sgid_tbl[0]; + u8 guid[8], mac_addr[6]; + int rc; + + /* Update SGID */ + ether_addr_copy(&mac_addr[0], dev->ndev->dev_addr); + guid[0] = mac_addr[0] ^ 2; + guid[1] = mac_addr[1]; + guid[2] = mac_addr[2]; + guid[3] = 0xff; + guid[4] = 0xfe; + guid[5] = mac_addr[3]; + guid[6] = mac_addr[4]; + guid[7] = mac_addr[5]; + sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + memcpy(&sgid->raw[8], guid, sizeof(guid)); + + /* Update LL2 */ + rc = dev->ops->ll2_set_mac_filter(dev->cdev, + dev->gsi_ll2_mac_address, + dev->ndev->dev_addr); + + ether_addr_copy(dev->gsi_ll2_mac_address, dev->ndev->dev_addr); + + qedr_ib_dispatch_event(dev, QEDR_PORT, IB_EVENT_GID_CHANGE); + + if (rc) + DP_ERR(dev, "Error updating mac filter\n"); +} + +/* event handling via NIC driver ensures that all the NIC specific + * initialization done before RoCE driver notifies + * event to stack. + */ +static void qedr_notify(struct qedr_dev *dev, enum qede_rdma_event event) +{ + switch (event) { + case QEDE_UP: + qedr_open(dev); + break; + case QEDE_DOWN: + qedr_close(dev); + break; + case QEDE_CLOSE: + qedr_shutdown(dev); + break; + case QEDE_CHANGE_ADDR: + qedr_mac_address_change(dev); + break; + default: + pr_err("Event not supported\n"); + } +} + +static struct qedr_driver qedr_drv = { + .name = "qedr_driver", + .add = qedr_add, + .remove = qedr_remove, + .notify = qedr_notify, +}; + +static int __init qedr_init_module(void) +{ + return qede_rdma_register_driver(&qedr_drv); +} + +static void __exit qedr_exit_module(void) +{ + qede_rdma_unregister_driver(&qedr_drv); +} + +module_init(qedr_init_module); +module_exit(qedr_exit_module); diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h new file mode 100644 index 0000000..86d4511 --- /dev/null +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -0,0 +1,541 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. 
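
[Illustrative note, not part of the patch] qedr_mac_address_change() above regenerates sgid_tbl[0] from the new MAC by building a modified EUI-64 interface ID (first octet's universal/local bit flipped, 0xff 0xfe spliced into the middle) under the fe80::/64 link-local prefix. A standalone sketch of that mapping; the sample MAC is arbitrary:

#include <stdint.h>
#include <stdio.h>

/* Expand a 48-bit MAC into the modified EUI-64 GID used above. */
static void mac_to_gid(const uint8_t mac[6], uint8_t gid[16])
{
	static const uint8_t prefix[8] = { 0xfe, 0x80, 0, 0, 0, 0, 0, 0 };
	int i;

	for (i = 0; i < 8; i++)
		gid[i] = prefix[i];		/* fe80::/64 subnet prefix */

	gid[8]  = mac[0] ^ 2;			/* toggle universal/local bit */
	gid[9]  = mac[1];
	gid[10] = mac[2];
	gid[11] = 0xff;				/* EUI-64 filler */
	gid[12] = 0xfe;
	gid[13] = mac[3];
	gid[14] = mac[4];
	gid[15] = mac[5];
}

int main(void)
{
	const uint8_t mac[6] = { 0x00, 0x0e, 0x1e, 0x11, 0x22, 0x33 };
	uint8_t gid[16];
	int i;

	mac_to_gid(mac, gid);
	for (i = 0; i < 16; i++)
		printf("%02x%s", gid[i], (i & 1) && i < 15 ? ":" : "");
	printf("\n");	/* fe80:0000:0000:0000:020e:1eff:fe11:2233 */
	return 0;
}
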
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __QEDR_H__ +#define __QEDR_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "qedr_hsi_rdma.h" + +#define QEDR_NODE_DESC "QLogic 579xx RoCE HCA" +#define DP_NAME(dev) ((dev)->ibdev.name) +#define IS_IWARP(_dev) ((_dev)->rdma_type == QED_RDMA_TYPE_IWARP) +#define IS_ROCE(_dev) ((_dev)->rdma_type == QED_RDMA_TYPE_ROCE) + +#define DP_DEBUG(dev, module, fmt, ...) \ + pr_debug("(%s) " module ": " fmt, \ + DP_NAME(dev) ? DP_NAME(dev) : "", ## __VA_ARGS__) + +#define QEDR_MSG_INIT "INIT" +#define QEDR_MSG_MISC "MISC" +#define QEDR_MSG_CQ " CQ" +#define QEDR_MSG_MR " MR" +#define QEDR_MSG_RQ " RQ" +#define QEDR_MSG_SQ " SQ" +#define QEDR_MSG_QP " QP" +#define QEDR_MSG_GSI " GSI" +#define QEDR_MSG_IWARP " IW" + +#define QEDR_CQ_MAGIC_NUMBER (0x11223344) + +#define FW_PAGE_SIZE (RDMA_RING_PAGE_SIZE) +#define FW_PAGE_SHIFT (12) + +struct qedr_dev; + +struct qedr_cnq { + struct qedr_dev *dev; + struct qed_chain pbl; + struct qed_sb_info *sb; + char name[32]; + u64 n_comp; + __le16 *hw_cons_ptr; + u8 index; +}; + +#define QEDR_MAX_SGID 128 + +struct qedr_device_attr { + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + u64 fw_ver; + u64 node_guid; + u64 sys_image_guid; + u8 max_cnq; + u8 max_sge; + u16 max_inline; + u32 max_sqe; + u32 max_rqe; + u8 max_qp_resp_rd_atomic_resc; + u8 max_qp_req_rd_atomic_resc; + u64 max_dev_resp_rd_atomic_resc; + u32 max_cq; + u32 max_qp; + u32 max_mr; + u64 max_mr_size; + u32 max_cqe; + u32 max_mw; + u32 max_fmr; + u32 max_mr_mw_fmr_pbl; + u64 max_mr_mw_fmr_size; + u32 max_pd; + u32 max_ah; + u8 max_pkey; + u32 max_srq; + u32 max_srq_wr; + u8 max_srq_sge; + u8 max_stats_queues; + u32 dev_caps; + + u64 page_size_caps; + u8 dev_ack_delay; + u32 reserved_lkey; + u32 bad_pkey_counter; + struct qed_rdma_events events; +}; + +#define QEDR_ENET_STATE_BIT (0) + +struct qedr_dev { + struct ib_device ibdev; + struct qed_dev *cdev; + struct pci_dev *pdev; + struct net_device *ndev; + + enum ib_atomic_cap atomic_cap; + + void *rdma_ctx; + struct qedr_device_attr attr; + + const struct qed_rdma_ops *ops; + struct qed_int_info int_info; + + struct qed_sb_info *sb_array; + struct qedr_cnq *cnq_array; + int num_cnq; + int sb_start; + + void __iomem *db_addr; + u64 db_phys_addr; + u32 db_size; + u16 dpi; + + union ib_gid *sgid_tbl; + + /* Lock for sgid table */ + spinlock_t sgid_lock; + + u64 guid; + + u32 dp_module; + u8 dp_level; + u8 num_hwfns; + u8 gsi_ll2_handle; + + uint wq_multiplier; + u8 gsi_ll2_mac_address[ETH_ALEN]; + int gsi_qp_created; + struct qedr_cq *gsi_sqcq; + struct qedr_cq *gsi_rqcq; + struct qedr_qp *gsi_qp; + enum qed_rdma_type rdma_type; + spinlock_t idr_lock; /* Protect qpidr data-structure */ + struct idr qpidr; + struct workqueue_struct *iwarp_wq; + u16 iwarp_max_mtu; + + unsigned long enet_state; + + u8 user_dpm_enabled; +}; + +#define QEDR_MAX_SQ_PBL (0x8000) +#define QEDR_MAX_SQ_PBL_ENTRIES (0x10000 / sizeof(void *)) +#define QEDR_SQE_ELEMENT_SIZE (sizeof(struct rdma_sq_sge)) +#define 
QEDR_MAX_SQE_ELEMENTS_PER_SQE (ROCE_REQ_MAX_SINGLE_SQ_WQE_SIZE / \ + QEDR_SQE_ELEMENT_SIZE) +#define QEDR_MAX_SQE_ELEMENTS_PER_PAGE ((RDMA_RING_PAGE_SIZE) / \ + QEDR_SQE_ELEMENT_SIZE) +#define QEDR_MAX_SQE ((QEDR_MAX_SQ_PBL_ENTRIES) *\ + (RDMA_RING_PAGE_SIZE) / \ + (QEDR_SQE_ELEMENT_SIZE) /\ + (QEDR_MAX_SQE_ELEMENTS_PER_SQE)) +/* RQ */ +#define QEDR_MAX_RQ_PBL (0x2000) +#define QEDR_MAX_RQ_PBL_ENTRIES (0x10000 / sizeof(void *)) +#define QEDR_RQE_ELEMENT_SIZE (sizeof(struct rdma_rq_sge)) +#define QEDR_MAX_RQE_ELEMENTS_PER_RQE (RDMA_MAX_SGE_PER_RQ_WQE) +#define QEDR_MAX_RQE_ELEMENTS_PER_PAGE ((RDMA_RING_PAGE_SIZE) / \ + QEDR_RQE_ELEMENT_SIZE) +#define QEDR_MAX_RQE ((QEDR_MAX_RQ_PBL_ENTRIES) *\ + (RDMA_RING_PAGE_SIZE) / \ + (QEDR_RQE_ELEMENT_SIZE) /\ + (QEDR_MAX_RQE_ELEMENTS_PER_RQE)) + +#define QEDR_CQE_SIZE (sizeof(union rdma_cqe)) +#define QEDR_MAX_CQE_PBL_SIZE (512 * 1024) +#define QEDR_MAX_CQE_PBL_ENTRIES (((QEDR_MAX_CQE_PBL_SIZE) / \ + sizeof(u64)) - 1) +#define QEDR_MAX_CQES ((u32)((QEDR_MAX_CQE_PBL_ENTRIES) * \ + (QED_CHAIN_PAGE_SIZE) / QEDR_CQE_SIZE)) + +#define QEDR_ROCE_MAX_CNQ_SIZE (0x4000) + +#define QEDR_MAX_PORT (1) +#define QEDR_PORT (1) + +#define QEDR_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) + +#define QEDR_ROCE_PKEY_MAX 1 +#define QEDR_ROCE_PKEY_TABLE_LEN 1 +#define QEDR_ROCE_PKEY_DEFAULT 0xffff + +struct qedr_pbl { + struct list_head list_entry; + void *va; + dma_addr_t pa; +}; + +struct qedr_ucontext { + struct ib_ucontext ibucontext; + struct qedr_dev *dev; + struct qedr_pd *pd; + u64 dpi_addr; + u64 dpi_phys_addr; + u32 dpi_size; + u16 dpi; + + struct list_head mm_head; + + /* Lock to protect mm list */ + struct mutex mm_list_lock; +}; + +union db_prod64 { + struct rdma_pwm_val32_data data; + u64 raw; +}; + +enum qedr_cq_type { + QEDR_CQ_TYPE_GSI, + QEDR_CQ_TYPE_KERNEL, + QEDR_CQ_TYPE_USER, +}; + +struct qedr_pbl_info { + u32 num_pbls; + u32 num_pbes; + u32 pbl_size; + u32 pbe_size; + bool two_layered; +}; + +struct qedr_userq { + struct ib_umem *umem; + struct qedr_pbl_info pbl_info; + struct qedr_pbl *pbl_tbl; + u64 buf_addr; + size_t buf_len; +}; + +struct qedr_cq { + struct ib_cq ibcq; + + enum qedr_cq_type cq_type; + u32 sig; + + u16 icid; + + /* Lock to protect multiplem CQ's */ + spinlock_t cq_lock; + u8 arm_flags; + struct qed_chain pbl; + + void __iomem *db_addr; + union db_prod64 db; + + u8 pbl_toggle; + union rdma_cqe *latest_cqe; + union rdma_cqe *toggle_cqe; + + u32 cq_cons; + + struct qedr_userq q; + u8 destroyed; + u16 cnq_notif; +}; + +struct qedr_pd { + struct ib_pd ibpd; + u32 pd_id; + struct qedr_ucontext *uctx; +}; + +struct qedr_mm { + struct { + u64 phy_addr; + unsigned long len; + } key; + struct list_head entry; +}; + +union db_prod32 { + struct rdma_pwm_val16_data data; + u32 raw; +}; + +struct qedr_qp_hwq_info { + /* WQE Elements */ + struct qed_chain pbl; + u64 p_phys_addr_tbl; + u32 max_sges; + + /* WQE */ + u16 prod; + u16 cons; + u16 wqe_cons; + u16 gsi_cons; + u16 max_wr; + + /* DB */ + void __iomem *db; + union db_prod32 db_data; + + void __iomem *iwarp_db2; + union db_prod32 iwarp_db2_data; +}; + +#define QEDR_INC_SW_IDX(p_info, index) \ + do { \ + p_info->index = (p_info->index + 1) & \ + qed_chain_get_capacity(p_info->pbl) \ + } while (0) + +enum qedr_qp_err_bitmap { + QEDR_QP_ERR_SQ_FULL = 1, + QEDR_QP_ERR_RQ_FULL = 2, + QEDR_QP_ERR_BAD_SR = 4, + QEDR_QP_ERR_BAD_RR = 8, + QEDR_QP_ERR_SQ_PBL_FULL = 16, + QEDR_QP_ERR_RQ_PBL_FULL = 32, +}; + +struct qedr_qp { + struct ib_qp ibqp; /* must be first */ + struct qedr_dev *dev; 
+ struct qedr_iw_ep *ep; + struct qedr_qp_hwq_info sq; + struct qedr_qp_hwq_info rq; + + u32 max_inline_data; + + /* Lock for QP's */ + spinlock_t q_lock; + struct qedr_cq *sq_cq; + struct qedr_cq *rq_cq; + struct qedr_srq *srq; + enum qed_roce_qp_state state; + u32 id; + struct qedr_pd *pd; + enum ib_qp_type qp_type; + struct qed_rdma_qp *qed_qp; + u32 qp_id; + u16 icid; + u16 mtu; + int sgid_idx; + u32 rq_psn; + u32 sq_psn; + u32 qkey; + u32 dest_qp_num; + + /* Relevant to qps created from kernel space only (ULPs) */ + u8 prev_wqe_size; + u16 wqe_cons; + u32 err_bitmap; + bool signaled; + + /* SQ shadow */ + struct { + u64 wr_id; + enum ib_wc_opcode opcode; + u32 bytes_len; + u8 wqe_size; + bool signaled; + dma_addr_t icrc_mapping; + u32 *icrc; + struct qedr_mr *mr; + } *wqe_wr_id; + + /* RQ shadow */ + struct { + u64 wr_id; + struct ib_sge sg_list[RDMA_MAX_SGE_PER_RQ_WQE]; + u8 wqe_size; + + u8 smac[ETH_ALEN]; + u16 vlan; + int rc; + } *rqe_wr_id; + + /* Relevant to qps created from user space only (applications) */ + struct qedr_userq usq; + struct qedr_userq urq; + atomic_t refcnt; + bool destroyed; +}; + +struct qedr_ah { + struct ib_ah ibah; + struct rdma_ah_attr attr; +}; + +enum qedr_mr_type { + QEDR_MR_USER, + QEDR_MR_KERNEL, + QEDR_MR_DMA, + QEDR_MR_FRMR, +}; + +struct mr_info { + struct qedr_pbl *pbl_table; + struct qedr_pbl_info pbl_info; + struct list_head free_pbl_list; + struct list_head inuse_pbl_list; + u32 completed; + u32 completed_handled; +}; + +struct qedr_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + + struct qed_rdma_register_tid_in_params hw_mr; + enum qedr_mr_type type; + + struct qedr_dev *dev; + struct mr_info info; + + u64 *pages; + u32 npages; +}; + +#define SET_FIELD2(value, name, flag) ((value) |= ((flag) << (name ## _SHIFT))) + +#define QEDR_RESP_IMM (RDMA_CQE_RESPONDER_IMM_FLG_MASK << \ + RDMA_CQE_RESPONDER_IMM_FLG_SHIFT) +#define QEDR_RESP_RDMA (RDMA_CQE_RESPONDER_RDMA_FLG_MASK << \ + RDMA_CQE_RESPONDER_RDMA_FLG_SHIFT) +#define QEDR_RESP_INV (RDMA_CQE_RESPONDER_INV_FLG_MASK << \ + RDMA_CQE_RESPONDER_INV_FLG_SHIFT) + +static inline void qedr_inc_sw_cons(struct qedr_qp_hwq_info *info) +{ + info->cons = (info->cons + 1) % info->max_wr; + info->wqe_cons++; +} + +static inline void qedr_inc_sw_prod(struct qedr_qp_hwq_info *info) +{ + info->prod = (info->prod + 1) % info->max_wr; +} + +static inline int qedr_get_dmac(struct qedr_dev *dev, + struct rdma_ah_attr *ah_attr, u8 *mac_addr) +{ + union ib_gid zero_sgid = { { 0 } }; + struct in6_addr in6; + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + u8 *dmac; + + if (!memcmp(&grh->dgid, &zero_sgid, sizeof(union ib_gid))) { + DP_ERR(dev, "Local port GID not supported\n"); + eth_zero_addr(mac_addr); + return -EINVAL; + } + + memcpy(&in6, grh->dgid.raw, sizeof(in6)); + dmac = rdma_ah_retrieve_dmac(ah_attr); + if (!dmac) + return -EINVAL; + ether_addr_copy(mac_addr, dmac); + + return 0; +} + +struct qedr_iw_listener { + struct qedr_dev *dev; + struct iw_cm_id *cm_id; + int backlog; + void *qed_handle; +}; + +struct qedr_iw_ep { + struct qedr_dev *dev; + struct iw_cm_id *cm_id; + struct qedr_qp *qp; + void *qed_context; + u8 during_connect; +}; + +static inline +struct qedr_ucontext *get_qedr_ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct qedr_ucontext, ibucontext); +} + +static inline struct qedr_dev *get_qedr_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct qedr_dev, ibdev); +} + +static inline struct qedr_pd *get_qedr_pd(struct ib_pd 
*ibpd) +{ + return container_of(ibpd, struct qedr_pd, ibpd); +} + +static inline struct qedr_cq *get_qedr_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct qedr_cq, ibcq); +} + +static inline struct qedr_qp *get_qedr_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct qedr_qp, ibqp); +} + +static inline struct qedr_ah *get_qedr_ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct qedr_ah, ibah); +} + +static inline struct qedr_mr *get_qedr_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct qedr_mr, ibmr); +} +#endif diff --git a/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h b/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h new file mode 100644 index 0000000..b816c80 --- /dev/null +++ b/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h @@ -0,0 +1,769 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __QED_HSI_RDMA__ +#define __QED_HSI_RDMA__ + +#include + +/* rdma completion notification queue element */ +struct rdma_cnqe { + struct regpair cq_handle; +}; + +struct rdma_cqe_responder { + struct regpair srq_wr_id; + struct regpair qp_handle; + __le32 imm_data_or_inv_r_Key; + __le32 length; + __le32 imm_data_hi; + __le16 rq_cons_or_srq_id; + u8 flags; +#define RDMA_CQE_RESPONDER_TOGGLE_BIT_MASK 0x1 +#define RDMA_CQE_RESPONDER_TOGGLE_BIT_SHIFT 0 +#define RDMA_CQE_RESPONDER_TYPE_MASK 0x3 +#define RDMA_CQE_RESPONDER_TYPE_SHIFT 1 +#define RDMA_CQE_RESPONDER_INV_FLG_MASK 0x1 +#define RDMA_CQE_RESPONDER_INV_FLG_SHIFT 3 +#define RDMA_CQE_RESPONDER_IMM_FLG_MASK 0x1 +#define RDMA_CQE_RESPONDER_IMM_FLG_SHIFT 4 +#define RDMA_CQE_RESPONDER_RDMA_FLG_MASK 0x1 +#define RDMA_CQE_RESPONDER_RDMA_FLG_SHIFT 5 +#define RDMA_CQE_RESPONDER_RESERVED2_MASK 0x3 +#define RDMA_CQE_RESPONDER_RESERVED2_SHIFT 6 + u8 status; +}; + +struct rdma_cqe_requester { + __le16 sq_cons; + __le16 reserved0; + __le32 reserved1; + struct regpair qp_handle; + struct regpair reserved2; + __le32 reserved3; + __le16 reserved4; + u8 flags; +#define RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK 0x1 +#define RDMA_CQE_REQUESTER_TOGGLE_BIT_SHIFT 0 +#define RDMA_CQE_REQUESTER_TYPE_MASK 0x3 +#define RDMA_CQE_REQUESTER_TYPE_SHIFT 1 +#define RDMA_CQE_REQUESTER_RESERVED5_MASK 0x1F +#define RDMA_CQE_REQUESTER_RESERVED5_SHIFT 3 + u8 status; +}; + +struct rdma_cqe_common { + struct regpair reserved0; + struct regpair qp_handle; + __le16 reserved1[7]; + u8 flags; +#define RDMA_CQE_COMMON_TOGGLE_BIT_MASK 0x1 +#define RDMA_CQE_COMMON_TOGGLE_BIT_SHIFT 0 +#define RDMA_CQE_COMMON_TYPE_MASK 0x3 +#define RDMA_CQE_COMMON_TYPE_SHIFT 1 +#define RDMA_CQE_COMMON_RESERVED2_MASK 0x1F +#define RDMA_CQE_COMMON_RESERVED2_SHIFT 3 + u8 status; +}; + +/* rdma completion queue element */ +union rdma_cqe { + struct rdma_cqe_responder resp; + struct rdma_cqe_requester req; + struct rdma_cqe_common cmn; +}; + +/* * CQE requester status enumeration */ +enum rdma_cqe_requester_status_enum { + RDMA_CQE_REQ_STS_OK, + RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR, + RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR, + RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR, + RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR, + RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR, + RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR, + RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR, + RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR, + RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR, + RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR, + RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR, + RDMA_CQE_REQ_STS_XRC_VOILATION_ERR, + MAX_RDMA_CQE_REQUESTER_STATUS_ENUM +}; + +/* CQE responder status enumeration */ +enum rdma_cqe_responder_status_enum { + RDMA_CQE_RESP_STS_OK, + RDMA_CQE_RESP_STS_LOCAL_ACCESS_ERR, + RDMA_CQE_RESP_STS_LOCAL_LENGTH_ERR, + RDMA_CQE_RESP_STS_LOCAL_QP_OPERATION_ERR, + RDMA_CQE_RESP_STS_LOCAL_PROTECTION_ERR, + RDMA_CQE_RESP_STS_MEMORY_MGT_OPERATION_ERR, + RDMA_CQE_RESP_STS_REMOTE_INVALID_REQUEST_ERR, + RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR, + MAX_RDMA_CQE_RESPONDER_STATUS_ENUM +}; + +/* CQE type enumeration */ +enum rdma_cqe_type { + RDMA_CQE_TYPE_REQUESTER, + RDMA_CQE_TYPE_RESPONDER_RQ, + RDMA_CQE_TYPE_RESPONDER_SRQ, + RDMA_CQE_TYPE_RESPONDER_XRC_SRQ, + RDMA_CQE_TYPE_INVALID, + MAX_RDMA_CQE_TYPE +}; + +struct rdma_sq_sge { + __le32 length; + struct regpair addr; + __le32 l_key; +}; + +struct rdma_rq_sge { + struct regpair addr; + __le32 length; + __le32 flags; +#define RDMA_RQ_SGE_L_KEY_MASK 0x3FFFFFF +#define RDMA_RQ_SGE_L_KEY_SHIFT 0 +#define RDMA_RQ_SGE_NUM_SGES_MASK 
0x7 +#define RDMA_RQ_SGE_NUM_SGES_SHIFT 26 +#define RDMA_RQ_SGE_RESERVED0_MASK 0x7 +#define RDMA_RQ_SGE_RESERVED0_SHIFT 29 +}; + +struct rdma_srq_sge { + struct regpair addr; + __le32 length; + __le32 l_key; +}; + +/* Rdma doorbell data for flags update */ +struct rdma_pwm_flags_data { + __le16 icid; /* internal CID */ + u8 agg_flags; /* aggregative flags */ + u8 reserved; +}; + +/* Rdma doorbell data for SQ and RQ */ +struct rdma_pwm_val16_data { + __le16 icid; + __le16 value; +}; + +union rdma_pwm_val16_data_union { + struct rdma_pwm_val16_data as_struct; + __le32 as_dword; +}; + +/* Rdma doorbell data for CQ */ +struct rdma_pwm_val32_data { + __le16 icid; + u8 agg_flags; + u8 params; +#define RDMA_PWM_VAL32_DATA_AGG_CMD_MASK 0x3 +#define RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT 0 +#define RDMA_PWM_VAL32_DATA_BYPASS_EN_MASK 0x1 +#define RDMA_PWM_VAL32_DATA_BYPASS_EN_SHIFT 2 +#define RDMA_PWM_VAL32_DATA_CONN_TYPE_IS_IWARP_MASK 0x1 +#define RDMA_PWM_VAL32_DATA_CONN_TYPE_IS_IWARP_SHIFT 3 +#define RDMA_PWM_VAL32_DATA_SET_16B_VAL_MASK 0x1 +#define RDMA_PWM_VAL32_DATA_SET_16B_VAL_SHIFT 4 +#define RDMA_PWM_VAL32_DATA_RESERVED_MASK 0x7 +#define RDMA_PWM_VAL32_DATA_RESERVED_SHIFT 5 + __le32 value; +}; + +/* DIF Block size options */ +enum rdma_dif_block_size { + RDMA_DIF_BLOCK_512 = 0, + RDMA_DIF_BLOCK_4096 = 1, + MAX_RDMA_DIF_BLOCK_SIZE +}; + +/* DIF CRC initial value */ +enum rdma_dif_crc_seed { + RDMA_DIF_CRC_SEED_0000 = 0, + RDMA_DIF_CRC_SEED_FFFF = 1, + MAX_RDMA_DIF_CRC_SEED +}; + +/* RDMA DIF Error Result Structure */ +struct rdma_dif_error_result { + __le32 error_intervals; + __le32 dif_error_1st_interval; + u8 flags; +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_CRC_MASK 0x1 +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_CRC_SHIFT 0 +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_APP_TAG_MASK 0x1 +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_APP_TAG_SHIFT 1 +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_REF_TAG_MASK 0x1 +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_REF_TAG_SHIFT 2 +#define RDMA_DIF_ERROR_RESULT_RESERVED0_MASK 0xF +#define RDMA_DIF_ERROR_RESULT_RESERVED0_SHIFT 3 +#define RDMA_DIF_ERROR_RESULT_TOGGLE_BIT_MASK 0x1 +#define RDMA_DIF_ERROR_RESULT_TOGGLE_BIT_SHIFT 7 + u8 reserved1[55]; +}; + +/* DIF IO direction */ +enum rdma_dif_io_direction_flg { + RDMA_DIF_DIR_RX = 0, + RDMA_DIF_DIR_TX = 1, + MAX_RDMA_DIF_IO_DIRECTION_FLG +}; + +/* RDMA DIF Runt Result Structure */ +struct rdma_dif_runt_result { + __le16 guard_tag; + __le16 reserved[3]; +}; + +/* Memory window type enumeration */ +enum rdma_mw_type { + RDMA_MW_TYPE_1, + RDMA_MW_TYPE_2A, + MAX_RDMA_MW_TYPE +}; + +struct rdma_sq_atomic_wqe { + __le32 reserved1; + __le32 length; + __le32 xrc_srq; + u8 req_type; + u8 flags; +#define RDMA_SQ_ATOMIC_WQE_COMP_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_ATOMIC_WQE_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_ATOMIC_WQE_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_ATOMIC_WQE_SE_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_ATOMIC_WQE_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_ATOMIC_WQE_DIF_ON_HOST_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_ATOMIC_WQE_RESERVED0_MASK 0x3 +#define RDMA_SQ_ATOMIC_WQE_RESERVED0_SHIFT 6 + u8 wqe_size; + u8 prev_wqe_size; + struct regpair remote_va; + __le32 r_key; + __le32 reserved2; + struct regpair cmp_data; + struct regpair swap_data; +}; + 
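
[Illustrative note, not part of the patch] Every packed field in the HSI structures above is described by a MASK/SHIFT pair, and SET_FIELD2() in qedr.h ORs a value into place at its shift. A standalone sketch of that convention; the DEMO_WQE_* layout and the GET_FIELD2() accessor are hypothetical, added only to show the round trip:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical field layout mirroring the WQE 'flags' byte convention. */
#define DEMO_WQE_COMP_FLG_MASK		0x1
#define DEMO_WQE_COMP_FLG_SHIFT		0
#define DEMO_WQE_TYPE_MASK		0x3
#define DEMO_WQE_TYPE_SHIFT		1

/* Same shape as SET_FIELD2() in qedr.h: OR the value in at its shift. */
#define SET_FIELD2(value, name, flag) \
	((value) |= ((flag) << (name##_SHIFT)))

/* Hypothetical companion accessor: shift down, mask off neighbours. */
#define GET_FIELD2(value, name) \
	(((value) >> (name##_SHIFT)) & (name##_MASK))

int main(void)
{
	uint8_t flags = 0;

	SET_FIELD2(flags, DEMO_WQE_COMP_FLG, 1);	/* request a completion */
	SET_FIELD2(flags, DEMO_WQE_TYPE, 2);		/* hypothetical type 2 */

	printf("flags=0x%02x comp=%u type=%u\n", flags,
	       (unsigned)GET_FIELD2(flags, DEMO_WQE_COMP_FLG),
	       (unsigned)GET_FIELD2(flags, DEMO_WQE_TYPE));
	return 0;
}
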
+/* First element (16 bytes) of atomic wqe */ +struct rdma_sq_atomic_wqe_1st { + __le32 reserved1; + __le32 length; + __le32 xrc_srq; + u8 req_type; + u8 flags; +#define RDMA_SQ_ATOMIC_WQE_1ST_COMP_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_1ST_COMP_FLG_SHIFT 0 +#define RDMA_SQ_ATOMIC_WQE_1ST_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_1ST_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_ATOMIC_WQE_1ST_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_1ST_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_ATOMIC_WQE_1ST_SE_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_1ST_SE_FLG_SHIFT 3 +#define RDMA_SQ_ATOMIC_WQE_1ST_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_ATOMIC_WQE_1ST_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_ATOMIC_WQE_1ST_RESERVED0_MASK 0x7 +#define RDMA_SQ_ATOMIC_WQE_1ST_RESERVED0_SHIFT 5 + u8 wqe_size; + u8 prev_wqe_size; +}; + +/* Second element (16 bytes) of atomic wqe */ +struct rdma_sq_atomic_wqe_2nd { + struct regpair remote_va; + __le32 r_key; + __le32 reserved2; +}; + +/* Third element (16 bytes) of atomic wqe */ +struct rdma_sq_atomic_wqe_3rd { + struct regpair cmp_data; + struct regpair swap_data; +}; + +struct rdma_sq_bind_wqe { + struct regpair addr; + __le32 l_key; + u8 req_type; + u8 flags; +#define RDMA_SQ_BIND_WQE_COMP_FLG_MASK 0x1 +#define RDMA_SQ_BIND_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_BIND_WQE_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_BIND_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_BIND_WQE_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_BIND_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_BIND_WQE_SE_FLG_MASK 0x1 +#define RDMA_SQ_BIND_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_BIND_WQE_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_BIND_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_BIND_WQE_RESERVED0_MASK 0x7 +#define RDMA_SQ_BIND_WQE_RESERVED0_SHIFT 5 + u8 wqe_size; + u8 prev_wqe_size; + u8 bind_ctrl; +#define RDMA_SQ_BIND_WQE_ZERO_BASED_MASK 0x1 +#define RDMA_SQ_BIND_WQE_ZERO_BASED_SHIFT 0 +#define RDMA_SQ_BIND_WQE_MW_TYPE_MASK 0x1 +#define RDMA_SQ_BIND_WQE_MW_TYPE_SHIFT 1 +#define RDMA_SQ_BIND_WQE_RESERVED1_MASK 0x3F +#define RDMA_SQ_BIND_WQE_RESERVED1_SHIFT 2 + u8 access_ctrl; +#define RDMA_SQ_BIND_WQE_REMOTE_READ_MASK 0x1 +#define RDMA_SQ_BIND_WQE_REMOTE_READ_SHIFT 0 +#define RDMA_SQ_BIND_WQE_REMOTE_WRITE_MASK 0x1 +#define RDMA_SQ_BIND_WQE_REMOTE_WRITE_SHIFT 1 +#define RDMA_SQ_BIND_WQE_ENABLE_ATOMIC_MASK 0x1 +#define RDMA_SQ_BIND_WQE_ENABLE_ATOMIC_SHIFT 2 +#define RDMA_SQ_BIND_WQE_LOCAL_READ_MASK 0x1 +#define RDMA_SQ_BIND_WQE_LOCAL_READ_SHIFT 3 +#define RDMA_SQ_BIND_WQE_LOCAL_WRITE_MASK 0x1 +#define RDMA_SQ_BIND_WQE_LOCAL_WRITE_SHIFT 4 +#define RDMA_SQ_BIND_WQE_RESERVED2_MASK 0x7 +#define RDMA_SQ_BIND_WQE_RESERVED2_SHIFT 5 + u8 reserved3; + u8 length_hi; + __le32 length_lo; + __le32 parent_l_key; + __le32 reserved4; +}; + +/* First element (16 bytes) of bind wqe */ +struct rdma_sq_bind_wqe_1st { + struct regpair addr; + __le32 l_key; + u8 req_type; + u8 flags; +#define RDMA_SQ_BIND_WQE_1ST_COMP_FLG_MASK 0x1 +#define RDMA_SQ_BIND_WQE_1ST_COMP_FLG_SHIFT 0 +#define RDMA_SQ_BIND_WQE_1ST_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_BIND_WQE_1ST_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_BIND_WQE_1ST_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_BIND_WQE_1ST_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_BIND_WQE_1ST_SE_FLG_MASK 0x1 +#define RDMA_SQ_BIND_WQE_1ST_SE_FLG_SHIFT 3 +#define RDMA_SQ_BIND_WQE_1ST_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_BIND_WQE_1ST_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_BIND_WQE_1ST_RESERVED0_MASK 0x7 +#define RDMA_SQ_BIND_WQE_1ST_RESERVED0_SHIFT 5 + u8 wqe_size; + u8 prev_wqe_size; +}; + +/* Second element (16 bytes) of bind wqe */ 
+struct rdma_sq_bind_wqe_2nd { + u8 bind_ctrl; +#define RDMA_SQ_BIND_WQE_2ND_ZERO_BASED_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_ZERO_BASED_SHIFT 0 +#define RDMA_SQ_BIND_WQE_2ND_MW_TYPE_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_MW_TYPE_SHIFT 1 +#define RDMA_SQ_BIND_WQE_2ND_RESERVED1_MASK 0x3F +#define RDMA_SQ_BIND_WQE_2ND_RESERVED1_SHIFT 2 + u8 access_ctrl; +#define RDMA_SQ_BIND_WQE_2ND_REMOTE_READ_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_REMOTE_READ_SHIFT 0 +#define RDMA_SQ_BIND_WQE_2ND_REMOTE_WRITE_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_REMOTE_WRITE_SHIFT 1 +#define RDMA_SQ_BIND_WQE_2ND_ENABLE_ATOMIC_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_ENABLE_ATOMIC_SHIFT 2 +#define RDMA_SQ_BIND_WQE_2ND_LOCAL_READ_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_LOCAL_READ_SHIFT 3 +#define RDMA_SQ_BIND_WQE_2ND_LOCAL_WRITE_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_LOCAL_WRITE_SHIFT 4 +#define RDMA_SQ_BIND_WQE_2ND_RESERVED2_MASK 0x7 +#define RDMA_SQ_BIND_WQE_2ND_RESERVED2_SHIFT 5 + u8 reserved3; + u8 length_hi; + __le32 length_lo; + __le32 parent_l_key; + __le32 reserved4; +}; + +/* Structure with only the SQ WQE common + * fields. Size is of one SQ element (16B) + */ +struct rdma_sq_common_wqe { + __le32 reserved1[3]; + u8 req_type; + u8 flags; +#define RDMA_SQ_COMMON_WQE_COMP_FLG_MASK 0x1 +#define RDMA_SQ_COMMON_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_COMMON_WQE_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_COMMON_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_COMMON_WQE_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_COMMON_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_COMMON_WQE_SE_FLG_MASK 0x1 +#define RDMA_SQ_COMMON_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_COMMON_WQE_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_COMMON_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_COMMON_WQE_RESERVED0_MASK 0x7 +#define RDMA_SQ_COMMON_WQE_RESERVED0_SHIFT 5 + u8 wqe_size; + u8 prev_wqe_size; +}; + +struct rdma_sq_fmr_wqe { + struct regpair addr; + __le32 l_key; + u8 req_type; + u8 flags; +#define RDMA_SQ_FMR_WQE_COMP_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_FMR_WQE_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_FMR_WQE_SE_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_FMR_WQE_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_FMR_WQE_DIF_ON_HOST_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_FMR_WQE_RESERVED0_MASK 0x3 +#define RDMA_SQ_FMR_WQE_RESERVED0_SHIFT 6 + u8 wqe_size; + u8 prev_wqe_size; + u8 fmr_ctrl; +#define RDMA_SQ_FMR_WQE_PAGE_SIZE_LOG_MASK 0x1F +#define RDMA_SQ_FMR_WQE_PAGE_SIZE_LOG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_ZERO_BASED_MASK 0x1 +#define RDMA_SQ_FMR_WQE_ZERO_BASED_SHIFT 5 +#define RDMA_SQ_FMR_WQE_BIND_EN_MASK 0x1 +#define RDMA_SQ_FMR_WQE_BIND_EN_SHIFT 6 +#define RDMA_SQ_FMR_WQE_RESERVED1_MASK 0x1 +#define RDMA_SQ_FMR_WQE_RESERVED1_SHIFT 7 + u8 access_ctrl; +#define RDMA_SQ_FMR_WQE_REMOTE_READ_MASK 0x1 +#define RDMA_SQ_FMR_WQE_REMOTE_READ_SHIFT 0 +#define RDMA_SQ_FMR_WQE_REMOTE_WRITE_MASK 0x1 +#define RDMA_SQ_FMR_WQE_REMOTE_WRITE_SHIFT 1 +#define RDMA_SQ_FMR_WQE_ENABLE_ATOMIC_MASK 0x1 +#define RDMA_SQ_FMR_WQE_ENABLE_ATOMIC_SHIFT 2 +#define RDMA_SQ_FMR_WQE_LOCAL_READ_MASK 0x1 +#define RDMA_SQ_FMR_WQE_LOCAL_READ_SHIFT 3 +#define RDMA_SQ_FMR_WQE_LOCAL_WRITE_MASK 0x1 +#define RDMA_SQ_FMR_WQE_LOCAL_WRITE_SHIFT 4 +#define RDMA_SQ_FMR_WQE_RESERVED2_MASK 0x7 +#define RDMA_SQ_FMR_WQE_RESERVED2_SHIFT 5 + u8 reserved3; + u8 
length_hi; + __le32 length_lo; + struct regpair pbl_addr; + __le32 dif_base_ref_tag; + __le16 dif_app_tag; + __le16 dif_app_tag_mask; + __le16 dif_runt_crc_value; + __le16 dif_flags; +#define RDMA_SQ_FMR_WQE_DIF_IO_DIRECTION_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_DIF_IO_DIRECTION_FLG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_DIF_BLOCK_SIZE_MASK 0x1 +#define RDMA_SQ_FMR_WQE_DIF_BLOCK_SIZE_SHIFT 1 +#define RDMA_SQ_FMR_WQE_DIF_RUNT_VALID_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_DIF_RUNT_VALID_FLG_SHIFT 2 +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_CRC_GUARD_MASK 0x1 +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_CRC_GUARD_SHIFT 3 +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_REF_TAG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_REF_TAG_SHIFT 4 +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_APP_TAG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_APP_TAG_SHIFT 5 +#define RDMA_SQ_FMR_WQE_DIF_CRC_SEED_MASK 0x1 +#define RDMA_SQ_FMR_WQE_DIF_CRC_SEED_SHIFT 6 +#define RDMA_SQ_FMR_WQE_DIF_RX_REF_TAG_CONST_MASK 0x1 +#define RDMA_SQ_FMR_WQE_DIF_RX_REF_TAG_CONST_SHIFT 7 +#define RDMA_SQ_FMR_WQE_RESERVED4_MASK 0xFF +#define RDMA_SQ_FMR_WQE_RESERVED4_SHIFT 8 + __le32 reserved5; +}; + +/* First element (16 bytes) of fmr wqe */ +struct rdma_sq_fmr_wqe_1st { + struct regpair addr; + __le32 l_key; + u8 req_type; + u8 flags; +#define RDMA_SQ_FMR_WQE_1ST_COMP_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_1ST_COMP_FLG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_1ST_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_1ST_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_FMR_WQE_1ST_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_1ST_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_FMR_WQE_1ST_SE_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_1ST_SE_FLG_SHIFT 3 +#define RDMA_SQ_FMR_WQE_1ST_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_1ST_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_FMR_WQE_1ST_DIF_ON_HOST_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_1ST_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_FMR_WQE_1ST_RESERVED0_MASK 0x3 +#define RDMA_SQ_FMR_WQE_1ST_RESERVED0_SHIFT 6 + u8 wqe_size; + u8 prev_wqe_size; +}; + +/* Second element (16 bytes) of fmr wqe */ +struct rdma_sq_fmr_wqe_2nd { + u8 fmr_ctrl; +#define RDMA_SQ_FMR_WQE_2ND_PAGE_SIZE_LOG_MASK 0x1F +#define RDMA_SQ_FMR_WQE_2ND_PAGE_SIZE_LOG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_2ND_ZERO_BASED_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_ZERO_BASED_SHIFT 5 +#define RDMA_SQ_FMR_WQE_2ND_BIND_EN_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_BIND_EN_SHIFT 6 +#define RDMA_SQ_FMR_WQE_2ND_RESERVED1_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_RESERVED1_SHIFT 7 + u8 access_ctrl; +#define RDMA_SQ_FMR_WQE_2ND_REMOTE_READ_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_REMOTE_READ_SHIFT 0 +#define RDMA_SQ_FMR_WQE_2ND_REMOTE_WRITE_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_REMOTE_WRITE_SHIFT 1 +#define RDMA_SQ_FMR_WQE_2ND_ENABLE_ATOMIC_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_ENABLE_ATOMIC_SHIFT 2 +#define RDMA_SQ_FMR_WQE_2ND_LOCAL_READ_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_LOCAL_READ_SHIFT 3 +#define RDMA_SQ_FMR_WQE_2ND_LOCAL_WRITE_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_LOCAL_WRITE_SHIFT 4 +#define RDMA_SQ_FMR_WQE_2ND_RESERVED2_MASK 0x7 +#define RDMA_SQ_FMR_WQE_2ND_RESERVED2_SHIFT 5 + u8 reserved3; + u8 length_hi; + __le32 length_lo; + struct regpair pbl_addr; +}; + +/* Third element (16 bytes) of fmr wqe */ +struct rdma_sq_fmr_wqe_3rd { + __le32 dif_base_ref_tag; + __le16 dif_app_tag; + __le16 dif_app_tag_mask; + __le16 dif_runt_crc_value; + __le16 dif_flags; +#define RDMA_SQ_FMR_WQE_3RD_DIF_IO_DIRECTION_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_3RD_DIF_IO_DIRECTION_FLG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_3RD_DIF_BLOCK_SIZE_MASK 0x1 
+#define RDMA_SQ_FMR_WQE_3RD_DIF_BLOCK_SIZE_SHIFT 1 +#define RDMA_SQ_FMR_WQE_3RD_DIF_RUNT_VALID_FLG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_3RD_DIF_RUNT_VALID_FLG_SHIFT 2 +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_CRC_GUARD_MASK 0x1 +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_CRC_GUARD_SHIFT 3 +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_REF_TAG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_REF_TAG_SHIFT 4 +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_APP_TAG_MASK 0x1 +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_APP_TAG_SHIFT 5 +#define RDMA_SQ_FMR_WQE_3RD_DIF_CRC_SEED_MASK 0x1 +#define RDMA_SQ_FMR_WQE_3RD_DIF_CRC_SEED_SHIFT 6 +#define RDMA_SQ_FMR_WQE_3RD_DIF_RX_REF_TAG_CONST_MASK 0x1 +#define RDMA_SQ_FMR_WQE_3RD_DIF_RX_REF_TAG_CONST_SHIFT 7 +#define RDMA_SQ_FMR_WQE_3RD_RESERVED4_MASK 0xFF +#define RDMA_SQ_FMR_WQE_RESERVED4_SHIFT 8 + __le32 reserved5; +}; + +struct rdma_sq_local_inv_wqe { + struct regpair reserved; + __le32 inv_l_key; + u8 req_type; + u8 flags; +#define RDMA_SQ_LOCAL_INV_WQE_COMP_FLG_MASK 0x1 +#define RDMA_SQ_LOCAL_INV_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_LOCAL_INV_WQE_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_LOCAL_INV_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_LOCAL_INV_WQE_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_LOCAL_INV_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_LOCAL_INV_WQE_SE_FLG_MASK 0x1 +#define RDMA_SQ_LOCAL_INV_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_LOCAL_INV_WQE_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_LOCAL_INV_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_LOCAL_INV_WQE_DIF_ON_HOST_FLG_MASK 0x1 +#define RDMA_SQ_LOCAL_INV_WQE_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_LOCAL_INV_WQE_RESERVED0_MASK 0x3 +#define RDMA_SQ_LOCAL_INV_WQE_RESERVED0_SHIFT 6 + u8 wqe_size; + u8 prev_wqe_size; +}; + +struct rdma_sq_rdma_wqe { + __le32 imm_data; + __le32 length; + __le32 xrc_srq; + u8 req_type; + u8 flags; +#define RDMA_SQ_RDMA_WQE_COMP_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_RDMA_WQE_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_RDMA_WQE_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_RDMA_WQE_SE_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_RDMA_WQE_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_RDMA_WQE_DIF_ON_HOST_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_RDMA_WQE_READ_INV_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_READ_INV_FLG_SHIFT 6 +#define RDMA_SQ_RDMA_WQE_RESERVED0_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_RESERVED0_SHIFT 7 + u8 wqe_size; + u8 prev_wqe_size; + struct regpair remote_va; + __le32 r_key; + u8 dif_flags; +#define RDMA_SQ_RDMA_WQE_DIF_BLOCK_SIZE_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_DIF_BLOCK_SIZE_SHIFT 0 +#define RDMA_SQ_RDMA_WQE_DIF_FIRST_RDMA_IN_IO_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_DIF_FIRST_RDMA_IN_IO_FLG_SHIFT 1 +#define RDMA_SQ_RDMA_WQE_DIF_LAST_RDMA_IN_IO_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_DIF_LAST_RDMA_IN_IO_FLG_SHIFT 2 +#define RDMA_SQ_RDMA_WQE_RESERVED1_MASK 0x1F +#define RDMA_SQ_RDMA_WQE_RESERVED1_SHIFT 3 + u8 reserved2[3]; +}; + +/* First element (16 bytes) of rdma wqe */ +struct rdma_sq_rdma_wqe_1st { + __le32 imm_data; + __le32 length; + __le32 xrc_srq; + u8 req_type; + u8 flags; +#define RDMA_SQ_RDMA_WQE_1ST_COMP_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_1ST_COMP_FLG_SHIFT 0 +#define RDMA_SQ_RDMA_WQE_1ST_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_1ST_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_RDMA_WQE_1ST_INV_FENCE_FLG_MASK 0x1 +#define 
RDMA_SQ_RDMA_WQE_1ST_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_RDMA_WQE_1ST_SE_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_1ST_SE_FLG_SHIFT 3 +#define RDMA_SQ_RDMA_WQE_1ST_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_1ST_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_RDMA_WQE_1ST_DIF_ON_HOST_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_1ST_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_RDMA_WQE_1ST_READ_INV_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_1ST_READ_INV_FLG_SHIFT 6 +#define RDMA_SQ_RDMA_WQE_1ST_RESERVED0_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_1ST_RESERVED0_SHIFT 7 + u8 wqe_size; + u8 prev_wqe_size; +}; + +/* Second element (16 bytes) of rdma wqe */ +struct rdma_sq_rdma_wqe_2nd { + struct regpair remote_va; + __le32 r_key; + u8 dif_flags; +#define RDMA_SQ_RDMA_WQE_2ND_DIF_BLOCK_SIZE_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_2ND_DIF_BLOCK_SIZE_SHIFT 0 +#define RDMA_SQ_RDMA_WQE_2ND_DIF_FIRST_SEGMENT_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_2ND_DIF_FIRST_SEGMENT_FLG_SHIFT 1 +#define RDMA_SQ_RDMA_WQE_2ND_DIF_LAST_SEGMENT_FLG_MASK 0x1 +#define RDMA_SQ_RDMA_WQE_2ND_DIF_LAST_SEGMENT_FLG_SHIFT 2 +#define RDMA_SQ_RDMA_WQE_2ND_RESERVED1_MASK 0x1F +#define RDMA_SQ_RDMA_WQE_2ND_RESERVED1_SHIFT 3 + u8 reserved2[3]; +}; + +/* SQ WQE req type enumeration */ +enum rdma_sq_req_type { + RDMA_SQ_REQ_TYPE_SEND, + RDMA_SQ_REQ_TYPE_SEND_WITH_IMM, + RDMA_SQ_REQ_TYPE_SEND_WITH_INVALIDATE, + RDMA_SQ_REQ_TYPE_RDMA_WR, + RDMA_SQ_REQ_TYPE_RDMA_WR_WITH_IMM, + RDMA_SQ_REQ_TYPE_RDMA_RD, + RDMA_SQ_REQ_TYPE_ATOMIC_CMP_AND_SWAP, + RDMA_SQ_REQ_TYPE_ATOMIC_ADD, + RDMA_SQ_REQ_TYPE_LOCAL_INVALIDATE, + RDMA_SQ_REQ_TYPE_FAST_MR, + RDMA_SQ_REQ_TYPE_BIND, + RDMA_SQ_REQ_TYPE_INVALID, + MAX_RDMA_SQ_REQ_TYPE +}; + +struct rdma_sq_send_wqe { + __le32 inv_key_or_imm_data; + __le32 length; + __le32 xrc_srq; + u8 req_type; + u8 flags; +#define RDMA_SQ_SEND_WQE_COMP_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_SEND_WQE_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_SEND_WQE_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_SEND_WQE_SE_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_SEND_WQE_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_SEND_WQE_DIF_ON_HOST_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_SEND_WQE_RESERVED0_MASK 0x3 +#define RDMA_SQ_SEND_WQE_RESERVED0_SHIFT 6 + u8 wqe_size; + u8 prev_wqe_size; + __le32 reserved1[4]; +}; + +struct rdma_sq_send_wqe_1st { + __le32 inv_key_or_imm_data; + __le32 length; + __le32 xrc_srq; + u8 req_type; + u8 flags; +#define RDMA_SQ_SEND_WQE_1ST_COMP_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_1ST_COMP_FLG_SHIFT 0 +#define RDMA_SQ_SEND_WQE_1ST_RD_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_1ST_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_SEND_WQE_1ST_INV_FENCE_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_1ST_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_SEND_WQE_1ST_SE_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_1ST_SE_FLG_SHIFT 3 +#define RDMA_SQ_SEND_WQE_1ST_INLINE_FLG_MASK 0x1 +#define RDMA_SQ_SEND_WQE_1ST_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_SEND_WQE_1ST_RESERVED0_MASK 0x7 +#define RDMA_SQ_SEND_WQE_1ST_RESERVED0_SHIFT 5 + u8 wqe_size; + u8 prev_wqe_size; +}; + +struct rdma_sq_send_wqe_2st { + __le32 reserved1[4]; +}; + +#endif /* __QED_HSI_RDMA__ */ diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c new file mode 100644 index 0000000..26dc374 --- /dev/null +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c @@ -0,0 +1,752 @@ +/* 
QLogic qedr NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include "qedr.h" +#include "qedr_iw_cm.h" + +static inline void +qedr_fill_sockaddr4(const struct qed_iwarp_cm_info *cm_info, + struct iw_cm_event *event) +{ + struct sockaddr_in *laddr = (struct sockaddr_in *)&event->local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&event->remote_addr; + + laddr->sin_family = AF_INET; + raddr->sin_family = AF_INET; + + laddr->sin_port = htons(cm_info->local_port); + raddr->sin_port = htons(cm_info->remote_port); + + laddr->sin_addr.s_addr = htonl(cm_info->local_ip[0]); + raddr->sin_addr.s_addr = htonl(cm_info->remote_ip[0]); +} + +static inline void +qedr_fill_sockaddr6(const struct qed_iwarp_cm_info *cm_info, + struct iw_cm_event *event) +{ + struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *)&event->local_addr; + struct sockaddr_in6 *raddr6 = + (struct sockaddr_in6 *)&event->remote_addr; + int i; + + laddr6->sin6_family = AF_INET6; + raddr6->sin6_family = AF_INET6; + + laddr6->sin6_port = htons(cm_info->local_port); + raddr6->sin6_port = htons(cm_info->remote_port); + + for (i = 0; i < 4; i++) { + laddr6->sin6_addr.in6_u.u6_addr32[i] = + htonl(cm_info->local_ip[i]); + raddr6->sin6_addr.in6_u.u6_addr32[i] = + htonl(cm_info->remote_ip[i]); + } +} + +static void +qedr_iw_mpa_request(void *context, struct qed_iwarp_cm_event_params *params) +{ + struct qedr_iw_listener *listener = (struct qedr_iw_listener *)context; + struct qedr_dev *dev = listener->dev; + struct iw_cm_event event; + struct qedr_iw_ep *ep; + + ep = kzalloc(sizeof(*ep), GFP_ATOMIC); + if (!ep) + return; + + ep->dev = dev; + ep->qed_context = params->ep_context; + + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + event.status = params->status; + + if (!IS_ENABLED(CONFIG_IPV6) || + params->cm_info->ip_version == QED_TCP_IPV4) + qedr_fill_sockaddr4(params->cm_info, &event); + else + qedr_fill_sockaddr6(params->cm_info, &event); + + event.provider_data = (void *)ep; + event.private_data = (void *)params->cm_info->private_data; + event.private_data_len = (u8)params->cm_info->private_data_len; + 
event.ord = params->cm_info->ord; + event.ird = params->cm_info->ird; + + listener->cm_id->event_handler(listener->cm_id, &event); +} + +static void +qedr_iw_issue_event(void *context, + struct qed_iwarp_cm_event_params *params, + enum iw_cm_event_type event_type) +{ + struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context; + struct iw_cm_event event; + + memset(&event, 0, sizeof(event)); + event.status = params->status; + event.event = event_type; + + if (params->cm_info) { + event.ird = params->cm_info->ird; + event.ord = params->cm_info->ord; + event.private_data_len = params->cm_info->private_data_len; + event.private_data = (void *)params->cm_info->private_data; + } + + if (ep->cm_id) + ep->cm_id->event_handler(ep->cm_id, &event); +} + +static void +qedr_iw_close_event(void *context, struct qed_iwarp_cm_event_params *params) +{ + struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context; + + if (ep->cm_id) { + qedr_iw_issue_event(context, params, IW_CM_EVENT_CLOSE); + + ep->cm_id->rem_ref(ep->cm_id); + ep->cm_id = NULL; + } +} + +static void +qedr_iw_qp_event(void *context, + struct qed_iwarp_cm_event_params *params, + enum ib_event_type ib_event, char *str) +{ + struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context; + struct qedr_dev *dev = ep->dev; + struct ib_qp *ibqp = &ep->qp->ibqp; + struct ib_event event; + + DP_NOTICE(dev, "QP error received: %s\n", str); + + if (ibqp->event_handler) { + event.event = ib_event; + event.device = ibqp->device; + event.element.qp = ibqp; + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +struct qedr_discon_work { + struct work_struct work; + struct qedr_iw_ep *ep; + enum qed_iwarp_event_type event; + int status; +}; + +static void qedr_iw_disconnect_worker(struct work_struct *work) +{ + struct qedr_discon_work *dwork = + container_of(work, struct qedr_discon_work, work); + struct qed_rdma_modify_qp_in_params qp_params = { 0 }; + struct qedr_iw_ep *ep = dwork->ep; + struct qedr_dev *dev = ep->dev; + struct qedr_qp *qp = ep->qp; + struct iw_cm_event event; + + if (qp->destroyed) { + kfree(dwork); + qedr_iw_qp_rem_ref(&qp->ibqp); + return; + } + + memset(&event, 0, sizeof(event)); + event.status = dwork->status; + event.event = IW_CM_EVENT_DISCONNECT; + + /* Success means graceful disconnect was requested. modifying + * to SQD is translated to graceful disconnect. 
O/w reset is sent + */ + if (dwork->status) + qp_params.new_state = QED_ROCE_QP_STATE_ERR; + else + qp_params.new_state = QED_ROCE_QP_STATE_SQD; + + kfree(dwork); + + if (ep->cm_id) + ep->cm_id->event_handler(ep->cm_id, &event); + + SET_FIELD(qp_params.modify_flags, + QED_RDMA_MODIFY_QP_VALID_NEW_STATE, 1); + + dev->ops->rdma_modify_qp(dev->rdma_ctx, qp->qed_qp, &qp_params); + + qedr_iw_qp_rem_ref(&qp->ibqp); +} + +static void +qedr_iw_disconnect_event(void *context, + struct qed_iwarp_cm_event_params *params) +{ + struct qedr_discon_work *work; + struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context; + struct qedr_dev *dev = ep->dev; + struct qedr_qp *qp = ep->qp; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return; + + qedr_iw_qp_add_ref(&qp->ibqp); + work->ep = ep; + work->event = params->event; + work->status = params->status; + + INIT_WORK(&work->work, qedr_iw_disconnect_worker); + queue_work(dev->iwarp_wq, &work->work); +} + +static void +qedr_iw_passive_complete(void *context, + struct qed_iwarp_cm_event_params *params) +{ + struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context; + struct qedr_dev *dev = ep->dev; + + /* We will only reach the following state if MPA_REJECT was called on + * passive. In this case there will be no associated QP. + */ + if ((params->status == -ECONNREFUSED) && (!ep->qp)) { + DP_DEBUG(dev, QEDR_MSG_IWARP, + "PASSIVE connection refused releasing ep...\n"); + kfree(ep); + return; + } + + qedr_iw_issue_event(context, params, IW_CM_EVENT_ESTABLISHED); + + if (params->status < 0) + qedr_iw_close_event(context, params); +} + +static int +qedr_iw_mpa_reply(void *context, struct qed_iwarp_cm_event_params *params) +{ + struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context; + struct qedr_dev *dev = ep->dev; + struct qed_iwarp_send_rtr_in rtr_in; + + rtr_in.ep_context = params->ep_context; + + return dev->ops->iwarp_send_rtr(dev->rdma_ctx, &rtr_in); +} + +static int +qedr_iw_event_handler(void *context, struct qed_iwarp_cm_event_params *params) +{ + struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context; + struct qedr_dev *dev = ep->dev; + + switch (params->event) { + case QED_IWARP_EVENT_MPA_REQUEST: + qedr_iw_mpa_request(context, params); + break; + case QED_IWARP_EVENT_ACTIVE_MPA_REPLY: + qedr_iw_mpa_reply(context, params); + break; + case QED_IWARP_EVENT_PASSIVE_COMPLETE: + ep->during_connect = 0; + qedr_iw_passive_complete(context, params); + break; + + case QED_IWARP_EVENT_ACTIVE_COMPLETE: + ep->during_connect = 0; + qedr_iw_issue_event(context, + params, + IW_CM_EVENT_CONNECT_REPLY); + if (params->status < 0) { + struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context; + + ep->cm_id->rem_ref(ep->cm_id); + ep->cm_id = NULL; + } + break; + case QED_IWARP_EVENT_DISCONNECT: + qedr_iw_disconnect_event(context, params); + break; + case QED_IWARP_EVENT_CLOSE: + ep->during_connect = 0; + qedr_iw_close_event(context, params); + break; + case QED_IWARP_EVENT_RQ_EMPTY: + qedr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, + "QED_IWARP_EVENT_RQ_EMPTY"); + break; + case QED_IWARP_EVENT_IRQ_FULL: + qedr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, + "QED_IWARP_EVENT_IRQ_FULL"); + break; + case QED_IWARP_EVENT_LLP_TIMEOUT: + qedr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, + "QED_IWARP_EVENT_LLP_TIMEOUT"); + break; + case QED_IWARP_EVENT_REMOTE_PROTECTION_ERROR: + qedr_iw_qp_event(context, params, IB_EVENT_QP_ACCESS_ERR, + "QED_IWARP_EVENT_REMOTE_PROTECTION_ERROR"); + break; + case QED_IWARP_EVENT_CQ_OVERFLOW: + qedr_iw_qp_event(context, params, 
IB_EVENT_QP_FATAL, + "QED_IWARP_EVENT_CQ_OVERFLOW"); + break; + case QED_IWARP_EVENT_QP_CATASTROPHIC: + qedr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, + "QED_IWARP_EVENT_QP_CATASTROPHIC"); + break; + case QED_IWARP_EVENT_LOCAL_ACCESS_ERROR: + qedr_iw_qp_event(context, params, IB_EVENT_QP_ACCESS_ERR, + "QED_IWARP_EVENT_LOCAL_ACCESS_ERROR"); + break; + case QED_IWARP_EVENT_REMOTE_OPERATION_ERROR: + qedr_iw_qp_event(context, params, IB_EVENT_QP_FATAL, + "QED_IWARP_EVENT_REMOTE_OPERATION_ERROR"); + break; + case QED_IWARP_EVENT_TERMINATE_RECEIVED: + DP_NOTICE(dev, "Got terminate message\n"); + break; + default: + DP_NOTICE(dev, "Unknown event received %d\n", params->event); + break; + }; + return 0; +} + +static u16 qedr_iw_get_vlan_ipv4(struct qedr_dev *dev, u32 *addr) +{ + struct net_device *ndev; + u16 vlan_id = 0; + + ndev = ip_dev_find(&init_net, htonl(addr[0])); + + if (ndev) { + vlan_id = rdma_vlan_dev_vlan_id(ndev); + dev_put(ndev); + } + if (vlan_id == 0xffff) + vlan_id = 0; + return vlan_id; +} + +static u16 qedr_iw_get_vlan_ipv6(u32 *addr) +{ + struct net_device *ndev = NULL; + struct in6_addr laddr6; + u16 vlan_id = 0; + int i; + + if (!IS_ENABLED(CONFIG_IPV6)) + return vlan_id; + + for (i = 0; i < 4; i++) + laddr6.in6_u.u6_addr32[i] = htonl(addr[i]); + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, ndev) { + if (ipv6_chk_addr(&init_net, &laddr6, ndev, 1)) { + vlan_id = rdma_vlan_dev_vlan_id(ndev); + break; + } + } + + rcu_read_unlock(); + if (vlan_id == 0xffff) + vlan_id = 0; + + return vlan_id; +} + +static int +qedr_addr4_resolve(struct qedr_dev *dev, + struct sockaddr_in *src_in, + struct sockaddr_in *dst_in, u8 *dst_mac) +{ + __be32 src_ip = src_in->sin_addr.s_addr; + __be32 dst_ip = dst_in->sin_addr.s_addr; + struct neighbour *neigh = NULL; + struct rtable *rt = NULL; + int rc = 0; + + rt = ip_route_output(&init_net, dst_ip, src_ip, 0, 0); + if (IS_ERR(rt)) { + DP_ERR(dev, "ip_route_output returned error\n"); + return -EINVAL; + } + + neigh = dst_neigh_lookup(&rt->dst, &dst_ip); + + if (neigh) { + rcu_read_lock(); + if (neigh->nud_state & NUD_VALID) { + ether_addr_copy(dst_mac, neigh->ha); + DP_DEBUG(dev, QEDR_MSG_QP, "mac_addr=[%pM]\n", dst_mac); + } else { + neigh_event_send(neigh, NULL); + } + rcu_read_unlock(); + neigh_release(neigh); + } + + ip_rt_put(rt); + + return rc; +} + +static int +qedr_addr6_resolve(struct qedr_dev *dev, + struct sockaddr_in6 *src_in, + struct sockaddr_in6 *dst_in, u8 *dst_mac) +{ + struct neighbour *neigh = NULL; + struct dst_entry *dst; + struct flowi6 fl6; + int rc = 0; + + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = dst_in->sin6_addr; + fl6.saddr = src_in->sin6_addr; + + dst = ip6_route_output(&init_net, NULL, &fl6); + + if ((!dst) || dst->error) { + if (dst) { + dst_release(dst); + DP_ERR(dev, + "ip6_route_output returned dst->error = %d\n", + dst->error); + } + return -EINVAL; + } + neigh = dst_neigh_lookup(dst, &fl6.daddr); + if (neigh) { + rcu_read_lock(); + if (neigh->nud_state & NUD_VALID) { + ether_addr_copy(dst_mac, neigh->ha); + DP_DEBUG(dev, QEDR_MSG_QP, "mac_addr=[%pM]\n", dst_mac); + } else { + neigh_event_send(neigh, NULL); + } + rcu_read_unlock(); + neigh_release(neigh); + } + + dst_release(dst); + + return rc; +} + +int qedr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct qedr_dev *dev = get_qedr_dev(cm_id->device); + struct qed_iwarp_connect_out out_params; + struct qed_iwarp_connect_in in_params; + struct qed_iwarp_cm_info *cm_info; + struct sockaddr_in6 *laddr6; + struct 
sockaddr_in6 *raddr6; + struct sockaddr_in *laddr; + struct sockaddr_in *raddr; + struct qedr_iw_ep *ep; + struct qedr_qp *qp; + int rc = 0; + int i; + + qp = idr_find(&dev->qpidr, conn_param->qpn); + + laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; + laddr6 = (struct sockaddr_in6 *)&cm_id->m_local_addr; + raddr6 = (struct sockaddr_in6 *)&cm_id->m_remote_addr; + + DP_DEBUG(dev, QEDR_MSG_IWARP, "MAPPED %d %d\n", + ntohs(((struct sockaddr_in *)&cm_id->remote_addr)->sin_port), + ntohs(raddr->sin_port)); + + DP_DEBUG(dev, QEDR_MSG_IWARP, + "Connect source address: %pISpc, remote address: %pISpc\n", + &cm_id->local_addr, &cm_id->remote_addr); + + if (!laddr->sin_port || !raddr->sin_port) + return -EINVAL; + + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (!ep) + return -ENOMEM; + + ep->dev = dev; + ep->qp = qp; + qp->ep = ep; + cm_id->add_ref(cm_id); + ep->cm_id = cm_id; + + in_params.event_cb = qedr_iw_event_handler; + in_params.cb_context = ep; + + cm_info = &in_params.cm_info; + memset(cm_info->local_ip, 0, sizeof(cm_info->local_ip)); + memset(cm_info->remote_ip, 0, sizeof(cm_info->remote_ip)); + + if (!IS_ENABLED(CONFIG_IPV6) || + cm_id->remote_addr.ss_family == AF_INET) { + cm_info->ip_version = QED_TCP_IPV4; + + cm_info->remote_ip[0] = ntohl(raddr->sin_addr.s_addr); + cm_info->local_ip[0] = ntohl(laddr->sin_addr.s_addr); + cm_info->remote_port = ntohs(raddr->sin_port); + cm_info->local_port = ntohs(laddr->sin_port); + cm_info->vlan = qedr_iw_get_vlan_ipv4(dev, cm_info->local_ip); + + rc = qedr_addr4_resolve(dev, laddr, raddr, + (u8 *)in_params.remote_mac_addr); + + in_params.mss = dev->iwarp_max_mtu - + (sizeof(struct iphdr) + sizeof(struct tcphdr)); + + } else { + in_params.cm_info.ip_version = QED_TCP_IPV6; + + for (i = 0; i < 4; i++) { + cm_info->remote_ip[i] = + ntohl(raddr6->sin6_addr.in6_u.u6_addr32[i]); + cm_info->local_ip[i] = + ntohl(laddr6->sin6_addr.in6_u.u6_addr32[i]); + } + + cm_info->local_port = ntohs(laddr6->sin6_port); + cm_info->remote_port = ntohs(raddr6->sin6_port); + + in_params.mss = dev->iwarp_max_mtu - + (sizeof(struct ipv6hdr) + sizeof(struct tcphdr)); + + cm_info->vlan = qedr_iw_get_vlan_ipv6(cm_info->local_ip); + + rc = qedr_addr6_resolve(dev, laddr6, raddr6, + (u8 *)in_params.remote_mac_addr); + } + if (rc) + goto err; + + DP_DEBUG(dev, QEDR_MSG_IWARP, + "ord = %d ird=%d private_data=%p private_data_len=%d rq_psn=%d\n", + conn_param->ord, conn_param->ird, conn_param->private_data, + conn_param->private_data_len, qp->rq_psn); + + cm_info->ord = conn_param->ord; + cm_info->ird = conn_param->ird; + cm_info->private_data = conn_param->private_data; + cm_info->private_data_len = conn_param->private_data_len; + in_params.qp = qp->qed_qp; + memcpy(in_params.local_mac_addr, dev->ndev->dev_addr, ETH_ALEN); + + ep->during_connect = 1; + rc = dev->ops->iwarp_connect(dev->rdma_ctx, &in_params, &out_params); + if (rc) + goto err; + + return rc; + +err: + cm_id->rem_ref(cm_id); + kfree(ep); + return rc; +} + +int qedr_iw_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + struct qedr_dev *dev = get_qedr_dev(cm_id->device); + struct qedr_iw_listener *listener; + struct qed_iwarp_listen_in iparams; + struct qed_iwarp_listen_out oparams; + struct sockaddr_in *laddr; + struct sockaddr_in6 *laddr6; + int rc; + int i; + + laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + laddr6 = (struct sockaddr_in6 *)&cm_id->m_local_addr; + + DP_DEBUG(dev, QEDR_MSG_IWARP, + "Create Listener address: %pISpc\n", &cm_id->local_addr); 
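+ /* Allocate the listener, take a reference on the cm_id and register
+ * the listener with the qed iWARP layer via iwarp_create_listen() below.
+ */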
+ + listener = kzalloc(sizeof(*listener), GFP_KERNEL); + if (!listener) + return -ENOMEM; + + listener->dev = dev; + cm_id->add_ref(cm_id); + listener->cm_id = cm_id; + listener->backlog = backlog; + + iparams.cb_context = listener; + iparams.event_cb = qedr_iw_event_handler; + iparams.max_backlog = backlog; + + if (!IS_ENABLED(CONFIG_IPV6) || + cm_id->local_addr.ss_family == AF_INET) { + iparams.ip_version = QED_TCP_IPV4; + memset(iparams.ip_addr, 0, sizeof(iparams.ip_addr)); + + iparams.ip_addr[0] = ntohl(laddr->sin_addr.s_addr); + iparams.port = ntohs(laddr->sin_port); + iparams.vlan = qedr_iw_get_vlan_ipv4(dev, iparams.ip_addr); + } else { + iparams.ip_version = QED_TCP_IPV6; + + for (i = 0; i < 4; i++) { + iparams.ip_addr[i] = + ntohl(laddr6->sin6_addr.in6_u.u6_addr32[i]); + } + + iparams.port = ntohs(laddr6->sin6_port); + + iparams.vlan = qedr_iw_get_vlan_ipv6(iparams.ip_addr); + } + rc = dev->ops->iwarp_create_listen(dev->rdma_ctx, &iparams, &oparams); + if (rc) + goto err; + + listener->qed_handle = oparams.handle; + cm_id->provider_data = listener; + return rc; + +err: + cm_id->rem_ref(cm_id); + kfree(listener); + return rc; +} + +int qedr_iw_destroy_listen(struct iw_cm_id *cm_id) +{ + struct qedr_iw_listener *listener = cm_id->provider_data; + struct qedr_dev *dev = get_qedr_dev(cm_id->device); + int rc = 0; + + if (listener->qed_handle) + rc = dev->ops->iwarp_destroy_listen(dev->rdma_ctx, + listener->qed_handle); + + cm_id->rem_ref(cm_id); + return rc; +} + +int qedr_iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + struct qedr_iw_ep *ep = (struct qedr_iw_ep *)cm_id->provider_data; + struct qedr_dev *dev = ep->dev; + struct qedr_qp *qp; + struct qed_iwarp_accept_in params; + int rc; + + DP_DEBUG(dev, QEDR_MSG_IWARP, "Accept on qpid=%d\n", conn_param->qpn); + + qp = idr_find(&dev->qpidr, conn_param->qpn); + if (!qp) { + DP_ERR(dev, "Invalid QP number %d\n", conn_param->qpn); + return -EINVAL; + } + + ep->qp = qp; + qp->ep = ep; + cm_id->add_ref(cm_id); + ep->cm_id = cm_id; + + params.ep_context = ep->qed_context; + params.cb_context = ep; + params.qp = ep->qp->qed_qp; + params.private_data = conn_param->private_data; + params.private_data_len = conn_param->private_data_len; + params.ird = conn_param->ird; + params.ord = conn_param->ord; + + ep->during_connect = 1; + rc = dev->ops->iwarp_accept(dev->rdma_ctx, ¶ms); + if (rc) + goto err; + + return rc; +err: + ep->during_connect = 0; + cm_id->rem_ref(cm_id); + return rc; +} + +int qedr_iw_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + struct qedr_iw_ep *ep = (struct qedr_iw_ep *)cm_id->provider_data; + struct qedr_dev *dev = ep->dev; + struct qed_iwarp_reject_in params; + + params.ep_context = ep->qed_context; + params.cb_context = ep; + params.private_data = pdata; + params.private_data_len = pdata_len; + ep->qp = NULL; + + return dev->ops->iwarp_reject(dev->rdma_ctx, ¶ms); +} + +void qedr_iw_qp_add_ref(struct ib_qp *ibqp) +{ + struct qedr_qp *qp = get_qedr_qp(ibqp); + + atomic_inc(&qp->refcnt); +} + +void qedr_iw_qp_rem_ref(struct ib_qp *ibqp) +{ + struct qedr_qp *qp = get_qedr_qp(ibqp); + + if (atomic_dec_and_test(&qp->refcnt)) { + spin_lock_irq(&qp->dev->idr_lock); + idr_remove(&qp->dev->qpidr, qp->qp_id); + spin_unlock_irq(&qp->dev->idr_lock); + kfree(qp); + } +} + +struct ib_qp *qedr_iw_get_qp(struct ib_device *ibdev, int qpn) +{ + struct qedr_dev *dev = get_qedr_dev(ibdev); + + return idr_find(&dev->qpidr, qpn); +} diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.h 
b/drivers/infiniband/hw/qedr/qedr_iw_cm.h new file mode 100644 index 0000000..08f4b10 --- /dev/null +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.h @@ -0,0 +1,49 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +int qedr_iw_connect(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *conn_param); + +int qedr_iw_create_listen(struct iw_cm_id *cm_id, int backlog); + +int qedr_iw_destroy_listen(struct iw_cm_id *cm_id); + +int qedr_iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); + +int qedr_iw_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); + +void qedr_iw_qp_add_ref(struct ib_qp *qp); + +void qedr_iw_qp_rem_ref(struct ib_qp *qp); + +struct ib_qp *qedr_iw_get_qp(struct ib_device *dev, int qpn); diff --git a/drivers/infiniband/hw/qedr/qedr_roce_cm.c b/drivers/infiniband/hw/qedr/qedr_roce_cm.c new file mode 100644 index 0000000..0f14e68 --- /dev/null +++ b/drivers/infiniband/hw/qedr/qedr_roce_cm.c @@ -0,0 +1,741 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include "qedr.h" +#include "verbs.h" +#include +#include "qedr_roce_cm.h" + +void qedr_inc_sw_gsi_cons(struct qedr_qp_hwq_info *info) +{ + info->gsi_cons = (info->gsi_cons + 1) % info->max_wr; +} + +void qedr_store_gsi_qp_cq(struct qedr_dev *dev, struct qedr_qp *qp, + struct ib_qp_init_attr *attrs) +{ + dev->gsi_qp_created = 1; + dev->gsi_sqcq = get_qedr_cq(attrs->send_cq); + dev->gsi_rqcq = get_qedr_cq(attrs->recv_cq); + dev->gsi_qp = qp; +} + +static void qedr_ll2_complete_tx_packet(void *cxt, u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, + bool b_last_packet) +{ + struct qedr_dev *dev = (struct qedr_dev *)cxt; + struct qed_roce_ll2_packet *pkt = cookie; + struct qedr_cq *cq = dev->gsi_sqcq; + struct qedr_qp *qp = dev->gsi_qp; + unsigned long flags; + + DP_DEBUG(dev, QEDR_MSG_GSI, + "LL2 TX CB: gsi_sqcq=%p, gsi_rqcq=%p, gsi_cons=%d, ibcq_comp=%s\n", + dev->gsi_sqcq, dev->gsi_rqcq, qp->sq.gsi_cons, + cq->ibcq.comp_handler ? "Yes" : "No"); + + dma_free_coherent(&dev->pdev->dev, pkt->header.len, pkt->header.vaddr, + pkt->header.baddr); + kfree(pkt); + + spin_lock_irqsave(&qp->q_lock, flags); + qedr_inc_sw_gsi_cons(&qp->sq); + spin_unlock_irqrestore(&qp->q_lock, flags); + + if (cq->ibcq.comp_handler) + (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); +} + +static void qedr_ll2_complete_rx_packet(void *cxt, + struct qed_ll2_comp_rx_data *data) +{ + struct qedr_dev *dev = (struct qedr_dev *)cxt; + struct qedr_cq *cq = dev->gsi_rqcq; + struct qedr_qp *qp = dev->gsi_qp; + unsigned long flags; + + spin_lock_irqsave(&qp->q_lock, flags); + + qp->rqe_wr_id[qp->rq.gsi_cons].rc = data->u.data_length_error ? + -EINVAL : 0; + qp->rqe_wr_id[qp->rq.gsi_cons].vlan = data->vlan; + /* note: length stands for data length i.e. GRH is excluded */ + qp->rqe_wr_id[qp->rq.gsi_cons].sg_list[0].length = + data->length.data_length; + *((u32 *)&qp->rqe_wr_id[qp->rq.gsi_cons].smac[0]) = + ntohl(data->opaque_data_0); + *((u16 *)&qp->rqe_wr_id[qp->rq.gsi_cons].smac[4]) = + ntohs((u16)data->opaque_data_1); + + qedr_inc_sw_gsi_cons(&qp->rq); + + spin_unlock_irqrestore(&qp->q_lock, flags); + + if (cq->ibcq.comp_handler) + (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); +} + +static void qedr_ll2_release_rx_packet(void *cxt, u8 connection_handle, + void *cookie, dma_addr_t rx_buf_addr, + bool b_last_packet) +{ + /* Do nothing... 
*/ +} + +static void qedr_destroy_gsi_cq(struct qedr_dev *dev, + struct ib_qp_init_attr *attrs) +{ + struct qed_rdma_destroy_cq_in_params iparams; + struct qed_rdma_destroy_cq_out_params oparams; + struct qedr_cq *cq; + + cq = get_qedr_cq(attrs->send_cq); + iparams.icid = cq->icid; + dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); + dev->ops->common->chain_free(dev->cdev, &cq->pbl); + + cq = get_qedr_cq(attrs->recv_cq); + /* if a dedicated recv_cq was used, delete it too */ + if (iparams.icid != cq->icid) { + iparams.icid = cq->icid; + dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); + dev->ops->common->chain_free(dev->cdev, &cq->pbl); + } +} + +static inline int qedr_check_gsi_qp_attrs(struct qedr_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if (attrs->cap.max_recv_sge > QEDR_GSI_MAX_RECV_SGE) { + DP_ERR(dev, + " create gsi qp: failed. max_recv_sge is larger the max %d>%d\n", + attrs->cap.max_recv_sge, QEDR_GSI_MAX_RECV_SGE); + return -EINVAL; + } + + if (attrs->cap.max_recv_wr > QEDR_GSI_MAX_RECV_WR) { + DP_ERR(dev, + " create gsi qp: failed. max_recv_wr is too large %d>%d\n", + attrs->cap.max_recv_wr, QEDR_GSI_MAX_RECV_WR); + return -EINVAL; + } + + if (attrs->cap.max_send_wr > QEDR_GSI_MAX_SEND_WR) { + DP_ERR(dev, + " create gsi qp: failed. max_send_wr is too large %d>%d\n", + attrs->cap.max_send_wr, QEDR_GSI_MAX_SEND_WR); + return -EINVAL; + } + + return 0; +} + +static int qedr_ll2_post_tx(struct qedr_dev *dev, + struct qed_roce_ll2_packet *pkt) +{ + enum qed_ll2_roce_flavor_type roce_flavor; + struct qed_ll2_tx_pkt_info ll2_tx_pkt; + int rc; + int i; + + memset(&ll2_tx_pkt, 0, sizeof(ll2_tx_pkt)); + + roce_flavor = (pkt->roce_mode == ROCE_V1) ? + QED_LL2_ROCE : QED_LL2_RROCE; + + if (pkt->roce_mode == ROCE_V2_IPV4) + ll2_tx_pkt.enable_ip_cksum = 1; + + ll2_tx_pkt.num_of_bds = 1 /* hdr */ + pkt->n_seg; + ll2_tx_pkt.vlan = 0; + ll2_tx_pkt.tx_dest = pkt->tx_dest; + ll2_tx_pkt.qed_roce_flavor = roce_flavor; + ll2_tx_pkt.first_frag = pkt->header.baddr; + ll2_tx_pkt.first_frag_len = pkt->header.len; + ll2_tx_pkt.cookie = pkt; + + /* tx header */ + rc = dev->ops->ll2_prepare_tx_packet(dev->rdma_ctx, + dev->gsi_ll2_handle, + &ll2_tx_pkt, 1); + if (rc) { + /* TX failed while posting header - release resources */ + dma_free_coherent(&dev->pdev->dev, pkt->header.len, + pkt->header.vaddr, pkt->header.baddr); + kfree(pkt); + + DP_ERR(dev, "roce ll2 tx: header failed (rc=%d)\n", rc); + return rc; + } + + /* tx payload */ + for (i = 0; i < pkt->n_seg; i++) { + rc = dev->ops->ll2_set_fragment_of_tx_packet( + dev->rdma_ctx, + dev->gsi_ll2_handle, + pkt->payload[i].baddr, + pkt->payload[i].len); + + if (rc) { + /* if failed not much to do here, partial packet has + * been posted we can't free memory, will need to wait + * for completion + */ + DP_ERR(dev, "ll2 tx: payload failed (rc=%d)\n", rc); + return rc; + } + } + + return 0; +} + +static int qedr_ll2_stop(struct qedr_dev *dev) +{ + int rc; + + if (dev->gsi_ll2_handle == QED_LL2_UNUSED_HANDLE) + return 0; + + /* remove LL2 MAC address filter */ + rc = dev->ops->ll2_set_mac_filter(dev->cdev, + dev->gsi_ll2_mac_address, NULL); + + rc = dev->ops->ll2_terminate_connection(dev->rdma_ctx, + dev->gsi_ll2_handle); + if (rc) + DP_ERR(dev, "Failed to terminate LL2 connection (rc=%d)\n", rc); + + dev->ops->ll2_release_connection(dev->rdma_ctx, dev->gsi_ll2_handle); + + dev->gsi_ll2_handle = QED_LL2_UNUSED_HANDLE; + + return rc; +} + +static int qedr_ll2_start(struct qedr_dev *dev, + struct ib_qp_init_attr *attrs, struct qedr_qp 
*qp) +{ + struct qed_ll2_acquire_data data; + struct qed_ll2_cbs cbs; + int rc; + + /* configure and start LL2 */ + cbs.rx_comp_cb = qedr_ll2_complete_rx_packet; + cbs.tx_comp_cb = qedr_ll2_complete_tx_packet; + cbs.rx_release_cb = qedr_ll2_release_rx_packet; + cbs.tx_release_cb = qedr_ll2_complete_tx_packet; + cbs.cookie = dev; + + memset(&data, 0, sizeof(data)); + data.input.conn_type = QED_LL2_TYPE_ROCE; + data.input.mtu = dev->ndev->mtu; + data.input.rx_num_desc = attrs->cap.max_recv_wr; + data.input.rx_drop_ttl0_flg = true; + data.input.rx_vlan_removal_en = false; + data.input.tx_num_desc = attrs->cap.max_send_wr; + data.input.tx_tc = 0; + data.input.tx_dest = QED_LL2_TX_DEST_NW; + data.input.ai_err_packet_too_big = QED_LL2_DROP_PACKET; + data.input.ai_err_no_buf = QED_LL2_DROP_PACKET; + data.input.gsi_enable = 1; + data.p_connection_handle = &dev->gsi_ll2_handle; + data.cbs = &cbs; + + rc = dev->ops->ll2_acquire_connection(dev->rdma_ctx, &data); + if (rc) { + DP_ERR(dev, + "ll2 start: failed to acquire LL2 connection (rc=%d)\n", + rc); + return rc; + } + + rc = dev->ops->ll2_establish_connection(dev->rdma_ctx, + dev->gsi_ll2_handle); + if (rc) { + DP_ERR(dev, + "ll2 start: failed to establish LL2 connection (rc=%d)\n", + rc); + goto err1; + } + + rc = dev->ops->ll2_set_mac_filter(dev->cdev, NULL, dev->ndev->dev_addr); + if (rc) + goto err2; + + return 0; + +err2: + dev->ops->ll2_terminate_connection(dev->rdma_ctx, dev->gsi_ll2_handle); +err1: + dev->ops->ll2_release_connection(dev->rdma_ctx, dev->gsi_ll2_handle); + + return rc; +} + +struct ib_qp *qedr_create_gsi_qp(struct qedr_dev *dev, + struct ib_qp_init_attr *attrs, + struct qedr_qp *qp) +{ + int rc; + + rc = qedr_check_gsi_qp_attrs(dev, attrs); + if (rc) + return ERR_PTR(rc); + + rc = qedr_ll2_start(dev, attrs, qp); + if (rc) { + DP_ERR(dev, "create gsi qp: failed on ll2 start. 
rc=%d\n", rc); + return ERR_PTR(rc); + } + + /* create QP */ + qp->ibqp.qp_num = 1; + qp->rq.max_wr = attrs->cap.max_recv_wr; + qp->sq.max_wr = attrs->cap.max_send_wr; + + qp->rqe_wr_id = kcalloc(qp->rq.max_wr, sizeof(*qp->rqe_wr_id), + GFP_KERNEL); + if (!qp->rqe_wr_id) + goto err; + qp->wqe_wr_id = kcalloc(qp->sq.max_wr, sizeof(*qp->wqe_wr_id), + GFP_KERNEL); + if (!qp->wqe_wr_id) + goto err; + + qedr_store_gsi_qp_cq(dev, qp, attrs); + ether_addr_copy(dev->gsi_ll2_mac_address, dev->ndev->dev_addr); + + /* the GSI CQ is handled by the driver so remove it from the FW */ + qedr_destroy_gsi_cq(dev, attrs); + dev->gsi_rqcq->cq_type = QEDR_CQ_TYPE_GSI; + dev->gsi_rqcq->cq_type = QEDR_CQ_TYPE_GSI; + + DP_DEBUG(dev, QEDR_MSG_GSI, "created GSI QP %p\n", qp); + + return &qp->ibqp; + +err: + kfree(qp->rqe_wr_id); + + rc = qedr_ll2_stop(dev); + if (rc) + DP_ERR(dev, "create gsi qp: failed destroy on create\n"); + + return ERR_PTR(-ENOMEM); +} + +int qedr_destroy_gsi_qp(struct qedr_dev *dev) +{ + return qedr_ll2_stop(dev); +} + +#define QEDR_MAX_UD_HEADER_SIZE (100) +#define QEDR_GSI_QPN (1) +static inline int qedr_gsi_build_header(struct qedr_dev *dev, + struct qedr_qp *qp, + struct ib_send_wr *swr, + struct ib_ud_header *udh, + int *roce_mode) +{ + bool has_vlan = false, has_grh_ipv6 = true; + struct rdma_ah_attr *ah_attr = &get_qedr_ah(ud_wr(swr)->ah)->attr; + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + union ib_gid sgid; + int send_size = 0; + u16 vlan_id = 0; + u16 ether_type; + struct ib_gid_attr sgid_attr; + int rc; + int ip_ver = 0; + + bool has_udp = false; + int i; + + send_size = 0; + for (i = 0; i < swr->num_sge; ++i) + send_size += swr->sg_list[i].length; + + rc = ib_get_cached_gid(qp->ibqp.device, rdma_ah_get_port_num(ah_attr), + grh->sgid_index, &sgid, &sgid_attr); + if (rc) { + DP_ERR(dev, + "gsi post send: failed to get cached GID (port=%d, ix=%d)\n", + rdma_ah_get_port_num(ah_attr), + grh->sgid_index); + return rc; + } + + vlan_id = rdma_vlan_dev_vlan_id(sgid_attr.ndev); + if (vlan_id < VLAN_CFI_MASK) + has_vlan = true; + + dev_put(sgid_attr.ndev); + + has_udp = (sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP); + if (!has_udp) { + /* RoCE v1 */ + ether_type = ETH_P_IBOE; + *roce_mode = ROCE_V1; + } else if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) { + /* RoCE v2 IPv4 */ + ip_ver = 4; + ether_type = ETH_P_IP; + has_grh_ipv6 = false; + *roce_mode = ROCE_V2_IPV4; + } else { + /* RoCE v2 IPv6 */ + ip_ver = 6; + ether_type = ETH_P_IPV6; + *roce_mode = ROCE_V2_IPV6; + } + + rc = ib_ud_header_init(send_size, false, true, has_vlan, + has_grh_ipv6, ip_ver, has_udp, 0, udh); + if (rc) { + DP_ERR(dev, "gsi post send: failed to init header\n"); + return rc; + } + + /* ENET + VLAN headers */ + ether_addr_copy(udh->eth.dmac_h, ah_attr->roce.dmac); + ether_addr_copy(udh->eth.smac_h, dev->ndev->dev_addr); + if (has_vlan) { + udh->eth.type = htons(ETH_P_8021Q); + udh->vlan.tag = htons(vlan_id); + udh->vlan.type = htons(ether_type); + } else { + udh->eth.type = htons(ether_type); + } + + /* BTH */ + udh->bth.solicited_event = !!(swr->send_flags & IB_SEND_SOLICITED); + udh->bth.pkey = QEDR_ROCE_PKEY_DEFAULT; + udh->bth.destination_qpn = htonl(ud_wr(swr)->remote_qpn); + udh->bth.psn = htonl((qp->sq_psn++) & ((1 << 24) - 1)); + udh->bth.opcode = IB_OPCODE_UD_SEND_ONLY; + + /* DETH */ + udh->deth.qkey = htonl(0x80010000); + udh->deth.source_qpn = htonl(QEDR_GSI_QPN); + + if (has_grh_ipv6) { + /* GRH / IPv6 header */ + udh->grh.traffic_class = grh->traffic_class; + 
udh->grh.flow_label = grh->flow_label; + udh->grh.hop_limit = grh->hop_limit; + udh->grh.destination_gid = grh->dgid; + memcpy(&udh->grh.source_gid.raw, &sgid.raw, + sizeof(udh->grh.source_gid.raw)); + } else { + /* IPv4 header */ + u32 ipv4_addr; + + udh->ip4.protocol = IPPROTO_UDP; + udh->ip4.tos = htonl(grh->flow_label); + udh->ip4.frag_off = htons(IP_DF); + udh->ip4.ttl = grh->hop_limit; + + ipv4_addr = qedr_get_ipv4_from_gid(sgid.raw); + udh->ip4.saddr = ipv4_addr; + ipv4_addr = qedr_get_ipv4_from_gid(grh->dgid.raw); + udh->ip4.daddr = ipv4_addr; + /* note: checksum is calculated by the device */ + } + + /* UDP */ + if (has_udp) { + udh->udp.sport = htons(QEDR_ROCE_V2_UDP_SPORT); + udh->udp.dport = htons(ROCE_V2_UDP_DPORT); + udh->udp.csum = 0; + /* UDP length is untouched hence is zero */ + } + return 0; +} + +static inline int qedr_gsi_build_packet(struct qedr_dev *dev, + struct qedr_qp *qp, + struct ib_send_wr *swr, + struct qed_roce_ll2_packet **p_packet) +{ + u8 ud_header_buffer[QEDR_MAX_UD_HEADER_SIZE]; + struct qed_roce_ll2_packet *packet; + struct pci_dev *pdev = dev->pdev; + int roce_mode, header_size; + struct ib_ud_header udh; + int i, rc; + + *p_packet = NULL; + + rc = qedr_gsi_build_header(dev, qp, swr, &udh, &roce_mode); + if (rc) + return rc; + + header_size = ib_ud_header_pack(&udh, &ud_header_buffer); + + packet = kzalloc(sizeof(*packet), GFP_ATOMIC); + if (!packet) + return -ENOMEM; + + packet->header.vaddr = dma_alloc_coherent(&pdev->dev, header_size, + &packet->header.baddr, + GFP_ATOMIC); + if (!packet->header.vaddr) { + kfree(packet); + return -ENOMEM; + } + + if (ether_addr_equal(udh.eth.smac_h, udh.eth.dmac_h)) + packet->tx_dest = QED_ROCE_LL2_TX_DEST_LB; + else + packet->tx_dest = QED_ROCE_LL2_TX_DEST_NW; + + packet->roce_mode = roce_mode; + memcpy(packet->header.vaddr, ud_header_buffer, header_size); + packet->header.len = header_size; + packet->n_seg = swr->num_sge; + for (i = 0; i < packet->n_seg; i++) { + packet->payload[i].baddr = swr->sg_list[i].addr; + packet->payload[i].len = swr->sg_list[i].length; + } + + *p_packet = packet; + + return 0; +} + +int qedr_gsi_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct qed_roce_ll2_packet *pkt = NULL; + struct qedr_qp *qp = get_qedr_qp(ibqp); + struct qedr_dev *dev = qp->dev; + unsigned long flags; + int rc; + + if (qp->state != QED_ROCE_QP_STATE_RTS) { + *bad_wr = wr; + DP_ERR(dev, + "gsi post recv: failed to post rx buffer. 
state is %d and not QED_ROCE_QP_STATE_RTS\n", + qp->state); + return -EINVAL; + } + + if (wr->num_sge > RDMA_MAX_SGE_PER_SQ_WQE) { + DP_ERR(dev, "gsi post send: num_sge is too large (%d>%d)\n", + wr->num_sge, RDMA_MAX_SGE_PER_SQ_WQE); + rc = -EINVAL; + goto err; + } + + if (wr->opcode != IB_WR_SEND) { + DP_ERR(dev, + "gsi post send: failed due to unsupported opcode %d\n", + wr->opcode); + rc = -EINVAL; + goto err; + } + + spin_lock_irqsave(&qp->q_lock, flags); + + rc = qedr_gsi_build_packet(dev, qp, wr, &pkt); + if (rc) { + spin_unlock_irqrestore(&qp->q_lock, flags); + goto err; + } + + rc = qedr_ll2_post_tx(dev, pkt); + + if (!rc) { + qp->wqe_wr_id[qp->sq.prod].wr_id = wr->wr_id; + qedr_inc_sw_prod(&qp->sq); + DP_DEBUG(qp->dev, QEDR_MSG_GSI, + "gsi post send: opcode=%d, in_irq=%ld, irqs_disabled=%d, wr_id=%llx\n", + wr->opcode, in_irq(), irqs_disabled(), wr->wr_id); + } else { + DP_ERR(dev, "gsi post send: failed to transmit (rc=%d)\n", rc); + rc = -EAGAIN; + *bad_wr = wr; + } + + spin_unlock_irqrestore(&qp->q_lock, flags); + + if (wr->next) { + DP_ERR(dev, + "gsi post send: failed second WR. Only one WR may be passed at a time\n"); + *bad_wr = wr->next; + rc = -EINVAL; + } + + return rc; + +err: + *bad_wr = wr; + return rc; +} + +int qedr_gsi_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct qedr_dev *dev = get_qedr_dev(ibqp->device); + struct qedr_qp *qp = get_qedr_qp(ibqp); + unsigned long flags; + int rc = 0; + + if ((qp->state != QED_ROCE_QP_STATE_RTR) && + (qp->state != QED_ROCE_QP_STATE_RTS)) { + *bad_wr = wr; + DP_ERR(dev, + "gsi post recv: failed to post rx buffer. state is %d and not QED_ROCE_QP_STATE_RTR/S\n", + qp->state); + return -EINVAL; + } + + spin_lock_irqsave(&qp->q_lock, flags); + + while (wr) { + if (wr->num_sge > QEDR_GSI_MAX_RECV_SGE) { + DP_ERR(dev, + "gsi post recv: failed to post rx buffer. too many sges %d>%d\n", + wr->num_sge, QEDR_GSI_MAX_RECV_SGE); + goto err; + } + + rc = dev->ops->ll2_post_rx_buffer(dev->rdma_ctx, + dev->gsi_ll2_handle, + wr->sg_list[0].addr, + wr->sg_list[0].length, + NULL /* cookie */, + 1 /* notify_fw */); + if (rc) { + DP_ERR(dev, + "gsi post recv: failed to post rx buffer (rc=%d)\n", + rc); + goto err; + } + + memset(&qp->rqe_wr_id[qp->rq.prod], 0, + sizeof(qp->rqe_wr_id[qp->rq.prod])); + qp->rqe_wr_id[qp->rq.prod].sg_list[0] = wr->sg_list[0]; + qp->rqe_wr_id[qp->rq.prod].wr_id = wr->wr_id; + + qedr_inc_sw_prod(&qp->rq); + + wr = wr->next; + } + + spin_unlock_irqrestore(&qp->q_lock, flags); + + return rc; +err: + spin_unlock_irqrestore(&qp->q_lock, flags); + *bad_wr = wr; + return -ENOMEM; +} + +int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct qedr_dev *dev = get_qedr_dev(ibcq->device); + struct qedr_cq *cq = get_qedr_cq(ibcq); + struct qedr_qp *qp = dev->gsi_qp; + unsigned long flags; + u16 vlan_id; + int i = 0; + + spin_lock_irqsave(&cq->cq_lock, flags); + + while (i < num_entries && qp->rq.cons != qp->rq.gsi_cons) { + memset(&wc[i], 0, sizeof(*wc)); + + wc[i].qp = &qp->ibqp; + wc[i].wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; + wc[i].opcode = IB_WC_RECV; + wc[i].pkey_index = 0; + wc[i].status = (qp->rqe_wr_id[qp->rq.cons].rc) ? 
+ IB_WC_GENERAL_ERR : IB_WC_SUCCESS; + /* 0 - currently only one recv sg is supported */ + wc[i].byte_len = qp->rqe_wr_id[qp->rq.cons].sg_list[0].length; + wc[i].wc_flags |= IB_WC_GRH | IB_WC_IP_CSUM_OK; + ether_addr_copy(wc[i].smac, qp->rqe_wr_id[qp->rq.cons].smac); + wc[i].wc_flags |= IB_WC_WITH_SMAC; + + vlan_id = qp->rqe_wr_id[qp->rq.cons].vlan & + VLAN_VID_MASK; + if (vlan_id) { + wc[i].wc_flags |= IB_WC_WITH_VLAN; + wc[i].vlan_id = vlan_id; + wc[i].sl = (qp->rqe_wr_id[qp->rq.cons].vlan & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } + + qedr_inc_sw_cons(&qp->rq); + i++; + } + + while (i < num_entries && qp->sq.cons != qp->sq.gsi_cons) { + memset(&wc[i], 0, sizeof(*wc)); + + wc[i].qp = &qp->ibqp; + wc[i].wr_id = qp->wqe_wr_id[qp->sq.cons].wr_id; + wc[i].opcode = IB_WC_SEND; + wc[i].status = IB_WC_SUCCESS; + + qedr_inc_sw_cons(&qp->sq); + i++; + } + + spin_unlock_irqrestore(&cq->cq_lock, flags); + + DP_DEBUG(dev, QEDR_MSG_GSI, + "gsi poll_cq: requested entries=%d, actual=%d, qp->rq.cons=%d, qp->rq.gsi_cons=%x, qp->sq.cons=%d, qp->sq.gsi_cons=%d, qp_num=%d\n", + num_entries, i, qp->rq.cons, qp->rq.gsi_cons, qp->sq.cons, + qp->sq.gsi_cons, qp->ibqp.qp_num); + + return i; +} diff --git a/drivers/infiniband/hw/qedr/qedr_roce_cm.h b/drivers/infiniband/hw/qedr/qedr_roce_cm.h new file mode 100644 index 0000000..a559163 --- /dev/null +++ b/drivers/infiniband/hw/qedr/qedr_roce_cm.h @@ -0,0 +1,60 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef LINUX_QEDR_CM_H_ +#define LINUX_QEDR_CM_H_ + +#define QEDR_GSI_MAX_RECV_WR (4096) +#define QEDR_GSI_MAX_SEND_WR (4096) + +#define QEDR_GSI_MAX_RECV_SGE (1) /* LL2 FW limitation */ + +#define QEDR_ROCE_V2_UDP_SPORT (0000) + +static inline u32 qedr_get_ipv4_from_gid(const u8 *gid) +{ + return *(u32 *)(void *)&gid[12]; +} + +/* RDMA CM */ +int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int qedr_gsi_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int qedr_gsi_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +struct ib_qp *qedr_create_gsi_qp(struct qedr_dev *dev, + struct ib_qp_init_attr *attrs, + struct qedr_qp *qp); +void qedr_store_gsi_qp_cq(struct qedr_dev *dev, + struct qedr_qp *qp, struct ib_qp_init_attr *attrs); +int qedr_destroy_gsi_qp(struct qedr_dev *dev); +void qedr_inc_sw_gsi_cons(struct qedr_qp_hwq_info *info); +#endif diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c new file mode 100644 index 0000000..3f9afc0 --- /dev/null +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -0,0 +1,3792 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include "qedr_hsi_rdma.h" +#include +#include "qedr.h" +#include "verbs.h" +#include +#include "qedr_roce_cm.h" + +#define DB_ADDR_SHIFT(addr) ((addr) << DB_PWM_ADDR_OFFSET_SHIFT) + +static inline int qedr_ib_copy_to_udata(struct ib_udata *udata, void *src, + size_t len) +{ + size_t min_len = min_t(size_t, len, udata->outlen); + + return ib_copy_to_udata(udata, src, min_len); +} + +int qedr_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + if (index > QEDR_ROCE_PKEY_TABLE_LEN) + return -EINVAL; + + *pkey = QEDR_ROCE_PKEY_DEFAULT; + return 0; +} + +int qedr_iw_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *sgid) +{ + struct qedr_dev *dev = get_qedr_dev(ibdev); + + memset(sgid->raw, 0, sizeof(sgid->raw)); + ether_addr_copy(sgid->raw, dev->ndev->dev_addr); + + DP_DEBUG(dev, QEDR_MSG_INIT, "QUERY sgid[%d]=%llx:%llx\n", index, + sgid->global.interface_id, sgid->global.subnet_prefix); + + return 0; +} + +int qedr_query_device(struct ib_device *ibdev, + struct ib_device_attr *attr, struct ib_udata *udata) +{ + struct qedr_dev *dev = get_qedr_dev(ibdev); + struct qedr_device_attr *qattr = &dev->attr; + + if (!dev->rdma_ctx) { + DP_ERR(dev, + "qedr_query_device called with invalid params rdma_ctx=%p\n", + dev->rdma_ctx); + return -EINVAL; + } + + memset(attr, 0, sizeof(*attr)); + + attr->fw_ver = qattr->fw_ver; + attr->sys_image_guid = qattr->sys_image_guid; + attr->max_mr_size = qattr->max_mr_size; + attr->page_size_cap = qattr->page_size_caps; + attr->vendor_id = qattr->vendor_id; + attr->vendor_part_id = qattr->vendor_part_id; + attr->hw_ver = qattr->hw_ver; + attr->max_qp = qattr->max_qp; + attr->max_qp_wr = max_t(u32, qattr->max_sqe, qattr->max_rqe); + attr->device_cap_flags = IB_DEVICE_CURR_QP_STATE_MOD | + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_MGT_EXTENSIONS; + + attr->max_sge = qattr->max_sge; + attr->max_sge_rd = qattr->max_sge; + attr->max_cq = qattr->max_cq; + attr->max_cqe = qattr->max_cqe; + attr->max_mr = qattr->max_mr; + attr->max_mw = qattr->max_mw; + attr->max_pd = qattr->max_pd; + attr->atomic_cap = dev->atomic_cap; + attr->max_fmr = qattr->max_fmr; + attr->max_map_per_fmr = 16; + attr->max_qp_init_rd_atom = + 1 << (fls(qattr->max_qp_req_rd_atomic_resc) - 1); + attr->max_qp_rd_atom = + min(1 << (fls(qattr->max_qp_resp_rd_atomic_resc) - 1), + attr->max_qp_init_rd_atom); + + attr->max_srq = qattr->max_srq; + attr->max_srq_sge = qattr->max_srq_sge; + attr->max_srq_wr = qattr->max_srq_wr; + + attr->local_ca_ack_delay = qattr->dev_ack_delay; + attr->max_fast_reg_page_list_len = qattr->max_mr / 8; + attr->max_pkeys = QEDR_ROCE_PKEY_MAX; + attr->max_ah = qattr->max_ah; + + return 0; +} + +#define QEDR_SPEED_SDR (1) +#define QEDR_SPEED_DDR (2) +#define QEDR_SPEED_QDR (4) +#define QEDR_SPEED_FDR10 (8) +#define QEDR_SPEED_FDR (16) +#define QEDR_SPEED_EDR (32) + +static inline void get_link_speed_and_width(int speed, u8 *ib_speed, + u8 *ib_width) +{ + switch (speed) { + case 1000: + *ib_speed = QEDR_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + break; + case 10000: + *ib_speed = QEDR_SPEED_QDR; + *ib_width = IB_WIDTH_1X; + break; + + case 20000: + *ib_speed = QEDR_SPEED_DDR; + *ib_width = IB_WIDTH_4X; + break; + + case 25000: + *ib_speed = QEDR_SPEED_EDR; + *ib_width = IB_WIDTH_1X; + break; + + case 40000: + *ib_speed = QEDR_SPEED_QDR; + *ib_width = IB_WIDTH_4X; + break; + + case 
50000: + *ib_speed = QEDR_SPEED_QDR; + *ib_width = IB_WIDTH_4X; + break; + + case 100000: + *ib_speed = QEDR_SPEED_EDR; + *ib_width = IB_WIDTH_4X; + break; + + default: + /* Unsupported */ + *ib_speed = QEDR_SPEED_SDR; + *ib_width = IB_WIDTH_1X; + } +} + +int qedr_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *attr) +{ + struct qedr_dev *dev; + struct qed_rdma_port *rdma_port; + + dev = get_qedr_dev(ibdev); + if (port > 1) { + DP_ERR(dev, "invalid_port=0x%x\n", port); + return -EINVAL; + } + + if (!dev->rdma_ctx) { + DP_ERR(dev, "rdma_ctx is NULL\n"); + return -EINVAL; + } + + rdma_port = dev->ops->rdma_query_port(dev->rdma_ctx); + + /* *attr being zeroed by the caller, avoid zeroing it here */ + if (rdma_port->port_state == QED_RDMA_PORT_UP) { + attr->state = IB_PORT_ACTIVE; + attr->phys_state = 5; + } else { + attr->state = IB_PORT_DOWN; + attr->phys_state = 3; + } + attr->max_mtu = IB_MTU_4096; + attr->active_mtu = iboe_get_mtu(dev->ndev->mtu); + attr->lid = 0; + attr->lmc = 0; + attr->sm_lid = 0; + attr->sm_sl = 0; + attr->port_cap_flags = IB_PORT_IP_BASED_GIDS; + if (rdma_protocol_iwarp(&dev->ibdev, 1)) { + attr->gid_tbl_len = 1; + attr->pkey_tbl_len = 1; + } else { + attr->gid_tbl_len = QEDR_MAX_SGID; + attr->pkey_tbl_len = QEDR_ROCE_PKEY_TABLE_LEN; + } + attr->bad_pkey_cntr = rdma_port->pkey_bad_counter; + attr->qkey_viol_cntr = 0; + get_link_speed_and_width(rdma_port->link_speed, + &attr->active_speed, &attr->active_width); + attr->max_msg_sz = rdma_port->max_msg_size; + attr->max_vl_num = 4; + + return 0; +} + +int qedr_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct qedr_dev *dev; + + dev = get_qedr_dev(ibdev); + if (port > 1) { + DP_ERR(dev, "invalid_port=0x%x\n", port); + return -EINVAL; + } + + return 0; +} + +static int qedr_add_mmap(struct qedr_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + struct qedr_mm *mm; + + mm = kzalloc(sizeof(*mm), GFP_KERNEL); + if (!mm) + return -ENOMEM; + + mm->key.phy_addr = phy_addr; + /* This function might be called with a length which is not a multiple + * of PAGE_SIZE, while the mapping is PAGE_SIZE grained and the kernel + * forces this granularity by increasing the requested size if needed. + * When qedr_mmap is called, it will search the list with the updated + * length as a key. To prevent search failures, the length is rounded up + * in advance to PAGE_SIZE. 
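+ * For example, with 4 KiB pages a request of 0x100 bytes is stored
+ * with key.len = roundup(0x100, PAGE_SIZE) = 0x1000.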
+ */ + mm->key.len = roundup(len, PAGE_SIZE); + INIT_LIST_HEAD(&mm->entry); + + mutex_lock(&uctx->mm_list_lock); + list_add(&mm->entry, &uctx->mm_head); + mutex_unlock(&uctx->mm_list_lock); + + DP_DEBUG(uctx->dev, QEDR_MSG_MISC, + "added (addr=0x%llx,len=0x%lx) for ctx=%p\n", + (unsigned long long)mm->key.phy_addr, + (unsigned long)mm->key.len, uctx); + + return 0; +} + +static bool qedr_search_mmap(struct qedr_ucontext *uctx, u64 phy_addr, + unsigned long len) +{ + bool found = false; + struct qedr_mm *mm; + + mutex_lock(&uctx->mm_list_lock); + list_for_each_entry(mm, &uctx->mm_head, entry) { + if (len != mm->key.len || phy_addr != mm->key.phy_addr) + continue; + + found = true; + break; + } + mutex_unlock(&uctx->mm_list_lock); + DP_DEBUG(uctx->dev, QEDR_MSG_MISC, + "searched for (addr=0x%llx,len=0x%lx) for ctx=%p, result=%d\n", + mm->key.phy_addr, mm->key.len, uctx, found); + + return found; +} + +struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + int rc; + struct qedr_ucontext *ctx; + struct qedr_alloc_ucontext_resp uresp; + struct qedr_dev *dev = get_qedr_dev(ibdev); + struct qed_rdma_add_user_out_params oparams; + + if (!udata) + return ERR_PTR(-EFAULT); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + rc = dev->ops->rdma_add_user(dev->rdma_ctx, &oparams); + if (rc) { + DP_ERR(dev, + "failed to allocate a DPI for a new RoCE application, rc=%d. To overcome this consider to increase the number of DPIs, increase the doorbell BAR size or just close unnecessary RoCE applications. In order to increase the number of DPIs consult the qedr readme\n", + rc); + goto err; + } + + ctx->dpi = oparams.dpi; + ctx->dpi_addr = oparams.dpi_addr; + ctx->dpi_phys_addr = oparams.dpi_phys_addr; + ctx->dpi_size = oparams.dpi_size; + INIT_LIST_HEAD(&ctx->mm_head); + mutex_init(&ctx->mm_list_lock); + + memset(&uresp, 0, sizeof(uresp)); + + uresp.dpm_enabled = dev->user_dpm_enabled; + uresp.wids_enabled = 1; + uresp.wid_count = oparams.wid_count; + uresp.db_pa = ctx->dpi_phys_addr; + uresp.db_size = ctx->dpi_size; + uresp.max_send_wr = dev->attr.max_sqe; + uresp.max_recv_wr = dev->attr.max_rqe; + uresp.max_srq_wr = dev->attr.max_srq_wr; + uresp.sges_per_send_wr = QEDR_MAX_SQE_ELEMENTS_PER_SQE; + uresp.sges_per_recv_wr = QEDR_MAX_RQE_ELEMENTS_PER_RQE; + uresp.sges_per_srq_wr = dev->attr.max_srq_sge; + uresp.max_cqes = QEDR_MAX_CQES; + + rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rc) + goto err; + + ctx->dev = dev; + + rc = qedr_add_mmap(ctx, ctx->dpi_phys_addr, ctx->dpi_size); + if (rc) + goto err; + + DP_DEBUG(dev, QEDR_MSG_INIT, "Allocating user context %p\n", + &ctx->ibucontext); + return &ctx->ibucontext; + +err: + kfree(ctx); + return ERR_PTR(rc); +} + +int qedr_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + struct qedr_ucontext *uctx = get_qedr_ucontext(ibctx); + struct qedr_mm *mm, *tmp; + int status = 0; + + DP_DEBUG(uctx->dev, QEDR_MSG_INIT, "Deallocating user context %p\n", + uctx); + uctx->dev->ops->rdma_remove_user(uctx->dev->rdma_ctx, uctx->dpi); + + list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { + DP_DEBUG(uctx->dev, QEDR_MSG_MISC, + "deleted (addr=0x%llx,len=0x%lx) for ctx=%p\n", + mm->key.phy_addr, mm->key.len, uctx); + list_del(&mm->entry); + kfree(mm); + } + + kfree(uctx); + return status; +} + +int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct qedr_ucontext *ucontext = get_qedr_ucontext(context); + struct qedr_dev *dev = 
get_qedr_dev(context->device); + unsigned long phys_addr = vma->vm_pgoff << PAGE_SHIFT; + unsigned long len = (vma->vm_end - vma->vm_start); + unsigned long dpi_start; + + dpi_start = dev->db_phys_addr + (ucontext->dpi * ucontext->dpi_size); + + DP_DEBUG(dev, QEDR_MSG_INIT, + "mmap invoked with vm_start=0x%pK, vm_end=0x%pK,vm_pgoff=0x%pK; dpi_start=0x%pK dpi_size=0x%x\n", + (void *)vma->vm_start, (void *)vma->vm_end, + (void *)vma->vm_pgoff, (void *)dpi_start, ucontext->dpi_size); + + if ((vma->vm_start & (PAGE_SIZE - 1)) || (len & (PAGE_SIZE - 1))) { + DP_ERR(dev, + "failed mmap, adrresses must be page aligned: start=0x%pK, end=0x%pK\n", + (void *)vma->vm_start, (void *)vma->vm_end); + return -EINVAL; + } + + if (!qedr_search_mmap(ucontext, phys_addr, len)) { + DP_ERR(dev, "failed mmap, vm_pgoff=0x%lx is not authorized\n", + vma->vm_pgoff); + return -EINVAL; + } + + if (phys_addr < dpi_start || + ((phys_addr + len) > (dpi_start + ucontext->dpi_size))) { + DP_ERR(dev, + "failed mmap, pages are outside of dpi; page address=0x%pK, dpi_start=0x%pK, dpi_size=0x%x\n", + (void *)phys_addr, (void *)dpi_start, + ucontext->dpi_size); + return -EINVAL; + } + + if (vma->vm_flags & VM_READ) { + DP_ERR(dev, "failed mmap, cannot map doorbell bar for read\n"); + return -EINVAL; + } + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, len, + vma->vm_page_prot); +} + +struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct qedr_dev *dev = get_qedr_dev(ibdev); + struct qedr_pd *pd; + u16 pd_id; + int rc; + + DP_DEBUG(dev, QEDR_MSG_INIT, "Function called from: %s\n", + (udata && context) ? "User Lib" : "Kernel"); + + if (!dev->rdma_ctx) { + DP_ERR(dev, "invalid RDMA context\n"); + return ERR_PTR(-EINVAL); + } + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + rc = dev->ops->rdma_alloc_pd(dev->rdma_ctx, &pd_id); + if (rc) + goto err; + + pd->pd_id = pd_id; + + if (udata && context) { + struct qedr_alloc_pd_uresp uresp = { + .pd_id = pd_id, + }; + + rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rc) { + DP_ERR(dev, "copy error pd_id=0x%x.\n", pd_id); + dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd_id); + goto err; + } + + pd->uctx = get_qedr_ucontext(context); + pd->uctx->pd = pd; + } + + return &pd->ibpd; + +err: + kfree(pd); + return ERR_PTR(rc); +} + +int qedr_dealloc_pd(struct ib_pd *ibpd) +{ + struct qedr_dev *dev = get_qedr_dev(ibpd->device); + struct qedr_pd *pd = get_qedr_pd(ibpd); + + if (!pd) { + pr_err("Invalid PD received in dealloc_pd\n"); + return -EINVAL; + } + + DP_DEBUG(dev, QEDR_MSG_INIT, "Deallocating PD %d\n", pd->pd_id); + dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd->pd_id); + + kfree(pd); + + return 0; +} + +static void qedr_free_pbl(struct qedr_dev *dev, + struct qedr_pbl_info *pbl_info, struct qedr_pbl *pbl) +{ + struct pci_dev *pdev = dev->pdev; + int i; + + for (i = 0; i < pbl_info->num_pbls; i++) { + if (!pbl[i].va) + continue; + dma_free_coherent(&pdev->dev, pbl_info->pbl_size, + pbl[i].va, pbl[i].pa); + } + + kfree(pbl); +} + +#define MIN_FW_PBL_PAGE_SIZE (4 * 1024) +#define MAX_FW_PBL_PAGE_SIZE (64 * 1024) + +#define NUM_PBES_ON_PAGE(_page_size) (_page_size / sizeof(u64)) +#define MAX_PBES_ON_PAGE NUM_PBES_ON_PAGE(MAX_FW_PBL_PAGE_SIZE) +#define MAX_PBES_TWO_LAYER (MAX_PBES_ON_PAGE * MAX_PBES_ON_PAGE) + +static struct qedr_pbl *qedr_alloc_pbl_tbl(struct qedr_dev *dev, + struct qedr_pbl_info 
*pbl_info, + gfp_t flags) +{ + struct pci_dev *pdev = dev->pdev; + struct qedr_pbl *pbl_table; + dma_addr_t *pbl_main_tbl; + dma_addr_t pa; + void *va; + int i; + + pbl_table = kcalloc(pbl_info->num_pbls, sizeof(*pbl_table), flags); + if (!pbl_table) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < pbl_info->num_pbls; i++) { + va = dma_zalloc_coherent(&pdev->dev, pbl_info->pbl_size, + &pa, flags); + if (!va) + goto err; + + pbl_table[i].va = va; + pbl_table[i].pa = pa; + } + + /* Two-Layer PBLs, if we have more than one pbl we need to initialize + * the first one with physical pointers to all of the rest + */ + pbl_main_tbl = (dma_addr_t *)pbl_table[0].va; + for (i = 0; i < pbl_info->num_pbls - 1; i++) + pbl_main_tbl[i] = pbl_table[i + 1].pa; + + return pbl_table; + +err: + for (i--; i >= 0; i--) + dma_free_coherent(&pdev->dev, pbl_info->pbl_size, + pbl_table[i].va, pbl_table[i].pa); + + qedr_free_pbl(dev, pbl_info, pbl_table); + + return ERR_PTR(-ENOMEM); +} + +static int qedr_prepare_pbl_tbl(struct qedr_dev *dev, + struct qedr_pbl_info *pbl_info, + u32 num_pbes, int two_layer_capable) +{ + u32 pbl_capacity; + u32 pbl_size; + u32 num_pbls; + + if ((num_pbes > MAX_PBES_ON_PAGE) && two_layer_capable) { + if (num_pbes > MAX_PBES_TWO_LAYER) { + DP_ERR(dev, "prepare pbl table: too many pages %d\n", + num_pbes); + return -EINVAL; + } + + /* calculate required pbl page size */ + pbl_size = MIN_FW_PBL_PAGE_SIZE; + pbl_capacity = NUM_PBES_ON_PAGE(pbl_size) * + NUM_PBES_ON_PAGE(pbl_size); + + while (pbl_capacity < num_pbes) { + pbl_size *= 2; + pbl_capacity = pbl_size / sizeof(u64); + pbl_capacity = pbl_capacity * pbl_capacity; + } + + num_pbls = DIV_ROUND_UP(num_pbes, NUM_PBES_ON_PAGE(pbl_size)); + num_pbls++; /* One for the layer0 ( points to the pbls) */ + pbl_info->two_layered = true; + } else { + /* One layered PBL */ + num_pbls = 1; + pbl_size = max_t(u32, MIN_FW_PBL_PAGE_SIZE, + roundup_pow_of_two((num_pbes * sizeof(u64)))); + pbl_info->two_layered = false; + } + + pbl_info->num_pbls = num_pbls; + pbl_info->pbl_size = pbl_size; + pbl_info->num_pbes = num_pbes; + + DP_DEBUG(dev, QEDR_MSG_MR, + "prepare pbl table: num_pbes=%d, num_pbls=%d, pbl_size=%d\n", + pbl_info->num_pbes, pbl_info->num_pbls, pbl_info->pbl_size); + + return 0; +} + +static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem, + struct qedr_pbl *pbl, + struct qedr_pbl_info *pbl_info, u32 pg_shift) +{ + int shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0; + u32 fw_pg_cnt, fw_pg_per_umem_pg; + struct qedr_pbl *pbl_tbl; + struct scatterlist *sg; + struct regpair *pbe; + u64 pg_addr; + int entry; + + if (!pbl_info->num_pbes) + return; + + /* If we have a two layered pbl, the first pbl points to the rest + * of the pbls and the first entry lays on the second pbl in the table + */ + if (pbl_info->two_layered) + pbl_tbl = &pbl[1]; + else + pbl_tbl = pbl; + + pbe = (struct regpair *)pbl_tbl->va; + if (!pbe) { + DP_ERR(dev, "cannot populate PBL due to a NULL PBE\n"); + return; + } + + pbe_cnt = 0; + + shift = umem->page_shift; + + fw_pg_per_umem_pg = BIT(umem->page_shift - pg_shift); + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> shift; + pg_addr = sg_dma_address(sg); + for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { + for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) { + pbe->lo = cpu_to_le32(pg_addr); + pbe->hi = cpu_to_le32(upper_32_bits(pg_addr)); + + pg_addr += BIT(pg_shift); + pbe_cnt++; + total_num_pbes++; + pbe++; + + if (total_num_pbes == pbl_info->num_pbes) + return; + 
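+ /* Not done yet: each umem page spans fw_pg_per_umem_pg firmware-sized pages, so more PBEs may still follow for the current scatterlist page before advancing to the next one. */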
+ /* If the given pbl is full storing the pbes, + * move to next pbl. + */ + if (pbe_cnt == + (pbl_info->pbl_size / sizeof(u64))) { + pbl_tbl++; + pbe = (struct regpair *)pbl_tbl->va; + pbe_cnt = 0; + } + + fw_pg_cnt++; + } + } + } +} + +static int qedr_copy_cq_uresp(struct qedr_dev *dev, + struct qedr_cq *cq, struct ib_udata *udata) +{ + struct qedr_create_cq_uresp uresp; + int rc; + + memset(&uresp, 0, sizeof(uresp)); + + uresp.db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT); + uresp.icid = cq->icid; + + rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rc) + DP_ERR(dev, "copy error cqid=0x%x.\n", cq->icid); + + return rc; +} + +static void consume_cqe(struct qedr_cq *cq) +{ + if (cq->latest_cqe == cq->toggle_cqe) + cq->pbl_toggle ^= RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK; + + cq->latest_cqe = qed_chain_consume(&cq->pbl); +} + +static inline int qedr_align_cq_entries(int entries) +{ + u64 size, aligned_size; + + /* We allocate an extra entry that we don't report to the FW. */ + size = (entries + 1) * QEDR_CQE_SIZE; + aligned_size = ALIGN(size, PAGE_SIZE); + + return aligned_size / QEDR_CQE_SIZE; +} + +static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx, + struct qedr_dev *dev, + struct qedr_userq *q, + u64 buf_addr, size_t buf_len, + int access, int dmasync, + int alloc_and_init) +{ + u32 fw_pages; + int rc; + + q->buf_addr = buf_addr; + q->buf_len = buf_len; + q->umem = ib_umem_get(ib_ctx, q->buf_addr, q->buf_len, access, dmasync); + if (IS_ERR(q->umem)) { + DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n", + PTR_ERR(q->umem)); + return PTR_ERR(q->umem); + } + + fw_pages = ib_umem_page_count(q->umem) << + (q->umem->page_shift - FW_PAGE_SHIFT); + + rc = qedr_prepare_pbl_tbl(dev, &q->pbl_info, fw_pages, 0); + if (rc) + goto err0; + + if (alloc_and_init) { + q->pbl_tbl = qedr_alloc_pbl_tbl(dev, &q->pbl_info, GFP_KERNEL); + if (IS_ERR(q->pbl_tbl)) { + rc = PTR_ERR(q->pbl_tbl); + goto err0; + } + qedr_populate_pbls(dev, q->umem, q->pbl_tbl, &q->pbl_info, + FW_PAGE_SHIFT); + } else { + q->pbl_tbl = kzalloc(sizeof(*q->pbl_tbl), GFP_KERNEL); + if (!q->pbl_tbl) { + rc = -ENOMEM; + goto err0; + } + } + + return 0; + +err0: + ib_umem_release(q->umem); + q->umem = NULL; + + return rc; +} + +static inline void qedr_init_cq_params(struct qedr_cq *cq, + struct qedr_ucontext *ctx, + struct qedr_dev *dev, int vector, + int chain_entries, int page_cnt, + u64 pbl_ptr, + struct qed_rdma_create_cq_in_params + *params) +{ + memset(params, 0, sizeof(*params)); + params->cq_handle_hi = upper_32_bits((uintptr_t)cq); + params->cq_handle_lo = lower_32_bits((uintptr_t)cq); + params->cnq_id = vector; + params->cq_size = chain_entries - 1; + params->dpi = (ctx) ? 
ctx->dpi : dev->dpi; + params->pbl_num_pages = page_cnt; + params->pbl_ptr = pbl_ptr; + params->pbl_two_level = 0; +} + +static void doorbell_cq(struct qedr_cq *cq, u32 cons, u8 flags) +{ + cq->db.data.agg_flags = flags; + cq->db.data.value = cpu_to_le32(cons); + writeq(cq->db.raw, cq->db_addr); + + /* Make sure write would stick */ + mmiowb(); +} + +int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct qedr_cq *cq = get_qedr_cq(ibcq); + unsigned long sflags; + struct qedr_dev *dev; + + dev = get_qedr_dev(ibcq->device); + + if (cq->destroyed) { + DP_ERR(dev, + "warning: arm was invoked after destroy for cq %p (icid=%d)\n", + cq, cq->icid); + return -EINVAL; + } + + + if (cq->cq_type == QEDR_CQ_TYPE_GSI) + return 0; + + spin_lock_irqsave(&cq->cq_lock, sflags); + + cq->arm_flags = 0; + + if (flags & IB_CQ_SOLICITED) + cq->arm_flags |= DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD; + + if (flags & IB_CQ_NEXT_COMP) + cq->arm_flags |= DQ_UCM_ROCE_CQ_ARM_CF_CMD; + + doorbell_cq(cq, cq->cq_cons - 1, cq->arm_flags); + + spin_unlock_irqrestore(&cq->cq_lock, sflags); + + return 0; +} + +struct ib_cq *qedr_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ib_ctx, struct ib_udata *udata) +{ + struct qedr_ucontext *ctx = get_qedr_ucontext(ib_ctx); + struct qed_rdma_destroy_cq_out_params destroy_oparams; + struct qed_rdma_destroy_cq_in_params destroy_iparams; + struct qedr_dev *dev = get_qedr_dev(ibdev); + struct qed_rdma_create_cq_in_params params; + struct qedr_create_cq_ureq ureq; + int vector = attr->comp_vector; + int entries = attr->cqe; + struct qedr_cq *cq; + int chain_entries; + int page_cnt; + u64 pbl_ptr; + u16 icid; + int rc; + + DP_DEBUG(dev, QEDR_MSG_INIT, + "create_cq: called from %s. entries=%d, vector=%d\n", + udata ? "User Lib" : "Kernel", entries, vector); + + if (entries > QEDR_MAX_CQES) { + DP_ERR(dev, + "create cq: the number of entries %d is too high. 
Must be equal or below %d.\n", + entries, QEDR_MAX_CQES); + return ERR_PTR(-EINVAL); + } + + chain_entries = qedr_align_cq_entries(entries); + chain_entries = min_t(int, chain_entries, QEDR_MAX_CQES); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + if (udata) { + memset(&ureq, 0, sizeof(ureq)); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { + DP_ERR(dev, + "create cq: problem copying data from user space\n"); + goto err0; + } + + if (!ureq.len) { + DP_ERR(dev, + "create cq: cannot create a cq with 0 entries\n"); + goto err0; + } + + cq->cq_type = QEDR_CQ_TYPE_USER; + + rc = qedr_init_user_queue(ib_ctx, dev, &cq->q, ureq.addr, + ureq.len, IB_ACCESS_LOCAL_WRITE, + 1, 1); + if (rc) + goto err0; + + pbl_ptr = cq->q.pbl_tbl->pa; + page_cnt = cq->q.pbl_info.num_pbes; + + cq->ibcq.cqe = chain_entries; + } else { + cq->cq_type = QEDR_CQ_TYPE_KERNEL; + + rc = dev->ops->common->chain_alloc(dev->cdev, + QED_CHAIN_USE_TO_CONSUME, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U32, + chain_entries, + sizeof(union rdma_cqe), + &cq->pbl, NULL); + if (rc) + goto err1; + + page_cnt = qed_chain_get_page_cnt(&cq->pbl); + pbl_ptr = qed_chain_get_pbl_phys(&cq->pbl); + cq->ibcq.cqe = cq->pbl.capacity; + } + + qedr_init_cq_params(cq, ctx, dev, vector, chain_entries, page_cnt, + pbl_ptr, ¶ms); + + rc = dev->ops->rdma_create_cq(dev->rdma_ctx, ¶ms, &icid); + if (rc) + goto err2; + + cq->icid = icid; + cq->sig = QEDR_CQ_MAGIC_NUMBER; + spin_lock_init(&cq->cq_lock); + + if (ib_ctx) { + rc = qedr_copy_cq_uresp(dev, cq, udata); + if (rc) + goto err3; + } else { + /* Generate doorbell address. */ + cq->db_addr = dev->db_addr + + DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT); + cq->db.data.icid = cq->icid; + cq->db.data.params = DB_AGG_CMD_SET << + RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT; + + /* point to the very last element, passing it we will toggle */ + cq->toggle_cqe = qed_chain_get_last_elem(&cq->pbl); + cq->pbl_toggle = RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK; + cq->latest_cqe = NULL; + consume_cqe(cq); + cq->cq_cons = qed_chain_get_cons_idx_u32(&cq->pbl); + } + + DP_DEBUG(dev, QEDR_MSG_CQ, + "create cq: icid=0x%0x, addr=%p, size(entries)=0x%0x\n", + cq->icid, cq, params.cq_size); + + return &cq->ibcq; + +err3: + destroy_iparams.icid = cq->icid; + dev->ops->rdma_destroy_cq(dev->rdma_ctx, &destroy_iparams, + &destroy_oparams); +err2: + if (udata) + qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl); + else + dev->ops->common->chain_free(dev->cdev, &cq->pbl); +err1: + if (udata) + ib_umem_release(cq->q.umem); +err0: + kfree(cq); + return ERR_PTR(-EINVAL); +} + +int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata) +{ + struct qedr_dev *dev = get_qedr_dev(ibcq->device); + struct qedr_cq *cq = get_qedr_cq(ibcq); + + DP_ERR(dev, "cq %p RESIZE NOT SUPPORTED\n", cq); + + return 0; +} + +#define QEDR_DESTROY_CQ_MAX_ITERATIONS (10) +#define QEDR_DESTROY_CQ_ITER_DURATION (10) + +int qedr_destroy_cq(struct ib_cq *ibcq) +{ + struct qedr_dev *dev = get_qedr_dev(ibcq->device); + struct qed_rdma_destroy_cq_out_params oparams; + struct qed_rdma_destroy_cq_in_params iparams; + struct qedr_cq *cq = get_qedr_cq(ibcq); + int iter; + int rc; + + DP_DEBUG(dev, QEDR_MSG_CQ, "destroy cq %p (icid=%d)\n", cq, cq->icid); + + cq->destroyed = 1; + + /* GSIs CQs are handled by driver, so they don't exist in the FW */ + if (cq->cq_type == QEDR_CQ_TYPE_GSI) + goto done; + + iparams.icid = cq->icid; + rc = dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); + if (rc) + return 
rc; + + dev->ops->common->chain_free(dev->cdev, &cq->pbl); + + if (ibcq->uobject && ibcq->uobject->context) { + qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl); + ib_umem_release(cq->q.umem); + } + + /* We don't want the IRQ handler to handle a non-existing CQ so we + * wait until all CNQ interrupts, if any, are received. This will always + * happen and will always happen very fast. If not, then a serious error + * has occured. That is why we can use a long delay. + * We spin for a short time so we don’t lose time on context switching + * in case all the completions are handled in that span. Otherwise + * we sleep for a while and check again. Since the CNQ may be + * associated with (only) the current CPU we use msleep to allow the + * current CPU to be freed. + * The CNQ notification is increased in qedr_irq_handler(). + */ + iter = QEDR_DESTROY_CQ_MAX_ITERATIONS; + while (oparams.num_cq_notif != READ_ONCE(cq->cnq_notif) && iter) { + udelay(QEDR_DESTROY_CQ_ITER_DURATION); + iter--; + } + + iter = QEDR_DESTROY_CQ_MAX_ITERATIONS; + while (oparams.num_cq_notif != READ_ONCE(cq->cnq_notif) && iter) { + msleep(QEDR_DESTROY_CQ_ITER_DURATION); + iter--; + } + + if (oparams.num_cq_notif != cq->cnq_notif) + goto err; + + /* Note that we don't need to have explicit code to wait for the + * completion of the event handler because it is invoked from the EQ. + * Since the destroy CQ ramrod has also been received on the EQ we can + * be certain that there's no event handler in process. + */ +done: + cq->sig = ~cq->sig; + + kfree(cq); + + return 0; + +err: + DP_ERR(dev, + "CQ %p (icid=%d) not freed, expecting %d ints but got %d ints\n", + cq, cq->icid, oparams.num_cq_notif, cq->cnq_notif); + + return -EINVAL; +} + +static inline int get_gid_info_from_table(struct ib_qp *ibqp, + struct ib_qp_attr *attr, + int attr_mask, + struct qed_rdma_modify_qp_in_params + *qp_params) +{ + enum rdma_network_type nw_type; + struct ib_gid_attr gid_attr; + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + union ib_gid gid; + u32 ipv4_addr; + int rc = 0; + int i; + + rc = ib_get_cached_gid(ibqp->device, + rdma_ah_get_port_num(&attr->ah_attr), + grh->sgid_index, &gid, &gid_attr); + if (rc) + return rc; + + qp_params->vlan_id = rdma_vlan_dev_vlan_id(gid_attr.ndev); + + dev_put(gid_attr.ndev); + nw_type = ib_gid_to_network_type(gid_attr.gid_type, &gid); + switch (nw_type) { + case RDMA_NETWORK_IPV6: + memcpy(&qp_params->sgid.bytes[0], &gid.raw[0], + sizeof(qp_params->sgid)); + memcpy(&qp_params->dgid.bytes[0], + &grh->dgid, + sizeof(qp_params->dgid)); + qp_params->roce_mode = ROCE_V2_IPV6; + SET_FIELD(qp_params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1); + break; + case RDMA_NETWORK_IB: + memcpy(&qp_params->sgid.bytes[0], &gid.raw[0], + sizeof(qp_params->sgid)); + memcpy(&qp_params->dgid.bytes[0], + &grh->dgid, + sizeof(qp_params->dgid)); + qp_params->roce_mode = ROCE_V1; + break; + case RDMA_NETWORK_IPV4: + memset(&qp_params->sgid, 0, sizeof(qp_params->sgid)); + memset(&qp_params->dgid, 0, sizeof(qp_params->dgid)); + ipv4_addr = qedr_get_ipv4_from_gid(gid.raw); + qp_params->sgid.ipv4_addr = ipv4_addr; + ipv4_addr = + qedr_get_ipv4_from_gid(grh->dgid.raw); + qp_params->dgid.ipv4_addr = ipv4_addr; + SET_FIELD(qp_params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1); + qp_params->roce_mode = ROCE_V2_IPV4; + break; + } + + for (i = 0; i < 4; i++) { + qp_params->sgid.dwords[i] = ntohl(qp_params->sgid.dwords[i]); + qp_params->dgid.dwords[i] = ntohl(qp_params->dgid.dwords[i]); + } + + 
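+ /* rdma_vlan_dev_vlan_id() returns 0xffff when the GID's netdev is not a VLAN device, so any id at or above VLAN_CFI_MASK is treated as untagged. */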
if (qp_params->vlan_id >= VLAN_CFI_MASK) + qp_params->vlan_id = 0; + + return 0; +} + +static int qedr_check_qp_attrs(struct ib_pd *ibpd, struct qedr_dev *dev, + struct ib_qp_init_attr *attrs) +{ + struct qedr_device_attr *qattr = &dev->attr; + + /* QP0... attrs->qp_type == IB_QPT_GSI */ + if (attrs->qp_type != IB_QPT_RC && attrs->qp_type != IB_QPT_GSI) { + DP_DEBUG(dev, QEDR_MSG_QP, + "create qp: unsupported qp type=0x%x requested\n", + attrs->qp_type); + return -EINVAL; + } + + if (attrs->cap.max_send_wr > qattr->max_sqe) { + DP_ERR(dev, + "create qp: cannot create a SQ with %d elements (max_send_wr=0x%x)\n", + attrs->cap.max_send_wr, qattr->max_sqe); + return -EINVAL; + } + + if (attrs->cap.max_inline_data > qattr->max_inline) { + DP_ERR(dev, + "create qp: unsupported inline data size=0x%x requested (max_inline=0x%x)\n", + attrs->cap.max_inline_data, qattr->max_inline); + return -EINVAL; + } + + if (attrs->cap.max_send_sge > qattr->max_sge) { + DP_ERR(dev, + "create qp: unsupported send_sge=0x%x requested (max_send_sge=0x%x)\n", + attrs->cap.max_send_sge, qattr->max_sge); + return -EINVAL; + } + + if (attrs->cap.max_recv_sge > qattr->max_sge) { + DP_ERR(dev, + "create qp: unsupported recv_sge=0x%x requested (max_recv_sge=0x%x)\n", + attrs->cap.max_recv_sge, qattr->max_sge); + return -EINVAL; + } + + /* Unprivileged user space cannot create special QP */ + if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) { + DP_ERR(dev, + "create qp: userspace can't create special QPs of type=0x%x\n", + attrs->qp_type); + return -EINVAL; + } + + return 0; +} + +static void qedr_copy_rq_uresp(struct qedr_dev *dev, + struct qedr_create_qp_uresp *uresp, + struct qedr_qp *qp) +{ + /* iWARP requires two doorbells per RQ. */ + if (rdma_protocol_iwarp(&dev->ibdev, 1)) { + uresp->rq_db_offset = + DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD); + uresp->rq_db2_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_FLAGS); + } else { + uresp->rq_db_offset = + DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD); + } + + uresp->rq_icid = qp->icid; +} + +static void qedr_copy_sq_uresp(struct qedr_dev *dev, + struct qedr_create_qp_uresp *uresp, + struct qedr_qp *qp) +{ + uresp->sq_db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD); + + /* iWARP uses the same cid for rq and sq */ + if (rdma_protocol_iwarp(&dev->ibdev, 1)) + uresp->sq_icid = qp->icid; + else + uresp->sq_icid = qp->icid + 1; +} + +static int qedr_copy_qp_uresp(struct qedr_dev *dev, + struct qedr_qp *qp, struct ib_udata *udata) +{ + struct qedr_create_qp_uresp uresp; + int rc; + + memset(&uresp, 0, sizeof(uresp)); + qedr_copy_sq_uresp(dev, &uresp, qp); + qedr_copy_rq_uresp(dev, &uresp, qp); + + uresp.atomic_supported = dev->atomic_cap != IB_ATOMIC_NONE; + uresp.qp_id = qp->qp_id; + + rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rc) + DP_ERR(dev, + "create qp: failed a copy to user space with qp icid=0x%x.\n", + qp->icid); + + return rc; +} + +static void qedr_set_common_qp_params(struct qedr_dev *dev, + struct qedr_qp *qp, + struct qedr_pd *pd, + struct ib_qp_init_attr *attrs) +{ + spin_lock_init(&qp->q_lock); + atomic_set(&qp->refcnt, 1); + qp->pd = pd; + qp->qp_type = attrs->qp_type; + qp->max_inline_data = attrs->cap.max_inline_data; + qp->sq.max_sges = attrs->cap.max_send_sge; + qp->state = QED_ROCE_QP_STATE_RESET; + qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? 
true : false; + qp->sq_cq = get_qedr_cq(attrs->send_cq); + qp->rq_cq = get_qedr_cq(attrs->recv_cq); + qp->dev = dev; + qp->rq.max_sges = attrs->cap.max_recv_sge; + + DP_DEBUG(dev, QEDR_MSG_QP, + "RQ params:\trq_max_sges = %d, rq_cq_id = %d\n", + qp->rq.max_sges, qp->rq_cq->icid); + DP_DEBUG(dev, QEDR_MSG_QP, + "QP params:\tpd = %d, qp_type = %d, max_inline_data = %d, state = %d, signaled = %d, use_srq=%d\n", + pd->pd_id, qp->qp_type, qp->max_inline_data, + qp->state, qp->signaled, (attrs->srq) ? 1 : 0); + DP_DEBUG(dev, QEDR_MSG_QP, + "SQ params:\tsq_max_sges = %d, sq_cq_id = %d\n", + qp->sq.max_sges, qp->sq_cq->icid); +} + +static void qedr_set_roce_db_info(struct qedr_dev *dev, struct qedr_qp *qp) +{ + qp->sq.db = dev->db_addr + + DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD); + qp->sq.db_data.data.icid = qp->icid + 1; + qp->rq.db = dev->db_addr + + DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD); + qp->rq.db_data.data.icid = qp->icid; +} + +static inline void +qedr_init_common_qp_in_params(struct qedr_dev *dev, + struct qedr_pd *pd, + struct qedr_qp *qp, + struct ib_qp_init_attr *attrs, + bool fmr_and_reserved_lkey, + struct qed_rdma_create_qp_in_params *params) +{ + /* QP handle to be written in an async event */ + params->qp_handle_async_lo = lower_32_bits((uintptr_t) qp); + params->qp_handle_async_hi = upper_32_bits((uintptr_t) qp); + + params->signal_all = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR); + params->fmr_and_reserved_lkey = fmr_and_reserved_lkey; + params->pd = pd->pd_id; + params->dpi = pd->uctx ? pd->uctx->dpi : dev->dpi; + params->sq_cq_id = get_qedr_cq(attrs->send_cq)->icid; + params->stats_queue = 0; + params->rq_cq_id = get_qedr_cq(attrs->recv_cq)->icid; + params->srq_id = 0; + params->use_srq = false; +} + +static inline void qedr_qp_user_print(struct qedr_dev *dev, struct qedr_qp *qp) +{ + DP_DEBUG(dev, QEDR_MSG_QP, "create qp: successfully created user QP. " + "qp=%p. " + "sq_addr=0x%llx, " + "sq_len=%zd, " + "rq_addr=0x%llx, " + "rq_len=%zd" + "\n", + qp, + qp->usq.buf_addr, + qp->usq.buf_len, qp->urq.buf_addr, qp->urq.buf_len); +} + +static int qedr_idr_add(struct qedr_dev *dev, void *ptr, u32 id) +{ + int rc; + + if (!rdma_protocol_iwarp(&dev->ibdev, 1)) + return 0; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&dev->idr_lock); + + rc = idr_alloc(&dev->qpidr, ptr, id, id + 1, GFP_ATOMIC); + + spin_unlock_irq(&dev->idr_lock); + idr_preload_end(); + + return rc < 0 ? 
rc : 0; +} + +static void qedr_idr_remove(struct qedr_dev *dev, u32 id) +{ + if (!rdma_protocol_iwarp(&dev->ibdev, 1)) + return; + + spin_lock_irq(&dev->idr_lock); + idr_remove(&dev->qpidr, id); + spin_unlock_irq(&dev->idr_lock); +} + +static inline void +qedr_iwarp_populate_user_qp(struct qedr_dev *dev, + struct qedr_qp *qp, + struct qed_rdma_create_qp_out_params *out_params) +{ + qp->usq.pbl_tbl->va = out_params->sq_pbl_virt; + qp->usq.pbl_tbl->pa = out_params->sq_pbl_phys; + + qedr_populate_pbls(dev, qp->usq.umem, qp->usq.pbl_tbl, + &qp->usq.pbl_info, FW_PAGE_SHIFT); + + qp->urq.pbl_tbl->va = out_params->rq_pbl_virt; + qp->urq.pbl_tbl->pa = out_params->rq_pbl_phys; + + qedr_populate_pbls(dev, qp->urq.umem, qp->urq.pbl_tbl, + &qp->urq.pbl_info, FW_PAGE_SHIFT); +} + +static void qedr_cleanup_user(struct qedr_dev *dev, struct qedr_qp *qp) +{ + if (qp->usq.umem) + ib_umem_release(qp->usq.umem); + qp->usq.umem = NULL; + + if (qp->urq.umem) + ib_umem_release(qp->urq.umem); + qp->urq.umem = NULL; +} + +static int qedr_create_user_qp(struct qedr_dev *dev, + struct qedr_qp *qp, + struct ib_pd *ibpd, + struct ib_udata *udata, + struct ib_qp_init_attr *attrs) +{ + struct qed_rdma_create_qp_in_params in_params; + struct qed_rdma_create_qp_out_params out_params; + struct qedr_pd *pd = get_qedr_pd(ibpd); + struct ib_ucontext *ib_ctx = NULL; + struct qedr_create_qp_ureq ureq; + int alloc_and_init = rdma_protocol_roce(&dev->ibdev, 1); + int rc = -EINVAL; + + ib_ctx = ibpd->uobject->context; + + memset(&ureq, 0, sizeof(ureq)); + rc = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); + if (rc) { + DP_ERR(dev, "Problem copying data from user space\n"); + return rc; + } + + /* SQ - read access only (0), dma sync not required (0) */ + rc = qedr_init_user_queue(ib_ctx, dev, &qp->usq, ureq.sq_addr, + ureq.sq_len, 0, 0, alloc_and_init); + if (rc) + return rc; + + /* RQ - read access only (0), dma sync not required (0) */ + rc = qedr_init_user_queue(ib_ctx, dev, &qp->urq, ureq.rq_addr, + ureq.rq_len, 0, 0, alloc_and_init); + if (rc) + return rc; + + memset(&in_params, 0, sizeof(in_params)); + qedr_init_common_qp_in_params(dev, pd, qp, attrs, false, &in_params); + in_params.qp_handle_lo = ureq.qp_handle_lo; + in_params.qp_handle_hi = ureq.qp_handle_hi; + in_params.sq_num_pages = qp->usq.pbl_info.num_pbes; + in_params.sq_pbl_ptr = qp->usq.pbl_tbl->pa; + in_params.rq_num_pages = qp->urq.pbl_info.num_pbes; + in_params.rq_pbl_ptr = qp->urq.pbl_tbl->pa; + + qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx, + &in_params, &out_params); + + if (!qp->qed_qp) { + rc = -ENOMEM; + goto err1; + } + + if (rdma_protocol_iwarp(&dev->ibdev, 1)) + qedr_iwarp_populate_user_qp(dev, qp, &out_params); + + qp->qp_id = out_params.qp_id; + qp->icid = out_params.icid; + + rc = qedr_copy_qp_uresp(dev, qp, udata); + if (rc) + goto err; + + qedr_qp_user_print(dev, qp); + + return 0; +err: + rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp); + if (rc) + DP_ERR(dev, "create qp: fatal fault. 
rc=%d", rc); + +err1: + qedr_cleanup_user(dev, qp); + return rc; +} + +static void qedr_set_iwarp_db_info(struct qedr_dev *dev, struct qedr_qp *qp) +{ + qp->sq.db = dev->db_addr + + DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD); + qp->sq.db_data.data.icid = qp->icid; + + qp->rq.db = dev->db_addr + + DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD); + qp->rq.db_data.data.icid = qp->icid; + qp->rq.iwarp_db2 = dev->db_addr + + DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_FLAGS); + qp->rq.iwarp_db2_data.data.icid = qp->icid; + qp->rq.iwarp_db2_data.data.value = DQ_TCM_IWARP_POST_RQ_CF_CMD; +} + +static int +qedr_roce_create_kernel_qp(struct qedr_dev *dev, + struct qedr_qp *qp, + struct qed_rdma_create_qp_in_params *in_params, + u32 n_sq_elems, u32 n_rq_elems) +{ + struct qed_rdma_create_qp_out_params out_params; + int rc; + + rc = dev->ops->common->chain_alloc(dev->cdev, + QED_CHAIN_USE_TO_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U32, + n_sq_elems, + QEDR_SQE_ELEMENT_SIZE, + &qp->sq.pbl, NULL); + + if (rc) + return rc; + + in_params->sq_num_pages = qed_chain_get_page_cnt(&qp->sq.pbl); + in_params->sq_pbl_ptr = qed_chain_get_pbl_phys(&qp->sq.pbl); + + rc = dev->ops->common->chain_alloc(dev->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U32, + n_rq_elems, + QEDR_RQE_ELEMENT_SIZE, + &qp->rq.pbl, NULL); + if (rc) + return rc; + + in_params->rq_num_pages = qed_chain_get_page_cnt(&qp->rq.pbl); + in_params->rq_pbl_ptr = qed_chain_get_pbl_phys(&qp->rq.pbl); + + qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx, + in_params, &out_params); + + if (!qp->qed_qp) + return -EINVAL; + + qp->qp_id = out_params.qp_id; + qp->icid = out_params.icid; + + qedr_set_roce_db_info(dev, qp); + return rc; +} + +static int +qedr_iwarp_create_kernel_qp(struct qedr_dev *dev, + struct qedr_qp *qp, + struct qed_rdma_create_qp_in_params *in_params, + u32 n_sq_elems, u32 n_rq_elems) +{ + struct qed_rdma_create_qp_out_params out_params; + struct qed_chain_ext_pbl ext_pbl; + int rc; + + in_params->sq_num_pages = QED_CHAIN_PAGE_CNT(n_sq_elems, + QEDR_SQE_ELEMENT_SIZE, + QED_CHAIN_MODE_PBL); + in_params->rq_num_pages = QED_CHAIN_PAGE_CNT(n_rq_elems, + QEDR_RQE_ELEMENT_SIZE, + QED_CHAIN_MODE_PBL); + + qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx, + in_params, &out_params); + + if (!qp->qed_qp) + return -EINVAL; + + /* Now we allocate the chain */ + ext_pbl.p_pbl_virt = out_params.sq_pbl_virt; + ext_pbl.p_pbl_phys = out_params.sq_pbl_phys; + + rc = dev->ops->common->chain_alloc(dev->cdev, + QED_CHAIN_USE_TO_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U32, + n_sq_elems, + QEDR_SQE_ELEMENT_SIZE, + &qp->sq.pbl, &ext_pbl); + + if (rc) + goto err; + + ext_pbl.p_pbl_virt = out_params.rq_pbl_virt; + ext_pbl.p_pbl_phys = out_params.rq_pbl_phys; + + rc = dev->ops->common->chain_alloc(dev->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U32, + n_rq_elems, + QEDR_RQE_ELEMENT_SIZE, + &qp->rq.pbl, &ext_pbl); + + if (rc) + goto err; + + qp->qp_id = out_params.qp_id; + qp->icid = out_params.icid; + + qedr_set_iwarp_db_info(dev, qp); + return rc; + +err: + dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp); + + return rc; +} + +static void qedr_cleanup_kernel(struct qedr_dev *dev, struct qedr_qp *qp) +{ + dev->ops->common->chain_free(dev->cdev, &qp->sq.pbl); + kfree(qp->wqe_wr_id); + + dev->ops->common->chain_free(dev->cdev, &qp->rq.pbl); + kfree(qp->rqe_wr_id); +} + +static int qedr_create_kernel_qp(struct qedr_dev *dev, + struct qedr_qp *qp, + struct ib_pd 
*ibpd, + struct ib_qp_init_attr *attrs) +{ + struct qed_rdma_create_qp_in_params in_params; + struct qedr_pd *pd = get_qedr_pd(ibpd); + int rc = -EINVAL; + u32 n_rq_elems; + u32 n_sq_elems; + u32 n_sq_entries; + + memset(&in_params, 0, sizeof(in_params)); + + /* A single work request may take up to QEDR_MAX_SQ_WQE_SIZE elements in + * the ring. The ring should allow at least a single WR, even if the + * user requested none, due to allocation issues. + * We should add an extra WR since the prod and cons indices of + * wqe_wr_id are managed in such a way that the WQ is considered full + * when (prod+1)%max_wr==cons. We currently don't do that because we + * double the number of entries due an iSER issue that pushes far more + * WRs than indicated. If we decline its ib_post_send() then we get + * error prints in the dmesg we'd like to avoid. + */ + qp->sq.max_wr = min_t(u32, attrs->cap.max_send_wr * dev->wq_multiplier, + dev->attr.max_sqe); + + qp->wqe_wr_id = kzalloc(qp->sq.max_wr * sizeof(*qp->wqe_wr_id), + GFP_KERNEL); + if (!qp->wqe_wr_id) { + DP_ERR(dev, "create qp: failed SQ shadow memory allocation\n"); + return -ENOMEM; + } + + /* QP handle to be written in CQE */ + in_params.qp_handle_lo = lower_32_bits((uintptr_t) qp); + in_params.qp_handle_hi = upper_32_bits((uintptr_t) qp); + + /* A single work request may take up to QEDR_MAX_RQ_WQE_SIZE elements in + * the ring. There ring should allow at least a single WR, even if the + * user requested none, due to allocation issues. + */ + qp->rq.max_wr = (u16) max_t(u32, attrs->cap.max_recv_wr, 1); + + /* Allocate driver internal RQ array */ + qp->rqe_wr_id = kzalloc(qp->rq.max_wr * sizeof(*qp->rqe_wr_id), + GFP_KERNEL); + if (!qp->rqe_wr_id) { + DP_ERR(dev, + "create qp: failed RQ shadow memory allocation\n"); + kfree(qp->wqe_wr_id); + return -ENOMEM; + } + + qedr_init_common_qp_in_params(dev, pd, qp, attrs, true, &in_params); + + n_sq_entries = attrs->cap.max_send_wr; + n_sq_entries = min_t(u32, n_sq_entries, dev->attr.max_sqe); + n_sq_entries = max_t(u32, n_sq_entries, 1); + n_sq_elems = n_sq_entries * QEDR_MAX_SQE_ELEMENTS_PER_SQE; + + n_rq_elems = qp->rq.max_wr * QEDR_MAX_RQE_ELEMENTS_PER_RQE; + + if (rdma_protocol_iwarp(&dev->ibdev, 1)) + rc = qedr_iwarp_create_kernel_qp(dev, qp, &in_params, + n_sq_elems, n_rq_elems); + else + rc = qedr_roce_create_kernel_qp(dev, qp, &in_params, + n_sq_elems, n_rq_elems); + if (rc) + qedr_cleanup_kernel(dev, qp); + + return rc; +} + +struct ib_qp *qedr_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct qedr_dev *dev = get_qedr_dev(ibpd->device); + struct qedr_pd *pd = get_qedr_pd(ibpd); + struct qedr_qp *qp; + struct ib_qp *ibqp; + int rc = 0; + + DP_DEBUG(dev, QEDR_MSG_QP, "create qp: called from %s, pd=%p\n", + udata ? "user library" : "kernel", pd); + + rc = qedr_check_qp_attrs(ibpd, dev, attrs); + if (rc) + return ERR_PTR(rc); + + if (attrs->srq) + return ERR_PTR(-EINVAL); + + DP_DEBUG(dev, QEDR_MSG_QP, + "create qp: called from %s, event_handler=%p, eepd=%p sq_cq=%p, sq_icid=%d, rq_cq=%p, rq_icid=%d\n", + udata ? 
"user library" : "kernel", attrs->event_handler, pd, + get_qedr_cq(attrs->send_cq), + get_qedr_cq(attrs->send_cq)->icid, + get_qedr_cq(attrs->recv_cq), + get_qedr_cq(attrs->recv_cq)->icid); + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + DP_ERR(dev, "create qp: failed allocating memory\n"); + return ERR_PTR(-ENOMEM); + } + + qedr_set_common_qp_params(dev, qp, pd, attrs); + + if (attrs->qp_type == IB_QPT_GSI) { + ibqp = qedr_create_gsi_qp(dev, attrs, qp); + if (IS_ERR(ibqp)) + kfree(qp); + return ibqp; + } + + if (udata) + rc = qedr_create_user_qp(dev, qp, ibpd, udata, attrs); + else + rc = qedr_create_kernel_qp(dev, qp, ibpd, attrs); + + if (rc) + goto err; + + qp->ibqp.qp_num = qp->qp_id; + + rc = qedr_idr_add(dev, qp, qp->qp_id); + if (rc) + goto err; + + return &qp->ibqp; + +err: + kfree(qp); + + return ERR_PTR(-EFAULT); +} + +static enum ib_qp_state qedr_get_ibqp_state(enum qed_roce_qp_state qp_state) +{ + switch (qp_state) { + case QED_ROCE_QP_STATE_RESET: + return IB_QPS_RESET; + case QED_ROCE_QP_STATE_INIT: + return IB_QPS_INIT; + case QED_ROCE_QP_STATE_RTR: + return IB_QPS_RTR; + case QED_ROCE_QP_STATE_RTS: + return IB_QPS_RTS; + case QED_ROCE_QP_STATE_SQD: + return IB_QPS_SQD; + case QED_ROCE_QP_STATE_ERR: + return IB_QPS_ERR; + case QED_ROCE_QP_STATE_SQE: + return IB_QPS_SQE; + } + return IB_QPS_ERR; +} + +static enum qed_roce_qp_state qedr_get_state_from_ibqp( + enum ib_qp_state qp_state) +{ + switch (qp_state) { + case IB_QPS_RESET: + return QED_ROCE_QP_STATE_RESET; + case IB_QPS_INIT: + return QED_ROCE_QP_STATE_INIT; + case IB_QPS_RTR: + return QED_ROCE_QP_STATE_RTR; + case IB_QPS_RTS: + return QED_ROCE_QP_STATE_RTS; + case IB_QPS_SQD: + return QED_ROCE_QP_STATE_SQD; + case IB_QPS_ERR: + return QED_ROCE_QP_STATE_ERR; + default: + return QED_ROCE_QP_STATE_ERR; + } +} + +static void qedr_reset_qp_hwq_info(struct qedr_qp_hwq_info *qph) +{ + qed_chain_reset(&qph->pbl); + qph->prod = 0; + qph->cons = 0; + qph->wqe_cons = 0; + qph->db_data.data.value = cpu_to_le16(0); +} + +static int qedr_update_qp_state(struct qedr_dev *dev, + struct qedr_qp *qp, + enum qed_roce_qp_state cur_state, + enum qed_roce_qp_state new_state) +{ + int status = 0; + + if (new_state == cur_state) + return 0; + + switch (cur_state) { + case QED_ROCE_QP_STATE_RESET: + switch (new_state) { + case QED_ROCE_QP_STATE_INIT: + qp->prev_wqe_size = 0; + qedr_reset_qp_hwq_info(&qp->sq); + qedr_reset_qp_hwq_info(&qp->rq); + break; + default: + status = -EINVAL; + break; + }; + break; + case QED_ROCE_QP_STATE_INIT: + switch (new_state) { + case QED_ROCE_QP_STATE_RTR: + /* Update doorbell (in case post_recv was + * done before move to RTR) + */ + + if (rdma_protocol_roce(&dev->ibdev, 1)) { + writel(qp->rq.db_data.raw, qp->rq.db); + /* Make sure write takes effect */ + mmiowb(); + } + break; + case QED_ROCE_QP_STATE_ERR: + break; + default: + /* Invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QED_ROCE_QP_STATE_RTR: + /* RTR->XXX */ + switch (new_state) { + case QED_ROCE_QP_STATE_RTS: + break; + case QED_ROCE_QP_STATE_ERR: + break; + default: + /* Invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QED_ROCE_QP_STATE_RTS: + /* RTS->XXX */ + switch (new_state) { + case QED_ROCE_QP_STATE_SQD: + break; + case QED_ROCE_QP_STATE_ERR: + break; + default: + /* Invalid state change. 
*/ + status = -EINVAL; + break; + }; + break; + case QED_ROCE_QP_STATE_SQD: + /* SQD->XXX */ + switch (new_state) { + case QED_ROCE_QP_STATE_RTS: + case QED_ROCE_QP_STATE_ERR: + break; + default: + /* Invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QED_ROCE_QP_STATE_ERR: + /* ERR->XXX */ + switch (new_state) { + case QED_ROCE_QP_STATE_RESET: + if ((qp->rq.prod != qp->rq.cons) || + (qp->sq.prod != qp->sq.cons)) { + DP_NOTICE(dev, + "Error->Reset with rq/sq not empty rq.prod=%x rq.cons=%x sq.prod=%x sq.cons=%x\n", + qp->rq.prod, qp->rq.cons, qp->sq.prod, + qp->sq.cons); + status = -EINVAL; + } + break; + default: + status = -EINVAL; + break; + }; + break; + default: + status = -EINVAL; + break; + }; + + return status; +} + +int qedr_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct qedr_qp *qp = get_qedr_qp(ibqp); + struct qed_rdma_modify_qp_in_params qp_params = { 0 }; + struct qedr_dev *dev = get_qedr_dev(&qp->dev->ibdev); + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + enum ib_qp_state old_qp_state, new_qp_state; + enum qed_roce_qp_state cur_state; + int rc = 0; + + DP_DEBUG(dev, QEDR_MSG_QP, + "modify qp: qp %p attr_mask=0x%x, state=%d", qp, attr_mask, + attr->qp_state); + + old_qp_state = qedr_get_ibqp_state(qp->state); + if (attr_mask & IB_QP_STATE) + new_qp_state = attr->qp_state; + else + new_qp_state = old_qp_state; + + if (rdma_protocol_roce(&dev->ibdev, 1)) { + if (!ib_modify_qp_is_ok(old_qp_state, new_qp_state, + ibqp->qp_type, attr_mask, + IB_LINK_LAYER_ETHERNET)) { + DP_ERR(dev, + "modify qp: invalid attribute mask=0x%x specified for\n" + "qpn=0x%x of type=0x%x old_qp_state=0x%x, new_qp_state=0x%x\n", + attr_mask, qp->qp_id, ibqp->qp_type, + old_qp_state, new_qp_state); + rc = -EINVAL; + goto err; + } + } + + /* Translate the masks... 
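+ * Each IB_QP_* attribute selected in attr_mask is translated below into the matching QED_*_MODIFY_QP_VALID_* flag and qp_params field.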
*/ + if (attr_mask & IB_QP_STATE) { + SET_FIELD(qp_params.modify_flags, + QED_RDMA_MODIFY_QP_VALID_NEW_STATE, 1); + qp_params.new_state = qedr_get_state_from_ibqp(attr->qp_state); + } + + if (attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) + qp_params.sqd_async = true; + + if (attr_mask & IB_QP_PKEY_INDEX) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_PKEY, 1); + if (attr->pkey_index >= QEDR_ROCE_PKEY_TABLE_LEN) { + rc = -EINVAL; + goto err; + } + + qp_params.pkey = QEDR_ROCE_PKEY_DEFAULT; + } + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + SET_FIELD(qp_params.modify_flags, + QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN, 1); + qp_params.incoming_rdma_read_en = attr->qp_access_flags & + IB_ACCESS_REMOTE_READ; + qp_params.incoming_rdma_write_en = attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE; + qp_params.incoming_atomic_en = attr->qp_access_flags & + IB_ACCESS_REMOTE_ATOMIC; + } + + if (attr_mask & (IB_QP_AV | IB_QP_PATH_MTU)) { + if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + pr_err("error: Only MTU sizes of 256, 512, 1024, 2048 and 4096 are supported by RoCE\n"); + rc = -EINVAL; + goto err; + } + qp->mtu = min(ib_mtu_enum_to_int(attr->path_mtu), + ib_mtu_enum_to_int(iboe_get_mtu + (dev->ndev->mtu))); + } + + if (!qp->mtu) { + qp->mtu = + ib_mtu_enum_to_int(iboe_get_mtu(dev->ndev->mtu)); + pr_err("Fixing zeroed MTU to qp->mtu = %d\n", qp->mtu); + } + + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR, 1); + + qp_params.traffic_class_tos = grh->traffic_class; + qp_params.flow_label = grh->flow_label; + qp_params.hop_limit_ttl = grh->hop_limit; + + qp->sgid_idx = grh->sgid_index; + + rc = get_gid_info_from_table(ibqp, attr, attr_mask, &qp_params); + if (rc) { + DP_ERR(dev, + "modify qp: problems with GID index %d (rc=%d)\n", + grh->sgid_index, rc); + return rc; + } + + rc = qedr_get_dmac(dev, &attr->ah_attr, + qp_params.remote_mac_addr); + if (rc) + return rc; + + qp_params.use_local_mac = true; + ether_addr_copy(qp_params.local_mac_addr, dev->ndev->dev_addr); + + DP_DEBUG(dev, QEDR_MSG_QP, "dgid=%x:%x:%x:%x\n", + qp_params.dgid.dwords[0], qp_params.dgid.dwords[1], + qp_params.dgid.dwords[2], qp_params.dgid.dwords[3]); + DP_DEBUG(dev, QEDR_MSG_QP, "sgid=%x:%x:%x:%x\n", + qp_params.sgid.dwords[0], qp_params.sgid.dwords[1], + qp_params.sgid.dwords[2], qp_params.sgid.dwords[3]); + DP_DEBUG(dev, QEDR_MSG_QP, "remote_mac=[%pM]\n", + qp_params.remote_mac_addr); + + qp_params.mtu = qp->mtu; + qp_params.lb_indication = false; + } + + if (!qp_params.mtu) { + /* Stay with current MTU */ + if (qp->mtu) + qp_params.mtu = qp->mtu; + else + qp_params.mtu = + ib_mtu_enum_to_int(iboe_get_mtu(dev->ndev->mtu)); + } + + if (attr_mask & IB_QP_TIMEOUT) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT, 1); + + /* The received timeout value is an exponent used like this: + * "12.7.34 LOCAL ACK TIMEOUT + * Value representing the transport (ACK) timeout for use by + * the remote, expressed as: 4.096 * 2^timeout [usec]" + * The FW expects timeout in msec so we need to divide the usec + * result by 1000. We'll approximate 1000~2^10, and 4.096 ~ 2^2, + * so we get: 2^2 * 2^timeout / 2^10 = 2^(timeout - 8). + * The value of zero means infinite so we use a 'max_t' to make + * sure that sub 1 msec values will be configured as 1 msec. 
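+ * For example, attr->timeout == 14 stands for 4.096 usec * 2^14 (about 67 msec), which the 2^(timeout - 8) approximation configures as 64 msec.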
+ */ + if (attr->timeout) + qp_params.ack_timeout = + 1 << max_t(int, attr->timeout - 8, 0); + else + qp_params.ack_timeout = 0; + } + + if (attr_mask & IB_QP_RETRY_CNT) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_RETRY_CNT, 1); + qp_params.retry_cnt = attr->retry_cnt; + } + + if (attr_mask & IB_QP_RNR_RETRY) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT, 1); + qp_params.rnr_retry_cnt = attr->rnr_retry; + } + + if (attr_mask & IB_QP_RQ_PSN) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_RQ_PSN, 1); + qp_params.rq_psn = attr->rq_psn; + qp->rq_psn = attr->rq_psn; + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic > dev->attr.max_qp_req_rd_atomic_resc) { + rc = -EINVAL; + DP_ERR(dev, + "unsupported max_rd_atomic=%d, supported=%d\n", + attr->max_rd_atomic, + dev->attr.max_qp_req_rd_atomic_resc); + goto err; + } + + SET_FIELD(qp_params.modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ, 1); + qp_params.max_rd_atomic_req = attr->max_rd_atomic; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER, 1); + qp_params.min_rnr_nak_timer = attr->min_rnr_timer; + } + + if (attr_mask & IB_QP_SQ_PSN) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_SQ_PSN, 1); + qp_params.sq_psn = attr->sq_psn; + qp->sq_psn = attr->sq_psn; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + if (attr->max_dest_rd_atomic > + dev->attr.max_qp_resp_rd_atomic_resc) { + DP_ERR(dev, + "unsupported max_dest_rd_atomic=%d, supported=%d\n", + attr->max_dest_rd_atomic, + dev->attr.max_qp_resp_rd_atomic_resc); + + rc = -EINVAL; + goto err; + } + + SET_FIELD(qp_params.modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP, 1); + qp_params.max_rd_atomic_resp = attr->max_dest_rd_atomic; + } + + if (attr_mask & IB_QP_DEST_QPN) { + SET_FIELD(qp_params.modify_flags, + QED_ROCE_MODIFY_QP_VALID_DEST_QP, 1); + + qp_params.dest_qp = attr->dest_qp_num; + qp->dest_qp_num = attr->dest_qp_num; + } + + cur_state = qp->state; + + /* Update the QP state before the actual ramrod to prevent a race with + * fast path. Modifying the QP state to error will cause the device to + * flush the CQEs and while polling the flushed CQEs will considered as + * a potential issue if the QP isn't in error state. 
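+ * Otherwise the poll code could see flushed CQEs while qp->state still reports a non-error state and wrongly flag them as a problem.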
+ */ + if ((attr_mask & IB_QP_STATE) && qp->qp_type != IB_QPT_GSI && + !udata && qp_params.new_state == QED_ROCE_QP_STATE_ERR) + qp->state = QED_ROCE_QP_STATE_ERR; + + if (qp->qp_type != IB_QPT_GSI) + rc = dev->ops->rdma_modify_qp(dev->rdma_ctx, + qp->qed_qp, &qp_params); + + if (attr_mask & IB_QP_STATE) { + if ((qp->qp_type != IB_QPT_GSI) && (!udata)) + rc = qedr_update_qp_state(dev, qp, cur_state, + qp_params.new_state); + qp->state = qp_params.new_state; + } + +err: + return rc; +} + +static int qedr_to_ib_qp_acc_flags(struct qed_rdma_query_qp_out_params *params) +{ + int ib_qp_acc_flags = 0; + + if (params->incoming_rdma_write_en) + ib_qp_acc_flags |= IB_ACCESS_REMOTE_WRITE; + if (params->incoming_rdma_read_en) + ib_qp_acc_flags |= IB_ACCESS_REMOTE_READ; + if (params->incoming_atomic_en) + ib_qp_acc_flags |= IB_ACCESS_REMOTE_ATOMIC; + ib_qp_acc_flags |= IB_ACCESS_LOCAL_WRITE; + return ib_qp_acc_flags; +} + +int qedr_query_qp(struct ib_qp *ibqp, + struct ib_qp_attr *qp_attr, + int attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct qed_rdma_query_qp_out_params params; + struct qedr_qp *qp = get_qedr_qp(ibqp); + struct qedr_dev *dev = qp->dev; + int rc = 0; + + memset(¶ms, 0, sizeof(params)); + + rc = dev->ops->rdma_query_qp(dev->rdma_ctx, qp->qed_qp, ¶ms); + if (rc) + goto err; + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + qp_attr->qp_state = qedr_get_ibqp_state(params.state); + qp_attr->cur_qp_state = qedr_get_ibqp_state(params.state); + qp_attr->path_mtu = ib_mtu_int_to_enum(params.mtu); + qp_attr->path_mig_state = IB_MIG_MIGRATED; + qp_attr->rq_psn = params.rq_psn; + qp_attr->sq_psn = params.sq_psn; + qp_attr->dest_qp_num = params.dest_qp; + + qp_attr->qp_access_flags = qedr_to_ib_qp_acc_flags(¶ms); + + qp_attr->cap.max_send_wr = qp->sq.max_wr; + qp_attr->cap.max_recv_wr = qp->rq.max_wr; + qp_attr->cap.max_send_sge = qp->sq.max_sges; + qp_attr->cap.max_recv_sge = qp->rq.max_sges; + qp_attr->cap.max_inline_data = ROCE_REQ_MAX_INLINE_DATA_SIZE; + qp_init_attr->cap = qp_attr->cap; + + qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; + rdma_ah_set_grh(&qp_attr->ah_attr, NULL, + params.flow_label, qp->sgid_idx, + params.hop_limit_ttl, params.traffic_class_tos); + rdma_ah_set_dgid_raw(&qp_attr->ah_attr, ¶ms.dgid.bytes[0]); + rdma_ah_set_port_num(&qp_attr->ah_attr, 1); + rdma_ah_set_sl(&qp_attr->ah_attr, 0); + qp_attr->timeout = params.timeout; + qp_attr->rnr_retry = params.rnr_retry; + qp_attr->retry_cnt = params.retry_cnt; + qp_attr->min_rnr_timer = params.min_rnr_nak_timer; + qp_attr->pkey_index = params.pkey_index; + qp_attr->port_num = 1; + rdma_ah_set_path_bits(&qp_attr->ah_attr, 0); + rdma_ah_set_static_rate(&qp_attr->ah_attr, 0); + qp_attr->alt_pkey_index = 0; + qp_attr->alt_port_num = 0; + qp_attr->alt_timeout = 0; + memset(&qp_attr->alt_ah_attr, 0, sizeof(qp_attr->alt_ah_attr)); + + qp_attr->sq_draining = (params.state == QED_ROCE_QP_STATE_SQD) ? 1 : 0; + qp_attr->max_dest_rd_atomic = params.max_dest_rd_atomic; + qp_attr->max_rd_atomic = params.max_rd_atomic; + qp_attr->en_sqd_async_notify = (params.sqd_async) ? 
1 : 0; + + DP_DEBUG(dev, QEDR_MSG_QP, "QEDR_QUERY_QP: max_inline_data=%d\n", + qp_attr->cap.max_inline_data); + +err: + return rc; +} + +static int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp) +{ + int rc = 0; + + if (qp->qp_type != IB_QPT_GSI) { + rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp); + if (rc) + return rc; + } + + if (qp->ibqp.uobject && qp->ibqp.uobject->context) + qedr_cleanup_user(dev, qp); + else + qedr_cleanup_kernel(dev, qp); + + return 0; +} + +int qedr_destroy_qp(struct ib_qp *ibqp) +{ + struct qedr_qp *qp = get_qedr_qp(ibqp); + struct qedr_dev *dev = qp->dev; + struct ib_qp_attr attr; + int attr_mask = 0; + int rc = 0; + + DP_DEBUG(dev, QEDR_MSG_QP, "destroy qp: destroying %p, qp type=%d\n", + qp, qp->qp_type); + + if (rdma_protocol_roce(&dev->ibdev, 1)) { + if ((qp->state != QED_ROCE_QP_STATE_RESET) && + (qp->state != QED_ROCE_QP_STATE_ERR) && + (qp->state != QED_ROCE_QP_STATE_INIT)) { + + attr.qp_state = IB_QPS_ERR; + attr_mask |= IB_QP_STATE; + + /* Change the QP state to ERROR */ + qedr_modify_qp(ibqp, &attr, attr_mask, NULL); + } + } else { + /* Wait for the connect/accept to complete */ + if (qp->ep) { + int wait_count = 1; + + while (qp->ep->during_connect) { + DP_DEBUG(dev, QEDR_MSG_QP, + "Still in during connect/accept\n"); + + msleep(100); + if (wait_count++ > 200) { + DP_NOTICE(dev, + "during connect timeout\n"); + break; + } + } + } + } + + if (qp->qp_type == IB_QPT_GSI) + qedr_destroy_gsi_qp(dev); + + qedr_free_qp_resources(dev, qp); + + if (atomic_dec_and_test(&qp->refcnt)) { + qedr_idr_remove(dev, qp->qp_id); + kfree(qp); + } + return rc; +} + +struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr, + struct ib_udata *udata) +{ + struct qedr_ah *ah; + + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + ah->attr = *attr; + + return &ah->ibah; +} + +int qedr_destroy_ah(struct ib_ah *ibah) +{ + struct qedr_ah *ah = get_qedr_ah(ibah); + + kfree(ah); + return 0; +} + +static void free_mr_info(struct qedr_dev *dev, struct mr_info *info) +{ + struct qedr_pbl *pbl, *tmp; + + if (info->pbl_table) + list_add_tail(&info->pbl_table->list_entry, + &info->free_pbl_list); + + if (!list_empty(&info->inuse_pbl_list)) + list_splice(&info->inuse_pbl_list, &info->free_pbl_list); + + list_for_each_entry_safe(pbl, tmp, &info->free_pbl_list, list_entry) { + list_del(&pbl->list_entry); + qedr_free_pbl(dev, &info->pbl_info, pbl); + } +} + +static int init_mr_info(struct qedr_dev *dev, struct mr_info *info, + size_t page_list_len, bool two_layered) +{ + struct qedr_pbl *tmp; + int rc; + + INIT_LIST_HEAD(&info->free_pbl_list); + INIT_LIST_HEAD(&info->inuse_pbl_list); + + rc = qedr_prepare_pbl_tbl(dev, &info->pbl_info, + page_list_len, two_layered); + if (rc) + goto done; + + info->pbl_table = qedr_alloc_pbl_tbl(dev, &info->pbl_info, GFP_KERNEL); + if (IS_ERR(info->pbl_table)) { + rc = PTR_ERR(info->pbl_table); + goto done; + } + + DP_DEBUG(dev, QEDR_MSG_MR, "pbl_table_pa = %pa\n", + &info->pbl_table->pa); + + /* in usual case we use 2 PBLs, so we add one to free + * list and allocating another one + */ + tmp = qedr_alloc_pbl_tbl(dev, &info->pbl_info, GFP_KERNEL); + if (IS_ERR(tmp)) { + DP_DEBUG(dev, QEDR_MSG_MR, "Extra PBL is not allocated\n"); + goto done; + } + + list_add_tail(&tmp->list_entry, &info->free_pbl_list); + + DP_DEBUG(dev, QEDR_MSG_MR, "extra pbl_table_pa = %pa\n", &tmp->pa); + +done: + if (rc) + free_mr_info(dev, info); + + return rc; +} + +struct ib_mr *qedr_reg_user_mr(struct ib_pd 
*ibpd, u64 start, u64 len, + u64 usr_addr, int acc, struct ib_udata *udata) +{ + struct qedr_dev *dev = get_qedr_dev(ibpd->device); + struct qedr_mr *mr; + struct qedr_pd *pd; + int rc = -ENOMEM; + + pd = get_qedr_pd(ibpd); + DP_DEBUG(dev, QEDR_MSG_MR, + "qedr_register user mr pd = %d start = %lld, len = %lld, usr_addr = %lld, acc = %d\n", + pd->pd_id, start, len, usr_addr, acc); + + if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(rc); + + mr->type = QEDR_MR_USER; + + mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0); + if (IS_ERR(mr->umem)) { + rc = -EFAULT; + goto err0; + } + + rc = init_mr_info(dev, &mr->info, ib_umem_page_count(mr->umem), 1); + if (rc) + goto err1; + + qedr_populate_pbls(dev, mr->umem, mr->info.pbl_table, + &mr->info.pbl_info, mr->umem->page_shift); + + rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid); + if (rc) { + DP_ERR(dev, "roce alloc tid returned an error %d\n", rc); + goto err1; + } + + /* Index only, 18 bit long, lkey = itid << 8 | key */ + mr->hw_mr.tid_type = QED_RDMA_TID_REGISTERED_MR; + mr->hw_mr.key = 0; + mr->hw_mr.pd = pd->pd_id; + mr->hw_mr.local_read = 1; + mr->hw_mr.local_write = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hw_mr.remote_read = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hw_mr.remote_write = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hw_mr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + mr->hw_mr.mw_bind = false; + mr->hw_mr.pbl_ptr = mr->info.pbl_table[0].pa; + mr->hw_mr.pbl_two_level = mr->info.pbl_info.two_layered; + mr->hw_mr.pbl_page_size_log = ilog2(mr->info.pbl_info.pbl_size); + mr->hw_mr.page_size_log = mr->umem->page_shift; + mr->hw_mr.fbo = ib_umem_offset(mr->umem); + mr->hw_mr.length = len; + mr->hw_mr.vaddr = usr_addr; + mr->hw_mr.zbva = false; + mr->hw_mr.phy_mr = false; + mr->hw_mr.dma_mr = false; + + rc = dev->ops->rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); + if (rc) { + DP_ERR(dev, "roce register tid returned an error %d\n", rc); + goto err2; + } + + mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; + if (mr->hw_mr.remote_write || mr->hw_mr.remote_read || + mr->hw_mr.remote_atomic) + mr->ibmr.rkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; + + DP_DEBUG(dev, QEDR_MSG_MR, "register user mr lkey: %x\n", + mr->ibmr.lkey); + return &mr->ibmr; + +err2: + dev->ops->rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); +err1: + qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); +err0: + kfree(mr); + return ERR_PTR(rc); +} + +int qedr_dereg_mr(struct ib_mr *ib_mr) +{ + struct qedr_mr *mr = get_qedr_mr(ib_mr); + struct qedr_dev *dev = get_qedr_dev(ib_mr->device); + int rc = 0; + + rc = dev->ops->rdma_deregister_tid(dev->rdma_ctx, mr->hw_mr.itid); + if (rc) + return rc; + + dev->ops->rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); + + if ((mr->type != QEDR_MR_DMA) && (mr->type != QEDR_MR_FRMR)) + qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); + + /* it could be user registered memory. 
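+ * (kernel DMA and FRMR registrations never have a umem attached, hence the NULL check below).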
*/ + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr); + + return rc; +} + +static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, + int max_page_list_len) +{ + struct qedr_pd *pd = get_qedr_pd(ibpd); + struct qedr_dev *dev = get_qedr_dev(ibpd->device); + struct qedr_mr *mr; + int rc = -ENOMEM; + + DP_DEBUG(dev, QEDR_MSG_MR, + "qedr_alloc_frmr pd = %d max_page_list_len= %d\n", pd->pd_id, + max_page_list_len); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(rc); + + mr->dev = dev; + mr->type = QEDR_MR_FRMR; + + rc = init_mr_info(dev, &mr->info, max_page_list_len, 1); + if (rc) + goto err0; + + rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid); + if (rc) { + DP_ERR(dev, "roce alloc tid returned an error %d\n", rc); + goto err0; + } + + /* Index only, 18 bit long, lkey = itid << 8 | key */ + mr->hw_mr.tid_type = QED_RDMA_TID_FMR; + mr->hw_mr.key = 0; + mr->hw_mr.pd = pd->pd_id; + mr->hw_mr.local_read = 1; + mr->hw_mr.local_write = 0; + mr->hw_mr.remote_read = 0; + mr->hw_mr.remote_write = 0; + mr->hw_mr.remote_atomic = 0; + mr->hw_mr.mw_bind = false; + mr->hw_mr.pbl_ptr = 0; + mr->hw_mr.pbl_two_level = mr->info.pbl_info.two_layered; + mr->hw_mr.pbl_page_size_log = ilog2(mr->info.pbl_info.pbl_size); + mr->hw_mr.fbo = 0; + mr->hw_mr.length = 0; + mr->hw_mr.vaddr = 0; + mr->hw_mr.zbva = false; + mr->hw_mr.phy_mr = true; + mr->hw_mr.dma_mr = false; + + rc = dev->ops->rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); + if (rc) { + DP_ERR(dev, "roce register tid returned an error %d\n", rc); + goto err1; + } + + mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; + mr->ibmr.rkey = mr->ibmr.lkey; + + DP_DEBUG(dev, QEDR_MSG_MR, "alloc frmr: %x\n", mr->ibmr.lkey); + return mr; + +err1: + dev->ops->rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); +err0: + kfree(mr); + return ERR_PTR(rc); +} + +struct ib_mr *qedr_alloc_mr(struct ib_pd *ibpd, + enum ib_mr_type mr_type, u32 max_num_sg) +{ + struct qedr_mr *mr; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = __qedr_alloc_mr(ibpd, max_num_sg); + + if (IS_ERR(mr)) + return ERR_PTR(-EINVAL); + + return &mr->ibmr; +} + +static int qedr_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct qedr_mr *mr = get_qedr_mr(ibmr); + struct qedr_pbl *pbl_table; + struct regpair *pbe; + u32 pbes_in_page; + + if (unlikely(mr->npages == mr->info.pbl_info.num_pbes)) { + DP_ERR(mr->dev, "qedr_set_page failes when %d\n", mr->npages); + return -ENOMEM; + } + + DP_DEBUG(mr->dev, QEDR_MSG_MR, "qedr_set_page pages[%d] = 0x%llx\n", + mr->npages, addr); + + pbes_in_page = mr->info.pbl_info.pbl_size / sizeof(u64); + pbl_table = mr->info.pbl_table + (mr->npages / pbes_in_page); + pbe = (struct regpair *)pbl_table->va; + pbe += mr->npages % pbes_in_page; + pbe->lo = cpu_to_le32((u32)addr); + pbe->hi = cpu_to_le32((u32)upper_32_bits(addr)); + + mr->npages++; + + return 0; +} + +static void handle_completed_mrs(struct qedr_dev *dev, struct mr_info *info) +{ + int work = info->completed - info->completed_handled - 1; + + DP_DEBUG(dev, QEDR_MSG_MR, "Special FMR work = %d\n", work); + while (work-- > 0 && !list_empty(&info->inuse_pbl_list)) { + struct qedr_pbl *pbl; + + /* Free all the page list that are possible to be freed + * (all the ones that were invalidated), under the assumption + * that if an FMR was completed successfully that means that + * if there was an invalidate operation before it also ended + */ + pbl = list_first_entry(&info->inuse_pbl_list, + struct qedr_pbl, list_entry); + list_move_tail(&pbl->list_entry, 
&info->free_pbl_list); + info->completed_handled++; + } +} + +int qedr_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct qedr_mr *mr = get_qedr_mr(ibmr); + + mr->npages = 0; + + handle_completed_mrs(mr->dev, &mr->info); + return ib_sg_to_pages(ibmr, sg, sg_nents, NULL, qedr_set_page); +} + +struct ib_mr *qedr_get_dma_mr(struct ib_pd *ibpd, int acc) +{ + struct qedr_dev *dev = get_qedr_dev(ibpd->device); + struct qedr_pd *pd = get_qedr_pd(ibpd); + struct qedr_mr *mr; + int rc; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->type = QEDR_MR_DMA; + + rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid); + if (rc) { + DP_ERR(dev, "roce alloc tid returned an error %d\n", rc); + goto err1; + } + + /* index only, 18 bit long, lkey = itid << 8 | key */ + mr->hw_mr.tid_type = QED_RDMA_TID_REGISTERED_MR; + mr->hw_mr.pd = pd->pd_id; + mr->hw_mr.local_read = 1; + mr->hw_mr.local_write = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; + mr->hw_mr.remote_read = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; + mr->hw_mr.remote_write = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + mr->hw_mr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; + mr->hw_mr.dma_mr = true; + + rc = dev->ops->rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); + if (rc) { + DP_ERR(dev, "roce register tid returned an error %d\n", rc); + goto err2; + } + + mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; + if (mr->hw_mr.remote_write || mr->hw_mr.remote_read || + mr->hw_mr.remote_atomic) + mr->ibmr.rkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; + + DP_DEBUG(dev, QEDR_MSG_MR, "get dma mr: lkey = %x\n", mr->ibmr.lkey); + return &mr->ibmr; + +err2: + dev->ops->rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); +err1: + kfree(mr); + return ERR_PTR(rc); +} + +static inline int qedr_wq_is_full(struct qedr_qp_hwq_info *wq) +{ + return (((wq->prod + 1) % wq->max_wr) == wq->cons); +} + +static int sge_data_len(struct ib_sge *sg_list, int num_sge) +{ + int i, len = 0; + + for (i = 0; i < num_sge; i++) + len += sg_list[i].length; + + return len; +} + +static void swap_wqe_data64(u64 *p) +{ + int i; + + for (i = 0; i < QEDR_SQE_ELEMENT_SIZE / sizeof(u64); i++, p++) + *p = cpu_to_be64(cpu_to_le64(*p)); +} + +static u32 qedr_prepare_sq_inline_data(struct qedr_dev *dev, + struct qedr_qp *qp, u8 *wqe_size, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr, u8 *bits, + u8 bit) +{ + u32 data_size = sge_data_len(wr->sg_list, wr->num_sge); + char *seg_prt, *wqe; + int i, seg_siz; + + if (data_size > ROCE_REQ_MAX_INLINE_DATA_SIZE) { + DP_ERR(dev, "Too much inline data in WR: %d\n", data_size); + *bad_wr = wr; + return 0; + } + + if (!data_size) + return data_size; + + *bits |= bit; + + seg_prt = NULL; + wqe = NULL; + seg_siz = 0; + + /* Copy data inline */ + for (i = 0; i < wr->num_sge; i++) { + u32 len = wr->sg_list[i].length; + void *src = (void *)(uintptr_t)wr->sg_list[i].addr; + + while (len > 0) { + u32 cur; + + /* New segment required */ + if (!seg_siz) { + wqe = (char *)qed_chain_produce(&qp->sq.pbl); + seg_prt = wqe; + seg_siz = sizeof(struct rdma_sq_common_wqe); + (*wqe_size)++; + } + + /* Calculate currently allowed length */ + cur = min_t(u32, len, seg_siz); + memcpy(seg_prt, src, cur); + + /* Update segment variables */ + seg_prt += cur; + seg_siz -= cur; + + /* Update sge variables */ + src += cur; + len -= cur; + + /* Swap fully-completed segments */ + if (!seg_siz) + swap_wqe_data64((u64 *)wqe); + } + } + + /* swap last not completed segment */ + if 
(seg_siz) + swap_wqe_data64((u64 *)wqe); + + return data_size; +} + +#define RQ_SGE_SET(sge, vaddr, vlength, vflags) \ + do { \ + DMA_REGPAIR_LE(sge->addr, vaddr); \ + (sge)->length = cpu_to_le32(vlength); \ + (sge)->flags = cpu_to_le32(vflags); \ + } while (0) + +#define SRQ_HDR_SET(hdr, vwr_id, num_sge) \ + do { \ + DMA_REGPAIR_LE(hdr->wr_id, vwr_id); \ + (hdr)->num_sges = num_sge; \ + } while (0) + +#define SRQ_SGE_SET(sge, vaddr, vlength, vlkey) \ + do { \ + DMA_REGPAIR_LE(sge->addr, vaddr); \ + (sge)->length = cpu_to_le32(vlength); \ + (sge)->l_key = cpu_to_le32(vlkey); \ + } while (0) + +static u32 qedr_prepare_sq_sges(struct qedr_qp *qp, u8 *wqe_size, + struct ib_send_wr *wr) +{ + u32 data_size = 0; + int i; + + for (i = 0; i < wr->num_sge; i++) { + struct rdma_sq_sge *sge = qed_chain_produce(&qp->sq.pbl); + + DMA_REGPAIR_LE(sge->addr, wr->sg_list[i].addr); + sge->l_key = cpu_to_le32(wr->sg_list[i].lkey); + sge->length = cpu_to_le32(wr->sg_list[i].length); + data_size += wr->sg_list[i].length; + } + + if (wqe_size) + *wqe_size += wr->num_sge; + + return data_size; +} + +static u32 qedr_prepare_sq_rdma_data(struct qedr_dev *dev, + struct qedr_qp *qp, + struct rdma_sq_rdma_wqe_1st *rwqe, + struct rdma_sq_rdma_wqe_2nd *rwqe2, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + rwqe2->r_key = cpu_to_le32(rdma_wr(wr)->rkey); + DMA_REGPAIR_LE(rwqe2->remote_va, rdma_wr(wr)->remote_addr); + + if (wr->send_flags & IB_SEND_INLINE && + (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE)) { + u8 flags = 0; + + SET_FIELD2(flags, RDMA_SQ_RDMA_WQE_1ST_INLINE_FLG, 1); + return qedr_prepare_sq_inline_data(dev, qp, &rwqe->wqe_size, wr, + bad_wr, &rwqe->flags, flags); + } + + return qedr_prepare_sq_sges(qp, &rwqe->wqe_size, wr); +} + +static u32 qedr_prepare_sq_send_data(struct qedr_dev *dev, + struct qedr_qp *qp, + struct rdma_sq_send_wqe_1st *swqe, + struct rdma_sq_send_wqe_2st *swqe2, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + memset(swqe2, 0, sizeof(*swqe2)); + if (wr->send_flags & IB_SEND_INLINE) { + u8 flags = 0; + + SET_FIELD2(flags, RDMA_SQ_SEND_WQE_INLINE_FLG, 1); + return qedr_prepare_sq_inline_data(dev, qp, &swqe->wqe_size, wr, + bad_wr, &swqe->flags, flags); + } + + return qedr_prepare_sq_sges(qp, &swqe->wqe_size, wr); +} + +static int qedr_prepare_reg(struct qedr_qp *qp, + struct rdma_sq_fmr_wqe_1st *fwqe1, + struct ib_reg_wr *wr) +{ + struct qedr_mr *mr = get_qedr_mr(wr->mr); + struct rdma_sq_fmr_wqe_2nd *fwqe2; + + fwqe2 = (struct rdma_sq_fmr_wqe_2nd *)qed_chain_produce(&qp->sq.pbl); + fwqe1->addr.hi = upper_32_bits(mr->ibmr.iova); + fwqe1->addr.lo = lower_32_bits(mr->ibmr.iova); + fwqe1->l_key = wr->key; + + fwqe2->access_ctrl = 0; + + SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_2ND_REMOTE_READ, + !!(wr->access & IB_ACCESS_REMOTE_READ)); + SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_2ND_REMOTE_WRITE, + !!(wr->access & IB_ACCESS_REMOTE_WRITE)); + SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_2ND_ENABLE_ATOMIC, + !!(wr->access & IB_ACCESS_REMOTE_ATOMIC)); + SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_2ND_LOCAL_READ, 1); + SET_FIELD2(fwqe2->access_ctrl, RDMA_SQ_FMR_WQE_2ND_LOCAL_WRITE, + !!(wr->access & IB_ACCESS_LOCAL_WRITE)); + fwqe2->fmr_ctrl = 0; + + SET_FIELD2(fwqe2->fmr_ctrl, RDMA_SQ_FMR_WQE_2ND_PAGE_SIZE_LOG, + ilog2(mr->ibmr.page_size) - 12); + + fwqe2->length_hi = 0; + fwqe2->length_lo = mr->ibmr.length; + fwqe2->pbl_addr.hi = upper_32_bits(mr->info.pbl_table->pa); + fwqe2->pbl_addr.lo = 
lower_32_bits(mr->info.pbl_table->pa); + + qp->wqe_wr_id[qp->sq.prod].mr = mr; + + return 0; +} + +static enum ib_wc_opcode qedr_ib_to_wc_opcode(enum ib_wr_opcode opcode) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + return IB_WC_RDMA_WRITE; + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + return IB_WC_SEND; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + return IB_WC_RDMA_READ; + case IB_WR_ATOMIC_CMP_AND_SWP: + return IB_WC_COMP_SWAP; + case IB_WR_ATOMIC_FETCH_AND_ADD: + return IB_WC_FETCH_ADD; + case IB_WR_REG_MR: + return IB_WC_REG_MR; + case IB_WR_LOCAL_INV: + return IB_WC_LOCAL_INV; + default: + return IB_WC_SEND; + } +} + +static inline bool qedr_can_post_send(struct qedr_qp *qp, struct ib_send_wr *wr) +{ + int wq_is_full, err_wr, pbl_is_full; + struct qedr_dev *dev = qp->dev; + + /* prevent SQ overflow and/or processing of a bad WR */ + err_wr = wr->num_sge > qp->sq.max_sges; + wq_is_full = qedr_wq_is_full(&qp->sq); + pbl_is_full = qed_chain_get_elem_left_u32(&qp->sq.pbl) < + QEDR_MAX_SQE_ELEMENTS_PER_SQE; + if (wq_is_full || err_wr || pbl_is_full) { + if (wq_is_full && !(qp->err_bitmap & QEDR_QP_ERR_SQ_FULL)) { + DP_ERR(dev, + "error: WQ is full. Post send on QP %p failed (this error appears only once)\n", + qp); + qp->err_bitmap |= QEDR_QP_ERR_SQ_FULL; + } + + if (err_wr && !(qp->err_bitmap & QEDR_QP_ERR_BAD_SR)) { + DP_ERR(dev, + "error: WR is bad. Post send on QP %p failed (this error appears only once)\n", + qp); + qp->err_bitmap |= QEDR_QP_ERR_BAD_SR; + } + + if (pbl_is_full && + !(qp->err_bitmap & QEDR_QP_ERR_SQ_PBL_FULL)) { + DP_ERR(dev, + "error: WQ PBL is full. Post send on QP %p failed (this error appears only once)\n", + qp); + qp->err_bitmap |= QEDR_QP_ERR_SQ_PBL_FULL; + } + return false; + } + return true; +} + +static int __qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct qedr_dev *dev = get_qedr_dev(ibqp->device); + struct qedr_qp *qp = get_qedr_qp(ibqp); + struct rdma_sq_atomic_wqe_1st *awqe1; + struct rdma_sq_atomic_wqe_2nd *awqe2; + struct rdma_sq_atomic_wqe_3rd *awqe3; + struct rdma_sq_send_wqe_2st *swqe2; + struct rdma_sq_local_inv_wqe *iwqe; + struct rdma_sq_rdma_wqe_2nd *rwqe2; + struct rdma_sq_send_wqe_1st *swqe; + struct rdma_sq_rdma_wqe_1st *rwqe; + struct rdma_sq_fmr_wqe_1st *fwqe1; + struct rdma_sq_common_wqe *wqe; + u32 length; + int rc = 0; + bool comp; + + if (!qedr_can_post_send(qp, wr)) { + *bad_wr = wr; + return -ENOMEM; + } + + wqe = qed_chain_produce(&qp->sq.pbl); + qp->wqe_wr_id[qp->sq.prod].signaled = + !!(wr->send_flags & IB_SEND_SIGNALED) || qp->signaled; + + wqe->flags = 0; + SET_FIELD2(wqe->flags, RDMA_SQ_SEND_WQE_SE_FLG, + !!(wr->send_flags & IB_SEND_SOLICITED)); + comp = (!!(wr->send_flags & IB_SEND_SIGNALED)) || qp->signaled; + SET_FIELD2(wqe->flags, RDMA_SQ_SEND_WQE_COMP_FLG, comp); + SET_FIELD2(wqe->flags, RDMA_SQ_SEND_WQE_RD_FENCE_FLG, + !!(wr->send_flags & IB_SEND_FENCE)); + wqe->prev_wqe_size = qp->prev_wqe_size; + + qp->wqe_wr_id[qp->sq.prod].opcode = qedr_ib_to_wc_opcode(wr->opcode); + + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + if (unlikely(rdma_protocol_iwarp(&dev->ibdev, 1))) { + rc = -EINVAL; + *bad_wr = wr; + break; + } + wqe->req_type = RDMA_SQ_REQ_TYPE_SEND_WITH_IMM; + swqe = (struct rdma_sq_send_wqe_1st *)wqe; + swqe->wqe_size = 2; + swqe2 = qed_chain_produce(&qp->sq.pbl); + + swqe->inv_key_or_imm_data = cpu_to_le32(be32_to_cpu(wr->ex.imm_data)); + length = qedr_prepare_sq_send_data(dev, 
qp, swqe, swqe2, + wr, bad_wr); + swqe->length = cpu_to_le32(length); + qp->wqe_wr_id[qp->sq.prod].wqe_size = swqe->wqe_size; + qp->prev_wqe_size = swqe->wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = swqe->length; + break; + case IB_WR_SEND: + wqe->req_type = RDMA_SQ_REQ_TYPE_SEND; + swqe = (struct rdma_sq_send_wqe_1st *)wqe; + + swqe->wqe_size = 2; + swqe2 = qed_chain_produce(&qp->sq.pbl); + length = qedr_prepare_sq_send_data(dev, qp, swqe, swqe2, + wr, bad_wr); + swqe->length = cpu_to_le32(length); + qp->wqe_wr_id[qp->sq.prod].wqe_size = swqe->wqe_size; + qp->prev_wqe_size = swqe->wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = swqe->length; + break; + case IB_WR_SEND_WITH_INV: + wqe->req_type = RDMA_SQ_REQ_TYPE_SEND_WITH_INVALIDATE; + swqe = (struct rdma_sq_send_wqe_1st *)wqe; + swqe2 = qed_chain_produce(&qp->sq.pbl); + swqe->wqe_size = 2; + swqe->inv_key_or_imm_data = cpu_to_le32(wr->ex.invalidate_rkey); + length = qedr_prepare_sq_send_data(dev, qp, swqe, swqe2, + wr, bad_wr); + swqe->length = cpu_to_le32(length); + qp->wqe_wr_id[qp->sq.prod].wqe_size = swqe->wqe_size; + qp->prev_wqe_size = swqe->wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = swqe->length; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(rdma_protocol_iwarp(&dev->ibdev, 1))) { + rc = -EINVAL; + *bad_wr = wr; + break; + } + wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR_WITH_IMM; + rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; + + rwqe->wqe_size = 2; + rwqe->imm_data = htonl(cpu_to_le32(wr->ex.imm_data)); + rwqe2 = qed_chain_produce(&qp->sq.pbl); + length = qedr_prepare_sq_rdma_data(dev, qp, rwqe, rwqe2, + wr, bad_wr); + rwqe->length = cpu_to_le32(length); + qp->wqe_wr_id[qp->sq.prod].wqe_size = rwqe->wqe_size; + qp->prev_wqe_size = rwqe->wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = rwqe->length; + break; + case IB_WR_RDMA_WRITE: + wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR; + rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; + + rwqe->wqe_size = 2; + rwqe2 = qed_chain_produce(&qp->sq.pbl); + length = qedr_prepare_sq_rdma_data(dev, qp, rwqe, rwqe2, + wr, bad_wr); + rwqe->length = cpu_to_le32(length); + qp->wqe_wr_id[qp->sq.prod].wqe_size = rwqe->wqe_size; + qp->prev_wqe_size = rwqe->wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = rwqe->length; + break; + case IB_WR_RDMA_READ_WITH_INV: + SET_FIELD2(wqe->flags, RDMA_SQ_RDMA_WQE_1ST_READ_INV_FLG, 1); + /* fallthrough -- same is identical to RDMA READ */ + + case IB_WR_RDMA_READ: + wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_RD; + rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; + + rwqe->wqe_size = 2; + rwqe2 = qed_chain_produce(&qp->sq.pbl); + length = qedr_prepare_sq_rdma_data(dev, qp, rwqe, rwqe2, + wr, bad_wr); + rwqe->length = cpu_to_le32(length); + qp->wqe_wr_id[qp->sq.prod].wqe_size = rwqe->wqe_size; + qp->prev_wqe_size = rwqe->wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = rwqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + awqe1 = (struct rdma_sq_atomic_wqe_1st *)wqe; + awqe1->wqe_size = 4; + + awqe2 = qed_chain_produce(&qp->sq.pbl); + DMA_REGPAIR_LE(awqe2->remote_va, atomic_wr(wr)->remote_addr); + awqe2->r_key = cpu_to_le32(atomic_wr(wr)->rkey); + + awqe3 = qed_chain_produce(&qp->sq.pbl); + + if (wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_ADD; + DMA_REGPAIR_LE(awqe3->swap_data, + atomic_wr(wr)->compare_add); + } else { + wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_CMP_AND_SWAP; + DMA_REGPAIR_LE(awqe3->swap_data, + atomic_wr(wr)->swap); + DMA_REGPAIR_LE(awqe3->cmp_data, + 
atomic_wr(wr)->compare_add); + } + + qedr_prepare_sq_sges(qp, NULL, wr); + + qp->wqe_wr_id[qp->sq.prod].wqe_size = awqe1->wqe_size; + qp->prev_wqe_size = awqe1->wqe_size; + break; + + case IB_WR_LOCAL_INV: + iwqe = (struct rdma_sq_local_inv_wqe *)wqe; + iwqe->wqe_size = 1; + + iwqe->req_type = RDMA_SQ_REQ_TYPE_LOCAL_INVALIDATE; + iwqe->inv_l_key = wr->ex.invalidate_rkey; + qp->wqe_wr_id[qp->sq.prod].wqe_size = iwqe->wqe_size; + qp->prev_wqe_size = iwqe->wqe_size; + break; + case IB_WR_REG_MR: + DP_DEBUG(dev, QEDR_MSG_CQ, "REG_MR\n"); + wqe->req_type = RDMA_SQ_REQ_TYPE_FAST_MR; + fwqe1 = (struct rdma_sq_fmr_wqe_1st *)wqe; + fwqe1->wqe_size = 2; + + rc = qedr_prepare_reg(qp, fwqe1, reg_wr(wr)); + if (rc) { + DP_ERR(dev, "IB_REG_MR failed rc=%d\n", rc); + *bad_wr = wr; + break; + } + + qp->wqe_wr_id[qp->sq.prod].wqe_size = fwqe1->wqe_size; + qp->prev_wqe_size = fwqe1->wqe_size; + break; + default: + DP_ERR(dev, "invalid opcode 0x%x!\n", wr->opcode); + rc = -EINVAL; + *bad_wr = wr; + break; + } + + if (*bad_wr) { + u16 value; + + /* Restore prod to its position before + * this WR was processed + */ + value = le16_to_cpu(qp->sq.db_data.data.value); + qed_chain_set_prod(&qp->sq.pbl, value, wqe); + + /* Restore prev_wqe_size */ + qp->prev_wqe_size = wqe->prev_wqe_size; + rc = -EINVAL; + DP_ERR(dev, "POST SEND FAILED\n"); + } + + return rc; +} + +int qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct qedr_dev *dev = get_qedr_dev(ibqp->device); + struct qedr_qp *qp = get_qedr_qp(ibqp); + unsigned long flags; + int rc = 0; + + *bad_wr = NULL; + + if (qp->qp_type == IB_QPT_GSI) + return qedr_gsi_post_send(ibqp, wr, bad_wr); + + spin_lock_irqsave(&qp->q_lock, flags); + + if (rdma_protocol_roce(&dev->ibdev, 1)) { + if ((qp->state != QED_ROCE_QP_STATE_RTS) && + (qp->state != QED_ROCE_QP_STATE_ERR) && + (qp->state != QED_ROCE_QP_STATE_SQD)) { + spin_unlock_irqrestore(&qp->q_lock, flags); + *bad_wr = wr; + DP_DEBUG(dev, QEDR_MSG_CQ, + "QP in wrong state! QP icid=0x%x state %d\n", + qp->icid, qp->state); + return -EINVAL; + } + } + + while (wr) { + rc = __qedr_post_send(ibqp, wr, bad_wr); + if (rc) + break; + + qp->wqe_wr_id[qp->sq.prod].wr_id = wr->wr_id; + + qedr_inc_sw_prod(&qp->sq); + + qp->sq.db_data.data.value++; + + wr = wr->next; + } + + /* Trigger doorbell + * If there was a failure in the first WR then it will be triggered in + * vain. However this is not harmful (as long as the producer value is + * unchanged). For performance reasons we avoid checking for this + * redundant doorbell. + * + * qp->wqe_wr_id is accessed during qedr_poll_cq, as + * soon as we give the doorbell, we could get a completion + * for this wr, therefore we need to make sure that the + * memory is updated before giving the doorbell. + * During qedr_poll_cq, rmb is called before accessing the + * cqe. This covers for the smp_rmb as well.
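 + * (Note: per the statement above, the rmb() done in qedr_poll_cq before + * reading the CQE is what pairs with this smp_wmb(), so no separate + * smp_rmb() is needed on the polling side.)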
+ */ + smp_wmb(); + writel(qp->sq.db_data.raw, qp->sq.db); + + /* Make sure write sticks */ + mmiowb(); + + spin_unlock_irqrestore(&qp->q_lock, flags); + + return rc; +} + +int qedr_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct qedr_qp *qp = get_qedr_qp(ibqp); + struct qedr_dev *dev = qp->dev; + unsigned long flags; + int status = 0; + + if (qp->qp_type == IB_QPT_GSI) + return qedr_gsi_post_recv(ibqp, wr, bad_wr); + + spin_lock_irqsave(&qp->q_lock, flags); + + if (qp->state == QED_ROCE_QP_STATE_RESET) { + spin_unlock_irqrestore(&qp->q_lock, flags); + *bad_wr = wr; + return -EINVAL; + } + + while (wr) { + int i; + + if (qed_chain_get_elem_left_u32(&qp->rq.pbl) < + QEDR_MAX_RQE_ELEMENTS_PER_RQE || + wr->num_sge > qp->rq.max_sges) { + DP_ERR(dev, "Can't post WR (%d < %d) || (%d > %d)\n", + qed_chain_get_elem_left_u32(&qp->rq.pbl), + QEDR_MAX_RQE_ELEMENTS_PER_RQE, wr->num_sge, + qp->rq.max_sges); + status = -ENOMEM; + *bad_wr = wr; + break; + } + for (i = 0; i < wr->num_sge; i++) { + u32 flags = 0; + struct rdma_rq_sge *rqe = + qed_chain_produce(&qp->rq.pbl); + + /* First one must include the number + * of SGE in the list + */ + if (!i) + SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES, + wr->num_sge); + + SET_FIELD(flags, RDMA_RQ_SGE_L_KEY, + wr->sg_list[i].lkey); + + RQ_SGE_SET(rqe, wr->sg_list[i].addr, + wr->sg_list[i].length, flags); + } + + /* Special case of no sges. FW requires between 1-4 sges... + * in this case we need to post 1 sge with length zero. This is + * because an rdma write with immediate consumes an RQ entry. + */ + if (!wr->num_sge) { + u32 flags = 0; + struct rdma_rq_sge *rqe = + qed_chain_produce(&qp->rq.pbl); + + /* First one must include the number + * of SGE in the list + */ + SET_FIELD(flags, RDMA_RQ_SGE_L_KEY, 0); + SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES, 1); + + RQ_SGE_SET(rqe, 0, 0, flags); + i = 1; + } + + qp->rqe_wr_id[qp->rq.prod].wr_id = wr->wr_id; + qp->rqe_wr_id[qp->rq.prod].wqe_size = i; + + qedr_inc_sw_prod(&qp->rq); + + /* qp->rqe_wr_id is accessed during qedr_poll_cq, as + * soon as we give the doorbell, we could get a completion + * for this wr, therefore we need to make sure that the + * memory is updated before giving the doorbell. + * During qedr_poll_cq, rmb is called before accessing the + * cqe. This covers for the smp_rmb as well.
+ */ + smp_wmb(); + + qp->rq.db_data.data.value++; + + writel(qp->rq.db_data.raw, qp->rq.db); + + /* Make sure write sticks */ + mmiowb(); + + if (rdma_protocol_iwarp(&dev->ibdev, 1)) { + writel(qp->rq.iwarp_db2_data.raw, qp->rq.iwarp_db2); + mmiowb(); /* for second doorbell */ + } + + wr = wr->next; + } + + spin_unlock_irqrestore(&qp->q_lock, flags); + + return status; +} + +static int is_valid_cqe(struct qedr_cq *cq, union rdma_cqe *cqe) +{ + struct rdma_cqe_requester *resp_cqe = &cqe->req; + + return (resp_cqe->flags & RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK) == + cq->pbl_toggle; +} + +static struct qedr_qp *cqe_get_qp(union rdma_cqe *cqe) +{ + struct rdma_cqe_requester *resp_cqe = &cqe->req; + struct qedr_qp *qp; + + qp = (struct qedr_qp *)(uintptr_t)HILO_GEN(resp_cqe->qp_handle.hi, + resp_cqe->qp_handle.lo, + u64); + return qp; +} + +static enum rdma_cqe_type cqe_get_type(union rdma_cqe *cqe) +{ + struct rdma_cqe_requester *resp_cqe = &cqe->req; + + return GET_FIELD(resp_cqe->flags, RDMA_CQE_REQUESTER_TYPE); +} + +/* Return latest CQE (needs processing) */ +static union rdma_cqe *get_cqe(struct qedr_cq *cq) +{ + return cq->latest_cqe; +} + +/* For an FMR we need to increment the 'completed' counter that the FMR + * algorithm (see handle_completed_mrs()) uses to decide whether a PBL can + * be freed or not. This must be done whether or not the work request was + * signaled, so we call this function from the condition that checks if a + * WR should be skipped, to make sure we don't miss it (this FMR operation + * may not have been signaled). + */ +static inline void qedr_chk_if_fmr(struct qedr_qp *qp) +{ + if (qp->wqe_wr_id[qp->sq.cons].opcode == IB_WC_REG_MR) + qp->wqe_wr_id[qp->sq.cons].mr->info.completed++; +} + +static int process_req(struct qedr_dev *dev, struct qedr_qp *qp, + struct qedr_cq *cq, int num_entries, + struct ib_wc *wc, u16 hw_cons, enum ib_wc_status status, + int force) +{ + u16 cnt = 0; + + while (num_entries && qp->sq.wqe_cons != hw_cons) { + if (!qp->wqe_wr_id[qp->sq.cons].signaled && !force) { + qedr_chk_if_fmr(qp); + /* skip WC */ + goto next_cqe; + } + + /* fill WC */ + wc->status = status; + wc->vendor_err = 0; + wc->wc_flags = 0; + wc->src_qp = qp->id; + wc->qp = &qp->ibqp; + + wc->wr_id = qp->wqe_wr_id[qp->sq.cons].wr_id; + wc->opcode = qp->wqe_wr_id[qp->sq.cons].opcode; + + switch (wc->opcode) { + case IB_WC_RDMA_WRITE: + wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len; + break; + case IB_WC_COMP_SWAP: + case IB_WC_FETCH_ADD: + wc->byte_len = 8; + break; + case IB_WC_REG_MR: + qp->wqe_wr_id[qp->sq.cons].mr->info.completed++; + break; + case IB_WC_RDMA_READ: + case IB_WC_SEND: + wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len; + break; + default: + break; + } + + num_entries--; + wc++; + cnt++; +next_cqe: + while (qp->wqe_wr_id[qp->sq.cons].wqe_size--) + qed_chain_consume(&qp->sq.pbl); + qedr_inc_sw_cons(&qp->sq); + } + + return cnt; +} + +static int qedr_poll_cq_req(struct qedr_dev *dev, + struct qedr_qp *qp, struct qedr_cq *cq, + int num_entries, struct ib_wc *wc, + struct rdma_cqe_requester *req) +{ + int cnt = 0; + + switch (req->status) { + case RDMA_CQE_REQ_STS_OK: + cnt = process_req(dev, qp, cq, num_entries, wc, req->sq_cons, + IB_WC_SUCCESS, 0); + break; + case RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR: + if (qp->state != QED_ROCE_QP_STATE_ERR) + DP_DEBUG(dev, QEDR_MSG_CQ, + "Error: POLL CQ with RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR.
CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + cnt = process_req(dev, qp, cq, num_entries, wc, req->sq_cons, + IB_WC_WR_FLUSH_ERR, 1); + break; + default: + /* process all WQE before the cosumer */ + qp->state = QED_ROCE_QP_STATE_ERR; + cnt = process_req(dev, qp, cq, num_entries, wc, + req->sq_cons - 1, IB_WC_SUCCESS, 0); + wc += cnt; + /* if we have extra WC fill it with actual error info */ + if (cnt < num_entries) { + enum ib_wc_status wc_status; + + switch (req->status) { + case RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR: + DP_ERR(dev, + "Error: POLL CQ with RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_BAD_RESP_ERR; + break; + case RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR: + DP_ERR(dev, + "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_LOC_LEN_ERR; + break; + case RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR: + DP_ERR(dev, + "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_LOC_QP_OP_ERR; + break; + case RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR: + DP_ERR(dev, + "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_LOC_PROT_ERR; + break; + case RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR: + DP_ERR(dev, + "Error: POLL CQ with RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_MW_BIND_ERR; + break; + case RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR: + DP_ERR(dev, + "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_REM_INV_REQ_ERR; + break; + case RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR: + DP_ERR(dev, + "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_REM_ACCESS_ERR; + break; + case RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR: + DP_ERR(dev, + "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_REM_OP_ERR; + break; + case RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR: + DP_ERR(dev, + "Error: POLL CQ with RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR: + DP_ERR(dev, + "Error: POLL CQ with ROCE_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_RETRY_EXC_ERR; + break; + default: + DP_ERR(dev, + "Error: POLL CQ with IB_WC_GENERAL_ERR. 
CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); + wc_status = IB_WC_GENERAL_ERR; + } + cnt += process_req(dev, qp, cq, 1, wc, req->sq_cons, + wc_status, 1); + } + } + + return cnt; +} + +static inline int qedr_cqe_resp_status_to_ib(u8 status) +{ + switch (status) { + case RDMA_CQE_RESP_STS_LOCAL_ACCESS_ERR: + return IB_WC_LOC_ACCESS_ERR; + case RDMA_CQE_RESP_STS_LOCAL_LENGTH_ERR: + return IB_WC_LOC_LEN_ERR; + case RDMA_CQE_RESP_STS_LOCAL_QP_OPERATION_ERR: + return IB_WC_LOC_QP_OP_ERR; + case RDMA_CQE_RESP_STS_LOCAL_PROTECTION_ERR: + return IB_WC_LOC_PROT_ERR; + case RDMA_CQE_RESP_STS_MEMORY_MGT_OPERATION_ERR: + return IB_WC_MW_BIND_ERR; + case RDMA_CQE_RESP_STS_REMOTE_INVALID_REQUEST_ERR: + return IB_WC_REM_INV_RD_REQ_ERR; + case RDMA_CQE_RESP_STS_OK: + return IB_WC_SUCCESS; + default: + return IB_WC_GENERAL_ERR; + } +} + +static inline int qedr_set_ok_cqe_resp_wc(struct rdma_cqe_responder *resp, + struct ib_wc *wc) +{ + wc->status = IB_WC_SUCCESS; + wc->byte_len = le32_to_cpu(resp->length); + + if (resp->flags & QEDR_RESP_IMM) { + wc->ex.imm_data = cpu_to_be32(le32_to_cpu(resp->imm_data_or_inv_r_Key)); + wc->wc_flags |= IB_WC_WITH_IMM; + + if (resp->flags & QEDR_RESP_RDMA) + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + + if (resp->flags & QEDR_RESP_INV) + return -EINVAL; + + } else if (resp->flags & QEDR_RESP_INV) { + wc->ex.imm_data = le32_to_cpu(resp->imm_data_or_inv_r_Key); + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + + if (resp->flags & QEDR_RESP_RDMA) + return -EINVAL; + + } else if (resp->flags & QEDR_RESP_RDMA) { + return -EINVAL; + } + + return 0; +} + +static void __process_resp_one(struct qedr_dev *dev, struct qedr_qp *qp, + struct qedr_cq *cq, struct ib_wc *wc, + struct rdma_cqe_responder *resp, u64 wr_id) +{ + /* Must fill fields before qedr_set_ok_cqe_resp_wc() */ + wc->opcode = IB_WC_RECV; + wc->wc_flags = 0; + + if (likely(resp->status == RDMA_CQE_RESP_STS_OK)) { + if (qedr_set_ok_cqe_resp_wc(resp, wc)) + DP_ERR(dev, + "CQ %p (icid=%d) has invalid CQE responder flags=0x%x\n", + cq, cq->icid, resp->flags); + + } else { + wc->status = qedr_cqe_resp_status_to_ib(resp->status); + if (wc->status == IB_WC_GENERAL_ERR) + DP_ERR(dev, + "CQ %p (icid=%d) contains an invalid CQE status %d\n", + cq, cq->icid, resp->status); + } + + /* Fill the rest of the WC */ + wc->vendor_err = 0; + wc->src_qp = qp->id; + wc->qp = &qp->ibqp; + wc->wr_id = wr_id; +} + +static int process_resp_one(struct qedr_dev *dev, struct qedr_qp *qp, + struct qedr_cq *cq, struct ib_wc *wc, + struct rdma_cqe_responder *resp) +{ + u64 wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; + + __process_resp_one(dev, qp, cq, wc, resp, wr_id); + + while (qp->rqe_wr_id[qp->rq.cons].wqe_size--) + qed_chain_consume(&qp->rq.pbl); + qedr_inc_sw_cons(&qp->rq); + + return 1; +} + +static int process_resp_flush(struct qedr_qp *qp, struct qedr_cq *cq, + int num_entries, struct ib_wc *wc, u16 hw_cons) +{ + u16 cnt = 0; + + while (num_entries && qp->rq.wqe_cons != hw_cons) { + /* fill WC */ + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = 0; + wc->wc_flags = 0; + wc->src_qp = qp->id; + wc->byte_len = 0; + wc->wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; + wc->qp = &qp->ibqp; + num_entries--; + wc++; + cnt++; + while (qp->rqe_wr_id[qp->rq.cons].wqe_size--) + qed_chain_consume(&qp->rq.pbl); + qedr_inc_sw_cons(&qp->rq); + } + + return cnt; +} + +static void try_consume_resp_cqe(struct qedr_cq *cq, struct qedr_qp *qp, + struct rdma_cqe_responder *resp, int *update) +{ + if (le16_to_cpu(resp->rq_cons_or_srq_id) == qp->rq.wqe_cons) { + 
consume_cqe(cq); + *update |= 1; + } +} + +static int qedr_poll_cq_resp(struct qedr_dev *dev, struct qedr_qp *qp, + struct qedr_cq *cq, int num_entries, + struct ib_wc *wc, struct rdma_cqe_responder *resp, + int *update) +{ + int cnt; + + if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) { + cnt = process_resp_flush(qp, cq, num_entries, wc, + resp->rq_cons_or_srq_id); + try_consume_resp_cqe(cq, qp, resp, update); + } else { + cnt = process_resp_one(dev, qp, cq, wc, resp); + consume_cqe(cq); + *update |= 1; + } + + return cnt; +} + +static void try_consume_req_cqe(struct qedr_cq *cq, struct qedr_qp *qp, + struct rdma_cqe_requester *req, int *update) +{ + if (le16_to_cpu(req->sq_cons) == qp->sq.wqe_cons) { + consume_cqe(cq); + *update |= 1; + } +} + +int qedr_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct qedr_dev *dev = get_qedr_dev(ibcq->device); + struct qedr_cq *cq = get_qedr_cq(ibcq); + union rdma_cqe *cqe; + u32 old_cons, new_cons; + unsigned long flags; + int update = 0; + int done = 0; + + if (cq->destroyed) { + DP_ERR(dev, + "warning: poll was invoked after destroy for cq %p (icid=%d)\n", + cq, cq->icid); + return 0; + } + + if (cq->cq_type == QEDR_CQ_TYPE_GSI) + return qedr_gsi_poll_cq(ibcq, num_entries, wc); + + spin_lock_irqsave(&cq->cq_lock, flags); + cqe = cq->latest_cqe; + old_cons = qed_chain_get_cons_idx_u32(&cq->pbl); + while (num_entries && is_valid_cqe(cq, cqe)) { + struct qedr_qp *qp; + int cnt = 0; + + /* prevent speculative reads of any field of CQE */ + rmb(); + + qp = cqe_get_qp(cqe); + if (!qp) { + WARN(1, "Error: CQE QP pointer is NULL. CQE=%p\n", cqe); + break; + } + + wc->qp = &qp->ibqp; + + switch (cqe_get_type(cqe)) { + case RDMA_CQE_TYPE_REQUESTER: + cnt = qedr_poll_cq_req(dev, qp, cq, num_entries, wc, + &cqe->req); + try_consume_req_cqe(cq, qp, &cqe->req, &update); + break; + case RDMA_CQE_TYPE_RESPONDER_RQ: + cnt = qedr_poll_cq_resp(dev, qp, cq, num_entries, wc, + &cqe->resp, &update); + break; + case RDMA_CQE_TYPE_INVALID: + default: + DP_ERR(dev, "Error: invalid CQE type = %d\n", + cqe_get_type(cqe)); + } + num_entries -= cnt; + wc += cnt; + done += cnt; + + cqe = get_cqe(cq); + } + new_cons = qed_chain_get_cons_idx_u32(&cq->pbl); + + cq->cq_cons += new_cons - old_cons; + + if (update) + /* doorbell notifies abount latest VALID entry, + * but chain already point to the next INVALID one + */ + doorbell_cq(cq, cq->cq_cons - 1, cq->arm_flags); + + spin_unlock_irqrestore(&cq->cq_lock, flags); + return done; +} + +int qedr_process_mad(struct ib_device *ibdev, int process_mad_flags, + u8 port_num, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *mad_hdr, + size_t in_mad_size, struct ib_mad_hdr *out_mad, + size_t *out_mad_size, u16 *out_mad_pkey_index) +{ + struct qedr_dev *dev = get_qedr_dev(ibdev); + + DP_DEBUG(dev, QEDR_MSG_GSI, + "QEDR_PROCESS_MAD in_mad %x %x %x %x %x %x %x %x\n", + mad_hdr->attr_id, mad_hdr->base_version, mad_hdr->attr_mod, + mad_hdr->class_specific, mad_hdr->class_version, + mad_hdr->method, mad_hdr->mgmt_class, mad_hdr->status); + return IB_MAD_RESULT_SUCCESS; +} diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h new file mode 100644 index 0000000..2c57e4c --- /dev/null +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -0,0 +1,98 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __QEDR_VERBS_H__ +#define __QEDR_VERBS_H__ + +int qedr_query_device(struct ib_device *ibdev, + struct ib_device_attr *attr, struct ib_udata *udata); +int qedr_query_port(struct ib_device *, u8 port, struct ib_port_attr *props); +int qedr_modify_port(struct ib_device *, u8 port, int mask, + struct ib_port_modify *props); + +int qedr_iw_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid); + +int qedr_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); + +struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *); +int qedr_dealloc_ucontext(struct ib_ucontext *); + +int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma); +struct ib_pd *qedr_alloc_pd(struct ib_device *, + struct ib_ucontext *, struct ib_udata *); +int qedr_dealloc_pd(struct ib_pd *pd); + +struct ib_cq *qedr_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *ib_ctx, + struct ib_udata *udata); +int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); +int qedr_destroy_cq(struct ib_cq *); +int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +struct ib_qp *qedr_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs, + struct ib_udata *); +int qedr_modify_qp(struct ib_qp *, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int qedr_query_qp(struct ib_qp *, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *); +int qedr_destroy_qp(struct ib_qp *ibqp); + +struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr, + struct ib_udata *udata); +int qedr_destroy_ah(struct ib_ah *ibah); + +int qedr_dereg_mr(struct ib_mr *); +struct ib_mr *qedr_get_dma_mr(struct ib_pd *, int acc); + +struct ib_mr *qedr_reg_user_mr(struct ib_pd *, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *); + +int qedr_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset); + +struct ib_mr *qedr_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg); +int qedr_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc); +int qedr_post_send(struct ib_qp *, struct ib_send_wr *, + struct ib_send_wr **bad_wr); +int qedr_post_recv(struct ib_qp *, struct ib_recv_wr *, + struct ib_recv_wr 
**bad_wr); +int qedr_process_mad(struct ib_device *ibdev, int process_mad_flags, + u8 port_num, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, + size_t in_mad_size, struct ib_mad_hdr *out_mad, + size_t *out_mad_size, u16 *out_mad_pkey_index); + +int qedr_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable); +#endif diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig new file mode 100644 index 0000000..cb06314 --- /dev/null +++ b/drivers/infiniband/hw/qib/Kconfig @@ -0,0 +1,16 @@ +config INFINIBAND_QIB + tristate "Intel PCIe HCA support" + depends on 64BIT && INFINIBAND_RDMAVT + depends on PCI + ---help--- + This is a low-level driver for Intel PCIe QLE InfiniBand host + channel adapters. This driver does not support the Intel + HyperTransport card (model QHT7140). + +config INFINIBAND_QIB_DCA + bool "QIB DCA support" + depends on INFINIBAND_QIB && DCA && SMP && !(INFINIBAND_QIB=y && DCA=m) + default y + ---help--- + Setting this enables DCA support on some Intel chip sets + with the iba7322 HCA. diff --git a/drivers/infiniband/hw/qib/Makefile b/drivers/infiniband/hw/qib/Makefile new file mode 100644 index 0000000..80ffab8 --- /dev/null +++ b/drivers/infiniband/hw/qib/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_QIB) += ib_qib.o + +ib_qib-y := qib_diag.o qib_driver.o qib_eeprom.o \ + qib_file_ops.o qib_fs.o qib_init.o qib_intr.o \ + qib_mad.o qib_pcie.o qib_pio_copy.o \ + qib_qp.o qib_qsfp.o qib_rc.o qib_ruc.o qib_sdma.o \ + qib_sysfs.o qib_twsi.o qib_tx.o qib_uc.o qib_ud.o \ + qib_user_pages.o qib_user_sdma.o qib_iba7220.o \ + qib_sd7220.o qib_iba7322.o qib_verbs.o + +# 6120 has no fallback if no MSI interrupts, others can do INTx +ib_qib-$(CONFIG_PCI_MSI) += qib_iba6120.o + +ib_qib-$(CONFIG_X86_64) += qib_wc_x86_64.o +ib_qib-$(CONFIG_PPC64) += qib_wc_ppc64.o +ib_qib-$(CONFIG_DEBUG_FS) += qib_debugfs.o diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h new file mode 100644 index 0000000..4607245 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib.h @@ -0,0 +1,1525 @@ +#ifndef _QIB_KERNEL_H +#define _QIB_KERNEL_H +/* + * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This header file is the base header file for qlogic_ib kernel code + * qib_user.h serves a similar purpose for user code. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib_common.h" +#include "qib_verbs.h" + +/* only s/w major version of QLogic_IB we can handle */ +#define QIB_CHIP_VERS_MAJ 2U + +/* don't care about this except printing */ +#define QIB_CHIP_VERS_MIN 0U + +/* The Organization Unique Identifier (Mfg code), and its position in GUID */ +#define QIB_OUI 0x001175 +#define QIB_OUI_LSB 40 + +/* + * per driver stats, either not device nor port-specific, or + * summed over all of the devices and ports. + * They are described by name via ipathfs filesystem, so layout + * and number of elements can change without breaking compatibility. + * If members are added or deleted qib_statnames[] in qib_fs.c must + * change to match. + */ +struct qlogic_ib_stats { + __u64 sps_ints; /* number of interrupts handled */ + __u64 sps_errints; /* number of error interrupts */ + __u64 sps_txerrs; /* tx-related packet errors */ + __u64 sps_rcverrs; /* non-crc rcv packet errors */ + __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */ + __u64 sps_nopiobufs; /* no pio bufs avail from kernel */ + __u64 sps_ctxts; /* number of contexts currently open */ + __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */ + __u64 sps_buffull; + __u64 sps_hdrfull; +}; + +extern struct qlogic_ib_stats qib_stats; +extern const struct pci_error_handlers qib_pci_err_handler; + +#define QIB_CHIP_SWVERSION QIB_CHIP_VERS_MAJ +/* + * First-cut critierion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define QIB_TRAFFIC_ACTIVE_THRESHOLD (2000) + +/* + * Below contains all data related to a single context (formerly called port). + */ + +#ifdef CONFIG_DEBUG_FS +struct qib_opcode_stats_perctx; +#endif + +struct qib_ctxtdata { + void **rcvegrbuf; + dma_addr_t *rcvegrbuf_phys; + /* rcvhdrq base, needs mmap before useful */ + void *rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + void *rcvhdrtail_kvaddr; + /* + * temp buffer for expected send setup, allocated at open, instead + * of each setup call + */ + void *tid_pg_list; + /* + * Shared page for kernel to signal user processes that send buffers + * need disarming. The process should call QIB_CMD_DISARM_BUFS + * or QIB_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set. + */ + unsigned long *user_event_mask; + /* when waiting for rcv or pioavail */ + wait_queue_head_t wait; + /* + * rcvegr bufs base, physical, must fit + * in 44 bits so 32 bit programs mmap64 44 bit works) + */ + dma_addr_t rcvegr_phys; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t rcvhdrq_phys; + dma_addr_t rcvhdrqtailaddr_phys; + + /* + * number of opens (including slave sub-contexts) on this instance + * (ignoring forks, dup, etc. 
for now) + */ + int cnt; + /* + * how much space to leave at start of eager TID entries for + * protocol use, on each TID + */ + /* instead of calculating it */ + unsigned ctxt; + /* local node of context */ + int node_id; + /* non-zero if ctxt is being shared. */ + u16 subctxt_cnt; + /* non-zero if ctxt is being shared. */ + u16 subctxt_id; + /* number of eager TID entries. */ + u16 rcvegrcnt; + /* index of first eager TID entry. */ + u16 rcvegr_tid_base; + /* number of pio bufs for this ctxt (all procs, if shared) */ + u32 piocnt; + /* first pio buffer for this ctxt */ + u32 pio_base; + /* chip offset of PIO buffers for this ctxt */ + u32 piobufs; + /* how many alloc_pages() chunks in rcvegrbuf_pages */ + u32 rcvegrbuf_chunks; + /* how many egrbufs per chunk */ + u16 rcvegrbufs_perchunk; + /* ilog2 of above */ + u16 rcvegrbufs_perchunk_shift; + /* order for rcvegrbuf_pages */ + size_t rcvegrbuf_size; + /* rcvhdrq size (for freeing) */ + size_t rcvhdrq_size; + /* per-context flags for fileops/intr communication */ + unsigned long flag; + /* next expected TID to check when looking for free */ + u32 tidcursor; + /* WAIT_RCV that timed out, no interrupt */ + u32 rcvwait_to; + /* WAIT_PIO that timed out, no interrupt */ + u32 piowait_to; + /* WAIT_RCV already happened, no wait */ + u32 rcvnowait; + /* WAIT_PIO already happened, no wait */ + u32 pionowait; + /* total number of polled urgent packets */ + u32 urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 urgent_poll; + /* pid of process using this ctxt */ + pid_t pid; + pid_t subpid[QLOGIC_IB_MAX_SUBCTXT]; + /* same size as task_struct .comm[], command that opened context */ + char comm[16]; + /* pkeys set by this use of this ctxt */ + u16 pkeys[4]; + /* so file ops can get at unit */ + struct qib_devdata *dd; + /* so funcs that need physical port can get it easily */ + struct qib_pportdata *ppd; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subctxt_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subctxt_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subctxt_rcvhdr_base; + /* The version of the library which opened this ctxt */ + u32 userversion; + /* Bitmask of active slaves */ + u32 active_slaves; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* receive packet sequence counter */ + u8 seq_cnt; + u8 redirect_seq_cnt; + /* ctxt rcvhdrq head offset */ + u32 head; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; +#ifdef CONFIG_DEBUG_FS + /* verbs stats per CTX */ + struct qib_opcode_stats_perctx *opstats; +#endif +}; + +struct rvt_sge_state; + +struct qib_sdma_txreq { + int flags; + int sg_count; + dma_addr_t addr; + void (*callback)(struct qib_sdma_txreq *, int); + u16 start_idx; /* sdma private */ + u16 next_descq_idx; /* sdma private */ + struct list_head list; /* sdma private */ +}; + +struct qib_sdma_desc { + __le64 qw[2]; +}; + +struct qib_verbs_txreq { + struct qib_sdma_txreq txreq; + struct rvt_qp *qp; + struct rvt_swqe *wqe; + u32 dwords; + u16 hdr_dwords; + u16 hdr_inx; + struct qib_pio_header *align_buf; + struct rvt_mregion *mr; + struct rvt_sge_state *ss; +}; + +#define QIB_SDMA_TXREQ_F_USELARGEBUF 0x1 +#define QIB_SDMA_TXREQ_F_HEADTOHOST 0x2 +#define QIB_SDMA_TXREQ_F_INTREQ 0x4 +#define QIB_SDMA_TXREQ_F_FREEBUF 0x8 +#define QIB_SDMA_TXREQ_F_FREEDESC 0x10 + +#define QIB_SDMA_TXREQ_S_OK 0 +#define QIB_SDMA_TXREQ_S_SENDERROR 1 +#define 
QIB_SDMA_TXREQ_S_ABORTED 2 +#define QIB_SDMA_TXREQ_S_SHUTDOWN 3 + +/* + * Get/Set IB link-level config parameters for f_get/set_ib_cfg() + * Mostly for MADs that set or query link parameters, also ipath + * config interfaces + */ +#define QIB_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */ +#define QIB_IB_CFG_LWID_ENB 2 /* allowed Link-width */ +#define QIB_IB_CFG_LWID 3 /* currently active Link-width */ +#define QIB_IB_CFG_SPD_ENB 4 /* allowed Link speeds */ +#define QIB_IB_CFG_SPD 5 /* current Link spd */ +#define QIB_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */ +#define QIB_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */ +#define QIB_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */ +#define QIB_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */ +#define QIB_IB_CFG_OP_VLS 10 /* operational VLs */ +#define QIB_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */ +#define QIB_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */ +#define QIB_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */ +#define QIB_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */ +#define QIB_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */ +#define QIB_IB_CFG_PKEYS 16 /* update partition keys */ +#define QIB_IB_CFG_MTU 17 /* update MTU in IBC */ +#define QIB_IB_CFG_LSTATE 18 /* update linkcmd and linkinitcmd in IBC */ +#define QIB_IB_CFG_VL_HIGH_LIMIT 19 +#define QIB_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */ +#define QIB_IB_CFG_PORT 21 /* switch port we are connected to */ + +/* + * for CFG_LSTATE: LINKCMD in upper 16 bits, LINKINITCMD in lower 16 + * IB_LINKINITCMD_POLL and SLEEP are also used as set/get values for + * QIB_IB_CFG_LINKDEFAULT cmd + */ +#define IB_LINKCMD_DOWN (0 << 16) +#define IB_LINKCMD_ARMED (1 << 16) +#define IB_LINKCMD_ACTIVE (2 << 16) +#define IB_LINKINITCMD_NOP 0 +#define IB_LINKINITCMD_POLL 1 +#define IB_LINKINITCMD_SLEEP 2 +#define IB_LINKINITCMD_DISABLE 3 + +/* + * valid states passed to qib_set_linkstate() user call + */ +#define QIB_IB_LINKDOWN 0 +#define QIB_IB_LINKARM 1 +#define QIB_IB_LINKACTIVE 2 +#define QIB_IB_LINKDOWN_ONLY 3 +#define QIB_IB_LINKDOWN_SLEEP 4 +#define QIB_IB_LINKDOWN_DISABLE 5 + +/* + * These 7 values (SDR, DDR, and QDR may be ORed for auto-speed + * negotiation) are used for the 3rd argument to path_f_set_ib_cfg + * with cmd QIB_IB_CFG_SPD_ENB, by direct calls or via sysfs. They + * are also the the possible values for qib_link_speed_enabled and active + * The values were chosen to match values used within the IB spec. + */ +#define QIB_IB_SDR 1 +#define QIB_IB_DDR 2 +#define QIB_IB_QDR 4 + +#define QIB_DEFAULT_MTU 4096 + +/* max number of IB ports supported per HCA */ +#define QIB_MAX_IB_PORTS 2 + +/* + * Possible IB config parameters for f_get/set_ib_table() + */ +#define QIB_IB_TBL_VL_HIGH_ARB 1 /* Get/set VL high priority weights */ +#define QIB_IB_TBL_VL_LOW_ARB 2 /* Get/set VL low priority weights */ + +/* + * Possible "operations" for f_rcvctrl(ppd, op, ctxt) + * these are bits so they can be combined, e.g. 
+ * QIB_RCVCTRL_INTRAVAIL_ENB | QIB_RCVCTRL_CTXT_ENB + */ +#define QIB_RCVCTRL_TAILUPD_ENB 0x01 +#define QIB_RCVCTRL_TAILUPD_DIS 0x02 +#define QIB_RCVCTRL_CTXT_ENB 0x04 +#define QIB_RCVCTRL_CTXT_DIS 0x08 +#define QIB_RCVCTRL_INTRAVAIL_ENB 0x10 +#define QIB_RCVCTRL_INTRAVAIL_DIS 0x20 +#define QIB_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */ +#define QIB_RCVCTRL_PKEY_DIS 0x80 +#define QIB_RCVCTRL_BP_ENB 0x0100 +#define QIB_RCVCTRL_BP_DIS 0x0200 +#define QIB_RCVCTRL_TIDFLOW_ENB 0x0400 +#define QIB_RCVCTRL_TIDFLOW_DIS 0x0800 + +/* + * Possible "operations" for f_sendctrl(ppd, op, var) + * these are bits so they can be combined, e.g. + * QIB_SENDCTRL_BUFAVAIL_ENB | QIB_SENDCTRL_ENB + * Some operations (e.g. DISARM, ABORT) are known to + * be "one-shot", so do not modify shadow. + */ +#define QIB_SENDCTRL_DISARM (0x1000) +#define QIB_SENDCTRL_DISARM_BUF(bufn) ((bufn) | QIB_SENDCTRL_DISARM) + /* available (0x2000) */ +#define QIB_SENDCTRL_AVAIL_DIS (0x4000) +#define QIB_SENDCTRL_AVAIL_ENB (0x8000) +#define QIB_SENDCTRL_AVAIL_BLIP (0x10000) +#define QIB_SENDCTRL_SEND_DIS (0x20000) +#define QIB_SENDCTRL_SEND_ENB (0x40000) +#define QIB_SENDCTRL_FLUSH (0x80000) +#define QIB_SENDCTRL_CLEAR (0x100000) +#define QIB_SENDCTRL_DISARM_ALL (0x200000) + +/* + * These are the generic indices for requesting per-port + * counter values via the f_portcntr function. They + * are always returned as 64 bit values, although most + * are 32 bit counters. + */ +/* send-related counters */ +#define QIBPORTCNTR_PKTSEND 0U +#define QIBPORTCNTR_WORDSEND 1U +#define QIBPORTCNTR_PSXMITDATA 2U +#define QIBPORTCNTR_PSXMITPKTS 3U +#define QIBPORTCNTR_PSXMITWAIT 4U +#define QIBPORTCNTR_SENDSTALL 5U +/* receive-related counters */ +#define QIBPORTCNTR_PKTRCV 6U +#define QIBPORTCNTR_PSRCVDATA 7U +#define QIBPORTCNTR_PSRCVPKTS 8U +#define QIBPORTCNTR_RCVEBP 9U +#define QIBPORTCNTR_RCVOVFL 10U +#define QIBPORTCNTR_WORDRCV 11U +/* IB link related error counters */ +#define QIBPORTCNTR_RXLOCALPHYERR 12U +#define QIBPORTCNTR_RXVLERR 13U +#define QIBPORTCNTR_ERRICRC 14U +#define QIBPORTCNTR_ERRVCRC 15U +#define QIBPORTCNTR_ERRLPCRC 16U +#define QIBPORTCNTR_BADFORMAT 17U +#define QIBPORTCNTR_ERR_RLEN 18U +#define QIBPORTCNTR_IBSYMBOLERR 19U +#define QIBPORTCNTR_INVALIDRLEN 20U +#define QIBPORTCNTR_UNSUPVL 21U +#define QIBPORTCNTR_EXCESSBUFOVFL 22U +#define QIBPORTCNTR_ERRLINK 23U +#define QIBPORTCNTR_IBLINKDOWN 24U +#define QIBPORTCNTR_IBLINKERRRECOV 25U +#define QIBPORTCNTR_LLI 26U +/* other error counters */ +#define QIBPORTCNTR_RXDROPPKT 27U +#define QIBPORTCNTR_VL15PKTDROP 28U +#define QIBPORTCNTR_ERRPKEY 29U +#define QIBPORTCNTR_KHDROVFL 30U +/* sampling counters (these are actually control registers) */ +#define QIBPORTCNTR_PSINTERVAL 31U +#define QIBPORTCNTR_PSSTART 32U +#define QIBPORTCNTR_PSSTAT 33U + +/* how often we check for packet activity for "power on hours (in seconds) */ +#define ACTIVITY_TIMER 5 + +#define MAX_NAME_SIZE 64 + +#ifdef CONFIG_INFINIBAND_QIB_DCA +struct qib_irq_notify; +#endif + +struct qib_msix_entry { + void *arg; +#ifdef CONFIG_INFINIBAND_QIB_DCA + int dca; + int rcv; + struct qib_irq_notify *notifier; +#endif + cpumask_var_t mask; +}; + +/* Below is an opaque struct. Each chip (device) can maintain + * private data needed for its operation, but not germane to the + * rest of the driver. 
For convenience, we define another that + * is chip-specific, per-port + */ +struct qib_chip_specific; +struct qib_chipport_specific; + +enum qib_sdma_states { + qib_sdma_state_s00_hw_down, + qib_sdma_state_s10_hw_start_up_wait, + qib_sdma_state_s20_idle, + qib_sdma_state_s30_sw_clean_up_wait, + qib_sdma_state_s40_hw_clean_up_wait, + qib_sdma_state_s50_hw_halt_wait, + qib_sdma_state_s99_running, +}; + +enum qib_sdma_events { + qib_sdma_event_e00_go_hw_down, + qib_sdma_event_e10_go_hw_start, + qib_sdma_event_e20_hw_started, + qib_sdma_event_e30_go_running, + qib_sdma_event_e40_sw_cleaned, + qib_sdma_event_e50_hw_cleaned, + qib_sdma_event_e60_hw_halted, + qib_sdma_event_e70_go_idle, + qib_sdma_event_e7220_err_halted, + qib_sdma_event_e7322_err_halted, + qib_sdma_event_e90_timer_tick, +}; + +struct sdma_set_state_action { + unsigned op_enable:1; + unsigned op_intenable:1; + unsigned op_halt:1; + unsigned op_drain:1; + unsigned go_s99_running_tofalse:1; + unsigned go_s99_running_totrue:1; +}; + +struct qib_sdma_state { + struct kref kref; + struct completion comp; + enum qib_sdma_states current_state; + struct sdma_set_state_action *set_state_action; + unsigned current_op; + unsigned go_s99_running; + unsigned first_sendbuf; + unsigned last_sendbuf; /* really last +1 */ + /* debugging/devel */ + enum qib_sdma_states previous_state; + unsigned previous_op; + enum qib_sdma_events last_event; +}; + +struct xmit_wait { + struct timer_list timer; + u64 counter; + u8 flags; + struct cache { + u64 psxmitdata; + u64 psrcvdata; + u64 psxmitpkts; + u64 psrcvpkts; + u64 psxmitwait; + } counter_cache; +}; + +/* + * The structure below encapsulates data relevant to a physical IB Port. + * Current chips support only one such port, but the separation + * clarifies things a bit. Note that to conform to IB conventions, + * port-numbers are one-based. The first or only port is port1. + */ +struct qib_pportdata { + struct qib_ibport ibport_data; + + struct qib_devdata *dd; + struct qib_chippport_specific *cpspec; /* chip-specific per-port */ + struct kobject pport_kobj; + struct kobject pport_cc_kobj; + struct kobject sl2vl_kobj; + struct kobject diagc_kobj; + + /* GUID for this interface, in network order */ + __be64 guid; + + /* QIB_POLL, etc. link-state specific flags, per port */ + u32 lflags; + /* qib_lflags driver is waiting for */ + u32 state_wanted; + spinlock_t lflags_lock; + + /* ref count for each pkey */ + atomic_t pkeyrefs[4]; + + /* + * this address is mapped readonly into user processes so they can + * get status cheaply, whenever they want. One qword of status per port + */ + u64 *statusp; + + /* SendDMA related entries */ + + /* read mostly */ + struct qib_sdma_desc *sdma_descq; + struct workqueue_struct *qib_wq; + struct qib_sdma_state sdma_state; + dma_addr_t sdma_descq_phys; + volatile __le64 *sdma_head_dma; /* DMA'ed by chip */ + dma_addr_t sdma_head_phys; + u16 sdma_descq_cnt; + + /* read/write using lock */ + spinlock_t sdma_lock ____cacheline_aligned_in_smp; + struct list_head sdma_activelist; + struct list_head sdma_userpending; + u64 sdma_descq_added; + u64 sdma_descq_removed; + u16 sdma_descq_tail; + u16 sdma_descq_head; + u8 sdma_generation; + u8 sdma_intrequest; + + struct tasklet_struct sdma_sw_clean_up_task + ____cacheline_aligned_in_smp; + + wait_queue_head_t state_wait; /* for state_wanted */ + + /* HoL blocking for SMP replies */ + unsigned hol_state; + struct timer_list hol_timer; + + /* + * Shadow copies of registers; size indicates read access size. 
+ * Most of them are readonly, but some are write-only register, + * where we manipulate the bits in the shadow copy, and then write + * the shadow copy to qlogic_ib. + * + * We deliberately make most of these 32 bits, since they have + * restricted range. For any that we read, we won't to generate 32 + * bit accesses, since Opteron will generate 2 separate 32 bit HT + * transactions for a 64 bit read, and we want to avoid unnecessary + * bus transactions. + */ + + /* This is the 64 bit group */ + /* last ibcstatus. opaque outside chip-specific code */ + u64 lastibcstat; + + /* these are the "32 bit" regs */ + + /* + * the following two are 32-bit bitmasks, but {test,clear,set}_bit + * all expect bit fields to be "unsigned long" + */ + unsigned long p_rcvctrl; /* shadow per-port rcvctrl */ + unsigned long p_sendctrl; /* shadow per-port sendctrl */ + + u32 ibmtu; /* The MTU programmed for this unit */ + /* + * Current max size IB packet (in bytes) including IB headers, that + * we can send. Changes when ibmtu changes. + */ + u32 ibmaxlen; + /* + * ibmaxlen at init time, limited by chip and by receive buffer + * size. Not changed after init. + */ + u32 init_ibmaxlen; + /* LID programmed for this instance */ + u16 lid; + /* list of pkeys programmed; 0 if not set */ + u16 pkeys[4]; + /* LID mask control */ + u8 lmc; + u8 link_width_supported; + u8 link_speed_supported; + u8 link_width_enabled; + u8 link_speed_enabled; + u8 link_width_active; + u8 link_speed_active; + u8 vls_supported; + u8 vls_operational; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 rx_pol_inv; + + u8 hw_pidx; /* physical port index */ + u8 port; /* IB port number and index into dd->pports - 1 */ + + u8 delay_mult; + + /* used to override LED behavior */ + u8 led_override; /* Substituted for normal value, if non-zero */ + u16 led_override_timeoff; /* delta to next timer event */ + u8 led_override_vals[2]; /* Alternates per blink-frame */ + u8 led_override_phase; /* Just counts, LSB picks from vals[] */ + atomic_t led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list led_override_timer; + struct xmit_wait cong_stats; + struct timer_list symerr_clear_timer; + + /* Synchronize access between driver writes and sysfs reads */ + spinlock_t cc_shadow_lock + ____cacheline_aligned_in_smp; + + /* Shadow copy of the congestion control table */ + struct cc_table_shadow *ccti_entries_shadow; + + /* Shadow copy of the congestion control entries */ + struct ib_cc_congestion_setting_attr_shadow *congestion_entries_shadow; + + /* List of congestion control table entries */ + struct ib_cc_table_entry_shadow *ccti_entries; + + /* 16 congestion entries with each entry corresponding to a SL */ + struct ib_cc_congestion_entry_shadow *congestion_entries; + + /* Maximum number of congestion control entries that the agent expects + * the manager to send. + */ + u16 cc_supported_table_entries; + + /* Total number of congestion control table entries */ + u16 total_cct_entry; + + /* Bit map identifying service level */ + u16 cc_sl_control_map; + + /* maximum congestion control table index */ + u16 ccti_limit; + + /* CA's max number of 64 entry units in the congestion control table */ + u8 cc_max_table_entries; +}; + +/* Observers. Not to be taken lightly, possibly not to ship. */ +/* + * If a diag read or write is to (bottom <= offset <= top), + * the "hoook" is called, allowing, e.g. shadows to be + * updated in sync with the driver. struct diag_observer + * is the "visible" part. 
+ */ +struct diag_observer; + +typedef int (*diag_hook) (struct qib_devdata *dd, + const struct diag_observer *op, + u32 offs, u64 *data, u64 mask, int only_32); + +struct diag_observer { + diag_hook hook; + u32 bottom; + u32 top; +}; + +extern int qib_register_observer(struct qib_devdata *dd, + const struct diag_observer *op); + +/* Only declared here, not defined. Private to diags */ +struct diag_observer_list_elt; + +/* device data struct now contains only "general per-device" info. + * fields related to a physical IB port are in a qib_pportdata struct, + * described above) while fields only used by a particular chip-type are in + * a qib_chipdata struct, whose contents are opaque to this file. + */ +struct qib_devdata { + struct qib_ibdev verbs_dev; /* must be first */ + struct list_head list; + /* pointers to related structs for this device */ + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev *user_cdev; + struct cdev *diag_cdev; + struct device *user_device; + struct device *diag_device; + + /* mem-mapped pointer to base of chip regs */ + u64 __iomem *kregbase; + /* end of mem-mapped chip space excluding sendbuf and user regs */ + u64 __iomem *kregend; + /* physical address of chip for io_remap, etc. */ + resource_size_t physaddr; + /* qib_cfgctxts pointers */ + struct qib_ctxtdata **rcd; /* Receive Context Data */ + + /* qib_pportdata, points to array of (physical) port-specific + * data structs, indexed by pidx (0..n-1) + */ + struct qib_pportdata *pport; + struct qib_chip_specific *cspec; /* chip-specific */ + + /* kvirt address of 1st 2k pio buffer */ + void __iomem *pio2kbase; + /* kvirt address of 1st 4k pio buffer */ + void __iomem *pio4kbase; + /* mem-mapped pointer to base of PIO buffers (if using WC PAT) */ + void __iomem *piobase; + /* mem-mapped pointer to base of user chip regs (if using WC PAT) */ + u64 __iomem *userbase; + void __iomem *piovl15base; /* base of VL15 buffers, if not WC */ + /* + * points to area where PIOavail registers will be DMA'ed. + * Has to be on a page of it's own, because the page will be + * mapped into user program space. This copy is *ONLY* ever + * written by DMA, not by the driver! Need a copy per device + * when we get to multiple devices + */ + volatile __le64 *pioavailregs_dma; /* DMA'ed by chip */ + /* physical address where updates occur */ + dma_addr_t pioavailregs_phys; + + /* device-specific implementations of functions needed by + * common code. Contrary to previous consensus, we can't + * really just point to a device-specific table, because we + * may need to "bend", e.g. 
*_f_put_tid + */ + /* fallback to alternate interrupt type if possible */ + int (*f_intr_fallback)(struct qib_devdata *); + /* hard reset chip */ + int (*f_reset)(struct qib_devdata *); + void (*f_quiet_serdes)(struct qib_pportdata *); + int (*f_bringup_serdes)(struct qib_pportdata *); + int (*f_early_init)(struct qib_devdata *); + void (*f_clear_tids)(struct qib_devdata *, struct qib_ctxtdata *); + void (*f_put_tid)(struct qib_devdata *, u64 __iomem*, + u32, unsigned long); + void (*f_cleanup)(struct qib_devdata *); + void (*f_setextled)(struct qib_pportdata *, u32); + /* fill out chip-specific fields */ + int (*f_get_base_info)(struct qib_ctxtdata *, struct qib_base_info *); + /* free irq */ + void (*f_free_irq)(struct qib_devdata *); + struct qib_message_header *(*f_get_msgheader) + (struct qib_devdata *, __le32 *); + void (*f_config_ctxts)(struct qib_devdata *); + int (*f_get_ib_cfg)(struct qib_pportdata *, int); + int (*f_set_ib_cfg)(struct qib_pportdata *, int, u32); + int (*f_set_ib_loopback)(struct qib_pportdata *, const char *); + int (*f_get_ib_table)(struct qib_pportdata *, int, void *); + int (*f_set_ib_table)(struct qib_pportdata *, int, void *); + u32 (*f_iblink_state)(u64); + u8 (*f_ibphys_portstate)(u64); + void (*f_xgxs_reset)(struct qib_pportdata *); + /* per chip actions needed for IB Link up/down changes */ + int (*f_ib_updown)(struct qib_pportdata *, int, u64); + u32 __iomem *(*f_getsendbuf)(struct qib_pportdata *, u64, u32 *); + /* Read/modify/write of GPIO pins (potentially chip-specific */ + int (*f_gpio_mod)(struct qib_devdata *dd, u32 out, u32 dir, + u32 mask); + /* Enable writes to config EEPROM (if supported) */ + int (*f_eeprom_wen)(struct qib_devdata *dd, int wen); + /* + * modify rcvctrl shadow[s] and write to appropriate chip-regs. + * see above QIB_RCVCTRL_xxx_ENB/DIS for operations. + * (ctxt == -1) means "all contexts", only meaningful for + * clearing. Could remove if chip_spec shutdown properly done. + */ + void (*f_rcvctrl)(struct qib_pportdata *, unsigned int op, + int ctxt); + /* Read/modify/write sendctrl appropriately for op and port. 
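+ * For example, a caller might quiesce and then re-enable PIO sends
+ * with something like the following (QIB_SENDCTRL_SEND_DIS/_ENB are
+ * assumed here by analogy with the QIB_RCVCTRL_xxx_ENB/DIS ops above;
+ * they are not part of this excerpt):
+ *
+ *	dd->f_sendctrl(ppd, QIB_SENDCTRL_SEND_DIS);
+ *	... reprogram whatever needs the send side idle ...
+ *	dd->f_sendctrl(ppd, QIB_SENDCTRL_SEND_ENB);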
*/ + void (*f_sendctrl)(struct qib_pportdata *, u32 op); + void (*f_set_intr_state)(struct qib_devdata *, u32); + void (*f_set_armlaunch)(struct qib_devdata *, u32); + void (*f_wantpiobuf_intr)(struct qib_devdata *, u32); + int (*f_late_initreg)(struct qib_devdata *); + int (*f_init_sdma_regs)(struct qib_pportdata *); + u16 (*f_sdma_gethead)(struct qib_pportdata *); + int (*f_sdma_busy)(struct qib_pportdata *); + void (*f_sdma_update_tail)(struct qib_pportdata *, u16); + void (*f_sdma_set_desc_cnt)(struct qib_pportdata *, unsigned); + void (*f_sdma_sendctrl)(struct qib_pportdata *, unsigned); + void (*f_sdma_hw_clean_up)(struct qib_pportdata *); + void (*f_sdma_hw_start_up)(struct qib_pportdata *); + void (*f_sdma_init_early)(struct qib_pportdata *); + void (*f_set_cntr_sample)(struct qib_pportdata *, u32, u32); + void (*f_update_usrhead)(struct qib_ctxtdata *, u64, u32, u32, u32); + u32 (*f_hdrqempty)(struct qib_ctxtdata *); + u64 (*f_portcntr)(struct qib_pportdata *, u32); + u32 (*f_read_cntrs)(struct qib_devdata *, loff_t, char **, + u64 **); + u32 (*f_read_portcntrs)(struct qib_devdata *, loff_t, u32, + char **, u64 **); + u32 (*f_setpbc_control)(struct qib_pportdata *, u32, u8, u8); + void (*f_initvl15_bufs)(struct qib_devdata *); + void (*f_init_ctxt)(struct qib_ctxtdata *); + void (*f_txchk_change)(struct qib_devdata *, u32, u32, u32, + struct qib_ctxtdata *); + void (*f_writescratch)(struct qib_devdata *, u32); + int (*f_tempsense_rd)(struct qib_devdata *, int regnum); +#ifdef CONFIG_INFINIBAND_QIB_DCA + int (*f_notify_dca)(struct qib_devdata *, unsigned long event); +#endif + + char *boardname; /* human readable board info */ + + /* template for writing TIDs */ + u64 tidtemplate; + /* value to write to free TIDs */ + u64 tidinvalid; + + /* number of registers used for pioavail */ + u32 pioavregs; + /* device (not port) flags, basically device capabilities */ + u32 flags; + /* last buffer for user use */ + u32 lastctxt_piobuf; + + /* reset value */ + u64 z_int_counter; + /* percpu intcounter */ + u64 __percpu *int_counter; + + /* pio bufs allocated per ctxt */ + u32 pbufsctxt; + /* if remainder on bufs/ctxt, ctxts < extrabuf get 1 extra */ + u32 ctxts_extrabuf; + /* + * number of ctxts configured as max; zero is set to number chip + * supports, less gives more pio bufs/ctxt, etc. 
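+ *
+ * As a made-up example of the pbufsctxt/ctxts_extrabuf split above:
+ * with 67 PIO buffers left for user contexts and cfgctxts == 16,
+ * pbufsctxt is 4 and ctxts_extrabuf is 3, so contexts 0..2 get 5
+ * buffers each and contexts 3..15 get 4 (3 * 5 + 13 * 4 == 67).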
+	 */
+	u32 cfgctxts;
+	/*
+	 * number of ctxts available for PSM open
+	 */
+	u32 freectxts;
+
+	/*
+	 * hint that we should update pioavailshadow before
+	 * looking for a PIO buffer
+	 */
+	u32 upd_pio_shadow;
+
+	/* internal debugging stats */
+	u32 maxpkts_call;
+	u32 avgpkts_call;
+	u64 nopiobufs;
+
+	/* PCI Vendor ID (here for NodeInfo) */
+	u16 vendorid;
+	/* PCI Device ID (here for NodeInfo) */
+	u16 deviceid;
+	/* for write combining settings */
+	int wc_cookie;
+	unsigned long wc_base;
+	unsigned long wc_len;
+
+	/* shadow copy of struct page *'s for exp tid pages */
+	struct page **pageshadow;
+	/* shadow copy of dma handles for exp tid pages */
+	dma_addr_t *physshadow;
+	u64 __iomem *egrtidbase;
+	spinlock_t sendctrl_lock; /* protect changes to sendctrl shadow */
+	/* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
+	spinlock_t uctxt_lock; /* rcd and user context changes */
+	/*
+	 * per unit status, see also portdata statusp
+	 * mapped readonly into user processes so they can get unit and
+	 * IB link status cheaply
+	 */
+	u64 *devstatusp;
+	char *freezemsg; /* freeze msg if hw error put chip in freeze */
+	u32 freezelen; /* max length of freezemsg */
+	/* timer used to prevent stats overflow, error throttling, etc. */
+	struct timer_list stats_timer;
+
+	/* timer to verify interrupts work, and fallback if possible */
+	struct timer_list intrchk_timer;
+	unsigned long ureg_align; /* user register alignment */
+
+	/*
+	 * Protects pioavailshadow, pioavailkernel, pio_need_disarm, and
+	 * pio_writing.
+	 */
+	spinlock_t pioavail_lock;
+	/*
+	 * index of last buffer to optimize search for next
+	 */
+	u32 last_pio;
+	/*
+	 * min kernel pio buffer to optimize search
+	 */
+	u32 min_kernel_pio;
+	/*
+	 * Shadow copies of registers; size indicates read access size.
+	 * Most of them are readonly, but some are write-only registers,
+	 * where we manipulate the bits in the shadow copy, and then write
+	 * the shadow copy to qlogic_ib.
+	 *
+	 * We deliberately make most of these 32 bits, since they have
+	 * restricted range. For any that we read, we want to generate 32
+	 * bit accesses, since Opteron will generate 2 separate 32 bit HT
+	 * transactions for a 64 bit read, and we want to avoid unnecessary
+	 * bus transactions.
+	 */
+
+	/* This is the 64 bit group */
+
+	unsigned long pioavailshadow[6];
+	/* bitmap of send buffers available for the kernel to use with PIO. */
+	unsigned long pioavailkernel[6];
+	/* bitmap of send buffers which need to be disarmed. */
+	unsigned long pio_need_disarm[3];
+	/* bitmap of send buffers which are being written to.
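+	 * A locking sketch (buf and flags are illustrative locals; see
+	 * pioavail_lock above, which protects this bitmap):
+	 *
+	 *	spin_lock_irqsave(&dd->pioavail_lock, flags);
+	 *	__set_bit(buf, dd->pio_writing);
+	 *	spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+	 *	... copy the packet into the PIO buffer ...
+	 *	spin_lock_irqsave(&dd->pioavail_lock, flags);
+	 *	__clear_bit(buf, dd->pio_writing);
+	 *	spin_unlock_irqrestore(&dd->pioavail_lock, flags);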
*/ + unsigned long pio_writing[3]; + /* kr_revision shadow */ + u64 revision; + /* Base GUID for device (from eeprom, network order) */ + __be64 base_guid; + + /* + * kr_sendpiobufbase value (chip offset of pio buffers), and the + * base of the 2KB buffer s(user processes only use 2K) + */ + u64 piobufbase; + u32 pio2k_bufbase; + + /* these are the "32 bit" regs */ + + /* number of GUIDs in the flash for this interface */ + u32 nguid; + /* + * the following two are 32-bit bitmasks, but {test,clear,set}_bit + * all expect bit fields to be "unsigned long" + */ + unsigned long rcvctrl; /* shadow per device rcvctrl */ + unsigned long sendctrl; /* shadow per device sendctrl */ + + /* value we put in kr_rcvhdrcnt */ + u32 rcvhdrcnt; + /* value we put in kr_rcvhdrsize */ + u32 rcvhdrsize; + /* value we put in kr_rcvhdrentsize */ + u32 rcvhdrentsize; + /* kr_ctxtcnt value */ + u32 ctxtcnt; + /* kr_pagealign value */ + u32 palign; + /* number of "2KB" PIO buffers */ + u32 piobcnt2k; + /* size in bytes of "2KB" PIO buffers */ + u32 piosize2k; + /* max usable size in dwords of a "2KB" PIO buffer before going "4KB" */ + u32 piosize2kmax_dwords; + /* number of "4KB" PIO buffers */ + u32 piobcnt4k; + /* size in bytes of "4KB" PIO buffers */ + u32 piosize4k; + /* kr_rcvegrbase value */ + u32 rcvegrbase; + /* kr_rcvtidbase value */ + u32 rcvtidbase; + /* kr_rcvtidcnt value */ + u32 rcvtidcnt; + /* kr_userregbase */ + u32 uregbase; + /* shadow the control register contents */ + u32 control; + + /* chip address space used by 4k pio buffers */ + u32 align4k; + /* size of each rcvegrbuffer */ + u16 rcvegrbufsize; + /* log2 of above */ + u16 rcvegrbufsize_shift; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 lbus_width; + /* localbus speed in MHz */ + u32 lbus_speed; + int unit; /* unit # of this chip */ + + /* start of CHIP_SPEC move to chipspec, but need code changes */ + /* low and high portions of MSI capability/vector */ + u32 msi_lo; + /* saved after PCIe init for restore after reset */ + u32 msi_hi; + /* MSI data (vector) saved for restore */ + u16 msi_data; + /* so we can rewrite it after a chip reset */ + u32 pcibar0; + /* so we can rewrite it after a chip reset */ + u32 pcibar1; + u64 rhdrhead_intr_off; + + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer QLogic serial number format + */ + u8 serial[16]; + /* human readable board version */ + u8 boardversion[96]; + u8 lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from qib_revision */ + u8 majrev; + /* chip minor rev, from qib_revision */ + u8 minrev; + + /* Misc small ints */ + /* Number of physical ports available */ + u8 num_pports; + /* Lowest context number which can be used by user processes */ + u8 first_user_ctxt; + u8 n_krcv_queues; + u8 qpn_mask; + u8 skip_kctxt_mask; + + u16 rhf_offset; /* offset of RHF within receive header entry */ + + /* + * GPIO pins for twsi-connected devices, and device code for eeprom + */ + u8 gpio_sda_num; + u8 gpio_scl_num; + u8 twsi_eeprom_dev; + u8 board_atten; + + /* Support (including locks) for EEPROM logging of errors and time */ + /* control access to actual counters, timer */ + spinlock_t eep_st_lock; + /* control high-level access to EEPROM */ + struct mutex eep_lock; + uint64_t traffic_wds; + struct qib_diag_client *diag_client; + spinlock_t qib_diag_trans_lock; /* protect diag observer ops */ + struct diag_observer_list_elt *diag_observer_list; + + u8 psxmitwait_supported; + /* cycle length of PS* counters in HW 
(in picoseconds) */ + u16 psxmitwait_check_rate; + /* high volume overflow errors defered to tasklet */ + struct tasklet_struct error_tasklet; + + int assigned_node_id; /* NUMA node closest to HCA */ +}; + +/* hol_state values */ +#define QIB_HOL_UP 0 +#define QIB_HOL_INIT 1 + +#define QIB_SDMA_SENDCTRL_OP_ENABLE (1U << 0) +#define QIB_SDMA_SENDCTRL_OP_INTENABLE (1U << 1) +#define QIB_SDMA_SENDCTRL_OP_HALT (1U << 2) +#define QIB_SDMA_SENDCTRL_OP_CLEANUP (1U << 3) +#define QIB_SDMA_SENDCTRL_OP_DRAIN (1U << 4) + +/* operation types for f_txchk_change() */ +#define TXCHK_CHG_TYPE_DIS1 3 +#define TXCHK_CHG_TYPE_ENAB1 2 +#define TXCHK_CHG_TYPE_KERN 1 +#define TXCHK_CHG_TYPE_USER 0 + +#define QIB_CHASE_TIME msecs_to_jiffies(145) +#define QIB_CHASE_DIS_TIME msecs_to_jiffies(160) + +/* Private data for file operations */ +struct qib_filedata { + struct qib_ctxtdata *rcd; + unsigned subctxt; + unsigned tidcursor; + struct qib_user_sdma_queue *pq; + int rec_cpu_num; /* for cpu affinity; -1 if none */ +}; + +extern struct list_head qib_dev_list; +extern spinlock_t qib_devs_lock; +extern struct qib_devdata *qib_lookup(int unit); +extern u32 qib_cpulist_count; +extern unsigned long *qib_cpulist; +extern unsigned qib_cc_table_size; + +int qib_init(struct qib_devdata *, int); +int init_chip_wc_pat(struct qib_devdata *dd, u32); +int qib_enable_wc(struct qib_devdata *dd); +void qib_disable_wc(struct qib_devdata *dd); +int qib_count_units(int *npresentp, int *nupp); +int qib_count_active_units(void); + +int qib_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev **cdevp, struct device **devp); +void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp); +int qib_dev_init(void); +void qib_dev_cleanup(void); + +int qib_diag_add(struct qib_devdata *); +void qib_diag_remove(struct qib_devdata *); +void qib_handle_e_ibstatuschanged(struct qib_pportdata *, u64); +void qib_sdma_update_tail(struct qib_pportdata *, u16); /* hold sdma_lock */ + +int qib_decode_err(struct qib_devdata *dd, char *buf, size_t blen, u64 err); +void qib_bad_intrstatus(struct qib_devdata *); +void qib_handle_urcv(struct qib_devdata *, u64); + +/* clean up any per-chip chip-specific stuff */ +void qib_chip_cleanup(struct qib_devdata *); +/* clean up any chip type-specific stuff */ +void qib_chip_done(void); + +/* check to see if we have to force ordering for write combining */ +int qib_unordered_wc(void); +void qib_pio_copy(void __iomem *to, const void *from, size_t count); + +void qib_disarm_piobufs(struct qib_devdata *, unsigned, unsigned); +int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *); +void qib_disarm_piobufs_set(struct qib_devdata *, unsigned long *, unsigned); +void qib_cancel_sends(struct qib_pportdata *); + +int qib_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *); +int qib_setup_eagerbufs(struct qib_ctxtdata *); +void qib_set_ctxtcnt(struct qib_devdata *); +int qib_create_ctxts(struct qib_devdata *dd); +struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int); +int qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8); +void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *); + +u32 qib_kreceive(struct qib_ctxtdata *, u32 *, u32 *); +int qib_reset_device(int); +int qib_wait_linkstate(struct qib_pportdata *, u32, int); +int qib_set_linkstate(struct qib_pportdata *, u8); +int qib_set_mtu(struct qib_pportdata *, u16); +int qib_set_lid(struct qib_pportdata *, u32, u8); +void qib_hol_down(struct qib_pportdata *); +void 
qib_hol_init(struct qib_pportdata *); +void qib_hol_up(struct qib_pportdata *); +void qib_hol_event(struct timer_list *); +void qib_disable_after_error(struct qib_devdata *); +int qib_set_uevent_bits(struct qib_pportdata *, const int); + +/* for use in system calls, where we want to know device type, etc. */ +#define ctxt_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->rcd) +#define subctxt_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->subctxt) +#define tidcursor_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->tidcursor) +#define user_sdma_queue_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->pq) + +static inline struct qib_devdata *dd_from_ppd(struct qib_pportdata *ppd) +{ + return ppd->dd; +} + +static inline struct qib_devdata *dd_from_dev(struct qib_ibdev *dev) +{ + return container_of(dev, struct qib_devdata, verbs_dev); +} + +static inline struct qib_devdata *dd_from_ibdev(struct ib_device *ibdev) +{ + return dd_from_dev(to_idev(ibdev)); +} + +static inline struct qib_pportdata *ppd_from_ibp(struct qib_ibport *ibp) +{ + return container_of(ibp, struct qib_pportdata, ibport_data); +} + +static inline struct qib_ibport *to_iport(struct ib_device *ibdev, u8 port) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + WARN_ON(pidx >= dd->num_pports); + return &dd->pport[pidx].ibport_data; +} + +/* + * values for dd->flags (_device_ related flags) and + */ +#define QIB_HAS_LINK_LATENCY 0x1 /* supports link latency (IB 1.2) */ +#define QIB_INITTED 0x2 /* chip and driver up and initted */ +#define QIB_DOING_RESET 0x4 /* in the middle of doing chip reset */ +#define QIB_PRESENT 0x8 /* chip accesses can be done */ +#define QIB_PIO_FLUSH_WC 0x10 /* Needs Write combining flush for PIO */ +#define QIB_HAS_THRESH_UPDATE 0x40 +#define QIB_HAS_SDMA_TIMEOUT 0x80 +#define QIB_USE_SPCL_TRIG 0x100 /* SpecialTrigger launch enabled */ +#define QIB_NODMA_RTAIL 0x200 /* rcvhdrtail register DMA enabled */ +#define QIB_HAS_INTX 0x800 /* Supports INTx interrupts */ +#define QIB_HAS_SEND_DMA 0x1000 /* Supports Send DMA */ +#define QIB_HAS_VLSUPP 0x2000 /* Supports multiple VLs; PBC different */ +#define QIB_HAS_HDRSUPP 0x4000 /* Supports header suppression */ +#define QIB_BADINTR 0x8000 /* severe interrupt problems */ +#define QIB_DCA_ENABLED 0x10000 /* Direct Cache Access enabled */ +#define QIB_HAS_QSFP 0x20000 /* device (card instance) has QSFP */ + +/* + * values for ppd->lflags (_ib_port_ related flags) + */ +#define QIBL_LINKV 0x1 /* IB link state valid */ +#define QIBL_LINKDOWN 0x8 /* IB link is down */ +#define QIBL_LINKINIT 0x10 /* IB link level is up */ +#define QIBL_LINKARMED 0x20 /* IB link is ARMED */ +#define QIBL_LINKACTIVE 0x40 /* IB link is ACTIVE */ +/* leave a gap for more IB-link state */ +#define QIBL_IB_AUTONEG_INPROG 0x1000 /* non-IBTA DDR/QDR neg active */ +#define QIBL_IB_AUTONEG_FAILED 0x2000 /* non-IBTA DDR/QDR neg failed */ +#define QIBL_IB_LINK_DISABLED 0x4000 /* Linkdown-disable forced, + * Do not try to bring up */ +#define QIBL_IB_FORCE_NOTIFY 0x8000 /* force notify on next ib change */ + +/* IB dword length mask in PBC (lower 11 bits); same for all chips */ +#define QIB_PBC_LENGTH_MASK ((1 << 11) - 1) + + +/* ctxt_flag bit offsets */ + /* waiting for a packet to arrive */ +#define QIB_CTXT_WAITING_RCV 2 + /* master has not finished initializing */ +#define QIB_CTXT_MASTER_UNINIT 4 + /* waiting for an urgent packet to arrive */ +#define QIB_CTXT_WAITING_URG 5 + +/* free up 
any allocated data at closes */ +void qib_free_data(struct qib_ctxtdata *dd); +void qib_chg_pioavailkernel(struct qib_devdata *, unsigned, unsigned, + u32, struct qib_ctxtdata *); +struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *, + const struct pci_device_id *); +struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *, + const struct pci_device_id *); +struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *, + const struct pci_device_id *); +void qib_free_devdata(struct qib_devdata *); +struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra); + +#define QIB_TWSI_NO_DEV 0xFF +/* Below qib_twsi_ functions must be called with eep_lock held */ +int qib_twsi_reset(struct qib_devdata *dd); +int qib_twsi_blk_rd(struct qib_devdata *dd, int dev, int addr, void *buffer, + int len); +int qib_twsi_blk_wr(struct qib_devdata *dd, int dev, int addr, + const void *buffer, int len); +void qib_get_eeprom_info(struct qib_devdata *); +void qib_dump_lookup_output_queue(struct qib_devdata *); +void qib_force_pio_avail_update(struct qib_devdata *); +void qib_clear_symerror_on_linkup(struct timer_list *t); + +/* + * Set LED override, only the two LSBs have "public" meaning, but + * any non-zero value substitutes them for the Link and LinkTrain + * LED states. + */ +#define QIB_LED_PHYS 1 /* Physical (linktraining) GREEN LED */ +#define QIB_LED_LOG 2 /* Logical (link) YELLOW LED */ +void qib_set_led_override(struct qib_pportdata *ppd, unsigned int val); + +/* send dma routines */ +int qib_setup_sdma(struct qib_pportdata *); +void qib_teardown_sdma(struct qib_pportdata *); +void __qib_sdma_intr(struct qib_pportdata *); +void qib_sdma_intr(struct qib_pportdata *); +void qib_user_sdma_send_desc(struct qib_pportdata *dd, + struct list_head *pktlist); +int qib_sdma_verbs_send(struct qib_pportdata *, struct rvt_sge_state *, + u32, struct qib_verbs_txreq *); +/* ppd->sdma_lock should be locked before calling this. */ +int qib_sdma_make_progress(struct qib_pportdata *dd); + +static inline int qib_sdma_empty(const struct qib_pportdata *ppd) +{ + return ppd->sdma_descq_added == ppd->sdma_descq_removed; +} + +/* must be called under qib_sdma_lock */ +static inline u16 qib_sdma_descq_freecnt(const struct qib_pportdata *ppd) +{ + return ppd->sdma_descq_cnt - + (ppd->sdma_descq_added - ppd->sdma_descq_removed) - 1; +} + +static inline int __qib_sdma_running(struct qib_pportdata *ppd) +{ + return ppd->sdma_state.current_state == qib_sdma_state_s99_running; +} +int qib_sdma_running(struct qib_pportdata *); +void dump_sdma_state(struct qib_pportdata *ppd); +void __qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events); +void qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events); + +/* + * number of words used for protocol header if not set by qib_userinit(); + */ +#define QIB_DFLT_RCVHDRSIZE 9 + +/* + * We need to be able to handle an IB header of at least 24 dwords. + * We need the rcvhdrq large enough to handle largest IB header, but + * still have room for a 2KB MTU standard IB packet. + * Additionally, some processor/memory controller combinations + * benefit quite strongly from having the DMA'ed data be cacheline + * aligned and a cacheline multiple, so we set the size to 32 dwords + * (2 64-byte primary cachelines for pretty much all processors of + * interest). The alignment hurts nothing, other than using somewhat + * more memory. 
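+ *
+ * Spelled out: 32 dwords * 4 bytes = 128 bytes, i.e. exactly two
+ * 64-byte cachelines, while the largest expected IB header is
+ * 24 dwords (96 bytes), leaving 8 dwords (32 bytes) of headroom
+ * per entry.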
+ */ +#define QIB_RCVHDR_ENTSIZE 32 + +int qib_get_user_pages(unsigned long, size_t, struct page **); +void qib_release_user_pages(struct page **, size_t); +int qib_eeprom_read(struct qib_devdata *, u8, void *, int); +int qib_eeprom_write(struct qib_devdata *, u8, const void *, int); +u32 __iomem *qib_getsendbuf_range(struct qib_devdata *, u32 *, u32, u32); +void qib_sendbuf_done(struct qib_devdata *, unsigned); + +static inline void qib_clear_rcvhdrtail(const struct qib_ctxtdata *rcd) +{ + *((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 qib_get_rcvhdrtail(const struct qib_ctxtdata *rcd) +{ + /* + * volatile because it's a DMA target from the chip, routine is + * inlined, and don't want register caching or reordering. + */ + return (u32) le64_to_cpu( + *((volatile __le64 *)rcd->rcvhdrtail_kvaddr)); /* DMA'ed */ +} + +static inline u32 qib_get_hdrqtail(const struct qib_ctxtdata *rcd) +{ + const struct qib_devdata *dd = rcd->dd; + u32 hdrqtail; + + if (dd->flags & QIB_NODMA_RTAIL) { + __le32 *rhf_addr; + u32 seq; + + rhf_addr = (__le32 *) rcd->rcvhdrq + + rcd->head + dd->rhf_offset; + seq = qib_hdrget_seq(rhf_addr); + hdrqtail = rcd->head; + if (seq == rcd->seq_cnt) + hdrqtail++; + } else + hdrqtail = qib_get_rcvhdrtail(rcd); + + return hdrqtail; +} + +/* + * sysfs interface. + */ + +extern const char ib_qib_version[]; + +int qib_device_create(struct qib_devdata *); +void qib_device_remove(struct qib_devdata *); + +int qib_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj); +int qib_verbs_register_sysfs(struct qib_devdata *); +void qib_verbs_unregister_sysfs(struct qib_devdata *); +/* Hook for sysfs read of QSFP */ +extern int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len); + +int __init qib_init_qibfs(void); +int __exit qib_exit_qibfs(void); + +int qibfs_add(struct qib_devdata *); +int qibfs_remove(struct qib_devdata *); + +int qib_pcie_init(struct pci_dev *, const struct pci_device_id *); +int qib_pcie_ddinit(struct qib_devdata *, struct pci_dev *, + const struct pci_device_id *); +void qib_pcie_ddcleanup(struct qib_devdata *); +int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent); +void qib_free_irq(struct qib_devdata *dd); +int qib_reinit_intr(struct qib_devdata *dd); +void qib_pcie_getcmd(struct qib_devdata *, u16 *, u8 *, u8 *); +void qib_pcie_reenable(struct qib_devdata *, u16, u8, u8); +/* interrupts for device */ +u64 qib_int_counter(struct qib_devdata *); +/* interrupt for all devices */ +u64 qib_sps_ints(void); + +/* + * dma_addr wrappers - all 0's invalid for hw + */ +dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long, + size_t, int); +struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +static inline void qib_flush_wc(void) +{ +#if defined(CONFIG_X86_64) + asm volatile("sfence" : : : "memory"); +#else + wmb(); /* no reorder around wc flush */ +#endif +} + +/* global module parameter variables */ +extern unsigned qib_ibmtu; +extern ushort qib_cfgctxts; +extern ushort qib_num_cfg_vls; +extern ushort qib_mini_init; /* If set, do few (ideally 0) writes to chip */ +extern unsigned qib_n_krcv_queues; +extern unsigned qib_sdma_fetch_arb; +extern unsigned qib_compat_ddr_negotiate; +extern int qib_special_trigger; +extern unsigned qib_numa_aware; + +extern struct mutex qib_mutex; + +/* Number of seconds before our card status check... 
*/ +#define STATUS_TIMEOUT 60 + +#define QIB_DRV_NAME "ib_qib" +#define QIB_USER_MINOR_BASE 0 +#define QIB_TRACE_MINOR 127 +#define QIB_DIAGPKT_MINOR 128 +#define QIB_DIAG_MINOR_BASE 129 +#define QIB_NMINORS 255 + +#define PCI_VENDOR_ID_PATHSCALE 0x1fc1 +#define PCI_VENDOR_ID_QLOGIC 0x1077 +#define PCI_DEVICE_ID_QLOGIC_IB_6120 0x10 +#define PCI_DEVICE_ID_QLOGIC_IB_7220 0x7220 +#define PCI_DEVICE_ID_QLOGIC_IB_7322 0x7322 + +/* + * qib_early_err is used (only!) to print early errors before devdata is + * allocated, or when dd->pcidev may not be valid, and at the tail end of + * cleanup when devdata may have been freed, etc. qib_dev_porterr is + * the same as qib_dev_err, but is used when the message really needs + * the IB port# to be definitive as to what's happening.. + * All of these go to the trace log, and the trace log entry is done + * first to avoid possible serial port delays from printk. + */ +#define qib_early_err(dev, fmt, ...) \ + dev_err(dev, fmt, ##__VA_ARGS__) + +#define qib_dev_err(dd, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define qib_dev_warn(dd, fmt, ...) \ + dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) + +#define qib_dev_porterr(dd, port, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (dd)->unit, (port), \ + ##__VA_ARGS__) + +#define qib_devinfo(pcidev, fmt, ...) \ + dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__) + +/* + * this is used for formatting hw error messages... + */ +struct qib_hwerror_msgs { + u64 mask; + const char *msg; + size_t sz; +}; + +#define QLOGIC_IB_HWE_MSG(a, b) { .mask = a, .msg = b } + +/* in qib_intr.c... */ +void qib_format_hwerrors(u64 hwerrs, + const struct qib_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t lmsg); + +void qib_stop_send_queue(struct rvt_qp *qp); +void qib_quiesce_qp(struct rvt_qp *qp); +void qib_flush_qp_waiters(struct rvt_qp *qp); +int qib_mtu_to_path_mtu(u32 mtu); +u32 qib_mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu); +void qib_notify_error_qp(struct rvt_qp *qp); +int qib_get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr); + +#endif /* _QIB_KERNEL_H */ diff --git a/drivers/infiniband/hw/qib/qib_6120_regs.h b/drivers/infiniband/hw/qib/qib_6120_regs.h new file mode 100644 index 0000000..e16cb6f --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_6120_regs.h @@ -0,0 +1,977 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* This file is mechanically generated from RTL. Any hand-edits will be lost! */ + +#define QIB_6120_Revision_OFFS 0x0 +#define QIB_6120_Revision_R_Simulator_LSB 0x3F +#define QIB_6120_Revision_R_Simulator_RMASK 0x1 +#define QIB_6120_Revision_Reserved_LSB 0x28 +#define QIB_6120_Revision_Reserved_RMASK 0x7FFFFF +#define QIB_6120_Revision_BoardID_LSB 0x20 +#define QIB_6120_Revision_BoardID_RMASK 0xFF +#define QIB_6120_Revision_R_SW_LSB 0x18 +#define QIB_6120_Revision_R_SW_RMASK 0xFF +#define QIB_6120_Revision_R_Arch_LSB 0x10 +#define QIB_6120_Revision_R_Arch_RMASK 0xFF +#define QIB_6120_Revision_R_ChipRevMajor_LSB 0x8 +#define QIB_6120_Revision_R_ChipRevMajor_RMASK 0xFF +#define QIB_6120_Revision_R_ChipRevMinor_LSB 0x0 +#define QIB_6120_Revision_R_ChipRevMinor_RMASK 0xFF + +#define QIB_6120_Control_OFFS 0x8 +#define QIB_6120_Control_TxLatency_LSB 0x4 +#define QIB_6120_Control_TxLatency_RMASK 0x1 +#define QIB_6120_Control_PCIERetryBufDiagEn_LSB 0x3 +#define QIB_6120_Control_PCIERetryBufDiagEn_RMASK 0x1 +#define QIB_6120_Control_LinkEn_LSB 0x2 +#define QIB_6120_Control_LinkEn_RMASK 0x1 +#define QIB_6120_Control_FreezeMode_LSB 0x1 +#define QIB_6120_Control_FreezeMode_RMASK 0x1 +#define QIB_6120_Control_SyncReset_LSB 0x0 +#define QIB_6120_Control_SyncReset_RMASK 0x1 + +#define QIB_6120_PageAlign_OFFS 0x10 + +#define QIB_6120_PortCnt_OFFS 0x18 + +#define QIB_6120_SendRegBase_OFFS 0x30 + +#define QIB_6120_UserRegBase_OFFS 0x38 + +#define QIB_6120_CntrRegBase_OFFS 0x40 + +#define QIB_6120_Scratch_OFFS 0x48 +#define QIB_6120_Scratch_TopHalf_LSB 0x20 +#define QIB_6120_Scratch_TopHalf_RMASK 0xFFFFFFFF +#define QIB_6120_Scratch_BottomHalf_LSB 0x0 +#define QIB_6120_Scratch_BottomHalf_RMASK 0xFFFFFFFF + +#define QIB_6120_IntBlocked_OFFS 0x60 +#define QIB_6120_IntBlocked_ErrorIntBlocked_LSB 0x1F +#define QIB_6120_IntBlocked_ErrorIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_PioSetIntBlocked_LSB 0x1E +#define QIB_6120_IntBlocked_PioSetIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_PioBufAvailIntBlocked_LSB 0x1D +#define QIB_6120_IntBlocked_PioBufAvailIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_assertGPIOIntBlocked_LSB 0x1C +#define QIB_6120_IntBlocked_assertGPIOIntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_Reserved_LSB 0xF +#define QIB_6120_IntBlocked_Reserved_RMASK 0x1FFF +#define QIB_6120_IntBlocked_RcvAvail4IntBlocked_LSB 0x10 +#define QIB_6120_IntBlocked_RcvAvail4IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail3IntBlocked_LSB 0xF +#define QIB_6120_IntBlocked_RcvAvail3IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail2IntBlocked_LSB 0xE +#define QIB_6120_IntBlocked_RcvAvail2IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail1IntBlocked_LSB 0xD +#define QIB_6120_IntBlocked_RcvAvail1IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvAvail0IntBlocked_LSB 0xC +#define QIB_6120_IntBlocked_RcvAvail0IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_Reserved1_LSB 0x5 +#define QIB_6120_IntBlocked_Reserved1_RMASK 0x7F +#define QIB_6120_IntBlocked_RcvUrg4IntBlocked_LSB 0x4 
+#define QIB_6120_IntBlocked_RcvUrg4IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg3IntBlocked_LSB 0x3 +#define QIB_6120_IntBlocked_RcvUrg3IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg2IntBlocked_LSB 0x2 +#define QIB_6120_IntBlocked_RcvUrg2IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg1IntBlocked_LSB 0x1 +#define QIB_6120_IntBlocked_RcvUrg1IntBlocked_RMASK 0x1 +#define QIB_6120_IntBlocked_RcvUrg0IntBlocked_LSB 0x0 +#define QIB_6120_IntBlocked_RcvUrg0IntBlocked_RMASK 0x1 + +#define QIB_6120_IntMask_OFFS 0x68 +#define QIB_6120_IntMask_ErrorIntMask_LSB 0x1F +#define QIB_6120_IntMask_ErrorIntMask_RMASK 0x1 +#define QIB_6120_IntMask_PioSetIntMask_LSB 0x1E +#define QIB_6120_IntMask_PioSetIntMask_RMASK 0x1 +#define QIB_6120_IntMask_PioBufAvailIntMask_LSB 0x1D +#define QIB_6120_IntMask_PioBufAvailIntMask_RMASK 0x1 +#define QIB_6120_IntMask_assertGPIOIntMask_LSB 0x1C +#define QIB_6120_IntMask_assertGPIOIntMask_RMASK 0x1 +#define QIB_6120_IntMask_Reserved_LSB 0x11 +#define QIB_6120_IntMask_Reserved_RMASK 0x7FF +#define QIB_6120_IntMask_RcvAvail4IntMask_LSB 0x10 +#define QIB_6120_IntMask_RcvAvail4IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail3IntMask_LSB 0xF +#define QIB_6120_IntMask_RcvAvail3IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail2IntMask_LSB 0xE +#define QIB_6120_IntMask_RcvAvail2IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail1IntMask_LSB 0xD +#define QIB_6120_IntMask_RcvAvail1IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvAvail0IntMask_LSB 0xC +#define QIB_6120_IntMask_RcvAvail0IntMask_RMASK 0x1 +#define QIB_6120_IntMask_Reserved1_LSB 0x5 +#define QIB_6120_IntMask_Reserved1_RMASK 0x7F +#define QIB_6120_IntMask_RcvUrg4IntMask_LSB 0x4 +#define QIB_6120_IntMask_RcvUrg4IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg3IntMask_LSB 0x3 +#define QIB_6120_IntMask_RcvUrg3IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg2IntMask_LSB 0x2 +#define QIB_6120_IntMask_RcvUrg2IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg1IntMask_LSB 0x1 +#define QIB_6120_IntMask_RcvUrg1IntMask_RMASK 0x1 +#define QIB_6120_IntMask_RcvUrg0IntMask_LSB 0x0 +#define QIB_6120_IntMask_RcvUrg0IntMask_RMASK 0x1 + +#define QIB_6120_IntStatus_OFFS 0x70 +#define QIB_6120_IntStatus_Error_LSB 0x1F +#define QIB_6120_IntStatus_Error_RMASK 0x1 +#define QIB_6120_IntStatus_PioSent_LSB 0x1E +#define QIB_6120_IntStatus_PioSent_RMASK 0x1 +#define QIB_6120_IntStatus_PioBufAvail_LSB 0x1D +#define QIB_6120_IntStatus_PioBufAvail_RMASK 0x1 +#define QIB_6120_IntStatus_assertGPIO_LSB 0x1C +#define QIB_6120_IntStatus_assertGPIO_RMASK 0x1 +#define QIB_6120_IntStatus_Reserved_LSB 0xF +#define QIB_6120_IntStatus_Reserved_RMASK 0x1FFF +#define QIB_6120_IntStatus_RcvAvail4_LSB 0x10 +#define QIB_6120_IntStatus_RcvAvail4_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail3_LSB 0xF +#define QIB_6120_IntStatus_RcvAvail3_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail2_LSB 0xE +#define QIB_6120_IntStatus_RcvAvail2_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail1_LSB 0xD +#define QIB_6120_IntStatus_RcvAvail1_RMASK 0x1 +#define QIB_6120_IntStatus_RcvAvail0_LSB 0xC +#define QIB_6120_IntStatus_RcvAvail0_RMASK 0x1 +#define QIB_6120_IntStatus_Reserved1_LSB 0x5 +#define QIB_6120_IntStatus_Reserved1_RMASK 0x7F +#define QIB_6120_IntStatus_RcvUrg4_LSB 0x4 +#define QIB_6120_IntStatus_RcvUrg4_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg3_LSB 0x3 +#define QIB_6120_IntStatus_RcvUrg3_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg2_LSB 0x2 +#define QIB_6120_IntStatus_RcvUrg2_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg1_LSB 0x1 
+#define QIB_6120_IntStatus_RcvUrg1_RMASK 0x1 +#define QIB_6120_IntStatus_RcvUrg0_LSB 0x0 +#define QIB_6120_IntStatus_RcvUrg0_RMASK 0x1 + +#define QIB_6120_IntClear_OFFS 0x78 +#define QIB_6120_IntClear_ErrorIntClear_LSB 0x1F +#define QIB_6120_IntClear_ErrorIntClear_RMASK 0x1 +#define QIB_6120_IntClear_PioSetIntClear_LSB 0x1E +#define QIB_6120_IntClear_PioSetIntClear_RMASK 0x1 +#define QIB_6120_IntClear_PioBufAvailIntClear_LSB 0x1D +#define QIB_6120_IntClear_PioBufAvailIntClear_RMASK 0x1 +#define QIB_6120_IntClear_assertGPIOIntClear_LSB 0x1C +#define QIB_6120_IntClear_assertGPIOIntClear_RMASK 0x1 +#define QIB_6120_IntClear_Reserved_LSB 0xF +#define QIB_6120_IntClear_Reserved_RMASK 0x1FFF +#define QIB_6120_IntClear_RcvAvail4IntClear_LSB 0x10 +#define QIB_6120_IntClear_RcvAvail4IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail3IntClear_LSB 0xF +#define QIB_6120_IntClear_RcvAvail3IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail2IntClear_LSB 0xE +#define QIB_6120_IntClear_RcvAvail2IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail1IntClear_LSB 0xD +#define QIB_6120_IntClear_RcvAvail1IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvAvail0IntClear_LSB 0xC +#define QIB_6120_IntClear_RcvAvail0IntClear_RMASK 0x1 +#define QIB_6120_IntClear_Reserved1_LSB 0x5 +#define QIB_6120_IntClear_Reserved1_RMASK 0x7F +#define QIB_6120_IntClear_RcvUrg4IntClear_LSB 0x4 +#define QIB_6120_IntClear_RcvUrg4IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg3IntClear_LSB 0x3 +#define QIB_6120_IntClear_RcvUrg3IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg2IntClear_LSB 0x2 +#define QIB_6120_IntClear_RcvUrg2IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg1IntClear_LSB 0x1 +#define QIB_6120_IntClear_RcvUrg1IntClear_RMASK 0x1 +#define QIB_6120_IntClear_RcvUrg0IntClear_LSB 0x0 +#define QIB_6120_IntClear_RcvUrg0IntClear_RMASK 0x1 + +#define QIB_6120_ErrMask_OFFS 0x80 +#define QIB_6120_ErrMask_Reserved_LSB 0x34 +#define QIB_6120_ErrMask_Reserved_RMASK 0xFFF +#define QIB_6120_ErrMask_HardwareErrMask_LSB 0x33 +#define QIB_6120_ErrMask_HardwareErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_ResetNegatedMask_LSB 0x32 +#define QIB_6120_ErrMask_ResetNegatedMask_RMASK 0x1 +#define QIB_6120_ErrMask_InvalidAddrErrMask_LSB 0x31 +#define QIB_6120_ErrMask_InvalidAddrErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_IBStatusChangedMask_LSB 0x30 +#define QIB_6120_ErrMask_IBStatusChangedMask_RMASK 0x1 +#define QIB_6120_ErrMask_Reserved1_LSB 0x26 +#define QIB_6120_ErrMask_Reserved1_RMASK 0x3FF +#define QIB_6120_ErrMask_SendUnsupportedVLErrMask_LSB 0x25 +#define QIB_6120_ErrMask_SendUnsupportedVLErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendUnexpectedPktNumErrMask_LSB 0x24 +#define QIB_6120_ErrMask_SendUnexpectedPktNumErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendPioArmLaunchErrMask_LSB 0x23 +#define QIB_6120_ErrMask_SendPioArmLaunchErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendDroppedDataPktErrMask_LSB 0x22 +#define QIB_6120_ErrMask_SendDroppedDataPktErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendDroppedSmpPktErrMask_LSB 0x21 +#define QIB_6120_ErrMask_SendDroppedSmpPktErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendPktLenErrMask_LSB 0x20 +#define QIB_6120_ErrMask_SendPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendUnderRunErrMask_LSB 0x1F +#define QIB_6120_ErrMask_SendUnderRunErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendMaxPktLenErrMask_LSB 0x1E +#define QIB_6120_ErrMask_SendMaxPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_SendMinPktLenErrMask_LSB 0x1D +#define QIB_6120_ErrMask_SendMinPktLenErrMask_RMASK 0x1 
+#define QIB_6120_ErrMask_Reserved2_LSB 0x12 +#define QIB_6120_ErrMask_Reserved2_RMASK 0x7FF +#define QIB_6120_ErrMask_RcvIBLostLinkErrMask_LSB 0x11 +#define QIB_6120_ErrMask_RcvIBLostLinkErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvHdrErrMask_LSB 0x10 +#define QIB_6120_ErrMask_RcvHdrErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvHdrLenErrMask_LSB 0xF +#define QIB_6120_ErrMask_RcvHdrLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvBadTidErrMask_LSB 0xE +#define QIB_6120_ErrMask_RcvBadTidErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvHdrFullErrMask_LSB 0xD +#define QIB_6120_ErrMask_RcvHdrFullErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvEgrFullErrMask_LSB 0xC +#define QIB_6120_ErrMask_RcvEgrFullErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvBadVersionErrMask_LSB 0xB +#define QIB_6120_ErrMask_RcvBadVersionErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvIBFlowErrMask_LSB 0xA +#define QIB_6120_ErrMask_RcvIBFlowErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvEBPErrMask_LSB 0x9 +#define QIB_6120_ErrMask_RcvEBPErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvUnsupportedVLErrMask_LSB 0x8 +#define QIB_6120_ErrMask_RcvUnsupportedVLErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvUnexpectedCharErrMask_LSB 0x7 +#define QIB_6120_ErrMask_RcvUnexpectedCharErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvShortPktLenErrMask_LSB 0x6 +#define QIB_6120_ErrMask_RcvShortPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvLongPktLenErrMask_LSB 0x5 +#define QIB_6120_ErrMask_RcvLongPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvMaxPktLenErrMask_LSB 0x4 +#define QIB_6120_ErrMask_RcvMaxPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvMinPktLenErrMask_LSB 0x3 +#define QIB_6120_ErrMask_RcvMinPktLenErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvICRCErrMask_LSB 0x2 +#define QIB_6120_ErrMask_RcvICRCErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvVCRCErrMask_LSB 0x1 +#define QIB_6120_ErrMask_RcvVCRCErrMask_RMASK 0x1 +#define QIB_6120_ErrMask_RcvFormatErrMask_LSB 0x0 +#define QIB_6120_ErrMask_RcvFormatErrMask_RMASK 0x1 + +#define QIB_6120_ErrStatus_OFFS 0x88 +#define QIB_6120_ErrStatus_Reserved_LSB 0x34 +#define QIB_6120_ErrStatus_Reserved_RMASK 0xFFF +#define QIB_6120_ErrStatus_HardwareErr_LSB 0x33 +#define QIB_6120_ErrStatus_HardwareErr_RMASK 0x1 +#define QIB_6120_ErrStatus_ResetNegated_LSB 0x32 +#define QIB_6120_ErrStatus_ResetNegated_RMASK 0x1 +#define QIB_6120_ErrStatus_InvalidAddrErr_LSB 0x31 +#define QIB_6120_ErrStatus_InvalidAddrErr_RMASK 0x1 +#define QIB_6120_ErrStatus_IBStatusChanged_LSB 0x30 +#define QIB_6120_ErrStatus_IBStatusChanged_RMASK 0x1 +#define QIB_6120_ErrStatus_Reserved1_LSB 0x26 +#define QIB_6120_ErrStatus_Reserved1_RMASK 0x3FF +#define QIB_6120_ErrStatus_SendUnsupportedVLErr_LSB 0x25 +#define QIB_6120_ErrStatus_SendUnsupportedVLErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendUnexpectedPktNumErr_LSB 0x24 +#define QIB_6120_ErrStatus_SendUnexpectedPktNumErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendPioArmLaunchErr_LSB 0x23 +#define QIB_6120_ErrStatus_SendPioArmLaunchErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendDroppedDataPktErr_LSB 0x22 +#define QIB_6120_ErrStatus_SendDroppedDataPktErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendDroppedSmpPktErr_LSB 0x21 +#define QIB_6120_ErrStatus_SendDroppedSmpPktErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendPktLenErr_LSB 0x20 +#define QIB_6120_ErrStatus_SendPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendUnderRunErr_LSB 0x1F +#define QIB_6120_ErrStatus_SendUnderRunErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendMaxPktLenErr_LSB 0x1E +#define 
QIB_6120_ErrStatus_SendMaxPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_SendMinPktLenErr_LSB 0x1D +#define QIB_6120_ErrStatus_SendMinPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_Reserved2_LSB 0x12 +#define QIB_6120_ErrStatus_Reserved2_RMASK 0x7FF +#define QIB_6120_ErrStatus_RcvIBLostLinkErr_LSB 0x11 +#define QIB_6120_ErrStatus_RcvIBLostLinkErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvHdrErr_LSB 0x10 +#define QIB_6120_ErrStatus_RcvHdrErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvHdrLenErr_LSB 0xF +#define QIB_6120_ErrStatus_RcvHdrLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvBadTidErr_LSB 0xE +#define QIB_6120_ErrStatus_RcvBadTidErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvHdrFullErr_LSB 0xD +#define QIB_6120_ErrStatus_RcvHdrFullErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvEgrFullErr_LSB 0xC +#define QIB_6120_ErrStatus_RcvEgrFullErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvBadVersionErr_LSB 0xB +#define QIB_6120_ErrStatus_RcvBadVersionErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvIBFlowErr_LSB 0xA +#define QIB_6120_ErrStatus_RcvIBFlowErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvEBPErr_LSB 0x9 +#define QIB_6120_ErrStatus_RcvEBPErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvUnsupportedVLErr_LSB 0x8 +#define QIB_6120_ErrStatus_RcvUnsupportedVLErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvUnexpectedCharErr_LSB 0x7 +#define QIB_6120_ErrStatus_RcvUnexpectedCharErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvShortPktLenErr_LSB 0x6 +#define QIB_6120_ErrStatus_RcvShortPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvLongPktLenErr_LSB 0x5 +#define QIB_6120_ErrStatus_RcvLongPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvMaxPktLenErr_LSB 0x4 +#define QIB_6120_ErrStatus_RcvMaxPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvMinPktLenErr_LSB 0x3 +#define QIB_6120_ErrStatus_RcvMinPktLenErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvICRCErr_LSB 0x2 +#define QIB_6120_ErrStatus_RcvICRCErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvVCRCErr_LSB 0x1 +#define QIB_6120_ErrStatus_RcvVCRCErr_RMASK 0x1 +#define QIB_6120_ErrStatus_RcvFormatErr_LSB 0x0 +#define QIB_6120_ErrStatus_RcvFormatErr_RMASK 0x1 + +#define QIB_6120_ErrClear_OFFS 0x90 +#define QIB_6120_ErrClear_Reserved_LSB 0x34 +#define QIB_6120_ErrClear_Reserved_RMASK 0xFFF +#define QIB_6120_ErrClear_HardwareErrClear_LSB 0x33 +#define QIB_6120_ErrClear_HardwareErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_ResetNegatedClear_LSB 0x32 +#define QIB_6120_ErrClear_ResetNegatedClear_RMASK 0x1 +#define QIB_6120_ErrClear_InvalidAddrErrClear_LSB 0x31 +#define QIB_6120_ErrClear_InvalidAddrErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_IBStatusChangedClear_LSB 0x30 +#define QIB_6120_ErrClear_IBStatusChangedClear_RMASK 0x1 +#define QIB_6120_ErrClear_Reserved1_LSB 0x26 +#define QIB_6120_ErrClear_Reserved1_RMASK 0x3FF +#define QIB_6120_ErrClear_SendUnsupportedVLErrClear_LSB 0x25 +#define QIB_6120_ErrClear_SendUnsupportedVLErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendUnexpectedPktNumErrClear_LSB 0x24 +#define QIB_6120_ErrClear_SendUnexpectedPktNumErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendPioArmLaunchErrClear_LSB 0x23 +#define QIB_6120_ErrClear_SendPioArmLaunchErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendDroppedDataPktErrClear_LSB 0x22 +#define QIB_6120_ErrClear_SendDroppedDataPktErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendDroppedSmpPktErrClear_LSB 0x21 +#define QIB_6120_ErrClear_SendDroppedSmpPktErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendPktLenErrClear_LSB 0x20 +#define QIB_6120_ErrClear_SendPktLenErrClear_RMASK 0x1 +#define 
QIB_6120_ErrClear_SendUnderRunErrClear_LSB 0x1F +#define QIB_6120_ErrClear_SendUnderRunErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendMaxPktLenErrClear_LSB 0x1E +#define QIB_6120_ErrClear_SendMaxPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_SendMinPktLenErrClear_LSB 0x1D +#define QIB_6120_ErrClear_SendMinPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_Reserved2_LSB 0x12 +#define QIB_6120_ErrClear_Reserved2_RMASK 0x7FF +#define QIB_6120_ErrClear_RcvIBLostLinkErrClear_LSB 0x11 +#define QIB_6120_ErrClear_RcvIBLostLinkErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvHdrErrClear_LSB 0x10 +#define QIB_6120_ErrClear_RcvHdrErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvHdrLenErrClear_LSB 0xF +#define QIB_6120_ErrClear_RcvHdrLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvBadTidErrClear_LSB 0xE +#define QIB_6120_ErrClear_RcvBadTidErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvHdrFullErrClear_LSB 0xD +#define QIB_6120_ErrClear_RcvHdrFullErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvEgrFullErrClear_LSB 0xC +#define QIB_6120_ErrClear_RcvEgrFullErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvBadVersionErrClear_LSB 0xB +#define QIB_6120_ErrClear_RcvBadVersionErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvIBFlowErrClear_LSB 0xA +#define QIB_6120_ErrClear_RcvIBFlowErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvEBPErrClear_LSB 0x9 +#define QIB_6120_ErrClear_RcvEBPErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvUnsupportedVLErrClear_LSB 0x8 +#define QIB_6120_ErrClear_RcvUnsupportedVLErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvUnexpectedCharErrClear_LSB 0x7 +#define QIB_6120_ErrClear_RcvUnexpectedCharErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvShortPktLenErrClear_LSB 0x6 +#define QIB_6120_ErrClear_RcvShortPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvLongPktLenErrClear_LSB 0x5 +#define QIB_6120_ErrClear_RcvLongPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvMaxPktLenErrClear_LSB 0x4 +#define QIB_6120_ErrClear_RcvMaxPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvMinPktLenErrClear_LSB 0x3 +#define QIB_6120_ErrClear_RcvMinPktLenErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvICRCErrClear_LSB 0x2 +#define QIB_6120_ErrClear_RcvICRCErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvVCRCErrClear_LSB 0x1 +#define QIB_6120_ErrClear_RcvVCRCErrClear_RMASK 0x1 +#define QIB_6120_ErrClear_RcvFormatErrClear_LSB 0x0 +#define QIB_6120_ErrClear_RcvFormatErrClear_RMASK 0x1 + +#define QIB_6120_HwErrMask_OFFS 0x98 +#define QIB_6120_HwErrMask_IBCBusFromSPCParityErrMask_LSB 0x3F +#define QIB_6120_HwErrMask_IBCBusFromSPCParityErrMask_RMASK 0x1 +#define QIB_6120_HwErrMask_IBCBusToSPCParityErrMask_LSB 0x3E +#define QIB_6120_HwErrMask_IBCBusToSPCParityErrMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved_LSB 0x3D +#define QIB_6120_HwErrMask_Reserved_RMASK 0x1 +#define QIB_6120_HwErrMask_IBSerdesPClkNotDetectMask_LSB 0x3C +#define QIB_6120_HwErrMask_IBSerdesPClkNotDetectMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PCIESerdesQ0PClkNotDetectMask_LSB 0x3B +#define QIB_6120_HwErrMask_PCIESerdesQ0PClkNotDetectMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PCIESerdesQ1PClkNotDetectMask_LSB 0x3A +#define QIB_6120_HwErrMask_PCIESerdesQ1PClkNotDetectMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved1_LSB 0x39 +#define QIB_6120_HwErrMask_Reserved1_RMASK 0x1 +#define QIB_6120_HwErrMask_IBPLLrfSlipMask_LSB 0x38 +#define QIB_6120_HwErrMask_IBPLLrfSlipMask_RMASK 0x1 +#define QIB_6120_HwErrMask_IBPLLfbSlipMask_LSB 0x37 +#define QIB_6120_HwErrMask_IBPLLfbSlipMask_RMASK 0x1 +#define 
QIB_6120_HwErrMask_PowerOnBISTFailedMask_LSB 0x36 +#define QIB_6120_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved2_LSB 0x33 +#define QIB_6120_HwErrMask_Reserved2_RMASK 0x7 +#define QIB_6120_HwErrMask_RXEMemParityErrMask_LSB 0x2C +#define QIB_6120_HwErrMask_RXEMemParityErrMask_RMASK 0x7F +#define QIB_6120_HwErrMask_TXEMemParityErrMask_LSB 0x28 +#define QIB_6120_HwErrMask_TXEMemParityErrMask_RMASK 0xF +#define QIB_6120_HwErrMask_Reserved3_LSB 0x22 +#define QIB_6120_HwErrMask_Reserved3_RMASK 0x3F +#define QIB_6120_HwErrMask_PCIeBusParityErrMask_LSB 0x1F +#define QIB_6120_HwErrMask_PCIeBusParityErrMask_RMASK 0x7 +#define QIB_6120_HwErrMask_PcieCplTimeoutMask_LSB 0x1E +#define QIB_6120_HwErrMask_PcieCplTimeoutMask_RMASK 0x1 +#define QIB_6120_HwErrMask_PoisonedTLPMask_LSB 0x1D +#define QIB_6120_HwErrMask_PoisonedTLPMask_RMASK 0x1 +#define QIB_6120_HwErrMask_Reserved4_LSB 0x6 +#define QIB_6120_HwErrMask_Reserved4_RMASK 0x7FFFFF +#define QIB_6120_HwErrMask_PCIeMemParityErrMask_LSB 0x0 +#define QIB_6120_HwErrMask_PCIeMemParityErrMask_RMASK 0x3F + +#define QIB_6120_HwErrStatus_OFFS 0xA0 +#define QIB_6120_HwErrStatus_IBCBusFromSPCParityErr_LSB 0x3F +#define QIB_6120_HwErrStatus_IBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBCBusToSPCParityErr_LSB 0x3E +#define QIB_6120_HwErrStatus_IBCBusToSPCParityErr_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved_LSB 0x3D +#define QIB_6120_HwErrStatus_Reserved_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBSerdesPClkNotDetect_LSB 0x3C +#define QIB_6120_HwErrStatus_IBSerdesPClkNotDetect_RMASK 0x1 +#define QIB_6120_HwErrStatus_PCIESerdesQ0PClkNotDetect_LSB 0x3B +#define QIB_6120_HwErrStatus_PCIESerdesQ0PClkNotDetect_RMASK 0x1 +#define QIB_6120_HwErrStatus_PCIESerdesQ1PClkNotDetect_LSB 0x3A +#define QIB_6120_HwErrStatus_PCIESerdesQ1PClkNotDetect_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved1_LSB 0x39 +#define QIB_6120_HwErrStatus_Reserved1_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBPLLrfSlip_LSB 0x38 +#define QIB_6120_HwErrStatus_IBPLLrfSlip_RMASK 0x1 +#define QIB_6120_HwErrStatus_IBPLLfbSlip_LSB 0x37 +#define QIB_6120_HwErrStatus_IBPLLfbSlip_RMASK 0x1 +#define QIB_6120_HwErrStatus_PowerOnBISTFailed_LSB 0x36 +#define QIB_6120_HwErrStatus_PowerOnBISTFailed_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved2_LSB 0x33 +#define QIB_6120_HwErrStatus_Reserved2_RMASK 0x7 +#define QIB_6120_HwErrStatus_RXEMemParity_LSB 0x2C +#define QIB_6120_HwErrStatus_RXEMemParity_RMASK 0x7F +#define QIB_6120_HwErrStatus_TXEMemParity_LSB 0x28 +#define QIB_6120_HwErrStatus_TXEMemParity_RMASK 0xF +#define QIB_6120_HwErrStatus_Reserved3_LSB 0x22 +#define QIB_6120_HwErrStatus_Reserved3_RMASK 0x3F +#define QIB_6120_HwErrStatus_PCIeBusParity_LSB 0x1F +#define QIB_6120_HwErrStatus_PCIeBusParity_RMASK 0x7 +#define QIB_6120_HwErrStatus_PcieCplTimeout_LSB 0x1E +#define QIB_6120_HwErrStatus_PcieCplTimeout_RMASK 0x1 +#define QIB_6120_HwErrStatus_PoisenedTLP_LSB 0x1D +#define QIB_6120_HwErrStatus_PoisenedTLP_RMASK 0x1 +#define QIB_6120_HwErrStatus_Reserved4_LSB 0x6 +#define QIB_6120_HwErrStatus_Reserved4_RMASK 0x7FFFFF +#define QIB_6120_HwErrStatus_PCIeMemParity_LSB 0x0 +#define QIB_6120_HwErrStatus_PCIeMemParity_RMASK 0x3F + +#define QIB_6120_HwErrClear_OFFS 0xA8 +#define QIB_6120_HwErrClear_IBCBusFromSPCParityErrClear_LSB 0x3F +#define QIB_6120_HwErrClear_IBCBusFromSPCParityErrClear_RMASK 0x1 +#define QIB_6120_HwErrClear_IBCBusToSPCparityErrClear_LSB 0x3E +#define QIB_6120_HwErrClear_IBCBusToSPCparityErrClear_RMASK 0x1 +#define 
QIB_6120_HwErrClear_Reserved_LSB 0x3D +#define QIB_6120_HwErrClear_Reserved_RMASK 0x1 +#define QIB_6120_HwErrClear_IBSerdesPClkNotDetectClear_LSB 0x3C +#define QIB_6120_HwErrClear_IBSerdesPClkNotDetectClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PCIESerdesQ0PClkNotDetectClear_LSB 0x3B +#define QIB_6120_HwErrClear_PCIESerdesQ0PClkNotDetectClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PCIESerdesQ1PClkNotDetectClear_LSB 0x3A +#define QIB_6120_HwErrClear_PCIESerdesQ1PClkNotDetectClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved1_LSB 0x39 +#define QIB_6120_HwErrClear_Reserved1_RMASK 0x1 +#define QIB_6120_HwErrClear_IBPLLrfSlipClear_LSB 0x38 +#define QIB_6120_HwErrClear_IBPLLrfSlipClear_RMASK 0x1 +#define QIB_6120_HwErrClear_IBPLLfbSlipClear_LSB 0x37 +#define QIB_6120_HwErrClear_IBPLLfbSlipClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PowerOnBISTFailedClear_LSB 0x36 +#define QIB_6120_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved2_LSB 0x33 +#define QIB_6120_HwErrClear_Reserved2_RMASK 0x7 +#define QIB_6120_HwErrClear_RXEMemParityClear_LSB 0x2C +#define QIB_6120_HwErrClear_RXEMemParityClear_RMASK 0x7F +#define QIB_6120_HwErrClear_TXEMemParityClear_LSB 0x28 +#define QIB_6120_HwErrClear_TXEMemParityClear_RMASK 0xF +#define QIB_6120_HwErrClear_Reserved3_LSB 0x22 +#define QIB_6120_HwErrClear_Reserved3_RMASK 0x3F +#define QIB_6120_HwErrClear_PCIeBusParityClr_LSB 0x1F +#define QIB_6120_HwErrClear_PCIeBusParityClr_RMASK 0x7 +#define QIB_6120_HwErrClear_PcieCplTimeoutClear_LSB 0x1E +#define QIB_6120_HwErrClear_PcieCplTimeoutClear_RMASK 0x1 +#define QIB_6120_HwErrClear_PoisonedTLPClear_LSB 0x1D +#define QIB_6120_HwErrClear_PoisonedTLPClear_RMASK 0x1 +#define QIB_6120_HwErrClear_Reserved4_LSB 0x6 +#define QIB_6120_HwErrClear_Reserved4_RMASK 0x7FFFFF +#define QIB_6120_HwErrClear_PCIeMemParityClr_LSB 0x0 +#define QIB_6120_HwErrClear_PCIeMemParityClr_RMASK 0x3F + +#define QIB_6120_HwDiagCtrl_OFFS 0xB0 +#define QIB_6120_HwDiagCtrl_ForceIBCBusFromSPCParityErr_LSB 0x3F +#define QIB_6120_HwDiagCtrl_ForceIBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_ForceIBCBusToSPCParityErr_LSB 0x3E +#define QIB_6120_HwDiagCtrl_ForceIBCBusToSPCParityErr_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_CounterWrEnable_LSB 0x3D +#define QIB_6120_HwDiagCtrl_CounterWrEnable_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_CounterDisable_LSB 0x3C +#define QIB_6120_HwDiagCtrl_CounterDisable_RMASK 0x1 +#define QIB_6120_HwDiagCtrl_Reserved_LSB 0x33 +#define QIB_6120_HwDiagCtrl_Reserved_RMASK 0x1FF +#define QIB_6120_HwDiagCtrl_ForceRxMemParityErr_LSB 0x2C +#define QIB_6120_HwDiagCtrl_ForceRxMemParityErr_RMASK 0x7F +#define QIB_6120_HwDiagCtrl_ForceTxMemparityErr_LSB 0x28 +#define QIB_6120_HwDiagCtrl_ForceTxMemparityErr_RMASK 0xF +#define QIB_6120_HwDiagCtrl_Reserved1_LSB 0x23 +#define QIB_6120_HwDiagCtrl_Reserved1_RMASK 0x1F +#define QIB_6120_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F +#define QIB_6120_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF +#define QIB_6120_HwDiagCtrl_Reserved2_LSB 0x6 +#define QIB_6120_HwDiagCtrl_Reserved2_RMASK 0x1FFFFFF +#define QIB_6120_HwDiagCtrl_forcePCIeMemParity_LSB 0x0 +#define QIB_6120_HwDiagCtrl_forcePCIeMemParity_RMASK 0x3F + +#define QIB_6120_IBCStatus_OFFS 0xC0 +#define QIB_6120_IBCStatus_TxCreditOk_LSB 0x1F +#define QIB_6120_IBCStatus_TxCreditOk_RMASK 0x1 +#define QIB_6120_IBCStatus_TxReady_LSB 0x1E +#define QIB_6120_IBCStatus_TxReady_RMASK 0x1 +#define QIB_6120_IBCStatus_Reserved_LSB 0x7 +#define QIB_6120_IBCStatus_Reserved_RMASK 0x7FFFFF +#define 
QIB_6120_IBCStatus_LinkState_LSB 0x4 +#define QIB_6120_IBCStatus_LinkState_RMASK 0x7 +#define QIB_6120_IBCStatus_LinkTrainingState_LSB 0x0 +#define QIB_6120_IBCStatus_LinkTrainingState_RMASK 0xF + +#define QIB_6120_IBCCtrl_OFFS 0xC8 +#define QIB_6120_IBCCtrl_Loopback_LSB 0x3F +#define QIB_6120_IBCCtrl_Loopback_RMASK 0x1 +#define QIB_6120_IBCCtrl_LinkDownDefaultState_LSB 0x3E +#define QIB_6120_IBCCtrl_LinkDownDefaultState_RMASK 0x1 +#define QIB_6120_IBCCtrl_Reserved_LSB 0x2B +#define QIB_6120_IBCCtrl_Reserved_RMASK 0x7FFFF +#define QIB_6120_IBCCtrl_CreditScale_LSB 0x28 +#define QIB_6120_IBCCtrl_CreditScale_RMASK 0x7 +#define QIB_6120_IBCCtrl_OverrunThreshold_LSB 0x24 +#define QIB_6120_IBCCtrl_OverrunThreshold_RMASK 0xF +#define QIB_6120_IBCCtrl_PhyerrThreshold_LSB 0x20 +#define QIB_6120_IBCCtrl_PhyerrThreshold_RMASK 0xF +#define QIB_6120_IBCCtrl_Reserved1_LSB 0x1F +#define QIB_6120_IBCCtrl_Reserved1_RMASK 0x1 +#define QIB_6120_IBCCtrl_MaxPktLen_LSB 0x14 +#define QIB_6120_IBCCtrl_MaxPktLen_RMASK 0x7FF +#define QIB_6120_IBCCtrl_LinkCmd_LSB 0x12 +#define QIB_6120_IBCCtrl_LinkCmd_RMASK 0x3 +#define QIB_6120_IBCCtrl_LinkInitCmd_LSB 0x10 +#define QIB_6120_IBCCtrl_LinkInitCmd_RMASK 0x3 +#define QIB_6120_IBCCtrl_FlowCtrlWaterMark_LSB 0x8 +#define QIB_6120_IBCCtrl_FlowCtrlWaterMark_RMASK 0xFF +#define QIB_6120_IBCCtrl_FlowCtrlPeriod_LSB 0x0 +#define QIB_6120_IBCCtrl_FlowCtrlPeriod_RMASK 0xFF + +#define QIB_6120_EXTStatus_OFFS 0xD0 +#define QIB_6120_EXTStatus_GPIOIn_LSB 0x30 +#define QIB_6120_EXTStatus_GPIOIn_RMASK 0xFFFF +#define QIB_6120_EXTStatus_Reserved_LSB 0x20 +#define QIB_6120_EXTStatus_Reserved_RMASK 0xFFFF +#define QIB_6120_EXTStatus_Reserved1_LSB 0x10 +#define QIB_6120_EXTStatus_Reserved1_RMASK 0xFFFF +#define QIB_6120_EXTStatus_MemBISTFoundErr_LSB 0xF +#define QIB_6120_EXTStatus_MemBISTFoundErr_RMASK 0x1 +#define QIB_6120_EXTStatus_MemBISTEndTest_LSB 0xE +#define QIB_6120_EXTStatus_MemBISTEndTest_RMASK 0x1 +#define QIB_6120_EXTStatus_Reserved2_LSB 0x0 +#define QIB_6120_EXTStatus_Reserved2_RMASK 0x3FFF + +#define QIB_6120_EXTCtrl_OFFS 0xD8 +#define QIB_6120_EXTCtrl_GPIOOe_LSB 0x30 +#define QIB_6120_EXTCtrl_GPIOOe_RMASK 0xFFFF +#define QIB_6120_EXTCtrl_GPIOInvert_LSB 0x20 +#define QIB_6120_EXTCtrl_GPIOInvert_RMASK 0xFFFF +#define QIB_6120_EXTCtrl_Reserved_LSB 0x4 +#define QIB_6120_EXTCtrl_Reserved_RMASK 0xFFFFFFF +#define QIB_6120_EXTCtrl_LEDPriPortGreenOn_LSB 0x3 +#define QIB_6120_EXTCtrl_LEDPriPortGreenOn_RMASK 0x1 +#define QIB_6120_EXTCtrl_LEDPriPortYellowOn_LSB 0x2 +#define QIB_6120_EXTCtrl_LEDPriPortYellowOn_RMASK 0x1 +#define QIB_6120_EXTCtrl_LEDGblOkGreenOn_LSB 0x1 +#define QIB_6120_EXTCtrl_LEDGblOkGreenOn_RMASK 0x1 +#define QIB_6120_EXTCtrl_LEDGblErrRedOff_LSB 0x0 +#define QIB_6120_EXTCtrl_LEDGblErrRedOff_RMASK 0x1 + +#define QIB_6120_GPIOOut_OFFS 0xE0 + +#define QIB_6120_GPIOMask_OFFS 0xE8 + +#define QIB_6120_GPIOStatus_OFFS 0xF0 + +#define QIB_6120_GPIOClear_OFFS 0xF8 + +#define QIB_6120_RcvCtrl_OFFS 0x100 +#define QIB_6120_RcvCtrl_TailUpd_LSB 0x1F +#define QIB_6120_RcvCtrl_TailUpd_RMASK 0x1 +#define QIB_6120_RcvCtrl_RcvPartitionKeyDisable_LSB 0x1E +#define QIB_6120_RcvCtrl_RcvPartitionKeyDisable_RMASK 0x1 +#define QIB_6120_RcvCtrl_Reserved_LSB 0x15 +#define QIB_6120_RcvCtrl_Reserved_RMASK 0x1FF +#define QIB_6120_RcvCtrl_IntrAvail_LSB 0x10 +#define QIB_6120_RcvCtrl_IntrAvail_RMASK 0x1F +#define QIB_6120_RcvCtrl_Reserved1_LSB 0x9 +#define QIB_6120_RcvCtrl_Reserved1_RMASK 0x7F +#define QIB_6120_RcvCtrl_Reserved2_LSB 0x5 +#define QIB_6120_RcvCtrl_Reserved2_RMASK 0xF +#define 
QIB_6120_RcvCtrl_PortEnable_LSB 0x0 +#define QIB_6120_RcvCtrl_PortEnable_RMASK 0x1F + +#define QIB_6120_RcvBTHQP_OFFS 0x108 +#define QIB_6120_RcvBTHQP_BTHQP_Mask_LSB 0x1E +#define QIB_6120_RcvBTHQP_BTHQP_Mask_RMASK 0x3 +#define QIB_6120_RcvBTHQP_Reserved_LSB 0x18 +#define QIB_6120_RcvBTHQP_Reserved_RMASK 0x3F +#define QIB_6120_RcvBTHQP_RcvBTHQP_LSB 0x0 +#define QIB_6120_RcvBTHQP_RcvBTHQP_RMASK 0xFFFFFF + +#define QIB_6120_RcvHdrSize_OFFS 0x110 + +#define QIB_6120_RcvHdrCnt_OFFS 0x118 + +#define QIB_6120_RcvHdrEntSize_OFFS 0x120 + +#define QIB_6120_RcvTIDBase_OFFS 0x128 + +#define QIB_6120_RcvTIDCnt_OFFS 0x130 + +#define QIB_6120_RcvEgrBase_OFFS 0x138 + +#define QIB_6120_RcvEgrCnt_OFFS 0x140 + +#define QIB_6120_RcvBufBase_OFFS 0x148 + +#define QIB_6120_RcvBufSize_OFFS 0x150 + +#define QIB_6120_RxIntMemBase_OFFS 0x158 + +#define QIB_6120_RxIntMemSize_OFFS 0x160 + +#define QIB_6120_RcvPartitionKey_OFFS 0x168 + +#define QIB_6120_RcvPktLEDCnt_OFFS 0x178 +#define QIB_6120_RcvPktLEDCnt_ONperiod_LSB 0x20 +#define QIB_6120_RcvPktLEDCnt_ONperiod_RMASK 0xFFFFFFFF +#define QIB_6120_RcvPktLEDCnt_OFFperiod_LSB 0x0 +#define QIB_6120_RcvPktLEDCnt_OFFperiod_RMASK 0xFFFFFFFF + +#define QIB_6120_SendCtrl_OFFS 0x1C0 +#define QIB_6120_SendCtrl_Disarm_LSB 0x1F +#define QIB_6120_SendCtrl_Disarm_RMASK 0x1 +#define QIB_6120_SendCtrl_Reserved_LSB 0x17 +#define QIB_6120_SendCtrl_Reserved_RMASK 0xFF +#define QIB_6120_SendCtrl_DisarmPIOBuf_LSB 0x10 +#define QIB_6120_SendCtrl_DisarmPIOBuf_RMASK 0x7F +#define QIB_6120_SendCtrl_Reserved1_LSB 0x4 +#define QIB_6120_SendCtrl_Reserved1_RMASK 0xFFF +#define QIB_6120_SendCtrl_PIOEnable_LSB 0x3 +#define QIB_6120_SendCtrl_PIOEnable_RMASK 0x1 +#define QIB_6120_SendCtrl_PIOBufAvailUpd_LSB 0x2 +#define QIB_6120_SendCtrl_PIOBufAvailUpd_RMASK 0x1 +#define QIB_6120_SendCtrl_PIOIntBufAvail_LSB 0x1 +#define QIB_6120_SendCtrl_PIOIntBufAvail_RMASK 0x1 +#define QIB_6120_SendCtrl_Abort_LSB 0x0 +#define QIB_6120_SendCtrl_Abort_RMASK 0x1 + +#define QIB_6120_SendPIOBufBase_OFFS 0x1C8 +#define QIB_6120_SendPIOBufBase_Reserved_LSB 0x35 +#define QIB_6120_SendPIOBufBase_Reserved_RMASK 0x7FF +#define QIB_6120_SendPIOBufBase_BaseAddr_LargePIO_LSB 0x20 +#define QIB_6120_SendPIOBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF +#define QIB_6120_SendPIOBufBase_Reserved1_LSB 0x15 +#define QIB_6120_SendPIOBufBase_Reserved1_RMASK 0x7FF +#define QIB_6120_SendPIOBufBase_BaseAddr_SmallPIO_LSB 0x0 +#define QIB_6120_SendPIOBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF + +#define QIB_6120_SendPIOSize_OFFS 0x1D0 +#define QIB_6120_SendPIOSize_Reserved_LSB 0x2D +#define QIB_6120_SendPIOSize_Reserved_RMASK 0xFFFFF +#define QIB_6120_SendPIOSize_Size_LargePIO_LSB 0x20 +#define QIB_6120_SendPIOSize_Size_LargePIO_RMASK 0x1FFF +#define QIB_6120_SendPIOSize_Reserved1_LSB 0xC +#define QIB_6120_SendPIOSize_Reserved1_RMASK 0xFFFFF +#define QIB_6120_SendPIOSize_Size_SmallPIO_LSB 0x0 +#define QIB_6120_SendPIOSize_Size_SmallPIO_RMASK 0xFFF + +#define QIB_6120_SendPIOBufCnt_OFFS 0x1D8 +#define QIB_6120_SendPIOBufCnt_Reserved_LSB 0x24 +#define QIB_6120_SendPIOBufCnt_Reserved_RMASK 0xFFFFFFF +#define QIB_6120_SendPIOBufCnt_Num_LargePIO_LSB 0x20 +#define QIB_6120_SendPIOBufCnt_Num_LargePIO_RMASK 0xF +#define QIB_6120_SendPIOBufCnt_Reserved1_LSB 0x9 +#define QIB_6120_SendPIOBufCnt_Reserved1_RMASK 0x7FFFFF +#define QIB_6120_SendPIOBufCnt_Num_SmallPIO_LSB 0x0 +#define QIB_6120_SendPIOBufCnt_Num_SmallPIO_RMASK 0x1FF + +#define QIB_6120_SendPIOAvailAddr_OFFS 0x1E0 +#define QIB_6120_SendPIOAvailAddr_SendPIOAvailAddr_LSB 0x6 +#define 
QIB_6120_SendPIOAvailAddr_SendPIOAvailAddr_RMASK 0x3FFFFFFFF +#define QIB_6120_SendPIOAvailAddr_Reserved_LSB 0x0 +#define QIB_6120_SendPIOAvailAddr_Reserved_RMASK 0x3F + +#define QIB_6120_SendBufErr0_OFFS 0x240 +#define QIB_6120_SendBufErr0_SendBufErrPIO_63_0_LSB 0x0 +#define QIB_6120_SendBufErr0_SendBufErrPIO_63_0_RMASK 0x0 + +#define QIB_6120_RcvHdrAddr0_OFFS 0x280 +#define QIB_6120_RcvHdrAddr0_RcvHdrAddr0_LSB 0x2 +#define QIB_6120_RcvHdrAddr0_RcvHdrAddr0_RMASK 0x3FFFFFFFFF +#define QIB_6120_RcvHdrAddr0_Reserved_LSB 0x0 +#define QIB_6120_RcvHdrAddr0_Reserved_RMASK 0x3 + +#define QIB_6120_RcvHdrTailAddr0_OFFS 0x300 +#define QIB_6120_RcvHdrTailAddr0_RcvHdrTailAddr0_LSB 0x2 +#define QIB_6120_RcvHdrTailAddr0_RcvHdrTailAddr0_RMASK 0x3FFFFFFFFF +#define QIB_6120_RcvHdrTailAddr0_Reserved_LSB 0x0 +#define QIB_6120_RcvHdrTailAddr0_Reserved_RMASK 0x3 + +#define QIB_6120_SerdesCfg0_OFFS 0x3C0 +#define QIB_6120_SerdesCfg0_DisableIBTxIdleDetect_LSB 0x3F +#define QIB_6120_SerdesCfg0_DisableIBTxIdleDetect_RMASK 0x1 +#define QIB_6120_SerdesCfg0_Reserved_LSB 0x38 +#define QIB_6120_SerdesCfg0_Reserved_RMASK 0x7F +#define QIB_6120_SerdesCfg0_RxEqCtl_LSB 0x36 +#define QIB_6120_SerdesCfg0_RxEqCtl_RMASK 0x3 +#define QIB_6120_SerdesCfg0_TxTermAdj_LSB 0x34 +#define QIB_6120_SerdesCfg0_TxTermAdj_RMASK 0x3 +#define QIB_6120_SerdesCfg0_RxTermAdj_LSB 0x32 +#define QIB_6120_SerdesCfg0_RxTermAdj_RMASK 0x3 +#define QIB_6120_SerdesCfg0_TermAdj1_LSB 0x31 +#define QIB_6120_SerdesCfg0_TermAdj1_RMASK 0x1 +#define QIB_6120_SerdesCfg0_TermAdj0_LSB 0x30 +#define QIB_6120_SerdesCfg0_TermAdj0_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKA_LSB 0x2F +#define QIB_6120_SerdesCfg0_LPBKA_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKB_LSB 0x2E +#define QIB_6120_SerdesCfg0_LPBKB_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKC_LSB 0x2D +#define QIB_6120_SerdesCfg0_LPBKC_RMASK 0x1 +#define QIB_6120_SerdesCfg0_LPBKD_LSB 0x2C +#define QIB_6120_SerdesCfg0_LPBKD_RMASK 0x1 +#define QIB_6120_SerdesCfg0_PW_LSB 0x2B +#define QIB_6120_SerdesCfg0_PW_RMASK 0x1 +#define QIB_6120_SerdesCfg0_RefSel_LSB 0x29 +#define QIB_6120_SerdesCfg0_RefSel_RMASK 0x3 +#define QIB_6120_SerdesCfg0_ParReset_LSB 0x28 +#define QIB_6120_SerdesCfg0_ParReset_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ParLPBK_LSB 0x27 +#define QIB_6120_SerdesCfg0_ParLPBK_RMASK 0x1 +#define QIB_6120_SerdesCfg0_OffsetEn_LSB 0x26 +#define QIB_6120_SerdesCfg0_OffsetEn_RMASK 0x1 +#define QIB_6120_SerdesCfg0_Offset_LSB 0x1E +#define QIB_6120_SerdesCfg0_Offset_RMASK 0xFF +#define QIB_6120_SerdesCfg0_L2PwrDn_LSB 0x1D +#define QIB_6120_SerdesCfg0_L2PwrDn_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetPLL_LSB 0x1C +#define QIB_6120_SerdesCfg0_ResetPLL_RMASK 0x1 +#define QIB_6120_SerdesCfg0_RxTermEnX_LSB 0x18 +#define QIB_6120_SerdesCfg0_RxTermEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_BeaconTxEnX_LSB 0x14 +#define QIB_6120_SerdesCfg0_BeaconTxEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_RxDetEnX_LSB 0x10 +#define QIB_6120_SerdesCfg0_RxDetEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_TxIdeEnX_LSB 0xC +#define QIB_6120_SerdesCfg0_TxIdeEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_RxIdleEnX_LSB 0x8 +#define QIB_6120_SerdesCfg0_RxIdleEnX_RMASK 0xF +#define QIB_6120_SerdesCfg0_L1PwrDnA_LSB 0x7 +#define QIB_6120_SerdesCfg0_L1PwrDnA_RMASK 0x1 +#define QIB_6120_SerdesCfg0_L1PwrDnB_LSB 0x6 +#define QIB_6120_SerdesCfg0_L1PwrDnB_RMASK 0x1 +#define QIB_6120_SerdesCfg0_L1PwrDnC_LSB 0x5 +#define QIB_6120_SerdesCfg0_L1PwrDnC_RMASK 0x1 +#define QIB_6120_SerdesCfg0_L1PwrDnD_LSB 0x4 +#define QIB_6120_SerdesCfg0_L1PwrDnD_RMASK 0x1 +#define 
QIB_6120_SerdesCfg0_ResetA_LSB 0x3 +#define QIB_6120_SerdesCfg0_ResetA_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetB_LSB 0x2 +#define QIB_6120_SerdesCfg0_ResetB_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetC_LSB 0x1 +#define QIB_6120_SerdesCfg0_ResetC_RMASK 0x1 +#define QIB_6120_SerdesCfg0_ResetD_LSB 0x0 +#define QIB_6120_SerdesCfg0_ResetD_RMASK 0x1 + +#define QIB_6120_SerdesStat_OFFS 0x3D0 +#define QIB_6120_SerdesStat_Reserved_LSB 0xC +#define QIB_6120_SerdesStat_Reserved_RMASK 0xFFFFFFFFFFFFF +#define QIB_6120_SerdesStat_BeaconDetA_LSB 0xB +#define QIB_6120_SerdesStat_BeaconDetA_RMASK 0x1 +#define QIB_6120_SerdesStat_BeaconDetB_LSB 0xA +#define QIB_6120_SerdesStat_BeaconDetB_RMASK 0x1 +#define QIB_6120_SerdesStat_BeaconDetC_LSB 0x9 +#define QIB_6120_SerdesStat_BeaconDetC_RMASK 0x1 +#define QIB_6120_SerdesStat_BeaconDetD_LSB 0x8 +#define QIB_6120_SerdesStat_BeaconDetD_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetA_LSB 0x7 +#define QIB_6120_SerdesStat_RxDetA_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetB_LSB 0x6 +#define QIB_6120_SerdesStat_RxDetB_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetC_LSB 0x5 +#define QIB_6120_SerdesStat_RxDetC_RMASK 0x1 +#define QIB_6120_SerdesStat_RxDetD_LSB 0x4 +#define QIB_6120_SerdesStat_RxDetD_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetA_LSB 0x3 +#define QIB_6120_SerdesStat_TxIdleDetA_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetB_LSB 0x2 +#define QIB_6120_SerdesStat_TxIdleDetB_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetC_LSB 0x1 +#define QIB_6120_SerdesStat_TxIdleDetC_RMASK 0x1 +#define QIB_6120_SerdesStat_TxIdleDetD_LSB 0x0 +#define QIB_6120_SerdesStat_TxIdleDetD_RMASK 0x1 + +#define QIB_6120_XGXSCfg_OFFS 0x3D8 +#define QIB_6120_XGXSCfg_ArmLaunchErrorDisable_LSB 0x3F +#define QIB_6120_XGXSCfg_ArmLaunchErrorDisable_RMASK 0x1 +#define QIB_6120_XGXSCfg_Reserved_LSB 0x17 +#define QIB_6120_XGXSCfg_Reserved_RMASK 0xFFFFFFFFFF +#define QIB_6120_XGXSCfg_polarity_inv_LSB 0x13 +#define QIB_6120_XGXSCfg_polarity_inv_RMASK 0xF +#define QIB_6120_XGXSCfg_link_sync_mask_LSB 0x9 +#define QIB_6120_XGXSCfg_link_sync_mask_RMASK 0x3FF +#define QIB_6120_XGXSCfg_port_addr_LSB 0x4 +#define QIB_6120_XGXSCfg_port_addr_RMASK 0x1F +#define QIB_6120_XGXSCfg_mdd_30_LSB 0x3 +#define QIB_6120_XGXSCfg_mdd_30_RMASK 0x1 +#define QIB_6120_XGXSCfg_xcv_resetn_LSB 0x2 +#define QIB_6120_XGXSCfg_xcv_resetn_RMASK 0x1 +#define QIB_6120_XGXSCfg_Reserved1_LSB 0x1 +#define QIB_6120_XGXSCfg_Reserved1_RMASK 0x1 +#define QIB_6120_XGXSCfg_tx_rx_resetn_LSB 0x0 +#define QIB_6120_XGXSCfg_tx_rx_resetn_RMASK 0x1 + +#define QIB_6120_LBIntCnt_OFFS 0x12000 + +#define QIB_6120_LBFlowStallCnt_OFFS 0x12008 + +#define QIB_6120_TxUnsupVLErrCnt_OFFS 0x12018 + +#define QIB_6120_TxDataPktCnt_OFFS 0x12020 + +#define QIB_6120_TxFlowPktCnt_OFFS 0x12028 + +#define QIB_6120_TxDwordCnt_OFFS 0x12030 + +#define QIB_6120_TxLenErrCnt_OFFS 0x12038 + +#define QIB_6120_TxMaxMinLenErrCnt_OFFS 0x12040 + +#define QIB_6120_TxUnderrunCnt_OFFS 0x12048 + +#define QIB_6120_TxFlowStallCnt_OFFS 0x12050 + +#define QIB_6120_TxDroppedPktCnt_OFFS 0x12058 + +#define QIB_6120_RxDroppedPktCnt_OFFS 0x12060 + +#define QIB_6120_RxDataPktCnt_OFFS 0x12068 + +#define QIB_6120_RxFlowPktCnt_OFFS 0x12070 + +#define QIB_6120_RxDwordCnt_OFFS 0x12078 + +#define QIB_6120_RxLenErrCnt_OFFS 0x12080 + +#define QIB_6120_RxMaxMinLenErrCnt_OFFS 0x12088 + +#define QIB_6120_RxICRCErrCnt_OFFS 0x12090 + +#define QIB_6120_RxVCRCErrCnt_OFFS 0x12098 + +#define QIB_6120_RxFlowCtrlErrCnt_OFFS 0x120A0 + +#define QIB_6120_RxBadFormatCnt_OFFS 0x120A8 + +#define 
QIB_6120_RxLinkProblemCnt_OFFS 0x120B0 + +#define QIB_6120_RxEBPCnt_OFFS 0x120B8 + +#define QIB_6120_RxLPCRCErrCnt_OFFS 0x120C0 + +#define QIB_6120_RxBufOvflCnt_OFFS 0x120C8 + +#define QIB_6120_RxTIDFullErrCnt_OFFS 0x120D0 + +#define QIB_6120_RxTIDValidErrCnt_OFFS 0x120D8 + +#define QIB_6120_RxPKeyMismatchCnt_OFFS 0x120E0 + +#define QIB_6120_RxP0HdrEgrOvflCnt_OFFS 0x120E8 + +#define QIB_6120_IBStatusChangeCnt_OFFS 0x12140 + +#define QIB_6120_IBLinkErrRecoveryCnt_OFFS 0x12148 + +#define QIB_6120_IBLinkDownedCnt_OFFS 0x12150 + +#define QIB_6120_IBSymbolErrCnt_OFFS 0x12158 + +#define QIB_6120_PcieRetryBufDiagQwordCnt_OFFS 0x12170 + +#define QIB_6120_RcvEgrArray0_OFFS 0x14000 + +#define QIB_6120_RcvTIDArray0_OFFS 0x54000 + +#define QIB_6120_PIOLaunchFIFO_OFFS 0x64000 + +#define QIB_6120_SendPIOpbcCache_OFFS 0x64800 + +#define QIB_6120_RcvBuf1_OFFS 0x72000 + +#define QIB_6120_RcvBuf2_OFFS 0x75000 + +#define QIB_6120_RcvFlags_OFFS 0x77000 + +#define QIB_6120_RcvLookupBuf1_OFFS 0x79000 + +#define QIB_6120_RcvDMABuf_OFFS 0x7B000 + +#define QIB_6120_MiscRXEIntMem_OFFS 0x7C000 + +#define QIB_6120_PCIERcvBuf_OFFS 0x80000 + +#define QIB_6120_PCIERetryBuf_OFFS 0x82000 + +#define QIB_6120_PCIERcvBufRdToWrAddr_OFFS 0x84000 + +#define QIB_6120_PIOBuf0_MA_OFFS 0x100000 diff --git a/drivers/infiniband/hw/qib/qib_7220.h b/drivers/infiniband/hw/qib/qib_7220.h new file mode 100644 index 0000000..9ecaab6 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_7220.h @@ -0,0 +1,149 @@ +#ifndef _QIB_7220_H +#define _QIB_7220_H +/* + * Copyright (c) 2007, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* grab register-defs auto-generated by HW */ +#include "qib_7220_regs.h" + +/* The number of eager receive TIDs for context zero. 
*/ +#define IBA7220_KRCVEGRCNT 2048U + +#define IB_7220_LT_STATE_CFGRCVFCFG 0x09 +#define IB_7220_LT_STATE_CFGWAITRMT 0x0a +#define IB_7220_LT_STATE_TXREVLANES 0x0d +#define IB_7220_LT_STATE_CFGENH 0x10 + +struct qib_chip_specific { + u64 __iomem *cregbase; + u64 *cntrs; + u64 *portcntrs; + spinlock_t sdepb_lock; /* serdes EPB bus */ + spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */ + spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */ + u64 hwerrmask; + u64 errormask; + u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */ + u64 gpio_mask; /* shadow the gpio mask register */ + u64 extctrl; /* shadow the gpio output enable, etc... */ + u32 ncntrs; + u32 nportcntrs; + u32 cntrnamelen; + u32 portcntrnamelen; + u32 numctxts; + u32 rcvegrcnt; + u32 autoneg_tries; + u32 serdes_first_init_done; + u32 sdmabufcnt; + u32 lastbuf_for_pio; + u32 updthresh; /* current AvailUpdThld */ + u32 updthresh_dflt; /* default AvailUpdThld */ + u8 presets_needed; + u8 relock_timer_active; + char emsgbuf[128]; + char sdmamsgbuf[192]; + char bitsmsgbuf[64]; + struct timer_list relock_timer; + unsigned int relock_interval; /* in jiffies */ + struct qib_devdata *dd; +}; + +struct qib_chippport_specific { + struct qib_pportdata pportdata; + wait_queue_head_t autoneg_wait; + struct delayed_work autoneg_work; + struct timer_list chase_timer; + /* + * These 5 fields are used to establish deltas for IB symbol + * errors and link recovery errors. They can be reported on + * some chips during link negotiation prior to INIT, and with + * DDR when faking DDR negotiations with non-IBTA switches. + * The chip counters are adjusted at driver unload if there is + * a non-zero delta. + */ + u64 ibdeltainprog; + u64 ibsymdelta; + u64 ibsymsnap; + u64 iblnkerrdelta; + u64 iblnkerrsnap; + u64 ibcctrl; /* kr_ibcctrl shadow */ + u64 ibcddrctrl; /* kr_ibcddrctrl shadow */ + unsigned long chase_end; + u32 last_delay_mult; +}; + +/* + * This header file provides the declarations and common definitions + * for (mostly) manipulation of the SerDes blocks within the IBA7220. + * The functions declared should only be called from within other + * 7220-related files such as qib_iba7220.c or qib_sd7220.c.
+ */ +int qib_sd7220_presets(struct qib_devdata *dd); +int qib_sd7220_init(struct qib_devdata *dd); +void qib_sd7220_clr_ibpar(struct qib_devdata *); +/* + * Below used for sdnum parameter, selecting one of the two sections + * used for PCIe, or the single SerDes used for IB, which is the + * only one currently used + */ +#define IB_7220_SERDES 2 + +static inline u32 qib_read_kreg32(const struct qib_devdata *dd, + const u16 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readl((u32 __iomem *)&dd->kregbase[regno]); +} + +static inline u64 qib_read_kreg64(const struct qib_devdata *dd, + const u16 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + + return readq(&dd->kregbase[regno]); +} + +static inline void qib_write_kreg(const struct qib_devdata *dd, + const u16 regno, u64 value) +{ + if (dd->kregbase) + writeq(value, &dd->kregbase[regno]); +} + +void set_7220_relock_poll(struct qib_devdata *, int); +void shutdown_7220_relock_poll(struct qib_devdata *); +void toggle_7220_rclkrls(struct qib_devdata *); + + +#endif /* _QIB_7220_H */ diff --git a/drivers/infiniband/hw/qib/qib_7220_regs.h b/drivers/infiniband/hw/qib/qib_7220_regs.h new file mode 100644 index 0000000..0da5bb7 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_7220_regs.h @@ -0,0 +1,1496 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/* This file is mechanically generated from RTL. Any hand-edits will be lost! 
*/ + +#define QIB_7220_Revision_OFFS 0x0 +#define QIB_7220_Revision_R_Simulator_LSB 0x3F +#define QIB_7220_Revision_R_Simulator_RMASK 0x1 +#define QIB_7220_Revision_R_Emulation_LSB 0x3E +#define QIB_7220_Revision_R_Emulation_RMASK 0x1 +#define QIB_7220_Revision_R_Emulation_Revcode_LSB 0x28 +#define QIB_7220_Revision_R_Emulation_Revcode_RMASK 0x3FFFFF +#define QIB_7220_Revision_BoardID_LSB 0x20 +#define QIB_7220_Revision_BoardID_RMASK 0xFF +#define QIB_7220_Revision_R_SW_LSB 0x18 +#define QIB_7220_Revision_R_SW_RMASK 0xFF +#define QIB_7220_Revision_R_Arch_LSB 0x10 +#define QIB_7220_Revision_R_Arch_RMASK 0xFF +#define QIB_7220_Revision_R_ChipRevMajor_LSB 0x8 +#define QIB_7220_Revision_R_ChipRevMajor_RMASK 0xFF +#define QIB_7220_Revision_R_ChipRevMinor_LSB 0x0 +#define QIB_7220_Revision_R_ChipRevMinor_RMASK 0xFF + +#define QIB_7220_Control_OFFS 0x8 +#define QIB_7220_Control_SyncResetExceptPcieIRAMRST_LSB 0x7 +#define QIB_7220_Control_SyncResetExceptPcieIRAMRST_RMASK 0x1 +#define QIB_7220_Control_PCIECplQDiagEn_LSB 0x6 +#define QIB_7220_Control_PCIECplQDiagEn_RMASK 0x1 +#define QIB_7220_Control_Reserved_LSB 0x5 +#define QIB_7220_Control_Reserved_RMASK 0x1 +#define QIB_7220_Control_TxLatency_LSB 0x4 +#define QIB_7220_Control_TxLatency_RMASK 0x1 +#define QIB_7220_Control_PCIERetryBufDiagEn_LSB 0x3 +#define QIB_7220_Control_PCIERetryBufDiagEn_RMASK 0x1 +#define QIB_7220_Control_LinkEn_LSB 0x2 +#define QIB_7220_Control_LinkEn_RMASK 0x1 +#define QIB_7220_Control_FreezeMode_LSB 0x1 +#define QIB_7220_Control_FreezeMode_RMASK 0x1 +#define QIB_7220_Control_SyncReset_LSB 0x0 +#define QIB_7220_Control_SyncReset_RMASK 0x1 + +#define QIB_7220_PageAlign_OFFS 0x10 + +#define QIB_7220_PortCnt_OFFS 0x18 + +#define QIB_7220_SendRegBase_OFFS 0x30 + +#define QIB_7220_UserRegBase_OFFS 0x38 + +#define QIB_7220_CntrRegBase_OFFS 0x40 + +#define QIB_7220_Scratch_OFFS 0x48 + +#define QIB_7220_IntMask_OFFS 0x68 +#define QIB_7220_IntMask_SDmaIntMask_LSB 0x3F +#define QIB_7220_IntMask_SDmaIntMask_RMASK 0x1 +#define QIB_7220_IntMask_SDmaDisabledMasked_LSB 0x3E +#define QIB_7220_IntMask_SDmaDisabledMasked_RMASK 0x1 +#define QIB_7220_IntMask_Reserved_LSB 0x31 +#define QIB_7220_IntMask_Reserved_RMASK 0x1FFF +#define QIB_7220_IntMask_RcvUrg16IntMask_LSB 0x30 +#define QIB_7220_IntMask_RcvUrg16IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg15IntMask_LSB 0x2F +#define QIB_7220_IntMask_RcvUrg15IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg14IntMask_LSB 0x2E +#define QIB_7220_IntMask_RcvUrg14IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg13IntMask_LSB 0x2D +#define QIB_7220_IntMask_RcvUrg13IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg12IntMask_LSB 0x2C +#define QIB_7220_IntMask_RcvUrg12IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg11IntMask_LSB 0x2B +#define QIB_7220_IntMask_RcvUrg11IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg10IntMask_LSB 0x2A +#define QIB_7220_IntMask_RcvUrg10IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg9IntMask_LSB 0x29 +#define QIB_7220_IntMask_RcvUrg9IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg8IntMask_LSB 0x28 +#define QIB_7220_IntMask_RcvUrg8IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg7IntMask_LSB 0x27 +#define QIB_7220_IntMask_RcvUrg7IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg6IntMask_LSB 0x26 +#define QIB_7220_IntMask_RcvUrg6IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg5IntMask_LSB 0x25 +#define QIB_7220_IntMask_RcvUrg5IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg4IntMask_LSB 0x24 +#define QIB_7220_IntMask_RcvUrg4IntMask_RMASK 0x1 +#define 
QIB_7220_IntMask_RcvUrg3IntMask_LSB 0x23 +#define QIB_7220_IntMask_RcvUrg3IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg2IntMask_LSB 0x22 +#define QIB_7220_IntMask_RcvUrg2IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg1IntMask_LSB 0x21 +#define QIB_7220_IntMask_RcvUrg1IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvUrg0IntMask_LSB 0x20 +#define QIB_7220_IntMask_RcvUrg0IntMask_RMASK 0x1 +#define QIB_7220_IntMask_ErrorIntMask_LSB 0x1F +#define QIB_7220_IntMask_ErrorIntMask_RMASK 0x1 +#define QIB_7220_IntMask_PioSetIntMask_LSB 0x1E +#define QIB_7220_IntMask_PioSetIntMask_RMASK 0x1 +#define QIB_7220_IntMask_PioBufAvailIntMask_LSB 0x1D +#define QIB_7220_IntMask_PioBufAvailIntMask_RMASK 0x1 +#define QIB_7220_IntMask_assertGPIOIntMask_LSB 0x1C +#define QIB_7220_IntMask_assertGPIOIntMask_RMASK 0x1 +#define QIB_7220_IntMask_IBSerdesTrimDoneIntMask_LSB 0x1B +#define QIB_7220_IntMask_IBSerdesTrimDoneIntMask_RMASK 0x1 +#define QIB_7220_IntMask_JIntMask_LSB 0x1A +#define QIB_7220_IntMask_JIntMask_RMASK 0x1 +#define QIB_7220_IntMask_Reserved1_LSB 0x11 +#define QIB_7220_IntMask_Reserved1_RMASK 0x1FF +#define QIB_7220_IntMask_RcvAvail16IntMask_LSB 0x10 +#define QIB_7220_IntMask_RcvAvail16IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail15IntMask_LSB 0xF +#define QIB_7220_IntMask_RcvAvail15IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail14IntMask_LSB 0xE +#define QIB_7220_IntMask_RcvAvail14IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail13IntMask_LSB 0xD +#define QIB_7220_IntMask_RcvAvail13IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail12IntMask_LSB 0xC +#define QIB_7220_IntMask_RcvAvail12IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail11IntMask_LSB 0xB +#define QIB_7220_IntMask_RcvAvail11IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail10IntMask_LSB 0xA +#define QIB_7220_IntMask_RcvAvail10IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail9IntMask_LSB 0x9 +#define QIB_7220_IntMask_RcvAvail9IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail8IntMask_LSB 0x8 +#define QIB_7220_IntMask_RcvAvail8IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail7IntMask_LSB 0x7 +#define QIB_7220_IntMask_RcvAvail7IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail6IntMask_LSB 0x6 +#define QIB_7220_IntMask_RcvAvail6IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail5IntMask_LSB 0x5 +#define QIB_7220_IntMask_RcvAvail5IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail4IntMask_LSB 0x4 +#define QIB_7220_IntMask_RcvAvail4IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail3IntMask_LSB 0x3 +#define QIB_7220_IntMask_RcvAvail3IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail2IntMask_LSB 0x2 +#define QIB_7220_IntMask_RcvAvail2IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail1IntMask_LSB 0x1 +#define QIB_7220_IntMask_RcvAvail1IntMask_RMASK 0x1 +#define QIB_7220_IntMask_RcvAvail0IntMask_LSB 0x0 +#define QIB_7220_IntMask_RcvAvail0IntMask_RMASK 0x1 + +#define QIB_7220_IntStatus_OFFS 0x70 +#define QIB_7220_IntStatus_SDmaInt_LSB 0x3F +#define QIB_7220_IntStatus_SDmaInt_RMASK 0x1 +#define QIB_7220_IntStatus_SDmaDisabled_LSB 0x3E +#define QIB_7220_IntStatus_SDmaDisabled_RMASK 0x1 +#define QIB_7220_IntStatus_Reserved_LSB 0x31 +#define QIB_7220_IntStatus_Reserved_RMASK 0x1FFF +#define QIB_7220_IntStatus_RcvUrg16_LSB 0x30 +#define QIB_7220_IntStatus_RcvUrg16_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg15_LSB 0x2F +#define QIB_7220_IntStatus_RcvUrg15_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg14_LSB 0x2E +#define QIB_7220_IntStatus_RcvUrg14_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg13_LSB 0x2D +#define 
QIB_7220_IntStatus_RcvUrg13_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg12_LSB 0x2C +#define QIB_7220_IntStatus_RcvUrg12_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg11_LSB 0x2B +#define QIB_7220_IntStatus_RcvUrg11_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg10_LSB 0x2A +#define QIB_7220_IntStatus_RcvUrg10_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg9_LSB 0x29 +#define QIB_7220_IntStatus_RcvUrg9_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg8_LSB 0x28 +#define QIB_7220_IntStatus_RcvUrg8_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg7_LSB 0x27 +#define QIB_7220_IntStatus_RcvUrg7_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg6_LSB 0x26 +#define QIB_7220_IntStatus_RcvUrg6_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg5_LSB 0x25 +#define QIB_7220_IntStatus_RcvUrg5_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg4_LSB 0x24 +#define QIB_7220_IntStatus_RcvUrg4_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg3_LSB 0x23 +#define QIB_7220_IntStatus_RcvUrg3_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg2_LSB 0x22 +#define QIB_7220_IntStatus_RcvUrg2_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg1_LSB 0x21 +#define QIB_7220_IntStatus_RcvUrg1_RMASK 0x1 +#define QIB_7220_IntStatus_RcvUrg0_LSB 0x20 +#define QIB_7220_IntStatus_RcvUrg0_RMASK 0x1 +#define QIB_7220_IntStatus_Error_LSB 0x1F +#define QIB_7220_IntStatus_Error_RMASK 0x1 +#define QIB_7220_IntStatus_PioSent_LSB 0x1E +#define QIB_7220_IntStatus_PioSent_RMASK 0x1 +#define QIB_7220_IntStatus_PioBufAvail_LSB 0x1D +#define QIB_7220_IntStatus_PioBufAvail_RMASK 0x1 +#define QIB_7220_IntStatus_assertGPIO_LSB 0x1C +#define QIB_7220_IntStatus_assertGPIO_RMASK 0x1 +#define QIB_7220_IntStatus_IBSerdesTrimDone_LSB 0x1B +#define QIB_7220_IntStatus_IBSerdesTrimDone_RMASK 0x1 +#define QIB_7220_IntStatus_JInt_LSB 0x1A +#define QIB_7220_IntStatus_JInt_RMASK 0x1 +#define QIB_7220_IntStatus_Reserved1_LSB 0x11 +#define QIB_7220_IntStatus_Reserved1_RMASK 0x1FF +#define QIB_7220_IntStatus_RcvAvail16_LSB 0x10 +#define QIB_7220_IntStatus_RcvAvail16_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail15_LSB 0xF +#define QIB_7220_IntStatus_RcvAvail15_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail14_LSB 0xE +#define QIB_7220_IntStatus_RcvAvail14_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail13_LSB 0xD +#define QIB_7220_IntStatus_RcvAvail13_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail12_LSB 0xC +#define QIB_7220_IntStatus_RcvAvail12_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail11_LSB 0xB +#define QIB_7220_IntStatus_RcvAvail11_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail10_LSB 0xA +#define QIB_7220_IntStatus_RcvAvail10_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail9_LSB 0x9 +#define QIB_7220_IntStatus_RcvAvail9_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail8_LSB 0x8 +#define QIB_7220_IntStatus_RcvAvail8_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail7_LSB 0x7 +#define QIB_7220_IntStatus_RcvAvail7_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail6_LSB 0x6 +#define QIB_7220_IntStatus_RcvAvail6_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail5_LSB 0x5 +#define QIB_7220_IntStatus_RcvAvail5_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail4_LSB 0x4 +#define QIB_7220_IntStatus_RcvAvail4_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail3_LSB 0x3 +#define QIB_7220_IntStatus_RcvAvail3_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail2_LSB 0x2 +#define QIB_7220_IntStatus_RcvAvail2_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail1_LSB 0x1 +#define QIB_7220_IntStatus_RcvAvail1_RMASK 0x1 +#define QIB_7220_IntStatus_RcvAvail0_LSB 0x0 +#define QIB_7220_IntStatus_RcvAvail0_RMASK 0x1 + +#define QIB_7220_IntClear_OFFS 0x78 +#define 
QIB_7220_IntClear_SDmaIntClear_LSB 0x3F +#define QIB_7220_IntClear_SDmaIntClear_RMASK 0x1 +#define QIB_7220_IntClear_SDmaDisabledClear_LSB 0x3E +#define QIB_7220_IntClear_SDmaDisabledClear_RMASK 0x1 +#define QIB_7220_IntClear_Reserved_LSB 0x31 +#define QIB_7220_IntClear_Reserved_RMASK 0x1FFF +#define QIB_7220_IntClear_RcvUrg16IntClear_LSB 0x30 +#define QIB_7220_IntClear_RcvUrg16IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg15IntClear_LSB 0x2F +#define QIB_7220_IntClear_RcvUrg15IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg14IntClear_LSB 0x2E +#define QIB_7220_IntClear_RcvUrg14IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg13IntClear_LSB 0x2D +#define QIB_7220_IntClear_RcvUrg13IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg12IntClear_LSB 0x2C +#define QIB_7220_IntClear_RcvUrg12IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg11IntClear_LSB 0x2B +#define QIB_7220_IntClear_RcvUrg11IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg10IntClear_LSB 0x2A +#define QIB_7220_IntClear_RcvUrg10IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg9IntClear_LSB 0x29 +#define QIB_7220_IntClear_RcvUrg9IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg8IntClear_LSB 0x28 +#define QIB_7220_IntClear_RcvUrg8IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg7IntClear_LSB 0x27 +#define QIB_7220_IntClear_RcvUrg7IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg6IntClear_LSB 0x26 +#define QIB_7220_IntClear_RcvUrg6IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg5IntClear_LSB 0x25 +#define QIB_7220_IntClear_RcvUrg5IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg4IntClear_LSB 0x24 +#define QIB_7220_IntClear_RcvUrg4IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg3IntClear_LSB 0x23 +#define QIB_7220_IntClear_RcvUrg3IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg2IntClear_LSB 0x22 +#define QIB_7220_IntClear_RcvUrg2IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg1IntClear_LSB 0x21 +#define QIB_7220_IntClear_RcvUrg1IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvUrg0IntClear_LSB 0x20 +#define QIB_7220_IntClear_RcvUrg0IntClear_RMASK 0x1 +#define QIB_7220_IntClear_ErrorIntClear_LSB 0x1F +#define QIB_7220_IntClear_ErrorIntClear_RMASK 0x1 +#define QIB_7220_IntClear_PioSetIntClear_LSB 0x1E +#define QIB_7220_IntClear_PioSetIntClear_RMASK 0x1 +#define QIB_7220_IntClear_PioBufAvailIntClear_LSB 0x1D +#define QIB_7220_IntClear_PioBufAvailIntClear_RMASK 0x1 +#define QIB_7220_IntClear_assertGPIOIntClear_LSB 0x1C +#define QIB_7220_IntClear_assertGPIOIntClear_RMASK 0x1 +#define QIB_7220_IntClear_IBSerdesTrimDoneClear_LSB 0x1B +#define QIB_7220_IntClear_IBSerdesTrimDoneClear_RMASK 0x1 +#define QIB_7220_IntClear_JIntClear_LSB 0x1A +#define QIB_7220_IntClear_JIntClear_RMASK 0x1 +#define QIB_7220_IntClear_Reserved1_LSB 0x11 +#define QIB_7220_IntClear_Reserved1_RMASK 0x1FF +#define QIB_7220_IntClear_RcvAvail16IntClear_LSB 0x10 +#define QIB_7220_IntClear_RcvAvail16IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail15IntClear_LSB 0xF +#define QIB_7220_IntClear_RcvAvail15IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail14IntClear_LSB 0xE +#define QIB_7220_IntClear_RcvAvail14IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail13IntClear_LSB 0xD +#define QIB_7220_IntClear_RcvAvail13IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail12IntClear_LSB 0xC +#define QIB_7220_IntClear_RcvAvail12IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail11IntClear_LSB 0xB +#define QIB_7220_IntClear_RcvAvail11IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail10IntClear_LSB 0xA +#define 
QIB_7220_IntClear_RcvAvail10IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail9IntClear_LSB 0x9 +#define QIB_7220_IntClear_RcvAvail9IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail8IntClear_LSB 0x8 +#define QIB_7220_IntClear_RcvAvail8IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail7IntClear_LSB 0x7 +#define QIB_7220_IntClear_RcvAvail7IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail6IntClear_LSB 0x6 +#define QIB_7220_IntClear_RcvAvail6IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail5IntClear_LSB 0x5 +#define QIB_7220_IntClear_RcvAvail5IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail4IntClear_LSB 0x4 +#define QIB_7220_IntClear_RcvAvail4IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail3IntClear_LSB 0x3 +#define QIB_7220_IntClear_RcvAvail3IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail2IntClear_LSB 0x2 +#define QIB_7220_IntClear_RcvAvail2IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail1IntClear_LSB 0x1 +#define QIB_7220_IntClear_RcvAvail1IntClear_RMASK 0x1 +#define QIB_7220_IntClear_RcvAvail0IntClear_LSB 0x0 +#define QIB_7220_IntClear_RcvAvail0IntClear_RMASK 0x1 + +#define QIB_7220_ErrMask_OFFS 0x80 +#define QIB_7220_ErrMask_Reserved_LSB 0x36 +#define QIB_7220_ErrMask_Reserved_RMASK 0x3FF +#define QIB_7220_ErrMask_InvalidEEPCmdMask_LSB 0x35 +#define QIB_7220_ErrMask_InvalidEEPCmdMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaDescAddrMisalignErrMask_LSB 0x34 +#define QIB_7220_ErrMask_SDmaDescAddrMisalignErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_HardwareErrMask_LSB 0x33 +#define QIB_7220_ErrMask_HardwareErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_ResetNegatedMask_LSB 0x32 +#define QIB_7220_ErrMask_ResetNegatedMask_RMASK 0x1 +#define QIB_7220_ErrMask_InvalidAddrErrMask_LSB 0x31 +#define QIB_7220_ErrMask_InvalidAddrErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_IBStatusChangedMask_LSB 0x30 +#define QIB_7220_ErrMask_IBStatusChangedMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaUnexpDataErrMask_LSB 0x2F +#define QIB_7220_ErrMask_SDmaUnexpDataErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaMissingDwErrMask_LSB 0x2E +#define QIB_7220_ErrMask_SDmaMissingDwErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaDwEnErrMask_LSB 0x2D +#define QIB_7220_ErrMask_SDmaDwEnErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaRpyTagErrMask_LSB 0x2C +#define QIB_7220_ErrMask_SDmaRpyTagErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDma1stDescErrMask_LSB 0x2B +#define QIB_7220_ErrMask_SDma1stDescErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaBaseErrMask_LSB 0x2A +#define QIB_7220_ErrMask_SDmaBaseErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaTailOutOfBoundErrMask_LSB 0x29 +#define QIB_7220_ErrMask_SDmaTailOutOfBoundErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaOutOfBoundErrMask_LSB 0x28 +#define QIB_7220_ErrMask_SDmaOutOfBoundErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaGenMismatchErrMask_LSB 0x27 +#define QIB_7220_ErrMask_SDmaGenMismatchErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendBufMisuseErrMask_LSB 0x26 +#define QIB_7220_ErrMask_SendBufMisuseErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendUnsupportedVLErrMask_LSB 0x25 +#define QIB_7220_ErrMask_SendUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendUnexpectedPktNumErrMask_LSB 0x24 +#define QIB_7220_ErrMask_SendUnexpectedPktNumErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendPioArmLaunchErrMask_LSB 0x23 +#define QIB_7220_ErrMask_SendPioArmLaunchErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendDroppedDataPktErrMask_LSB 0x22 +#define QIB_7220_ErrMask_SendDroppedDataPktErrMask_RMASK 0x1 +#define 
QIB_7220_ErrMask_SendDroppedSmpPktErrMask_LSB 0x21 +#define QIB_7220_ErrMask_SendDroppedSmpPktErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendPktLenErrMask_LSB 0x20 +#define QIB_7220_ErrMask_SendPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendUnderRunErrMask_LSB 0x1F +#define QIB_7220_ErrMask_SendUnderRunErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendMaxPktLenErrMask_LSB 0x1E +#define QIB_7220_ErrMask_SendMaxPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendMinPktLenErrMask_LSB 0x1D +#define QIB_7220_ErrMask_SendMinPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SDmaDisabledErrMask_LSB 0x1C +#define QIB_7220_ErrMask_SDmaDisabledErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_SendSpecialTriggerErrMask_LSB 0x1B +#define QIB_7220_ErrMask_SendSpecialTriggerErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_Reserved1_LSB 0x12 +#define QIB_7220_ErrMask_Reserved1_RMASK 0x1FF +#define QIB_7220_ErrMask_RcvIBLostLinkErrMask_LSB 0x11 +#define QIB_7220_ErrMask_RcvIBLostLinkErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvHdrErrMask_LSB 0x10 +#define QIB_7220_ErrMask_RcvHdrErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvHdrLenErrMask_LSB 0xF +#define QIB_7220_ErrMask_RcvHdrLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvBadTidErrMask_LSB 0xE +#define QIB_7220_ErrMask_RcvBadTidErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvHdrFullErrMask_LSB 0xD +#define QIB_7220_ErrMask_RcvHdrFullErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvEgrFullErrMask_LSB 0xC +#define QIB_7220_ErrMask_RcvEgrFullErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvBadVersionErrMask_LSB 0xB +#define QIB_7220_ErrMask_RcvBadVersionErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvIBFlowErrMask_LSB 0xA +#define QIB_7220_ErrMask_RcvIBFlowErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvEBPErrMask_LSB 0x9 +#define QIB_7220_ErrMask_RcvEBPErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvUnsupportedVLErrMask_LSB 0x8 +#define QIB_7220_ErrMask_RcvUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvUnexpectedCharErrMask_LSB 0x7 +#define QIB_7220_ErrMask_RcvUnexpectedCharErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvShortPktLenErrMask_LSB 0x6 +#define QIB_7220_ErrMask_RcvShortPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvLongPktLenErrMask_LSB 0x5 +#define QIB_7220_ErrMask_RcvLongPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvMaxPktLenErrMask_LSB 0x4 +#define QIB_7220_ErrMask_RcvMaxPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvMinPktLenErrMask_LSB 0x3 +#define QIB_7220_ErrMask_RcvMinPktLenErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvICRCErrMask_LSB 0x2 +#define QIB_7220_ErrMask_RcvICRCErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvVCRCErrMask_LSB 0x1 +#define QIB_7220_ErrMask_RcvVCRCErrMask_RMASK 0x1 +#define QIB_7220_ErrMask_RcvFormatErrMask_LSB 0x0 +#define QIB_7220_ErrMask_RcvFormatErrMask_RMASK 0x1 + +#define QIB_7220_ErrStatus_OFFS 0x88 +#define QIB_7220_ErrStatus_Reserved_LSB 0x36 +#define QIB_7220_ErrStatus_Reserved_RMASK 0x3FF +#define QIB_7220_ErrStatus_InvalidEEPCmdErr_LSB 0x35 +#define QIB_7220_ErrStatus_InvalidEEPCmdErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaDescAddrMisalignErr_LSB 0x34 +#define QIB_7220_ErrStatus_SDmaDescAddrMisalignErr_RMASK 0x1 +#define QIB_7220_ErrStatus_HardwareErr_LSB 0x33 +#define QIB_7220_ErrStatus_HardwareErr_RMASK 0x1 +#define QIB_7220_ErrStatus_ResetNegated_LSB 0x32 +#define QIB_7220_ErrStatus_ResetNegated_RMASK 0x1 +#define QIB_7220_ErrStatus_InvalidAddrErr_LSB 0x31 +#define QIB_7220_ErrStatus_InvalidAddrErr_RMASK 0x1 +#define QIB_7220_ErrStatus_IBStatusChanged_LSB 0x30 +#define 
QIB_7220_ErrStatus_IBStatusChanged_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaUnexpDataErr_LSB 0x2F +#define QIB_7220_ErrStatus_SDmaUnexpDataErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaMissingDwErr_LSB 0x2E +#define QIB_7220_ErrStatus_SDmaMissingDwErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaDwEnErr_LSB 0x2D +#define QIB_7220_ErrStatus_SDmaDwEnErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaRpyTagErr_LSB 0x2C +#define QIB_7220_ErrStatus_SDmaRpyTagErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDma1stDescErr_LSB 0x2B +#define QIB_7220_ErrStatus_SDma1stDescErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaBaseErr_LSB 0x2A +#define QIB_7220_ErrStatus_SDmaBaseErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaTailOutOfBoundErr_LSB 0x29 +#define QIB_7220_ErrStatus_SDmaTailOutOfBoundErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaOutOfBoundErr_LSB 0x28 +#define QIB_7220_ErrStatus_SDmaOutOfBoundErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaGenMismatchErr_LSB 0x27 +#define QIB_7220_ErrStatus_SDmaGenMismatchErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendBufMisuseErr_LSB 0x26 +#define QIB_7220_ErrStatus_SendBufMisuseErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendUnsupportedVLErr_LSB 0x25 +#define QIB_7220_ErrStatus_SendUnsupportedVLErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendUnexpectedPktNumErr_LSB 0x24 +#define QIB_7220_ErrStatus_SendUnexpectedPktNumErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendPioArmLaunchErr_LSB 0x23 +#define QIB_7220_ErrStatus_SendPioArmLaunchErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendDroppedDataPktErr_LSB 0x22 +#define QIB_7220_ErrStatus_SendDroppedDataPktErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendDroppedSmpPktErr_LSB 0x21 +#define QIB_7220_ErrStatus_SendDroppedSmpPktErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendPktLenErr_LSB 0x20 +#define QIB_7220_ErrStatus_SendPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendUnderRunErr_LSB 0x1F +#define QIB_7220_ErrStatus_SendUnderRunErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendMaxPktLenErr_LSB 0x1E +#define QIB_7220_ErrStatus_SendMaxPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendMinPktLenErr_LSB 0x1D +#define QIB_7220_ErrStatus_SendMinPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SDmaDisabledErr_LSB 0x1C +#define QIB_7220_ErrStatus_SDmaDisabledErr_RMASK 0x1 +#define QIB_7220_ErrStatus_SendSpecialTriggerErr_LSB 0x1B +#define QIB_7220_ErrStatus_SendSpecialTriggerErr_RMASK 0x1 +#define QIB_7220_ErrStatus_Reserved1_LSB 0x12 +#define QIB_7220_ErrStatus_Reserved1_RMASK 0x1FF +#define QIB_7220_ErrStatus_RcvIBLostLinkErr_LSB 0x11 +#define QIB_7220_ErrStatus_RcvIBLostLinkErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvHdrErr_LSB 0x10 +#define QIB_7220_ErrStatus_RcvHdrErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvHdrLenErr_LSB 0xF +#define QIB_7220_ErrStatus_RcvHdrLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvBadTidErr_LSB 0xE +#define QIB_7220_ErrStatus_RcvBadTidErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvHdrFullErr_LSB 0xD +#define QIB_7220_ErrStatus_RcvHdrFullErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvEgrFullErr_LSB 0xC +#define QIB_7220_ErrStatus_RcvEgrFullErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvBadVersionErr_LSB 0xB +#define QIB_7220_ErrStatus_RcvBadVersionErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvIBFlowErr_LSB 0xA +#define QIB_7220_ErrStatus_RcvIBFlowErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvEBPErr_LSB 0x9 +#define QIB_7220_ErrStatus_RcvEBPErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvUnsupportedVLErr_LSB 0x8 +#define QIB_7220_ErrStatus_RcvUnsupportedVLErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvUnexpectedCharErr_LSB 0x7 +#define 
QIB_7220_ErrStatus_RcvUnexpectedCharErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvShortPktLenErr_LSB 0x6 +#define QIB_7220_ErrStatus_RcvShortPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvLongPktLenErr_LSB 0x5 +#define QIB_7220_ErrStatus_RcvLongPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvMaxPktLenErr_LSB 0x4 +#define QIB_7220_ErrStatus_RcvMaxPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvMinPktLenErr_LSB 0x3 +#define QIB_7220_ErrStatus_RcvMinPktLenErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvICRCErr_LSB 0x2 +#define QIB_7220_ErrStatus_RcvICRCErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvVCRCErr_LSB 0x1 +#define QIB_7220_ErrStatus_RcvVCRCErr_RMASK 0x1 +#define QIB_7220_ErrStatus_RcvFormatErr_LSB 0x0 +#define QIB_7220_ErrStatus_RcvFormatErr_RMASK 0x1 + +#define QIB_7220_ErrClear_OFFS 0x90 +#define QIB_7220_ErrClear_Reserved_LSB 0x36 +#define QIB_7220_ErrClear_Reserved_RMASK 0x3FF +#define QIB_7220_ErrClear_InvalidEEPCmdErrClear_LSB 0x35 +#define QIB_7220_ErrClear_InvalidEEPCmdErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaDescAddrMisalignErrClear_LSB 0x34 +#define QIB_7220_ErrClear_SDmaDescAddrMisalignErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_HardwareErrClear_LSB 0x33 +#define QIB_7220_ErrClear_HardwareErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_ResetNegatedClear_LSB 0x32 +#define QIB_7220_ErrClear_ResetNegatedClear_RMASK 0x1 +#define QIB_7220_ErrClear_InvalidAddrErrClear_LSB 0x31 +#define QIB_7220_ErrClear_InvalidAddrErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_IBStatusChangedClear_LSB 0x30 +#define QIB_7220_ErrClear_IBStatusChangedClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaUnexpDataErrClear_LSB 0x2F +#define QIB_7220_ErrClear_SDmaUnexpDataErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaMissingDwErrClear_LSB 0x2E +#define QIB_7220_ErrClear_SDmaMissingDwErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaDwEnErrClear_LSB 0x2D +#define QIB_7220_ErrClear_SDmaDwEnErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaRpyTagErrClear_LSB 0x2C +#define QIB_7220_ErrClear_SDmaRpyTagErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDma1stDescErrClear_LSB 0x2B +#define QIB_7220_ErrClear_SDma1stDescErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaBaseErrClear_LSB 0x2A +#define QIB_7220_ErrClear_SDmaBaseErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaTailOutOfBoundErrClear_LSB 0x29 +#define QIB_7220_ErrClear_SDmaTailOutOfBoundErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaOutOfBoundErrClear_LSB 0x28 +#define QIB_7220_ErrClear_SDmaOutOfBoundErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaGenMismatchErrClear_LSB 0x27 +#define QIB_7220_ErrClear_SDmaGenMismatchErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendBufMisuseErrClear_LSB 0x26 +#define QIB_7220_ErrClear_SendBufMisuseErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendUnsupportedVLErrClear_LSB 0x25 +#define QIB_7220_ErrClear_SendUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendUnexpectedPktNumErrClear_LSB 0x24 +#define QIB_7220_ErrClear_SendUnexpectedPktNumErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendPioArmLaunchErrClear_LSB 0x23 +#define QIB_7220_ErrClear_SendPioArmLaunchErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendDroppedDataPktErrClear_LSB 0x22 +#define QIB_7220_ErrClear_SendDroppedDataPktErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendDroppedSmpPktErrClear_LSB 0x21 +#define QIB_7220_ErrClear_SendDroppedSmpPktErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendPktLenErrClear_LSB 0x20 +#define QIB_7220_ErrClear_SendPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendUnderRunErrClear_LSB 0x1F 
+#define QIB_7220_ErrClear_SendUnderRunErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendMaxPktLenErrClear_LSB 0x1E +#define QIB_7220_ErrClear_SendMaxPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendMinPktLenErrClear_LSB 0x1D +#define QIB_7220_ErrClear_SendMinPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SDmaDisabledErrClear_LSB 0x1C +#define QIB_7220_ErrClear_SDmaDisabledErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_SendSpecialTriggerErrClear_LSB 0x1B +#define QIB_7220_ErrClear_SendSpecialTriggerErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_Reserved1_LSB 0x12 +#define QIB_7220_ErrClear_Reserved1_RMASK 0x1FF +#define QIB_7220_ErrClear_RcvIBLostLinkErrClear_LSB 0x11 +#define QIB_7220_ErrClear_RcvIBLostLinkErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvHdrErrClear_LSB 0x10 +#define QIB_7220_ErrClear_RcvHdrErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvHdrLenErrClear_LSB 0xF +#define QIB_7220_ErrClear_RcvHdrLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvBadTidErrClear_LSB 0xE +#define QIB_7220_ErrClear_RcvBadTidErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvHdrFullErrClear_LSB 0xD +#define QIB_7220_ErrClear_RcvHdrFullErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvEgrFullErrClear_LSB 0xC +#define QIB_7220_ErrClear_RcvEgrFullErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvBadVersionErrClear_LSB 0xB +#define QIB_7220_ErrClear_RcvBadVersionErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvIBFlowErrClear_LSB 0xA +#define QIB_7220_ErrClear_RcvIBFlowErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvEBPErrClear_LSB 0x9 +#define QIB_7220_ErrClear_RcvEBPErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvUnsupportedVLErrClear_LSB 0x8 +#define QIB_7220_ErrClear_RcvUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvUnexpectedCharErrClear_LSB 0x7 +#define QIB_7220_ErrClear_RcvUnexpectedCharErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvShortPktLenErrClear_LSB 0x6 +#define QIB_7220_ErrClear_RcvShortPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvLongPktLenErrClear_LSB 0x5 +#define QIB_7220_ErrClear_RcvLongPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvMaxPktLenErrClear_LSB 0x4 +#define QIB_7220_ErrClear_RcvMaxPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvMinPktLenErrClear_LSB 0x3 +#define QIB_7220_ErrClear_RcvMinPktLenErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvICRCErrClear_LSB 0x2 +#define QIB_7220_ErrClear_RcvICRCErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvVCRCErrClear_LSB 0x1 +#define QIB_7220_ErrClear_RcvVCRCErrClear_RMASK 0x1 +#define QIB_7220_ErrClear_RcvFormatErrClear_LSB 0x0 +#define QIB_7220_ErrClear_RcvFormatErrClear_RMASK 0x1 + +#define QIB_7220_HwErrMask_OFFS 0x98 +#define QIB_7220_HwErrMask_IBCBusFromSPCParityErrMask_LSB 0x3F +#define QIB_7220_HwErrMask_IBCBusFromSPCParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_IBCBusToSPCParityErrMask_LSB 0x3E +#define QIB_7220_HwErrMask_IBCBusToSPCParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Clk_uC_PLLNotLockedMask_LSB 0x3D +#define QIB_7220_HwErrMask_Clk_uC_PLLNotLockedMask_RMASK 0x1 +#define QIB_7220_HwErrMask_IBSerdesPClkNotDetectMask_LSB 0x3C +#define QIB_7220_HwErrMask_IBSerdesPClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ3PClkNotDetectMask_LSB 0x3B +#define QIB_7220_HwErrMask_PCIESerdesQ3PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ2PClkNotDetectMask_LSB 0x3A +#define QIB_7220_HwErrMask_PCIESerdesQ2PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ1PClkNotDetectMask_LSB 0x39 +#define 
QIB_7220_HwErrMask_PCIESerdesQ1PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIESerdesQ0PClkNotDetectMask_LSB 0x38 +#define QIB_7220_HwErrMask_PCIESerdesQ0PClkNotDetectMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved_LSB 0x37 +#define QIB_7220_HwErrMask_Reserved_RMASK 0x1 +#define QIB_7220_HwErrMask_PowerOnBISTFailedMask_LSB 0x36 +#define QIB_7220_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved1_LSB 0x33 +#define QIB_7220_HwErrMask_Reserved1_RMASK 0x7 +#define QIB_7220_HwErrMask_RXEMemParityErrMask_LSB 0x2C +#define QIB_7220_HwErrMask_RXEMemParityErrMask_RMASK 0x7F +#define QIB_7220_HwErrMask_TXEMemParityErrMask_LSB 0x28 +#define QIB_7220_HwErrMask_TXEMemParityErrMask_RMASK 0xF +#define QIB_7220_HwErrMask_DDSRXEQMemoryParityErrMask_LSB 0x27 +#define QIB_7220_HwErrMask_DDSRXEQMemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_IB_uC_MemoryParityErrMask_LSB 0x26 +#define QIB_7220_HwErrMask_IB_uC_MemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIEOct1_uC_MemoryParityErrMask_LSB 0x25 +#define QIB_7220_HwErrMask_PCIEOct1_uC_MemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PCIEOct0_uC_MemoryParityErrMask_LSB 0x24 +#define QIB_7220_HwErrMask_PCIEOct0_uC_MemoryParityErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved2_LSB 0x22 +#define QIB_7220_HwErrMask_Reserved2_RMASK 0x3 +#define QIB_7220_HwErrMask_PCIeBusParityErrMask_LSB 0x1F +#define QIB_7220_HwErrMask_PCIeBusParityErrMask_RMASK 0x7 +#define QIB_7220_HwErrMask_PcieCplTimeoutMask_LSB 0x1E +#define QIB_7220_HwErrMask_PcieCplTimeoutMask_RMASK 0x1 +#define QIB_7220_HwErrMask_PoisonedTLPMask_LSB 0x1D +#define QIB_7220_HwErrMask_PoisonedTLPMask_RMASK 0x1 +#define QIB_7220_HwErrMask_SDmaMemReadErrMask_LSB 0x1C +#define QIB_7220_HwErrMask_SDmaMemReadErrMask_RMASK 0x1 +#define QIB_7220_HwErrMask_Reserved3_LSB 0x8 +#define QIB_7220_HwErrMask_Reserved3_RMASK 0xFFFFF +#define QIB_7220_HwErrMask_PCIeMemParityErrMask_LSB 0x0 +#define QIB_7220_HwErrMask_PCIeMemParityErrMask_RMASK 0xFF + +#define QIB_7220_HwErrStatus_OFFS 0xA0 +#define QIB_7220_HwErrStatus_IBCBusFromSPCParityErr_LSB 0x3F +#define QIB_7220_HwErrStatus_IBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_IBCBusToSPCParityErr_LSB 0x3E +#define QIB_7220_HwErrStatus_IBCBusToSPCParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_Clk_uC_PLLNotLocked_LSB 0x3D +#define QIB_7220_HwErrStatus_Clk_uC_PLLNotLocked_RMASK 0x1 +#define QIB_7220_HwErrStatus_IBSerdesPClkNotDetect_LSB 0x3C +#define QIB_7220_HwErrStatus_IBSerdesPClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ3PClkNotDetect_LSB 0x3B +#define QIB_7220_HwErrStatus_PCIESerdesQ3PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ2PClkNotDetect_LSB 0x3A +#define QIB_7220_HwErrStatus_PCIESerdesQ2PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ1PClkNotDetect_LSB 0x39 +#define QIB_7220_HwErrStatus_PCIESerdesQ1PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIESerdesQ0PClkNotDetect_LSB 0x38 +#define QIB_7220_HwErrStatus_PCIESerdesQ0PClkNotDetect_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved_LSB 0x37 +#define QIB_7220_HwErrStatus_Reserved_RMASK 0x1 +#define QIB_7220_HwErrStatus_PowerOnBISTFailed_LSB 0x36 +#define QIB_7220_HwErrStatus_PowerOnBISTFailed_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved1_LSB 0x33 +#define QIB_7220_HwErrStatus_Reserved1_RMASK 0x7 +#define QIB_7220_HwErrStatus_RXEMemParity_LSB 0x2C +#define QIB_7220_HwErrStatus_RXEMemParity_RMASK 0x7F +#define QIB_7220_HwErrStatus_TXEMemParity_LSB 0x28 
+#define QIB_7220_HwErrStatus_TXEMemParity_RMASK 0xF +#define QIB_7220_HwErrStatus_DDSRXEQMemoryParityErr_LSB 0x27 +#define QIB_7220_HwErrStatus_DDSRXEQMemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_IB_uC_MemoryParityErr_LSB 0x26 +#define QIB_7220_HwErrStatus_IB_uC_MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct1MemoryParityErr_LSB 0x25 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct1MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct0MemoryParityErr_LSB 0x24 +#define QIB_7220_HwErrStatus_PCIE_uC_Oct0MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved2_LSB 0x22 +#define QIB_7220_HwErrStatus_Reserved2_RMASK 0x3 +#define QIB_7220_HwErrStatus_PCIeBusParity_LSB 0x1F +#define QIB_7220_HwErrStatus_PCIeBusParity_RMASK 0x7 +#define QIB_7220_HwErrStatus_PcieCplTimeout_LSB 0x1E +#define QIB_7220_HwErrStatus_PcieCplTimeout_RMASK 0x1 +#define QIB_7220_HwErrStatus_PoisenedTLP_LSB 0x1D +#define QIB_7220_HwErrStatus_PoisenedTLP_RMASK 0x1 +#define QIB_7220_HwErrStatus_SDmaMemReadErr_LSB 0x1C +#define QIB_7220_HwErrStatus_SDmaMemReadErr_RMASK 0x1 +#define QIB_7220_HwErrStatus_Reserved3_LSB 0x8 +#define QIB_7220_HwErrStatus_Reserved3_RMASK 0xFFFFF +#define QIB_7220_HwErrStatus_PCIeMemParity_LSB 0x0 +#define QIB_7220_HwErrStatus_PCIeMemParity_RMASK 0xFF + +#define QIB_7220_HwErrClear_OFFS 0xA8 +#define QIB_7220_HwErrClear_IBCBusFromSPCParityErrClear_LSB 0x3F +#define QIB_7220_HwErrClear_IBCBusFromSPCParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_IBCBusToSPCparityErrClear_LSB 0x3E +#define QIB_7220_HwErrClear_IBCBusToSPCparityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Clk_uC_PLLNotLockedClear_LSB 0x3D +#define QIB_7220_HwErrClear_Clk_uC_PLLNotLockedClear_RMASK 0x1 +#define QIB_7220_HwErrClear_IBSerdesPClkNotDetectClear_LSB 0x3C +#define QIB_7220_HwErrClear_IBSerdesPClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ3PClkNotDetectClear_LSB 0x3B +#define QIB_7220_HwErrClear_PCIESerdesQ3PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ2PClkNotDetectClear_LSB 0x3A +#define QIB_7220_HwErrClear_PCIESerdesQ2PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ1PClkNotDetectClear_LSB 0x39 +#define QIB_7220_HwErrClear_PCIESerdesQ1PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIESerdesQ0PClkNotDetectClear_LSB 0x38 +#define QIB_7220_HwErrClear_PCIESerdesQ0PClkNotDetectClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Reserved_LSB 0x37 +#define QIB_7220_HwErrClear_Reserved_RMASK 0x1 +#define QIB_7220_HwErrClear_PowerOnBISTFailedClear_LSB 0x36 +#define QIB_7220_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Reserved1_LSB 0x33 +#define QIB_7220_HwErrClear_Reserved1_RMASK 0x7 +#define QIB_7220_HwErrClear_RXEMemParityClear_LSB 0x2C +#define QIB_7220_HwErrClear_RXEMemParityClear_RMASK 0x7F +#define QIB_7220_HwErrClear_TXEMemParityClear_LSB 0x28 +#define QIB_7220_HwErrClear_TXEMemParityClear_RMASK 0xF +#define QIB_7220_HwErrClear_DDSRXEQMemoryParityErrClear_LSB 0x27 +#define QIB_7220_HwErrClear_DDSRXEQMemoryParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_IB_uC_MemoryParityErrClear_LSB 0x26 +#define QIB_7220_HwErrClear_IB_uC_MemoryParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIE_uC_Oct1MemoryParityErrClear_LSB 0x25 +#define QIB_7220_HwErrClear_PCIE_uC_Oct1MemoryParityErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PCIE_uC_Oct0MemoryParityErrClear_LSB 0x24 +#define QIB_7220_HwErrClear_PCIE_uC_Oct0MemoryParityErrClear_RMASK 0x1 +#define 
QIB_7220_HwErrClear_Reserved2_LSB 0x22 +#define QIB_7220_HwErrClear_Reserved2_RMASK 0x3 +#define QIB_7220_HwErrClear_PCIeBusParityClr_LSB 0x1F +#define QIB_7220_HwErrClear_PCIeBusParityClr_RMASK 0x7 +#define QIB_7220_HwErrClear_PcieCplTimeoutClear_LSB 0x1E +#define QIB_7220_HwErrClear_PcieCplTimeoutClear_RMASK 0x1 +#define QIB_7220_HwErrClear_PoisonedTLPClear_LSB 0x1D +#define QIB_7220_HwErrClear_PoisonedTLPClear_RMASK 0x1 +#define QIB_7220_HwErrClear_SDmaMemReadErrClear_LSB 0x1C +#define QIB_7220_HwErrClear_SDmaMemReadErrClear_RMASK 0x1 +#define QIB_7220_HwErrClear_Reserved3_LSB 0x8 +#define QIB_7220_HwErrClear_Reserved3_RMASK 0xFFFFF +#define QIB_7220_HwErrClear_PCIeMemParityClr_LSB 0x0 +#define QIB_7220_HwErrClear_PCIeMemParityClr_RMASK 0xFF + +#define QIB_7220_HwDiagCtrl_OFFS 0xB0 +#define QIB_7220_HwDiagCtrl_ForceIBCBusFromSPCParityErr_LSB 0x3F +#define QIB_7220_HwDiagCtrl_ForceIBCBusFromSPCParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForceIBCBusToSPCParityErr_LSB 0x3E +#define QIB_7220_HwDiagCtrl_ForceIBCBusToSPCParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_CounterWrEnable_LSB 0x3D +#define QIB_7220_HwDiagCtrl_CounterWrEnable_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_CounterDisable_LSB 0x3C +#define QIB_7220_HwDiagCtrl_CounterDisable_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_Reserved_LSB 0x33 +#define QIB_7220_HwDiagCtrl_Reserved_RMASK 0x1FF +#define QIB_7220_HwDiagCtrl_ForceRxMemParityErr_LSB 0x2C +#define QIB_7220_HwDiagCtrl_ForceRxMemParityErr_RMASK 0x7F +#define QIB_7220_HwDiagCtrl_ForceTxMemparityErr_LSB 0x28 +#define QIB_7220_HwDiagCtrl_ForceTxMemparityErr_RMASK 0xF +#define QIB_7220_HwDiagCtrl_ForceDDSRXEQMemoryParityErr_LSB 0x27 +#define QIB_7220_HwDiagCtrl_ForceDDSRXEQMemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForceIB_uC_MemoryParityErr_LSB 0x26 +#define QIB_7220_HwDiagCtrl_ForceIB_uC_MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct1MemoryParityErr_LSB 0x25 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct1MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct0MemoryParityErr_LSB 0x24 +#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct0MemoryParityErr_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_Reserved1_LSB 0x23 +#define QIB_7220_HwDiagCtrl_Reserved1_RMASK 0x1 +#define QIB_7220_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F +#define QIB_7220_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF +#define QIB_7220_HwDiagCtrl_Reserved2_LSB 0x8 +#define QIB_7220_HwDiagCtrl_Reserved2_RMASK 0x7FFFFF +#define QIB_7220_HwDiagCtrl_forcePCIeMemParity_LSB 0x0 +#define QIB_7220_HwDiagCtrl_forcePCIeMemParity_RMASK 0xFF + +#define QIB_7220_REG_0000B8_OFFS 0xB8 + +#define QIB_7220_IBCStatus_OFFS 0xC0 +#define QIB_7220_IBCStatus_TxCreditOk_LSB 0x1F +#define QIB_7220_IBCStatus_TxCreditOk_RMASK 0x1 +#define QIB_7220_IBCStatus_TxReady_LSB 0x1E +#define QIB_7220_IBCStatus_TxReady_RMASK 0x1 +#define QIB_7220_IBCStatus_Reserved_LSB 0xE +#define QIB_7220_IBCStatus_Reserved_RMASK 0xFFFF +#define QIB_7220_IBCStatus_IBTxLaneReversed_LSB 0xD +#define QIB_7220_IBCStatus_IBTxLaneReversed_RMASK 0x1 +#define QIB_7220_IBCStatus_IBRxLaneReversed_LSB 0xC +#define QIB_7220_IBCStatus_IBRxLaneReversed_RMASK 0x1 +#define QIB_7220_IBCStatus_IB_SERDES_TRIM_DONE_LSB 0xB +#define QIB_7220_IBCStatus_IB_SERDES_TRIM_DONE_RMASK 0x1 +#define QIB_7220_IBCStatus_DDS_RXEQ_FAIL_LSB 0xA +#define QIB_7220_IBCStatus_DDS_RXEQ_FAIL_RMASK 0x1 +#define QIB_7220_IBCStatus_LinkWidthActive_LSB 0x9 +#define QIB_7220_IBCStatus_LinkWidthActive_RMASK 0x1 +#define QIB_7220_IBCStatus_LinkSpeedActive_LSB 0x8 +#define 
QIB_7220_IBCStatus_LinkSpeedActive_RMASK 0x1 +#define QIB_7220_IBCStatus_LinkState_LSB 0x5 +#define QIB_7220_IBCStatus_LinkState_RMASK 0x7 +#define QIB_7220_IBCStatus_LinkTrainingState_LSB 0x0 +#define QIB_7220_IBCStatus_LinkTrainingState_RMASK 0x1F + +#define QIB_7220_IBCCtrl_OFFS 0xC8 +#define QIB_7220_IBCCtrl_Loopback_LSB 0x3F +#define QIB_7220_IBCCtrl_Loopback_RMASK 0x1 +#define QIB_7220_IBCCtrl_LinkDownDefaultState_LSB 0x3E +#define QIB_7220_IBCCtrl_LinkDownDefaultState_RMASK 0x1 +#define QIB_7220_IBCCtrl_Reserved_LSB 0x2B +#define QIB_7220_IBCCtrl_Reserved_RMASK 0x7FFFF +#define QIB_7220_IBCCtrl_CreditScale_LSB 0x28 +#define QIB_7220_IBCCtrl_CreditScale_RMASK 0x7 +#define QIB_7220_IBCCtrl_OverrunThreshold_LSB 0x24 +#define QIB_7220_IBCCtrl_OverrunThreshold_RMASK 0xF +#define QIB_7220_IBCCtrl_PhyerrThreshold_LSB 0x20 +#define QIB_7220_IBCCtrl_PhyerrThreshold_RMASK 0xF +#define QIB_7220_IBCCtrl_MaxPktLen_LSB 0x15 +#define QIB_7220_IBCCtrl_MaxPktLen_RMASK 0x7FF +#define QIB_7220_IBCCtrl_LinkCmd_LSB 0x13 +#define QIB_7220_IBCCtrl_LinkCmd_RMASK 0x3 +#define QIB_7220_IBCCtrl_LinkInitCmd_LSB 0x10 +#define QIB_7220_IBCCtrl_LinkInitCmd_RMASK 0x7 +#define QIB_7220_IBCCtrl_FlowCtrlWaterMark_LSB 0x8 +#define QIB_7220_IBCCtrl_FlowCtrlWaterMark_RMASK 0xFF +#define QIB_7220_IBCCtrl_FlowCtrlPeriod_LSB 0x0 +#define QIB_7220_IBCCtrl_FlowCtrlPeriod_RMASK 0xFF + +#define QIB_7220_EXTStatus_OFFS 0xD0 +#define QIB_7220_EXTStatus_GPIOIn_LSB 0x30 +#define QIB_7220_EXTStatus_GPIOIn_RMASK 0xFFFF +#define QIB_7220_EXTStatus_Reserved_LSB 0x20 +#define QIB_7220_EXTStatus_Reserved_RMASK 0xFFFF +#define QIB_7220_EXTStatus_Reserved1_LSB 0x10 +#define QIB_7220_EXTStatus_Reserved1_RMASK 0xFFFF +#define QIB_7220_EXTStatus_MemBISTDisabled_LSB 0xF +#define QIB_7220_EXTStatus_MemBISTDisabled_RMASK 0x1 +#define QIB_7220_EXTStatus_MemBISTEndTest_LSB 0xE +#define QIB_7220_EXTStatus_MemBISTEndTest_RMASK 0x1 +#define QIB_7220_EXTStatus_Reserved2_LSB 0x0 +#define QIB_7220_EXTStatus_Reserved2_RMASK 0x3FFF + +#define QIB_7220_EXTCtrl_OFFS 0xD8 +#define QIB_7220_EXTCtrl_GPIOOe_LSB 0x30 +#define QIB_7220_EXTCtrl_GPIOOe_RMASK 0xFFFF +#define QIB_7220_EXTCtrl_GPIOInvert_LSB 0x20 +#define QIB_7220_EXTCtrl_GPIOInvert_RMASK 0xFFFF +#define QIB_7220_EXTCtrl_Reserved_LSB 0x4 +#define QIB_7220_EXTCtrl_Reserved_RMASK 0xFFFFFFF +#define QIB_7220_EXTCtrl_LEDPriPortGreenOn_LSB 0x3 +#define QIB_7220_EXTCtrl_LEDPriPortGreenOn_RMASK 0x1 +#define QIB_7220_EXTCtrl_LEDPriPortYellowOn_LSB 0x2 +#define QIB_7220_EXTCtrl_LEDPriPortYellowOn_RMASK 0x1 +#define QIB_7220_EXTCtrl_LEDGblOkGreenOn_LSB 0x1 +#define QIB_7220_EXTCtrl_LEDGblOkGreenOn_RMASK 0x1 +#define QIB_7220_EXTCtrl_LEDGblErrRedOff_LSB 0x0 +#define QIB_7220_EXTCtrl_LEDGblErrRedOff_RMASK 0x1 + +#define QIB_7220_GPIOOut_OFFS 0xE0 + +#define QIB_7220_GPIOMask_OFFS 0xE8 + +#define QIB_7220_GPIOStatus_OFFS 0xF0 + +#define QIB_7220_GPIOClear_OFFS 0xF8 + +#define QIB_7220_RcvCtrl_OFFS 0x100 +#define QIB_7220_RcvCtrl_Reserved_LSB 0x27 +#define QIB_7220_RcvCtrl_Reserved_RMASK 0x1FFFFFF +#define QIB_7220_RcvCtrl_RcvQPMapEnable_LSB 0x26 +#define QIB_7220_RcvCtrl_RcvQPMapEnable_RMASK 0x1 +#define QIB_7220_RcvCtrl_PortCfg_LSB 0x24 +#define QIB_7220_RcvCtrl_PortCfg_RMASK 0x3 +#define QIB_7220_RcvCtrl_TailUpd_LSB 0x23 +#define QIB_7220_RcvCtrl_TailUpd_RMASK 0x1 +#define QIB_7220_RcvCtrl_RcvPartitionKeyDisable_LSB 0x22 +#define QIB_7220_RcvCtrl_RcvPartitionKeyDisable_RMASK 0x1 +#define QIB_7220_RcvCtrl_IntrAvail_LSB 0x11 +#define QIB_7220_RcvCtrl_IntrAvail_RMASK 0x1FFFF +#define 
QIB_7220_RcvCtrl_PortEnable_LSB 0x0 +#define QIB_7220_RcvCtrl_PortEnable_RMASK 0x1FFFF + +#define QIB_7220_RcvBTHQP_OFFS 0x108 +#define QIB_7220_RcvBTHQP_Reserved_LSB 0x18 +#define QIB_7220_RcvBTHQP_Reserved_RMASK 0xFF +#define QIB_7220_RcvBTHQP_RcvBTHQP_LSB 0x0 +#define QIB_7220_RcvBTHQP_RcvBTHQP_RMASK 0xFFFFFF + +#define QIB_7220_RcvHdrSize_OFFS 0x110 + +#define QIB_7220_RcvHdrCnt_OFFS 0x118 + +#define QIB_7220_RcvHdrEntSize_OFFS 0x120 + +#define QIB_7220_RcvTIDBase_OFFS 0x128 + +#define QIB_7220_RcvTIDCnt_OFFS 0x130 + +#define QIB_7220_RcvEgrBase_OFFS 0x138 + +#define QIB_7220_RcvEgrCnt_OFFS 0x140 + +#define QIB_7220_RcvBufBase_OFFS 0x148 + +#define QIB_7220_RcvBufSize_OFFS 0x150 + +#define QIB_7220_RxIntMemBase_OFFS 0x158 + +#define QIB_7220_RxIntMemSize_OFFS 0x160 + +#define QIB_7220_RcvPartitionKey_OFFS 0x168 + +#define QIB_7220_RcvQPMulticastPort_OFFS 0x170 +#define QIB_7220_RcvQPMulticastPort_Reserved_LSB 0x5 +#define QIB_7220_RcvQPMulticastPort_Reserved_RMASK 0x7FFFFFFFFFFFFFF +#define QIB_7220_RcvQPMulticastPort_RcvQpMcPort_LSB 0x0 +#define QIB_7220_RcvQPMulticastPort_RcvQpMcPort_RMASK 0x1F + +#define QIB_7220_RcvPktLEDCnt_OFFS 0x178 +#define QIB_7220_RcvPktLEDCnt_ONperiod_LSB 0x20 +#define QIB_7220_RcvPktLEDCnt_ONperiod_RMASK 0xFFFFFFFF +#define QIB_7220_RcvPktLEDCnt_OFFperiod_LSB 0x0 +#define QIB_7220_RcvPktLEDCnt_OFFperiod_RMASK 0xFFFFFFFF + +#define QIB_7220_IBCDDRCtrl_OFFS 0x180 +#define QIB_7220_IBCDDRCtrl_IB_DLID_MASK_LSB 0x30 +#define QIB_7220_IBCDDRCtrl_IB_DLID_MASK_RMASK 0xFFFF +#define QIB_7220_IBCDDRCtrl_IB_DLID_LSB 0x20 +#define QIB_7220_IBCDDRCtrl_IB_DLID_RMASK 0xFFFF +#define QIB_7220_IBCDDRCtrl_Reserved_LSB 0x1B +#define QIB_7220_IBCDDRCtrl_Reserved_RMASK 0x1F +#define QIB_7220_IBCDDRCtrl_HRTBT_REQ_LSB 0x1A +#define QIB_7220_IBCDDRCtrl_HRTBT_REQ_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_HRTBT_PORT_LSB 0x12 +#define QIB_7220_IBCDDRCtrl_HRTBT_PORT_RMASK 0xFF +#define QIB_7220_IBCDDRCtrl_HRTBT_AUTO_LSB 0x11 +#define QIB_7220_IBCDDRCtrl_HRTBT_AUTO_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_HRTBT_ENB_LSB 0x10 +#define QIB_7220_IBCDDRCtrl_HRTBT_ENB_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_DDS_LSB 0xC +#define QIB_7220_IBCDDRCtrl_SD_DDS_RMASK 0xF +#define QIB_7220_IBCDDRCtrl_SD_DDSV_LSB 0xB +#define QIB_7220_IBCDDRCtrl_SD_DDSV_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_ADD_ENB_LSB 0xA +#define QIB_7220_IBCDDRCtrl_SD_ADD_ENB_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_RX_EQUAL_ENABLE_LSB 0x9 +#define QIB_7220_IBCDDRCtrl_SD_RX_EQUAL_ENABLE_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_LANE_REV_SUPPORTED_LSB 0x8 +#define QIB_7220_IBCDDRCtrl_IB_LANE_REV_SUPPORTED_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_POLARITY_REV_SUPP_LSB 0x7 +#define QIB_7220_IBCDDRCtrl_IB_POLARITY_REV_SUPP_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_NUM_CHANNELS_LSB 0x5 +#define QIB_7220_IBCDDRCtrl_IB_NUM_CHANNELS_RMASK 0x3 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_QDR_LSB 0x4 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_QDR_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_DDR_LSB 0x3 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_DDR_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_SDR_LSB 0x2 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_SDR_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_LSB 0x1 +#define QIB_7220_IBCDDRCtrl_SD_SPEED_RMASK 0x1 +#define QIB_7220_IBCDDRCtrl_IB_ENHANCED_MODE_LSB 0x0 +#define QIB_7220_IBCDDRCtrl_IB_ENHANCED_MODE_RMASK 0x1 + +#define QIB_7220_HRTBT_GUID_OFFS 0x188 + +#define QIB_7220_IBCDDRCtrl2_OFFS 0x1A0 +#define QIB_7220_IBCDDRCtrl2_IB_BACK_PORCH_LSB 0x5 +#define QIB_7220_IBCDDRCtrl2_IB_BACK_PORCH_RMASK 0x1F 
+#define QIB_7220_IBCDDRCtrl2_IB_FRONT_PORCH_LSB 0x0 +#define QIB_7220_IBCDDRCtrl2_IB_FRONT_PORCH_RMASK 0x1F + +#define QIB_7220_IBCDDRStatus_OFFS 0x1A8 +#define QIB_7220_IBCDDRStatus_heartbeat_timed_out_LSB 0x24 +#define QIB_7220_IBCDDRStatus_heartbeat_timed_out_RMASK 0x1 +#define QIB_7220_IBCDDRStatus_heartbeat_crosstalk_LSB 0x20 +#define QIB_7220_IBCDDRStatus_heartbeat_crosstalk_RMASK 0xF +#define QIB_7220_IBCDDRStatus_RxEqLocalDevice_LSB 0x1E +#define QIB_7220_IBCDDRStatus_RxEqLocalDevice_RMASK 0x3 +#define QIB_7220_IBCDDRStatus_ReqDDSLocalFromRmt_LSB 0x1A +#define QIB_7220_IBCDDRStatus_ReqDDSLocalFromRmt_RMASK 0xF +#define QIB_7220_IBCDDRStatus_LinkRoundTripLatency_LSB 0x0 +#define QIB_7220_IBCDDRStatus_LinkRoundTripLatency_RMASK 0x3FFFFFF + +#define QIB_7220_JIntReload_OFFS 0x1B0 +#define QIB_7220_JIntReload_J_limit_reload_LSB 0x10 +#define QIB_7220_JIntReload_J_limit_reload_RMASK 0xFFFF +#define QIB_7220_JIntReload_J_reload_LSB 0x0 +#define QIB_7220_JIntReload_J_reload_RMASK 0xFFFF + +#define QIB_7220_IBNCModeCtrl_OFFS 0x1B8 +#define QIB_7220_IBNCModeCtrl_Reserved_LSB 0x1A +#define QIB_7220_IBNCModeCtrl_Reserved_RMASK 0x3FFFFFFFFF +#define QIB_7220_IBNCModeCtrl_TSMCode_TS2_LSB 0x11 +#define QIB_7220_IBNCModeCtrl_TSMCode_TS2_RMASK 0x1FF +#define QIB_7220_IBNCModeCtrl_TSMCode_TS1_LSB 0x8 +#define QIB_7220_IBNCModeCtrl_TSMCode_TS1_RMASK 0x1FF +#define QIB_7220_IBNCModeCtrl_Reserved1_LSB 0x3 +#define QIB_7220_IBNCModeCtrl_Reserved1_RMASK 0x1F +#define QIB_7220_IBNCModeCtrl_TSMEnable_ignore_TSM_on_rx_LSB 0x2 +#define QIB_7220_IBNCModeCtrl_TSMEnable_ignore_TSM_on_rx_RMASK 0x1 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS2_LSB 0x1 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS2_RMASK 0x1 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS1_LSB 0x0 +#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS1_RMASK 0x1 + +#define QIB_7220_SendCtrl_OFFS 0x1C0 +#define QIB_7220_SendCtrl_Disarm_LSB 0x1F +#define QIB_7220_SendCtrl_Disarm_RMASK 0x1 +#define QIB_7220_SendCtrl_Reserved_LSB 0x1D +#define QIB_7220_SendCtrl_Reserved_RMASK 0x3 +#define QIB_7220_SendCtrl_AvailUpdThld_LSB 0x18 +#define QIB_7220_SendCtrl_AvailUpdThld_RMASK 0x1F +#define QIB_7220_SendCtrl_DisarmPIOBuf_LSB 0x10 +#define QIB_7220_SendCtrl_DisarmPIOBuf_RMASK 0xFF +#define QIB_7220_SendCtrl_Reserved1_LSB 0xD +#define QIB_7220_SendCtrl_Reserved1_RMASK 0x7 +#define QIB_7220_SendCtrl_SDmaHalt_LSB 0xC +#define QIB_7220_SendCtrl_SDmaHalt_RMASK 0x1 +#define QIB_7220_SendCtrl_SDmaEnable_LSB 0xB +#define QIB_7220_SendCtrl_SDmaEnable_RMASK 0x1 +#define QIB_7220_SendCtrl_SDmaSingleDescriptor_LSB 0xA +#define QIB_7220_SendCtrl_SDmaSingleDescriptor_RMASK 0x1 +#define QIB_7220_SendCtrl_SDmaIntEnable_LSB 0x9 +#define QIB_7220_SendCtrl_SDmaIntEnable_RMASK 0x1 +#define QIB_7220_SendCtrl_Reserved2_LSB 0x5 +#define QIB_7220_SendCtrl_Reserved2_RMASK 0xF +#define QIB_7220_SendCtrl_SSpecialTriggerEn_LSB 0x4 +#define QIB_7220_SendCtrl_SSpecialTriggerEn_RMASK 0x1 +#define QIB_7220_SendCtrl_SPioEnable_LSB 0x3 +#define QIB_7220_SendCtrl_SPioEnable_RMASK 0x1 +#define QIB_7220_SendCtrl_SendBufAvailUpd_LSB 0x2 +#define QIB_7220_SendCtrl_SendBufAvailUpd_RMASK 0x1 +#define QIB_7220_SendCtrl_SendIntBufAvail_LSB 0x1 +#define QIB_7220_SendCtrl_SendIntBufAvail_RMASK 0x1 +#define QIB_7220_SendCtrl_Abort_LSB 0x0 +#define QIB_7220_SendCtrl_Abort_RMASK 0x1 + +#define QIB_7220_SendBufBase_OFFS 0x1C8 +#define QIB_7220_SendBufBase_Reserved_LSB 0x35 +#define QIB_7220_SendBufBase_Reserved_RMASK 0x7FF +#define QIB_7220_SendBufBase_BaseAddr_LargePIO_LSB 0x20 +#define 
QIB_7220_SendBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF +#define QIB_7220_SendBufBase_Reserved1_LSB 0x15 +#define QIB_7220_SendBufBase_Reserved1_RMASK 0x7FF +#define QIB_7220_SendBufBase_BaseAddr_SmallPIO_LSB 0x0 +#define QIB_7220_SendBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF + +#define QIB_7220_SendBufSize_OFFS 0x1D0 +#define QIB_7220_SendBufSize_Reserved_LSB 0x2D +#define QIB_7220_SendBufSize_Reserved_RMASK 0xFFFFF +#define QIB_7220_SendBufSize_Size_LargePIO_LSB 0x20 +#define QIB_7220_SendBufSize_Size_LargePIO_RMASK 0x1FFF +#define QIB_7220_SendBufSize_Reserved1_LSB 0xC +#define QIB_7220_SendBufSize_Reserved1_RMASK 0xFFFFF +#define QIB_7220_SendBufSize_Size_SmallPIO_LSB 0x0 +#define QIB_7220_SendBufSize_Size_SmallPIO_RMASK 0xFFF + +#define QIB_7220_SendBufCnt_OFFS 0x1D8 +#define QIB_7220_SendBufCnt_Reserved_LSB 0x24 +#define QIB_7220_SendBufCnt_Reserved_RMASK 0xFFFFFFF +#define QIB_7220_SendBufCnt_Num_LargeBuffers_LSB 0x20 +#define QIB_7220_SendBufCnt_Num_LargeBuffers_RMASK 0xF +#define QIB_7220_SendBufCnt_Reserved1_LSB 0x9 +#define QIB_7220_SendBufCnt_Reserved1_RMASK 0x7FFFFF +#define QIB_7220_SendBufCnt_Num_SmallBuffers_LSB 0x0 +#define QIB_7220_SendBufCnt_Num_SmallBuffers_RMASK 0x1FF + +#define QIB_7220_SendBufAvailAddr_OFFS 0x1E0 +#define QIB_7220_SendBufAvailAddr_SendBufAvailAddr_LSB 0x6 +#define QIB_7220_SendBufAvailAddr_SendBufAvailAddr_RMASK 0x3FFFFFFFF +#define QIB_7220_SendBufAvailAddr_Reserved_LSB 0x0 +#define QIB_7220_SendBufAvailAddr_Reserved_RMASK 0x3F + +#define QIB_7220_TxIntMemBase_OFFS 0x1E8 + +#define QIB_7220_TxIntMemSize_OFFS 0x1F0 + +#define QIB_7220_SendDmaBase_OFFS 0x1F8 +#define QIB_7220_SendDmaBase_Reserved_LSB 0x30 +#define QIB_7220_SendDmaBase_Reserved_RMASK 0xFFFF +#define QIB_7220_SendDmaBase_SendDmaBase_LSB 0x0 +#define QIB_7220_SendDmaBase_SendDmaBase_RMASK 0xFFFFFFFFFFFF + +#define QIB_7220_SendDmaLenGen_OFFS 0x200 +#define QIB_7220_SendDmaLenGen_Reserved_LSB 0x13 +#define QIB_7220_SendDmaLenGen_Reserved_RMASK 0x1FFFFFFFFFFF +#define QIB_7220_SendDmaLenGen_Generation_LSB 0x10 +#define QIB_7220_SendDmaLenGen_Generation_MSB 0x12 +#define QIB_7220_SendDmaLenGen_Generation_RMASK 0x7 +#define QIB_7220_SendDmaLenGen_Length_LSB 0x0 +#define QIB_7220_SendDmaLenGen_Length_RMASK 0xFFFF + +#define QIB_7220_SendDmaTail_OFFS 0x208 +#define QIB_7220_SendDmaTail_Reserved_LSB 0x10 +#define QIB_7220_SendDmaTail_Reserved_RMASK 0xFFFFFFFFFFFF +#define QIB_7220_SendDmaTail_SendDmaTail_LSB 0x0 +#define QIB_7220_SendDmaTail_SendDmaTail_RMASK 0xFFFF + +#define QIB_7220_SendDmaHead_OFFS 0x210 +#define QIB_7220_SendDmaHead_Reserved_LSB 0x30 +#define QIB_7220_SendDmaHead_Reserved_RMASK 0xFFFF +#define QIB_7220_SendDmaHead_InternalSendDmaHead_LSB 0x20 +#define QIB_7220_SendDmaHead_InternalSendDmaHead_RMASK 0xFFFF +#define QIB_7220_SendDmaHead_Reserved1_LSB 0x10 +#define QIB_7220_SendDmaHead_Reserved1_RMASK 0xFFFF +#define QIB_7220_SendDmaHead_SendDmaHead_LSB 0x0 +#define QIB_7220_SendDmaHead_SendDmaHead_RMASK 0xFFFF + +#define QIB_7220_SendDmaHeadAddr_OFFS 0x218 +#define QIB_7220_SendDmaHeadAddr_Reserved_LSB 0x30 +#define QIB_7220_SendDmaHeadAddr_Reserved_RMASK 0xFFFF +#define QIB_7220_SendDmaHeadAddr_SendDmaHeadAddr_LSB 0x0 +#define QIB_7220_SendDmaHeadAddr_SendDmaHeadAddr_RMASK 0xFFFFFFFFFFFF + +#define QIB_7220_SendDmaBufMask0_OFFS 0x220 +#define QIB_7220_SendDmaBufMask0_BufMask_63_0_LSB 0x0 +#define QIB_7220_SendDmaBufMask0_BufMask_63_0_RMASK 0x0 + +#define QIB_7220_SendDmaStatus_OFFS 0x238 +#define QIB_7220_SendDmaStatus_ScoreBoardDrainInProg_LSB 0x3F +#define 
QIB_7220_SendDmaStatus_ScoreBoardDrainInProg_RMASK 0x1 +#define QIB_7220_SendDmaStatus_AbortInProg_LSB 0x3E +#define QIB_7220_SendDmaStatus_AbortInProg_RMASK 0x1 +#define QIB_7220_SendDmaStatus_InternalSDmaEnable_LSB 0x3D +#define QIB_7220_SendDmaStatus_InternalSDmaEnable_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbDescIndex_13_0_LSB 0x2F +#define QIB_7220_SendDmaStatus_ScbDescIndex_13_0_RMASK 0x3FFF +#define QIB_7220_SendDmaStatus_RpyLowAddr_6_0_LSB 0x28 +#define QIB_7220_SendDmaStatus_RpyLowAddr_6_0_RMASK 0x7F +#define QIB_7220_SendDmaStatus_RpyTag_7_0_LSB 0x20 +#define QIB_7220_SendDmaStatus_RpyTag_7_0_RMASK 0xFF +#define QIB_7220_SendDmaStatus_ScbFull_LSB 0x1F +#define QIB_7220_SendDmaStatus_ScbFull_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbEmpty_LSB 0x1E +#define QIB_7220_SendDmaStatus_ScbEmpty_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbEntryValid_LSB 0x1D +#define QIB_7220_SendDmaStatus_ScbEntryValid_RMASK 0x1 +#define QIB_7220_SendDmaStatus_ScbFetchDescFlag_LSB 0x1C +#define QIB_7220_SendDmaStatus_ScbFetchDescFlag_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoReadyToGo_LSB 0x1B +#define QIB_7220_SendDmaStatus_SplFifoReadyToGo_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoDisarmed_LSB 0x1A +#define QIB_7220_SendDmaStatus_SplFifoDisarmed_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoEmpty_LSB 0x19 +#define QIB_7220_SendDmaStatus_SplFifoEmpty_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoFull_LSB 0x18 +#define QIB_7220_SendDmaStatus_SplFifoFull_RMASK 0x1 +#define QIB_7220_SendDmaStatus_SplFifoBufNum_LSB 0x10 +#define QIB_7220_SendDmaStatus_SplFifoBufNum_RMASK 0xFF +#define QIB_7220_SendDmaStatus_SplFifoDescIndex_LSB 0x0 +#define QIB_7220_SendDmaStatus_SplFifoDescIndex_RMASK 0xFFFF + +#define QIB_7220_SendBufErr0_OFFS 0x240 +#define QIB_7220_SendBufErr0_SendBufErr_63_0_LSB 0x0 +#define QIB_7220_SendBufErr0_SendBufErr_63_0_RMASK 0x0 + +#define QIB_7220_RcvHdrAddr0_OFFS 0x270 +#define QIB_7220_RcvHdrAddr0_RcvHdrAddr0_LSB 0x2 +#define QIB_7220_RcvHdrAddr0_RcvHdrAddr0_RMASK 0x3FFFFFFFFF +#define QIB_7220_RcvHdrAddr0_Reserved_LSB 0x0 +#define QIB_7220_RcvHdrAddr0_Reserved_RMASK 0x3 + +#define QIB_7220_RcvHdrTailAddr0_OFFS 0x300 +#define QIB_7220_RcvHdrTailAddr0_RcvHdrTailAddr0_LSB 0x2 +#define QIB_7220_RcvHdrTailAddr0_RcvHdrTailAddr0_RMASK 0x3FFFFFFFFF +#define QIB_7220_RcvHdrTailAddr0_Reserved_LSB 0x0 +#define QIB_7220_RcvHdrTailAddr0_Reserved_RMASK 0x3 + +#define QIB_7220_ibsd_epb_access_ctrl_OFFS 0x3C0 +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_granted_LSB 0x8 +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_granted_RMASK 0x1 +#define QIB_7220_ibsd_epb_access_ctrl_Reserved_LSB 0x1 +#define QIB_7220_ibsd_epb_access_ctrl_Reserved_RMASK 0x7F +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_LSB 0x0 +#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_RMASK 0x1 + +#define QIB_7220_ibsd_epb_transaction_reg_OFFS 0x3C8 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_rdy_LSB 0x1F +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_rdy_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_req_error_LSB 0x1E +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_req_error_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved_LSB 0x1D +#define QIB_7220_ibsd_epb_transaction_reg_Reserved_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_mem_data_parity_LSB 0x1C +#define QIB_7220_ibsd_epb_transaction_reg_mem_data_parity_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved1_LSB 0x1B +#define QIB_7220_ibsd_epb_transaction_reg_Reserved1_RMASK 0x1 
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_cs_LSB 0x19 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_cs_RMASK 0x3 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_read_write_LSB 0x18 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_read_write_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved2_LSB 0x17 +#define QIB_7220_ibsd_epb_transaction_reg_Reserved2_RMASK 0x1 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_address_LSB 0x8 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_address_RMASK 0x7FFF +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_data_LSB 0x0 +#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_data_RMASK 0xFF + +#define QIB_7220_XGXSCfg_OFFS 0x3D8 +#define QIB_7220_XGXSCfg_sel_link_down_for_fctrl_lane_sync_reset_LSB 0x3F +#define QIB_7220_XGXSCfg_sel_link_down_for_fctrl_lane_sync_reset_RMASK 0x1 +#define QIB_7220_XGXSCfg_Reserved_LSB 0x13 +#define QIB_7220_XGXSCfg_Reserved_RMASK 0xFFFFFFFFFFF +#define QIB_7220_XGXSCfg_link_sync_mask_LSB 0x9 +#define QIB_7220_XGXSCfg_link_sync_mask_RMASK 0x3FF +#define QIB_7220_XGXSCfg_Reserved1_LSB 0x3 +#define QIB_7220_XGXSCfg_Reserved1_RMASK 0x3F +#define QIB_7220_XGXSCfg_xcv_reset_LSB 0x2 +#define QIB_7220_XGXSCfg_xcv_reset_RMASK 0x1 +#define QIB_7220_XGXSCfg_Reserved2_LSB 0x1 +#define QIB_7220_XGXSCfg_Reserved2_RMASK 0x1 +#define QIB_7220_XGXSCfg_tx_rx_reset_LSB 0x0 +#define QIB_7220_XGXSCfg_tx_rx_reset_RMASK 0x1 + +#define QIB_7220_IBSerDesCtrl_OFFS 0x3E0 +#define QIB_7220_IBSerDesCtrl_Reserved_LSB 0x2D +#define QIB_7220_IBSerDesCtrl_Reserved_RMASK 0x7FFFF +#define QIB_7220_IBSerDesCtrl_INT_uC_LSB 0x2C +#define QIB_7220_IBSerDesCtrl_INT_uC_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_CKSEL_uC_LSB 0x2A +#define QIB_7220_IBSerDesCtrl_CKSEL_uC_RMASK 0x3 +#define QIB_7220_IBSerDesCtrl_PLLN_LSB 0x28 +#define QIB_7220_IBSerDesCtrl_PLLN_RMASK 0x3 +#define QIB_7220_IBSerDesCtrl_PLLM_LSB 0x25 +#define QIB_7220_IBSerDesCtrl_PLLM_RMASK 0x7 +#define QIB_7220_IBSerDesCtrl_TXOBPD_LSB 0x24 +#define QIB_7220_IBSerDesCtrl_TXOBPD_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_TWC_LSB 0x23 +#define QIB_7220_IBSerDesCtrl_TWC_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_RXIDLE_LSB 0x22 +#define QIB_7220_IBSerDesCtrl_RXIDLE_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_RXINV_LSB 0x21 +#define QIB_7220_IBSerDesCtrl_RXINV_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_TXINV_LSB 0x20 +#define QIB_7220_IBSerDesCtrl_TXINV_RMASK 0x1 +#define QIB_7220_IBSerDesCtrl_Reserved1_LSB 0x12 +#define QIB_7220_IBSerDesCtrl_Reserved1_RMASK 0x3FFF +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForRXEQ_LSB 0xD +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForRXEQ_RMASK 0x1F +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForDDS_LSB 0x8 +#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForDDS_RMASK 0x1F +#define QIB_7220_IBSerDesCtrl_Reserved2_LSB 0x1 +#define QIB_7220_IBSerDesCtrl_Reserved2_RMASK 0x7F +#define QIB_7220_IBSerDesCtrl_ResetIB_uC_Core_LSB 0x0 +#define QIB_7220_IBSerDesCtrl_ResetIB_uC_Core_RMASK 0x1 + +#define QIB_7220_pciesd_epb_access_ctrl_OFFS 0x400 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_granted_LSB 0x8 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_granted_RMASK 0x1 +#define QIB_7220_pciesd_epb_access_ctrl_Reserved_LSB 0x3 +#define QIB_7220_pciesd_epb_access_ctrl_Reserved_RMASK 0x1F +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcieepb_star_en_LSB 0x1 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcieepb_star_en_RMASK 0x3 +#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_LSB 0x0 +#define 
QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_RMASK 0x1 + +#define QIB_7220_pciesd_epb_transaction_reg_OFFS 0x408 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_rdy_LSB 0x1F +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_rdy_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_req_error_LSB 0x1E +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_req_error_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_Reserved_LSB 0x1D +#define QIB_7220_pciesd_epb_transaction_reg_Reserved_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_mem_data_parity_LSB 0x1C +#define QIB_7220_pciesd_epb_transaction_reg_mem_data_parity_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_cs_LSB 0x19 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_cs_RMASK 0x7 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_read_write_LSB 0x18 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_read_write_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_Reserved1_LSB 0x17 +#define QIB_7220_pciesd_epb_transaction_reg_Reserved1_RMASK 0x1 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_address_LSB 0x8 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_address_RMASK 0x7FFF +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_data_LSB 0x0 +#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_data_RMASK 0xFF + +#define QIB_7220_SerDes_DDSRXEQ0_OFFS 0x500 +#define QIB_7220_SerDes_DDSRXEQ0_reg_addr_LSB 0x4 +#define QIB_7220_SerDes_DDSRXEQ0_reg_addr_RMASK 0x3F +#define QIB_7220_SerDes_DDSRXEQ0_element_num_LSB 0x0 +#define QIB_7220_SerDes_DDSRXEQ0_element_num_RMASK 0xF + +#define QIB_7220_LBIntCnt_OFFS 0x13000 + +#define QIB_7220_LBFlowStallCnt_OFFS 0x13008 + +#define QIB_7220_TxSDmaDescCnt_OFFS 0x13010 + +#define QIB_7220_TxUnsupVLErrCnt_OFFS 0x13018 + +#define QIB_7220_TxDataPktCnt_OFFS 0x13020 + +#define QIB_7220_TxFlowPktCnt_OFFS 0x13028 + +#define QIB_7220_TxDwordCnt_OFFS 0x13030 + +#define QIB_7220_TxLenErrCnt_OFFS 0x13038 + +#define QIB_7220_TxMaxMinLenErrCnt_OFFS 0x13040 + +#define QIB_7220_TxUnderrunCnt_OFFS 0x13048 + +#define QIB_7220_TxFlowStallCnt_OFFS 0x13050 + +#define QIB_7220_TxDroppedPktCnt_OFFS 0x13058 + +#define QIB_7220_RxDroppedPktCnt_OFFS 0x13060 + +#define QIB_7220_RxDataPktCnt_OFFS 0x13068 + +#define QIB_7220_RxFlowPktCnt_OFFS 0x13070 + +#define QIB_7220_RxDwordCnt_OFFS 0x13078 + +#define QIB_7220_RxLenErrCnt_OFFS 0x13080 + +#define QIB_7220_RxMaxMinLenErrCnt_OFFS 0x13088 + +#define QIB_7220_RxICRCErrCnt_OFFS 0x13090 + +#define QIB_7220_RxVCRCErrCnt_OFFS 0x13098 + +#define QIB_7220_RxFlowCtrlViolCnt_OFFS 0x130A0 + +#define QIB_7220_RxVersionErrCnt_OFFS 0x130A8 + +#define QIB_7220_RxLinkMalformCnt_OFFS 0x130B0 + +#define QIB_7220_RxEBPCnt_OFFS 0x130B8 + +#define QIB_7220_RxLPCRCErrCnt_OFFS 0x130C0 + +#define QIB_7220_RxBufOvflCnt_OFFS 0x130C8 + +#define QIB_7220_RxTIDFullErrCnt_OFFS 0x130D0 + +#define QIB_7220_RxTIDValidErrCnt_OFFS 0x130D8 + +#define QIB_7220_RxPKeyMismatchCnt_OFFS 0x130E0 + +#define QIB_7220_RxP0HdrEgrOvflCnt_OFFS 0x130E8 + +#define QIB_7220_IBStatusChangeCnt_OFFS 0x13170 + +#define QIB_7220_IBLinkErrRecoveryCnt_OFFS 0x13178 + +#define QIB_7220_IBLinkDownedCnt_OFFS 0x13180 + +#define QIB_7220_IBSymbolErrCnt_OFFS 0x13188 + +#define QIB_7220_RxVL15DroppedPktCnt_OFFS 0x13190 + +#define QIB_7220_RxOtherLocalPhyErrCnt_OFFS 0x13198 + +#define QIB_7220_PcieRetryBufDiagQwordCnt_OFFS 0x131A0 + +#define QIB_7220_ExcessBufferOvflCnt_OFFS 0x131A8 + +#define QIB_7220_LocalLinkIntegrityErrCnt_OFFS 0x131B0 + +#define 
QIB_7220_RxVlErrCnt_OFFS 0x131B8 + +#define QIB_7220_RxDlidFltrCnt_OFFS 0x131C0 + +#define QIB_7220_CNT_0131C8_OFFS 0x131C8 + +#define QIB_7220_PSStat_OFFS 0x13200 + +#define QIB_7220_PSStart_OFFS 0x13208 + +#define QIB_7220_PSInterval_OFFS 0x13210 + +#define QIB_7220_PSRcvDataCount_OFFS 0x13218 + +#define QIB_7220_PSRcvPktsCount_OFFS 0x13220 + +#define QIB_7220_PSXmitDataCount_OFFS 0x13228 + +#define QIB_7220_PSXmitPktsCount_OFFS 0x13230 + +#define QIB_7220_PSXmitWaitCount_OFFS 0x13238 + +#define QIB_7220_CNT_013240_OFFS 0x13240 + +#define QIB_7220_RcvEgrArray_OFFS 0x14000 + +#define QIB_7220_MEM_038000_OFFS 0x38000 + +#define QIB_7220_RcvTIDArray0_OFFS 0x53000 + +#define QIB_7220_PIOLaunchFIFO_OFFS 0x64000 + +#define QIB_7220_MEM_064480_OFFS 0x64480 + +#define QIB_7220_SendPIOpbcCache_OFFS 0x64800 + +#define QIB_7220_MEM_064C80_OFFS 0x64C80 + +#define QIB_7220_PreLaunchFIFO_OFFS 0x65000 + +#define QIB_7220_MEM_065080_OFFS 0x65080 + +#define QIB_7220_ScoreBoard_OFFS 0x65400 + +#define QIB_7220_MEM_065440_OFFS 0x65440 + +#define QIB_7220_DescriptorFIFO_OFFS 0x65800 + +#define QIB_7220_MEM_065880_OFFS 0x65880 + +#define QIB_7220_RcvBuf1_OFFS 0x72000 + +#define QIB_7220_MEM_074800_OFFS 0x74800 + +#define QIB_7220_RcvBuf2_OFFS 0x75000 + +#define QIB_7220_MEM_076400_OFFS 0x76400 + +#define QIB_7220_RcvFlags_OFFS 0x77000 + +#define QIB_7220_MEM_078400_OFFS 0x78400 + +#define QIB_7220_RcvLookupBuf1_OFFS 0x79000 + +#define QIB_7220_MEM_07A400_OFFS 0x7A400 + +#define QIB_7220_RcvDMADatBuf_OFFS 0x7B000 + +#define QIB_7220_RcvDMAHdrBuf_OFFS 0x7B800 + +#define QIB_7220_MiscRXEIntMem_OFFS 0x7C000 + +#define QIB_7220_MEM_07D400_OFFS 0x7D400 + +#define QIB_7220_PCIERcvBuf_OFFS 0x80000 + +#define QIB_7220_PCIERetryBuf_OFFS 0x84000 + +#define QIB_7220_PCIERcvBufRdToWrAddr_OFFS 0x88000 + +#define QIB_7220_PCIECplBuf_OFFS 0x90000 + +#define QIB_7220_IBSerDesMappTable_OFFS 0x94000 + +#define QIB_7220_MEM_095000_OFFS 0x95000 + +#define QIB_7220_SendBuf0_MA_OFFS 0x100000 + +#define QIB_7220_MEM_1A0000_OFFS 0x1A0000 diff --git a/drivers/infiniband/hw/qib/qib_7322_regs.h b/drivers/infiniband/hw/qib/qib_7322_regs.h new file mode 100644 index 0000000..32dc81f --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_7322_regs.h @@ -0,0 +1,3163 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* This file is mechanically generated from RTL. Any hand-edits will be lost! */ + +#define QIB_7322_Revision_OFFS 0x0 +#define QIB_7322_Revision_DEF 0x0000000002010601 +#define QIB_7322_Revision_R_Simulator_LSB 0x3F +#define QIB_7322_Revision_R_Simulator_MSB 0x3F +#define QIB_7322_Revision_R_Simulator_RMASK 0x1 +#define QIB_7322_Revision_R_Emulation_LSB 0x3E +#define QIB_7322_Revision_R_Emulation_MSB 0x3E +#define QIB_7322_Revision_R_Emulation_RMASK 0x1 +#define QIB_7322_Revision_R_Emulation_Revcode_LSB 0x28 +#define QIB_7322_Revision_R_Emulation_Revcode_MSB 0x3D +#define QIB_7322_Revision_R_Emulation_Revcode_RMASK 0x3FFFFF +#define QIB_7322_Revision_BoardID_LSB 0x20 +#define QIB_7322_Revision_BoardID_MSB 0x27 +#define QIB_7322_Revision_BoardID_RMASK 0xFF +#define QIB_7322_Revision_R_SW_LSB 0x18 +#define QIB_7322_Revision_R_SW_MSB 0x1F +#define QIB_7322_Revision_R_SW_RMASK 0xFF +#define QIB_7322_Revision_R_Arch_LSB 0x10 +#define QIB_7322_Revision_R_Arch_MSB 0x17 +#define QIB_7322_Revision_R_Arch_RMASK 0xFF +#define QIB_7322_Revision_R_ChipRevMajor_LSB 0x8 +#define QIB_7322_Revision_R_ChipRevMajor_MSB 0xF +#define QIB_7322_Revision_R_ChipRevMajor_RMASK 0xFF +#define QIB_7322_Revision_R_ChipRevMinor_LSB 0x0 +#define QIB_7322_Revision_R_ChipRevMinor_MSB 0x7 +#define QIB_7322_Revision_R_ChipRevMinor_RMASK 0xFF + +#define QIB_7322_Control_OFFS 0x8 +#define QIB_7322_Control_DEF 0x0000000000000000 +#define QIB_7322_Control_PCIECplQDiagEn_LSB 0x6 +#define QIB_7322_Control_PCIECplQDiagEn_MSB 0x6 +#define QIB_7322_Control_PCIECplQDiagEn_RMASK 0x1 +#define QIB_7322_Control_PCIEPostQDiagEn_LSB 0x5 +#define QIB_7322_Control_PCIEPostQDiagEn_MSB 0x5 +#define QIB_7322_Control_PCIEPostQDiagEn_RMASK 0x1 +#define QIB_7322_Control_SDmaDescFetchPriorityEn_LSB 0x4 +#define QIB_7322_Control_SDmaDescFetchPriorityEn_MSB 0x4 +#define QIB_7322_Control_SDmaDescFetchPriorityEn_RMASK 0x1 +#define QIB_7322_Control_PCIERetryBufDiagEn_LSB 0x3 +#define QIB_7322_Control_PCIERetryBufDiagEn_MSB 0x3 +#define QIB_7322_Control_PCIERetryBufDiagEn_RMASK 0x1 +#define QIB_7322_Control_FreezeMode_LSB 0x1 +#define QIB_7322_Control_FreezeMode_MSB 0x1 +#define QIB_7322_Control_FreezeMode_RMASK 0x1 +#define QIB_7322_Control_SyncReset_LSB 0x0 +#define QIB_7322_Control_SyncReset_MSB 0x0 +#define QIB_7322_Control_SyncReset_RMASK 0x1 + +#define QIB_7322_PageAlign_OFFS 0x10 +#define QIB_7322_PageAlign_DEF 0x0000000000001000 + +#define QIB_7322_ContextCnt_OFFS 0x18 +#define QIB_7322_ContextCnt_DEF 0x0000000000000012 + +#define QIB_7322_Scratch_OFFS 0x20 +#define QIB_7322_Scratch_DEF 0x0000000000000000 + +#define QIB_7322_CntrRegBase_OFFS 0x28 +#define QIB_7322_CntrRegBase_DEF 0x0000000000011000 + +#define QIB_7322_SendRegBase_OFFS 0x30 +#define QIB_7322_SendRegBase_DEF 0x0000000000003000 + +#define QIB_7322_UserRegBase_OFFS 0x38 +#define QIB_7322_UserRegBase_DEF 0x0000000000200000 + +#define QIB_7322_IntMask_OFFS 0x68 +#define QIB_7322_IntMask_DEF 0x0000000000000000 +#define QIB_7322_IntMask_SDmaIntMask_1_LSB 0x3F +#define QIB_7322_IntMask_SDmaIntMask_1_MSB 0x3F +#define QIB_7322_IntMask_SDmaIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaIntMask_0_LSB 0x3E +#define QIB_7322_IntMask_SDmaIntMask_0_MSB 0x3E +#define QIB_7322_IntMask_SDmaIntMask_0_RMASK 0x1 +#define 
QIB_7322_IntMask_SDmaProgressIntMask_1_LSB 0x3D +#define QIB_7322_IntMask_SDmaProgressIntMask_1_MSB 0x3D +#define QIB_7322_IntMask_SDmaProgressIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaProgressIntMask_0_LSB 0x3C +#define QIB_7322_IntMask_SDmaProgressIntMask_0_MSB 0x3C +#define QIB_7322_IntMask_SDmaProgressIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_SDmaIdleIntMask_1_LSB 0x3B +#define QIB_7322_IntMask_SDmaIdleIntMask_1_MSB 0x3B +#define QIB_7322_IntMask_SDmaIdleIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaIdleIntMask_0_LSB 0x3A +#define QIB_7322_IntMask_SDmaIdleIntMask_0_MSB 0x3A +#define QIB_7322_IntMask_SDmaIdleIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_LSB 0x39 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_MSB 0x39 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_LSB 0x38 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_MSB 0x38 +#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg17IntMask_LSB 0x31 +#define QIB_7322_IntMask_RcvUrg17IntMask_MSB 0x31 +#define QIB_7322_IntMask_RcvUrg17IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg16IntMask_LSB 0x30 +#define QIB_7322_IntMask_RcvUrg16IntMask_MSB 0x30 +#define QIB_7322_IntMask_RcvUrg16IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg15IntMask_LSB 0x2F +#define QIB_7322_IntMask_RcvUrg15IntMask_MSB 0x2F +#define QIB_7322_IntMask_RcvUrg15IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg14IntMask_LSB 0x2E +#define QIB_7322_IntMask_RcvUrg14IntMask_MSB 0x2E +#define QIB_7322_IntMask_RcvUrg14IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg13IntMask_LSB 0x2D +#define QIB_7322_IntMask_RcvUrg13IntMask_MSB 0x2D +#define QIB_7322_IntMask_RcvUrg13IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg12IntMask_LSB 0x2C +#define QIB_7322_IntMask_RcvUrg12IntMask_MSB 0x2C +#define QIB_7322_IntMask_RcvUrg12IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg11IntMask_LSB 0x2B +#define QIB_7322_IntMask_RcvUrg11IntMask_MSB 0x2B +#define QIB_7322_IntMask_RcvUrg11IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg10IntMask_LSB 0x2A +#define QIB_7322_IntMask_RcvUrg10IntMask_MSB 0x2A +#define QIB_7322_IntMask_RcvUrg10IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg9IntMask_LSB 0x29 +#define QIB_7322_IntMask_RcvUrg9IntMask_MSB 0x29 +#define QIB_7322_IntMask_RcvUrg9IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg8IntMask_LSB 0x28 +#define QIB_7322_IntMask_RcvUrg8IntMask_MSB 0x28 +#define QIB_7322_IntMask_RcvUrg8IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg7IntMask_LSB 0x27 +#define QIB_7322_IntMask_RcvUrg7IntMask_MSB 0x27 +#define QIB_7322_IntMask_RcvUrg7IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg6IntMask_LSB 0x26 +#define QIB_7322_IntMask_RcvUrg6IntMask_MSB 0x26 +#define QIB_7322_IntMask_RcvUrg6IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg5IntMask_LSB 0x25 +#define QIB_7322_IntMask_RcvUrg5IntMask_MSB 0x25 +#define QIB_7322_IntMask_RcvUrg5IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg4IntMask_LSB 0x24 +#define QIB_7322_IntMask_RcvUrg4IntMask_MSB 0x24 +#define QIB_7322_IntMask_RcvUrg4IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg3IntMask_LSB 0x23 +#define QIB_7322_IntMask_RcvUrg3IntMask_MSB 0x23 +#define QIB_7322_IntMask_RcvUrg3IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg2IntMask_LSB 0x22 +#define QIB_7322_IntMask_RcvUrg2IntMask_MSB 0x22 +#define QIB_7322_IntMask_RcvUrg2IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg1IntMask_LSB 0x21 +#define QIB_7322_IntMask_RcvUrg1IntMask_MSB 0x21 +#define 
QIB_7322_IntMask_RcvUrg1IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvUrg0IntMask_LSB 0x20 +#define QIB_7322_IntMask_RcvUrg0IntMask_MSB 0x20 +#define QIB_7322_IntMask_RcvUrg0IntMask_RMASK 0x1 +#define QIB_7322_IntMask_ErrIntMask_1_LSB 0x1F +#define QIB_7322_IntMask_ErrIntMask_1_MSB 0x1F +#define QIB_7322_IntMask_ErrIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_ErrIntMask_0_LSB 0x1E +#define QIB_7322_IntMask_ErrIntMask_0_MSB 0x1E +#define QIB_7322_IntMask_ErrIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_ErrIntMask_LSB 0x1D +#define QIB_7322_IntMask_ErrIntMask_MSB 0x1D +#define QIB_7322_IntMask_ErrIntMask_RMASK 0x1 +#define QIB_7322_IntMask_AssertGPIOIntMask_LSB 0x1C +#define QIB_7322_IntMask_AssertGPIOIntMask_MSB 0x1C +#define QIB_7322_IntMask_AssertGPIOIntMask_RMASK 0x1 +#define QIB_7322_IntMask_SendDoneIntMask_1_LSB 0x19 +#define QIB_7322_IntMask_SendDoneIntMask_1_MSB 0x19 +#define QIB_7322_IntMask_SendDoneIntMask_1_RMASK 0x1 +#define QIB_7322_IntMask_SendDoneIntMask_0_LSB 0x18 +#define QIB_7322_IntMask_SendDoneIntMask_0_MSB 0x18 +#define QIB_7322_IntMask_SendDoneIntMask_0_RMASK 0x1 +#define QIB_7322_IntMask_SendBufAvailIntMask_LSB 0x17 +#define QIB_7322_IntMask_SendBufAvailIntMask_MSB 0x17 +#define QIB_7322_IntMask_SendBufAvailIntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail17IntMask_LSB 0x11 +#define QIB_7322_IntMask_RcvAvail17IntMask_MSB 0x11 +#define QIB_7322_IntMask_RcvAvail17IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail16IntMask_LSB 0x10 +#define QIB_7322_IntMask_RcvAvail16IntMask_MSB 0x10 +#define QIB_7322_IntMask_RcvAvail16IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail15IntMask_LSB 0xF +#define QIB_7322_IntMask_RcvAvail15IntMask_MSB 0xF +#define QIB_7322_IntMask_RcvAvail15IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail14IntMask_LSB 0xE +#define QIB_7322_IntMask_RcvAvail14IntMask_MSB 0xE +#define QIB_7322_IntMask_RcvAvail14IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail13IntMask_LSB 0xD +#define QIB_7322_IntMask_RcvAvail13IntMask_MSB 0xD +#define QIB_7322_IntMask_RcvAvail13IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail12IntMask_LSB 0xC +#define QIB_7322_IntMask_RcvAvail12IntMask_MSB 0xC +#define QIB_7322_IntMask_RcvAvail12IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail11IntMask_LSB 0xB +#define QIB_7322_IntMask_RcvAvail11IntMask_MSB 0xB +#define QIB_7322_IntMask_RcvAvail11IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail10IntMask_LSB 0xA +#define QIB_7322_IntMask_RcvAvail10IntMask_MSB 0xA +#define QIB_7322_IntMask_RcvAvail10IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail9IntMask_LSB 0x9 +#define QIB_7322_IntMask_RcvAvail9IntMask_MSB 0x9 +#define QIB_7322_IntMask_RcvAvail9IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail8IntMask_LSB 0x8 +#define QIB_7322_IntMask_RcvAvail8IntMask_MSB 0x8 +#define QIB_7322_IntMask_RcvAvail8IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail7IntMask_LSB 0x7 +#define QIB_7322_IntMask_RcvAvail7IntMask_MSB 0x7 +#define QIB_7322_IntMask_RcvAvail7IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail6IntMask_LSB 0x6 +#define QIB_7322_IntMask_RcvAvail6IntMask_MSB 0x6 +#define QIB_7322_IntMask_RcvAvail6IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail5IntMask_LSB 0x5 +#define QIB_7322_IntMask_RcvAvail5IntMask_MSB 0x5 +#define QIB_7322_IntMask_RcvAvail5IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail4IntMask_LSB 0x4 +#define QIB_7322_IntMask_RcvAvail4IntMask_MSB 0x4 +#define QIB_7322_IntMask_RcvAvail4IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail3IntMask_LSB 0x3 +#define 
QIB_7322_IntMask_RcvAvail3IntMask_MSB 0x3 +#define QIB_7322_IntMask_RcvAvail3IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail2IntMask_LSB 0x2 +#define QIB_7322_IntMask_RcvAvail2IntMask_MSB 0x2 +#define QIB_7322_IntMask_RcvAvail2IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail1IntMask_LSB 0x1 +#define QIB_7322_IntMask_RcvAvail1IntMask_MSB 0x1 +#define QIB_7322_IntMask_RcvAvail1IntMask_RMASK 0x1 +#define QIB_7322_IntMask_RcvAvail0IntMask_LSB 0x0 +#define QIB_7322_IntMask_RcvAvail0IntMask_MSB 0x0 +#define QIB_7322_IntMask_RcvAvail0IntMask_RMASK 0x1 + +#define QIB_7322_IntStatus_OFFS 0x70 +#define QIB_7322_IntStatus_DEF 0x0000000000000000 +#define QIB_7322_IntStatus_SDmaInt_1_LSB 0x3F +#define QIB_7322_IntStatus_SDmaInt_1_MSB 0x3F +#define QIB_7322_IntStatus_SDmaInt_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaInt_0_LSB 0x3E +#define QIB_7322_IntStatus_SDmaInt_0_MSB 0x3E +#define QIB_7322_IntStatus_SDmaInt_0_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaProgressInt_1_LSB 0x3D +#define QIB_7322_IntStatus_SDmaProgressInt_1_MSB 0x3D +#define QIB_7322_IntStatus_SDmaProgressInt_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaProgressInt_0_LSB 0x3C +#define QIB_7322_IntStatus_SDmaProgressInt_0_MSB 0x3C +#define QIB_7322_IntStatus_SDmaProgressInt_0_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaIdleInt_1_LSB 0x3B +#define QIB_7322_IntStatus_SDmaIdleInt_1_MSB 0x3B +#define QIB_7322_IntStatus_SDmaIdleInt_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaIdleInt_0_LSB 0x3A +#define QIB_7322_IntStatus_SDmaIdleInt_0_MSB 0x3A +#define QIB_7322_IntStatus_SDmaIdleInt_0_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaCleanupDone_1_LSB 0x39 +#define QIB_7322_IntStatus_SDmaCleanupDone_1_MSB 0x39 +#define QIB_7322_IntStatus_SDmaCleanupDone_1_RMASK 0x1 +#define QIB_7322_IntStatus_SDmaCleanupDone_0_LSB 0x38 +#define QIB_7322_IntStatus_SDmaCleanupDone_0_MSB 0x38 +#define QIB_7322_IntStatus_SDmaCleanupDone_0_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg17_LSB 0x31 +#define QIB_7322_IntStatus_RcvUrg17_MSB 0x31 +#define QIB_7322_IntStatus_RcvUrg17_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg16_LSB 0x30 +#define QIB_7322_IntStatus_RcvUrg16_MSB 0x30 +#define QIB_7322_IntStatus_RcvUrg16_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg15_LSB 0x2F +#define QIB_7322_IntStatus_RcvUrg15_MSB 0x2F +#define QIB_7322_IntStatus_RcvUrg15_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg14_LSB 0x2E +#define QIB_7322_IntStatus_RcvUrg14_MSB 0x2E +#define QIB_7322_IntStatus_RcvUrg14_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg13_LSB 0x2D +#define QIB_7322_IntStatus_RcvUrg13_MSB 0x2D +#define QIB_7322_IntStatus_RcvUrg13_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg12_LSB 0x2C +#define QIB_7322_IntStatus_RcvUrg12_MSB 0x2C +#define QIB_7322_IntStatus_RcvUrg12_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg11_LSB 0x2B +#define QIB_7322_IntStatus_RcvUrg11_MSB 0x2B +#define QIB_7322_IntStatus_RcvUrg11_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg10_LSB 0x2A +#define QIB_7322_IntStatus_RcvUrg10_MSB 0x2A +#define QIB_7322_IntStatus_RcvUrg10_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg9_LSB 0x29 +#define QIB_7322_IntStatus_RcvUrg9_MSB 0x29 +#define QIB_7322_IntStatus_RcvUrg9_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg8_LSB 0x28 +#define QIB_7322_IntStatus_RcvUrg8_MSB 0x28 +#define QIB_7322_IntStatus_RcvUrg8_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg7_LSB 0x27 +#define QIB_7322_IntStatus_RcvUrg7_MSB 0x27 +#define QIB_7322_IntStatus_RcvUrg7_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg6_LSB 0x26 +#define QIB_7322_IntStatus_RcvUrg6_MSB 0x26 +#define 
QIB_7322_IntStatus_RcvUrg6_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg5_LSB 0x25 +#define QIB_7322_IntStatus_RcvUrg5_MSB 0x25 +#define QIB_7322_IntStatus_RcvUrg5_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg4_LSB 0x24 +#define QIB_7322_IntStatus_RcvUrg4_MSB 0x24 +#define QIB_7322_IntStatus_RcvUrg4_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg3_LSB 0x23 +#define QIB_7322_IntStatus_RcvUrg3_MSB 0x23 +#define QIB_7322_IntStatus_RcvUrg3_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg2_LSB 0x22 +#define QIB_7322_IntStatus_RcvUrg2_MSB 0x22 +#define QIB_7322_IntStatus_RcvUrg2_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg1_LSB 0x21 +#define QIB_7322_IntStatus_RcvUrg1_MSB 0x21 +#define QIB_7322_IntStatus_RcvUrg1_RMASK 0x1 +#define QIB_7322_IntStatus_RcvUrg0_LSB 0x20 +#define QIB_7322_IntStatus_RcvUrg0_MSB 0x20 +#define QIB_7322_IntStatus_RcvUrg0_RMASK 0x1 +#define QIB_7322_IntStatus_Err_1_LSB 0x1F +#define QIB_7322_IntStatus_Err_1_MSB 0x1F +#define QIB_7322_IntStatus_Err_1_RMASK 0x1 +#define QIB_7322_IntStatus_Err_0_LSB 0x1E +#define QIB_7322_IntStatus_Err_0_MSB 0x1E +#define QIB_7322_IntStatus_Err_0_RMASK 0x1 +#define QIB_7322_IntStatus_Err_LSB 0x1D +#define QIB_7322_IntStatus_Err_MSB 0x1D +#define QIB_7322_IntStatus_Err_RMASK 0x1 +#define QIB_7322_IntStatus_AssertGPIO_LSB 0x1C +#define QIB_7322_IntStatus_AssertGPIO_MSB 0x1C +#define QIB_7322_IntStatus_AssertGPIO_RMASK 0x1 +#define QIB_7322_IntStatus_SendDone_1_LSB 0x19 +#define QIB_7322_IntStatus_SendDone_1_MSB 0x19 +#define QIB_7322_IntStatus_SendDone_1_RMASK 0x1 +#define QIB_7322_IntStatus_SendDone_0_LSB 0x18 +#define QIB_7322_IntStatus_SendDone_0_MSB 0x18 +#define QIB_7322_IntStatus_SendDone_0_RMASK 0x1 +#define QIB_7322_IntStatus_SendBufAvail_LSB 0x17 +#define QIB_7322_IntStatus_SendBufAvail_MSB 0x17 +#define QIB_7322_IntStatus_SendBufAvail_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail17_LSB 0x11 +#define QIB_7322_IntStatus_RcvAvail17_MSB 0x11 +#define QIB_7322_IntStatus_RcvAvail17_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail16_LSB 0x10 +#define QIB_7322_IntStatus_RcvAvail16_MSB 0x10 +#define QIB_7322_IntStatus_RcvAvail16_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail15_LSB 0xF +#define QIB_7322_IntStatus_RcvAvail15_MSB 0xF +#define QIB_7322_IntStatus_RcvAvail15_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail14_LSB 0xE +#define QIB_7322_IntStatus_RcvAvail14_MSB 0xE +#define QIB_7322_IntStatus_RcvAvail14_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail13_LSB 0xD +#define QIB_7322_IntStatus_RcvAvail13_MSB 0xD +#define QIB_7322_IntStatus_RcvAvail13_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail12_LSB 0xC +#define QIB_7322_IntStatus_RcvAvail12_MSB 0xC +#define QIB_7322_IntStatus_RcvAvail12_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail11_LSB 0xB +#define QIB_7322_IntStatus_RcvAvail11_MSB 0xB +#define QIB_7322_IntStatus_RcvAvail11_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail10_LSB 0xA +#define QIB_7322_IntStatus_RcvAvail10_MSB 0xA +#define QIB_7322_IntStatus_RcvAvail10_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail9_LSB 0x9 +#define QIB_7322_IntStatus_RcvAvail9_MSB 0x9 +#define QIB_7322_IntStatus_RcvAvail9_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail8_LSB 0x8 +#define QIB_7322_IntStatus_RcvAvail8_MSB 0x8 +#define QIB_7322_IntStatus_RcvAvail8_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail7_LSB 0x7 +#define QIB_7322_IntStatus_RcvAvail7_MSB 0x7 +#define QIB_7322_IntStatus_RcvAvail7_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail6_LSB 0x6 +#define QIB_7322_IntStatus_RcvAvail6_MSB 0x6 +#define QIB_7322_IntStatus_RcvAvail6_RMASK 0x1 +#define 
QIB_7322_IntStatus_RcvAvail5_LSB 0x5 +#define QIB_7322_IntStatus_RcvAvail5_MSB 0x5 +#define QIB_7322_IntStatus_RcvAvail5_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail4_LSB 0x4 +#define QIB_7322_IntStatus_RcvAvail4_MSB 0x4 +#define QIB_7322_IntStatus_RcvAvail4_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail3_LSB 0x3 +#define QIB_7322_IntStatus_RcvAvail3_MSB 0x3 +#define QIB_7322_IntStatus_RcvAvail3_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail2_LSB 0x2 +#define QIB_7322_IntStatus_RcvAvail2_MSB 0x2 +#define QIB_7322_IntStatus_RcvAvail2_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail1_LSB 0x1 +#define QIB_7322_IntStatus_RcvAvail1_MSB 0x1 +#define QIB_7322_IntStatus_RcvAvail1_RMASK 0x1 +#define QIB_7322_IntStatus_RcvAvail0_LSB 0x0 +#define QIB_7322_IntStatus_RcvAvail0_MSB 0x0 +#define QIB_7322_IntStatus_RcvAvail0_RMASK 0x1 + +#define QIB_7322_IntClear_OFFS 0x78 +#define QIB_7322_IntClear_DEF 0x0000000000000000 +#define QIB_7322_IntClear_SDmaIntClear_1_LSB 0x3F +#define QIB_7322_IntClear_SDmaIntClear_1_MSB 0x3F +#define QIB_7322_IntClear_SDmaIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaIntClear_0_LSB 0x3E +#define QIB_7322_IntClear_SDmaIntClear_0_MSB 0x3E +#define QIB_7322_IntClear_SDmaIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SDmaProgressIntClear_1_LSB 0x3D +#define QIB_7322_IntClear_SDmaProgressIntClear_1_MSB 0x3D +#define QIB_7322_IntClear_SDmaProgressIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaProgressIntClear_0_LSB 0x3C +#define QIB_7322_IntClear_SDmaProgressIntClear_0_MSB 0x3C +#define QIB_7322_IntClear_SDmaProgressIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SDmaIdleIntClear_1_LSB 0x3B +#define QIB_7322_IntClear_SDmaIdleIntClear_1_MSB 0x3B +#define QIB_7322_IntClear_SDmaIdleIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaIdleIntClear_0_LSB 0x3A +#define QIB_7322_IntClear_SDmaIdleIntClear_0_MSB 0x3A +#define QIB_7322_IntClear_SDmaIdleIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_LSB 0x39 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_MSB 0x39 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_LSB 0x38 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_MSB 0x38 +#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg17IntClear_LSB 0x31 +#define QIB_7322_IntClear_RcvUrg17IntClear_MSB 0x31 +#define QIB_7322_IntClear_RcvUrg17IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg16IntClear_LSB 0x30 +#define QIB_7322_IntClear_RcvUrg16IntClear_MSB 0x30 +#define QIB_7322_IntClear_RcvUrg16IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg15IntClear_LSB 0x2F +#define QIB_7322_IntClear_RcvUrg15IntClear_MSB 0x2F +#define QIB_7322_IntClear_RcvUrg15IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg14IntClear_LSB 0x2E +#define QIB_7322_IntClear_RcvUrg14IntClear_MSB 0x2E +#define QIB_7322_IntClear_RcvUrg14IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg13IntClear_LSB 0x2D +#define QIB_7322_IntClear_RcvUrg13IntClear_MSB 0x2D +#define QIB_7322_IntClear_RcvUrg13IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg12IntClear_LSB 0x2C +#define QIB_7322_IntClear_RcvUrg12IntClear_MSB 0x2C +#define QIB_7322_IntClear_RcvUrg12IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg11IntClear_LSB 0x2B +#define QIB_7322_IntClear_RcvUrg11IntClear_MSB 0x2B +#define QIB_7322_IntClear_RcvUrg11IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg10IntClear_LSB 0x2A +#define QIB_7322_IntClear_RcvUrg10IntClear_MSB 0x2A +#define QIB_7322_IntClear_RcvUrg10IntClear_RMASK 0x1 
+#define QIB_7322_IntClear_RcvUrg9IntClear_LSB 0x29 +#define QIB_7322_IntClear_RcvUrg9IntClear_MSB 0x29 +#define QIB_7322_IntClear_RcvUrg9IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg8IntClear_LSB 0x28 +#define QIB_7322_IntClear_RcvUrg8IntClear_MSB 0x28 +#define QIB_7322_IntClear_RcvUrg8IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg7IntClear_LSB 0x27 +#define QIB_7322_IntClear_RcvUrg7IntClear_MSB 0x27 +#define QIB_7322_IntClear_RcvUrg7IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg6IntClear_LSB 0x26 +#define QIB_7322_IntClear_RcvUrg6IntClear_MSB 0x26 +#define QIB_7322_IntClear_RcvUrg6IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg5IntClear_LSB 0x25 +#define QIB_7322_IntClear_RcvUrg5IntClear_MSB 0x25 +#define QIB_7322_IntClear_RcvUrg5IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg4IntClear_LSB 0x24 +#define QIB_7322_IntClear_RcvUrg4IntClear_MSB 0x24 +#define QIB_7322_IntClear_RcvUrg4IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg3IntClear_LSB 0x23 +#define QIB_7322_IntClear_RcvUrg3IntClear_MSB 0x23 +#define QIB_7322_IntClear_RcvUrg3IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg2IntClear_LSB 0x22 +#define QIB_7322_IntClear_RcvUrg2IntClear_MSB 0x22 +#define QIB_7322_IntClear_RcvUrg2IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg1IntClear_LSB 0x21 +#define QIB_7322_IntClear_RcvUrg1IntClear_MSB 0x21 +#define QIB_7322_IntClear_RcvUrg1IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvUrg0IntClear_LSB 0x20 +#define QIB_7322_IntClear_RcvUrg0IntClear_MSB 0x20 +#define QIB_7322_IntClear_RcvUrg0IntClear_RMASK 0x1 +#define QIB_7322_IntClear_ErrIntClear_1_LSB 0x1F +#define QIB_7322_IntClear_ErrIntClear_1_MSB 0x1F +#define QIB_7322_IntClear_ErrIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_ErrIntClear_0_LSB 0x1E +#define QIB_7322_IntClear_ErrIntClear_0_MSB 0x1E +#define QIB_7322_IntClear_ErrIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_ErrIntClear_LSB 0x1D +#define QIB_7322_IntClear_ErrIntClear_MSB 0x1D +#define QIB_7322_IntClear_ErrIntClear_RMASK 0x1 +#define QIB_7322_IntClear_AssertGPIOIntClear_LSB 0x1C +#define QIB_7322_IntClear_AssertGPIOIntClear_MSB 0x1C +#define QIB_7322_IntClear_AssertGPIOIntClear_RMASK 0x1 +#define QIB_7322_IntClear_SendDoneIntClear_1_LSB 0x19 +#define QIB_7322_IntClear_SendDoneIntClear_1_MSB 0x19 +#define QIB_7322_IntClear_SendDoneIntClear_1_RMASK 0x1 +#define QIB_7322_IntClear_SendDoneIntClear_0_LSB 0x18 +#define QIB_7322_IntClear_SendDoneIntClear_0_MSB 0x18 +#define QIB_7322_IntClear_SendDoneIntClear_0_RMASK 0x1 +#define QIB_7322_IntClear_SendBufAvailIntClear_LSB 0x17 +#define QIB_7322_IntClear_SendBufAvailIntClear_MSB 0x17 +#define QIB_7322_IntClear_SendBufAvailIntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail17IntClear_LSB 0x11 +#define QIB_7322_IntClear_RcvAvail17IntClear_MSB 0x11 +#define QIB_7322_IntClear_RcvAvail17IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail16IntClear_LSB 0x10 +#define QIB_7322_IntClear_RcvAvail16IntClear_MSB 0x10 +#define QIB_7322_IntClear_RcvAvail16IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail15IntClear_LSB 0xF +#define QIB_7322_IntClear_RcvAvail15IntClear_MSB 0xF +#define QIB_7322_IntClear_RcvAvail15IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail14IntClear_LSB 0xE +#define QIB_7322_IntClear_RcvAvail14IntClear_MSB 0xE +#define QIB_7322_IntClear_RcvAvail14IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail13IntClear_LSB 0xD +#define QIB_7322_IntClear_RcvAvail13IntClear_MSB 0xD +#define QIB_7322_IntClear_RcvAvail13IntClear_RMASK 0x1 +#define 
QIB_7322_IntClear_RcvAvail12IntClear_LSB 0xC +#define QIB_7322_IntClear_RcvAvail12IntClear_MSB 0xC +#define QIB_7322_IntClear_RcvAvail12IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail11IntClear_LSB 0xB +#define QIB_7322_IntClear_RcvAvail11IntClear_MSB 0xB +#define QIB_7322_IntClear_RcvAvail11IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail10IntClear_LSB 0xA +#define QIB_7322_IntClear_RcvAvail10IntClear_MSB 0xA +#define QIB_7322_IntClear_RcvAvail10IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail9IntClear_LSB 0x9 +#define QIB_7322_IntClear_RcvAvail9IntClear_MSB 0x9 +#define QIB_7322_IntClear_RcvAvail9IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail8IntClear_LSB 0x8 +#define QIB_7322_IntClear_RcvAvail8IntClear_MSB 0x8 +#define QIB_7322_IntClear_RcvAvail8IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail7IntClear_LSB 0x7 +#define QIB_7322_IntClear_RcvAvail7IntClear_MSB 0x7 +#define QIB_7322_IntClear_RcvAvail7IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail6IntClear_LSB 0x6 +#define QIB_7322_IntClear_RcvAvail6IntClear_MSB 0x6 +#define QIB_7322_IntClear_RcvAvail6IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail5IntClear_LSB 0x5 +#define QIB_7322_IntClear_RcvAvail5IntClear_MSB 0x5 +#define QIB_7322_IntClear_RcvAvail5IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail4IntClear_LSB 0x4 +#define QIB_7322_IntClear_RcvAvail4IntClear_MSB 0x4 +#define QIB_7322_IntClear_RcvAvail4IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail3IntClear_LSB 0x3 +#define QIB_7322_IntClear_RcvAvail3IntClear_MSB 0x3 +#define QIB_7322_IntClear_RcvAvail3IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail2IntClear_LSB 0x2 +#define QIB_7322_IntClear_RcvAvail2IntClear_MSB 0x2 +#define QIB_7322_IntClear_RcvAvail2IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail1IntClear_LSB 0x1 +#define QIB_7322_IntClear_RcvAvail1IntClear_MSB 0x1 +#define QIB_7322_IntClear_RcvAvail1IntClear_RMASK 0x1 +#define QIB_7322_IntClear_RcvAvail0IntClear_LSB 0x0 +#define QIB_7322_IntClear_RcvAvail0IntClear_MSB 0x0 +#define QIB_7322_IntClear_RcvAvail0IntClear_RMASK 0x1 + +#define QIB_7322_ErrMask_OFFS 0x80 +#define QIB_7322_ErrMask_DEF 0x0000000000000000 +#define QIB_7322_ErrMask_ResetNegatedMask_LSB 0x3F +#define QIB_7322_ErrMask_ResetNegatedMask_MSB 0x3F +#define QIB_7322_ErrMask_ResetNegatedMask_RMASK 0x1 +#define QIB_7322_ErrMask_HardwareErrMask_LSB 0x3E +#define QIB_7322_ErrMask_HardwareErrMask_MSB 0x3E +#define QIB_7322_ErrMask_HardwareErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_InvalidAddrErrMask_LSB 0x3D +#define QIB_7322_ErrMask_InvalidAddrErrMask_MSB 0x3D +#define QIB_7322_ErrMask_InvalidAddrErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SDmaVL15ErrMask_LSB 0x38 +#define QIB_7322_ErrMask_SDmaVL15ErrMask_MSB 0x38 +#define QIB_7322_ErrMask_SDmaVL15ErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_LSB 0x37 +#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_MSB 0x37 +#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_InvalidEEPCmdMask_LSB 0x35 +#define QIB_7322_ErrMask_InvalidEEPCmdMask_MSB 0x35 +#define QIB_7322_ErrMask_InvalidEEPCmdMask_RMASK 0x1 +#define QIB_7322_ErrMask_RcvContextShareErrMask_LSB 0x34 +#define QIB_7322_ErrMask_RcvContextShareErrMask_MSB 0x34 +#define QIB_7322_ErrMask_RcvContextShareErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SendVLMismatchErrMask_LSB 0x24 +#define QIB_7322_ErrMask_SendVLMismatchErrMask_MSB 0x24 +#define QIB_7322_ErrMask_SendVLMismatchErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SendArmLaunchErrMask_LSB 0x23 +#define 
QIB_7322_ErrMask_SendArmLaunchErrMask_MSB 0x23 +#define QIB_7322_ErrMask_SendArmLaunchErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_LSB 0x1B +#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_MSB 0x1B +#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SDmaWrongPortErrMask_LSB 0x1A +#define QIB_7322_ErrMask_SDmaWrongPortErrMask_MSB 0x1A +#define QIB_7322_ErrMask_SDmaWrongPortErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_LSB 0x19 +#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_MSB 0x19 +#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_RcvHdrFullErrMask_LSB 0xD +#define QIB_7322_ErrMask_RcvHdrFullErrMask_MSB 0xD +#define QIB_7322_ErrMask_RcvHdrFullErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_RcvEgrFullErrMask_LSB 0xC +#define QIB_7322_ErrMask_RcvEgrFullErrMask_MSB 0xC +#define QIB_7322_ErrMask_RcvEgrFullErrMask_RMASK 0x1 + +#define QIB_7322_ErrStatus_OFFS 0x88 +#define QIB_7322_ErrStatus_DEF 0x0000000000000000 +#define QIB_7322_ErrStatus_ResetNegated_LSB 0x3F +#define QIB_7322_ErrStatus_ResetNegated_MSB 0x3F +#define QIB_7322_ErrStatus_ResetNegated_RMASK 0x1 +#define QIB_7322_ErrStatus_HardwareErr_LSB 0x3E +#define QIB_7322_ErrStatus_HardwareErr_MSB 0x3E +#define QIB_7322_ErrStatus_HardwareErr_RMASK 0x1 +#define QIB_7322_ErrStatus_InvalidAddrErr_LSB 0x3D +#define QIB_7322_ErrStatus_InvalidAddrErr_MSB 0x3D +#define QIB_7322_ErrStatus_InvalidAddrErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SDmaVL15Err_LSB 0x38 +#define QIB_7322_ErrStatus_SDmaVL15Err_MSB 0x38 +#define QIB_7322_ErrStatus_SDmaVL15Err_RMASK 0x1 +#define QIB_7322_ErrStatus_SBufVL15MisUseErr_LSB 0x37 +#define QIB_7322_ErrStatus_SBufVL15MisUseErr_MSB 0x37 +#define QIB_7322_ErrStatus_SBufVL15MisUseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_InvalidEEPCmdErr_LSB 0x35 +#define QIB_7322_ErrStatus_InvalidEEPCmdErr_MSB 0x35 +#define QIB_7322_ErrStatus_InvalidEEPCmdErr_RMASK 0x1 +#define QIB_7322_ErrStatus_RcvContextShareErr_LSB 0x34 +#define QIB_7322_ErrStatus_RcvContextShareErr_MSB 0x34 +#define QIB_7322_ErrStatus_RcvContextShareErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SendVLMismatchErr_LSB 0x24 +#define QIB_7322_ErrStatus_SendVLMismatchErr_MSB 0x24 +#define QIB_7322_ErrStatus_SendVLMismatchErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SendArmLaunchErr_LSB 0x23 +#define QIB_7322_ErrStatus_SendArmLaunchErr_MSB 0x23 +#define QIB_7322_ErrStatus_SendArmLaunchErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SendSpecialTriggerErr_LSB 0x1B +#define QIB_7322_ErrStatus_SendSpecialTriggerErr_MSB 0x1B +#define QIB_7322_ErrStatus_SendSpecialTriggerErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SDmaWrongPortErr_LSB 0x1A +#define QIB_7322_ErrStatus_SDmaWrongPortErr_MSB 0x1A +#define QIB_7322_ErrStatus_SDmaWrongPortErr_RMASK 0x1 +#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_LSB 0x19 +#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_MSB 0x19 +#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_RMASK 0x1 +#define QIB_7322_ErrStatus_RcvHdrFullErr_LSB 0xD +#define QIB_7322_ErrStatus_RcvHdrFullErr_MSB 0xD +#define QIB_7322_ErrStatus_RcvHdrFullErr_RMASK 0x1 +#define QIB_7322_ErrStatus_RcvEgrFullErr_LSB 0xC +#define QIB_7322_ErrStatus_RcvEgrFullErr_MSB 0xC +#define QIB_7322_ErrStatus_RcvEgrFullErr_RMASK 0x1 + +#define QIB_7322_ErrClear_OFFS 0x90 +#define QIB_7322_ErrClear_DEF 0x0000000000000000 +#define QIB_7322_ErrClear_ResetNegatedClear_LSB 0x3F +#define QIB_7322_ErrClear_ResetNegatedClear_MSB 0x3F +#define 
QIB_7322_ErrClear_ResetNegatedClear_RMASK 0x1 +#define QIB_7322_ErrClear_HardwareErrClear_LSB 0x3E +#define QIB_7322_ErrClear_HardwareErrClear_MSB 0x3E +#define QIB_7322_ErrClear_HardwareErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_InvalidAddrErrClear_LSB 0x3D +#define QIB_7322_ErrClear_InvalidAddrErrClear_MSB 0x3D +#define QIB_7322_ErrClear_InvalidAddrErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SDmaVL15ErrClear_LSB 0x38 +#define QIB_7322_ErrClear_SDmaVL15ErrClear_MSB 0x38 +#define QIB_7322_ErrClear_SDmaVL15ErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_LSB 0x37 +#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_MSB 0x37 +#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_LSB 0x35 +#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_MSB 0x35 +#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_RcvContextShareErrClear_LSB 0x34 +#define QIB_7322_ErrClear_RcvContextShareErrClear_MSB 0x34 +#define QIB_7322_ErrClear_RcvContextShareErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SendVLMismatchErrMask_LSB 0x24 +#define QIB_7322_ErrClear_SendVLMismatchErrMask_MSB 0x24 +#define QIB_7322_ErrClear_SendVLMismatchErrMask_RMASK 0x1 +#define QIB_7322_ErrClear_SendArmLaunchErrClear_LSB 0x23 +#define QIB_7322_ErrClear_SendArmLaunchErrClear_MSB 0x23 +#define QIB_7322_ErrClear_SendArmLaunchErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_LSB 0x1B +#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_MSB 0x1B +#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SDmaWrongPortErrClear_LSB 0x1A +#define QIB_7322_ErrClear_SDmaWrongPortErrClear_MSB 0x1A +#define QIB_7322_ErrClear_SDmaWrongPortErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_LSB 0x19 +#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_MSB 0x19 +#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_RcvHdrFullErrClear_LSB 0xD +#define QIB_7322_ErrClear_RcvHdrFullErrClear_MSB 0xD +#define QIB_7322_ErrClear_RcvHdrFullErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_RcvEgrFullErrClear_LSB 0xC +#define QIB_7322_ErrClear_RcvEgrFullErrClear_MSB 0xC +#define QIB_7322_ErrClear_RcvEgrFullErrClear_RMASK 0x1 + +#define QIB_7322_HwErrMask_OFFS 0x98 +#define QIB_7322_HwErrMask_DEF 0x0000000000000000 +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_LSB 0x3F +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_MSB 0x3F +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_LSB 0x3E +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_MSB 0x3E +#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_RMASK 0x1 +#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_LSB 0x37 +#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_MSB 0x37 +#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_RMASK 0x1 +#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_LSB 0x36 +#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_MSB 0x36 +#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1 +#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_LSB 0x35 +#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_MSB 0x35 +#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_RMASK 0x1 +#define QIB_7322_HwErrMask_MemoryErrMask_LSB 0x30 +#define QIB_7322_HwErrMask_MemoryErrMask_MSB 0x30 +#define QIB_7322_HwErrMask_MemoryErrMask_RMASK 0x1 +#define 
QIB_7322_HwErrMask_pcie_phy_txParityErr_LSB 0x22 +#define QIB_7322_HwErrMask_pcie_phy_txParityErr_MSB 0x22 +#define QIB_7322_HwErrMask_pcie_phy_txParityErr_RMASK 0x1 +#define QIB_7322_HwErrMask_PCIeBusParityErrMask_LSB 0x1F +#define QIB_7322_HwErrMask_PCIeBusParityErrMask_MSB 0x21 +#define QIB_7322_HwErrMask_PCIeBusParityErrMask_RMASK 0x7 +#define QIB_7322_HwErrMask_PcieCplTimeoutMask_LSB 0x1E +#define QIB_7322_HwErrMask_PcieCplTimeoutMask_MSB 0x1E +#define QIB_7322_HwErrMask_PcieCplTimeoutMask_RMASK 0x1 +#define QIB_7322_HwErrMask_PciePoisonedTLPMask_LSB 0x1D +#define QIB_7322_HwErrMask_PciePoisonedTLPMask_MSB 0x1D +#define QIB_7322_HwErrMask_PciePoisonedTLPMask_RMASK 0x1 +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_LSB 0x1C +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_MSB 0x1C +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_LSB 0x1B +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_MSB 0x1B +#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_RMASK 0x1 +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_LSB 0xF +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_MSB 0xF +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_LSB 0xE +#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_MSB 0xE +#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_RMASK 0x1 +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_LSB 0xD +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_MSB 0xD +#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_RMASK 0x1 +#define QIB_7322_HwErrMask_statusValidNoEopMask_LSB 0xC +#define QIB_7322_HwErrMask_statusValidNoEopMask_MSB 0xC +#define QIB_7322_HwErrMask_statusValidNoEopMask_RMASK 0x1 +#define QIB_7322_HwErrMask_LATriggeredMask_LSB 0xB +#define QIB_7322_HwErrMask_LATriggeredMask_MSB 0xB +#define QIB_7322_HwErrMask_LATriggeredMask_RMASK 0x1 + +#define QIB_7322_HwErrStatus_OFFS 0xA0 +#define QIB_7322_HwErrStatus_DEF 0x0000000000000000 +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_LSB 0x3F +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_MSB 0x3F +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_RMASK 0x1 +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_LSB 0x3E +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_MSB 0x3E +#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_RMASK 0x1 +#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_LSB 0x37 +#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_MSB 0x37 +#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_RMASK 0x1 +#define QIB_7322_HwErrStatus_PowerOnBISTFailed_LSB 0x36 +#define QIB_7322_HwErrStatus_PowerOnBISTFailed_MSB 0x36 +#define QIB_7322_HwErrStatus_PowerOnBISTFailed_RMASK 0x1 +#define QIB_7322_HwErrStatus_TempsenseTholdReached_LSB 0x35 +#define QIB_7322_HwErrStatus_TempsenseTholdReached_MSB 0x35 +#define QIB_7322_HwErrStatus_TempsenseTholdReached_RMASK 0x1 +#define QIB_7322_HwErrStatus_MemoryErr_LSB 0x30 +#define QIB_7322_HwErrStatus_MemoryErr_MSB 0x30 +#define QIB_7322_HwErrStatus_MemoryErr_RMASK 0x1 +#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_LSB 0x22 +#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_MSB 0x22 +#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_RMASK 0x1 +#define QIB_7322_HwErrStatus_PCIeBusParity_LSB 0x1F +#define QIB_7322_HwErrStatus_PCIeBusParity_MSB 0x21 +#define QIB_7322_HwErrStatus_PCIeBusParity_RMASK 0x7 +#define QIB_7322_HwErrStatus_PcieCplTimeout_LSB 0x1E +#define 
QIB_7322_HwErrStatus_PcieCplTimeout_MSB 0x1E
+#define QIB_7322_HwErrStatus_PcieCplTimeout_RMASK 0x1
+#define QIB_7322_HwErrStatus_PciePoisonedTLP_LSB 0x1D
+#define QIB_7322_HwErrStatus_PciePoisonedTLP_MSB 0x1D
+#define QIB_7322_HwErrStatus_PciePoisonedTLP_RMASK 0x1
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_LSB 0x1C
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_MSB 0x1C
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_LSB 0x1B
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_MSB 0x1B
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_LSB 0xF
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_MSB 0xF
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_LSB 0xE
+#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_MSB 0xE
+#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_LSB 0xD
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_MSB 0xD
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_RMASK 0x1
+#define QIB_7322_HwErrStatus_statusValidNoEop_LSB 0xC
+#define QIB_7322_HwErrStatus_statusValidNoEop_MSB 0xC
+#define QIB_7322_HwErrStatus_statusValidNoEop_RMASK 0x1
+#define QIB_7322_HwErrStatus_LATriggered_LSB 0xB
+#define QIB_7322_HwErrStatus_LATriggered_MSB 0xB
+#define QIB_7322_HwErrStatus_LATriggered_RMASK 0x1
+
+#define QIB_7322_HwErrClear_OFFS 0xA8
+#define QIB_7322_HwErrClear_DEF 0x0000000000000000
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_LSB 0x3F
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_MSB 0x3F
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_RMASK 0x1
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_LSB 0x3E
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_MSB 0x3E
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_RMASK 0x1
+#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_LSB 0x37
+#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_MSB 0x37
+#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_RMASK 0x1
+#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_LSB 0x36
+#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_MSB 0x36
+#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1
+#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_LSB 0x35
+#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_MSB 0x35
+#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_RMASK 0x1
+#define QIB_7322_HwErrClear_MemoryErrClear_LSB 0x30
+#define QIB_7322_HwErrClear_MemoryErrClear_MSB 0x30
+#define QIB_7322_HwErrClear_MemoryErrClear_RMASK 0x1
+#define QIB_7322_HwErrClear_pcie_phy_txParityErr_LSB 0x22
+#define QIB_7322_HwErrClear_pcie_phy_txParityErr_MSB 0x22
+#define QIB_7322_HwErrClear_pcie_phy_txParityErr_RMASK 0x1
+#define QIB_7322_HwErrClear_PCIeBusParityClear_LSB 0x1F
+#define QIB_7322_HwErrClear_PCIeBusParityClear_MSB 0x21
+#define QIB_7322_HwErrClear_PCIeBusParityClear_RMASK 0x7
+#define QIB_7322_HwErrClear_PcieCplTimeoutClear_LSB 0x1E
+#define QIB_7322_HwErrClear_PcieCplTimeoutClear_MSB 0x1E
+#define QIB_7322_HwErrClear_PcieCplTimeoutClear_RMASK 0x1
+#define QIB_7322_HwErrClear_PciePoisonedTLPClear_LSB 0x1D
+#define QIB_7322_HwErrClear_PciePoisonedTLPClear_MSB 0x1D
+#define QIB_7322_HwErrClear_PciePoisonedTLPClear_RMASK 0x1
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_LSB 0x1C
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_MSB 0x1C
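Every field in these registers is described by an _LSB/_MSB/_RMASK triple (lowest and highest bit position plus a right-justified mask), and the error and hardware-error registers come in matched Mask/Status/Clear sets. A minimal sketch of how such triples are typically consumed, assuming this header is included; QIB_FIELD_MASK and qib_ack_hwerrs are illustrative names, not part of the patch:

#include <stdint.h>

/* Build a shifted field mask from a field's _LSB/_RMASK pair. */
#define QIB_FIELD_MASK(reg, fld) \
	((uint64_t)QIB_7322_##reg##_##fld##_RMASK << QIB_7322_##reg##_##fld##_LSB)

static uint64_t qib_ack_hwerrs(uint64_t hwerrstatus)
{
	uint64_t clear = 0;

	/* In a driver the argument would come from an MMIO read of the
	 * register at QIB_7322_HwErrStatus_OFFS; a plain value keeps the
	 * sketch self-contained. */
	if (hwerrstatus & QIB_FIELD_MASK(HwErrStatus, PowerOnBISTFailed))
		clear |= QIB_FIELD_MASK(HwErrClear, PowerOnBISTFailedClear);
	if (hwerrstatus & QIB_FIELD_MASK(HwErrStatus, PcieCplTimeout))
		clear |= QIB_FIELD_MASK(HwErrClear, PcieCplTimeoutClear);

	/* Writing this value at QIB_7322_HwErrClear_OFFS would acknowledge
	 * exactly the events observed above. */
	return clear;
}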
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_LSB 0x1B +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_MSB 0x1B +#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_RMASK 0x1 +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_LSB 0xF +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_MSB 0xF +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_LSB 0xE +#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_MSB 0xE +#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_RMASK 0x1 +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_LSB 0xD +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_MSB 0xD +#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_RMASK 0x1 +#define QIB_7322_HwErrClear_statusValidNoEopClear_LSB 0xC +#define QIB_7322_HwErrClear_statusValidNoEopClear_MSB 0xC +#define QIB_7322_HwErrClear_statusValidNoEopClear_RMASK 0x1 +#define QIB_7322_HwErrClear_LATriggeredClear_LSB 0xB +#define QIB_7322_HwErrClear_LATriggeredClear_MSB 0xB +#define QIB_7322_HwErrClear_LATriggeredClear_RMASK 0x1 + +#define QIB_7322_HwDiagCtrl_OFFS 0xB0 +#define QIB_7322_HwDiagCtrl_DEF 0x0000000000000000 +#define QIB_7322_HwDiagCtrl_Diagnostic_LSB 0x3F +#define QIB_7322_HwDiagCtrl_Diagnostic_MSB 0x3F +#define QIB_7322_HwDiagCtrl_Diagnostic_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_CounterWrEnable_LSB 0x3D +#define QIB_7322_HwDiagCtrl_CounterWrEnable_MSB 0x3D +#define QIB_7322_HwDiagCtrl_CounterWrEnable_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_CounterDisable_LSB 0x3C +#define QIB_7322_HwDiagCtrl_CounterDisable_MSB 0x3C +#define QIB_7322_HwDiagCtrl_CounterDisable_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F +#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_MSB 0x22 +#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_LSB 0xF +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_MSB 0xF +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_LSB 0xE +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_MSB 0xE +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_LSB 0xD +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_MSB 0xD +#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_RMASK 0x1 +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_LSB 0xC +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_MSB 0xC +#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_RMASK 0x1 + +#define QIB_7322_EXTStatus_OFFS 0xC0 +#define QIB_7322_EXTStatus_DEF 0x000000000000X000 +#define QIB_7322_EXTStatus_GPIOIn_LSB 0x30 +#define QIB_7322_EXTStatus_GPIOIn_MSB 0x3F +#define QIB_7322_EXTStatus_GPIOIn_RMASK 0xFFFF +#define QIB_7322_EXTStatus_MemBISTDisabled_LSB 0xF +#define QIB_7322_EXTStatus_MemBISTDisabled_MSB 0xF +#define QIB_7322_EXTStatus_MemBISTDisabled_RMASK 0x1 +#define QIB_7322_EXTStatus_MemBISTEndTest_LSB 0xE +#define QIB_7322_EXTStatus_MemBISTEndTest_MSB 0xE +#define QIB_7322_EXTStatus_MemBISTEndTest_RMASK 0x1 + +#define QIB_7322_EXTCtrl_OFFS 0xC8 +#define QIB_7322_EXTCtrl_DEF 0x0000000000000000 +#define QIB_7322_EXTCtrl_GPIOOe_LSB 0x30 +#define QIB_7322_EXTCtrl_GPIOOe_MSB 0x3F +#define QIB_7322_EXTCtrl_GPIOOe_RMASK 0xFFFF +#define QIB_7322_EXTCtrl_GPIOInvert_LSB 0x20 +#define 
QIB_7322_EXTCtrl_GPIOInvert_MSB 0x2F +#define QIB_7322_EXTCtrl_GPIOInvert_RMASK 0xFFFF +#define QIB_7322_EXTCtrl_LEDPort1GreenOn_LSB 0x3 +#define QIB_7322_EXTCtrl_LEDPort1GreenOn_MSB 0x3 +#define QIB_7322_EXTCtrl_LEDPort1GreenOn_RMASK 0x1 +#define QIB_7322_EXTCtrl_LEDPort1YellowOn_LSB 0x2 +#define QIB_7322_EXTCtrl_LEDPort1YellowOn_MSB 0x2 +#define QIB_7322_EXTCtrl_LEDPort1YellowOn_RMASK 0x1 +#define QIB_7322_EXTCtrl_LEDPort0GreenOn_LSB 0x1 +#define QIB_7322_EXTCtrl_LEDPort0GreenOn_MSB 0x1 +#define QIB_7322_EXTCtrl_LEDPort0GreenOn_RMASK 0x1 +#define QIB_7322_EXTCtrl_LEDPort0YellowOn_LSB 0x0 +#define QIB_7322_EXTCtrl_LEDPort0YellowOn_MSB 0x0 +#define QIB_7322_EXTCtrl_LEDPort0YellowOn_RMASK 0x1 + +#define QIB_7322_GPIOOut_OFFS 0xE0 +#define QIB_7322_GPIOOut_DEF 0x0000000000000000 + +#define QIB_7322_GPIOMask_OFFS 0xE8 +#define QIB_7322_GPIOMask_DEF 0x0000000000000000 + +#define QIB_7322_GPIOStatus_OFFS 0xF0 +#define QIB_7322_GPIOStatus_DEF 0x0000000000000000 + +#define QIB_7322_GPIOClear_OFFS 0xF8 +#define QIB_7322_GPIOClear_DEF 0x0000000000000000 + +#define QIB_7322_RcvCtrl_OFFS 0x100 +#define QIB_7322_RcvCtrl_DEF 0x0000000000000000 +#define QIB_7322_RcvCtrl_TidReDirect_LSB 0x30 +#define QIB_7322_RcvCtrl_TidReDirect_MSB 0x3F +#define QIB_7322_RcvCtrl_TidReDirect_RMASK 0xFFFF +#define QIB_7322_RcvCtrl_TailUpd_LSB 0x2F +#define QIB_7322_RcvCtrl_TailUpd_MSB 0x2F +#define QIB_7322_RcvCtrl_TailUpd_RMASK 0x1 +#define QIB_7322_RcvCtrl_XrcTypeCode_LSB 0x2C +#define QIB_7322_RcvCtrl_XrcTypeCode_MSB 0x2E +#define QIB_7322_RcvCtrl_XrcTypeCode_RMASK 0x7 +#define QIB_7322_RcvCtrl_TidFlowEnable_LSB 0x2B +#define QIB_7322_RcvCtrl_TidFlowEnable_MSB 0x2B +#define QIB_7322_RcvCtrl_TidFlowEnable_RMASK 0x1 +#define QIB_7322_RcvCtrl_ContextCfg_LSB 0x29 +#define QIB_7322_RcvCtrl_ContextCfg_MSB 0x2A +#define QIB_7322_RcvCtrl_ContextCfg_RMASK 0x3 +#define QIB_7322_RcvCtrl_IntrAvail_LSB 0x14 +#define QIB_7322_RcvCtrl_IntrAvail_MSB 0x25 +#define QIB_7322_RcvCtrl_IntrAvail_RMASK 0x3FFFF +#define QIB_7322_RcvCtrl_dontDropRHQFull_LSB 0x0 +#define QIB_7322_RcvCtrl_dontDropRHQFull_MSB 0x11 +#define QIB_7322_RcvCtrl_dontDropRHQFull_RMASK 0x3FFFF + +#define QIB_7322_RcvHdrSize_OFFS 0x110 +#define QIB_7322_RcvHdrSize_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrCnt_OFFS 0x118 +#define QIB_7322_RcvHdrCnt_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrEntSize_OFFS 0x120 +#define QIB_7322_RcvHdrEntSize_DEF 0x0000000000000000 + +#define QIB_7322_RcvTIDBase_OFFS 0x128 +#define QIB_7322_RcvTIDBase_DEF 0x0000000000050000 + +#define QIB_7322_RcvTIDCnt_OFFS 0x130 +#define QIB_7322_RcvTIDCnt_DEF 0x0000000000000200 + +#define QIB_7322_RcvEgrBase_OFFS 0x138 +#define QIB_7322_RcvEgrBase_DEF 0x0000000000014000 + +#define QIB_7322_RcvEgrCnt_OFFS 0x140 +#define QIB_7322_RcvEgrCnt_DEF 0x0000000000001000 + +#define QIB_7322_RcvBufBase_OFFS 0x148 +#define QIB_7322_RcvBufBase_DEF 0x0000000000080000 + +#define QIB_7322_RcvBufSize_OFFS 0x150 +#define QIB_7322_RcvBufSize_DEF 0x0000000000005000 + +#define QIB_7322_RxIntMemBase_OFFS 0x158 +#define QIB_7322_RxIntMemBase_DEF 0x0000000000077000 + +#define QIB_7322_RxIntMemSize_OFFS 0x160 +#define QIB_7322_RxIntMemSize_DEF 0x0000000000007000 + +#define QIB_7322_feature_mask_OFFS 0x190 +#define QIB_7322_feature_mask_DEF 0x00000000000000XX + +#define QIB_7322_active_feature_mask_OFFS 0x198 +#define QIB_7322_active_feature_mask_DEF 0x00000000000000XX +#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_LSB 0x5 +#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_MSB 0x5 +#define 
QIB_7322_active_feature_mask_Port1_QDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_LSB 0x4 +#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_MSB 0x4 +#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_LSB 0x3 +#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_MSB 0x3 +#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_LSB 0x2 +#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_MSB 0x2 +#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_LSB 0x1 +#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_MSB 0x1 +#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_RMASK 0x1 +#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_LSB 0x0 +#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_MSB 0x0 +#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_RMASK 0x1 + +#define QIB_7322_SendCtrl_OFFS 0x1C0 +#define QIB_7322_SendCtrl_DEF 0x0000000000000000 +#define QIB_7322_SendCtrl_Disarm_LSB 0x1F +#define QIB_7322_SendCtrl_Disarm_MSB 0x1F +#define QIB_7322_SendCtrl_Disarm_RMASK 0x1 +#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_LSB 0x1D +#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_MSB 0x1D +#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_RMASK 0x1 +#define QIB_7322_SendCtrl_AvailUpdThld_LSB 0x18 +#define QIB_7322_SendCtrl_AvailUpdThld_MSB 0x1C +#define QIB_7322_SendCtrl_AvailUpdThld_RMASK 0x1F +#define QIB_7322_SendCtrl_DisarmSendBuf_LSB 0x10 +#define QIB_7322_SendCtrl_DisarmSendBuf_MSB 0x17 +#define QIB_7322_SendCtrl_DisarmSendBuf_RMASK 0xFF +#define QIB_7322_SendCtrl_SpecialTriggerEn_LSB 0x4 +#define QIB_7322_SendCtrl_SpecialTriggerEn_MSB 0x4 +#define QIB_7322_SendCtrl_SpecialTriggerEn_RMASK 0x1 +#define QIB_7322_SendCtrl_SendBufAvailUpd_LSB 0x2 +#define QIB_7322_SendCtrl_SendBufAvailUpd_MSB 0x2 +#define QIB_7322_SendCtrl_SendBufAvailUpd_RMASK 0x1 +#define QIB_7322_SendCtrl_SendIntBufAvail_LSB 0x1 +#define QIB_7322_SendCtrl_SendIntBufAvail_MSB 0x1 +#define QIB_7322_SendCtrl_SendIntBufAvail_RMASK 0x1 + +#define QIB_7322_SendBufBase_OFFS 0x1C8 +#define QIB_7322_SendBufBase_DEF 0x0018000000100000 +#define QIB_7322_SendBufBase_BaseAddr_LargePIO_LSB 0x20 +#define QIB_7322_SendBufBase_BaseAddr_LargePIO_MSB 0x34 +#define QIB_7322_SendBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF +#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_LSB 0x0 +#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_MSB 0x14 +#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF + +#define QIB_7322_SendBufSize_OFFS 0x1D0 +#define QIB_7322_SendBufSize_DEF 0x0000108000000880 +#define QIB_7322_SendBufSize_Size_LargePIO_LSB 0x20 +#define QIB_7322_SendBufSize_Size_LargePIO_MSB 0x2C +#define QIB_7322_SendBufSize_Size_LargePIO_RMASK 0x1FFF +#define QIB_7322_SendBufSize_Size_SmallPIO_LSB 0x0 +#define QIB_7322_SendBufSize_Size_SmallPIO_MSB 0xB +#define QIB_7322_SendBufSize_Size_SmallPIO_RMASK 0xFFF + +#define QIB_7322_SendBufCnt_OFFS 0x1D8 +#define QIB_7322_SendBufCnt_DEF 0x0000002000000080 +#define QIB_7322_SendBufCnt_Num_LargeBuffers_LSB 0x20 +#define QIB_7322_SendBufCnt_Num_LargeBuffers_MSB 0x25 +#define QIB_7322_SendBufCnt_Num_LargeBuffers_RMASK 0x3F +#define QIB_7322_SendBufCnt_Num_SmallBuffers_LSB 0x0 +#define QIB_7322_SendBufCnt_Num_SmallBuffers_MSB 0x8 +#define QIB_7322_SendBufCnt_Num_SmallBuffers_RMASK 0x1FF + +#define QIB_7322_SendBufAvailAddr_OFFS 0x1E0 +#define 
QIB_7322_SendBufAvailAddr_DEF 0x0000000000000000 +#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_LSB 0x6 +#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_MSB 0x27 +#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_RMASK 0x3FFFFFFFF + +#define QIB_7322_SendBufErr0_OFFS 0x240 +#define QIB_7322_SendBufErr0_DEF 0x0000000000000000 +#define QIB_7322_SendBufErr0_SendBufErr_63_0_LSB 0x0 +#define QIB_7322_SendBufErr0_SendBufErr_63_0_MSB 0x3F +#define QIB_7322_SendBufErr0_SendBufErr_63_0_RMASK 0x0 + +#define QIB_7322_AvailUpdCount_OFFS 0x268 +#define QIB_7322_AvailUpdCount_DEF 0x0000000000000000 +#define QIB_7322_AvailUpdCount_AvailUpdCount_LSB 0x0 +#define QIB_7322_AvailUpdCount_AvailUpdCount_MSB 0x4 +#define QIB_7322_AvailUpdCount_AvailUpdCount_RMASK 0x1F + +#define QIB_7322_RcvHdrAddr0_OFFS 0x280 +#define QIB_7322_RcvHdrAddr0_DEF 0x0000000000000000 +#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_LSB 0x2 +#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_MSB 0x27 +#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_RMASK 0x3FFFFFFFFF + +#define QIB_7322_RcvHdrTailAddr0_OFFS 0x340 +#define QIB_7322_RcvHdrTailAddr0_DEF 0x0000000000000000 +#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_LSB 0x2 +#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_MSB 0x27 +#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_RMASK 0x3FFFFFFFFF + +#define QIB_7322_ahb_access_ctrl_OFFS 0x460 +#define QIB_7322_ahb_access_ctrl_DEF 0x0000000000000000 +#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_LSB 0x1 +#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_MSB 0x2 +#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_RMASK 0x3 +#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_LSB 0x0 +#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_MSB 0x0 +#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_RMASK 0x1 + +#define QIB_7322_ahb_transaction_reg_OFFS 0x468 +#define QIB_7322_ahb_transaction_reg_DEF 0x0000000080000000 +#define QIB_7322_ahb_transaction_reg_ahb_data_LSB 0x20 +#define QIB_7322_ahb_transaction_reg_ahb_data_MSB 0x3F +#define QIB_7322_ahb_transaction_reg_ahb_data_RMASK 0xFFFFFFFF +#define QIB_7322_ahb_transaction_reg_ahb_rdy_LSB 0x1F +#define QIB_7322_ahb_transaction_reg_ahb_rdy_MSB 0x1F +#define QIB_7322_ahb_transaction_reg_ahb_rdy_RMASK 0x1 +#define QIB_7322_ahb_transaction_reg_ahb_req_err_LSB 0x1E +#define QIB_7322_ahb_transaction_reg_ahb_req_err_MSB 0x1E +#define QIB_7322_ahb_transaction_reg_ahb_req_err_RMASK 0x1 +#define QIB_7322_ahb_transaction_reg_write_not_read_LSB 0x1B +#define QIB_7322_ahb_transaction_reg_write_not_read_MSB 0x1B +#define QIB_7322_ahb_transaction_reg_write_not_read_RMASK 0x1 +#define QIB_7322_ahb_transaction_reg_ahb_address_LSB 0x10 +#define QIB_7322_ahb_transaction_reg_ahb_address_MSB 0x1A +#define QIB_7322_ahb_transaction_reg_ahb_address_RMASK 0x7FF + +#define QIB_7322_SPC_JTAG_ACCESS_REG_OFFS 0x470 +#define QIB_7322_SPC_JTAG_ACCESS_REG_DEF 0x0000000000000001 +#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_LSB 0xA +#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_MSB 0xA +#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_RMASK 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_LSB 0x5 +#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_MSB 0x9 +#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_RMASK 0x1F +#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_LSB 0x3 +#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_MSB 0x4 +#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_RMASK 0x3 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_LSB 0x2 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_MSB 0x2 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_RMASK 0x1 +#define 
QIB_7322_SPC_JTAG_ACCESS_REG_tdo_LSB 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_MSB 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_RMASK 0x1 +#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_LSB 0x0 +#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_MSB 0x0 +#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_RMASK 0x1 + +#define QIB_7322_SendCheckMask0_OFFS 0x4C0 +#define QIB_7322_SendCheckMask0_DEF 0x0000000000000000 +#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_LSB 0x0 +#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_MSB 0x3F +#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_RMASK 0x0 + +#define QIB_7322_SendGRHCheckMask0_OFFS 0x4E0 +#define QIB_7322_SendGRHCheckMask0_DEF 0x0000000000000000 +#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_LSB 0x0 +#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_MSB 0x3F +#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_RMASK 0x0 + +#define QIB_7322_SendIBPacketMask0_OFFS 0x500 +#define QIB_7322_SendIBPacketMask0_DEF 0x0000000000000000 +#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_LSB 0x0 +#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_MSB 0x3F +#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_RMASK 0x0 + +#define QIB_7322_IntRedirect0_OFFS 0x540 +#define QIB_7322_IntRedirect0_DEF 0x0000000000000000 +#define QIB_7322_IntRedirect0_vec11_LSB 0x37 +#define QIB_7322_IntRedirect0_vec11_MSB 0x3B +#define QIB_7322_IntRedirect0_vec11_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec10_LSB 0x32 +#define QIB_7322_IntRedirect0_vec10_MSB 0x36 +#define QIB_7322_IntRedirect0_vec10_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec9_LSB 0x2D +#define QIB_7322_IntRedirect0_vec9_MSB 0x31 +#define QIB_7322_IntRedirect0_vec9_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec8_LSB 0x28 +#define QIB_7322_IntRedirect0_vec8_MSB 0x2C +#define QIB_7322_IntRedirect0_vec8_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec7_LSB 0x23 +#define QIB_7322_IntRedirect0_vec7_MSB 0x27 +#define QIB_7322_IntRedirect0_vec7_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec6_LSB 0x1E +#define QIB_7322_IntRedirect0_vec6_MSB 0x22 +#define QIB_7322_IntRedirect0_vec6_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec5_LSB 0x19 +#define QIB_7322_IntRedirect0_vec5_MSB 0x1D +#define QIB_7322_IntRedirect0_vec5_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec4_LSB 0x14 +#define QIB_7322_IntRedirect0_vec4_MSB 0x18 +#define QIB_7322_IntRedirect0_vec4_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec3_LSB 0xF +#define QIB_7322_IntRedirect0_vec3_MSB 0x13 +#define QIB_7322_IntRedirect0_vec3_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec2_LSB 0xA +#define QIB_7322_IntRedirect0_vec2_MSB 0xE +#define QIB_7322_IntRedirect0_vec2_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec1_LSB 0x5 +#define QIB_7322_IntRedirect0_vec1_MSB 0x9 +#define QIB_7322_IntRedirect0_vec1_RMASK 0x1F +#define QIB_7322_IntRedirect0_vec0_LSB 0x0 +#define QIB_7322_IntRedirect0_vec0_MSB 0x4 +#define QIB_7322_IntRedirect0_vec0_RMASK 0x1F + +#define QIB_7322_Int_Granted_OFFS 0x570 +#define QIB_7322_Int_Granted_DEF 0x0000000000000000 + +#define QIB_7322_vec_clr_without_int_OFFS 0x578 +#define QIB_7322_vec_clr_without_int_DEF 0x0000000000000000 + +#define QIB_7322_DCACtrlA_OFFS 0x580 +#define QIB_7322_DCACtrlA_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_LSB 0x4 +#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_MSB 0x4 +#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_LSB 0x3 +#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_MSB 0x3 +#define 
QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_LSB 0x2 +#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_MSB 0x2 +#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_EagerDCAEnable_LSB 0x1 +#define QIB_7322_DCACtrlA_EagerDCAEnable_MSB 0x1 +#define QIB_7322_DCACtrlA_EagerDCAEnable_RMASK 0x1 +#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_LSB 0x0 +#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_MSB 0x0 +#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_RMASK 0x1 + +#define QIB_7322_DCACtrlB_OFFS 0x588 +#define QIB_7322_DCACtrlB_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_LSB 0x2E +#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlC_OFFS 0x590 +#define QIB_7322_DCACtrlC_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_LSB 0x2E +#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlD_OFFS 0x598 +#define QIB_7322_DCACtrlD_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_LSB 
0x2E +#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlE_OFFS 0x5A0 +#define QIB_7322_DCACtrlE_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_LSB 0x36 +#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_MSB 0x3B +#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_LSB 0x2E +#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_MSB 0x35 +#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_LSB 0x28 +#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_MSB 0x2D +#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_RMASK 0xFF + +#define QIB_7322_DCACtrlF_OFFS 0x5A8 +#define QIB_7322_DCACtrlF_DEF 0x0000000000000000 +#define QIB_7322_DCACtrlF_SendDma1DCAOPH_LSB 0x28 +#define QIB_7322_DCACtrlF_SendDma1DCAOPH_MSB 0x2F +#define QIB_7322_DCACtrlF_SendDma1DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlF_SendDma0DCAOPH_LSB 0x20 +#define QIB_7322_DCACtrlF_SendDma0DCAOPH_MSB 0x27 +#define QIB_7322_DCACtrlF_SendDma0DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_LSB 0x16 +#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_MSB 0x1B +#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_LSB 0xE +#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_MSB 0x15 +#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_RMASK 0xFF +#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_LSB 0x8 +#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_MSB 0xD +#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_RMASK 0x3F +#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_LSB 0x0 +#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_MSB 0x7 +#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_RMASK 0xFF + +#define QIB_7322_RcvAvailTimeOut0_OFFS 0xC00 
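The DCACtrlB through DCACtrlF registers above pack several small per-queue fields into each 64-bit word (an 8-bit DCAOPH and a 6-bit DCAXfrCnt per receive header queue). A minimal sketch of composing such a value from the _LSB/_RMASK definitions, again assuming this header is included; QIB_FIELD_PUT and qib_build_dcactrlb_hdrq0 are illustrative names, not part of the patch:

#include <stdint.h>

/* Place a field value at its bit position, truncated to the field width. */
#define QIB_FIELD_PUT(reg, fld, val) \
	(((uint64_t)(val) & QIB_7322_##reg##_##fld##_RMASK) << \
	 QIB_7322_##reg##_##fld##_LSB)

static uint64_t qib_build_dcactrlb_hdrq0(uint8_t oph, uint8_t xfrcnt)
{
	/* Compose the receive-header-queue-0 fields of DCACtrlB; the result
	 * would be written at QIB_7322_DCACtrlB_OFFS.  The remaining header
	 * queues occupy the higher-order fields of DCACtrlB..DCACtrlF. */
	return QIB_FIELD_PUT(DCACtrlB, RcvHdrq0DCAOPH, oph) |
	       QIB_FIELD_PUT(DCACtrlB, RcvHdrq0DCAXfrCnt, xfrcnt);
}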
+#define QIB_7322_RcvAvailTimeOut0_DEF 0x0000000000000000 +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_LSB 0x10 +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_MSB 0x1F +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_RMASK 0xFFFF +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_LSB 0x0 +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_MSB 0xF +#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_RMASK 0xFFFF + +#define QIB_7322_CntrRegBase_0_OFFS 0x1028 +#define QIB_7322_CntrRegBase_0_DEF 0x0000000000012000 + +#define QIB_7322_ErrMask_0_OFFS 0x1080 +#define QIB_7322_ErrMask_0_DEF 0x0000000000000000 +#define QIB_7322_ErrMask_0_IBStatusChangedMask_LSB 0x3A +#define QIB_7322_ErrMask_0_IBStatusChangedMask_MSB 0x3A +#define QIB_7322_ErrMask_0_IBStatusChangedMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SHeadersErrMask_LSB 0x39 +#define QIB_7322_ErrMask_0_SHeadersErrMask_MSB 0x39 +#define QIB_7322_ErrMask_0_SHeadersErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_LSB 0x36 +#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_MSB 0x36 +#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaHaltErrMask_LSB 0x31 +#define QIB_7322_ErrMask_0_SDmaHaltErrMask_MSB 0x31 +#define QIB_7322_ErrMask_0_SDmaHaltErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_LSB 0x30 +#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_MSB 0x30 +#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_LSB 0x2F +#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_MSB 0x2F +#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_LSB 0x2E +#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_MSB 0x2E +#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_LSB 0x2D +#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_MSB 0x2D +#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_LSB 0x2C +#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_MSB 0x2C +#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDma1stDescErrMask_LSB 0x2B +#define QIB_7322_ErrMask_0_SDma1stDescErrMask_MSB 0x2B +#define QIB_7322_ErrMask_0_SDma1stDescErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaBaseErrMask_LSB 0x2A +#define QIB_7322_ErrMask_0_SDmaBaseErrMask_MSB 0x2A +#define QIB_7322_ErrMask_0_SDmaBaseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_LSB 0x29 +#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_MSB 0x29 +#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_LSB 0x28 +#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_MSB 0x28 +#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_LSB 0x27 +#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_MSB 0x27 +#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_LSB 0x26 +#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_MSB 0x26 +#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_LSB 0x25 +#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_MSB 0x25 +#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_LSB 0x24 +#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_MSB 0x24 +#define 
QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_LSB 0x22 +#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_MSB 0x22 +#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_LSB 0x21 +#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_MSB 0x21 +#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendPktLenErrMask_LSB 0x20 +#define QIB_7322_ErrMask_0_SendPktLenErrMask_MSB 0x20 +#define QIB_7322_ErrMask_0_SendPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendUnderRunErrMask_LSB 0x1F +#define QIB_7322_ErrMask_0_SendUnderRunErrMask_MSB 0x1F +#define QIB_7322_ErrMask_0_SendUnderRunErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_LSB 0x1E +#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_MSB 0x1E +#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_LSB 0x1D +#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_MSB 0x1D +#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_LSB 0x11 +#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_MSB 0x11 +#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvHdrErrMask_LSB 0x10 +#define QIB_7322_ErrMask_0_RcvHdrErrMask_MSB 0x10 +#define QIB_7322_ErrMask_0_RcvHdrErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_LSB 0xF +#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_MSB 0xF +#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvBadTidErrMask_LSB 0xE +#define QIB_7322_ErrMask_0_RcvBadTidErrMask_MSB 0xE +#define QIB_7322_ErrMask_0_RcvBadTidErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_LSB 0xB +#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_MSB 0xB +#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_LSB 0xA +#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_MSB 0xA +#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvEBPErrMask_LSB 0x9 +#define QIB_7322_ErrMask_0_RcvEBPErrMask_MSB 0x9 +#define QIB_7322_ErrMask_0_RcvEBPErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_LSB 0x8 +#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_MSB 0x8 +#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_LSB 0x7 +#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_MSB 0x7 +#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_LSB 0x6 +#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_MSB 0x6 +#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_LSB 0x5 +#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_MSB 0x5 +#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_LSB 0x4 +#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_MSB 0x4 +#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_LSB 0x3 +#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_MSB 0x3 +#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvICRCErrMask_LSB 0x2 +#define QIB_7322_ErrMask_0_RcvICRCErrMask_MSB 0x2 +#define QIB_7322_ErrMask_0_RcvICRCErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvVCRCErrMask_LSB 0x1 +#define 
QIB_7322_ErrMask_0_RcvVCRCErrMask_MSB 0x1 +#define QIB_7322_ErrMask_0_RcvVCRCErrMask_RMASK 0x1 +#define QIB_7322_ErrMask_0_RcvFormatErrMask_LSB 0x0 +#define QIB_7322_ErrMask_0_RcvFormatErrMask_MSB 0x0 +#define QIB_7322_ErrMask_0_RcvFormatErrMask_RMASK 0x1 + +#define QIB_7322_ErrStatus_0_OFFS 0x1088 +#define QIB_7322_ErrStatus_0_DEF 0x0000000000000000 +#define QIB_7322_ErrStatus_0_IBStatusChanged_LSB 0x3A +#define QIB_7322_ErrStatus_0_IBStatusChanged_MSB 0x3A +#define QIB_7322_ErrStatus_0_IBStatusChanged_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SHeadersErr_LSB 0x39 +#define QIB_7322_ErrStatus_0_SHeadersErr_MSB 0x39 +#define QIB_7322_ErrStatus_0_SHeadersErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_LSB 0x36 +#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_MSB 0x36 +#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaHaltErr_LSB 0x31 +#define QIB_7322_ErrStatus_0_SDmaHaltErr_MSB 0x31 +#define QIB_7322_ErrStatus_0_SDmaHaltErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_LSB 0x30 +#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_MSB 0x30 +#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_LSB 0x2F +#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_MSB 0x2F +#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_LSB 0x2E +#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_MSB 0x2E +#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaDwEnErr_LSB 0x2D +#define QIB_7322_ErrStatus_0_SDmaDwEnErr_MSB 0x2D +#define QIB_7322_ErrStatus_0_SDmaDwEnErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_LSB 0x2C +#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_MSB 0x2C +#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDma1stDescErr_LSB 0x2B +#define QIB_7322_ErrStatus_0_SDma1stDescErr_MSB 0x2B +#define QIB_7322_ErrStatus_0_SDma1stDescErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaBaseErr_LSB 0x2A +#define QIB_7322_ErrStatus_0_SDmaBaseErr_MSB 0x2A +#define QIB_7322_ErrStatus_0_SDmaBaseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_LSB 0x29 +#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_MSB 0x29 +#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_LSB 0x28 +#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_MSB 0x28 +#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_LSB 0x27 +#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_MSB 0x27 +#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendBufMisuseErr_LSB 0x26 +#define QIB_7322_ErrStatus_0_SendBufMisuseErr_MSB 0x26 +#define QIB_7322_ErrStatus_0_SendBufMisuseErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_LSB 0x25 +#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_MSB 0x25 +#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_LSB 0x24 +#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_MSB 0x24 +#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_LSB 0x22 +#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_MSB 0x22 +#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_LSB 0x21 +#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_MSB 0x21 +#define 
QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendPktLenErr_LSB 0x20 +#define QIB_7322_ErrStatus_0_SendPktLenErr_MSB 0x20 +#define QIB_7322_ErrStatus_0_SendPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendUnderRunErr_LSB 0x1F +#define QIB_7322_ErrStatus_0_SendUnderRunErr_MSB 0x1F +#define QIB_7322_ErrStatus_0_SendUnderRunErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_LSB 0x1E +#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_MSB 0x1E +#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_SendMinPktLenErr_LSB 0x1D +#define QIB_7322_ErrStatus_0_SendMinPktLenErr_MSB 0x1D +#define QIB_7322_ErrStatus_0_SendMinPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_LSB 0x11 +#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_MSB 0x11 +#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvHdrErr_LSB 0x10 +#define QIB_7322_ErrStatus_0_RcvHdrErr_MSB 0x10 +#define QIB_7322_ErrStatus_0_RcvHdrErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvHdrLenErr_LSB 0xF +#define QIB_7322_ErrStatus_0_RcvHdrLenErr_MSB 0xF +#define QIB_7322_ErrStatus_0_RcvHdrLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvBadTidErr_LSB 0xE +#define QIB_7322_ErrStatus_0_RcvBadTidErr_MSB 0xE +#define QIB_7322_ErrStatus_0_RcvBadTidErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvBadVersionErr_LSB 0xB +#define QIB_7322_ErrStatus_0_RcvBadVersionErr_MSB 0xB +#define QIB_7322_ErrStatus_0_RcvBadVersionErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvIBFlowErr_LSB 0xA +#define QIB_7322_ErrStatus_0_RcvIBFlowErr_MSB 0xA +#define QIB_7322_ErrStatus_0_RcvIBFlowErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvEBPErr_LSB 0x9 +#define QIB_7322_ErrStatus_0_RcvEBPErr_MSB 0x9 +#define QIB_7322_ErrStatus_0_RcvEBPErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_LSB 0x8 +#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_MSB 0x8 +#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_LSB 0x7 +#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_MSB 0x7 +#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_LSB 0x6 +#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_MSB 0x6 +#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_LSB 0x5 +#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_MSB 0x5 +#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_LSB 0x4 +#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_MSB 0x4 +#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_LSB 0x3 +#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_MSB 0x3 +#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvICRCErr_LSB 0x2 +#define QIB_7322_ErrStatus_0_RcvICRCErr_MSB 0x2 +#define QIB_7322_ErrStatus_0_RcvICRCErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvVCRCErr_LSB 0x1 +#define QIB_7322_ErrStatus_0_RcvVCRCErr_MSB 0x1 +#define QIB_7322_ErrStatus_0_RcvVCRCErr_RMASK 0x1 +#define QIB_7322_ErrStatus_0_RcvFormatErr_LSB 0x0 +#define QIB_7322_ErrStatus_0_RcvFormatErr_MSB 0x0 +#define QIB_7322_ErrStatus_0_RcvFormatErr_RMASK 0x1 + +#define QIB_7322_ErrClear_0_OFFS 0x1090 +#define QIB_7322_ErrClear_0_DEF 0x0000000000000000 +#define QIB_7322_ErrClear_0_IBStatusChangedClear_LSB 0x3A +#define QIB_7322_ErrClear_0_IBStatusChangedClear_MSB 0x3A +#define QIB_7322_ErrClear_0_IBStatusChangedClear_RMASK 
0x1 +#define QIB_7322_ErrClear_0_SHeadersErrClear_LSB 0x39 +#define QIB_7322_ErrClear_0_SHeadersErrClear_MSB 0x39 +#define QIB_7322_ErrClear_0_SHeadersErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_LSB 0x36 +#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_MSB 0x36 +#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaHaltErrClear_LSB 0x31 +#define QIB_7322_ErrClear_0_SDmaHaltErrClear_MSB 0x31 +#define QIB_7322_ErrClear_0_SDmaHaltErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_LSB 0x30 +#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_MSB 0x30 +#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_LSB 0x2F +#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_MSB 0x2F +#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_LSB 0x2E +#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_MSB 0x2E +#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_LSB 0x2D +#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_MSB 0x2D +#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_LSB 0x2C +#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_MSB 0x2C +#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDma1stDescErrClear_LSB 0x2B +#define QIB_7322_ErrClear_0_SDma1stDescErrClear_MSB 0x2B +#define QIB_7322_ErrClear_0_SDma1stDescErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaBaseErrClear_LSB 0x2A +#define QIB_7322_ErrClear_0_SDmaBaseErrClear_MSB 0x2A +#define QIB_7322_ErrClear_0_SDmaBaseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_LSB 0x29 +#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_MSB 0x29 +#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_LSB 0x28 +#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_MSB 0x28 +#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_LSB 0x27 +#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_MSB 0x27 +#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_LSB 0x26 +#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_MSB 0x26 +#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_LSB 0x25 +#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_MSB 0x25 +#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_LSB 0x24 +#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_MSB 0x24 +#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_LSB 0x22 +#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_MSB 0x22 +#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_LSB 0x21 +#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_MSB 0x21 +#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendPktLenErrClear_LSB 0x20 +#define QIB_7322_ErrClear_0_SendPktLenErrClear_MSB 0x20 +#define QIB_7322_ErrClear_0_SendPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendUnderRunErrClear_LSB 0x1F +#define 
QIB_7322_ErrClear_0_SendUnderRunErrClear_MSB 0x1F +#define QIB_7322_ErrClear_0_SendUnderRunErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_LSB 0x1E +#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_MSB 0x1E +#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_LSB 0x1D +#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_MSB 0x1D +#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_LSB 0x11 +#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_MSB 0x11 +#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvHdrErrClear_LSB 0x10 +#define QIB_7322_ErrClear_0_RcvHdrErrClear_MSB 0x10 +#define QIB_7322_ErrClear_0_RcvHdrErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_LSB 0xF +#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_MSB 0xF +#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvBadTidErrClear_LSB 0xE +#define QIB_7322_ErrClear_0_RcvBadTidErrClear_MSB 0xE +#define QIB_7322_ErrClear_0_RcvBadTidErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_LSB 0xB +#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_MSB 0xB +#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_LSB 0xA +#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_MSB 0xA +#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvEBPErrClear_LSB 0x9 +#define QIB_7322_ErrClear_0_RcvEBPErrClear_MSB 0x9 +#define QIB_7322_ErrClear_0_RcvEBPErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_LSB 0x8 +#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_MSB 0x8 +#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_LSB 0x7 +#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_MSB 0x7 +#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_LSB 0x6 +#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_MSB 0x6 +#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_LSB 0x5 +#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_MSB 0x5 +#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_LSB 0x4 +#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_MSB 0x4 +#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_LSB 0x3 +#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_MSB 0x3 +#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvICRCErrClear_LSB 0x2 +#define QIB_7322_ErrClear_0_RcvICRCErrClear_MSB 0x2 +#define QIB_7322_ErrClear_0_RcvICRCErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvVCRCErrClear_LSB 0x1 +#define QIB_7322_ErrClear_0_RcvVCRCErrClear_MSB 0x1 +#define QIB_7322_ErrClear_0_RcvVCRCErrClear_RMASK 0x1 +#define QIB_7322_ErrClear_0_RcvFormatErrClear_LSB 0x0 +#define QIB_7322_ErrClear_0_RcvFormatErrClear_MSB 0x0 +#define QIB_7322_ErrClear_0_RcvFormatErrClear_RMASK 0x1 + +#define QIB_7322_TXEStatus_0_OFFS 0x10B8 +#define QIB_7322_TXEStatus_0_DEF 0x0000000XC00080FF +#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_LSB 0x1F +#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_MSB 0x1F +#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_RMASK 0x1 +#define QIB_7322_TXEStatus_0_RmFifoEmpty_LSB 0x1E +#define 
QIB_7322_TXEStatus_0_RmFifoEmpty_MSB 0x1E +#define QIB_7322_TXEStatus_0_RmFifoEmpty_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_LSB 0xF +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_MSB 0xF +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_LSB 0x7 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_MSB 0x7 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_LSB 0x6 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_MSB 0x6 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_LSB 0x5 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_MSB 0x5 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_LSB 0x4 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_MSB 0x4 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_LSB 0x3 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_MSB 0x3 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_LSB 0x2 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_MSB 0x2 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_LSB 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_MSB 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_RMASK 0x1 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_LSB 0x0 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_MSB 0x0 +#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_RMASK 0x1 + +#define QIB_7322_RcvCtrl_0_OFFS 0x1100 +#define QIB_7322_RcvCtrl_0_DEF 0x0000000000000000 +#define QIB_7322_RcvCtrl_0_RcvResetCredit_LSB 0x2A +#define QIB_7322_RcvCtrl_0_RcvResetCredit_MSB 0x2A +#define QIB_7322_RcvCtrl_0_RcvResetCredit_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_LSB 0x29 +#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_MSB 0x29 +#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_LSB 0x28 +#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_MSB 0x28 +#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_LSB 0x27 +#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_MSB 0x27 +#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_RMASK 0x1 +#define QIB_7322_RcvCtrl_0_ContextEnableUser_LSB 0x2 +#define QIB_7322_RcvCtrl_0_ContextEnableUser_MSB 0x11 +#define QIB_7322_RcvCtrl_0_ContextEnableUser_RMASK 0xFFFF +#define QIB_7322_RcvCtrl_0_ContextEnableKernel_LSB 0x0 +#define QIB_7322_RcvCtrl_0_ContextEnableKernel_MSB 0x0 +#define QIB_7322_RcvCtrl_0_ContextEnableKernel_RMASK 0x1 + +#define QIB_7322_RcvBTHQP_0_OFFS 0x1108 +#define QIB_7322_RcvBTHQP_0_DEF 0x0000000000000000 +#define QIB_7322_RcvBTHQP_0_RcvBTHQP_LSB 0x0 +#define QIB_7322_RcvBTHQP_0_RcvBTHQP_MSB 0x17 +#define QIB_7322_RcvBTHQP_0_RcvBTHQP_RMASK 0xFFFFFF + +#define QIB_7322_RcvQPMapTableA_0_OFFS 0x1110 +#define QIB_7322_RcvQPMapTableA_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_LSB 0x19 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_MSB 0x1D +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_LSB 0x14 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_MSB 0x18 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_LSB 0xF +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_MSB 0x13 +#define 
QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_LSB 0xA +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_MSB 0xE +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_LSB 0x5 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_MSB 0x9 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_RMASK 0x1F +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_LSB 0x0 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_MSB 0x4 +#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableB_0_OFFS 0x1118 +#define QIB_7322_RcvQPMapTableB_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_LSB 0x19 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_MSB 0x1D +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_LSB 0x14 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_MSB 0x18 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_LSB 0xF +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_MSB 0x13 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_LSB 0xA +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_MSB 0xE +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_LSB 0x5 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_MSB 0x9 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_RMASK 0x1F +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_LSB 0x0 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_MSB 0x4 +#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableC_0_OFFS 0x1120 +#define QIB_7322_RcvQPMapTableC_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_LSB 0x19 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_MSB 0x1D +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_LSB 0x14 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_MSB 0x18 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_LSB 0xF +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_MSB 0x13 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_LSB 0xA +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_MSB 0xE +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_LSB 0x5 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_MSB 0x9 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_RMASK 0x1F +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_LSB 0x0 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_MSB 0x4 +#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableD_0_OFFS 0x1128 +#define QIB_7322_RcvQPMapTableD_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_LSB 0x19 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_MSB 0x1D +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_LSB 0x14 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_MSB 0x18 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_RMASK 0x1F +#define 
QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_LSB 0xF +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_MSB 0x13 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_LSB 0xA +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_MSB 0xE +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_LSB 0x5 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_MSB 0x9 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_RMASK 0x1F +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_LSB 0x0 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_MSB 0x4 +#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableE_0_OFFS 0x1130 +#define QIB_7322_RcvQPMapTableE_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_LSB 0x19 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_MSB 0x1D +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_LSB 0x14 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_MSB 0x18 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_LSB 0xF +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_MSB 0x13 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_LSB 0xA +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_MSB 0xE +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_LSB 0x5 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_MSB 0x9 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_RMASK 0x1F +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_LSB 0x0 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_MSB 0x4 +#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_RMASK 0x1F + +#define QIB_7322_RcvQPMapTableF_0_OFFS 0x1138 +#define QIB_7322_RcvQPMapTableF_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_LSB 0x5 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_MSB 0x9 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_RMASK 0x1F +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_LSB 0x0 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_MSB 0x4 +#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_RMASK 0x1F + +#define QIB_7322_PSStat_0_OFFS 0x1140 +#define QIB_7322_PSStat_0_DEF 0x0000000000000000 + +#define QIB_7322_PSStart_0_OFFS 0x1148 +#define QIB_7322_PSStart_0_DEF 0x0000000000000000 + +#define QIB_7322_PSInterval_0_OFFS 0x1150 +#define QIB_7322_PSInterval_0_DEF 0x0000000000000000 + +#define QIB_7322_RcvStatus_0_OFFS 0x1160 +#define QIB_7322_RcvStatus_0_DEF 0x0000000000000000 +#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_LSB 0x1 +#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_MSB 0x5 +#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_RMASK 0x1F +#define QIB_7322_RcvStatus_0_RxPktInProgress_LSB 0x0 +#define QIB_7322_RcvStatus_0_RxPktInProgress_MSB 0x0 +#define QIB_7322_RcvStatus_0_RxPktInProgress_RMASK 0x1 + +#define QIB_7322_RcvPartitionKey_0_OFFS 0x1168 +#define QIB_7322_RcvPartitionKey_0_DEF 0x0000000000000000 + +#define QIB_7322_RcvQPMulticastContext_0_OFFS 0x1170 +#define QIB_7322_RcvQPMulticastContext_0_DEF 0x0000000000000000 +#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_LSB 0x0 +#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_MSB 0x4 +#define 
QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_RMASK 0x1F + +#define QIB_7322_RcvPktLEDCnt_0_OFFS 0x1178 +#define QIB_7322_RcvPktLEDCnt_0_DEF 0x0000000000000000 +#define QIB_7322_RcvPktLEDCnt_0_ONperiod_LSB 0x20 +#define QIB_7322_RcvPktLEDCnt_0_ONperiod_MSB 0x3F +#define QIB_7322_RcvPktLEDCnt_0_ONperiod_RMASK 0xFFFFFFFF +#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_LSB 0x0 +#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_MSB 0x1F +#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_RMASK 0xFFFFFFFF + +#define QIB_7322_SendDmaIdleCnt_0_OFFS 0x1180 +#define QIB_7322_SendDmaIdleCnt_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_LSB 0x0 +#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_MSB 0xF +#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_RMASK 0xFFFF + +#define QIB_7322_SendDmaReloadCnt_0_OFFS 0x1188 +#define QIB_7322_SendDmaReloadCnt_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_LSB 0x0 +#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_MSB 0xF +#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_RMASK 0xFFFF + +#define QIB_7322_SendDmaDescCnt_0_OFFS 0x1190 +#define QIB_7322_SendDmaDescCnt_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_LSB 0x0 +#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_MSB 0xF +#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_RMASK 0xFFFF + +#define QIB_7322_SendCtrl_0_OFFS 0x11C0 +#define QIB_7322_SendCtrl_0_DEF 0x0000000000000000 +#define QIB_7322_SendCtrl_0_IBVLArbiterEn_LSB 0xF +#define QIB_7322_SendCtrl_0_IBVLArbiterEn_MSB 0xF +#define QIB_7322_SendCtrl_0_IBVLArbiterEn_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_LSB 0xE +#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_MSB 0xE +#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_LSB 0xD +#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_MSB 0xD +#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaHalt_LSB 0xC +#define QIB_7322_SendCtrl_0_SDmaHalt_MSB 0xC +#define QIB_7322_SendCtrl_0_SDmaHalt_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaEnable_LSB 0xB +#define QIB_7322_SendCtrl_0_SDmaEnable_MSB 0xB +#define QIB_7322_SendCtrl_0_SDmaEnable_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_LSB 0xA +#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_MSB 0xA +#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaIntEnable_LSB 0x9 +#define QIB_7322_SendCtrl_0_SDmaIntEnable_MSB 0x9 +#define QIB_7322_SendCtrl_0_SDmaIntEnable_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SDmaCleanup_LSB 0x8 +#define QIB_7322_SendCtrl_0_SDmaCleanup_MSB 0x8 +#define QIB_7322_SendCtrl_0_SDmaCleanup_RMASK 0x1 +#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_LSB 0x7 +#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_MSB 0x7 +#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_RMASK 0x1 +#define QIB_7322_SendCtrl_0_SendEnable_LSB 0x3 +#define QIB_7322_SendCtrl_0_SendEnable_MSB 0x3 +#define QIB_7322_SendCtrl_0_SendEnable_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeBypassIbc_LSB 0x1 +#define QIB_7322_SendCtrl_0_TxeBypassIbc_MSB 0x1 +#define QIB_7322_SendCtrl_0_TxeBypassIbc_RMASK 0x1 +#define QIB_7322_SendCtrl_0_TxeAbortIbc_LSB 0x0 +#define QIB_7322_SendCtrl_0_TxeAbortIbc_MSB 0x0 +#define QIB_7322_SendCtrl_0_TxeAbortIbc_RMASK 0x1 + +#define QIB_7322_SendDmaBase_0_OFFS 0x11F8 +#define QIB_7322_SendDmaBase_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaBase_0_SendDmaBase_LSB 0x0 +#define QIB_7322_SendDmaBase_0_SendDmaBase_MSB 0x2F +#define 
QIB_7322_SendDmaBase_0_SendDmaBase_RMASK 0xFFFFFFFFFFFF + +#define QIB_7322_SendDmaLenGen_0_OFFS 0x1200 +#define QIB_7322_SendDmaLenGen_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaLenGen_0_Generation_LSB 0x10 +#define QIB_7322_SendDmaLenGen_0_Generation_MSB 0x12 +#define QIB_7322_SendDmaLenGen_0_Generation_RMASK 0x7 +#define QIB_7322_SendDmaLenGen_0_Length_LSB 0x0 +#define QIB_7322_SendDmaLenGen_0_Length_MSB 0xF +#define QIB_7322_SendDmaLenGen_0_Length_RMASK 0xFFFF + +#define QIB_7322_SendDmaTail_0_OFFS 0x1208 +#define QIB_7322_SendDmaTail_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaTail_0_SendDmaTail_LSB 0x0 +#define QIB_7322_SendDmaTail_0_SendDmaTail_MSB 0xF +#define QIB_7322_SendDmaTail_0_SendDmaTail_RMASK 0xFFFF + +#define QIB_7322_SendDmaHead_0_OFFS 0x1210 +#define QIB_7322_SendDmaHead_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_LSB 0x20 +#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_MSB 0x2F +#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_RMASK 0xFFFF +#define QIB_7322_SendDmaHead_0_SendDmaHead_LSB 0x0 +#define QIB_7322_SendDmaHead_0_SendDmaHead_MSB 0xF +#define QIB_7322_SendDmaHead_0_SendDmaHead_RMASK 0xFFFF + +#define QIB_7322_SendDmaHeadAddr_0_OFFS 0x1218 +#define QIB_7322_SendDmaHeadAddr_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_LSB 0x0 +#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_MSB 0x2F +#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_RMASK 0xFFFFFFFFFFFF + +#define QIB_7322_SendDmaBufMask0_0_OFFS 0x1220 +#define QIB_7322_SendDmaBufMask0_0_DEF 0x0000000000000000 +#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_LSB 0x0 +#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_MSB 0x3F +#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_RMASK 0x0 + +#define QIB_7322_SendDmaStatus_0_OFFS 0x1238 +#define QIB_7322_SendDmaStatus_0_DEF 0x0000000042000000 +#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_LSB 0x3F +#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_MSB 0x3F +#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_HaltInProg_LSB 0x3E +#define QIB_7322_SendDmaStatus_0_HaltInProg_MSB 0x3E +#define QIB_7322_SendDmaStatus_0_HaltInProg_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_LSB 0x3D +#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_MSB 0x3D +#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_LSB 0x2F +#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_MSB 0x3C +#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_RMASK 0x3FFF +#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_LSB 0x28 +#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_MSB 0x2E +#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_RMASK 0x7F +#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_LSB 0x20 +#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_MSB 0x27 +#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_RMASK 0xFF +#define QIB_7322_SendDmaStatus_0_ScbFull_LSB 0x1F +#define QIB_7322_SendDmaStatus_0_ScbFull_MSB 0x1F +#define QIB_7322_SendDmaStatus_0_ScbFull_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbEmpty_LSB 0x1E +#define QIB_7322_SendDmaStatus_0_ScbEmpty_MSB 0x1E +#define QIB_7322_SendDmaStatus_0_ScbEmpty_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbEntryValid_LSB 0x1D +#define QIB_7322_SendDmaStatus_0_ScbEntryValid_MSB 0x1D +#define QIB_7322_SendDmaStatus_0_ScbEntryValid_RMASK 0x1 +#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_LSB 0x1C +#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_MSB 0x1C 
+#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_LSB 0x1B
+#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_MSB 0x1B
+#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_LSB 0x1A
+#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_MSB 0x1A
+#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_LSB 0x19
+#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_MSB 0x19
+#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoFull_LSB 0x18
+#define QIB_7322_SendDmaStatus_0_SplFifoFull_MSB 0x18
+#define QIB_7322_SendDmaStatus_0_SplFifoFull_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_LSB 0x10
+#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_MSB 0x17
+#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_RMASK 0xFF
+#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_LSB 0x0
+#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_MSB 0xF
+#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaPriorityThld_0_OFFS 0x1258
+#define QIB_7322_SendDmaPriorityThld_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_LSB 0x0
+#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_MSB 0x3
+#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_RMASK 0xF
+
+#define QIB_7322_SendHdrErrSymptom_0_OFFS 0x1260
+#define QIB_7322_SendHdrErrSymptom_0_DEF 0x0000000000000000
+#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_LSB 0x6
+#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_MSB 0x6
+#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_GRHFail_LSB 0x5
+#define QIB_7322_SendHdrErrSymptom_0_GRHFail_MSB 0x5
+#define QIB_7322_SendHdrErrSymptom_0_GRHFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_LSB 0x4
+#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_MSB 0x4
+#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_QPFail_LSB 0x3
+#define QIB_7322_SendHdrErrSymptom_0_QPFail_MSB 0x3
+#define QIB_7322_SendHdrErrSymptom_0_QPFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_LSB 0x2
+#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_MSB 0x2
+#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_LSB 0x1
+#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_MSB 0x1
+#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_LSB 0x0
+#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_MSB 0x0
+#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_RMASK 0x1
+
+#define QIB_7322_RxCreditVL0_0_OFFS 0x1280
+#define QIB_7322_RxCreditVL0_0_DEF 0x0000000000000000
+#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_LSB 0x10
+#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_MSB 0x1B
+#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_RMASK 0xFFF
+#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_LSB 0x0
+#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_MSB 0xB
+#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_RMASK 0xFFF
+
+#define QIB_7322_SendDmaBufUsed0_0_OFFS 0x1480
+#define QIB_7322_SendDmaBufUsed0_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_LSB 0x0
+#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_MSB 0x3F
+#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_RMASK 0x0
+
+#define QIB_7322_SendCheckControl_0_OFFS 0x14A8
+#define QIB_7322_SendCheckControl_0_DEF 0x0000000000000000
+#define QIB_7322_SendCheckControl_0_PKey_En_LSB 0x4
+#define QIB_7322_SendCheckControl_0_PKey_En_MSB 0x4
+#define QIB_7322_SendCheckControl_0_PKey_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_BTHQP_En_LSB 0x3
+#define QIB_7322_SendCheckControl_0_BTHQP_En_MSB 0x3
+#define QIB_7322_SendCheckControl_0_BTHQP_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_SLID_En_LSB 0x2
+#define QIB_7322_SendCheckControl_0_SLID_En_MSB 0x2
+#define QIB_7322_SendCheckControl_0_SLID_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_RawIPV6_En_LSB 0x1
+#define QIB_7322_SendCheckControl_0_RawIPV6_En_MSB 0x1
+#define QIB_7322_SendCheckControl_0_RawIPV6_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_LSB 0x0
+#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_MSB 0x0
+#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_RMASK 0x1
+
+#define QIB_7322_SendIBSLIDMask_0_OFFS 0x14B0
+#define QIB_7322_SendIBSLIDMask_0_DEF 0x0000000000000000
+#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_LSB 0x0
+#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_MSB 0xF
+#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_RMASK 0xFFFF
+
+#define QIB_7322_SendIBSLIDAssign_0_OFFS 0x14B8
+#define QIB_7322_SendIBSLIDAssign_0_DEF 0x0000000000000000
+#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_LSB 0x0
+#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_MSB 0xF
+#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_RMASK 0xFFFF
+
+#define QIB_7322_IBCStatusA_0_OFFS 0x1540
+#define QIB_7322_IBCStatusA_0_DEF 0x0000000000000X02
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_LSB 0x27
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_MSB 0x27
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_LSB 0x26
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_MSB 0x26
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_LSB 0x25
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_MSB 0x25
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_LSB 0x24
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_MSB 0x24
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_LSB 0x23
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_MSB 0x23
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_LSB 0x22
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_MSB 0x22
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_LSB 0x21
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_MSB 0x21
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_LSB 0x20
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_MSB 0x20
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxReady_LSB 0x1E
+#define QIB_7322_IBCStatusA_0_TxReady_MSB 0x1E
+#define QIB_7322_IBCStatusA_0_TxReady_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_LSB 0x1D
+#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_MSB 0x1D
+#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_LSB 0xF
+#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_MSB 0xF
+#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_ScrambleEn_LSB 0xE
+#define QIB_7322_IBCStatusA_0_ScrambleEn_MSB 0xE
+#define QIB_7322_IBCStatusA_0_ScrambleEn_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_LSB 0xD
+#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_MSB 0xD
+#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_LSB 0xC
+#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_MSB 0xC
+#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_LSB 0xA
+#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_MSB 0xA
+#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkWidthActive_LSB 0x9
+#define QIB_7322_IBCStatusA_0_LinkWidthActive_MSB 0x9
+#define QIB_7322_IBCStatusA_0_LinkWidthActive_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkSpeedActive_LSB 0x8
+#define QIB_7322_IBCStatusA_0_LinkSpeedActive_MSB 0x8
+#define QIB_7322_IBCStatusA_0_LinkSpeedActive_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkState_LSB 0x5
+#define QIB_7322_IBCStatusA_0_LinkState_MSB 0x7
+#define QIB_7322_IBCStatusA_0_LinkState_RMASK 0x7
+#define QIB_7322_IBCStatusA_0_LinkTrainingState_LSB 0x0
+#define QIB_7322_IBCStatusA_0_LinkTrainingState_MSB 0x4
+#define QIB_7322_IBCStatusA_0_LinkTrainingState_RMASK 0x1F
+
+#define QIB_7322_IBCStatusB_0_OFFS 0x1548
+#define QIB_7322_IBCStatusB_0_DEF 0x00000000XXXXXXXX
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_LSB 0x27
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_MSB 0x27
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_LSB 0x26
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_MSB 0x26
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_LSB 0x25
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_MSB 0x25
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_LSB 0x24
+#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_MSB 0x24
+#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_LSB 0x20
+#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_MSB 0x23
+#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_RMASK 0xF
+#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_LSB 0x1E
+#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_MSB 0x1F
+#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_RMASK 0x3
+#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_LSB 0x1A
+#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_MSB 0x1D
+#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_RMASK 0xF
+#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_LSB 0x0
+#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_MSB 0x19
+#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_RMASK 0x3FFFFFF
+
+#define QIB_7322_IBCCtrlA_0_OFFS 0x1560
+#define QIB_7322_IBCCtrlA_0_DEF 0x0000000000000000
+#define QIB_7322_IBCCtrlA_0_Loopback_LSB 0x3F
+#define QIB_7322_IBCCtrlA_0_Loopback_MSB 0x3F
+#define QIB_7322_IBCCtrlA_0_Loopback_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_LSB 0x3E
+#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_MSB 0x3E
+#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_IBLinkEn_LSB 0x3D
+#define QIB_7322_IBCCtrlA_0_IBLinkEn_MSB 0x3D
+#define QIB_7322_IBCCtrlA_0_IBLinkEn_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_LSB 0x3C
+#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_MSB 0x3C
+#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_NumVLane_LSB 0x30
+#define QIB_7322_IBCCtrlA_0_NumVLane_MSB 0x32
+#define QIB_7322_IBCCtrlA_0_NumVLane_RMASK 0x7
+#define QIB_7322_IBCCtrlA_0_OverrunThreshold_LSB 0x24
+#define QIB_7322_IBCCtrlA_0_OverrunThreshold_MSB 0x27
+#define QIB_7322_IBCCtrlA_0_OverrunThreshold_RMASK 0xF
+#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_LSB 0x20
+#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_MSB 0x23
+#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_RMASK 0xF
+#define QIB_7322_IBCCtrlA_0_MaxPktLen_LSB 0x15
+#define QIB_7322_IBCCtrlA_0_MaxPktLen_MSB 0x1F
+#define QIB_7322_IBCCtrlA_0_MaxPktLen_RMASK 0x7FF
+#define QIB_7322_IBCCtrlA_0_LinkCmd_LSB 0x13
+#define QIB_7322_IBCCtrlA_0_LinkCmd_MSB 0x14
+#define QIB_7322_IBCCtrlA_0_LinkCmd_RMASK 0x3
+#define QIB_7322_IBCCtrlA_0_LinkInitCmd_LSB 0x10
+#define QIB_7322_IBCCtrlA_0_LinkInitCmd_MSB 0x12
+#define QIB_7322_IBCCtrlA_0_LinkInitCmd_RMASK 0x7
+#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_LSB 0x8
+#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_MSB 0xF
+#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_RMASK 0xFF
+#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_LSB 0x0
+#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_MSB 0x7
+#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_RMASK 0xFF
+
+#define QIB_7322_IBCCtrlB_0_OFFS 0x1568
+#define QIB_7322_IBCCtrlB_0_DEF 0x00000000000305FF
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_LSB 0x30
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_MSB 0x3F
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_RMASK 0xFFFF
+#define QIB_7322_IBCCtrlB_0_IB_DLID_LSB 0x20
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MSB 0x2F
+#define QIB_7322_IBCCtrlB_0_IB_DLID_RMASK 0xFFFF
+#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_LSB 0x1B
+#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_MSB 0x1B
+#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_LSB 0x1A
+#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_MSB 0x1A
+#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_LSB 0x12
+#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_MSB 0x19
+#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_RMASK 0xFF
+#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_LSB 0x11
+#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_MSB 0x11
+#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_LSB 0x10
+#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_MSB 0x10
+#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_DDS_LSB 0xC
+#define QIB_7322_IBCCtrlB_0_SD_DDS_MSB 0xF
+#define QIB_7322_IBCCtrlB_0_SD_DDS_RMASK 0xF
+#define QIB_7322_IBCCtrlB_0_SD_DDSV_LSB 0xB
+#define QIB_7322_IBCCtrlB_0_SD_DDSV_MSB 0xB
+#define QIB_7322_IBCCtrlB_0_SD_DDSV_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_LSB 0xA
+#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_MSB 0xA
+#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_LSB 0x9
+#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_MSB 0x9
+#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_LSB 0x8
+#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_MSB 0x8
+#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_LSB 0x7
+#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_MSB 0x7
+#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_LSB 0x5
+#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_MSB 0x6
+#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_RMASK 0x3
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_LSB 0x4
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_MSB 0x4
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_LSB 0x3
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_MSB 0x3
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_LSB 0x2
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_MSB 0x2
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_LSB 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_MSB 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_LSB 0x0
+#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_MSB 0x0
+#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_RMASK 0x1
+
+#define QIB_7322_IBCCtrlC_0_OFFS 0x1570
+#define QIB_7322_IBCCtrlC_0_DEF 0x0000000000000301
+#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_LSB 0x5
+#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_MSB 0x9
+#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_RMASK 0x1F
+#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_LSB 0x0
+#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_MSB 0x4
+#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_RMASK 0x1F
+
+#define QIB_7322_HRTBT_GUID_0_OFFS 0x1588
+#define QIB_7322_HRTBT_GUID_0_DEF 0x0000000000000000
+
+#define QIB_7322_IB_SDTEST_IF_TX_0_OFFS 0x1590
+#define QIB_7322_IB_SDTEST_IF_TX_0_DEF 0x0000000000000000
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_LSB 0x30
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_MSB 0x3F
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_LSB 0x20
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_MSB 0x2F
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_LSB 0xD
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_MSB 0xF
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_RMASK 0x7
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_LSB 0xB
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_MSB 0xC
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_RMASK 0x3
+#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_LSB 0x4
+#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_MSB 0x4
+#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_RMASK 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_LSB 0x2
+#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_MSB 0x3
+#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_RMASK 0x3
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_LSB 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_MSB 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_RMASK 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_LSB 0x0
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_MSB 0x0
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_RMASK 0x1
+
+#define QIB_7322_IB_SDTEST_IF_RX_0_OFFS 0x1598
+#define QIB_7322_IB_SDTEST_IF_RX_0_DEF 0x0000000000000000
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_LSB 0x30
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_MSB 0x3F
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_LSB 0x20
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_MSB 0x2F
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_LSB 0x18
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_MSB 0x1F
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_RMASK 0xFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_LSB 0x10
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_MSB 0x17
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_RMASK 0xFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_LSB 0x1
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_MSB 0x1
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_RMASK 0x1
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_LSB 0x0
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_MSB 0x0
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_RMASK 0x1
+
+#define QIB_7322_IBNCModeCtrl_0_OFFS 0x15B8
+#define QIB_7322_IBNCModeCtrl_0_DEF 0x0000000000000000
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_LSB 0x22
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_MSB 0x22
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_LSB 0x21
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_MSB 0x21
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_LSB 0x20
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_MSB 0x20
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_LSB 0x11
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_MSB 0x19
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_RMASK 0x1FF
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_LSB 0x8
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_MSB 0x10
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_RMASK 0x1FF
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_LSB 0x2
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_MSB 0x2
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_LSB 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_MSB 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_LSB 0x0
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_MSB 0x0
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_RMASK 0x1
+
+#define QIB_7322_IBSerdesStatus_0_OFFS 0x15D0
+#define QIB_7322_IBSerdesStatus_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBPCSConfig_0_OFFS 0x15D8
+#define QIB_7322_IBPCSConfig_0_DEF 0x0000000000000007
+#define QIB_7322_IBPCSConfig_0_link_sync_mask_LSB 0x9
+#define QIB_7322_IBPCSConfig_0_link_sync_mask_MSB 0x12
+#define QIB_7322_IBPCSConfig_0_link_sync_mask_RMASK 0x3FF
+#define QIB_7322_IBPCSConfig_0_xcv_rreset_LSB 0x2
+#define QIB_7322_IBPCSConfig_0_xcv_rreset_MSB 0x2
+#define QIB_7322_IBPCSConfig_0_xcv_rreset_RMASK 0x1
+#define QIB_7322_IBPCSConfig_0_xcv_treset_LSB 0x1
+#define QIB_7322_IBPCSConfig_0_xcv_treset_MSB 0x1
+#define QIB_7322_IBPCSConfig_0_xcv_treset_RMASK 0x1
+#define QIB_7322_IBPCSConfig_0_tx_rx_reset_LSB 0x0
+#define QIB_7322_IBPCSConfig_0_tx_rx_reset_MSB 0x0
+#define QIB_7322_IBPCSConfig_0_tx_rx_reset_RMASK 0x1
+
+#define QIB_7322_IBSerdesCtrl_0_OFFS 0x15E0
+#define QIB_7322_IBSerdesCtrl_0_DEF 0x0000000000FFA00F
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_LSB 0x1A
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_MSB 0x1A
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_LSB 0x19
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_MSB 0x19
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_LSB 0x18
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_MSB 0x18
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_LSB 0x14
+#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_MSB 0x17
+#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_RMASK 0xF
+#define QIB_7322_IBSerdesCtrl_0_CGMODE_LSB 0x10 +#define QIB_7322_IBSerdesCtrl_0_CGMODE_MSB 0x13 +#define QIB_7322_IBSerdesCtrl_0_CGMODE_RMASK 0xF +#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_LSB 0xF +#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_MSB 0xF +#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_LSB 0xD +#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_MSB 0xD +#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_LPEN_LSB 0xC +#define QIB_7322_IBSerdesCtrl_0_LPEN_MSB 0xC +#define QIB_7322_IBSerdesCtrl_0_LPEN_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_PLLPD_LSB 0xB +#define QIB_7322_IBSerdesCtrl_0_PLLPD_MSB 0xB +#define QIB_7322_IBSerdesCtrl_0_PLLPD_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_TXPD_LSB 0xA +#define QIB_7322_IBSerdesCtrl_0_TXPD_MSB 0xA +#define QIB_7322_IBSerdesCtrl_0_TXPD_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_RXPD_LSB 0x9 +#define QIB_7322_IBSerdesCtrl_0_RXPD_MSB 0x9 +#define QIB_7322_IBSerdesCtrl_0_RXPD_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_TXIDLE_LSB 0x8 +#define QIB_7322_IBSerdesCtrl_0_TXIDLE_MSB 0x8 +#define QIB_7322_IBSerdesCtrl_0_TXIDLE_RMASK 0x1 +#define QIB_7322_IBSerdesCtrl_0_CMODE_LSB 0x0 +#define QIB_7322_IBSerdesCtrl_0_CMODE_MSB 0x6 +#define QIB_7322_IBSerdesCtrl_0_CMODE_RMASK 0x7F + +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_OFFS 0x1600 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_DEF 0x0000000000000000 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_LSB 0x1F +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_MSB 0x1F +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_RMASK 0x1 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_LSB 0x1E +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_MSB 0x1E +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_RMASK 0x1 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_LSB 0xE +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_MSB 0x11 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_RMASK 0xF +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_LSB 0x9 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_MSB 0xD +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_RMASK 0x1F +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_LSB 0x5 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_MSB 0x8 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_RMASK 0xF +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_LSB 0x3 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_MSB 0x4 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_RMASK 0x3 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_LSB 0x0 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_MSB 0x2 +#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_RMASK 0x7 + +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_OFFS 0x1640 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_MSB 
0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_OFFS 0x1648 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_MSB 0x25 +#define 
QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_OFFS 0x1650 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_RMASK 0x1 +#define 
QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_OFFS 0x1658 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_LSB 0x22 +#define 
QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_OFFS 0x1660 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_MSB 0x21 +#define 
QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_OFFS 0x1668 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_DEF 0x0000000000000000 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_LSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_MSB 0x27 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_LSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_MSB 0x26 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_LSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_MSB 0x25 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_LSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_MSB 0x24 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_LSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_MSB 0x23 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_LSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_MSB 0x22 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_LSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_MSB 0x21 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_RMASK 0x1 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_LSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_MSB 0x20 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_RMASK 0x1 +#define 
QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_LSB 0x18 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_MSB 0x1F +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_LSB 0x10 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_MSB 0x17 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_LSB 0x8 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_MSB 0xF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_RMASK 0xFF +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_LSB 0x0 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_MSB 0x7 +#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_RMASK 0xFF + +#define QIB_7322_ADAPT_DISABLE_TIMER_THRESHOLD_0_OFFS 0x1670 +#define QIB_7322_ADAPT_DISABLE_TIMER_THRESHOLD_0_DEF 0x0000000000000000 + +#define QIB_7322_HighPriorityLimit_0_OFFS 0x1BC0 +#define QIB_7322_HighPriorityLimit_0_DEF 0x0000000000000000 +#define QIB_7322_HighPriorityLimit_0_Limit_LSB 0x0 +#define QIB_7322_HighPriorityLimit_0_Limit_MSB 0x7 +#define QIB_7322_HighPriorityLimit_0_Limit_RMASK 0xFF + +#define QIB_7322_LowPriority0_0_OFFS 0x1C00 +#define QIB_7322_LowPriority0_0_DEF 0x0000000000000000 +#define QIB_7322_LowPriority0_0_VirtualLane_LSB 0x10 +#define QIB_7322_LowPriority0_0_VirtualLane_MSB 0x12 +#define QIB_7322_LowPriority0_0_VirtualLane_RMASK 0x7 +#define QIB_7322_LowPriority0_0_Weight_LSB 0x0 +#define QIB_7322_LowPriority0_0_Weight_MSB 0x7 +#define QIB_7322_LowPriority0_0_Weight_RMASK 0xFF + +#define QIB_7322_HighPriority0_0_OFFS 0x1E00 +#define QIB_7322_HighPriority0_0_DEF 0x0000000000000000 +#define QIB_7322_HighPriority0_0_VirtualLane_LSB 0x10 +#define QIB_7322_HighPriority0_0_VirtualLane_MSB 0x12 +#define QIB_7322_HighPriority0_0_VirtualLane_RMASK 0x7 +#define QIB_7322_HighPriority0_0_Weight_LSB 0x0 +#define QIB_7322_HighPriority0_0_Weight_MSB 0x7 +#define QIB_7322_HighPriority0_0_Weight_RMASK 0xFF + +#define QIB_7322_CntrRegBase_1_OFFS 0x2028 +#define QIB_7322_CntrRegBase_1_DEF 0x0000000000013000 + +#define QIB_7322_RcvQPMulticastContext_1_OFFS 0x2170 + +#define QIB_7322_SendCtrl_1_OFFS 0x21C0 + +#define QIB_7322_SendBufAvail0_OFFS 0x3000 +#define QIB_7322_SendBufAvail0_DEF 0x0000000000000000 +#define QIB_7322_SendBufAvail0_SendBuf_31_0_LSB 0x0 +#define QIB_7322_SendBufAvail0_SendBuf_31_0_MSB 0x3F +#define QIB_7322_SendBufAvail0_SendBuf_31_0_RMASK 0x0 + +#define QIB_7322_MsixTable_OFFS 0x8000 +#define QIB_7322_MsixTable_DEF 0x0000000000000000 + +#define QIB_7322_MsixPba_OFFS 0x9000 +#define QIB_7322_MsixPba_DEF 0x0000000000000000 + +#define QIB_7322_LAMemory_OFFS 0xA000 +#define QIB_7322_LAMemory_DEF 0x0000000000000000 + +#define QIB_7322_LBIntCnt_OFFS 0x11000 +#define QIB_7322_LBIntCnt_DEF 0x0000000000000000 + +#define QIB_7322_LBFlowStallCnt_OFFS 0x11008 +#define QIB_7322_LBFlowStallCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxTIDFullErrCnt_OFFS 0x110D0 +#define QIB_7322_RxTIDFullErrCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxTIDValidErrCnt_OFFS 0x110D8 +#define QIB_7322_RxTIDValidErrCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxP0HdrEgrOvflCnt_OFFS 0x110E8 +#define QIB_7322_RxP0HdrEgrOvflCnt_DEF 0x0000000000000000 + +#define QIB_7322_PcieRetryBufDiagQwordCnt_OFFS 
0x111A0 +#define QIB_7322_PcieRetryBufDiagQwordCnt_DEF 0x0000000000000000 + +#define QIB_7322_RxTidFlowDropCnt_OFFS 0x111E0 +#define QIB_7322_RxTidFlowDropCnt_DEF 0x0000000000000000 + +#define QIB_7322_LBIntCnt_0_OFFS 0x12000 +#define QIB_7322_LBIntCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxCreditUpToDateTimeOut_0_OFFS 0x12008 +#define QIB_7322_TxCreditUpToDateTimeOut_0_DEF 0x0000000000000000 + +#define QIB_7322_TxSDmaDescCnt_0_OFFS 0x12010 +#define QIB_7322_TxSDmaDescCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxUnsupVLErrCnt_0_OFFS 0x12018 +#define QIB_7322_TxUnsupVLErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxDataPktCnt_0_OFFS 0x12020 +#define QIB_7322_TxDataPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowPktCnt_0_OFFS 0x12028 +#define QIB_7322_TxFlowPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxDwordCnt_0_OFFS 0x12030 +#define QIB_7322_TxDwordCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxLenErrCnt_0_OFFS 0x12038 +#define QIB_7322_TxLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxMaxMinLenErrCnt_0_OFFS 0x12040 +#define QIB_7322_TxMaxMinLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxUnderrunCnt_0_OFFS 0x12048 +#define QIB_7322_TxUnderrunCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowStallCnt_0_OFFS 0x12050 +#define QIB_7322_TxFlowStallCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxDroppedPktCnt_0_OFFS 0x12058 +#define QIB_7322_TxDroppedPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDroppedPktCnt_0_OFFS 0x12060 +#define QIB_7322_RxDroppedPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDataPktCnt_0_OFFS 0x12068 +#define QIB_7322_RxDataPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowPktCnt_0_OFFS 0x12070 +#define QIB_7322_RxFlowPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDwordCnt_0_OFFS 0x12078 +#define QIB_7322_RxDwordCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLenErrCnt_0_OFFS 0x12080 +#define QIB_7322_RxLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxMaxMinLenErrCnt_0_OFFS 0x12088 +#define QIB_7322_RxMaxMinLenErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxICRCErrCnt_0_OFFS 0x12090 +#define QIB_7322_RxICRCErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVCRCErrCnt_0_OFFS 0x12098 +#define QIB_7322_RxVCRCErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowCtrlViolCnt_0_OFFS 0x120A0 +#define QIB_7322_RxFlowCtrlViolCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVersionErrCnt_0_OFFS 0x120A8 +#define QIB_7322_RxVersionErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLinkMalformCnt_0_OFFS 0x120B0 +#define QIB_7322_RxLinkMalformCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxEBPCnt_0_OFFS 0x120B8 +#define QIB_7322_RxEBPCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLPCRCErrCnt_0_OFFS 0x120C0 +#define QIB_7322_RxLPCRCErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxBufOvflCnt_0_OFFS 0x120C8 +#define QIB_7322_RxBufOvflCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxLenTruncateCnt_0_OFFS 0x120D0 +#define QIB_7322_RxLenTruncateCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxPKeyMismatchCnt_0_OFFS 0x120E0 +#define QIB_7322_RxPKeyMismatchCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_IBLinkDownedCnt_0_OFFS 0x12180 +#define QIB_7322_IBLinkDownedCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_IBSymbolErrCnt_0_OFFS 0x12188 +#define QIB_7322_IBSymbolErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_IBStatusChangeCnt_0_OFFS 0x12190 +#define QIB_7322_IBStatusChangeCnt_0_DEF 0x0000000000000000 + +#define 
QIB_7322_IBLinkErrRecoveryCnt_0_OFFS 0x12198 +#define QIB_7322_IBLinkErrRecoveryCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_ExcessBufferOvflCnt_0_OFFS 0x121A8 +#define QIB_7322_ExcessBufferOvflCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_LocalLinkIntegrityErrCnt_0_OFFS 0x121B0 +#define QIB_7322_LocalLinkIntegrityErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVlErrCnt_0_OFFS 0x121B8 +#define QIB_7322_RxVlErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxDlidFltrCnt_0_OFFS 0x121C0 +#define QIB_7322_RxDlidFltrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxVL15DroppedPktCnt_0_OFFS 0x121C8 +#define QIB_7322_RxVL15DroppedPktCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxOtherLocalPhyErrCnt_0_OFFS 0x121D0 +#define QIB_7322_RxOtherLocalPhyErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_RxQPInvalidContextCnt_0_OFFS 0x121D8 +#define QIB_7322_RxQPInvalidContextCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_TxHeadersErrCnt_0_OFFS 0x121F8 +#define QIB_7322_TxHeadersErrCnt_0_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvDataCount_0_OFFS 0x12218 +#define QIB_7322_PSRcvDataCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvPktsCount_0_OFFS 0x12220 +#define QIB_7322_PSRcvPktsCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitDataCount_0_OFFS 0x12228 +#define QIB_7322_PSXmitDataCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitPktsCount_0_OFFS 0x12230 +#define QIB_7322_PSXmitPktsCount_0_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitWaitCount_0_OFFS 0x12238 +#define QIB_7322_PSXmitWaitCount_0_DEF 0x0000000000000000 + +#define QIB_7322_LBIntCnt_1_OFFS 0x13000 +#define QIB_7322_LBIntCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxCreditUpToDateTimeOut_1_OFFS 0x13008 +#define QIB_7322_TxCreditUpToDateTimeOut_1_DEF 0x0000000000000000 + +#define QIB_7322_TxSDmaDescCnt_1_OFFS 0x13010 +#define QIB_7322_TxSDmaDescCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxUnsupVLErrCnt_1_OFFS 0x13018 +#define QIB_7322_TxUnsupVLErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxDataPktCnt_1_OFFS 0x13020 +#define QIB_7322_TxDataPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowPktCnt_1_OFFS 0x13028 +#define QIB_7322_TxFlowPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxDwordCnt_1_OFFS 0x13030 +#define QIB_7322_TxDwordCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxLenErrCnt_1_OFFS 0x13038 +#define QIB_7322_TxLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxMaxMinLenErrCnt_1_OFFS 0x13040 +#define QIB_7322_TxMaxMinLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxUnderrunCnt_1_OFFS 0x13048 +#define QIB_7322_TxUnderrunCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxFlowStallCnt_1_OFFS 0x13050 +#define QIB_7322_TxFlowStallCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxDroppedPktCnt_1_OFFS 0x13058 +#define QIB_7322_TxDroppedPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDroppedPktCnt_1_OFFS 0x13060 +#define QIB_7322_RxDroppedPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDataPktCnt_1_OFFS 0x13068 +#define QIB_7322_RxDataPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowPktCnt_1_OFFS 0x13070 +#define QIB_7322_RxFlowPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDwordCnt_1_OFFS 0x13078 +#define QIB_7322_RxDwordCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLenErrCnt_1_OFFS 0x13080 +#define QIB_7322_RxLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxMaxMinLenErrCnt_1_OFFS 0x13088 +#define QIB_7322_RxMaxMinLenErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxICRCErrCnt_1_OFFS 0x13090 
+#define QIB_7322_RxICRCErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVCRCErrCnt_1_OFFS 0x13098 +#define QIB_7322_RxVCRCErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxFlowCtrlViolCnt_1_OFFS 0x130A0 +#define QIB_7322_RxFlowCtrlViolCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVersionErrCnt_1_OFFS 0x130A8 +#define QIB_7322_RxVersionErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLinkMalformCnt_1_OFFS 0x130B0 +#define QIB_7322_RxLinkMalformCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxEBPCnt_1_OFFS 0x130B8 +#define QIB_7322_RxEBPCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLPCRCErrCnt_1_OFFS 0x130C0 +#define QIB_7322_RxLPCRCErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxBufOvflCnt_1_OFFS 0x130C8 +#define QIB_7322_RxBufOvflCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxLenTruncateCnt_1_OFFS 0x130D0 +#define QIB_7322_RxLenTruncateCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxPKeyMismatchCnt_1_OFFS 0x130E0 +#define QIB_7322_RxPKeyMismatchCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBLinkDownedCnt_1_OFFS 0x13180 +#define QIB_7322_IBLinkDownedCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBSymbolErrCnt_1_OFFS 0x13188 +#define QIB_7322_IBSymbolErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBStatusChangeCnt_1_OFFS 0x13190 +#define QIB_7322_IBStatusChangeCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_IBLinkErrRecoveryCnt_1_OFFS 0x13198 +#define QIB_7322_IBLinkErrRecoveryCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_ExcessBufferOvflCnt_1_OFFS 0x131A8 +#define QIB_7322_ExcessBufferOvflCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_LocalLinkIntegrityErrCnt_1_OFFS 0x131B0 +#define QIB_7322_LocalLinkIntegrityErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVlErrCnt_1_OFFS 0x131B8 +#define QIB_7322_RxVlErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxDlidFltrCnt_1_OFFS 0x131C0 +#define QIB_7322_RxDlidFltrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxVL15DroppedPktCnt_1_OFFS 0x131C8 +#define QIB_7322_RxVL15DroppedPktCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxOtherLocalPhyErrCnt_1_OFFS 0x131D0 +#define QIB_7322_RxOtherLocalPhyErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_RxQPInvalidContextCnt_1_OFFS 0x131D8 +#define QIB_7322_RxQPInvalidContextCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_TxHeadersErrCnt_1_OFFS 0x131F8 +#define QIB_7322_TxHeadersErrCnt_1_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvDataCount_1_OFFS 0x13218 +#define QIB_7322_PSRcvDataCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSRcvPktsCount_1_OFFS 0x13220 +#define QIB_7322_PSRcvPktsCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitDataCount_1_OFFS 0x13228 +#define QIB_7322_PSXmitDataCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitPktsCount_1_OFFS 0x13230 +#define QIB_7322_PSXmitPktsCount_1_DEF 0x0000000000000000 + +#define QIB_7322_PSXmitWaitCount_1_OFFS 0x13238 +#define QIB_7322_PSXmitWaitCount_1_DEF 0x0000000000000000 + +#define QIB_7322_RcvEgrArray_OFFS 0x14000 +#define QIB_7322_RcvEgrArray_DEF 0x0000000000000000 +#define QIB_7322_RcvEgrArray_RT_BufSize_LSB 0x25 +#define QIB_7322_RcvEgrArray_RT_BufSize_MSB 0x27 +#define QIB_7322_RcvEgrArray_RT_BufSize_RMASK 0x7 +#define QIB_7322_RcvEgrArray_RT_Addr_LSB 0x0 +#define QIB_7322_RcvEgrArray_RT_Addr_MSB 0x24 +#define QIB_7322_RcvEgrArray_RT_Addr_RMASK 0x1FFFFFFFFF + +#define QIB_7322_RcvTIDArray0_OFFS 0x50000 +#define QIB_7322_RcvTIDArray0_DEF 0x0000000000000000 +#define QIB_7322_RcvTIDArray0_RT_BufSize_LSB 0x25 +#define QIB_7322_RcvTIDArray0_RT_BufSize_MSB 0x27 
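For illustration only, not part of the patch: two conventions in the register block above are easy to miss in the wall of defines. Each multi-bit field is published as an _LSB/_MSB/_RMASK triplet -- the bit index of its least and most significant bits plus the right-justified mask -- so a field is recovered as (reg >> LSB) & RMASK. The per-port counter registers also repeat at a fixed 0x1000 stride (port 0 at 0x12000, port 1 at 0x13000). The macro names below are invented for this sketch; the driver has its own wrappers around these symbols.

/* Read a field by shifting it down to bit 0 and masking. */
#define QIB_FIELD_GET(regval, field) \
	(((regval) >> field##_LSB) & field##_RMASK)

/* e.g. the buffer-size code of an eager-buffer table entry (bits 39:37) */
#define EXAMPLE_EGR_BUFSIZE(entry) \
	QIB_FIELD_GET(entry, QIB_7322_RcvEgrArray_RT_BufSize)

/* Per-port counter stride: QIB_7322_TxDataPktCnt_0_OFFS + 0x1000 is
 * QIB_7322_TxDataPktCnt_1_OFFS, and likewise for the other counters. */
#define QIB_7322_PORT_CNTR_STRIDE \
	(QIB_7322_LBIntCnt_1_OFFS - QIB_7322_LBIntCnt_0_OFFS)	/* 0x1000 */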
+#define QIB_7322_RcvTIDArray0_RT_BufSize_RMASK 0x7 +#define QIB_7322_RcvTIDArray0_RT_Addr_LSB 0x0 +#define QIB_7322_RcvTIDArray0_RT_Addr_MSB 0x24 +#define QIB_7322_RcvTIDArray0_RT_Addr_RMASK 0x1FFFFFFFFF + +#define QIB_7322_IBSD_DDS_MAP_TABLE_0_OFFS 0xD0000 +#define QIB_7322_IBSD_DDS_MAP_TABLE_0_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrTail0_OFFS 0x200000 +#define QIB_7322_RcvHdrTail0_DEF 0x0000000000000000 + +#define QIB_7322_RcvHdrHead0_OFFS 0x200008 +#define QIB_7322_RcvHdrHead0_DEF 0x0000000000000000 +#define QIB_7322_RcvHdrHead0_counter_LSB 0x20 +#define QIB_7322_RcvHdrHead0_counter_MSB 0x2F +#define QIB_7322_RcvHdrHead0_counter_RMASK 0xFFFF +#define QIB_7322_RcvHdrHead0_RcvHeadPointer_LSB 0x0 +#define QIB_7322_RcvHdrHead0_RcvHeadPointer_MSB 0x1F +#define QIB_7322_RcvHdrHead0_RcvHeadPointer_RMASK 0xFFFFFFFF + +#define QIB_7322_RcvEgrIndexTail0_OFFS 0x200010 +#define QIB_7322_RcvEgrIndexTail0_DEF 0x0000000000000000 + +#define QIB_7322_RcvEgrIndexHead0_OFFS 0x200018 +#define QIB_7322_RcvEgrIndexHead0_DEF 0x0000000000000000 + +#define QIB_7322_RcvTIDFlowTable0_OFFS 0x201000 +#define QIB_7322_RcvTIDFlowTable0_DEF 0x0000000000000000 +#define QIB_7322_RcvTIDFlowTable0_GenMismatch_LSB 0x1C +#define QIB_7322_RcvTIDFlowTable0_GenMismatch_MSB 0x1C +#define QIB_7322_RcvTIDFlowTable0_GenMismatch_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_LSB 0x1B +#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_MSB 0x1B +#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_LSB 0x16 +#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_MSB 0x16 +#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_LSB 0x15 +#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_MSB 0x15 +#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_LSB 0x14 +#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_MSB 0x14 +#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_FlowValid_LSB 0x13 +#define QIB_7322_RcvTIDFlowTable0_FlowValid_MSB 0x13 +#define QIB_7322_RcvTIDFlowTable0_FlowValid_RMASK 0x1 +#define QIB_7322_RcvTIDFlowTable0_GenVal_LSB 0xB +#define QIB_7322_RcvTIDFlowTable0_GenVal_MSB 0x12 +#define QIB_7322_RcvTIDFlowTable0_GenVal_RMASK 0xFF +#define QIB_7322_RcvTIDFlowTable0_SeqNum_LSB 0x0 +#define QIB_7322_RcvTIDFlowTable0_SeqNum_MSB 0xA +#define QIB_7322_RcvTIDFlowTable0_SeqNum_RMASK 0x7FF diff --git a/drivers/infiniband/hw/qib/qib_common.h b/drivers/infiniband/hw/qib/qib_common.h new file mode 100644 index 0000000..a4a1f56 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_common.h @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QIB_COMMON_H +#define _QIB_COMMON_H + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +/* This is the IEEE-assigned OUI for QLogic Inc. QLogic_IB */ +#define QIB_SRC_OUI_1 0x00 +#define QIB_SRC_OUI_2 0x11 +#define QIB_SRC_OUI_3 0x75 + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * QIB_VERBOSE_TRACING define as 1 if you want additional tracing in + * fastpath code + * QIB_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in faspath code + * _QIB_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + */ + +/* + * The value in the BTH QP field that QLogic_IB uses to differentiate + * an qlogic_ib protocol IB packet vs standard IB transport + * This it needs to be even (0x656b78), because the LSB is sometimes + * used for the MSB of context. The change may cause a problem + * interoperating with older software. + */ +#define QIB_KD_QP 0x656b78 + +/* + * These are the status bits readable (in ascii form, 64bit value) + * from the "status" sysfs file. For binary compatibility, values + * must remain as is; removed states can be reused for different + * purposes. + */ +#define QIB_STATUS_INITTED 0x1 /* basic initialization done */ +/* Chip has been found and initted */ +#define QIB_STATUS_CHIP_PRESENT 0x20 +/* IB link is at ACTIVE, usable for data traffic */ +#define QIB_STATUS_IB_READY 0x40 +/* link is configured, LID, MTU, etc. have been set */ +#define QIB_STATUS_IB_CONF 0x80 +/* A Fatal hardware error has occurred. */ +#define QIB_STATUS_HWERROR 0x200 + +/* + * The list of usermode accessible registers. Also see Reg_* later in file. + */ +enum qib_ureg { + /* (RO) DMA RcvHdr to be used next. */ + ur_rcvhdrtail = 0, + /* (RW) RcvHdr entry to be processed next by host. */ + ur_rcvhdrhead = 1, + /* (RO) Index of next Eager index to use. */ + ur_rcvegrindextail = 2, + /* (RW) Eager TID to be processed next */ + ur_rcvegrindexhead = 3, + /* For internal use only; max register number. 
*/ + _QIB_UregMax +}; + +/* bit values for spi_runtime_flags */ +#define QIB_RUNTIME_PCIE 0x0002 +#define QIB_RUNTIME_FORCE_WC_ORDER 0x0004 +#define QIB_RUNTIME_RCVHDR_COPY 0x0008 +#define QIB_RUNTIME_MASTER 0x0010 +#define QIB_RUNTIME_RCHK 0x0020 +#define QIB_RUNTIME_NODMA_RTAIL 0x0080 +#define QIB_RUNTIME_SPECIAL_TRIGGER 0x0100 +#define QIB_RUNTIME_SDMA 0x0200 +#define QIB_RUNTIME_FORCE_PIOAVAIL 0x0400 +#define QIB_RUNTIME_PIO_REGSWAPPED 0x0800 +#define QIB_RUNTIME_CTXT_MSB_IN_QP 0x1000 +#define QIB_RUNTIME_CTXT_REDIRECT 0x2000 +#define QIB_RUNTIME_HDRSUPP 0x4000 + +/* + * This structure is returned by qib_userinit() immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explict pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct qib_base_info { + /* version of hardware, for feature checking. */ + __u32 spi_hw_version; + /* version of software, for feature checking. */ + __u32 spi_sw_version; + /* QLogic_IB context assigned, goes into sent packets */ + __u16 spi_ctxt; + __u16 spi_subctxt; + /* + * IB MTU, packets IB data must be less than this. + * The MTU is in bytes, and will be a multiple of 4 bytes. + */ + __u32 spi_mtu; + /* + * Size of a PIO buffer. Any given packet's total size must be less + * than this (in words). Included is the starting control word, so + * if 513 is returned, then total pkt size is 512 words or less. + */ + __u32 spi_piosize; + /* size of the TID cache in qlogic_ib, in entries */ + __u32 spi_tidcnt; + /* size of the TID Eager list in qlogic_ib, in entries */ + __u32 spi_tidegrcnt; + /* size of a single receive header queue entry in words. */ + __u32 spi_rcvhdrent_size; + /* + * Count of receive header queue entries allocated. + * This may be less than the spu_rcvhdrcnt passed in!. + */ + __u32 spi_rcvhdr_cnt; + + /* per-chip and other runtime features bitmap (QIB_RUNTIME_*) */ + __u32 spi_runtime_flags; + + /* address where hardware receive header queue is mapped */ + __u64 spi_rcvhdr_base; + + /* user program. */ + + /* base address of eager TID receive buffers used by hardware. */ + __u64 spi_rcv_egrbufs; + + /* Allocated by initialization code, not by protocol. */ + + /* + * Size of each TID buffer in host memory, starting at + * spi_rcv_egrbufs. The buffers are virtually contiguous. + */ + __u32 spi_rcv_egrbufsize; + /* + * The special QP (queue pair) value that identifies an qlogic_ib + * protocol packet from standard IB packets. More, probably much + * more, to be added. + */ + __u32 spi_qpair; + + /* + * User register base for init code, not to be used directly by + * protocol or applications. Always points to chip registers, + * for normal or shared context. + */ + __u64 spi_uregbase; + /* + * Maximum buffer size in bytes that can be used in a single TID + * entry (assuming the buffer is aligned to this boundary). This is + * the minimum of what the hardware and software support Guaranteed + * to be a power of 2. + */ + __u32 spi_tid_maxsize; + /* + * alignment of each pio send buffer (byte count + * to add to spi_piobufbase to get to second buffer) + */ + __u32 spi_pioalign; + /* + * The index of the first pio buffer available to this process; + * needed to do lookup in spi_pioavailaddr; not added to + * spi_piobufbase. 
+ */ + __u32 spi_pioindex; + /* number of buffers mapped for this process */ + __u32 spi_piocnt; + + /* + * Base address of writeonly pio buffers for this process. + * Each buffer has spi_piosize words, and is aligned on spi_pioalign + * boundaries. spi_piocnt buffers are mapped from this address + */ + __u64 spi_piobufbase; + + /* + * Base address of readonly memory copy of the pioavail registers. + * There are 2 bits for each buffer. + */ + __u64 spi_pioavailaddr; + + /* + * Address where driver updates a copy of the interface and driver + * status (QIB_STATUS_*) as a 64 bit value. It's followed by a + * link status qword (formerly combined with driver status), then a + * string indicating hardware error, if there was one. + */ + __u64 spi_status; + + /* number of chip ctxts available to user processes */ + __u32 spi_nctxts; + __u16 spi_unit; /* unit number of chip we are using */ + __u16 spi_port; /* IB port number we are using */ + /* num bufs in each contiguous set */ + __u32 spi_rcv_egrperchunk; + /* size in bytes of each contiguous set */ + __u32 spi_rcv_egrchunksize; + /* total size of mmap to cover full rcvegrbuffers */ + __u32 spi_rcv_egrbuftotlen; + __u32 spi_rhf_offset; /* dword offset in hdrqent for rcvhdr flags */ + /* address of readonly memory copy of the rcvhdrq tail register. */ + __u64 spi_rcvhdr_tailaddr; + + /* + * shared memory pages for subctxts if ctxt is shared; these cover + * all the processes in the group sharing a single context. + * all have enough space for the num_subcontexts value on this job. + */ + __u64 spi_subctxt_uregbase; + __u64 spi_subctxt_rcvegrbuf; + __u64 spi_subctxt_rcvhdr_base; + + /* shared memory page for send buffer disarm status */ + __u64 spi_sendbuf_status; +} __aligned(8); + +/* + * This version number is given to the driver by the user code during + * initialization in the spu_userversion field of qib_user_info, so + * the driver can check for compatibility with user code. + * + * The major version changes when data structures + * change in an incompatible way. The driver must be the same or higher + * for initialization to succeed. In some cases, a higher version + * driver will not interoperate with older software, and initialization + * will return an error. + */ +#define QIB_USER_SWMAJOR 1 + +/* + * Minor version differences are always compatible + * a within a major version, however if user software is larger + * than driver software, some new features and/or structure fields + * may not be implemented; the user code must deal with this if it + * cares, or it must abort after initialization reports the difference. + */ +#define QIB_USER_SWMINOR 13 + +#define QIB_USER_SWVERSION ((QIB_USER_SWMAJOR << 16) | QIB_USER_SWMINOR) + +#ifndef QIB_KERN_TYPE +#define QIB_KERN_TYPE 0 +#endif + +/* + * Similarly, this is the kernel version going back to the user. It's + * slightly different, in that we want to tell if the driver was built as + * part of a QLogic release, or from the driver from openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of qib_base_info, so the user code can in turn + * check for compatibility with the kernel. +*/ +#define QIB_KERN_SWVERSION ((QIB_KERN_TYPE << 31) | QIB_USER_SWVERSION) + +/* + * Define the driver version number. 
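For illustration only, not part of the patch: QIB_USER_SWVERSION packs the major number into bits 31:16 and the minor into bits 15:0, and QIB_KERN_SWVERSION additionally carries the QIB_KERN_TYPE ("built by QLogic") flag in bit 31. The sketch below only mirrors the policy the comments above describe (the driver's real check may be stricter); the function name is invented, and it assumes the same <linux/types.h> types this header already relies on.

static inline int example_qib_version_check(__u32 spu_userversion,
					    __u32 spi_sw_version,
					    int *features_may_be_missing)
{
	__u32 user_major = spu_userversion >> 16;
	__u32 user_minor = spu_userversion & 0xffff;
	__u32 kern_major = (spi_sw_version >> 16) & 0x7fff; /* drop QIB_KERN_TYPE bit */
	__u32 kern_minor = spi_sw_version & 0xffff;

	if (kern_major < user_major)
		return 0;	/* data-structure layouts do not match */
	/* Same or newer major: compatible.  If the user library is newer
	 * than the driver, some of its features may be unimplemented. */
	*features_may_be_missing = kern_minor < user_minor;
	return 1;
}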
This is something that refers only + * to the driver itself, not the software interfaces it supports. + */ +#define QIB_DRIVER_VERSION_BASE "1.11" + +/* create the final driver version string */ +#ifdef QIB_IDSTR +#define QIB_DRIVER_VERSION QIB_DRIVER_VERSION_BASE " " QIB_IDSTR +#else +#define QIB_DRIVER_VERSION QIB_DRIVER_VERSION_BASE +#endif + +/* + * If the unit is specified via open, HCA choice is fixed. If port is + * specified, it's also fixed. Otherwise we try to spread contexts + * across ports and HCAs, using different algorithims. WITHIN is + * the old default, prior to this mechanism. + */ +#define QIB_PORT_ALG_ACROSS 0 /* round robin contexts across HCAs, then + * ports; this is the default */ +#define QIB_PORT_ALG_WITHIN 1 /* use all contexts on an HCA (round robin + * active ports within), then next HCA */ +#define QIB_PORT_ALG_COUNT 2 /* number of algorithm choices */ + +/* + * This structure is passed to qib_userinit() to tell the driver where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct qib_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to QIB_USER_SWVERSION. + */ + __u32 spu_userversion; + + __u32 _spu_unused2; + + /* size of struct base_info to write to */ + __u32 spu_base_info_size; + + __u32 spu_port_alg; /* which QIB_PORT_ALG_*; unused user minor < 11 */ + + /* + * If two or more processes wish to share a context, each process + * must set the spu_subctxt_cnt and spu_subctxt_id to the same + * values. The only restriction on the spu_subctxt_id is that + * it be unique for a given node. + */ + __u16 spu_subctxt_cnt; + __u16 spu_subctxt_id; + + __u32 spu_port; /* IB port requested by user if > 0 */ + + /* + * address of struct base_info to write to + */ + __u64 spu_base_info; + +} __aligned(8); + +/* User commands. */ + +/* 16 available, was: old set up userspace (for old user code) */ +#define QIB_CMD_CTXT_INFO 17 /* find out what resources we got */ +#define QIB_CMD_RECV_CTRL 18 /* control receipt of packets */ +#define QIB_CMD_TID_UPDATE 19 /* update expected TID entries */ +#define QIB_CMD_TID_FREE 20 /* free expected TID entries */ +#define QIB_CMD_SET_PART_KEY 21 /* add partition key */ +/* 22 available, was: return info on slave processes (for old user code) */ +#define QIB_CMD_ASSIGN_CTXT 23 /* allocate HCA and ctxt */ +#define QIB_CMD_USER_INIT 24 /* set up userspace */ +#define QIB_CMD_UNUSED_1 25 +#define QIB_CMD_UNUSED_2 26 +#define QIB_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */ +#define QIB_CMD_POLL_TYPE 28 /* set the kind of polling we want */ +#define QIB_CMD_ARMLAUNCH_CTRL 29 /* armlaunch detection control */ +/* 30 is unused */ +#define QIB_CMD_SDMA_INFLIGHT 31 /* sdma inflight counter request */ +#define QIB_CMD_SDMA_COMPLETE 32 /* sdma completion counter request */ +/* 33 available, was a testing feature */ +#define QIB_CMD_DISARM_BUFS 34 /* disarm send buffers w/ errors */ +#define QIB_CMD_ACK_EVENT 35 /* ack & clear bits */ +#define QIB_CMD_CPUS_LIST 36 /* list of cpus allocated, for pinned + * processes: qib_cpus_list */ + +/* + * QIB_CMD_ACK_EVENT obsoletes QIB_CMD_DISARM_BUFS, but we keep it for + * compatibility with libraries from previous release. The ACK_EVENT + * will take appropriate driver action (if any, just DISARM for now), + * then clear the bits passed in as part of the mask. 
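For illustration only, not part of the patch: a sketch of how a group of cooperating processes would fill struct qib_user_info when they want to share one context, following the comments above (every member passes the same spu_subctxt_cnt/spu_subctxt_id pair, and spu_base_info points at a qib_base_info the driver fills in). The function name is invented, and the struct is assumed to be handed to the driver as the user_info payload of the context-assignment commands listed above.

#include <stdint.h>
#include <string.h>

static void example_fill_user_info(struct qib_user_info *ui,
				   struct qib_base_info *bi,
				   __u16 group_size, __u16 group_id)
{
	memset(ui, 0, sizeof(*ui));
	ui->spu_userversion = QIB_USER_SWVERSION;
	ui->spu_base_info_size = sizeof(*bi);	/* driver writes results here */
	ui->spu_base_info = (uintptr_t)bi;
	ui->spu_port_alg = QIB_PORT_ALG_ACROSS;	/* default: spread across HCAs */
	ui->spu_subctxt_cnt = group_size;	/* identical in every member */
	ui->spu_subctxt_id = group_id;		/* unique per node, shared by the group */
	ui->spu_port = 0;			/* 0 lets the driver pick a port */
}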
These bits are + * in the first 64bit word at spi_sendbuf_status, and are passed to + * the driver in the event_mask union as well. + */ +#define _QIB_EVENT_DISARM_BUFS_BIT 0 +#define _QIB_EVENT_LINKDOWN_BIT 1 +#define _QIB_EVENT_LID_CHANGE_BIT 2 +#define _QIB_EVENT_LMC_CHANGE_BIT 3 +#define _QIB_EVENT_SL2VL_CHANGE_BIT 4 +#define _QIB_MAX_EVENT_BIT _QIB_EVENT_SL2VL_CHANGE_BIT + +#define QIB_EVENT_DISARM_BUFS_BIT (1UL << _QIB_EVENT_DISARM_BUFS_BIT) +#define QIB_EVENT_LINKDOWN_BIT (1UL << _QIB_EVENT_LINKDOWN_BIT) +#define QIB_EVENT_LID_CHANGE_BIT (1UL << _QIB_EVENT_LID_CHANGE_BIT) +#define QIB_EVENT_LMC_CHANGE_BIT (1UL << _QIB_EVENT_LMC_CHANGE_BIT) +#define QIB_EVENT_SL2VL_CHANGE_BIT (1UL << _QIB_EVENT_SL2VL_CHANGE_BIT) + + +/* + * Poll types + */ +#define QIB_POLL_TYPE_ANYRCV 0x0 +#define QIB_POLL_TYPE_URGENT 0x1 + +struct qib_ctxt_info { + __u16 num_active; /* number of active units */ + __u16 unit; /* unit (chip) assigned to caller */ + __u16 port; /* IB port assigned to caller (1-based) */ + __u16 ctxt; /* ctxt on unit assigned to caller */ + __u16 subctxt; /* subctxt on unit assigned to caller */ + __u16 num_ctxts; /* number of ctxts available on unit */ + __u16 num_subctxts; /* number of subctxts opened on ctxt */ + __u16 rec_cpu; /* cpu # for affinity (ffff if none) */ +}; + +struct qib_tid_info { + __u32 tidcnt; + /* make structure same size in 32 and 64 bit */ + __u32 tid__unused; + /* virtual address of first page in transfer */ + __u64 tidvaddr; + /* pointer (same size 32/64 bit) to __u16 tid array */ + __u64 tidlist; + + /* + * pointer (same size 32/64 bit) to bitmap of TIDs used + * for this call; checked for being large enough at open + */ + __u64 tidmap; +}; + +struct qib_cmd { + __u32 type; /* command type */ + union { + struct qib_tid_info tid_info; + struct qib_user_info user_info; + + /* + * address in userspace where we should put the sdma + * inflight counter + */ + __u64 sdma_inflight; + /* + * address in userspace where we should put the sdma + * completion counter + */ + __u64 sdma_complete; + /* address in userspace of struct qib_ctxt_info to + write result to */ + __u64 ctxt_info; + /* enable/disable receipt of packets */ + __u32 recv_ctrl; + /* enable/disable armlaunch errors (non-zero to enable) */ + __u32 armlaunch_ctrl; + /* partition key to set */ + __u16 part_key; + /* user address of __u32 bitmask of active slaves */ + __u64 slave_mask_addr; + /* type of polling we want */ + __u16 poll_type; + /* back pressure enable bit for one particular context */ + __u8 ctxt_bp; + /* qib_user_event_ack(), IPATH_EVENT_* bits */ + __u64 event_mask; + } cmd; +}; + +struct qib_iovec { + /* Pointer to data, but same size 32 and 64 bit */ + __u64 iov_base; + + /* + * Length of data; don't need 64 bits, but want + * qib_sendpkt to remain same size as before 32 bit changes, so... + */ + __u64 iov_len; +}; + +/* + * Describes a single packet for send. Each packet can have one or more + * buffers, but the total length (exclusive of IB headers) must be less + * than the MTU, and if using the PIO method, entire packet length, + * including IB headers, must be less than the qib_piosize value (words). + * Use of this necessitates including sys/uio.h + */ +struct __qib_sendpkt { + __u32 sps_flags; /* flags for packet (TBD) */ + __u32 sps_cnt; /* number of entries to use in sps_iov */ + /* array of iov's describing packet. TEMPORARY */ + struct qib_iovec sps_iov[4]; +}; + +/* + * Diagnostics can send a packet by "writing" the following + * structs to the diag data special file. 
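For illustration only, not part of the patch: acknowledging events the way the comment above describes. The pending bits live in the first 64-bit word at spi_sendbuf_status, and the same bits are handed back to the driver in cmd.event_mask with QIB_CMD_ACK_EVENT. Submitting the command by write()ing a struct qib_cmd to the open device fd is an assumption about the transport, and the function name is invented for the sketch.

#include <stdint.h>
#include <string.h>
#include <unistd.h>

static int example_ack_events(int dev_fd, const struct qib_base_info *bi)
{
	volatile __u64 *events =
		(volatile __u64 *)(uintptr_t)bi->spi_sendbuf_status;
	__u64 pending = *events & (QIB_EVENT_DISARM_BUFS_BIT |
				   QIB_EVENT_LINKDOWN_BIT |
				   QIB_EVENT_LID_CHANGE_BIT |
				   QIB_EVENT_LMC_CHANGE_BIT |
				   QIB_EVENT_SL2VL_CHANGE_BIT);
	struct qib_cmd cmd;

	if (!pending)
		return 0;		/* nothing to acknowledge */

	memset(&cmd, 0, sizeof(cmd));
	cmd.type = QIB_CMD_ACK_EVENT;
	cmd.cmd.event_mask = pending;	/* driver disarms if needed, then clears */
	return write(dev_fd, &cmd, sizeof(cmd)) < 0 ? -1 : 0;
}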
+ * This allows a custom + * pbc (+ static rate) qword, so that special modes and deliberate + * changes to CRCs can be used. The elements were also re-ordered + * for better alignment and to avoid padding issues. + */ +#define _DIAG_XPKT_VERS 3 +struct qib_diag_xpkt { + __u16 version; + __u16 unit; + __u16 port; + __u16 len; + __u64 data; + __u64 pbc_wd; +}; + +/* + * Data layout in I2C flash (for GUID, etc.) + * All fields are little-endian binary unless otherwise stated + */ +#define QIB_FLASH_VERSION 2 +struct qib_flash { + /* flash layout version (QIB_FLASH_VERSION) */ + __u8 if_fversion; + /* checksum protecting if_length bytes */ + __u8 if_csum; + /* + * valid length (in use, protected by if_csum), including + * if_fversion and if_csum themselves) + */ + __u8 if_length; + /* the GUID, in network order */ + __u8 if_guid[8]; + /* number of GUIDs to use, starting from if_guid */ + __u8 if_numguid; + /* the (last 10 characters of) board serial number, in ASCII */ + char if_serial[12]; + /* board mfg date (YYYYMMDD ASCII) */ + char if_mfgdate[8]; + /* last board rework/test date (YYYYMMDD ASCII) */ + char if_testdate[8]; + /* logging of error counts, TBD */ + __u8 if_errcntp[4]; + /* powered on hours, updated at driver unload */ + __u8 if_powerhour[2]; + /* ASCII free-form comment field */ + char if_comment[32]; + /* Backwards compatible prefix for longer QLogic Serial Numbers */ + char if_sprefix[4]; + /* 82 bytes used, min flash size is 128 bytes */ + __u8 if_future[46]; +}; + +/* + * These are the counters implemented in the chip, and are listed in order. + * The InterCaps naming is taken straight from the chip spec. + */ +struct qlogic_ib_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 TxSDmaDescCnt; /* was Reserved1 */ + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 RxP9HdrEgrOvflCnt; + __u64 RxP10HdrEgrOvflCnt; + __u64 RxP11HdrEgrOvflCnt; + __u64 RxP12HdrEgrOvflCnt; + __u64 RxP13HdrEgrOvflCnt; + __u64 RxP14HdrEgrOvflCnt; + __u64 RxP15HdrEgrOvflCnt; + __u64 RxP16HdrEgrOvflCnt; + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; + __u64 RxVL15DroppedPktCnt; + __u64 RxOtherLocalPhyErrCnt; + __u64 PcieRetryBufDiagQwordCnt; + __u64 ExcessBufferOvflCnt; + __u64 LocalLinkIntegrityErrCnt; + __u64 RxVlErrCnt; + __u64 RxDlidFltrCnt; +}; + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software. 
+ */ + +/* RcvHdrFlags bits */ +#define QLOGIC_IB_RHF_LENGTH_MASK 0x7FF +#define QLOGIC_IB_RHF_LENGTH_SHIFT 0 +#define QLOGIC_IB_RHF_RCVTYPE_MASK 0x7 +#define QLOGIC_IB_RHF_RCVTYPE_SHIFT 11 +#define QLOGIC_IB_RHF_EGRINDEX_MASK 0xFFF +#define QLOGIC_IB_RHF_EGRINDEX_SHIFT 16 +#define QLOGIC_IB_RHF_SEQ_MASK 0xF +#define QLOGIC_IB_RHF_SEQ_SHIFT 0 +#define QLOGIC_IB_RHF_HDRQ_OFFSET_MASK 0x7FF +#define QLOGIC_IB_RHF_HDRQ_OFFSET_SHIFT 4 +#define QLOGIC_IB_RHF_H_ICRCERR 0x80000000 +#define QLOGIC_IB_RHF_H_VCRCERR 0x40000000 +#define QLOGIC_IB_RHF_H_PARITYERR 0x20000000 +#define QLOGIC_IB_RHF_H_LENERR 0x10000000 +#define QLOGIC_IB_RHF_H_MTUERR 0x08000000 +#define QLOGIC_IB_RHF_H_IHDRERR 0x04000000 +#define QLOGIC_IB_RHF_H_TIDERR 0x02000000 +#define QLOGIC_IB_RHF_H_MKERR 0x01000000 +#define QLOGIC_IB_RHF_H_IBERR 0x00800000 +#define QLOGIC_IB_RHF_H_ERR_MASK 0xFF800000 +#define QLOGIC_IB_RHF_L_USE_EGR 0x80000000 +#define QLOGIC_IB_RHF_L_SWA 0x00008000 +#define QLOGIC_IB_RHF_L_SWB 0x00004000 + +/* qlogic_ib header fields */ +#define QLOGIC_IB_I_VERS_MASK 0xF +#define QLOGIC_IB_I_VERS_SHIFT 28 +#define QLOGIC_IB_I_CTXT_MASK 0xF +#define QLOGIC_IB_I_CTXT_SHIFT 24 +#define QLOGIC_IB_I_TID_MASK 0x7FF +#define QLOGIC_IB_I_TID_SHIFT 13 +#define QLOGIC_IB_I_OFFSET_MASK 0x1FFF +#define QLOGIC_IB_I_OFFSET_SHIFT 0 + +/* K_PktFlags bits */ +#define QLOGIC_IB_KPF_INTR 0x1 +#define QLOGIC_IB_KPF_SUBCTXT_MASK 0x3 +#define QLOGIC_IB_KPF_SUBCTXT_SHIFT 1 + +#define QLOGIC_IB_MAX_SUBCTXT 4 + +/* SendPIO per-buffer control */ +#define QLOGIC_IB_SP_TEST 0x40 +#define QLOGIC_IB_SP_TESTEBP 0x20 +#define QLOGIC_IB_SP_TRIGGER_SHIFT 15 + +/* SendPIOAvail bits */ +#define QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT 1 +#define QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT 0 + +/* qlogic_ib header format */ +struct qib_header { + /* + * Version - 4 bits, Context - 4 bits, TID - 10 bits and Offset - + * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, + * Context 4, TID 11, offset 13. + */ + __le32 ver_ctxt_tid_offset; + __le16 chksum; + __le16 pkt_flags; +}; + +/* + * qlogic_ib user message header format. + * This structure contains the first 4 fields common to all protocols + * that employ qlogic_ib. + */ +struct qib_message_header { + __be16 lrh[4]; + __be32 bth[3]; + /* fields below this point are in host byte order */ + struct qib_header iph; + /* fields below are simplified, but should match PSM */ + /* some are accessed by driver when packet spliting is needed */ + __u8 sub_opcode; + __u8 flags; + __u16 commidx; + __u32 ack_seq_num; + __u8 flowid; + __u8 hdr_dlen; + __u16 mqhdr; + __u32 uwords[4]; +}; + +/* sequence number bits for message */ +union qib_seqnum { + struct { + __u32 seq:11; + __u32 gen:8; + __u32 flow:5; + }; + struct { + __u32 pkt:16; + __u32 msg:8; + }; + __u32 val; +}; + +/* qib receiving-dma tid-session-member */ +struct qib_tid_session_member { + __u16 tid; + __u16 offset; + __u16 length; +}; + +/* IB - LRH header consts */ +#define QIB_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define QIB_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. 
*/ +#define SIZE_OF_CRC 1 + +#define QIB_DEFAULT_P_KEY 0xFFFF +#define QIB_PSN_MASK 0xFFFFFF +#define QIB_EAGER_TID_ID QLOGIC_IB_I_TID_MASK +#define QIB_MULTICAST_QPN 0xFFFFFF + +/* Receive Header Queue: receive type (from qlogic_ib) */ +#define RCVHQ_RCV_TYPE_EXPECTED 0 +#define RCVHQ_RCV_TYPE_EAGER 1 +#define RCVHQ_RCV_TYPE_NON_KD 2 +#define RCVHQ_RCV_TYPE_ERROR 3 + +#define QIB_HEADER_QUEUE_WORDS 9 + +/* functions for extracting fields from rcvhdrq entries for the driver. + */ +static inline __u32 qib_hdrget_err_flags(const __le32 *rbuf) +{ + return __le32_to_cpu(rbuf[1]) & QLOGIC_IB_RHF_H_ERR_MASK; +} + +static inline __u32 qib_hdrget_rcv_type(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_RCVTYPE_SHIFT) & + QLOGIC_IB_RHF_RCVTYPE_MASK; +} + +static inline __u32 qib_hdrget_length_in_bytes(const __le32 *rbuf) +{ + return ((__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_LENGTH_SHIFT) & + QLOGIC_IB_RHF_LENGTH_MASK) << 2; +} + +static inline __u32 qib_hdrget_index(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_EGRINDEX_SHIFT) & + QLOGIC_IB_RHF_EGRINDEX_MASK; +} + +static inline __u32 qib_hdrget_seq(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> QLOGIC_IB_RHF_SEQ_SHIFT) & + QLOGIC_IB_RHF_SEQ_MASK; +} + +static inline __u32 qib_hdrget_offset(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> QLOGIC_IB_RHF_HDRQ_OFFSET_SHIFT) & + QLOGIC_IB_RHF_HDRQ_OFFSET_MASK; +} + +static inline __u32 qib_hdrget_use_egr_buf(const __le32 *rbuf) +{ + return __le32_to_cpu(rbuf[0]) & QLOGIC_IB_RHF_L_USE_EGR; +} + +static inline __u32 qib_hdrget_qib_ver(__le32 hdrword) +{ + return (__le32_to_cpu(hdrword) >> QLOGIC_IB_I_VERS_SHIFT) & + QLOGIC_IB_I_VERS_MASK; +} + +#endif /* _QIB_COMMON_H */ diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c new file mode 100644 index 0000000..5ed1ed9 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_debugfs.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2013 - 2017 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include + +#include "qib.h" +#include "qib_verbs.h" +#include "qib_debugfs.h" + +static struct dentry *qib_dbg_root; + +#define DEBUGFS_FILE(name) \ +static const struct seq_operations _##name##_seq_ops = { \ + .start = _##name##_seq_start, \ + .next = _##name##_seq_next, \ + .stop = _##name##_seq_stop, \ + .show = _##name##_seq_show \ +}; \ +static int _##name##_open(struct inode *inode, struct file *s) \ +{ \ + struct seq_file *seq; \ + int ret; \ + ret = seq_open(s, &_##name##_seq_ops); \ + if (ret) \ + return ret; \ + seq = s->private_data; \ + seq->private = inode->i_private; \ + return 0; \ +} \ +static const struct file_operations _##name##_file_ops = { \ + .owner = THIS_MODULE, \ + .open = _##name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = seq_release \ +}; + +#define DEBUGFS_FILE_CREATE(name) \ +do { \ + struct dentry *ent; \ + ent = debugfs_create_file(#name , 0400, ibd->qib_ibdev_dbg, \ + ibd, &_##name##_file_ops); \ + if (!ent) \ + pr_warn("create of " #name " failed\n"); \ +} while (0) + +static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_opcode_stats_perctx *opstats; + + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct qib_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + + +static void _opcode_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _opcode_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + for (j = 0; j < dd->first_user_ctxt; j++) { + if (!dd->rcd[j]) + continue; + n_packets += dd->rcd[j]->opstats->stats[i].n_packets; + n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes; + } + if (!n_packets && !n_bytes) + return SEQ_SKIP; + seq_printf(s, "%02llx %llu/%llu\n", i, + (unsigned long long) n_packets, + (unsigned long long) n_bytes); + + return 0; +} + +DEBUGFS_FILE(opcode_stats) + +static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (!*pos) + return SEQ_START_TOKEN; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) + return pos; + + ++*pos; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void _ctx_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _ctx_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos; + loff_t i, j; + u64 n_packets = 0; + struct qib_ibdev *ibd = (struct qib_ibdev *)s->private; + struct qib_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) { + seq_puts(s, "Ctx:npkts\n"); + return 0; + } + + spos = v; + i = *spos; + + if (!dd->rcd[i]) + return SEQ_SKIP; + + for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++) + n_packets += dd->rcd[i]->opstats->stats[j].n_packets; + + if (!n_packets) + return SEQ_SKIP; + + seq_printf(s, " %llu:%llu\n", i, n_packets); + return 0; +} + +DEBUGFS_FILE(ctx_stats) + +static void 
*_qp_stats_seq_start(struct seq_file *s, loff_t *pos) + __acquires(RCU) +{ + struct rvt_qp_iter *iter; + loff_t n = *pos; + + iter = rvt_qp_iter_init(s->private, 0, NULL); + + /* stop calls rcu_read_unlock */ + rcu_read_lock(); + + if (!iter) + return NULL; + + do { + if (rvt_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + } while (n--); + + return iter; +} + +static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, + loff_t *pos) + __must_hold(RCU) +{ + struct rvt_qp_iter *iter = iter_ptr; + + (*pos)++; + + if (rvt_qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) +{ + struct rvt_qp_iter *iter = iter_ptr; + + if (!iter) + return 0; + + qib_qp_iter_print(s, iter); + + return 0; +} + +DEBUGFS_FILE(qp_stats) + +void qib_dbg_ibdev_init(struct qib_ibdev *ibd) +{ + char name[10]; + + snprintf(name, sizeof(name), "qib%d", dd_from_dev(ibd)->unit); + ibd->qib_ibdev_dbg = debugfs_create_dir(name, qib_dbg_root); + if (!ibd->qib_ibdev_dbg) { + pr_warn("create of %s failed\n", name); + return; + } + DEBUGFS_FILE_CREATE(opcode_stats); + DEBUGFS_FILE_CREATE(ctx_stats); + DEBUGFS_FILE_CREATE(qp_stats); +} + +void qib_dbg_ibdev_exit(struct qib_ibdev *ibd) +{ + if (!qib_dbg_root) + goto out; + debugfs_remove_recursive(ibd->qib_ibdev_dbg); +out: + ibd->qib_ibdev_dbg = NULL; +} + +void qib_dbg_init(void) +{ + qib_dbg_root = debugfs_create_dir(QIB_DRV_NAME, NULL); + if (!qib_dbg_root) + pr_warn("init of debugfs failed\n"); +} + +void qib_dbg_exit(void) +{ + debugfs_remove_recursive(qib_dbg_root); + qib_dbg_root = NULL; +} diff --git a/drivers/infiniband/hw/qib/qib_debugfs.h b/drivers/infiniband/hw/qib/qib_debugfs.h new file mode 100644 index 0000000..7ae983a --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_debugfs.h @@ -0,0 +1,45 @@ +#ifndef _QIB_DEBUGFS_H +#define _QIB_DEBUGFS_H + +#ifdef CONFIG_DEBUG_FS +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +struct qib_ibdev; +void qib_dbg_ibdev_init(struct qib_ibdev *ibd); +void qib_dbg_ibdev_exit(struct qib_ibdev *ibd); +void qib_dbg_init(void); +void qib_dbg_exit(void); + +#endif + +#endif /* _QIB_DEBUGFS_H */ diff --git a/drivers/infiniband/hw/qib/qib_diag.c b/drivers/infiniband/hw/qib/qib_diag.c new file mode 100644 index 0000000..11da796 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_diag.c @@ -0,0 +1,906 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains support for diagnostic functions. It is accessed by + * opening the qib_diag device, normally minor number 129. Diagnostic use + * of the QLogic_IB chip may render the chip or board unusable until the + * driver is unloaded, or in some cases, until the system is rebooted. + * + * Accesses to the chip through this interface are not similar to going + * through the /sys/bus/pci resource mmap interface. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" + +#undef pr_fmt +#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt + +/* + * Each client that opens the diag device must read then write + * offset 0, to prevent lossage from random cat or od. diag_state + * sequences this "handshake". + */ +enum diag_state { UNUSED = 0, OPENED, INIT, READY }; + +/* State for an individual client. PID so children cannot abuse handshake */ +static struct qib_diag_client { + struct qib_diag_client *next; + struct qib_devdata *dd; + pid_t pid; + enum diag_state state; +} *client_pool; + +/* + * Get a client struct. Recycled if possible, else kmalloc. 
+ * Must be called with qib_mutex held + */ +static struct qib_diag_client *get_client(struct qib_devdata *dd) +{ + struct qib_diag_client *dc; + + dc = client_pool; + if (dc) + /* got from pool remove it and use */ + client_pool = dc->next; + else + /* None in pool, alloc and init */ + dc = kmalloc(sizeof(*dc), GFP_KERNEL); + + if (dc) { + dc->next = NULL; + dc->dd = dd; + dc->pid = current->pid; + dc->state = OPENED; + } + return dc; +} + +/* + * Return to pool. Must be called with qib_mutex held + */ +static void return_client(struct qib_diag_client *dc) +{ + struct qib_devdata *dd = dc->dd; + struct qib_diag_client *tdc, *rdc; + + rdc = NULL; + if (dc == dd->diag_client) { + dd->diag_client = dc->next; + rdc = dc; + } else { + tdc = dc->dd->diag_client; + while (tdc) { + if (dc == tdc->next) { + tdc->next = dc->next; + rdc = dc; + break; + } + tdc = tdc->next; + } + } + if (rdc) { + rdc->state = UNUSED; + rdc->dd = NULL; + rdc->pid = 0; + rdc->next = client_pool; + client_pool = rdc; + } +} + +static int qib_diag_open(struct inode *in, struct file *fp); +static int qib_diag_release(struct inode *in, struct file *fp); +static ssize_t qib_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off); +static ssize_t qib_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diag_file_ops = { + .owner = THIS_MODULE, + .write = qib_diag_write, + .read = qib_diag_read, + .open = qib_diag_open, + .release = qib_diag_release, + .llseek = default_llseek, +}; + +static atomic_t diagpkt_count = ATOMIC_INIT(0); +static struct cdev *diagpkt_cdev; +static struct device *diagpkt_device; + +static ssize_t qib_diagpkt_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diagpkt_file_ops = { + .owner = THIS_MODULE, + .write = qib_diagpkt_write, + .llseek = noop_llseek, +}; + +int qib_diag_add(struct qib_devdata *dd) +{ + char name[16]; + int ret = 0; + + if (atomic_inc_return(&diagpkt_count) == 1) { + ret = qib_cdev_init(QIB_DIAGPKT_MINOR, "ipath_diagpkt", + &diagpkt_file_ops, &diagpkt_cdev, + &diagpkt_device); + if (ret) + goto done; + } + + snprintf(name, sizeof(name), "ipath_diag%d", dd->unit); + ret = qib_cdev_init(QIB_DIAG_MINOR_BASE + dd->unit, name, + &diag_file_ops, &dd->diag_cdev, + &dd->diag_device); +done: + return ret; +} + +static void qib_unregister_observers(struct qib_devdata *dd); + +void qib_diag_remove(struct qib_devdata *dd) +{ + struct qib_diag_client *dc; + + if (atomic_dec_and_test(&diagpkt_count)) + qib_cdev_cleanup(&diagpkt_cdev, &diagpkt_device); + + qib_cdev_cleanup(&dd->diag_cdev, &dd->diag_device); + + /* + * Return all diag_clients of this device. There should be none, + * as we are "guaranteed" that no clients are still open + */ + while (dd->diag_client) + return_client(dd->diag_client); + + /* Now clean up all unused client structs */ + while (client_pool) { + dc = client_pool; + client_pool = dc->next; + kfree(dc); + } + /* Clean up observer list */ + qib_unregister_observers(dd); +} + +/* qib_remap_ioaddr32 - remap an offset into chip address space to __iomem * + * + * @dd: the qlogic_ib device + * @offs: the offset in chip-space + * @cntp: Pointer to max (byte) count for transfer starting at offset + * This returns a u32 __iomem * so it can be used for both 64 and 32-bit + * mapping. 
It is needed because with the use of PAT for control of + * write-combining, the logically contiguous address-space of the chip + * may be split into virtually non-contiguous spaces, with different + * attributes, which are them mapped to contiguous physical space + * based from the first BAR. + * + * The code below makes the same assumptions as were made in + * init_chip_wc_pat() (qib_init.c), copied here: + * Assumes chip address space looks like: + * - kregs + sregs + cregs + uregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * or: + * - kregs + sregs + cregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * - uregs + * + * If cntp is non-NULL, returns how many bytes from offset can be accessed + * Returns 0 if the offset is not mapped. + */ +static u32 __iomem *qib_remap_ioaddr32(struct qib_devdata *dd, u32 offset, + u32 *cntp) +{ + u32 kreglen; + u32 snd_bottom, snd_lim = 0; + u32 __iomem *krb32 = (u32 __iomem *)dd->kregbase; + u32 __iomem *map = NULL; + u32 cnt = 0; + u32 tot4k, offs4k; + + /* First, simplest case, offset is within the first map. */ + kreglen = (dd->kregend - dd->kregbase) * sizeof(u64); + if (offset < kreglen) { + map = krb32 + (offset / sizeof(u32)); + cnt = kreglen - offset; + goto mapped; + } + + /* + * Next check for user regs, the next most common case, + * and a cheap check because if they are not in the first map + * they are last in chip. + */ + if (dd->userbase) { + /* If user regs mapped, they are after send, so set limit. */ + u32 ulim = (dd->cfgctxts * dd->ureg_align) + dd->uregbase; + + if (!dd->piovl15base) + snd_lim = dd->uregbase; + krb32 = (u32 __iomem *)dd->userbase; + if (offset >= dd->uregbase && offset < ulim) { + map = krb32 + (offset - dd->uregbase) / sizeof(u32); + cnt = ulim - offset; + goto mapped; + } + } + + /* + * Lastly, check for offset within Send Buffers. + * This is gnarly because struct devdata is deliberately vague + * about things like 7322 VL15 buffers, and we are not in + * chip-specific code here, so should not make many assumptions. + * The one we _do_ make is that the only chip that has more sndbufs + * than we admit is the 7322, and it has userregs above that, so + * we know the snd_lim. + */ + /* Assume 2K buffers are first. */ + snd_bottom = dd->pio2k_bufbase; + if (snd_lim == 0) { + u32 tot2k = dd->piobcnt2k * ALIGN(dd->piosize2k, dd->palign); + + snd_lim = snd_bottom + tot2k; + } + /* If 4k buffers exist, account for them by bumping + * appropriate limit. + */ + tot4k = dd->piobcnt4k * dd->align4k; + offs4k = dd->piobufbase >> 32; + if (dd->piobcnt4k) { + if (snd_bottom > offs4k) + snd_bottom = offs4k; + else { + /* 4k above 2k. Bump snd_lim, if needed*/ + if (!dd->userbase || dd->piovl15base) + snd_lim = offs4k + tot4k; + } + } + /* + * Judgement call: can we ignore the space between SendBuffs and + * UserRegs, where we would like to see vl15 buffs, but not more? 
+ */ + if (offset >= snd_bottom && offset < snd_lim) { + offset -= snd_bottom; + map = (u32 __iomem *)dd->piobase + (offset / sizeof(u32)); + cnt = snd_lim - offset; + } + + if (!map && offs4k && dd->piovl15base) { + snd_lim = offs4k + tot4k + 2 * dd->align4k; + if (offset >= (offs4k + tot4k) && offset < snd_lim) { + map = (u32 __iomem *)dd->piovl15base + + ((offset - (offs4k + tot4k)) / sizeof(u32)); + cnt = snd_lim - offset; + } + } + +mapped: + if (cntp) + *cntp = cnt; + return map; +} + +/* + * qib_read_umem64 - read a 64-bit quantity from the chip into user space + * @dd: the qlogic_ib device + * @uaddr: the location to store the data in user memory + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @count: number of bytes to copy (multiple of 32 bits) + * + * This function also localizes all chip memory accesses. + * The copy should be written such that we read full cacheline packets + * from the chip. This is usually used for a single qword + * + * NOTE: This assumes the chip address is 64-bit aligned. + */ +static int qib_read_umem64(struct qib_devdata *dd, void __user *uaddr, + u32 regoffs, size_t count) +{ + const u64 __iomem *reg_addr; + const u64 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = (const u64 __iomem *)qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u64)); + + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u64 data = readq(reg_addr); + + if (copy_to_user(uaddr, &data, sizeof(u64))) { + ret = -EFAULT; + goto bail; + } + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/* + * qib_write_umem64 - write a 64-bit quantity to the chip from user space + * @dd: the qlogic_ib device + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @uaddr: the source of the data in user memory + * @count: the number of bytes to copy (multiple of 32 bits) + * + * This is usually used for a single qword + * NOTE: This assumes the chip address is 64-bit aligned. + */ + +static int qib_write_umem64(struct qib_devdata *dd, u32 regoffs, + const void __user *uaddr, size_t count) +{ + u64 __iomem *reg_addr; + const u64 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = (u64 __iomem *)qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u64)); + + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u64 data; + + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writeq(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/* + * qib_read_umem32 - read a 32-bit quantity from the chip into user space + * @dd: the qlogic_ib device + * @uaddr: the location to store the data in user memory + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @count: number of bytes to copy + * + * read 32 bit values, not 64 bit; for memories that only + * support 32 bit reads; usually a single dword. 
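/*
 * Illustrative userspace sketch, not part of this patch: the core of what
 * qib_remap_ioaddr32() above does is a range lookup -- pick the region
 * (kregs, send buffers, user regs, ...) containing a chip offset and
 * report how many bytes remain in it.  The region table and bounds below
 * are invented for the example; the real ones come from chip init code.
 */
#include <stdio.h>
#include <stddef.h>

struct region {
	const char *name;
	unsigned int base;	/* chip offset of first byte */
	unsigned int len;	/* bytes in this region */
};

static const struct region *find_region(const struct region *tbl, size_t n,
					unsigned int off, unsigned int *cnt)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (off >= tbl[i].base && off - tbl[i].base < tbl[i].len) {
			*cnt = tbl[i].base + tbl[i].len - off;
			return &tbl[i];
		}
	}
	*cnt = 0;
	return NULL;		/* offset is not mapped */
}

int main(void)
{
	static const struct region tbl[] = {
		{ "kregs",    0x00000, 0x04000 },
		{ "sendbufs", 0x10000, 0x08000 },
		{ "uregs",    0x20000, 0x01000 },
	};
	unsigned int cnt;
	const struct region *r = find_region(tbl, 3, 0x10804, &cnt);

	printf("%s, %u bytes left in region\n", r ? r->name : "unmapped", cnt);
	return 0;
}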
+ */ +static int qib_read_umem32(struct qib_devdata *dd, void __user *uaddr, + u32 regoffs, size_t count) +{ + const u32 __iomem *reg_addr; + const u32 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u32)); + + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u32 data = readl(reg_addr); + + if (copy_to_user(uaddr, &data, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + + reg_addr++; + uaddr += sizeof(u32); + + } + ret = 0; +bail: + return ret; +} + +/* + * qib_write_umem32 - write a 32-bit quantity to the chip from user space + * @dd: the qlogic_ib device + * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore) + * @uaddr: the source of the data in user memory + * @count: number of bytes to copy + * + * write 32 bit values, not 64 bit; for memories that only + * support 32 bit write; usually a single dword. + */ + +static int qib_write_umem32(struct qib_devdata *dd, u32 regoffs, + const void __user *uaddr, size_t count) +{ + u32 __iomem *reg_addr; + const u32 __iomem *reg_end; + u32 limit; + int ret; + + reg_addr = qib_remap_ioaddr32(dd, regoffs, &limit); + if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) { + ret = -EINVAL; + goto bail; + } + if (count >= limit) + count = limit; + reg_end = reg_addr + (count / sizeof(u32)); + + while (reg_addr < reg_end) { + u32 data; + + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writel(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u32); + } + ret = 0; +bail: + return ret; +} + +static int qib_diag_open(struct inode *in, struct file *fp) +{ + int unit = iminor(in) - QIB_DIAG_MINOR_BASE; + struct qib_devdata *dd; + struct qib_diag_client *dc; + int ret; + + mutex_lock(&qib_mutex); + + dd = qib_lookup(unit); + + if (dd == NULL || !(dd->flags & QIB_PRESENT) || + !dd->kregbase) { + ret = -ENODEV; + goto bail; + } + + dc = get_client(dd); + if (!dc) { + ret = -ENOMEM; + goto bail; + } + dc->next = dd->diag_client; + dd->diag_client = dc; + fp->private_data = dc; + ret = 0; +bail: + mutex_unlock(&qib_mutex); + + return ret; +} + +/** + * qib_diagpkt_write - write an IB packet + * @fp: the diag data device file pointer + * @data: qib_diag_pkt structure saying where to get the packet + * @count: size of data to write + * @off: unused by this code + */ +static ssize_t qib_diagpkt_write(struct file *fp, + const char __user *data, + size_t count, loff_t *off) +{ + u32 __iomem *piobuf; + u32 plen, pbufn, maxlen_reserve; + struct qib_diag_xpkt dp; + u32 *tmpbuf = NULL; + struct qib_devdata *dd; + struct qib_pportdata *ppd; + ssize_t ret = 0; + + if (count != sizeof(dp)) { + ret = -EINVAL; + goto bail; + } + if (copy_from_user(&dp, data, sizeof(dp))) { + ret = -EFAULT; + goto bail; + } + + dd = qib_lookup(dp.unit); + if (!dd || !(dd->flags & QIB_PRESENT) || !dd->kregbase) { + ret = -ENODEV; + goto bail; + } + if (!(dd->flags & QIB_INITTED)) { + /* no hardware, freeze, etc. 
*/ + ret = -ENODEV; + goto bail; + } + + if (dp.version != _DIAG_XPKT_VERS) { + qib_dev_err(dd, "Invalid version %u for diagpkt_write\n", + dp.version); + ret = -EINVAL; + goto bail; + } + /* send count must be an exact number of dwords */ + if (dp.len & 3) { + ret = -EINVAL; + goto bail; + } + if (!dp.port || dp.port > dd->num_pports) { + ret = -EINVAL; + goto bail; + } + ppd = &dd->pport[dp.port - 1]; + + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > ppd->ibmaxlen - maxlen_reserve) { + ret = -EINVAL; + goto bail; + } + + plen = sizeof(u32) + dp.len; + + tmpbuf = vmalloc(plen); + if (!tmpbuf) { + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmpbuf, + u64_to_user_ptr(dp.data), + dp.len)) { + ret = -EFAULT; + goto bail; + } + + plen >>= 2; /* in dwords */ + + if (dp.pbc_wd == 0) + dp.pbc_wd = plen; + + piobuf = dd->f_getsendbuf(ppd, dp.pbc_wd, &pbufn); + if (!piobuf) { + ret = -EBUSY; + goto bail; + } + /* disarm it just to be extra sure */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbufn)); + + /* disable header check on pbufn for this packet */ + dd->f_txchk_change(dd, pbufn, 1, TXCHK_CHG_TYPE_DIS1, NULL); + + writeq(dp.pbc_wd, piobuf); + /* + * Copy all but the trigger word, then flush, so it's written + * to chip before trigger word, then write trigger word, then + * flush again, so packet is sent. + */ + if (dd->flags & QIB_PIO_FLUSH_WC) { + qib_flush_wc(); + qib_pio_copy(piobuf + 2, tmpbuf, plen - 1); + qib_flush_wc(); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); + } else + qib_pio_copy(piobuf + 2, tmpbuf, plen); + + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + + /* + * Ensure buffer is written to the chip, then re-enable + * header checks (if supported by chip). The txchk + * code will ensure seen by chip before returning. + */ + qib_flush_wc(); + qib_sendbuf_done(dd, pbufn); + dd->f_txchk_change(dd, pbufn, 1, TXCHK_CHG_TYPE_ENAB1, NULL); + + ret = sizeof(dp); + +bail: + vfree(tmpbuf); + return ret; +} + +static int qib_diag_release(struct inode *in, struct file *fp) +{ + mutex_lock(&qib_mutex); + return_client(fp->private_data); + fp->private_data = NULL; + mutex_unlock(&qib_mutex); + return 0; +} + +/* + * Chip-specific code calls to register its interest in + * a specific range. 
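/*
 * Illustrative userspace sketch, not part of this patch: the observer
 * list defined just below is a simple LIFO stack, and a lookup returns
 * the first entry whose [bottom, top] range covers the address, so a
 * later registration shadows an earlier overlapping one.  The ranges
 * used here are invented for the example.
 */
#include <stdio.h>

struct observer {
	struct observer *next;
	unsigned int bottom, top;	/* inclusive address range */
	const char *name;
};

static struct observer *observer_list;

static void register_observer(struct observer *op)
{
	op->next = observer_list;	/* push on the front, like the driver */
	observer_list = op;
}

static const struct observer *get_observer(unsigned int addr)
{
	const struct observer *op;

	for (op = observer_list; op; op = op->next)
		if (addr >= op->bottom && addr <= op->top)
			return op;
	return NULL;			/* plain register access, no hook */
}

int main(void)
{
	struct observer a = { NULL, 0x1000, 0x10ff, "old" };
	struct observer b = { NULL, 0x1080, 0x10ff, "new" };

	register_observer(&a);
	register_observer(&b);
	printf("0x1084 -> %s\n", get_observer(0x1084)->name);	/* "new" wins */
	printf("0x1004 -> %s\n", get_observer(0x1004)->name);	/* only "old" matches */
	return 0;
}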
+ */ +struct diag_observer_list_elt { + struct diag_observer_list_elt *next; + const struct diag_observer *op; +}; + +int qib_register_observer(struct qib_devdata *dd, + const struct diag_observer *op) +{ + struct diag_observer_list_elt *olp; + unsigned long flags; + + if (!dd || !op) + return -EINVAL; + olp = vmalloc(sizeof(*olp)); + if (!olp) + return -ENOMEM; + + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp->op = op; + olp->next = dd->diag_observer_list; + dd->diag_observer_list = olp; + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + + return 0; +} + +/* Remove all registered observers when device is closed */ +static void qib_unregister_observers(struct qib_devdata *dd) +{ + struct diag_observer_list_elt *olp; + unsigned long flags; + + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp = dd->diag_observer_list; + while (olp) { + /* Pop one observer, let go of lock */ + dd->diag_observer_list = olp->next; + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + vfree(olp); + /* try again. */ + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + olp = dd->diag_observer_list; + } + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); +} + +/* + * Find the observer, if any, for the specified address. Initial implementation + * is simple stack of observers. This must be called with diag transaction + * lock held. + */ +static const struct diag_observer *diag_get_observer(struct qib_devdata *dd, + u32 addr) +{ + struct diag_observer_list_elt *olp; + const struct diag_observer *op = NULL; + + olp = dd->diag_observer_list; + while (olp) { + op = olp->op; + if (addr >= op->bottom && addr <= op->top) + break; + olp = olp->next; + } + if (!olp) + op = NULL; + + return op; +} + +static ssize_t qib_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off) +{ + struct qib_diag_client *dc = fp->private_data; + struct qib_devdata *dd = dc->dd; + ssize_t ret; + + if (dc->pid != current->pid) { + ret = -EPERM; + goto bail; + } + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if (dc->state < READY && (*off || count != 8)) + ret = -EINVAL; /* prevent cat /dev/qib_diag* */ + else { + unsigned long flags; + u64 data64 = 0; + int use_32; + const struct diag_observer *op; + + use_32 = (count % 8) || (*off % 8); + ret = -1; + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + /* + * Check for observer on this address range. + * we only support a single 32 or 64-bit read + * via observer, currently. + */ + op = diag_get_observer(dd, *off); + if (op) { + u32 offset = *off; + + ret = op->hook(dd, op, offset, &data64, 0, use_32); + } + /* + * We need to release lock before any copy_to_user(), + * whether implicit in qib_read_umem* or explicit below. + */ + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + if (!op) { + if (use_32) + /* + * Address or length is not 64-bit aligned; + * do 32-bit rd + */ + ret = qib_read_umem32(dd, data, (u32) *off, + count); + else + ret = qib_read_umem64(dd, data, (u32) *off, + count); + } else if (ret == count) { + /* Below finishes case where observer existed */ + ret = copy_to_user(data, &data64, use_32 ? 
+ sizeof(u32) : sizeof(u64)); + if (ret) + ret = -EFAULT; + } + } + + if (ret >= 0) { + *off += count; + ret = count; + if (dc->state == OPENED) + dc->state = INIT; + } +bail: + return ret; +} + +static ssize_t qib_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + struct qib_diag_client *dc = fp->private_data; + struct qib_devdata *dd = dc->dd; + ssize_t ret; + + if (dc->pid != current->pid) { + ret = -EPERM; + goto bail; + } + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if (dc->state < READY && + ((*off || count != 8) || dc->state != INIT)) + /* No writes except second-step of init seq */ + ret = -EINVAL; /* before any other write allowed */ + else { + unsigned long flags; + const struct diag_observer *op = NULL; + int use_32 = (count % 8) || (*off % 8); + + /* + * Check for observer on this address range. + * We only support a single 32 or 64-bit write + * via observer, currently. This helps, because + * we would otherwise have to jump through hoops + * to make "diag transaction" meaningful when we + * cannot do a copy_from_user while holding the lock. + */ + if (count == 4 || count == 8) { + u64 data64; + u32 offset = *off; + + ret = copy_from_user(&data64, data, count); + if (ret) { + ret = -EFAULT; + goto bail; + } + spin_lock_irqsave(&dd->qib_diag_trans_lock, flags); + op = diag_get_observer(dd, *off); + if (op) + ret = op->hook(dd, op, offset, &data64, ~0Ull, + use_32); + spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags); + } + + if (!op) { + if (use_32) + /* + * Address or length is not 64-bit aligned; + * do 32-bit write + */ + ret = qib_write_umem32(dd, (u32) *off, data, + count); + else + ret = qib_write_umem64(dd, (u32) *off, data, + count); + } + } + + if (ret >= 0) { + *off += count; + ret = count; + if (dc->state == INIT) + dc->state = READY; /* all read/write OK now */ + } +bail: + return ret; +} diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c new file mode 100644 index 0000000..3117cc5 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -0,0 +1,807 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +/* + * The size has to be longer than this string, so we can append + * board/chip information to it in the init code. + */ +const char ib_qib_version[] = QIB_DRIVER_VERSION "\n"; + +DEFINE_SPINLOCK(qib_devs_lock); +LIST_HEAD(qib_dev_list); +DEFINE_MUTEX(qib_mutex); /* general driver use */ + +unsigned qib_ibmtu; +module_param_named(ibmtu, qib_ibmtu, uint, S_IRUGO); +MODULE_PARM_DESC(ibmtu, "Set max IB MTU (0=2KB, 1=256, 2=512, ... 5=4096"); + +unsigned qib_compat_ddr_negotiate = 1; +module_param_named(compat_ddr_negotiate, qib_compat_ddr_negotiate, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(compat_ddr_negotiate, + "Attempt pre-IBTA 1.2 DDR speed negotiation"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Intel "); +MODULE_DESCRIPTION("Intel IB driver"); + +/* + * QIB_PIO_MAXIBHDR is the max IB header size allowed for in our + * PIO send buffers. This is well beyond anything currently + * defined in the InfiniBand spec. + */ +#define QIB_PIO_MAXIBHDR 128 + +/* + * QIB_MAX_PKT_RCV is the max # if packets processed per receive interrupt. + */ +#define QIB_MAX_PKT_RECV 64 + +struct qlogic_ib_stats qib_stats; + +struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi) +{ + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(ibdev, + struct qib_devdata, verbs_dev); + return dd->pcidev; +} + +/* + * Return count of units with at least one port ACTIVE. + */ +int qib_count_active_units(void) +{ + struct qib_devdata *dd; + struct qib_pportdata *ppd; + unsigned long flags; + int pidx, nunits_active = 0; + + spin_lock_irqsave(&qib_devs_lock, flags); + list_for_each_entry(dd, &qib_dev_list, list) { + if (!(dd->flags & QIB_PRESENT) || !dd->kregbase) + continue; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && (ppd->lflags & (QIBL_LINKINIT | + QIBL_LINKARMED | QIBL_LINKACTIVE))) { + nunits_active++; + break; + } + } + } + spin_unlock_irqrestore(&qib_devs_lock, flags); + return nunits_active; +} + +/* + * Return count of all units, optionally return in arguments + * the number of usable (present) units, and the number of + * ports that are up. + */ +int qib_count_units(int *npresentp, int *nupp) +{ + int nunits = 0, npresent = 0, nup = 0; + struct qib_devdata *dd; + unsigned long flags; + int pidx; + struct qib_pportdata *ppd; + + spin_lock_irqsave(&qib_devs_lock, flags); + + list_for_each_entry(dd, &qib_dev_list, list) { + nunits++; + if ((dd->flags & QIB_PRESENT) && dd->kregbase) + npresent++; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && (ppd->lflags & (QIBL_LINKINIT | + QIBL_LINKARMED | QIBL_LINKACTIVE))) + nup++; + } + } + + spin_unlock_irqrestore(&qib_devs_lock, flags); + + if (npresentp) + *npresentp = npresent; + if (nupp) + *nupp = nup; + + return nunits; +} + +/** + * qib_wait_linkstate - wait for an IB link state change to occur + * @dd: the qlogic_ib device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * wait up to msecs milliseconds for IB link state change to occur for + * now, take the easy polling route. 
Currently used only by + * qib_set_linkstate. Returns 0 if state reached, otherwise + * -ETIMEDOUT state can have multiple states set, for any of several + * transitions. + */ +int qib_wait_linkstate(struct qib_pportdata *ppd, u32 state, int msecs) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + if (ppd->state_wanted) { + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + ret = -EBUSY; + goto bail; + } + ppd->state_wanted = state; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + wait_event_interruptible_timeout(ppd->state_wait, + (ppd->lflags & state), + msecs_to_jiffies(msecs)); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->state_wanted = 0; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + if (!(ppd->lflags & state)) + ret = -ETIMEDOUT; + else + ret = 0; +bail: + return ret; +} + +int qib_set_linkstate(struct qib_pportdata *ppd, u8 newstate) +{ + u32 lstate; + int ret; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + switch (newstate) { + case QIB_IB_LINKDOWN_ONLY: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_NOP); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKDOWN: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_POLL); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKDOWN_SLEEP: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_SLEEP); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKDOWN_DISABLE: + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_DOWN | IB_LINKINITCMD_DISABLE); + /* don't wait */ + ret = 0; + goto bail; + + case QIB_IB_LINKARM: + if (ppd->lflags & QIBL_LINKARMED) { + ret = 0; + goto bail; + } + if (!(ppd->lflags & (QIBL_LINKINIT | QIBL_LINKACTIVE))) { + ret = -EINVAL; + goto bail; + } + /* + * Since the port can be ACTIVE when we ask for ARMED, + * clear QIBL_LINKV so we can wait for a transition. + * If the link isn't ARMED, then something else happened + * and there is no point waiting for ARMED. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_ARMED | IB_LINKINITCMD_NOP); + lstate = QIBL_LINKV; + break; + + case QIB_IB_LINKACTIVE: + if (ppd->lflags & QIBL_LINKACTIVE) { + ret = 0; + goto bail; + } + if (!(ppd->lflags & QIBL_LINKARMED)) { + ret = -EINVAL; + goto bail; + } + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE, + IB_LINKCMD_ACTIVE | IB_LINKINITCMD_NOP); + lstate = QIBL_LINKACTIVE; + break; + + default: + ret = -EINVAL; + goto bail; + } + ret = qib_wait_linkstate(ppd, lstate, 10); + +bail: + return ret; +} + +/* + * Get address of eager buffer from it's index (allocated in chunks, not + * contiguous). + */ +static inline void *qib_get_egrbuf(const struct qib_ctxtdata *rcd, u32 etail) +{ + const u32 chunk = etail >> rcd->rcvegrbufs_perchunk_shift; + const u32 idx = etail & ((u32)rcd->rcvegrbufs_perchunk - 1); + + return rcd->rcvegrbuf[chunk] + (idx << rcd->dd->rcvegrbufsize_shift); +} + +/* + * Returns 1 if error was a CRC, else 0. + * Needed for some chip's synthesized error counters. 
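/*
 * Illustrative userspace sketch, not part of this patch: the TID-error
 * path below decides whether a PSN is in sequence using qib_cmp24(),
 * which is defined in qib.h rather than in this file.  The usual way to
 * get a signed distance between two 24-bit sequence numbers is to shift
 * the difference so that bit 23 becomes the sign bit; cmp24() here is a
 * stand-in written that way (it assumes arithmetic right shift).
 */
#include <stdio.h>
#include <stdint.h>

static int cmp24(uint32_t a, uint32_t b)
{
	/* > 0: a is ahead of b (mod 2^24); < 0: behind; 0: equal */
	return ((int32_t)((a - b) << 8)) >> 8;
}

int main(void)
{
	printf("%d\n", cmp24(0x000005, 0xfffffa));	/* 11: wrapped, still ahead */
	printf("%d\n", cmp24(0x000100, 0x000200));	/* -256: behind, NAK-worthy */
	return 0;
}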
+ */ +static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, + u32 ctxt, u32 eflags, u32 l, u32 etail, + __le32 *rhf_addr, struct qib_message_header *rhdr) +{ + u32 ret = 0; + + if (eflags & (QLOGIC_IB_RHF_H_ICRCERR | QLOGIC_IB_RHF_H_VCRCERR)) + ret = 1; + else if (eflags == QLOGIC_IB_RHF_H_TIDERR) { + /* For TIDERR and RC QPs premptively schedule a NAK */ + struct ib_header *hdr = (struct ib_header *)rhdr; + struct ib_other_headers *ohdr = NULL; + struct qib_ibport *ibp = &ppd->ibport_data; + struct qib_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + struct rvt_qp *qp = NULL; + u32 tlen = qib_hdrget_length_in_bytes(rhf_addr); + u16 lid = be16_to_cpu(hdr->lrh[1]); + int lnh = be16_to_cpu(hdr->lrh[0]) & 3; + u32 qp_num; + u32 opcode; + u32 psn; + int diff; + + /* Sanity check packet */ + if (tlen < 24) + goto drop; + + if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) { + lid &= ~((1 << ppd->lmc) - 1); + if (unlikely(lid != ppd->lid)) + goto drop; + } + + /* Check for GRH */ + if (lnh == QIB_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == QIB_LRH_GRH) { + u32 vtf; + + ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else + goto drop; + + /* Get opcode and PSN from packet */ + opcode = be32_to_cpu(ohdr->bth[0]); + opcode >>= 24; + psn = be32_to_cpu(ohdr->bth[2]); + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + if (qp_num != QIB_MULTICAST_QPN) { + int ruc_res; + + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!qp) { + rcu_read_unlock(); + goto drop; + } + + /* + * Handle only RC QPs - for other QP types drop error + * packet. + */ + spin_lock(&qp->r_lock); + + /* Check for valid receive state. */ + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + goto unlock; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + ruc_res = + qib_ruc_check_hdr( + ibp, hdr, + lnh == QIB_LRH_GRH, + qp, + be32_to_cpu(ohdr->bth[0])); + if (ruc_res) + goto unlock; + + /* Only deal with RDMA Writes for now */ + if (opcode < + IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { + diff = qib_cmp24(psn, qp->r_psn); + if (!qp->r_nak_state && diff >= 0) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = + IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence + * NAK until all packets + * in the receive queue have + * been processed. + * Otherwise, we end up + * propagating congestion. + */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= + RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail( + &qp->rspwait, + &rcd->qp_wait_list); + } + } /* Out of sequence NAK */ + } /* QP Request NAKs */ + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + case IB_QPT_UC: + default: + /* For now don't handle any other QP types */ + break; + } + +unlock: + spin_unlock(&qp->r_lock); + rcu_read_unlock(); + } /* Unicast QP */ + } /* Valid packet with TIDErr */ + +drop: + return ret; +} + +/* + * qib_kreceive - receive a packet + * @rcd: the qlogic_ib context + * @llic: gets count of good packets needed to clear lli, + * (used with chips that need need to track crcs for lli) + * + * called from interrupt handler for errors or receive interrupt + * Returns number of CRC error packets, needed by some chips for + * local link integrity tracking. 
crcs are adjusted down by following + * good packets, if any, and count of good packets is also tracked. + */ +u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts) +{ + struct qib_devdata *dd = rcd->dd; + struct qib_pportdata *ppd = rcd->ppd; + __le32 *rhf_addr; + void *ebuf; + const u32 rsize = dd->rcvhdrentsize; /* words */ + const u32 maxcnt = dd->rcvhdrcnt * rsize; /* words */ + u32 etail = -1, l, hdrqtail; + struct qib_message_header *hdr; + u32 eflags, etype, tlen, i = 0, updegr = 0, crcs = 0; + int last; + u64 lval; + struct rvt_qp *qp, *nqp; + + l = rcd->head; + rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset; + if (dd->flags & QIB_NODMA_RTAIL) { + u32 seq = qib_hdrget_seq(rhf_addr); + + if (seq != rcd->seq_cnt) + goto bail; + hdrqtail = 0; + } else { + hdrqtail = qib_get_rcvhdrtail(rcd); + if (l == hdrqtail) + goto bail; + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + } + + for (last = 0, i = 1; !last; i += !last) { + hdr = dd->f_get_msgheader(dd, rhf_addr); + eflags = qib_hdrget_err_flags(rhf_addr); + etype = qib_hdrget_rcv_type(rhf_addr); + /* total length */ + tlen = qib_hdrget_length_in_bytes(rhf_addr); + ebuf = NULL; + if ((dd->flags & QIB_NODMA_RTAIL) ? + qib_hdrget_use_egr_buf(rhf_addr) : + (etype != RCVHQ_RCV_TYPE_EXPECTED)) { + etail = qib_hdrget_index(rhf_addr); + updegr = 1; + if (tlen > sizeof(*hdr) || + etype >= RCVHQ_RCV_TYPE_NON_KD) { + ebuf = qib_get_egrbuf(rcd, etail); + prefetch_range(ebuf, tlen - sizeof(*hdr)); + } + } + if (!eflags) { + u16 lrh_len = be16_to_cpu(hdr->lrh[2]) << 2; + + if (lrh_len != tlen) { + qib_stats.sps_lenerrs++; + goto move_along; + } + } + if (etype == RCVHQ_RCV_TYPE_NON_KD && !eflags && + ebuf == NULL && + tlen > (dd->rcvhdrentsize - 2 + 1 - + qib_hdrget_offset(rhf_addr)) << 2) { + goto move_along; + } + + /* + * Both tiderr and qibhdrerr are set for all plain IB + * packets; only qibhdrerr should be set. + */ + if (unlikely(eflags)) + crcs += qib_rcv_hdrerr(rcd, ppd, rcd->ctxt, eflags, l, + etail, rhf_addr, hdr); + else if (etype == RCVHQ_RCV_TYPE_NON_KD) { + qib_ib_rcv(rcd, hdr, ebuf, tlen); + if (crcs) + crcs--; + else if (llic && *llic) + --*llic; + } +move_along: + l += rsize; + if (l >= maxcnt) + l = 0; + if (i == QIB_MAX_PKT_RECV) + last = 1; + + rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset; + if (dd->flags & QIB_NODMA_RTAIL) { + u32 seq = qib_hdrget_seq(rhf_addr); + + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (seq != rcd->seq_cnt) + last = 1; + } else if (l == hdrqtail) + last = 1; + /* + * Update head regs etc., every 16 packets, if not last pkt, + * to help prevent rcvhdrq overflows, when many packets + * are processed and queue is nearly full. + * Don't request an interrupt for intermediate updates. + */ + lval = l; + if (!last && !(i & 0xf)) { + dd->f_update_usrhead(rcd, lval, updegr, etail, i); + updegr = 0; + } + } + + rcd->head = l; + + /* + * Iterate over all QPs waiting to respond. + * The list won't change since the IRQ is only run on one CPU. 
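/*
 * Illustrative userspace sketch, not part of this patch: with
 * QIB_NODMA_RTAIL the loop above does not read a DMA'ed tail pointer.
 * Each receive-header entry instead carries a small sequence stamp that
 * cycles 1..13, and the consumer stops at the first entry whose stamp
 * does not match its own expected counter.  The ring size and the
 * stamps below are invented for the example.
 */
#include <stdio.h>

#define RING 8

int main(void)
{
	/* hardware stamped three fresh entries 2, 3, 4; slot 3 is stale */
	int stamp[RING] = { 2, 3, 4, 9, 0, 0, 0, 0 };
	int head = 0, expect = 1, consumed = 0;

	for (;;) {
		if (++expect > 13)	/* expected stamp cycles 1..13 */
			expect = 1;
		if (stamp[head] != expect)
			break;		/* stale entry: nothing more to do yet */
		consumed++;		/* the driver would process the packet here */
		head = (head + 1) % RING;
	}
	printf("consumed %d entries, head now at slot %d\n", consumed, head);
	return 0;
}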
+ */ + list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) { + list_del_init(&qp->rspwait); + if (qp->r_flags & RVT_R_RSP_NAK) { + qp->r_flags &= ~RVT_R_RSP_NAK; + qib_send_rc_ack(qp); + } + if (qp->r_flags & RVT_R_RSP_SEND) { + unsigned long flags; + + qp->r_flags &= ~RVT_R_RSP_SEND; + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & + RVT_PROCESS_OR_FLUSH_SEND) + qib_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + rvt_put_qp(qp); + } + +bail: + /* Report number of packets consumed */ + if (npkts) + *npkts = i; + + /* + * Always write head at end, and setup rcv interrupt, even + * if no packets were processed. + */ + lval = (u64)rcd->head | dd->rhdrhead_intr_off; + dd->f_update_usrhead(rcd, lval, updegr, etail, i); + return crcs; +} + +/** + * qib_set_mtu - set the MTU + * @ppd: the perport data + * @arg: the new MTU + * + * We can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. For now, we don't do any + * sanity checking on this, and we don't deal with what happens to + * programs that are already running when the size changes. + * NOTE: changing the MTU will usually cause the IBC to go back to + * link INIT state... + */ +int qib_set_mtu(struct qib_pportdata *ppd, u16 arg) +{ + u32 piosize; + int ret, chk; + + if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 && + arg != 4096) { + ret = -EINVAL; + goto bail; + } + chk = ib_mtu_enum_to_int(qib_ibmtu); + if (chk > 0 && arg > chk) { + ret = -EINVAL; + goto bail; + } + + piosize = ppd->ibmaxlen; + ppd->ibmtu = arg; + + if (arg >= (piosize - QIB_PIO_MAXIBHDR)) { + /* Only if it's not the initial value (or reset to it) */ + if (piosize != ppd->init_ibmaxlen) { + if (arg > piosize && arg <= ppd->init_ibmaxlen) + piosize = ppd->init_ibmaxlen - 2 * sizeof(u32); + ppd->ibmaxlen = piosize; + } + } else if ((arg + QIB_PIO_MAXIBHDR) != ppd->ibmaxlen) { + piosize = arg + QIB_PIO_MAXIBHDR - 2 * sizeof(u32); + ppd->ibmaxlen = piosize; + } + + ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_MTU, 0); + + ret = 0; + +bail: + return ret; +} + +int qib_set_lid(struct qib_pportdata *ppd, u32 lid, u8 lmc) +{ + struct qib_devdata *dd = ppd->dd; + + ppd->lid = lid; + ppd->lmc = lmc; + + dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LIDLMC, + lid | (~((1U << lmc) - 1)) << 16); + + qib_devinfo(dd->pcidev, "IB%u:%u got a lid: 0x%x\n", + dd->unit, ppd->port, lid); + + return 0; +} + +/* + * Following deal with the "obviously simple" task of overriding the state + * of the LEDS, which normally indicate link physical and logical status. + * The complications arise in dealing with different hardware mappings + * and the board-dependent routine being called from interrupts. + * and then there's the requirement to _flash_ them. 
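/*
 * Illustrative userspace sketch, not part of this patch: how the LED
 * override value is unpacked by qib_set_led_override() below -- low
 * nybble is the phase-0 pattern, next nybble the phase-1 pattern, and
 * the byte above that an optional blink frequency.  HZ is set to 250
 * here only to make the arithmetic concrete.
 */
#include <stdio.h>

#define HZ 250				/* example tick rate, not the kernel's */
#define LED_OVER_FREQ_SHIFT 8
#define LED_OVER_FREQ_MASK (0xFF << LED_OVER_FREQ_SHIFT)

int main(void)
{
	unsigned int val = (4 << LED_OVER_FREQ_SHIFT) | 0x5A;	/* blink at 4 */
	unsigned int freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
	unsigned int phase0 = val & 0xF;
	unsigned int phase1 = (val >> 4) & 0xF;
	unsigned int timeoff = freq ? (HZ << 4) / freq : HZ;

	printf("phase0=%#x phase1=%#x freq=%u timer period=%u ticks\n",
	       phase0, phase1, freq, timeoff);
	return 0;
}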
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
+
+static void qib_run_led_override(struct timer_list *t)
+{
+	struct qib_pportdata *ppd = from_timer(ppd, t, led_override_timer);
+	struct qib_devdata *dd = ppd->dd;
+	int timeoff;
+	int ph_idx;
+
+	if (!(dd->flags & QIB_INITTED))
+		return;
+
+	ph_idx = ppd->led_override_phase++ & 1;
+	ppd->led_override = ppd->led_override_vals[ph_idx];
+	timeoff = ppd->led_override_timeoff;
+
+	dd->f_setextled(ppd, 1);
+	/*
+	 * don't re-fire the timer if user asked for it to be off; we let
+	 * it fire one more time after they turn it off to simplify
+	 */
+	if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+		mod_timer(&ppd->led_override_timer, jiffies + timeoff);
+}
+
+void qib_set_led_override(struct qib_pportdata *ppd, unsigned int val)
+{
+	struct qib_devdata *dd = ppd->dd;
+	int timeoff, freq;
+
+	if (!(dd->flags & QIB_INITTED))
+		return;
+
+	/* First check if we are blinking. If not, use 1HZ polling */
+	timeoff = HZ;
+	freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+	if (freq) {
+		/* For blink, set each phase from one nybble of val */
+		ppd->led_override_vals[0] = val & 0xF;
+		ppd->led_override_vals[1] = (val >> 4) & 0xF;
+		timeoff = (HZ << 4)/freq;
+	} else {
+		/* Non-blink set both phases the same. */
+		ppd->led_override_vals[0] = val & 0xF;
+		ppd->led_override_vals[1] = val & 0xF;
+	}
+	ppd->led_override_timeoff = timeoff;
+
+	/*
+	 * If the timer has not already been started, do so. Use a "quick"
+	 * timeout so the function will be called soon, to look at our request.
+	 */
+	if (atomic_inc_return(&ppd->led_override_timer_active) == 1) {
+		/* Need to start timer */
+		timer_setup(&ppd->led_override_timer, qib_run_led_override, 0);
+		ppd->led_override_timer.expires = jiffies + 1;
+		add_timer(&ppd->led_override_timer);
+	} else {
+		if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+			mod_timer(&ppd->led_override_timer, jiffies + 1);
+		atomic_dec(&ppd->led_override_timer_active);
+	}
+}
+
+/**
+ * qib_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload). We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize.
For + * now, we only allow this if no user contexts are open that use chip resources + */ +int qib_reset_device(int unit) +{ + int ret, i; + struct qib_devdata *dd = qib_lookup(unit); + struct qib_pportdata *ppd; + unsigned long flags; + int pidx; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + qib_devinfo(dd->pcidev, "Reset on unit %u requested\n", unit); + + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) { + qib_devinfo(dd->pcidev, + "Invalid unit number %u or not initialized or not present\n", + unit); + ret = -ENXIO; + goto bail; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (dd->rcd) + for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) { + if (!dd->rcd[i] || !dd->rcd[i]->cnt) + continue; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + ret = -EBUSY; + goto bail; + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (atomic_read(&ppd->led_override_timer_active)) { + /* Need to stop LED timer, _then_ shut off LEDs */ + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + + /* Shut off LEDs after we are sure timer is not running */ + ppd->led_override = LED_OVER_BOTH_OFF; + dd->f_setextled(ppd, 0); + if (dd->flags & QIB_HAS_SEND_DMA) + qib_teardown_sdma(ppd); + } + + ret = dd->f_reset(dd); + if (ret == 1) + ret = qib_init(dd, 1); + else + ret = -EAGAIN; + if (ret) + qib_dev_err(dd, + "Reinitialize unit %u after reset failed with %d\n", + unit, ret); + else + qib_devinfo(dd->pcidev, + "Reinitialized unit %u after resetting\n", + unit); + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/qib/qib_eeprom.c b/drivers/infiniband/hw/qib/qib_eeprom.c new file mode 100644 index 0000000..5838b3b --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_eeprom.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib.h" + +/* + * Functions specific to the serial EEPROM on cards handled by ib_qib. 
+ * The actual serail interface code is in qib_twsi.c. This file is a client + */ + +/** + * qib_eeprom_read - receives bytes from the eeprom via I2C + * @dd: the qlogic_ib device + * @eeprom_offset: address to read from + * @buffer: where to store result + * @len: number of bytes to receive + */ +int qib_eeprom_read(struct qib_devdata *dd, u8 eeprom_offset, + void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (!ret) { + ret = qib_twsi_reset(dd); + if (ret) + qib_dev_err(dd, "EEPROM Reset for read failed\n"); + else + ret = qib_twsi_blk_rd(dd, dd->twsi_eeprom_dev, + eeprom_offset, buff, len); + mutex_unlock(&dd->eep_lock); + } + + return ret; +} + +/* + * Actually update the eeprom, first doing write enable if + * needed, then restoring write enable state. + * Must be called with eep_lock held + */ +static int eeprom_write_with_enable(struct qib_devdata *dd, u8 offset, + const void *buf, int len) +{ + int ret, pwen; + + pwen = dd->f_eeprom_wen(dd, 1); + ret = qib_twsi_reset(dd); + if (ret) + qib_dev_err(dd, "EEPROM Reset for write failed\n"); + else + ret = qib_twsi_blk_wr(dd, dd->twsi_eeprom_dev, + offset, buf, len); + dd->f_eeprom_wen(dd, pwen); + return ret; +} + +/** + * qib_eeprom_write - writes data to the eeprom via I2C + * @dd: the qlogic_ib device + * @eeprom_offset: where to place data + * @buffer: data to write + * @len: number of bytes to write + */ +int qib_eeprom_write(struct qib_devdata *dd, u8 eeprom_offset, + const void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (!ret) { + ret = eeprom_write_with_enable(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->eep_lock); + } + + return ret; +} + +static u8 flash_csum(struct qib_flash *ifp, int adjust) +{ + u8 *ip = (u8 *) ifp; + u8 csum = 0, len; + + /* + * Limit length checksummed to max length of actual data. + * Checksum of erased eeprom will still be bad, but we avoid + * reading past the end of the buffer we were passed. + */ + len = ifp->if_length; + if (len > sizeof(struct qib_flash)) + len = sizeof(struct qib_flash); + while (len--) + csum += *ip++; + csum -= ifp->if_csum; + csum = ~csum; + if (adjust) + ifp->if_csum = csum; + + return csum; +} + +/** + * qib_get_eeprom_info- get the GUID et al. from the TSWI EEPROM device + * @dd: the qlogic_ib device + * + * We have the capability to use the nguid field, and get + * the guid from the first chip's flash, to use for all of them. + */ +void qib_get_eeprom_info(struct qib_devdata *dd) +{ + void *buf; + struct qib_flash *ifp; + __be64 guid; + int len, eep_stat; + u8 csum, *bguid; + int t = dd->unit; + struct qib_devdata *dd0 = qib_lookup(0); + + if (t && dd0->nguid > 1 && t <= dd0->nguid) { + u8 oguid; + + dd->base_guid = dd0->base_guid; + bguid = (u8 *) &dd->base_guid; + + oguid = bguid[7]; + bguid[7] += t; + if (oguid > bguid[7]) { + if (bguid[6] == 0xff) { + if (bguid[5] == 0xff) { + qib_dev_err(dd, + "Can't set GUID from base, wraps to OUI!\n"); + dd->base_guid = 0; + goto bail; + } + bguid[5]++; + } + bguid[6]++; + } + dd->nguid = 1; + goto bail; + } + + /* + * Read full flash, not just currently used part, since it may have + * been written with a newer definition. + * */ + len = sizeof(struct qib_flash); + buf = vmalloc(len); + if (!buf) + goto bail; + + /* + * Use "public" eeprom read function, which does locking and + * figures out device. This will migrate to chip-specific. 
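/*
 * Illustrative userspace sketch, not part of this patch: the flash_csum()
 * scheme above sums every byte of the record except the stored checksum,
 * limited to the declared length, and takes the one's complement.  The
 * record layout here is a stand-in, not struct qib_flash.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct record {
	uint8_t if_length;		/* bytes covered by the checksum */
	uint8_t if_csum;		/* stored checksum, excluded from the sum */
	uint8_t payload[6];
};

static uint8_t record_csum(const struct record *r)
{
	const uint8_t *p = (const uint8_t *)r;
	uint8_t csum = 0;
	size_t i, len = r->if_length;

	if (len > sizeof(*r))		/* never read past the buffer we hold */
		len = sizeof(*r);
	for (i = 0; i < len; i++)
		csum += p[i];
	csum -= r->if_csum;
	return (uint8_t)~csum;
}

int main(void)
{
	struct record r = { .if_length = 8, .payload = { 1, 2, 3, 4, 5, 6 } };

	r.if_csum = record_csum(&r);
	printf("stored %#x, recomputed %#x\n", r.if_csum, record_csum(&r));
	return 0;
}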
+ */ + eep_stat = qib_eeprom_read(dd, 0, buf, len); + + if (eep_stat) { + qib_dev_err(dd, "Failed reading GUID from eeprom\n"); + goto done; + } + ifp = (struct qib_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + qib_devinfo(dd->pcidev, + "Bad I2C flash checksum: 0x%x, not 0x%x\n", + csum, ifp->if_csum); + goto done; + } + if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) || + *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) { + qib_dev_err(dd, + "Invalid GUID %llx from flash; ignoring\n", + *(unsigned long long *) ifp->if_guid); + /* don't allow GUID if all 0 or all 1's */ + goto done; + } + + /* complain, but allow it */ + if (*(u64 *) ifp->if_guid == 0x100007511000000ULL) + qib_devinfo(dd->pcidev, + "Warning, GUID %llx is default, probably not correct!\n", + *(unsigned long long *) ifp->if_guid); + + bguid = ifp->if_guid; + if (!bguid[0] && !bguid[1] && !bguid[2]) { + /* + * Original incorrect GUID format in flash; fix in + * core copy, by shifting up 2 octets; don't need to + * change top octet, since both it and shifted are 0. + */ + bguid[1] = bguid[3]; + bguid[2] = bguid[4]; + bguid[3] = 0; + bguid[4] = 0; + guid = *(__be64 *) ifp->if_guid; + } else + guid = *(__be64 *) ifp->if_guid; + dd->base_guid = guid; + dd->nguid = ifp->if_numguid; + /* + * Things are slightly complicated by the desire to transparently + * support both the Pathscale 10-digit serial number and the QLogic + * 13-character version. + */ + if ((ifp->if_fversion > 1) && ifp->if_sprefix[0] && + ((u8 *) ifp->if_sprefix)[0] != 0xFF) { + char *snp = dd->serial; + + /* + * This board has a Serial-prefix, which is stored + * elsewhere for backward-compatibility. + */ + memcpy(snp, ifp->if_sprefix, sizeof(ifp->if_sprefix)); + snp[sizeof(ifp->if_sprefix)] = '\0'; + len = strlen(snp); + snp += len; + len = sizeof(dd->serial) - len; + if (len > sizeof(ifp->if_serial)) + len = sizeof(ifp->if_serial); + memcpy(snp, ifp->if_serial, len); + } else { + memcpy(dd->serial, ifp->if_serial, sizeof(ifp->if_serial)); + } + if (!strstr(ifp->if_comment, "Tested successfully")) + qib_dev_err(dd, + "Board SN %s did not pass functional test: %s\n", + dd->serial, ifp->if_comment); + +done: + vfree(buf); + +bail:; +} + diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c new file mode 100644 index 0000000..6a8800b --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -0,0 +1,2401 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "qib.h" +#include "qib_common.h" +#include "qib_user_sdma.h" + +#undef pr_fmt +#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt + +static int qib_open(struct inode *, struct file *); +static int qib_close(struct inode *, struct file *); +static ssize_t qib_write(struct file *, const char __user *, size_t, loff_t *); +static ssize_t qib_write_iter(struct kiocb *, struct iov_iter *); +static __poll_t qib_poll(struct file *, struct poll_table_struct *); +static int qib_mmapf(struct file *, struct vm_area_struct *); + +/* + * This is really, really weird shit - write() and writev() here + * have completely unrelated semantics. Sucky userland ABI, + * film at 11. + */ +static const struct file_operations qib_file_ops = { + .owner = THIS_MODULE, + .write = qib_write, + .write_iter = qib_write_iter, + .open = qib_open, + .release = qib_close, + .poll = qib_poll, + .mmap = qib_mmapf, + .llseek = noop_llseek, +}; + +/* + * Convert kernel virtual addresses to physical addresses so they don't + * potentially conflict with the chip addresses used as mmap offsets. + * It doesn't really matter what mmap offset we use as long as we can + * interpret it correctly. 
+ */ +static u64 cvt_kvaddr(void *p) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(p); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +static int qib_get_base_info(struct file *fp, void __user *ubase, + size_t ubase_size) +{ + struct qib_ctxtdata *rcd = ctxt_fp(fp); + int ret = 0; + struct qib_base_info *kinfo = NULL; + struct qib_devdata *dd = rcd->dd; + struct qib_pportdata *ppd = rcd->ppd; + unsigned subctxt_cnt; + int shared, master; + size_t sz; + + subctxt_cnt = rcd->subctxt_cnt; + if (!subctxt_cnt) { + shared = 0; + master = 0; + subctxt_cnt = 1; + } else { + shared = 1; + master = !subctxt_fp(fp); + } + + sz = sizeof(*kinfo); + /* If context sharing is not requested, allow the old size structure */ + if (!shared) + sz -= 7 * sizeof(u64); + if (ubase_size < sz) { + ret = -EINVAL; + goto bail; + } + + kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL); + if (kinfo == NULL) { + ret = -ENOMEM; + goto bail; + } + + ret = dd->f_get_base_info(rcd, kinfo); + if (ret < 0) + goto bail; + + kinfo->spi_rcvhdr_cnt = dd->rcvhdrcnt; + kinfo->spi_rcvhdrent_size = dd->rcvhdrentsize; + kinfo->spi_tidegrcnt = rcd->rcvegrcnt; + kinfo->spi_rcv_egrbufsize = dd->rcvegrbufsize; + /* + * have to mmap whole thing + */ + kinfo->spi_rcv_egrbuftotlen = + rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size; + kinfo->spi_rcv_egrperchunk = rcd->rcvegrbufs_perchunk; + kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen / + rcd->rcvegrbuf_chunks; + kinfo->spi_tidcnt = dd->rcvtidcnt / subctxt_cnt; + if (master) + kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt; + /* + * for this use, may be cfgctxts summed over all chips that + * are are configured and present + */ + kinfo->spi_nctxts = dd->cfgctxts; + /* unit (chip/board) our context is on */ + kinfo->spi_unit = dd->unit; + kinfo->spi_port = ppd->port; + /* for now, only a single page */ + kinfo->spi_tid_maxsize = PAGE_SIZE; + + /* + * Doing this per context, and based on the skip value, etc. This has + * to be the actual buffer size, since the protocol code treats it + * as an array. + * + * These have to be set to user addresses in the user code via mmap. + * These values are used on return to user code for the mmap target + * addresses only. For 32 bit, same 44 bit address problem, so use + * the physical address, not virtual. Before 2.6.11, using the + * page_address() macro worked, but in 2.6.11, even that returns the + * full 64 bit address (upper bits all 1's). So far, using the + * physical addresses (or chip offsets, for chip mapping) works, but + * no doubt some future kernel release will change that, and we'll be + * on to yet another method of dealing with this. + * Normally only one of rcvhdr_tailaddr or rhf_offset is useful + * since the chips with non-zero rhf_offset don't normally + * enable tail register updates to host memory, but for testing, + * both can be enabled and used. 
+ */ + kinfo->spi_rcvhdr_base = (u64) rcd->rcvhdrq_phys; + kinfo->spi_rcvhdr_tailaddr = (u64) rcd->rcvhdrqtailaddr_phys; + kinfo->spi_rhf_offset = dd->rhf_offset; + kinfo->spi_rcv_egrbufs = (u64) rcd->rcvegr_phys; + kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys; + /* setup per-unit (not port) status area for user programs */ + kinfo->spi_status = (u64) kinfo->spi_pioavailaddr + + (char *) ppd->statusp - + (char *) dd->pioavailregs_dma; + kinfo->spi_uregbase = (u64) dd->uregbase + dd->ureg_align * rcd->ctxt; + if (!shared) { + kinfo->spi_piocnt = rcd->piocnt; + kinfo->spi_piobufbase = (u64) rcd->piobufs; + kinfo->spi_sendbuf_status = cvt_kvaddr(rcd->user_event_mask); + } else if (master) { + kinfo->spi_piocnt = (rcd->piocnt / subctxt_cnt) + + (rcd->piocnt % subctxt_cnt); + /* Master's PIO buffers are after all the slave's */ + kinfo->spi_piobufbase = (u64) rcd->piobufs + + dd->palign * + (rcd->piocnt - kinfo->spi_piocnt); + } else { + unsigned slave = subctxt_fp(fp) - 1; + + kinfo->spi_piocnt = rcd->piocnt / subctxt_cnt; + kinfo->spi_piobufbase = (u64) rcd->piobufs + + dd->palign * kinfo->spi_piocnt * slave; + } + + if (shared) { + kinfo->spi_sendbuf_status = + cvt_kvaddr(&rcd->user_event_mask[subctxt_fp(fp)]); + /* only spi_subctxt_* fields should be set in this block! */ + kinfo->spi_subctxt_uregbase = cvt_kvaddr(rcd->subctxt_uregbase); + + kinfo->spi_subctxt_rcvegrbuf = + cvt_kvaddr(rcd->subctxt_rcvegrbuf); + kinfo->spi_subctxt_rcvhdr_base = + cvt_kvaddr(rcd->subctxt_rcvhdr_base); + } + + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. Can't use piobufbase directly, because it has + * both 2K and 4K buffer base values. + */ + kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->pio2k_bufbase) / + dd->palign; + kinfo->spi_pioalign = dd->palign; + kinfo->spi_qpair = QIB_KD_QP; + /* + * user mode PIO buffers are always 2KB, even when 4KB can + * be received, and sent via the kernel; this is ibmaxlen + * for 2K MTU. + */ + kinfo->spi_piosize = dd->piosize2k - 2 * sizeof(u32); + kinfo->spi_mtu = ppd->ibmaxlen; /* maxlen, not ibmtu */ + kinfo->spi_ctxt = rcd->ctxt; + kinfo->spi_subctxt = subctxt_fp(fp); + kinfo->spi_sw_version = QIB_KERN_SWVERSION; + kinfo->spi_sw_version |= 1U << 31; /* QLogic-built, not kernel.org */ + kinfo->spi_hw_version = dd->revision; + + if (master) + kinfo->spi_runtime_flags |= QIB_RUNTIME_MASTER; + + sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo); + if (copy_to_user(ubase, kinfo, sz)) + ret = -EFAULT; +bail: + kfree(kinfo); + return ret; +} + +/** + * qib_tid_update - update a context TID + * @rcd: the context + * @fp: the qib device file + * @ti: the TID information + * + * The new implementation as of Oct 2004 is that the driver assigns + * the tid and returns it to the caller. To reduce search time, we + * keep a cursor for each context, walking the shadow tid array to find + * one that's not in use. + * + * For now, if we can't allocate the full list, we fail, although + * in the long run, we'll allocate as many as we can, and the + * caller will deal with that by trying the remaining pages later. + * That means that when we fail, we have to mark the tids as not in + * use again, in our shadow copy. + * + * It's up to the caller to free the tids when they are done. + * We'll unlock the pages as they free them. 
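/*
 * Illustrative userspace sketch, not part of this patch: qib_tid_update()
 * below hands the caller a bitmap of the TID slots it was given, and
 * qib_tid_free() later walks only the set bits, clamped to the
 * per-context TID count.  test_bit_ul() stands in for the kernel's
 * test_bit(); the bit positions and limit are invented for the example.
 */
#include <stdio.h>

static int test_bit_ul(const unsigned long *map, unsigned int bit)
{
	return (map[bit / (8 * sizeof(*map))] >>
		(bit % (8 * sizeof(*map)))) & 1;
}

int main(void)
{
	unsigned long tidmap[2] = { (1UL << 3) | (1UL << 7) | (1UL << 31), 0 };
	unsigned int tid, limit = 20, freed = 0;	/* clamp as if tidcnt == 20 */

	for (tid = 0; tid < limit; tid++) {
		if (!test_bit_ul(tidmap, tid))
			continue;
		freed++;	/* the driver unmaps and releases the page here */
	}
	printf("freed %u TIDs; bit 31 ignored, past the clamped limit\n", freed);
	return 0;
}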
+ * + * Also, right now we are locking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. + */ +static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp, + const struct qib_tid_info *ti) +{ + int ret = 0, ntids; + u32 tid, ctxttid, cnt, i, tidcnt, tidoff; + u16 *tidlist; + struct qib_devdata *dd = rcd->dd; + u64 physaddr; + unsigned long vaddr; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + struct page **pagep = NULL; + unsigned subctxt = subctxt_fp(fp); + + if (!dd->pageshadow) { + ret = -ENOMEM; + goto done; + } + + cnt = ti->tidcnt; + if (!cnt) { + ret = -EFAULT; + goto done; + } + ctxttid = rcd->ctxt * dd->rcvtidcnt; + if (!rcd->subctxt_cnt) { + tidcnt = dd->rcvtidcnt; + tid = rcd->tidcursor; + tidoff = 0; + } else if (!subctxt) { + tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) + + (dd->rcvtidcnt % rcd->subctxt_cnt); + tidoff = dd->rcvtidcnt - tidcnt; + ctxttid += tidoff; + tid = tidcursor_fp(fp); + } else { + tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt; + tidoff = tidcnt * (subctxt - 1); + ctxttid += tidoff; + tid = tidcursor_fp(fp); + } + if (cnt > tidcnt) { + /* make sure it all fits in tid_pg_list */ + qib_devinfo(dd->pcidev, + "Process tried to allocate %u TIDs, only trying max (%u)\n", + cnt, tidcnt); + cnt = tidcnt; + } + pagep = (struct page **) rcd->tid_pg_list; + tidlist = (u16 *) &pagep[dd->rcvtidcnt]; + pagep += tidoff; + tidlist += tidoff; + + memset(tidmap, 0, sizeof(tidmap)); + /* before decrement; chip actual # */ + ntids = tidcnt; + tidbase = (u64 __iomem *) (((char __iomem *) dd->kregbase) + + dd->rcvtidbase + + ctxttid * sizeof(*tidbase)); + + /* virtual address of first page in transfer */ + vaddr = ti->tidvaddr; + if (!access_ok(VERIFY_WRITE, (void __user *) vaddr, + cnt * PAGE_SIZE)) { + ret = -EFAULT; + goto done; + } + ret = qib_get_user_pages(vaddr, cnt, pagep); + if (ret) { + /* + * if (ret == -EBUSY) + * We can't continue because the pagep array won't be + * initialized. This should never happen, + * unless perhaps the user has mpin'ed the pages + * themselves. + */ + qib_devinfo( + dd->pcidev, + "Failed to lock addr %p, %u pages: errno %d\n", + (void *) vaddr, cnt, -ret); + goto done; + } + for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) { + for (; ntids--; tid++) { + if (tid == tidcnt) + tid = 0; + if (!dd->pageshadow[ctxttid + tid]) + break; + } + if (ntids < 0) { + /* + * Oops, wrapped all the way through their TIDs, + * and didn't have enough free; see comments at + * start of routine + */ + i--; /* last tidlist[i] not filled in */ + ret = -ENOMEM; + break; + } + tidlist[i] = tid + tidoff; + /* we "know" system pages and TID pages are same size */ + dd->pageshadow[ctxttid + tid] = pagep[i]; + dd->physshadow[ctxttid + tid] = + qib_map_page(dd->pcidev, pagep[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + /* + * don't need atomic or it's overhead + */ + __set_bit(tid, tidmap); + physaddr = dd->physshadow[ctxttid + tid]; + /* PERFORMANCE: below should almost certainly be cached */ + dd->f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, physaddr); + /* + * don't check this tid in qib_ctxtshadow, since we + * just filled it in; start with the next one. + */ + tid++; + } + + if (ret) { + u32 limit; +cleanup: + /* jump here if copy out of updated info failed... 
*/ + /* same code that's in qib_free_tid() */ + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit((const unsigned long *)tidmap, limit); + for (; tid < limit; tid++) { + if (!test_bit(tid, tidmap)) + continue; + if (dd->pageshadow[ctxttid + tid]) { + dma_addr_t phys; + + phys = dd->physshadow[ctxttid + tid]; + dd->physshadow[ctxttid + tid] = dd->tidinvalid; + /* PERFORMANCE: below should almost certainly + * be cached + */ + dd->f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, + dd->tidinvalid); + pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + dd->pageshadow[ctxttid + tid] = NULL; + } + } + qib_release_user_pages(pagep, cnt); + } else { + /* + * Copy the updated array, with qib_tid's filled in, back + * to user. Since we did the copy in already, this "should + * never fail" If it does, we have to clean up... + */ + if (copy_to_user((void __user *) + (unsigned long) ti->tidlist, + tidlist, cnt * sizeof(*tidlist))) { + ret = -EFAULT; + goto cleanup; + } + if (copy_to_user(u64_to_user_ptr(ti->tidmap), + tidmap, sizeof(tidmap))) { + ret = -EFAULT; + goto cleanup; + } + if (tid == tidcnt) + tid = 0; + if (!rcd->subctxt_cnt) + rcd->tidcursor = tid; + else + tidcursor_fp(fp) = tid; + } + +done: + return ret; +} + +/** + * qib_tid_free - free a context TID + * @rcd: the context + * @subctxt: the subcontext + * @ti: the TID info + * + * right now we are unlocking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. We check that the TID is in range for this context + * but otherwise don't check validity; if user has an error and + * frees the wrong tid, it's only their own data that can thereby + * be corrupted. We do check that the TID was in use, for sanity + * We always use our idea of the saved address, not the address that + * they pass in to us. + */ +static int qib_tid_free(struct qib_ctxtdata *rcd, unsigned subctxt, + const struct qib_tid_info *ti) +{ + int ret = 0; + u32 tid, ctxttid, cnt, limit, tidcnt; + struct qib_devdata *dd = rcd->dd; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + + if (!dd->pageshadow) { + ret = -ENOMEM; + goto done; + } + + if (copy_from_user(tidmap, u64_to_user_ptr(ti->tidmap), + sizeof(tidmap))) { + ret = -EFAULT; + goto done; + } + + ctxttid = rcd->ctxt * dd->rcvtidcnt; + if (!rcd->subctxt_cnt) + tidcnt = dd->rcvtidcnt; + else if (!subctxt) { + tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) + + (dd->rcvtidcnt % rcd->subctxt_cnt); + ctxttid += dd->rcvtidcnt - tidcnt; + } else { + tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt; + ctxttid += tidcnt * (subctxt - 1); + } + tidbase = (u64 __iomem *) ((char __iomem *)(dd->kregbase) + + dd->rcvtidbase + + ctxttid * sizeof(*tidbase)); + + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit(tidmap, limit); + for (cnt = 0; tid < limit; tid++) { + /* + * small optimization; if we detect a run of 3 or so without + * any set, use find_first_bit again. That's mainly to + * accelerate the case where we wrapped, so we have some at + * the beginning, and some at the end, and a big gap + * in the middle. 
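For the cleanup walk just above, here is a minimal userspace model of the same bitmap-driven loop: skip clear bits, cap the walk at the smaller of the map size and the TID count, and release whatever the shadow table still holds. page_shadow and release_from_bitmap are hypothetical names; the real code additionally programs the invalid TID into the chip and unmaps the DMA address.

/* build: cc -std=c99 tid_bitmap.c */
#include <stdio.h>
#include <limits.h>

#define NUM_TIDS 64

static void *page_shadow[NUM_TIDS];  /* non-NULL: a user page is pinned here */

/*
 * Walk one word of a TID bitmap and release everything it names that is
 * still present in the shadow table.
 */
static unsigned release_from_bitmap(const unsigned long *map, unsigned tidcnt)
{
	unsigned limit = sizeof(unsigned long) * CHAR_BIT;
	unsigned tid, freed = 0;

	if (limit > tidcnt)
		limit = tidcnt;          /* just in case sizes ever differ */
	for (tid = 0; tid < limit; tid++) {
		if (!(map[0] & (1UL << tid)))
			continue;        /* bit clear: nothing to free */
		if (page_shadow[tid]) {
			page_shadow[tid] = NULL;
			freed++;
		}
	}
	return freed;
}

int main(void)
{
	unsigned long map = (1UL << 3) | (1UL << 5);

	page_shadow[3] = page_shadow[5] = (void *)1;
	printf("released %u TIDs\n", release_from_bitmap(&map, NUM_TIDS));
	return 0;
}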
+ */ + if (!test_bit(tid, tidmap)) + continue; + cnt++; + if (dd->pageshadow[ctxttid + tid]) { + struct page *p; + dma_addr_t phys; + + p = dd->pageshadow[ctxttid + tid]; + dd->pageshadow[ctxttid + tid] = NULL; + phys = dd->physshadow[ctxttid + tid]; + dd->physshadow[ctxttid + tid] = dd->tidinvalid; + /* PERFORMANCE: below should almost certainly be + * cached + */ + dd->f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, dd->tidinvalid); + pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + qib_release_user_pages(&p, 1); + } + } +done: + return ret; +} + +/** + * qib_set_part_key - set a partition key + * @rcd: the context + * @key: the key + * + * We can have up to 4 active at a time (other than the default, which is + * always allowed). This is somewhat tricky, since multiple contexts may set + * the same key, so we reference count them, and clean up at exit. All 4 + * partition keys are packed into a single qlogic_ib register. It's an + * error for a process to set the same pkey multiple times. We provide no + * mechanism to de-allocate a pkey at this time, we may eventually need to + * do that. I've used the atomic operations, and no locking, and only make + * a single pass through what's available. This should be more than + * adequate for some time. I'll think about spinlocks or the like if and as + * it's necessary. + */ +static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key) +{ + struct qib_pportdata *ppd = rcd->ppd; + int i, pidx = -1; + bool any = false; + u16 lkey = key & 0x7FFF; + + if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) + /* nothing to do; this key always valid */ + return 0; + + if (!lkey) + return -EINVAL; + + /* + * Set the full membership bit, because it has to be + * set in the register or the packet, and it seems + * cleaner to set in the register than to force all + * callers to set it. + */ + key |= 0x8000; + + for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { + if (!rcd->pkeys[i] && pidx == -1) + pidx = i; + if (rcd->pkeys[i] == key) + return -EEXIST; + } + if (pidx == -1) + return -EBUSY; + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i]) { + any = true; + continue; + } + if (ppd->pkeys[i] == key) { + atomic_t *pkrefs = &ppd->pkeyrefs[i]; + + if (atomic_inc_return(pkrefs) > 1) { + rcd->pkeys[pidx] = key; + return 0; + } + /* + * lost race, decrement count, catch below + */ + atomic_dec(pkrefs); + any = true; + } + if ((ppd->pkeys[i] & 0x7FFF) == lkey) + /* + * It makes no sense to have both the limited and + * full membership PKEY set at the same time since + * the unlimited one will disable the limited one. + */ + return -EEXIST; + } + if (!any) + return -EBUSY; + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i] && + atomic_inc_return(&ppd->pkeyrefs[i]) == 1) { + rcd->pkeys[pidx] = key; + ppd->pkeys[i] = key; + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); + return 0; + } + } + return -EBUSY; +} + +/** + * qib_manage_rcvq - manage a context's receive queue + * @rcd: the context + * @subctxt: the subcontext + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the context, for use in queue + * overflow conditions. start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int qib_manage_rcvq(struct qib_ctxtdata *rcd, unsigned subctxt, + int start_stop) +{ + struct qib_devdata *dd = rcd->dd; + unsigned int rcvctrl_op; + + if (subctxt) + goto bail; + /* atomically clear receive enable ctxt. 
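The partition-key policy described in the qib_set_part_key() comment can be reduced to a small table in plain C. This sketch keeps only the core idea (force the full-membership bit, share a slot and bump its count when the key is already present); it uses plain ints where the driver uses atomic_t, and it omits the per-context bookkeeping and the limited-versus-full conflict check. Names and sizes here are illustrative.

/* build: cc -std=c99 pkey_table.c */
#include <stdio.h>
#include <stdint.h>

#define NUM_PKEYS 4                  /* the comment allows 4 active keys */
#define FULL_MEMBER 0x8000

static uint16_t pkeys[NUM_PKEYS];    /* 0 means the slot is unused */
static int pkeyrefs[NUM_PKEYS];      /* reference count per slot */

/*
 * Install a partition key with the full-membership bit forced on,
 * sharing an existing slot when the same key is already installed.
 */
static int set_part_key(uint16_t key)
{
	uint16_t lkey = key & 0x7fff;
	int i, empty = -1;

	if (!lkey)
		return -1;               /* invalid key */
	key |= FULL_MEMBER;
	for (i = 0; i < NUM_PKEYS; i++) {
		if (pkeys[i] == key) {   /* already installed: share it */
			pkeyrefs[i]++;
			return i;
		}
		if (!pkeys[i] && empty < 0)
			empty = i;
	}
	if (empty < 0)
		return -1;               /* table full */
	pkeys[empty] = key;
	pkeyrefs[empty] = 1;
	return empty;
}

int main(void)
{
	printf("slot %d\n", set_part_key(0x1234));
	printf("slot %d (shared)\n", set_part_key(0x1234));
	return 0;
}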
*/ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. + */ + if (rcd->rcvhdrtail_kvaddr) + qib_clear_rcvhdrtail(rcd); + rcvctrl_op = QIB_RCVCTRL_CTXT_ENB; + } else + rcvctrl_op = QIB_RCVCTRL_CTXT_DIS; + dd->f_rcvctrl(rcd->ppd, rcvctrl_op, rcd->ctxt); + /* always; new head should be equal to new tail; see above */ +bail: + return 0; +} + +static void qib_clean_part_key(struct qib_ctxtdata *rcd, + struct qib_devdata *dd) +{ + int i, j, pchanged = 0; + struct qib_pportdata *ppd = rcd->ppd; + + for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { + if (!rcd->pkeys[i]) + continue; + for (j = 0; j < ARRAY_SIZE(ppd->pkeys); j++) { + /* check for match independent of the global bit */ + if ((ppd->pkeys[j] & 0x7fff) != + (rcd->pkeys[i] & 0x7fff)) + continue; + if (atomic_dec_and_test(&ppd->pkeyrefs[j])) { + ppd->pkeys[j] = 0; + pchanged++; + } + break; + } + rcd->pkeys[i] = 0; + } + if (pchanged) + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); +} + +/* common code for the mappings on dma_alloc_coherent mem */ +static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd, + unsigned len, void *kvaddr, u32 write_ok, char *what) +{ + struct qib_devdata *dd = rcd->dd; + unsigned long pfn; + int ret; + + if ((vma->vm_end - vma->vm_start) > len) { + qib_devinfo(dd->pcidev, + "FAIL on %s: len %lx > %x\n", what, + vma->vm_end - vma->vm_start, len); + ret = -EFAULT; + goto bail; + } + + /* + * shared context user code requires rcvhdrq mapped r/w, others + * only allowed readonly mapping. + */ + if (!write_ok) { + if (vma->vm_flags & VM_WRITE) { + qib_devinfo(dd->pcidev, + "%s must be mapped readonly\n", what); + ret = -EPERM; + goto bail; + } + + /* don't allow them to later change with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + } + + pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, vma->vm_start, pfn, + len, vma->vm_page_prot); + if (ret) + qib_devinfo(dd->pcidev, + "%s ctxt%u mmap of %lx, %x bytes failed: %d\n", + what, rcd->ctxt, pfn, len, ret); +bail: + return ret; +} + +static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd, + u64 ureg) +{ + unsigned long phys; + unsigned long sz; + int ret; + + /* + * This is real hardware, so use io_remap. This is the mechanism + * for the user process to update the head registers for their ctxt + * in the chip. + */ + sz = dd->flags & QIB_HAS_HDRSUPP ? 
2 * PAGE_SIZE : PAGE_SIZE; + if ((vma->vm_end - vma->vm_start) > sz) { + qib_devinfo(dd->pcidev, + "FAIL mmap userreg: reqlen %lx > PAGE\n", + vma->vm_end - vma->vm_start); + ret = -EFAULT; + } else { + phys = dd->physaddr + ureg; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + ret = io_remap_pfn_range(vma, vma->vm_start, + phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); + } + return ret; +} + +static int mmap_piobufs(struct vm_area_struct *vma, + struct qib_devdata *dd, + struct qib_ctxtdata *rcd, + unsigned piobufs, unsigned piocnt) +{ + unsigned long phys; + int ret; + + /* + * When we map the PIO buffers in the chip, we want to map them as + * writeonly, no read possible; unfortunately, x86 doesn't allow + * for this in hardware, but we still prevent users from asking + * for it. + */ + if ((vma->vm_end - vma->vm_start) > (piocnt * dd->palign)) { + qib_devinfo(dd->pcidev, + "FAIL mmap piobufs: reqlen %lx > PAGE\n", + vma->vm_end - vma->vm_start); + ret = -EINVAL; + goto bail; + } + + phys = dd->physaddr + piobufs; + +#if defined(__powerpc__) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); +#endif + + /* + * don't allow them to later change to readable with mprotect (for when + * not initially mapped readable, as is normally the case) + */ + vma->vm_flags &= ~VM_MAYREAD; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + + /* We used PAT if wc_cookie == 0 */ + if (!dd->wc_cookie) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +bail: + return ret; +} + +static int mmap_rcvegrbufs(struct vm_area_struct *vma, + struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + unsigned long start, size; + size_t total_size, i; + unsigned long pfn; + int ret; + + size = rcd->rcvegrbuf_size; + total_size = rcd->rcvegrbuf_chunks * size; + if ((vma->vm_end - vma->vm_start) > total_size) { + qib_devinfo(dd->pcidev, + "FAIL on egr bufs: reqlen %lx > actual %lx\n", + vma->vm_end - vma->vm_start, + (unsigned long) total_size); + ret = -EINVAL; + goto bail; + } + + if (vma->vm_flags & VM_WRITE) { + qib_devinfo(dd->pcidev, + "Can't map eager buffers as writable (flags=%lx)\n", + vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* don't allow them to later change to writeable with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + + start = vma->vm_start; + + for (i = 0; i < rcd->rcvegrbuf_chunks; i++, start += size) { + pfn = virt_to_phys(rcd->rcvegrbuf[i]) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, start, pfn, size, + vma->vm_page_prot); + if (ret < 0) + goto bail; + } + ret = 0; + +bail: + return ret; +} + +/* + * qib_file_vma_fault - handle a VMA page fault. 
+ */ +static int qib_file_vma_fault(struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + + get_page(page); + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct qib_file_vm_ops = { + .fault = qib_file_vma_fault, +}; + +static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, + struct qib_ctxtdata *rcd, unsigned subctxt) +{ + struct qib_devdata *dd = rcd->dd; + unsigned subctxt_cnt; + unsigned long len; + void *addr; + size_t size; + int ret = 0; + + subctxt_cnt = rcd->subctxt_cnt; + size = rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size; + + /* + * Each process has all the subctxt uregbase, rcvhdrq, and + * rcvegrbufs mmapped - as an array for all the processes, + * and also separately for this process. + */ + if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase)) { + addr = rcd->subctxt_uregbase; + size = PAGE_SIZE * subctxt_cnt; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base)) { + addr = rcd->subctxt_rcvhdr_base; + size = rcd->rcvhdrq_size * subctxt_cnt; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf)) { + addr = rcd->subctxt_rcvegrbuf; + size *= subctxt_cnt; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase + + PAGE_SIZE * subctxt)) { + addr = rcd->subctxt_uregbase + PAGE_SIZE * subctxt; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base + + rcd->rcvhdrq_size * subctxt)) { + addr = rcd->subctxt_rcvhdr_base + + rcd->rcvhdrq_size * subctxt; + size = rcd->rcvhdrq_size; + } else if (pgaddr == cvt_kvaddr(&rcd->user_event_mask[subctxt])) { + addr = rcd->user_event_mask; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf + + size * subctxt)) { + addr = rcd->subctxt_rcvegrbuf + size * subctxt; + /* rcvegrbufs are read-only on the slave */ + if (vma->vm_flags & VM_WRITE) { + qib_devinfo(dd->pcidev, + "Can't map eager buffers as writable (flags=%lx)\n", + vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* + * Don't allow permission to later change to writeable + * with mprotect. + */ + vma->vm_flags &= ~VM_MAYWRITE; + } else + goto bail; + len = vma->vm_end - vma->vm_start; + if (len > size) { + ret = -EINVAL; + goto bail; + } + + vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; + vma->vm_ops = &qib_file_vm_ops; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + ret = 1; + +bail: + return ret; +} + +/** + * qib_mmapf - mmap various structures into user space + * @fp: the file pointer + * @vma: the VM area + * + * We use this to have a shared buffer between the kernel and the user code + * for the rcvhdr queue, egr buffers, and the per-context user regs and pio + * buffers in the chip. We have the open and close entries so we can bump + * the ref count and keep the driver from being unloaded while still mapped. + */ +static int qib_mmapf(struct file *fp, struct vm_area_struct *vma) +{ + struct qib_ctxtdata *rcd; + struct qib_devdata *dd; + u64 pgaddr, ureg; + unsigned piobufs, piocnt; + int ret, match = 1; + + rcd = ctxt_fp(fp); + if (!rcd || !(vma->vm_flags & VM_SHARED)) { + ret = -EINVAL; + goto bail; + } + dd = rcd->dd; + + /* + * This is the qib_do_user_init() code, mapping the shared buffers + * and per-context user registers into the user process. The address + * referred to by vm_pgoff is the file offset passed via mmap(). + * For shared contexts, this is the kernel vmalloc() address of the + * pages to share with the master. 
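The mmap path that follows is driven entirely by the mmap() file offset, as the comment explains. A toy userspace model of that dispatch, with made-up tokens and sizes standing in for the addresses the driver hands back in its base-info reply:

/* build: cc -std=c99 mmap_dispatch.c */
#include <stdio.h>
#include <stdint.h>

/*
 * User space passes the "address" of the region it wants as the mmap
 * offset; the driver matches it against the handful of regions it
 * exported and maps exactly that region.
 */
struct region {
	uint64_t token;          /* what the driver told user space to pass */
	const char *name;
	size_t len;
};

static const struct region regions[] = {
	{ 0x100000, "user registers",   4096 },
	{ 0x200000, "PIO buffers",     32768 },
	{ 0x300000, "rcvhdr queue",    65536 },
	{ 0x400000, "eager buffers",  262144 },
};

static const struct region *lookup_region(uint64_t pgaddr)
{
	for (size_t i = 0; i < sizeof(regions) / sizeof(regions[0]); i++)
		if (regions[i].token == pgaddr)
			return &regions[i];
	return NULL;             /* no match: the real code fails with -EINVAL */
}

int main(void)
{
	const struct region *r = lookup_region(0x200000);

	if (r)
		printf("mapping %s, %zu bytes\n", r->name, r->len);
	return 0;
}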
+ * For non-shared or master ctxts, this is a physical address. + * We only do one mmap for each space mapped. + */ + pgaddr = vma->vm_pgoff << PAGE_SHIFT; + + /* + * Check for 0 in case one of the allocations failed, but user + * called mmap anyway. + */ + if (!pgaddr) { + ret = -EINVAL; + goto bail; + } + + /* + * Physical addresses must fit in 40 bits for our hardware. + * Check for kernel virtual addresses first, anything else must + * match a HW or memory address. + */ + ret = mmap_kvaddr(vma, pgaddr, rcd, subctxt_fp(fp)); + if (ret) { + if (ret > 0) + ret = 0; + goto bail; + } + + ureg = dd->uregbase + dd->ureg_align * rcd->ctxt; + if (!rcd->subctxt_cnt) { + /* ctxt is not shared */ + piocnt = rcd->piocnt; + piobufs = rcd->piobufs; + } else if (!subctxt_fp(fp)) { + /* caller is the master */ + piocnt = (rcd->piocnt / rcd->subctxt_cnt) + + (rcd->piocnt % rcd->subctxt_cnt); + piobufs = rcd->piobufs + + dd->palign * (rcd->piocnt - piocnt); + } else { + unsigned slave = subctxt_fp(fp) - 1; + + /* caller is a slave */ + piocnt = rcd->piocnt / rcd->subctxt_cnt; + piobufs = rcd->piobufs + dd->palign * piocnt * slave; + } + + if (pgaddr == ureg) + ret = mmap_ureg(vma, dd, ureg); + else if (pgaddr == piobufs) + ret = mmap_piobufs(vma, dd, rcd, piobufs, piocnt); + else if (pgaddr == dd->pioavailregs_phys) + /* in-memory copy of pioavail registers */ + ret = qib_mmap_mem(vma, rcd, PAGE_SIZE, + (void *) dd->pioavailregs_dma, 0, + "pioavail registers"); + else if (pgaddr == rcd->rcvegr_phys) + ret = mmap_rcvegrbufs(vma, rcd); + else if (pgaddr == (u64) rcd->rcvhdrq_phys) + /* + * The rcvhdrq itself; multiple pages, contiguous + * from an i/o perspective. Shared contexts need + * to map r/w, so we allow writing. + */ + ret = qib_mmap_mem(vma, rcd, rcd->rcvhdrq_size, + rcd->rcvhdrq, 1, "rcvhdrq"); + else if (pgaddr == (u64) rcd->rcvhdrqtailaddr_phys) + /* in-memory copy of rcvhdrq tail register */ + ret = qib_mmap_mem(vma, rcd, PAGE_SIZE, + rcd->rcvhdrtail_kvaddr, 0, + "rcvhdrq tail"); + else + match = 0; + if (!match) + ret = -EINVAL; + + vma->vm_private_data = NULL; + + if (ret < 0) + qib_devinfo(dd->pcidev, + "mmap Failure %d: off %llx len %lx\n", + -ret, (unsigned long long)pgaddr, + vma->vm_end - vma->vm_start); +bail: + return ret; +} + +static __poll_t qib_poll_urgent(struct qib_ctxtdata *rcd, + struct file *fp, + struct poll_table_struct *pt) +{ + struct qib_devdata *dd = rcd->dd; + __poll_t pollflag; + + poll_wait(fp, &rcd->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (rcd->urgent != rcd->urgent_poll) { + pollflag = EPOLLIN | EPOLLRDNORM; + rcd->urgent_poll = rcd->urgent; + } else { + pollflag = 0; + set_bit(QIB_CTXT_WAITING_URG, &rcd->flag); + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static __poll_t qib_poll_next(struct qib_ctxtdata *rcd, + struct file *fp, + struct poll_table_struct *pt) +{ + struct qib_devdata *dd = rcd->dd; + __poll_t pollflag; + + poll_wait(fp, &rcd->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (dd->f_hdrqempty(rcd)) { + set_bit(QIB_CTXT_WAITING_RCV, &rcd->flag); + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_ENB, rcd->ctxt); + pollflag = 0; + } else + pollflag = EPOLLIN | EPOLLRDNORM; + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static __poll_t qib_poll(struct file *fp, struct poll_table_struct *pt) +{ + struct qib_ctxtdata *rcd; + __poll_t pollflag; + + rcd = ctxt_fp(fp); + if (!rcd) + pollflag = EPOLLERR; + else if (rcd->poll_type == QIB_POLL_TYPE_URGENT) + pollflag = qib_poll_urgent(rcd, fp, pt); + else if 
(rcd->poll_type == QIB_POLL_TYPE_ANYRCV) + pollflag = qib_poll_next(rcd, fp, pt); + else /* invalid */ + pollflag = EPOLLERR; + + return pollflag; +} + +static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) +{ + struct qib_filedata *fd = fp->private_data; + const unsigned int weight = cpumask_weight(¤t->cpus_allowed); + const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); + int local_cpu; + + /* + * If process has NOT already set it's affinity, select and + * reserve a processor for it on the local NUMA node. + */ + if ((weight >= qib_cpulist_count) && + (cpumask_weight(local_mask) <= qib_cpulist_count)) { + for_each_cpu(local_cpu, local_mask) + if (!test_and_set_bit(local_cpu, qib_cpulist)) { + fd->rec_cpu_num = local_cpu; + return; + } + } + + /* + * If process has NOT already set it's affinity, select and + * reserve a processor for it, as a rendevous for all + * users of the driver. If they don't actually later + * set affinity to this cpu, or set it to some other cpu, + * it just means that sooner or later we don't recommend + * a cpu, and let the scheduler do it's best. + */ + if (weight >= qib_cpulist_count) { + int cpu; + + cpu = find_first_zero_bit(qib_cpulist, + qib_cpulist_count); + if (cpu == qib_cpulist_count) + qib_dev_err(dd, + "no cpus avail for affinity PID %u\n", + current->pid); + else { + __set_bit(cpu, qib_cpulist); + fd->rec_cpu_num = cpu; + } + } +} + +/* + * Check that userland and driver are compatible for subcontexts. + */ +static int qib_compatible_subctxts(int user_swmajor, int user_swminor) +{ + /* this code is written long-hand for clarity */ + if (QIB_USER_SWMAJOR != user_swmajor) { + /* no promise of compatibility if major mismatch */ + return 0; + } + if (QIB_USER_SWMAJOR == 1) { + switch (QIB_USER_SWMINOR) { + case 0: + case 1: + case 2: + /* no subctxt implementation so cannot be compatible */ + return 0; + case 3: + /* 3 is only compatible with itself */ + return user_swminor == 3; + default: + /* >= 4 are compatible (or are expected to be) */ + return user_swminor <= QIB_USER_SWMINOR; + } + } + /* make no promises yet for future major versions */ + return 0; +} + +static int init_subctxts(struct qib_devdata *dd, + struct qib_ctxtdata *rcd, + const struct qib_user_info *uinfo) +{ + int ret = 0; + unsigned num_subctxts; + size_t size; + + /* + * If the user is requesting zero subctxts, + * skip the subctxt allocation. + */ + if (uinfo->spu_subctxt_cnt <= 0) + goto bail; + num_subctxts = uinfo->spu_subctxt_cnt; + + /* Check for subctxt compatibility */ + if (!qib_compatible_subctxts(uinfo->spu_userversion >> 16, + uinfo->spu_userversion & 0xffff)) { + qib_devinfo(dd->pcidev, + "Mismatched user version (%d.%d) and driver version (%d.%d) while context sharing. Ensure that driver and library are from the same release.\n", + (int) (uinfo->spu_userversion >> 16), + (int) (uinfo->spu_userversion & 0xffff), + QIB_USER_SWMAJOR, QIB_USER_SWMINOR); + goto bail; + } + if (num_subctxts > QLOGIC_IB_MAX_SUBCTXT) { + ret = -EINVAL; + goto bail; + } + + rcd->subctxt_uregbase = vmalloc_user(PAGE_SIZE * num_subctxts); + if (!rcd->subctxt_uregbase) { + ret = -ENOMEM; + goto bail; + } + /* Note: rcd->rcvhdrq_size isn't initialized yet. 
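The compatibility rule spelled out in qib_compatible_subctxts() above can be restated as a standalone function for quick experimentation. SWMAJOR and SWMINOR below are placeholder constants, not the real QIB_USER_SWMAJOR/QIB_USER_SWMINOR values.

/* build: cc -std=c99 swversion.c */
#include <stdio.h>

#define SWMAJOR 1
#define SWMINOR 13

/*
 * Majors must match exactly; driver minors 0-2 had no subcontext support;
 * minor 3 only pairs with itself; 4 and later accept any user minor up to
 * their own.
 */
static int compatible_subctxts(int user_major, int user_minor)
{
	if (user_major != SWMAJOR)
		return 0;
	switch (SWMINOR) {
	case 0:
	case 1:
	case 2:
		return 0;
	case 3:
		return user_minor == 3;
	default:
		return user_minor <= SWMINOR;
	}
}

int main(void)
{
	printf("user 1.11 vs driver 1.%d: %s\n", SWMINOR,
	       compatible_subctxts(1, 11) ? "compatible" : "incompatible");
	printf("user 1.14 vs driver 1.%d: %s\n", SWMINOR,
	       compatible_subctxts(1, 14) ? "compatible" : "incompatible");
	return 0;
}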
*/ + size = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize * + sizeof(u32), PAGE_SIZE) * num_subctxts; + rcd->subctxt_rcvhdr_base = vmalloc_user(size); + if (!rcd->subctxt_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + rcd->subctxt_rcvegrbuf = vmalloc_user(rcd->rcvegrbuf_chunks * + rcd->rcvegrbuf_size * + num_subctxts); + if (!rcd->subctxt_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + + rcd->subctxt_cnt = uinfo->spu_subctxt_cnt; + rcd->subctxt_id = uinfo->spu_subctxt_id; + rcd->active_slaves = 1; + rcd->redirect_seq_cnt = 1; + set_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag); + goto bail; + +bail_rhdr: + vfree(rcd->subctxt_rcvhdr_base); +bail_ureg: + vfree(rcd->subctxt_uregbase); + rcd->subctxt_uregbase = NULL; +bail: + return ret; +} + +static int setup_ctxt(struct qib_pportdata *ppd, int ctxt, + struct file *fp, const struct qib_user_info *uinfo) +{ + struct qib_filedata *fd = fp->private_data; + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + void *ptmp = NULL; + int ret; + int numa_id; + + assign_ctxt_affinity(fp, dd); + + numa_id = qib_numa_aware ? ((fd->rec_cpu_num != -1) ? + cpu_to_node(fd->rec_cpu_num) : + numa_node_id()) : dd->assigned_node_id; + + rcd = qib_create_ctxtdata(ppd, ctxt, numa_id); + + /* + * Allocate memory for use in qib_tid_update() at open to + * reduce cost of expected send setup per message segment + */ + if (rcd) + ptmp = kmalloc(dd->rcvtidcnt * sizeof(u16) + + dd->rcvtidcnt * sizeof(struct page **), + GFP_KERNEL); + + if (!rcd || !ptmp) { + qib_dev_err(dd, + "Unable to allocate ctxtdata memory, failing open\n"); + ret = -ENOMEM; + goto bailerr; + } + rcd->userversion = uinfo->spu_userversion; + ret = init_subctxts(dd, rcd, uinfo); + if (ret) + goto bailerr; + rcd->tid_pg_list = ptmp; + rcd->pid = current->pid; + init_waitqueue_head(&dd->rcd[ctxt]->wait); + strlcpy(rcd->comm, current->comm, sizeof(rcd->comm)); + ctxt_fp(fp) = rcd; + qib_stats.sps_ctxts++; + dd->freectxts--; + ret = 0; + goto bail; + +bailerr: + if (fd->rec_cpu_num != -1) + __clear_bit(fd->rec_cpu_num, qib_cpulist); + + dd->rcd[ctxt] = NULL; + kfree(rcd); + kfree(ptmp); +bail: + return ret; +} + +static inline int usable(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + + return dd && (dd->flags & QIB_PRESENT) && dd->kregbase && ppd->lid && + (ppd->lflags & QIBL_LINKACTIVE); +} + +/* + * Select a context on the given device, either using a requested port + * or the port based on the context number. + */ +static int choose_port_ctxt(struct file *fp, struct qib_devdata *dd, u32 port, + const struct qib_user_info *uinfo) +{ + struct qib_pportdata *ppd = NULL; + int ret, ctxt; + + if (port) { + if (!usable(dd->pport + port - 1)) { + ret = -ENETDOWN; + goto done; + } else + ppd = dd->pport + port - 1; + } + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts && dd->rcd[ctxt]; + ctxt++) + ; + if (ctxt == dd->cfgctxts) { + ret = -EBUSY; + goto done; + } + if (!ppd) { + u32 pidx = ctxt % dd->num_pports; + + if (usable(dd->pport + pidx)) + ppd = dd->pport + pidx; + else { + for (pidx = 0; pidx < dd->num_pports && !ppd; + pidx++) + if (usable(dd->pport + pidx)) + ppd = dd->pport + pidx; + } + } + ret = ppd ? 
setup_ctxt(ppd, ctxt, fp, uinfo) : -ENETDOWN; +done: + return ret; +} + +static int find_free_ctxt(int unit, struct file *fp, + const struct qib_user_info *uinfo) +{ + struct qib_devdata *dd = qib_lookup(unit); + int ret; + + if (!dd || (uinfo->spu_port && uinfo->spu_port > dd->num_pports)) + ret = -ENODEV; + else + ret = choose_port_ctxt(fp, dd, uinfo->spu_port, uinfo); + + return ret; +} + +static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo, + unsigned alg) +{ + struct qib_devdata *udd = NULL; + int ret = 0, devmax, npresent, nup, ndev, dusable = 0, i; + u32 port = uinfo->spu_port, ctxt; + + devmax = qib_count_units(&npresent, &nup); + if (!npresent) { + ret = -ENXIO; + goto done; + } + if (nup == 0) { + ret = -ENETDOWN; + goto done; + } + + if (alg == QIB_PORT_ALG_ACROSS) { + unsigned inuse = ~0U; + + /* find device (with ACTIVE ports) with fewest ctxts in use */ + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + unsigned cused = 0, cfree = 0, pusable = 0; + + if (!dd) + continue; + if (port && port <= dd->num_pports && + usable(dd->pport + port - 1)) + pusable = 1; + else + for (i = 0; i < dd->num_pports; i++) + if (usable(dd->pport + i)) + pusable++; + if (!pusable) + continue; + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; + ctxt++) + if (dd->rcd[ctxt]) + cused++; + else + cfree++; + if (cfree && cused < inuse) { + udd = dd; + inuse = cused; + } + } + if (udd) { + ret = choose_port_ctxt(fp, udd, port, uinfo); + goto done; + } + } else { + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + + if (dd) { + ret = choose_port_ctxt(fp, dd, port, uinfo); + if (!ret) + goto done; + if (ret == -EBUSY) + dusable++; + } + } + } + ret = dusable ? -EBUSY : -ENETDOWN; + +done: + return ret; +} + +static int find_shared_ctxt(struct file *fp, + const struct qib_user_info *uinfo) +{ + int devmax, ndev, i; + int ret = 0; + + devmax = qib_count_units(NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + + /* device portion of usable() */ + if (!(dd && (dd->flags & QIB_PRESENT) && dd->kregbase)) + continue; + for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) { + struct qib_ctxtdata *rcd = dd->rcd[i]; + + /* Skip ctxts which are not yet open */ + if (!rcd || !rcd->cnt) + continue; + /* Skip ctxt if it doesn't match the requested one */ + if (rcd->subctxt_id != uinfo->spu_subctxt_id) + continue; + /* Verify the sharing process matches the master */ + if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt || + rcd->userversion != uinfo->spu_userversion || + rcd->cnt >= rcd->subctxt_cnt) { + ret = -EINVAL; + goto done; + } + ctxt_fp(fp) = rcd; + subctxt_fp(fp) = rcd->cnt++; + rcd->subpid[subctxt_fp(fp)] = current->pid; + tidcursor_fp(fp) = 0; + rcd->active_slaves |= 1 << subctxt_fp(fp); + ret = 1; + goto done; + } + } + +done: + return ret; +} + +static int qib_open(struct inode *in, struct file *fp) +{ + /* The real work is performed later in qib_assign_ctxt() */ + fp->private_data = kzalloc(sizeof(struct qib_filedata), GFP_KERNEL); + if (fp->private_data) /* no cpu affinity by default */ + ((struct qib_filedata *)fp->private_data)->rec_cpu_num = -1; + return fp->private_data ? 
0 : -ENOMEM; +} + +static int find_hca(unsigned int cpu, int *unit) +{ + int ret = 0, devmax, npresent, nup, ndev; + + *unit = -1; + + devmax = qib_count_units(&npresent, &nup); + if (!npresent) { + ret = -ENXIO; + goto done; + } + if (!nup) { + ret = -ENETDOWN; + goto done; + } + for (ndev = 0; ndev < devmax; ndev++) { + struct qib_devdata *dd = qib_lookup(ndev); + + if (dd) { + if (pcibus_to_node(dd->pcidev->bus) < 0) { + ret = -EINVAL; + goto done; + } + if (cpu_to_node(cpu) == + pcibus_to_node(dd->pcidev->bus)) { + *unit = ndev; + goto done; + } + } + } +done: + return ret; +} + +static int do_qib_user_sdma_queue_create(struct file *fp) +{ + struct qib_filedata *fd = fp->private_data; + struct qib_ctxtdata *rcd = fd->rcd; + struct qib_devdata *dd = rcd->dd; + + if (dd->flags & QIB_HAS_SEND_DMA) { + + fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev, + dd->unit, + rcd->ctxt, + fd->subctxt); + if (!fd->pq) + return -ENOMEM; + } + + return 0; +} + +/* + * Get ctxt early, so can set affinity prior to memory allocation. + */ +static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) +{ + int ret; + int i_minor; + unsigned swmajor, swminor, alg = QIB_PORT_ALG_ACROSS; + + /* Check to be sure we haven't already initialized this file */ + if (ctxt_fp(fp)) { + ret = -EINVAL; + goto done; + } + + /* for now, if major version is different, bail */ + swmajor = uinfo->spu_userversion >> 16; + if (swmajor != QIB_USER_SWMAJOR) { + ret = -ENODEV; + goto done; + } + + swminor = uinfo->spu_userversion & 0xffff; + + if (swminor >= 11 && uinfo->spu_port_alg < QIB_PORT_ALG_COUNT) + alg = uinfo->spu_port_alg; + + mutex_lock(&qib_mutex); + + if (qib_compatible_subctxts(swmajor, swminor) && + uinfo->spu_subctxt_cnt) { + ret = find_shared_ctxt(fp, uinfo); + if (ret > 0) { + ret = do_qib_user_sdma_queue_create(fp); + if (!ret) + assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd); + goto done_ok; + } + } + + i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE; + if (i_minor) + ret = find_free_ctxt(i_minor - 1, fp, uinfo); + else { + int unit; + const unsigned int cpu = cpumask_first(¤t->cpus_allowed); + const unsigned int weight = + cpumask_weight(¤t->cpus_allowed); + + if (weight == 1 && !test_bit(cpu, qib_cpulist)) + if (!find_hca(cpu, &unit) && unit >= 0) + if (!find_free_ctxt(unit, fp, uinfo)) { + ret = 0; + goto done_chk_sdma; + } + ret = get_a_ctxt(fp, uinfo, alg); + } + +done_chk_sdma: + if (!ret) + ret = do_qib_user_sdma_queue_create(fp); +done_ok: + mutex_unlock(&qib_mutex); + +done: + return ret; +} + + +static int qib_do_user_init(struct file *fp, + const struct qib_user_info *uinfo) +{ + int ret; + struct qib_ctxtdata *rcd = ctxt_fp(fp); + struct qib_devdata *dd; + unsigned uctxt; + + /* Subctxts don't need to initialize anything since master did it. */ + if (subctxt_fp(fp)) { + ret = wait_event_interruptible(rcd->wait, + !test_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag)); + goto bail; + } + + dd = rcd->dd; + + /* some ctxts may get extra buffers, calculate that here */ + uctxt = rcd->ctxt - dd->first_user_ctxt; + if (uctxt < dd->ctxts_extrabuf) { + rcd->piocnt = dd->pbufsctxt + 1; + rcd->pio_base = rcd->piocnt * uctxt; + } else { + rcd->piocnt = dd->pbufsctxt; + rcd->pio_base = rcd->piocnt * uctxt + + dd->ctxts_extrabuf; + } + + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. Can't use piobufbase directly, because it has + * both 2K and 4K buffer base values. So check and handle. 
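The "extra buffers" arithmetic in qib_do_user_init() below distributes the 2KB send buffers so that the first few contexts get one buffer more than the rest, and each context's base follows from the counts before it. A self-contained check of that arithmetic with invented numbers:

/* build: cc -std=c99 pio_split.c */
#include <stdio.h>

/*
 * With nbufs send buffers and nctxts user contexts, the first
 * nbufs % nctxts contexts get one buffer more than the others, and a
 * context's base offset is the sum of the counts of everything before it.
 */
static void split(unsigned nbufs, unsigned nctxts)
{
	unsigned per = nbufs / nctxts;
	unsigned extra = nbufs % nctxts;   /* dd->ctxts_extrabuf analogue */

	for (unsigned u = 0; u < nctxts; u++) {
		unsigned cnt = per + (u < extra ? 1 : 0);
		unsigned base = u < extra ? cnt * u
					  : per * u + extra;
		printf("ctxt %u: %u buffers at offset %u\n", u, cnt, base);
	}
}

int main(void)
{
	split(10, 4);    /* prints counts 3,3,2,2 with bases 0,3,6,8 */
	return 0;
}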
+ */ + if ((rcd->pio_base + rcd->piocnt) > dd->piobcnt2k) { + if (rcd->pio_base >= dd->piobcnt2k) { + qib_dev_err(dd, + "%u:ctxt%u: no 2KB buffers available\n", + dd->unit, rcd->ctxt); + ret = -ENOBUFS; + goto bail; + } + rcd->piocnt = dd->piobcnt2k - rcd->pio_base; + qib_dev_err(dd, "Ctxt%u: would use 4KB bufs, using %u\n", + rcd->ctxt, rcd->piocnt); + } + + rcd->piobufs = dd->pio2k_bufbase + rcd->pio_base * dd->palign; + qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt, + TXCHK_CHG_TYPE_USER, rcd); + /* + * try to ensure that processes start up with consistent avail update + * for their own range, at least. If system very quiet, it might + * have the in-memory copy out of date at startup for this range of + * buffers, when a context gets re-used. Do after the chg_pioavail + * and before the rest of setup, so it's "almost certain" the dma + * will have occurred (can't 100% guarantee, but should be many + * decimals of 9s, with this ordering), given how much else happens + * after this. + */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + + /* + * Now allocate the rcvhdr Q and eager TIDs; skip the TID + * array for time being. If rcd->ctxt > chip-supported, + * we need to do extra stuff here to handle by handling overflow + * through ctxt 0, someday + */ + ret = qib_create_rcvhdrq(dd, rcd); + if (!ret) + ret = qib_setup_eagerbufs(rcd); + if (ret) + goto bail_pio; + + rcd->tidcursor = 0; /* start at beginning after open */ + + /* initialize poll variables... */ + rcd->urgent = 0; + rcd->urgent_poll = 0; + + /* + * Now enable the ctxt for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ctxts, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + if (rcd->rcvhdrtail_kvaddr) + qib_clear_rcvhdrtail(rcd); + + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_TIDFLOW_ENB, + rcd->ctxt); + + /* Notify any waiting slaves */ + if (rcd->subctxt_cnt) { + clear_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag); + wake_up(&rcd->wait); + } + return 0; + +bail_pio: + qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt, + TXCHK_CHG_TYPE_KERN, rcd); +bail: + return ret; +} + +/** + * unlock_exptid - unlock any expected TID entries context still had in use + * @rcd: ctxt + * + * We don't actually update the chip here, because we do a bulk update + * below, using f_clear_tids. 
+ */ +static void unlock_expected_tids(struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + int ctxt_tidbase = rcd->ctxt * dd->rcvtidcnt; + int i, cnt = 0, maxtid = ctxt_tidbase + dd->rcvtidcnt; + + for (i = ctxt_tidbase; i < maxtid; i++) { + struct page *p = dd->pageshadow[i]; + dma_addr_t phys; + + if (!p) + continue; + + phys = dd->physshadow[i]; + dd->physshadow[i] = dd->tidinvalid; + dd->pageshadow[i] = NULL; + pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + qib_release_user_pages(&p, 1); + cnt++; + } +} + +static int qib_close(struct inode *in, struct file *fp) +{ + int ret = 0; + struct qib_filedata *fd; + struct qib_ctxtdata *rcd; + struct qib_devdata *dd; + unsigned long flags; + unsigned ctxt; + + mutex_lock(&qib_mutex); + + fd = fp->private_data; + fp->private_data = NULL; + rcd = fd->rcd; + if (!rcd) { + mutex_unlock(&qib_mutex); + goto bail; + } + + dd = rcd->dd; + + /* ensure all pio buffer writes in progress are flushed */ + qib_flush_wc(); + + /* drain user sdma queue */ + if (fd->pq) { + qib_user_sdma_queue_drain(rcd->ppd, fd->pq); + qib_user_sdma_queue_destroy(fd->pq); + } + + if (fd->rec_cpu_num != -1) + __clear_bit(fd->rec_cpu_num, qib_cpulist); + + if (--rcd->cnt) { + /* + * XXX If the master closes the context before the slave(s), + * revoke the mmap for the eager receive queue so + * the slave(s) don't wait for receive data forever. + */ + rcd->active_slaves &= ~(1 << fd->subctxt); + rcd->subpid[fd->subctxt] = 0; + mutex_unlock(&qib_mutex); + goto bail; + } + + /* early; no interrupt users after this */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + ctxt = rcd->ctxt; + dd->rcd[ctxt] = NULL; + rcd->pid = 0; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + if (rcd->rcvwait_to || rcd->piowait_to || + rcd->rcvnowait || rcd->pionowait) { + rcd->rcvwait_to = 0; + rcd->piowait_to = 0; + rcd->rcvnowait = 0; + rcd->pionowait = 0; + } + if (rcd->flag) + rcd->flag = 0; + + if (dd->kregbase) { + /* atomically clear receive enable ctxt and intr avail. */ + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS, ctxt); + + /* clean up the pkeys for this ctxt user */ + qib_clean_part_key(rcd, dd); + qib_disarm_piobufs(dd, rcd->pio_base, rcd->piocnt); + qib_chg_pioavailkernel(dd, rcd->pio_base, + rcd->piocnt, TXCHK_CHG_TYPE_KERN, NULL); + + dd->f_clear_tids(dd, rcd); + + if (dd->pageshadow) + unlock_expected_tids(rcd); + qib_stats.sps_ctxts--; + dd->freectxts++; + } + + mutex_unlock(&qib_mutex); + qib_free_ctxtdata(dd, rcd); /* after releasing the mutex */ + +bail: + kfree(fd); + return ret; +} + +static int qib_ctxt_info(struct file *fp, struct qib_ctxt_info __user *uinfo) +{ + struct qib_ctxt_info info; + int ret; + size_t sz; + struct qib_ctxtdata *rcd = ctxt_fp(fp); + struct qib_filedata *fd; + + fd = fp->private_data; + + info.num_active = qib_count_active_units(); + info.unit = rcd->dd->unit; + info.port = rcd->ppd->port; + info.ctxt = rcd->ctxt; + info.subctxt = subctxt_fp(fp); + /* Number of user ctxts available for this device. 
*/ + info.num_ctxts = rcd->dd->cfgctxts - rcd->dd->first_user_ctxt; + info.num_subctxts = rcd->subctxt_cnt; + info.rec_cpu = fd->rec_cpu_num; + sz = sizeof(info); + + if (copy_to_user(uinfo, &info, sz)) { + ret = -EFAULT; + goto bail; + } + ret = 0; + +bail: + return ret; +} + +static int qib_sdma_get_inflight(struct qib_user_sdma_queue *pq, + u32 __user *inflightp) +{ + const u32 val = qib_user_sdma_inflight_counter(pq); + + if (put_user(val, inflightp)) + return -EFAULT; + + return 0; +} + +static int qib_sdma_get_complete(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq, + u32 __user *completep) +{ + u32 val; + int err; + + if (!pq) + return -EINVAL; + + err = qib_user_sdma_make_progress(ppd, pq); + if (err < 0) + return err; + + val = qib_user_sdma_complete_counter(pq); + if (put_user(val, completep)) + return -EFAULT; + + return 0; +} + +static int disarm_req_delay(struct qib_ctxtdata *rcd) +{ + int ret = 0; + + if (!usable(rcd->ppd)) { + int i; + /* + * if link is down, or otherwise not usable, delay + * the caller up to 30 seconds, so we don't thrash + * in trying to get the chip back to ACTIVE, and + * set flag so they make the call again. + */ + if (rcd->user_event_mask) { + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + for (i = 0; !usable(rcd->ppd) && i < 300; i++) + msleep(100); + ret = -ENETDOWN; + } + return ret; +} + +/* + * Find all user contexts in use, and set the specified bit in their + * event mask. + * See also find_ctxt() for a similar use, that is specific to send buffers. + */ +int qib_set_uevent_bits(struct qib_pportdata *ppd, const int evtbit) +{ + struct qib_ctxtdata *rcd; + unsigned ctxt; + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&ppd->dd->uctxt_lock, flags); + for (ctxt = ppd->dd->first_user_ctxt; ctxt < ppd->dd->cfgctxts; + ctxt++) { + rcd = ppd->dd->rcd[ctxt]; + if (!rcd) + continue; + if (rcd->user_event_mask) { + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(evtbit, &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(evtbit, &rcd->user_event_mask[i]); + } + ret = 1; + break; + } + spin_unlock_irqrestore(&ppd->dd->uctxt_lock, flags); + + return ret; +} + +/* + * clear the event notifier events for this context. + * For the DISARM_BUFS case, we also take action (this obsoletes + * the older QIB_CMD_DISARM_BUFS, but we keep it for backwards + * compatibility. + * Other bits don't currently require actions, just atomically clear. + * User process then performs actions appropriate to bit having been + * set, if desired, and checks again in future. 
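The acknowledgement flow described just below, reduced to a runnable sketch: every requested bit is cleared from the context's event mask except the disarm bit, which instead triggers an action (its bookkeeping is handled separately). The bit numbers and names here are placeholders, not the ABI values.

/* build: cc -std=c99 event_ack.c */
#include <stdio.h>

#define EVT_DISARM_BUFS 0
#define EVT_LINKDOWN    1
#define MAX_EVENT_BIT   1

static unsigned long user_event_mask;   /* what the driver set for this ctxt */

/*
 * Walk the bits the caller wants acknowledged: the disarm bit triggers an
 * action, every other requested bit is simply cleared from the mask.
 */
static void ack_events(unsigned long events)
{
	for (int i = 0; i <= MAX_EVENT_BIT; i++) {
		if (!(events & (1UL << i)))
			continue;
		if (i == EVT_DISARM_BUFS)
			printf("disarming send buffers\n");
		else
			user_event_mask &= ~(1UL << i);
	}
}

int main(void)
{
	user_event_mask = (1UL << EVT_DISARM_BUFS) | (1UL << EVT_LINKDOWN);
	ack_events(user_event_mask);
	printf("remaining mask: %#lx\n", user_event_mask);
	return 0;
}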
+ */ +static int qib_user_event_ack(struct qib_ctxtdata *rcd, int subctxt, + unsigned long events) +{ + int ret = 0, i; + + for (i = 0; i <= _QIB_MAX_EVENT_BIT; i++) { + if (!test_bit(i, &events)) + continue; + if (i == _QIB_EVENT_DISARM_BUFS_BIT) { + (void)qib_disarm_piobufs_ifneeded(rcd); + ret = disarm_req_delay(rcd); + } else + clear_bit(i, &rcd->user_event_mask[subctxt]); + } + return ret; +} + +static ssize_t qib_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + const struct qib_cmd __user *ucmd; + struct qib_ctxtdata *rcd; + const void __user *src; + size_t consumed, copy = 0; + struct qib_cmd cmd; + ssize_t ret = 0; + void *dest; + + if (!ib_safe_file_access(fp)) { + pr_err_once("qib_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", + task_tgid_vnr(current), current->comm); + return -EACCES; + } + + if (count < sizeof(cmd.type)) { + ret = -EINVAL; + goto bail; + } + + ucmd = (const struct qib_cmd __user *) data; + + if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) { + ret = -EFAULT; + goto bail; + } + + consumed = sizeof(cmd.type); + + switch (cmd.type) { + case QIB_CMD_ASSIGN_CTXT: + case QIB_CMD_USER_INIT: + copy = sizeof(cmd.cmd.user_info); + dest = &cmd.cmd.user_info; + src = &ucmd->cmd.user_info; + break; + + case QIB_CMD_RECV_CTRL: + copy = sizeof(cmd.cmd.recv_ctrl); + dest = &cmd.cmd.recv_ctrl; + src = &ucmd->cmd.recv_ctrl; + break; + + case QIB_CMD_CTXT_INFO: + copy = sizeof(cmd.cmd.ctxt_info); + dest = &cmd.cmd.ctxt_info; + src = &ucmd->cmd.ctxt_info; + break; + + case QIB_CMD_TID_UPDATE: + case QIB_CMD_TID_FREE: + copy = sizeof(cmd.cmd.tid_info); + dest = &cmd.cmd.tid_info; + src = &ucmd->cmd.tid_info; + break; + + case QIB_CMD_SET_PART_KEY: + copy = sizeof(cmd.cmd.part_key); + dest = &cmd.cmd.part_key; + src = &ucmd->cmd.part_key; + break; + + case QIB_CMD_DISARM_BUFS: + case QIB_CMD_PIOAVAILUPD: /* force an update of PIOAvail reg */ + copy = 0; + src = NULL; + dest = NULL; + break; + + case QIB_CMD_POLL_TYPE: + copy = sizeof(cmd.cmd.poll_type); + dest = &cmd.cmd.poll_type; + src = &ucmd->cmd.poll_type; + break; + + case QIB_CMD_ARMLAUNCH_CTRL: + copy = sizeof(cmd.cmd.armlaunch_ctrl); + dest = &cmd.cmd.armlaunch_ctrl; + src = &ucmd->cmd.armlaunch_ctrl; + break; + + case QIB_CMD_SDMA_INFLIGHT: + copy = sizeof(cmd.cmd.sdma_inflight); + dest = &cmd.cmd.sdma_inflight; + src = &ucmd->cmd.sdma_inflight; + break; + + case QIB_CMD_SDMA_COMPLETE: + copy = sizeof(cmd.cmd.sdma_complete); + dest = &cmd.cmd.sdma_complete; + src = &ucmd->cmd.sdma_complete; + break; + + case QIB_CMD_ACK_EVENT: + copy = sizeof(cmd.cmd.event_mask); + dest = &cmd.cmd.event_mask; + src = &ucmd->cmd.event_mask; + break; + + default: + ret = -EINVAL; + goto bail; + } + + if (copy) { + if ((count - consumed) < copy) { + ret = -EINVAL; + goto bail; + } + if (copy_from_user(dest, src, copy)) { + ret = -EFAULT; + goto bail; + } + consumed += copy; + } + + rcd = ctxt_fp(fp); + if (!rcd && cmd.type != QIB_CMD_ASSIGN_CTXT) { + ret = -EINVAL; + goto bail; + } + + switch (cmd.type) { + case QIB_CMD_ASSIGN_CTXT: + if (rcd) { + ret = -EINVAL; + goto bail; + } + + ret = qib_assign_ctxt(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + break; + + case QIB_CMD_USER_INIT: + ret = qib_do_user_init(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + ret = qib_get_base_info(fp, u64_to_user_ptr( + cmd.cmd.user_info.spu_base_info), + cmd.cmd.user_info.spu_base_info_size); + break; + + case QIB_CMD_RECV_CTRL: + ret = 
qib_manage_rcvq(rcd, subctxt_fp(fp), cmd.cmd.recv_ctrl); + break; + + case QIB_CMD_CTXT_INFO: + ret = qib_ctxt_info(fp, (struct qib_ctxt_info __user *) + (unsigned long) cmd.cmd.ctxt_info); + break; + + case QIB_CMD_TID_UPDATE: + ret = qib_tid_update(rcd, fp, &cmd.cmd.tid_info); + break; + + case QIB_CMD_TID_FREE: + ret = qib_tid_free(rcd, subctxt_fp(fp), &cmd.cmd.tid_info); + break; + + case QIB_CMD_SET_PART_KEY: + ret = qib_set_part_key(rcd, cmd.cmd.part_key); + break; + + case QIB_CMD_DISARM_BUFS: + (void)qib_disarm_piobufs_ifneeded(rcd); + ret = disarm_req_delay(rcd); + break; + + case QIB_CMD_PIOAVAILUPD: + qib_force_pio_avail_update(rcd->dd); + break; + + case QIB_CMD_POLL_TYPE: + rcd->poll_type = cmd.cmd.poll_type; + break; + + case QIB_CMD_ARMLAUNCH_CTRL: + rcd->dd->f_set_armlaunch(rcd->dd, cmd.cmd.armlaunch_ctrl); + break; + + case QIB_CMD_SDMA_INFLIGHT: + ret = qib_sdma_get_inflight(user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_inflight); + break; + + case QIB_CMD_SDMA_COMPLETE: + ret = qib_sdma_get_complete(rcd->ppd, + user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_complete); + break; + + case QIB_CMD_ACK_EVENT: + ret = qib_user_event_ack(rcd, subctxt_fp(fp), + cmd.cmd.event_mask); + break; + } + + if (ret >= 0) + ret = consumed; + +bail: + return ret; +} + +static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct qib_filedata *fp = iocb->ki_filp->private_data; + struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp); + struct qib_user_sdma_queue *pq = fp->pq; + + if (!iter_is_iovec(from) || !from->nr_segs || !pq) + return -EINVAL; + + return qib_user_sdma_writev(rcd, pq, from->iov, from->nr_segs); +} + +static struct class *qib_class; +static dev_t qib_dev; + +int qib_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) +{ + const dev_t dev = MKDEV(MAJOR(qib_dev), minor); + struct cdev *cdev; + struct device *device = NULL; + int ret; + + cdev = cdev_alloc(); + if (!cdev) { + pr_err("Could not allocate cdev for minor %d, %s\n", + minor, name); + ret = -ENOMEM; + goto done; + } + + cdev->owner = THIS_MODULE; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + pr_err("Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + device = device_create(qib_class, NULL, dev, NULL, "%s", name); + if (!IS_ERR(device)) + goto done; + ret = PTR_ERR(device); + device = NULL; + pr_err("Could not create device for minor %d, %s (err %d)\n", + minor, name, -ret); +err_cdev: + cdev_del(cdev); + cdev = NULL; +done: + *cdevp = cdev; + *devp = device; + return ret; +} + +void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp) +{ + struct device *device = *devp; + + if (device) { + device_unregister(device); + *devp = NULL; + } + + if (*cdevp) { + cdev_del(*cdevp); + *cdevp = NULL; + } +} + +static struct cdev *wildcard_cdev; +static struct device *wildcard_device; + +int __init qib_dev_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&qib_dev, 0, QIB_NMINORS, QIB_DRV_NAME); + if (ret < 0) { + pr_err("Could not allocate chrdev region (err %d)\n", -ret); + goto done; + } + + qib_class = class_create(THIS_MODULE, "ipath"); + if (IS_ERR(qib_class)) { + ret = PTR_ERR(qib_class); + pr_err("Could not create device class (err %d)\n", -ret); + unregister_chrdev_region(qib_dev, QIB_NMINORS); + } + +done: + return ret; +} + +void qib_dev_cleanup(void) +{ + 
if (qib_class) { + class_destroy(qib_class); + qib_class = NULL; + } + + unregister_chrdev_region(qib_dev, QIB_NMINORS); +} + +static atomic_t user_count = ATOMIC_INIT(0); + +static void qib_user_remove(struct qib_devdata *dd) +{ + if (atomic_dec_return(&user_count) == 0) + qib_cdev_cleanup(&wildcard_cdev, &wildcard_device); + + qib_cdev_cleanup(&dd->user_cdev, &dd->user_device); +} + +static int qib_user_add(struct qib_devdata *dd) +{ + char name[10]; + int ret; + + if (atomic_inc_return(&user_count) == 1) { + ret = qib_cdev_init(0, "ipath", &qib_file_ops, + &wildcard_cdev, &wildcard_device); + if (ret) + goto done; + } + + snprintf(name, sizeof(name), "ipath%d", dd->unit); + ret = qib_cdev_init(dd->unit + 1, name, &qib_file_ops, + &dd->user_cdev, &dd->user_device); + if (ret) + qib_user_remove(dd); +done: + return ret; +} + +/* + * Create per-unit files in /dev + */ +int qib_device_create(struct qib_devdata *dd) +{ + int r, ret; + + r = qib_user_add(dd); + ret = qib_diag_add(dd); + if (r && !ret) + ret = r; + return ret; +} + +/* + * Remove per-unit files in /dev + * void, core kernel returns no errors for this stuff + */ +void qib_device_remove(struct qib_devdata *dd) +{ + qib_user_remove(dd); + qib_diag_remove(dd); +} diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c new file mode 100644 index 0000000..1d940a2 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +#define QIBFS_MAGIC 0x726a77 + +static struct super_block *qib_super; + +#define private2dd(file) (file_inode(file)->i_private) + +static int qibfs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct file_operations *fops, + void *data) +{ + int error; + struct inode *inode = new_inode(dir->i_sb); + + if (!inode) { + error = -EPERM; + goto bail; + } + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_atime = current_time(inode); + inode->i_mtime = inode->i_atime; + inode->i_ctime = inode->i_atime; + inode->i_private = data; + if (S_ISDIR(mode)) { + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); + inc_nlink(dir); + } + + inode->i_fop = fops; + + d_instantiate(dentry, inode); + error = 0; + +bail: + return error; +} + +static int create_file(const char *name, umode_t mode, + struct dentry *parent, struct dentry **dentry, + const struct file_operations *fops, void *data) +{ + int error; + + inode_lock(d_inode(parent)); + *dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(*dentry)) + error = qibfs_mknod(d_inode(parent), *dentry, + mode, fops, data); + else + error = PTR_ERR(*dentry); + inode_unlock(d_inode(parent)); + + return error; +} + +static ssize_t driver_stats_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + qib_stats.sps_ints = qib_sps_ints(); + return simple_read_from_buffer(buf, count, ppos, &qib_stats, + sizeof(qib_stats)); +} + +/* + * driver stats field names, one line per stat, single string. Used by + * programs like ipathstats to print the stats in a way which works for + * different versions of drivers, without changing program source. + * if qlogic_ib_stats changes, this needs to change. Names need to be + * 12 chars or less (w/o newline), for proper display by ipathstats utility. 
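The pairing that the comment below relies on, shown as a tiny userspace example: the names file is one label per line and the stats file is an array of u64 in the same order, so a tool can print labelled counters without hard-coding any names. The labels used here are the first three from qib_statnames; the values are invented.

/* build: cc -std=c99 statnames.c */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static const char names[] = "KernIntr\nErrorIntr\nTx_Errs\n";
static const uint64_t stats[] = { 12, 0, 3 };

int main(void)
{
	char buf[sizeof(names)];
	char *line;
	size_t i = 0;

	/* walk the newline-separated names and the counter array in step */
	memcpy(buf, names, sizeof(buf));
	for (line = strtok(buf, "\n");
	     line && i < sizeof(stats) / sizeof(stats[0]);
	     line = strtok(NULL, "\n"), i++)
		printf("%-12s %llu\n", line, (unsigned long long)stats[i]);
	return 0;
}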
+ */ +static const char qib_statnames[] = + "KernIntr\n" + "ErrorIntr\n" + "Tx_Errs\n" + "Rcv_Errs\n" + "H/W_Errs\n" + "NoPIOBufs\n" + "CtxtsOpen\n" + "RcvLen_Errs\n" + "EgrBufFull\n" + "EgrHdrFull\n" + ; + +static ssize_t driver_names_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return simple_read_from_buffer(buf, count, ppos, qib_statnames, + sizeof(qib_statnames) - 1); /* no null */ +} + +static const struct file_operations driver_ops[] = { + { .read = driver_stats_read, .llseek = generic_file_llseek, }, + { .read = driver_names_read, .llseek = generic_file_llseek, }, +}; + +/* read the per-device counters */ +static ssize_t dev_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_cntrs(dd, *ppos, NULL, &counters); + return simple_read_from_buffer(buf, count, ppos, counters, avail); +} + +/* read the per-device counters */ +static ssize_t dev_names_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_cntrs(dd, *ppos, &names, NULL); + return simple_read_from_buffer(buf, count, ppos, names, avail); +} + +static const struct file_operations cntr_ops[] = { + { .read = dev_counters_read, .llseek = generic_file_llseek, }, + { .read = dev_names_read, .llseek = generic_file_llseek, }, +}; + +/* + * Could use file_inode(file)->i_ino to figure out which file, + * instead of separate routine for each, but for now, this works... + */ + +/* read the per-port names (same for each port) */ +static ssize_t portnames_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_portcntrs(dd, *ppos, 0, &names, NULL); + return simple_read_from_buffer(buf, count, ppos, names, avail); +} + +/* read the per-port counters for port 1 (pidx 0) */ +static ssize_t portcntrs_1_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_portcntrs(dd, *ppos, 0, NULL, &counters); + return simple_read_from_buffer(buf, count, ppos, counters, avail); +} + +/* read the per-port counters for port 2 (pidx 1) */ +static ssize_t portcntrs_2_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct qib_devdata *dd = private2dd(file); + + avail = dd->f_read_portcntrs(dd, *ppos, 1, NULL, &counters); + return simple_read_from_buffer(buf, count, ppos, counters, avail); +} + +static const struct file_operations portcntr_ops[] = { + { .read = portnames_read, .llseek = generic_file_llseek, }, + { .read = portcntrs_1_read, .llseek = generic_file_llseek, }, + { .read = portcntrs_2_read, .llseek = generic_file_llseek, }, +}; + +/* + * read the per-port QSFP data for port 1 (pidx 0) + */ +static ssize_t qsfp_1_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd = private2dd(file); + char *tmp; + int ret; + + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = qib_qsfp_dump(dd->pport, tmp, PAGE_SIZE); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, ppos, tmp, ret); + kfree(tmp); + return ret; +} + +/* + * read the per-port QSFP data for port 2 (pidx 1) + */ +static ssize_t qsfp_2_read(struct 
file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd = private2dd(file); + char *tmp; + int ret; + + if (dd->num_pports < 2) + return -ENODEV; + + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = qib_qsfp_dump(dd->pport + 1, tmp, PAGE_SIZE); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, ppos, tmp, ret); + kfree(tmp); + return ret; +} + +static const struct file_operations qsfp_ops[] = { + { .read = qsfp_1_read, .llseek = generic_file_llseek, }, + { .read = qsfp_2_read, .llseek = generic_file_llseek, }, +}; + +static ssize_t flash_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if (pos < 0) { + ret = -EINVAL; + goto bail; + } + + if (pos >= sizeof(struct qib_flash)) { + ret = 0; + goto bail; + } + + if (count > sizeof(struct qib_flash) - pos) + count = sizeof(struct qib_flash) - pos; + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + dd = private2dd(file); + if (qib_eeprom_read(dd, pos, tmp, count)) { + qib_dev_err(dd, "failed to read from flash\n"); + ret = -ENXIO; + goto bail_tmp; + } + + if (copy_to_user(buf, tmp, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static ssize_t flash_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct qib_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if (pos != 0 || count != sizeof(struct qib_flash)) + return -EINVAL; + + tmp = memdup_user(buf, count); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + dd = private2dd(file); + if (qib_eeprom_write(dd, pos, tmp, count)) { + ret = -ENXIO; + qib_dev_err(dd, "failed to write to flash\n"); + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + return ret; +} + +static const struct file_operations flash_ops = { + .read = flash_read, + .write = flash_write, + .llseek = default_llseek, +}; + +static int add_cntr_files(struct super_block *sb, struct qib_devdata *dd) +{ + struct dentry *dir, *tmp; + char unit[10]; + int ret, i; + + /* create the per-unit directory */ + snprintf(unit, sizeof(unit), "%u", dd->unit); + ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir, + &simple_dir_operations, dd); + if (ret) { + pr_err("create_file(%s) failed: %d\n", unit, ret); + goto bail; + } + + /* create the files in the new directory */ + ret = create_file("counters", S_IFREG|S_IRUGO, dir, &tmp, + &cntr_ops[0], dd); + if (ret) { + pr_err("create_file(%s/counters) failed: %d\n", + unit, ret); + goto bail; + } + ret = create_file("counter_names", S_IFREG|S_IRUGO, dir, &tmp, + &cntr_ops[1], dd); + if (ret) { + pr_err("create_file(%s/counter_names) failed: %d\n", + unit, ret); + goto bail; + } + ret = create_file("portcounter_names", S_IFREG|S_IRUGO, dir, &tmp, + &portcntr_ops[0], dd); + if (ret) { + pr_err("create_file(%s/%s) failed: %d\n", + unit, "portcounter_names", ret); + goto bail; + } + for (i = 1; i <= dd->num_pports; i++) { + char fname[24]; + + sprintf(fname, "port%dcounters", i); + /* create the files in the new directory */ + ret = create_file(fname, S_IFREG|S_IRUGO, dir, &tmp, + &portcntr_ops[i], dd); + if (ret) { + pr_err("create_file(%s/%s) failed: %d\n", + unit, fname, ret); + goto bail; + } + if (!(dd->flags & QIB_HAS_QSFP)) + continue; + sprintf(fname, "qsfp%d", i); + ret = 
create_file(fname, S_IFREG|S_IRUGO, dir, &tmp, + &qsfp_ops[i - 1], dd); + if (ret) { + pr_err("create_file(%s/%s) failed: %d\n", + unit, fname, ret); + goto bail; + } + } + + ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp, + &flash_ops, dd); + if (ret) + pr_err("create_file(%s/flash) failed: %d\n", + unit, ret); +bail: + return ret; +} + +static int remove_file(struct dentry *parent, char *name) +{ + struct dentry *tmp; + int ret; + + tmp = lookup_one_len(name, parent, strlen(name)); + + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto bail; + } + + spin_lock(&tmp->d_lock); + if (simple_positive(tmp)) { + __d_drop(tmp); + spin_unlock(&tmp->d_lock); + simple_unlink(d_inode(parent), tmp); + } else { + spin_unlock(&tmp->d_lock); + } + dput(tmp); + + ret = 0; +bail: + /* + * We don't expect clients to care about the return value, but + * it's there if they need it. + */ + return ret; +} + +static int remove_device_files(struct super_block *sb, + struct qib_devdata *dd) +{ + struct dentry *dir, *root; + char unit[10]; + int ret, i; + + root = dget(sb->s_root); + inode_lock(d_inode(root)); + snprintf(unit, sizeof(unit), "%u", dd->unit); + dir = lookup_one_len(unit, root, strlen(unit)); + + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + pr_err("Lookup of %s failed\n", unit); + goto bail; + } + + inode_lock(d_inode(dir)); + remove_file(dir, "counters"); + remove_file(dir, "counter_names"); + remove_file(dir, "portcounter_names"); + for (i = 0; i < dd->num_pports; i++) { + char fname[24]; + + sprintf(fname, "port%dcounters", i + 1); + remove_file(dir, fname); + if (dd->flags & QIB_HAS_QSFP) { + sprintf(fname, "qsfp%d", i + 1); + remove_file(dir, fname); + } + } + remove_file(dir, "flash"); + inode_unlock(d_inode(dir)); + ret = simple_rmdir(d_inode(root), dir); + d_delete(dir); + dput(dir); + +bail: + inode_unlock(d_inode(root)); + dput(root); + return ret; +} + +/* + * This fills everything in when the fs is mounted, to handle umount/mount + * after device init. The direct add_cntr_files() call handles adding + * them from the init code, when the fs is already mounted. 
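For context, the filesystem these helpers populate is consumed from user space roughly as follows; a hypothetical sketch, assuming the administrator mounts the "ipathfs" type on an existing /ipathfs directory (the path is arbitrary and root privileges are required):

/* illustrative only, not part of this patch */
#include <errno.h>
#include <stdio.h>
#include <dirent.h>
#include <sys/mount.h>

int main(void)
{
	DIR *d;
	struct dirent *e;

	/* equivalent to: mount -t ipathfs none /ipathfs */
	if (mount("none", "/ipathfs", "ipathfs", 0, NULL) && errno != EBUSY) {
		perror("mount ipathfs");
		return 1;
	}

	d = opendir("/ipathfs");
	if (!d) {
		perror("/ipathfs");
		return 1;
	}
	/* each unit shows up as a numbered directory ("0", "1", ...) holding
	 * the counters, counter_names, qsfp and flash files created above */
	while ((e = readdir(d)) != NULL)
		printf("%s\n", e->d_name);
	closedir(d);
	return 0;
}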
+ */ +static int qibfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct qib_devdata *dd, *tmp; + unsigned long flags; + int ret; + + static const struct tree_descr files[] = { + [2] = {"driver_stats", &driver_ops[0], S_IRUGO}, + [3] = {"driver_stats_names", &driver_ops[1], S_IRUGO}, + {""}, + }; + + ret = simple_fill_super(sb, QIBFS_MAGIC, files); + if (ret) { + pr_err("simple_fill_super failed: %d\n", ret); + goto bail; + } + + spin_lock_irqsave(&qib_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &qib_dev_list, list) { + spin_unlock_irqrestore(&qib_devs_lock, flags); + ret = add_cntr_files(sb, dd); + if (ret) + goto bail; + spin_lock_irqsave(&qib_devs_lock, flags); + } + + spin_unlock_irqrestore(&qib_devs_lock, flags); + +bail: + return ret; +} + +static struct dentry *qibfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + struct dentry *ret; + + ret = mount_single(fs_type, flags, data, qibfs_fill_super); + if (!IS_ERR(ret)) + qib_super = ret->d_sb; + return ret; +} + +static void qibfs_kill_super(struct super_block *s) +{ + kill_litter_super(s); + qib_super = NULL; +} + +int qibfs_add(struct qib_devdata *dd) +{ + int ret; + + /* + * On first unit initialized, qib_super will not yet exist + * because nobody has yet tried to mount the filesystem, so + * we can't consider that to be an error; if an error occurs + * during the mount, that will get a complaint, so this is OK. + * add_cntr_files() for all units is done at mount from + * qibfs_fill_super(), so one way or another, everything works. + */ + if (qib_super == NULL) + ret = 0; + else + ret = add_cntr_files(qib_super, dd); + return ret; +} + +int qibfs_remove(struct qib_devdata *dd) +{ + int ret = 0; + + if (qib_super) + ret = remove_device_files(qib_super, dd); + + return ret; +} + +static struct file_system_type qibfs_fs_type = { + .owner = THIS_MODULE, + .name = "ipathfs", + .mount = qibfs_mount, + .kill_sb = qibfs_kill_super, +}; +MODULE_ALIAS_FS("ipathfs"); + +int __init qib_init_qibfs(void) +{ + return register_filesystem(&qibfs_fs_type); +} + +int __exit qib_exit_qibfs(void) +{ + return unregister_filesystem(&qibfs_fs_type); +} diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c new file mode 100644 index 0000000..8a15e5c --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -0,0 +1,3541 @@ +/* + * Copyright (c) 2013 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* + * This file contains all of the code that is specific to the + * QLogic_IB 6120 PCIe chip. + */ + +#include +#include +#include +#include + +#include "qib.h" +#include "qib_6120_regs.h" + +static void qib_6120_setup_setextled(struct qib_pportdata *, u32); +static void sendctrl_6120_mod(struct qib_pportdata *ppd, u32 op); +static u8 qib_6120_phys_portstate(u64); +static u32 qib_6120_iblink_state(u64); + +/* + * This file contains all the chip-specific register information and + * access functions for the Intel Intel_IB PCI-Express chip. + * + */ + +/* KREG_IDX uses machine-generated #defines */ +#define KREG_IDX(regname) (QIB_6120_##regname##_OFFS / sizeof(u64)) + +/* Use defines to tie machine-generated names to lower-case names */ +#define kr_extctrl KREG_IDX(EXTCtrl) +#define kr_extstatus KREG_IDX(EXTStatus) +#define kr_gpio_clear KREG_IDX(GPIOClear) +#define kr_gpio_mask KREG_IDX(GPIOMask) +#define kr_gpio_out KREG_IDX(GPIOOut) +#define kr_gpio_status KREG_IDX(GPIOStatus) +#define kr_rcvctrl KREG_IDX(RcvCtrl) +#define kr_sendctrl KREG_IDX(SendCtrl) +#define kr_partitionkey KREG_IDX(RcvPartitionKey) +#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl) +#define kr_ibcstatus KREG_IDX(IBCStatus) +#define kr_ibcctrl KREG_IDX(IBCCtrl) +#define kr_sendbuffererror KREG_IDX(SendBufErr0) +#define kr_rcvbthqp KREG_IDX(RcvBTHQP) +#define kr_counterregbase KREG_IDX(CntrRegBase) +#define kr_palign KREG_IDX(PageAlign) +#define kr_rcvegrbase KREG_IDX(RcvEgrBase) +#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt) +#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt) +#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize) +#define kr_rcvhdrsize KREG_IDX(RcvHdrSize) +#define kr_rcvtidbase KREG_IDX(RcvTIDBase) +#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt) +#define kr_scratch KREG_IDX(Scratch) +#define kr_sendctrl KREG_IDX(SendCtrl) +#define kr_sendpioavailaddr KREG_IDX(SendPIOAvailAddr) +#define kr_sendpiobufbase KREG_IDX(SendPIOBufBase) +#define kr_sendpiobufcnt KREG_IDX(SendPIOBufCnt) +#define kr_sendpiosize KREG_IDX(SendPIOSize) +#define kr_sendregbase KREG_IDX(SendRegBase) +#define kr_userregbase KREG_IDX(UserRegBase) +#define kr_control KREG_IDX(Control) +#define kr_intclear KREG_IDX(IntClear) +#define kr_intmask KREG_IDX(IntMask) +#define kr_intstatus KREG_IDX(IntStatus) +#define kr_errclear KREG_IDX(ErrClear) +#define kr_errmask KREG_IDX(ErrMask) +#define kr_errstatus KREG_IDX(ErrStatus) +#define kr_hwerrclear KREG_IDX(HwErrClear) +#define kr_hwerrmask KREG_IDX(HwErrMask) +#define kr_hwerrstatus KREG_IDX(HwErrStatus) +#define kr_revision KREG_IDX(Revision) +#define kr_portcnt KREG_IDX(PortCnt) +#define kr_serdes_cfg0 KREG_IDX(SerdesCfg0) +#define kr_serdes_cfg1 (kr_serdes_cfg0 + 1) +#define kr_serdes_stat KREG_IDX(SerdesStat) +#define kr_xgxs_cfg KREG_IDX(XGXSCfg) + +/* These must only be written via qib_write_kreg_ctxt() */ +#define kr_rcvhdraddr KREG_IDX(RcvHdrAddr0) +#define kr_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0) + +#define CREG_IDX(regname) ((QIB_6120_##regname##_OFFS - \ + QIB_6120_LBIntCnt_OFFS) / sizeof(u64)) + +#define 
cr_badformat CREG_IDX(RxBadFormatCnt) +#define cr_erricrc CREG_IDX(RxICRCErrCnt) +#define cr_errlink CREG_IDX(RxLinkProblemCnt) +#define cr_errlpcrc CREG_IDX(RxLPCRCErrCnt) +#define cr_errpkey CREG_IDX(RxPKeyMismatchCnt) +#define cr_rcvflowctrl_err CREG_IDX(RxFlowCtrlErrCnt) +#define cr_err_rlen CREG_IDX(RxLenErrCnt) +#define cr_errslen CREG_IDX(TxLenErrCnt) +#define cr_errtidfull CREG_IDX(RxTIDFullErrCnt) +#define cr_errtidvalid CREG_IDX(RxTIDValidErrCnt) +#define cr_errvcrc CREG_IDX(RxVCRCErrCnt) +#define cr_ibstatuschange CREG_IDX(IBStatusChangeCnt) +#define cr_lbint CREG_IDX(LBIntCnt) +#define cr_invalidrlen CREG_IDX(RxMaxMinLenErrCnt) +#define cr_invalidslen CREG_IDX(TxMaxMinLenErrCnt) +#define cr_lbflowstall CREG_IDX(LBFlowStallCnt) +#define cr_pktrcv CREG_IDX(RxDataPktCnt) +#define cr_pktrcvflowctrl CREG_IDX(RxFlowPktCnt) +#define cr_pktsend CREG_IDX(TxDataPktCnt) +#define cr_pktsendflow CREG_IDX(TxFlowPktCnt) +#define cr_portovfl CREG_IDX(RxP0HdrEgrOvflCnt) +#define cr_rcvebp CREG_IDX(RxEBPCnt) +#define cr_rcvovfl CREG_IDX(RxBufOvflCnt) +#define cr_senddropped CREG_IDX(TxDroppedPktCnt) +#define cr_sendstall CREG_IDX(TxFlowStallCnt) +#define cr_sendunderrun CREG_IDX(TxUnderrunCnt) +#define cr_wordrcv CREG_IDX(RxDwordCnt) +#define cr_wordsend CREG_IDX(TxDwordCnt) +#define cr_txunsupvl CREG_IDX(TxUnsupVLErrCnt) +#define cr_rxdroppkt CREG_IDX(RxDroppedPktCnt) +#define cr_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt) +#define cr_iblinkdown CREG_IDX(IBLinkDownedCnt) +#define cr_ibsymbolerr CREG_IDX(IBSymbolErrCnt) + +#define SYM_RMASK(regname, fldname) ((u64) \ + QIB_6120_##regname##_##fldname##_RMASK) +#define SYM_MASK(regname, fldname) ((u64) \ + QIB_6120_##regname##_##fldname##_RMASK << \ + QIB_6120_##regname##_##fldname##_LSB) +#define SYM_LSB(regname, fldname) (QIB_6120_##regname##_##fldname##_LSB) + +#define SYM_FIELD(value, regname, fldname) ((u64) \ + (((value) >> SYM_LSB(regname, fldname)) & \ + SYM_RMASK(regname, fldname))) +#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask) +#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask) + +/* link training states, from IBC */ +#define IB_6120_LT_STATE_DISABLED 0x00 +#define IB_6120_LT_STATE_LINKUP 0x01 +#define IB_6120_LT_STATE_POLLACTIVE 0x02 +#define IB_6120_LT_STATE_POLLQUIET 0x03 +#define IB_6120_LT_STATE_SLEEPDELAY 0x04 +#define IB_6120_LT_STATE_SLEEPQUIET 0x05 +#define IB_6120_LT_STATE_CFGDEBOUNCE 0x08 +#define IB_6120_LT_STATE_CFGRCVFCFG 0x09 +#define IB_6120_LT_STATE_CFGWAITRMT 0x0a +#define IB_6120_LT_STATE_CFGIDLE 0x0b +#define IB_6120_LT_STATE_RECOVERRETRAIN 0x0c +#define IB_6120_LT_STATE_RECOVERWAITRMT 0x0e +#define IB_6120_LT_STATE_RECOVERIDLE 0x0f + +/* link state machine states from IBC */ +#define IB_6120_L_STATE_DOWN 0x0 +#define IB_6120_L_STATE_INIT 0x1 +#define IB_6120_L_STATE_ARM 0x2 +#define IB_6120_L_STATE_ACTIVE 0x3 +#define IB_6120_L_STATE_ACT_DEFER 0x4 + +static const u8 qib_6120_physportstate[0x20] = { + [IB_6120_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED, + [IB_6120_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP, + [IB_6120_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL, + [IB_6120_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL, + [IB_6120_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP, + [IB_6120_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP, + [IB_6120_LT_STATE_CFGDEBOUNCE] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_6120_LT_STATE_CFGRCVFCFG] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_6120_LT_STATE_CFGWAITRMT] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_6120_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN, + 
[IB_6120_LT_STATE_RECOVERRETRAIN] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_6120_LT_STATE_RECOVERWAITRMT] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_6120_LT_STATE_RECOVERIDLE] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN +}; + + +struct qib_chip_specific { + u64 __iomem *cregbase; + u64 *cntrs; + u64 *portcntrs; + void *dummy_hdrq; /* used after ctxt close */ + dma_addr_t dummy_hdrq_phys; + spinlock_t kernel_tid_lock; /* no back to back kernel TID writes */ + spinlock_t user_tid_lock; /* no back to back user TID writes */ + spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */ + spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */ + u64 hwerrmask; + u64 errormask; + u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */ + u64 gpio_mask; /* shadow the gpio mask register */ + u64 extctrl; /* shadow the gpio output enable, etc... */ + /* + * these 5 fields are used to establish deltas for IB symbol + * errors and linkrecovery errors. They can be reported on + * some chips during link negotiation prior to INIT, and with + * DDR when faking DDR negotiations with non-IBTA switches. + * The chip counters are adjusted at driver unload if there is + * a non-zero delta. + */ + u64 ibdeltainprog; + u64 ibsymdelta; + u64 ibsymsnap; + u64 iblnkerrdelta; + u64 iblnkerrsnap; + u64 ibcctrl; /* shadow for kr_ibcctrl */ + u32 lastlinkrecov; /* link recovery issue */ + u32 cntrnamelen; + u32 portcntrnamelen; + u32 ncntrs; + u32 nportcntrs; + /* used with gpio interrupts to implement IB counters */ + u32 rxfc_unsupvl_errs; + u32 overrun_thresh_errs; + /* + * these count only cases where _successive_ LocalLinkIntegrity + * errors were seen in the receive headers of IB standard packets + */ + u32 lli_errs; + u32 lli_counter; + u64 lli_thresh; + u64 sword; /* total dwords sent (sample result) */ + u64 rword; /* total dwords received (sample result) */ + u64 spkts; /* total packets sent (sample result) */ + u64 rpkts; /* total packets received (sample result) */ + u64 xmit_wait; /* # of ticks no data sent (sample result) */ + struct timer_list pma_timer; + struct qib_pportdata *ppd; + char emsgbuf[128]; + char bitsmsgbuf[64]; + u8 pma_sample_status; +}; + +/* ibcctrl bits */ +#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1 +/* cycle through TS1/TS2 till OK */ +#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2 +/* wait for TS1, then go on */ +#define QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3 +#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16 + +#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */ +#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ +#define QLOGIC_IB_IBCC_LINKCMD_SHIFT 18 + +/* + * We could have a single register get/put routine, that takes a group type, + * but this is somewhat clearer and cleaner. It also gives us some error + * checking. 64 bit register reads should always work, but are inefficient + * on opteron (the northbridge always generates 2 separate HT 32 bit reads), + * so we use kreg32 wherever possible. User register and counter register + * reads are always 32 bit reads, so only one form of those routines. 
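A small host-side sketch of the index arithmetic behind KREG_IDX()/CREG_IDX(): the machine-generated headers provide byte offsets, and dividing by sizeof(u64) turns them into indices into the mapped register array, read at 32 or 64 bits as appropriate. The register names and offsets below are invented for the demonstration:

/* illustrative only, not part of this patch */
#include <stdint.h>
#include <stdio.h>

/* pretend these came from a machine-generated register header */
#define DEMO_Scratch_OFFS	0x08
#define DEMO_IntStatus_OFFS	0x30

#define DEMO_KREG_IDX(name)	(DEMO_##name##_OFFS / sizeof(uint64_t))

int main(void)
{
	uint64_t regs[16] = { 0 };	/* stands in for dd->kregbase */

	regs[DEMO_KREG_IDX(Scratch)] = 0x1234;
	regs[DEMO_KREG_IDX(IntStatus)] = 0xabcdULL << 32;

	/* full 64-bit read, and the cheaper 32-bit read of the low word */
	printf("scratch   = 0x%llx\n",
	       (unsigned long long)regs[DEMO_KREG_IDX(Scratch)]);
	printf("intstatus = 0x%x (low 32 bits only)\n",
	       (uint32_t)regs[DEMO_KREG_IDX(IntStatus)]);
	return 0;
}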
+ */ + +/** + * qib_read_ureg32 - read 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @ctxt: context number + * + * Return the contents of a register that is virtualized to be per context. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u32 qib_read_ureg32(const struct qib_devdata *dd, + enum qib_ureg regno, int ctxt) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + + if (dd->userbase) + return readl(regno + (u64 __iomem *) + ((char __iomem *)dd->userbase + + dd->ureg_align * ctxt)); + else + return readl(regno + (u64 __iomem *) + (dd->uregbase + + (char __iomem *)dd->kregbase + + dd->ureg_align * ctxt)); +} + +/** + * qib_write_ureg - write 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @value: value + * @ctxt: context + * + * Write the contents of a register that is virtualized to be per context. + */ +static inline void qib_write_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, u64 value, int ctxt) +{ + u64 __iomem *ubase; + + if (dd->userbase) + ubase = (u64 __iomem *) + ((char __iomem *) dd->userbase + + dd->ureg_align * ctxt); + else + ubase = (u64 __iomem *) + (dd->uregbase + + (char __iomem *) dd->kregbase + + dd->ureg_align * ctxt); + + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &ubase[regno]); +} + +static inline u32 qib_read_kreg32(const struct qib_devdata *dd, + const u16 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readl((u32 __iomem *)&dd->kregbase[regno]); +} + +static inline u64 qib_read_kreg64(const struct qib_devdata *dd, + const u16 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + + return readq(&dd->kregbase[regno]); +} + +static inline void qib_write_kreg(const struct qib_devdata *dd, + const u16 regno, u64 value) +{ + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->kregbase[regno]); +} + +/** + * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register + * @dd: the qlogic_ib device + * @regno: the register number to write + * @ctxt: the context containing the register + * @value: the value to write + */ +static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd, + const u16 regno, unsigned ctxt, + u64 value) +{ + qib_write_kreg(dd, regno + ctxt, value); +} + +static inline void write_6120_creg(const struct qib_devdata *dd, + u16 regno, u64 value) +{ + if (dd->cspec->cregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->cspec->cregbase[regno]); +} + +static inline u64 read_6120_creg(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(&dd->cspec->cregbase[regno]); +} + +static inline u32 read_6120_creg32(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(&dd->cspec->cregbase[regno]); +} + +/* kr_control bits */ +#define QLOGIC_IB_C_RESET 1U + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define QLOGIC_IB_I_RCVURG_MASK ((1U << 5) - 1) +#define QLOGIC_IB_I_RCVURG_SHIFT 0 +#define QLOGIC_IB_I_RCVAVAIL_MASK ((1U << 5) - 1) +#define QLOGIC_IB_I_RCVAVAIL_SHIFT 12 + +#define QLOGIC_IB_C_FREEZEMODE 0x00000002 +#define QLOGIC_IB_C_LINKENABLE 0x00000004 +#define QLOGIC_IB_I_ERROR 0x0000000080000000ULL +#define QLOGIC_IB_I_SPIOSENT 0x0000000040000000ULL +#define 
QLOGIC_IB_I_SPIOBUFAVAIL 0x0000000020000000ULL +#define QLOGIC_IB_I_GPIO 0x0000000010000000ULL +#define QLOGIC_IB_I_BITSEXTANT \ + ((QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT) | \ + (QLOGIC_IB_I_RCVAVAIL_MASK << \ + QLOGIC_IB_I_RCVAVAIL_SHIFT) | \ + QLOGIC_IB_I_ERROR | QLOGIC_IB_I_SPIOSENT | \ + QLOGIC_IB_I_SPIOBUFAVAIL | QLOGIC_IB_I_GPIO) + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK 0x000000000000003fULL +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT 0 +#define QLOGIC_IB_HWE_PCIEPOISONEDTLP 0x0000000010000000ULL +#define QLOGIC_IB_HWE_PCIECPLTIMEOUT 0x0000000020000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXTLH 0x0000000040000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXADM 0x0000000080000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYRADM 0x0000000100000000ULL +#define QLOGIC_IB_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define QLOGIC_IB_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define QLOGIC_IB_HWE_PCIE1PLLFAILED 0x0400000000000000ULL +#define QLOGIC_IB_HWE_PCIE0PLLFAILED 0x0800000000000000ULL +#define QLOGIC_IB_HWE_SERDESPLLFAILED 0x1000000000000000ULL + + +/* kr_extstatus bits */ +#define QLOGIC_IB_EXTS_FREQSEL 0x2 +#define QLOGIC_IB_EXTS_SERDESSEL 0x4 +#define QLOGIC_IB_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define QLOGIC_IB_EXTS_MEMBIST_FOUND 0x0000000000008000 + +/* kr_xgxsconfig bits */ +#define QLOGIC_IB_XGXS_RESET 0x5ULL + +#define _QIB_GPIO_SDA_NUM 1 +#define _QIB_GPIO_SCL_NUM 0 + +/* Bits in GPIO for the added IB link interrupts */ +#define GPIO_RXUVL_BIT 3 +#define GPIO_OVRUN_BIT 4 +#define GPIO_LLI_BIT 5 +#define GPIO_ERRINTR_MASK 0x38 + + +#define QLOGIC_IB_RT_BUFSIZE_MASK 0xe0000000ULL +#define QLOGIC_IB_RT_BUFSIZE_SHIFTVAL(tid) \ + ((((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) >> 29) + 11 - 1) +#define QLOGIC_IB_RT_BUFSIZE(tid) (1 << QLOGIC_IB_RT_BUFSIZE_SHIFTVAL(tid)) +#define QLOGIC_IB_RT_IS_VALID(tid) \ + (((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) && \ + ((((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) != QLOGIC_IB_RT_BUFSIZE_MASK))) +#define QLOGIC_IB_RT_ADDR_MASK 0x1FFFFFFFULL /* 29 bits valid */ +#define QLOGIC_IB_RT_ADDR_SHIFT 10 + +#define QLOGIC_IB_R_INTRAVAIL_SHIFT 16 +#define QLOGIC_IB_R_TAILUPD_SHIFT 31 +#define IBA6120_R_PKEY_DIS_SHIFT 30 + +#define PBC_6120_VL15_SEND_CTRL (1ULL << 31) /* pbc; VL15; link_buf only */ + +#define IBCBUSFRSPCPARITYERR HWE_MASK(IBCBusFromSPCParityErr) +#define IBCBUSTOSPCPARITYERR HWE_MASK(IBCBusToSPCParityErr) + +#define SYM_MASK_BIT(regname, fldname, bit) ((u64) \ + ((1ULL << (SYM_LSB(regname, fldname) + (bit))))) + +#define TXEMEMPARITYERR_PIOBUF \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0) +#define TXEMEMPARITYERR_PIOPBC \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 1) +#define TXEMEMPARITYERR_PIOLAUNCHFIFO \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 2) + +#define RXEMEMPARITYERR_RCVBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 0) +#define RXEMEMPARITYERR_LOOKUPQ \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 1) +#define RXEMEMPARITYERR_EXPTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 2) +#define RXEMEMPARITYERR_EAGERTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 3) +#define RXEMEMPARITYERR_FLAGBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 4) +#define RXEMEMPARITYERR_DATAINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 5) +#define RXEMEMPARITYERR_HDRINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 6) + +/* 6120 specific hardware errors... 
*/ +static const struct qib_hwerror_msgs qib_6120_hwerror_msgs[] = { + /* generic hardware errors */ + QLOGIC_IB_HWE_MSG(IBCBUSFRSPCPARITYERR, "QIB2IB Parity"), + QLOGIC_IB_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2QIB Parity"), + + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOBUF, + "TXE PIOBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOPBC, + "TXE PIOPBC Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOLAUNCHFIFO, + "TXE PIOLAUNCHFIFO Memory Parity"), + + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_RCVBUF, + "RXE RCVBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_LOOKUPQ, + "RXE LOOKUPQ Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EAGERTID, + "RXE EAGERTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EXPTID, + "RXE EXPTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_FLAGBUF, + "RXE FLAGBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_DATAINFO, + "RXE DATAINFO Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_HDRINFO, + "RXE HDRINFO Memory Parity"), + + /* chip-specific hardware errors */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEPOISONEDTLP, + "PCIe Poisoned TLP"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT, + "PCIe completion timeout"), + /* + * In practice, it's unlikely wthat we'll see PCIe PLL, or bus + * parity or memory parity error failures, because most likely we + * won't be able to talk to the core of the chip. Nonetheless, we + * might see them, if they are in parts of the PCIe core that aren't + * essential. + */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE1PLLFAILED, + "PCIePLL1"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE0PLLFAILED, + "PCIePLL0"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXTLH, + "PCIe XTLH core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXADM, + "PCIe ADM TX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYRADM, + "PCIe ADM RX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SERDESPLLFAILED, + "SerDes PLL"), +}; + +#define TXE_PIO_PARITY (TXEMEMPARITYERR_PIOBUF | TXEMEMPARITYERR_PIOPBC) +#define _QIB_PLL_FAIL (QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP) + + /* variables for sanity checking interrupt and errors */ +#define IB_HWE_BITSEXTANT \ + (HWE_MASK(RXEMemParityErr) | \ + HWE_MASK(TXEMemParityErr) | \ + (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << \ + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) | \ + QLOGIC_IB_HWE_PCIE1PLLFAILED | \ + QLOGIC_IB_HWE_PCIE0PLLFAILED | \ + QLOGIC_IB_HWE_PCIEPOISONEDTLP | \ + QLOGIC_IB_HWE_PCIECPLTIMEOUT | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXTLH | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXADM | \ + QLOGIC_IB_HWE_PCIEBUSPARITYRADM | \ + HWE_MASK(PowerOnBISTFailed) | \ + QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP | \ + QLOGIC_IB_HWE_SERDESPLLFAILED | \ + HWE_MASK(IBCBusToSPCParityErr) | \ + HWE_MASK(IBCBusFromSPCParityErr)) + +#define IB_E_BITSEXTANT \ + (ERR_MASK(RcvFormatErr) | ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvLongPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvUnexpectedCharErr) | \ + ERR_MASK(RcvUnsupportedVLErr) | ERR_MASK(RcvEBPErr) | \ + ERR_MASK(RcvIBFlowErr) | ERR_MASK(RcvBadVersionErr) | \ + ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | \ + ERR_MASK(RcvBadTidErr) | ERR_MASK(RcvHdrLenErr) | \ + ERR_MASK(RcvHdrErr) | ERR_MASK(RcvIBLostLinkErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendMaxPktLenErr) | \ + ERR_MASK(SendUnderRunErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendDroppedDataPktErr) | 
\ + ERR_MASK(SendPioArmLaunchErr) | \ + ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendUnsupportedVLErr) | ERR_MASK(IBStatusChanged) | \ + ERR_MASK(InvalidAddrErr) | ERR_MASK(ResetNegated) | \ + ERR_MASK(HardwareErr)) + +#define QLOGIC_IB_E_PKTERRS ( \ + ERR_MASK(SendPktLenErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | \ + ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvEBPErr)) + +/* These are all rcv-related errors which we want to count for stats */ +#define E_SUM_PKTERRS \ + (ERR_MASK(RcvHdrLenErr) | ERR_MASK(RcvBadTidErr) | \ + ERR_MASK(RcvBadVersionErr) | ERR_MASK(RcvHdrErr) | \ + ERR_MASK(RcvLongPktLenErr) | ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvFormatErr) | ERR_MASK(RcvUnsupportedVLErr) | \ + ERR_MASK(RcvUnexpectedCharErr) | ERR_MASK(RcvEBPErr)) + +/* These are all send-related errors which we want to count for stats */ +#define E_SUM_ERRS \ + (ERR_MASK(SendPioArmLaunchErr) | \ + ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnsupportedVLErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(InvalidAddrErr)) + +/* + * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore + * errors not related to freeze and cancelling buffers. Can't ignore + * armlaunch because could get more while still cleaning up, and need + * to cancel those as they happen. + */ +#define E_SPKT_ERRS_IGNORE \ + (ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendMinPktLenErr) | \ + ERR_MASK(SendPktLenErr)) + +/* + * these are errors that can occur when the link changes state while + * a packet is being sent or received. This doesn't cover things + * like EBP or VCRC that can be the result of a sending having the + * link change state, so we receive a "known bad" packet. + */ +#define E_SUM_LINK_PKTERRS \ + (ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvUnexpectedCharErr)) + +static void qib_6120_put_tid_2(struct qib_devdata *, u64 __iomem *, + u32, unsigned long); + +/* + * On platforms using this chip, and not having ordered WC stores, we + * can get TXE parity errors due to speculative reads to the PIO buffers, + * and this, due to a chip issue can result in (many) false parity error + * reports. So it's a debug print on those, and an info print on systems + * where the speculative reads don't occur. + */ +static void qib_6120_txe_recover(struct qib_devdata *dd) +{ + if (!qib_unordered_wc()) + qib_devinfo(dd->pcidev, + "Recovering from TXE PIO parity error\n"); +} + +/* enable/disable chip from delivering interrupts */ +static void qib_6120_set_intr_state(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + if (dd->flags & QIB_BADINTR) + return; + qib_write_kreg(dd, kr_intmask, ~0ULL); + /* force re-interrupt of any pending interrupts. 
*/ + qib_write_kreg(dd, kr_intclear, 0ULL); + } else + qib_write_kreg(dd, kr_intmask, 0ULL); +} + +/* + * Try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + * This is in chip-specific code because of all of the register accesses, + * even though the details are similar on most chips + */ +static void qib_6120_clear_freeze(struct qib_devdata *dd) +{ + /* disable error interrupts, to avoid confusion */ + qib_write_kreg(dd, kr_errmask, 0ULL); + + /* also disable interrupts; errormask is sometimes overwritten */ + qib_6120_set_intr_state(dd, 0); + + qib_cancel_sends(dd->pport); + + /* clear the freeze, and be sure chip saw it */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + + /* force in-memory update now we are out of freeze */ + qib_force_pio_avail_update(dd); + + /* + * force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. + */ + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); + qib_6120_set_intr_state(dd, 1); +} + +/** + * qib_handle_6120_hwerrors - display hardware errors. + * @dd: the qlogic_ib device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. Reuse the same message buffer as + * handle_6120_errors() to avoid excessive stack usage. + */ +static void qib_handle_6120_hwerrors(struct qib_devdata *dd, char *msg, + size_t msgl) +{ + u64 hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char *bitsmsg; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + if (!hwerrs) + return; + if (hwerrs == ~0ULL) { + qib_dev_err(dd, + "Read of hardware error status failed (all bits set); ignoring\n"); + return; + } + qib_stats.sps_hwerrs++; + + /* Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. + * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support */ + qib_write_kreg(dd, kr_hwerrclear, + hwerrs & ~HWE_MASK(PowerOnBISTFailed)); + + hwerrs &= dd->cspec->hwerrmask; + + /* + * Make sure we get this much out, unless told to be quiet, + * or it's occurred within the last 5 seconds. 
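The comment above about suppressing repeats within the last 5 seconds describes a simple time-based rate limit; the driver mostly gets the effect by masking bits it has already reported, but the generic pattern, sketched with an ordinary clock and made-up names, looks like this:

/* illustrative only, not part of this patch */
#include <stdio.h>
#include <time.h>

static int ratelimited(time_t *last, int interval)
{
	time_t now = time(NULL);

	if (*last && now - *last < interval)
		return 1;	/* suppress this report */
	*last = now;
	return 0;		/* OK to print */
}

int main(void)
{
	static time_t last_hwerr;
	int i;

	for (i = 0; i < 3; i++)
		if (!ratelimited(&last_hwerr, 5))
			printf("hardware error report %d\n", i);
	return 0;	/* only the first report is printed */
}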
+ */ + if (hwerrs & ~(TXE_PIO_PARITY | RXEMEMPARITYERR_EAGERTID)) + qib_devinfo(dd->pcidev, + "Hardware error: hwerr=0x%llx (cleared)\n", + (unsigned long long) hwerrs); + + if (hwerrs & ~IB_HWE_BITSEXTANT) + qib_dev_err(dd, + "hwerror interrupt with unknown errors %llx set\n", + (unsigned long long)(hwerrs & ~IB_HWE_BITSEXTANT)); + + ctrl = qib_read_kreg32(dd, kr_control); + if ((ctrl & QLOGIC_IB_C_FREEZEMODE) && !dd->diag_client) { + /* + * Parity errors in send memory are recoverable, + * just cancel the send (if indicated in * sendbuffererror), + * count the occurrence, unfreeze (if no other handled + * hardware error bits are set), and continue. They can + * occur if a processor speculative read is done to the PIO + * buffer while we are sending a packet, for example. + */ + if (hwerrs & TXE_PIO_PARITY) { + qib_6120_txe_recover(dd); + hwerrs &= ~TXE_PIO_PARITY; + } + + if (!hwerrs) { + static u32 freeze_cnt; + + freeze_cnt++; + qib_6120_clear_freeze(dd); + } else + isfatal = 1; + } + + *msg = '\0'; + + if (hwerrs & HWE_MASK(PowerOnBISTFailed)) { + isfatal = 1; + strlcat(msg, + "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + qib_format_hwerrors(hwerrs, qib_6120_hwerror_msgs, + ARRAY_SIZE(qib_6120_hwerror_msgs), msg, msgl); + + bitsmsg = dd->cspec->bitsmsgbuf; + if (hwerrs & (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) & + QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf), + "[PCIe Mem Parity Errs %x] ", bits); + strlcat(msg, bitsmsg, msgl); + } + + if (hwerrs & _QIB_PLL_FAIL) { + isfatal = 1; + snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf), + "[PLL failed (%llx), InfiniPath hardware unusable]", + (unsigned long long) hwerrs & _QIB_PLL_FAIL); + strlcat(msg, bitsmsg, msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~(hwerrs & _QIB_PLL_FAIL); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + if (hwerrs & QLOGIC_IB_HWE_SERDESPLLFAILED) { + /* + * If it occurs, it is left masked since the external + * interface is unused + */ + dd->cspec->hwerrmask &= ~QLOGIC_IB_HWE_SERDESPLLFAILED; + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + if (hwerrs) + /* + * if any set that we aren't ignoring; only + * make the complaint once, in case it's stuck + * or recurring, and we get here multiple + * times. + */ + qib_dev_err(dd, "%s hardware error\n", msg); + else + *msg = 0; /* recovered from all of them */ + + if (isfatal && !dd->diag_client) { + qib_dev_err(dd, + "Fatal Hardware Error, no longer usable, SN %.16s\n", + dd->serial); + /* + * for /sys status file and user programs to print; if no + * trailing brace is copied, we'll know it was truncated. + */ + if (dd->freezemsg) + snprintf(dd->freezemsg, dd->freezelen, + "{%s}", msg); + qib_disable_after_error(dd); + } +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. 
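The decode routine that follows spells the mapping out as an if/strlcat chain; an equivalent table-driven formulation, in the same style the hardware-error path already uses through qib_format_hwerrors(), is sketched below with made-up bit positions:

/* illustrative only, not part of this patch */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct err_name {
	uint64_t mask;
	const char *name;
};

static const struct err_name demo_errs[] = {
	{ 1ULL << 0, "rhdrlen " },
	{ 1ULL << 1, "rbadtid " },
	{ 1ULL << 2, "rbadversion " },
};

static void decode(uint64_t err, char *buf, size_t blen)
{
	size_t i;

	buf[0] = '\0';
	for (i = 0; i < sizeof(demo_errs) / sizeof(demo_errs[0]); i++)
		if (err & demo_errs[i].mask)
			strncat(buf, demo_errs[i].name,
				blen - strlen(buf) - 1);
}

int main(void)
{
	char buf[64];

	decode((1ULL << 0) | (1ULL << 2), buf, sizeof(buf));
	printf("%s\n", buf);	/* prints "rhdrlen rbadversion " */
	return 0;
}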
+ */ +static int qib_decode_6120_err(struct qib_devdata *dd, char *buf, size_t blen, + u64 err) +{ + int iserr = 1; + + *buf = '\0'; + if (err & QLOGIC_IB_E_PKTERRS) { + if (!(err & ~QLOGIC_IB_E_PKTERRS)) + iserr = 0; + if ((err & ERR_MASK(RcvICRCErr)) && + !(err&(ERR_MASK(RcvVCRCErr)|ERR_MASK(RcvEBPErr)))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } + if (err & ERR_MASK(RcvHdrLenErr)) + strlcat(buf, "rhdrlen ", blen); + if (err & ERR_MASK(RcvBadTidErr)) + strlcat(buf, "rbadtid ", blen); + if (err & ERR_MASK(RcvBadVersionErr)) + strlcat(buf, "rbadversion ", blen); + if (err & ERR_MASK(RcvHdrErr)) + strlcat(buf, "rhdr ", blen); + if (err & ERR_MASK(RcvLongPktLenErr)) + strlcat(buf, "rlongpktlen ", blen); + if (err & ERR_MASK(RcvMaxPktLenErr)) + strlcat(buf, "rmaxpktlen ", blen); + if (err & ERR_MASK(RcvMinPktLenErr)) + strlcat(buf, "rminpktlen ", blen); + if (err & ERR_MASK(SendMinPktLenErr)) + strlcat(buf, "sminpktlen ", blen); + if (err & ERR_MASK(RcvFormatErr)) + strlcat(buf, "rformaterr ", blen); + if (err & ERR_MASK(RcvUnsupportedVLErr)) + strlcat(buf, "runsupvl ", blen); + if (err & ERR_MASK(RcvUnexpectedCharErr)) + strlcat(buf, "runexpchar ", blen); + if (err & ERR_MASK(RcvIBFlowErr)) + strlcat(buf, "ribflow ", blen); + if (err & ERR_MASK(SendUnderRunErr)) + strlcat(buf, "sunderrun ", blen); + if (err & ERR_MASK(SendPioArmLaunchErr)) + strlcat(buf, "spioarmlaunch ", blen); + if (err & ERR_MASK(SendUnexpectedPktNumErr)) + strlcat(buf, "sunexperrpktnum ", blen); + if (err & ERR_MASK(SendDroppedSmpPktErr)) + strlcat(buf, "sdroppedsmppkt ", blen); + if (err & ERR_MASK(SendMaxPktLenErr)) + strlcat(buf, "smaxpktlen ", blen); + if (err & ERR_MASK(SendUnsupportedVLErr)) + strlcat(buf, "sunsupVL ", blen); + if (err & ERR_MASK(InvalidAddrErr)) + strlcat(buf, "invalidaddr ", blen); + if (err & ERR_MASK(RcvEgrFullErr)) + strlcat(buf, "rcvegrfull ", blen); + if (err & ERR_MASK(RcvHdrFullErr)) + strlcat(buf, "rcvhdrfull ", blen); + if (err & ERR_MASK(IBStatusChanged)) + strlcat(buf, "ibcstatuschg ", blen); + if (err & ERR_MASK(RcvIBLostLinkErr)) + strlcat(buf, "riblostlink ", blen); + if (err & ERR_MASK(HardwareErr)) + strlcat(buf, "hardware ", blen); + if (err & ERR_MASK(ResetNegated)) + strlcat(buf, "reset ", blen); +done: + return iserr; +} + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used. + */ +static void qib_disarm_6120_senderrbufs(struct qib_pportdata *ppd) +{ + unsigned long sbuf[2]; + struct qib_devdata *dd = ppd->dd; + + /* + * It's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling. 
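The two kr_sendbuffererror words read just below form a bitmap with one bit per PIO buffer; the real work is done by qib_disarm_piobufs_set(), but a stand-alone sketch of walking such a bitmap (values and buffer numbers invented) looks like this:

/* illustrative only, not part of this patch */
#include <stdio.h>

int main(void)
{
	unsigned long sbuf[2] = { 0x5UL, 0x1UL };	/* a few buffers flagged */
	unsigned int bpl = 8 * sizeof(unsigned long);
	unsigned int bit;

	for (bit = 0; bit < 2 * bpl; bit++)
		if (sbuf[bit / bpl] & (1UL << (bit % bpl)))
			printf("disarm PIO buffer %u\n", bit);
	return 0;
}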
+ */ + sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror); + sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1); + + if (sbuf[0] || sbuf[1]) + qib_disarm_piobufs_set(dd, sbuf, + dd->piobcnt2k + dd->piobcnt4k); +} + +static int chk_6120_linkrecovery(struct qib_devdata *dd, u64 ibcs) +{ + int ret = 1; + u32 ibstate = qib_6120_iblink_state(ibcs); + u32 linkrecov = read_6120_creg32(dd, cr_iblinkerrrecov); + + if (linkrecov != dd->cspec->lastlinkrecov) { + /* and no more until active again */ + dd->cspec->lastlinkrecov = 0; + qib_set_linkstate(dd->pport, QIB_IB_LINKDOWN); + ret = 0; + } + if (ibstate == IB_PORT_ACTIVE) + dd->cspec->lastlinkrecov = + read_6120_creg32(dd, cr_iblinkerrrecov); + return ret; +} + +static void handle_6120_errors(struct qib_devdata *dd, u64 errs) +{ + char *msg; + u64 ignore_this_time = 0; + u64 iserr = 0; + struct qib_pportdata *ppd = dd->pport; + u64 mask; + + /* don't report errors that are masked */ + errs &= dd->cspec->errormask; + msg = dd->cspec->emsgbuf; + + /* do these first, they are most important */ + if (errs & ERR_MASK(HardwareErr)) + qib_handle_6120_hwerrors(dd, msg, sizeof(dd->cspec->emsgbuf)); + + if (errs & ~IB_E_BITSEXTANT) + qib_dev_err(dd, + "error interrupt with unknown errors %llx set\n", + (unsigned long long) (errs & ~IB_E_BITSEXTANT)); + + if (errs & E_SUM_ERRS) { + qib_disarm_6120_senderrbufs(ppd); + if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when trying to bring the link + * up, but the IB link changes state at the "wrong" + * time. The IB logic then complains that the packet + * isn't valid. We don't want to confuse people, so + * we just don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + } else if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + qib_write_kreg(dd, kr_errclear, errs); + + errs &= ~ignore_this_time; + if (!errs) + goto done; + + /* + * The ones we mask off are handled specially below + * or above. + */ + mask = ERR_MASK(IBStatusChanged) | ERR_MASK(RcvEgrFullErr) | + ERR_MASK(RcvHdrFullErr) | ERR_MASK(HardwareErr); + qib_decode_6120_err(dd, msg, sizeof(dd->cspec->emsgbuf), errs & ~mask); + + if (errs & E_SUM_PKTERRS) + qib_stats.sps_rcverrs++; + if (errs & E_SUM_ERRS) + qib_stats.sps_txerrs++; + + iserr = errs & ~(E_SUM_PKTERRS | QLOGIC_IB_E_PKTERRS); + + if (errs & ERR_MASK(IBStatusChanged)) { + u64 ibcs = qib_read_kreg64(dd, kr_ibcstatus); + u32 ibstate = qib_6120_iblink_state(ibcs); + int handle = 1; + + if (ibstate != IB_PORT_INIT && dd->cspec->lastlinkrecov) + handle = chk_6120_linkrecovery(dd, ibcs); + /* + * Since going into a recovery state causes the link state + * to go down and since recovery is transitory, it is better + * if we "miss" ever seeing the link training state go into + * recovery (i.e., ignore this transition for link state + * special handling purposes) without updating lastibcstat. 
+ */ + if (handle && qib_6120_phys_portstate(ibcs) == + IB_PHYSPORTSTATE_LINK_ERR_RECOVER) + handle = 0; + if (handle) + qib_handle_e_ibstatuschanged(ppd, ibcs); + } + + if (errs & ERR_MASK(ResetNegated)) { + qib_dev_err(dd, + "Got reset, requires re-init (unload and reload driver)\n"); + dd->flags &= ~QIB_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->devstatusp |= QIB_STATUS_HWERROR; + *dd->pport->statusp &= ~QIB_STATUS_IB_CONF; + } + + if (*msg && iserr) + qib_dev_porterr(dd, ppd->port, "%s error\n", msg); + + if (ppd->state_wanted & ppd->lflags) + wake_up_interruptible(&ppd->state_wait); + + /* + * If there were hdrq or egrfull errors, wake up any processes + * waiting in poll. We used to try to check which contexts had + * the overflow, but given the cost of that and the chip reads + * to support it, it's better to just wake everybody up if we + * get an overflow; waiters can poll again if it's not them. + */ + if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) { + qib_handle_urcv(dd, ~0U); + if (errs & ERR_MASK(RcvEgrFullErr)) + qib_stats.sps_buffull++; + else + qib_stats.sps_hdrfull++; + } +done: + return; +} + +/** + * qib_6120_init_hwerrors - enable hardware errors + * @dd: the qlogic_ib device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void qib_6120_init_hwerrors(struct qib_devdata *dd) +{ + u64 val; + u64 extsval; + + extsval = qib_read_kreg64(dd, kr_extstatus); + + if (!(extsval & QLOGIC_IB_EXTS_MEMBIST_ENDTEST)) + qib_dev_err(dd, "MemBIST did not complete!\n"); + + /* init so all hwerrors interrupt, and enter freeze, ajdust below */ + val = ~0ULL; + if (dd->minrev < 2) { + /* + * Avoid problem with internal interface bus parity + * checking. Fixed in Rev2. + */ + val &= ~QLOGIC_IB_HWE_PCIEBUSPARITYRADM; + } + /* avoid some intel cpu's speculative read freeze mode issue */ + val &= ~TXEMEMPARITYERR_PIOBUF; + + dd->cspec->hwerrmask = val; + + qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + + /* clear all */ + qib_write_kreg(dd, kr_errclear, ~0ULL); + /* enable errors that are masked, at least this first time. */ + qib_write_kreg(dd, kr_errmask, ~0ULL); + dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask); + /* clear any interrupts up to this point (ints still not enabled) */ + qib_write_kreg(dd, kr_intclear, ~0ULL); + + qib_write_kreg(dd, kr_rcvbthqp, + dd->qpn_mask << (QIB_6120_RcvBTHQP_BTHQP_Mask_LSB - 1) | + QIB_KD_QP); +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing + * on chips that are count-based, rather than trigger-based. There is no + * reference counting, but that's also fine, given the intended use. 
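One detail worth noting in qib_6120_init_hwerrors() above: ~0 is written to kr_errmask and then read back, so dd->cspec->errormask presumably ends up holding only the bits the chip actually latches. A sketch of that probe-by-readback idea, with a plain variable standing in for the register and an invented implemented-bits constant:

/* illustrative only, not part of this patch */
#include <stdint.h>
#include <stdio.h>

#define DEMO_IMPLEMENTED_BITS	0x00ff00ffULL	/* made-up hardware property */

static uint64_t demo_reg;

static void demo_write(uint64_t v)
{
	demo_reg = v & DEMO_IMPLEMENTED_BITS;	/* unimplemented bits stay 0 */
}

int main(void)
{
	demo_write(~0ULL);			/* enable everything... */
	printf("effective mask: 0x%llx\n",	/* ...learn what stuck */
	       (unsigned long long)demo_reg);
	return 0;
}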
+ * Only chip-specific because it's all register accesses + */ +static void qib_set_6120_armlaunch(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + qib_write_kreg(dd, kr_errclear, + ERR_MASK(SendPioArmLaunchErr)); + dd->cspec->errormask |= ERR_MASK(SendPioArmLaunchErr); + } else + dd->cspec->errormask &= ~ERR_MASK(SendPioArmLaunchErr); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +/* + * Formerly took parameter in pre-shifted, + * pre-merged form with LinkCmd and LinkInitCmd + * together, and assuming the zero was NOP. + */ +static void qib_set_ib_6120_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd) +{ + u64 mod_wd; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + + mod_wd = (linkcmd << QLOGIC_IB_IBCC_LINKCMD_SHIFT) | + (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl | mod_wd); + /* write to chip to prevent back-to-back writes of control reg */ + qib_write_kreg(dd, kr_scratch, 0); +} + +/** + * qib_6120_bringup_serdes - bring up the serdes + * @dd: the qlogic_ib device + */ +static int qib_6120_bringup_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val, config1, prev_val, hwstat, ibc; + + /* Put IBC in reset, sends disabled */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, 0ULL); + + dd->cspec->ibdeltainprog = 1; + dd->cspec->ibsymsnap = read_6120_creg32(dd, cr_ibsymbolerr); + dd->cspec->iblnkerrsnap = read_6120_creg32(dd, cr_iblinkerrrecov); + + /* flowcontrolwatermark is in units of KBytes */ + ibc = 0x5ULL << SYM_LSB(IBCCtrl, FlowCtrlWaterMark); + /* + * How often flowctrl sent. More or less in usecs; balance against + * watermark value, so that in theory senders always get a flow + * control update in time to not let the IB link go idle. + */ + ibc |= 0x3ULL << SYM_LSB(IBCCtrl, FlowCtrlPeriod); + /* max error tolerance */ + dd->cspec->lli_thresh = 0xf; + ibc |= (u64) dd->cspec->lli_thresh << SYM_LSB(IBCCtrl, PhyerrThreshold); + /* use "real" buffer space for */ + ibc |= 4ULL << SYM_LSB(IBCCtrl, CreditScale); + /* IB credit flow control. */ + ibc |= 0xfULL << SYM_LSB(IBCCtrl, OverrunThreshold); + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see qib_set_mtu() + */ + ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << SYM_LSB(IBCCtrl, MaxPktLen); + dd->cspec->ibcctrl = ibc; /* without linkcmd or linkinitcmd! */ + + /* initially come up waiting for TS1, without sending anything. */ + val = dd->cspec->ibcctrl | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << + QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + qib_write_kreg(dd, kr_ibcctrl, val); + + val = qib_read_kreg64(dd, kr_serdes_cfg0); + config1 = qib_read_kreg64(dd, kr_serdes_cfg1); + + /* + * Force reset on, also set rxdetect enable. 
Must do before reading + * serdesstatus at least for simulation, or some of the bits in + * serdes status will come back as undefined and cause simulation + * failures + */ + val |= SYM_MASK(SerdesCfg0, ResetPLL) | + SYM_MASK(SerdesCfg0, RxDetEnX) | + (SYM_MASK(SerdesCfg0, L1PwrDnA) | + SYM_MASK(SerdesCfg0, L1PwrDnB) | + SYM_MASK(SerdesCfg0, L1PwrDnC) | + SYM_MASK(SerdesCfg0, L1PwrDnD)); + qib_write_kreg(dd, kr_serdes_cfg0, val); + /* be sure chip saw it */ + qib_read_kreg64(dd, kr_scratch); + udelay(5); /* need pll reset set at least for a bit */ + /* + * after PLL is reset, set the per-lane Resets and TxIdle and + * clear the PLL reset and rxdetect (to get falling edge). + * Leave L1PWR bits set (permanently) + */ + val &= ~(SYM_MASK(SerdesCfg0, RxDetEnX) | + SYM_MASK(SerdesCfg0, ResetPLL) | + (SYM_MASK(SerdesCfg0, L1PwrDnA) | + SYM_MASK(SerdesCfg0, L1PwrDnB) | + SYM_MASK(SerdesCfg0, L1PwrDnC) | + SYM_MASK(SerdesCfg0, L1PwrDnD))); + val |= (SYM_MASK(SerdesCfg0, ResetA) | + SYM_MASK(SerdesCfg0, ResetB) | + SYM_MASK(SerdesCfg0, ResetC) | + SYM_MASK(SerdesCfg0, ResetD)) | + SYM_MASK(SerdesCfg0, TxIdeEnX); + qib_write_kreg(dd, kr_serdes_cfg0, val); + /* be sure chip saw it */ + (void) qib_read_kreg64(dd, kr_scratch); + /* need PLL reset clear for at least 11 usec before lane + * resets cleared; give it a few more to be sure */ + udelay(15); + val &= ~((SYM_MASK(SerdesCfg0, ResetA) | + SYM_MASK(SerdesCfg0, ResetB) | + SYM_MASK(SerdesCfg0, ResetC) | + SYM_MASK(SerdesCfg0, ResetD)) | + SYM_MASK(SerdesCfg0, TxIdeEnX)); + + qib_write_kreg(dd, kr_serdes_cfg0, val); + /* be sure chip saw it */ + (void) qib_read_kreg64(dd, kr_scratch); + + val = qib_read_kreg64(dd, kr_xgxs_cfg); + prev_val = val; + if (val & QLOGIC_IB_XGXS_RESET) + val &= ~QLOGIC_IB_XGXS_RESET; + if (SYM_FIELD(val, XGXSCfg, polarity_inv) != ppd->rx_pol_inv) { + /* need to compensate for Tx inversion in partner */ + val &= ~SYM_MASK(XGXSCfg, polarity_inv); + val |= (u64)ppd->rx_pol_inv << SYM_LSB(XGXSCfg, polarity_inv); + } + if (val != prev_val) + qib_write_kreg(dd, kr_xgxs_cfg, val); + + val = qib_read_kreg64(dd, kr_serdes_cfg0); + + /* clear current and de-emphasis bits */ + config1 &= ~0x0ffffffff00ULL; + /* set current to 20ma */ + config1 |= 0x00000000000ULL; + /* set de-emphasis to -5.68dB */ + config1 |= 0x0cccc000000ULL; + qib_write_kreg(dd, kr_serdes_cfg1, config1); + + /* base and port guid same for single port */ + ppd->guid = dd->base_guid; + + /* + * the process of setting and un-resetting the serdes normally + * causes a serdes PLL error, so check for that and clear it + * here. Also clearr hwerr bit in errstatus, but not others. 
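qib_6120_bringup_serdes() above snapshots the symbol-error and link-error-recovery counters (ibsymsnap, iblnkerrsnap), and qib_6120_quiet_serdes() below subtracts the accumulated delta at unload, so errors generated by link negotiation itself do not leak into the externally visible counters. A heavily simplified sketch of that bookkeeping:

/* illustrative only, not part of this patch */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t hw_counter = 10;	/* counter value before bringup */
	uint64_t snap, delta;

	snap = hw_counter;		/* snapshot at serdes bringup */
	hw_counter += 7;		/* "errors" caused by link training */
	delta = hw_counter - snap;	/* what bringup contributed */

	hw_counter -= delta;		/* written back at driver unload */
	printf("reported counter: %llu\n", (unsigned long long)hw_counter);
	return 0;
}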
+ */ + hwstat = qib_read_kreg64(dd, kr_hwerrstatus); + if (hwstat) { + /* should just have PLL, clear all set, in an case */ + qib_write_kreg(dd, kr_hwerrclear, hwstat); + qib_write_kreg(dd, kr_errclear, ERR_MASK(HardwareErr)); + } + + dd->control |= QLOGIC_IB_C_LINKENABLE; + dd->control &= ~QLOGIC_IB_C_FREEZEMODE; + qib_write_kreg(dd, kr_control, dd->control); + + return 0; +} + +/** + * qib_6120_quiet_serdes - set serdes to txidle + * @ppd: physical port of the qlogic_ib device + * Called when driver is being unloaded + */ +static void qib_6120_quiet_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val; + + qib_set_ib_6120_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + + /* disable IBC */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, + dd->control | QLOGIC_IB_C_FREEZEMODE); + + if (dd->cspec->ibsymdelta || dd->cspec->iblnkerrdelta || + dd->cspec->ibdeltainprog) { + u64 diagc; + + /* enable counter writes */ + diagc = qib_read_kreg64(dd, kr_hwdiagctrl); + qib_write_kreg(dd, kr_hwdiagctrl, + diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable)); + + if (dd->cspec->ibsymdelta || dd->cspec->ibdeltainprog) { + val = read_6120_creg32(dd, cr_ibsymbolerr); + if (dd->cspec->ibdeltainprog) + val -= val - dd->cspec->ibsymsnap; + val -= dd->cspec->ibsymdelta; + write_6120_creg(dd, cr_ibsymbolerr, val); + } + if (dd->cspec->iblnkerrdelta || dd->cspec->ibdeltainprog) { + val = read_6120_creg32(dd, cr_iblinkerrrecov); + if (dd->cspec->ibdeltainprog) + val -= val - dd->cspec->iblnkerrsnap; + val -= dd->cspec->iblnkerrdelta; + write_6120_creg(dd, cr_iblinkerrrecov, val); + } + + /* and disable counter writes */ + qib_write_kreg(dd, kr_hwdiagctrl, diagc); + } + + val = qib_read_kreg64(dd, kr_serdes_cfg0); + val |= SYM_MASK(SerdesCfg0, TxIdeEnX); + qib_write_kreg(dd, kr_serdes_cfg0, val); +} + +/** + * qib_6120_setup_setextled - set the state of the two external LEDs + * @dd: the qlogic_ib device + * @on: whether the link is up or not + * + * The exact combo of LEDs if on is true is determined by looking + * at the ibcstatus. + + * These LEDs indicate the physical and logical state of IB link. + * For this chip (at least with recommended board pinouts), LED1 + * is Yellow (logical state) and LED2 is Green (physical state), + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void qib_6120_setup_setextled(struct qib_pportdata *ppd, u32 on) +{ + u64 extctl, val, lst, ltst; + unsigned long flags; + struct qib_devdata *dd = ppd->dd; + + /* + * The diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running. + */ + if (dd->diag_client) + return; + + /* Allow override of LED display for, e.g. Locating system in rack */ + if (ppd->led_override) { + ltst = (ppd->led_override & QIB_LED_PHYS) ? + IB_PHYSPORTSTATE_LINKUP : IB_PHYSPORTSTATE_DISABLED, + lst = (ppd->led_override & QIB_LED_LOG) ? 
+ IB_PORT_ACTIVE : IB_PORT_DOWN; + } else if (on) { + val = qib_read_kreg64(dd, kr_ibcstatus); + ltst = qib_6120_phys_portstate(val); + lst = qib_6120_iblink_state(val); + } else { + ltst = 0; + lst = 0; + } + + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + extctl = dd->cspec->extctrl & ~(SYM_MASK(EXTCtrl, LEDPriPortGreenOn) | + SYM_MASK(EXTCtrl, LEDPriPortYellowOn)); + + if (ltst == IB_PHYSPORTSTATE_LINKUP) + extctl |= SYM_MASK(EXTCtrl, LEDPriPortYellowOn); + if (lst == IB_PORT_ACTIVE) + extctl |= SYM_MASK(EXTCtrl, LEDPriPortGreenOn); + dd->cspec->extctrl = extctl; + qib_write_kreg(dd, kr_extctrl, extctl); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); +} + +/** + * qib_6120_setup_cleanup - clean up any per-chip chip-specific stuff + * @dd: the qlogic_ib device + * + * This is called during driver unload. +*/ +static void qib_6120_setup_cleanup(struct qib_devdata *dd) +{ + qib_free_irq(dd); + kfree(dd->cspec->cntrs); + kfree(dd->cspec->portcntrs); + if (dd->cspec->dummy_hdrq) { + dma_free_coherent(&dd->pcidev->dev, + ALIGN(dd->rcvhdrcnt * + dd->rcvhdrentsize * + sizeof(u32), PAGE_SIZE), + dd->cspec->dummy_hdrq, + dd->cspec->dummy_hdrq_phys); + dd->cspec->dummy_hdrq = NULL; + } +} + +static void qib_wantpiobuf_6120_intr(struct qib_devdata *dd, u32 needint) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (needint) + dd->sendctrl |= SYM_MASK(SendCtrl, PIOIntBufAvail); + else + dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOIntBufAvail); + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0ULL); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* + * handle errors and unusual events first, separate function + * to improve cache hits for fast path interrupt handling + */ +static noinline void unlikely_6120_intr(struct qib_devdata *dd, u64 istat) +{ + if (unlikely(istat & ~QLOGIC_IB_I_BITSEXTANT)) + qib_dev_err(dd, "interrupt with unknown interrupts %Lx set\n", + istat & ~QLOGIC_IB_I_BITSEXTANT); + + if (istat & QLOGIC_IB_I_ERROR) { + u64 estat = 0; + + qib_stats.sps_errints++; + estat = qib_read_kreg64(dd, kr_errstatus); + if (!estat) + qib_devinfo(dd->pcidev, + "error interrupt (%Lx), but no error bits set!\n", + istat); + handle_6120_errors(dd, estat); + } + + if (istat & QLOGIC_IB_I_GPIO) { + u32 gpiostatus; + u32 to_clear = 0; + + /* + * GPIO_3..5 on IBA6120 Rev2 chips indicate + * errors that we need to count. + */ + gpiostatus = qib_read_kreg32(dd, kr_gpio_status); + /* First the error-counter case. */ + if (gpiostatus & GPIO_ERRINTR_MASK) { + /* want to clear the bits we see asserted. */ + to_clear |= (gpiostatus & GPIO_ERRINTR_MASK); + + /* + * Count appropriately, clear bits out of our copy, + * as they have been "handled". + */ + if (gpiostatus & (1 << GPIO_RXUVL_BIT)) + dd->cspec->rxfc_unsupvl_errs++; + if (gpiostatus & (1 << GPIO_OVRUN_BIT)) + dd->cspec->overrun_thresh_errs++; + if (gpiostatus & (1 << GPIO_LLI_BIT)) + dd->cspec->lli_errs++; + gpiostatus &= ~GPIO_ERRINTR_MASK; + } + if (gpiostatus) { + /* + * Some unexpected bits remain. If they could have + * caused the interrupt, complain and clear. + * To avoid repetition of this condition, also clear + * the mask. It is almost certainly due to error. + */ + const u32 mask = qib_read_kreg32(dd, kr_gpio_mask); + + /* + * Also check that the chip reflects our shadow, + * and report issues, If they caused the interrupt. + * we will suppress by refreshing from the shadow. 
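+ * Any such bit still enabled in the mask is dropped from the gpio_mask shadow and the mask is rewritten below, so it cannot keep re-interrupting.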
+ */ + if (mask & gpiostatus) { + to_clear |= (gpiostatus & mask); + dd->cspec->gpio_mask &= ~(gpiostatus & mask); + qib_write_kreg(dd, kr_gpio_mask, + dd->cspec->gpio_mask); + } + } + if (to_clear) + qib_write_kreg(dd, kr_gpio_clear, (u64) to_clear); + } +} + +static irqreturn_t qib_6120intr(int irq, void *data) +{ + struct qib_devdata *dd = data; + irqreturn_t ret; + u32 istat, ctxtrbits, rmask, crcs = 0; + unsigned i; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + ret = IRQ_HANDLED; + goto bail; + } + + istat = qib_read_kreg32(dd, kr_intstatus); + + if (unlikely(!istat)) { + ret = IRQ_NONE; /* not our interrupt, or already handled */ + goto bail; + } + if (unlikely(istat == -1)) { + qib_bad_intrstatus(dd); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + this_cpu_inc(*dd->int_counter); + + if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT | + QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) + unlikely_6120_intr(dd, istat); + + /* + * Clear the interrupt bits we found set, relatively early, so we + * "know" know the chip will have seen this by the time we process + * the queue, and will re-interrupt if necessary. The processor + * itself won't take the interrupt again until we return. + */ + qib_write_kreg(dd, kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway. + */ + ctxtrbits = istat & + ((QLOGIC_IB_I_RCVAVAIL_MASK << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT)); + if (ctxtrbits) { + rmask = (1U << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (1U << QLOGIC_IB_I_RCVURG_SHIFT); + for (i = 0; i < dd->first_user_ctxt; i++) { + if (ctxtrbits & rmask) { + ctxtrbits &= ~rmask; + crcs += qib_kreceive(dd->rcd[i], + &dd->cspec->lli_counter, + NULL); + } + rmask <<= 1; + } + if (crcs) { + u32 cntr = dd->cspec->lli_counter; + + cntr += crcs; + if (cntr) { + if (cntr > dd->cspec->lli_thresh) { + dd->cspec->lli_counter = 0; + dd->cspec->lli_errs++; + } else + dd->cspec->lli_counter += cntr; + } + } + + + if (ctxtrbits) { + ctxtrbits = + (ctxtrbits >> QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (ctxtrbits >> QLOGIC_IB_I_RCVURG_SHIFT); + qib_handle_urcv(dd, ctxtrbits); + } + } + + if ((istat & QLOGIC_IB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED)) + qib_ib_piobufavail(dd); + + ret = IRQ_HANDLED; +bail: + return ret; +} + +/* + * Set up our chip-specific interrupt handler + * The interrupt type has already been setup, so + * we just need to do the registration and error checking. + */ +static void qib_setup_6120_interrupt(struct qib_devdata *dd) +{ + int ret; + + /* + * If the chip supports added error indication via GPIO pins, + * enable interrupts on those bits so the interrupt routine + * can count the events. Also set flag so interrupt routine + * can know they are expected. 
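+ * (These are the GPIO_RXUVL_BIT/GPIO_OVRUN_BIT/GPIO_LLI_BIT events counted in unlikely_6120_intr() above.)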
+ */ + if (SYM_FIELD(dd->revision, Revision_R, + ChipRevMinor) > 1) { + /* Rev2+ reports extra errors via internal GPIO pins */ + dd->cspec->gpio_mask |= GPIO_ERRINTR_MASK; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + } + + ret = pci_request_irq(dd->pcidev, 0, qib_6120intr, NULL, dd, + QIB_DRV_NAME); + if (ret) + qib_dev_err(dd, + "Couldn't setup interrupt (irq=%d): %d\n", + pci_irq_vector(dd->pcidev, 0), ret); +} + +/** + * pe_boardname - fill in the board name + * @dd: the qlogic_ib device + * + * info is based on the board revision register + */ +static void pe_boardname(struct qib_devdata *dd) +{ + u32 boardid; + + boardid = SYM_FIELD(dd->revision, Revision, + BoardID); + + switch (boardid) { + case 2: + dd->boardname = "InfiniPath_QLE7140"; + break; + default: + qib_dev_err(dd, "Unknown 6120 board with ID %u\n", boardid); + dd->boardname = "Unknown_InfiniPath_6120"; + break; + } + + if (dd->majrev != 4 || !dd->minrev || dd->minrev > 2) + qib_dev_err(dd, + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->majrev, dd->minrev); + + snprintf(dd->boardversion, sizeof(dd->boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", + QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, + (unsigned int)SYM_FIELD(dd->revision, Revision_R, Arch), + dd->majrev, dd->minrev, + (unsigned int)SYM_FIELD(dd->revision, Revision_R, SW)); +} + +/* + * This routine sleeps, so it can only be called from user context, not + * from interrupt context. If we need interrupt context, we can split + * it into two routines. + */ +static int qib_6120_setup_reset(struct qib_devdata *dd) +{ + u64 val; + int i; + int ret; + u16 cmdval; + u8 int_line, clinesz; + + qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz); + + /* Use ERROR so it shows up in logs, etc. */ + qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit); + + /* no interrupts till re-initted */ + qib_6120_set_intr_state(dd, 0); + + dd->cspec->ibdeltainprog = 0; + dd->cspec->ibsymdelta = 0; + dd->cspec->iblnkerrdelta = 0; + + /* + * Keep chip from being accessed until we are ready. Use + * writeq() directly, to allow the write even though QIB_PRESENT + * isn't set. + */ + dd->flags &= ~(QIB_INITTED | QIB_PRESENT); + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); + val = dd->control | QLOGIC_IB_C_RESET; + writeq(val, &dd->kregbase[kr_control]); + mb(); /* prevent compiler re-ordering around actual reset */ + + for (i = 1; i <= 5; i++) { + /* + * Allow MBIST, etc. to complete; longer on each retry. + * We sometimes get machine checks from bus timeout if no + * response, so for now, make it *really* long. + */ + msleep(1000 + (1 + i) * 2000); + + qib_pcie_reenable(dd, cmdval, int_line, clinesz); + + /* + * Use readq directly, so we don't need to mark it as PRESENT + * until we get a successful indication that all is well. 
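+ * Success is recognized by the Revision register reading back the value saved in dd->revision before the reset.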
+ */ + val = readq(&dd->kregbase[kr_revision]); + if (val == dd->revision) { + dd->flags |= QIB_PRESENT; /* it's back */ + ret = qib_reinit_intr(dd); + goto bail; + } + } + ret = 0; /* failed */ + +bail: + if (ret) { + if (qib_pcie_params(dd, dd->lbus_width, NULL)) + qib_dev_err(dd, + "Reset failed to setup PCIe or interrupts; continuing anyway\n"); + /* clear the reset error, init error/hwerror mask */ + qib_6120_init_hwerrors(dd); + /* for Rev2 error interrupts; nop for rev 1 */ + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + /* clear the reset error, init error/hwerror mask */ + qib_6120_init_hwerrors(dd); + } + return ret; +} + +/** + * qib_6120_put_tid - write a TID in chip + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) + * for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + * + * This exists as a separate routine to allow for special locking etc. + * It's used for both the full cleanup on exit, as well as the normal + * setup and teardown. + */ +static void qib_6120_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + u32 __iomem *tidp32 = (u32 __iomem *)tidptr; + unsigned long flags; + int tidx; + spinlock_t *tidlockp; /* select appropriate spinlock */ + + if (!dd->kregbase) + return; + + if (pa != dd->tidinvalid) { + if (pa & ((1U << 11) - 1)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + pa >>= 11; + if (pa & ~QLOGIC_IB_RT_ADDR_MASK) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + pa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + pa |= 2 << 29; + } + + /* + * Avoid chip issue by writing the scratch register + * before and after the TID, and with an io write barrier. + * We use a spinlock around the writes, so they can't intermix + * with other TID (eager or expected) writes (the chip problem + * is triggered by back to back TID writes). Unfortunately, this + * call can be done from interrupt level for the ctxt 0 eager TIDs, + * so we have to use irqsave locks. + */ + /* + * Assumes tidptr always > egrtidbase + * if type == RCVHQ_RCV_TYPE_EAGER. + */ + tidx = tidptr - dd->egrtidbase; + + tidlockp = (type == RCVHQ_RCV_TYPE_EAGER && tidx < dd->rcvhdrcnt) + ? &dd->cspec->kernel_tid_lock : &dd->cspec->user_tid_lock; + spin_lock_irqsave(tidlockp, flags); + qib_write_kreg(dd, kr_scratch, 0xfeeddeaf); + writel(pa, tidp32); + qib_write_kreg(dd, kr_scratch, 0xdeadbeef); + mmiowb(); + spin_unlock_irqrestore(tidlockp, flags); +} + +/** + * qib_6120_put_tid_2 - write a TID in chip, Revision 2 or higher + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) + * for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + * + * This exists as a separate routine to allow for selection of the + * appropriate "flavor". The static calls in cleanup just use the + * revision-agnostic form, as they are not performance critical. 
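+ * qib_init_iba6120_funcs() points dd->f_put_tid at this routine for minrev >= 2, which does not need the scratch-write/spinlock workaround used in qib_6120_put_tid() above.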
+ */ +static void qib_6120_put_tid_2(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + u32 __iomem *tidp32 = (u32 __iomem *)tidptr; + + if (!dd->kregbase) + return; + + if (pa != dd->tidinvalid) { + if (pa & ((1U << 11) - 1)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + pa >>= 11; + if (pa & ~QLOGIC_IB_RT_ADDR_MASK) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + pa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + pa |= 2 << 29; + } + writel(pa, tidp32); + mmiowb(); +} + + +/** + * qib_6120_clear_tids - clear all TID entries for a context, expected and eager + * @dd: the qlogic_ib device + * @ctxt: the context + * + * clear all TID entries for a context, expected and eager. + * Used from qib_close(). On this chip, TIDs are only 32 bits, + * not 64, but they are still on 64 bit boundaries, so tidbase + * is declared as u64 * for the pointer math, even though we write 32 bits + */ +static void qib_6120_clear_tids(struct qib_devdata *dd, + struct qib_ctxtdata *rcd) +{ + u64 __iomem *tidbase; + unsigned long tidinv; + u32 ctxt; + int i; + + if (!dd->kregbase || !rcd) + return; + + ctxt = rcd->ctxt; + + tidinv = dd->tidinvalid; + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvtidbase + + ctxt * dd->rcvtidcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->rcvtidcnt; i++) + /* use func pointer because could be one of two funcs */ + dd->f_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + tidinv); + + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvegrbase + + rcd->rcvegr_tid_base * sizeof(*tidbase)); + + for (i = 0; i < rcd->rcvegrcnt; i++) + /* use func pointer because could be one of two funcs */ + dd->f_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + tidinv); +} + +/** + * qib_6120_tidtemplate - setup constants for TID updates + * @dd: the qlogic_ib device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void qib_6120_tidtemplate(struct qib_devdata *dd) +{ + u32 egrsize = dd->rcvegrbufsize; + + /* + * For now, we always allocate 4KB buffers (at init) so we can + * receive max size packets. We may want a module parameter to + * specify 2KB or 4KB and/or make be per ctxt instead of per device + * for those who want to reduce memory footprint. Note that the + * rcvhdrentsize size must be large enough to hold the largest + * IB header (currently 96 bytes) that we expect to handle (plus of + * course the 2 dwords of RHF). + */ + if (egrsize == 2048) + dd->tidtemplate = 1U << 29; + else if (egrsize == 4096) + dd->tidtemplate = 2U << 29; + dd->tidinvalid = 0; +} + +int __attribute__((weak)) qib_unordered_wc(void) +{ + return 0; +} + +/** + * qib_6120_get_base_info - set chip-specific flags for user code + * @rcd: the qlogic_ib ctxt + * @kbase: qib_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithms. 
+ */ +static int qib_6120_get_base_info(struct qib_ctxtdata *rcd, + struct qib_base_info *kinfo) +{ + if (qib_unordered_wc()) + kinfo->spi_runtime_flags |= QIB_RUNTIME_FORCE_WC_ORDER; + + kinfo->spi_runtime_flags |= QIB_RUNTIME_PCIE | + QIB_RUNTIME_FORCE_PIOAVAIL | QIB_RUNTIME_PIO_REGSWAPPED; + return 0; +} + + +static struct qib_message_header * +qib_6120_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) +{ + return (struct qib_message_header *) + &rhf_addr[sizeof(u64) / sizeof(u32)]; +} + +static void qib_6120_config_ctxts(struct qib_devdata *dd) +{ + dd->ctxtcnt = qib_read_kreg32(dd, kr_portcnt); + if (qib_n_krcv_queues > 1) { + dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports; + if (dd->first_user_ctxt > dd->ctxtcnt) + dd->first_user_ctxt = dd->ctxtcnt; + dd->qpn_mask = dd->first_user_ctxt <= 2 ? 2 : 6; + } else + dd->first_user_ctxt = dd->num_pports; + dd->n_krcv_queues = dd->first_user_ctxt; +} + +static void qib_update_6120_usrhead(struct qib_ctxtdata *rcd, u64 hd, + u32 updegr, u32 egrhd, u32 npkts) +{ + if (updegr) + qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt); + mmiowb(); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + mmiowb(); +} + +static u32 qib_6120_hdrqempty(struct qib_ctxtdata *rcd) +{ + u32 head, tail; + + head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt); + if (rcd->rcvhdrtail_kvaddr) + tail = qib_get_rcvhdrtail(rcd); + else + tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt); + return head == tail; +} + +/* + * Used when we close any ctxt, for DMA already in flight + * at close. Can't be done until we know hdrq size, so not + * early in chip init. + */ +static void alloc_dummy_hdrq(struct qib_devdata *dd) +{ + dd->cspec->dummy_hdrq = dma_alloc_coherent(&dd->pcidev->dev, + dd->rcd[0]->rcvhdrq_size, + &dd->cspec->dummy_hdrq_phys, + GFP_ATOMIC | __GFP_COMP); + if (!dd->cspec->dummy_hdrq) { + qib_devinfo(dd->pcidev, "Couldn't allocate dummy hdrq\n"); + /* fallback to just 0'ing */ + dd->cspec->dummy_hdrq_phys = 0UL; + } +} + +/* + * Modify the RCVCTRL register in chip-specific way. This + * is a function because bit positions and (future) register + * location is chip-specific, but the needed operations are + * generic. is a bit-mask because we often want to + * do multiple modifications. + */ +static void rcvctrl_6120_mod(struct qib_pportdata *ppd, unsigned int op, + int ctxt) +{ + struct qib_devdata *dd = ppd->dd; + u64 mask, val; + unsigned long flags; + + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + + if (op & QIB_RCVCTRL_TAILUPD_ENB) + dd->rcvctrl |= (1ULL << QLOGIC_IB_R_TAILUPD_SHIFT); + if (op & QIB_RCVCTRL_TAILUPD_DIS) + dd->rcvctrl &= ~(1ULL << QLOGIC_IB_R_TAILUPD_SHIFT); + if (op & QIB_RCVCTRL_PKEY_ENB) + dd->rcvctrl &= ~(1ULL << IBA6120_R_PKEY_DIS_SHIFT); + if (op & QIB_RCVCTRL_PKEY_DIS) + dd->rcvctrl |= (1ULL << IBA6120_R_PKEY_DIS_SHIFT); + if (ctxt < 0) + mask = (1ULL << dd->ctxtcnt) - 1; + else + mask = (1ULL << ctxt); + if (op & QIB_RCVCTRL_CTXT_ENB) { + /* always done for specific ctxt */ + dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, PortEnable)); + if (!(dd->flags & QIB_NODMA_RTAIL)) + dd->rcvctrl |= 1ULL << QLOGIC_IB_R_TAILUPD_SHIFT; + /* Write these registers before the context is enabled. 
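+ * kr_rcvhdrtailaddr and kr_rcvhdraddr give the chip its rcvhdrq tail-pointer and base DMA addresses, which must be valid before PortEnable takes effect.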
*/ + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, + dd->rcd[ctxt]->rcvhdrqtailaddr_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, + dd->rcd[ctxt]->rcvhdrq_phys); + + if (ctxt == 0 && !dd->cspec->dummy_hdrq) + alloc_dummy_hdrq(dd); + } + if (op & QIB_RCVCTRL_CTXT_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, PortEnable)); + if (op & QIB_RCVCTRL_INTRAVAIL_ENB) + dd->rcvctrl |= (mask << QLOGIC_IB_R_INTRAVAIL_SHIFT); + if (op & QIB_RCVCTRL_INTRAVAIL_DIS) + dd->rcvctrl &= ~(mask << QLOGIC_IB_R_INTRAVAIL_SHIFT); + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && dd->rhdrhead_intr_off) { + /* arm rcv interrupt */ + val = qib_read_ureg32(dd, ur_rcvhdrhead, ctxt) | + dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_ENB) { + /* + * Init the context registers also; if we were + * disabled, tail and head should both be zero + * already from the enable, but since we don't + * know, we have to do it explicitly. + */ + val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt); + qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt); + + val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt); + dd->rcd[ctxt]->head = val; + /* If kctxt, interrupt on next receive. */ + if (ctxt < dd->first_user_ctxt) + val |= dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_DIS) { + /* + * Be paranoid, and never write 0's to these, just use an + * unused page. Of course, + * rcvhdraddr points to a large chunk of memory, so this + * could still trash things, but at least it won't trash + * page 0, and by disabling the ctxt, it should stop "soon", + * even if a packet or two is already in flight after we + * disabled the ctxt. Only 6120 has this issue. + */ + if (ctxt >= 0) { + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, + dd->cspec->dummy_hdrq_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, + dd->cspec->dummy_hdrq_phys); + } else { + unsigned i; + + for (i = 0; i < dd->cfgctxts; i++) { + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, + i, dd->cspec->dummy_hdrq_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, + i, dd->cspec->dummy_hdrq_phys); + } + } + } + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); +} + +/* + * Modify the SENDCTRL register in chip-specific way. This + * is a function because there may be multiple such registers with + * slightly different layouts. Only operations actually used + * are implemented so far. + * Chip requires no back-to-back sendctrl writes, so write + * scratch register after writing sendctrl + */ +static void sendctrl_6120_mod(struct qib_pportdata *ppd, u32 op) +{ + struct qib_devdata *dd = ppd->dd; + u64 tmp_dd_sendctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + /* First the ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_CLEAR) + dd->sendctrl = 0; + if (op & QIB_SENDCTRL_SEND_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOEnable); + else if (op & QIB_SENDCTRL_SEND_ENB) + dd->sendctrl |= SYM_MASK(SendCtrl, PIOEnable); + if (op & QIB_SENDCTRL_AVAIL_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOBufAvailUpd); + else if (op & QIB_SENDCTRL_AVAIL_ENB) + dd->sendctrl |= SYM_MASK(SendCtrl, PIOBufAvailUpd); + + if (op & QIB_SENDCTRL_DISARM_ALL) { + u32 i, last; + + tmp_dd_sendctrl = dd->sendctrl; + /* + * disarm any that are not yet launched, disabling sends + * and updates until done.
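+ * One Disarm write is issued per send buffer, 2K and 4K pools together, each followed by a scratch write per the no-back-to-back-sendctrl rule noted above.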
+ */ + last = dd->piobcnt2k + dd->piobcnt4k; + tmp_dd_sendctrl &= + ~(SYM_MASK(SendCtrl, PIOEnable) | + SYM_MASK(SendCtrl, PIOBufAvailUpd)); + for (i = 0; i < last; i++) { + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl | + SYM_MASK(SendCtrl, Disarm) | i); + qib_write_kreg(dd, kr_scratch, 0); + } + } + + tmp_dd_sendctrl = dd->sendctrl; + + if (op & QIB_SENDCTRL_FLUSH) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Abort); + if (op & QIB_SENDCTRL_DISARM) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) | + ((op & QIB_6120_SendCtrl_DisarmPIOBuf_RMASK) << + SYM_LSB(SendCtrl, DisarmPIOBuf)); + if (op & QIB_SENDCTRL_AVAIL_BLIP) + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, PIOBufAvailUpd); + + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + if (op & QIB_SENDCTRL_AVAIL_BLIP) { + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + if (op & QIB_SENDCTRL_FLUSH) { + u32 v; + /* + * ensure writes have hit chip, then do a few + * more reads, to allow DMA of pioavail registers + * to occur, so in-memory copy is in sync with + * the chip. Not always safe to sleep. + */ + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + qib_read_kreg32(dd, kr_scratch); + } +} + +/** + * qib_portcntr_6120 - read a per-port counter + * @dd: the qlogic_ib device + * @creg: the counter to snapshot + */ +static u64 qib_portcntr_6120(struct qib_pportdata *ppd, u32 reg) +{ + u64 ret = 0ULL; + struct qib_devdata *dd = ppd->dd; + u16 creg; + /* 0xffff for unimplemented or synthesized counters */ + static const u16 xlator[] = { + [QIBPORTCNTR_PKTSEND] = cr_pktsend, + [QIBPORTCNTR_WORDSEND] = cr_wordsend, + [QIBPORTCNTR_PSXMITDATA] = 0xffff, + [QIBPORTCNTR_PSXMITPKTS] = 0xffff, + [QIBPORTCNTR_PSXMITWAIT] = 0xffff, + [QIBPORTCNTR_SENDSTALL] = cr_sendstall, + [QIBPORTCNTR_PKTRCV] = cr_pktrcv, + [QIBPORTCNTR_PSRCVDATA] = 0xffff, + [QIBPORTCNTR_PSRCVPKTS] = 0xffff, + [QIBPORTCNTR_RCVEBP] = cr_rcvebp, + [QIBPORTCNTR_RCVOVFL] = cr_rcvovfl, + [QIBPORTCNTR_WORDRCV] = cr_wordrcv, + [QIBPORTCNTR_RXDROPPKT] = cr_rxdroppkt, + [QIBPORTCNTR_RXLOCALPHYERR] = 0xffff, + [QIBPORTCNTR_RXVLERR] = 0xffff, + [QIBPORTCNTR_ERRICRC] = cr_erricrc, + [QIBPORTCNTR_ERRVCRC] = cr_errvcrc, + [QIBPORTCNTR_ERRLPCRC] = cr_errlpcrc, + [QIBPORTCNTR_BADFORMAT] = cr_badformat, + [QIBPORTCNTR_ERR_RLEN] = cr_err_rlen, + [QIBPORTCNTR_IBSYMBOLERR] = cr_ibsymbolerr, + [QIBPORTCNTR_INVALIDRLEN] = cr_invalidrlen, + [QIBPORTCNTR_UNSUPVL] = cr_txunsupvl, + [QIBPORTCNTR_EXCESSBUFOVFL] = 0xffff, + [QIBPORTCNTR_ERRLINK] = cr_errlink, + [QIBPORTCNTR_IBLINKDOWN] = cr_iblinkdown, + [QIBPORTCNTR_IBLINKERRRECOV] = cr_iblinkerrrecov, + [QIBPORTCNTR_LLI] = 0xffff, + [QIBPORTCNTR_PSINTERVAL] = 0xffff, + [QIBPORTCNTR_PSSTART] = 0xffff, + [QIBPORTCNTR_PSSTAT] = 0xffff, + [QIBPORTCNTR_VL15PKTDROP] = 0xffff, + [QIBPORTCNTR_ERRPKEY] = cr_errpkey, + [QIBPORTCNTR_KHDROVFL] = 0xffff, + }; + + if (reg >= ARRAY_SIZE(xlator)) { + qib_devinfo(ppd->dd->pcidev, + "Unimplemented portcounter %u\n", reg); + goto done; + } + creg = xlator[reg]; + + /* handle counters requests not implemented as chip counters */ + if (reg == QIBPORTCNTR_LLI) + ret = dd->cspec->lli_errs; + else if (reg == QIBPORTCNTR_EXCESSBUFOVFL) + ret = dd->cspec->overrun_thresh_errs; + else if (reg == QIBPORTCNTR_KHDROVFL) { + int i; + + /* sum over all kernel contexts */ + for (i = 0; i < dd->first_user_ctxt; 
i++) + ret += read_6120_creg32(dd, cr_portovfl + i); + } else if (reg == QIBPORTCNTR_PSSTAT) + ret = dd->cspec->pma_sample_status; + if (creg == 0xffff) + goto done; + + /* + * only fast incrementing counters are 64bit; use 32 bit reads to + * avoid two independent reads when on opteron + */ + if (creg == cr_wordsend || creg == cr_wordrcv || + creg == cr_pktsend || creg == cr_pktrcv) + ret = read_6120_creg(dd, creg); + else + ret = read_6120_creg32(dd, creg); + if (creg == cr_ibsymbolerr) { + if (dd->cspec->ibdeltainprog) + ret -= ret - dd->cspec->ibsymsnap; + ret -= dd->cspec->ibsymdelta; + } else if (creg == cr_iblinkerrrecov) { + if (dd->cspec->ibdeltainprog) + ret -= ret - dd->cspec->iblnkerrsnap; + ret -= dd->cspec->iblnkerrdelta; + } + if (reg == QIBPORTCNTR_RXDROPPKT) /* add special cased count */ + ret += dd->cspec->rxfc_unsupvl_errs; + +done: + return ret; +} + +/* + * Device counter names (not port-specific), one line per stat, + * single string. Used by utilities like ipathstats to print the stats + * in a way which works for different versions of drivers, without changing + * the utility. Names need to be 12 chars or less (w/o newline), for proper + * display by utility. + * Non-error counters are first. + * Start of "error" counters is indicated by a leading "E " on the first + * "error" counter, and doesn't count in label length. + * The EgrOvfl list needs to be last so we truncate them at the configured + * context count for the device. + * cntr6120indices contains the corresponding register indices. + */ +static const char cntr6120names[] = + "Interrupts\n" + "HostBusStall\n" + "E RxTIDFull\n" + "RxTIDInvalid\n" + "Ctxt0EgrOvfl\n" + "Ctxt1EgrOvfl\n" + "Ctxt2EgrOvfl\n" + "Ctxt3EgrOvfl\n" + "Ctxt4EgrOvfl\n"; + +static const size_t cntr6120indices[] = { + cr_lbint, + cr_lbflowstall, + cr_errtidfull, + cr_errtidvalid, + cr_portovfl + 0, + cr_portovfl + 1, + cr_portovfl + 2, + cr_portovfl + 3, + cr_portovfl + 4, +}; + +/* + * same as cntr6120names and cntr6120indices, but for port-specific counters.
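+ * The name and index tables are paired positionally by qib_read_6120cntrs()/qib_read_6120portcntrs(), so the two lists must be kept in the same order.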
+ * portcntr6120indices is somewhat complicated by some registers needing + * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG + */ +static const char portcntr6120names[] = + "TxPkt\n" + "TxFlowPkt\n" + "TxWords\n" + "RxPkt\n" + "RxFlowPkt\n" + "RxWords\n" + "TxFlowStall\n" + "E IBStatusChng\n" + "IBLinkDown\n" + "IBLnkRecov\n" + "IBRxLinkErr\n" + "IBSymbolErr\n" + "RxLLIErr\n" + "RxBadFormat\n" + "RxBadLen\n" + "RxBufOvrfl\n" + "RxEBP\n" + "RxFlowCtlErr\n" + "RxICRCerr\n" + "RxLPCRCerr\n" + "RxVCRCerr\n" + "RxInvalLen\n" + "RxInvalPKey\n" + "RxPktDropped\n" + "TxBadLength\n" + "TxDropped\n" + "TxInvalLen\n" + "TxUnderrun\n" + "TxUnsupVL\n" + ; + +#define _PORT_VIRT_FLAG 0x8000 /* "virtual", need adjustments */ +static const size_t portcntr6120indices[] = { + QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG, + cr_pktsendflow, + QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG, + QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG, + cr_pktrcvflowctrl, + QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG, + QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG, + cr_ibstatuschange, + QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_LLI | _PORT_VIRT_FLAG, + QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG, + cr_rcvflowctrl_err, + QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG, + cr_invalidslen, + cr_senddropped, + cr_errslen, + cr_sendunderrun, + cr_txunsupvl, +}; + +/* do all the setup to make the counter reads efficient later */ +static void init_6120_cntrnames(struct qib_devdata *dd) +{ + int i, j = 0; + char *s; + + for (i = 0, s = (char *)cntr6120names; s && j <= dd->cfgctxts; + i++) { + /* we always have at least one counter before the egrovfl */ + if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12)) + j = 1; + s = strchr(s + 1, '\n'); + if (s && j) + j++; + } + dd->cspec->ncntrs = i; + if (!s) + /* full list; size is without terminating null */ + dd->cspec->cntrnamelen = sizeof(cntr6120names) - 1; + else + dd->cspec->cntrnamelen = 1 + s - cntr6120names; + dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs + * sizeof(u64), GFP_KERNEL); + + for (i = 0, s = (char *)portcntr6120names; s; i++) + s = strchr(s + 1, '\n'); + dd->cspec->nportcntrs = i - 1; + dd->cspec->portcntrnamelen = sizeof(portcntr6120names) - 1; + dd->cspec->portcntrs = kmalloc(dd->cspec->nportcntrs + * sizeof(u64), GFP_KERNEL); +} + +static u32 qib_read_6120cntrs(struct qib_devdata *dd, loff_t pos, char **namep, + u64 **cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->cntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *)cntr6120names; + } else { + u64 *cntr = dd->cspec->cntrs; + int i; + + ret = dd->cspec->ncntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + if (pos >= ret) { + ret = 0; /* final read after getting everything */ + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->ncntrs; i++) + *cntr++ = read_6120_creg32(dd, cntr6120indices[i]); + } +done: + return ret; +} + +static u32 qib_read_6120portcntrs(struct qib_devdata *dd, loff_t pos, u32 port, + char **namep, u64 
**cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->portcntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *)portcntr6120names; + } else { + u64 *cntr = dd->cspec->portcntrs; + struct qib_pportdata *ppd = &dd->pport[port]; + int i; + + ret = dd->cspec->nportcntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->nportcntrs; i++) { + if (portcntr6120indices[i] & _PORT_VIRT_FLAG) + *cntr++ = qib_portcntr_6120(ppd, + portcntr6120indices[i] & + ~_PORT_VIRT_FLAG); + else + *cntr++ = read_6120_creg32(dd, + portcntr6120indices[i]); + } + } +done: + return ret; +} + +static void qib_chk_6120_errormask(struct qib_devdata *dd) +{ + static u32 fixed; + u32 ctrl; + unsigned long errormask; + unsigned long hwerrs; + + if (!dd->cspec->errormask || !(dd->flags & QIB_INITTED)) + return; + + errormask = qib_read_kreg64(dd, kr_errmask); + + if (errormask == dd->cspec->errormask) + return; + fixed++; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + ctrl = qib_read_kreg32(dd, kr_control); + + qib_write_kreg(dd, kr_errmask, + dd->cspec->errormask); + + if ((hwerrs & dd->cspec->hwerrmask) || + (ctrl & QLOGIC_IB_C_FREEZEMODE)) { + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, 0ULL); + /* force re-interrupt of pending events, just in case */ + qib_write_kreg(dd, kr_intclear, 0ULL); + qib_devinfo(dd->pcidev, + "errormask fixed(%u) %lx->%lx, ctrl %x hwerr %lx\n", + fixed, errormask, (unsigned long)dd->cspec->errormask, + ctrl, hwerrs); + } +} + +/** + * qib_get_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the qlogic_ib device qib_devdata + * + * This needs more work; in particular, decision on whether we really + * need traffic_wds done the way it is + * called from add_timer + */ +static void qib_get_6120_faststats(struct timer_list *t) +{ + struct qib_devdata *dd = from_timer(dd, t, stats_timer); + struct qib_pportdata *ppd = dd->pport; + unsigned long flags; + u64 traffic_wds; + + /* + * don't access the chip while running diags, or memory diags can + * fail + */ + if (!(dd->flags & QIB_INITTED) || dd->diag_client) + /* but re-arm the timer, for diags case; won't hurt other */ + goto done; + + /* + * We now try to maintain an activity timer, based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = qib_portcntr_6120(ppd, cr_wordsend) + + qib_portcntr_6120(ppd, cr_wordrcv); + spin_lock_irqsave(&dd->eep_st_lock, flags); + traffic_wds -= dd->traffic_wds; + dd->traffic_wds += traffic_wds; + spin_unlock_irqrestore(&dd->eep_st_lock, flags); + + qib_chk_6120_errormask(dd); +done: + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); +} + +/* no interrupt fallback for these chips */ +static int qib_6120_nointr_fallback(struct qib_devdata *dd) +{ + return 0; +} + +/* + * reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well. 
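+ * (The link is quiesced by clearing QLOGIC_IB_C_LINKENABLE in kr_control around the XGXS reset; dd->control is rewritten to restore it afterwards.)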
+ */ +static void qib_6120_xgxs_reset(struct qib_pportdata *ppd) +{ + u64 val, prev_val; + struct qib_devdata *dd = ppd->dd; + + prev_val = qib_read_kreg64(dd, kr_xgxs_cfg); + val = prev_val | QLOGIC_IB_XGXS_RESET; + prev_val &= ~QLOGIC_IB_XGXS_RESET; /* be sure */ + qib_write_kreg(dd, kr_control, + dd->control & ~QLOGIC_IB_C_LINKENABLE); + qib_write_kreg(dd, kr_xgxs_cfg, val); + qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_xgxs_cfg, prev_val); + qib_write_kreg(dd, kr_control, dd->control); +} + +static int qib_6120_get_ib_cfg(struct qib_pportdata *ppd, int which) +{ + int ret; + + switch (which) { + case QIB_IB_CFG_LWID: + ret = ppd->link_width_active; + break; + + case QIB_IB_CFG_SPD: + ret = ppd->link_speed_active; + break; + + case QIB_IB_CFG_LWID_ENB: + ret = ppd->link_width_enabled; + break; + + case QIB_IB_CFG_SPD_ENB: + ret = ppd->link_speed_enabled; + break; + + case QIB_IB_CFG_OP_VLS: + ret = ppd->vls_operational; + break; + + case QIB_IB_CFG_VL_HIGH_CAP: + ret = 0; + break; + + case QIB_IB_CFG_VL_LOW_CAP: + ret = 0; + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + ret = SYM_FIELD(ppd->dd->cspec->ibcctrl, IBCCtrl, + OverrunThreshold); + break; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + ret = SYM_FIELD(ppd->dd->cspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + break; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + ret = (ppd->dd->cspec->ibcctrl & + SYM_MASK(IBCCtrl, LinkDownDefaultState)) ? + IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL; + break; + + case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */ + ret = 0; /* no heartbeat on this chip */ + break; + + case QIB_IB_CFG_PMA_TICKS: + ret = 250; /* 1 usec. */ + break; + + default: + ret = -EINVAL; + break; + } + return ret; +} + +/* + * We assume range checking is already done, if needed. 
+ */ +static int qib_6120_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) +{ + struct qib_devdata *dd = ppd->dd; + int ret = 0; + u64 val64; + u16 lcmd, licmd; + + switch (which) { + case QIB_IB_CFG_LWID_ENB: + ppd->link_width_enabled = val; + break; + + case QIB_IB_CFG_SPD_ENB: + ppd->link_speed_enabled = val; + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + val64 = SYM_FIELD(dd->cspec->ibcctrl, IBCCtrl, + OverrunThreshold); + if (val64 != val) { + dd->cspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, OverrunThreshold); + dd->cspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, OverrunThreshold); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + break; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + val64 = SYM_FIELD(dd->cspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + if (val64 != val) { + dd->cspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, PhyerrThreshold); + dd->cspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, PhyerrThreshold); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + break; + + case QIB_IB_CFG_PKEYS: /* update pkeys */ + val64 = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + qib_write_kreg(dd, kr_partitionkey, val64); + break; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + if (val == IB_LINKINITCMD_POLL) + dd->cspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, LinkDownDefaultState); + else /* SLEEP */ + dd->cspec->ibcctrl |= + SYM_MASK(IBCCtrl, LinkDownDefaultState); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + break; + + case QIB_IB_CFG_MTU: /* update the MTU in IBC */ + /* + * Update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + * Set even if it's unchanged, print debug message only + * on changes. 
+ */ + val = (ppd->ibmaxlen >> 2) + 1; + dd->cspec->ibcctrl &= ~SYM_MASK(IBCCtrl, MaxPktLen); + dd->cspec->ibcctrl |= (u64)val << + SYM_LSB(IBCCtrl, MaxPktLen); + qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + break; + + case QIB_IB_CFG_LSTATE: /* set the IB link state */ + switch (val & 0xffff0000) { + case IB_LINKCMD_DOWN: + lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN; + if (!dd->cspec->ibdeltainprog) { + dd->cspec->ibdeltainprog = 1; + dd->cspec->ibsymsnap = + read_6120_creg32(dd, cr_ibsymbolerr); + dd->cspec->iblnkerrsnap = + read_6120_creg32(dd, cr_iblinkerrrecov); + } + break; + + case IB_LINKCMD_ARMED: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED; + break; + + case IB_LINKCMD_ACTIVE: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16); + goto bail; + } + switch (val & 0xffff) { + case IB_LINKINITCMD_NOP: + licmd = 0; + break; + + case IB_LINKINITCMD_POLL: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL; + break; + + case IB_LINKINITCMD_SLEEP: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP; + break; + + case IB_LINKINITCMD_DISABLE: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkinitcmd req 0x%x\n", + val & 0xffff); + goto bail; + } + qib_set_ib_6120_lstate(ppd, lcmd, licmd); + goto bail; + + case QIB_IB_CFG_HRTBT: + ret = -EINVAL; + break; + + default: + ret = -EINVAL; + } +bail: + return ret; +} + +static int qib_6120_set_loopback(struct qib_pportdata *ppd, const char *what) +{ + int ret = 0; + + if (!strncmp(what, "ibc", 3)) { + ppd->dd->cspec->ibcctrl |= SYM_MASK(IBCCtrl, Loopback); + qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n", + ppd->dd->unit, ppd->port); + } else if (!strncmp(what, "off", 3)) { + ppd->dd->cspec->ibcctrl &= ~SYM_MASK(IBCCtrl, Loopback); + qib_devinfo(ppd->dd->pcidev, + "Disabling IB%u:%u IBC loopback (normal)\n", + ppd->dd->unit, ppd->port); + } else + ret = -EINVAL; + if (!ret) { + qib_write_kreg(ppd->dd, kr_ibcctrl, ppd->dd->cspec->ibcctrl); + qib_write_kreg(ppd->dd, kr_scratch, 0); + } + return ret; +} + +static void pma_6120_timer(struct timer_list *t) +{ + struct qib_chip_specific *cs = from_timer(cs, t, pma_timer); + struct qib_pportdata *ppd = cs->ppd; + struct qib_ibport *ibp = &ppd->ibport_data; + unsigned long flags; + + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED) { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; + qib_snapshot_counters(ppd, &cs->sword, &cs->rword, + &cs->spkts, &cs->rpkts, &cs->xmit_wait); + mod_timer(&cs->pma_timer, + jiffies + usecs_to_jiffies(ibp->rvp.pma_sample_interval)); + } else if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) { + u64 ta, tb, tc, td, te; + + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE; + qib_snapshot_counters(ppd, &ta, &tb, &tc, &td, &te); + + cs->sword = ta - cs->sword; + cs->rword = tb - cs->rword; + cs->spkts = tc - cs->spkts; + cs->rpkts = td - cs->rpkts; + cs->xmit_wait = te - cs->xmit_wait; + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); +} + +/* + * Note that the caller has the ibp->rvp.lock held. 
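+ * pma_6120_timer() above moves a sample through STARTED -> RUNNING -> DONE, snapshotting the counters when sampling starts and converting them to deltas when it completes; this routine arms that timer.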
+ */ +static void qib_set_cntr_6120_sample(struct qib_pportdata *ppd, u32 intv, + u32 start) +{ + struct qib_chip_specific *cs = ppd->dd->cspec; + + if (start && intv) { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED; + mod_timer(&cs->pma_timer, jiffies + usecs_to_jiffies(start)); + } else if (intv) { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; + qib_snapshot_counters(ppd, &cs->sword, &cs->rword, + &cs->spkts, &cs->rpkts, &cs->xmit_wait); + mod_timer(&cs->pma_timer, jiffies + usecs_to_jiffies(intv)); + } else { + cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE; + cs->sword = 0; + cs->rword = 0; + cs->spkts = 0; + cs->rpkts = 0; + cs->xmit_wait = 0; + } +} + +static u32 qib_6120_iblink_state(u64 ibcs) +{ + u32 state = (u32)SYM_FIELD(ibcs, IBCStatus, LinkState); + + switch (state) { + case IB_6120_L_STATE_INIT: + state = IB_PORT_INIT; + break; + case IB_6120_L_STATE_ARM: + state = IB_PORT_ARMED; + break; + case IB_6120_L_STATE_ACTIVE: + /* fall through */ + case IB_6120_L_STATE_ACT_DEFER: + state = IB_PORT_ACTIVE; + break; + default: /* fall through */ + case IB_6120_L_STATE_DOWN: + state = IB_PORT_DOWN; + break; + } + return state; +} + +/* returns the IBTA port state, rather than the IBC link training state */ +static u8 qib_6120_phys_portstate(u64 ibcs) +{ + u8 state = (u8)SYM_FIELD(ibcs, IBCStatus, LinkTrainingState); + return qib_6120_physportstate[state]; +} + +static int qib_6120_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + if (ibup) { + if (ppd->dd->cspec->ibdeltainprog) { + ppd->dd->cspec->ibdeltainprog = 0; + ppd->dd->cspec->ibsymdelta += + read_6120_creg32(ppd->dd, cr_ibsymbolerr) - + ppd->dd->cspec->ibsymsnap; + ppd->dd->cspec->iblnkerrdelta += + read_6120_creg32(ppd->dd, cr_iblinkerrrecov) - + ppd->dd->cspec->iblnkerrsnap; + } + qib_hol_init(ppd); + } else { + ppd->dd->cspec->lli_counter = 0; + if (!ppd->dd->cspec->ibdeltainprog) { + ppd->dd->cspec->ibdeltainprog = 1; + ppd->dd->cspec->ibsymsnap = + read_6120_creg32(ppd->dd, cr_ibsymbolerr); + ppd->dd->cspec->iblnkerrsnap = + read_6120_creg32(ppd->dd, cr_iblinkerrrecov); + } + qib_hol_down(ppd); + } + + qib_6120_setup_setextled(ppd, ibup); + + return 0; +} + +/* Does read/modify/write to appropriate registers to + * set output and direction bits selected by mask. + * these are in their canonical postions (e.g. lsb of + * dir will end up in D48 of extctrl on existing chips). + * returns contents of GP Inputs. + */ +static int gpio_6120_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask) +{ + u64 read_val, new_out; + unsigned long flags; + + if (mask) { + /* some bits being written, lock access to GPIO */ + dir &= mask; + out &= mask; + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe)); + dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe)); + new_out = (dd->cspec->gpio_out & ~mask) | out; + + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_out, new_out); + dd->cspec->gpio_out = new_out; + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + } + /* + * It is unlikely that a read at this time would get valid + * data on a pin whose direction line was set in the same + * call to this function. 
We include the read here because + * that allows us to potentially combine a change on one pin with + * a read on another, and because the old code did something like + * this. + */ + read_val = qib_read_kreg64(dd, kr_extstatus); + return SYM_FIELD(read_val, EXTStatus, GPIOIn); +} + +/* + * Read fundamental info we need to use the chip. These are + * the registers that describe chip capabilities, and are + * saved in shadow registers. + */ +static void get_6120_chip_params(struct qib_devdata *dd) +{ + u64 val; + u32 piobufs; + int mtu; + + dd->uregbase = qib_read_kreg32(dd, kr_userregbase); + + dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt); + dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase); + dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase); + dd->palign = qib_read_kreg32(dd, kr_palign); + dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase); + dd->pio2k_bufbase = dd->piobufbase & 0xffffffff; + + dd->rcvhdrcnt = qib_read_kreg32(dd, kr_rcvegrcnt); + + val = qib_read_kreg64(dd, kr_sendpiosize); + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport->ibmtu = (u32)mtu; + + val = qib_read_kreg64(dd, kr_sendpiobufcnt); + dd->piobcnt2k = val & ~0U; + dd->piobcnt4k = val >> 32; + dd->last_pio = dd->piobcnt4k + dd->piobcnt2k - 1; + /* these may be adjusted in init_chip_wc_pat() */ + dd->pio2kbase = (u32 __iomem *) + (((char __iomem *)dd->kregbase) + dd->pio2k_bufbase); + if (dd->piobcnt4k) { + dd->pio4kbase = (u32 __iomem *) + (((char __iomem *) dd->kregbase) + + (dd->piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->align4k = ALIGN(dd->piosize4k, dd->palign); + } + + piobufs = dd->piobcnt4k + dd->piobcnt2k; + + dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / + (sizeof(u64) * BITS_PER_BYTE / 2); +} + +/* + * The chip base addresses in cspec and cpspec have to be set + * after possible init_chip_wc_pat(), rather than in + * get_6120_chip_params(), so split out as separate function + */ +static void set_6120_baseaddrs(struct qib_devdata *dd) +{ + u32 cregbase; + + cregbase = qib_read_kreg32(dd, kr_counterregbase); + dd->cspec->cregbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + cregbase); + + dd->egrtidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + dd->rcvegrbase); +} + +/* + * Write the final few registers that depend on some of the + * init setup. Done late in init, just before bringing up + * the serdes. 
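+ * In particular SendPIOAvailAddr, which is read back and verified below; a mismatch is reported as a catastrophic software error.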
+ */ +static int qib_late_6120_initreg(struct qib_devdata *dd) +{ + int ret = 0; + u64 val; + + qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize); + qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize); + qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt); + qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys); + val = qib_read_kreg64(dd, kr_sendpioavailaddr); + if (val != dd->pioavailregs_phys) { + qib_dev_err(dd, + "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n", + (unsigned long) dd->pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + } + return ret; +} + +static int init_6120_variables(struct qib_devdata *dd) +{ + int ret = 0; + struct qib_pportdata *ppd; + u32 sbufs; + + ppd = (struct qib_pportdata *)(dd + 1); + dd->pport = ppd; + dd->num_pports = 1; + + dd->cspec = (struct qib_chip_specific *)(ppd + dd->num_pports); + dd->cspec->ppd = ppd; + ppd->cpspec = NULL; /* not used in this chip */ + + spin_lock_init(&dd->cspec->kernel_tid_lock); + spin_lock_init(&dd->cspec->user_tid_lock); + spin_lock_init(&dd->cspec->rcvmod_lock); + spin_lock_init(&dd->cspec->gpio_lock); + + /* we haven't yet set QIB_PRESENT, so use read directly */ + dd->revision = readq(&dd->kregbase[kr_revision]); + + if ((dd->revision & 0xffffffffU) == 0xffffffffU) { + qib_dev_err(dd, + "Revision register read failure, giving up initialization\n"); + ret = -ENODEV; + goto bail; + } + dd->flags |= QIB_PRESENT; /* now register routines work */ + + dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMajor); + dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMinor); + + get_6120_chip_params(dd); + pe_boardname(dd); /* fill in boardname */ + + /* + * GPIO bits for TWSI data and clock, + * used for serial EEPROM. + */ + dd->gpio_sda_num = _QIB_GPIO_SDA_NUM; + dd->gpio_scl_num = _QIB_GPIO_SCL_NUM; + dd->twsi_eeprom_dev = QIB_TWSI_NO_DEV; + + if (qib_unordered_wc()) + dd->flags |= QIB_PIO_FLUSH_WC; + + ret = qib_init_pportdata(ppd, dd, 0, 1); + if (ret) + goto bail; + ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + ppd->link_speed_supported = QIB_IB_SDR; + ppd->link_width_enabled = IB_WIDTH_4X; + ppd->link_speed_enabled = ppd->link_speed_supported; + /* these can't change for this chip, so set once */ + ppd->link_width_active = ppd->link_width_enabled; + ppd->link_speed_active = ppd->link_speed_enabled; + ppd->vls_supported = IB_VL_VL0; + ppd->vls_operational = ppd->vls_supported; + + dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE; + dd->rhf_offset = 0; + + /* we always allocate at least 2048 bytes for eager buffers */ + ret = ib_mtu_enum_to_int(qib_ibmtu); + dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU; + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); + + qib_6120_tidtemplate(dd); + + /* + * We can request a receive interrupt for 1 or + * more packets from current offset. For now, we set this + * up for a single packet. 
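+ * rhdrhead_intr_off (bit 32, set just below) is ORed into the value written to ur_rcvhdrhead to request that interrupt.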
+ */ + dd->rhdrhead_intr_off = 1ULL << 32; + + /* setup the stats timer; the add_timer is done at end of init */ + timer_setup(&dd->stats_timer, qib_get_6120_faststats, 0); + timer_setup(&dd->cspec->pma_timer, pma_6120_timer, 0); + + dd->ureg_align = qib_read_kreg32(dd, kr_palign); + + dd->piosize2kmax_dwords = dd->piosize2k >> 2; + qib_6120_config_ctxts(dd); + qib_set_ctxtcnt(dd); + + ret = init_chip_wc_pat(dd, 0); + if (ret) + goto bail; + set_6120_baseaddrs(dd); /* set chip access pointers now */ + + ret = 0; + if (qib_mini_init) + goto bail; + + qib_num_cfg_vls = 1; /* if any 6120's, only one VL */ + + ret = qib_create_ctxts(dd); + init_6120_cntrnames(dd); + + /* use all of 4KB buffers for the kernel, otherwise 16 */ + sbufs = dd->piobcnt4k ? dd->piobcnt4k : 16; + + dd->lastctxt_piobuf = dd->piobcnt2k + dd->piobcnt4k - sbufs; + dd->pbufsctxt = dd->lastctxt_piobuf / + (dd->cfgctxts - dd->first_user_ctxt); + + if (ret) + goto bail; +bail: + return ret; +} + +/* + * For this chip, we want to use the same buffer every time + * when we are trying to bring the link up (they are always VL15 + * packets). At that link state the packet should always go out immediately + * (or at least be discarded at the tx interface if the link is down). + * If it doesn't, and the buffer isn't available, that means some other + * sender has gotten ahead of us, and is preventing our packet from going + * out. In that case, we flush all packets, and try again. If that still + * fails, we fail the request, and hope things work the next time around. + * + * We don't need very complicated heuristics on whether the packet had + * time to go out or not, since even at SDR 1X, it goes out in very short + * time periods, covered by the chip reads done here and as part of the + * flush. + */ +static u32 __iomem *get_6120_link_buf(struct qib_pportdata *ppd, u32 *bnum) +{ + u32 __iomem *buf; + u32 lbuf = ppd->dd->piobcnt2k + ppd->dd->piobcnt4k - 1; + + /* + * always blip to get avail list updated, since it's almost + * always needed, and is fairly cheap. 
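+ * (QIB_SENDCTRL_AVAIL_BLIP momentarily clears PIOBufAvailUpd and then rewrites dd->sendctrl in sendctrl_6120_mod(), which is presumably what forces a fresh update of the in-memory pioavail copy.)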
+ */ + sendctrl_6120_mod(ppd->dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); + if (buf) + goto done; + + sendctrl_6120_mod(ppd, QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH | + QIB_SENDCTRL_AVAIL_BLIP); + ppd->dd->upd_pio_shadow = 1; /* update our idea of what's busy */ + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); +done: + return buf; +} + +static u32 __iomem *qib_6120_getsendbuf(struct qib_pportdata *ppd, u64 pbc, + u32 *pbufnum) +{ + u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK; + struct qib_devdata *dd = ppd->dd; + u32 __iomem *buf; + + if (((pbc >> 32) & PBC_6120_VL15_SEND_CTRL) && + !(ppd->lflags & (QIBL_IB_AUTONEG_INPROG | QIBL_LINKACTIVE))) + buf = get_6120_link_buf(ppd, pbufnum); + else { + + if ((plen + 1) > dd->piosize2kmax_dwords) + first = dd->piobcnt2k; + else + first = 0; + /* try 4k if all 2k busy, so same last for both sizes */ + last = dd->piobcnt2k + dd->piobcnt4k - 1; + buf = qib_getsendbuf_range(dd, pbufnum, first, last); + } + return buf; +} + +static int init_sdma_6120_regs(struct qib_pportdata *ppd) +{ + return -ENODEV; +} + +static u16 qib_sdma_6120_gethead(struct qib_pportdata *ppd) +{ + return 0; +} + +static int qib_sdma_6120_busy(struct qib_pportdata *ppd) +{ + return 0; +} + +static void qib_sdma_update_6120_tail(struct qib_pportdata *ppd, u16 tail) +{ +} + +static void qib_6120_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op) +{ +} + +static void qib_sdma_set_6120_desc_cnt(struct qib_pportdata *ppd, unsigned cnt) +{ +} + +/* + * the pbc doesn't need a VL15 indicator, but we need it for link_buf. + * The chip ignores the bit if set. + */ +static u32 qib_6120_setpbc_control(struct qib_pportdata *ppd, u32 plen, + u8 srate, u8 vl) +{ + return vl == 15 ? PBC_6120_VL15_SEND_CTRL : 0; +} + +static void qib_6120_initvl15_bufs(struct qib_devdata *dd) +{ +} + +static void qib_6120_init_ctxt(struct qib_ctxtdata *rcd) +{ + rcd->rcvegrcnt = rcd->dd->rcvhdrcnt; + rcd->rcvegr_tid_base = rcd->ctxt * rcd->rcvegrcnt; +} + +static void qib_6120_txchk_change(struct qib_devdata *dd, u32 start, + u32 len, u32 avail, struct qib_ctxtdata *rcd) +{ +} + +static void writescratch(struct qib_devdata *dd, u32 val) +{ + (void) qib_write_kreg(dd, kr_scratch, val); +} + +static int qib_6120_tempsense_rd(struct qib_devdata *dd, int regnum) +{ + return -ENXIO; +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA +static int qib_6120_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + return 0; +} +#endif + +/* Dummy function, as 6120 boards never disable EEPROM Write */ +static int qib_6120_eeprom_wen(struct qib_devdata *dd, int wen) +{ + return 1; +} + +/** + * qib_init_iba6120_funcs - set up the chip-specific function pointers + * @pdev: pci_dev of the qlogic_ib device + * @ent: pci_device_id matching this chip + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + * + * It also allocates/partially-inits the qib_devdata struct for + * this device. 
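+ * The allocation reserves room for one qib_pportdata plus the qib_chip_specific data directly after the qib_devdata; init_6120_variables() wires up those pointers.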
+ */ +struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct qib_devdata *dd; + int ret; + + dd = qib_alloc_devdata(pdev, sizeof(struct qib_pportdata) + + sizeof(struct qib_chip_specific)); + if (IS_ERR(dd)) + goto bail; + + dd->f_bringup_serdes = qib_6120_bringup_serdes; + dd->f_cleanup = qib_6120_setup_cleanup; + dd->f_clear_tids = qib_6120_clear_tids; + dd->f_free_irq = qib_free_irq; + dd->f_get_base_info = qib_6120_get_base_info; + dd->f_get_msgheader = qib_6120_get_msgheader; + dd->f_getsendbuf = qib_6120_getsendbuf; + dd->f_gpio_mod = gpio_6120_mod; + dd->f_eeprom_wen = qib_6120_eeprom_wen; + dd->f_hdrqempty = qib_6120_hdrqempty; + dd->f_ib_updown = qib_6120_ib_updown; + dd->f_init_ctxt = qib_6120_init_ctxt; + dd->f_initvl15_bufs = qib_6120_initvl15_bufs; + dd->f_intr_fallback = qib_6120_nointr_fallback; + dd->f_late_initreg = qib_late_6120_initreg; + dd->f_setpbc_control = qib_6120_setpbc_control; + dd->f_portcntr = qib_portcntr_6120; + dd->f_put_tid = (dd->minrev >= 2) ? + qib_6120_put_tid_2 : + qib_6120_put_tid; + dd->f_quiet_serdes = qib_6120_quiet_serdes; + dd->f_rcvctrl = rcvctrl_6120_mod; + dd->f_read_cntrs = qib_read_6120cntrs; + dd->f_read_portcntrs = qib_read_6120portcntrs; + dd->f_reset = qib_6120_setup_reset; + dd->f_init_sdma_regs = init_sdma_6120_regs; + dd->f_sdma_busy = qib_sdma_6120_busy; + dd->f_sdma_gethead = qib_sdma_6120_gethead; + dd->f_sdma_sendctrl = qib_6120_sdma_sendctrl; + dd->f_sdma_set_desc_cnt = qib_sdma_set_6120_desc_cnt; + dd->f_sdma_update_tail = qib_sdma_update_6120_tail; + dd->f_sendctrl = sendctrl_6120_mod; + dd->f_set_armlaunch = qib_set_6120_armlaunch; + dd->f_set_cntr_sample = qib_set_cntr_6120_sample; + dd->f_iblink_state = qib_6120_iblink_state; + dd->f_ibphys_portstate = qib_6120_phys_portstate; + dd->f_get_ib_cfg = qib_6120_get_ib_cfg; + dd->f_set_ib_cfg = qib_6120_set_ib_cfg; + dd->f_set_ib_loopback = qib_6120_set_loopback; + dd->f_set_intr_state = qib_6120_set_intr_state; + dd->f_setextled = qib_6120_setup_setextled; + dd->f_txchk_change = qib_6120_txchk_change; + dd->f_update_usrhead = qib_update_6120_usrhead; + dd->f_wantpiobuf_intr = qib_wantpiobuf_6120_intr; + dd->f_xgxs_reset = qib_6120_xgxs_reset; + dd->f_writescratch = writescratch; + dd->f_tempsense_rd = qib_6120_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_6120_notify_dca; +#endif + /* + * Do remaining pcie setup and save pcie values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped and accessible, + * but chip registers are not set up until start of + * init_6120_variables. 
+ */ + ret = qib_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* initialize chip-specific variables */ + ret = init_6120_variables(dd); + if (ret) + goto bail_cleanup; + + if (qib_mini_init) + goto bail; + + if (qib_pcie_params(dd, 8, NULL)) + qib_dev_err(dd, + "Failed to setup PCIe or interrupts; continuing anyway\n"); + /* clear diagctrl register, in case diags were running and crashed */ + qib_write_kreg(dd, kr_hwdiagctrl, 0); + + if (qib_read_kreg64(dd, kr_hwerrstatus) & + QLOGIC_IB_HWE_SERDESPLLFAILED) + qib_write_kreg(dd, kr_hwerrclear, + QLOGIC_IB_HWE_SERDESPLLFAILED); + + /* setup interrupt handler (interrupt type handled above) */ + qib_setup_6120_interrupt(dd); + /* Note that qpn_mask is set by qib_6120_config_ctxts() first */ + qib_6120_init_hwerrors(dd); + + goto bail; + +bail_cleanup: + qib_pcie_ddcleanup(dd); +bail_free: + qib_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c new file mode 100644 index 0000000..bdff232 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -0,0 +1,4600 @@ +/* + * Copyright (c) 2011 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +/* + * This file contains all of the code that is specific to the + * QLogic_IB 7220 chip (except that specific to the SerDes) + */ + +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_7220.h" + +static void qib_setup_7220_setextled(struct qib_pportdata *, u32); +static void qib_7220_handle_hwerrors(struct qib_devdata *, char *, size_t); +static void sendctrl_7220_mod(struct qib_pportdata *ppd, u32 op); +static u32 qib_7220_iblink_state(u64); +static u8 qib_7220_phys_portstate(u64); +static void qib_sdma_update_7220_tail(struct qib_pportdata *, u16); +static void qib_set_ib_7220_lstate(struct qib_pportdata *, u16, u16); + +/* + * This file contains almost all the chip-specific register information and + * access functions for the QLogic QLogic_IB 7220 PCI-Express chip, with the + * exception of SerDes support, which in in qib_sd7220.c. + */ + +/* Below uses machine-generated qib_chipnum_regs.h file */ +#define KREG_IDX(regname) (QIB_7220_##regname##_OFFS / sizeof(u64)) + +/* Use defines to tie machine-generated names to lower-case names */ +#define kr_control KREG_IDX(Control) +#define kr_counterregbase KREG_IDX(CntrRegBase) +#define kr_errclear KREG_IDX(ErrClear) +#define kr_errmask KREG_IDX(ErrMask) +#define kr_errstatus KREG_IDX(ErrStatus) +#define kr_extctrl KREG_IDX(EXTCtrl) +#define kr_extstatus KREG_IDX(EXTStatus) +#define kr_gpio_clear KREG_IDX(GPIOClear) +#define kr_gpio_mask KREG_IDX(GPIOMask) +#define kr_gpio_out KREG_IDX(GPIOOut) +#define kr_gpio_status KREG_IDX(GPIOStatus) +#define kr_hrtbt_guid KREG_IDX(HRTBT_GUID) +#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl) +#define kr_hwerrclear KREG_IDX(HwErrClear) +#define kr_hwerrmask KREG_IDX(HwErrMask) +#define kr_hwerrstatus KREG_IDX(HwErrStatus) +#define kr_ibcctrl KREG_IDX(IBCCtrl) +#define kr_ibcddrctrl KREG_IDX(IBCDDRCtrl) +#define kr_ibcddrstatus KREG_IDX(IBCDDRStatus) +#define kr_ibcstatus KREG_IDX(IBCStatus) +#define kr_ibserdesctrl KREG_IDX(IBSerDesCtrl) +#define kr_intclear KREG_IDX(IntClear) +#define kr_intmask KREG_IDX(IntMask) +#define kr_intstatus KREG_IDX(IntStatus) +#define kr_ncmodectrl KREG_IDX(IBNCModeCtrl) +#define kr_palign KREG_IDX(PageAlign) +#define kr_partitionkey KREG_IDX(RcvPartitionKey) +#define kr_portcnt KREG_IDX(PortCnt) +#define kr_rcvbthqp KREG_IDX(RcvBTHQP) +#define kr_rcvctrl KREG_IDX(RcvCtrl) +#define kr_rcvegrbase KREG_IDX(RcvEgrBase) +#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt) +#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt) +#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize) +#define kr_rcvhdrsize KREG_IDX(RcvHdrSize) +#define kr_rcvpktledcnt KREG_IDX(RcvPktLEDCnt) +#define kr_rcvtidbase KREG_IDX(RcvTIDBase) +#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt) +#define kr_revision KREG_IDX(Revision) +#define kr_scratch KREG_IDX(Scratch) +#define kr_sendbuffererror KREG_IDX(SendBufErr0) +#define kr_sendctrl KREG_IDX(SendCtrl) +#define kr_senddmabase KREG_IDX(SendDmaBase) +#define kr_senddmabufmask0 KREG_IDX(SendDmaBufMask0) +#define kr_senddmabufmask1 (KREG_IDX(SendDmaBufMask0) + 1) +#define kr_senddmabufmask2 (KREG_IDX(SendDmaBufMask0) + 2) +#define kr_senddmahead KREG_IDX(SendDmaHead) +#define kr_senddmaheadaddr KREG_IDX(SendDmaHeadAddr) +#define kr_senddmalengen KREG_IDX(SendDmaLenGen) +#define kr_senddmastatus KREG_IDX(SendDmaStatus) +#define kr_senddmatail KREG_IDX(SendDmaTail) +#define kr_sendpioavailaddr KREG_IDX(SendBufAvailAddr) +#define kr_sendpiobufbase KREG_IDX(SendBufBase) +#define kr_sendpiobufcnt KREG_IDX(SendBufCnt) +#define kr_sendpiosize 
KREG_IDX(SendBufSize) +#define kr_sendregbase KREG_IDX(SendRegBase) +#define kr_userregbase KREG_IDX(UserRegBase) +#define kr_xgxs_cfg KREG_IDX(XGXSCfg) + +/* These must only be written via qib_write_kreg_ctxt() */ +#define kr_rcvhdraddr KREG_IDX(RcvHdrAddr0) +#define kr_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0) + + +#define CREG_IDX(regname) ((QIB_7220_##regname##_OFFS - \ + QIB_7220_LBIntCnt_OFFS) / sizeof(u64)) + +#define cr_badformat CREG_IDX(RxVersionErrCnt) +#define cr_erricrc CREG_IDX(RxICRCErrCnt) +#define cr_errlink CREG_IDX(RxLinkMalformCnt) +#define cr_errlpcrc CREG_IDX(RxLPCRCErrCnt) +#define cr_errpkey CREG_IDX(RxPKeyMismatchCnt) +#define cr_rcvflowctrl_err CREG_IDX(RxFlowCtrlViolCnt) +#define cr_err_rlen CREG_IDX(RxLenErrCnt) +#define cr_errslen CREG_IDX(TxLenErrCnt) +#define cr_errtidfull CREG_IDX(RxTIDFullErrCnt) +#define cr_errtidvalid CREG_IDX(RxTIDValidErrCnt) +#define cr_errvcrc CREG_IDX(RxVCRCErrCnt) +#define cr_ibstatuschange CREG_IDX(IBStatusChangeCnt) +#define cr_lbint CREG_IDX(LBIntCnt) +#define cr_invalidrlen CREG_IDX(RxMaxMinLenErrCnt) +#define cr_invalidslen CREG_IDX(TxMaxMinLenErrCnt) +#define cr_lbflowstall CREG_IDX(LBFlowStallCnt) +#define cr_pktrcv CREG_IDX(RxDataPktCnt) +#define cr_pktrcvflowctrl CREG_IDX(RxFlowPktCnt) +#define cr_pktsend CREG_IDX(TxDataPktCnt) +#define cr_pktsendflow CREG_IDX(TxFlowPktCnt) +#define cr_portovfl CREG_IDX(RxP0HdrEgrOvflCnt) +#define cr_rcvebp CREG_IDX(RxEBPCnt) +#define cr_rcvovfl CREG_IDX(RxBufOvflCnt) +#define cr_senddropped CREG_IDX(TxDroppedPktCnt) +#define cr_sendstall CREG_IDX(TxFlowStallCnt) +#define cr_sendunderrun CREG_IDX(TxUnderrunCnt) +#define cr_wordrcv CREG_IDX(RxDwordCnt) +#define cr_wordsend CREG_IDX(TxDwordCnt) +#define cr_txunsupvl CREG_IDX(TxUnsupVLErrCnt) +#define cr_rxdroppkt CREG_IDX(RxDroppedPktCnt) +#define cr_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt) +#define cr_iblinkdown CREG_IDX(IBLinkDownedCnt) +#define cr_ibsymbolerr CREG_IDX(IBSymbolErrCnt) +#define cr_vl15droppedpkt CREG_IDX(RxVL15DroppedPktCnt) +#define cr_rxotherlocalphyerr CREG_IDX(RxOtherLocalPhyErrCnt) +#define cr_excessbufferovfl CREG_IDX(ExcessBufferOvflCnt) +#define cr_locallinkintegrityerr CREG_IDX(LocalLinkIntegrityErrCnt) +#define cr_rxvlerr CREG_IDX(RxVlErrCnt) +#define cr_rxdlidfltr CREG_IDX(RxDlidFltrCnt) +#define cr_psstat CREG_IDX(PSStat) +#define cr_psstart CREG_IDX(PSStart) +#define cr_psinterval CREG_IDX(PSInterval) +#define cr_psrcvdatacount CREG_IDX(PSRcvDataCount) +#define cr_psrcvpktscount CREG_IDX(PSRcvPktsCount) +#define cr_psxmitdatacount CREG_IDX(PSXmitDataCount) +#define cr_psxmitpktscount CREG_IDX(PSXmitPktsCount) +#define cr_psxmitwaitcount CREG_IDX(PSXmitWaitCount) +#define cr_txsdmadesc CREG_IDX(TxSDmaDescCnt) +#define cr_pcieretrydiag CREG_IDX(PcieRetryBufDiagQwordCnt) + +#define SYM_RMASK(regname, fldname) ((u64) \ + QIB_7220_##regname##_##fldname##_RMASK) +#define SYM_MASK(regname, fldname) ((u64) \ + QIB_7220_##regname##_##fldname##_RMASK << \ + QIB_7220_##regname##_##fldname##_LSB) +#define SYM_LSB(regname, fldname) (QIB_7220_##regname##_##fldname##_LSB) +#define SYM_FIELD(value, regname, fldname) ((u64) \ + (((value) >> SYM_LSB(regname, fldname)) & \ + SYM_RMASK(regname, fldname))) +#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask) +#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask) + +/* ibcctrl bits */ +#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1 +/* cycle through TS1/TS2 till OK */ +#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2 +/* wait for TS1, then go on */ +#define 
QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3 +#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16 + +#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */ +#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ + +#define BLOB_7220_IBCHG 0x81 + +/* + * We could have a single register get/put routine, that takes a group type, + * but this is somewhat clearer and cleaner. It also gives us some error + * checking. 64 bit register reads should always work, but are inefficient + * on opteron (the northbridge always generates 2 separate HT 32 bit reads), + * so we use kreg32 wherever possible. User register and counter register + * reads are always 32 bit reads, so only one form of those routines. + */ + +/** + * qib_read_ureg32 - read 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @ctxt: context number + * + * Return the contents of a register that is virtualized to be per context. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u32 qib_read_ureg32(const struct qib_devdata *dd, + enum qib_ureg regno, int ctxt) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + + if (dd->userbase) + return readl(regno + (u64 __iomem *) + ((char __iomem *)dd->userbase + + dd->ureg_align * ctxt)); + else + return readl(regno + (u64 __iomem *) + (dd->uregbase + + (char __iomem *)dd->kregbase + + dd->ureg_align * ctxt)); +} + +/** + * qib_write_ureg - write 32-bit virtualized per-context register + * @dd: device + * @regno: register number + * @value: value + * @ctxt: context + * + * Write the contents of a register that is virtualized to be per context. + */ +static inline void qib_write_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, u64 value, int ctxt) +{ + u64 __iomem *ubase; + + if (dd->userbase) + ubase = (u64 __iomem *) + ((char __iomem *) dd->userbase + + dd->ureg_align * ctxt); + else + ubase = (u64 __iomem *) + (dd->uregbase + + (char __iomem *) dd->kregbase + + dd->ureg_align * ctxt); + + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &ubase[regno]); +} + +/** + * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register + * @dd: the qlogic_ib device + * @regno: the register number to write + * @ctxt: the context containing the register + * @value: the value to write + */ +static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd, + const u16 regno, unsigned ctxt, + u64 value) +{ + qib_write_kreg(dd, regno + ctxt, value); +} + +static inline void write_7220_creg(const struct qib_devdata *dd, + u16 regno, u64 value) +{ + if (dd->cspec->cregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->cspec->cregbase[regno]); +} + +static inline u64 read_7220_creg(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(&dd->cspec->cregbase[regno]); +} + +static inline u32 read_7220_creg32(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(&dd->cspec->cregbase[regno]); +} + +/* kr_revision bits */ +#define QLOGIC_IB_R_EMULATORREV_MASK ((1ULL << 22) - 1) +#define QLOGIC_IB_R_EMULATORREV_SHIFT 40 + +/* kr_control bits */ +#define QLOGIC_IB_C_RESET (1U << 7) + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define QLOGIC_IB_I_RCVURG_MASK ((1ULL << 17) - 1) +#define QLOGIC_IB_I_RCVURG_SHIFT 32 +#define 
QLOGIC_IB_I_RCVAVAIL_MASK ((1ULL << 17) - 1) +#define QLOGIC_IB_I_RCVAVAIL_SHIFT 0 +#define QLOGIC_IB_I_SERDESTRIMDONE (1ULL << 27) + +#define QLOGIC_IB_C_FREEZEMODE 0x00000002 +#define QLOGIC_IB_C_LINKENABLE 0x00000004 + +#define QLOGIC_IB_I_SDMAINT 0x8000000000000000ULL +#define QLOGIC_IB_I_SDMADISABLED 0x4000000000000000ULL +#define QLOGIC_IB_I_ERROR 0x0000000080000000ULL +#define QLOGIC_IB_I_SPIOSENT 0x0000000040000000ULL +#define QLOGIC_IB_I_SPIOBUFAVAIL 0x0000000020000000ULL +#define QLOGIC_IB_I_GPIO 0x0000000010000000ULL + +/* variables for sanity checking interrupt and errors */ +#define QLOGIC_IB_I_BITSEXTANT \ + (QLOGIC_IB_I_SDMAINT | QLOGIC_IB_I_SDMADISABLED | \ + (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT) | \ + (QLOGIC_IB_I_RCVAVAIL_MASK << \ + QLOGIC_IB_I_RCVAVAIL_SHIFT) | \ + QLOGIC_IB_I_ERROR | QLOGIC_IB_I_SPIOSENT | \ + QLOGIC_IB_I_SPIOBUFAVAIL | QLOGIC_IB_I_GPIO | \ + QLOGIC_IB_I_SERDESTRIMDONE) + +#define IB_HWE_BITSEXTANT \ + (HWE_MASK(RXEMemParityErr) | \ + HWE_MASK(TXEMemParityErr) | \ + (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << \ + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) | \ + QLOGIC_IB_HWE_PCIE1PLLFAILED | \ + QLOGIC_IB_HWE_PCIE0PLLFAILED | \ + QLOGIC_IB_HWE_PCIEPOISONEDTLP | \ + QLOGIC_IB_HWE_PCIECPLTIMEOUT | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXTLH | \ + QLOGIC_IB_HWE_PCIEBUSPARITYXADM | \ + QLOGIC_IB_HWE_PCIEBUSPARITYRADM | \ + HWE_MASK(PowerOnBISTFailed) | \ + QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP | \ + QLOGIC_IB_HWE_SERDESPLLFAILED | \ + HWE_MASK(IBCBusToSPCParityErr) | \ + HWE_MASK(IBCBusFromSPCParityErr) | \ + QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR | \ + QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR | \ + QLOGIC_IB_HWE_SDMAMEMREADERR | \ + QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED | \ + QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT | \ + QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT | \ + QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT | \ + QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT | \ + QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR | \ + QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR | \ + QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR | \ + QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR) + +#define IB_E_BITSEXTANT \ + (ERR_MASK(RcvFormatErr) | ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvLongPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvUnexpectedCharErr) | \ + ERR_MASK(RcvUnsupportedVLErr) | ERR_MASK(RcvEBPErr) | \ + ERR_MASK(RcvIBFlowErr) | ERR_MASK(RcvBadVersionErr) | \ + ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | \ + ERR_MASK(RcvBadTidErr) | ERR_MASK(RcvHdrLenErr) | \ + ERR_MASK(RcvHdrErr) | ERR_MASK(RcvIBLostLinkErr) | \ + ERR_MASK(SendSpecialTriggerErr) | \ + ERR_MASK(SDmaDisabledErr) | ERR_MASK(SendMinPktLenErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnderRunErr) | \ + ERR_MASK(SendPktLenErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(SendPioArmLaunchErr) | \ + ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendUnsupportedVLErr) | ERR_MASK(SendBufMisuseErr) | \ + ERR_MASK(SDmaGenMismatchErr) | ERR_MASK(SDmaOutOfBoundErr) | \ + ERR_MASK(SDmaTailOutOfBoundErr) | ERR_MASK(SDmaBaseErr) | \ + ERR_MASK(SDma1stDescErr) | ERR_MASK(SDmaRpyTagErr) | \ + ERR_MASK(SDmaDwEnErr) | ERR_MASK(SDmaMissingDwErr) | \ + ERR_MASK(SDmaUnexpDataErr) | \ + ERR_MASK(IBStatusChanged) | ERR_MASK(InvalidAddrErr) | \ + ERR_MASK(ResetNegated) | ERR_MASK(HardwareErr) | \ + ERR_MASK(SDmaDescAddrMisalignErr) | \ + ERR_MASK(InvalidEEPCmd)) + +/* kr_hwerrclear, kr_hwerrmask, 
kr_hwerrstatus, bits */ +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK 0x00000000000000ffULL +#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT 0 +#define QLOGIC_IB_HWE_PCIEPOISONEDTLP 0x0000000010000000ULL +#define QLOGIC_IB_HWE_PCIECPLTIMEOUT 0x0000000020000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXTLH 0x0000000040000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYXADM 0x0000000080000000ULL +#define QLOGIC_IB_HWE_PCIEBUSPARITYRADM 0x0000000100000000ULL +#define QLOGIC_IB_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define QLOGIC_IB_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define QLOGIC_IB_HWE_PCIE1PLLFAILED 0x0400000000000000ULL +#define QLOGIC_IB_HWE_PCIE0PLLFAILED 0x0800000000000000ULL +#define QLOGIC_IB_HWE_SERDESPLLFAILED 0x1000000000000000ULL +/* specific to this chip */ +#define QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR 0x0000000000000040ULL +#define QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR 0x0000000000000080ULL +#define QLOGIC_IB_HWE_SDMAMEMREADERR 0x0000000010000000ULL +#define QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED 0x2000000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT 0x0100000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT 0x0200000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT 0x0400000000000000ULL +#define QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT 0x0800000000000000ULL +#define QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR 0x0000008000000000ULL +#define QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR 0x0000004000000000ULL +#define QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR 0x0000001000000000ULL +#define QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR 0x0000002000000000ULL + +#define IBA7220_IBCC_LINKCMD_SHIFT 19 + +/* kr_ibcddrctrl bits */ +#define IBA7220_IBC_DLIDLMC_MASK 0xFFFFFFFFUL +#define IBA7220_IBC_DLIDLMC_SHIFT 32 + +#define IBA7220_IBC_HRTBT_MASK (SYM_RMASK(IBCDDRCtrl, HRTBT_AUTO) | \ + SYM_RMASK(IBCDDRCtrl, HRTBT_ENB)) +#define IBA7220_IBC_HRTBT_SHIFT SYM_LSB(IBCDDRCtrl, HRTBT_ENB) + +#define IBA7220_IBC_LANE_REV_SUPPORTED (1<<8) +#define IBA7220_IBC_LREV_MASK 1 +#define IBA7220_IBC_LREV_SHIFT 8 +#define IBA7220_IBC_RXPOL_MASK 1 +#define IBA7220_IBC_RXPOL_SHIFT 7 +#define IBA7220_IBC_WIDTH_SHIFT 5 +#define IBA7220_IBC_WIDTH_MASK 0x3 +#define IBA7220_IBC_WIDTH_1X_ONLY (0 << IBA7220_IBC_WIDTH_SHIFT) +#define IBA7220_IBC_WIDTH_4X_ONLY (1 << IBA7220_IBC_WIDTH_SHIFT) +#define IBA7220_IBC_WIDTH_AUTONEG (2 << IBA7220_IBC_WIDTH_SHIFT) +#define IBA7220_IBC_SPEED_AUTONEG (1 << 1) +#define IBA7220_IBC_SPEED_SDR (1 << 2) +#define IBA7220_IBC_SPEED_DDR (1 << 3) +#define IBA7220_IBC_SPEED_AUTONEG_MASK (0x7 << 1) +#define IBA7220_IBC_IBTA_1_2_MASK (1) + +/* kr_ibcddrstatus */ +/* link latency shift is 0, don't bother defining */ +#define IBA7220_DDRSTAT_LINKLAT_MASK 0x3ffffff + +/* kr_extstatus bits */ +#define QLOGIC_IB_EXTS_FREQSEL 0x2 +#define QLOGIC_IB_EXTS_SERDESSEL 0x4 +#define QLOGIC_IB_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define QLOGIC_IB_EXTS_MEMBIST_DISABLED 0x0000000000008000 + +/* kr_xgxsconfig bits */ +#define QLOGIC_IB_XGXS_RESET 0x5ULL +#define QLOGIC_IB_XGXS_FC_SAFE (1ULL << 63) + +/* kr_rcvpktledcnt */ +#define IBA7220_LEDBLINK_ON_SHIFT 32 /* 4ns period on after packet */ +#define IBA7220_LEDBLINK_OFF_SHIFT 0 /* 4ns period off before next on */ + +#define _QIB_GPIO_SDA_NUM 1 +#define _QIB_GPIO_SCL_NUM 0 +#define QIB_TWSI_EEPROM_DEV 0xA2 /* All Production 7220 cards. 
*/ +#define QIB_TWSI_TEMP_DEV 0x98 + +/* HW counter clock is at 4nsec */ +#define QIB_7220_PSXMITWAIT_CHECK_RATE 4000 + +#define IBA7220_R_INTRAVAIL_SHIFT 17 +#define IBA7220_R_PKEY_DIS_SHIFT 34 +#define IBA7220_R_TAILUPD_SHIFT 35 +#define IBA7220_R_CTXTCFG_SHIFT 36 + +#define IBA7220_HDRHEAD_PKTINT_SHIFT 32 /* interrupt cnt in upper 32 bits */ + +/* + * the size bits give us 2^N, in KB units. 0 marks as invalid, + * and 7 is reserved. We currently use only 2KB and 4KB + */ +#define IBA7220_TID_SZ_SHIFT 37 /* shift to 3bit size selector */ +#define IBA7220_TID_SZ_2K (1UL << IBA7220_TID_SZ_SHIFT) /* 2KB */ +#define IBA7220_TID_SZ_4K (2UL << IBA7220_TID_SZ_SHIFT) /* 4KB */ +#define IBA7220_TID_PA_SHIFT 11U /* TID addr in chip stored w/o low bits */ +#define PBC_7220_VL15_SEND (1ULL << 63) /* pbc; VL15, no credit check */ +#define PBC_7220_VL15_SEND_CTRL (1ULL << 31) /* control version of same */ + +#define AUTONEG_TRIES 5 /* sequential retries to negotiate DDR */ + +/* packet rate matching delay multiplier */ +static u8 rate_to_delay[2][2] = { + /* 1x, 4x */ + { 8, 2 }, /* SDR */ + { 4, 1 } /* DDR */ +}; + +static u8 ib_rate_to_delay[IB_RATE_120_GBPS + 1] = { + [IB_RATE_2_5_GBPS] = 8, + [IB_RATE_5_GBPS] = 4, + [IB_RATE_10_GBPS] = 2, + [IB_RATE_20_GBPS] = 1 +}; + +#define IBA7220_LINKSPEED_SHIFT SYM_LSB(IBCStatus, LinkSpeedActive) +#define IBA7220_LINKWIDTH_SHIFT SYM_LSB(IBCStatus, LinkWidthActive) + +/* link training states, from IBC */ +#define IB_7220_LT_STATE_DISABLED 0x00 +#define IB_7220_LT_STATE_LINKUP 0x01 +#define IB_7220_LT_STATE_POLLACTIVE 0x02 +#define IB_7220_LT_STATE_POLLQUIET 0x03 +#define IB_7220_LT_STATE_SLEEPDELAY 0x04 +#define IB_7220_LT_STATE_SLEEPQUIET 0x05 +#define IB_7220_LT_STATE_CFGDEBOUNCE 0x08 +#define IB_7220_LT_STATE_CFGRCVFCFG 0x09 +#define IB_7220_LT_STATE_CFGWAITRMT 0x0a +#define IB_7220_LT_STATE_CFGIDLE 0x0b +#define IB_7220_LT_STATE_RECOVERRETRAIN 0x0c +#define IB_7220_LT_STATE_RECOVERWAITRMT 0x0e +#define IB_7220_LT_STATE_RECOVERIDLE 0x0f + +/* link state machine states from IBC */ +#define IB_7220_L_STATE_DOWN 0x0 +#define IB_7220_L_STATE_INIT 0x1 +#define IB_7220_L_STATE_ARM 0x2 +#define IB_7220_L_STATE_ACTIVE 0x3 +#define IB_7220_L_STATE_ACT_DEFER 0x4 + +static const u8 qib_7220_physportstate[0x20] = { + [IB_7220_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED, + [IB_7220_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP, + [IB_7220_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL, + [IB_7220_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL, + [IB_7220_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP, + [IB_7220_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP, + [IB_7220_LT_STATE_CFGDEBOUNCE] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_CFGRCVFCFG] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_CFGWAITRMT] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN, + [IB_7220_LT_STATE_RECOVERRETRAIN] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_7220_LT_STATE_RECOVERWAITRMT] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [IB_7220_LT_STATE_RECOVERIDLE] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN +}; + +int qib_special_trigger; +module_param_named(special_trigger, qib_special_trigger, int, S_IRUGO); +MODULE_PARM_DESC(special_trigger, "Enable 
SpecialTrigger arm/launch"); + +#define IBCBUSFRSPCPARITYERR HWE_MASK(IBCBusFromSPCParityErr) +#define IBCBUSTOSPCPARITYERR HWE_MASK(IBCBusToSPCParityErr) + +#define SYM_MASK_BIT(regname, fldname, bit) ((u64) \ + (1ULL << (SYM_LSB(regname, fldname) + (bit)))) + +#define TXEMEMPARITYERR_PIOBUF \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0) +#define TXEMEMPARITYERR_PIOPBC \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 1) +#define TXEMEMPARITYERR_PIOLAUNCHFIFO \ + SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 2) + +#define RXEMEMPARITYERR_RCVBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 0) +#define RXEMEMPARITYERR_LOOKUPQ \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 1) +#define RXEMEMPARITYERR_EXPTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 2) +#define RXEMEMPARITYERR_EAGERTID \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 3) +#define RXEMEMPARITYERR_FLAGBUF \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 4) +#define RXEMEMPARITYERR_DATAINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 5) +#define RXEMEMPARITYERR_HDRINFO \ + SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 6) + +/* 7220 specific hardware errors... */ +static const struct qib_hwerror_msgs qib_7220_hwerror_msgs[] = { + /* generic hardware errors */ + QLOGIC_IB_HWE_MSG(IBCBUSFRSPCPARITYERR, "QIB2IB Parity"), + QLOGIC_IB_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2QIB Parity"), + + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOBUF, + "TXE PIOBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOPBC, + "TXE PIOPBC Memory Parity"), + QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOLAUNCHFIFO, + "TXE PIOLAUNCHFIFO Memory Parity"), + + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_RCVBUF, + "RXE RCVBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_LOOKUPQ, + "RXE LOOKUPQ Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EAGERTID, + "RXE EAGERTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EXPTID, + "RXE EXPTID Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_FLAGBUF, + "RXE FLAGBUF Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_DATAINFO, + "RXE DATAINFO Memory Parity"), + QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_HDRINFO, + "RXE HDRINFO Memory Parity"), + + /* chip-specific hardware errors */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEPOISONEDTLP, + "PCIe Poisoned TLP"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT, + "PCIe completion timeout"), + /* + * In practice, it's unlikely wthat we'll see PCIe PLL, or bus + * parity or memory parity error failures, because most likely we + * won't be able to talk to the core of the chip. Nonetheless, we + * might see them, if they are in parts of the PCIe core that aren't + * essential. 
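+ * Either way, the strings in this table only affect what
+ * qib_format_hwerrors() prints; the decision to mask an error or
+ * treat it as fatal is made in qib_7220_handle_hwerrors() below.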
+ */ + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE1PLLFAILED, + "PCIePLL1"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE0PLLFAILED, + "PCIePLL0"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXTLH, + "PCIe XTLH core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXADM, + "PCIe ADM TX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYRADM, + "PCIe ADM RX core parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SERDESPLLFAILED, + "SerDes PLL"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR, + "PCIe cpl header queue"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR, + "PCIe cpl data queue"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SDMAMEMREADERR, + "Send DMA memory read"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED, + "uC PLL clock not locked"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT, + "PCIe serdes Q0 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT, + "PCIe serdes Q1 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT, + "PCIe serdes Q2 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT, + "PCIe serdes Q3 no clock"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR, + "DDS RXEQ memory parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR, + "IB uC memory parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR, + "PCIe uC oct0 memory parity"), + QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR, + "PCIe uC oct1 memory parity"), +}; + +#define RXE_PARITY (RXEMEMPARITYERR_EAGERTID|RXEMEMPARITYERR_EXPTID) + +#define QLOGIC_IB_E_PKTERRS (\ + ERR_MASK(SendPktLenErr) | \ + ERR_MASK(SendDroppedDataPktErr) | \ + ERR_MASK(RcvVCRCErr) | \ + ERR_MASK(RcvICRCErr) | \ + ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvEBPErr)) + +/* Convenience for decoding Send DMA errors */ +#define QLOGIC_IB_E_SDMAERRS ( \ + ERR_MASK(SDmaGenMismatchErr) | \ + ERR_MASK(SDmaOutOfBoundErr) | \ + ERR_MASK(SDmaTailOutOfBoundErr) | ERR_MASK(SDmaBaseErr) | \ + ERR_MASK(SDma1stDescErr) | ERR_MASK(SDmaRpyTagErr) | \ + ERR_MASK(SDmaDwEnErr) | ERR_MASK(SDmaMissingDwErr) | \ + ERR_MASK(SDmaUnexpDataErr) | \ + ERR_MASK(SDmaDescAddrMisalignErr) | \ + ERR_MASK(SDmaDisabledErr) | \ + ERR_MASK(SendBufMisuseErr)) + +/* These are all rcv-related errors which we want to count for stats */ +#define E_SUM_PKTERRS \ + (ERR_MASK(RcvHdrLenErr) | ERR_MASK(RcvBadTidErr) | \ + ERR_MASK(RcvBadVersionErr) | ERR_MASK(RcvHdrErr) | \ + ERR_MASK(RcvLongPktLenErr) | ERR_MASK(RcvShortPktLenErr) | \ + ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvFormatErr) | ERR_MASK(RcvUnsupportedVLErr) | \ + ERR_MASK(RcvUnexpectedCharErr) | ERR_MASK(RcvEBPErr)) + +/* These are all send-related errors which we want to count for stats */ +#define E_SUM_ERRS \ + (ERR_MASK(SendPioArmLaunchErr) | ERR_MASK(SendUnexpectedPktNumErr) | \ + ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnsupportedVLErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(InvalidAddrErr)) + +/* + * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore + * errors not related to freeze and cancelling buffers. Can't ignore + * armlaunch because could get more while still cleaning up, and need + * to cancel those as they happen. 
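+ * (E_SPKT_ERRS_IGNORE is the set that qib_7220_clear_freeze() writes
+ * to ErrClear when taking the chip out of freeze mode.)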
+ */ +#define E_SPKT_ERRS_IGNORE \ + (ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendMinPktLenErr) | \ + ERR_MASK(SendPktLenErr)) + +/* + * these are errors that can occur when the link changes state while + * a packet is being sent or received. This doesn't cover things + * like EBP or VCRC that can be the result of a sending having the + * link change state, so we receive a "known bad" packet. + */ +#define E_SUM_LINK_PKTERRS \ + (ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \ + ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \ + ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \ + ERR_MASK(RcvUnexpectedCharErr)) + +static void autoneg_7220_work(struct work_struct *); +static u32 __iomem *qib_7220_getsendbuf(struct qib_pportdata *, u64, u32 *); + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used. + * because we don't need to force the update of pioavail. + */ +static void qib_disarm_7220_senderrbufs(struct qib_pportdata *ppd) +{ + unsigned long sbuf[3]; + struct qib_devdata *dd = ppd->dd; + + /* + * It's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling. + */ + /* read these before writing errorclear */ + sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror); + sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1); + sbuf[2] = qib_read_kreg64(dd, kr_sendbuffererror + 2); + + if (sbuf[0] || sbuf[1] || sbuf[2]) + qib_disarm_piobufs_set(dd, sbuf, + dd->piobcnt2k + dd->piobcnt4k); +} + +static void qib_7220_txe_recover(struct qib_devdata *dd) +{ + qib_devinfo(dd->pcidev, "Recovering from TXE PIO parity error\n"); + qib_disarm_7220_senderrbufs(dd->pport); +} + +/* + * This is called with interrupts disabled and sdma_lock held. 
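+ * The SendCtrl shadow (dd->sendctrl) is updated under sendctrl_lock,
+ * and the SendCtrl write is flushed to the chip with a scratch
+ * register write before the lock is dropped.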
+ */ +static void qib_7220_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op) +{ + struct qib_devdata *dd = ppd->dd; + u64 set_sendctrl = 0; + u64 clr_sendctrl = 0; + + if (op & QIB_SDMA_SENDCTRL_OP_ENABLE) + set_sendctrl |= SYM_MASK(SendCtrl, SDmaEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl, SDmaEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_INTENABLE) + set_sendctrl |= SYM_MASK(SendCtrl, SDmaIntEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl, SDmaIntEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_HALT) + set_sendctrl |= SYM_MASK(SendCtrl, SDmaHalt); + else + clr_sendctrl |= SYM_MASK(SendCtrl, SDmaHalt); + + spin_lock(&dd->sendctrl_lock); + + dd->sendctrl |= set_sendctrl; + dd->sendctrl &= ~clr_sendctrl; + + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + spin_unlock(&dd->sendctrl_lock); +} + +static void qib_decode_7220_sdma_errs(struct qib_pportdata *ppd, + u64 err, char *buf, size_t blen) +{ + static const struct { + u64 err; + const char *msg; + } errs[] = { + { ERR_MASK(SDmaGenMismatchErr), + "SDmaGenMismatch" }, + { ERR_MASK(SDmaOutOfBoundErr), + "SDmaOutOfBound" }, + { ERR_MASK(SDmaTailOutOfBoundErr), + "SDmaTailOutOfBound" }, + { ERR_MASK(SDmaBaseErr), + "SDmaBase" }, + { ERR_MASK(SDma1stDescErr), + "SDma1stDesc" }, + { ERR_MASK(SDmaRpyTagErr), + "SDmaRpyTag" }, + { ERR_MASK(SDmaDwEnErr), + "SDmaDwEn" }, + { ERR_MASK(SDmaMissingDwErr), + "SDmaMissingDw" }, + { ERR_MASK(SDmaUnexpDataErr), + "SDmaUnexpData" }, + { ERR_MASK(SDmaDescAddrMisalignErr), + "SDmaDescAddrMisalign" }, + { ERR_MASK(SendBufMisuseErr), + "SendBufMisuse" }, + { ERR_MASK(SDmaDisabledErr), + "SDmaDisabled" }, + }; + int i; + size_t bidx = 0; + + for (i = 0; i < ARRAY_SIZE(errs); i++) { + if (err & errs[i].err) + bidx += scnprintf(buf + bidx, blen - bidx, + "%s ", errs[i].msg); + } +} + +/* + * This is called as part of link down clean up so disarm and flush + * all send buffers so that SMP packets can be sent. + */ +static void qib_7220_sdma_hw_clean_up(struct qib_pportdata *ppd) +{ + /* This will trigger the Abort interrupt */ + sendctrl_7220_mod(ppd, QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH | + QIB_SENDCTRL_AVAIL_BLIP); + ppd->dd->upd_pio_shadow = 1; /* update our idea of what's busy */ +} + +static void qib_sdma_7220_setlengen(struct qib_pportdata *ppd) +{ + /* + * Set SendDmaLenGen and clear and set + * the MSB of the generation count to enable generation checking + * and load the internal generation counter. 
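+ * This takes two writes of SendDmaLenGen: the first with the
+ * Generation MSB clear, the second with it set.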
+ */ + qib_write_kreg(ppd->dd, kr_senddmalengen, ppd->sdma_descq_cnt); + qib_write_kreg(ppd->dd, kr_senddmalengen, + ppd->sdma_descq_cnt | + (1ULL << QIB_7220_SendDmaLenGen_Generation_MSB)); +} + +static void qib_7220_sdma_hw_start_up(struct qib_pportdata *ppd) +{ + qib_sdma_7220_setlengen(ppd); + qib_sdma_update_7220_tail(ppd, 0); /* Set SendDmaTail */ + ppd->sdma_head_dma[0] = 0; +} + +#define DISABLES_SDMA ( \ + ERR_MASK(SDmaDisabledErr) | \ + ERR_MASK(SDmaBaseErr) | \ + ERR_MASK(SDmaTailOutOfBoundErr) | \ + ERR_MASK(SDmaOutOfBoundErr) | \ + ERR_MASK(SDma1stDescErr) | \ + ERR_MASK(SDmaRpyTagErr) | \ + ERR_MASK(SDmaGenMismatchErr) | \ + ERR_MASK(SDmaDescAddrMisalignErr) | \ + ERR_MASK(SDmaMissingDwErr) | \ + ERR_MASK(SDmaDwEnErr)) + +static void sdma_7220_errors(struct qib_pportdata *ppd, u64 errs) +{ + unsigned long flags; + struct qib_devdata *dd = ppd->dd; + char *msg; + + errs &= QLOGIC_IB_E_SDMAERRS; + + msg = dd->cspec->sdmamsgbuf; + qib_decode_7220_sdma_errs(ppd, errs, msg, + sizeof(dd->cspec->sdmamsgbuf)); + spin_lock_irqsave(&ppd->sdma_lock, flags); + + if (errs & ERR_MASK(SendBufMisuseErr)) { + unsigned long sbuf[3]; + + sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror); + sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1); + sbuf[2] = qib_read_kreg64(dd, kr_sendbuffererror + 2); + + qib_dev_err(ppd->dd, + "IB%u:%u SendBufMisuse: %04lx %016lx %016lx\n", + ppd->dd->unit, ppd->port, sbuf[2], sbuf[1], + sbuf[0]); + } + + if (errs & ERR_MASK(SDmaUnexpDataErr)) + qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", ppd->dd->unit, + ppd->port); + + switch (ppd->sdma_state.current_state) { + case qib_sdma_state_s00_hw_down: + /* not expecting any interrupts */ + break; + + case qib_sdma_state_s10_hw_start_up_wait: + /* handled in intr path */ + break; + + case qib_sdma_state_s20_idle: + /* not expecting any interrupts */ + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + /* not expecting any interrupts */ + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + if (errs & ERR_MASK(SDmaDisabledErr)) + __qib_sdma_process_event(ppd, + qib_sdma_event_e50_hw_cleaned); + break; + + case qib_sdma_state_s50_hw_halt_wait: + /* handled in intr path */ + break; + + case qib_sdma_state_s99_running: + if (errs & DISABLES_SDMA) + __qib_sdma_process_event(ppd, + qib_sdma_event_e7220_err_halted); + break; + } + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. 
+ */ +static int qib_decode_7220_err(struct qib_devdata *dd, char *buf, size_t blen, + u64 err) +{ + int iserr = 1; + + *buf = '\0'; + if (err & QLOGIC_IB_E_PKTERRS) { + if (!(err & ~QLOGIC_IB_E_PKTERRS)) + iserr = 0; + if ((err & ERR_MASK(RcvICRCErr)) && + !(err & (ERR_MASK(RcvVCRCErr) | ERR_MASK(RcvEBPErr)))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } + if (err & ERR_MASK(RcvHdrLenErr)) + strlcat(buf, "rhdrlen ", blen); + if (err & ERR_MASK(RcvBadTidErr)) + strlcat(buf, "rbadtid ", blen); + if (err & ERR_MASK(RcvBadVersionErr)) + strlcat(buf, "rbadversion ", blen); + if (err & ERR_MASK(RcvHdrErr)) + strlcat(buf, "rhdr ", blen); + if (err & ERR_MASK(SendSpecialTriggerErr)) + strlcat(buf, "sendspecialtrigger ", blen); + if (err & ERR_MASK(RcvLongPktLenErr)) + strlcat(buf, "rlongpktlen ", blen); + if (err & ERR_MASK(RcvMaxPktLenErr)) + strlcat(buf, "rmaxpktlen ", blen); + if (err & ERR_MASK(RcvMinPktLenErr)) + strlcat(buf, "rminpktlen ", blen); + if (err & ERR_MASK(SendMinPktLenErr)) + strlcat(buf, "sminpktlen ", blen); + if (err & ERR_MASK(RcvFormatErr)) + strlcat(buf, "rformaterr ", blen); + if (err & ERR_MASK(RcvUnsupportedVLErr)) + strlcat(buf, "runsupvl ", blen); + if (err & ERR_MASK(RcvUnexpectedCharErr)) + strlcat(buf, "runexpchar ", blen); + if (err & ERR_MASK(RcvIBFlowErr)) + strlcat(buf, "ribflow ", blen); + if (err & ERR_MASK(SendUnderRunErr)) + strlcat(buf, "sunderrun ", blen); + if (err & ERR_MASK(SendPioArmLaunchErr)) + strlcat(buf, "spioarmlaunch ", blen); + if (err & ERR_MASK(SendUnexpectedPktNumErr)) + strlcat(buf, "sunexperrpktnum ", blen); + if (err & ERR_MASK(SendDroppedSmpPktErr)) + strlcat(buf, "sdroppedsmppkt ", blen); + if (err & ERR_MASK(SendMaxPktLenErr)) + strlcat(buf, "smaxpktlen ", blen); + if (err & ERR_MASK(SendUnsupportedVLErr)) + strlcat(buf, "sunsupVL ", blen); + if (err & ERR_MASK(InvalidAddrErr)) + strlcat(buf, "invalidaddr ", blen); + if (err & ERR_MASK(RcvEgrFullErr)) + strlcat(buf, "rcvegrfull ", blen); + if (err & ERR_MASK(RcvHdrFullErr)) + strlcat(buf, "rcvhdrfull ", blen); + if (err & ERR_MASK(IBStatusChanged)) + strlcat(buf, "ibcstatuschg ", blen); + if (err & ERR_MASK(RcvIBLostLinkErr)) + strlcat(buf, "riblostlink ", blen); + if (err & ERR_MASK(HardwareErr)) + strlcat(buf, "hardware ", blen); + if (err & ERR_MASK(ResetNegated)) + strlcat(buf, "reset ", blen); + if (err & QLOGIC_IB_E_SDMAERRS) + qib_decode_7220_sdma_errs(dd->pport, err, buf, blen); + if (err & ERR_MASK(InvalidEEPCmd)) + strlcat(buf, "invalideepromcmd ", blen); +done: + return iserr; +} + +static void reenable_7220_chase(struct timer_list *t) +{ + struct qib_chippport_specific *cpspec = from_timer(cpspec, t, + chase_timer); + struct qib_pportdata *ppd = &cpspec->pportdata; + + ppd->cpspec->chase_timer.expires = 0; + qib_set_ib_7220_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_POLL); +} + +static void handle_7220_chase(struct qib_pportdata *ppd, u64 ibcst) +{ + u8 ibclt; + unsigned long tnow; + + ibclt = (u8)SYM_FIELD(ibcst, IBCStatus, LinkTrainingState); + + /* + * Detect and handle the state chase issue, where we can + * get stuck if we are unlucky on timing on both sides of + * the link. If we are, we disable, set a timer, and + * then re-enable. 
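+ * The re-enable happens in reenable_7220_chase() above, which runs
+ * QIB_CHASE_DIS_TIME after the link is disabled here and moves the
+ * link back to POLL.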
+ */ + switch (ibclt) { + case IB_7220_LT_STATE_CFGRCVFCFG: + case IB_7220_LT_STATE_CFGWAITRMT: + case IB_7220_LT_STATE_TXREVLANES: + case IB_7220_LT_STATE_CFGENH: + tnow = jiffies; + if (ppd->cpspec->chase_end && + time_after(tnow, ppd->cpspec->chase_end)) { + ppd->cpspec->chase_end = 0; + qib_set_ib_7220_lstate(ppd, + QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + ppd->cpspec->chase_timer.expires = jiffies + + QIB_CHASE_DIS_TIME; + add_timer(&ppd->cpspec->chase_timer); + } else if (!ppd->cpspec->chase_end) + ppd->cpspec->chase_end = tnow + QIB_CHASE_TIME; + break; + + default: + ppd->cpspec->chase_end = 0; + break; + } +} + +static void handle_7220_errors(struct qib_devdata *dd, u64 errs) +{ + char *msg; + u64 ignore_this_time = 0; + u64 iserr = 0; + struct qib_pportdata *ppd = dd->pport; + u64 mask; + + /* don't report errors that are masked */ + errs &= dd->cspec->errormask; + msg = dd->cspec->emsgbuf; + + /* do these first, they are most important */ + if (errs & ERR_MASK(HardwareErr)) + qib_7220_handle_hwerrors(dd, msg, sizeof(dd->cspec->emsgbuf)); + + if (errs & QLOGIC_IB_E_SDMAERRS) + sdma_7220_errors(ppd, errs); + + if (errs & ~IB_E_BITSEXTANT) + qib_dev_err(dd, + "error interrupt with unknown errors %llx set\n", + (unsigned long long) (errs & ~IB_E_BITSEXTANT)); + + if (errs & E_SUM_ERRS) { + qib_disarm_7220_senderrbufs(ppd); + if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when trying to bring the link + * up, but the IB link changes state at the "wrong" + * time. The IB logic then complains that the packet + * isn't valid. We don't want to confuse people, so + * we just don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + } else if ((errs & E_SUM_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + qib_write_kreg(dd, kr_errclear, errs); + + errs &= ~ignore_this_time; + if (!errs) + goto done; + + /* + * The ones we mask off are handled specially below + * or above. Also mask SDMADISABLED by default as it + * is too chatty. + */ + mask = ERR_MASK(IBStatusChanged) | + ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | + ERR_MASK(HardwareErr) | ERR_MASK(SDmaDisabledErr); + + qib_decode_7220_err(dd, msg, sizeof(dd->cspec->emsgbuf), errs & ~mask); + + if (errs & E_SUM_PKTERRS) + qib_stats.sps_rcverrs++; + if (errs & E_SUM_ERRS) + qib_stats.sps_txerrs++; + iserr = errs & ~(E_SUM_PKTERRS | QLOGIC_IB_E_PKTERRS | + ERR_MASK(SDmaDisabledErr)); + + if (errs & ERR_MASK(IBStatusChanged)) { + u64 ibcs; + + ibcs = qib_read_kreg64(dd, kr_ibcstatus); + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + handle_7220_chase(ppd, ibcs); + + /* Update our picture of width and speed from chip */ + ppd->link_width_active = + ((ibcs >> IBA7220_LINKWIDTH_SHIFT) & 1) ? + IB_WIDTH_4X : IB_WIDTH_1X; + ppd->link_speed_active = + ((ibcs >> IBA7220_LINKSPEED_SHIFT) & 1) ? 
+ QIB_IB_DDR : QIB_IB_SDR; + + /* + * Since going into a recovery state causes the link state + * to go down and since recovery is transitory, it is better + * if we "miss" ever seeing the link training state go into + * recovery (i.e., ignore this transition for link state + * special handling purposes) without updating lastibcstat. + */ + if (qib_7220_phys_portstate(ibcs) != + IB_PHYSPORTSTATE_LINK_ERR_RECOVER) + qib_handle_e_ibstatuschanged(ppd, ibcs); + } + + if (errs & ERR_MASK(ResetNegated)) { + qib_dev_err(dd, + "Got reset, requires re-init (unload and reload driver)\n"); + dd->flags &= ~QIB_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->devstatusp |= QIB_STATUS_HWERROR; + *dd->pport->statusp &= ~QIB_STATUS_IB_CONF; + } + + if (*msg && iserr) + qib_dev_porterr(dd, ppd->port, "%s error\n", msg); + + if (ppd->state_wanted & ppd->lflags) + wake_up_interruptible(&ppd->state_wait); + + /* + * If there were hdrq or egrfull errors, wake up any processes + * waiting in poll. We used to try to check which contexts had + * the overflow, but given the cost of that and the chip reads + * to support it, it's better to just wake everybody up if we + * get an overflow; waiters can poll again if it's not them. + */ + if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) { + qib_handle_urcv(dd, ~0U); + if (errs & ERR_MASK(RcvEgrFullErr)) + qib_stats.sps_buffull++; + else + qib_stats.sps_hdrfull++; + } +done: + return; +} + +/* enable/disable chip from delivering interrupts */ +static void qib_7220_set_intr_state(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + if (dd->flags & QIB_BADINTR) + return; + qib_write_kreg(dd, kr_intmask, ~0ULL); + /* force re-interrupt of any pending interrupts. */ + qib_write_kreg(dd, kr_intclear, 0ULL); + } else + qib_write_kreg(dd, kr_intmask, 0ULL); +} + +/* + * Try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + * This is in chip-specific code because of all of the register accesses, + * even though the details are similar on most chips. + */ +static void qib_7220_clear_freeze(struct qib_devdata *dd) +{ + /* disable error interrupts, to avoid confusion */ + qib_write_kreg(dd, kr_errmask, 0ULL); + + /* also disable interrupts; errormask is sometimes overwritten */ + qib_7220_set_intr_state(dd, 0); + + qib_cancel_sends(dd->pport); + + /* clear the freeze, and be sure chip saw it */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + + /* force in-memory update now we are out of freeze */ + qib_force_pio_avail_update(dd); + + /* + * force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. 
+ */ + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); + qib_7220_set_intr_state(dd, 1); +} + +/** + * qib_7220_handle_hwerrors - display hardware errors. + * @dd: the qlogic_ib device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * handle_7220_errors() to avoid excessive stack usage. + */ +static void qib_7220_handle_hwerrors(struct qib_devdata *dd, char *msg, + size_t msgl) +{ + u64 hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char *bitsmsg; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + if (!hwerrs) + goto bail; + if (hwerrs == ~0ULL) { + qib_dev_err(dd, + "Read of hardware error status failed (all bits set); ignoring\n"); + goto bail; + } + qib_stats.sps_hwerrs++; + + /* + * Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. + * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support. + */ + qib_write_kreg(dd, kr_hwerrclear, + hwerrs & ~HWE_MASK(PowerOnBISTFailed)); + + hwerrs &= dd->cspec->hwerrmask; + + if (hwerrs & ~(TXEMEMPARITYERR_PIOBUF | TXEMEMPARITYERR_PIOPBC | + RXE_PARITY)) + qib_devinfo(dd->pcidev, + "Hardware error: hwerr=0x%llx (cleared)\n", + (unsigned long long) hwerrs); + + if (hwerrs & ~IB_HWE_BITSEXTANT) + qib_dev_err(dd, + "hwerror interrupt with unknown errors %llx set\n", + (unsigned long long) (hwerrs & ~IB_HWE_BITSEXTANT)); + + if (hwerrs & QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR) + qib_sd7220_clr_ibpar(dd); + + ctrl = qib_read_kreg32(dd, kr_control); + if ((ctrl & QLOGIC_IB_C_FREEZEMODE) && !dd->diag_client) { + /* + * Parity errors in send memory are recoverable by h/w + * just do housekeeping, exit freeze mode and continue. 
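+ * The housekeeping is qib_7220_txe_recover(), which disarms any send
+ * buffers flagged in SendBufErr, after which the freeze is cleared if
+ * no other hardware errors remain.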
+ */ + if (hwerrs & (TXEMEMPARITYERR_PIOBUF | + TXEMEMPARITYERR_PIOPBC)) { + qib_7220_txe_recover(dd); + hwerrs &= ~(TXEMEMPARITYERR_PIOBUF | + TXEMEMPARITYERR_PIOPBC); + } + if (hwerrs) + isfatal = 1; + else + qib_7220_clear_freeze(dd); + } + + *msg = '\0'; + + if (hwerrs & HWE_MASK(PowerOnBISTFailed)) { + isfatal = 1; + strlcat(msg, + "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + qib_format_hwerrors(hwerrs, qib_7220_hwerror_msgs, + ARRAY_SIZE(qib_7220_hwerror_msgs), msg, msgl); + + bitsmsg = dd->cspec->bitsmsgbuf; + if (hwerrs & (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) & + QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf), + "[PCIe Mem Parity Errs %x] ", bits); + strlcat(msg, bitsmsg, msgl); + } + +#define _QIB_PLL_FAIL (QLOGIC_IB_HWE_COREPLL_FBSLIP | \ + QLOGIC_IB_HWE_COREPLL_RFSLIP) + + if (hwerrs & _QIB_PLL_FAIL) { + isfatal = 1; + snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf), + "[PLL failed (%llx), InfiniPath hardware unusable]", + (unsigned long long) hwerrs & _QIB_PLL_FAIL); + strlcat(msg, bitsmsg, msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~(hwerrs & _QIB_PLL_FAIL); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + if (hwerrs & QLOGIC_IB_HWE_SERDESPLLFAILED) { + /* + * If it occurs, it is left masked since the eternal + * interface is unused. + */ + dd->cspec->hwerrmask &= ~QLOGIC_IB_HWE_SERDESPLLFAILED; + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + qib_dev_err(dd, "%s hardware error\n", msg); + + if (isfatal && !dd->diag_client) { + qib_dev_err(dd, + "Fatal Hardware Error, no longer usable, SN %.16s\n", + dd->serial); + /* + * For /sys status file and user programs to print; if no + * trailing brace is copied, we'll know it was truncated. + */ + if (dd->freezemsg) + snprintf(dd->freezemsg, dd->freezelen, + "{%s}", msg); + qib_disable_after_error(dd); + } +bail:; +} + +/** + * qib_7220_init_hwerrors - enable hardware errors + * @dd: the qlogic_ib device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void qib_7220_init_hwerrors(struct qib_devdata *dd) +{ + u64 val; + u64 extsval; + + extsval = qib_read_kreg64(dd, kr_extstatus); + + if (!(extsval & (QLOGIC_IB_EXTS_MEMBIST_ENDTEST | + QLOGIC_IB_EXTS_MEMBIST_DISABLED))) + qib_dev_err(dd, "MemBIST did not complete!\n"); + if (extsval & QLOGIC_IB_EXTS_MEMBIST_DISABLED) + qib_devinfo(dd->pcidev, "MemBIST is disabled.\n"); + + val = ~0ULL; /* default to all hwerrors become interrupts, */ + + val &= ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR; + dd->cspec->hwerrmask = val; + + qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + + /* clear all */ + qib_write_kreg(dd, kr_errclear, ~0ULL); + /* enable errors that are masked, at least this first time. 
*/ + qib_write_kreg(dd, kr_errmask, ~0ULL); + dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask); + /* clear any interrupts up to this point (ints still not enabled) */ + qib_write_kreg(dd, kr_intclear, ~0ULL); +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing + * on chips that are count-based, rather than trigger-based. There is no + * reference counting, but that's also fine, given the intended use. + * Only chip-specific because it's all register accesses + */ +static void qib_set_7220_armlaunch(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + qib_write_kreg(dd, kr_errclear, ERR_MASK(SendPioArmLaunchErr)); + dd->cspec->errormask |= ERR_MASK(SendPioArmLaunchErr); + } else + dd->cspec->errormask &= ~ERR_MASK(SendPioArmLaunchErr); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +/* + * Formerly took parameter in pre-shifted, + * pre-merged form with LinkCmd and LinkInitCmd + * together, and assuming the zero was NOP. + */ +static void qib_set_ib_7220_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd) +{ + u64 mod_wd; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + + mod_wd = (linkcmd << IBA7220_IBCC_LINKCMD_SHIFT) | + (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl | mod_wd); + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); +} + +/* + * All detailed interaction with the SerDes has been moved to qib_sd7220.c + * + * The portion of IBA7220-specific bringup_serdes() that actually deals with + * registers and memory within the SerDes itself is qib_sd7220_init(). + */ + +/** + * qib_7220_bringup_serdes - bring up the serdes + * @ppd: physical port on the qlogic_ib device + */ +static int qib_7220_bringup_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val, prev_val, guid, ibc; + int ret = 0; + + /* Put IBC in reset, sends disabled */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, 0ULL); + + if (qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7220_creg32(dd, cr_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = + read_7220_creg32(dd, cr_iblinkerrrecov); + } + + /* flowcontrolwatermark is in units of KBytes */ + ibc = 0x5ULL << SYM_LSB(IBCCtrl, FlowCtrlWaterMark); + /* + * How often flowctrl sent. More or less in usecs; balance against + * watermark value, so that in theory senders always get a flow + * control update in time to not let the IB link go idle. + */ + ibc |= 0x3ULL << SYM_LSB(IBCCtrl, FlowCtrlPeriod); + /* max error tolerance */ + ibc |= 0xfULL << SYM_LSB(IBCCtrl, PhyerrThreshold); + /* use "real" buffer space for */ + ibc |= 4ULL << SYM_LSB(IBCCtrl, CreditScale); + /* IB credit flow control. 
*/ + ibc |= 0xfULL << SYM_LSB(IBCCtrl, OverrunThreshold); + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see qib_set_mtu() + */ + ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << SYM_LSB(IBCCtrl, MaxPktLen); + ppd->cpspec->ibcctrl = ibc; /* without linkcmd or linkinitcmd! */ + + /* initially come up waiting for TS1, without sending anything. */ + val = ppd->cpspec->ibcctrl | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << + QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + qib_write_kreg(dd, kr_ibcctrl, val); + + if (!ppd->cpspec->ibcddrctrl) { + /* not on re-init after reset */ + ppd->cpspec->ibcddrctrl = qib_read_kreg64(dd, kr_ibcddrctrl); + + if (ppd->link_speed_enabled == (QIB_IB_SDR | QIB_IB_DDR)) + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + else + ppd->cpspec->ibcddrctrl |= + ppd->link_speed_enabled == QIB_IB_DDR ? + IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR; + if ((ppd->link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) == + (IB_WIDTH_1X | IB_WIDTH_4X)) + ppd->cpspec->ibcddrctrl |= IBA7220_IBC_WIDTH_AUTONEG; + else + ppd->cpspec->ibcddrctrl |= + ppd->link_width_enabled == IB_WIDTH_4X ? + IBA7220_IBC_WIDTH_4X_ONLY : + IBA7220_IBC_WIDTH_1X_ONLY; + + /* always enable these on driver reload, not sticky */ + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_RXPOL_MASK << IBA7220_IBC_RXPOL_SHIFT; + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_HRTBT_MASK << IBA7220_IBC_HRTBT_SHIFT; + + /* enable automatic lane reversal detection for receive */ + ppd->cpspec->ibcddrctrl |= IBA7220_IBC_LANE_REV_SUPPORTED; + } else + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + + qib_write_kreg(dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl); + qib_write_kreg(dd, kr_scratch, 0); + + qib_write_kreg(dd, kr_ncmodectrl, 0Ull); + qib_write_kreg(dd, kr_scratch, 0); + + ret = qib_sd7220_init(dd); + + val = qib_read_kreg64(dd, kr_xgxs_cfg); + prev_val = val; + val |= QLOGIC_IB_XGXS_FC_SAFE; + if (val != prev_val) { + qib_write_kreg(dd, kr_xgxs_cfg, val); + qib_read_kreg32(dd, kr_scratch); + } + if (val & QLOGIC_IB_XGXS_RESET) + val &= ~QLOGIC_IB_XGXS_RESET; + if (val != prev_val) + qib_write_kreg(dd, kr_xgxs_cfg, val); + + /* first time through, set port guid */ + if (!ppd->guid) + ppd->guid = dd->base_guid; + guid = be64_to_cpu(ppd->guid); + + qib_write_kreg(dd, kr_hrtbt_guid, guid); + if (!ret) { + dd->control |= QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, dd->control); + } else + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + return ret; +} + +/** + * qib_7220_quiet_serdes - set serdes to txidle + * @ppd: physical port of the qlogic_ib device + * Called when driver is being unloaded + */ +static void qib_7220_quiet_serdes(struct qib_pportdata *ppd) +{ + u64 val; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + /* disable IBC */ + dd->control &= ~QLOGIC_IB_C_LINKENABLE; + qib_write_kreg(dd, kr_control, + dd->control | QLOGIC_IB_C_FREEZEMODE); + + ppd->cpspec->chase_end = 0; + if (ppd->cpspec->chase_timer.function) /* if initted */ + del_timer_sync(&ppd->cpspec->chase_timer); + + if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta || + ppd->cpspec->ibdeltainprog) { + u64 diagc; + + /* enable counter writes */ + diagc = qib_read_kreg64(dd, kr_hwdiagctrl); + qib_write_kreg(dd, kr_hwdiagctrl, + diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable)); + + if (ppd->cpspec->ibsymdelta || 
ppd->cpspec->ibdeltainprog) { + val = read_7220_creg32(dd, cr_ibsymbolerr); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->ibsymsnap; + val -= ppd->cpspec->ibsymdelta; + write_7220_creg(dd, cr_ibsymbolerr, val); + } + if (ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) { + val = read_7220_creg32(dd, cr_iblinkerrrecov); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->iblnkerrsnap; + val -= ppd->cpspec->iblnkerrdelta; + write_7220_creg(dd, cr_iblinkerrrecov, val); + } + + /* and disable counter writes */ + qib_write_kreg(dd, kr_hwdiagctrl, diagc); + } + qib_set_ib_7220_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + wake_up(&ppd->cpspec->autoneg_wait); + cancel_delayed_work_sync(&ppd->cpspec->autoneg_work); + + shutdown_7220_relock_poll(ppd->dd); + val = qib_read_kreg64(ppd->dd, kr_xgxs_cfg); + val |= QLOGIC_IB_XGXS_RESET; + qib_write_kreg(ppd->dd, kr_xgxs_cfg, val); +} + +/** + * qib_setup_7220_setextled - set the state of the two external LEDs + * @dd: the qlogic_ib device + * @on: whether the link is up or not + * + * The exact combo of LEDs if on is true is determined by looking + * at the ibcstatus. + * + * These LEDs indicate the physical and logical state of IB link. + * For this chip (at least with recommended board pinouts), LED1 + * is Yellow (logical state) and LED2 is Green (physical state), + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void qib_setup_7220_setextled(struct qib_pportdata *ppd, u32 on) +{ + struct qib_devdata *dd = ppd->dd; + u64 extctl, ledblink = 0, val, lst, ltst; + unsigned long flags; + + /* + * The diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running. + */ + if (dd->diag_client) + return; + + if (ppd->led_override) { + ltst = (ppd->led_override & QIB_LED_PHYS) ? + IB_PHYSPORTSTATE_LINKUP : IB_PHYSPORTSTATE_DISABLED, + lst = (ppd->led_override & QIB_LED_LOG) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + } else if (on) { + val = qib_read_kreg64(dd, kr_ibcstatus); + ltst = qib_7220_phys_portstate(val); + lst = qib_7220_iblink_state(val); + } else { + ltst = 0; + lst = 0; + } + + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + extctl = dd->cspec->extctrl & ~(SYM_MASK(EXTCtrl, LEDPriPortGreenOn) | + SYM_MASK(EXTCtrl, LEDPriPortYellowOn)); + if (ltst == IB_PHYSPORTSTATE_LINKUP) { + extctl |= SYM_MASK(EXTCtrl, LEDPriPortGreenOn); + /* + * counts are in chip clock (4ns) periods. 
+ * This is 1/16 sec (66.6ms) on, + * 3/16 sec (187.5 ms) off, with packets rcvd + */ + ledblink = ((66600 * 1000UL / 4) << IBA7220_LEDBLINK_ON_SHIFT) + | ((187500 * 1000UL / 4) << IBA7220_LEDBLINK_OFF_SHIFT); + } + if (lst == IB_PORT_ACTIVE) + extctl |= SYM_MASK(EXTCtrl, LEDPriPortYellowOn); + dd->cspec->extctrl = extctl; + qib_write_kreg(dd, kr_extctrl, extctl); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + + if (ledblink) /* blink the LED on packet receive */ + qib_write_kreg(dd, kr_rcvpktledcnt, ledblink); +} + +/* + * qib_setup_7220_cleanup - clean up any per-chip chip-specific stuff + * @dd: the qlogic_ib device + * + * This is called during driver unload. + * + */ +static void qib_setup_7220_cleanup(struct qib_devdata *dd) +{ + qib_free_irq(dd); + kfree(dd->cspec->cntrs); + kfree(dd->cspec->portcntrs); +} + +/* + * This is only called for SDmaInt. + * SDmaDisabled is handled on the error path. + */ +static void sdma_7220_intr(struct qib_pportdata *ppd, u64 istat) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + switch (ppd->sdma_state.current_state) { + case qib_sdma_state_s00_hw_down: + break; + + case qib_sdma_state_s10_hw_start_up_wait: + __qib_sdma_process_event(ppd, qib_sdma_event_e20_hw_started); + break; + + case qib_sdma_state_s20_idle: + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + break; + + case qib_sdma_state_s50_hw_halt_wait: + __qib_sdma_process_event(ppd, qib_sdma_event_e60_hw_halted); + break; + + case qib_sdma_state_s99_running: + /* too chatty to print here */ + __qib_sdma_intr(ppd); + break; + } + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +static void qib_wantpiobuf_7220_intr(struct qib_devdata *dd, u32 needint) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (needint) { + if (!(dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd))) + goto done; + /* + * blip the availupd off, next write will be on, so + * we ensure an avail update, regardless of threshold or + * buffers becoming free, whenever we want an interrupt + */ + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl & + ~SYM_MASK(SendCtrl, SendBufAvailUpd)); + qib_write_kreg(dd, kr_scratch, 0ULL); + dd->sendctrl |= SYM_MASK(SendCtrl, SendIntBufAvail); + } else + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendIntBufAvail); + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0ULL); +done: + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* + * Handle errors and unusual events first, separate function + * to improve cache hits for fast path interrupt handling. + */ +static noinline void unlikely_7220_intr(struct qib_devdata *dd, u64 istat) +{ + if (unlikely(istat & ~QLOGIC_IB_I_BITSEXTANT)) + qib_dev_err(dd, + "interrupt with unknown interrupts %Lx set\n", + istat & ~QLOGIC_IB_I_BITSEXTANT); + + if (istat & QLOGIC_IB_I_GPIO) { + u32 gpiostatus; + + /* + * Boards for this chip currently don't use GPIO interrupts, + * so clear by writing GPIOstatus to GPIOclear, and complain + * to alert developer. To avoid endless repeats, clear + * the bits in the mask, since there is some kind of + * programming error or chip problem. + */ + gpiostatus = qib_read_kreg32(dd, kr_gpio_status); + /* + * In theory, writing GPIOstatus to GPIOclear could + * have a bad side-effect on some diagnostic that wanted + * to poll for a status-change, but the various shadows + * make that problematic at best. 
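The blink duty cycle programmed into kr_rcvpktledcnt by qib_setup_7220_setextled() above is expressed in 4 ns chip-clock ticks. A minimal standalone sketch of that conversion follows, with placeholder shift values; the real IBA7220_LEDBLINK_*_SHIFT field offsets are defined in the chip headers, not here.

#include <stdint.h>

#define LEDBLINK_ON_SHIFT   0   /* placeholder, not the chip's field offset */
#define LEDBLINK_OFF_SHIFT 32   /* placeholder, not the chip's field offset */

/* 4 ns per chip-clock tick, so usec * 1000 ns / 4 ns */
static uint64_t led_ticks_from_usec(uint64_t usec)
{
	return usec * 1000 / 4;
}

/*
 * Pack on/off durations the way the ledblink word above is built;
 * 66600 us on and 187500 us off match the 1/16 s / 3/16 s comment.
 */
static uint64_t ledblink_word(uint64_t on_usec, uint64_t off_usec)
{
	return (led_ticks_from_usec(on_usec) << LEDBLINK_ON_SHIFT) |
	       (led_ticks_from_usec(off_usec) << LEDBLINK_OFF_SHIFT);
}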
Diags will just suppress + * all GPIO interrupts during such tests. + */ + qib_write_kreg(dd, kr_gpio_clear, gpiostatus); + + if (gpiostatus) { + const u32 mask = qib_read_kreg32(dd, kr_gpio_mask); + u32 gpio_irq = mask & gpiostatus; + + /* + * A bit set in status and (chip) Mask register + * would cause an interrupt. Since we are not + * expecting any, report it. Also check that the + * chip reflects our shadow, report issues, + * and refresh from the shadow. + */ + /* + * Clear any troublemakers, and update chip + * from shadow + */ + dd->cspec->gpio_mask &= ~gpio_irq; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + } + } + + if (istat & QLOGIC_IB_I_ERROR) { + u64 estat; + + qib_stats.sps_errints++; + estat = qib_read_kreg64(dd, kr_errstatus); + if (!estat) + qib_devinfo(dd->pcidev, + "error interrupt (%Lx), but no error bits set!\n", + istat); + else + handle_7220_errors(dd, estat); + } +} + +static irqreturn_t qib_7220intr(int irq, void *data) +{ + struct qib_devdata *dd = data; + irqreturn_t ret; + u64 istat; + u64 ctxtrbits; + u64 rmask; + unsigned i; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + ret = IRQ_HANDLED; + goto bail; + } + + istat = qib_read_kreg64(dd, kr_intstatus); + + if (unlikely(!istat)) { + ret = IRQ_NONE; /* not our interrupt, or already handled */ + goto bail; + } + if (unlikely(istat == -1)) { + qib_bad_intrstatus(dd); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + this_cpu_inc(*dd->int_counter); + if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT | + QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR))) + unlikely_7220_intr(dd, istat); + + /* + * Clear the interrupt bits we found set, relatively early, so we + * "know" know the chip will have seen this by the time we process + * the queue, and will re-interrupt if necessary. The processor + * itself won't take the interrupt again until we return. + */ + qib_write_kreg(dd, kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway. + */ + ctxtrbits = istat & + ((QLOGIC_IB_I_RCVAVAIL_MASK << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT)); + if (ctxtrbits) { + rmask = (1ULL << QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (1ULL << QLOGIC_IB_I_RCVURG_SHIFT); + for (i = 0; i < dd->first_user_ctxt; i++) { + if (ctxtrbits & rmask) { + ctxtrbits &= ~rmask; + qib_kreceive(dd->rcd[i], NULL, NULL); + } + rmask <<= 1; + } + if (ctxtrbits) { + ctxtrbits = + (ctxtrbits >> QLOGIC_IB_I_RCVAVAIL_SHIFT) | + (ctxtrbits >> QLOGIC_IB_I_RCVURG_SHIFT); + qib_handle_urcv(dd, ctxtrbits); + } + } + + /* only call for SDmaInt */ + if (istat & QLOGIC_IB_I_SDMAINT) + sdma_7220_intr(dd->pport, istat); + + if ((istat & QLOGIC_IB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED)) + qib_ib_piobufavail(dd); + + ret = IRQ_HANDLED; +bail: + return ret; +} + +/* + * Set up our chip-specific interrupt handler. + * The interrupt type has already been setup, so + * we just need to do the registration and error checking. + * If we are using MSI interrupts, we may fall back to + * INTx later, if the interrupt handler doesn't get called + * within 1/2 second (see verify_interrupt()). 
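qib_7220intr() above walks a single mask pair across the kernel receive contexts because the RcvAvail and RcvUrg status bits for context i sit i positions above their base shifts. A small sketch of that decoding loop, assuming caller-supplied shifts and a callback standing in for qib_kreceive():

#include <stdint.h>

static void dispatch_rcv(uint64_t istat, unsigned int avail_shift,
			 unsigned int urg_shift, unsigned int nctxts,
			 void (*kreceive)(unsigned int ctxt))
{
	uint64_t rmask = (1ULL << avail_shift) | (1ULL << urg_shift);
	unsigned int i;

	/* the bit pair for context i is the base pair shifted left i times */
	for (i = 0; i < nctxts; i++, rmask <<= 1)
		if (istat & rmask)
			kreceive(i);
}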
+ */ +static void qib_setup_7220_interrupt(struct qib_devdata *dd) +{ + int ret; + + ret = pci_request_irq(dd->pcidev, 0, qib_7220intr, NULL, dd, + QIB_DRV_NAME); + if (ret) + qib_dev_err(dd, "Couldn't setup %s interrupt (irq=%d): %d\n", + dd->pcidev->msi_enabled ? "MSI" : "INTx", + pci_irq_vector(dd->pcidev, 0), ret); +} + +/** + * qib_7220_boardname - fill in the board name + * @dd: the qlogic_ib device + * + * info is based on the board revision register + */ +static void qib_7220_boardname(struct qib_devdata *dd) +{ + u32 boardid; + + boardid = SYM_FIELD(dd->revision, Revision, + BoardID); + + switch (boardid) { + case 1: + dd->boardname = "InfiniPath_QLE7240"; + break; + case 2: + dd->boardname = "InfiniPath_QLE7280"; + break; + default: + qib_dev_err(dd, "Unknown 7220 board with ID %u\n", boardid); + dd->boardname = "Unknown_InfiniPath_7220"; + break; + } + + if (dd->majrev != 5 || !dd->minrev || dd->minrev > 2) + qib_dev_err(dd, + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->majrev, dd->minrev); + + snprintf(dd->boardversion, sizeof(dd->boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", + QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, + (unsigned int)SYM_FIELD(dd->revision, Revision_R, Arch), + dd->majrev, dd->minrev, + (unsigned int)SYM_FIELD(dd->revision, Revision_R, SW)); +} + +/* + * This routine sleeps, so it can only be called from user context, not + * from interrupt context. + */ +static int qib_setup_7220_reset(struct qib_devdata *dd) +{ + u64 val; + int i; + int ret; + u16 cmdval; + u8 int_line, clinesz; + unsigned long flags; + + qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz); + + /* Use dev_err so it shows up in logs, etc. */ + qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit); + + /* no interrupts till re-initted */ + qib_7220_set_intr_state(dd, 0); + + dd->pport->cpspec->ibdeltainprog = 0; + dd->pport->cpspec->ibsymdelta = 0; + dd->pport->cpspec->iblnkerrdelta = 0; + + /* + * Keep chip from being accessed until we are ready. Use + * writeq() directly, to allow the write even though QIB_PRESENT + * isn't set. + */ + dd->flags &= ~(QIB_INITTED | QIB_PRESENT); + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); + val = dd->control | QLOGIC_IB_C_RESET; + writeq(val, &dd->kregbase[kr_control]); + mb(); /* prevent compiler reordering around actual reset */ + + for (i = 1; i <= 5; i++) { + /* + * Allow MBIST, etc. to complete; longer on each retry. + * We sometimes get machine checks from bus timeout if no + * response, so for now, make it *really* long. + */ + msleep(1000 + (1 + i) * 2000); + + qib_pcie_reenable(dd, cmdval, int_line, clinesz); + + /* + * Use readq directly, so we don't need to mark it as PRESENT + * until we get a successful indication that all is well. 
+ */ + val = readq(&dd->kregbase[kr_revision]); + if (val == dd->revision) { + dd->flags |= QIB_PRESENT; /* it's back */ + ret = qib_reinit_intr(dd); + goto bail; + } + } + ret = 0; /* failed */ + +bail: + if (ret) { + if (qib_pcie_params(dd, dd->lbus_width, NULL)) + qib_dev_err(dd, + "Reset failed to setup PCIe or interrupts; continuing anyway\n"); + + /* hold IBC in reset, no sends, etc till later */ + qib_write_kreg(dd, kr_control, 0ULL); + + /* clear the reset error, init error/hwerror mask */ + qib_7220_init_hwerrors(dd); + + /* do setup similar to speed or link-width changes */ + if (dd->pport->cpspec->ibcddrctrl & IBA7220_IBC_IBTA_1_2_MASK) + dd->cspec->presets_needed = 1; + spin_lock_irqsave(&dd->pport->lflags_lock, flags); + dd->pport->lflags |= QIBL_IB_FORCE_NOTIFY; + dd->pport->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&dd->pport->lflags_lock, flags); + } + + return ret; +} + +/** + * qib_7220_put_tid - write a TID to the chip + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: 0 for eager, 1 for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + */ +static void qib_7220_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + if (pa != dd->tidinvalid) { + u64 chippa = pa >> IBA7220_TID_PA_SHIFT; + + /* paranoia checks */ + if (pa != (chippa << IBA7220_TID_PA_SHIFT)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + if (chippa >= (1UL << IBA7220_TID_SZ_SHIFT)) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + chippa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + chippa |= IBA7220_TID_SZ_4K; + pa = chippa; + } + writeq(pa, tidptr); + mmiowb(); +} + +/** + * qib_7220_clear_tids - clear all TID entries for a ctxt, expected and eager + * @dd: the qlogic_ib device + * @ctxt: the ctxt + * + * clear all TID entries for a ctxt, expected and eager. + * Used from qib_close(). 
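qib_7220_put_tid() above only accepts buffer addresses that survive the shift-and-restore round trip, i.e. addresses aligned to the TID page size, and that fit in the chip's address field. A sketch of those two checks, assuming 2 KB alignment (shift of 11) and a placeholder field width; the real IBA7220_TID_* shifts are defined elsewhere in the driver:

#include <stdbool.h>
#include <stdint.h>

#define TID_PA_SHIFT 11   /* 2 KB aligned: low 11 bits must be zero */
#define TID_SZ_SHIFT 22   /* placeholder width of the address field */

static bool tid_pa_ok(uint64_t pa)
{
	uint64_t chippa = pa >> TID_PA_SHIFT;

	if (pa != (chippa << TID_PA_SHIFT))
		return false;                  /* not 2 KB aligned */
	if (chippa >= (1ULL << TID_SZ_SHIFT))
		return false;                  /* beyond the addressable range */
	return true;
}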
On this chip, TIDs are only 32 bits, + * not 64, but they are still on 64 bit boundaries, so tidbase + * is declared as u64 * for the pointer math, even though we write 32 bits + */ +static void qib_7220_clear_tids(struct qib_devdata *dd, + struct qib_ctxtdata *rcd) +{ + u64 __iomem *tidbase; + unsigned long tidinv; + u32 ctxt; + int i; + + if (!dd->kregbase || !rcd) + return; + + ctxt = rcd->ctxt; + + tidinv = dd->tidinvalid; + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvtidbase + + ctxt * dd->rcvtidcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->rcvtidcnt; i++) + qib_7220_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + tidinv); + + tidbase = (u64 __iomem *) + ((char __iomem *)(dd->kregbase) + + dd->rcvegrbase + + rcd->rcvegr_tid_base * sizeof(*tidbase)); + + for (i = 0; i < rcd->rcvegrcnt; i++) + qib_7220_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + tidinv); +} + +/** + * qib_7220_tidtemplate - setup constants for TID updates + * @dd: the qlogic_ib device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void qib_7220_tidtemplate(struct qib_devdata *dd) +{ + if (dd->rcvegrbufsize == 2048) + dd->tidtemplate = IBA7220_TID_SZ_2K; + else if (dd->rcvegrbufsize == 4096) + dd->tidtemplate = IBA7220_TID_SZ_4K; + dd->tidinvalid = 0; +} + +/** + * qib_init_7220_get_base_info - set chip-specific flags for user code + * @rcd: the qlogic_ib ctxt + * @kbase: qib_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithims. + */ +static int qib_7220_get_base_info(struct qib_ctxtdata *rcd, + struct qib_base_info *kinfo) +{ + kinfo->spi_runtime_flags |= QIB_RUNTIME_PCIE | + QIB_RUNTIME_NODMA_RTAIL | QIB_RUNTIME_SDMA; + + if (rcd->dd->flags & QIB_USE_SPCL_TRIG) + kinfo->spi_runtime_flags |= QIB_RUNTIME_SPECIAL_TRIGGER; + + return 0; +} + +static struct qib_message_header * +qib_7220_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) +{ + u32 offset = qib_hdrget_offset(rhf_addr); + + return (struct qib_message_header *) + (rhf_addr - dd->rhf_offset + offset); +} + +static void qib_7220_config_ctxts(struct qib_devdata *dd) +{ + unsigned long flags; + u32 nchipctxts; + + nchipctxts = qib_read_kreg32(dd, kr_portcnt); + dd->cspec->numctxts = nchipctxts; + if (qib_n_krcv_queues > 1) { + dd->qpn_mask = 0x3e; + dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports; + if (dd->first_user_ctxt > nchipctxts) + dd->first_user_ctxt = nchipctxts; + } else + dd->first_user_ctxt = dd->num_pports; + dd->n_krcv_queues = dd->first_user_ctxt; + + if (!qib_cfgctxts) { + int nctxts = dd->first_user_ctxt + num_online_cpus(); + + if (nctxts <= 5) + dd->ctxtcnt = 5; + else if (nctxts <= 9) + dd->ctxtcnt = 9; + else if (nctxts <= nchipctxts) + dd->ctxtcnt = nchipctxts; + } else if (qib_cfgctxts <= nchipctxts) + dd->ctxtcnt = qib_cfgctxts; + if (!dd->ctxtcnt) /* none of the above, set to max */ + dd->ctxtcnt = nchipctxts; + + /* + * Chip can be configured for 5, 9, or 17 ctxts, and choice + * affects number of eager TIDs per ctxt (1K, 2K, 4K). + * Lock to be paranoid about later motion, etc. 
+ */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + if (dd->ctxtcnt > 9) + dd->rcvctrl |= 2ULL << IBA7220_R_CTXTCFG_SHIFT; + else if (dd->ctxtcnt > 5) + dd->rcvctrl |= 1ULL << IBA7220_R_CTXTCFG_SHIFT; + /* else configure for default 5 receive ctxts */ + if (dd->qpn_mask) + dd->rcvctrl |= 1ULL << QIB_7220_RcvCtrl_RcvQPMapEnable_LSB; + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* kr_rcvegrcnt changes based on the number of contexts enabled */ + dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt); + dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, IBA7220_KRCVEGRCNT); +} + +static int qib_7220_get_ib_cfg(struct qib_pportdata *ppd, int which) +{ + int lsb, ret = 0; + u64 maskr; /* right-justified mask */ + + switch (which) { + case QIB_IB_CFG_LWID_ENB: /* Get allowed Link-width */ + ret = ppd->link_width_enabled; + goto done; + + case QIB_IB_CFG_LWID: /* Get currently active Link-width */ + ret = ppd->link_width_active; + goto done; + + case QIB_IB_CFG_SPD_ENB: /* Get allowed Link speeds */ + ret = ppd->link_speed_enabled; + goto done; + + case QIB_IB_CFG_SPD: /* Get current Link spd */ + ret = ppd->link_speed_active; + goto done; + + case QIB_IB_CFG_RXPOL_ENB: /* Get Auto-RX-polarity enable */ + lsb = IBA7220_IBC_RXPOL_SHIFT; + maskr = IBA7220_IBC_RXPOL_MASK; + break; + + case QIB_IB_CFG_LREV_ENB: /* Get Auto-Lane-reversal enable */ + lsb = IBA7220_IBC_LREV_SHIFT; + maskr = IBA7220_IBC_LREV_MASK; + break; + + case QIB_IB_CFG_LINKLATENCY: + ret = qib_read_kreg64(ppd->dd, kr_ibcddrstatus) + & IBA7220_DDRSTAT_LINKLAT_MASK; + goto done; + + case QIB_IB_CFG_OP_VLS: + ret = ppd->vls_operational; + goto done; + + case QIB_IB_CFG_VL_HIGH_CAP: + ret = 0; + goto done; + + case QIB_IB_CFG_VL_LOW_CAP: + ret = 0; + goto done; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + OverrunThreshold); + goto done; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + goto done; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + ret = (ppd->cpspec->ibcctrl & + SYM_MASK(IBCCtrl, LinkDownDefaultState)) ? + IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL; + goto done; + + case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */ + lsb = IBA7220_IBC_HRTBT_SHIFT; + maskr = IBA7220_IBC_HRTBT_MASK; + break; + + case QIB_IB_CFG_PMA_TICKS: + /* + * 0x00 = 10x link transfer rate or 4 nsec. for 2.5Gbs + * Since the clock is always 250MHz, the value is 1 or 0. + */ + ret = (ppd->link_speed_active == QIB_IB_DDR); + goto done; + + default: + ret = -EINVAL; + goto done; + } + ret = (int)((ppd->cpspec->ibcddrctrl >> lsb) & maskr); +done: + return ret; +} + +static int qib_7220_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) +{ + struct qib_devdata *dd = ppd->dd; + u64 maskr; /* right-justified mask */ + int lsb, ret = 0, setforce = 0; + u16 lcmd, licmd; + unsigned long flags; + u32 tmp = 0; + + switch (which) { + case QIB_IB_CFG_LIDLMC: + /* + * Set LID and LMC. Combined to avoid possible hazard + * caller puts LMC in 16MSbits, DLID in 16LSbits of val + */ + lsb = IBA7220_IBC_DLIDLMC_SHIFT; + maskr = IBA7220_IBC_DLIDLMC_MASK; + break; + + case QIB_IB_CFG_LWID_ENB: /* set allowed Link-width */ + /* + * As with speed, only write the actual register if + * the link is currently down, otherwise takes effect + * on next link change. 
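Both qib_7220_get_ib_cfg() above and qib_7220_set_ib_cfg() below describe each configurable item by a right-justified mask plus the bit position (lsb) of the field inside the shadowed IBCDDRCtrl value. That convention reduces to two one-liners; a sketch with generic names:

#include <stdint.h>

/* read a right-justified field out of a shadow register value */
static uint64_t get_field(uint64_t reg, unsigned int lsb, uint64_t mask)
{
	return (reg >> lsb) & mask;
}

/* clear the field, then merge in the new value */
static uint64_t set_field(uint64_t reg, unsigned int lsb, uint64_t mask,
			  uint64_t val)
{
	return (reg & ~(mask << lsb)) | ((val & mask) << lsb);
}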
+ */ + ppd->link_width_enabled = val; + if (!(ppd->lflags & QIBL_LINKDOWN)) + goto bail; + /* + * We set the QIBL_IB_FORCE_NOTIFY bit so updown + * will get called because we want update + * link_width_active, and the change may not take + * effect for some time (if we are in POLL), so this + * flag will force the updown routine to be called + * on the next ibstatuschange down interrupt, even + * if it's not an down->up transition. + */ + val--; /* convert from IB to chip */ + maskr = IBA7220_IBC_WIDTH_MASK; + lsb = IBA7220_IBC_WIDTH_SHIFT; + setforce = 1; + break; + + case QIB_IB_CFG_SPD_ENB: /* set allowed Link speeds */ + /* + * If we turn off IB1.2, need to preset SerDes defaults, + * but not right now. Set a flag for the next time + * we command the link down. As with width, only write the + * actual register if the link is currently down, otherwise + * takes effect on next link change. Since setting is being + * explicitly requested (via MAD or sysfs), clear autoneg + * failure status if speed autoneg is enabled. + */ + ppd->link_speed_enabled = val; + if ((ppd->cpspec->ibcddrctrl & IBA7220_IBC_IBTA_1_2_MASK) && + !(val & (val - 1))) + dd->cspec->presets_needed = 1; + if (!(ppd->lflags & QIBL_LINKDOWN)) + goto bail; + /* + * We set the QIBL_IB_FORCE_NOTIFY bit so updown + * will get called because we want update + * link_speed_active, and the change may not take + * effect for some time (if we are in POLL), so this + * flag will force the updown routine to be called + * on the next ibstatuschange down interrupt, even + * if it's not an down->up transition. + */ + if (val == (QIB_IB_SDR | QIB_IB_DDR)) { + val = IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else + val = val == QIB_IB_DDR ? 
+ IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR; + maskr = IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + /* IBTA 1.2 mode + speed bits are contiguous */ + lsb = SYM_LSB(IBCDDRCtrl, IB_ENHANCED_MODE); + setforce = 1; + break; + + case QIB_IB_CFG_RXPOL_ENB: /* set Auto-RX-polarity enable */ + lsb = IBA7220_IBC_RXPOL_SHIFT; + maskr = IBA7220_IBC_RXPOL_MASK; + break; + + case QIB_IB_CFG_LREV_ENB: /* set Auto-Lane-reversal enable */ + lsb = IBA7220_IBC_LREV_SHIFT; + maskr = IBA7220_IBC_LREV_MASK; + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + OverrunThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, OverrunThreshold); + ppd->cpspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, OverrunThreshold); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + goto bail; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl, + PhyerrThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, PhyerrThreshold); + ppd->cpspec->ibcctrl |= (u64) val << + SYM_LSB(IBCCtrl, PhyerrThreshold); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + goto bail; + + case QIB_IB_CFG_PKEYS: /* update pkeys */ + maskr = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + qib_write_kreg(dd, kr_partitionkey, maskr); + goto bail; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + if (val == IB_LINKINITCMD_POLL) + ppd->cpspec->ibcctrl &= + ~SYM_MASK(IBCCtrl, LinkDownDefaultState); + else /* SLEEP */ + ppd->cpspec->ibcctrl |= + SYM_MASK(IBCCtrl, LinkDownDefaultState); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + goto bail; + + case QIB_IB_CFG_MTU: /* update the MTU in IBC */ + /* + * Update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + * Set even if it's unchanged, print debug message only + * on changes. 
+ */ + val = (ppd->ibmaxlen >> 2) + 1; + ppd->cpspec->ibcctrl &= ~SYM_MASK(IBCCtrl, MaxPktLen); + ppd->cpspec->ibcctrl |= (u64)val << SYM_LSB(IBCCtrl, MaxPktLen); + qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + qib_write_kreg(dd, kr_scratch, 0); + goto bail; + + case QIB_IB_CFG_LSTATE: /* set the IB link state */ + switch (val & 0xffff0000) { + case IB_LINKCMD_DOWN: + lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN; + if (!ppd->cpspec->ibdeltainprog && + qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = + read_7220_creg32(dd, cr_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = + read_7220_creg32(dd, cr_iblinkerrrecov); + } + break; + + case IB_LINKCMD_ARMED: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED; + break; + + case IB_LINKCMD_ACTIVE: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16); + goto bail; + } + switch (val & 0xffff) { + case IB_LINKINITCMD_NOP: + licmd = 0; + break; + + case IB_LINKINITCMD_POLL: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL; + break; + + case IB_LINKINITCMD_SLEEP: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP; + break; + + case IB_LINKINITCMD_DISABLE: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE; + ppd->cpspec->chase_end = 0; + /* + * stop state chase counter and timer, if running. + * wait forpending timer, but don't clear .data (ppd)! + */ + if (ppd->cpspec->chase_timer.expires) { + del_timer_sync(&ppd->cpspec->chase_timer); + ppd->cpspec->chase_timer.expires = 0; + } + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkinitcmd req 0x%x\n", + val & 0xffff); + goto bail; + } + qib_set_ib_7220_lstate(ppd, lcmd, licmd); + + maskr = IBA7220_IBC_WIDTH_MASK; + lsb = IBA7220_IBC_WIDTH_SHIFT; + tmp = (ppd->cpspec->ibcddrctrl >> lsb) & maskr; + /* If the width active on the chip does not match the + * width in the shadow register, write the new active + * width to the chip. + * We don't have to worry about speed as the speed is taken + * care of by set_7220_ibspeed_fast called by ib_updown. 
+ */ + if (ppd->link_width_enabled-1 != tmp) { + ppd->cpspec->ibcddrctrl &= ~(maskr << lsb); + ppd->cpspec->ibcddrctrl |= + (((u64)(ppd->link_width_enabled-1) & maskr) << + lsb); + qib_write_kreg(dd, kr_ibcddrctrl, + ppd->cpspec->ibcddrctrl); + qib_write_kreg(dd, kr_scratch, 0); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + goto bail; + + case QIB_IB_CFG_HRTBT: /* set Heartbeat off/enable/auto */ + if (val > IBA7220_IBC_HRTBT_MASK) { + ret = -EINVAL; + goto bail; + } + lsb = IBA7220_IBC_HRTBT_SHIFT; + maskr = IBA7220_IBC_HRTBT_MASK; + break; + + default: + ret = -EINVAL; + goto bail; + } + ppd->cpspec->ibcddrctrl &= ~(maskr << lsb); + ppd->cpspec->ibcddrctrl |= (((u64) val & maskr) << lsb); + qib_write_kreg(dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl); + qib_write_kreg(dd, kr_scratch, 0); + if (setforce) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } +bail: + return ret; +} + +static int qib_7220_set_loopback(struct qib_pportdata *ppd, const char *what) +{ + int ret = 0; + u64 val, ddr; + + if (!strncmp(what, "ibc", 3)) { + ppd->cpspec->ibcctrl |= SYM_MASK(IBCCtrl, Loopback); + val = 0; /* disable heart beat, so link will come up */ + qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n", + ppd->dd->unit, ppd->port); + } else if (!strncmp(what, "off", 3)) { + ppd->cpspec->ibcctrl &= ~SYM_MASK(IBCCtrl, Loopback); + /* enable heart beat again */ + val = IBA7220_IBC_HRTBT_MASK << IBA7220_IBC_HRTBT_SHIFT; + qib_devinfo(ppd->dd->pcidev, + "Disabling IB%u:%u IBC loopback (normal)\n", + ppd->dd->unit, ppd->port); + } else + ret = -EINVAL; + if (!ret) { + qib_write_kreg(ppd->dd, kr_ibcctrl, ppd->cpspec->ibcctrl); + ddr = ppd->cpspec->ibcddrctrl & ~(IBA7220_IBC_HRTBT_MASK + << IBA7220_IBC_HRTBT_SHIFT); + ppd->cpspec->ibcddrctrl = ddr | val; + qib_write_kreg(ppd->dd, kr_ibcddrctrl, + ppd->cpspec->ibcddrctrl); + qib_write_kreg(ppd->dd, kr_scratch, 0); + } + return ret; +} + +static void qib_update_7220_usrhead(struct qib_ctxtdata *rcd, u64 hd, + u32 updegr, u32 egrhd, u32 npkts) +{ + if (updegr) + qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt); + mmiowb(); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + mmiowb(); +} + +static u32 qib_7220_hdrqempty(struct qib_ctxtdata *rcd) +{ + u32 head, tail; + + head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt); + if (rcd->rcvhdrtail_kvaddr) + tail = qib_get_rcvhdrtail(rcd); + else + tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt); + return head == tail; +} + +/* + * Modify the RCVCTRL register in chip-specific way. This + * is a function because bit positions and (future) register + * location is chip-specifc, but the needed operations are + * generic. is a bit-mask because we often want to + * do multiple modifications. 
+ */ +static void rcvctrl_7220_mod(struct qib_pportdata *ppd, unsigned int op, + int ctxt) +{ + struct qib_devdata *dd = ppd->dd; + u64 mask, val; + unsigned long flags; + + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + if (op & QIB_RCVCTRL_TAILUPD_ENB) + dd->rcvctrl |= (1ULL << IBA7220_R_TAILUPD_SHIFT); + if (op & QIB_RCVCTRL_TAILUPD_DIS) + dd->rcvctrl &= ~(1ULL << IBA7220_R_TAILUPD_SHIFT); + if (op & QIB_RCVCTRL_PKEY_ENB) + dd->rcvctrl &= ~(1ULL << IBA7220_R_PKEY_DIS_SHIFT); + if (op & QIB_RCVCTRL_PKEY_DIS) + dd->rcvctrl |= (1ULL << IBA7220_R_PKEY_DIS_SHIFT); + if (ctxt < 0) + mask = (1ULL << dd->ctxtcnt) - 1; + else + mask = (1ULL << ctxt); + if (op & QIB_RCVCTRL_CTXT_ENB) { + /* always done for specific ctxt */ + dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, PortEnable)); + if (!(dd->flags & QIB_NODMA_RTAIL)) + dd->rcvctrl |= 1ULL << IBA7220_R_TAILUPD_SHIFT; + /* Write these registers before the context is enabled. */ + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, + dd->rcd[ctxt]->rcvhdrqtailaddr_phys); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, + dd->rcd[ctxt]->rcvhdrq_phys); + dd->rcd[ctxt]->seq_cnt = 1; + } + if (op & QIB_RCVCTRL_CTXT_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, PortEnable)); + if (op & QIB_RCVCTRL_INTRAVAIL_ENB) + dd->rcvctrl |= (mask << IBA7220_R_INTRAVAIL_SHIFT); + if (op & QIB_RCVCTRL_INTRAVAIL_DIS) + dd->rcvctrl &= ~(mask << IBA7220_R_INTRAVAIL_SHIFT); + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && dd->rhdrhead_intr_off) { + /* arm rcv interrupt */ + val = qib_read_ureg32(dd, ur_rcvhdrhead, ctxt) | + dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_ENB) { + /* + * Init the context registers also; if we were + * disabled, tail and head should both be zero + * already from the enable, but since we don't + * know, we have to do it explicitly. + */ + val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt); + qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt); + + val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt); + dd->rcd[ctxt]->head = val; + /* If kctxt, interrupt on next receive. */ + if (ctxt < dd->first_user_ctxt) + val |= dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_DIS) { + if (ctxt >= 0) { + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, 0); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, 0); + } else { + unsigned i; + + for (i = 0; i < dd->cfgctxts; i++) { + qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, + i, 0); + qib_write_kreg_ctxt(dd, kr_rcvhdraddr, i, 0); + } + } + } + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); +} + +/* + * Modify the SENDCTRL register in chip-specific way. This + * is a function there may be multiple such registers with + * slightly different layouts. To start, we assume the + * "canonical" register layout of the first chips. 
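rcvctrl_7220_mod() above treats a negative context argument as "every configured context" and otherwise selects a single bit; a sketch of just that mask selection:

#include <stdint.h>

static uint64_t ctxt_mask(int ctxt, unsigned int ctxtcnt)
{
	if (ctxt < 0)
		return (1ULL << ctxtcnt) - 1;  /* all configured contexts */
	return 1ULL << ctxt;                   /* just the one requested */
}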
+ * Chip requires no back-back sendctrl writes, so write + * scratch register after writing sendctrl + */ +static void sendctrl_7220_mod(struct qib_pportdata *ppd, u32 op) +{ + struct qib_devdata *dd = ppd->dd; + u64 tmp_dd_sendctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + /* First the ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_CLEAR) + dd->sendctrl = 0; + if (op & QIB_SENDCTRL_SEND_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, SPioEnable); + else if (op & QIB_SENDCTRL_SEND_ENB) { + dd->sendctrl |= SYM_MASK(SendCtrl, SPioEnable); + if (dd->flags & QIB_USE_SPCL_TRIG) + dd->sendctrl |= SYM_MASK(SendCtrl, + SSpecialTriggerEn); + } + if (op & QIB_SENDCTRL_AVAIL_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + else if (op & QIB_SENDCTRL_AVAIL_ENB) + dd->sendctrl |= SYM_MASK(SendCtrl, SendBufAvailUpd); + + if (op & QIB_SENDCTRL_DISARM_ALL) { + u32 i, last; + + tmp_dd_sendctrl = dd->sendctrl; + /* + * disarm any that are not yet launched, disabling sends + * and updates until done. + */ + last = dd->piobcnt2k + dd->piobcnt4k; + tmp_dd_sendctrl &= + ~(SYM_MASK(SendCtrl, SPioEnable) | + SYM_MASK(SendCtrl, SendBufAvailUpd)); + for (i = 0; i < last; i++) { + qib_write_kreg(dd, kr_sendctrl, + tmp_dd_sendctrl | + SYM_MASK(SendCtrl, Disarm) | i); + qib_write_kreg(dd, kr_scratch, 0); + } + } + + tmp_dd_sendctrl = dd->sendctrl; + + if (op & QIB_SENDCTRL_FLUSH) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Abort); + if (op & QIB_SENDCTRL_DISARM) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) | + ((op & QIB_7220_SendCtrl_DisarmPIOBuf_RMASK) << + SYM_LSB(SendCtrl, DisarmPIOBuf)); + if ((op & QIB_SENDCTRL_AVAIL_BLIP) && + (dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd))) + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + if (op & QIB_SENDCTRL_AVAIL_BLIP) { + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + if (op & QIB_SENDCTRL_FLUSH) { + u32 v; + /* + * ensure writes have hit chip, then do a few + * more reads, to allow DMA of pioavail registers + * to occur, so in-memory copy is in sync with + * the chip. Not always safe to sleep. 
+ */ + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + qib_read_kreg32(dd, kr_scratch); + } +} + +/** + * qib_portcntr_7220 - read a per-port counter + * @dd: the qlogic_ib device + * @creg: the counter to snapshot + */ +static u64 qib_portcntr_7220(struct qib_pportdata *ppd, u32 reg) +{ + u64 ret = 0ULL; + struct qib_devdata *dd = ppd->dd; + u16 creg; + /* 0xffff for unimplemented or synthesized counters */ + static const u16 xlator[] = { + [QIBPORTCNTR_PKTSEND] = cr_pktsend, + [QIBPORTCNTR_WORDSEND] = cr_wordsend, + [QIBPORTCNTR_PSXMITDATA] = cr_psxmitdatacount, + [QIBPORTCNTR_PSXMITPKTS] = cr_psxmitpktscount, + [QIBPORTCNTR_PSXMITWAIT] = cr_psxmitwaitcount, + [QIBPORTCNTR_SENDSTALL] = cr_sendstall, + [QIBPORTCNTR_PKTRCV] = cr_pktrcv, + [QIBPORTCNTR_PSRCVDATA] = cr_psrcvdatacount, + [QIBPORTCNTR_PSRCVPKTS] = cr_psrcvpktscount, + [QIBPORTCNTR_RCVEBP] = cr_rcvebp, + [QIBPORTCNTR_RCVOVFL] = cr_rcvovfl, + [QIBPORTCNTR_WORDRCV] = cr_wordrcv, + [QIBPORTCNTR_RXDROPPKT] = cr_rxdroppkt, + [QIBPORTCNTR_RXLOCALPHYERR] = cr_rxotherlocalphyerr, + [QIBPORTCNTR_RXVLERR] = cr_rxvlerr, + [QIBPORTCNTR_ERRICRC] = cr_erricrc, + [QIBPORTCNTR_ERRVCRC] = cr_errvcrc, + [QIBPORTCNTR_ERRLPCRC] = cr_errlpcrc, + [QIBPORTCNTR_BADFORMAT] = cr_badformat, + [QIBPORTCNTR_ERR_RLEN] = cr_err_rlen, + [QIBPORTCNTR_IBSYMBOLERR] = cr_ibsymbolerr, + [QIBPORTCNTR_INVALIDRLEN] = cr_invalidrlen, + [QIBPORTCNTR_UNSUPVL] = cr_txunsupvl, + [QIBPORTCNTR_EXCESSBUFOVFL] = cr_excessbufferovfl, + [QIBPORTCNTR_ERRLINK] = cr_errlink, + [QIBPORTCNTR_IBLINKDOWN] = cr_iblinkdown, + [QIBPORTCNTR_IBLINKERRRECOV] = cr_iblinkerrrecov, + [QIBPORTCNTR_LLI] = cr_locallinkintegrityerr, + [QIBPORTCNTR_PSINTERVAL] = cr_psinterval, + [QIBPORTCNTR_PSSTART] = cr_psstart, + [QIBPORTCNTR_PSSTAT] = cr_psstat, + [QIBPORTCNTR_VL15PKTDROP] = cr_vl15droppedpkt, + [QIBPORTCNTR_ERRPKEY] = cr_errpkey, + [QIBPORTCNTR_KHDROVFL] = 0xffff, + }; + + if (reg >= ARRAY_SIZE(xlator)) { + qib_devinfo(ppd->dd->pcidev, + "Unimplemented portcounter %u\n", reg); + goto done; + } + creg = xlator[reg]; + + if (reg == QIBPORTCNTR_KHDROVFL) { + int i; + + /* sum over all kernel contexts */ + for (i = 0; i < dd->first_user_ctxt; i++) + ret += read_7220_creg32(dd, cr_portovfl + i); + } + if (creg == 0xffff) + goto done; + + /* + * only fast incrementing counters are 64bit; use 32 bit reads to + * avoid two independent reads when on opteron + */ + if ((creg == cr_wordsend || creg == cr_wordrcv || + creg == cr_pktsend || creg == cr_pktrcv)) + ret = read_7220_creg(dd, creg); + else + ret = read_7220_creg32(dd, creg); + if (creg == cr_ibsymbolerr) { + if (dd->pport->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->ibsymsnap; + ret -= dd->pport->cpspec->ibsymdelta; + } else if (creg == cr_iblinkerrrecov) { + if (dd->pport->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->iblnkerrsnap; + ret -= dd->pport->cpspec->iblnkerrdelta; + } +done: + return ret; +} + +/* + * Device counter names (not port-specific), one line per stat, + * single string. Used by utilities like ipathstats to print the stats + * in a way which works for different versions of drivers, without changing + * the utility. Names need to be 12 chars or less (w/o newline), for proper + * display by utility. + * Non-error counters are first. + * Start of "error" conters is indicated by a leading "E " on the first + * "error" counter, and doesn't count in label length. 
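The ibsymsnap/ibsymdelta arithmetic at the end of qib_portcntr_7220() above (and in qib_7220_quiet_serdes() and qib_7220_ib_updown()) keeps deliberate link bounces from inflating the symbol-error and link-error-recovery counters: while a bounce is in progress the pre-bounce snapshot is reported, and once it completes the errors accumulated during the bounce are folded into a delta subtracted from every later read. A condensed sketch of that bookkeeping, using hypothetical names:

#include <stdint.h>

struct cntr_adjust {
	int      inprog;   /* a deliberate link bounce is in progress */
	uint64_t snap;     /* counter value when the bounce started   */
	uint64_t delta;    /* errors attributed to past bounces       */
};

static uint64_t adjusted_read(uint64_t hw, const struct cntr_adjust *a)
{
	if (a->inprog)
		hw -= hw - a->snap;    /* i.e. report the snapshot */
	return hw - a->delta;
}

static void bounce_done(uint64_t hw, struct cntr_adjust *a)
{
	a->delta += hw - a->snap;      /* charge the bounce, not the link */
	a->inprog = 0;
}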
+ * The EgrOvfl list needs to be last so we truncate them at the configured + * context count for the device. + * cntr7220indices contains the corresponding register indices. + */ +static const char cntr7220names[] = + "Interrupts\n" + "HostBusStall\n" + "E RxTIDFull\n" + "RxTIDInvalid\n" + "Ctxt0EgrOvfl\n" + "Ctxt1EgrOvfl\n" + "Ctxt2EgrOvfl\n" + "Ctxt3EgrOvfl\n" + "Ctxt4EgrOvfl\n" + "Ctxt5EgrOvfl\n" + "Ctxt6EgrOvfl\n" + "Ctxt7EgrOvfl\n" + "Ctxt8EgrOvfl\n" + "Ctxt9EgrOvfl\n" + "Ctx10EgrOvfl\n" + "Ctx11EgrOvfl\n" + "Ctx12EgrOvfl\n" + "Ctx13EgrOvfl\n" + "Ctx14EgrOvfl\n" + "Ctx15EgrOvfl\n" + "Ctx16EgrOvfl\n"; + +static const size_t cntr7220indices[] = { + cr_lbint, + cr_lbflowstall, + cr_errtidfull, + cr_errtidvalid, + cr_portovfl + 0, + cr_portovfl + 1, + cr_portovfl + 2, + cr_portovfl + 3, + cr_portovfl + 4, + cr_portovfl + 5, + cr_portovfl + 6, + cr_portovfl + 7, + cr_portovfl + 8, + cr_portovfl + 9, + cr_portovfl + 10, + cr_portovfl + 11, + cr_portovfl + 12, + cr_portovfl + 13, + cr_portovfl + 14, + cr_portovfl + 15, + cr_portovfl + 16, +}; + +/* + * same as cntr7220names and cntr7220indices, but for port-specific counters. + * portcntr7220indices is somewhat complicated by some registers needing + * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG + */ +static const char portcntr7220names[] = + "TxPkt\n" + "TxFlowPkt\n" + "TxWords\n" + "RxPkt\n" + "RxFlowPkt\n" + "RxWords\n" + "TxFlowStall\n" + "TxDmaDesc\n" /* 7220 and 7322-only */ + "E RxDlidFltr\n" /* 7220 and 7322-only */ + "IBStatusChng\n" + "IBLinkDown\n" + "IBLnkRecov\n" + "IBRxLinkErr\n" + "IBSymbolErr\n" + "RxLLIErr\n" + "RxBadFormat\n" + "RxBadLen\n" + "RxBufOvrfl\n" + "RxEBP\n" + "RxFlowCtlErr\n" + "RxICRCerr\n" + "RxLPCRCerr\n" + "RxVCRCerr\n" + "RxInvalLen\n" + "RxInvalPKey\n" + "RxPktDropped\n" + "TxBadLength\n" + "TxDropped\n" + "TxInvalLen\n" + "TxUnderrun\n" + "TxUnsupVL\n" + "RxLclPhyErr\n" /* 7220 and 7322-only */ + "RxVL15Drop\n" /* 7220 and 7322-only */ + "RxVlErr\n" /* 7220 and 7322-only */ + "XcessBufOvfl\n" /* 7220 and 7322-only */ + ; + +#define _PORT_VIRT_FLAG 0x8000 /* "virtual", need adjustments */ +static const size_t portcntr7220indices[] = { + QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG, + cr_pktsendflow, + QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG, + QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG, + cr_pktrcvflowctrl, + QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG, + QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG, + cr_txsdmadesc, + cr_rxdlidfltr, + cr_ibstatuschange, + QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_LLI | _PORT_VIRT_FLAG, + QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG, + cr_rcvflowctrl_err, + QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG, + cr_invalidslen, + cr_senddropped, + cr_errslen, + cr_sendunderrun, + cr_txunsupvl, + QIBPORTCNTR_RXLOCALPHYERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_VL15PKTDROP | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXVLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_EXCESSBUFOVFL | _PORT_VIRT_FLAG, +}; + +/* do all the setup to make the counter reads efficient later */ +static void init_7220_cntrnames(struct qib_devdata *dd) +{ + int i, j = 0; + 
char *s; + + for (i = 0, s = (char *)cntr7220names; s && j <= dd->cfgctxts; + i++) { + /* we always have at least one counter before the egrovfl */ + if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12)) + j = 1; + s = strchr(s + 1, '\n'); + if (s && j) + j++; + } + dd->cspec->ncntrs = i; + if (!s) + /* full list; size is without terminating null */ + dd->cspec->cntrnamelen = sizeof(cntr7220names) - 1; + else + dd->cspec->cntrnamelen = 1 + s - cntr7220names; + dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs + * sizeof(u64), GFP_KERNEL); + + for (i = 0, s = (char *)portcntr7220names; s; i++) + s = strchr(s + 1, '\n'); + dd->cspec->nportcntrs = i - 1; + dd->cspec->portcntrnamelen = sizeof(portcntr7220names) - 1; + dd->cspec->portcntrs = kmalloc(dd->cspec->nportcntrs + * sizeof(u64), GFP_KERNEL); +} + +static u32 qib_read_7220cntrs(struct qib_devdata *dd, loff_t pos, char **namep, + u64 **cntrp) +{ + u32 ret; + + if (!dd->cspec->cntrs) { + ret = 0; + goto done; + } + + if (namep) { + *namep = (char *)cntr7220names; + ret = dd->cspec->cntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + } else { + u64 *cntr = dd->cspec->cntrs; + int i; + + ret = dd->cspec->ncntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + + *cntrp = cntr; + for (i = 0; i < dd->cspec->ncntrs; i++) + *cntr++ = read_7220_creg32(dd, cntr7220indices[i]); + } +done: + return ret; +} + +static u32 qib_read_7220portcntrs(struct qib_devdata *dd, loff_t pos, u32 port, + char **namep, u64 **cntrp) +{ + u32 ret; + + if (!dd->cspec->portcntrs) { + ret = 0; + goto done; + } + if (namep) { + *namep = (char *)portcntr7220names; + ret = dd->cspec->portcntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + } else { + u64 *cntr = dd->cspec->portcntrs; + struct qib_pportdata *ppd = &dd->pport[port]; + int i; + + ret = dd->cspec->nportcntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->nportcntrs; i++) { + if (portcntr7220indices[i] & _PORT_VIRT_FLAG) + *cntr++ = qib_portcntr_7220(ppd, + portcntr7220indices[i] & + ~_PORT_VIRT_FLAG); + else + *cntr++ = read_7220_creg32(dd, + portcntr7220indices[i]); + } + } +done: + return ret; +} + +/** + * qib_get_7220_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the qlogic_ib device qib_devdata + * + * This needs more work; in particular, decision on whether we really + * need traffic_wds done the way it is + * called from add_timer + */ +static void qib_get_7220_faststats(struct timer_list *t) +{ + struct qib_devdata *dd = from_timer(dd, t, stats_timer); + struct qib_pportdata *ppd = dd->pport; + unsigned long flags; + u64 traffic_wds; + + /* + * don't access the chip while running diags, or memory diags can + * fail + */ + if (!(dd->flags & QIB_INITTED) || dd->diag_client) + /* but re-arm the timer, for diags case; won't hurt other */ + goto done; + + /* + * We now try to maintain an activity timer, based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. 
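init_7220_cntrnames() above sizes the counter buffers by counting newline-terminated entries in cntr7220names and portcntr7220names, since entry i of the name string corresponds to entry i of the parallel index array. The counting step amounts to the following, shown as a hypothetical standalone helper:

#include <stddef.h>
#include <string.h>

static size_t count_names(const char *names)
{
	size_t n = 0;
	const char *s = names;

	while ((s = strchr(s, '\n')) != NULL) {
		n++;            /* one counter per newline-terminated name */
		s++;
	}
	return n;
}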
+ */ + traffic_wds = qib_portcntr_7220(ppd, cr_wordsend) + + qib_portcntr_7220(ppd, cr_wordrcv); + spin_lock_irqsave(&dd->eep_st_lock, flags); + traffic_wds -= dd->traffic_wds; + dd->traffic_wds += traffic_wds; + spin_unlock_irqrestore(&dd->eep_st_lock, flags); +done: + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); +} + +/* + * If we are using MSI, try to fallback to INTx. + */ +static int qib_7220_intr_fallback(struct qib_devdata *dd) +{ + if (!dd->msi_lo) + return 0; + + qib_devinfo(dd->pcidev, + "MSI interrupt not detected, trying INTx interrupts\n"); + + qib_free_irq(dd); + dd->msi_lo = 0; + if (pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_LEGACY) < 0) + qib_dev_err(dd, "Failed to enable INTx\n"); + qib_setup_7220_interrupt(dd); + return 1; +} + +/* + * Reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well. + */ +static void qib_7220_xgxs_reset(struct qib_pportdata *ppd) +{ + u64 val, prev_val; + struct qib_devdata *dd = ppd->dd; + + prev_val = qib_read_kreg64(dd, kr_xgxs_cfg); + val = prev_val | QLOGIC_IB_XGXS_RESET; + prev_val &= ~QLOGIC_IB_XGXS_RESET; /* be sure */ + qib_write_kreg(dd, kr_control, + dd->control & ~QLOGIC_IB_C_LINKENABLE); + qib_write_kreg(dd, kr_xgxs_cfg, val); + qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_xgxs_cfg, prev_val); + qib_write_kreg(dd, kr_control, dd->control); +} + +/* + * For this chip, we want to use the same buffer every time + * when we are trying to bring the link up (they are always VL15 + * packets). At that link state the packet should always go out immediately + * (or at least be discarded at the tx interface if the link is down). + * If it doesn't, and the buffer isn't available, that means some other + * sender has gotten ahead of us, and is preventing our packet from going + * out. In that case, we flush all packets, and try again. If that still + * fails, we fail the request, and hope things work the next time around. + * + * We don't need very complicated heuristics on whether the packet had + * time to go out or not, since even at SDR 1X, it goes out in very short + * time periods, covered by the chip reads done here and as part of the + * flush. + */ +static u32 __iomem *get_7220_link_buf(struct qib_pportdata *ppd, u32 *bnum) +{ + u32 __iomem *buf; + u32 lbuf = ppd->dd->cspec->lastbuf_for_pio; + int do_cleanup; + unsigned long flags; + + /* + * always blip to get avail list updated, since it's almost + * always needed, and is fairly cheap. 
+ */ + sendctrl_7220_mod(ppd->dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); + if (buf) + goto done; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (ppd->sdma_state.current_state == qib_sdma_state_s20_idle && + ppd->sdma_state.current_state != qib_sdma_state_s00_hw_down) { + __qib_sdma_process_event(ppd, qib_sdma_event_e00_go_hw_down); + do_cleanup = 0; + } else { + do_cleanup = 1; + qib_7220_sdma_hw_clean_up(ppd); + } + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + if (do_cleanup) { + qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */ + buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf); + } +done: + return buf; +} + +/* + * This code for non-IBTA-compliant IB speed negotiation is only known to + * work for the SDR to DDR transition, and only between an HCA and a switch + * with recent firmware. It is based on observed heuristics, rather than + * actual knowledge of the non-compliant speed negotiation. + * It has a number of hard-coded fields, since the hope is to rewrite this + * when a spec is available on how the negoation is intended to work. + */ +static void autoneg_7220_sendpkt(struct qib_pportdata *ppd, u32 *hdr, + u32 dcnt, u32 *data) +{ + int i; + u64 pbc; + u32 __iomem *piobuf; + u32 pnum; + struct qib_devdata *dd = ppd->dd; + + i = 0; + pbc = 7 + dcnt + 1; /* 7 dword header, dword data, icrc */ + pbc |= PBC_7220_VL15_SEND; + while (!(piobuf = get_7220_link_buf(ppd, &pnum))) { + if (i++ > 5) + return; + udelay(2); + } + sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_DISARM_BUF(pnum)); + writeq(pbc, piobuf); + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdr, 7); + qib_pio_copy(piobuf + 9, data, dcnt); + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pnum >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + qib_flush_wc(); + qib_sendbuf_done(dd, pnum); +} + +/* + * _start packet gets sent twice at start, _done gets sent twice at end + */ +static void autoneg_7220_send(struct qib_pportdata *ppd, int which) +{ + struct qib_devdata *dd = ppd->dd; + static u32 swapped; + u32 dw, i, hcnt, dcnt, *data; + static u32 hdr[7] = { 0xf002ffff, 0x48ffff, 0x6400abba }; + static u32 madpayload_start[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1388, 0x15e, 0x1, /* rest 0's */ + }; + static u32 madpayload_done[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x40000001, 0x1388, 0x15e, /* rest 0's */ + }; + + dcnt = ARRAY_SIZE(madpayload_start); + hcnt = ARRAY_SIZE(hdr); + if (!swapped) { + /* for maintainability, do it at runtime */ + for (i = 0; i < hcnt; i++) { + dw = (__force u32) cpu_to_be32(hdr[i]); + hdr[i] = dw; + } + for (i = 0; i < dcnt; i++) { + dw = (__force u32) cpu_to_be32(madpayload_start[i]); + madpayload_start[i] = dw; + dw = (__force u32) cpu_to_be32(madpayload_done[i]); + madpayload_done[i] = dw; + } + swapped = 1; + } + + data = which ? madpayload_done : madpayload_start; + + autoneg_7220_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); + autoneg_7220_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); +} + +/* + * Do the absolute minimum to cause an IB speed change, and make it + * ready, but don't actually trigger the change. 
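autoneg_7220_send() above keeps its MAD header and payload templates as host-order constants for readability and converts them to wire (big-endian) order in place the first time they are used, guarded by a static flag. A userspace sketch of that one-time fix-up, with htonl() standing in for the kernel's cpu_to_be32():

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>

static void swap_once(uint32_t *words, size_t n, int *swapped)
{
	size_t i;

	if (*swapped)
		return;                     /* already in wire order */
	for (i = 0; i < n; i++)
		words[i] = htonl(words[i]); /* host -> big-endian */
	*swapped = 1;
}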
The caller will + * do that when ready (if link is in Polling training state, it will + * happen immediately, otherwise when link next goes down) + * + * This routine should only be used as part of the DDR autonegotation + * code for devices that are not compliant with IB 1.2 (or code that + * fixes things up for same). + * + * When link has gone down, and autoneg enabled, or autoneg has + * failed and we give up until next time we set both speeds, and + * then we want IBTA enabled as well as "use max enabled speed. + */ +static void set_7220_ibspeed_fast(struct qib_pportdata *ppd, u32 speed) +{ + ppd->cpspec->ibcddrctrl &= ~(IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK); + + if (speed == (QIB_IB_SDR | QIB_IB_DDR)) + ppd->cpspec->ibcddrctrl |= IBA7220_IBC_SPEED_AUTONEG_MASK | + IBA7220_IBC_IBTA_1_2_MASK; + else + ppd->cpspec->ibcddrctrl |= speed == QIB_IB_DDR ? + IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR; + + qib_write_kreg(ppd->dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl); + qib_write_kreg(ppd->dd, kr_scratch, 0); +} + +/* + * This routine is only used when we are not talking to another + * IB 1.2-compliant device that we think can do DDR. + * (This includes all existing switch chips as of Oct 2007.) + * 1.2-compliant devices go directly to DDR prior to reaching INIT + */ +static void try_7220_autoneg(struct qib_pportdata *ppd) +{ + unsigned long flags; + + /* + * Required for older non-IB1.2 DDR switches. Newer + * non-IB-compliant switches don't need it, but so far, + * aren't bothered by it either. "Magic constant" + */ + qib_write_kreg(ppd->dd, kr_ncmodectrl, 0x3b9dc07); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + autoneg_7220_send(ppd, 0); + set_7220_ibspeed_fast(ppd, QIB_IB_DDR); + + toggle_7220_rclkrls(ppd->dd); + /* 2 msec is minimum length of a poll cycle */ + queue_delayed_work(ib_wq, &ppd->cpspec->autoneg_work, + msecs_to_jiffies(2)); +} + +/* + * Handle the empirically determined mechanism for auto-negotiation + * of DDR speed with switches. + */ +static void autoneg_7220_work(struct work_struct *work) +{ + struct qib_pportdata *ppd; + struct qib_devdata *dd; + u32 i; + unsigned long flags; + + ppd = &container_of(work, struct qib_chippport_specific, + autoneg_work.work)->pportdata; + dd = ppd->dd; + + /* + * Busy wait for this first part, it should be at most a + * few hundred usec, since we scheduled ourselves for 2msec. + */ + for (i = 0; i < 25; i++) { + if (SYM_FIELD(ppd->lastibcstat, IBCStatus, LinkTrainingState) + == IB_7220_LT_STATE_POLLQUIET) { + qib_set_linkstate(ppd, QIB_IB_LINKDOWN_DISABLE); + break; + } + udelay(100); + } + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + goto done; /* we got there early or told to stop */ + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(90))) + goto done; + + toggle_7220_rclkrls(dd); + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(1700))) + goto done; + + set_7220_ibspeed_fast(ppd, QIB_IB_SDR); + toggle_7220_rclkrls(dd); + + /* + * Wait up to 250 msec for link to train and get to INIT at DDR; + * this should terminate early. 
+ */ + wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(250)); +done: + if (ppd->lflags & QIBL_IB_AUTONEG_INPROG) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + if (dd->cspec->autoneg_tries == AUTONEG_TRIES) { + ppd->lflags |= QIBL_IB_AUTONEG_FAILED; + dd->cspec->autoneg_tries = 0; + } + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + set_7220_ibspeed_fast(ppd, ppd->link_speed_enabled); + } +} + +static u32 qib_7220_iblink_state(u64 ibcs) +{ + u32 state = (u32)SYM_FIELD(ibcs, IBCStatus, LinkState); + + switch (state) { + case IB_7220_L_STATE_INIT: + state = IB_PORT_INIT; + break; + case IB_7220_L_STATE_ARM: + state = IB_PORT_ARMED; + break; + case IB_7220_L_STATE_ACTIVE: + /* fall through */ + case IB_7220_L_STATE_ACT_DEFER: + state = IB_PORT_ACTIVE; + break; + default: /* fall through */ + case IB_7220_L_STATE_DOWN: + state = IB_PORT_DOWN; + break; + } + return state; +} + +/* returns the IBTA port state, rather than the IBC link training state */ +static u8 qib_7220_phys_portstate(u64 ibcs) +{ + u8 state = (u8)SYM_FIELD(ibcs, IBCStatus, LinkTrainingState); + return qib_7220_physportstate[state]; +} + +static int qib_7220_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) +{ + int ret = 0, symadj = 0; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + if (!ibup) { + /* + * When the link goes down we don't want AEQ running, so it + * won't interfere with IBC training, etc., and we need + * to go back to the static SerDes preset values. + */ + if (!(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG))) + set_7220_ibspeed_fast(ppd, ppd->link_speed_enabled); + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + qib_sd7220_presets(dd); + qib_cancel_sends(ppd); /* initial disarm, etc. 
*/ + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (__qib_sdma_running(ppd)) + __qib_sdma_process_event(ppd, + qib_sdma_event_e70_go_idle); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + /* this might better in qib_sd7220_presets() */ + set_7220_relock_poll(dd, ibup); + } else { + if (qib_compat_ddr_negotiate && + !(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG)) && + ppd->link_speed_active == QIB_IB_SDR && + (ppd->link_speed_enabled & (QIB_IB_DDR | QIB_IB_SDR)) == + (QIB_IB_DDR | QIB_IB_SDR) && + dd->cspec->autoneg_tries < AUTONEG_TRIES) { + /* we are SDR, and DDR auto-negotiation enabled */ + ++dd->cspec->autoneg_tries; + if (!ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7220_creg32(dd, + cr_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7220_creg32(dd, + cr_iblinkerrrecov); + } + try_7220_autoneg(ppd); + ret = 1; /* no other IB status change processing */ + } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + ppd->link_speed_active == QIB_IB_SDR) { + autoneg_7220_send(ppd, 1); + set_7220_ibspeed_fast(ppd, QIB_IB_DDR); + udelay(2); + toggle_7220_rclkrls(dd); + ret = 1; /* no other IB status change processing */ + } else { + if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + (ppd->link_speed_active & QIB_IB_DDR)) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~(QIBL_IB_AUTONEG_INPROG | + QIBL_IB_AUTONEG_FAILED); + spin_unlock_irqrestore(&ppd->lflags_lock, + flags); + dd->cspec->autoneg_tries = 0; + /* re-enable SDR, for next link down */ + set_7220_ibspeed_fast(ppd, + ppd->link_speed_enabled); + wake_up(&ppd->cpspec->autoneg_wait); + symadj = 1; + } else if (ppd->lflags & QIBL_IB_AUTONEG_FAILED) { + /* + * Clear autoneg failure flag, and do setup + * so we'll try next time link goes down and + * back to INIT (possibly connected to a + * different device). + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, + flags); + ppd->cpspec->ibcddrctrl |= + IBA7220_IBC_IBTA_1_2_MASK; + qib_write_kreg(dd, kr_ncmodectrl, 0); + symadj = 1; + } + } + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + symadj = 1; + + if (!ret) { + ppd->delay_mult = rate_to_delay + [(ibcs >> IBA7220_LINKSPEED_SHIFT) & 1] + [(ibcs >> IBA7220_LINKWIDTH_SHIFT) & 1]; + + set_7220_relock_poll(dd, ibup); + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* + * Unlike 7322, the 7220 needs this, due to lack of + * interrupt in some cases when we have sdma active + * when the link goes down. 
+ */
+ if (ppd->sdma_state.current_state !=
+ qib_sdma_state_s20_idle)
+ __qib_sdma_process_event(ppd,
+ qib_sdma_event_e00_go_hw_down);
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ }
+ }
+
+ if (symadj) {
+ if (ppd->cpspec->ibdeltainprog) {
+ ppd->cpspec->ibdeltainprog = 0;
+ ppd->cpspec->ibsymdelta += read_7220_creg32(ppd->dd,
+ cr_ibsymbolerr) - ppd->cpspec->ibsymsnap;
+ ppd->cpspec->iblnkerrdelta += read_7220_creg32(ppd->dd,
+ cr_iblinkerrrecov) - ppd->cpspec->iblnkerrsnap;
+ }
+ } else if (!ibup && qib_compat_ddr_negotiate &&
+ !ppd->cpspec->ibdeltainprog &&
+ !(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+ ppd->cpspec->ibdeltainprog = 1;
+ ppd->cpspec->ibsymsnap = read_7220_creg32(ppd->dd,
+ cr_ibsymbolerr);
+ ppd->cpspec->iblnkerrsnap = read_7220_creg32(ppd->dd,
+ cr_iblinkerrrecov);
+ }
+
+ if (!ret)
+ qib_setup_7220_setextled(ppd, ibup);
+ return ret;
+}
+
+/*
+ * Does read/modify/write to appropriate registers to
+ * set output and direction bits selected by mask.
+ * These are in their canonical positions (e.g. lsb of
+ * dir will end up in D48 of extctrl on existing chips).
+ * Returns contents of GP Inputs.
+ */
+static int gpio_7220_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask)
+{
+ u64 read_val, new_out;
+ unsigned long flags;
+
+ if (mask) {
+ /* some bits being written, lock access to GPIO */
+ dir &= mask;
+ out &= mask;
+ spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+ dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe));
+ dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe));
+ new_out = (dd->cspec->gpio_out & ~mask) | out;
+
+ qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+ qib_write_kreg(dd, kr_gpio_out, new_out);
+ dd->cspec->gpio_out = new_out;
+ spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+ }
+ /*
+ * It is unlikely that a read at this time would get valid
+ * data on a pin whose direction line was set in the same
+ * call to this function. We include the read here because
+ * that allows us to potentially combine a change on one pin with
+ * a read on another, and because the old code did something like
+ * this.
+ */
+ read_val = qib_read_kreg64(dd, kr_extstatus);
+ return SYM_FIELD(read_val, EXTStatus, GPIOIn);
+}
+
+/*
+ * Read fundamental info we need to use the chip. These are
+ * the registers that describe chip capabilities, and are
+ * saved in shadow registers.
+ */ +static void get_7220_chip_params(struct qib_devdata *dd) +{ + u64 val; + u32 piobufs; + int mtu; + + dd->uregbase = qib_read_kreg32(dd, kr_userregbase); + + dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt); + dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase); + dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase); + dd->palign = qib_read_kreg32(dd, kr_palign); + dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase); + dd->pio2k_bufbase = dd->piobufbase & 0xffffffff; + + val = qib_read_kreg64(dd, kr_sendpiosize); + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport->ibmtu = (u32)mtu; + + val = qib_read_kreg64(dd, kr_sendpiobufcnt); + dd->piobcnt2k = val & ~0U; + dd->piobcnt4k = val >> 32; + /* these may be adjusted in init_chip_wc_pat() */ + dd->pio2kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + dd->pio2k_bufbase); + if (dd->piobcnt4k) { + dd->pio4kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + + (dd->piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->align4k = ALIGN(dd->piosize4k, dd->palign); + } + + piobufs = dd->piobcnt4k + dd->piobcnt2k; + + dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / + (sizeof(u64) * BITS_PER_BYTE / 2); +} + +/* + * The chip base addresses in cspec and cpspec have to be set + * after possible init_chip_wc_pat(), rather than in + * qib_get_7220_chip_params(), so split out as separate function + */ +static void set_7220_baseaddrs(struct qib_devdata *dd) +{ + u32 cregbase; + /* init after possible re-map in init_chip_wc_pat() */ + cregbase = qib_read_kreg32(dd, kr_counterregbase); + dd->cspec->cregbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + cregbase); + + dd->egrtidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + dd->rcvegrbase); +} + + +#define SENDCTRL_SHADOWED (SYM_MASK(SendCtrl, SendIntBufAvail) | \ + SYM_MASK(SendCtrl, SPioEnable) | \ + SYM_MASK(SendCtrl, SSpecialTriggerEn) | \ + SYM_MASK(SendCtrl, SendBufAvailUpd) | \ + SYM_MASK(SendCtrl, AvailUpdThld) | \ + SYM_MASK(SendCtrl, SDmaEnable) | \ + SYM_MASK(SendCtrl, SDmaIntEnable) | \ + SYM_MASK(SendCtrl, SDmaHalt) | \ + SYM_MASK(SendCtrl, SDmaSingleDescriptor)) + +static int sendctrl_hook(struct qib_devdata *dd, + const struct diag_observer *op, + u32 offs, u64 *data, u64 mask, int only_32) +{ + unsigned long flags; + unsigned idx = offs / sizeof(u64); + u64 local_data, all_bits; + + if (idx != kr_sendctrl) { + qib_dev_err(dd, "SendCtrl Hook called with offs %X, %s-bit\n", + offs, only_32 ? "32" : "64"); + return 0; + } + + all_bits = ~0ULL; + if (only_32) + all_bits >>= 32; + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if ((mask & all_bits) != all_bits) { + /* + * At least some mask bits are zero, so we need + * to read. The judgement call is whether from + * reg or shadow. First-cut: read reg, and complain + * if any bits which should be shadowed are different + * from their shadowed value. 
+ */ + if (only_32) + local_data = (u64)qib_read_kreg32(dd, idx); + else + local_data = qib_read_kreg64(dd, idx); + qib_dev_err(dd, "Sendctrl -> %X, Shad -> %X\n", + (u32)local_data, (u32)dd->sendctrl); + if ((local_data & SENDCTRL_SHADOWED) != + (dd->sendctrl & SENDCTRL_SHADOWED)) + qib_dev_err(dd, "Sendctrl read: %X shadow is %X\n", + (u32)local_data, (u32) dd->sendctrl); + *data = (local_data & ~mask) | (*data & mask); + } + if (mask) { + /* + * At least some mask bits are one, so we need + * to write, but only shadow some bits. + */ + u64 sval, tval; /* Shadowed, transient */ + + /* + * New shadow val is bits we don't want to touch, + * ORed with bits we do, that are intended for shadow. + */ + sval = (dd->sendctrl & ~mask); + sval |= *data & SENDCTRL_SHADOWED & mask; + dd->sendctrl = sval; + tval = sval | (*data & ~SENDCTRL_SHADOWED & mask); + qib_dev_err(dd, "Sendctrl <- %X, Shad <- %X\n", + (u32)tval, (u32)sval); + qib_write_kreg(dd, kr_sendctrl, tval); + qib_write_kreg(dd, kr_scratch, 0Ull); + } + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + return only_32 ? 4 : 8; +} + +static const struct diag_observer sendctrl_observer = { + sendctrl_hook, kr_sendctrl * sizeof(u64), + kr_sendctrl * sizeof(u64) +}; + +/* + * write the final few registers that depend on some of the + * init setup. Done late in init, just before bringing up + * the serdes. + */ +static int qib_late_7220_initreg(struct qib_devdata *dd) +{ + int ret = 0; + u64 val; + + qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize); + qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize); + qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt); + qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys); + val = qib_read_kreg64(dd, kr_sendpioavailaddr); + if (val != dd->pioavailregs_phys) { + qib_dev_err(dd, + "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n", + (unsigned long) dd->pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + } + qib_register_observer(dd, &sendctrl_observer); + return ret; +} + +static int qib_init_7220_variables(struct qib_devdata *dd) +{ + struct qib_chippport_specific *cpspec; + struct qib_pportdata *ppd; + int ret = 0; + u32 sbufs, updthresh; + + cpspec = (struct qib_chippport_specific *)(dd + 1); + ppd = &cpspec->pportdata; + dd->pport = ppd; + dd->num_pports = 1; + + dd->cspec = (struct qib_chip_specific *)(cpspec + dd->num_pports); + dd->cspec->dd = dd; + ppd->cpspec = cpspec; + + spin_lock_init(&dd->cspec->sdepb_lock); + spin_lock_init(&dd->cspec->rcvmod_lock); + spin_lock_init(&dd->cspec->gpio_lock); + + /* we haven't yet set QIB_PRESENT, so use read directly */ + dd->revision = readq(&dd->kregbase[kr_revision]); + + if ((dd->revision & 0xffffffffU) == 0xffffffffU) { + qib_dev_err(dd, + "Revision register read failure, giving up initialization\n"); + ret = -ENODEV; + goto bail; + } + dd->flags |= QIB_PRESENT; /* now register routines work */ + + dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMajor); + dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, + ChipRevMinor); + + get_7220_chip_params(dd); + qib_7220_boardname(dd); + + /* + * GPIO bits for TWSI data and clock, + * used for serial EEPROM. + */ + dd->gpio_sda_num = _QIB_GPIO_SDA_NUM; + dd->gpio_scl_num = _QIB_GPIO_SCL_NUM; + dd->twsi_eeprom_dev = QIB_TWSI_EEPROM_DEV; + + dd->flags |= QIB_HAS_INTX | QIB_HAS_LINK_LATENCY | + QIB_NODMA_RTAIL | QIB_HAS_THRESH_UPDATE; + dd->flags |= qib_special_trigger ? 
+ QIB_USE_SPCL_TRIG : QIB_HAS_SEND_DMA; + + init_waitqueue_head(&cpspec->autoneg_wait); + INIT_DELAYED_WORK(&cpspec->autoneg_work, autoneg_7220_work); + + ret = qib_init_pportdata(ppd, dd, 0, 1); + if (ret) + goto bail; + ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + ppd->link_speed_supported = QIB_IB_SDR | QIB_IB_DDR; + + ppd->link_width_enabled = ppd->link_width_supported; + ppd->link_speed_enabled = ppd->link_speed_supported; + /* + * Set the initial values to reasonable default, will be set + * for real when link is up. + */ + ppd->link_width_active = IB_WIDTH_4X; + ppd->link_speed_active = QIB_IB_SDR; + ppd->delay_mult = rate_to_delay[0][1]; + ppd->vls_supported = IB_VL_VL0; + ppd->vls_operational = ppd->vls_supported; + + if (!qib_mini_init) + qib_write_kreg(dd, kr_rcvbthqp, QIB_KD_QP); + + timer_setup(&ppd->cpspec->chase_timer, reenable_7220_chase, 0); + + qib_num_cfg_vls = 1; /* if any 7220's, only one VL */ + + dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE; + dd->rhf_offset = + dd->rcvhdrentsize - sizeof(u64) / sizeof(u32); + + /* we always allocate at least 2048 bytes for eager buffers */ + ret = ib_mtu_enum_to_int(qib_ibmtu); + dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU; + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); + + qib_7220_tidtemplate(dd); + + /* + * We can request a receive interrupt for 1 or + * more packets from current offset. For now, we set this + * up for a single packet. + */ + dd->rhdrhead_intr_off = 1ULL << 32; + + /* setup the stats timer; the add_timer is done at end of init */ + timer_setup(&dd->stats_timer, qib_get_7220_faststats, 0); + dd->stats_timer.expires = jiffies + ACTIVITY_TIMER * HZ; + + /* + * Control[4] has been added to change the arbitration within + * the SDMA engine between favoring data fetches over descriptor + * fetches. qib_sdma_fetch_arb==0 gives data fetches priority. + */ + if (qib_sdma_fetch_arb) + dd->control |= 1 << 4; + + dd->ureg_align = 0x10000; /* 64KB alignment */ + + dd->piosize2kmax_dwords = (dd->piosize2k >> 2)-1; + qib_7220_config_ctxts(dd); + qib_set_ctxtcnt(dd); /* needed for PAT setup */ + + ret = init_chip_wc_pat(dd, 0); + if (ret) + goto bail; + set_7220_baseaddrs(dd); /* set chip access pointers now */ + + ret = 0; + if (qib_mini_init) + goto bail; + + ret = qib_create_ctxts(dd); + init_7220_cntrnames(dd); + + /* use all of 4KB buffers for the kernel SDMA, zero if !SDMA. + * reserve the update threshold amount for other kernel use, such + * as sending SMI, MAD, and ACKs, or 3, whichever is greater, + * unless we aren't enabling SDMA, in which case we want to use + * all the 4k bufs for the kernel. + * if this was less than the update threshold, we could wait + * a long time for an update. Coded this way because we + * sometimes change the update threshold for various reasons, + * and we want this to remain robust. + */ + updthresh = 8U; /* update threshold */ + if (dd->flags & QIB_HAS_SEND_DMA) { + dd->cspec->sdmabufcnt = dd->piobcnt4k; + sbufs = updthresh > 3 ? 
updthresh : 3;
+ } else {
+ dd->cspec->sdmabufcnt = 0;
+ sbufs = dd->piobcnt4k;
+ }
+
+ dd->cspec->lastbuf_for_pio = dd->piobcnt2k + dd->piobcnt4k -
+ dd->cspec->sdmabufcnt;
+ dd->lastctxt_piobuf = dd->cspec->lastbuf_for_pio - sbufs;
+ dd->cspec->lastbuf_for_pio--; /* range is <= , not < */
+ dd->last_pio = dd->cspec->lastbuf_for_pio;
+ dd->pbufsctxt = dd->lastctxt_piobuf /
+ (dd->cfgctxts - dd->first_user_ctxt);
+
+ /*
+ * If we are at 16 user contexts, we will have only 7 sbufs
+ * per context, so drop the update threshold to match. We
+ * want to update before we actually run out, at low pbufs/ctxt
+ * so give ourselves some margin.
+ */
+ if ((dd->pbufsctxt - 2) < updthresh)
+ updthresh = dd->pbufsctxt - 2;
+
+ dd->cspec->updthresh_dflt = updthresh;
+ dd->cspec->updthresh = updthresh;
+
+ /* before full enable, no interrupts, no locking needed */
+ dd->sendctrl |= (updthresh & SYM_RMASK(SendCtrl, AvailUpdThld))
+ << SYM_LSB(SendCtrl, AvailUpdThld);
+
+ dd->psxmitwait_supported = 1;
+ dd->psxmitwait_check_rate = QIB_7220_PSXMITWAIT_CHECK_RATE;
+bail:
+ return ret;
+}
+
+static u32 __iomem *qib_7220_getsendbuf(struct qib_pportdata *ppd, u64 pbc,
+ u32 *pbufnum)
+{
+ u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK;
+ struct qib_devdata *dd = ppd->dd;
+ u32 __iomem *buf;
+
+ if (((pbc >> 32) & PBC_7220_VL15_SEND_CTRL) &&
+ !(ppd->lflags & (QIBL_IB_AUTONEG_INPROG | QIBL_LINKACTIVE)))
+ buf = get_7220_link_buf(ppd, pbufnum);
+ else {
+ if ((plen + 1) > dd->piosize2kmax_dwords)
+ first = dd->piobcnt2k;
+ else
+ first = 0;
+ /* try 4k if all 2k busy, so same last for both sizes */
+ last = dd->cspec->lastbuf_for_pio;
+ buf = qib_getsendbuf_range(dd, pbufnum, first, last);
+ }
+ return buf;
+}
+
+/* these 2 "counters" are really control registers, and are always RW */
+static void qib_set_cntr_7220_sample(struct qib_pportdata *ppd, u32 intv,
+ u32 start)
+{
+ write_7220_creg(ppd->dd, cr_psinterval, intv);
+ write_7220_creg(ppd->dd, cr_psstart, start);
+}
+
+/*
+ * NOTE: no real attempt is made to generalize the SDMA stuff.
+ * At some point "soon" we will have a new, more generalized
+ * set of SDMA interfaces, and then we'll clean this up.
+ */ + +/* Must be called with sdma_lock held, or before init finished */ +static void qib_sdma_update_7220_tail(struct qib_pportdata *ppd, u16 tail) +{ + /* Commit writes to memory and advance the tail on the chip */ + wmb(); + ppd->sdma_descq_tail = tail; + qib_write_kreg(ppd->dd, kr_senddmatail, tail); +} + +static void qib_sdma_set_7220_desc_cnt(struct qib_pportdata *ppd, unsigned cnt) +{ +} + +static struct sdma_set_state_action sdma_7220_action_table[] = { + [qib_sdma_state_s00_hw_down] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .go_s99_running_tofalse = 1, + }, + [qib_sdma_state_s10_hw_start_up_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s20_idle] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s30_sw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + }, + [qib_sdma_state_s40_hw_clean_up_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s50_hw_halt_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + }, + [qib_sdma_state_s99_running] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 0, + .go_s99_running_totrue = 1, + }, +}; + +static void qib_7220_sdma_init_early(struct qib_pportdata *ppd) +{ + ppd->sdma_state.set_state_action = sdma_7220_action_table; +} + +static int init_sdma_7220_regs(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + unsigned i, n; + u64 senddmabufmask[3] = { 0 }; + + /* Set SendDmaBase */ + qib_write_kreg(dd, kr_senddmabase, ppd->sdma_descq_phys); + qib_sdma_7220_setlengen(ppd); + qib_sdma_update_7220_tail(ppd, 0); /* Set SendDmaTail */ + /* Set SendDmaHeadAddr */ + qib_write_kreg(dd, kr_senddmaheadaddr, ppd->sdma_head_phys); + + /* + * Reserve all the former "kernel" piobufs, using high number range + * so we get as many 4K buffers as possible + */ + n = dd->piobcnt2k + dd->piobcnt4k; + i = n - dd->cspec->sdmabufcnt; + + for (; i < n; ++i) { + unsigned word = i / 64; + unsigned bit = i & 63; + + BUG_ON(word >= 3); + senddmabufmask[word] |= 1ULL << bit; + } + qib_write_kreg(dd, kr_senddmabufmask0, senddmabufmask[0]); + qib_write_kreg(dd, kr_senddmabufmask1, senddmabufmask[1]); + qib_write_kreg(dd, kr_senddmabufmask2, senddmabufmask[2]); + + ppd->sdma_state.first_sendbuf = i; + ppd->sdma_state.last_sendbuf = n; + + return 0; +} + +/* sdma_lock must be held */ +static u16 qib_sdma_7220_gethead(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + int sane; + int use_dmahead; + u16 swhead; + u16 swtail; + u16 cnt; + u16 hwhead; + + use_dmahead = __qib_sdma_running(ppd) && + (dd->flags & QIB_HAS_SDMA_TIMEOUT); +retry: + hwhead = use_dmahead ? 
+ (u16)le64_to_cpu(*ppd->sdma_head_dma) : + (u16)qib_read_kreg32(dd, kr_senddmahead); + + swhead = ppd->sdma_descq_head; + swtail = ppd->sdma_descq_tail; + cnt = ppd->sdma_descq_cnt; + + if (swhead < swtail) { + /* not wrapped */ + sane = (hwhead >= swhead) & (hwhead <= swtail); + } else if (swhead > swtail) { + /* wrapped around */ + sane = ((hwhead >= swhead) && (hwhead < cnt)) || + (hwhead <= swtail); + } else { + /* empty */ + sane = (hwhead == swhead); + } + + if (unlikely(!sane)) { + if (use_dmahead) { + /* try one more time, directly from the register */ + use_dmahead = 0; + goto retry; + } + /* assume no progress */ + hwhead = swhead; + } + + return hwhead; +} + +static int qib_sdma_7220_busy(struct qib_pportdata *ppd) +{ + u64 hwstatus = qib_read_kreg64(ppd->dd, kr_senddmastatus); + + return (hwstatus & SYM_MASK(SendDmaStatus, ScoreBoardDrainInProg)) || + (hwstatus & SYM_MASK(SendDmaStatus, AbortInProg)) || + (hwstatus & SYM_MASK(SendDmaStatus, InternalSDmaEnable)) || + !(hwstatus & SYM_MASK(SendDmaStatus, ScbEmpty)); +} + +/* + * Compute the amount of delay before sending the next packet if the + * port's send rate differs from the static rate set for the QP. + * Since the delay affects this packet but the amount of the delay is + * based on the length of the previous packet, use the last delay computed + * and save the delay count for this packet to be used next time + * we get here. + */ +static u32 qib_7220_setpbc_control(struct qib_pportdata *ppd, u32 plen, + u8 srate, u8 vl) +{ + u8 snd_mult = ppd->delay_mult; + u8 rcv_mult = ib_rate_to_delay[srate]; + u32 ret = ppd->cpspec->last_delay_mult; + + ppd->cpspec->last_delay_mult = (rcv_mult > snd_mult) ? + (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0; + + /* Indicate VL15, if necessary */ + if (vl == 15) + ret |= PBC_7220_VL15_SEND_CTRL; + return ret; +} + +static void qib_7220_initvl15_bufs(struct qib_devdata *dd) +{ +} + +static void qib_7220_init_ctxt(struct qib_ctxtdata *rcd) +{ + if (!rcd->ctxt) { + rcd->rcvegrcnt = IBA7220_KRCVEGRCNT; + rcd->rcvegr_tid_base = 0; + } else { + rcd->rcvegrcnt = rcd->dd->cspec->rcvegrcnt; + rcd->rcvegr_tid_base = IBA7220_KRCVEGRCNT + + (rcd->ctxt - 1) * rcd->rcvegrcnt; + } +} + +static void qib_7220_txchk_change(struct qib_devdata *dd, u32 start, + u32 len, u32 which, struct qib_ctxtdata *rcd) +{ + int i; + unsigned long flags; + + switch (which) { + case TXCHK_CHG_TYPE_KERN: + /* see if we need to raise avail update threshold */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (i = dd->first_user_ctxt; + dd->cspec->updthresh != dd->cspec->updthresh_dflt + && i < dd->cfgctxts; i++) + if (dd->rcd[i] && dd->rcd[i]->subctxt_cnt && + ((dd->rcd[i]->piocnt / dd->rcd[i]->subctxt_cnt) - 1) + < dd->cspec->updthresh_dflt) + break; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + if (i == dd->cfgctxts) { + spin_lock_irqsave(&dd->sendctrl_lock, flags); + dd->cspec->updthresh = dd->cspec->updthresh_dflt; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= (dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) << + SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } + break; + case TXCHK_CHG_TYPE_USER: + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (rcd && rcd->subctxt_cnt && ((rcd->piocnt + / rcd->subctxt_cnt) - 1) < dd->cspec->updthresh) { + dd->cspec->updthresh = (rcd->piocnt / + rcd->subctxt_cnt) - 1; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= 
(dd->cspec->updthresh &
+ SYM_RMASK(SendCtrl, AvailUpdThld))
+ << SYM_LSB(SendCtrl, AvailUpdThld);
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+ sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+ } else
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+ break;
+ }
+}
+
+static void writescratch(struct qib_devdata *dd, u32 val)
+{
+ qib_write_kreg(dd, kr_scratch, val);
+}
+
+#define VALID_TS_RD_REG_MASK 0xBF
+/**
+ * qib_7220_tempsense_rd - read register of temp sensor via TWSI
+ * @dd: the qlogic_ib device
+ * @regnum: register to read from
+ *
+ * returns reg contents (0..255) or < 0 for error
+ */
+static int qib_7220_tempsense_rd(struct qib_devdata *dd, int regnum)
+{
+ int ret;
+ u8 rdata;
+
+ if (regnum > 7) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* return a bogus value for (the one) register we do not have */
+ if (!((1 << regnum) & VALID_TS_RD_REG_MASK)) {
+ ret = 0;
+ goto bail;
+ }
+
+ ret = mutex_lock_interruptible(&dd->eep_lock);
+ if (ret)
+ goto bail;
+
+ ret = qib_twsi_blk_rd(dd, QIB_TWSI_TEMP_DEV, regnum, &rdata, 1);
+ if (!ret)
+ ret = rdata;
+
+ mutex_unlock(&dd->eep_lock);
+
+ /*
+ * There are three possibilities here:
+ * ret is actual value (0..255)
+ * ret is -ENXIO or -EINVAL from twsi code or this file
+ * ret is -EINTR from mutex_lock_interruptible.
+ */
+bail:
+ return ret;
+}
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+static int qib_7220_notify_dca(struct qib_devdata *dd, unsigned long event)
+{
+ return 0;
+}
+#endif
+
+/* Dummy function, as 7220 boards never disable EEPROM Write */
+static int qib_7220_eeprom_wen(struct qib_devdata *dd, int wen)
+{
+ return 1;
+}
+
+/**
+ * qib_init_iba7220_funcs - set up the chip-specific function pointers
+ * @pdev: the pci_dev for qlogic_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */ +struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct qib_devdata *dd; + int ret; + u32 boardid, minwidth; + + dd = qib_alloc_devdata(pdev, sizeof(struct qib_chip_specific) + + sizeof(struct qib_chippport_specific)); + if (IS_ERR(dd)) + goto bail; + + dd->f_bringup_serdes = qib_7220_bringup_serdes; + dd->f_cleanup = qib_setup_7220_cleanup; + dd->f_clear_tids = qib_7220_clear_tids; + dd->f_free_irq = qib_free_irq; + dd->f_get_base_info = qib_7220_get_base_info; + dd->f_get_msgheader = qib_7220_get_msgheader; + dd->f_getsendbuf = qib_7220_getsendbuf; + dd->f_gpio_mod = gpio_7220_mod; + dd->f_eeprom_wen = qib_7220_eeprom_wen; + dd->f_hdrqempty = qib_7220_hdrqempty; + dd->f_ib_updown = qib_7220_ib_updown; + dd->f_init_ctxt = qib_7220_init_ctxt; + dd->f_initvl15_bufs = qib_7220_initvl15_bufs; + dd->f_intr_fallback = qib_7220_intr_fallback; + dd->f_late_initreg = qib_late_7220_initreg; + dd->f_setpbc_control = qib_7220_setpbc_control; + dd->f_portcntr = qib_portcntr_7220; + dd->f_put_tid = qib_7220_put_tid; + dd->f_quiet_serdes = qib_7220_quiet_serdes; + dd->f_rcvctrl = rcvctrl_7220_mod; + dd->f_read_cntrs = qib_read_7220cntrs; + dd->f_read_portcntrs = qib_read_7220portcntrs; + dd->f_reset = qib_setup_7220_reset; + dd->f_init_sdma_regs = init_sdma_7220_regs; + dd->f_sdma_busy = qib_sdma_7220_busy; + dd->f_sdma_gethead = qib_sdma_7220_gethead; + dd->f_sdma_sendctrl = qib_7220_sdma_sendctrl; + dd->f_sdma_set_desc_cnt = qib_sdma_set_7220_desc_cnt; + dd->f_sdma_update_tail = qib_sdma_update_7220_tail; + dd->f_sdma_hw_clean_up = qib_7220_sdma_hw_clean_up; + dd->f_sdma_hw_start_up = qib_7220_sdma_hw_start_up; + dd->f_sdma_init_early = qib_7220_sdma_init_early; + dd->f_sendctrl = sendctrl_7220_mod; + dd->f_set_armlaunch = qib_set_7220_armlaunch; + dd->f_set_cntr_sample = qib_set_cntr_7220_sample; + dd->f_iblink_state = qib_7220_iblink_state; + dd->f_ibphys_portstate = qib_7220_phys_portstate; + dd->f_get_ib_cfg = qib_7220_get_ib_cfg; + dd->f_set_ib_cfg = qib_7220_set_ib_cfg; + dd->f_set_ib_loopback = qib_7220_set_loopback; + dd->f_set_intr_state = qib_7220_set_intr_state; + dd->f_setextled = qib_setup_7220_setextled; + dd->f_txchk_change = qib_7220_txchk_change; + dd->f_update_usrhead = qib_update_7220_usrhead; + dd->f_wantpiobuf_intr = qib_wantpiobuf_7220_intr; + dd->f_xgxs_reset = qib_7220_xgxs_reset; + dd->f_writescratch = writescratch; + dd->f_tempsense_rd = qib_7220_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_7220_notify_dca; +#endif + /* + * Do remaining pcie setup and save pcie values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped, but chip registers + * are not set up until start of qib_init_7220_variables. 
+ */ + ret = qib_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* initialize chip-specific variables */ + ret = qib_init_7220_variables(dd); + if (ret) + goto bail_cleanup; + + if (qib_mini_init) + goto bail; + + boardid = SYM_FIELD(dd->revision, Revision, + BoardID); + switch (boardid) { + case 0: + case 2: + case 10: + case 12: + minwidth = 16; /* x16 capable boards */ + break; + default: + minwidth = 8; /* x8 capable boards */ + break; + } + if (qib_pcie_params(dd, minwidth, NULL)) + qib_dev_err(dd, + "Failed to setup PCIe or interrupts; continuing anyway\n"); + + if (qib_read_kreg64(dd, kr_hwerrstatus) & + QLOGIC_IB_HWE_SERDESPLLFAILED) + qib_write_kreg(dd, kr_hwerrclear, + QLOGIC_IB_HWE_SERDESPLLFAILED); + + /* setup interrupt handler (interrupt type handled above) */ + qib_setup_7220_interrupt(dd); + qib_7220_init_hwerrors(dd); + + /* clear diagctrl register, in case diags were running and crashed */ + qib_write_kreg(dd, kr_hwdiagctrl, 0); + + goto bail; + +bail_cleanup: + qib_pcie_ddcleanup(dd); +bail_free: + qib_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c new file mode 100644 index 0000000..8414ae4 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -0,0 +1,8506 @@ +/* + * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2008 - 2012 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/* + * This file contains all of the code that is specific to the + * InfiniPath 7322 chip + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_INFINIBAND_QIB_DCA +#include +#endif + +#include "qib.h" +#include "qib_7322_regs.h" +#include "qib_qsfp.h" + +#include "qib_mad.h" +#include "qib_verbs.h" + +#undef pr_fmt +#define pr_fmt(fmt) QIB_DRV_NAME " " fmt + +static void qib_setup_7322_setextled(struct qib_pportdata *, u32); +static void qib_7322_handle_hwerrors(struct qib_devdata *, char *, size_t); +static void sendctrl_7322_mod(struct qib_pportdata *ppd, u32 op); +static irqreturn_t qib_7322intr(int irq, void *data); +static irqreturn_t qib_7322bufavail(int irq, void *data); +static irqreturn_t sdma_intr(int irq, void *data); +static irqreturn_t sdma_idle_intr(int irq, void *data); +static irqreturn_t sdma_progress_intr(int irq, void *data); +static irqreturn_t sdma_cleanup_intr(int irq, void *data); +static void qib_7322_txchk_change(struct qib_devdata *, u32, u32, u32, + struct qib_ctxtdata *rcd); +static u8 qib_7322_phys_portstate(u64); +static u32 qib_7322_iblink_state(u64); +static void qib_set_ib_7322_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd); +static void force_h1(struct qib_pportdata *); +static void adj_tx_serdes(struct qib_pportdata *); +static u32 qib_7322_setpbc_control(struct qib_pportdata *, u32, u8, u8); +static void qib_7322_mini_pcs_reset(struct qib_pportdata *); + +static u32 ahb_mod(struct qib_devdata *, int, int, int, u32, u32); +static void ibsd_wr_allchans(struct qib_pportdata *, int, unsigned, unsigned); +static void serdes_7322_los_enable(struct qib_pportdata *, int); +static int serdes_7322_init_old(struct qib_pportdata *); +static int serdes_7322_init_new(struct qib_pportdata *); +static void dump_sdma_7322_state(struct qib_pportdata *); + +#define BMASK(msb, lsb) (((1 << ((msb) + 1 - (lsb))) - 1) << (lsb)) + +/* LE2 serdes values for different cases */ +#define LE2_DEFAULT 5 +#define LE2_5m 4 +#define LE2_QME 0 + +/* Below is special-purpose, so only really works for the IB SerDes blocks. */ +#define IBSD(hw_pidx) (hw_pidx + 2) + +/* these are variables for documentation and experimentation purposes */ +static const unsigned rcv_int_timeout = 375; +static const unsigned rcv_int_count = 16; +static const unsigned sdma_idle_cnt = 64; + +/* Time to stop altering Rx Equalization parameters, after link up. */ +#define RXEQ_DISABLE_MSECS 2500 + +/* + * Number of VLs we are configured to use (to allow for more + * credits per vl, etc.) 
+ */ +ushort qib_num_cfg_vls = 2; +module_param_named(num_vls, qib_num_cfg_vls, ushort, S_IRUGO); +MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)"); + +static ushort qib_chase = 1; +module_param_named(chase, qib_chase, ushort, S_IRUGO); +MODULE_PARM_DESC(chase, "Enable state chase handling"); + +static ushort qib_long_atten = 10; /* 10 dB ~= 5m length */ +module_param_named(long_attenuation, qib_long_atten, ushort, S_IRUGO); +MODULE_PARM_DESC(long_attenuation, + "attenuation cutoff (dB) for long copper cable setup"); + +static ushort qib_singleport; +module_param_named(singleport, qib_singleport, ushort, S_IRUGO); +MODULE_PARM_DESC(singleport, "Use only IB port 1; more per-port buffer space"); + +static ushort qib_krcvq01_no_msi; +module_param_named(krcvq01_no_msi, qib_krcvq01_no_msi, ushort, S_IRUGO); +MODULE_PARM_DESC(krcvq01_no_msi, "No MSI for kctx < 2"); + +/* + * Receive header queue sizes + */ +static unsigned qib_rcvhdrcnt; +module_param_named(rcvhdrcnt, qib_rcvhdrcnt, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrcnt, "receive header count"); + +static unsigned qib_rcvhdrsize; +module_param_named(rcvhdrsize, qib_rcvhdrsize, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrsize, "receive header size in 32-bit words"); + +static unsigned qib_rcvhdrentsize; +module_param_named(rcvhdrentsize, qib_rcvhdrentsize, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrentsize, "receive header entry size in 32-bit words"); + +#define MAX_ATTEN_LEN 64 /* plenty for any real system */ +/* for read back, default index is ~5m copper cable */ +static char txselect_list[MAX_ATTEN_LEN] = "10"; +static struct kparam_string kp_txselect = { + .string = txselect_list, + .maxlen = MAX_ATTEN_LEN +}; +static int setup_txselect(const char *, const struct kernel_param *); +module_param_call(txselect, setup_txselect, param_get_string, + &kp_txselect, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(txselect, + "Tx serdes indices (for no QSFP or invalid QSFP data)"); + +#define BOARD_QME7342 5 +#define BOARD_QMH7342 6 +#define BOARD_QMH7360 9 +#define IS_QMH(dd) (SYM_FIELD((dd)->revision, Revision, BoardID) == \ + BOARD_QMH7342) +#define IS_QME(dd) (SYM_FIELD((dd)->revision, Revision, BoardID) == \ + BOARD_QME7342) + +#define KREG_IDX(regname) (QIB_7322_##regname##_OFFS / sizeof(u64)) + +#define KREG_IBPORT_IDX(regname) ((QIB_7322_##regname##_0_OFFS / sizeof(u64))) + +#define MASK_ACROSS(lsb, msb) \ + (((1ULL << ((msb) + 1 - (lsb))) - 1) << (lsb)) + +#define SYM_RMASK(regname, fldname) ((u64) \ + QIB_7322_##regname##_##fldname##_RMASK) + +#define SYM_MASK(regname, fldname) ((u64) \ + QIB_7322_##regname##_##fldname##_RMASK << \ + QIB_7322_##regname##_##fldname##_LSB) + +#define SYM_FIELD(value, regname, fldname) ((u64) \ + (((value) >> SYM_LSB(regname, fldname)) & \ + SYM_RMASK(regname, fldname))) + +/* useful for things like LaFifoEmpty_0...7, TxCreditOK_0...7, etc. 
*/ +#define SYM_FIELD_ACROSS(value, regname, fldname, nbits) \ + (((value) >> SYM_LSB(regname, fldname)) & MASK_ACROSS(0, nbits)) + +#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask) +#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask) +#define ERR_MASK_N(fldname) SYM_MASK(ErrMask_0, fldname##Mask) +#define INT_MASK(fldname) SYM_MASK(IntMask, fldname##IntMask) +#define INT_MASK_P(fldname, port) SYM_MASK(IntMask, fldname##IntMask##_##port) +/* Below because most, but not all, fields of IntMask have that full suffix */ +#define INT_MASK_PM(fldname, port) SYM_MASK(IntMask, fldname##Mask##_##port) + + +#define SYM_LSB(regname, fldname) (QIB_7322_##regname##_##fldname##_LSB) + +/* + * the size bits give us 2^N, in KB units. 0 marks as invalid, + * and 7 is reserved. We currently use only 2KB and 4KB + */ +#define IBA7322_TID_SZ_SHIFT QIB_7322_RcvTIDArray0_RT_BufSize_LSB +#define IBA7322_TID_SZ_2K (1UL<kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(regno + (u64 __iomem *)( + (dd->ureg_align * ctxt) + (dd->userbase ? + (char __iomem *)dd->userbase : + (char __iomem *)dd->kregbase + dd->uregbase))); +} + +/** + * qib_read_ureg - read virtualized per-context register + * @dd: device + * @regno: register number + * @ctxt: context number + * + * Return the contents of a register that is virtualized to be per context. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u64 qib_read_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, int ctxt) +{ + + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(regno + (u64 __iomem *)( + (dd->ureg_align * ctxt) + (dd->userbase ? + (char __iomem *)dd->userbase : + (char __iomem *)dd->kregbase + dd->uregbase))); +} + +/** + * qib_write_ureg - write virtualized per-context register + * @dd: device + * @regno: register number + * @value: value + * @ctxt: context + * + * Write the contents of a register that is virtualized to be per context. + */ +static inline void qib_write_ureg(const struct qib_devdata *dd, + enum qib_ureg regno, u64 value, int ctxt) +{ + u64 __iomem *ubase; + + if (dd->userbase) + ubase = (u64 __iomem *) + ((char __iomem *) dd->userbase + + dd->ureg_align * ctxt); + else + ubase = (u64 __iomem *) + (dd->uregbase + + (char __iomem *) dd->kregbase + + dd->ureg_align * ctxt); + + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &ubase[regno]); +} + +static inline u32 qib_read_kreg32(const struct qib_devdata *dd, + const u32 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readl((u32 __iomem *) &dd->kregbase[regno]); +} + +static inline u64 qib_read_kreg64(const struct qib_devdata *dd, + const u32 regno) +{ + if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) + return -1; + return readq(&dd->kregbase[regno]); +} + +static inline void qib_write_kreg(const struct qib_devdata *dd, + const u32 regno, u64 value) +{ + if (dd->kregbase && (dd->flags & QIB_PRESENT)) + writeq(value, &dd->kregbase[regno]); +} + +/* + * not many sanity checks for the port-specific kernel register routines, + * since they are only used when it's known to be safe. 
+*/ +static inline u64 qib_read_kreg_port(const struct qib_pportdata *ppd, + const u16 regno) +{ + if (!ppd->cpspec->kpregbase || !(ppd->dd->flags & QIB_PRESENT)) + return 0ULL; + return readq(&ppd->cpspec->kpregbase[regno]); +} + +static inline void qib_write_kreg_port(const struct qib_pportdata *ppd, + const u16 regno, u64 value) +{ + if (ppd->cpspec && ppd->dd && ppd->cpspec->kpregbase && + (ppd->dd->flags & QIB_PRESENT)) + writeq(value, &ppd->cpspec->kpregbase[regno]); +} + +/** + * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register + * @dd: the qlogic_ib device + * @regno: the register number to write + * @ctxt: the context containing the register + * @value: the value to write + */ +static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd, + const u16 regno, unsigned ctxt, + u64 value) +{ + qib_write_kreg(dd, regno + ctxt, value); +} + +static inline u64 read_7322_creg(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readq(&dd->cspec->cregbase[regno]); + + +} + +static inline u32 read_7322_creg32(const struct qib_devdata *dd, u16 regno) +{ + if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT)) + return 0; + return readl(&dd->cspec->cregbase[regno]); + + +} + +static inline void write_7322_creg_port(const struct qib_pportdata *ppd, + u16 regno, u64 value) +{ + if (ppd->cpspec && ppd->cpspec->cpregbase && + (ppd->dd->flags & QIB_PRESENT)) + writeq(value, &ppd->cpspec->cpregbase[regno]); +} + +static inline u64 read_7322_creg_port(const struct qib_pportdata *ppd, + u16 regno) +{ + if (!ppd->cpspec || !ppd->cpspec->cpregbase || + !(ppd->dd->flags & QIB_PRESENT)) + return 0; + return readq(&ppd->cpspec->cpregbase[regno]); +} + +static inline u32 read_7322_creg32_port(const struct qib_pportdata *ppd, + u16 regno) +{ + if (!ppd->cpspec || !ppd->cpspec->cpregbase || + !(ppd->dd->flags & QIB_PRESENT)) + return 0; + return readl(&ppd->cpspec->cpregbase[regno]); +} + +/* bits in Control register */ +#define QLOGIC_IB_C_RESET SYM_MASK(Control, SyncReset) +#define QLOGIC_IB_C_SDMAFETCHPRIOEN SYM_MASK(Control, SDmaDescFetchPriorityEn) + +/* bits in general interrupt regs */ +#define QIB_I_RCVURG_LSB SYM_LSB(IntMask, RcvUrg0IntMask) +#define QIB_I_RCVURG_RMASK MASK_ACROSS(0, 17) +#define QIB_I_RCVURG_MASK (QIB_I_RCVURG_RMASK << QIB_I_RCVURG_LSB) +#define QIB_I_RCVAVAIL_LSB SYM_LSB(IntMask, RcvAvail0IntMask) +#define QIB_I_RCVAVAIL_RMASK MASK_ACROSS(0, 17) +#define QIB_I_RCVAVAIL_MASK (QIB_I_RCVAVAIL_RMASK << QIB_I_RCVAVAIL_LSB) +#define QIB_I_C_ERROR INT_MASK(Err) + +#define QIB_I_SPIOSENT (INT_MASK_P(SendDone, 0) | INT_MASK_P(SendDone, 1)) +#define QIB_I_SPIOBUFAVAIL INT_MASK(SendBufAvail) +#define QIB_I_GPIO INT_MASK(AssertGPIO) +#define QIB_I_P_SDMAINT(pidx) \ + (INT_MASK_P(SDma, pidx) | INT_MASK_P(SDmaIdle, pidx) | \ + INT_MASK_P(SDmaProgress, pidx) | \ + INT_MASK_PM(SDmaCleanupDone, pidx)) + +/* Interrupt bits that are "per port" */ +#define QIB_I_P_BITSEXTANT(pidx) \ + (INT_MASK_P(Err, pidx) | INT_MASK_P(SendDone, pidx) | \ + INT_MASK_P(SDma, pidx) | INT_MASK_P(SDmaIdle, pidx) | \ + INT_MASK_P(SDmaProgress, pidx) | \ + INT_MASK_PM(SDmaCleanupDone, pidx)) + +/* Interrupt bits that are common to a device */ +/* currently unused: QIB_I_SPIOSENT */ +#define QIB_I_C_BITSEXTANT \ + (QIB_I_RCVURG_MASK | QIB_I_RCVAVAIL_MASK | \ + QIB_I_SPIOSENT | \ + QIB_I_C_ERROR | QIB_I_SPIOBUFAVAIL | QIB_I_GPIO) + +#define QIB_I_BITSEXTANT (QIB_I_C_BITSEXTANT | \ + QIB_I_P_BITSEXTANT(0) | 
QIB_I_P_BITSEXTANT(1)) + +/* + * Error bits that are "per port". + */ +#define QIB_E_P_IBSTATUSCHANGED ERR_MASK_N(IBStatusChanged) +#define QIB_E_P_SHDR ERR_MASK_N(SHeadersErr) +#define QIB_E_P_VL15_BUF_MISUSE ERR_MASK_N(VL15BufMisuseErr) +#define QIB_E_P_SND_BUF_MISUSE ERR_MASK_N(SendBufMisuseErr) +#define QIB_E_P_SUNSUPVL ERR_MASK_N(SendUnsupportedVLErr) +#define QIB_E_P_SUNEXP_PKTNUM ERR_MASK_N(SendUnexpectedPktNumErr) +#define QIB_E_P_SDROP_DATA ERR_MASK_N(SendDroppedDataPktErr) +#define QIB_E_P_SDROP_SMP ERR_MASK_N(SendDroppedSmpPktErr) +#define QIB_E_P_SPKTLEN ERR_MASK_N(SendPktLenErr) +#define QIB_E_P_SUNDERRUN ERR_MASK_N(SendUnderRunErr) +#define QIB_E_P_SMAXPKTLEN ERR_MASK_N(SendMaxPktLenErr) +#define QIB_E_P_SMINPKTLEN ERR_MASK_N(SendMinPktLenErr) +#define QIB_E_P_RIBLOSTLINK ERR_MASK_N(RcvIBLostLinkErr) +#define QIB_E_P_RHDR ERR_MASK_N(RcvHdrErr) +#define QIB_E_P_RHDRLEN ERR_MASK_N(RcvHdrLenErr) +#define QIB_E_P_RBADTID ERR_MASK_N(RcvBadTidErr) +#define QIB_E_P_RBADVERSION ERR_MASK_N(RcvBadVersionErr) +#define QIB_E_P_RIBFLOW ERR_MASK_N(RcvIBFlowErr) +#define QIB_E_P_REBP ERR_MASK_N(RcvEBPErr) +#define QIB_E_P_RUNSUPVL ERR_MASK_N(RcvUnsupportedVLErr) +#define QIB_E_P_RUNEXPCHAR ERR_MASK_N(RcvUnexpectedCharErr) +#define QIB_E_P_RSHORTPKTLEN ERR_MASK_N(RcvShortPktLenErr) +#define QIB_E_P_RLONGPKTLEN ERR_MASK_N(RcvLongPktLenErr) +#define QIB_E_P_RMAXPKTLEN ERR_MASK_N(RcvMaxPktLenErr) +#define QIB_E_P_RMINPKTLEN ERR_MASK_N(RcvMinPktLenErr) +#define QIB_E_P_RICRC ERR_MASK_N(RcvICRCErr) +#define QIB_E_P_RVCRC ERR_MASK_N(RcvVCRCErr) +#define QIB_E_P_RFORMATERR ERR_MASK_N(RcvFormatErr) + +#define QIB_E_P_SDMA1STDESC ERR_MASK_N(SDma1stDescErr) +#define QIB_E_P_SDMABASE ERR_MASK_N(SDmaBaseErr) +#define QIB_E_P_SDMADESCADDRMISALIGN ERR_MASK_N(SDmaDescAddrMisalignErr) +#define QIB_E_P_SDMADWEN ERR_MASK_N(SDmaDwEnErr) +#define QIB_E_P_SDMAGENMISMATCH ERR_MASK_N(SDmaGenMismatchErr) +#define QIB_E_P_SDMAHALT ERR_MASK_N(SDmaHaltErr) +#define QIB_E_P_SDMAMISSINGDW ERR_MASK_N(SDmaMissingDwErr) +#define QIB_E_P_SDMAOUTOFBOUND ERR_MASK_N(SDmaOutOfBoundErr) +#define QIB_E_P_SDMARPYTAG ERR_MASK_N(SDmaRpyTagErr) +#define QIB_E_P_SDMATAILOUTOFBOUND ERR_MASK_N(SDmaTailOutOfBoundErr) +#define QIB_E_P_SDMAUNEXPDATA ERR_MASK_N(SDmaUnexpDataErr) + +/* Error bits that are common to a device */ +#define QIB_E_RESET ERR_MASK(ResetNegated) +#define QIB_E_HARDWARE ERR_MASK(HardwareErr) +#define QIB_E_INVALIDADDR ERR_MASK(InvalidAddrErr) + + +/* + * Per chip (rather than per-port) errors. Most either do + * nothing but trigger a print (because they self-recover, or + * always occur in tandem with other errors that handle the + * issue), or because they indicate errors with no recovery, + * but we want to know that they happened. + */ +#define QIB_E_SBUF_VL15_MISUSE ERR_MASK(SBufVL15MisUseErr) +#define QIB_E_BADEEP ERR_MASK(InvalidEEPCmd) +#define QIB_E_VLMISMATCH ERR_MASK(SendVLMismatchErr) +#define QIB_E_ARMLAUNCH ERR_MASK(SendArmLaunchErr) +#define QIB_E_SPCLTRIG ERR_MASK(SendSpecialTriggerErr) +#define QIB_E_RRCVHDRFULL ERR_MASK(RcvHdrFullErr) +#define QIB_E_RRCVEGRFULL ERR_MASK(RcvEgrFullErr) +#define QIB_E_RCVCTXTSHARE ERR_MASK(RcvContextShareErr) + +/* SDMA chip errors (not per port) + * QIB_E_SDMA_BUF_DUP needs no special handling, because we will also get + * the SDMAHALT error immediately, so we just print the dup error via the + * E_AUTO mechanism. This is true of most of the per-port fatal errors + * as well, but since this is port-independent, by definition, it's + * handled a bit differently. 
SDMA_VL15 and SDMA_WRONG_PORT are per + * packet send errors, and so are handled in the same manner as other + * per-packet errors. + */ +#define QIB_E_SDMA_VL15 ERR_MASK(SDmaVL15Err) +#define QIB_E_SDMA_WRONG_PORT ERR_MASK(SDmaWrongPortErr) +#define QIB_E_SDMA_BUF_DUP ERR_MASK(SDmaBufMaskDuplicateErr) + +/* + * Below functionally equivalent to legacy QLOGIC_IB_E_PKTERRS + * it is used to print "common" packet errors. + */ +#define QIB_E_P_PKTERRS (QIB_E_P_SPKTLEN |\ + QIB_E_P_SDROP_DATA | QIB_E_P_RVCRC |\ + QIB_E_P_RICRC | QIB_E_P_RSHORTPKTLEN |\ + QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SHDR | \ + QIB_E_P_REBP) + +/* Error Bits that Packet-related (Receive, per-port) */ +#define QIB_E_P_RPKTERRS (\ + QIB_E_P_RHDRLEN | QIB_E_P_RBADTID | \ + QIB_E_P_RBADVERSION | QIB_E_P_RHDR | \ + QIB_E_P_RLONGPKTLEN | QIB_E_P_RSHORTPKTLEN |\ + QIB_E_P_RMAXPKTLEN | QIB_E_P_RMINPKTLEN | \ + QIB_E_P_RFORMATERR | QIB_E_P_RUNSUPVL | \ + QIB_E_P_RUNEXPCHAR | QIB_E_P_RIBFLOW | QIB_E_P_REBP) + +/* + * Error bits that are Send-related (per port) + * (ARMLAUNCH excluded from E_SPKTERRS because it gets special handling). + * All of these potentially need to have a buffer disarmed + */ +#define QIB_E_P_SPKTERRS (\ + QIB_E_P_SUNEXP_PKTNUM |\ + QIB_E_P_SDROP_DATA | QIB_E_P_SDROP_SMP |\ + QIB_E_P_SMAXPKTLEN |\ + QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SHDR | \ + QIB_E_P_SMINPKTLEN | QIB_E_P_SPKTLEN | \ + QIB_E_P_SND_BUF_MISUSE | QIB_E_P_SUNSUPVL) + +#define QIB_E_SPKTERRS ( \ + QIB_E_SBUF_VL15_MISUSE | QIB_E_VLMISMATCH | \ + ERR_MASK_N(SendUnsupportedVLErr) | \ + QIB_E_SPCLTRIG | QIB_E_SDMA_VL15 | QIB_E_SDMA_WRONG_PORT) + +#define QIB_E_P_SDMAERRS ( \ + QIB_E_P_SDMAHALT | \ + QIB_E_P_SDMADESCADDRMISALIGN | \ + QIB_E_P_SDMAUNEXPDATA | \ + QIB_E_P_SDMAMISSINGDW | \ + QIB_E_P_SDMADWEN | \ + QIB_E_P_SDMARPYTAG | \ + QIB_E_P_SDMA1STDESC | \ + QIB_E_P_SDMABASE | \ + QIB_E_P_SDMATAILOUTOFBOUND | \ + QIB_E_P_SDMAOUTOFBOUND | \ + QIB_E_P_SDMAGENMISMATCH) + +/* + * This sets some bits more than once, but makes it more obvious which + * bits are not handled under other categories, and the repeat definition + * is not a problem. + */ +#define QIB_E_P_BITSEXTANT ( \ + QIB_E_P_SPKTERRS | QIB_E_P_PKTERRS | QIB_E_P_RPKTERRS | \ + QIB_E_P_RIBLOSTLINK | QIB_E_P_IBSTATUSCHANGED | \ + QIB_E_P_SND_BUF_MISUSE | QIB_E_P_SUNDERRUN | \ + QIB_E_P_SHDR | QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SDMAERRS \ + ) + +/* + * These are errors that can occur when the link + * changes state while a packet is being sent or received. This doesn't + * cover things like EBP or VCRC that can be the result of a sending + * having the link change state, so we receive a "known bad" packet. + * All of these are "per port", so renamed: + */ +#define QIB_E_P_LINK_PKTERRS (\ + QIB_E_P_SDROP_DATA | QIB_E_P_SDROP_SMP |\ + QIB_E_P_SMINPKTLEN | QIB_E_P_SPKTLEN |\ + QIB_E_P_RSHORTPKTLEN | QIB_E_P_RMINPKTLEN |\ + QIB_E_P_RUNEXPCHAR) + +/* + * This sets some bits more than once, but makes it more obvious which + * bits are not handled under other categories (such as QIB_E_SPKTERRS), + * and the repeat definition is not a problem. 
+ */ +#define QIB_E_C_BITSEXTANT (\ + QIB_E_HARDWARE | QIB_E_INVALIDADDR | QIB_E_BADEEP |\ + QIB_E_ARMLAUNCH | QIB_E_VLMISMATCH | QIB_E_RRCVHDRFULL |\ + QIB_E_RRCVEGRFULL | QIB_E_RESET | QIB_E_SBUF_VL15_MISUSE) + +/* Likewise Neuter E_SPKT_ERRS_IGNORE */ +#define E_SPKT_ERRS_IGNORE 0 + +#define QIB_EXTS_MEMBIST_DISABLED \ + SYM_MASK(EXTStatus, MemBISTDisabled) +#define QIB_EXTS_MEMBIST_ENDTEST \ + SYM_MASK(EXTStatus, MemBISTEndTest) + +#define QIB_E_SPIOARMLAUNCH \ + ERR_MASK(SendArmLaunchErr) + +#define IBA7322_IBCC_LINKINITCMD_MASK SYM_RMASK(IBCCtrlA_0, LinkInitCmd) +#define IBA7322_IBCC_LINKCMD_SHIFT SYM_LSB(IBCCtrlA_0, LinkCmd) + +/* + * IBTA_1_2 is set when multiple speeds are enabled (normal), + * and also if forced QDR (only QDR enabled). It's enabled for the + * forced QDR case so that scrambling will be enabled by the TS3 + * exchange, when supported by both sides of the link. + */ +#define IBA7322_IBC_IBTA_1_2_MASK SYM_MASK(IBCCtrlB_0, IB_ENHANCED_MODE) +#define IBA7322_IBC_MAX_SPEED_MASK SYM_MASK(IBCCtrlB_0, SD_SPEED) +#define IBA7322_IBC_SPEED_QDR SYM_MASK(IBCCtrlB_0, SD_SPEED_QDR) +#define IBA7322_IBC_SPEED_DDR SYM_MASK(IBCCtrlB_0, SD_SPEED_DDR) +#define IBA7322_IBC_SPEED_SDR SYM_MASK(IBCCtrlB_0, SD_SPEED_SDR) +#define IBA7322_IBC_SPEED_MASK (SYM_MASK(IBCCtrlB_0, SD_SPEED_SDR) | \ + SYM_MASK(IBCCtrlB_0, SD_SPEED_DDR) | SYM_MASK(IBCCtrlB_0, SD_SPEED_QDR)) +#define IBA7322_IBC_SPEED_LSB SYM_LSB(IBCCtrlB_0, SD_SPEED_SDR) + +#define IBA7322_LEDBLINK_OFF_SHIFT SYM_LSB(RcvPktLEDCnt_0, OFFperiod) +#define IBA7322_LEDBLINK_ON_SHIFT SYM_LSB(RcvPktLEDCnt_0, ONperiod) + +#define IBA7322_IBC_WIDTH_AUTONEG SYM_MASK(IBCCtrlB_0, IB_NUM_CHANNELS) +#define IBA7322_IBC_WIDTH_4X_ONLY (1<> \ + SYM_LSB(IBCCtrlB_0, HRTBT_ENB)) +#define IBA7322_IBC_HRTBT_LSB SYM_LSB(IBCCtrlB_0, HRTBT_ENB) + +#define IBA7322_REDIRECT_VEC_PER_REG 12 + +#define IBA7322_SENDCHK_PKEY SYM_MASK(SendCheckControl_0, PKey_En) +#define IBA7322_SENDCHK_BTHQP SYM_MASK(SendCheckControl_0, BTHQP_En) +#define IBA7322_SENDCHK_SLID SYM_MASK(SendCheckControl_0, SLID_En) +#define IBA7322_SENDCHK_RAW_IPV6 SYM_MASK(SendCheckControl_0, RawIPV6_En) +#define IBA7322_SENDCHK_MINSZ SYM_MASK(SendCheckControl_0, PacketTooSmall_En) + +#define AUTONEG_TRIES 3 /* sequential retries to negotiate DDR */ + +#define HWE_AUTO(fldname) { .mask = SYM_MASK(HwErrMask, fldname##Mask), \ + .msg = #fldname , .sz = sizeof(#fldname) } +#define HWE_AUTO_P(fldname, port) { .mask = SYM_MASK(HwErrMask, \ + fldname##Mask##_##port), .msg = #fldname , .sz = sizeof(#fldname) } +static const struct qib_hwerror_msgs qib_7322_hwerror_msgs[] = { + HWE_AUTO_P(IBSerdesPClkNotDetect, 1), + HWE_AUTO_P(IBSerdesPClkNotDetect, 0), + HWE_AUTO(PCIESerdesPClkNotDetect), + HWE_AUTO(PowerOnBISTFailed), + HWE_AUTO(TempsenseTholdReached), + HWE_AUTO(MemoryErr), + HWE_AUTO(PCIeBusParityErr), + HWE_AUTO(PcieCplTimeout), + HWE_AUTO(PciePoisonedTLP), + HWE_AUTO_P(SDmaMemReadErr, 1), + HWE_AUTO_P(SDmaMemReadErr, 0), + HWE_AUTO_P(IBCBusFromSPCParityErr, 1), + HWE_AUTO_P(IBCBusToSPCParityErr, 1), + HWE_AUTO_P(IBCBusFromSPCParityErr, 0), + HWE_AUTO(statusValidNoEop), + HWE_AUTO(LATriggered), + { .mask = 0, .sz = 0 } +}; + +#define E_AUTO(fldname) { .mask = SYM_MASK(ErrMask, fldname##Mask), \ + .msg = #fldname, .sz = sizeof(#fldname) } +#define E_P_AUTO(fldname) { .mask = SYM_MASK(ErrMask_0, fldname##Mask), \ + .msg = #fldname, .sz = sizeof(#fldname) } +static const struct qib_hwerror_msgs qib_7322error_msgs[] = { + E_AUTO(RcvEgrFullErr), + E_AUTO(RcvHdrFullErr), + E_AUTO(ResetNegated), + 
E_AUTO(HardwareErr), + E_AUTO(InvalidAddrErr), + E_AUTO(SDmaVL15Err), + E_AUTO(SBufVL15MisUseErr), + E_AUTO(InvalidEEPCmd), + E_AUTO(RcvContextShareErr), + E_AUTO(SendVLMismatchErr), + E_AUTO(SendArmLaunchErr), + E_AUTO(SendSpecialTriggerErr), + E_AUTO(SDmaWrongPortErr), + E_AUTO(SDmaBufMaskDuplicateErr), + { .mask = 0, .sz = 0 } +}; + +static const struct qib_hwerror_msgs qib_7322p_error_msgs[] = { + E_P_AUTO(IBStatusChanged), + E_P_AUTO(SHeadersErr), + E_P_AUTO(VL15BufMisuseErr), + /* + * SDmaHaltErr is not really an error, make it clearer; + */ + {.mask = SYM_MASK(ErrMask_0, SDmaHaltErrMask), .msg = "SDmaHalted", + .sz = 11}, + E_P_AUTO(SDmaDescAddrMisalignErr), + E_P_AUTO(SDmaUnexpDataErr), + E_P_AUTO(SDmaMissingDwErr), + E_P_AUTO(SDmaDwEnErr), + E_P_AUTO(SDmaRpyTagErr), + E_P_AUTO(SDma1stDescErr), + E_P_AUTO(SDmaBaseErr), + E_P_AUTO(SDmaTailOutOfBoundErr), + E_P_AUTO(SDmaOutOfBoundErr), + E_P_AUTO(SDmaGenMismatchErr), + E_P_AUTO(SendBufMisuseErr), + E_P_AUTO(SendUnsupportedVLErr), + E_P_AUTO(SendUnexpectedPktNumErr), + E_P_AUTO(SendDroppedDataPktErr), + E_P_AUTO(SendDroppedSmpPktErr), + E_P_AUTO(SendPktLenErr), + E_P_AUTO(SendUnderRunErr), + E_P_AUTO(SendMaxPktLenErr), + E_P_AUTO(SendMinPktLenErr), + E_P_AUTO(RcvIBLostLinkErr), + E_P_AUTO(RcvHdrErr), + E_P_AUTO(RcvHdrLenErr), + E_P_AUTO(RcvBadTidErr), + E_P_AUTO(RcvBadVersionErr), + E_P_AUTO(RcvIBFlowErr), + E_P_AUTO(RcvEBPErr), + E_P_AUTO(RcvUnsupportedVLErr), + E_P_AUTO(RcvUnexpectedCharErr), + E_P_AUTO(RcvShortPktLenErr), + E_P_AUTO(RcvLongPktLenErr), + E_P_AUTO(RcvMaxPktLenErr), + E_P_AUTO(RcvMinPktLenErr), + E_P_AUTO(RcvICRCErr), + E_P_AUTO(RcvVCRCErr), + E_P_AUTO(RcvFormatErr), + { .mask = 0, .sz = 0 } +}; + +/* + * Below generates "auto-message" for interrupts not specific to any port or + * context + */ +#define INTR_AUTO(fldname) { .mask = SYM_MASK(IntMask, fldname##Mask), \ + .msg = #fldname, .sz = sizeof(#fldname) } +/* Below generates "auto-message" for interrupts specific to a port */ +#define INTR_AUTO_P(fldname) { .mask = MASK_ACROSS(\ + SYM_LSB(IntMask, fldname##Mask##_0), \ + SYM_LSB(IntMask, fldname##Mask##_1)), \ + .msg = #fldname "_P", .sz = sizeof(#fldname "_P") } +/* For some reason, the SerDesTrimDone bits are reversed */ +#define INTR_AUTO_PI(fldname) { .mask = MASK_ACROSS(\ + SYM_LSB(IntMask, fldname##Mask##_1), \ + SYM_LSB(IntMask, fldname##Mask##_0)), \ + .msg = #fldname "_P", .sz = sizeof(#fldname "_P") } +/* + * Below generates "auto-message" for interrupts specific to a context, + * with ctxt-number appended + */ +#define INTR_AUTO_C(fldname) { .mask = MASK_ACROSS(\ + SYM_LSB(IntMask, fldname##0IntMask), \ + SYM_LSB(IntMask, fldname##17IntMask)), \ + .msg = #fldname "_C", .sz = sizeof(#fldname "_C") } + +#define TXSYMPTOM_AUTO_P(fldname) \ + { .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \ + .msg = #fldname, .sz = sizeof(#fldname) } +static const struct qib_hwerror_msgs hdrchk_msgs[] = { + TXSYMPTOM_AUTO_P(NonKeyPacket), + TXSYMPTOM_AUTO_P(GRHFail), + TXSYMPTOM_AUTO_P(PkeyFail), + TXSYMPTOM_AUTO_P(QPFail), + TXSYMPTOM_AUTO_P(SLIDFail), + TXSYMPTOM_AUTO_P(RawIPV6), + TXSYMPTOM_AUTO_P(PacketTooSmall), + { .mask = 0, .sz = 0 } +}; + +#define IBA7322_HDRHEAD_PKTINT_SHIFT 32 /* interrupt cnt in upper 32 bits */ + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used, + * because we don't need to force the update of pioavail + */ +static void qib_disarm_7322_senderrbufs(struct qib_pportdata *ppd) +{ + struct 
qib_devdata *dd = ppd->dd; + u32 i; + int any; + u32 piobcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + u32 regcnt = (piobcnt + BITS_PER_LONG - 1) / BITS_PER_LONG; + unsigned long sbuf[4]; + + /* + * It's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling. + */ + any = 0; + for (i = 0; i < regcnt; ++i) { + sbuf[i] = qib_read_kreg64(dd, kr_sendbuffererror + i); + if (sbuf[i]) { + any = 1; + qib_write_kreg(dd, kr_sendbuffererror + i, sbuf[i]); + } + } + + if (any) + qib_disarm_piobufs_set(dd, sbuf, piobcnt); +} + +/* No txe_recover yet, if ever */ + +/* No decode__errors yet */ +static void err_decode(char *msg, size_t len, u64 errs, + const struct qib_hwerror_msgs *msp) +{ + u64 these, lmask; + int took, multi, n = 0; + + while (errs && msp && msp->mask) { + multi = (msp->mask & (msp->mask - 1)); + while (errs & msp->mask) { + these = (errs & msp->mask); + lmask = (these & (these - 1)) ^ these; + if (len) { + if (n++) { + /* separate the strings */ + *msg++ = ','; + len--; + } + BUG_ON(!msp->sz); + /* msp->sz counts the nul */ + took = min_t(size_t, msp->sz - (size_t)1, len); + memcpy(msg, msp->msg, took); + len -= took; + msg += took; + if (len) + *msg = '\0'; + } + errs &= ~lmask; + if (len && multi) { + /* More than one bit this mask */ + int idx = -1; + + while (lmask & msp->mask) { + ++idx; + lmask >>= 1; + } + took = scnprintf(msg, len, "_%d", idx); + len -= took; + msg += took; + } + } + ++msp; + } + /* If some bits are left, show in hex. */ + if (len && errs) + snprintf(msg, len, "%sMORE:%llX", n ? "," : "", + (unsigned long long) errs); +} + +/* only called if r1 set */ +static void flush_fifo(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u32 __iomem *piobuf; + u32 bufn; + u32 *hdr; + u64 pbc; + const unsigned hdrwords = 7; + static struct ib_header ibhdr = { + .lrh[0] = cpu_to_be16(0xF000 | QIB_LRH_BTH), + .lrh[1] = IB_LID_PERMISSIVE, + .lrh[2] = cpu_to_be16(hdrwords + SIZE_OF_CRC), + .lrh[3] = IB_LID_PERMISSIVE, + .u.oth.bth[0] = cpu_to_be32( + (IB_OPCODE_UD_SEND_ONLY << 24) | QIB_DEFAULT_P_KEY), + .u.oth.bth[1] = cpu_to_be32(0), + .u.oth.bth[2] = cpu_to_be32(0), + .u.oth.u.ud.deth[0] = cpu_to_be32(0), + .u.oth.u.ud.deth[1] = cpu_to_be32(0), + }; + + /* + * Send a dummy VL15 packet to flush the launch FIFO. + * This will not actually be sent since the TxeBypassIbc bit is set. + */ + pbc = PBC_7322_VL15_SEND | + (((u64)ppd->hw_pidx) << (PBC_PORT_SEL_LSB + 32)) | + (hdrwords + SIZE_OF_CRC); + piobuf = qib_7322_getsendbuf(ppd, pbc, &bufn); + if (!piobuf) + return; + writeq(pbc, piobuf); + hdr = (u32 *) &ibhdr; + if (dd->flags & QIB_PIO_FLUSH_WC) { + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdr, hdrwords - 1); + qib_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords + 1); + qib_flush_wc(); + } else + qib_pio_copy(piobuf + 2, hdr, hdrwords); + qib_sendbuf_done(dd, bufn); +} + +/* + * This is called with interrupts disabled and sdma_lock held. 
+ */ +static void qib_7322_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op) +{ + struct qib_devdata *dd = ppd->dd; + u64 set_sendctrl = 0; + u64 clr_sendctrl = 0; + + if (op & QIB_SDMA_SENDCTRL_OP_ENABLE) + set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_INTENABLE) + set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaIntEnable); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaIntEnable); + + if (op & QIB_SDMA_SENDCTRL_OP_HALT) + set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaHalt); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaHalt); + + if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) + set_sendctrl |= SYM_MASK(SendCtrl_0, TxeBypassIbc) | + SYM_MASK(SendCtrl_0, TxeAbortIbc) | + SYM_MASK(SendCtrl_0, TxeDrainRmFifo); + else + clr_sendctrl |= SYM_MASK(SendCtrl_0, TxeBypassIbc) | + SYM_MASK(SendCtrl_0, TxeAbortIbc) | + SYM_MASK(SendCtrl_0, TxeDrainRmFifo); + + spin_lock(&dd->sendctrl_lock); + + /* If we are draining everything, block sends first */ + if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) { + ppd->p_sendctrl &= ~SYM_MASK(SendCtrl_0, SendEnable); + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + ppd->p_sendctrl |= set_sendctrl; + ppd->p_sendctrl &= ~clr_sendctrl; + + if (op & QIB_SDMA_SENDCTRL_OP_CLEANUP) + qib_write_kreg_port(ppd, krp_sendctrl, + ppd->p_sendctrl | + SYM_MASK(SendCtrl_0, SDmaCleanup)); + else + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + + if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) { + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, SendEnable); + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock(&dd->sendctrl_lock); + + if ((op & QIB_SDMA_SENDCTRL_OP_DRAIN) && ppd->dd->cspec->r1) + flush_fifo(ppd); +} + +static void qib_7322_sdma_hw_clean_up(struct qib_pportdata *ppd) +{ + __qib_sdma_process_event(ppd, qib_sdma_event_e50_hw_cleaned); +} + +static void qib_sdma_7322_setlengen(struct qib_pportdata *ppd) +{ + /* + * Set SendDmaLenGen and clear and set + * the MSB of the generation count to enable generation checking + * and load the internal generation counter. + */ + qib_write_kreg_port(ppd, krp_senddmalengen, ppd->sdma_descq_cnt); + qib_write_kreg_port(ppd, krp_senddmalengen, + ppd->sdma_descq_cnt | + (1ULL << QIB_7322_SendDmaLenGen_0_Generation_MSB)); +} + +/* + * Must be called with sdma_lock held, or before init finished. + */ +static void qib_sdma_update_7322_tail(struct qib_pportdata *ppd, u16 tail) +{ + /* Commit writes to memory and advance the tail on the chip */ + wmb(); + ppd->sdma_descq_tail = tail; + qib_write_kreg_port(ppd, krp_senddmatail, tail); +} + +/* + * This is called with interrupts disabled and sdma_lock held. + */ +static void qib_7322_sdma_hw_start_up(struct qib_pportdata *ppd) +{ + /* + * Drain all FIFOs. + * The hardware doesn't require this but we do it so that verbs + * and user applications don't wait for link active to send stale + * data. 
+ */ + sendctrl_7322_mod(ppd, QIB_SENDCTRL_FLUSH); + + qib_sdma_7322_setlengen(ppd); + qib_sdma_update_7322_tail(ppd, 0); /* Set SendDmaTail */ + ppd->sdma_head_dma[0] = 0; + qib_7322_sdma_sendctrl(ppd, + ppd->sdma_state.current_op | QIB_SDMA_SENDCTRL_OP_CLEANUP); +} + +#define DISABLES_SDMA ( \ + QIB_E_P_SDMAHALT | \ + QIB_E_P_SDMADESCADDRMISALIGN | \ + QIB_E_P_SDMAMISSINGDW | \ + QIB_E_P_SDMADWEN | \ + QIB_E_P_SDMARPYTAG | \ + QIB_E_P_SDMA1STDESC | \ + QIB_E_P_SDMABASE | \ + QIB_E_P_SDMATAILOUTOFBOUND | \ + QIB_E_P_SDMAOUTOFBOUND | \ + QIB_E_P_SDMAGENMISMATCH) + +static void sdma_7322_p_errors(struct qib_pportdata *ppd, u64 errs) +{ + unsigned long flags; + struct qib_devdata *dd = ppd->dd; + + errs &= QIB_E_P_SDMAERRS; + err_decode(ppd->cpspec->sdmamsgbuf, sizeof(ppd->cpspec->sdmamsgbuf), + errs, qib_7322p_error_msgs); + + if (errs & QIB_E_P_SDMAUNEXPDATA) + qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", dd->unit, + ppd->port); + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + if (errs != QIB_E_P_SDMAHALT) { + /* SDMA errors have QIB_E_P_SDMAHALT and another bit set */ + qib_dev_porterr(dd, ppd->port, + "SDMA %s 0x%016llx %s\n", + qib_sdma_state_names[ppd->sdma_state.current_state], + errs, ppd->cpspec->sdmamsgbuf); + dump_sdma_7322_state(ppd); + } + + switch (ppd->sdma_state.current_state) { + case qib_sdma_state_s00_hw_down: + break; + + case qib_sdma_state_s10_hw_start_up_wait: + if (errs & QIB_E_P_SDMAHALT) + __qib_sdma_process_event(ppd, + qib_sdma_event_e20_hw_started); + break; + + case qib_sdma_state_s20_idle: + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + if (errs & QIB_E_P_SDMAHALT) + __qib_sdma_process_event(ppd, + qib_sdma_event_e50_hw_cleaned); + break; + + case qib_sdma_state_s50_hw_halt_wait: + if (errs & QIB_E_P_SDMAHALT) + __qib_sdma_process_event(ppd, + qib_sdma_event_e60_hw_halted); + break; + + case qib_sdma_state_s99_running: + __qib_sdma_process_event(ppd, qib_sdma_event_e7322_err_halted); + __qib_sdma_process_event(ppd, qib_sdma_event_e60_hw_halted); + break; + } + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +/* + * handle per-device errors (not per-port errors) + */ +static noinline void handle_7322_errors(struct qib_devdata *dd) +{ + char *msg; + u64 iserr = 0; + u64 errs; + u64 mask; + + qib_stats.sps_errints++; + errs = qib_read_kreg64(dd, kr_errstatus); + if (!errs) { + qib_devinfo(dd->pcidev, + "device error interrupt, but no error bits set!\n"); + goto done; + } + + /* don't report errors that are masked */ + errs &= dd->cspec->errormask; + msg = dd->cspec->emsgbuf; + + /* do these first, they are most important */ + if (errs & QIB_E_HARDWARE) { + *msg = '\0'; + qib_7322_handle_hwerrors(dd, msg, sizeof(dd->cspec->emsgbuf)); + } + + if (errs & QIB_E_SPKTERRS) { + qib_disarm_7322_senderrbufs(dd->pport); + qib_stats.sps_txerrs++; + } else if (errs & QIB_E_INVALIDADDR) + qib_stats.sps_txerrs++; + else if (errs & QIB_E_ARMLAUNCH) { + qib_stats.sps_txerrs++; + qib_disarm_7322_senderrbufs(dd->pport); + } + qib_write_kreg(dd, kr_errclear, errs); + + /* + * The ones we mask off are handled specially below + * or above. Also mask SDMADISABLED by default as it + * is too chatty. + */ + mask = QIB_E_HARDWARE; + *msg = '\0'; + + err_decode(msg, sizeof(dd->cspec->emsgbuf), errs & ~mask, + qib_7322error_msgs); + + /* + * Getting reset is a tragedy for all ports. Mark the device + * _and_ the ports as "offline" in way meaningful to each. 
+ */ + if (errs & QIB_E_RESET) { + int pidx; + + qib_dev_err(dd, + "Got reset, requires re-init (unload and reload driver)\n"); + dd->flags &= ~QIB_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->devstatusp |= QIB_STATUS_HWERROR; + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + *dd->pport[pidx].statusp &= ~QIB_STATUS_IB_CONF; + } + + if (*msg && iserr) + qib_dev_err(dd, "%s error\n", msg); + + /* + * If there were hdrq or egrfull errors, wake up any processes + * waiting in poll. We used to try to check which contexts had + * the overflow, but given the cost of that and the chip reads + * to support it, it's better to just wake everybody up if we + * get an overflow; waiters can poll again if it's not them. + */ + if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) { + qib_handle_urcv(dd, ~0U); + if (errs & ERR_MASK(RcvEgrFullErr)) + qib_stats.sps_buffull++; + else + qib_stats.sps_hdrfull++; + } + +done: + return; +} + +static void qib_error_tasklet(unsigned long data) +{ + struct qib_devdata *dd = (struct qib_devdata *)data; + + handle_7322_errors(dd); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +static void reenable_chase(struct timer_list *t) +{ + struct qib_chippport_specific *cp = from_timer(cp, t, chase_timer); + struct qib_pportdata *ppd = cp->ppd; + + ppd->cpspec->chase_timer.expires = 0; + qib_set_ib_7322_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_POLL); +} + +static void disable_chase(struct qib_pportdata *ppd, unsigned long tnow, + u8 ibclt) +{ + ppd->cpspec->chase_end = 0; + + if (!qib_chase) + return; + + qib_set_ib_7322_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + ppd->cpspec->chase_timer.expires = jiffies + QIB_CHASE_DIS_TIME; + add_timer(&ppd->cpspec->chase_timer); +} + +static void handle_serdes_issues(struct qib_pportdata *ppd, u64 ibcst) +{ + u8 ibclt; + unsigned long tnow; + + ibclt = (u8)SYM_FIELD(ibcst, IBCStatusA_0, LinkTrainingState); + + /* + * Detect and handle the state chase issue, where we can + * get stuck if we are unlucky on timing on both sides of + * the link. If we are, we disable, set a timer, and + * then re-enable. 
+ */ + switch (ibclt) { + case IB_7322_LT_STATE_CFGRCVFCFG: + case IB_7322_LT_STATE_CFGWAITRMT: + case IB_7322_LT_STATE_TXREVLANES: + case IB_7322_LT_STATE_CFGENH: + tnow = jiffies; + if (ppd->cpspec->chase_end && + time_after(tnow, ppd->cpspec->chase_end)) + disable_chase(ppd, tnow, ibclt); + else if (!ppd->cpspec->chase_end) + ppd->cpspec->chase_end = tnow + QIB_CHASE_TIME; + break; + default: + ppd->cpspec->chase_end = 0; + break; + } + + if (((ibclt >= IB_7322_LT_STATE_CFGTEST && + ibclt <= IB_7322_LT_STATE_CFGWAITENH) || + ibclt == IB_7322_LT_STATE_LINKUP) && + (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR))) { + force_h1(ppd); + ppd->cpspec->qdr_reforce = 1; + if (!ppd->dd->cspec->r1) + serdes_7322_los_enable(ppd, 0); + } else if (ppd->cpspec->qdr_reforce && + (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) && + (ibclt == IB_7322_LT_STATE_CFGENH || + ibclt == IB_7322_LT_STATE_CFGIDLE || + ibclt == IB_7322_LT_STATE_LINKUP)) + force_h1(ppd); + + if ((IS_QMH(ppd->dd) || IS_QME(ppd->dd)) && + ppd->link_speed_enabled == QIB_IB_QDR && + (ibclt == IB_7322_LT_STATE_CFGTEST || + ibclt == IB_7322_LT_STATE_CFGENH || + (ibclt >= IB_7322_LT_STATE_POLLACTIVE && + ibclt <= IB_7322_LT_STATE_SLEEPQUIET))) + adj_tx_serdes(ppd); + + if (ibclt != IB_7322_LT_STATE_LINKUP) { + u8 ltstate = qib_7322_phys_portstate(ibcst); + u8 pibclt = (u8)SYM_FIELD(ppd->lastibcstat, IBCStatusA_0, + LinkTrainingState); + if (!ppd->dd->cspec->r1 && + pibclt == IB_7322_LT_STATE_LINKUP && + ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER && + ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN && + ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT && + ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE) + /* If the link went down (but no into recovery, + * turn LOS back on */ + serdes_7322_los_enable(ppd, 1); + if (!ppd->cpspec->qdr_dfe_on && + ibclt <= IB_7322_LT_STATE_SLEEPQUIET) { + ppd->cpspec->qdr_dfe_on = 1; + ppd->cpspec->qdr_dfe_time = 0; + /* On link down, reenable QDR adaptation */ + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? + QDR_STATIC_ADAPT_DOWN_R1 : + QDR_STATIC_ADAPT_DOWN); + pr_info( + "IB%u:%u re-enabled QDR adaptation ibclt %x\n", + ppd->dd->unit, ppd->port, ibclt); + } + } +} + +static int qib_7322_set_ib_cfg(struct qib_pportdata *, int, u32); + +/* + * This is per-pport error handling. + * will likely get it's own MSIx interrupt (one for each port, + * although just a single handler). 
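handle_serdes_issues() above applies a simple debounce to the transient link-training states: note when the port first entered such a state, and only if it is still there after QIB_CHASE_TIME force the link down and arm a timer to re-enable it. A minimal userspace sketch of that debounce follows; the tick values, struct and function names are assumptions, not the driver's symbols.

#include <stdbool.h>
#include <stdio.h>

/* Assumed value; the driver uses the jiffies-based QIB_CHASE_TIME. */
#define CHASE_TIME	20	/* ticks tolerated in a transient training state */

struct chase {
	unsigned long end;	/* 0 means "not currently chasing" */
};

/* Return true when the link has sat in a transient state too long. */
static bool chase_expired(struct chase *c, unsigned long now, bool transient)
{
	if (!transient) {
		c->end = 0;			/* left the suspect states: reset */
		return false;
	}
	if (!c->end) {
		c->end = now + CHASE_TIME;	/* first sighting: start the clock */
		return false;
	}
	return now > c->end;
}

int main(void)
{
	struct chase c = { 0 };
	unsigned long t;

	for (t = 0; t < 40; t++) {
		if (chase_expired(&c, t, true)) {
			printf("tick %lu: disable the link, re-enable on a timer\n", t);
			c.end = 0;	/* the driver clears this in disable_chase() */
			break;
		}
	}
	return 0;
}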
+ */ +static noinline void handle_7322_p_errors(struct qib_pportdata *ppd) +{ + char *msg; + u64 ignore_this_time = 0, iserr = 0, errs, fmask; + struct qib_devdata *dd = ppd->dd; + + /* do this as soon as possible */ + fmask = qib_read_kreg64(dd, kr_act_fmask); + if (!fmask) + check_7322_rxe_status(ppd); + + errs = qib_read_kreg_port(ppd, krp_errstatus); + if (!errs) + qib_devinfo(dd->pcidev, + "Port%d error interrupt, but no error bits set!\n", + ppd->port); + if (!fmask) + errs &= ~QIB_E_P_IBSTATUSCHANGED; + if (!errs) + goto done; + + msg = ppd->cpspec->epmsgbuf; + *msg = '\0'; + + if (errs & ~QIB_E_P_BITSEXTANT) { + err_decode(msg, sizeof(ppd->cpspec->epmsgbuf), + errs & ~QIB_E_P_BITSEXTANT, qib_7322p_error_msgs); + if (!*msg) + snprintf(msg, sizeof(ppd->cpspec->epmsgbuf), + "no others"); + qib_dev_porterr(dd, ppd->port, + "error interrupt with unknown errors 0x%016Lx set (and %s)\n", + (errs & ~QIB_E_P_BITSEXTANT), msg); + *msg = '\0'; + } + + if (errs & QIB_E_P_SHDR) { + u64 symptom; + + /* determine cause, then write to clear */ + symptom = qib_read_kreg_port(ppd, krp_sendhdrsymptom); + qib_write_kreg_port(ppd, krp_sendhdrsymptom, 0); + err_decode(msg, sizeof(ppd->cpspec->epmsgbuf), symptom, + hdrchk_msgs); + *msg = '\0'; + /* senderrbuf cleared in SPKTERRS below */ + } + + if (errs & QIB_E_P_SPKTERRS) { + if ((errs & QIB_E_P_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when trying to bring the link + * up, but the IB link changes state at the "wrong" + * time. The IB logic then complains that the packet + * isn't valid. We don't want to confuse people, so + * we just don't print them, except at debug + */ + err_decode(msg, sizeof(ppd->cpspec->epmsgbuf), + (errs & QIB_E_P_LINK_PKTERRS), + qib_7322p_error_msgs); + *msg = '\0'; + ignore_this_time = errs & QIB_E_P_LINK_PKTERRS; + } + qib_disarm_7322_senderrbufs(ppd); + } else if ((errs & QIB_E_P_LINK_PKTERRS) && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + err_decode(msg, sizeof(ppd->cpspec->epmsgbuf), errs, + qib_7322p_error_msgs); + ignore_this_time = errs & QIB_E_P_LINK_PKTERRS; + *msg = '\0'; + } + + qib_write_kreg_port(ppd, krp_errclear, errs); + + errs &= ~ignore_this_time; + if (!errs) + goto done; + + if (errs & QIB_E_P_RPKTERRS) + qib_stats.sps_rcverrs++; + if (errs & QIB_E_P_SPKTERRS) + qib_stats.sps_txerrs++; + + iserr = errs & ~(QIB_E_P_RPKTERRS | QIB_E_P_PKTERRS); + + if (errs & QIB_E_P_SDMAERRS) + sdma_7322_p_errors(ppd, errs); + + if (errs & QIB_E_P_IBSTATUSCHANGED) { + u64 ibcs; + u8 ltstate; + + ibcs = qib_read_kreg_port(ppd, krp_ibcstatus_a); + ltstate = qib_7322_phys_portstate(ibcs); + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + handle_serdes_issues(ppd, ibcs); + if (!(ppd->cpspec->ibcctrl_a & + SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn))) { + /* + * We got our interrupt, so init code should be + * happy and not try alternatives. Now squelch + * other "chatter" from link-negotiation (pre Init) + */ + ppd->cpspec->ibcctrl_a |= + SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + } + + /* Update our picture of width and speed from chip */ + ppd->link_width_active = + (ibcs & SYM_MASK(IBCStatusA_0, LinkWidthActive)) ? 
+ IB_WIDTH_4X : IB_WIDTH_1X; + ppd->link_speed_active = (ibcs & SYM_MASK(IBCStatusA_0, + LinkSpeedQDR)) ? QIB_IB_QDR : (ibcs & + SYM_MASK(IBCStatusA_0, LinkSpeedActive)) ? + QIB_IB_DDR : QIB_IB_SDR; + + if ((ppd->lflags & QIBL_IB_LINK_DISABLED) && ltstate != + IB_PHYSPORTSTATE_DISABLED) + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + else + /* + * Since going into a recovery state causes the link + * state to go down and since recovery is transitory, + * it is better if we "miss" ever seeing the link + * training state go into recovery (i.e., ignore this + * transition for link state special handling purposes) + * without updating lastibcstat. + */ + if (ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER && + ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN && + ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT && + ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE) + qib_handle_e_ibstatuschanged(ppd, ibcs); + } + if (*msg && iserr) + qib_dev_porterr(dd, ppd->port, "%s error\n", msg); + + if (ppd->state_wanted & ppd->lflags) + wake_up_interruptible(&ppd->state_wait); +done: + return; +} + +/* enable/disable chip from delivering interrupts */ +static void qib_7322_set_intr_state(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + if (dd->flags & QIB_BADINTR) + return; + qib_write_kreg(dd, kr_intmask, dd->cspec->int_enable_mask); + /* cause any pending enabled interrupts to be re-delivered */ + qib_write_kreg(dd, kr_intclear, 0ULL); + if (dd->cspec->num_msix_entries) { + /* and same for MSIx */ + u64 val = qib_read_kreg64(dd, kr_intgranted); + + if (val) + qib_write_kreg(dd, kr_intgranted, val); + } + } else + qib_write_kreg(dd, kr_intmask, 0ULL); +} + +/* + * Try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + * This is in chip-specific code because of all of the register accesses, + * even though the details are similar on most chips. + */ +static void qib_7322_clear_freeze(struct qib_devdata *dd) +{ + int pidx; + + /* disable error interrupts, to avoid confusion */ + qib_write_kreg(dd, kr_errmask, 0ULL); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + qib_write_kreg_port(dd->pport + pidx, krp_errmask, + 0ULL); + + /* also disable interrupts; errormask is sometimes overwritten */ + qib_7322_set_intr_state(dd, 0); + + /* clear the freeze, and be sure chip saw it */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + + /* + * Force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. 
+ */ + qib_write_kreg(dd, kr_hwerrclear, 0ULL); + qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE); + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); + /* We need to purge per-port errs and reset mask, too */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + if (!dd->pport[pidx].link_speed_supported) + continue; + qib_write_kreg_port(dd->pport + pidx, krp_errclear, ~0Ull); + qib_write_kreg_port(dd->pport + pidx, krp_errmask, ~0Ull); + } + qib_7322_set_intr_state(dd, 1); +} + +/* no error handling to speak of */ +/** + * qib_7322_handle_hwerrors - display hardware errors. + * @dd: the qlogic_ib device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * qib_handle_errors() to avoid excessive stack usage. + */ +static void qib_7322_handle_hwerrors(struct qib_devdata *dd, char *msg, + size_t msgl) +{ + u64 hwerrs; + u32 ctrl; + int isfatal = 0; + + hwerrs = qib_read_kreg64(dd, kr_hwerrstatus); + if (!hwerrs) + goto bail; + if (hwerrs == ~0ULL) { + qib_dev_err(dd, + "Read of hardware error status failed (all bits set); ignoring\n"); + goto bail; + } + qib_stats.sps_hwerrs++; + + /* Always clear the error status register, except BIST fail */ + qib_write_kreg(dd, kr_hwerrclear, hwerrs & + ~HWE_MASK(PowerOnBISTFailed)); + + hwerrs &= dd->cspec->hwerrmask; + + /* no EEPROM logging, yet */ + + if (hwerrs) + qib_devinfo(dd->pcidev, + "Hardware error: hwerr=0x%llx (cleared)\n", + (unsigned long long) hwerrs); + + ctrl = qib_read_kreg32(dd, kr_control); + if ((ctrl & SYM_MASK(Control, FreezeMode)) && !dd->diag_client) { + /* + * No recovery yet... + */ + if ((hwerrs & ~HWE_MASK(LATriggered)) || + dd->cspec->stay_in_freeze) { + /* + * If any set that we aren't ignoring only make the + * complaint once, in case it's stuck or recurring, + * and we get here multiple times + * Force link down, so switch knows, and + * LEDs are turned off. + */ + if (dd->flags & QIB_INITTED) + isfatal = 1; + } else + qib_7322_clear_freeze(dd); + } + + if (hwerrs & HWE_MASK(PowerOnBISTFailed)) { + isfatal = 1; + strlcpy(msg, + "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + } + + err_decode(msg, msgl, hwerrs, qib_7322_hwerror_msgs); + + /* Ignore esoteric PLL failures et al. */ + + qib_dev_err(dd, "%s hardware error\n", msg); + + if (hwerrs & + (SYM_MASK(HwErrMask, SDmaMemReadErrMask_0) | + SYM_MASK(HwErrMask, SDmaMemReadErrMask_1))) { + int pidx = 0; + int err; + unsigned long flags; + struct qib_pportdata *ppd = dd->pport; + + for (; pidx < dd->num_pports; ++pidx, ppd++) { + err = 0; + if (pidx == 0 && (hwerrs & + SYM_MASK(HwErrMask, SDmaMemReadErrMask_0))) + err++; + if (pidx == 1 && (hwerrs & + SYM_MASK(HwErrMask, SDmaMemReadErrMask_1))) + err++; + if (err) { + spin_lock_irqsave(&ppd->sdma_lock, flags); + dump_sdma_7322_state(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + } + } + + if (isfatal && !dd->diag_client) { + qib_dev_err(dd, + "Fatal Hardware Error, no longer usable, SN %.16s\n", + dd->serial); + /* + * for /sys status file and user programs to print; if no + * trailing brace is copied, we'll know it was truncated. 
+ */ + if (dd->freezemsg) + snprintf(dd->freezemsg, dd->freezelen, + "{%s}", msg); + qib_disable_after_error(dd); + } +bail:; +} + +/** + * qib_7322_init_hwerrors - enable hardware errors + * @dd: the qlogic_ib device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void qib_7322_init_hwerrors(struct qib_devdata *dd) +{ + int pidx; + u64 extsval; + + extsval = qib_read_kreg64(dd, kr_extstatus); + if (!(extsval & (QIB_EXTS_MEMBIST_DISABLED | + QIB_EXTS_MEMBIST_ENDTEST))) + qib_dev_err(dd, "MemBIST did not complete!\n"); + + /* never clear BIST failure, so reported on each driver load */ + qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); + + /* clear all */ + qib_write_kreg(dd, kr_errclear, ~0ULL); + /* enable errors that are masked, at least this first time. */ + qib_write_kreg(dd, kr_errmask, ~0ULL); + dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + qib_write_kreg_port(dd->pport + pidx, krp_errmask, + ~0ULL); +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing + * on chips that are count-based, rather than trigger-based. There is no + * reference counting, but that's also fine, given the intended use. + * Only chip-specific because it's all register accesses + */ +static void qib_set_7322_armlaunch(struct qib_devdata *dd, u32 enable) +{ + if (enable) { + qib_write_kreg(dd, kr_errclear, QIB_E_SPIOARMLAUNCH); + dd->cspec->errormask |= QIB_E_SPIOARMLAUNCH; + } else + dd->cspec->errormask &= ~QIB_E_SPIOARMLAUNCH; + qib_write_kreg(dd, kr_errmask, dd->cspec->errormask); +} + +/* + * Formerly took parameter in pre-shifted, + * pre-merged form with LinkCmd and LinkInitCmd + * together, and assuming the zero was NOP. + */ +static void qib_set_ib_7322_lstate(struct qib_pportdata *ppd, u16 linkcmd, + u16 linitcmd) +{ + u64 mod_wd; + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + * Also reset everything that we can, so we start + * completely clean when re-enabled (before we + * actually issue the disable to the IBC) + */ + qib_7322_mini_pcs_reset(ppd); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + /* + * Clear status change interrupt reduction so the + * new state is seen. 
+ */ + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn); + } + + mod_wd = (linkcmd << IBA7322_IBCC_LINKCMD_SHIFT) | + (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a | + mod_wd); + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + +} + +/* + * The total RCV buffer memory is 64KB, used for both ports, and is + * in units of 64 bytes (same as IB flow control credit unit). + * The consumedVL unit in the same registers are in 32 byte units! + * So, a VL15 packet needs 4.50 IB credits, and 9 rx buffer chunks, + * and we can therefore allocate just 9 IB credits for 2 VL15 packets + * in krp_rxcreditvl15, rather than 10. + */ +#define RCV_BUF_UNITSZ 64 +#define NUM_RCV_BUF_UNITS(dd) ((64 * 1024) / (RCV_BUF_UNITSZ * dd->num_pports)) + +static void set_vls(struct qib_pportdata *ppd) +{ + int i, numvls, totcred, cred_vl, vl0extra; + struct qib_devdata *dd = ppd->dd; + u64 val; + + numvls = qib_num_vls(ppd->vls_operational); + + /* + * Set up per-VL credits. Below is kluge based on these assumptions: + * 1) port is disabled at the time early_init is called. + * 2) give VL15 17 credits, for two max-plausible packets. + * 3) Give VL0-N the rest, with any rounding excess used for VL0 + */ + /* 2 VL15 packets @ 288 bytes each (including IB headers) */ + totcred = NUM_RCV_BUF_UNITS(dd); + cred_vl = (2 * 288 + RCV_BUF_UNITSZ - 1) / RCV_BUF_UNITSZ; + totcred -= cred_vl; + qib_write_kreg_port(ppd, krp_rxcreditvl15, (u64) cred_vl); + cred_vl = totcred / numvls; + vl0extra = totcred - cred_vl * numvls; + qib_write_kreg_port(ppd, krp_rxcreditvl0, cred_vl + vl0extra); + for (i = 1; i < numvls; i++) + qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, cred_vl); + for (; i < 8; i++) /* no buffer space for other VLs */ + qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, 0); + + /* Notify IBC that credits need to be recalculated */ + val = qib_read_kreg_port(ppd, krp_ibsdtestiftx); + val |= SYM_MASK(IB_SDTEST_IF_TX_0, CREDIT_CHANGE); + qib_write_kreg_port(ppd, krp_ibsdtestiftx, val); + qib_write_kreg(dd, kr_scratch, 0ULL); + val &= ~SYM_MASK(IB_SDTEST_IF_TX_0, CREDIT_CHANGE); + qib_write_kreg_port(ppd, krp_ibsdtestiftx, val); + + for (i = 0; i < numvls; i++) + val = qib_read_kreg_port(ppd, krp_rxcreditvl0 + i); + val = qib_read_kreg_port(ppd, krp_rxcreditvl15); + + /* Change the number of operational VLs */ + ppd->cpspec->ibcctrl_a = (ppd->cpspec->ibcctrl_a & + ~SYM_MASK(IBCCtrlA_0, NumVLane)) | + ((u64)(numvls - 1) << SYM_LSB(IBCCtrlA_0, NumVLane)); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); +} + +/* + * The code that deals with actual SerDes is in serdes_7322_init(). + * Compared to the code for iba7220, it is minimal. + */ +static int serdes_7322_init(struct qib_pportdata *ppd); + +/** + * qib_7322_bringup_serdes - bring up the serdes + * @ppd: physical port on the qlogic_ib device + */ +static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 val, guid, ibc; + unsigned long flags; + int ret = 0; + + /* + * SerDes model not in Pd, but still need to + * set up much of IBCCtrl and IBCDDRCtrl; move elsewhere + * eventually. 
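The credit split in set_vls() above is plain integer arithmetic: 64 KB of receive buffer in 64-byte units is shared by the ports, VL15 gets enough units for two 288-byte packets, and the remainder is divided evenly over the operational VLs with the rounding excess given to VL0. A standalone sketch of the same arithmetic; the two-port, four-VL configuration is just an example.

#include <stdio.h>

#define RCV_BUF_UNITSZ	64		/* bytes per credit unit */
#define TOTAL_RCV_BUF	(64 * 1024)	/* shared by both ports  */

int main(void)
{
	int num_pports = 2, numvls = 4;	/* example configuration */
	int totcred = TOTAL_RCV_BUF / (RCV_BUF_UNITSZ * num_pports);
	/* two VL15 packets of 288 bytes each, rounded up to whole units */
	int vl15 = (2 * 288 + RCV_BUF_UNITSZ - 1) / RCV_BUF_UNITSZ;
	int cred_vl, vl0extra;

	totcred -= vl15;
	cred_vl = totcred / numvls;
	vl0extra = totcred - cred_vl * numvls;

	printf("VL15: %d units, VL0: %d units, VL1..VL%d: %d units each\n",
	       vl15, cred_vl + vl0extra, numvls - 1, cred_vl);
	return 0;
}

For two ports and four VLs this yields the 9 VL15 units the driver comment mentions, with 128 units for VL0 and 125 for each remaining VL.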
+ */ + /* Put IBC in reset, sends disabled (should be in reset already) */ + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, IBLinkEn); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + + /* ensure previous Tx parameters are not still forced */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + + if (qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd, + crp_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7322_creg32_port(ppd, + crp_iblinkerrrecov); + } + + /* flowcontrolwatermark is in units of KBytes */ + ibc = 0x5ULL << SYM_LSB(IBCCtrlA_0, FlowCtrlWaterMark); + /* + * Flow control is sent this often, even if no changes in + * buffer space occur. Units are 128ns for this chip. + * Set to 3usec. + */ + ibc |= 24ULL << SYM_LSB(IBCCtrlA_0, FlowCtrlPeriod); + /* max error tolerance */ + ibc |= 0xfULL << SYM_LSB(IBCCtrlA_0, PhyerrThreshold); + /* IB credit flow control. */ + ibc |= 0xfULL << SYM_LSB(IBCCtrlA_0, OverrunThreshold); + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see qib_set_mtu() + */ + ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << + SYM_LSB(IBCCtrlA_0, MaxPktLen); + ppd->cpspec->ibcctrl_a = ibc; /* without linkcmd or linkinitcmd! */ + + /* + * Reset the PCS interface to the serdes (and also ibc, which is still + * in reset from above). Writes new value of ibcctrl_a as last step. + */ + qib_7322_mini_pcs_reset(ppd); + + if (!ppd->cpspec->ibcctrl_b) { + unsigned lse = ppd->link_speed_enabled; + + /* + * Not on re-init after reset, establish shadow + * and force initial config. + */ + ppd->cpspec->ibcctrl_b = qib_read_kreg_port(ppd, + krp_ibcctrl_b); + ppd->cpspec->ibcctrl_b &= ~(IBA7322_IBC_SPEED_QDR | + IBA7322_IBC_SPEED_DDR | + IBA7322_IBC_SPEED_SDR | + IBA7322_IBC_WIDTH_AUTONEG | + SYM_MASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED)); + if (lse & (lse - 1)) /* Muliple speeds enabled */ + ppd->cpspec->ibcctrl_b |= + (lse << IBA7322_IBC_SPEED_LSB) | + IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + else + ppd->cpspec->ibcctrl_b |= (lse == QIB_IB_QDR) ? + IBA7322_IBC_SPEED_QDR | + IBA7322_IBC_IBTA_1_2_MASK : + (lse == QIB_IB_DDR) ? + IBA7322_IBC_SPEED_DDR : + IBA7322_IBC_SPEED_SDR; + if ((ppd->link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) == + (IB_WIDTH_1X | IB_WIDTH_4X)) + ppd->cpspec->ibcctrl_b |= IBA7322_IBC_WIDTH_AUTONEG; + else + ppd->cpspec->ibcctrl_b |= + ppd->link_width_enabled == IB_WIDTH_4X ? 
+ IBA7322_IBC_WIDTH_4X_ONLY : + IBA7322_IBC_WIDTH_1X_ONLY; + + /* always enable these on driver reload, not sticky */ + ppd->cpspec->ibcctrl_b |= (IBA7322_IBC_RXPOL_MASK | + IBA7322_IBC_HRTBT_MASK); + } + qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b); + + /* setup so we have more time at CFGTEST to change H1 */ + val = qib_read_kreg_port(ppd, krp_ibcctrl_c); + val &= ~SYM_MASK(IBCCtrlC_0, IB_FRONT_PORCH); + val |= 0xfULL << SYM_LSB(IBCCtrlC_0, IB_FRONT_PORCH); + qib_write_kreg_port(ppd, krp_ibcctrl_c, val); + + serdes_7322_init(ppd); + + guid = be64_to_cpu(ppd->guid); + if (!guid) { + if (dd->base_guid) + guid = be64_to_cpu(dd->base_guid) + ppd->port - 1; + ppd->guid = cpu_to_be64(guid); + } + + qib_write_kreg_port(ppd, krp_hrtbt_guid, guid); + /* write to chip to prevent back-to-back writes of ibc reg */ + qib_write_kreg(dd, kr_scratch, 0); + + /* Enable port */ + ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, IBLinkEn); + set_vls(ppd); + + /* initially come up DISABLED, without sending anything. */ + val = ppd->cpspec->ibcctrl_a | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE << + QLOGIC_IB_IBCC_LINKINITCMD_SHIFT); + qib_write_kreg_port(ppd, krp_ibcctrl_a, val); + qib_write_kreg(dd, kr_scratch, 0ULL); + /* clear the linkinit cmds */ + ppd->cpspec->ibcctrl_a = val & ~SYM_MASK(IBCCtrlA_0, LinkInitCmd); + + /* be paranoid against later code motion, etc. */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvIBPortEnable); + qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* Also enable IBSTATUSCHG interrupt. */ + val = qib_read_kreg_port(ppd, krp_errmask); + qib_write_kreg_port(ppd, krp_errmask, + val | ERR_MASK_N(IBStatusChanged)); + + /* Always zero until we start messing with SerDes for real */ + return ret; +} + +/** + * qib_7322_quiet_serdes - set serdes to txidle + * @dd: the qlogic_ib device + * Called when driver is being unloaded + */ +static void qib_7322_mini_quiet_serdes(struct qib_pportdata *ppd) +{ + u64 val; + unsigned long flags; + + qib_set_ib_7322_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + wake_up(&ppd->cpspec->autoneg_wait); + cancel_delayed_work_sync(&ppd->cpspec->autoneg_work); + if (ppd->dd->cspec->r1) + cancel_delayed_work_sync(&ppd->cpspec->ipg_work); + + ppd->cpspec->chase_end = 0; + if (ppd->cpspec->chase_timer.function) /* if initted */ + del_timer_sync(&ppd->cpspec->chase_timer); + + /* + * Despite the name, actually disables IBC as well. Do it when + * we are as sure as possible that no more packets can be + * received, following the down and the PCS reset. + * The actual disabling happens in qib_7322_mini_pci_reset(), + * along with the PCS being reset. + */ + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, IBLinkEn); + qib_7322_mini_pcs_reset(ppd); + + /* + * Update the adjusted counters so the adjustment persists + * across driver reload. 
+ */ + if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta || + ppd->cpspec->ibdeltainprog || ppd->cpspec->iblnkdowndelta) { + struct qib_devdata *dd = ppd->dd; + u64 diagc; + + /* enable counter writes */ + diagc = qib_read_kreg64(dd, kr_hwdiagctrl); + qib_write_kreg(dd, kr_hwdiagctrl, + diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable)); + + if (ppd->cpspec->ibsymdelta || ppd->cpspec->ibdeltainprog) { + val = read_7322_creg32_port(ppd, crp_ibsymbolerr); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->ibsymsnap; + val -= ppd->cpspec->ibsymdelta; + write_7322_creg_port(ppd, crp_ibsymbolerr, val); + } + if (ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) { + val = read_7322_creg32_port(ppd, crp_iblinkerrrecov); + if (ppd->cpspec->ibdeltainprog) + val -= val - ppd->cpspec->iblnkerrsnap; + val -= ppd->cpspec->iblnkerrdelta; + write_7322_creg_port(ppd, crp_iblinkerrrecov, val); + } + if (ppd->cpspec->iblnkdowndelta) { + val = read_7322_creg32_port(ppd, crp_iblinkdown); + val += ppd->cpspec->iblnkdowndelta; + write_7322_creg_port(ppd, crp_iblinkdown, val); + } + /* + * No need to save ibmalfdelta since IB perfcounters + * are cleared on driver reload. + */ + + /* and disable counter writes */ + qib_write_kreg(dd, kr_hwdiagctrl, diagc); + } +} + +/** + * qib_setup_7322_setextled - set the state of the two external LEDs + * @ppd: physical port on the qlogic_ib device + * @on: whether the link is up or not + * + * The exact combo of LEDs if on is true is determined by looking + * at the ibcstatus. + * + * These LEDs indicate the physical and logical state of IB link. + * For this chip (at least with recommended board pinouts), LED1 + * is Yellow (logical state) and LED2 is Green (physical state), + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + */ +static void qib_setup_7322_setextled(struct qib_pportdata *ppd, u32 on) +{ + struct qib_devdata *dd = ppd->dd; + u64 extctl, ledblink = 0, val; + unsigned long flags; + int yel, grn; + + /* + * The diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running. + */ + if (dd->diag_client) + return; + + /* Allow override of LED display for, e.g. Locating system in rack */ + if (ppd->led_override) { + grn = (ppd->led_override & QIB_LED_PHYS); + yel = (ppd->led_override & QIB_LED_LOG); + } else if (on) { + val = qib_read_kreg_port(ppd, krp_ibcstatus_a); + grn = qib_7322_phys_portstate(val) == + IB_PHYSPORTSTATE_LINKUP; + yel = qib_7322_iblink_state(val) == IB_PORT_ACTIVE; + } else { + grn = 0; + yel = 0; + } + + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + extctl = dd->cspec->extctrl & (ppd->port == 1 ? + ~ExtLED_IB1_MASK : ~ExtLED_IB2_MASK); + if (grn) { + extctl |= ppd->port == 1 ? ExtLED_IB1_GRN : ExtLED_IB2_GRN; + /* + * Counts are in chip clock (4ns) periods. + * This is 1/16 sec (66.6ms) on, + * 3/16 sec (187.5 ms) off, with packets rcvd. 
+ */ + ledblink = ((66600 * 1000UL / 4) << IBA7322_LEDBLINK_ON_SHIFT) | + ((187500 * 1000UL / 4) << IBA7322_LEDBLINK_OFF_SHIFT); + } + if (yel) + extctl |= ppd->port == 1 ? ExtLED_IB1_YEL : ExtLED_IB2_YEL; + dd->cspec->extctrl = extctl; + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + + if (ledblink) /* blink the LED on packet receive */ + qib_write_kreg_port(ppd, krp_rcvpktledcnt, ledblink); +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static int qib_7322_notify_dca(struct qib_devdata *dd, unsigned long event) +{ + switch (event) { + case DCA_PROVIDER_ADD: + if (dd->flags & QIB_DCA_ENABLED) + break; + if (!dca_add_requester(&dd->pcidev->dev)) { + qib_devinfo(dd->pcidev, "DCA enabled\n"); + dd->flags |= QIB_DCA_ENABLED; + qib_setup_dca(dd); + } + break; + case DCA_PROVIDER_REMOVE: + if (dd->flags & QIB_DCA_ENABLED) { + dca_remove_requester(&dd->pcidev->dev); + dd->flags &= ~QIB_DCA_ENABLED; + dd->cspec->dca_ctrl = 0; + qib_write_kreg(dd, KREG_IDX(DCACtrlA), + dd->cspec->dca_ctrl); + } + break; + } + return 0; +} + +static void qib_update_rhdrq_dca(struct qib_ctxtdata *rcd, int cpu) +{ + struct qib_devdata *dd = rcd->dd; + struct qib_chip_specific *cspec = dd->cspec; + + if (!(dd->flags & QIB_DCA_ENABLED)) + return; + if (cspec->rhdr_cpu[rcd->ctxt] != cpu) { + const struct dca_reg_map *rmp; + + cspec->rhdr_cpu[rcd->ctxt] = cpu; + rmp = &dca_rcvhdr_reg_map[rcd->ctxt]; + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] &= rmp->mask; + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] |= + (u64) dca3_get_tag(&dd->pcidev->dev, cpu) << rmp->lsb; + qib_devinfo(dd->pcidev, + "Ctxt %d cpu %d dca %llx\n", rcd->ctxt, cpu, + (long long) cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]); + qib_write_kreg(dd, rmp->regno, + cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]); + cspec->dca_ctrl |= SYM_MASK(DCACtrlA, RcvHdrqDCAEnable); + qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl); + } +} + +static void qib_update_sdma_dca(struct qib_pportdata *ppd, int cpu) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_chip_specific *cspec = dd->cspec; + unsigned pidx = ppd->port - 1; + + if (!(dd->flags & QIB_DCA_ENABLED)) + return; + if (cspec->sdma_cpu[pidx] != cpu) { + cspec->sdma_cpu[pidx] = cpu; + cspec->dca_rcvhdr_ctrl[4] &= ~(ppd->hw_pidx ? + SYM_MASK(DCACtrlF, SendDma1DCAOPH) : + SYM_MASK(DCACtrlF, SendDma0DCAOPH)); + cspec->dca_rcvhdr_ctrl[4] |= + (u64) dca3_get_tag(&dd->pcidev->dev, cpu) << + (ppd->hw_pidx ? + SYM_LSB(DCACtrlF, SendDma1DCAOPH) : + SYM_LSB(DCACtrlF, SendDma0DCAOPH)); + qib_devinfo(dd->pcidev, + "sdma %d cpu %d dca %llx\n", ppd->hw_pidx, cpu, + (long long) cspec->dca_rcvhdr_ctrl[4]); + qib_write_kreg(dd, KREG_IDX(DCACtrlF), + cspec->dca_rcvhdr_ctrl[4]); + cspec->dca_ctrl |= ppd->hw_pidx ? 
+ SYM_MASK(DCACtrlA, SendDMAHead1DCAEnable) : + SYM_MASK(DCACtrlA, SendDMAHead0DCAEnable); + qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl); + } +} + +static void qib_setup_dca(struct qib_devdata *dd) +{ + struct qib_chip_specific *cspec = dd->cspec; + int i; + + for (i = 0; i < ARRAY_SIZE(cspec->rhdr_cpu); i++) + cspec->rhdr_cpu[i] = -1; + for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++) + cspec->sdma_cpu[i] = -1; + cspec->dca_rcvhdr_ctrl[0] = + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq0DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq1DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq2DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlB, RcvHdrq3DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[1] = + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq4DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq5DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq6DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlC, RcvHdrq7DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[2] = + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq8DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq9DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq10DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlD, RcvHdrq11DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[3] = + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq12DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq13DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq14DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlE, RcvHdrq15DCAXfrCnt)); + cspec->dca_rcvhdr_ctrl[4] = + (1ULL << SYM_LSB(DCACtrlF, RcvHdrq16DCAXfrCnt)) | + (1ULL << SYM_LSB(DCACtrlF, RcvHdrq17DCAXfrCnt)); + for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++) + qib_write_kreg(dd, KREG_IDX(DCACtrlB) + i, + cspec->dca_rcvhdr_ctrl[i]); + for (i = 0; i < cspec->num_msix_entries; i++) + setup_dca_notifier(dd, i); +} + +static void qib_irq_notifier_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct qib_irq_notify *n = + container_of(notify, struct qib_irq_notify, notify); + int cpu = cpumask_first(mask); + + if (n->rcv) { + struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg; + + qib_update_rhdrq_dca(rcd, cpu); + } else { + struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg; + + qib_update_sdma_dca(ppd, cpu); + } +} + +static void qib_irq_notifier_release(struct kref *ref) +{ + struct qib_irq_notify *n = + container_of(ref, struct qib_irq_notify, notify.kref); + struct qib_devdata *dd; + + if (n->rcv) { + struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg; + + dd = rcd->dd; + } else { + struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg; + + dd = ppd->dd; + } + qib_devinfo(dd->pcidev, + "release on HCA notify 0x%p n 0x%p\n", ref, n); + kfree(n); +} +#endif + +static void qib_7322_free_irq(struct qib_devdata *dd) +{ + u64 intgranted; + int i; + + dd->cspec->main_int_mask = ~0ULL; + + for (i = 0; i < dd->cspec->num_msix_entries; i++) { + /* only free IRQs that were allocated */ + if (dd->cspec->msix_entries[i].arg) { +#ifdef CONFIG_INFINIBAND_QIB_DCA + reset_dca_notifier(dd, i); +#endif + irq_set_affinity_hint(pci_irq_vector(dd->pcidev, i), + NULL); + free_cpumask_var(dd->cspec->msix_entries[i].mask); + pci_free_irq(dd->pcidev, i, + dd->cspec->msix_entries[i].arg); + } + } + + /* If num_msix_entries was 0, disable the INTx IRQ */ + if (!dd->cspec->num_msix_entries) + pci_free_irq(dd->pcidev, 0, dd); + else + dd->cspec->num_msix_entries = 0; + + pci_free_irq_vectors(dd->pcidev); + + /* make sure no MSIx interrupts are left pending */ + intgranted = qib_read_kreg64(dd, kr_intgranted); + if (intgranted) + qib_write_kreg(dd, kr_intgranted, intgranted); +} + +static void 
qib_setup_7322_cleanup(struct qib_devdata *dd) +{ + int i; + +#ifdef CONFIG_INFINIBAND_QIB_DCA + if (dd->flags & QIB_DCA_ENABLED) { + dca_remove_requester(&dd->pcidev->dev); + dd->flags &= ~QIB_DCA_ENABLED; + dd->cspec->dca_ctrl = 0; + qib_write_kreg(dd, KREG_IDX(DCACtrlA), dd->cspec->dca_ctrl); + } +#endif + + qib_7322_free_irq(dd); + kfree(dd->cspec->cntrs); + kfree(dd->cspec->sendchkenable); + kfree(dd->cspec->sendgrhchk); + kfree(dd->cspec->sendibchk); + kfree(dd->cspec->msix_entries); + for (i = 0; i < dd->num_pports; i++) { + unsigned long flags; + u32 mask = QSFP_GPIO_MOD_PRS_N | + (QSFP_GPIO_MOD_PRS_N << QSFP_GPIO_PORT2_SHIFT); + + kfree(dd->pport[i].cpspec->portcntrs); + if (dd->flags & QIB_HAS_QSFP) { + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->gpio_mask &= ~mask; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + } + } +} + +/* handle SDMA interrupts */ +static void sdma_7322_intr(struct qib_devdata *dd, u64 istat) +{ + struct qib_pportdata *ppd0 = &dd->pport[0]; + struct qib_pportdata *ppd1 = &dd->pport[1]; + u64 intr0 = istat & (INT_MASK_P(SDma, 0) | + INT_MASK_P(SDmaIdle, 0) | INT_MASK_P(SDmaProgress, 0)); + u64 intr1 = istat & (INT_MASK_P(SDma, 1) | + INT_MASK_P(SDmaIdle, 1) | INT_MASK_P(SDmaProgress, 1)); + + if (intr0) + qib_sdma_intr(ppd0); + if (intr1) + qib_sdma_intr(ppd1); + + if (istat & INT_MASK_PM(SDmaCleanupDone, 0)) + qib_sdma_process_event(ppd0, qib_sdma_event_e20_hw_started); + if (istat & INT_MASK_PM(SDmaCleanupDone, 1)) + qib_sdma_process_event(ppd1, qib_sdma_event_e20_hw_started); +} + +/* + * Set or clear the Send buffer available interrupt enable bit. + */ +static void qib_wantpiobuf_7322_intr(struct qib_devdata *dd, u32 needint) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (needint) + dd->sendctrl |= SYM_MASK(SendCtrl, SendIntBufAvail); + else + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendIntBufAvail); + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0ULL); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* + * Somehow got an interrupt with reserved bits set in interrupt status. + * Print a message so we know it happened, then clear them. + * keep mainline interrupt handler cache-friendly + */ +static noinline void unknown_7322_ibits(struct qib_devdata *dd, u64 istat) +{ + u64 kills; + char msg[128]; + + kills = istat & ~QIB_I_BITSEXTANT; + qib_dev_err(dd, + "Clearing reserved interrupt(s) 0x%016llx: %s\n", + (unsigned long long) kills, msg); + qib_write_kreg(dd, kr_intmask, (dd->cspec->int_enable_mask & ~kills)); +} + +/* keep mainline interrupt handler cache-friendly */ +static noinline void unknown_7322_gpio_intr(struct qib_devdata *dd) +{ + u32 gpiostatus; + int handled = 0; + int pidx; + + /* + * Boards for this chip currently don't use GPIO interrupts, + * so clear by writing GPIOstatus to GPIOclear, and complain + * to developer. To avoid endless repeats, clear + * the bits in the mask, since there is some kind of + * programming error or chip problem. + */ + gpiostatus = qib_read_kreg32(dd, kr_gpio_status); + /* + * In theory, writing GPIOstatus to GPIOclear could + * have a bad side-effect on some diagnostic that wanted + * to poll for a status-change, but the various shadows + * make that problematic at best. Diags will just suppress + * all GPIO interrupts during such tests. 
+ */ + qib_write_kreg(dd, kr_gpio_clear, gpiostatus); + /* + * Check for QSFP MOD_PRS changes + * only works for single port if IB1 != pidx1 + */ + for (pidx = 0; pidx < dd->num_pports && (dd->flags & QIB_HAS_QSFP); + ++pidx) { + struct qib_pportdata *ppd; + struct qib_qsfp_data *qd; + u32 mask; + + if (!dd->pport[pidx].link_speed_supported) + continue; + mask = QSFP_GPIO_MOD_PRS_N; + ppd = dd->pport + pidx; + mask <<= (QSFP_GPIO_PORT2_SHIFT * ppd->hw_pidx); + if (gpiostatus & dd->cspec->gpio_mask & mask) { + u64 pins; + + qd = &ppd->cpspec->qsfp_data; + gpiostatus &= ~mask; + pins = qib_read_kreg64(dd, kr_extstatus); + pins >>= SYM_LSB(EXTStatus, GPIOIn); + if (!(pins & mask)) { + ++handled; + qd->t_insert = jiffies; + queue_work(ib_wq, &qd->work); + } + } + } + + if (gpiostatus && !handled) { + const u32 mask = qib_read_kreg32(dd, kr_gpio_mask); + u32 gpio_irq = mask & gpiostatus; + + /* + * Clear any troublemakers, and update chip from shadow + */ + dd->cspec->gpio_mask &= ~gpio_irq; + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + } +} + +/* + * Handle errors and unusual events first, separate function + * to improve cache hits for fast path interrupt handling. + */ +static noinline void unlikely_7322_intr(struct qib_devdata *dd, u64 istat) +{ + if (istat & ~QIB_I_BITSEXTANT) + unknown_7322_ibits(dd, istat); + if (istat & QIB_I_GPIO) + unknown_7322_gpio_intr(dd); + if (istat & QIB_I_C_ERROR) { + qib_write_kreg(dd, kr_errmask, 0ULL); + tasklet_schedule(&dd->error_tasklet); + } + if (istat & INT_MASK_P(Err, 0) && dd->rcd[0]) + handle_7322_p_errors(dd->rcd[0]->ppd); + if (istat & INT_MASK_P(Err, 1) && dd->rcd[1]) + handle_7322_p_errors(dd->rcd[1]->ppd); +} + +/* + * Dynamically adjust the rcv int timeout for a context based on incoming + * packet rate. + */ +static void adjust_rcv_timeout(struct qib_ctxtdata *rcd, int npkts) +{ + struct qib_devdata *dd = rcd->dd; + u32 timeout = dd->cspec->rcvavail_timeout[rcd->ctxt]; + + /* + * Dynamically adjust idle timeout on chip + * based on number of packets processed. + */ + if (npkts < rcv_int_count && timeout > 2) + timeout >>= 1; + else if (npkts >= rcv_int_count && timeout < rcv_int_timeout) + timeout = min(timeout << 1, rcv_int_timeout); + else + return; + + dd->cspec->rcvavail_timeout[rcd->ctxt] = timeout; + qib_write_kreg(dd, kr_rcvavailtimeout + rcd->ctxt, timeout); +} + +/* + * This is the main interrupt handler. + * It will normally only be used for low frequency interrupts but may + * have to handle all interrupts if INTx is enabled or fewer than normal + * MSIx interrupts were allocated. + * This routine should ignore the interrupt bits for any of the + * dedicated MSIx handlers. + */ +static irqreturn_t qib_7322intr(int irq, void *data) +{ + struct qib_devdata *dd = data; + irqreturn_t ret; + u64 istat; + u64 ctxtrbits; + u64 rmask; + unsigned i; + u32 npkts; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. 
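adjust_rcv_timeout() above adapts the receive-interrupt coalescing timeout multiplicatively: a quiet interrupt halves it (down to a floor), a busy one doubles it (up to a cap). A small sketch of that policy; the threshold and bound values here are assumptions standing in for the rcv_int_count and rcv_int_timeout module parameters.

#include <stdio.h>

/* Assumed tunables; the driver reads these from module parameters. */
#define PKT_THRESHOLD	16	/* packets per interrupt considered "busy" */
#define TIMEOUT_MAX	375	/* cap on the coalescing timeout           */
#define TIMEOUT_MIN	2	/* floor on the coalescing timeout         */

static unsigned adjust_timeout(unsigned timeout, unsigned npkts)
{
	if (npkts < PKT_THRESHOLD && timeout > TIMEOUT_MIN)
		return timeout >> 1;	/* quiet: react faster next time */
	if (npkts >= PKT_THRESHOLD && timeout < TIMEOUT_MAX)
		return timeout << 1 > TIMEOUT_MAX ? TIMEOUT_MAX
						  : timeout << 1;	/* busy: coalesce more */
	return timeout;			/* already at a bound: leave it */
}

int main(void)
{
	unsigned t = 64;

	t = adjust_timeout(t, 2);	/* 64 -> 32 */
	t = adjust_timeout(t, 100);	/* 32 -> 64 */
	printf("timeout is now %u\n", t);
	return 0;
}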
+ */ + ret = IRQ_HANDLED; + goto bail; + } + + istat = qib_read_kreg64(dd, kr_intstatus); + + if (unlikely(istat == ~0ULL)) { + qib_bad_intrstatus(dd); + qib_dev_err(dd, "Interrupt status all f's, skipping\n"); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + istat &= dd->cspec->main_int_mask; + if (unlikely(!istat)) { + /* already handled, or shared and not us */ + ret = IRQ_NONE; + goto bail; + } + + this_cpu_inc(*dd->int_counter); + + /* handle "errors" of various kinds first, device ahead of port */ + if (unlikely(istat & (~QIB_I_BITSEXTANT | QIB_I_GPIO | + QIB_I_C_ERROR | INT_MASK_P(Err, 0) | + INT_MASK_P(Err, 1)))) + unlikely_7322_intr(dd, istat); + + /* + * Clear the interrupt bits we found set, relatively early, so we + * "know" know the chip will have seen this by the time we process + * the queue, and will re-interrupt if necessary. The processor + * itself won't take the interrupt again until we return. + */ + qib_write_kreg(dd, kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway. + */ + ctxtrbits = istat & (QIB_I_RCVAVAIL_MASK | QIB_I_RCVURG_MASK); + if (ctxtrbits) { + rmask = (1ULL << QIB_I_RCVAVAIL_LSB) | + (1ULL << QIB_I_RCVURG_LSB); + for (i = 0; i < dd->first_user_ctxt; i++) { + if (ctxtrbits & rmask) { + ctxtrbits &= ~rmask; + if (dd->rcd[i]) + qib_kreceive(dd->rcd[i], NULL, &npkts); + } + rmask <<= 1; + } + if (ctxtrbits) { + ctxtrbits = (ctxtrbits >> QIB_I_RCVAVAIL_LSB) | + (ctxtrbits >> QIB_I_RCVURG_LSB); + qib_handle_urcv(dd, ctxtrbits); + } + } + + if (istat & (QIB_I_P_SDMAINT(0) | QIB_I_P_SDMAINT(1))) + sdma_7322_intr(dd, istat); + + if ((istat & QIB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED)) + qib_ib_piobufavail(dd); + + ret = IRQ_HANDLED; +bail: + return ret; +} + +/* + * Dedicated receive packet available interrupt handler. + */ +static irqreturn_t qib_7322pintr(int irq, void *data) +{ + struct qib_ctxtdata *rcd = data; + struct qib_devdata *dd = rcd->dd; + u32 npkts; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ((1ULL << QIB_I_RCVAVAIL_LSB) | + (1ULL << QIB_I_RCVURG_LSB)) << rcd->ctxt); + + qib_kreceive(rcd, NULL, &npkts); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send buffer available interrupt handler. + */ +static irqreturn_t qib_7322bufavail(int irq, void *data) +{ + struct qib_devdata *dd = data; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, QIB_I_SPIOBUFAVAIL); + + /* qib_ib_piobufavail() will clear the want PIO interrupt if needed */ + if (dd->flags & QIB_INITTED) + qib_ib_piobufavail(dd); + else + qib_wantpiobuf_7322_intr(dd, 0); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA interrupt handler. 
+ */ +static irqreturn_t sdma_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_P(SDma, 1) : INT_MASK_P(SDma, 0)); + qib_sdma_intr(ppd); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA idle interrupt handler. + */ +static irqreturn_t sdma_idle_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_P(SDmaIdle, 1) : INT_MASK_P(SDmaIdle, 0)); + qib_sdma_intr(ppd); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA progress interrupt handler. + */ +static irqreturn_t sdma_progress_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? + INT_MASK_P(SDmaProgress, 1) : + INT_MASK_P(SDmaProgress, 0)); + qib_sdma_intr(ppd); + + return IRQ_HANDLED; +} + +/* + * Dedicated Send DMA cleanup interrupt handler. + */ +static irqreturn_t sdma_cleanup_intr(int irq, void *data) +{ + struct qib_pportdata *ppd = data; + struct qib_devdata *dd = ppd->dd; + + if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + + this_cpu_inc(*dd->int_counter); + + /* Clear the interrupt bit we expect to be set. */ + qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ? 
+ INT_MASK_PM(SDmaCleanupDone, 1) : + INT_MASK_PM(SDmaCleanupDone, 0)); + qib_sdma_process_event(ppd, qib_sdma_event_e20_hw_started); + + return IRQ_HANDLED; +} + +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static void reset_dca_notifier(struct qib_devdata *dd, int msixnum) +{ + if (!dd->cspec->msix_entries[msixnum].dca) + return; + + qib_devinfo(dd->pcidev, "Disabling notifier on HCA %d irq %d\n", + dd->unit, pci_irq_vector(dd->pcidev, msixnum)); + irq_set_affinity_notifier(pci_irq_vector(dd->pcidev, msixnum), NULL); + dd->cspec->msix_entries[msixnum].notifier = NULL; +} + +static void setup_dca_notifier(struct qib_devdata *dd, int msixnum) +{ + struct qib_msix_entry *m = &dd->cspec->msix_entries[msixnum]; + struct qib_irq_notify *n; + + if (!m->dca) + return; + n = kzalloc(sizeof(*n), GFP_KERNEL); + if (n) { + int ret; + + m->notifier = n; + n->notify.irq = pci_irq_vector(dd->pcidev, msixnum); + n->notify.notify = qib_irq_notifier_notify; + n->notify.release = qib_irq_notifier_release; + n->arg = m->arg; + n->rcv = m->rcv; + qib_devinfo(dd->pcidev, + "set notifier irq %d rcv %d notify %p\n", + n->notify.irq, n->rcv, &n->notify); + ret = irq_set_affinity_notifier( + n->notify.irq, + &n->notify); + if (ret) { + m->notifier = NULL; + kfree(n); + } + } +} + +#endif + +/* + * Set up our chip-specific interrupt handler. + * The interrupt type has already been setup, so + * we just need to do the registration and error checking. + * If we are using MSIx interrupts, we may fall back to + * INTx later, if the interrupt handler doesn't get called + * within 1/2 second (see verify_interrupt()). + */ +static void qib_setup_7322_interrupt(struct qib_devdata *dd, int clearpend) +{ + int ret, i, msixnum; + u64 redirect[6]; + u64 mask; + const struct cpumask *local_mask; + int firstcpu, secondcpu = 0, currrcvcpu = 0; + + if (!dd->num_pports) + return; + + if (clearpend) { + /* + * if not switching interrupt types, be sure interrupts are + * disabled, and then clear anything pending at this point, + * because we are starting clean. 
+ */ + qib_7322_set_intr_state(dd, 0); + + /* clear the reset error, init error/hwerror mask */ + qib_7322_init_hwerrors(dd); + + /* clear any interrupt bits that might be set */ + qib_write_kreg(dd, kr_intclear, ~0ULL); + + /* make sure no pending MSIx intr, and clear diag reg */ + qib_write_kreg(dd, kr_intgranted, ~0ULL); + qib_write_kreg(dd, kr_vecclr_wo_int, ~0ULL); + } + + if (!dd->cspec->num_msix_entries) { + /* Try to get INTx interrupt */ +try_intx: + ret = pci_request_irq(dd->pcidev, 0, qib_7322intr, NULL, dd, + QIB_DRV_NAME); + if (ret) { + qib_dev_err( + dd, + "Couldn't setup INTx interrupt (irq=%d): %d\n", + pci_irq_vector(dd->pcidev, 0), ret); + return; + } + dd->cspec->main_int_mask = ~0ULL; + return; + } + + /* Try to get MSIx interrupts */ + memset(redirect, 0, sizeof(redirect)); + mask = ~0ULL; + msixnum = 0; + local_mask = cpumask_of_pcibus(dd->pcidev->bus); + firstcpu = cpumask_first(local_mask); + if (firstcpu >= nr_cpu_ids || + cpumask_weight(local_mask) == num_online_cpus()) { + local_mask = topology_core_cpumask(0); + firstcpu = cpumask_first(local_mask); + } + if (firstcpu < nr_cpu_ids) { + secondcpu = cpumask_next(firstcpu, local_mask); + if (secondcpu >= nr_cpu_ids) + secondcpu = firstcpu; + currrcvcpu = secondcpu; + } + for (i = 0; msixnum < dd->cspec->num_msix_entries; i++) { + irq_handler_t handler; + void *arg; + int lsb, reg, sh; +#ifdef CONFIG_INFINIBAND_QIB_DCA + int dca = 0; +#endif + if (i < ARRAY_SIZE(irq_table)) { + if (irq_table[i].port) { + /* skip if for a non-configured port */ + if (irq_table[i].port > dd->num_pports) + continue; + arg = dd->pport + irq_table[i].port - 1; + } else + arg = dd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca = irq_table[i].dca; +#endif + lsb = irq_table[i].lsb; + handler = irq_table[i].handler; + ret = pci_request_irq(dd->pcidev, msixnum, handler, + NULL, arg, QIB_DRV_NAME "%d%s", + dd->unit, + irq_table[i].name); + } else { + unsigned ctxt; + + ctxt = i - ARRAY_SIZE(irq_table); + /* per krcvq context receive interrupt */ + arg = dd->rcd[ctxt]; + if (!arg) + continue; + if (qib_krcvq01_no_msi && ctxt < 2) + continue; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca = 1; +#endif + lsb = QIB_I_RCVAVAIL_LSB + ctxt; + handler = qib_7322pintr; + ret = pci_request_irq(dd->pcidev, msixnum, handler, + NULL, arg, + QIB_DRV_NAME "%d (kctx)", + dd->unit); + } + + if (ret) { + /* + * Shouldn't happen since the enable said we could + * have as many as we are trying to setup here. 
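+ * If it does happen, free whatever was set up, drop back to a
+ * single legacy vector and retry as INTx rather than leaving
+ * the device without interrupts.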
+ */ + qib_dev_err(dd, + "Couldn't setup MSIx interrupt (vec=%d, irq=%d): %d\n", + msixnum, + pci_irq_vector(dd->pcidev, msixnum), + ret); + qib_7322_free_irq(dd); + pci_alloc_irq_vectors(dd->pcidev, 1, 1, + PCI_IRQ_LEGACY); + goto try_intx; + } + dd->cspec->msix_entries[msixnum].arg = arg; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->cspec->msix_entries[msixnum].dca = dca; + dd->cspec->msix_entries[msixnum].rcv = + handler == qib_7322pintr; +#endif + if (lsb >= 0) { + reg = lsb / IBA7322_REDIRECT_VEC_PER_REG; + sh = (lsb % IBA7322_REDIRECT_VEC_PER_REG) * + SYM_LSB(IntRedirect0, vec1); + mask &= ~(1ULL << lsb); + redirect[reg] |= ((u64) msixnum) << sh; + } + qib_read_kreg64(dd, 2 * msixnum + 1 + + (QIB_7322_MsixTable_OFFS / sizeof(u64))); + if (firstcpu < nr_cpu_ids && + zalloc_cpumask_var( + &dd->cspec->msix_entries[msixnum].mask, + GFP_KERNEL)) { + if (handler == qib_7322pintr) { + cpumask_set_cpu(currrcvcpu, + dd->cspec->msix_entries[msixnum].mask); + currrcvcpu = cpumask_next(currrcvcpu, + local_mask); + if (currrcvcpu >= nr_cpu_ids) + currrcvcpu = secondcpu; + } else { + cpumask_set_cpu(firstcpu, + dd->cspec->msix_entries[msixnum].mask); + } + irq_set_affinity_hint( + pci_irq_vector(dd->pcidev, msixnum), + dd->cspec->msix_entries[msixnum].mask); + } + msixnum++; + } + /* Initialize the vector mapping */ + for (i = 0; i < ARRAY_SIZE(redirect); i++) + qib_write_kreg(dd, kr_intredirect + i, redirect[i]); + dd->cspec->main_int_mask = mask; + tasklet_init(&dd->error_tasklet, qib_error_tasklet, + (unsigned long)dd); +} + +/** + * qib_7322_boardname - fill in the board name and note features + * @dd: the qlogic_ib device + * + * info will be based on the board revision register + */ +static unsigned qib_7322_boardname(struct qib_devdata *dd) +{ + /* Will need enumeration of board-types here */ + u32 boardid; + unsigned int features = DUAL_PORT_CAP; + + boardid = SYM_FIELD(dd->revision, Revision, BoardID); + + switch (boardid) { + case 0: + dd->boardname = "InfiniPath_QLE7342_Emulation"; + break; + case 1: + dd->boardname = "InfiniPath_QLE7340"; + dd->flags |= QIB_HAS_QSFP; + features = PORT_SPD_CAP; + break; + case 2: + dd->boardname = "InfiniPath_QLE7342"; + dd->flags |= QIB_HAS_QSFP; + break; + case 3: + dd->boardname = "InfiniPath_QMI7342"; + break; + case 4: + dd->boardname = "InfiniPath_Unsupported7342"; + qib_dev_err(dd, "Unsupported version of QMH7342\n"); + features = 0; + break; + case BOARD_QMH7342: + dd->boardname = "InfiniPath_QMH7342"; + features = 0x24; + break; + case BOARD_QME7342: + dd->boardname = "InfiniPath_QME7342"; + break; + case 8: + dd->boardname = "InfiniPath_QME7362"; + dd->flags |= QIB_HAS_QSFP; + break; + case BOARD_QMH7360: + dd->boardname = "Intel IB QDR 1P FLR-QSFP Adptr"; + dd->flags |= QIB_HAS_QSFP; + break; + case 15: + dd->boardname = "InfiniPath_QLE7342_TEST"; + dd->flags |= QIB_HAS_QSFP; + break; + default: + dd->boardname = "InfiniPath_QLE73xy_UNKNOWN"; + qib_dev_err(dd, "Unknown 7322 board type %u\n", boardid); + break; + } + dd->board_atten = 1; /* index into txdds_Xdr */ + + snprintf(dd->boardversion, sizeof(dd->boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n", + QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname, + (unsigned int)SYM_FIELD(dd->revision, Revision_R, Arch), + dd->majrev, dd->minrev, + (unsigned int)SYM_FIELD(dd->revision, Revision_R, SW)); + + if (qib_singleport && (features >> PORT_SPD_CAP_SHIFT) & PORT_SPD_CAP) { + qib_devinfo(dd->pcidev, + "IB%u: Forced to single port mode by module parameter\n", + dd->unit); + 
features &= PORT_SPD_CAP; + } + + return features; +} + +/* + * This routine sleeps, so it can only be called from user context, not + * from interrupt context. + */ +static int qib_do_7322_reset(struct qib_devdata *dd) +{ + u64 val; + u64 *msix_vecsave = NULL; + int i, msix_entries, ret = 1; + u16 cmdval; + u8 int_line, clinesz; + unsigned long flags; + + /* Use dev_err so it shows up in logs, etc. */ + qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit); + + qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz); + + msix_entries = dd->cspec->num_msix_entries; + + /* no interrupts till re-initted */ + qib_7322_set_intr_state(dd, 0); + + qib_7322_free_irq(dd); + + if (msix_entries) { + /* can be up to 512 bytes, too big for stack */ + msix_vecsave = kmalloc(2 * dd->cspec->num_msix_entries * + sizeof(u64), GFP_KERNEL); + } + + /* + * Core PCI (as of 2.6.18) doesn't save or rewrite the full vector + * info that is set up by the BIOS, so we have to save and restore + * it ourselves. There is some risk something could change it, + * after we save it, but since we have disabled the MSIx, it + * shouldn't be touched... + */ + for (i = 0; i < msix_entries; i++) { + u64 vecaddr, vecdata; + + vecaddr = qib_read_kreg64(dd, 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64))); + vecdata = qib_read_kreg64(dd, 1 + 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64))); + if (msix_vecsave) { + msix_vecsave[2 * i] = vecaddr; + /* save it without the masked bit set */ + msix_vecsave[1 + 2 * i] = vecdata & ~0x100000000ULL; + } + } + + dd->pport->cpspec->ibdeltainprog = 0; + dd->pport->cpspec->ibsymdelta = 0; + dd->pport->cpspec->iblnkerrdelta = 0; + dd->pport->cpspec->ibmalfdelta = 0; + /* so we check interrupts work again */ + dd->z_int_counter = qib_int_counter(dd); + + /* + * Keep chip from being accessed until we are ready. Use + * writeq() directly, to allow the write even though QIB_PRESENT + * isn't set. + */ + dd->flags &= ~(QIB_INITTED | QIB_PRESENT | QIB_BADINTR); + dd->flags |= QIB_DOING_RESET; + val = dd->control | QLOGIC_IB_C_RESET; + writeq(val, &dd->kregbase[kr_control]); + + for (i = 1; i <= 5; i++) { + /* + * Allow MBIST, etc. to complete; longer on each retry. + * We sometimes get machine checks from bus timeout if no + * response, so for now, make it *really* long. + */ + msleep(1000 + (1 + i) * 3000); + + qib_pcie_reenable(dd, cmdval, int_line, clinesz); + + /* + * Use readq directly, so we don't need to mark it as PRESENT + * until we get a successful indication that all is well. + */ + val = readq(&dd->kregbase[kr_revision]); + if (val == dd->revision) + break; + if (i == 5) { + qib_dev_err(dd, + "Failed to initialize after reset, unusable\n"); + ret = 0; + goto bail; + } + } + + dd->flags |= QIB_PRESENT; /* it's back */ + + if (msix_entries) { + /* restore the MSIx vector address and data if saved above */ + for (i = 0; i < msix_entries; i++) { + if (!msix_vecsave || !msix_vecsave[2 * i]) + continue; + qib_write_kreg(dd, 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64)), + msix_vecsave[2 * i]); + qib_write_kreg(dd, 1 + 2 * i + + (QIB_7322_MsixTable_OFFS / sizeof(u64)), + msix_vecsave[1 + 2 * i]); + } + } + + /* initialize the remaining registers. 
*/ + for (i = 0; i < dd->num_pports; ++i) + write_7322_init_portregs(&dd->pport[i]); + write_7322_initregs(dd); + + if (qib_pcie_params(dd, dd->lbus_width, &msix_entries)) + qib_dev_err(dd, + "Reset failed to setup PCIe or interrupts; continuing anyway\n"); + + dd->cspec->num_msix_entries = msix_entries; + qib_setup_7322_interrupt(dd, 1); + + for (i = 0; i < dd->num_pports; ++i) { + struct qib_pportdata *ppd = &dd->pport[i]; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_FORCE_NOTIFY; + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + +bail: + dd->flags &= ~QIB_DOING_RESET; /* OK or not, no longer resetting */ + kfree(msix_vecsave); + return ret; +} + +/** + * qib_7322_put_tid - write a TID to the chip + * @dd: the qlogic_ib device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: 0 for eager, 1 for expected + * @pa: physical address of in memory buffer; tidinvalid if freeing + */ +static void qib_7322_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr, + u32 type, unsigned long pa) +{ + if (!(dd->flags & QIB_PRESENT)) + return; + if (pa != dd->tidinvalid) { + u64 chippa = pa >> IBA7322_TID_PA_SHIFT; + + /* paranoia checks */ + if (pa != (chippa << IBA7322_TID_PA_SHIFT)) { + qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n", + pa); + return; + } + if (chippa >= (1UL << IBA7322_TID_SZ_SHIFT)) { + qib_dev_err(dd, + "Physical page address 0x%lx larger than supported\n", + pa); + return; + } + + if (type == RCVHQ_RCV_TYPE_EAGER) + chippa |= dd->tidtemplate; + else /* for now, always full 4KB page */ + chippa |= IBA7322_TID_SZ_4K; + pa = chippa; + } + writeq(pa, tidptr); + mmiowb(); +} + +/** + * qib_7322_clear_tids - clear all TID entries for a ctxt, expected and eager + * @dd: the qlogic_ib device + * @ctxt: the ctxt + * + * clear all TID entries for a ctxt, expected and eager. + * Used from qib_close(). + */ +static void qib_7322_clear_tids(struct qib_devdata *dd, + struct qib_ctxtdata *rcd) +{ + u64 __iomem *tidbase; + unsigned long tidinv; + u32 ctxt; + int i; + + if (!dd->kregbase || !rcd) + return; + + ctxt = rcd->ctxt; + + tidinv = dd->tidinvalid; + tidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + + dd->rcvtidbase + + ctxt * dd->rcvtidcnt * sizeof(*tidbase)); + + for (i = 0; i < dd->rcvtidcnt; i++) + qib_7322_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + tidinv); + + tidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + + dd->rcvegrbase + + rcd->rcvegr_tid_base * sizeof(*tidbase)); + + for (i = 0; i < rcd->rcvegrcnt; i++) + qib_7322_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + tidinv); +} + +/** + * qib_7322_tidtemplate - setup constants for TID updates + * @dd: the qlogic_ib device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void qib_7322_tidtemplate(struct qib_devdata *dd) +{ + /* + * For now, we always allocate 4KB buffers (at init) so we can + * receive max size packets. We may want a module parameter to + * specify 2KB or 4KB and/or make it per port instead of per device + * for those who want to reduce memory footprint. Note that the + * rcvhdrentsize size must be large enough to hold the largest + * IB header (currently 96 bytes) that we expect to handle (plus of + * course the 2 dwords of RHF). 
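+ * For example, 96 bytes of header plus the 8-byte RHF means
+ * rcvhdrentsize must be at least 26 dwords.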
+ */ + if (dd->rcvegrbufsize == 2048) + dd->tidtemplate = IBA7322_TID_SZ_2K; + else if (dd->rcvegrbufsize == 4096) + dd->tidtemplate = IBA7322_TID_SZ_4K; + dd->tidinvalid = 0; +} + +/** + * qib_init_7322_get_base_info - set chip-specific flags for user code + * @rcd: the qlogic_ib ctxt + * @kbase: qib_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithims. + */ + +static int qib_7322_get_base_info(struct qib_ctxtdata *rcd, + struct qib_base_info *kinfo) +{ + kinfo->spi_runtime_flags |= QIB_RUNTIME_CTXT_MSB_IN_QP | + QIB_RUNTIME_PCIE | QIB_RUNTIME_NODMA_RTAIL | + QIB_RUNTIME_HDRSUPP | QIB_RUNTIME_SDMA; + if (rcd->dd->cspec->r1) + kinfo->spi_runtime_flags |= QIB_RUNTIME_RCHK; + if (rcd->dd->flags & QIB_USE_SPCL_TRIG) + kinfo->spi_runtime_flags |= QIB_RUNTIME_SPECIAL_TRIGGER; + + return 0; +} + +static struct qib_message_header * +qib_7322_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr) +{ + u32 offset = qib_hdrget_offset(rhf_addr); + + return (struct qib_message_header *) + (rhf_addr - dd->rhf_offset + offset); +} + +/* + * Configure number of contexts. + */ +static void qib_7322_config_ctxts(struct qib_devdata *dd) +{ + unsigned long flags; + u32 nchipctxts; + + nchipctxts = qib_read_kreg32(dd, kr_contextcnt); + dd->cspec->numctxts = nchipctxts; + if (qib_n_krcv_queues > 1 && dd->num_pports) { + dd->first_user_ctxt = NUM_IB_PORTS + + (qib_n_krcv_queues - 1) * dd->num_pports; + if (dd->first_user_ctxt > nchipctxts) + dd->first_user_ctxt = nchipctxts; + dd->n_krcv_queues = dd->first_user_ctxt / dd->num_pports; + } else { + dd->first_user_ctxt = NUM_IB_PORTS; + dd->n_krcv_queues = 1; + } + + if (!qib_cfgctxts) { + int nctxts = dd->first_user_ctxt + num_online_cpus(); + + if (nctxts <= 6) + dd->ctxtcnt = 6; + else if (nctxts <= 10) + dd->ctxtcnt = 10; + else if (nctxts <= nchipctxts) + dd->ctxtcnt = nchipctxts; + } else if (qib_cfgctxts < dd->num_pports) + dd->ctxtcnt = dd->num_pports; + else if (qib_cfgctxts <= nchipctxts) + dd->ctxtcnt = qib_cfgctxts; + if (!dd->ctxtcnt) /* none of the above, set to max */ + dd->ctxtcnt = nchipctxts; + + /* + * Chip can be configured for 6, 10, or 18 ctxts, and choice + * affects number of eager TIDs per ctxt (1K, 2K, 4K). + * Lock to be paranoid about later motion, etc. + */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + if (dd->ctxtcnt > 10) + dd->rcvctrl |= 2ULL << SYM_LSB(RcvCtrl, ContextCfg); + else if (dd->ctxtcnt > 6) + dd->rcvctrl |= 1ULL << SYM_LSB(RcvCtrl, ContextCfg); + /* else configure for default 6 receive ctxts */ + + /* The XRC opcode is 5. */ + dd->rcvctrl |= 5ULL << SYM_LSB(RcvCtrl, XrcTypeCode); + + /* + * RcvCtrl *must* be written here so that the + * chip understands how to change rcvegrcnt below. + */ + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* kr_rcvegrcnt changes based on the number of contexts enabled */ + dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt); + if (qib_rcvhdrcnt) + dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, qib_rcvhdrcnt); + else + dd->rcvhdrcnt = 2 * max(dd->cspec->rcvegrcnt, + dd->num_pports > 1 ? 
1024U : 2048U); +} + +static int qib_7322_get_ib_cfg(struct qib_pportdata *ppd, int which) +{ + + int lsb, ret = 0; + u64 maskr; /* right-justified mask */ + + switch (which) { + + case QIB_IB_CFG_LWID_ENB: /* Get allowed Link-width */ + ret = ppd->link_width_enabled; + goto done; + + case QIB_IB_CFG_LWID: /* Get currently active Link-width */ + ret = ppd->link_width_active; + goto done; + + case QIB_IB_CFG_SPD_ENB: /* Get allowed Link speeds */ + ret = ppd->link_speed_enabled; + goto done; + + case QIB_IB_CFG_SPD: /* Get current Link spd */ + ret = ppd->link_speed_active; + goto done; + + case QIB_IB_CFG_RXPOL_ENB: /* Get Auto-RX-polarity enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + maskr = SYM_RMASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + break; + + case QIB_IB_CFG_LREV_ENB: /* Get Auto-Lane-reversal enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + maskr = SYM_RMASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + break; + + case QIB_IB_CFG_LINKLATENCY: + ret = qib_read_kreg_port(ppd, krp_ibcstatus_b) & + SYM_MASK(IBCStatusB_0, LinkRoundTripLatency); + goto done; + + case QIB_IB_CFG_OP_VLS: + ret = ppd->vls_operational; + goto done; + + case QIB_IB_CFG_VL_HIGH_CAP: + ret = 16; + goto done; + + case QIB_IB_CFG_VL_LOW_CAP: + ret = 16; + goto done; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + OverrunThreshold); + goto done; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + ret = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + PhyerrThreshold); + goto done; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + ret = (ppd->cpspec->ibcctrl_a & + SYM_MASK(IBCCtrlA_0, LinkDownDefaultState)) ? + IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL; + goto done; + + case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */ + lsb = IBA7322_IBC_HRTBT_LSB; + maskr = IBA7322_IBC_HRTBT_RMASK; /* OR of AUTO and ENB */ + break; + + case QIB_IB_CFG_PMA_TICKS: + /* + * 0x00 = 10x link transfer rate or 4 nsec. for 2.5Gbs + * Since the clock is always 250MHz, the value is 3, 1 or 0. + */ + if (ppd->link_speed_active == QIB_IB_QDR) + ret = 3; + else if (ppd->link_speed_active == QIB_IB_DDR) + ret = 1; + else + ret = 0; + goto done; + + default: + ret = -EINVAL; + goto done; + } + ret = (int)((ppd->cpspec->ibcctrl_b >> lsb) & maskr); +done: + return ret; +} + +/* + * Below again cribbed liberally from older version. Do not lean + * heavily on it. + */ +#define IBA7322_IBC_DLIDLMC_SHIFT QIB_7322_IBCCtrlB_0_IB_DLID_LSB +#define IBA7322_IBC_DLIDLMC_MASK (QIB_7322_IBCCtrlB_0_IB_DLID_RMASK \ + | (QIB_7322_IBCCtrlB_0_IB_DLID_MASK_RMASK << 16)) + +static int qib_7322_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val) +{ + struct qib_devdata *dd = ppd->dd; + u64 maskr; /* right-justified mask */ + int lsb, ret = 0; + u16 lcmd, licmd; + unsigned long flags; + + switch (which) { + case QIB_IB_CFG_LIDLMC: + /* + * Set LID and LMC. Combined to avoid possible hazard + * caller puts LMC in 16MSbits, DLID in 16LSbits of val + */ + lsb = IBA7322_IBC_DLIDLMC_SHIFT; + maskr = IBA7322_IBC_DLIDLMC_MASK; + /* + * For header-checking, the SLID in the packet will + * be masked with SendIBSLMCMask, and compared + * with SendIBSLIDAssignMask. Make sure we do not + * set any bits not covered by the mask, or we get + * false-positives. 
+ */ + qib_write_kreg_port(ppd, krp_sendslid, + val & (val >> 16) & SendIBSLIDAssignMask); + qib_write_kreg_port(ppd, krp_sendslidmask, + (val >> 16) & SendIBSLMCMask); + break; + + case QIB_IB_CFG_LWID_ENB: /* set allowed Link-width */ + ppd->link_width_enabled = val; + /* convert IB value to chip register value */ + if (val == IB_WIDTH_1X) + val = 0; + else if (val == IB_WIDTH_4X) + val = 1; + else + val = 3; + maskr = SYM_RMASK(IBCCtrlB_0, IB_NUM_CHANNELS); + lsb = SYM_LSB(IBCCtrlB_0, IB_NUM_CHANNELS); + break; + + case QIB_IB_CFG_SPD_ENB: /* set allowed Link speeds */ + /* + * As with width, only write the actual register if the + * link is currently down, otherwise takes effect on next + * link change. Since setting is being explicitly requested + * (via MAD or sysfs), clear autoneg failure status if speed + * autoneg is enabled. + */ + ppd->link_speed_enabled = val; + val <<= IBA7322_IBC_SPEED_LSB; + maskr = IBA7322_IBC_SPEED_MASK | IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + if (val & (val - 1)) { + /* Muliple speeds enabled */ + val |= IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else if (val & IBA7322_IBC_SPEED_QDR) + val |= IBA7322_IBC_IBTA_1_2_MASK; + /* IBTA 1.2 mode + min/max + speed bits are contiguous */ + lsb = SYM_LSB(IBCCtrlB_0, IB_ENHANCED_MODE); + break; + + case QIB_IB_CFG_RXPOL_ENB: /* set Auto-RX-polarity enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + maskr = SYM_RMASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP); + break; + + case QIB_IB_CFG_LREV_ENB: /* set Auto-Lane-reversal enable */ + lsb = SYM_LSB(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + maskr = SYM_RMASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED); + break; + + case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + OverrunThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, OverrunThreshold); + ppd->cpspec->ibcctrl_a |= (u64) val << + SYM_LSB(IBCCtrlA_0, OverrunThreshold); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + } + goto bail; + + case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + maskr = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0, + PhyerrThreshold); + if (maskr != val) { + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, PhyerrThreshold); + ppd->cpspec->ibcctrl_a |= (u64) val << + SYM_LSB(IBCCtrlA_0, PhyerrThreshold); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + } + goto bail; + + case QIB_IB_CFG_PKEYS: /* update pkeys */ + maskr = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) | + ((u64) ppd->pkeys[2] << 32) | + ((u64) ppd->pkeys[3] << 48); + qib_write_kreg_port(ppd, krp_partitionkey, maskr); + goto bail; + + case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* will only take effect when the link state changes */ + if (val == IB_LINKINITCMD_POLL) + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, LinkDownDefaultState); + else /* SLEEP */ + ppd->cpspec->ibcctrl_a |= + SYM_MASK(IBCCtrlA_0, LinkDownDefaultState); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + goto bail; + + case QIB_IB_CFG_MTU: /* update the MTU in IBC */ + /* + * Update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max 
we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + * Set even if it's unchanged, print debug message only + * on changes. + */ + val = (ppd->ibmaxlen >> 2) + 1; + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, MaxPktLen); + ppd->cpspec->ibcctrl_a |= (u64)val << + SYM_LSB(IBCCtrlA_0, MaxPktLen); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + goto bail; + + case QIB_IB_CFG_LSTATE: /* set the IB link state */ + switch (val & 0xffff0000) { + case IB_LINKCMD_DOWN: + lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN; + ppd->cpspec->ibmalfusesnap = 1; + ppd->cpspec->ibmalfsnap = read_7322_creg32_port(ppd, + crp_errlink); + if (!ppd->cpspec->ibdeltainprog && + qib_compat_ddr_negotiate) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = + read_7322_creg32_port(ppd, + crp_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = + read_7322_creg32_port(ppd, + crp_iblinkerrrecov); + } + break; + + case IB_LINKCMD_ARMED: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED; + if (ppd->cpspec->ibmalfusesnap) { + ppd->cpspec->ibmalfusesnap = 0; + ppd->cpspec->ibmalfdelta += + read_7322_creg32_port(ppd, + crp_errlink) - + ppd->cpspec->ibmalfsnap; + } + break; + + case IB_LINKCMD_ACTIVE: + lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE; + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16); + goto bail; + } + switch (val & 0xffff) { + case IB_LINKINITCMD_NOP: + licmd = 0; + break; + + case IB_LINKINITCMD_POLL: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL; + break; + + case IB_LINKINITCMD_SLEEP: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP; + break; + + case IB_LINKINITCMD_DISABLE: + licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE; + ppd->cpspec->chase_end = 0; + /* + * stop state chase counter and timer, if running. + * wait forpending timer, but don't clear .data (ppd)! + */ + if (ppd->cpspec->chase_timer.expires) { + del_timer_sync(&ppd->cpspec->chase_timer); + ppd->cpspec->chase_timer.expires = 0; + } + break; + + default: + ret = -EINVAL; + qib_dev_err(dd, "bad linkinitcmd req 0x%x\n", + val & 0xffff); + goto bail; + } + qib_set_ib_7322_lstate(ppd, lcmd, licmd); + goto bail; + + case QIB_IB_CFG_OP_VLS: + if (ppd->vls_operational != val) { + ppd->vls_operational = val; + set_vls(ppd); + } + goto bail; + + case QIB_IB_CFG_VL_HIGH_LIMIT: + qib_write_kreg_port(ppd, krp_highprio_limit, val); + goto bail; + + case QIB_IB_CFG_HRTBT: /* set Heartbeat off/enable/auto */ + if (val > 3) { + ret = -EINVAL; + goto bail; + } + lsb = IBA7322_IBC_HRTBT_LSB; + maskr = IBA7322_IBC_HRTBT_RMASK; /* OR of AUTO and ENB */ + break; + + case QIB_IB_CFG_PORT: + /* val is the port number of the switch we are connected to. 
*/ + if (ppd->dd->cspec->r1) { + cancel_delayed_work(&ppd->cpspec->ipg_work); + ppd->cpspec->ipg_tries = 0; + } + goto bail; + + default: + ret = -EINVAL; + goto bail; + } + ppd->cpspec->ibcctrl_b &= ~(maskr << lsb); + ppd->cpspec->ibcctrl_b |= (((u64) val & maskr) << lsb); + qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b); + qib_write_kreg(dd, kr_scratch, 0); +bail: + return ret; +} + +static int qib_7322_set_loopback(struct qib_pportdata *ppd, const char *what) +{ + int ret = 0; + u64 val, ctrlb; + + /* only IBC loopback, may add serdes and xgxs loopbacks later */ + if (!strncmp(what, "ibc", 3)) { + ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, + Loopback); + val = 0; /* disable heart beat, so link will come up */ + qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n", + ppd->dd->unit, ppd->port); + } else if (!strncmp(what, "off", 3)) { + ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, + Loopback); + /* enable heart beat again */ + val = IBA7322_IBC_HRTBT_RMASK << IBA7322_IBC_HRTBT_LSB; + qib_devinfo(ppd->dd->pcidev, + "Disabling IB%u:%u IBC loopback (normal)\n", + ppd->dd->unit, ppd->port); + } else + ret = -EINVAL; + if (!ret) { + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + ctrlb = ppd->cpspec->ibcctrl_b & ~(IBA7322_IBC_HRTBT_MASK + << IBA7322_IBC_HRTBT_LSB); + ppd->cpspec->ibcctrl_b = ctrlb | val; + qib_write_kreg_port(ppd, krp_ibcctrl_b, + ppd->cpspec->ibcctrl_b); + qib_write_kreg(ppd->dd, kr_scratch, 0); + } + return ret; +} + +static void get_vl_weights(struct qib_pportdata *ppd, unsigned regno, + struct ib_vl_weight_elem *vl) +{ + unsigned i; + + for (i = 0; i < 16; i++, regno++, vl++) { + u32 val = qib_read_kreg_port(ppd, regno); + + vl->vl = (val >> SYM_LSB(LowPriority0_0, VirtualLane)) & + SYM_RMASK(LowPriority0_0, VirtualLane); + vl->weight = (val >> SYM_LSB(LowPriority0_0, Weight)) & + SYM_RMASK(LowPriority0_0, Weight); + } +} + +static void set_vl_weights(struct qib_pportdata *ppd, unsigned regno, + struct ib_vl_weight_elem *vl) +{ + unsigned i; + + for (i = 0; i < 16; i++, regno++, vl++) { + u64 val; + + val = ((vl->vl & SYM_RMASK(LowPriority0_0, VirtualLane)) << + SYM_LSB(LowPriority0_0, VirtualLane)) | + ((vl->weight & SYM_RMASK(LowPriority0_0, Weight)) << + SYM_LSB(LowPriority0_0, Weight)); + qib_write_kreg_port(ppd, regno, val); + } + if (!(ppd->p_sendctrl & SYM_MASK(SendCtrl_0, IBVLArbiterEn))) { + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, IBVLArbiterEn); + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + } +} + +static int qib_7322_get_ib_table(struct qib_pportdata *ppd, int which, void *t) +{ + switch (which) { + case QIB_IB_TBL_VL_HIGH_ARB: + get_vl_weights(ppd, krp_highprio_0, t); + break; + + case QIB_IB_TBL_VL_LOW_ARB: + get_vl_weights(ppd, krp_lowprio_0, t); + break; + + default: + return -EINVAL; + } + return 0; +} + +static int qib_7322_set_ib_table(struct qib_pportdata *ppd, int which, void *t) +{ + switch (which) { + case QIB_IB_TBL_VL_HIGH_ARB: + set_vl_weights(ppd, krp_highprio_0, t); + break; + + case QIB_IB_TBL_VL_LOW_ARB: + set_vl_weights(ppd, krp_lowprio_0, t); + break; + + default: + return -EINVAL; + } + return 0; +} + +static void qib_update_7322_usrhead(struct qib_ctxtdata *rcd, u64 hd, + u32 updegr, u32 egrhd, u32 npkts) +{ + /* + * Need to write timeout register before updating rcvhdrhead to 
ensure + * that the timer is enabled on reception of a packet. + */ + if (hd >> IBA7322_HDRHEAD_PKTINT_SHIFT) + adjust_rcv_timeout(rcd, npkts); + if (updegr) + qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt); + mmiowb(); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt); + mmiowb(); +} + +static u32 qib_7322_hdrqempty(struct qib_ctxtdata *rcd) +{ + u32 head, tail; + + head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt); + if (rcd->rcvhdrtail_kvaddr) + tail = qib_get_rcvhdrtail(rcd); + else + tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt); + return head == tail; +} + +#define RCVCTRL_COMMON_MODS (QIB_RCVCTRL_CTXT_ENB | \ + QIB_RCVCTRL_CTXT_DIS | \ + QIB_RCVCTRL_TIDFLOW_ENB | \ + QIB_RCVCTRL_TIDFLOW_DIS | \ + QIB_RCVCTRL_TAILUPD_ENB | \ + QIB_RCVCTRL_TAILUPD_DIS | \ + QIB_RCVCTRL_INTRAVAIL_ENB | \ + QIB_RCVCTRL_INTRAVAIL_DIS | \ + QIB_RCVCTRL_BP_ENB | \ + QIB_RCVCTRL_BP_DIS) + +#define RCVCTRL_PORT_MODS (QIB_RCVCTRL_CTXT_ENB | \ + QIB_RCVCTRL_CTXT_DIS | \ + QIB_RCVCTRL_PKEY_DIS | \ + QIB_RCVCTRL_PKEY_ENB) + +/* + * Modify the RCVCTRL register in chip-specific way. This + * is a function because bit positions and (future) register + * location is chip-specifc, but the needed operations are + * generic. is a bit-mask because we often want to + * do multiple modifications. + */ +static void rcvctrl_7322_mod(struct qib_pportdata *ppd, unsigned int op, + int ctxt) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + u64 mask, val; + unsigned long flags; + + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + + if (op & QIB_RCVCTRL_TIDFLOW_ENB) + dd->rcvctrl |= SYM_MASK(RcvCtrl, TidFlowEnable); + if (op & QIB_RCVCTRL_TIDFLOW_DIS) + dd->rcvctrl &= ~SYM_MASK(RcvCtrl, TidFlowEnable); + if (op & QIB_RCVCTRL_TAILUPD_ENB) + dd->rcvctrl |= SYM_MASK(RcvCtrl, TailUpd); + if (op & QIB_RCVCTRL_TAILUPD_DIS) + dd->rcvctrl &= ~SYM_MASK(RcvCtrl, TailUpd); + if (op & QIB_RCVCTRL_PKEY_ENB) + ppd->p_rcvctrl &= ~SYM_MASK(RcvCtrl_0, RcvPartitionKeyDisable); + if (op & QIB_RCVCTRL_PKEY_DIS) + ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvPartitionKeyDisable); + if (ctxt < 0) { + mask = (1ULL << dd->ctxtcnt) - 1; + rcd = NULL; + } else { + mask = (1ULL << ctxt); + rcd = dd->rcd[ctxt]; + } + if ((op & QIB_RCVCTRL_CTXT_ENB) && rcd) { + ppd->p_rcvctrl |= + (mask << SYM_LSB(RcvCtrl_0, ContextEnableKernel)); + if (!(dd->flags & QIB_NODMA_RTAIL)) { + op |= QIB_RCVCTRL_TAILUPD_ENB; /* need reg write */ + dd->rcvctrl |= SYM_MASK(RcvCtrl, TailUpd); + } + /* Write these registers before the context is enabled. */ + qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, ctxt, + rcd->rcvhdrqtailaddr_phys); + qib_write_kreg_ctxt(dd, krc_rcvhdraddr, ctxt, + rcd->rcvhdrq_phys); + rcd->seq_cnt = 1; + } + if (op & QIB_RCVCTRL_CTXT_DIS) + ppd->p_rcvctrl &= + ~(mask << SYM_LSB(RcvCtrl_0, ContextEnableKernel)); + if (op & QIB_RCVCTRL_BP_ENB) + dd->rcvctrl |= mask << SYM_LSB(RcvCtrl, dontDropRHQFull); + if (op & QIB_RCVCTRL_BP_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, dontDropRHQFull)); + if (op & QIB_RCVCTRL_INTRAVAIL_ENB) + dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, IntrAvail)); + if (op & QIB_RCVCTRL_INTRAVAIL_DIS) + dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, IntrAvail)); + /* + * Decide which registers to write depending on the ops enabled. + * Special case is "flush" (no bits set at all) + * which needs to write both. 
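+ * RCVCTRL_COMMON_MODS selects the device-wide kr_rcvctrl,
+ * RCVCTRL_PORT_MODS the per-port krp_rcvctrl.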
+ */ + if (op == 0 || (op & RCVCTRL_COMMON_MODS)) + qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl); + if (op == 0 || (op & RCVCTRL_PORT_MODS)) + qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl); + if ((op & QIB_RCVCTRL_CTXT_ENB) && dd->rcd[ctxt]) { + /* + * Init the context registers also; if we were + * disabled, tail and head should both be zero + * already from the enable, but since we don't + * know, we have to do it explicitly. + */ + val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt); + qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt); + + /* be sure enabling write seen; hd/tl should be 0 */ + (void) qib_read_kreg32(dd, kr_scratch); + val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt); + dd->rcd[ctxt]->head = val; + /* If kctxt, interrupt on next receive. */ + if (ctxt < dd->first_user_ctxt) + val |= dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } else if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && + dd->rcd[ctxt] && dd->rhdrhead_intr_off) { + /* arm rcv interrupt */ + val = dd->rcd[ctxt]->head | dd->rhdrhead_intr_off; + qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt); + } + if (op & QIB_RCVCTRL_CTXT_DIS) { + unsigned f; + + /* Now that the context is disabled, clear these registers. */ + if (ctxt >= 0) { + qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, ctxt, 0); + qib_write_kreg_ctxt(dd, krc_rcvhdraddr, ctxt, 0); + for (f = 0; f < NUM_TIDFLOWS_CTXT; f++) + qib_write_ureg(dd, ur_rcvflowtable + f, + TIDFLOW_ERRBITS, ctxt); + } else { + unsigned i; + + for (i = 0; i < dd->cfgctxts; i++) { + qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, + i, 0); + qib_write_kreg_ctxt(dd, krc_rcvhdraddr, i, 0); + for (f = 0; f < NUM_TIDFLOWS_CTXT; f++) + qib_write_ureg(dd, ur_rcvflowtable + f, + TIDFLOW_ERRBITS, i); + } + } + } + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); +} + +/* + * Modify the SENDCTRL register in chip-specific way. This + * is a function where there are multiple such registers with + * slightly different layouts. + * The chip doesn't allow back-to-back sendctrl writes, so write + * the scratch register after writing sendctrl. + * + * Which register is written depends on the operation. + * Most operate on the common register, while + * SEND_ENB and SEND_DIS operate on the per-port ones. 
+ * SEND_ENB is included in common because it can change SPCL_TRIG + */ +#define SENDCTRL_COMMON_MODS (\ + QIB_SENDCTRL_CLEAR | \ + QIB_SENDCTRL_AVAIL_DIS | \ + QIB_SENDCTRL_AVAIL_ENB | \ + QIB_SENDCTRL_AVAIL_BLIP | \ + QIB_SENDCTRL_DISARM | \ + QIB_SENDCTRL_DISARM_ALL | \ + QIB_SENDCTRL_SEND_ENB) + +#define SENDCTRL_PORT_MODS (\ + QIB_SENDCTRL_CLEAR | \ + QIB_SENDCTRL_SEND_ENB | \ + QIB_SENDCTRL_SEND_DIS | \ + QIB_SENDCTRL_FLUSH) + +static void sendctrl_7322_mod(struct qib_pportdata *ppd, u32 op) +{ + struct qib_devdata *dd = ppd->dd; + u64 tmp_dd_sendctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + /* First the dd ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_CLEAR) + dd->sendctrl = 0; + if (op & QIB_SENDCTRL_AVAIL_DIS) + dd->sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + else if (op & QIB_SENDCTRL_AVAIL_ENB) { + dd->sendctrl |= SYM_MASK(SendCtrl, SendBufAvailUpd); + if (dd->flags & QIB_USE_SPCL_TRIG) + dd->sendctrl |= SYM_MASK(SendCtrl, SpecialTriggerEn); + } + + /* Then the ppd ones that are "sticky", saved in shadow */ + if (op & QIB_SENDCTRL_SEND_DIS) + ppd->p_sendctrl &= ~SYM_MASK(SendCtrl_0, SendEnable); + else if (op & QIB_SENDCTRL_SEND_ENB) + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, SendEnable); + + if (op & QIB_SENDCTRL_DISARM_ALL) { + u32 i, last; + + tmp_dd_sendctrl = dd->sendctrl; + last = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + /* + * Disarm any buffers that are not yet launched, + * disabling updates until done. + */ + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + for (i = 0; i < last; i++) { + qib_write_kreg(dd, kr_sendctrl, + tmp_dd_sendctrl | + SYM_MASK(SendCtrl, Disarm) | i); + qib_write_kreg(dd, kr_scratch, 0); + } + } + + if (op & QIB_SENDCTRL_FLUSH) { + u64 tmp_ppd_sendctrl = ppd->p_sendctrl; + + /* + * Now drain all the fifos. The Abort bit should never be + * needed, so for now, at least, we don't use it. + */ + tmp_ppd_sendctrl |= + SYM_MASK(SendCtrl_0, TxeDrainRmFifo) | + SYM_MASK(SendCtrl_0, TxeDrainLaFifo) | + SYM_MASK(SendCtrl_0, TxeBypassIbc); + qib_write_kreg_port(ppd, krp_sendctrl, tmp_ppd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + tmp_dd_sendctrl = dd->sendctrl; + + if (op & QIB_SENDCTRL_DISARM) + tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) | + ((op & QIB_7322_SendCtrl_DisarmSendBuf_RMASK) << + SYM_LSB(SendCtrl, DisarmSendBuf)); + if ((op & QIB_SENDCTRL_AVAIL_BLIP) && + (dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd))) + tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd); + + if (op == 0 || (op & SENDCTRL_COMMON_MODS)) { + qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + if (op == 0 || (op & SENDCTRL_PORT_MODS)) { + qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + if (op & QIB_SENDCTRL_AVAIL_BLIP) { + qib_write_kreg(dd, kr_sendctrl, dd->sendctrl); + qib_write_kreg(dd, kr_scratch, 0); + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + + if (op & QIB_SENDCTRL_FLUSH) { + u32 v; + /* + * ensure writes have hit chip, then do a few + * more reads, to allow DMA of pioavail registers + * to occur, so in-memory copy is in sync with + * the chip. Not always safe to sleep. 
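+ * The read/write/read sequence on the scratch register below
+ * serves as that flush, so no delay or sleep is needed here.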
+ */ + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + v = qib_read_kreg32(dd, kr_scratch); + qib_write_kreg(dd, kr_scratch, v); + qib_read_kreg32(dd, kr_scratch); + } +} + +#define _PORT_VIRT_FLAG 0x8000U /* "virtual", need adjustments */ +#define _PORT_64BIT_FLAG 0x10000U /* not "virtual", but 64bit */ +#define _PORT_CNTR_IDXMASK 0x7fffU /* mask off flags above */ + +/** + * qib_portcntr_7322 - read a per-port chip counter + * @ppd: the qlogic_ib pport + * @creg: the counter to read (not a chip offset) + */ +static u64 qib_portcntr_7322(struct qib_pportdata *ppd, u32 reg) +{ + struct qib_devdata *dd = ppd->dd; + u64 ret = 0ULL; + u16 creg; + /* 0xffff for unimplemented or synthesized counters */ + static const u32 xlator[] = { + [QIBPORTCNTR_PKTSEND] = crp_pktsend | _PORT_64BIT_FLAG, + [QIBPORTCNTR_WORDSEND] = crp_wordsend | _PORT_64BIT_FLAG, + [QIBPORTCNTR_PSXMITDATA] = crp_psxmitdatacount, + [QIBPORTCNTR_PSXMITPKTS] = crp_psxmitpktscount, + [QIBPORTCNTR_PSXMITWAIT] = crp_psxmitwaitcount, + [QIBPORTCNTR_SENDSTALL] = crp_sendstall, + [QIBPORTCNTR_PKTRCV] = crp_pktrcv | _PORT_64BIT_FLAG, + [QIBPORTCNTR_PSRCVDATA] = crp_psrcvdatacount, + [QIBPORTCNTR_PSRCVPKTS] = crp_psrcvpktscount, + [QIBPORTCNTR_RCVEBP] = crp_rcvebp, + [QIBPORTCNTR_RCVOVFL] = crp_rcvovfl, + [QIBPORTCNTR_WORDRCV] = crp_wordrcv | _PORT_64BIT_FLAG, + [QIBPORTCNTR_RXDROPPKT] = 0xffff, /* not needed for 7322 */ + [QIBPORTCNTR_RXLOCALPHYERR] = crp_rxotherlocalphyerr, + [QIBPORTCNTR_RXVLERR] = crp_rxvlerr, + [QIBPORTCNTR_ERRICRC] = crp_erricrc, + [QIBPORTCNTR_ERRVCRC] = crp_errvcrc, + [QIBPORTCNTR_ERRLPCRC] = crp_errlpcrc, + [QIBPORTCNTR_BADFORMAT] = crp_badformat, + [QIBPORTCNTR_ERR_RLEN] = crp_err_rlen, + [QIBPORTCNTR_IBSYMBOLERR] = crp_ibsymbolerr, + [QIBPORTCNTR_INVALIDRLEN] = crp_invalidrlen, + [QIBPORTCNTR_UNSUPVL] = crp_txunsupvl, + [QIBPORTCNTR_EXCESSBUFOVFL] = crp_excessbufferovfl, + [QIBPORTCNTR_ERRLINK] = crp_errlink, + [QIBPORTCNTR_IBLINKDOWN] = crp_iblinkdown, + [QIBPORTCNTR_IBLINKERRRECOV] = crp_iblinkerrrecov, + [QIBPORTCNTR_LLI] = crp_locallinkintegrityerr, + [QIBPORTCNTR_VL15PKTDROP] = crp_vl15droppedpkt, + [QIBPORTCNTR_ERRPKEY] = crp_errpkey, + /* + * the next 3 aren't really counters, but were implemented + * as counters in older chips, so still get accessed as + * though they were counters from this code. + */ + [QIBPORTCNTR_PSINTERVAL] = krp_psinterval, + [QIBPORTCNTR_PSSTART] = krp_psstart, + [QIBPORTCNTR_PSSTAT] = krp_psstat, + /* pseudo-counter, summed for all ports */ + [QIBPORTCNTR_KHDROVFL] = 0xffff, + }; + + if (reg >= ARRAY_SIZE(xlator)) { + qib_devinfo(ppd->dd->pcidev, + "Unimplemented portcounter %u\n", reg); + goto done; + } + creg = xlator[reg] & _PORT_CNTR_IDXMASK; + + /* handle non-counters and special cases first */ + if (reg == QIBPORTCNTR_KHDROVFL) { + int i; + + /* sum over all kernel contexts (skip if mini_init) */ + for (i = 0; dd->rcd && i < dd->first_user_ctxt; i++) { + struct qib_ctxtdata *rcd = dd->rcd[i]; + + if (!rcd || rcd->ppd != ppd) + continue; + ret += read_7322_creg32(dd, cr_base_egrovfl + i); + } + goto done; + } else if (reg == QIBPORTCNTR_RXDROPPKT) { + /* + * Used as part of the synthesis of port_rcv_errors + * in the verbs code for IBTA counters. Not needed for 7322, + * because all the errors are already counted by other cntrs. 
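+ * Simply report zero here (ret is still 0ULL).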
+ */ + goto done; + } else if (reg == QIBPORTCNTR_PSINTERVAL || + reg == QIBPORTCNTR_PSSTART || reg == QIBPORTCNTR_PSSTAT) { + /* were counters in older chips, now per-port kernel regs */ + ret = qib_read_kreg_port(ppd, creg); + goto done; + } + + /* + * Only fast increment counters are 64 bits; use 32 bit reads to + * avoid two independent reads when on Opteron. + */ + if (xlator[reg] & _PORT_64BIT_FLAG) + ret = read_7322_creg_port(ppd, creg); + else + ret = read_7322_creg32_port(ppd, creg); + if (creg == crp_ibsymbolerr) { + if (ppd->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->ibsymsnap; + ret -= ppd->cpspec->ibsymdelta; + } else if (creg == crp_iblinkerrrecov) { + if (ppd->cpspec->ibdeltainprog) + ret -= ret - ppd->cpspec->iblnkerrsnap; + ret -= ppd->cpspec->iblnkerrdelta; + } else if (creg == crp_errlink) + ret -= ppd->cpspec->ibmalfdelta; + else if (creg == crp_iblinkdown) + ret += ppd->cpspec->iblnkdowndelta; +done: + return ret; +} + +/* + * Device counter names (not port-specific), one line per stat, + * single string. Used by utilities like ipathstats to print the stats + * in a way which works for different versions of drivers, without changing + * the utility. Names need to be 12 chars or less (w/o newline), for proper + * display by utility. + * Non-error counters are first. + * Start of "error" conters is indicated by a leading "E " on the first + * "error" counter, and doesn't count in label length. + * The EgrOvfl list needs to be last so we truncate them at the configured + * context count for the device. + * cntr7322indices contains the corresponding register indices. + */ +static const char cntr7322names[] = + "Interrupts\n" + "HostBusStall\n" + "E RxTIDFull\n" + "RxTIDInvalid\n" + "RxTIDFloDrop\n" /* 7322 only */ + "Ctxt0EgrOvfl\n" + "Ctxt1EgrOvfl\n" + "Ctxt2EgrOvfl\n" + "Ctxt3EgrOvfl\n" + "Ctxt4EgrOvfl\n" + "Ctxt5EgrOvfl\n" + "Ctxt6EgrOvfl\n" + "Ctxt7EgrOvfl\n" + "Ctxt8EgrOvfl\n" + "Ctxt9EgrOvfl\n" + "Ctx10EgrOvfl\n" + "Ctx11EgrOvfl\n" + "Ctx12EgrOvfl\n" + "Ctx13EgrOvfl\n" + "Ctx14EgrOvfl\n" + "Ctx15EgrOvfl\n" + "Ctx16EgrOvfl\n" + "Ctx17EgrOvfl\n" + ; + +static const u32 cntr7322indices[] = { + cr_lbint | _PORT_64BIT_FLAG, + cr_lbstall | _PORT_64BIT_FLAG, + cr_tidfull, + cr_tidinvalid, + cr_rxtidflowdrop, + cr_base_egrovfl + 0, + cr_base_egrovfl + 1, + cr_base_egrovfl + 2, + cr_base_egrovfl + 3, + cr_base_egrovfl + 4, + cr_base_egrovfl + 5, + cr_base_egrovfl + 6, + cr_base_egrovfl + 7, + cr_base_egrovfl + 8, + cr_base_egrovfl + 9, + cr_base_egrovfl + 10, + cr_base_egrovfl + 11, + cr_base_egrovfl + 12, + cr_base_egrovfl + 13, + cr_base_egrovfl + 14, + cr_base_egrovfl + 15, + cr_base_egrovfl + 16, + cr_base_egrovfl + 17, +}; + +/* + * same as cntr7322names and cntr7322indices, but for port-specific counters. 
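+ * The same 12-character name limit presumably applies here as well.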
+ * portcntr7322indices is somewhat complicated by some registers needing + * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG + */ +static const char portcntr7322names[] = + "TxPkt\n" + "TxFlowPkt\n" + "TxWords\n" + "RxPkt\n" + "RxFlowPkt\n" + "RxWords\n" + "TxFlowStall\n" + "TxDmaDesc\n" /* 7220 and 7322-only */ + "E RxDlidFltr\n" /* 7220 and 7322-only */ + "IBStatusChng\n" + "IBLinkDown\n" + "IBLnkRecov\n" + "IBRxLinkErr\n" + "IBSymbolErr\n" + "RxLLIErr\n" + "RxBadFormat\n" + "RxBadLen\n" + "RxBufOvrfl\n" + "RxEBP\n" + "RxFlowCtlErr\n" + "RxICRCerr\n" + "RxLPCRCerr\n" + "RxVCRCerr\n" + "RxInvalLen\n" + "RxInvalPKey\n" + "RxPktDropped\n" + "TxBadLength\n" + "TxDropped\n" + "TxInvalLen\n" + "TxUnderrun\n" + "TxUnsupVL\n" + "RxLclPhyErr\n" /* 7220 and 7322-only from here down */ + "RxVL15Drop\n" + "RxVlErr\n" + "XcessBufOvfl\n" + "RxQPBadCtxt\n" /* 7322-only from here down */ + "TXBadHeader\n" + ; + +static const u32 portcntr7322indices[] = { + QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG, + crp_pktsendflow, + QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG, + QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG, + crp_pktrcvflowctrl, + QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG, + QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG, + crp_txsdmadesc | _PORT_64BIT_FLAG, + crp_rxdlidfltr, + crp_ibstatuschange, + QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG, + QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_LLI | _PORT_VIRT_FLAG, + QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG, + QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG, + crp_rcvflowctrlviol, + QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG, + QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG, + QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG, + crp_txminmaxlenerr, + crp_txdroppedpkt, + crp_txlenerr, + crp_txunderrun, + crp_txunsupvl, + QIBPORTCNTR_RXLOCALPHYERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_VL15PKTDROP | _PORT_VIRT_FLAG, + QIBPORTCNTR_RXVLERR | _PORT_VIRT_FLAG, + QIBPORTCNTR_EXCESSBUFOVFL | _PORT_VIRT_FLAG, + crp_rxqpinvalidctxt, + crp_txhdrerr, +}; + +/* do all the setup to make the counter reads efficient later */ +static void init_7322_cntrnames(struct qib_devdata *dd) +{ + int i, j = 0; + char *s; + + for (i = 0, s = (char *)cntr7322names; s && j <= dd->cfgctxts; + i++) { + /* we always have at least one counter before the egrovfl */ + if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12)) + j = 1; + s = strchr(s + 1, '\n'); + if (s && j) + j++; + } + dd->cspec->ncntrs = i; + if (!s) + /* full list; size is without terminating null */ + dd->cspec->cntrnamelen = sizeof(cntr7322names) - 1; + else + dd->cspec->cntrnamelen = 1 + s - cntr7322names; + dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs + * sizeof(u64), GFP_KERNEL); + + for (i = 0, s = (char *)portcntr7322names; s; i++) + s = strchr(s + 1, '\n'); + dd->cspec->nportcntrs = i - 1; + dd->cspec->portcntrnamelen = sizeof(portcntr7322names) - 1; + for (i = 0; i < dd->num_pports; ++i) { + dd->pport[i].cpspec->portcntrs = kmalloc(dd->cspec->nportcntrs + * sizeof(u64), GFP_KERNEL); + } +} + +static u32 qib_read_7322cntrs(struct qib_devdata *dd, loff_t pos, char **namep, + u64 **cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->cntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *) cntr7322names; + } 
else { + u64 *cntr = dd->cspec->cntrs; + int i; + + ret = dd->cspec->ncntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->ncntrs; i++) + if (cntr7322indices[i] & _PORT_64BIT_FLAG) + *cntr++ = read_7322_creg(dd, + cntr7322indices[i] & + _PORT_CNTR_IDXMASK); + else + *cntr++ = read_7322_creg32(dd, + cntr7322indices[i]); + } +done: + return ret; +} + +static u32 qib_read_7322portcntrs(struct qib_devdata *dd, loff_t pos, u32 port, + char **namep, u64 **cntrp) +{ + u32 ret; + + if (namep) { + ret = dd->cspec->portcntrnamelen; + if (pos >= ret) + ret = 0; /* final read after getting everything */ + else + *namep = (char *)portcntr7322names; + } else { + struct qib_pportdata *ppd = &dd->pport[port]; + u64 *cntr = ppd->cpspec->portcntrs; + int i; + + ret = dd->cspec->nportcntrs * sizeof(u64); + if (!cntr || pos >= ret) { + /* everything read, or couldn't get memory */ + ret = 0; + goto done; + } + *cntrp = cntr; + for (i = 0; i < dd->cspec->nportcntrs; i++) { + if (portcntr7322indices[i] & _PORT_VIRT_FLAG) + *cntr++ = qib_portcntr_7322(ppd, + portcntr7322indices[i] & + _PORT_CNTR_IDXMASK); + else if (portcntr7322indices[i] & _PORT_64BIT_FLAG) + *cntr++ = read_7322_creg_port(ppd, + portcntr7322indices[i] & + _PORT_CNTR_IDXMASK); + else + *cntr++ = read_7322_creg32_port(ppd, + portcntr7322indices[i]); + } + } +done: + return ret; +} + +/** + * qib_get_7322_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the qlogic_ib device qib_devdata + * + * VESTIGIAL IBA7322 has no "small fast counters", so the only + * real purpose of this function is to maintain the notion of + * "active time", which in turn is only logged into the eeprom, + * which we don;t have, yet, for 7322-based boards. + * + * called from add_timer + */ +static void qib_get_7322_faststats(struct timer_list *t) +{ + struct qib_devdata *dd = from_timer(dd, t, stats_timer); + struct qib_pportdata *ppd; + unsigned long flags; + u64 traffic_wds; + int pidx; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* + * If port isn't enabled or not operational ports, or + * diags is running (can cause memory diags to fail) + * skip this port this time. + */ + if (!ppd->link_speed_supported || !(dd->flags & QIB_INITTED) + || dd->diag_client) + continue; + + /* + * Maintain an activity timer, based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = qib_portcntr_7322(ppd, QIBPORTCNTR_WORDRCV) + + qib_portcntr_7322(ppd, QIBPORTCNTR_WORDSEND); + spin_lock_irqsave(&ppd->dd->eep_st_lock, flags); + traffic_wds -= ppd->dd->traffic_wds; + ppd->dd->traffic_wds += traffic_wds; + spin_unlock_irqrestore(&ppd->dd->eep_st_lock, flags); + if (ppd->cpspec->qdr_dfe_on && (ppd->link_speed_active & + QIB_IB_QDR) && + (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE)) && + ppd->cpspec->qdr_dfe_time && + time_is_before_jiffies(ppd->cpspec->qdr_dfe_time)) { + ppd->cpspec->qdr_dfe_on = 0; + + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? + QDR_STATIC_ADAPT_INIT_R1 : + QDR_STATIC_ADAPT_INIT); + force_h1(ppd); + } + } + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); +} + +/* + * If we were using MSIx, try to fallback to INTx. 
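+ * Used when no MSI-X interrupt is seen within 1/2 second of setup
+ * (see the comment above qib_setup_7322_interrupt() and
+ * verify_interrupt()).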
+ */ +static int qib_7322_intr_fallback(struct qib_devdata *dd) +{ + if (!dd->cspec->num_msix_entries) + return 0; /* already using INTx */ + + qib_devinfo(dd->pcidev, + "MSIx interrupt not detected, trying INTx interrupts\n"); + qib_7322_free_irq(dd); + if (pci_alloc_irq_vectors(dd->pcidev, 1, 1, PCI_IRQ_LEGACY) < 0) + qib_dev_err(dd, "Failed to enable INTx\n"); + qib_setup_7322_interrupt(dd, 0); + return 1; +} + +/* + * Reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well, then return to previous state (which may be still in reset) + * NOTE: some callers of this "know" this writes the current value + * of cpspec->ibcctrl_a as part of it's operation, so if that changes, + * check all callers. + */ +static void qib_7322_mini_pcs_reset(struct qib_pportdata *ppd) +{ + u64 val; + struct qib_devdata *dd = ppd->dd; + const u64 reset_bits = SYM_MASK(IBPCSConfig_0, xcv_rreset) | + SYM_MASK(IBPCSConfig_0, xcv_treset) | + SYM_MASK(IBPCSConfig_0, tx_rx_reset); + + val = qib_read_kreg_port(ppd, krp_ib_pcsconfig); + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask & ~HWE_MASK(statusValidNoEop)); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a & + ~SYM_MASK(IBCCtrlA_0, IBLinkEn)); + + qib_write_kreg_port(ppd, krp_ib_pcsconfig, val | reset_bits); + qib_read_kreg32(dd, kr_scratch); + qib_write_kreg_port(ppd, krp_ib_pcsconfig, val & ~reset_bits); + qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a); + qib_write_kreg(dd, kr_scratch, 0ULL); + qib_write_kreg(dd, kr_hwerrclear, + SYM_MASK(HwErrClear, statusValidNoEopClear)); + qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask); +} + +/* + * This code for non-IBTA-compliant IB speed negotiation is only known to + * work for the SDR to DDR transition, and only between an HCA and a switch + * with recent firmware. It is based on observed heuristics, rather than + * actual knowledge of the non-compliant speed negotiation. + * It has a number of hard-coded fields, since the hope is to rewrite this + * when a spec is available on how the negoation is intended to work. + */ +static void autoneg_7322_sendpkt(struct qib_pportdata *ppd, u32 *hdr, + u32 dcnt, u32 *data) +{ + int i; + u64 pbc; + u32 __iomem *piobuf; + u32 pnum, control, len; + struct qib_devdata *dd = ppd->dd; + + i = 0; + len = 7 + dcnt + 1; /* 7 dword header, dword data, icrc */ + control = qib_7322_setpbc_control(ppd, len, 0, 15); + pbc = ((u64) control << 32) | len; + while (!(piobuf = qib_7322_getsendbuf(ppd, pbc, &pnum))) { + if (i++ > 15) + return; + udelay(2); + } + /* disable header check on this packet, since it can't be valid */ + dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_DIS1, NULL); + writeq(pbc, piobuf); + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdr, 7); + qib_pio_copy(piobuf + 9, data, dcnt); + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pnum >= dd->piobcnt2k) ? 
2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + qib_flush_wc(); + qib_sendbuf_done(dd, pnum); + /* and re-enable hdr check */ + dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_ENAB1, NULL); +} + +/* + * _start packet gets sent twice at start, _done gets sent twice at end + */ +static void qib_autoneg_7322_send(struct qib_pportdata *ppd, int which) +{ + struct qib_devdata *dd = ppd->dd; + static u32 swapped; + u32 dw, i, hcnt, dcnt, *data; + static u32 hdr[7] = { 0xf002ffff, 0x48ffff, 0x6400abba }; + static u32 madpayload_start[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x1388, 0x15e, 0x1, /* rest 0's */ + }; + static u32 madpayload_done[0x40] = { + 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0, + 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x40000001, 0x1388, 0x15e, /* rest 0's */ + }; + + dcnt = ARRAY_SIZE(madpayload_start); + hcnt = ARRAY_SIZE(hdr); + if (!swapped) { + /* for maintainability, do it at runtime */ + for (i = 0; i < hcnt; i++) { + dw = (__force u32) cpu_to_be32(hdr[i]); + hdr[i] = dw; + } + for (i = 0; i < dcnt; i++) { + dw = (__force u32) cpu_to_be32(madpayload_start[i]); + madpayload_start[i] = dw; + dw = (__force u32) cpu_to_be32(madpayload_done[i]); + madpayload_done[i] = dw; + } + swapped = 1; + } + + data = which ? madpayload_done : madpayload_start; + + autoneg_7322_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); + autoneg_7322_sendpkt(ppd, hdr, dcnt, data); + qib_read_kreg64(dd, kr_scratch); + udelay(2); +} + +/* + * Do the absolute minimum to cause an IB speed change, and make it + * ready, but don't actually trigger the change. The caller will + * do that when ready (if link is in Polling training state, it will + * happen immediately, otherwise when link next goes down) + * + * This routine should only be used as part of the DDR autonegotation + * code for devices that are not compliant with IB 1.2 (or code that + * fixes things up for same). + * + * When link has gone down, and autoneg enabled, or autoneg has + * failed and we give up until next time we set both speeds, and + * then we want IBTA enabled as well as "use max enabled speed. + */ +static void set_7322_ibspeed_fast(struct qib_pportdata *ppd, u32 speed) +{ + u64 newctrlb; + + newctrlb = ppd->cpspec->ibcctrl_b & ~(IBA7322_IBC_SPEED_MASK | + IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK); + + if (speed & (speed - 1)) /* multiple speeds */ + newctrlb |= (speed << IBA7322_IBC_SPEED_LSB) | + IBA7322_IBC_IBTA_1_2_MASK | + IBA7322_IBC_MAX_SPEED_MASK; + else + newctrlb |= speed == QIB_IB_QDR ? + IBA7322_IBC_SPEED_QDR | IBA7322_IBC_IBTA_1_2_MASK : + ((speed == QIB_IB_DDR ? + IBA7322_IBC_SPEED_DDR : IBA7322_IBC_SPEED_SDR)); + + if (newctrlb == ppd->cpspec->ibcctrl_b) + return; + + ppd->cpspec->ibcctrl_b = newctrlb; + qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b); + qib_write_kreg(ppd->dd, kr_scratch, 0); +} + +/* + * This routine is only used when we are not talking to another + * IB 1.2-compliant device that we think can do DDR. + * (This includes all existing switch chips as of Oct 2007.) 
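+ * It sends the "start" autoneg MADs, forces the link to DDR only,
+ * resets the PCS, and then lets autoneg_7322_work() finish the
+ * handshake, scheduled 2 msec later (one minimum poll cycle).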
+ * 1.2-compliant devices go directly to DDR prior to reaching INIT + */ +static void try_7322_autoneg(struct qib_pportdata *ppd) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_AUTONEG_INPROG; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + qib_autoneg_7322_send(ppd, 0); + set_7322_ibspeed_fast(ppd, QIB_IB_DDR); + qib_7322_mini_pcs_reset(ppd); + /* 2 msec is minimum length of a poll cycle */ + queue_delayed_work(ib_wq, &ppd->cpspec->autoneg_work, + msecs_to_jiffies(2)); +} + +/* + * Handle the empirically determined mechanism for auto-negotiation + * of DDR speed with switches. + */ +static void autoneg_7322_work(struct work_struct *work) +{ + struct qib_pportdata *ppd; + u32 i; + unsigned long flags; + + ppd = container_of(work, struct qib_chippport_specific, + autoneg_work.work)->ppd; + + /* + * Busy wait for this first part, it should be at most a + * few hundred usec, since we scheduled ourselves for 2msec. + */ + for (i = 0; i < 25; i++) { + if (SYM_FIELD(ppd->lastibcstat, IBCStatusA_0, LinkState) + == IB_7322_LT_STATE_POLLQUIET) { + qib_set_linkstate(ppd, QIB_IB_LINKDOWN_DISABLE); + break; + } + udelay(100); + } + + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + goto done; /* we got there early or told to stop */ + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(90))) + goto done; + qib_7322_mini_pcs_reset(ppd); + + /* we expect this to timeout */ + if (wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(1700))) + goto done; + qib_7322_mini_pcs_reset(ppd); + + set_7322_ibspeed_fast(ppd, QIB_IB_SDR); + + /* + * Wait up to 250 msec for link to train and get to INIT at DDR; + * this should terminate early. + */ + wait_event_timeout(ppd->cpspec->autoneg_wait, + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG), + msecs_to_jiffies(250)); +done: + if (ppd->lflags & QIBL_IB_AUTONEG_INPROG) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG; + if (ppd->cpspec->autoneg_tries == AUTONEG_TRIES) { + ppd->lflags |= QIBL_IB_AUTONEG_FAILED; + ppd->cpspec->autoneg_tries = 0; + } + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled); + } +} + +/* + * This routine is used to request IPG set in the QLogic switch. + * Only called if r1. 
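+ * The attempt is retried via ipg_7322_work() with an exponential
+ * backoff: each pass queues the next try (2 << ipg_tries) msec out,
+ * e.g. 2, 4, 8, ... msec, and retries stop after roughly 10 tries.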
+ */ +static void try_7322_ipg(struct qib_pportdata *ppd) +{ + struct qib_ibport *ibp = &ppd->ibport_data; + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent; + struct ib_smp *smp; + unsigned delay; + int ret; + + agent = ibp->rvp.send_agent; + if (!agent) + goto retry; + + send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC, + IB_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) + goto retry; + + if (!ibp->smi_ah) { + struct ib_ah *ah; + + ah = qib_create_qp0_ah(ibp, be16_to_cpu(IB_LID_PERMISSIVE)); + if (IS_ERR(ah)) + ret = PTR_ERR(ah); + else { + send_buf->ah = ah; + ibp->smi_ah = ibah_to_rvtah(ah); + ret = 0; + } + } else { + send_buf->ah = &ibp->smi_ah->ibah; + ret = 0; + } + + smp = send_buf->mad; + smp->base_version = IB_MGMT_BASE_VERSION; + smp->mgmt_class = IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE; + smp->class_version = 1; + smp->method = IB_MGMT_METHOD_SEND; + smp->hop_cnt = 1; + smp->attr_id = QIB_VENDOR_IPG; + smp->attr_mod = 0; + + if (!ret) + ret = ib_post_send_mad(send_buf, NULL); + if (ret) + ib_free_send_mad(send_buf); +retry: + delay = 2 << ppd->cpspec->ipg_tries; + queue_delayed_work(ib_wq, &ppd->cpspec->ipg_work, + msecs_to_jiffies(delay)); +} + +/* + * Timeout handler for setting IPG. + * Only called if r1. + */ +static void ipg_7322_work(struct work_struct *work) +{ + struct qib_pportdata *ppd; + + ppd = container_of(work, struct qib_chippport_specific, + ipg_work.work)->ppd; + if ((ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | QIBL_LINKACTIVE)) + && ++ppd->cpspec->ipg_tries <= 10) + try_7322_ipg(ppd); +} + +static u32 qib_7322_iblink_state(u64 ibcs) +{ + u32 state = (u32)SYM_FIELD(ibcs, IBCStatusA_0, LinkState); + + switch (state) { + case IB_7322_L_STATE_INIT: + state = IB_PORT_INIT; + break; + case IB_7322_L_STATE_ARM: + state = IB_PORT_ARMED; + break; + case IB_7322_L_STATE_ACTIVE: + /* fall through */ + case IB_7322_L_STATE_ACT_DEFER: + state = IB_PORT_ACTIVE; + break; + default: /* fall through */ + case IB_7322_L_STATE_DOWN: + state = IB_PORT_DOWN; + break; + } + return state; +} + +/* returns the IBTA port state, rather than the IBC link training state */ +static u8 qib_7322_phys_portstate(u64 ibcs) +{ + u8 state = (u8)SYM_FIELD(ibcs, IBCStatusA_0, LinkTrainingState); + return qib_7322_physportstate[state]; +} + +static int qib_7322_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs) +{ + int ret = 0, symadj = 0; + unsigned long flags; + int mult; + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + /* Update our picture of width and speed from chip */ + if (ibcs & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) { + ppd->link_speed_active = QIB_IB_QDR; + mult = 4; + } else if (ibcs & SYM_MASK(IBCStatusA_0, LinkSpeedActive)) { + ppd->link_speed_active = QIB_IB_DDR; + mult = 2; + } else { + ppd->link_speed_active = QIB_IB_SDR; + mult = 1; + } + if (ibcs & SYM_MASK(IBCStatusA_0, LinkWidthActive)) { + ppd->link_width_active = IB_WIDTH_4X; + mult *= 4; + } else + ppd->link_width_active = IB_WIDTH_1X; + ppd->delay_mult = ib_rate_to_delay[mult_to_ib_rate(mult)]; + + if (!ibup) { + u64 clr; + + /* Link went down. 
*/ + /* do IPG MAD again after linkdown, even if last time failed */ + ppd->cpspec->ipg_tries = 0; + clr = qib_read_kreg_port(ppd, krp_ibcstatus_b) & + (SYM_MASK(IBCStatusB_0, heartbeat_timed_out) | + SYM_MASK(IBCStatusB_0, heartbeat_crosstalk)); + if (clr) + qib_write_kreg_port(ppd, krp_ibcstatus_b, clr); + if (!(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG))) + set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled); + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + struct qib_qsfp_data *qd = + &ppd->cpspec->qsfp_data; + /* unlock the Tx settings, speed may change */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + qib_cancel_sends(ppd); + /* on link down, ensure sane pcs state */ + qib_7322_mini_pcs_reset(ppd); + /* schedule the qsfp refresh which should turn the link + off */ + if (ppd->dd->flags & QIB_HAS_QSFP) { + qd->t_insert = jiffies; + queue_work(ib_wq, &qd->work); + } + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (__qib_sdma_running(ppd)) + __qib_sdma_process_event(ppd, + qib_sdma_event_e70_go_idle); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + clr = read_7322_creg32_port(ppd, crp_iblinkdown); + if (clr == ppd->cpspec->iblnkdownsnap) + ppd->cpspec->iblnkdowndelta++; + } else { + if (qib_compat_ddr_negotiate && + !(ppd->lflags & (QIBL_IB_AUTONEG_FAILED | + QIBL_IB_AUTONEG_INPROG)) && + ppd->link_speed_active == QIB_IB_SDR && + (ppd->link_speed_enabled & QIB_IB_DDR) + && ppd->cpspec->autoneg_tries < AUTONEG_TRIES) { + /* we are SDR, and auto-negotiation enabled */ + ++ppd->cpspec->autoneg_tries; + if (!ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymdelta += + read_7322_creg32_port(ppd, + crp_ibsymbolerr) - + ppd->cpspec->ibsymsnap; + ppd->cpspec->iblnkerrdelta += + read_7322_creg32_port(ppd, + crp_iblinkerrrecov) - + ppd->cpspec->iblnkerrsnap; + } + try_7322_autoneg(ppd); + ret = 1; /* no other IB status change processing */ + } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + ppd->link_speed_active == QIB_IB_SDR) { + qib_autoneg_7322_send(ppd, 1); + set_7322_ibspeed_fast(ppd, QIB_IB_DDR); + qib_7322_mini_pcs_reset(ppd); + udelay(2); + ret = 1; /* no other IB status change processing */ + } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) && + (ppd->link_speed_active & QIB_IB_DDR)) { + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~(QIBL_IB_AUTONEG_INPROG | + QIBL_IB_AUTONEG_FAILED); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + ppd->cpspec->autoneg_tries = 0; + /* re-enable SDR, for next link down */ + set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled); + wake_up(&ppd->cpspec->autoneg_wait); + symadj = 1; + } else if (ppd->lflags & QIBL_IB_AUTONEG_FAILED) { + /* + * Clear autoneg failure flag, and do setup + * so we'll try next time link goes down and + * back to INIT (possibly connected to a + * different device). 
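+ * The write to ibcctrl_b below re-enables IBA7322_IBC_IBTA_1_2_MASK,
+ * so that the next attempt starts out with standard IBTA 1.2
+ * negotiation enabled.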
+ */ + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + ppd->cpspec->ibcctrl_b |= IBA7322_IBC_IBTA_1_2_MASK; + symadj = 1; + } + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + symadj = 1; + if (ppd->dd->cspec->r1 && ppd->cpspec->ipg_tries <= 10) + try_7322_ipg(ppd); + if (!ppd->cpspec->recovery_init) + setup_7322_link_recovery(ppd, 0); + ppd->cpspec->qdr_dfe_time = jiffies + + msecs_to_jiffies(QDR_DFE_DISABLE_DELAY); + } + ppd->cpspec->ibmalfusesnap = 0; + ppd->cpspec->ibmalfsnap = read_7322_creg32_port(ppd, + crp_errlink); + } + if (symadj) { + ppd->cpspec->iblnkdownsnap = + read_7322_creg32_port(ppd, crp_iblinkdown); + if (ppd->cpspec->ibdeltainprog) { + ppd->cpspec->ibdeltainprog = 0; + ppd->cpspec->ibsymdelta += read_7322_creg32_port(ppd, + crp_ibsymbolerr) - ppd->cpspec->ibsymsnap; + ppd->cpspec->iblnkerrdelta += read_7322_creg32_port(ppd, + crp_iblinkerrrecov) - ppd->cpspec->iblnkerrsnap; + } + } else if (!ibup && qib_compat_ddr_negotiate && + !ppd->cpspec->ibdeltainprog && + !(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) { + ppd->cpspec->ibdeltainprog = 1; + ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd, + crp_ibsymbolerr); + ppd->cpspec->iblnkerrsnap = read_7322_creg32_port(ppd, + crp_iblinkerrrecov); + } + + if (!ret) + qib_setup_7322_setextled(ppd, ibup); + return ret; +} + +/* + * Does read/modify/write to appropriate registers to + * set output and direction bits selected by mask. + * these are in their canonical postions (e.g. lsb of + * dir will end up in D48 of extctrl on existing chips). + * returns contents of GP Inputs. + */ +static int gpio_7322_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask) +{ + u64 read_val, new_out; + unsigned long flags; + + if (mask) { + /* some bits being written, lock access to GPIO */ + dir &= mask; + out &= mask; + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe)); + dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe)); + new_out = (dd->cspec->gpio_out & ~mask) | out; + + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_out, new_out); + dd->cspec->gpio_out = new_out; + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); + } + /* + * It is unlikely that a read at this time would get valid + * data on a pin whose direction line was set in the same + * call to this function. We include the read here because + * that allows us to potentially combine a change on one pin with + * a read on another, and because the old code did something like + * this. + */ + read_val = qib_read_kreg64(dd, kr_extstatus); + return SYM_FIELD(read_val, EXTStatus, GPIOIn); +} + +/* Enable writes to config EEPROM, if possible. Returns previous state */ +static int qib_7322_eeprom_wen(struct qib_devdata *dd, int wen) +{ + int prev_wen; + u32 mask; + + mask = 1 << QIB_EEPROM_WEN_NUM; + prev_wen = ~gpio_7322_mod(dd, 0, 0, 0) >> QIB_EEPROM_WEN_NUM; + gpio_7322_mod(dd, wen ? 0 : mask, mask, mask); + + return prev_wen & 1; +} + +/* + * Read fundamental info we need to use the chip. These are + * the registers that describe chip capabilities, and are + * saved in shadow registers. 
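+ * As a worked example of one derived value: each 64-bit pioavail
+ * register holds 2 status bits per send buffer, i.e. 32 buffers per
+ * register, so pioavregs below comes out to piobufs / 32, rounded up.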
+ */ +static void get_7322_chip_params(struct qib_devdata *dd) +{ + u64 val; + u32 piobufs; + int mtu; + + dd->palign = qib_read_kreg32(dd, kr_pagealign); + + dd->uregbase = qib_read_kreg32(dd, kr_userregbase); + + dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt); + dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase); + dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase); + dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase); + dd->pio2k_bufbase = dd->piobufbase & 0xffffffff; + + val = qib_read_kreg64(dd, kr_sendpiobufcnt); + dd->piobcnt2k = val & ~0U; + dd->piobcnt4k = val >> 32; + val = qib_read_kreg64(dd, kr_sendpiosize); + dd->piosize2k = val & ~0U; + dd->piosize4k = val >> 32; + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + dd->pport[0].ibmtu = (u32)mtu; + dd->pport[1].ibmtu = (u32)mtu; + + /* these may be adjusted in init_chip_wc_pat() */ + dd->pio2kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + dd->pio2k_bufbase); + dd->pio4kbase = (u32 __iomem *) + ((char __iomem *) dd->kregbase + + (dd->piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->align4k = ALIGN(dd->piosize4k, dd->palign); + + piobufs = dd->piobcnt4k + dd->piobcnt2k + NUM_VL15_BUFS; + + dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / + (sizeof(u64) * BITS_PER_BYTE / 2); +} + +/* + * The chip base addresses in cspec and cpspec have to be set + * after possible init_chip_wc_pat(), rather than in + * get_7322_chip_params(), so split out as separate function + */ +static void qib_7322_set_baseaddrs(struct qib_devdata *dd) +{ + u32 cregbase; + + cregbase = qib_read_kreg32(dd, kr_counterregbase); + + dd->cspec->cregbase = (u64 __iomem *)(cregbase + + (char __iomem *)dd->kregbase); + + dd->egrtidbase = (u64 __iomem *) + ((char __iomem *) dd->kregbase + dd->rcvegrbase); + + /* port registers are defined as relative to base of chip */ + dd->pport[0].cpspec->kpregbase = + (u64 __iomem *)((char __iomem *)dd->kregbase); + dd->pport[1].cpspec->kpregbase = + (u64 __iomem *)(dd->palign + + (char __iomem *)dd->kregbase); + dd->pport[0].cpspec->cpregbase = + (u64 __iomem *)(qib_read_kreg_port(&dd->pport[0], + kr_counterregbase) + (char __iomem *)dd->kregbase); + dd->pport[1].cpspec->cpregbase = + (u64 __iomem *)(qib_read_kreg_port(&dd->pport[1], + kr_counterregbase) + (char __iomem *)dd->kregbase); +} + +/* + * This is a fairly special-purpose observer, so we only support + * the port-specific parts of SendCtrl + */ + +#define SENDCTRL_SHADOWED (SYM_MASK(SendCtrl_0, SendEnable) | \ + SYM_MASK(SendCtrl_0, SDmaEnable) | \ + SYM_MASK(SendCtrl_0, SDmaIntEnable) | \ + SYM_MASK(SendCtrl_0, SDmaSingleDescriptor) | \ + SYM_MASK(SendCtrl_0, SDmaHalt) | \ + SYM_MASK(SendCtrl_0, IBVLArbiterEn) | \ + SYM_MASK(SendCtrl_0, ForceCreditUpToDate)) + +static int sendctrl_hook(struct qib_devdata *dd, + const struct diag_observer *op, u32 offs, + u64 *data, u64 mask, int only_32) +{ + unsigned long flags; + unsigned idx; + unsigned pidx; + struct qib_pportdata *ppd = NULL; + u64 local_data, all_bits; + + /* + * The fixed correspondence between Physical ports and pports is + * severed. We need to hunt for the ppd that corresponds + * to the offset we got. And we have to do that without admitting + * we know the stride, apparently. 
+ */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + u64 __iomem *psptr; + u32 psoffs; + + ppd = dd->pport + pidx; + if (!ppd->cpspec->kpregbase) + continue; + + psptr = ppd->cpspec->kpregbase + krp_sendctrl; + psoffs = (u32) (psptr - dd->kregbase) * sizeof(*psptr); + if (psoffs == offs) + break; + } + + /* If pport is not being managed by driver, just avoid shadows. */ + if (pidx >= dd->num_pports) + ppd = NULL; + + /* In any case, "idx" is flat index in kreg space */ + idx = offs / sizeof(u64); + + all_bits = ~0ULL; + if (only_32) + all_bits >>= 32; + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (!ppd || (mask & all_bits) != all_bits) { + /* + * At least some mask bits are zero, so we need + * to read. The judgement call is whether from + * reg or shadow. First-cut: read reg, and complain + * if any bits which should be shadowed are different + * from their shadowed value. + */ + if (only_32) + local_data = (u64)qib_read_kreg32(dd, idx); + else + local_data = qib_read_kreg64(dd, idx); + *data = (local_data & ~mask) | (*data & mask); + } + if (mask) { + /* + * At least some mask bits are one, so we need + * to write, but only shadow some bits. + */ + u64 sval, tval; /* Shadowed, transient */ + + /* + * New shadow val is bits we don't want to touch, + * ORed with bits we do, that are intended for shadow. + */ + if (ppd) { + sval = ppd->p_sendctrl & ~mask; + sval |= *data & SENDCTRL_SHADOWED & mask; + ppd->p_sendctrl = sval; + } else + sval = *data & SENDCTRL_SHADOWED & mask; + tval = sval | (*data & ~SENDCTRL_SHADOWED & mask); + qib_write_kreg(dd, idx, tval); + qib_write_kreg(dd, kr_scratch, 0Ull); + } + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + return only_32 ? 4 : 8; +} + +static const struct diag_observer sendctrl_0_observer = { + sendctrl_hook, KREG_IDX(SendCtrl_0) * sizeof(u64), + KREG_IDX(SendCtrl_0) * sizeof(u64) +}; + +static const struct diag_observer sendctrl_1_observer = { + sendctrl_hook, KREG_IDX(SendCtrl_1) * sizeof(u64), + KREG_IDX(SendCtrl_1) * sizeof(u64) +}; + +static ushort sdma_fetch_prio = 8; +module_param_named(sdma_fetch_prio, sdma_fetch_prio, ushort, S_IRUGO); +MODULE_PARM_DESC(sdma_fetch_prio, "SDMA descriptor fetch priority"); + +/* Besides logging QSFP events, we set appropriate TxDDS values */ +static void init_txdds_table(struct qib_pportdata *ppd, int override); + +static void qsfp_7322_event(struct work_struct *work) +{ + struct qib_qsfp_data *qd; + struct qib_pportdata *ppd; + unsigned long pwrup; + unsigned long flags; + int ret; + u32 le2; + + qd = container_of(work, struct qib_qsfp_data, work); + ppd = qd->ppd; + pwrup = qd->t_insert + + msecs_to_jiffies(QSFP_PWR_LAG_MSEC - QSFP_MODPRS_LAG_MSEC); + + /* Delay for 20 msecs to allow ModPrs resistor to setup */ + mdelay(QSFP_MODPRS_LAG_MSEC); + + if (!qib_qsfp_mod_present(ppd)) { + ppd->cpspec->qsfp_data.modpresent = 0; + /* Set the physical link to disabled */ + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } else { + /* + * Some QSFP's not only do not respond until the full power-up + * time, but may behave badly if we try. So hold off responding + * to insertion. 
+ */ + while (1) { + if (time_is_before_jiffies(pwrup)) + break; + msleep(20); + } + + ret = qib_refresh_qsfp_cache(ppd, &qd->cache); + + /* + * Need to change LE2 back to defaults if we couldn't + * read the cable type (to handle cable swaps), so do this + * even on failure to read cable information. We don't + * get here for QME, so IS_QME check not needed here. + */ + if (!ret && !ppd->dd->cspec->r1) { + if (QSFP_IS_ACTIVE_FAR(qd->cache.tech)) + le2 = LE2_QME; + else if (qd->cache.atten[1] >= qib_long_atten && + QSFP_IS_CU(qd->cache.tech)) + le2 = LE2_5m; + else + le2 = LE2_DEFAULT; + } else + le2 = LE2_DEFAULT; + ibsd_wr_allchans(ppd, 13, (le2 << 7), BMASK(9, 7)); + /* + * We always change parameteters, since we can choose + * values for cables without eeproms, and the cable may have + * changed from a cable with full or partial eeprom content + * to one with partial or no content. + */ + init_txdds_table(ppd, 0); + /* The physical link is being re-enabled only when the + * previous state was DISABLED and the VALID bit is not + * set. This should only happen when the cable has been + * physically pulled. */ + if (!ppd->cpspec->qsfp_data.modpresent && + (ppd->lflags & (QIBL_LINKV | QIBL_IB_LINK_DISABLED))) { + ppd->cpspec->qsfp_data.modpresent = 1; + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_SLEEP); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + } +} + +/* + * There is little we can do but complain to the user if QSFP + * initialization fails. + */ +static void qib_init_7322_qsfp(struct qib_pportdata *ppd) +{ + unsigned long flags; + struct qib_qsfp_data *qd = &ppd->cpspec->qsfp_data; + struct qib_devdata *dd = ppd->dd; + u64 mod_prs_bit = QSFP_GPIO_MOD_PRS_N; + + mod_prs_bit <<= (QSFP_GPIO_PORT2_SHIFT * ppd->hw_pidx); + qd->ppd = ppd; + qib_qsfp_init(qd, qsfp_7322_event); + spin_lock_irqsave(&dd->cspec->gpio_lock, flags); + dd->cspec->extctrl |= (mod_prs_bit << SYM_LSB(EXTCtrl, GPIOInvert)); + dd->cspec->gpio_mask |= mod_prs_bit; + qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl); + qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask); + spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); +} + +/* + * called at device initialization time, and also if the txselect + * module parameter is changed. This is used for cables that don't + * have valid QSFP EEPROMs (not present, or attenuation is zero). + * We initialize to the default, then if there is a specific + * unit,port match, we use that (and set it immediately, for the + * current speed, if the link is at INIT or better). + * String format is "default# unit#,port#=# ... u,p=#", separators must + * be a SPACE character. A newline terminates. The u,p=# tuples may + * optionally have "u,p=#,#", where the final # is the H1 value + * The last specific match is used (actually, all are used, but last + * one is the one that winds up set); if none at all, fall back on default. 
+ */ +static void set_no_qsfp_atten(struct qib_devdata *dd, int change) +{ + char *nxt, *str; + u32 pidx, unit, port, deflt, h1; + unsigned long val; + int any = 0, seth1; + int txdds_size; + + str = txselect_list; + + /* default number is validated in setup_txselect() */ + deflt = simple_strtoul(str, &nxt, 0); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->pport[pidx].cpspec->no_eep = deflt; + + txdds_size = TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ; + if (IS_QME(dd) || IS_QMH(dd)) + txdds_size += TXDDS_MFG_SZ; + + while (*nxt && nxt[1]) { + str = ++nxt; + unit = simple_strtoul(str, &nxt, 0); + if (nxt == str || !*nxt || *nxt != ',') { + while (*nxt && *nxt++ != ' ') /* skip to next, if any */ + ; + continue; + } + str = ++nxt; + port = simple_strtoul(str, &nxt, 0); + if (nxt == str || *nxt != '=') { + while (*nxt && *nxt++ != ' ') /* skip to next, if any */ + ; + continue; + } + str = ++nxt; + val = simple_strtoul(str, &nxt, 0); + if (nxt == str) { + while (*nxt && *nxt++ != ' ') /* skip to next, if any */ + ; + continue; + } + if (val >= txdds_size) + continue; + seth1 = 0; + h1 = 0; /* gcc thinks it might be used uninitted */ + if (*nxt == ',' && nxt[1]) { + str = ++nxt; + h1 = (u32)simple_strtoul(str, &nxt, 0); + if (nxt == str) + while (*nxt && *nxt++ != ' ') /* skip */ + ; + else + seth1 = 1; + } + for (pidx = 0; dd->unit == unit && pidx < dd->num_pports; + ++pidx) { + struct qib_pportdata *ppd = &dd->pport[pidx]; + + if (ppd->port != port || !ppd->link_speed_supported) + continue; + ppd->cpspec->no_eep = val; + if (seth1) + ppd->cpspec->h1_val = h1; + /* now change the IBC and serdes, overriding generic */ + init_txdds_table(ppd, 1); + /* Re-enable the physical state machine on mezz boards + * now that the correct settings have been set. + * QSFP boards are handles by the QSFP event handler */ + if (IS_QMH(dd) || IS_QME(dd)) + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_SLEEP); + any++; + } + if (*nxt == '\n') + break; /* done */ + } + if (change && !any) { + /* no specific setting, use the default. + * Change the IBC and serdes, but since it's + * general, don't override specific settings. + */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) + if (dd->pport[pidx].link_speed_supported) + init_txdds_table(&dd->pport[pidx], 0); + } +} + +/* handle the txselect parameter changing */ +static int setup_txselect(const char *str, const struct kernel_param *kp) +{ + struct qib_devdata *dd; + unsigned long val; + char *n; + + if (strlen(str) >= ARRAY_SIZE(txselect_list)) { + pr_info("txselect_values string too long\n"); + return -ENOSPC; + } + val = simple_strtoul(str, &n, 0); + if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + TXDDS_MFG_SZ)) { + pr_info("txselect_values must start with a number < %d\n", + TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ); + return -EINVAL; + } + strncpy(txselect_list, str, ARRAY_SIZE(txselect_list) - 1); + + list_for_each_entry(dd, &qib_dev_list, list) + if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322) + set_no_qsfp_atten(dd, 1); + return 0; +} + +/* + * Write the final few registers that depend on some of the + * init setup. Done late in init, just before bringing up + * the serdes. 
+ */ +static int qib_late_7322_initreg(struct qib_devdata *dd) +{ + int ret = 0, n; + u64 val; + + qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize); + qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize); + qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt); + qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys); + val = qib_read_kreg64(dd, kr_sendpioavailaddr); + if (val != dd->pioavailregs_phys) { + qib_dev_err(dd, + "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n", + (unsigned long) dd->pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + } + + n = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + qib_7322_txchk_change(dd, 0, n, TXCHK_CHG_TYPE_KERN, NULL); + /* driver sends get pkey, lid, etc. checking also, to catch bugs */ + qib_7322_txchk_change(dd, 0, n, TXCHK_CHG_TYPE_ENAB1, NULL); + + qib_register_observer(dd, &sendctrl_0_observer); + qib_register_observer(dd, &sendctrl_1_observer); + + dd->control &= ~QLOGIC_IB_C_SDMAFETCHPRIOEN; + qib_write_kreg(dd, kr_control, dd->control); + /* + * Set SendDmaFetchPriority and init Tx params, including + * QSFP handler on boards that have QSFP. + * First set our default attenuation entry for cables that + * don't have valid attenuation. + */ + set_no_qsfp_atten(dd, 0); + for (n = 0; n < dd->num_pports; ++n) { + struct qib_pportdata *ppd = dd->pport + n; + + qib_write_kreg_port(ppd, krp_senddmaprioritythld, + sdma_fetch_prio & 0xf); + /* Initialize qsfp if present on board. */ + if (dd->flags & QIB_HAS_QSFP) + qib_init_7322_qsfp(ppd); + } + dd->control |= QLOGIC_IB_C_SDMAFETCHPRIOEN; + qib_write_kreg(dd, kr_control, dd->control); + + return ret; +} + +/* per IB port errors. */ +#define SENDCTRL_PIBP (MASK_ACROSS(0, 1) | MASK_ACROSS(3, 3) | \ + MASK_ACROSS(8, 15)) +#define RCVCTRL_PIBP (MASK_ACROSS(0, 17) | MASK_ACROSS(39, 41)) +#define ERRS_PIBP (MASK_ACROSS(57, 58) | MASK_ACROSS(54, 54) | \ + MASK_ACROSS(36, 49) | MASK_ACROSS(29, 34) | MASK_ACROSS(14, 17) | \ + MASK_ACROSS(0, 11)) + +/* + * Write the initialization per-port registers that need to be done at + * driver load and after reset completes (i.e., that aren't done as part + * of other init procedures called from qib_init.c). + * Some of these should be redundant on reset, but play safe. + */ +static void write_7322_init_portregs(struct qib_pportdata *ppd) +{ + u64 val; + int i; + + if (!ppd->link_speed_supported) { + /* no buffer credits for this port */ + for (i = 1; i < 8; i++) + qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, 0); + qib_write_kreg_port(ppd, krp_ibcctrl_b, 0); + qib_write_kreg(ppd->dd, kr_scratch, 0); + return; + } + + /* + * Set the number of supported virtual lanes in IBC, + * for flow control packet handling on unsupported VLs + */ + val = qib_read_kreg_port(ppd, krp_ibsdtestiftx); + val &= ~SYM_MASK(IB_SDTEST_IF_TX_0, VL_CAP); + val |= (u64)(ppd->vls_supported - 1) << + SYM_LSB(IB_SDTEST_IF_TX_0, VL_CAP); + qib_write_kreg_port(ppd, krp_ibsdtestiftx, val); + + qib_write_kreg_port(ppd, krp_rcvbthqp, QIB_KD_QP); + + /* enable tx header checking */ + qib_write_kreg_port(ppd, krp_sendcheckcontrol, IBA7322_SENDCHK_PKEY | + IBA7322_SENDCHK_BTHQP | IBA7322_SENDCHK_SLID | + IBA7322_SENDCHK_RAW_IPV6 | IBA7322_SENDCHK_MINSZ); + + qib_write_kreg_port(ppd, krp_ncmodectrl, + SYM_MASK(IBNCModeCtrl_0, ScrambleCapLocal)); + + /* + * Unconditionally clear the bufmask bits. If SDMA is + * enabled, we'll set them appropriately later. 
+ */ + qib_write_kreg_port(ppd, krp_senddmabufmask0, 0); + qib_write_kreg_port(ppd, krp_senddmabufmask1, 0); + qib_write_kreg_port(ppd, krp_senddmabufmask2, 0); + if (ppd->dd->cspec->r1) + ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, ForceCreditUpToDate); +} + +/* + * Write the initialization per-device registers that need to be done at + * driver load and after reset completes (i.e., that aren't done as part + * of other init procedures called from qib_init.c). Also write per-port + * registers that are affected by overall device config, such as QP mapping + * Some of these should be redundant on reset, but play safe. + */ +static void write_7322_initregs(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + int i, pidx; + u64 val; + + /* Set Multicast QPs received by port 2 to map to context one. */ + qib_write_kreg(dd, KREG_IDX(RcvQPMulticastContext_1), 1); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + unsigned n, regno; + unsigned long flags; + + if (dd->n_krcv_queues < 2 || + !dd->pport[pidx].link_speed_supported) + continue; + + ppd = &dd->pport[pidx]; + + /* be paranoid against later code motion, etc. */ + spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags); + ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvQPMapEnable); + spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags); + + /* Initialize QP to context mapping */ + regno = krp_rcvqpmaptable; + val = 0; + if (dd->num_pports > 1) + n = dd->first_user_ctxt / dd->num_pports; + else + n = dd->first_user_ctxt - 1; + for (i = 0; i < 32; ) { + unsigned ctxt; + + if (dd->num_pports > 1) + ctxt = (i % n) * dd->num_pports + pidx; + else if (i % n) + ctxt = (i % n) + 1; + else + ctxt = ppd->hw_pidx; + val |= ctxt << (5 * (i % 6)); + i++; + if (i % 6 == 0) { + qib_write_kreg_port(ppd, regno, val); + val = 0; + regno++; + } + } + qib_write_kreg_port(ppd, regno, val); + } + + /* + * Setup up interrupt mitigation for kernel contexts, but + * not user contexts (user contexts use interrupts when + * stalled waiting for any packet, so want those interrupts + * right away). + */ + for (i = 0; i < dd->first_user_ctxt; i++) { + dd->cspec->rcvavail_timeout[i] = rcv_int_timeout; + qib_write_kreg(dd, kr_rcvavailtimeout + i, rcv_int_timeout); + } + + /* + * Initialize as (disabled) rcvflow tables. Application code + * will setup each flow as it uses the flow. + * Doesn't clear any of the error bits that might be set. + */ + val = TIDFLOW_ERRBITS; /* these are W1C */ + for (i = 0; i < dd->cfgctxts; i++) { + int flow; + + for (flow = 0; flow < NUM_TIDFLOWS_CTXT; flow++) + qib_write_ureg(dd, ur_rcvflowtable+flow, val, i); + } + + /* + * dual cards init to dual port recovery, single port cards to + * the one port. 
Dual port cards may later adjust to 1 port, + * and then back to dual port if both ports are connected + * */ + if (dd->num_pports) + setup_7322_link_recovery(dd->pport, dd->num_pports > 1); +} + +static int qib_init_7322_variables(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + unsigned features, pidx, sbufcnt; + int ret, mtu; + u32 sbufs, updthresh; + resource_size_t vl15off; + + /* pport structs are contiguous, allocated after devdata */ + ppd = (struct qib_pportdata *)(dd + 1); + dd->pport = ppd; + ppd[0].dd = dd; + ppd[1].dd = dd; + + dd->cspec = (struct qib_chip_specific *)(ppd + 2); + + ppd[0].cpspec = (struct qib_chippport_specific *)(dd->cspec + 1); + ppd[1].cpspec = &ppd[0].cpspec[1]; + ppd[0].cpspec->ppd = &ppd[0]; /* for autoneg_7322_work() */ + ppd[1].cpspec->ppd = &ppd[1]; /* for autoneg_7322_work() */ + + spin_lock_init(&dd->cspec->rcvmod_lock); + spin_lock_init(&dd->cspec->gpio_lock); + + /* we haven't yet set QIB_PRESENT, so use read directly */ + dd->revision = readq(&dd->kregbase[kr_revision]); + + if ((dd->revision & 0xffffffffU) == 0xffffffffU) { + qib_dev_err(dd, + "Revision register read failure, giving up initialization\n"); + ret = -ENODEV; + goto bail; + } + dd->flags |= QIB_PRESENT; /* now register routines work */ + + dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, ChipRevMajor); + dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, ChipRevMinor); + dd->cspec->r1 = dd->minrev == 1; + + get_7322_chip_params(dd); + features = qib_7322_boardname(dd); + + /* now that piobcnt2k and 4k set, we can allocate these */ + sbufcnt = dd->piobcnt2k + dd->piobcnt4k + + NUM_VL15_BUFS + BITS_PER_LONG - 1; + sbufcnt /= BITS_PER_LONG; + dd->cspec->sendchkenable = kmalloc(sbufcnt * + sizeof(*dd->cspec->sendchkenable), GFP_KERNEL); + dd->cspec->sendgrhchk = kmalloc(sbufcnt * + sizeof(*dd->cspec->sendgrhchk), GFP_KERNEL); + dd->cspec->sendibchk = kmalloc(sbufcnt * + sizeof(*dd->cspec->sendibchk), GFP_KERNEL); + if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk || + !dd->cspec->sendibchk) { + ret = -ENOMEM; + goto bail; + } + + ppd = dd->pport; + + /* + * GPIO bits for TWSI data and clock, + * used for serial EEPROM. + */ + dd->gpio_sda_num = _QIB_GPIO_SDA_NUM; + dd->gpio_scl_num = _QIB_GPIO_SCL_NUM; + dd->twsi_eeprom_dev = QIB_TWSI_EEPROM_DEV; + + dd->flags |= QIB_HAS_INTX | QIB_HAS_LINK_LATENCY | + QIB_NODMA_RTAIL | QIB_HAS_VLSUPP | QIB_HAS_HDRSUPP | + QIB_HAS_THRESH_UPDATE | + (sdma_idle_cnt ? QIB_HAS_SDMA_TIMEOUT : 0); + dd->flags |= qib_special_trigger ? + QIB_USE_SPCL_TRIG : QIB_HAS_SEND_DMA; + + /* + * Setup initial values. These may change when PAT is enabled, but + * we need these to do initial chip register accesses. 
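+ * (qib_7322_set_baseaddrs() is called again further down, after
+ * init_chip_wc_pat(), to pick up the final mappings.)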
+ */ + qib_7322_set_baseaddrs(dd); + + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) + mtu = QIB_DEFAULT_MTU; + + dd->cspec->int_enable_mask = QIB_I_BITSEXTANT; + /* all hwerrors become interrupts, unless special purposed */ + dd->cspec->hwerrmask = ~0ULL; + /* link_recovery setup causes these errors, so ignore them, + * other than clearing them when they occur */ + dd->cspec->hwerrmask &= + ~(SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_0) | + SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_1) | + HWE_MASK(LATriggered)); + + for (pidx = 0; pidx < NUM_IB_PORTS; ++pidx) { + struct qib_chippport_specific *cp = ppd->cpspec; + + ppd->link_speed_supported = features & PORT_SPD_CAP; + features >>= PORT_SPD_CAP_SHIFT; + if (!ppd->link_speed_supported) { + /* single port mode (7340, or configured) */ + dd->skip_kctxt_mask |= 1 << pidx; + if (pidx == 0) { + /* Make sure port is disabled. */ + qib_write_kreg_port(ppd, krp_rcvctrl, 0); + qib_write_kreg_port(ppd, krp_ibcctrl_a, 0); + ppd[0] = ppd[1]; + dd->cspec->hwerrmask &= ~(SYM_MASK(HwErrMask, + IBSerdesPClkNotDetectMask_0) + | SYM_MASK(HwErrMask, + SDmaMemReadErrMask_0)); + dd->cspec->int_enable_mask &= ~( + SYM_MASK(IntMask, SDmaCleanupDoneMask_0) | + SYM_MASK(IntMask, SDmaIdleIntMask_0) | + SYM_MASK(IntMask, SDmaProgressIntMask_0) | + SYM_MASK(IntMask, SDmaIntMask_0) | + SYM_MASK(IntMask, ErrIntMask_0) | + SYM_MASK(IntMask, SendDoneIntMask_0)); + } else { + /* Make sure port is disabled. */ + qib_write_kreg_port(ppd, krp_rcvctrl, 0); + qib_write_kreg_port(ppd, krp_ibcctrl_a, 0); + dd->cspec->hwerrmask &= ~(SYM_MASK(HwErrMask, + IBSerdesPClkNotDetectMask_1) + | SYM_MASK(HwErrMask, + SDmaMemReadErrMask_1)); + dd->cspec->int_enable_mask &= ~( + SYM_MASK(IntMask, SDmaCleanupDoneMask_1) | + SYM_MASK(IntMask, SDmaIdleIntMask_1) | + SYM_MASK(IntMask, SDmaProgressIntMask_1) | + SYM_MASK(IntMask, SDmaIntMask_1) | + SYM_MASK(IntMask, ErrIntMask_1) | + SYM_MASK(IntMask, SendDoneIntMask_1)); + } + continue; + } + + dd->num_pports++; + ret = qib_init_pportdata(ppd, dd, pidx, dd->num_pports); + if (ret) { + dd->num_pports--; + goto bail; + } + + ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + ppd->link_width_enabled = IB_WIDTH_4X; + ppd->link_speed_enabled = ppd->link_speed_supported; + /* + * Set the initial values to reasonable default, will be set + * for real when link is up. + */ + ppd->link_width_active = IB_WIDTH_4X; + ppd->link_speed_active = QIB_IB_SDR; + ppd->delay_mult = ib_rate_to_delay[IB_RATE_10_GBPS]; + switch (qib_num_cfg_vls) { + case 1: + ppd->vls_supported = IB_VL_VL0; + break; + case 2: + ppd->vls_supported = IB_VL_VL0_1; + break; + default: + qib_devinfo(dd->pcidev, + "Invalid num_vls %u, using 4 VLs\n", + qib_num_cfg_vls); + qib_num_cfg_vls = 4; + /* fall through */ + case 4: + ppd->vls_supported = IB_VL_VL0_3; + break; + case 8: + if (mtu <= 2048) + ppd->vls_supported = IB_VL_VL0_7; + else { + qib_devinfo(dd->pcidev, + "Invalid num_vls %u for MTU %d , using 4 VLs\n", + qib_num_cfg_vls, mtu); + ppd->vls_supported = IB_VL_VL0_3; + qib_num_cfg_vls = 4; + } + break; + } + ppd->vls_operational = ppd->vls_supported; + + init_waitqueue_head(&cp->autoneg_wait); + INIT_DELAYED_WORK(&cp->autoneg_work, + autoneg_7322_work); + if (ppd->dd->cspec->r1) + INIT_DELAYED_WORK(&cp->ipg_work, ipg_7322_work); + + /* + * For Mez and similar cards, no qsfp info, so do + * the "cable info" setup here. Can be overridden + * in adapter-specific routines. 
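+ * For example, QMH boards get H1_FORCE_QMH and TxDDS entry
+ * TXDDS_TABLE_SZ + 2 below, while QME boards get H1_FORCE_QME and
+ * TXDDS_TABLE_SZ + 4, i.e. indices past the main TxDDS table.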
+ */ + if (!(dd->flags & QIB_HAS_QSFP)) { + if (!IS_QMH(dd) && !IS_QME(dd)) + qib_devinfo(dd->pcidev, + "IB%u:%u: Unknown mezzanine card type\n", + dd->unit, ppd->port); + cp->h1_val = IS_QMH(dd) ? H1_FORCE_QMH : H1_FORCE_QME; + /* + * Choose center value as default tx serdes setting + * until changed through module parameter. + */ + ppd->cpspec->no_eep = IS_QMH(dd) ? + TXDDS_TABLE_SZ + 2 : TXDDS_TABLE_SZ + 4; + } else + cp->h1_val = H1_FORCE_VAL; + + /* Avoid writes to chip for mini_init */ + if (!qib_mini_init) + write_7322_init_portregs(ppd); + + timer_setup(&cp->chase_timer, reenable_chase, 0); + + ppd++; + } + + dd->rcvhdrentsize = qib_rcvhdrentsize ? + qib_rcvhdrentsize : QIB_RCVHDR_ENTSIZE; + dd->rcvhdrsize = qib_rcvhdrsize ? + qib_rcvhdrsize : QIB_DFLT_RCVHDRSIZE; + dd->rhf_offset = dd->rcvhdrentsize - sizeof(u64) / sizeof(u32); + + /* we always allocate at least 2048 bytes for eager buffers */ + dd->rcvegrbufsize = max(mtu, 2048); + BUG_ON(!is_power_of_2(dd->rcvegrbufsize)); + dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize); + + qib_7322_tidtemplate(dd); + + /* + * We can request a receive interrupt for 1 or + * more packets from current offset. + */ + dd->rhdrhead_intr_off = + (u64) rcv_int_count << IBA7322_HDRHEAD_PKTINT_SHIFT; + + /* setup the stats timer; the add_timer is done at end of init */ + timer_setup(&dd->stats_timer, qib_get_7322_faststats, 0); + + dd->ureg_align = 0x10000; /* 64KB alignment */ + + dd->piosize2kmax_dwords = dd->piosize2k >> 2; + + qib_7322_config_ctxts(dd); + qib_set_ctxtcnt(dd); + + /* + * We do not set WC on the VL15 buffers to avoid + * a rare problem with unaligned writes from + * interrupt-flushed store buffers, so we need + * to map those separately here. We can't solve + * this for the rarely used mtrr case. + */ + ret = init_chip_wc_pat(dd, 0); + if (ret) + goto bail; + + /* vl15 buffers start just after the 4k buffers */ + vl15off = dd->physaddr + (dd->piobufbase >> 32) + + dd->piobcnt4k * dd->align4k; + dd->piovl15base = ioremap_nocache(vl15off, + NUM_VL15_BUFS * dd->align4k); + if (!dd->piovl15base) { + ret = -ENOMEM; + goto bail; + } + + qib_7322_set_baseaddrs(dd); /* set chip access pointers now */ + + ret = 0; + if (qib_mini_init) + goto bail; + if (!dd->num_pports) { + qib_dev_err(dd, "No ports enabled, giving up initialization\n"); + goto bail; /* no error, so can still figure out why err */ + } + + write_7322_initregs(dd); + ret = qib_create_ctxts(dd); + init_7322_cntrnames(dd); + + updthresh = 8U; /* update threshold */ + + /* use all of 4KB buffers for the kernel SDMA, zero if !SDMA. + * reserve the update threshold amount for other kernel use, such + * as sending SMI, MAD, and ACKs, or 3, whichever is greater, + * unless we aren't enabling SDMA, in which case we want to use + * all the 4k bufs for the kernel. + * if this was less than the update threshold, we could wait + * a long time for an update. Coded this way because we + * sometimes change the update threshold for various reasons, + * and we want this to remain robust. + */ + if (dd->flags & QIB_HAS_SEND_DMA) { + dd->cspec->sdmabufcnt = dd->piobcnt4k; + sbufs = updthresh > 3 ? updthresh : 3; + } else { + dd->cspec->sdmabufcnt = 0; + sbufs = dd->piobcnt4k; + } + dd->cspec->lastbuf_for_pio = dd->piobcnt2k + dd->piobcnt4k - + dd->cspec->sdmabufcnt; + dd->lastctxt_piobuf = dd->cspec->lastbuf_for_pio - sbufs; + dd->cspec->lastbuf_for_pio--; /* range is <= , not < */ + dd->last_pio = dd->cspec->lastbuf_for_pio; + dd->pbufsctxt = (dd->cfgctxts > dd->first_user_ctxt) ? 
+ dd->lastctxt_piobuf / (dd->cfgctxts - dd->first_user_ctxt) : 0; + + /* + * If we have 16 user contexts, we will have 7 sbufs + * per context, so reduce the update threshold to match. We + * want to update before we actually run out, at low pbufs/ctxt + * so give ourselves some margin. + */ + if (dd->pbufsctxt >= 2 && dd->pbufsctxt - 2 < updthresh) + updthresh = dd->pbufsctxt - 2; + dd->cspec->updthresh_dflt = updthresh; + dd->cspec->updthresh = updthresh; + + /* before full enable, no interrupts, no locking needed */ + dd->sendctrl |= ((updthresh & SYM_RMASK(SendCtrl, AvailUpdThld)) + << SYM_LSB(SendCtrl, AvailUpdThld)) | + SYM_MASK(SendCtrl, SendBufAvailPad64Byte); + + dd->psxmitwait_supported = 1; + dd->psxmitwait_check_rate = QIB_7322_PSXMITWAIT_CHECK_RATE; +bail: + if (!dd->ctxtcnt) + dd->ctxtcnt = 1; /* for other initialization code */ + + return ret; +} + +static u32 __iomem *qib_7322_getsendbuf(struct qib_pportdata *ppd, u64 pbc, + u32 *pbufnum) +{ + u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK; + struct qib_devdata *dd = ppd->dd; + + /* last is same for 2k and 4k, because we use 4k if all 2k busy */ + if (pbc & PBC_7322_VL15_SEND) { + first = dd->piobcnt2k + dd->piobcnt4k + ppd->hw_pidx; + last = first; + } else { + if ((plen + 1) > dd->piosize2kmax_dwords) + first = dd->piobcnt2k; + else + first = 0; + last = dd->cspec->lastbuf_for_pio; + } + return qib_getsendbuf_range(dd, pbufnum, first, last); +} + +static void qib_set_cntr_7322_sample(struct qib_pportdata *ppd, u32 intv, + u32 start) +{ + qib_write_kreg_port(ppd, krp_psinterval, intv); + qib_write_kreg_port(ppd, krp_psstart, start); +} + +/* + * Must be called with sdma_lock held, or before init finished. + */ +static void qib_sdma_set_7322_desc_cnt(struct qib_pportdata *ppd, unsigned cnt) +{ + qib_write_kreg_port(ppd, krp_senddmadesccnt, cnt); +} + +/* + * sdma_lock should be acquired before calling this routine + */ +static void dump_sdma_7322_state(struct qib_pportdata *ppd) +{ + u64 reg, reg1, reg2; + + reg = qib_read_kreg_port(ppd, krp_senddmastatus); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmastatus: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_sendctrl); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sendctrl: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmabase); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmabase: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmabufmask0); + reg1 = qib_read_kreg_port(ppd, krp_senddmabufmask1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabufmask2); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmabufmask 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + + /* get bufuse bits, clear them, and print them again if non-zero */ + reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg); + reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2); + qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg2); + /* 0 and 1 should always be zero, so print as short form */ + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA current senddmabuf_use 0:%llx 1:%llx 2:%llx\n", + reg, reg1, reg2); + reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0); + reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1); + reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2); + /* 0 and 1 should always be zero, so print as short form */ + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA cleared senddmabuf_use 0:%llx 
1:%llx 2:%llx\n", + reg, reg1, reg2); + + reg = qib_read_kreg_port(ppd, krp_senddmatail); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmatail: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmahead); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmahead: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaheadaddr); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmaheadaddr: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmalengen); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmalengen: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmadesccnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmadesccnt: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaidlecnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmaidlecnt: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmaprioritythld); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmapriorityhld: 0x%016llx\n", reg); + + reg = qib_read_kreg_port(ppd, krp_senddmareloadcnt); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA senddmareloadcnt: 0x%016llx\n", reg); + + dump_sdma_state(ppd); +} + +static struct sdma_set_state_action sdma_7322_action_table[] = { + [qib_sdma_state_s00_hw_down] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_drain = 0, + }, + [qib_sdma_state_s10_hw_start_up_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s20_idle] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s30_sw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s40_hw_clean_up_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 0, + }, + [qib_sdma_state_s50_hw_halt_wait] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 1, + .op_drain = 1, + }, + [qib_sdma_state_s99_running] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 0, + .op_drain = 0, + .go_s99_running_totrue = 1, + }, +}; + +static void qib_7322_sdma_init_early(struct qib_pportdata *ppd) +{ + ppd->sdma_state.set_state_action = sdma_7322_action_table; +} + +static int init_sdma_7322_regs(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + unsigned lastbuf, erstbuf; + u64 senddmabufmask[3] = { 0 }; + int n, ret = 0; + + qib_write_kreg_port(ppd, krp_senddmabase, ppd->sdma_descq_phys); + qib_sdma_7322_setlengen(ppd); + qib_sdma_update_7322_tail(ppd, 0); /* Set SendDmaTail */ + qib_write_kreg_port(ppd, krp_senddmareloadcnt, sdma_idle_cnt); + qib_write_kreg_port(ppd, krp_senddmadesccnt, 0); + qib_write_kreg_port(ppd, krp_senddmaheadaddr, ppd->sdma_head_phys); + + if (dd->num_pports) + n = dd->cspec->sdmabufcnt / dd->num_pports; /* no remainder */ + else + n = dd->cspec->sdmabufcnt; /* failsafe for init */ + erstbuf = (dd->piobcnt2k + dd->piobcnt4k) - + ((dd->num_pports == 1 || ppd->port == 2) ? 
n : + dd->cspec->sdmabufcnt); + lastbuf = erstbuf + n; + + ppd->sdma_state.first_sendbuf = erstbuf; + ppd->sdma_state.last_sendbuf = lastbuf; + for (; erstbuf < lastbuf; ++erstbuf) { + unsigned word = erstbuf / BITS_PER_LONG; + unsigned bit = erstbuf & (BITS_PER_LONG - 1); + + BUG_ON(word >= 3); + senddmabufmask[word] |= 1ULL << bit; + } + qib_write_kreg_port(ppd, krp_senddmabufmask0, senddmabufmask[0]); + qib_write_kreg_port(ppd, krp_senddmabufmask1, senddmabufmask[1]); + qib_write_kreg_port(ppd, krp_senddmabufmask2, senddmabufmask[2]); + return ret; +} + +/* sdma_lock must be held */ +static u16 qib_sdma_7322_gethead(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + int sane; + int use_dmahead; + u16 swhead; + u16 swtail; + u16 cnt; + u16 hwhead; + + use_dmahead = __qib_sdma_running(ppd) && + (dd->flags & QIB_HAS_SDMA_TIMEOUT); +retry: + hwhead = use_dmahead ? + (u16) le64_to_cpu(*ppd->sdma_head_dma) : + (u16) qib_read_kreg_port(ppd, krp_senddmahead); + + swhead = ppd->sdma_descq_head; + swtail = ppd->sdma_descq_tail; + cnt = ppd->sdma_descq_cnt; + + if (swhead < swtail) + /* not wrapped */ + sane = (hwhead >= swhead) & (hwhead <= swtail); + else if (swhead > swtail) + /* wrapped around */ + sane = ((hwhead >= swhead) && (hwhead < cnt)) || + (hwhead <= swtail); + else + /* empty */ + sane = (hwhead == swhead); + + if (unlikely(!sane)) { + if (use_dmahead) { + /* try one more time, directly from the register */ + use_dmahead = 0; + goto retry; + } + /* proceed as if no progress */ + hwhead = swhead; + } + + return hwhead; +} + +static int qib_sdma_7322_busy(struct qib_pportdata *ppd) +{ + u64 hwstatus = qib_read_kreg_port(ppd, krp_senddmastatus); + + return (hwstatus & SYM_MASK(SendDmaStatus_0, ScoreBoardDrainInProg)) || + (hwstatus & SYM_MASK(SendDmaStatus_0, HaltInProg)) || + !(hwstatus & SYM_MASK(SendDmaStatus_0, InternalSDmaHalt)) || + !(hwstatus & SYM_MASK(SendDmaStatus_0, ScbEmpty)); +} + +/* + * Compute the amount of delay before sending the next packet if the + * port's send rate differs from the static rate set for the QP. + * The delay affects the next packet and the amount of the delay is + * based on the length of the this packet. + */ +static u32 qib_7322_setpbc_control(struct qib_pportdata *ppd, u32 plen, + u8 srate, u8 vl) +{ + u8 snd_mult = ppd->delay_mult; + u8 rcv_mult = ib_rate_to_delay[srate]; + u32 ret; + + ret = rcv_mult > snd_mult ? ((plen + 1) >> 1) * snd_mult : 0; + + /* Indicate VL15, else set the VL in the control word */ + if (vl == 15) + ret |= PBC_7322_VL15_SEND_CTRL; + else + ret |= vl << PBC_VL_NUM_LSB; + ret |= ((u32)(ppd->hw_pidx)) << PBC_PORT_SEL_LSB; + + return ret; +} + +/* + * Enable the per-port VL15 send buffers for use. + * They follow the rest of the buffers, without a config parameter. + * This was in initregs, but that is done before the shadow + * is set up, and this has to be done after the shadow is + * set up. + */ +static void qib_7322_initvl15_bufs(struct qib_devdata *dd) +{ + unsigned vl15bufs; + + vl15bufs = dd->piobcnt2k + dd->piobcnt4k; + qib_chg_pioavailkernel(dd, vl15bufs, NUM_VL15_BUFS, + TXCHK_CHG_TYPE_KERN, NULL); +} + +static void qib_7322_init_ctxt(struct qib_ctxtdata *rcd) +{ + if (rcd->ctxt < NUM_IB_PORTS) { + if (rcd->dd->num_pports > 1) { + rcd->rcvegrcnt = KCTXT0_EGRCNT / 2; + rcd->rcvegr_tid_base = rcd->ctxt ? 
rcd->rcvegrcnt : 0; + } else { + rcd->rcvegrcnt = KCTXT0_EGRCNT; + rcd->rcvegr_tid_base = 0; + } + } else { + rcd->rcvegrcnt = rcd->dd->cspec->rcvegrcnt; + rcd->rcvegr_tid_base = KCTXT0_EGRCNT + + (rcd->ctxt - NUM_IB_PORTS) * rcd->rcvegrcnt; + } +} + +#define QTXSLEEPS 5000 +static void qib_7322_txchk_change(struct qib_devdata *dd, u32 start, + u32 len, u32 which, struct qib_ctxtdata *rcd) +{ + int i; + const int last = start + len - 1; + const int lastr = last / BITS_PER_LONG; + u32 sleeps = 0; + int wait = rcd != NULL; + unsigned long flags; + + while (wait) { + unsigned long shadow = 0; + int cstart, previ = -1; + + /* + * when flipping from kernel to user, we can't change + * the checking type if the buffer is allocated to the + * driver. It's OK the other direction, because it's + * from close, and we have just disarm'ed all the + * buffers. All the kernel to kernel changes are also + * OK. + */ + for (cstart = start; cstart <= last; cstart++) { + i = ((2 * cstart) + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT) + / BITS_PER_LONG; + if (i != previ) { + shadow = (unsigned long) + le64_to_cpu(dd->pioavailregs_dma[i]); + previ = i; + } + if (test_bit(((2 * cstart) + + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT) + % BITS_PER_LONG, &shadow)) + break; + } + + if (cstart > last) + break; + + if (sleeps == QTXSLEEPS) + break; + /* make sure we see an updated copy next time around */ + sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + sleeps++; + msleep(20); + } + + switch (which) { + case TXCHK_CHG_TYPE_DIS1: + /* + * disable checking on a range; used by diags; just + * one buffer, but still written generically + */ + for (i = start; i <= last; i++) + clear_bit(i, dd->cspec->sendchkenable); + break; + + case TXCHK_CHG_TYPE_ENAB1: + /* + * (re)enable checking on a range; used by diags; just + * one buffer, but still written generically; read + * scratch to be sure buffer actually triggered, not + * just flushed from processor. 
+ */ + qib_read_kreg32(dd, kr_scratch); + for (i = start; i <= last; i++) + set_bit(i, dd->cspec->sendchkenable); + break; + + case TXCHK_CHG_TYPE_KERN: + /* usable by kernel */ + for (i = start; i <= last; i++) { + set_bit(i, dd->cspec->sendibchk); + clear_bit(i, dd->cspec->sendgrhchk); + } + spin_lock_irqsave(&dd->uctxt_lock, flags); + /* see if we need to raise avail update threshold */ + for (i = dd->first_user_ctxt; + dd->cspec->updthresh != dd->cspec->updthresh_dflt + && i < dd->cfgctxts; i++) + if (dd->rcd[i] && dd->rcd[i]->subctxt_cnt && + ((dd->rcd[i]->piocnt / dd->rcd[i]->subctxt_cnt) - 1) + < dd->cspec->updthresh_dflt) + break; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + if (i == dd->cfgctxts) { + spin_lock_irqsave(&dd->sendctrl_lock, flags); + dd->cspec->updthresh = dd->cspec->updthresh_dflt; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= (dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) << + SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } + break; + + case TXCHK_CHG_TYPE_USER: + /* for user process */ + for (i = start; i <= last; i++) { + clear_bit(i, dd->cspec->sendibchk); + set_bit(i, dd->cspec->sendgrhchk); + } + spin_lock_irqsave(&dd->sendctrl_lock, flags); + if (rcd && rcd->subctxt_cnt && ((rcd->piocnt + / rcd->subctxt_cnt) - 1) < dd->cspec->updthresh) { + dd->cspec->updthresh = (rcd->piocnt / + rcd->subctxt_cnt) - 1; + dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld); + dd->sendctrl |= (dd->cspec->updthresh & + SYM_RMASK(SendCtrl, AvailUpdThld)) + << SYM_LSB(SendCtrl, AvailUpdThld); + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + } else + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); + break; + + default: + break; + } + + for (i = start / BITS_PER_LONG; which >= 2 && i <= lastr; ++i) + qib_write_kreg(dd, kr_sendcheckmask + i, + dd->cspec->sendchkenable[i]); + + for (i = start / BITS_PER_LONG; which < 2 && i <= lastr; ++i) { + qib_write_kreg(dd, kr_sendgrhcheckmask + i, + dd->cspec->sendgrhchk[i]); + qib_write_kreg(dd, kr_sendibpktmask + i, + dd->cspec->sendibchk[i]); + } + + /* + * Be sure whatever we did was seen by the chip and acted upon, + * before we return. Mostly important for which >= 2. + */ + qib_read_kreg32(dd, kr_scratch); +} + + +/* useful for trigger analyzers, etc. */ +static void writescratch(struct qib_devdata *dd, u32 val) +{ + qib_write_kreg(dd, kr_scratch, val); +} + +/* Dummy for now, use chip regs soon */ +static int qib_7322_tempsense_rd(struct qib_devdata *dd, int regnum) +{ + return -ENXIO; +} + +/** + * qib_init_iba7322_funcs - set up the chip-specific function pointers + * @dev: the pci_dev for qlogic_ib device + * @ent: pci_device_id struct for this dev + * + * Also allocates, inits, and returns the devdata struct for this + * device instance + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. 
+ */ +struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct qib_devdata *dd; + int ret, i; + u32 tabsize, actual_cnt = 0; + + dd = qib_alloc_devdata(pdev, + NUM_IB_PORTS * sizeof(struct qib_pportdata) + + sizeof(struct qib_chip_specific) + + NUM_IB_PORTS * sizeof(struct qib_chippport_specific)); + if (IS_ERR(dd)) + goto bail; + + dd->f_bringup_serdes = qib_7322_bringup_serdes; + dd->f_cleanup = qib_setup_7322_cleanup; + dd->f_clear_tids = qib_7322_clear_tids; + dd->f_free_irq = qib_7322_free_irq; + dd->f_get_base_info = qib_7322_get_base_info; + dd->f_get_msgheader = qib_7322_get_msgheader; + dd->f_getsendbuf = qib_7322_getsendbuf; + dd->f_gpio_mod = gpio_7322_mod; + dd->f_eeprom_wen = qib_7322_eeprom_wen; + dd->f_hdrqempty = qib_7322_hdrqempty; + dd->f_ib_updown = qib_7322_ib_updown; + dd->f_init_ctxt = qib_7322_init_ctxt; + dd->f_initvl15_bufs = qib_7322_initvl15_bufs; + dd->f_intr_fallback = qib_7322_intr_fallback; + dd->f_late_initreg = qib_late_7322_initreg; + dd->f_setpbc_control = qib_7322_setpbc_control; + dd->f_portcntr = qib_portcntr_7322; + dd->f_put_tid = qib_7322_put_tid; + dd->f_quiet_serdes = qib_7322_mini_quiet_serdes; + dd->f_rcvctrl = rcvctrl_7322_mod; + dd->f_read_cntrs = qib_read_7322cntrs; + dd->f_read_portcntrs = qib_read_7322portcntrs; + dd->f_reset = qib_do_7322_reset; + dd->f_init_sdma_regs = init_sdma_7322_regs; + dd->f_sdma_busy = qib_sdma_7322_busy; + dd->f_sdma_gethead = qib_sdma_7322_gethead; + dd->f_sdma_sendctrl = qib_7322_sdma_sendctrl; + dd->f_sdma_set_desc_cnt = qib_sdma_set_7322_desc_cnt; + dd->f_sdma_update_tail = qib_sdma_update_7322_tail; + dd->f_sendctrl = sendctrl_7322_mod; + dd->f_set_armlaunch = qib_set_7322_armlaunch; + dd->f_set_cntr_sample = qib_set_cntr_7322_sample; + dd->f_iblink_state = qib_7322_iblink_state; + dd->f_ibphys_portstate = qib_7322_phys_portstate; + dd->f_get_ib_cfg = qib_7322_get_ib_cfg; + dd->f_set_ib_cfg = qib_7322_set_ib_cfg; + dd->f_set_ib_loopback = qib_7322_set_loopback; + dd->f_get_ib_table = qib_7322_get_ib_table; + dd->f_set_ib_table = qib_7322_set_ib_table; + dd->f_set_intr_state = qib_7322_set_intr_state; + dd->f_setextled = qib_setup_7322_setextled; + dd->f_txchk_change = qib_7322_txchk_change; + dd->f_update_usrhead = qib_update_7322_usrhead; + dd->f_wantpiobuf_intr = qib_wantpiobuf_7322_intr; + dd->f_xgxs_reset = qib_7322_mini_pcs_reset; + dd->f_sdma_hw_clean_up = qib_7322_sdma_hw_clean_up; + dd->f_sdma_hw_start_up = qib_7322_sdma_hw_start_up; + dd->f_sdma_init_early = qib_7322_sdma_init_early; + dd->f_writescratch = writescratch; + dd->f_tempsense_rd = qib_7322_tempsense_rd; +#ifdef CONFIG_INFINIBAND_QIB_DCA + dd->f_notify_dca = qib_7322_notify_dca; +#endif + /* + * Do remaining PCIe setup and save PCIe values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped, but chip registers + * are not set up until start of qib_init_7322_variables. + */ + ret = qib_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* initialize chip-specific variables */ + ret = qib_init_7322_variables(dd); + if (ret) + goto bail_cleanup; + + if (qib_mini_init || !dd->num_pports) + goto bail; + + /* + * Determine number of vectors we want; depends on port count + * and number of configured kernel receive queues actually used. + * Should also depend on whether sdma is enabled or not, but + * that's such a rare testing case it's not worth worrying about. 
+ */ + tabsize = dd->first_user_ctxt + ARRAY_SIZE(irq_table); + for (i = 0; i < tabsize; i++) + if ((i < ARRAY_SIZE(irq_table) && + irq_table[i].port <= dd->num_pports) || + (i >= ARRAY_SIZE(irq_table) && + dd->rcd[i - ARRAY_SIZE(irq_table)])) + actual_cnt++; + /* reduce by ctxt's < 2 */ + if (qib_krcvq01_no_msi) + actual_cnt -= dd->num_pports; + + tabsize = actual_cnt; + dd->cspec->msix_entries = kzalloc(tabsize * + sizeof(struct qib_msix_entry), GFP_KERNEL); + if (!dd->cspec->msix_entries) + tabsize = 0; + + if (qib_pcie_params(dd, 8, &tabsize)) + qib_dev_err(dd, + "Failed to setup PCIe or interrupts; continuing anyway\n"); + /* may be less than we wanted, if not enough available */ + dd->cspec->num_msix_entries = tabsize; + + /* setup interrupt handler */ + qib_setup_7322_interrupt(dd, 1); + + /* clear diagctrl register, in case diags were running and crashed */ + qib_write_kreg(dd, kr_hwdiagctrl, 0); +#ifdef CONFIG_INFINIBAND_QIB_DCA + if (!dca_add_requester(&pdev->dev)) { + qib_devinfo(dd->pcidev, "DCA enabled\n"); + dd->flags |= QIB_DCA_ENABLED; + qib_setup_dca(dd); + } +#endif + goto bail; + +bail_cleanup: + qib_pcie_ddcleanup(dd); +bail_free: + qib_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} + +/* + * Set the table entry at the specified index from the table specifed. + * There are 3 * TXDDS_TABLE_SZ entries in all per port, with the first + * TXDDS_TABLE_SZ for SDR, the next for DDR, and the last for QDR. + * 'idx' below addresses the correct entry, while its 4 LSBs select the + * corresponding entry (one of TXDDS_TABLE_SZ) from the selected table. + */ +#define DDS_ENT_AMP_LSB 14 +#define DDS_ENT_MAIN_LSB 9 +#define DDS_ENT_POST_LSB 5 +#define DDS_ENT_PRE_XTRA_LSB 3 +#define DDS_ENT_PRE_LSB 0 + +/* + * Set one entry in the TxDDS table for spec'd port + * ridx picks one of the entries, while tp points + * to the appropriate table entry. + */ +static void set_txdds(struct qib_pportdata *ppd, int ridx, + const struct txdds_ent *tp) +{ + struct qib_devdata *dd = ppd->dd; + u32 pack_ent; + int regidx; + + /* Get correct offset in chip-space, and in source table */ + regidx = KREG_IBPORT_IDX(IBSD_DDS_MAP_TABLE) + ridx; + /* + * We do not use qib_write_kreg_port() because it was intended + * only for registers in the lower "port specific" pages. + * So do index calculation by hand. 
+ */ + if (ppd->hw_pidx) + regidx += (dd->palign / sizeof(u64)); + + pack_ent = tp->amp << DDS_ENT_AMP_LSB; + pack_ent |= tp->main << DDS_ENT_MAIN_LSB; + pack_ent |= tp->pre << DDS_ENT_PRE_LSB; + pack_ent |= tp->post << DDS_ENT_POST_LSB; + qib_write_kreg(dd, regidx, pack_ent); + /* Prevent back-to-back writes by hitting scratch */ + qib_write_kreg(ppd->dd, kr_scratch, 0); +} + +static const struct vendor_txdds_ent vendor_txdds[] = { + { /* Amphenol 1m 30awg NoEq */ + { 0x41, 0x50, 0x48 }, "584470002 ", + { 10, 0, 0, 5 }, { 10, 0, 0, 9 }, { 7, 1, 0, 13 }, + }, + { /* Amphenol 3m 28awg NoEq */ + { 0x41, 0x50, 0x48 }, "584470004 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 0, 1, 7, 15 }, + }, + { /* Finisar 3m OM2 Optical */ + { 0x00, 0x90, 0x65 }, "FCBG410QB1C03-QL", + { 0, 0, 0, 3 }, { 0, 0, 0, 4 }, { 0, 0, 0, 13 }, + }, + { /* Finisar 30m OM2 Optical */ + { 0x00, 0x90, 0x65 }, "FCBG410QB1C30-QL", + { 0, 0, 0, 1 }, { 0, 0, 0, 5 }, { 0, 0, 0, 11 }, + }, + { /* Finisar Default OM2 Optical */ + { 0x00, 0x90, 0x65 }, NULL, + { 0, 0, 0, 2 }, { 0, 0, 0, 5 }, { 0, 0, 0, 12 }, + }, + { /* Gore 1m 30awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3300-1 ", + { 0, 0, 0, 6 }, { 0, 0, 0, 9 }, { 0, 1, 0, 15 }, + }, + { /* Gore 2m 30awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3300-2 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 10 }, { 0, 1, 7, 15 }, + }, + { /* Gore 1m 28awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3800-1 ", + { 0, 0, 0, 6 }, { 0, 0, 0, 8 }, { 0, 1, 0, 15 }, + }, + { /* Gore 3m 28awg NoEq */ + { 0x00, 0x21, 0x77 }, "QSN3800-3 ", + { 0, 0, 0, 9 }, { 0, 0, 0, 13 }, { 0, 1, 7, 15 }, + }, + { /* Gore 5m 24awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7000-5 ", + { 0, 0, 0, 7 }, { 0, 0, 0, 9 }, { 0, 1, 3, 15 }, + }, + { /* Gore 7m 24awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7000-7 ", + { 0, 0, 0, 9 }, { 0, 0, 0, 11 }, { 0, 2, 6, 15 }, + }, + { /* Gore 5m 26awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7600-5 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 0, 1, 9, 13 }, + }, + { /* Gore 7m 26awg Eq */ + { 0x00, 0x21, 0x77 }, "QSN7600-7 ", + { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 10, 1, 8, 15 }, + }, + { /* Intersil 12m 24awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP1224", + { 0, 0, 0, 2 }, { 0, 0, 0, 5 }, { 0, 3, 0, 9 }, + }, + { /* Intersil 10m 28awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP1028", + { 0, 0, 0, 6 }, { 0, 0, 0, 4 }, { 0, 2, 0, 2 }, + }, + { /* Intersil 7m 30awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP0730", + { 0, 0, 0, 6 }, { 0, 0, 0, 4 }, { 0, 1, 0, 3 }, + }, + { /* Intersil 5m 32awg Active */ + { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP0532", + { 0, 0, 0, 6 }, { 0, 0, 0, 6 }, { 0, 2, 0, 8 }, + }, + { /* Intersil Default Active */ + { 0x00, 0x30, 0xB4 }, NULL, + { 0, 0, 0, 6 }, { 0, 0, 0, 5 }, { 0, 2, 0, 5 }, + }, + { /* Luxtera 20m Active Optical */ + { 0x00, 0x25, 0x63 }, NULL, + { 0, 0, 0, 5 }, { 0, 0, 0, 8 }, { 0, 2, 0, 12 }, + }, + { /* Molex 1M Cu loopback */ + { 0x00, 0x09, 0x3A }, "74763-0025 ", + { 2, 2, 6, 15 }, { 2, 2, 6, 15 }, { 2, 2, 6, 15 }, + }, + { /* Molex 2m 28awg NoEq */ + { 0x00, 0x09, 0x3A }, "74757-2201 ", + { 0, 0, 0, 6 }, { 0, 0, 0, 9 }, { 0, 1, 1, 15 }, + }, +}; + +static const struct txdds_ent txdds_sdr[TXDDS_TABLE_SZ] = { + /* amp, pre, main, post */ + { 2, 2, 15, 6 }, /* Loopback */ + { 0, 0, 0, 1 }, /* 2 dB */ + { 0, 0, 0, 2 }, /* 3 dB */ + { 0, 0, 0, 3 }, /* 4 dB */ + { 0, 0, 0, 4 }, /* 5 dB */ + { 0, 0, 0, 5 }, /* 6 dB */ + { 0, 0, 0, 6 }, /* 7 dB */ + { 0, 0, 0, 7 }, /* 8 dB */ + { 0, 0, 0, 8 }, /* 9 dB */ + { 0, 0, 0, 9 }, /* 10 dB */ + { 0, 0, 0, 10 }, /* 11 dB */ + { 0, 0, 0, 11 }, /* 12 dB */ 
+ { 0, 0, 0, 12 }, /* 13 dB */ + { 0, 0, 0, 13 }, /* 14 dB */ + { 0, 0, 0, 14 }, /* 15 dB */ + { 0, 0, 0, 15 }, /* 16 dB */ +}; + +static const struct txdds_ent txdds_ddr[TXDDS_TABLE_SZ] = { + /* amp, pre, main, post */ + { 2, 2, 15, 6 }, /* Loopback */ + { 0, 0, 0, 8 }, /* 2 dB */ + { 0, 0, 0, 8 }, /* 3 dB */ + { 0, 0, 0, 9 }, /* 4 dB */ + { 0, 0, 0, 9 }, /* 5 dB */ + { 0, 0, 0, 10 }, /* 6 dB */ + { 0, 0, 0, 10 }, /* 7 dB */ + { 0, 0, 0, 11 }, /* 8 dB */ + { 0, 0, 0, 11 }, /* 9 dB */ + { 0, 0, 0, 12 }, /* 10 dB */ + { 0, 0, 0, 12 }, /* 11 dB */ + { 0, 0, 0, 13 }, /* 12 dB */ + { 0, 0, 0, 13 }, /* 13 dB */ + { 0, 0, 0, 14 }, /* 14 dB */ + { 0, 0, 0, 14 }, /* 15 dB */ + { 0, 0, 0, 15 }, /* 16 dB */ +}; + +static const struct txdds_ent txdds_qdr[TXDDS_TABLE_SZ] = { + /* amp, pre, main, post */ + { 2, 2, 15, 6 }, /* Loopback */ + { 0, 1, 0, 7 }, /* 2 dB (also QMH7342) */ + { 0, 1, 0, 9 }, /* 3 dB (also QMH7342) */ + { 0, 1, 0, 11 }, /* 4 dB */ + { 0, 1, 0, 13 }, /* 5 dB */ + { 0, 1, 0, 15 }, /* 6 dB */ + { 0, 1, 3, 15 }, /* 7 dB */ + { 0, 1, 7, 15 }, /* 8 dB */ + { 0, 1, 7, 15 }, /* 9 dB */ + { 0, 1, 8, 15 }, /* 10 dB */ + { 0, 1, 9, 15 }, /* 11 dB */ + { 0, 1, 10, 15 }, /* 12 dB */ + { 0, 2, 6, 15 }, /* 13 dB */ + { 0, 2, 7, 15 }, /* 14 dB */ + { 0, 2, 8, 15 }, /* 15 dB */ + { 0, 2, 9, 15 }, /* 16 dB */ +}; + +/* + * extra entries for use with txselect, for indices >= TXDDS_TABLE_SZ. + * These are mostly used for mez cards going through connectors + * and backplane traces, but can be used to add other "unusual" + * table values as well. + */ +static const struct txdds_ent txdds_extra_sdr[TXDDS_EXTRA_SZ] = { + /* amp, pre, main, post */ + { 0, 0, 0, 1 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 1 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 2 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 2 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 3 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 4 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ +}; + +static const struct txdds_ent txdds_extra_ddr[TXDDS_EXTRA_SZ] = { + /* amp, pre, main, post */ + { 0, 0, 0, 7 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 7 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 8 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 8 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 9 }, /* QMH7342 backplane settings */ + { 0, 0, 0, 10 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 
1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ +}; + +static const struct txdds_ent txdds_extra_qdr[TXDDS_EXTRA_SZ] = { + /* amp, pre, main, post */ + { 0, 1, 0, 4 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 5 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 6 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 8 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 10 }, /* QMH7342 backplane settings */ + { 0, 1, 0, 12 }, /* QMH7342 backplane settings */ + { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */ + { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */ + { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */ +}; + +static const struct txdds_ent txdds_extra_mfg[TXDDS_MFG_SZ] = { + /* amp, pre, main, post */ + { 0, 0, 0, 0 }, /* QME7342 mfg settings */ + { 0, 0, 0, 6 }, /* QME7342 P2 mfg settings */ +}; + +static const struct txdds_ent *get_atten_table(const struct txdds_ent *txdds, + unsigned atten) +{ + /* + * The attenuation table starts at 2dB for entry 1, + * with entry 0 being the loopback entry. + */ + if (atten <= 2) + atten = 1; + else if (atten > TXDDS_TABLE_SZ) + atten = TXDDS_TABLE_SZ - 1; + else + atten--; + return txdds + atten; +} + +/* + * if override is set, the module parameter txselect has a value + * for this specific port, so use it, rather than our normal mechanism. + */ +static void find_best_ent(struct qib_pportdata *ppd, + const struct txdds_ent **sdr_dds, + const struct txdds_ent **ddr_dds, + const struct txdds_ent **qdr_dds, int override) +{ + struct qib_qsfp_cache *qd = &ppd->cpspec->qsfp_data.cache; + int idx; + + /* Search table of known cables */ + for (idx = 0; !override && idx < ARRAY_SIZE(vendor_txdds); ++idx) { + const struct vendor_txdds_ent *v = vendor_txdds + idx; + + if (!memcmp(v->oui, qd->oui, QSFP_VOUI_LEN) && + (!v->partnum || + !memcmp(v->partnum, qd->partnum, QSFP_PN_LEN))) { + *sdr_dds = &v->sdr; + *ddr_dds = &v->ddr; + *qdr_dds = &v->qdr; + return; + } + } + + /* Active cables don't have attenuation so we only set SERDES + * settings to account for the attenuation of the board traces. */ + if (!override && QSFP_IS_ACTIVE(qd->tech)) { + *sdr_dds = txdds_sdr + ppd->dd->board_atten; + *ddr_dds = txdds_ddr + ppd->dd->board_atten; + *qdr_dds = txdds_qdr + ppd->dd->board_atten; + return; + } + + if (!override && QSFP_HAS_ATTEN(qd->tech) && (qd->atten[0] || + qd->atten[1])) { + *sdr_dds = get_atten_table(txdds_sdr, qd->atten[0]); + *ddr_dds = get_atten_table(txdds_ddr, qd->atten[0]); + *qdr_dds = get_atten_table(txdds_qdr, qd->atten[1]); + return; + } else if (ppd->cpspec->no_eep < TXDDS_TABLE_SZ) { + /* + * If we have no (or incomplete) data from the cable + * EEPROM, or no QSFP, or override is set, use the + * module parameter value to index into the attentuation + * table. 
+ */ + idx = ppd->cpspec->no_eep; + *sdr_dds = &txdds_sdr[idx]; + *ddr_dds = &txdds_ddr[idx]; + *qdr_dds = &txdds_qdr[idx]; + } else if (ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ)) { + /* similar to above, but index into the "extra" table. */ + idx = ppd->cpspec->no_eep - TXDDS_TABLE_SZ; + *sdr_dds = &txdds_extra_sdr[idx]; + *ddr_dds = &txdds_extra_ddr[idx]; + *qdr_dds = &txdds_extra_qdr[idx]; + } else if ((IS_QME(ppd->dd) || IS_QMH(ppd->dd)) && + ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + TXDDS_MFG_SZ)) { + idx = ppd->cpspec->no_eep - (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ); + pr_info("IB%u:%u use idx %u into txdds_mfg\n", + ppd->dd->unit, ppd->port, idx); + *sdr_dds = &txdds_extra_mfg[idx]; + *ddr_dds = &txdds_extra_mfg[idx]; + *qdr_dds = &txdds_extra_mfg[idx]; + } else { + /* this shouldn't happen, it's range checked */ + *sdr_dds = txdds_sdr + qib_long_atten; + *ddr_dds = txdds_ddr + qib_long_atten; + *qdr_dds = txdds_qdr + qib_long_atten; + } +} + +static void init_txdds_table(struct qib_pportdata *ppd, int override) +{ + const struct txdds_ent *sdr_dds, *ddr_dds, *qdr_dds; + struct txdds_ent *dds; + int idx; + int single_ent = 0; + + find_best_ent(ppd, &sdr_dds, &ddr_dds, &qdr_dds, override); + + /* for mez cards or override, use the selected value for all entries */ + if (!(ppd->dd->flags & QIB_HAS_QSFP) || override) + single_ent = 1; + + /* Fill in the first entry with the best entry found. */ + set_txdds(ppd, 0, sdr_dds); + set_txdds(ppd, TXDDS_TABLE_SZ, ddr_dds); + set_txdds(ppd, 2 * TXDDS_TABLE_SZ, qdr_dds); + if (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE)) { + dds = (struct txdds_ent *)(ppd->link_speed_active == + QIB_IB_QDR ? qdr_dds : + (ppd->link_speed_active == + QIB_IB_DDR ? ddr_dds : sdr_dds)); + write_tx_serdes_param(ppd, dds); + } + + /* Fill in the remaining entries with the default table values. */ + for (idx = 1; idx < ARRAY_SIZE(txdds_sdr); ++idx) { + set_txdds(ppd, idx, single_ent ? sdr_dds : txdds_sdr + idx); + set_txdds(ppd, idx + TXDDS_TABLE_SZ, + single_ent ? ddr_dds : txdds_ddr + idx); + set_txdds(ppd, idx + 2 * TXDDS_TABLE_SZ, + single_ent ? qdr_dds : txdds_qdr + idx); + } +} + +#define KR_AHB_ACC KREG_IDX(ahb_access_ctrl) +#define KR_AHB_TRANS KREG_IDX(ahb_transaction_reg) +#define AHB_TRANS_RDY SYM_MASK(ahb_transaction_reg, ahb_rdy) +#define AHB_ADDR_LSB SYM_LSB(ahb_transaction_reg, ahb_address) +#define AHB_DATA_LSB SYM_LSB(ahb_transaction_reg, ahb_data) +#define AHB_WR SYM_MASK(ahb_transaction_reg, write_not_read) +#define AHB_TRANS_TRIES 10 + +/* + * The chan argument is 0=chan0, 1=chan1, 2=pll, 3=chan2, 4=chan4, + * 5=subsystem which is why most calls have "chan + chan >> 1" + * for the channel argument. + */ +static u32 ahb_mod(struct qib_devdata *dd, int quad, int chan, int addr, + u32 data, u32 mask) +{ + u32 rd_data, wr_data, sz_mask; + u64 trans, acc, prev_acc; + u32 ret = 0xBAD0BAD; + int tries; + + prev_acc = qib_read_kreg64(dd, KR_AHB_ACC); + /* From this point on, make sure we return access */ + acc = (quad << 1) | 1; + qib_write_kreg(dd, KR_AHB_ACC, acc); + + for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) { + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + if (trans & AHB_TRANS_RDY) + break; + } + if (tries >= AHB_TRANS_TRIES) { + qib_dev_err(dd, "No ahb_rdy in %d tries\n", AHB_TRANS_TRIES); + goto bail; + } + + /* If mask is not all 1s, we need to read, but different SerDes + * entities have different sizes + */ + sz_mask = (1UL << ((quad == 1) ? 
32 : 16)) - 1; + wr_data = data & mask & sz_mask; + if ((~mask & sz_mask) != 0) { + trans = ((chan << 6) | addr) << (AHB_ADDR_LSB + 1); + qib_write_kreg(dd, KR_AHB_TRANS, trans); + + for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) { + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + if (trans & AHB_TRANS_RDY) + break; + } + if (tries >= AHB_TRANS_TRIES) { + qib_dev_err(dd, "No Rd ahb_rdy in %d tries\n", + AHB_TRANS_TRIES); + goto bail; + } + /* Re-read in case host split reads and read data first */ + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + rd_data = (uint32_t)(trans >> AHB_DATA_LSB); + wr_data |= (rd_data & ~mask & sz_mask); + } + + /* If mask is not zero, we need to write. */ + if (mask & sz_mask) { + trans = ((chan << 6) | addr) << (AHB_ADDR_LSB + 1); + trans |= ((uint64_t)wr_data << AHB_DATA_LSB); + trans |= AHB_WR; + qib_write_kreg(dd, KR_AHB_TRANS, trans); + + for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) { + trans = qib_read_kreg64(dd, KR_AHB_TRANS); + if (trans & AHB_TRANS_RDY) + break; + } + if (tries >= AHB_TRANS_TRIES) { + qib_dev_err(dd, "No Wr ahb_rdy in %d tries\n", + AHB_TRANS_TRIES); + goto bail; + } + } + ret = wr_data; +bail: + qib_write_kreg(dd, KR_AHB_ACC, prev_acc); + return ret; +} + +static void ibsd_wr_allchans(struct qib_pportdata *ppd, int addr, unsigned data, + unsigned mask) +{ + struct qib_devdata *dd = ppd->dd; + int chan; + + for (chan = 0; chan < SERDES_CHANS; ++chan) { + ahb_mod(dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), addr, + data, mask); + ahb_mod(dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), addr, + 0, 0); + } +} + +static void serdes_7322_los_enable(struct qib_pportdata *ppd, int enable) +{ + u64 data = qib_read_kreg_port(ppd, krp_serdesctrl); + u8 state = SYM_FIELD(data, IBSerdesCtrl_0, RXLOSEN); + + if (enable && !state) { + pr_info("IB%u:%u Turning LOS on\n", + ppd->dd->unit, ppd->port); + data |= SYM_MASK(IBSerdesCtrl_0, RXLOSEN); + } else if (!enable && state) { + pr_info("IB%u:%u Turning LOS off\n", + ppd->dd->unit, ppd->port); + data &= ~SYM_MASK(IBSerdesCtrl_0, RXLOSEN); + } + qib_write_kreg_port(ppd, krp_serdesctrl, data); +} + +static int serdes_7322_init(struct qib_pportdata *ppd) +{ + int ret = 0; + + if (ppd->dd->cspec->r1) + ret = serdes_7322_init_old(ppd); + else + ret = serdes_7322_init_new(ppd); + return ret; +} + +static int serdes_7322_init_old(struct qib_pportdata *ppd) +{ + u32 le_val; + + /* + * Initialize the Tx DDS tables. Also done every QSFP event, + * for adapters with QSFP + */ + init_txdds_table(ppd, 0); + + /* ensure no tx overrides from earlier driver loads */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + + /* Patch some SerDes defaults to "Better for IB" */ + /* Timing Loop Bandwidth: cdr_timing[11:9] = 0 */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9)); + + /* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */ + ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11)); + /* Enable LE2: rxle2en_r2a addr 13 bit [6] = 1 */ + ibsd_wr_allchans(ppd, 13, (1 << 6), (1 << 6)); + + /* May be overridden in qsfp_7322_event */ + le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT; + ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7)); + + /* enable LE1 adaptation for all but QME, which is disabled */ + le_val = IS_QME(ppd->dd) ? 
0 : 1; + ibsd_wr_allchans(ppd, 13, (le_val << 5), (1 << 5)); + + /* Clear cmode-override, may be set from older driver */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14); + + /* Timing Recovery: rxtapsel addr 5 bits [9:8] = 0 */ + ibsd_wr_allchans(ppd, 5, (0 << 8), BMASK(9, 8)); + + /* setup LoS params; these are subsystem, so chan == 5 */ + /* LoS filter threshold_count on, ch 0-3, set to 8 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4)); + + /* LoS filter threshold_count off, ch 0-3, set to 4 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8)); + + /* LoS filter select enabled */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15); + + /* LoS target data: SDR=4, DDR=2, QDR=1 */ + ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */ + ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */ + ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */ + + serdes_7322_los_enable(ppd, 1); + + /* rxbistena; set 0 to avoid effects of it switch later */ + ibsd_wr_allchans(ppd, 9, 0 << 15, 1 << 15); + + /* Configure 4 DFE taps, and only they adapt */ + ibsd_wr_allchans(ppd, 16, 0 << 0, BMASK(1, 0)); + + /* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */ + le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 0xb6c0 : 0x6bac; + ibsd_wr_allchans(ppd, 21, le_val, 0xfffe); + + /* + * Set receive adaptation mode. SDR and DDR adaptation are + * always on, and QDR is initially enabled; later disabled. + */ + qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? 
+ QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN); + ppd->cpspec->qdr_dfe_on = 1; + + /* FLoop LOS gate: PPM filter enabled */ + ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10); + + /* rx offset center enabled */ + ibsd_wr_allchans(ppd, 12, 1 << 4, 1 << 4); + + if (!ppd->dd->cspec->r1) { + ibsd_wr_allchans(ppd, 12, 1 << 12, 1 << 12); + ibsd_wr_allchans(ppd, 12, 2 << 8, 0x0f << 8); + } + + /* Set the frequency loop bandwidth to 15 */ + ibsd_wr_allchans(ppd, 2, 15 << 5, BMASK(8, 5)); + + return 0; +} + +static int serdes_7322_init_new(struct qib_pportdata *ppd) +{ + unsigned long tend; + u32 le_val, rxcaldone; + int chan, chan_done = (1 << SERDES_CHANS) - 1; + + /* Clear cmode-override, may be set from older driver */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14); + + /* ensure no tx overrides from earlier driver loads */ + qib_write_kreg_port(ppd, krp_tx_deemph_override, + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + reset_tx_deemphasis_override)); + + /* START OF LSI SUGGESTED SERDES BRINGUP */ + /* Reset - Calibration Setup */ + /* Stop DFE adaptaion */ + ibsd_wr_allchans(ppd, 1, 0, BMASK(9, 1)); + /* Disable LE1 */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(5, 5)); + /* Disable autoadapt for LE1 */ + ibsd_wr_allchans(ppd, 1, 0, BMASK(15, 15)); + /* Disable LE2 */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(6, 6)); + /* Disable VGA */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0)); + /* Disable AFE Offset Cancel */ + ibsd_wr_allchans(ppd, 12, 0, BMASK(12, 12)); + /* Disable Timing Loop */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(3, 3)); + /* Disable Frequency Loop */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(4, 4)); + /* Disable Baseline Wander Correction */ + ibsd_wr_allchans(ppd, 13, 0, BMASK(13, 13)); + /* Disable RX Calibration */ + ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10)); + /* Disable RX Offset Calibration */ + ibsd_wr_allchans(ppd, 12, 0, BMASK(4, 4)); + /* Select BB CDR */ + ibsd_wr_allchans(ppd, 2, (1 << 15), BMASK(15, 15)); + /* CDR Step Size */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(9, 8)); + /* Enable phase Calibration */ + ibsd_wr_allchans(ppd, 12, (1 << 5), BMASK(5, 5)); + /* DFE Bandwidth [2:14-12] */ + ibsd_wr_allchans(ppd, 2, (4 << 12), BMASK(14, 12)); + /* DFE Config (4 taps only) */ + ibsd_wr_allchans(ppd, 16, 0, BMASK(1, 0)); + /* Gain Loop Bandwidth */ + if (!ppd->dd->cspec->r1) { + ibsd_wr_allchans(ppd, 12, 1 << 12, BMASK(12, 12)); + ibsd_wr_allchans(ppd, 12, 2 << 8, BMASK(11, 8)); + } else { + ibsd_wr_allchans(ppd, 19, (3 << 11), BMASK(13, 11)); + } + /* Baseline Wander Correction Gain [13:4-0] (leave as default) */ + /* Baseline Wander Correction Gain [3:7-5] (leave as default) */ + /* Data Rate Select [5:7-6] (leave as default) */ + /* RX Parallel Word Width [3:10-8] (leave as default) */ + + /* RX REST */ + /* Single- or Multi-channel reset */ + /* RX Analog reset */ + /* RX Digital reset */ + ibsd_wr_allchans(ppd, 0, 0, BMASK(15, 13)); + msleep(20); + /* RX Analog reset */ + ibsd_wr_allchans(ppd, 0, (1 << 14), BMASK(14, 14)); + msleep(20); + /* RX Digital reset */ + ibsd_wr_allchans(ppd, 0, (1 << 13), BMASK(13, 13)); + msleep(20); + + /* setup LoS params; these are subsystem, so chan == 5 */ + /* LoS filter threshold_count on, ch 0-3, set to 8 */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4)); + + /* LoS filter threshold_count off, ch 0-3, set to 4 */ + 
ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0)); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8)); + + /* LoS filter select enabled */ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15); + + /* LoS target data: SDR=4, DDR=2, QDR=1 */ + ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */ + ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */ + ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */ + + /* Turn on LOS on initial SERDES init */ + serdes_7322_los_enable(ppd, 1); + /* FLoop LOS gate: PPM filter enabled */ + ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10); + + /* RX LATCH CALIBRATION */ + /* Enable Eyefinder Phase Calibration latch */ + ibsd_wr_allchans(ppd, 15, 1, BMASK(0, 0)); + /* Enable RX Offset Calibration latch */ + ibsd_wr_allchans(ppd, 12, (1 << 4), BMASK(4, 4)); + msleep(20); + /* Start Calibration */ + ibsd_wr_allchans(ppd, 4, (1 << 10), BMASK(10, 10)); + tend = jiffies + msecs_to_jiffies(500); + while (chan_done && !time_is_before_jiffies(tend)) { + msleep(20); + for (chan = 0; chan < SERDES_CHANS; ++chan) { + rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), + (chan + (chan >> 1)), + 25, 0, 0); + if ((~rxcaldone & (u32)BMASK(9, 9)) == 0 && + (~chan_done & (1 << chan)) == 0) + chan_done &= ~(1 << chan); + } + } + if (chan_done) { + pr_info("Serdes %d calibration not done after .5 sec: 0x%x\n", + IBSD(ppd->hw_pidx), chan_done); + } else { + for (chan = 0; chan < SERDES_CHANS; ++chan) { + rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), + (chan + (chan >> 1)), + 25, 0, 0); + if ((~rxcaldone & (u32)BMASK(10, 10)) == 0) + pr_info("Serdes %d chan %d calibration failed\n", + IBSD(ppd->hw_pidx), chan); + } + } + + /* Turn off Calibration */ + ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10)); + msleep(20); + + /* BRING RX UP */ + /* Set LE2 value (May be overridden in qsfp_7322_event) */ + le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT; + ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7)); + /* Set LE2 Loop bandwidth */ + ibsd_wr_allchans(ppd, 3, (7 << 5), BMASK(7, 5)); + /* Enable LE2 */ + ibsd_wr_allchans(ppd, 13, (1 << 6), BMASK(6, 6)); + msleep(20); + /* Enable H0 only */ + ibsd_wr_allchans(ppd, 1, 1, BMASK(9, 1)); + /* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */ + le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 0xb6c0 : 0x6bac; + ibsd_wr_allchans(ppd, 21, le_val, 0xfffe); + /* Enable VGA */ + ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0)); + msleep(20); + /* Set Frequency Loop Bandwidth */ + ibsd_wr_allchans(ppd, 2, (15 << 5), BMASK(8, 5)); + /* Enable Frequency Loop */ + ibsd_wr_allchans(ppd, 2, (1 << 4), BMASK(4, 4)); + /* Set Timing Loop Bandwidth */ + ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9)); + /* Enable Timing Loop */ + ibsd_wr_allchans(ppd, 2, (1 << 3), BMASK(3, 3)); + msleep(50); + /* Enable DFE + * Set receive adaptation mode. SDR and DDR adaptation are + * always on, and QDR is initially enabled; later disabled. + */ + qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL); + qib_write_kreg_port(ppd, krp_static_adapt_dis(2), + ppd->dd->cspec->r1 ? 
+ QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN); + ppd->cpspec->qdr_dfe_on = 1; + /* Disable LE1 */ + ibsd_wr_allchans(ppd, 13, (0 << 5), (1 << 5)); + /* Disable auto adapt for LE1 */ + ibsd_wr_allchans(ppd, 1, (0 << 15), BMASK(15, 15)); + msleep(20); + /* Enable AFE Offset Cancel */ + ibsd_wr_allchans(ppd, 12, (1 << 12), BMASK(12, 12)); + /* Enable Baseline Wander Correction */ + ibsd_wr_allchans(ppd, 12, (1 << 13), BMASK(13, 13)); + /* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */ + ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11)); + /* VGA output common mode */ + ibsd_wr_allchans(ppd, 12, (3 << 2), BMASK(3, 2)); + + /* + * Initialize the Tx DDS tables. Also done every QSFP event, + * for adapters with QSFP + */ + init_txdds_table(ppd, 0); + + return 0; +} + +/* start adjust QMH serdes parameters */ + +static void set_man_code(struct qib_pportdata *ppd, int chan, int code) +{ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 9, code << 9, 0x3f << 9); +} + +static void set_man_mode_h1(struct qib_pportdata *ppd, int chan, + int enable, u32 tapenable) +{ + if (enable) + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 1, 3 << 10, 0x1f << 10); + else + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 1, 0, 0x1f << 10); +} + +/* Set clock to 1, 0, 1, 0 */ +static void clock_man(struct qib_pportdata *ppd, int chan) +{ + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0x4000, 0x4000); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0, 0x4000); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0x4000, 0x4000); + ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), + 4, 0, 0x4000); +} + +/* + * write the current Tx serdes pre,post,main,amp settings into the serdes. + * The caller must pass the settings appropriate for the current speed, + * or not care if they are correct for the current speed. + */ +static void write_tx_serdes_param(struct qib_pportdata *ppd, + struct txdds_ent *txdds) +{ + u64 deemph; + + deemph = qib_read_kreg_port(ppd, krp_tx_deemph_override); + /* field names for amp, main, post, pre, respectively */ + deemph &= ~(SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txampcntl_d2a) | + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txc0_ena) | + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txcp1_ena) | + SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txcn1_ena)); + + deemph |= SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + tx_override_deemphasis_select); + deemph |= (txdds->amp & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txampcntl_d2a)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txampcntl_d2a); + deemph |= (txdds->main & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txc0_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txc0_ena); + deemph |= (txdds->post & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcp1_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcp1_ena); + deemph |= (txdds->pre & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcn1_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0, + txcn1_ena); + qib_write_kreg_port(ppd, krp_tx_deemph_override, deemph); +} + +/* + * Set the parameters for mez cards on link bounce, so they are + * always exactly what was requested. Similar logic to init_txdds + * but does just the serdes. + */ +static void adj_tx_serdes(struct qib_pportdata *ppd) +{ + const struct txdds_ent *sdr_dds, *ddr_dds, *qdr_dds; + struct txdds_ent *dds; + + find_best_ent(ppd, &sdr_dds, &ddr_dds, &qdr_dds, 1); + dds = (struct txdds_ent *)(ppd->link_speed_active == QIB_IB_QDR ? 
+ qdr_dds : (ppd->link_speed_active == QIB_IB_DDR ? + ddr_dds : sdr_dds)); + write_tx_serdes_param(ppd, dds); +} + +/* set QDR forced value for H1, if needed */ +static void force_h1(struct qib_pportdata *ppd) +{ + int chan; + + ppd->cpspec->qdr_reforce = 0; + if (!ppd->dd->cspec->r1) + return; + + for (chan = 0; chan < SERDES_CHANS; chan++) { + set_man_mode_h1(ppd, chan, 1, 0); + set_man_code(ppd, chan, ppd->cpspec->h1_val); + clock_man(ppd, chan); + set_man_mode_h1(ppd, chan, 0, 0); + } +} + +#define SJA_EN SYM_MASK(SPC_JTAG_ACCESS_REG, SPC_JTAG_ACCESS_EN) +#define BISTEN_LSB SYM_LSB(SPC_JTAG_ACCESS_REG, bist_en) + +#define R_OPCODE_LSB 3 +#define R_OP_NOP 0 +#define R_OP_SHIFT 2 +#define R_OP_UPDATE 3 +#define R_TDI_LSB 2 +#define R_TDO_LSB 1 +#define R_RDY 1 + +static int qib_r_grab(struct qib_devdata *dd) +{ + u64 val = SJA_EN; + + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + return 0; +} + +/* qib_r_wait_for_rdy() not only waits for the ready bit, it + * returns the current state of R_TDO + */ +static int qib_r_wait_for_rdy(struct qib_devdata *dd) +{ + u64 val; + int timeout; + + for (timeout = 0; timeout < 100 ; ++timeout) { + val = qib_read_kreg32(dd, kr_r_access); + if (val & R_RDY) + return (val >> R_TDO_LSB) & 1; + } + return -1; +} + +static int qib_r_shift(struct qib_devdata *dd, int bisten, + int len, u8 *inp, u8 *outp) +{ + u64 valbase, val; + int ret, pos; + + valbase = SJA_EN | (bisten << BISTEN_LSB) | + (R_OP_SHIFT << R_OPCODE_LSB); + ret = qib_r_wait_for_rdy(dd); + if (ret < 0) + goto bail; + for (pos = 0; pos < len; ++pos) { + val = valbase; + if (outp) { + outp[pos >> 3] &= ~(1 << (pos & 7)); + outp[pos >> 3] |= (ret << (pos & 7)); + } + if (inp) { + int tdi = inp[pos >> 3] >> (pos & 7); + + val |= ((tdi & 1) << R_TDI_LSB); + } + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + ret = qib_r_wait_for_rdy(dd); + if (ret < 0) + break; + } + /* Restore to NOP between operations. */ + val = SJA_EN | (bisten << BISTEN_LSB); + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + ret = qib_r_wait_for_rdy(dd); + + if (ret >= 0) + ret = pos; +bail: + return ret; +} + +static int qib_r_update(struct qib_devdata *dd, int bisten) +{ + u64 val; + int ret; + + val = SJA_EN | (bisten << BISTEN_LSB) | (R_OP_UPDATE << R_OPCODE_LSB); + ret = qib_r_wait_for_rdy(dd); + if (ret >= 0) { + qib_write_kreg(dd, kr_r_access, val); + qib_read_kreg32(dd, kr_scratch); + } + return ret; +} + +#define BISTEN_PORT_SEL 15 +#define LEN_PORT_SEL 625 +#define BISTEN_AT 17 +#define LEN_AT 156 +#define BISTEN_ETM 16 +#define LEN_ETM 632 + +#define BIT2BYTE(x) (((x) + BITS_PER_BYTE - 1) / BITS_PER_BYTE) + +/* these are common for all IB port use cases. 
*/ +static u8 reset_at[BIT2BYTE(LEN_AT)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, +}; +static u8 reset_atetm[BIT2BYTE(LEN_ETM)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x80, 0xe3, 0x81, 0x73, 0x3c, 0x70, 0x8e, + 0x07, 0xce, 0xf1, 0xc0, 0x39, 0x1e, 0x38, 0xc7, 0x03, 0xe7, + 0x78, 0xe0, 0x1c, 0x0f, 0x9c, 0x7f, 0x80, 0x73, 0x0f, 0x70, + 0xde, 0x01, 0xce, 0x39, 0xc0, 0xf9, 0x06, 0x38, 0xd7, 0x00, + 0xe7, 0x19, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, +}; +static u8 at[BIT2BYTE(LEN_AT)] = { + 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, +}; + +/* used for IB1 or IB2, only one in use */ +static u8 atetm_1port[BIT2BYTE(LEN_ETM)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x10, 0xf2, 0x80, 0x83, 0x1e, 0x38, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x50, 0xf4, 0x41, 0x00, 0x18, 0x78, 0xc8, 0x03, + 0x07, 0x7b, 0xa0, 0x3e, 0x00, 0x02, 0x00, 0x00, 0x18, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, +}; + +/* used when both IB1 and IB2 are in use */ +static u8 atetm_2port[BIT2BYTE(LEN_ETM)] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79, + 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xf8, 0x80, 0x83, 0x1e, 0x38, 0xe0, 0x03, 0x05, + 0x7b, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, + 0xa2, 0x0f, 0x50, 0xf4, 0x41, 0x00, 0x18, 0x78, 0xd1, 0x07, + 0x02, 0x7c, 0x80, 0x3e, 0x00, 0x02, 0x00, 0x00, 0x3e, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, +}; + +/* used when only IB1 is in use */ +static u8 portsel_port1[BIT2BYTE(LEN_PORT_SEL)] = { + 0x32, 0x65, 0xa4, 0x7b, 0x10, 0x98, 0xdc, 0xfe, 0x13, 0x13, + 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x73, 0x0c, 0x0c, 0x0c, + 0x0c, 0x0c, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, + 0x13, 0x78, 0x78, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, + 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x74, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, + 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, + 0x14, 0x14, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +/* used when only IB2 is in use */ +static u8 portsel_port2[BIT2BYTE(LEN_PORT_SEL)] = { + 0x32, 0x65, 0xa4, 0x7b, 0x10, 0x98, 0xdc, 0xfe, 0x39, 0x39, + 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x73, 0x32, 0x32, 0x32, + 0x32, 0x32, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, + 0x39, 0x78, 0x78, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, + 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x74, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, + 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, + 0x3a, 0x3a, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, +}; + +/* used when both IB1 and IB2 are in use */ +static u8 portsel_2port[BIT2BYTE(LEN_PORT_SEL)] = { + 0x32, 0xba, 0x54, 0x76, 0x10, 0x98, 0xdc, 0xfe, 0x13, 0x13, + 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x73, 0x0c, 0x0c, 0x0c, + 0x0c, 0x0c, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, + 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 
0x13, 0x13, 0x13, + 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x74, 0x32, + 0x32, 0x32, 0x32, 0x32, 0x14, 0x14, 0x14, 0x14, 0x14, 0x3a, + 0x3a, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, + 0x14, 0x14, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +/* + * Do setup to properly handle IB link recovery; if port is zero, we + * are initializing to cover both ports; otherwise we are initializing + * to cover a single port card, or the port has reached INIT and we may + * need to switch coverage types. + */ +static void setup_7322_link_recovery(struct qib_pportdata *ppd, u32 both) +{ + u8 *portsel, *etm; + struct qib_devdata *dd = ppd->dd; + + if (!ppd->dd->cspec->r1) + return; + if (!both) { + dd->cspec->recovery_ports_initted++; + ppd->cpspec->recovery_init = 1; + } + if (!both && dd->cspec->recovery_ports_initted == 1) { + portsel = ppd->port == 1 ? portsel_port1 : portsel_port2; + etm = atetm_1port; + } else { + portsel = portsel_2port; + etm = atetm_2port; + } + + if (qib_r_grab(dd) < 0 || + qib_r_shift(dd, BISTEN_ETM, LEN_ETM, reset_atetm, NULL) < 0 || + qib_r_update(dd, BISTEN_ETM) < 0 || + qib_r_shift(dd, BISTEN_AT, LEN_AT, reset_at, NULL) < 0 || + qib_r_update(dd, BISTEN_AT) < 0 || + qib_r_shift(dd, BISTEN_PORT_SEL, LEN_PORT_SEL, + portsel, NULL) < 0 || + qib_r_update(dd, BISTEN_PORT_SEL) < 0 || + qib_r_shift(dd, BISTEN_AT, LEN_AT, at, NULL) < 0 || + qib_r_update(dd, BISTEN_AT) < 0 || + qib_r_shift(dd, BISTEN_ETM, LEN_ETM, etm, NULL) < 0 || + qib_r_update(dd, BISTEN_ETM) < 0) + qib_dev_err(dd, "Failed IB link recovery setup\n"); +} + +static void check_7322_rxe_status(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + u64 fmask; + + if (dd->cspec->recovery_ports_initted != 1) + return; /* rest doesn't apply to dualport */ + qib_write_kreg(dd, kr_control, dd->control | + SYM_MASK(Control, FreezeMode)); + (void)qib_read_kreg64(dd, kr_scratch); + udelay(3); /* ibcreset asserted 400ns, be sure that's over */ + fmask = qib_read_kreg64(dd, kr_act_fmask); + if (!fmask) { + /* + * require a powercycle before we'll work again, and make + * sure we get no more interrupts, and don't turn off + * freeze. + */ + ppd->dd->cspec->stay_in_freeze = 1; + qib_7322_set_intr_state(ppd->dd, 0); + qib_write_kreg(dd, kr_fmask, 0ULL); + qib_dev_err(dd, "HCA unusable until powercycled\n"); + return; /* eventually reset */ + } + + qib_write_kreg(ppd->dd, kr_hwerrclear, + SYM_MASK(HwErrClear, IBSerdesPClkNotDetectClear_1)); + + /* don't do the full clear_freeze(), not needed for this */ + qib_write_kreg(dd, kr_control, dd->control); + qib_read_kreg32(dd, kr_scratch); + /* take IBC out of reset */ + if (ppd->link_speed_supported) { + ppd->cpspec->ibcctrl_a &= + ~SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn); + qib_write_kreg_port(ppd, krp_ibcctrl_a, + ppd->cpspec->ibcctrl_a); + qib_read_kreg32(dd, kr_scratch); + if (ppd->lflags & QIBL_IB_LINK_DISABLED) + qib_set_ib_7322_lstate(ppd, 0, + QLOGIC_IB_IBCC_LINKINITCMD_DISABLE); + } +} diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c new file mode 100644 index 0000000..6c68f8a --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -0,0 +1,1817 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+#include <linux/dca.h>
+#endif
+#include <rdma/rdma_vt.h>
+
+#include "qib.h"
+#include "qib_common.h"
+#include "qib_mad.h"
+#ifdef CONFIG_DEBUG_FS
+#include "qib_debugfs.h"
+#include "qib_verbs.h"
+#endif
+
+#undef pr_fmt
+#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt
+
+/*
+ * min buffers we want to have per context, after driver
+ */
+#define QIB_MIN_USER_CTXT_BUFCNT 7
+
+#define QLOGIC_IB_R_SOFTWARE_MASK 0xFF
+#define QLOGIC_IB_R_SOFTWARE_SHIFT 24
+#define QLOGIC_IB_R_EMULATOR_MASK (1ULL<<62)
+
+/*
+ * Number of ctxts we are configured to use (to allow for more pio
+ * buffers per ctxt, etc.) Zero means use chip value.
+ */
+ushort qib_cfgctxts;
+module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO);
+MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use");
+
+unsigned qib_numa_aware;
+module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO);
+MODULE_PARM_DESC(numa_aware,
+ "0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process");
+
+/*
+ * If set, do not write to any regs if avoidable, hack to allow
+ * check for deranged default register values.
+ */ +ushort qib_mini_init; +module_param_named(mini_init, qib_mini_init, ushort, S_IRUGO); +MODULE_PARM_DESC(mini_init, "If set, do minimal diag init"); + +unsigned qib_n_krcv_queues; +module_param_named(krcvqs, qib_n_krcv_queues, uint, S_IRUGO); +MODULE_PARM_DESC(krcvqs, "number of kernel receive queues per IB port"); + +unsigned qib_cc_table_size; +module_param_named(cc_table_size, qib_cc_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(cc_table_size, "Congestion control table entries 0 (CCA disabled - default), min = 128, max = 1984"); + +static void verify_interrupt(struct timer_list *); + +static struct idr qib_unit_table; +u32 qib_cpulist_count; +unsigned long *qib_cpulist; + +/* set number of contexts we'll actually use */ +void qib_set_ctxtcnt(struct qib_devdata *dd) +{ + if (!qib_cfgctxts) { + dd->cfgctxts = dd->first_user_ctxt + num_online_cpus(); + if (dd->cfgctxts > dd->ctxtcnt) + dd->cfgctxts = dd->ctxtcnt; + } else if (qib_cfgctxts < dd->num_pports) + dd->cfgctxts = dd->ctxtcnt; + else if (qib_cfgctxts <= dd->ctxtcnt) + dd->cfgctxts = qib_cfgctxts; + else + dd->cfgctxts = dd->ctxtcnt; + dd->freectxts = (dd->first_user_ctxt > dd->cfgctxts) ? 0 : + dd->cfgctxts - dd->first_user_ctxt; +} + +/* + * Common code for creating the receive context array. + */ +int qib_create_ctxts(struct qib_devdata *dd) +{ + unsigned i; + int local_node_id = pcibus_to_node(dd->pcidev->bus); + + if (local_node_id < 0) + local_node_id = numa_node_id(); + dd->assigned_node_id = local_node_id; + + /* + * Allocate full ctxtcnt array, rather than just cfgctxts, because + * cleanup iterates across all possible ctxts. + */ + dd->rcd = kcalloc(dd->ctxtcnt, sizeof(*dd->rcd), GFP_KERNEL); + if (!dd->rcd) + return -ENOMEM; + + /* create (one or more) kctxt */ + for (i = 0; i < dd->first_user_ctxt; ++i) { + struct qib_pportdata *ppd; + struct qib_ctxtdata *rcd; + + if (dd->skip_kctxt_mask & (1 << i)) + continue; + + ppd = dd->pport + (i % dd->num_pports); + + rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id); + if (!rcd) { + qib_dev_err(dd, + "Unable to allocate ctxtdata for Kernel ctxt, failing\n"); + kfree(dd->rcd); + dd->rcd = NULL; + return -ENOMEM; + } + rcd->pkeys[0] = QIB_DEFAULT_P_KEY; + rcd->seq_cnt = 1; + } + return 0; +} + +/* + * Common code for user and kernel context setup. + */ +struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt, + int node_id) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + + rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id); + if (rcd) { + INIT_LIST_HEAD(&rcd->qp_wait_list); + rcd->node_id = node_id; + rcd->ppd = ppd; + rcd->dd = dd; + rcd->cnt = 1; + rcd->ctxt = ctxt; + dd->rcd[ctxt] = rcd; +#ifdef CONFIG_DEBUG_FS + if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */ + rcd->opstats = kzalloc_node(sizeof(*rcd->opstats), + GFP_KERNEL, node_id); + if (!rcd->opstats) { + kfree(rcd); + qib_dev_err(dd, + "Unable to allocate per ctxt stats buffer\n"); + return NULL; + } + } +#endif + dd->f_init_ctxt(rcd); + + /* + * To avoid wasting a lot of memory, we allocate 32KB chunks + * of physically contiguous memory, advance through it until + * used up and then allocate more. Of course, we need + * memory to store those extra pointers, now. 32KB seems to + * be the most that is "safe" under memory pressure + * (creating large files and then copying them over + * NFS while doing lots of MPI jobs). The OOM killer can + * get invoked, even though we say we can sleep and this can + * cause significant system problems.... 
+ */ + rcd->rcvegrbuf_size = 0x8000; + rcd->rcvegrbufs_perchunk = + rcd->rcvegrbuf_size / dd->rcvegrbufsize; + rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt + + rcd->rcvegrbufs_perchunk - 1) / + rcd->rcvegrbufs_perchunk; + BUG_ON(!is_power_of_2(rcd->rcvegrbufs_perchunk)); + rcd->rcvegrbufs_perchunk_shift = + ilog2(rcd->rcvegrbufs_perchunk); + } + return rcd; +} + +/* + * Common code for initializing the physical port structure. + */ +int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, + u8 hw_pidx, u8 port) +{ + int size; + + ppd->dd = dd; + ppd->hw_pidx = hw_pidx; + ppd->port = port; /* IB port number, not index */ + + spin_lock_init(&ppd->sdma_lock); + spin_lock_init(&ppd->lflags_lock); + spin_lock_init(&ppd->cc_shadow_lock); + init_waitqueue_head(&ppd->state_wait); + + timer_setup(&ppd->symerr_clear_timer, qib_clear_symerror_on_linkup, 0); + + ppd->qib_wq = NULL; + ppd->ibport_data.pmastats = + alloc_percpu(struct qib_pma_counters); + if (!ppd->ibport_data.pmastats) + return -ENOMEM; + ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64); + if (!(ppd->ibport_data.rvp.rc_acks) || + !(ppd->ibport_data.rvp.rc_qacks) || + !(ppd->ibport_data.rvp.rc_delayed_comp)) + return -ENOMEM; + + if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) + goto bail; + + ppd->cc_supported_table_entries = min(max_t(int, qib_cc_table_size, + IB_CCT_MIN_ENTRIES), IB_CCT_ENTRIES*IB_CC_TABLE_CAP_DEFAULT); + + ppd->cc_max_table_entries = + ppd->cc_supported_table_entries/IB_CCT_ENTRIES; + + size = IB_CC_TABLE_CAP_DEFAULT * sizeof(struct ib_cc_table_entry) + * IB_CCT_ENTRIES; + ppd->ccti_entries = kzalloc(size, GFP_KERNEL); + if (!ppd->ccti_entries) + goto bail; + + size = IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry); + ppd->congestion_entries = kzalloc(size, GFP_KERNEL); + if (!ppd->congestion_entries) + goto bail_1; + + size = sizeof(struct cc_table_shadow); + ppd->ccti_entries_shadow = kzalloc(size, GFP_KERNEL); + if (!ppd->ccti_entries_shadow) + goto bail_2; + + size = sizeof(struct ib_cc_congestion_setting_attr); + ppd->congestion_entries_shadow = kzalloc(size, GFP_KERNEL); + if (!ppd->congestion_entries_shadow) + goto bail_3; + + return 0; + +bail_3: + kfree(ppd->ccti_entries_shadow); + ppd->ccti_entries_shadow = NULL; +bail_2: + kfree(ppd->congestion_entries); + ppd->congestion_entries = NULL; +bail_1: + kfree(ppd->ccti_entries); + ppd->ccti_entries = NULL; +bail: + /* User is intentionally disabling the congestion control agent */ + if (!qib_cc_table_size) + return 0; + + if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) { + qib_cc_table_size = 0; + qib_dev_err(dd, + "Congestion Control table size %d less than minimum %d for port %d\n", + qib_cc_table_size, IB_CCT_MIN_ENTRIES, port); + } + + qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n", + port); + return 0; +} + +static int init_pioavailregs(struct qib_devdata *dd) +{ + int ret, pidx; + u64 *status_page; + + dd->pioavailregs_dma = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &dd->pioavailregs_phys, + GFP_KERNEL); + if (!dd->pioavailregs_dma) { + qib_dev_err(dd, + "failed to allocate PIOavail reg area in memory\n"); + ret = -ENOMEM; + goto done; + } + + /* + * We really want L2 cache aligned, but for current CPUs of + * interest, they are the same. 
+ */ + status_page = (u64 *) + ((char *) dd->pioavailregs_dma + + ((2 * L1_CACHE_BYTES + + dd->pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES)); + /* device status comes first, for backwards compatibility */ + dd->devstatusp = status_page; + *status_page++ = 0; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + dd->pport[pidx].statusp = status_page; + *status_page++ = 0; + } + + /* + * Setup buffer to hold freeze and other messages, accessible to + * apps, following statusp. This is per-unit, not per port. + */ + dd->freezemsg = (char *) status_page; + *dd->freezemsg = 0; + /* length of msg buffer is "whatever is left" */ + ret = (char *) status_page - (char *) dd->pioavailregs_dma; + dd->freezelen = PAGE_SIZE - ret; + + ret = 0; + +done: + return ret; +} + +/** + * init_shadow_tids - allocate the shadow TID array + * @dd: the qlogic_ib device + * + * allocate the shadow TID array, so we can qib_munlock previous + * entries. It may make more sense to move the pageshadow to the + * ctxt data structure, so we only allocate memory for ctxts actually + * in use, since we at 8k per ctxt, now. + * We don't want failures here to prevent use of the driver/chip, + * so no return value. + */ +static void init_shadow_tids(struct qib_devdata *dd) +{ + struct page **pages; + dma_addr_t *addrs; + + pages = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(struct page *)); + if (!pages) + goto bail; + + addrs = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(dma_addr_t)); + if (!addrs) + goto bail_free; + + dd->pageshadow = pages; + dd->physshadow = addrs; + return; + +bail_free: + vfree(pages); +bail: + dd->pageshadow = NULL; +} + +/* + * Do initialization for device that is only needed on + * first detect, not on resets. + */ +static int loadtime_init(struct qib_devdata *dd) +{ + int ret = 0; + + if (((dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) & + QLOGIC_IB_R_SOFTWARE_MASK) != QIB_CHIP_SWVERSION) { + qib_dev_err(dd, + "Driver only handles version %d, chip swversion is %d (%llx), failing\n", + QIB_CHIP_SWVERSION, + (int)(dd->revision >> + QLOGIC_IB_R_SOFTWARE_SHIFT) & + QLOGIC_IB_R_SOFTWARE_MASK, + (unsigned long long) dd->revision); + ret = -ENOSYS; + goto done; + } + + if (dd->revision & QLOGIC_IB_R_EMULATOR_MASK) + qib_devinfo(dd->pcidev, "%s", dd->boardversion); + + spin_lock_init(&dd->pioavail_lock); + spin_lock_init(&dd->sendctrl_lock); + spin_lock_init(&dd->uctxt_lock); + spin_lock_init(&dd->qib_diag_trans_lock); + spin_lock_init(&dd->eep_st_lock); + mutex_init(&dd->eep_lock); + + if (qib_mini_init) + goto done; + + ret = init_pioavailregs(dd); + init_shadow_tids(dd); + + qib_get_eeprom_info(dd); + + /* setup time (don't start yet) to verify we got interrupt */ + timer_setup(&dd->intrchk_timer, verify_interrupt, 0); +done: + return ret; +} + +/** + * init_after_reset - re-initialize after a reset + * @dd: the qlogic_ib device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explicitly, in case reset + * failed + */ +static int init_after_reset(struct qib_devdata *dd) +{ + int i; + + /* + * Ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize. This is mostly + * for the driver data structures, not chip registers. + */ + for (i = 0; i < dd->num_pports; ++i) { + /* + * ctxt == -1 means "all contexts". Only really safe for + * _dis_abling things, as here. 
+ */ + dd->f_rcvctrl(dd->pport + i, QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS | + QIB_RCVCTRL_TAILUPD_DIS, -1); + /* Redundant across ports for some, but no big deal. */ + dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_DIS | + QIB_SENDCTRL_AVAIL_DIS); + } + + return 0; +} + +static void enable_chip(struct qib_devdata *dd) +{ + u64 rcvmask; + int i; + + /* + * Enable PIO send, and update of PIOavail regs to memory. + */ + for (i = 0; i < dd->num_pports; ++i) + dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_ENB | + QIB_SENDCTRL_AVAIL_ENB); + /* + * Enable kernel ctxts' receive and receive interrupt. + * Other ctxts done as user opens and inits them. + */ + rcvmask = QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_INTRAVAIL_ENB; + rcvmask |= (dd->flags & QIB_NODMA_RTAIL) ? + QIB_RCVCTRL_TAILUPD_DIS : QIB_RCVCTRL_TAILUPD_ENB; + for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { + struct qib_ctxtdata *rcd = dd->rcd[i]; + + if (rcd) + dd->f_rcvctrl(rcd->ppd, rcvmask, i); + } +} + +static void verify_interrupt(struct timer_list *t) +{ + struct qib_devdata *dd = from_timer(dd, t, intrchk_timer); + u64 int_counter; + + if (!dd) + return; /* being torn down */ + + /* + * If we don't have a lid or any interrupts, let the user know and + * don't bother checking again. + */ + int_counter = qib_int_counter(dd) - dd->z_int_counter; + if (int_counter == 0) { + if (!dd->f_intr_fallback(dd)) + dev_err(&dd->pcidev->dev, + "No interrupts detected, not usable.\n"); + else /* re-arm the timer to see if fallback works */ + mod_timer(&dd->intrchk_timer, jiffies + HZ/2); + } +} + +static void init_piobuf_state(struct qib_devdata *dd) +{ + int i, pidx; + u32 uctxts; + + /* + * Ensure all buffers are free, and fifos empty. Buffers + * are common, so only do once for port 0. + * + * After enable and qib_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE. After this, packets + * are ready and able to go out. + */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_ALL); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->f_sendctrl(dd->pport + pidx, QIB_SENDCTRL_FLUSH); + + /* + * If not all sendbufs are used, add the one to each of the lower + * numbered contexts. pbufsctxt and lastctxt_piobuf are + * calculated in chip-specific code because it may cause some + * chip-specific adjustments to be made. + */ + uctxts = dd->cfgctxts - dd->first_user_ctxt; + dd->ctxts_extrabuf = dd->pbufsctxt ? + dd->lastctxt_piobuf - (dd->pbufsctxt * uctxts) : 0; + + /* + * Set up the shadow copies of the piobufavail registers, + * which we compare against the chip registers for now, and + * the in memory DMA'ed copies of the registers. + * By now pioavail updates to memory should have occurred, so + * copy them into our working/shadow registers; this is in + * case something went wrong with abort, but mostly to get the + * initial values of the generation bit correct. + */ + for (i = 0; i < dd->pioavregs; i++) { + __le64 tmp; + + tmp = dd->pioavailregs_dma[i]; + /* + * Don't need to worry about pioavailkernel here + * because we will call qib_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed. 
+ */ + dd->pioavailshadow[i] = le64_to_cpu(tmp); + } + while (i < ARRAY_SIZE(dd->pioavailshadow)) + dd->pioavailshadow[i++] = 0; /* for debugging sanity */ + + /* after pioavailshadow is setup */ + qib_chg_pioavailkernel(dd, 0, dd->piobcnt2k + dd->piobcnt4k, + TXCHK_CHG_TYPE_KERN, NULL); + dd->f_initvl15_bufs(dd); +} + +/** + * qib_create_workqueues - create per port workqueues + * @dd: the qlogic_ib device + */ +static int qib_create_workqueues(struct qib_devdata *dd) +{ + int pidx; + struct qib_pportdata *ppd; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (!ppd->qib_wq) { + char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */ + + snprintf(wq_name, sizeof(wq_name), "qib%d_%d", + dd->unit, pidx); + ppd->qib_wq = alloc_ordered_workqueue(wq_name, + WQ_MEM_RECLAIM); + if (!ppd->qib_wq) + goto wq_error; + } + } + return 0; +wq_error: + pr_err("create_singlethread_workqueue failed for port %d\n", + pidx + 1); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->qib_wq) { + destroy_workqueue(ppd->qib_wq); + ppd->qib_wq = NULL; + } + } + return -ENOMEM; +} + +static void qib_free_pportdata(struct qib_pportdata *ppd) +{ + free_percpu(ppd->ibport_data.pmastats); + free_percpu(ppd->ibport_data.rvp.rc_acks); + free_percpu(ppd->ibport_data.rvp.rc_qacks); + free_percpu(ppd->ibport_data.rvp.rc_delayed_comp); + ppd->ibport_data.pmastats = NULL; +} + +/** + * qib_init - do the actual initialization sequence on the chip + * @dd: the qlogic_ib device + * @reinit: reinitializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. + */ +int qib_init(struct qib_devdata *dd, int reinit) +{ + int ret = 0, pidx, lastfail = 0; + u32 portok = 0; + unsigned i; + struct qib_ctxtdata *rcd; + struct qib_pportdata *ppd; + unsigned long flags; + + /* Set linkstate to unknown, so we can watch for a transition. */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~(QIBL_LINKACTIVE | QIBL_LINKARMED | + QIBL_LINKDOWN | QIBL_LINKINIT | + QIBL_LINKV); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + } + + if (reinit) + ret = init_after_reset(dd); + else + ret = loadtime_init(dd); + if (ret) + goto done; + + /* Bypass most chip-init, to get to device creation */ + if (qib_mini_init) + return 0; + + ret = dd->f_late_initreg(dd); + if (ret) + goto done; + + /* dd->rcd can be NULL if early init failed */ + for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { + /* + * Set up the (kernel) rcvhdr queue and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of ctxt 0 ctxtdata as well. 
+ */ + rcd = dd->rcd[i]; + if (!rcd) + continue; + + lastfail = qib_create_rcvhdrq(dd, rcd); + if (!lastfail) + lastfail = qib_setup_eagerbufs(rcd); + if (lastfail) + qib_dev_err(dd, + "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); + } + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + int mtu; + + if (lastfail) + ret = lastfail; + ppd = dd->pport + pidx; + mtu = ib_mtu_enum_to_int(qib_ibmtu); + if (mtu == -1) { + mtu = QIB_DEFAULT_MTU; + qib_ibmtu = 0; /* don't leave invalid value */ + } + /* set max we can ever have for this driver load */ + ppd->init_ibmaxlen = min(mtu > 2048 ? + dd->piosize4k : dd->piosize2k, + dd->rcvegrbufsize + + (dd->rcvhdrentsize << 2)); + /* + * Have to initialize ibmaxlen, but this will normally + * change immediately in qib_set_mtu(). + */ + ppd->ibmaxlen = ppd->init_ibmaxlen; + qib_set_mtu(ppd, mtu); + + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_IB_LINK_DISABLED; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + + lastfail = dd->f_bringup_serdes(ppd); + if (lastfail) { + qib_devinfo(dd->pcidev, + "Failed to bringup IB port %u\n", ppd->port); + lastfail = -ENETDOWN; + continue; + } + + portok++; + } + + if (!portok) { + /* none of the ports initialized */ + if (!ret && lastfail) + ret = lastfail; + else if (!ret) + ret = -ENETDOWN; + /* but continue on, so we can debug cause */ + } + + enable_chip(dd); + + init_piobuf_state(dd); + +done: + if (!ret) { + /* chip is OK for user apps; mark it as initialized */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + *ppd->statusp |= QIB_STATUS_CHIP_PRESENT | + QIB_STATUS_INITTED; + if (!ppd->link_speed_enabled) + continue; + if (dd->flags & QIB_HAS_SEND_DMA) + ret = qib_setup_sdma(ppd); + timer_setup(&ppd->hol_timer, qib_hol_event, 0); + ppd->hol_state = QIB_HOL_UP; + } + + /* now we can enable all interrupts from the chip */ + dd->f_set_intr_state(dd, 1); + + /* + * Setup to verify we get an interrupt, and fallback + * to an alternate if necessary and possible. + */ + mod_timer(&dd->intrchk_timer, jiffies + HZ/2); + /* start stats retrieval timer */ + mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); + } + + /* if ret is non-zero, we probably should do some cleanup here... */ + return ret; +} + +/* + * These next two routines are placeholders in case we don't have per-arch + * code for controlling write combining. If explicit control of write + * combining is not available, performance will probably be awful. + */ + +int __attribute__((weak)) qib_enable_wc(struct qib_devdata *dd) +{ + return -EOPNOTSUPP; +} + +void __attribute__((weak)) qib_disable_wc(struct qib_devdata *dd) +{ +} + +static inline struct qib_devdata *__qib_lookup(int unit) +{ + return idr_find(&qib_unit_table, unit); +} + +struct qib_devdata *qib_lookup(int unit) +{ + struct qib_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&qib_devs_lock, flags); + dd = __qib_lookup(unit); + spin_unlock_irqrestore(&qib_devs_lock, flags); + + return dd; +} + +/* + * Stop the timers during unit shutdown, or after an error late + * in initialization. 
+ */ +static void qib_stop_timers(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + int pidx; + + if (dd->stats_timer.function) + del_timer_sync(&dd->stats_timer); + if (dd->intrchk_timer.function) + del_timer_sync(&dd->intrchk_timer); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->hol_timer.function) + del_timer_sync(&ppd->hol_timer); + if (ppd->led_override_timer.function) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + if (ppd->symerr_clear_timer.function) + del_timer_sync(&ppd->symerr_clear_timer); + } +} + +/** + * qib_shutdown_device - shut down a device + * @dd: the qlogic_ib device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by qib_init(dd, 1) + */ +static void qib_shutdown_device(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + unsigned pidx; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + spin_lock_irq(&ppd->lflags_lock); + ppd->lflags &= ~(QIBL_LINKDOWN | QIBL_LINKINIT | + QIBL_LINKARMED | QIBL_LINKACTIVE | + QIBL_LINKV); + spin_unlock_irq(&ppd->lflags_lock); + *ppd->statusp &= ~(QIB_STATUS_IB_CONF | QIB_STATUS_IB_READY); + } + dd->flags &= ~QIB_INITTED; + + /* mask interrupts, but not errors */ + dd->f_set_intr_state(dd, 0); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + dd->f_rcvctrl(ppd, QIB_RCVCTRL_TAILUPD_DIS | + QIB_RCVCTRL_CTXT_DIS | + QIB_RCVCTRL_INTRAVAIL_DIS | + QIB_RCVCTRL_PKEY_ENB, -1); + /* + * Gracefully stop all sends allowing any in progress to + * trickle out first. + */ + dd->f_sendctrl(ppd, QIB_SENDCTRL_CLEAR); + } + + /* + * Enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(20); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + dd->f_setextled(ppd, 0); /* make sure LEDs are off */ + + if (dd->flags & QIB_HAS_SEND_DMA) + qib_teardown_sdma(ppd); + + dd->f_sendctrl(ppd, QIB_SENDCTRL_AVAIL_DIS | + QIB_SENDCTRL_SEND_DIS); + /* + * Clear SerdesEnable. + * We can't count on interrupts since we are stopping. + */ + dd->f_quiet_serdes(ppd); + + if (ppd->qib_wq) { + destroy_workqueue(ppd->qib_wq); + ppd->qib_wq = NULL; + } + qib_free_pportdata(ppd); + } + +} + +/** + * qib_free_ctxtdata - free a context's allocated data + * @dd: the qlogic_ib device + * @rcd: the ctxtdata structure + * + * free up any allocated data for a context + * This should not touch anything that would affect a simultaneous + * re-allocation of context data, because it is called after qib_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. 
+ */ +void qib_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) +{ + if (!rcd) + return; + + if (rcd->rcvhdrq) { + dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, + rcd->rcvhdrq, rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; + if (rcd->rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + rcd->rcvhdrtail_kvaddr, + rcd->rcvhdrqtailaddr_phys); + rcd->rcvhdrtail_kvaddr = NULL; + } + } + if (rcd->rcvegrbuf) { + unsigned e; + + for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { + void *base = rcd->rcvegrbuf[e]; + size_t size = rcd->rcvegrbuf_size; + + dma_free_coherent(&dd->pcidev->dev, size, + base, rcd->rcvegrbuf_phys[e]); + } + kfree(rcd->rcvegrbuf); + rcd->rcvegrbuf = NULL; + kfree(rcd->rcvegrbuf_phys); + rcd->rcvegrbuf_phys = NULL; + rcd->rcvegrbuf_chunks = 0; + } + + kfree(rcd->tid_pg_list); + vfree(rcd->user_event_mask); + vfree(rcd->subctxt_uregbase); + vfree(rcd->subctxt_rcvegrbuf); + vfree(rcd->subctxt_rcvhdr_base); +#ifdef CONFIG_DEBUG_FS + kfree(rcd->opstats); + rcd->opstats = NULL; +#endif + kfree(rcd); +} + +/* + * Perform a PIO buffer bandwidth write test, to verify proper system + * configuration. Even when all the setup calls work, occasionally + * BIOS or other issues can prevent write combining from working, or + * can cause other bandwidth problems to the chip. + * + * This test simply writes the same buffer over and over again, and + * measures close to the peak bandwidth to the chip (not testing + * data bandwidth to the wire). On chips that use an address-based + * trigger to send packets to the wire, this is easy. On chips that + * use a count to trigger, we want to make sure that the packet doesn't + * go out on the wire, or trigger flow control checks. + */ +static void qib_verify_pioperf(struct qib_devdata *dd) +{ + u32 pbnum, cnt, lcnt; + u32 __iomem *piobuf; + u32 *addr; + u64 msecs, emsecs; + + piobuf = dd->f_getsendbuf(dd->pport, 0ULL, &pbnum); + if (!piobuf) { + qib_devinfo(dd->pcidev, + "No PIObufs for checking perf, skipping\n"); + return; + } + + /* + * Enough to give us a reasonable test, less than piobuf size, and + * likely multiple of store buffer length. + */ + cnt = 1024; + + addr = vmalloc(cnt); + if (!addr) + goto done; + + preempt_disable(); /* we want reasonably accurate elapsed time */ + msecs = 1 + jiffies_to_msecs(jiffies); + for (lcnt = 0; lcnt < 10000U; lcnt++) { + /* wait until we cross msec boundary */ + if (jiffies_to_msecs(jiffies) >= msecs) + break; + udelay(1); + } + + dd->f_set_armlaunch(dd, 0); + + /* + * length 0, no dwords actually sent + */ + writeq(0, piobuf); + qib_flush_wc(); + + /* + * This is only roughly accurate, since even with preempt we + * still take interrupts that could take a while. Running for + * >= 5 msec seems to get us "close enough" to accurate values. 
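+ *
+ * Rough arithmetic behind the check further down: each loop iteration
+ * copies cnt (1024) bytes, i.e. 1 KiB, so lcnt iterations over emsecs
+ * msec is about lcnt / emsecs MiB/sec (treating 1 KiB/msec as
+ * 1 MiB/sec, which ignores the ~2% difference between 1000 and 1024).
+ * The "lcnt < emsecs * 1024" comparison therefore complains when the
+ * measured rate falls below roughly 1024 MiB/sec, i.e. 1 GiB/sec.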
+ */ + msecs = jiffies_to_msecs(jiffies); + for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { + qib_pio_copy(piobuf + 64, addr, cnt >> 2); + emsecs = jiffies_to_msecs(jiffies) - msecs; + } + + /* 1 GiB/sec, slightly over IB SDR line rate */ + if (lcnt < (emsecs * 1024U)) + qib_dev_err(dd, + "Performance problem: bandwidth to PIO buffers is only %u MiB/sec\n", + lcnt / (u32) emsecs); + + preempt_enable(); + + vfree(addr); + +done: + /* disarm piobuf, so it's available again */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbnum)); + qib_sendbuf_done(dd, pbnum); + dd->f_set_armlaunch(dd, 1); +} + +void qib_free_devdata(struct qib_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&qib_devs_lock, flags); + idr_remove(&qib_unit_table, dd->unit); + list_del(&dd->list); + spin_unlock_irqrestore(&qib_devs_lock, flags); + +#ifdef CONFIG_DEBUG_FS + qib_dbg_ibdev_exit(&dd->verbs_dev); +#endif + free_percpu(dd->int_counter); + rvt_dealloc_device(&dd->verbs_dev.rdi); +} + +u64 qib_int_counter(struct qib_devdata *dd) +{ + int cpu; + u64 int_counter = 0; + + for_each_possible_cpu(cpu) + int_counter += *per_cpu_ptr(dd->int_counter, cpu); + return int_counter; +} + +u64 qib_sps_ints(void) +{ + unsigned long flags; + struct qib_devdata *dd; + u64 sps_ints = 0; + + spin_lock_irqsave(&qib_devs_lock, flags); + list_for_each_entry(dd, &qib_dev_list, list) { + sps_ints += qib_int_counter(dd); + } + spin_unlock_irqrestore(&qib_devs_lock, flags); + return sps_ints; +} + +/* + * Allocate our primary per-unit data structure. Must be done via verbs + * allocator, because the verbs cleanup process both does cleanup and + * free of the data structure. + * "extra" is for chip-specific data. + * + * Use the idr mechanism to get a unit number for this unit. + */ +struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) +{ + unsigned long flags; + struct qib_devdata *dd; + int ret, nports; + + /* extra is * number of ports */ + nports = extra / sizeof(struct qib_pportdata); + dd = (struct qib_devdata *)rvt_alloc_device(sizeof(*dd) + extra, + nports); + if (!dd) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dd->list); + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&qib_devs_lock, flags); + + ret = idr_alloc(&qib_unit_table, dd, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + dd->unit = ret; + list_add(&dd->list, &qib_dev_list); + } + + spin_unlock_irqrestore(&qib_devs_lock, flags); + idr_preload_end(); + + if (ret < 0) { + qib_early_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); + goto bail; + } + rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s%d", "qib", dd->unit); + + dd->int_counter = alloc_percpu(u64); + if (!dd->int_counter) { + ret = -ENOMEM; + qib_early_err(&pdev->dev, + "Could not allocate per-cpu int_counter\n"); + goto bail; + } + + if (!qib_cpulist_count) { + u32 count = num_online_cpus(); + + qib_cpulist = kzalloc(BITS_TO_LONGS(count) * + sizeof(long), GFP_KERNEL); + if (qib_cpulist) + qib_cpulist_count = count; + } +#ifdef CONFIG_DEBUG_FS + qib_dbg_ibdev_init(&dd->verbs_dev); +#endif + return dd; +bail: + if (!list_empty(&dd->list)) + list_del_init(&dd->list); + rvt_dealloc_device(&dd->verbs_dev.rdi); + return ERR_PTR(ret); +} + +/* + * Called from freeze mode handlers, and from PCI error + * reporting code. Should be paranoid about state of + * system and data structures. 
+ */ +void qib_disable_after_error(struct qib_devdata *dd) +{ + if (dd->flags & QIB_INITTED) { + u32 pidx; + + dd->flags &= ~QIB_INITTED; + if (dd->pport) + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct qib_pportdata *ppd; + + ppd = dd->pport + pidx; + if (dd->flags & QIB_PRESENT) { + qib_set_linkstate(ppd, + QIB_IB_LINKDOWN_DISABLE); + dd->f_setextled(ppd, 0); + } + *ppd->statusp &= ~QIB_STATUS_IB_READY; + } + } + + /* + * Mark as having had an error for driver, and also + * for /sys and status word mapped to user programs. + * This marks unit as not usable, until reset. + */ + if (dd->devstatusp) + *dd->devstatusp |= QIB_STATUS_HWERROR; +} + +static void qib_remove_one(struct pci_dev *); +static int qib_init_one(struct pci_dev *, const struct pci_device_id *); + +#define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: " +#define PFX QIB_DRV_NAME ": " + +static const struct pci_device_id qib_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_QLOGIC_IB_6120) }, + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7220) }, + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7322) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, qib_pci_tbl); + +static struct pci_driver qib_driver = { + .name = QIB_DRV_NAME, + .probe = qib_init_one, + .remove = qib_remove_one, + .id_table = qib_pci_tbl, + .err_handler = &qib_pci_err_handler, +}; + +#ifdef CONFIG_INFINIBAND_QIB_DCA + +static int qib_notify_dca(struct notifier_block *, unsigned long, void *); +static struct notifier_block dca_notifier = { + .notifier_call = qib_notify_dca, + .next = NULL, + .priority = 0 +}; + +static int qib_notify_dca_device(struct device *device, void *data) +{ + struct qib_devdata *dd = dev_get_drvdata(device); + unsigned long event = *(unsigned long *)data; + + return dd->f_notify_dca(dd, event); +} + +static int qib_notify_dca(struct notifier_block *nb, unsigned long event, + void *p) +{ + int rval; + + rval = driver_for_each_device(&qib_driver.driver, NULL, + &event, qib_notify_dca_device); + return rval ? NOTIFY_BAD : NOTIFY_DONE; +} + +#endif + +/* + * Do all the generic driver unit- and chip-independent memory + * allocation and initialization. + */ +static int __init qib_ib_init(void) +{ + int ret; + + ret = qib_dev_init(); + if (ret) + goto bail; + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&qib_unit_table); + +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_register_notify(&dca_notifier); +#endif +#ifdef CONFIG_DEBUG_FS + qib_dbg_init(); +#endif + ret = pci_register_driver(&qib_driver); + if (ret < 0) { + pr_err("Unable to register driver: error %d\n", -ret); + goto bail_dev; + } + + /* not fatal if it doesn't work */ + if (qib_init_qibfs()) + pr_err("Unable to register ipathfs\n"); + goto bail; /* all OK */ + +bail_dev: +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_unregister_notify(&dca_notifier); +#endif +#ifdef CONFIG_DEBUG_FS + qib_dbg_exit(); +#endif + idr_destroy(&qib_unit_table); + qib_dev_cleanup(); +bail: + return ret; +} + +module_init(qib_ib_init); + +/* + * Do the non-unit driver cleanup, memory free, etc. at unload. 
+ */ +static void __exit qib_ib_cleanup(void) +{ + int ret; + + ret = qib_exit_qibfs(); + if (ret) + pr_err( + "Unable to cleanup counter filesystem: error %d\n", + -ret); + +#ifdef CONFIG_INFINIBAND_QIB_DCA + dca_unregister_notify(&dca_notifier); +#endif + pci_unregister_driver(&qib_driver); +#ifdef CONFIG_DEBUG_FS + qib_dbg_exit(); +#endif + + qib_cpulist_count = 0; + kfree(qib_cpulist); + + idr_destroy(&qib_unit_table); + qib_dev_cleanup(); +} + +module_exit(qib_ib_cleanup); + +/* this can only be called after a successful initialization */ +static void cleanup_device_data(struct qib_devdata *dd) +{ + int ctxt; + int pidx; + struct qib_ctxtdata **tmp; + unsigned long flags; + + /* users can't do anything more with chip */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + if (dd->pport[pidx].statusp) + *dd->pport[pidx].statusp &= ~QIB_STATUS_CHIP_PRESENT; + + spin_lock(&dd->pport[pidx].cc_shadow_lock); + + kfree(dd->pport[pidx].congestion_entries); + dd->pport[pidx].congestion_entries = NULL; + kfree(dd->pport[pidx].ccti_entries); + dd->pport[pidx].ccti_entries = NULL; + kfree(dd->pport[pidx].ccti_entries_shadow); + dd->pport[pidx].ccti_entries_shadow = NULL; + kfree(dd->pport[pidx].congestion_entries_shadow); + dd->pport[pidx].congestion_entries_shadow = NULL; + + spin_unlock(&dd->pport[pidx].cc_shadow_lock); + } + + qib_disable_wc(dd); + + if (dd->pioavailregs_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *) dd->pioavailregs_dma, + dd->pioavailregs_phys); + dd->pioavailregs_dma = NULL; + } + + if (dd->pageshadow) { + struct page **tmpp = dd->pageshadow; + dma_addr_t *tmpd = dd->physshadow; + int i; + + for (ctxt = 0; ctxt < dd->cfgctxts; ctxt++) { + int ctxt_tidbase = ctxt * dd->rcvtidcnt; + int maxtid = ctxt_tidbase + dd->rcvtidcnt; + + for (i = ctxt_tidbase; i < maxtid; i++) { + if (!tmpp[i]) + continue; + pci_unmap_page(dd->pcidev, tmpd[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + qib_release_user_pages(&tmpp[i], 1); + tmpp[i] = NULL; + } + } + + dd->pageshadow = NULL; + vfree(tmpp); + dd->physshadow = NULL; + vfree(tmpd); + } + + /* + * Free any resources still in use (usually just kernel contexts) + * at unload; we do for ctxtcnt, because that's what we allocate. + * We acquire lock to be really paranoid that rcd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + tmp = dd->rcd; + dd->rcd = NULL; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + for (ctxt = 0; tmp && ctxt < dd->ctxtcnt; ctxt++) { + struct qib_ctxtdata *rcd = tmp[ctxt]; + + tmp[ctxt] = NULL; /* debugging paranoia */ + qib_free_ctxtdata(dd, rcd); + } + kfree(tmp); +} + +/* + * Clean up on unit shutdown, or error during unit load after + * successful initialization. + */ +static void qib_postinit_cleanup(struct qib_devdata *dd) +{ + /* + * Clean up chip-specific stuff. + * We check for NULL here, because it's outside + * the kregbase check, and we need to call it + * after the free_irq. Thus it's possible that + * the function pointers were never initialized. + */ + if (dd->f_cleanup) + dd->f_cleanup(dd); + + qib_pcie_ddcleanup(dd); + + cleanup_device_data(dd); + + qib_free_devdata(dd); +} + +static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret, j, pidx, initfail; + struct qib_devdata *dd = NULL; + + ret = qib_pcie_init(pdev, ent); + if (ret) + goto bail; + + /* + * Do device-specific initialiation, function table setup, dd + * allocation, etc. 
+ */ + switch (ent->device) { + case PCI_DEVICE_ID_QLOGIC_IB_6120: +#ifdef CONFIG_PCI_MSI + dd = qib_init_iba6120_funcs(pdev, ent); +#else + qib_early_err(&pdev->dev, + "Intel PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n", + ent->device); + dd = ERR_PTR(-ENODEV); +#endif + break; + + case PCI_DEVICE_ID_QLOGIC_IB_7220: + dd = qib_init_iba7220_funcs(pdev, ent); + break; + + case PCI_DEVICE_ID_QLOGIC_IB_7322: + dd = qib_init_iba7322_funcs(pdev, ent); + break; + + default: + qib_early_err(&pdev->dev, + "Failing on unknown Intel deviceid 0x%x\n", + ent->device); + ret = -ENODEV; + } + + if (IS_ERR(dd)) + ret = PTR_ERR(dd); + if (ret) + goto bail; /* error already printed */ + + ret = qib_create_workqueues(dd); + if (ret) + goto bail; + + /* do the generic initialization */ + initfail = qib_init(dd, 0); + + ret = qib_register_ib_device(dd); + + /* + * Now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. If earlier failure, + * we still create devices, so diags, etc. can be used + * to determine cause of problem. + */ + if (!qib_mini_init && !initfail && !ret) + dd->flags |= QIB_INITTED; + + j = qib_device_create(dd); + if (j) + qib_dev_err(dd, "Failed to create /dev devices: %d\n", -j); + j = qibfs_add(dd); + if (j) + qib_dev_err(dd, "Failed filesystem setup for counters: %d\n", + -j); + + if (qib_mini_init || initfail || ret) { + qib_stop_timers(dd); + flush_workqueue(ib_wq); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + dd->f_quiet_serdes(dd->pport + pidx); + if (qib_mini_init) + goto bail; + if (!j) { + (void) qibfs_remove(dd); + qib_device_remove(dd); + } + if (!ret) + qib_unregister_ib_device(dd); + qib_postinit_cleanup(dd); + if (initfail) + ret = initfail; + goto bail; + } + + ret = qib_enable_wc(dd); + if (ret) { + qib_dev_err(dd, + "Write combining not enabled (err %d): performance may be poor\n", + -ret); + ret = 0; + } + + qib_verify_pioperf(dd); +bail: + return ret; +} + +static void qib_remove_one(struct pci_dev *pdev) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + int ret; + + /* unregister from IB core */ + qib_unregister_ib_device(dd); + + /* + * Disable the IB link, disable interrupts on the device, + * clear dma engines, etc. + */ + if (!qib_mini_init) + qib_shutdown_device(dd); + + qib_stop_timers(dd); + + /* wait until all of our (qsfp) queue_work() calls complete */ + flush_workqueue(ib_wq); + + ret = qibfs_remove(dd); + if (ret) + qib_dev_err(dd, "Failed counters filesystem cleanup: %d\n", + -ret); + + qib_device_remove(dd); + + qib_postinit_cleanup(dd); +} + +/** + * qib_create_rcvhdrq - create a receive header queue + * @dd: the qlogic_ib device + * @rcd: the context data + * + * This must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). + */ +int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd) +{ + unsigned amt; + int old_node_id; + + if (!rcd->rcvhdrq) { + dma_addr_t phys_hdrqtail; + gfp_t gfp_flags; + + amt = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize * + sizeof(u32), PAGE_SIZE); + gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ? 
+ GFP_USER : GFP_KERNEL; + + old_node_id = dev_to_node(&dd->pcidev->dev); + set_dev_node(&dd->pcidev->dev, rcd->node_id); + rcd->rcvhdrq = dma_alloc_coherent( + &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys, + gfp_flags | __GFP_COMP); + set_dev_node(&dd->pcidev->dev, old_node_id); + + if (!rcd->rcvhdrq) { + qib_dev_err(dd, + "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", + amt, rcd->ctxt); + goto bail; + } + + if (rcd->ctxt >= dd->first_user_ctxt) { + rcd->user_event_mask = vmalloc_user(PAGE_SIZE); + if (!rcd->user_event_mask) + goto bail_free_hdrq; + } + + if (!(dd->flags & QIB_NODMA_RTAIL)) { + set_dev_node(&dd->pcidev->dev, rcd->node_id); + rcd->rcvhdrtail_kvaddr = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + gfp_flags); + set_dev_node(&dd->pcidev->dev, old_node_id); + if (!rcd->rcvhdrtail_kvaddr) + goto bail_free; + rcd->rcvhdrqtailaddr_phys = phys_hdrqtail; + } + + rcd->rcvhdrq_size = amt; + } + + /* clear for security and sanity on each use */ + memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size); + if (rcd->rcvhdrtail_kvaddr) + memset(rcd->rcvhdrtail_kvaddr, 0, PAGE_SIZE); + return 0; + +bail_free: + qib_dev_err(dd, + "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", + rcd->ctxt); + vfree(rcd->user_event_mask); + rcd->user_event_mask = NULL; +bail_free_hdrq: + dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, + rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; +bail: + return -ENOMEM; +} + +/** + * allocate eager buffers, both kernel and user contexts. + * @rcd: the context we are setting up. + * + * Allocate the eager TID buffers and program them into hip. + * They are no longer completely contiguous, we do multiple allocation + * calls. Otherwise we get the OOM code involved, by asking for too + * much per call, with disastrous results on some kernels. + */ +int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; + size_t size; + gfp_t gfp_flags; + int old_node_id; + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. 
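+ *
+ * For reference: the mask assigned below, __GFP_RECLAIM | __GFP_IO, is
+ * what GFP_NOFS expands to in this kernel, so the eager buffers are in
+ * effect allocated GFP_NOFS | __GFP_COMP -- reclaim and block I/O are
+ * allowed, filesystem re-entry is not, and compound pages may be used.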
+ */ + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; + + egrcnt = rcd->rcvegrcnt; + egroff = rcd->rcvegr_tid_base; + egrsize = dd->rcvegrbufsize; + + chunk = rcd->rcvegrbuf_chunks; + egrperchunk = rcd->rcvegrbufs_perchunk; + size = rcd->rcvegrbuf_size; + if (!rcd->rcvegrbuf) { + rcd->rcvegrbuf = + kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]), + GFP_KERNEL, rcd->node_id); + if (!rcd->rcvegrbuf) + goto bail; + } + if (!rcd->rcvegrbuf_phys) { + rcd->rcvegrbuf_phys = + kmalloc_array_node(chunk, + sizeof(rcd->rcvegrbuf_phys[0]), + GFP_KERNEL, rcd->node_id); + if (!rcd->rcvegrbuf_phys) + goto bail_rcvegrbuf; + } + for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { + if (rcd->rcvegrbuf[e]) + continue; + + old_node_id = dev_to_node(&dd->pcidev->dev); + set_dev_node(&dd->pcidev->dev, rcd->node_id); + rcd->rcvegrbuf[e] = + dma_alloc_coherent(&dd->pcidev->dev, size, + &rcd->rcvegrbuf_phys[e], + gfp_flags); + set_dev_node(&dd->pcidev->dev, old_node_id); + if (!rcd->rcvegrbuf[e]) + goto bail_rcvegrbuf_phys; + } + + rcd->rcvegr_phys = rcd->rcvegrbuf_phys[0]; + + for (e = chunk = 0; chunk < rcd->rcvegrbuf_chunks; chunk++) { + dma_addr_t pa = rcd->rcvegrbuf_phys[chunk]; + unsigned i; + + /* clear for security and sanity on each use */ + memset(rcd->rcvegrbuf[chunk], 0, size); + + for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) { + dd->f_put_tid(dd, e + egroff + + (u64 __iomem *) + ((char __iomem *) + dd->kregbase + + dd->rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, pa); + pa += egrsize; + } + cond_resched(); /* don't hog the cpu */ + } + + return 0; + +bail_rcvegrbuf_phys: + for (e = 0; e < rcd->rcvegrbuf_chunks && rcd->rcvegrbuf[e]; e++) + dma_free_coherent(&dd->pcidev->dev, size, + rcd->rcvegrbuf[e], rcd->rcvegrbuf_phys[e]); + kfree(rcd->rcvegrbuf_phys); + rcd->rcvegrbuf_phys = NULL; +bail_rcvegrbuf: + kfree(rcd->rcvegrbuf); + rcd->rcvegrbuf = NULL; +bail: + return -ENOMEM; +} + +/* + * Note: Changes to this routine should be mirrored + * for the diagnostics routine qib_remap_ioaddr32(). + * There is also related code for VL15 buffers in qib_init_7322_variables(). + * The teardown code that unmaps is in qib_pcie_ddcleanup() + */ +int init_chip_wc_pat(struct qib_devdata *dd, u32 vl15buflen) +{ + u64 __iomem *qib_kregbase = NULL; + void __iomem *qib_piobase = NULL; + u64 __iomem *qib_userbase = NULL; + u64 qib_kreglen; + u64 qib_pio2koffset = dd->piobufbase & 0xffffffff; + u64 qib_pio4koffset = dd->piobufbase >> 32; + u64 qib_pio2klen = dd->piobcnt2k * dd->palign; + u64 qib_pio4klen = dd->piobcnt4k * dd->align4k; + u64 qib_physaddr = dd->physaddr; + u64 qib_piolen; + u64 qib_userlen = 0; + + /* + * Free the old mapping because the kernel will try to reuse the + * old mapping and not create a new mapping with the + * write combining attribute. 
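+ *
+ * Hence the order used below (a sketch of the intent, not a hard rule
+ * of any one architecture): iounmap() the original uncached mapping
+ * first, then re-map the register range (and, if needed, the user
+ * registers) with ioremap_nocache() and only the PIO buffer range with
+ * ioremap_wc(), since one physical range generally cannot carry both
+ * uncached and write-combining attributes at the same time.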
+ */ + iounmap(dd->kregbase); + dd->kregbase = NULL; + + /* + * Assumes chip address space looks like: + * - kregs + sregs + cregs + uregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * or: + * - kregs + sregs + cregs (in any order) + * - piobufs (2K and 4K bufs in either order) + * - uregs + */ + if (dd->piobcnt4k == 0) { + qib_kreglen = qib_pio2koffset; + qib_piolen = qib_pio2klen; + } else if (qib_pio2koffset < qib_pio4koffset) { + qib_kreglen = qib_pio2koffset; + qib_piolen = qib_pio4koffset + qib_pio4klen - qib_kreglen; + } else { + qib_kreglen = qib_pio4koffset; + qib_piolen = qib_pio2koffset + qib_pio2klen - qib_kreglen; + } + qib_piolen += vl15buflen; + /* Map just the configured ports (not all hw ports) */ + if (dd->uregbase > qib_kreglen) + qib_userlen = dd->ureg_align * dd->cfgctxts; + + /* Sanity checks passed, now create the new mappings */ + qib_kregbase = ioremap_nocache(qib_physaddr, qib_kreglen); + if (!qib_kregbase) + goto bail; + + qib_piobase = ioremap_wc(qib_physaddr + qib_kreglen, qib_piolen); + if (!qib_piobase) + goto bail_kregbase; + + if (qib_userlen) { + qib_userbase = ioremap_nocache(qib_physaddr + dd->uregbase, + qib_userlen); + if (!qib_userbase) + goto bail_piobase; + } + + dd->kregbase = qib_kregbase; + dd->kregend = (u64 __iomem *) + ((char __iomem *) qib_kregbase + qib_kreglen); + dd->piobase = qib_piobase; + dd->pio2kbase = (void __iomem *) + (((char __iomem *) dd->piobase) + + qib_pio2koffset - qib_kreglen); + if (dd->piobcnt4k) + dd->pio4kbase = (void __iomem *) + (((char __iomem *) dd->piobase) + + qib_pio4koffset - qib_kreglen); + if (qib_userlen) + /* ureg will now be accessed relative to dd->userbase */ + dd->userbase = qib_userbase; + return 0; + +bail_piobase: + iounmap(qib_piobase); +bail_kregbase: + iounmap(qib_kregbase); +bail: + return -ENOMEM; +} diff --git a/drivers/infiniband/hw/qib/qib_intr.c b/drivers/infiniband/hw/qib/qib_intr.c new file mode 100644 index 0000000..65c3b96 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_intr.c @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "qib.h" +#include "qib_common.h" + +/** + * qib_format_hwmsg - format a single hwerror message + * @msg message buffer + * @msgl length of message buffer + * @hwmsg message to add to message buffer + */ +static void qib_format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * qib_format_hwerrors - format hardware error messages for display + * @hwerrs hardware errors bit vector + * @hwerrmsgs hardware error descriptions + * @nhwerrmsgs number of hwerrmsgs + * @msg message buffer + * @msgl message buffer length + */ +void qib_format_hwerrors(u64 hwerrs, const struct qib_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t msgl) +{ + int i; + + for (i = 0; i < nhwerrmsgs; i++) + if (hwerrs & hwerrmsgs[i].mask) + qib_format_hwmsg(msg, msgl, hwerrmsgs[i].msg); +} + +static void signal_ib_event(struct qib_pportdata *ppd, enum ib_event_type ev) +{ + struct ib_event event; + struct qib_devdata *dd = ppd->dd; + + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = ppd->port; + event.event = ev; + ib_dispatch_event(&event); +} + +void qib_handle_e_ibstatuschanged(struct qib_pportdata *ppd, u64 ibcs) +{ + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + u32 lstate; + u8 ltstate; + enum ib_event_type ev = 0; + + lstate = dd->f_iblink_state(ibcs); /* linkstate */ + ltstate = dd->f_ibphys_portstate(ibcs); + + /* + * If linkstate transitions into INIT from any of the various down + * states, or if it transitions from any of the up (INIT or better) + * states into any of the down states (except link recovery), then + * call the chip-specific code to take appropriate actions. + * + * ppd->lflags could be 0 if this is the first time the interrupt + * handlers has been called but the link is already up. 
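+ *
+ * Roughly, as the code below is laid out: a transition from a down
+ * state to INIT or better with the PHY in LINKUP is handed to
+ * f_ib_updown(ppd, 1, ibcs); a drop out of an up state is handed to
+ * f_ib_updown(ppd, 0, ibcs) and, unless the chip code handles it,
+ * flags _QIB_EVENT_LINKDOWN_BIT for user contexts; everything else
+ * just updates lflags and the status word and dispatches IB events.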
+ */ + if (lstate >= IB_PORT_INIT && + (!ppd->lflags || (ppd->lflags & QIBL_LINKDOWN)) && + ltstate == IB_PHYSPORTSTATE_LINKUP) { + /* transitioned to UP */ + if (dd->f_ib_updown(ppd, 1, ibcs)) + goto skip_ibchange; /* chip-code handled */ + } else if (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE | QIBL_IB_FORCE_NOTIFY)) { + if (ltstate != IB_PHYSPORTSTATE_LINKUP && + ltstate <= IB_PHYSPORTSTATE_CFG_TRAIN && + dd->f_ib_updown(ppd, 0, ibcs)) + goto skip_ibchange; /* chip-code handled */ + qib_set_uevent_bits(ppd, _QIB_EVENT_LINKDOWN_BIT); + } + + if (lstate != IB_PORT_DOWN) { + /* lstate is INIT, ARMED, or ACTIVE */ + if (lstate != IB_PORT_ACTIVE) { + *ppd->statusp &= ~QIB_STATUS_IB_READY; + if (ppd->lflags & QIBL_LINKACTIVE) + ev = IB_EVENT_PORT_ERR; + spin_lock_irqsave(&ppd->lflags_lock, flags); + if (lstate == IB_PORT_ARMED) { + ppd->lflags |= QIBL_LINKARMED | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKINIT | + QIBL_LINKDOWN | QIBL_LINKACTIVE); + } else { + ppd->lflags |= QIBL_LINKINIT | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKARMED | + QIBL_LINKDOWN | QIBL_LINKACTIVE); + } + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + /* start a 75msec timer to clear symbol errors */ + mod_timer(&ppd->symerr_clear_timer, + msecs_to_jiffies(75)); + } else if (ltstate == IB_PHYSPORTSTATE_LINKUP && + !(ppd->lflags & QIBL_LINKACTIVE)) { + /* active, but not active defered */ + qib_hol_up(ppd); /* useful only for 6120 now */ + *ppd->statusp |= + QIB_STATUS_IB_READY | QIB_STATUS_IB_CONF; + qib_clear_symerror_on_linkup(&ppd->symerr_clear_timer); + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_LINKACTIVE | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKINIT | + QIBL_LINKDOWN | QIBL_LINKARMED); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + if (dd->flags & QIB_HAS_SEND_DMA) + qib_sdma_process_event(ppd, + qib_sdma_event_e30_go_running); + ev = IB_EVENT_PORT_ACTIVE; + dd->f_setextled(ppd, 1); + } + } else { /* down */ + if (ppd->lflags & QIBL_LINKACTIVE) + ev = IB_EVENT_PORT_ERR; + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags |= QIBL_LINKDOWN | QIBL_LINKV; + ppd->lflags &= ~(QIBL_LINKINIT | + QIBL_LINKACTIVE | QIBL_LINKARMED); + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + *ppd->statusp &= ~QIB_STATUS_IB_READY; + } + +skip_ibchange: + ppd->lastibcstat = ibcs; + if (ev) + signal_ib_event(ppd, ev); +} + +void qib_clear_symerror_on_linkup(struct timer_list *t) +{ + struct qib_pportdata *ppd = from_timer(ppd, t, symerr_clear_timer); + + if (ppd->lflags & QIBL_LINKACTIVE) + return; + + ppd->ibport_data.z_symbol_error_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBSYMBOLERR); +} + +/* + * Handle receive interrupts for user ctxts; this means a user + * process was waiting for a packet to arrive, and didn't want + * to poll. 
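+ *
+ * Two wait flags are serviced below: QIB_CTXT_WAITING_RCV wakes the
+ * sleeper and turns the context's receive-available interrupt back
+ * off, while QIB_CTXT_WAITING_URG just bumps the urgent count and
+ * wakes the sleeper without touching rcvctrl.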
+ */ +void qib_handle_urcv(struct qib_devdata *dd, u64 ctxtr) +{ + struct qib_ctxtdata *rcd; + unsigned long flags; + int i; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (i = dd->first_user_ctxt; dd->rcd && i < dd->cfgctxts; i++) { + if (!(ctxtr & (1ULL << i))) + continue; + rcd = dd->rcd[i]; + if (!rcd || !rcd->cnt) + continue; + + if (test_and_clear_bit(QIB_CTXT_WAITING_RCV, &rcd->flag)) { + wake_up_interruptible(&rcd->wait); + dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_DIS, + rcd->ctxt); + } else if (test_and_clear_bit(QIB_CTXT_WAITING_URG, + &rcd->flag)) { + rcd->urgent++; + wake_up_interruptible(&rcd->wait); + } + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); +} + +void qib_bad_intrstatus(struct qib_devdata *dd) +{ + static int allbits; + + /* separate routine, for better optimization of qib_intr() */ + + /* + * We print the message and disable interrupts, in hope of + * having a better chance of debugging the problem. + */ + qib_dev_err(dd, + "Read of chip interrupt status failed disabling interrupts\n"); + if (allbits++) { + /* disable interrupt delivery, something is very wrong */ + if (allbits == 2) + dd->f_set_intr_state(dd, 0); + if (allbits == 3) { + qib_dev_err(dd, + "2nd bad interrupt status, unregistering interrupts\n"); + dd->flags |= QIB_BADINTR; + dd->flags &= ~QIB_INITTED; + dd->f_free_irq(dd); + } + } +} diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c new file mode 100644 index 0000000..4845d00 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -0,0 +1,2498 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" +#include "qib_mad.h" + +static int reply(struct ib_smp *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. 
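+ *
+ * That framework handling presumably covers fixing up the directed
+ * route return path; all reply() and reply_failure() do themselves is
+ * flip the method to GET_RESP and, for the directed-route class, set
+ * the direction bit in the status field.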
+ */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static int reply_failure(struct ib_smp *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_FAILURE | IB_MAD_RESULT_REPLY; +} + +static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len) +{ + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent; + struct ib_smp *smp; + int ret; + unsigned long flags; + unsigned long timeout; + + agent = ibp->rvp.send_agent; + if (!agent) + return; + + /* o14-3.2.1 */ + if (!(ppd_from_ibp(ibp)->lflags & QIBL_LINKACTIVE)) + return; + + /* o14-2 */ + if (ibp->rvp.trap_timeout && + time_before(jiffies, ibp->rvp.trap_timeout)) + return; + + send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR, + IB_MGMT_MAD_DATA, GFP_ATOMIC, + IB_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) + return; + + smp = send_buf->mad; + smp->base_version = IB_MGMT_BASE_VERSION; + smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + smp->class_version = 1; + smp->method = IB_MGMT_METHOD_TRAP; + ibp->rvp.tid++; + smp->tid = cpu_to_be64(ibp->rvp.tid); + smp->attr_id = IB_SMP_ATTR_NOTICE; + /* o14-1: smp->mkey = 0; */ + memcpy(smp->data, data, len); + + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (!ibp->rvp.sm_ah) { + if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { + struct ib_ah *ah; + + ah = qib_create_qp0_ah(ibp, (u16)ibp->rvp.sm_lid); + if (IS_ERR(ah)) + ret = PTR_ERR(ah); + else { + send_buf->ah = ah; + ibp->rvp.sm_ah = ibah_to_rvtah(ah); + ret = 0; + } + } else + ret = -EINVAL; + } else { + send_buf->ah = &ibp->rvp.sm_ah->ibah; + ret = 0; + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + if (!ret) + ret = ib_post_send_mad(send_buf, NULL); + if (!ret) { + /* 4.096 usec. */ + timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000; + ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout); + } else { + ib_free_send_mad(send_buf); + ibp->rvp.trap_timeout = 0; + } +} + +/* + * Send a bad P_Key trap (ch. 14.3.8). + */ +void qib_bad_pkey(struct qib_ibport *ibp, u32 key, u32 sl, + u32 qp1, u32 qp2, __be16 lid1, __be16 lid2) +{ + struct ib_mad_notice_attr data; + + ibp->rvp.n_pkt_drops++; + ibp->rvp.pkey_violations++; + + /* Send violation trap */ + data.generic_type = IB_NOTICE_TYPE_SECURITY; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_BAD_PKEY; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_257_258.lid1 = lid1; + data.details.ntc_257_258.lid2 = lid2; + data.details.ntc_257_258.key = cpu_to_be32(key); + data.details.ntc_257_258.sl_qp1 = cpu_to_be32((sl << 28) | qp1); + data.details.ntc_257_258.qp2 = cpu_to_be32(qp2); + + qib_send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a bad M_Key trap (ch. 14.3.9). 
+ */ +static void qib_bad_mkey(struct qib_ibport *ibp, struct ib_smp *smp) +{ + struct ib_mad_notice_attr data; + + /* Send violation trap */ + data.generic_type = IB_NOTICE_TYPE_SECURITY; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_BAD_MKEY; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_256.lid = data.issuer_lid; + data.details.ntc_256.method = smp->method; + data.details.ntc_256.attr_id = smp->attr_id; + data.details.ntc_256.attr_mod = smp->attr_mod; + data.details.ntc_256.mkey = smp->mkey; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + u8 hop_cnt; + + data.details.ntc_256.dr_slid = smp->dr_slid; + data.details.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; + hop_cnt = smp->hop_cnt; + if (hop_cnt > ARRAY_SIZE(data.details.ntc_256.dr_rtn_path)) { + data.details.ntc_256.dr_trunc_hop |= + IB_NOTICE_TRAP_DR_TRUNC; + hop_cnt = ARRAY_SIZE(data.details.ntc_256.dr_rtn_path); + } + data.details.ntc_256.dr_trunc_hop |= hop_cnt; + memcpy(data.details.ntc_256.dr_rtn_path, smp->return_path, + hop_cnt); + } + + qib_send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a Port Capability Mask Changed trap (ch. 14.3.11). + */ +void qib_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) +{ + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = dd_from_dev(ibdev); + struct qib_ibport *ibp = &dd->pport[port_num - 1].ibport_data; + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_144.lid = data.issuer_lid; + data.details.ntc_144.new_cap_mask = + cpu_to_be32(ibp->rvp.port_cap_flags); + qib_send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a System Image GUID Changed trap (ch. 14.3.12). + */ +void qib_sys_guid_chg(struct qib_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_SYS_GUID_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_145.lid = data.issuer_lid; + data.details.ntc_145.new_sys_guid = ib_qib_sys_image_guid; + + qib_send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a Node Description Changed trap (ch. 14.3.13). 
+ */ +void qib_node_desc_chg(struct qib_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_144.lid = data.issuer_lid; + data.details.ntc_144.local_changes = 1; + data.details.ntc_144.change_flags = IB_NOTICE_TRAP_NODE_DESC_CHG; + + qib_send_trap(ibp, &data, sizeof(data)); +} + +static int subn_get_nodedescription(struct ib_smp *smp, + struct ib_device *ibdev) +{ + if (smp->attr_mod) + smp->status |= IB_SMP_INVALID_FIELD; + + memcpy(smp->data, ibdev->node_desc, sizeof(smp->data)); + + return reply(smp); +} + +static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct ib_node_info *nip = (struct ib_node_info *)&smp->data; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + u32 majrev, minrev; + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + /* GUID 0 is illegal */ + if (smp->attr_mod || pidx >= dd->num_pports || + dd->pport[pidx].guid == 0) + smp->status |= IB_SMP_INVALID_FIELD; + else + nip->port_guid = dd->pport[pidx].guid; + + nip->base_version = 1; + nip->class_version = 1; + nip->node_type = 1; /* channel adapter */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = ib_qib_sys_image_guid; + nip->node_guid = dd->pport->guid; /* Use first-port GUID as node */ + nip->partition_cap = cpu_to_be16(qib_get_npkeys(dd)); + nip->device_id = cpu_to_be16(dd->deviceid); + majrev = dd->majrev; + minrev = dd->minrev; + nip->revision = cpu_to_be32((majrev << 16) | minrev); + nip->local_port_num = port; + nip->vendor_id[0] = QIB_SRC_OUI_1; + nip->vendor_id[1] = QIB_SRC_OUI_2; + nip->vendor_id[2] = QIB_SRC_OUI_3; + + return reply(smp); +} + +static int subn_get_guidinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + u32 startgx = 8 * be32_to_cpu(smp->attr_mod); + __be64 *p = (__be64 *) smp->data; + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + /* 32 blocks of 8 64-bit GUIDs per block */ + + memset(smp->data, 0, sizeof(smp->data)); + + if (startgx == 0 && pidx < dd->num_pports) { + struct qib_pportdata *ppd = dd->pport + pidx; + struct qib_ibport *ibp = &ppd->ibport_data; + __be64 g = ppd->guid; + unsigned i; + + /* GUID 0 is illegal */ + if (g == 0) + smp->status |= IB_SMP_INVALID_FIELD; + else { + /* The first is a copy of the read-only HW GUID. */ + p[0] = g; + for (i = 1; i < QIB_GUIDS_PER_PORT; i++) + p[i] = ibp->guids[i - 1]; + } + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static void set_link_width_enabled(struct qib_pportdata *ppd, u32 w) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LWID_ENB, w); +} + +static void set_link_speed_enabled(struct qib_pportdata *ppd, u32 s) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_SPD_ENB, s); +} + +static int get_overrunthreshold(struct qib_pportdata *ppd) +{ + return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH); +} + +/** + * set_overrunthreshold - set the overrun threshold + * @ppd: the physical port data + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. 
+ */ +static int set_overrunthreshold(struct qib_pportdata *ppd, unsigned n) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH, + (u32)n); + return 0; +} + +static int get_phyerrthreshold(struct qib_pportdata *ppd) +{ + return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PHYERR_THRESH); +} + +/** + * set_phyerrthreshold - set the physical error threshold + * @ppd: the physical port data + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +static int set_phyerrthreshold(struct qib_pportdata *ppd, unsigned n) +{ + (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PHYERR_THRESH, + (u32)n); + return 0; +} + +/** + * get_linkdowndefaultstate - get the default linkdown state + * @ppd: the physical port data + * + * Returns zero if the default is POLL, 1 if the default is SLEEP. + */ +static int get_linkdowndefaultstate(struct qib_pportdata *ppd) +{ + return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT) == + IB_LINKINITCMD_SLEEP; +} + +static int check_mkey(struct qib_ibport *ibp, struct ib_smp *smp, int mad_flags) +{ + int valid_mkey = 0; + int ret = 0; + + /* Is the mkey in the process of expiring? */ + if (ibp->rvp.mkey_lease_timeout && + time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) { + /* Clear timeout and mkey protection field. */ + ibp->rvp.mkey_lease_timeout = 0; + ibp->rvp.mkeyprot = 0; + } + + if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->rvp.mkey == 0 || + ibp->rvp.mkey == smp->mkey) + valid_mkey = 1; + + /* Unset lease timeout on any valid Get/Set/TrapRepress */ + if (valid_mkey && ibp->rvp.mkey_lease_timeout && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET || + smp->method == IB_MGMT_METHOD_TRAP_REPRESS)) + ibp->rvp.mkey_lease_timeout = 0; + + if (!valid_mkey) { + switch (smp->method) { + case IB_MGMT_METHOD_GET: + /* Bad mkey not a violation below level 2 */ + if (ibp->rvp.mkeyprot < 2) + break; + /* fall through */ + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (ibp->rvp.mkey_violations != 0xFFFF) + ++ibp->rvp.mkey_violations; + if (!ibp->rvp.mkey_lease_timeout && + ibp->rvp.mkey_lease_period) + ibp->rvp.mkey_lease_timeout = jiffies + + ibp->rvp.mkey_lease_period * HZ; + /* Generate a trap notice. */ + qib_bad_mkey(ibp, smp); + ret = 1; + } + } + + return ret; +} + +static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_devdata *dd; + struct qib_pportdata *ppd; + struct qib_ibport *ibp; + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + u8 mtu; + int ret; + u32 state; + u32 port_num = be32_to_cpu(smp->attr_mod); + + if (port_num == 0) + port_num = port; + else { + if (port_num > ibdev->phys_port_cnt) { + smp->status |= IB_SMP_INVALID_FIELD; + ret = reply(smp); + goto bail; + } + if (port_num != port) { + ibp = to_iport(ibdev, port_num); + ret = check_mkey(ibp, smp, 0); + if (ret) { + ret = IB_MAD_RESULT_FAILURE; + goto bail; + } + } + } + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hdw from 0 */ + ppd = dd->pport + (port_num - 1); + ibp = &ppd->ibport_data; + + /* Clear all fields. Only set the non-zero fields. */ + memset(smp->data, 0, sizeof(smp->data)); + + /* Only return the mkey if the protection field allows it. 
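+ *
+ * The test below withholds the key only for a SubnGet that carries a
+ * non-matching M_Key while the protection level is exactly 1; in every
+ * other case that reaches this point the key goes out in the reply.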
*/ + if (!(smp->method == IB_MGMT_METHOD_GET && + ibp->rvp.mkey != smp->mkey && + ibp->rvp.mkeyprot == 1)) + pip->mkey = ibp->rvp.mkey; + pip->gid_prefix = ibp->rvp.gid_prefix; + pip->lid = cpu_to_be16(ppd->lid); + pip->sm_lid = cpu_to_be16((u16)ibp->rvp.sm_lid); + pip->cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + /* pip->diag_code; */ + pip->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period); + pip->local_port_num = port; + pip->link_width_enabled = ppd->link_width_enabled; + pip->link_width_supported = ppd->link_width_supported; + pip->link_width_active = ppd->link_width_active; + state = dd->f_iblink_state(ppd->lastibcstat); + pip->linkspeed_portstate = ppd->link_speed_supported << 4 | state; + + pip->portphysstate_linkdown = + (dd->f_ibphys_portstate(ppd->lastibcstat) << 4) | + (get_linkdowndefaultstate(ppd) ? 1 : 2); + pip->mkeyprot_resv_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc; + pip->linkspeedactive_enabled = (ppd->link_speed_active << 4) | + ppd->link_speed_enabled; + switch (ppd->ibmtu) { + default: /* something is wrong; fall through */ + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + } + pip->neighbormtu_mastersmsl = (mtu << 4) | ibp->rvp.sm_sl; + pip->vlcap_inittype = ppd->vls_supported << 4; /* InitType = 0 */ + pip->vl_high_limit = ibp->rvp.vl_high_limit; + pip->vl_arb_high_cap = + dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_CAP); + pip->vl_arb_low_cap = + dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_LOW_CAP); + /* InitTypeReply = 0 */ + pip->inittypereply_mtucap = qib_ibmtu ? qib_ibmtu : IB_MTU_4096; + /* HCAs ignore VLStallCount and HOQLife */ + /* pip->vlstallcnt_hoqlife; */ + pip->operationalvl_pei_peo_fpi_fpo = + dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OP_VLS) << 4; + pip->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations); + /* P_KeyViolations are counted by hardware. */ + pip->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations); + pip->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations); + /* Only the hardware GUID is supported for now */ + pip->guid_cap = QIB_GUIDS_PER_PORT; + pip->clientrereg_resv_subnetto = ibp->rvp.subnet_timeout; + /* 32.768 usec. response time (guessing) */ + pip->resv_resptimevalue = 3; + pip->localphyerrors_overrunerrors = + (get_phyerrthreshold(ppd) << 4) | + get_overrunthreshold(ppd); + /* pip->max_credit_hint; */ + if (ibp->rvp.port_cap_flags & IB_PORT_LINK_LATENCY_SUP) { + u32 v; + + v = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKLATENCY); + pip->link_roundtrip_latency[0] = v >> 16; + pip->link_roundtrip_latency[1] = v >> 8; + pip->link_roundtrip_latency[2] = v; + } + + ret = reply(smp); + +bail: + return ret; +} + +/** + * get_pkeys - return the PKEY table + * @dd: the qlogic_ib device + * @port: the IB port number + * @pkeys: the pkey table is placed here + */ +static int get_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys) +{ + struct qib_pportdata *ppd = dd->pport + port - 1; + /* + * always a kernel context, no locking needed. + * If we get here with ppd setup, no need to check + * that pd is valid. 
+ */ + struct qib_ctxtdata *rcd = dd->rcd[ppd->hw_pidx]; + + memcpy(pkeys, rcd->pkeys, sizeof(rcd->pkeys)); + + return 0; +} + +static int subn_get_pkeytable(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + u16 *p = (u16 *) smp->data; + __be16 *q = (__be16 *) smp->data; + + /* 64 blocks of 32 16-bit P_Key entries */ + + memset(smp->data, 0, sizeof(smp->data)); + if (startpx == 0) { + struct qib_devdata *dd = dd_from_ibdev(ibdev); + unsigned i, n = qib_get_npkeys(dd); + + get_pkeys(dd, port, p); + + for (i = 0; i < n; i++) + q[i] = cpu_to_be16(p[i]); + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static int subn_set_guidinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_devdata *dd = dd_from_ibdev(ibdev); + u32 startgx = 8 * be32_to_cpu(smp->attr_mod); + __be64 *p = (__be64 *) smp->data; + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + /* 32 blocks of 8 64-bit GUIDs per block */ + + if (startgx == 0 && pidx < dd->num_pports) { + struct qib_pportdata *ppd = dd->pport + pidx; + struct qib_ibport *ibp = &ppd->ibport_data; + unsigned i; + + /* The first entry is read-only. */ + for (i = 1; i < QIB_GUIDS_PER_PORT; i++) + ibp->guids[i - 1] = p[i]; + } else + smp->status |= IB_SMP_INVALID_FIELD; + + /* The only GUID we support is the first read-only entry. */ + return subn_get_guidinfo(smp, ibdev, port); +} + +/** + * subn_set_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + * Set Portinfo (see ch. 14.2.5.6). + */ +static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + struct ib_event event; + struct qib_devdata *dd; + struct qib_pportdata *ppd; + struct qib_ibport *ibp; + u8 clientrereg = (pip->clientrereg_resv_subnetto & 0x80); + unsigned long flags; + u16 lid, smlid; + u8 lwe; + u8 lse; + u8 state; + u8 vls; + u8 msl; + u16 lstate; + int ret, ore, mtu; + u32 port_num = be32_to_cpu(smp->attr_mod); + + if (port_num == 0) + port_num = port; + else { + if (port_num > ibdev->phys_port_cnt) + goto err; + /* Port attributes can only be set on the receiving port */ + if (port_num != port) + goto get_only; + } + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hdw from 0 */ + ppd = dd->pport + (port_num - 1); + ibp = &ppd->ibport_data; + event.device = ibdev; + event.element.port_num = port; + + ibp->rvp.mkey = pip->mkey; + ibp->rvp.gid_prefix = pip->gid_prefix; + ibp->rvp.mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); + + lid = be16_to_cpu(pip->lid); + /* Must be a valid unicast LID address. */ + if (lid == 0 || lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) + smp->status |= IB_SMP_INVALID_FIELD; + else if (ppd->lid != lid || ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) { + if (ppd->lid != lid) + qib_set_uevent_bits(ppd, _QIB_EVENT_LID_CHANGE_BIT); + if (ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) + qib_set_uevent_bits(ppd, _QIB_EVENT_LMC_CHANGE_BIT); + qib_set_lid(ppd, lid, pip->mkeyprot_resv_lmc & 7); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + smlid = be16_to_cpu(pip->sm_lid); + msl = pip->neighbormtu_mastersmsl & 0xF; + /* Must be a valid unicast LID address. 
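+ * (a unicast LID lies between 1 and IB_MULTICAST_LID_BASE - 1, so zero
+ * and multicast values are flagged as invalid fields)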
*/ + if (smlid == 0 || smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) + smp->status |= IB_SMP_INVALID_FIELD; + else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) { + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (ibp->rvp.sm_ah) { + if (smlid != ibp->rvp.sm_lid) + rdma_ah_set_dlid(&ibp->rvp.sm_ah->attr, + smlid); + if (msl != ibp->rvp.sm_sl) + rdma_ah_set_sl(&ibp->rvp.sm_ah->attr, msl); + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + if (smlid != ibp->rvp.sm_lid) + ibp->rvp.sm_lid = smlid; + if (msl != ibp->rvp.sm_sl) + ibp->rvp.sm_sl = msl; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + /* Allow 1x or 4x to be set (see 14.2.6.6). */ + lwe = pip->link_width_enabled; + if (lwe) { + if (lwe == 0xFF) + set_link_width_enabled(ppd, ppd->link_width_supported); + else if (lwe >= 16 || (lwe & ~ppd->link_width_supported)) + smp->status |= IB_SMP_INVALID_FIELD; + else if (lwe != ppd->link_width_enabled) + set_link_width_enabled(ppd, lwe); + } + + lse = pip->linkspeedactive_enabled & 0xF; + if (lse) { + /* + * The IB 1.2 spec. only allows link speed values + * 1, 3, 5, 7, 15. 1.2.1 extended to allow specific + * speeds. + */ + if (lse == 15) + set_link_speed_enabled(ppd, + ppd->link_speed_supported); + else if (lse >= 8 || (lse & ~ppd->link_speed_supported)) + smp->status |= IB_SMP_INVALID_FIELD; + else if (lse != ppd->link_speed_enabled) + set_link_speed_enabled(ppd, lse); + } + + /* Set link down default state. */ + switch (pip->portphysstate_linkdown & 0xF) { + case 0: /* NOP */ + break; + case 1: /* SLEEP */ + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT, + IB_LINKINITCMD_SLEEP); + break; + case 2: /* POLL */ + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT, + IB_LINKINITCMD_POLL); + break; + default: + smp->status |= IB_SMP_INVALID_FIELD; + } + + ibp->rvp.mkeyprot = pip->mkeyprot_resv_lmc >> 6; + ibp->rvp.vl_high_limit = pip->vl_high_limit; + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_LIMIT, + ibp->rvp.vl_high_limit); + + mtu = ib_mtu_enum_to_int((pip->neighbormtu_mastersmsl >> 4) & 0xF); + if (mtu == -1) + smp->status |= IB_SMP_INVALID_FIELD; + else + qib_set_mtu(ppd, mtu); + + /* Set operational VLs */ + vls = (pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF; + if (vls) { + if (vls > ppd->vls_supported) + smp->status |= IB_SMP_INVALID_FIELD; + else + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OP_VLS, vls); + } + + if (pip->mkey_violations == 0) + ibp->rvp.mkey_violations = 0; + + if (pip->pkey_violations == 0) + ibp->rvp.pkey_violations = 0; + + if (pip->qkey_violations == 0) + ibp->rvp.qkey_violations = 0; + + ore = pip->localphyerrors_overrunerrors; + if (set_phyerrthreshold(ppd, (ore >> 4) & 0xF)) + smp->status |= IB_SMP_INVALID_FIELD; + + if (set_overrunthreshold(ppd, (ore & 0xF))) + smp->status |= IB_SMP_INVALID_FIELD; + + ibp->rvp.subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + state = pip->linkspeed_portstate & 0xF; + lstate = (pip->portphysstate_linkdown >> 4) & 0xF; + if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) + smp->status |= IB_SMP_INVALID_FIELD; + + /* + * Only state changes of DOWN, ARM, and ACTIVE are valid + * and must be in the correct state to take effect (see 7.2.6). 
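+ * For IB_PORT_DOWN the physical-state nibble picks how the link goes
+ * down: 0 keeps the link-down default (QIB_IB_LINKDOWN_ONLY), 1 maps to
+ * QIB_IB_LINKDOWN_SLEEP, 2 to QIB_IB_LINKDOWN, 3 to
+ * QIB_IB_LINKDOWN_DISABLE; anything else is an invalid field.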
+ */ + switch (state) { + case IB_PORT_NOP: + if (lstate == 0) + break; + /* FALLTHROUGH */ + case IB_PORT_DOWN: + if (lstate == 0) + lstate = QIB_IB_LINKDOWN_ONLY; + else if (lstate == 1) + lstate = QIB_IB_LINKDOWN_SLEEP; + else if (lstate == 2) + lstate = QIB_IB_LINKDOWN; + else if (lstate == 3) + lstate = QIB_IB_LINKDOWN_DISABLE; + else { + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + spin_lock_irqsave(&ppd->lflags_lock, flags); + ppd->lflags &= ~QIBL_LINKV; + spin_unlock_irqrestore(&ppd->lflags_lock, flags); + qib_set_linkstate(ppd, lstate); + /* + * Don't send a reply if the response would be sent + * through the disabled port. + */ + if (lstate == QIB_IB_LINKDOWN_DISABLE && smp->hop_cnt) { + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto done; + } + qib_wait_linkstate(ppd, QIBL_LINKV, 10); + break; + case IB_PORT_ARMED: + qib_set_linkstate(ppd, QIB_IB_LINKARM); + break; + case IB_PORT_ACTIVE: + qib_set_linkstate(ppd, QIB_IB_LINKACTIVE); + break; + default: + smp->status |= IB_SMP_INVALID_FIELD; + } + + if (clientrereg) { + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + /* restore re-reg bit per o14-12.2.1 */ + pip->clientrereg_resv_subnetto |= clientrereg; + + goto get_only; + +err: + smp->status |= IB_SMP_INVALID_FIELD; +get_only: + ret = subn_get_portinfo(smp, ibdev, port); +done: + return ret; +} + +/** + * rm_pkey - decrecment the reference count for the given PKEY + * @dd: the qlogic_ib device + * @key: the PKEY index + * + * Return true if this was the last reference and the hardware table entry + * needs to be changed. + */ +static int rm_pkey(struct qib_pportdata *ppd, u16 key) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (ppd->pkeys[i] != key) + continue; + if (atomic_dec_and_test(&ppd->pkeyrefs[i])) { + ppd->pkeys[i] = 0; + ret = 1; + goto bail; + } + break; + } + + ret = 0; + +bail: + return ret; +} + +/** + * add_pkey - add the given PKEY to the hardware table + * @dd: the qlogic_ib device + * @key: the PKEY + * + * Return an error code if unable to add the entry, zero if no change, + * or 1 if the hardware PKEY register needs to be updated. + */ +static int add_pkey(struct qib_pportdata *ppd, u16 key) +{ + int i; + u16 lkey = key & 0x7FFF; + int any = 0; + int ret; + + if (lkey == 0x7FFF) { + ret = 0; + goto bail; + } + + /* Look for an empty slot or a matching PKEY. */ + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i]) { + any++; + continue; + } + /* If it matches exactly, try to increment the ref count */ + if (ppd->pkeys[i] == key) { + if (atomic_inc_return(&ppd->pkeyrefs[i]) > 1) { + ret = 0; + goto bail; + } + /* Lost the race. Look for an empty slot below. */ + atomic_dec(&ppd->pkeyrefs[i]); + any++; + } + /* + * It makes no sense to have both the limited and unlimited + * PKEY set at the same time since the unlimited one will + * disable the limited one. + */ + if ((ppd->pkeys[i] & 0x7FFF) == lkey) { + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ret = -EBUSY; + goto bail; + } + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!ppd->pkeys[i] && + atomic_inc_return(&ppd->pkeyrefs[i]) == 1) { + /* for qibstats, etc. 
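+ * An empty slot was just claimed by raising its refcount from 0 to 1;
+ * record the full key value and return 1 so the caller updates the
+ * hardware P_Key register.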
*/ + ppd->pkeys[i] = key; + ret = 1; + goto bail; + } + } + ret = -EBUSY; + +bail: + return ret; +} + +/** + * set_pkeys - set the PKEY table for ctxt 0 + * @dd: the qlogic_ib device + * @port: the IB port number + * @pkeys: the PKEY table + */ +static int set_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys) +{ + struct qib_pportdata *ppd; + struct qib_ctxtdata *rcd; + int i; + int changed = 0; + + /* + * IB port one/two always maps to context zero/one, + * always a kernel context, no locking needed + * If we get here with ppd setup, no need to check + * that rcd is valid. + */ + ppd = dd->pport + (port - 1); + rcd = dd->rcd[ppd->hw_pidx]; + + for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = rcd->pkeys[i]; + + if (key == okey) + continue; + /* + * The value of this PKEY table entry is changing. + * Remove the old entry in the hardware's array of PKEYs. + */ + if (okey & 0x7FFF) + changed |= rm_pkey(ppd, okey); + if (key & 0x7FFF) { + int ret = add_pkey(ppd, key); + + if (ret < 0) + key = 0; + else + changed |= ret; + } + rcd->pkeys[i] = key; + } + if (changed) { + struct ib_event event; + + (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); + } + return 0; +} + +static int subn_set_pkeytable(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + __be16 *p = (__be16 *) smp->data; + u16 *q = (u16 *) smp->data; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + unsigned i, n = qib_get_npkeys(dd); + + for (i = 0; i < n; i++) + q[i] = be16_to_cpu(p[i]); + + if (startpx != 0 || set_pkeys(dd, port, q) != 0) + smp->status |= IB_SMP_INVALID_FIELD; + + return subn_get_pkeytable(smp, ibdev, port); +} + +static int subn_get_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + u8 *p = (u8 *) smp->data; + unsigned i; + + memset(smp->data, 0, sizeof(smp->data)); + + if (!(ibp->rvp.port_cap_flags & IB_PORT_SL_MAP_SUP)) + smp->status |= IB_SMP_UNSUP_METHOD; + else + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2) + *p++ = (ibp->sl_to_vl[i] << 4) | ibp->sl_to_vl[i + 1]; + + return reply(smp); +} + +static int subn_set_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + u8 *p = (u8 *) smp->data; + unsigned i; + + if (!(ibp->rvp.port_cap_flags & IB_PORT_SL_MAP_SUP)) { + smp->status |= IB_SMP_UNSUP_METHOD; + return reply(smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2, p++) { + ibp->sl_to_vl[i] = *p >> 4; + ibp->sl_to_vl[i + 1] = *p & 0xF; + } + qib_set_uevent_bits(ppd_from_ibp(to_iport(ibdev, port)), + _QIB_EVENT_SL2VL_CHANGE_BIT); + + return subn_get_sl_to_vl(smp, ibdev, port); +} + +static int subn_get_vl_arb(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + unsigned which = be32_to_cpu(smp->attr_mod) >> 16; + struct qib_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + + memset(smp->data, 0, sizeof(smp->data)); + + if (ppd->vls_supported == IB_VL_VL0) + smp->status |= IB_SMP_UNSUP_METHOD; + else if (which == IB_VLARB_LOWPRI_0_31) + (void) ppd->dd->f_get_ib_table(ppd, QIB_IB_TBL_VL_LOW_ARB, + smp->data); + else if (which == IB_VLARB_HIGHPRI_0_31) + (void) ppd->dd->f_get_ib_table(ppd, QIB_IB_TBL_VL_HIGH_ARB, + smp->data); + else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + 
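+/*
+ * subn_set_vl_arb() writes the requested table through
+ * ppd->dd->f_set_ib_table() and then replies by calling
+ * subn_get_vl_arb(), which reads the table back from the chip.
+ */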
+static int subn_set_vl_arb(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + unsigned which = be32_to_cpu(smp->attr_mod) >> 16; + struct qib_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + + if (ppd->vls_supported == IB_VL_VL0) + smp->status |= IB_SMP_UNSUP_METHOD; + else if (which == IB_VLARB_LOWPRI_0_31) + (void) ppd->dd->f_set_ib_table(ppd, QIB_IB_TBL_VL_LOW_ARB, + smp->data); + else if (which == IB_VLARB_HIGHPRI_0_31) + (void) ppd->dd->f_set_ib_table(ppd, QIB_IB_TBL_VL_HIGH_ARB, + smp->data); + else + smp->status |= IB_SMP_INVALID_FIELD; + + return subn_get_vl_arb(smp, ibdev, port); +} + +static int subn_trap_repress(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + /* + * For now, we only send the trap once so no need to process this. + * o13-6, o13-7, + * o14-3.a4 The SMA shall not send any message in response to a valid + * SubnTrapRepress() message. + */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; +} + +static int pma_get_classportinfo(struct ib_pma_mad *pmp, + struct ib_device *ibdev) +{ + struct ib_class_port_info *p = + (struct ib_class_port_info *)pmp->data; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + /* Note that AllPortSelect is not valid */ + p->base_version = 1; + p->class_version = 1; + p->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + /* + * Set the most significant bit of CM2 to indicate support for + * congestion statistics + */ + ib_set_cpi_capmask2(p, + dd->psxmitwait_supported << + (31 - IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE)); + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. + */ + ib_set_cpi_resp_time(p, 18); + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 port_select = p->port_select; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + spin_lock_irqsave(&ibp->rvp.lock, flags); + p->tick = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PMA_TICKS); + p->sample_status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + p->counter_width = 4; /* 32 bit counters */ + p->counter_mask0_9 = COUNTER_MASK0_9; + p->sample_start = cpu_to_be32(ibp->rvp.pma_sample_start); + p->sample_interval = cpu_to_be32(ibp->rvp.pma_sample_interval); + p->tag = cpu_to_be16(ibp->rvp.pma_tag); + p->counter_select[0] = ibp->rvp.pma_counter_select[0]; + p->counter_select[1] = ibp->rvp.pma_counter_select[1]; + p->counter_select[2] = ibp->rvp.pma_counter_select[2]; + p->counter_select[3] = ibp->rvp.pma_counter_select[3]; + p->counter_select[4] = ibp->rvp.pma_counter_select[4]; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + +bail: + return reply((struct ib_smp *) pmp); +} + +static int pma_set_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport 
*ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 status, xmit_flags; + int ret; + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + spin_lock_irqsave(&ibp->rvp.lock, flags); + + /* Port Sampling code owns the PS* HW counters */ + xmit_flags = ppd->cong_stats.flags; + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_SAMPLE; + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + if (status == IB_PMA_SAMPLE_STATUS_DONE || + (status == IB_PMA_SAMPLE_STATUS_RUNNING && + xmit_flags == IB_PMA_CONG_HW_CONTROL_TIMER)) { + ibp->rvp.pma_sample_start = be32_to_cpu(p->sample_start); + ibp->rvp.pma_sample_interval = be32_to_cpu(p->sample_interval); + ibp->rvp.pma_tag = be16_to_cpu(p->tag); + ibp->rvp.pma_counter_select[0] = p->counter_select[0]; + ibp->rvp.pma_counter_select[1] = p->counter_select[1]; + ibp->rvp.pma_counter_select[2] = p->counter_select[2]; + ibp->rvp.pma_counter_select[3] = p->counter_select[3]; + ibp->rvp.pma_counter_select[4] = p->counter_select[4]; + dd->f_set_cntr_sample(ppd, ibp->rvp.pma_sample_interval, + ibp->rvp.pma_sample_start); + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + ret = pma_get_portsamplescontrol(pmp, ibdev, port); + +bail: + return ret; +} + +static u64 get_counter(struct qib_ibport *ibp, struct qib_pportdata *ppd, + __be16 sel) +{ + u64 ret; + + switch (sel) { + case IB_PMA_PORT_XMIT_DATA: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITDATA); + break; + case IB_PMA_PORT_RCV_DATA: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSRCVDATA); + break; + case IB_PMA_PORT_XMIT_PKTS: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITPKTS); + break; + case IB_PMA_PORT_RCV_PKTS: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSRCVPKTS); + break; + case IB_PMA_PORT_XMIT_WAIT: + ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITWAIT); + break; + default: + ret = 0; + } + + return ret; +} + +/* This function assumes that the xmit_wait lock is already held */ +static u64 xmit_wait_get_value_delta(struct qib_pportdata *ppd) +{ + u32 delta; + + delta = get_counter(&ppd->ibport_data, ppd, + IB_PMA_PORT_XMIT_WAIT); + return ppd->cong_stats.counter + delta; +} + +static void cache_hw_sample_counters(struct qib_pportdata *ppd) +{ + struct qib_ibport *ibp = &ppd->ibport_data; + + ppd->cong_stats.counter_cache.psxmitdata = + get_counter(ibp, ppd, IB_PMA_PORT_XMIT_DATA); + ppd->cong_stats.counter_cache.psrcvdata = + get_counter(ibp, ppd, IB_PMA_PORT_RCV_DATA); + ppd->cong_stats.counter_cache.psxmitpkts = + get_counter(ibp, ppd, IB_PMA_PORT_XMIT_PKTS); + ppd->cong_stats.counter_cache.psrcvpkts = + get_counter(ibp, ppd, IB_PMA_PORT_RCV_PKTS); + ppd->cong_stats.counter_cache.psxmitwait = + get_counter(ibp, ppd, IB_PMA_PORT_XMIT_WAIT); +} + +static u64 get_cache_hw_sample_counters(struct qib_pportdata *ppd, + __be16 sel) +{ + u64 ret; + + switch (sel) { + case IB_PMA_PORT_XMIT_DATA: + ret = ppd->cong_stats.counter_cache.psxmitdata; + break; + case IB_PMA_PORT_RCV_DATA: + ret = ppd->cong_stats.counter_cache.psrcvdata; + break; + case IB_PMA_PORT_XMIT_PKTS: + ret = ppd->cong_stats.counter_cache.psxmitpkts; + break; + case IB_PMA_PORT_RCV_PKTS: + ret = ppd->cong_stats.counter_cache.psrcvpkts; + break; + case IB_PMA_PORT_XMIT_WAIT: + ret = ppd->cong_stats.counter_cache.psxmitwait; + break; + default: + ret = 0; + } + + return ret; +} + +static int pma_get_portsamplesresult(struct ib_pma_mad *pmp, + struct 
ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplesresult *p = + (struct ib_pma_portsamplesresult *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 status; + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + spin_lock_irqsave(&ibp->rvp.lock, flags); + p->tag = cpu_to_be16(ibp->rvp.pma_tag); + if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER) + p->sample_status = IB_PMA_SAMPLE_STATUS_DONE; + else { + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + p->sample_status = cpu_to_be16(status); + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + cache_hw_sample_counters(ppd); + ppd->cong_stats.counter = + xmit_wait_get_value_delta(ppd); + dd->f_set_cntr_sample(ppd, + QIB_CONG_TIMER_PSINTERVAL, 0); + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; + } + } + for (i = 0; i < ARRAY_SIZE(ibp->rvp.pma_counter_select); i++) + p->counter[i] = cpu_to_be32( + get_cache_hw_sample_counters( + ppd, ibp->rvp.pma_counter_select[i])); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplesresult_ext *p = + (struct ib_pma_portsamplesresult_ext *)pmp->data; + struct qib_ibdev *dev = to_idev(ibdev); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + unsigned long flags; + u8 status; + int i; + + /* Port Sampling code owns the PS* HW counters */ + memset(pmp->data, 0, sizeof(pmp->data)); + spin_lock_irqsave(&ibp->rvp.lock, flags); + p->tag = cpu_to_be16(ibp->rvp.pma_tag); + if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER) + p->sample_status = IB_PMA_SAMPLE_STATUS_DONE; + else { + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + p->sample_status = cpu_to_be16(status); + /* 64 bits */ + p->extended_width = cpu_to_be32(0x80000000); + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + cache_hw_sample_counters(ppd); + ppd->cong_stats.counter = + xmit_wait_get_value_delta(ppd); + dd->f_set_cntr_sample(ppd, + QIB_CONG_TIMER_PSINTERVAL, 0); + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; + } + } + for (i = 0; i < ARRAY_SIZE(ibp->rvp.pma_counter_select); i++) + p->counter[i] = cpu_to_be64( + get_cache_hw_sample_counters( + ppd, ibp->rvp.pma_counter_select[i])); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_verbs_counters cntrs; + u8 port_select = p->port_select; + + qib_get_counters(ppd, &cntrs); + + /* Adjust counters for any resets done. 
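+ * The chip counters are never cleared in hardware; each ibp->z_* field
+ * holds the snapshot taken by the last PortCounters Set, so subtracting
+ * it gives the count since that software "clear".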
*/ + cntrs.symbol_error_counter -= ibp->z_symbol_error_counter; + cntrs.link_error_recovery_counter -= + ibp->z_link_error_recovery_counter; + cntrs.link_downed_counter -= ibp->z_link_downed_counter; + cntrs.port_rcv_errors -= ibp->z_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= ibp->z_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= ibp->z_port_xmit_discards; + cntrs.port_xmit_data -= ibp->z_port_xmit_data; + cntrs.port_rcv_data -= ibp->z_port_rcv_data; + cntrs.port_xmit_packets -= ibp->z_port_xmit_packets; + cntrs.port_rcv_packets -= ibp->z_port_rcv_packets; + cntrs.local_link_integrity_errors -= + ibp->z_local_link_integrity_errors; + cntrs.excessive_buffer_overrun_errors -= + ibp->z_excessive_buffer_overrun_errors; + cntrs.vl15_dropped -= ibp->z_vl15_dropped; + cntrs.vl15_dropped += ibp->rvp.n_vl15_dropped; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || port_select != port) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + if (cntrs.symbol_error_counter > 0xFFFFUL) + p->symbol_error_counter = cpu_to_be16(0xFFFF); + else + p->symbol_error_counter = + cpu_to_be16((u16)cntrs.symbol_error_counter); + if (cntrs.link_error_recovery_counter > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = + (u8)cntrs.link_error_recovery_counter; + if (cntrs.link_downed_counter > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)cntrs.link_downed_counter; + if (cntrs.port_rcv_errors > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = + cpu_to_be16((u16) cntrs.port_rcv_errors); + if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = + cpu_to_be16((u16)cntrs.port_rcv_remphys_errors); + if (cntrs.port_xmit_discards > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = + cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.local_link_integrity_errors > 0xFUL) + cntrs.local_link_integrity_errors = 0xFUL; + if (cntrs.excessive_buffer_overrun_errors > 0xFUL) + cntrs.excessive_buffer_overrun_errors = 0xFUL; + p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) | + cntrs.excessive_buffer_overrun_errors; + if (cntrs.vl15_dropped > 0xFFFFUL) + p->vl15_dropped = cpu_to_be16(0xFFFF); + else + p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped); + if (cntrs.port_xmit_data > 0xFFFFFFFFUL) + p->port_xmit_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data); + if (cntrs.port_rcv_data > 0xFFFFFFFFUL) + p->port_rcv_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data); + if (cntrs.port_xmit_packets > 0xFFFFFFFFUL) + p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_packets = + cpu_to_be32((u32)cntrs.port_xmit_packets); + if (cntrs.port_rcv_packets > 0xFFFFFFFFUL) + p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_packets = + cpu_to_be32((u32) cntrs.port_rcv_packets); + + return reply((struct ib_smp *) pmp); +} + +static int pma_get_portcounters_cong(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + /* Congestion PMA packets start at offset 24 not 64 */ + struct ib_pma_portcounters_cong *p = + (struct ib_pma_portcounters_cong *)pmp->reserved; + struct qib_verbs_counters cntrs; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = 
ppd_from_ibp(ibp); + struct qib_devdata *dd = dd_from_ppd(ppd); + u32 port_select = be32_to_cpu(pmp->mad_hdr.attr_mod) & 0xFF; + u64 xmit_wait_counter; + unsigned long flags; + + /* + * This check is performed only in the GET method because the + * SET method ends up calling this anyway. + */ + if (!dd->psxmitwait_supported) + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + if (port_select != port) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + qib_get_counters(ppd, &cntrs); + spin_lock_irqsave(&ppd->ibport_data.rvp.lock, flags); + xmit_wait_counter = xmit_wait_get_value_delta(ppd); + spin_unlock_irqrestore(&ppd->ibport_data.rvp.lock, flags); + + /* Adjust counters for any resets done. */ + cntrs.symbol_error_counter -= ibp->z_symbol_error_counter; + cntrs.link_error_recovery_counter -= + ibp->z_link_error_recovery_counter; + cntrs.link_downed_counter -= ibp->z_link_downed_counter; + cntrs.port_rcv_errors -= ibp->z_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= + ibp->z_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= ibp->z_port_xmit_discards; + cntrs.local_link_integrity_errors -= + ibp->z_local_link_integrity_errors; + cntrs.excessive_buffer_overrun_errors -= + ibp->z_excessive_buffer_overrun_errors; + cntrs.vl15_dropped -= ibp->z_vl15_dropped; + cntrs.vl15_dropped += ibp->rvp.n_vl15_dropped; + cntrs.port_xmit_data -= ibp->z_port_xmit_data; + cntrs.port_rcv_data -= ibp->z_port_rcv_data; + cntrs.port_xmit_packets -= ibp->z_port_xmit_packets; + cntrs.port_rcv_packets -= ibp->z_port_rcv_packets; + + memset(pmp->reserved, 0, sizeof(pmp->reserved)); + memset(pmp->data, 0, sizeof(pmp->data)); + + /* + * Set top 3 bits to indicate interval in picoseconds in + * remaining bits. + */ + p->port_check_rate = + cpu_to_be16((QIB_XMIT_RATE_PICO << 13) | + (dd->psxmitwait_check_rate & + ~(QIB_XMIT_RATE_PICO << 13))); + p->port_adr_events = cpu_to_be64(0); + p->port_xmit_wait = cpu_to_be64(xmit_wait_counter); + p->port_xmit_data = cpu_to_be64(cntrs.port_xmit_data); + p->port_rcv_data = cpu_to_be64(cntrs.port_rcv_data); + p->port_xmit_packets = + cpu_to_be64(cntrs.port_xmit_packets); + p->port_rcv_packets = + cpu_to_be64(cntrs.port_rcv_packets); + if (cntrs.symbol_error_counter > 0xFFFFUL) + p->symbol_error_counter = cpu_to_be16(0xFFFF); + else + p->symbol_error_counter = + cpu_to_be16( + (u16)cntrs.symbol_error_counter); + if (cntrs.link_error_recovery_counter > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = + (u8)cntrs.link_error_recovery_counter; + if (cntrs.link_downed_counter > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = + (u8)cntrs.link_downed_counter; + if (cntrs.port_rcv_errors > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = + cpu_to_be16((u16) cntrs.port_rcv_errors); + if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = + cpu_to_be16( + (u16)cntrs.port_rcv_remphys_errors); + if (cntrs.port_xmit_discards > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = + cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.local_link_integrity_errors > 0xFUL) + cntrs.local_link_integrity_errors = 0xFUL; + if (cntrs.excessive_buffer_overrun_errors > 0xFUL) + cntrs.excessive_buffer_overrun_errors = 0xFUL; + p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) | + cntrs.excessive_buffer_overrun_errors; + if (cntrs.vl15_dropped > 0xFFFFUL) + 
p->vl15_dropped = cpu_to_be16(0xFFFF); + else + p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped); + + return reply((struct ib_smp *)pmp); +} + +static void qib_snapshot_pmacounters( + struct qib_ibport *ibp, + struct qib_pma_counters *pmacounters) +{ + struct qib_pma_counters *p; + int cpu; + + memset(pmacounters, 0, sizeof(*pmacounters)); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(ibp->pmastats, cpu); + pmacounters->n_unicast_xmit += p->n_unicast_xmit; + pmacounters->n_unicast_rcv += p->n_unicast_rcv; + pmacounters->n_multicast_xmit += p->n_multicast_xmit; + pmacounters->n_multicast_rcv += p->n_multicast_rcv; + } +} + +static int pma_get_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters_ext *p = + (struct ib_pma_portcounters_ext *)pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u64 swords, rwords, spkts, rpkts, xwait; + struct qib_pma_counters pma; + u8 port_select = p->port_select; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait); + + /* Adjust counters for any resets done. */ + swords -= ibp->z_port_xmit_data; + rwords -= ibp->z_port_rcv_data; + spkts -= ibp->z_port_xmit_packets; + rpkts -= ibp->z_port_rcv_packets; + + p->port_xmit_data = cpu_to_be64(swords); + p->port_rcv_data = cpu_to_be64(rwords); + p->port_xmit_packets = cpu_to_be64(spkts); + p->port_rcv_packets = cpu_to_be64(rpkts); + + qib_snapshot_pmacounters(ibp, &pma); + + p->port_unicast_xmit_packets = cpu_to_be64(pma.n_unicast_xmit + - ibp->z_unicast_xmit); + p->port_unicast_rcv_packets = cpu_to_be64(pma.n_unicast_rcv + - ibp->z_unicast_rcv); + p->port_multicast_xmit_packets = cpu_to_be64(pma.n_multicast_xmit + - ibp->z_multicast_xmit); + p->port_multicast_rcv_packets = cpu_to_be64(pma.n_multicast_rcv + - ibp->z_multicast_rcv); + +bail: + return reply((struct ib_smp *) pmp); +} + +static int pma_set_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_verbs_counters cntrs; + + /* + * Since the HW doesn't support clearing counters, we save the + * current count and subtract it from future responses. 
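+ * Only the counters named in p->counter_select get a new baseline here;
+ * the rest keep the snapshot from their previous clear.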
+ */ + qib_get_counters(ppd, &cntrs); + + if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR) + ibp->z_symbol_error_counter = cntrs.symbol_error_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY) + ibp->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_DOWNED) + ibp->z_link_downed_counter = cntrs.link_downed_counter; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS) + ibp->z_port_rcv_errors = cntrs.port_rcv_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS) + ibp->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS) + ibp->z_port_xmit_discards = cntrs.port_xmit_discards; + + if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS) + ibp->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + + if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS) + ibp->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) { + ibp->rvp.n_vl15_dropped = 0; + ibp->z_vl15_dropped = cntrs.vl15_dropped; + } + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA) + ibp->z_port_xmit_data = cntrs.port_xmit_data; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA) + ibp->z_port_rcv_data = cntrs.port_rcv_data; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS) + ibp->z_port_xmit_packets = cntrs.port_xmit_packets; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS) + ibp->z_port_rcv_packets = cntrs.port_rcv_packets; + + return pma_get_portcounters(pmp, ibdev, port); +} + +static int pma_set_portcounters_cong(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = dd_from_ppd(ppd); + struct qib_verbs_counters cntrs; + u32 counter_select = (be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24) & 0xFF; + int ret = 0; + unsigned long flags; + + qib_get_counters(ppd, &cntrs); + /* Get counter values before we save them */ + ret = pma_get_portcounters_cong(pmp, ibdev, port); + + if (counter_select & IB_PMA_SEL_CONG_XMIT) { + spin_lock_irqsave(&ppd->ibport_data.rvp.lock, flags); + ppd->cong_stats.counter = 0; + dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL, + 0x0); + spin_unlock_irqrestore(&ppd->ibport_data.rvp.lock, flags); + } + if (counter_select & IB_PMA_SEL_CONG_PORT_DATA) { + ibp->z_port_xmit_data = cntrs.port_xmit_data; + ibp->z_port_rcv_data = cntrs.port_rcv_data; + ibp->z_port_xmit_packets = cntrs.port_xmit_packets; + ibp->z_port_rcv_packets = cntrs.port_rcv_packets; + } + if (counter_select & IB_PMA_SEL_CONG_ALL) { + ibp->z_symbol_error_counter = + cntrs.symbol_error_counter; + ibp->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + ibp->z_link_downed_counter = + cntrs.link_downed_counter; + ibp->z_port_rcv_errors = cntrs.port_rcv_errors; + ibp->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + ibp->z_port_xmit_discards = + cntrs.port_xmit_discards; + ibp->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + ibp->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + ibp->rvp.n_vl15_dropped = 0; + ibp->z_vl15_dropped = cntrs.vl15_dropped; + } + + return ret; +} + +static int pma_set_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters 
*) + pmp->data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u64 swords, rwords, spkts, rpkts, xwait; + struct qib_pma_counters pma; + + qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait); + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA) + ibp->z_port_xmit_data = swords; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA) + ibp->z_port_rcv_data = rwords; + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS) + ibp->z_port_xmit_packets = spkts; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS) + ibp->z_port_rcv_packets = rpkts; + + qib_snapshot_pmacounters(ibp, &pma); + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) + ibp->z_unicast_xmit = pma.n_unicast_xmit; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS) + ibp->z_unicast_rcv = pma.n_unicast_rcv; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS) + ibp->z_multicast_xmit = pma.n_multicast_xmit; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS) + ibp->z_multicast_rcv = pma.n_multicast_rcv; + + return pma_get_portcounters_ext(pmp, ibdev, port); +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u8 port, const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply(smp); + goto bail; + } + + ret = check_mkey(ibp, smp, mad_flags); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. 
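+ * The second check_mkey() call is made with mad_flags of 0 purely for
+ * its side effects on the other port (counting the violation and
+ * generating the bad-M_Key trap); its return value is discarded.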
+ */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void) check_mkey(to_iport(ibdev, port_num), smp, 0); + ret = IB_MAD_RESULT_FAILURE; + goto bail; + } + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = subn_get_nodedescription(smp, ibdev); + goto bail; + case IB_SMP_ATTR_NODE_INFO: + ret = subn_get_nodeinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_GUID_INFO: + ret = subn_get_guidinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = subn_get_portinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = subn_get_pkeytable(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SL_TO_VL_TABLE: + ret = subn_get_sl_to_vl(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = subn_get_vl_arb(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (ibp->rvp.port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (smp->attr_id) { + case IB_SMP_ATTR_GUID_INFO: + ret = subn_set_guidinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = subn_set_portinfo(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = subn_set_pkeytable(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SL_TO_VL_TABLE: + ret = subn_set_sl_to_vl(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = subn_set_vl_arb(smp, ibdev, port); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (ibp->rvp.port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_TRAP_REPRESS: + if (smp->attr_id == IB_SMP_ATTR_NOTICE) + ret = subn_trap_repress(smp, ibdev, port); + else { + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + } + goto bail; + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. 
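+ * (IB_MAD_RESULT_SUCCESS with neither IB_MAD_RESULT_REPLY nor
+ * IB_MAD_RESULT_CONSUMED set)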
+ */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + case IB_MGMT_METHOD_SEND: + if (ib_get_smp_direction(smp) && + smp->attr_id == QIB_VENDOR_IPG) { + ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PORT, + smp->data[0]); + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + } else + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply(smp); + } + +bail: + return ret; +} + +static int process_perf(struct ib_device *ibdev, u8 port, + const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; + int ret; + + *out_mad = *in_mad; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = pma_get_classportinfo(pmp, ibdev); + goto bail; + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = pma_get_portsamplescontrol(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT: + ret = pma_get_portsamplesresult(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT_EXT: + ret = pma_get_portsamplesresult_ext(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = pma_get_portcounters(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = pma_get_portcounters_ext(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_CONG: + ret = pma_get_portcounters_cong(pmp, ibdev, port); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = pma_set_portsamplescontrol(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = pma_set_portcounters(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = pma_set_portcounters_ext(pmp, ibdev, port); + goto bail; + case IB_PMA_PORT_COUNTERS_CONG: + ret = pma_set_portcounters_cong(pmp, ibdev, port); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_smp *) pmp); + } + +bail: + return ret; +} + +static int cc_get_classportinfo(struct ib_cc_mad *ccp, + struct ib_device *ibdev) +{ + struct ib_cc_classportinfo_attr *p = + (struct ib_cc_classportinfo_attr *)ccp->mgmt_data; + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + p->base_version = 1; + p->class_version = 1; + p->cap_mask = 0; + + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. 
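+ * Only the exponent (18) is stored; resp_time_value is a 5-bit field
+ * interpreted as 4.096 usec * 2^N.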
+ */ + p->resp_time_value = 18; + + return reply((struct ib_smp *) ccp); +} + +static int cc_get_congestion_info(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_info_attr *p = + (struct ib_cc_info_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + p->congestion_info = 0; + p->control_table_cap = ppd->cc_max_table_entries; + + return reply((struct ib_smp *) ccp); +} + +static int cc_get_congestion_setting(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + int i; + struct ib_cc_congestion_setting_attr *p = + (struct ib_cc_congestion_setting_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct ib_cc_congestion_entry_shadow *entries; + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + spin_lock(&ppd->cc_shadow_lock); + + entries = ppd->congestion_entries_shadow->entries; + p->port_control = cpu_to_be16( + ppd->congestion_entries_shadow->port_control); + p->control_map = cpu_to_be16( + ppd->congestion_entries_shadow->control_map); + for (i = 0; i < IB_CC_CCS_ENTRIES; i++) { + p->entries[i].ccti_increase = entries[i].ccti_increase; + p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer); + p->entries[i].trigger_threshold = entries[i].trigger_threshold; + p->entries[i].ccti_min = entries[i].ccti_min; + } + + spin_unlock(&ppd->cc_shadow_lock); + + return reply((struct ib_smp *) ccp); +} + +static int cc_get_congestion_control_table(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_table_attr *p = + (struct ib_cc_table_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u32 cct_block_index = be32_to_cpu(ccp->attr_mod); + u32 max_cct_block; + u32 cct_entry; + struct ib_cc_table_entry_shadow *entries; + int i; + + /* Is the table index more than what is supported? */ + if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1) + goto bail; + + memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data)); + + spin_lock(&ppd->cc_shadow_lock); + + max_cct_block = + (ppd->ccti_entries_shadow->ccti_last_entry + 1)/IB_CCT_ENTRIES; + max_cct_block = max_cct_block ? 
max_cct_block - 1 : 0; + + if (cct_block_index > max_cct_block) { + spin_unlock(&ppd->cc_shadow_lock); + goto bail; + } + + ccp->attr_mod = cpu_to_be32(cct_block_index); + + cct_entry = IB_CCT_ENTRIES * (cct_block_index + 1); + + cct_entry--; + + p->ccti_limit = cpu_to_be16(cct_entry); + + entries = &ppd->ccti_entries_shadow-> + entries[IB_CCT_ENTRIES * cct_block_index]; + cct_entry %= IB_CCT_ENTRIES; + + for (i = 0; i <= cct_entry; i++) + p->ccti_entries[i].entry = cpu_to_be16(entries[i].entry); + + spin_unlock(&ppd->cc_shadow_lock); + + return reply((struct ib_smp *) ccp); + +bail: + return reply_failure((struct ib_smp *) ccp); +} + +static int cc_set_congestion_setting(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_congestion_setting_attr *p = + (struct ib_cc_congestion_setting_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + int i; + + ppd->cc_sl_control_map = be16_to_cpu(p->control_map); + + for (i = 0; i < IB_CC_CCS_ENTRIES; i++) { + ppd->congestion_entries[i].ccti_increase = + p->entries[i].ccti_increase; + + ppd->congestion_entries[i].ccti_timer = + be16_to_cpu(p->entries[i].ccti_timer); + + ppd->congestion_entries[i].trigger_threshold = + p->entries[i].trigger_threshold; + + ppd->congestion_entries[i].ccti_min = + p->entries[i].ccti_min; + } + + return reply((struct ib_smp *) ccp); +} + +static int cc_set_congestion_control_table(struct ib_cc_mad *ccp, + struct ib_device *ibdev, u8 port) +{ + struct ib_cc_table_attr *p = + (struct ib_cc_table_attr *)ccp->mgmt_data; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u32 cct_block_index = be32_to_cpu(ccp->attr_mod); + u32 cct_entry; + struct ib_cc_table_entry_shadow *entries; + int i; + + /* Is the table index more than what is supported? */ + if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1) + goto bail; + + /* If this packet is the first in the sequence then + * zero the total table entry count. 
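+ * A ccti_limit below IB_CCT_ENTRIES (64) can only describe the first
+ * 64-entry block, so it marks the start of a new table transfer.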
+ */ + if (be16_to_cpu(p->ccti_limit) < IB_CCT_ENTRIES) + ppd->total_cct_entry = 0; + + cct_entry = (be16_to_cpu(p->ccti_limit))%IB_CCT_ENTRIES; + + /* ccti_limit is 0 to 63 */ + ppd->total_cct_entry += (cct_entry + 1); + + if (ppd->total_cct_entry > ppd->cc_supported_table_entries) + goto bail; + + ppd->ccti_limit = be16_to_cpu(p->ccti_limit); + + entries = ppd->ccti_entries + (IB_CCT_ENTRIES * cct_block_index); + + for (i = 0; i <= cct_entry; i++) + entries[i].entry = be16_to_cpu(p->ccti_entries[i].entry); + + spin_lock(&ppd->cc_shadow_lock); + + ppd->ccti_entries_shadow->ccti_last_entry = ppd->total_cct_entry - 1; + memcpy(ppd->ccti_entries_shadow->entries, ppd->ccti_entries, + (ppd->total_cct_entry * sizeof(struct ib_cc_table_entry))); + + ppd->congestion_entries_shadow->port_control = IB_CC_CCS_PC_SL_BASED; + ppd->congestion_entries_shadow->control_map = ppd->cc_sl_control_map; + memcpy(ppd->congestion_entries_shadow->entries, ppd->congestion_entries, + IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry)); + + spin_unlock(&ppd->cc_shadow_lock); + + return reply((struct ib_smp *) ccp); + +bail: + return reply_failure((struct ib_smp *) ccp); +} + +static int check_cc_key(struct qib_ibport *ibp, + struct ib_cc_mad *ccp, int mad_flags) +{ + return 0; +} + +static int process_cc(struct ib_device *ibdev, int mad_flags, + u8 port, const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_cc_mad *ccp = (struct ib_cc_mad *)out_mad; + struct qib_ibport *ibp = to_iport(ibdev, port); + int ret; + + *out_mad = *in_mad; + + if (ccp->class_version != 2) { + ccp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_smp *)ccp); + goto bail; + } + + ret = check_cc_key(ibp, ccp, mad_flags); + if (ret) + goto bail; + + switch (ccp->method) { + case IB_MGMT_METHOD_GET: + switch (ccp->attr_id) { + case IB_CC_ATTR_CLASSPORTINFO: + ret = cc_get_classportinfo(ccp, ibdev); + goto bail; + + case IB_CC_ATTR_CONGESTION_INFO: + ret = cc_get_congestion_info(ccp, ibdev, port); + goto bail; + + case IB_CC_ATTR_CA_CONGESTION_SETTING: + ret = cc_get_congestion_setting(ccp, ibdev, port); + goto bail; + + case IB_CC_ATTR_CONGESTION_CONTROL_TABLE: + ret = cc_get_congestion_control_table(ccp, ibdev, port); + goto bail; + + /* FALLTHROUGH */ + default: + ccp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) ccp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (ccp->attr_id) { + case IB_CC_ATTR_CA_CONGESTION_SETTING: + ret = cc_set_congestion_setting(ccp, ibdev, port); + goto bail; + + case IB_CC_ATTR_CONGESTION_CONTROL_TABLE: + ret = cc_set_congestion_control_table(ccp, ibdev, port); + goto bail; + + /* FALLTHROUGH */ + default: + ccp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) ccp); + goto bail; + } + + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. 
+ */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + case IB_MGMT_METHOD_TRAP: + default: + ccp->status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_smp *) ccp); + } + +bail: + return ret; +} + +/** + * qib_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + */ +int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + int ret; + struct qib_ibport *ibp = to_iport(ibdev, port); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad); + goto bail; + + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port, in_mad, out_mad); + goto bail; + + case IB_MGMT_CLASS_CONG_MGMT: + if (!ppd->congestion_entries_shadow || + !qib_cc_table_size) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + ret = process_cc(ibdev, mad_flags, port, in_mad, out_mad); + goto bail; + + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + return ret; +} + +static void xmit_wait_timer_func(struct timer_list *t) +{ + struct qib_pportdata *ppd = from_timer(ppd, t, cong_stats.timer); + struct qib_devdata *dd = dd_from_ppd(ppd); + unsigned long flags; + u8 status; + + spin_lock_irqsave(&ppd->ibport_data.rvp.lock, flags); + if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_SAMPLE) { + status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + /* save counter cache */ + cache_hw_sample_counters(ppd); + ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; + } else + goto done; + } + ppd->cong_stats.counter = xmit_wait_get_value_delta(ppd); + dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL, 0x0); +done: + spin_unlock_irqrestore(&ppd->ibport_data.rvp.lock, flags); + mod_timer(&ppd->cong_stats.timer, jiffies + HZ); +} + +void qib_notify_create_mad_agent(struct rvt_dev_info *rdi, int port_idx) +{ + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(ibdev, + struct qib_devdata, verbs_dev); + + /* Initialize xmit_wait structure */ + dd->pport[port_idx].cong_stats.counter = 0; + timer_setup(&dd->pport[port_idx].cong_stats.timer, + xmit_wait_timer_func, 0); + dd->pport[port_idx].cong_stats.timer.expires = 0; + add_timer(&dd->pport[port_idx].cong_stats.timer); +} + +void qib_notify_free_mad_agent(struct rvt_dev_info *rdi, int port_idx) +{ + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata 
*dd = container_of(ibdev, + struct qib_devdata, verbs_dev); + + if (dd->pport[port_idx].cong_stats.timer.function) + del_timer_sync(&dd->pport[port_idx].cong_stats.timer); + + if (dd->pport[port_idx].ibport_data.smi_ah) + rdma_destroy_ah(&dd->pport[port_idx].ibport_data.smi_ah->ibah); +} diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h new file mode 100644 index 0000000..57e99dc --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_mad.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _QIB_MAD_H +#define _QIB_MAD_H + +#include + +#define IB_SMP_UNSUP_VERSION \ +cpu_to_be16(IB_MGMT_MAD_STATUS_BAD_VERSION) + +#define IB_SMP_UNSUP_METHOD \ +cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD) + +#define IB_SMP_UNSUP_METH_ATTR \ +cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB) + +#define IB_SMP_INVALID_FIELD \ +cpu_to_be16(IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE) + +#define IB_VLARB_LOWPRI_0_31 1 +#define IB_VLARB_LOWPRI_32_63 2 +#define IB_VLARB_HIGHPRI_0_31 3 +#define IB_VLARB_HIGHPRI_32_63 4 + +#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00) + +struct ib_pma_portcounters_cong { + u8 reserved; + u8 reserved1; + __be16 port_check_rate; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved2; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved3; + __be16 vl15_dropped; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_xmit_wait; + __be64 port_adr_events; +} __packed; + +#define IB_PMA_CONG_HW_CONTROL_TIMER 0x00 +#define IB_PMA_CONG_HW_CONTROL_SAMPLE 0x01 + +#define QIB_XMIT_RATE_UNSUPPORTED 0x0 +#define QIB_XMIT_RATE_PICO 0x7 +/* number of 4nsec cycles equaling 2secs */ +#define QIB_CONG_TIMER_PSINTERVAL 0x1DCD64EC + +#define IB_PMA_SEL_CONG_ALL 0x01 +#define IB_PMA_SEL_CONG_PORT_DATA 0x02 +#define IB_PMA_SEL_CONG_XMIT 0x04 +#define IB_PMA_SEL_CONG_ROUTING 0x08 + +/* + * Congestion control class attributes + */ +#define IB_CC_ATTR_CLASSPORTINFO cpu_to_be16(0x0001) +#define IB_CC_ATTR_NOTICE cpu_to_be16(0x0002) +#define IB_CC_ATTR_CONGESTION_INFO cpu_to_be16(0x0011) +#define IB_CC_ATTR_CONGESTION_KEY_INFO cpu_to_be16(0x0012) +#define IB_CC_ATTR_CONGESTION_LOG cpu_to_be16(0x0013) +#define IB_CC_ATTR_SWITCH_CONGESTION_SETTING cpu_to_be16(0x0014) +#define IB_CC_ATTR_SWITCH_PORT_CONGESTION_SETTING cpu_to_be16(0x0015) +#define IB_CC_ATTR_CA_CONGESTION_SETTING cpu_to_be16(0x0016) +#define IB_CC_ATTR_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0017) +#define IB_CC_ATTR_TIME_STAMP cpu_to_be16(0x0018) + +/* generalizations for threshold values */ +#define IB_CC_THRESHOLD_NONE 0x0 +#define IB_CC_THRESHOLD_MIN 0x1 +#define IB_CC_THRESHOLD_MAX 0xf + +/* CCA MAD header constants */ +#define IB_CC_MAD_LOGDATA_LEN 32 +#define IB_CC_MAD_MGMTDATA_LEN 192 + +struct ib_cc_mad { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + __be16 class_specific; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 cckey; + + /* For CongestionLog attribute only */ + u8 log_data[IB_CC_MAD_LOGDATA_LEN]; + + u8 mgmt_data[IB_CC_MAD_MGMTDATA_LEN]; +} __packed; + +/* + * Congestion Control class portinfo capability mask bits + */ +#define IB_CC_CPI_CM_TRAP_GEN cpu_to_be16(1 << 0) +#define IB_CC_CPI_CM_GET_SET_NOTICE cpu_to_be16(1 << 1) +#define IB_CC_CPI_CM_CAP2 cpu_to_be16(1 << 2) +#define IB_CC_CPI_CM_ENHANCEDPORT0_CC cpu_to_be16(1 << 8) + +struct ib_cc_classportinfo_attr { + u8 base_version; + u8 class_version; + __be16 cap_mask; + u8 reserved[3]; + u8 resp_time_value; /* only lower 5 bits */ + union ib_gid redirect_gid; + __be32 redirect_tc_sl_fl; /* 8, 4, 20 bits respectively */ + __be16 redirect_lid; + __be16 redirect_pkey; + __be32 redirect_qp; /* only lower 24 bits */ + __be32 
redirect_qkey; + union ib_gid trap_gid; + __be32 trap_tc_sl_fl; /* 8, 4, 20 bits respectively */ + __be16 trap_lid; + __be16 trap_pkey; + __be32 trap_hl_qp; /* 8, 24 bits respectively */ + __be32 trap_qkey; +} __packed; + +/* Congestion control traps */ +#define IB_CC_TRAP_KEY_VIOLATION 0x0000 + +struct ib_cc_trap_key_violation_attr { + __be16 source_lid; + u8 method; + u8 reserved1; + __be16 attrib_id; + __be32 attrib_mod; + __be32 qp; + __be64 cckey; + u8 sgid[16]; + u8 padding[24]; +} __packed; + +/* Congestion info flags */ +#define IB_CC_CI_FLAGS_CREDIT_STARVATION 0x1 +#define IB_CC_TABLE_CAP_DEFAULT 31 + +struct ib_cc_info_attr { + __be16 congestion_info; + u8 control_table_cap; /* Multiple of 64 entry unit CCTs */ +} __packed; + +struct ib_cc_key_info_attr { + __be64 cckey; + u8 protect; + __be16 lease_period; + __be16 violations; +} __packed; + +#define IB_CC_CL_CA_LOGEVENTS_LEN 208 + +struct ib_cc_log_attr { + u8 log_type; + u8 congestion_flags; + __be16 threshold_event_counter; + __be16 threshold_congestion_event_map; + __be16 current_time_stamp; + u8 log_events[IB_CC_CL_CA_LOGEVENTS_LEN]; +} __packed; + +#define IB_CC_CLEC_SERVICETYPE_RC 0x0 +#define IB_CC_CLEC_SERVICETYPE_UC 0x1 +#define IB_CC_CLEC_SERVICETYPE_RD 0x2 +#define IB_CC_CLEC_SERVICETYPE_UD 0x3 + +struct ib_cc_log_event { + u8 local_qp_cn_entry; + u8 remote_qp_number_cn_entry[3]; + u8 sl_cn_entry:4; + u8 service_type_cn_entry:4; + __be32 remote_lid_cn_entry; + __be32 timestamp_cn_entry; +} __packed; + +/* Sixteen congestion entries */ +#define IB_CC_CCS_ENTRIES 16 + +/* Port control flags */ +#define IB_CC_CCS_PC_SL_BASED 0x01 + +struct ib_cc_congestion_entry { + u8 ccti_increase; + __be16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct ib_cc_congestion_entry_shadow { + u8 ccti_increase; + u16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct ib_cc_congestion_setting_attr { + __be16 port_control; + __be16 control_map; + struct ib_cc_congestion_entry entries[IB_CC_CCS_ENTRIES]; +} __packed; + +struct ib_cc_congestion_setting_attr_shadow { + u16 port_control; + u16 control_map; + struct ib_cc_congestion_entry_shadow entries[IB_CC_CCS_ENTRIES]; +} __packed; + +#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1 +#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1 + +/* 64 Congestion Control table entries in a single MAD */ +#define IB_CCT_ENTRIES 64 +#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2) + +struct ib_cc_table_entry { + __be16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_entry_shadow { + u16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_attr { + __be16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +struct ib_cc_table_attr_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +#define CC_TABLE_SHADOW_MAX \ + (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES) + +struct cc_table_shadow { + u16 ccti_last_entry; + struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX]; +} __packed; + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. 
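+ * COUNTER_MASK(q, n) below packs the 3-bit code q into the slot for
+ * counter n, counting down from bit 27: COUNTER_MASK(1, 0) is 1 << 27
+ * and COUNTER_MASK(1, 4) is 1 << 15, so COUNTER_MASK0_9 ORs a value of
+ * 1 into each of the first five slots.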
+ */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 \ + cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +#endif /* _QIB_MAD_H */ diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c new file mode 100644 index 0000000..5ac7b31 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2010 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2008, 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +/* + * This file contains PCIe utility routines that are common to the + * various QLogic InfiniPath adapters + */ + +/* + * Code to adjust PCIe capabilities. + * To minimize the change footprint, we call it + * from qib_pcie_params, which every chip-specific + * file calls, even though this violates some + * expectations of harmlessness. + */ +static void qib_tune_pcie_caps(struct qib_devdata *); +static void qib_tune_pcie_coalesce(struct qib_devdata *); + +/* + * Do all the common PCIe setup and initialization. + * devdata is not yet allocated, and is not allocated until after this + * routine returns success. Therefore qib_dev_err() can't be used for error + * printing. + */ +int qib_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret; + + ret = pci_enable_device(pdev); + if (ret) { + /* + * This can happen (in theory) iff: + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. 
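+ * (qib_pcie_ddinit() keeps its own copy of the BAR, saving both halves
+ * in dd->pcibar0/dd->pcibar1, so qib_pcie_reenable() can rewrite it
+ * after a chip reset.)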
+ */ + qib_early_err(&pdev->dev, "pci enable failed: error %d\n", + -ret); + goto done; + } + + ret = pci_request_regions(pdev, QIB_DRV_NAME); + if (ret) { + qib_devinfo(pdev, "pci_request_regions fails: err %d\n", -ret); + goto bail; + } + + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + /* + * If the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. + */ + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) { + qib_devinfo(pdev, "Unable to set DMA mask: %d\n", ret); + goto bail; + } + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + } else + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + qib_early_err(&pdev->dev, + "Unable to set DMA consistent mask: %d\n", ret); + goto bail; + } + + pci_set_master(pdev); + ret = pci_enable_pcie_error_reporting(pdev); + if (ret) { + qib_early_err(&pdev->dev, + "Unable to enable pcie error reporting: %d\n", + ret); + ret = 0; + } + goto done; + +bail: + pci_disable_device(pdev); + pci_release_regions(pdev); +done: + return ret; +} + +/* + * Do remaining PCIe setup, once dd is allocated, and save away + * fields required to re-initialize after a chip reset, or for + * various other purposes + */ +int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + unsigned long len; + resource_size_t addr; + + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + + dd->kregbase = ioremap_nocache(addr, len); + if (!dd->kregbase) + return -ENOMEM; + + dd->kregend = (u64 __iomem *)((void __iomem *) dd->kregbase + len); + dd->physaddr = addr; /* used for io_remap, etc. */ + + /* + * Save BARs to rewrite after device reset. Save all 64 bits of + * BAR, just in case. + */ + dd->pcibar0 = addr; + dd->pcibar1 = addr >> 32; + dd->deviceid = ent->device; /* save for later use */ + dd->vendorid = ent->vendor; + + return 0; +} + +/* + * Do PCIe cleanup, after chip-specific cleanup, etc. Just prior + * to releasing the dd memory. + * void because none of the core pcie cleanup returns are void + */ +void qib_pcie_ddcleanup(struct qib_devdata *dd) +{ + u64 __iomem *base = (void __iomem *) dd->kregbase; + + dd->kregbase = NULL; + iounmap(base); + if (dd->piobase) + iounmap(dd->piobase); + if (dd->userbase) + iounmap(dd->userbase); + if (dd->piovl15base) + iounmap(dd->piovl15base); + + pci_disable_device(dd->pcidev); + pci_release_regions(dd->pcidev); + + pci_set_drvdata(dd->pcidev, NULL); +} + +/** + * We save the msi lo and hi values, so we can restore them after + * chip reset (the kernel PCI infrastructure doesn't yet handle that + * correctly. + */ +static void qib_cache_msi_info(struct qib_devdata *dd, int pos) +{ + struct pci_dev *pdev = dd->pcidev; + u16 control; + + pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_LO, &dd->msi_lo); + pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_HI, &dd->msi_hi); + pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &control); + + /* now save the data (vector) info */ + pci_read_config_word(pdev, + pos + ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8), + &dd->msi_data); +} + +int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent) +{ + u16 linkstat, speed; + int nvec; + int maxvec; + unsigned int flags = PCI_IRQ_MSIX | PCI_IRQ_MSI; + + if (!pci_is_pcie(dd->pcidev)) { + qib_dev_err(dd, "Can't find PCI Express capability!\n"); + /* set up something... 
*/ + dd->lbus_width = 1; + dd->lbus_speed = 2500; /* Gen1, 2.5GHz */ + nvec = -1; + goto bail; + } + + if (dd->flags & QIB_HAS_INTX) + flags |= PCI_IRQ_LEGACY; + maxvec = (nent && *nent) ? *nent : 1; + nvec = pci_alloc_irq_vectors(dd->pcidev, 1, maxvec, flags); + if (nvec < 0) + goto bail; + + /* + * If nent exists, make sure to record how many vectors were allocated. + * If msix_enabled is false, return 0 so the fallback code works + * correctly. + */ + if (nent) + *nent = !dd->pcidev->msix_enabled ? 0 : nvec; + + if (dd->pcidev->msi_enabled) + qib_cache_msi_info(dd, dd->pcidev->msi_cap); + + pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat); + /* + * speed is bits 0-3, linkwidth is bits 4-8 + * no defines for them in headers + */ + speed = linkstat & 0xf; + linkstat >>= 4; + linkstat &= 0x1f; + dd->lbus_width = linkstat; + + switch (speed) { + case 1: + dd->lbus_speed = 2500; /* Gen1, 2.5GHz */ + break; + case 2: + dd->lbus_speed = 5000; /* Gen1, 5GHz */ + break; + default: /* not defined, assume gen1 */ + dd->lbus_speed = 2500; + break; + } + + /* + * Check against expected pcie width and complain if "wrong" + * on first initialization, not afterwards (i.e., reset). + */ + if (minw && linkstat < minw) + qib_dev_err(dd, + "PCIe width %u (x%u HCA), performance reduced\n", + linkstat, minw); + + qib_tune_pcie_caps(dd); + + qib_tune_pcie_coalesce(dd); + +bail: + /* fill in string, even on errors */ + snprintf(dd->lbus_info, sizeof(dd->lbus_info), + "PCIe,%uMHz,x%u\n", dd->lbus_speed, dd->lbus_width); + return nvec < 0 ? nvec : 0; +} + +/** + * qib_free_irq - Cleanup INTx and MSI interrupts + * @dd: valid pointer to qib dev data + * + * Since cleanup for INTx and MSI interrupts is trivial, have a common + * routine. + * + */ +void qib_free_irq(struct qib_devdata *dd) +{ + pci_free_irq(dd->pcidev, 0, dd); + pci_free_irq_vectors(dd->pcidev); +} + +/* + * Setup pcie interrupt stuff again after a reset. I'd like to just call + * pci_enable_msi() again for msi, but when I do that, + * the MSI enable bit doesn't get set in the command word, and + * we switch to to a different interrupt vector, which is confusing, + * so I instead just do it all inline. Perhaps somehow can tie this + * into the PCIe hotplug support at some point + */ +int qib_reinit_intr(struct qib_devdata *dd) +{ + int pos; + u16 control; + int ret = 0; + + /* If we aren't using MSI, don't restore it */ + if (!dd->msi_lo) + goto bail; + + pos = dd->pcidev->msi_cap; + if (!pos) { + qib_dev_err(dd, + "Can't find MSI capability, can't restore MSI settings\n"); + ret = 0; + /* nothing special for MSIx, just MSI */ + goto bail; + } + pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO, + dd->msi_lo); + pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI, + dd->msi_hi); + pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, &control); + if (!(control & PCI_MSI_FLAGS_ENABLE)) { + control |= PCI_MSI_FLAGS_ENABLE; + pci_write_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, + control); + } + /* now rewrite the data (vector) info */ + pci_write_config_word(dd->pcidev, pos + + ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8), + dd->msi_data); + ret = 1; +bail: + qib_free_irq(dd); + + if (!ret && (dd->flags & QIB_HAS_INTX)) + ret = 1; + + /* and now set the pci master bit again */ + pci_set_master(dd->pcidev); + + return ret; +} + +/* + * These two routines are helper routines for the device reset code + * to move all the pcie code out of the chip-specific driver code. 
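+ * qib_pcie_getcmd() snapshots PCI_COMMAND, the interrupt line and the
+ * cache line size before the reset; qib_pcie_reenable() rewrites both
+ * BARs from the values saved in qib_pcie_ddinit() and then restores
+ * those registers and re-enables the device.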
+ */ +void qib_pcie_getcmd(struct qib_devdata *dd, u16 *cmd, u8 *iline, u8 *cline) +{ + pci_read_config_word(dd->pcidev, PCI_COMMAND, cmd); + pci_read_config_byte(dd->pcidev, PCI_INTERRUPT_LINE, iline); + pci_read_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, cline); +} + +void qib_pcie_reenable(struct qib_devdata *dd, u16 cmd, u8 iline, u8 cline) +{ + int r; + + r = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, + dd->pcibar0); + if (r) + qib_dev_err(dd, "rewrite of BAR0 failed: %d\n", r); + r = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, + dd->pcibar1); + if (r) + qib_dev_err(dd, "rewrite of BAR1 failed: %d\n", r); + /* now re-enable memory access, and restore cosmetic settings */ + pci_write_config_word(dd->pcidev, PCI_COMMAND, cmd); + pci_write_config_byte(dd->pcidev, PCI_INTERRUPT_LINE, iline); + pci_write_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, cline); + r = pci_enable_device(dd->pcidev); + if (r) + qib_dev_err(dd, + "pci_enable_device failed after reset: %d\n", r); +} + + +static int qib_pcie_coalesce; +module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO); +MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets"); + +/* + * Enable PCIe completion and data coalescing, on Intel 5x00 and 7300 + * chipsets. This is known to be unsafe for some revisions of some + * of these chipsets, with some BIOS settings, and enabling it on those + * systems may result in the system crashing, and/or data corruption. + */ +static void qib_tune_pcie_coalesce(struct qib_devdata *dd) +{ + struct pci_dev *parent; + u16 devid; + u32 mask, bits, val; + + if (!qib_pcie_coalesce) + return; + + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; + if (parent->bus->parent) { + qib_devinfo(dd->pcidev, "Parent not root\n"); + return; + } + if (!pci_is_pcie(parent)) + return; + if (parent->vendor != 0x8086) + return; + + /* + * - bit 12: Max_rdcmp_Imt_EN: need to set to 1 + * - bit 11: COALESCE_FORCE: need to set to 0 + * - bit 10: COALESCE_EN: need to set to 1 + * (but limitations on some on some chipsets) + * + * On the Intel 5000, 5100, and 7300 chipsets, there is + * also: - bit 25:24: COALESCE_MODE, need to set to 0 + */ + devid = parent->device; + if (devid >= 0x25e2 && devid <= 0x25fa) { + /* 5000 P/V/X/Z */ + if (parent->revision <= 0xb2) + bits = 1U << 10; + else + bits = 7U << 10; + mask = (3U << 24) | (7U << 10); + } else if (devid >= 0x65e2 && devid <= 0x65fa) { + /* 5100 */ + bits = 1U << 10; + mask = (3U << 24) | (7U << 10); + } else if (devid >= 0x4021 && devid <= 0x402e) { + /* 5400 */ + bits = 7U << 10; + mask = 7U << 10; + } else if (devid >= 0x3604 && devid <= 0x360a) { + /* 7300 */ + bits = 7U << 10; + mask = (3U << 24) | (7U << 10); + } else { + /* not one of the chipsets that we know about */ + return; + } + pci_read_config_dword(parent, 0x48, &val); + val &= ~mask; + val |= bits; + pci_write_config_dword(parent, 0x48, val); +} + +/* + * BIOS may not set PCIe bus-utilization parameters for best performance. + * Check and optionally adjust them to maximize our throughput. 
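+ * The pcie_caps module parameter below encodes both limits: the low
+ * three bits cap MaxPayload and bits 4..6 cap MaxReadReq, each as a
+ * power of two times 128 bytes.  For example, pcie_caps=0x21 allows
+ * MaxPayload to be raised to 256 bytes (128 << 1) and MaxReadReq to
+ * 512 bytes (128 << 2); since the tuning only ever raises these values,
+ * the default of 0 leaves the firmware settings untouched.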
+ */ +static int qib_pcie_caps; +module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO); +MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); + +static void qib_tune_pcie_caps(struct qib_devdata *dd) +{ + struct pci_dev *parent; + u16 rc_mpss, rc_mps, ep_mpss, ep_mps; + u16 rc_mrrs, ep_mrrs, max_mrrs; + + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; + if (!pci_is_root_bus(parent->bus)) { + qib_devinfo(dd->pcidev, "Parent not root\n"); + return; + } + + if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) + return; + + rc_mpss = parent->pcie_mpss; + rc_mps = ffs(pcie_get_mps(parent)) - 8; + /* Find out supported and configured values for endpoint (us) */ + ep_mpss = dd->pcidev->pcie_mpss; + ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; + + /* Find max payload supported by root, endpoint */ + if (rc_mpss > ep_mpss) + rc_mpss = ep_mpss; + + /* If Supported greater than limit in module param, limit it */ + if (rc_mpss > (qib_pcie_caps & 7)) + rc_mpss = qib_pcie_caps & 7; + /* If less than (allowed, supported), bump root payload */ + if (rc_mpss > rc_mps) { + rc_mps = rc_mpss; + pcie_set_mps(parent, 128 << rc_mps); + } + /* If less than (allowed, supported), bump endpoint payload */ + if (rc_mpss > ep_mps) { + ep_mps = rc_mpss; + pcie_set_mps(dd->pcidev, 128 << ep_mps); + } + + /* + * Now the Read Request size. + * No field for max supported, but PCIe spec limits it to 4096, + * which is code '5' (log2(4096) - 7) + */ + max_mrrs = 5; + if (max_mrrs > ((qib_pcie_caps >> 4) & 7)) + max_mrrs = (qib_pcie_caps >> 4) & 7; + + max_mrrs = 128 << max_mrrs; + rc_mrrs = pcie_get_readrq(parent); + ep_mrrs = pcie_get_readrq(dd->pcidev); + + if (max_mrrs > rc_mrrs) { + rc_mrrs = max_mrrs; + pcie_set_readrq(parent, rc_mrrs); + } + if (max_mrrs > ep_mrrs) { + ep_mrrs = max_mrrs; + pcie_set_readrq(dd->pcidev, ep_mrrs); + } +} +/* End of PCIe capability tuning */ + +/* + * From here through qib_pci_err_handler definition is invoked via + * PCI error infrastructure, registered via pci + */ +static pci_ers_result_t +qib_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + switch (state) { + case pci_channel_io_normal: + qib_devinfo(pdev, "State Normal, ignoring\n"); + break; + + case pci_channel_io_frozen: + qib_devinfo(pdev, "State Frozen, requesting reset\n"); + pci_disable_device(pdev); + ret = PCI_ERS_RESULT_NEED_RESET; + break; + + case pci_channel_io_perm_failure: + qib_devinfo(pdev, "State Permanent Failure, disabling\n"); + if (dd) { + /* no more register accesses! 
*/ + dd->flags &= ~QIB_PRESENT; + qib_disable_after_error(dd); + } + /* else early, or other problem */ + ret = PCI_ERS_RESULT_DISCONNECT; + break; + + default: /* shouldn't happen */ + qib_devinfo(pdev, "QIB PCI errors detected (state %d)\n", + state); + break; + } + return ret; +} + +static pci_ers_result_t +qib_pci_mmio_enabled(struct pci_dev *pdev) +{ + u64 words = 0U; + struct qib_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + if (dd && dd->pport) { + words = dd->f_portcntr(dd->pport, QIBPORTCNTR_WORDRCV); + if (words == ~0ULL) + ret = PCI_ERS_RESULT_NEED_RESET; + } + qib_devinfo(pdev, + "QIB mmio_enabled function called, read wordscntr %Lx, returning %d\n", + words, ret); + return ret; +} + +static pci_ers_result_t +qib_pci_slot_reset(struct pci_dev *pdev) +{ + qib_devinfo(pdev, "QIB slot_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static void +qib_pci_resume(struct pci_dev *pdev) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + + qib_devinfo(pdev, "QIB resume function called\n"); + pci_cleanup_aer_uncorrect_error_status(pdev); + /* + * Running jobs will fail, since it's asynchronous + * unlike sysfs-requested reset. Better than + * doing nothing. + */ + qib_init(dd, 1); /* same as re-init after reset */ +} + +const struct pci_error_handlers qib_pci_err_handler = { + .error_detected = qib_pci_error_detected, + .mmio_enabled = qib_pci_mmio_enabled, + .slot_reset = qib_pci_slot_reset, + .resume = qib_pci_resume, +}; diff --git a/drivers/infiniband/hw/qib/qib_pio_copy.c b/drivers/infiniband/hw/qib/qib_pio_copy.c new file mode 100644 index 0000000..10b8c44 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_pio_copy.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "qib.h" + +/** + * qib_pio_copy - copy data to MMIO space, in multiples of 32-bits + * @to: destination, in MMIO space (must be 64-bit aligned) + * @from: source (must be 64-bit aligned) + * @count: number of 32-bit quantities to copy + * + * Copy data from kernel space to MMIO space, in multiples of 32 bits at a + * time. 
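+ * On 64-bit kernels the bulk of the copy is issued as 64-bit writes,
+ * with a single trailing 32-bit write when @count is odd; 32-bit
+ * kernels use plain 32-bit writes throughout.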
Order of access is not guaranteed, nor is a memory barrier + * performed afterwards. + */ +void qib_pio_copy(void __iomem *to, const void *from, size_t count) +{ +#ifdef CONFIG_64BIT + u64 __iomem *dst = to; + const u64 *src = from; + const u64 *end = src + (count >> 1); + + while (src < end) + __raw_writeq(*src++, dst++); + if (count & 1) + __raw_writel(*(const u32 *)src, dst); +#else + u32 __iomem *dst = to; + const u32 *src = from; + const u32 *end = src + count; + + while (src < end) + __raw_writel(*src++, dst++); +#endif +} diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c new file mode 100644 index 0000000..344e401 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. * All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#ifdef CONFIG_DEBUG_FS +#include +#endif + +#include "qib.h" + +static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, + struct rvt_qpn_map *map, unsigned off) +{ + return (map - qpt->map) * RVT_BITS_PER_PAGE + off; +} + +static inline unsigned find_next_offset(struct rvt_qpn_table *qpt, + struct rvt_qpn_map *map, unsigned off, + unsigned n, u16 qpt_mask) +{ + if (qpt_mask) { + off++; + if (((off & qpt_mask) >> 1) >= n) + off = (off | qpt_mask) + 2; + } else { + off = find_next_zero_bit(map->page, RVT_BITS_PER_PAGE, off); + } + return off; +} + +const struct rvt_operation_params qib_post_parms[RVT_OPERATION_MAX] = { +[IB_WR_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_RDMA_READ] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC, +}, + +[IB_WR_ATOMIC_CMP_AND_SWP] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_ATOMIC_FETCH_AND_ADD] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE, +}, + +[IB_WR_RDMA_WRITE_WITH_IMM] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +[IB_WR_SEND_WITH_IMM] = { + .length = sizeof(struct ib_send_wr), + .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) | + BIT(IB_QPT_UC) | BIT(IB_QPT_RC), +}, + +}; + +static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map) +{ + unsigned long page = get_zeroed_page(GFP_KERNEL); + + /* + * Free the page if someone raced with us installing it. + */ + + spin_lock(&qpt->lock); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock(&qpt->lock); +} + +/* + * Allocate the next available QPN or + * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI. + */ +int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, + enum ib_qp_type type, u8 port) +{ + u32 i, offset, max_scan, qpn; + struct rvt_qpn_map *map; + u32 ret; + struct qib_ibdev *verbs_dev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(verbs_dev, struct qib_devdata, + verbs_dev); + u16 qpt_mask = dd->qpn_mask; + + if (type == IB_QPT_SMI || type == IB_QPT_GSI) { + unsigned n; + + ret = type == IB_QPT_GSI; + n = 1 << (ret + 2 * (port - 1)); + spin_lock(&qpt->lock); + if (qpt->flags & n) + ret = -EINVAL; + else + qpt->flags |= n; + spin_unlock(&qpt->lock); + goto bail; + } + + qpn = qpt->last + 2; + if (qpn >= RVT_QPN_MAX) + qpn = 2; + if (qpt_mask && ((qpn & qpt_mask) >> 1) >= dd->n_krcv_queues) + qpn = (qpn | qpt_mask) + 2; + offset = qpn & RVT_BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / RVT_BITS_PER_PAGE]; + max_scan = qpt->nmaps - !offset; + for (i = 0;;) { + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) + break; + } + do { + if (!test_and_set_bit(offset, map->page)) { + qpt->last = qpn; + ret = qpn; + goto bail; + } + offset = find_next_offset(qpt, map, offset, + dd->n_krcv_queues, qpt_mask); + qpn = mk_qpn(qpt, map, offset); + /* + * This test differs from alloc_pidmap(). 
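+ * (QPNs 0 and 1 are reserved for the SMI and GSI QPs handled above,
+ * which is why the scan restarts at offset 2 whenever it wraps back
+ * to the first map page.)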
+ * If find_next_offset() does find a zero + * bit, we don't need to check for QPN + * wrapping around past our starting QPN. + * We just need to be sure we don't loop + * forever. + */ + } while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX); + /* + * In order to keep the number of pages allocated to a + * minimum, we scan the all existing pages before increasing + * the size of the bitmap table. + */ + if (++i > max_scan) { + if (qpt->nmaps == RVT_QPNMAP_ENTRIES) + break; + map = &qpt->map[qpt->nmaps++]; + offset = 0; + } else if (map < &qpt->map[qpt->nmaps]) { + ++map; + offset = 0; + } else { + map = &qpt->map[0]; + offset = 2; + } + qpn = mk_qpn(qpt, map, offset); + } + + ret = -ENOMEM; + +bail: + return ret; +} + +/** + * qib_free_all_qps - check for QPs still in use + */ +unsigned qib_free_all_qps(struct rvt_dev_info *rdi) +{ + struct qib_ibdev *verbs_dev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(verbs_dev, struct qib_devdata, + verbs_dev); + unsigned n, qp_inuse = 0; + + for (n = 0; n < dd->num_pports; n++) { + struct qib_ibport *ibp = &dd->pport[n].ibport_data; + + rcu_read_lock(); + if (rcu_dereference(ibp->rvp.qp[0])) + qp_inuse++; + if (rcu_dereference(ibp->rvp.qp[1])) + qp_inuse++; + rcu_read_unlock(); + } + return qp_inuse; +} + +void qib_notify_qp_reset(struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; + + atomic_set(&priv->s_dma_busy, 0); +} + +void qib_notify_error_qp(struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + + spin_lock(&dev->rdi.pending_lock); + if (!list_empty(&priv->iowait) && !(qp->s_flags & RVT_S_BUSY)) { + qp->s_flags &= ~RVT_S_ANY_WAIT_IO; + list_del_init(&priv->iowait); + } + spin_unlock(&dev->rdi.pending_lock); + + if (!(qp->s_flags & RVT_S_BUSY)) { + qp->s_hdrwords = 0; + if (qp->s_rdma_mr) { + rvt_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + if (priv->s_tx) { + qib_put_txreq(priv->s_tx); + priv->s_tx = NULL; + } + } +} + +static int mtu_to_enum(u32 mtu) +{ + int enum_mtu; + + switch (mtu) { + case 4096: + enum_mtu = IB_MTU_4096; + break; + case 2048: + enum_mtu = IB_MTU_2048; + break; + case 1024: + enum_mtu = IB_MTU_1024; + break; + case 512: + enum_mtu = IB_MTU_512; + break; + case 256: + enum_mtu = IB_MTU_256; + break; + default: + enum_mtu = IB_MTU_2048; + } + return enum_mtu; +} + +int qib_get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr) +{ + int mtu, pmtu, pidx = qp->port_num - 1; + struct qib_ibdev *verbs_dev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(verbs_dev, struct qib_devdata, + verbs_dev); + mtu = ib_mtu_enum_to_int(attr->path_mtu); + if (mtu == -1) + return -EINVAL; + + if (mtu > dd->pport[pidx].ibmtu) + pmtu = mtu_to_enum(dd->pport[pidx].ibmtu); + else + pmtu = attr->path_mtu; + return pmtu; +} + +int qib_mtu_to_path_mtu(u32 mtu) +{ + return mtu_to_enum(mtu); +} + +u32 qib_mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu) +{ + return ib_mtu_enum_to_int(pmtu); +} + +void *qib_qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct qib_qp_priv *priv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return ERR_PTR(-ENOMEM); + priv->owner = qp; + + priv->s_hdr = kzalloc(sizeof(*priv->s_hdr), GFP_KERNEL); + if (!priv->s_hdr) { + kfree(priv); + return ERR_PTR(-ENOMEM); + } + init_waitqueue_head(&priv->wait_dma); + INIT_WORK(&priv->s_work, _qib_do_send); + INIT_LIST_HEAD(&priv->iowait); + + 
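+ /* the QP layer (rdmavt) releases this via qib_qp_priv_free() */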
return priv; +} + +void qib_qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; + + kfree(priv->s_hdr); + kfree(priv); +} + +void qib_stop_send_queue(struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; + + cancel_work_sync(&priv->s_work); +} + +void qib_quiesce_qp(struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; + + wait_event(priv->wait_dma, !atomic_read(&priv->s_dma_busy)); + if (priv->s_tx) { + qib_put_txreq(priv->s_tx); + priv->s_tx = NULL; + } +} + +void qib_flush_qp_waiters(struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + + spin_lock(&dev->rdi.pending_lock); + if (!list_empty(&priv->iowait)) + list_del_init(&priv->iowait); + spin_unlock(&dev->rdi.pending_lock); +} + +/** + * qib_check_send_wqe - validate wr/wqe + * @qp - The qp + * @wqe - The built wqe + * + * validate wr/wqe. This is called + * prior to inserting the wqe into + * the ring but after the wqe has been + * setup. + * + * Returns 1 to force direct progress, 0 otherwise, -EINVAL on failure + */ +int qib_check_send_wqe(struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + struct rvt_ah *ah; + int ret = 0; + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + if (wqe->length > 0x80000000U) + return -EINVAL; + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + ah = ibah_to_rvtah(wqe->ud_wr.ah); + if (wqe->length > (1 << ah->log_pmtu)) + return -EINVAL; + /* progress hint */ + ret = 1; + break; + default: + break; + } + return ret; +} + +#ifdef CONFIG_DEBUG_FS + +static const char * const qp_type_str[] = { + "SMI", "GSI", "RC", "UC", "UD", +}; + +/** + * qib_qp_iter_print - print information to seq_file + * @s - the seq_file + * @iter - the iterator + */ +void qib_qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) +{ + struct rvt_swqe *wqe; + struct rvt_qp *qp = iter->qp; + struct qib_qp_priv *priv = qp->priv; + + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + seq_printf(s, + "N %d QP%u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x\n", + iter->n, + qp->ibqp.qp_num, + qp_type_str[qp->ibqp.qp_type], + qp->state, + wqe->wr.opcode, + qp->s_hdrwords, + qp->s_flags, + atomic_read(&priv->s_dma_busy), + !list_empty(&priv->iowait), + qp->timeout, + wqe->ssn, + qp->s_lsn, + qp->s_last_psn, + qp->s_psn, qp->s_next_psn, + qp->s_sending_psn, qp->s_sending_hpsn, + qp->s_last, qp->s_acked, qp->s_cur, + qp->s_tail, qp->s_head, qp->s_size, + qp->remote_qpn, + rdma_ah_get_dlid(&qp->remote_ah_attr)); +} + +#endif diff --git a/drivers/infiniband/hw/qib/qib_qsfp.c b/drivers/infiniband/hw/qib/qib_qsfp.c new file mode 100644 index 0000000..295d40a --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_qsfp.c @@ -0,0 +1,549 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib.h" +#include "qib_qsfp.h" + +/* + * QSFP support for ib_qib driver, using "Two Wire Serial Interface" driver + * in qib_twsi.c + */ +#define QSFP_MAX_RETRY 4 + +static int qsfp_read(struct qib_pportdata *ppd, int addr, void *bp, int len) +{ + struct qib_devdata *dd = ppd->dd; + u32 out, mask; + int ret, cnt, pass = 0; + int stuck = 0; + u8 *buff = bp; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (ret) + goto no_unlock; + + if (dd->twsi_eeprom_dev == QIB_TWSI_NO_DEV) { + ret = -ENXIO; + goto bail; + } + + /* + * We presume, if we are called at all, that this board has + * QSFP. This is on the same i2c chain as the legacy parts, + * but only responds if the module is selected via GPIO pins. + * Further, there are very long setup and hold requirements + * on MODSEL. + */ + mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + out = QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + if (ppd->hw_pidx) { + mask <<= QSFP_GPIO_PORT2_SHIFT; + out <<= QSFP_GPIO_PORT2_SHIFT; + } + + dd->f_gpio_mod(dd, out, mask, mask); + + /* + * Module could take up to 2 Msec to respond to MOD_SEL, and there + * is no way to tell if it is ready, so we must wait. + */ + msleep(20); + + /* Make sure TWSI bus is in sane state. */ + ret = qib_twsi_reset(dd); + if (ret) { + qib_dev_porterr(dd, ppd->port, + "QSFP interface Reset for read failed\n"); + ret = -EIO; + stuck = 1; + goto deselect; + } + + /* All QSFP modules are at A0 */ + + cnt = 0; + while (cnt < len) { + unsigned in_page; + int wlen = len - cnt; + + in_page = addr % QSFP_PAGESIZE; + if ((in_page + wlen) > QSFP_PAGESIZE) + wlen = QSFP_PAGESIZE - in_page; + ret = qib_twsi_blk_rd(dd, QSFP_DEV, addr, buff + cnt, wlen); + /* Some QSFP's fail first try. Retry as experiment */ + if (ret && cnt == 0 && ++pass < QSFP_MAX_RETRY) + continue; + if (ret) { + /* qib_twsi_blk_rd() 1 for error, else 0 */ + ret = -EIO; + goto deselect; + } + addr += wlen; + cnt += wlen; + } + ret = cnt; + +deselect: + /* + * Module could take up to 10 uSec after transfer before + * ready to respond to MOD_SEL negation, and there is no way + * to tell if it is ready, so we must wait. + */ + udelay(10); + /* set QSFP MODSEL, RST. 
LP all high */ + dd->f_gpio_mod(dd, mask, mask, mask); + + /* + * Module could take up to 2 Msec to respond to MOD_SEL + * going away, and there is no way to tell if it is ready. + * so we must wait. + */ + if (stuck) + qib_dev_err(dd, "QSFP interface bus stuck non-idle\n"); + + if (pass >= QSFP_MAX_RETRY && ret) + qib_dev_porterr(dd, ppd->port, "QSFP failed even retrying\n"); + else if (pass) + qib_dev_porterr(dd, ppd->port, "QSFP retries: %d\n", pass); + + msleep(20); + +bail: + mutex_unlock(&dd->eep_lock); + +no_unlock: + return ret; +} + +/* + * qsfp_write + * We do not ordinarily write the QSFP, but this is needed to select + * the page on non-flat QSFPs, and possibly later unusual cases + */ +static int qib_qsfp_write(struct qib_pportdata *ppd, int addr, void *bp, + int len) +{ + struct qib_devdata *dd = ppd->dd; + u32 out, mask; + int ret, cnt; + u8 *buff = bp; + + ret = mutex_lock_interruptible(&dd->eep_lock); + if (ret) + goto no_unlock; + + if (dd->twsi_eeprom_dev == QIB_TWSI_NO_DEV) { + ret = -ENXIO; + goto bail; + } + + /* + * We presume, if we are called at all, that this board has + * QSFP. This is on the same i2c chain as the legacy parts, + * but only responds if the module is selected via GPIO pins. + * Further, there are very long setup and hold requirements + * on MODSEL. + */ + mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + out = QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + if (ppd->hw_pidx) { + mask <<= QSFP_GPIO_PORT2_SHIFT; + out <<= QSFP_GPIO_PORT2_SHIFT; + } + dd->f_gpio_mod(dd, out, mask, mask); + + /* + * Module could take up to 2 Msec to respond to MOD_SEL, + * and there is no way to tell if it is ready, so we must wait. + */ + msleep(20); + + /* Make sure TWSI bus is in sane state. */ + ret = qib_twsi_reset(dd); + if (ret) { + qib_dev_porterr(dd, ppd->port, + "QSFP interface Reset for write failed\n"); + ret = -EIO; + goto deselect; + } + + /* All QSFP modules are at A0 */ + + cnt = 0; + while (cnt < len) { + unsigned in_page; + int wlen = len - cnt; + + in_page = addr % QSFP_PAGESIZE; + if ((in_page + wlen) > QSFP_PAGESIZE) + wlen = QSFP_PAGESIZE - in_page; + ret = qib_twsi_blk_wr(dd, QSFP_DEV, addr, buff + cnt, wlen); + if (ret) { + /* qib_twsi_blk_wr() 1 for error, else 0 */ + ret = -EIO; + goto deselect; + } + addr += wlen; + cnt += wlen; + } + ret = cnt; + +deselect: + /* + * Module could take up to 10 uSec after transfer before + * ready to respond to MOD_SEL negation, and there is no way + * to tell if it is ready, so we must wait. + */ + udelay(10); + /* set QSFP MODSEL, RST, LP high */ + dd->f_gpio_mod(dd, mask, mask, mask); + /* + * Module could take up to 2 Msec to respond to MOD_SEL + * going away, and there is no way to tell if it is ready. + * so we must wait. + */ + msleep(20); + +bail: + mutex_unlock(&dd->eep_lock); + +no_unlock: + return ret; +} + +/* + * For validation, we want to check the checksums, even of the + * fields we do not otherwise use. 
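+ * qsfp_cks() sums the EEPROM bytes in the half-open range [first, next)
+ * and returns the low 8 bits of that sum, or a negative errno on a
+ * failed read.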
This function reads the bytes from + * to and returns the 8lsbs of the sum, or <0 for errors + */ +static int qsfp_cks(struct qib_pportdata *ppd, int first, int next) +{ + int ret; + u16 cks; + u8 bval; + + cks = 0; + while (first < next) { + ret = qsfp_read(ppd, first, &bval, 1); + if (ret < 0) + goto bail; + cks += bval; + ++first; + } + ret = cks & 0xFF; +bail: + return ret; + +} + +int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, struct qib_qsfp_cache *cp) +{ + int ret; + int idx; + u16 cks; + u8 peek[4]; + + /* ensure sane contents on invalid reads, for cable swaps */ + memset(cp, 0, sizeof(*cp)); + + if (!qib_qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto bail; + } + + ret = qsfp_read(ppd, 0, peek, 3); + if (ret < 0) + goto bail; + if ((peek[0] & 0xFE) != 0x0C) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP byte0 is 0x%02X, S/B 0x0C/D\n", peek[0]); + + if ((peek[2] & 4) == 0) { + /* + * If cable is paged, rather than "flat memory", we need to + * set the page to zero, Even if it already appears to be zero. + */ + u8 poke = 0; + + ret = qib_qsfp_write(ppd, 127, &poke, 1); + udelay(50); + if (ret != 1) { + qib_dev_porterr(ppd->dd, ppd->port, + "Failed QSFP Page set\n"); + goto bail; + } + } + + ret = qsfp_read(ppd, QSFP_MOD_ID_OFFS, &cp->id, 1); + if (ret < 0) + goto bail; + if ((cp->id & 0xFE) != 0x0C) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP ID byte is 0x%02X, S/B 0x0C/D\n", cp->id); + cks = cp->id; + + ret = qsfp_read(ppd, QSFP_MOD_PWR_OFFS, &cp->pwr, 1); + if (ret < 0) + goto bail; + cks += cp->pwr; + + ret = qsfp_cks(ppd, QSFP_MOD_PWR_OFFS + 1, QSFP_MOD_LEN_OFFS); + if (ret < 0) + goto bail; + cks += ret; + + ret = qsfp_read(ppd, QSFP_MOD_LEN_OFFS, &cp->len, 1); + if (ret < 0) + goto bail; + cks += cp->len; + + ret = qsfp_read(ppd, QSFP_MOD_TECH_OFFS, &cp->tech, 1); + if (ret < 0) + goto bail; + cks += cp->tech; + + ret = qsfp_read(ppd, QSFP_VEND_OFFS, &cp->vendor, QSFP_VEND_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_VEND_LEN; ++idx) + cks += cp->vendor[idx]; + + ret = qsfp_read(ppd, QSFP_IBXCV_OFFS, &cp->xt_xcv, 1); + if (ret < 0) + goto bail; + cks += cp->xt_xcv; + + ret = qsfp_read(ppd, QSFP_VOUI_OFFS, &cp->oui, QSFP_VOUI_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_VOUI_LEN; ++idx) + cks += cp->oui[idx]; + + ret = qsfp_read(ppd, QSFP_PN_OFFS, &cp->partnum, QSFP_PN_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_PN_LEN; ++idx) + cks += cp->partnum[idx]; + + ret = qsfp_read(ppd, QSFP_REV_OFFS, &cp->rev, QSFP_REV_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_REV_LEN; ++idx) + cks += cp->rev[idx]; + + ret = qsfp_read(ppd, QSFP_ATTEN_OFFS, &cp->atten, QSFP_ATTEN_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_ATTEN_LEN; ++idx) + cks += cp->atten[idx]; + + ret = qsfp_cks(ppd, QSFP_ATTEN_OFFS + QSFP_ATTEN_LEN, QSFP_CC_OFFS); + if (ret < 0) + goto bail; + cks += ret; + + cks &= 0xFF; + ret = qsfp_read(ppd, QSFP_CC_OFFS, &cp->cks1, 1); + if (ret < 0) + goto bail; + if (cks != cp->cks1) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP cks1 is %02X, computed %02X\n", cp->cks1, + cks); + + /* Second checksum covers 192 to (serial, date, lot) */ + ret = qsfp_cks(ppd, QSFP_CC_OFFS + 1, QSFP_SN_OFFS); + if (ret < 0) + goto bail; + cks = ret; + + ret = qsfp_read(ppd, QSFP_SN_OFFS, &cp->serial, QSFP_SN_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_SN_LEN; ++idx) + cks += cp->serial[idx]; + + ret = qsfp_read(ppd, QSFP_DATE_OFFS, &cp->date, QSFP_DATE_LEN); + if (ret < 0) + goto bail; + for (idx = 0; 
idx < QSFP_DATE_LEN; ++idx) + cks += cp->date[idx]; + + ret = qsfp_read(ppd, QSFP_LOT_OFFS, &cp->lot, QSFP_LOT_LEN); + if (ret < 0) + goto bail; + for (idx = 0; idx < QSFP_LOT_LEN; ++idx) + cks += cp->lot[idx]; + + ret = qsfp_cks(ppd, QSFP_LOT_OFFS + QSFP_LOT_LEN, QSFP_CC_EXT_OFFS); + if (ret < 0) + goto bail; + cks += ret; + + ret = qsfp_read(ppd, QSFP_CC_EXT_OFFS, &cp->cks2, 1); + if (ret < 0) + goto bail; + cks &= 0xFF; + if (cks != cp->cks2) + qib_dev_porterr(ppd->dd, ppd->port, + "QSFP cks2 is %02X, computed %02X\n", cp->cks2, + cks); + return 0; + +bail: + cp->id = 0; + return ret; +} + +const char * const qib_qsfp_devtech[16] = { + "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP", + "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML", + "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq", + "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq" +}; + +#define QSFP_DUMP_CHUNK 16 /* Holds longest string */ +#define QSFP_DEFAULT_HDR_CNT 224 + +static const char *pwr_codes = "1.5W2.0W2.5W3.5W"; + +int qib_qsfp_mod_present(struct qib_pportdata *ppd) +{ + u32 mask; + int ret; + + mask = QSFP_GPIO_MOD_PRS_N << + (ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT); + ret = ppd->dd->f_gpio_mod(ppd->dd, 0, 0, 0); + + return !((ret & mask) >> + ((ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT) + 3)); +} + +/* + * Initialize structures that control access to QSFP. Called once per port + * on cards that support QSFP. + */ +void qib_qsfp_init(struct qib_qsfp_data *qd, + void (*fevent)(struct work_struct *)) +{ + u32 mask, highs; + + struct qib_devdata *dd = qd->ppd->dd; + + /* Initialize work struct for later QSFP events */ + INIT_WORK(&qd->work, fevent); + + /* + * Later, we may want more validation. For now, just set up pins and + * blip reset. If module is present, call qib_refresh_qsfp_cache(), + * to do further init. 
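+ * The "blip" below holds MOD_RST_N low (asserted) for roughly 20 usec
+ * while MODSEL_N and LP_MODE stay high, then releases all three lines
+ * high again.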
+ */ + mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE; + highs = mask - QSFP_GPIO_MOD_RST_N; + if (qd->ppd->hw_pidx) { + mask <<= QSFP_GPIO_PORT2_SHIFT; + highs <<= QSFP_GPIO_PORT2_SHIFT; + } + dd->f_gpio_mod(dd, highs, mask, mask); + udelay(20); /* Generous RST dwell */ + + dd->f_gpio_mod(dd, mask, mask, mask); +} + +int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len) +{ + struct qib_qsfp_cache cd; + u8 bin_buff[QSFP_DUMP_CHUNK]; + char lenstr[6]; + int sofar, ret; + int bidx = 0; + + sofar = 0; + ret = qib_refresh_qsfp_cache(ppd, &cd); + if (ret < 0) + goto bail; + + lenstr[0] = ' '; + lenstr[1] = '\0'; + if (QSFP_IS_CU(cd.tech)) + sprintf(lenstr, "%dM ", cd.len); + + sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", pwr_codes + + (QSFP_PWR(cd.pwr) * 4)); + + sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", lenstr, + qib_qsfp_devtech[cd.tech >> 4]); + + sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n", + QSFP_VEND_LEN, cd.vendor); + + sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n", + QSFP_OUI(cd.oui)); + + sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n", + QSFP_PN_LEN, cd.partnum); + sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n", + QSFP_REV_LEN, cd.rev); + if (QSFP_IS_CU(cd.tech)) + sofar += scnprintf(buf + sofar, len - sofar, "Atten:%d, %d\n", + QSFP_ATTEN_SDR(cd.atten), + QSFP_ATTEN_DDR(cd.atten)); + sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n", + QSFP_SN_LEN, cd.serial); + sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n", + QSFP_DATE_LEN, cd.date); + sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n", + QSFP_LOT_LEN, cd.lot); + + while (bidx < QSFP_DEFAULT_HDR_CNT) { + int iidx; + + ret = qsfp_read(ppd, bidx, bin_buff, QSFP_DUMP_CHUNK); + if (ret < 0) + goto bail; + for (iidx = 0; iidx < ret; ++iidx) { + sofar += scnprintf(buf + sofar, len-sofar, " %02X", + bin_buff[iidx]); + } + sofar += scnprintf(buf + sofar, len - sofar, "\n"); + bidx += QSFP_DUMP_CHUNK; + } + ret = sofar; +bail: + return ret; +} diff --git a/drivers/infiniband/hw/qib/qib_qsfp.h b/drivers/infiniband/hw/qib/qib_qsfp.h new file mode 100644 index 0000000..ad8dbd6 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_qsfp.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* QSFP support common definitions, for ib_qib driver */ + +#define QSFP_DEV 0xA0 +#define QSFP_PWR_LAG_MSEC 2000 +#define QSFP_MODPRS_LAG_MSEC 20 + +/* + * Below are masks for various QSFP signals, for Port 1. + * Port2 equivalents are shifted by QSFP_GPIO_PORT2_SHIFT. + * _N means asserted low + */ +#define QSFP_GPIO_MOD_SEL_N (4) +#define QSFP_GPIO_MOD_PRS_N (8) +#define QSFP_GPIO_INT_N (0x10) +#define QSFP_GPIO_MOD_RST_N (0x20) +#define QSFP_GPIO_LP_MODE (0x40) +#define QSFP_GPIO_PORT2_SHIFT 5 + +#define QSFP_PAGESIZE 128 +/* Defined fields that QLogic requires of qualified cables */ +/* Byte 0 is Identifier, not checked */ +/* Byte 1 is reserved "status MSB" */ +/* Byte 2 is "status LSB" We only care that D2 "Flat Mem" is set. */ +/* + * Rest of first 128 not used, although 127 is reserved for page select + * if module is not "Flat memory". + */ +/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */ +#define QSFP_MOD_ID_OFFS 128 +/* + * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class + * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + */ +#define QSFP_MOD_PWR_OFFS 129 +/* Byte 130 is Connector type. Not QLogic req'd */ +/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */ +/* Byte 139 is encoding. code 0x01 is 8b10b. Not QLogic req'd */ +/* byte 140 is nominal bit-rate, in units of 100Mbits/sec Not QLogic req'd */ +/* Byte 141 is Extended Rate Select. Not QLogic req'd */ +/* Bytes 142..145 are lengths for various fiber types. Not QLogic req'd */ +/* Byte 146 is length for Copper. Units of 1 meter */ +#define QSFP_MOD_LEN_OFFS 146 +/* + * Byte 147 is Device technology. D0..3 not Qlogc req'd + * D4..7 select from 15 choices, translated by table: + */ +#define QSFP_MOD_TECH_OFFS 147 +extern const char *const qib_qsfp_devtech[16]; +/* Active Equalization includes fiber, copper full EQ, and copper near Eq */ +#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1) +/* Active Equalization includes fiber, copper full EQ, and copper far Eq */ +#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1) +/* Attenuation should be valid for copper other than full/near Eq */ +#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1) +/* Length is only valid if technology is "copper" */ +#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1) +#define QSFP_TECH_1490 9 + +#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \ + oui[2]) +#define QSFP_OUI_AMPHENOL 0x415048 +#define QSFP_OUI_FINISAR 0x009065 +#define QSFP_OUI_GORE 0x002177 + +/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */ +#define QSFP_VEND_OFFS 148 +#define QSFP_VEND_LEN 16 +/* Byte 164 is IB Extended tranceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */ +#define QSFP_IBXCV_OFFS 164 +/* Bytes 165..167 are Vendor OUI number */ +#define QSFP_VOUI_OFFS 165 +#define QSFP_VOUI_LEN 3 +/* Bytes 168..183 are Vendor Part Number, string */ +#define QSFP_PN_OFFS 168 +#define QSFP_PN_LEN 16 +/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */ +#define QSFP_REV_OFFS 184 +#define QSFP_REV_LEN 2 +/* + * Bytes 186,187 are Wavelength, if Optical. 
Not Qlogic req'd + * If copper, they are attenuation in dB: + * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR) + */ +#define QSFP_ATTEN_OFFS 186 +#define QSFP_ATTEN_LEN 2 +/* Bytes 188,189 are Wavelength tolerance, not QLogic req'd */ +/* Byte 190 is Max Case Temp. Not QLogic req'd */ +/* Byte 191 is LSB of sum of bytes 128..190. Not QLogic req'd */ +#define QSFP_CC_OFFS 191 +/* Bytes 192..195 are Options implemented in qsfp. Not Qlogic req'd */ +/* Bytes 196..211 are Serial Number, String */ +#define QSFP_SN_OFFS 196 +#define QSFP_SN_LEN 16 +/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */ +#define QSFP_DATE_OFFS 212 +#define QSFP_DATE_LEN 6 +/* Bytes 218,219 are optional lot-code, string */ +#define QSFP_LOT_OFFS 218 +#define QSFP_LOT_LEN 2 +/* Bytes 220, 221 indicate monitoring options, Not QLogic req'd */ +/* Byte 223 is LSB of sum of bytes 192..222 */ +#define QSFP_CC_EXT_OFFS 223 + +/* + * struct qib_qsfp_data encapsulates state of QSFP device for one port. + * it will be part of port-chip-specific data if a board supports QSFP. + * + * Since multiple board-types use QSFP, and their pport_data structs + * differ (in the chip-specific section), we need a pointer to its head. + * + * Avoiding premature optimization, we will have one work_struct per port, + * and let the (increasingly inaccurately named) eep_lock arbitrate + * access to common resources. + * + */ + +/* + * Hold the parts of the onboard EEPROM that we care about, so we aren't + * coonstantly bit-boffing + */ +struct qib_qsfp_cache { + u8 id; /* must be 0x0C or 0x0D; 0 indicates invalid EEPROM read */ + u8 pwr; /* in D6,7 */ + u8 len; /* in meters, Cu only */ + u8 tech; + char vendor[QSFP_VEND_LEN]; + u8 xt_xcv; /* Ext. tranceiver codes, 4 lsbs are IB speed supported */ + u8 oui[QSFP_VOUI_LEN]; + u8 partnum[QSFP_PN_LEN]; + u8 rev[QSFP_REV_LEN]; + u8 atten[QSFP_ATTEN_LEN]; + u8 cks1; /* Checksum of bytes 128..190 */ + u8 serial[QSFP_SN_LEN]; + u8 date[QSFP_DATE_LEN]; + u8 lot[QSFP_LOT_LEN]; + u8 cks2; /* Checsum of bytes 192..222 */ +}; + +#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) +#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) +#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) + +struct qib_qsfp_data { + /* Helps to find our way */ + struct qib_pportdata *ppd; + struct work_struct work; + struct qib_qsfp_cache cache; + unsigned long t_insert; + u8 modpresent; +}; + +extern int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, + struct qib_qsfp_cache *cp); +extern int qib_qsfp_mod_present(struct qib_pportdata *ppd); +extern void qib_qsfp_init(struct qib_qsfp_data *qd, + void (*fevent)(struct work_struct *)); diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c new file mode 100644 index 0000000..c9955d4 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -0,0 +1,2156 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "qib.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + + +static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ss->total_len = wqe->length; + rvt_skip_sge(ss, len, false); + return wqe->length - len; +} + +/** + * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @dev: the device for this QP + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. + * Note the QP s_lock must be held. + */ +static int qib_make_rc_ack(struct qib_ibdev *dev, struct rvt_qp *qp, + struct ib_other_headers *ohdr, u32 pmtu) +{ + struct rvt_ack_entry *e; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + switch (qp->s_ack_state) { + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + /* FALLTHROUGH */ + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + /* FALLTHROUGH */ + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & RVT_S_ACK_PENDING) + goto normal; + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* + * If a RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester resends it. 
+ */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + qp->s_rdma_mr = e->rdma_sge.mr; + if (qp->s_rdma_mr) + rvt_get_mr(qp->s_rdma_mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + qp->s_cur_sge = &qp->s_ack_rdma_sge; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = rvt_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK; + } else { + /* COMPARE_SWAP or FETCH_ADD */ + qp->s_cur_sge = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = rvt_compute_aeth(qp); + ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = e->psn & QIB_PSN_MASK; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + qp->s_cur_sge = &qp->s_ack_rdma_sge; + qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr; + if (qp->s_rdma_mr) + rvt_get_mr(qp->s_rdma_mr); + len = qp->s_ack_rdma_sge.sge.sge_length; + if (len > pmtu) + len = pmtu; + else { + ohdr->u.aeth = rvt_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK; + break; + + default: +normal: + /* + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). + */ + qp->s_ack_state = OP(SEND_ONLY); + qp->s_flags &= ~RVT_S_ACK_PENDING; + qp->s_cur_sge = NULL; + if (qp->s_nak_state) + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + (qp->s_nak_state << + IB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = rvt_compute_aeth(qp); + hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = qp->s_ack_psn & QIB_PSN_MASK; + } + qp->s_rdma_ack_cnt++; + qp->s_hdrwords = hwords; + qp->s_cur_size = len; + qib_make_ruc_header(qp, ohdr, bth0, bth2); + return 1; + +bail: + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_flags &= ~(RVT_S_RESP_PENDING | RVT_S_ACK_PENDING); + return 0; +} + +/** + * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * + * Assumes the s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. + */ +int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) +{ + struct qib_qp_priv *priv = qp->priv; + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_other_headers *ohdr; + struct rvt_sge_state *ss; + struct rvt_swqe *wqe; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + u32 pmtu = qp->pmtu; + char newreq; + int ret = 0; + int delta; + + ohdr = &priv->s_hdr->u.oth; + if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) + ohdr = &priv->s_hdr->u.l.oth; + + /* Sending responses has higher priority over sending requests. */ + if ((qp->s_flags & RVT_S_RESP_PENDING) && + qib_make_rc_ack(dev, qp, ohdr, pmtu)) + goto done; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. 
*/ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&priv->s_dma_busy)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + /* will get called again */ + goto done; + } + + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) + goto bail; + + if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) { + if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { + qp->s_flags |= RVT_S_WAIT_PSN; + goto bail; + } + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + } + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 0; + + /* Send a request. */ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) + goto bail; + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == READ_ONCE(qp->s_head)) + goto bail; + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_FENCE; + goto bail; + } + newreq = 1; + qp->s_psn = wqe->psn; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + len = wqe->length; + ss = &qp->s_sge; + bth2 = qp->s_psn & QIB_PSN_MASK; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + /* If no credit, return. */ + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && + rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; + goto bail; + } + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + goto no_flow_control; + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. 
*/ + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && + rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; + goto bail; + } +no_flow_control: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / sizeof(u32); + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->rdma_wr.wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after RETH */ + ohdr->u.rc.imm_data = + wqe->rdma_wr.wr.ex.imm_data; + hwords += 1; + if (wqe->rdma_wr.wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + if (wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + put_ib_ateth_swap(wqe->atomic_wr.swap, + &ohdr->u.atomic_eth); + put_ib_ateth_compare(wqe->atomic_wr.compare_add, + &ohdr->u.atomic_eth); + } else { + qp->s_state = OP(FETCH_ADD); + put_ib_ateth_swap(wqe->atomic_wr.compare_add, + &ohdr->u.atomic_eth); + put_ib_ateth_compare(0, &ohdr->u.atomic_eth); + } + put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr, + &ohdr->u.atomic_eth); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->atomic_wr.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_psn = wqe->lpsn + 1; + else + qp->s_psn++; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_FIRST is used by the ACK processing + * thread to indicate a SEND needs to be restarted from an + * earlier PSN without interferring with the sending thread. + * See qib_restart_rc(). 
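+ *
+ * Added note (illustrative, not from the original comment): restart_sge()
+ * converts the PSN delta into a byte offset before the resend, roughly
+ * skip = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu, and returns the number
+ * of bytes still to send.  With made-up numbers: wqe->psn = 100, restart
+ * psn = 103, pmtu = 4096 gives skip = 12288, so the resend continues from
+ * the fourth packet's worth of payload.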
+ */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = qp->s_psn++ & QIB_PSN_MASK; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_LAST is used by the ACK processing + * thread to indicate a RDMA write needs to be restarted from + * an earlier PSN without interferring with the sending thread. + * See qib_restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = qp->s_psn++ & QIB_PSN_MASK; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing + * thread to indicate a RDMA read needs to be restarted from + * an earlier PSN without interferring with the sending thread. + * See qib_restart_rc(). + */ + len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr + len); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK; + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_sending_hpsn = bth2; + delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8; + if (delta && delta % QIB_PSN_CREDIT == 0) + bth2 |= IB_BTH_REQ_ACK; + if (qp->s_flags & RVT_S_SEND_ONE) { + qp->s_flags &= ~RVT_S_SEND_ONE; + qp->s_flags |= RVT_S_WAIT_ACK; + bth2 |= IB_BTH_REQ_ACK; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2); +done: + return 1; +bail: + qp->s_flags &= ~RVT_S_BUSY; + return ret; +} + +/** + * qib_send_rc_ack - Construct an ACK packet and send it + * @qp: a pointer to the QP + * + * This is called from qib_rc_rcv() and qib_kreceive(). 
+ * Note that RDMA reads and atomics are handled in the + * send side QP state and tasklet. + */ +void qib_send_rc_ack(struct rvt_qp *qp) +{ + struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + u64 pbc; + u16 lrh0; + u32 bth0; + u32 hwords; + u32 pbufn; + u32 __iomem *piobuf; + struct ib_header hdr; + struct ib_other_headers *ohdr; + u32 control; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto unlock; + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if ((qp->s_flags & RVT_S_RESP_PENDING) || qp->s_rdma_ack_cnt) + goto queue_ack; + + /* Construct the header with s_lock held so APM doesn't change it. */ + ohdr = &hdr.u.oth; + lrh0 = QIB_LRH_BTH; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */ + hwords = 6; + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & + IB_AH_GRH)) { + hwords += qib_make_grh(ibp, &hdr.u.l.grh, + rdma_ah_read_grh(&qp->remote_ah_attr), + hwords, 0); + ohdr = &hdr.u.l.oth; + lrh0 = QIB_LRH_GRH; + } + /* read pkey_index w/o lock (its atomic) */ + bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24); + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | + (qp->r_nak_state << + IB_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = rvt_compute_aeth(qp); + lrh0 |= ibp->sl_to_vl[rdma_ah_get_sl(&qp->remote_ah_attr)] << 12 | + rdma_ah_get_sl(&qp->remote_ah_attr) << 4; + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(rdma_ah_get_dlid(&qp->remote_ah_attr)); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(ppd->lid | + rdma_ah_get_path_bits(&qp->remote_ah_attr)); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK); + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Don't try to send ACKs if the link isn't ACTIVE */ + if (!(ppd->lflags & QIBL_LINKACTIVE)) + goto done; + + control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC, + qp->s_srate, lrh0 >> 12); + /* length is + 1 for the control dword */ + pbc = ((u64) control << 32) | (hwords + 1); + + piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn); + if (!piobuf) { + /* + * We are out of PIO buffers at the moment. + * Pass responsibility for sending the ACK to the + * send tasklet so that when a PIO buffer becomes + * available, the ACK is sent ahead of other outgoing + * packets. + */ + spin_lock_irqsave(&qp->s_lock, flags); + goto queue_ack; + } + + /* + * Write the pbc. + * We have to flush after the PBC for correctness + * on some cpus or WC buffer can be written out of order. + */ + writeq(pbc, piobuf); + + if (dd->flags & QIB_PIO_FLUSH_WC) { + u32 *hdrp = (u32 *) &hdr; + + qib_flush_wc(); + qib_pio_copy(piobuf + 2, hdrp, hwords - 1); + qib_flush_wc(); + __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1); + } else + qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords); + + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 
2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf + spcl_off); + } + + qib_flush_wc(); + qib_sendbuf_done(dd, pbufn); + + this_cpu_inc(ibp->pmastats->n_unicast_xmit); + goto done; + +queue_ack: + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + this_cpu_inc(*ibp->rvp.rc_qacks); + qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + + /* Schedule the send tasklet. */ + qib_schedule_send(qp); + } +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return; +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from qib_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct rvt_qp *qp, u32 psn) +{ + u32 n = qp->s_acked; + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); + u32 opcode; + + qp->s_cur = n; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (qib_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + + /* Find the work request opcode corresponding to the given PSN. */ + opcode = wqe->wr.opcode; + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = rvt_get_swqe_ptr(qp, n); + diff = qib_cmp24(psn, wqe->psn); + if (diff < 0) + break; + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + opcode = wqe->wr.opcode; + } + + /* + * Set the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See qib_make_rc_req(). + */ + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +done: + qp->s_psn = psn; + /* + * Set RVT_S_WAIT_PSN as qib_rc_complete() may start the timer + * asynchronously before the send tasklet can get scheduled. + * Doing it in qib_make_rc_req() is too late. + */ + if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) && + (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) + qp->s_flags |= RVT_S_WAIT_PSN; +} + +/* + * Back up requester to resend the last un-ACKed request. + * The QP r_lock and s_lock should be held and interrupts disabled. 
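+ *
+ * Added note: the PSN comparisons in reset_psn() above and throughout this
+ * file (qib_cmp24()) work modulo 2^24.  A minimal sketch of that helper,
+ * assuming the usual sign-extension trick (psn_cmp24 is an exposition-only
+ * name; the real helper lives elsewhere in the driver):
+ *
+ *	static inline int psn_cmp24(u32 a, u32 b)
+ *	{
+ *		return (((int) a) - ((int) b)) << 8 >> 8;
+ *	}
+ *
+ * i.e. negative, zero or positive even after the 24-bit space wraps.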
+ */ +void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait) +{ + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + struct qib_ibport *ibp; + + if (qp->s_retry == 0) { + if (qp->s_mig_state == IB_MIG_ARMED) { + qib_migrate_qp(qp); + qp->s_retry = qp->s_retry_cnt; + } else if (qp->s_last == qp->s_acked) { + qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + return; + } else /* XXX need to handle delayed completion */ + return; + } else + qp->s_retry--; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_RDMA_READ) + ibp->rvp.n_rc_resends++; + else + ibp->rvp.n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK; + + qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | + RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | + RVT_S_WAIT_ACK); + if (wait) + qp->s_flags |= RVT_S_SEND_ONE; + reset_psn(qp, psn); +} + +/* + * Set qp->s_sending_psn to the next PSN after the given one. + * This would be psn+1 except when RDMA reads are present. + */ +static void reset_sending_psn(struct rvt_qp *qp, u32 psn) +{ + struct rvt_swqe *wqe; + u32 n = qp->s_last; + + /* Find the work request corresponding to the given PSN. */ + for (;;) { + wqe = rvt_get_swqe_ptr(qp, n); + if (qib_cmp24(psn, wqe->lpsn) <= 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_sending_psn = wqe->lpsn + 1; + else + qp->s_sending_psn = psn + 1; + break; + } + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + } +} + +/* + * This should be called with the QP s_lock held and interrupts disabled. + */ +void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr) +{ + struct ib_other_headers *ohdr; + struct rvt_swqe *wqe; + u32 opcode; + u32 psn; + + if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) + return; + + /* Find out where the BTH is */ + if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + WARN_ON(!qp->s_rdma_ack_cnt); + qp->s_rdma_ack_cnt--; + return; + } + + psn = be32_to_cpu(ohdr->bth[2]); + reset_sending_psn(qp, psn); + + /* + * Start timer after a packet requesting an ACK has been sent and + * there are still requests that haven't been acked. + */ + if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + !(qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + rvt_add_retry_timer(qp); + + while (qp->s_last != qp->s_acked) { + u32 s_last; + + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 && + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) + break; + s_last = qp->s_last; + if (++s_last >= qp->s_size) + s_last = 0; + qp->s_last = s_last; + /* see post_send() */ + barrier(); + rvt_put_swqe(wqe); + rvt_qp_swqe_complete(qp, + wqe, + ib_qib_wc_opcode[wqe->wr.opcode], + IB_WC_SUCCESS); + } + /* + * If we were waiting for sends to complete before resending, + * and they are now complete, restart sending. + */ + if (qp->s_flags & RVT_S_WAIT_PSN && + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + qp->s_flags &= ~RVT_S_WAIT_PSN; + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + qib_schedule_send(qp); + } +} + +static inline void update_last_psn(struct rvt_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + +/* + * Generate a SWQE completion. 
+ * This is similar to qib_send_complete but has to check to be sure + * that the SGEs are not being referenced if the SWQE is being resent. + */ +static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct qib_ibport *ibp) +{ + /* + * Don't decrement refcount and don't generate a + * completion if the SWQE is being resent until the send + * is finished. + */ + if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 || + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + u32 s_last; + + rvt_put_swqe(wqe); + s_last = qp->s_last; + if (++s_last >= qp->s_size) + s_last = 0; + qp->s_last = s_last; + /* see post_send() */ + barrier(); + rvt_qp_swqe_complete(qp, + wqe, + ib_qib_wc_opcode[wqe->wr.opcode], + IB_WC_SUCCESS); + } else + this_cpu_inc(*ibp->rvp.rc_delayed_comp); + + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, wqe->lpsn); + + /* + * If we are completing a request which is in the process of + * being resent, we can stop resending it since we know the + * responder has already seen it. + */ + if (qp->s_acked == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_acked = qp->s_cur; + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + if (qp->s_acked != qp->s_tail) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } + } else { + if (++qp->s_acked >= qp->s_size) + qp->s_acked = 0; + if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) + qp->s_draining = 0; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + } + return wqe; +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from qib_rc_rcv_resp() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct qib_ctxtdata *rcd) +{ + struct qib_ibport *ibp; + enum ib_wc_status status; + struct rvt_swqe *wqe; + int ret = 0; + u32 ack_psn; + int diff; + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + ack_psn = psn; + if (aeth >> IB_AETH_NAK_SHIFT) + ack_psn--; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + ibp = to_iport(qp->ibqp.device, qp->port_num); + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail; + } + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. 
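+ *
+ * Concrete example (added for clarity): if the requester issues an RDMA
+ * READ at PSN 5 and, without waiting for the response, a SEND at PSN 6,
+ * then an ACK for PSN 6 that arrives before any read response implicitly
+ * NAKs the read, and the code below restarts it from s_last_psn + 1.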
+ */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + qib_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, + &rcd->qp_wait_list); + } + } + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + u64 *vaddr = wqe->sg_list[0].vaddr; + *vaddr = val; + } + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if ((qp->s_flags & RVT_S_WAIT_FENCE) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~(RVT_S_WAIT_FENCE | + RVT_S_WAIT_ACK); + qib_schedule_send(qp); + } else if (qp->s_flags & RVT_S_WAIT_RDMAR) { + qp->s_flags &= ~(RVT_S_WAIT_RDMAR | + RVT_S_WAIT_ACK); + qib_schedule_send(qp); + } + } + wqe = do_rc_completion(qp, wqe, ibp); + if (qp->s_acked == qp->s_tail) + break; + } + + switch (aeth >> IB_AETH_NAK_SHIFT) { + case 0: /* ACK */ + this_cpu_inc(*ibp->rvp.rc_acks); + if (qp->s_acked != qp->s_tail) { + /* + * We are expecting more ACKs so + * reset the retransmit timer. + */ + rvt_mod_retry_timer(qp); + /* + * We can stop resending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (qib_cmp24(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } else { + /* No more acks - kill all timers */ + rvt_stop_rc_timers(qp); + if (qib_cmp24(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } + } + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; + qib_schedule_send(qp); + } + rvt_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, psn); + return 1; + + case 1: /* RNR NAK */ + ibp->rvp.n_rnr_naks++; + if (qp->s_acked == qp->s_tail) + goto bail; + if (qp->s_flags & RVT_S_WAIT_RNR) + goto bail; + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + + ibp->rvp.n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK; + + reset_psn(qp, psn); + + qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); + rvt_stop_rc_timers(qp); + rvt_add_rnr_timer(qp, aeth); + return 0; + + case 3: /* NAK */ + if (qp->s_acked == qp->s_tail) + goto bail; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + switch ((aeth >> IB_AETH_CREDIT_SHIFT) & + IB_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + ibp->rvp.n_seq_naks++; + /* + * Back up to the responder's expected PSN. + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. 
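+ *
+ * Layout reminder (added note): the AETH dword decoded by the enclosing
+ * switch statements packs, per the IBTA header format,
+ *	bits 31..24  syndrome (top 3 bits: 0 = ACK, 1 = RNR NAK, 3 = NAK;
+ *	             low 5 bits: credit count, RNR timer, or NAK code)
+ *	bits 23..0   MSN
+ * which is why the outer switch tests aeth >> IB_AETH_NAK_SHIFT and the
+ * inner one tests (aeth >> IB_AETH_CREDIT_SHIFT) & IB_AETH_CREDIT_MASK.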
+ */ + qib_restart_rc(qp, psn, 0); + qib_schedule_send(qp); + break; + + case 1: /* Invalid Request */ + status = IB_WC_REM_INV_REQ_ERR; + ibp->rvp.n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + status = IB_WC_REM_ACCESS_ERR; + ibp->rvp.n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + status = IB_WC_REM_OP_ERR; + ibp->rvp.n_other_naks++; +class_b: + if (qp->s_last == qp->s_acked) { + qib_send_complete(qp, wqe, status); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_retry = qp->s_retry_cnt; + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail; + + default: /* 2: reserved */ +reserved: + /* Ignore reserved NAK codes. */ + goto bail; + } + +bail: + rvt_stop_rc_timers(qp); + return ret; +} + +/* + * We have seen an out of sequence RDMA read middle or last packet. + * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. + */ +static void rdma_seq_err(struct rvt_qp *qp, struct qib_ibport *ibp, u32 psn, + struct qib_ctxtdata *rcd) +{ + struct rvt_swqe *wqe; + + /* Remove QP from retry timer */ + rvt_stop_rc_timers(qp); + + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + + while (qib_cmp24(psn, wqe->lpsn) > 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + break; + wqe = do_rc_completion(qp, wqe, ibp); + } + + ibp->rvp.n_rdma_seq++; + qp->r_flags |= RVT_R_RDMAR_SEQ; + qib_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +/** + * qib_rc_rcv_resp - process an incoming RC response packet + * @ibp: the port this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @hdrsize: the header length + * @pmtu: the path MTU + * + * This is called from qib_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static void qib_rc_rcv_resp(struct qib_ibport *ibp, + struct ib_other_headers *ohdr, + void *data, u32 tlen, + struct rvt_qp *qp, + u32 opcode, + u32 psn, u32 hdrsize, u32 pmtu, + struct qib_ctxtdata *rcd) +{ + struct rvt_swqe *wqe; + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + enum ib_wc_status status; + unsigned long flags; + int diff; + u32 pad; + u32 aeth; + u64 val; + + if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) { + /* + * If ACK'd PSN on SDMA busy list try to make progress to + * reclaim SDMA credits. + */ + if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) && + (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) { + + /* + * If send tasklet not running attempt to progress + * SDMA queue. + */ + if (!(qp->s_flags & RVT_S_BUSY)) { + /* Acquire SDMA Lock */ + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* Invoke sdma make progress */ + qib_sdma_make_progress(ppd); + /* Release SDMA Lock */ + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } + } + } + + spin_lock_irqsave(&qp->s_lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto ack_done; + + /* Ignore invalid responses. */ + if (qib_cmp24(psn, READ_ONCE(qp->s_next_psn)) >= 0) + goto ack_done; + + /* Ignore duplicate responses. 
*/ + diff = qib_cmp24(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + aeth = be32_to_cpu(ohdr->u.aeth); + if ((aeth >> IB_AETH_NAK_SHIFT) == 0) + rvt_get_credit(qp, aeth); + } + goto ack_done; + } + + /* + * Skip everything other than the PSN we expect, if we are waiting + * for a reply to a restarted RDMA read or atomic op. + */ + if (qp->r_flags & RVT_R_RDMAR_SEQ) { + if (qib_cmp24(psn, qp->s_last_psn + 1) != 0) + goto ack_done; + qp->r_flags &= ~RVT_R_RDMAR_SEQ; + } + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_done; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + status = IB_WC_SUCCESS; + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + aeth = be32_to_cpu(ohdr->u.aeth); + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) + val = ib_u64_get(&ohdr->u.at.atomic_ack_eth); + else + val = 0; + if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + hdrsize += 4; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; +read_middle: + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + + /* + * We got a response so update the timeout. + * 4.096 usec. * (1 << qp->timeout) + */ + rvt_mod_retry_timer(qp); + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; + qib_schedule_send(qp); + } + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. + */ + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); + spin_unlock_irqrestore(&qp->s_lock, flags); + qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0); + goto bail; + + case OP(RDMA_READ_RESPONSE_ONLY): + aeth = be32_to_cpu(ohdr->u.aeth); + if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) + goto ack_done; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen < (hdrsize + pad + 8))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). 
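+ *
+ * Worked example (illustrative only): with no GRH, hdrsize here is 20
+ * (LRH 8 + BTH 12), so a LAST response carrying 256 payload bytes and no
+ * pad arrives with tlen = 20 + 4 (AETH) + 256 + 4 (ICRC) = 284; the test
+ * below therefore only trips when the payload would be empty.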
+ */ + if (unlikely(tlen <= (hdrsize + pad + 8))) + goto ack_len_err; +read_last: + tlen -= hdrsize + pad + 8; + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; + aeth = be32_to_cpu(ohdr->u.aeth); + qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0); + WARN_ON(qp->s_rdma_read_sge.num_sge); + (void) do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0, rcd); + goto ack_done; + } + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_seq_err: + rdma_seq_err(qp, ibp, psn, rcd); + goto ack_done; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + if (qp->s_last == qp->s_acked) { + qib_send_complete(qp, wqe, status); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +/** + * qib_rc_rcv_error - process an incoming duplicate or error RC packet + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * + * This is called from qib_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent. + */ +static int qib_rc_rcv_error(struct ib_other_headers *ohdr, + void *data, + struct rvt_qp *qp, + u32 opcode, + u32 psn, + int diff, + struct qib_ctxtdata *rcd) +{ + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct rvt_ack_entry *e; + unsigned long flags; + u8 i, prev; + int old_req; + + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if we already sent one. + */ + if (!qp->r_nak_state) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence NAK until all packets + * in the receive queue have been processed. + * Otherwise, we end up propagating congestion. + */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } + goto done; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. + * old_req is true if there is an older response that is scheduled + * to be sent before sending this one. 
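+ *
+ * Added note: the loop below walks the s_ack_queue ring backwards from
+ * r_head_ack_queue, one slot at a time (index 0 wraps to
+ * QIB_MAX_RDMA_ATOMIC), and stops at the first entry whose starting PSN
+ * is at or below the duplicate PSN, at an unused entry, or after a full
+ * lap around the ring.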
+ */ + e = NULL; + old_req = 1; + ibp->rvp.n_rc_dupreq++; + + spin_lock_irqsave(&qp->s_lock, flags); + + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = QIB_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (qib_cmp24(psn, e->psn) >= 0) { + if (prev == qp->s_tail_ack_queue && + qib_cmp24(psn, e->lpsn) <= 0) + old_req = 0; + break; + } + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST)) + goto unlock_done; + /* RETH comes after BTH */ + reth = &ohdr->u.rc.reth; + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. + */ + offset = ((psn - e->psn) & QIB_PSN_MASK) * + qp->pmtu; + len = be32_to_cpu(reth->length); + if (unlikely(offset + len != e->rdma_sge.sge_length)) + goto unlock_done; + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + if (len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock_done; + } else { + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->psn = psn; + if (old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + /* + * If we didn't find the atomic request in the ack queue + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != (u8) opcode || old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + default: + /* + * Ignore this operation if it doesn't request an ACK + * or an earlier RDMA read or atomic is going to be resent. + */ + if (!(psn & IB_BTH_REQ_ACK) || old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + /* + * Try to send a simple ACK to work around a Mellanox bug + * which doesn't accept a RDMA read response or atomic + * response as an ACK for earlier SENDs or RDMA writes. + */ + if (!(qp->s_flags & RVT_S_RESP_PENDING)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->s_ack_queue[i].psn - 1; + goto send_ack; + } + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. 
+ */ + qp->s_tail_ack_queue = i; + break; + } + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_flags |= RVT_S_RESP_PENDING; + qp->r_nak_state = 0; + qib_schedule_send(qp); + +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; + +send_ack: + return 0; +} + +static inline void qib_update_ack_queue(struct rvt_qp *qp, unsigned n) +{ + unsigned next; + + next = n + 1; + if (next > QIB_MAX_RDMA_ATOMIC) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +/** + * qib_rc_rcv - process an incoming RC packet + * @rcd: the context pointer + * @hdr: the header of this packet + * @has_grh: true if the header has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * + * This is called from qib_qp_rcv() to process an incoming RC packet + * for the given QP. + * Called at interrupt level. + */ +void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr, + int has_grh, void *data, u32 tlen, struct rvt_qp *qp) +{ + struct qib_ibport *ibp = &rcd->ppd->ibport_data; + struct ib_other_headers *ohdr; + u32 opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + int diff; + struct ib_reth *reth; + unsigned long flags; + int ret; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + } + + opcode = be32_to_cpu(ohdr->bth[0]); + if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode)) + return; + + psn = be32_to_cpu(ohdr->bth[2]); + opcode >>= 24; + + /* + * Process responses (ACKs) before anything else. Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn, + hdrsize, pmtu, rcd); + return; + } + + /* Compute 24 bits worth of difference. */ + diff = qib_cmp24(psn, qp->r_psn); + if (unlikely(diff)) { + if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) + return; + goto send_ack; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. + */ + break; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): +send_middle: + /* Check for invalid length PMTU or posted rwqe len. 
*/ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + qib_copy_sge(&qp->r_sge, data, pmtu, 1); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + /* fall through -- for SEND_ONLY_WITH_IMMEDIATE */ + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): + case OP(RDMA_WRITE_LAST): +no_immediate_data: + wc.wc_flags = 0; + wc.ex.imm_data = 0; +send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto nack_inv; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_put_ss(&qp->r_sge); + qp->r_msn++; + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr); + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. 
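+ *
+ * Added note: ib_bth_is_solicited() just reports the SE bit of the BTH;
+ * an equivalent hand-rolled check (illustrative only) would be
+ *	solicited = !!(be32_to_cpu(ohdr->bth[0]) & (1u << 23));
+ * i.e. the top bit of the second BTH byte.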
*/ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + ib_bth_is_solicited(ohdr)); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + /* consume RWQE */ + reth = &ohdr->u.rc.reth; + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto nack_acc; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto no_immediate_data; + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) { + rvt_put_ss(&qp->r_sge); + goto rnr_nak; + } + wc.ex.imm_data = ohdr->u.rc.imm_data; + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + + case OP(RDMA_READ_REQUEST): { + struct rvt_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + /* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */ + if (next > QIB_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + qib_update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + reth = &ohdr->u.rc.reth; + len = be32_to_cpu(reth->length); + if (len) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + qp->r_psn += rvt_div_mtu(qp, len - 1); + } else { + e->rdma_sge.mr = NULL; + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = qp->r_psn; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. 
*/ + qp->s_flags |= RVT_S_RESP_PENDING; + qib_schedule_send(qp); + + goto sunlock; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth; + struct rvt_ack_entry *e; + u64 vaddr; + atomic64_t *maddr; + u64 sdata; + u32 rkey; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > QIB_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + qib_update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + ateth = &ohdr->u.atomic_eth; + vaddr = get_ib_ateth_vaddr(ateth); + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv_unlck; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc_unlck; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = get_ib_ateth_swap(ateth); + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + get_ib_ateth_compare(ateth), + sdata); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = psn; + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + qp->s_flags |= RVT_S_RESP_PENDING; + qib_schedule_send(qp); + + goto sunlock; + } + + default: + /* NAK unknown opcodes. */ + goto nack_inv; + } + qp->r_psn++; + qp->r_state = opcode; + qp->r_ack_psn = psn; + qp->r_nak_state = 0; + /* Send an ACK if requested or required. */ + if (psn & (1 << 31)) + goto send_ack; + return; + +rnr_nak: + qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_ack_psn = qp->r_psn; + /* Queue RNR NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_op_err: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_acc: + rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + qib_send_rc_ack(qp); + return; + +sunlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +} diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c new file mode 100644 index 0000000..4662cc7 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -0,0 +1,804 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. 
All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "qib.h" +#include "qib_mad.h" + +/* + * Validate a RWQE and fill in the SGE state. + * Return 1 if OK. + */ +static int qib_init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) +{ + int i, j, ret; + struct ib_wc wc; + struct rvt_lkey_table *rkt; + struct rvt_pd *pd; + struct rvt_sge_state *ss; + + rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table; + pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); + ss = &qp->r_sge; + ss->sg_list = qp->r_sg_list; + qp->r_len = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + NULL, &wqe->sg_list[i], + IB_ACCESS_LOCAL_WRITE); + if (unlikely(ret <= 0)) + goto bad_lkey; + qp->r_len += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ss->total_len = qp->r_len; + ret = 1; + goto bail; + +bad_lkey: + while (j) { + struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; + + rvt_put_mr(sge->mr); + } + ss->num_sge = 0; + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + +/** + * qib_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return -1 if there is a local error, 0 if no RWQE is available, + * otherwise return 1. + * + * Can be called from interrupt level. 
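+ *
+ * A minimal caller sketch, mirroring the receive paths in this driver
+ * (the labels are the ones used by those callers):
+ *
+ *	ret = qib_get_rwqe(qp, 0);
+ *	if (ret < 0)	- local error, goto nack_op_err
+ *	if (!ret)	- no RWQE posted, goto rnr_nak
+ *	otherwise	- RWQE claimed, continue placing the payload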
+ */ +int qib_get_rwqe(struct rvt_qp *qp, int wr_id_only) +{ + unsigned long flags; + struct rvt_rq *rq; + struct rvt_rwq *wq; + struct rvt_srq *srq; + struct rvt_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = ibsrq_to_rvtsrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. */ + smp_rmb(); + wqe = rvt_get_rwqe_ptr(rq, tail); + /* + * Even though we update the tail index in memory, the verbs + * consumer is not supposed to post more entries until a + * completion is generated. + */ + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + if (!wr_id_only && !qib_init_sge(qp, wqe)) { + ret = -1; + goto unlock; + } + qp->r_wr_id = wqe->wr_id; + + ret = 1; + set_bit(RVT_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * Validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} + +/* + * Switch to alternate path. + * The QP s_lock should be held and interrupts disabled. + */ +void qib_migrate_qp(struct rvt_qp *qp) +{ + struct ib_event ev; + + qp->s_mig_state = IB_MIG_MIGRATED; + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = rdma_ah_get_port_num(&qp->alt_ah_attr); + qp->s_pkey_index = qp->s_alt_pkey_index; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); +} + +static __be64 get_sguid(struct qib_ibport *ibp, unsigned index) +{ + if (!index) { + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + return ppd->guid; + } + return ibp->guids[index - 1]; +} + +static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) +{ + return (gid->global.interface_id == id && + (gid->global.subnet_prefix == gid_prefix || + gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX)); +} + +/* + * + * This should be called with the QP r_lock held. + * + * The s_lock will be acquired around the qib_migrate_qp() call. 
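+ *
+ * Return 0 if the header checks pass (migrating the QP when the packet
+ * carries a valid MigReq on the alternate path), or 1 if the packet
+ * should be dropped.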
+ */ +int qib_ruc_check_hdr(struct qib_ibport *ibp, struct ib_header *hdr, + int has_grh, struct rvt_qp *qp, u32 bth0) +{ + __be64 guid; + unsigned long flags; + + if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) { + if (!has_grh) { + if (rdma_ah_get_ah_flags(&qp->alt_ah_attr) & + IB_AH_GRH) + goto err; + } else { + const struct ib_global_route *grh; + + if (!(rdma_ah_get_ah_flags(&qp->alt_ah_attr) & + IB_AH_GRH)) + goto err; + grh = rdma_ah_read_grh(&qp->alt_ah_attr); + guid = get_sguid(ibp, grh->sgid_index); + if (!gid_ok(&hdr->u.l.grh.dgid, + ibp->rvp.gid_prefix, guid)) + goto err; + if (!gid_ok(&hdr->u.l.grh.sgid, + grh->dgid.global.subnet_prefix, + grh->dgid.global.interface_id)) + goto err; + } + if (!qib_pkey_ok((u16)bth0, + qib_get_pkey(ibp, qp->s_alt_pkey_index))) { + qib_bad_pkey(ibp, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + goto err; + } + /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */ + if ((be16_to_cpu(hdr->lrh[3]) != + rdma_ah_get_dlid(&qp->alt_ah_attr)) || + ppd_from_ibp(ibp)->port != + rdma_ah_get_port_num(&qp->alt_ah_attr)) + goto err; + spin_lock_irqsave(&qp->s_lock, flags); + qib_migrate_qp(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else { + if (!has_grh) { + if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & + IB_AH_GRH) + goto err; + } else { + const struct ib_global_route *grh; + + if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & + IB_AH_GRH)) + goto err; + grh = rdma_ah_read_grh(&qp->remote_ah_attr); + guid = get_sguid(ibp, grh->sgid_index); + if (!gid_ok(&hdr->u.l.grh.dgid, + ibp->rvp.gid_prefix, guid)) + goto err; + if (!gid_ok(&hdr->u.l.grh.sgid, + grh->dgid.global.subnet_prefix, + grh->dgid.global.interface_id)) + goto err; + } + if (!qib_pkey_ok((u16)bth0, + qib_get_pkey(ibp, qp->s_pkey_index))) { + qib_bad_pkey(ibp, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + goto err; + } + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (be16_to_cpu(hdr->lrh[3]) != + rdma_ah_get_dlid(&qp->remote_ah_attr) || + ppd_from_ibp(ibp)->port != qp->port_num) + goto err; + if (qp->s_mig_state == IB_MIG_REARM && + !(bth0 & IB_BTH_MIG_REQ)) + qp->s_mig_state = IB_MIG_ARMED; + } + + return 0; + +err: + return 1; +} + +/** + * qib_ruc_loopback - handle UC and RC lookback requests + * @sqp: the sending QP + * + * This is called from qib_do_send() to + * forward a WQE addressed to the same HCA. + * Note that although we are single threaded due to the tasklet, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +static void qib_ruc_loopback(struct rvt_qp *sqp) +{ + struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + int release; + int ret; + + rcu_read_lock(); + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + qp = rvt_lookup_qpn(rdi, &ibp->rvp, sqp->remote_qpn); + if (!qp) + goto done; + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. 
*/ + if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || + !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= RVT_S_BUSY; + +again: + if (sqp->s_last == READ_ONCE(sqp->s_head)) + goto clr_busy; + wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work reqeust. */ + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || + qp->ibqp.qp_type != sqp->ibqp.qp_type) { + ibp->rvp.n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. + */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof(wc)); + send_status = IB_WC_SUCCESS; + + release = 1; + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + if (wqe->length == 0) + break; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + qp->r_sge.sg_list = NULL; + qp->r_sge.num_sge = 1; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + release = 0; + sqp->s_sge.sg_list = NULL; + sqp->s_sge.num_sge = 1; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->atomic_wr.remote_addr, + wqe->atomic_wr.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = wqe->atomic_wr.compare_add; + *(u64 *) sqp->s_sge.sge.vaddr = + (wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? 
+ (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + sdata, wqe->atomic_wr.swap); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + qib_copy_sge(&qp->r_sge, sge->vaddr, len, release); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (!release) + rvt_put_mr(sge->mr); + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + if (release) + rvt_put_ss(&qp->r_sge); + + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr); + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); + ibp->rvp.n_loop_pkts++; +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + qib_send_complete(sqp, wqe, send_status); + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + ibp->rvp.n_rnr_naks++; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. 
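+ * An s_rnr_retry_cnt of 7 means "retry forever" per the IB spec, which
+ * is why the counter is only decremented below when the configured
+ * count is less than 7.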
+ */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) + goto clr_busy; + rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << + IB_AETH_CREDIT_SHIFT); + goto clr_busy; + +op_err: + send_status = IB_WC_REM_OP_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + rvt_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + qib_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~RVT_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~RVT_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + rcu_read_unlock(); +} + +/** + * qib_make_grh - construct a GRH header + * @ibp: a pointer to the IB port + * @hdr: a pointer to the GRH header being constructed + * @grh: the global route address to send to + * @hwords: the number of 32 bit words of header being sent + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr, + const struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) | + (grh->traffic_class << IB_GRH_TCLASS_SHIFT) | + (grh->flow_label << IB_GRH_FLOW_SHIFT)); + hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = IB_GRH_NEXT_HDR; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; + if (!grh->sgid_index) + hdr->sgid.global.interface_id = ppd_from_ibp(ibp)->guid; + else if (grh->sgid_index < QIB_GUIDS_PER_PORT) + hdr->sgid.global.interface_id = ibp->guids[grh->sgid_index - 1]; + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +void qib_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, + u32 bth0, u32 bth2) +{ + struct qib_qp_priv *priv = qp->priv; + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + u16 lrh0; + u32 nwords; + u32 extra_bytes; + + /* Construct the header. 
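+ * extra_bytes pads the current payload (s_cur_size) out to a 4-byte
+ * boundary and nwords is that padded payload in 32-bit words; the pad
+ * count is advertised to the peer in bits 20-21 of bth[0]. The LRH is
+ * built here, with a GRH inserted after it when the destination
+ * requires one.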
*/ + extra_bytes = -qp->s_cur_size & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = QIB_LRH_BTH; + if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { + qp->s_hdrwords += + qib_make_grh(ibp, &priv->s_hdr->u.l.grh, + rdma_ah_read_grh(&qp->remote_ah_attr), + qp->s_hdrwords, nwords); + lrh0 = QIB_LRH_GRH; + } + lrh0 |= ibp->sl_to_vl[rdma_ah_get_sl(&qp->remote_ah_attr)] << 12 | + rdma_ah_get_sl(&qp->remote_ah_attr) << 4; + priv->s_hdr->lrh[0] = cpu_to_be16(lrh0); + priv->s_hdr->lrh[1] = + cpu_to_be16(rdma_ah_get_dlid(&qp->remote_ah_attr)); + priv->s_hdr->lrh[2] = + cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + priv->s_hdr->lrh[3] = + cpu_to_be16(ppd_from_ibp(ibp)->lid | + rdma_ah_get_path_bits(&qp->remote_ah_attr)); + bth0 |= qib_get_pkey(ibp, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); + this_cpu_inc(ibp->pmastats->n_unicast_xmit); +} + +void _qib_do_send(struct work_struct *work) +{ + struct qib_qp_priv *priv = container_of(work, struct qib_qp_priv, + s_work); + struct rvt_qp *qp = priv->owner; + + qib_do_send(qp); +} + +/** + * qib_do_send - perform a send on a QP + * @qp: pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, two threads could send packets out of order. + */ +void qib_do_send(struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + int (*make_req)(struct rvt_qp *qp, unsigned long *flags); + unsigned long flags; + + if ((qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC) && + (rdma_ah_get_dlid(&qp->remote_ah_attr) & + ~((1 << ppd->lmc) - 1)) == ppd->lid) { + qib_ruc_loopback(qp); + return; + } + + if (qp->ibqp.qp_type == IB_QPT_RC) + make_req = qib_make_rc_req; + else if (qp->ibqp.qp_type == IB_QPT_UC) + make_req = qib_make_uc_req; + else + make_req = qib_make_ud_req; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if (!qib_send_ok(qp)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + } + + qp->s_flags |= RVT_S_BUSY; + + do { + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + spin_unlock_irqrestore(&qp->s_lock, flags); + /* + * If the packet cannot be sent now, return and + * the send tasklet will be woken up later. + */ + if (qib_verbs_send(qp, priv->s_hdr, qp->s_hdrwords, + qp->s_cur_sge, qp->s_cur_size)) + return; + /* Record that s_hdr is empty. */ + qp->s_hdrwords = 0; + spin_lock_irqsave(&qp->s_lock, flags); + } + } while (make_req(qp, &flags)); + + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +/* + * This should be called with s_lock held. 
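+ * Retires the SWQE at s_last: the index is advanced, the SGE/AH
+ * references are dropped, a completion is reported when required, and
+ * any of s_acked/s_cur/s_tail still pointing at the retired slot are
+ * advanced along with it.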
+ */ +void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + return; + + last = qp->s_last; + old_last = last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + /* See post_send() */ + barrier(); + rvt_put_swqe(wqe); + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); + + rvt_qp_swqe_complete(qp, + wqe, + ib_qib_wc_opcode[wqe->wr.opcode], + status); + + if (qp->s_acked == old_last) + qp->s_acked = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c new file mode 100644 index 0000000..12caf3d --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_sd7220.c @@ -0,0 +1,1450 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* + * This file contains all of the code that is specific to the SerDes + * on the QLogic_IB 7220 chip. + */ + +#include +#include +#include +#include + +#include "qib.h" +#include "qib_7220.h" + +#define SD7220_FW_NAME "qlogic/sd7220.fw" +MODULE_FIRMWARE(SD7220_FW_NAME); + +/* + * Same as in qib_iba7220.c, but just the registers needed here. + * Could move whole set to qib_7220.h, but decided better to keep + * local. 
+ */ +#define KREG_IDX(regname) (QIB_7220_##regname##_OFFS / sizeof(u64)) +#define kr_hwerrclear KREG_IDX(HwErrClear) +#define kr_hwerrmask KREG_IDX(HwErrMask) +#define kr_hwerrstatus KREG_IDX(HwErrStatus) +#define kr_ibcstatus KREG_IDX(IBCStatus) +#define kr_ibserdesctrl KREG_IDX(IBSerDesCtrl) +#define kr_scratch KREG_IDX(Scratch) +#define kr_xgxs_cfg KREG_IDX(XGXSCfg) +/* these are used only here, not in qib_iba7220.c */ +#define kr_ibsd_epb_access_ctrl KREG_IDX(ibsd_epb_access_ctrl) +#define kr_ibsd_epb_transaction_reg KREG_IDX(ibsd_epb_transaction_reg) +#define kr_pciesd_epb_transaction_reg KREG_IDX(pciesd_epb_transaction_reg) +#define kr_pciesd_epb_access_ctrl KREG_IDX(pciesd_epb_access_ctrl) +#define kr_serdes_ddsrxeq0 KREG_IDX(SerDes_DDSRXEQ0) + +/* + * The IBSerDesMappTable is a memory that holds values to be stored in + * various SerDes registers by IBC. + */ +#define kr_serdes_maptable KREG_IDX(IBSerDesMappTable) + +/* + * Below used for sdnum parameter, selecting one of the two sections + * used for PCIe, or the single SerDes used for IB. + */ +#define PCIE_SERDES0 0 +#define PCIE_SERDES1 1 + +/* + * The EPB requires addressing in a particular form. EPB_LOC() is intended + * to make #definitions a little more readable. + */ +#define EPB_ADDR_SHF 8 +#define EPB_LOC(chn, elt, reg) \ + (((elt & 0xf) | ((chn & 7) << 4) | ((reg & 0x3f) << 9)) << \ + EPB_ADDR_SHF) +#define EPB_IB_QUAD0_CS_SHF (25) +#define EPB_IB_QUAD0_CS (1U << EPB_IB_QUAD0_CS_SHF) +#define EPB_IB_UC_CS_SHF (26) +#define EPB_PCIE_UC_CS_SHF (27) +#define EPB_GLOBAL_WR (1U << (EPB_ADDR_SHF + 8)) + +/* Forward declarations. */ +static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc, + u32 data, u32 mask); +static int ibsd_mod_allchnls(struct qib_devdata *dd, int loc, int val, + int mask); +static int qib_sd_trimdone_poll(struct qib_devdata *dd); +static void qib_sd_trimdone_monitor(struct qib_devdata *dd, const char *where); +static int qib_sd_setvals(struct qib_devdata *dd); +static int qib_sd_early(struct qib_devdata *dd); +static int qib_sd_dactrim(struct qib_devdata *dd); +static int qib_internal_presets(struct qib_devdata *dd); +/* Tweak the register (CMUCTRL5) that contains the TRIMSELF controls */ +static int qib_sd_trimself(struct qib_devdata *dd, int val); +static int epb_access(struct qib_devdata *dd, int sdnum, int claim); +static int qib_sd7220_ib_load(struct qib_devdata *dd, + const struct firmware *fw); +static int qib_sd7220_ib_vfy(struct qib_devdata *dd, + const struct firmware *fw); + +/* + * Below keeps track of whether the "once per power-on" initialization has + * been done, because uC code Version 1.32.17 or higher allows the uC to + * be reset at will, and Automatic Equalization may require it. So the + * state of the reset "pin", is no longer valid. Instead, we check for the + * actual uC code having been loaded. + */ +static int qib_ibsd_ucode_loaded(struct qib_pportdata *ppd, + const struct firmware *fw) +{ + struct qib_devdata *dd = ppd->dd; + + if (!dd->cspec->serdes_first_init_done && + qib_sd7220_ib_vfy(dd, fw) > 0) + dd->cspec->serdes_first_init_done = 1; + return dd->cspec->serdes_first_init_done; +} + +/* repeat #define for local use. 
"Real" #define is in qib_iba7220.c */ +#define QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR 0x0000004000000000ULL +#define IB_MPREG5 (EPB_LOC(6, 0, 0xE) | (1L << EPB_IB_UC_CS_SHF)) +#define IB_MPREG6 (EPB_LOC(6, 0, 0xF) | (1U << EPB_IB_UC_CS_SHF)) +#define UC_PAR_CLR_D 8 +#define UC_PAR_CLR_M 0xC +#define IB_CTRL2(chn) (EPB_LOC(chn, 7, 3) | EPB_IB_QUAD0_CS) +#define START_EQ1(chan) EPB_LOC(chan, 7, 0x27) + +void qib_sd7220_clr_ibpar(struct qib_devdata *dd) +{ + int ret; + + /* clear, then re-enable parity errs */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, + UC_PAR_CLR_D, UC_PAR_CLR_M); + if (ret < 0) { + qib_dev_err(dd, "Failed clearing IBSerDes Parity err\n"); + goto bail; + } + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0, + UC_PAR_CLR_M); + + qib_read_kreg32(dd, kr_scratch); + udelay(4); + qib_write_kreg(dd, kr_hwerrclear, + QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR); + qib_read_kreg32(dd, kr_scratch); +bail: + return; +} + +/* + * After a reset or other unusual event, the epb interface may need + * to be re-synchronized, between the host and the uC. + * returns <0 for failure to resync within IBSD_RESYNC_TRIES (not expected) + */ +#define IBSD_RESYNC_TRIES 3 +#define IB_PGUDP(chn) (EPB_LOC((chn), 2, 1) | EPB_IB_QUAD0_CS) +#define IB_CMUDONE(chn) (EPB_LOC((chn), 7, 0xF) | EPB_IB_QUAD0_CS) + +static int qib_resync_ibepb(struct qib_devdata *dd) +{ + int ret, pat, tries, chn; + u32 loc; + + ret = -1; + chn = 0; + for (tries = 0; tries < (4 * IBSD_RESYNC_TRIES); ++tries) { + loc = IB_PGUDP(chn); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0); + if (ret < 0) { + qib_dev_err(dd, "Failed read in resync\n"); + continue; + } + if (ret != 0xF0 && ret != 0x55 && tries == 0) + qib_dev_err(dd, "unexpected pattern in resync\n"); + pat = ret ^ 0xA5; /* alternate F0 and 55 */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, pat, 0xFF); + if (ret < 0) { + qib_dev_err(dd, "Failed write in resync\n"); + continue; + } + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0); + if (ret < 0) { + qib_dev_err(dd, "Failed re-read in resync\n"); + continue; + } + if (ret != pat) { + qib_dev_err(dd, "Failed compare1 in resync\n"); + continue; + } + loc = IB_CMUDONE(chn); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0); + if (ret < 0) { + qib_dev_err(dd, "Failed CMUDONE rd in resync\n"); + continue; + } + if ((ret & 0x70) != ((chn << 4) | 0x40)) { + qib_dev_err(dd, "Bad CMUDONE value %02X, chn %d\n", + ret, chn); + continue; + } + if (++chn == 4) + break; /* Success */ + } + return (ret > 0) ? 0 : ret; +} + +/* + * Localize the stuff that should be done to change IB uC reset + * returns <0 for errors. + */ +static int qib_ibsd_reset(struct qib_devdata *dd, int assert_rst) +{ + u64 rst_val; + int ret = 0; + unsigned long flags; + + rst_val = qib_read_kreg64(dd, kr_ibserdesctrl); + if (assert_rst) { + /* + * Vendor recommends "interrupting" uC before reset, to + * minimize possible glitches. 
+ */ + spin_lock_irqsave(&dd->cspec->sdepb_lock, flags); + epb_access(dd, IB_7220_SERDES, 1); + rst_val |= 1ULL; + /* Squelch possible parity error from _asserting_ reset */ + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask & + ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR); + qib_write_kreg(dd, kr_ibserdesctrl, rst_val); + /* flush write, delay to ensure it took effect */ + qib_read_kreg32(dd, kr_scratch); + udelay(2); + /* once it's reset, can remove interrupt */ + epb_access(dd, IB_7220_SERDES, -1); + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + } else { + /* + * Before we de-assert reset, we need to deal with + * possible glitch on the Parity-error line. + * Suppress it around the reset, both in chip-level + * hwerrmask and in IB uC control reg. uC will allow + * it again during startup. + */ + u64 val; + + rst_val &= ~(1ULL); + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask & + ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR); + + ret = qib_resync_ibepb(dd); + if (ret < 0) + qib_dev_err(dd, "unable to re-sync IB EPB\n"); + + /* set uC control regs to suppress parity errs */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG5, 1, 1); + if (ret < 0) + goto bail; + /* IB uC code past Version 1.32.17 allow suppression of wdog */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80, + 0x80); + if (ret < 0) { + qib_dev_err(dd, "Failed to set WDOG disable\n"); + goto bail; + } + qib_write_kreg(dd, kr_ibserdesctrl, rst_val); + /* flush write, delay for startup */ + qib_read_kreg32(dd, kr_scratch); + udelay(1); + /* clear, then re-enable parity errs */ + qib_sd7220_clr_ibpar(dd); + val = qib_read_kreg64(dd, kr_hwerrstatus); + if (val & QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR) { + qib_dev_err(dd, "IBUC Parity still set after RST\n"); + dd->cspec->hwerrmask &= + ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR; + } + qib_write_kreg(dd, kr_hwerrmask, + dd->cspec->hwerrmask); + } + +bail: + return ret; +} + +static void qib_sd_trimdone_monitor(struct qib_devdata *dd, + const char *where) +{ + int ret, chn, baduns; + u64 val; + + if (!where) + where = "?"; + + /* give time for reset to settle out in EPB */ + udelay(2); + + ret = qib_resync_ibepb(dd); + if (ret < 0) + qib_dev_err(dd, "not able to re-sync IB EPB (%s)\n", where); + + /* Do "sacrificial read" to get EPB in sane state after reset */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_CTRL2(0), 0, 0); + if (ret < 0) + qib_dev_err(dd, "Failed TRIMDONE 1st read, (%s)\n", where); + + /* Check/show "summary" Trim-done bit in IBCStatus */ + val = qib_read_kreg64(dd, kr_ibcstatus); + if (!(val & (1ULL << 11))) + qib_dev_err(dd, "IBCS TRIMDONE clear (%s)\n", where); + /* + * Do "dummy read/mod/wr" to get EPB in sane state after reset + * The default value for MPREG6 is 0. + */ + udelay(2); + + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80, 0x80); + if (ret < 0) + qib_dev_err(dd, "Failed Dummy RMW, (%s)\n", where); + udelay(10); + + baduns = 0; + + for (chn = 3; chn >= 0; --chn) { + /* Read CTRL reg for each channel to check TRIMDONE */ + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0, 0); + if (ret < 0) + qib_dev_err(dd, + "Failed checking TRIMDONE, chn %d (%s)\n", + chn, where); + + if (!(ret & 0x10)) { + int probe; + + baduns |= (1 << chn); + qib_dev_err(dd, + "TRIMDONE cleared on chn %d (%02X). 
(%s)\n", + chn, ret, where); + probe = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_PGUDP(0), 0, 0); + qib_dev_err(dd, "probe is %d (%02X)\n", + probe, probe); + probe = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0, 0); + qib_dev_err(dd, "re-read: %d (%02X)\n", + probe, probe); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0x10, 0x10); + if (ret < 0) + qib_dev_err(dd, + "Err on TRIMDONE rewrite1\n"); + } + } + for (chn = 3; chn >= 0; --chn) { + /* Read CTRL reg for each channel to check TRIMDONE */ + if (baduns & (1 << chn)) { + qib_dev_err(dd, + "Resetting TRIMDONE on chn %d (%s)\n", + chn, where); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + IB_CTRL2(chn), 0x10, 0x10); + if (ret < 0) + qib_dev_err(dd, + "Failed re-setting TRIMDONE, chn %d (%s)\n", + chn, where); + } + } +} + +/* + * Below is portion of IBA7220-specific bringup_serdes() that actually + * deals with registers and memory within the SerDes itself. + * Post IB uC code version 1.32.17, was_reset being 1 is not really + * informative, so we double-check. + */ +int qib_sd7220_init(struct qib_devdata *dd) +{ + const struct firmware *fw; + int ret = 1; /* default to failure */ + int first_reset, was_reset; + + /* SERDES MPU reset recorded in D0 */ + was_reset = (qib_read_kreg64(dd, kr_ibserdesctrl) & 1); + if (!was_reset) { + /* entered with reset not asserted, we need to do it */ + qib_ibsd_reset(dd, 1); + qib_sd_trimdone_monitor(dd, "Driver-reload"); + } + + ret = request_firmware(&fw, SD7220_FW_NAME, &dd->pcidev->dev); + if (ret) { + qib_dev_err(dd, "Failed to load IB SERDES image\n"); + goto done; + } + + /* Substitute our deduced value for was_reset */ + ret = qib_ibsd_ucode_loaded(dd->pport, fw); + if (ret < 0) + goto bail; + + first_reset = !ret; /* First reset if IBSD uCode not yet loaded */ + /* + * Alter some regs per vendor latest doc, reset-defaults + * are not right for IB. + */ + ret = qib_sd_early(dd); + if (ret < 0) { + qib_dev_err(dd, "Failed to set IB SERDES early defaults\n"); + goto bail; + } + /* + * Set DAC manual trim IB. + * We only do this once after chip has been reset (usually + * same as once per system boot). + */ + if (first_reset) { + ret = qib_sd_dactrim(dd); + if (ret < 0) { + qib_dev_err(dd, "Failed IB SERDES DAC trim\n"); + goto bail; + } + } + /* + * Set various registers (DDS and RXEQ) that will be + * controlled by IBC (in 1.2 mode) to reasonable preset values + * Calling the "internal" version avoids the "check for needed" + * and "trimdone monitor" that might be counter-productive. + */ + ret = qib_internal_presets(dd); + if (ret < 0) { + qib_dev_err(dd, "Failed to set IB SERDES presets\n"); + goto bail; + } + ret = qib_sd_trimself(dd, 0x80); + if (ret < 0) { + qib_dev_err(dd, "Failed to set IB SERDES TRIMSELF\n"); + goto bail; + } + + /* Load image, then try to verify */ + ret = 0; /* Assume success */ + if (first_reset) { + int vfy; + int trim_done; + + ret = qib_sd7220_ib_load(dd, fw); + if (ret < 0) { + qib_dev_err(dd, "Failed to load IB SERDES image\n"); + goto bail; + } else { + /* Loaded image, try to verify */ + vfy = qib_sd7220_ib_vfy(dd, fw); + if (vfy != ret) { + qib_dev_err(dd, "SERDES PRAM VFY failed\n"); + goto bail; + } /* end if verified */ + } /* end if loaded */ + + /* + * Loaded and verified. Almost good... + * hold "success" in ret + */ + ret = 0; + /* + * Prev steps all worked, continue bringup + * De-assert RESET to uC, only in first reset, to allow + * trimming. 
+ * + * Since our default setup sets START_EQ1 to + * PRESET, we need to clear that for this very first run. + */ + ret = ibsd_mod_allchnls(dd, START_EQ1(0), 0, 0x38); + if (ret < 0) { + qib_dev_err(dd, "Failed clearing START_EQ1\n"); + goto bail; + } + + qib_ibsd_reset(dd, 0); + /* + * If this is not the first reset, trimdone should be set + * already. We may need to check about this. + */ + trim_done = qib_sd_trimdone_poll(dd); + /* + * Whether or not trimdone succeeded, we need to put the + * uC back into reset to avoid a possible fight with the + * IBC state-machine. + */ + qib_ibsd_reset(dd, 1); + + if (!trim_done) { + qib_dev_err(dd, "No TRIMDONE seen\n"); + goto bail; + } + /* + * DEBUG: check each time we reset if trimdone bits have + * gotten cleared, and re-set them. + */ + qib_sd_trimdone_monitor(dd, "First-reset"); + /* Remember so we do not re-do the load, dactrim, etc. */ + dd->cspec->serdes_first_init_done = 1; + } + /* + * setup for channel training and load values for + * RxEq and DDS in tables used by IBC in IB1.2 mode + */ + ret = 0; + if (qib_sd_setvals(dd) >= 0) + goto done; +bail: + ret = 1; +done: + /* start relock timer regardless, but start at 1 second */ + set_7220_relock_poll(dd, -1); + + release_firmware(fw); + return ret; +} + +#define EPB_ACC_REQ 1 +#define EPB_ACC_GNT 0x100 +#define EPB_DATA_MASK 0xFF +#define EPB_RD (1ULL << 24) +#define EPB_TRANS_RDY (1ULL << 31) +#define EPB_TRANS_ERR (1ULL << 30) +#define EPB_TRANS_TRIES 5 + +/* + * query, claim, release ownership of the EPB (External Parallel Bus) + * for a specified SERDES. + * the "claim" parameter is >0 to claim, <0 to release, 0 to query. + * Returns <0 for errors, >0 if we had ownership, else 0. + */ +static int epb_access(struct qib_devdata *dd, int sdnum, int claim) +{ + u16 acc; + u64 accval; + int owned = 0; + u64 oct_sel = 0; + + switch (sdnum) { + case IB_7220_SERDES: + /* + * The IB SERDES "ownership" is fairly simple. A single each + * request/grant. + */ + acc = kr_ibsd_epb_access_ctrl; + break; + + case PCIE_SERDES0: + case PCIE_SERDES1: + /* PCIe SERDES has two "octants", need to select which */ + acc = kr_pciesd_epb_access_ctrl; + oct_sel = (2 << (sdnum - PCIE_SERDES0)); + break; + + default: + return 0; + } + + /* Make sure any outstanding transaction was seen */ + qib_read_kreg32(dd, kr_scratch); + udelay(15); + + accval = qib_read_kreg32(dd, acc); + + owned = !!(accval & EPB_ACC_GNT); + if (claim < 0) { + /* Need to release */ + u64 pollval; + /* + * The only writeable bits are the request and CS. 
+ * Both should be clear + */ + u64 newval = 0; + + qib_write_kreg(dd, acc, newval); + /* First read after write is not trustworthy */ + pollval = qib_read_kreg32(dd, acc); + udelay(5); + pollval = qib_read_kreg32(dd, acc); + if (pollval & EPB_ACC_GNT) + owned = -1; + } else if (claim > 0) { + /* Need to claim */ + u64 pollval; + u64 newval = EPB_ACC_REQ | oct_sel; + + qib_write_kreg(dd, acc, newval); + /* First read after write is not trustworthy */ + pollval = qib_read_kreg32(dd, acc); + udelay(5); + pollval = qib_read_kreg32(dd, acc); + if (!(pollval & EPB_ACC_GNT)) + owned = -1; + } + return owned; +} + +/* + * Lemma to deal with race condition of write..read to epb regs + */ +static int epb_trans(struct qib_devdata *dd, u16 reg, u64 i_val, u64 *o_vp) +{ + int tries; + u64 transval; + + qib_write_kreg(dd, reg, i_val); + /* Throw away first read, as RDY bit may be stale */ + transval = qib_read_kreg64(dd, reg); + + for (tries = EPB_TRANS_TRIES; tries; --tries) { + transval = qib_read_kreg32(dd, reg); + if (transval & EPB_TRANS_RDY) + break; + udelay(5); + } + if (transval & EPB_TRANS_ERR) + return -1; + if (tries > 0 && o_vp) + *o_vp = transval; + return tries; +} + +/** + * qib_sd7220_reg_mod - modify SERDES register + * @dd: the qlogic_ib device + * @sdnum: which SERDES to access + * @loc: location - channel, element, register, as packed by EPB_LOC() macro. + * @wd: Write Data - value to set in register + * @mask: ones where data should be spliced into reg. + * + * Basic register read/modify/write, with un-needed acesses elided. That is, + * a mask of zero will prevent write, while a mask of 0xFF will prevent read. + * returns current (presumed, if a write was done) contents of selected + * register, or <0 if errors. + */ +static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc, + u32 wd, u32 mask) +{ + u16 trans; + u64 transval; + int owned; + int tries, ret; + unsigned long flags; + + switch (sdnum) { + case IB_7220_SERDES: + trans = kr_ibsd_epb_transaction_reg; + break; + + case PCIE_SERDES0: + case PCIE_SERDES1: + trans = kr_pciesd_epb_transaction_reg; + break; + + default: + return -1; + } + + /* + * All access is locked in software (vs other host threads) and + * hardware (vs uC access). + */ + spin_lock_irqsave(&dd->cspec->sdepb_lock, flags); + + owned = epb_access(dd, sdnum, 1); + if (owned < 0) { + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + return -1; + } + ret = 0; + for (tries = EPB_TRANS_TRIES; tries; --tries) { + transval = qib_read_kreg32(dd, trans); + if (transval & EPB_TRANS_RDY) + break; + udelay(5); + } + + if (tries > 0) { + tries = 1; /* to make read-skip work */ + if (mask != 0xFF) { + /* + * Not a pure write, so need to read. + * loc encodes chip-select as well as address + */ + transval = loc | EPB_RD; + tries = epb_trans(dd, trans, transval, &transval); + } + if (tries > 0 && mask != 0) { + /* + * Not a pure read, so need to write. + */ + wd = (wd & mask) | (transval & ~mask); + transval = loc | (wd & EPB_DATA_MASK); + tries = epb_trans(dd, trans, transval, &transval); + } + } + /* else, failed to see ready, what error-handling? */ + + /* + * Release bus. Failure is an error. + */ + if (epb_access(dd, sdnum, -1) < 0) + ret = -1; + else + ret = transval & EPB_DATA_MASK; + + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + if (tries <= 0) + ret = -1; + return ret; +} + +#define EPB_ROM_R (2) +#define EPB_ROM_W (1) +/* + * Below, all uC-related, use appropriate UC_CS, depending + * on which SerDes is used. 
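+ * For example, EPB_UC_CTL below is EPB_LOC(6, 0, 0): element 0,
+ * register 0 on channel 6, i.e. (6 << 4) << EPB_ADDR_SHF == 0x6000,
+ * before the caller ORs in the uC chip-select bit.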
+ */ +#define EPB_UC_CTL EPB_LOC(6, 0, 0) +#define EPB_MADDRL EPB_LOC(6, 0, 2) +#define EPB_MADDRH EPB_LOC(6, 0, 3) +#define EPB_ROMDATA EPB_LOC(6, 0, 4) +#define EPB_RAMDATA EPB_LOC(6, 0, 5) + +/* Transfer date to/from uC Program RAM of IB or PCIe SerDes */ +static int qib_sd7220_ram_xfer(struct qib_devdata *dd, int sdnum, u32 loc, + u8 *buf, int cnt, int rd_notwr) +{ + u16 trans; + u64 transval; + u64 csbit; + int owned; + int tries; + int sofar; + int addr; + int ret; + unsigned long flags; + + /* Pick appropriate transaction reg and "Chip select" for this serdes */ + switch (sdnum) { + case IB_7220_SERDES: + csbit = 1ULL << EPB_IB_UC_CS_SHF; + trans = kr_ibsd_epb_transaction_reg; + break; + + case PCIE_SERDES0: + case PCIE_SERDES1: + /* PCIe SERDES has uC "chip select" in different bit, too */ + csbit = 1ULL << EPB_PCIE_UC_CS_SHF; + trans = kr_pciesd_epb_transaction_reg; + break; + + default: + return -1; + } + + spin_lock_irqsave(&dd->cspec->sdepb_lock, flags); + + owned = epb_access(dd, sdnum, 1); + if (owned < 0) { + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + return -1; + } + + /* + * In future code, we may need to distinguish several address ranges, + * and select various memories based on this. For now, just trim + * "loc" (location including address and memory select) to + * "addr" (address within memory). we will only support PRAM + * The memory is 8KB. + */ + addr = loc & 0x1FFF; + for (tries = EPB_TRANS_TRIES; tries; --tries) { + transval = qib_read_kreg32(dd, trans); + if (transval & EPB_TRANS_RDY) + break; + udelay(5); + } + + sofar = 0; + if (tries > 0) { + /* + * Every "memory" access is doubly-indirect. + * We set two bytes of address, then read/write + * one or mores bytes of data. + */ + + /* First, we set control to "Read" or "Write" */ + transval = csbit | EPB_UC_CTL | + (rd_notwr ? EPB_ROM_R : EPB_ROM_W); + tries = epb_trans(dd, trans, transval, &transval); + while (tries > 0 && sofar < cnt) { + if (!sofar) { + /* Only set address at start of chunk */ + int addrbyte = (addr + sofar) >> 8; + + transval = csbit | EPB_MADDRH | addrbyte; + tries = epb_trans(dd, trans, transval, + &transval); + if (tries <= 0) + break; + addrbyte = (addr + sofar) & 0xFF; + transval = csbit | EPB_MADDRL | addrbyte; + tries = epb_trans(dd, trans, transval, + &transval); + if (tries <= 0) + break; + } + + if (rd_notwr) + transval = csbit | EPB_ROMDATA | EPB_RD; + else + transval = csbit | EPB_ROMDATA | buf[sofar]; + tries = epb_trans(dd, trans, transval, &transval); + if (tries <= 0) + break; + if (rd_notwr) + buf[sofar] = transval & EPB_DATA_MASK; + ++sofar; + } + /* Finally, clear control-bit for Read or Write */ + transval = csbit | EPB_UC_CTL; + tries = epb_trans(dd, trans, transval, &transval); + } + + ret = sofar; + /* Release bus. 
Failure is an error */ + if (epb_access(dd, sdnum, -1) < 0) + ret = -1; + + spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags); + if (tries <= 0) + ret = -1; + return ret; +} + +#define PROG_CHUNK 64 + +static int qib_sd7220_prog_ld(struct qib_devdata *dd, int sdnum, + const u8 *img, int len, int offset) +{ + int cnt, sofar, req; + + sofar = 0; + while (sofar < len) { + req = len - sofar; + if (req > PROG_CHUNK) + req = PROG_CHUNK; + cnt = qib_sd7220_ram_xfer(dd, sdnum, offset + sofar, + (u8 *)img + sofar, req, 0); + if (cnt < req) { + sofar = -1; + break; + } + sofar += req; + } + return sofar; +} + +#define VFY_CHUNK 64 +#define SD_PRAM_ERROR_LIMIT 42 + +static int qib_sd7220_prog_vfy(struct qib_devdata *dd, int sdnum, + const u8 *img, int len, int offset) +{ + int cnt, sofar, req, idx, errors; + unsigned char readback[VFY_CHUNK]; + + errors = 0; + sofar = 0; + while (sofar < len) { + req = len - sofar; + if (req > VFY_CHUNK) + req = VFY_CHUNK; + cnt = qib_sd7220_ram_xfer(dd, sdnum, sofar + offset, + readback, req, 1); + if (cnt < req) { + /* failed in read itself */ + sofar = -1; + break; + } + for (idx = 0; idx < cnt; ++idx) { + if (readback[idx] != img[idx+sofar]) + ++errors; + } + sofar += cnt; + } + return errors ? -errors : sofar; +} + +static int +qib_sd7220_ib_load(struct qib_devdata *dd, const struct firmware *fw) +{ + return qib_sd7220_prog_ld(dd, IB_7220_SERDES, fw->data, fw->size, 0); +} + +static int +qib_sd7220_ib_vfy(struct qib_devdata *dd, const struct firmware *fw) +{ + return qib_sd7220_prog_vfy(dd, IB_7220_SERDES, fw->data, fw->size, 0); +} + +/* + * IRQ not set up at this point in init, so we poll. + */ +#define IB_SERDES_TRIM_DONE (1ULL << 11) +#define TRIM_TMO (15) + +static int qib_sd_trimdone_poll(struct qib_devdata *dd) +{ + int trim_tmo, ret; + uint64_t val; + + /* + * Default to failure, so IBC will not start + * without IB_SERDES_TRIM_DONE. + */ + ret = 0; + for (trim_tmo = 0; trim_tmo < TRIM_TMO; ++trim_tmo) { + val = qib_read_kreg64(dd, kr_ibcstatus); + if (val & IB_SERDES_TRIM_DONE) { + ret = 1; + break; + } + msleep(20); + } + if (trim_tmo >= TRIM_TMO) { + qib_dev_err(dd, "No TRIMDONE in %d tries\n", trim_tmo); + ret = 0; + } + return ret; +} + +#define TX_FAST_ELT (9) + +/* + * Set the "negotiation" values for SERDES. These are used by the IB1.2 + * link negotiation. Macros below are attempt to keep the values a + * little more human-editable. + * First, values related to Drive De-emphasis Settings. 
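+ * Each DDS_VAL() row interleaves a DDR setting (the *_d arguments) and
+ * an SDR setting (the *_s arguments) - amplitude, main cursor and the
+ * ipst/ipre emphasis terms - into NUM_DDS_REGS register bytes, which
+ * land in the element-9 registers listed LSB-first in DDS_REG_MAP.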
+ */ + +#define NUM_DDS_REGS 6 +#define DDS_REG_MAP 0x76A910 /* LSB-first list of regs (in elt 9) to mod */ + +#define DDS_VAL(amp_d, main_d, ipst_d, ipre_d, amp_s, main_s, ipst_s, ipre_s) \ + { { ((amp_d & 0x1F) << 1) | 1, ((amp_s & 0x1F) << 1) | 1, \ + (main_d << 3) | 4 | (ipre_d >> 2), \ + (main_s << 3) | 4 | (ipre_s >> 2), \ + ((ipst_d & 0xF) << 1) | ((ipre_d & 3) << 6) | 0x21, \ + ((ipst_s & 0xF) << 1) | ((ipre_s & 3) << 6) | 0x21 } } + +static struct dds_init { + uint8_t reg_vals[NUM_DDS_REGS]; +} dds_init_vals[] = { + /* DDR(FDR) SDR(HDR) */ + /* Vendor recommends below for 3m cable */ +#define DDS_3M 0 + DDS_VAL(31, 19, 12, 0, 29, 22, 9, 0), + DDS_VAL(31, 12, 15, 4, 31, 15, 15, 1), + DDS_VAL(31, 13, 15, 3, 31, 16, 15, 0), + DDS_VAL(31, 14, 15, 2, 31, 17, 14, 0), + DDS_VAL(31, 15, 15, 1, 31, 18, 13, 0), + DDS_VAL(31, 16, 15, 0, 31, 19, 12, 0), + DDS_VAL(31, 17, 14, 0, 31, 20, 11, 0), + DDS_VAL(31, 18, 13, 0, 30, 21, 10, 0), + DDS_VAL(31, 20, 11, 0, 28, 23, 8, 0), + DDS_VAL(31, 21, 10, 0, 27, 24, 7, 0), + DDS_VAL(31, 22, 9, 0, 26, 25, 6, 0), + DDS_VAL(30, 23, 8, 0, 25, 26, 5, 0), + DDS_VAL(29, 24, 7, 0, 23, 27, 4, 0), + /* Vendor recommends below for 1m cable */ +#define DDS_1M 13 + DDS_VAL(28, 25, 6, 0, 21, 28, 3, 0), + DDS_VAL(27, 26, 5, 0, 19, 29, 2, 0), + DDS_VAL(25, 27, 4, 0, 17, 30, 1, 0) +}; + +/* + * Now the RXEQ section of the table. + */ +/* Hardware packs an element number and register address thus: */ +#define RXEQ_INIT_RDESC(elt, addr) (((elt) & 0xF) | ((addr) << 4)) +#define RXEQ_VAL(elt, adr, val0, val1, val2, val3) \ + {RXEQ_INIT_RDESC((elt), (adr)), {(val0), (val1), (val2), (val3)} } + +#define RXEQ_VAL_ALL(elt, adr, val) \ + {RXEQ_INIT_RDESC((elt), (adr)), {(val), (val), (val), (val)} } + +#define RXEQ_SDR_DFELTH 0 +#define RXEQ_SDR_TLTH 0 +#define RXEQ_SDR_G1CNT_Z1CNT 0x11 +#define RXEQ_SDR_ZCNT 23 + +static struct rxeq_init { + u16 rdesc; /* in form used in SerDesDDSRXEQ */ + u8 rdata[4]; +} rxeq_init_vals[] = { + /* Set Rcv Eq. to Preset node */ + RXEQ_VAL_ALL(7, 0x27, 0x10), + /* Set DFELTHFDR/HDR thresholds */ + RXEQ_VAL(7, 8, 0, 0, 0, 0), /* FDR, was 0, 1, 2, 3 */ + RXEQ_VAL(7, 0x21, 0, 0, 0, 0), /* HDR */ + /* Set TLTHFDR/HDR theshold */ + RXEQ_VAL(7, 9, 2, 2, 2, 2), /* FDR, was 0, 2, 4, 6 */ + RXEQ_VAL(7, 0x23, 2, 2, 2, 2), /* HDR, was 0, 1, 2, 3 */ + /* Set Preamp setting 2 (ZFR/ZCNT) */ + RXEQ_VAL(7, 0x1B, 12, 12, 12, 12), /* FDR, was 12, 16, 20, 24 */ + RXEQ_VAL(7, 0x1C, 12, 12, 12, 12), /* HDR, was 12, 16, 20, 24 */ + /* Set Preamp DC gain and Setting 1 (GFR/GHR) */ + RXEQ_VAL(7, 0x1E, 16, 16, 16, 16), /* FDR, was 16, 17, 18, 20 */ + RXEQ_VAL(7, 0x1F, 16, 16, 16, 16), /* HDR, was 16, 17, 18, 20 */ + /* Toggle RELOCK (in VCDL_CTRL0) to lock to data */ + RXEQ_VAL_ALL(6, 6, 0x20), /* Set D5 High */ + RXEQ_VAL_ALL(6, 6, 0), /* Set D5 Low */ +}; + +/* There are 17 values from vendor, but IBC only accesses the first 16 */ +#define DDS_ROWS (16) +#define RXEQ_ROWS ARRAY_SIZE(rxeq_init_vals) + +static int qib_sd_setvals(struct qib_devdata *dd) +{ + int idx, midx; + int min_idx; /* Minimum index for this portion of table */ + uint32_t dds_reg_map; + u64 __iomem *taddr, *iaddr; + uint64_t data; + uint64_t sdctl; + + taddr = dd->kregbase + kr_serdes_maptable; + iaddr = dd->kregbase + kr_serdes_ddsrxeq0; + + /* + * Init the DDS section of the table. + * Each "row" of the table provokes NUM_DDS_REG writes, to the + * registers indicated in DDS_REG_MAP. 
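+ * The 5-bit IBSerDesCtrl fields at bits 8 and 13 are loaded first with
+ * the DDS register count and the RXEQ row count, which appear to tell
+ * IBC how much of the mapping table to use.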
+ */ + sdctl = qib_read_kreg64(dd, kr_ibserdesctrl); + sdctl = (sdctl & ~(0x1f << 8)) | (NUM_DDS_REGS << 8); + sdctl = (sdctl & ~(0x1f << 13)) | (RXEQ_ROWS << 13); + qib_write_kreg(dd, kr_ibserdesctrl, sdctl); + + /* + * Iterate down table within loop for each register to store. + */ + dds_reg_map = DDS_REG_MAP; + for (idx = 0; idx < NUM_DDS_REGS; ++idx) { + data = ((dds_reg_map & 0xF) << 4) | TX_FAST_ELT; + writeq(data, iaddr + idx); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + dds_reg_map >>= 4; + for (midx = 0; midx < DDS_ROWS; ++midx) { + u64 __iomem *daddr = taddr + ((midx << 4) + idx); + + data = dds_init_vals[midx].reg_vals[idx]; + writeq(data, daddr); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + } /* End inner for (vals for this reg, each row) */ + } /* end outer for (regs to be stored) */ + + /* + * Init the RXEQ section of the table. + * This runs in a different order, as the pattern of + * register references is more complex, but there are only + * four "data" values per register. + */ + min_idx = idx; /* RXEQ indices pick up where DDS left off */ + taddr += 0x100; /* RXEQ data is in second half of table */ + /* Iterate through RXEQ register addresses */ + for (idx = 0; idx < RXEQ_ROWS; ++idx) { + int didx; /* "destination" */ + int vidx; + + /* didx is offset by min_idx to address RXEQ range of regs */ + didx = idx + min_idx; + /* Store the next RXEQ register address */ + writeq(rxeq_init_vals[idx].rdesc, iaddr + didx); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + /* Iterate through RXEQ values */ + for (vidx = 0; vidx < 4; vidx++) { + data = rxeq_init_vals[idx].rdata[vidx]; + writeq(data, taddr + (vidx << 6) + idx); + mmiowb(); + qib_read_kreg32(dd, kr_scratch); + } + } /* end outer for (Reg-writes for RXEQ) */ + return 0; +} + +#define CMUCTRL5 EPB_LOC(7, 0, 0x15) +#define RXHSCTRL0(chan) EPB_LOC(chan, 6, 0) +#define VCDL_DAC2(chan) EPB_LOC(chan, 6, 5) +#define VCDL_CTRL0(chan) EPB_LOC(chan, 6, 6) +#define VCDL_CTRL2(chan) EPB_LOC(chan, 6, 8) +#define START_EQ2(chan) EPB_LOC(chan, 7, 0x28) + +/* + * Repeat a "store" across all channels of the IB SerDes. + * Although nominally it inherits the "read value" of the last + * channel it modified, the only really useful return is <0 for + * failure, >= 0 for success. The parameter 'loc' is assumed to + * be the location in some channel of the register to be modified + * The caller can specify use of the "gang write" option of EPB, + * in which case we use the specified channel data for any fields + * not explicitely written. + */ +static int ibsd_mod_allchnls(struct qib_devdata *dd, int loc, int val, + int mask) +{ + int ret = -1; + int chnl; + + if (loc & EPB_GLOBAL_WR) { + /* + * Our caller has assured us that we can set all four + * channels at once. Trust that. If mask is not 0xFF, + * we will read the _specified_ channel for our starting + * value. 
+ */ + loc |= (1U << EPB_IB_QUAD0_CS_SHF); + chnl = (loc >> (4 + EPB_ADDR_SHF)) & 7; + if (mask != 0xFF) { + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, + loc & ~EPB_GLOBAL_WR, 0, 0); + if (ret < 0) { + int sloc = loc >> EPB_ADDR_SHF; + + qib_dev_err(dd, + "pre-read failed: elt %d, addr 0x%X, chnl %d\n", + (sloc & 0xF), + (sloc >> 9) & 0x3f, chnl); + return ret; + } + val = (ret & ~mask) | (val & mask); + } + loc &= ~(7 << (4+EPB_ADDR_SHF)); + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, val, 0xFF); + if (ret < 0) { + int sloc = loc >> EPB_ADDR_SHF; + + qib_dev_err(dd, + "Global WR failed: elt %d, addr 0x%X, val %02X\n", + (sloc & 0xF), (sloc >> 9) & 0x3f, val); + } + return ret; + } + /* Clear "channel" and set CS so we can simply iterate */ + loc &= ~(7 << (4+EPB_ADDR_SHF)); + loc |= (1U << EPB_IB_QUAD0_CS_SHF); + for (chnl = 0; chnl < 4; ++chnl) { + int cloc = loc | (chnl << (4+EPB_ADDR_SHF)); + + ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, cloc, val, mask); + if (ret < 0) { + int sloc = loc >> EPB_ADDR_SHF; + + qib_dev_err(dd, + "Write failed: elt %d, addr 0x%X, chnl %d, val 0x%02X, mask 0x%02X\n", + (sloc & 0xF), (sloc >> 9) & 0x3f, chnl, + val & 0xFF, mask & 0xFF); + break; + } + } + return ret; +} + +/* + * Set the Tx values normally modified by IBC in IB1.2 mode to default + * values, as gotten from first row of init table. + */ +static int set_dds_vals(struct qib_devdata *dd, struct dds_init *ddi) +{ + int ret; + int idx, reg, data; + uint32_t regmap; + + regmap = DDS_REG_MAP; + for (idx = 0; idx < NUM_DDS_REGS; ++idx) { + reg = (regmap & 0xF); + regmap >>= 4; + data = ddi->reg_vals[idx]; + /* Vendor says RMW not needed for these regs, use 0xFF mask */ + ret = ibsd_mod_allchnls(dd, EPB_LOC(0, 9, reg), data, 0xFF); + if (ret < 0) + break; + } + return ret; +} + +/* + * Set the Rx values normally modified by IBC in IB1.2 mode to default + * values, as gotten from selected column of init table. + */ +static int set_rxeq_vals(struct qib_devdata *dd, int vsel) +{ + int ret; + int ridx; + int cnt = ARRAY_SIZE(rxeq_init_vals); + + for (ridx = 0; ridx < cnt; ++ridx) { + int elt, reg, val, loc; + + elt = rxeq_init_vals[ridx].rdesc & 0xF; + reg = rxeq_init_vals[ridx].rdesc >> 4; + loc = EPB_LOC(0, elt, reg); + val = rxeq_init_vals[ridx].rdata[vsel]; + /* mask of 0xFF, because hardware does full-byte store. */ + ret = ibsd_mod_allchnls(dd, loc, val, 0xFF); + if (ret < 0) + break; + } + return ret; +} + +/* + * Set the default values (row 0) for DDR Driver Demphasis. + * we do this initially and whenever we turn off IB-1.2 + * + * The "default" values for Rx equalization are also stored to + * SerDes registers. Formerly (and still default), we used set 2. + * For experimenting with cables and link-partners, we allow changing + * that via a module parameter. 
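+ * (For example, a hypothetical "options ib_qib rxeq_default_set=1"
+ * line in modprobe.d would select set 1 at module load.)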
+ */ +static unsigned qib_rxeq_set = 2; +module_param_named(rxeq_default_set, qib_rxeq_set, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(rxeq_default_set, + "Which set [0..3] of Rx Equalization values is default"); + +static int qib_internal_presets(struct qib_devdata *dd) +{ + int ret = 0; + + ret = set_dds_vals(dd, dds_init_vals + DDS_3M); + + if (ret < 0) + qib_dev_err(dd, "Failed to set default DDS values\n"); + ret = set_rxeq_vals(dd, qib_rxeq_set & 3); + if (ret < 0) + qib_dev_err(dd, "Failed to set default RXEQ values\n"); + return ret; +} + +int qib_sd7220_presets(struct qib_devdata *dd) +{ + int ret = 0; + + if (!dd->cspec->presets_needed) + return ret; + dd->cspec->presets_needed = 0; + /* Assert uC reset, so we don't clash with it. */ + qib_ibsd_reset(dd, 1); + udelay(2); + qib_sd_trimdone_monitor(dd, "link-down"); + + ret = qib_internal_presets(dd); + return ret; +} + +static int qib_sd_trimself(struct qib_devdata *dd, int val) +{ + int loc = CMUCTRL5 | (1U << EPB_IB_QUAD0_CS_SHF); + + return qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, val, 0xFF); +} + +static int qib_sd_early(struct qib_devdata *dd) +{ + int ret; + + ret = ibsd_mod_allchnls(dd, RXHSCTRL0(0) | EPB_GLOBAL_WR, 0xD4, 0xFF); + if (ret < 0) + goto bail; + ret = ibsd_mod_allchnls(dd, START_EQ1(0) | EPB_GLOBAL_WR, 0x10, 0xFF); + if (ret < 0) + goto bail; + ret = ibsd_mod_allchnls(dd, START_EQ2(0) | EPB_GLOBAL_WR, 0x30, 0xFF); +bail: + return ret; +} + +#define BACTRL(chnl) EPB_LOC(chnl, 6, 0x0E) +#define LDOUTCTRL1(chnl) EPB_LOC(chnl, 7, 6) +#define RXHSSTATUS(chnl) EPB_LOC(chnl, 6, 0xF) + +static int qib_sd_dactrim(struct qib_devdata *dd) +{ + int ret; + + ret = ibsd_mod_allchnls(dd, VCDL_DAC2(0) | EPB_GLOBAL_WR, 0x2D, 0xFF); + if (ret < 0) + goto bail; + + /* more fine-tuning of what will be default */ + ret = ibsd_mod_allchnls(dd, VCDL_CTRL2(0), 3, 0xF); + if (ret < 0) + goto bail; + + ret = ibsd_mod_allchnls(dd, BACTRL(0) | EPB_GLOBAL_WR, 0x40, 0xFF); + if (ret < 0) + goto bail; + + ret = ibsd_mod_allchnls(dd, LDOUTCTRL1(0) | EPB_GLOBAL_WR, 0x04, 0xFF); + if (ret < 0) + goto bail; + + ret = ibsd_mod_allchnls(dd, RXHSSTATUS(0) | EPB_GLOBAL_WR, 0x04, 0xFF); + if (ret < 0) + goto bail; + + /* + * Delay for max possible number of steps, with slop. + * Each step is about 4usec. 
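+ * The udelay(415) below is about a hundred such steps, with margin.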
+ */ + udelay(415); + + ret = ibsd_mod_allchnls(dd, LDOUTCTRL1(0) | EPB_GLOBAL_WR, 0x00, 0xFF); + +bail: + return ret; +} + +#define RELOCK_FIRST_MS 3 +#define RXLSPPM(chan) EPB_LOC(chan, 0, 2) +void toggle_7220_rclkrls(struct qib_devdata *dd) +{ + int loc = RXLSPPM(0) | EPB_GLOBAL_WR; + int ret; + + ret = ibsd_mod_allchnls(dd, loc, 0, 0x80); + if (ret < 0) + qib_dev_err(dd, "RCLKRLS failed to clear D7\n"); + else { + udelay(1); + ibsd_mod_allchnls(dd, loc, 0x80, 0x80); + } + /* And again for good measure */ + udelay(1); + ret = ibsd_mod_allchnls(dd, loc, 0, 0x80); + if (ret < 0) + qib_dev_err(dd, "RCLKRLS failed to clear D7\n"); + else { + udelay(1); + ibsd_mod_allchnls(dd, loc, 0x80, 0x80); + } + /* Now reset xgxs and IBC to complete the recovery */ + dd->f_xgxs_reset(dd->pport); +} + +/* + * Shut down the timer that polls for relock occasions, if needed + * this is "hooked" from qib_7220_quiet_serdes(), which is called + * just before qib_shutdown_device() in qib_driver.c shuts down all + * the other timers + */ +void shutdown_7220_relock_poll(struct qib_devdata *dd) +{ + if (dd->cspec->relock_timer_active) + del_timer_sync(&dd->cspec->relock_timer); +} + +static unsigned qib_relock_by_timer = 1; +module_param_named(relock_by_timer, qib_relock_by_timer, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(relock_by_timer, "Allow relock attempt if link not up"); + +static void qib_run_relock(struct timer_list *t) +{ + struct qib_chip_specific *cs = from_timer(cs, t, relock_timer); + struct qib_devdata *dd = cs->dd; + struct qib_pportdata *ppd = dd->pport; + int timeoff; + + /* + * Check link-training state for "stuck" state, when down. + * if found, try relock and schedule another try at + * exponentially growing delay, maxed at one second. + * if not stuck, our work is done. + */ + if ((dd->flags & QIB_INITTED) && !(ppd->lflags & + (QIBL_IB_AUTONEG_INPROG | QIBL_LINKINIT | QIBL_LINKARMED | + QIBL_LINKACTIVE))) { + if (qib_relock_by_timer) { + if (!(ppd->lflags & QIBL_IB_LINK_DISABLED)) + toggle_7220_rclkrls(dd); + } + /* re-set timer for next check */ + timeoff = cs->relock_interval << 1; + if (timeoff > HZ) + timeoff = HZ; + cs->relock_interval = timeoff; + } else + timeoff = HZ; + mod_timer(&cs->relock_timer, jiffies + timeoff); +} + +void set_7220_relock_poll(struct qib_devdata *dd, int ibup) +{ + struct qib_chip_specific *cs = dd->cspec; + + if (ibup) { + /* We are now up, relax timer to 1 second interval */ + if (cs->relock_timer_active) { + cs->relock_interval = HZ; + mod_timer(&cs->relock_timer, jiffies + HZ); + } + } else { + /* Transition to down, (re-)set timer to short interval. */ + unsigned int timeout; + + timeout = msecs_to_jiffies(RELOCK_FIRST_MS); + if (timeout == 0) + timeout = 1; + /* If timer has not yet been started, do so. */ + if (!cs->relock_timer_active) { + cs->relock_timer_active = 1; + timer_setup(&cs->relock_timer, qib_run_relock, 0); + cs->relock_interval = timeout; + cs->relock_timer.expires = jiffies + timeout; + add_timer(&cs->relock_timer); + } else { + cs->relock_interval = timeout; + mod_timer(&cs->relock_timer, jiffies + timeout); + } + } +} diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c new file mode 100644 index 0000000..d0723d4 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_sdma.c @@ -0,0 +1,1020 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2007 - 2012 QLogic Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" + +/* default pio off, sdma on */ +static ushort sdma_descq_cnt = 256; +module_param_named(sdma_descq_cnt, sdma_descq_cnt, ushort, S_IRUGO); +MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries"); + +/* + * Bits defined in the send DMA descriptor. + */ +#define SDMA_DESC_LAST (1ULL << 11) +#define SDMA_DESC_FIRST (1ULL << 12) +#define SDMA_DESC_DMA_HEAD (1ULL << 13) +#define SDMA_DESC_USE_LARGE_BUF (1ULL << 14) +#define SDMA_DESC_INTR (1ULL << 15) +#define SDMA_DESC_COUNT_LSB 16 +#define SDMA_DESC_GEN_LSB 30 + +/* declare all statics here rather than keep sorting */ +static int alloc_sdma(struct qib_pportdata *); +static void sdma_complete(struct kref *); +static void sdma_finalput(struct qib_sdma_state *); +static void sdma_get(struct qib_sdma_state *); +static void sdma_put(struct qib_sdma_state *); +static void sdma_set_state(struct qib_pportdata *, enum qib_sdma_states); +static void sdma_start_sw_clean_up(struct qib_pportdata *); +static void sdma_sw_clean_up_task(unsigned long); +static void unmap_desc(struct qib_pportdata *, unsigned); + +static void sdma_get(struct qib_sdma_state *ss) +{ + kref_get(&ss->kref); +} + +static void sdma_complete(struct kref *kref) +{ + struct qib_sdma_state *ss = + container_of(kref, struct qib_sdma_state, kref); + + complete(&ss->comp); +} + +static void sdma_put(struct qib_sdma_state *ss) +{ + kref_put(&ss->kref, sdma_complete); +} + +static void sdma_finalput(struct qib_sdma_state *ss) +{ + sdma_put(ss); + wait_for_completion(&ss->comp); +} + +/* + * Complete all the sdma requests on the active list, in the correct + * order, and with appropriate processing. Called when cleaning up + * after sdma shutdown, and when new sdma requests are submitted for + * a link that is down. This matches what is done for requests + * that complete normally, it's just the full list. 
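+ * Descriptors are unmapped for requests that asked for it, and each
+ * callback is invoked with status QIB_SDMA_TXREQ_S_ABORTED.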
+ * + * Must be called with sdma_lock held + */ +static void clear_sdma_activelist(struct qib_pportdata *ppd) +{ + struct qib_sdma_txreq *txp, *txp_next; + + list_for_each_entry_safe(txp, txp_next, &ppd->sdma_activelist, list) { + list_del_init(&txp->list); + if (txp->flags & QIB_SDMA_TXREQ_F_FREEDESC) { + unsigned idx; + + idx = txp->start_idx; + while (idx != txp->next_descq_idx) { + unmap_desc(ppd, idx); + if (++idx == ppd->sdma_descq_cnt) + idx = 0; + } + } + if (txp->callback) + (*txp->callback)(txp, QIB_SDMA_TXREQ_S_ABORTED); + } +} + +static void sdma_sw_clean_up_task(unsigned long opaque) +{ + struct qib_pportdata *ppd = (struct qib_pportdata *) opaque; + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + /* + * At this point, the following should always be true: + * - We are halted, so no more descriptors are getting retired. + * - We are not running, so no one is submitting new work. + * - Only we can send the e40_sw_cleaned, so we can't start + * running again until we say so. So, the active list and + * descq are ours to play with. + */ + + /* Process all retired requests. */ + qib_sdma_make_progress(ppd); + + clear_sdma_activelist(ppd); + + /* + * Resync count of added and removed. It is VERY important that + * sdma_descq_removed NEVER decrement - user_sdma depends on it. + */ + ppd->sdma_descq_removed = ppd->sdma_descq_added; + + /* + * Reset our notion of head and tail. + * Note that the HW registers will be reset when switching states + * due to calling __qib_sdma_process_event() below. + */ + ppd->sdma_descq_tail = 0; + ppd->sdma_descq_head = 0; + ppd->sdma_head_dma[0] = 0; + ppd->sdma_generation = 0; + + __qib_sdma_process_event(ppd, qib_sdma_event_e40_sw_cleaned); + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +/* + * This is called when changing to state qib_sdma_state_s10_hw_start_up_wait + * as a result of send buffer errors or send DMA descriptor errors. + * We want to disarm the buffers in these cases. + */ +static void sdma_hw_start_up(struct qib_pportdata *ppd) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + unsigned bufno; + + for (bufno = ss->first_sendbuf; bufno < ss->last_sendbuf; ++bufno) + ppd->dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_BUF(bufno)); + + ppd->dd->f_sdma_hw_start_up(ppd); +} + +static void sdma_sw_tear_down(struct qib_pportdata *ppd) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + + /* Releasing this reference means the state machine has stopped. 
*/ + sdma_put(ss); +} + +static void sdma_start_sw_clean_up(struct qib_pportdata *ppd) +{ + tasklet_hi_schedule(&ppd->sdma_sw_clean_up_task); +} + +static void sdma_set_state(struct qib_pportdata *ppd, + enum qib_sdma_states next_state) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + struct sdma_set_state_action *action = ss->set_state_action; + unsigned op = 0; + + /* debugging bookkeeping */ + ss->previous_state = ss->current_state; + ss->previous_op = ss->current_op; + + ss->current_state = next_state; + + if (action[next_state].op_enable) + op |= QIB_SDMA_SENDCTRL_OP_ENABLE; + + if (action[next_state].op_intenable) + op |= QIB_SDMA_SENDCTRL_OP_INTENABLE; + + if (action[next_state].op_halt) + op |= QIB_SDMA_SENDCTRL_OP_HALT; + + if (action[next_state].op_drain) + op |= QIB_SDMA_SENDCTRL_OP_DRAIN; + + if (action[next_state].go_s99_running_tofalse) + ss->go_s99_running = 0; + + if (action[next_state].go_s99_running_totrue) + ss->go_s99_running = 1; + + ss->current_op = op; + + ppd->dd->f_sdma_sendctrl(ppd, ss->current_op); +} + +static void unmap_desc(struct qib_pportdata *ppd, unsigned head) +{ + __le64 *descqp = &ppd->sdma_descq[head].qw[0]; + u64 desc[2]; + dma_addr_t addr; + size_t len; + + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + + addr = (desc[1] << 32) | (desc[0] >> 32); + len = (desc[0] >> 14) & (0x7ffULL << 2); + dma_unmap_single(&ppd->dd->pcidev->dev, addr, len, DMA_TO_DEVICE); +} + +static int alloc_sdma(struct qib_pportdata *ppd) +{ + ppd->sdma_descq_cnt = sdma_descq_cnt; + if (!ppd->sdma_descq_cnt) + ppd->sdma_descq_cnt = 256; + + /* Allocate memory for SendDMA descriptor FIFO */ + ppd->sdma_descq = dma_alloc_coherent(&ppd->dd->pcidev->dev, + ppd->sdma_descq_cnt * sizeof(u64[2]), &ppd->sdma_descq_phys, + GFP_KERNEL); + + if (!ppd->sdma_descq) { + qib_dev_err(ppd->dd, + "failed to allocate SendDMA descriptor FIFO memory\n"); + goto bail; + } + + /* Allocate memory for DMA of head register to memory */ + ppd->sdma_head_dma = dma_alloc_coherent(&ppd->dd->pcidev->dev, + PAGE_SIZE, &ppd->sdma_head_phys, GFP_KERNEL); + if (!ppd->sdma_head_dma) { + qib_dev_err(ppd->dd, + "failed to allocate SendDMA head memory\n"); + goto cleanup_descq; + } + ppd->sdma_head_dma[0] = 0; + return 0; + +cleanup_descq: + dma_free_coherent(&ppd->dd->pcidev->dev, + ppd->sdma_descq_cnt * sizeof(u64[2]), (void *)ppd->sdma_descq, + ppd->sdma_descq_phys); + ppd->sdma_descq = NULL; + ppd->sdma_descq_phys = 0; +bail: + ppd->sdma_descq_cnt = 0; + return -ENOMEM; +} + +static void free_sdma(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + + if (ppd->sdma_head_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *)ppd->sdma_head_dma, + ppd->sdma_head_phys); + ppd->sdma_head_dma = NULL; + ppd->sdma_head_phys = 0; + } + + if (ppd->sdma_descq) { + dma_free_coherent(&dd->pcidev->dev, + ppd->sdma_descq_cnt * sizeof(u64[2]), + ppd->sdma_descq, ppd->sdma_descq_phys); + ppd->sdma_descq = NULL; + ppd->sdma_descq_phys = 0; + } +} + +static inline void make_sdma_desc(struct qib_pportdata *ppd, + u64 *sdmadesc, u64 addr, u64 dwlen, + u64 dwoffset) +{ + + WARN_ON(addr & 3); + /* SDmaPhyAddr[47:32] */ + sdmadesc[1] = addr >> 32; + /* SDmaPhyAddr[31:0] */ + sdmadesc[0] = (addr & 0xfffffffcULL) << 32; + /* SDmaGeneration[1:0] */ + sdmadesc[0] |= (ppd->sdma_generation & 3ULL) << + SDMA_DESC_GEN_LSB; + /* SDmaDwordCount[10:0] */ + sdmadesc[0] |= (dwlen & 0x7ffULL) << SDMA_DESC_COUNT_LSB; + /* SDmaBufOffset[12:2] */ + sdmadesc[0] |= dwoffset & 0x7ffULL; +} + +/* 
sdma_lock must be held */ +int qib_sdma_make_progress(struct qib_pportdata *ppd) +{ + struct list_head *lp = NULL; + struct qib_sdma_txreq *txp = NULL; + struct qib_devdata *dd = ppd->dd; + int progress = 0; + u16 hwhead; + u16 idx = 0; + + hwhead = dd->f_sdma_gethead(ppd); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. + */ + + if (!list_empty(&ppd->sdma_activelist)) { + lp = ppd->sdma_activelist.next; + txp = list_entry(lp, struct qib_sdma_txreq, list); + idx = txp->start_idx; + } + + while (ppd->sdma_descq_head != hwhead) { + /* if desc is part of this txp, unmap if needed */ + if (txp && (txp->flags & QIB_SDMA_TXREQ_F_FREEDESC) && + (idx == ppd->sdma_descq_head)) { + unmap_desc(ppd, ppd->sdma_descq_head); + if (++idx == ppd->sdma_descq_cnt) + idx = 0; + } + + /* increment dequed desc count */ + ppd->sdma_descq_removed++; + + /* advance head, wrap if needed */ + if (++ppd->sdma_descq_head == ppd->sdma_descq_cnt) + ppd->sdma_descq_head = 0; + + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == ppd->sdma_descq_head) { + /* remove from active list */ + list_del_init(&txp->list); + if (txp->callback) + (*txp->callback)(txp, QIB_SDMA_TXREQ_S_OK); + /* see if there is another txp */ + if (list_empty(&ppd->sdma_activelist)) + txp = NULL; + else { + lp = ppd->sdma_activelist.next; + txp = list_entry(lp, struct qib_sdma_txreq, + list); + idx = txp->start_idx; + } + } + progress = 1; + } + if (progress) + qib_verbs_sdma_desc_avail(ppd, qib_sdma_descq_freecnt(ppd)); + return progress; +} + +/* + * This is called from interrupt context. + */ +void qib_sdma_intr(struct qib_pportdata *ppd) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + __qib_sdma_intr(ppd); + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +void __qib_sdma_intr(struct qib_pportdata *ppd) +{ + if (__qib_sdma_running(ppd)) { + qib_sdma_make_progress(ppd); + if (!list_empty(&ppd->sdma_userpending)) + qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); + } +} + +int qib_setup_sdma(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + int ret = 0; + + ret = alloc_sdma(ppd); + if (ret) + goto bail; + + /* set consistent sdma state */ + ppd->dd->f_sdma_init_early(ppd); + spin_lock_irqsave(&ppd->sdma_lock, flags); + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + /* set up reference counting */ + kref_init(&ppd->sdma_state.kref); + init_completion(&ppd->sdma_state.comp); + + ppd->sdma_generation = 0; + ppd->sdma_descq_head = 0; + ppd->sdma_descq_removed = 0; + ppd->sdma_descq_added = 0; + + ppd->sdma_intrequest = 0; + INIT_LIST_HEAD(&ppd->sdma_userpending); + + INIT_LIST_HEAD(&ppd->sdma_activelist); + + tasklet_init(&ppd->sdma_sw_clean_up_task, sdma_sw_clean_up_task, + (unsigned long)ppd); + + ret = dd->f_init_sdma_regs(ppd); + if (ret) + goto bail_alloc; + + qib_sdma_process_event(ppd, qib_sdma_event_e10_go_hw_start); + + return 0; + +bail_alloc: + qib_teardown_sdma(ppd); +bail: + return ret; +} + +void qib_teardown_sdma(struct qib_pportdata *ppd) +{ + qib_sdma_process_event(ppd, qib_sdma_event_e00_go_hw_down); + + /* + * This waits for the state machine to exit so it is not + * necessary to kill the sdma_sw_clean_up_task to make sure + * it is not running. 
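+ * (sdma_finalput() drops our kref and then blocks on the completion
+ * that sdma_complete() signals once the last reference is gone.)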
+ */ + sdma_finalput(&ppd->sdma_state); + + free_sdma(ppd); +} + +int qib_sdma_running(struct qib_pportdata *ppd) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + ret = __qib_sdma_running(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + return ret; +} + +/* + * Complete a request when sdma not running; likely only request + * but to simplify the code, always queue it, then process the full + * activelist. We process the entire list to ensure that this particular + * request does get it's callback, but in the correct order. + * Must be called with sdma_lock held + */ +static void complete_sdma_err_req(struct qib_pportdata *ppd, + struct qib_verbs_txreq *tx) +{ + struct qib_qp_priv *priv = tx->qp->priv; + + atomic_inc(&priv->s_dma_busy); + /* no sdma descriptors, so no unmap_desc */ + tx->txreq.start_idx = 0; + tx->txreq.next_descq_idx = 0; + list_add_tail(&tx->txreq.list, &ppd->sdma_activelist); + clear_sdma_activelist(ppd); +} + +/* + * This function queues one IB packet onto the send DMA queue per call. + * The caller is responsible for checking: + * 1) The number of send DMA descriptor entries is less than the size of + * the descriptor queue. + * 2) The IB SGE addresses and lengths are 32-bit aligned + * (except possibly the last SGE's length) + * 3) The SGE addresses are suitable for passing to dma_map_single(). + */ +int qib_sdma_verbs_send(struct qib_pportdata *ppd, + struct rvt_sge_state *ss, u32 dwords, + struct qib_verbs_txreq *tx) +{ + unsigned long flags; + struct rvt_sge *sge; + struct rvt_qp *qp; + int ret = 0; + u16 tail; + __le64 *descqp; + u64 sdmadesc[2]; + u32 dwoffset; + dma_addr_t addr; + struct qib_qp_priv *priv; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + +retry: + if (unlikely(!__qib_sdma_running(ppd))) { + complete_sdma_err_req(ppd, tx); + goto unlock; + } + + if (tx->txreq.sg_count > qib_sdma_descq_freecnt(ppd)) { + if (qib_sdma_make_progress(ppd)) + goto retry; + if (ppd->dd->flags & QIB_HAS_SDMA_TIMEOUT) + ppd->dd->f_sdma_set_desc_cnt(ppd, + ppd->sdma_descq_cnt / 2); + goto busy; + } + + dwoffset = tx->hdr_dwords; + make_sdma_desc(ppd, sdmadesc, (u64) tx->txreq.addr, dwoffset, 0); + + sdmadesc[0] |= SDMA_DESC_FIRST; + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= SDMA_DESC_USE_LARGE_BUF; + + /* write to the descq */ + tail = ppd->sdma_descq_tail; + descqp = &ppd->sdma_descq[tail].qw[0]; + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + /* increment the tail */ + if (++tail == ppd->sdma_descq_cnt) { + tail = 0; + descqp = &ppd->sdma_descq[0].qw[0]; + ++ppd->sdma_generation; + } + + tx->txreq.start_idx = tail; + + sge = &ss->sge; + while (dwords) { + u32 dw; + u32 len; + + len = dwords << 2; + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + dw = (len + 3) >> 2; + addr = dma_map_single(&ppd->dd->pcidev->dev, sge->vaddr, + dw << 2, DMA_TO_DEVICE); + if (dma_mapping_error(&ppd->dd->pcidev->dev, addr)) + goto unmap; + sdmadesc[0] = 0; + make_sdma_desc(ppd, sdmadesc, (u64) addr, dw, dwoffset); + /* SDmaUseLargeBuf has to be set in every descriptor */ + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= SDMA_DESC_USE_LARGE_BUF; + /* write to the descq */ + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + /* increment the tail */ + if (++tail == ppd->sdma_descq_cnt) { + tail = 0; + descqp = &ppd->sdma_descq[0].qw[0]; + ++ppd->sdma_generation; 
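+ /* The ring has wrapped; make_sdma_desc() folds the bumped
+ * generation into SDmaGeneration[1:0] of each later descriptor. */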
+ } + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + + dwoffset += dw; + dwords -= dw; + } + + if (!tail) + descqp = &ppd->sdma_descq[ppd->sdma_descq_cnt].qw[0]; + descqp -= 2; + descqp[0] |= cpu_to_le64(SDMA_DESC_LAST); + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_HEADTOHOST) + descqp[0] |= cpu_to_le64(SDMA_DESC_DMA_HEAD); + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_INTREQ) + descqp[0] |= cpu_to_le64(SDMA_DESC_INTR); + priv = tx->qp->priv; + atomic_inc(&priv->s_dma_busy); + tx->txreq.next_descq_idx = tail; + ppd->dd->f_sdma_update_tail(ppd, tail); + ppd->sdma_descq_added += tx->txreq.sg_count; + list_add_tail(&tx->txreq.list, &ppd->sdma_activelist); + goto unlock; + +unmap: + for (;;) { + if (!tail) + tail = ppd->sdma_descq_cnt - 1; + else + tail--; + if (tail == ppd->sdma_descq_tail) + break; + unmap_desc(ppd, tail); + } + qp = tx->qp; + priv = qp->priv; + qib_put_txreq(tx); + spin_lock(&qp->r_lock); + spin_lock(&qp->s_lock); + if (qp->ibqp.qp_type == IB_QPT_RC) { + /* XXX what about error sending RDMA read responses? */ + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) + rvt_error_qp(qp, IB_WC_GENERAL_ERR); + } else if (qp->s_wqe) + qib_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->r_lock); + /* return zero to process the next send work request */ + goto unlock; + +busy: + qp = tx->qp; + priv = qp->priv; + spin_lock(&qp->s_lock); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + struct qib_ibdev *dev; + + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + tx->ss = ss; + tx->dwords = dwords; + priv->s_tx = tx; + dev = &ppd->dd->verbs_dev; + spin_lock(&dev->rdi.pending_lock); + if (list_empty(&priv->iowait)) { + struct qib_ibport *ibp; + + ibp = &ppd->ibport_data; + ibp->rvp.n_dmawait++; + qp->s_flags |= RVT_S_WAIT_DMA_DESC; + list_add_tail(&priv->iowait, &dev->dmawait); + } + spin_unlock(&dev->rdi.pending_lock); + qp->s_flags &= ~RVT_S_BUSY; + spin_unlock(&qp->s_lock); + ret = -EBUSY; + } else { + spin_unlock(&qp->s_lock); + qib_put_txreq(tx); + } +unlock: + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return ret; +} + +/* + * sdma_lock should be acquired before calling this routine + */ +void dump_sdma_state(struct qib_pportdata *ppd) +{ + struct qib_sdma_desc *descq; + struct qib_sdma_txreq *txp, *txpnext; + __le64 *descqp; + u64 desc[2]; + u64 addr; + u16 gen, dwlen, dwoffset; + u16 head, tail, cnt; + + head = ppd->sdma_descq_head; + tail = ppd->sdma_descq_tail; + cnt = qib_sdma_descq_freecnt(ppd); + descq = ppd->sdma_descq; + + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA ppd->sdma_descq_head: %u\n", head); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA ppd->sdma_descq_tail: %u\n", tail); + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sdma_descq_freecnt: %u\n", cnt); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 'x', 0 }; + + descqp = &descq[head].qw[0]; + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + flags[0] = (desc[0] & 1<<15) ? 
'I' : '-'; + flags[1] = (desc[0] & 1<<14) ? 'L' : 'S'; + flags[2] = (desc[0] & 1<<13) ? 'H' : '-'; + flags[3] = (desc[0] & 1<<12) ? 'F' : '-'; + flags[4] = (desc[0] & 1<<11) ? 'L' : '-'; + addr = (desc[1] << 32) | ((desc[0] >> 32) & 0xfffffffcULL); + gen = (desc[0] >> 30) & 3ULL; + dwlen = (desc[0] >> 14) & (0x7ffULL << 2); + dwoffset = (desc[0] & 0x7ffULL) << 2; + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes offset:%u bytes\n", + head, flags, addr, gen, dwlen, dwoffset); + if (++head == ppd->sdma_descq_cnt) + head = 0; + } + + /* print dma descriptor indices from the TX requests */ + list_for_each_entry_safe(txp, txpnext, &ppd->sdma_activelist, + list) + qib_dev_porterr(ppd->dd, ppd->port, + "SDMA txp->start_idx: %u txp->next_descq_idx: %u\n", + txp->start_idx, txp->next_descq_idx); +} + +void qib_sdma_process_event(struct qib_pportdata *ppd, + enum qib_sdma_events event) +{ + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + + __qib_sdma_process_event(ppd, event); + + if (ppd->sdma_state.current_state == qib_sdma_state_s99_running) + qib_verbs_sdma_desc_avail(ppd, qib_sdma_descq_freecnt(ppd)); + + spin_unlock_irqrestore(&ppd->sdma_lock, flags); +} + +void __qib_sdma_process_event(struct qib_pportdata *ppd, + enum qib_sdma_events event) +{ + struct qib_sdma_state *ss = &ppd->sdma_state; + + switch (ss->current_state) { + case qib_sdma_state_s00_hw_down: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + break; + case qib_sdma_event_e30_go_running: + /* + * If down, but running requested (usually result + * of link up, then we need to start up. + * This can happen when hw down is requested while + * bringing the link up with traffic active on + * 7220, e.g. */ + ss->go_s99_running = 1; + /* fall through -- and start dma engine */ + case qib_sdma_event_e10_go_hw_start: + /* This reference means the state machine is started */ + sdma_get(&ppd->sdma_state); + sdma_set_state(ppd, + qib_sdma_state_s10_hw_start_up_wait); + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e40_sw_cleaned: + sdma_sw_tear_down(ppd); + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s10_hw_start_up_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_sw_tear_down(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + sdma_set_state(ppd, ss->go_s99_running ? 
+ qib_sdma_state_s99_running : + qib_sdma_state_s20_idle); + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s20_idle: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_sw_tear_down(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + sdma_set_state(ppd, qib_sdma_state_s99_running); + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s30_sw_clean_up_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + sdma_set_state(ppd, + qib_sdma_state_s10_hw_start_up_wait); + sdma_hw_start_up(ppd); + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s40_hw_clean_up_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + sdma_set_state(ppd, + qib_sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e60_hw_halted: + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s50_hw_halt_wait: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + sdma_set_state(ppd, + qib_sdma_state_s40_hw_clean_up_wait); + ppd->dd->f_sdma_hw_clean_up(ppd); + break; + case qib_sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case 
qib_sdma_event_e7220_err_halted: + break; + case qib_sdma_event_e7322_err_halted: + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + + case qib_sdma_state_s99_running: + switch (event) { + case qib_sdma_event_e00_go_hw_down: + sdma_set_state(ppd, qib_sdma_state_s00_hw_down); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e10_go_hw_start: + break; + case qib_sdma_event_e20_hw_started: + break; + case qib_sdma_event_e30_go_running: + break; + case qib_sdma_event_e40_sw_cleaned: + break; + case qib_sdma_event_e50_hw_cleaned: + break; + case qib_sdma_event_e60_hw_halted: + sdma_set_state(ppd, + qib_sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e70_go_idle: + sdma_set_state(ppd, qib_sdma_state_s50_hw_halt_wait); + ss->go_s99_running = 0; + break; + case qib_sdma_event_e7220_err_halted: + sdma_set_state(ppd, + qib_sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(ppd); + break; + case qib_sdma_event_e7322_err_halted: + sdma_set_state(ppd, qib_sdma_state_s50_hw_halt_wait); + break; + case qib_sdma_event_e90_timer_tick: + break; + } + break; + } + + ss->last_event = event; +} diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c new file mode 100644 index 0000000..ca2638d --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -0,0 +1,871 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include "qib.h" +#include "qib_mad.h" + +/* start of per-port functions */ +/* + * Get/Set heartbeat enable. 
OR of 1=enabled, 2=auto + */ +static ssize_t show_hrtbt_enb(struct qib_pportdata *ppd, char *buf) +{ + struct qib_devdata *dd = ppd->dd; + int ret; + + ret = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_HRTBT); + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_hrtbt_enb(struct qib_pportdata *ppd, const char *buf, + size_t count) +{ + struct qib_devdata *dd = ppd->dd; + int ret; + u16 val; + + ret = kstrtou16(buf, 0, &val); + if (ret) { + qib_dev_err(dd, "attempt to set invalid Heartbeat enable\n"); + return ret; + } + + /* + * Set the "intentional" heartbeat enable per either of + * "Enable" and "Auto", as these are normally set together. + * This bit is consulted when leaving loopback mode, + * because entering loopback mode overrides it and automatically + * disables heartbeat. + */ + ret = dd->f_set_ib_cfg(ppd, QIB_IB_CFG_HRTBT, val); + return ret < 0 ? ret : count; +} + +static ssize_t store_loopback(struct qib_pportdata *ppd, const char *buf, + size_t count) +{ + struct qib_devdata *dd = ppd->dd; + int ret = count, r; + + r = dd->f_set_ib_loopback(ppd, buf); + if (r < 0) + ret = r; + + return ret; +} + +static ssize_t store_led_override(struct qib_pportdata *ppd, const char *buf, + size_t count) +{ + struct qib_devdata *dd = ppd->dd; + int ret; + u16 val; + + ret = kstrtou16(buf, 0, &val); + if (ret) { + qib_dev_err(dd, "attempt to set invalid LED override\n"); + return ret; + } + + qib_set_led_override(ppd, val); + return count; +} + +static ssize_t show_status(struct qib_pportdata *ppd, char *buf) +{ + ssize_t ret; + + if (!ppd->statusp) + ret = -EINVAL; + else + ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long) *(ppd->statusp)); + return ret; +} + +/* + * For userland compatibility, these offsets must remain fixed. + * They are strings for QIB_STATUS_* + */ +static const char * const qib_status_str[] = { + "Initted", + "", + "", + "", + "", + "Present", + "IB_link_up", + "IB_configured", + "", + "Fatal_Hardware_Error", + NULL, +}; + +static ssize_t show_status_str(struct qib_pportdata *ppd, char *buf) +{ + int i, any; + u64 s; + ssize_t ret; + + if (!ppd->statusp) { + ret = -EINVAL; + goto bail; + } + + s = *(ppd->statusp); + *buf = '\0'; + for (any = i = 0; s && qib_status_str[i]; i++) { + if (s & 1) { + /* if overflow */ + if (any && strlcat(buf, " ", PAGE_SIZE) >= PAGE_SIZE) + break; + if (strlcat(buf, qib_status_str[i], PAGE_SIZE) >= + PAGE_SIZE) + break; + any = 1; + } + s >>= 1; + } + if (any) + strlcat(buf, "\n", PAGE_SIZE); + + ret = strlen(buf); + +bail: + return ret; +} + +/* end of per-port functions */ + +/* + * Start of per-port file structures and support code + * Because we are fitting into other infrastructure, we have to supply the + * full set of kobject/sysfs_ops structures and routines. 
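+ * These end up under the IB core's per-port sysfs directory, e.g. (for
+ * a hypothetical first device) /sys/class/infiniband/qib0/ports/1/,
+ * as the linkcontrol, sl2vl, diag_counters and CCMgtA kobjects
+ * created in qib_create_port_files() below.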
+ */ +#define QIB_PORT_ATTR(name, mode, show, store) \ + static struct qib_port_attr qib_port_attr_##name = \ + __ATTR(name, mode, show, store) + +struct qib_port_attr { + struct attribute attr; + ssize_t (*show)(struct qib_pportdata *, char *); + ssize_t (*store)(struct qib_pportdata *, const char *, size_t); +}; + +QIB_PORT_ATTR(loopback, S_IWUSR, NULL, store_loopback); +QIB_PORT_ATTR(led_override, S_IWUSR, NULL, store_led_override); +QIB_PORT_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb, + store_hrtbt_enb); +QIB_PORT_ATTR(status, S_IRUGO, show_status, NULL); +QIB_PORT_ATTR(status_str, S_IRUGO, show_status_str, NULL); + +static struct attribute *port_default_attributes[] = { + &qib_port_attr_loopback.attr, + &qib_port_attr_led_override.attr, + &qib_port_attr_hrtbt_enable.attr, + &qib_port_attr_status.attr, + &qib_port_attr_status_str.attr, + NULL +}; + +/* + * Start of per-port congestion control structures and support code + */ + +/* + * Congestion control table size followed by table entries + */ +static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_cc_kobj); + + if (!qib_cc_table_size || !ppd->ccti_entries_shadow) + return -EINVAL; + + ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow) + + sizeof(__be16); + + if (pos > ret) + return -EINVAL; + + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + spin_lock(&ppd->cc_shadow_lock); + memcpy(buf, ppd->ccti_entries_shadow, count); + spin_unlock(&ppd->cc_shadow_lock); + + return count; +} + +static void qib_port_release(struct kobject *kobj) +{ + /* nothing to do since memory is freed by qib_free_devdata() */ +} + +static struct kobj_type qib_port_cc_ktype = { + .release = qib_port_release, +}; + +static const struct bin_attribute cc_table_bin_attr = { + .attr = {.name = "cc_table_bin", .mode = 0444}, + .read = read_cc_table_bin, + .size = PAGE_SIZE, +}; + +/* + * Congestion settings: port control, control map and an array of 16 + * entries for the congestion entries - increase, timer, event log + * trigger threshold and the minimum injection rate delay. 
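+ * Userspace reads this blob through the read-only cc_settings_bin
+ * file created under the per-port CCMgtA kobject in
+ * qib_create_port_files().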
+ */ +static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_cc_kobj); + + if (!qib_cc_table_size || !ppd->congestion_entries_shadow) + return -EINVAL; + + ret = sizeof(struct ib_cc_congestion_setting_attr_shadow); + + if (pos > ret) + return -EINVAL; + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + spin_lock(&ppd->cc_shadow_lock); + memcpy(buf, ppd->congestion_entries_shadow, count); + spin_unlock(&ppd->cc_shadow_lock); + + return count; +} + +static const struct bin_attribute cc_setting_bin_attr = { + .attr = {.name = "cc_settings_bin", .mode = 0444}, + .read = read_cc_setting_bin, + .size = PAGE_SIZE, +}; + + +static ssize_t qib_portattr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct qib_port_attr *pattr = + container_of(attr, struct qib_port_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_kobj); + + return pattr->show(ppd, buf); +} + +static ssize_t qib_portattr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t len) +{ + struct qib_port_attr *pattr = + container_of(attr, struct qib_port_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, pport_kobj); + + return pattr->store(ppd, buf, len); +} + + +static const struct sysfs_ops qib_port_ops = { + .show = qib_portattr_show, + .store = qib_portattr_store, +}; + +static struct kobj_type qib_port_ktype = { + .release = qib_port_release, + .sysfs_ops = &qib_port_ops, + .default_attrs = port_default_attributes +}; + +/* Start sl2vl */ + +#define QIB_SL2VL_ATTR(N) \ + static struct qib_sl2vl_attr qib_sl2vl_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .sl = N \ + } + +struct qib_sl2vl_attr { + struct attribute attr; + int sl; +}; + +QIB_SL2VL_ATTR(0); +QIB_SL2VL_ATTR(1); +QIB_SL2VL_ATTR(2); +QIB_SL2VL_ATTR(3); +QIB_SL2VL_ATTR(4); +QIB_SL2VL_ATTR(5); +QIB_SL2VL_ATTR(6); +QIB_SL2VL_ATTR(7); +QIB_SL2VL_ATTR(8); +QIB_SL2VL_ATTR(9); +QIB_SL2VL_ATTR(10); +QIB_SL2VL_ATTR(11); +QIB_SL2VL_ATTR(12); +QIB_SL2VL_ATTR(13); +QIB_SL2VL_ATTR(14); +QIB_SL2VL_ATTR(15); + +static struct attribute *sl2vl_default_attributes[] = { + &qib_sl2vl_attr_0.attr, + &qib_sl2vl_attr_1.attr, + &qib_sl2vl_attr_2.attr, + &qib_sl2vl_attr_3.attr, + &qib_sl2vl_attr_4.attr, + &qib_sl2vl_attr_5.attr, + &qib_sl2vl_attr_6.attr, + &qib_sl2vl_attr_7.attr, + &qib_sl2vl_attr_8.attr, + &qib_sl2vl_attr_9.attr, + &qib_sl2vl_attr_10.attr, + &qib_sl2vl_attr_11.attr, + &qib_sl2vl_attr_12.attr, + &qib_sl2vl_attr_13.attr, + &qib_sl2vl_attr_14.attr, + &qib_sl2vl_attr_15.attr, + NULL +}; + +static ssize_t sl2vl_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct qib_sl2vl_attr *sattr = + container_of(attr, struct qib_sl2vl_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, sl2vl_kobj); + struct qib_ibport *qibp = &ppd->ibport_data; + + return sprintf(buf, "%u\n", qibp->sl_to_vl[sattr->sl]); +} + +static const struct sysfs_ops qib_sl2vl_ops = { + .show = sl2vl_attr_show, +}; + +static struct kobj_type qib_sl2vl_ktype = { + .release = qib_port_release, + .sysfs_ops = &qib_sl2vl_ops, + .default_attrs = sl2vl_default_attributes +}; + +/* End sl2vl */ + +/* Start diag_counters */ + +#define QIB_DIAGC_ATTR(N) \ + static struct qib_diagc_attr qib_diagc_attr_##N = { \ + 
.attr = { .name = __stringify(N), .mode = 0664 }, \ + .counter = offsetof(struct qib_ibport, rvp.n_##N) \ + } + +#define QIB_DIAGC_ATTR_PER_CPU(N) \ + static struct qib_diagc_attr qib_diagc_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0664 }, \ + .counter = offsetof(struct qib_ibport, rvp.z_##N) \ + } + +struct qib_diagc_attr { + struct attribute attr; + size_t counter; +}; + +QIB_DIAGC_ATTR_PER_CPU(rc_acks); +QIB_DIAGC_ATTR_PER_CPU(rc_qacks); +QIB_DIAGC_ATTR_PER_CPU(rc_delayed_comp); + +QIB_DIAGC_ATTR(rc_resends); +QIB_DIAGC_ATTR(seq_naks); +QIB_DIAGC_ATTR(rdma_seq); +QIB_DIAGC_ATTR(rnr_naks); +QIB_DIAGC_ATTR(other_naks); +QIB_DIAGC_ATTR(rc_timeouts); +QIB_DIAGC_ATTR(loop_pkts); +QIB_DIAGC_ATTR(pkt_drops); +QIB_DIAGC_ATTR(dmawait); +QIB_DIAGC_ATTR(unaligned); +QIB_DIAGC_ATTR(rc_dupreq); +QIB_DIAGC_ATTR(rc_seqnak); + +static struct attribute *diagc_default_attributes[] = { + &qib_diagc_attr_rc_resends.attr, + &qib_diagc_attr_rc_acks.attr, + &qib_diagc_attr_rc_qacks.attr, + &qib_diagc_attr_rc_delayed_comp.attr, + &qib_diagc_attr_seq_naks.attr, + &qib_diagc_attr_rdma_seq.attr, + &qib_diagc_attr_rnr_naks.attr, + &qib_diagc_attr_other_naks.attr, + &qib_diagc_attr_rc_timeouts.attr, + &qib_diagc_attr_loop_pkts.attr, + &qib_diagc_attr_pkt_drops.attr, + &qib_diagc_attr_dmawait.attr, + &qib_diagc_attr_unaligned.attr, + &qib_diagc_attr_rc_dupreq.attr, + &qib_diagc_attr_rc_seqnak.attr, + NULL +}; + +static u64 get_all_cpu_total(u64 __percpu *cntr) +{ + int cpu; + u64 counter = 0; + + for_each_possible_cpu(cpu) + counter += *per_cpu_ptr(cntr, cpu); + return counter; +} + +#define def_write_per_cpu(cntr) \ +static void write_per_cpu_##cntr(struct qib_pportdata *ppd, u32 data) \ +{ \ + struct qib_devdata *dd = ppd->dd; \ + struct qib_ibport *qibp = &ppd->ibport_data; \ + /* A write can only zero the counter */ \ + if (data == 0) \ + qibp->rvp.z_##cntr = get_all_cpu_total(qibp->rvp.cntr); \ + else \ + qib_dev_err(dd, "Per CPU cntrs can only be zeroed"); \ +} + +def_write_per_cpu(rc_acks) +def_write_per_cpu(rc_qacks) +def_write_per_cpu(rc_delayed_comp) + +#define READ_PER_CPU_CNTR(cntr) (get_all_cpu_total(qibp->rvp.cntr) - \ + qibp->rvp.z_##cntr) + +static ssize_t diagc_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct qib_diagc_attr *dattr = + container_of(attr, struct qib_diagc_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, diagc_kobj); + struct qib_ibport *qibp = &ppd->ibport_data; + + if (!strncmp(dattr->attr.name, "rc_acks", 7)) + return sprintf(buf, "%llu\n", READ_PER_CPU_CNTR(rc_acks)); + else if (!strncmp(dattr->attr.name, "rc_qacks", 8)) + return sprintf(buf, "%llu\n", READ_PER_CPU_CNTR(rc_qacks)); + else if (!strncmp(dattr->attr.name, "rc_delayed_comp", 15)) + return sprintf(buf, "%llu\n", + READ_PER_CPU_CNTR(rc_delayed_comp)); + else + return sprintf(buf, "%u\n", + *(u32 *)((char *)qibp + dattr->counter)); +} + +static ssize_t diagc_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t size) +{ + struct qib_diagc_attr *dattr = + container_of(attr, struct qib_diagc_attr, attr); + struct qib_pportdata *ppd = + container_of(kobj, struct qib_pportdata, diagc_kobj); + struct qib_ibport *qibp = &ppd->ibport_data; + u32 val; + int ret; + + ret = kstrtou32(buf, 0, &val); + if (ret) + return ret; + + if (!strncmp(dattr->attr.name, "rc_acks", 7)) + write_per_cpu_rc_acks(ppd, val); + else if (!strncmp(dattr->attr.name, "rc_qacks", 8)) + write_per_cpu_rc_qacks(ppd, val); + else if 
(!strncmp(dattr->attr.name, "rc_delayed_comp", 15)) + write_per_cpu_rc_delayed_comp(ppd, val); + else + *(u32 *)((char *)qibp + dattr->counter) = val; + return size; +} + +static const struct sysfs_ops qib_diagc_ops = { + .show = diagc_attr_show, + .store = diagc_attr_store, +}; + +static struct kobj_type qib_diagc_ktype = { + .release = qib_port_release, + .sysfs_ops = &qib_diagc_ops, + .default_attrs = diagc_default_attributes +}; + +/* End diag_counters */ + +/* end of per-port file structures and support code */ + +/* + * Start of per-unit (or driver, in some cases, but replicated + * per unit) functions (these get a device *) + */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, rdi.ibdev.dev); + + return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, rdi.ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + int ret; + + if (!dd->boardname) + ret = -EINVAL; + else + ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); + return ret; +} + +static ssize_t show_version(struct device *device, + struct device_attribute *attr, char *buf) +{ + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", (char *)ib_qib_version); +} + +static ssize_t show_boardversion(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, rdi.ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); +} + + +static ssize_t show_localbus_info(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, rdi.ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->lbus_info); +} + + +static ssize_t show_nctxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, rdi.ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* Return the number of user ports (contexts) available. */ + /* The calculation below deals with a special case where + * cfgctxts is set to 1 on a single-port board. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", + (dd->first_user_ctxt > dd->cfgctxts) ? 0 : + (dd->cfgctxts - dd->first_user_ctxt)); +} + +static ssize_t show_nfreectxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, rdi.ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + /* Return the number of free user ports (contexts) available. 
*/ + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); +} + +static ssize_t show_serial(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, rdi.ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + + buf[sizeof(dd->serial)] = '\0'; + memcpy(buf, dd->serial, sizeof(dd->serial)); + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t store_chip_reset(struct device *device, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, rdi.ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) { + ret = -EINVAL; + goto bail; + } + + ret = qib_reset_device(dd->unit); +bail: + return ret < 0 ? ret : count; +} + +/* + * Dump tempsense regs. in decimal, to ease shell-scripts. + */ +static ssize_t show_tempsense(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct qib_ibdev *dev = + container_of(device, struct qib_ibdev, rdi.ibdev.dev); + struct qib_devdata *dd = dd_from_dev(dev); + int ret; + int idx; + u8 regvals[8]; + + ret = -ENXIO; + for (idx = 0; idx < 8; ++idx) { + if (idx == 6) + continue; + ret = dd->f_tempsense_rd(dd, idx); + if (ret < 0) + break; + regvals[idx] = ret; + } + if (idx == 8) + ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n", + *(signed char *)(regvals), + *(signed char *)(regvals + 1), + regvals[2], regvals[3], + *(signed char *)(regvals + 5), + *(signed char *)(regvals + 7)); + return ret; +} + +/* + * end of per-unit (or driver, in some cases, but replicated + * per unit) functions + */ + +/* start of per-unit file structures and support code */ +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(version, S_IRUGO, show_version, NULL); +static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); +static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); +static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); +static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); +static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); +static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL); +static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); + +static struct device_attribute *qib_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_version, + &dev_attr_nctxts, + &dev_attr_nfreectxts, + &dev_attr_serial, + &dev_attr_boardversion, + &dev_attr_tempsense, + &dev_attr_localbus_info, + &dev_attr_chip_reset, +}; + +int qib_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj) +{ + struct qib_pportdata *ppd; + struct qib_devdata *dd = dd_from_ibdev(ibdev); + int ret; + + if (!port_num || port_num > dd->num_pports) { + qib_dev_err(dd, + "Skipping infiniband class with invalid port %u\n", + port_num); + ret = -ENODEV; + goto bail; + } + ppd = &dd->pport[port_num - 1]; + + ret = kobject_init_and_add(&ppd->pport_kobj, &qib_port_ktype, kobj, + "linkcontrol"); + if (ret) { + qib_dev_err(dd, + "Skipping linkcontrol sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail; + } + kobject_uevent(&ppd->pport_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->sl2vl_kobj, &qib_sl2vl_ktype, kobj, + "sl2vl"); + if (ret) { + qib_dev_err(dd, + "Skipping sl2vl sysfs info, 
(err %d) port %u\n", + ret, port_num); + goto bail_link; + } + kobject_uevent(&ppd->sl2vl_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->diagc_kobj, &qib_diagc_ktype, kobj, + "diag_counters"); + if (ret) { + qib_dev_err(dd, + "Skipping diag_counters sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_sl; + } + kobject_uevent(&ppd->diagc_kobj, KOBJ_ADD); + + if (!qib_cc_table_size || !ppd->congestion_entries_shadow) + return 0; + + ret = kobject_init_and_add(&ppd->pport_cc_kobj, &qib_port_cc_ktype, + kobj, "CCMgtA"); + if (ret) { + qib_dev_err(dd, + "Skipping Congestion Control sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_diagc; + } + + kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD); + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); + if (ret) { + qib_dev_err(dd, + "Skipping Congestion Control setting sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc; + } + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, + &cc_table_bin_attr); + if (ret) { + qib_dev_err(dd, + "Skipping Congestion Control table sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc_entry_bin; + } + + qib_devinfo(dd->pcidev, + "IB%u: Congestion Control Agent enabled for port %d\n", + dd->unit, port_num); + + return 0; + +bail_cc_entry_bin: + sysfs_remove_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr); +bail_cc: + kobject_put(&ppd->pport_cc_kobj); +bail_diagc: + kobject_put(&ppd->diagc_kobj); +bail_sl: + kobject_put(&ppd->sl2vl_kobj); +bail_link: + kobject_put(&ppd->pport_kobj); +bail: + return ret; +} + +/* + * Register and create our files in /sys/class/infiniband. + */ +int qib_verbs_register_sysfs(struct qib_devdata *dd) +{ + struct ib_device *dev = &dd->verbs_dev.rdi.ibdev; + int i, ret; + + for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) { + ret = device_create_file(&dev->dev, qib_attributes[i]); + if (ret) + goto bail; + } + + return 0; +bail: + for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) + device_remove_file(&dev->dev, qib_attributes[i]); + return ret; +} + +/* + * Unregister and remove our files in /sys/class/infiniband. + */ +void qib_verbs_unregister_sysfs(struct qib_devdata *dd) +{ + struct qib_pportdata *ppd; + int i; + + for (i = 0; i < dd->num_pports; i++) { + ppd = &dd->pport[i]; + if (qib_cc_table_size && + ppd->congestion_entries_shadow) { + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_table_bin_attr); + kobject_put(&ppd->pport_cc_kobj); + } + kobject_put(&ppd->sl2vl_kobj); + kobject_put(&ppd->pport_kobj); + } +} diff --git a/drivers/infiniband/hw/qib/qib_twsi.c b/drivers/infiniband/hw/qib/qib_twsi.c new file mode 100644 index 0000000..f569866 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_twsi.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib.h" + +/* + * QLogic_IB "Two Wire Serial Interface" driver. + * Originally written for a not-quite-i2c serial eeprom, which is + * still used on some supported boards. Later boards have added a + * variety of other uses, most board-specific, so the bit-boffing + * part has been split off to this file, while the other parts + * have been moved to chip-specific files. + * + * We have also dropped all pretense of fully generic (e.g. pretend + * we don't know whether '1' is the higher voltage) interface, as + * the restrictions of the generic i2c interface (e.g. no access from + * driver itself) make it unsuitable for this use. + */ + +#define READ_CMD 1 +#define WRITE_CMD 0 + +/** + * i2c_wait_for_writes - wait for a write + * @dd: the qlogic_ib device + * + * We use this instead of udelay directly, so we can make sure + * that previous register writes have been flushed all the way + * to the chip. Since we are delaying anyway, the cost doesn't + * hurt, and makes the bit twiddling more regular + */ +static void i2c_wait_for_writes(struct qib_devdata *dd) +{ + /* + * implicit read of EXTStatus is as good as explicit + * read of scratch, if all we want to do is flush + * writes. + */ + dd->f_gpio_mod(dd, 0, 0, 0); + rmb(); /* inlined, so prevent compiler reordering */ +} + +/* + * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that + * for "almost compliant" modules + */ +#define SCL_WAIT_USEC 1000 + +/* BUF_WAIT is time bus must be free between STOP or ACK and to next START. + * Should be 20, but some chips need more. + */ +#define TWSI_BUF_WAIT_USEC 60 + +static void scl_out(struct qib_devdata *dd, u8 bit) +{ + u32 mask; + + udelay(1); + + mask = 1UL << dd->gpio_scl_num; + + /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ + dd->f_gpio_mod(dd, 0, bit ? 0 : mask, mask); + + /* + * Allow for slow slaves by simple + * delay for falling edge, sampling on rise. 
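+ *
+ * For example, with SCL_WAIT_USEC = 1000 the rising-edge poll below
+ * re-samples the pin every 2 usec, so a clock-stretching slave gets
+ * roughly 500 chances (about 1 msec) before we report the interface
+ * stuck low.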
+ */ + if (!bit) + udelay(2); + else { + int rise_usec; + + for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) { + if (mask & dd->f_gpio_mod(dd, 0, 0, 0)) + break; + udelay(2); + } + if (rise_usec <= 0) + qib_dev_err(dd, "SCL interface stuck low > %d uSec\n", + SCL_WAIT_USEC); + } + i2c_wait_for_writes(dd); +} + +static void sda_out(struct qib_devdata *dd, u8 bit) +{ + u32 mask; + + mask = 1UL << dd->gpio_sda_num; + + /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ + dd->f_gpio_mod(dd, 0, bit ? 0 : mask, mask); + + i2c_wait_for_writes(dd); + udelay(2); +} + +static u8 sda_in(struct qib_devdata *dd, int wait) +{ + int bnum; + u32 read_val, mask; + + bnum = dd->gpio_sda_num; + mask = (1UL << bnum); + /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ + dd->f_gpio_mod(dd, 0, 0, mask); + read_val = dd->f_gpio_mod(dd, 0, 0, 0); + if (wait) + i2c_wait_for_writes(dd); + return (read_val & mask) >> bnum; +} + +/** + * i2c_ackrcv - see if ack following write is true + * @dd: the qlogic_ib device + */ +static int i2c_ackrcv(struct qib_devdata *dd) +{ + u8 ack_received; + + /* AT ENTRY SCL = LOW */ + /* change direction, ignore data */ + ack_received = sda_in(dd, 1); + scl_out(dd, 1); + ack_received = sda_in(dd, 1) == 0; + scl_out(dd, 0); + return ack_received; +} + +static void stop_cmd(struct qib_devdata *dd); + +/** + * rd_byte - read a byte, sending STOP on last, else ACK + * @dd: the qlogic_ib device + * + * Returns byte shifted out of device + */ +static int rd_byte(struct qib_devdata *dd, int last) +{ + int bit_cntr, data; + + data = 0; + + for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { + data <<= 1; + scl_out(dd, 1); + data |= sda_in(dd, 0); + scl_out(dd, 0); + } + if (last) { + scl_out(dd, 1); + stop_cmd(dd); + } else { + sda_out(dd, 0); + scl_out(dd, 1); + scl_out(dd, 0); + sda_out(dd, 1); + } + return data; +} + +/** + * wr_byte - write a byte, one bit at a time + * @dd: the qlogic_ib device + * @data: the byte to write + * + * Returns 0 if we got the following ack, otherwise 1 + */ +static int wr_byte(struct qib_devdata *dd, u8 data) +{ + int bit_cntr; + u8 bit; + + for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { + bit = (data >> bit_cntr) & 1; + sda_out(dd, bit); + scl_out(dd, 1); + scl_out(dd, 0); + } + return (!i2c_ackrcv(dd)) ? 1 : 0; +} + +/* + * issue TWSI start sequence: + * (both clock/data high, clock high, data low while clock is high) + */ +static void start_seq(struct qib_devdata *dd) +{ + sda_out(dd, 1); + scl_out(dd, 1); + sda_out(dd, 0); + udelay(1); + scl_out(dd, 0); +} + +/** + * stop_seq - transmit the stop sequence + * @dd: the qlogic_ib device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_seq(struct qib_devdata *dd) +{ + scl_out(dd, 0); + sda_out(dd, 0); + scl_out(dd, 1); + sda_out(dd, 1); +} + +/** + * stop_cmd - transmit the stop condition + * @dd: the qlogic_ib device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_cmd(struct qib_devdata *dd) +{ + stop_seq(dd); + udelay(TWSI_BUF_WAIT_USEC); +} + +/** + * qib_twsi_reset - reset I2C communication + * @dd: the qlogic_ib device + */ + +int qib_twsi_reset(struct qib_devdata *dd) +{ + int clock_cycles_left = 9; + int was_high = 0; + u32 pins, mask; + + /* Both SCL and SDA should be high. If not, there + * is something wrong. + */ + mask = (1UL << dd->gpio_scl_num) | (1UL << dd->gpio_sda_num); + + /* + * Force pins to desired innocuous state. 
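+ * (As in scl_out()/sda_out() above, the lines are never driven high:
+ * a pin is pulled low by setting its direction bit with out=0, and
+ * released by clearing the direction bit so the pull-up can raise it.)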
+ * This is the default power-on state with out=0 and dir=0, + * So tri-stated and should be floating high (barring HW problems) + */ + dd->f_gpio_mod(dd, 0, 0, mask); + + /* + * Clock nine times to get all listeners into a sane state. + * If SDA does not go high at any point, we are wedged. + * One vendor recommends then issuing START followed by STOP. + * we cannot use our "normal" functions to do that, because + * if SCL drops between them, another vendor's part will + * wedge, dropping SDA and keeping it low forever, at the end of + * the next transaction (even if it was not the device addressed). + * So our START and STOP take place with SCL held high. + */ + while (clock_cycles_left--) { + scl_out(dd, 0); + scl_out(dd, 1); + /* Note if SDA is high, but keep clocking to sync slave */ + was_high |= sda_in(dd, 0); + } + + if (was_high) { + /* + * We saw a high, which we hope means the slave is sync'd. + * Issue START, STOP, pause for T_BUF. + */ + + pins = dd->f_gpio_mod(dd, 0, 0, 0); + if ((pins & mask) != mask) + qib_dev_err(dd, "GPIO pins not at rest: %d\n", + pins & mask); + /* Drop SDA to issue START */ + udelay(1); /* Guarantee .6 uSec setup */ + sda_out(dd, 0); + udelay(1); /* Guarantee .6 uSec hold */ + /* At this point, SCL is high, SDA low. Raise SDA for STOP */ + sda_out(dd, 1); + udelay(TWSI_BUF_WAIT_USEC); + } + + return !was_high; +} + +#define QIB_TWSI_START 0x100 +#define QIB_TWSI_STOP 0x200 + +/* Write byte to TWSI, optionally prefixed with START or suffixed with + * STOP. + * returns 0 if OK (ACK received), else != 0 + */ +static int qib_twsi_wr(struct qib_devdata *dd, int data, int flags) +{ + int ret = 1; + + if (flags & QIB_TWSI_START) + start_seq(dd); + + ret = wr_byte(dd, data); /* Leaves SCL low (from i2c_ackrcv()) */ + + if (flags & QIB_TWSI_STOP) + stop_cmd(dd); + return ret; +} + +/* Added functionality for IBA7220-based cards */ +#define QIB_TEMP_DEV 0x98 + +/* + * qib_twsi_blk_rd + * Formerly called qib_eeprom_internal_read, and only used for eeprom, + * but now the general interface for data transfer from twsi devices. + * One vestige of its former role is that it recognizes a device + * QIB_TWSI_NO_DEV and does the correct operation for the legacy part, + * which responded to all TWSI device codes, interpreting them as + * address within device. On all other devices found on board handled by + * this driver, the device is followed by a one-byte "address" which selects + * the "register" or "offset" within the device from which data should + * be read. + */ +int qib_twsi_blk_rd(struct qib_devdata *dd, int dev, int addr, + void *buffer, int len) +{ + int ret; + u8 *bp = buffer; + + ret = 1; + + if (dev == QIB_TWSI_NO_DEV) { + /* legacy not-really-I2C */ + addr = (addr << 1) | READ_CMD; + ret = qib_twsi_wr(dd, addr, QIB_TWSI_START); + } else { + /* Actual I2C */ + ret = qib_twsi_wr(dd, dev | WRITE_CMD, QIB_TWSI_START); + if (ret) { + stop_cmd(dd); + ret = 1; + goto bail; + } + /* + * SFF spec claims we do _not_ stop after the addr + * but simply issue a start with the "read" dev-addr. 
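+ * The full sequence for a normal I2C device is therefore: START,
+ * dev|WRITE, offset, then (after a t_buf pause) a repeated START with
+ * dev|READ, followed by the data bytes, each ACKed by rd_byte()
+ * except the last, which gets a STOP instead.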
+ * Since we are implicitely waiting for ACK here, + * we need t_buf (nominally 20uSec) before that start, + * and cannot rely on the delay built in to the STOP + */ + ret = qib_twsi_wr(dd, addr, 0); + udelay(TWSI_BUF_WAIT_USEC); + + if (ret) { + qib_dev_err(dd, + "Failed to write interface read addr %02X\n", + addr); + ret = 1; + goto bail; + } + ret = qib_twsi_wr(dd, dev | READ_CMD, QIB_TWSI_START); + } + if (ret) { + stop_cmd(dd); + ret = 1; + goto bail; + } + + /* + * block devices keeps clocking data out as long as we ack, + * automatically incrementing the address. Some have "pages" + * whose boundaries will not be crossed, but the handling + * of these is left to the caller, who is in a better + * position to know. + */ + while (len-- > 0) { + /* + * Get and store data, sending ACK if length remaining, + * else STOP + */ + *bp++ = rd_byte(dd, !len); + } + + ret = 0; + +bail: + return ret; +} + +/* + * qib_twsi_blk_wr + * Formerly called qib_eeprom_internal_write, and only used for eeprom, + * but now the general interface for data transfer to twsi devices. + * One vestige of its former role is that it recognizes a device + * QIB_TWSI_NO_DEV and does the correct operation for the legacy part, + * which responded to all TWSI device codes, interpreting them as + * address within device. On all other devices found on board handled by + * this driver, the device is followed by a one-byte "address" which selects + * the "register" or "offset" within the device to which data should + * be written. + */ +int qib_twsi_blk_wr(struct qib_devdata *dd, int dev, int addr, + const void *buffer, int len) +{ + int sub_len; + const u8 *bp = buffer; + int max_wait_time, i; + int ret = 1; + + while (len > 0) { + if (dev == QIB_TWSI_NO_DEV) { + if (qib_twsi_wr(dd, (addr << 1) | WRITE_CMD, + QIB_TWSI_START)) { + goto failed_write; + } + } else { + /* Real I2C */ + if (qib_twsi_wr(dd, dev | WRITE_CMD, QIB_TWSI_START)) + goto failed_write; + ret = qib_twsi_wr(dd, addr, 0); + if (ret) { + qib_dev_err(dd, + "Failed to write interface write addr %02X\n", + addr); + goto failed_write; + } + } + + sub_len = min(len, 4); + addr += sub_len; + len -= sub_len; + + for (i = 0; i < sub_len; i++) + if (qib_twsi_wr(dd, *bp++, 0)) + goto failed_write; + + stop_cmd(dd); + + /* + * Wait for write complete by waiting for a successful + * read (the chip replies with a zero after the write + * cmd completes, and before it writes to the eeprom. + * The startcmd for the read will fail the ack until + * the writes have completed. We do this inline to avoid + * the debug prints that are in the real read routine + * if the startcmd fails. + * We also use the proper device address, so it doesn't matter + * whether we have real eeprom_dev. Legacy likes any address. + */ + max_wait_time = 100; + while (qib_twsi_wr(dd, dev | READ_CMD, QIB_TWSI_START)) { + stop_cmd(dd); + if (!--max_wait_time) + goto failed_write; + } + /* now read (and ignore) the resulting byte */ + rd_byte(dd, 1); + } + + ret = 0; + goto bail; + +failed_write: + stop_cmd(dd); + ret = 1; + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/qib/qib_tx.c b/drivers/infiniband/hw/qib/qib_tx.c new file mode 100644 index 0000000..29785eb --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_tx.c @@ -0,0 +1,568 @@ +/* + * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" + +static unsigned qib_hol_timeout_ms = 3000; +module_param_named(hol_timeout_ms, qib_hol_timeout_ms, uint, S_IRUGO); +MODULE_PARM_DESC(hol_timeout_ms, + "duration of user app suspension after link failure"); + +unsigned qib_sdma_fetch_arb = 1; +module_param_named(fetch_arb, qib_sdma_fetch_arb, uint, S_IRUGO); +MODULE_PARM_DESC(fetch_arb, "IBA7220: change SDMA descriptor arbitration"); + +/** + * qib_disarm_piobufs - cancel a range of PIO buffers + * @dd: the qlogic_ib device + * @first: the first PIO buffer to cancel + * @cnt: the number of PIO buffers to cancel + * + * Cancel a range of PIO buffers. Used at user process close, + * in case it died while writing to a PIO buffer. + */ +void qib_disarm_piobufs(struct qib_devdata *dd, unsigned first, unsigned cnt) +{ + unsigned long flags; + unsigned i; + unsigned last; + + last = first + cnt; + spin_lock_irqsave(&dd->pioavail_lock, flags); + for (i = first; i < last; i++) { + __clear_bit(i, dd->pio_need_disarm); + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(i)); + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); +} + +/* + * This is called by a user process when it sees the DISARM_BUFS event + * bit is set. + */ +int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *rcd) +{ + struct qib_devdata *dd = rcd->dd; + unsigned i; + unsigned last; + unsigned n = 0; + + last = rcd->pio_base + rcd->piocnt; + /* + * Don't need uctxt_lock here, since user has called in to us. 
+ * Clear at start in case more interrupts set bits while we + * are disarming + */ + if (rcd->user_event_mask) { + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + clear_bit(_QIB_EVENT_DISARM_BUFS_BIT, &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + clear_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + spin_lock_irq(&dd->pioavail_lock); + for (i = rcd->pio_base; i < last; i++) { + if (__test_and_clear_bit(i, dd->pio_need_disarm)) { + n++; + dd->f_sendctrl(rcd->ppd, QIB_SENDCTRL_DISARM_BUF(i)); + } + } + spin_unlock_irq(&dd->pioavail_lock); + return 0; +} + +static struct qib_pportdata *is_sdma_buf(struct qib_devdata *dd, unsigned i) +{ + struct qib_pportdata *ppd; + unsigned pidx; + + for (pidx = 0; pidx < dd->num_pports; pidx++) { + ppd = dd->pport + pidx; + if (i >= ppd->sdma_state.first_sendbuf && + i < ppd->sdma_state.last_sendbuf) + return ppd; + } + return NULL; +} + +/* + * Return true if send buffer is being used by a user context. + * Sets _QIB_EVENT_DISARM_BUFS_BIT in user_event_mask as a side effect + */ +static int find_ctxt(struct qib_devdata *dd, unsigned bufn) +{ + struct qib_ctxtdata *rcd; + unsigned ctxt; + int ret = 0; + + spin_lock(&dd->uctxt_lock); + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) { + rcd = dd->rcd[ctxt]; + if (!rcd || bufn < rcd->pio_base || + bufn >= rcd->pio_base + rcd->piocnt) + continue; + if (rcd->user_event_mask) { + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + ret = 1; + break; + } + spin_unlock(&dd->uctxt_lock); + + return ret; +} + +/* + * Disarm a set of send buffers. If the buffer might be actively being + * written to, mark the buffer to be disarmed later when it is not being + * written to. + * + * This should only be called from the IRQ error handler. + */ +void qib_disarm_piobufs_set(struct qib_devdata *dd, unsigned long *mask, + unsigned cnt) +{ + struct qib_pportdata *ppd, *pppd[QIB_MAX_IB_PORTS]; + unsigned i; + unsigned long flags; + + for (i = 0; i < dd->num_pports; i++) + pppd[i] = NULL; + + for (i = 0; i < cnt; i++) { + if (!test_bit(i, mask)) + continue; + /* + * If the buffer is owned by the DMA hardware, + * reset the DMA engine. + */ + ppd = is_sdma_buf(dd, i); + if (ppd) { + pppd[ppd->port] = ppd; + continue; + } + /* + * If the kernel is writing the buffer or the buffer is + * owned by a user process, we can't clear it yet. 
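+ * Instead we just mark it in pio_need_disarm; the actual disarm then
+ * happens later, either in qib_sendbuf_done() once the kernel write
+ * finishes, or in qib_disarm_piobufs_ifneeded() when the owning user
+ * process reacts to the DISARM_BUFS event set by find_ctxt().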
+ */ + spin_lock_irqsave(&dd->pioavail_lock, flags); + if (test_bit(i, dd->pio_writing) || + (!test_bit(i << 1, dd->pioavailkernel) && + find_ctxt(dd, i))) { + __set_bit(i, dd->pio_need_disarm); + } else { + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(i)); + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + } + + /* do cancel_sends once per port that had sdma piobufs in error */ + for (i = 0; i < dd->num_pports; i++) + if (pppd[i]) + qib_cancel_sends(pppd[i]); +} + +/** + * update_send_bufs - update shadow copy of the PIO availability map + * @dd: the qlogic_ib device + * + * called whenever our local copy indicates we have run out of send buffers + */ +static void update_send_bufs(struct qib_devdata *dd) +{ + unsigned long flags; + unsigned i; + const unsigned piobregs = dd->pioavregs; + + /* + * If the generation (check) bits have changed, then we update the + * busy bit for the corresponding PIO buffer. This algorithm will + * modify positions to the value they already have in some cases + * (i.e., no change), but it's faster than changing only the bits + * that have changed. + * + * We would like to do this atomicly, to avoid spinlocks in the + * critical send path, but that's not really possible, given the + * type of changes, and that this routine could be called on + * multiple cpu's simultaneously, so we lock in this routine only, + * to avoid conflicting updates; all we change is the shadow, and + * it's a single 64 bit memory location, so by definition the update + * is atomic in terms of what other cpu's can see in testing the + * bits. The spin_lock overhead isn't too bad, since it only + * happens when all buffers are in use, so only cpu overhead, not + * latency or bandwidth is affected. + */ + if (!dd->pioavailregs_dma) + return; + spin_lock_irqsave(&dd->pioavail_lock, flags); + for (i = 0; i < piobregs; i++) { + u64 pchbusy, pchg, piov, pnew; + + piov = le64_to_cpu(dd->pioavailregs_dma[i]); + pchg = dd->pioavailkernel[i] & + ~(dd->pioavailshadow[i] ^ piov); + pchbusy = pchg << QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT; + if (pchg && (pchbusy & dd->pioavailshadow[i])) { + pnew = dd->pioavailshadow[i] & ~pchbusy; + pnew |= piov & pchbusy; + dd->pioavailshadow[i] = pnew; + } + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); +} + +/* + * Debugging code and stats updates if no pio buffers available. + */ +static noinline void no_send_bufs(struct qib_devdata *dd) +{ + dd->upd_pio_shadow = 1; + + /* not atomic, but if we lose a stat count in a while, that's OK */ + qib_stats.sps_nopiobufs++; +} + +/* + * Common code for normal driver send buffer allocation, and reserved + * allocation. + * + * Do appropriate marking as busy, etc. + * Returns buffer pointer if one is found, otherwise NULL. + */ +u32 __iomem *qib_getsendbuf_range(struct qib_devdata *dd, u32 *pbufnum, + u32 first, u32 last) +{ + unsigned i, j, updated = 0; + unsigned nbufs; + unsigned long flags; + unsigned long *shadow = dd->pioavailshadow; + u32 __iomem *buf; + + if (!(dd->flags & QIB_PRESENT)) + return NULL; + + nbufs = last - first + 1; /* number in range to check */ + if (dd->upd_pio_shadow) { +update_shadow: + /* + * Minor optimization. If we had no buffers on last call, + * start out by doing the update; continue and do scan even + * if no buffers were updated, to be paranoid. + */ + update_send_bufs(dd); + updated++; + } + i = first; + /* + * While test_and_set_bit() is atomic, we do that and then the + * change_bit(), and the pair is not. 
See if this is the cause + * of the remaining armlaunch errors. + */ + spin_lock_irqsave(&dd->pioavail_lock, flags); + if (dd->last_pio >= first && dd->last_pio <= last) + i = dd->last_pio + 1; + if (!first) + /* adjust to min possible */ + nbufs = last - dd->min_kernel_pio + 1; + for (j = 0; j < nbufs; j++, i++) { + if (i > last) + i = !first ? dd->min_kernel_pio : first; + if (__test_and_set_bit((2 * i) + 1, shadow)) + continue; + /* flip generation bit */ + __change_bit(2 * i, shadow); + /* remember that the buffer can be written to now */ + __set_bit(i, dd->pio_writing); + if (!first && first != last) /* first == last on VL15, avoid */ + dd->last_pio = i; + break; + } + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + + if (j == nbufs) { + if (!updated) + /* + * First time through; shadow exhausted, but may be + * buffers available, try an update and then rescan. + */ + goto update_shadow; + no_send_bufs(dd); + buf = NULL; + } else { + if (i < dd->piobcnt2k) + buf = (u32 __iomem *)(dd->pio2kbase + + i * dd->palign); + else if (i < dd->piobcnt2k + dd->piobcnt4k || !dd->piovl15base) + buf = (u32 __iomem *)(dd->pio4kbase + + (i - dd->piobcnt2k) * dd->align4k); + else + buf = (u32 __iomem *)(dd->piovl15base + + (i - (dd->piobcnt2k + dd->piobcnt4k)) * + dd->align4k); + if (pbufnum) + *pbufnum = i; + dd->upd_pio_shadow = 0; + } + + return buf; +} + +/* + * Record that the caller is finished writing to the buffer so we don't + * disarm it while it is being written and disarm it now if needed. + */ +void qib_sendbuf_done(struct qib_devdata *dd, unsigned n) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->pioavail_lock, flags); + __clear_bit(n, dd->pio_writing); + if (__test_and_clear_bit(n, dd->pio_need_disarm)) + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(n)); + spin_unlock_irqrestore(&dd->pioavail_lock, flags); +} + +/** + * qib_chg_pioavailkernel - change which send buffers are available for kernel + * @dd: the qlogic_ib device + * @start: the starting send buffer number + * @len: the number of send buffers + * @avail: true if the buffers are available for kernel use, false otherwise + */ +void qib_chg_pioavailkernel(struct qib_devdata *dd, unsigned start, + unsigned len, u32 avail, struct qib_ctxtdata *rcd) +{ + unsigned long flags; + unsigned end; + unsigned ostart = start; + + /* There are two bits per send buffer (busy and generation) */ + start *= 2; + end = start + len * 2; + + spin_lock_irqsave(&dd->pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ + while (start < end) { + if (avail) { + unsigned long dma; + int i; + + /* + * The BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. 
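+ *
+ * For example, send buffer n occupies two adjacent shadow bits
+ * (2n and 2n+1, one busy and one generation/check bit), so each
+ * 64-bit shadow word describes 32 buffers; "start" and "end" here
+ * have already been scaled by 2 accordingly.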
+ */ + i = start / BITS_PER_LONG; + __clear_bit(QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT + start, + dd->pioavailshadow); + dma = (unsigned long) + le64_to_cpu(dd->pioavailregs_dma[i]); + if (test_bit((QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->pioavailshadow); + else + __clear_bit(QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->pioavailshadow); + __set_bit(start, dd->pioavailkernel); + if ((start >> 1) < dd->min_kernel_pio) + dd->min_kernel_pio = start >> 1; + } else { + __set_bit(start + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT, + dd->pioavailshadow); + __clear_bit(start, dd->pioavailkernel); + if ((start >> 1) > dd->min_kernel_pio) + dd->min_kernel_pio = start >> 1; + } + start += 2; + } + + if (dd->min_kernel_pio > 0 && dd->last_pio < dd->min_kernel_pio - 1) + dd->last_pio = dd->min_kernel_pio - 1; + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + + dd->f_txchk_change(dd, ostart, len, avail, rcd); +} + +/* + * Flush all sends that might be in the ready to send state, as well as any + * that are in the process of being sent. Used whenever we need to be + * sure the send side is idle. Cleans up all buffer state by canceling + * all pio buffers, and issuing an abort, which cleans up anything in the + * launch fifo. The cancel is superfluous on some chip versions, but + * it's safer to always do it. + * PIOAvail bits are updated by the chip as if a normal send had happened. + */ +void qib_cancel_sends(struct qib_pportdata *ppd) +{ + struct qib_devdata *dd = ppd->dd; + struct qib_ctxtdata *rcd; + unsigned long flags; + unsigned ctxt; + unsigned i; + unsigned last; + + /* + * Tell PSM to disarm buffers again before trying to reuse them. + * We need to be sure the rcd doesn't change out from under us + * while we do so. We hold the two locks sequentially. We might + * needlessly set some need_disarm bits as a result, if the + * context is closed after we release the uctxt_lock, but that's + * fairly benign, and safer than nesting the locks. + */ + for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) { + spin_lock_irqsave(&dd->uctxt_lock, flags); + rcd = dd->rcd[ctxt]; + if (rcd && rcd->ppd == ppd) { + last = rcd->pio_base + rcd->piocnt; + if (rcd->user_event_mask) { + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, + * if any + */ + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[0]); + for (i = 1; i < rcd->subctxt_cnt; i++) + set_bit(_QIB_EVENT_DISARM_BUFS_BIT, + &rcd->user_event_mask[i]); + } + i = rcd->pio_base; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + spin_lock_irqsave(&dd->pioavail_lock, flags); + for (; i < last; i++) + __set_bit(i, dd->pio_need_disarm); + spin_unlock_irqrestore(&dd->pioavail_lock, flags); + } else + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + } + + if (!(dd->flags & QIB_HAS_SEND_DMA)) + dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_ALL | + QIB_SENDCTRL_FLUSH); +} + +/* + * Force an update of in-memory copy of the pioavail registers, when + * needed for any of a variety of reasons. + * If already off, this routine is a nop, on the assumption that the + * caller (or set of callers) will "do the right thing". + * This is a per-device operation, so just the first port. 
+ */ +void qib_force_pio_avail_update(struct qib_devdata *dd) +{ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); +} + +void qib_hol_down(struct qib_pportdata *ppd) +{ + /* + * Cancel sends when the link goes DOWN so that we aren't doing it + * at INIT when we might be trying to send SMI packets. + */ + if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) + qib_cancel_sends(ppd); +} + +/* + * Link is at INIT. + * We start the HoL timer so we can detect stuck packets blocking SMP replies. + * Timer may already be running, so use mod_timer, not add_timer. + */ +void qib_hol_init(struct qib_pportdata *ppd) +{ + if (ppd->hol_state != QIB_HOL_INIT) { + ppd->hol_state = QIB_HOL_INIT; + mod_timer(&ppd->hol_timer, + jiffies + msecs_to_jiffies(qib_hol_timeout_ms)); + } +} + +/* + * Link is up, continue any user processes, and ensure timer + * is a nop, if running. Let timer keep running, if set; it + * will nop when it sees the link is up. + */ +void qib_hol_up(struct qib_pportdata *ppd) +{ + ppd->hol_state = QIB_HOL_UP; +} + +/* + * This is only called via the timer. + */ +void qib_hol_event(struct timer_list *t) +{ + struct qib_pportdata *ppd = from_timer(ppd, t, hol_timer); + + /* If hardware error, etc, skip. */ + if (!(ppd->dd->flags & QIB_INITTED)) + return; + + if (ppd->hol_state != QIB_HOL_UP) { + /* + * Try to flush sends in case a stuck packet is blocking + * SMP replies. + */ + qib_hol_down(ppd); + mod_timer(&ppd->hol_timer, + jiffies + msecs_to_jiffies(qib_hol_timeout_ms)); + } +} diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c new file mode 100644 index 0000000..840eec6 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -0,0 +1,521 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "qib.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_UC_##x + +/** + * qib_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * + * Assumes the s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. 
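+ *
+ * Header words are counted in 32-bit units: the bare LRH+BTH is
+ * (8+12)/4 = 5 words, an immediate adds one more, and an RDMA WRITE
+ * adds sizeof(struct ib_reth)/4 = 4 words for the RETH.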
+ */ +int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) +{ + struct qib_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rvt_swqe *wqe; + u32 hwords; + u32 bth0; + u32 len; + u32 pmtu = qp->pmtu; + int ret = 0; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&priv->s_dma_busy)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + ohdr = &priv->s_hdr->u.oth; + if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) + ohdr = &priv->s_hdr->u.l.oth; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 0; + + /* Get the next send request. */ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; + switch (qp->s_state) { + default: + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_NEXT_SEND_OK)) + goto bail; + /* Check if send work queue is empty. */ + if (qp->s_cur == READ_ONCE(qp->s_head)) + goto bail; + /* + * Start a new request. + */ + qp->s_psn = wqe->psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + len = wqe->length; + qp->s_len = len; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) 
{ + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = &qp->s_sge; + qp->s_cur_size = len; + qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), + qp->s_psn++ & QIB_PSN_MASK); +done: + return 1; +bail: + qp->s_flags &= ~RVT_S_BUSY; + return ret; +} + +/** + * qib_uc_rcv - handle an incoming UC packet + * @ibp: the port the packet came in on + * @hdr: the header of the packet + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the length of the packet + * @qp: the QP for this packet. + * + * This is called from qib_qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. + */ +void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header *hdr, + int has_grh, void *data, u32 tlen, struct rvt_qp *qp) +{ + struct ib_other_headers *ohdr; + u32 opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + struct ib_reth *reth; + int ret; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + } + + opcode = be32_to_cpu(ohdr->bth[0]); + if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode)) + return; + + psn = be32_to_cpu(ohdr->bth[2]); + opcode >>= 24; + + /* Compare the PSN verses the expected PSN. */ + if (unlikely(qib_cmp24(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; +inv: + if (qp->r_state == OP(SEND_FIRST) || + qp->r_state == OP(SEND_MIDDLE)) { + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; + } else + rvt_put_ss(&qp->r_sge); + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + goto drop; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + /* OK, process the packet. 
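+ * For example, a three-PMTU SEND arrives as SEND_FIRST, SEND_MIDDLE,
+ * SEND_LAST; r_state remembers the previous opcode so the sequence
+ * checks above can reject anything out of order, while r_rcv_len
+ * accumulates the payload against the posted receive length.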
*/ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): +send_first: + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) + qp->r_sge = qp->s_rdma_read_sge; + else { + ret = qib_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + /* + * qp->s_rdma_read_sge will be the owner + * of the mr references. + */ + qp->s_rdma_read_sge = qp->r_sge; + } + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto rewind; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto rewind; + qib_copy_sge(&qp->r_sge, data, pmtu, 0); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): +no_immediate_data: + wc.ex.imm_data = 0; + wc.wc_flags = 0; +send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto rewind; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto rewind; + wc.opcode = IB_WC_RECV; + qib_copy_sge(&qp->r_sge, data, tlen, 0); + rvt_put_ss(&qp->s_rdma_read_sge); +last_imm: + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr); + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + ib_bth_is_solicited(ohdr)); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ +rdma_first: + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + goto drop; + } + reth = &ohdr->u.rc.reth; + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey */ + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, + vaddr, rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto drop; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_ONLY)) + goto rdma_last; + else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) { + wc.ex.imm_data = ohdr->u.rc.imm_data; + goto rdma_last_imm; + } + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. 
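+ * (A MIDDLE packet must carry exactly one PMTU of payload, so tlen
+ * has to equal hdrsize + pmtu + 4, the 4 being the trailing CRC.)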
*/ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto drop; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto drop; + qib_copy_sge(&qp->r_sge, data, pmtu, 1); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + wc.ex.imm_data = ohdr->u.imm_data; +rdma_last_imm: + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) + rvt_put_ss(&qp->s_rdma_read_sge); + else { + ret = qib_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + } + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_put_ss(&qp->r_sge); + goto last_imm; + + case OP(RDMA_WRITE_LAST): +rdma_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_put_ss(&qp->r_sge); + break; + + default: + /* Drop packet for unknown opcodes. */ + goto drop; + } + qp->r_psn++; + qp->r_state = opcode; + return; + +rewind: + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; +drop: + ibp->rvp.n_pkt_drops++; + return; + +op_err: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + +} diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c new file mode 100644 index 0000000..3e4ff77 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -0,0 +1,587 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include "qib.h" +#include "qib_mad.h" + +/** + * qib_ud_loopback - handle send on loopback QPs + * @sqp: the sending QP + * @swqe: the send work request + * + * This is called from qib_make_ud_req() to forward a WQE addressed + * to the same HCA. + * Note that the receive interrupt handler may be calling qib_ud_rcv() + * while this is being called. + */ +static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) +{ + struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + struct rvt_qp *qp; + struct rdma_ah_attr *ah_attr; + unsigned long flags; + struct rvt_sge_state ssge; + struct rvt_sge *sge; + struct ib_wc wc; + u32 length; + enum ib_qp_type sqptype, dqptype; + + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.remote_qpn); + if (!qp) { + ibp->rvp.n_pkt_drops++; + goto drop; + } + + sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : sqp->ibqp.qp_type; + dqptype = qp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : qp->ibqp.qp_type; + + if (dqptype != sqptype || + !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + goto drop; + } + + ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr; + ppd = ppd_from_ibp(ibp); + + if (qp->ibqp.qp_num > 1) { + u16 pkey1; + u16 pkey2; + u16 lid; + + pkey1 = qib_get_pkey(ibp, sqp->s_pkey_index); + pkey2 = qib_get_pkey(ibp, qp->s_pkey_index); + if (unlikely(!qib_pkey_ok(pkey1, pkey2))) { + lid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1)); + qib_bad_pkey(ibp, pkey1, + rdma_ah_get_sl(ah_attr), + sqp->ibqp.qp_num, qp->ibqp.qp_num, + cpu_to_be16(lid), + cpu_to_be16(rdma_ah_get_dlid(ah_attr))); + goto drop; + } + } + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (qp->ibqp.qp_num) { + u32 qkey; + + qkey = (int)swqe->ud_wr.remote_qkey < 0 ? + sqp->qkey : swqe->ud_wr.remote_qkey; + if (unlikely(qkey != qp->qkey)) + goto drop; + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + length = swqe->length; + memset(&wc, 0, sizeof(wc)); + wc.byte_len = length + sizeof(struct ib_grh); + + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; + } + + spin_lock_irqsave(&qp->r_lock, flags); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & RVT_R_REUSE_SGE) + qp->r_flags &= ~RVT_R_REUSE_SGE; + else { + int ret; + + ret = qib_get_rwqe(qp, 0); + if (ret < 0) { + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + goto bail_unlock; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->rvp.n_vl15_dropped++; + goto bail_unlock; + } + } + /* Silently drop packets which are too big. 
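+ * ("Too big" includes the 40-byte GRH always accounted for in
+ * wc.byte_len, so the posted receive buffer must cover
+ * payload + sizeof(struct ib_grh) even though nothing is actually
+ * on the wire for loopback.)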
*/ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= RVT_R_REUSE_SGE; + ibp->rvp.n_pkt_drops++; + goto bail_unlock; + } + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + struct ib_grh grh; + const struct ib_global_route *grd = rdma_ah_read_grh(ah_attr); + + qib_make_grh(ibp, &grh, grd, 0, 0); + qib_copy_sge(&qp->r_sge, &grh, + sizeof(grh), 1); + wc.wc_flags |= IB_WC_GRH; + } else + rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); + ssge.sg_list = swqe->sg_list + 1; + ssge.sge = *swqe->sg_list; + ssge.num_sge = swqe->wr.num_sge; + sge = &ssge.sge; + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ssge.num_sge) + *sge = *ssge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto bail_unlock; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; + wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ? + swqe->ud_wr.pkey_index : 0; + wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1)); + wc.sl = rdma_ah_get_sl(ah_attr); + wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); + ibp->rvp.n_loop_pkts++; +bail_unlock: + spin_unlock_irqrestore(&qp->r_lock, flags); +drop: + rcu_read_unlock(); +} + +/** + * qib_make_ud_req - construct a UD request packet + * @qp: the QP + * + * Assumes the s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. + */ +int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) +{ + struct qib_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct qib_pportdata *ppd; + struct qib_ibport *ibp; + struct rvt_swqe *wqe; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u16 lrh0; + u16 lid; + int ret = 0; + int next_cur; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == READ_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&priv->s_dma_busy)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + /* see post_one_send() */ + if (qp->s_cur == READ_ONCE(qp->s_head)) + goto bail; + + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; + + /* Construct the header. 
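+ * The UD header is LRH+BTH+DETH = (8+12+8)/4 = 7 words, plus the
+ * 40-byte GRH (10 more words) when one is needed. extra_bytes pads
+ * the payload to a 4-byte boundary: e.g. a 13-byte payload gets
+ * extra_bytes = 3 and nwords = 4.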
*/ + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + if (rdma_ah_get_dlid(ah_attr) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + if (rdma_ah_get_dlid(ah_attr) != + be16_to_cpu(IB_LID_PERMISSIVE)) + this_cpu_inc(ibp->pmastats->n_multicast_xmit); + else + this_cpu_inc(ibp->pmastats->n_unicast_xmit); + } else { + this_cpu_inc(ibp->pmastats->n_unicast_xmit); + lid = rdma_ah_get_dlid(ah_attr) & ~((1 << ppd->lmc) - 1); + if (unlikely(lid == ppd->lid)) { + unsigned long tflags = *flags; + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * XXX Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (atomic_read(&priv->s_dma_busy)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, tflags); + qib_ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, tflags); + *flags = tflags; + qib_send_complete(qp, wqe, IB_WC_SUCCESS); + goto done; + } + } + + qp->s_cur = next_cur; + extra_bytes = -wqe->length & 3; + nwords = (wqe->length + extra_bytes) >> 2; + + /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + qp->s_cur_size = wqe->length; + qp->s_cur_sge = &qp->s_sge; + qp->s_srate = rdma_ah_get_static_rate(ah_attr); + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + /* Header size in 32-bit words. */ + qp->s_hdrwords += qib_make_grh(ibp, &priv->s_hdr->u.l.grh, + rdma_ah_read_grh(ah_attr), + qp->s_hdrwords, nwords); + lrh0 = QIB_LRH_GRH; + ohdr = &priv->s_hdr->u.l.oth; + /* + * Don't worry about sending to locally attached multicast + * QPs. It is unspecified by the spec. what happens. + */ + } else { + /* Header size in 32-bit words. */ + lrh0 = QIB_LRH_BTH; + ohdr = &priv->s_hdr->u.oth; + } + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_hdrwords++; + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + lrh0 |= rdma_ah_get_sl(ah_attr) << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + else + lrh0 |= ibp->sl_to_vl[rdma_ah_get_sl(ah_attr)] << 12; + priv->s_hdr->lrh[0] = cpu_to_be16(lrh0); + priv->s_hdr->lrh[1] = + cpu_to_be16(rdma_ah_get_dlid(ah_attr)); /* DEST LID */ + priv->s_hdr->lrh[2] = + cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + lid = ppd->lid; + if (lid) { + lid |= rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1); + priv->s_hdr->lrh[3] = cpu_to_be16(lid); + } else + priv->s_hdr->lrh[3] = IB_LID_PERMISSIVE; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth0 |= extra_bytes << 20; + bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY : + qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ? + wqe->ud_wr.pkey_index : qp->s_pkey_index); + ohdr->bth[0] = cpu_to_be32(bth0); + /* + * Use the multicast QP if the destination LID is a multicast LID. + */ + ohdr->bth[1] = rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE) && + rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE) ? 
+ cpu_to_be32(QIB_MULTICAST_QPN) : + cpu_to_be32(wqe->ud_wr.remote_qpn); + ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? + qp->qkey : wqe->ud_wr.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); + +done: + return 1; +bail: + qp->s_flags &= ~RVT_S_BUSY; + return ret; +} + +static unsigned qib_lookup_pkey(struct qib_ibport *ibp, u16 pkey) +{ + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = ppd->dd; + unsigned ctxt = ppd->hw_pidx; + unsigned i; + + pkey &= 0x7fff; /* remove limited/full membership bit */ + + for (i = 0; i < ARRAY_SIZE(dd->rcd[ctxt]->pkeys); ++i) + if ((dd->rcd[ctxt]->pkeys[i] & 0x7fff) == pkey) + return i; + + /* + * Should not get here, this means hardware failed to validate pkeys. + * Punt and return index 0. + */ + return 0; +} + +/** + * qib_ud_rcv - receive an incoming UD packet + * @ibp: the port the packet came in on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from qib_qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, + int has_grh, void *data, u32 tlen, struct rvt_qp *qp) +{ + struct ib_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 pad; + struct ib_wc wc; + u32 qkey; + u32 src_qp; + u16 dlid; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */ + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */ + } + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; + + /* + * Get the number of bytes the message was padded by + * and drop incomplete packets. + */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + + tlen -= hdrsize + pad + 4; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). + */ + if (qp->ibqp.qp_num) { + if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE)) + goto drop; + if (qp->ibqp.qp_num > 1) { + u16 pkey1, pkey2; + + pkey1 = be32_to_cpu(ohdr->bth[0]); + pkey2 = qib_get_pkey(ibp, qp->s_pkey_index); + if (unlikely(!qib_pkey_ok(pkey1, pkey2))) { + qib_bad_pkey(ibp, + pkey1, + (be16_to_cpu(hdr->lrh[0]) >> 4) & + 0xF, + src_qp, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + return; + } + } + if (unlikely(qkey != qp->qkey)) + return; + + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (unlikely(qp->ibqp.qp_num == 1 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) == 15))) + goto drop; + } else { + struct ib_smp *smp; + + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (tlen != 256 || (be16_to_cpu(hdr->lrh[0]) >> 12) != 15) + goto drop; + smp = (struct ib_smp *) data; + if ((hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE) && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + goto drop; + } + + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). 
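+ * (The sender builds bth[0] as cpu_to_be32(opcode << 24 | ...), so
+ * after be32_to_cpu() a simple ">> 24" recovers the opcode.)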
+ */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + tlen -= sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else + goto drop; + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & RVT_R_REUSE_SGE) + qp->r_flags &= ~RVT_R_REUSE_SGE; + else { + int ret; + + ret = qib_get_rwqe(qp, 0); + if (ret < 0) { + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->rvp.n_vl15_dropped++; + return; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= RVT_R_REUSE_SGE; + goto drop; + } + if (has_grh) { + qib_copy_sge(&qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh), 1); + wc.wc_flags |= IB_WC_GRH; + } else + rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); + qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + return; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = src_qp; + wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ? + qib_lookup_pkey(ibp, be32_to_cpu(ohdr->bth[0])) : 0; + wc.slid = be16_to_cpu(hdr->lrh[3]); + wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF; + dlid = be16_to_cpu(hdr->lrh[1]); + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 : + dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + ib_bth_is_solicited(ohdr)); + return; + +drop: + ibp->rvp.n_pkt_drops++; +} diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c new file mode 100644 index 0000000..ce83ba9 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "qib.h" + +static void __qib_release_user_pages(struct page **p, size_t num_pages, + int dirty) +{ + size_t i; + + for (i = 0; i < num_pages; i++) { + if (dirty) + set_page_dirty_lock(p[i]); + put_page(p[i]); + } +} + +/* + * Call with current->mm->mmap_sem held. + */ +static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + unsigned long lock_limit; + size_t got; + int ret; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto bail; + } + + for (got = 0; got < num_pages; got += ret) { + ret = get_user_pages(start_page + got * PAGE_SIZE, + num_pages - got, + FOLL_WRITE | FOLL_FORCE, + p + got, NULL); + if (ret < 0) + goto bail_release; + } + + current->mm->pinned_vm += num_pages; + + ret = 0; + goto bail; + +bail_release: + __qib_release_user_pages(p, got, 0); +bail: + return ret; +} + +/** + * qib_map_page - a safety wrapper around pci_map_page() + * + * A dma_addr of all 0's is interpreted by the chip as "disabled". + * Unfortunately, it can also be a valid dma_addr returned on some + * architectures. + * + * The powerpc iommu assigns dma_addrs in ascending order, so we don't + * have to bother with retries or mapping a dummy page to insure we + * don't just get the same mapping again. + * + * I'm sure we won't be so lucky with other iommu's, so FIXME. + */ +dma_addr_t qib_map_page(struct pci_dev *hwdev, struct page *page, + unsigned long offset, size_t size, int direction) +{ + dma_addr_t phys; + + phys = pci_map_page(hwdev, page, offset, size, direction); + + if (phys == 0) { + pci_unmap_page(hwdev, phys, size, direction); + phys = pci_map_page(hwdev, page, offset, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** + * qib_get_user_pages - lock user pages into memory + * @start_page: the start page + * @num_pages: the number of pages + * @p: the output page structures + * + * This function takes a given start page (page aligned user virtual + * address) and pins it and the following specified number of pages. For + * now, num_pages is always 1, but that will probably change at some point + * (because caller is doing expected sends on a single virtually contiguous + * buffer, so we can do all pages at once). 
*/ +int qib_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + int ret; + + down_write(&current->mm->mmap_sem); + + ret = __qib_get_user_pages(start_page, num_pages, p); + + up_write(&current->mm->mmap_sem); + + return ret; +} + +void qib_release_user_pages(struct page **p, size_t num_pages) +{ + if (current->mm) /* during close after signal, mm can be NULL */ + down_write(&current->mm->mmap_sem); + + __qib_release_user_pages(p, num_pages, 1); + + if (current->mm) { + current->mm->pinned_vm -= num_pages; + up_write(&current->mm->mmap_sem); + } +} diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c new file mode 100644 index 0000000..926f3c8 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -0,0 +1,1465 @@ +/* + * Copyright (c) 2007, 2008, 2009 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_user_sdma.h" + +/* minimum size of header */ +#define QIB_USER_SDMA_MIN_HEADER_LENGTH 64 +/* expected size of headers (for dma_pool) */ +#define QIB_USER_SDMA_EXP_HEADER_LENGTH 64 +/* attempt to drain the queue for 5secs */ +#define QIB_USER_SDMA_DRAIN_TIMEOUT 250 + +/* + * track how many times a process open this driver. + */ +static struct rb_root qib_user_sdma_rb_root = RB_ROOT; + +struct qib_user_sdma_rb_node { + struct rb_node node; + int refcount; + pid_t pid; +}; + +struct qib_user_sdma_pkt { + struct list_head list; /* list element */ + + u8 tiddma; /* if this is NEW tid-sdma */ + u8 largepkt; /* this is large pkt from kmalloc */ + u16 frag_size; /* frag size used by PSM */ + u16 index; /* last header index or push index */ + u16 naddr; /* dimension of addr (1..3) ...
*/ + u16 addrlimit; /* addr array size */ + u16 tidsmidx; /* current tidsm index */ + u16 tidsmcount; /* tidsm array item count */ + u16 payload_size; /* payload size so far for header */ + u32 bytes_togo; /* bytes for processing */ + u32 counter; /* sdma pkts queued counter for this entry */ + struct qib_tid_session_member *tidsm; /* tid session member array */ + struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */ + u64 added; /* global descq number of entries */ + + struct { + u16 offset; /* offset for kvaddr, addr */ + u16 length; /* length in page */ + u16 first_desc; /* first desc */ + u16 last_desc; /* last desc */ + u16 put_page; /* should we put_page? */ + u16 dma_mapped; /* is page dma_mapped? */ + u16 dma_length; /* for dma_unmap_page() */ + u16 padding; + struct page *page; /* may be NULL (coherent mem) */ + void *kvaddr; /* FIXME: only for pio hack */ + dma_addr_t addr; + } addr[4]; /* max pages, any more and we coalesce */ +}; + +struct qib_user_sdma_queue { + /* + * pkts sent to dma engine are queued on this + * list head. the type of the elements of this + * list are struct qib_user_sdma_pkt... + */ + struct list_head sent; + + /* + * Because above list will be accessed by both process and + * signal handler, we need a spinlock for it. + */ + spinlock_t sent_lock ____cacheline_aligned_in_smp; + + /* headers with expected length are allocated from here... */ + char header_cache_name[64]; + struct dma_pool *header_cache; + + /* packets are allocated from the slab cache... */ + char pkt_slab_name[64]; + struct kmem_cache *pkt_slab; + + /* as packets go on the queued queue, they are counted... */ + u32 counter; + u32 sent_counter; + /* pending packets, not sending yet */ + u32 num_pending; + /* sending packets, not complete yet */ + u32 num_sending; + /* global descq number of entry of last sending packet */ + u64 added; + + /* dma page table */ + struct rb_root dma_pages_root; + + struct qib_user_sdma_rb_node *sdma_rb_node; + + /* protect everything above... 
*/ + struct mutex lock; +}; + +static struct qib_user_sdma_rb_node * +qib_user_sdma_rb_search(struct rb_root *root, pid_t pid) +{ + struct qib_user_sdma_rb_node *sdma_rb_node; + struct rb_node *node = root->rb_node; + + while (node) { + sdma_rb_node = rb_entry(node, struct qib_user_sdma_rb_node, + node); + if (pid < sdma_rb_node->pid) + node = node->rb_left; + else if (pid > sdma_rb_node->pid) + node = node->rb_right; + else + return sdma_rb_node; + } + return NULL; +} + +static int +qib_user_sdma_rb_insert(struct rb_root *root, struct qib_user_sdma_rb_node *new) +{ + struct rb_node **node = &(root->rb_node); + struct rb_node *parent = NULL; + struct qib_user_sdma_rb_node *got; + + while (*node) { + got = rb_entry(*node, struct qib_user_sdma_rb_node, node); + parent = *node; + if (new->pid < got->pid) + node = &((*node)->rb_left); + else if (new->pid > got->pid) + node = &((*node)->rb_right); + else + return 0; + } + + rb_link_node(&new->node, parent, node); + rb_insert_color(&new->node, root); + return 1; +} + +struct qib_user_sdma_queue * +qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt) +{ + struct qib_user_sdma_queue *pq = + kmalloc(sizeof(struct qib_user_sdma_queue), GFP_KERNEL); + struct qib_user_sdma_rb_node *sdma_rb_node; + + if (!pq) + goto done; + + pq->counter = 0; + pq->sent_counter = 0; + pq->num_pending = 0; + pq->num_sending = 0; + pq->added = 0; + pq->sdma_rb_node = NULL; + + INIT_LIST_HEAD(&pq->sent); + spin_lock_init(&pq->sent_lock); + mutex_init(&pq->lock); + + snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name), + "qib-user-sdma-pkts-%u-%02u.%02u", unit, ctxt, sctxt); + pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name, + sizeof(struct qib_user_sdma_pkt), + 0, 0, NULL); + + if (!pq->pkt_slab) + goto err_kfree; + + snprintf(pq->header_cache_name, sizeof(pq->header_cache_name), + "qib-user-sdma-headers-%u-%02u.%02u", unit, ctxt, sctxt); + pq->header_cache = dma_pool_create(pq->header_cache_name, + dev, + QIB_USER_SDMA_EXP_HEADER_LENGTH, + 4, 0); + if (!pq->header_cache) + goto err_slab; + + pq->dma_pages_root = RB_ROOT; + + sdma_rb_node = qib_user_sdma_rb_search(&qib_user_sdma_rb_root, + current->pid); + if (sdma_rb_node) { + sdma_rb_node->refcount++; + } else { + int ret; + + sdma_rb_node = kmalloc(sizeof( + struct qib_user_sdma_rb_node), GFP_KERNEL); + if (!sdma_rb_node) + goto err_rb; + + sdma_rb_node->refcount = 1; + sdma_rb_node->pid = current->pid; + + ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root, + sdma_rb_node); + BUG_ON(ret == 0); + } + pq->sdma_rb_node = sdma_rb_node; + + goto done; + +err_rb: + dma_pool_destroy(pq->header_cache); +err_slab: + kmem_cache_destroy(pq->pkt_slab); +err_kfree: + kfree(pq); + pq = NULL; + +done: + return pq; +} + +static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt, + int i, u16 offset, u16 len, + u16 first_desc, u16 last_desc, + u16 put_page, u16 dma_mapped, + struct page *page, void *kvaddr, + dma_addr_t dma_addr, u16 dma_length) +{ + pkt->addr[i].offset = offset; + pkt->addr[i].length = len; + pkt->addr[i].first_desc = first_desc; + pkt->addr[i].last_desc = last_desc; + pkt->addr[i].put_page = put_page; + pkt->addr[i].dma_mapped = dma_mapped; + pkt->addr[i].page = page; + pkt->addr[i].kvaddr = kvaddr; + pkt->addr[i].addr = dma_addr; + pkt->addr[i].dma_length = dma_length; +} + +static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, + size_t len, dma_addr_t *dma_addr) +{ + void *hdr; + + if (len == QIB_USER_SDMA_EXP_HEADER_LENGTH) + hdr = 
dma_pool_alloc(pq->header_cache, GFP_KERNEL, + dma_addr); + else + hdr = NULL; + + if (!hdr) { + hdr = kmalloc(len, GFP_KERNEL); + if (!hdr) + return NULL; + + *dma_addr = 0; + } + + return hdr; +} + +static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + struct page *page, u16 put, + u16 offset, u16 len, void *kvaddr) +{ + __le16 *pbc16; + void *pbcvaddr; + struct qib_message_header *hdr; + u16 newlen, pbclen, lastdesc, dma_mapped; + u32 vcto; + union qib_seqnum seqnum; + dma_addr_t pbcdaddr; + dma_addr_t dma_addr = + dma_map_page(&dd->pcidev->dev, + page, offset, len, DMA_TO_DEVICE); + int ret = 0; + + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + /* + * dma mapping error, pkt has not managed + * this page yet, return the page here so + * the caller can ignore this page. + */ + if (put) { + put_page(page); + } else { + /* coalesce case */ + kunmap(page); + __free_page(page); + } + ret = -ENOMEM; + goto done; + } + offset = 0; + dma_mapped = 1; + + +next_fragment: + + /* + * In tid-sdma, the transfer length is restricted by + * receiver side current tid page length. + */ + if (pkt->tiddma && len > pkt->tidsm[pkt->tidsmidx].length) + newlen = pkt->tidsm[pkt->tidsmidx].length; + else + newlen = len; + + /* + * Then the transfer length is restricted by MTU. + * the last descriptor flag is determined by: + * 1. the current packet is at frag size length. + * 2. the current tid page is done if tid-sdma. + * 3. there is no more byte togo if sdma. + */ + lastdesc = 0; + if ((pkt->payload_size + newlen) >= pkt->frag_size) { + newlen = pkt->frag_size - pkt->payload_size; + lastdesc = 1; + } else if (pkt->tiddma) { + if (newlen == pkt->tidsm[pkt->tidsmidx].length) + lastdesc = 1; + } else { + if (newlen == pkt->bytes_togo) + lastdesc = 1; + } + + /* fill the next fragment in this page */ + qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */ + offset, newlen, /* offset, len */ + 0, lastdesc, /* first last desc */ + put, dma_mapped, /* put page, dma mapped */ + page, kvaddr, /* struct page, virt addr */ + dma_addr, len); /* dma addr, dma length */ + pkt->bytes_togo -= newlen; + pkt->payload_size += newlen; + pkt->naddr++; + if (pkt->naddr == pkt->addrlimit) { + ret = -EFAULT; + goto done; + } + + /* If there is no more byte togo. (lastdesc==1) */ + if (pkt->bytes_togo == 0) { + /* The packet is done, header is not dma mapped yet. + * it should be from kmalloc */ + if (!pkt->addr[pkt->index].addr) { + pkt->addr[pkt->index].addr = + dma_map_single(&dd->pcidev->dev, + pkt->addr[pkt->index].kvaddr, + pkt->addr[pkt->index].dma_length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + pkt->addr[pkt->index].addr)) { + ret = -ENOMEM; + goto done; + } + pkt->addr[pkt->index].dma_mapped = 1; + } + + goto done; + } + + /* If tid-sdma, advance tid info. */ + if (pkt->tiddma) { + pkt->tidsm[pkt->tidsmidx].length -= newlen; + if (pkt->tidsm[pkt->tidsmidx].length) { + pkt->tidsm[pkt->tidsmidx].offset += newlen; + } else { + pkt->tidsmidx++; + if (pkt->tidsmidx == pkt->tidsmcount) { + ret = -EFAULT; + goto done; + } + } + } + + /* + * If this is NOT the last descriptor. (newlen==len) + * the current packet is not done yet, but the current + * send side page is done. + */ + if (lastdesc == 0) + goto done; + + /* + * If running this driver under PSM with message size + * fitting into one transfer unit, it is not possible + * to pass this line. otherwise, it is a buggggg. 
+ */ + + /* + * Since the current packet is done, and there are more + * bytes togo, we need to create a new sdma header, copying + * from previous sdma header and modify both. + */ + pbclen = pkt->addr[pkt->index].length; + pbcvaddr = qib_user_sdma_alloc_header(pq, pbclen, &pbcdaddr); + if (!pbcvaddr) { + ret = -ENOMEM; + goto done; + } + /* Copy the previous sdma header to new sdma header */ + pbc16 = (__le16 *)pkt->addr[pkt->index].kvaddr; + memcpy(pbcvaddr, pbc16, pbclen); + + /* Modify the previous sdma header */ + hdr = (struct qib_message_header *)&pbc16[4]; + + /* New pbc length */ + pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->bytes_togo>>2)); + + /* New packet length */ + hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0])); + + if (pkt->tiddma) { + /* turn on the header suppression */ + hdr->iph.pkt_flags = + cpu_to_le16(le16_to_cpu(hdr->iph.pkt_flags)|0x2); + /* turn off ACK_REQ: 0x04 and EXPECTED_DONE: 0x20 */ + hdr->flags &= ~(0x04|0x20); + } else { + /* turn off extra bytes: 20-21 bits */ + hdr->bth[0] = cpu_to_be32(be32_to_cpu(hdr->bth[0])&0xFFCFFFFF); + /* turn off ACK_REQ: 0x04 */ + hdr->flags &= ~(0x04); + } + + /* New kdeth checksum */ + vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset); + hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH + + be16_to_cpu(hdr->lrh[2]) - + ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) - + le16_to_cpu(hdr->iph.pkt_flags)); + + /* The packet is done, header is not dma mapped yet. + * it should be from kmalloc */ + if (!pkt->addr[pkt->index].addr) { + pkt->addr[pkt->index].addr = + dma_map_single(&dd->pcidev->dev, + pkt->addr[pkt->index].kvaddr, + pkt->addr[pkt->index].dma_length, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + pkt->addr[pkt->index].addr)) { + ret = -ENOMEM; + goto done; + } + pkt->addr[pkt->index].dma_mapped = 1; + } + + /* Modify the new sdma header */ + pbc16 = (__le16 *)pbcvaddr; + hdr = (struct qib_message_header *)&pbc16[4]; + + /* New pbc length */ + pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->payload_size>>2)); + + /* New packet length */ + hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0])); + + if (pkt->tiddma) { + /* Set new tid and offset for new sdma header */ + hdr->iph.ver_ctxt_tid_offset = cpu_to_le32( + (le32_to_cpu(hdr->iph.ver_ctxt_tid_offset)&0xFF000000) + + (pkt->tidsm[pkt->tidsmidx].tid<<QLOGIC_IB_I_TID_SHIFT) + + (pkt->tidsm[pkt->tidsmidx].offset>>2)); + } else { + /* Middle protocol new packet offset */ + hdr->uwords[2] += pkt->payload_size; + } + + /* New kdeth checksum */ + vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset); + hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH + + be16_to_cpu(hdr->lrh[2]) - + ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) - + le16_to_cpu(hdr->iph.pkt_flags)); + + /* Next sequence number in new sdma header */ + seqnum.val = be32_to_cpu(hdr->bth[2]); + if (pkt->tiddma) + seqnum.seq++; + else + seqnum.pkt++; + hdr->bth[2] = cpu_to_be32(seqnum.val); + + /* Init new sdma header.
*/ + qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */ + 0, pbclen, /* offset, len */ + 1, 0, /* first last desc */ + 0, 0, /* put page, dma mapped */ + NULL, pbcvaddr, /* struct page, virt addr */ + pbcdaddr, pbclen); /* dma addr, dma length */ + pkt->index = pkt->naddr; + pkt->payload_size = 0; + pkt->naddr++; + if (pkt->naddr == pkt->addrlimit) { + ret = -EFAULT; + goto done; + } + + /* Prepare for next fragment in this page */ + if (newlen != len) { + if (dma_mapped) { + put = 0; + dma_mapped = 0; + page = NULL; + kvaddr = NULL; + } + len -= newlen; + offset += newlen; + + goto next_fragment; + } + +done: + return ret; +} + +/* we've too many pages in the iovec, coalesce to a single page */ +static int qib_user_sdma_coalesce(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) +{ + int ret = 0; + struct page *page = alloc_page(GFP_KERNEL); + void *mpage_save; + char *mpage; + int i; + int len = 0; + + if (!page) { + ret = -ENOMEM; + goto done; + } + + mpage = kmap(page); + mpage_save = mpage; + for (i = 0; i < niov; i++) { + int cfur; + + cfur = copy_from_user(mpage, + iov[i].iov_base, iov[i].iov_len); + if (cfur) { + ret = -EFAULT; + goto free_unmap; + } + + mpage += iov[i].iov_len; + len += iov[i].iov_len; + } + + ret = qib_user_sdma_page_to_frags(dd, pq, pkt, + page, 0, 0, len, mpage_save); + goto done; + +free_unmap: + kunmap(page); + __free_page(page); +done: + return ret; +} + +/* + * How many pages in this iovec element? + */ +static int qib_user_sdma_num_pages(const struct iovec *iov) +{ + const unsigned long addr = (unsigned long) iov->iov_base; + const unsigned long len = iov->iov_len; + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +static void qib_user_sdma_free_pkt_frag(struct device *dev, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + int frag) +{ + const int i = frag; + + if (pkt->addr[i].page) { + /* only user data has page */ + if (pkt->addr[i].dma_mapped) + dma_unmap_page(dev, + pkt->addr[i].addr, + pkt->addr[i].dma_length, + DMA_TO_DEVICE); + + if (pkt->addr[i].kvaddr) + kunmap(pkt->addr[i].page); + + if (pkt->addr[i].put_page) + put_page(pkt->addr[i].page); + else + __free_page(pkt->addr[i].page); + } else if (pkt->addr[i].kvaddr) { + /* for headers */ + if (pkt->addr[i].dma_mapped) { + /* from kmalloc & dma mapped */ + dma_unmap_single(dev, + pkt->addr[i].addr, + pkt->addr[i].dma_length, + DMA_TO_DEVICE); + kfree(pkt->addr[i].kvaddr); + } else if (pkt->addr[i].addr) { + /* free coherent mem from cache... */ + dma_pool_free(pq->header_cache, + pkt->addr[i].kvaddr, pkt->addr[i].addr); + } else { + /* from kmalloc but not dma mapped */ + kfree(pkt->addr[i].kvaddr); + } + } +} + +/* return number of pages pinned... */ +static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + unsigned long addr, int tlen, int npages) +{ + struct page *pages[8]; + int i, j; + int ret = 0; + + while (npages) { + if (npages > 8) + j = 8; + else + j = npages; + + ret = get_user_pages_fast(addr, j, 0, pages); + if (ret != j) { + i = 0; + j = ret; + ret = -ENOMEM; + goto free_pages; + } + + for (i = 0; i < j; i++) { + /* map the pages... */ + unsigned long fofs = addr & ~PAGE_MASK; + int flen = ((fofs + tlen) > PAGE_SIZE) ? 
+ (PAGE_SIZE - fofs) : tlen; + + ret = qib_user_sdma_page_to_frags(dd, pq, pkt, + pages[i], 1, fofs, flen, NULL); + if (ret < 0) { + /* current page has beed taken + * care of inside above call. + */ + i++; + goto free_pages; + } + + addr += flen; + tlen -= flen; + } + + npages -= j; + } + + goto done; + + /* if error, return all pages not managed by pkt */ +free_pages: + while (i < j) + put_page(pages[i++]); + +done: + return ret; +} + +static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) +{ + int ret = 0; + unsigned long idx; + + for (idx = 0; idx < niov; idx++) { + const int npages = qib_user_sdma_num_pages(iov + idx); + const unsigned long addr = (unsigned long) iov[idx].iov_base; + + ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr, + iov[idx].iov_len, npages); + if (ret < 0) + goto free_pkt; + } + + goto done; + +free_pkt: + /* we need to ignore the first entry here */ + for (idx = 1; idx < pkt->naddr; idx++) + qib_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx); + + /* need to dma unmap the first entry, this is to restore to + * the original state so that caller can free the memory in + * error condition. Caller does not know if dma mapped or not*/ + if (pkt->addr[0].dma_mapped) { + dma_unmap_single(&dd->pcidev->dev, + pkt->addr[0].addr, + pkt->addr[0].dma_length, + DMA_TO_DEVICE); + pkt->addr[0].addr = 0; + pkt->addr[0].dma_mapped = 0; + } + +done: + return ret; +} + +static int qib_user_sdma_init_payload(const struct qib_devdata *dd, + struct qib_user_sdma_queue *pq, + struct qib_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov, int npages) +{ + int ret = 0; + + if (pkt->frag_size == pkt->bytes_togo && + npages >= ARRAY_SIZE(pkt->addr)) + ret = qib_user_sdma_coalesce(dd, pq, pkt, iov, niov); + else + ret = qib_user_sdma_pin_pkt(dd, pq, pkt, iov, niov); + + return ret; +} + +/* free a packet list -- return counter value of last packet */ +static void qib_user_sdma_free_pkt_list(struct device *dev, + struct qib_user_sdma_queue *pq, + struct list_head *list) +{ + struct qib_user_sdma_pkt *pkt, *pkt_next; + + list_for_each_entry_safe(pkt, pkt_next, list, list) { + int i; + + for (i = 0; i < pkt->naddr; i++) + qib_user_sdma_free_pkt_frag(dev, pq, pkt, i); + + if (pkt->largepkt) + kfree(pkt); + else + kmem_cache_free(pq->pkt_slab, pkt); + } + INIT_LIST_HEAD(list); +} + +/* + * copy headers, coalesce etc -- pq->lock must be held + * + * we queue all the packets to list, returning the + * number of bytes total. list must be empty initially, + * as, if there is an error we clean it... 
+ */ +static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd, + struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long niov, + struct list_head *list, + int *maxpkts, int *ndesc) +{ + unsigned long idx = 0; + int ret = 0; + int npkts = 0; + __le32 *pbc; + dma_addr_t dma_addr; + struct qib_user_sdma_pkt *pkt = NULL; + size_t len; + size_t nw; + u32 counter = pq->counter; + u16 frag_size; + + while (idx < niov && npkts < *maxpkts) { + const unsigned long addr = (unsigned long) iov[idx].iov_base; + const unsigned long idx_save = idx; + unsigned pktnw; + unsigned pktnwc; + int nfrags = 0; + int npages = 0; + int bytes_togo = 0; + int tiddma = 0; + int cfur; + + len = iov[idx].iov_len; + nw = len >> 2; + + if (len < QIB_USER_SDMA_MIN_HEADER_LENGTH || + len > PAGE_SIZE || len & 3 || addr & 3) { + ret = -EINVAL; + goto free_list; + } + + pbc = qib_user_sdma_alloc_header(pq, len, &dma_addr); + if (!pbc) { + ret = -ENOMEM; + goto free_list; + } + + cfur = copy_from_user(pbc, iov[idx].iov_base, len); + if (cfur) { + ret = -EFAULT; + goto free_pbc; + } + + /* + * This assignment is a bit strange. it's because the + * the pbc counts the number of 32 bit words in the full + * packet _except_ the first word of the pbc itself... + */ + pktnwc = nw - 1; + + /* + * pktnw computation yields the number of 32 bit words + * that the caller has indicated in the PBC. note that + * this is one less than the total number of words that + * goes to the send DMA engine as the first 32 bit word + * of the PBC itself is not counted. Armed with this count, + * we can verify that the packet is consistent with the + * iovec lengths. + */ + pktnw = le32_to_cpu(*pbc) & 0xFFFF; + if (pktnw < pktnwc) { + ret = -EINVAL; + goto free_pbc; + } + + idx++; + while (pktnwc < pktnw && idx < niov) { + const size_t slen = iov[idx].iov_len; + const unsigned long faddr = + (unsigned long) iov[idx].iov_base; + + if (slen & 3 || faddr & 3 || !slen) { + ret = -EINVAL; + goto free_pbc; + } + + npages += qib_user_sdma_num_pages(&iov[idx]); + + bytes_togo += slen; + pktnwc += slen >> 2; + idx++; + nfrags++; + } + + if (pktnwc != pktnw) { + ret = -EINVAL; + goto free_pbc; + } + + frag_size = ((le32_to_cpu(*pbc))>>16) & 0xFFFF; + if (((frag_size ? frag_size : bytes_togo) + len) > + ppd->ibmaxlen) { + ret = -EINVAL; + goto free_pbc; + } + + if (frag_size) { + int pktsize, tidsmsize, n; + + n = npages*((2*PAGE_SIZE/frag_size)+1); + pktsize = sizeof(*pkt) + sizeof(pkt->addr[0])*n; + + /* + * Determine if this is tid-sdma or just sdma. + */ + tiddma = (((le32_to_cpu(pbc[7])>> + QLOGIC_IB_I_TID_SHIFT)& + QLOGIC_IB_I_TID_MASK) != + QLOGIC_IB_I_TID_MASK); + + if (tiddma) + tidsmsize = iov[idx].iov_len; + else + tidsmsize = 0; + + pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_pbc; + } + pkt->largepkt = 1; + pkt->frag_size = frag_size; + pkt->addrlimit = n + ARRAY_SIZE(pkt->addr); + + if (tiddma) { + char *tidsm = (char *)pkt + pktsize; + + cfur = copy_from_user(tidsm, + iov[idx].iov_base, tidsmsize); + if (cfur) { + ret = -EFAULT; + goto free_pkt; + } + pkt->tidsm = + (struct qib_tid_session_member *)tidsm; + pkt->tidsmcount = tidsmsize/ + sizeof(struct qib_tid_session_member); + pkt->tidsmidx = 0; + idx++; + } + + /* + * pbc 'fill1' field is borrowed to pass frag size, + * we need to clear it after picking frag size, the + * hardware requires this field to be zero. 
+ */ + *pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF); + } else { + pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_pbc; + } + pkt->largepkt = 0; + pkt->frag_size = bytes_togo; + pkt->addrlimit = ARRAY_SIZE(pkt->addr); + } + pkt->bytes_togo = bytes_togo; + pkt->payload_size = 0; + pkt->counter = counter; + pkt->tiddma = tiddma; + + /* setup the first header */ + qib_user_sdma_init_frag(pkt, 0, /* index */ + 0, len, /* offset, len */ + 1, 0, /* first last desc */ + 0, 0, /* put page, dma mapped */ + NULL, pbc, /* struct page, virt addr */ + dma_addr, len); /* dma addr, dma length */ + pkt->index = 0; + pkt->naddr = 1; + + if (nfrags) { + ret = qib_user_sdma_init_payload(dd, pq, pkt, + iov + idx_save + 1, + nfrags, npages); + if (ret < 0) + goto free_pkt; + } else { + /* since there is no payload, mark the + * header as the last desc. */ + pkt->addr[0].last_desc = 1; + + if (dma_addr == 0) { + /* + * the header is not dma mapped yet. + * it should be from kmalloc. + */ + dma_addr = dma_map_single(&dd->pcidev->dev, + pbc, len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, + dma_addr)) { + ret = -ENOMEM; + goto free_pkt; + } + pkt->addr[0].addr = dma_addr; + pkt->addr[0].dma_mapped = 1; + } + } + + counter++; + npkts++; + pkt->pq = pq; + pkt->index = 0; /* reset index for push on hw */ + *ndesc += pkt->naddr; + + list_add_tail(&pkt->list, list); + } + + *maxpkts = npkts; + ret = idx; + goto done; + +free_pkt: + if (pkt->largepkt) + kfree(pkt); + else + kmem_cache_free(pq->pkt_slab, pkt); +free_pbc: + if (dma_addr) + dma_pool_free(pq->header_cache, pbc, dma_addr); + else + kfree(pbc); +free_list: + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list); +done: + return ret; +} + +static void qib_user_sdma_set_complete_counter(struct qib_user_sdma_queue *pq, + u32 c) +{ + pq->sent_counter = c; +} + +/* try to clean out queue -- needs pq->lock */ +static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq) +{ + struct qib_devdata *dd = ppd->dd; + struct list_head free_list; + struct qib_user_sdma_pkt *pkt; + struct qib_user_sdma_pkt *pkt_prev; + unsigned long flags; + int ret = 0; + + if (!pq->num_sending) + return 0; + + INIT_LIST_HEAD(&free_list); + + /* + * We need this spin lock here because interrupt handler + * might modify this list in qib_user_sdma_send_desc(), also + * we can not get interrupted, otherwise it is a deadlock. 
+ */ + spin_lock_irqsave(&pq->sent_lock, flags); + list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) { + s64 descd = ppd->sdma_descq_removed - pkt->added; + + if (descd < 0) + break; + + list_move_tail(&pkt->list, &free_list); + + /* one more packet cleaned */ + ret++; + pq->num_sending--; + } + spin_unlock_irqrestore(&pq->sent_lock, flags); + + if (!list_empty(&free_list)) { + u32 counter; + + pkt = list_entry(free_list.prev, + struct qib_user_sdma_pkt, list); + counter = pkt->counter; + + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + qib_user_sdma_set_complete_counter(pq, counter); + } + + return ret; +} + +void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq) +{ + if (!pq) + return; + + pq->sdma_rb_node->refcount--; + if (pq->sdma_rb_node->refcount == 0) { + rb_erase(&pq->sdma_rb_node->node, &qib_user_sdma_rb_root); + kfree(pq->sdma_rb_node); + } + dma_pool_destroy(pq->header_cache); + kmem_cache_destroy(pq->pkt_slab); + kfree(pq); +} + +/* clean descriptor queue, returns > 0 if some elements cleaned */ +static int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&ppd->sdma_lock, flags); + ret = qib_sdma_make_progress(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + return ret; +} + +/* we're in close, drain packets so that we can cleanup successfully... */ +void qib_user_sdma_queue_drain(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq) +{ + struct qib_devdata *dd = ppd->dd; + unsigned long flags; + int i; + + if (!pq) + return; + + for (i = 0; i < QIB_USER_SDMA_DRAIN_TIMEOUT; i++) { + mutex_lock(&pq->lock); + if (!pq->num_pending && !pq->num_sending) { + mutex_unlock(&pq->lock); + break; + } + qib_user_sdma_hwqueue_clean(ppd); + qib_user_sdma_queue_clean(ppd, pq); + mutex_unlock(&pq->lock); + msleep(20); + } + + if (pq->num_pending || pq->num_sending) { + struct qib_user_sdma_pkt *pkt; + struct qib_user_sdma_pkt *pkt_prev; + struct list_head free_list; + + mutex_lock(&pq->lock); + spin_lock_irqsave(&ppd->sdma_lock, flags); + /* + * Since we hold sdma_lock, it is safe without sent_lock. 
+ */ + if (pq->num_pending) { + list_for_each_entry_safe(pkt, pkt_prev, + &ppd->sdma_userpending, list) { + if (pkt->pq == pq) { + list_move_tail(&pkt->list, &pq->sent); + pq->num_pending--; + pq->num_sending++; + } + } + } + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + + qib_dev_err(dd, "user sdma lists not empty: forcing!\n"); + INIT_LIST_HEAD(&free_list); + list_splice_init(&pq->sent, &free_list); + pq->num_sending = 0; + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + mutex_unlock(&pq->lock); + } +} + +static inline __le64 qib_sdma_make_desc0(u8 gen, + u64 addr, u64 dwlen, u64 dwoffset) +{ + return cpu_to_le64(/* SDmaPhyAddr[31:0] */ + ((addr & 0xfffffffcULL) << 32) | + /* SDmaGeneration[1:0] */ + ((gen & 3ULL) << 30) | + /* SDmaDwordCount[10:0] */ + ((dwlen & 0x7ffULL) << 16) | + /* SDmaBufOffset[12:2] */ + (dwoffset & 0x7ffULL)); +} + +static inline __le64 qib_sdma_make_first_desc0(__le64 descq) +{ + return descq | cpu_to_le64(1ULL << 12); +} + +static inline __le64 qib_sdma_make_last_desc0(__le64 descq) +{ + /* last */ /* dma head */ + return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13); +} + +static inline __le64 qib_sdma_make_desc1(u64 addr) +{ + /* SDmaPhyAddr[47:32] */ + return cpu_to_le64(addr >> 32); +} + +static void qib_user_sdma_send_frag(struct qib_pportdata *ppd, + struct qib_user_sdma_pkt *pkt, int idx, + unsigned ofs, u16 tail, u8 gen) +{ + const u64 addr = (u64) pkt->addr[idx].addr + + (u64) pkt->addr[idx].offset; + const u64 dwlen = (u64) pkt->addr[idx].length / 4; + __le64 *descqp; + __le64 descq0; + + descqp = &ppd->sdma_descq[tail].qw[0]; + + descq0 = qib_sdma_make_desc0(gen, addr, dwlen, ofs); + if (pkt->addr[idx].first_desc) + descq0 = qib_sdma_make_first_desc0(descq0); + if (pkt->addr[idx].last_desc) { + descq0 = qib_sdma_make_last_desc0(descq0); + if (ppd->sdma_intrequest) { + descq0 |= cpu_to_le64(1ULL << 15); + ppd->sdma_intrequest = 0; + } + } + + descqp[0] = descq0; + descqp[1] = qib_sdma_make_desc1(addr); +} + +void qib_user_sdma_send_desc(struct qib_pportdata *ppd, + struct list_head *pktlist) +{ + struct qib_devdata *dd = ppd->dd; + u16 nfree, nsent; + u16 tail, tail_c; + u8 gen, gen_c; + + nfree = qib_sdma_descq_freecnt(ppd); + if (!nfree) + return; + +retry: + nsent = 0; + tail_c = tail = ppd->sdma_descq_tail; + gen_c = gen = ppd->sdma_generation; + while (!list_empty(pktlist)) { + struct qib_user_sdma_pkt *pkt = + list_entry(pktlist->next, struct qib_user_sdma_pkt, + list); + int i, j, c = 0; + unsigned ofs = 0; + u16 dtail = tail; + + for (i = pkt->index; i < pkt->naddr && nfree; i++) { + qib_user_sdma_send_frag(ppd, pkt, i, ofs, tail, gen); + ofs += pkt->addr[i].length >> 2; + + if (++tail == ppd->sdma_descq_cnt) { + tail = 0; + ++gen; + ppd->sdma_intrequest = 1; + } else if (tail == (ppd->sdma_descq_cnt>>1)) { + ppd->sdma_intrequest = 1; + } + nfree--; + if (pkt->addr[i].last_desc == 0) + continue; + + /* + * If the packet is >= 2KB mtu equivalent, we + * have to use the large buffers, and have to + * mark each descriptor as part of a large + * buffer packet. 
+ */ + if (ofs > dd->piosize2kmax_dwords) { + for (j = pkt->index; j <= i; j++) { + ppd->sdma_descq[dtail].qw[0] |= + cpu_to_le64(1ULL << 14); + if (++dtail == ppd->sdma_descq_cnt) + dtail = 0; + } + } + c += i + 1 - pkt->index; + pkt->index = i + 1; /* index for next first */ + tail_c = dtail = tail; + gen_c = gen; + ofs = 0; /* reset for next packet */ + } + + ppd->sdma_descq_added += c; + nsent += c; + if (pkt->index == pkt->naddr) { + pkt->added = ppd->sdma_descq_added; + pkt->pq->added = pkt->added; + pkt->pq->num_pending--; + spin_lock(&pkt->pq->sent_lock); + pkt->pq->num_sending++; + list_move_tail(&pkt->list, &pkt->pq->sent); + spin_unlock(&pkt->pq->sent_lock); + } + if (!nfree || (nsent<<2) > ppd->sdma_descq_cnt) + break; + } + + /* advance the tail on the chip if necessary */ + if (ppd->sdma_descq_tail != tail_c) { + ppd->sdma_generation = gen_c; + dd->f_sdma_update_tail(ppd, tail_c); + } + + if (nfree && !list_empty(pktlist)) + goto retry; +} + +/* pq->lock must be held, get packets on the wire... */ +static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq, + struct list_head *pktlist, int count) +{ + unsigned long flags; + + if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE))) + return -ECOMM; + + /* non-blocking mode */ + if (pq->sdma_rb_node->refcount > 1) { + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (unlikely(!__qib_sdma_running(ppd))) { + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return -ECOMM; + } + pq->num_pending += count; + list_splice_tail_init(pktlist, &ppd->sdma_userpending); + qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return 0; + } + + /* In this case, descriptors from this process are not + * linked to ppd pending queue, interrupt handler + * won't update this process, it is OK to directly + * modify without sdma lock. + */ + + + pq->num_pending += count; + /* + * Blocking mode for single rail process, we must + * release/regain sdma_lock to give other process + * chance to make progress. This is important for + * performance. + */ + do { + spin_lock_irqsave(&ppd->sdma_lock, flags); + if (unlikely(!__qib_sdma_running(ppd))) { + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + return -ECOMM; + } + qib_user_sdma_send_desc(ppd, pktlist); + if (!list_empty(pktlist)) + qib_sdma_make_progress(ppd); + spin_unlock_irqrestore(&ppd->sdma_lock, flags); + } while (!list_empty(pktlist)); + + return 0; +} + +int qib_user_sdma_writev(struct qib_ctxtdata *rcd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim) +{ + struct qib_devdata *dd = rcd->dd; + struct qib_pportdata *ppd = rcd->ppd; + int ret = 0; + struct list_head list; + int npkts = 0; + + INIT_LIST_HEAD(&list); + + mutex_lock(&pq->lock); + + /* why not -ECOMM like qib_user_sdma_push_pkts() below? */ + if (!qib_sdma_running(ppd)) + goto done_unlock; + + /* if I have packets not complete yet */ + if (pq->added > ppd->sdma_descq_removed) + qib_user_sdma_hwqueue_clean(ppd); + /* if I have complete packets to be freed */ + if (pq->num_sending) + qib_user_sdma_queue_clean(ppd, pq); + + while (dim) { + int mxp = 1; + int ndesc = 0; + + ret = qib_user_sdma_queue_pkts(dd, ppd, pq, + iov, dim, &list, &mxp, &ndesc); + if (ret < 0) + goto done_unlock; + else { + dim -= ret; + iov += ret; + } + + /* force packets onto the sdma hw queue... */ + if (!list_empty(&list)) { + /* + * Lazily clean hw queue. 
+ */ + if (qib_sdma_descq_freecnt(ppd) < ndesc) { + qib_user_sdma_hwqueue_clean(ppd); + if (pq->num_sending) + qib_user_sdma_queue_clean(ppd, pq); + } + + ret = qib_user_sdma_push_pkts(ppd, pq, &list, mxp); + if (ret < 0) + goto done_unlock; + else { + npkts += mxp; + pq->counter += mxp; + } + } + } + +done_unlock: + if (!list_empty(&list)) + qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list); + mutex_unlock(&pq->lock); + + return (ret < 0) ? ret : npkts; +} + +int qib_user_sdma_make_progress(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq) +{ + int ret = 0; + + mutex_lock(&pq->lock); + qib_user_sdma_hwqueue_clean(ppd); + ret = qib_user_sdma_queue_clean(ppd, pq); + mutex_unlock(&pq->lock); + + return ret; +} + +u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq) +{ + return pq ? pq->sent_counter : 0; +} + +u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq) +{ + return pq ? pq->counter : 0; +} diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.h b/drivers/infiniband/hw/qib/qib_user_sdma.h new file mode 100644 index 0000000..ce8cbaf --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_user_sdma.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +struct qib_user_sdma_queue; + +struct qib_user_sdma_queue * +qib_user_sdma_queue_create(struct device *dev, int unit, int port, int sport); +void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq); + +int qib_user_sdma_writev(struct qib_ctxtdata *pd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim); + +int qib_user_sdma_make_progress(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq); + +void qib_user_sdma_queue_drain(struct qib_pportdata *ppd, + struct qib_user_sdma_queue *pq); + +u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq); +u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq); diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c new file mode 100644 index 0000000..3977abb --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -0,0 +1,1743 @@ +/* + * Copyright (c) 2012, 2013 Intel Corporation. 
All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qib.h" +#include "qib_common.h" + +static unsigned int ib_qib_qp_table_size = 256; +module_param_named(qp_table_size, ib_qib_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +static unsigned int qib_lkey_table_size = 16; +module_param_named(lkey_table_size, qib_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int ib_qib_max_pds = 0xFFFF; +module_param_named(max_pds, ib_qib_max_pds, uint, S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int ib_qib_max_ahs = 0xFFFF; +module_param_named(max_ahs, ib_qib_max_ahs, uint, S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int ib_qib_max_cqes = 0x2FFFF; +module_param_named(max_cqes, ib_qib_max_cqes, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int ib_qib_max_cqs = 0x1FFFF; +module_param_named(max_cqs, ib_qib_max_cqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int ib_qib_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, ib_qib_max_qp_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int ib_qib_max_qps = 16384; +module_param_named(max_qps, ib_qib_max_qps, uint, S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int ib_qib_max_sges = 0x60; +module_param_named(max_sges, ib_qib_max_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int ib_qib_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, ib_qib_max_mcast_grps, uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int ib_qib_max_mcast_qp_attached = 16; 
+module_param_named(max_mcast_qp_attached, ib_qib_max_mcast_qp_attached, + uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int ib_qib_max_srqs = 1024; +module_param_named(max_srqs, ib_qib_max_srqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int ib_qib_max_srq_sges = 128; +module_param_named(max_srq_sges, ib_qib_max_srq_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int ib_qib_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, ib_qib_max_srq_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +static unsigned int ib_qib_disable_sma; +module_param_named(disable_sma, ib_qib_disable_sma, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(disable_sma, "Disable the SMA"); + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_qib_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD +}; + +/* + * System image GUID. + */ +__be64 ib_qib_sys_image_guid; + +/** + * qib_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + */ +void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, int release) +{ + struct rvt_sge *sge = &ss->sge; + + while (length) { + u32 len = rvt_get_sge_length(sge, length); + + WARN_ON_ONCE(len == 0); + memcpy(sge->vaddr, data, len); + rvt_update_sge(ss, len, release); + data += len; + length -= len; + } +} + +/* + * Count the number of DMA descriptors needed to send length bytes of data. + * Don't modify the qib_sge_state to get the count. + * Return zero if any of the segments is not aligned. + */ +static u32 qib_count_sge(struct rvt_sge_state *ss, u32 length) +{ + struct rvt_sge *sg_list = ss->sg_list; + struct rvt_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 ndesc = 1; /* count the header */ + + while (length) { + u32 len = sge.length; + + if (len > length) + len = length; + if (len > sge.sge_length) + len = sge.sge_length; + BUG_ON(len == 0); + if (((long) sge.vaddr & (sizeof(u32) - 1)) || + (len != length && (len & (sizeof(u32) - 1)))) { + ndesc = 0; + break; + } + ndesc++; + sge.vaddr += len; + sge.length -= len; + sge.sge_length -= len; + if (sge.sge_length == 0) { + if (--num_sge) + sge = *sg_list++; + } else if (sge.length == 0 && sge.mr->lkey) { + if (++sge.n >= RVT_SEGSZ) { + if (++sge.m >= sge.mr->mapsz) + break; + sge.n = 0; + } + sge.vaddr = + sge.mr->map[sge.m]->segs[sge.n].vaddr; + sge.length = + sge.mr->map[sge.m]->segs[sge.n].length; + } + length -= len; + } + return ndesc; +} + +/* + * Copy from the SGEs to the data buffer. 
+ */ +static void qib_copy_from_sge(void *data, struct rvt_sge_state *ss, u32 length) +{ + struct rvt_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(data, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * qib_qp_rcv - processing an incoming packet on a QP + * @rcd: the context pointer + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from qib_ib_rcv() to process an incoming packet + * for the given QP. + * Called at interrupt level. + */ +static void qib_qp_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr, + int has_grh, void *data, u32 tlen, struct rvt_qp *qp) +{ + struct qib_ibport *ibp = &rcd->ppd->ibport_data; + + spin_lock(&qp->r_lock); + + /* Check for valid receive state. */ + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + goto unlock; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (ib_qib_disable_sma) + break; + /* FALLTHROUGH */ + case IB_QPT_UD: + qib_ud_rcv(ibp, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_RC: + qib_rc_rcv(rcd, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_UC: + qib_uc_rcv(ibp, hdr, has_grh, data, tlen, qp); + break; + + default: + break; + } + +unlock: + spin_unlock(&qp->r_lock); +} + +/** + * qib_ib_rcv - process an incoming packet + * @rcd: the context pointer + * @rhdr: the header of the packet + * @data: the packet payload + * @tlen: the packet length + * + * This is called from qib_kreceive() to process an incoming packet at + * interrupt level. Tlen is the length of the header + data + CRC in bytes. + */ +void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) +{ + struct qib_pportdata *ppd = rcd->ppd; + struct qib_ibport *ibp = &ppd->ibport_data; + struct ib_header *hdr = rhdr; + struct qib_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + struct ib_other_headers *ohdr; + struct rvt_qp *qp; + u32 qp_num; + int lnh; + u8 opcode; + u16 lid; + + /* 24 == LRH+BTH+CRC */ + if (unlikely(tlen < 24)) + goto drop; + + /* Check for a valid destination LID (see ch. 7.11.1). */ + lid = be16_to_cpu(hdr->lrh[1]); + if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) { + lid &= ~((1 << ppd->lmc) - 1); + if (unlikely(lid != ppd->lid)) + goto drop; + } + + /* Check for GRH */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == QIB_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == QIB_LRH_GRH) { + u32 vtf; + + ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else + goto drop; + + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f; +#ifdef CONFIG_DEBUG_FS + rcd->opstats->stats[opcode].n_bytes += tlen; + rcd->opstats->stats[opcode].n_packets++; +#endif + + /* Get the destination QP number. 
*/ + qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + if (qp_num == QIB_MULTICAST_QPN) { + struct rvt_mcast *mcast; + struct rvt_mcast_qp *p; + + if (lnh != QIB_LRH_GRH) + goto drop; + mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid, lid); + if (mcast == NULL) + goto drop; + this_cpu_inc(ibp->pmastats->n_multicast_rcv); + list_for_each_entry_rcu(p, &mcast->qp_list, list) + qib_qp_rcv(rcd, hdr, 1, data, tlen, p->qp); + /* + * Notify rvt_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!qp) { + rcu_read_unlock(); + goto drop; + } + this_cpu_inc(ibp->pmastats->n_unicast_rcv); + qib_qp_rcv(rcd, hdr, lnh == QIB_LRH_GRH, data, tlen, qp); + rcu_read_unlock(); + } + return; + +drop: + ibp->rvp.n_pkt_drops++; +} + +/* + * This is called from a timer to check for QPs + * which need kernel memory in order to send a packet. + */ +static void mem_timer(struct timer_list *t) +{ + struct qib_ibdev *dev = from_timer(dev, t, mem_timer); + struct list_head *list = &dev->memwait; + struct rvt_qp *qp = NULL; + struct qib_qp_priv *priv = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev->rdi.pending_lock, flags); + if (!list_empty(list)) { + priv = list_entry(list->next, struct qib_qp_priv, iowait); + qp = priv->owner; + list_del_init(&priv->iowait); + rvt_get_qp(qp); + if (!list_empty(list)) + mod_timer(&dev->mem_timer, jiffies + 1); + } + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); + + if (qp) { + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & RVT_S_WAIT_KMEM) { + qp->s_flags &= ~RVT_S_WAIT_KMEM; + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + rvt_put_qp(qp); + } +} + +#ifdef __LITTLE_ENDIAN +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data <<= ((sizeof(u32) - n) * BITS_PER_BYTE); + data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#else +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data >>= ((sizeof(u32) - n) * BITS_PER_BYTE); + data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#endif + +static void copy_io(u32 __iomem *piobuf, struct rvt_sge_state *ss, + u32 length, unsigned flush_wc) +{ + u32 extra = 0; + u32 data = 0; + u32 last; + + while (1) { + u32 len = ss->sge.length; + u32 off; + + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + BUG_ON(len == 0); + /* If the source address is not aligned, try to align it. 
*/ + off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1); + if (off) { + u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr & + ~(sizeof(u32) - 1)); + u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE); + u32 y; + + y = sizeof(u32) - off; + if (len > y) + len = y; + if (len + extra >= sizeof(u32)) { + data |= set_upper_bits(v, extra * + BITS_PER_BYTE); + len = sizeof(u32) - extra; + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, len, extra); + if (len == length) { + last = data; + break; + } + extra += len; + } + } else if (extra) { + /* Source address is aligned. */ + u32 *addr = (u32 *) ss->sge.vaddr; + int shift = extra * BITS_PER_BYTE; + int ushift = 32 - shift; + u32 l = len; + + while (l >= sizeof(u32)) { + u32 v = *addr; + + data |= set_upper_bits(v, shift); + __raw_writel(data, piobuf); + data = get_upper_bits(v, ushift); + piobuf++; + addr++; + l -= sizeof(u32); + } + /* + * We still have 'extra' number of bytes leftover. + */ + if (l) { + u32 v = *addr; + + if (l + extra >= sizeof(u32)) { + data |= set_upper_bits(v, shift); + len -= l + extra - sizeof(u32); + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, l, extra); + if (len == length) { + last = data; + break; + } + extra += l; + } + } else if (len == length) { + last = data; + break; + } + } else if (len == length) { + u32 w; + + /* + * Need to round up for the last dword in the + * packet. + */ + w = (len + 3) >> 2; + qib_pio_copy(piobuf, ss->sge.vaddr, w - 1); + piobuf += w - 1; + last = ((u32 *) ss->sge.vaddr)[w - 1]; + break; + } else { + u32 w = len >> 2; + + qib_pio_copy(piobuf, ss->sge.vaddr, w); + piobuf += w; + + extra = len & (sizeof(u32) - 1); + if (extra) { + u32 v = ((u32 *) ss->sge.vaddr)[w]; + + /* Clear unused upper bytes */ + data = clear_upper_bytes(v, extra, 0); + } + } + rvt_update_sge(ss, len, false); + length -= len; + } + /* Update address before sending packet. 
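+ * The final dword is kept in 'last' and written only after
+ * rvt_update_sge() has advanced the SGE state; with write combining it
+ * is flushed on both sides because it acts as the trigger word.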
*/ + rvt_update_sge(ss, length, false); + if (flush_wc) { + /* must flush early everything before trigger word */ + qib_flush_wc(); + __raw_writel(last, piobuf); + /* be sure trigger word is written */ + qib_flush_wc(); + } else + __raw_writel(last, piobuf); +} + +static noinline struct qib_verbs_txreq *__get_txreq(struct qib_ibdev *dev, + struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; + struct qib_verbs_txreq *tx; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + spin_lock(&dev->rdi.pending_lock); + + if (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + + list_del(l); + spin_unlock(&dev->rdi.pending_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + } else { + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK && + list_empty(&priv->iowait)) { + dev->n_txwait++; + qp->s_flags |= RVT_S_WAIT_TX; + list_add_tail(&priv->iowait, &dev->txwait); + } + qp->s_flags &= ~RVT_S_BUSY; + spin_unlock(&dev->rdi.pending_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + tx = ERR_PTR(-EBUSY); + } + return tx; +} + +static inline struct qib_verbs_txreq *get_txreq(struct qib_ibdev *dev, + struct rvt_qp *qp) +{ + struct qib_verbs_txreq *tx; + unsigned long flags; + + spin_lock_irqsave(&dev->rdi.pending_lock, flags); + /* assume the list non empty */ + if (likely(!list_empty(&dev->txreq_free))) { + struct list_head *l = dev->txreq_free.next; + + list_del(l); + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + } else { + /* call slow path to get the extra lock */ + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); + tx = __get_txreq(dev, qp); + } + return tx; +} + +void qib_put_txreq(struct qib_verbs_txreq *tx) +{ + struct qib_ibdev *dev; + struct rvt_qp *qp; + struct qib_qp_priv *priv; + unsigned long flags; + + qp = tx->qp; + dev = to_idev(qp->ibqp.device); + + if (tx->mr) { + rvt_put_mr(tx->mr); + tx->mr = NULL; + } + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF) { + tx->txreq.flags &= ~QIB_SDMA_TXREQ_F_FREEBUF; + dma_unmap_single(&dd_from_dev(dev)->pcidev->dev, + tx->txreq.addr, tx->hdr_dwords << 2, + DMA_TO_DEVICE); + kfree(tx->align_buf); + } + + spin_lock_irqsave(&dev->rdi.pending_lock, flags); + + /* Put struct back on free list */ + list_add(&tx->txreq.list, &dev->txreq_free); + + if (!list_empty(&dev->txwait)) { + /* Wake up first QP wanting a free struct */ + priv = list_entry(dev->txwait.next, struct qib_qp_priv, + iowait); + qp = priv->owner; + list_del_init(&priv->iowait); + rvt_get_qp(qp); + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & RVT_S_WAIT_TX) { + qp->s_flags &= ~RVT_S_WAIT_TX; + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + rvt_put_qp(qp); + } else + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); +} + +/* + * This is called when there are send DMA descriptors that might be + * available. + * + * This is called with ppd->sdma_lock held. + */ +void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail) +{ + struct rvt_qp *qp; + struct qib_qp_priv *qpp, *nqpp; + struct rvt_qp *qps[20]; + struct qib_ibdev *dev; + unsigned i, n; + + n = 0; + dev = &ppd->dd->verbs_dev; + spin_lock(&dev->rdi.pending_lock); + + /* Search wait list for first QP wanting DMA descriptors. 
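+ * Entries for other ports are skipped.  QPs are collected in FIFO
+ * order until the qps[] batch is full or the next request needs more
+ * descriptors than remain; each QP is referenced with rvt_get_qp()
+ * before pending_lock is dropped and is then woken under its own
+ * s_lock.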
*/ + list_for_each_entry_safe(qpp, nqpp, &dev->dmawait, iowait) { + qp = qpp->owner; + if (qp->port_num != ppd->port) + continue; + if (n == ARRAY_SIZE(qps)) + break; + if (qpp->s_tx->txreq.sg_count > avail) + break; + avail -= qpp->s_tx->txreq.sg_count; + list_del_init(&qpp->iowait); + rvt_get_qp(qp); + qps[n++] = qp; + } + + spin_unlock(&dev->rdi.pending_lock); + + for (i = 0; i < n; i++) { + qp = qps[i]; + spin_lock(&qp->s_lock); + if (qp->s_flags & RVT_S_WAIT_DMA_DESC) { + qp->s_flags &= ~RVT_S_WAIT_DMA_DESC; + qib_schedule_send(qp); + } + spin_unlock(&qp->s_lock); + rvt_put_qp(qp); + } +} + +/* + * This is called with ppd->sdma_lock held. + */ +static void sdma_complete(struct qib_sdma_txreq *cookie, int status) +{ + struct qib_verbs_txreq *tx = + container_of(cookie, struct qib_verbs_txreq, txreq); + struct rvt_qp *qp = tx->qp; + struct qib_qp_priv *priv = qp->priv; + + spin_lock(&qp->s_lock); + if (tx->wqe) + qib_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + else if (qp->ibqp.qp_type == IB_QPT_RC) { + struct ib_header *hdr; + + if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF) + hdr = &tx->align_buf->hdr; + else { + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + + hdr = &dev->pio_hdrs[tx->hdr_inx].hdr; + } + qib_rc_send_complete(qp, hdr); + } + if (atomic_dec_and_test(&priv->s_dma_busy)) { + if (qp->state == IB_QPS_RESET) + wake_up(&priv->wait_dma); + else if (qp->s_flags & RVT_S_WAIT_DMA) { + qp->s_flags &= ~RVT_S_WAIT_DMA; + qib_schedule_send(qp); + } + } + spin_unlock(&qp->s_lock); + + qib_put_txreq(tx); +} + +static int wait_kmem(struct qib_ibdev *dev, struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + spin_lock(&dev->rdi.pending_lock); + if (list_empty(&priv->iowait)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= RVT_S_WAIT_KMEM; + list_add_tail(&priv->iowait, &dev->memwait); + } + spin_unlock(&dev->rdi.pending_lock); + qp->s_flags &= ~RVT_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + return ret; +} + +static int qib_verbs_send_dma(struct rvt_qp *qp, struct ib_header *hdr, + u32 hdrwords, struct rvt_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct qib_qp_priv *priv = qp->priv; + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct qib_devdata *dd = dd_from_dev(dev); + struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_verbs_txreq *tx; + struct qib_pio_header *phdr; + u32 control; + u32 ndesc; + int ret; + + tx = priv->s_tx; + if (tx) { + priv->s_tx = NULL; + /* resend previously constructed packet */ + ret = qib_sdma_verbs_send(ppd, tx->ss, tx->dwords, tx); + goto bail; + } + + tx = get_txreq(dev, qp); + if (IS_ERR(tx)) + goto bail_tx; + + control = dd->f_setpbc_control(ppd, plen, qp->s_srate, + be16_to_cpu(hdr->lrh[0]) >> 12); + tx->qp = qp; + tx->wqe = qp->s_wqe; + tx->mr = qp->s_rdma_mr; + if (qp->s_rdma_mr) + qp->s_rdma_mr = NULL; + tx->txreq.callback = sdma_complete; + if (dd->flags & QIB_HAS_SDMA_TIMEOUT) + tx->txreq.flags = QIB_SDMA_TXREQ_F_HEADTOHOST; + else + tx->txreq.flags = QIB_SDMA_TXREQ_F_INTREQ; + if (plen + 1 > dd->piosize2kmax_dwords) + tx->txreq.flags |= QIB_SDMA_TXREQ_F_USELARGEBUF; + + if (len) { + /* + * Don't try to DMA if it takes more descriptors than + * the queue holds. 
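+ * Setting ndesc to 0 selects the fallback below: the header and
+ * payload are copied into a kmalloc'ed bounce buffer, mapped with
+ * dma_map_single(), and queued as a single descriptor (sg_count = 1)
+ * with QIB_SDMA_TXREQ_F_FREEBUF set so the buffer is released when the
+ * txreq is returned.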
+ */ + ndesc = qib_count_sge(ss, len); + if (ndesc >= ppd->sdma_descq_cnt) + ndesc = 0; + } else + ndesc = 1; + if (ndesc) { + phdr = &dev->pio_hdrs[tx->hdr_inx]; + phdr->pbc[0] = cpu_to_le32(plen); + phdr->pbc[1] = cpu_to_le32(control); + memcpy(&phdr->hdr, hdr, hdrwords << 2); + tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEDESC; + tx->txreq.sg_count = ndesc; + tx->txreq.addr = dev->pio_hdrs_phys + + tx->hdr_inx * sizeof(struct qib_pio_header); + tx->hdr_dwords = hdrwords + 2; /* add PBC length */ + ret = qib_sdma_verbs_send(ppd, ss, dwords, tx); + goto bail; + } + + /* Allocate a buffer and copy the header and payload to it. */ + tx->hdr_dwords = plen + 1; + phdr = kmalloc(tx->hdr_dwords << 2, GFP_ATOMIC); + if (!phdr) + goto err_tx; + phdr->pbc[0] = cpu_to_le32(plen); + phdr->pbc[1] = cpu_to_le32(control); + memcpy(&phdr->hdr, hdr, hdrwords << 2); + qib_copy_from_sge((u32 *) &phdr->hdr + hdrwords, ss, len); + + tx->txreq.addr = dma_map_single(&dd->pcidev->dev, phdr, + tx->hdr_dwords << 2, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, tx->txreq.addr)) + goto map_err; + tx->align_buf = phdr; + tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEBUF; + tx->txreq.sg_count = 1; + ret = qib_sdma_verbs_send(ppd, NULL, 0, tx); + goto unaligned; + +map_err: + kfree(phdr); +err_tx: + qib_put_txreq(tx); + ret = wait_kmem(dev, qp); +unaligned: + ibp->rvp.n_unaligned++; +bail: + return ret; +bail_tx: + ret = PTR_ERR(tx); + goto bail; +} + +/* + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int no_bufs_available(struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; + struct qib_ibdev *dev = to_idev(qp->ibqp.device); + struct qib_devdata *dd; + unsigned long flags; + int ret = 0; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, qib_ib_piobufavail() + * could be called. Therefore, put QP on the I/O wait list before + * enabling the PIO avail interrupt. + */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + spin_lock(&dev->rdi.pending_lock); + if (list_empty(&priv->iowait)) { + dev->n_piowait++; + qp->s_flags |= RVT_S_WAIT_PIO; + list_add_tail(&priv->iowait, &dev->piowait); + dd = dd_from_dev(dev); + dd->f_wantpiobuf_intr(dd, 1); + } + spin_unlock(&dev->rdi.pending_lock); + qp->s_flags &= ~RVT_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +static int qib_verbs_send_pio(struct rvt_qp *qp, struct ib_header *ibhdr, + u32 hdrwords, struct rvt_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct qib_pportdata *ppd = dd->pport + qp->port_num - 1; + u32 *hdr = (u32 *) ibhdr; + u32 __iomem *piobuf_orig; + u32 __iomem *piobuf; + u64 pbc; + unsigned long flags; + unsigned flush_wc; + u32 control; + u32 pbufn; + + control = dd->f_setpbc_control(ppd, plen, qp->s_srate, + be16_to_cpu(ibhdr->lrh[0]) >> 12); + pbc = ((u64) control << 32) | plen; + piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn); + if (unlikely(piobuf == NULL)) + return no_bufs_available(qp); + + /* + * Write the pbc. + * We have to flush after the PBC for correctness on some cpus + * or WC buffer can be written out of order. 
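+ * The 64-bit PBC fills the first two dwords of the send buffer, so the
+ * header copy starts at piobuf + 2.  The extra qib_flush_wc() barriers
+ * around the header copy and trigger word are used only when the chip
+ * sets QIB_PIO_FLUSH_WC.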
+ */ + writeq(pbc, piobuf); + piobuf_orig = piobuf; + piobuf += 2; + + flush_wc = dd->flags & QIB_PIO_FLUSH_WC; + if (len == 0) { + /* + * If there is just the header portion, must flush before + * writing last word of header for correctness, and after + * the last header word (trigger word). + */ + if (flush_wc) { + qib_flush_wc(); + qib_pio_copy(piobuf, hdr, hdrwords - 1); + qib_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); + qib_flush_wc(); + } else + qib_pio_copy(piobuf, hdr, hdrwords); + goto done; + } + + if (flush_wc) + qib_flush_wc(); + qib_pio_copy(piobuf, hdr, hdrwords); + piobuf += hdrwords; + + /* The common case is aligned and contained in one segment. */ + if (likely(ss->num_sge == 1 && len <= ss->sge.length && + !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) { + u32 *addr = (u32 *) ss->sge.vaddr; + + /* Update address before sending packet. */ + rvt_update_sge(ss, len, false); + if (flush_wc) { + qib_pio_copy(piobuf, addr, dwords - 1); + /* must flush early everything before trigger word */ + qib_flush_wc(); + __raw_writel(addr[dwords - 1], piobuf + dwords - 1); + /* be sure trigger word is written */ + qib_flush_wc(); + } else + qib_pio_copy(piobuf, addr, dwords); + goto done; + } + copy_io(piobuf, ss, len, flush_wc); +done: + if (dd->flags & QIB_USE_SPCL_TRIG) { + u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023; + + qib_flush_wc(); + __raw_writel(0xaebecede, piobuf_orig + spcl_off); + } + qib_sendbuf_done(dd, pbufn); + if (qp->s_rdma_mr) { + rvt_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + qib_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + spin_lock_irqsave(&qp->s_lock, flags); + qib_rc_send_complete(qp, ibhdr); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + return 0; +} + +/** + * qib_verbs_send - send a packet + * @qp: the QP to send on + * @hdr: the packet header + * @hdrwords: the number of 32-bit words in the header + * @ss: the SGE to send + * @len: the length of the packet in bytes + * + * Return zero if packet is sent or queued OK. + * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise. + */ +int qib_verbs_send(struct rvt_qp *qp, struct ib_header *hdr, + u32 hdrwords, struct rvt_sge_state *ss, u32 len) +{ + struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); + u32 plen; + int ret; + u32 dwords = (len + 3) >> 2; + + /* + * Calculate the send buffer trigger address. + * The +1 counts for the pbc control dword following the pbc length. + */ + plen = hdrwords + dwords + 1; + + /* + * VL15 packets (IB_QPT_SMI) will always use PIO, so we + * can defer SDMA restart until link goes ACTIVE without + * worrying about just how we got there. + */ + if (qp->ibqp.qp_type == IB_QPT_SMI || + !(dd->flags & QIB_HAS_SEND_DMA)) + ret = qib_verbs_send_pio(qp, hdr, hdrwords, ss, len, + plen, dwords); + else + ret = qib_verbs_send_dma(qp, hdr, hdrwords, ss, len, + plen, dwords); + + return ret; +} + +int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait) +{ + int ret; + struct qib_devdata *dd = ppd->dd; + + if (!(dd->flags & QIB_PRESENT)) { + /* no hardware, freeze, etc. 
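+ * Bail out with -EINVAL rather than calling dd->f_portcntr() while the
+ * chip is absent or frozen.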
*/ + ret = -EINVAL; + goto bail; + } + *swords = dd->f_portcntr(ppd, QIBPORTCNTR_WORDSEND); + *rwords = dd->f_portcntr(ppd, QIBPORTCNTR_WORDRCV); + *spkts = dd->f_portcntr(ppd, QIBPORTCNTR_PKTSEND); + *rpkts = dd->f_portcntr(ppd, QIBPORTCNTR_PKTRCV); + *xmit_wait = dd->f_portcntr(ppd, QIBPORTCNTR_SENDSTALL); + + ret = 0; + +bail: + return ret; +} + +/** + * qib_get_counters - get various chip counters + * @dd: the qlogic_ib device + * @cntrs: counters are placed here + * + * Return the counters needed by recv_pma_get_portcounters(). + */ +int qib_get_counters(struct qib_pportdata *ppd, + struct qib_verbs_counters *cntrs) +{ + int ret; + + if (!(ppd->dd->flags & QIB_PRESENT)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + cntrs->symbol_error_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBSYMBOLERR); + cntrs->link_error_recovery_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBLINKERRRECOV); + /* + * The link downed counter counts when the other side downs the + * connection. We add in the number of times we downed the link + * due to local link integrity errors to compensate. + */ + cntrs->link_downed_counter = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBLINKDOWN); + cntrs->port_rcv_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXDROPPKT) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RCVOVFL) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERR_RLEN) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_INVALIDRLEN) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRLINK) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRICRC) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRVCRC) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRLPCRC) + + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_BADFORMAT); + cntrs->port_rcv_errors += + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXLOCALPHYERR); + cntrs->port_rcv_errors += + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXVLERR); + cntrs->port_rcv_remphys_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RCVEBP); + cntrs->port_xmit_discards = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_UNSUPVL); + cntrs->port_xmit_data = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_WORDSEND); + cntrs->port_rcv_data = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_WORDRCV); + cntrs->port_xmit_packets = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_PKTSEND); + cntrs->port_rcv_packets = ppd->dd->f_portcntr(ppd, + QIBPORTCNTR_PKTRCV); + cntrs->local_link_integrity_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_LLI); + cntrs->excessive_buffer_overrun_errors = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_EXCESSBUFOVFL); + cntrs->vl15_dropped = + ppd->dd->f_portcntr(ppd, QIBPORTCNTR_VL15PKTDROP); + + ret = 0; + +bail: + return ret; +} + +/** + * qib_ib_piobufavail - callback when a PIO buffer is available + * @dd: the device pointer + * + * This is called from qib_intr() at interrupt level when a PIO buffer is + * available after qib_verbs_send() returned an error that no buffers were + * available. Disable the interrupt if there are no more QPs waiting. + */ +void qib_ib_piobufavail(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + struct list_head *list; + struct rvt_qp *qps[5]; + struct rvt_qp *qp; + unsigned long flags; + unsigned i, n; + struct qib_qp_priv *priv; + + list = &dev->piowait; + n = 0; + + /* + * Note: checking that the piowait list is empty and clearing + * the buffer available interrupt needs to be atomic or we + * could end up with QPs on the wait list with the interrupt + * disabled. 
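+ * Both steps are therefore done under rdi.pending_lock below.  If more
+ * than ARRAY_SIZE(qps) QPs are waiting, the interrupt is left enabled
+ * and the remainder are handled on a later callback.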
+ */ + spin_lock_irqsave(&dev->rdi.pending_lock, flags); + while (!list_empty(list)) { + if (n == ARRAY_SIZE(qps)) + goto full; + priv = list_entry(list->next, struct qib_qp_priv, iowait); + qp = priv->owner; + list_del_init(&priv->iowait); + rvt_get_qp(qp); + qps[n++] = qp; + } + dd->f_wantpiobuf_intr(dd, 0); +full: + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); + + for (i = 0; i < n; i++) { + qp = qps[i]; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & RVT_S_WAIT_PIO) { + qp->s_flags &= ~RVT_S_WAIT_PIO; + qib_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify qib_destroy_qp() if it is waiting. */ + rvt_put_qp(qp); + } +} + +static int qib_query_port(struct rvt_dev_info *rdi, u8 port_num, + struct ib_port_attr *props) +{ + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = dd_from_dev(ibdev); + struct qib_pportdata *ppd = &dd->pport[port_num - 1]; + enum ib_mtu mtu; + u16 lid = ppd->lid; + + /* props being zeroed by the caller, avoid zeroing it here */ + props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE); + props->lmc = ppd->lmc; + props->state = dd->f_iblink_state(ppd->lastibcstat); + props->phys_state = dd->f_ibphys_portstate(ppd->lastibcstat); + props->gid_tbl_len = QIB_GUIDS_PER_PORT; + props->active_width = ppd->link_width_active; + /* See rate_show() */ + props->active_speed = ppd->link_speed_active; + props->max_vl_num = qib_num_vls(ppd->vls_supported); + + props->max_mtu = qib_ibmtu ? qib_ibmtu : IB_MTU_4096; + switch (ppd->ibmtu) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: + mtu = IB_MTU_2048; + } + props->active_mtu = mtu; + + return 0; +} + +static int qib_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + struct qib_devdata *dd = dd_from_ibdev(device); + unsigned i; + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(device->node_desc, device_modify->node_desc, + IB_DEVICE_NODE_DESC_MAX); + for (i = 0; i < dd->num_pports; i++) { + struct qib_ibport *ibp = &dd->pport[i].ibport_data; + + qib_node_desc_chg(ibp); + } + } + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { + ib_qib_sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + for (i = 0; i < dd->num_pports; i++) { + struct qib_ibport *ibp = &dd->pport[i].ibport_data; + + qib_sys_guid_chg(ibp); + } + } + + ret = 0; + +bail: + return ret; +} + +static int qib_shut_down_port(struct rvt_dev_info *rdi, u8 port_num) +{ + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = dd_from_dev(ibdev); + struct qib_pportdata *ppd = &dd->pport[port_num - 1]; + + qib_set_linkstate(ppd, QIB_IB_LINKDOWN); + + return 0; +} + +static int qib_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, + int guid_index, __be64 *guid) +{ + struct qib_ibport *ibp = container_of(rvp, struct qib_ibport, rvp); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + + if (guid_index == 0) + *guid = ppd->guid; + else if (guid_index < QIB_GUIDS_PER_PORT) + *guid = ibp->guids[guid_index - 1]; + else + return -EINVAL; + + return 0; +} + +int qib_check_ah(struct ib_device *ibdev, struct 
rdma_ah_attr *ah_attr) +{ + if (rdma_ah_get_sl(ah_attr) > 15) + return -EINVAL; + + if (rdma_ah_get_dlid(ah_attr) == 0) + return -EINVAL; + if (rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE) && + rdma_ah_get_dlid(ah_attr) != + be16_to_cpu(IB_LID_PERMISSIVE) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + + return 0; +} + +static void qib_notify_new_ah(struct ib_device *ibdev, + struct rdma_ah_attr *ah_attr, + struct rvt_ah *ah) +{ + struct qib_ibport *ibp; + struct qib_pportdata *ppd; + + /* + * Do not trust reading anything from rvt_ah at this point as it is not + * done being setup. We can however modify things which we need to set. + */ + + ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); + ppd = ppd_from_ibp(ibp); + ah->vl = ibp->sl_to_vl[rdma_ah_get_sl(&ah->attr)]; + ah->log_pmtu = ilog2(ppd->ibmtu); +} + +struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid) +{ + struct rdma_ah_attr attr; + struct ib_ah *ah = ERR_PTR(-EINVAL); + struct rvt_qp *qp0; + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = dd_from_ppd(ppd); + u8 port_num = ppd->port; + + memset(&attr, 0, sizeof(attr)); + attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num); + rdma_ah_set_dlid(&attr, dlid); + rdma_ah_set_port_num(&attr, port_num); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ah = rdma_create_ah(qp0->ibqp.pd, &attr); + rcu_read_unlock(); + return ah; +} + +/** + * qib_get_npkeys - return the size of the PKEY table for context 0 + * @dd: the qlogic_ib device + */ +unsigned qib_get_npkeys(struct qib_devdata *dd) +{ + return ARRAY_SIZE(dd->rcd[0]->pkeys); +} + +/* + * Return the indexed PKEY from the port PKEY table. + * No need to validate rcd[ctxt]; the port is setup if we are here. + */ +unsigned qib_get_pkey(struct qib_ibport *ibp, unsigned index) +{ + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = ppd->dd; + unsigned ctxt = ppd->hw_pidx; + unsigned ret; + + /* dd->rcd null if mini_init or some init failures */ + if (!dd->rcd || index >= ARRAY_SIZE(dd->rcd[ctxt]->pkeys)) + ret = 0; + else + ret = dd->rcd[ctxt]->pkeys[index]; + + return ret; +} + +static void init_ibport(struct qib_pportdata *ppd) +{ + struct qib_verbs_counters cntrs; + struct qib_ibport *ibp = &ppd->ibport_data; + + spin_lock_init(&ibp->rvp.lock); + /* Set the prefix to the default value (see ch. 4.1.1) */ + ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; + ibp->rvp.sm_lid = be16_to_cpu(IB_LID_PERMISSIVE); + ibp->rvp.port_cap_flags = IB_PORT_SYS_IMAGE_GUID_SUP | + IB_PORT_CLIENT_REG_SUP | IB_PORT_SL_MAP_SUP | + IB_PORT_TRAP_SUP | IB_PORT_AUTO_MIGR_SUP | + IB_PORT_DR_NOTICE_SUP | IB_PORT_CAP_MASK_NOTICE_SUP | + IB_PORT_OTHER_LOCAL_CHANGES_SUP; + if (ppd->dd->flags & QIB_HAS_LINK_LATENCY) + ibp->rvp.port_cap_flags |= IB_PORT_LINK_LATENCY_SUP; + ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + /* Snapshot current HW counters to "clear" them. 
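+ * The z_* fields hold these baseline values; the PMA code reports
+ * counters relative to the baseline, so the hardware counters
+ * themselves are never reset.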
*/ + qib_get_counters(ppd, &cntrs); + ibp->z_symbol_error_counter = cntrs.symbol_error_counter; + ibp->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + ibp->z_link_downed_counter = cntrs.link_downed_counter; + ibp->z_port_rcv_errors = cntrs.port_rcv_errors; + ibp->z_port_rcv_remphys_errors = cntrs.port_rcv_remphys_errors; + ibp->z_port_xmit_discards = cntrs.port_xmit_discards; + ibp->z_port_xmit_data = cntrs.port_xmit_data; + ibp->z_port_rcv_data = cntrs.port_rcv_data; + ibp->z_port_xmit_packets = cntrs.port_xmit_packets; + ibp->z_port_rcv_packets = cntrs.port_rcv_packets; + ibp->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + ibp->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + ibp->z_vl15_dropped = cntrs.vl15_dropped; + RCU_INIT_POINTER(ibp->rvp.qp[0], NULL); + RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); +} + +/** + * qib_fill_device_attr - Fill in rvt dev info device attributes. + * @dd: the device data structure + */ +static void qib_fill_device_attr(struct qib_devdata *dd) +{ + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + + memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props)); + + rdi->dparms.props.max_pd = ib_qib_max_pds; + rdi->dparms.props.max_ah = ib_qib_max_ahs; + rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + rdi->dparms.props.page_size_cap = PAGE_SIZE; + rdi->dparms.props.vendor_id = + QIB_SRC_OUI_1 << 16 | QIB_SRC_OUI_2 << 8 | QIB_SRC_OUI_3; + rdi->dparms.props.vendor_part_id = dd->deviceid; + rdi->dparms.props.hw_ver = dd->minrev; + rdi->dparms.props.sys_image_guid = ib_qib_sys_image_guid; + rdi->dparms.props.max_mr_size = ~0ULL; + rdi->dparms.props.max_qp = ib_qib_max_qps; + rdi->dparms.props.max_qp_wr = ib_qib_max_qp_wrs; + rdi->dparms.props.max_sge = ib_qib_max_sges; + rdi->dparms.props.max_sge_rd = ib_qib_max_sges; + rdi->dparms.props.max_cq = ib_qib_max_cqs; + rdi->dparms.props.max_cqe = ib_qib_max_cqes; + rdi->dparms.props.max_ah = ib_qib_max_ahs; + rdi->dparms.props.max_mr = rdi->lkey_table.max; + rdi->dparms.props.max_fmr = rdi->lkey_table.max; + rdi->dparms.props.max_map_per_fmr = 32767; + rdi->dparms.props.max_qp_rd_atom = QIB_MAX_RDMA_ATOMIC; + rdi->dparms.props.max_qp_init_rd_atom = 255; + rdi->dparms.props.max_srq = ib_qib_max_srqs; + rdi->dparms.props.max_srq_wr = ib_qib_max_srq_wrs; + rdi->dparms.props.max_srq_sge = ib_qib_max_srq_sges; + rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB; + rdi->dparms.props.max_pkeys = qib_get_npkeys(dd); + rdi->dparms.props.max_mcast_grp = ib_qib_max_mcast_grps; + rdi->dparms.props.max_mcast_qp_attach = ib_qib_max_mcast_qp_attached; + rdi->dparms.props.max_total_mcast_qp_attach = + rdi->dparms.props.max_mcast_qp_attach * + rdi->dparms.props.max_mcast_grp; + /* post send table */ + dd->verbs_dev.rdi.post_parms = qib_post_parms; +} + +/** + * qib_register_ib_device - register our device with the infiniband core + * @dd: the device data structure + * Return the allocated qib_ibdev pointer or NULL on error. 
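+ * (As written, the function actually returns 0 on success or a
+ * negative errno rather than a pointer.)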
+ */ +int qib_register_ib_device(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + struct ib_device *ibdev = &dev->rdi.ibdev; + struct qib_pportdata *ppd = dd->pport; + unsigned i, ctxt; + int ret; + + get_random_bytes(&dev->qp_rnd, sizeof(dev->qp_rnd)); + for (i = 0; i < dd->num_pports; i++) + init_ibport(ppd + i); + + /* Only need to initialize non-zero fields. */ + timer_setup(&dev->mem_timer, mem_timer, 0); + + INIT_LIST_HEAD(&dev->piowait); + INIT_LIST_HEAD(&dev->dmawait); + INIT_LIST_HEAD(&dev->txwait); + INIT_LIST_HEAD(&dev->memwait); + INIT_LIST_HEAD(&dev->txreq_free); + + if (ppd->sdma_descq_cnt) { + dev->pio_hdrs = dma_alloc_coherent(&dd->pcidev->dev, + ppd->sdma_descq_cnt * + sizeof(struct qib_pio_header), + &dev->pio_hdrs_phys, + GFP_KERNEL); + if (!dev->pio_hdrs) { + ret = -ENOMEM; + goto err_hdrs; + } + } + + for (i = 0; i < ppd->sdma_descq_cnt; i++) { + struct qib_verbs_txreq *tx; + + tx = kzalloc(sizeof(*tx), GFP_KERNEL); + if (!tx) { + ret = -ENOMEM; + goto err_tx; + } + tx->hdr_inx = i; + list_add(&tx->txreq.list, &dev->txreq_free); + } + + /* + * The system image GUID is supposed to be the same for all + * IB HCAs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. + */ + if (!ib_qib_sys_image_guid) + ib_qib_sys_image_guid = ppd->guid; + + ibdev->owner = THIS_MODULE; + ibdev->node_guid = ppd->guid; + ibdev->phys_port_cnt = dd->num_pports; + ibdev->dev.parent = &dd->pcidev->dev; + ibdev->modify_device = qib_modify_device; + ibdev->process_mad = qib_process_mad; + + snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), + "Intel Infiniband HCA %s", init_utsname()->nodename); + + /* + * Fill in rvt info object. + */ + dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files; + dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev; + dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah; + dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe; + dd->verbs_dev.rdi.driver_f.notify_new_ah = qib_notify_new_ah; + dd->verbs_dev.rdi.driver_f.alloc_qpn = qib_alloc_qpn; + dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qib_qp_priv_alloc; + dd->verbs_dev.rdi.driver_f.qp_priv_free = qib_qp_priv_free; + dd->verbs_dev.rdi.driver_f.free_all_qps = qib_free_all_qps; + dd->verbs_dev.rdi.driver_f.notify_qp_reset = qib_notify_qp_reset; + dd->verbs_dev.rdi.driver_f.do_send = qib_do_send; + dd->verbs_dev.rdi.driver_f.schedule_send = qib_schedule_send; + dd->verbs_dev.rdi.driver_f.quiesce_qp = qib_quiesce_qp; + dd->verbs_dev.rdi.driver_f.stop_send_queue = qib_stop_send_queue; + dd->verbs_dev.rdi.driver_f.flush_qp_waiters = qib_flush_qp_waiters; + dd->verbs_dev.rdi.driver_f.notify_error_qp = qib_notify_error_qp; + dd->verbs_dev.rdi.driver_f.notify_restart_rc = qib_restart_rc; + dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = qib_mtu_to_path_mtu; + dd->verbs_dev.rdi.driver_f.mtu_from_qp = qib_mtu_from_qp; + dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = qib_get_pmtu_from_attr; + dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _qib_schedule_send; + dd->verbs_dev.rdi.driver_f.query_port_state = qib_query_port; + dd->verbs_dev.rdi.driver_f.shut_down_port = qib_shut_down_port; + dd->verbs_dev.rdi.driver_f.cap_mask_chg = qib_cap_mask_chg; + dd->verbs_dev.rdi.driver_f.notify_create_mad_agent = + qib_notify_create_mad_agent; + dd->verbs_dev.rdi.driver_f.notify_free_mad_agent = + qib_notify_free_mad_agent; + + dd->verbs_dev.rdi.dparms.max_rdma_atomic = QIB_MAX_RDMA_ATOMIC; + dd->verbs_dev.rdi.driver_f.get_guid_be = 
qib_get_guid_be; + dd->verbs_dev.rdi.dparms.lkey_table_size = qib_lkey_table_size; + dd->verbs_dev.rdi.dparms.qp_table_size = ib_qib_qp_table_size; + dd->verbs_dev.rdi.dparms.qpn_start = 1; + dd->verbs_dev.rdi.dparms.qpn_res_start = QIB_KD_QP; + dd->verbs_dev.rdi.dparms.qpn_res_end = QIB_KD_QP; /* Reserve one QP */ + dd->verbs_dev.rdi.dparms.qpn_inc = 1; + dd->verbs_dev.rdi.dparms.qos_shift = 1; + dd->verbs_dev.rdi.dparms.psn_mask = QIB_PSN_MASK; + dd->verbs_dev.rdi.dparms.psn_shift = QIB_PSN_SHIFT; + dd->verbs_dev.rdi.dparms.psn_modify_mask = QIB_PSN_MASK; + dd->verbs_dev.rdi.dparms.nports = dd->num_pports; + dd->verbs_dev.rdi.dparms.npkeys = qib_get_npkeys(dd); + dd->verbs_dev.rdi.dparms.node = dd->assigned_node_id; + dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB; + dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE; + + snprintf(dd->verbs_dev.rdi.dparms.cq_name, + sizeof(dd->verbs_dev.rdi.dparms.cq_name), + "qib_cq%d", dd->unit); + + qib_fill_device_attr(dd); + + ppd = dd->pport; + for (i = 0; i < dd->num_pports; i++, ppd++) { + ctxt = ppd->hw_pidx; + rvt_init_port(&dd->verbs_dev.rdi, + &ppd->ibport_data.rvp, + i, + dd->rcd[ctxt]->pkeys); + } + + ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_QIB); + if (ret) + goto err_tx; + + ret = qib_verbs_register_sysfs(dd); + if (ret) + goto err_class; + + return ret; + +err_class: + rvt_unregister_device(&dd->verbs_dev.rdi); +err_tx: + while (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + struct qib_verbs_txreq *tx; + + list_del(l); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + kfree(tx); + } + if (ppd->sdma_descq_cnt) + dma_free_coherent(&dd->pcidev->dev, + ppd->sdma_descq_cnt * + sizeof(struct qib_pio_header), + dev->pio_hdrs, dev->pio_hdrs_phys); +err_hdrs: + qib_dev_err(dd, "cannot register verbs: %d!\n", -ret); + return ret; +} + +void qib_unregister_ib_device(struct qib_devdata *dd) +{ + struct qib_ibdev *dev = &dd->verbs_dev; + + qib_verbs_unregister_sysfs(dd); + + rvt_unregister_device(&dd->verbs_dev.rdi); + + if (!list_empty(&dev->piowait)) + qib_dev_err(dd, "piowait list not empty!\n"); + if (!list_empty(&dev->dmawait)) + qib_dev_err(dd, "dmawait list not empty!\n"); + if (!list_empty(&dev->txwait)) + qib_dev_err(dd, "txwait list not empty!\n"); + if (!list_empty(&dev->memwait)) + qib_dev_err(dd, "memwait list not empty!\n"); + + del_timer_sync(&dev->mem_timer); + while (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + struct qib_verbs_txreq *tx; + + list_del(l); + tx = list_entry(l, struct qib_verbs_txreq, txreq.list); + kfree(tx); + } + if (dd->pport->sdma_descq_cnt) + dma_free_coherent(&dd->pcidev->dev, + dd->pport->sdma_descq_cnt * + sizeof(struct qib_pio_header), + dev->pio_hdrs, dev->pio_hdrs_phys); +} + +/** + * _qib_schedule_send - schedule progress + * @qp - the qp + * + * This schedules progress w/o regard to the s_flags. + * + * It is only used in post send, which doesn't hold + * the s_lock. + */ +void _qib_schedule_send(struct rvt_qp *qp) +{ + struct qib_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_qp_priv *priv = qp->priv; + + queue_work(ppd->qib_wq, &priv->s_work); +} + +/** + * qib_schedule_send - schedule progress + * @qp - the qp + * + * This schedules qp progress. The s_lock + * should be held. 
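+ * qib_send_ok() inspects qp->s_flags, which s_lock protects; when it
+ * allows progress, the work item is queued on ppd->qib_wq via
+ * _qib_schedule_send().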
+ */ +void qib_schedule_send(struct rvt_qp *qp) +{ + if (qib_send_ok(qp)) + _qib_schedule_send(qp); +} diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h new file mode 100644 index 0000000..f887737 --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef QIB_VERBS_H +#define QIB_VERBS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct qib_ctxtdata; +struct qib_pportdata; +struct qib_devdata; +struct qib_verbs_txreq; + +#define QIB_MAX_RDMA_ATOMIC 16 +#define QIB_GUIDS_PER_PORT 5 +#define QIB_PSN_SHIFT 8 + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define QIB_UVERBS_ABI_VERSION 2 + +/* IB Performance Manager status values */ +#define IB_PMA_SAMPLE_STATUS_DONE 0x00 +#define IB_PMA_SAMPLE_STATUS_STARTED 0x01 +#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02 + +/* Mandatory IB performance counter select values. 
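+ * The values are kept in big-endian (cpu_to_be16) to match the MAD
+ * wire format; init_ibport() preloads them into rvp.pma_counter_select[].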
*/ +#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003) +#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004) +#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005) + +#define QIB_VENDOR_IPG cpu_to_be16(0xFFA0) + +/* XXX Should be defined in ib_verbs.h enum ib_port_cap_flags */ +#define IB_PORT_OTHER_LOCAL_CHANGES_SUP (1 << 26) + +#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) + +/* Values for set/get portinfo VLCap OperationalVLs */ +#define IB_VL_VL0 1 +#define IB_VL_VL0_1 2 +#define IB_VL_VL0_3 3 +#define IB_VL_VL0_7 4 +#define IB_VL_VL0_14 5 + +static inline int qib_num_vls(int vls) +{ + switch (vls) { + default: + case IB_VL_VL0: + return 1; + case IB_VL_VL0_1: + return 2; + case IB_VL_VL0_3: + return 4; + case IB_VL_VL0_7: + return 8; + case IB_VL_VL0_14: + return 15; + } +} + +struct qib_pio_header { + __le32 pbc[2]; + struct ib_header hdr; +} __packed; + +/* + * qib specific data structure that will be hidden from rvt after the queue pair + * is made common. + */ +struct qib_qp_priv { + struct ib_header *s_hdr; /* next packet header to send */ + struct list_head iowait; /* link for wait PIO buf */ + atomic_t s_dma_busy; + struct qib_verbs_txreq *s_tx; + struct work_struct s_work; + wait_queue_head_t wait_dma; + struct rvt_qp *owner; +}; + +#define QIB_PSN_CREDIT 16 + +struct qib_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct qib_opcode_stats_perctx { + struct qib_opcode_stats stats[128]; +}; + +struct qib_pma_counters { + u64 n_unicast_xmit; /* total unicast packets sent */ + u64 n_unicast_rcv; /* total unicast packets received */ + u64 n_multicast_xmit; /* total multicast packets sent */ + u64 n_multicast_rcv; /* total multicast packets received */ +}; + +struct qib_ibport { + struct rvt_ibport rvp; + struct rvt_ah *smi_ah; + __be64 guids[QIB_GUIDS_PER_PORT - 1]; /* writable GUIDs */ + struct qib_pma_counters __percpu *pmastats; + u64 z_unicast_xmit; /* starting count for PMA */ + u64 z_unicast_rcv; /* starting count for PMA */ + u64 z_multicast_xmit; /* starting count for PMA */ + u64 z_multicast_rcv; /* starting count for PMA */ + u64 z_symbol_error_counter; /* starting count for PMA */ + u64 z_link_error_recovery_counter; /* starting count for PMA */ + u64 z_link_downed_counter; /* starting count for PMA */ + u64 z_port_rcv_errors; /* starting count for PMA */ + u64 z_port_rcv_remphys_errors; /* starting count for PMA */ + u64 z_port_xmit_discards; /* starting count for PMA */ + u64 z_port_xmit_data; /* starting count for PMA */ + u64 z_port_rcv_data; /* starting count for PMA */ + u64 z_port_xmit_packets; /* starting count for PMA */ + u64 z_port_rcv_packets; /* starting count for PMA */ + u32 z_local_link_integrity_errors; /* starting count for PMA */ + u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */ + u32 z_vl15_dropped; /* starting count for PMA */ + u8 sl_to_vl[16]; +}; + +struct qib_ibdev { + struct rvt_dev_info rdi; + + struct list_head piowait; /* list for wait PIO buf */ + struct list_head dmawait; /* list for wait DMA */ + struct list_head txwait; /* list for wait qib_verbs_txreq */ + struct list_head memwait; /* list for wait kernel memory */ + struct list_head txreq_free; + struct timer_list mem_timer; + struct qib_pio_header *pio_hdrs; + dma_addr_t pio_hdrs_phys; + u32 qp_rnd; /* random bytes for hash */ + + u32 n_piowait; + u32 n_txwait; + +#ifdef CONFIG_DEBUG_FS + /* per 
HCA debugfs */ + struct dentry *qib_ibdev_dbg; +#endif +}; + +struct qib_verbs_counters { + u64 symbol_error_counter; + u64 link_error_recovery_counter; + u64 link_downed_counter; + u64 port_rcv_errors; + u64 port_rcv_remphys_errors; + u64 port_xmit_discards; + u64 port_xmit_data; + u64 port_rcv_data; + u64 port_xmit_packets; + u64 port_rcv_packets; + u32 local_link_integrity_errors; + u32 excessive_buffer_overrun_errors; + u32 vl15_dropped; +}; + +static inline struct qib_ibdev *to_idev(struct ib_device *ibdev) +{ + struct rvt_dev_info *rdi; + + rdi = container_of(ibdev, struct rvt_dev_info, ibdev); + return container_of(rdi, struct qib_ibdev, rdi); +} + +/* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. + */ +static inline int qib_send_ok(struct rvt_qp *qp) +{ + return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) && + (qp->s_hdrwords || (qp->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); +} + +void _qib_schedule_send(struct rvt_qp *qp); +void qib_schedule_send(struct rvt_qp *qp); + +static inline int qib_pkey_ok(u16 pkey1, u16 pkey2) +{ + u16 p1 = pkey1 & 0x7FFF; + u16 p2 = pkey2 & 0x7FFF; + + /* + * Low 15 bits must be non-zero and match, and + * one of the two must be a full member. + */ + return p1 && p1 == p2 && ((__s16)pkey1 < 0 || (__s16)pkey2 < 0); +} + +void qib_bad_pkey(struct qib_ibport *ibp, u32 key, u32 sl, + u32 qp1, u32 qp2, __be16 lid1, __be16 lid2); +void qib_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num); +void qib_sys_guid_chg(struct qib_ibport *ibp); +void qib_node_desc_chg(struct qib_ibport *ibp); +int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); +void qib_notify_create_mad_agent(struct rvt_dev_info *rdi, int port_idx); +void qib_notify_free_mad_agent(struct rvt_dev_info *rdi, int port_idx); + +/* + * Compare the lower 24 bits of the two values. + * Returns an integer <, ==, or > than zero. 
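+ * The left shift by 8 moves bit 23 of the difference into the sign
+ * bit, so the comparison is done modulo 2^24 and PSN wraparound is
+ * handled by the signed result.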
+ */ +static inline int qib_cmp24(u32 a, u32 b) +{ + return (((int) a) - ((int) b)) << 8; +} + +int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait); + +int qib_get_counters(struct qib_pportdata *ppd, + struct qib_verbs_counters *cntrs); + +/* + * Functions provided by qib driver for rdmavt to use + */ +unsigned qib_free_all_qps(struct rvt_dev_info *rdi); +void *qib_qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp); +void qib_qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); +void qib_notify_qp_reset(struct rvt_qp *qp); +int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, + enum ib_qp_type type, u8 port); +void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait); +#ifdef CONFIG_DEBUG_FS + +void qib_qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter); + +#endif + +unsigned qib_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult); + +void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail); + +void qib_put_txreq(struct qib_verbs_txreq *tx); + +int qib_verbs_send(struct rvt_qp *qp, struct ib_header *hdr, + u32 hdrwords, struct rvt_sge_state *ss, u32 len); + +void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, + int release); + +void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header *hdr, + int has_grh, void *data, u32 tlen, struct rvt_qp *qp); + +void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr, + int has_grh, void *data, u32 tlen, struct rvt_qp *qp); + +int qib_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); + +int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); + +struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); + +void qib_rc_rnr_retry(unsigned long arg); + +void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr); + +int qib_post_ud_send(struct rvt_qp *qp, struct ib_send_wr *wr); + +void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, + int has_grh, void *data, u32 tlen, struct rvt_qp *qp); + +void mr_rcu_callback(struct rcu_head *list); + +int qib_get_rwqe(struct rvt_qp *qp, int wr_id_only); + +void qib_migrate_qp(struct rvt_qp *qp); + +int qib_ruc_check_hdr(struct qib_ibport *ibp, struct ib_header *hdr, + int has_grh, struct rvt_qp *qp, u32 bth0); + +u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr, + const struct ib_global_route *grh, u32 hwords, u32 nwords); + +void qib_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, + u32 bth0, u32 bth2); + +void _qib_do_send(struct work_struct *work); + +void qib_do_send(struct rvt_qp *qp); + +void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status); + +void qib_send_rc_ack(struct rvt_qp *qp); + +int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags); + +int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags); + +int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags); + +int qib_register_ib_device(struct qib_devdata *); + +void qib_unregister_ib_device(struct qib_devdata *); + +void qib_ib_rcv(struct qib_ctxtdata *, void *, void *, u32); + +void qib_ib_piobufavail(struct qib_devdata *); + +unsigned qib_get_npkeys(struct qib_devdata *); + +unsigned qib_get_pkey(struct qib_ibport *, unsigned); + +extern const enum ib_wc_opcode ib_qib_wc_opcode[]; + +/* + * Below HCA-independent IB PhysPortState values, returned + * by the f_ibphys_portstate() routine. 
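+ * qib_query_port() returns the value from dd->f_ibphys_portstate()
+ * unchanged as props->phys_state.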
+ */ +#define IB_PHYSPORTSTATE_SLEEP 1 +#define IB_PHYSPORTSTATE_POLL 2 +#define IB_PHYSPORTSTATE_DISABLED 3 +#define IB_PHYSPORTSTATE_CFG_TRAIN 4 +#define IB_PHYSPORTSTATE_LINKUP 5 +#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6 +#define IB_PHYSPORTSTATE_CFG_DEBOUNCE 8 +#define IB_PHYSPORTSTATE_CFG_IDLE 0xB +#define IB_PHYSPORTSTATE_RECOVERY_RETRAIN 0xC +#define IB_PHYSPORTSTATE_RECOVERY_WAITRMT 0xE +#define IB_PHYSPORTSTATE_RECOVERY_IDLE 0xF +#define IB_PHYSPORTSTATE_CFG_ENH 0x10 +#define IB_PHYSPORTSTATE_CFG_WAIT_ENH 0x13 + +extern const int ib_rvt_state_ops[]; + +extern __be64 ib_qib_sys_image_guid; /* in network order */ + +extern unsigned int ib_rvt_lkey_table_size; + +extern unsigned int ib_qib_max_cqes; + +extern unsigned int ib_qib_max_cqs; + +extern unsigned int ib_qib_max_qp_wrs; + +extern unsigned int ib_qib_max_qps; + +extern unsigned int ib_qib_max_sges; + +extern unsigned int ib_qib_max_mcast_grps; + +extern unsigned int ib_qib_max_mcast_qp_attached; + +extern unsigned int ib_qib_max_srqs; + +extern unsigned int ib_qib_max_srq_sges; + +extern unsigned int ib_qib_max_srq_wrs; + +extern const u32 ib_qib_rnr_table[]; + +extern const struct rvt_operation_params qib_post_parms[]; + +#endif /* QIB_VERBS_H */ diff --git a/drivers/infiniband/hw/qib/qib_wc_ppc64.c b/drivers/infiniband/hw/qib/qib_wc_ppc64.c new file mode 100644 index 0000000..673cf4c --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_wc_ppc64.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on PowerPC only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include "qib.h" + +/** + * qib_enable_wc - enable write combining for MMIO writes to the device + * @dd: qlogic_ib device + * + * Nothing to do on PowerPC, so just return without error. 
+ */ +int qib_enable_wc(struct qib_devdata *dd) +{ + return 0; +} + +/** + * qib_unordered_wc - indicate whether write combining is unordered + * + * Because our performance depends on our ability to do write + * combining mmio writes in the most efficient way, we need to + * know if we are on a processor that may reorder stores when + * write combining. + */ +int qib_unordered_wc(void) +{ + return 1; +} diff --git a/drivers/infiniband/hw/qib/qib_wc_x86_64.c b/drivers/infiniband/hw/qib/qib_wc_x86_64.c new file mode 100644 index 0000000..edd0ddb --- /dev/null +++ b/drivers/infiniband/hw/qib/qib_wc_x86_64.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on x86_64 only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include +#include +#include + +#include "qib.h" + +/** + * qib_enable_wc - enable write combining for MMIO writes to the device + * @dd: qlogic_ib device + * + * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable + * write combining. + */ +int qib_enable_wc(struct qib_devdata *dd) +{ + int ret = 0; + u64 pioaddr, piolen; + unsigned bits; + const unsigned long addr = pci_resource_start(dd->pcidev, 0); + const size_t len = pci_resource_len(dd->pcidev, 0); + + /* + * Set the PIO buffers to be WCCOMB, so we get HT bursts to the + * chip. Linux (possibly the hardware) requires it to be on a power + * of 2 address matching the length (which has to be a power of 2). + * For rev1, that means the base address, for rev2, it will be just + * the PIO buffers themselves. + * For chips with two sets of buffers, the calculations are + * somewhat more complicated; we need to sum, and the piobufbase + * register has both offsets, 2K in low 32 bits, 4K in high 32 bits. + * The buffers are still packed, so a single range covers both. 
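+ * The range chosen below is rounded up to a power-of-two size and
+ * alignment; if no such range fits inside BAR 0 the routine fails with
+ * -ENODEV, otherwise arch_phys_wc_add() installs the write-combining
+ * mapping and the cookie is saved for qib_disable_wc().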
+ */ + if (dd->piobcnt2k && dd->piobcnt4k) { + /* 2 sizes for chip */ + unsigned long pio2kbase, pio4kbase; + + pio2kbase = dd->piobufbase & 0xffffffffUL; + pio4kbase = (dd->piobufbase >> 32) & 0xffffffffUL; + if (pio2kbase < pio4kbase) { + /* all current chips */ + pioaddr = addr + pio2kbase; + piolen = pio4kbase - pio2kbase + + dd->piobcnt4k * dd->align4k; + } else { + pioaddr = addr + pio4kbase; + piolen = pio2kbase - pio4kbase + + dd->piobcnt2k * dd->palign; + } + } else { /* single buffer size (2K, currently) */ + pioaddr = addr + dd->piobufbase; + piolen = dd->piobcnt2k * dd->palign + + dd->piobcnt4k * dd->align4k; + } + + for (bits = 0; !(piolen & (1ULL << bits)); bits++) + ; /* do nothing */ + + if (piolen != (1ULL << bits)) { + piolen >>= bits; + while (piolen >>= 1) + bits++; + piolen = 1ULL << (bits + 1); + } + if (pioaddr & (piolen - 1)) { + u64 atmp = pioaddr & ~(piolen - 1); + + if (atmp < addr || (atmp + piolen) > (addr + len)) { + qib_dev_err(dd, + "No way to align address/size (%llx/%llx), no WC mtrr\n", + (unsigned long long) atmp, + (unsigned long long) piolen << 1); + ret = -ENODEV; + } else { + pioaddr = atmp; + piolen <<= 1; + } + } + + if (!ret) { + dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen); + if (dd->wc_cookie < 0) + /* use error from routine */ + ret = dd->wc_cookie; + } + + return ret; +} + +/** + * qib_disable_wc - disable write combining for MMIO writes to the device + * @dd: qlogic_ib device + */ +void qib_disable_wc(struct qib_devdata *dd) +{ + arch_phys_wc_del(dd->wc_cookie); +} + +/** + * qib_unordered_wc - indicate whether write combining is ordered + * + * Because our performance depends on our ability to do write combining mmio + * writes in the most efficient way, we need to know if we are on an Intel + * or AMD x86_64 processor. AMD x86_64 processors flush WC buffers out in + * the order completed, and so no special flushing is required to get + * correct ordering. Intel processors, however, will flush write buffers + * out in "random" orders, and so explicit ordering is needed at times. + */ +int qib_unordered_wc(void) +{ + return boot_cpu_data.x86_vendor != X86_VENDOR_AMD; +} diff --git a/drivers/infiniband/hw/usnic/Kconfig b/drivers/infiniband/hw/usnic/Kconfig new file mode 100644 index 0000000..29ab11c --- /dev/null +++ b/drivers/infiniband/hw/usnic/Kconfig @@ -0,0 +1,10 @@ +config INFINIBAND_USNIC + tristate "Verbs support for Cisco VIC" + depends on NETDEVICES && ETHERNET && INET && PCI && INTEL_IOMMU + select ENIC + select NET_VENDOR_CISCO + select PCI_IOV + select INFINIBAND_USER_ACCESS + ---help--- + This is a low-level driver for Cisco's Virtual Interface + Cards (VICs), including the VIC 1240 and 1280 cards. diff --git a/drivers/infiniband/hw/usnic/Makefile b/drivers/infiniband/hw/usnic/Makefile new file mode 100644 index 0000000..94ae7a1 --- /dev/null +++ b/drivers/infiniband/hw/usnic/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 +ccflags-y := -Idrivers/net/ethernet/cisco/enic + +obj-$(CONFIG_INFINIBAND_USNIC)+= usnic_verbs.o + +usnic_verbs-y=\ +usnic_fwd.o \ +usnic_transport.o \ +usnic_uiom.o \ +usnic_uiom_interval_tree.o \ +usnic_vnic.o \ +usnic_ib_main.o \ +usnic_ib_qp_grp.o \ +usnic_ib_sysfs.o \ +usnic_ib_verbs.o \ +usnic_debugfs.o \ diff --git a/drivers/infiniband/hw/usnic/usnic.h b/drivers/infiniband/hw/usnic/usnic.h new file mode 100644 index 0000000..f903502 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_H_ +#define USNIC_H_ + +#define DRV_NAME "usnic_verbs" + +#define PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC 0x00cf /* User space NIC */ + +#define DRV_VERSION "1.0.3" +#define DRV_RELDATE "December 19, 2013" + +#endif /* USNIC_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_abi.h b/drivers/infiniband/hw/usnic/usnic_abi.h new file mode 100644 index 0000000..7fe9502 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_abi.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + + +#ifndef USNIC_ABI_H +#define USNIC_ABI_H + +/* ABI between userspace and kernel */ +#define USNIC_UVERBS_ABI_VERSION 4 + +#define USNIC_QP_GRP_MAX_WQS 8 +#define USNIC_QP_GRP_MAX_RQS 8 +#define USNIC_QP_GRP_MAX_CQS 16 + +enum usnic_transport_type { + USNIC_TRANSPORT_UNKNOWN = 0, + USNIC_TRANSPORT_ROCE_CUSTOM = 1, + USNIC_TRANSPORT_IPV4_UDP = 2, + USNIC_TRANSPORT_MAX = 3, +}; + +struct usnic_transport_spec { + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + uint32_t sock_fd; + } udp; + }; +}; + +struct usnic_ib_create_qp_cmd { + struct usnic_transport_spec spec; +}; + +/*TODO: Future - usnic_modify_qp needs to pass in generic filters */ +struct usnic_ib_create_qp_resp { + u32 vfid; + u32 qp_grp_id; + u64 bar_bus_addr; + u32 bar_len; +/* + * WQ, RQ, CQ are explicity specified bc exposing a generic resources inteface + * expands the scope of ABI to many files. + */ + u32 wq_cnt; + u32 rq_cnt; + u32 cq_cnt; + u32 wq_idx[USNIC_QP_GRP_MAX_WQS]; + u32 rq_idx[USNIC_QP_GRP_MAX_RQS]; + u32 cq_idx[USNIC_QP_GRP_MAX_CQS]; + u32 transport; + u32 reserved[9]; +}; + +#endif /* USNIC_ABI_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h new file mode 100644 index 0000000..bf7d197 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_CMN_PKT_HDR_H +#define USNIC_CMN_PKT_HDR_H + +#define USNIC_ROCE_GRH_VER (8) +#define USNIC_PROTO_VER (1) +#define USNIC_ROCE_GRH_VER_SHIFT (4) + +#endif /* USNIC_COMMON_PKT_HDR_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_common_util.h b/drivers/infiniband/hw/usnic/usnic_common_util.h new file mode 100644 index 0000000..ddd8129 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_common_util.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_CMN_UTIL_H +#define USNIC_CMN_UTIL_H + +#include + +static inline void +usnic_mac_ip_to_gid(const char *const mac, const __be32 inaddr, char *raw_gid) +{ + raw_gid[0] = 0xfe; + raw_gid[1] = 0x80; + memset(&raw_gid[2], 0, 2); + memcpy(&raw_gid[4], &inaddr, 4); + addrconf_addr_eui48(&raw_gid[8], mac); +} + +#endif /* USNIC_COMMON_UTIL_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c new file mode 100644 index 0000000..92dc66c --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include + +#include "usnic.h" +#include "usnic_log.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_transport.h" + +static struct dentry *debugfs_root; +static struct dentry *flows_dentry; + +static ssize_t usnic_debugfs_buildinfo_read(struct file *f, char __user *data, + size_t count, loff_t *ppos) +{ + char buf[500]; + int res; + + if (*ppos > 0) + return 0; + + res = scnprintf(buf, sizeof(buf), + "version: %s\n" + "build date: %s\n", + DRV_VERSION, DRV_RELDATE); + + return simple_read_from_buffer(data, count, ppos, buf, res); +} + +static const struct file_operations usnic_debugfs_buildinfo_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = usnic_debugfs_buildinfo_read +}; + +static ssize_t flowinfo_read(struct file *f, char __user *data, + size_t count, loff_t *ppos) +{ + struct usnic_ib_qp_grp_flow *qp_flow; + int n; + int left; + char *ptr; + char buf[512]; + + qp_flow = f->private_data; + ptr = buf; + left = count; + + if (*ppos > 0) + return 0; + + spin_lock(&qp_flow->qp_grp->lock); + n = scnprintf(ptr, left, + "QP Grp ID: %d Transport: %s ", + qp_flow->qp_grp->grp_id, + usnic_transport_to_str(qp_flow->trans_type)); + UPDATE_PTR_LEFT(n, ptr, left); + if (qp_flow->trans_type == USNIC_TRANSPORT_ROCE_CUSTOM) { + n = scnprintf(ptr, left, "Port_Num:%hu\n", + qp_flow->usnic_roce.port_num); + UPDATE_PTR_LEFT(n, ptr, left); + } else if (qp_flow->trans_type == USNIC_TRANSPORT_IPV4_UDP) { + n = usnic_transport_sock_to_str(ptr, left, + qp_flow->udp.sock); + UPDATE_PTR_LEFT(n, ptr, left); + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + } + spin_unlock(&qp_flow->qp_grp->lock); + + return simple_read_from_buffer(data, count, ppos, buf, ptr - buf); +} + +static const struct file_operations flowinfo_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = flowinfo_read, +}; + +void usnic_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir(DRV_NAME, NULL); + if (IS_ERR(debugfs_root)) { + usnic_err("Failed to create debugfs root dir, check if debugfs is enabled in kernel configuration\n"); + goto out_clear_root; + } + + flows_dentry = debugfs_create_dir("flows", debugfs_root); + if (IS_ERR_OR_NULL(flows_dentry)) { + usnic_err("Failed to create debugfs flow dir with err %ld\n", + PTR_ERR(flows_dentry)); + goto out_free_root; + } + + debugfs_create_file("build-info", S_IRUGO, debugfs_root, + NULL, &usnic_debugfs_buildinfo_ops); + return; + +out_free_root: + debugfs_remove_recursive(debugfs_root); +out_clear_root: + debugfs_root = NULL; +} + +void usnic_debugfs_exit(void) +{ + if (!debugfs_root) + return; + + debugfs_remove_recursive(debugfs_root); + debugfs_root = NULL; +} + +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow) +{ + if (IS_ERR_OR_NULL(flows_dentry)) + return; + + scnprintf(qp_flow->dentry_name, sizeof(qp_flow->dentry_name), + "%u", qp_flow->flow->flow_id); + qp_flow->dbgfs_dentry = debugfs_create_file(qp_flow->dentry_name, + S_IRUGO, + flows_dentry, + qp_flow, + &flowinfo_ops); + if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) { + usnic_err("Failed to create dbg fs entry for flow %u with error %ld\n", + qp_flow->flow->flow_id, + PTR_ERR(qp_flow->dbgfs_dentry)); + } +} + +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow) +{ + if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) + debugfs_remove(qp_flow->dbgfs_dentry); +} diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.h b/drivers/infiniband/hw/usnic/usnic_debugfs.h new file mode 100644 index 0000000..98453e9 --- 
/dev/null +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#ifndef USNIC_DEBUGFS_H_ +#define USNIC_DEBUGFS_H_ + +#include "usnic_ib_qp_grp.h" + +void usnic_debugfs_init(void); + +void usnic_debugfs_exit(void); +void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow); +void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow); + +#endif /*!USNIC_DEBUGFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c new file mode 100644 index 0000000..995a26b --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_fwd.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include + +#include "enic_api.h" +#include "usnic_common_pkt_hdr.h" +#include "usnic_fwd.h" +#include "usnic_log.h" + +static int usnic_fwd_devcmd_locked(struct usnic_fwd_dev *ufdev, int vnic_idx, + enum vnic_devcmd_cmd cmd, u64 *a0, + u64 *a1) +{ + int status; + struct net_device *netdev = ufdev->netdev; + + lockdep_assert_held(&ufdev->lock); + + status = enic_api_devcmd_proxy_by_index(netdev, + vnic_idx, + cmd, + a0, a1, + 1000); + if (status) { + if (status == ERR_EINVAL && cmd == CMD_DEL_FILTER) { + usnic_dbg("Dev %s vnic idx %u cmd %u already deleted", + ufdev->name, vnic_idx, cmd); + } else { + usnic_err("Dev %s vnic idx %u cmd %u failed with status %d\n", + ufdev->name, vnic_idx, cmd, + status); + } + } else { + usnic_dbg("Dev %s vnic idx %u cmd %u success", + ufdev->name, vnic_idx, cmd); + } + + return status; +} + +static int usnic_fwd_devcmd(struct usnic_fwd_dev *ufdev, int vnic_idx, + enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1) +{ + int status; + + spin_lock(&ufdev->lock); + status = usnic_fwd_devcmd_locked(ufdev, vnic_idx, cmd, a0, a1); + spin_unlock(&ufdev->lock); + + return status; +} + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev) +{ + struct usnic_fwd_dev *ufdev; + + ufdev = kzalloc(sizeof(*ufdev), GFP_KERNEL); + if (!ufdev) + return NULL; + + ufdev->pdev = pdev; + ufdev->netdev = pci_get_drvdata(pdev); + spin_lock_init(&ufdev->lock); + strncpy(ufdev->name, netdev_name(ufdev->netdev), + sizeof(ufdev->name) - 1); + + return ufdev; +} + +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev) +{ + kfree(ufdev); +} + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]) +{ + spin_lock(&ufdev->lock); + memcpy(&ufdev->mac, mac, sizeof(ufdev->mac)); + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr) +{ + spin_lock(&ufdev->lock); + if (!ufdev->inaddr) + ufdev->inaddr = inaddr; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->inaddr = 0; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->link_up = 1; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev) +{ + spin_lock(&ufdev->lock); + ufdev->link_up = 0; + spin_unlock(&ufdev->lock); +} + +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu) +{ + spin_lock(&ufdev->lock); + ufdev->mtu = mtu; + spin_unlock(&ufdev->lock); +} + +static int usnic_fwd_dev_ready_locked(struct usnic_fwd_dev *ufdev) +{ + lockdep_assert_held(&ufdev->lock); + + if (!ufdev->link_up) + return -EPERM; + + return 0; +} + +static int validate_filter_locked(struct usnic_fwd_dev *ufdev, + struct filter *filter) +{ + + lockdep_assert_held(&ufdev->lock); + + if (filter->type == FILTER_IPV4_5TUPLE) { + if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_AD)) + return -EACCES; + if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_PT)) + return -EBUSY; + else if (ufdev->inaddr == 0) + return -EINVAL; + else if (filter->u.ipv4.dst_port == 0) + return -ERANGE; + else if (ntohl(ufdev->inaddr) != filter->u.ipv4.dst_addr) + return -EFAULT; + else + return 0; + } + + return 0; +} + +static void fill_tlv(struct filter_tlv *tlv, struct filter *filter, + struct filter_action *action) +{ + tlv->type = CLSF_TLV_FILTER; + tlv->length = sizeof(struct filter); + *((struct filter *)&tlv->val) = *filter; + + tlv = (struct filter_tlv *)((char *)tlv + sizeof(struct 
filter_tlv) + + sizeof(struct filter)); + tlv->type = CLSF_TLV_ACTION; + tlv->length = sizeof(struct filter_action); + *((struct filter_action *)&tlv->val) = *action; +} + +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, + struct usnic_filter_action *uaction) +{ + struct filter_tlv *tlv; + struct pci_dev *pdev; + struct usnic_fwd_flow *flow; + uint64_t a0, a1; + uint64_t tlv_size; + dma_addr_t tlv_pa; + int status; + + pdev = ufdev->pdev; + tlv_size = (2*sizeof(struct filter_tlv) + sizeof(struct filter) + + sizeof(struct filter_action)); + + flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + if (!flow) + return ERR_PTR(-ENOMEM); + + tlv = pci_alloc_consistent(pdev, tlv_size, &tlv_pa); + if (!tlv) { + usnic_err("Failed to allocate memory\n"); + status = -ENOMEM; + goto out_free_flow; + } + + fill_tlv(tlv, filter, &uaction->action); + + spin_lock(&ufdev->lock); + status = usnic_fwd_dev_ready_locked(ufdev); + if (status) { + usnic_err("Forwarding dev %s not ready with status %d\n", + ufdev->name, status); + goto out_free_tlv; + } + + status = validate_filter_locked(ufdev, filter); + if (status) { + usnic_err("Failed to validate filter with status %d\n", + status); + goto out_free_tlv; + } + + /* Issue Devcmd */ + a0 = tlv_pa; + a1 = tlv_size; + status = usnic_fwd_devcmd_locked(ufdev, uaction->vnic_idx, + CMD_ADD_FILTER, &a0, &a1); + if (status) { + usnic_err("VF %s Filter add failed with status:%d", + ufdev->name, status); + status = -EFAULT; + goto out_free_tlv; + } else { + usnic_dbg("VF %s FILTER ID:%llu", ufdev->name, a0); + } + + flow->flow_id = (uint32_t) a0; + flow->vnic_idx = uaction->vnic_idx; + flow->ufdev = ufdev; + +out_free_tlv: + spin_unlock(&ufdev->lock); + pci_free_consistent(pdev, tlv_size, tlv, tlv_pa); + if (!status) + return flow; +out_free_flow: + kfree(flow); + return ERR_PTR(status); +} + +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow) +{ + int status; + u64 a0, a1; + + a0 = flow->flow_id; + + status = usnic_fwd_devcmd(flow->ufdev, flow->vnic_idx, + CMD_DEL_FILTER, &a0, &a1); + if (status) { + if (status == ERR_EINVAL) { + usnic_dbg("Filter %u already deleted for VF Idx %u pf: %s status: %d", + flow->flow_id, flow->vnic_idx, + flow->ufdev->name, status); + } else { + usnic_err("PF %s VF Idx %u Filter: %u FILTER DELETE failed with status %d", + flow->ufdev->name, flow->vnic_idx, + flow->flow_id, status); + } + status = 0; + /* + * Log the error and fake success to the caller because if + * a flow fails to be deleted in the firmware, it is an + * unrecoverable error. 
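+		 * The kernel-side usnic_fwd_flow is freed below either
+		 * way, so the caller could not act on the failure even
+		 * if it were reported.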
+ */ + } else { + usnic_dbg("PF %s VF Idx %u Filter: %u FILTER DELETED", + flow->ufdev->name, flow->vnic_idx, + flow->flow_id); + } + + kfree(flow); + return status; +} + +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ + int status; + struct net_device *pf_netdev; + u64 a0, a1; + + pf_netdev = ufdev->netdev; + a0 = qp_idx; + a1 = CMD_QP_RQWQ; + + status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_ENABLE, + &a0, &a1); + if (status) { + usnic_err("PF %s VNIC Index %u RQ Index: %u ENABLE Failed with status %d", + netdev_name(pf_netdev), + vnic_idx, + qp_idx, + status); + } else { + usnic_dbg("PF %s VNIC Index %u RQ Index: %u ENABLED", + netdev_name(pf_netdev), + vnic_idx, qp_idx); + } + + return status; +} + +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx) +{ + int status; + u64 a0, a1; + struct net_device *pf_netdev; + + pf_netdev = ufdev->netdev; + a0 = qp_idx; + a1 = CMD_QP_RQWQ; + + status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_DISABLE, + &a0, &a1); + if (status) { + usnic_err("PF %s VNIC Index %u RQ Index: %u DISABLE Failed with status %d", + netdev_name(pf_netdev), + vnic_idx, + qp_idx, + status); + } else { + usnic_dbg("PF %s VNIC Index %u RQ Index: %u DISABLED", + netdev_name(pf_netdev), + vnic_idx, + qp_idx); + } + + return status; +} diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.h b/drivers/infiniband/hw/usnic/usnic_fwd.h new file mode 100644 index 0000000..0b2cc4e --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_fwd.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_FWD_H_ +#define USNIC_FWD_H_ + +#include +#include +#include +#include +#include + +#include "usnic_abi.h" +#include "usnic_common_pkt_hdr.h" +#include "vnic_devcmd.h" + +struct usnic_fwd_dev { + struct pci_dev *pdev; + struct net_device *netdev; + spinlock_t lock; + /* + * The following fields can be read directly off the device. + * However, they should be set by a accessor function, except name, + * which cannot be changed. 
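+	 * (The accessors in usnic_fwd.c, such as usnic_fwd_set_mac(),
+	 * usnic_fwd_add_ipaddr() and usnic_fwd_set_mtu(), all take
+	 * ufdev->lock; the flow allocation path checks link_up and
+	 * inaddr under the same lock.)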
+ */ + bool link_up; + char mac[ETH_ALEN]; + unsigned int mtu; + __be32 inaddr; + char name[IFNAMSIZ+1]; +}; + +struct usnic_fwd_flow { + uint32_t flow_id; + struct usnic_fwd_dev *ufdev; + unsigned int vnic_idx; +}; + +struct usnic_filter_action { + int vnic_idx; + struct filter_action action; +}; + +struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev); +void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev); + +void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]); +void usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr); +void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev); +void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev); +void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu); + +/* + * Allocate a flow on this forwarding device. Whoever calls this function, + * must monitor netdev events on ufdev's netdevice. If NETDEV_REBOOT or + * NETDEV_DOWN is seen, flow will no longer function and must be + * immediately freed by calling usnic_dealloc_flow. + */ +struct usnic_fwd_flow* +usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter, + struct usnic_filter_action *action); +int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow); +int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); +int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx); + +static inline void usnic_fwd_init_usnic_filter(struct filter *filter, + uint32_t usnic_id) +{ + filter->type = FILTER_USNIC_ID; + filter->u.usnic.ethtype = ETH_P_IBOE; + filter->u.usnic.flags = FILTER_FIELD_USNIC_ETHTYPE | + FILTER_FIELD_USNIC_ID | + FILTER_FIELD_USNIC_PROTO; + filter->u.usnic.proto_version = (USNIC_ROCE_GRH_VER << + USNIC_ROCE_GRH_VER_SHIFT) | + USNIC_PROTO_VER; + filter->u.usnic.usnic_id = usnic_id; +} + +static inline void usnic_fwd_init_udp_filter(struct filter *filter, + uint32_t daddr, uint16_t dport) +{ + filter->type = FILTER_IPV4_5TUPLE; + filter->u.ipv4.flags = FILTER_FIELD_5TUP_PROTO; + filter->u.ipv4.protocol = PROTO_UDP; + + if (daddr) { + filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_AD; + filter->u.ipv4.dst_addr = daddr; + } + + if (dport) { + filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_PT; + filter->u.ipv4.dst_port = dport; + } +} + +#endif /* !USNIC_FWD_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h new file mode 100644 index 0000000..525bf27 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_H_ +#define USNIC_IB_H_ + +#include +#include + +#include + + +#include "usnic.h" +#include "usnic_abi.h" +#include "usnic_vnic.h" + +#define USNIC_IB_PORT_CNT 1 +#define USNIC_IB_NUM_COMP_VECTORS 1 + +extern unsigned int usnic_ib_share_vf; + +struct usnic_ib_ucontext { + struct ib_ucontext ibucontext; + /* Protected by usnic_ib_dev->usdev_lock */ + struct list_head qp_grp_list; + struct list_head link; +}; + +struct usnic_ib_pd { + struct ib_pd ibpd; + struct usnic_uiom_pd *umem_pd; +}; + +struct usnic_ib_mr { + struct ib_mr ibmr; + struct usnic_uiom_reg *umem; +}; + +struct usnic_ib_dev { + struct ib_device ib_dev; + struct pci_dev *pdev; + struct net_device *netdev; + struct usnic_fwd_dev *ufdev; + struct list_head ib_dev_link; + struct list_head vf_dev_list; + struct list_head ctx_list; + struct mutex usdev_lock; + + /* provisioning information */ + struct kref vf_cnt; + unsigned int vf_res_cnt[USNIC_VNIC_RES_TYPE_MAX]; + + /* sysfs vars for QPN reporting */ + struct kobject *qpn_kobj; +}; + +struct usnic_ib_vf { + struct usnic_ib_dev *pf; + spinlock_t lock; + struct usnic_vnic *vnic; + unsigned int qp_grp_ref_cnt; + struct usnic_ib_pd *pd; + struct list_head link; +}; + +static inline +struct usnic_ib_dev *to_usdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct usnic_ib_dev, ib_dev); +} + +static inline +struct usnic_ib_ucontext *to_ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_pd *to_upd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct usnic_ib_pd, ibpd); +} + +static inline +struct usnic_ib_ucontext *to_uucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext); +} + +static inline +struct usnic_ib_mr *to_umr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct usnic_ib_mr, ibmr); +} +void usnic_ib_log_vf(struct usnic_ib_vf *vf); + +#define UPDATE_PTR_LEFT(N, P, L) \ +do { \ + L -= (N); \ + P += (N); \ +} while (0) + +#endif /* USNIC_IB_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c new file mode 100644 index 0000000..f0538a4 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -0,0 +1,715 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Author: Upinder Malhi + * Author: Anant Deepak + * Author: Cesare Cantu' + * Author: Jeff Squyres + * Author: Kiran Thirumalai + * Author: Xuyang Wang + * Author: Reese Faucette + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "usnic_abi.h" +#include "usnic_common_util.h" +#include "usnic_ib.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_log.h" +#include "usnic_fwd.h" +#include "usnic_debugfs.h" +#include "usnic_ib_verbs.h" +#include "usnic_transport.h" +#include "usnic_uiom.h" +#include "usnic_ib_sysfs.h" + +unsigned int usnic_log_lvl = USNIC_LOG_LVL_ERR; +unsigned int usnic_ib_share_vf = 1; + +static const char usnic_version[] = + DRV_NAME ": Cisco VIC (USNIC) Verbs Driver v" + DRV_VERSION " (" DRV_RELDATE ")\n"; + +static DEFINE_MUTEX(usnic_ib_ibdev_list_lock); +static LIST_HEAD(usnic_ib_ibdev_list); + +/* Callback dump funcs */ +static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz) +{ + struct usnic_ib_vf *vf = obj; + return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name); +} +/* End callback dump funcs */ + +static void usnic_ib_dump_vf(struct usnic_ib_vf *vf, char *buf, int buf_sz) +{ + usnic_vnic_dump(vf->vnic, buf, buf_sz, vf, + usnic_ib_dump_vf_hdr, + usnic_ib_qp_grp_dump_hdr, usnic_ib_qp_grp_dump_rows); +} + +void usnic_ib_log_vf(struct usnic_ib_vf *vf) +{ + char buf[1000]; + usnic_ib_dump_vf(vf, buf, sizeof(buf)); + usnic_dbg("%s\n", buf); +} + +/* Start of netdev section */ +static void usnic_ib_qp_grp_modify_active_to_err(struct usnic_ib_dev *us_ibdev) +{ + struct usnic_ib_ucontext *ctx; + struct usnic_ib_qp_grp *qp_grp; + enum ib_qp_state cur_state; + int status; + + BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + + list_for_each_entry(ctx, &us_ibdev->ctx_list, link) { + list_for_each_entry(qp_grp, &ctx->qp_grp_list, link) { + cur_state = qp_grp->state; + if (cur_state == IB_QPS_INIT || + cur_state == IB_QPS_RTR || + cur_state == IB_QPS_RTS) { + status = usnic_ib_qp_grp_modify(qp_grp, + IB_QPS_ERR, + NULL); + if (status) { + usnic_err("Failed to transistion qp grp %u from %s to %s\n", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string + (cur_state), + usnic_ib_qp_grp_state_to_string + (IB_QPS_ERR)); + } + } + } + } +} + +static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, + unsigned long event) +{ + struct net_device *netdev; + struct ib_event ib_event; + + memset(&ib_event, 0, sizeof(ib_event)); + + mutex_lock(&us_ibdev->usdev_lock); + netdev = us_ibdev->netdev; + switch (event) { + case NETDEV_REBOOT: + usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + 
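+		/*
+		 * The active QP groups were already forced to IB_QPS_ERR
+		 * above; the port event only notifies registered consumers
+		 * that the PF reset happened.
+		 */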
break; + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + if (!us_ibdev->ufdev->link_up && + netif_carrier_ok(netdev)) { + usnic_fwd_carrier_up(us_ibdev->ufdev); + usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name); + ib_event.event = IB_EVENT_PORT_ACTIVE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else if (us_ibdev->ufdev->link_up && + !netif_carrier_ok(netdev)) { + usnic_fwd_carrier_down(us_ibdev->ufdev); + usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else { + usnic_dbg("Ignoring %s on %s\n", + netdev_cmd_to_name(event), + us_ibdev->ib_dev.name); + } + break; + case NETDEV_CHANGEADDR: + if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, + sizeof(us_ibdev->ufdev->mac))) { + usnic_dbg("Ignoring addr change on %s\n", + us_ibdev->ib_dev.name); + } else { + usnic_info(" %s old mac: %pM new mac: %pM\n", + us_ibdev->ib_dev.name, + us_ibdev->ufdev->mac, + netdev->dev_addr); + usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } + + break; + case NETDEV_CHANGEMTU: + if (us_ibdev->ufdev->mtu != netdev->mtu) { + usnic_info("MTU Change on %s old: %u new: %u\n", + us_ibdev->ib_dev.name, + us_ibdev->ufdev->mtu, netdev->mtu); + usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + } else { + usnic_dbg("Ignoring MTU change on %s\n", + us_ibdev->ib_dev.name); + } + break; + default: + usnic_dbg("Ignoring event %s on %s", + netdev_cmd_to_name(event), + us_ibdev->ib_dev.name); + } + mutex_unlock(&us_ibdev->usdev_lock); +} + +static int usnic_ib_netdevice_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct usnic_ib_dev *us_ibdev; + + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->netdev == netdev) { + usnic_ib_handle_usdev_event(us_ibdev, event); + break; + } + } + mutex_unlock(&usnic_ib_ibdev_list_lock); + + return NOTIFY_DONE; +} + +static struct notifier_block usnic_ib_netdevice_notifier = { + .notifier_call = usnic_ib_netdevice_event +}; +/* End of netdev section */ + +/* Start of inet section */ +static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct ib_event ib_event; + + mutex_lock(&us_ibdev->usdev_lock); + + switch (event) { + case NETDEV_DOWN: + usnic_info("%s via ip notifiers", + netdev_cmd_to_name(event)); + usnic_fwd_del_ipaddr(us_ibdev->ufdev); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + case NETDEV_UP: + usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address); + usnic_info("%s via ip notifiers: ip %pI4", + netdev_cmd_to_name(event), + &us_ibdev->ufdev->inaddr); + ib_event.event = IB_EVENT_GID_CHANGE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + break; + default: + usnic_info("Ignoring event 
%s on %s", + netdev_cmd_to_name(event), + us_ibdev->ib_dev.name); + } + mutex_unlock(&us_ibdev->usdev_lock); + + return NOTIFY_DONE; +} + +static int usnic_ib_inetaddr_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct usnic_ib_dev *us_ibdev; + struct in_ifaddr *ifa = ptr; + struct net_device *netdev = ifa->ifa_dev->dev; + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->netdev == netdev) { + usnic_ib_handle_inet_event(us_ibdev, event, ptr); + break; + } + } + mutex_unlock(&usnic_ib_ibdev_list_lock); + + return NOTIFY_DONE; +} +static struct notifier_block usnic_ib_inetaddr_notifier = { + .notifier_call = usnic_ib_inetaddr_event +}; +/* End of inet section*/ + +static int usnic_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_USNIC; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static void usnic_get_dev_fw_str(struct ib_device *device, char *str) +{ + struct usnic_ib_dev *us_ibdev = + container_of(device, struct usnic_ib_dev, ib_dev); + struct ethtool_drvinfo info; + + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + mutex_unlock(&us_ibdev->usdev_lock); + + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version); +} + +/* Start of PF discovery section */ +static void *usnic_ib_device_add(struct pci_dev *dev) +{ + struct usnic_ib_dev *us_ibdev; + union ib_gid gid; + struct in_device *ind; + struct net_device *netdev; + + usnic_dbg("\n"); + netdev = pci_get_drvdata(dev); + + us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev)); + if (!us_ibdev) { + usnic_err("Device %s context alloc failed\n", + netdev_name(pci_get_drvdata(dev))); + return ERR_PTR(-EFAULT); + } + + us_ibdev->ufdev = usnic_fwd_dev_alloc(dev); + if (!us_ibdev->ufdev) { + usnic_err("Failed to alloc ufdev for %s\n", pci_name(dev)); + goto err_dealloc; + } + + mutex_init(&us_ibdev->usdev_lock); + INIT_LIST_HEAD(&us_ibdev->vf_dev_list); + INIT_LIST_HEAD(&us_ibdev->ctx_list); + + us_ibdev->pdev = dev; + us_ibdev->netdev = pci_get_drvdata(dev); + us_ibdev->ib_dev.owner = THIS_MODULE; + us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP; + us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT; + us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; + us_ibdev->ib_dev.dev.parent = &dev->dev; + us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; + strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX); + + us_ibdev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_OPEN_QP); + + 
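+	/*
+	 * The mask above only advertises which uverbs commands usnic
+	 * implements; the uverbs layer checks it before dispatching a
+	 * user request to the verb handlers wired up below.
+	 */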
us_ibdev->ib_dev.query_device = usnic_ib_query_device; + us_ibdev->ib_dev.query_port = usnic_ib_query_port; + us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey; + us_ibdev->ib_dev.query_gid = usnic_ib_query_gid; + us_ibdev->ib_dev.get_netdev = usnic_get_netdev; + us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer; + us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd; + us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd; + us_ibdev->ib_dev.create_qp = usnic_ib_create_qp; + us_ibdev->ib_dev.modify_qp = usnic_ib_modify_qp; + us_ibdev->ib_dev.query_qp = usnic_ib_query_qp; + us_ibdev->ib_dev.destroy_qp = usnic_ib_destroy_qp; + us_ibdev->ib_dev.create_cq = usnic_ib_create_cq; + us_ibdev->ib_dev.destroy_cq = usnic_ib_destroy_cq; + us_ibdev->ib_dev.reg_user_mr = usnic_ib_reg_mr; + us_ibdev->ib_dev.dereg_mr = usnic_ib_dereg_mr; + us_ibdev->ib_dev.alloc_ucontext = usnic_ib_alloc_ucontext; + us_ibdev->ib_dev.dealloc_ucontext = usnic_ib_dealloc_ucontext; + us_ibdev->ib_dev.mmap = usnic_ib_mmap; + us_ibdev->ib_dev.create_ah = usnic_ib_create_ah; + us_ibdev->ib_dev.destroy_ah = usnic_ib_destroy_ah; + us_ibdev->ib_dev.post_send = usnic_ib_post_send; + us_ibdev->ib_dev.post_recv = usnic_ib_post_recv; + us_ibdev->ib_dev.poll_cq = usnic_ib_poll_cq; + us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq; + us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr; + us_ibdev->ib_dev.get_port_immutable = usnic_port_immutable; + us_ibdev->ib_dev.get_dev_fw_str = usnic_get_dev_fw_str; + + + us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC; + if (ib_register_device(&us_ibdev->ib_dev, NULL)) + goto err_fwd_dealloc; + + usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); + usnic_fwd_set_mac(us_ibdev->ufdev, us_ibdev->netdev->dev_addr); + if (netif_carrier_ok(us_ibdev->netdev)) + usnic_fwd_carrier_up(us_ibdev->ufdev); + + ind = in_dev_get(netdev); + if (ind->ifa_list) + usnic_fwd_add_ipaddr(us_ibdev->ufdev, + ind->ifa_list->ifa_address); + in_dev_put(ind); + + usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr, + us_ibdev->ufdev->inaddr, &gid.raw[0]); + memcpy(&us_ibdev->ib_dev.node_guid, &gid.global.interface_id, + sizeof(gid.global.interface_id)); + kref_init(&us_ibdev->vf_cnt); + + usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n", + us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev), + us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up, + us_ibdev->ufdev->mtu); + return us_ibdev; + +err_fwd_dealloc: + usnic_fwd_dev_free(us_ibdev->ufdev); +err_dealloc: + usnic_err("failed -- deallocing device\n"); + ib_dealloc_device(&us_ibdev->ib_dev); + return NULL; +} + +static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev) +{ + usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name); + usnic_ib_sysfs_unregister_usdev(us_ibdev); + usnic_fwd_dev_free(us_ibdev->ufdev); + ib_unregister_device(&us_ibdev->ib_dev); + ib_dealloc_device(&us_ibdev->ib_dev); +} + +static void usnic_ib_undiscover_pf(struct kref *kref) +{ + struct usnic_ib_dev *us_ibdev, *tmp; + struct pci_dev *dev; + bool found = false; + + dev = container_of(kref, struct usnic_ib_dev, vf_cnt)->pdev; + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry_safe(us_ibdev, tmp, + &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->pdev == dev) { + list_del(&us_ibdev->ib_dev_link); + usnic_ib_device_remove(us_ibdev); + found = true; + break; + } + } + + WARN(!found, "Failed to remove PF %s\n", pci_name(dev)); + + mutex_unlock(&usnic_ib_ibdev_list_lock); +} + +static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic) +{ 
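+	/*
+	 * Look up the ib_device already registered for this VF's parent
+	 * PF and take a reference on it, or create and register one if
+	 * this is the first VF of that PF.  All VFs of a PF share a
+	 * single usnic_ib_dev.
+	 */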
+ struct usnic_ib_dev *us_ibdev; + struct pci_dev *parent_pci, *vf_pci; + int err; + + vf_pci = usnic_vnic_get_pdev(vnic); + parent_pci = pci_physfn(vf_pci); + + BUG_ON(!parent_pci); + + mutex_lock(&usnic_ib_ibdev_list_lock); + list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) { + if (us_ibdev->pdev == parent_pci) { + kref_get(&us_ibdev->vf_cnt); + goto out; + } + } + + us_ibdev = usnic_ib_device_add(parent_pci); + if (IS_ERR_OR_NULL(us_ibdev)) { + us_ibdev = us_ibdev ? us_ibdev : ERR_PTR(-EFAULT); + goto out; + } + + err = usnic_ib_sysfs_register_usdev(us_ibdev); + if (err) { + usnic_ib_device_remove(us_ibdev); + us_ibdev = ERR_PTR(err); + goto out; + } + + list_add(&us_ibdev->ib_dev_link, &usnic_ib_ibdev_list); +out: + mutex_unlock(&usnic_ib_ibdev_list_lock); + return us_ibdev; +} +/* End of PF discovery section */ + +/* Start of PCI section */ + +static const struct pci_device_id usnic_ib_pci_ids[] = { + {PCI_DEVICE(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC)}, + {0,} +}; + +static int usnic_ib_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int err; + struct usnic_ib_dev *pf; + struct usnic_ib_vf *vf; + enum usnic_vnic_res_type res_type; + + vf = kzalloc(sizeof(*vf), GFP_KERNEL); + if (!vf) + return -ENOMEM; + + err = pci_enable_device(pdev); + if (err) { + usnic_err("Failed to enable %s with err %d\n", + pci_name(pdev), err); + goto out_clean_vf; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + usnic_err("Failed to request region for %s with err %d\n", + pci_name(pdev), err); + goto out_disable_device; + } + + pci_set_master(pdev); + pci_set_drvdata(pdev, vf); + + vf->vnic = usnic_vnic_alloc(pdev); + if (IS_ERR_OR_NULL(vf->vnic)) { + err = vf->vnic ? PTR_ERR(vf->vnic) : -ENOMEM; + usnic_err("Failed to alloc vnic for %s with err %d\n", + pci_name(pdev), err); + goto out_release_regions; + } + + pf = usnic_ib_discover_pf(vf->vnic); + if (IS_ERR_OR_NULL(pf)) { + usnic_err("Failed to discover pf of vnic %s with err%ld\n", + pci_name(pdev), PTR_ERR(pf)); + err = pf ? 
PTR_ERR(pf) : -EFAULT; + goto out_clean_vnic; + } + + vf->pf = pf; + spin_lock_init(&vf->lock); + mutex_lock(&pf->usdev_lock); + list_add_tail(&vf->link, &pf->vf_dev_list); + /* + * Save max settings (will be same for each VF, easier to re-write than + * to say "if (!set) { set_values(); set=1; } + */ + for (res_type = USNIC_VNIC_RES_TYPE_EOL+1; + res_type < USNIC_VNIC_RES_TYPE_MAX; + res_type++) { + pf->vf_res_cnt[res_type] = usnic_vnic_res_cnt(vf->vnic, + res_type); + } + + mutex_unlock(&pf->usdev_lock); + + usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev), + pf->ib_dev.name); + usnic_ib_log_vf(vf); + return 0; + +out_clean_vnic: + usnic_vnic_free(vf->vnic); +out_release_regions: + pci_set_drvdata(pdev, NULL); + pci_clear_master(pdev); + pci_release_regions(pdev); +out_disable_device: + pci_disable_device(pdev); +out_clean_vf: + kfree(vf); + return err; +} + +static void usnic_ib_pci_remove(struct pci_dev *pdev) +{ + struct usnic_ib_vf *vf = pci_get_drvdata(pdev); + struct usnic_ib_dev *pf = vf->pf; + + mutex_lock(&pf->usdev_lock); + list_del(&vf->link); + mutex_unlock(&pf->usdev_lock); + + kref_put(&pf->vf_cnt, usnic_ib_undiscover_pf); + usnic_vnic_free(vf->vnic); + pci_set_drvdata(pdev, NULL); + pci_clear_master(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); + kfree(vf); + + usnic_info("Removed VF %s\n", pci_name(pdev)); +} + +/* PCI driver entry points */ +static struct pci_driver usnic_ib_pci_driver = { + .name = DRV_NAME, + .id_table = usnic_ib_pci_ids, + .probe = usnic_ib_pci_probe, + .remove = usnic_ib_pci_remove, +}; +/* End of PCI section */ + +/* Start of module section */ +static int __init usnic_ib_init(void) +{ + int err; + + printk_once(KERN_INFO "%s", usnic_version); + + err = usnic_uiom_init(DRV_NAME); + if (err) { + usnic_err("Unable to initalize umem with err %d\n", err); + return err; + } + + err = pci_register_driver(&usnic_ib_pci_driver); + if (err) { + usnic_err("Unable to register with PCI\n"); + goto out_umem_fini; + } + + err = register_netdevice_notifier(&usnic_ib_netdevice_notifier); + if (err) { + usnic_err("Failed to register netdev notifier\n"); + goto out_pci_unreg; + } + + err = register_inetaddr_notifier(&usnic_ib_inetaddr_notifier); + if (err) { + usnic_err("Failed to register inet addr notifier\n"); + goto out_unreg_netdev_notifier; + } + + err = usnic_transport_init(); + if (err) { + usnic_err("Failed to initialize transport\n"); + goto out_unreg_inetaddr_notifier; + } + + usnic_debugfs_init(); + + return 0; + +out_unreg_inetaddr_notifier: + unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); +out_unreg_netdev_notifier: + unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); +out_pci_unreg: + pci_unregister_driver(&usnic_ib_pci_driver); +out_umem_fini: + usnic_uiom_fini(); + + return err; +} + +static void __exit usnic_ib_destroy(void) +{ + usnic_dbg("\n"); + usnic_debugfs_exit(); + usnic_transport_fini(); + unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier); + unregister_netdevice_notifier(&usnic_ib_netdevice_notifier); + pci_unregister_driver(&usnic_ib_pci_driver); + usnic_uiom_fini(); +} + +MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver"); +MODULE_AUTHOR("Upinder Malhi "); +MODULE_LICENSE("Dual BSD/GPL"); +module_param(usnic_log_lvl, uint, S_IRUGO | S_IWUSR); +module_param(usnic_ib_share_vf, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(usnic_log_lvl, " Off=0, Err=1, Info=2, Debug=3"); +MODULE_PARM_DESC(usnic_ib_share_vf, "Off=0, On=1 VF sharing amongst QPs"); +MODULE_DEVICE_TABLE(pci, 
usnic_ib_pci_ids); + +module_init(usnic_ib_init); +module_exit(usnic_ib_destroy); +/* End of module section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c new file mode 100644 index 0000000..912d8ef --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -0,0 +1,766 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "usnic_log.h" +#include "usnic_vnic.h" +#include "usnic_fwd.h" +#include "usnic_uiom.h" +#include "usnic_debugfs.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_ib_sysfs.h" +#include "usnic_transport.h" + +#define DFLT_RQ_IDX 0 + +const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return "Rst"; + case IB_QPS_INIT: + return "Init"; + case IB_QPS_RTR: + return "RTR"; + case IB_QPS_RTS: + return "RTS"; + case IB_QPS_SQD: + return "SQD"; + case IB_QPS_SQE: + return "SQE"; + case IB_QPS_ERR: + return "ERR"; + default: + return "UNKNOWN STATE"; + + } +} + +int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz) +{ + return scnprintf(buf, buf_sz, "|QPN\t|State\t|PID\t|VF Idx\t|Fil ID"); +} + +int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz) +{ + struct usnic_ib_qp_grp *qp_grp = obj; + struct usnic_ib_qp_grp_flow *default_flow; + if (obj) { + default_flow = list_first_entry(&qp_grp->flows_lst, + struct usnic_ib_qp_grp_flow, link); + return scnprintf(buf, buf_sz, "|%d\t|%s\t|%d\t|%hu\t|%d", + qp_grp->ibqp.qp_num, + usnic_ib_qp_grp_state_to_string( + qp_grp->state), + qp_grp->owner_pid, + usnic_vnic_get_index(qp_grp->vf->vnic), + default_flow->flow->flow_id); + } else { + return scnprintf(buf, buf_sz, "|N/A\t|N/A\t|N/A\t|N/A\t|N/A"); + } +} + +static struct usnic_vnic_res_chunk * +get_qp_res_chunk(struct usnic_ib_qp_grp *qp_grp) +{ + lockdep_assert_held(&qp_grp->lock); + /* + * The QP res chunk, used to derive qp indices, + * are just indices of the RQs + */ + return usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); +} + +static int enable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + + int status; + int i, vnic_idx; + struct usnic_vnic_res_chunk *res_chunk; 
+ struct usnic_vnic_res *res; + + lockdep_assert_held(&qp_grp->lock); + + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + + res_chunk = get_qp_res_chunk(qp_grp); + if (IS_ERR(res_chunk)) { + usnic_err("Unable to get qp res with err %ld\n", + PTR_ERR(res_chunk)); + return PTR_ERR(res_chunk); + } + + for (i = 0; i < res_chunk->cnt; i++) { + res = res_chunk->res[i]; + status = usnic_fwd_enable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + if (status) { + usnic_err("Failed to enable qp %d of %s:%d\n with err %d\n", + res->vnic_idx, qp_grp->ufdev->name, + vnic_idx, status); + goto out_err; + } + } + + return 0; + +out_err: + for (i--; i >= 0; i--) { + res = res_chunk->res[i]; + usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + } + + return status; +} + +static int disable_qp_grp(struct usnic_ib_qp_grp *qp_grp) +{ + int i, vnic_idx; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *res; + int status = 0; + + lockdep_assert_held(&qp_grp->lock); + vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + + res_chunk = get_qp_res_chunk(qp_grp); + if (IS_ERR(res_chunk)) { + usnic_err("Unable to get qp res with err %ld\n", + PTR_ERR(res_chunk)); + return PTR_ERR(res_chunk); + } + + for (i = 0; i < res_chunk->cnt; i++) { + res = res_chunk->res[i]; + status = usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx, + res->vnic_idx); + if (status) { + usnic_err("Failed to disable rq %d of %s:%d\n with err %d\n", + res->vnic_idx, + qp_grp->ufdev->name, + vnic_idx, status); + } + } + + return status; + +} + +static int init_filter_action(struct usnic_ib_qp_grp *qp_grp, + struct usnic_filter_action *uaction) +{ + struct usnic_vnic_res_chunk *res_chunk; + + res_chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); + if (IS_ERR(res_chunk)) { + usnic_err("Unable to get %s with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), + PTR_ERR(res_chunk)); + return PTR_ERR(res_chunk); + } + + uaction->vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic); + uaction->action.type = FILTER_ACTION_RQ_STEERING; + uaction->action.u.rq_idx = res_chunk->res[DFLT_RQ_IDX]->vnic_idx; + + return 0; +} + +static struct usnic_ib_qp_grp_flow* +create_roce_custom_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + uint16_t port_num; + int err; + struct filter filter; + struct usnic_filter_action uaction; + struct usnic_ib_qp_grp_flow *qp_flow; + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + + trans_type = trans_spec->trans_type; + port_num = trans_spec->usnic_roce.port_num; + + /* Reserve Port */ + port_num = usnic_transport_rsrv_port(trans_type, port_num); + if (port_num == 0) + return ERR_PTR(-EINVAL); + + /* Create Flow */ + usnic_fwd_init_usnic_filter(&filter, port_num); + err = init_filter_action(qp_grp, &uaction); + if (err) + goto out_unreserve_port; + + flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); + if (IS_ERR_OR_NULL(flow)) { + err = flow ? 
PTR_ERR(flow) : -EFAULT; + goto out_unreserve_port; + } + + /* Create Flow Handle */ + qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); + if (!qp_flow) { + err = -ENOMEM; + goto out_dealloc_flow; + } + qp_flow->flow = flow; + qp_flow->trans_type = trans_type; + qp_flow->usnic_roce.port_num = port_num; + qp_flow->qp_grp = qp_grp; + return qp_flow; + +out_dealloc_flow: + usnic_fwd_dealloc_flow(flow); +out_unreserve_port: + usnic_transport_unrsrv_port(trans_type, port_num); + return ERR_PTR(err); +} + +static void release_roce_custom_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_fwd_dealloc_flow(qp_flow->flow); + usnic_transport_unrsrv_port(qp_flow->trans_type, + qp_flow->usnic_roce.port_num); + kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_udp_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + struct socket *sock; + int sock_fd; + int err; + struct filter filter; + struct usnic_filter_action uaction; + struct usnic_ib_qp_grp_flow *qp_flow; + struct usnic_fwd_flow *flow; + enum usnic_transport_type trans_type; + uint32_t addr; + uint16_t port_num; + int proto; + + trans_type = trans_spec->trans_type; + sock_fd = trans_spec->udp.sock_fd; + + /* Get and check socket */ + sock = usnic_transport_get_socket(sock_fd); + if (IS_ERR_OR_NULL(sock)) + return ERR_CAST(sock); + + err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port_num); + if (err) + goto out_put_sock; + + if (proto != IPPROTO_UDP) { + usnic_err("Protocol for fd %d is not UDP", sock_fd); + err = -EPERM; + goto out_put_sock; + } + + /* Create flow */ + usnic_fwd_init_udp_filter(&filter, addr, port_num); + err = init_filter_action(qp_grp, &uaction); + if (err) + goto out_put_sock; + + flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction); + if (IS_ERR_OR_NULL(flow)) { + err = flow ? 
PTR_ERR(flow) : -EFAULT; + goto out_put_sock; + } + + /* Create qp_flow */ + qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC); + if (!qp_flow) { + err = -ENOMEM; + goto out_dealloc_flow; + } + qp_flow->flow = flow; + qp_flow->trans_type = trans_type; + qp_flow->udp.sock = sock; + qp_flow->qp_grp = qp_grp; + return qp_flow; + +out_dealloc_flow: + usnic_fwd_dealloc_flow(flow); +out_put_sock: + usnic_transport_put_socket(sock); + return ERR_PTR(err); +} + +static void release_udp_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_fwd_dealloc_flow(qp_flow->flow); + usnic_transport_put_socket(qp_flow->udp.sock); + kfree(qp_flow); +} + +static struct usnic_ib_qp_grp_flow* +create_and_add_flow(struct usnic_ib_qp_grp *qp_grp, + struct usnic_transport_spec *trans_spec) +{ + struct usnic_ib_qp_grp_flow *qp_flow; + enum usnic_transport_type trans_type; + + trans_type = trans_spec->trans_type; + switch (trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + qp_flow = create_roce_custom_flow(qp_grp, trans_spec); + break; + case USNIC_TRANSPORT_IPV4_UDP: + qp_flow = create_udp_flow(qp_grp, trans_spec); + break; + default: + usnic_err("Unsupported transport %u\n", + trans_spec->trans_type); + return ERR_PTR(-EINVAL); + } + + if (!IS_ERR_OR_NULL(qp_flow)) { + list_add_tail(&qp_flow->link, &qp_grp->flows_lst); + usnic_debugfs_flow_add(qp_flow); + } + + + return qp_flow; +} + +static void release_and_remove_flow(struct usnic_ib_qp_grp_flow *qp_flow) +{ + usnic_debugfs_flow_remove(qp_flow); + list_del(&qp_flow->link); + + switch (qp_flow->trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + release_roce_custom_flow(qp_flow); + break; + case USNIC_TRANSPORT_IPV4_UDP: + release_udp_flow(qp_flow); + break; + default: + WARN(1, "Unsupported transport %u\n", + qp_flow->trans_type); + break; + } +} + +static void release_and_remove_all_flows(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_qp_grp_flow *qp_flow, *tmp; + list_for_each_entry_safe(qp_flow, tmp, &qp_grp->flows_lst, link) + release_and_remove_flow(qp_flow); +} + +int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, + enum ib_qp_state new_state, + void *data) +{ + int status = 0; + struct ib_event ib_event; + enum ib_qp_state old_state; + struct usnic_transport_spec *trans_spec; + struct usnic_ib_qp_grp_flow *qp_flow; + + old_state = qp_grp->state; + trans_spec = (struct usnic_transport_spec *) data; + + spin_lock(&qp_grp->lock); + switch (new_state) { + case IB_QPS_RESET: + switch (old_state) { + case IB_QPS_RESET: + /* NO-OP */ + break; + case IB_QPS_INIT: + release_and_remove_all_flows(qp_grp); + status = 0; + break; + case IB_QPS_RTR: + case IB_QPS_RTS: + case IB_QPS_ERR: + status = disable_qp_grp(qp_grp); + release_and_remove_all_flows(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_INIT: + switch (old_state) { + case IB_QPS_RESET: + if (trans_spec) { + qp_flow = create_and_add_flow(qp_grp, + trans_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + break; + } + } else { + /* + * Optional to specify filters. + */ + status = 0; + } + break; + case IB_QPS_INIT: + if (trans_spec) { + qp_flow = create_and_add_flow(qp_grp, + trans_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + break; + } + } else { + /* + * Doesn't make sense to go into INIT state + * from INIT state w/o adding filters. 
+ */ + status = -EINVAL; + } + break; + case IB_QPS_RTR: + status = disable_qp_grp(qp_grp); + break; + case IB_QPS_RTS: + status = disable_qp_grp(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_RTR: + switch (old_state) { + case IB_QPS_INIT: + status = enable_qp_grp(qp_grp); + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_RTS: + switch (old_state) { + case IB_QPS_RTR: + /* NO-OP FOR NOW */ + break; + default: + status = -EINVAL; + } + break; + case IB_QPS_ERR: + ib_event.device = &qp_grp->vf->pf->ib_dev; + ib_event.element.qp = &qp_grp->ibqp; + ib_event.event = IB_EVENT_QP_FATAL; + + switch (old_state) { + case IB_QPS_RESET: + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + case IB_QPS_INIT: + release_and_remove_all_flows(qp_grp); + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + case IB_QPS_RTR: + case IB_QPS_RTS: + status = disable_qp_grp(qp_grp); + release_and_remove_all_flows(qp_grp); + qp_grp->ibqp.event_handler(&ib_event, + qp_grp->ibqp.qp_context); + break; + default: + status = -EINVAL; + } + break; + default: + status = -EINVAL; + } + spin_unlock(&qp_grp->lock); + + if (!status) { + qp_grp->state = new_state; + usnic_info("Transitioned %u from %s to %s", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string(old_state), + usnic_ib_qp_grp_state_to_string(new_state)); + } else { + usnic_err("Failed to transition %u from %s to %s", + qp_grp->grp_id, + usnic_ib_qp_grp_state_to_string(old_state), + usnic_ib_qp_grp_state_to_string(new_state)); + } + + return status; +} + +static struct usnic_vnic_res_chunk** +alloc_res_chunk_list(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec, void *owner_obj) +{ + enum usnic_vnic_res_type res_type; + struct usnic_vnic_res_chunk **res_chunk_list; + int err, i, res_cnt, res_lst_sz; + + for (res_lst_sz = 0; + res_spec->resources[res_lst_sz].type != USNIC_VNIC_RES_TYPE_EOL; + res_lst_sz++) { + /* Do Nothing */ + } + + res_chunk_list = kzalloc(sizeof(*res_chunk_list)*(res_lst_sz+1), + GFP_ATOMIC); + if (!res_chunk_list) + return ERR_PTR(-ENOMEM); + + for (i = 0; res_spec->resources[i].type != USNIC_VNIC_RES_TYPE_EOL; + i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + + res_chunk_list[i] = usnic_vnic_get_resources(vnic, res_type, + res_cnt, owner_obj); + if (IS_ERR_OR_NULL(res_chunk_list[i])) { + err = res_chunk_list[i] ? 
+ PTR_ERR(res_chunk_list[i]) : -ENOMEM; + usnic_err("Failed to get %s from %s with err %d\n", + usnic_vnic_res_type_to_str(res_type), + usnic_vnic_pci_name(vnic), + err); + goto out_free_res; + } + } + + return res_chunk_list; + +out_free_res: + for (i--; i >= 0; i--) + usnic_vnic_put_resources(res_chunk_list[i]); + kfree(res_chunk_list); + return ERR_PTR(err); +} + +static void free_qp_grp_res(struct usnic_vnic_res_chunk **res_chunk_list) +{ + int i; + for (i = 0; res_chunk_list[i]; i++) + usnic_vnic_put_resources(res_chunk_list[i]); + kfree(res_chunk_list); +} + +static int qp_grp_and_vf_bind(struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_ib_qp_grp *qp_grp) +{ + int err; + struct pci_dev *pdev; + + lockdep_assert_held(&vf->lock); + + pdev = usnic_vnic_get_pdev(vf->vnic); + if (vf->qp_grp_ref_cnt == 0) { + err = usnic_uiom_attach_dev_to_pd(pd->umem_pd, &pdev->dev); + if (err) { + usnic_err("Failed to attach %s to domain\n", + pci_name(pdev)); + return err; + } + vf->pd = pd; + } + vf->qp_grp_ref_cnt++; + + WARN_ON(vf->pd != pd); + qp_grp->vf = vf; + + return 0; +} + +static void qp_grp_and_vf_unbind(struct usnic_ib_qp_grp *qp_grp) +{ + struct pci_dev *pdev; + struct usnic_ib_pd *pd; + + lockdep_assert_held(&qp_grp->vf->lock); + + pd = qp_grp->vf->pd; + pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); + if (--qp_grp->vf->qp_grp_ref_cnt == 0) { + qp_grp->vf->pd = NULL; + usnic_uiom_detach_dev_from_pd(pd->umem_pd, &pdev->dev); + } + qp_grp->vf = NULL; +} + +static void log_spec(struct usnic_vnic_res_spec *res_spec) +{ + char buf[512]; + usnic_vnic_spec_dump(buf, sizeof(buf), res_spec); + usnic_dbg("%s\n", buf); +} + +static int qp_grp_id_from_flow(struct usnic_ib_qp_grp_flow *qp_flow, + uint32_t *id) +{ + enum usnic_transport_type trans_type = qp_flow->trans_type; + int err; + uint16_t port_num = 0; + + switch (trans_type) { + case USNIC_TRANSPORT_ROCE_CUSTOM: + *id = qp_flow->usnic_roce.port_num; + break; + case USNIC_TRANSPORT_IPV4_UDP: + err = usnic_transport_sock_get_addr(qp_flow->udp.sock, + NULL, NULL, + &port_num); + if (err) + return err; + /* + * Copy port_num to stack first and then to *id, + * so that the short to int cast works for little + * and big endian systems. + */ + *id = port_num; + break; + default: + usnic_err("Unsupported transport %u\n", trans_type); + return -EINVAL; + } + + return 0; +} + +struct usnic_ib_qp_grp * +usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf, + struct usnic_ib_pd *pd, + struct usnic_vnic_res_spec *res_spec, + struct usnic_transport_spec *transport_spec) +{ + struct usnic_ib_qp_grp *qp_grp; + int err; + enum usnic_transport_type transport = transport_spec->trans_type; + struct usnic_ib_qp_grp_flow *qp_flow; + + lockdep_assert_held(&vf->lock); + + err = usnic_vnic_res_spec_satisfied(&min_transport_spec[transport], + res_spec); + if (err) { + usnic_err("Spec does not meet miniumum req for transport %d\n", + transport); + log_spec(res_spec); + return ERR_PTR(err); + } + + qp_grp = kzalloc(sizeof(*qp_grp), GFP_ATOMIC); + if (!qp_grp) + return NULL; + + qp_grp->res_chunk_list = alloc_res_chunk_list(vf->vnic, res_spec, + qp_grp); + if (IS_ERR_OR_NULL(qp_grp->res_chunk_list)) { + err = qp_grp->res_chunk_list ? 
+ PTR_ERR(qp_grp->res_chunk_list) : -ENOMEM; + goto out_free_qp_grp; + } + + err = qp_grp_and_vf_bind(vf, pd, qp_grp); + if (err) + goto out_free_res; + + INIT_LIST_HEAD(&qp_grp->flows_lst); + spin_lock_init(&qp_grp->lock); + qp_grp->ufdev = ufdev; + qp_grp->state = IB_QPS_RESET; + qp_grp->owner_pid = current->pid; + + qp_flow = create_and_add_flow(qp_grp, transport_spec); + if (IS_ERR_OR_NULL(qp_flow)) { + usnic_err("Unable to create and add flow with err %ld\n", + PTR_ERR(qp_flow)); + err = qp_flow ? PTR_ERR(qp_flow) : -EFAULT; + goto out_qp_grp_vf_unbind; + } + + err = qp_grp_id_from_flow(qp_flow, &qp_grp->grp_id); + if (err) + goto out_release_flow; + qp_grp->ibqp.qp_num = qp_grp->grp_id; + + usnic_ib_sysfs_qpn_add(qp_grp); + + return qp_grp; + +out_release_flow: + release_and_remove_flow(qp_flow); +out_qp_grp_vf_unbind: + qp_grp_and_vf_unbind(qp_grp); +out_free_res: + free_qp_grp_res(qp_grp->res_chunk_list); +out_free_qp_grp: + kfree(qp_grp); + + return ERR_PTR(err); +} + +void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + + WARN_ON(qp_grp->state != IB_QPS_RESET); + lockdep_assert_held(&qp_grp->vf->lock); + + release_and_remove_all_flows(qp_grp); + usnic_ib_sysfs_qpn_remove(qp_grp); + qp_grp_and_vf_unbind(qp_grp); + free_qp_grp_res(qp_grp->res_chunk_list); + kfree(qp_grp); +} + +struct usnic_vnic_res_chunk* +usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp, + enum usnic_vnic_res_type res_type) +{ + int i; + + for (i = 0; qp_grp->res_chunk_list[i]; i++) { + if (qp_grp->res_chunk_list[i]->type == res_type) + return qp_grp->res_chunk_list[i]; + } + + return ERR_PTR(-EINVAL); +} diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h new file mode 100644 index 0000000..a8a2314 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ */
+
+#ifndef USNIC_IB_QP_GRP_H_
+#define USNIC_IB_QP_GRP_H_
+
+#include <linux/debugfs.h>
+#include <rdma/ib_verbs.h>
+
+#include "usnic_ib.h"
+#include "usnic_abi.h"
+#include "usnic_fwd.h"
+#include "usnic_vnic.h"
+
+/*
+ * The qp group struct represents all the hw resources needed to present an ib_qp
+ */
+struct usnic_ib_qp_grp {
+	struct ib_qp ibqp;
+	enum ib_qp_state state;
+	int grp_id;
+
+	struct usnic_fwd_dev *ufdev;
+	struct usnic_ib_ucontext *ctx;
+	struct list_head flows_lst;
+
+	struct usnic_vnic_res_chunk **res_chunk_list;
+
+	pid_t owner_pid;
+	struct usnic_ib_vf *vf;
+	struct list_head link;
+
+	spinlock_t lock;
+
+	struct kobject kobj;
+};
+
+struct usnic_ib_qp_grp_flow {
+	struct usnic_fwd_flow *flow;
+	enum usnic_transport_type trans_type;
+	union {
+		struct {
+			uint16_t port_num;
+		} usnic_roce;
+		struct {
+			struct socket *sock;
+		} udp;
+	};
+	struct usnic_ib_qp_grp *qp_grp;
+	struct list_head link;
+
+	/* Debug FS */
+	struct dentry *dbgfs_dentry;
+	char dentry_name[32];
+};
+
+extern const struct usnic_vnic_res_spec min_transport_spec[USNIC_TRANSPORT_MAX];
+
+const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state);
+int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz);
+int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz);
+struct usnic_ib_qp_grp *
+usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf,
+			struct usnic_ib_pd *pd,
+			struct usnic_vnic_res_spec *res_spec,
+			struct usnic_transport_spec *trans_spec);
+void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp);
+int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp,
+				enum ib_qp_state new_state,
+				void *data);
+struct usnic_vnic_res_chunk
+*usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp,
+				enum usnic_vnic_res_type type);
+static inline
+struct usnic_ib_qp_grp *to_uqp_grp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct usnic_ib_qp_grp, ibqp);
+}
+#endif /* USNIC_IB_QP_GRP_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
new file mode 100644
index 0000000..4210ca1
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include "usnic_common_util.h"
+#include "usnic_ib.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_vnic.h"
+#include "usnic_ib_verbs.h"
+#include "usnic_ib_sysfs.h"
+#include "usnic_log.h"
+
+static ssize_t usnic_ib_show_board(struct device *device,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct usnic_ib_dev *us_ibdev =
+		container_of(device, struct usnic_ib_dev, ib_dev.dev);
+	unsigned short subsystem_device_id;
+
+	mutex_lock(&us_ibdev->usdev_lock);
+	subsystem_device_id = us_ibdev->pdev->subsystem_device;
+	mutex_unlock(&us_ibdev->usdev_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id);
+}
+
+/*
+ * Report the configuration for this PF
+ */
+static ssize_t
+usnic_ib_show_config(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct usnic_ib_dev *us_ibdev;
+	char *ptr;
+	unsigned left;
+	unsigned n;
+	enum usnic_vnic_res_type res_type;
+
+	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+
+	/* Buffer space limit is 1 page */
+	ptr = buf;
+	left = PAGE_SIZE;
+
+	mutex_lock(&us_ibdev->usdev_lock);
+	if (kref_read(&us_ibdev->vf_cnt) > 0) {
+		char *busname;
+
+		/*
+		 * bus name seems to come with annoying prefix.
+		 * Remove it if it is predictable
+		 */
+		busname = us_ibdev->pdev->bus->name;
+		if (strncmp(busname, "PCI Bus ", 8) == 0)
+			busname += 8;
+
+		n = scnprintf(ptr, left,
+			"%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:",
+			us_ibdev->ib_dev.name,
+			busname,
+			PCI_SLOT(us_ibdev->pdev->devfn),
+			PCI_FUNC(us_ibdev->pdev->devfn),
+			netdev_name(us_ibdev->netdev),
+			us_ibdev->ufdev->mac,
+			kref_read(&us_ibdev->vf_cnt));
+		UPDATE_PTR_LEFT(n, ptr, left);
+
+		for (res_type = USNIC_VNIC_RES_TYPE_EOL;
+				res_type < USNIC_VNIC_RES_TYPE_MAX;
+				res_type++) {
+			if (us_ibdev->vf_res_cnt[res_type] == 0)
+				continue;
+			n = scnprintf(ptr, left, " %d %s%s",
+				us_ibdev->vf_res_cnt[res_type],
+				usnic_vnic_res_type_to_str(res_type),
+				(res_type < (USNIC_VNIC_RES_TYPE_MAX - 1)) ?
+ "," : ""); + UPDATE_PTR_LEFT(n, ptr, left); + } + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + } else { + n = scnprintf(ptr, left, "%s: no VFs\n", + us_ibdev->ib_dev.name); + UPDATE_PTR_LEFT(n, ptr, left); + } + mutex_unlock(&us_ibdev->usdev_lock); + + return ptr - buf; +} + +static ssize_t +usnic_ib_show_iface(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", + netdev_name(us_ibdev->netdev)); +} + +static ssize_t +usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + kref_read(&us_ibdev->vf_cnt)); +} + +static ssize_t +usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + int qp_per_vf; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + + return scnprintf(buf, PAGE_SIZE, + "%d\n", qp_per_vf); +} + +static ssize_t +usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); +} + +static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL); +static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL); +static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL); +static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL); +static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL); +static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL); + +static struct device_attribute *usnic_class_attributes[] = { + &dev_attr_board_id, + &dev_attr_config, + &dev_attr_iface, + &dev_attr_max_vf, + &dev_attr_qp_per_vf, + &dev_attr_cq_per_vf, +}; + +struct qpn_attribute { + struct attribute attr; + ssize_t (*show)(struct usnic_ib_qp_grp *, char *buf); +}; + +/* + * Definitions for supporting QPN entries in sysfs + */ +static ssize_t +usnic_ib_qpn_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct usnic_ib_qp_grp *qp_grp; + struct qpn_attribute *qpn_attr; + + qp_grp = container_of(kobj, struct usnic_ib_qp_grp, kobj); + qpn_attr = container_of(attr, struct qpn_attribute, attr); + + return qpn_attr->show(qp_grp, buf); +} + +static const struct sysfs_ops usnic_ib_qpn_sysfs_ops = { + .show = usnic_ib_qpn_attr_show +}; + +#define QPN_ATTR_RO(NAME) \ +struct qpn_attribute qpn_attr_##NAME = __ATTR_RO(NAME) + +static ssize_t context_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "0x%p\n", qp_grp->ctx); +} + +static ssize_t summary_show(struct usnic_ib_qp_grp *qp_grp, char *buf) +{ + int i, j, n; + int left; + char *ptr; + struct usnic_vnic_res_chunk *res_chunk; + struct usnic_vnic_res *vnic_res; + + left = PAGE_SIZE; + ptr = buf; + + n = scnprintf(ptr, left, + "QPN: %d State: (%s) PID: %u VF Idx: %hu ", + qp_grp->ibqp.qp_num, + usnic_ib_qp_grp_state_to_string(qp_grp->state), + qp_grp->owner_pid, + usnic_vnic_get_index(qp_grp->vf->vnic)); + UPDATE_PTR_LEFT(n, ptr, left); + + for (i = 0; 
qp_grp->res_chunk_list[i]; i++) { + res_chunk = qp_grp->res_chunk_list[i]; + for (j = 0; j < res_chunk->cnt; j++) { + vnic_res = res_chunk->res[j]; + n = scnprintf(ptr, left, "%s[%d] ", + usnic_vnic_res_type_to_str(vnic_res->type), + vnic_res->vnic_idx); + UPDATE_PTR_LEFT(n, ptr, left); + } + } + + n = scnprintf(ptr, left, "\n"); + UPDATE_PTR_LEFT(n, ptr, left); + + return ptr - buf; +} + +static QPN_ATTR_RO(context); +static QPN_ATTR_RO(summary); + +static struct attribute *usnic_ib_qpn_default_attrs[] = { + &qpn_attr_context.attr, + &qpn_attr_summary.attr, + NULL +}; + +static struct kobj_type usnic_ib_qpn_type = { + .sysfs_ops = &usnic_ib_qpn_sysfs_ops, + .default_attrs = usnic_ib_qpn_default_attrs +}; + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) +{ + int i; + int err; + for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { + err = device_create_file(&us_ibdev->ib_dev.dev, + usnic_class_attributes[i]); + if (err) { + usnic_err("Failed to create device file %d for %s eith err %d", + i, us_ibdev->ib_dev.name, err); + return -EINVAL; + } + } + + /* create kernel object for looking at individual QPs */ + kobject_get(&us_ibdev->ib_dev.dev.kobj); + us_ibdev->qpn_kobj = kobject_create_and_add("qpn", + &us_ibdev->ib_dev.dev.kobj); + if (us_ibdev->qpn_kobj == NULL) { + kobject_put(&us_ibdev->ib_dev.dev.kobj); + return -ENOMEM; + } + + return 0; +} + +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev) +{ + int i; + for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { + device_remove_file(&us_ibdev->ib_dev.dev, + usnic_class_attributes[i]); + } + + kobject_put(us_ibdev->qpn_kobj); +} + +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_dev *us_ibdev; + int err; + + us_ibdev = qp_grp->vf->pf; + + err = kobject_init_and_add(&qp_grp->kobj, &usnic_ib_qpn_type, + kobject_get(us_ibdev->qpn_kobj), + "%d", qp_grp->grp_id); + if (err) { + kobject_put(us_ibdev->qpn_kobj); + return; + } +} + +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_dev *us_ibdev; + + us_ibdev = qp_grp->vf->pf; + + kobject_put(&qp_grp->kobj); + kobject_put(us_ibdev->qpn_kobj); +} diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h new file mode 100644 index 0000000..3d98e16 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_IB_SYSFS_H_ +#define USNIC_IB_SYSFS_H_ + +#include "usnic_ib.h" + +int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev); +void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp); +void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp); + +#endif /* !USNIC_IB_SYSFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c new file mode 100644 index 0000000..a688a56 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -0,0 +1,810 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include +#include + +#include +#include + +#include "usnic_abi.h" +#include "usnic_ib.h" +#include "usnic_common_util.h" +#include "usnic_ib_qp_grp.h" +#include "usnic_ib_verbs.h" +#include "usnic_fwd.h" +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_transport.h" + +#define USNIC_DEFAULT_TRANSPORT USNIC_TRANSPORT_ROCE_CUSTOM + +const struct usnic_vnic_res_spec min_transport_spec[USNIC_TRANSPORT_MAX] = { + { /*USNIC_TRANSPORT_UNKNOWN*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, + { /*USNIC_TRANSPORT_ROCE_CUSTOM*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, + { /*USNIC_TRANSPORT_IPV4_UDP*/ + .resources = { + {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,}, + {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,}, + }, + }, +}; + +static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver) +{ + *fw_ver = *((u64 *)fw_ver_str); +} + +static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp, + struct ib_udata *udata) +{ + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_create_qp_resp resp; + struct pci_dev *pdev; + struct vnic_dev_bar *bar; + struct usnic_vnic_res_chunk *chunk; + struct usnic_ib_qp_grp_flow *default_flow; + int i, err; + + memset(&resp, 0, sizeof(resp)); + + us_ibdev = qp_grp->vf->pf; + pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic); + if (!pdev) { + usnic_err("Failed to get pdev of qp_grp %d\n", + qp_grp->grp_id); + return -EFAULT; + } + + bar = usnic_vnic_get_bar(qp_grp->vf->vnic, 0); + if (!bar) { + usnic_err("Failed to get bar0 of qp_grp %d vf %s", + qp_grp->grp_id, pci_name(pdev)); + return -EFAULT; + } + + resp.vfid = usnic_vnic_get_index(qp_grp->vf->vnic); + resp.bar_bus_addr = bar->bus_addr; + resp.bar_len = bar->len; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ); + if (IS_ERR(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return PTR_ERR(chunk); + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_RQ); + resp.rq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.rq_idx[i] = chunk->res[i]->vnic_idx; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_WQ); + if (IS_ERR(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_WQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return PTR_ERR(chunk); + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_WQ); + resp.wq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.wq_idx[i] = chunk->res[i]->vnic_idx; + + chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_CQ); + if (IS_ERR(chunk)) { + usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n", + usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_CQ), + qp_grp->grp_id, + PTR_ERR(chunk)); + return PTR_ERR(chunk); + } + + WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_CQ); + resp.cq_cnt = chunk->cnt; + for (i = 0; i < chunk->cnt; i++) + resp.cq_idx[i] = chunk->res[i]->vnic_idx; + + default_flow = list_first_entry(&qp_grp->flows_lst, + struct usnic_ib_qp_grp_flow, link); + resp.transport = default_flow->trans_type; + + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { + 
usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name); + return err; + } + + return 0; +} + +static struct usnic_ib_qp_grp* +find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, + struct usnic_ib_pd *pd, + struct usnic_transport_spec *trans_spec, + struct usnic_vnic_res_spec *res_spec) +{ + struct usnic_ib_vf *vf; + struct usnic_vnic *vnic; + struct usnic_ib_qp_grp *qp_grp; + struct device *dev, **dev_list; + int i; + + BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock)); + + if (list_empty(&us_ibdev->vf_dev_list)) { + usnic_info("No vfs to allocate\n"); + return NULL; + } + + if (usnic_ib_share_vf) { + /* Try to find resouces on a used vf which is in pd */ + dev_list = usnic_uiom_get_dev_list(pd->umem_pd); + if (IS_ERR(dev_list)) + return ERR_CAST(dev_list); + for (i = 0; dev_list[i]; i++) { + dev = dev_list[i]; + vf = pci_get_drvdata(to_pci_dev(dev)); + spin_lock(&vf->lock); + vnic = vf->vnic; + if (!usnic_vnic_check_room(vnic, res_spec)) { + usnic_dbg("Found used vnic %s from %s\n", + us_ibdev->ib_dev.name, + pci_name(usnic_vnic_get_pdev( + vnic))); + qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, + vf, pd, + res_spec, + trans_spec); + + spin_unlock(&vf->lock); + goto qp_grp_check; + } + spin_unlock(&vf->lock); + + } + usnic_uiom_free_dev_list(dev_list); + } + + /* Try to find resources on an unused vf */ + list_for_each_entry(vf, &us_ibdev->vf_dev_list, link) { + spin_lock(&vf->lock); + vnic = vf->vnic; + if (vf->qp_grp_ref_cnt == 0 && + usnic_vnic_check_room(vnic, res_spec) == 0) { + qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, vf, + pd, res_spec, + trans_spec); + + spin_unlock(&vf->lock); + goto qp_grp_check; + } + spin_unlock(&vf->lock); + } + + usnic_info("No free qp grp found on %s\n", us_ibdev->ib_dev.name); + return ERR_PTR(-ENOMEM); + +qp_grp_check: + if (IS_ERR_OR_NULL(qp_grp)) { + usnic_err("Failed to allocate qp_grp\n"); + return ERR_PTR(qp_grp ? 
PTR_ERR(qp_grp) : -ENOMEM); + } + return qp_grp; +} + +static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) +{ + struct usnic_ib_vf *vf = qp_grp->vf; + + WARN_ON(qp_grp->state != IB_QPS_RESET); + + spin_lock(&vf->lock); + usnic_ib_qp_grp_destroy(qp_grp); + spin_unlock(&vf->lock); +} + +static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd) +{ + if (cmd.spec.trans_type <= USNIC_TRANSPORT_UNKNOWN || + cmd.spec.trans_type >= USNIC_TRANSPORT_MAX) + return -EINVAL; + + return 0; +} + +/* Start of ib callback functions */ + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +int usnic_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + union ib_gid gid; + struct ethtool_drvinfo info; + int qp_per_vf; + + usnic_dbg("\n"); + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + mutex_lock(&us_ibdev->usdev_lock); + us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); + memset(props, 0, sizeof(*props)); + usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, + &gid.raw[0]); + memcpy(&props->sys_image_guid, &gid.global.interface_id, + sizeof(gid.global.interface_id)); + usnic_ib_fw_string_to_u64(&info.fw_version[0], &props->fw_ver); + props->max_mr_size = USNIC_UIOM_MAX_MR_SIZE; + props->page_size_cap = USNIC_UIOM_PAGE_SIZE; + props->vendor_id = PCI_VENDOR_ID_CISCO; + props->vendor_part_id = PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC; + props->hw_ver = us_ibdev->pdev->subsystem_device; + qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], + us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); + props->max_qp = qp_per_vf * + kref_read(&us_ibdev->vf_cnt); + props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] * + kref_read(&us_ibdev->vf_cnt); + props->max_pd = USNIC_UIOM_MAX_PD_CNT; + props->max_mr = USNIC_UIOM_MAX_MR_CNT; + props->local_ca_ack_delay = 0; + props->max_pkeys = 0; + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + props->max_qp_rd_atom = 0; + props->max_qp_init_rd_atom = 0; + props->max_res_rd_atom = 0; + props->max_srq = 0; + props->max_srq_wr = 0; + props->max_srq_sge = 0; + props->max_fast_reg_page_list_len = 0; + props->max_mcast_grp = 0; + props->max_mcast_qp_attach = 0; + props->max_total_mcast_qp_attach = 0; + props->max_map_per_fmr = 0; + /* Owned by Userspace + * max_qp_wr, max_sge, max_sge_rd, max_cqe */ + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + + usnic_dbg("\n"); + + mutex_lock(&us_ibdev->usdev_lock); + if (ib_get_eth_speed(ibdev, port, &props->active_speed, + &props->active_width)) { + mutex_unlock(&us_ibdev->usdev_lock); + return -EINVAL; + } + + /* props being zeroed by the caller, avoid zeroing it here */ + + props->lid = 0; + props->lmc = 1; + props->sm_lid = 0; + props->sm_sl = 0; + + if (!us_ibdev->ufdev->link_up) { + props->state = IB_PORT_DOWN; + props->phys_state = 3; + } else if (!us_ibdev->ufdev->inaddr) { + props->state = IB_PORT_INIT; + props->phys_state = 4; + } else { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + props->port_cap_flags = 0; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + 
props->bad_pkey_cntr = 0; + props->qkey_viol_cntr = 0; + props->max_mtu = IB_MTU_4096; + props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu); + /* Userspace will adjust for hdrs */ + props->max_msg_sz = us_ibdev->ufdev->mtu; + props->max_vl_num = 1; + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + int err; + + usnic_dbg("\n"); + + memset(qp_attr, 0, sizeof(*qp_attr)); + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + + qp_grp = to_uqp_grp(qp); + vf = qp_grp->vf; + mutex_lock(&vf->pf->usdev_lock); + usnic_dbg("\n"); + qp_attr->qp_state = qp_grp->state; + qp_attr->cur_qp_state = qp_grp->state; + + switch (qp_grp->ibqp.qp_type) { + case IB_QPT_UD: + qp_attr->qkey = 0; + break; + default: + usnic_err("Unexpected qp_type %d\n", qp_grp->ibqp.qp_type); + err = -EINVAL; + goto err_out; + } + + mutex_unlock(&vf->pf->usdev_lock); + return 0; + +err_out: + mutex_unlock(&vf->pf->usdev_lock); + return err; +} + +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + usnic_dbg("\n"); + + if (index > 1) + return -EINVAL; + + mutex_lock(&us_ibdev->usdev_lock); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr, + &gid->raw[0]); + mutex_unlock(&us_ibdev->usdev_lock); + + return 0; +} + +struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num) +{ + struct usnic_ib_dev *us_ibdev = to_usdev(device); + + if (us_ibdev->netdev) + dev_hold(us_ibdev->netdev); + + return us_ibdev->netdev; +} + +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + if (index > 1) + return -EINVAL; + + *pkey = 0xffff; + return 0; +} + +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct usnic_ib_pd *pd; + void *umem_pd; + + usnic_dbg("\n"); + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + umem_pd = pd->umem_pd = usnic_uiom_alloc_pd(); + if (IS_ERR_OR_NULL(umem_pd)) { + kfree(pd); + return ERR_PTR(umem_pd ? 
PTR_ERR(umem_pd) : -ENOMEM); + } + + usnic_info("domain 0x%p allocated for context 0x%p and device %s\n", + pd, context, ibdev->name); + return &pd->ibpd; +} + +int usnic_ib_dealloc_pd(struct ib_pd *pd) +{ + usnic_info("freeing domain 0x%p\n", pd); + + usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd); + kfree(pd); + return 0; +} + +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + int err; + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_ucontext *ucontext; + int cq_cnt; + struct usnic_vnic_res_spec res_spec; + struct usnic_ib_create_qp_cmd cmd; + struct usnic_transport_spec trans_spec; + + usnic_dbg("\n"); + + ucontext = to_uucontext(pd->uobject->context); + us_ibdev = to_usdev(pd->device); + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (err) { + usnic_err("%s: cannot copy udata for create_qp\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + err = create_qp_validate_user_data(cmd); + if (err) { + usnic_err("%s: Failed to validate user data\n", + us_ibdev->ib_dev.name); + return ERR_PTR(-EINVAL); + } + + if (init_attr->qp_type != IB_QPT_UD) { + usnic_err("%s asked to make a non-UD QP: %d\n", + us_ibdev->ib_dev.name, init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + trans_spec = cmd.spec; + mutex_lock(&us_ibdev->usdev_lock); + cq_cnt = (init_attr->send_cq == init_attr->recv_cq) ? 1 : 2; + res_spec = min_transport_spec[trans_spec.trans_type]; + usnic_vnic_res_spec_update(&res_spec, USNIC_VNIC_RES_TYPE_CQ, cq_cnt); + qp_grp = find_free_vf_and_create_qp_grp(us_ibdev, to_upd(pd), + &trans_spec, + &res_spec); + if (IS_ERR_OR_NULL(qp_grp)) { + err = qp_grp ? PTR_ERR(qp_grp) : -ENOMEM; + goto out_release_mutex; + } + + err = usnic_ib_fill_create_qp_resp(qp_grp, udata); + if (err) { + err = -EBUSY; + goto out_release_qp_grp; + } + + qp_grp->ctx = ucontext; + list_add_tail(&qp_grp->link, &ucontext->qp_grp_list); + usnic_ib_log_vf(qp_grp->vf); + mutex_unlock(&us_ibdev->usdev_lock); + return &qp_grp->ibqp; + +out_release_qp_grp: + qp_grp_destroy(qp_grp); +out_release_mutex: + mutex_unlock(&us_ibdev->usdev_lock); + return ERR_PTR(err); +} + +int usnic_ib_destroy_qp(struct ib_qp *qp) +{ + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + + usnic_dbg("\n"); + + qp_grp = to_uqp_grp(qp); + vf = qp_grp->vf; + mutex_lock(&vf->pf->usdev_lock); + if (usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RESET, NULL)) { + usnic_err("Failed to move qp grp %u to reset\n", + qp_grp->grp_id); + } + + list_del(&qp_grp->link); + qp_grp_destroy(qp_grp); + mutex_unlock(&vf->pf->usdev_lock); + + return 0; +} + +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct usnic_ib_qp_grp *qp_grp; + int status; + usnic_dbg("\n"); + + qp_grp = to_uqp_grp(ibqp); + + mutex_lock(&qp_grp->vf->pf->usdev_lock); + if ((attr_mask & IB_QP_PORT) && attr->port_num != 1) { + /* usnic devices only have one port */ + status = -EINVAL; + goto out_unlock; + } + if (attr_mask & IB_QP_STATE) { + status = usnic_ib_qp_grp_modify(qp_grp, attr->qp_state, NULL); + } else { + usnic_err("Unhandled request, attr_mask=0x%x\n", attr_mask); + status = -EINVAL; + } + +out_unlock: + mutex_unlock(&qp_grp->vf->pf->usdev_lock); + return status; +} + +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + 
struct ib_cq *cq; + + usnic_dbg("\n"); + if (attr->flags) + return ERR_PTR(-EINVAL); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-EBUSY); + + return cq; +} + +int usnic_ib_destroy_cq(struct ib_cq *cq) +{ + usnic_dbg("\n"); + kfree(cq); + return 0; +} + +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct usnic_ib_mr *mr; + int err; + + usnic_dbg("start 0x%llx va 0x%llx length 0x%llx\n", start, + virt_addr, length); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length, + access_flags, 0); + if (IS_ERR_OR_NULL(mr->umem)) { + err = mr->umem ? PTR_ERR(mr->umem) : -EFAULT; + goto err_free; + } + + mr->ibmr.lkey = mr->ibmr.rkey = 0; + return &mr->ibmr; + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int usnic_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct usnic_ib_mr *mr = to_umr(ibmr); + + usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length); + + usnic_uiom_reg_release(mr->umem, ibmr->pd->uobject->context->closing); + kfree(mr); + return 0; +} + +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct usnic_ib_ucontext *context; + struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); + usnic_dbg("\n"); + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&context->qp_grp_list); + mutex_lock(&us_ibdev->usdev_lock); + list_add_tail(&context->link, &us_ibdev->ctx_list); + mutex_unlock(&us_ibdev->usdev_lock); + + return &context->ibucontext; +} + +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct usnic_ib_ucontext *context = to_uucontext(ibcontext); + struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device); + usnic_dbg("\n"); + + mutex_lock(&us_ibdev->usdev_lock); + BUG_ON(!list_empty(&context->qp_grp_list)); + list_del(&context->link); + mutex_unlock(&us_ibdev->usdev_lock); + kfree(context); + return 0; +} + +int usnic_ib_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma) +{ + struct usnic_ib_ucontext *uctx = to_ucontext(context); + struct usnic_ib_dev *us_ibdev; + struct usnic_ib_qp_grp *qp_grp; + struct usnic_ib_vf *vf; + struct vnic_dev_bar *bar; + dma_addr_t bus_addr; + unsigned int len; + unsigned int vfid; + + usnic_dbg("\n"); + + us_ibdev = to_usdev(context->device); + vma->vm_flags |= VM_IO; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vfid = vma->vm_pgoff; + usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", + vma->vm_pgoff, PAGE_SHIFT, vfid); + + mutex_lock(&us_ibdev->usdev_lock); + list_for_each_entry(qp_grp, &uctx->qp_grp_list, link) { + vf = qp_grp->vf; + if (usnic_vnic_get_index(vf->vnic) == vfid) { + bar = usnic_vnic_get_bar(vf->vnic, 0); + if ((vma->vm_end - vma->vm_start) != bar->len) { + usnic_err("Bar0 Len %lu - Request map %lu\n", + bar->len, + vma->vm_end - vma->vm_start); + mutex_unlock(&us_ibdev->usdev_lock); + return -EINVAL; + } + bus_addr = bar->bus_addr; + len = bar->len; + usnic_dbg("bus: %pa vaddr: %p size: %ld\n", + &bus_addr, bar->vaddr, bar->len); + mutex_unlock(&us_ibdev->usdev_lock); + + return remap_pfn_range(vma, + vma->vm_start, + bus_addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + } + + mutex_unlock(&us_ibdev->usdev_lock); + usnic_err("No VF %u found\n", vfid); + return -EINVAL; +} + +/* In ib callbacks section - Start of stub funcs */ +struct ib_ah 
*usnic_ib_create_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) + +{ + usnic_dbg("\n"); + return ERR_PTR(-EPERM); +} + +int usnic_ib_destroy_ah(struct ib_ah *ah) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +int usnic_ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags) +{ + usnic_dbg("\n"); + return -EINVAL; +} + +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + usnic_dbg("\n"); + return ERR_PTR(-ENOMEM); +} + + +/* In ib callbacks section - End of stub funcs */ +/* End of ib callbacks section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h new file mode 100644 index 0000000..1fda944 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_IB_VERBS_H_ +#define USNIC_IB_VERBS_H_ + +#include "usnic_ib.h" + +enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device, + u8 port_num); +int usnic_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw); +int usnic_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); +struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num); +int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey); +struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int usnic_ib_dealloc_pd(struct ib_pd *pd); +struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int usnic_ib_destroy_qp(struct ib_qp *qp); +int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int usnic_ib_destroy_cq(struct ib_cq *cq); +struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int usnic_ib_dereg_mr(struct ib_mr *ibmr); +struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); +int usnic_ib_mmap(struct ib_ucontext *context, + struct vm_area_struct *vma); +struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); + +int usnic_ib_destroy_ah(struct ib_ah *ah); +int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc); +int usnic_ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags); +struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc); +#endif /* !USNIC_IB_VERBS_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_log.h b/drivers/infiniband/hw/usnic/usnic_log.h new file mode 100644 index 0000000..183fcb6 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_log.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_LOG_H_ +#define USNIC_LOG_H_ + +#include "usnic.h" + +extern unsigned int usnic_log_lvl; + +#define USNIC_LOG_LVL_NONE (0) +#define USNIC_LOG_LVL_ERR (1) +#define USNIC_LOG_LVL_INFO (2) +#define USNIC_LOG_LVL_DBG (3) + +#define usnic_printk(lvl, args...) \ + do { \ + printk(lvl "%s:%s:%d: ", DRV_NAME, __func__, \ + __LINE__); \ + printk(args); \ + } while (0) + +#define usnic_dbg(args...) \ + do { \ + if (unlikely(usnic_log_lvl >= USNIC_LOG_LVL_DBG)) { \ + usnic_printk(KERN_INFO, args); \ + } \ +} while (0) + +#define usnic_info(args...) \ +do { \ + if (usnic_log_lvl >= USNIC_LOG_LVL_INFO) { \ + usnic_printk(KERN_INFO, args); \ + } \ +} while (0) + +#define usnic_err(args...) \ + do { \ + if (usnic_log_lvl >= USNIC_LOG_LVL_ERR) { \ + usnic_printk(KERN_ERR, args); \ + } \ + } while (0) +#endif /* !USNIC_LOG_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c new file mode 100644 index 0000000..e0a9553 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_transport.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include +#include +#include + +#include "usnic_transport.h" +#include "usnic_log.h" + +/* ROCE */ +static unsigned long *roce_bitmap; +static u16 roce_next_port = 1; +#define ROCE_BITMAP_SZ ((1 << (8 /*CHAR_BIT*/ * sizeof(u16)))/8 /*CHAR BIT*/) +static DEFINE_SPINLOCK(roce_bitmap_lock); + +const char *usnic_transport_to_str(enum usnic_transport_type type) +{ + switch (type) { + case USNIC_TRANSPORT_UNKNOWN: + return "Unknown"; + case USNIC_TRANSPORT_ROCE_CUSTOM: + return "roce custom"; + case USNIC_TRANSPORT_IPV4_UDP: + return "IPv4 UDP"; + case USNIC_TRANSPORT_MAX: + return "Max?"; + default: + return "Not known"; + } +} + +int usnic_transport_sock_to_str(char *buf, int buf_sz, + struct socket *sock) +{ + int err; + uint32_t addr; + uint16_t port; + int proto; + + memset(buf, 0, buf_sz); + err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port); + if (err) + return 0; + + return scnprintf(buf, buf_sz, "Proto:%u Addr:%pI4h Port:%hu", + proto, &addr, port); +} + +/* + * reserve a port number. if "0" specified, we will try to pick one + * starting at roce_next_port. roce_next_port will take on the values + * 1..4096 + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num) +{ + if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { + spin_lock(&roce_bitmap_lock); + if (!port_num) { + port_num = bitmap_find_next_zero_area(roce_bitmap, + ROCE_BITMAP_SZ, + roce_next_port /* start */, + 1 /* nr */, + 0 /* align */); + roce_next_port = (port_num & 4095) + 1; + } else if (test_bit(port_num, roce_bitmap)) { + usnic_err("Failed to allocate port for %s\n", + usnic_transport_to_str(type)); + spin_unlock(&roce_bitmap_lock); + goto out_fail; + } + bitmap_set(roce_bitmap, port_num, 1); + spin_unlock(&roce_bitmap_lock); + } else { + usnic_err("Failed to allocate port - transport %s unsupported\n", + usnic_transport_to_str(type)); + goto out_fail; + } + + usnic_dbg("Allocating port %hu for %s\n", port_num, + usnic_transport_to_str(type)); + return port_num; + +out_fail: + return 0; +} + +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num) +{ + if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { + spin_lock(&roce_bitmap_lock); + if (!port_num) { + usnic_err("Unreserved unvalid port num 0 for %s\n", + usnic_transport_to_str(type)); + goto out_roce_custom; + } + + if (!test_bit(port_num, roce_bitmap)) { + usnic_err("Unreserving invalid %hu for %s\n", + port_num, + usnic_transport_to_str(type)); + goto out_roce_custom; + } + bitmap_clear(roce_bitmap, port_num, 1); + usnic_dbg("Freeing port %hu for %s\n", port_num, + usnic_transport_to_str(type)); +out_roce_custom: + spin_unlock(&roce_bitmap_lock); + } else { + usnic_err("Freeing invalid port %hu for %d\n", port_num, type); + } +} + +struct socket *usnic_transport_get_socket(int sock_fd) +{ + struct socket *sock; + int err; + char buf[25]; + + /* sockfd_lookup will internally do a fget */ + sock = sockfd_lookup(sock_fd, &err); + if (!sock) { + usnic_err("Unable to lookup socket for fd %d with err %d\n", + sock_fd, err); + return ERR_PTR(-ENOENT); + } + + usnic_transport_sock_to_str(buf, sizeof(buf), sock); + usnic_dbg("Get sock %s\n", buf); + + return sock; +} + +void usnic_transport_put_socket(struct socket *sock) +{ + char buf[100]; + + usnic_transport_sock_to_str(buf, sizeof(buf), sock); + usnic_dbg("Put sock %s\n", buf); + sockfd_put(sock); +} + +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, + uint32_t *addr, uint16_t *port) +{ + int err; + struct sockaddr_in 
sock_addr; + + err = sock->ops->getname(sock, + (struct sockaddr *)&sock_addr, + 0); + if (err < 0) + return err; + + if (sock_addr.sin_family != AF_INET) + return -EINVAL; + + if (proto) + *proto = sock->sk->sk_protocol; + if (port) + *port = ntohs(((struct sockaddr_in *)&sock_addr)->sin_port); + if (addr) + *addr = ntohl(((struct sockaddr_in *) + &sock_addr)->sin_addr.s_addr); + + return 0; +} + +int usnic_transport_init(void) +{ + roce_bitmap = kzalloc(ROCE_BITMAP_SZ, GFP_KERNEL); + if (!roce_bitmap) + return -ENOMEM; + + /* Do not ever allocate bit 0, hence set it here */ + bitmap_set(roce_bitmap, 0, 1); + return 0; +} + +void usnic_transport_fini(void) +{ + kfree(roce_bitmap); +} diff --git a/drivers/infiniband/hw/usnic/usnic_transport.h b/drivers/infiniband/hw/usnic/usnic_transport.h new file mode 100644 index 0000000..9a7a2d9 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_transport.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_TRANSPORT_H_ +#define USNIC_TRANSPORT_H_ + +#include "usnic_abi.h" + +const char *usnic_transport_to_str(enum usnic_transport_type trans_type); +/* + * Returns number of bytes written, excluding null terminator. If + * nothing was written, the function returns 0. + */ +int usnic_transport_sock_to_str(char *buf, int buf_sz, + struct socket *sock); +/* + * Reserve a port. If "port_num" is set, then the function will try + * to reserve that particular port. + */ +u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num); +void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num); +/* + * Do a fget on the socket refered to by sock_fd and returns the socket. + * Socket will not be destroyed before usnic_transport_put_socket has + * been called. 
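+ *
+ * Illustrative call sequence (editorial sketch, not part of the original
+ * patch; error handling around it is the caller's own):
+ *
+ *	int proto; uint32_t addr; uint16_t port;
+ *	struct socket *sock = usnic_transport_get_socket(sock_fd);
+ *	if (IS_ERR(sock))
+ *		return PTR_ERR(sock);
+ *	err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port);
+ *	...
+ *	usnic_transport_put_socket(sock);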
+ */ +struct socket *usnic_transport_get_socket(int sock_fd); +void usnic_transport_put_socket(struct socket *sock); +/* + * Call usnic_transport_get_socket before calling *_sock_get_addr + */ +int usnic_transport_sock_get_addr(struct socket *sock, int *proto, + uint32_t *addr, uint16_t *port); +int usnic_transport_init(void); +void usnic_transport_fini(void); +#endif /* !USNIC_TRANSPORT_H */ diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c new file mode 100644 index 0000000..4381c0a --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -0,0 +1,602 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2013 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "usnic_log.h" +#include "usnic_uiom.h" +#include "usnic_uiom_interval_tree.h" + +static struct workqueue_struct *usnic_uiom_wq; + +#define USNIC_UIOM_PAGE_CHUNK \ + ((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list)) /\ + ((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] - \ + (void *) &((struct usnic_uiom_chunk *) 0)->page_list[0])) + +static void usnic_uiom_reg_account(struct work_struct *work) +{ + struct usnic_uiom_reg *umem = container_of(work, + struct usnic_uiom_reg, work); + + down_write(&umem->mm->mmap_sem); + umem->mm->locked_vm -= umem->diff; + up_write(&umem->mm->mmap_sem); + mmput(umem->mm); + kfree(umem); +} + +static int usnic_uiom_dma_fault(struct iommu_domain *domain, + struct device *dev, + unsigned long iova, int flags, + void *token) +{ + usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n", + dev_name(dev), + domain, iova, flags); + return -ENOSYS; +} + +static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) +{ + struct usnic_uiom_chunk *chunk, *tmp; + struct page *page; + struct scatterlist *sg; + int i; + dma_addr_t pa; + + list_for_each_entry_safe(chunk, tmp, chunk_list, list) { + for_each_sg(chunk->page_list, sg, chunk->nents, i) { + page = sg_page(sg); + pa = sg_phys(sg); + if (dirty) + set_page_dirty_lock(page); + put_page(page); + usnic_dbg("pa: %pa\n", &pa); + } + kfree(chunk); + } +} + +static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, + int dmasync, struct list_head *chunk_list) +{ + struct page **page_list; + struct scatterlist *sg; + struct usnic_uiom_chunk *chunk; + unsigned long locked; + unsigned long lock_limit; + unsigned long cur_base; + unsigned long npages; + int ret; + int off; + int i; + int flags; + dma_addr_t pa; + unsigned int gup_flags; + + if (!can_do_mlock()) + return -EPERM; + + INIT_LIST_HEAD(chunk_list); + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked = npages + current->mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto out; + } + + flags = IOMMU_READ | IOMMU_CACHE; + flags |= (writable) ? IOMMU_WRITE : 0; + gup_flags = FOLL_WRITE; + gup_flags |= (writable) ? 
0 : FOLL_FORCE; + cur_base = addr & PAGE_MASK; + ret = 0; + + while (npages) { + ret = get_user_pages(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + gup_flags, page_list, NULL); + + if (ret < 0) + goto out; + + npages -= ret; + off = 0; + + while (ret) { + chunk = kmalloc(sizeof(*chunk) + + sizeof(struct scatterlist) * + min_t(int, ret, USNIC_UIOM_PAGE_CHUNK), + GFP_KERNEL); + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + chunk->nents = min_t(int, ret, USNIC_UIOM_PAGE_CHUNK); + sg_init_table(chunk->page_list, chunk->nents); + for_each_sg(chunk->page_list, sg, chunk->nents, i) { + sg_set_page(sg, page_list[i + off], + PAGE_SIZE, 0); + pa = sg_phys(sg); + usnic_dbg("va: 0x%lx pa: %pa\n", + cur_base + i*PAGE_SIZE, &pa); + } + cur_base += chunk->nents * PAGE_SIZE; + ret -= chunk->nents; + off += chunk->nents; + list_add_tail(&chunk->list, chunk_list); + } + + ret = 0; + } + +out: + if (ret < 0) + usnic_uiom_put_pages(chunk_list, 0); + else + current->mm->locked_vm = locked; + + up_write(¤t->mm->mmap_sem); + free_page((unsigned long) page_list); + return ret; +} + +static void usnic_uiom_unmap_sorted_intervals(struct list_head *intervals, + struct usnic_uiom_pd *pd) +{ + struct usnic_uiom_interval_node *interval, *tmp; + long unsigned va, size; + + list_for_each_entry_safe(interval, tmp, intervals, link) { + va = interval->start << PAGE_SHIFT; + size = ((interval->last - interval->start) + 1) << PAGE_SHIFT; + while (size > 0) { + /* Workaround for RH 970401 */ + usnic_dbg("va 0x%lx size 0x%lx", va, PAGE_SIZE); + iommu_unmap(pd->domain, va, PAGE_SIZE); + va += PAGE_SIZE; + size -= PAGE_SIZE; + } + } +} + +static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd, + struct usnic_uiom_reg *uiomr, + int dirty) +{ + int npages; + unsigned long vpn_start, vpn_last; + struct usnic_uiom_interval_node *interval, *tmp; + int writable = 0; + LIST_HEAD(rm_intervals); + + npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT; + vpn_last = vpn_start + npages - 1; + + spin_lock(&pd->lock); + usnic_uiom_remove_interval(&pd->root, vpn_start, + vpn_last, &rm_intervals); + usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd); + + list_for_each_entry_safe(interval, tmp, &rm_intervals, link) { + if (interval->flags & IOMMU_WRITE) + writable = 1; + list_del(&interval->link); + kfree(interval); + } + + usnic_uiom_put_pages(&uiomr->chunk_list, dirty & writable); + spin_unlock(&pd->lock); +} + +static int usnic_uiom_map_sorted_intervals(struct list_head *intervals, + struct usnic_uiom_reg *uiomr) +{ + int i, err; + size_t size; + struct usnic_uiom_chunk *chunk; + struct usnic_uiom_interval_node *interval_node; + dma_addr_t pa; + dma_addr_t pa_start = 0; + dma_addr_t pa_end = 0; + long int va_start = -EINVAL; + struct usnic_uiom_pd *pd = uiomr->pd; + long int va = uiomr->va & PAGE_MASK; + int flags = IOMMU_READ | IOMMU_CACHE; + + flags |= (uiomr->writable) ? 
IOMMU_WRITE : 0; + chunk = list_first_entry(&uiomr->chunk_list, struct usnic_uiom_chunk, + list); + list_for_each_entry(interval_node, intervals, link) { +iter_chunk: + for (i = 0; i < chunk->nents; i++, va += PAGE_SIZE) { + pa = sg_phys(&chunk->page_list[i]); + if ((va >> PAGE_SHIFT) < interval_node->start) + continue; + + if ((va >> PAGE_SHIFT) == interval_node->start) { + /* First page of the interval */ + va_start = va; + pa_start = pa; + pa_end = pa; + } + + WARN_ON(va_start == -EINVAL); + + if ((pa_end + PAGE_SIZE != pa) && + (pa != pa_start)) { + /* PAs are not contiguous */ + size = pa_end - pa_start + PAGE_SIZE; + usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x", + va_start, &pa_start, size, flags); + err = iommu_map(pd->domain, va_start, pa_start, + size, flags); + if (err) { + usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", + va_start, &pa_start, size, err); + goto err_out; + } + va_start = va; + pa_start = pa; + pa_end = pa; + } + + if ((va >> PAGE_SHIFT) == interval_node->last) { + /* Last page of the interval */ + size = pa - pa_start + PAGE_SIZE; + usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n", + va_start, &pa_start, size, flags); + err = iommu_map(pd->domain, va_start, pa_start, + size, flags); + if (err) { + usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", + va_start, &pa_start, size, err); + goto err_out; + } + break; + } + + if (pa != pa_start) + pa_end += PAGE_SIZE; + } + + if (i == chunk->nents) { + /* + * Hit last entry of the chunk, + * hence advance to next chunk + */ + chunk = list_first_entry(&chunk->list, + struct usnic_uiom_chunk, + list); + goto iter_chunk; + } + } + + return 0; + +err_out: + usnic_uiom_unmap_sorted_intervals(intervals, pd); + return err; +} + +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, + unsigned long addr, size_t size, + int writable, int dmasync) +{ + struct usnic_uiom_reg *uiomr; + unsigned long va_base, vpn_start, vpn_last; + unsigned long npages; + int offset, err; + LIST_HEAD(sorted_diff_intervals); + + /* + * Intel IOMMU map throws an error if a translation entry is + * changed from read to write. This module may not unmap + * and then remap the entry after fixing the permission + * b/c this open up a small windows where hw DMA may page fault + * Hence, make all entries to be writable. + */ + writable = 1; + + va_base = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT; + vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT; + vpn_last = vpn_start + npages - 1; + + uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL); + if (!uiomr) + return ERR_PTR(-ENOMEM); + + uiomr->va = va_base; + uiomr->offset = offset; + uiomr->length = size; + uiomr->writable = writable; + uiomr->pd = pd; + + err = usnic_uiom_get_pages(addr, size, writable, dmasync, + &uiomr->chunk_list); + if (err) { + usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_free_uiomr; + } + + spin_lock(&pd->lock); + err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last, + (writable) ? 
IOMMU_WRITE : 0, + IOMMU_WRITE, + &pd->root, + &sorted_diff_intervals); + if (err) { + usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_put_pages; + } + + err = usnic_uiom_map_sorted_intervals(&sorted_diff_intervals, uiomr); + if (err) { + usnic_err("Failed map interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_put_intervals; + + } + + err = usnic_uiom_insert_interval(&pd->root, vpn_start, vpn_last, + (writable) ? IOMMU_WRITE : 0); + if (err) { + usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n", + vpn_start, vpn_last, err); + goto out_unmap_intervals; + } + + usnic_uiom_put_interval_set(&sorted_diff_intervals); + spin_unlock(&pd->lock); + + return uiomr; + +out_unmap_intervals: + usnic_uiom_unmap_sorted_intervals(&sorted_diff_intervals, pd); +out_put_intervals: + usnic_uiom_put_interval_set(&sorted_diff_intervals); +out_put_pages: + usnic_uiom_put_pages(&uiomr->chunk_list, 0); + spin_unlock(&pd->lock); +out_free_uiomr: + kfree(uiomr); + return ERR_PTR(err); +} + +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing) +{ + struct mm_struct *mm; + unsigned long diff; + + __usnic_uiom_reg_release(uiomr->pd, uiomr, 1); + + mm = get_task_mm(current); + if (!mm) { + kfree(uiomr); + return; + } + + diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + + /* + * We may be called with the mm's mmap_sem already held. This + * can happen when a userspace munmap() is the call that drops + * the last reference to our file and calls our release + * method. If there are memory regions to destroy, we'll end + * up here and not be able to take the mmap_sem. In that case + * we defer the vm_locked accounting to the system workqueue. + */ + if (closing) { + if (!down_write_trylock(&mm->mmap_sem)) { + INIT_WORK(&uiomr->work, usnic_uiom_reg_account); + uiomr->mm = mm; + uiomr->diff = diff; + + queue_work(usnic_uiom_wq, &uiomr->work); + return; + } + } else + down_write(&mm->mmap_sem); + + current->mm->locked_vm -= diff; + up_write(&mm->mmap_sem); + mmput(mm); + kfree(uiomr); +} + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void) +{ + struct usnic_uiom_pd *pd; + void *domain; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + pd->domain = domain = iommu_domain_alloc(&pci_bus_type); + if (!domain) { + usnic_err("Failed to allocate IOMMU domain"); + kfree(pd); + return ERR_PTR(-ENOMEM); + } + + iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL); + + spin_lock_init(&pd->lock); + INIT_LIST_HEAD(&pd->devs); + + return pd; +} + +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd) +{ + iommu_domain_free(pd->domain); + kfree(pd); +} + +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev) +{ + struct usnic_uiom_dev *uiom_dev; + int err; + + uiom_dev = kzalloc(sizeof(*uiom_dev), GFP_ATOMIC); + if (!uiom_dev) + return -ENOMEM; + uiom_dev->dev = dev; + + err = iommu_attach_device(pd->domain, dev); + if (err) + goto out_free_dev; + + if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) { + usnic_err("IOMMU of %s does not support cache coherency\n", + dev_name(dev)); + err = -EINVAL; + goto out_detach_device; + } + + spin_lock(&pd->lock); + list_add_tail(&uiom_dev->link, &pd->devs); + pd->dev_cnt++; + spin_unlock(&pd->lock); + + return 0; + +out_detach_device: + iommu_detach_device(pd->domain, dev); +out_free_dev: + kfree(uiom_dev); + return err; +} + +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, struct device 
*dev) +{ + struct usnic_uiom_dev *uiom_dev; + int found = 0; + + spin_lock(&pd->lock); + list_for_each_entry(uiom_dev, &pd->devs, link) { + if (uiom_dev->dev == dev) { + found = 1; + break; + } + } + + if (!found) { + usnic_err("Unable to free dev %s - not found\n", + dev_name(dev)); + spin_unlock(&pd->lock); + return; + } + + list_del(&uiom_dev->link); + pd->dev_cnt--; + spin_unlock(&pd->lock); + + return iommu_detach_device(pd->domain, dev); +} + +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd) +{ + struct usnic_uiom_dev *uiom_dev; + struct device **devs; + int i = 0; + + spin_lock(&pd->lock); + devs = kcalloc(pd->dev_cnt + 1, sizeof(*devs), GFP_ATOMIC); + if (!devs) { + devs = ERR_PTR(-ENOMEM); + goto out; + } + + list_for_each_entry(uiom_dev, &pd->devs, link) { + devs[i++] = uiom_dev->dev; + } +out: + spin_unlock(&pd->lock); + return devs; +} + +void usnic_uiom_free_dev_list(struct device **devs) +{ + kfree(devs); +} + +int usnic_uiom_init(char *drv_name) +{ + if (!iommu_present(&pci_bus_type)) { + usnic_err("IOMMU required but not present or enabled. USNIC QPs will not function w/o enabling IOMMU\n"); + return -EPERM; + } + + usnic_uiom_wq = create_workqueue(drv_name); + if (!usnic_uiom_wq) { + usnic_err("Unable to alloc wq for drv %s\n", drv_name); + return -ENOMEM; + } + + return 0; +} + +void usnic_uiom_fini(void) +{ + flush_workqueue(usnic_uiom_wq); + destroy_workqueue(usnic_uiom_wq); +} diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h new file mode 100644 index 0000000..431efe4 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_UIOM_H_ +#define USNIC_UIOM_H_ + +#include +#include + +#include "usnic_uiom_interval_tree.h" + +#define USNIC_UIOM_READ (1) +#define USNIC_UIOM_WRITE (2) + +#define USNIC_UIOM_MAX_PD_CNT (1000) +#define USNIC_UIOM_MAX_MR_CNT (1000000) +#define USNIC_UIOM_MAX_MR_SIZE (~0UL) +#define USNIC_UIOM_PAGE_SIZE (PAGE_SIZE) + +struct usnic_uiom_dev { + struct device *dev; + struct list_head link; +}; + +struct usnic_uiom_pd { + struct iommu_domain *domain; + spinlock_t lock; + struct rb_root_cached root; + struct list_head devs; + int dev_cnt; +}; + +struct usnic_uiom_reg { + struct usnic_uiom_pd *pd; + unsigned long va; + size_t length; + int offset; + int page_size; + int writable; + struct list_head chunk_list; + struct work_struct work; + struct mm_struct *mm; + unsigned long diff; +}; + +struct usnic_uiom_chunk { + struct list_head list; + int nents; + struct scatterlist page_list[0]; +}; + +struct usnic_uiom_pd *usnic_uiom_alloc_pd(void); +void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd); +int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev); +void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, + struct device *dev); +struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd); +void usnic_uiom_free_dev_list(struct device **devs); +struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, + unsigned long addr, size_t size, + int access, int dmasync); +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing); +int usnic_uiom_init(char *drv_name); +void usnic_uiom_fini(void); +#endif /* USNIC_UIOM_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c new file mode 100644 index 0000000..d399523 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include + +#include +#include "usnic_uiom_interval_tree.h" + +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) + +#define MAKE_NODE(node, start, end, ref_cnt, flags, err, err_out) \ + do { \ + node = usnic_uiom_interval_node_alloc(start, \ + end, ref_cnt, flags); \ + if (!node) { \ + err = -ENOMEM; \ + goto err_out; \ + } \ + } while (0) + +#define MARK_FOR_ADD(node, list) (list_add_tail(&node->link, list)) + +#define MAKE_NODE_AND_APPEND(node, start, end, ref_cnt, flags, err, \ + err_out, list) \ + do { \ + MAKE_NODE(node, start, end, \ + ref_cnt, flags, err, \ + err_out); \ + MARK_FOR_ADD(node, list); \ + } while (0) + +#define FLAGS_EQUAL(flags1, flags2, mask) \ + (((flags1) & (mask)) == ((flags2) & (mask))) + +static struct usnic_uiom_interval_node* +usnic_uiom_interval_node_alloc(long int start, long int last, int ref_cnt, + int flags) +{ + struct usnic_uiom_interval_node *interval = kzalloc(sizeof(*interval), + GFP_ATOMIC); + if (!interval) + return NULL; + + interval->start = start; + interval->last = last; + interval->flags = flags; + interval->ref_cnt = ref_cnt; + + return interval; +} + +static int interval_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct usnic_uiom_interval_node *node_a, *node_b; + + node_a = list_entry(a, struct usnic_uiom_interval_node, link); + node_b = list_entry(b, struct usnic_uiom_interval_node, link); + + /* long to int */ + if (node_a->start < node_b->start) + return -1; + else if (node_a->start > node_b->start) + return 1; + + return 0; +} + +static void +find_intervals_intersection_sorted(struct rb_root_cached *root, + unsigned long start, unsigned long last, + struct list_head *list) +{ + struct usnic_uiom_interval_node *node; + + INIT_LIST_HEAD(list); + + for (node = usnic_uiom_interval_tree_iter_first(root, start, last); + node; + node = usnic_uiom_interval_tree_iter_next(node, start, last)) + list_add_tail(&node->link, list); + + list_sort(NULL, list, interval_cmp); +} + +int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last, + int flags, int flag_mask, + struct rb_root_cached *root, + struct list_head *diff_set) +{ + struct usnic_uiom_interval_node *interval, *tmp; + int err = 0; + long int pivot = start; + LIST_HEAD(intersection_set); + + INIT_LIST_HEAD(diff_set); + + find_intervals_intersection_sorted(root, start, last, + &intersection_set); + + list_for_each_entry(interval, &intersection_set, link) { + if (pivot < interval->start) { + MAKE_NODE_AND_APPEND(tmp, pivot, interval->start - 1, + 1, flags, err, err_out, + diff_set); + pivot = interval->start; + } + + /* + * Invariant: Set [start, pivot] is either in diff_set or root, + * but not in both. 
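+ *
+ * Worked example (editorial note, not in the original source): with
+ * root holding [5, 8] under the same flags and a request for [2, 10],
+ * the branch above appends [2, 4] and sets pivot to 5, the FLAGS_EQUAL
+ * case below advances pivot to 9, and the append after the loop adds
+ * [9, 10], leaving diff_set = {[2, 4], [9, 10]}.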
+ */ + + if (pivot > interval->last) { + continue; + } else if (pivot <= interval->last && + FLAGS_EQUAL(interval->flags, flags, + flag_mask)) { + pivot = interval->last + 1; + } + } + + if (pivot <= last) + MAKE_NODE_AND_APPEND(tmp, pivot, last, 1, flags, err, err_out, + diff_set); + + return 0; + +err_out: + list_for_each_entry_safe(interval, tmp, diff_set, link) { + list_del(&interval->link); + kfree(interval); + } + + return err; +} + +void usnic_uiom_put_interval_set(struct list_head *intervals) +{ + struct usnic_uiom_interval_node *interval, *tmp; + list_for_each_entry_safe(interval, tmp, intervals, link) + kfree(interval); +} + +int usnic_uiom_insert_interval(struct rb_root_cached *root, unsigned long start, + unsigned long last, int flags) +{ + struct usnic_uiom_interval_node *interval, *tmp; + unsigned long istart, ilast; + int iref_cnt, iflags; + unsigned long lpivot = start; + int err = 0; + LIST_HEAD(to_add); + LIST_HEAD(intersection_set); + + find_intervals_intersection_sorted(root, start, last, + &intersection_set); + + list_for_each_entry(interval, &intersection_set, link) { + /* + * Invariant - lpivot is the left edge of next interval to be + * inserted + */ + istart = interval->start; + ilast = interval->last; + iref_cnt = interval->ref_cnt; + iflags = interval->flags; + + if (istart < lpivot) { + MAKE_NODE_AND_APPEND(tmp, istart, lpivot - 1, iref_cnt, + iflags, err, err_out, &to_add); + } else if (istart > lpivot) { + MAKE_NODE_AND_APPEND(tmp, lpivot, istart - 1, 1, flags, + err, err_out, &to_add); + lpivot = istart; + } else { + lpivot = istart; + } + + if (ilast > last) { + MAKE_NODE_AND_APPEND(tmp, lpivot, last, iref_cnt + 1, + iflags | flags, err, err_out, + &to_add); + MAKE_NODE_AND_APPEND(tmp, last + 1, ilast, iref_cnt, + iflags, err, err_out, &to_add); + } else { + MAKE_NODE_AND_APPEND(tmp, lpivot, ilast, iref_cnt + 1, + iflags | flags, err, err_out, + &to_add); + } + + lpivot = ilast + 1; + } + + if (lpivot <= last) + MAKE_NODE_AND_APPEND(tmp, lpivot, last, 1, flags, err, err_out, + &to_add); + + list_for_each_entry_safe(interval, tmp, &intersection_set, link) { + usnic_uiom_interval_tree_remove(interval, root); + kfree(interval); + } + + list_for_each_entry(interval, &to_add, link) + usnic_uiom_interval_tree_insert(interval, root); + + return 0; + +err_out: + list_for_each_entry_safe(interval, tmp, &to_add, link) + kfree(interval); + + return err; +} + +void usnic_uiom_remove_interval(struct rb_root_cached *root, + unsigned long start, unsigned long last, + struct list_head *removed) +{ + struct usnic_uiom_interval_node *interval; + + for (interval = usnic_uiom_interval_tree_iter_first(root, start, last); + interval; + interval = usnic_uiom_interval_tree_iter_next(interval, + start, + last)) { + if (--interval->ref_cnt == 0) + list_add_tail(&interval->link, removed); + } + + list_for_each_entry(interval, removed, link) + usnic_uiom_interval_tree_remove(interval, root); +} + +INTERVAL_TREE_DEFINE(struct usnic_uiom_interval_node, rb, + unsigned long, __subtree_last, + START, LAST, , usnic_uiom_interval_tree) diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h new file mode 100644 index 0000000..1d7fc32 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef USNIC_UIOM_INTERVAL_TREE_H_ +#define USNIC_UIOM_INTERVAL_TREE_H_ + +#include + +struct usnic_uiom_interval_node { + struct rb_node rb; + struct list_head link; + unsigned long start; + unsigned long last; + unsigned long __subtree_last; + unsigned int ref_cnt; + int flags; +}; + +extern void +usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node, + struct rb_root_cached *root); +extern void +usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node, + struct rb_root_cached *root); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_first(struct rb_root_cached *root, + unsigned long start, + unsigned long last); +extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node, + unsigned long start, unsigned long last); +/* + * Inserts {start...last} into {root}. If there are overlaps, + * nodes will be broken up and merged + */ +int usnic_uiom_insert_interval(struct rb_root_cached *root, + unsigned long start, unsigned long last, + int flags); +/* + * Removed {start...last} from {root}. The nodes removed are returned in + * 'removed.' The caller is responsibile for freeing memory of nodes in + * 'removed.' + */ +void usnic_uiom_remove_interval(struct rb_root_cached *root, + unsigned long start, unsigned long last, + struct list_head *removed); +/* + * Returns {start...last} - {root} (relative complement of {start...last} in + * {root}) in diff_set sorted ascendingly + */ +int usnic_uiom_get_intervals_diff(unsigned long start, + unsigned long last, int flags, + int flag_mask, + struct rb_root_cached *root, + struct list_head *diff_set); +/* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */ +void usnic_uiom_put_interval_set(struct list_head *intervals); +#endif /* USNIC_UIOM_INTERVAL_TREE_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c new file mode 100644 index 0000000..e7b0030 --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_vnic.c @@ -0,0 +1,476 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "usnic_ib.h" +#include "vnic_resource.h" +#include "usnic_log.h" +#include "usnic_vnic.h" + +struct usnic_vnic { + struct vnic_dev *vdev; + struct vnic_dev_bar bar[PCI_NUM_RESOURCES]; + struct usnic_vnic_res_chunk chunks[USNIC_VNIC_RES_TYPE_MAX]; + spinlock_t res_lock; +}; + +static enum vnic_res_type _to_vnic_res_type(enum usnic_vnic_res_type res_type) +{ +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + vnic_res_type, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + vnic_res_type, + static enum vnic_res_type usnic_vnic_type_2_vnic_type[] = { + USNIC_VNIC_RES_TYPES}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + + if (res_type >= USNIC_VNIC_RES_TYPE_MAX) + return RES_TYPE_MAX; + + return usnic_vnic_type_2_vnic_type[res_type]; +} + +const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type) +{ +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + desc, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + desc, + static const char * const usnic_vnic_res_type_desc[] = { + USNIC_VNIC_RES_TYPES}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + + if (res_type >= USNIC_VNIC_RES_TYPE_MAX) + return "unknown"; + + return usnic_vnic_res_type_desc[res_type]; + +} + +const char *usnic_vnic_pci_name(struct usnic_vnic *vnic) +{ + return pci_name(usnic_vnic_get_pdev(vnic)); +} + +int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, + int buf_sz, + void *hdr_obj, + int (*printtitle)(void *, char*, int), + int (*printcols)(char *, int), + int (*printrow)(void *, char *, int)) +{ + struct usnic_vnic_res_chunk *chunk; + struct usnic_vnic_res *res; + struct vnic_dev_bar *bar0; + int i, j, offset; + + offset = 0; + bar0 = usnic_vnic_get_bar(vnic, 0); + offset += scnprintf(buf + offset, buf_sz - offset, + "VF:%hu BAR0 bus_addr=%pa vaddr=0x%p size=%ld ", + usnic_vnic_get_index(vnic), + &bar0->bus_addr, + bar0->vaddr, bar0->len); + if (printtitle) + offset += printtitle(hdr_obj, buf + offset, buf_sz - offset); + offset += scnprintf(buf + offset, buf_sz - offset, "\n"); + offset += scnprintf(buf + offset, buf_sz - offset, + "|RES\t|CTRL_PIN\t\t|IN_USE\t"); + if (printcols) + offset += 
printcols(buf + offset, buf_sz - offset); + offset += scnprintf(buf + offset, buf_sz - offset, "\n"); + + spin_lock(&vnic->res_lock); + for (i = 0; i < ARRAY_SIZE(vnic->chunks); i++) { + chunk = &vnic->chunks[i]; + for (j = 0; j < chunk->cnt; j++) { + res = chunk->res[j]; + offset += scnprintf(buf + offset, buf_sz - offset, + "|%s[%u]\t|0x%p\t|%u\t", + usnic_vnic_res_type_to_str(res->type), + res->vnic_idx, res->ctrl, !!res->owner); + if (printrow) { + offset += printrow(res->owner, buf + offset, + buf_sz - offset); + } + offset += scnprintf(buf + offset, buf_sz - offset, + "\n"); + } + } + spin_unlock(&vnic->res_lock); + return offset; +} + +void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec, + enum usnic_vnic_res_type trgt_type, + u16 cnt) +{ + int i; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + if (spec->resources[i].type == trgt_type) { + spec->resources[i].cnt = cnt; + return; + } + } + + WARN_ON(1); +} + +int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec, + struct usnic_vnic_res_spec *res_spec) +{ + int found, i, j; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + found = 0; + + for (j = 0; j < USNIC_VNIC_RES_TYPE_MAX; j++) { + if (res_spec->resources[i].type != + min_spec->resources[i].type) + continue; + found = 1; + if (min_spec->resources[i].cnt > + res_spec->resources[i].cnt) + return -EINVAL; + break; + } + + if (!found) + return -EINVAL; + } + return 0; +} + +int usnic_vnic_spec_dump(char *buf, int buf_sz, + struct usnic_vnic_res_spec *res_spec) +{ + enum usnic_vnic_res_type res_type; + int res_cnt; + int i; + int offset = 0; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + offset += scnprintf(buf + offset, buf_sz - offset, + "Res: %s Cnt: %d ", + usnic_vnic_res_type_to_str(res_type), + res_cnt); + } + + return offset; +} + +int usnic_vnic_check_room(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec) +{ + int i; + enum usnic_vnic_res_type res_type; + int res_cnt; + + for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) { + res_type = res_spec->resources[i].type; + res_cnt = res_spec->resources[i].cnt; + + if (res_type == USNIC_VNIC_RES_TYPE_EOL) + break; + + if (res_cnt > usnic_vnic_res_free_cnt(vnic, res_type)) + return -EBUSY; + } + + return 0; +} + +int usnic_vnic_res_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type) +{ + return vnic->chunks[type].cnt; +} + +int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type) +{ + return vnic->chunks[type].free_cnt; +} + +struct usnic_vnic_res_chunk * +usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type, + int cnt, void *owner) +{ + struct usnic_vnic_res_chunk *src, *ret; + struct usnic_vnic_res *res; + int i; + + if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 0 || !owner) + return ERR_PTR(-EINVAL); + + ret = kzalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) + return ERR_PTR(-ENOMEM); + + if (cnt > 0) { + ret->res = kcalloc(cnt, sizeof(*(ret->res)), GFP_ATOMIC); + if (!ret->res) { + kfree(ret); + return ERR_PTR(-ENOMEM); + } + + spin_lock(&vnic->res_lock); + src = &vnic->chunks[type]; + for (i = 0; i < src->cnt && ret->cnt < cnt; i++) { + res = src->res[i]; + if (!res->owner) { + src->free_cnt--; + res->owner = owner; + ret->res[ret->cnt++] = res; + } + } + + spin_unlock(&vnic->res_lock); + } + ret->type = type; + ret->vnic = vnic; + WARN_ON(ret->cnt != cnt); + + return ret; +} + +void usnic_vnic_put_resources(struct 
usnic_vnic_res_chunk *chunk) +{ + + struct usnic_vnic_res *res; + int i; + struct usnic_vnic *vnic = chunk->vnic; + + if (chunk->cnt > 0) { + spin_lock(&vnic->res_lock); + while ((i = --chunk->cnt) >= 0) { + res = chunk->res[i]; + chunk->res[i] = NULL; + res->owner = NULL; + vnic->chunks[res->type].free_cnt++; + } + spin_unlock(&vnic->res_lock); + } + + kfree(chunk->res); + kfree(chunk); +} + +u16 usnic_vnic_get_index(struct usnic_vnic *vnic) +{ + return usnic_vnic_get_pdev(vnic)->devfn - 1; +} + +static int usnic_vnic_alloc_res_chunk(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type, + struct usnic_vnic_res_chunk *chunk) +{ + int cnt, err, i; + struct usnic_vnic_res *res; + + cnt = vnic_dev_get_res_count(vnic->vdev, _to_vnic_res_type(type)); + if (cnt < 1) { + usnic_err("Wrong res count with cnt %d\n", cnt); + return -EINVAL; + } + + chunk->cnt = chunk->free_cnt = cnt; + chunk->res = kzalloc(sizeof(*(chunk->res))*cnt, GFP_KERNEL); + if (!chunk->res) + return -ENOMEM; + + for (i = 0; i < cnt; i++) { + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + err = -ENOMEM; + goto fail; + } + res->type = type; + res->vnic_idx = i; + res->vnic = vnic; + res->ctrl = vnic_dev_get_res(vnic->vdev, + _to_vnic_res_type(type), i); + chunk->res[i] = res; + } + + chunk->vnic = vnic; + return 0; +fail: + for (i--; i >= 0; i--) + kfree(chunk->res[i]); + kfree(chunk->res); + return err; +} + +static void usnic_vnic_free_res_chunk(struct usnic_vnic_res_chunk *chunk) +{ + int i; + for (i = 0; i < chunk->cnt; i++) + kfree(chunk->res[i]); + kfree(chunk->res); +} + +static int usnic_vnic_discover_resources(struct pci_dev *pdev, + struct usnic_vnic *vnic) +{ + enum usnic_vnic_res_type res_type; + int i; + int err = 0; + + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + vnic->bar[i].len = pci_resource_len(pdev, i); + vnic->bar[i].vaddr = pci_iomap(pdev, i, vnic->bar[i].len); + if (!vnic->bar[i].vaddr) { + usnic_err("Cannot memory-map BAR %d, aborting\n", + i); + err = -ENODEV; + goto out_clean_bar; + } + vnic->bar[i].bus_addr = pci_resource_start(pdev, i); + } + + vnic->vdev = vnic_dev_register(NULL, pdev, pdev, vnic->bar, + ARRAY_SIZE(vnic->bar)); + if (!vnic->vdev) { + usnic_err("Failed to register device %s\n", + pci_name(pdev)); + err = -EINVAL; + goto out_clean_bar; + } + + for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1; + res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) { + err = usnic_vnic_alloc_res_chunk(vnic, res_type, + &vnic->chunks[res_type]); + if (err) + goto out_clean_chunks; + } + + return 0; + +out_clean_chunks: + for (res_type--; res_type > USNIC_VNIC_RES_TYPE_EOL; res_type--) + usnic_vnic_free_res_chunk(&vnic->chunks[res_type]); + vnic_dev_unregister(vnic->vdev); +out_clean_bar: + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + if (!vnic->bar[i].vaddr) + break; + + iounmap(vnic->bar[i].vaddr); + } + + return err; +} + +struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic) +{ + return vnic_dev_get_pdev(vnic->vdev); +} + +struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic, + int bar_num) +{ + return (bar_num < ARRAY_SIZE(vnic->bar)) ? 
&vnic->bar[bar_num] : NULL; +} + +static void usnic_vnic_release_resources(struct usnic_vnic *vnic) +{ + int i; + struct pci_dev *pdev; + enum usnic_vnic_res_type res_type; + + pdev = usnic_vnic_get_pdev(vnic); + + for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1; + res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) + usnic_vnic_free_res_chunk(&vnic->chunks[res_type]); + + vnic_dev_unregister(vnic->vdev); + + for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + iounmap(vnic->bar[i].vaddr); + } +} + +struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev) +{ + struct usnic_vnic *vnic; + int err = 0; + + if (!pci_is_enabled(pdev)) { + usnic_err("PCI dev %s is disabled\n", pci_name(pdev)); + return ERR_PTR(-EINVAL); + } + + vnic = kzalloc(sizeof(*vnic), GFP_KERNEL); + if (!vnic) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&vnic->res_lock); + + err = usnic_vnic_discover_resources(pdev, vnic); + if (err) { + usnic_err("Failed to discover %s resources with err %d\n", + pci_name(pdev), err); + goto out_free_vnic; + } + + usnic_dbg("Allocated vnic for %s\n", usnic_vnic_pci_name(vnic)); + + return vnic; + +out_free_vnic: + kfree(vnic); + + return ERR_PTR(err); +} + +void usnic_vnic_free(struct usnic_vnic *vnic) +{ + usnic_vnic_release_resources(vnic); + kfree(vnic); +} diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.h b/drivers/infiniband/hw/usnic/usnic_vnic.h new file mode 100644 index 0000000..a08423e --- /dev/null +++ b/drivers/infiniband/hw/usnic/usnic_vnic.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef USNIC_VNIC_H_ +#define USNIC_VNIC_H_ + +#include + +#include "vnic_dev.h" + +/* =USNIC_VNIC_RES_TYPE= =VNIC_RES= =DESC= */ +#define USNIC_VNIC_RES_TYPES \ + DEFINE_USNIC_VNIC_RES_AT(EOL, RES_TYPE_EOL, "EOL", 0) \ + DEFINE_USNIC_VNIC_RES(WQ, RES_TYPE_WQ, "WQ") \ + DEFINE_USNIC_VNIC_RES(RQ, RES_TYPE_RQ, "RQ") \ + DEFINE_USNIC_VNIC_RES(CQ, RES_TYPE_CQ, "CQ") \ + DEFINE_USNIC_VNIC_RES(INTR, RES_TYPE_INTR_CTRL, "INT") \ + DEFINE_USNIC_VNIC_RES(MAX, RES_TYPE_MAX, "MAX")\ + +#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \ + USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t = val, +#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \ + USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t, +enum usnic_vnic_res_type { + USNIC_VNIC_RES_TYPES +}; +#undef DEFINE_USNIC_VNIC_RES +#undef DEFINE_USNIC_VNIC_RES_AT + +struct usnic_vnic_res { + enum usnic_vnic_res_type type; + unsigned int vnic_idx; + struct usnic_vnic *vnic; + void __iomem *ctrl; + void *owner; +}; + +struct usnic_vnic_res_chunk { + enum usnic_vnic_res_type type; + int cnt; + int free_cnt; + struct usnic_vnic_res **res; + struct usnic_vnic *vnic; +}; + +struct usnic_vnic_res_desc { + enum usnic_vnic_res_type type; + uint16_t cnt; +}; + +struct usnic_vnic_res_spec { + struct usnic_vnic_res_desc resources[USNIC_VNIC_RES_TYPE_MAX]; +}; + +const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type); +const char *usnic_vnic_pci_name(struct usnic_vnic *vnic); +int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, int buf_sz, + void *hdr_obj, + int (*printtitle)(void *, char*, int), + int (*printcols)(char *, int), + int (*printrow)(void *, char *, int)); +void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec, + enum usnic_vnic_res_type trgt_type, + u16 cnt); +int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_spec_dump(char *buf, int buf_sz, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_check_room(struct usnic_vnic *vnic, + struct usnic_vnic_res_spec *res_spec); +int usnic_vnic_res_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type); +int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type); +struct usnic_vnic_res_chunk * +usnic_vnic_get_resources(struct usnic_vnic *vnic, + enum usnic_vnic_res_type type, + int cnt, + void *owner); +void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk); +struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic); +struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic, + int bar_num); +struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev); +void usnic_vnic_free(struct usnic_vnic *vnic); +u16 usnic_vnic_get_index(struct usnic_vnic *vnic); + +#endif /*!USNIC_VNIC_H_*/ diff --git a/drivers/infiniband/hw/vmw_pvrdma/Kconfig b/drivers/infiniband/hw/vmw_pvrdma/Kconfig new file mode 100644 index 0000000..5a9790a --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/Kconfig @@ -0,0 +1,7 @@ +config INFINIBAND_VMWARE_PVRDMA + tristate "VMware Paravirtualized RDMA Driver" + depends on NETDEVICES && ETHERNET && PCI && INET && VMXNET3 + ---help--- + This driver provides low-level support for VMware Paravirtual + RDMA adapter. It interacts with the VMXNet3 driver to provide + Ethernet capabilities. 
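+ To compile this driver as a module, choose M here: the module
+ will be called vmw_pvrdma.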
diff --git a/drivers/infiniband/hw/vmw_pvrdma/Makefile b/drivers/infiniband/hw/vmw_pvrdma/Makefile new file mode 100644 index 0000000..2f52e0a --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma.o + +vmw_pvrdma-y := pvrdma_cmd.o pvrdma_cq.o pvrdma_doorbell.o pvrdma_main.o pvrdma_misc.o pvrdma_mr.o pvrdma_qp.o pvrdma_srq.o pvrdma_verbs.o diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h new file mode 100644 index 0000000..44cb1cf --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h @@ -0,0 +1,523 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __PVRDMA_H__ +#define __PVRDMA_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pvrdma_ring.h" +#include "pvrdma_dev_api.h" +#include "pvrdma_verbs.h" + +/* NOT the same as BIT_MASK(). */ +#define PVRDMA_MASK(n) ((n << 1) - 1) + +/* + * VMware PVRDMA PCI device id. + */ +#define PCI_DEVICE_ID_VMWARE_PVRDMA 0x0820 + +#define PVRDMA_NUM_RING_PAGES 4 +#define PVRDMA_QP_NUM_HEADER_PAGES 1 + +struct pvrdma_dev; + +struct pvrdma_page_dir { + dma_addr_t dir_dma; + u64 *dir; + int ntables; + u64 **tables; + u64 npages; + void **pages; +}; + +struct pvrdma_cq { + struct ib_cq ibcq; + int offset; + spinlock_t cq_lock; /* Poll lock. 
*/ + struct pvrdma_uar_map *uar; + struct ib_umem *umem; + struct pvrdma_ring_state *ring_state; + struct pvrdma_page_dir pdir; + u32 cq_handle; + bool is_kernel; + refcount_t refcnt; + struct completion free; +}; + +struct pvrdma_id_table { + u32 last; + u32 top; + u32 max; + u32 mask; + spinlock_t lock; /* Table lock. */ + unsigned long *table; +}; + +struct pvrdma_uar_map { + unsigned long pfn; + void __iomem *map; + int index; +}; + +struct pvrdma_uar_table { + struct pvrdma_id_table tbl; + int size; +}; + +struct pvrdma_ucontext { + struct ib_ucontext ibucontext; + struct pvrdma_dev *dev; + struct pvrdma_uar_map uar; + u64 ctx_handle; +}; + +struct pvrdma_pd { + struct ib_pd ibpd; + u32 pdn; + u32 pd_handle; + int privileged; +}; + +struct pvrdma_mr { + u32 mr_handle; + u64 iova; + u64 size; +}; + +struct pvrdma_user_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct pvrdma_mr mmr; + struct pvrdma_page_dir pdir; + u64 *pages; + u32 npages; + u32 max_pages; + u32 page_shift; +}; + +struct pvrdma_wq { + struct pvrdma_ring *ring; + spinlock_t lock; /* Work queue lock. */ + int wqe_cnt; + int wqe_size; + int max_sg; + int offset; +}; + +struct pvrdma_ah { + struct ib_ah ibah; + struct pvrdma_av av; +}; + +struct pvrdma_srq { + struct ib_srq ibsrq; + int offset; + spinlock_t lock; /* SRQ lock. */ + int wqe_cnt; + int wqe_size; + int max_gs; + struct ib_umem *umem; + struct pvrdma_ring_state *ring; + struct pvrdma_page_dir pdir; + u32 srq_handle; + int npages; + refcount_t refcnt; + struct completion free; +}; + +struct pvrdma_qp { + struct ib_qp ibqp; + u32 qp_handle; + u32 qkey; + struct pvrdma_wq sq; + struct pvrdma_wq rq; + struct ib_umem *rumem; + struct ib_umem *sumem; + struct pvrdma_page_dir pdir; + struct pvrdma_srq *srq; + int npages; + int npages_send; + int npages_recv; + u32 flags; + u8 port; + u8 state; + bool is_kernel; + struct mutex mutex; /* QP state mutex. */ + refcount_t refcnt; + struct completion free; +}; + +struct pvrdma_dev { + /* PCI device-related information. */ + struct ib_device ib_dev; + struct pci_dev *pdev; + void __iomem *regs; + struct pvrdma_device_shared_region *dsr; /* Shared region pointer */ + dma_addr_t dsrbase; /* Shared region base address */ + void *cmd_slot; + void *resp_slot; + unsigned long flags; + struct list_head device_link; + unsigned int dsr_version; + + /* Locking and interrupt information. */ + spinlock_t cmd_lock; /* Command lock. */ + struct semaphore cmd_sema; + struct completion cmd_done; + unsigned int nr_vectors; + + /* RDMA-related device information. */ + union ib_gid *sgid_tbl; + struct pvrdma_ring_state *async_ring_state; + struct pvrdma_page_dir async_pdir; + struct pvrdma_ring_state *cq_ring_state; + struct pvrdma_page_dir cq_pdir; + struct pvrdma_cq **cq_tbl; + spinlock_t cq_tbl_lock; + struct pvrdma_srq **srq_tbl; + spinlock_t srq_tbl_lock; + struct pvrdma_qp **qp_tbl; + spinlock_t qp_tbl_lock; + struct pvrdma_uar_table uar_table; + struct pvrdma_uar_map driver_uar; + __be64 sys_image_guid; + spinlock_t desc_lock; /* Device modification lock. */ + u32 port_cap_mask; + struct mutex port_mutex; /* Port modification mutex. */ + bool ib_active; + atomic_t num_qps; + atomic_t num_cqs; + atomic_t num_srqs; + atomic_t num_pds; + atomic_t num_ahs; + + /* Network device information. 
*/ + struct net_device *netdev; + struct notifier_block nb_netdev; +}; + +struct pvrdma_netdevice_work { + struct work_struct work; + struct net_device *event_netdev; + unsigned long event; +}; + +static inline struct pvrdma_dev *to_vdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct pvrdma_dev, ib_dev); +} + +static inline struct +pvrdma_ucontext *to_vucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct pvrdma_ucontext, ibucontext); +} + +static inline struct pvrdma_pd *to_vpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct pvrdma_pd, ibpd); +} + +static inline struct pvrdma_cq *to_vcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct pvrdma_cq, ibcq); +} + +static inline struct pvrdma_srq *to_vsrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct pvrdma_srq, ibsrq); +} + +static inline struct pvrdma_user_mr *to_vmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct pvrdma_user_mr, ibmr); +} + +static inline struct pvrdma_qp *to_vqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct pvrdma_qp, ibqp); +} + +static inline struct pvrdma_ah *to_vah(struct ib_ah *ibah) +{ + return container_of(ibah, struct pvrdma_ah, ibah); +} + +static inline void pvrdma_write_reg(struct pvrdma_dev *dev, u32 reg, u32 val) +{ + writel(cpu_to_le32(val), dev->regs + reg); +} + +static inline u32 pvrdma_read_reg(struct pvrdma_dev *dev, u32 reg) +{ + return le32_to_cpu(readl(dev->regs + reg)); +} + +static inline void pvrdma_write_uar_cq(struct pvrdma_dev *dev, u32 val) +{ + writel(cpu_to_le32(val), dev->driver_uar.map + PVRDMA_UAR_CQ_OFFSET); +} + +static inline void pvrdma_write_uar_qp(struct pvrdma_dev *dev, u32 val) +{ + writel(cpu_to_le32(val), dev->driver_uar.map + PVRDMA_UAR_QP_OFFSET); +} + +static inline void *pvrdma_page_dir_get_ptr(struct pvrdma_page_dir *pdir, + u64 offset) +{ + return pdir->pages[offset / PAGE_SIZE] + (offset % PAGE_SIZE); +} + +static inline enum pvrdma_mtu ib_mtu_to_pvrdma(enum ib_mtu mtu) +{ + return (enum pvrdma_mtu)mtu; +} + +static inline enum ib_mtu pvrdma_mtu_to_ib(enum pvrdma_mtu mtu) +{ + return (enum ib_mtu)mtu; +} + +static inline enum pvrdma_port_state ib_port_state_to_pvrdma( + enum ib_port_state state) +{ + return (enum pvrdma_port_state)state; +} + +static inline enum ib_port_state pvrdma_port_state_to_ib( + enum pvrdma_port_state state) +{ + return (enum ib_port_state)state; +} + +static inline int ib_port_cap_flags_to_pvrdma(int flags) +{ + return flags & PVRDMA_MASK(PVRDMA_PORT_CAP_FLAGS_MAX); +} + +static inline int pvrdma_port_cap_flags_to_ib(int flags) +{ + return flags; +} + +static inline enum pvrdma_port_width ib_port_width_to_pvrdma( + enum ib_port_width width) +{ + return (enum pvrdma_port_width)width; +} + +static inline enum ib_port_width pvrdma_port_width_to_ib( + enum pvrdma_port_width width) +{ + return (enum ib_port_width)width; +} + +static inline enum pvrdma_port_speed ib_port_speed_to_pvrdma( + enum ib_port_speed speed) +{ + return (enum pvrdma_port_speed)speed; +} + +static inline enum ib_port_speed pvrdma_port_speed_to_ib( + enum pvrdma_port_speed speed) +{ + return (enum ib_port_speed)speed; +} + +static inline int pvrdma_qp_attr_mask_to_ib(int attr_mask) +{ + return attr_mask; +} + +static inline int ib_qp_attr_mask_to_pvrdma(int attr_mask) +{ + return attr_mask & PVRDMA_MASK(PVRDMA_QP_ATTR_MASK_MAX); +} + +static inline enum pvrdma_mig_state ib_mig_state_to_pvrdma( + enum ib_mig_state state) +{ + return (enum pvrdma_mig_state)state; +} + +static inline 
enum ib_mig_state pvrdma_mig_state_to_ib( + enum pvrdma_mig_state state) +{ + return (enum ib_mig_state)state; +} + +static inline int ib_access_flags_to_pvrdma(int flags) +{ + return flags; +} + +static inline int pvrdma_access_flags_to_ib(int flags) +{ + return flags & PVRDMA_MASK(PVRDMA_ACCESS_FLAGS_MAX); +} + +static inline enum pvrdma_qp_type ib_qp_type_to_pvrdma(enum ib_qp_type type) +{ + return (enum pvrdma_qp_type)type; +} + +static inline enum ib_qp_type pvrdma_qp_type_to_ib(enum pvrdma_qp_type type) +{ + return (enum ib_qp_type)type; +} + +static inline enum pvrdma_qp_state ib_qp_state_to_pvrdma(enum ib_qp_state state) +{ + return (enum pvrdma_qp_state)state; +} + +static inline enum ib_qp_state pvrdma_qp_state_to_ib(enum pvrdma_qp_state state) +{ + return (enum ib_qp_state)state; +} + +static inline enum pvrdma_wr_opcode ib_wr_opcode_to_pvrdma(enum ib_wr_opcode op) +{ + return (enum pvrdma_wr_opcode)op; +} + +static inline enum ib_wc_status pvrdma_wc_status_to_ib( + enum pvrdma_wc_status status) +{ + return (enum ib_wc_status)status; +} + +static inline int pvrdma_wc_opcode_to_ib(unsigned int opcode) +{ + switch (opcode) { + case PVRDMA_WC_SEND: + return IB_WC_SEND; + case PVRDMA_WC_RDMA_WRITE: + return IB_WC_RDMA_WRITE; + case PVRDMA_WC_RDMA_READ: + return IB_WC_RDMA_READ; + case PVRDMA_WC_COMP_SWAP: + return IB_WC_COMP_SWAP; + case PVRDMA_WC_FETCH_ADD: + return IB_WC_FETCH_ADD; + case PVRDMA_WC_LOCAL_INV: + return IB_WC_LOCAL_INV; + case PVRDMA_WC_FAST_REG_MR: + return IB_WC_REG_MR; + case PVRDMA_WC_MASKED_COMP_SWAP: + return IB_WC_MASKED_COMP_SWAP; + case PVRDMA_WC_MASKED_FETCH_ADD: + return IB_WC_MASKED_FETCH_ADD; + case PVRDMA_WC_RECV: + return IB_WC_RECV; + case PVRDMA_WC_RECV_RDMA_WITH_IMM: + return IB_WC_RECV_RDMA_WITH_IMM; + default: + return IB_WC_SEND; + } +} + +static inline int pvrdma_wc_flags_to_ib(int flags) +{ + return flags; +} + +static inline int ib_send_flags_to_pvrdma(int flags) +{ + return flags & PVRDMA_MASK(PVRDMA_SEND_FLAGS_MAX); +} + +void pvrdma_qp_cap_to_ib(struct ib_qp_cap *dst, + const struct pvrdma_qp_cap *src); +void ib_qp_cap_to_pvrdma(struct pvrdma_qp_cap *dst, + const struct ib_qp_cap *src); +void pvrdma_gid_to_ib(union ib_gid *dst, const union pvrdma_gid *src); +void ib_gid_to_pvrdma(union pvrdma_gid *dst, const union ib_gid *src); +void pvrdma_global_route_to_ib(struct ib_global_route *dst, + const struct pvrdma_global_route *src); +void ib_global_route_to_pvrdma(struct pvrdma_global_route *dst, + const struct ib_global_route *src); +void pvrdma_ah_attr_to_rdma(struct rdma_ah_attr *dst, + const struct pvrdma_ah_attr *src); +void rdma_ah_attr_to_pvrdma(struct pvrdma_ah_attr *dst, + const struct rdma_ah_attr *src); +u8 ib_gid_type_to_pvrdma(enum ib_gid_type gid_type); + +int pvrdma_uar_table_init(struct pvrdma_dev *dev); +void pvrdma_uar_table_cleanup(struct pvrdma_dev *dev); + +int pvrdma_uar_alloc(struct pvrdma_dev *dev, struct pvrdma_uar_map *uar); +void pvrdma_uar_free(struct pvrdma_dev *dev, struct pvrdma_uar_map *uar); + +void _pvrdma_flush_cqe(struct pvrdma_qp *qp, struct pvrdma_cq *cq); + +int pvrdma_page_dir_init(struct pvrdma_dev *dev, struct pvrdma_page_dir *pdir, + u64 npages, bool alloc_pages); +void pvrdma_page_dir_cleanup(struct pvrdma_dev *dev, + struct pvrdma_page_dir *pdir); +int pvrdma_page_dir_insert_dma(struct pvrdma_page_dir *pdir, u64 idx, + dma_addr_t daddr); +int pvrdma_page_dir_insert_umem(struct pvrdma_page_dir *pdir, + struct ib_umem *umem, u64 offset); +dma_addr_t pvrdma_page_dir_get_dma(struct pvrdma_page_dir 
*pdir, u64 idx); +int pvrdma_page_dir_insert_page_list(struct pvrdma_page_dir *pdir, + u64 *page_list, int num_pages); + +int pvrdma_cmd_post(struct pvrdma_dev *dev, union pvrdma_cmd_req *req, + union pvrdma_cmd_resp *rsp, unsigned resp_code); + +#endif /* __PVRDMA_H__ */ diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cmd.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cmd.c new file mode 100644 index 0000000..4a78c53 --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cmd.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include + +#include "pvrdma.h" + +#define PVRDMA_CMD_TIMEOUT 10000 /* ms */ + +static inline int pvrdma_cmd_recv(struct pvrdma_dev *dev, + union pvrdma_cmd_resp *resp, + unsigned resp_code) +{ + int err; + + dev_dbg(&dev->pdev->dev, "receive response from device\n"); + + err = wait_for_completion_interruptible_timeout(&dev->cmd_done, + msecs_to_jiffies(PVRDMA_CMD_TIMEOUT)); + if (err == 0 || err == -ERESTARTSYS) { + dev_warn(&dev->pdev->dev, + "completion timeout or interrupted\n"); + return -ETIMEDOUT; + } + + spin_lock(&dev->cmd_lock); + memcpy(resp, dev->resp_slot, sizeof(*resp)); + spin_unlock(&dev->cmd_lock); + + if (resp->hdr.ack != resp_code) { + dev_warn(&dev->pdev->dev, + "unknown response %#x expected %#x\n", + resp->hdr.ack, resp_code); + return -EFAULT; + } + + return 0; +} + +int +pvrdma_cmd_post(struct pvrdma_dev *dev, union pvrdma_cmd_req *req, + union pvrdma_cmd_resp *resp, unsigned resp_code) +{ + int err; + + dev_dbg(&dev->pdev->dev, "post request to device\n"); + + /* Serializiation */ + down(&dev->cmd_sema); + + BUILD_BUG_ON(sizeof(union pvrdma_cmd_req) != + sizeof(struct pvrdma_cmd_modify_qp)); + + spin_lock(&dev->cmd_lock); + memcpy(dev->cmd_slot, req, sizeof(*req)); + spin_unlock(&dev->cmd_lock); + + init_completion(&dev->cmd_done); + pvrdma_write_reg(dev, PVRDMA_REG_REQUEST, 0); + + /* Make sure the request is written before reading status. */ + mb(); + + err = pvrdma_read_reg(dev, PVRDMA_REG_ERR); + if (err == 0) { + if (resp != NULL) + err = pvrdma_cmd_recv(dev, resp, resp_code); + } else { + dev_warn(&dev->pdev->dev, + "failed to write request error reg: %d\n", err); + err = -EFAULT; + } + + up(&dev->cmd_sema); + + return err; +} diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c new file mode 100644 index 0000000..f95b976 --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
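pvrdma_cmd_post() serializes callers with cmd_sema, copies the request into the shared cmd_slot, writes PVRDMA_REG_REQUEST, and, when a response is expected, waits on cmd_done, which the interrupt-0 handler completes. A minimal caller sketch, modeled on the destroy-CQ path further down in this patch; the helper name is illustrative, the command constants come from pvrdma_dev_api.h:

/* Illustrative only: shows the calling convention for the command channel. */
static int destroy_cq_cmd_sketch(struct pvrdma_dev *dev, u32 cq_handle)
{
	union pvrdma_cmd_req req;
	struct pvrdma_cmd_destroy_cq *cmd = &req.destroy_cq;
	int ret;

	memset(cmd, 0, sizeof(*cmd));
	cmd->hdr.cmd = PVRDMA_CMD_DESTROY_CQ;
	cmd->cq_handle = cq_handle;

	/* No response payload is needed, so pass NULL and resp_code 0;
	 * commands that return data pass &rsp and a PVRDMA_CMD_*_RESP code. */
	ret = pvrdma_cmd_post(dev, &req, NULL, 0);
	if (ret < 0)
		dev_warn(&dev->pdev->dev, "destroy cq failed: %d\n", ret);
	return ret;
}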
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "pvrdma.h" + +/** + * pvrdma_req_notify_cq - request notification for a completion queue + * @ibcq: the completion queue + * @notify_flags: notification flags + * + * @return: 0 for success. + */ +int pvrdma_req_notify_cq(struct ib_cq *ibcq, + enum ib_cq_notify_flags notify_flags) +{ + struct pvrdma_dev *dev = to_vdev(ibcq->device); + struct pvrdma_cq *cq = to_vcq(ibcq); + u32 val = cq->cq_handle; + unsigned long flags; + int has_data = 0; + + val |= (notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + PVRDMA_UAR_CQ_ARM_SOL : PVRDMA_UAR_CQ_ARM; + + spin_lock_irqsave(&cq->cq_lock, flags); + + pvrdma_write_uar_cq(dev, val); + + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + unsigned int head; + + has_data = pvrdma_idx_ring_has_data(&cq->ring_state->rx, + cq->ibcq.cqe, &head); + if (unlikely(has_data == PVRDMA_INVALID_IDX)) + dev_err(&dev->pdev->dev, "CQ ring state invalid\n"); + } + + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return has_data; +} + +/** + * pvrdma_create_cq - create completion queue + * @ibdev: the device + * @attr: completion queue attributes + * @context: user context + * @udata: user data + * + * @return: ib_cq completion queue pointer on success, + * otherwise returns negative errno. + */ +struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + struct pvrdma_dev *dev = to_vdev(ibdev); + struct pvrdma_cq *cq; + int ret; + int npages; + unsigned long flags; + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_create_cq *cmd = &req.create_cq; + struct pvrdma_cmd_create_cq_resp *resp = &rsp.create_cq_resp; + struct pvrdma_create_cq_resp cq_resp = {0}; + struct pvrdma_create_cq ucmd; + + BUILD_BUG_ON(sizeof(struct pvrdma_cqe) != 64); + + entries = roundup_pow_of_two(entries); + if (entries < 1 || entries > dev->dsr->caps.max_cqe) + return ERR_PTR(-EINVAL); + + if (!atomic_add_unless(&dev->num_cqs, 1, dev->dsr->caps.max_cq)) + return ERR_PTR(-ENOMEM); + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + atomic_dec(&dev->num_cqs); + return ERR_PTR(-ENOMEM); + } + + cq->ibcq.cqe = entries; + cq->is_kernel = !context; + + if (!cq->is_kernel) { + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + ret = -EFAULT; + goto err_cq; + } + + cq->umem = ib_umem_get(context, ucmd.buf_addr, ucmd.buf_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(cq->umem)) { + ret = PTR_ERR(cq->umem); + goto err_cq; + } + + npages = ib_umem_page_count(cq->umem); + } else { + /* One extra page for shared ring state */ + npages = 1 + (entries * sizeof(struct pvrdma_cqe) + + PAGE_SIZE - 1) / PAGE_SIZE; + + /* Skip header page. 
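For a kernel CQ the ring is sized from the power-of-two rounded entry count at 64 bytes per CQE (enforced by the BUILD_BUG_ON), plus one leading page that holds the shared ring state; cq->offset = PAGE_SIZE then skips that header page in get_cqe(). A standalone arithmetic check, not part of the patch, assuming 4 KiB pages and a hypothetical request for 1000 CQEs:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	4096u
#define CQE_SIZE	64u	/* BUILD_BUG_ON(sizeof(struct pvrdma_cqe) != 64) */

/* tiny stand-in for the kernel's roundup_pow_of_two() */
static unsigned int pow2_roundup(unsigned int v)
{
	unsigned int p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int entries = pow2_roundup(1000);	/* -> 1024 */
	unsigned int npages = 1 + (entries * CQE_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;

	assert(entries == 1024);
	/* 1024 * 64 = 64 KiB of CQEs = 16 data pages, plus 1 ring-state page */
	assert(npages == 17);
	printf("entries=%u npages=%u\n", entries, npages);
	return 0;
}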
*/ + cq->offset = PAGE_SIZE; + } + + if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) { + dev_warn(&dev->pdev->dev, + "overflow pages in completion queue\n"); + ret = -EINVAL; + goto err_umem; + } + + ret = pvrdma_page_dir_init(dev, &cq->pdir, npages, cq->is_kernel); + if (ret) { + dev_warn(&dev->pdev->dev, + "could not allocate page directory\n"); + goto err_umem; + } + + /* Ring state is always the first page. Set in library for user cq. */ + if (cq->is_kernel) + cq->ring_state = cq->pdir.pages[0]; + else + pvrdma_page_dir_insert_umem(&cq->pdir, cq->umem, 0); + + refcount_set(&cq->refcnt, 1); + init_completion(&cq->free); + spin_lock_init(&cq->cq_lock); + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_CREATE_CQ; + cmd->nchunks = npages; + cmd->ctx_handle = (context) ? + (u64)to_vucontext(context)->ctx_handle : 0; + cmd->cqe = entries; + cmd->pdir_dma = cq->pdir.dir_dma; + ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_CQ_RESP); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not create completion queue, error: %d\n", ret); + goto err_page_dir; + } + + cq->ibcq.cqe = resp->cqe; + cq->cq_handle = resp->cq_handle; + cq_resp.cqn = resp->cq_handle; + spin_lock_irqsave(&dev->cq_tbl_lock, flags); + dev->cq_tbl[cq->cq_handle % dev->dsr->caps.max_cq] = cq; + spin_unlock_irqrestore(&dev->cq_tbl_lock, flags); + + if (!cq->is_kernel) { + cq->uar = &(to_vucontext(context)->uar); + + /* Copy udata back. */ + if (ib_copy_to_udata(udata, &cq_resp, sizeof(cq_resp))) { + dev_warn(&dev->pdev->dev, + "failed to copy back udata\n"); + pvrdma_destroy_cq(&cq->ibcq); + return ERR_PTR(-EINVAL); + } + } + + return &cq->ibcq; + +err_page_dir: + pvrdma_page_dir_cleanup(dev, &cq->pdir); +err_umem: + if (!cq->is_kernel) + ib_umem_release(cq->umem); +err_cq: + atomic_dec(&dev->num_cqs); + kfree(cq); + + return ERR_PTR(ret); +} + +static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) +{ + if (refcount_dec_and_test(&cq->refcnt)) + complete(&cq->free); + wait_for_completion(&cq->free); + + if (!cq->is_kernel) + ib_umem_release(cq->umem); + + pvrdma_page_dir_cleanup(dev, &cq->pdir); + kfree(cq); +} + +/** + * pvrdma_destroy_cq - destroy completion queue + * @cq: the completion queue to destroy. + * + * @return: 0 for success. + */ +int pvrdma_destroy_cq(struct ib_cq *cq) +{ + struct pvrdma_cq *vcq = to_vcq(cq); + union pvrdma_cmd_req req; + struct pvrdma_cmd_destroy_cq *cmd = &req.destroy_cq; + struct pvrdma_dev *dev = to_vdev(cq->device); + unsigned long flags; + int ret; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_DESTROY_CQ; + cmd->cq_handle = vcq->cq_handle; + + ret = pvrdma_cmd_post(dev, &req, NULL, 0); + if (ret < 0) + dev_warn(&dev->pdev->dev, + "could not destroy completion queue, error: %d\n", + ret); + + /* free cq's resources */ + spin_lock_irqsave(&dev->cq_tbl_lock, flags); + dev->cq_tbl[vcq->cq_handle] = NULL; + spin_unlock_irqrestore(&dev->cq_tbl_lock, flags); + + pvrdma_free_cq(dev, vcq); + atomic_dec(&dev->num_cqs); + + return ret; +} + +/** + * pvrdma_modify_cq - modify the CQ moderation parameters + * @ibcq: the CQ to modify + * @cq_count: number of CQEs that will trigger an event + * @cq_period: max period of time in usec before triggering an event + * + * @return: -EOPNOTSUPP as CQ resize is not supported. 
+ */ +int pvrdma_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + return -EOPNOTSUPP; +} + +static inline struct pvrdma_cqe *get_cqe(struct pvrdma_cq *cq, int i) +{ + return (struct pvrdma_cqe *)pvrdma_page_dir_get_ptr( + &cq->pdir, + cq->offset + + sizeof(struct pvrdma_cqe) * i); +} + +void _pvrdma_flush_cqe(struct pvrdma_qp *qp, struct pvrdma_cq *cq) +{ + unsigned int head; + int has_data; + + if (!cq->is_kernel) + return; + + /* Lock held */ + has_data = pvrdma_idx_ring_has_data(&cq->ring_state->rx, + cq->ibcq.cqe, &head); + if (unlikely(has_data > 0)) { + int items; + int curr; + int tail = pvrdma_idx(&cq->ring_state->rx.prod_tail, + cq->ibcq.cqe); + struct pvrdma_cqe *cqe; + struct pvrdma_cqe *curr_cqe; + + items = (tail > head) ? (tail - head) : + (cq->ibcq.cqe - head + tail); + curr = --tail; + while (items-- > 0) { + if (curr < 0) + curr = cq->ibcq.cqe - 1; + if (tail < 0) + tail = cq->ibcq.cqe - 1; + curr_cqe = get_cqe(cq, curr); + if ((curr_cqe->qp & 0xFFFF) != qp->qp_handle) { + if (curr != tail) { + cqe = get_cqe(cq, tail); + *cqe = *curr_cqe; + } + tail--; + } else { + pvrdma_idx_ring_inc( + &cq->ring_state->rx.cons_head, + cq->ibcq.cqe); + } + curr--; + } + } +} + +static int pvrdma_poll_one(struct pvrdma_cq *cq, struct pvrdma_qp **cur_qp, + struct ib_wc *wc) +{ + struct pvrdma_dev *dev = to_vdev(cq->ibcq.device); + int has_data; + unsigned int head; + bool tried = false; + struct pvrdma_cqe *cqe; + +retry: + has_data = pvrdma_idx_ring_has_data(&cq->ring_state->rx, + cq->ibcq.cqe, &head); + if (has_data == 0) { + if (tried) + return -EAGAIN; + + pvrdma_write_uar_cq(dev, cq->cq_handle | PVRDMA_UAR_CQ_POLL); + + tried = true; + goto retry; + } else if (has_data == PVRDMA_INVALID_IDX) { + dev_err(&dev->pdev->dev, "CQ ring state invalid\n"); + return -EAGAIN; + } + + cqe = get_cqe(cq, head); + + /* Ensure cqe is valid. 
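_pvrdma_flush_cqe() walks the live CQEs backwards from the producer tail and squeezes out the ones that belong to the QP being flushed, keeping completions for other QPs in place. The count of live entries on the circular ring uses the usual wraparound formula; a standalone check of that formula (the helper name is illustrative), covering both the wrapped and unwrapped cases:

#include <assert.h>

/* Live entries between cons head and prod tail on a ring of 'cqe' slots,
 * exactly as computed in _pvrdma_flush_cqe(). */
static int ring_items(int head, int tail, int cqe)
{
	return (tail > head) ? (tail - head) : (cqe - head + tail);
}

int main(void)
{
	assert(ring_items(2, 5, 8) == 3);	/* no wrap: slots 2,3,4 */
	assert(ring_items(6, 2, 8) == 4);	/* wrapped: slots 6,7,0,1 */
	return 0;
}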
*/ + rmb(); + if (dev->qp_tbl[cqe->qp & 0xffff]) + *cur_qp = (struct pvrdma_qp *)dev->qp_tbl[cqe->qp & 0xffff]; + else + return -EAGAIN; + + wc->opcode = pvrdma_wc_opcode_to_ib(cqe->opcode); + wc->status = pvrdma_wc_status_to_ib(cqe->status); + wc->wr_id = cqe->wr_id; + wc->qp = &(*cur_qp)->ibqp; + wc->byte_len = cqe->byte_len; + wc->ex.imm_data = cqe->imm_data; + wc->src_qp = cqe->src_qp; + wc->wc_flags = pvrdma_wc_flags_to_ib(cqe->wc_flags); + wc->pkey_index = cqe->pkey_index; + wc->slid = cqe->slid; + wc->sl = cqe->sl; + wc->dlid_path_bits = cqe->dlid_path_bits; + wc->port_num = cqe->port_num; + wc->vendor_err = cqe->vendor_err; + wc->network_hdr_type = cqe->network_hdr_type; + + /* Update shared ring state */ + pvrdma_idx_ring_inc(&cq->ring_state->rx.cons_head, cq->ibcq.cqe); + + return 0; +} + +/** + * pvrdma_poll_cq - poll for work completion queue entries + * @ibcq: completion queue + * @num_entries: the maximum number of entries + * @entry: pointer to work completion array + * + * @return: number of polled completion entries + */ +int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct pvrdma_cq *cq = to_vcq(ibcq); + struct pvrdma_qp *cur_qp = NULL; + unsigned long flags; + int npolled; + + if (num_entries < 1 || wc == NULL) + return 0; + + spin_lock_irqsave(&cq->cq_lock, flags); + for (npolled = 0; npolled < num_entries; ++npolled) { + if (pvrdma_poll_one(cq, &cur_qp, wc + npolled)) + break; + } + + spin_unlock_irqrestore(&cq->cq_lock, flags); + + /* Ensure we do not return errors from poll_cq */ + return npolled; +} + +/** + * pvrdma_resize_cq - resize CQ + * @ibcq: the completion queue + * @entries: CQ entries + * @udata: user data + * + * @return: -EOPNOTSUPP as CQ resize is not supported. + */ +int pvrdma_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + return -EOPNOTSUPP; +} diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h new file mode 100644 index 0000000..6fd5a8f --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h @@ -0,0 +1,667 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
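pvrdma_poll_cq() deliberately never returns an error, only the number of work completions it filled in, so consumers drain it with the standard ib_poll_cq() loop and inspect each wc status themselves. A minimal kernel-style consumer sketch; the batch size and the handle_wc callback are illustrative, not part of this patch:

#include <rdma/ib_verbs.h>

#define BATCH 16	/* illustrative batch size */

/* Drain a CQ; handle_wc() stands in for whatever the ULP does per entry. */
static void drain_cq_sketch(struct ib_cq *cq,
			    void (*handle_wc)(struct ib_wc *wc))
{
	struct ib_wc wc[BATCH];
	int n, i;

	while ((n = ib_poll_cq(cq, BATCH, wc)) > 0) {
		for (i = 0; i < n; i++) {
			if (wc[i].status != IB_WC_SUCCESS)
				pr_warn("wc %llu failed: %d\n",
					(unsigned long long)wc[i].wr_id,
					wc[i].status);
			handle_wc(&wc[i]);
		}
	}
}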
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __PVRDMA_DEV_API_H__ +#define __PVRDMA_DEV_API_H__ + +#include + +#include "pvrdma_verbs.h" + +/* + * PVRDMA version macros. Some new features require updates to PVRDMA_VERSION. + * These macros allow us to check for different features if necessary. + */ + +#define PVRDMA_ROCEV1_VERSION 17 +#define PVRDMA_ROCEV2_VERSION 18 +#define PVRDMA_VERSION PVRDMA_ROCEV2_VERSION + +#define PVRDMA_BOARD_ID 1 +#define PVRDMA_REV_ID 1 + +/* + * Masks and accessors for page directory, which is a two-level lookup: + * page directory -> page table -> page. Only one directory for now, but we + * could expand that easily. 9 bits for tables, 9 bits for pages, gives one + * gigabyte for memory regions and so forth. + */ + +#define PVRDMA_PDIR_SHIFT 18 +#define PVRDMA_PTABLE_SHIFT 9 +#define PVRDMA_PAGE_DIR_DIR(x) (((x) >> PVRDMA_PDIR_SHIFT) & 0x1) +#define PVRDMA_PAGE_DIR_TABLE(x) (((x) >> PVRDMA_PTABLE_SHIFT) & 0x1ff) +#define PVRDMA_PAGE_DIR_PAGE(x) ((x) & 0x1ff) +#define PVRDMA_PAGE_DIR_MAX_PAGES (1 * 512 * 512) +#define PVRDMA_MAX_FAST_REG_PAGES 128 + +/* + * Max MSI-X vectors. + */ + +#define PVRDMA_MAX_INTERRUPTS 3 + +/* Register offsets within PCI resource on BAR1. */ +#define PVRDMA_REG_VERSION 0x00 /* R: Version of device. */ +#define PVRDMA_REG_DSRLOW 0x04 /* W: Device shared region low PA. */ +#define PVRDMA_REG_DSRHIGH 0x08 /* W: Device shared region high PA. */ +#define PVRDMA_REG_CTL 0x0c /* W: PVRDMA_DEVICE_CTL */ +#define PVRDMA_REG_REQUEST 0x10 /* W: Indicate device request. */ +#define PVRDMA_REG_ERR 0x14 /* R: Device error. */ +#define PVRDMA_REG_ICR 0x18 /* R: Interrupt cause. */ +#define PVRDMA_REG_IMR 0x1c /* R/W: Interrupt mask. */ +#define PVRDMA_REG_MACL 0x20 /* R/W: MAC address low. */ +#define PVRDMA_REG_MACH 0x24 /* R/W: MAC address high. */ + +/* Object flags. */ +#define PVRDMA_CQ_FLAG_ARMED_SOL BIT(0) /* Armed for solicited-only. */ +#define PVRDMA_CQ_FLAG_ARMED BIT(1) /* Armed. */ +#define PVRDMA_MR_FLAG_DMA BIT(0) /* DMA region. */ +#define PVRDMA_MR_FLAG_FRMR BIT(1) /* Fast reg memory region. */ + +/* + * Atomic operation capability (masked versions are extended atomic + * operations. + */ + +#define PVRDMA_ATOMIC_OP_COMP_SWAP BIT(0) /* Compare and swap. */ +#define PVRDMA_ATOMIC_OP_FETCH_ADD BIT(1) /* Fetch and add. */ +#define PVRDMA_ATOMIC_OP_MASK_COMP_SWAP BIT(2) /* Masked compare and swap. */ +#define PVRDMA_ATOMIC_OP_MASK_FETCH_ADD BIT(3) /* Masked fetch and add. */ + +/* + * Base Memory Management Extension flags to support Fast Reg Memory Regions + * and Fast Reg Work Requests. Each flag represents a verb operation and we + * must support all of them to qualify for the BMME device cap. + */ + +#define PVRDMA_BMME_FLAG_LOCAL_INV BIT(0) /* Local Invalidate. 
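The page-directory macros above split a flat page index into a table slot and a page slot of 9 bits each, which with 4 KiB pages caps a single directory at 512 * 512 pages, i.e. the 1 GiB the comment mentions. A standalone check, not part of the patch, with the macros copied verbatim and an arbitrary example index:

#include <assert.h>
#include <stdio.h>

#define PVRDMA_PDIR_SHIFT		18
#define PVRDMA_PTABLE_SHIFT		9
#define PVRDMA_PAGE_DIR_DIR(x)		(((x) >> PVRDMA_PDIR_SHIFT) & 0x1)
#define PVRDMA_PAGE_DIR_TABLE(x)	(((x) >> PVRDMA_PTABLE_SHIFT) & 0x1ff)
#define PVRDMA_PAGE_DIR_PAGE(x)		((x) & 0x1ff)
#define PVRDMA_PAGE_DIR_MAX_PAGES	(1 * 512 * 512)

int main(void)
{
	unsigned long idx = 1234;	/* arbitrary page index */

	/* 1234 = 2 * 512 + 210: directory 0, table 2, page 210. */
	assert(PVRDMA_PAGE_DIR_DIR(idx) == 0);
	assert(PVRDMA_PAGE_DIR_TABLE(idx) == 2);
	assert(PVRDMA_PAGE_DIR_PAGE(idx) == 210);

	/* 512 tables * 512 pages * 4 KiB = 1 GiB per directory. */
	printf("max bytes: %lu\n",
	       (unsigned long)PVRDMA_PAGE_DIR_MAX_PAGES * 4096);
	return 0;
}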
*/ +#define PVRDMA_BMME_FLAG_REMOTE_INV BIT(1) /* Remote Invalidate. */ +#define PVRDMA_BMME_FLAG_FAST_REG_WR BIT(2) /* Fast Reg Work Request. */ + +/* + * GID types. The interpretation of the gid_types bit field in the device + * capabilities will depend on the device mode. For now, the device only + * supports RoCE as mode, so only the different GID types for RoCE are + * defined. + */ + +#define PVRDMA_GID_TYPE_FLAG_ROCE_V1 BIT(0) +#define PVRDMA_GID_TYPE_FLAG_ROCE_V2 BIT(1) + +/* + * Version checks. This checks whether each version supports specific + * capabilities from the device. + */ + +#define PVRDMA_IS_VERSION17(_dev) \ + (_dev->dsr_version == PVRDMA_ROCEV1_VERSION && \ + _dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V1) + +#define PVRDMA_IS_VERSION18(_dev) \ + (_dev->dsr_version >= PVRDMA_ROCEV2_VERSION && \ + (_dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V1 || \ + _dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V2)) \ + +#define PVRDMA_SUPPORTED(_dev) \ + ((_dev->dsr->caps.mode == PVRDMA_DEVICE_MODE_ROCE) && \ + (PVRDMA_IS_VERSION17(_dev) || PVRDMA_IS_VERSION18(_dev))) + +/* + * Get capability values based on device version. + */ + +#define PVRDMA_GET_CAP(_dev, _old_val, _val) \ + ((PVRDMA_IS_VERSION18(_dev)) ? _val : _old_val) + +enum pvrdma_pci_resource { + PVRDMA_PCI_RESOURCE_MSIX, /* BAR0: MSI-X, MMIO. */ + PVRDMA_PCI_RESOURCE_REG, /* BAR1: Registers, MMIO. */ + PVRDMA_PCI_RESOURCE_UAR, /* BAR2: UAR pages, MMIO, 64-bit. */ + PVRDMA_PCI_RESOURCE_LAST, /* Last. */ +}; + +enum pvrdma_device_ctl { + PVRDMA_DEVICE_CTL_ACTIVATE, /* Activate device. */ + PVRDMA_DEVICE_CTL_UNQUIESCE, /* Unquiesce device. */ + PVRDMA_DEVICE_CTL_RESET, /* Reset device. */ +}; + +enum pvrdma_intr_vector { + PVRDMA_INTR_VECTOR_RESPONSE, /* Command response. */ + PVRDMA_INTR_VECTOR_ASYNC, /* Async events. */ + PVRDMA_INTR_VECTOR_CQ, /* CQ notification. */ + /* Additional CQ notification vectors. */ +}; + +enum pvrdma_intr_cause { + PVRDMA_INTR_CAUSE_RESPONSE = (1 << PVRDMA_INTR_VECTOR_RESPONSE), + PVRDMA_INTR_CAUSE_ASYNC = (1 << PVRDMA_INTR_VECTOR_ASYNC), + PVRDMA_INTR_CAUSE_CQ = (1 << PVRDMA_INTR_VECTOR_CQ), +}; + +enum pvrdma_gos_bits { + PVRDMA_GOS_BITS_UNK, /* Unknown. */ + PVRDMA_GOS_BITS_32, /* 32-bit. */ + PVRDMA_GOS_BITS_64, /* 64-bit. */ +}; + +enum pvrdma_gos_type { + PVRDMA_GOS_TYPE_UNK, /* Unknown. */ + PVRDMA_GOS_TYPE_LINUX, /* Linux. */ +}; + +enum pvrdma_device_mode { + PVRDMA_DEVICE_MODE_ROCE, /* RoCE. */ + PVRDMA_DEVICE_MODE_IWARP, /* iWarp. */ + PVRDMA_DEVICE_MODE_IB, /* InfiniBand. */ +}; + +struct pvrdma_gos_info { + u32 gos_bits:2; /* W: PVRDMA_GOS_BITS_ */ + u32 gos_type:4; /* W: PVRDMA_GOS_TYPE_ */ + u32 gos_ver:16; /* W: Guest OS version. */ + u32 gos_misc:10; /* W: Other. */ + u32 pad; /* Pad to 8-byte alignment. */ +}; + +struct pvrdma_device_caps { + u64 fw_ver; /* R: Query device. */ + __be64 node_guid; + __be64 sys_image_guid; + u64 max_mr_size; + u64 page_size_cap; + u64 atomic_arg_sizes; /* EX verbs. */ + u32 ex_comp_mask; /* EX verbs. */ + u32 device_cap_flags2; /* EX verbs. */ + u32 max_fa_bit_boundary; /* EX verbs. */ + u32 log_max_atomic_inline_arg; /* EX verbs. 
*/ + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + u32 max_qp; + u32 max_qp_wr; + u32 device_cap_flags; + u32 max_sge; + u32 max_sge_rd; + u32 max_cq; + u32 max_cqe; + u32 max_mr; + u32 max_pd; + u32 max_qp_rd_atom; + u32 max_ee_rd_atom; + u32 max_res_rd_atom; + u32 max_qp_init_rd_atom; + u32 max_ee_init_rd_atom; + u32 max_ee; + u32 max_rdd; + u32 max_mw; + u32 max_raw_ipv6_qp; + u32 max_raw_ethy_qp; + u32 max_mcast_grp; + u32 max_mcast_qp_attach; + u32 max_total_mcast_qp_attach; + u32 max_ah; + u32 max_fmr; + u32 max_map_per_fmr; + u32 max_srq; + u32 max_srq_wr; + u32 max_srq_sge; + u32 max_uar; + u32 gid_tbl_len; + u16 max_pkeys; + u8 local_ca_ack_delay; + u8 phys_port_cnt; + u8 mode; /* PVRDMA_DEVICE_MODE_ */ + u8 atomic_ops; /* PVRDMA_ATOMIC_OP_* bits */ + u8 bmme_flags; /* FRWR Mem Mgmt Extensions */ + u8 gid_types; /* PVRDMA_GID_TYPE_FLAG_ */ + u32 max_fast_reg_page_list_len; +}; + +struct pvrdma_ring_page_info { + u32 num_pages; /* Num pages incl. header. */ + u32 reserved; /* Reserved. */ + u64 pdir_dma; /* Page directory PA. */ +}; + +#pragma pack(push, 1) + +struct pvrdma_device_shared_region { + u32 driver_version; /* W: Driver version. */ + u32 pad; /* Pad to 8-byte align. */ + struct pvrdma_gos_info gos_info; /* W: Guest OS information. */ + u64 cmd_slot_dma; /* W: Command slot address. */ + u64 resp_slot_dma; /* W: Response slot address. */ + struct pvrdma_ring_page_info async_ring_pages; + /* W: Async ring page info. */ + struct pvrdma_ring_page_info cq_ring_pages; + /* W: CQ ring page info. */ + u32 uar_pfn; /* W: UAR pageframe. */ + u32 pad2; /* Pad to 8-byte align. */ + struct pvrdma_device_caps caps; /* R: Device capabilities. */ +}; + +#pragma pack(pop) + +/* Event types. Currently a 1:1 mapping with enum ib_event. */ +enum pvrdma_eqe_type { + PVRDMA_EVENT_CQ_ERR, + PVRDMA_EVENT_QP_FATAL, + PVRDMA_EVENT_QP_REQ_ERR, + PVRDMA_EVENT_QP_ACCESS_ERR, + PVRDMA_EVENT_COMM_EST, + PVRDMA_EVENT_SQ_DRAINED, + PVRDMA_EVENT_PATH_MIG, + PVRDMA_EVENT_PATH_MIG_ERR, + PVRDMA_EVENT_DEVICE_FATAL, + PVRDMA_EVENT_PORT_ACTIVE, + PVRDMA_EVENT_PORT_ERR, + PVRDMA_EVENT_LID_CHANGE, + PVRDMA_EVENT_PKEY_CHANGE, + PVRDMA_EVENT_SM_CHANGE, + PVRDMA_EVENT_SRQ_ERR, + PVRDMA_EVENT_SRQ_LIMIT_REACHED, + PVRDMA_EVENT_QP_LAST_WQE_REACHED, + PVRDMA_EVENT_CLIENT_REREGISTER, + PVRDMA_EVENT_GID_CHANGE, +}; + +/* Event queue element. */ +struct pvrdma_eqe { + u32 type; /* Event type. */ + u32 info; /* Handle, other. */ +}; + +/* CQ notification queue element. 
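Everything in this header is a guest/hypervisor ABI: the shared region is packed to byte granularity and the queue elements are plain fixed-width fields, so their sizes and offsets must not drift. A standalone sketch of the kind of layout check this implies, mirroring only the two trivially sized queue elements (the mirrored structs here are illustrative copies, not the driver's own):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Mirrors struct pvrdma_eqe / struct pvrdma_cqne from pvrdma_dev_api.h. */
struct pvrdma_eqe {
	uint32_t type;	/* event type */
	uint32_t info;	/* handle, other */
};

struct pvrdma_cqne {
	uint32_t info;	/* handle */
};

int main(void)
{
	/* The async and CQ-notification rings carry these raw structures,
	 * so their layout is part of the device ABI. */
	static_assert(sizeof(struct pvrdma_eqe) == 8, "eqe layout");
	static_assert(offsetof(struct pvrdma_eqe, info) == 4, "eqe info offset");
	static_assert(sizeof(struct pvrdma_cqne) == 4, "cqne layout");
	return 0;
}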
*/ +struct pvrdma_cqne { + u32 info; /* Handle */ +}; + +enum { + PVRDMA_CMD_FIRST, + PVRDMA_CMD_QUERY_PORT = PVRDMA_CMD_FIRST, + PVRDMA_CMD_QUERY_PKEY, + PVRDMA_CMD_CREATE_PD, + PVRDMA_CMD_DESTROY_PD, + PVRDMA_CMD_CREATE_MR, + PVRDMA_CMD_DESTROY_MR, + PVRDMA_CMD_CREATE_CQ, + PVRDMA_CMD_RESIZE_CQ, + PVRDMA_CMD_DESTROY_CQ, + PVRDMA_CMD_CREATE_QP, + PVRDMA_CMD_MODIFY_QP, + PVRDMA_CMD_QUERY_QP, + PVRDMA_CMD_DESTROY_QP, + PVRDMA_CMD_CREATE_UC, + PVRDMA_CMD_DESTROY_UC, + PVRDMA_CMD_CREATE_BIND, + PVRDMA_CMD_DESTROY_BIND, + PVRDMA_CMD_CREATE_SRQ, + PVRDMA_CMD_MODIFY_SRQ, + PVRDMA_CMD_QUERY_SRQ, + PVRDMA_CMD_DESTROY_SRQ, + PVRDMA_CMD_MAX, +}; + +enum { + PVRDMA_CMD_FIRST_RESP = (1 << 31), + PVRDMA_CMD_QUERY_PORT_RESP = PVRDMA_CMD_FIRST_RESP, + PVRDMA_CMD_QUERY_PKEY_RESP, + PVRDMA_CMD_CREATE_PD_RESP, + PVRDMA_CMD_DESTROY_PD_RESP_NOOP, + PVRDMA_CMD_CREATE_MR_RESP, + PVRDMA_CMD_DESTROY_MR_RESP_NOOP, + PVRDMA_CMD_CREATE_CQ_RESP, + PVRDMA_CMD_RESIZE_CQ_RESP, + PVRDMA_CMD_DESTROY_CQ_RESP_NOOP, + PVRDMA_CMD_CREATE_QP_RESP, + PVRDMA_CMD_MODIFY_QP_RESP, + PVRDMA_CMD_QUERY_QP_RESP, + PVRDMA_CMD_DESTROY_QP_RESP, + PVRDMA_CMD_CREATE_UC_RESP, + PVRDMA_CMD_DESTROY_UC_RESP_NOOP, + PVRDMA_CMD_CREATE_BIND_RESP_NOOP, + PVRDMA_CMD_DESTROY_BIND_RESP_NOOP, + PVRDMA_CMD_CREATE_SRQ_RESP, + PVRDMA_CMD_MODIFY_SRQ_RESP, + PVRDMA_CMD_QUERY_SRQ_RESP, + PVRDMA_CMD_DESTROY_SRQ_RESP, + PVRDMA_CMD_MAX_RESP, +}; + +struct pvrdma_cmd_hdr { + u64 response; /* Key for response lookup. */ + u32 cmd; /* PVRDMA_CMD_ */ + u32 reserved; /* Reserved. */ +}; + +struct pvrdma_cmd_resp_hdr { + u64 response; /* From cmd hdr. */ + u32 ack; /* PVRDMA_CMD_XXX_RESP */ + u8 err; /* Error. */ + u8 reserved[3]; /* Reserved. */ +}; + +struct pvrdma_cmd_query_port { + struct pvrdma_cmd_hdr hdr; + u8 port_num; + u8 reserved[7]; +}; + +struct pvrdma_cmd_query_port_resp { + struct pvrdma_cmd_resp_hdr hdr; + struct pvrdma_port_attr attrs; +}; + +struct pvrdma_cmd_query_pkey { + struct pvrdma_cmd_hdr hdr; + u8 port_num; + u8 index; + u8 reserved[6]; +}; + +struct pvrdma_cmd_query_pkey_resp { + struct pvrdma_cmd_resp_hdr hdr; + u16 pkey; + u8 reserved[6]; +}; + +struct pvrdma_cmd_create_uc { + struct pvrdma_cmd_hdr hdr; + u32 pfn; /* UAR page frame number */ + u8 reserved[4]; +}; + +struct pvrdma_cmd_create_uc_resp { + struct pvrdma_cmd_resp_hdr hdr; + u32 ctx_handle; + u8 reserved[4]; +}; + +struct pvrdma_cmd_destroy_uc { + struct pvrdma_cmd_hdr hdr; + u32 ctx_handle; + u8 reserved[4]; +}; + +struct pvrdma_cmd_create_pd { + struct pvrdma_cmd_hdr hdr; + u32 ctx_handle; + u8 reserved[4]; +}; + +struct pvrdma_cmd_create_pd_resp { + struct pvrdma_cmd_resp_hdr hdr; + u32 pd_handle; + u8 reserved[4]; +}; + +struct pvrdma_cmd_destroy_pd { + struct pvrdma_cmd_hdr hdr; + u32 pd_handle; + u8 reserved[4]; +}; + +struct pvrdma_cmd_create_mr { + struct pvrdma_cmd_hdr hdr; + u64 start; + u64 length; + u64 pdir_dma; + u32 pd_handle; + u32 access_flags; + u32 flags; + u32 nchunks; +}; + +struct pvrdma_cmd_create_mr_resp { + struct pvrdma_cmd_resp_hdr hdr; + u32 mr_handle; + u32 lkey; + u32 rkey; + u8 reserved[4]; +}; + +struct pvrdma_cmd_destroy_mr { + struct pvrdma_cmd_hdr hdr; + u32 mr_handle; + u8 reserved[4]; +}; + +struct pvrdma_cmd_create_cq { + struct pvrdma_cmd_hdr hdr; + u64 pdir_dma; + u32 ctx_handle; + u32 cqe; + u32 nchunks; + u8 reserved[4]; +}; + +struct pvrdma_cmd_create_cq_resp { + struct pvrdma_cmd_resp_hdr hdr; + u32 cq_handle; + u32 cqe; +}; + +struct pvrdma_cmd_resize_cq { + struct pvrdma_cmd_hdr hdr; + u32 cq_handle; + u32 cqe; +}; + +struct 
pvrdma_cmd_resize_cq_resp { + struct pvrdma_cmd_resp_hdr hdr; + u32 cqe; + u8 reserved[4]; +}; + +struct pvrdma_cmd_destroy_cq { + struct pvrdma_cmd_hdr hdr; + u32 cq_handle; + u8 reserved[4]; +}; + +struct pvrdma_cmd_create_srq { + struct pvrdma_cmd_hdr hdr; + u64 pdir_dma; + u32 pd_handle; + u32 nchunks; + struct pvrdma_srq_attr attrs; + u8 srq_type; + u8 reserved[7]; +}; + +struct pvrdma_cmd_create_srq_resp { + struct pvrdma_cmd_resp_hdr hdr; + u32 srqn; + u8 reserved[4]; +}; + +struct pvrdma_cmd_modify_srq { + struct pvrdma_cmd_hdr hdr; + u32 srq_handle; + u32 attr_mask; + struct pvrdma_srq_attr attrs; +}; + +struct pvrdma_cmd_query_srq { + struct pvrdma_cmd_hdr hdr; + u32 srq_handle; + u8 reserved[4]; +}; + +struct pvrdma_cmd_query_srq_resp { + struct pvrdma_cmd_resp_hdr hdr; + struct pvrdma_srq_attr attrs; +}; + +struct pvrdma_cmd_destroy_srq { + struct pvrdma_cmd_hdr hdr; + u32 srq_handle; + u8 reserved[4]; +}; + +struct pvrdma_cmd_create_qp { + struct pvrdma_cmd_hdr hdr; + u64 pdir_dma; + u32 pd_handle; + u32 send_cq_handle; + u32 recv_cq_handle; + u32 srq_handle; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 max_inline_data; + u32 lkey; + u32 access_flags; + u16 total_chunks; + u16 send_chunks; + u16 max_atomic_arg; + u8 sq_sig_all; + u8 qp_type; + u8 is_srq; + u8 reserved[3]; +}; + +struct pvrdma_cmd_create_qp_resp { + struct pvrdma_cmd_resp_hdr hdr; + u32 qpn; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 max_inline_data; +}; + +struct pvrdma_cmd_modify_qp { + struct pvrdma_cmd_hdr hdr; + u32 qp_handle; + u32 attr_mask; + struct pvrdma_qp_attr attrs; +}; + +struct pvrdma_cmd_query_qp { + struct pvrdma_cmd_hdr hdr; + u32 qp_handle; + u32 attr_mask; +}; + +struct pvrdma_cmd_query_qp_resp { + struct pvrdma_cmd_resp_hdr hdr; + struct pvrdma_qp_attr attrs; +}; + +struct pvrdma_cmd_destroy_qp { + struct pvrdma_cmd_hdr hdr; + u32 qp_handle; + u8 reserved[4]; +}; + +struct pvrdma_cmd_destroy_qp_resp { + struct pvrdma_cmd_resp_hdr hdr; + u32 events_reported; + u8 reserved[4]; +}; + +struct pvrdma_cmd_create_bind { + struct pvrdma_cmd_hdr hdr; + u32 mtu; + u32 vlan; + u32 index; + u8 new_gid[16]; + u8 gid_type; + u8 reserved[3]; +}; + +struct pvrdma_cmd_destroy_bind { + struct pvrdma_cmd_hdr hdr; + u32 index; + u8 dest_gid[16]; + u8 reserved[4]; +}; + +union pvrdma_cmd_req { + struct pvrdma_cmd_hdr hdr; + struct pvrdma_cmd_query_port query_port; + struct pvrdma_cmd_query_pkey query_pkey; + struct pvrdma_cmd_create_uc create_uc; + struct pvrdma_cmd_destroy_uc destroy_uc; + struct pvrdma_cmd_create_pd create_pd; + struct pvrdma_cmd_destroy_pd destroy_pd; + struct pvrdma_cmd_create_mr create_mr; + struct pvrdma_cmd_destroy_mr destroy_mr; + struct pvrdma_cmd_create_cq create_cq; + struct pvrdma_cmd_resize_cq resize_cq; + struct pvrdma_cmd_destroy_cq destroy_cq; + struct pvrdma_cmd_create_qp create_qp; + struct pvrdma_cmd_modify_qp modify_qp; + struct pvrdma_cmd_query_qp query_qp; + struct pvrdma_cmd_destroy_qp destroy_qp; + struct pvrdma_cmd_create_bind create_bind; + struct pvrdma_cmd_destroy_bind destroy_bind; + struct pvrdma_cmd_create_srq create_srq; + struct pvrdma_cmd_modify_srq modify_srq; + struct pvrdma_cmd_query_srq query_srq; + struct pvrdma_cmd_destroy_srq destroy_srq; +}; + +union pvrdma_cmd_resp { + struct pvrdma_cmd_resp_hdr hdr; + struct pvrdma_cmd_query_port_resp query_port_resp; + struct pvrdma_cmd_query_pkey_resp query_pkey_resp; + struct pvrdma_cmd_create_uc_resp create_uc_resp; + struct 
pvrdma_cmd_create_pd_resp create_pd_resp; + struct pvrdma_cmd_create_mr_resp create_mr_resp; + struct pvrdma_cmd_create_cq_resp create_cq_resp; + struct pvrdma_cmd_resize_cq_resp resize_cq_resp; + struct pvrdma_cmd_create_qp_resp create_qp_resp; + struct pvrdma_cmd_query_qp_resp query_qp_resp; + struct pvrdma_cmd_destroy_qp_resp destroy_qp_resp; + struct pvrdma_cmd_create_srq_resp create_srq_resp; + struct pvrdma_cmd_query_srq_resp query_srq_resp; +}; + +#endif /* __PVRDMA_DEV_API_H__ */ diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_doorbell.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_doorbell.c new file mode 100644 index 0000000..bf51357 --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_doorbell.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +#include "pvrdma.h" + +int pvrdma_uar_table_init(struct pvrdma_dev *dev) +{ + u32 num = dev->dsr->caps.max_uar; + u32 mask = num - 1; + struct pvrdma_id_table *tbl = &dev->uar_table.tbl; + + if (!is_power_of_2(num)) + return -EINVAL; + + tbl->last = 0; + tbl->top = 0; + tbl->max = num; + tbl->mask = mask; + spin_lock_init(&tbl->lock); + tbl->table = kcalloc(BITS_TO_LONGS(num), sizeof(long), GFP_KERNEL); + if (!tbl->table) + return -ENOMEM; + + /* 0th UAR is taken by the device. 
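The response opcodes in the two enums above mirror the request opcodes one-for-one but with bit 31 set (PVRDMA_CMD_FIRST_RESP = 1 << 31), and pvrdma_cmd_recv() compares the device's ack against exactly that value. A standalone check, not part of the patch, for the create-CQ pair that pvrdma_create_cq() uses:

#include <assert.h>
#include <stdio.h>

/* Positions taken from the two enums in pvrdma_dev_api.h: CREATE_CQ is the
 * seventh request opcode, and its response is the same index plus bit 31. */
#define PVRDMA_CMD_CREATE_CQ		6u
#define PVRDMA_CMD_FIRST_RESP		(1u << 31)
#define PVRDMA_CMD_CREATE_CQ_RESP	(PVRDMA_CMD_FIRST_RESP + PVRDMA_CMD_CREATE_CQ)

int main(void)
{
	/* This is the resp_code pvrdma_create_cq() passes to pvrdma_cmd_post()
	 * and the ack value pvrdma_cmd_recv() expects back from the device. */
	assert(PVRDMA_CMD_CREATE_CQ_RESP == 0x80000006u);
	printf("create_cq ack = %#x\n", PVRDMA_CMD_CREATE_CQ_RESP);
	return 0;
}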
*/ + set_bit(0, tbl->table); + + return 0; +} + +void pvrdma_uar_table_cleanup(struct pvrdma_dev *dev) +{ + struct pvrdma_id_table *tbl = &dev->uar_table.tbl; + + kfree(tbl->table); +} + +int pvrdma_uar_alloc(struct pvrdma_dev *dev, struct pvrdma_uar_map *uar) +{ + struct pvrdma_id_table *tbl; + unsigned long flags; + u32 obj; + + tbl = &dev->uar_table.tbl; + + spin_lock_irqsave(&tbl->lock, flags); + obj = find_next_zero_bit(tbl->table, tbl->max, tbl->last); + if (obj >= tbl->max) { + tbl->top = (tbl->top + tbl->max) & tbl->mask; + obj = find_first_zero_bit(tbl->table, tbl->max); + } + + if (obj >= tbl->max) { + spin_unlock_irqrestore(&tbl->lock, flags); + return -ENOMEM; + } + + set_bit(obj, tbl->table); + obj |= tbl->top; + + spin_unlock_irqrestore(&tbl->lock, flags); + + uar->index = obj; + uar->pfn = (pci_resource_start(dev->pdev, PVRDMA_PCI_RESOURCE_UAR) >> + PAGE_SHIFT) + uar->index; + + return 0; +} + +void pvrdma_uar_free(struct pvrdma_dev *dev, struct pvrdma_uar_map *uar) +{ + struct pvrdma_id_table *tbl = &dev->uar_table.tbl; + unsigned long flags; + u32 obj; + + obj = uar->index & (tbl->max - 1); + spin_lock_irqsave(&tbl->lock, flags); + clear_bit(obj, tbl->table); + tbl->last = min(tbl->last, obj); + tbl->top = (tbl->top + tbl->max) & tbl->mask; + spin_unlock_irqrestore(&tbl->lock, flags); +} diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c new file mode 100644 index 0000000..0be33a8 --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -0,0 +1,1156 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
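pvrdma_uar_alloc() hands each user context one page of the BAR2 doorbell region: the returned page frame is the BAR's starting PFN plus the allocated index, with slot 0 reserved in pvrdma_uar_table_init(). A standalone illustration of that arithmetic, not part of the patch, using a made-up BAR2 base address and index:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* Hypothetical values: BAR2 base as reported by the PCI core and a
	 * UAR index returned by the bitmap allocator. */
	uint64_t bar2_start = 0xfd400000ULL;
	uint32_t index = 5;

	/* Same arithmetic as pvrdma_uar_alloc(): one 4 KiB UAR page per
	 * context, starting at the BAR2 base. */
	uint64_t pfn = (bar2_start >> PAGE_SHIFT) + index;

	assert((pfn << PAGE_SHIFT) == bar2_start + index * 4096);
	printf("uar %u -> phys %#llx\n", index,
	       (unsigned long long)(pfn << PAGE_SHIFT));
	return 0;
}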
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pvrdma.h" + +#define DRV_NAME "vmw_pvrdma" +#define DRV_VERSION "1.0.1.0-k" + +static DEFINE_MUTEX(pvrdma_device_list_lock); +static LIST_HEAD(pvrdma_device_list); +static struct workqueue_struct *event_wq; + +static int pvrdma_add_gid(const union ib_gid *gid, + const struct ib_gid_attr *attr, + void **context); +static int pvrdma_del_gid(const struct ib_gid_attr *attr, void **context); + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "VMW_PVRDMA-%s\n", DRV_VERSION); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", PVRDMA_REV_ID); +} + +static ssize_t show_board(struct device *device, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", PVRDMA_BOARD_ID); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *pvrdma_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +static void pvrdma_get_fw_ver_str(struct ib_device *device, char *str) +{ + struct pvrdma_dev *dev = + container_of(device, struct pvrdma_dev, ib_dev); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d\n", + (int) (dev->dsr->caps.fw_ver >> 32), + (int) (dev->dsr->caps.fw_ver >> 16) & 0xffff, + (int) dev->dsr->caps.fw_ver & 0xffff); +} + +static int pvrdma_init_device(struct pvrdma_dev *dev) +{ + /* Initialize some device related stuff */ + spin_lock_init(&dev->cmd_lock); + sema_init(&dev->cmd_sema, 1); + atomic_set(&dev->num_qps, 0); + atomic_set(&dev->num_srqs, 0); + atomic_set(&dev->num_cqs, 0); + atomic_set(&dev->num_pds, 0); + atomic_set(&dev->num_ahs, 0); + + return 0; +} + +static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct pvrdma_dev *dev = to_vdev(ibdev); + struct ib_port_attr attr; + int err; + + if (dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V1) + immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE; + else if (dev->dsr->caps.gid_types == PVRDMA_GID_TYPE_FLAG_ROCE_V2) + immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + return 0; +} + +static struct net_device *pvrdma_get_netdev(struct ib_device *ibdev, + u8 port_num) +{ + struct net_device *netdev; + struct pvrdma_dev *dev = to_vdev(ibdev); + + if (port_num != 1) + return NULL; + + rcu_read_lock(); + netdev = dev->netdev; + if (netdev) + dev_hold(netdev); + rcu_read_unlock(); + + return netdev; +} + +static int pvrdma_register_device(struct pvrdma_dev 
*dev) +{ + int ret = -1; + int i = 0; + + strlcpy(dev->ib_dev.name, "vmw_pvrdma%d", IB_DEVICE_NAME_MAX); + dev->ib_dev.node_guid = dev->dsr->caps.node_guid; + dev->sys_image_guid = dev->dsr->caps.sys_image_guid; + dev->flags = 0; + dev->ib_dev.owner = THIS_MODULE; + dev->ib_dev.num_comp_vectors = 1; + dev->ib_dev.dev.parent = &dev->pdev->dev; + dev->ib_dev.uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION; + dev->ib_dev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH); + + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.phys_port_cnt = dev->dsr->caps.phys_port_cnt; + + dev->ib_dev.query_device = pvrdma_query_device; + dev->ib_dev.query_port = pvrdma_query_port; + dev->ib_dev.query_gid = pvrdma_query_gid; + dev->ib_dev.query_pkey = pvrdma_query_pkey; + dev->ib_dev.modify_port = pvrdma_modify_port; + dev->ib_dev.alloc_ucontext = pvrdma_alloc_ucontext; + dev->ib_dev.dealloc_ucontext = pvrdma_dealloc_ucontext; + dev->ib_dev.mmap = pvrdma_mmap; + dev->ib_dev.alloc_pd = pvrdma_alloc_pd; + dev->ib_dev.dealloc_pd = pvrdma_dealloc_pd; + dev->ib_dev.create_ah = pvrdma_create_ah; + dev->ib_dev.destroy_ah = pvrdma_destroy_ah; + dev->ib_dev.create_qp = pvrdma_create_qp; + dev->ib_dev.modify_qp = pvrdma_modify_qp; + dev->ib_dev.query_qp = pvrdma_query_qp; + dev->ib_dev.destroy_qp = pvrdma_destroy_qp; + dev->ib_dev.post_send = pvrdma_post_send; + dev->ib_dev.post_recv = pvrdma_post_recv; + dev->ib_dev.create_cq = pvrdma_create_cq; + dev->ib_dev.modify_cq = pvrdma_modify_cq; + dev->ib_dev.resize_cq = pvrdma_resize_cq; + dev->ib_dev.destroy_cq = pvrdma_destroy_cq; + dev->ib_dev.poll_cq = pvrdma_poll_cq; + dev->ib_dev.req_notify_cq = pvrdma_req_notify_cq; + dev->ib_dev.get_dma_mr = pvrdma_get_dma_mr; + dev->ib_dev.reg_user_mr = pvrdma_reg_user_mr; + dev->ib_dev.dereg_mr = pvrdma_dereg_mr; + dev->ib_dev.alloc_mr = pvrdma_alloc_mr; + dev->ib_dev.map_mr_sg = pvrdma_map_mr_sg; + dev->ib_dev.add_gid = pvrdma_add_gid; + dev->ib_dev.del_gid = pvrdma_del_gid; + dev->ib_dev.get_netdev = pvrdma_get_netdev; + dev->ib_dev.get_port_immutable = pvrdma_port_immutable; + dev->ib_dev.get_link_layer = pvrdma_port_link_layer; + dev->ib_dev.get_dev_fw_str = pvrdma_get_fw_ver_str; + + mutex_init(&dev->port_mutex); + spin_lock_init(&dev->desc_lock); + + dev->cq_tbl = kcalloc(dev->dsr->caps.max_cq, sizeof(struct pvrdma_cq *), + GFP_KERNEL); + if (!dev->cq_tbl) + return ret; + spin_lock_init(&dev->cq_tbl_lock); + + dev->qp_tbl = kcalloc(dev->dsr->caps.max_qp, sizeof(struct pvrdma_qp *), + GFP_KERNEL); + if (!dev->qp_tbl) + goto err_cq_free; + spin_lock_init(&dev->qp_tbl_lock); + + /* Check if SRQ is supported by backend */ + if (dev->dsr->caps.max_srq) { + dev->ib_dev.uverbs_cmd_mask |= + (1ull << 
IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + + dev->ib_dev.create_srq = pvrdma_create_srq; + dev->ib_dev.modify_srq = pvrdma_modify_srq; + dev->ib_dev.query_srq = pvrdma_query_srq; + dev->ib_dev.destroy_srq = pvrdma_destroy_srq; + dev->ib_dev.post_srq_recv = pvrdma_post_srq_recv; + + dev->srq_tbl = kcalloc(dev->dsr->caps.max_srq, + sizeof(struct pvrdma_srq *), + GFP_KERNEL); + if (!dev->srq_tbl) + goto err_qp_free; + } + dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA; + spin_lock_init(&dev->srq_tbl_lock); + + ret = ib_register_device(&dev->ib_dev, NULL); + if (ret) + goto err_srq_free; + + for (i = 0; i < ARRAY_SIZE(pvrdma_class_attributes); ++i) { + ret = device_create_file(&dev->ib_dev.dev, + pvrdma_class_attributes[i]); + if (ret) + goto err_class; + } + + dev->ib_active = true; + + return 0; + +err_class: + ib_unregister_device(&dev->ib_dev); +err_srq_free: + kfree(dev->srq_tbl); +err_qp_free: + kfree(dev->qp_tbl); +err_cq_free: + kfree(dev->cq_tbl); + + return ret; +} + +static irqreturn_t pvrdma_intr0_handler(int irq, void *dev_id) +{ + u32 icr = PVRDMA_INTR_CAUSE_RESPONSE; + struct pvrdma_dev *dev = dev_id; + + dev_dbg(&dev->pdev->dev, "interrupt 0 (response) handler\n"); + + if (!dev->pdev->msix_enabled) { + /* Legacy intr */ + icr = pvrdma_read_reg(dev, PVRDMA_REG_ICR); + if (icr == 0) + return IRQ_NONE; + } + + if (icr == PVRDMA_INTR_CAUSE_RESPONSE) + complete(&dev->cmd_done); + + return IRQ_HANDLED; +} + +static void pvrdma_qp_event(struct pvrdma_dev *dev, u32 qpn, int type) +{ + struct pvrdma_qp *qp; + unsigned long flags; + + spin_lock_irqsave(&dev->qp_tbl_lock, flags); + qp = dev->qp_tbl[qpn % dev->dsr->caps.max_qp]; + if (qp) + refcount_inc(&qp->refcnt); + spin_unlock_irqrestore(&dev->qp_tbl_lock, flags); + + if (qp && qp->ibqp.event_handler) { + struct ib_qp *ibqp = &qp->ibqp; + struct ib_event e; + + e.device = ibqp->device; + e.element.qp = ibqp; + e.event = type; /* 1:1 mapping for now. */ + ibqp->event_handler(&e, ibqp->qp_context); + } + if (qp) { + if (refcount_dec_and_test(&qp->refcnt)) + complete(&qp->free); + } +} + +static void pvrdma_cq_event(struct pvrdma_dev *dev, u32 cqn, int type) +{ + struct pvrdma_cq *cq; + unsigned long flags; + + spin_lock_irqsave(&dev->cq_tbl_lock, flags); + cq = dev->cq_tbl[cqn % dev->dsr->caps.max_cq]; + if (cq) + refcount_inc(&cq->refcnt); + spin_unlock_irqrestore(&dev->cq_tbl_lock, flags); + + if (cq && cq->ibcq.event_handler) { + struct ib_cq *ibcq = &cq->ibcq; + struct ib_event e; + + e.device = ibcq->device; + e.element.cq = ibcq; + e.event = type; /* 1:1 mapping for now. */ + ibcq->event_handler(&e, ibcq->cq_context); + } + if (cq) { + if (refcount_dec_and_test(&cq->refcnt)) + complete(&cq->free); + } +} + +static void pvrdma_srq_event(struct pvrdma_dev *dev, u32 srqn, int type) +{ + struct pvrdma_srq *srq; + unsigned long flags; + + spin_lock_irqsave(&dev->srq_tbl_lock, flags); + if (dev->srq_tbl) + srq = dev->srq_tbl[srqn % dev->dsr->caps.max_srq]; + else + srq = NULL; + if (srq) + refcount_inc(&srq->refcnt); + spin_unlock_irqrestore(&dev->srq_tbl_lock, flags); + + if (srq && srq->ibsrq.event_handler) { + struct ib_srq *ibsrq = &srq->ibsrq; + struct ib_event e; + + e.device = ibsrq->device; + e.element.srq = ibsrq; + e.event = type; /* 1:1 mapping for now. 
*/ + ibsrq->event_handler(&e, ibsrq->srq_context); + } + if (srq) { + if (refcount_dec_and_test(&srq->refcnt)) + complete(&srq->free); + } +} + +static void pvrdma_dispatch_event(struct pvrdma_dev *dev, int port, + enum ib_event_type event) +{ + struct ib_event ib_event; + + memset(&ib_event, 0, sizeof(ib_event)); + ib_event.device = &dev->ib_dev; + ib_event.element.port_num = port; + ib_event.event = event; + ib_dispatch_event(&ib_event); +} + +static void pvrdma_dev_event(struct pvrdma_dev *dev, u8 port, int type) +{ + if (port < 1 || port > dev->dsr->caps.phys_port_cnt) { + dev_warn(&dev->pdev->dev, "event on port %d\n", port); + return; + } + + pvrdma_dispatch_event(dev, port, type); +} + +static inline struct pvrdma_eqe *get_eqe(struct pvrdma_dev *dev, unsigned int i) +{ + return (struct pvrdma_eqe *)pvrdma_page_dir_get_ptr( + &dev->async_pdir, + PAGE_SIZE + + sizeof(struct pvrdma_eqe) * i); +} + +static irqreturn_t pvrdma_intr1_handler(int irq, void *dev_id) +{ + struct pvrdma_dev *dev = dev_id; + struct pvrdma_ring *ring = &dev->async_ring_state->rx; + int ring_slots = (dev->dsr->async_ring_pages.num_pages - 1) * + PAGE_SIZE / sizeof(struct pvrdma_eqe); + unsigned int head; + + dev_dbg(&dev->pdev->dev, "interrupt 1 (async event) handler\n"); + + /* + * Don't process events until the IB device is registered. Otherwise + * we'll try to ib_dispatch_event() on an invalid device. + */ + if (!dev->ib_active) + return IRQ_HANDLED; + + while (pvrdma_idx_ring_has_data(ring, ring_slots, &head) > 0) { + struct pvrdma_eqe *eqe; + + eqe = get_eqe(dev, head); + + switch (eqe->type) { + case PVRDMA_EVENT_QP_FATAL: + case PVRDMA_EVENT_QP_REQ_ERR: + case PVRDMA_EVENT_QP_ACCESS_ERR: + case PVRDMA_EVENT_COMM_EST: + case PVRDMA_EVENT_SQ_DRAINED: + case PVRDMA_EVENT_PATH_MIG: + case PVRDMA_EVENT_PATH_MIG_ERR: + case PVRDMA_EVENT_QP_LAST_WQE_REACHED: + pvrdma_qp_event(dev, eqe->info, eqe->type); + break; + + case PVRDMA_EVENT_CQ_ERR: + pvrdma_cq_event(dev, eqe->info, eqe->type); + break; + + case PVRDMA_EVENT_SRQ_ERR: + case PVRDMA_EVENT_SRQ_LIMIT_REACHED: + pvrdma_srq_event(dev, eqe->info, eqe->type); + break; + + case PVRDMA_EVENT_PORT_ACTIVE: + case PVRDMA_EVENT_PORT_ERR: + case PVRDMA_EVENT_LID_CHANGE: + case PVRDMA_EVENT_PKEY_CHANGE: + case PVRDMA_EVENT_SM_CHANGE: + case PVRDMA_EVENT_CLIENT_REREGISTER: + case PVRDMA_EVENT_GID_CHANGE: + pvrdma_dev_event(dev, eqe->info, eqe->type); + break; + + case PVRDMA_EVENT_DEVICE_FATAL: + pvrdma_dev_event(dev, 1, eqe->type); + break; + + default: + break; + } + + pvrdma_idx_ring_inc(&ring->cons_head, ring_slots); + } + + return IRQ_HANDLED; +} + +static inline struct pvrdma_cqne *get_cqne(struct pvrdma_dev *dev, + unsigned int i) +{ + return (struct pvrdma_cqne *)pvrdma_page_dir_get_ptr( + &dev->cq_pdir, + PAGE_SIZE + + sizeof(struct pvrdma_cqne) * i); +} + +static irqreturn_t pvrdma_intrx_handler(int irq, void *dev_id) +{ + struct pvrdma_dev *dev = dev_id; + struct pvrdma_ring *ring = &dev->cq_ring_state->rx; + int ring_slots = (dev->dsr->cq_ring_pages.num_pages - 1) * PAGE_SIZE / + sizeof(struct pvrdma_cqne); + unsigned int head; + unsigned long flags; + + dev_dbg(&dev->pdev->dev, "interrupt x (completion) handler\n"); + + while (pvrdma_idx_ring_has_data(ring, ring_slots, &head) > 0) { + struct pvrdma_cqne *cqne; + struct pvrdma_cq *cq; + + cqne = get_cqne(dev, head); + spin_lock_irqsave(&dev->cq_tbl_lock, flags); + cq = dev->cq_tbl[cqne->info % dev->dsr->caps.max_cq]; + if (cq) + refcount_inc(&cq->refcnt); + spin_unlock_irqrestore(&dev->cq_tbl_lock, flags); + 
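		/*
		 * Lifetime note on this step: the CQ pointer was looked up
		 * and its refcount taken while cq_tbl_lock was held above,
		 * so the object cannot go away while the completion handler
		 * runs below.  The matching refcount_dec_and_test() signals
		 * cq->free, which the CQ destroy path is expected to wait on
		 * before releasing the CQ, mirroring the QP and SRQ event
		 * handlers earlier in this file.
		 */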
+ if (cq && cq->ibcq.comp_handler) + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + if (cq) { + if (refcount_dec_and_test(&cq->refcnt)) + complete(&cq->free); + } + pvrdma_idx_ring_inc(&ring->cons_head, ring_slots); + } + + return IRQ_HANDLED; +} + +static void pvrdma_free_irq(struct pvrdma_dev *dev) +{ + int i; + + dev_dbg(&dev->pdev->dev, "freeing interrupts\n"); + for (i = 0; i < dev->nr_vectors; i++) + free_irq(pci_irq_vector(dev->pdev, i), dev); +} + +static void pvrdma_enable_intrs(struct pvrdma_dev *dev) +{ + dev_dbg(&dev->pdev->dev, "enable interrupts\n"); + pvrdma_write_reg(dev, PVRDMA_REG_IMR, 0); +} + +static void pvrdma_disable_intrs(struct pvrdma_dev *dev) +{ + dev_dbg(&dev->pdev->dev, "disable interrupts\n"); + pvrdma_write_reg(dev, PVRDMA_REG_IMR, ~0); +} + +static int pvrdma_alloc_intrs(struct pvrdma_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + int ret = 0, i; + + ret = pci_alloc_irq_vectors(pdev, 1, PVRDMA_MAX_INTERRUPTS, + PCI_IRQ_MSIX); + if (ret < 0) { + ret = pci_alloc_irq_vectors(pdev, 1, 1, + PCI_IRQ_MSI | PCI_IRQ_LEGACY); + if (ret < 0) + return ret; + } + dev->nr_vectors = ret; + + ret = request_irq(pci_irq_vector(dev->pdev, 0), pvrdma_intr0_handler, + pdev->msix_enabled ? 0 : IRQF_SHARED, DRV_NAME, dev); + if (ret) { + dev_err(&dev->pdev->dev, + "failed to request interrupt 0\n"); + goto out_free_vectors; + } + + for (i = 1; i < dev->nr_vectors; i++) { + ret = request_irq(pci_irq_vector(dev->pdev, i), + i == 1 ? pvrdma_intr1_handler : + pvrdma_intrx_handler, + 0, DRV_NAME, dev); + if (ret) { + dev_err(&dev->pdev->dev, + "failed to request interrupt %d\n", i); + goto free_irqs; + } + } + + return 0; + +free_irqs: + while (--i >= 0) + free_irq(pci_irq_vector(dev->pdev, i), dev); +out_free_vectors: + pci_free_irq_vectors(pdev); + return ret; +} + +static void pvrdma_free_slots(struct pvrdma_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + + if (dev->resp_slot) + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->resp_slot, + dev->dsr->resp_slot_dma); + if (dev->cmd_slot) + dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->cmd_slot, + dev->dsr->cmd_slot_dma); +} + +static int pvrdma_add_gid_at_index(struct pvrdma_dev *dev, + const union ib_gid *gid, + u8 gid_type, + int index) +{ + int ret; + union pvrdma_cmd_req req; + struct pvrdma_cmd_create_bind *cmd_bind = &req.create_bind; + + if (!dev->sgid_tbl) { + dev_warn(&dev->pdev->dev, "sgid table not initialized\n"); + return -EINVAL; + } + + memset(cmd_bind, 0, sizeof(*cmd_bind)); + cmd_bind->hdr.cmd = PVRDMA_CMD_CREATE_BIND; + memcpy(cmd_bind->new_gid, gid->raw, 16); + cmd_bind->mtu = ib_mtu_enum_to_int(IB_MTU_1024); + cmd_bind->vlan = 0xfff; + cmd_bind->index = index; + cmd_bind->gid_type = gid_type; + + ret = pvrdma_cmd_post(dev, &req, NULL, 0); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not create binding, error: %d\n", ret); + return -EFAULT; + } + memcpy(&dev->sgid_tbl[index], gid, sizeof(*gid)); + return 0; +} + +static int pvrdma_add_gid(const union ib_gid *gid, + const struct ib_gid_attr *attr, + void **context) +{ + struct pvrdma_dev *dev = to_vdev(attr->device); + + return pvrdma_add_gid_at_index(dev, gid, + ib_gid_type_to_pvrdma(attr->gid_type), + attr->index); +} + +static int pvrdma_del_gid_at_index(struct pvrdma_dev *dev, int index) +{ + int ret; + union pvrdma_cmd_req req; + struct pvrdma_cmd_destroy_bind *cmd_dest = &req.destroy_bind; + + /* Update sgid table. 
*/ + if (!dev->sgid_tbl) { + dev_warn(&dev->pdev->dev, "sgid table not initialized\n"); + return -EINVAL; + } + + memset(cmd_dest, 0, sizeof(*cmd_dest)); + cmd_dest->hdr.cmd = PVRDMA_CMD_DESTROY_BIND; + memcpy(cmd_dest->dest_gid, &dev->sgid_tbl[index], 16); + cmd_dest->index = index; + + ret = pvrdma_cmd_post(dev, &req, NULL, 0); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not destroy binding, error: %d\n", ret); + return ret; + } + memset(&dev->sgid_tbl[index], 0, 16); + return 0; +} + +static int pvrdma_del_gid(const struct ib_gid_attr *attr, void **context) +{ + struct pvrdma_dev *dev = to_vdev(attr->device); + + dev_dbg(&dev->pdev->dev, "removing gid at index %u from %s", + attr->index, dev->netdev->name); + + return pvrdma_del_gid_at_index(dev, attr->index); +} + +static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev, + unsigned long event) +{ + switch (event) { + case NETDEV_REBOOT: + case NETDEV_DOWN: + pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ERR); + break; + case NETDEV_UP: + pvrdma_write_reg(dev, PVRDMA_REG_CTL, + PVRDMA_DEVICE_CTL_UNQUIESCE); + + mb(); + + if (pvrdma_read_reg(dev, PVRDMA_REG_ERR)) + dev_err(&dev->pdev->dev, + "failed to activate device during link up\n"); + else + pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE); + break; + default: + dev_dbg(&dev->pdev->dev, "ignore netdevice event %ld on %s\n", + event, dev->ib_dev.name); + break; + } +} + +static void pvrdma_netdevice_event_work(struct work_struct *work) +{ + struct pvrdma_netdevice_work *netdev_work; + struct pvrdma_dev *dev; + + netdev_work = container_of(work, struct pvrdma_netdevice_work, work); + + mutex_lock(&pvrdma_device_list_lock); + list_for_each_entry(dev, &pvrdma_device_list, device_link) { + if (dev->netdev == netdev_work->event_netdev) { + pvrdma_netdevice_event_handle(dev, netdev_work->event); + break; + } + } + mutex_unlock(&pvrdma_device_list_lock); + + kfree(netdev_work); +} + +static int pvrdma_netdevice_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *event_netdev = netdev_notifier_info_to_dev(ptr); + struct pvrdma_netdevice_work *netdev_work; + + netdev_work = kmalloc(sizeof(*netdev_work), GFP_ATOMIC); + if (!netdev_work) + return NOTIFY_BAD; + + INIT_WORK(&netdev_work->work, pvrdma_netdevice_event_work); + netdev_work->event_netdev = event_netdev; + netdev_work->event = event; + queue_work(event_wq, &netdev_work->work); + + return NOTIFY_DONE; +} + +static int pvrdma_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct pci_dev *pdev_net; + struct pvrdma_dev *dev; + int ret; + unsigned long start; + unsigned long len; + dma_addr_t slot_dma = 0; + + dev_dbg(&pdev->dev, "initializing driver %s\n", pci_name(pdev)); + + /* Allocate zero-out device */ + dev = (struct pvrdma_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) { + dev_err(&pdev->dev, "failed to allocate IB device\n"); + return -ENOMEM; + } + + mutex_lock(&pvrdma_device_list_lock); + list_add(&dev->device_link, &pvrdma_device_list); + mutex_unlock(&pvrdma_device_list_lock); + + ret = pvrdma_init_device(dev); + if (ret) + goto err_free_device; + + dev->pdev = pdev; + pci_set_drvdata(pdev, dev); + + ret = pci_enable_device(pdev); + if (ret) { + dev_err(&pdev->dev, "cannot enable PCI device\n"); + goto err_free_device; + } + + dev_dbg(&pdev->dev, "PCI resource flags BAR0 %#lx\n", + pci_resource_flags(pdev, 0)); + dev_dbg(&pdev->dev, "PCI resource len %#llx\n", + (unsigned long long)pci_resource_len(pdev, 0)); + dev_dbg(&pdev->dev, "PCI 
resource start %#llx\n", + (unsigned long long)pci_resource_start(pdev, 0)); + dev_dbg(&pdev->dev, "PCI resource flags BAR1 %#lx\n", + pci_resource_flags(pdev, 1)); + dev_dbg(&pdev->dev, "PCI resource len %#llx\n", + (unsigned long long)pci_resource_len(pdev, 1)); + dev_dbg(&pdev->dev, "PCI resource start %#llx\n", + (unsigned long long)pci_resource_start(pdev, 1)); + + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) || + !(pci_resource_flags(pdev, 1) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "PCI BAR region not MMIO\n"); + ret = -ENOMEM; + goto err_free_device; + } + + ret = pci_request_regions(pdev, DRV_NAME); + if (ret) { + dev_err(&pdev->dev, "cannot request PCI resources\n"); + goto err_disable_pdev; + } + + /* Enable 64-Bit DMA */ + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) { + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret != 0) { + dev_err(&pdev->dev, + "pci_set_consistent_dma_mask failed\n"); + goto err_free_resource; + } + } else { + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret != 0) { + dev_err(&pdev->dev, + "pci_set_dma_mask failed\n"); + goto err_free_resource; + } + } + + pci_set_master(pdev); + + /* Map register space */ + start = pci_resource_start(dev->pdev, PVRDMA_PCI_RESOURCE_REG); + len = pci_resource_len(dev->pdev, PVRDMA_PCI_RESOURCE_REG); + dev->regs = ioremap(start, len); + if (!dev->regs) { + dev_err(&pdev->dev, "register mapping failed\n"); + ret = -ENOMEM; + goto err_free_resource; + } + + /* Setup per-device UAR. */ + dev->driver_uar.index = 0; + dev->driver_uar.pfn = + pci_resource_start(dev->pdev, PVRDMA_PCI_RESOURCE_UAR) >> + PAGE_SHIFT; + dev->driver_uar.map = + ioremap(dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + if (!dev->driver_uar.map) { + dev_err(&pdev->dev, "failed to remap UAR pages\n"); + ret = -ENOMEM; + goto err_unmap_regs; + } + + dev->dsr_version = pvrdma_read_reg(dev, PVRDMA_REG_VERSION); + dev_info(&pdev->dev, "device version %d, driver version %d\n", + dev->dsr_version, PVRDMA_VERSION); + + dev->dsr = dma_zalloc_coherent(&pdev->dev, sizeof(*dev->dsr), + &dev->dsrbase, GFP_KERNEL); + if (!dev->dsr) { + dev_err(&pdev->dev, "failed to allocate shared region\n"); + ret = -ENOMEM; + goto err_uar_unmap; + } + + /* Setup the shared region */ + dev->dsr->driver_version = PVRDMA_VERSION; + dev->dsr->gos_info.gos_bits = sizeof(void *) == 4 ? + PVRDMA_GOS_BITS_32 : + PVRDMA_GOS_BITS_64; + dev->dsr->gos_info.gos_type = PVRDMA_GOS_TYPE_LINUX; + dev->dsr->gos_info.gos_ver = 1; + dev->dsr->uar_pfn = dev->driver_uar.pfn; + + /* Command slot. */ + dev->cmd_slot = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &slot_dma, GFP_KERNEL); + if (!dev->cmd_slot) { + ret = -ENOMEM; + goto err_free_dsr; + } + + dev->dsr->cmd_slot_dma = (u64)slot_dma; + + /* Response slot. 
*/ + dev->resp_slot = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &slot_dma, GFP_KERNEL); + if (!dev->resp_slot) { + ret = -ENOMEM; + goto err_free_slots; + } + + dev->dsr->resp_slot_dma = (u64)slot_dma; + + /* Async event ring */ + dev->dsr->async_ring_pages.num_pages = PVRDMA_NUM_RING_PAGES; + ret = pvrdma_page_dir_init(dev, &dev->async_pdir, + dev->dsr->async_ring_pages.num_pages, true); + if (ret) + goto err_free_slots; + dev->async_ring_state = dev->async_pdir.pages[0]; + dev->dsr->async_ring_pages.pdir_dma = dev->async_pdir.dir_dma; + + /* CQ notification ring */ + dev->dsr->cq_ring_pages.num_pages = PVRDMA_NUM_RING_PAGES; + ret = pvrdma_page_dir_init(dev, &dev->cq_pdir, + dev->dsr->cq_ring_pages.num_pages, true); + if (ret) + goto err_free_async_ring; + dev->cq_ring_state = dev->cq_pdir.pages[0]; + dev->dsr->cq_ring_pages.pdir_dma = dev->cq_pdir.dir_dma; + + /* + * Write the PA of the shared region to the device. The writes must be + * ordered such that the high bits are written last. When the writes + * complete, the device will have filled out the capabilities. + */ + + pvrdma_write_reg(dev, PVRDMA_REG_DSRLOW, (u32)dev->dsrbase); + pvrdma_write_reg(dev, PVRDMA_REG_DSRHIGH, + (u32)((u64)(dev->dsrbase) >> 32)); + + /* Make sure the write is complete before reading status. */ + mb(); + + /* The driver supports RoCE V1 and V2. */ + if (!PVRDMA_SUPPORTED(dev)) { + dev_err(&pdev->dev, "driver needs RoCE v1 or v2 support\n"); + ret = -EFAULT; + goto err_free_cq_ring; + } + + /* Paired vmxnet3 will have same bus, slot. But func will be 0 */ + pdev_net = pci_get_slot(pdev->bus, PCI_DEVFN(PCI_SLOT(pdev->devfn), 0)); + if (!pdev_net) { + dev_err(&pdev->dev, "failed to find paired net device\n"); + ret = -ENODEV; + goto err_free_cq_ring; + } + + if (pdev_net->vendor != PCI_VENDOR_ID_VMWARE || + pdev_net->device != PCI_DEVICE_ID_VMWARE_VMXNET3) { + dev_err(&pdev->dev, "failed to find paired vmxnet3 device\n"); + pci_dev_put(pdev_net); + ret = -ENODEV; + goto err_free_cq_ring; + } + + dev->netdev = pci_get_drvdata(pdev_net); + pci_dev_put(pdev_net); + if (!dev->netdev) { + dev_err(&pdev->dev, "failed to get vmxnet3 device\n"); + ret = -ENODEV; + goto err_free_cq_ring; + } + + dev_info(&pdev->dev, "paired device to %s\n", dev->netdev->name); + + /* Interrupt setup */ + ret = pvrdma_alloc_intrs(dev); + if (ret) { + dev_err(&pdev->dev, "failed to allocate interrupts\n"); + ret = -ENOMEM; + goto err_free_cq_ring; + } + + /* Allocate UAR table. */ + ret = pvrdma_uar_table_init(dev); + if (ret) { + dev_err(&pdev->dev, "failed to allocate UAR table\n"); + ret = -ENOMEM; + goto err_free_intrs; + } + + /* Allocate GID table */ + dev->sgid_tbl = kcalloc(dev->dsr->caps.gid_tbl_len, + sizeof(union ib_gid), GFP_KERNEL); + if (!dev->sgid_tbl) { + ret = -ENOMEM; + goto err_free_uar_table; + } + dev_dbg(&pdev->dev, "gid table len %d\n", dev->dsr->caps.gid_tbl_len); + + pvrdma_enable_intrs(dev); + + /* Activate pvrdma device */ + pvrdma_write_reg(dev, PVRDMA_REG_CTL, PVRDMA_DEVICE_CTL_ACTIVATE); + + /* Make sure the write is complete before reading status. 
*/ + mb(); + + /* Check if device was successfully activated */ + ret = pvrdma_read_reg(dev, PVRDMA_REG_ERR); + if (ret != 0) { + dev_err(&pdev->dev, "failed to activate device\n"); + ret = -EFAULT; + goto err_disable_intr; + } + + /* Register IB device */ + ret = pvrdma_register_device(dev); + if (ret) { + dev_err(&pdev->dev, "failed to register IB device\n"); + goto err_disable_intr; + } + + dev->nb_netdev.notifier_call = pvrdma_netdevice_event; + ret = register_netdevice_notifier(&dev->nb_netdev); + if (ret) { + dev_err(&pdev->dev, "failed to register netdevice events\n"); + goto err_unreg_ibdev; + } + + dev_info(&pdev->dev, "attached to device\n"); + return 0; + +err_unreg_ibdev: + ib_unregister_device(&dev->ib_dev); +err_disable_intr: + pvrdma_disable_intrs(dev); + kfree(dev->sgid_tbl); +err_free_uar_table: + pvrdma_uar_table_cleanup(dev); +err_free_intrs: + pvrdma_free_irq(dev); + pci_free_irq_vectors(pdev); +err_free_cq_ring: + pvrdma_page_dir_cleanup(dev, &dev->cq_pdir); +err_free_async_ring: + pvrdma_page_dir_cleanup(dev, &dev->async_pdir); +err_free_slots: + pvrdma_free_slots(dev); +err_free_dsr: + dma_free_coherent(&pdev->dev, sizeof(*dev->dsr), dev->dsr, + dev->dsrbase); +err_uar_unmap: + iounmap(dev->driver_uar.map); +err_unmap_regs: + iounmap(dev->regs); +err_free_resource: + pci_release_regions(pdev); +err_disable_pdev: + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); +err_free_device: + mutex_lock(&pvrdma_device_list_lock); + list_del(&dev->device_link); + mutex_unlock(&pvrdma_device_list_lock); + ib_dealloc_device(&dev->ib_dev); + return ret; +} + +static void pvrdma_pci_remove(struct pci_dev *pdev) +{ + struct pvrdma_dev *dev = pci_get_drvdata(pdev); + + if (!dev) + return; + + dev_info(&pdev->dev, "detaching from device\n"); + + unregister_netdevice_notifier(&dev->nb_netdev); + dev->nb_netdev.notifier_call = NULL; + + flush_workqueue(event_wq); + + /* Unregister ib device */ + ib_unregister_device(&dev->ib_dev); + + mutex_lock(&pvrdma_device_list_lock); + list_del(&dev->device_link); + mutex_unlock(&pvrdma_device_list_lock); + + pvrdma_disable_intrs(dev); + pvrdma_free_irq(dev); + pci_free_irq_vectors(pdev); + + /* Deactivate pvrdma device */ + pvrdma_write_reg(dev, PVRDMA_REG_CTL, PVRDMA_DEVICE_CTL_RESET); + pvrdma_page_dir_cleanup(dev, &dev->cq_pdir); + pvrdma_page_dir_cleanup(dev, &dev->async_pdir); + pvrdma_free_slots(dev); + + iounmap(dev->regs); + kfree(dev->sgid_tbl); + kfree(dev->cq_tbl); + kfree(dev->srq_tbl); + kfree(dev->qp_tbl); + pvrdma_uar_table_cleanup(dev); + iounmap(dev->driver_uar.map); + + ib_dealloc_device(&dev->ib_dev); + + /* Free pci resources */ + pci_release_regions(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); +} + +static const struct pci_device_id pvrdma_pci_table[] = { + { PCI_DEVICE(PCI_VENDOR_ID_VMWARE, PCI_DEVICE_ID_VMWARE_PVRDMA), }, + { 0 }, +}; + +MODULE_DEVICE_TABLE(pci, pvrdma_pci_table); + +static struct pci_driver pvrdma_driver = { + .name = DRV_NAME, + .id_table = pvrdma_pci_table, + .probe = pvrdma_pci_probe, + .remove = pvrdma_pci_remove, +}; + +static int __init pvrdma_init(void) +{ + int err; + + event_wq = alloc_ordered_workqueue("pvrdma_event_wq", WQ_MEM_RECLAIM); + if (!event_wq) + return -ENOMEM; + + err = pci_register_driver(&pvrdma_driver); + if (err) + destroy_workqueue(event_wq); + + return err; +} + +static void __exit pvrdma_cleanup(void) +{ + pci_unregister_driver(&pvrdma_driver); + + destroy_workqueue(event_wq); +} + +module_init(pvrdma_init); +module_exit(pvrdma_cleanup); + 
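/*
 * A minimal sketch (not taken from this patch) of the interrupt-vector
 * fallback that pvrdma_alloc_intrs() above relies on: request the full
 * MSI-X range first and, only if that fails, fall back to a single MSI
 * or legacy INTx vector.  The my_dev structure and MY_MAX_VECTORS limit
 * are hypothetical stand-ins for the driver's own equivalents.
 *
 *	static int my_alloc_vectors(struct my_dev *d, struct pci_dev *pdev)
 *	{
 *		int nvec;
 *
 *		nvec = pci_alloc_irq_vectors(pdev, 1, MY_MAX_VECTORS,
 *					     PCI_IRQ_MSIX);
 *		if (nvec < 0)
 *			nvec = pci_alloc_irq_vectors(pdev, 1, 1,
 *						     PCI_IRQ_MSI |
 *						     PCI_IRQ_LEGACY);
 *		if (nvec < 0)
 *			return nvec;
 *
 *		d->nr_vectors = nvec;
 *		return 0;
 *	}
 */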
+MODULE_AUTHOR("VMware, Inc"); +MODULE_DESCRIPTION("VMware Paravirtual RDMA driver"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c new file mode 100644 index 0000000..fb0c5c0 --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include + +#include "pvrdma.h" + +int pvrdma_page_dir_init(struct pvrdma_dev *dev, struct pvrdma_page_dir *pdir, + u64 npages, bool alloc_pages) +{ + u64 i; + + if (npages > PVRDMA_PAGE_DIR_MAX_PAGES) + return -EINVAL; + + memset(pdir, 0, sizeof(*pdir)); + + pdir->dir = dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, + &pdir->dir_dma, GFP_KERNEL); + if (!pdir->dir) + goto err; + + pdir->ntables = PVRDMA_PAGE_DIR_TABLE(npages - 1) + 1; + pdir->tables = kcalloc(pdir->ntables, sizeof(*pdir->tables), + GFP_KERNEL); + if (!pdir->tables) + goto err; + + for (i = 0; i < pdir->ntables; i++) { + pdir->tables[i] = dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, + (dma_addr_t *)&pdir->dir[i], + GFP_KERNEL); + if (!pdir->tables[i]) + goto err; + } + + pdir->npages = npages; + + if (alloc_pages) { + pdir->pages = kcalloc(npages, sizeof(*pdir->pages), + GFP_KERNEL); + if (!pdir->pages) + goto err; + + for (i = 0; i < pdir->npages; i++) { + dma_addr_t page_dma; + + pdir->pages[i] = dma_alloc_coherent(&dev->pdev->dev, + PAGE_SIZE, + &page_dma, + GFP_KERNEL); + if (!pdir->pages[i]) + goto err; + + pvrdma_page_dir_insert_dma(pdir, i, page_dma); + } + } + + return 0; + +err: + pvrdma_page_dir_cleanup(dev, pdir); + + return -ENOMEM; +} + +static u64 *pvrdma_page_dir_table(struct pvrdma_page_dir *pdir, u64 idx) +{ + return pdir->tables[PVRDMA_PAGE_DIR_TABLE(idx)]; +} + +dma_addr_t pvrdma_page_dir_get_dma(struct pvrdma_page_dir *pdir, u64 idx) +{ + return pvrdma_page_dir_table(pdir, idx)[PVRDMA_PAGE_DIR_PAGE(idx)]; +} + +static void pvrdma_page_dir_cleanup_pages(struct pvrdma_dev *dev, + struct pvrdma_page_dir *pdir) +{ + if (pdir->pages) { + u64 i; + + for (i = 0; i < pdir->npages && pdir->pages[i]; i++) { + dma_addr_t page_dma = pvrdma_page_dir_get_dma(pdir, i); + + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + pdir->pages[i], page_dma); + } + + kfree(pdir->pages); + } +} + +static void pvrdma_page_dir_cleanup_tables(struct pvrdma_dev *dev, + struct pvrdma_page_dir *pdir) +{ + if (pdir->tables) { + int i; + + pvrdma_page_dir_cleanup_pages(dev, pdir); + + for (i = 0; i < pdir->ntables; i++) { + u64 *table = pdir->tables[i]; + + if (table) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + table, pdir->dir[i]); + } + + kfree(pdir->tables); + } +} + +void pvrdma_page_dir_cleanup(struct pvrdma_dev *dev, + struct pvrdma_page_dir *pdir) +{ + if (pdir->dir) { + pvrdma_page_dir_cleanup_tables(dev, pdir); + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + pdir->dir, pdir->dir_dma); + } +} + +int pvrdma_page_dir_insert_dma(struct pvrdma_page_dir *pdir, u64 idx, + dma_addr_t daddr) +{ + u64 *table; + + if (idx >= pdir->npages) + return -EINVAL; + + table = pvrdma_page_dir_table(pdir, idx); + table[PVRDMA_PAGE_DIR_PAGE(idx)] = daddr; + + return 0; +} + +int pvrdma_page_dir_insert_umem(struct pvrdma_page_dir *pdir, + struct ib_umem *umem, u64 offset) +{ + u64 i = offset; + int j, entry; + int ret = 0, len = 0; + struct scatterlist *sg; + + if (offset >= pdir->npages) + return -EINVAL; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + len = sg_dma_len(sg) >> PAGE_SHIFT; + for (j = 0; j < len; j++) { + dma_addr_t addr = sg_dma_address(sg) + + (j << umem->page_shift); + + ret = pvrdma_page_dir_insert_dma(pdir, i, addr); + if (ret) + goto exit; + + i++; + } + } + +exit: + return ret; +} + +int pvrdma_page_dir_insert_page_list(struct pvrdma_page_dir *pdir, + u64 *page_list, + int num_pages) +{ + int i; + int ret; + + if (num_pages > pdir->npages) + return -EINVAL; + + for (i = 0; i < 
num_pages; i++) { + ret = pvrdma_page_dir_insert_dma(pdir, i, page_list[i]); + if (ret) + return ret; + } + + return 0; +} + +void pvrdma_qp_cap_to_ib(struct ib_qp_cap *dst, const struct pvrdma_qp_cap *src) +{ + dst->max_send_wr = src->max_send_wr; + dst->max_recv_wr = src->max_recv_wr; + dst->max_send_sge = src->max_send_sge; + dst->max_recv_sge = src->max_recv_sge; + dst->max_inline_data = src->max_inline_data; +} + +void ib_qp_cap_to_pvrdma(struct pvrdma_qp_cap *dst, const struct ib_qp_cap *src) +{ + dst->max_send_wr = src->max_send_wr; + dst->max_recv_wr = src->max_recv_wr; + dst->max_send_sge = src->max_send_sge; + dst->max_recv_sge = src->max_recv_sge; + dst->max_inline_data = src->max_inline_data; +} + +void pvrdma_gid_to_ib(union ib_gid *dst, const union pvrdma_gid *src) +{ + BUILD_BUG_ON(sizeof(union pvrdma_gid) != sizeof(union ib_gid)); + memcpy(dst, src, sizeof(*src)); +} + +void ib_gid_to_pvrdma(union pvrdma_gid *dst, const union ib_gid *src) +{ + BUILD_BUG_ON(sizeof(union pvrdma_gid) != sizeof(union ib_gid)); + memcpy(dst, src, sizeof(*src)); +} + +void pvrdma_global_route_to_ib(struct ib_global_route *dst, + const struct pvrdma_global_route *src) +{ + pvrdma_gid_to_ib(&dst->dgid, &src->dgid); + dst->flow_label = src->flow_label; + dst->sgid_index = src->sgid_index; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; +} + +void ib_global_route_to_pvrdma(struct pvrdma_global_route *dst, + const struct ib_global_route *src) +{ + ib_gid_to_pvrdma(&dst->dgid, &src->dgid); + dst->flow_label = src->flow_label; + dst->sgid_index = src->sgid_index; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; +} + +void pvrdma_ah_attr_to_rdma(struct rdma_ah_attr *dst, + const struct pvrdma_ah_attr *src) +{ + dst->type = RDMA_AH_ATTR_TYPE_ROCE; + pvrdma_global_route_to_ib(rdma_ah_retrieve_grh(dst), &src->grh); + rdma_ah_set_dlid(dst, src->dlid); + rdma_ah_set_sl(dst, src->sl); + rdma_ah_set_path_bits(dst, src->src_path_bits); + rdma_ah_set_static_rate(dst, src->static_rate); + rdma_ah_set_ah_flags(dst, src->ah_flags); + rdma_ah_set_port_num(dst, src->port_num); + memcpy(dst->roce.dmac, &src->dmac, ETH_ALEN); +} + +void rdma_ah_attr_to_pvrdma(struct pvrdma_ah_attr *dst, + const struct rdma_ah_attr *src) +{ + ib_global_route_to_pvrdma(&dst->grh, rdma_ah_read_grh(src)); + dst->dlid = rdma_ah_get_dlid(src); + dst->sl = rdma_ah_get_sl(src); + dst->src_path_bits = rdma_ah_get_path_bits(src); + dst->static_rate = rdma_ah_get_static_rate(src); + dst->ah_flags = rdma_ah_get_ah_flags(src); + dst->port_num = rdma_ah_get_port_num(src); + memcpy(&dst->dmac, src->roce.dmac, sizeof(dst->dmac)); +} + +u8 ib_gid_type_to_pvrdma(enum ib_gid_type gid_type) +{ + return (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? + PVRDMA_GID_TYPE_FLAG_ROCE_V2 : + PVRDMA_GID_TYPE_FLAG_ROCE_V1; +} diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c new file mode 100644 index 0000000..fa96fa4 --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. 
This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "pvrdma.h" + +/** + * pvrdma_get_dma_mr - get a DMA memory region + * @pd: protection domain + * @acc: access flags + * + * @return: ib_mr pointer on success, otherwise returns an errno. + */ +struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct pvrdma_dev *dev = to_vdev(pd->device); + struct pvrdma_user_mr *mr; + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_create_mr *cmd = &req.create_mr; + struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp; + int ret; + + /* Support only LOCAL_WRITE flag for DMA MRs */ + if (acc & ~IB_ACCESS_LOCAL_WRITE) { + dev_warn(&dev->pdev->dev, + "unsupported dma mr access flags %#x\n", acc); + return ERR_PTR(-EOPNOTSUPP); + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_CREATE_MR; + cmd->pd_handle = to_vpd(pd)->pd_handle; + cmd->access_flags = acc; + cmd->flags = PVRDMA_MR_FLAG_DMA; + + ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not get DMA mem region, error: %d\n", ret); + kfree(mr); + return ERR_PTR(ret); + } + + mr->mmr.mr_handle = resp->mr_handle; + mr->ibmr.lkey = resp->lkey; + mr->ibmr.rkey = resp->rkey; + + return &mr->ibmr; +} + +/** + * pvrdma_reg_user_mr - register a userspace memory region + * @pd: protection domain + * @start: starting address + * @length: length of region + * @virt_addr: I/O virtual address + * @access_flags: access flags for memory region + * @udata: user data + * + * @return: ib_mr pointer on success, otherwise returns an errno. 
+ */ +struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata) +{ + struct pvrdma_dev *dev = to_vdev(pd->device); + struct pvrdma_user_mr *mr = NULL; + struct ib_umem *umem; + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_create_mr *cmd = &req.create_mr; + struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp; + int ret; + + if (length == 0 || length > dev->dsr->caps.max_mr_size) { + dev_warn(&dev->pdev->dev, "invalid mem region length\n"); + return ERR_PTR(-EINVAL); + } + + umem = ib_umem_get(pd->uobject->context, start, + length, access_flags, 0); + if (IS_ERR(umem)) { + dev_warn(&dev->pdev->dev, + "could not get umem for mem region\n"); + return ERR_CAST(umem); + } + + if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) { + dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n", + umem->npages); + ret = -EINVAL; + goto err_umem; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + ret = -ENOMEM; + goto err_umem; + } + + mr->mmr.iova = virt_addr; + mr->mmr.size = length; + mr->umem = umem; + + ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false); + if (ret) { + dev_warn(&dev->pdev->dev, + "could not allocate page directory\n"); + goto err_umem; + } + + ret = pvrdma_page_dir_insert_umem(&mr->pdir, mr->umem, 0); + if (ret) + goto err_pdir; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_CREATE_MR; + cmd->start = start; + cmd->length = length; + cmd->pd_handle = to_vpd(pd)->pd_handle; + cmd->access_flags = access_flags; + cmd->nchunks = umem->npages; + cmd->pdir_dma = mr->pdir.dir_dma; + + ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not register mem region, error: %d\n", ret); + goto err_pdir; + } + + mr->mmr.mr_handle = resp->mr_handle; + mr->ibmr.lkey = resp->lkey; + mr->ibmr.rkey = resp->rkey; + + return &mr->ibmr; + +err_pdir: + pvrdma_page_dir_cleanup(dev, &mr->pdir); +err_umem: + ib_umem_release(umem); + kfree(mr); + + return ERR_PTR(ret); +} + +/** + * pvrdma_alloc_mr - allocate a memory region + * @pd: protection domain + * @mr_type: type of memory region + * @max_num_sg: maximum number of pages + * + * @return: ib_mr pointer on success, otherwise returns an errno. 
+ */ +struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct pvrdma_dev *dev = to_vdev(pd->device); + struct pvrdma_user_mr *mr; + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_create_mr *cmd = &req.create_mr; + struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp; + int size = max_num_sg * sizeof(u64); + int ret; + + if (mr_type != IB_MR_TYPE_MEM_REG || + max_num_sg > PVRDMA_MAX_FAST_REG_PAGES) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->pages = kzalloc(size, GFP_KERNEL); + if (!mr->pages) { + ret = -ENOMEM; + goto freemr; + } + + ret = pvrdma_page_dir_init(dev, &mr->pdir, max_num_sg, false); + if (ret) { + dev_warn(&dev->pdev->dev, + "failed to allocate page dir for mr\n"); + ret = -ENOMEM; + goto freepages; + } + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_CREATE_MR; + cmd->pd_handle = to_vpd(pd)->pd_handle; + cmd->access_flags = 0; + cmd->flags = PVRDMA_MR_FLAG_FRMR; + cmd->nchunks = max_num_sg; + + ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not create FR mem region, error: %d\n", ret); + goto freepdir; + } + + mr->max_pages = max_num_sg; + mr->mmr.mr_handle = resp->mr_handle; + mr->ibmr.lkey = resp->lkey; + mr->ibmr.rkey = resp->rkey; + mr->page_shift = PAGE_SHIFT; + mr->umem = NULL; + + return &mr->ibmr; + +freepdir: + pvrdma_page_dir_cleanup(dev, &mr->pdir); +freepages: + kfree(mr->pages); +freemr: + kfree(mr); + return ERR_PTR(ret); +} + +/** + * pvrdma_dereg_mr - deregister a memory region + * @ibmr: memory region + * + * @return: 0 on success. + */ +int pvrdma_dereg_mr(struct ib_mr *ibmr) +{ + struct pvrdma_user_mr *mr = to_vmr(ibmr); + struct pvrdma_dev *dev = to_vdev(ibmr->device); + union pvrdma_cmd_req req; + struct pvrdma_cmd_destroy_mr *cmd = &req.destroy_mr; + int ret; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_DESTROY_MR; + cmd->mr_handle = mr->mmr.mr_handle; + ret = pvrdma_cmd_post(dev, &req, NULL, 0); + if (ret < 0) + dev_warn(&dev->pdev->dev, + "could not deregister mem region, error: %d\n", ret); + + pvrdma_page_dir_cleanup(dev, &mr->pdir); + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr->pages); + kfree(mr); + + return 0; +} + +static int pvrdma_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct pvrdma_user_mr *mr = to_vmr(ibmr); + + if (mr->npages == mr->max_pages) + return -ENOMEM; + + mr->pages[mr->npages++] = addr; + return 0; +} + +int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct pvrdma_user_mr *mr = to_vmr(ibmr); + struct pvrdma_dev *dev = to_vdev(ibmr->device); + int ret; + + mr->npages = 0; + + ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, pvrdma_set_page); + if (ret < 0) + dev_warn(&dev->pdev->dev, "could not map sg to pages\n"); + + return ret; +} diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c new file mode 100644 index 0000000..eb5b106 --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -0,0 +1,1003 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. 
This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "pvrdma.h" + +static inline void get_cqs(struct pvrdma_qp *qp, struct pvrdma_cq **send_cq, + struct pvrdma_cq **recv_cq) +{ + *send_cq = to_vcq(qp->ibqp.send_cq); + *recv_cq = to_vcq(qp->ibqp.recv_cq); +} + +static void pvrdma_lock_cqs(struct pvrdma_cq *scq, struct pvrdma_cq *rcq, + unsigned long *scq_flags, + unsigned long *rcq_flags) + __acquires(scq->cq_lock) __acquires(rcq->cq_lock) +{ + if (scq == rcq) { + spin_lock_irqsave(&scq->cq_lock, *scq_flags); + __acquire(rcq->cq_lock); + } else if (scq->cq_handle < rcq->cq_handle) { + spin_lock_irqsave(&scq->cq_lock, *scq_flags); + spin_lock_irqsave_nested(&rcq->cq_lock, *rcq_flags, + SINGLE_DEPTH_NESTING); + } else { + spin_lock_irqsave(&rcq->cq_lock, *rcq_flags); + spin_lock_irqsave_nested(&scq->cq_lock, *scq_flags, + SINGLE_DEPTH_NESTING); + } +} + +static void pvrdma_unlock_cqs(struct pvrdma_cq *scq, struct pvrdma_cq *rcq, + unsigned long *scq_flags, + unsigned long *rcq_flags) + __releases(scq->cq_lock) __releases(rcq->cq_lock) +{ + if (scq == rcq) { + __release(rcq->cq_lock); + spin_unlock_irqrestore(&scq->cq_lock, *scq_flags); + } else if (scq->cq_handle < rcq->cq_handle) { + spin_unlock_irqrestore(&rcq->cq_lock, *rcq_flags); + spin_unlock_irqrestore(&scq->cq_lock, *scq_flags); + } else { + spin_unlock_irqrestore(&scq->cq_lock, *scq_flags); + spin_unlock_irqrestore(&rcq->cq_lock, *rcq_flags); + } +} + +static void pvrdma_reset_qp(struct pvrdma_qp *qp) +{ + struct pvrdma_cq *scq, *rcq; + unsigned long scq_flags, rcq_flags; + + /* Clean up cqes */ + get_cqs(qp, &scq, &rcq); + pvrdma_lock_cqs(scq, rcq, &scq_flags, &rcq_flags); + + _pvrdma_flush_cqe(qp, scq); + if (scq != rcq) + _pvrdma_flush_cqe(qp, rcq); + + pvrdma_unlock_cqs(scq, rcq, &scq_flags, &rcq_flags); + + /* + * Reset queuepair. The checks are because usermode queuepairs won't + * have kernel ringstates. 
+ */ + if (qp->rq.ring) { + atomic_set(&qp->rq.ring->cons_head, 0); + atomic_set(&qp->rq.ring->prod_tail, 0); + } + if (qp->sq.ring) { + atomic_set(&qp->sq.ring->cons_head, 0); + atomic_set(&qp->sq.ring->prod_tail, 0); + } +} + +static int pvrdma_set_rq_size(struct pvrdma_dev *dev, + struct ib_qp_cap *req_cap, + struct pvrdma_qp *qp) +{ + if (req_cap->max_recv_wr > dev->dsr->caps.max_qp_wr || + req_cap->max_recv_sge > dev->dsr->caps.max_sge) { + dev_warn(&dev->pdev->dev, "recv queue size invalid\n"); + return -EINVAL; + } + + qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, req_cap->max_recv_wr)); + qp->rq.max_sg = roundup_pow_of_two(max(1U, req_cap->max_recv_sge)); + + /* Write back */ + req_cap->max_recv_wr = qp->rq.wqe_cnt; + req_cap->max_recv_sge = qp->rq.max_sg; + + qp->rq.wqe_size = roundup_pow_of_two(sizeof(struct pvrdma_rq_wqe_hdr) + + sizeof(struct pvrdma_sge) * + qp->rq.max_sg); + qp->npages_recv = (qp->rq.wqe_cnt * qp->rq.wqe_size + PAGE_SIZE - 1) / + PAGE_SIZE; + + return 0; +} + +static int pvrdma_set_sq_size(struct pvrdma_dev *dev, struct ib_qp_cap *req_cap, + struct pvrdma_qp *qp) +{ + if (req_cap->max_send_wr > dev->dsr->caps.max_qp_wr || + req_cap->max_send_sge > dev->dsr->caps.max_sge) { + dev_warn(&dev->pdev->dev, "send queue size invalid\n"); + return -EINVAL; + } + + qp->sq.wqe_cnt = roundup_pow_of_two(max(1U, req_cap->max_send_wr)); + qp->sq.max_sg = roundup_pow_of_two(max(1U, req_cap->max_send_sge)); + + /* Write back */ + req_cap->max_send_wr = qp->sq.wqe_cnt; + req_cap->max_send_sge = qp->sq.max_sg; + + qp->sq.wqe_size = roundup_pow_of_two(sizeof(struct pvrdma_sq_wqe_hdr) + + sizeof(struct pvrdma_sge) * + qp->sq.max_sg); + /* Note: one extra page for the header. */ + qp->npages_send = PVRDMA_QP_NUM_HEADER_PAGES + + (qp->sq.wqe_cnt * qp->sq.wqe_size + PAGE_SIZE - 1) / + PAGE_SIZE; + + return 0; +} + +/** + * pvrdma_create_qp - create queue pair + * @pd: protection domain + * @init_attr: queue pair attributes + * @udata: user data + * + * @return: the ib_qp pointer on success, otherwise returns an errno. 
+ */ +struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct pvrdma_qp *qp = NULL; + struct pvrdma_dev *dev = to_vdev(pd->device); + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_create_qp *cmd = &req.create_qp; + struct pvrdma_cmd_create_qp_resp *resp = &rsp.create_qp_resp; + struct pvrdma_create_qp ucmd; + unsigned long flags; + int ret; + bool is_srq = !!init_attr->srq; + + if (init_attr->create_flags) { + dev_warn(&dev->pdev->dev, + "invalid create queuepair flags %#x\n", + init_attr->create_flags); + return ERR_PTR(-EINVAL); + } + + if (init_attr->qp_type != IB_QPT_RC && + init_attr->qp_type != IB_QPT_UD && + init_attr->qp_type != IB_QPT_GSI) { + dev_warn(&dev->pdev->dev, "queuepair type %d not supported\n", + init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + if (is_srq && !dev->dsr->caps.max_srq) { + dev_warn(&dev->pdev->dev, + "SRQs not supported by device\n"); + return ERR_PTR(-EINVAL); + } + + if (!atomic_add_unless(&dev->num_qps, 1, dev->dsr->caps.max_qp)) + return ERR_PTR(-ENOMEM); + + switch (init_attr->qp_type) { + case IB_QPT_GSI: + if (init_attr->port_num == 0 || + init_attr->port_num > pd->device->phys_port_cnt || + udata) { + dev_warn(&dev->pdev->dev, "invalid queuepair attrs\n"); + ret = -EINVAL; + goto err_qp; + } + /* fall through */ + case IB_QPT_RC: + case IB_QPT_UD: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + ret = -ENOMEM; + goto err_qp; + } + + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + mutex_init(&qp->mutex); + refcount_set(&qp->refcnt, 1); + init_completion(&qp->free); + + qp->state = IB_QPS_RESET; + qp->is_kernel = !(pd->uobject && udata); + + if (!qp->is_kernel) { + dev_dbg(&dev->pdev->dev, + "create queuepair from user space\n"); + + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + ret = -EFAULT; + goto err_qp; + } + + if (!is_srq) { + /* set qp->sq.wqe_cnt, shift, buf_size.. */ + qp->rumem = ib_umem_get(pd->uobject->context, + ucmd.rbuf_addr, + ucmd.rbuf_size, 0, 0); + if (IS_ERR(qp->rumem)) { + ret = PTR_ERR(qp->rumem); + goto err_qp; + } + qp->srq = NULL; + } else { + qp->rumem = NULL; + qp->srq = to_vsrq(init_attr->srq); + } + + qp->sumem = ib_umem_get(pd->uobject->context, + ucmd.sbuf_addr, + ucmd.sbuf_size, 0, 0); + if (IS_ERR(qp->sumem)) { + if (!is_srq) + ib_umem_release(qp->rumem); + ret = PTR_ERR(qp->sumem); + goto err_qp; + } + + qp->npages_send = ib_umem_page_count(qp->sumem); + if (!is_srq) + qp->npages_recv = ib_umem_page_count(qp->rumem); + else + qp->npages_recv = 0; + qp->npages = qp->npages_send + qp->npages_recv; + } else { + ret = pvrdma_set_sq_size(to_vdev(pd->device), + &init_attr->cap, qp); + if (ret) + goto err_qp; + + ret = pvrdma_set_rq_size(to_vdev(pd->device), + &init_attr->cap, qp); + if (ret) + goto err_qp; + + qp->npages = qp->npages_send + qp->npages_recv; + + /* Skip header page. */ + qp->sq.offset = PVRDMA_QP_NUM_HEADER_PAGES * PAGE_SIZE; + + /* Recv queue pages are after send pages. 
*/ + qp->rq.offset = qp->npages_send * PAGE_SIZE; + } + + if (qp->npages < 0 || qp->npages > PVRDMA_PAGE_DIR_MAX_PAGES) { + dev_warn(&dev->pdev->dev, + "overflow pages in queuepair\n"); + ret = -EINVAL; + goto err_umem; + } + + ret = pvrdma_page_dir_init(dev, &qp->pdir, qp->npages, + qp->is_kernel); + if (ret) { + dev_warn(&dev->pdev->dev, + "could not allocate page directory\n"); + goto err_umem; + } + + if (!qp->is_kernel) { + pvrdma_page_dir_insert_umem(&qp->pdir, qp->sumem, 0); + if (!is_srq) + pvrdma_page_dir_insert_umem(&qp->pdir, + qp->rumem, + qp->npages_send); + } else { + /* Ring state is always the first page. */ + qp->sq.ring = qp->pdir.pages[0]; + qp->rq.ring = is_srq ? NULL : &qp->sq.ring[1]; + } + break; + default: + ret = -EINVAL; + goto err_qp; + } + + /* Not supported */ + init_attr->cap.max_inline_data = 0; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_CREATE_QP; + cmd->pd_handle = to_vpd(pd)->pd_handle; + cmd->send_cq_handle = to_vcq(init_attr->send_cq)->cq_handle; + cmd->recv_cq_handle = to_vcq(init_attr->recv_cq)->cq_handle; + if (is_srq) + cmd->srq_handle = to_vsrq(init_attr->srq)->srq_handle; + else + cmd->srq_handle = 0; + cmd->max_send_wr = init_attr->cap.max_send_wr; + cmd->max_recv_wr = init_attr->cap.max_recv_wr; + cmd->max_send_sge = init_attr->cap.max_send_sge; + cmd->max_recv_sge = init_attr->cap.max_recv_sge; + cmd->max_inline_data = init_attr->cap.max_inline_data; + cmd->sq_sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) ? 1 : 0; + cmd->qp_type = ib_qp_type_to_pvrdma(init_attr->qp_type); + cmd->is_srq = is_srq; + cmd->lkey = 0; + cmd->access_flags = IB_ACCESS_LOCAL_WRITE; + cmd->total_chunks = qp->npages; + cmd->send_chunks = qp->npages_send - PVRDMA_QP_NUM_HEADER_PAGES; + cmd->pdir_dma = qp->pdir.dir_dma; + + dev_dbg(&dev->pdev->dev, "create queuepair with %d, %d, %d, %d\n", + cmd->max_send_wr, cmd->max_recv_wr, cmd->max_send_sge, + cmd->max_recv_sge); + + ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_QP_RESP); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not create queuepair, error: %d\n", ret); + goto err_pdir; + } + + /* max_send_wr/_recv_wr/_send_sge/_recv_sge/_inline_data */ + qp->qp_handle = resp->qpn; + qp->port = init_attr->port_num; + qp->ibqp.qp_num = resp->qpn; + spin_lock_irqsave(&dev->qp_tbl_lock, flags); + dev->qp_tbl[qp->qp_handle % dev->dsr->caps.max_qp] = qp; + spin_unlock_irqrestore(&dev->qp_tbl_lock, flags); + + return &qp->ibqp; + +err_pdir: + pvrdma_page_dir_cleanup(dev, &qp->pdir); +err_umem: + if (!qp->is_kernel) { + if (qp->rumem) + ib_umem_release(qp->rumem); + if (qp->sumem) + ib_umem_release(qp->sumem); + } +err_qp: + kfree(qp); + atomic_dec(&dev->num_qps); + + return ERR_PTR(ret); +} + +static void pvrdma_free_qp(struct pvrdma_qp *qp) +{ + struct pvrdma_dev *dev = to_vdev(qp->ibqp.device); + struct pvrdma_cq *scq; + struct pvrdma_cq *rcq; + unsigned long flags, scq_flags, rcq_flags; + + /* In case cq is polling */ + get_cqs(qp, &scq, &rcq); + pvrdma_lock_cqs(scq, rcq, &scq_flags, &rcq_flags); + + _pvrdma_flush_cqe(qp, scq); + if (scq != rcq) + _pvrdma_flush_cqe(qp, rcq); + + spin_lock_irqsave(&dev->qp_tbl_lock, flags); + dev->qp_tbl[qp->qp_handle] = NULL; + spin_unlock_irqrestore(&dev->qp_tbl_lock, flags); + + pvrdma_unlock_cqs(scq, rcq, &scq_flags, &rcq_flags); + + if (refcount_dec_and_test(&qp->refcnt)) + complete(&qp->free); + wait_for_completion(&qp->free); + + if (!qp->is_kernel) { + if (qp->rumem) + ib_umem_release(qp->rumem); + if (qp->sumem) + ib_umem_release(qp->sumem); + } + 
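	/*
	 * Teardown ordering note: by this point the CQEs for this QP have
	 * been flushed under both CQ locks, the qp_tbl slot has been
	 * cleared, and the wait on qp->free above has ensured that any
	 * event or interrupt handler still holding a reference has
	 * finished.  Only after that are the user memory registrations
	 * (above) and the page directory (below) released.
	 */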
+ pvrdma_page_dir_cleanup(dev, &qp->pdir); + + kfree(qp); + + atomic_dec(&dev->num_qps); +} + +/** + * pvrdma_destroy_qp - destroy a queue pair + * @qp: the queue pair to destroy + * + * @return: 0 on success. + */ +int pvrdma_destroy_qp(struct ib_qp *qp) +{ + struct pvrdma_qp *vqp = to_vqp(qp); + union pvrdma_cmd_req req; + struct pvrdma_cmd_destroy_qp *cmd = &req.destroy_qp; + int ret; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_DESTROY_QP; + cmd->qp_handle = vqp->qp_handle; + + ret = pvrdma_cmd_post(to_vdev(qp->device), &req, NULL, 0); + if (ret < 0) + dev_warn(&to_vdev(qp->device)->pdev->dev, + "destroy queuepair failed, error: %d\n", ret); + + pvrdma_free_qp(vqp); + + return 0; +} + +/** + * pvrdma_modify_qp - modify queue pair attributes + * @ibqp: the queue pair + * @attr: the new queue pair's attributes + * @attr_mask: attributes mask + * @udata: user data + * + * @returns 0 on success, otherwise returns an errno. + */ +int pvrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct pvrdma_dev *dev = to_vdev(ibqp->device); + struct pvrdma_qp *qp = to_vqp(ibqp); + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_modify_qp *cmd = &req.modify_qp; + enum ib_qp_state cur_state, next_state; + int ret; + + /* Sanity checking. Should need lock here */ + mutex_lock(&qp->mutex); + cur_state = (attr_mask & IB_QP_CUR_STATE) ? attr->cur_qp_state : + qp->state; + next_state = (attr_mask & IB_QP_STATE) ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, next_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_ETHERNET)) { + ret = -EINVAL; + goto out; + } + + if (attr_mask & IB_QP_PORT) { + if (attr->port_num == 0 || + attr->port_num > ibqp->device->phys_port_cnt) { + ret = -EINVAL; + goto out; + } + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + if (attr->min_rnr_timer > 31) { + ret = -EINVAL; + goto out; + } + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + if (attr->pkey_index >= dev->dsr->caps.max_pkeys) { + ret = -EINVAL; + goto out; + } + } + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (cur_state == next_state && cur_state == IB_QPS_RESET) { + ret = 0; + goto out; + } + + qp->state = next_state; + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_MODIFY_QP; + cmd->qp_handle = qp->qp_handle; + cmd->attr_mask = ib_qp_attr_mask_to_pvrdma(attr_mask); + cmd->attrs.qp_state = ib_qp_state_to_pvrdma(attr->qp_state); + cmd->attrs.cur_qp_state = + ib_qp_state_to_pvrdma(attr->cur_qp_state); + cmd->attrs.path_mtu = ib_mtu_to_pvrdma(attr->path_mtu); + cmd->attrs.path_mig_state = + ib_mig_state_to_pvrdma(attr->path_mig_state); + cmd->attrs.qkey = attr->qkey; + cmd->attrs.rq_psn = attr->rq_psn; + cmd->attrs.sq_psn = attr->sq_psn; + cmd->attrs.dest_qp_num = attr->dest_qp_num; + cmd->attrs.qp_access_flags = + ib_access_flags_to_pvrdma(attr->qp_access_flags); + cmd->attrs.pkey_index = attr->pkey_index; + cmd->attrs.alt_pkey_index = attr->alt_pkey_index; + cmd->attrs.en_sqd_async_notify = attr->en_sqd_async_notify; + cmd->attrs.sq_draining = attr->sq_draining; + cmd->attrs.max_rd_atomic = attr->max_rd_atomic; + cmd->attrs.max_dest_rd_atomic = attr->max_dest_rd_atomic; + cmd->attrs.min_rnr_timer = attr->min_rnr_timer; + cmd->attrs.port_num = attr->port_num; + cmd->attrs.timeout = attr->timeout; + cmd->attrs.retry_cnt = attr->retry_cnt; + cmd->attrs.rnr_retry = attr->rnr_retry; + cmd->attrs.alt_port_num = attr->alt_port_num; + cmd->attrs.alt_timeout = attr->alt_timeout; + 
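	/*
	 * The scalar attributes copied above are expected to be consumed by
	 * the device only for the bits selected in attr_mask; the compound
	 * fields (QP capabilities and the primary/alternate address handles)
	 * are translated into the PVRDMA ABI layout by the helpers below,
	 * defined in pvrdma_misc.c later in this patch.  As an illustration
	 * only, a hypothetical kernel consumer would drive this verb through
	 * ib_modify_qp(), e.g. for a RESET->INIT transition:
	 *
	 *	struct ib_qp_attr attr = {
	 *		.qp_state	 = IB_QPS_INIT,
	 *		.pkey_index	 = 0,
	 *		.port_num	 = 1,
	 *		.qp_access_flags = IB_ACCESS_LOCAL_WRITE,
	 *	};
	 *
	 *	err = ib_modify_qp(qp, &attr,
	 *			   IB_QP_STATE | IB_QP_PKEY_INDEX |
	 *			   IB_QP_PORT | IB_QP_ACCESS_FLAGS);
	 */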
ib_qp_cap_to_pvrdma(&cmd->attrs.cap, &attr->cap); + rdma_ah_attr_to_pvrdma(&cmd->attrs.ah_attr, &attr->ah_attr); + rdma_ah_attr_to_pvrdma(&cmd->attrs.alt_ah_attr, &attr->alt_ah_attr); + + ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_MODIFY_QP_RESP); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not modify queuepair, error: %d\n", ret); + } else if (rsp.hdr.err > 0) { + dev_warn(&dev->pdev->dev, + "cannot modify queuepair, error: %d\n", rsp.hdr.err); + ret = -EINVAL; + } + + if (ret == 0 && next_state == IB_QPS_RESET) + pvrdma_reset_qp(qp); + +out: + mutex_unlock(&qp->mutex); + + return ret; +} + +static inline void *get_sq_wqe(struct pvrdma_qp *qp, unsigned int n) +{ + return pvrdma_page_dir_get_ptr(&qp->pdir, + qp->sq.offset + n * qp->sq.wqe_size); +} + +static inline void *get_rq_wqe(struct pvrdma_qp *qp, unsigned int n) +{ + return pvrdma_page_dir_get_ptr(&qp->pdir, + qp->rq.offset + n * qp->rq.wqe_size); +} + +static int set_reg_seg(struct pvrdma_sq_wqe_hdr *wqe_hdr, struct ib_reg_wr *wr) +{ + struct pvrdma_user_mr *mr = to_vmr(wr->mr); + + wqe_hdr->wr.fast_reg.iova_start = mr->ibmr.iova; + wqe_hdr->wr.fast_reg.pl_pdir_dma = mr->pdir.dir_dma; + wqe_hdr->wr.fast_reg.page_shift = mr->page_shift; + wqe_hdr->wr.fast_reg.page_list_len = mr->npages; + wqe_hdr->wr.fast_reg.length = mr->ibmr.length; + wqe_hdr->wr.fast_reg.access_flags = wr->access; + wqe_hdr->wr.fast_reg.rkey = wr->key; + + return pvrdma_page_dir_insert_page_list(&mr->pdir, mr->pages, + mr->npages); +} + +/** + * pvrdma_post_send - post send work request entries on a QP + * @ibqp: the QP + * @wr: work request list to post + * @bad_wr: the first bad WR returned + * + * @return: 0 on success, otherwise errno returned. + */ +int pvrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct pvrdma_qp *qp = to_vqp(ibqp); + struct pvrdma_dev *dev = to_vdev(ibqp->device); + unsigned long flags; + struct pvrdma_sq_wqe_hdr *wqe_hdr; + struct pvrdma_sge *sge; + int i, ret; + + /* + * In states lower than RTS, we can fail immediately. In other states, + * just post and let the device figure it out. + */ + if (qp->state < IB_QPS_RTS) { + *bad_wr = wr; + return -EINVAL; + } + + spin_lock_irqsave(&qp->sq.lock, flags); + + while (wr) { + unsigned int tail = 0; + + if (unlikely(!pvrdma_idx_ring_has_space( + qp->sq.ring, qp->sq.wqe_cnt, &tail))) { + dev_warn_ratelimited(&dev->pdev->dev, + "send queue is full\n"); + *bad_wr = wr; + ret = -ENOMEM; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_sg || wr->num_sge < 0)) { + dev_warn_ratelimited(&dev->pdev->dev, + "send SGE overflow\n"); + *bad_wr = wr; + ret = -EINVAL; + goto out; + } + + if (unlikely(wr->opcode < 0)) { + dev_warn_ratelimited(&dev->pdev->dev, + "invalid send opcode\n"); + *bad_wr = wr; + ret = -EINVAL; + goto out; + } + + /* + * Only support UD, RC. + * Need to check opcode table for thorough checking. 
+ * opcode _UD _UC _RC + * _SEND x x x + * _SEND_WITH_IMM x x x + * _RDMA_WRITE x x + * _RDMA_WRITE_WITH_IMM x x + * _LOCAL_INV x x + * _SEND_WITH_INV x x + * _RDMA_READ x + * _ATOMIC_CMP_AND_SWP x + * _ATOMIC_FETCH_AND_ADD x + * _MASK_ATOMIC_CMP_AND_SWP x + * _MASK_ATOMIC_FETCH_AND_ADD x + * _REG_MR x + * + */ + if (qp->ibqp.qp_type != IB_QPT_UD && + qp->ibqp.qp_type != IB_QPT_RC && + wr->opcode != IB_WR_SEND) { + dev_warn_ratelimited(&dev->pdev->dev, + "unsupported queuepair type\n"); + *bad_wr = wr; + ret = -EINVAL; + goto out; + } else if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_GSI) { + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) { + dev_warn_ratelimited(&dev->pdev->dev, + "invalid send opcode\n"); + *bad_wr = wr; + ret = -EINVAL; + goto out; + } + } + + wqe_hdr = (struct pvrdma_sq_wqe_hdr *)get_sq_wqe(qp, tail); + memset(wqe_hdr, 0, sizeof(*wqe_hdr)); + wqe_hdr->wr_id = wr->wr_id; + wqe_hdr->num_sge = wr->num_sge; + wqe_hdr->opcode = ib_wr_opcode_to_pvrdma(wr->opcode); + wqe_hdr->send_flags = ib_send_flags_to_pvrdma(wr->send_flags); + if (wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wqe_hdr->ex.imm_data = wr->ex.imm_data; + + switch (qp->ibqp.qp_type) { + case IB_QPT_GSI: + case IB_QPT_UD: + if (unlikely(!ud_wr(wr)->ah)) { + dev_warn_ratelimited(&dev->pdev->dev, + "invalid address handle\n"); + *bad_wr = wr; + ret = -EINVAL; + goto out; + } + + /* + * Use qkey from qp context if high order bit set, + * otherwise from work request. + */ + wqe_hdr->wr.ud.remote_qpn = ud_wr(wr)->remote_qpn; + wqe_hdr->wr.ud.remote_qkey = + ud_wr(wr)->remote_qkey & 0x80000000 ? + qp->qkey : ud_wr(wr)->remote_qkey; + wqe_hdr->wr.ud.av = to_vah(ud_wr(wr)->ah)->av; + + break; + case IB_QPT_RC: + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + wqe_hdr->wr.rdma.remote_addr = + rdma_wr(wr)->remote_addr; + wqe_hdr->wr.rdma.rkey = rdma_wr(wr)->rkey; + break; + case IB_WR_LOCAL_INV: + case IB_WR_SEND_WITH_INV: + wqe_hdr->ex.invalidate_rkey = + wr->ex.invalidate_rkey; + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + wqe_hdr->wr.atomic.remote_addr = + atomic_wr(wr)->remote_addr; + wqe_hdr->wr.atomic.rkey = atomic_wr(wr)->rkey; + wqe_hdr->wr.atomic.compare_add = + atomic_wr(wr)->compare_add; + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) + wqe_hdr->wr.atomic.swap = + atomic_wr(wr)->swap; + break; + case IB_WR_REG_MR: + ret = set_reg_seg(wqe_hdr, reg_wr(wr)); + if (ret < 0) { + dev_warn_ratelimited(&dev->pdev->dev, + "Failed to set fast register work request\n"); + *bad_wr = wr; + goto out; + } + break; + default: + break; + } + + break; + default: + dev_warn_ratelimited(&dev->pdev->dev, + "invalid queuepair type\n"); + ret = -EINVAL; + *bad_wr = wr; + goto out; + } + + sge = (struct pvrdma_sge *)(wqe_hdr + 1); + for (i = 0; i < wr->num_sge; i++) { + /* Need to check wqe_size 0 or max size */ + sge->addr = wr->sg_list[i].addr; + sge->length = wr->sg_list[i].length; + sge->lkey = wr->sg_list[i].lkey; + sge++; + } + + /* Make sure wqe is written before index update */ + smp_wmb(); + + /* Update shared sq ring */ + pvrdma_idx_ring_inc(&qp->sq.ring->prod_tail, + qp->sq.wqe_cnt); + + wr = wr->next; + } + + ret = 0; + +out: + spin_unlock_irqrestore(&qp->sq.lock, flags); + + if (!ret) + pvrdma_write_uar_qp(dev, PVRDMA_UAR_QP_SEND | qp->qp_handle); + + return ret; +} + +/** + * pvrdma_post_receive - post receive work request entries on a QP + * @ibqp: the 
QP + * @wr: the work request list to post + * @bad_wr: the first bad WR returned + * + * @return: 0 on success, otherwise errno returned. + */ +int pvrdma_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct pvrdma_dev *dev = to_vdev(ibqp->device); + unsigned long flags; + struct pvrdma_qp *qp = to_vqp(ibqp); + struct pvrdma_rq_wqe_hdr *wqe_hdr; + struct pvrdma_sge *sge; + int ret = 0; + int i; + + /* + * In the RESET state, we can fail immediately. For other states, + * just post and let the device figure it out. + */ + if (qp->state == IB_QPS_RESET) { + *bad_wr = wr; + return -EINVAL; + } + + if (qp->srq) { + dev_warn(&dev->pdev->dev, "QP associated with SRQ\n"); + *bad_wr = wr; + return -EINVAL; + } + + spin_lock_irqsave(&qp->rq.lock, flags); + + while (wr) { + unsigned int tail = 0; + + if (unlikely(wr->num_sge > qp->rq.max_sg || + wr->num_sge < 0)) { + ret = -EINVAL; + *bad_wr = wr; + dev_warn_ratelimited(&dev->pdev->dev, + "recv SGE overflow\n"); + goto out; + } + + if (unlikely(!pvrdma_idx_ring_has_space( + qp->rq.ring, qp->rq.wqe_cnt, &tail))) { + ret = -ENOMEM; + *bad_wr = wr; + dev_warn_ratelimited(&dev->pdev->dev, + "recv queue full\n"); + goto out; + } + + wqe_hdr = (struct pvrdma_rq_wqe_hdr *)get_rq_wqe(qp, tail); + wqe_hdr->wr_id = wr->wr_id; + wqe_hdr->num_sge = wr->num_sge; + wqe_hdr->total_len = 0; + + sge = (struct pvrdma_sge *)(wqe_hdr + 1); + for (i = 0; i < wr->num_sge; i++) { + sge->addr = wr->sg_list[i].addr; + sge->length = wr->sg_list[i].length; + sge->lkey = wr->sg_list[i].lkey; + sge++; + } + + /* Make sure wqe is written before index update */ + smp_wmb(); + + /* Update shared rq ring */ + pvrdma_idx_ring_inc(&qp->rq.ring->prod_tail, + qp->rq.wqe_cnt); + + wr = wr->next; + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + pvrdma_write_uar_qp(dev, PVRDMA_UAR_QP_RECV | qp->qp_handle); + + return ret; + +out: + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return ret; +} + +/** + * pvrdma_query_qp - query a queue pair's attributes + * @ibqp: the queue pair to query + * @attr: the queue pair's attributes + * @attr_mask: attributes mask + * @init_attr: initial queue pair attributes + * + * @returns 0 on success, otherwise returns an errno. 
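+ * + * If the QP is still in the RESET state the device is not queried and only + * the locally cached software state is reported.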
+ */ +int pvrdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct pvrdma_dev *dev = to_vdev(ibqp->device); + struct pvrdma_qp *qp = to_vqp(ibqp); + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_query_qp *cmd = &req.query_qp; + struct pvrdma_cmd_query_qp_resp *resp = &rsp.query_qp_resp; + int ret = 0; + + mutex_lock(&qp->mutex); + + if (qp->state == IB_QPS_RESET) { + attr->qp_state = IB_QPS_RESET; + goto out; + } + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_QUERY_QP; + cmd->qp_handle = qp->qp_handle; + cmd->attr_mask = ib_qp_attr_mask_to_pvrdma(attr_mask); + + ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_QUERY_QP_RESP); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not query queuepair, error: %d\n", ret); + goto out; + } + + attr->qp_state = pvrdma_qp_state_to_ib(resp->attrs.qp_state); + attr->cur_qp_state = + pvrdma_qp_state_to_ib(resp->attrs.cur_qp_state); + attr->path_mtu = pvrdma_mtu_to_ib(resp->attrs.path_mtu); + attr->path_mig_state = + pvrdma_mig_state_to_ib(resp->attrs.path_mig_state); + attr->qkey = resp->attrs.qkey; + attr->rq_psn = resp->attrs.rq_psn; + attr->sq_psn = resp->attrs.sq_psn; + attr->dest_qp_num = resp->attrs.dest_qp_num; + attr->qp_access_flags = + pvrdma_access_flags_to_ib(resp->attrs.qp_access_flags); + attr->pkey_index = resp->attrs.pkey_index; + attr->alt_pkey_index = resp->attrs.alt_pkey_index; + attr->en_sqd_async_notify = resp->attrs.en_sqd_async_notify; + attr->sq_draining = resp->attrs.sq_draining; + attr->max_rd_atomic = resp->attrs.max_rd_atomic; + attr->max_dest_rd_atomic = resp->attrs.max_dest_rd_atomic; + attr->min_rnr_timer = resp->attrs.min_rnr_timer; + attr->port_num = resp->attrs.port_num; + attr->timeout = resp->attrs.timeout; + attr->retry_cnt = resp->attrs.retry_cnt; + attr->rnr_retry = resp->attrs.rnr_retry; + attr->alt_port_num = resp->attrs.alt_port_num; + attr->alt_timeout = resp->attrs.alt_timeout; + pvrdma_qp_cap_to_ib(&attr->cap, &resp->attrs.cap); + pvrdma_ah_attr_to_rdma(&attr->ah_attr, &resp->attrs.ah_attr); + pvrdma_ah_attr_to_rdma(&attr->alt_ah_attr, &resp->attrs.alt_ah_attr); + + qp->state = attr->qp_state; + + ret = 0; + +out: + attr->cur_qp_state = attr->qp_state; + + init_attr->event_handler = qp->ibqp.event_handler; + init_attr->qp_context = qp->ibqp.qp_context; + init_attr->send_cq = qp->ibqp.send_cq; + init_attr->recv_cq = qp->ibqp.recv_cq; + init_attr->srq = qp->ibqp.srq; + init_attr->xrcd = NULL; + init_attr->cap = attr->cap; + init_attr->sq_sig_type = 0; + init_attr->qp_type = qp->ibqp.qp_type; + init_attr->create_flags = 0; + init_attr->port_num = qp->port; + + mutex_unlock(&qp->mutex); + return ret; +} diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h new file mode 100644 index 0000000..8b558ae --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __PVRDMA_RING_H__ +#define __PVRDMA_RING_H__ + +#include + +#define PVRDMA_INVALID_IDX -1 /* Invalid index. */ + +struct pvrdma_ring { + atomic_t prod_tail; /* Producer tail. */ + atomic_t cons_head; /* Consumer head. */ +}; + +struct pvrdma_ring_state { + struct pvrdma_ring tx; /* Tx ring. */ + struct pvrdma_ring rx; /* Rx ring. */ +}; + +static inline int pvrdma_idx_valid(__u32 idx, __u32 max_elems) +{ + /* Generates fewer instructions than a less-than. */ + return (idx & ~((max_elems << 1) - 1)) == 0; +} + +static inline __s32 pvrdma_idx(atomic_t *var, __u32 max_elems) +{ + const unsigned int idx = atomic_read(var); + + if (pvrdma_idx_valid(idx, max_elems)) + return idx & (max_elems - 1); + return PVRDMA_INVALID_IDX; +} + +static inline void pvrdma_idx_ring_inc(atomic_t *var, __u32 max_elems) +{ + __u32 idx = atomic_read(var) + 1; /* Increment. */ + + idx &= (max_elems << 1) - 1; /* Modulo size, flip gen. 
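With max_elems = 8, for example, the index runs over 0..15: the low bits select the slot and the extra top bit acts as a generation flag, which is how the helpers below tell a full ring (tail == head ^ max_elems) from an empty one (tail == head). The send and receive paths use the usual producer pattern: check pvrdma_idx_ring_has_space(), write the WQE at the returned tail, smp_wmb(), then advance prod_tail with this helper.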
*/ + atomic_set(var, idx); +} + +static inline __s32 pvrdma_idx_ring_has_space(const struct pvrdma_ring *r, + __u32 max_elems, __u32 *out_tail) +{ + const __u32 tail = atomic_read(&r->prod_tail); + const __u32 head = atomic_read(&r->cons_head); + + if (pvrdma_idx_valid(tail, max_elems) && + pvrdma_idx_valid(head, max_elems)) { + *out_tail = tail & (max_elems - 1); + return tail != (head ^ max_elems); + } + return PVRDMA_INVALID_IDX; +} + +static inline __s32 pvrdma_idx_ring_has_data(const struct pvrdma_ring *r, + __u32 max_elems, __u32 *out_head) +{ + const __u32 tail = atomic_read(&r->prod_tail); + const __u32 head = atomic_read(&r->cons_head); + + if (pvrdma_idx_valid(tail, max_elems) && + pvrdma_idx_valid(head, max_elems)) { + *out_head = head & (max_elems - 1); + return tail != head; + } + return PVRDMA_INVALID_IDX; +} + +#endif /* __PVRDMA_RING_H__ */ diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c new file mode 100644 index 0000000..af23596 --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2016-2017 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "pvrdma.h" + +int pvrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + /* No support for kernel clients. 
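SRQs are only created on behalf of userspace (see pvrdma_create_srq), so their receive buffers live in user memory and the kernel has nothing to post into.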
*/ + return -EOPNOTSUPP; +} + +/** + * pvrdma_query_srq - query shared receive queue + * @ibsrq: the shared receive queue to query + * @srq_attr: attributes to query and return to client + * + * @return: 0 for success, otherwise returns an errno. + */ +int pvrdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct pvrdma_dev *dev = to_vdev(ibsrq->device); + struct pvrdma_srq *srq = to_vsrq(ibsrq); + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_query_srq *cmd = &req.query_srq; + struct pvrdma_cmd_query_srq_resp *resp = &rsp.query_srq_resp; + int ret; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_QUERY_SRQ; + cmd->srq_handle = srq->srq_handle; + + ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_QUERY_SRQ_RESP); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not query shared receive queue, error: %d\n", + ret); + return -EINVAL; + } + + srq_attr->srq_limit = resp->attrs.srq_limit; + srq_attr->max_wr = resp->attrs.max_wr; + srq_attr->max_sge = resp->attrs.max_sge; + + return 0; +} + +/** + * pvrdma_create_srq - create shared receive queue + * @pd: protection domain + * @init_attr: shared receive queue attributes + * @udata: user data + * + * @return: the ib_srq pointer on success, otherwise returns an errno. + */ +struct ib_srq *pvrdma_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct pvrdma_srq *srq = NULL; + struct pvrdma_dev *dev = to_vdev(pd->device); + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_create_srq *cmd = &req.create_srq; + struct pvrdma_cmd_create_srq_resp *resp = &rsp.create_srq_resp; + struct pvrdma_create_srq_resp srq_resp = {0}; + struct pvrdma_create_srq ucmd; + unsigned long flags; + int ret; + + if (!(pd->uobject && udata)) { + /* No support for kernel clients. 
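Without a user context there is no udata describing the SRQ buffer, so kernel allocations are rejected.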
*/ + dev_warn(&dev->pdev->dev, + "no shared receive queue support for kernel client\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->srq_type != IB_SRQT_BASIC) { + dev_warn(&dev->pdev->dev, + "shared receive queue type %d not supported\n", + init_attr->srq_type); + return ERR_PTR(-EINVAL); + } + + if (init_attr->attr.max_wr > dev->dsr->caps.max_srq_wr || + init_attr->attr.max_sge > dev->dsr->caps.max_srq_sge) { + dev_warn(&dev->pdev->dev, + "shared receive queue size invalid\n"); + return ERR_PTR(-EINVAL); + } + + if (!atomic_add_unless(&dev->num_srqs, 1, dev->dsr->caps.max_srq)) + return ERR_PTR(-ENOMEM); + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + ret = -ENOMEM; + goto err_srq; + } + + spin_lock_init(&srq->lock); + refcount_set(&srq->refcnt, 1); + init_completion(&srq->free); + + dev_dbg(&dev->pdev->dev, + "create shared receive queue from user space\n"); + + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + ret = -EFAULT; + goto err_srq; + } + + srq->umem = ib_umem_get(pd->uobject->context, + ucmd.buf_addr, + ucmd.buf_size, 0, 0); + if (IS_ERR(srq->umem)) { + ret = PTR_ERR(srq->umem); + goto err_srq; + } + + srq->npages = ib_umem_page_count(srq->umem); + + if (srq->npages < 0 || srq->npages > PVRDMA_PAGE_DIR_MAX_PAGES) { + dev_warn(&dev->pdev->dev, + "overflow pages in shared receive queue\n"); + ret = -EINVAL; + goto err_umem; + } + + ret = pvrdma_page_dir_init(dev, &srq->pdir, srq->npages, false); + if (ret) { + dev_warn(&dev->pdev->dev, + "could not allocate page directory\n"); + goto err_umem; + } + + pvrdma_page_dir_insert_umem(&srq->pdir, srq->umem, 0); + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_CREATE_SRQ; + cmd->srq_type = init_attr->srq_type; + cmd->nchunks = srq->npages; + cmd->pd_handle = to_vpd(pd)->pd_handle; + cmd->attrs.max_wr = init_attr->attr.max_wr; + cmd->attrs.max_sge = init_attr->attr.max_sge; + cmd->attrs.srq_limit = init_attr->attr.srq_limit; + cmd->pdir_dma = srq->pdir.dir_dma; + + ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_SRQ_RESP); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not create shared receive queue, error: %d\n", + ret); + goto err_page_dir; + } + + srq->srq_handle = resp->srqn; + srq_resp.srqn = resp->srqn; + spin_lock_irqsave(&dev->srq_tbl_lock, flags); + dev->srq_tbl[srq->srq_handle % dev->dsr->caps.max_srq] = srq; + spin_unlock_irqrestore(&dev->srq_tbl_lock, flags); + + /* Copy udata back. */ + if (ib_copy_to_udata(udata, &srq_resp, sizeof(srq_resp))) { + dev_warn(&dev->pdev->dev, "failed to copy back udata\n"); + pvrdma_destroy_srq(&srq->ibsrq); + return ERR_PTR(-EINVAL); + } + + return &srq->ibsrq; + +err_page_dir: + pvrdma_page_dir_cleanup(dev, &srq->pdir); +err_umem: + ib_umem_release(srq->umem); +err_srq: + kfree(srq); + atomic_dec(&dev->num_srqs); + + return ERR_PTR(ret); +} + +static void pvrdma_free_srq(struct pvrdma_dev *dev, struct pvrdma_srq *srq) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->srq_tbl_lock, flags); + dev->srq_tbl[srq->srq_handle] = NULL; + spin_unlock_irqrestore(&dev->srq_tbl_lock, flags); + + if (refcount_dec_and_test(&srq->refcnt)) + complete(&srq->free); + wait_for_completion(&srq->free); + + /* There is no support for kernel clients, so this is safe. */ + ib_umem_release(srq->umem); + + pvrdma_page_dir_cleanup(dev, &srq->pdir); + + kfree(srq); + + atomic_dec(&dev->num_srqs); +} + +/** + * pvrdma_destroy_srq - destroy shared receive queue + * @srq: the shared receive queue to destroy + * + * @return: 0 for success. 
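+ * + * Always returns 0: even if the DESTROY_SRQ device command fails, the driver + * still releases its local resources and drops the SRQ count.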
+ */ +int pvrdma_destroy_srq(struct ib_srq *srq) +{ + struct pvrdma_srq *vsrq = to_vsrq(srq); + union pvrdma_cmd_req req; + struct pvrdma_cmd_destroy_srq *cmd = &req.destroy_srq; + struct pvrdma_dev *dev = to_vdev(srq->device); + int ret; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_DESTROY_SRQ; + cmd->srq_handle = vsrq->srq_handle; + + ret = pvrdma_cmd_post(dev, &req, NULL, 0); + if (ret < 0) + dev_warn(&dev->pdev->dev, + "destroy shared receive queue failed, error: %d\n", + ret); + + pvrdma_free_srq(dev, vsrq); + + return 0; +} + +/** + * pvrdma_modify_srq - modify shared receive queue attributes + * @ibsrq: the shared receive queue to modify + * @attr: the shared receive queue's new attributes + * @attr_mask: attributes mask + * @udata: user data + * + * @returns 0 on success, otherwise returns an errno. + */ +int pvrdma_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct pvrdma_srq *vsrq = to_vsrq(ibsrq); + union pvrdma_cmd_req req; + struct pvrdma_cmd_modify_srq *cmd = &req.modify_srq; + struct pvrdma_dev *dev = to_vdev(ibsrq->device); + int ret; + + /* Only support SRQ limit. */ + if (!(attr_mask & IB_SRQ_LIMIT)) + return -EINVAL; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_MODIFY_SRQ; + cmd->srq_handle = vsrq->srq_handle; + cmd->attrs.srq_limit = attr->srq_limit; + cmd->attr_mask = attr_mask; + + ret = pvrdma_cmd_post(dev, &req, NULL, 0); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "could not modify shared receive queue, error: %d\n", + ret); + + return -EINVAL; + } + + return ret; +} diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c new file mode 100644 index 0000000..a51463c --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -0,0 +1,593 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "pvrdma.h" + +/** + * pvrdma_query_device - query device + * @ibdev: the device to query + * @props: the device properties + * @uhw: user data + * + * @return: 0 on success, otherwise negative errno + */ +int pvrdma_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct pvrdma_dev *dev = to_vdev(ibdev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + memset(props, 0, sizeof(*props)); + + props->fw_ver = dev->dsr->caps.fw_ver; + props->sys_image_guid = dev->dsr->caps.sys_image_guid; + props->max_mr_size = dev->dsr->caps.max_mr_size; + props->page_size_cap = dev->dsr->caps.page_size_cap; + props->vendor_id = dev->dsr->caps.vendor_id; + props->vendor_part_id = dev->pdev->device; + props->hw_ver = dev->dsr->caps.hw_ver; + props->max_qp = dev->dsr->caps.max_qp; + props->max_qp_wr = dev->dsr->caps.max_qp_wr; + props->device_cap_flags = dev->dsr->caps.device_cap_flags; + props->max_sge = dev->dsr->caps.max_sge; + props->max_sge_rd = PVRDMA_GET_CAP(dev, dev->dsr->caps.max_sge, + dev->dsr->caps.max_sge_rd); + props->max_srq = dev->dsr->caps.max_srq; + props->max_srq_wr = dev->dsr->caps.max_srq_wr; + props->max_srq_sge = dev->dsr->caps.max_srq_sge; + props->max_cq = dev->dsr->caps.max_cq; + props->max_cqe = dev->dsr->caps.max_cqe; + props->max_mr = dev->dsr->caps.max_mr; + props->max_pd = dev->dsr->caps.max_pd; + props->max_qp_rd_atom = dev->dsr->caps.max_qp_rd_atom; + props->max_qp_init_rd_atom = dev->dsr->caps.max_qp_init_rd_atom; + props->atomic_cap = + dev->dsr->caps.atomic_ops & + (PVRDMA_ATOMIC_OP_COMP_SWAP | PVRDMA_ATOMIC_OP_FETCH_ADD) ? 
+ IB_ATOMIC_HCA : IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + props->max_ah = dev->dsr->caps.max_ah; + props->max_pkeys = dev->dsr->caps.max_pkeys; + props->local_ca_ack_delay = dev->dsr->caps.local_ca_ack_delay; + if ((dev->dsr->caps.bmme_flags & PVRDMA_BMME_FLAG_LOCAL_INV) && + (dev->dsr->caps.bmme_flags & PVRDMA_BMME_FLAG_REMOTE_INV) && + (dev->dsr->caps.bmme_flags & PVRDMA_BMME_FLAG_FAST_REG_WR)) { + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + props->max_fast_reg_page_list_len = PVRDMA_GET_CAP(dev, + PVRDMA_MAX_FAST_REG_PAGES, + dev->dsr->caps.max_fast_reg_page_list_len); + } + + props->device_cap_flags |= IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_RC_RNR_NAK_GEN; + + return 0; +} + +/** + * pvrdma_query_port - query device port attributes + * @ibdev: the device to query + * @port: the port number + * @props: the device properties + * + * @return: 0 on success, otherwise negative errno + */ +int pvrdma_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct pvrdma_dev *dev = to_vdev(ibdev); + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_query_port *cmd = &req.query_port; + struct pvrdma_cmd_query_port_resp *resp = &rsp.query_port_resp; + int err; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_QUERY_PORT; + cmd->port_num = port; + + err = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_QUERY_PORT_RESP); + if (err < 0) { + dev_warn(&dev->pdev->dev, + "could not query port, error: %d\n", err); + return err; + } + + /* props being zeroed by the caller, avoid zeroing it here */ + + props->state = pvrdma_port_state_to_ib(resp->attrs.state); + props->max_mtu = pvrdma_mtu_to_ib(resp->attrs.max_mtu); + props->active_mtu = pvrdma_mtu_to_ib(resp->attrs.active_mtu); + props->gid_tbl_len = resp->attrs.gid_tbl_len; + props->port_cap_flags = + pvrdma_port_cap_flags_to_ib(resp->attrs.port_cap_flags); + props->port_cap_flags |= IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; + props->max_msg_sz = resp->attrs.max_msg_sz; + props->bad_pkey_cntr = resp->attrs.bad_pkey_cntr; + props->qkey_viol_cntr = resp->attrs.qkey_viol_cntr; + props->pkey_tbl_len = resp->attrs.pkey_tbl_len; + props->lid = resp->attrs.lid; + props->sm_lid = resp->attrs.sm_lid; + props->lmc = resp->attrs.lmc; + props->max_vl_num = resp->attrs.max_vl_num; + props->sm_sl = resp->attrs.sm_sl; + props->subnet_timeout = resp->attrs.subnet_timeout; + props->init_type_reply = resp->attrs.init_type_reply; + props->active_width = pvrdma_port_width_to_ib(resp->attrs.active_width); + props->active_speed = pvrdma_port_speed_to_ib(resp->attrs.active_speed); + props->phys_state = resp->attrs.phys_state; + + return 0; +} + +/** + * pvrdma_query_gid - query device gid + * @ibdev: the device to query + * @port: the port number + * @index: the index + * @gid: the device gid value + * + * @return: 0 on success, otherwise negative errno + */ +int pvrdma_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct pvrdma_dev *dev = to_vdev(ibdev); + + if (index >= dev->dsr->caps.gid_tbl_len) + return -EINVAL; + + memcpy(gid, &dev->sgid_tbl[index], sizeof(union ib_gid)); + + return 0; +} + +/** + * pvrdma_query_pkey - query device port's P_Key table + * @ibdev: the device to query + * @port: the port number + * @index: the index + * @pkey: the device P_Key value + * + * @return: 0 on success, otherwise negative errno + */ +int pvrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + int err = 0; + union 
pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_query_pkey *cmd = &req.query_pkey; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_QUERY_PKEY; + cmd->port_num = port; + cmd->index = index; + + err = pvrdma_cmd_post(to_vdev(ibdev), &req, &rsp, + PVRDMA_CMD_QUERY_PKEY_RESP); + if (err < 0) { + dev_warn(&to_vdev(ibdev)->pdev->dev, + "could not query pkey, error: %d\n", err); + return err; + } + + *pkey = rsp.query_pkey_resp.pkey; + + return 0; +} + +enum rdma_link_layer pvrdma_port_link_layer(struct ib_device *ibdev, + u8 port) +{ + return IB_LINK_LAYER_ETHERNET; +} + +int pvrdma_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + unsigned long flags; + + if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + dev_warn(&to_vdev(ibdev)->pdev->dev, + "unsupported device modify mask %#x\n", mask); + return -EOPNOTSUPP; + } + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + spin_lock_irqsave(&to_vdev(ibdev)->desc_lock, flags); + memcpy(ibdev->node_desc, props->node_desc, 64); + spin_unlock_irqrestore(&to_vdev(ibdev)->desc_lock, flags); + } + + if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { + mutex_lock(&to_vdev(ibdev)->port_mutex); + to_vdev(ibdev)->sys_image_guid = + cpu_to_be64(props->sys_image_guid); + mutex_unlock(&to_vdev(ibdev)->port_mutex); + } + + return 0; +} + +/** + * pvrdma_modify_port - modify device port attributes + * @ibdev: the device to modify + * @port: the port number + * @mask: attributes to modify + * @props: the device properties + * + * @return: 0 on success, otherwise negative errno + */ +int pvrdma_modify_port(struct ib_device *ibdev, u8 port, int mask, + struct ib_port_modify *props) +{ + struct ib_port_attr attr; + struct pvrdma_dev *vdev = to_vdev(ibdev); + int ret; + + if (mask & ~IB_PORT_SHUTDOWN) { + dev_warn(&vdev->pdev->dev, + "unsupported port modify mask %#x\n", mask); + return -EOPNOTSUPP; + } + + mutex_lock(&vdev->port_mutex); + ret = ib_query_port(ibdev, port, &attr); + if (ret) + goto out; + + vdev->port_cap_mask |= props->set_port_cap_mask; + vdev->port_cap_mask &= ~props->clr_port_cap_mask; + + if (mask & IB_PORT_SHUTDOWN) + vdev->ib_active = false; + +out: + mutex_unlock(&vdev->port_mutex); + return ret; +} + +/** + * pvrdma_alloc_ucontext - allocate ucontext + * @ibdev: the IB device + * @udata: user data + * + * @return: the ib_ucontext pointer on success, otherwise errno. 
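+ * + * Fails with -EAGAIN while the device is not active; if the CREATE_UC command + * or the copy back to userspace fails, the UAR allocation is unwound.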
+ */ +struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct pvrdma_dev *vdev = to_vdev(ibdev); + struct pvrdma_ucontext *context; + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_create_uc *cmd = &req.create_uc; + struct pvrdma_cmd_create_uc_resp *resp = &rsp.create_uc_resp; + struct pvrdma_alloc_ucontext_resp uresp = {0}; + int ret; + void *ptr; + + if (!vdev->ib_active) + return ERR_PTR(-EAGAIN); + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + context->dev = vdev; + ret = pvrdma_uar_alloc(vdev, &context->uar); + if (ret) { + kfree(context); + return ERR_PTR(-ENOMEM); + } + + /* get ctx_handle from host */ + memset(cmd, 0, sizeof(*cmd)); + cmd->pfn = context->uar.pfn; + cmd->hdr.cmd = PVRDMA_CMD_CREATE_UC; + ret = pvrdma_cmd_post(vdev, &req, &rsp, PVRDMA_CMD_CREATE_UC_RESP); + if (ret < 0) { + dev_warn(&vdev->pdev->dev, + "could not create ucontext, error: %d\n", ret); + ptr = ERR_PTR(ret); + goto err; + } + + context->ctx_handle = resp->ctx_handle; + + /* copy back to user */ + uresp.qp_tab_size = vdev->dsr->caps.max_qp; + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (ret) { + pvrdma_uar_free(vdev, &context->uar); + context->ibucontext.device = ibdev; + pvrdma_dealloc_ucontext(&context->ibucontext); + return ERR_PTR(-EFAULT); + } + + return &context->ibucontext; + +err: + pvrdma_uar_free(vdev, &context->uar); + kfree(context); + return ptr; +} + +/** + * pvrdma_dealloc_ucontext - deallocate ucontext + * @ibcontext: the ucontext + * + * @return: 0 on success, otherwise errno. + */ +int pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct pvrdma_ucontext *context = to_vucontext(ibcontext); + union pvrdma_cmd_req req; + struct pvrdma_cmd_destroy_uc *cmd = &req.destroy_uc; + int ret; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_DESTROY_UC; + cmd->ctx_handle = context->ctx_handle; + + ret = pvrdma_cmd_post(context->dev, &req, NULL, 0); + if (ret < 0) + dev_warn(&context->dev->pdev->dev, + "destroy ucontext failed, error: %d\n", ret); + + /* Free the UAR even if the device command failed */ + pvrdma_uar_free(to_vdev(ibcontext->device), &context->uar); + kfree(context); + + return ret; +} + +/** + * pvrdma_mmap - create mmap region + * @ibcontext: the user context + * @vma: the VMA + * + * @return: 0 on success, otherwise errno. + */ +int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct pvrdma_ucontext *context = to_vucontext(ibcontext); + unsigned long start = vma->vm_start; + unsigned long size = vma->vm_end - vma->vm_start; + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + + dev_dbg(&context->dev->pdev->dev, "create mmap region\n"); + + if ((size != PAGE_SIZE) || (offset & ~PAGE_MASK)) { + dev_warn(&context->dev->pdev->dev, + "invalid params for mmap region\n"); + return -EINVAL; + } + + /* Map UAR to kernel space, VM_LOCKED? */ + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + if (io_remap_pfn_range(vma, start, context->uar.pfn, size, + vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +/** + * pvrdma_alloc_pd - allocate protection domain + * @ibdev: the IB device + * @context: user context + * @udata: user data + * + * @return: the ib_pd protection domain pointer on success, otherwise errno. 
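+ * + * Userspace callers get the PD handle copied back through udata; kernel + * callers receive a privileged PD created against context handle 0.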
+ */ +struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct pvrdma_pd *pd; + struct pvrdma_dev *dev = to_vdev(ibdev); + union pvrdma_cmd_req req; + union pvrdma_cmd_resp rsp; + struct pvrdma_cmd_create_pd *cmd = &req.create_pd; + struct pvrdma_cmd_create_pd_resp *resp = &rsp.create_pd_resp; + struct pvrdma_alloc_pd_resp pd_resp = {0}; + int ret; + void *ptr; + + /* Check allowed max pds */ + if (!atomic_add_unless(&dev->num_pds, 1, dev->dsr->caps.max_pd)) + return ERR_PTR(-ENOMEM); + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) { + ptr = ERR_PTR(-ENOMEM); + goto err; + } + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_CREATE_PD; + cmd->ctx_handle = (context) ? to_vucontext(context)->ctx_handle : 0; + ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_PD_RESP); + if (ret < 0) { + dev_warn(&dev->pdev->dev, + "failed to allocate protection domain, error: %d\n", + ret); + ptr = ERR_PTR(ret); + goto freepd; + } + + pd->privileged = !context; + pd->pd_handle = resp->pd_handle; + pd->pdn = resp->pd_handle; + pd_resp.pdn = resp->pd_handle; + + if (context) { + if (ib_copy_to_udata(udata, &pd_resp, sizeof(pd_resp))) { + dev_warn(&dev->pdev->dev, + "failed to copy back protection domain\n"); + pvrdma_dealloc_pd(&pd->ibpd); + return ERR_PTR(-EFAULT); + } + } + + /* u32 pd handle */ + return &pd->ibpd; + +freepd: + kfree(pd); +err: + atomic_dec(&dev->num_pds); + return ptr; +} + +/** + * pvrdma_dealloc_pd - deallocate protection domain + * @pd: the protection domain to be released + * + * @return: 0 on success, otherwise errno. + */ +int pvrdma_dealloc_pd(struct ib_pd *pd) +{ + struct pvrdma_dev *dev = to_vdev(pd->device); + union pvrdma_cmd_req req; + struct pvrdma_cmd_destroy_pd *cmd = &req.destroy_pd; + int ret; + + memset(cmd, 0, sizeof(*cmd)); + cmd->hdr.cmd = PVRDMA_CMD_DESTROY_PD; + cmd->pd_handle = to_vpd(pd)->pd_handle; + + ret = pvrdma_cmd_post(dev, &req, NULL, 0); + if (ret) + dev_warn(&dev->pdev->dev, + "could not dealloc protection domain, error: %d\n", + ret); + + kfree(to_vpd(pd)); + atomic_dec(&dev->num_pds); + + return 0; +} + +/** + * pvrdma_create_ah - create an address handle + * @pd: the protection domain + * @ah_attr: the attributes of the AH + * @udata: user data blob + * + * @return: the ib_ah pointer on success, otherwise errno. 
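+ * + * The address handle is built entirely from the GRH and DMAC fields in the + * driver; no command is posted to the device.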
+ */ +struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) +{ + struct pvrdma_dev *dev = to_vdev(pd->device); + struct pvrdma_ah *ah; + const struct ib_global_route *grh; + u8 port_num = rdma_ah_get_port_num(ah_attr); + + if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return ERR_PTR(-EINVAL); + + grh = rdma_ah_read_grh(ah_attr); + if ((ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE) || + rdma_is_multicast_addr((struct in6_addr *)grh->dgid.raw)) + return ERR_PTR(-EINVAL); + + if (!atomic_add_unless(&dev->num_ahs, 1, dev->dsr->caps.max_ah)) + return ERR_PTR(-ENOMEM); + + ah = kzalloc(sizeof(*ah), GFP_KERNEL); + if (!ah) { + atomic_dec(&dev->num_ahs); + return ERR_PTR(-ENOMEM); + } + + ah->av.port_pd = to_vpd(pd)->pd_handle | (port_num << 24); + ah->av.src_path_bits = rdma_ah_get_path_bits(ah_attr); + ah->av.src_path_bits |= 0x80; + ah->av.gid_index = grh->sgid_index; + ah->av.hop_limit = grh->hop_limit; + ah->av.sl_tclass_flowlabel = (grh->traffic_class << 20) | + grh->flow_label; + memcpy(ah->av.dgid, grh->dgid.raw, 16); + memcpy(ah->av.dmac, ah_attr->roce.dmac, ETH_ALEN); + + ah->ibah.device = pd->device; + ah->ibah.pd = pd; + ah->ibah.uobject = NULL; + + return &ah->ibah; +} + +/** + * pvrdma_destroy_ah - destroy an address handle + * @ah: the address handle to destroyed + * + * @return: 0 on success. + */ +int pvrdma_destroy_ah(struct ib_ah *ah) +{ + struct pvrdma_dev *dev = to_vdev(ah->device); + + kfree(to_vah(ah)); + atomic_dec(&dev->num_ahs); + + return 0; +} diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h new file mode 100644 index 0000000..b7b2572 --- /dev/null +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __PVRDMA_VERBS_H__ +#define __PVRDMA_VERBS_H__ + +#include + +union pvrdma_gid { + u8 raw[16]; + struct { + __be64 subnet_prefix; + __be64 interface_id; + } global; +}; + +enum pvrdma_link_layer { + PVRDMA_LINK_LAYER_UNSPECIFIED, + PVRDMA_LINK_LAYER_INFINIBAND, + PVRDMA_LINK_LAYER_ETHERNET, +}; + +enum pvrdma_mtu { + PVRDMA_MTU_256 = 1, + PVRDMA_MTU_512 = 2, + PVRDMA_MTU_1024 = 3, + PVRDMA_MTU_2048 = 4, + PVRDMA_MTU_4096 = 5, +}; + +static inline int pvrdma_mtu_enum_to_int(enum pvrdma_mtu mtu) +{ + switch (mtu) { + case PVRDMA_MTU_256: return 256; + case PVRDMA_MTU_512: return 512; + case PVRDMA_MTU_1024: return 1024; + case PVRDMA_MTU_2048: return 2048; + case PVRDMA_MTU_4096: return 4096; + default: return -1; + } +} + +static inline enum pvrdma_mtu pvrdma_mtu_int_to_enum(int mtu) +{ + switch (mtu) { + case 256: return PVRDMA_MTU_256; + case 512: return PVRDMA_MTU_512; + case 1024: return PVRDMA_MTU_1024; + case 2048: return PVRDMA_MTU_2048; + case 4096: + default: return PVRDMA_MTU_4096; + } +} + +enum pvrdma_port_state { + PVRDMA_PORT_NOP = 0, + PVRDMA_PORT_DOWN = 1, + PVRDMA_PORT_INIT = 2, + PVRDMA_PORT_ARMED = 3, + PVRDMA_PORT_ACTIVE = 4, + PVRDMA_PORT_ACTIVE_DEFER = 5, +}; + +enum pvrdma_port_cap_flags { + PVRDMA_PORT_SM = 1 << 1, + PVRDMA_PORT_NOTICE_SUP = 1 << 2, + PVRDMA_PORT_TRAP_SUP = 1 << 3, + PVRDMA_PORT_OPT_IPD_SUP = 1 << 4, + PVRDMA_PORT_AUTO_MIGR_SUP = 1 << 5, + PVRDMA_PORT_SL_MAP_SUP = 1 << 6, + PVRDMA_PORT_MKEY_NVRAM = 1 << 7, + PVRDMA_PORT_PKEY_NVRAM = 1 << 8, + PVRDMA_PORT_LED_INFO_SUP = 1 << 9, + PVRDMA_PORT_SM_DISABLED = 1 << 10, + PVRDMA_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, + PVRDMA_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + PVRDMA_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, + PVRDMA_PORT_CM_SUP = 1 << 16, + PVRDMA_PORT_SNMP_TUNNEL_SUP = 1 << 17, + PVRDMA_PORT_REINIT_SUP = 1 << 18, + PVRDMA_PORT_DEVICE_MGMT_SUP = 1 << 19, + PVRDMA_PORT_VENDOR_CLASS_SUP = 1 << 20, + PVRDMA_PORT_DR_NOTICE_SUP = 1 << 21, + PVRDMA_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, + PVRDMA_PORT_BOOT_MGMT_SUP = 1 << 23, + PVRDMA_PORT_LINK_LATENCY_SUP = 1 << 24, + PVRDMA_PORT_CLIENT_REG_SUP = 1 << 25, + PVRDMA_PORT_IP_BASED_GIDS = 1 << 26, + PVRDMA_PORT_CAP_FLAGS_MAX = PVRDMA_PORT_IP_BASED_GIDS, +}; + +enum pvrdma_port_width { + PVRDMA_WIDTH_1X = 1, + PVRDMA_WIDTH_4X = 2, + PVRDMA_WIDTH_8X = 4, + PVRDMA_WIDTH_12X = 8, +}; + +static inline int pvrdma_width_enum_to_int(enum pvrdma_port_width width) +{ + switch (width) { + case PVRDMA_WIDTH_1X: return 1; + case PVRDMA_WIDTH_4X: return 4; + case PVRDMA_WIDTH_8X: return 8; + case PVRDMA_WIDTH_12X: return 12; + default: return -1; + } +} + +enum pvrdma_port_speed { + PVRDMA_SPEED_SDR = 1, + PVRDMA_SPEED_DDR = 2, + PVRDMA_SPEED_QDR = 4, + PVRDMA_SPEED_FDR10 = 8, + PVRDMA_SPEED_FDR = 16, + PVRDMA_SPEED_EDR = 32, +}; + +struct pvrdma_port_attr { + enum pvrdma_port_state state; + enum pvrdma_mtu max_mtu; + enum pvrdma_mtu active_mtu; + u32 gid_tbl_len; + u32 port_cap_flags; + u32 max_msg_sz; + u32 bad_pkey_cntr; + u32 
qkey_viol_cntr; + u16 pkey_tbl_len; + u16 lid; + u16 sm_lid; + u8 lmc; + u8 max_vl_num; + u8 sm_sl; + u8 subnet_timeout; + u8 init_type_reply; + u8 active_width; + u8 active_speed; + u8 phys_state; + u8 reserved[2]; +}; + +struct pvrdma_global_route { + union pvrdma_gid dgid; + u32 flow_label; + u8 sgid_index; + u8 hop_limit; + u8 traffic_class; + u8 reserved; +}; + +struct pvrdma_grh { + __be32 version_tclass_flow; + __be16 paylen; + u8 next_hdr; + u8 hop_limit; + union pvrdma_gid sgid; + union pvrdma_gid dgid; +}; + +enum pvrdma_ah_flags { + PVRDMA_AH_GRH = 1, +}; + +enum pvrdma_rate { + PVRDMA_RATE_PORT_CURRENT = 0, + PVRDMA_RATE_2_5_GBPS = 2, + PVRDMA_RATE_5_GBPS = 5, + PVRDMA_RATE_10_GBPS = 3, + PVRDMA_RATE_20_GBPS = 6, + PVRDMA_RATE_30_GBPS = 4, + PVRDMA_RATE_40_GBPS = 7, + PVRDMA_RATE_60_GBPS = 8, + PVRDMA_RATE_80_GBPS = 9, + PVRDMA_RATE_120_GBPS = 10, + PVRDMA_RATE_14_GBPS = 11, + PVRDMA_RATE_56_GBPS = 12, + PVRDMA_RATE_112_GBPS = 13, + PVRDMA_RATE_168_GBPS = 14, + PVRDMA_RATE_25_GBPS = 15, + PVRDMA_RATE_100_GBPS = 16, + PVRDMA_RATE_200_GBPS = 17, + PVRDMA_RATE_300_GBPS = 18, +}; + +struct pvrdma_ah_attr { + struct pvrdma_global_route grh; + u16 dlid; + u16 vlan_id; + u8 sl; + u8 src_path_bits; + u8 static_rate; + u8 ah_flags; + u8 port_num; + u8 dmac[6]; + u8 reserved; +}; + +enum pvrdma_cq_notify_flags { + PVRDMA_CQ_SOLICITED = 1 << 0, + PVRDMA_CQ_NEXT_COMP = 1 << 1, + PVRDMA_CQ_SOLICITED_MASK = PVRDMA_CQ_SOLICITED | + PVRDMA_CQ_NEXT_COMP, + PVRDMA_CQ_REPORT_MISSED_EVENTS = 1 << 2, +}; + +struct pvrdma_qp_cap { + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 max_inline_data; + u32 reserved; +}; + +enum pvrdma_sig_type { + PVRDMA_SIGNAL_ALL_WR, + PVRDMA_SIGNAL_REQ_WR, +}; + +enum pvrdma_qp_type { + PVRDMA_QPT_SMI, + PVRDMA_QPT_GSI, + PVRDMA_QPT_RC, + PVRDMA_QPT_UC, + PVRDMA_QPT_UD, + PVRDMA_QPT_RAW_IPV6, + PVRDMA_QPT_RAW_ETHERTYPE, + PVRDMA_QPT_RAW_PACKET = 8, + PVRDMA_QPT_XRC_INI = 9, + PVRDMA_QPT_XRC_TGT, + PVRDMA_QPT_MAX, +}; + +enum pvrdma_qp_create_flags { + PVRDMA_QP_CREATE_IPOPVRDMA_UD_LSO = 1 << 0, + PVRDMA_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, +}; + +enum pvrdma_qp_attr_mask { + PVRDMA_QP_STATE = 1 << 0, + PVRDMA_QP_CUR_STATE = 1 << 1, + PVRDMA_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, + PVRDMA_QP_ACCESS_FLAGS = 1 << 3, + PVRDMA_QP_PKEY_INDEX = 1 << 4, + PVRDMA_QP_PORT = 1 << 5, + PVRDMA_QP_QKEY = 1 << 6, + PVRDMA_QP_AV = 1 << 7, + PVRDMA_QP_PATH_MTU = 1 << 8, + PVRDMA_QP_TIMEOUT = 1 << 9, + PVRDMA_QP_RETRY_CNT = 1 << 10, + PVRDMA_QP_RNR_RETRY = 1 << 11, + PVRDMA_QP_RQ_PSN = 1 << 12, + PVRDMA_QP_MAX_QP_RD_ATOMIC = 1 << 13, + PVRDMA_QP_ALT_PATH = 1 << 14, + PVRDMA_QP_MIN_RNR_TIMER = 1 << 15, + PVRDMA_QP_SQ_PSN = 1 << 16, + PVRDMA_QP_MAX_DEST_RD_ATOMIC = 1 << 17, + PVRDMA_QP_PATH_MIG_STATE = 1 << 18, + PVRDMA_QP_CAP = 1 << 19, + PVRDMA_QP_DEST_QPN = 1 << 20, + PVRDMA_QP_ATTR_MASK_MAX = PVRDMA_QP_DEST_QPN, +}; + +enum pvrdma_qp_state { + PVRDMA_QPS_RESET, + PVRDMA_QPS_INIT, + PVRDMA_QPS_RTR, + PVRDMA_QPS_RTS, + PVRDMA_QPS_SQD, + PVRDMA_QPS_SQE, + PVRDMA_QPS_ERR, +}; + +enum pvrdma_mig_state { + PVRDMA_MIG_MIGRATED, + PVRDMA_MIG_REARM, + PVRDMA_MIG_ARMED, +}; + +enum pvrdma_mw_type { + PVRDMA_MW_TYPE_1 = 1, + PVRDMA_MW_TYPE_2 = 2, +}; + +struct pvrdma_srq_attr { + u32 max_wr; + u32 max_sge; + u32 srq_limit; + u32 reserved; +}; + +struct pvrdma_qp_attr { + enum pvrdma_qp_state qp_state; + enum pvrdma_qp_state cur_qp_state; + enum pvrdma_mtu path_mtu; + enum pvrdma_mig_state path_mig_state; + u32 qkey; + u32 rq_psn; + u32 sq_psn; + u32 
dest_qp_num; + u32 qp_access_flags; + u16 pkey_index; + u16 alt_pkey_index; + u8 en_sqd_async_notify; + u8 sq_draining; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + u8 min_rnr_timer; + u8 port_num; + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u8 alt_port_num; + u8 alt_timeout; + u8 reserved[5]; + struct pvrdma_qp_cap cap; + struct pvrdma_ah_attr ah_attr; + struct pvrdma_ah_attr alt_ah_attr; +}; + +enum pvrdma_send_flags { + PVRDMA_SEND_FENCE = 1 << 0, + PVRDMA_SEND_SIGNALED = 1 << 1, + PVRDMA_SEND_SOLICITED = 1 << 2, + PVRDMA_SEND_INLINE = 1 << 3, + PVRDMA_SEND_IP_CSUM = 1 << 4, + PVRDMA_SEND_FLAGS_MAX = PVRDMA_SEND_IP_CSUM, +}; + +enum pvrdma_access_flags { + PVRDMA_ACCESS_LOCAL_WRITE = 1 << 0, + PVRDMA_ACCESS_REMOTE_WRITE = 1 << 1, + PVRDMA_ACCESS_REMOTE_READ = 1 << 2, + PVRDMA_ACCESS_REMOTE_ATOMIC = 1 << 3, + PVRDMA_ACCESS_MW_BIND = 1 << 4, + PVRDMA_ZERO_BASED = 1 << 5, + PVRDMA_ACCESS_ON_DEMAND = 1 << 6, + PVRDMA_ACCESS_FLAGS_MAX = PVRDMA_ACCESS_ON_DEMAND, +}; + +int pvrdma_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *udata); +int pvrdma_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); +int pvrdma_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid); +int pvrdma_query_pkey(struct ib_device *ibdev, u8 port, + u16 index, u16 *pkey); +enum rdma_link_layer pvrdma_port_link_layer(struct ib_device *ibdev, + u8 port); +int pvrdma_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props); +int pvrdma_modify_port(struct ib_device *ibdev, u8 port, + int mask, struct ib_port_modify *props); +int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); +struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata); +int pvrdma_dealloc_ucontext(struct ib_ucontext *context); +struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int pvrdma_dealloc_pd(struct ib_pd *ibpd); +struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +int pvrdma_dereg_mr(struct ib_mr *mr); +struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg); +int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset); +int pvrdma_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int pvrdma_resize_cq(struct ib_cq *ibcq, int entries, + struct ib_udata *udata); +struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int pvrdma_resize_cq(struct ib_cq *ibcq, int entries, + struct ib_udata *udata); +int pvrdma_destroy_cq(struct ib_cq *cq); +int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); +struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +int pvrdma_destroy_ah(struct ib_ah *ah); + +struct ib_srq *pvrdma_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int pvrdma_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int pvrdma_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int pvrdma_destroy_srq(struct ib_srq 
*srq); +int pvrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int pvrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int pvrdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +int pvrdma_destroy_qp(struct ib_qp *qp); +int pvrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int pvrdma_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +#endif /* __PVRDMA_VERBS_H__ */ diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile new file mode 100644 index 0000000..8b095b2 --- /dev/null +++ b/drivers/infiniband/sw/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/ +obj-$(CONFIG_RDMA_RXE) += rxe/ diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig new file mode 100644 index 0000000..2b5513d --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/Kconfig @@ -0,0 +1,7 @@ +config INFINIBAND_RDMAVT + tristate "RDMA verbs transport library" + depends on 64BIT + depends on PCI + select DMA_VIRT_OPS + ---help--- + This is a common software verbs provider for RDMA networks. diff --git a/drivers/infiniband/sw/rdmavt/Makefile b/drivers/infiniband/sw/rdmavt/Makefile new file mode 100644 index 0000000..78b276a --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/Makefile @@ -0,0 +1,13 @@ +# +# rdmavt driver +# +# +# +# Called from the kernel module build system. +# +obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt.o + +rdmavt-y := vt.o ah.o cq.o mad.o mcast.o mmap.o mr.o pd.o qp.o \ + rc.o srq.o trace.o + +CFLAGS_trace.o = -I$(src) diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c new file mode 100644 index 0000000..ba3639a --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/ah.c @@ -0,0 +1,189 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include "ah.h" +#include "vt.h" /* for prints */ + +/** + * rvt_check_ah - validate the attributes of AH + * @ibdev: the ib device + * @ah_attr: the attributes of the AH + * + * If driver supports a more detailed check_ah function call back to it + * otherwise just check the basics. + * + * Return: 0 on success + */ +int rvt_check_ah(struct ib_device *ibdev, + struct rdma_ah_attr *ah_attr) +{ + int err; + int port_num = rdma_ah_get_port_num(ah_attr); + struct ib_port_attr port_attr; + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + u8 ah_flags = rdma_ah_get_ah_flags(ah_attr); + u8 static_rate = rdma_ah_get_static_rate(ah_attr); + + err = ib_query_port(ibdev, port_num, &port_attr); + if (err) + return -EINVAL; + if (port_num < 1 || + port_num > ibdev->phys_port_cnt) + return -EINVAL; + if (static_rate != IB_RATE_PORT_CURRENT && + ib_rate_to_mbps(static_rate) < 0) + return -EINVAL; + if ((ah_flags & IB_AH_GRH) && + rdma_ah_read_grh(ah_attr)->sgid_index >= port_attr.gid_tbl_len) + return -EINVAL; + if (rdi->driver_f.check_ah) + return rdi->driver_f.check_ah(ibdev, ah_attr); + return 0; +} +EXPORT_SYMBOL(rvt_check_ah); + +/** + * rvt_create_ah - create an address handle + * @pd: the protection domain + * @ah_attr: the attributes of the AH + * + * This may be called from interrupt context. 
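+ * That is why the allocation below uses GFP_ATOMIC and the AH accounting is + * protected by a spinlock rather than a sleeping lock.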
+ * + * Return: newly allocated ah + */ +struct ib_ah *rvt_create_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr) +{ + struct rvt_ah *ah; + struct rvt_dev_info *dev = ib_to_rvt(pd->device); + unsigned long flags; + + if (rvt_check_ah(pd->device, ah_attr)) + return ERR_PTR(-EINVAL); + + ah = kmalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + if (dev->n_ahs_allocated == dev->dparms.props.max_ah) { + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + kfree(ah); + return ERR_PTR(-ENOMEM); + } + + dev->n_ahs_allocated++; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + ah->attr = *ah_attr; + atomic_set(&ah->refcount, 0); + + if (dev->driver_f.notify_new_ah) + dev->driver_f.notify_new_ah(pd->device, ah_attr, ah); + + return &ah->ibah; +} + +/** + * rvt_destory_ah - Destory an address handle + * @ibah: address handle + * + * Return: 0 on success + */ +int rvt_destroy_ah(struct ib_ah *ibah) +{ + struct rvt_dev_info *dev = ib_to_rvt(ibah->device); + struct rvt_ah *ah = ibah_to_rvtah(ibah); + unsigned long flags; + + if (atomic_read(&ah->refcount) != 0) + return -EBUSY; + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + dev->n_ahs_allocated--; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + kfree(ah); + + return 0; +} + +/** + * rvt_modify_ah - modify an ah with given attrs + * @ibah: address handle to modify + * @ah_attr: attrs to apply + * + * Return: 0 on success + */ +int rvt_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct rvt_ah *ah = ibah_to_rvtah(ibah); + + if (rvt_check_ah(ibah->device, ah_attr)) + return -EINVAL; + + ah->attr = *ah_attr; + + return 0; +} + +/** + * rvt_query_ah - return attrs for ah + * @ibah: address handle to query + * @ah_attr: return info in this + * + * Return: always 0 + */ +int rvt_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct rvt_ah *ah = ibah_to_rvtah(ibah); + + *ah_attr = ah->attr; + + return 0; +} diff --git a/drivers/infiniband/sw/rdmavt/ah.h b/drivers/infiniband/sw/rdmavt/ah.h new file mode 100644 index 0000000..16105af --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/ah.h @@ -0,0 +1,59 @@ +#ifndef DEF_RVTAH_H +#define DEF_RVTAH_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
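The four AH entry points above are reached through the core verbs layer rather than called directly. A minimal kernel-side sketch of that lifecycle, assuming a valid PD and the pre-5.0 core helpers (rdma_create_ah() without flags), with placeholder DLID/port values:

#include <rdma/ib_verbs.h>

/* Sketch: exercise rvt_create_ah()/rvt_query_ah()/rvt_destroy_ah()
 * through the generic API for an rdmavt-backed device.
 */
static int ah_lifecycle_example(struct ib_pd *pd, u16 dlid, u8 port)
{
	struct rdma_ah_attr attr = {};
	struct rdma_ah_attr out = {};
	struct ib_ah *ah;

	attr.type = RDMA_AH_ATTR_TYPE_IB;
	rdma_ah_set_dlid(&attr, dlid);
	rdma_ah_set_port_num(&attr, port);
	rdma_ah_set_sl(&attr, 0);

	ah = rdma_create_ah(pd, &attr);
	if (IS_ERR(ah))
		return PTR_ERR(ah);

	rdma_query_ah(ah, &out);	/* copies back ah->attr */

	return rdma_destroy_ah(ah);	/* -EBUSY while still referenced */
}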
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +struct ib_ah *rvt_create_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr); +int rvt_destroy_ah(struct ib_ah *ibah); +int rvt_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); +int rvt_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); + +#endif /* DEF_RVTAH_H */ diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c new file mode 100644 index 0000000..fb52b66 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -0,0 +1,550 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include "cq.h" +#include "vt.h" +#include "trace.h" + +/** + * rvt_cq_enter - add a new entry to the completion queue + * @cq: completion queue + * @entry: work completion entry to add + * @solicited: true if @entry is solicited + * + * This may be called with qp->s_lock held. + */ +void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) +{ + struct rvt_cq_wc *wc; + unsigned long flags; + u32 head; + u32 next; + + spin_lock_irqsave(&cq->lock, flags); + + /* + * Note that the head pointer might be writable by user processes. + * Take care to verify it is a sane value. + */ + wc = cq->queue; + head = wc->head; + if (head >= (unsigned)cq->ibcq.cqe) { + head = cq->ibcq.cqe; + next = 0; + } else { + next = head + 1; + } + + if (unlikely(next == wc->tail)) { + spin_unlock_irqrestore(&cq->lock, flags); + if (cq->ibcq.event_handler) { + struct ib_event ev; + + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + return; + } + trace_rvt_cq_enter(cq, entry, head); + if (cq->ip) { + wc->uqueue[head].wr_id = entry->wr_id; + wc->uqueue[head].status = entry->status; + wc->uqueue[head].opcode = entry->opcode; + wc->uqueue[head].vendor_err = entry->vendor_err; + wc->uqueue[head].byte_len = entry->byte_len; + wc->uqueue[head].ex.imm_data = entry->ex.imm_data; + wc->uqueue[head].qp_num = entry->qp->qp_num; + wc->uqueue[head].src_qp = entry->src_qp; + wc->uqueue[head].wc_flags = entry->wc_flags; + wc->uqueue[head].pkey_index = entry->pkey_index; + wc->uqueue[head].slid = ib_lid_cpu16(entry->slid); + wc->uqueue[head].sl = entry->sl; + wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; + wc->uqueue[head].port_num = entry->port_num; + /* Make sure entry is written before the head index. */ + smp_wmb(); + } else { + wc->kqueue[head] = *entry; + } + wc->head = next; + + if (cq->notify == IB_CQ_NEXT_COMP || + (cq->notify == IB_CQ_SOLICITED && + (solicited || entry->status != IB_WC_SUCCESS))) { + /* + * This will cause send_complete() to be called in + * another thread. + */ + spin_lock(&cq->rdi->n_cqs_lock); + if (likely(cq->rdi->worker)) { + cq->notify = RVT_CQ_NONE; + cq->triggered++; + kthread_queue_work(cq->rdi->worker, &cq->comptask); + } + spin_unlock(&cq->rdi->n_cqs_lock); + } + + spin_unlock_irqrestore(&cq->lock, flags); +} +EXPORT_SYMBOL(rvt_cq_enter); + +static void send_complete(struct kthread_work *work) +{ + struct rvt_cq *cq = container_of(work, struct rvt_cq, comptask); + + /* + * The completion handler will most likely rearm the notification + * and poll for all pending entries. If a new completion entry + * is added while we are in this routine, queue_work() + * won't call us again until we return so we check triggered to + * see if we need to call the handler again. + */ + for (;;) { + u8 triggered = cq->triggered; + + /* + * IPoIB connected mode assumes the callback is from a + * soft IRQ. 
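rvt_cq_enter() above is the one call drivers use to land a completion in either the kernel ring (kqueue) or the user-mapped ring (uqueue). A sketch of the typical caller, assuming a receive-side completion with illustrative field values:

/* Sketch: complete one receive WR onto the QP's receive CQ. */
static void example_complete_recv(struct rvt_qp *qp, u64 wr_id, u32 len)
{
	struct ib_wc wc;

	memset(&wc, 0, sizeof(wc));
	wc.wr_id = wr_id;
	wc.status = IB_WC_SUCCESS;
	wc.opcode = IB_WC_RECV;
	wc.qp = &qp->ibqp;
	wc.byte_len = len;

	/* solicited == true also wakes IB_CQ_SOLICITED waiters */
	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, true);
}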
We simulate this by blocking "bottom halves". + * See the implementation for ipoib_cm_handle_tx_wc(), + * netif_tx_lock_bh() and netif_tx_lock(). + */ + local_bh_disable(); + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + local_bh_enable(); + + if (cq->triggered == triggered) + return; + } +} + +/** + * rvt_create_cq - create a completion queue + * @ibdev: the device this completion queue is attached to + * @attr: creation attributes + * @context: unused by the QLogic_IB driver + * @udata: user data for libibverbs.so + * + * Called by ib_create_cq() in the generic verbs code. + * + * Return: pointer to the completion queue or negative errno values + * for failure. + */ +struct ib_cq *rvt_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct rvt_cq *cq; + struct rvt_cq_wc *wc; + struct ib_cq *ret; + u32 sz; + unsigned int entries = attr->cqe; + + if (attr->flags) + return ERR_PTR(-EINVAL); + + if (entries < 1 || entries > rdi->dparms.props.max_cqe) + return ERR_PTR(-EINVAL); + + /* Allocate the completion queue structure. */ + cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node); + if (!cq) + return ERR_PTR(-ENOMEM); + + /* + * Allocate the completion queue entries and head/tail pointers. + * This is allocated separately so that it can be resized and + * also mapped into user space. + * We need to use vmalloc() in order to support mmap and large + * numbers of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (entries + 1); + else + sz += sizeof(struct ib_wc) * (entries + 1); + wc = udata ? + vmalloc_user(sz) : + vzalloc_node(sz, rdi->dparms.node); + if (!wc) { + ret = ERR_PTR(-ENOMEM); + goto bail_cq; + } + + /* + * Return the address of the WC as the offset to mmap. + * See rvt_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + + cq->ip = rvt_create_mmap_info(rdi, sz, context, wc); + if (!cq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wc; + } + + err = ib_copy_to_udata(udata, &cq->ip->offset, + sizeof(cq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } + + spin_lock_irq(&rdi->n_cqs_lock); + if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) { + spin_unlock_irq(&rdi->n_cqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + rdi->n_cqs_allocated++; + spin_unlock_irq(&rdi->n_cqs_lock); + + if (cq->ip) { + spin_lock_irq(&rdi->pending_lock); + list_add(&cq->ip->pending_mmaps, &rdi->pending_mmaps); + spin_unlock_irq(&rdi->pending_lock); + } + + /* + * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. + * The number of entries should be >= the number requested or return + * an error. + */ + cq->rdi = rdi; + cq->ibcq.cqe = entries; + cq->notify = RVT_CQ_NONE; + spin_lock_init(&cq->lock); + kthread_init_work(&cq->comptask, send_complete); + cq->queue = wc; + + ret = &cq->ibcq; + + goto done; + +bail_ip: + kfree(cq->ip); +bail_wc: + vfree(wc); +bail_cq: + kfree(cq); +done: + return ret; +} + +/** + * rvt_destroy_cq - destroy a completion queue + * @ibcq: the completion queue to destroy. + * + * Called by ib_destroy_cq() in the generic verbs code. 
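The ring sized by rvt_create_cq() above has cqe + 1 slots, so head == tail means empty and an advance of head onto tail means full; that is the case rvt_cq_enter() reports as IB_EVENT_CQ_ERR rather than overwriting. The same wraparound arithmetic as a small stand-alone illustration:

/* Illustration of the index handling used by the CQ ring:
 * valid indices run 0..cqe, leaving one slot always unused so
 * that "full" and "empty" remain distinguishable.
 */
static unsigned int cq_next_slot(unsigned int head, unsigned int cqe)
{
	return (head >= cqe) ? 0 : head + 1;
}

static int cq_ring_full(unsigned int head, unsigned int tail, unsigned int cqe)
{
	return cq_next_slot(head, cqe) == tail;
}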
+ * + * Return: always 0 + */ +int rvt_destroy_cq(struct ib_cq *ibcq) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + struct rvt_dev_info *rdi = cq->rdi; + + kthread_flush_work(&cq->comptask); + spin_lock_irq(&rdi->n_cqs_lock); + rdi->n_cqs_allocated--; + spin_unlock_irq(&rdi->n_cqs_lock); + if (cq->ip) + kref_put(&cq->ip->ref, rvt_release_mmap_info); + else + vfree(cq->queue); + kfree(cq); + + return 0; +} + +/** + * rvt_req_notify_cq - change the notification type for a completion queue + * @ibcq: the completion queue + * @notify_flags: the type of notification to request + * + * This may be called from interrupt context. Also called by + * ib_req_notify_cq() in the generic verbs code. + * + * Return: 0 for success. + */ +int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, flags); + /* + * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow + * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2). + */ + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; + + if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && + cq->queue->head != cq->queue->tail) + ret = 1; + + spin_unlock_irqrestore(&cq->lock, flags); + + return ret; +} + +/** + * rvt_resize_cq - change the size of the CQ + * @ibcq: the completion queue + * + * Return: 0 for success. + */ +int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + struct rvt_cq_wc *old_wc; + struct rvt_cq_wc *wc; + u32 head, tail, n; + int ret; + u32 sz; + struct rvt_dev_info *rdi = cq->rdi; + + if (cqe < 1 || cqe > rdi->dparms.props.max_cqe) + return -EINVAL; + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); + else + sz += sizeof(struct ib_wc) * (cqe + 1); + wc = udata ? + vmalloc_user(sz) : + vzalloc_node(sz, rdi->dparms.node); + if (!wc) + return -ENOMEM; + + /* Check that we can write the offset to mmap. */ + if (udata && udata->outlen >= sizeof(__u64)) { + __u64 offset = 0; + + ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&cq->lock); + /* + * Make sure head and tail are sane since they + * might be user writable. + */ + old_wc = cq->queue; + head = old_wc->head; + if (head > (u32)cq->ibcq.cqe) + head = (u32)cq->ibcq.cqe; + tail = old_wc->tail; + if (tail > (u32)cq->ibcq.cqe) + tail = (u32)cq->ibcq.cqe; + if (head < tail) + n = cq->ibcq.cqe + 1 + head - tail; + else + n = head - tail; + if (unlikely((u32)cqe < n)) { + ret = -EINVAL; + goto bail_unlock; + } + for (n = 0; tail != head; n++) { + if (cq->ip) + wc->uqueue[n] = old_wc->uqueue[tail]; + else + wc->kqueue[n] = old_wc->kqueue[tail]; + if (tail == (u32)cq->ibcq.cqe) + tail = 0; + else + tail++; + } + cq->ibcq.cqe = cqe; + wc->head = n; + wc->tail = 0; + cq->queue = wc; + spin_unlock_irq(&cq->lock); + + vfree(old_wc); + + if (cq->ip) { + struct rvt_mmap_info *ip = cq->ip; + + rvt_update_mmap_info(rdi, ip, sz, wc); + + /* + * Return the offset to mmap. + * See rvt_mmap() for details. 
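rvt_req_notify_cq() returning 1 when IB_CQ_REPORT_MISSED_EVENTS is set and the ring is non-empty exists to close the classic poll/arm race. A sketch of the consumer-side pattern it supports, using only the generic verbs calls (not part of this patch):

#include <rdma/ib_verbs.h>

/* Sketch: drain, re-arm, and re-poll if completions slipped in between. */
static void example_poll_and_arm(struct ib_cq *cq)
{
	struct ib_wc wc[8];
	int n;

	do {
		while ((n = ib_poll_cq(cq, ARRAY_SIZE(wc), wc)) > 0) {
			/* handle wc[0..n-1] here */
		}
	} while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				      IB_CQ_REPORT_MISSED_EVENTS) > 0);
}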
+ */ + if (udata && udata->outlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + return ret; + } + + spin_lock_irq(&rdi->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, &rdi->pending_mmaps); + spin_unlock_irq(&rdi->pending_lock); + } + + return 0; + +bail_unlock: + spin_unlock_irq(&cq->lock); +bail_free: + vfree(wc); + return ret; +} + +/** + * rvt_poll_cq - poll for work completion entries + * @ibcq: the completion queue to poll + * @num_entries: the maximum number of entries to return + * @entry: pointer to array where work completions are placed + * + * This may be called from interrupt context. Also called by ib_poll_cq() + * in the generic verbs code. + * + * Return: the number of completion entries polled. + */ +int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + struct rvt_cq_wc *wc; + unsigned long flags; + int npolled; + u32 tail; + + /* The kernel can only poll a kernel completion queue */ + if (cq->ip) + return -EINVAL; + + spin_lock_irqsave(&cq->lock, flags); + + wc = cq->queue; + tail = wc->tail; + if (tail > (u32)cq->ibcq.cqe) + tail = (u32)cq->ibcq.cqe; + for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { + if (tail == wc->head) + break; + /* The kernel doesn't need a RMB since it has the lock. */ + trace_rvt_cq_poll(cq, &wc->kqueue[tail], npolled); + *entry = wc->kqueue[tail]; + if (tail >= cq->ibcq.cqe) + tail = 0; + else + tail++; + } + wc->tail = tail; + + spin_unlock_irqrestore(&cq->lock, flags); + + return npolled; +} + +/** + * rvt_driver_cq_init - Init cq resources on behalf of driver + * @rdi: rvt dev structure + * + * Return: 0 on success + */ +int rvt_driver_cq_init(struct rvt_dev_info *rdi) +{ + int cpu; + struct kthread_worker *worker; + + if (rdi->worker) + return 0; + + spin_lock_init(&rdi->n_cqs_lock); + + cpu = cpumask_first(cpumask_of_node(rdi->dparms.node)); + worker = kthread_create_worker_on_cpu(cpu, 0, + "%s", rdi->dparms.cq_name); + if (IS_ERR(worker)) + return PTR_ERR(worker); + + set_user_nice(worker->task, MIN_NICE); + rdi->worker = worker; + return 0; +} + +/** + * rvt_cq_exit - tear down cq reources + * @rdi: rvt dev structure + */ +void rvt_cq_exit(struct rvt_dev_info *rdi) +{ + struct kthread_worker *worker; + + /* block future queuing from send_complete() */ + spin_lock_irq(&rdi->n_cqs_lock); + worker = rdi->worker; + if (!worker) { + spin_unlock_irq(&rdi->n_cqs_lock); + return; + } + rdi->worker = NULL; + spin_unlock_irq(&rdi->n_cqs_lock); + + kthread_destroy_worker(worker); +} diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h new file mode 100644 index 0000000..6182c29 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -0,0 +1,64 @@ +#ifndef DEF_RVTCQ_H +#define DEF_RVTCQ_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +struct ib_cq *rvt_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int rvt_destroy_cq(struct ib_cq *ibcq); +int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); +int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); +int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); +int rvt_driver_cq_init(struct rvt_dev_info *rdi); +void rvt_cq_exit(struct rvt_dev_info *rdi); +#endif /* DEF_RVTCQ_H */ diff --git a/drivers/infiniband/sw/rdmavt/mad.c b/drivers/infiniband/sw/rdmavt/mad.c new file mode 100644 index 0000000..d6981dc --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mad.c @@ -0,0 +1,171 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include "mad.h" +#include "vt.h" + +/** + * rvt_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port_num: the port number this packet came in on, 1 based from ib core + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + * + * Return: IB_MAD_RESULT_SUCCESS or error + */ +int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + /* + * MAD processing is quite different between hfi1 and qib. Therefore + * this is expected to be provided by the driver. Other drivers in the + * future may choose to implement this but it should not be made into a + * requirement. 
+ */ + if (ibport_num_to_idx(ibdev, port_num) < 0) + return -EINVAL; + + return IB_MAD_RESULT_FAILURE; +} + +static void rvt_send_mad_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + ib_free_send_mad(mad_send_wc->send_buf); +} + +/** + * rvt_create_mad_agents - create mad agents + * @rdi: rvt dev struct + * + * If driver needs to be notified of mad agent creation then call back + * + * Return 0 on success + */ +int rvt_create_mad_agents(struct rvt_dev_info *rdi) +{ + struct ib_mad_agent *agent; + struct rvt_ibport *rvp; + int p; + int ret; + + for (p = 0; p < rdi->dparms.nports; p++) { + rvp = rdi->ports[p]; + agent = ib_register_mad_agent(&rdi->ibdev, p + 1, + IB_QPT_SMI, + NULL, 0, rvt_send_mad_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + + rvp->send_agent = agent; + + if (rdi->driver_f.notify_create_mad_agent) + rdi->driver_f.notify_create_mad_agent(rdi, p); + } + + return 0; + +err: + for (p = 0; p < rdi->dparms.nports; p++) { + rvp = rdi->ports[p]; + if (rvp->send_agent) { + agent = rvp->send_agent; + rvp->send_agent = NULL; + ib_unregister_mad_agent(agent); + if (rdi->driver_f.notify_free_mad_agent) + rdi->driver_f.notify_free_mad_agent(rdi, p); + } + } + + return ret; +} + +/** + * rvt_free_mad_agents - free up mad agents + * @rdi: rvt dev struct + * + * If driver needs notification of mad agent removal make the call back + */ +void rvt_free_mad_agents(struct rvt_dev_info *rdi) +{ + struct ib_mad_agent *agent; + struct rvt_ibport *rvp; + int p; + + for (p = 0; p < rdi->dparms.nports; p++) { + rvp = rdi->ports[p]; + if (rvp->send_agent) { + agent = rvp->send_agent; + rvp->send_agent = NULL; + ib_unregister_mad_agent(agent); + } + if (rvp->sm_ah) { + rdma_destroy_ah(&rvp->sm_ah->ibah); + rvp->sm_ah = NULL; + } + + if (rdi->driver_f.notify_free_mad_agent) + rdi->driver_f.notify_free_mad_agent(rdi, p); + } +} + diff --git a/drivers/infiniband/sw/rdmavt/mad.h b/drivers/infiniband/sw/rdmavt/mad.h new file mode 100644 index 0000000..a9d6eec --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mad.h @@ -0,0 +1,60 @@ +#ifndef DEF_RVTMAD_H +#define DEF_RVTMAD_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
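rvt_process_mad() above deliberately returns IB_MAD_RESULT_FAILURE and leaves real MAD handling to the driver. A rough, hypothetical sketch of a driver-supplied handler with the same signature (the handled class and the reply-building step are illustrative only):

/* Hypothetical driver MAD handler matching the rvt_process_mad() signature. */
static int mydrv_process_mad(struct ib_device *ibdev, int mad_flags,
			     u8 port_num, const struct ib_wc *in_wc,
			     const struct ib_grh *in_grh,
			     const struct ib_mad_hdr *in, size_t in_mad_size,
			     struct ib_mad_hdr *out, size_t *out_mad_size,
			     u16 *out_mad_pkey_index)
{
	switch (in->mgmt_class) {
	case IB_MGMT_CLASS_PERF_MGMT:
		/* ... build the reply into *out, set *out_mad_size ... */
		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
	default:
		/* classes this driver does not handle: consumed, no reply */
		return IB_MAD_RESULT_SUCCESS;
	}
}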
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); +int rvt_create_mad_agents(struct rvt_dev_info *rdi); +void rvt_free_mad_agents(struct rvt_dev_info *rdi); +#endif /* DEF_RVTMAD_H */ diff --git a/drivers/infiniband/sw/rdmavt/mcast.c b/drivers/infiniband/sw/rdmavt/mcast.c new file mode 100644 index 0000000..dd11c6f --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mcast.c @@ -0,0 +1,443 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include + +#include "mcast.h" + +/** + * rvt_driver_mcast - init resources for multicast + * @rdi: rvt dev struct + * + * This is per device that registers with rdmavt + */ +void rvt_driver_mcast_init(struct rvt_dev_info *rdi) +{ + /* + * Anything that needs setup for multicast on a per driver or per rdi + * basis should be done in here. + */ + spin_lock_init(&rdi->n_mcast_grps_lock); +} + +/** + * mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct + * @qp: the QP to link + */ +static struct rvt_mcast_qp *rvt_mcast_qp_alloc(struct rvt_qp *qp) +{ + struct rvt_mcast_qp *mqp; + + mqp = kmalloc(sizeof(*mqp), GFP_KERNEL); + if (!mqp) + goto bail; + + mqp->qp = qp; + rvt_get_qp(qp); + +bail: + return mqp; +} + +static void rvt_mcast_qp_free(struct rvt_mcast_qp *mqp) +{ + struct rvt_qp *qp = mqp->qp; + + /* Notify hfi1_destroy_qp() if it is waiting. */ + rvt_put_qp(qp); + + kfree(mqp); +} + +/** + * mcast_alloc - allocate the multicast GID structure + * @mgid: the multicast GID + * @lid: the muilticast LID (host order) + * + * A list of QPs will be attached to this structure. + */ +static struct rvt_mcast *rvt_mcast_alloc(union ib_gid *mgid, u16 lid) +{ + struct rvt_mcast *mcast; + + mcast = kzalloc(sizeof(*mcast), GFP_KERNEL); + if (!mcast) + goto bail; + + mcast->mcast_addr.mgid = *mgid; + mcast->mcast_addr.lid = lid; + + INIT_LIST_HEAD(&mcast->qp_list); + init_waitqueue_head(&mcast->wait); + atomic_set(&mcast->refcount, 0); + +bail: + return mcast; +} + +static void rvt_mcast_free(struct rvt_mcast *mcast) +{ + struct rvt_mcast_qp *p, *tmp; + + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) + rvt_mcast_qp_free(p); + + kfree(mcast); +} + +/** + * rvt_mcast_find - search the global table for the given multicast GID/LID + * NOTE: It is valid to have 1 MLID with multiple MGIDs. It is not valid + * to have 1 MGID with multiple MLIDs. + * @ibp: the IB port structure + * @mgid: the multicast GID to search for + * @lid: the multicast LID portion of the multicast address (host order) + * + * The caller is responsible for decrementing the reference count if found. + * + * Return: NULL if not found. 
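As the comment says, a successful rvt_mcast_find() returns with the group reference count elevated, and rvt_detach_mcast() later waits on mcast->wait for it to drain. A sketch of the caller side, assuming a receive path fanning a packet out to the attached QPs (the delivery step itself is elided):

/* Sketch: look up a multicast group and release the reference when done. */
static void example_mcast_deliver(struct rvt_ibport *ibp,
				  union ib_gid *mgid, u16 mlid)
{
	struct rvt_mcast *mcast;
	struct rvt_mcast_qp *p;

	mcast = rvt_mcast_find(ibp, mgid, mlid);
	if (!mcast)
		return;		/* nothing attached to this group */

	rcu_read_lock();
	list_for_each_entry_rcu(p, &mcast->qp_list, list) {
		/* hand the packet to p->qp here */
	}
	rcu_read_unlock();

	/* Drop the reference taken by rvt_mcast_find() and wake any
	 * detach waiting for the group to go idle.
	 */
	if (atomic_dec_return(&mcast->refcount) <= 1)
		wake_up(&mcast->wait);
}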
+ */ +struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid, + u16 lid) +{ + struct rb_node *n; + unsigned long flags; + struct rvt_mcast *found = NULL; + + spin_lock_irqsave(&ibp->lock, flags); + n = ibp->mcast_tree.rb_node; + while (n) { + int ret; + struct rvt_mcast *mcast; + + mcast = rb_entry(n, struct rvt_mcast, rb_node); + + ret = memcmp(mgid->raw, mcast->mcast_addr.mgid.raw, + sizeof(*mgid)); + if (ret < 0) { + n = n->rb_left; + } else if (ret > 0) { + n = n->rb_right; + } else { + /* MGID/MLID must match */ + if (mcast->mcast_addr.lid == lid) { + atomic_inc(&mcast->refcount); + found = mcast; + } + break; + } + } + spin_unlock_irqrestore(&ibp->lock, flags); + return found; +} +EXPORT_SYMBOL(rvt_mcast_find); + +/** + * mcast_add - insert mcast GID into table and attach QP struct + * @mcast: the mcast GID table + * @mqp: the QP to attach + * + * Return: zero if both were added. Return EEXIST if the GID was already in + * the table but the QP was added. Return ESRCH if the QP was already + * attached and neither structure was added. Return EINVAL if the MGID was + * found, but the MLID did NOT match. + */ +static int rvt_mcast_add(struct rvt_dev_info *rdi, struct rvt_ibport *ibp, + struct rvt_mcast *mcast, struct rvt_mcast_qp *mqp) +{ + struct rb_node **n = &ibp->mcast_tree.rb_node; + struct rb_node *pn = NULL; + int ret; + + spin_lock_irq(&ibp->lock); + + while (*n) { + struct rvt_mcast *tmcast; + struct rvt_mcast_qp *p; + + pn = *n; + tmcast = rb_entry(pn, struct rvt_mcast, rb_node); + + ret = memcmp(mcast->mcast_addr.mgid.raw, + tmcast->mcast_addr.mgid.raw, + sizeof(mcast->mcast_addr.mgid)); + if (ret < 0) { + n = &pn->rb_left; + continue; + } + if (ret > 0) { + n = &pn->rb_right; + continue; + } + + if (tmcast->mcast_addr.lid != mcast->mcast_addr.lid) { + ret = EINVAL; + goto bail; + } + + /* Search the QP list to see if this is already there. */ + list_for_each_entry_rcu(p, &tmcast->qp_list, list) { + if (p->qp == mqp->qp) { + ret = ESRCH; + goto bail; + } + } + if (tmcast->n_attached == + rdi->dparms.props.max_mcast_qp_attach) { + ret = ENOMEM; + goto bail; + } + + tmcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &tmcast->qp_list); + ret = EEXIST; + goto bail; + } + + spin_lock(&rdi->n_mcast_grps_lock); + if (rdi->n_mcast_grps_allocated == rdi->dparms.props.max_mcast_grp) { + spin_unlock(&rdi->n_mcast_grps_lock); + ret = ENOMEM; + goto bail; + } + + rdi->n_mcast_grps_allocated++; + spin_unlock(&rdi->n_mcast_grps_lock); + + mcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &mcast->qp_list); + + atomic_inc(&mcast->refcount); + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &ibp->mcast_tree); + + ret = 0; + +bail: + spin_unlock_irq(&ibp->lock); + + return ret; +} + +/** + * rvt_attach_mcast - attach a qp to a multicast group + * @ibqp: Infiniband qp + * @gid: multicast guid + * @lid: multicast lid + * + * Return: 0 on success + */ +int rvt_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + struct rvt_ibport *ibp = rdi->ports[qp->port_num - 1]; + struct rvt_mcast *mcast; + struct rvt_mcast_qp *mqp; + int ret = -ENOMEM; + + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) + return -EINVAL; + + /* + * Allocate data structures since its better to do this outside of + * spin locks and it will most likely be needed. 
+ */ + mcast = rvt_mcast_alloc(gid, lid); + if (!mcast) + return -ENOMEM; + + mqp = rvt_mcast_qp_alloc(qp); + if (!mqp) + goto bail_mcast; + + switch (rvt_mcast_add(rdi, ibp, mcast, mqp)) { + case ESRCH: + /* Neither was used: OK to attach the same QP twice. */ + ret = 0; + goto bail_mqp; + case EEXIST: /* The mcast wasn't used */ + ret = 0; + goto bail_mcast; + case ENOMEM: + /* Exceeded the maximum number of mcast groups. */ + ret = -ENOMEM; + goto bail_mqp; + case EINVAL: + /* Invalid MGID/MLID pair */ + ret = -EINVAL; + goto bail_mqp; + default: + break; + } + + return 0; + +bail_mqp: + rvt_mcast_qp_free(mqp); + +bail_mcast: + rvt_mcast_free(mcast); + + return ret; +} + +/** + * rvt_detach_mcast - remove a qp from a multicast group + * @ibqp: Infiniband qp + * @gid: multicast guid + * @lid: multicast lid + * + * Return: 0 on success + */ +int rvt_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + struct rvt_ibport *ibp = rdi->ports[qp->port_num - 1]; + struct rvt_mcast *mcast = NULL; + struct rvt_mcast_qp *p, *tmp, *delp = NULL; + struct rb_node *n; + int last = 0; + int ret = 0; + + if (ibqp->qp_num <= 1) + return -EINVAL; + + spin_lock_irq(&ibp->lock); + + /* Find the GID in the mcast table. */ + n = ibp->mcast_tree.rb_node; + while (1) { + if (!n) { + spin_unlock_irq(&ibp->lock); + return -EINVAL; + } + + mcast = rb_entry(n, struct rvt_mcast, rb_node); + ret = memcmp(gid->raw, mcast->mcast_addr.mgid.raw, + sizeof(*gid)); + if (ret < 0) { + n = n->rb_left; + } else if (ret > 0) { + n = n->rb_right; + } else { + /* MGID/MLID must match */ + if (mcast->mcast_addr.lid != lid) { + spin_unlock_irq(&ibp->lock); + return -EINVAL; + } + break; + } + } + + /* Search the QP list. */ + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) { + if (p->qp != qp) + continue; + /* + * We found it, so remove it, but don't poison the forward + * link until we are sure there are no list walkers. + */ + list_del_rcu(&p->list); + mcast->n_attached--; + delp = p; + + /* If this was the last attached QP, remove the GID too. */ + if (list_empty(&mcast->qp_list)) { + rb_erase(&mcast->rb_node, &ibp->mcast_tree); + last = 1; + } + break; + } + + spin_unlock_irq(&ibp->lock); + /* QP not attached */ + if (!delp) + return -EINVAL; + + /* + * Wait for any list walkers to finish before freeing the + * list element. + */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + rvt_mcast_qp_free(delp); + + if (last) { + atomic_dec(&mcast->refcount); + wait_event(mcast->wait, !atomic_read(&mcast->refcount)); + rvt_mcast_free(mcast); + spin_lock_irq(&rdi->n_mcast_grps_lock); + rdi->n_mcast_grps_allocated--; + spin_unlock_irq(&rdi->n_mcast_grps_lock); + } + + return 0; +} + +/** + *rvt_mast_tree_empty - determine if any qps are attached to any mcast group + *@rdi: rvt dev struct + * + * Return: in use count + */ +int rvt_mcast_tree_empty(struct rvt_dev_info *rdi) +{ + int i; + int in_use = 0; + + for (i = 0; i < rdi->dparms.nports; i++) + if (rdi->ports[i]->mcast_tree.rb_node) + in_use++; + return in_use; +} diff --git a/drivers/infiniband/sw/rdmavt/mcast.h b/drivers/infiniband/sw/rdmavt/mcast.h new file mode 100644 index 0000000..29f5792 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mcast.h @@ -0,0 +1,58 @@ +#ifndef DEF_RVTMCAST_H +#define DEF_RVTMCAST_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. 
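rvt_attach_mcast()/rvt_detach_mcast() above are reached through the generic verbs entry points. A minimal sketch of a consumer joining and leaving a group with a UD QP, assuming the GID/LID pair is already known:

/* Sketch: multicast membership via the core API for an rdmavt device. */
static int example_mcast_membership(struct ib_qp *qp, union ib_gid *mgid,
				    u16 mlid)
{
	int ret;

	ret = ib_attach_mcast(qp, mgid, mlid);	/* ends up in rvt_attach_mcast() */
	if (ret)
		return ret;

	/* ... receive multicast traffic on qp ... */

	return ib_detach_mcast(qp, mgid, mlid);	/* rvt_detach_mcast() */
}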
When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +void rvt_driver_mcast_init(struct rvt_dev_info *rdi); +int rvt_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); +int rvt_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); +int rvt_mcast_tree_empty(struct rvt_dev_info *rdi); + +#endif /* DEF_RVTMCAST_H */ diff --git a/drivers/infiniband/sw/rdmavt/mmap.c b/drivers/infiniband/sw/rdmavt/mmap.c new file mode 100644 index 0000000..6b712ee --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mmap.c @@ -0,0 +1,208 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include "mmap.h" + +/** + * rvt_mmap_init - init link list and lock for mem map + * @rdi: rvt dev struct + */ +void rvt_mmap_init(struct rvt_dev_info *rdi) +{ + INIT_LIST_HEAD(&rdi->pending_mmaps); + spin_lock_init(&rdi->pending_lock); + rdi->mmap_offset = PAGE_SIZE; + spin_lock_init(&rdi->mmap_offset_lock); +} + +/** + * rvt_release_mmap_info - free mmap info structure + * @ref: a pointer to the kref within struct rvt_mmap_info + */ +void rvt_release_mmap_info(struct kref *ref) +{ + struct rvt_mmap_info *ip = + container_of(ref, struct rvt_mmap_info, ref); + struct rvt_dev_info *rdi = ib_to_rvt(ip->context->device); + + spin_lock_irq(&rdi->pending_lock); + list_del(&ip->pending_mmaps); + spin_unlock_irq(&rdi->pending_lock); + + vfree(ip->obj); + kfree(ip); +} + +static void rvt_vma_open(struct vm_area_struct *vma) +{ + struct rvt_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void rvt_vma_close(struct vm_area_struct *vma) +{ + struct rvt_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, rvt_release_mmap_info); +} + +static const struct vm_operations_struct rvt_vm_ops = { + .open = rvt_vma_open, + .close = rvt_vma_close, +}; + +/** + * rvt_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * + * Return: zero if the mmap is OK. Otherwise, return an errno. + */ +int rvt_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct rvt_dev_info *rdi = ib_to_rvt(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct rvt_mmap_info *ip, *pp; + int ret = -EINVAL; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). 
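The offset that rvt_create_cq()/rvt_resize_cq() copy back through udata is only meaningful as an mmap() offset on the verbs device fd; the search over rdi->pending_mmaps that follows is what matches it back to the right object. A sketch of the userspace side, assuming provider-library style code where cmd_fd, offset, and size come from the driver-specific create response:

#include <sys/mman.h>
#include <stdint.h>
#include <stddef.h>

/* Sketch: map a ring exposed via rvt_create_mmap_info(). */
static void *example_map_ring(int cmd_fd, uint64_t offset, size_t size)
{
	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
		       MAP_SHARED, cmd_fd, offset);

	return p == MAP_FAILED ? NULL : p;
}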
+ */ + spin_lock_irq(&rdi->pending_lock); + list_for_each_entry_safe(ip, pp, &rdi->pending_mmaps, + pending_mmaps) { + /* Only the creator is allowed to mmap the object */ + if (context != ip->context || (__u64)offset != ip->offset) + continue; + /* Don't allow a mmap larger than the object. */ + if (size > ip->size) + break; + + list_del_init(&ip->pending_mmaps); + spin_unlock_irq(&rdi->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) + goto done; + vma->vm_ops = &rvt_vm_ops; + vma->vm_private_data = ip; + rvt_vma_open(vma); + goto done; + } + spin_unlock_irq(&rdi->pending_lock); +done: + return ret; +} + +/** + * rvt_create_mmap_info - allocate information for hfi1_mmap + * @rdi: rvt dev struct + * @size: size in bytes to map + * @context: user context + * @obj: opaque pointer to a cq, wq etc + * + * Return: rvt_mmap struct on success + */ +struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, + u32 size, + struct ib_ucontext *context, + void *obj) +{ + struct rvt_mmap_info *ip; + + ip = kmalloc_node(sizeof(*ip), GFP_KERNEL, rdi->dparms.node); + if (!ip) + return ip; + + size = PAGE_ALIGN(size); + + spin_lock_irq(&rdi->mmap_offset_lock); + if (rdi->mmap_offset == 0) + rdi->mmap_offset = ALIGN(PAGE_SIZE, SHMLBA); + ip->offset = rdi->mmap_offset; + rdi->mmap_offset += ALIGN(size, SHMLBA); + spin_unlock_irq(&rdi->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + + return ip; +} + +/** + * rvt_update_mmap_info - update a mem map + * @rdi: rvt dev struct + * @ip: mmap info pointer + * @size: size to grow by + * @obj: opaque pointer to cq, wq, etc. + */ +void rvt_update_mmap_info(struct rvt_dev_info *rdi, struct rvt_mmap_info *ip, + u32 size, void *obj) +{ + size = PAGE_ALIGN(size); + + spin_lock_irq(&rdi->mmap_offset_lock); + if (rdi->mmap_offset == 0) + rdi->mmap_offset = PAGE_SIZE; + ip->offset = rdi->mmap_offset; + rdi->mmap_offset += size; + spin_unlock_irq(&rdi->mmap_offset_lock); + + ip->size = size; + ip->obj = obj; +} diff --git a/drivers/infiniband/sw/rdmavt/mmap.h b/drivers/infiniband/sw/rdmavt/mmap.h new file mode 100644 index 0000000..fab0e7b --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mmap.h @@ -0,0 +1,63 @@ +#ifndef DEF_RDMAVTMMAP_H +#define DEF_RDMAVTMMAP_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +void rvt_mmap_init(struct rvt_dev_info *rdi); +void rvt_release_mmap_info(struct kref *ref); +int rvt_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); +struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, + u32 size, + struct ib_ucontext *context, + void *obj); +void rvt_update_mmap_info(struct rvt_dev_info *rdi, struct rvt_mmap_info *ip, + u32 size, void *obj); + +#endif /* DEF_RDMAVTMMAP_H */ diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c new file mode 100644 index 0000000..cc429b5 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -0,0 +1,1114 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include "vt.h" +#include "mr.h" +#include "trace.h" + +/** + * rvt_driver_mr_init - Init MR resources per driver + * @rdi: rvt dev struct + * + * Do any intilization needed when a driver registers with rdmavt. + * + * Return: 0 on success or errno on failure + */ +int rvt_driver_mr_init(struct rvt_dev_info *rdi) +{ + unsigned int lkey_table_size = rdi->dparms.lkey_table_size; + unsigned lk_tab_size; + int i; + + /* + * The top hfi1_lkey_table_size bits are used to index the + * table. The lower 8 bits can be owned by the user (copied from + * the LKEY). The remaining bits act as a generation number or tag. + */ + if (!lkey_table_size) + return -EINVAL; + + spin_lock_init(&rdi->lkey_table.lock); + + /* ensure generation is at least 4 bits */ + if (lkey_table_size > RVT_MAX_LKEY_TABLE_BITS) { + rvt_pr_warn(rdi, "lkey bits %u too large, reduced to %u\n", + lkey_table_size, RVT_MAX_LKEY_TABLE_BITS); + rdi->dparms.lkey_table_size = RVT_MAX_LKEY_TABLE_BITS; + lkey_table_size = rdi->dparms.lkey_table_size; + } + rdi->lkey_table.max = 1 << lkey_table_size; + rdi->lkey_table.shift = 32 - lkey_table_size; + lk_tab_size = rdi->lkey_table.max * sizeof(*rdi->lkey_table.table); + rdi->lkey_table.table = (struct rvt_mregion __rcu **) + vmalloc_node(lk_tab_size, rdi->dparms.node); + if (!rdi->lkey_table.table) + return -ENOMEM; + + RCU_INIT_POINTER(rdi->dma_mr, NULL); + for (i = 0; i < rdi->lkey_table.max; i++) + RCU_INIT_POINTER(rdi->lkey_table.table[i], NULL); + + return 0; +} + +/** + *rvt_mr_exit: clean up MR + *@rdi: rvt dev structure + * + * called when drivers have unregistered or perhaps failed to register with us + */ +void rvt_mr_exit(struct rvt_dev_info *rdi) +{ + if (rdi->dma_mr) + rvt_pr_err(rdi, "DMA MR not null!\n"); + + vfree(rdi->lkey_table.table); +} + +static void rvt_deinit_mregion(struct rvt_mregion *mr) +{ + int i = mr->mapsz; + + mr->mapsz = 0; + while (i) + kfree(mr->map[--i]); + percpu_ref_exit(&mr->refcount); +} + +static void __rvt_mregion_complete(struct percpu_ref *ref) +{ + struct rvt_mregion *mr = container_of(ref, struct rvt_mregion, + refcount); + + complete(&mr->comp); +} + +static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd, + int count, unsigned int percpu_flags) +{ + int m, i = 0; + struct rvt_dev_info *dev = ib_to_rvt(pd->device); + + mr->mapsz = 0; + m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ; + for (; i < m; i++) { + mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL, + dev->dparms.node); + if (!mr->map[i]) + goto bail; + mr->mapsz++; + } + init_completion(&mr->comp); + /* count returning the ptr to user */ + if (percpu_ref_init(&mr->refcount, &__rvt_mregion_complete, + percpu_flags, GFP_KERNEL)) + goto bail; + + atomic_set(&mr->lkey_invalid, 0); + mr->pd = pd; + mr->max_segs = count; + return 0; +bail: + rvt_deinit_mregion(mr); + return -ENOMEM; +} + +/** + * rvt_alloc_lkey - allocate an lkey + * @mr: memory region that this lkey protects + * @dma_region: 
0->normal key, 1->restricted DMA key + * + * Returns 0 if successful, otherwise returns -errno. + * + * Increments mr reference count as required. + * + * Sets the lkey field mr for non-dma regions. + * + */ +static int rvt_alloc_lkey(struct rvt_mregion *mr, int dma_region) +{ + unsigned long flags; + u32 r; + u32 n; + int ret = 0; + struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device); + struct rvt_lkey_table *rkt = &dev->lkey_table; + + rvt_get_mr(mr); + spin_lock_irqsave(&rkt->lock, flags); + + /* special case for dma_mr lkey == 0 */ + if (dma_region) { + struct rvt_mregion *tmr; + + tmr = rcu_access_pointer(dev->dma_mr); + if (!tmr) { + mr->lkey_published = 1; + /* Insure published written first */ + rcu_assign_pointer(dev->dma_mr, mr); + rvt_get_mr(mr); + } + goto success; + } + + /* Find the next available LKEY */ + r = rkt->next; + n = r; + for (;;) { + if (!rcu_access_pointer(rkt->table[r])) + break; + r = (r + 1) & (rkt->max - 1); + if (r == n) + goto bail; + } + rkt->next = (r + 1) & (rkt->max - 1); + /* + * Make sure lkey is never zero which is reserved to indicate an + * unrestricted LKEY. + */ + rkt->gen++; + /* + * bits are capped to ensure enough bits for generation number + */ + mr->lkey = (r << (32 - dev->dparms.lkey_table_size)) | + ((((1 << (24 - dev->dparms.lkey_table_size)) - 1) & rkt->gen) + << 8); + if (mr->lkey == 0) { + mr->lkey |= 1 << 8; + rkt->gen++; + } + mr->lkey_published = 1; + /* Insure published written first */ + rcu_assign_pointer(rkt->table[r], mr); +success: + spin_unlock_irqrestore(&rkt->lock, flags); +out: + return ret; +bail: + rvt_put_mr(mr); + spin_unlock_irqrestore(&rkt->lock, flags); + ret = -ENOMEM; + goto out; +} + +/** + * rvt_free_lkey - free an lkey + * @mr: mr to free from tables + */ +static void rvt_free_lkey(struct rvt_mregion *mr) +{ + unsigned long flags; + u32 lkey = mr->lkey; + u32 r; + struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device); + struct rvt_lkey_table *rkt = &dev->lkey_table; + int freed = 0; + + spin_lock_irqsave(&rkt->lock, flags); + if (!lkey) { + if (mr->lkey_published) { + mr->lkey_published = 0; + /* insure published is written before pointer */ + rcu_assign_pointer(dev->dma_mr, NULL); + rvt_put_mr(mr); + } + } else { + if (!mr->lkey_published) + goto out; + r = lkey >> (32 - dev->dparms.lkey_table_size); + mr->lkey_published = 0; + /* insure published is written before pointer */ + rcu_assign_pointer(rkt->table[r], NULL); + } + freed++; +out: + spin_unlock_irqrestore(&rkt->lock, flags); + if (freed) + percpu_ref_kill(&mr->refcount); +} + +static struct rvt_mr *__rvt_alloc_mr(int count, struct ib_pd *pd) +{ + struct rvt_mr *mr; + int rval = -ENOMEM; + int m; + + /* Allocate struct plus pointers to first level page tables. */ + m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ; + mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL); + if (!mr) + goto bail; + + rval = rvt_init_mregion(&mr->mr, pd, count, 0); + if (rval) + goto bail; + /* + * ib_reg_phys_mr() will initialize mr->ibmr except for + * lkey and rkey. 
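+ *
+ * rvt_alloc_lkey() below packs the table index into the top
+ * lkey_table_size bits and the generation counter at bit 8, leaving
+ * the low 8 bits for the consumer.  A worked example, with values
+ * picked purely for illustration: lkey_table_size == 16, table slot
+ * r == 5 and generation 3 give
+ *
+ *	lkey = (5 << (32 - 16)) | ((3 & ((1 << (24 - 16)) - 1)) << 8)
+ *	     = 0x00050000 | 0x00000300 = 0x00050300
+ *
+ * so the lookup side (rvt_lkey_ok()/rvt_rkey_ok()) recovers the slot
+ * as lkey >> rkt->shift (0x00050300 >> 16 == 5), and
+ * rvt_fast_reg_mr() only lets a new key differ from the old one in
+ * that low byte.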
+ */ + rval = rvt_alloc_lkey(&mr->mr, 0); + if (rval) + goto bail_mregion; + mr->ibmr.lkey = mr->mr.lkey; + mr->ibmr.rkey = mr->mr.lkey; +done: + return mr; + +bail_mregion: + rvt_deinit_mregion(&mr->mr); +bail: + kfree(mr); + mr = ERR_PTR(rval); + goto done; +} + +static void __rvt_free_mr(struct rvt_mr *mr) +{ + rvt_free_lkey(&mr->mr); + rvt_deinit_mregion(&mr->mr); + kfree(mr); +} + +/** + * rvt_get_dma_mr - get a DMA memory region + * @pd: protection domain for this memory region + * @acc: access flags + * + * Return: the memory region on success, otherwise returns an errno. + * Note that all DMA addresses should be created via the functions in + * struct dma_virt_ops. + */ +struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct rvt_mr *mr; + struct ib_mr *ret; + int rval; + + if (ibpd_to_rvtpd(pd)->user) + return ERR_PTR(-EPERM); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + rval = rvt_init_mregion(&mr->mr, pd, 0, 0); + if (rval) { + ret = ERR_PTR(rval); + goto bail; + } + + rval = rvt_alloc_lkey(&mr->mr, 1); + if (rval) { + ret = ERR_PTR(rval); + goto bail_mregion; + } + + mr->mr.access_flags = acc; + ret = &mr->ibmr; +done: + return ret; + +bail_mregion: + rvt_deinit_mregion(&mr->mr); +bail: + kfree(mr); + goto done; +} + +/** + * rvt_reg_user_mr - register a userspace memory region + * @pd: protection domain for this memory region + * @start: starting userspace address + * @length: length of region to register + * @mr_access_flags: access flags for this memory region + * @udata: unused by the driver + * + * Return: the memory region on success, otherwise returns an errno. + */ +struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) +{ + struct rvt_mr *mr; + struct ib_umem *umem; + struct scatterlist *sg; + int n, m, entry; + struct ib_mr *ret; + + if (length == 0) + return ERR_PTR(-EINVAL); + + umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) + return (void *)umem; + + n = umem->nmap; + + mr = __rvt_alloc_mr(n, pd); + if (IS_ERR(mr)) { + ret = (struct ib_mr *)mr; + goto bail_umem; + } + + mr->mr.user_base = start; + mr->mr.iova = virt_addr; + mr->mr.length = length; + mr->mr.offset = ib_umem_offset(umem); + mr->mr.access_flags = mr_access_flags; + mr->umem = umem; + + mr->mr.page_shift = umem->page_shift; + m = 0; + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail_inval; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = BIT(umem->page_shift); + trace_rvt_mr_user_seg(&mr->mr, m, n, vaddr, + BIT(umem->page_shift)); + n++; + if (n == RVT_SEGSZ) { + m++; + n = 0; + } + } + return &mr->ibmr; + +bail_inval: + __rvt_free_mr(mr); + +bail_umem: + ib_umem_release(umem); + + return ret; +} + +/** + * rvt_dereg_clean_qp_cb - callback from iterator + * @qp - the qp + * @v - the mregion (as u64) + * + * This routine fields the callback for all QPs and + * for QPs in the same PD as the MR will call the + * rvt_qp_mr_clean() to potentially cleanup references. 
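+ *
+ * For orientation, the deregistration path that reaches this callback
+ * is roughly:
+ *
+ *	rvt_dereg_mr() / rvt_dealloc_fmr()
+ *	  rvt_check_refs()
+ *	    rvt_dereg_clean_qps()
+ *	      rvt_qp_iter(rdi, (u64)mr, rvt_dereg_clean_qp_cb)
+ *	        rvt_dereg_clean_qp_cb()
+ *	          rvt_qp_mr_clean()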
+ */ +static void rvt_dereg_clean_qp_cb(struct rvt_qp *qp, u64 v) +{ + struct rvt_mregion *mr = (struct rvt_mregion *)v; + + /* skip PDs that are not ours */ + if (mr->pd != qp->ibqp.pd) + return; + rvt_qp_mr_clean(qp, mr->lkey); +} + +/** + * rvt_dereg_clean_qps - find QPs for reference cleanup + * @mr - the MR that is being deregistered + * + * This routine iterates RC QPs looking for references + * to the lkey noted in mr. + */ +static void rvt_dereg_clean_qps(struct rvt_mregion *mr) +{ + struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); + + rvt_qp_iter(rdi, (u64)mr, rvt_dereg_clean_qp_cb); +} + +/** + * rvt_check_refs - check references + * @mr - the megion + * @t - the caller identification + * + * This routine checks MRs holding a reference during + * when being de-registered. + * + * If the count is non-zero, the code calls a clean routine then + * waits for the timeout for the count to zero. + */ +static int rvt_check_refs(struct rvt_mregion *mr, const char *t) +{ + unsigned long timeout; + struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); + + if (mr->lkey) { + /* avoid dma mr */ + rvt_dereg_clean_qps(mr); + /* @mr was indexed on rcu protected @lkey_table */ + synchronize_rcu(); + } + + timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ); + if (!timeout) { + rvt_pr_err(rdi, + "%s timeout mr %p pd %p lkey %x refcount %ld\n", + t, mr, mr->pd, mr->lkey, + atomic_long_read(&mr->refcount.count)); + rvt_get_mr(mr); + return -EBUSY; + } + return 0; +} + +/** + * rvt_mr_has_lkey - is MR + * @mr - the mregion + * @lkey - the lkey + */ +bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey) +{ + return mr && lkey == mr->lkey; +} + +/** + * rvt_ss_has_lkey - is mr in sge tests + * @ss - the sge state + * @lkey + * + * This code tests for an MR in the indicated + * sge state. + */ +bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey) +{ + int i; + bool rval = false; + + if (!ss->num_sge) + return rval; + /* first one */ + rval = rvt_mr_has_lkey(ss->sge.mr, lkey); + /* any others */ + for (i = 0; !rval && i < ss->num_sge - 1; i++) + rval = rvt_mr_has_lkey(ss->sg_list[i].mr, lkey); + return rval; +} + +/** + * rvt_dereg_mr - unregister and free a memory region + * @ibmr: the memory region to free + * + * + * Note that this is called to free MRs created by rvt_get_dma_mr() + * or rvt_reg_user_mr(). + * + * Returns 0 on success. + */ +int rvt_dereg_mr(struct ib_mr *ibmr) +{ + struct rvt_mr *mr = to_imr(ibmr); + int ret; + + rvt_free_lkey(&mr->mr); + + rvt_put_mr(&mr->mr); /* will set completion if last */ + ret = rvt_check_refs(&mr->mr, __func__); + if (ret) + goto out; + rvt_deinit_mregion(&mr->mr); + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); +out: + return ret; +} + +/** + * rvt_alloc_mr - Allocate a memory region usable with the + * @pd: protection domain for this memory region + * @mr_type: mem region type + * @max_num_sg: Max number of segments allowed + * + * Return: the memory region on success, otherwise return an errno. 
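+ *
+ * A minimal caller-side sketch (the caller goes through the core
+ * verbs wrappers rather than calling rvt directly; pd, sg and nents
+ * are placeholders and error handling is omitted):
+ *
+ *	struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
+ *	int n = ib_map_mr_sg(mr, sg, nents, NULL, PAGE_SIZE);
+ *	struct ib_reg_wr reg_wr = {
+ *		.wr.opcode = IB_WR_REG_MR,
+ *		.mr = mr,
+ *		.key = mr->rkey,
+ *		.access = IB_ACCESS_REMOTE_READ,
+ *	};
+ *
+ * Posting &reg_wr.wr then reaches rvt_fast_reg_mr() through the
+ * driver's send path, and ib_map_mr_sg() lands in rvt_map_mr_sg()
+ * below.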
+ */ +struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct rvt_mr *mr; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = __rvt_alloc_mr(max_num_sg, pd); + if (IS_ERR(mr)) + return (struct ib_mr *)mr; + + return &mr->ibmr; +} + +/** + * rvt_set_page - page assignment function called by ib_sg_to_pages + * @ibmr: memory region + * @addr: dma address of mapped page + * + * Return: 0 on success + */ +static int rvt_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct rvt_mr *mr = to_imr(ibmr); + u32 ps = 1 << mr->mr.page_shift; + u32 mapped_segs = mr->mr.length >> mr->mr.page_shift; + int m, n; + + if (unlikely(mapped_segs == mr->mr.max_segs)) + return -ENOMEM; + + if (mr->mr.length == 0) { + mr->mr.user_base = addr; + mr->mr.iova = addr; + } + + m = mapped_segs / RVT_SEGSZ; + n = mapped_segs % RVT_SEGSZ; + mr->mr.map[m]->segs[n].vaddr = (void *)addr; + mr->mr.map[m]->segs[n].length = ps; + trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps); + mr->mr.length += ps; + + return 0; +} + +/** + * rvt_map_mr_sg - map sg list and set it the memory region + * @ibmr: memory region + * @sg: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg + * + * Return: number of sg elements mapped to the memory region + */ +int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct rvt_mr *mr = to_imr(ibmr); + + mr->mr.length = 0; + mr->mr.page_shift = PAGE_SHIFT; + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, + rvt_set_page); +} + +/** + * rvt_fast_reg_mr - fast register physical MR + * @qp: the queue pair where the work request comes from + * @ibmr: the memory region to be registered + * @key: updated key for this memory region + * @access: access flags for this memory region + * + * Returns 0 on success. + */ +int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, + int access) +{ + struct rvt_mr *mr = to_imr(ibmr); + + if (qp->ibqp.pd != mr->mr.pd) + return -EACCES; + + /* not applicable to dma MR or user MR */ + if (!mr->mr.lkey || mr->umem) + return -EINVAL; + + if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00)) + return -EINVAL; + + ibmr->lkey = key; + ibmr->rkey = key; + mr->mr.lkey = key; + mr->mr.access_flags = access; + atomic_set(&mr->mr.lkey_invalid, 0); + + return 0; +} +EXPORT_SYMBOL(rvt_fast_reg_mr); + +/** + * rvt_invalidate_rkey - invalidate an MR rkey + * @qp: queue pair associated with the invalidate op + * @rkey: rkey to invalidate + * + * Returns 0 on success. + */ +int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey) +{ + struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device); + struct rvt_lkey_table *rkt = &dev->lkey_table; + struct rvt_mregion *mr; + + if (rkey == 0) + return -EINVAL; + + rcu_read_lock(); + mr = rcu_dereference( + rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]); + if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + goto bail; + + atomic_set(&mr->lkey_invalid, 1); + rcu_read_unlock(); + return 0; + +bail: + rcu_read_unlock(); + return -EINVAL; +} +EXPORT_SYMBOL(rvt_invalidate_rkey); + +/** + * rvt_alloc_fmr - allocate a fast memory region + * @pd: the protection domain for this memory region + * @mr_access_flags: access flags for this memory region + * @fmr_attr: fast memory region attributes + * + * Return: the memory region on success, otherwise returns an errno. 
+ */ +struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct rvt_fmr *fmr; + int m; + struct ib_fmr *ret; + int rval = -ENOMEM; + + /* Allocate struct plus pointers to first level page tables. */ + m = (fmr_attr->max_pages + RVT_SEGSZ - 1) / RVT_SEGSZ; + fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL); + if (!fmr) + goto bail; + + rval = rvt_init_mregion(&fmr->mr, pd, fmr_attr->max_pages, + PERCPU_REF_INIT_ATOMIC); + if (rval) + goto bail; + + /* + * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & + * rkey. + */ + rval = rvt_alloc_lkey(&fmr->mr, 0); + if (rval) + goto bail_mregion; + fmr->ibfmr.rkey = fmr->mr.lkey; + fmr->ibfmr.lkey = fmr->mr.lkey; + /* + * Resources are allocated but no valid mapping (RKEY can't be + * used). + */ + fmr->mr.access_flags = mr_access_flags; + fmr->mr.max_segs = fmr_attr->max_pages; + fmr->mr.page_shift = fmr_attr->page_shift; + + ret = &fmr->ibfmr; +done: + return ret; + +bail_mregion: + rvt_deinit_mregion(&fmr->mr); +bail: + kfree(fmr); + ret = ERR_PTR(rval); + goto done; +} + +/** + * rvt_map_phys_fmr - set up a fast memory region + * @ibfmr: the fast memory region to set up + * @page_list: the list of pages to associate with the fast memory region + * @list_len: the number of pages to associate with the fast memory region + * @iova: the virtual address of the start of the fast memory region + * + * This may be called from interrupt context. + * + * Return: 0 on success + */ + +int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct rvt_fmr *fmr = to_ifmr(ibfmr); + struct rvt_lkey_table *rkt; + unsigned long flags; + int m, n; + unsigned long i; + u32 ps; + struct rvt_dev_info *rdi = ib_to_rvt(ibfmr->device); + + i = atomic_long_read(&fmr->mr.refcount.count); + if (i > 2) + return -EBUSY; + + if (list_len > fmr->mr.max_segs) + return -EINVAL; + + rkt = &rdi->lkey_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = iova; + fmr->mr.iova = iova; + ps = 1 << fmr->mr.page_shift; + fmr->mr.length = list_len * ps; + m = 0; + n = 0; + for (i = 0; i < list_len; i++) { + fmr->mr.map[m]->segs[n].vaddr = (void *)page_list[i]; + fmr->mr.map[m]->segs[n].length = ps; + trace_rvt_mr_fmr_seg(&fmr->mr, m, n, (void *)page_list[i], ps); + if (++n == RVT_SEGSZ) { + m++; + n = 0; + } + } + spin_unlock_irqrestore(&rkt->lock, flags); + return 0; +} + +/** + * rvt_unmap_fmr - unmap fast memory regions + * @fmr_list: the list of fast memory regions to unmap + * + * Return: 0 on success. + */ +int rvt_unmap_fmr(struct list_head *fmr_list) +{ + struct rvt_fmr *fmr; + struct rvt_lkey_table *rkt; + unsigned long flags; + struct rvt_dev_info *rdi; + + list_for_each_entry(fmr, fmr_list, ibfmr.list) { + rdi = ib_to_rvt(fmr->ibfmr.device); + rkt = &rdi->lkey_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + spin_unlock_irqrestore(&rkt->lock, flags); + } + return 0; +} + +/** + * rvt_dealloc_fmr - deallocate a fast memory region + * @ibfmr: the fast memory region to deallocate + * + * Return: 0 on success. 
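+ *
+ * For context, the (legacy) FMR lifecycle as a consumer sees it
+ * through the core verbs wrappers is roughly:
+ *
+ *	fmr = ib_alloc_fmr(pd, access, &fmr_attr);	(rvt_alloc_fmr)
+ *	ib_map_phys_fmr(fmr, page_list, npages, iova);	(rvt_map_phys_fmr)
+ *	ib_unmap_fmr(&fmr_list);			(rvt_unmap_fmr)
+ *	ib_dealloc_fmr(fmr);				(rvt_dealloc_fmr)
+ *
+ * where pd, page_list, npages and iova are placeholders for whatever
+ * the consumer already has at hand.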
+ */ +int rvt_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct rvt_fmr *fmr = to_ifmr(ibfmr); + int ret = 0; + + rvt_free_lkey(&fmr->mr); + rvt_put_mr(&fmr->mr); /* will set completion if last */ + ret = rvt_check_refs(&fmr->mr, __func__); + if (ret) + goto out; + rvt_deinit_mregion(&fmr->mr); + kfree(fmr); +out: + return ret; +} + +/** + * rvt_sge_adjacent - is isge compressible + * @last_sge: last outgoing SGE written + * @sge: SGE to check + * + * If adjacent will update last_sge to add length. + * + * Return: true if isge is adjacent to last sge + */ +static inline bool rvt_sge_adjacent(struct rvt_sge *last_sge, + struct ib_sge *sge) +{ + if (last_sge && sge->lkey == last_sge->mr->lkey && + ((uint64_t)(last_sge->vaddr + last_sge->length) == sge->addr)) { + if (sge->lkey) { + if (unlikely((sge->addr - last_sge->mr->user_base + + sge->length > last_sge->mr->length))) + return false; /* overrun, caller will catch */ + } else { + last_sge->length += sge->length; + } + last_sge->sge_length += sge->length; + trace_rvt_sge_adjacent(last_sge, sge); + return true; + } + return false; +} + +/** + * rvt_lkey_ok - check IB SGE for validity and initialize + * @rkt: table containing lkey to check SGE against + * @pd: protection domain + * @isge: outgoing internal SGE + * @last_sge: last outgoing SGE written + * @sge: SGE to check + * @acc: access flags + * + * Check the IB SGE for validity and initialize our internal version + * of it. + * + * Increments the reference count when a new sge is stored. + * + * Return: 0 if compressed, 1 if added , otherwise returns -errno. + */ +int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, + struct rvt_sge *isge, struct rvt_sge *last_sge, + struct ib_sge *sge, int acc) +{ + struct rvt_mregion *mr; + unsigned n, m; + size_t off; + + /* + * We use LKEY == zero for kernel virtual addresses + * (see rvt_get_dma_mr() and dma_virt_ops). + */ + if (sge->lkey == 0) { + struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device); + + if (pd->user) + return -EINVAL; + if (rvt_sge_adjacent(last_sge, sge)) + return 0; + rcu_read_lock(); + mr = rcu_dereference(dev->dma_mr); + if (!mr) + goto bail; + rvt_get_mr(mr); + rcu_read_unlock(); + + isge->mr = mr; + isge->vaddr = (void *)sge->addr; + isge->length = sge->length; + isge->sge_length = sge->length; + isge->m = 0; + isge->n = 0; + goto ok; + } + if (rvt_sge_adjacent(last_sge, sge)) + return 0; + rcu_read_lock(); + mr = rcu_dereference(rkt->table[sge->lkey >> rkt->shift]); + if (!mr) + goto bail; + rvt_get_mr(mr); + if (!READ_ONCE(mr->lkey_published)) + goto bail_unref; + + if (unlikely(atomic_read(&mr->lkey_invalid) || + mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) + goto bail_unref; + + off = sge->addr - mr->user_base; + if (unlikely(sge->addr < mr->user_base || + off + sge->length > mr->length || + (mr->access_flags & acc) != acc)) + goto bail_unref; + rcu_read_unlock(); + + off += mr->offset; + if (mr->page_shift) { + /* + * page sizes are uniform power of 2 so no loop is necessary + * entries_spanned_by_off is the number of times the loop below + * would have executed. 
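+ *
+ * A worked example, assuming RVT_SEGSZ == 8 and a 4 KiB page size
+ * purely to make the arithmetic concrete: for off == 0x6234 and
+ * page_shift == 12,
+ *
+ *	entries_spanned_by_off = 0x6234 >> 12 = 6
+ *	off becomes 0x6234 - (6 << 12) = 0x234
+ *	m = 6 / 8 = 0,  n = 6 % 8 = 6
+ *
+ * i.e. the SGE starts 0x234 bytes into mr->map[0]->segs[6].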
+ */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off / RVT_SEGSZ; + n = entries_spanned_by_off % RVT_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= RVT_SEGSZ) { + m++; + n = 0; + } + } + } + isge->mr = mr; + isge->vaddr = mr->map[m]->segs[n].vaddr + off; + isge->length = mr->map[m]->segs[n].length - off; + isge->sge_length = sge->length; + isge->m = m; + isge->n = n; +ok: + trace_rvt_sge_new(isge, sge); + return 1; +bail_unref: + rvt_put_mr(mr); +bail: + rcu_read_unlock(); + return -EINVAL; +} +EXPORT_SYMBOL(rvt_lkey_ok); + +/** + * rvt_rkey_ok - check the IB virtual address, length, and RKEY + * @qp: qp for validation + * @sge: SGE state + * @len: length of data + * @vaddr: virtual address to place data + * @rkey: rkey to check + * @acc: access flags + * + * Return: 1 if successful, otherwise 0. + * + * increments the reference count upon success + */ +int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc) +{ + struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device); + struct rvt_lkey_table *rkt = &dev->lkey_table; + struct rvt_mregion *mr; + unsigned n, m; + size_t off; + + /* + * We use RKEY == zero for kernel virtual addresses + * (see rvt_get_dma_mr() and dma_virt_ops). + */ + rcu_read_lock(); + if (rkey == 0) { + struct rvt_pd *pd = ibpd_to_rvtpd(qp->ibqp.pd); + struct rvt_dev_info *rdi = ib_to_rvt(pd->ibpd.device); + + if (pd->user) + goto bail; + mr = rcu_dereference(rdi->dma_mr); + if (!mr) + goto bail; + rvt_get_mr(mr); + rcu_read_unlock(); + + sge->mr = mr; + sge->vaddr = (void *)vaddr; + sge->length = len; + sge->sge_length = len; + sge->m = 0; + sge->n = 0; + goto ok; + } + + mr = rcu_dereference(rkt->table[rkey >> rkt->shift]); + if (!mr) + goto bail; + rvt_get_mr(mr); + /* insure mr read is before test */ + if (!READ_ONCE(mr->lkey_published)) + goto bail_unref; + if (unlikely(atomic_read(&mr->lkey_invalid) || + mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + goto bail_unref; + + off = vaddr - mr->iova; + if (unlikely(vaddr < mr->iova || off + len > mr->length || + (mr->access_flags & acc) == 0)) + goto bail_unref; + rcu_read_unlock(); + + off += mr->offset; + if (mr->page_shift) { + /* + * page sizes are uniform power of 2 so no loop is necessary + * entries_spanned_by_off is the number of times the loop below + * would have executed. + */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off / RVT_SEGSZ; + n = entries_spanned_by_off % RVT_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= RVT_SEGSZ) { + m++; + n = 0; + } + } + } + sge->mr = mr; + sge->vaddr = mr->map[m]->segs[n].vaddr + off; + sge->length = mr->map[m]->segs[n].length - off; + sge->sge_length = len; + sge->m = m; + sge->n = n; +ok: + return 1; +bail_unref: + rvt_put_mr(mr); +bail: + rcu_read_unlock(); + return 0; +} +EXPORT_SYMBOL(rvt_rkey_ok); diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h new file mode 100644 index 0000000..132800e --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mr.h @@ -0,0 +1,94 @@ +#ifndef DEF_RVTMR_H +#define DEF_RVTMR_H + +/* + * Copyright(c) 2016 Intel Corporation. 
+ * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +struct rvt_fmr { + struct ib_fmr ibfmr; + struct rvt_mregion mr; /* must be last */ +}; + +struct rvt_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct rvt_mregion mr; /* must be last */ +}; + +static inline struct rvt_fmr *to_ifmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct rvt_fmr, ibfmr); +} + +static inline struct rvt_mr *to_imr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct rvt_mr, ibmr); +} + +int rvt_driver_mr_init(struct rvt_dev_info *rdi); +void rvt_mr_exit(struct rvt_dev_info *rdi); + +/* Mem Regions */ +struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); +int rvt_dereg_mr(struct ib_mr *ibmr); +struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); +int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset); +struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); +int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); +int rvt_unmap_fmr(struct list_head *fmr_list); +int rvt_dealloc_fmr(struct ib_fmr *ibfmr); + +#endif /* DEF_RVTMR_H */ diff --git a/drivers/infiniband/sw/rdmavt/pd.c b/drivers/infiniband/sw/rdmavt/pd.c new file mode 100644 index 0000000..8a89aff --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/pd.c @@ -0,0 +1,119 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include "pd.h" + +/** + * rvt_alloc_pd - allocate a protection domain + * @ibdev: ib device + * @context: optional user context + * @udata: optional user data + * + * Allocate and keep track of a PD. + * + * Return: 0 on success + */ +struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct rvt_dev_info *dev = ib_to_rvt(ibdev); + struct rvt_pd *pd; + struct ib_pd *ret; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + /* + * While we could continue allocating protecetion domains, being + * constrained only by system resources. The IBTA spec defines that + * there is a max_pd limit that can be set and we need to check for + * that. + */ + + spin_lock(&dev->n_pds_lock); + if (dev->n_pds_allocated == dev->dparms.props.max_pd) { + spin_unlock(&dev->n_pds_lock); + kfree(pd); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_pds_allocated++; + spin_unlock(&dev->n_pds_lock); + + /* ib_alloc_pd() will initialize pd->ibpd. */ + pd->user = !!udata; + + ret = &pd->ibpd; + +bail: + return ret; +} + +/** + * rvt_dealloc_pd - Free PD + * @ibpd: Free up PD + * + * Return: always 0 + */ +int rvt_dealloc_pd(struct ib_pd *ibpd) +{ + struct rvt_pd *pd = ibpd_to_rvtpd(ibpd); + struct rvt_dev_info *dev = ib_to_rvt(ibpd->device); + + spin_lock(&dev->n_pds_lock); + dev->n_pds_allocated--; + spin_unlock(&dev->n_pds_lock); + + kfree(pd); + + return 0; +} diff --git a/drivers/infiniband/sw/rdmavt/pd.h b/drivers/infiniband/sw/rdmavt/pd.h new file mode 100644 index 0000000..1892ca4 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/pd.h @@ -0,0 +1,58 @@ +#ifndef DEF_RDMAVTPD_H +#define DEF_RDMAVTPD_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int rvt_dealloc_pd(struct ib_pd *ibpd); + +#endif /* DEF_RDMAVTPD_H */ diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c new file mode 100644 index 0000000..c82e6bb --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -0,0 +1,2316 @@ +/* + * Copyright(c) 2016, 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "qp.h" +#include "vt.h" +#include "trace.h" + +static void rvt_rc_timeout(struct timer_list *t); + +/* + * Convert the AETH RNR timeout code into the number of microseconds. + */ +static const u32 ib_rvt_rnr_table[32] = { + 655360, /* 00: 655.36 */ + 10, /* 01: .01 */ + 20, /* 02 .02 */ + 30, /* 03: .03 */ + 40, /* 04: .04 */ + 60, /* 05: .06 */ + 80, /* 06: .08 */ + 120, /* 07: .12 */ + 160, /* 08: .16 */ + 240, /* 09: .24 */ + 320, /* 0A: .32 */ + 480, /* 0B: .48 */ + 640, /* 0C: .64 */ + 960, /* 0D: .96 */ + 1280, /* 0E: 1.28 */ + 1920, /* 0F: 1.92 */ + 2560, /* 10: 2.56 */ + 3840, /* 11: 3.84 */ + 5120, /* 12: 5.12 */ + 7680, /* 13: 7.68 */ + 10240, /* 14: 10.24 */ + 15360, /* 15: 15.36 */ + 20480, /* 16: 20.48 */ + 30720, /* 17: 30.72 */ + 40960, /* 18: 40.96 */ + 61440, /* 19: 61.44 */ + 81920, /* 1A: 81.92 */ + 122880, /* 1B: 122.88 */ + 163840, /* 1C: 163.84 */ + 245760, /* 1D: 245.76 */ + 327680, /* 1E: 327.68 */ + 491520 /* 1F: 491.52 */ +}; + +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; rvt_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. + */ +const int ib_rvt_state_ops[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = 0, + [IB_QPS_INIT] = RVT_POST_RECV_OK, + [IB_QPS_RTR] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK, + [IB_QPS_RTS] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK | + RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK | + RVT_PROCESS_NEXT_SEND_OK, + [IB_QPS_SQD] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK | + RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK, + [IB_QPS_SQE] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK | + RVT_POST_SEND_OK | RVT_FLUSH_SEND, + [IB_QPS_ERR] = RVT_POST_RECV_OK | RVT_FLUSH_RECV | + RVT_POST_SEND_OK | RVT_FLUSH_SEND, +}; +EXPORT_SYMBOL(ib_rvt_state_ops); + +static void get_map_page(struct rvt_qpn_table *qpt, + struct rvt_qpn_map *map) +{ + unsigned long page = get_zeroed_page(GFP_KERNEL); + + /* + * Free the page if someone raced with us installing it. + */ + + spin_lock(&qpt->lock); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock(&qpt->lock); +} + +/** + * init_qpn_table - initialize the QP number table for a device + * @qpt: the QPN table + */ +static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt) +{ + u32 offset, i; + struct rvt_qpn_map *map; + int ret = 0; + + if (!(rdi->dparms.qpn_res_end >= rdi->dparms.qpn_res_start)) + return -EINVAL; + + spin_lock_init(&qpt->lock); + + qpt->last = rdi->dparms.qpn_start; + qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift; + + /* + * Drivers may want some QPs beyond what we need for verbs let them use + * our qpn table. No need for two. Lets go ahead and mark the bitmaps + * for those. The reserved range must be *after* the range which verbs + * will pick from. 
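+ *
+ * A rough example, assuming 4 KiB pages so that each bitmap page
+ * covers RVT_BITS_PER_PAGE == 32768 QPNs: a driver reserving QPNs
+ * 0x10000 through 0x1ffff gives
+ *
+ *	qpt->nmaps = 0x10000 / 32768 = 2
+ *	offset     = 0x10000 % 32768 = 0
+ *
+ * and the loop below sets 0x10000 bits starting in qpt->map[2], so
+ * alloc_qpn() will skip that whole range.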
+ */ + + /* Figure out number of bit maps needed before reserved range */ + qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE; + + /* This should always be zero */ + offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK; + + /* Starting with the first reserved bit map */ + map = &qpt->map[qpt->nmaps]; + + rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n", + rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end); + for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) { + if (!map->page) { + get_map_page(qpt, map); + if (!map->page) { + ret = -ENOMEM; + break; + } + } + set_bit(offset, map->page); + offset++; + if (offset == RVT_BITS_PER_PAGE) { + /* next page */ + qpt->nmaps++; + map++; + offset = 0; + } + } + return ret; +} + +/** + * free_qpn_table - free the QP number table for a device + * @qpt: the QPN table + */ +static void free_qpn_table(struct rvt_qpn_table *qpt) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(qpt->map); i++) + free_page((unsigned long)qpt->map[i].page); +} + +/** + * rvt_driver_qp_init - Init driver qp resources + * @rdi: rvt dev strucutre + * + * Return: 0 on success + */ +int rvt_driver_qp_init(struct rvt_dev_info *rdi) +{ + int i; + int ret = -ENOMEM; + + if (!rdi->dparms.qp_table_size) + return -EINVAL; + + /* + * If driver is not doing any QP allocation then make sure it is + * providing the necessary QP functions. + */ + if (!rdi->driver_f.free_all_qps || + !rdi->driver_f.qp_priv_alloc || + !rdi->driver_f.qp_priv_free || + !rdi->driver_f.notify_qp_reset || + !rdi->driver_f.notify_restart_rc) + return -EINVAL; + + /* allocate parent object */ + rdi->qp_dev = kzalloc_node(sizeof(*rdi->qp_dev), GFP_KERNEL, + rdi->dparms.node); + if (!rdi->qp_dev) + return -ENOMEM; + + /* allocate hash table */ + rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size; + rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size); + rdi->qp_dev->qp_table = + kmalloc_array_node(rdi->qp_dev->qp_table_size, + sizeof(*rdi->qp_dev->qp_table), + GFP_KERNEL, rdi->dparms.node); + if (!rdi->qp_dev->qp_table) + goto no_qp_table; + + for (i = 0; i < rdi->qp_dev->qp_table_size; i++) + RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL); + + spin_lock_init(&rdi->qp_dev->qpt_lock); + + /* initialize qpn map */ + if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table)) + goto fail_table; + + spin_lock_init(&rdi->n_qps_lock); + + return 0; + +fail_table: + kfree(rdi->qp_dev->qp_table); + free_qpn_table(&rdi->qp_dev->qpn_table); + +no_qp_table: + kfree(rdi->qp_dev); + + return ret; +} + +/** + * free_all_qps - check for QPs still in use + * @rdi: rvt device info structure + * + * There should not be any QPs still in use. + * Free memory for table. 
+ */ +static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi) +{ + unsigned long flags; + struct rvt_qp *qp; + unsigned n, qp_inuse = 0; + spinlock_t *ql; /* work around too long line below */ + + if (rdi->driver_f.free_all_qps) + qp_inuse = rdi->driver_f.free_all_qps(rdi); + + qp_inuse += rvt_mcast_tree_empty(rdi); + + if (!rdi->qp_dev) + return qp_inuse; + + ql = &rdi->qp_dev->qpt_lock; + spin_lock_irqsave(ql, flags); + for (n = 0; n < rdi->qp_dev->qp_table_size; n++) { + qp = rcu_dereference_protected(rdi->qp_dev->qp_table[n], + lockdep_is_held(ql)); + RCU_INIT_POINTER(rdi->qp_dev->qp_table[n], NULL); + + for (; qp; qp = rcu_dereference_protected(qp->next, + lockdep_is_held(ql))) + qp_inuse++; + } + spin_unlock_irqrestore(ql, flags); + synchronize_rcu(); + return qp_inuse; +} + +/** + * rvt_qp_exit - clean up qps on device exit + * @rdi: rvt dev structure + * + * Check for qp leaks and free resources. + */ +void rvt_qp_exit(struct rvt_dev_info *rdi) +{ + u32 qps_inuse = rvt_free_all_qps(rdi); + + if (qps_inuse) + rvt_pr_err(rdi, "QP memory leak! %u still in use\n", + qps_inuse); + if (!rdi->qp_dev) + return; + + kfree(rdi->qp_dev->qp_table); + free_qpn_table(&rdi->qp_dev->qpn_table); + kfree(rdi->qp_dev); +} + +static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, + struct rvt_qpn_map *map, unsigned off) +{ + return (map - qpt->map) * RVT_BITS_PER_PAGE + off; +} + +/** + * alloc_qpn - Allocate the next available qpn or zero/one for QP type + * IB_QPT_SMI/IB_QPT_GSI + * @rdi: rvt device info structure + * @qpt: queue pair number table pointer + * @port_num: IB port number, 1 based, comes from core + * + * Return: The queue pair number + */ +static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, + enum ib_qp_type type, u8 port_num) +{ + u32 i, offset, max_scan, qpn; + struct rvt_qpn_map *map; + u32 ret; + + if (rdi->driver_f.alloc_qpn) + return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num); + + if (type == IB_QPT_SMI || type == IB_QPT_GSI) { + unsigned n; + + ret = type == IB_QPT_GSI; + n = 1 << (ret + 2 * (port_num - 1)); + spin_lock(&qpt->lock); + if (qpt->flags & n) + ret = -EINVAL; + else + qpt->flags |= n; + spin_unlock(&qpt->lock); + goto bail; + } + + qpn = qpt->last + qpt->incr; + if (qpn >= RVT_QPN_MAX) + qpn = qpt->incr | ((qpt->last & 1) ^ 1); + /* offset carries bit 0 */ + offset = qpn & RVT_BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / RVT_BITS_PER_PAGE]; + max_scan = qpt->nmaps - !offset; + for (i = 0;;) { + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) + break; + } + do { + if (!test_and_set_bit(offset, map->page)) { + qpt->last = qpn; + ret = qpn; + goto bail; + } + offset += qpt->incr; + /* + * This qpn might be bogus if offset >= BITS_PER_PAGE. + * That is OK. It gets re-assigned below + */ + qpn = mk_qpn(qpt, map, offset); + } while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX); + /* + * In order to keep the number of pages allocated to a + * minimum, we scan the all existing pages before increasing + * the size of the bitmap table. 
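+ *
+ * (For reference, mk_qpn() above is simply the inverse of this
+ * page/offset split: e.g. map == &qpt->map[2] and off == 5 give
+ * qpn = 2 * RVT_BITS_PER_PAGE + 5.  The scan advances by qpt->incr,
+ * which init_qpn_table() set to qpn_inc << qos_shift.)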
+ */ + if (++i > max_scan) { + if (qpt->nmaps == RVT_QPNMAP_ENTRIES) + break; + map = &qpt->map[qpt->nmaps++]; + /* start at incr with current bit 0 */ + offset = qpt->incr | (offset & 1); + } else if (map < &qpt->map[qpt->nmaps]) { + ++map; + /* start at incr with current bit 0 */ + offset = qpt->incr | (offset & 1); + } else { + map = &qpt->map[0]; + /* wrap to first map page, invert bit 0 */ + offset = qpt->incr | ((offset & 1) ^ 1); + } + /* there can be no set bits in low-order QoS bits */ + WARN_ON(offset & (BIT(rdi->dparms.qos_shift) - 1)); + qpn = mk_qpn(qpt, map, offset); + } + + ret = -ENOMEM; + +bail: + return ret; +} + +/** + * rvt_clear_mr_refs - Drop help mr refs + * @qp: rvt qp data structure + * @clr_sends: If shoudl clear send side or not + */ +static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) +{ + unsigned n; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) + rvt_put_ss(&qp->s_rdma_read_sge); + + rvt_put_ss(&qp->r_sge); + + if (clr_sends) { + while (qp->s_last != qp->s_head) { + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last); + + rvt_put_swqe(wqe); + + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&ibah_to_rvtah( + wqe->ud_wr.ah)->refcount); + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + smp_wmb(); /* see qp_set_savail */ + } + if (qp->s_rdma_mr) { + rvt_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + } + + for (n = 0; qp->s_ack_queue && n < rvt_max_atomic(rdi); n++) { + struct rvt_ack_entry *e = &qp->s_ack_queue[n]; + + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + } +} + +/** + * rvt_swqe_has_lkey - return true if lkey is used by swqe + * @wqe - the send wqe + * @lkey - the lkey + * + * Test the swqe for using lkey + */ +static bool rvt_swqe_has_lkey(struct rvt_swqe *wqe, u32 lkey) +{ + int i; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct rvt_sge *sge = &wqe->sg_list[i]; + + if (rvt_mr_has_lkey(sge->mr, lkey)) + return true; + } + return false; +} + +/** + * rvt_qp_sends_has_lkey - return true is qp sends use lkey + * @qp - the rvt_qp + * @lkey - the lkey + */ +static bool rvt_qp_sends_has_lkey(struct rvt_qp *qp, u32 lkey) +{ + u32 s_last = qp->s_last; + + while (s_last != qp->s_head) { + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, s_last); + + if (rvt_swqe_has_lkey(wqe, lkey)) + return true; + + if (++s_last >= qp->s_size) + s_last = 0; + } + if (qp->s_rdma_mr) + if (rvt_mr_has_lkey(qp->s_rdma_mr, lkey)) + return true; + return false; +} + +/** + * rvt_qp_acks_has_lkey - return true if acks have lkey + * @qp - the qp + * @lkey - the lkey + */ +static bool rvt_qp_acks_has_lkey(struct rvt_qp *qp, u32 lkey) +{ + int i; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + for (i = 0; qp->s_ack_queue && i < rvt_max_atomic(rdi); i++) { + struct rvt_ack_entry *e = &qp->s_ack_queue[i]; + + if (rvt_mr_has_lkey(e->rdma_sge.mr, lkey)) + return true; + } + return false; +} + +/* + * rvt_qp_mr_clean - clean up remote ops for lkey + * @qp - the qp + * @lkey - the lkey that is being de-registered + * + * This routine checks if the lkey is being used by + * the qp. + * + * If so, the qp is put into an error state to elminate + * any references from the qp. 
+ */ +void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey) +{ + bool lastwqe = false; + + if (qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + /* avoid special QPs */ + return; + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + + if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) + goto check_lwqe; + + if (rvt_ss_has_lkey(&qp->r_sge, lkey) || + rvt_qp_sends_has_lkey(qp, lkey) || + rvt_qp_acks_has_lkey(qp, lkey)) + lastwqe = rvt_error_qp(qp, IB_WC_LOC_PROT_ERR); +check_lwqe: + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +/** + * rvt_remove_qp - remove qp form table + * @rdi: rvt dev struct + * @qp: qp to remove + * + * Remove the QP from the table so it can't be found asynchronously by + * the receive routine. + */ +static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1]; + u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits); + unsigned long flags; + int removed = 1; + + spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags); + + if (rcu_dereference_protected(rvp->qp[0], + lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) { + RCU_INIT_POINTER(rvp->qp[0], NULL); + } else if (rcu_dereference_protected(rvp->qp[1], + lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) { + RCU_INIT_POINTER(rvp->qp[1], NULL); + } else { + struct rvt_qp *q; + struct rvt_qp __rcu **qpp; + + removed = 0; + qpp = &rdi->qp_dev->qp_table[n]; + for (; (q = rcu_dereference_protected(*qpp, + lockdep_is_held(&rdi->qp_dev->qpt_lock))) != NULL; + qpp = &q->next) { + if (q == qp) { + RCU_INIT_POINTER(*qpp, + rcu_dereference_protected(qp->next, + lockdep_is_held(&rdi->qp_dev->qpt_lock))); + removed = 1; + trace_rvt_qpremove(qp, n); + break; + } + } + } + + spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags); + if (removed) { + synchronize_rcu(); + rvt_put_qp(qp); + } +} + +/** + * rvt_init_qp - initialize the QP state to the reset state + * @qp: the QP to init or reinit + * @type: the QP type + * + * This function is called from both rvt_create_qp() and + * rvt_reset_qp(). The difference is that the reset + * patch the necessary locks to protect against concurent + * access. 
+ */ +static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, + enum ib_qp_type type) +{ + qp->remote_qpn = 0; + qp->qkey = 0; + qp->qp_access_flags = 0; + qp->s_flags &= RVT_S_SIGNAL_REQ_WR; + qp->s_hdrwords = 0; + qp->s_wqe = NULL; + qp->s_draining = 0; + qp->s_next_psn = 0; + qp->s_last_psn = 0; + qp->s_sending_psn = 0; + qp->s_sending_hpsn = 0; + qp->s_psn = 0; + qp->r_psn = 0; + qp->r_msn = 0; + if (type == IB_QPT_RC) { + qp->s_state = IB_OPCODE_RC_SEND_LAST; + qp->r_state = IB_OPCODE_RC_SEND_LAST; + } else { + qp->s_state = IB_OPCODE_UC_SEND_LAST; + qp->r_state = IB_OPCODE_UC_SEND_LAST; + } + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + qp->r_nak_state = 0; + qp->r_aflags = 0; + qp->r_flags = 0; + qp->s_head = 0; + qp->s_tail = 0; + qp->s_cur = 0; + qp->s_acked = 0; + qp->s_last = 0; + qp->s_ssn = 1; + qp->s_lsn = 0; + qp->s_mig_state = IB_MIG_MIGRATED; + qp->r_head_ack_queue = 0; + qp->s_tail_ack_queue = 0; + qp->s_num_rd_atomic = 0; + if (qp->r_rq.wq) { + qp->r_rq.wq->head = 0; + qp->r_rq.wq->tail = 0; + } + qp->r_sge.num_sge = 0; + atomic_set(&qp->s_reserved_used, 0); +} + +/** + * rvt_reset_qp - initialize the QP state to the reset state + * @qp: the QP to reset + * @type: the QP type + * + * r_lock, s_hlock, and s_lock are required to be held by the caller + */ +static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, + enum ib_qp_type type) + __must_hold(&qp->s_lock) + __must_hold(&qp->s_hlock) + __must_hold(&qp->r_lock) +{ + lockdep_assert_held(&qp->r_lock); + lockdep_assert_held(&qp->s_hlock); + lockdep_assert_held(&qp->s_lock); + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + + /* Let drivers flush their waitlist */ + rdi->driver_f.flush_qp_waiters(qp); + rvt_stop_rc_timers(qp); + qp->s_flags &= ~(RVT_S_TIMER | RVT_S_ANY_WAIT); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + + /* Stop the send queue and the retry timer */ + rdi->driver_f.stop_send_queue(qp); + rvt_del_timers_sync(qp); + /* Wait for things to stop */ + rdi->driver_f.quiesce_qp(qp); + + /* take qp out the hash and wait for it to be unused */ + rvt_remove_qp(rdi, qp); + + /* grab the lock b/c it was locked at call time */ + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + + rvt_clear_mr_refs(qp, 1); + /* + * Let the driver do any tear down or re-init it needs to for + * a qp that has been reset + */ + rdi->driver_f.notify_qp_reset(qp); + } + rvt_init_qp(rdi, qp, type); + lockdep_assert_held(&qp->r_lock); + lockdep_assert_held(&qp->s_hlock); + lockdep_assert_held(&qp->s_lock); +} + +/** rvt_free_qpn - Free a qpn from the bit map + * @qpt: QP table + * @qpn: queue pair number to free + */ +static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn) +{ + struct rvt_qpn_map *map; + + map = qpt->map + (qpn & RVT_QPN_MASK) / RVT_BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page); +} + +/** + * rvt_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: user data for libibverbs.so + * + * Queue pair creation is mostly an rvt issue. However, drivers have their own + * unique idea of what queue pair numbers mean. For instance there is a reserved + * range for PSM. + * + * Return: the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. 
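+ *
+ * A minimal caller-side sketch (the ULP goes through the core verbs
+ * wrapper; pd, cq and the queue sizes below are placeholders):
+ *
+ *	struct ib_qp_init_attr attr = {
+ *		.qp_type     = IB_QPT_RC,
+ *		.sq_sig_type = IB_SIGNAL_REQ_WR,
+ *		.send_cq     = cq,
+ *		.recv_cq     = cq,
+ *		.cap = {
+ *			.max_send_wr  = 128,
+ *			.max_recv_wr  = 128,
+ *			.max_send_sge = 2,
+ *			.max_recv_sge = 2,
+ *		},
+ *	};
+ *	struct ib_qp *qp = ib_create_qp(pd, &attr);
+ *
+ * The checks below bound those cap values against
+ * rdi->dparms.props.max_qp_wr and max_sge.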
+ */ +struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct rvt_qp *qp; + int err; + struct rvt_swqe *swq = NULL; + size_t sz; + size_t sg_list_sz; + struct ib_qp *ret = ERR_PTR(-ENOMEM); + struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); + void *priv = NULL; + size_t sqsize; + + if (!rdi) + return ERR_PTR(-EINVAL); + + if (init_attr->cap.max_send_sge > rdi->dparms.props.max_sge || + init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr || + init_attr->create_flags) + return ERR_PTR(-EINVAL); + + /* Check receive queue parameters if no SRQ is specified. */ + if (!init_attr->srq) { + if (init_attr->cap.max_recv_sge > rdi->dparms.props.max_sge || + init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr) + return ERR_PTR(-EINVAL); + + if (init_attr->cap.max_send_sge + + init_attr->cap.max_send_wr + + init_attr->cap.max_recv_sge + + init_attr->cap.max_recv_wr == 0) + return ERR_PTR(-EINVAL); + } + sqsize = + init_attr->cap.max_send_wr + 1 + + rdi->dparms.reserved_operations; + switch (init_attr->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (init_attr->port_num == 0 || + init_attr->port_num > ibpd->device->phys_port_cnt) + return ERR_PTR(-EINVAL); + /* fall through */ + case IB_QPT_UC: + case IB_QPT_RC: + case IB_QPT_UD: + sz = sizeof(struct rvt_sge) * + init_attr->cap.max_send_sge + + sizeof(struct rvt_swqe); + swq = vzalloc_node(sqsize * sz, rdi->dparms.node); + if (!swq) + return ERR_PTR(-ENOMEM); + + sz = sizeof(*qp); + sg_list_sz = 0; + if (init_attr->srq) { + struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq); + + if (srq->rq.max_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (srq->rq.max_sge - 1); + } else if (init_attr->cap.max_recv_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (init_attr->cap.max_recv_sge - 1); + qp = kzalloc_node(sz + sg_list_sz, GFP_KERNEL, + rdi->dparms.node); + if (!qp) + goto bail_swq; + + RCU_INIT_POINTER(qp->next, NULL); + if (init_attr->qp_type == IB_QPT_RC) { + qp->s_ack_queue = + kzalloc_node( + sizeof(*qp->s_ack_queue) * + rvt_max_atomic(rdi), + GFP_KERNEL, + rdi->dparms.node); + if (!qp->s_ack_queue) + goto bail_qp; + } + /* initialize timers needed for rc qp */ + timer_setup(&qp->s_timer, rvt_rc_timeout, 0); + hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + qp->s_rnr_timer.function = rvt_rc_rnr_retry; + + /* + * Driver needs to set up it's private QP structure and do any + * initialization that is needed. + */ + priv = rdi->driver_f.qp_priv_alloc(rdi, qp); + if (IS_ERR(priv)) { + ret = priv; + goto bail_qp; + } + qp->priv = priv; + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); + if (init_attr->srq) { + sz = 0; + } else { + qp->r_rq.size = init_attr->cap.max_recv_wr + 1; + qp->r_rq.max_sge = init_attr->cap.max_recv_sge; + sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + + sizeof(struct rvt_rwqe); + if (udata) + qp->r_rq.wq = vmalloc_user( + sizeof(struct rvt_rwq) + + qp->r_rq.size * sz); + else + qp->r_rq.wq = vzalloc_node( + sizeof(struct rvt_rwq) + + qp->r_rq.size * sz, + rdi->dparms.node); + if (!qp->r_rq.wq) + goto bail_driver_priv; + } + + /* + * ib_create_qp() will initialize qp->ibqp + * except for qp->ibqp.qp_num. 
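+ *
+ * (The timeout_jiffies value set above follows the IB local ACK
+ * timeout formula of 4.096 usec * 2^timeout; for example a timeout
+ * field of 14 gives 4096 * 16384 / 1000 ~= 67108 usec, roughly 67 ms,
+ * before being converted to jiffies.)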
+ */ + spin_lock_init(&qp->r_lock); + spin_lock_init(&qp->s_hlock); + spin_lock_init(&qp->s_lock); + spin_lock_init(&qp->r_rq.lock); + atomic_set(&qp->refcount, 0); + atomic_set(&qp->local_ops_pending, 0); + init_waitqueue_head(&qp->wait); + INIT_LIST_HEAD(&qp->rspwait); + qp->state = IB_QPS_RESET; + qp->s_wq = swq; + qp->s_size = sqsize; + qp->s_avail = init_attr->cap.max_send_wr; + qp->s_max_sge = init_attr->cap.max_send_sge; + if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) + qp->s_flags = RVT_S_SIGNAL_REQ_WR; + + err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table, + init_attr->qp_type, + init_attr->port_num); + if (err < 0) { + ret = ERR_PTR(err); + goto bail_rq_wq; + } + qp->ibqp.qp_num = err; + qp->port_num = init_attr->port_num; + rvt_init_qp(rdi, qp, init_attr->qp_type); + break; + + default: + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + init_attr->cap.max_inline_data = 0; + + /* + * Return the address of the RWQ as the offset to mmap. + * See rvt_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + if (!qp->r_rq.wq) { + __u64 offset = 0; + + err = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_qpn; + } + } else { + u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz; + + qp->ip = rvt_create_mmap_info(rdi, s, + ibpd->uobject->context, + qp->r_rq.wq); + if (!qp->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_qpn; + } + + err = ib_copy_to_udata(udata, &qp->ip->offset, + sizeof(qp->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } + qp->pid = current->pid; + } + + spin_lock(&rdi->n_qps_lock); + if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) { + spin_unlock(&rdi->n_qps_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + rdi->n_qps_allocated++; + /* + * Maintain a busy_jiffies variable that will be added to the timeout + * period in mod_retry_timer and add_retry_timer. This busy jiffies + * is scaled by the number of rc qps created for the device to reduce + * the number of timeouts occurring when there is a large number of + * qps. busy_jiffies is incremented every rc qp scaling interval. + * The scaling interval is selected based on extensive performance + * evaluation of targeted workloads. + */ + if (init_attr->qp_type == IB_QPT_RC) { + rdi->n_rc_qps++; + rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL; + } + spin_unlock(&rdi->n_qps_lock); + + if (qp->ip) { + spin_lock_irq(&rdi->pending_lock); + list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps); + spin_unlock_irq(&rdi->pending_lock); + } + + ret = &qp->ibqp; + + /* + * We have our QP and its good, now keep track of what types of opcodes + * can be processed on this QP. We do this by keeping track of what the + * 3 high order bits of the opcode are. 
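+ *
+ * For example, the transport prefixes carried in those top three bits
+ * are 0x00 for RC, 0x20 for UC and 0x60 for UD (IB_OPCODE_RC,
+ * IB_OPCODE_UC and IB_OPCODE_UD in ib_pack.h), so a UD QP ends up
+ * with allowed_ops == 0x60 and an incoming IB_OPCODE_UD_SEND_ONLY
+ * (0x64) matches it on its top bits.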
+ */ + switch (init_attr->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + qp->allowed_ops = IB_OPCODE_UD; + break; + case IB_QPT_RC: + qp->allowed_ops = IB_OPCODE_RC; + break; + case IB_QPT_UC: + qp->allowed_ops = IB_OPCODE_UC; + break; + default: + ret = ERR_PTR(-EINVAL); + goto bail_ip; + } + + return ret; + +bail_ip: + if (qp->ip) + kref_put(&qp->ip->ref, rvt_release_mmap_info); + +bail_qpn: + rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); + +bail_rq_wq: + if (!qp->ip) + vfree(qp->r_rq.wq); + +bail_driver_priv: + rdi->driver_f.qp_priv_free(rdi, qp); + +bail_qp: + kfree(qp->s_ack_queue); + kfree(qp); + +bail_swq: + vfree(swq); + + return ret; +} + +/** + * rvt_error_qp - put a QP into the error state + * @qp: the QP to put into the error state + * @err: the receive completion error to signal if a RWQE is active + * + * Flushes both send and receive work queues. + * + * Return: true if last WQE event should be generated. + * The QP r_lock and s_lock should be held and interrupts disabled. + * If we are already in error state, just return. + */ +int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) +{ + struct ib_wc wc; + int ret = 0; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + lockdep_assert_held(&qp->r_lock); + lockdep_assert_held(&qp->s_lock); + if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) + goto bail; + + qp->state = IB_QPS_ERR; + + if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) { + qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + if (qp->s_flags & RVT_S_ANY_WAIT_SEND) + qp->s_flags &= ~RVT_S_ANY_WAIT_SEND; + + rdi->driver_f.notify_error_qp(qp); + + /* Schedule the sending tasklet to drain the send work queue. */ + if (READ_ONCE(qp->s_last) != qp->s_head) + rdi->driver_f.schedule_send(qp); + + rvt_clear_mr_refs(qp, 0); + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + + if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) { + wc.wr_id = qp->r_wr_id; + wc.status = err; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + } + wc.status = IB_WC_WR_FLUSH_ERR; + + if (qp->r_rq.wq) { + struct rvt_rwq *wq; + u32 head; + u32 tail; + + spin_lock(&qp->r_rq.lock); + + /* sanity check pointers before trusting them */ + wq = qp->r_rq.wq; + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + while (tail != head) { + wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id; + if (++tail >= qp->r_rq.size) + tail = 0; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + } + wq->tail = tail; + + spin_unlock(&qp->r_rq.lock); + } else if (qp->ibqp.event_handler) { + ret = 1; + } + +bail: + return ret; +} +EXPORT_SYMBOL(rvt_error_qp); + +/* + * Put the QP into the hash table. + * The hash table holds a reference to the QP. 
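The receive-queue drain in rvt_error_qp() above is a plain ring walk from tail to head: untrusted indices are clamped first, then one flush completion is generated per outstanding entry. A toy userspace version of that loop, with the completion reduced to a counter:

#include <stdio.h>

#define RQ_SIZE 8   /* placeholder ring size */

/* Simplified stand-in for the flush loop in rvt_error_qp(): every entry
 * between tail and head gets a "flush" completion, then tail catches up. */
static unsigned flush_recv_ring(unsigned head, unsigned tail, unsigned size)
{
        unsigned flushed = 0;

        /* sanity-clamp untrusted indices, as the driver does */
        if (head >= size)
                head = 0;
        if (tail >= size)
                tail = 0;
        while (tail != head) {
                flushed++;                 /* would generate IB_WC_WR_FLUSH_ERR */
                if (++tail >= size)
                        tail = 0;
        }
        return flushed;
}

int main(void)
{
        printf("flushed %u entries\n", flush_recv_ring(5, 2, RQ_SIZE));
        printf("flushed %u entries after wraparound\n", flush_recv_ring(1, 6, RQ_SIZE));
        return 0;
}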
+ */ +static void rvt_insert_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1]; + unsigned long flags; + + rvt_get_qp(qp); + spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags); + + if (qp->ibqp.qp_num <= 1) { + rcu_assign_pointer(rvp->qp[qp->ibqp.qp_num], qp); + } else { + u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits); + + qp->next = rdi->qp_dev->qp_table[n]; + rcu_assign_pointer(rdi->qp_dev->qp_table[n], qp); + trace_rvt_qpinsert(qp, n); + } + + spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags); +} + +/** + * rvt_modify_qp - modify the attributes of a queue pair + * @ibqp: the queue pair who's attributes we're modifying + * @attr: the new attributes + * @attr_mask: the mask of attributes to modify + * @udata: user data for libibverbs.so + * + * Return: 0 on success, otherwise returns an errno. + */ +int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + enum ib_qp_state cur_state, new_state; + struct ib_event ev; + int lastwqe = 0; + int mig = 0; + int pmtu = 0; /* for gcc warning only */ + enum rdma_link_layer link; + int opa_ah; + + link = rdma_port_get_link_layer(ibqp->device, qp->port_num); + + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + + cur_state = attr_mask & IB_QP_CUR_STATE ? + attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num); + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, link)) + goto inval; + + if (rdi->driver_f.check_modify_qp && + rdi->driver_f.check_modify_qp(qp, attr, attr_mask, udata)) + goto inval; + + if (attr_mask & IB_QP_AV) { + if (opa_ah) { + if (rdma_ah_get_dlid(&attr->ah_attr) >= + opa_get_mcast_base(OPA_MCAST_NR)) + goto inval; + } else { + if (rdma_ah_get_dlid(&attr->ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto inval; + } + + if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr)) + goto inval; + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (opa_ah) { + if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= + opa_get_mcast_base(OPA_MCAST_NR)) + goto inval; + } else { + if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto inval; + } + + if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr)) + goto inval; + if (attr->alt_pkey_index >= rvt_get_npkeys(rdi)) + goto inval; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + if (attr->pkey_index >= rvt_get_npkeys(rdi)) + goto inval; + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + if (attr->min_rnr_timer > 31) + goto inval; + + if (attr_mask & IB_QP_PORT) + if (qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI || + attr->port_num == 0 || + attr->port_num > ibqp->device->phys_port_cnt) + goto inval; + + if (attr_mask & IB_QP_DEST_QPN) + if (attr->dest_qp_num > RVT_QPN_MASK) + goto inval; + + if (attr_mask & IB_QP_RETRY_CNT) + if (attr->retry_cnt > 7) + goto inval; + + if (attr_mask & IB_QP_RNR_RETRY) + if (attr->rnr_retry > 7) + goto inval; + + /* + * Don't allow invalid path_mtu values. OK to set greater + * than the active mtu (or even the max_cap, if we have tuned + * that to a small mtu. We'll set qp->path_mtu + * to the lesser of requested attribute mtu and active, + * for packetizing messages. + * Note that the QP port has to be set in INIT and MTU in RTR. 
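rvt_insert_qp() above files QPN 0 and 1 into per-port special slots and hashes every other QPN into the device's QP table. The sketch below mimics that bucket selection in userspace; the hash_32() stand-in uses a common golden-ratio multiply and is only an approximation of the kernel helper, so treat the exact constant as an assumption.

#include <stdio.h>
#include <stdint.h>

/* Rough userspace stand-in for the kernel's hash_32(); the multiplier and
 * shift mirror the usual multiply-by-golden-ratio scheme. */
static uint32_t hash_32_approx(uint32_t val, unsigned bits)
{
        return (val * 0x61C88647u) >> (32 - bits);
}

int main(void)
{
        unsigned table_bits = 8;              /* placeholder qp_table_bits */
        uint32_t qpns[] = { 0, 1, 2, 0x10, 0xABCDE };
        unsigned i;

        for (i = 0; i < sizeof(qpns) / sizeof(qpns[0]); i++) {
                if (qpns[i] <= 1)
                        printf("QPN %u -> per-port special slot %u\n",
                               (unsigned)qpns[i], (unsigned)qpns[i]);
                else
                        printf("QPN 0x%x -> hash bucket %u\n",
                               (unsigned)qpns[i],
                               (unsigned)hash_32_approx(qpns[i], table_bits));
        }
        return 0;
}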
+ */ + if (attr_mask & IB_QP_PATH_MTU) { + pmtu = rdi->driver_f.get_pmtu_from_attr(rdi, qp, attr); + if (pmtu < 0) + goto inval; + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + if (attr->path_mig_state == IB_MIG_REARM) { + if (qp->s_mig_state == IB_MIG_ARMED) + goto inval; + if (new_state != IB_QPS_RTS) + goto inval; + } else if (attr->path_mig_state == IB_MIG_MIGRATED) { + if (qp->s_mig_state == IB_MIG_REARM) + goto inval; + if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD) + goto inval; + if (qp->s_mig_state == IB_MIG_ARMED) + mig = 1; + } else { + goto inval; + } + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + if (attr->max_dest_rd_atomic > rdi->dparms.max_rdma_atomic) + goto inval; + + switch (new_state) { + case IB_QPS_RESET: + if (qp->state != IB_QPS_RESET) + rvt_reset_qp(rdi, qp, ibqp->qp_type); + break; + + case IB_QPS_RTR: + /* Allow event to re-trigger if QP set to RTR more than once */ + qp->r_flags &= ~RVT_R_COMM_EST; + qp->state = new_state; + break; + + case IB_QPS_SQD: + qp->s_draining = qp->s_last != qp->s_cur; + qp->state = new_state; + break; + + case IB_QPS_SQE: + if (qp->ibqp.qp_type == IB_QPT_RC) + goto inval; + qp->state = new_state; + break; + + case IB_QPS_ERR: + lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + qp->state = new_state; + break; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + qp->s_pkey_index = attr->pkey_index; + + if (attr_mask & IB_QP_PORT) + qp->port_num = attr->port_num; + + if (attr_mask & IB_QP_DEST_QPN) + qp->remote_qpn = attr->dest_qp_num; + + if (attr_mask & IB_QP_SQ_PSN) { + qp->s_next_psn = attr->sq_psn & rdi->dparms.psn_modify_mask; + qp->s_psn = qp->s_next_psn; + qp->s_sending_psn = qp->s_next_psn; + qp->s_last_psn = qp->s_next_psn - 1; + qp->s_sending_hpsn = qp->s_last_psn; + } + + if (attr_mask & IB_QP_RQ_PSN) + qp->r_psn = attr->rq_psn & rdi->dparms.psn_modify_mask; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->qp_access_flags = attr->qp_access_flags; + + if (attr_mask & IB_QP_AV) { + qp->remote_ah_attr = attr->ah_attr; + qp->s_srate = rdma_ah_get_static_rate(&attr->ah_attr); + qp->srate_mbps = ib_rate_to_mbps(qp->s_srate); + } + + if (attr_mask & IB_QP_ALT_PATH) { + qp->alt_ah_attr = attr->alt_ah_attr; + qp->s_alt_pkey_index = attr->alt_pkey_index; + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + qp->s_mig_state = attr->path_mig_state; + if (mig) { + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = rdma_ah_get_port_num(&qp->alt_ah_attr); + qp->s_pkey_index = qp->s_alt_pkey_index; + } + } + + if (attr_mask & IB_QP_PATH_MTU) { + qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu); + qp->log_pmtu = ilog2(qp->pmtu); + } + + if (attr_mask & IB_QP_RETRY_CNT) { + qp->s_retry_cnt = attr->retry_cnt; + qp->s_retry = attr->retry_cnt; + } + + if (attr_mask & IB_QP_RNR_RETRY) { + qp->s_rnr_retry_cnt = attr->rnr_retry; + qp->s_rnr_retry = attr->rnr_retry; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + qp->r_min_rnr_timer = attr->min_rnr_timer; + + if (attr_mask & IB_QP_TIMEOUT) { + qp->timeout = attr->timeout; + qp->timeout_jiffies = rvt_timeout_to_jiffies(qp->timeout); + } + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->r_max_rd_atomic = attr->max_dest_rd_atomic; + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + qp->s_max_rd_atomic = attr->max_rd_atomic; + + if (rdi->driver_f.modify_qp) + rdi->driver_f.modify_qp(qp, attr, attr_mask, udata); + + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + + if (cur_state == 
IB_QPS_RESET && new_state == IB_QPS_INIT) + rvt_insert_qp(rdi, qp); + + if (lastwqe) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + if (mig) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + return 0; + +inval: + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + return -EINVAL; +} + +/** + * rvt_destroy_qp - destroy a queue pair + * @ibqp: the queue pair to destroy + * + * Note that this can be called while the QP is actively sending or + * receiving! + * + * Return: 0 on success. + */ +int rvt_destroy_qp(struct ib_qp *ibqp) +{ + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + rvt_reset_qp(rdi, qp, ibqp->qp_type); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + + wait_event(qp->wait, !atomic_read(&qp->refcount)); + /* qpn is now available for use again */ + rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); + + spin_lock(&rdi->n_qps_lock); + rdi->n_qps_allocated--; + if (qp->ibqp.qp_type == IB_QPT_RC) { + rdi->n_rc_qps--; + rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL; + } + spin_unlock(&rdi->n_qps_lock); + + if (qp->ip) + kref_put(&qp->ip->ref, rvt_release_mmap_info); + else + vfree(qp->r_rq.wq); + vfree(qp->s_wq); + rdi->driver_f.qp_priv_free(rdi, qp); + kfree(qp->s_ack_queue); + kfree(qp); + return 0; +} + +/** + * rvt_query_qp - query an ipbq + * @ibqp: IB qp to query + * @attr: attr struct to fill in + * @attr_mask: attr mask ignored + * @init_attr: struct to fill in + * + * Return: always 0 + */ +int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + attr->qp_state = qp->state; + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu); + attr->path_mig_state = qp->s_mig_state; + attr->qkey = qp->qkey; + attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask; + attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask; + attr->dest_qp_num = qp->remote_qpn; + attr->qp_access_flags = qp->qp_access_flags; + attr->cap.max_send_wr = qp->s_size - 1 - + rdi->dparms.reserved_operations; + attr->cap.max_recv_wr = qp->ibqp.srq ? 
0 : qp->r_rq.size - 1; + attr->cap.max_send_sge = qp->s_max_sge; + attr->cap.max_recv_sge = qp->r_rq.max_sge; + attr->cap.max_inline_data = 0; + attr->ah_attr = qp->remote_ah_attr; + attr->alt_ah_attr = qp->alt_ah_attr; + attr->pkey_index = qp->s_pkey_index; + attr->alt_pkey_index = qp->s_alt_pkey_index; + attr->en_sqd_async_notify = 0; + attr->sq_draining = qp->s_draining; + attr->max_rd_atomic = qp->s_max_rd_atomic; + attr->max_dest_rd_atomic = qp->r_max_rd_atomic; + attr->min_rnr_timer = qp->r_min_rnr_timer; + attr->port_num = qp->port_num; + attr->timeout = qp->timeout; + attr->retry_cnt = qp->s_retry_cnt; + attr->rnr_retry = qp->s_rnr_retry_cnt; + attr->alt_port_num = + rdma_ah_get_port_num(&qp->alt_ah_attr); + attr->alt_timeout = qp->alt_timeout; + + init_attr->event_handler = qp->ibqp.event_handler; + init_attr->qp_context = qp->ibqp.qp_context; + init_attr->send_cq = qp->ibqp.send_cq; + init_attr->recv_cq = qp->ibqp.recv_cq; + init_attr->srq = qp->ibqp.srq; + init_attr->cap = attr->cap; + if (qp->s_flags & RVT_S_SIGNAL_REQ_WR) + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + else + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->qp_type = qp->ibqp.qp_type; + init_attr->port_num = qp->port_num; + return 0; +} + +/** + * rvt_post_receive - post a receive on a QP + * @ibqp: the QP to post the receive on + * @wr: the WR to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + * + * Return: 0 on success otherwise errno + */ +int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_rwq *wq = qp->r_rq.wq; + unsigned long flags; + int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) && + !qp->ibqp.srq; + + /* Check that state is OK to post receive. */ + if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) || !wq) { + *bad_wr = wr; + return -EINVAL; + } + + for (; wr; wr = wr->next) { + struct rvt_rwqe *wqe; + u32 next; + int i; + + if ((unsigned)wr->num_sge > qp->r_rq.max_sge) { + *bad_wr = wr; + return -EINVAL; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + next = wq->head + 1; + if (next >= qp->r_rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + *bad_wr = wr; + return -ENOMEM; + } + if (unlikely(qp_err_flush)) { + struct ib_wc wc; + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + wc.wr_id = wr->wr_id; + wc.status = IB_WC_WR_FLUSH_ERR; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + } else { + wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* + * Make sure queue entry is written + * before the head index. + */ + smp_wmb(); + wq->head = next; + } + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + } + return 0; +} + +/** + * rvt_qp_valid_operation - validate post send wr request + * @qp - the qp + * @post-parms - the post send table for the driver + * @wr - the work request + * + * The routine validates the operation based on the + * validation table an returns the length of the operation + * which can extend beyond the ib_send_bw. Operation + * dependent flags key atomic operation validation. + * + * There is an exception for UD qps that validates the pd and + * overrides the length to include the additional UD specific + * length. 
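rvt_post_recv() above relies on the usual one-slot-empty ring convention: the queue is full when advancing head would land on tail, which is why the ring is sized max_recv_wr + 1. A minimal userspace model of that check:

#include <stdio.h>
#include <errno.h>

#define RQ_SIZE 4   /* placeholder; the real size is max_recv_wr + 1 */

/* One ring slot always stays empty: posting fails with -ENOMEM when the
 * next head position would collide with tail, as in rvt_post_recv(). */
static int post_one(unsigned *head, unsigned tail, unsigned size)
{
        unsigned next = *head + 1;

        if (next >= size)
                next = 0;
        if (next == tail)
                return -ENOMEM;            /* ring full */
        /* ...the WQE at *head would be filled and published here... */
        *head = next;
        return 0;
}

int main(void)
{
        unsigned head = 0, tail = 0;
        int i;

        for (i = 0; i < 5; i++)
                printf("post %d -> %s\n", i,
                       post_one(&head, tail, RQ_SIZE) ? "ENOMEM (full)" : "ok");
        return 0;
}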
+ * + * Returns a negative error or the length of the work request + * for building the swqe. + */ +static inline int rvt_qp_valid_operation( + struct rvt_qp *qp, + const struct rvt_operation_params *post_parms, + struct ib_send_wr *wr) +{ + int len; + + if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length) + return -EINVAL; + if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type))) + return -EINVAL; + if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) && + ibpd_to_rvtpd(qp->ibqp.pd)->user) + return -EINVAL; + if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + return -EINVAL; + if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC && + !qp->s_max_rd_atomic) + return -EINVAL; + len = post_parms[wr->opcode].length; + /* UD specific */ + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) { + if (qp->ibqp.pd != ud_wr(wr)->ah->pd) + return -EINVAL; + len = sizeof(struct ib_ud_wr); + } + return len; +} + +/** + * rvt_qp_is_avail - determine queue capacity + * @qp: the qp + * @rdi: the rdmavt device + * @reserved_op: is reserved operation + * + * This assumes the s_hlock is held but the s_last + * qp variable is uncontrolled. + * + * For non reserved operations, the qp->s_avail + * may be changed. + * + * The return value is zero or a -ENOMEM. + */ +static inline int rvt_qp_is_avail( + struct rvt_qp *qp, + struct rvt_dev_info *rdi, + bool reserved_op) +{ + u32 slast; + u32 avail; + u32 reserved_used; + + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); + if (unlikely(reserved_op)) { + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + if (reserved_used >= rdi->dparms.reserved_operations) + return -ENOMEM; + return 0; + } + /* non-reserved operations */ + if (likely(qp->s_avail)) + return 0; + slast = READ_ONCE(qp->s_last); + if (qp->s_head >= slast) + avail = qp->s_size - (qp->s_head - slast); + else + avail = slast - qp->s_head; + + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); + avail = avail - 1 - + (rdi->dparms.reserved_operations - reserved_used); + /* insure we don't assign a negative s_avail */ + if ((s32)avail <= 0) + return -ENOMEM; + qp->s_avail = avail; + if (WARN_ON(qp->s_avail > + (qp->s_size - 1 - rdi->dparms.reserved_operations))) + rvt_pr_err(rdi, + "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u", + qp->ibqp.qp_num, qp->s_size, qp->s_avail, + qp->s_head, qp->s_tail, qp->s_cur, + qp->s_acked, qp->s_last); + return 0; +} + +/** + * rvt_post_one_wr - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int rvt_post_one_wr(struct rvt_qp *qp, + struct ib_send_wr *wr, + int *call_send) +{ + struct rvt_swqe *wqe; + u32 next; + int i; + int j; + int acc; + struct rvt_lkey_table *rkt; + struct rvt_pd *pd; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + u8 log_pmtu; + int ret; + size_t cplen; + bool reserved_op; + int local_ops_delayed = 0; + + BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE)); + + /* IB spec says that num_sge == 0 is OK. 
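rvt_qp_is_avail() above derives the usable send-queue depth from the head/last distance, the always-empty slot, and any reserved slots not yet consumed by reserved operations. A simplified standalone version of that arithmetic; the locking and memory-ordering concerns are deliberately dropped here.

#include <stdio.h>

/* Simplified model of rvt_qp_is_avail(): usable depth is the raw ring space
 * minus the always-empty slot and minus unused reserved slots. */
static int sq_avail(unsigned head, unsigned last, unsigned size,
                    unsigned reserved_ops, unsigned reserved_used)
{
        unsigned avail;

        if (head >= last)
                avail = size - (head - last);
        else
                avail = last - head;
        return (int)avail - 1 - (int)(reserved_ops - reserved_used);
}

int main(void)
{
        /* size = max_send_wr + 1 + reserved_operations, as in rvt_create_qp() */
        printf("empty ring:   %d usable slots\n", sq_avail(0, 0, 34, 1, 0));
        printf("16 in flight: %d usable slots\n", sq_avail(16, 0, 34, 1, 0));
        return 0;
}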
*/ + if (unlikely(wr->num_sge > qp->s_max_sge)) + return -EINVAL; + + ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr); + if (ret < 0) + return ret; + cplen = ret; + + /* + * Local operations include fast register and local invalidate. + * Fast register needs to be processed immediately because the + * registered lkey may be used by following work requests and the + * lkey needs to be valid at the time those requests are posted. + * Local invalidate can be processed immediately if fencing is + * not required and no previous local invalidate ops are pending. + * Signaled local operations that have been processed immediately + * need to have requests with "completion only" flags set posted + * to the send queue in order to generate completions. + */ + if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) { + switch (wr->opcode) { + case IB_WR_REG_MR: + ret = rvt_fast_reg_mr(qp, + reg_wr(wr)->mr, + reg_wr(wr)->key, + reg_wr(wr)->access); + if (ret || !(wr->send_flags & IB_SEND_SIGNALED)) + return ret; + break; + case IB_WR_LOCAL_INV: + if ((wr->send_flags & IB_SEND_FENCE) || + atomic_read(&qp->local_ops_pending)) { + local_ops_delayed = 1; + } else { + ret = rvt_invalidate_rkey( + qp, wr->ex.invalidate_rkey); + if (ret || !(wr->send_flags & IB_SEND_SIGNALED)) + return ret; + } + break; + default: + return -EINVAL; + } + } + + reserved_op = rdi->post_parms[wr->opcode].flags & + RVT_OPERATION_USE_RESERVE; + /* check for avail */ + ret = rvt_qp_is_avail(qp, rdi, reserved_op); + if (ret) + return ret; + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + + rkt = &rdi->lkey_table; + pd = ibpd_to_rvtpd(qp->ibqp.pd); + wqe = rvt_get_swqe_ptr(qp, qp->s_head); + + /* cplen has length from above */ + memcpy(&wqe->wr, wr, cplen); + + wqe->length = 0; + j = 0; + if (wr->num_sge) { + struct rvt_sge *last_sge = NULL; + + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + + if (length == 0) + continue; + ret = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge, + &wr->sg_list[i], acc); + if (unlikely(ret < 0)) + goto bail_inval_free; + wqe->length += length; + if (ret) + last_sge = &wqe->sg_list[j]; + j += ret; + } + wqe->wr.num_sge = j; + } + + /* general part of wqe valid - allow for driver checks */ + if (rdi->driver_f.check_send_wqe) { + ret = rdi->driver_f.check_send_wqe(qp, wqe); + if (ret < 0) + goto bail_inval_free; + if (ret) + *call_send = ret; + } + + log_pmtu = qp->log_pmtu; + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) { + struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah); + + log_pmtu = ah->log_pmtu; + atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); + } + + if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) { + if (local_ops_delayed) + atomic_inc(&qp->local_ops_pending); + else + wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY; + wqe->ssn = 0; + wqe->psn = 0; + wqe->lpsn = 0; + } else { + wqe->ssn = qp->s_ssn++; + wqe->psn = qp->s_next_psn; + wqe->lpsn = wqe->psn + + (wqe->length ? 
+ ((wqe->length - 1) >> log_pmtu) : + 0); + qp->s_next_psn = wqe->lpsn + 1; + } + if (unlikely(reserved_op)) { + wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; + rvt_qp_wqe_reserve(qp, wqe); + } else { + wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED; + qp->s_avail--; + } + trace_rvt_post_one_wr(qp, wqe, wr->num_sge); + smp_wmb(); /* see request builders */ + qp->s_head = next; + + return 0; + +bail_inval_free: + /* release mr holds */ + while (j) { + struct rvt_sge *sge = &wqe->sg_list[--j]; + + rvt_put_mr(sge->mr); + } + return ret; +} + +/** + * rvt_post_send - post a send on a QP + * @ibqp: the QP to post the send on + * @wr: the list of work requests to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + * + * Return: 0 on success else errno + */ +int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + unsigned long flags = 0; + int call_send; + unsigned nreq = 0; + int err = 0; + + spin_lock_irqsave(&qp->s_hlock, flags); + + /* + * Ensure QP state is such that we can send. If not bail out early, + * there is no need to do this every time we post a send. + */ + if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) { + spin_unlock_irqrestore(&qp->s_hlock, flags); + return -EINVAL; + } + + /* + * If the send queue is empty, and we only have a single WR then just go + * ahead and kick the send engine into gear. Otherwise we will always + * just schedule the send to happen later. + */ + call_send = qp->s_head == READ_ONCE(qp->s_last) && !wr->next; + + for (; wr; wr = wr->next) { + err = rvt_post_one_wr(qp, wr, &call_send); + if (unlikely(err)) { + *bad_wr = wr; + goto bail; + } + nreq++; + } +bail: + spin_unlock_irqrestore(&qp->s_hlock, flags); + if (nreq) { + if (call_send) + rdi->driver_f.do_send(qp); + else + rdi->driver_f.schedule_send_no_lock(qp); + } + return err; +} + +/** + * rvt_post_srq_receive - post a receive on a shared receive queue + * @ibsrq: the SRQ to post the receive on + * @wr: the list of work requests to post + * @bad_wr: A pointer to the first WR to cause a problem is put here + * + * This may be called from interrupt context. + * + * Return: 0 on success else errno + */ +int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); + struct rvt_rwq *wq; + unsigned long flags; + + for (; wr; wr = wr->next) { + struct rvt_rwqe *wqe; + u32 next; + int i; + + if ((unsigned)wr->num_sge > srq->rq.max_sge) { + *bad_wr = wr; + return -EINVAL; + } + + spin_lock_irqsave(&srq->rq.lock, flags); + wq = srq->rq.wq; + next = wq->head + 1; + if (next >= srq->rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&srq->rq.lock, flags); + *bad_wr = wr; + return -ENOMEM; + } + + wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. 
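The psn/lpsn assignment completed above means a request of length bytes occupies ((length - 1) >> log_pmtu) + 1 consecutive PSNs, one per MTU-sized segment, and a zero-length request still consumes a single PSN. A tiny worked example:

#include <stdio.h>

/* Packet span of a send WQE as used for psn/lpsn in rvt_post_one_wr(). */
static unsigned packets_for(unsigned length, unsigned log_pmtu)
{
        if (length == 0)
                return 1;
        return ((length - 1) >> log_pmtu) + 1;
}

int main(void)
{
        unsigned log_pmtu = 12;               /* 4096-byte path MTU */

        printf("0 bytes    -> %u packet(s)\n", packets_for(0, log_pmtu));
        printf("4096 bytes -> %u packet(s)\n", packets_for(4096, log_pmtu));
        printf("4097 bytes -> %u packet(s)\n", packets_for(4097, log_pmtu));
        return 0;
}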
*/ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + return 0; +} + +/** + * qp_comm_est - handle trap with QP established + * @qp: the QP + */ +void rvt_comm_est(struct rvt_qp *qp) +{ + qp->r_flags |= RVT_R_COMM_EST; + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_COMM_EST; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} +EXPORT_SYMBOL(rvt_comm_est); + +void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err) +{ + unsigned long flags; + int lastwqe; + + spin_lock_irqsave(&qp->s_lock, flags); + lastwqe = rvt_error_qp(qp, err); + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} +EXPORT_SYMBOL(rvt_rc_error); + +/* + * rvt_rnr_tbl_to_usec - return index into ib_rvt_rnr_table + * @index - the index + * return usec from an index into ib_rvt_rnr_table + */ +unsigned long rvt_rnr_tbl_to_usec(u32 index) +{ + return ib_rvt_rnr_table[(index & IB_AETH_CREDIT_MASK)]; +} +EXPORT_SYMBOL(rvt_rnr_tbl_to_usec); + +static inline unsigned long rvt_aeth_to_usec(u32 aeth) +{ + return ib_rvt_rnr_table[(aeth >> IB_AETH_CREDIT_SHIFT) & + IB_AETH_CREDIT_MASK]; +} + +/* + * rvt_add_retry_timer - add/start a retry timer + * @qp - the QP + * add a retry timer on the QP + */ +void rvt_add_retry_timer(struct rvt_qp *qp) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + lockdep_assert_held(&qp->s_lock); + qp->s_flags |= RVT_S_TIMER; + /* 4.096 usec. * (1 << qp->timeout) */ + qp->s_timer.expires = jiffies + qp->timeout_jiffies + + rdi->busy_jiffies; + add_timer(&qp->s_timer); +} +EXPORT_SYMBOL(rvt_add_retry_timer); + +/** + * rvt_add_rnr_timer - add/start an rnr timer + * @qp - the QP + * @aeth - aeth of RNR timeout, simulated aeth for loopback + * add an rnr timer on the QP + */ +void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth) +{ + u32 to; + + lockdep_assert_held(&qp->s_lock); + qp->s_flags |= RVT_S_WAIT_RNR; + to = rvt_aeth_to_usec(aeth); + trace_rvt_rnrnak_add(qp, to); + hrtimer_start(&qp->s_rnr_timer, + ns_to_ktime(1000 * to), HRTIMER_MODE_REL); +} +EXPORT_SYMBOL(rvt_add_rnr_timer); + +/** + * rvt_stop_rc_timers - stop all timers + * @qp - the QP + * stop any pending timers + */ +void rvt_stop_rc_timers(struct rvt_qp *qp) +{ + lockdep_assert_held(&qp->s_lock); + /* Remove QP from all timers */ + if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) { + qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR); + del_timer(&qp->s_timer); + hrtimer_try_to_cancel(&qp->s_rnr_timer); + } +} +EXPORT_SYMBOL(rvt_stop_rc_timers); + +/** + * rvt_stop_rnr_timer - stop an rnr timer + * @qp - the QP + * + * stop an rnr timer and return if the timer + * had been pending. + */ +static void rvt_stop_rnr_timer(struct rvt_qp *qp) +{ + lockdep_assert_held(&qp->s_lock); + /* Remove QP from rnr timer */ + if (qp->s_flags & RVT_S_WAIT_RNR) { + qp->s_flags &= ~RVT_S_WAIT_RNR; + trace_rvt_rnrnak_stop(qp, 0); + } +} + +/** + * rvt_del_timers_sync - wait for any timeout routines to exit + * @qp - the QP + */ +void rvt_del_timers_sync(struct rvt_qp *qp) +{ + del_timer_sync(&qp->s_timer); + hrtimer_cancel(&qp->s_rnr_timer); +} +EXPORT_SYMBOL(rvt_del_timers_sync); + +/** + * This is called from s_timer for missing responses. 
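The retry timer armed by rvt_add_retry_timer() above uses the standard IB local-ACK-timeout encoding, 4.096 usec * 2^timeout, which is also how qp->timeout_jiffies is derived in rvt_create_qp() and rvt_modify_qp(). A small program showing how quickly that interval grows with the 5-bit timeout code:

#include <stdio.h>

/* IB local-ACK-timeout encoding: the 5-bit code means 4.096 us * 2^code. */
static unsigned long timeout_usec(unsigned timeout_code)
{
        return (4096UL * (1UL << timeout_code)) / 1000UL;
}

int main(void)
{
        unsigned codes[] = { 0, 8, 14, 20 };
        unsigned i;

        for (i = 0; i < sizeof(codes) / sizeof(codes[0]); i++)
                printf("timeout code %2u -> ~%lu usec\n",
                       codes[i], timeout_usec(codes[i]));
        return 0;
}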
+ */ +static void rvt_rc_timeout(struct timer_list *t) +{ + struct rvt_qp *qp = from_timer(qp, t, s_timer); + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + unsigned long flags; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + if (qp->s_flags & RVT_S_TIMER) { + struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1]; + + qp->s_flags &= ~RVT_S_TIMER; + rvp->n_rc_timeouts++; + del_timer(&qp->s_timer); + trace_rvt_rc_timeout(qp, qp->s_last_psn + 1); + if (rdi->driver_f.notify_restart_rc) + rdi->driver_f.notify_restart_rc(qp, + qp->s_last_psn + 1, + 1); + rdi->driver_f.schedule_send(qp); + } + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, flags); +} + +/* + * This is called from s_timer for RNR timeouts. + */ +enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t) +{ + struct rvt_qp *qp = container_of(t, struct rvt_qp, s_rnr_timer); + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + rvt_stop_rnr_timer(qp); + trace_rvt_rnrnak_timeout(qp, 0); + rdi->driver_f.schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + return HRTIMER_NORESTART; +} +EXPORT_SYMBOL(rvt_rc_rnr_retry); + +/** + * rvt_qp_iter_init - initial for QP iteration + * @rdi: rvt devinfo + * @v: u64 value + * + * This returns an iterator suitable for iterating QPs + * in the system. + * + * The @cb is a user defined callback and @v is a 64 + * bit value passed to and relevant for processing in the + * @cb. An example use case would be to alter QP processing + * based on criteria not part of the rvt_qp. + * + * Use cases that require memory allocation to succeed + * must preallocate appropriately. + * + * Return: a pointer to an rvt_qp_iter or NULL + */ +struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)) +{ + struct rvt_qp_iter *i; + + i = kzalloc(sizeof(*i), GFP_KERNEL); + if (!i) + return NULL; + + i->rdi = rdi; + /* number of special QPs (SMI/GSI) for device */ + i->specials = rdi->ibdev.phys_port_cnt * 2; + i->v = v; + i->cb = cb; + + return i; +} +EXPORT_SYMBOL(rvt_qp_iter_init); + +/** + * rvt_qp_iter_next - return the next QP in iter + * @iter - the iterator + * + * Fine grained QP iterator suitable for use + * with debugfs seq_file mechanisms. + * + * Updates iter->qp with the current QP when the return + * value is 0. + * + * Return: 0 - iter->qp is valid 1 - no more QPs + */ +int rvt_qp_iter_next(struct rvt_qp_iter *iter) + __must_hold(RCU) +{ + int n = iter->n; + int ret = 1; + struct rvt_qp *pqp = iter->qp; + struct rvt_qp *qp; + struct rvt_dev_info *rdi = iter->rdi; + + /* + * The approach is to consider the special qps + * as additional table entries before the + * real hash table. Since the qp code sets + * the qp->next hash link to NULL, this works just fine. 
+ * + * iter->specials is 2 * # ports + * + * n = 0..iter->specials is the special qp indices + * + * n = iter->specials..rdi->qp_dev->qp_table_size+iter->specials are + * the potential hash bucket entries + * + */ + for (; n < rdi->qp_dev->qp_table_size + iter->specials; n++) { + if (pqp) { + qp = rcu_dereference(pqp->next); + } else { + if (n < iter->specials) { + struct rvt_ibport *rvp; + int pidx; + + pidx = n % rdi->ibdev.phys_port_cnt; + rvp = rdi->ports[pidx]; + qp = rcu_dereference(rvp->qp[n & 1]); + } else { + qp = rcu_dereference( + rdi->qp_dev->qp_table[ + (n - iter->specials)]); + } + } + pqp = qp; + if (qp) { + iter->qp = qp; + iter->n = n; + return 0; + } + } + return ret; +} +EXPORT_SYMBOL(rvt_qp_iter_next); + +/** + * rvt_qp_iter - iterate all QPs + * @rdi - rvt devinfo + * @v - a 64 bit value + * @cb - a callback + * + * This provides a way for iterating all QPs. + * + * The @cb is a user defined callback and @v is a 64 + * bit value passed to and relevant for processing in the + * cb. An example use case would be to alter QP processing + * based on criteria not part of the rvt_qp. + * + * The code has an internal iterator to simplify + * non seq_file use cases. + */ +void rvt_qp_iter(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)) +{ + int ret; + struct rvt_qp_iter i = { + .rdi = rdi, + .specials = rdi->ibdev.phys_port_cnt * 2, + .v = v, + .cb = cb + }; + + rcu_read_lock(); + do { + ret = rvt_qp_iter_next(&i); + if (!ret) { + rvt_get_qp(i.qp); + rcu_read_unlock(); + i.cb(i.qp, i.v); + rcu_read_lock(); + rvt_put_qp(i.qp); + } + } while (!ret); + rcu_read_unlock(); +} +EXPORT_SYMBOL(rvt_qp_iter); diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h new file mode 100644 index 0000000..8409f80 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -0,0 +1,69 @@ +#ifndef DEF_RVTQP_H +#define DEF_RVTQP_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +int rvt_driver_qp_init(struct rvt_dev_info *rdi); +void rvt_qp_exit(struct rvt_dev_info *rdi); +struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int rvt_destroy_qp(struct ib_qp *ibqp); +int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); +int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +#endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c new file mode 100644 index 0000000..6131cc5 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/rc.c @@ -0,0 +1,189 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +/* + * Convert the AETH credit code into the number of credits. + */ +static const u16 credit_table[31] = { + 0, /* 0 */ + 1, /* 1 */ + 2, /* 2 */ + 3, /* 3 */ + 4, /* 4 */ + 6, /* 5 */ + 8, /* 6 */ + 12, /* 7 */ + 16, /* 8 */ + 24, /* 9 */ + 32, /* A */ + 48, /* B */ + 64, /* C */ + 96, /* D */ + 128, /* E */ + 192, /* F */ + 256, /* 10 */ + 384, /* 11 */ + 512, /* 12 */ + 768, /* 13 */ + 1024, /* 14 */ + 1536, /* 15 */ + 2048, /* 16 */ + 3072, /* 17 */ + 4096, /* 18 */ + 6144, /* 19 */ + 8192, /* 1A */ + 12288, /* 1B */ + 16384, /* 1C */ + 24576, /* 1D */ + 32768 /* 1E */ +}; + +/** + * rvt_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 rvt_compute_aeth(struct rvt_qp *qp) +{ + u32 aeth = qp->r_msn & IB_MSN_MASK; + + if (qp->ibqp.srq) { + /* + * Shared receive queues don't generate credits. + * Set the credit field to the invalid value. + */ + aeth |= IB_AETH_CREDIT_INVAL << IB_AETH_CREDIT_SHIFT; + } else { + u32 min, max, x; + u32 credits; + struct rvt_rwq *wq = qp->r_rq.wq; + u32 head; + u32 tail; + + /* sanity check pointers before trusting them */ + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * There is a small chance that the pair of reads are + * not atomic, which is OK, since the fuzziness is + * resolved as further ACKs go out. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; + /* + * Binary search the credit table to find the code to + * use. + */ + min = 0; + max = 31; + for (;;) { + x = (min + max) / 2; + if (credit_table[x] == credits) + break; + if (credit_table[x] > credits) { + max = x; + } else { + if (min == x) + break; + min = x; + } + } + aeth |= x << IB_AETH_CREDIT_SHIFT; + } + return cpu_to_be32(aeth); +} +EXPORT_SYMBOL(rvt_compute_aeth); + +/** + * rvt_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void rvt_get_credit(struct rvt_qp *qp, u32 aeth) +{ + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + u32 credit = (aeth >> IB_AETH_CREDIT_SHIFT) & IB_AETH_CREDIT_MASK; + + lockdep_assert_held(&qp->s_lock); + /* + * If the credit is invalid, we can send + * as many packets as we like. Otherwise, we have to + * honor the credit field. 
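rvt_compute_aeth() above binary-searches the 31-entry credit table to turn a free-RWQE count into an AETH credit code. The standalone sketch below repeats that table and search in userspace to show which code a few sample counts map to:

#include <stdio.h>
#include <stdint.h>

/* Same 31-entry table as credit_table[] above: entry i is the credit
 * count encoded by AETH credit code i. */
static const uint16_t credit_table[31] = {
        0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192,
        256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144,
        8192, 12288, 16384, 24576, 32768
};

/* Binary search mirroring rvt_compute_aeth(): picks the largest code whose
 * credit value does not exceed the free RWQE count. */
static unsigned credits_to_code(unsigned credits)
{
        unsigned min = 0, max = 31, x;

        for (;;) {
                x = (min + max) / 2;
                if (credit_table[x] == credits)
                        break;
                if (credit_table[x] > credits) {
                        max = x;
                } else {
                        if (min == x)
                                break;
                        min = x;
                }
        }
        return x;
}

int main(void)
{
        unsigned samples[] = { 0, 5, 100, 40000 };
        unsigned i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("%5u free RWQEs -> AETH credit code 0x%x\n",
                       samples[i], credits_to_code(samples[i]));
        return 0;
}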
+ */ + if (credit == IB_AETH_CREDIT_INVAL) { + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) { + qp->s_flags |= RVT_S_UNLIMITED_CREDIT; + if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT; + rdi->driver_f.schedule_send(qp); + } + } + } else if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) { + /* Compute new LSN (i.e., MSN + credit) */ + credit = (aeth + credit_table[credit]) & IB_MSN_MASK; + if (rvt_cmp_msn(credit, qp->s_lsn) > 0) { + qp->s_lsn = credit; + if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT; + rdi->driver_f.schedule_send(qp); + } + } + } +} +EXPORT_SYMBOL(rvt_get_credit); diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c new file mode 100644 index 0000000..3707952 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -0,0 +1,355 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +#include "srq.h" +#include "vt.h" + +/** + * rvt_driver_srq_init - init srq resources on a per driver basis + * @rdi: rvt dev structure + * + * Do any initialization needed when a driver registers with rdmavt. 
+ */ +void rvt_driver_srq_init(struct rvt_dev_info *rdi) +{ + spin_lock_init(&rdi->n_srqs_lock); + rdi->n_srqs_allocated = 0; +} + +/** + * rvt_create_srq - create a shared receive queue + * @ibpd: the protection domain of the SRQ to create + * @srq_init_attr: the attributes of the SRQ + * @udata: data from libibverbs when creating a user SRQ + * + * Return: Allocated srq object + */ +struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct rvt_dev_info *dev = ib_to_rvt(ibpd->device); + struct rvt_srq *srq; + u32 sz; + struct ib_srq *ret; + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); + + if (srq_init_attr->attr.max_sge == 0 || + srq_init_attr->attr.max_sge > dev->dparms.props.max_srq_sge || + srq_init_attr->attr.max_wr == 0 || + srq_init_attr->attr.max_wr > dev->dparms.props.max_srq_wr) + return ERR_PTR(-EINVAL); + + srq = kzalloc_node(sizeof(*srq), GFP_KERNEL, dev->dparms.node); + if (!srq) + return ERR_PTR(-ENOMEM); + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + srq->rq.size = srq_init_attr->attr.max_wr + 1; + srq->rq.max_sge = srq_init_attr->attr.max_sge; + sz = sizeof(struct ib_sge) * srq->rq.max_sge + + sizeof(struct rvt_rwqe); + srq->rq.wq = udata ? + vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) : + vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz, + dev->dparms.node); + if (!srq->rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_srq; + } + + /* + * Return the address of the RWQ as the offset to mmap. + * See rvt_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + u32 s = sizeof(struct rvt_rwq) + srq->rq.size * sz; + + srq->ip = + rvt_create_mmap_info(dev, s, ibpd->uobject->context, + srq->rq.wq); + if (!srq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wq; + } + + err = ib_copy_to_udata(udata, &srq->ip->offset, + sizeof(srq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } + + /* + * ib_create_srq() will initialize srq->ibsrq. + */ + spin_lock_init(&srq->rq.lock); + srq->limit = srq_init_attr->attr.srq_limit; + + spin_lock(&dev->n_srqs_lock); + if (dev->n_srqs_allocated == dev->dparms.props.max_srq) { + spin_unlock(&dev->n_srqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_srqs_allocated++; + spin_unlock(&dev->n_srqs_lock); + + if (srq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + return &srq->ibsrq; + +bail_ip: + kfree(srq->ip); +bail_wq: + vfree(srq->rq.wq); +bail_srq: + kfree(srq); + return ret; +} + +/** + * rvt_modify_srq - modify a shared receive queue + * @ibsrq: the SRQ to modify + * @attr: the new attributes of the SRQ + * @attr_mask: indicates which attributes to modify + * @udata: user data for libibverbs.so + * + * Return: 0 on success + */ +int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) +{ + struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); + struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device); + struct rvt_rwq *wq; + int ret = 0; + + if (attr_mask & IB_SRQ_MAX_WR) { + struct rvt_rwq *owq; + struct rvt_rwqe *p; + u32 sz, size, n, head, tail; + + /* Check that the requested sizes are below the limits. */ + if ((attr->max_wr > dev->dparms.props.max_srq_wr) || + ((attr_mask & IB_SRQ_LIMIT) ? 
+ attr->srq_limit : srq->limit) > attr->max_wr) + return -EINVAL; + + sz = sizeof(struct rvt_rwqe) + + srq->rq.max_sge * sizeof(struct ib_sge); + size = attr->max_wr + 1; + wq = udata ? + vmalloc_user(sizeof(struct rvt_rwq) + size * sz) : + vzalloc_node(sizeof(struct rvt_rwq) + size * sz, + dev->dparms.node); + if (!wq) + return -ENOMEM; + + /* Check that we can write the offset to mmap. */ + if (udata && udata->inlen >= sizeof(__u64)) { + __u64 offset_addr; + __u64 offset = 0; + + ret = ib_copy_from_udata(&offset_addr, udata, + sizeof(offset_addr)); + if (ret) + goto bail_free; + udata->outbuf = (void __user *) + (unsigned long)offset_addr; + ret = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&srq->rq.lock); + /* + * validate head and tail pointer values and compute + * the number of remaining WQEs. + */ + owq = srq->rq.wq; + head = owq->head; + tail = owq->tail; + if (head >= srq->rq.size || tail >= srq->rq.size) { + ret = -EINVAL; + goto bail_unlock; + } + n = head; + if (n < tail) + n += srq->rq.size - tail; + else + n -= tail; + if (size <= n) { + ret = -EINVAL; + goto bail_unlock; + } + n = 0; + p = wq->wq; + while (tail != head) { + struct rvt_rwqe *wqe; + int i; + + wqe = rvt_get_rwqe_ptr(&srq->rq, tail); + p->wr_id = wqe->wr_id; + p->num_sge = wqe->num_sge; + for (i = 0; i < wqe->num_sge; i++) + p->sg_list[i] = wqe->sg_list[i]; + n++; + p = (struct rvt_rwqe *)((char *)p + sz); + if (++tail >= srq->rq.size) + tail = 0; + } + srq->rq.wq = wq; + srq->rq.size = size; + wq->head = n; + wq->tail = 0; + if (attr_mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + + vfree(owq); + + if (srq->ip) { + struct rvt_mmap_info *ip = srq->ip; + struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device); + u32 s = sizeof(struct rvt_rwq) + size * sz; + + rvt_update_mmap_info(dev, ip, s, wq); + + /* + * Return the offset to mmap. + * See rvt_mmap() for details. + */ + if (udata && udata->inlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + return ret; + } + + /* + * Put user mapping info onto the pending list + * unless it already is on the list. 
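The resize path above copies the outstanding WQEs from the old SRQ ring, tail to head, into the front of the newly allocated ring, after which the new ring starts with tail = 0 and head equal to the copy count. A toy userspace model of that copy, with plain integers standing in for the variable-size rvt_rwqe entries:

#include <stdio.h>

/* Toy version of the SRQ resize in rvt_modify_srq(): live entries move from
 * the old ring, tail to head, into the front of the new ring. */
static unsigned resize_ring(const int *old, unsigned old_size,
                            unsigned head, unsigned tail,
                            int *new_ring, unsigned new_size)
{
        unsigned n = 0;

        while (tail != head && n < new_size) {
                new_ring[n++] = old[tail];
                if (++tail >= old_size)
                        tail = 0;
        }
        return n;                 /* becomes the new head index */
}

int main(void)
{
        int old[4] = { 10, 11, 12, 13 };
        int bigger[8];
        unsigned new_head;

        /* ring with head = 1, tail = 2: entries 12, 13, 10 are outstanding */
        new_head = resize_ring(old, 4, 1, 2, bigger, 8);
        printf("copied %u entries, first = %d\n", new_head, bigger[0]);
        return 0;
}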
+ */ + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, + &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + } else if (attr_mask & IB_SRQ_LIMIT) { + spin_lock_irq(&srq->rq.lock); + if (attr->srq_limit >= srq->rq.size) + ret = -EINVAL; + else + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + } + return ret; + +bail_unlock: + spin_unlock_irq(&srq->rq.lock); +bail_free: + vfree(wq); + return ret; +} + +/** rvt_query_srq - query srq data + * @ibsrq: srq to query + * @attr: return info in attr + * + * Return: always 0 + */ +int rvt_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); + + attr->max_wr = srq->rq.size - 1; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +/** + * rvt_destroy_srq - destory an srq + * @ibsrq: srq object to destroy + * + * Return always 0 + */ +int rvt_destroy_srq(struct ib_srq *ibsrq) +{ + struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); + struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device); + + spin_lock(&dev->n_srqs_lock); + dev->n_srqs_allocated--; + spin_unlock(&dev->n_srqs_lock); + if (srq->ip) + kref_put(&srq->ip->ref, rvt_release_mmap_info); + else + vfree(srq->rq.wq); + kfree(srq); + + return 0; +} diff --git a/drivers/infiniband/sw/rdmavt/srq.h b/drivers/infiniband/sw/rdmavt/srq.h new file mode 100644 index 0000000..bf0eaaf --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/srq.h @@ -0,0 +1,62 @@ +#ifndef DEF_RVTSRQ_H +#define DEF_RVTSRQ_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +void rvt_driver_srq_init(struct rvt_dev_info *rdi); +struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); +int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata); +int rvt_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); +int rvt_destroy_srq(struct ib_srq *ibsrq); + +#endif /* DEF_RVTSRQ_H */ diff --git a/drivers/infiniband/sw/rdmavt/trace.c b/drivers/infiniband/sw/rdmavt/trace.c new file mode 100644 index 0000000..d593285 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/trace.c @@ -0,0 +1,49 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/drivers/infiniband/sw/rdmavt/trace.h b/drivers/infiniband/sw/rdmavt/trace.h new file mode 100644 index 0000000..36ddbd2 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/trace.h @@ -0,0 +1,56 @@ +/* + * Copyright(c) 2016, 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define RDI_DEV_ENTRY(rdi) __string(dev, rvt_get_ibdev_name(rdi)) +#define RDI_DEV_ASSIGN(rdi) __assign_str(dev, rvt_get_ibdev_name(rdi)) + +#include "trace_rvt.h" +#include "trace_qp.h" +#include "trace_tx.h" +#include "trace_mr.h" +#include "trace_cq.h" +#include "trace_rc.h" diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h b/drivers/infiniband/sw/rdmavt/trace_cq.h new file mode 100644 index 0000000..a315850 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/trace_cq.h @@ -0,0 +1,127 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#if !defined(__RVT_TRACE_CQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define __RVT_TRACE_CQ_H + +#include +#include + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rvt_cq + +#define wc_opcode_name(opcode) { IB_WC_##opcode, #opcode } +#define show_wc_opcode(opcode) \ +__print_symbolic(opcode, \ + wc_opcode_name(SEND), \ + wc_opcode_name(RDMA_WRITE), \ + wc_opcode_name(RDMA_READ), \ + wc_opcode_name(COMP_SWAP), \ + wc_opcode_name(FETCH_ADD), \ + wc_opcode_name(LSO), \ + wc_opcode_name(LOCAL_INV), \ + wc_opcode_name(REG_MR), \ + wc_opcode_name(MASKED_COMP_SWAP), \ + wc_opcode_name(RECV), \ + wc_opcode_name(RECV_RDMA_WITH_IMM)) + +#define CQ_PRN \ +"[%s] idx %u wr_id %llx status %u opcode %u,%s length %u qpn %x" + +DECLARE_EVENT_CLASS( + rvt_cq_entry_template, + TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx), + TP_ARGS(cq, wc, idx), + TP_STRUCT__entry( + RDI_DEV_ENTRY(cq->rdi) + __field(u64, wr_id) + __field(u32, status) + __field(u32, opcode) + __field(u32, qpn) + __field(u32, length) + __field(u32, idx) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(cq->rdi) + __entry->wr_id = wc->wr_id; + __entry->status = wc->status; + __entry->opcode = wc->opcode; + __entry->length = wc->byte_len; + __entry->qpn = wc->qp->qp_num; + __entry->idx = idx; + ), + TP_printk( + CQ_PRN, + __get_str(dev), + __entry->idx, + __entry->wr_id, + __entry->status, + __entry->opcode, show_wc_opcode(__entry->opcode), + __entry->length, + __entry->qpn + ) +); + +DEFINE_EVENT( + rvt_cq_entry_template, rvt_cq_enter, + TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx), + TP_ARGS(cq, wc, idx)); + +DEFINE_EVENT( + rvt_cq_entry_template, rvt_cq_poll, + TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx), + TP_ARGS(cq, wc, idx)); + +#endif /* __RVT_TRACE_CQ_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_cq +#include diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h new file mode 100644 index 0000000..976e482 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/trace_mr.h @@ -0,0 +1,174 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#if !defined(__RVT_TRACE_MR_H) || defined(TRACE_HEADER_MULTI_READ) +#define __RVT_TRACE_MR_H + +#include +#include + +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rvt_mr +DECLARE_EVENT_CLASS( + rvt_mr_template, + TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len), + TP_ARGS(mr, m, n, v, len), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(mr->pd->device)) + __field(void *, vaddr) + __field(struct page *, page) + __field(size_t, len) + __field(u32, lkey) + __field(u16, m) + __field(u16, n) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(mr->pd->device)); + __entry->vaddr = v; + __entry->page = virt_to_page(v); + __entry->m = m; + __entry->n = n; + __entry->len = len; + ), + TP_printk( + "[%s] vaddr %p page %p m %u n %u len %ld", + __get_str(dev), + __entry->vaddr, + __entry->page, + __entry->m, + __entry->n, + __entry->len + ) +); + +DEFINE_EVENT( + rvt_mr_template, rvt_mr_page_seg, + TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len), + TP_ARGS(mr, m, n, v, len)); + +DEFINE_EVENT( + rvt_mr_template, rvt_mr_fmr_seg, + TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len), + TP_ARGS(mr, m, n, v, len)); + +DEFINE_EVENT( + rvt_mr_template, rvt_mr_user_seg, + TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len), + TP_ARGS(mr, m, n, v, len)); + +DECLARE_EVENT_CLASS( + rvt_sge_template, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(sge->mr->pd->device)) + __field(struct rvt_mregion *, mr) + __field(struct rvt_sge *, sge) + __field(struct ib_sge *, isge) + __field(void *, vaddr) + __field(u64, ivaddr) + __field(u32, lkey) + __field(u32, sge_length) + __field(u32, length) + __field(u32, ilength) + __field(int, user) + __field(u16, m) + __field(u16, n) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(sge->mr->pd->device)); + __entry->mr = sge->mr; + __entry->sge = sge; + __entry->isge = isge; + __entry->vaddr = sge->vaddr; + __entry->ivaddr = isge->addr; + __entry->lkey = sge->mr->lkey; + __entry->sge_length = sge->sge_length; + __entry->length = sge->length; + __entry->ilength = isge->length; + __entry->m = sge->m; + __entry->n = sge->m; + __entry->user = ibpd_to_rvtpd(sge->mr->pd)->user; + ), + TP_printk( + "[%s] mr %p sge %p isge %p vaddr %p ivaddr %llx lkey %x sge_length %u length %u ilength %u m %u n %u user %u", + __get_str(dev), + __entry->mr, + __entry->sge, + __entry->isge, + __entry->vaddr, + __entry->ivaddr, + __entry->lkey, + __entry->sge_length, + __entry->length, + __entry->ilength, + __entry->m, + __entry->n, + __entry->user + ) +); + +DEFINE_EVENT( + rvt_sge_template, rvt_sge_adjacent, + TP_PROTO(struct rvt_sge *sge, 
struct ib_sge *isge), + TP_ARGS(sge, isge)); + +DEFINE_EVENT( + rvt_sge_template, rvt_sge_new, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge)); + +#endif /* __RVT_TRACE_MR_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_mr +#include diff --git a/drivers/infiniband/sw/rdmavt/trace_qp.h b/drivers/infiniband/sw/rdmavt/trace_qp.h new file mode 100644 index 0000000..efc9d81 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/trace_qp.h @@ -0,0 +1,138 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#if !defined(__RVT_TRACE_QP_H) || defined(TRACE_HEADER_MULTI_READ) +#define __RVT_TRACE_QP_H + +#include +#include + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rvt_qp + +DECLARE_EVENT_CLASS(rvt_qphash_template, + TP_PROTO(struct rvt_qp *qp, u32 bucket), + TP_ARGS(qp, bucket), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, bucket) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->bucket = bucket; + ), + TP_printk( + "[%s] qpn 0x%x bucket %u", + __get_str(dev), + __entry->qpn, + __entry->bucket + ) +); + +DEFINE_EVENT(rvt_qphash_template, rvt_qpinsert, + TP_PROTO(struct rvt_qp *qp, u32 bucket), + TP_ARGS(qp, bucket)); + +DEFINE_EVENT(rvt_qphash_template, rvt_qpremove, + TP_PROTO(struct rvt_qp *qp, u32 bucket), + TP_ARGS(qp, bucket)); + +DECLARE_EVENT_CLASS( + rvt_rnrnak_template, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(u32, qpn) + __field(void *, hrtimer) + __field(u32, s_flags) + __field(u32, to) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->hrtimer = &qp->s_rnr_timer; + __entry->s_flags = qp->s_flags; + __entry->to = to; + ), + TP_printk( + "[%s] qpn 0x%x hrtimer 0x%p s_flags 0x%x timeout %u us", + __get_str(dev), + __entry->qpn, + __entry->hrtimer, + __entry->s_flags, + __entry->to + ) +); + +DEFINE_EVENT( + rvt_rnrnak_template, rvt_rnrnak_add, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to)); + +DEFINE_EVENT( + rvt_rnrnak_template, rvt_rnrnak_timeout, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to)); + +DEFINE_EVENT( + rvt_rnrnak_template, rvt_rnrnak_stop, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to)); + +#endif /* __RVT_TRACE_QP_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_qp +#include + diff --git a/drivers/infiniband/sw/rdmavt/trace_rc.h b/drivers/infiniband/sw/rdmavt/trace_rc.h new file mode 100644 index 0000000..9952769 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/trace_rc.h @@ -0,0 +1,109 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#if !defined(__RVT_TRACE_RC_H) || defined(TRACE_HEADER_MULTI_READ) +#define __RVT_TRACE_RC_H + +#include +#include + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rvt_rc + +DECLARE_EVENT_CLASS(rvt_rc_template, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, s_flags) + __field(u32, psn) + __field(u32, s_psn) + __field(u32, s_next_psn) + __field(u32, s_sending_psn) + __field(u32, s_sending_hpsn) + __field(u32, r_psn) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + __entry->psn = psn; + __entry->s_psn = qp->s_psn; + __entry->s_next_psn = qp->s_next_psn; + __entry->s_sending_psn = qp->s_sending_psn; + __entry->s_sending_hpsn = qp->s_sending_hpsn; + __entry->r_psn = qp->r_psn; + ), + TP_printk( + "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->s_flags, + __entry->psn, + __entry->s_psn, + __entry->s_next_psn, + __entry->s_sending_psn, + __entry->s_sending_hpsn, + __entry->r_psn + ) +); + +DEFINE_EVENT(rvt_rc_template, rvt_rc_timeout, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +#endif /* __RVT_TRACE_RC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_rc +#include diff --git a/drivers/infiniband/sw/rdmavt/trace_rvt.h b/drivers/infiniband/sw/rdmavt/trace_rvt.h new file mode 100644 index 0000000..746f334 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/trace_rvt.h @@ -0,0 +1,81 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#if !defined(__RVT_TRACE_RVT_H) || defined(TRACE_HEADER_MULTI_READ) +#define __RVT_TRACE_RVT_H + +#include +#include + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rvt + +TRACE_EVENT(rvt_dbg, + TP_PROTO(struct rvt_dev_info *rdi, + const char *msg), + TP_ARGS(rdi, msg), + TP_STRUCT__entry( + RDI_DEV_ENTRY(rdi) + __string(msg, msg) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(rdi); + __assign_str(msg, msg); + ), + TP_printk("[%s]: %s", __get_str(dev), __get_str(msg)) +); + +#endif /* __RVT_TRACE_MISC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_rvt +#include + diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h b/drivers/infiniband/sw/rdmavt/trace_tx.h new file mode 100644 index 0000000..0ef25fc --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/trace_tx.h @@ -0,0 +1,163 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#if !defined(__RVT_TRACE_TX_H) || defined(TRACE_HEADER_MULTI_READ) +#define __RVT_TRACE_TX_H + +#include +#include + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rvt_tx + +#define wr_opcode_name(opcode) { IB_WR_##opcode, #opcode } +#define show_wr_opcode(opcode) \ +__print_symbolic(opcode, \ + wr_opcode_name(RDMA_WRITE), \ + wr_opcode_name(RDMA_WRITE_WITH_IMM), \ + wr_opcode_name(SEND), \ + wr_opcode_name(SEND_WITH_IMM), \ + wr_opcode_name(RDMA_READ), \ + wr_opcode_name(ATOMIC_CMP_AND_SWP), \ + wr_opcode_name(ATOMIC_FETCH_AND_ADD), \ + wr_opcode_name(LSO), \ + wr_opcode_name(SEND_WITH_INV), \ + wr_opcode_name(RDMA_READ_WITH_INV), \ + wr_opcode_name(LOCAL_INV), \ + wr_opcode_name(MASKED_ATOMIC_CMP_AND_SWP), \ + wr_opcode_name(MASKED_ATOMIC_FETCH_AND_ADD), \ + wr_opcode_name(RESERVED1), \ + wr_opcode_name(RESERVED2), \ + wr_opcode_name(RESERVED3), \ + wr_opcode_name(RESERVED4), \ + wr_opcode_name(RESERVED5), \ + wr_opcode_name(RESERVED6), \ + wr_opcode_name(RESERVED7), \ + wr_opcode_name(RESERVED8), \ + wr_opcode_name(RESERVED9), \ + wr_opcode_name(RESERVED10)) + +#define POS_PRN \ +"[%s] wqe %p wr_id %llx send_flags %x qpn %x qpt %u psn %x lpsn %x ssn %x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u pid %u num_sge %u wr_num_sge %u" + +TRACE_EVENT( + rvt_post_one_wr, + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, int wr_num_sge), + TP_ARGS(qp, wqe, wr_num_sge), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(u64, wr_id) + __field(struct rvt_swqe *, wqe) + __field(u32, qpn) + __field(u32, qpt) + __field(u32, psn) + __field(u32, lpsn) + __field(u32, length) + __field(u32, opcode) + __field(u32, size) + __field(u32, avail) + __field(u32, head) + __field(u32, last) + __field(u32, ssn) + __field(int, send_flags) + __field(pid_t, pid) + __field(int, num_sge) + __field(int, wr_num_sge) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->wqe = wqe; + __entry->wr_id = wqe->wr.wr_id; + __entry->qpn = qp->ibqp.qp_num; + __entry->qpt = qp->ibqp.qp_type; + __entry->psn = wqe->psn; + __entry->lpsn = wqe->lpsn; + __entry->length = wqe->length; + __entry->opcode = wqe->wr.opcode; + __entry->size = qp->s_size; + __entry->avail = qp->s_avail; + __entry->head = qp->s_head; + __entry->last = qp->s_last; + 
__entry->pid = qp->pid; + __entry->ssn = wqe->ssn; + __entry->send_flags = wqe->wr.send_flags; + __entry->num_sge = wqe->wr.num_sge; + __entry->wr_num_sge = wr_num_sge; + ), + TP_printk( + POS_PRN, + __get_str(dev), + __entry->wqe, + __entry->wr_id, + __entry->send_flags, + __entry->qpn, + __entry->qpt, + __entry->psn, + __entry->lpsn, + __entry->ssn, + __entry->length, + __entry->opcode, show_wr_opcode(__entry->opcode), + __entry->size, + __entry->avail, + __entry->head, + __entry->last, + __entry->pid, + __entry->num_sge, + __entry->wr_num_sge + ) +); + +#endif /* __RVT_TRACE_TX_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_tx +#include + diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c new file mode 100644 index 0000000..434199d --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -0,0 +1,899 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include "vt.h" +#include "trace.h" + +#define RVT_UVERBS_ABI_VERSION 2 + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("RDMA Verbs Transport Library"); + +static int rvt_init(void) +{ + /* + * rdmavt does not need to do anything special when it starts up. All it + * needs to do is sit and wait until a driver attempts registration. 
+ */ + return 0; +} +module_init(rvt_init); + +static void rvt_cleanup(void) +{ + /* + * Nothing to do at exit time either. The module won't be able to be + * removed until all drivers are gone which means all the dev structs + * are gone so there is really nothing to do. + */ +} +module_exit(rvt_cleanup); + +/** + * rvt_alloc_device - allocate rdi + * @size: how big of a structure to allocate + * @nports: number of ports to allocate array slots for + * + * Use IB core device alloc to allocate space for the rdi which is assumed to be + * inside of the ib_device. Any extra space that drivers require should be + * included in size. + * + * We also allocate a port array based on the number of ports. + * + * Return: pointer to allocated rdi + */ +struct rvt_dev_info *rvt_alloc_device(size_t size, int nports) +{ + struct rvt_dev_info *rdi; + + rdi = (struct rvt_dev_info *)ib_alloc_device(size); + if (!rdi) + return rdi; + + rdi->ports = kcalloc(nports, + sizeof(struct rvt_ibport **), + GFP_KERNEL); + if (!rdi->ports) + ib_dealloc_device(&rdi->ibdev); + + return rdi; +} +EXPORT_SYMBOL(rvt_alloc_device); + +/** + * rvt_dealloc_device - deallocate rdi + * @rdi: structure to free + * + * Free a structure allocated with rvt_alloc_device() + */ +void rvt_dealloc_device(struct rvt_dev_info *rdi) +{ + kfree(rdi->ports); + ib_dealloc_device(&rdi->ibdev); +} +EXPORT_SYMBOL(rvt_dealloc_device); + +static int rvt_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + /* + * Return rvt_dev_info.dparms.props contents + */ + *props = rdi->dparms.props; + return 0; +} + +static int rvt_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + /* + * There is currently no need to supply this based on qib and hfi1. + * Future drivers may need to implement this though. 
+ */ + + return -EOPNOTSUPP; +} + +/** + * rvt_query_port: Passes the query port call to the driver + * @ibdev: Verbs IB dev + * @port_num: port number, 1 based from ib core + * @props: structure to hold returned properties + * + * Return: 0 on success + */ +static int rvt_query_port(struct ib_device *ibdev, u8 port_num, + struct ib_port_attr *props) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct rvt_ibport *rvp; + int port_index = ibport_num_to_idx(ibdev, port_num); + + if (port_index < 0) + return -EINVAL; + + rvp = rdi->ports[port_index]; + /* props being zeroed by the caller, avoid zeroing it here */ + props->sm_lid = rvp->sm_lid; + props->sm_sl = rvp->sm_sl; + props->port_cap_flags = rvp->port_cap_flags; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = rvt_get_npkeys(rdi); + props->bad_pkey_cntr = rvp->pkey_violations; + props->qkey_viol_cntr = rvp->qkey_violations; + props->subnet_timeout = rvp->subnet_timeout; + props->init_type_reply = 0; + + /* Populate the remaining ib_port_attr elements */ + return rdi->driver_f.query_port_state(rdi, port_num, props); +} + +/** + * rvt_modify_port + * @ibdev: Verbs IB dev + * @port_num: Port number, 1 based from ib core + * @port_modify_mask: How to change the port + * @props: Structure to fill in + * + * Return: 0 on success + */ +static int rvt_modify_port(struct ib_device *ibdev, u8 port_num, + int port_modify_mask, struct ib_port_modify *props) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct rvt_ibport *rvp; + int ret = 0; + int port_index = ibport_num_to_idx(ibdev, port_num); + + if (port_index < 0) + return -EINVAL; + + rvp = rdi->ports[port_index]; + if (port_modify_mask & IB_PORT_OPA_MASK_CHG) { + rvp->port_cap3_flags |= props->set_port_cap_mask; + rvp->port_cap3_flags &= ~props->clr_port_cap_mask; + } else { + rvp->port_cap_flags |= props->set_port_cap_mask; + rvp->port_cap_flags &= ~props->clr_port_cap_mask; + } + + if (props->set_port_cap_mask || props->clr_port_cap_mask) + rdi->driver_f.cap_mask_chg(rdi, port_num); + if (port_modify_mask & IB_PORT_SHUTDOWN) + ret = rdi->driver_f.shut_down_port(rdi, port_num); + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + rvp->qkey_violations = 0; + + return ret; +} + +/** + * rvt_query_pkey - Return a pkey from the table at a given index + * @ibdev: Verbs IB dev + * @port_num: Port number, 1 based from ib core + * @index: Index into pkey table + * @pkey: returned pkey from the port pkey table + * + * Return: 0 on failure pkey otherwise + */ +static int rvt_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index, + u16 *pkey) +{ + /* + * Driver will be responsible for keeping rvt_dev_info.pkey_table up to + * date. This function will just return that value. There is no need to + * lock, if a stale value is read and sent to the user so be it there is + * no way to protect against that anyway. 
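 *
 * Editorial aside, a minimal illustrative sketch rather than part of this
 * patch: the table consulted here is the one the driver hands to
 * rvt_init_port(), usually seeded with the default partition key. Assuming
 * a hypothetical driver port structure and pkey array:
 *
 *	static struct rvt_ibport my_ibport;
 *	static u16 my_pkey_table[1] = { 0xffff };
 *
 *	rvt_init_port(rdi, &my_ibport, 0, my_pkey_table);
 *
 * rvt_query_pkey() then reports entries of that table through
 * rvt_get_pkey() with no further driver involvement.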
+ */ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + int port_index; + + port_index = ibport_num_to_idx(ibdev, port_num); + if (port_index < 0) + return -EINVAL; + + if (index >= rvt_get_npkeys(rdi)) + return -EINVAL; + + *pkey = rvt_get_pkey(rdi, port_index, index); + return 0; +} + +/** + * rvt_query_gid - Return a gid from the table + * @ibdev: Verbs IB dev + * @port_num: Port number, 1 based from ib core + * @guid_index: Index in table + * @gid: Gid to return + * + * Return: 0 on success + */ +static int rvt_query_gid(struct ib_device *ibdev, u8 port_num, + int guid_index, union ib_gid *gid) +{ + struct rvt_dev_info *rdi; + struct rvt_ibport *rvp; + int port_index; + + /* + * Driver is responsible for updating the guid table. Which will be used + * to craft the return value. This will work similar to how query_pkey() + * is being done. + */ + port_index = ibport_num_to_idx(ibdev, port_num); + if (port_index < 0) + return -EINVAL; + + rdi = ib_to_rvt(ibdev); + rvp = rdi->ports[port_index]; + + gid->global.subnet_prefix = rvp->gid_prefix; + + return rdi->driver_f.get_guid_be(rdi, rvp, guid_index, + &gid->global.interface_id); +} + +struct rvt_ucontext { + struct ib_ucontext ibucontext; +}; + +static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct rvt_ucontext, ibucontext); +} + +/** + * rvt_alloc_ucontext - Allocate a user context + * @ibdev: Verbs IB dev + * @udata: User data allocated + */ +static struct ib_ucontext *rvt_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct rvt_ucontext *context; + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + return &context->ibucontext; +} + +/** + *rvt_dealloc_ucontext - Free a user context + *@context - Free this + */ +static int rvt_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(to_iucontext(context)); + return 0; +} + +static int rvt_get_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct ib_port_attr attr; + int err, port_index; + + port_index = ibport_num_to_idx(ibdev, port_num); + if (port_index < 0) + return -EINVAL; + + immutable->core_cap_flags = rdi->dparms.core_cap_flags; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->max_mad_size = rdi->dparms.max_mad_size; + + return 0; +} + +enum { + MISC, + QUERY_DEVICE, + MODIFY_DEVICE, + QUERY_PORT, + MODIFY_PORT, + QUERY_PKEY, + QUERY_GID, + ALLOC_UCONTEXT, + DEALLOC_UCONTEXT, + GET_PORT_IMMUTABLE, + CREATE_QP, + MODIFY_QP, + DESTROY_QP, + QUERY_QP, + POST_SEND, + POST_RECV, + POST_SRQ_RECV, + CREATE_AH, + DESTROY_AH, + MODIFY_AH, + QUERY_AH, + CREATE_SRQ, + MODIFY_SRQ, + DESTROY_SRQ, + QUERY_SRQ, + ATTACH_MCAST, + DETACH_MCAST, + GET_DMA_MR, + REG_USER_MR, + DEREG_MR, + ALLOC_MR, + MAP_MR_SG, + ALLOC_FMR, + MAP_PHYS_FMR, + UNMAP_FMR, + DEALLOC_FMR, + MMAP, + CREATE_CQ, + DESTROY_CQ, + POLL_CQ, + REQ_NOTFIY_CQ, + RESIZE_CQ, + ALLOC_PD, + DEALLOC_PD, + _VERB_IDX_MAX /* Must always be last! 
*/ +}; + +static inline int check_driver_override(struct rvt_dev_info *rdi, + size_t offset, void *func) +{ + if (!*(void **)((void *)&rdi->ibdev + offset)) { + *(void **)((void *)&rdi->ibdev + offset) = func; + return 0; + } + + return 1; +} + +static noinline int check_support(struct rvt_dev_info *rdi, int verb) +{ + switch (verb) { + case MISC: + /* + * These functions are not part of verbs specifically but are + * required for rdmavt to function. + */ + if ((!rdi->driver_f.port_callback) || + (!rdi->driver_f.get_pci_dev)) + return -EINVAL; + break; + + case QUERY_DEVICE: + check_driver_override(rdi, offsetof(struct ib_device, + query_device), + rvt_query_device); + break; + + case MODIFY_DEVICE: + /* + * rdmavt does not support modify device currently drivers must + * provide. + */ + if (!check_driver_override(rdi, offsetof(struct ib_device, + modify_device), + rvt_modify_device)) + return -EOPNOTSUPP; + break; + + case QUERY_PORT: + if (!check_driver_override(rdi, offsetof(struct ib_device, + query_port), + rvt_query_port)) + if (!rdi->driver_f.query_port_state) + return -EINVAL; + break; + + case MODIFY_PORT: + if (!check_driver_override(rdi, offsetof(struct ib_device, + modify_port), + rvt_modify_port)) + if (!rdi->driver_f.cap_mask_chg || + !rdi->driver_f.shut_down_port) + return -EINVAL; + break; + + case QUERY_PKEY: + check_driver_override(rdi, offsetof(struct ib_device, + query_pkey), + rvt_query_pkey); + break; + + case QUERY_GID: + if (!check_driver_override(rdi, offsetof(struct ib_device, + query_gid), + rvt_query_gid)) + if (!rdi->driver_f.get_guid_be) + return -EINVAL; + break; + + case ALLOC_UCONTEXT: + check_driver_override(rdi, offsetof(struct ib_device, + alloc_ucontext), + rvt_alloc_ucontext); + break; + + case DEALLOC_UCONTEXT: + check_driver_override(rdi, offsetof(struct ib_device, + dealloc_ucontext), + rvt_dealloc_ucontext); + break; + + case GET_PORT_IMMUTABLE: + check_driver_override(rdi, offsetof(struct ib_device, + get_port_immutable), + rvt_get_port_immutable); + break; + + case CREATE_QP: + if (!check_driver_override(rdi, offsetof(struct ib_device, + create_qp), + rvt_create_qp)) + if (!rdi->driver_f.qp_priv_alloc || + !rdi->driver_f.qp_priv_free || + !rdi->driver_f.notify_qp_reset || + !rdi->driver_f.flush_qp_waiters || + !rdi->driver_f.stop_send_queue || + !rdi->driver_f.quiesce_qp) + return -EINVAL; + break; + + case MODIFY_QP: + if (!check_driver_override(rdi, offsetof(struct ib_device, + modify_qp), + rvt_modify_qp)) + if (!rdi->driver_f.notify_qp_reset || + !rdi->driver_f.schedule_send || + !rdi->driver_f.get_pmtu_from_attr || + !rdi->driver_f.flush_qp_waiters || + !rdi->driver_f.stop_send_queue || + !rdi->driver_f.quiesce_qp || + !rdi->driver_f.notify_error_qp || + !rdi->driver_f.mtu_from_qp || + !rdi->driver_f.mtu_to_path_mtu) + return -EINVAL; + break; + + case DESTROY_QP: + if (!check_driver_override(rdi, offsetof(struct ib_device, + destroy_qp), + rvt_destroy_qp)) + if (!rdi->driver_f.qp_priv_free || + !rdi->driver_f.notify_qp_reset || + !rdi->driver_f.flush_qp_waiters || + !rdi->driver_f.stop_send_queue || + !rdi->driver_f.quiesce_qp) + return -EINVAL; + break; + + case QUERY_QP: + check_driver_override(rdi, offsetof(struct ib_device, + query_qp), + rvt_query_qp); + break; + + case POST_SEND: + if (!check_driver_override(rdi, offsetof(struct ib_device, + post_send), + rvt_post_send)) + if (!rdi->driver_f.schedule_send || + !rdi->driver_f.do_send || + !rdi->post_parms) + return -EINVAL; + break; + + case POST_RECV: + check_driver_override(rdi, 
offsetof(struct ib_device, + post_recv), + rvt_post_recv); + break; + case POST_SRQ_RECV: + check_driver_override(rdi, offsetof(struct ib_device, + post_srq_recv), + rvt_post_srq_recv); + break; + + case CREATE_AH: + check_driver_override(rdi, offsetof(struct ib_device, + create_ah), + rvt_create_ah); + break; + + case DESTROY_AH: + check_driver_override(rdi, offsetof(struct ib_device, + destroy_ah), + rvt_destroy_ah); + break; + + case MODIFY_AH: + check_driver_override(rdi, offsetof(struct ib_device, + modify_ah), + rvt_modify_ah); + break; + + case QUERY_AH: + check_driver_override(rdi, offsetof(struct ib_device, + query_ah), + rvt_query_ah); + break; + + case CREATE_SRQ: + check_driver_override(rdi, offsetof(struct ib_device, + create_srq), + rvt_create_srq); + break; + + case MODIFY_SRQ: + check_driver_override(rdi, offsetof(struct ib_device, + modify_srq), + rvt_modify_srq); + break; + + case DESTROY_SRQ: + check_driver_override(rdi, offsetof(struct ib_device, + destroy_srq), + rvt_destroy_srq); + break; + + case QUERY_SRQ: + check_driver_override(rdi, offsetof(struct ib_device, + query_srq), + rvt_query_srq); + break; + + case ATTACH_MCAST: + check_driver_override(rdi, offsetof(struct ib_device, + attach_mcast), + rvt_attach_mcast); + break; + + case DETACH_MCAST: + check_driver_override(rdi, offsetof(struct ib_device, + detach_mcast), + rvt_detach_mcast); + break; + + case GET_DMA_MR: + check_driver_override(rdi, offsetof(struct ib_device, + get_dma_mr), + rvt_get_dma_mr); + break; + + case REG_USER_MR: + check_driver_override(rdi, offsetof(struct ib_device, + reg_user_mr), + rvt_reg_user_mr); + break; + + case DEREG_MR: + check_driver_override(rdi, offsetof(struct ib_device, + dereg_mr), + rvt_dereg_mr); + break; + + case ALLOC_FMR: + check_driver_override(rdi, offsetof(struct ib_device, + alloc_fmr), + rvt_alloc_fmr); + break; + + case ALLOC_MR: + check_driver_override(rdi, offsetof(struct ib_device, + alloc_mr), + rvt_alloc_mr); + break; + + case MAP_MR_SG: + check_driver_override(rdi, offsetof(struct ib_device, + map_mr_sg), + rvt_map_mr_sg); + break; + + case MAP_PHYS_FMR: + check_driver_override(rdi, offsetof(struct ib_device, + map_phys_fmr), + rvt_map_phys_fmr); + break; + + case UNMAP_FMR: + check_driver_override(rdi, offsetof(struct ib_device, + unmap_fmr), + rvt_unmap_fmr); + break; + + case DEALLOC_FMR: + check_driver_override(rdi, offsetof(struct ib_device, + dealloc_fmr), + rvt_dealloc_fmr); + break; + + case MMAP: + check_driver_override(rdi, offsetof(struct ib_device, + mmap), + rvt_mmap); + break; + + case CREATE_CQ: + check_driver_override(rdi, offsetof(struct ib_device, + create_cq), + rvt_create_cq); + break; + + case DESTROY_CQ: + check_driver_override(rdi, offsetof(struct ib_device, + destroy_cq), + rvt_destroy_cq); + break; + + case POLL_CQ: + check_driver_override(rdi, offsetof(struct ib_device, + poll_cq), + rvt_poll_cq); + break; + + case REQ_NOTFIY_CQ: + check_driver_override(rdi, offsetof(struct ib_device, + req_notify_cq), + rvt_req_notify_cq); + break; + + case RESIZE_CQ: + check_driver_override(rdi, offsetof(struct ib_device, + resize_cq), + rvt_resize_cq); + break; + + case ALLOC_PD: + check_driver_override(rdi, offsetof(struct ib_device, + alloc_pd), + rvt_alloc_pd); + break; + + case DEALLOC_PD: + check_driver_override(rdi, offsetof(struct ib_device, + dealloc_pd), + rvt_dealloc_pd); + break; + + default: + return -EINVAL; + } + + return 0; +} + +/** + * rvt_register_device - register a driver + * @rdi: main dev structure for all of rdmavt 
operations + * + * It is up to drivers to allocate the rdi and fill in the appropriate + * information. + * + * Return: 0 on success otherwise an errno. + */ +int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) +{ + int ret = 0, i; + + if (!rdi) + return -EINVAL; + + /* + * Check to ensure drivers have setup the required helpers for the verbs + * they want rdmavt to handle + */ + for (i = 0; i < _VERB_IDX_MAX; i++) + if (check_support(rdi, i)) { + pr_err("Driver support req not met at %d\n", i); + return -EINVAL; + } + + + /* Once we get past here we can use rvt_pr macros and tracepoints */ + trace_rvt_dbg(rdi, "Driver attempting registration"); + rvt_mmap_init(rdi); + + /* Queue Pairs */ + ret = rvt_driver_qp_init(rdi); + if (ret) { + pr_err("Error in driver QP init.\n"); + return -EINVAL; + } + + /* Address Handle */ + spin_lock_init(&rdi->n_ahs_lock); + rdi->n_ahs_allocated = 0; + + /* Shared Receive Queue */ + rvt_driver_srq_init(rdi); + + /* Multicast */ + rvt_driver_mcast_init(rdi); + + /* Mem Region */ + ret = rvt_driver_mr_init(rdi); + if (ret) { + pr_err("Error in driver MR init.\n"); + goto bail_no_mr; + } + + /* Completion queues */ + ret = rvt_driver_cq_init(rdi); + if (ret) { + pr_err("Error in driver CQ init.\n"); + goto bail_mr; + } + + /* DMA Operations */ + rdi->ibdev.dev.dma_ops = rdi->ibdev.dev.dma_ops ? : &dma_virt_ops; + + /* Protection Domain */ + spin_lock_init(&rdi->n_pds_lock); + rdi->n_pds_allocated = 0; + + /* + * There are some things which could be set by underlying drivers but + * really should be up to rdmavt to set. For instance drivers can't know + * exactly which functions rdmavt supports, nor do they know the ABI + * version, so we do all of this sort of stuff here. + */ + rdi->ibdev.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION; + rdi->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + rdi->ibdev.node_type = RDMA_NODE_IB_CA; + rdi->ibdev.num_comp_vectors = 1; + + rdi->ibdev.driver_id = driver_id; + /* We are now good to announce we exist */ + ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); + if (ret) { + rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); + goto bail_cq; + } + + rvt_create_mad_agents(rdi); + + rvt_pr_info(rdi, "Registration with rdmavt done.\n"); + return ret; + 
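/*
 * Editorial sketch, not part of this patch: how a driver is expected to use
 * the registration path above, under stated assumptions. The rvt_dev_info is
 * embedded at the start of the driver's private device structure, any extra
 * driver state is covered by the size passed to rvt_alloc_device(), and the
 * helpers that check_support() treats as mandatory are set before calling
 * rvt_register_device(). All driver-side names below (my_dev, my_ibport,
 * my_pkeys, my_port_callback, my_get_pci_dev) are hypothetical.
 */
#include <rdma/rdma_vt.h>

struct my_dev {
	struct rvt_dev_info rdi;	/* must be first for the embedding cast */
	/* driver-private state follows */
};

static struct rvt_ibport my_ibport;
static u16 my_pkeys[1] = { 0xffff };	/* default partition key */

static int my_driver_register(void)
{
	struct rvt_dev_info *rdi;
	int ret;

	rdi = rvt_alloc_device(sizeof(struct my_dev), 1);	/* one port */
	if (!rdi)
		return -ENOMEM;

	/* Helpers required unconditionally by check_support(MISC). */
	rdi->driver_f.port_callback = my_port_callback;	/* driver-defined */
	rdi->driver_f.get_pci_dev = my_get_pci_dev;	/* driver-defined */
	/*
	 * Leave an ib_device verb NULL to accept the rdmavt default, or fill
	 * it in to override (see check_driver_override()). Some defaults need
	 * extra driver_f helpers, e.g. query_port_state for rvt_query_port()
	 * and qp_priv_alloc()/quiesce_qp() and friends for the QP verbs;
	 * dparms (device props, QPN table sizes, ...) must also be filled in
	 * and are omitted here.
	 */

	rdi->dparms.nports = 1;
	rvt_init_port(rdi, &my_ibport, 0, my_pkeys);

	ret = rvt_register_device(rdi, RDMA_DRIVER_UNKNOWN);
	if (ret)
		rvt_dealloc_device(rdi);
	return ret;
}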
+bail_cq: + rvt_cq_exit(rdi); + +bail_mr: + rvt_mr_exit(rdi); + +bail_no_mr: + rvt_qp_exit(rdi); + + return ret; +} +EXPORT_SYMBOL(rvt_register_device); + +/** + * rvt_unregister_device - remove a driver + * @rdi: rvt dev struct + */ +void rvt_unregister_device(struct rvt_dev_info *rdi) +{ + trace_rvt_dbg(rdi, "Driver is unregistering."); + if (!rdi) + return; + + rvt_free_mad_agents(rdi); + + ib_unregister_device(&rdi->ibdev); + rvt_cq_exit(rdi); + rvt_mr_exit(rdi); + rvt_qp_exit(rdi); +} +EXPORT_SYMBOL(rvt_unregister_device); + +/** + * rvt_init_port - init internal data for driver port + * @rdi: rvt dev strut + * @port: rvt port + * @port_index: 0 based index of ports, different from IB core port num + * + * Keep track of a list of ports. No need to have a detach port. + * They persist until the driver goes away. + * + * Return: always 0 + */ +int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, + int port_index, u16 *pkey_table) +{ + + rdi->ports[port_index] = port; + rdi->ports[port_index]->pkey_table = pkey_table; + + return 0; +} +EXPORT_SYMBOL(rvt_init_port); diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h new file mode 100644 index 0000000..0675ea6 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/vt.h @@ -0,0 +1,102 @@ +#ifndef DEF_RDMAVT_H +#define DEF_RDMAVT_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#include "pd.h" +#include "qp.h" +#include "ah.h" +#include "mr.h" +#include "srq.h" +#include "mcast.h" +#include "mmap.h" +#include "cq.h" +#include "mad.h" + +#define rvt_pr_info(rdi, fmt, ...) \ + __rvt_pr_info(rdi->driver_f.get_pci_dev(rdi), \ + rvt_get_ibdev_name(rdi), \ + fmt, \ + ##__VA_ARGS__) + +#define rvt_pr_warn(rdi, fmt, ...) \ + __rvt_pr_warn(rdi->driver_f.get_pci_dev(rdi), \ + rvt_get_ibdev_name(rdi), \ + fmt, \ + ##__VA_ARGS__) + +#define rvt_pr_err(rdi, fmt, ...) \ + __rvt_pr_err(rdi->driver_f.get_pci_dev(rdi), \ + rvt_get_ibdev_name(rdi), \ + fmt, \ + ##__VA_ARGS__) + +#define __rvt_pr_info(pdev, name, fmt, ...) \ + dev_info(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) + +#define __rvt_pr_warn(pdev, name, fmt, ...) \ + dev_warn(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) + +#define __rvt_pr_err(pdev, name, fmt, ...) \ + dev_err(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) + +static inline int ibport_num_to_idx(struct ib_device *ibdev, u8 port_num) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + int port_index; + + port_index = port_num - 1; /* IB ports start at 1 our arrays at 0 */ + if ((port_index < 0) || (port_index >= rdi->dparms.nports)) + return -EINVAL; + + return port_index; +} + +#endif /* DEF_RDMAVT_H */ diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig new file mode 100644 index 0000000..bad4a57 --- /dev/null +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -0,0 +1,26 @@ +config RDMA_RXE + tristate "Software RDMA over Ethernet (RoCE) driver" + depends on INET && PCI && INFINIBAND + select NET_UDP_TUNNEL + select CRYPTO_CRC32 + select DMA_VIRT_OPS + ---help--- + This driver implements the InfiniBand RDMA transport over + the Linux network stack. It enables a system with a + standard Ethernet adapter to interoperate with a RoCE + adapter or with another system running the RXE driver. + Documentation on InfiniBand and RoCE can be downloaded at + www.infinibandta.org and www.openfabrics.org. (See also + siw which is a similar software driver for iWARP.) + + The driver is split into two layers, one interfaces with the + Linux RDMA stack and implements a kernel or user space + verbs API. The user space verbs API requires a support + library named librxe which is loaded by the generic user + space verbs API, libibverbs. The other layer interfaces + with the Linux network stack at layer 3. + + To configure and work with soft-RoCE driver please use the + following wiki page under "configure Soft-RoCE (RXE)" section: + + https://github.com/linux-rdma/rdma-core/blob/master/Documentation/rxe.md diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile new file mode 100644 index 0000000..66af72d --- /dev/null +++ b/drivers/infiniband/sw/rxe/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_RDMA_RXE) += rdma_rxe.o + +rdma_rxe-y := \ + rxe.o \ + rxe_comp.o \ + rxe_req.o \ + rxe_resp.o \ + rxe_recv.o \ + rxe_pool.o \ + rxe_queue.o \ + rxe_verbs.o \ + rxe_av.o \ + rxe_srq.o \ + rxe_qp.o \ + rxe_cq.o \ + rxe_mr.o \ + rxe_opcode.o \ + rxe_mmap.o \ + rxe_icrc.o \ + rxe_mcast.o \ + rxe_task.o \ + rxe_net.o \ + rxe_sysfs.o \ + rxe_hw_counters.o diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c new file mode 100644 index 0000000..e493fdb --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "rxe.h" +#include "rxe_loc.h" + +MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); +MODULE_DESCRIPTION("Soft RDMA transport"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* free resources for all ports on a device */ +static void rxe_cleanup_ports(struct rxe_dev *rxe) +{ + kfree(rxe->port.pkey_tbl); + rxe->port.pkey_tbl = NULL; + +} + +/* free resources for a rxe device all objects created for this device must + * have been destroyed + */ +static void rxe_cleanup(struct rxe_dev *rxe) +{ + rxe_pool_cleanup(&rxe->uc_pool); + rxe_pool_cleanup(&rxe->pd_pool); + rxe_pool_cleanup(&rxe->ah_pool); + rxe_pool_cleanup(&rxe->srq_pool); + rxe_pool_cleanup(&rxe->qp_pool); + rxe_pool_cleanup(&rxe->cq_pool); + rxe_pool_cleanup(&rxe->mr_pool); + rxe_pool_cleanup(&rxe->mw_pool); + rxe_pool_cleanup(&rxe->mc_grp_pool); + rxe_pool_cleanup(&rxe->mc_elem_pool); + + rxe_cleanup_ports(rxe); + + crypto_free_shash(rxe->tfm); +} + +/* called when all references have been dropped */ +void rxe_release(struct kref *kref) +{ + struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt); + + rxe_cleanup(rxe); + ib_dealloc_device(&rxe->ib_dev); +} + +/* initialize rxe device parameters */ +static void rxe_init_device_param(struct rxe_dev *rxe) +{ + rxe->max_inline_data = RXE_MAX_INLINE_DATA; + + rxe->attr.fw_ver = RXE_FW_VER; + rxe->attr.max_mr_size = RXE_MAX_MR_SIZE; + rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP; + rxe->attr.vendor_id = RXE_VENDOR_ID; + rxe->attr.vendor_part_id = RXE_VENDOR_PART_ID; + rxe->attr.hw_ver = RXE_HW_VER; + rxe->attr.max_qp = RXE_MAX_QP; + rxe->attr.max_qp_wr = RXE_MAX_QP_WR; + rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS; + rxe->attr.max_sge = RXE_MAX_SGE; + rxe->attr.max_sge_rd = RXE_MAX_SGE_RD; + rxe->attr.max_cq = RXE_MAX_CQ; + rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1; + rxe->attr.max_mr = RXE_MAX_MR; + rxe->attr.max_pd = RXE_MAX_PD; + rxe->attr.max_qp_rd_atom = RXE_MAX_QP_RD_ATOM; + rxe->attr.max_ee_rd_atom = RXE_MAX_EE_RD_ATOM; + rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; + rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; + rxe->attr.max_ee_init_rd_atom = RXE_MAX_EE_INIT_RD_ATOM; + 
rxe->attr.atomic_cap = RXE_ATOMIC_CAP; + rxe->attr.max_ee = RXE_MAX_EE; + rxe->attr.max_rdd = RXE_MAX_RDD; + rxe->attr.max_mw = RXE_MAX_MW; + rxe->attr.max_raw_ipv6_qp = RXE_MAX_RAW_IPV6_QP; + rxe->attr.max_raw_ethy_qp = RXE_MAX_RAW_ETHY_QP; + rxe->attr.max_mcast_grp = RXE_MAX_MCAST_GRP; + rxe->attr.max_mcast_qp_attach = RXE_MAX_MCAST_QP_ATTACH; + rxe->attr.max_total_mcast_qp_attach = RXE_MAX_TOT_MCAST_QP_ATTACH; + rxe->attr.max_ah = RXE_MAX_AH; + rxe->attr.max_fmr = RXE_MAX_FMR; + rxe->attr.max_map_per_fmr = RXE_MAX_MAP_PER_FMR; + rxe->attr.max_srq = RXE_MAX_SRQ; + rxe->attr.max_srq_wr = RXE_MAX_SRQ_WR; + rxe->attr.max_srq_sge = RXE_MAX_SRQ_SGE; + rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; + rxe->attr.max_pkeys = RXE_MAX_PKEYS; + rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + + rxe->max_ucontext = RXE_MAX_UCONTEXT; +} + +/* initialize port attributes */ +static int rxe_init_port_param(struct rxe_port *port) +{ + port->attr.state = RXE_PORT_STATE; + port->attr.max_mtu = RXE_PORT_MAX_MTU; + port->attr.active_mtu = RXE_PORT_ACTIVE_MTU; + port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; + port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; + port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; + port->attr.bad_pkey_cntr = RXE_PORT_BAD_PKEY_CNTR; + port->attr.qkey_viol_cntr = RXE_PORT_QKEY_VIOL_CNTR; + port->attr.pkey_tbl_len = RXE_PORT_PKEY_TBL_LEN; + port->attr.lid = RXE_PORT_LID; + port->attr.sm_lid = RXE_PORT_SM_LID; + port->attr.lmc = RXE_PORT_LMC; + port->attr.max_vl_num = RXE_PORT_MAX_VL_NUM; + port->attr.sm_sl = RXE_PORT_SM_SL; + port->attr.subnet_timeout = RXE_PORT_SUBNET_TIMEOUT; + port->attr.init_type_reply = RXE_PORT_INIT_TYPE_REPLY; + port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; + port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; + port->attr.phys_state = RXE_PORT_PHYS_STATE; + port->mtu_cap = + ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU); + port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); + + return 0; +} + +/* initialize port state, note IB convention that HCA ports are always + * numbered from 1 + */ +static int rxe_init_ports(struct rxe_dev *rxe) +{ + struct rxe_port *port = &rxe->port; + + rxe_init_port_param(port); + + if (!port->attr.pkey_tbl_len || !port->attr.gid_tbl_len) + return -EINVAL; + + port->pkey_tbl = kcalloc(port->attr.pkey_tbl_len, + sizeof(*port->pkey_tbl), GFP_KERNEL); + + if (!port->pkey_tbl) + return -ENOMEM; + + port->pkey_tbl[0] = 0xffff; + addrconf_addr_eui48((unsigned char *)&port->port_guid, + rxe->ndev->dev_addr); + + spin_lock_init(&port->port_lock); + + return 0; +} + +/* init pools of managed objects */ +static int rxe_init_pools(struct rxe_dev *rxe) +{ + int err; + + err = rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC, + rxe->max_ucontext); + if (err) + goto err1; + + err = rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD, + rxe->attr.max_pd); + if (err) + goto err2; + + err = rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH, + rxe->attr.max_ah); + if (err) + goto err3; + + err = rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ, + rxe->attr.max_srq); + if (err) + goto err4; + + err = rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP, + rxe->attr.max_qp); + if (err) + goto err5; + + err = rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ, + rxe->attr.max_cq); + if (err) + goto err6; + + err = rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR, + rxe->attr.max_mr); + if (err) + goto err7; + + err = rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW, + rxe->attr.max_mw); + if (err) + goto err8; + + err = rxe_pool_init(rxe, &rxe->mc_grp_pool, 
RXE_TYPE_MC_GRP, + rxe->attr.max_mcast_grp); + if (err) + goto err9; + + err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM, + rxe->attr.max_total_mcast_qp_attach); + if (err) + goto err10; + + return 0; + +err10: + rxe_pool_cleanup(&rxe->mc_grp_pool); +err9: + rxe_pool_cleanup(&rxe->mw_pool); +err8: + rxe_pool_cleanup(&rxe->mr_pool); +err7: + rxe_pool_cleanup(&rxe->cq_pool); +err6: + rxe_pool_cleanup(&rxe->qp_pool); +err5: + rxe_pool_cleanup(&rxe->srq_pool); +err4: + rxe_pool_cleanup(&rxe->ah_pool); +err3: + rxe_pool_cleanup(&rxe->pd_pool); +err2: + rxe_pool_cleanup(&rxe->uc_pool); +err1: + return err; +} + +/* initialize rxe device state */ +static int rxe_init(struct rxe_dev *rxe) +{ + int err; + + /* init default device parameters */ + rxe_init_device_param(rxe); + + err = rxe_init_ports(rxe); + if (err) + goto err1; + + err = rxe_init_pools(rxe); + if (err) + goto err2; + + /* init pending mmap list */ + spin_lock_init(&rxe->mmap_offset_lock); + spin_lock_init(&rxe->pending_lock); + INIT_LIST_HEAD(&rxe->pending_mmaps); + INIT_LIST_HEAD(&rxe->list); + + mutex_init(&rxe->usdev_lock); + + return 0; + +err2: + rxe_cleanup_ports(rxe); +err1: + return err; +} + +int rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) +{ + struct rxe_port *port = &rxe->port; + enum ib_mtu mtu; + + mtu = eth_mtu_int_to_enum(ndev_mtu); + + /* Make sure that new MTU in range */ + mtu = mtu ? min_t(enum ib_mtu, mtu, RXE_PORT_MAX_MTU) : IB_MTU_256; + + port->attr.active_mtu = mtu; + port->mtu_cap = ib_mtu_enum_to_int(mtu); + + return 0; +} +EXPORT_SYMBOL(rxe_set_mtu); + +/* called by ifc layer to create new rxe device. + * The caller should allocate memory for rxe by calling ib_alloc_device. + */ +int rxe_add(struct rxe_dev *rxe, unsigned int mtu) +{ + int err; + + kref_init(&rxe->ref_cnt); + + err = rxe_init(rxe); + if (err) + goto err1; + + err = rxe_set_mtu(rxe, mtu); + if (err) + goto err1; + + err = rxe_register_device(rxe); + if (err) + goto err1; + + return 0; + +err1: + rxe_dev_put(rxe); + return err; +} +EXPORT_SYMBOL(rxe_add); + +/* called by the ifc layer to remove a device */ +void rxe_remove(struct rxe_dev *rxe) +{ + rxe_unregister_device(rxe); + + rxe_dev_put(rxe); +} +EXPORT_SYMBOL(rxe_remove); + +static int __init rxe_module_init(void) +{ + int err; + + /* initialize slab caches for managed objects */ + err = rxe_cache_init(); + if (err) { + pr_err("unable to init object pools\n"); + return err; + } + + err = rxe_net_init(); + if (err) + return err; + + pr_info("loaded\n"); + return 0; +} + +static void __exit rxe_module_exit(void) +{ + rxe_remove_all(); + rxe_net_exit(); + rxe_cache_exit(); + + pr_info("unloaded\n"); +} + +late_initcall(rxe_module_init); +module_exit(rxe_module_exit); diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h new file mode 100644 index 0000000..561ad30 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
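/*
 * Illustrative sketch of the calling contract documented on rxe_add()
 * above: the interface layer (rxe_net.c / rxe_sysfs.c, not part of this
 * hunk) is expected to allocate the device with ib_alloc_device() and
 * then hand it to rxe_add() together with the netdev MTU. Roughly, with
 * error handling trimmed:
 */
static int example_add_rxe_on_netdev(struct net_device *ndev)
{
	struct rxe_dev *rxe;

	rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
	if (!rxe)
		return -ENOMEM;

	rxe->ndev = ndev;		/* used by rxe_init_ports() above */
	return rxe_add(rxe, ndev->mtu);	/* drops its reference on failure */
}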
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_H +#define RXE_H + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rxe_net.h" +#include "rxe_opcode.h" +#include "rxe_hdr.h" +#include "rxe_param.h" +#include "rxe_verbs.h" +#include "rxe_loc.h" + +/* + * Version 1 and Version 2 are identical on 64 bit machines, but on 32 bit + * machines Version 2 has a different struct layout. + */ +#define RXE_UVERBS_ABI_VERSION 2 + +#define IB_PHYS_STATE_LINK_UP (5) +#define IB_PHYS_STATE_LINK_DOWN (3) + +#define RXE_ROCE_V2_SPORT (0xc000) + +static inline u32 rxe_crc32(struct rxe_dev *rxe, + u32 crc, void *next, size_t len) +{ + u32 retval; + int err; + + SHASH_DESC_ON_STACK(shash, rxe->tfm); + + shash->tfm = rxe->tfm; + shash->flags = 0; + *(u32 *)shash_desc_ctx(shash) = crc; + err = crypto_shash_update(shash, next, len); + if (unlikely(err)) { + pr_warn_ratelimited("failed crc calculation, err: %d\n", err); + return crc32_le(crc, next, len); + } + + retval = *(u32 *)shash_desc_ctx(shash); + barrier_data(shash_desc_ctx(shash)); + return retval; +} + +int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); + +int rxe_add(struct rxe_dev *rxe, unsigned int mtu); +void rxe_remove(struct rxe_dev *rxe); +void rxe_remove_all(void); + +int rxe_rcv(struct sk_buff *skb); + +static inline void rxe_dev_put(struct rxe_dev *rxe) +{ + kref_put(&rxe->ref_cnt, rxe_release); +} +struct rxe_dev *net_to_rxe(struct net_device *ndev); +struct rxe_dev *get_rxe_by_name(const char *name); + +void rxe_port_up(struct rxe_dev *rxe); +void rxe_port_down(struct rxe_dev *rxe); + +#endif /* RXE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c new file mode 100644 index 0000000..7f1ae36 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
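/*
 * Illustrative sketch only: rxe_crc32() above wraps the crypto shash held
 * in rxe->tfm and falls back to crc32_le() if the update fails. Callers
 * accumulate a checksum over consecutive buffers by feeding the running
 * value back in; the zero seed below is just for illustration (the real
 * ICRC seeding lives in rxe_icrc.c, which is not part of this hunk).
 */
static u32 example_crc_over_two_buffers(struct rxe_dev *rxe,
					void *hdr, size_t hdr_len,
					void *payload, size_t pay_len)
{
	u32 crc = 0;

	crc = rxe_crc32(rxe, crc, hdr, hdr_len);
	crc = rxe_crc32(rxe, crc, payload, pay_len);
	return crc;
}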
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr) +{ + struct rxe_port *port; + + port = &rxe->port; + + if (rdma_ah_get_ah_flags(attr) & IB_AH_GRH) { + u8 sgid_index = rdma_ah_read_grh(attr)->sgid_index; + + if (sgid_index > port->attr.gid_tbl_len) { + pr_warn("invalid sgid index = %d\n", sgid_index); + return -EINVAL; + } + } + + return 0; +} + +void rxe_av_from_attr(u8 port_num, struct rxe_av *av, + struct rdma_ah_attr *attr) +{ + memset(av, 0, sizeof(*av)); + memcpy(&av->grh, rdma_ah_read_grh(attr), + sizeof(*rdma_ah_read_grh(attr))); + av->port_num = port_num; +} + +void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr) +{ + attr->type = RDMA_AH_ATTR_TYPE_ROCE; + memcpy(rdma_ah_retrieve_grh(attr), &av->grh, sizeof(av->grh)); + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + rdma_ah_set_port_num(attr, av->port_num); +} + +void rxe_av_fill_ip_info(struct rxe_av *av, + struct rdma_ah_attr *attr, + struct ib_gid_attr *sgid_attr, + union ib_gid *sgid) +{ + rdma_gid2ip((struct sockaddr *)&av->sgid_addr, sgid); + rdma_gid2ip((struct sockaddr *)&av->dgid_addr, + &rdma_ah_read_grh(attr)->dgid); + av->network_type = ib_gid_to_network_type(sgid_attr->gid_type, sgid); +} + +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt) +{ + if (!pkt || !pkt->qp) + return NULL; + + if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC) + return &pkt->qp->pri_av; + + return (pkt->wqe) ? &pkt->wqe->av : NULL; +} diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c new file mode 100644 index 0000000..6cdc40e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -0,0 +1,771 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *	- Redistributions of source code must retain the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer.
 *
 *	- Redistributions in binary form must reproduce the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer in the documentation and/or other materials
 *	  provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
+
+#include
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+#include "rxe_task.h"
+
+enum comp_state {
+	COMPST_GET_ACK,
+	COMPST_GET_WQE,
+	COMPST_COMP_WQE,
+	COMPST_COMP_ACK,
+	COMPST_CHECK_PSN,
+	COMPST_CHECK_ACK,
+	COMPST_READ,
+	COMPST_ATOMIC,
+	COMPST_WRITE_SEND,
+	COMPST_UPDATE_COMP,
+	COMPST_ERROR_RETRY,
+	COMPST_RNR_RETRY,
+	COMPST_ERROR,
+	COMPST_EXIT, /* We have an issue, and we want to rerun the completer */
+	COMPST_DONE, /* The completer finished successfully */
+};
+
+static char *comp_state_name[] = {
+	[COMPST_GET_ACK]		= "GET ACK",
+	[COMPST_GET_WQE]		= "GET WQE",
+	[COMPST_COMP_WQE]		= "COMP WQE",
+	[COMPST_COMP_ACK]		= "COMP ACK",
+	[COMPST_CHECK_PSN]		= "CHECK PSN",
+	[COMPST_CHECK_ACK]		= "CHECK ACK",
+	[COMPST_READ]			= "READ",
+	[COMPST_ATOMIC]			= "ATOMIC",
+	[COMPST_WRITE_SEND]		= "WRITE/SEND",
+	[COMPST_UPDATE_COMP]		= "UPDATE COMP",
+	[COMPST_ERROR_RETRY]		= "ERROR RETRY",
+	[COMPST_RNR_RETRY]		= "RNR RETRY",
+	[COMPST_ERROR]			= "ERROR",
+	[COMPST_EXIT]			= "EXIT",
+	[COMPST_DONE]			= "DONE",
+};
+
+static unsigned long rnrnak_usec[32] = {
+	[IB_RNR_TIMER_655_36] = 655360,
+	[IB_RNR_TIMER_000_01] = 10,
+	[IB_RNR_TIMER_000_02] = 20,
+	[IB_RNR_TIMER_000_03] = 30,
+	[IB_RNR_TIMER_000_04] = 40,
+	[IB_RNR_TIMER_000_06] = 60,
+	[IB_RNR_TIMER_000_08] = 80,
+	[IB_RNR_TIMER_000_12] = 120,
+	[IB_RNR_TIMER_000_16] = 160,
+	[IB_RNR_TIMER_000_24] = 240,
+	[IB_RNR_TIMER_000_32] = 320,
+	[IB_RNR_TIMER_000_48] = 480,
+	[IB_RNR_TIMER_000_64] = 640,
+	[IB_RNR_TIMER_000_96] = 960,
+	[IB_RNR_TIMER_001_28] = 1280,
+	[IB_RNR_TIMER_001_92] = 1920,
+	[IB_RNR_TIMER_002_56] = 2560,
+	[IB_RNR_TIMER_003_84] = 3840,
+	[IB_RNR_TIMER_005_12] = 5120,
+	[IB_RNR_TIMER_007_68] = 7680,
+	[IB_RNR_TIMER_010_24] = 10240,
+	[IB_RNR_TIMER_015_36] = 15360,
+	[IB_RNR_TIMER_020_48] = 20480,
+	[IB_RNR_TIMER_030_72] = 30720,
+	[IB_RNR_TIMER_040_96] = 40960,
+	[IB_RNR_TIMER_061_44] = 61410,
+	[IB_RNR_TIMER_081_92] = 81920,
+	[IB_RNR_TIMER_122_88] = 122880,
+	[IB_RNR_TIMER_163_84] = 163840,
+	[IB_RNR_TIMER_245_76] = 245760,
+	[IB_RNR_TIMER_327_68] = 327680,
+	[IB_RNR_TIMER_491_52] = 491520,
+};
+
+static inline unsigned long rnrnak_jiffies(u8 timeout)
+{
+	return max_t(unsigned long,
+		usecs_to_jiffies(rnrnak_usec[timeout]), 1);
+}
+
+static enum ib_wc_opcode wr_to_wc_opcode(enum
ib_wr_opcode opcode) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: return IB_WC_RDMA_WRITE; + case IB_WR_RDMA_WRITE_WITH_IMM: return IB_WC_RDMA_WRITE; + case IB_WR_SEND: return IB_WC_SEND; + case IB_WR_SEND_WITH_IMM: return IB_WC_SEND; + case IB_WR_RDMA_READ: return IB_WC_RDMA_READ; + case IB_WR_ATOMIC_CMP_AND_SWP: return IB_WC_COMP_SWAP; + case IB_WR_ATOMIC_FETCH_AND_ADD: return IB_WC_FETCH_ADD; + case IB_WR_LSO: return IB_WC_LSO; + case IB_WR_SEND_WITH_INV: return IB_WC_SEND; + case IB_WR_RDMA_READ_WITH_INV: return IB_WC_RDMA_READ; + case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV; + case IB_WR_REG_MR: return IB_WC_REG_MR; + + default: + return 0xff; + } +} + +void retransmit_timer(struct timer_list *t) +{ + struct rxe_qp *qp = from_timer(qp, t, retrans_timer); + + if (qp->valid) { + qp->comp.timeout = 1; + rxe_run_task(&qp->comp.task, 1); + } +} + +void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp, + struct sk_buff *skb) +{ + int must_sched; + + skb_queue_tail(&qp->resp_pkts, skb); + + must_sched = skb_queue_len(&qp->resp_pkts) > 1; + if (must_sched != 0) + rxe_counter_inc(rxe, RXE_CNT_COMPLETER_SCHED); + rxe_run_task(&qp->comp.task, must_sched); +} + +static inline enum comp_state get_wqe(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe **wqe_p) +{ + struct rxe_send_wqe *wqe; + + /* we come here whether or not we found a response packet to see if + * there are any posted WQEs + */ + wqe = queue_head(qp->sq.queue); + *wqe_p = wqe; + + /* no WQE or requester has not started it yet */ + if (!wqe || wqe->state == wqe_state_posted) + return pkt ? COMPST_DONE : COMPST_EXIT; + + /* WQE does not require an ack */ + if (wqe->state == wqe_state_done) + return COMPST_COMP_WQE; + + /* WQE caused an error */ + if (wqe->state == wqe_state_error) + return COMPST_ERROR; + + /* we have a WQE, if we also have an ack check its PSN */ + return pkt ? COMPST_CHECK_PSN : COMPST_EXIT; +} + +static inline void reset_retry_counters(struct rxe_qp *qp) +{ + qp->comp.retry_cnt = qp->attr.retry_cnt; + qp->comp.rnr_retry = qp->attr.rnr_retry; +} + +static inline enum comp_state check_psn(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + s32 diff; + + /* check to see if response is past the oldest WQE. if it is, complete + * send/write or error read/atomic + */ + diff = psn_compare(pkt->psn, wqe->last_psn); + if (diff > 0) { + if (wqe->state == wqe_state_pending) { + if (wqe->mask & WR_ATOMIC_OR_READ_MASK) + return COMPST_ERROR_RETRY; + + reset_retry_counters(qp); + return COMPST_COMP_WQE; + } else { + return COMPST_DONE; + } + } + + /* compare response packet to expected response */ + diff = psn_compare(pkt->psn, qp->comp.psn); + if (diff < 0) { + /* response is most likely a retried packet if it matches an + * uncompleted WQE go complete it else ignore it + */ + if (pkt->psn == wqe->last_psn) + return COMPST_COMP_ACK; + else + return COMPST_DONE; + } else if ((diff > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK)) { + return COMPST_DONE; + } else { + return COMPST_CHECK_ACK; + } +} + +static inline enum comp_state check_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + unsigned int mask = pkt->mask; + u8 syn; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + /* Check the sequence only */ + switch (qp->comp.opcode) { + case -1: + /* Will catch all *_ONLY cases. 
*/ + if (!(mask & RXE_START_MASK)) + return COMPST_ERROR; + + break; + + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && + pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { + return COMPST_ERROR; + } + break; + default: + WARN_ON_ONCE(1); + } + + /* Check operation validity. */ + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY: + syn = aeth_syn(pkt); + + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return COMPST_ERROR; + + /* fall through */ + /* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH) + */ + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (wqe->wr.opcode != IB_WR_RDMA_READ && + wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV) { + return COMPST_ERROR; + } + reset_retry_counters(qp); + return COMPST_READ; + + case IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE: + syn = aeth_syn(pkt); + + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return COMPST_ERROR; + + if (wqe->wr.opcode != IB_WR_ATOMIC_CMP_AND_SWP && + wqe->wr.opcode != IB_WR_ATOMIC_FETCH_AND_ADD) + return COMPST_ERROR; + reset_retry_counters(qp); + return COMPST_ATOMIC; + + case IB_OPCODE_RC_ACKNOWLEDGE: + syn = aeth_syn(pkt); + switch (syn & AETH_TYPE_MASK) { + case AETH_ACK: + reset_retry_counters(qp); + return COMPST_WRITE_SEND; + + case AETH_RNR_NAK: + rxe_counter_inc(rxe, RXE_CNT_RCV_RNR); + return COMPST_RNR_RETRY; + + case AETH_NAK: + switch (syn) { + case AETH_NAK_PSN_SEQ_ERROR: + /* a nak implicitly acks all packets with psns + * before + */ + if (psn_compare(pkt->psn, qp->comp.psn) > 0) { + rxe_counter_inc(rxe, + RXE_CNT_RCV_SEQ_ERR); + qp->comp.psn = pkt->psn; + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 1); + } + } + return COMPST_ERROR_RETRY; + + case AETH_NAK_INVALID_REQ: + wqe->status = IB_WC_REM_INV_REQ_ERR; + return COMPST_ERROR; + + case AETH_NAK_REM_ACC_ERR: + wqe->status = IB_WC_REM_ACCESS_ERR; + return COMPST_ERROR; + + case AETH_NAK_REM_OP_ERR: + wqe->status = IB_WC_REM_OP_ERR; + return COMPST_ERROR; + + default: + pr_warn("unexpected nak %x\n", syn); + wqe->status = IB_WC_REM_OP_ERR; + return COMPST_ERROR; + } + + default: + return COMPST_ERROR; + } + break; + + default: + pr_warn("unexpected opcode\n"); + } + + return COMPST_ERROR; +} + +static inline enum comp_state do_read(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int ret; + + ret = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, + &wqe->dma, payload_addr(pkt), + payload_size(pkt), to_mem_obj, NULL); + if (ret) + return COMPST_ERROR; + + if (wqe->dma.resid == 0 && (pkt->mask & RXE_END_MASK)) + return COMPST_COMP_ACK; + else + return COMPST_UPDATE_COMP; +} + +static inline enum comp_state do_atomic(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int ret; + + u64 atomic_orig = atmack_orig(pkt); + + ret = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, + &wqe->dma, &atomic_orig, + sizeof(u64), to_mem_obj, NULL); + if (ret) + return COMPST_ERROR; + else + return COMPST_COMP_ACK; +} + +static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_cqe *cqe) +{ + memset(cqe, 0, sizeof(*cqe)); + + if (!qp->is_user) { + struct ib_wc *wc = &cqe->ibwc; + + wc->wr_id = wqe->wr.wr_id; + wc->status = wqe->status; + wc->opcode = 
wr_to_wc_opcode(wqe->wr.opcode); + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + wc->wc_flags = IB_WC_WITH_IMM; + wc->byte_len = wqe->dma.length; + wc->qp = &qp->ibqp; + } else { + struct ib_uverbs_wc *uwc = &cqe->uibwc; + + uwc->wr_id = wqe->wr.wr_id; + uwc->status = wqe->status; + uwc->opcode = wr_to_wc_opcode(wqe->wr.opcode); + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + uwc->wc_flags = IB_WC_WITH_IMM; + uwc->byte_len = wqe->dma.length; + uwc->qp_num = qp->ibqp.qp_num; + } +} + +/* + * IBA Spec. Section 10.7.3.1 SIGNALED COMPLETIONS + * ---------8<---------8<------------- + * ...Note that if a completion error occurs, a Work Completion + * will always be generated, even if the signaling + * indicator requests an Unsignaled Completion. + * ---------8<---------8<------------- + */ +static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + struct rxe_cqe cqe; + + if ((qp->sq_sig_type == IB_SIGNAL_ALL_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + wqe->status != IB_WC_SUCCESS) { + make_send_cqe(qp, wqe, &cqe); + advance_consumer(qp->sq.queue); + rxe_cq_post(qp->scq, &cqe, 0); + } else { + advance_consumer(qp->sq.queue); + } + + /* + * we completed something so let req run again + * if it is trying to fence + */ + if (qp->req.wait_fence) { + qp->req.wait_fence = 0; + rxe_run_task(&qp->req.task, 1); + } +} + +static inline enum comp_state complete_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + unsigned long flags; + + if (wqe->has_rd_atomic) { + wqe->has_rd_atomic = 0; + atomic_inc(&qp->req.rd_atomic); + if (qp->req.need_rd_atomic) { + qp->comp.timeout_retry = 0; + qp->req.need_rd_atomic = 0; + rxe_run_task(&qp->req.task, 1); + } + } + + if (unlikely(qp->req.state == QP_STATE_DRAIN)) { + /* state_lock used by requester & completer */ + spin_lock_irqsave(&qp->state_lock, flags); + if ((qp->req.state == QP_STATE_DRAIN) && + (qp->comp.psn == qp->req.psn)) { + qp->req.state = QP_STATE_DRAINED; + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_SQ_DRAINED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + } else { + spin_unlock_irqrestore(&qp->state_lock, flags); + } + } + + do_complete(qp, wqe); + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + return COMPST_UPDATE_COMP; + else + return COMPST_DONE; +} + +static inline enum comp_state complete_wqe(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + qp->comp.opcode = -1; + + if (pkt) { + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 1); + } + } + + do_complete(qp, wqe); + + return COMPST_GET_WQE; +} + +static void rxe_drain_resp_pkts(struct rxe_qp *qp, bool notify) +{ + struct sk_buff *skb; + struct rxe_send_wqe *wqe; + + while ((skb = skb_dequeue(&qp->resp_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + + while ((wqe = queue_head(qp->sq.queue))) { + if (notify) { + wqe->status = IB_WC_WR_FLUSH_ERR; + do_complete(qp, wqe); + } else { + advance_consumer(qp->sq.queue); + } + } +} + +int rxe_completer(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_send_wqe *wqe = wqe; + struct sk_buff *skb = NULL; + struct 
rxe_pkt_info *pkt = NULL; + enum comp_state state; + + rxe_add_ref(qp); + + if (!qp->valid || qp->req.state == QP_STATE_ERROR || + qp->req.state == QP_STATE_RESET) { + rxe_drain_resp_pkts(qp, qp->valid && + qp->req.state == QP_STATE_ERROR); + goto exit; + } + + if (qp->comp.timeout) { + qp->comp.timeout_retry = 1; + qp->comp.timeout = 0; + } else { + qp->comp.timeout_retry = 0; + } + + if (qp->req.need_retry) + goto exit; + + state = COMPST_GET_ACK; + + while (1) { + pr_debug("qp#%d state = %s\n", qp_num(qp), + comp_state_name[state]); + switch (state) { + case COMPST_GET_ACK: + skb = skb_dequeue(&qp->resp_pkts); + if (skb) { + pkt = SKB_TO_PKT(skb); + qp->comp.timeout_retry = 0; + } + state = COMPST_GET_WQE; + break; + + case COMPST_GET_WQE: + state = get_wqe(qp, pkt, &wqe); + break; + + case COMPST_CHECK_PSN: + state = check_psn(qp, pkt, wqe); + break; + + case COMPST_CHECK_ACK: + state = check_ack(qp, pkt, wqe); + break; + + case COMPST_READ: + state = do_read(qp, pkt, wqe); + break; + + case COMPST_ATOMIC: + state = do_atomic(qp, pkt, wqe); + break; + + case COMPST_WRITE_SEND: + if (wqe->state == wqe_state_pending && + wqe->last_psn == pkt->psn) + state = COMPST_COMP_ACK; + else + state = COMPST_UPDATE_COMP; + break; + + case COMPST_COMP_ACK: + state = complete_ack(qp, pkt, wqe); + break; + + case COMPST_COMP_WQE: + state = complete_wqe(qp, pkt, wqe); + break; + + case COMPST_UPDATE_COMP: + if (pkt->mask & RXE_END_MASK) + qp->comp.opcode = -1; + else + qp->comp.opcode = pkt->opcode; + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 1); + } + + state = COMPST_DONE; + break; + + case COMPST_DONE: + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + } + goto done; + + case COMPST_EXIT: + if (qp->comp.timeout_retry && wqe) { + state = COMPST_ERROR_RETRY; + break; + } + + /* re reset the timeout counter if + * (1) QP is type RC + * (2) the QP is alive + * (3) there is a packet sent by the requester that + * might be acked (we still might get spurious + * timeouts but try to keep them as few as possible) + * (4) the timeout parameter is set + */ + if ((qp_type(qp) == IB_QPT_RC) && + (qp->req.state == QP_STATE_READY) && + (psn_compare(qp->req.psn, qp->comp.psn) > 0) && + qp->qp_timeout_jiffies) + mod_timer(&qp->retrans_timer, + jiffies + qp->qp_timeout_jiffies); + WARN_ON_ONCE(skb); + goto exit; + + case COMPST_ERROR_RETRY: + /* we come here if the retry timer fired and we did + * not receive a response packet. try to retry the send + * queue if that makes sense and the limits have not + * been exceeded. 
remember that some timeouts are + * spurious since we do not reset the timer but kick + * it down the road or let it expire + */ + + /* there is nothing to retry in this case */ + if (!wqe || (wqe->state == wqe_state_posted)) { + WARN_ON_ONCE(skb); + goto exit; + } + + if (qp->comp.retry_cnt > 0) { + if (qp->comp.retry_cnt != 7) + qp->comp.retry_cnt--; + + /* no point in retrying if we have already + * seen the last ack that the requester could + * have caused + */ + if (psn_compare(qp->req.psn, + qp->comp.psn) > 0) { + /* tell the requester to retry the + * send queue next time around + */ + rxe_counter_inc(rxe, + RXE_CNT_COMP_RETRY); + qp->req.need_retry = 1; + rxe_run_task(&qp->req.task, 1); + } + + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + } + + WARN_ON_ONCE(skb); + goto exit; + + } else { + rxe_counter_inc(rxe, RXE_CNT_RETRY_EXCEEDED); + wqe->status = IB_WC_RETRY_EXC_ERR; + state = COMPST_ERROR; + } + break; + + case COMPST_RNR_RETRY: + if (qp->comp.rnr_retry > 0) { + if (qp->comp.rnr_retry != 7) + qp->comp.rnr_retry--; + + qp->req.need_retry = 1; + pr_debug("qp#%d set rnr nak timer\n", + qp_num(qp)); + mod_timer(&qp->rnr_nak_timer, + jiffies + rnrnak_jiffies(aeth_syn(pkt) + & ~AETH_TYPE_MASK)); + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + goto exit; + } else { + rxe_counter_inc(rxe, + RXE_CNT_RNR_RETRY_EXCEEDED); + wqe->status = IB_WC_RNR_RETRY_EXC_ERR; + state = COMPST_ERROR; + } + break; + + case COMPST_ERROR: + WARN_ON_ONCE(wqe->status == IB_WC_SUCCESS); + do_complete(qp, wqe); + rxe_qp_error(qp); + + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + } + + WARN_ON_ONCE(skb); + goto exit; + } + } + +exit: + /* we come here if we are done with processing and want the task to + * exit from the loop calling us + */ + WARN_ON_ONCE(skb); + rxe_drop_ref(qp); + return -EAGAIN; + +done: + /* we come here if we have processed a packet we want the task to call + * us again to see if there is anything else to do + */ + WARN_ON_ONCE(skb); + rxe_drop_ref(qp); + return 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c new file mode 100644 index 0000000..2ee4b08 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, + int cqe, int comp_vector) +{ + int count; + + if (cqe <= 0) { + pr_warn("cqe(%d) <= 0\n", cqe); + goto err1; + } + + if (cqe > rxe->attr.max_cqe) { + pr_warn("cqe(%d) > max_cqe(%d)\n", + cqe, rxe->attr.max_cqe); + goto err1; + } + + if (cq) { + count = queue_count(cq->queue); + if (cqe < count) { + pr_warn("cqe(%d) < current # elements in queue (%d)", + cqe, count); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static void rxe_send_complete(unsigned long data) +{ + struct rxe_cq *cq = (struct rxe_cq *)data; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + if (cq->is_dying) { + spin_unlock_irqrestore(&cq->cq_lock, flags); + return; + } + spin_unlock_irqrestore(&cq->cq_lock, flags); + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, + int comp_vector, struct ib_ucontext *context, + struct rxe_create_cq_resp __user *uresp) +{ + int err; + + cq->queue = rxe_queue_init(rxe, &cqe, + sizeof(struct rxe_cqe)); + if (!cq->queue) { + pr_warn("unable to create cq\n"); + return -ENOMEM; + } + + err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, context, + cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); + if (err) { + kvfree(cq->queue->buf); + kfree(cq->queue); + return err; + } + + if (uresp) + cq->is_user = 1; + + cq->is_dying = false; + + tasklet_init(&cq->comp_task, rxe_send_complete, (unsigned long)cq); + + spin_lock_init(&cq->cq_lock); + cq->ibcq.cqe = cqe; + return 0; +} + +int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, + struct rxe_resize_cq_resp __user *uresp) +{ + int err; + + err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe, + sizeof(struct rxe_cqe), + cq->queue->ip ? cq->queue->ip->context : NULL, + uresp ? 
&uresp->mi : NULL, NULL, &cq->cq_lock); + if (!err) + cq->ibcq.cqe = cqe; + + return err; +} + +int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) +{ + struct ib_event ev; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + + if (unlikely(queue_full(cq->queue))) { + spin_unlock_irqrestore(&cq->cq_lock, flags); + if (cq->ibcq.event_handler) { + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + + return -EBUSY; + } + + memcpy(producer_addr(cq->queue), cqe, sizeof(*cqe)); + + /* make sure all changes to the CQ are written before we update the + * producer pointer + */ + smp_wmb(); + + advance_producer(cq->queue); + spin_unlock_irqrestore(&cq->cq_lock, flags); + + if ((cq->notify == IB_CQ_NEXT_COMP) || + (cq->notify == IB_CQ_SOLICITED && solicited)) { + cq->notify = 0; + tasklet_schedule(&cq->comp_task); + } + + return 0; +} + +void rxe_cq_disable(struct rxe_cq *cq) +{ + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + cq->is_dying = true; + spin_unlock_irqrestore(&cq->cq_lock, flags); +} + +void rxe_cq_cleanup(struct rxe_pool_entry *arg) +{ + struct rxe_cq *cq = container_of(arg, typeof(*cq), pelem); + + if (cq->queue) + rxe_queue_cleanup(cq->queue); +} diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h new file mode 100644 index 0000000..6cb1840 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -0,0 +1,960 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_HDR_H +#define RXE_HDR_H + +/* extracted information about a packet carried in an sk_buff struct fits in + * the skbuff cb array. Must be at most 48 bytes. stored in control block of + * sk_buff for received packets. 
+ */ +struct rxe_pkt_info { + struct rxe_dev *rxe; /* device that owns packet */ + struct rxe_qp *qp; /* qp that owns packet */ + struct rxe_send_wqe *wqe; /* send wqe */ + u8 *hdr; /* points to bth */ + u32 mask; /* useful info about pkt */ + u32 psn; /* bth psn of packet */ + u16 pkey_index; /* partition of pkt */ + u16 paylen; /* length of bth - icrc */ + u8 port_num; /* port pkt received on */ + u8 opcode; /* bth opcode of packet */ + u8 offset; /* bth offset from pkt->hdr */ +}; + +/* Macros should be used only for received skb */ +static inline struct rxe_pkt_info *SKB_TO_PKT(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct rxe_pkt_info) > sizeof(skb->cb)); + return (void *)skb->cb; +} + +static inline struct sk_buff *PKT_TO_SKB(struct rxe_pkt_info *pkt) +{ + return container_of((void *)pkt, struct sk_buff, cb); +} + +/* + * IBA header types and methods + * + * Some of these are for reference and completeness only since + * rxe does not currently support RD transport + * most of this could be moved into IB core. ib_pack.h has + * part of this but is incomplete + * + * Header specific routines to insert/extract values to/from headers + * the routines that are named __hhh_(set_)fff() take a pointer to a + * hhh header and get(set) the fff field. The routines named + * hhh_(set_)fff take a packet info struct and find the + * header and field based on the opcode in the packet. + * Conversion to/from network byte order from cpu order is also done. + */ + +#define RXE_ICRC_SIZE (4) +#define RXE_MAX_HDR_LENGTH (80) + +/****************************************************************************** + * Base Transport Header + ******************************************************************************/ +struct rxe_bth { + u8 opcode; + u8 flags; + __be16 pkey; + __be32 qpn; + __be32 apsn; +}; + +#define BTH_TVER (0) +#define BTH_DEF_PKEY (0xffff) + +#define BTH_SE_MASK (0x80) +#define BTH_MIG_MASK (0x40) +#define BTH_PAD_MASK (0x30) +#define BTH_TVER_MASK (0x0f) +#define BTH_FECN_MASK (0x80000000) +#define BTH_BECN_MASK (0x40000000) +#define BTH_RESV6A_MASK (0x3f000000) +#define BTH_QPN_MASK (0x00ffffff) +#define BTH_ACK_MASK (0x80000000) +#define BTH_RESV7_MASK (0x7f000000) +#define BTH_PSN_MASK (0x00ffffff) + +static inline u8 __bth_opcode(void *arg) +{ + struct rxe_bth *bth = arg; + + return bth->opcode; +} + +static inline void __bth_set_opcode(void *arg, u8 opcode) +{ + struct rxe_bth *bth = arg; + + bth->opcode = opcode; +} + +static inline u8 __bth_se(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (BTH_SE_MASK & bth->flags); +} + +static inline void __bth_set_se(void *arg, int se) +{ + struct rxe_bth *bth = arg; + + if (se) + bth->flags |= BTH_SE_MASK; + else + bth->flags &= ~BTH_SE_MASK; +} + +static inline u8 __bth_mig(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (BTH_MIG_MASK & bth->flags); +} + +static inline void __bth_set_mig(void *arg, u8 mig) +{ + struct rxe_bth *bth = arg; + + if (mig) + bth->flags |= BTH_MIG_MASK; + else + bth->flags &= ~BTH_MIG_MASK; +} + +static inline u8 __bth_pad(void *arg) +{ + struct rxe_bth *bth = arg; + + return (BTH_PAD_MASK & bth->flags) >> 4; +} + +static inline void __bth_set_pad(void *arg, u8 pad) +{ + struct rxe_bth *bth = arg; + + bth->flags = (BTH_PAD_MASK & (pad << 4)) | + (~BTH_PAD_MASK & bth->flags); +} + +static inline u8 __bth_tver(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_TVER_MASK & bth->flags; +} + +static inline void __bth_set_tver(void *arg, u8 tver) +{ + struct rxe_bth *bth = arg; + + 
bth->flags = (BTH_TVER_MASK & tver) | + (~BTH_TVER_MASK & bth->flags); +} + +static inline u16 __bth_pkey(void *arg) +{ + struct rxe_bth *bth = arg; + + return be16_to_cpu(bth->pkey); +} + +static inline void __bth_set_pkey(void *arg, u16 pkey) +{ + struct rxe_bth *bth = arg; + + bth->pkey = cpu_to_be16(pkey); +} + +static inline u32 __bth_qpn(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_QPN_MASK & be32_to_cpu(bth->qpn); +} + +static inline void __bth_set_qpn(void *arg, u32 qpn) +{ + struct rxe_bth *bth = arg; + u32 resvqpn = be32_to_cpu(bth->qpn); + + bth->qpn = cpu_to_be32((BTH_QPN_MASK & qpn) | + (~BTH_QPN_MASK & resvqpn)); +} + +static inline int __bth_fecn(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_FECN_MASK) & bth->qpn); +} + +static inline void __bth_set_fecn(void *arg, int fecn) +{ + struct rxe_bth *bth = arg; + + if (fecn) + bth->qpn |= cpu_to_be32(BTH_FECN_MASK); + else + bth->qpn &= ~cpu_to_be32(BTH_FECN_MASK); +} + +static inline int __bth_becn(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_BECN_MASK) & bth->qpn); +} + +static inline void __bth_set_becn(void *arg, int becn) +{ + struct rxe_bth *bth = arg; + + if (becn) + bth->qpn |= cpu_to_be32(BTH_BECN_MASK); + else + bth->qpn &= ~cpu_to_be32(BTH_BECN_MASK); +} + +static inline u8 __bth_resv6a(void *arg) +{ + struct rxe_bth *bth = arg; + + return (BTH_RESV6A_MASK & be32_to_cpu(bth->qpn)) >> 24; +} + +static inline void __bth_set_resv6a(void *arg) +{ + struct rxe_bth *bth = arg; + + bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK); +} + +static inline int __bth_ack(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_ACK_MASK) & bth->apsn); +} + +static inline void __bth_set_ack(void *arg, int ack) +{ + struct rxe_bth *bth = arg; + + if (ack) + bth->apsn |= cpu_to_be32(BTH_ACK_MASK); + else + bth->apsn &= ~cpu_to_be32(BTH_ACK_MASK); +} + +static inline void __bth_set_resv7(void *arg) +{ + struct rxe_bth *bth = arg; + + bth->apsn &= ~cpu_to_be32(BTH_RESV7_MASK); +} + +static inline u32 __bth_psn(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_PSN_MASK & be32_to_cpu(bth->apsn); +} + +static inline void __bth_set_psn(void *arg, u32 psn) +{ + struct rxe_bth *bth = arg; + u32 apsn = be32_to_cpu(bth->apsn); + + bth->apsn = cpu_to_be32((BTH_PSN_MASK & psn) | + (~BTH_PSN_MASK & apsn)); +} + +static inline u8 bth_opcode(struct rxe_pkt_info *pkt) +{ + return __bth_opcode(pkt->hdr + pkt->offset); +} + +static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode) +{ + __bth_set_opcode(pkt->hdr + pkt->offset, opcode); +} + +static inline u8 bth_se(struct rxe_pkt_info *pkt) +{ + return __bth_se(pkt->hdr + pkt->offset); +} + +static inline void bth_set_se(struct rxe_pkt_info *pkt, int se) +{ + __bth_set_se(pkt->hdr + pkt->offset, se); +} + +static inline u8 bth_mig(struct rxe_pkt_info *pkt) +{ + return __bth_mig(pkt->hdr + pkt->offset); +} + +static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig) +{ + __bth_set_mig(pkt->hdr + pkt->offset, mig); +} + +static inline u8 bth_pad(struct rxe_pkt_info *pkt) +{ + return __bth_pad(pkt->hdr + pkt->offset); +} + +static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad) +{ + __bth_set_pad(pkt->hdr + pkt->offset, pad); +} + +static inline u8 bth_tver(struct rxe_pkt_info *pkt) +{ + return __bth_tver(pkt->hdr + pkt->offset); +} + +static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver) +{ + __bth_set_tver(pkt->hdr + pkt->offset, tver); +} + +static inline u16 
bth_pkey(struct rxe_pkt_info *pkt) +{ + return __bth_pkey(pkt->hdr + pkt->offset); +} + +static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey) +{ + __bth_set_pkey(pkt->hdr + pkt->offset, pkey); +} + +static inline u32 bth_qpn(struct rxe_pkt_info *pkt) +{ + return __bth_qpn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn) +{ + __bth_set_qpn(pkt->hdr + pkt->offset, qpn); +} + +static inline int bth_fecn(struct rxe_pkt_info *pkt) +{ + return __bth_fecn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn) +{ + __bth_set_fecn(pkt->hdr + pkt->offset, fecn); +} + +static inline int bth_becn(struct rxe_pkt_info *pkt) +{ + return __bth_becn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn) +{ + __bth_set_becn(pkt->hdr + pkt->offset, becn); +} + +static inline u8 bth_resv6a(struct rxe_pkt_info *pkt) +{ + return __bth_resv6a(pkt->hdr + pkt->offset); +} + +static inline void bth_set_resv6a(struct rxe_pkt_info *pkt) +{ + __bth_set_resv6a(pkt->hdr + pkt->offset); +} + +static inline int bth_ack(struct rxe_pkt_info *pkt) +{ + return __bth_ack(pkt->hdr + pkt->offset); +} + +static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack) +{ + __bth_set_ack(pkt->hdr + pkt->offset, ack); +} + +static inline void bth_set_resv7(struct rxe_pkt_info *pkt) +{ + __bth_set_resv7(pkt->hdr + pkt->offset); +} + +static inline u32 bth_psn(struct rxe_pkt_info *pkt) +{ + return __bth_psn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn) +{ + __bth_set_psn(pkt->hdr + pkt->offset, psn); +} + +static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se, + int mig, int pad, u16 pkey, u32 qpn, int ack_req, + u32 psn) +{ + struct rxe_bth *bth = (struct rxe_bth *)(pkt->hdr + pkt->offset); + + bth->opcode = opcode; + bth->flags = (pad << 4) & BTH_PAD_MASK; + if (se) + bth->flags |= BTH_SE_MASK; + if (mig) + bth->flags |= BTH_MIG_MASK; + bth->pkey = cpu_to_be16(pkey); + bth->qpn = cpu_to_be32(qpn & BTH_QPN_MASK); + psn &= BTH_PSN_MASK; + if (ack_req) + psn |= BTH_ACK_MASK; + bth->apsn = cpu_to_be32(psn); +} + +/****************************************************************************** + * Reliable Datagram Extended Transport Header + ******************************************************************************/ +struct rxe_rdeth { + __be32 een; +}; + +#define RDETH_EEN_MASK (0x00ffffff) + +static inline u8 __rdeth_een(void *arg) +{ + struct rxe_rdeth *rdeth = arg; + + return RDETH_EEN_MASK & be32_to_cpu(rdeth->een); +} + +static inline void __rdeth_set_een(void *arg, u32 een) +{ + struct rxe_rdeth *rdeth = arg; + + rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een); +} + +static inline u8 rdeth_een(struct rxe_pkt_info *pkt) +{ + return __rdeth_een(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RDETH]); +} + +static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een) +{ + __rdeth_set_een(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RDETH], een); +} + +/****************************************************************************** + * Datagram Extended Transport Header + ******************************************************************************/ +struct rxe_deth { + __be32 qkey; + __be32 sqp; +}; + +#define GSI_QKEY (0x80010000) +#define DETH_SQP_MASK (0x00ffffff) + +static inline u32 __deth_qkey(void *arg) +{ + struct rxe_deth *deth = arg; + + return 
be32_to_cpu(deth->qkey); +} + +static inline void __deth_set_qkey(void *arg, u32 qkey) +{ + struct rxe_deth *deth = arg; + + deth->qkey = cpu_to_be32(qkey); +} + +static inline u32 __deth_sqp(void *arg) +{ + struct rxe_deth *deth = arg; + + return DETH_SQP_MASK & be32_to_cpu(deth->sqp); +} + +static inline void __deth_set_sqp(void *arg, u32 sqp) +{ + struct rxe_deth *deth = arg; + + deth->sqp = cpu_to_be32(DETH_SQP_MASK & sqp); +} + +static inline u32 deth_qkey(struct rxe_pkt_info *pkt) +{ + return __deth_qkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH]); +} + +static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey) +{ + __deth_set_qkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH], qkey); +} + +static inline u32 deth_sqp(struct rxe_pkt_info *pkt) +{ + return __deth_sqp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH]); +} + +static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp) +{ + __deth_set_sqp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH], sqp); +} + +/****************************************************************************** + * RDMA Extended Transport Header + ******************************************************************************/ +struct rxe_reth { + __be64 va; + __be32 rkey; + __be32 len; +}; + +static inline u64 __reth_va(void *arg) +{ + struct rxe_reth *reth = arg; + + return be64_to_cpu(reth->va); +} + +static inline void __reth_set_va(void *arg, u64 va) +{ + struct rxe_reth *reth = arg; + + reth->va = cpu_to_be64(va); +} + +static inline u32 __reth_rkey(void *arg) +{ + struct rxe_reth *reth = arg; + + return be32_to_cpu(reth->rkey); +} + +static inline void __reth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_reth *reth = arg; + + reth->rkey = cpu_to_be32(rkey); +} + +static inline u32 __reth_len(void *arg) +{ + struct rxe_reth *reth = arg; + + return be32_to_cpu(reth->len); +} + +static inline void __reth_set_len(void *arg, u32 len) +{ + struct rxe_reth *reth = arg; + + reth->len = cpu_to_be32(len); +} + +static inline u64 reth_va(struct rxe_pkt_info *pkt) +{ + return __reth_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va) +{ + __reth_set_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], va); +} + +static inline u32 reth_rkey(struct rxe_pkt_info *pkt) +{ + return __reth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __reth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], rkey); +} + +static inline u32 reth_len(struct rxe_pkt_info *pkt) +{ + return __reth_len(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len) +{ + __reth_set_len(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], len); +} + +/****************************************************************************** + * Atomic Extended Transport Header + ******************************************************************************/ +struct rxe_atmeth { + __be64 va; + __be32 rkey; + __be64 swap_add; + __be64 comp; +} __attribute__((__packed__)); + +static inline u64 __atmeth_va(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->va); +} + +static inline void __atmeth_set_va(void *arg, u64 va) +{ + struct rxe_atmeth 
*atmeth = arg; + + atmeth->va = cpu_to_be64(va); +} + +static inline u32 __atmeth_rkey(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be32_to_cpu(atmeth->rkey); +} + +static inline void __atmeth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->rkey = cpu_to_be32(rkey); +} + +static inline u64 __atmeth_swap_add(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->swap_add); +} + +static inline void __atmeth_set_swap_add(void *arg, u64 swap_add) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->swap_add = cpu_to_be64(swap_add); +} + +static inline u64 __atmeth_comp(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->comp); +} + +static inline void __atmeth_set_comp(void *arg, u64 comp) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->comp = cpu_to_be64(comp); +} + +static inline u64 atmeth_va(struct rxe_pkt_info *pkt) +{ + return __atmeth_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va) +{ + __atmeth_set_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], va); +} + +static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt) +{ + return __atmeth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __atmeth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], rkey); +} + +static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt) +{ + return __atmeth_swap_add(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add) +{ + __atmeth_set_swap_add(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], swap_add); +} + +static inline u64 atmeth_comp(struct rxe_pkt_info *pkt) +{ + return __atmeth_comp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp) +{ + __atmeth_set_comp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], comp); +} + +/****************************************************************************** + * Ack Extended Transport Header + ******************************************************************************/ +struct rxe_aeth { + __be32 smsn; +}; + +#define AETH_SYN_MASK (0xff000000) +#define AETH_MSN_MASK (0x00ffffff) + +enum aeth_syndrome { + AETH_TYPE_MASK = 0xe0, + AETH_ACK = 0x00, + AETH_RNR_NAK = 0x20, + AETH_RSVD = 0x40, + AETH_NAK = 0x60, + AETH_ACK_UNLIMITED = 0x1f, + AETH_NAK_PSN_SEQ_ERROR = 0x60, + AETH_NAK_INVALID_REQ = 0x61, + AETH_NAK_REM_ACC_ERR = 0x62, + AETH_NAK_REM_OP_ERR = 0x63, + AETH_NAK_INV_RD_REQ = 0x64, +}; + +static inline u8 __aeth_syn(void *arg) +{ + struct rxe_aeth *aeth = arg; + + return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24; +} + +static inline void __aeth_set_syn(void *arg, u8 syn) +{ + struct rxe_aeth *aeth = arg; + u32 smsn = be32_to_cpu(aeth->smsn); + + aeth->smsn = cpu_to_be32((AETH_SYN_MASK & (syn << 24)) | + (~AETH_SYN_MASK & smsn)); +} + +static inline u32 __aeth_msn(void *arg) +{ + struct rxe_aeth *aeth = arg; + + return AETH_MSN_MASK & be32_to_cpu(aeth->smsn); +} + +static inline void __aeth_set_msn(void *arg, u32 msn) +{ + struct rxe_aeth *aeth = arg; + u32 smsn = be32_to_cpu(aeth->smsn); + + aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) | + (~AETH_MSN_MASK 
& smsn)); +} + +static inline u8 aeth_syn(struct rxe_pkt_info *pkt) +{ + return __aeth_syn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH]); +} + +static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn) +{ + __aeth_set_syn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH], syn); +} + +static inline u32 aeth_msn(struct rxe_pkt_info *pkt) +{ + return __aeth_msn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH]); +} + +static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn) +{ + __aeth_set_msn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH], msn); +} + +/****************************************************************************** + * Atomic Ack Extended Transport Header + ******************************************************************************/ +struct rxe_atmack { + __be64 orig; +}; + +static inline u64 __atmack_orig(void *arg) +{ + struct rxe_atmack *atmack = arg; + + return be64_to_cpu(atmack->orig); +} + +static inline void __atmack_set_orig(void *arg, u64 orig) +{ + struct rxe_atmack *atmack = arg; + + atmack->orig = cpu_to_be64(orig); +} + +static inline u64 atmack_orig(struct rxe_pkt_info *pkt) +{ + return __atmack_orig(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMACK]); +} + +static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig) +{ + __atmack_set_orig(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig); +} + +/****************************************************************************** + * Immediate Extended Transport Header + ******************************************************************************/ +struct rxe_immdt { + __be32 imm; +}; + +static inline __be32 __immdt_imm(void *arg) +{ + struct rxe_immdt *immdt = arg; + + return immdt->imm; +} + +static inline void __immdt_set_imm(void *arg, __be32 imm) +{ + struct rxe_immdt *immdt = arg; + + immdt->imm = imm; +} + +static inline __be32 immdt_imm(struct rxe_pkt_info *pkt) +{ + return __immdt_imm(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IMMDT]); +} + +static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm) +{ + __immdt_set_imm(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm); +} + +/****************************************************************************** + * Invalidate Extended Transport Header + ******************************************************************************/ +struct rxe_ieth { + __be32 rkey; +}; + +static inline u32 __ieth_rkey(void *arg) +{ + struct rxe_ieth *ieth = arg; + + return be32_to_cpu(ieth->rkey); +} + +static inline void __ieth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_ieth *ieth = arg; + + ieth->rkey = cpu_to_be32(rkey); +} + +static inline u32 ieth_rkey(struct rxe_pkt_info *pkt) +{ + return __ieth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IETH]); +} + +static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __ieth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey); +} + +enum rxe_hdr_length { + RXE_BTH_BYTES = sizeof(struct rxe_bth), + RXE_DETH_BYTES = sizeof(struct rxe_deth), + RXE_IMMDT_BYTES = sizeof(struct rxe_immdt), + RXE_RETH_BYTES = sizeof(struct rxe_reth), + RXE_AETH_BYTES = sizeof(struct rxe_aeth), + RXE_ATMACK_BYTES = sizeof(struct rxe_atmack), + RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth), + RXE_IETH_BYTES = sizeof(struct rxe_ieth), + RXE_RDETH_BYTES = sizeof(struct 
rxe_rdeth), +}; + +static inline size_t header_size(struct rxe_pkt_info *pkt) +{ + return pkt->offset + rxe_opcode[pkt->opcode].length; +} + +static inline void *payload_addr(struct rxe_pkt_info *pkt) +{ + return pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]; +} + +static inline size_t payload_size(struct rxe_pkt_info *pkt) +{ + return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD] + - bth_pad(pkt) - RXE_ICRC_SIZE; +} + +#endif /* RXE_HDR_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c new file mode 100644 index 0000000..6aeb7a1 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_hw_counters.h" + +static const char * const rxe_counter_name[] = { + [RXE_CNT_SENT_PKTS] = "sent_pkts", + [RXE_CNT_RCVD_PKTS] = "rcvd_pkts", + [RXE_CNT_DUP_REQ] = "duplicate_request", + [RXE_CNT_OUT_OF_SEQ_REQ] = "out_of_sequence", + [RXE_CNT_RCV_RNR] = "rcvd_rnr_err", + [RXE_CNT_SND_RNR] = "send_rnr_err", + [RXE_CNT_RCV_SEQ_ERR] = "rcvd_seq_err", + [RXE_CNT_COMPLETER_SCHED] = "ack_deffered", + [RXE_CNT_RETRY_EXCEEDED] = "retry_exceeded_err", + [RXE_CNT_RNR_RETRY_EXCEEDED] = "retry_rnr_exceeded_err", + [RXE_CNT_COMP_RETRY] = "completer_retry_err", + [RXE_CNT_SEND_ERR] = "send_err", +}; + +int rxe_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index) +{ + struct rxe_dev *dev = to_rdev(ibdev); + unsigned int cnt; + + if (!port || !stats) + return -EINVAL; + + for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_name); cnt++) + stats->value[cnt] = dev->stats_counters[cnt]; + + return ARRAY_SIZE(rxe_counter_name); +} + +struct rdma_hw_stats *rxe_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(rxe_counter_name) != RXE_NUM_OF_COUNTERS); + /* We support only per port stats */ + if (!port_num) + return NULL; + + return rdma_alloc_hw_stats_struct(rxe_counter_name, + ARRAY_SIZE(rxe_counter_name), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.h b/drivers/infiniband/sw/rxe/rxe_hw_counters.h new file mode 100644 index 0000000..f44df1b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_HW_COUNTERS_H +#define RXE_HW_COUNTERS_H + +/* + * when adding counters to enum also add + * them to rxe_counter_name[] vector. 
+ */ +enum rxe_counters { + RXE_CNT_SENT_PKTS, + RXE_CNT_RCVD_PKTS, + RXE_CNT_DUP_REQ, + RXE_CNT_OUT_OF_SEQ_REQ, + RXE_CNT_RCV_RNR, + RXE_CNT_SND_RNR, + RXE_CNT_RCV_SEQ_ERR, + RXE_CNT_COMPLETER_SCHED, + RXE_CNT_RETRY_EXCEEDED, + RXE_CNT_RNR_RETRY_EXCEEDED, + RXE_CNT_COMP_RETRY, + RXE_CNT_SEND_ERR, + RXE_NUM_OF_COUNTERS +}; + +struct rdma_hw_stats *rxe_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num); +int rxe_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index); +#endif /* RXE_HW_COUNTERS_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c new file mode 100644 index 0000000..39e0be3 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* Compute a partial ICRC for all the IB transport headers. */ +u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + unsigned int bth_offset = 0; + struct iphdr *ip4h = NULL; + struct ipv6hdr *ip6h = NULL; + struct udphdr *udph; + struct rxe_bth *bth; + int crc; + int length; + int hdr_size = sizeof(struct udphdr) + + (skb->protocol == htons(ETH_P_IP) ? + sizeof(struct iphdr) : sizeof(struct ipv6hdr)); + /* pseudo header buffer size is calculate using ipv6 header size since + * it is bigger than ipv4 + */ + u8 pshdr[sizeof(struct udphdr) + + sizeof(struct ipv6hdr) + + RXE_BTH_BYTES]; + + /* This seed is the result of computing a CRC with a seed of + * 0xfffffff and 8 bytes of 0xff representing a masked LRH. 
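+ * RoCE frames carry no LRH, so eight bytes of 0xff stand in for the
+ * masked link header and are folded into the seed up front instead of
+ * being re-CRC'd for every packet.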
+ */ + crc = 0xdebb20e3; + + if (skb->protocol == htons(ETH_P_IP)) { /* IPv4 */ + memcpy(pshdr, ip_hdr(skb), hdr_size); + ip4h = (struct iphdr *)pshdr; + udph = (struct udphdr *)(ip4h + 1); + + ip4h->ttl = 0xff; + ip4h->check = CSUM_MANGLED_0; + ip4h->tos = 0xff; + } else { /* IPv6 */ + memcpy(pshdr, ipv6_hdr(skb), hdr_size); + ip6h = (struct ipv6hdr *)pshdr; + udph = (struct udphdr *)(ip6h + 1); + + memset(ip6h->flow_lbl, 0xff, sizeof(ip6h->flow_lbl)); + ip6h->priority = 0xf; + ip6h->hop_limit = 0xff; + } + udph->check = CSUM_MANGLED_0; + + bth_offset += hdr_size; + + memcpy(&pshdr[bth_offset], pkt->hdr, RXE_BTH_BYTES); + bth = (struct rxe_bth *)&pshdr[bth_offset]; + + /* exclude bth.resv8a */ + bth->qpn |= cpu_to_be32(~BTH_QPN_MASK); + + length = hdr_size + RXE_BTH_BYTES; + crc = rxe_crc32(pkt->rxe, crc, pshdr, length); + + /* And finish to compute the CRC on the remainder of the headers. */ + crc = rxe_crc32(pkt->rxe, crc, pkt->hdr + RXE_BTH_BYTES, + rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES); + return crc; +} diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h new file mode 100644 index 0000000..b71023c --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef RXE_LOC_H +#define RXE_LOC_H + +/* rxe_av.c */ + +int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr); + +void rxe_av_from_attr(u8 port_num, struct rxe_av *av, + struct rdma_ah_attr *attr); + +void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr); + +void rxe_av_fill_ip_info(struct rxe_av *av, + struct rdma_ah_attr *attr, + struct ib_gid_attr *sgid_attr, + union ib_gid *sgid); + +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt); + +/* rxe_cq.c */ +int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, + int cqe, int comp_vector); + +int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, + int comp_vector, struct ib_ucontext *context, + struct rxe_create_cq_resp __user *uresp); + +int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, + struct rxe_resize_cq_resp __user *uresp); + +int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited); + +void rxe_cq_disable(struct rxe_cq *cq); + +void rxe_cq_cleanup(struct rxe_pool_entry *arg); + +/* rxe_mcast.c */ +int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, + struct rxe_mc_grp **grp_p); + +int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + struct rxe_mc_grp *grp); + +int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + union ib_gid *mgid); + +void rxe_drop_all_mcast_groups(struct rxe_qp *qp); + +void rxe_mc_cleanup(struct rxe_pool_entry *arg); + +/* rxe_mmap.c */ +struct rxe_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + struct kref ref; + void *obj; + + struct mminfo info; +}; + +void rxe_mmap_release(struct kref *ref); + +struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev, + u32 size, + struct ib_ucontext *context, + void *obj); + +int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +/* rxe_mr.c */ +enum copy_direction { + to_mem_obj, + from_mem_obj, +}; + +int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd, + int access, struct rxe_mem *mem); + +int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start, + u64 length, u64 iova, int access, struct ib_udata *udata, + struct rxe_mem *mr); + +int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd, + int max_pages, struct rxe_mem *mem); + +int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, + int length, enum copy_direction dir, u32 *crcp); + +int copy_data(struct rxe_dev *rxe, struct rxe_pd *pd, int access, + struct rxe_dma_info *dma, void *addr, int length, + enum copy_direction dir, u32 *crcp); + +void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length); + +enum lookup_type { + lookup_local, + lookup_remote, +}; + +struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key, + enum lookup_type type); + +int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length); + +int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem, + u64 *page, int num_pages, u64 iova); + +void rxe_mem_cleanup(struct rxe_pool_entry *arg); + +int advance_dma_data(struct rxe_dma_info *dma, unsigned int length); + +/* rxe_net.c */ +int rxe_loopback(struct sk_buff *skb); +int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb); +struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, + int paylen, struct rxe_pkt_info *pkt); +int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb, u32 *crc); +enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num); +const char *rxe_parent_name(struct rxe_dev *rxe, 
unsigned int port_num); +struct device *rxe_dma_device(struct rxe_dev *rxe); +int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid); +int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid); + +/* rxe_qp.c */ +int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init); + +int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, + struct ib_qp_init_attr *init, + struct rxe_create_qp_resp __user *uresp, + struct ib_pd *ibpd); + +int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init); + +int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_attr *attr, int mask); + +int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata); + +int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask); + +void rxe_qp_error(struct rxe_qp *qp); + +void rxe_qp_destroy(struct rxe_qp *qp); + +void rxe_qp_cleanup(struct rxe_pool_entry *arg); + +static inline int qp_num(struct rxe_qp *qp) +{ + return qp->ibqp.qp_num; +} + +static inline enum ib_qp_type qp_type(struct rxe_qp *qp) +{ + return qp->ibqp.qp_type; +} + +static inline enum ib_qp_state qp_state(struct rxe_qp *qp) +{ + return qp->attr.qp_state; +} + +static inline int qp_mtu(struct rxe_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) + return qp->attr.path_mtu; + else + return RXE_PORT_MAX_MTU; +} + +static inline int rcv_wqe_size(int max_sge) +{ + return sizeof(struct rxe_recv_wqe) + + max_sge * sizeof(struct ib_sge); +} + +void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res); + +static inline void rxe_advance_resp_resource(struct rxe_qp *qp) +{ + qp->resp.res_head++; + if (unlikely(qp->resp.res_head == qp->attr.max_dest_rd_atomic)) + qp->resp.res_head = 0; +} + +void retransmit_timer(struct timer_list *t); +void rnr_nak_timer(struct timer_list *t); + +/* rxe_srq.c */ +#define IB_SRQ_INIT_MASK (~IB_SRQ_LIMIT) + +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask); + +int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_init_attr *init, + struct ib_ucontext *context, + struct rxe_create_srq_resp __user *uresp); + +int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, + struct rxe_modify_srq_cmd *ucmd); + +void rxe_release(struct kref *kref); + +int rxe_completer(void *arg); +int rxe_requester(void *arg); +int rxe_responder(void *arg); + +u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb); + +void rxe_resp_queue_pkt(struct rxe_dev *rxe, + struct rxe_qp *qp, struct sk_buff *skb); + +void rxe_comp_queue_pkt(struct rxe_dev *rxe, + struct rxe_qp *qp, struct sk_buff *skb); + +static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) +{ + return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type]; +} + +static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp, + struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + int err; + int is_request = pkt->mask & RXE_REQ_MASK; + + if ((is_request && (qp->req.state != QP_STATE_READY)) || + (!is_request && (qp->resp.state != QP_STATE_READY))) { + pr_info("Packet dropped. 
QP is not in ready state\n"); + goto drop; + } + + if (pkt->mask & RXE_LOOPBACK_MASK) { + memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); + err = rxe_loopback(skb); + } else { + err = rxe_send(pkt, skb); + } + + if (err) { + rxe->xmit_errors++; + rxe_counter_inc(rxe, RXE_CNT_SEND_ERR); + return err; + } + + if ((qp_type(qp) != IB_QPT_RC) && + (pkt->mask & RXE_END_MASK)) { + pkt->wqe->state = wqe_state_done; + rxe_run_task(&qp->comp.task, 1); + } + + rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); + goto done; + +drop: + kfree_skb(skb); + err = 0; +done: + return err; +} + +#endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c new file mode 100644 index 0000000..522a794 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_loc.h" + +int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, + struct rxe_mc_grp **grp_p) +{ + int err; + struct rxe_mc_grp *grp; + + if (rxe->attr.max_mcast_qp_attach == 0) { + err = -EINVAL; + goto err1; + } + + grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid); + if (grp) + goto done; + + grp = rxe_alloc(&rxe->mc_grp_pool); + if (!grp) { + err = -ENOMEM; + goto err1; + } + + INIT_LIST_HEAD(&grp->qp_list); + spin_lock_init(&grp->mcg_lock); + grp->rxe = rxe; + + rxe_add_key(grp, mgid); + + err = rxe_mcast_add(rxe, mgid); + if (err) + goto err2; + +done: + *grp_p = grp; + return 0; + +err2: + rxe_drop_ref(grp); +err1: + return err; +} + +int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + struct rxe_mc_grp *grp) +{ + int err; + struct rxe_mc_elem *elem; + + /* check to see of the qp is already a member of the group */ + spin_lock_bh(&qp->grp_lock); + spin_lock_bh(&grp->mcg_lock); + list_for_each_entry(elem, &grp->qp_list, qp_list) { + if (elem->qp == qp) { + err = 0; + goto out; + } + } + + if (grp->num_qp >= rxe->attr.max_mcast_qp_attach) { + err = -ENOMEM; + goto out; + } + + elem = rxe_alloc(&rxe->mc_elem_pool); + if (!elem) { + err = -ENOMEM; + goto out; + } + + /* each qp holds a ref on the grp */ + rxe_add_ref(grp); + + grp->num_qp++; + elem->qp = qp; + elem->grp = grp; + + list_add(&elem->qp_list, &grp->qp_list); + list_add(&elem->grp_list, &qp->grp_list); + + err = 0; +out: + spin_unlock_bh(&grp->mcg_lock); + spin_unlock_bh(&qp->grp_lock); + return err; +} + +int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + union ib_gid *mgid) +{ + struct rxe_mc_grp *grp; + struct rxe_mc_elem *elem, *tmp; + + grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid); + if (!grp) + goto err1; + + spin_lock_bh(&qp->grp_lock); + spin_lock_bh(&grp->mcg_lock); + + list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) { + if (elem->qp == qp) { + list_del(&elem->qp_list); + list_del(&elem->grp_list); + grp->num_qp--; + + spin_unlock_bh(&grp->mcg_lock); + spin_unlock_bh(&qp->grp_lock); + rxe_drop_ref(elem); + rxe_drop_ref(grp); /* ref held by QP */ + rxe_drop_ref(grp); /* ref from get_key */ + return 0; + } + } + + spin_unlock_bh(&grp->mcg_lock); + spin_unlock_bh(&qp->grp_lock); + rxe_drop_ref(grp); /* ref from get_key */ +err1: + return -EINVAL; +} + +void rxe_drop_all_mcast_groups(struct rxe_qp *qp) +{ + struct rxe_mc_grp *grp; + struct rxe_mc_elem *elem; + + while (1) { + spin_lock_bh(&qp->grp_lock); + if (list_empty(&qp->grp_list)) { + spin_unlock_bh(&qp->grp_lock); + break; + } + elem = list_first_entry(&qp->grp_list, struct rxe_mc_elem, + grp_list); + list_del(&elem->grp_list); + spin_unlock_bh(&qp->grp_lock); + + grp = elem->grp; + spin_lock_bh(&grp->mcg_lock); + list_del(&elem->qp_list); + grp->num_qp--; + spin_unlock_bh(&grp->mcg_lock); + rxe_drop_ref(grp); + rxe_drop_ref(elem); + } +} + +void rxe_mc_cleanup(struct rxe_pool_entry *arg) +{ + struct rxe_mc_grp *grp = container_of(arg, typeof(*grp), pelem); + struct rxe_dev *rxe = grp->rxe; + + rxe_drop_key(grp); + rxe_mcast_delete(rxe, &grp->mgid); +} diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c new file mode 100644 index 0000000..d22431e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mmap.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +void rxe_mmap_release(struct kref *ref) +{ + struct rxe_mmap_info *ip = container_of(ref, + struct rxe_mmap_info, ref); + struct rxe_dev *rxe = to_rdev(ip->context->device); + + spin_lock_bh(&rxe->pending_lock); + + if (!list_empty(&ip->pending_mmaps)) + list_del(&ip->pending_mmaps); + + spin_unlock_bh(&rxe->pending_lock); + + vfree(ip->obj); /* buf */ + kfree(ip); +} + +/* + * open and close keep track of how many times the memory region is mapped, + * to avoid releasing it. + */ +static void rxe_vma_open(struct vm_area_struct *vma) +{ + struct rxe_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void rxe_vma_close(struct vm_area_struct *vma) +{ + struct rxe_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, rxe_mmap_release); +} + +static const struct vm_operations_struct rxe_vm_ops = { + .open = rxe_vma_open, + .close = rxe_vma_close, +}; + +/** + * rxe_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct rxe_dev *rxe = to_rdev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct rxe_mmap_info *ip, *pp; + int ret; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_bh(&rxe->pending_lock); + list_for_each_entry_safe(ip, pp, &rxe->pending_mmaps, pending_mmaps) { + if (context != ip->context || (__u64)offset != ip->info.offset) + continue; + + /* Don't allow a mmap larger than the object. 
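+ * ip->info.size records how much was allocated for the queue buffer,
+ * so a larger request cannot be backed by the object.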
*/ + if (size > ip->info.size) { + pr_err("mmap region is larger than the object!\n"); + spin_unlock_bh(&rxe->pending_lock); + ret = -EINVAL; + goto done; + } + + goto found_it; + } + pr_warn("unable to find pending mmap info\n"); + spin_unlock_bh(&rxe->pending_lock); + ret = -EINVAL; + goto done; + +found_it: + list_del_init(&ip->pending_mmaps); + spin_unlock_bh(&rxe->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) { + pr_err("err %d from remap_vmalloc_range\n", ret); + goto done; + } + + vma->vm_ops = &rxe_vm_ops; + vma->vm_private_data = ip; + rxe_vma_open(vma); +done: + return ret; +} + +/* + * Allocate information for rxe_mmap + */ +struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe, + u32 size, + struct ib_ucontext *context, + void *obj) +{ + struct rxe_mmap_info *ip; + + ip = kmalloc(sizeof(*ip), GFP_KERNEL); + if (!ip) + return NULL; + + size = PAGE_ALIGN(size); + + spin_lock_bh(&rxe->mmap_offset_lock); + + if (rxe->mmap_offset == 0) + rxe->mmap_offset = ALIGN(PAGE_SIZE, SHMLBA); + + ip->info.offset = rxe->mmap_offset; + rxe->mmap_offset += ALIGN(size, SHMLBA); + + spin_unlock_bh(&rxe->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->info.size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + + return ip; +} diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c new file mode 100644 index 0000000..5c2684b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -0,0 +1,648 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* + * lfsr (linear feedback shift register) with period 255 + */ +static u8 rxe_get_key(void) +{ + static u32 key = 1; + + key = key << 1; + + key |= (0 != (key & 0x100)) ^ (0 != (key & 0x10)) + ^ (0 != (key & 0x80)) ^ (0 != (key & 0x40)); + + key &= 0xff; + + return key; +} + +int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length) +{ + switch (mem->type) { + case RXE_MEM_TYPE_DMA: + return 0; + + case RXE_MEM_TYPE_MR: + case RXE_MEM_TYPE_FMR: + if (iova < mem->iova || + length > mem->length || + iova > mem->iova + mem->length - length) + return -EFAULT; + return 0; + + default: + return -EFAULT; + } +} + +#define IB_ACCESS_REMOTE (IB_ACCESS_REMOTE_READ \ + | IB_ACCESS_REMOTE_WRITE \ + | IB_ACCESS_REMOTE_ATOMIC) + +static void rxe_mem_init(int access, struct rxe_mem *mem) +{ + u32 lkey = mem->pelem.index << 8 | rxe_get_key(); + u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0; + + if (mem->pelem.pool->type == RXE_TYPE_MR) { + mem->ibmr.lkey = lkey; + mem->ibmr.rkey = rkey; + } + + mem->lkey = lkey; + mem->rkey = rkey; + mem->state = RXE_MEM_STATE_INVALID; + mem->type = RXE_MEM_TYPE_NONE; + mem->map_shift = ilog2(RXE_BUF_PER_MAP); +} + +void rxe_mem_cleanup(struct rxe_pool_entry *arg) +{ + struct rxe_mem *mem = container_of(arg, typeof(*mem), pelem); + int i; + + if (mem->umem) + ib_umem_release(mem->umem); + + if (mem->map) { + for (i = 0; i < mem->num_map; i++) + kfree(mem->map[i]); + + kfree(mem->map); + } +} + +static int rxe_mem_alloc(struct rxe_dev *rxe, struct rxe_mem *mem, int num_buf) +{ + int i; + int num_map; + struct rxe_map **map = mem->map; + + num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP; + + mem->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL); + if (!mem->map) + goto err1; + + for (i = 0; i < num_map; i++) { + mem->map[i] = kmalloc(sizeof(**map), GFP_KERNEL); + if (!mem->map[i]) + goto err2; + } + + BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP)); + + mem->map_shift = ilog2(RXE_BUF_PER_MAP); + mem->map_mask = RXE_BUF_PER_MAP - 1; + + mem->num_buf = num_buf; + mem->num_map = num_map; + mem->max_buf = num_map * RXE_BUF_PER_MAP; + + return 0; + +err2: + for (i--; i >= 0; i--) + kfree(mem->map[i]); + + kfree(mem->map); +err1: + return -ENOMEM; +} + +int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd, + int access, struct rxe_mem *mem) +{ + rxe_mem_init(access, mem); + + mem->pd = pd; + mem->access = access; + mem->state = RXE_MEM_STATE_VALID; + mem->type = RXE_MEM_TYPE_DMA; + + return 0; +} + +int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start, + u64 length, u64 iova, int access, struct ib_udata *udata, + struct rxe_mem *mem) +{ + int entry; + struct rxe_map **map; + struct rxe_phys_buf *buf = NULL; + struct ib_umem *umem; + struct scatterlist *sg; + int num_buf; + void *vaddr; + int err; + + umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0); + if (IS_ERR(umem)) { + pr_warn("err %d from rxe_umem_get\n", + (int)PTR_ERR(umem)); + err = -EINVAL; + goto err1; + } + + mem->umem = umem; + num_buf = umem->nmap; + + rxe_mem_init(access, mem); + + err = rxe_mem_alloc(rxe, mem, num_buf); + if (err) { + pr_warn("err %d from rxe_mem_alloc\n", err); + ib_umem_release(umem); + goto err1; + } + + mem->page_shift = umem->page_shift; + mem->page_mask = BIT(umem->page_shift) - 1; + + num_buf = 0; + map = mem->map; + if (length > 0) { + buf = map[0]->buf; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + vaddr = page_address(sg_page(sg)); + if (!vaddr) 
{ + pr_warn("null vaddr\n"); + err = -ENOMEM; + goto err1; + } + + buf->addr = (uintptr_t)vaddr; + buf->size = BIT(umem->page_shift); + num_buf++; + buf++; + + if (num_buf >= RXE_BUF_PER_MAP) { + map++; + buf = map[0]->buf; + num_buf = 0; + } + } + } + + mem->pd = pd; + mem->umem = umem; + mem->access = access; + mem->length = length; + mem->iova = iova; + mem->va = start; + mem->offset = ib_umem_offset(umem); + mem->state = RXE_MEM_STATE_VALID; + mem->type = RXE_MEM_TYPE_MR; + + return 0; + +err1: + return err; +} + +int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd, + int max_pages, struct rxe_mem *mem) +{ + int err; + + rxe_mem_init(0, mem); + + /* In fastreg, we also set the rkey */ + mem->ibmr.rkey = mem->ibmr.lkey; + + err = rxe_mem_alloc(rxe, mem, max_pages); + if (err) + goto err1; + + mem->pd = pd; + mem->max_buf = max_pages; + mem->state = RXE_MEM_STATE_FREE; + mem->type = RXE_MEM_TYPE_MR; + + return 0; + +err1: + return err; +} + +static void lookup_iova( + struct rxe_mem *mem, + u64 iova, + int *m_out, + int *n_out, + size_t *offset_out) +{ + size_t offset = iova - mem->iova + mem->offset; + int map_index; + int buf_index; + u64 length; + + if (likely(mem->page_shift)) { + *offset_out = offset & mem->page_mask; + offset >>= mem->page_shift; + *n_out = offset & mem->map_mask; + *m_out = offset >> mem->map_shift; + } else { + map_index = 0; + buf_index = 0; + + length = mem->map[map_index]->buf[buf_index].size; + + while (offset >= length) { + offset -= length; + buf_index++; + + if (buf_index == RXE_BUF_PER_MAP) { + map_index++; + buf_index = 0; + } + length = mem->map[map_index]->buf[buf_index].size; + } + + *m_out = map_index; + *n_out = buf_index; + *offset_out = offset; + } +} + +void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length) +{ + size_t offset; + int m, n; + void *addr; + + if (mem->state != RXE_MEM_STATE_VALID) { + pr_warn("mem not in valid state\n"); + addr = NULL; + goto out; + } + + if (!mem->map) { + addr = (void *)(uintptr_t)iova; + goto out; + } + + if (mem_check_range(mem, iova, length)) { + pr_warn("range violation\n"); + addr = NULL; + goto out; + } + + lookup_iova(mem, iova, &m, &n, &offset); + + if (offset + length > mem->map[m]->buf[n].size) { + pr_warn("crosses page boundary\n"); + addr = NULL; + goto out; + } + + addr = (void *)(uintptr_t)mem->map[m]->buf[n].addr + offset; + +out: + return addr; +} + +/* copy data from a range (vaddr, vaddr+length-1) to or from + * a mem object starting at iova. Compute incremental value of + * crc32 if crcp is not zero. caller must hold a reference to mem + */ +int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length, + enum copy_direction dir, u32 *crcp) +{ + int err; + int bytes; + u8 *va; + struct rxe_map **map; + struct rxe_phys_buf *buf; + int m; + int i; + size_t offset; + u32 crc = crcp ? (*crcp) : 0; + + if (length == 0) + return 0; + + if (mem->type == RXE_MEM_TYPE_DMA) { + u8 *src, *dest; + + src = (dir == to_mem_obj) ? + addr : ((void *)(uintptr_t)iova); + + dest = (dir == to_mem_obj) ? 
+ ((void *)(uintptr_t)iova) : addr; + + memcpy(dest, src, length); + + if (crcp) + *crcp = rxe_crc32(to_rdev(mem->pd->ibpd.device), + *crcp, dest, length); + + return 0; + } + + WARN_ON_ONCE(!mem->map); + + err = mem_check_range(mem, iova, length); + if (err) { + err = -EFAULT; + goto err1; + } + + lookup_iova(mem, iova, &m, &i, &offset); + + map = mem->map + m; + buf = map[0]->buf + i; + + while (length > 0) { + u8 *src, *dest; + + va = (u8 *)(uintptr_t)buf->addr + offset; + src = (dir == to_mem_obj) ? addr : va; + dest = (dir == to_mem_obj) ? va : addr; + + bytes = buf->size - offset; + + if (bytes > length) + bytes = length; + + memcpy(dest, src, bytes); + + if (crcp) + crc = rxe_crc32(to_rdev(mem->pd->ibpd.device), + crc, dest, bytes); + + length -= bytes; + addr += bytes; + + offset = 0; + buf++; + i++; + + if (i == RXE_BUF_PER_MAP) { + i = 0; + map++; + buf = map[0]->buf; + } + } + + if (crcp) + *crcp = crc; + + return 0; + +err1: + return err; +} + +/* copy data in or out of a wqe, i.e. sg list + * under the control of a dma descriptor + */ +int copy_data( + struct rxe_dev *rxe, + struct rxe_pd *pd, + int access, + struct rxe_dma_info *dma, + void *addr, + int length, + enum copy_direction dir, + u32 *crcp) +{ + int bytes; + struct rxe_sge *sge = &dma->sge[dma->cur_sge]; + int offset = dma->sge_offset; + int resid = dma->resid; + struct rxe_mem *mem = NULL; + u64 iova; + int err; + + if (length == 0) + return 0; + + if (length > resid) { + err = -EINVAL; + goto err2; + } + + if (sge->length && (offset < sge->length)) { + mem = lookup_mem(pd, access, sge->lkey, lookup_local); + if (!mem) { + err = -EINVAL; + goto err1; + } + } + + while (length > 0) { + bytes = length; + + if (offset >= sge->length) { + if (mem) { + rxe_drop_ref(mem); + mem = NULL; + } + sge++; + dma->cur_sge++; + offset = 0; + + if (dma->cur_sge >= dma->num_sge) { + err = -ENOSPC; + goto err2; + } + + if (sge->length) { + mem = lookup_mem(pd, access, sge->lkey, + lookup_local); + if (!mem) { + err = -EINVAL; + goto err1; + } + } else { + continue; + } + } + + if (bytes > sge->length - offset) + bytes = sge->length - offset; + + if (bytes > 0) { + iova = sge->addr + offset; + + err = rxe_mem_copy(mem, iova, addr, bytes, dir, crcp); + if (err) + goto err2; + + offset += bytes; + resid -= bytes; + length -= bytes; + addr += bytes; + } + } + + dma->sge_offset = offset; + dma->resid = resid; + + if (mem) + rxe_drop_ref(mem); + + return 0; + +err2: + if (mem) + rxe_drop_ref(mem); +err1: + return err; +} + +int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) +{ + struct rxe_sge *sge = &dma->sge[dma->cur_sge]; + int offset = dma->sge_offset; + int resid = dma->resid; + + while (length) { + unsigned int bytes; + + if (offset >= sge->length) { + sge++; + dma->cur_sge++; + offset = 0; + if (dma->cur_sge >= dma->num_sge) + return -ENOSPC; + } + + bytes = length; + + if (bytes > sge->length - offset) + bytes = sge->length - offset; + + offset += bytes; + resid -= bytes; + length -= bytes; + } + + dma->sge_offset = offset; + dma->resid = resid; + + return 0; +} + +/* (1) find the mem (mr or mw) corresponding to lkey/rkey + * depending on lookup_type + * (2) verify that the (qp) pd matches the mem pd + * (3) verify that the mem can support the requested access + * (4) verify that mem state is valid + */ +struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key, + enum lookup_type type) +{ + struct rxe_mem *mem; + struct rxe_dev *rxe = to_rdev(pd->ibpd.device); + int index = key >> 8; + + if (index >= 
RXE_MIN_MR_INDEX && index <= RXE_MAX_MR_INDEX) { + mem = rxe_pool_get_index(&rxe->mr_pool, index); + if (!mem) + goto err1; + } else { + goto err1; + } + + if ((type == lookup_local && mem->lkey != key) || + (type == lookup_remote && mem->rkey != key)) + goto err2; + + if (mem->pd != pd) + goto err2; + + if (access && !(access & mem->access)) + goto err2; + + if (mem->state != RXE_MEM_STATE_VALID) + goto err2; + + return mem; + +err2: + rxe_drop_ref(mem); +err1: + return NULL; +} + +int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem, + u64 *page, int num_pages, u64 iova) +{ + int i; + int num_buf; + int err; + struct rxe_map **map; + struct rxe_phys_buf *buf; + int page_size; + + if (num_pages > mem->max_buf) { + err = -EINVAL; + goto err1; + } + + num_buf = 0; + page_size = 1 << mem->page_shift; + map = mem->map; + buf = map[0]->buf; + + for (i = 0; i < num_pages; i++) { + buf->addr = *page++; + buf->size = page_size; + buf++; + num_buf++; + + if (num_buf == RXE_BUF_PER_MAP) { + map++; + buf = map[0]->buf; + num_buf = 0; + } + } + + mem->iova = iova; + mem->va = iova; + mem->length = num_pages << mem->page_shift; + mem->state = RXE_MEM_STATE_VALID; + + return 0; + +err1: + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c new file mode 100644 index 0000000..9da6e37 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -0,0 +1,770 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rxe.h" +#include "rxe_net.h" +#include "rxe_loc.h" + +static LIST_HEAD(rxe_dev_list); +static DEFINE_SPINLOCK(dev_list_lock); /* spinlock for device list */ + +struct rxe_dev *net_to_rxe(struct net_device *ndev) +{ + struct rxe_dev *rxe; + struct rxe_dev *found = NULL; + + spin_lock_bh(&dev_list_lock); + list_for_each_entry(rxe, &rxe_dev_list, list) { + if (rxe->ndev == ndev) { + found = rxe; + break; + } + } + spin_unlock_bh(&dev_list_lock); + + return found; +} + +struct rxe_dev *get_rxe_by_name(const char *name) +{ + struct rxe_dev *rxe; + struct rxe_dev *found = NULL; + + spin_lock_bh(&dev_list_lock); + list_for_each_entry(rxe, &rxe_dev_list, list) { + if (!strcmp(name, rxe->ib_dev.name)) { + found = rxe; + break; + } + } + spin_unlock_bh(&dev_list_lock); + return found; +} + + +static struct rxe_recv_sockets recv_sockets; + +struct device *rxe_dma_device(struct rxe_dev *rxe) +{ + struct net_device *ndev; + + ndev = rxe->ndev; + + if (is_vlan_dev(ndev)) + ndev = vlan_dev_real_dev(ndev); + + return ndev->dev.parent; +} + +int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) +{ + int err; + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + err = dev_mc_add(rxe->ndev, ll_addr); + + return err; +} + +int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) +{ + int err; + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + err = dev_mc_del(rxe->ndev, ll_addr); + + return err; +} + +static struct dst_entry *rxe_find_route4(struct net_device *ndev, + struct in_addr *saddr, + struct in_addr *daddr) +{ + struct rtable *rt; + struct flowi4 fl = { { 0 } }; + + memset(&fl, 0, sizeof(fl)); + fl.flowi4_oif = ndev->ifindex; + memcpy(&fl.saddr, saddr, sizeof(*saddr)); + memcpy(&fl.daddr, daddr, sizeof(*daddr)); + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(&init_net, &fl); + if (IS_ERR(rt)) { + pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr); + return NULL; + } + + return &rt->dst; +} + +#if IS_ENABLED(CONFIG_IPV6) +static struct dst_entry *rxe_find_route6(struct net_device *ndev, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + struct dst_entry *ndst; + struct flowi6 fl6 = { { 0 } }; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = ndev->ifindex; + memcpy(&fl6.saddr, saddr, sizeof(*saddr)); + memcpy(&fl6.daddr, daddr, sizeof(*daddr)); + fl6.flowi6_proto = IPPROTO_UDP; + + if (unlikely(ipv6_stub->ipv6_dst_lookup(sock_net(recv_sockets.sk6->sk), + recv_sockets.sk6->sk, &ndst, &fl6))) { + pr_err_ratelimited("no route to %pI6\n", daddr); + goto put; + } + + if (unlikely(ndst->error)) { + pr_err("no route to %pI6\n", daddr); + goto put; + } + + return ndst; +put: + dst_release(ndst); + return NULL; +} + +#else + +static struct dst_entry *rxe_find_route6(struct net_device *ndev, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + return NULL; +} + +#endif + +/* + * Derive the net_device from the av. + * For physical devices, this will just return rxe->ndev. + * But for VLAN devices, it will return the vlan dev. + * Caller should dev_put() the returned net_device. 
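+ * In the non-VLAN case an extra dev_hold() is taken on rxe->ndev so
+ * that the caller can release the device unconditionally.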
+ */ +static struct net_device *rxe_netdev_from_av(struct rxe_dev *rxe, + int port_num, + struct rxe_av *av) +{ + union ib_gid gid; + struct ib_gid_attr attr; + struct net_device *ndev = rxe->ndev; + + if (ib_get_cached_gid(&rxe->ib_dev, port_num, av->grh.sgid_index, + &gid, &attr) == 0 && + attr.ndev && attr.ndev != ndev) + ndev = attr.ndev; + else + /* Only to ensure that caller may call dev_put() */ + dev_hold(ndev); + + return ndev; +} + +static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, + struct rxe_qp *qp, + struct rxe_av *av) +{ + struct dst_entry *dst = NULL; + struct net_device *ndev; + + ndev = rxe_netdev_from_av(rxe, qp->attr.port_num, av); + + if (qp_type(qp) == IB_QPT_RC) + dst = sk_dst_get(qp->sk->sk); + + if (!dst || !dst_check(dst, qp->dst_cookie)) { + if (dst) + dst_release(dst); + + if (av->network_type == RDMA_NETWORK_IPV4) { + struct in_addr *saddr; + struct in_addr *daddr; + + saddr = &av->sgid_addr._sockaddr_in.sin_addr; + daddr = &av->dgid_addr._sockaddr_in.sin_addr; + dst = rxe_find_route4(ndev, saddr, daddr); + } else if (av->network_type == RDMA_NETWORK_IPV6) { + struct in6_addr *saddr6; + struct in6_addr *daddr6; + + saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; + daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; + dst = rxe_find_route6(ndev, saddr6, daddr6); +#if IS_ENABLED(CONFIG_IPV6) + if (dst) + qp->dst_cookie = + rt6_get_cookie((struct rt6_info *)dst); +#endif + } + } + + dev_put(ndev); + return dst; +} + +static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct udphdr *udph; + struct net_device *ndev = skb->dev; + struct net_device *rdev = ndev; + struct rxe_dev *rxe = net_to_rxe(ndev); + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + if (!rxe && is_vlan_dev(rdev)) { + rdev = vlan_dev_real_dev(ndev); + rxe = net_to_rxe(rdev); + } + if (!rxe) + goto drop; + + if (skb_linearize(skb)) { + pr_err("skb_linearize failed\n"); + goto drop; + } + + udph = udp_hdr(skb); + pkt->rxe = rxe; + pkt->port_num = 1; + pkt->hdr = (u8 *)(udph + 1); + pkt->mask = RXE_GRH_MASK; + pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); + + return rxe_rcv(skb); +drop: + kfree_skb(skb); + return 0; +} + +static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, + bool ipv6) +{ + int err; + struct socket *sock; + struct udp_port_cfg udp_cfg = { }; + struct udp_tunnel_sock_cfg tnl_cfg = { }; + + if (ipv6) { + udp_cfg.family = AF_INET6; + udp_cfg.ipv6_v6only = 1; + } else { + udp_cfg.family = AF_INET; + } + + udp_cfg.local_udp_port = port; + + /* Create UDP socket */ + err = udp_sock_create(net, &udp_cfg, &sock); + if (err < 0) { + pr_err("failed to create udp socket. 
err = %d\n", err); + return ERR_PTR(err); + } + + tnl_cfg.encap_type = 1; + tnl_cfg.encap_rcv = rxe_udp_encap_recv; + + /* Setup UDP tunnel */ + setup_udp_tunnel_sock(net, sock, &tnl_cfg); + + return sock; +} + +void rxe_release_udp_tunnel(struct socket *sk) +{ + if (sk) + udp_tunnel_sock_release(sk); +} + +static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port, + __be16 dst_port) +{ + struct udphdr *udph; + + __skb_push(skb, sizeof(*udph)); + skb_reset_transport_header(skb); + udph = udp_hdr(skb); + + udph->dest = dst_port; + udph->source = src_port; + udph->len = htons(skb->len); + udph->check = 0; +} + +static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb, + __be32 saddr, __be32 daddr, __u8 proto, + __u8 tos, __u8 ttl, __be16 df, bool xnet) +{ + struct iphdr *iph; + + skb_scrub_packet(skb, xnet); + + skb_clear_hash(skb); + skb_dst_set(skb, dst_clone(dst)); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + + iph->version = IPVERSION; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = proto; + iph->tos = tos; + iph->daddr = daddr; + iph->saddr = saddr; + iph->ttl = ttl; + __ip_select_ident(dev_net(dst->dev), iph, + skb_shinfo(skb)->gso_segs ?: 1); + iph->tot_len = htons(skb->len); + ip_send_check(iph); +} + +static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, + struct in6_addr *saddr, struct in6_addr *daddr, + __u8 proto, __u8 prio, __u8 ttl) +{ + struct ipv6hdr *ip6h; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED + | IPSKB_REROUTED); + skb_dst_set(skb, dst_clone(dst)); + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = proto; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); +} + +static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb, struct rxe_av *av) +{ + struct rxe_qp *qp = pkt->qp; + struct dst_entry *dst; + bool xnet = false; + __be16 df = htons(IP_DF); + struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; + struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; + + dst = rxe_find_route(rxe, qp, av); + if (!dst) { + pr_err("Host not reachable\n"); + return -EHOSTUNREACH; + } + + if (!memcmp(saddr, daddr, sizeof(*daddr))) + pkt->mask |= RXE_LOOPBACK_MASK; + + prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), + htons(ROCE_V2_UDP_DPORT)); + + prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, + av->grh.traffic_class, av->grh.hop_limit, df, xnet); + + if (qp_type(qp) == IB_QPT_RC) + sk_dst_set(qp->sk->sk, dst); + else + dst_release(dst); + + return 0; +} + +static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb, struct rxe_av *av) +{ + struct rxe_qp *qp = pkt->qp; + struct dst_entry *dst; + struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; + struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; + + dst = rxe_find_route(rxe, qp, av); + if (!dst) { + pr_err("Host not reachable\n"); + return -EHOSTUNREACH; + } + + if (!memcmp(saddr, daddr, sizeof(*daddr))) + pkt->mask |= RXE_LOOPBACK_MASK; + + prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), + htons(ROCE_V2_UDP_DPORT)); + + prepare_ipv6_hdr(dst, skb, 
saddr, daddr, IPPROTO_UDP, + av->grh.traffic_class, + av->grh.hop_limit); + + if (qp_type(qp) == IB_QPT_RC) + sk_dst_set(qp->sk->sk, dst); + else + dst_release(dst); + + return 0; +} + +int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct sk_buff *skb, u32 *crc) +{ + int err = 0; + struct rxe_av *av = rxe_get_av(pkt); + + if (av->network_type == RDMA_NETWORK_IPV4) + err = prepare4(rxe, pkt, skb, av); + else if (av->network_type == RDMA_NETWORK_IPV6) + err = prepare6(rxe, pkt, skb, av); + + *crc = rxe_icrc_hdr(pkt, skb); + + return err; +} + +static void rxe_skb_tx_dtor(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct rxe_qp *qp = sk->sk_user_data; + int skb_out = atomic_dec_return(&qp->skb_out); + + if (unlikely(qp->need_req_skb && + skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) + rxe_run_task(&qp->req.task, 1); + + rxe_drop_ref(qp); +} + +int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + struct rxe_av *av; + int err; + + av = rxe_get_av(pkt); + + skb->destructor = rxe_skb_tx_dtor; + skb->sk = pkt->qp->sk->sk; + + rxe_add_ref(pkt->qp); + atomic_inc(&pkt->qp->skb_out); + + if (av->network_type == RDMA_NETWORK_IPV4) { + err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); + } else if (av->network_type == RDMA_NETWORK_IPV6) { + err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); + } else { + pr_err("Unknown layer 3 protocol: %d\n", av->network_type); + atomic_dec(&pkt->qp->skb_out); + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + return -EINVAL; + } + + if (unlikely(net_xmit_eval(err))) { + pr_debug("error sending packet: %d\n", err); + return -EAGAIN; + } + + return 0; +} + +int rxe_loopback(struct sk_buff *skb) +{ + return rxe_rcv(skb); +} + +static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av) +{ + return rxe->port.port_guid == av->grh.dgid.global.interface_id; +} + +struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, + int paylen, struct rxe_pkt_info *pkt) +{ + unsigned int hdr_len; + struct sk_buff *skb; + struct net_device *ndev; + const int port_num = 1; + + ndev = rxe_netdev_from_av(rxe, port_num, av); + + if (av->network_type == RDMA_NETWORK_IPV4) + hdr_len = ETH_HLEN + sizeof(struct udphdr) + + sizeof(struct iphdr); + else + hdr_len = ETH_HLEN + sizeof(struct udphdr) + + sizeof(struct ipv6hdr); + + skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev), + GFP_ATOMIC); + + if (unlikely(!skb)) { + dev_put(ndev); + return NULL; + } + + skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev)); + + skb->dev = ndev; + if (av->network_type == RDMA_NETWORK_IPV4) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); + + pkt->rxe = rxe; + pkt->port_num = port_num; + pkt->hdr = skb_put(skb, paylen); + pkt->mask |= RXE_GRH_MASK; + + memset(pkt->hdr, 0, paylen); + + dev_put(ndev); + return skb; +} + +/* + * this is required by rxe_cfg to match rxe devices in + * /sys/class/infiniband up with their underlying ethernet devices + */ +const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) +{ + return rxe->ndev->name; +} + +enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +struct rxe_dev *rxe_net_add(struct net_device *ndev) +{ + int err; + struct rxe_dev *rxe = NULL; + + rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe)); + if (!rxe) + return NULL; + + rxe->ndev = ndev; + + err = rxe_add(rxe, ndev->mtu); + if (err) { + ib_dealloc_device(&rxe->ib_dev); + return NULL; + } + + 
spin_lock_bh(&dev_list_lock); + list_add_tail(&rxe->list, &rxe_dev_list); + spin_unlock_bh(&dev_list_lock); + return rxe; +} + +void rxe_remove_all(void) +{ + spin_lock_bh(&dev_list_lock); + while (!list_empty(&rxe_dev_list)) { + struct rxe_dev *rxe = + list_first_entry(&rxe_dev_list, struct rxe_dev, list); + + list_del(&rxe->list); + spin_unlock_bh(&dev_list_lock); + rxe_remove(rxe); + spin_lock_bh(&dev_list_lock); + } + spin_unlock_bh(&dev_list_lock); +} +EXPORT_SYMBOL(rxe_remove_all); + +static void rxe_port_event(struct rxe_dev *rxe, + enum ib_event_type event) +{ + struct ib_event ev; + + ev.device = &rxe->ib_dev; + ev.element.port_num = 1; + ev.event = event; + + ib_dispatch_event(&ev); +} + +/* Caller must hold net_info_lock */ +void rxe_port_up(struct rxe_dev *rxe) +{ + struct rxe_port *port; + + port = &rxe->port; + port->attr.state = IB_PORT_ACTIVE; + port->attr.phys_state = IB_PHYS_STATE_LINK_UP; + + rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); + pr_info("set %s active\n", rxe->ib_dev.name); +} + +/* Caller must hold net_info_lock */ +void rxe_port_down(struct rxe_dev *rxe) +{ + struct rxe_port *port; + + port = &rxe->port; + port->attr.state = IB_PORT_DOWN; + port->attr.phys_state = IB_PHYS_STATE_LINK_DOWN; + + rxe_port_event(rxe, IB_EVENT_PORT_ERR); + pr_info("set %s down\n", rxe->ib_dev.name); +} + +static int rxe_notify(struct notifier_block *not_blk, + unsigned long event, + void *arg) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(arg); + struct rxe_dev *rxe = net_to_rxe(ndev); + + if (!rxe) + goto out; + + switch (event) { + case NETDEV_UNREGISTER: + list_del(&rxe->list); + rxe_remove(rxe); + break; + case NETDEV_UP: + rxe_port_up(rxe); + break; + case NETDEV_DOWN: + rxe_port_down(rxe); + break; + case NETDEV_CHANGEMTU: + pr_info("%s changed mtu to %d\n", ndev->name, ndev->mtu); + rxe_set_mtu(rxe, ndev->mtu); + break; + case NETDEV_CHANGE: + if (netif_running(ndev) && netif_carrier_ok(ndev)) + rxe_port_up(rxe); + else + rxe_port_down(rxe); + break; + case NETDEV_REBOOT: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + default: + pr_info("ignoring netdev event = %ld for %s\n", + event, ndev->name); + break; + } +out: + return NOTIFY_OK; +} + +struct notifier_block rxe_net_notifier = { + .notifier_call = rxe_notify, +}; + +static int rxe_net_ipv4_init(void) +{ + recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, + htons(ROCE_V2_UDP_DPORT), false); + if (IS_ERR(recv_sockets.sk4)) { + recv_sockets.sk4 = NULL; + pr_err("Failed to create IPv4 UDP tunnel\n"); + return -1; + } + + return 0; +} + +static int rxe_net_ipv6_init(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + + recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, + htons(ROCE_V2_UDP_DPORT), true); + if (IS_ERR(recv_sockets.sk6)) { + recv_sockets.sk6 = NULL; + pr_err("Failed to create IPv6 UDP tunnel\n"); + return -1; + } +#endif + return 0; +} + +void rxe_net_exit(void) +{ + rxe_release_udp_tunnel(recv_sockets.sk6); + rxe_release_udp_tunnel(recv_sockets.sk4); + unregister_netdevice_notifier(&rxe_net_notifier); +} + +int rxe_net_init(void) +{ + int err; + + recv_sockets.sk6 = NULL; + + err = rxe_net_ipv4_init(); + if (err) + return err; + err = rxe_net_ipv6_init(); + if (err) + goto err_out; + err = register_netdevice_notifier(&rxe_net_notifier); + if (err) { + pr_err("Failed to register netdev notifier\n"); + goto err_out; + } + return 0; +err_out: + rxe_net_exit(); + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_net.h 
b/drivers/infiniband/sw/rxe/rxe_net.h new file mode 100644 index 0000000..728d8c7 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_NET_H +#define RXE_NET_H + +#include +#include +#include + +struct rxe_recv_sockets { + struct socket *sk4; + struct socket *sk6; +}; + +extern struct notifier_block rxe_net_notifier; +void rxe_release_udp_tunnel(struct socket *sk); + +struct rxe_dev *rxe_net_add(struct net_device *ndev); + +int rxe_net_init(void); +void rxe_net_exit(void); + +#endif /* RXE_NET_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c new file mode 100644 index 0000000..4cf1106 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c @@ -0,0 +1,961 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "rxe_opcode.h" +#include "rxe_hdr.h" + +/* useful information about work request opcodes and pkt opcodes in + * table form + */ +struct rxe_wr_opcode_info rxe_wr_opcode_info[] = { + [IB_WR_RDMA_WRITE] = { + .name = "IB_WR_RDMA_WRITE", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK, + }, + }, + [IB_WR_RDMA_WRITE_WITH_IMM] = { + .name = "IB_WR_RDMA_WRITE_WITH_IMM", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK, + }, + }, + [IB_WR_SEND] = { + .name = "IB_WR_SEND", + .mask = { + [IB_QPT_SMI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_SEND_WITH_IMM] = { + .name = "IB_WR_SEND_WITH_IMM", + .mask = { + [IB_QPT_SMI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_RDMA_READ] = { + .name = "IB_WR_RDMA_READ", + .mask = { + [IB_QPT_RC] = WR_READ_MASK, + }, + }, + [IB_WR_ATOMIC_CMP_AND_SWP] = { + .name = "IB_WR_ATOMIC_CMP_AND_SWP", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_MASK, + }, + }, + [IB_WR_ATOMIC_FETCH_AND_ADD] = { + .name = "IB_WR_ATOMIC_FETCH_AND_ADD", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_MASK, + }, + }, + [IB_WR_LSO] = { + .name = "IB_WR_LSO", + .mask = { + /* not supported */ + }, + }, + [IB_WR_SEND_WITH_INV] = { + .name = "IB_WR_SEND_WITH_INV", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_RDMA_READ_WITH_INV] = { + .name = "IB_WR_RDMA_READ_WITH_INV", + .mask = { + [IB_QPT_RC] = WR_READ_MASK, + }, + }, + [IB_WR_LOCAL_INV] = { + .name = "IB_WR_LOCAL_INV", + .mask = { + [IB_QPT_RC] = WR_REG_MASK, + }, + }, + [IB_WR_REG_MR] = { + .name = "IB_WR_REG_MR", + .mask = { + [IB_QPT_RC] = WR_REG_MASK, + }, + }, +}; + +struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { + [IB_OPCODE_RC_SEND_FIRST] = { + .name = "IB_OPCODE_RC_SEND_FIRST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_MIDDLE] = { + .name = "IB_OPCODE_RC_SEND_MIDDLE]", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST] = { + .name = "IB_OPCODE_RC_SEND_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset 
= { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY] = { + .name = "IB_OPCODE_RC_SEND_ONLY", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_FIRST", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = { + .name = "IB_OPCODE_RC_RDMA_READ_REQUEST", + .mask = RXE_RETH_MASK | RXE_REQ_MASK | RXE_READ_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + 
[RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_ACK_MASK | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RC_ACKNOWLEDGE", + .mask = RXE_AETH_MASK | RXE_ACK_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE", + .mask = RXE_AETH_MASK | RXE_ATMACK_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_ATMACK] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMACK_BYTES + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_COMPARE_SWAP] = { + .name = "IB_OPCODE_RC_COMPARE_SWAP", + .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_ATMETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES, + } + }, + [IB_OPCODE_RC_FETCH_ADD] = { + .name = "IB_OPCODE_RC_FETCH_ADD", + .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_ATMETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = { + .name = "IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE", + .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IETH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = { + .name = "IB_OPCODE_RC_SEND_ONLY_INV", + .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_END_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_IETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IETH_BYTES, + } + }, + + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = { + .name = "IB_OPCODE_UC_SEND_FIRST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_MIDDLE] = { + .name = 
"IB_OPCODE_UC_SEND_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_LAST] = { + .name = "IB_OPCODE_UC_SEND_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_SEND_ONLY] = { + .name = "IB_OPCODE_UC_SEND_ONLY", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_FIRST", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = 
RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + + /* RD */ + [IB_OPCODE_RD_SEND_FIRST] = { + .name = "IB_OPCODE_RD_SEND_FIRST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_MIDDLE] = { + .name = "IB_OPCODE_RD_SEND_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_LAST] = { + .name = "IB_OPCODE_RD_SEND_LAST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_SEND_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_SEND_ONLY] = { + .name = "IB_OPCODE_RD_SEND_ONLY", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_FIRST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + 
[RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_LAST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES + + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_REQUEST] = { + .name = "IB_OPCODE_RD_RDMA_READ_REQUEST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_REQ_MASK | RXE_READ_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + 
RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK + | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK + | RXE_ACK_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK + | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RD_ACKNOWLEDGE", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ATMACK_MASK + | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMACK] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_COMPARE_SWAP] = { + .name = "RD_COMPARE_SWAP", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK + | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + + RXE_ATMETH_BYTES + + RXE_DETH_BYTES + + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_FETCH_ADD] = { + .name = "IB_OPCODE_RD_FETCH_ADD", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK + | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = 
{ + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + + RXE_ATMETH_BYTES + + RXE_DETH_BYTES + + + RXE_RDETH_BYTES, + } + }, + + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = { + .name = "IB_OPCODE_UD_SEND_ONLY", + .mask = RXE_DETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_DETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_DETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_DETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + +}; diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h new file mode 100644 index 0000000..307604e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
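The .offset[] arrays in rxe_opcode[] above are running sums of the preceding header lengths, so the payload offset of any opcode is simply the total size of the headers it carries. A self-contained illustration for IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE follows; the byte counts are the IBTA wire sizes that rxe_hdr.h encodes as RXE_*_BYTES, restated here as assumptions because that header is not part of this hunk.

#include <stdio.h>

enum { BTH = 12, RETH = 16, IMMDT = 4 };   /* header sizes in bytes on the wire */

int main(void)
{
	/* RC RDMA WRITE ONLY WITH IMMEDIATE carries BTH, RETH, IMMDT and
	 * then the payload, matching the .offset[] entries in the table.
	 */
	int off_bth     = 0;
	int off_reth    = off_bth + BTH;
	int off_immdt   = off_reth + RETH;
	int off_payload = off_immdt + IMMDT;

	printf("BTH at %d, RETH at %d, IMMDT at %d, payload at %d\n",
	       off_bth, off_reth, off_immdt, off_payload);
	/* prints: BTH at 0, RETH at 12, IMMDT at 28, payload at 32 */
	return 0;
}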
+ */ + +#ifndef RXE_OPCODE_H +#define RXE_OPCODE_H + +/* + * contains header bit mask definitions and header lengths + * declaration of the rxe_opcode_info struct and + * rxe_wr_opcode_info struct + */ + +enum rxe_wr_mask { + WR_INLINE_MASK = BIT(0), + WR_ATOMIC_MASK = BIT(1), + WR_SEND_MASK = BIT(2), + WR_READ_MASK = BIT(3), + WR_WRITE_MASK = BIT(4), + WR_LOCAL_MASK = BIT(5), + WR_REG_MASK = BIT(6), + + WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, + WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK, + WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK, + WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK, +}; + +#define WR_MAX_QPT (8) + +struct rxe_wr_opcode_info { + char *name; + enum rxe_wr_mask mask[WR_MAX_QPT]; +}; + +extern struct rxe_wr_opcode_info rxe_wr_opcode_info[]; + +enum rxe_hdr_type { + RXE_LRH, + RXE_GRH, + RXE_BTH, + RXE_RETH, + RXE_AETH, + RXE_ATMETH, + RXE_ATMACK, + RXE_IETH, + RXE_RDETH, + RXE_DETH, + RXE_IMMDT, + RXE_PAYLOAD, + NUM_HDR_TYPES +}; + +enum rxe_hdr_mask { + RXE_LRH_MASK = BIT(RXE_LRH), + RXE_GRH_MASK = BIT(RXE_GRH), + RXE_BTH_MASK = BIT(RXE_BTH), + RXE_IMMDT_MASK = BIT(RXE_IMMDT), + RXE_RETH_MASK = BIT(RXE_RETH), + RXE_AETH_MASK = BIT(RXE_AETH), + RXE_ATMETH_MASK = BIT(RXE_ATMETH), + RXE_ATMACK_MASK = BIT(RXE_ATMACK), + RXE_IETH_MASK = BIT(RXE_IETH), + RXE_RDETH_MASK = BIT(RXE_RDETH), + RXE_DETH_MASK = BIT(RXE_DETH), + RXE_PAYLOAD_MASK = BIT(RXE_PAYLOAD), + + RXE_REQ_MASK = BIT(NUM_HDR_TYPES + 0), + RXE_ACK_MASK = BIT(NUM_HDR_TYPES + 1), + RXE_SEND_MASK = BIT(NUM_HDR_TYPES + 2), + RXE_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), + RXE_READ_MASK = BIT(NUM_HDR_TYPES + 4), + RXE_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), + + RXE_RWR_MASK = BIT(NUM_HDR_TYPES + 6), + RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 7), + + RXE_START_MASK = BIT(NUM_HDR_TYPES + 8), + RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), + RXE_END_MASK = BIT(NUM_HDR_TYPES + 10), + + RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), + + RXE_READ_OR_ATOMIC = (RXE_READ_MASK | RXE_ATOMIC_MASK), + RXE_WRITE_OR_SEND = (RXE_WRITE_MASK | RXE_SEND_MASK), +}; + +#define OPCODE_NONE (-1) +#define RXE_NUM_OPCODE 256 + +struct rxe_opcode_info { + char *name; + enum rxe_hdr_mask mask; + int length; + int offset[NUM_HDR_TYPES]; +}; + +extern struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE]; + +#endif /* RXE_OPCODE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h new file mode 100644 index 0000000..1b596fb --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
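In the rxe_hdr_mask enum above, the low NUM_HDR_TYPES bits record which headers a packet carries while the bits above them describe the packet's role (request, ack, send, write, and so on), so one mask answers both questions. The trimmed-down standalone sketch below shows that split with shortened, hypothetical names; BIT() is restated only to keep the example self-contained.

#include <stdio.h>

#define BIT(n) (1u << (n))

enum { H_BTH, H_RETH, H_IMMDT, H_PAYLOAD, NUM_HDR };   /* reduced header list */

enum {
	M_BTH     = BIT(H_BTH),
	M_RETH    = BIT(H_RETH),
	M_IMMDT   = BIT(H_IMMDT),
	M_PAYLOAD = BIT(H_PAYLOAD),
	M_REQ     = BIT(NUM_HDR + 0),   /* semantic flags start past the header bits */
	M_WRITE   = BIT(NUM_HDR + 1),
};

int main(void)
{
	unsigned mask = M_BTH | M_RETH | M_IMMDT | M_PAYLOAD | M_REQ | M_WRITE;

	if (mask & M_IMMDT)
		printf("packet carries an immediate-data header\n");
	if (mask & M_WRITE)
		printf("packet is part of an RDMA write\n");
	return 0;
}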
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_PARAM_H +#define RXE_PARAM_H + +static inline enum ib_mtu rxe_mtu_int_to_enum(int mtu) +{ + if (mtu < 256) + return 0; + else if (mtu < 512) + return IB_MTU_256; + else if (mtu < 1024) + return IB_MTU_512; + else if (mtu < 2048) + return IB_MTU_1024; + else if (mtu < 4096) + return IB_MTU_2048; + else + return IB_MTU_4096; +} + +/* Find the IB mtu for a given network MTU. */ +static inline enum ib_mtu eth_mtu_int_to_enum(int mtu) +{ + mtu -= RXE_MAX_HDR_LENGTH; + + return rxe_mtu_int_to_enum(mtu); +} + +/* default/initial rxe device parameter settings */ +enum rxe_device_param { + RXE_FW_VER = 0, + RXE_MAX_MR_SIZE = -1ull, + RXE_PAGE_SIZE_CAP = 0xfffff000, + RXE_VENDOR_ID = 0, + RXE_VENDOR_PART_ID = 0, + RXE_HW_VER = 0, + RXE_MAX_QP = 0x10000, + RXE_MAX_QP_WR = 0x4000, + RXE_MAX_INLINE_DATA = 400, + RXE_DEVICE_CAP_FLAGS = IB_DEVICE_BAD_PKEY_CNTR + | IB_DEVICE_BAD_QKEY_CNTR + | IB_DEVICE_AUTO_PATH_MIG + | IB_DEVICE_CHANGE_PHY_PORT + | IB_DEVICE_UD_AV_PORT_ENFORCE + | IB_DEVICE_PORT_ACTIVE_EVENT + | IB_DEVICE_SYS_IMAGE_GUID + | IB_DEVICE_RC_RNR_NAK_GEN + | IB_DEVICE_SRQ_RESIZE + | IB_DEVICE_MEM_MGT_EXTENSIONS, + RXE_MAX_SGE = 32, + RXE_MAX_SGE_RD = 32, + RXE_MAX_CQ = 16384, + RXE_MAX_LOG_CQE = 15, + RXE_MAX_MR = 2 * 1024, + RXE_MAX_PD = 0x7ffc, + RXE_MAX_QP_RD_ATOM = 128, + RXE_MAX_EE_RD_ATOM = 0, + RXE_MAX_RES_RD_ATOM = 0x3f000, + RXE_MAX_QP_INIT_RD_ATOM = 128, + RXE_MAX_EE_INIT_RD_ATOM = 0, + RXE_ATOMIC_CAP = 1, + RXE_MAX_EE = 0, + RXE_MAX_RDD = 0, + RXE_MAX_MW = 0, + RXE_MAX_RAW_IPV6_QP = 0, + RXE_MAX_RAW_ETHY_QP = 0, + RXE_MAX_MCAST_GRP = 8192, + RXE_MAX_MCAST_QP_ATTACH = 56, + RXE_MAX_TOT_MCAST_QP_ATTACH = 0x70000, + RXE_MAX_AH = 100, + RXE_MAX_FMR = 0, + RXE_MAX_MAP_PER_FMR = 0, + RXE_MAX_SRQ = 960, + RXE_MAX_SRQ_WR = 0x4000, + RXE_MIN_SRQ_WR = 1, + RXE_MAX_SRQ_SGE = 27, + RXE_MIN_SRQ_SGE = 1, + RXE_MAX_FMR_PAGE_LIST_LEN = 512, + RXE_MAX_PKEYS = 64, + RXE_LOCAL_CA_ACK_DELAY = 15, + + RXE_MAX_UCONTEXT = 512, + + RXE_NUM_PORT = 1, + + RXE_MIN_QP_INDEX = 16, + RXE_MAX_QP_INDEX = 0x00020000, + + RXE_MIN_SRQ_INDEX = 0x00020001, + RXE_MAX_SRQ_INDEX = 0x00040000, + + RXE_MIN_MR_INDEX = 0x00000001, + RXE_MAX_MR_INDEX = 0x00040000, + RXE_MIN_MW_INDEX = 0x00040001, + RXE_MAX_MW_INDEX = 0x00060000, + RXE_MAX_PKT_PER_ACK = 64, + + RXE_MAX_UNACKED_PSNS = 128, + + /* Max inflight SKBs per queue pair */ + RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64, + RXE_INFLIGHT_SKBS_PER_QP_LOW = 16, + + /* Delay before calling arbiter timer */ + RXE_NSEC_ARB_TIMER_DELAY = 200, +}; + +/* default/initial rxe port parameters */ +enum rxe_port_param { + RXE_PORT_STATE = IB_PORT_DOWN, + RXE_PORT_MAX_MTU = IB_MTU_4096, + RXE_PORT_ACTIVE_MTU = IB_MTU_256, + RXE_PORT_GID_TBL_LEN = 1024, + RXE_PORT_PORT_CAP_FLAGS = RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP, + RXE_PORT_MAX_MSG_SZ = 0x800000, + RXE_PORT_BAD_PKEY_CNTR = 0, + RXE_PORT_QKEY_VIOL_CNTR = 0, + RXE_PORT_LID = 0, + RXE_PORT_SM_LID = 0, + RXE_PORT_SM_SL = 0, + RXE_PORT_LMC = 0, + RXE_PORT_MAX_VL_NUM = 1, + RXE_PORT_SUBNET_TIMEOUT = 0, + RXE_PORT_INIT_TYPE_REPLY = 0, + 
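rxe_mtu_int_to_enum() and eth_mtu_int_to_enum() above pick the largest IB MTU whose payload still fits the netdev MTU after RXE_MAX_HDR_LENGTH of encapsulation is subtracted; for example a stock 1500-byte Ethernet MTU ends up as IB_MTU_1024. The standalone sketch below reproduces that mapping; the enum names and the header budget are illustrative stand-ins, not values taken from this patch.

#include <stdio.h>

enum ib_mtu_sketch { MTU_INVALID, MTU_256, MTU_512, MTU_1024, MTU_2048, MTU_4096 };

static const char *mtu_name[] = { "invalid", "256", "512", "1024", "2048", "4096" };

static enum ib_mtu_sketch mtu_int_to_enum(int mtu)
{
	if (mtu < 256)
		return MTU_INVALID;
	else if (mtu < 512)
		return MTU_256;
	else if (mtu < 1024)
		return MTU_512;
	else if (mtu < 2048)
		return MTU_1024;
	else if (mtu < 4096)
		return MTU_2048;
	else
		return MTU_4096;
}

int main(void)
{
	const int max_hdr = 96;                 /* assumed stand-in for RXE_MAX_HDR_LENGTH */
	int eth_mtu[] = { 1500, 4200, 9000 };

	for (int i = 0; i < 3; i++)
		printf("netdev mtu %d -> IB MTU %s\n", eth_mtu[i],
		       mtu_name[mtu_int_to_enum(eth_mtu[i] - max_hdr)]);
	return 0;
}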
RXE_PORT_ACTIVE_WIDTH = IB_WIDTH_1X, + RXE_PORT_ACTIVE_SPEED = 1, + RXE_PORT_PKEY_TBL_LEN = 64, + RXE_PORT_PHYS_STATE = 2, + RXE_PORT_SUBNET_PREFIX = 0xfe80000000000000ULL, +}; + +/* default/initial port info parameters */ +enum rxe_port_info_param { + RXE_PORT_INFO_VL_CAP = 4, /* 1-8 */ + RXE_PORT_INFO_MTU_CAP = 5, /* 4096 */ + RXE_PORT_INFO_OPER_VL = 1, /* 1 */ +}; + +#endif /* RXE_PARAM_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c new file mode 100644 index 0000000..b4a8acc --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -0,0 +1,499 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* info about object pools + * note that mr and mw share a single index space + * so that one can map an lkey to the correct type of object + */ +struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { + [RXE_TYPE_UC] = { + .name = "rxe-uc", + .size = sizeof(struct rxe_ucontext), + }, + [RXE_TYPE_PD] = { + .name = "rxe-pd", + .size = sizeof(struct rxe_pd), + }, + [RXE_TYPE_AH] = { + .name = "rxe-ah", + .size = sizeof(struct rxe_ah), + .flags = RXE_POOL_ATOMIC, + }, + [RXE_TYPE_SRQ] = { + .name = "rxe-srq", + .size = sizeof(struct rxe_srq), + .flags = RXE_POOL_INDEX, + .min_index = RXE_MIN_SRQ_INDEX, + .max_index = RXE_MAX_SRQ_INDEX, + }, + [RXE_TYPE_QP] = { + .name = "rxe-qp", + .size = sizeof(struct rxe_qp), + .cleanup = rxe_qp_cleanup, + .flags = RXE_POOL_INDEX, + .min_index = RXE_MIN_QP_INDEX, + .max_index = RXE_MAX_QP_INDEX, + }, + [RXE_TYPE_CQ] = { + .name = "rxe-cq", + .size = sizeof(struct rxe_cq), + .cleanup = rxe_cq_cleanup, + }, + [RXE_TYPE_MR] = { + .name = "rxe-mr", + .size = sizeof(struct rxe_mem), + .cleanup = rxe_mem_cleanup, + .flags = RXE_POOL_INDEX, + .max_index = RXE_MAX_MR_INDEX, + .min_index = RXE_MIN_MR_INDEX, + }, + [RXE_TYPE_MW] = { + .name = "rxe-mw", + .size = sizeof(struct rxe_mem), + .flags = RXE_POOL_INDEX, + .max_index = RXE_MAX_MW_INDEX, + .min_index = RXE_MIN_MW_INDEX, + }, + [RXE_TYPE_MC_GRP] = { + .name = "rxe-mc_grp", + .size = sizeof(struct rxe_mc_grp), + .cleanup = rxe_mc_cleanup, + .flags = RXE_POOL_KEY, + .key_offset = offsetof(struct rxe_mc_grp, mgid), + .key_size = sizeof(union ib_gid), + }, + [RXE_TYPE_MC_ELEM] = { + .name = "rxe-mc_elem", + .size = sizeof(struct rxe_mc_elem), + .flags = RXE_POOL_ATOMIC, + }, +}; + +static inline const char *pool_name(struct rxe_pool *pool) +{ + return rxe_type_info[pool->type].name; +} + +static inline struct kmem_cache *pool_cache(struct rxe_pool *pool) +{ + return rxe_type_info[pool->type].cache; +} + +int rxe_cache_init(void) +{ + int err; + int i; + size_t size; + struct rxe_type_info *type; + + for (i = 0; i < RXE_NUM_TYPES; i++) { + type = &rxe_type_info[i]; + size = ALIGN(type->size, RXE_POOL_ALIGN); + type->cache = kmem_cache_create(type->name, size, + RXE_POOL_ALIGN, + RXE_POOL_CACHE_FLAGS, NULL); + if (!type->cache) { + pr_err("Unable to init kmem cache for %s\n", + type->name); + err = -ENOMEM; + goto err1; + } + } + + return 0; + +err1: + while (--i >= 0) { + kmem_cache_destroy(type->cache); + type->cache = NULL; + } + + return err; +} + +void rxe_cache_exit(void) +{ + int i; + struct rxe_type_info *type; + + for (i = 0; i < RXE_NUM_TYPES; i++) { + type = &rxe_type_info[i]; + kmem_cache_destroy(type->cache); + type->cache = NULL; + } +} + +static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min) +{ + int err = 0; + size_t size; + + if ((max - min + 1) < pool->max_elem) { + pr_warn("not enough indices for max_elem\n"); + err = -EINVAL; + goto out; + } + + pool->max_index = max; + pool->min_index = min; + + size = BITS_TO_LONGS(max - min + 1) * sizeof(long); + pool->table = kmalloc(size, GFP_KERNEL); + if (!pool->table) { + err = -ENOMEM; + goto out; + } + + pool->table_size = size; + bitmap_zero(pool->table, max - min + 1); + +out: + return err; +} + +int rxe_pool_init( + struct rxe_dev *rxe, + struct rxe_pool *pool, + enum rxe_elem_type type, + unsigned int max_elem) +{ + int err = 0; + size_t size = rxe_type_info[type].size; + + memset(pool, 0, sizeof(*pool)); + + pool->rxe = rxe; + pool->type = type; + pool->max_elem = max_elem; + 
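rxe_pool_init_index() above sizes a bitmap for the [min_index, max_index] range, and alloc_index() just below walks it round-robin from the last allocation (with a second scan to wrap) before setting the chosen bit. The plain-C sketch below mirrors that allocator using a byte array instead of the kernel bitmap helpers; names and range are illustrative.

#include <stdio.h>
#include <stdbool.h>

#define MIN_INDEX 16
#define MAX_INDEX 31
#define RANGE     (MAX_INDEX - MIN_INDEX + 1)

static bool used[RANGE];
static unsigned last;                 /* like pool->last: resume point for the scan */

static int alloc_index_sketch(void)
{
	for (unsigned n = 0; n < RANGE; n++) {
		unsigned i = (last + n) % RANGE;   /* wrap, like the second find_*_zero_bit() */

		if (!used[i]) {
			used[i] = true;
			last = i;
			return (int)i + MIN_INDEX;
		}
	}
	return -1;                    /* range exhausted */
}

int main(void)
{
	for (int k = 0; k < 3; k++)
		printf("allocated %d\n", alloc_index_sketch());   /* 16, 17, 18 */
	used[0] = false;                                          /* drop index 16 */
	printf("next allocation: %d\n", alloc_index_sketch());   /* 19: scan resumes at 'last',
								     it does not reuse 16 yet */
	return 0;
}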
pool->elem_size = ALIGN(size, RXE_POOL_ALIGN); + pool->flags = rxe_type_info[type].flags; + pool->tree = RB_ROOT; + pool->cleanup = rxe_type_info[type].cleanup; + + atomic_set(&pool->num_elem, 0); + + kref_init(&pool->ref_cnt); + + spin_lock_init(&pool->pool_lock); + + if (rxe_type_info[type].flags & RXE_POOL_INDEX) { + err = rxe_pool_init_index(pool, + rxe_type_info[type].max_index, + rxe_type_info[type].min_index); + if (err) + goto out; + } + + if (rxe_type_info[type].flags & RXE_POOL_KEY) { + pool->key_offset = rxe_type_info[type].key_offset; + pool->key_size = rxe_type_info[type].key_size; + } + + pool->state = rxe_pool_valid; + +out: + return err; +} + +static void rxe_pool_release(struct kref *kref) +{ + struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt); + + pool->state = rxe_pool_invalid; + kfree(pool->table); +} + +static void rxe_pool_put(struct rxe_pool *pool) +{ + kref_put(&pool->ref_cnt, rxe_pool_release); +} + +int rxe_pool_cleanup(struct rxe_pool *pool) +{ + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + pool->state = rxe_pool_invalid; + if (atomic_read(&pool->num_elem) > 0) + pr_warn("%s pool destroyed with unfree'd elem\n", + pool_name(pool)); + spin_unlock_irqrestore(&pool->pool_lock, flags); + + rxe_pool_put(pool); + + return 0; +} + +static u32 alloc_index(struct rxe_pool *pool) +{ + u32 index; + u32 range = pool->max_index - pool->min_index + 1; + + index = find_next_zero_bit(pool->table, range, pool->last); + if (index >= range) + index = find_first_zero_bit(pool->table, range); + + WARN_ON_ONCE(index >= range); + set_bit(index, pool->table); + pool->last = index; + return index + pool->min_index; +} + +static void insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new) +{ + struct rb_node **link = &pool->tree.rb_node; + struct rb_node *parent = NULL; + struct rxe_pool_entry *elem; + + while (*link) { + parent = *link; + elem = rb_entry(parent, struct rxe_pool_entry, node); + + if (elem->index == new->index) { + pr_warn("element already exists!\n"); + goto out; + } + + if (elem->index > new->index) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &pool->tree); +out: + return; +} + +static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new) +{ + struct rb_node **link = &pool->tree.rb_node; + struct rb_node *parent = NULL; + struct rxe_pool_entry *elem; + int cmp; + + while (*link) { + parent = *link; + elem = rb_entry(parent, struct rxe_pool_entry, node); + + cmp = memcmp((u8 *)elem + pool->key_offset, + (u8 *)new + pool->key_offset, pool->key_size); + + if (cmp == 0) { + pr_warn("key already exists!\n"); + goto out; + } + + if (cmp > 0) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &pool->tree); +out: + return; +} + +void rxe_add_key(void *arg, void *key) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + memcpy((u8 *)elem + pool->key_offset, key, pool->key_size); + insert_key(pool, elem); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_drop_key(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + rb_erase(&elem->node, &pool->tree); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void 
rxe_add_index(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + elem->index = alloc_index(pool); + insert_index(pool, elem); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_drop_index(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + clear_bit(elem->index - pool->min_index, pool->table); + rb_erase(&elem->node, &pool->tree); + spin_unlock_irqrestore(&pool->pool_lock, flags); +} + +void *rxe_alloc(struct rxe_pool *pool) +{ + struct rxe_pool_entry *elem; + unsigned long flags; + + might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); + + spin_lock_irqsave(&pool->pool_lock, flags); + if (pool->state != rxe_pool_valid) { + spin_unlock_irqrestore(&pool->pool_lock, flags); + return NULL; + } + kref_get(&pool->ref_cnt); + spin_unlock_irqrestore(&pool->pool_lock, flags); + + kref_get(&pool->rxe->ref_cnt); + + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) + goto out_put_pool; + + elem = kmem_cache_zalloc(pool_cache(pool), + (pool->flags & RXE_POOL_ATOMIC) ? + GFP_ATOMIC : GFP_KERNEL); + if (!elem) + goto out_put_pool; + + elem->pool = pool; + kref_init(&elem->ref_cnt); + + return elem; + +out_put_pool: + atomic_dec(&pool->num_elem); + rxe_dev_put(pool->rxe); + rxe_pool_put(pool); + return NULL; +} + +void rxe_elem_release(struct kref *kref) +{ + struct rxe_pool_entry *elem = + container_of(kref, struct rxe_pool_entry, ref_cnt); + struct rxe_pool *pool = elem->pool; + + if (pool->cleanup) + pool->cleanup(elem); + + kmem_cache_free(pool_cache(pool), elem); + atomic_dec(&pool->num_elem); + rxe_dev_put(pool->rxe); + rxe_pool_put(pool); +} + +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) +{ + struct rb_node *node = NULL; + struct rxe_pool_entry *elem = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + + if (pool->state != rxe_pool_valid) + goto out; + + node = pool->tree.rb_node; + + while (node) { + elem = rb_entry(node, struct rxe_pool_entry, node); + + if (elem->index > index) + node = node->rb_left; + else if (elem->index < index) + node = node->rb_right; + else + break; + } + + if (node) + kref_get(&elem->ref_cnt); + +out: + spin_unlock_irqrestore(&pool->pool_lock, flags); + return node ? elem : NULL; +} + +void *rxe_pool_get_key(struct rxe_pool *pool, void *key) +{ + struct rb_node *node = NULL; + struct rxe_pool_entry *elem = NULL; + int cmp; + unsigned long flags; + + spin_lock_irqsave(&pool->pool_lock, flags); + + if (pool->state != rxe_pool_valid) + goto out; + + node = pool->tree.rb_node; + + while (node) { + elem = rb_entry(node, struct rxe_pool_entry, node); + + cmp = memcmp((u8 *)elem + pool->key_offset, + key, pool->key_size); + + if (cmp > 0) + node = node->rb_left; + else if (cmp < 0) + node = node->rb_right; + else + break; + } + + if (node) + kref_get(&elem->ref_cnt); + +out: + spin_unlock_irqrestore(&pool->pool_lock, flags); + return node ? elem : NULL; +} diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h new file mode 100644 index 0000000..47df28e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_POOL_H +#define RXE_POOL_H + +#define RXE_POOL_ALIGN (16) +#define RXE_POOL_CACHE_FLAGS (0) + +enum rxe_pool_flags { + RXE_POOL_ATOMIC = BIT(0), + RXE_POOL_INDEX = BIT(1), + RXE_POOL_KEY = BIT(2), +}; + +enum rxe_elem_type { + RXE_TYPE_UC, + RXE_TYPE_PD, + RXE_TYPE_AH, + RXE_TYPE_SRQ, + RXE_TYPE_QP, + RXE_TYPE_CQ, + RXE_TYPE_MR, + RXE_TYPE_MW, + RXE_TYPE_MC_GRP, + RXE_TYPE_MC_ELEM, + RXE_NUM_TYPES, /* keep me last */ +}; + +struct rxe_pool_entry; + +struct rxe_type_info { + const char *name; + size_t size; + void (*cleanup)(struct rxe_pool_entry *obj); + enum rxe_pool_flags flags; + u32 max_index; + u32 min_index; + size_t key_offset; + size_t key_size; + struct kmem_cache *cache; +}; + +extern struct rxe_type_info rxe_type_info[]; + +enum rxe_pool_state { + rxe_pool_invalid, + rxe_pool_valid, +}; + +struct rxe_pool_entry { + struct rxe_pool *pool; + struct kref ref_cnt; + struct list_head list; + + /* only used if indexed or keyed */ + struct rb_node node; + u32 index; +}; + +struct rxe_pool { + struct rxe_dev *rxe; + spinlock_t pool_lock; /* pool spinlock */ + size_t elem_size; + struct kref ref_cnt; + void (*cleanup)(struct rxe_pool_entry *obj); + enum rxe_pool_state state; + enum rxe_pool_flags flags; + enum rxe_elem_type type; + + unsigned int max_elem; + atomic_t num_elem; + + /* only used if indexed or keyed */ + struct rb_root tree; + unsigned long *table; + size_t table_size; + u32 max_index; + u32 min_index; + u32 last; + size_t key_offset; + size_t key_size; +}; + +/* initialize slab caches for managed objects */ +int rxe_cache_init(void); + +/* cleanup slab caches for managed objects */ +void rxe_cache_exit(void); + +/* initialize a pool of objects with given limit on + * number of elements. 
gets parameters from rxe_type_info + * pool elements will be allocated out of a slab cache + */ +int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, + enum rxe_elem_type type, u32 max_elem); + +/* free resources from object pool */ +int rxe_pool_cleanup(struct rxe_pool *pool); + +/* allocate an object from pool */ +void *rxe_alloc(struct rxe_pool *pool); + +/* assign an index to an indexed object and insert object into + * pool's rb tree + */ +void rxe_add_index(void *elem); + +/* drop an index and remove object from rb tree */ +void rxe_drop_index(void *elem); + +/* assign a key to a keyed object and insert object into + * pool's rb tree + */ +void rxe_add_key(void *elem, void *key); + +/* remove elem from rb tree */ +void rxe_drop_key(void *elem); + +/* lookup an indexed object from index. takes a reference on object */ +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); + +/* lookup keyed object from key. takes a reference on the object */ +void *rxe_pool_get_key(struct rxe_pool *pool, void *key); + +/* cleanup an object when all references are dropped */ +void rxe_elem_release(struct kref *kref); + +/* take a reference on an object */ +#define rxe_add_ref(elem) kref_get(&(elem)->pelem.ref_cnt) + +/* drop a reference on an object */ +#define rxe_drop_ref(elem) kref_put(&(elem)->pelem.ref_cnt, rxe_elem_release) + +#endif /* RXE_POOL_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c new file mode 100644 index 0000000..b9f7aa1 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -0,0 +1,859 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
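The indexed-pool interface declared above is easiest to follow with a short usage sketch. The snippet below is illustrative only: it is not part of this patch and does not build outside the driver tree, the names my_obj and my_pool are hypothetical, and it assumes the pool entry is the object's first member and is named pelem, which is what rxe_alloc() and the rxe_add_ref()/rxe_drop_ref() macros above rely on.

struct my_obj {
        struct rxe_pool_entry pelem;    // must be first: rxe_alloc() hands back this address
        // object-specific fields would follow
};

static struct my_obj *my_obj_create(struct rxe_pool *my_pool)
{
        struct my_obj *obj;

        obj = rxe_alloc(my_pool);       // takes a reference on the pool and the rxe device
        if (!obj)
                return NULL;

        rxe_add_index(obj);             // assign pelem.index and insert into the pool's rb tree
        return obj;
}

static void my_obj_destroy(struct my_obj *obj)
{
        rxe_drop_index(obj);            // clear the index bit and erase from the rb tree
        rxe_drop_ref(obj);              // kref_put(); rxe_elem_release() frees the object
}

static void my_obj_lookup(struct rxe_pool *my_pool, u32 index)
{
        struct my_obj *obj = rxe_pool_get_index(my_pool, index);       // takes a reference

        if (obj) {
                // ... use the object ...
                rxe_drop_ref(obj);      // drop the lookup reference
        }
}

Keyed pools follow the same pattern with rxe_add_key()/rxe_drop_key() and rxe_pool_get_key() in place of the index helpers.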
+ */ + +#include +#include +#include + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_task.h" + +static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, + int has_srq) +{ + if (cap->max_send_wr > rxe->attr.max_qp_wr) { + pr_warn("invalid send wr = %d > %d\n", + cap->max_send_wr, rxe->attr.max_qp_wr); + goto err1; + } + + if (cap->max_send_sge > rxe->attr.max_sge) { + pr_warn("invalid send sge = %d > %d\n", + cap->max_send_sge, rxe->attr.max_sge); + goto err1; + } + + if (!has_srq) { + if (cap->max_recv_wr > rxe->attr.max_qp_wr) { + pr_warn("invalid recv wr = %d > %d\n", + cap->max_recv_wr, rxe->attr.max_qp_wr); + goto err1; + } + + if (cap->max_recv_sge > rxe->attr.max_sge) { + pr_warn("invalid recv sge = %d > %d\n", + cap->max_recv_sge, rxe->attr.max_sge); + goto err1; + } + } + + if (cap->max_inline_data > rxe->max_inline_data) { + pr_warn("invalid max inline data = %d > %d\n", + cap->max_inline_data, rxe->max_inline_data); + goto err1; + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) +{ + struct ib_qp_cap *cap = &init->cap; + struct rxe_port *port; + int port_num = init->port_num; + + if (!init->recv_cq || !init->send_cq) { + pr_warn("missing cq\n"); + goto err1; + } + + if (rxe_qp_chk_cap(rxe, cap, !!init->srq)) + goto err1; + + if (init->qp_type == IB_QPT_SMI || init->qp_type == IB_QPT_GSI) { + if (port_num != 1) { + pr_warn("invalid port = %d\n", port_num); + goto err1; + } + + port = &rxe->port; + + if (init->qp_type == IB_QPT_SMI && port->qp_smi_index) { + pr_warn("SMI QP exists for port %d\n", port_num); + goto err1; + } + + if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) { + pr_warn("GSI QP exists for port %d\n", port_num); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n) +{ + qp->resp.res_head = 0; + qp->resp.res_tail = 0; + qp->resp.resources = kcalloc(n, sizeof(struct resp_res), GFP_KERNEL); + + if (!qp->resp.resources) + return -ENOMEM; + + return 0; +} + +static void free_rd_atomic_resources(struct rxe_qp *qp) +{ + if (qp->resp.resources) { + int i; + + for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { + struct resp_res *res = &qp->resp.resources[i]; + + free_rd_atomic_resource(qp, res); + } + kfree(qp->resp.resources); + qp->resp.resources = NULL; + } +} + +void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res) +{ + if (res->type == RXE_ATOMIC_MASK) { + rxe_drop_ref(qp); + kfree_skb(res->atomic.skb); + } else if (res->type == RXE_READ_MASK) { + if (res->read.mr) + rxe_drop_ref(res->read.mr); + } + res->type = 0; +} + +static void cleanup_rd_atomic_resources(struct rxe_qp *qp) +{ + int i; + struct resp_res *res; + + if (qp->resp.resources) { + for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { + res = &qp->resp.resources[i]; + free_rd_atomic_resource(qp, res); + } + } +} + +static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init) +{ + struct rxe_port *port; + u32 qpn; + + qp->sq_sig_type = init->sq_sig_type; + qp->attr.path_mtu = 1; + qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu); + + qpn = qp->pelem.index; + port = &rxe->port; + + switch (init->qp_type) { + case IB_QPT_SMI: + qp->ibqp.qp_num = 0; + port->qp_smi_index = qpn; + qp->attr.port_num = init->port_num; + break; + + case IB_QPT_GSI: + qp->ibqp.qp_num = 1; + port->qp_gsi_index = qpn; + qp->attr.port_num = init->port_num; + break; + + 
default: + qp->ibqp.qp_num = qpn; + break; + } + + INIT_LIST_HEAD(&qp->grp_list); + + skb_queue_head_init(&qp->send_pkts); + + spin_lock_init(&qp->grp_lock); + spin_lock_init(&qp->state_lock); + + atomic_set(&qp->ssn, 0); + atomic_set(&qp->skb_out, 0); +} + +static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init, + struct ib_ucontext *context, + struct rxe_create_qp_resp __user *uresp) +{ + int err; + int wqe_size; + + err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); + if (err < 0) + return err; + qp->sk->sk->sk_user_data = qp; + + qp->sq.max_wr = init->cap.max_send_wr; + qp->sq.max_sge = init->cap.max_send_sge; + qp->sq.max_inline = init->cap.max_inline_data; + + wqe_size = max_t(int, sizeof(struct rxe_send_wqe) + + qp->sq.max_sge * sizeof(struct ib_sge), + sizeof(struct rxe_send_wqe) + + qp->sq.max_inline); + + qp->sq.queue = rxe_queue_init(rxe, + &qp->sq.max_wr, + wqe_size); + if (!qp->sq.queue) + return -ENOMEM; + + err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, context, + qp->sq.queue->buf, qp->sq.queue->buf_size, + &qp->sq.queue->ip); + + if (err) { + kvfree(qp->sq.queue->buf); + kfree(qp->sq.queue); + return err; + } + + qp->req.wqe_index = producer_index(qp->sq.queue); + qp->req.state = QP_STATE_RESET; + qp->req.opcode = -1; + qp->comp.opcode = -1; + + spin_lock_init(&qp->sq.sq_lock); + skb_queue_head_init(&qp->req_pkts); + + rxe_init_task(rxe, &qp->req.task, qp, + rxe_requester, "req"); + rxe_init_task(rxe, &qp->comp.task, qp, + rxe_completer, "comp"); + + qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ + if (init->qp_type == IB_QPT_RC) { + timer_setup(&qp->rnr_nak_timer, rnr_nak_timer, 0); + timer_setup(&qp->retrans_timer, retransmit_timer, 0); + } + return 0; +} + +static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init, + struct ib_ucontext *context, + struct rxe_create_qp_resp __user *uresp) +{ + int err; + int wqe_size; + + if (!qp->srq) { + qp->rq.max_wr = init->cap.max_recv_wr; + qp->rq.max_sge = init->cap.max_recv_sge; + + wqe_size = rcv_wqe_size(qp->rq.max_sge); + + pr_debug("qp#%d max_wr = %d, max_sge = %d, wqe_size = %d\n", + qp_num(qp), qp->rq.max_wr, qp->rq.max_sge, wqe_size); + + qp->rq.queue = rxe_queue_init(rxe, + &qp->rq.max_wr, + wqe_size); + if (!qp->rq.queue) + return -ENOMEM; + + err = do_mmap_info(rxe, uresp ? &uresp->rq_mi : NULL, context, + qp->rq.queue->buf, qp->rq.queue->buf_size, + &qp->rq.queue->ip); + if (err) { + kvfree(qp->rq.queue->buf); + kfree(qp->rq.queue); + return err; + } + } + + spin_lock_init(&qp->rq.producer_lock); + spin_lock_init(&qp->rq.consumer_lock); + + skb_queue_head_init(&qp->resp_pkts); + + rxe_init_task(rxe, &qp->resp.task, qp, + rxe_responder, "resp"); + + qp->resp.opcode = OPCODE_NONE; + qp->resp.msn = 0; + qp->resp.state = QP_STATE_RESET; + + return 0; +} + +/* called by the create qp verb */ +int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, + struct ib_qp_init_attr *init, + struct rxe_create_qp_resp __user *uresp, + struct ib_pd *ibpd) +{ + int err; + struct rxe_cq *rcq = to_rcq(init->recv_cq); + struct rxe_cq *scq = to_rcq(init->send_cq); + struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; + struct ib_ucontext *context = ibpd->uobject ? 
ibpd->uobject->context : NULL; + + rxe_add_ref(pd); + rxe_add_ref(rcq); + rxe_add_ref(scq); + if (srq) + rxe_add_ref(srq); + + qp->pd = pd; + qp->rcq = rcq; + qp->scq = scq; + qp->srq = srq; + + rxe_qp_init_misc(rxe, qp, init); + + err = rxe_qp_init_req(rxe, qp, init, context, uresp); + if (err) + goto err1; + + err = rxe_qp_init_resp(rxe, qp, init, context, uresp); + if (err) + goto err2; + + qp->attr.qp_state = IB_QPS_RESET; + qp->valid = 1; + + return 0; + +err2: + rxe_queue_cleanup(qp->sq.queue); +err1: + if (srq) + rxe_drop_ref(srq); + rxe_drop_ref(scq); + rxe_drop_ref(rcq); + rxe_drop_ref(pd); + + return err; +} + +/* called by the query qp verb */ +int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init) +{ + init->event_handler = qp->ibqp.event_handler; + init->qp_context = qp->ibqp.qp_context; + init->send_cq = qp->ibqp.send_cq; + init->recv_cq = qp->ibqp.recv_cq; + init->srq = qp->ibqp.srq; + + init->cap.max_send_wr = qp->sq.max_wr; + init->cap.max_send_sge = qp->sq.max_sge; + init->cap.max_inline_data = qp->sq.max_inline; + + if (!qp->srq) { + init->cap.max_recv_wr = qp->rq.max_wr; + init->cap.max_recv_sge = qp->rq.max_sge; + } + + init->sq_sig_type = qp->sq_sig_type; + + init->qp_type = qp->ibqp.qp_type; + init->port_num = 1; + + return 0; +} + +/* called by the modify qp verb, this routine checks all the parameters before + * making any changes + */ +int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_attr *attr, int mask) +{ + enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : qp->attr.qp_state; + enum ib_qp_state new_state = (mask & IB_QP_STATE) ? + attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask, + IB_LINK_LAYER_ETHERNET)) { + pr_warn("invalid mask or state for qp\n"); + goto err1; + } + + if (mask & IB_QP_STATE) { + if (cur_state == IB_QPS_SQD) { + if (qp->req.state == QP_STATE_DRAIN && + new_state != IB_QPS_ERR) + goto err1; + } + } + + if (mask & IB_QP_PORT) { + if (attr->port_num != 1) { + pr_warn("invalid port %d\n", attr->port_num); + goto err1; + } + } + + if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) + goto err1; + + if (mask & IB_QP_AV && rxe_av_chk_attr(rxe, &attr->ah_attr)) + goto err1; + + if (mask & IB_QP_ALT_PATH) { + if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr)) + goto err1; + if (attr->alt_port_num != 1) { + pr_warn("invalid alt port %d\n", attr->alt_port_num); + goto err1; + } + if (attr->alt_timeout > 31) { + pr_warn("invalid QP alt timeout %d > 31\n", + attr->alt_timeout); + goto err1; + } + } + + if (mask & IB_QP_PATH_MTU) { + struct rxe_port *port = &rxe->port; + + enum ib_mtu max_mtu = port->attr.max_mtu; + enum ib_mtu mtu = attr->path_mtu; + + if (mtu > max_mtu) { + pr_debug("invalid mtu (%d) > (%d)\n", + ib_mtu_enum_to_int(mtu), + ib_mtu_enum_to_int(max_mtu)); + goto err1; + } + } + + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) { + pr_warn("invalid max_rd_atomic %d > %d\n", + attr->max_rd_atomic, + rxe->attr.max_qp_rd_atom); + goto err1; + } + } + + if (mask & IB_QP_TIMEOUT) { + if (attr->timeout > 31) { + pr_warn("invalid QP timeout %d > 31\n", + attr->timeout); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +/* move the qp to the reset state */ +static void rxe_qp_reset(struct rxe_qp *qp) +{ + /* stop tasks from running */ + rxe_disable_task(&qp->resp.task); + + /* stop request/comp */ + if (qp->sq.queue) { + if (qp_type(qp) == IB_QPT_RC) + 
rxe_disable_task(&qp->comp.task); + rxe_disable_task(&qp->req.task); + } + + /* move qp to the reset state */ + qp->req.state = QP_STATE_RESET; + qp->resp.state = QP_STATE_RESET; + + /* let state machines reset themselves drain work and packet queues + * etc. + */ + __rxe_do_task(&qp->resp.task); + + if (qp->sq.queue) { + __rxe_do_task(&qp->comp.task); + __rxe_do_task(&qp->req.task); + rxe_queue_reset(qp->sq.queue); + } + + /* cleanup attributes */ + atomic_set(&qp->ssn, 0); + qp->req.opcode = -1; + qp->req.need_retry = 0; + qp->req.noack_pkts = 0; + qp->resp.msn = 0; + qp->resp.opcode = -1; + qp->resp.drop_msg = 0; + qp->resp.goto_error = 0; + qp->resp.sent_psn_nak = 0; + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + cleanup_rd_atomic_resources(qp); + + /* reenable tasks */ + rxe_enable_task(&qp->resp.task); + + if (qp->sq.queue) { + if (qp_type(qp) == IB_QPT_RC) + rxe_enable_task(&qp->comp.task); + + rxe_enable_task(&qp->req.task); + } +} + +/* drain the send queue */ +static void rxe_qp_drain(struct rxe_qp *qp) +{ + if (qp->sq.queue) { + if (qp->req.state != QP_STATE_DRAINED) { + qp->req.state = QP_STATE_DRAIN; + if (qp_type(qp) == IB_QPT_RC) + rxe_run_task(&qp->comp.task, 1); + else + __rxe_do_task(&qp->comp.task); + rxe_run_task(&qp->req.task, 1); + } + } +} + +/* move the qp to the error state */ +void rxe_qp_error(struct rxe_qp *qp) +{ + qp->req.state = QP_STATE_ERROR; + qp->resp.state = QP_STATE_ERROR; + qp->attr.qp_state = IB_QPS_ERR; + + /* drain work and packet queues */ + rxe_run_task(&qp->resp.task, 1); + + if (qp_type(qp) == IB_QPT_RC) + rxe_run_task(&qp->comp.task, 1); + else + __rxe_do_task(&qp->comp.task); + rxe_run_task(&qp->req.task, 1); +} + +/* called by the modify qp verb */ +int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { + int max_rd_atomic = __roundup_pow_of_two(attr->max_rd_atomic); + + qp->attr.max_rd_atomic = max_rd_atomic; + atomic_set(&qp->req.rd_atomic, max_rd_atomic); + } + + if (mask & IB_QP_MAX_DEST_RD_ATOMIC) { + int max_dest_rd_atomic = + __roundup_pow_of_two(attr->max_dest_rd_atomic); + + qp->attr.max_dest_rd_atomic = max_dest_rd_atomic; + + free_rd_atomic_resources(qp); + + err = alloc_rd_atomic_resources(qp, max_dest_rd_atomic); + if (err) + return err; + } + + if (mask & IB_QP_CUR_STATE) + qp->attr.cur_qp_state = attr->qp_state; + + if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY) + qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify; + + if (mask & IB_QP_ACCESS_FLAGS) + qp->attr.qp_access_flags = attr->qp_access_flags; + + if (mask & IB_QP_PKEY_INDEX) + qp->attr.pkey_index = attr->pkey_index; + + if (mask & IB_QP_PORT) + qp->attr.port_num = attr->port_num; + + if (mask & IB_QP_QKEY) + qp->attr.qkey = attr->qkey; + + if (mask & IB_QP_AV) { + ib_get_cached_gid(&rxe->ib_dev, 1, + rdma_ah_read_grh(&attr->ah_attr)->sgid_index, + &sgid, &sgid_attr); + rxe_av_from_attr(attr->port_num, &qp->pri_av, &attr->ah_attr); + rxe_av_fill_ip_info(&qp->pri_av, &attr->ah_attr, + &sgid_attr, &sgid); + if (sgid_attr.ndev) + dev_put(sgid_attr.ndev); + } + + if (mask & IB_QP_ALT_PATH) { + u8 sgid_index = + rdma_ah_read_grh(&attr->alt_ah_attr)->sgid_index; + + ib_get_cached_gid(&rxe->ib_dev, 1, sgid_index, + &sgid, &sgid_attr); + + rxe_av_from_attr(attr->alt_port_num, &qp->alt_av, + &attr->alt_ah_attr); + rxe_av_fill_ip_info(&qp->alt_av, 
&attr->alt_ah_attr, + &sgid_attr, &sgid); + if (sgid_attr.ndev) + dev_put(sgid_attr.ndev); + + qp->attr.alt_port_num = attr->alt_port_num; + qp->attr.alt_pkey_index = attr->alt_pkey_index; + qp->attr.alt_timeout = attr->alt_timeout; + } + + if (mask & IB_QP_PATH_MTU) { + qp->attr.path_mtu = attr->path_mtu; + qp->mtu = ib_mtu_enum_to_int(attr->path_mtu); + } + + if (mask & IB_QP_TIMEOUT) { + qp->attr.timeout = attr->timeout; + if (attr->timeout == 0) { + qp->qp_timeout_jiffies = 0; + } else { + /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */ + int j = nsecs_to_jiffies(4096ULL << attr->timeout); + + qp->qp_timeout_jiffies = j ? j : 1; + } + } + + if (mask & IB_QP_RETRY_CNT) { + qp->attr.retry_cnt = attr->retry_cnt; + qp->comp.retry_cnt = attr->retry_cnt; + pr_debug("qp#%d set retry count = %d\n", qp_num(qp), + attr->retry_cnt); + } + + if (mask & IB_QP_RNR_RETRY) { + qp->attr.rnr_retry = attr->rnr_retry; + qp->comp.rnr_retry = attr->rnr_retry; + pr_debug("qp#%d set rnr retry count = %d\n", qp_num(qp), + attr->rnr_retry); + } + + if (mask & IB_QP_RQ_PSN) { + qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK); + qp->resp.psn = qp->attr.rq_psn; + pr_debug("qp#%d set resp psn = 0x%x\n", qp_num(qp), + qp->resp.psn); + } + + if (mask & IB_QP_MIN_RNR_TIMER) { + qp->attr.min_rnr_timer = attr->min_rnr_timer; + pr_debug("qp#%d set min rnr timer = 0x%x\n", qp_num(qp), + attr->min_rnr_timer); + } + + if (mask & IB_QP_SQ_PSN) { + qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK); + qp->req.psn = qp->attr.sq_psn; + qp->comp.psn = qp->attr.sq_psn; + pr_debug("qp#%d set req psn = 0x%x\n", qp_num(qp), qp->req.psn); + } + + if (mask & IB_QP_PATH_MIG_STATE) + qp->attr.path_mig_state = attr->path_mig_state; + + if (mask & IB_QP_DEST_QPN) + qp->attr.dest_qp_num = attr->dest_qp_num; + + if (mask & IB_QP_STATE) { + qp->attr.qp_state = attr->qp_state; + + switch (attr->qp_state) { + case IB_QPS_RESET: + pr_debug("qp#%d state -> RESET\n", qp_num(qp)); + rxe_qp_reset(qp); + break; + + case IB_QPS_INIT: + pr_debug("qp#%d state -> INIT\n", qp_num(qp)); + qp->req.state = QP_STATE_INIT; + qp->resp.state = QP_STATE_INIT; + break; + + case IB_QPS_RTR: + pr_debug("qp#%d state -> RTR\n", qp_num(qp)); + qp->resp.state = QP_STATE_READY; + break; + + case IB_QPS_RTS: + pr_debug("qp#%d state -> RTS\n", qp_num(qp)); + qp->req.state = QP_STATE_READY; + break; + + case IB_QPS_SQD: + pr_debug("qp#%d state -> SQD\n", qp_num(qp)); + rxe_qp_drain(qp); + break; + + case IB_QPS_SQE: + pr_warn("qp#%d state -> SQE !!?\n", qp_num(qp)); + /* Not possible from modify_qp. */ + break; + + case IB_QPS_ERR: + pr_debug("qp#%d state -> ERR\n", qp_num(qp)); + rxe_qp_error(qp); + break; + } + } + + return 0; +} + +/* called by the query qp verb */ +int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) +{ + *attr = qp->attr; + + attr->rq_psn = qp->resp.psn; + attr->sq_psn = qp->req.psn; + + attr->cap.max_send_wr = qp->sq.max_wr; + attr->cap.max_send_sge = qp->sq.max_sge; + attr->cap.max_inline_data = qp->sq.max_inline; + + if (!qp->srq) { + attr->cap.max_recv_wr = qp->rq.max_wr; + attr->cap.max_recv_sge = qp->rq.max_sge; + } + + rxe_av_to_attr(&qp->pri_av, &attr->ah_attr); + rxe_av_to_attr(&qp->alt_av, &attr->alt_ah_attr); + + if (qp->req.state == QP_STATE_DRAIN) { + attr->sq_draining = 1; + /* applications that get this state + * typically spin on it. 
yield the + * processor + */ + cond_resched(); + } else { + attr->sq_draining = 0; + } + + pr_debug("attr->sq_draining = %d\n", attr->sq_draining); + + return 0; +} + +/* called by the destroy qp verb */ +void rxe_qp_destroy(struct rxe_qp *qp) +{ + qp->valid = 0; + qp->qp_timeout_jiffies = 0; + rxe_cleanup_task(&qp->resp.task); + + if (qp_type(qp) == IB_QPT_RC) { + del_timer_sync(&qp->retrans_timer); + del_timer_sync(&qp->rnr_nak_timer); + } + + rxe_cleanup_task(&qp->req.task); + rxe_cleanup_task(&qp->comp.task); + + /* flush out any receive wr's or pending requests */ + __rxe_do_task(&qp->req.task); + if (qp->sq.queue) { + __rxe_do_task(&qp->comp.task); + __rxe_do_task(&qp->req.task); + } +} + +/* called when the last reference to the qp is dropped */ +static void rxe_qp_do_cleanup(struct work_struct *work) +{ + struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); + + rxe_drop_all_mcast_groups(qp); + + if (qp->sq.queue) + rxe_queue_cleanup(qp->sq.queue); + + if (qp->srq) + rxe_drop_ref(qp->srq); + + if (qp->rq.queue) + rxe_queue_cleanup(qp->rq.queue); + + if (qp->scq) + rxe_drop_ref(qp->scq); + if (qp->rcq) + rxe_drop_ref(qp->rcq); + if (qp->pd) + rxe_drop_ref(qp->pd); + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + if (qp_type(qp) == IB_QPT_RC) + sk_dst_reset(qp->sk->sk); + + free_rd_atomic_resources(qp); + + kernel_sock_shutdown(qp->sk, SHUT_RDWR); + sock_release(qp->sk); +} + +/* called when the last reference to the qp is dropped */ +void rxe_qp_cleanup(struct rxe_pool_entry *arg) +{ + struct rxe_qp *qp = container_of(arg, typeof(*qp), pelem); + + execute_in_process_context(rxe_qp_do_cleanup, &qp->cleanup_work); +} diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c new file mode 100644 index 0000000..f84ab44 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_queue.c @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must retailuce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
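One number in rxe_qp_from_attr() above is worth making concrete: the retransmit timeout follows the IB convention timeout = 4.096 us * 2^attr->timeout, which the driver evaluates in nanoseconds as 4096ULL << attr->timeout before converting to jiffies. The standalone sketch below only reruns that arithmetic over the 0..31 range the checks above allow; the loop values are arbitrary examples, and attr->timeout = 14, for instance, works out to roughly 67.1 ms.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        for (unsigned int t = 0; t <= 31; t += 7) {
                uint64_t ns = 4096ULL << t;     // same shift the driver feeds to nsecs_to_jiffies()

                printf("timeout=%2u -> %llu ns (about %.3f ms)\n",
                       t, (unsigned long long)ns, ns / 1e6);
        }
        return 0;
}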
+ */ + +#include +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int do_mmap_info(struct rxe_dev *rxe, + struct mminfo __user *outbuf, + struct ib_ucontext *context, + struct rxe_queue_buf *buf, + size_t buf_size, + struct rxe_mmap_info **ip_p) +{ + int err; + struct rxe_mmap_info *ip = NULL; + + if (outbuf) { + ip = rxe_create_mmap_info(rxe, buf_size, context, buf); + if (!ip) + goto err1; + + err = copy_to_user(outbuf, &ip->info, sizeof(ip->info)); + if (err) + goto err2; + + spin_lock_bh(&rxe->pending_lock); + list_add(&ip->pending_mmaps, &rxe->pending_mmaps); + spin_unlock_bh(&rxe->pending_lock); + } + + *ip_p = ip; + + return 0; + +err2: + kfree(ip); +err1: + return -EINVAL; +} + +inline void rxe_queue_reset(struct rxe_queue *q) +{ + /* queue is comprised from header and the memory + * of the actual queue. See "struct rxe_queue_buf" in rxe_queue.h + * reset only the queue itself and not the management header + */ + memset(q->buf->data, 0, q->buf_size - sizeof(struct rxe_queue_buf)); +} + +struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, + int *num_elem, + unsigned int elem_size) +{ + struct rxe_queue *q; + size_t buf_size; + unsigned int num_slots; + + /* num_elem == 0 is allowed, but uninteresting */ + if (*num_elem < 0) + goto err1; + + q = kmalloc(sizeof(*q), GFP_KERNEL); + if (!q) + goto err1; + + q->rxe = rxe; + + /* used in resize, only need to copy used part of queue */ + q->elem_size = elem_size; + + /* pad element up to at least a cacheline and always a power of 2 */ + if (elem_size < cache_line_size()) + elem_size = cache_line_size(); + elem_size = roundup_pow_of_two(elem_size); + + q->log2_elem_size = order_base_2(elem_size); + + num_slots = *num_elem + 1; + num_slots = roundup_pow_of_two(num_slots); + q->index_mask = num_slots - 1; + + buf_size = sizeof(struct rxe_queue_buf) + num_slots * elem_size; + + q->buf = vmalloc_user(buf_size); + if (!q->buf) + goto err2; + + q->buf->log2_elem_size = q->log2_elem_size; + q->buf->index_mask = q->index_mask; + + q->buf_size = buf_size; + + *num_elem = num_slots - 1; + return q; + +err2: + kfree(q); +err1: + return NULL; +} + +/* copies elements from original q to new q and then swaps the contents of the + * two q headers. 
This is so that if anyone is holding a pointer to q it will + * still work + */ +static int resize_finish(struct rxe_queue *q, struct rxe_queue *new_q, + unsigned int num_elem) +{ + if (!queue_empty(q) && (num_elem < queue_count(q))) + return -EINVAL; + + while (!queue_empty(q)) { + memcpy(producer_addr(new_q), consumer_addr(q), + new_q->elem_size); + advance_producer(new_q); + advance_consumer(q); + } + + swap(*q, *new_q); + + return 0; +} + +int rxe_queue_resize(struct rxe_queue *q, + unsigned int *num_elem_p, + unsigned int elem_size, + struct ib_ucontext *context, + struct mminfo __user *outbuf, + spinlock_t *producer_lock, + spinlock_t *consumer_lock) +{ + struct rxe_queue *new_q; + unsigned int num_elem = *num_elem_p; + int err; + unsigned long flags = 0, flags1; + + new_q = rxe_queue_init(q->rxe, &num_elem, elem_size); + if (!new_q) + return -ENOMEM; + + err = do_mmap_info(new_q->rxe, outbuf, context, new_q->buf, + new_q->buf_size, &new_q->ip); + if (err) { + vfree(new_q->buf); + kfree(new_q); + goto err1; + } + + spin_lock_irqsave(consumer_lock, flags1); + + if (producer_lock) { + spin_lock_irqsave(producer_lock, flags); + err = resize_finish(q, new_q, num_elem); + spin_unlock_irqrestore(producer_lock, flags); + } else { + err = resize_finish(q, new_q, num_elem); + } + + spin_unlock_irqrestore(consumer_lock, flags1); + + rxe_queue_cleanup(new_q); /* new/old dep on err */ + if (err) + goto err1; + + *num_elem_p = num_elem; + return 0; + +err1: + return err; +} + +void rxe_queue_cleanup(struct rxe_queue *q) +{ + if (q->ip) + kref_put(&q->ip->ref, rxe_mmap_release); + else + vfree(q->buf); + + kfree(q); +} diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h new file mode 100644 index 0000000..79ba4b3 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_queue.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
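The sizing rules in rxe_queue_init() above are compact but easy to misread: the element size is padded to at least a cache line and rounded up to a power of two, and one slot is given up so that equal producer and consumer indices always mean empty. The standalone sketch below walks one hypothetical request through those rules; the cache line size and the requested 100 elements of 24 bytes are assumptions for illustration.

#include <stdio.h>

static unsigned int roundup_pow2(unsigned int v)
{
        unsigned int r = 1;

        while (r < v)
                r <<= 1;
        return r;
}

int main(void)
{
        unsigned int cache_line = 64;   // assumed cache line size
        unsigned int elem_size = 24;    // requested element size
        unsigned int num_elem = 100;    // requested number of elements

        if (elem_size < cache_line)     // pad up to a cache line
                elem_size = cache_line;
        elem_size = roundup_pow2(elem_size);

        unsigned int num_slots = roundup_pow2(num_elem + 1);    // one spare slot

        printf("elem_size=%u num_slots=%u usable=%u index_mask=0x%x\n",
               elem_size, num_slots, num_slots - 1, num_slots - 1);
        return 0;
}

This prints elem_size=64 num_slots=128 usable=127 index_mask=0x7f, matching the 127-element capacity that rxe_queue_init() would report back through *num_elem for such a request.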
+ */ + +#ifndef RXE_QUEUE_H +#define RXE_QUEUE_H + +/* implements a simple circular buffer that can optionally be + * shared between user space and the kernel and can be resized + + * the requested element size is rounded up to a power of 2 + * and the number of elements in the buffer is also rounded + * up to a power of 2. Since the queue is empty when the + * producer and consumer indices match the maximum capacity + * of the queue is one less than the number of element slots + */ + +/* this data structure is shared between user space and kernel + * space for those cases where the queue is shared. It contains + * the producer and consumer indices. Is also contains a copy + * of the queue size parameters for user space to use but the + * kernel must use the parameters in the rxe_queue struct + * this MUST MATCH the corresponding librxe struct + * for performance reasons arrange to have producer and consumer + * pointers in separate cache lines + * the kernel should always mask the indices to avoid accessing + * memory outside of the data area + */ +struct rxe_queue_buf { + __u32 log2_elem_size; + __u32 index_mask; + __u32 pad_1[30]; + __u32 producer_index; + __u32 pad_2[31]; + __u32 consumer_index; + __u32 pad_3[31]; + __u8 data[0]; +}; + +struct rxe_queue { + struct rxe_dev *rxe; + struct rxe_queue_buf *buf; + struct rxe_mmap_info *ip; + size_t buf_size; + size_t elem_size; + unsigned int log2_elem_size; + unsigned int index_mask; +}; + +int do_mmap_info(struct rxe_dev *rxe, + struct mminfo __user *outbuf, + struct ib_ucontext *context, + struct rxe_queue_buf *buf, + size_t buf_size, + struct rxe_mmap_info **ip_p); + +void rxe_queue_reset(struct rxe_queue *q); + +struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, + int *num_elem, + unsigned int elem_size); + +int rxe_queue_resize(struct rxe_queue *q, + unsigned int *num_elem_p, + unsigned int elem_size, + struct ib_ucontext *context, + struct mminfo __user *outbuf, + /* Protect producers while resizing queue */ + spinlock_t *producer_lock, + /* Protect consumers while resizing queue */ + spinlock_t *consumer_lock); + +void rxe_queue_cleanup(struct rxe_queue *queue); + +static inline int next_index(struct rxe_queue *q, int index) +{ + return (index + 1) & q->buf->index_mask; +} + +static inline int queue_empty(struct rxe_queue *q) +{ + return ((q->buf->producer_index - q->buf->consumer_index) + & q->index_mask) == 0; +} + +static inline int queue_full(struct rxe_queue *q) +{ + return ((q->buf->producer_index + 1 - q->buf->consumer_index) + & q->index_mask) == 0; +} + +static inline void advance_producer(struct rxe_queue *q) +{ + q->buf->producer_index = (q->buf->producer_index + 1) + & q->index_mask; +} + +static inline void advance_consumer(struct rxe_queue *q) +{ + q->buf->consumer_index = (q->buf->consumer_index + 1) + & q->index_mask; +} + +static inline void *producer_addr(struct rxe_queue *q) +{ + return q->buf->data + ((q->buf->producer_index & q->index_mask) + << q->log2_elem_size); +} + +static inline void *consumer_addr(struct rxe_queue *q) +{ + return q->buf->data + ((q->buf->consumer_index & q->index_mask) + << q->log2_elem_size); +} + +static inline unsigned int producer_index(struct rxe_queue *q) +{ + return q->buf->producer_index; +} + +static inline unsigned int consumer_index(struct rxe_queue *q) +{ + return q->buf->consumer_index; +} + +static inline void *addr_from_index(struct rxe_queue *q, unsigned int index) +{ + return q->buf->data + ((index & q->index_mask) + << q->buf->log2_elem_size); +} + +static inline 
unsigned int index_from_addr(const struct rxe_queue *q, + const void *addr) +{ + return (((u8 *)addr - q->buf->data) >> q->log2_elem_size) + & q->index_mask; +} + +static inline unsigned int queue_count(const struct rxe_queue *q) +{ + return (q->buf->producer_index - q->buf->consumer_index) + & q->index_mask; +} + +static inline void *queue_head(struct rxe_queue *q) +{ + return queue_empty(q) ? NULL : consumer_addr(q); +} + +#endif /* RXE_QUEUE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c new file mode 100644 index 0000000..dd80c7d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
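The inline helpers above implement a classic power-of-two ring: indices are always masked, the queue is empty when producer and consumer match, and it is full when the producer sits one slot behind the consumer, so usable capacity is num_slots - 1. The standalone sketch below exercises that index arithmetic on an assumed 8-slot ring.

#include <stdio.h>

#define INDEX_MASK 7u   // 8 slots, so at most 7 usable entries

static unsigned int prod, cons;

static int ring_empty(void) { return ((prod - cons) & INDEX_MASK) == 0; }
static int ring_full(void)  { return ((prod + 1 - cons) & INDEX_MASK) == 0; }

int main(void)
{
        while (!ring_full())
                prod = (prod + 1) & INDEX_MASK;         // advance_producer()

        printf("filled %u entries, empty=%d full=%d\n",
               (prod - cons) & INDEX_MASK, ring_empty(), ring_full());

        while (!ring_empty())
                cons = (cons + 1) & INDEX_MASK;         // advance_consumer()

        printf("drained, empty=%d full=%d\n", ring_empty(), ring_full());
        return 0;
}

Giving up one slot is the usual price for telling full from empty without a separate element count, which is why rxe_queue_init() rounds num_elem + 1, not num_elem, up to a power of two.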
+ */ + +#include + +#include "rxe.h" +#include "rxe_loc.h" + +static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct rxe_qp *qp) +{ + if (unlikely(!qp->valid)) + goto err1; + + switch (qp_type(qp)) { + case IB_QPT_RC: + if (unlikely((pkt->opcode & IB_OPCODE_RC) != 0)) { + pr_warn_ratelimited("bad qp type\n"); + goto err1; + } + break; + case IB_QPT_UC: + if (unlikely(!(pkt->opcode & IB_OPCODE_UC))) { + pr_warn_ratelimited("bad qp type\n"); + goto err1; + } + break; + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + if (unlikely(!(pkt->opcode & IB_OPCODE_UD))) { + pr_warn_ratelimited("bad qp type\n"); + goto err1; + } + break; + default: + pr_warn_ratelimited("unsupported qp type\n"); + goto err1; + } + + if (pkt->mask & RXE_REQ_MASK) { + if (unlikely(qp->resp.state != QP_STATE_READY)) + goto err1; + } else if (unlikely(qp->req.state < QP_STATE_READY || + qp->req.state > QP_STATE_DRAINED)) { + goto err1; + } + + return 0; + +err1: + return -EINVAL; +} + +static void set_bad_pkey_cntr(struct rxe_port *port) +{ + spin_lock_bh(&port->port_lock); + port->attr.bad_pkey_cntr = min((u32)0xffff, + port->attr.bad_pkey_cntr + 1); + spin_unlock_bh(&port->port_lock); +} + +static void set_qkey_viol_cntr(struct rxe_port *port) +{ + spin_lock_bh(&port->port_lock); + port->attr.qkey_viol_cntr = min((u32)0xffff, + port->attr.qkey_viol_cntr + 1); + spin_unlock_bh(&port->port_lock); +} + +static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + u32 qpn, struct rxe_qp *qp) +{ + int i; + int found_pkey = 0; + struct rxe_port *port = &rxe->port; + u16 pkey = bth_pkey(pkt); + + pkt->pkey_index = 0; + + if (qpn == 1) { + for (i = 0; i < port->attr.pkey_tbl_len; i++) { + if (pkey_match(pkey, port->pkey_tbl[i])) { + pkt->pkey_index = i; + found_pkey = 1; + break; + } + } + + if (!found_pkey) { + pr_warn_ratelimited("bad pkey = 0x%x\n", pkey); + set_bad_pkey_cntr(port); + goto err1; + } + } else if (qpn != 0) { + if (unlikely(!pkey_match(pkey, + port->pkey_tbl[qp->attr.pkey_index] + ))) { + pr_warn_ratelimited("bad pkey = 0x%0x\n", pkey); + set_bad_pkey_cntr(port); + goto err1; + } + pkt->pkey_index = qp->attr.pkey_index; + } + + if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) && + qpn != 0 && pkt->mask) { + u32 qkey = (qpn == 1) ? 
GSI_QKEY : qp->attr.qkey; + + if (unlikely(deth_qkey(pkt) != qkey)) { + pr_warn_ratelimited("bad qkey, got 0x%x expected 0x%x for qpn 0x%x\n", + deth_qkey(pkt), qkey, qpn); + set_qkey_viol_cntr(port); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static int check_addr(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct rxe_qp *qp) +{ + struct sk_buff *skb = PKT_TO_SKB(pkt); + + if (qp_type(qp) != IB_QPT_RC && qp_type(qp) != IB_QPT_UC) + goto done; + + if (unlikely(pkt->port_num != qp->attr.port_num)) { + pr_warn_ratelimited("port %d != qp port %d\n", + pkt->port_num, qp->attr.port_num); + goto err1; + } + + if (skb->protocol == htons(ETH_P_IP)) { + struct in_addr *saddr = + &qp->pri_av.sgid_addr._sockaddr_in.sin_addr; + struct in_addr *daddr = + &qp->pri_av.dgid_addr._sockaddr_in.sin_addr; + + if (ip_hdr(skb)->daddr != saddr->s_addr) { + pr_warn_ratelimited("dst addr %pI4 != qp source addr %pI4\n", + &ip_hdr(skb)->daddr, + &saddr->s_addr); + goto err1; + } + + if (ip_hdr(skb)->saddr != daddr->s_addr) { + pr_warn_ratelimited("source addr %pI4 != qp dst addr %pI4\n", + &ip_hdr(skb)->saddr, + &daddr->s_addr); + goto err1; + } + + } else if (skb->protocol == htons(ETH_P_IPV6)) { + struct in6_addr *saddr = + &qp->pri_av.sgid_addr._sockaddr_in6.sin6_addr; + struct in6_addr *daddr = + &qp->pri_av.dgid_addr._sockaddr_in6.sin6_addr; + + if (memcmp(&ipv6_hdr(skb)->daddr, saddr, sizeof(*saddr))) { + pr_warn_ratelimited("dst addr %pI6 != qp source addr %pI6\n", + &ipv6_hdr(skb)->daddr, saddr); + goto err1; + } + + if (memcmp(&ipv6_hdr(skb)->saddr, daddr, sizeof(*daddr))) { + pr_warn_ratelimited("source addr %pI6 != qp dst addr %pI6\n", + &ipv6_hdr(skb)->saddr, daddr); + goto err1; + } + } + +done: + return 0; + +err1: + return -EINVAL; +} + +static int hdr_check(struct rxe_pkt_info *pkt) +{ + struct rxe_dev *rxe = pkt->rxe; + struct rxe_port *port = &rxe->port; + struct rxe_qp *qp = NULL; + u32 qpn = bth_qpn(pkt); + int index; + int err; + + if (unlikely(bth_tver(pkt) != BTH_TVER)) { + pr_warn_ratelimited("bad tver\n"); + goto err1; + } + + if (qpn != IB_MULTICAST_QPN) { + index = (qpn == 0) ? port->qp_smi_index : + ((qpn == 1) ? 
port->qp_gsi_index : qpn); + qp = rxe_pool_get_index(&rxe->qp_pool, index); + if (unlikely(!qp)) { + pr_warn_ratelimited("no qp matches qpn 0x%x\n", qpn); + goto err1; + } + + err = check_type_state(rxe, pkt, qp); + if (unlikely(err)) + goto err2; + + err = check_addr(rxe, pkt, qp); + if (unlikely(err)) + goto err2; + + err = check_keys(rxe, pkt, qpn, qp); + if (unlikely(err)) + goto err2; + } else { + if (unlikely((pkt->mask & RXE_GRH_MASK) == 0)) { + pr_warn_ratelimited("no grh for mcast qpn\n"); + goto err1; + } + } + + pkt->qp = qp; + return 0; + +err2: + if (qp) + rxe_drop_ref(qp); +err1: + return -EINVAL; +} + +static inline void rxe_rcv_pkt(struct rxe_dev *rxe, + struct rxe_pkt_info *pkt, + struct sk_buff *skb) +{ + if (pkt->mask & RXE_REQ_MASK) + rxe_resp_queue_pkt(rxe, pkt->qp, skb); + else + rxe_comp_queue_pkt(rxe, pkt->qp, skb); +} + +static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) +{ + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + struct rxe_mc_grp *mcg; + struct rxe_mc_elem *mce; + struct rxe_qp *qp; + union ib_gid dgid; + int err; + + if (skb->protocol == htons(ETH_P_IP)) + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + (struct in6_addr *)&dgid); + else if (skb->protocol == htons(ETH_P_IPV6)) + memcpy(&dgid, &ipv6_hdr(skb)->daddr, sizeof(dgid)); + + /* lookup mcast group corresponding to mgid, takes a ref */ + mcg = rxe_pool_get_key(&rxe->mc_grp_pool, &dgid); + if (!mcg) + goto err1; /* mcast group not registered */ + + spin_lock_bh(&mcg->mcg_lock); + + list_for_each_entry(mce, &mcg->qp_list, qp_list) { + qp = mce->qp; + pkt = SKB_TO_PKT(skb); + + /* validate qp for incoming packet */ + err = check_type_state(rxe, pkt, qp); + if (err) + continue; + + err = check_keys(rxe, pkt, bth_qpn(pkt), qp); + if (err) + continue; + + /* if *not* the last qp in the list + * increase the users of the skb then post to the next qp + */ + if (mce->qp_list.next != &mcg->qp_list) + refcount_inc(&skb->users); + + pkt->qp = qp; + rxe_add_ref(qp); + rxe_rcv_pkt(rxe, pkt, skb); + } + + spin_unlock_bh(&mcg->mcg_lock); + + rxe_drop_ref(mcg); /* drop ref from rxe_pool_get_key. 
*/ + +err1: + kfree_skb(skb); +} + +static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) +{ + union ib_gid dgid; + union ib_gid *pdgid; + + if (skb->protocol == htons(ETH_P_IP)) { + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + (struct in6_addr *)&dgid); + pdgid = &dgid; + } else { + pdgid = (union ib_gid *)&ipv6_hdr(skb)->daddr; + } + + return ib_find_cached_gid_by_port(&rxe->ib_dev, pdgid, + IB_GID_TYPE_ROCE_UDP_ENCAP, + 1, skb->dev, NULL); +} + +/* rxe_rcv is called from the interface driver */ +int rxe_rcv(struct sk_buff *skb) +{ + int err; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + struct rxe_dev *rxe = pkt->rxe; + __be32 *icrcp; + u32 calc_icrc, pack_icrc; + + pkt->offset = 0; + + if (unlikely(skb->len < pkt->offset + RXE_BTH_BYTES)) + goto drop; + + if (unlikely(rxe_match_dgid(rxe, skb) < 0)) { + pr_warn_ratelimited("failed matching dgid\n"); + goto drop; + } + + pkt->opcode = bth_opcode(pkt); + pkt->psn = bth_psn(pkt); + pkt->qp = NULL; + pkt->mask |= rxe_opcode[pkt->opcode].mask; + + if (unlikely(skb->len < header_size(pkt))) + goto drop; + + err = hdr_check(pkt); + if (unlikely(err)) + goto drop; + + /* Verify ICRC */ + icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE); + pack_icrc = be32_to_cpu(*icrcp); + + calc_icrc = rxe_icrc_hdr(pkt, skb); + calc_icrc = rxe_crc32(rxe, calc_icrc, (u8 *)payload_addr(pkt), + payload_size(pkt)); + calc_icrc = (__force u32)cpu_to_be32(~calc_icrc); + if (unlikely(calc_icrc != pack_icrc)) { + if (skb->protocol == htons(ETH_P_IPV6)) + pr_warn_ratelimited("bad ICRC from %pI6c\n", + &ipv6_hdr(skb)->saddr); + else if (skb->protocol == htons(ETH_P_IP)) + pr_warn_ratelimited("bad ICRC from %pI4\n", + &ip_hdr(skb)->saddr); + else + pr_warn_ratelimited("bad ICRC from unknown\n"); + + goto drop; + } + + rxe_counter_inc(rxe, RXE_CNT_RCVD_PKTS); + + if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN)) + rxe_rcv_mcast_pkt(rxe, skb); + else + rxe_rcv_pkt(rxe, pkt, skb); + + return 0; + +drop: + if (pkt->qp) + rxe_drop_ref(pkt->qp); + + kfree_skb(skb); + return 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c new file mode 100644 index 0000000..7851999 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -0,0 +1,751 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + u32 opcode); + +static inline void retry_first_write_send(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + unsigned int mask, int npsn) +{ + int i; + + for (i = 0; i < npsn; i++) { + int to_send = (wqe->dma.resid > qp->mtu) ? + qp->mtu : wqe->dma.resid; + + qp->req.opcode = next_opcode(qp, wqe, + wqe->wr.opcode); + + if (wqe->wr.send_flags & IB_SEND_INLINE) { + wqe->dma.resid -= to_send; + wqe->dma.sge_offset += to_send; + } else { + advance_dma_data(&wqe->dma, to_send); + } + if (mask & WR_WRITE_MASK) + wqe->iova += qp->mtu; + } +} + +static void req_retry(struct rxe_qp *qp) +{ + struct rxe_send_wqe *wqe; + unsigned int wqe_index; + unsigned int mask; + int npsn; + int first = 1; + + wqe = queue_head(qp->sq.queue); + npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK; + + qp->req.wqe_index = consumer_index(qp->sq.queue); + qp->req.psn = qp->comp.psn; + qp->req.opcode = -1; + + for (wqe_index = consumer_index(qp->sq.queue); + wqe_index != producer_index(qp->sq.queue); + wqe_index = next_index(qp->sq.queue, wqe_index)) { + wqe = addr_from_index(qp->sq.queue, wqe_index); + mask = wr_opcode_mask(wqe->wr.opcode, qp); + + if (wqe->state == wqe_state_posted) + break; + + if (wqe->state == wqe_state_done) + continue; + + wqe->iova = (mask & WR_ATOMIC_MASK) ? + wqe->wr.wr.atomic.remote_addr : + (mask & WR_READ_OR_WRITE_MASK) ? 
+ wqe->wr.wr.rdma.remote_addr : + 0; + + if (!first || (mask & WR_READ_MASK) == 0) { + wqe->dma.resid = wqe->dma.length; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + } + + if (first) { + first = 0; + + if (mask & WR_WRITE_OR_SEND_MASK) + retry_first_write_send(qp, wqe, mask, npsn); + + if (mask & WR_READ_MASK) + wqe->iova += npsn * qp->mtu; + } + + wqe->state = wqe_state_posted; + } +} + +void rnr_nak_timer(struct timer_list *t) +{ + struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); + + pr_debug("qp#%d rnr nak timer fired\n", qp_num(qp)); + rxe_run_task(&qp->req.task, 1); +} + +static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) +{ + struct rxe_send_wqe *wqe = queue_head(qp->sq.queue); + unsigned long flags; + + if (unlikely(qp->req.state == QP_STATE_DRAIN)) { + /* check to see if we are drained; + * state_lock used by requester and completer + */ + spin_lock_irqsave(&qp->state_lock, flags); + do { + if (qp->req.state != QP_STATE_DRAIN) { + /* comp just finished */ + spin_unlock_irqrestore(&qp->state_lock, + flags); + break; + } + + if (wqe && ((qp->req.wqe_index != + consumer_index(qp->sq.queue)) || + (wqe->state != wqe_state_posted))) { + /* comp not done yet */ + spin_unlock_irqrestore(&qp->state_lock, + flags); + break; + } + + qp->req.state = QP_STATE_DRAINED; + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_SQ_DRAINED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + } while (0); + } + + if (qp->req.wqe_index == producer_index(qp->sq.queue)) + return NULL; + + wqe = addr_from_index(qp->sq.queue, qp->req.wqe_index); + + if (unlikely((qp->req.state == QP_STATE_DRAIN || + qp->req.state == QP_STATE_DRAINED) && + (wqe->state != wqe_state_processing))) + return NULL; + + if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) && + (qp->req.wqe_index != consumer_index(qp->sq.queue)))) { + qp->req.wait_fence = 1; + return NULL; + } + + wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); + return wqe; +} + +static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: + if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_RC_RDMA_WRITE_LAST : + IB_OPCODE_RC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_RC_RDMA_WRITE_ONLY : + IB_OPCODE_RC_RDMA_WRITE_FIRST; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE : + IB_OPCODE_RC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE : + IB_OPCODE_RC_RDMA_WRITE_FIRST; + + case IB_WR_SEND: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? + IB_OPCODE_RC_SEND_LAST : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_RC_SEND_ONLY : + IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_SEND_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? + IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? 
+ IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE : + IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_RDMA_READ: + return IB_OPCODE_RC_RDMA_READ_REQUEST; + + case IB_WR_ATOMIC_CMP_AND_SWP: + return IB_OPCODE_RC_COMPARE_SWAP; + + case IB_WR_ATOMIC_FETCH_AND_ADD: + return IB_OPCODE_RC_FETCH_ADD; + + case IB_WR_SEND_WITH_INV: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE : + IB_OPCODE_RC_SEND_FIRST; + case IB_WR_REG_MR: + case IB_WR_LOCAL_INV: + return opcode; + } + + return -EINVAL; +} + +static int next_opcode_uc(struct rxe_qp *qp, u32 opcode, int fits) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: + if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_UC_RDMA_WRITE_LAST : + IB_OPCODE_UC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_UC_RDMA_WRITE_ONLY : + IB_OPCODE_UC_RDMA_WRITE_FIRST; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE : + IB_OPCODE_UC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE : + IB_OPCODE_UC_RDMA_WRITE_FIRST; + + case IB_WR_SEND: + if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE) + return fits ? + IB_OPCODE_UC_SEND_LAST : + IB_OPCODE_UC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_UC_SEND_ONLY : + IB_OPCODE_UC_SEND_FIRST; + + case IB_WR_SEND_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE) + return fits ? + IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE : + IB_OPCODE_UC_SEND_MIDDLE; + else + return fits ? 
+ IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE : + IB_OPCODE_UC_SEND_FIRST; + } + + return -EINVAL; +} + +static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + u32 opcode) +{ + int fits = (wqe->dma.resid <= qp->mtu); + + switch (qp_type(qp)) { + case IB_QPT_RC: + return next_opcode_rc(qp, opcode, fits); + + case IB_QPT_UC: + return next_opcode_uc(qp, opcode, fits); + + case IB_QPT_SMI: + case IB_QPT_UD: + case IB_QPT_GSI: + switch (opcode) { + case IB_WR_SEND: + return IB_OPCODE_UD_SEND_ONLY; + + case IB_WR_SEND_WITH_IMM: + return IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + } + break; + + default: + break; + } + + return -EINVAL; +} + +static inline int check_init_depth(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + int depth; + + if (wqe->has_rd_atomic) + return 0; + + qp->req.need_rd_atomic = 1; + depth = atomic_dec_return(&qp->req.rd_atomic); + + if (depth >= 0) { + qp->req.need_rd_atomic = 0; + wqe->has_rd_atomic = 1; + return 0; + } + + atomic_inc(&qp->req.rd_atomic); + return -EAGAIN; +} + +static inline int get_mtu(struct rxe_qp *qp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + if ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC)) + return qp->mtu; + + return rxe->port.mtu_cap; +} + +static struct sk_buff *init_req_packet(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + int opcode, int payload, + struct rxe_pkt_info *pkt) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_port *port = &rxe->port; + struct sk_buff *skb; + struct rxe_send_wr *ibwr = &wqe->wr; + struct rxe_av *av; + int pad = (-payload) & 0x3; + int paylen; + int solicited; + u16 pkey; + u32 qp_num; + int ack_req; + + /* length from start of bth to end of icrc */ + paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; + + /* pkt->hdr, rxe, port_num and mask are initialized in ifc + * layer + */ + pkt->opcode = opcode; + pkt->qp = qp; + pkt->psn = qp->req.psn; + pkt->mask = rxe_opcode[opcode].mask; + pkt->paylen = paylen; + pkt->offset = 0; + pkt->wqe = wqe; + + /* init skb */ + av = rxe_get_av(pkt); + skb = rxe_init_packet(rxe, av, paylen, pkt); + if (unlikely(!skb)) + return NULL; + + /* init bth */ + solicited = (ibwr->send_flags & IB_SEND_SOLICITED) && + (pkt->mask & RXE_END_MASK) && + ((pkt->mask & (RXE_SEND_MASK)) || + (pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) == + (RXE_WRITE_MASK | RXE_IMMDT_MASK)); + + pkey = (qp_type(qp) == IB_QPT_GSI) ? + port->pkey_tbl[ibwr->wr.ud.pkey_index] : + port->pkey_tbl[qp->attr.pkey_index]; + + qp_num = (pkt->mask & RXE_DETH_MASK) ? 
ibwr->wr.ud.remote_qpn : + qp->attr.dest_qp_num; + + ack_req = ((pkt->mask & RXE_END_MASK) || + (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); + if (ack_req) + qp->req.noack_pkts = 0; + + bth_init(pkt, pkt->opcode, solicited, 0, pad, pkey, qp_num, + ack_req, pkt->psn); + + /* init optional headers */ + if (pkt->mask & RXE_RETH_MASK) { + reth_set_rkey(pkt, ibwr->wr.rdma.rkey); + reth_set_va(pkt, wqe->iova); + reth_set_len(pkt, wqe->dma.length); + } + + if (pkt->mask & RXE_IMMDT_MASK) + immdt_set_imm(pkt, ibwr->ex.imm_data); + + if (pkt->mask & RXE_IETH_MASK) + ieth_set_rkey(pkt, ibwr->ex.invalidate_rkey); + + if (pkt->mask & RXE_ATMETH_MASK) { + atmeth_set_va(pkt, wqe->iova); + if (opcode == IB_OPCODE_RC_COMPARE_SWAP || + opcode == IB_OPCODE_RD_COMPARE_SWAP) { + atmeth_set_swap_add(pkt, ibwr->wr.atomic.swap); + atmeth_set_comp(pkt, ibwr->wr.atomic.compare_add); + } else { + atmeth_set_swap_add(pkt, ibwr->wr.atomic.compare_add); + } + atmeth_set_rkey(pkt, ibwr->wr.atomic.rkey); + } + + if (pkt->mask & RXE_DETH_MASK) { + if (qp->ibqp.qp_num == 1) + deth_set_qkey(pkt, GSI_QKEY); + else + deth_set_qkey(pkt, ibwr->wr.ud.remote_qkey); + deth_set_sqp(pkt, qp->ibqp.qp_num); + } + + return skb; +} + +static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt, struct sk_buff *skb, + int paylen) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + u32 crc = 0; + u32 *p; + int err; + + err = rxe_prepare(rxe, pkt, skb, &crc); + if (err) + return err; + + if (pkt->mask & RXE_WRITE_OR_SEND) { + if (wqe->wr.send_flags & IB_SEND_INLINE) { + u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset]; + + crc = rxe_crc32(rxe, crc, tmp, paylen); + memcpy(payload_addr(pkt), tmp, paylen); + + wqe->dma.resid -= paylen; + wqe->dma.sge_offset += paylen; + } else { + err = copy_data(rxe, qp->pd, 0, &wqe->dma, + payload_addr(pkt), paylen, + from_mem_obj, + &crc); + if (err) + return err; + } + } + p = payload_addr(pkt) + paylen + bth_pad(pkt); + + *p = ~crc; + + return 0; +} + +static void update_wqe_state(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt) +{ + if (pkt->mask & RXE_END_MASK) { + if (qp_type(qp) == IB_QPT_RC) + wqe->state = wqe_state_pending; + } else { + wqe->state = wqe_state_processing; + } +} + +static void update_wqe_psn(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt, + int payload) +{ + /* number of packets left to send including current one */ + int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu; + + /* handle zero length packet case */ + if (num_pkt == 0) + num_pkt = 1; + + if (pkt->mask & RXE_START_MASK) { + wqe->first_psn = qp->req.psn; + wqe->last_psn = (qp->req.psn + num_pkt - 1) & BTH_PSN_MASK; + } + + if (pkt->mask & RXE_READ_MASK) + qp->req.psn = (wqe->first_psn + num_pkt) & BTH_PSN_MASK; + else + qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; +} + +static void save_state(struct rxe_send_wqe *wqe, + struct rxe_qp *qp, + struct rxe_send_wqe *rollback_wqe, + u32 *rollback_psn) +{ + rollback_wqe->state = wqe->state; + rollback_wqe->first_psn = wqe->first_psn; + rollback_wqe->last_psn = wqe->last_psn; + *rollback_psn = qp->req.psn; +} + +static void rollback_state(struct rxe_send_wqe *wqe, + struct rxe_qp *qp, + struct rxe_send_wqe *rollback_wqe, + u32 rollback_psn) +{ + wqe->state = rollback_wqe->state; + wqe->first_psn = rollback_wqe->first_psn; + wqe->last_psn = rollback_wqe->last_psn; + qp->req.psn = rollback_psn; +} + +static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + 
struct rxe_pkt_info *pkt, int payload) +{ + qp->req.opcode = pkt->opcode; + + if (pkt->mask & RXE_END_MASK) + qp->req.wqe_index = next_index(qp->sq.queue, qp->req.wqe_index); + + qp->need_req_skb = 0; + + if (qp->qp_timeout_jiffies && !timer_pending(&qp->retrans_timer)) + mod_timer(&qp->retrans_timer, + jiffies + qp->qp_timeout_jiffies); +} + +int rxe_requester(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + struct rxe_pkt_info pkt; + struct sk_buff *skb; + struct rxe_send_wqe *wqe; + enum rxe_hdr_mask mask; + int payload; + int mtu; + int opcode; + int ret; + struct rxe_send_wqe rollback_wqe; + u32 rollback_psn; + + rxe_add_ref(qp); + +next_wqe: + if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR)) + goto exit; + + if (unlikely(qp->req.state == QP_STATE_RESET)) { + qp->req.wqe_index = consumer_index(qp->sq.queue); + qp->req.opcode = -1; + qp->req.need_rd_atomic = 0; + qp->req.wait_psn = 0; + qp->req.need_retry = 0; + goto exit; + } + + if (unlikely(qp->req.need_retry)) { + req_retry(qp); + qp->req.need_retry = 0; + } + + wqe = req_next_wqe(qp); + if (unlikely(!wqe)) + goto exit; + + if (wqe->mask & WR_REG_MASK) { + if (wqe->wr.opcode == IB_WR_LOCAL_INV) { + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_mem *rmr; + + rmr = rxe_pool_get_index(&rxe->mr_pool, + wqe->wr.ex.invalidate_rkey >> 8); + if (!rmr) { + pr_err("No mr for key %#x\n", + wqe->wr.ex.invalidate_rkey); + wqe->state = wqe_state_error; + wqe->status = IB_WC_MW_BIND_ERR; + goto exit; + } + rmr->state = RXE_MEM_STATE_FREE; + rxe_drop_ref(rmr); + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + } else if (wqe->wr.opcode == IB_WR_REG_MR) { + struct rxe_mem *rmr = to_rmr(wqe->wr.wr.reg.mr); + + rmr->state = RXE_MEM_STATE_VALID; + rmr->access = wqe->wr.wr.reg.access; + rmr->lkey = wqe->wr.wr.reg.key; + rmr->rkey = wqe->wr.wr.reg.key; + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + } else { + goto exit; + } + qp->req.wqe_index = next_index(qp->sq.queue, + qp->req.wqe_index); + goto next_wqe; + } + + if (unlikely(qp_type(qp) == IB_QPT_RC && + qp->req.psn > (qp->comp.psn + RXE_MAX_UNACKED_PSNS))) { + qp->req.wait_psn = 1; + goto exit; + } + + /* Limit the number of inflight SKBs per QP */ + if (unlikely(atomic_read(&qp->skb_out) > + RXE_INFLIGHT_SKBS_PER_QP_HIGH)) { + qp->need_req_skb = 1; + goto exit; + } + + opcode = next_opcode(qp, wqe, wqe->wr.opcode); + if (unlikely(opcode < 0)) { + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto exit; + } + + mask = rxe_opcode[opcode].mask; + if (unlikely(mask & RXE_READ_OR_ATOMIC)) { + if (check_init_depth(qp, wqe)) + goto exit; + } + + mtu = get_mtu(qp); + payload = (mask & RXE_WRITE_OR_SEND) ? wqe->dma.resid : 0; + if (payload > mtu) { + if (qp_type(qp) == IB_QPT_UD) { + /* C10-93.1.1: If the total sum of all the buffer lengths specified for a + * UD message exceeds the MTU of the port as returned by QueryHCA, the CI + * shall not emit any packets for this message. Further, the CI shall not + * generate an error due to this condition. 
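/*
 * Illustrative sketch (editor's addition, not part of the patch): the PSN
 * bookkeeping in update_wqe_psn() above boils down to a ceiling division of
 * the remaining payload by the path MTU plus wraparound arithmetic on the
 * 24-bit packet sequence number. PSN_MASK below stands in for BTH_PSN_MASK,
 * assumed here to be the 24-bit wire mask; function names are hypothetical.
 */
#include <stdint.h>

#define PSN_MASK 0x00ffffffu    /* assumed value of BTH_PSN_MASK */

static inline uint32_t wqe_num_packets(uint32_t resid, uint32_t payload,
                                       uint32_t mtu)
{
        uint32_t n = (resid + payload + mtu - 1) / mtu;

        return n ? n : 1;       /* a zero-length op still consumes one packet */
}

static inline uint32_t wqe_last_psn(uint32_t first_psn, uint32_t resid,
                                    uint32_t payload, uint32_t mtu)
{
        return (first_psn + wqe_num_packets(resid, payload, mtu) - 1)
                & PSN_MASK;
}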
+ */ + + /* fake a successful UD send */ + wqe->first_psn = qp->req.psn; + wqe->last_psn = qp->req.psn; + qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; + qp->req.opcode = IB_OPCODE_UD_SEND_ONLY; + qp->req.wqe_index = next_index(qp->sq.queue, + qp->req.wqe_index); + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + __rxe_do_task(&qp->comp.task); + rxe_drop_ref(qp); + return 0; + } + payload = mtu; + } + + skb = init_req_packet(qp, wqe, opcode, payload, &pkt); + if (unlikely(!skb)) { + pr_err("qp#%d Failed allocating skb\n", qp_num(qp)); + goto err; + } + + if (fill_packet(qp, wqe, &pkt, skb, payload)) { + pr_debug("qp#%d Error during fill packet\n", qp_num(qp)); + goto err; + } + + /* + * To prevent a race on wqe access between requester and completer, + * wqe members state and psn need to be set before calling + * rxe_xmit_packet(). + * Otherwise, completer might initiate an unjustified retry flow. + */ + save_state(wqe, qp, &rollback_wqe, &rollback_psn); + update_wqe_state(qp, wqe, &pkt); + update_wqe_psn(qp, wqe, &pkt, payload); + ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb); + if (ret) { + qp->need_req_skb = 1; + + rollback_state(wqe, qp, &rollback_wqe, rollback_psn); + + if (ret == -EAGAIN) { + rxe_run_task(&qp->req.task, 1); + goto exit; + } + + goto err; + } + + update_state(qp, wqe, &pkt, payload); + + goto next_wqe; + +err: + kfree_skb(skb); + wqe->status = IB_WC_LOC_PROT_ERR; + wqe->state = wqe_state_error; + __rxe_do_task(&qp->comp.task); + +exit: + rxe_drop_ref(qp); + return -EAGAIN; +} diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c new file mode 100644 index 0000000..955ff3b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -0,0 +1,1405 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +enum resp_states { + RESPST_NONE, + RESPST_GET_REQ, + RESPST_CHK_PSN, + RESPST_CHK_OP_SEQ, + RESPST_CHK_OP_VALID, + RESPST_CHK_RESOURCE, + RESPST_CHK_LENGTH, + RESPST_CHK_RKEY, + RESPST_EXECUTE, + RESPST_READ_REPLY, + RESPST_COMPLETE, + RESPST_ACKNOWLEDGE, + RESPST_CLEANUP, + RESPST_DUPLICATE_REQUEST, + RESPST_ERR_MALFORMED_WQE, + RESPST_ERR_UNSUPPORTED_OPCODE, + RESPST_ERR_MISALIGNED_ATOMIC, + RESPST_ERR_PSN_OUT_OF_SEQ, + RESPST_ERR_MISSING_OPCODE_FIRST, + RESPST_ERR_MISSING_OPCODE_LAST_C, + RESPST_ERR_MISSING_OPCODE_LAST_D1E, + RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, + RESPST_ERR_RNR, + RESPST_ERR_RKEY_VIOLATION, + RESPST_ERR_LENGTH, + RESPST_ERR_CQ_OVERFLOW, + RESPST_ERROR, + RESPST_RESET, + RESPST_DONE, + RESPST_EXIT, +}; + +static char *resp_state_name[] = { + [RESPST_NONE] = "NONE", + [RESPST_GET_REQ] = "GET_REQ", + [RESPST_CHK_PSN] = "CHK_PSN", + [RESPST_CHK_OP_SEQ] = "CHK_OP_SEQ", + [RESPST_CHK_OP_VALID] = "CHK_OP_VALID", + [RESPST_CHK_RESOURCE] = "CHK_RESOURCE", + [RESPST_CHK_LENGTH] = "CHK_LENGTH", + [RESPST_CHK_RKEY] = "CHK_RKEY", + [RESPST_EXECUTE] = "EXECUTE", + [RESPST_READ_REPLY] = "READ_REPLY", + [RESPST_COMPLETE] = "COMPLETE", + [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", + [RESPST_CLEANUP] = "CLEANUP", + [RESPST_DUPLICATE_REQUEST] = "DUPLICATE_REQUEST", + [RESPST_ERR_MALFORMED_WQE] = "ERR_MALFORMED_WQE", + [RESPST_ERR_UNSUPPORTED_OPCODE] = "ERR_UNSUPPORTED_OPCODE", + [RESPST_ERR_MISALIGNED_ATOMIC] = "ERR_MISALIGNED_ATOMIC", + [RESPST_ERR_PSN_OUT_OF_SEQ] = "ERR_PSN_OUT_OF_SEQ", + [RESPST_ERR_MISSING_OPCODE_FIRST] = "ERR_MISSING_OPCODE_FIRST", + [RESPST_ERR_MISSING_OPCODE_LAST_C] = "ERR_MISSING_OPCODE_LAST_C", + [RESPST_ERR_MISSING_OPCODE_LAST_D1E] = "ERR_MISSING_OPCODE_LAST_D1E", + [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ] = "ERR_TOO_MANY_RDMA_ATM_REQ", + [RESPST_ERR_RNR] = "ERR_RNR", + [RESPST_ERR_RKEY_VIOLATION] = "ERR_RKEY_VIOLATION", + [RESPST_ERR_LENGTH] = "ERR_LENGTH", + [RESPST_ERR_CQ_OVERFLOW] = "ERR_CQ_OVERFLOW", + [RESPST_ERROR] = "ERROR", + [RESPST_RESET] = "RESET", + [RESPST_DONE] = "DONE", + [RESPST_EXIT] = "EXIT", +}; + +/* rxe_recv calls here to add a request packet to the input queue */ +void rxe_resp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp, + struct sk_buff *skb) +{ + int must_sched; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + skb_queue_tail(&qp->req_pkts, skb); + + must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) || + (skb_queue_len(&qp->req_pkts) > 1); + + rxe_run_task(&qp->resp.task, must_sched); +} + +static inline enum resp_states get_req(struct rxe_qp *qp, + struct rxe_pkt_info **pkt_p) +{ + struct sk_buff *skb; + + if (qp->resp.state == QP_STATE_ERROR) { + skb = skb_dequeue(&qp->req_pkts); + if (skb) { + /* drain request packet queue */ + rxe_drop_ref(qp); + kfree_skb(skb); + return RESPST_GET_REQ; + } + + /* go drain recv wr queue */ + return RESPST_CHK_RESOURCE; + } + + skb = skb_peek(&qp->req_pkts); + if (!skb) + return RESPST_EXIT; + + *pkt_p = SKB_TO_PKT(skb); + + return (qp->resp.res) ? 
RESPST_READ_REPLY : RESPST_CHK_PSN; +} + +static enum resp_states check_psn(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + int diff = psn_compare(pkt->psn, qp->resp.psn); + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + switch (qp_type(qp)) { + case IB_QPT_RC: + if (diff > 0) { + if (qp->resp.sent_psn_nak) + return RESPST_CLEANUP; + + qp->resp.sent_psn_nak = 1; + rxe_counter_inc(rxe, RXE_CNT_OUT_OF_SEQ_REQ); + return RESPST_ERR_PSN_OUT_OF_SEQ; + + } else if (diff < 0) { + rxe_counter_inc(rxe, RXE_CNT_DUP_REQ); + return RESPST_DUPLICATE_REQUEST; + } + + if (qp->resp.sent_psn_nak) + qp->resp.sent_psn_nak = 0; + + break; + + case IB_QPT_UC: + if (qp->resp.drop_msg || diff != 0) { + if (pkt->mask & RXE_START_MASK) { + qp->resp.drop_msg = 0; + return RESPST_CHK_OP_SEQ; + } + + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + } + break; + default: + break; + } + + return RESPST_CHK_OP_SEQ; +} + +static enum resp_states check_op_seq(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + switch (qp->resp.opcode) { + case IB_OPCODE_RC_SEND_FIRST: + case IB_OPCODE_RC_SEND_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_RC_SEND_MIDDLE: + case IB_OPCODE_RC_SEND_LAST: + case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_C; + } + + case IB_OPCODE_RC_RDMA_WRITE_FIRST: + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_RC_RDMA_WRITE_LAST: + case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_C; + } + + default: + switch (pkt->opcode) { + case IB_OPCODE_RC_SEND_MIDDLE: + case IB_OPCODE_RC_SEND_LAST: + case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_RC_RDMA_WRITE_LAST: + case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_ERR_MISSING_OPCODE_FIRST; + default: + return RESPST_CHK_OP_VALID; + } + } + break; + + case IB_QPT_UC: + switch (qp->resp.opcode) { + case IB_OPCODE_UC_SEND_FIRST: + case IB_OPCODE_UC_SEND_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_UC_SEND_MIDDLE: + case IB_OPCODE_UC_SEND_LAST: + case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_D1E; + } + + case IB_OPCODE_UC_RDMA_WRITE_FIRST: + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_UC_RDMA_WRITE_LAST: + case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_D1E; + } + + default: + switch (pkt->opcode) { + case IB_OPCODE_UC_SEND_MIDDLE: + case IB_OPCODE_UC_SEND_LAST: + case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_UC_RDMA_WRITE_LAST: + case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + default: + return RESPST_CHK_OP_VALID; + } + } + break; + + default: + return RESPST_CHK_OP_VALID; + } +} + +static enum resp_states check_op_valid(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + if (((pkt->mask & RXE_READ_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || + ((pkt->mask & RXE_WRITE_MASK) && + !(qp->attr.qp_access_flags & 
IB_ACCESS_REMOTE_WRITE)) || + ((pkt->mask & RXE_ATOMIC_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) { + return RESPST_ERR_UNSUPPORTED_OPCODE; + } + + break; + + case IB_QPT_UC: + if ((pkt->mask & RXE_WRITE_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) { + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + } + + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + break; + + default: + WARN_ON_ONCE(1); + break; + } + + return RESPST_CHK_RESOURCE; +} + +static enum resp_states get_srq_wqe(struct rxe_qp *qp) +{ + struct rxe_srq *srq = qp->srq; + struct rxe_queue *q = srq->rq.queue; + struct rxe_recv_wqe *wqe; + struct ib_event ev; + + if (srq->error) + return RESPST_ERR_RNR; + + spin_lock_bh(&srq->rq.consumer_lock); + + wqe = queue_head(q); + if (!wqe) { + spin_unlock_bh(&srq->rq.consumer_lock); + return RESPST_ERR_RNR; + } + + /* note kernel and user space recv wqes have same size */ + memcpy(&qp->resp.srq_wqe, wqe, sizeof(qp->resp.srq_wqe)); + + qp->resp.wqe = &qp->resp.srq_wqe.wqe; + advance_consumer(q); + + if (srq->limit && srq->ibsrq.event_handler && + (queue_count(q) < srq->limit)) { + srq->limit = 0; + goto event; + } + + spin_unlock_bh(&srq->rq.consumer_lock); + return RESPST_CHK_LENGTH; + +event: + spin_unlock_bh(&srq->rq.consumer_lock); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context); + return RESPST_CHK_LENGTH; +} + +static enum resp_states check_resource(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_srq *srq = qp->srq; + + if (qp->resp.state == QP_STATE_ERROR) { + if (qp->resp.wqe) { + qp->resp.status = IB_WC_WR_FLUSH_ERR; + return RESPST_COMPLETE; + } else if (!srq) { + qp->resp.wqe = queue_head(qp->rq.queue); + if (qp->resp.wqe) { + qp->resp.status = IB_WC_WR_FLUSH_ERR; + return RESPST_COMPLETE; + } else { + return RESPST_EXIT; + } + } else { + return RESPST_EXIT; + } + } + + if (pkt->mask & RXE_READ_OR_ATOMIC) { + /* it is the requesters job to not send + * too many read/atomic ops, we just + * recycle the responder resource queue + */ + if (likely(qp->attr.max_dest_rd_atomic > 0)) + return RESPST_CHK_LENGTH; + else + return RESPST_ERR_TOO_MANY_RDMA_ATM_REQ; + } + + if (pkt->mask & RXE_RWR_MASK) { + if (srq) + return get_srq_wqe(qp); + + qp->resp.wqe = queue_head(qp->rq.queue); + return (qp->resp.wqe) ? RESPST_CHK_LENGTH : RESPST_ERR_RNR; + } + + return RESPST_CHK_LENGTH; +} + +static enum resp_states check_length(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + return RESPST_CHK_RKEY; + + case IB_QPT_UC: + return RESPST_CHK_RKEY; + + default: + return RESPST_CHK_RKEY; + } +} + +static enum resp_states check_rkey(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_mem *mem = NULL; + u64 va; + u32 rkey; + u32 resid; + u32 pktlen; + int mtu = qp->mtu; + enum resp_states state; + int access; + + if (pkt->mask & (RXE_READ_MASK | RXE_WRITE_MASK)) { + if (pkt->mask & RXE_RETH_MASK) { + qp->resp.va = reth_va(pkt); + qp->resp.rkey = reth_rkey(pkt); + qp->resp.resid = reth_len(pkt); + } + access = (pkt->mask & RXE_READ_MASK) ? 
IB_ACCESS_REMOTE_READ + : IB_ACCESS_REMOTE_WRITE; + } else if (pkt->mask & RXE_ATOMIC_MASK) { + qp->resp.va = atmeth_va(pkt); + qp->resp.rkey = atmeth_rkey(pkt); + qp->resp.resid = sizeof(u64); + access = IB_ACCESS_REMOTE_ATOMIC; + } else { + return RESPST_EXECUTE; + } + + /* A zero-byte op is not required to set an addr or rkey. */ + if ((pkt->mask & (RXE_READ_MASK | RXE_WRITE_OR_SEND)) && + (pkt->mask & RXE_RETH_MASK) && + reth_len(pkt) == 0) { + return RESPST_EXECUTE; + } + + va = qp->resp.va; + rkey = qp->resp.rkey; + resid = qp->resp.resid; + pktlen = payload_size(pkt); + + mem = lookup_mem(qp->pd, access, rkey, lookup_remote); + if (!mem) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err; + } + + if (unlikely(mem->state == RXE_MEM_STATE_FREE)) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err; + } + + if (mem_check_range(mem, va, resid)) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err; + } + + if (pkt->mask & RXE_WRITE_MASK) { + if (resid > mtu) { + if (pktlen != mtu || bth_pad(pkt)) { + state = RESPST_ERR_LENGTH; + goto err; + } + } else { + if (pktlen != resid) { + state = RESPST_ERR_LENGTH; + goto err; + } + if ((bth_pad(pkt) != (0x3 & (-resid)))) { + /* This case may not be exactly that + * but nothing else fits. + */ + state = RESPST_ERR_LENGTH; + goto err; + } + } + } + + WARN_ON_ONCE(qp->resp.mr); + + qp->resp.mr = mem; + return RESPST_EXECUTE; + +err: + if (mem) + rxe_drop_ref(mem); + return state; +} + +static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr, + int data_len) +{ + int err; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + err = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma, + data_addr, data_len, to_mem_obj, NULL); + if (unlikely(err)) + return (err == -ENOSPC) ? RESPST_ERR_LENGTH + : RESPST_ERR_MALFORMED_WQE; + + return RESPST_NONE; +} + +static enum resp_states write_data_in(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + enum resp_states rc = RESPST_NONE; + int err; + int data_len = payload_size(pkt); + + err = rxe_mem_copy(qp->resp.mr, qp->resp.va, payload_addr(pkt), + data_len, to_mem_obj, NULL); + if (err) { + rc = RESPST_ERR_RKEY_VIOLATION; + goto out; + } + + qp->resp.va += data_len; + qp->resp.resid -= data_len; + +out: + return rc; +} + +/* Guarantee atomicity of atomic operations at the machine level. */ +static DEFINE_SPINLOCK(atomic_ops_lock); + +static enum resp_states process_atomic(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + u64 iova = atmeth_va(pkt); + u64 *vaddr; + enum resp_states ret; + struct rxe_mem *mr = qp->resp.mr; + + if (mr->state != RXE_MEM_STATE_VALID) { + ret = RESPST_ERR_RKEY_VIOLATION; + goto out; + } + + vaddr = iova_to_vaddr(mr, iova, sizeof(u64)); + + /* check vaddr is 8 bytes aligned. 
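/*
 * Illustrative sketch (editor's addition): the length and alignment checks
 * above rest on two small bit tricks -- rounding a payload up to a 4-byte
 * boundary with (-len) & 0x3, and testing 8-byte alignment of a pointer
 * with & 7. The helper names below are hypothetical.
 */
#include <stdint.h>
#include <stdbool.h>

static inline uint32_t pad_to_4(uint32_t len)
{
        /* bytes needed to round len up to a multiple of 4 (0..3),
         * same arithmetic as (-len) & 0x3 in the driver
         */
        return (0u - len) & 0x3u;
}

static inline bool is_8byte_aligned(const void *p)
{
        return ((uintptr_t)p & 7) == 0;
}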
*/ + if (!vaddr || (uintptr_t)vaddr & 7) { + ret = RESPST_ERR_MISALIGNED_ATOMIC; + goto out; + } + + spin_lock_bh(&atomic_ops_lock); + + qp->resp.atomic_orig = *vaddr; + + if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP || + pkt->opcode == IB_OPCODE_RD_COMPARE_SWAP) { + if (*vaddr == atmeth_comp(pkt)) + *vaddr = atmeth_swap_add(pkt); + } else { + *vaddr += atmeth_swap_add(pkt); + } + + spin_unlock_bh(&atomic_ops_lock); + + ret = RESPST_NONE; +out: + return ret; +} + +static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_pkt_info *ack, + int opcode, + int payload, + u32 psn, + u8 syndrome, + u32 *crcp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct sk_buff *skb; + u32 crc = 0; + u32 *p; + int paylen; + int pad; + int err; + + /* + * allocate packet + */ + pad = (-payload) & 0x3; + paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; + + skb = rxe_init_packet(rxe, &qp->pri_av, paylen, ack); + if (!skb) + return NULL; + + ack->qp = qp; + ack->opcode = opcode; + ack->mask = rxe_opcode[opcode].mask; + ack->offset = pkt->offset; + ack->paylen = paylen; + + /* fill in bth using the request packet headers */ + memcpy(ack->hdr, pkt->hdr, pkt->offset + RXE_BTH_BYTES); + + bth_set_opcode(ack, opcode); + bth_set_qpn(ack, qp->attr.dest_qp_num); + bth_set_pad(ack, pad); + bth_set_se(ack, 0); + bth_set_psn(ack, psn); + bth_set_ack(ack, 0); + ack->psn = psn; + + if (ack->mask & RXE_AETH_MASK) { + aeth_set_syn(ack, syndrome); + aeth_set_msn(ack, qp->resp.msn); + } + + if (ack->mask & RXE_ATMACK_MASK) + atmack_set_orig(ack, qp->resp.atomic_orig); + + err = rxe_prepare(rxe, ack, skb, &crc); + if (err) { + kfree_skb(skb); + return NULL; + } + + if (crcp) { + /* CRC computation will be continued by the caller */ + *crcp = crc; + } else { + p = payload_addr(ack) + payload + bth_pad(ack); + *p = ~crc; + } + + return skb; +} + +/* RDMA read response. If res is not NULL, then we have a current RDMA request + * being processed or replayed. + */ +static enum resp_states read_reply(struct rxe_qp *qp, + struct rxe_pkt_info *req_pkt) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + int mtu = qp->mtu; + enum resp_states state; + int payload; + int opcode; + int err; + struct resp_res *res = qp->resp.res; + u32 icrc; + u32 *p; + + if (!res) { + /* This is the first time we process that request. 
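/*
 * Illustrative sketch (editor's addition): process_atomic() above emulates
 * the two RC atomic operations in software, serialized by a single lock.
 * This is a userspace analogue using a pthread mutex where the kernel code
 * uses the global spinlock atomic_ops_lock; function names are hypothetical.
 */
#include <stdint.h>
#include <pthread.h>

static pthread_mutex_t atomic_lock = PTHREAD_MUTEX_INITIALIZER;

/* returns the original value, as carried back in the ATOMIC ACK */
static uint64_t do_cmp_swap(uint64_t *target, uint64_t compare, uint64_t swap)
{
        uint64_t orig;

        pthread_mutex_lock(&atomic_lock);
        orig = *target;
        if (orig == compare)
                *target = swap;
        pthread_mutex_unlock(&atomic_lock);

        return orig;
}

static uint64_t do_fetch_add(uint64_t *target, uint64_t add)
{
        uint64_t orig;

        pthread_mutex_lock(&atomic_lock);
        orig = *target;
        *target += add;
        pthread_mutex_unlock(&atomic_lock);

        return orig;
}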
Get a + * resource + */ + res = &qp->resp.resources[qp->resp.res_head]; + + free_rd_atomic_resource(qp, res); + rxe_advance_resp_resource(qp); + + res->type = RXE_READ_MASK; + + res->read.va = qp->resp.va; + res->read.va_org = qp->resp.va; + + res->first_psn = req_pkt->psn; + + if (reth_len(req_pkt)) { + res->last_psn = (req_pkt->psn + + (reth_len(req_pkt) + mtu - 1) / + mtu - 1) & BTH_PSN_MASK; + } else { + res->last_psn = res->first_psn; + } + res->cur_psn = req_pkt->psn; + + res->read.resid = qp->resp.resid; + res->read.length = qp->resp.resid; + res->read.rkey = qp->resp.rkey; + + /* note res inherits the reference to mr from qp */ + res->read.mr = qp->resp.mr; + qp->resp.mr = NULL; + + qp->resp.res = res; + res->state = rdatm_res_state_new; + } + + if (res->state == rdatm_res_state_new) { + if (res->read.resid <= mtu) + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY; + else + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; + } else { + if (res->read.resid > mtu) + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE; + else + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST; + } + + res->state = rdatm_res_state_next; + + payload = min_t(int, res->read.resid, mtu); + + skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload, + res->cur_psn, AETH_ACK_UNLIMITED, &icrc); + if (!skb) + return RESPST_ERR_RNR; + + err = rxe_mem_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt), + payload, from_mem_obj, &icrc); + if (err) + pr_err("Failed copying memory\n"); + + p = payload_addr(&ack_pkt) + payload + bth_pad(&ack_pkt); + *p = ~icrc; + + err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb); + if (err) { + pr_err("Failed sending RDMA reply.\n"); + return RESPST_ERR_RNR; + } + + res->read.va += payload; + res->read.resid -= payload; + res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK; + + if (res->read.resid > 0) { + state = RESPST_DONE; + } else { + qp->resp.res = NULL; + qp->resp.opcode = -1; + if (psn_compare(res->cur_psn, qp->resp.psn) >= 0) + qp->resp.psn = res->cur_psn; + state = RESPST_CLEANUP; + } + + return state; +} + +static void build_rdma_network_hdr(union rdma_network_hdr *hdr, + struct rxe_pkt_info *pkt) +{ + struct sk_buff *skb = PKT_TO_SKB(pkt); + + memset(hdr, 0, sizeof(*hdr)); + if (skb->protocol == htons(ETH_P_IP)) + memcpy(&hdr->roce4grh, ip_hdr(skb), sizeof(hdr->roce4grh)); + else if (skb->protocol == htons(ETH_P_IPV6)) + memcpy(&hdr->ibgrh, ipv6_hdr(skb), sizeof(hdr->ibgrh)); +} + +/* Executes a new request. A retried request never reach that function (send + * and writes are discarded, and reads and atomics are retried elsewhere. + */ +static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + enum resp_states err; + + if (pkt->mask & RXE_SEND_MASK) { + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) { + union rdma_network_hdr hdr; + + build_rdma_network_hdr(&hdr, pkt); + + err = send_data_in(qp, &hdr, sizeof(hdr)); + if (err) + return err; + } + err = send_data_in(qp, payload_addr(pkt), payload_size(pkt)); + if (err) + return err; + } else if (pkt->mask & RXE_WRITE_MASK) { + err = write_data_in(qp, pkt); + if (err) + return err; + } else if (pkt->mask & RXE_READ_MASK) { + /* For RDMA Read we can increment the msn now. See C9-148. 
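/*
 * Illustrative sketch (editor's addition): read_reply() above picks the RDMA
 * READ response opcode from two facts -- whether this is the first reply
 * packet for the request, and whether the remaining length still exceeds one
 * MTU. The enum and helper below are a hypothetical simplification of that
 * decision.
 */
enum read_rsp_kind { RSP_ONLY, RSP_FIRST, RSP_MIDDLE, RSP_LAST };

static enum read_rsp_kind read_rsp_opcode(int first_pkt, unsigned int resid,
                                          unsigned int mtu)
{
        if (first_pkt)
                return (resid <= mtu) ? RSP_ONLY : RSP_FIRST;

        return (resid > mtu) ? RSP_MIDDLE : RSP_LAST;
}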
*/ + qp->resp.msn++; + return RESPST_READ_REPLY; + } else if (pkt->mask & RXE_ATOMIC_MASK) { + err = process_atomic(qp, pkt); + if (err) + return err; + } else { + /* Unreachable */ + WARN_ON_ONCE(1); + } + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + if (pkt->mask & RXE_COMP_MASK) { + /* We successfully processed this new request. */ + qp->resp.msn++; + return RESPST_COMPLETE; + } else if (qp_type(qp) == IB_QPT_RC) + return RESPST_ACKNOWLEDGE; + else + return RESPST_CLEANUP; +} + +static enum resp_states do_complete(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_cqe cqe; + struct ib_wc *wc = &cqe.ibwc; + struct ib_uverbs_wc *uwc = &cqe.uibwc; + struct rxe_recv_wqe *wqe = qp->resp.wqe; + + if (unlikely(!wqe)) + return RESPST_CLEANUP; + + memset(&cqe, 0, sizeof(cqe)); + + wc->wr_id = wqe->wr_id; + wc->status = qp->resp.status; + wc->qp = &qp->ibqp; + + /* fields after status are not required for errors */ + if (wc->status == IB_WC_SUCCESS) { + wc->opcode = (pkt->mask & RXE_IMMDT_MASK && + pkt->mask & RXE_WRITE_MASK) ? + IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; + wc->vendor_err = 0; + wc->byte_len = wqe->dma.length - wqe->dma.resid; + + /* fields after byte_len are different between kernel and user + * space + */ + if (qp->rcq->is_user) { + uwc->wc_flags = IB_WC_GRH; + + if (pkt->mask & RXE_IMMDT_MASK) { + uwc->wc_flags |= IB_WC_WITH_IMM; + uwc->ex.imm_data = immdt_imm(pkt); + } + + if (pkt->mask & RXE_IETH_MASK) { + uwc->wc_flags |= IB_WC_WITH_INVALIDATE; + uwc->ex.invalidate_rkey = ieth_rkey(pkt); + } + + uwc->qp_num = qp->ibqp.qp_num; + + if (pkt->mask & RXE_DETH_MASK) + uwc->src_qp = deth_sqp(pkt); + + uwc->port_num = qp->attr.port_num; + } else { + struct sk_buff *skb = PKT_TO_SKB(pkt); + + wc->wc_flags = IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE; + if (skb->protocol == htons(ETH_P_IP)) + wc->network_hdr_type = RDMA_NETWORK_IPV4; + else + wc->network_hdr_type = RDMA_NETWORK_IPV6; + + if (pkt->mask & RXE_IMMDT_MASK) { + wc->wc_flags |= IB_WC_WITH_IMM; + wc->ex.imm_data = immdt_imm(pkt); + } + + if (pkt->mask & RXE_IETH_MASK) { + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_mem *rmr; + + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = ieth_rkey(pkt); + + rmr = rxe_pool_get_index(&rxe->mr_pool, + wc->ex.invalidate_rkey >> 8); + if (unlikely(!rmr)) { + pr_err("Bad rkey %#x invalidation\n", + wc->ex.invalidate_rkey); + return RESPST_ERROR; + } + rmr->state = RXE_MEM_STATE_FREE; + rxe_drop_ref(rmr); + } + + wc->qp = &qp->ibqp; + + if (pkt->mask & RXE_DETH_MASK) + wc->src_qp = deth_sqp(pkt); + + wc->port_num = qp->attr.port_num; + } + } + + /* have copy for srq and reference for !srq */ + if (!qp->srq) + advance_consumer(qp->rq.queue); + + qp->resp.wqe = NULL; + + if (rxe_cq_post(qp->rcq, &cqe, pkt ? 
bth_se(pkt) : 1)) + return RESPST_ERR_CQ_OVERFLOW; + + if (qp->resp.state == QP_STATE_ERROR) + return RESPST_CHK_RESOURCE; + + if (!pkt) + return RESPST_DONE; + else if (qp_type(qp) == IB_QPT_RC) + return RESPST_ACKNOWLEDGE; + else + return RESPST_CLEANUP; +} + +static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + u8 syndrome, u32 psn) +{ + int err = 0; + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE, + 0, psn, syndrome, NULL); + if (!skb) { + err = -ENOMEM; + goto err1; + } + + err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb); + if (err) + pr_err_ratelimited("Failed sending ack\n"); + +err1: + return err; +} + +static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + u8 syndrome) +{ + int rc = 0; + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct resp_res *res; + + skb = prepare_ack_packet(qp, pkt, &ack_pkt, + IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn, + syndrome, NULL); + if (!skb) { + rc = -ENOMEM; + goto out; + } + + rxe_add_ref(qp); + + res = &qp->resp.resources[qp->resp.res_head]; + free_rd_atomic_resource(qp, res); + rxe_advance_resp_resource(qp); + + memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(ack_pkt)); + memset((unsigned char *)SKB_TO_PKT(skb) + sizeof(ack_pkt), 0, + sizeof(skb->cb) - sizeof(ack_pkt)); + + refcount_inc(&skb->users); + res->type = RXE_ATOMIC_MASK; + res->atomic.skb = skb; + res->first_psn = ack_pkt.psn; + res->last_psn = ack_pkt.psn; + res->cur_psn = ack_pkt.psn; + + rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb); + if (rc) { + pr_err_ratelimited("Failed sending ack\n"); + rxe_drop_ref(qp); + } +out: + return rc; +} + +static enum resp_states acknowledge(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + if (qp_type(qp) != IB_QPT_RC) + return RESPST_CLEANUP; + + if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED) + send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn); + else if (pkt->mask & RXE_ATOMIC_MASK) + send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED); + else if (bth_ack(pkt)) + send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn); + + return RESPST_CLEANUP; +} + +static enum resp_states cleanup(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct sk_buff *skb; + + if (pkt) { + skb = skb_dequeue(&qp->req_pkts); + rxe_drop_ref(qp); + kfree_skb(skb); + } + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + return RESPST_DONE; +} + +static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn) +{ + int i; + + for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { + struct resp_res *res = &qp->resp.resources[i]; + + if (res->type == 0) + continue; + + if (psn_compare(psn, res->first_psn) >= 0 && + psn_compare(psn, res->last_psn) <= 0) { + return res; + } + } + + return NULL; +} + +static enum resp_states duplicate_request(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + enum resp_states rc; + u32 prev_psn = (qp->resp.psn - 1) & BTH_PSN_MASK; + + if (pkt->mask & RXE_SEND_MASK || + pkt->mask & RXE_WRITE_MASK) { + /* SEND. Ack again and cleanup. C9-105. */ + if (bth_ack(pkt)) + send_ack(qp, pkt, AETH_ACK_UNLIMITED, prev_psn); + rc = RESPST_CLEANUP; + goto out; + } else if (pkt->mask & RXE_READ_MASK) { + struct resp_res *res; + + res = find_resource(qp, pkt->psn); + if (!res) { + /* Resource not found. Class D error. Drop the + * request. 
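/*
 * Illustrative sketch (editor's addition): find_resource() above matches a
 * duplicate PSN against the inclusive range [first_psn, last_psn] with a
 * wraparound-aware comparison. One common way to compare 24-bit sequence
 * numbers is to shift the difference into the top 24 bits of a signed 32-bit
 * value and look at its sign; the sketch below assumes that is what
 * psn_compare() does, and the helper names are hypothetical.
 */
#include <stdint.h>
#include <stdbool.h>

static inline int32_t psn_cmp(uint32_t a, uint32_t b)
{
        /* negative: a < b, zero: equal, positive: a > b (modulo 2^24) */
        return (int32_t)((a - b) << 8);
}

static inline bool psn_in_range(uint32_t psn, uint32_t first, uint32_t last)
{
        return psn_cmp(psn, first) >= 0 && psn_cmp(psn, last) <= 0;
}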
+ */ + rc = RESPST_CLEANUP; + goto out; + } else { + /* Ensure this new request is the same as the previous + * one or a subset of it. + */ + u64 iova = reth_va(pkt); + u32 resid = reth_len(pkt); + + if (iova < res->read.va_org || + resid > res->read.length || + (iova + resid) > (res->read.va_org + + res->read.length)) { + rc = RESPST_CLEANUP; + goto out; + } + + if (reth_rkey(pkt) != res->read.rkey) { + rc = RESPST_CLEANUP; + goto out; + } + + res->cur_psn = pkt->psn; + res->state = (pkt->psn == res->first_psn) ? + rdatm_res_state_new : + rdatm_res_state_replay; + + /* Reset the resource, except length. */ + res->read.va_org = iova; + res->read.va = iova; + res->read.resid = resid; + + /* Replay the RDMA read reply. */ + qp->resp.res = res; + rc = RESPST_READ_REPLY; + goto out; + } + } else { + struct resp_res *res; + + /* Find the operation in our list of responder resources. */ + res = find_resource(qp, pkt->psn); + if (res) { + struct sk_buff *skb_copy; + + skb_copy = skb_clone(res->atomic.skb, GFP_ATOMIC); + if (skb_copy) { + rxe_add_ref(qp); /* for the new SKB */ + } else { + pr_warn("Couldn't clone atomic resp\n"); + rc = RESPST_CLEANUP; + goto out; + } + + /* Resend the result. */ + rc = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, + pkt, skb_copy); + if (rc) { + pr_err("Failed resending result. This flow is not handled - skb ignored\n"); + rxe_drop_ref(qp); + rc = RESPST_CLEANUP; + goto out; + } + } + + /* Resource not found. Class D error. Drop the request. */ + rc = RESPST_CLEANUP; + goto out; + } +out: + return rc; +} + +/* Process a class A or C. Both are treated the same in this implementation. */ +static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome, + enum ib_wc_status status) +{ + qp->resp.aeth_syndrome = syndrome; + qp->resp.status = status; + + /* indicate that we should go through the ERROR state */ + qp->resp.goto_error = 1; +} + +static enum resp_states do_class_d1e_error(struct rxe_qp *qp) +{ + /* UC */ + if (qp->srq) { + /* Class E */ + qp->resp.drop_msg = 1; + if (qp->resp.wqe) { + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + return RESPST_COMPLETE; + } else { + return RESPST_CLEANUP; + } + } else { + /* Class D1. This packet may be the start of a + * new message and could be valid. The previous + * message is invalid and ignored. 
reset the + * recv wr to its original state + */ + if (qp->resp.wqe) { + qp->resp.wqe->dma.resid = qp->resp.wqe->dma.length; + qp->resp.wqe->dma.cur_sge = 0; + qp->resp.wqe->dma.sge_offset = 0; + qp->resp.opcode = -1; + } + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + return RESPST_CLEANUP; + } +} + +static void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&qp->req_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + + if (notify) + return; + + while (!qp->srq && qp->rq.queue && queue_head(qp->rq.queue)) + advance_consumer(qp->rq.queue); +} + +int rxe_responder(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + enum resp_states state; + struct rxe_pkt_info *pkt = NULL; + int ret = 0; + + rxe_add_ref(qp); + + qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; + + if (!qp->valid) { + ret = -EINVAL; + goto done; + } + + switch (qp->resp.state) { + case QP_STATE_RESET: + state = RESPST_RESET; + break; + + default: + state = RESPST_GET_REQ; + break; + } + + while (1) { + pr_debug("qp#%d state = %s\n", qp_num(qp), + resp_state_name[state]); + switch (state) { + case RESPST_GET_REQ: + state = get_req(qp, &pkt); + break; + case RESPST_CHK_PSN: + state = check_psn(qp, pkt); + break; + case RESPST_CHK_OP_SEQ: + state = check_op_seq(qp, pkt); + break; + case RESPST_CHK_OP_VALID: + state = check_op_valid(qp, pkt); + break; + case RESPST_CHK_RESOURCE: + state = check_resource(qp, pkt); + break; + case RESPST_CHK_LENGTH: + state = check_length(qp, pkt); + break; + case RESPST_CHK_RKEY: + state = check_rkey(qp, pkt); + break; + case RESPST_EXECUTE: + state = execute(qp, pkt); + break; + case RESPST_COMPLETE: + state = do_complete(qp, pkt); + break; + case RESPST_READ_REPLY: + state = read_reply(qp, pkt); + break; + case RESPST_ACKNOWLEDGE: + state = acknowledge(qp, pkt); + break; + case RESPST_CLEANUP: + state = cleanup(qp, pkt); + break; + case RESPST_DUPLICATE_REQUEST: + state = duplicate_request(qp, pkt); + break; + case RESPST_ERR_PSN_OUT_OF_SEQ: + /* RC only - Class B. Drop packet. */ + send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); + state = RESPST_CLEANUP; + break; + + case RESPST_ERR_TOO_MANY_RDMA_ATM_REQ: + case RESPST_ERR_MISSING_OPCODE_FIRST: + case RESPST_ERR_MISSING_OPCODE_LAST_C: + case RESPST_ERR_UNSUPPORTED_OPCODE: + case RESPST_ERR_MISALIGNED_ATOMIC: + /* RC Only - Class C. */ + do_class_ac_error(qp, AETH_NAK_INVALID_REQ, + IB_WC_REM_INV_REQ_ERR); + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_MISSING_OPCODE_LAST_D1E: + state = do_class_d1e_error(qp); + break; + case RESPST_ERR_RNR: + if (qp_type(qp) == IB_QPT_RC) { + rxe_counter_inc(rxe, RXE_CNT_SND_RNR); + /* RC - class B */ + send_ack(qp, pkt, AETH_RNR_NAK | + (~AETH_TYPE_MASK & + qp->attr.min_rnr_timer), + pkt->psn); + } else { + /* UD/UC - class D */ + qp->resp.drop_msg = 1; + } + state = RESPST_CLEANUP; + break; + + case RESPST_ERR_RKEY_VIOLATION: + if (qp_type(qp) == IB_QPT_RC) { + /* Class C */ + do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR, + IB_WC_REM_ACCESS_ERR); + state = RESPST_COMPLETE; + } else { + qp->resp.drop_msg = 1; + if (qp->srq) { + /* UC/SRQ Class D */ + qp->resp.status = IB_WC_REM_ACCESS_ERR; + state = RESPST_COMPLETE; + } else { + /* UC/non-SRQ Class E. 
*/ + state = RESPST_CLEANUP; + } + } + break; + + case RESPST_ERR_LENGTH: + if (qp_type(qp) == IB_QPT_RC) { + /* Class C */ + do_class_ac_error(qp, AETH_NAK_INVALID_REQ, + IB_WC_REM_INV_REQ_ERR); + state = RESPST_COMPLETE; + } else if (qp->srq) { + /* UC/UD - class E */ + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + state = RESPST_COMPLETE; + } else { + /* UC/UD - class D */ + qp->resp.drop_msg = 1; + state = RESPST_CLEANUP; + } + break; + + case RESPST_ERR_MALFORMED_WQE: + /* All, Class A. */ + do_class_ac_error(qp, AETH_NAK_REM_OP_ERR, + IB_WC_LOC_QP_OP_ERR); + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_CQ_OVERFLOW: + /* All - Class G */ + state = RESPST_ERROR; + break; + + case RESPST_DONE: + if (qp->resp.goto_error) { + state = RESPST_ERROR; + break; + } + + goto done; + + case RESPST_EXIT: + if (qp->resp.goto_error) { + state = RESPST_ERROR; + break; + } + + goto exit; + + case RESPST_RESET: + rxe_drain_req_pkts(qp, false); + qp->resp.wqe = NULL; + goto exit; + + case RESPST_ERROR: + qp->resp.goto_error = 0; + pr_warn("qp#%d moved to error state\n", qp_num(qp)); + rxe_qp_error(qp); + goto exit; + + default: + WARN_ON_ONCE(1); + } + } + +exit: + ret = -EAGAIN; +done: + rxe_drop_ref(qp); + return ret; +} diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c new file mode 100644 index 0000000..0d6c04b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
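/*
 * Illustrative sketch (editor's addition): rxe_responder() above is a
 * run-to-completion state machine -- an enum of states, a loop, and a switch
 * in which every handler returns the next state until a terminal state is
 * reached. A stripped-down version of that shape, with hypothetical state
 * and handler names:
 */
enum st { ST_GET, ST_CHECK, ST_EXEC, ST_DONE, ST_EXIT };

static enum st do_get(void *ctx)   { (void)ctx; return ST_CHECK; }
static enum st do_check(void *ctx) { (void)ctx; return ST_EXEC; }
static enum st do_exec(void *ctx)  { (void)ctx; return ST_DONE; }

static int run_fsm(void *ctx)
{
        enum st state = ST_GET;

        while (1) {
                switch (state) {
                case ST_GET:   state = do_get(ctx);   break;
                case ST_CHECK: state = do_check(ctx); break;
                case ST_EXEC:  state = do_exec(ctx);  break;
                case ST_DONE:  return 0;
                case ST_EXIT:  return -1;
                }
        }
}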
+ */ + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask) +{ + if (srq && srq->error) { + pr_warn("srq in error state\n"); + goto err1; + } + + if (mask & IB_SRQ_MAX_WR) { + if (attr->max_wr > rxe->attr.max_srq_wr) { + pr_warn("max_wr(%d) > max_srq_wr(%d)\n", + attr->max_wr, rxe->attr.max_srq_wr); + goto err1; + } + + if (attr->max_wr <= 0) { + pr_warn("max_wr(%d) <= 0\n", attr->max_wr); + goto err1; + } + + if (srq && srq->limit && (attr->max_wr < srq->limit)) { + pr_warn("max_wr (%d) < srq->limit (%d)\n", + attr->max_wr, srq->limit); + goto err1; + } + + if (attr->max_wr < RXE_MIN_SRQ_WR) + attr->max_wr = RXE_MIN_SRQ_WR; + } + + if (mask & IB_SRQ_LIMIT) { + if (attr->srq_limit > rxe->attr.max_srq_wr) { + pr_warn("srq_limit(%d) > max_srq_wr(%d)\n", + attr->srq_limit, rxe->attr.max_srq_wr); + goto err1; + } + + if (srq && (attr->srq_limit > srq->rq.queue->buf->index_mask)) { + pr_warn("srq_limit (%d) > cur limit(%d)\n", + attr->srq_limit, + srq->rq.queue->buf->index_mask); + goto err1; + } + } + + if (mask == IB_SRQ_INIT_MASK) { + if (attr->max_sge > rxe->attr.max_srq_sge) { + pr_warn("max_sge(%d) > max_srq_sge(%d)\n", + attr->max_sge, rxe->attr.max_srq_sge); + goto err1; + } + + if (attr->max_sge < RXE_MIN_SRQ_SGE) + attr->max_sge = RXE_MIN_SRQ_SGE; + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_init_attr *init, + struct ib_ucontext *context, + struct rxe_create_srq_resp __user *uresp) +{ + int err; + int srq_wqe_size; + struct rxe_queue *q; + + srq->ibsrq.event_handler = init->event_handler; + srq->ibsrq.srq_context = init->srq_context; + srq->limit = init->attr.srq_limit; + srq->srq_num = srq->pelem.index; + srq->rq.max_wr = init->attr.max_wr; + srq->rq.max_sge = init->attr.max_sge; + + srq_wqe_size = rcv_wqe_size(srq->rq.max_sge); + + spin_lock_init(&srq->rq.producer_lock); + spin_lock_init(&srq->rq.consumer_lock); + + q = rxe_queue_init(rxe, &srq->rq.max_wr, + srq_wqe_size); + if (!q) { + pr_warn("unable to allocate queue for srq\n"); + return -ENOMEM; + } + + srq->rq.queue = q; + + err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, context, q->buf, + q->buf_size, &q->ip); + if (err) + return err; + + if (uresp) { + if (copy_to_user(&uresp->srq_num, &srq->srq_num, + sizeof(uresp->srq_num))) + return -EFAULT; + } + + return 0; +} + +int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, + struct rxe_modify_srq_cmd *ucmd) +{ + int err; + struct rxe_queue *q = srq->rq.queue; + struct mminfo __user *mi = NULL; + + if (mask & IB_SRQ_MAX_WR) { + /* + * This is completely screwed up, the response is supposed to + * be in the outbuf not like this. + */ + mi = u64_to_user_ptr(ucmd->mmap_info_addr); + + err = rxe_queue_resize(q, &attr->max_wr, + rcv_wqe_size(srq->rq.max_sge), + srq->rq.queue->ip ? 
+ srq->rq.queue->ip->context : + NULL, + mi, &srq->rq.producer_lock, + &srq->rq.consumer_lock); + if (err) + goto err2; + } + + if (mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + + return 0; + +err2: + rxe_queue_cleanup(q); + srq->rq.queue = NULL; + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c new file mode 100644 index 0000000..d5ed757 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rxe.h" +#include "rxe_net.h" + +/* Copy argument and remove trailing CR. Return the new length. */ +static int sanitize_arg(const char *val, char *intf, int intf_len) +{ + int len; + + if (!val) + return 0; + + /* Remove newline. 
*/ + for (len = 0; len < intf_len - 1 && val[len] && val[len] != '\n'; len++) + intf[len] = val[len]; + intf[len] = 0; + + if (len == 0 || (val[len] != 0 && val[len] != '\n')) + return 0; + + return len; +} + +static void rxe_set_port_state(struct net_device *ndev) +{ + struct rxe_dev *rxe = net_to_rxe(ndev); + bool is_up = netif_running(ndev) && netif_carrier_ok(ndev); + + if (!rxe) + goto out; + + if (is_up) + rxe_port_up(rxe); + else + rxe_port_down(rxe); /* down for unknown state */ +out: + return; +} + +static int rxe_param_set_add(const char *val, const struct kernel_param *kp) +{ + int len; + int err = 0; + char intf[32]; + struct net_device *ndev = NULL; + struct rxe_dev *rxe; + + len = sanitize_arg(val, intf, sizeof(intf)); + if (!len) { + pr_err("add: invalid interface name\n"); + err = -EINVAL; + goto err; + } + + ndev = dev_get_by_name(&init_net, intf); + if (!ndev) { + pr_err("interface %s not found\n", intf); + err = -EINVAL; + goto err; + } + + if (net_to_rxe(ndev)) { + pr_err("already configured on %s\n", intf); + err = -EINVAL; + goto err; + } + + rxe = rxe_net_add(ndev); + if (!rxe) { + pr_err("failed to add %s\n", intf); + err = -EINVAL; + goto err; + } + + rxe_set_port_state(ndev); + pr_info("added %s to %s\n", rxe->ib_dev.name, intf); +err: + if (ndev) + dev_put(ndev); + return err; +} + +static int rxe_param_set_remove(const char *val, const struct kernel_param *kp) +{ + int len; + char intf[32]; + struct rxe_dev *rxe; + + len = sanitize_arg(val, intf, sizeof(intf)); + if (!len) { + pr_err("add: invalid interface name\n"); + return -EINVAL; + } + + if (strncmp("all", intf, len) == 0) { + pr_info("rxe_sys: remove all"); + rxe_remove_all(); + return 0; + } + + rxe = get_rxe_by_name(intf); + + if (!rxe) { + pr_err("not configured on %s\n", intf); + return -EINVAL; + } + + list_del(&rxe->list); + rxe_remove(rxe); + + return 0; +} + +static const struct kernel_param_ops rxe_add_ops = { + .set = rxe_param_set_add, +}; + +static const struct kernel_param_ops rxe_remove_ops = { + .set = rxe_param_set_remove, +}; + +module_param_cb(add, &rxe_add_ops, NULL, 0200); +MODULE_PARM_DESC(add, "Create RXE device over network interface"); +module_param_cb(remove, &rxe_remove_ops, NULL, 0200); +MODULE_PARM_DESC(remove, "Remove RXE device over network interface"); diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c new file mode 100644 index 0000000..08f05ac --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "rxe_task.h" + +int __rxe_do_task(struct rxe_task *task) + +{ + int ret; + + while ((ret = task->func(task->arg)) == 0) + ; + + task->ret = ret; + + return ret; +} + +/* + * this locking is due to a potential race where + * a second caller finds the task already running + * but looks just after the last call to func + */ +void rxe_do_task(unsigned long data) +{ + int cont; + int ret; + unsigned long flags; + struct rxe_task *task = (struct rxe_task *)data; + + spin_lock_irqsave(&task->state_lock, flags); + switch (task->state) { + case TASK_STATE_START: + task->state = TASK_STATE_BUSY; + spin_unlock_irqrestore(&task->state_lock, flags); + break; + + case TASK_STATE_BUSY: + task->state = TASK_STATE_ARMED; + /* fall through */ + case TASK_STATE_ARMED: + spin_unlock_irqrestore(&task->state_lock, flags); + return; + + default: + spin_unlock_irqrestore(&task->state_lock, flags); + pr_warn("%s failed with bad state %d\n", __func__, task->state); + return; + } + + do { + cont = 0; + ret = task->func(task->arg); + + spin_lock_irqsave(&task->state_lock, flags); + switch (task->state) { + case TASK_STATE_BUSY: + if (ret) + task->state = TASK_STATE_START; + else + cont = 1; + break; + + /* soneone tried to run the task since the last time we called + * func, so we will call one more time regardless of the + * return value + */ + case TASK_STATE_ARMED: + task->state = TASK_STATE_BUSY; + cont = 1; + break; + + default: + pr_warn("%s failed with bad state %d\n", __func__, + task->state); + } + spin_unlock_irqrestore(&task->state_lock, flags); + } while (cont); + + task->ret = ret; +} + +int rxe_init_task(void *obj, struct rxe_task *task, + void *arg, int (*func)(void *), char *name) +{ + task->obj = obj; + task->arg = arg; + task->func = func; + snprintf(task->name, sizeof(task->name), "%s", name); + task->destroyed = false; + + tasklet_init(&task->tasklet, rxe_do_task, (unsigned long)task); + + task->state = TASK_STATE_START; + spin_lock_init(&task->state_lock); + + return 0; +} + +void rxe_cleanup_task(struct rxe_task *task) +{ + unsigned long flags; + bool idle; + + /* + * Mark the task, then wait for it to finish. It might be + * running in a non-tasklet (direct call) context. 
+ */ + task->destroyed = true; + + do { + spin_lock_irqsave(&task->state_lock, flags); + idle = (task->state == TASK_STATE_START); + spin_unlock_irqrestore(&task->state_lock, flags); + } while (!idle); + + tasklet_kill(&task->tasklet); +} + +void rxe_run_task(struct rxe_task *task, int sched) +{ + if (task->destroyed) + return; + + if (sched) + tasklet_schedule(&task->tasklet); + else + rxe_do_task((unsigned long)task); +} + +void rxe_disable_task(struct rxe_task *task) +{ + tasklet_disable(&task->tasklet); +} + +void rxe_enable_task(struct rxe_task *task) +{ + tasklet_enable(&task->tasklet); +} diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h new file mode 100644 index 0000000..08ff42d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_TASK_H +#define RXE_TASK_H + +enum { + TASK_STATE_START = 0, + TASK_STATE_BUSY = 1, + TASK_STATE_ARMED = 2, +}; + +/* + * data structure to describe a 'task' which is a short + * function that returns 0 as long as it needs to be + * called again. 
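/*
 * Illustrative sketch (editor's addition): the START/BUSY/ARMED states above
 * implement a "call me again" handshake. If the task is scheduled while it
 * is already running, the state is bumped to ARMED and the running instance
 * performs one extra pass instead of the new work being lost. A simplified
 * userspace analogue, with a pthread mutex in place of the spinlock and
 * hypothetical names (the lock must be initialized before use):
 */
#include <pthread.h>

enum { T_START, T_BUSY, T_ARMED };

struct mini_task {
        pthread_mutex_t lock;           /* init with pthread_mutex_init() */
        int state;
        int (*func)(void *arg);
        void *arg;
};

static void mini_task_run(struct mini_task *t)
{
        int cont;

        pthread_mutex_lock(&t->lock);
        if (t->state != T_START) {
                /* already running: ask the running instance to loop again */
                t->state = T_ARMED;
                pthread_mutex_unlock(&t->lock);
                return;
        }
        t->state = T_BUSY;
        pthread_mutex_unlock(&t->lock);

        do {
                cont = 0;
                t->func(t->arg);

                pthread_mutex_lock(&t->lock);
                if (t->state == T_ARMED) {
                        t->state = T_BUSY;      /* poked while busy: one more pass */
                        cont = 1;
                } else {
                        t->state = T_START;     /* idle again */
                }
                pthread_mutex_unlock(&t->lock);
        } while (cont);
}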
+ */ +struct rxe_task { + void *obj; + struct tasklet_struct tasklet; + int state; + spinlock_t state_lock; /* spinlock for task state */ + void *arg; + int (*func)(void *arg); + int ret; + char name[16]; + bool destroyed; +}; + +/* + * init rxe_task structure + * arg => parameter to pass to fcn + * fcn => function to call until it returns != 0 + */ +int rxe_init_task(void *obj, struct rxe_task *task, + void *arg, int (*func)(void *), char *name); + +/* cleanup task */ +void rxe_cleanup_task(struct rxe_task *task); + +/* + * raw call to func in loop without any checking + * can call when tasklets are disabled + */ +int __rxe_do_task(struct rxe_task *task); + +/* + * common function called by any of the main tasklets + * If there is any chance that there is additional + * work to do someone must reschedule the task before + * leaving + */ +void rxe_do_task(unsigned long data); + +/* run a task, else schedule it to run as a tasklet, The decision + * to run or schedule tasklet is based on the parameter sched. + */ +void rxe_run_task(struct rxe_task *task, int sched); + +/* keep a task from scheduling */ +void rxe_disable_task(struct rxe_task *task); + +/* allow task to run */ +void rxe_enable_task(struct rxe_task *task); + +#endif /* RXE_TASK_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c new file mode 100644 index 0000000..73a00a1 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -0,0 +1,1328 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_hw_counters.h" + +static int rxe_query_device(struct ib_device *dev, + struct ib_device_attr *attr, + struct ib_udata *uhw) +{ + struct rxe_dev *rxe = to_rdev(dev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + *attr = rxe->attr; + return 0; +} + +static int rxe_query_port(struct ib_device *dev, + u8 port_num, struct ib_port_attr *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_port *port; + int rc = -EINVAL; + + if (unlikely(port_num != 1)) { + pr_warn("invalid port_number %d\n", port_num); + goto out; + } + + port = &rxe->port; + + /* *attr being zeroed by the caller, avoid zeroing it here */ + *attr = port->attr; + + mutex_lock(&rxe->usdev_lock); + rc = ib_get_eth_speed(dev, port_num, &attr->active_speed, + &attr->active_width); + mutex_unlock(&rxe->usdev_lock); + +out: + return rc; +} + +static struct net_device *rxe_get_netdev(struct ib_device *device, + u8 port_num) +{ + struct rxe_dev *rxe = to_rdev(device); + + if (rxe->ndev) { + dev_hold(rxe->ndev); + return rxe->ndev; + } + + return NULL; +} + +static int rxe_query_pkey(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey) +{ + struct rxe_dev *rxe = to_rdev(device); + struct rxe_port *port; + + if (unlikely(port_num != 1)) { + dev_warn(device->dev.parent, "invalid port_num = %d\n", + port_num); + goto err1; + } + + port = &rxe->port; + + if (unlikely(index >= port->attr.pkey_tbl_len)) { + dev_warn(device->dev.parent, "invalid index = %d\n", + index); + goto err1; + } + + *pkey = port->pkey_tbl[index]; + return 0; + +err1: + return -EINVAL; +} + +static int rxe_modify_device(struct ib_device *dev, + int mask, struct ib_device_modify *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + + if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) + rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid); + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(rxe->ib_dev.node_desc, + attr->node_desc, sizeof(rxe->ib_dev.node_desc)); + } + + return 0; +} + +static int rxe_modify_port(struct ib_device *dev, + u8 port_num, int mask, struct ib_port_modify *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_port *port; + + if (unlikely(port_num != 1)) { + pr_warn("invalid port_num = %d\n", port_num); + goto err1; + } + + port = &rxe->port; + + port->attr.port_cap_flags |= attr->set_port_cap_mask; + port->attr.port_cap_flags &= ~attr->clr_port_cap_mask; + + if (mask & IB_PORT_RESET_QKEY_CNTR) + port->attr.qkey_viol_cntr = 0; + + return 0; + +err1: + return -EINVAL; +} + +static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev, + u8 port_num) +{ + struct rxe_dev *rxe = to_rdev(dev); + + return rxe_link_layer(rxe, port_num); +} + +static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_ucontext *uc; + + uc = rxe_alloc(&rxe->uc_pool); + return uc ? 
&uc->ibuc : ERR_PTR(-ENOMEM); +} + +static int rxe_dealloc_ucontext(struct ib_ucontext *ibuc) +{ + struct rxe_ucontext *uc = to_ruc(ibuc); + + rxe_drop_ref(uc); + return 0; +} + +static int rxe_port_immutable(struct ib_device *dev, u8 port_num, + struct ib_port_immutable *immutable) +{ + int err; + struct ib_port_attr attr; + + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + err = ib_query_port(dev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static struct ib_pd *rxe_alloc_pd(struct ib_device *dev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_pd *pd; + + pd = rxe_alloc(&rxe->pd_pool); + return pd ? &pd->ibpd : ERR_PTR(-ENOMEM); +} + +static int rxe_dealloc_pd(struct ib_pd *ibpd) +{ + struct rxe_pd *pd = to_rpd(ibpd); + + rxe_drop_ref(pd); + return 0; +} + +static int rxe_init_av(struct rxe_dev *rxe, struct rdma_ah_attr *attr, + struct rxe_av *av) +{ + int err; + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + + err = ib_get_cached_gid(&rxe->ib_dev, rdma_ah_get_port_num(attr), + rdma_ah_read_grh(attr)->sgid_index, &sgid, + &sgid_attr); + if (err) { + pr_err("Failed to query sgid. err = %d\n", err); + return err; + } + + rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr); + rxe_av_fill_ip_info(av, attr, &sgid_attr, &sgid); + dev_put(sgid_attr.ndev); + return 0; +} + +static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd, + struct rdma_ah_attr *attr, + struct ib_udata *udata) + +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_ah *ah; + + err = rxe_av_chk_attr(rxe, attr); + if (err) + goto err1; + + ah = rxe_alloc(&rxe->ah_pool); + if (!ah) { + err = -ENOMEM; + goto err1; + } + + rxe_add_ref(pd); + ah->pd = pd; + + err = rxe_init_av(rxe, attr, &ah->av); + if (err) + goto err2; + + return &ah->ibah; + +err2: + rxe_drop_ref(pd); + rxe_drop_ref(ah); +err1: + return ERR_PTR(err); +} + +static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibah->device); + struct rxe_ah *ah = to_rah(ibah); + + err = rxe_av_chk_attr(rxe, attr); + if (err) + return err; + + err = rxe_init_av(rxe, attr, &ah->av); + if (err) + return err; + + return 0; +} + +static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) +{ + struct rxe_ah *ah = to_rah(ibah); + + memset(attr, 0, sizeof(*attr)); + attr->type = ibah->type; + rxe_av_to_attr(&ah->av, attr); + return 0; +} + +static int rxe_destroy_ah(struct ib_ah *ibah) +{ + struct rxe_ah *ah = to_rah(ibah); + + rxe_drop_ref(ah->pd); + rxe_drop_ref(ah); + return 0; +} + +static int post_one_recv(struct rxe_rq *rq, struct ib_recv_wr *ibwr) +{ + int err; + int i; + u32 length; + struct rxe_recv_wqe *recv_wqe; + int num_sge = ibwr->num_sge; + + if (unlikely(queue_full(rq->queue))) { + err = -ENOMEM; + goto err1; + } + + if (unlikely(num_sge > rq->max_sge)) { + err = -EINVAL; + goto err1; + } + + length = 0; + for (i = 0; i < num_sge; i++) + length += ibwr->sg_list[i].length; + + recv_wqe = producer_addr(rq->queue); + recv_wqe->wr_id = ibwr->wr_id; + recv_wqe->num_sge = num_sge; + + memcpy(recv_wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + recv_wqe->dma.length = length; + recv_wqe->dma.resid = length; + recv_wqe->dma.num_sge = num_sge; + recv_wqe->dma.cur_sge = 0; + 
recv_wqe->dma.sge_offset = 0; + + /* make sure all changes to the work queue are written before we + * update the producer pointer + */ + smp_wmb(); + + advance_producer(rq->queue); + return 0; + +err1: + return err; +} + +static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *init, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_srq *srq; + struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL; + struct rxe_create_srq_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return ERR_PTR(-EINVAL); + uresp = udata->outbuf; + } + + err = rxe_srq_chk_attr(rxe, NULL, &init->attr, IB_SRQ_INIT_MASK); + if (err) + goto err1; + + srq = rxe_alloc(&rxe->srq_pool); + if (!srq) { + err = -ENOMEM; + goto err1; + } + + rxe_add_index(srq); + rxe_add_ref(pd); + srq->pd = pd; + + err = rxe_srq_from_init(rxe, srq, init, context, uresp); + if (err) + goto err2; + + return &srq->ibsrq; + +err2: + rxe_drop_ref(pd); + rxe_drop_index(srq); + rxe_drop_ref(srq); +err1: + return ERR_PTR(err); +} + +static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask mask, + struct ib_udata *udata) +{ + int err; + struct rxe_srq *srq = to_rsrq(ibsrq); + struct rxe_dev *rxe = to_rdev(ibsrq->device); + struct rxe_modify_srq_cmd ucmd = {}; + + if (udata) { + if (udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + } + + err = rxe_srq_chk_attr(rxe, srq, attr, mask); + if (err) + goto err1; + + err = rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + + if (srq->error) + return -EINVAL; + + attr->max_wr = srq->rq.queue->buf->index_mask; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +static int rxe_destroy_srq(struct ib_srq *ibsrq) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + + if (srq->rq.queue) + rxe_queue_cleanup(srq->rq.queue); + + rxe_drop_ref(srq->pd); + rxe_drop_index(srq); + rxe_drop_ref(srq); + + return 0; +} + +static int rxe_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + unsigned long flags; + struct rxe_srq *srq = to_rsrq(ibsrq); + + spin_lock_irqsave(&srq->rq.producer_lock, flags); + + while (wr) { + err = post_one_recv(&srq->rq, wr); + if (unlikely(err)) + break; + wr = wr->next; + } + + spin_unlock_irqrestore(&srq->rq.producer_lock, flags); + + if (err) + *bad_wr = wr; + + return err; +} + +static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_qp *qp; + struct rxe_create_qp_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return ERR_PTR(-EINVAL); + uresp = udata->outbuf; + } + + err = rxe_qp_chk_init(rxe, init); + if (err) + goto err1; + + qp = rxe_alloc(&rxe->qp_pool); + if (!qp) { + err = -ENOMEM; + goto err1; + } + + if (udata) { + if (udata->inlen) { + err = -EINVAL; + goto err2; + } + qp->is_user = 1; + } + + rxe_add_index(qp); + + err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibpd); + if (err) + goto err3; + + return &qp->ibqp; + +err3: + rxe_drop_index(qp); 
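post_one_recv() above follows the usual single-producer ring discipline: the WQE at the producer slot is filled in first, smp_wmb() orders those stores, and only then is the producer index advanced, so a consumer that observes the new index is guaranteed to see a complete WQE. The sketch below restates the same ordering idea as a standalone userspace single-producer/single-consumer ring using C11 release/acquire atomics; it is illustrative only and is not the driver's rxe_queue implementation.

#include <stdatomic.h>
#include <stdbool.h>

#define RING_SIZE 64    /* any power of two works for the index arithmetic below */

struct ring {
        unsigned long entries[RING_SIZE];
        atomic_uint producer;   /* written only by the producer */
        atomic_uint consumer;   /* written only by the consumer */
};

static bool ring_post(struct ring *r, unsigned long val)
{
        unsigned int prod = atomic_load_explicit(&r->producer, memory_order_relaxed);
        unsigned int cons = atomic_load_explicit(&r->consumer, memory_order_acquire);

        if (prod - cons == RING_SIZE)
                return false;                           /* full */

        r->entries[prod % RING_SIZE] = val;             /* fill the slot first */

        /* release store: the slot contents become visible before the new index */
        atomic_store_explicit(&r->producer, prod + 1, memory_order_release);
        return true;
}

static bool ring_poll(struct ring *r, unsigned long *val)
{
        unsigned int cons = atomic_load_explicit(&r->consumer, memory_order_relaxed);
        unsigned int prod = atomic_load_explicit(&r->producer, memory_order_acquire);

        if (cons == prod)
                return false;                           /* empty */

        *val = r->entries[cons % RING_SIZE];            /* safe: index was published with release */
        atomic_store_explicit(&r->consumer, cons + 1, memory_order_release);
        return true;
}

In the driver itself the posting side is additionally serialized by the rq/sq producer spinlocks, and the consumers run in the tasklet-based tasks; the barrier only provides the publish ordering shown here.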
+err2: + rxe_drop_ref(qp); +err1: + return ERR_PTR(err); +} + +static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + + err = rxe_qp_chk_attr(rxe, qp, attr, mask); + if (err) + goto err1; + + err = rxe_qp_from_attr(qp, attr, mask, udata); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_qp_init_attr *init) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + rxe_qp_to_init(qp, init); + rxe_qp_to_attr(qp, attr, mask); + + return 0; +} + +static int rxe_destroy_qp(struct ib_qp *ibqp) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + rxe_qp_destroy(qp); + rxe_drop_index(qp); + rxe_drop_ref(qp); + return 0; +} + +static int validate_send_wr(struct rxe_qp *qp, struct ib_send_wr *ibwr, + unsigned int mask, unsigned int length) +{ + int num_sge = ibwr->num_sge; + struct rxe_sq *sq = &qp->sq; + + if (unlikely(num_sge > sq->max_sge)) + goto err1; + + if (unlikely(mask & WR_ATOMIC_MASK)) { + if (length < 8) + goto err1; + + if (atomic_wr(ibwr)->remote_addr & 0x7) + goto err1; + } + + if (unlikely((ibwr->send_flags & IB_SEND_INLINE) && + (length > sq->max_inline))) + goto err1; + + return 0; + +err1: + return -EINVAL; +} + +static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, + struct ib_send_wr *ibwr) +{ + wr->wr_id = ibwr->wr_id; + wr->num_sge = ibwr->num_sge; + wr->opcode = ibwr->opcode; + wr->send_flags = ibwr->send_flags; + + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) { + wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn; + wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey; + if (qp_type(qp) == IB_QPT_GSI) + wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; + if (wr->opcode == IB_WR_SEND_WITH_IMM) + wr->ex.imm_data = ibwr->ex.imm_data; + } else { + switch (wr->opcode) { + case IB_WR_RDMA_WRITE_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + /* fall through */ + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; + wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; + break; + case IB_WR_SEND_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + wr->wr.atomic.remote_addr = + atomic_wr(ibwr)->remote_addr; + wr->wr.atomic.compare_add = + atomic_wr(ibwr)->compare_add; + wr->wr.atomic.swap = atomic_wr(ibwr)->swap; + wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey; + break; + case IB_WR_LOCAL_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + break; + case IB_WR_REG_MR: + wr->wr.reg.mr = reg_wr(ibwr)->mr; + wr->wr.reg.key = reg_wr(ibwr)->key; + wr->wr.reg.access = reg_wr(ibwr)->access; + break; + default: + break; + } + } +} + +static int init_send_wqe(struct rxe_qp *qp, struct ib_send_wr *ibwr, + unsigned int mask, unsigned int length, + struct rxe_send_wqe *wqe) +{ + int num_sge = ibwr->num_sge; + struct ib_sge *sge; + int i; + u8 *p; + + init_send_wr(qp, &wqe->wr, ibwr); + + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) + memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av)); + + if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) { + p = wqe->dma.inline_data; + + sge = ibwr->sg_list; + for (i = 0; i < num_sge; i++, sge++) { + memcpy(p, (void 
*)(uintptr_t)sge->addr, + sge->length); + + p += sge->length; + } + } else if (mask & WR_REG_MASK) { + wqe->mask = mask; + wqe->state = wqe_state_posted; + return 0; + } else + memcpy(wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + wqe->iova = mask & WR_ATOMIC_MASK ? atomic_wr(ibwr)->remote_addr : + mask & WR_READ_OR_WRITE_MASK ? rdma_wr(ibwr)->remote_addr : 0; + wqe->mask = mask; + wqe->dma.length = length; + wqe->dma.resid = length; + wqe->dma.num_sge = num_sge; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + wqe->state = wqe_state_posted; + wqe->ssn = atomic_add_return(1, &qp->ssn); + + return 0; +} + +static int post_one_send(struct rxe_qp *qp, struct ib_send_wr *ibwr, + unsigned int mask, u32 length) +{ + int err; + struct rxe_sq *sq = &qp->sq; + struct rxe_send_wqe *send_wqe; + unsigned long flags; + + err = validate_send_wr(qp, ibwr, mask, length); + if (err) + return err; + + spin_lock_irqsave(&qp->sq.sq_lock, flags); + + if (unlikely(queue_full(sq->queue))) { + err = -ENOMEM; + goto err1; + } + + send_wqe = producer_addr(sq->queue); + + err = init_send_wqe(qp, ibwr, mask, length, send_wqe); + if (unlikely(err)) + goto err1; + + /* + * make sure all changes to the work queue are + * written before we update the producer pointer + */ + smp_wmb(); + + advance_producer(sq->queue); + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + + return 0; + +err1: + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + return err; +} + +static int rxe_post_send_kernel(struct rxe_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + unsigned int mask; + unsigned int length = 0; + int i; + + while (wr) { + mask = wr_opcode_mask(wr->opcode, qp); + if (unlikely(!mask)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely((wr->send_flags & IB_SEND_INLINE) && + !(mask & WR_INLINE_MASK))) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + length = 0; + for (i = 0; i < wr->num_sge; i++) + length += wr->sg_list[i].length; + + err = post_one_send(qp, wr, mask, length); + + if (err) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + + rxe_run_task(&qp->req.task, 1); + if (unlikely(qp->req.state == QP_STATE_ERROR)) + rxe_run_task(&qp->comp.task, 1); + + return err; +} + +static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + if (unlikely(!qp->valid)) { + *bad_wr = wr; + return -EINVAL; + } + + if (unlikely(qp->req.state < QP_STATE_READY)) { + *bad_wr = wr; + return -EINVAL; + } + + if (qp->is_user) { + /* Utilize process context to do protocol processing */ + rxe_run_task(&qp->req.task, 0); + return 0; + } else + return rxe_post_send_kernel(qp, wr, bad_wr); +} + +static int rxe_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_rq *rq = &qp->rq; + unsigned long flags; + + if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid)) { + *bad_wr = wr; + err = -EINVAL; + goto err1; + } + + if (unlikely(qp->srq)) { + *bad_wr = wr; + err = -EINVAL; + goto err1; + } + + spin_lock_irqsave(&rq->producer_lock, flags); + + while (wr) { + err = post_one_recv(rq, wr); + if (unlikely(err)) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + + spin_unlock_irqrestore(&rq->producer_lock, flags); + + if (qp->resp.state == QP_STATE_ERROR) + rxe_run_task(&qp->resp.task, 1); + +err1: + return err; +} + +static struct ib_cq *rxe_create_cq(struct ib_device *dev, + const struct 
ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_cq *cq; + struct rxe_create_cq_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return ERR_PTR(-EINVAL); + uresp = udata->outbuf; + } + + if (attr->flags) + return ERR_PTR(-EINVAL); + + err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); + if (err) + goto err1; + + cq = rxe_alloc(&rxe->cq_pool); + if (!cq) { + err = -ENOMEM; + goto err1; + } + + err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, + context, uresp); + if (err) + goto err2; + + return &cq->ibcq; + +err2: + rxe_drop_ref(cq); +err1: + return ERR_PTR(err); +} + +static int rxe_destroy_cq(struct ib_cq *ibcq) +{ + struct rxe_cq *cq = to_rcq(ibcq); + + rxe_cq_disable(cq); + + rxe_drop_ref(cq); + return 0; +} + +static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + int err; + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_dev *rxe = to_rdev(ibcq->device); + struct rxe_resize_cq_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return -EINVAL; + uresp = udata->outbuf; + } + + err = rxe_cq_chk_attr(rxe, cq, cqe, 0); + if (err) + goto err1; + + err = rxe_cq_resize_queue(cq, cqe, uresp); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + int i; + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_cqe *cqe; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + for (i = 0; i < num_entries; i++) { + cqe = queue_head(cq->queue); + if (!cqe) + break; + + memcpy(wc++, &cqe->ibwc, sizeof(*wc)); + advance_consumer(cq->queue); + } + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return i; +} + +static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) +{ + struct rxe_cq *cq = to_rcq(ibcq); + int count = queue_count(cq->queue); + + return (count > wc_cnt) ? 
wc_cnt : count; +} + +static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct rxe_cq *cq = to_rcq(ibcq); + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->cq_lock, irq_flags); + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = flags & IB_CQ_SOLICITED_MASK; + + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !queue_empty(cq->queue)) + ret = 1; + + spin_unlock_irqrestore(&cq->cq_lock, irq_flags); + + return ret; +} + +static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + int err; + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err1; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = rxe_mem_init_dma(rxe, pd, access, mr); + if (err) + goto err2; + + return &mr->ibmr; + +err2: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err1: + return ERR_PTR(err); +} + +static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, + u64 start, + u64 length, + u64 iova, + int access, struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err2; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = rxe_mem_init_user(rxe, pd, start, length, iova, + access, udata, mr); + if (err) + goto err3; + + return &mr->ibmr; + +err3: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err2: + return ERR_PTR(err); +} + +static int rxe_dereg_mr(struct ib_mr *ibmr) +{ + struct rxe_mem *mr = to_rmr(ibmr); + + mr->state = RXE_MEM_STATE_ZOMBIE; + rxe_drop_ref(mr->pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); + return 0; +} + +static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + int err; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err1; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = rxe_mem_init_fast(rxe, pd, max_num_sg, mr); + if (err) + goto err2; + + return &mr->ibmr; + +err2: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err1: + return ERR_PTR(err); +} + +static int rxe_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct rxe_mem *mr = to_rmr(ibmr); + struct rxe_map *map; + struct rxe_phys_buf *buf; + + if (unlikely(mr->nbuf == mr->num_buf)) + return -ENOMEM; + + map = mr->map[mr->nbuf / RXE_BUF_PER_MAP]; + buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP]; + + buf->addr = addr; + buf->size = ibmr->page_size; + mr->nbuf++; + + return 0; +} + +static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct rxe_mem *mr = to_rmr(ibmr); + int n; + + mr->nbuf = 0; + + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page); + + mr->va = ibmr->iova; + mr->iova = ibmr->iova; + mr->length = ibmr->length; + mr->page_shift = ilog2(ibmr->page_size); + mr->page_mask = ibmr->page_size - 1; + mr->offset = mr->iova & mr->page_mask; + + return n; +} + +static int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_mc_grp *grp; + + /* takes a ref on grp if successful */ + err = rxe_mcast_get_grp(rxe, 
mgid, &grp); + if (err) + return err; + + err = rxe_mcast_add_grp_elem(rxe, qp, grp); + + rxe_drop_ref(grp); + return err; +} + +static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + + return rxe_mcast_drop_grp_elem(rxe, qp, mgid); +} + +static ssize_t parent_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct rxe_dev *rxe = container_of(device, struct rxe_dev, + ib_dev.dev); + + return snprintf(buf, 16, "%s\n", rxe_parent_name(rxe, 1)); +} + +static DEVICE_ATTR_RO(parent); + +static struct device_attribute *rxe_dev_attributes[] = { + &dev_attr_parent, +}; + +int rxe_register_device(struct rxe_dev *rxe) +{ + int err; + int i; + struct ib_device *dev = &rxe->ib_dev; + struct crypto_shash *tfm; + + strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX); + strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); + + dev->owner = THIS_MODULE; + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = 1; + dev->num_comp_vectors = num_possible_cpus(); + dev->dev.parent = rxe_dma_device(rxe); + dev->local_dma_lkey = 0; + addrconf_addr_eui48((unsigned char *)&dev->node_guid, + rxe->ndev->dev_addr); + dev->dev.dma_ops = &dma_virt_ops; + dma_coerce_mask_and_coherent(&dev->dev, + dma_get_required_mask(&dev->dev)); + + dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION; + dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) + | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) + | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_POST_SRQ_RECV) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) + | BIT_ULL(IB_USER_VERBS_CMD_POST_RECV) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_PEEK_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) + | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) + | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) + ; + + dev->query_device = rxe_query_device; + dev->modify_device = rxe_modify_device; + dev->query_port = rxe_query_port; + dev->modify_port = rxe_modify_port; + dev->get_link_layer = rxe_get_link_layer; + dev->get_netdev = rxe_get_netdev; + dev->query_pkey = rxe_query_pkey; + dev->alloc_ucontext = rxe_alloc_ucontext; + dev->dealloc_ucontext = rxe_dealloc_ucontext; + dev->mmap = rxe_mmap; + dev->get_port_immutable = rxe_port_immutable; + dev->alloc_pd = rxe_alloc_pd; + dev->dealloc_pd = rxe_dealloc_pd; + dev->create_ah = rxe_create_ah; + dev->modify_ah = rxe_modify_ah; + dev->query_ah = rxe_query_ah; + dev->destroy_ah = rxe_destroy_ah; + dev->create_srq = rxe_create_srq; + dev->modify_srq = rxe_modify_srq; + dev->query_srq = rxe_query_srq; + dev->destroy_srq = rxe_destroy_srq; + dev->post_srq_recv = 
rxe_post_srq_recv; + dev->create_qp = rxe_create_qp; + dev->modify_qp = rxe_modify_qp; + dev->query_qp = rxe_query_qp; + dev->destroy_qp = rxe_destroy_qp; + dev->post_send = rxe_post_send; + dev->post_recv = rxe_post_recv; + dev->create_cq = rxe_create_cq; + dev->destroy_cq = rxe_destroy_cq; + dev->resize_cq = rxe_resize_cq; + dev->poll_cq = rxe_poll_cq; + dev->peek_cq = rxe_peek_cq; + dev->req_notify_cq = rxe_req_notify_cq; + dev->get_dma_mr = rxe_get_dma_mr; + dev->reg_user_mr = rxe_reg_user_mr; + dev->dereg_mr = rxe_dereg_mr; + dev->alloc_mr = rxe_alloc_mr; + dev->map_mr_sg = rxe_map_mr_sg; + dev->attach_mcast = rxe_attach_mcast; + dev->detach_mcast = rxe_detach_mcast; + dev->get_hw_stats = rxe_ib_get_hw_stats; + dev->alloc_hw_stats = rxe_ib_alloc_hw_stats; + + tfm = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(tfm)) { + pr_err("failed to allocate crc algorithm err:%ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + rxe->tfm = tfm; + + dev->driver_id = RDMA_DRIVER_RXE; + err = ib_register_device(dev, NULL); + if (err) { + pr_warn("%s failed with error %d\n", __func__, err); + goto err1; + } + + for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) { + err = device_create_file(&dev->dev, rxe_dev_attributes[i]); + if (err) { + pr_warn("%s failed with error %d for attr number %d\n", + __func__, err, i); + goto err2; + } + } + + return 0; + +err2: + ib_unregister_device(dev); +err1: + crypto_free_shash(rxe->tfm); + + return err; +} + +int rxe_unregister_device(struct rxe_dev *rxe) +{ + int i; + struct ib_device *dev = &rxe->ib_dev; + + for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) + device_remove_file(&dev->dev, rxe_dev_attributes[i]); + + ib_unregister_device(dev); + + return 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h new file mode 100644 index 0000000..af1470d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
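rxe_register_device() above follows the standard kernel unwind idiom: the crc32 shash, the ib_register_device() call and the sysfs attribute files are set up in order, and on failure everything acquired so far is released in reverse order through cascading goto labels (err2 unregisters the device, err1 frees the shash). A generic sketch of that pattern, with hypothetical setup/teardown helpers that are not part of this patch:

struct ctx;     /* hypothetical context */

int setup_everything(struct ctx *ctx)
{
        int err;

        err = setup_a(ctx);             /* first resource */
        if (err)
                return err;

        err = setup_b(ctx);             /* depends on a */
        if (err)
                goto err_a;

        err = setup_c(ctx);             /* depends on b */
        if (err)
                goto err_b;

        return 0;

err_b:
        teardown_b(ctx);                /* undo in reverse order of acquisition */
err_a:
        teardown_a(ctx);
        return err;
}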
+ */ + +#ifndef RXE_VERBS_H +#define RXE_VERBS_H + +#include +#include +#include +#include "rxe_pool.h" +#include "rxe_task.h" +#include "rxe_hw_counters.h" + +static inline int pkey_match(u16 key1, u16 key2) +{ + return (((key1 & 0x7fff) != 0) && + ((key1 & 0x7fff) == (key2 & 0x7fff)) && + ((key1 & 0x8000) || (key2 & 0x8000))) ? 1 : 0; +} + +/* Return >0 if psn_a > psn_b + * 0 if psn_a == psn_b + * <0 if psn_a < psn_b + */ +static inline int psn_compare(u32 psn_a, u32 psn_b) +{ + s32 diff; + + diff = (psn_a - psn_b) << 8; + return diff; +} + +struct rxe_ucontext { + struct rxe_pool_entry pelem; + struct ib_ucontext ibuc; +}; + +struct rxe_pd { + struct rxe_pool_entry pelem; + struct ib_pd ibpd; +}; + +struct rxe_ah { + struct rxe_pool_entry pelem; + struct ib_ah ibah; + struct rxe_pd *pd; + struct rxe_av av; +}; + +struct rxe_cqe { + union { + struct ib_wc ibwc; + struct ib_uverbs_wc uibwc; + }; +}; + +struct rxe_cq { + struct rxe_pool_entry pelem; + struct ib_cq ibcq; + struct rxe_queue *queue; + spinlock_t cq_lock; + u8 notify; + bool is_dying; + int is_user; + struct tasklet_struct comp_task; +}; + +enum wqe_state { + wqe_state_posted, + wqe_state_processing, + wqe_state_pending, + wqe_state_done, + wqe_state_error, +}; + +struct rxe_sq { + int max_wr; + int max_sge; + int max_inline; + spinlock_t sq_lock; /* guard queue */ + struct rxe_queue *queue; +}; + +struct rxe_rq { + int max_wr; + int max_sge; + spinlock_t producer_lock; /* guard queue producer */ + spinlock_t consumer_lock; /* guard queue consumer */ + struct rxe_queue *queue; +}; + +struct rxe_srq { + struct rxe_pool_entry pelem; + struct ib_srq ibsrq; + struct rxe_pd *pd; + struct rxe_rq rq; + u32 srq_num; + + int limit; + int error; +}; + +enum rxe_qp_state { + QP_STATE_RESET, + QP_STATE_INIT, + QP_STATE_READY, + QP_STATE_DRAIN, /* req only */ + QP_STATE_DRAINED, /* req only */ + QP_STATE_ERROR +}; + +struct rxe_req_info { + enum rxe_qp_state state; + int wqe_index; + u32 psn; + int opcode; + atomic_t rd_atomic; + int wait_fence; + int need_rd_atomic; + int wait_psn; + int need_retry; + int noack_pkts; + struct rxe_task task; +}; + +struct rxe_comp_info { + u32 psn; + int opcode; + int timeout; + int timeout_retry; + u32 retry_cnt; + u32 rnr_retry; + struct rxe_task task; +}; + +enum rdatm_res_state { + rdatm_res_state_next, + rdatm_res_state_new, + rdatm_res_state_replay, +}; + +struct resp_res { + int type; + u32 first_psn; + u32 last_psn; + u32 cur_psn; + enum rdatm_res_state state; + + union { + struct { + struct sk_buff *skb; + } atomic; + struct { + struct rxe_mem *mr; + u64 va_org; + u32 rkey; + u32 length; + u64 va; + u32 resid; + } read; + }; +}; + +struct rxe_resp_info { + enum rxe_qp_state state; + u32 msn; + u32 psn; + int opcode; + int drop_msg; + int goto_error; + int sent_psn_nak; + enum ib_wc_status status; + u8 aeth_syndrome; + + /* Receive only */ + struct rxe_recv_wqe *wqe; + + /* RDMA read / atomic only */ + u64 va; + struct rxe_mem *mr; + u32 resid; + u32 rkey; + u64 atomic_orig; + + /* SRQ only */ + struct { + struct rxe_recv_wqe wqe; + struct ib_sge sge[RXE_MAX_SGE]; + } srq_wqe; + + /* Responder resources. It's a circular list where the oldest + * resource is dropped first. 
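psn_compare() above is serial-number arithmetic for 24-bit PSNs: the 32-bit subtraction is shifted left by 8 so that bit 23 of the 24-bit circular difference lands in the sign bit, which makes a PSN that is fewer than 2^23 steps ahead compare as greater even across the 2^24 wrap. A small standalone restatement of the same one-liner with a few checks, for illustration only:

#include <assert.h>
#include <stdint.h>

/* >0, 0 or <0 for a after, equal to, or before b, modulo 2^24 */
static int psn_cmp(uint32_t psn_a, uint32_t psn_b)
{
        return (int32_t)((psn_a - psn_b) << 8);
}

int main(void)
{
        assert(psn_cmp(5, 3) > 0);              /* ordinary ordering */
        assert(psn_cmp(3, 3) == 0);
        assert(psn_cmp(5, 0xFFFFFA) > 0);       /* 5 is 11 PSNs after 0xFFFFFA across the wrap */
        assert(psn_cmp(0xFFFFFA, 5) < 0);
        return 0;
}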
+ */ + struct resp_res *resources; + unsigned int res_head; + unsigned int res_tail; + struct resp_res *res; + struct rxe_task task; +}; + +struct rxe_qp { + struct rxe_pool_entry pelem; + struct ib_qp ibqp; + struct ib_qp_attr attr; + unsigned int valid; + unsigned int mtu; + int is_user; + + struct rxe_pd *pd; + struct rxe_srq *srq; + struct rxe_cq *scq; + struct rxe_cq *rcq; + + enum ib_sig_type sq_sig_type; + + struct rxe_sq sq; + struct rxe_rq rq; + + struct socket *sk; + u32 dst_cookie; + + struct rxe_av pri_av; + struct rxe_av alt_av; + + /* list of mcast groups qp has joined (for cleanup) */ + struct list_head grp_list; + spinlock_t grp_lock; /* guard grp_list */ + + struct sk_buff_head req_pkts; + struct sk_buff_head resp_pkts; + struct sk_buff_head send_pkts; + + struct rxe_req_info req; + struct rxe_comp_info comp; + struct rxe_resp_info resp; + + atomic_t ssn; + atomic_t skb_out; + int need_req_skb; + + /* Timer for retranmitting packet when ACKs have been lost. RC + * only. The requester sets it when it is not already + * started. The responder resets it whenever an ack is + * received. + */ + struct timer_list retrans_timer; + u64 qp_timeout_jiffies; + + /* Timer for handling RNR NAKS. */ + struct timer_list rnr_nak_timer; + + spinlock_t state_lock; /* guard requester and completer */ + + struct execute_work cleanup_work; +}; + +enum rxe_mem_state { + RXE_MEM_STATE_ZOMBIE, + RXE_MEM_STATE_INVALID, + RXE_MEM_STATE_FREE, + RXE_MEM_STATE_VALID, +}; + +enum rxe_mem_type { + RXE_MEM_TYPE_NONE, + RXE_MEM_TYPE_DMA, + RXE_MEM_TYPE_MR, + RXE_MEM_TYPE_FMR, + RXE_MEM_TYPE_MW, +}; + +#define RXE_BUF_PER_MAP (PAGE_SIZE / sizeof(struct rxe_phys_buf)) + +struct rxe_phys_buf { + u64 addr; + u64 size; +}; + +struct rxe_map { + struct rxe_phys_buf buf[RXE_BUF_PER_MAP]; +}; + +struct rxe_mem { + struct rxe_pool_entry pelem; + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + }; + + struct rxe_pd *pd; + struct ib_umem *umem; + + u32 lkey; + u32 rkey; + + enum rxe_mem_state state; + enum rxe_mem_type type; + u64 va; + u64 iova; + size_t length; + u32 offset; + int access; + + int page_shift; + int page_mask; + int map_shift; + int map_mask; + + u32 num_buf; + u32 nbuf; + + u32 max_buf; + u32 num_map; + + struct rxe_map **map; +}; + +struct rxe_mc_grp { + struct rxe_pool_entry pelem; + spinlock_t mcg_lock; /* guard group */ + struct rxe_dev *rxe; + struct list_head qp_list; + union ib_gid mgid; + int num_qp; + u32 qkey; + u16 pkey; +}; + +struct rxe_mc_elem { + struct rxe_pool_entry pelem; + struct list_head qp_list; + struct list_head grp_list; + struct rxe_qp *qp; + struct rxe_mc_grp *grp; +}; + +struct rxe_port { + struct ib_port_attr attr; + u16 *pkey_tbl; + __be64 port_guid; + __be64 subnet_prefix; + spinlock_t port_lock; /* guard port */ + unsigned int mtu_cap; + /* special QPs */ + u32 qp_smi_index; + u32 qp_gsi_index; +}; + +struct rxe_dev { + struct ib_device ib_dev; + struct ib_device_attr attr; + int max_ucontext; + int max_inline_data; + struct kref ref_cnt; + struct mutex usdev_lock; + + struct net_device *ndev; + + int xmit_errors; + + struct rxe_pool uc_pool; + struct rxe_pool pd_pool; + struct rxe_pool ah_pool; + struct rxe_pool srq_pool; + struct rxe_pool qp_pool; + struct rxe_pool cq_pool; + struct rxe_pool mr_pool; + struct rxe_pool mw_pool; + struct rxe_pool mc_grp_pool; + struct rxe_pool mc_elem_pool; + + spinlock_t pending_lock; /* guard pending_mmaps */ + struct list_head pending_mmaps; + + spinlock_t mmap_offset_lock; /* guard mmap_offset */ + int mmap_offset; + + u64 
stats_counters[RXE_NUM_OF_COUNTERS]; + + struct rxe_port port; + struct list_head list; + struct crypto_shash *tfm; +}; + +static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters cnt) +{ + rxe->stats_counters[cnt]++; +} + +static inline struct rxe_dev *to_rdev(struct ib_device *dev) +{ + return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL; +} + +static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc) +{ + return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL; +} + +static inline struct rxe_pd *to_rpd(struct ib_pd *pd) +{ + return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL; +} + +static inline struct rxe_ah *to_rah(struct ib_ah *ah) +{ + return ah ? container_of(ah, struct rxe_ah, ibah) : NULL; +} + +static inline struct rxe_srq *to_rsrq(struct ib_srq *srq) +{ + return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL; +} + +static inline struct rxe_qp *to_rqp(struct ib_qp *qp) +{ + return qp ? container_of(qp, struct rxe_qp, ibqp) : NULL; +} + +static inline struct rxe_cq *to_rcq(struct ib_cq *cq) +{ + return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL; +} + +static inline struct rxe_mem *to_rmr(struct ib_mr *mr) +{ + return mr ? container_of(mr, struct rxe_mem, ibmr) : NULL; +} + +static inline struct rxe_mem *to_rmw(struct ib_mw *mw) +{ + return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL; +} + +int rxe_register_device(struct rxe_dev *rxe); +int rxe_unregister_device(struct rxe_dev *rxe); + +void rxe_mc_cleanup(struct rxe_pool_entry *arg); + +#endif /* RXE_VERBS_H */ diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile new file mode 100644 index 0000000..437813c --- /dev/null +++ b/drivers/infiniband/ulp/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_IPOIB) += ipoib/ +obj-$(CONFIG_INFINIBAND_SRP) += srp/ +obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ +obj-$(CONFIG_INFINIBAND_ISER) += iser/ +obj-$(CONFIG_INFINIBAND_ISERT) += isert/ +obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic/ diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig new file mode 100644 index 0000000..cda8eac --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -0,0 +1,49 @@ +config INFINIBAND_IPOIB + tristate "IP-over-InfiniBand" + depends on NETDEVICES && INET && (IPV6 || IPV6=n) + ---help--- + Support for the IP-over-InfiniBand protocol (IPoIB). This + transports IP packets over InfiniBand so you can use your IB + device as a fancy NIC. + + See Documentation/infiniband/ipoib.txt for more information + +config INFINIBAND_IPOIB_CM + bool "IP-over-InfiniBand Connected Mode support" + depends on INFINIBAND_IPOIB + default n + ---help--- + This option enables support for IPoIB connected mode. After + enabling this option, you need to switch to connected mode + through /sys/class/net/ibXXX/mode to actually create + connections, and then increase the interface MTU with + e.g. ifconfig ib0 mtu 65520. + + WARNING: Enabling connected mode will trigger some packet + drops for multicast and UD mode traffic from this interface, + unless you limit mtu for these destinations to 2044. + +config INFINIBAND_IPOIB_DEBUG + bool "IP-over-InfiniBand debugging" if EXPERT + depends on INFINIBAND_IPOIB + default y + ---help--- + This option causes debugging code to be compiled into the + IPoIB driver. The output can be turned on via the + debug_level and mcast_debug_level module parameters (which + can also be set after the driver is loaded through sysfs). 
+ + This option also creates a directory tree under ipoib/ in + debugfs, which contains files that expose debugging + information about IB multicast groups used by the IPoIB + driver. + +config INFINIBAND_IPOIB_DEBUG_DATA + bool "IP-over-InfiniBand data path debugging" + depends on INFINIBAND_IPOIB_DEBUG + ---help--- + This option compiles debugging code into the data path + of the IPoIB driver. The output can be turned on via the + data_debug_level module parameter; however, even with output + turned off, this debugging code will have some performance + impact. diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile new file mode 100644 index 0000000..6ece857 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_IPOIB) += ib_ipoib.o + +ib_ipoib-y := ipoib_main.o \ + ipoib_ib.o \ + ipoib_multicast.o \ + ipoib_verbs.o \ + ipoib_vlan.o \ + ipoib_ethtool.o \ + ipoib_netlink.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o + diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h new file mode 100644 index 0000000..308e0ce --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -0,0 +1,838 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _IPOIB_H +#define _IPOIB_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +/* constants */ + +enum ipoib_flush_level { + IPOIB_FLUSH_LIGHT, + IPOIB_FLUSH_NORMAL, + IPOIB_FLUSH_HEAVY +}; + +enum { + IPOIB_ENCAP_LEN = 4, + IPOIB_PSEUDO_LEN = 20, + IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN, + + IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, + IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ + + IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */ + IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, + IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, + IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, + IPOIB_RX_RING_SIZE = 256, + IPOIB_TX_RING_SIZE = 128, + IPOIB_MAX_QUEUE_SIZE = 8192, + IPOIB_MIN_QUEUE_SIZE = 2, + IPOIB_CM_MAX_CONN_QP = 4096, + + IPOIB_NUM_WC = 4, + + IPOIB_MAX_PATH_REC_QUEUE = 3, + IPOIB_MAX_MCAST_QUEUE = 64, + + IPOIB_FLAG_OPER_UP = 0, + IPOIB_FLAG_INITIALIZED = 1, + IPOIB_FLAG_ADMIN_UP = 2, + IPOIB_PKEY_ASSIGNED = 3, + IPOIB_FLAG_SUBINTERFACE = 5, + IPOIB_STOP_REAPER = 7, + IPOIB_FLAG_ADMIN_CM = 9, + IPOIB_FLAG_UMCAST = 10, + IPOIB_STOP_NEIGH_GC = 11, + IPOIB_NEIGH_TBL_FLUSH = 12, + IPOIB_FLAG_DEV_ADDR_SET = 13, + IPOIB_FLAG_DEV_ADDR_CTRL = 14, + IPOIB_FLAG_GOING_DOWN = 15, + + IPOIB_MAX_BACKOFF_SECONDS = 16, + + IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ + IPOIB_MCAST_FLAG_SENDONLY = 1, + /* + * For IPOIB_MCAST_FLAG_BUSY + * When set, in flight join and mcast->mc is unreliable + * When clear and mcast->mc IS_ERR_OR_NULL, need to restart or + * haven't started yet + * When clear and mcast->mc is valid pointer, join was successful + */ + IPOIB_MCAST_FLAG_BUSY = 2, + IPOIB_MCAST_FLAG_ATTACHED = 3, + + MAX_SEND_CQE = 64, + IPOIB_CM_COPYBREAK = 256, + + IPOIB_NON_CHILD = 0, + IPOIB_LEGACY_CHILD = 1, + IPOIB_RTNL_CHILD = 2, +}; + +#define IPOIB_OP_RECV (1ul << 31) +#ifdef CONFIG_INFINIBAND_IPOIB_CM +#define IPOIB_OP_CM (1ul << 30) +#else +#define IPOIB_OP_CM (0) +#endif + +#define IPOIB_QPN_MASK ((__force u32) cpu_to_be32(0xFFFFFF)) + +/* structs */ + +struct ipoib_header { + __be16 proto; + u16 reserved; +}; + +struct ipoib_pseudo_header { + u8 hwaddr[INFINIBAND_ALEN]; +}; + +static inline void skb_add_pseudo_hdr(struct sk_buff *skb) +{ + char *data = skb_push(skb, IPOIB_PSEUDO_LEN); + + /* + * only the ipoib header is present now, make room for a dummy + * pseudo header and set skb field accordingly + */ + memset(data, 0, IPOIB_PSEUDO_LEN); + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_HARD_LEN); +} + +static inline struct ipoib_dev_priv *ipoib_priv(const struct net_device *dev) +{ + struct rdma_netdev *rn = netdev_priv(dev); + + return rn->clnt_priv; +} + +/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ +struct ipoib_mcast { + struct ib_sa_mcmember_rec mcmember; + struct ib_sa_multicast *mc; + struct ipoib_ah *ah; + + struct rb_node rb_node; + struct list_head list; + + unsigned long created; + unsigned long backoff; + unsigned long delay_until; + + unsigned long flags; + unsigned char logcount; + + struct list_head neigh_list; + + struct sk_buff_head pkt_queue; + + struct net_device *dev; + struct completion done; +}; + +struct ipoib_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; +}; + +struct ipoib_tx_buf { + struct sk_buff *skb; + u64 mapping[MAX_SKB_FRAGS + 1]; +}; + +struct ib_cm_id; + +struct ipoib_cm_data { + __be32 qpn; /* High byte 
MUST be ignored on receive */ + __be32 mtu; +}; + +/* + * Quoting 10.3.1 Queue Pair and EE Context States: + * + * Note, for QPs that are associated with an SRQ, the Consumer should take the + * QP through the Error State before invoking a Destroy QP or a Modify QP to the + * Reset State. The Consumer may invoke the Destroy QP without first performing + * a Modify QP to the Error State and waiting for the Affiliated Asynchronous + * Last WQE Reached Event. However, if the Consumer does not wait for the + * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment + * leakage may occur. Therefore, it is good programming practice to tear down a + * QP that is associated with an SRQ by using the following process: + * + * - Put the QP in the Error State + * - Wait for the Affiliated Asynchronous Last WQE Reached Event; + * - either: + * drain the CQ by invoking the Poll CQ verb and either wait for CQ + * to be empty or the number of Poll CQ operations has exceeded + * CQ capacity size; + * - or + * post another WR that completes on the same CQ and wait for this + * WR to return as a WC; + * - and then invoke a Destroy QP or Reset QP. + * + * We use the second option and wait for a completion on the + * same CQ before destroying QPs attached to our SRQ. + */ + +enum ipoib_cm_state { + IPOIB_CM_RX_LIVE, + IPOIB_CM_RX_ERROR, /* Ignored by stale task */ + IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */ +}; + +struct ipoib_cm_rx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct ipoib_cm_rx_buf *rx_ring; + struct list_head list; + struct net_device *dev; + unsigned long jiffies; + enum ipoib_cm_state state; + int recv_count; +}; + +struct ipoib_cm_tx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct list_head list; + struct net_device *dev; + struct ipoib_neigh *neigh; + struct ipoib_path *path; + struct ipoib_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + unsigned long flags; + u32 mtu; + unsigned max_send_sge; +}; + +struct ipoib_cm_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_CM_RX_SG]; +}; + +struct ipoib_cm_dev_priv { + struct ib_srq *srq; + struct ipoib_cm_rx_buf *srq_ring; + struct ib_cm_id *id; + struct list_head passive_ids; /* state: LIVE */ + struct list_head rx_error_list; /* state: ERROR */ + struct list_head rx_flush_list; /* state: FLUSH, drain not started */ + struct list_head rx_drain_list; /* state: FLUSH, drain started */ + struct list_head rx_reap_list; /* state: FLUSH, drain done */ + struct work_struct start_task; + struct work_struct reap_task; + struct work_struct skb_task; + struct work_struct rx_reap_task; + struct delayed_work stale_task; + struct sk_buff_head skb_queue; + struct list_head start_list; + struct list_head reap_list; + struct ib_wc ibwc[IPOIB_NUM_WC]; + struct ib_sge rx_sge[IPOIB_CM_RX_SG]; + struct ib_recv_wr rx_wr; + int nonsrq_conn_qp; + int max_cm_mtu; + int num_frags; +}; + +struct ipoib_ethtool_st { + u16 coalesce_usecs; + u16 max_coalesced_frames; +}; + +struct ipoib_neigh_table; + +struct ipoib_neigh_hash { + struct ipoib_neigh_table *ntbl; + struct ipoib_neigh __rcu **buckets; + struct rcu_head rcu; + u32 mask; + u32 size; +}; + +struct ipoib_neigh_table { + struct ipoib_neigh_hash __rcu *htbl; + atomic_t entries; + struct completion flushed; + struct completion deleted; +}; + +struct ipoib_qp_state_validate { + struct work_struct work; + struct ipoib_dev_priv *priv; +}; + +/* + * Device private locking: network stack tx_lock protects members used + * in TX fast path, lock protects everything else. 
lock nests inside + * of tx_lock (ie tx_lock must be acquired first if needed). + */ +struct ipoib_dev_priv { + spinlock_t lock; + + struct net_device *dev; + + struct napi_struct send_napi; + struct napi_struct recv_napi; + + unsigned long flags; + + struct rw_semaphore vlan_rwsem; + struct mutex mcast_mutex; + struct mutex sysfs_mutex; + + struct rb_root path_tree; + struct list_head path_list; + + struct ipoib_neigh_table ntbl; + + struct ipoib_mcast *broadcast; + struct list_head multicast_list; + struct rb_root multicast_tree; + + struct workqueue_struct *wq; + struct delayed_work mcast_task; + struct work_struct carrier_on_task; + struct work_struct flush_light; + struct work_struct flush_normal; + struct work_struct flush_heavy; + struct work_struct restart_task; + struct delayed_work ah_reap_task; + struct delayed_work neigh_reap_task; + struct ib_device *ca; + u8 port; + u16 pkey; + u16 pkey_index; + struct ib_pd *pd; + struct ib_cq *recv_cq; + struct ib_cq *send_cq; + struct ib_qp *qp; + u32 qkey; + + union ib_gid local_gid; + u32 local_lid; + + unsigned int admin_mtu; + unsigned int mcast_mtu; + unsigned int max_ib_mtu; + + struct ipoib_rx_buf *rx_ring; + + struct ipoib_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + struct ib_sge tx_sge[MAX_SKB_FRAGS + 1]; + struct ib_ud_wr tx_wr; + struct ib_wc send_wc[MAX_SEND_CQE]; + + struct ib_recv_wr rx_wr; + struct ib_sge rx_sge[IPOIB_UD_RX_SG]; + + struct ib_wc ibwc[IPOIB_NUM_WC]; + + struct list_head dead_ahs; + + struct ib_event_handler event_handler; + + struct net_device *parent; + struct list_head child_intfs; + struct list_head list; + int child_type; + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_dev_priv cm; +#endif + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + struct list_head fs_list; + struct dentry *mcg_dentry; + struct dentry *path_dentry; +#endif + u64 hca_caps; + struct ipoib_ethtool_st ethtool; + unsigned max_send_sge; + bool sm_fullmember_sendonly_support; + const struct net_device_ops *rn_ops; +}; + +struct ipoib_ah { + struct net_device *dev; + struct ib_ah *ah; + struct list_head list; + struct kref ref; + unsigned last_send; +}; + +struct ipoib_path { + struct net_device *dev; + struct sa_path_rec pathrec; + struct ipoib_ah *ah; + struct sk_buff_head queue; + + struct list_head neigh_list; + + int query_id; + struct ib_sa_query *query; + struct completion done; + + struct rb_node rb_node; + struct list_head list; + int valid; +}; + +struct ipoib_neigh { + struct ipoib_ah *ah; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_tx *cm; +#endif + u8 daddr[INFINIBAND_ALEN]; + struct sk_buff_head queue; + + struct net_device *dev; + + struct list_head list; + struct ipoib_neigh __rcu *hnext; + struct rcu_head rcu; + atomic_t refcnt; + unsigned long alive; +}; + +#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) +#define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh); +static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) +{ + if (atomic_dec_and_test(&neigh->refcnt)) + ipoib_neigh_dtor(neigh); +} +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr); +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev); +void ipoib_neigh_free(struct ipoib_neigh *neigh); +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid); + +extern struct workqueue_struct *ipoib_workqueue; + +/* functions */ + +int ipoib_rx_poll(struct napi_struct *napi, int budget); +int ipoib_tx_poll(struct napi_struct *napi, int 
budget); +void ipoib_ib_rx_completion(struct ib_cq *cq, void *ctx_ptr); +void ipoib_ib_tx_completion(struct ib_cq *cq, void *ctx_ptr); + +struct ipoib_ah *ipoib_create_ah(struct net_device *dev, + struct ib_pd *pd, struct rdma_ah_attr *attr); +void ipoib_free_ah(struct kref *kref); +static inline void ipoib_put_ah(struct ipoib_ah *ah) +{ + kref_put(&ah->ref, ipoib_free_ah); +} +int ipoib_open(struct net_device *dev); +int ipoib_add_pkey_attr(struct net_device *dev); +int ipoib_add_umcast_attr(struct net_device *dev); + +int ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn); +void ipoib_reap_ah(struct work_struct *work); + +struct ipoib_path *__path_find(struct net_device *dev, void *gid); +void ipoib_mark_paths_invalid(struct net_device *dev); +void ipoib_flush_paths(struct net_device *dev); +struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port, + const char *format); +void ipoib_ib_tx_timer_func(struct timer_list *t); +void ipoib_ib_dev_flush_light(struct work_struct *work); +void ipoib_ib_dev_flush_normal(struct work_struct *work); +void ipoib_ib_dev_flush_heavy(struct work_struct *work); +void ipoib_pkey_event(struct work_struct *work); +void ipoib_ib_dev_cleanup(struct net_device *dev); + +int ipoib_ib_dev_open_default(struct net_device *dev); +int ipoib_ib_dev_open(struct net_device *dev); +int ipoib_ib_dev_stop(struct net_device *dev); +void ipoib_ib_dev_up(struct net_device *dev); +void ipoib_ib_dev_down(struct net_device *dev); +int ipoib_ib_dev_stop_default(struct net_device *dev); +void ipoib_pkey_dev_check_presence(struct net_device *dev); + +int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); +void ipoib_dev_cleanup(struct net_device *dev); + +void ipoib_mcast_join_task(struct work_struct *work); +void ipoib_mcast_carrier_on_task(struct work_struct *work); +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); + +void ipoib_mcast_restart_task(struct work_struct *work); +void ipoib_mcast_start_thread(struct net_device *dev); +int ipoib_mcast_stop_thread(struct net_device *dev); + +void ipoib_mcast_dev_down(struct net_device *dev); +void ipoib_mcast_dev_flush(struct net_device *dev); + +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); +void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv, + struct ipoib_tx_buf *tx_req); + +static inline void ipoib_build_sge(struct ipoib_dev_priv *priv, + struct ipoib_tx_buf *tx_req) +{ + int i, off; + struct sk_buff *skb = tx_req->skb; + skb_frag_t *frags = skb_shinfo(skb)->frags; + int nr_frags = skb_shinfo(skb)->nr_frags; + u64 *mapping = tx_req->mapping; + + if (skb_headlen(skb)) { + priv->tx_sge[0].addr = mapping[0]; + priv->tx_sge[0].length = skb_headlen(skb); + off = 1; + } else + off = 0; + + for (i = 0; i < nr_frags; ++i) { + priv->tx_sge[i + off].addr = mapping[i + off]; + priv->tx_sge[i + off].length = skb_frag_size(&frags[i]); + } + priv->tx_wr.wr.num_sge = nr_frags + off; +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev); +int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); +void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, + union ib_gid *gid, + unsigned long *created, + unsigned int *queuelen, + unsigned int *complete, + unsigned int *send_only); + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev); +int ipoib_path_iter_next(struct ipoib_path_iter *iter); +void ipoib_path_iter_read(struct ipoib_path_iter 
*iter, + struct ipoib_path *path); +#endif + +int ipoib_mcast_attach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid, int set_qkey, u32 qkey); +int ipoib_mcast_detach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid); +void ipoib_mcast_remove_list(struct list_head *remove_list); +void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid, + struct list_head *remove_list); + +int ipoib_init_qp(struct net_device *dev); +int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca); +void ipoib_transport_dev_cleanup(struct net_device *dev); + +void ipoib_event(struct ib_event_handler *handler, + struct ib_event *record); + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey); +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); + +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int child_type); + +int __init ipoib_netlink_init(void); +void __exit ipoib_netlink_fini(void); + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val); +int ipoib_set_mode(struct net_device *dev, const char *buf); + +void ipoib_setup_common(struct net_device *dev); + +void ipoib_pkey_open(struct ipoib_dev_priv *priv); +void ipoib_drain_cq(struct net_device *dev); + +void ipoib_set_ethtool_ops(struct net_device *dev); +void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); + +#define IPOIB_FLAGS_RC 0x80 +#define IPOIB_FLAGS_UC 0x40 + +/* We don't support UC connections at the moment */ +#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + +extern int ipoib_max_conn_qp; + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + return IPOIB_CM_SUPPORTED(dev->dev_addr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + return IPOIB_CM_SUPPORTED(hwaddr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags); +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return neigh->cm; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ + neigh->cm = tx; +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + return !!priv->cm.srq; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + return priv->cm.max_cm_mtu; +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx); +int ipoib_cm_dev_open(struct net_device *dev); +void ipoib_cm_dev_stop(struct net_device *dev); +int ipoib_cm_dev_init(struct net_device *dev); +int ipoib_cm_add_mode_attr(struct net_device *dev); +void ipoib_cm_dev_cleanup(struct net_device *dev); +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh); +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu); +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc); +#else + +struct ipoib_cm_tx; 
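Everything from this point down to the matching #endif mirrors the connected-mode helpers with static inline stubs for kernels built without CONFIG_INFINIBAND_IPOIB_CM, so callers can invoke them unconditionally instead of sprinkling #ifdefs; whether a stub reports success or -ENOSYS depends on what the caller should do when the feature is compiled out, and the stubs that follow use both. A minimal sketch of the same idiom for a hypothetical feature (the names below are illustrative, not from this patch):

struct mydrv_dev;       /* hypothetical device structure */

#ifdef CONFIG_MYDRV_FEATURE_X
/* real implementations live in feature_x.c */
int mydrv_feature_x_init(struct mydrv_dev *dev);
void mydrv_feature_x_handle_event(struct mydrv_dev *dev);
#else
static inline int mydrv_feature_x_init(struct mydrv_dev *dev)
{
        return 0;       /* compiled out: report success so callers carry on */
}

static inline void mydrv_feature_x_handle_event(struct mydrv_dev *dev)
{
}
#endif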
+ +#define ipoib_max_conn_qp 0 + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + return 0; +} +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) + +{ + return 0; +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return 0; +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + return 0; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_dev_open(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_dev_stop(struct net_device *dev) +{ + return; +} + +static inline +int ipoib_cm_dev_init(struct net_device *dev) +{ + return -ENOSYS; +} + +static inline +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + return; +} + +static inline +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return 0; +} + +static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + dev_kfree_skb_any(skb); +} + +static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} + +static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} +#endif + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +void ipoib_create_debug_files(struct net_device *dev); +void ipoib_delete_debug_files(struct net_device *dev); +int ipoib_register_debugfs(void); +void ipoib_unregister_debugfs(void); +#else +static inline void ipoib_create_debug_files(struct net_device *dev) { } +static inline void ipoib_delete_debug_files(struct net_device *dev) { } +static inline int ipoib_register_debugfs(void) { return 0; } +static inline void ipoib_unregister_debugfs(void) { } +#endif + +#define ipoib_printk(level, priv, format, arg...) \ + printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg) +#define ipoib_warn(priv, format, arg...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + 10 * HZ /*10 seconds */, \ + 100); \ + if (__ratelimit(&_rs)) \ + ipoib_printk(KERN_WARNING, priv, format , ## arg);\ +} while (0) + +extern int ipoib_sendq_size; +extern int ipoib_recvq_size; + +extern struct ib_sa_client ipoib_sa_client; + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +extern int ipoib_debug_level; + +#define ipoib_dbg(priv, format, arg...) \ + do { \ + if (ipoib_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#define ipoib_dbg_mcast(priv, format, arg...) \ + do { \ + if (mcast_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */ +#define ipoib_dbg(priv, format, arg...) \ + do { (void) (priv); } while (0) +#define ipoib_dbg_mcast(priv, format, arg...) \ + do { (void) (priv); } while (0) +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +#define ipoib_dbg_data(priv, format, arg...) 
\ + do { \ + if (data_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ +#define ipoib_dbg_data(priv, format, arg...) \ + do { (void) (priv); } while (0) +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ + +#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) + +extern const char ipoib_driver_version[]; + +#endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c new file mode 100644 index 0000000..962fbcb --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -0,0 +1,1663 @@ +/* + * Copyright (c) 2006 Mellanox Technologies. All rights reserved + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipoib.h" + +int ipoib_max_conn_qp = 128; + +module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444); +MODULE_PARM_DESC(max_nonsrq_conn_qp, + "Max number of connected-mode QPs per interface " + "(applied only if shared receive queue is not available)"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param_named(cm_data_debug_level, data_debug_level, int, 0644); +MODULE_PARM_DESC(cm_data_debug_level, + "Enable data path debug tracing for connected mode if > 0"); +#endif + +#define IPOIB_CM_IETF_ID 0x1000000000000000ULL + +#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ) +#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ) +#define IPOIB_CM_RX_DELAY (3 * 256 * HZ) +#define IPOIB_CM_RX_UPDATE_MASK (0x3) + +#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN) + +static struct ib_qp_attr ipoib_cm_err_attr = { + .qp_state = IB_QPS_ERR +}; + +#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff + +static struct ib_send_wr ipoib_cm_rx_drain_wr = { + .opcode = IB_WR_SEND, +}; + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event); + +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, + u64 mapping[IPOIB_CM_RX_SG]) +{ + int i; + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (i = 0; i < frags; ++i) + ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); +} + +static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < priv->cm.num_frags; ++i) + priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; + + ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1, + priv->cm.srq_ring[id].mapping); + dev_kfree_skb_any(priv->cm.srq_ring[id].skb); + priv->cm.srq_ring[id].skb = NULL; + } + + return ret; +} + +static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, + struct ipoib_cm_rx *rx, + struct ib_recv_wr *wr, + struct ib_sge *sge, int id) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < IPOIB_CM_RX_SG; ++i) + sge[i].addr = rx->rx_ring[id].mapping[i]; + + ret = ib_post_recv(rx->qp, wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx->rx_ring[id].mapping); + dev_kfree_skb_any(rx->rx_ring[id].skb); + rx->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring, + int id, int frags, + u64 mapping[IPOIB_CM_RX_SG], + gfp_t gfp) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct sk_buff *skb; + int i; + + skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16)); + if (unlikely(!skb)) + return NULL; + + /* + * IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the + * IP header to a multiple of 16. 
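+ * IPOIB_CM_RX_RESERVE also leaves enough headroom for the pseudo header
+ * pushed by skb_add_pseudo_hdr() on the receive path.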
+ */ + skb_reserve(skb, IPOIB_CM_RX_RESERVE); + + mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) { + dev_kfree_skb_any(skb); + return NULL; + } + + for (i = 0; i < frags; i++) { + struct page *page = alloc_page(gfp); + + if (!page) + goto partial_error; + skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); + + mapping[i + 1] = ib_dma_map_page(priv->ca, page, + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1]))) + goto partial_error; + } + + rx_ring[id].skb = skb; + return skb; + +partial_error: + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (; i > 0; --i) + ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); + + dev_kfree_skb_any(skb); + return NULL; +} + +static void ipoib_cm_free_rx_ring(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) + if (rx_ring[i].skb) { + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx_ring[i].mapping); + dev_kfree_skb_any(rx_ring[i].skb); + } + + vfree(rx_ring); +} + +static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) +{ + struct ib_send_wr *bad_wr; + struct ipoib_cm_rx *p; + + /* We only reserved 1 extra slot in CQ for drain WRs, so + * make sure we have at most 1 outstanding WR. */ + if (list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) + return; + + /* + * QPs on flush list are error state. This way, a "flush + * error" WC will be immediately generated for each WR we post. + */ + p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); + ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID; + if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) + ipoib_warn(priv, "failed to post drain wr\n"); + + list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); +} + +static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) +{ + struct ipoib_cm_rx *p = ctx; + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); + unsigned long flags; + + if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) + return; + + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_flush_list); + p->state = IPOIB_CM_RX_FLUSH; + ipoib_cm_start_rx_drain(priv); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, + struct ipoib_cm_rx *p) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_init_attr attr = { + .event_handler = ipoib_cm_rx_event_handler, + .send_cq = priv->recv_cq, /* For drain WR */ + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = 1, /* For drain WR */ + .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */ + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = p, + }; + + if (!ipoib_cm_has_srq(dev)) { + attr.cap.max_recv_wr = ipoib_recvq_size; + attr.cap.max_recv_sge = IPOIB_CM_RX_SG; + } + + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_modify_rx_qp(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp, + unsigned psn) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); + return ret; + } + 
ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); + return ret; + } + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + qp_attr.rq_psn = psn; + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + /* + * Current Mellanox HCA firmware won't generate completions + * with error for drain WRs unless the QP has been moved to + * RTS first. This work-around leaves a window where a QP has + * moved to error asynchronously, but this will eventually get + * fixed in firmware, so let's not error out if modify QP + * fails. + */ + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return 0; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return 0; + } + + return 0; +} + +static void ipoib_cm_init_rx_wr(struct net_device *dev, + struct ib_recv_wr *wr, + struct ib_sge *sge) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int i; + + for (i = 0; i < priv->cm.num_frags; ++i) + sge[i].lkey = priv->pd->local_dma_lkey; + + sge[0].length = IPOIB_CM_HEAD_SIZE; + for (i = 1; i < priv->cm.num_frags; ++i) + sge[i].length = PAGE_SIZE; + + wr->next = NULL; + wr->sg_list = sge; + wr->num_sge = priv->cm.num_frags; +} + +static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id, + struct ipoib_cm_rx *rx) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct { + struct ib_recv_wr wr; + struct ib_sge sge[IPOIB_CM_RX_SG]; + } *t; + int ret; + int i; + + rx->rx_ring = vzalloc(ipoib_recvq_size * sizeof *rx->rx_ring); + if (!rx->rx_ring) + return -ENOMEM; + + t = kmalloc(sizeof *t, GFP_KERNEL); + if (!t) { + ret = -ENOMEM; + goto err_free_1; + } + + ipoib_cm_init_rx_wr(dev, &t->wr, t->sge); + + spin_lock_irq(&priv->lock); + + if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) { + spin_unlock_irq(&priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0); + ret = -EINVAL; + goto err_free; + } else + ++priv->cm.nonsrq_conn_qp; + + spin_unlock_irq(&priv->lock); + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, + rx->rx_ring[i].mapping, + GFP_KERNEL)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + ret = -ENOMEM; + goto err_count; + } + ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i); + if (ret) { + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " + "failed for buf %d\n", i); + ret = -EIO; + goto err_count; + } + } + + rx->recv_count = ipoib_recvq_size; + + kfree(t); + + return 0; + +err_count: + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + +err_free: + kfree(t); + +err_free_1: + ipoib_cm_free_rx_ring(dev, rx->rx_ring); + + return ret; +} + +static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, + struct ib_qp *qp, struct ib_cm_req_event_param *req, + unsigned psn) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_rep_param rep = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + rep.private_data = &data; + 
rep.private_data_len = sizeof data; + rep.flow_control = 0; + rep.rnr_retry_count = req->rnr_retry_count; + rep.srq = ipoib_cm_has_srq(dev); + rep.qp_num = qp->qp_num; + rep.starting_psn = psn; + return ib_send_cm_rep(cm_id, &rep); +} + +static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct net_device *dev = cm_id->context; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_rx *p; + unsigned psn; + int ret; + + ipoib_dbg(priv, "REQ arrived\n"); + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + p->dev = dev; + p->id = cm_id; + cm_id->context = p; + p->state = IPOIB_CM_RX_LIVE; + p->jiffies = jiffies; + INIT_LIST_HEAD(&p->list); + + p->qp = ipoib_cm_create_rx_qp(dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + goto err_qp; + } + + psn = prandom_u32() & 0xffffff; + ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); + if (ret) + goto err_modify; + + if (!ipoib_cm_has_srq(dev)) { + ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p); + if (ret) + goto err_modify; + } + + spin_lock_irq(&priv->lock); + queue_delayed_work(priv->wq, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + /* Add this entry to passive ids list head, but do not re-add it + * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ + p->jiffies = jiffies; + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irq(&priv->lock); + + ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn); + if (ret) { + ipoib_warn(priv, "failed to send REP: %d\n", ret); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + } + return 0; + +err_modify: + ib_destroy_qp(p->qp); +err_qp: + kfree(p); + return ret; +} + +static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_rx *p; + struct ipoib_dev_priv *priv; + + switch (event->event) { + case IB_CM_REQ_RECEIVED: + return ipoib_cm_req_handler(cm_id, event); + case IB_CM_DREQ_RECEIVED: + ib_send_cm_drep(cm_id, NULL, 0); + /* Fall through */ + case IB_CM_REJ_RECEIVED: + p = cm_id->context; + priv = ipoib_priv(p->dev); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + /* Fall through */ + default: + return 0; + } +} +/* Adjust length of skb with fragments to match received data */ +static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, + unsigned int length, struct sk_buff *toskb) +{ + int i, num_frags; + unsigned int size; + + /* put header into skb */ + size = min(length, hdr_space); + skb->tail += size; + skb->len += size; + length -= size; + + num_frags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < num_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (length == 0) { + /* don't need this page */ + skb_fill_page_desc(toskb, i, skb_frag_page(frag), + 0, PAGE_SIZE); + --skb_shinfo(skb)->nr_frags; + } else { + size = min(length, (unsigned) PAGE_SIZE); + + skb_frag_size_set(frag, size); + skb->data_len += size; + skb->truesize += size; + skb->len += size; + length -= size; + } + } +} + +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_rx_buf *rx_ring; + unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); + struct sk_buff *skb, *newskb; + struct ipoib_cm_rx *p; + unsigned long flags; + u64 mapping[IPOIB_CM_RX_SG]; + int frags; + int has_srq; + 
struct sk_buff *small_skb; + + ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { + spin_lock_irqsave(&priv->lock, flags); + list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); + ipoib_cm_start_rx_drain(priv); + queue_work(priv->wq, &priv->cm.rx_reap_task); + spin_unlock_irqrestore(&priv->lock, flags); + } else + ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + p = wc->qp->qp_context; + + has_srq = ipoib_cm_has_srq(dev); + rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring; + + skb = rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + ipoib_dbg(priv, + "cm recv error (status=%d, wrid=%d vend_err %#x)\n", + wc->status, wr_id, wc->vendor_err); + ++dev->stats.rx_dropped; + if (has_srq) + goto repost; + else { + if (!--p->recv_count) { + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_reap_list); + spin_unlock_irqrestore(&priv->lock, flags); + queue_work(priv->wq, &priv->cm.rx_reap_task); + } + return; + } + } + + if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) { + if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { + spin_lock_irqsave(&priv->lock, flags); + p->jiffies = jiffies; + /* Move this entry to list head, but do not re-add it + * if it has been moved out of list. */ + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irqrestore(&priv->lock, flags); + } + } + + if (wc->byte_len < IPOIB_CM_COPYBREAK) { + int dlen = wc->byte_len; + + small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE); + if (small_skb) { + skb_reserve(small_skb, IPOIB_CM_RX_RESERVE); + ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_copy_from_linear_data(skb, small_skb->data, dlen); + ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_put(small_skb, dlen); + skb = small_skb; + goto copied; + } + } + + frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, + (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; + + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, + mapping, GFP_ATOMIC); + if (unlikely(!newskb)) { + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. 
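+ * (the old buffer keeps its DMA mapping and is simply reposted at the
+ * repost: label).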
+ */ + ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id); + ++dev->stats.rx_dropped; + goto repost; + } + + ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping); + memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping); + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb); + +copied: + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_add_pseudo_hdr(skb); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; + + skb->dev = dev; + /* XXX get correct PACKET_ type here */ + skb->pkt_type = PACKET_HOST; + netif_receive_skb(skb); + +repost: + if (has_srq) { + if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id))) + ipoib_warn(priv, "ipoib_cm_post_receive_srq failed " + "for buf %d\n", wr_id); + } else { + if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, + &priv->cm.rx_wr, + priv->cm.rx_sge, + wr_id))) { + --p->recv_count; + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed " + "for buf %d\n", wr_id); + } + } +} + +static inline int post_send(struct ipoib_dev_priv *priv, + struct ipoib_cm_tx *tx, + unsigned int wr_id, + struct ipoib_tx_buf *tx_req) +{ + struct ib_send_wr *bad_wr; + + ipoib_build_sge(priv, tx_req); + + priv->tx_wr.wr.wr_id = wr_id | IPOIB_OP_CM; + + return ib_post_send(tx->qp, &priv->tx_wr.wr, &bad_wr); +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_tx_buf *tx_req; + int rc; + unsigned usable_sge = tx->max_send_sge - !!skb_headlen(skb); + + if (unlikely(skb->len > tx->mtu)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, tx->mtu); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN); + return; + } + if (skb_shinfo(skb)->nr_frags > usable_sge) { + if (skb_linearize(skb) < 0) { + ipoib_warn(priv, "skb could not be linearized\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + /* Does skb_linearize return ok without reducing nr_frags? */ + if (skb_shinfo(skb)->nr_frags > usable_sge) { + ipoib_warn(priv, "too many frags after skb linearize\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + } + ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", + tx->tx_head, skb->len, tx->qp->qp_num); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). 
+ */ + tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + if ((priv->tx_head - priv->tx_tail) == ipoib_sendq_size - 1) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + netif_stop_queue(dev); + } + + skb_orphan(skb); + skb_dst_drop(skb); + + if (netif_queue_stopped(dev)) { + rc = ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + if (unlikely(rc < 0)) + ipoib_warn(priv, "IPoIB/CM:request notify on send CQ failed\n"); + else if (rc) + napi_schedule(&priv->send_napi); + } + + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req); + if (unlikely(rc)) { + ipoib_warn(priv, "IPoIB/CM:post_send failed, error %d\n", rc); + ++dev->stats.tx_errors; + ipoib_dma_unmap_tx(priv, tx_req); + dev_kfree_skb_any(skb); + + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + } else { + netif_trans_update(dev); + ++tx->tx_head; + ++priv->tx_head; + } +} + +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_tx *tx = wc->qp->qp_context; + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; + struct ipoib_tx_buf *tx_req; + unsigned long flags; + + ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &tx->tx_ring[wr_id]; + + ipoib_dma_unmap_tx(priv, tx_req); + + /* FIXME: is this right? Shouldn't we only increment on success? */ + ++dev->stats.tx_packets; + dev->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + netif_tx_lock(dev); + + ++tx->tx_tail; + ++priv->tx_tail; + + if (unlikely(netif_queue_stopped(dev) && + (priv->tx_head - priv->tx_tail) <= ipoib_sendq_size >> 1 && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))) + netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_neigh *neigh; + + /* IB_WC[_RNR]_RETRY_EXC_ERR error is part of the life cycle, + * so don't make waves. 
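+ * Such statuses are logged at debug level below; any other failure
+ * status is reported with ipoib_warn().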
+ */ + if (wc->status == IB_WC_RNR_RETRY_EXC_ERR || + wc->status == IB_WC_RETRY_EXC_ERR) + ipoib_dbg(priv, + "%s: failed cm send event (status=%d, wrid=%d vend_err %#x)\n", + __func__, wc->status, wr_id, wc->vendor_err); + else + ipoib_warn(priv, + "%s: failed cm send event (status=%d, wrid=%d vend_err %#x)\n", + __func__, wc->status, wr_id, wc->vendor_err); + + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(priv->wq, &priv->cm.reap_task); + } + + clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); + + spin_unlock_irqrestore(&priv->lock, flags); + } + + netif_tx_unlock(dev); +} + +int ipoib_cm_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + return 0; + + priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); + if (IS_ERR(priv->cm.id)) { + pr_warn("%s: failed to create CM ID\n", priv->ca->name); + ret = PTR_ERR(priv->cm.id); + goto err_cm; + } + + ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), + 0); + if (ret) { + pr_warn("%s: failed to listen on ID 0x%llx\n", priv->ca->name, + IPOIB_CM_IETF_ID | priv->qp->qp_num); + goto err_listen; + } + + return 0; + +err_listen: + ib_destroy_cm_id(priv->cm.id); +err_cm: + priv->cm.id = NULL; + return ret; +} + +static void ipoib_cm_free_rx_reap_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_rx *rx, *n; + LIST_HEAD(list); + + spin_lock_irq(&priv->lock); + list_splice_init(&priv->cm.rx_reap_list, &list); + spin_unlock_irq(&priv->lock); + + list_for_each_entry_safe(rx, n, &list, list) { + ib_destroy_cm_id(rx->id); + ib_destroy_qp(rx->qp); + if (!ipoib_cm_has_srq(dev)) { + ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring); + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + } + kfree(rx); + } +} + +void ipoib_cm_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_rx *p; + unsigned long begin; + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id) + return; + + ib_destroy_cm_id(priv->cm.id); + priv->cm.id = NULL; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + /* Wait for all RX to be drained */ + begin = jiffies; + + while (!list_empty(&priv->cm.rx_error_list) || + !list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "RX drain timing out\n"); + + /* + * assume the HW is wedged and just free up everything. 
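+ * Connections still on the error/flush/drain lists are moved to the
+ * reap list and released by ipoib_cm_free_rx_reap_list() below.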
+ */ + list_splice_init(&priv->cm.rx_flush_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_error_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_drain_list, + &priv->cm.rx_reap_list); + break; + } + spin_unlock_irq(&priv->lock); + usleep_range(1000, 2000); + ipoib_drain_cq(dev); + spin_lock_irq(&priv->lock); + } + + spin_unlock_irq(&priv->lock); + + ipoib_cm_free_rx_reap_list(dev); + + cancel_delayed_work(&priv->cm.stale_task); +} + +static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct ipoib_cm_tx *p = cm_id->context; + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); + struct ipoib_cm_data *data = event->private_data; + struct sk_buff_head skqueue; + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + struct sk_buff *skb; + + p->mtu = be32_to_cpu(data->mtu); + + if (p->mtu <= IPOIB_ENCAP_LEN) { + ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n", + p->mtu, IPOIB_ENCAP_LEN); + return -EINVAL; + } + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + + qp_attr.rq_psn = 0 /* FIXME */; + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return ret; + } + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return ret; + } + + skb_queue_head_init(&skqueue); + + spin_lock_irq(&priv->lock); + set_bit(IPOIB_FLAG_OPER_UP, &p->flags); + if (p->neigh) + while ((skb = __skb_dequeue(&p->neigh->queue))) + __skb_queue_tail(&skqueue, skb); + spin_unlock_irq(&priv->lock); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = p->dev; + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s:dev_queue_xmit failed to re-queue packet, ret:%d\n", + __func__, ret); + } + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + if (ret) { + ipoib_warn(priv, "failed to send RTU: %d\n", ret); + return ret; + } + return 0; +} + +static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_init_attr attr = { + .send_cq = priv->send_cq, + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = ipoib_sendq_size, + .cap.max_send_sge = 1, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = tx, + .create_flags = 0 + }; + struct ib_qp *tx_qp; + + if (dev->features & NETIF_F_SG) + attr.cap.max_send_sge = + min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1); + + tx_qp = ib_create_qp(priv->pd, &attr); + tx->max_send_sge = attr.cap.max_send_sge; + return tx_qp; +} + +static int ipoib_cm_send_req(struct net_device *dev, + struct ib_cm_id *id, struct ib_qp *qp, + u32 qpn, + struct sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_req_param req = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + req.primary_path = pathrec; + req.alternate_path = NULL; + req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn); + req.qp_num = qp->qp_num; + req.qp_type = qp->qp_type; + req.private_data = &data; + req.private_data_len = sizeof 
data; + req.flow_control = 0; + + req.starting_psn = 0; /* FIXME */ + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. + */ + req.responder_resources = 4; + req.remote_cm_response_timeout = 20; + req.local_cm_response_timeout = 20; + req.retry_count = 0; /* RFC draft warns against retries */ + req.rnr_retry_count = 0; /* RFC draft warns against retries */ + req.max_cm_retries = 15; + req.srq = ipoib_cm_has_srq(dev); + return ib_send_cm_req(id, &req); +} + +static int ipoib_cm_modify_tx_init(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); + if (ret) { + ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; + qp_attr.port_num = priv->port; + qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret); + return ret; + } + return 0; +} + +static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, + struct sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); + unsigned int noio_flag; + int ret; + + noio_flag = memalloc_noio_save(); + p->tx_ring = vzalloc(ipoib_sendq_size * sizeof(*p->tx_ring)); + if (!p->tx_ring) { + memalloc_noio_restore(noio_flag); + ret = -ENOMEM; + goto err_tx; + } + memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); + + p->qp = ipoib_cm_create_tx_qp(p->dev, p); + memalloc_noio_restore(noio_flag); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + ipoib_warn(priv, "failed to create tx qp: %d\n", ret); + goto err_qp; + } + + p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p); + if (IS_ERR(p->id)) { + ret = PTR_ERR(p->id); + ipoib_warn(priv, "failed to create tx cm id: %d\n", ret); + goto err_id; + } + + ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp); + if (ret) { + ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); + goto err_modify_send; + } + + ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec); + if (ret) { + ipoib_warn(priv, "failed to send cm req: %d\n", ret); + goto err_modify_send; + } + + ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n", + p->qp->qp_num, pathrec->dgid.raw, qpn); + + return 0; + +err_modify_send: + ib_destroy_cm_id(p->id); +err_id: + p->id = NULL; + ib_destroy_qp(p->qp); +err_qp: + p->qp = NULL; + vfree(p->tx_ring); +err_tx: + return ret; +} + +static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) +{ + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); + struct ipoib_tx_buf *tx_req; + unsigned long begin; + + ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", + p->qp ? 
p->qp->qp_num : 0, p->tx_head, p->tx_tail); + + if (p->id) + ib_destroy_cm_id(p->id); + + if (p->tx_ring) { + /* Wait for all sends to complete */ + begin = jiffies; + while ((int) p->tx_tail - (int) p->tx_head < 0) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "timing out; %d sends not completed\n", + p->tx_head - p->tx_tail); + goto timeout; + } + + usleep_range(1000, 2000); + } + } + +timeout: + + while ((int) p->tx_tail - (int) p->tx_head < 0) { + tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; + ipoib_dma_unmap_tx(priv, tx_req); + dev_kfree_skb_any(tx_req->skb); + netif_tx_lock_bh(p->dev); + ++p->tx_tail; + ++priv->tx_tail; + if (unlikely(priv->tx_head - priv->tx_tail == ipoib_sendq_size >> 1) && + netif_queue_stopped(p->dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(p->dev); + netif_tx_unlock_bh(p->dev); + } + + if (p->qp) + ib_destroy_qp(p->qp); + + vfree(p->tx_ring); + kfree(p); +} + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_tx *tx = cm_id->context; + struct ipoib_dev_priv *priv = ipoib_priv(tx->dev); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + unsigned long flags; + int ret; + + switch (event->event) { + case IB_CM_DREQ_RECEIVED: + ipoib_dbg(priv, "DREQ received.\n"); + ib_send_cm_drep(cm_id, NULL, 0); + break; + case IB_CM_REP_RECEIVED: + ipoib_dbg(priv, "REP received.\n"); + ret = ipoib_cm_rep_handler(cm_id, event); + if (ret) + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + break; + case IB_CM_REQ_ERROR: + case IB_CM_REJ_RECEIVED: + case IB_CM_TIMEWAIT_EXIT: + ipoib_dbg(priv, "CM error %d.\n", event->event); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(priv->wq, &priv->cm.reap_task); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + break; + default: + break; + } + + return 0; +} + +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_tx *tx; + + tx = kzalloc(sizeof *tx, GFP_ATOMIC); + if (!tx) + return NULL; + + neigh->cm = tx; + tx->neigh = neigh; + tx->path = path; + tx->dev = dev; + list_add(&tx->list, &priv->cm.start_list); + set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); + queue_work(priv->wq, &priv->cm.start_task); + return tx; +} + +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = ipoib_priv(tx->dev); + unsigned long flags; + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + spin_lock_irqsave(&priv->lock, flags); + list_move(&tx->list, &priv->cm.reap_list); + queue_work(priv->wq, &priv->cm.reap_task); + ipoib_dbg(priv, "Reap connection for gid %pI6\n", + tx->neigh->daddr + 4); + tx->neigh = NULL; + spin_unlock_irqrestore(&priv->lock, flags); + } +} + +#define QPN_AND_OPTIONS_OFFSET 4 + +static void ipoib_cm_tx_start(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.start_task); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + struct ipoib_cm_tx *p; + unsigned long flags; + struct ipoib_path *path; + int ret; + + struct sa_path_rec pathrec; + u32 qpn; + + 
netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while (!list_empty(&priv->cm.start_list)) { + p = list_entry(priv->cm.start_list.next, typeof(*p), list); + list_del_init(&p->list); + neigh = p->neigh; + + qpn = IPOIB_QPN(neigh->daddr); + /* + * As long as the search is with these 2 locks, + * path existence indicates its validity. + */ + path = __path_find(dev, neigh->daddr + QPN_AND_OPTIONS_OFFSET); + if (!path) { + pr_info("%s ignore not valid path %pI6\n", + __func__, + neigh->daddr + QPN_AND_OPTIONS_OFFSET); + goto free_neigh; + } + memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + ret = ipoib_cm_tx_init(p, qpn, &pathrec); + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + if (ret) { +free_neigh: + neigh = p->neigh; + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + } + list_del(&p->list); + kfree(p); + } + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void ipoib_cm_tx_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.reap_task); + struct net_device *dev = priv->dev; + struct ipoib_cm_tx *p; + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while (!list_empty(&priv->cm.reap_list)) { + p = list_entry(priv->cm.reap_list.next, typeof(*p), list); + list_del_init(&p->list); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + ipoib_cm_tx_destroy(p); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void ipoib_cm_skb_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.skb_task); + struct net_device *dev = priv->dev; + struct sk_buff *skb; + unsigned long flags; + unsigned mtu = priv->mcast_mtu; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while ((skb = skb_dequeue(&priv->cm.skb_queue))) { + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); +#if IS_ENABLED(CONFIG_IPV6) + else if (skb->protocol == htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); +#endif + dev_kfree_skb_any(skb); + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int e = skb_queue_empty(&priv->cm.skb_queue); + + skb_dst_update_pmtu(skb, mtu); + + skb_queue_tail(&priv->cm.skb_queue, skb); + if (e) + queue_work(priv->wq, &priv->cm.skb_task); +} + +static void ipoib_cm_rx_reap(struct work_struct *work) +{ + ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv, + cm.rx_reap_task)->dev); +} + +static void ipoib_cm_stale_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.stale_task.work); + struct ipoib_cm_rx *p; + int ret; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + /* List is sorted by LRU, start from tail, + * stop when we see a recently used entry */ + p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); + if 
(time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) + break; + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + if (!list_empty(&priv->cm.passive_ids)) + queue_delayed_work(priv->wq, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + spin_unlock_irq(&priv->lock); +} + +static ssize_t show_mode(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + return sprintf(buf, "connected\n"); + else + return sprintf(buf, "datagram\n"); +} + +static ssize_t set_mode(struct device *d, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(d); + int ret; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (test_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags)) + return -EPERM; + + if (!mutex_trylock(&priv->sysfs_mutex)) + return restart_syscall(); + + if (!rtnl_trylock()) { + mutex_unlock(&priv->sysfs_mutex); + return restart_syscall(); + } + + ret = ipoib_set_mode(dev, buf); + + /* The assumption is that the function ipoib_set_mode returned + * with the rtnl held by it, if not the value -EBUSY returned, + * then no need to rtnl_unlock + */ + if (ret != -EBUSY) + rtnl_unlock(); + mutex_unlock(&priv->sysfs_mutex); + + return (!ret || ret == -EBUSY) ? count : ret; +} + +static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode); + +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_mode); +} + +static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_srq_init_attr srq_init_attr = { + .srq_type = IB_SRQT_BASIC, + .attr = { + .max_wr = ipoib_recvq_size, + .max_sge = max_sge + } + }; + + priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); + if (IS_ERR(priv->cm.srq)) { + if (PTR_ERR(priv->cm.srq) != -ENOSYS) + pr_warn("%s: failed to allocate SRQ, error %ld\n", + priv->ca->name, PTR_ERR(priv->cm.srq)); + priv->cm.srq = NULL; + return; + } + + priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); + if (!priv->cm.srq_ring) { + ib_destroy_srq(priv->cm.srq); + priv->cm.srq = NULL; + return; + } + +} + +int ipoib_cm_dev_init(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int max_srq_sge, i; + + INIT_LIST_HEAD(&priv->cm.passive_ids); + INIT_LIST_HEAD(&priv->cm.reap_list); + INIT_LIST_HEAD(&priv->cm.start_list); + INIT_LIST_HEAD(&priv->cm.rx_error_list); + INIT_LIST_HEAD(&priv->cm.rx_flush_list); + INIT_LIST_HEAD(&priv->cm.rx_drain_list); + INIT_LIST_HEAD(&priv->cm.rx_reap_list); + INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); + INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); + INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap); + INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap); + INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); + + skb_queue_head_init(&priv->cm.skb_queue); + + ipoib_dbg(priv, "max_srq_sge=%d\n", priv->ca->attrs.max_srq_sge); + + max_srq_sge = min_t(int, IPOIB_CM_RX_SG, priv->ca->attrs.max_srq_sge); + ipoib_cm_create_srq(dev, max_srq_sge); + if (ipoib_cm_has_srq(dev)) { + priv->cm.max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10; + priv->cm.num_frags = 
max_srq_sge; + ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", + priv->cm.max_cm_mtu, priv->cm.num_frags); + } else { + priv->cm.max_cm_mtu = IPOIB_CM_MTU; + priv->cm.num_frags = IPOIB_CM_RX_SG; + } + + ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge); + + if (ipoib_cm_has_srq(dev)) { + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, + priv->cm.num_frags - 1, + priv->cm.srq_ring[i].mapping, + GFP_KERNEL)) { + ipoib_warn(priv, "failed to allocate " + "receive buffer %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -ENOMEM; + } + + if (ipoib_cm_post_receive_srq(dev, i)) { + ipoib_warn(priv, "ipoib_cm_post_receive_srq " + "failed for buf %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -EIO; + } + } + } + + priv->dev->dev_addr[0] = IPOIB_FLAGS_RC; + return 0; +} + +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + if (!priv->cm.srq) + return; + + ipoib_dbg(priv, "Cleanup ipoib connected mode.\n"); + + ret = ib_destroy_srq(priv->cm.srq); + if (ret) + ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret); + + priv->cm.srq = NULL; + if (!priv->cm.srq_ring) + return; + + ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring); + priv->cm.srq_ring = NULL; +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c new file mode 100644 index 0000000..2706bf2 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include "ipoib.h" + +struct ipoib_stats { + char stat_string[ETH_GSTRING_LEN]; + int stat_offset; +}; + +#define IPOIB_NETDEV_STAT(m) { \ + .stat_string = #m, \ + .stat_offset = offsetof(struct rtnl_link_stats64, m) } + +static const struct ipoib_stats ipoib_gstrings_stats[] = { + IPOIB_NETDEV_STAT(rx_packets), + IPOIB_NETDEV_STAT(tx_packets), + IPOIB_NETDEV_STAT(rx_bytes), + IPOIB_NETDEV_STAT(tx_bytes), + IPOIB_NETDEV_STAT(tx_errors), + IPOIB_NETDEV_STAT(rx_dropped), + IPOIB_NETDEV_STAT(tx_dropped), + IPOIB_NETDEV_STAT(multicast), +}; + +#define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats) + +static void ipoib_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct ipoib_dev_priv *priv = ipoib_priv(netdev); + + ib_get_device_fw_str(priv->ca, drvinfo->fw_version); + + strlcpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent), + sizeof(drvinfo->bus_info)); + + strlcpy(drvinfo->version, ipoib_driver_version, + sizeof(drvinfo->version)); + + strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver)); +} + +static int ipoib_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; + coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; + + return 0; +} + +static int ipoib_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + /* + * These values are saved in the private data and returned + * when ipoib_get_coalesce() is called + */ + if (coal->rx_coalesce_usecs > 0xffff || + coal->rx_max_coalesced_frames > 0xffff) + return -EINVAL; + + ret = rdma_set_cq_moderation(priv->recv_cq, + coal->rx_max_coalesced_frames, + coal->rx_coalesce_usecs); + if (ret && ret != -ENOSYS) { + ipoib_warn(priv, "failed modifying CQ (%d)\n", ret); + return ret; + } + + priv->ethtool.coalesce_usecs = coal->rx_coalesce_usecs; + priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames; + + return 0; +} +static void ipoib_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats __always_unused *stats, + u64 *data) +{ + int i; + struct net_device_stats *net_stats = &dev->stats; + u8 *p = (u8 *)net_stats; + + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) + data[i] = *(u64 *)(p + ipoib_gstrings_stats[i].stat_offset); + +} +static void ipoib_get_strings(struct net_device __always_unused *dev, + u32 stringset, u8 *data) +{ + u8 *p = data; + int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) { + memcpy(p, ipoib_gstrings_stats[i].stat_string, + ETH_GSTRING_LEN); + p += ETH_GSTRING_LEN; + } + break; + case ETH_SS_TEST: + default: + break; + } +} +static int ipoib_get_sset_count(struct net_device __always_unused *dev, + int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return IPOIB_GLOBAL_STATS_LEN; + case ETH_SS_TEST: + default: + break; + } + return -EOPNOTSUPP; +} + +/* Return lane speed in unit of 1e6 bit/sec */ +static inline int ib_speed_enum_to_int(int speed) +{ + switch (speed) { + case IB_SPEED_SDR: + return SPEED_2500; + case IB_SPEED_DDR: + return SPEED_5000; + case IB_SPEED_QDR: + case IB_SPEED_FDR10: + return SPEED_10000; + case IB_SPEED_FDR: + return SPEED_14000; + case IB_SPEED_EDR: + return SPEED_25000; + } + + return SPEED_UNKNOWN; +} + +static int ipoib_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) +{ + 
struct ipoib_dev_priv *priv = ipoib_priv(netdev); + struct ib_port_attr attr; + int ret, speed, width; + + if (!netif_carrier_ok(netdev)) { + cmd->base.speed = SPEED_UNKNOWN; + cmd->base.duplex = DUPLEX_UNKNOWN; + return 0; + } + + ret = ib_query_port(priv->ca, priv->port, &attr); + if (ret < 0) + return -EINVAL; + + speed = ib_speed_enum_to_int(attr.active_speed); + width = ib_width_enum_to_int(attr.active_width); + + if (speed < 0 || width < 0) + return -EINVAL; + + /* Except the following are set, the other members of + * the struct ethtool_link_settings are initialized to + * zero in the function __ethtool_get_link_ksettings. + */ + cmd->base.speed = speed * width; + cmd->base.duplex = DUPLEX_FULL; + + cmd->base.phy_address = 0xFF; + + cmd->base.autoneg = AUTONEG_ENABLE; + cmd->base.port = PORT_OTHER; + + return 0; +} + +static const struct ethtool_ops ipoib_ethtool_ops = { + .get_link_ksettings = ipoib_get_link_ksettings, + .get_drvinfo = ipoib_get_drvinfo, + .get_coalesce = ipoib_get_coalesce, + .set_coalesce = ipoib_set_coalesce, + .get_strings = ipoib_get_strings, + .get_ethtool_stats = ipoib_get_ethtool_stats, + .get_sset_count = ipoib_get_sset_count, +}; + +void ipoib_set_ethtool_ops(struct net_device *dev) +{ + dev->ethtool_ops = &ipoib_ethtool_ops; +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c new file mode 100644 index 0000000..ea302b0 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +struct file_operations; + +#include +#include + +#include "ipoib.h" + +static struct dentry *ipoib_root; + +static void format_gid(union ib_gid *gid, char *buf) +{ + int i, n; + + for (n = 0, i = 0; i < 8; ++i) { + n += sprintf(buf + n, "%x", + be16_to_cpu(((__be16 *) gid->raw)[i])); + if (i < 7) + buf[n++] = ':'; + } +} + +static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_mcast_iter *iter; + loff_t n = *pos; + + iter = ipoib_mcast_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_mcast_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_mcast_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + union ib_gid mgid; + unsigned long created; + unsigned int queuelen, complete, send_only; + + if (!iter) + return 0; + + ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen, + &complete, &send_only); + + format_gid(&mgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " created: %10ld\n" + " queuelen: %9d\n" + " complete: %9s\n" + " send_only: %8s\n" + "\n", + gid_buf, created, queuelen, + complete ? "yes" : "no", + send_only ? "yes" : "no"); + + return 0; +} + +static const struct seq_operations ipoib_mcg_seq_ops = { + .start = ipoib_mcg_seq_start, + .next = ipoib_mcg_seq_next, + .stop = ipoib_mcg_seq_stop, + .show = ipoib_mcg_seq_show, +}; + +static int ipoib_mcg_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &ipoib_mcg_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations ipoib_mcg_fops = { + .owner = THIS_MODULE, + .open = ipoib_mcg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_path_iter *iter; + loff_t n = *pos; + + iter = ipoib_path_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_path_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_path_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_path_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_path_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + struct ipoib_path path; + int rate; + + if (!iter) + return 0; + + ipoib_path_iter_read(iter, &path); + + format_gid(&path.pathrec.dgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " complete: %6s\n", + gid_buf, sa_path_get_dlid(&path.pathrec) ? 
"yes" : "no"); + + if (sa_path_get_dlid(&path.pathrec)) { + rate = ib_rate_to_mbps(path.pathrec.rate); + + seq_printf(file, + " DLID: 0x%04x\n" + " SL: %12d\n" + " rate: %8d.%d Gb/sec\n", + be32_to_cpu(sa_path_get_dlid(&path.pathrec)), + path.pathrec.sl, + rate / 1000, rate % 1000); + } + + seq_putc(file, '\n'); + + return 0; +} + +static const struct seq_operations ipoib_path_seq_ops = { + .start = ipoib_path_seq_start, + .next = ipoib_path_seq_next, + .stop = ipoib_path_seq_stop, + .show = ipoib_path_seq_show, +}; + +static int ipoib_path_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &ipoib_path_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations ipoib_path_fops = { + .owner = THIS_MODULE, + .open = ipoib_path_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +void ipoib_create_debug_files(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + char name[IFNAMSIZ + sizeof "_path"]; + + snprintf(name, sizeof name, "%s_mcg", dev->name); + priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_mcg_fops); + if (!priv->mcg_dentry) + ipoib_warn(priv, "failed to create mcg debug file\n"); + + snprintf(name, sizeof name, "%s_path", dev->name); + priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_path_fops); + if (!priv->path_dentry) + ipoib_warn(priv, "failed to create path debug file\n"); +} + +void ipoib_delete_debug_files(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + debugfs_remove(priv->mcg_dentry); + debugfs_remove(priv->path_dentry); + priv->mcg_dentry = priv->path_dentry = NULL; +} + +int ipoib_register_debugfs(void) +{ + ipoib_root = debugfs_create_dir("ipoib", NULL); + return ipoib_root ? 0 : -ENOMEM; +} + +void ipoib_unregister_debugfs(void) +{ + debugfs_remove(ipoib_root); +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c new file mode 100644 index 0000000..f47f9ac --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -0,0 +1,1317 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include + +#include "ipoib.h" + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param(data_debug_level, int, 0644); +MODULE_PARM_DESC(data_debug_level, + "Enable data path debug tracing if > 0"); +#endif + +struct ipoib_ah *ipoib_create_ah(struct net_device *dev, + struct ib_pd *pd, struct rdma_ah_attr *attr) +{ + struct ipoib_ah *ah; + struct ib_ah *vah; + + ah = kmalloc(sizeof *ah, GFP_KERNEL); + if (!ah) + return ERR_PTR(-ENOMEM); + + ah->dev = dev; + ah->last_send = 0; + kref_init(&ah->ref); + + vah = rdma_create_ah(pd, attr); + if (IS_ERR(vah)) { + kfree(ah); + ah = (struct ipoib_ah *)vah; + } else { + ah->ah = vah; + ipoib_dbg(ipoib_priv(dev), "Created ah %p\n", ah->ah); + } + + return ah; +} + +void ipoib_free_ah(struct kref *kref) +{ + struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref); + struct ipoib_dev_priv *priv = ipoib_priv(ah->dev); + + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + list_add_tail(&ah->list, &priv->dead_ahs); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, + u64 mapping[IPOIB_UD_RX_SG]) +{ + ib_dma_unmap_single(priv->ca, mapping[0], + IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), + DMA_FROM_DEVICE); +} + +static int ipoib_ib_post_receive(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_recv_wr *bad_wr; + int ret; + + priv->rx_wr.wr_id = id | IPOIB_OP_RECV; + priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0]; + priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1]; + + + ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping); + dev_kfree_skb_any(priv->rx_ring[id].skb); + priv->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct sk_buff *skb; + int buf_size; + u64 *mapping; + + buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + + skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN); + if (unlikely(!skb)) + return NULL; + + /* + * the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is + * 64 bytes aligned + */ + skb_reserve(skb, sizeof(struct ipoib_pseudo_header)); + + mapping = priv->rx_ring[id].mapping; + mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) + goto error; + + priv->rx_ring[id].skb = skb; + return skb; +error: + dev_kfree_skb_any(skb); + return NULL; +} + +static int ipoib_ib_post_receives(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_alloc_rx_skb(dev, i)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + return -ENOMEM; + } + if (ipoib_ib_post_receive(dev, 
i)) { + ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); + return -EIO; + } + } + + return 0; +} + +static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; + union ib_gid *dgid; + union ib_gid *sgid; + + ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + skb = priv->rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) + ipoib_warn(priv, + "failed recv event (status=%d, wrid=%d vend_err %#x)\n", + wc->status, wr_id, wc->vendor_err); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping); + dev_kfree_skb_any(skb); + priv->rx_ring[wr_id].skb = NULL; + return; + } + + memcpy(mapping, priv->rx_ring[wr_id].mapping, + IPOIB_UD_RX_SG * sizeof *mapping); + + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) { + ++dev->stats.rx_dropped; + goto repost; + } + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + ipoib_ud_dma_unmap_rx(priv, mapping); + + skb_put(skb, wc->byte_len); + + /* First byte of dgid signals multicast when 0xff */ + dgid = &((struct ib_grh *)skb->data)->dgid; + + if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff) + skb->pkt_type = PACKET_HOST; + else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + + sgid = &((struct ib_grh *)skb->data)->sgid; + + /* + * Drop packets that this interface sent, ie multicast packets + * that the HCA has replicated. 
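+ * (The check below treats a completion whose SLID and source QPN match
+ * our own as locally generated; if a GRH is present and the source GID's
+ * interface_id differs from priv->local_gid, the packet is kept, since it
+ * did not actually originate from this interface.)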
+ */ + if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) { + int need_repost = 1; + + if ((wc->wc_flags & IB_WC_GRH) && + sgid->global.interface_id != priv->local_gid.global.interface_id) + need_repost = 0; + + if (need_repost) { + dev_kfree_skb_any(skb); + goto repost; + } + } + + skb_pull(skb, IB_GRH_BYTES); + + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_add_pseudo_hdr(skb); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; + if (skb->pkt_type == PACKET_MULTICAST) + dev->stats.multicast++; + + skb->dev = dev; + if ((dev->features & NETIF_F_RXCSUM) && + likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + napi_gro_receive(&priv->recv_napi, skb); + +repost: + if (unlikely(ipoib_ib_post_receive(dev, wr_id))) + ipoib_warn(priv, "ipoib_ib_post_receive failed " + "for buf %d\n", wr_id); +} + +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[0]))) + return -EIO; + + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, + skb_frag_page(frag), + frag->page_offset, skb_frag_size(frag), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) + goto partial_error; + } + return 0; + +partial_error: + for (; i > 0; --i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; + + ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE); + } + + if (off) + ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + + return -EIO; +} + +void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv, + struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb), + DMA_TO_DEVICE); + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + ib_dma_unmap_page(priv->ca, mapping[i + off], + skb_frag_size(frag), DMA_TO_DEVICE); + } +} + +/* + * As the result of a completion error the QP Can be transferred to SQE states. + * The function checks if the (send)QP is in SQE state and + * moves it back to RTS state, that in order to have it functional again. 
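+ * (The recovery runs from the driver workqueue, queued by
+ * ipoib_ib_handle_tx_wc(), rather than in the send completion handler
+ * itself, so ib_modify_qp() is not invoked from atomic context.)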
+ */ +static void ipoib_qp_state_validate_work(struct work_struct *work) +{ + struct ipoib_qp_state_validate *qp_work = + container_of(work, struct ipoib_qp_state_validate, work); + + struct ipoib_dev_priv *priv = qp_work->priv; + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); + if (ret) { + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", + __func__, ret); + goto free_res; + } + pr_info("%s: QP: 0x%x is in state: %d\n", + __func__, priv->qp->qp_num, qp_attr.qp_state); + + /* currently support only in SQE->RTS transition*/ + if (qp_attr.qp_state == IB_QPS_SQE) { + qp_attr.qp_state = IB_QPS_RTS; + + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); + if (ret) { + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", + ret, priv->qp->qp_num); + goto free_res; + } + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", + __func__, priv->qp->qp_num); + } else { + pr_warn("QP (%d) will stay in state: %d\n", + priv->qp->qp_num, qp_attr.qp_state); + } + +free_res: + kfree(qp_work); +} + +static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + unsigned int wr_id = wc->wr_id; + struct ipoib_tx_buf *tx_req; + + ipoib_dbg_data(priv, "send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &priv->tx_ring[wr_id]; + + ipoib_dma_unmap_tx(priv, tx_req); + + ++dev->stats.tx_packets; + dev->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + ++priv->tx_tail; + + if (unlikely(netif_queue_stopped(dev) && + ((priv->tx_head - priv->tx_tail) <= ipoib_sendq_size >> 1) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))) + netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_qp_state_validate *qp_work; + ipoib_warn(priv, + "failed send event (status=%d, wrid=%d vend_err %#x)\n", + wc->status, wr_id, wc->vendor_err); + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); + if (!qp_work) + return; + + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); + qp_work->priv = priv; + queue_work(priv->wq, &qp_work->work); + } +} + +static int poll_tx(struct ipoib_dev_priv *priv) +{ + int n, i; + struct ib_wc *wc; + + n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); + for (i = 0; i < n; ++i) { + wc = priv->send_wc + i; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_tx_wc(priv->dev, priv->send_wc + i); + else + ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i); + } + return n == MAX_SEND_CQE; +} + +int ipoib_rx_poll(struct napi_struct *napi, int budget) +{ + struct ipoib_dev_priv *priv = + container_of(napi, struct ipoib_dev_priv, recv_napi); + struct net_device *dev = priv->dev; + int done; + int t; + int n, i; + + done = 0; + +poll_more: + while (done < budget) { + int max = (budget - done); + + t = min(IPOIB_NUM_WC, max); + n = ib_poll_cq(priv->recv_cq, t, priv->ibwc); + + for (i = 0; i < n; i++) { + struct ib_wc *wc = priv->ibwc + i; + + if (wc->wr_id & IPOIB_OP_RECV) { + ++done; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else { + pr_warn("%s: Got unexpected wqe id\n", __func__); + } + } + + if (n != t) + break; + } + + if (done < budget) { + napi_complete(napi); + if 
(unlikely(ib_req_notify_cq(priv->recv_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS)) && + napi_reschedule(napi)) + goto poll_more; + } + + return done; +} + +int ipoib_tx_poll(struct napi_struct *napi, int budget) +{ + struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, + send_napi); + struct net_device *dev = priv->dev; + int n, i; + struct ib_wc *wc; + +poll_more: + n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); + + for (i = 0; i < n; i++) { + wc = priv->send_wc + i; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_tx_wc(dev, wc); + else + ipoib_ib_handle_tx_wc(dev, wc); + } + + if (n < budget) { + napi_complete(napi); + if (unlikely(ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS)) && + napi_reschedule(napi)) + goto poll_more; + } + return n < 0 ? 0 : n; +} + +void ipoib_ib_rx_completion(struct ib_cq *cq, void *ctx_ptr) +{ + struct ipoib_dev_priv *priv = ctx_ptr; + + napi_schedule(&priv->recv_napi); +} + +void ipoib_ib_tx_completion(struct ib_cq *cq, void *ctx_ptr) +{ + struct ipoib_dev_priv *priv = ctx_ptr; + + napi_schedule(&priv->send_napi); +} + +static inline int post_send(struct ipoib_dev_priv *priv, + unsigned int wr_id, + struct ib_ah *address, u32 dqpn, + struct ipoib_tx_buf *tx_req, + void *head, int hlen) +{ + struct ib_send_wr *bad_wr; + struct sk_buff *skb = tx_req->skb; + + ipoib_build_sge(priv, tx_req); + + priv->tx_wr.wr.wr_id = wr_id; + priv->tx_wr.remote_qpn = dqpn; + priv->tx_wr.ah = address; + + if (head) { + priv->tx_wr.mss = skb_shinfo(skb)->gso_size; + priv->tx_wr.header = head; + priv->tx_wr.hlen = hlen; + priv->tx_wr.wr.opcode = IB_WR_LSO; + } else + priv->tx_wr.wr.opcode = IB_WR_SEND; + + return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr); +} + +int ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_tx_buf *tx_req; + int hlen, rc; + void *phead; + unsigned usable_sge = priv->max_send_sge - !!skb_headlen(skb); + + if (skb_is_gso(skb)) { + hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); + phead = skb->data; + if (unlikely(!skb_pull(skb, hlen))) { + ipoib_warn(priv, "linear data too small\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return -1; + } + } else { + if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); + return -1; + } + phead = NULL; + hlen = 0; + } + if (skb_shinfo(skb)->nr_frags > usable_sge) { + if (skb_linearize(skb) < 0) { + ipoib_warn(priv, "skb could not be linearized\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return -1; + } + /* Does skb_linearize return ok without reducing nr_frags? */ + if (skb_shinfo(skb)->nr_frags > usable_sge) { + ipoib_warn(priv, "too many frags after skb linearize\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return -1; + } + } + + ipoib_dbg_data(priv, + "sending packet, length=%d address=%p dqpn=0x%06x\n", + skb->len, address, dqpn); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). 
That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). + */ + tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return -1; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) + priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM; + else + priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; + /* increase the tx_head after send success, but use it for queue state */ + if (priv->tx_head - priv->tx_tail == ipoib_sendq_size - 1) { + ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); + netif_stop_queue(dev); + } + + skb_orphan(skb); + skb_dst_drop(skb); + + if (netif_queue_stopped(dev)) + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + + rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), + address, dqpn, tx_req, phead, hlen); + if (unlikely(rc)) { + ipoib_warn(priv, "post_send failed, error %d\n", rc); + ++dev->stats.tx_errors; + ipoib_dma_unmap_tx(priv, tx_req); + dev_kfree_skb_any(skb); + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + rc = 0; + } else { + netif_trans_update(dev); + + rc = priv->tx_head; + ++priv->tx_head; + } + return rc; +} + +static void __ipoib_reap_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_ah *ah, *tah; + LIST_HEAD(remove_list); + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) + if ((int) priv->tx_tail - (int) ah->last_send >= 0) { + list_del(&ah->list); + rdma_destroy_ah(ah->ah); + kfree(ah); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +void ipoib_reap_ah(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, ah_reap_task.work); + struct net_device *dev = priv->dev; + + __ipoib_reap_ah(dev); + + if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) + queue_delayed_work(priv->wq, &priv->ah_reap_task, + round_jiffies_relative(HZ)); +} + +static void ipoib_flush_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + cancel_delayed_work(&priv->ah_reap_task); + flush_workqueue(priv->wq); + ipoib_reap_ah(&priv->ah_reap_task.work); +} + +static void ipoib_stop_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + set_bit(IPOIB_STOP_REAPER, &priv->flags); + ipoib_flush_ah(dev); +} + +static int recvs_pending(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int pending = 0; + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) + if (priv->rx_ring[i].skb) + ++pending; + + return pending; +} + +static void check_qp_movement_and_print(struct ipoib_dev_priv *priv, + struct ib_qp *qp, + enum ib_qp_state new_state) +{ + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + + ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr); + if (ret) { + ipoib_warn(priv, "%s: Failed to query QP\n", __func__); + return; + } + /* print according to the new-state and the previous state.*/ + if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET) + ipoib_dbg(priv, "Failed modify QP, IB_QPS_RESET to IB_QPS_ERR, acceptable\n"); + else + ipoib_warn(priv, "Failed to modify QP to state: %d from state: %d\n", + 
new_state, qp_attr.qp_state); +} + +static void ipoib_napi_enable(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + napi_enable(&priv->recv_napi); + napi_enable(&priv->send_napi); +} + +static void ipoib_napi_disable(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + napi_disable(&priv->recv_napi); + napi_disable(&priv->send_napi); +} + +int ipoib_ib_dev_stop_default(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_attr qp_attr; + unsigned long begin; + struct ipoib_tx_buf *tx_req; + int i; + + if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + ipoib_napi_disable(dev); + + ipoib_cm_dev_stop(dev); + + /* + * Move our QP to the error state and then reinitialize in + * when all work requests have completed or have been flushed. + */ + qp_attr.qp_state = IB_QPS_ERR; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR); + + /* Wait for all sends and receives to complete */ + begin = jiffies; + + while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, + "timing out; %d sends %d receives not completed\n", + priv->tx_head - priv->tx_tail, + recvs_pending(dev)); + + /* + * assume the HW is wedged and just free up + * all our pending work requests. + */ + while ((int)priv->tx_tail - (int)priv->tx_head < 0) { + tx_req = &priv->tx_ring[priv->tx_tail & + (ipoib_sendq_size - 1)]; + ipoib_dma_unmap_tx(priv, tx_req); + dev_kfree_skb_any(tx_req->skb); + ++priv->tx_tail; + } + + for (i = 0; i < ipoib_recvq_size; ++i) { + struct ipoib_rx_buf *rx_req; + + rx_req = &priv->rx_ring[i]; + if (!rx_req->skb) + continue; + ipoib_ud_dma_unmap_rx(priv, + priv->rx_ring[i].mapping); + dev_kfree_skb_any(rx_req->skb); + rx_req->skb = NULL; + } + + goto timeout; + } + + ipoib_drain_cq(dev); + + usleep_range(1000, 2000); + } + + ipoib_dbg(priv, "All sends and receives done.\n"); + +timeout: + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to RESET state\n"); + + ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); + + return 0; +} + +int ipoib_ib_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + priv->rn_ops->ndo_stop(dev); + + clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + ipoib_flush_ah(dev); + + return 0; +} + +int ipoib_ib_dev_open_default(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + ret = ipoib_init_qp(dev); + if (ret) { + ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); + return -1; + } + + ret = ipoib_ib_post_receives(dev); + if (ret) { + ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); + goto out; + } + + ret = ipoib_cm_dev_open(dev); + if (ret) { + ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); + goto out; + } + + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + ipoib_napi_enable(dev); + + return 0; +out: + return -1; +} + +int ipoib_ib_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey, + (!(priv->pkey & 0x7fff) ? 
"Invalid" : "not found")); + return -1; + } + + clear_bit(IPOIB_STOP_REAPER, &priv->flags); + queue_delayed_work(priv->wq, &priv->ah_reap_task, + round_jiffies_relative(HZ)); + + if (priv->rn_ops->ndo_open(dev)) { + pr_warn("%s: Failed to open dev\n", dev->name); + goto dev_stop; + } + + set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + + return 0; + +dev_stop: + set_bit(IPOIB_STOP_REAPER, &priv->flags); + cancel_delayed_work(&priv->ah_reap_task); + set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + ipoib_ib_dev_stop(dev); + return -1; +} + +void ipoib_pkey_dev_check_presence(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + + if (!(priv->pkey & 0x7fff) || + ib_find_pkey(priv->ca, priv->port, priv->pkey, + &priv->pkey_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + } else { + if (rn->set_id) + rn->set_id(dev, priv->pkey_index); + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + } +} + +void ipoib_ib_dev_up(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_dbg(priv, "PKEY is not assigned.\n"); + return; + } + + set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + + ipoib_mcast_start_thread(dev); +} + +void ipoib_ib_dev_down(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "downing ib_dev\n"); + + clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + netif_carrier_off(dev); + + ipoib_mcast_stop_thread(dev); + ipoib_mcast_dev_flush(dev); + + ipoib_flush_paths(dev); +} + +void ipoib_drain_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int i, n; + + /* + * We call completion handling routines that expect to be + * called from the BH-disabled NAPI poll context, so disable + * BHs here too. + */ + local_bh_disable(); + + do { + n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); + for (i = 0; i < n; ++i) { + /* + * Convert any successful completions to flush + * errors to avoid passing packets up the + * stack after bringing the device down. + */ + if (priv->ibwc[i].status == IB_WC_SUCCESS) + priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; + + if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) { + if (priv->ibwc[i].wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); + else + ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); + } else { + pr_warn("%s: Got unexpected wqe id\n", __func__); + } + } + } while (n == IPOIB_NUM_WC); + + while (poll_tx(priv)) + ; /* nothing */ + + local_bh_enable(); +} + +/* + * Takes whatever value which is in pkey index 0 and updates priv->pkey + * returns 0 if the pkey value was changed. + */ +static inline int update_parent_pkey(struct ipoib_dev_priv *priv) +{ + int result; + u16 prev_pkey; + + prev_pkey = priv->pkey; + result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); + if (result) { + ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n", + priv->port, result); + return result; + } + + priv->pkey |= 0x8000; + + if (prev_pkey != priv->pkey) { + ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n", + prev_pkey, priv->pkey); + /* + * Update the pkey in the broadcast address, while making sure to set + * the full membership bit, so that we join the right broadcast group. + */ + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + return 0; + } + + return 1; +} +/* + * returns 0 if pkey value was found in a different slot. 
+ */ +static inline int update_child_pkey(struct ipoib_dev_priv *priv) +{ + u16 old_index = priv->pkey_index; + + priv->pkey_index = 0; + ipoib_pkey_dev_check_presence(priv->dev); + + if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + (old_index == priv->pkey_index)) + return 1; + return 0; +} + +/* + * returns true if the device address of the ipoib interface has changed and the + * new address is a valid one (i.e in the gid table), return false otherwise. + */ +static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) +{ + union ib_gid search_gid; + union ib_gid gid0; + union ib_gid *netdev_gid; + int err; + u16 index; + u8 port; + bool ret = false; + + netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4); + if (ib_query_gid(priv->ca, priv->port, 0, &gid0, NULL)) + return false; + + netif_addr_lock_bh(priv->dev); + + /* The subnet prefix may have changed, update it now so we won't have + * to do it later + */ + priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix; + netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix; + search_gid.global.subnet_prefix = gid0.global.subnet_prefix; + + search_gid.global.interface_id = priv->local_gid.global.interface_id; + + netif_addr_unlock_bh(priv->dev); + + err = ib_find_gid(priv->ca, &search_gid, &port, &index); + + netif_addr_lock_bh(priv->dev); + + if (search_gid.global.interface_id != + priv->local_gid.global.interface_id) + /* There was a change while we were looking up the gid, bail + * here and let the next work sort this out + */ + goto out; + + /* The next section of code needs some background: + * Per IB spec the port GUID can't change if the HCA is powered on. + * port GUID is the basis for GID at index 0 which is the basis for + * the default device address of a ipoib interface. + * + * so it seems the flow should be: + * if user_changed_dev_addr && gid in gid tbl + * set bit dev_addr_set + * return true + * else + * return false + * + * The issue is that there are devices that don't follow the spec, + * they change the port GUID when the HCA is powered, so in order + * not to break userspace applications, We need to check if the + * user wanted to control the device address and we assume that + * if he sets the device address back to be based on GID index 0, + * he no longer wishs to control it. + * + * If the user doesn't control the the device address, + * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means + * the port GUID has changed and GID at index 0 has changed + * so we need to change priv->local_gid and priv->dev->dev_addr + * to reflect the new GID. 
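+ * (In short: IPOIB_FLAG_DEV_ADDR_SET records that a valid device address
+ * has been resolved at least once, and IPOIB_FLAG_DEV_ADDR_CTRL records
+ * that the user selected a GID other than the one at index 0 and
+ * therefore controls the address.)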
+ */ + if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + if (!err && port == priv->port) { + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + if (index == 0) + clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL, + &priv->flags); + else + set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags); + ret = true; + } else { + ret = false; + } + } else { + if (!err && port == priv->port) { + ret = true; + } else { + if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) { + memcpy(&priv->local_gid, &gid0, + sizeof(priv->local_gid)); + memcpy(priv->dev->dev_addr + 4, &gid0, + sizeof(priv->local_gid)); + ret = true; + } + } + } + +out: + netif_addr_unlock_bh(priv->dev); + + return ret; +} + +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, + enum ipoib_flush_level level, + int nesting) +{ + struct ipoib_dev_priv *cpriv; + struct net_device *dev = priv->dev; + int result; + + down_read_nested(&priv->vlan_rwsem, nesting); + + /* + * Flush any child interfaces too -- they might be up even if + * the parent is down. + */ + list_for_each_entry(cpriv, &priv->child_intfs, list) + __ipoib_ib_dev_flush(cpriv, level, nesting + 1); + + up_read(&priv->vlan_rwsem); + + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) && + level != IPOIB_FLUSH_HEAVY) { + /* Make sure the dev_addr is set even if not flushing */ + if (level == IPOIB_FLUSH_LIGHT) + ipoib_dev_addr_changed_valid(priv); + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); + return; + } + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + /* interface is down. update pkey and leave. */ + if (level == IPOIB_FLUSH_HEAVY) { + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + update_parent_pkey(priv); + else + update_child_pkey(priv); + } else if (level == IPOIB_FLUSH_LIGHT) + ipoib_dev_addr_changed_valid(priv); + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); + return; + } + + if (level == IPOIB_FLUSH_HEAVY) { + /* child devices chase their origin pkey value, while non-child + * (parent) devices should always takes what present in pkey index 0 + */ + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + result = update_child_pkey(priv); + if (result) { + /* restart QP only if P_Key index is changed */ + ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); + return; + } + + } else { + result = update_parent_pkey(priv); + /* restart QP only if P_Key value changed */ + if (result) { + ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n"); + return; + } + } + } + + if (level == IPOIB_FLUSH_LIGHT) { + int oper_up; + ipoib_mark_paths_invalid(dev); + /* Set IPoIB operation as down to prevent races between: + * the flush flow which leaves MCG and on the fly joins + * which can happen during that time. mcast restart task + * should deal with join requests we missed. 
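+ * (The previous value of IPOIB_FLAG_OPER_UP is restored once
+ * ipoib_mcast_dev_flush() returns; see oper_up below.)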
+ */ + oper_up = test_and_clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + ipoib_mcast_dev_flush(dev); + if (oper_up) + set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + ipoib_flush_ah(dev); + } + + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_down(dev); + + if (level == IPOIB_FLUSH_HEAVY) { + if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + ipoib_ib_dev_stop(dev); + + if (ipoib_ib_dev_open(dev)) + return; + + if (netif_queue_stopped(dev)) + netif_start_queue(dev); + } + + /* + * The device could have been brought down between the start and when + * we get here, don't bring it back up if it's not configured up + */ + if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_up(dev); + if (ipoib_dev_addr_changed_valid(priv)) + ipoib_mcast_restart_task(&priv->restart_task); + } +} + +void ipoib_ib_dev_flush_light(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_light); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0); +} + +void ipoib_ib_dev_flush_normal(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_normal); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0); +} + +void ipoib_ib_dev_flush_heavy(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_heavy); + + rtnl_lock(); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0); + rtnl_unlock(); +} + +void ipoib_ib_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "cleaning up ib_dev\n"); + /* + * We must make sure there are no more (path) completions + * that may wish to touch priv fields that are no longer valid + */ + ipoib_flush_paths(dev); + + ipoib_mcast_stop_thread(dev); + ipoib_mcast_dev_flush(dev); + + /* + * All of our ah references aren't free until after + * ipoib_mcast_dev_flush(), ipoib_flush_paths, and + * the neighbor garbage collection is stopped and reaped. + * That should all be done now, so make a final ah flush. + */ + ipoib_stop_ah(dev); + + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + + priv->rn_ops->ndo_uninit(dev); + + if (priv->pd) { + ib_dealloc_pd(priv->pd); + priv->pd = NULL; + } +} + diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c new file mode 100644 index 0000000..cf291f9 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -0,0 +1,2524 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipoib.h" + +#include + +#include +#include +#include +#include + +#include /* For ARPHRD_xxx */ + +#include +#include + +#include +#include +#include +#include +#include + +#define DRV_VERSION "1.0.0" + +const char ipoib_driver_version[] = DRV_VERSION; + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; +int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; + +module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); +module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +int ipoib_debug_level; + +module_param_named(debug_level, ipoib_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); +#endif + +struct ipoib_path_iter { + struct net_device *dev; + struct ipoib_path path; +}; + +static const u8 ipv4_bcast_addr[] = { + 0x00, 0xff, 0xff, 0xff, + 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff +}; + +struct workqueue_struct *ipoib_workqueue; + +struct ib_sa_client ipoib_sa_client; + +static void ipoib_add_one(struct ib_device *device); +static void ipoib_remove_one(struct ib_device *device, void *client_data); +static void ipoib_neigh_reclaim(struct rcu_head *rp); +static struct net_device *ipoib_get_net_dev_by_params( + struct ib_device *dev, u8 port, u16 pkey, + const union ib_gid *gid, const struct sockaddr *addr, + void *client_data); +static int ipoib_set_mac(struct net_device *dev, void *addr); +static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, + int cmd); + +static struct ib_client ipoib_client = { + .name = "ipoib", + .add = ipoib_add_one, + .remove = ipoib_remove_one, + .get_net_dev_by_params = ipoib_get_net_dev_by_params, +}; + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static int ipoib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netdev_notifier_info *ni = ptr; + struct net_device *dev = ni->dev; + + if (dev->netdev_ops->ndo_open != ipoib_open) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + ipoib_create_debug_files(dev); + break; + case NETDEV_CHANGENAME: + ipoib_delete_debug_files(dev); + ipoib_create_debug_files(dev); + break; + case NETDEV_UNREGISTER: + ipoib_delete_debug_files(dev); + break; + } + + return NOTIFY_DONE; +} +#endif + +int ipoib_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "bringing up interface\n"); + + netif_carrier_off(dev); + + set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + priv->sm_fullmember_sendonly_support = false; + + if (ipoib_ib_dev_open(dev)) { + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return 0; + goto err_disable; + } + + ipoib_ib_dev_up(dev); + + if 
(!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + /* Bring up any child interfaces too */ + down_read(&priv->vlan_rwsem); + list_for_each_entry(cpriv, &priv->child_intfs, list) { + int flags; + + flags = cpriv->dev->flags; + if (flags & IFF_UP) + continue; + + dev_change_flags(cpriv->dev, flags | IFF_UP); + } + up_read(&priv->vlan_rwsem); + } + + netif_start_queue(dev); + + return 0; + +err_disable: + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + return -EINVAL; +} + +static int ipoib_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "stopping interface\n"); + + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + netif_stop_queue(dev); + + ipoib_ib_dev_down(dev); + ipoib_ib_dev_stop(dev); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + /* Bring down any child interfaces too */ + down_read(&priv->vlan_rwsem); + list_for_each_entry(cpriv, &priv->child_intfs, list) { + int flags; + + flags = cpriv->dev->flags; + if (!(flags & IFF_UP)) + continue; + + dev_change_flags(cpriv->dev, flags & ~IFF_UP); + } + up_read(&priv->vlan_rwsem); + } + + return 0; +} + +static void ipoib_uninit(struct net_device *dev) +{ + ipoib_dev_cleanup(dev); +} + +static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + + return features; +} + +static int ipoib_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret = 0; + + /* dev->mtu > 2K ==> connected mode */ + if (ipoib_cm_admin_enabled(dev)) { + if (new_mtu > ipoib_cm_max_mtu(dev)) + return -EINVAL; + + if (new_mtu > priv->mcast_mtu) + ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", + priv->mcast_mtu); + + dev->mtu = new_mtu; + return 0; + } + + if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) + return -EINVAL; + + priv->admin_mtu = new_mtu; + + if (priv->mcast_mtu < priv->admin_mtu) + ipoib_dbg(priv, "MTU must be smaller than the underlying " + "link layer MTU - 4 (%u)\n", priv->mcast_mtu); + + new_mtu = min(priv->mcast_mtu, priv->admin_mtu); + + if (priv->rn_ops->ndo_change_mtu) { + bool carrier_status = netif_carrier_ok(dev); + + netif_carrier_off(dev); + + /* notify lower level on the real mtu */ + ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu); + + if (carrier_status) + netif_carrier_on(dev); + } else { + dev->mtu = new_mtu; + } + + return ret; +} + +static void ipoib_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (priv->rn_ops->ndo_get_stats64) + priv->rn_ops->ndo_get_stats64(dev, stats); + else + netdev_stats_to_stats64(stats, &dev->stats); +} + +/* Called with an RCU read lock taken */ +static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr, + struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct in_device *in_dev; + struct sockaddr_in *addr_in = (struct sockaddr_in *)addr; + struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr; + __be32 ret_addr; + + switch (addr->sa_family) { + case AF_INET: + in_dev = in_dev_get(dev); + if (!in_dev) + return false; + + ret_addr = inet_confirm_addr(net, in_dev, 0, + addr_in->sin_addr.s_addr, + RT_SCOPE_HOST); + in_dev_put(in_dev); + if (ret_addr) + return true; + + break; + case AF_INET6: + if 
(IS_ENABLED(CONFIG_IPV6) && + ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1)) + return true; + + break; + } + return false; +} + +/** + * Find the master net_device on top of the given net_device. + * @dev: base IPoIB net_device + * + * Returns the master net_device with a reference held, or the same net_device + * if no master exists. + */ +static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) +{ + struct net_device *master; + + rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + if (master) + dev_hold(master); + rcu_read_unlock(); + + if (master) + return master; + + dev_hold(dev); + return dev; +} + +struct ipoib_walk_data { + const struct sockaddr *addr; + struct net_device *result; +}; + +static int ipoib_upper_walk(struct net_device *upper, void *_data) +{ + struct ipoib_walk_data *data = _data; + int ret = 0; + + if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { + dev_hold(upper); + data->result = upper; + ret = 1; + } + + return ret; +} + +/** + * Find a net_device matching the given address, which is an upper device of + * the given net_device. + * @addr: IP address to look for. + * @dev: base IPoIB net_device + * + * If found, returns the net_device with a reference held. Otherwise return + * NULL. + */ +static struct net_device *ipoib_get_net_dev_match_addr( + const struct sockaddr *addr, struct net_device *dev) +{ + struct ipoib_walk_data data = { + .addr = addr, + }; + + rcu_read_lock(); + if (ipoib_is_dev_match_addr_rcu(addr, dev)) { + dev_hold(dev); + data.result = dev; + goto out; + } + + netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data); +out: + rcu_read_unlock(); + return data.result; +} + +/* returns the number of IPoIB netdevs on top a given ipoib device matching a + * pkey_index and address, if one exists. + * + * @found_net_dev: contains a matching net_device if the return value >= 1, + * with a reference held. */ +static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, + const union ib_gid *gid, + u16 pkey_index, + const struct sockaddr *addr, + int nesting, + struct net_device **found_net_dev) +{ + struct ipoib_dev_priv *child_priv; + struct net_device *net_dev = NULL; + int matches = 0; + + if (priv->pkey_index == pkey_index && + (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { + if (!addr) { + net_dev = ipoib_get_master_net_dev(priv->dev); + } else { + /* Verify the net_device matches the IP address, as + * IPoIB child devices currently share a GID. */ + net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev); + } + if (net_dev) { + if (!*found_net_dev) + *found_net_dev = net_dev; + else + dev_put(net_dev); + ++matches; + } + } + + /* Check child interfaces */ + down_read_nested(&priv->vlan_rwsem, nesting); + list_for_each_entry(child_priv, &priv->child_intfs, list) { + matches += ipoib_match_gid_pkey_addr(child_priv, gid, + pkey_index, addr, + nesting + 1, + found_net_dev); + if (matches > 1) + break; + } + up_read(&priv->vlan_rwsem); + + return matches; +} + +/* Returns the number of matching net_devs found (between 0 and 2). 
Also + * return the matching net_device in the @net_dev parameter, holding a + * reference to the net_device, if the number of matches >= 1 */ +static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port, + u16 pkey_index, + const union ib_gid *gid, + const struct sockaddr *addr, + struct net_device **net_dev) +{ + struct ipoib_dev_priv *priv; + int matches = 0; + + *net_dev = NULL; + + list_for_each_entry(priv, dev_list, list) { + if (priv->port != port) + continue; + + matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, + addr, 0, net_dev); + if (matches > 1) + break; + } + + return matches; +} + +static struct net_device *ipoib_get_net_dev_by_params( + struct ib_device *dev, u8 port, u16 pkey, + const union ib_gid *gid, const struct sockaddr *addr, + void *client_data) +{ + struct net_device *net_dev; + struct list_head *dev_list = client_data; + u16 pkey_index; + int matches; + int ret; + + if (!rdma_protocol_ib(dev, port)) + return NULL; + + ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); + if (ret) + return NULL; + + if (!dev_list) + return NULL; + + /* See if we can find a unique device matching the L2 parameters */ + matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, + gid, NULL, &net_dev); + + switch (matches) { + case 0: + return NULL; + case 1: + return net_dev; + } + + dev_put(net_dev); + + /* Couldn't find a unique device with L2 parameters only. Use L3 + * address to uniquely match the net device */ + matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, + gid, addr, &net_dev); + switch (matches) { + case 0: + return NULL; + default: + dev_warn_ratelimited(&dev->dev, + "duplicate IP address detected\n"); + /* Fall through */ + case 1: + return net_dev; + } +} + +int ipoib_set_mode(struct net_device *dev, const char *buf) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && + !strcmp(buf, "connected\n")) || + (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && + !strcmp(buf, "datagram\n"))) { + return 0; + } + + /* flush paths if we switch modes so that connections are restarted */ + if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { + set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ipoib_warn(priv, "enabling connected mode " + "will cause multicast packet drops\n"); + netdev_update_features(dev); + dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); + rtnl_unlock(); + priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; + + ipoib_flush_paths(dev); + return (!rtnl_trylock()) ? -EBUSY : 0; + } + + if (!strcmp(buf, "datagram\n")) { + clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + netdev_update_features(dev); + dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); + rtnl_unlock(); + ipoib_flush_paths(dev); + return (!rtnl_trylock()) ? 
-EBUSY : 0; + } + + return -EINVAL; +} + +struct ipoib_path *__path_find(struct net_device *dev, void *gid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rb_node *n = priv->path_tree.rb_node; + struct ipoib_path *path; + int ret; + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + ret = memcmp(gid, path->pathrec.dgid.raw, + sizeof (union ib_gid)); + + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return path; + } + + return NULL; +} + +static int __path_add(struct net_device *dev, struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rb_node **n = &priv->path_tree.rb_node; + struct rb_node *pn = NULL; + struct ipoib_path *tpath; + int ret; + + while (*n) { + pn = *n; + tpath = rb_entry(pn, struct ipoib_path, rb_node); + + ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = &pn->rb_left; + else if (ret > 0) + n = &pn->rb_right; + else + return -EEXIST; + } + + rb_link_node(&path->rb_node, pn, n); + rb_insert_color(&path->rb_node, &priv->path_tree); + + list_add_tail(&path->list, &priv->path_list); + + return 0; +} + +static void path_free(struct net_device *dev, struct ipoib_path *path) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&path->queue))) + dev_kfree_skb_irq(skb); + + ipoib_dbg(ipoib_priv(dev), "path_free\n"); + + /* remove all neigh connected to this path */ + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + + if (path->ah) + ipoib_put_ah(path->ah); + + kfree(path); +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) +{ + struct ipoib_path_iter *iter; + + iter = kmalloc(sizeof *iter, GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->path.pathrec.dgid.raw, 0, 16); + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_path_iter_next(struct ipoib_path_iter *iter) +{ + struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); + struct rb_node *n; + struct ipoib_path *path; + int ret = 1; + + spin_lock_irq(&priv->lock); + + n = rb_first(&priv->path_tree); + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, + sizeof (union ib_gid)) < 0) { + iter->path = *path; + ret = 0; + break; + } + + n = rb_next(n); + } + + spin_unlock_irq(&priv->lock); + + return ret; +} + +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path) +{ + *path = iter->path; +} + +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +void ipoib_mark_paths_invalid(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_path *path, *tp; + + spin_lock_irq(&priv->lock); + + list_for_each_entry_safe(path, tp, &priv->path_list, list) { + ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n", + be32_to_cpu(sa_path_get_dlid(&path->pathrec)), + path->pathrec.dgid.raw); + path->valid = 0; + } + + spin_unlock_irq(&priv->lock); +} + +static void push_pseudo_header(struct sk_buff *skb, const char *daddr) +{ + struct ipoib_pseudo_header *phdr; + + phdr = skb_push(skb, sizeof(*phdr)); + memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); +} + +void ipoib_flush_paths(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_path *path, *tp; + LIST_HEAD(remove_list); + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + 
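/* Detach every path from the tree and list while holding the lock;
+ * each one is then cancelled, waited for and freed below with the
+ * lock temporarily dropped. */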
list_splice_init(&priv->path_list, &remove_list); + + list_for_each_entry(path, &remove_list, list) + rb_erase(&path->rb_node, &priv->path_tree); + + list_for_each_entry_safe(path, tp, &remove_list, list) { + if (path->query) + ib_sa_cancel_query(path->query_id, path->query); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + wait_for_completion(&path->done); + path_free(dev, path); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void path_rec_completion(int status, + struct sa_path_rec *pathrec, + void *path_ptr) +{ + struct ipoib_path *path = path_ptr; + struct net_device *dev = path->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_ah *ah = NULL; + struct ipoib_ah *old_ah = NULL; + struct ipoib_neigh *neigh, *tn; + struct sk_buff_head skqueue; + struct sk_buff *skb; + unsigned long flags; + + if (!status) + ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", + be32_to_cpu(sa_path_get_dlid(pathrec)), + pathrec->dgid.raw); + else + ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", + status, path->pathrec.dgid.raw); + + skb_queue_head_init(&skqueue); + + if (!status) { + struct rdma_ah_attr av; + + if (!ib_init_ah_attr_from_path(priv->ca, priv->port, + pathrec, &av)) + ah = ipoib_create_ah(dev, priv->pd, &av); + } + + spin_lock_irqsave(&priv->lock, flags); + + if (!IS_ERR_OR_NULL(ah)) { + /* + * pathrec.dgid is used as the database key from the LLADDR, + * it must remain unchanged even if the SA returns a different + * GID to use in the AH. + */ + if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw, + sizeof(union ib_gid))) { + ipoib_dbg( + priv, + "%s got PathRec for gid %pI6 while asked for %pI6\n", + dev->name, pathrec->dgid.raw, + path->pathrec.dgid.raw); + memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw, + sizeof(union ib_gid)); + } + + path->pathrec = *pathrec; + + old_ah = path->ah; + path->ah = ah; + + ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", + ah, be32_to_cpu(sa_path_get_dlid(pathrec)), + pathrec->sl); + + while ((skb = __skb_dequeue(&path->queue))) + __skb_queue_tail(&skqueue, skb); + + list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { + if (neigh->ah) { + WARN_ON(neigh->ah != old_ah); + /* + * Dropping the ah reference inside + * priv->lock is safe here, because we + * will hold one more reference from + * the original value of path->ah (ie + * old_ah). 
+ */ + ipoib_put_ah(neigh->ah); + } + kref_get(&path->ah->ref); + neigh->ah = path->ah; + + if (ipoib_cm_enabled(dev, neigh->daddr)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, + path, + neigh)); + if (!ipoib_cm_get(neigh)) { + ipoib_neigh_free(neigh); + continue; + } + } + + while ((skb = __skb_dequeue(&neigh->queue))) + __skb_queue_tail(&skqueue, skb); + } + path->valid = 1; + } + + path->query = NULL; + complete(&path->done); + + spin_unlock_irqrestore(&priv->lock, flags); + + if (IS_ERR_OR_NULL(ah)) + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + + if (old_ah) + ipoib_put_ah(old_ah); + + while ((skb = __skb_dequeue(&skqueue))) { + int ret; + skb->dev = dev; + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n", + __func__, ret); + } +} + +static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path, + void *gid) +{ + path->dev = priv->dev; + + if (rdma_cap_opa_ah(priv->ca, priv->port)) + path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; + else + path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; + + memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid)); + path->pathrec.sgid = priv->local_gid; + path->pathrec.pkey = cpu_to_be16(priv->pkey); + path->pathrec.numb_path = 1; + path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; +} + +static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_path *path; + + if (!priv->broadcast) + return NULL; + + path = kzalloc(sizeof *path, GFP_ATOMIC); + if (!path) + return NULL; + + skb_queue_head_init(&path->queue); + + INIT_LIST_HEAD(&path->neigh_list); + + init_path_rec(priv, path, gid); + + return path; +} + +static int path_rec_start(struct net_device *dev, + struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "Start path record lookup for %pI6\n", + path->pathrec.dgid.raw); + + init_completion(&path->done); + + path->query_id = + ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, + &path->pathrec, + IB_SA_PATH_REC_DGID | + IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_TRAFFIC_CLASS | + IB_SA_PATH_REC_PKEY, + 1000, GFP_ATOMIC, + path_rec_completion, + path, &path->query); + if (path->query_id < 0) { + ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); + path->query = NULL; + complete(&path->done); + return path->query_id; + } + + return 0; +} + +static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_path *path; + struct ipoib_neigh *neigh; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + neigh = ipoib_neigh_alloc(daddr, dev); + if (!neigh) { + spin_unlock_irqrestore(&priv->lock, flags); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NULL; + } + + /* To avoid race condition, make sure that the + * neigh will be added only once. 
+ */ + if (unlikely(!list_empty(&neigh->list))) { + spin_unlock_irqrestore(&priv->lock, flags); + return neigh; + } + + path = __path_find(dev, daddr + 4); + if (!path) { + path = path_rec_create(dev, daddr + 4); + if (!path) + goto err_path; + + __path_add(dev, path); + } + + list_add_tail(&neigh->list, &path->neigh_list); + + if (path->ah) { + kref_get(&path->ah->ref); + neigh->ah = path->ah; + + if (ipoib_cm_enabled(dev, neigh->daddr)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); + if (!ipoib_cm_get(neigh)) { + ipoib_neigh_free(neigh); + goto err_drop; + } + if (skb_queue_len(&neigh->queue) < + IPOIB_MAX_PATH_REC_QUEUE) { + push_pseudo_header(skb, neigh->daddr); + __skb_queue_tail(&neigh->queue, skb); + } else { + ipoib_warn(priv, "queue length limit %d. Packet drop.\n", + skb_queue_len(&neigh->queue)); + goto err_drop; + } + } else { + spin_unlock_irqrestore(&priv->lock, flags); + path->ah->last_send = rn->send(dev, skb, path->ah->ah, + IPOIB_QPN(daddr)); + ipoib_neigh_put(neigh); + return NULL; + } + } else { + neigh->ah = NULL; + + if (!path->query && path_rec_start(dev, path)) + goto err_path; + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + push_pseudo_header(skb, neigh->daddr); + __skb_queue_tail(&neigh->queue, skb); + } else { + goto err_drop; + } + } + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); + return NULL; + +err_path: + ipoib_neigh_free(neigh); +err_drop: + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); + + return NULL; +} + +static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, + struct ipoib_pseudo_header *phdr) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_path *path; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + + /* no broadcast means that all paths are (going to be) not valid */ + if (!priv->broadcast) + goto drop_and_unlock; + + path = __path_find(dev, phdr->hwaddr + 4); + if (!path || !path->valid) { + int new_path = 0; + + if (!path) { + path = path_rec_create(dev, phdr->hwaddr + 4); + new_path = 1; + } + if (path) { + if (!new_path) + /* make sure there is no changes in the existing path record */ + init_path_rec(priv, path, phdr->hwaddr + 4); + + if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + push_pseudo_header(skb, phdr->hwaddr); + __skb_queue_tail(&path->queue, skb); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + + if (!path->query && path_rec_start(dev, path)) { + spin_unlock_irqrestore(&priv->lock, flags); + if (new_path) + path_free(dev, path); + return; + } else + __path_add(dev, path); + } else { + goto drop_and_unlock; + } + + spin_unlock_irqrestore(&priv->lock, flags); + return; + } + + if (path->ah) { + ipoib_dbg(priv, "Send unicast ARP to %08x\n", + be32_to_cpu(sa_path_get_dlid(&path->pathrec))); + + spin_unlock_irqrestore(&priv->lock, flags); + path->ah->last_send = rn->send(dev, skb, path->ah->ah, + IPOIB_QPN(phdr->hwaddr)); + return; + } else if ((path->query || !path_rec_start(dev, path)) && + skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + push_pseudo_header(skb, phdr->hwaddr); + __skb_queue_tail(&path->queue, skb); + } else { + goto drop_and_unlock; + } + + spin_unlock_irqrestore(&priv->lock, flags); + return; + +drop_and_unlock: + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + spin_unlock_irqrestore(&priv->lock, 
flags); +} + +static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_neigh *neigh; + struct ipoib_pseudo_header *phdr; + struct ipoib_header *header; + unsigned long flags; + + phdr = (struct ipoib_pseudo_header *) skb->data; + skb_pull(skb, sizeof(*phdr)); + header = (struct ipoib_header *) skb->data; + + if (unlikely(phdr->hwaddr[4] == 0xff)) { + /* multicast, arrange "if" according to probability */ + if ((header->proto != htons(ETH_P_IP)) && + (header->proto != htons(ETH_P_IPV6)) && + (header->proto != htons(ETH_P_ARP)) && + (header->proto != htons(ETH_P_RARP)) && + (header->proto != htons(ETH_P_TIPC))) { + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + /* Add in the P_Key for multicast*/ + phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; + phdr->hwaddr[9] = priv->pkey & 0xff; + + neigh = ipoib_neigh_get(dev, phdr->hwaddr); + if (likely(neigh)) + goto send_using_neigh; + ipoib_mcast_send(dev, phdr->hwaddr, skb); + return NETDEV_TX_OK; + } + + /* unicast, arrange "switch" according to probability */ + switch (header->proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_TIPC): + neigh = ipoib_neigh_get(dev, phdr->hwaddr); + if (unlikely(!neigh)) { + neigh = neigh_add_path(skb, phdr->hwaddr, dev); + if (likely(!neigh)) + return NETDEV_TX_OK; + } + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + /* for unicast ARP and RARP should always perform path find */ + unicast_arp_send(skb, dev, phdr); + return NETDEV_TX_OK; + default: + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + +send_using_neigh: + /* note we now hold a ref to neigh */ + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto unref; + } + } else if (neigh->ah) { + neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah, + IPOIB_QPN(phdr->hwaddr)); + goto unref; + } + + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + push_pseudo_header(skb, phdr->hwaddr); + spin_lock_irqsave(&priv->lock, flags); + __skb_queue_tail(&neigh->queue, skb); + spin_unlock_irqrestore(&priv->lock, flags); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + +unref: + ipoib_neigh_put(neigh); + + return NETDEV_TX_OK; +} + +static void ipoib_timeout(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_warn(priv, "transmit timeout: latency %d msecs\n", + jiffies_to_msecs(jiffies - dev_trans_start(dev))); + ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n", + netif_queue_stopped(dev), + priv->tx_head, priv->tx_tail); + /* XXX reset QP, etc. */ +} + +static int ipoib_hard_header(struct sk_buff *skb, + struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) +{ + struct ipoib_header *header; + + header = skb_push(skb, sizeof *header); + + header->proto = htons(type); + header->reserved = 0; + + /* + * we don't rely on dst_entry structure, always stuff the + * destination address into skb hard header so we can figure out where + * to send the packet later. 
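/*
 * Layout sketch, illustrative and not part of the patch (member types as
 * declared in ipoib.h): ipoib_hard_header() pushes both structures, but
 * only the 4-byte ipoib_header is transmitted; the pseudo header just
 * parks the 20-byte destination in front of the data so that
 * ipoib_start_xmit() can pop it off again and pick a path/neigh from it.
 */
struct ipoib_xmit_head_sketch {
	struct ipoib_pseudo_header pseudo;	/* hwaddr[INFINIBAND_ALEN], stripped before send */
	struct ipoib_header	   hdr;		/* proto + reserved, goes on the wire            */
	/* payload follows */
};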
+ */ + push_pseudo_header(skb, daddr); + + return IPOIB_HARD_LEN; +} + +static void ipoib_set_mcast_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); + return; + } + + queue_work(priv->wq, &priv->restart_task); +} + +static int ipoib_get_iflink(const struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + /* parent interface */ + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + return dev->ifindex; + + /* child/vlan interface */ + return priv->parent->ifindex; +} + +static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) +{ + /* + * Use only the address parts that contributes to spreading + * The subnet prefix is not used as one can not connect to + * same remote port (GUID) using the same remote QPN via two + * different subnets. + */ + /* qpn octets[1:4) & port GUID octets[12:20) */ + u32 *d32 = (u32 *) daddr; + u32 hv; + + hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); + return hv & htbl->mask; +} + +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh = NULL; + u32 hash_val; + + rcu_read_lock_bh(); + + htbl = rcu_dereference_bh(ntbl->htbl); + + if (!htbl) + goto out_unlock; + + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); + neigh != NULL; + neigh = rcu_dereference_bh(neigh->hnext)) { + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + goto out_unlock; + } + + if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)) + neigh->alive = jiffies; + goto out_unlock; + } + } + +out_unlock: + rcu_read_unlock_bh(); + return neigh; +} + +static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long neigh_obsolete; + unsigned long dt; + unsigned long flags; + int i; + LIST_HEAD(remove_list); + + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + return; + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + /* neigh is obsolete if it was idle for two GC periods */ + dt = 2 * arp_tbl.gc_interval; + neigh_obsolete = jiffies - dt; + /* handle possible race condition */ + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* was the neigh idle for two GC periods */ + if (time_after(neigh_obsolete, neigh->alive)) { + + ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list); + + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_mcast_remove_list(&remove_list); +} + +static void ipoib_reap_neigh(struct work_struct 
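/*
 * Sketch with illustrative field names of the 20-byte IPoIB hardware
 * address assumed by ipoib_addr_hash() above and by the "daddr + 4" /
 * IPOIB_QPN(daddr) accesses in this file: byte 0 carries flags, bytes
 * 1..3 the remote QPN, and bytes 4..19 the GID, whose last 8 bytes are
 * the port GUID that actually spreads the hash.
 */
struct ipoib_hw_addr_sketch {
	u8 flags;		/* reserved / capability flags            */
	u8 qpn[3];		/* remote QPN, what IPOIB_QPN() extracts   */
	u8 subnet_prefix[8];	/* GID bytes 0..7, same for the whole subnet */
	u8 port_guid[8];	/* GID bytes 8..15, used by the hash       */
};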
*work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); + + __ipoib_reap_neigh(priv); + + if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + queue_delayed_work(priv->wq, &priv->neigh_reap_task, + arp_tbl.gc_interval); +} + + +static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, + struct net_device *dev) +{ + struct ipoib_neigh *neigh; + + neigh = kzalloc(sizeof *neigh, GFP_ATOMIC); + if (!neigh) + return NULL; + + neigh->dev = dev; + memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); + skb_queue_head_init(&neigh->queue); + INIT_LIST_HEAD(&neigh->list); + ipoib_cm_set(neigh, NULL); + /* one ref on behalf of the caller */ + atomic_set(&neigh->refcnt, 1); + + return neigh; +} + +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) { + neigh = NULL; + goto out_unlock; + } + + /* need to add a new neigh, but maybe some other thread succeeded? + * recalc hash, maybe hash resize took place so we do a search + */ + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock)); + neigh != NULL; + neigh = rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))) { + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + break; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + + neigh = ipoib_neigh_ctor(daddr, dev); + if (!neigh) + goto out_unlock; + + /* one ref on behalf of the hash table */ + atomic_inc(&neigh->refcnt); + neigh->alive = jiffies; + /* put in hash */ + rcu_assign_pointer(neigh->hnext, + rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock))); + rcu_assign_pointer(htbl->buckets[hash_val], neigh); + atomic_inc(&ntbl->entries); + +out_unlock: + + return neigh; +} + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh) +{ + /* neigh reference count was dropprd to zero */ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct sk_buff *skb; + if (neigh->ah) + ipoib_put_ah(neigh->ah); + while ((skb = __skb_dequeue(&neigh->queue))) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + if (ipoib_cm_get(neigh)) + ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); + ipoib_dbg(ipoib_priv(dev), + "neigh free for %06x %pI6\n", + IPOIB_QPN(neigh->daddr), + neigh->daddr + 4); + kfree(neigh); + if (atomic_dec_and_test(&priv->ntbl.entries)) { + if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) + complete(&priv->ntbl.flushed); + } +} + +static void ipoib_neigh_reclaim(struct rcu_head *rp) +{ + /* Called as a result of removal from hash table */ + struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); + /* note TX context may hold another ref */ + ipoib_neigh_put(neigh); +} + +void ipoib_neigh_free(struct ipoib_neigh *neigh) +{ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh __rcu **np; + struct ipoib_neigh *n; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); 
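/*
 * Minimal sketch (generic, illustrative types) of the lookup/lifetime
 * pattern the neigh table code above relies on: readers walk the
 * RCU-protected bucket list and only keep an entry if
 * atomic_inc_not_zero() succeeds, so an entry whose last reference is
 * already gone counts as deleted and is reclaimed later via call_rcu().
 */
struct rcu_entry {
	struct rcu_entry __rcu *next;
	atomic_t refcnt;
	u32 key;
};

static struct rcu_entry *rcu_entry_get(struct rcu_entry __rcu **bucket, u32 key)
{
	struct rcu_entry *e;

	rcu_read_lock_bh();
	for (e = rcu_dereference_bh(*bucket); e;
	     e = rcu_dereference_bh(e->next)) {
		if (e->key != key)
			continue;
		if (!atomic_inc_not_zero(&e->refcnt))
			e = NULL;	/* concurrently deleted */
		break;
	}
	rcu_read_unlock_bh();
	return e;			/* caller owns one reference, or NULL */
}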
+ if (!htbl) + return; + + hash_val = ipoib_addr_hash(htbl, neigh->daddr); + np = &htbl->buckets[hash_val]; + for (n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock)); + n != NULL; + n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) { + if (n == neigh) { + /* found */ + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + return; + } else { + np = &n->hnext; + } + } +} + +static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh __rcu **buckets; + u32 size; + + clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + ntbl->htbl = NULL; + htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); + if (!htbl) + return -ENOMEM; + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + size = roundup_pow_of_two(arp_tbl.gc_thresh3); + buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL); + if (!buckets) { + kfree(htbl); + return -ENOMEM; + } + htbl->size = size; + htbl->mask = (size - 1); + htbl->buckets = buckets; + RCU_INIT_POINTER(ntbl->htbl, htbl); + htbl->ntbl = ntbl; + atomic_set(&ntbl->entries, 0); + + /* start garbage collection */ + clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + queue_delayed_work(priv->wq, &priv->neigh_reap_task, + arp_tbl.gc_interval); + + return 0; +} + +static void neigh_hash_free_rcu(struct rcu_head *head) +{ + struct ipoib_neigh_hash *htbl = container_of(head, + struct ipoib_neigh_hash, + rcu); + struct ipoib_neigh __rcu **buckets = htbl->buckets; + struct ipoib_neigh_table *ntbl = htbl->ntbl; + + kfree(buckets); + kfree(htbl); + complete(&ntbl->deleted); +} + +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i; + + /* remove all neigh connected to a given path or mcast */ + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* delete neighs belong to this parent */ + if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i, wait_flushed = 0; + + init_completion(&priv->ntbl.flushed); + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) + goto out_unlock; + + wait_flushed = atomic_read(&priv->ntbl.entries); + if (!wait_flushed) + goto free_htbl; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = 
rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } + } + +free_htbl: + rcu_assign_pointer(ntbl->htbl, NULL); + call_rcu(&htbl->rcu, neigh_hash_free_rcu); + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); + if (wait_flushed) + wait_for_completion(&priv->ntbl.flushed); +} + +static void ipoib_neigh_hash_uninit(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int stopped; + + ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); + init_completion(&priv->ntbl.deleted); + + /* Stop GC if called at init fail need to cancel work */ + stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + if (!stopped) + cancel_delayed_work(&priv->neigh_reap_task); + + ipoib_flush_neighs(priv); + + wait_for_completion(&priv->ntbl.deleted); +} + +static void ipoib_napi_add(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + netif_napi_add(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC); + netif_napi_add(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE); +} + +static void ipoib_napi_del(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + netif_napi_del(&priv->recv_napi); + netif_napi_del(&priv->send_napi); +} + +static void ipoib_dev_uninit_default(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_transport_dev_cleanup(dev); + + ipoib_napi_del(dev); + + ipoib_cm_dev_cleanup(dev); + + kfree(priv->rx_ring); + vfree(priv->tx_ring); + + priv->rx_ring = NULL; + priv->tx_ring = NULL; +} + +static int ipoib_dev_init_default(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_napi_add(dev); + + /* Allocate RX/TX "rings" to hold queued skbs */ + priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, + GFP_KERNEL); + if (!priv->rx_ring) + goto out; + + priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); + if (!priv->tx_ring) { + pr_warn("%s: failed to allocate TX ring (%d entries)\n", + priv->ca->name, ipoib_sendq_size); + goto out_rx_ring_cleanup; + } + + /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ + + if (ipoib_transport_dev_init(dev, priv->ca)) { + pr_warn("%s: ipoib_transport_dev_init failed\n", + priv->ca->name); + goto out_tx_ring_cleanup; + } + + /* after qp created set dev address */ + priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; + priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff; + priv->dev->dev_addr[3] = (priv->qp->qp_num) & 0xff; + + return 0; + +out_tx_ring_cleanup: + vfree(priv->tx_ring); + +out_rx_ring_cleanup: + kfree(priv->rx_ring); + +out: + ipoib_napi_del(dev); + return -ENOMEM; +} + +static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, + int cmd) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (!priv->rn_ops->ndo_do_ioctl) + return -EOPNOTSUPP; + + return priv->rn_ops->ndo_do_ioctl(dev, ifr, cmd); +} + +int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret = -ENOMEM; + + priv->ca = ca; + priv->port = port; + priv->qp = NULL; + + /* + * the various IPoIB tasks assume they will never race against + * themselves, so always use a single thread workqueue + */ + priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM); + if 
(!priv->wq) { + pr_warn("%s: failed to allocate device WQ\n", dev->name); + goto out; + } + + /* create pd, which used both for control and datapath*/ + priv->pd = ib_alloc_pd(priv->ca, 0); + if (IS_ERR(priv->pd)) { + pr_warn("%s: failed to allocate PD\n", ca->name); + goto clean_wq; + } + + ret = priv->rn_ops->ndo_init(dev); + if (ret) { + pr_warn("%s failed to init HW resource\n", dev->name); + goto out_free_pd; + } + + if (ipoib_neigh_hash_init(priv) < 0) { + pr_warn("%s failed to init neigh hash\n", dev->name); + goto out_dev_uninit; + } + + if (dev->flags & IFF_UP) { + if (ipoib_ib_dev_open(dev)) { + pr_warn("%s failed to open device\n", dev->name); + ret = -ENODEV; + goto out_dev_uninit; + } + } + + return 0; + +out_dev_uninit: + ipoib_ib_dev_cleanup(dev); + +out_free_pd: + if (priv->pd) { + ib_dealloc_pd(priv->pd); + priv->pd = NULL; + } + +clean_wq: + if (priv->wq) { + destroy_workqueue(priv->wq); + priv->wq = NULL; + } + +out: + return ret; +} + +void ipoib_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev), *cpriv, *tcpriv; + LIST_HEAD(head); + + ASSERT_RTNL(); + + /* Delete any child interfaces first */ + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + /* Stop GC on child */ + set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); + cancel_delayed_work(&cpriv->neigh_reap_task); + unregister_netdevice_queue(cpriv->dev, &head); + } + unregister_netdevice_many(&head); + + ipoib_neigh_hash_uninit(dev); + + ipoib_ib_dev_cleanup(dev); + + /* no more works over the priv->wq */ + if (priv->wq) { + flush_workqueue(priv->wq); + destroy_workqueue(priv->wq); + priv->wq = NULL; + } +} + +static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state); +} + +static int ipoib_get_vf_config(struct net_device *dev, int vf, + struct ifla_vf_info *ivf) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int err; + + err = ib_get_vf_config(priv->ca, vf, priv->port, ivf); + if (err) + return err; + + ivf->vf = vf; + + return 0; +} + +static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID) + return -EINVAL; + + return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type); +} + +static int ipoib_get_vf_stats(struct net_device *dev, int vf, + struct ifla_vf_stats *vf_stats) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats); +} + +static const struct header_ops ipoib_header_ops = { + .create = ipoib_hard_header, +}; + +static const struct net_device_ops ipoib_netdev_ops_pf = { + .ndo_uninit = ipoib_uninit, + .ndo_open = ipoib_open, + .ndo_stop = ipoib_stop, + .ndo_change_mtu = ipoib_change_mtu, + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + .ndo_set_rx_mode = ipoib_set_mcast_list, + .ndo_get_iflink = ipoib_get_iflink, + .ndo_set_vf_link_state = ipoib_set_vf_link_state, + .ndo_get_vf_config = ipoib_get_vf_config, + .ndo_get_vf_stats = ipoib_get_vf_stats, + .ndo_set_vf_guid = ipoib_set_vf_guid, + .ndo_set_mac_address = ipoib_set_mac, + .ndo_get_stats64 = ipoib_get_stats, + .ndo_do_ioctl = ipoib_ioctl, +}; + +static const struct net_device_ops ipoib_netdev_ops_vf = { + .ndo_uninit = ipoib_uninit, + .ndo_open = ipoib_open, + .ndo_stop = 
ipoib_stop, + .ndo_change_mtu = ipoib_change_mtu, + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + .ndo_set_rx_mode = ipoib_set_mcast_list, + .ndo_get_iflink = ipoib_get_iflink, + .ndo_get_stats64 = ipoib_get_stats, + .ndo_do_ioctl = ipoib_ioctl, +}; + +void ipoib_setup_common(struct net_device *dev) +{ + dev->header_ops = &ipoib_header_ops; + + ipoib_set_ethtool_ops(dev); + + dev->watchdog_timeo = HZ; + + dev->flags |= IFF_BROADCAST | IFF_MULTICAST; + + dev->hard_header_len = IPOIB_HARD_LEN; + dev->addr_len = INFINIBAND_ALEN; + dev->type = ARPHRD_INFINIBAND; + dev->tx_queue_len = ipoib_sendq_size * 2; + dev->features = (NETIF_F_VLAN_CHALLENGED | + NETIF_F_HIGHDMA); + netif_keep_dst(dev); + + memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); +} + +static void ipoib_build_priv(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + priv->dev = dev; + spin_lock_init(&priv->lock); + init_rwsem(&priv->vlan_rwsem); + mutex_init(&priv->mcast_mutex); + mutex_init(&priv->sysfs_mutex); + + INIT_LIST_HEAD(&priv->path_list); + INIT_LIST_HEAD(&priv->child_intfs); + INIT_LIST_HEAD(&priv->dead_ahs); + INIT_LIST_HEAD(&priv->multicast_list); + + INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); + INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); + INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); + INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); + INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); + INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); + INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); + INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); +} + +static const struct net_device_ops ipoib_netdev_default_pf = { + .ndo_init = ipoib_dev_init_default, + .ndo_uninit = ipoib_dev_uninit_default, + .ndo_open = ipoib_ib_dev_open_default, + .ndo_stop = ipoib_ib_dev_stop_default, +}; + +static struct net_device +*ipoib_create_netdev_default(struct ib_device *hca, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)) +{ + struct net_device *dev; + struct rdma_netdev *rn; + + dev = alloc_netdev((int)sizeof(struct rdma_netdev), + name, + name_assign_type, setup); + if (!dev) + return NULL; + + rn = netdev_priv(dev); + + rn->send = ipoib_send; + rn->attach_mcast = ipoib_mcast_attach; + rn->detach_mcast = ipoib_mcast_detach; + rn->free_rdma_netdev = free_netdev; + rn->hca = hca; + + dev->netdev_ops = &ipoib_netdev_default_pf; + + return dev; +} + +static struct net_device *ipoib_get_netdev(struct ib_device *hca, u8 port, + const char *name) +{ + struct net_device *dev; + + if (hca->alloc_rdma_netdev) { + dev = hca->alloc_rdma_netdev(hca, port, + RDMA_NETDEV_IPOIB, name, + NET_NAME_UNKNOWN, + ipoib_setup_common); + if (IS_ERR_OR_NULL(dev) && PTR_ERR(dev) != -EOPNOTSUPP) + return NULL; + } + + if (!hca->alloc_rdma_netdev || PTR_ERR(dev) == -EOPNOTSUPP) + dev = ipoib_create_netdev_default(hca, name, NET_NAME_UNKNOWN, + ipoib_setup_common); + + return dev; +} + +struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port, + const char *name) +{ + struct net_device *dev; + struct ipoib_dev_priv *priv; + struct rdma_netdev *rn; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return NULL; + + dev = ipoib_get_netdev(hca, port, name); + if (!dev) + goto free_priv; + + priv->rn_ops = dev->netdev_ops; + + /* fixme : should be after the query_cap */ + if (priv->hca_caps & 
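/*
 * Sketch of the two-level private data set up in ipoib_intf_alloc(): the
 * net_device's private area holds the struct rdma_netdev, and the IPoIB
 * state hangs off rn->clnt_priv, which is what the ipoib_priv() helper
 * used throughout this file resolves to.  Shown for illustration only;
 * the real helper lives in ipoib.h.
 */
static inline struct ipoib_dev_priv *ipoib_priv_sketch(const struct net_device *dev)
{
	struct rdma_netdev *rn = netdev_priv(dev);

	return rn->clnt_priv;
}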
IB_DEVICE_VIRTUAL_FUNCTION) + dev->netdev_ops = &ipoib_netdev_ops_vf; + else + dev->netdev_ops = &ipoib_netdev_ops_pf; + + rn = netdev_priv(dev); + rn->clnt_priv = priv; + ipoib_build_priv(dev); + + return priv; +free_priv: + kfree(priv); + return NULL; +} + +static ssize_t show_pkey(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + + return sprintf(buf, "0x%04x\n", priv->pkey); +} +static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); + +static ssize_t show_umcast(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + + return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); +} + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + + if (umcast_val > 0) { + set_bit(IPOIB_FLAG_UMCAST, &priv->flags); + ipoib_warn(priv, "ignoring multicast groups joined directly " + "by userspace\n"); + } else + clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); +} + +static ssize_t set_umcast(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + + ipoib_set_umcast(to_net_dev(dev), umcast_val); + + return count; +} +static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast); + +int ipoib_add_umcast_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_umcast); +} + +static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid) +{ + struct ipoib_dev_priv *child_priv; + struct net_device *netdev = priv->dev; + + netif_addr_lock_bh(netdev); + + memcpy(&priv->local_gid.global.interface_id, + &gid->global.interface_id, + sizeof(gid->global.interface_id)); + memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid)); + clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + + netif_addr_unlock_bh(netdev); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + down_read(&priv->vlan_rwsem); + list_for_each_entry(child_priv, &priv->child_intfs, list) + set_base_guid(child_priv, gid); + up_read(&priv->vlan_rwsem); + } +} + +static int ipoib_check_lladdr(struct net_device *dev, + struct sockaddr_storage *ss) +{ + union ib_gid *gid = (union ib_gid *)(ss->__data + 4); + int ret = 0; + + netif_addr_lock_bh(dev); + + /* Make sure the QPN, reserved and subnet prefix match the current + * lladdr, it also makes sure the lladdr is unicast. 
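/*
 * Illustrative helper, not in the patch: the only part of the 20-byte
 * link-layer address that ipoib_check_lladdr() accepts changing, and that
 * ipoib_set_mac() below then applies, is the trailing 8-byte interface ID
 * (port GUID); the flags/QPN bytes and the subnet prefix must match the
 * current dev_addr, and the new interface ID must be non-zero.
 */
static void build_candidate_lladdr(const u8 *cur_dev_addr,
				   const u8 *new_interface_id,
				   u8 *out /* INFINIBAND_ALEN bytes */)
{
	memcpy(out, cur_dev_addr, 12);		/* flags+QPN (4) + subnet prefix (8) kept */
	memcpy(out + 12, new_interface_id, 8);	/* new port GUID                          */
}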
+ */ + if (memcmp(dev->dev_addr, ss->__data, + 4 + sizeof(gid->global.subnet_prefix)) || + gid->global.interface_id == 0) + ret = -EINVAL; + + netif_addr_unlock_bh(dev); + + return ret; +} + +static int ipoib_set_mac(struct net_device *dev, void *addr) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct sockaddr_storage *ss = addr; + int ret; + + if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) + return -EBUSY; + + ret = ipoib_check_lladdr(dev, ss); + if (ret) + return ret; + + set_base_guid(priv, (union ib_gid *)(ss->__data + 4)); + + queue_work(ipoib_workqueue, &priv->flush_light); + + return 0; +} + +static ssize_t create_child(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int pkey; + int ret; + + if (sscanf(buf, "%i", &pkey) != 1) + return -EINVAL; + + if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) + return -EINVAL; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + pkey |= 0x8000; + + ret = ipoib_vlan_add(to_net_dev(dev), pkey); + + return ret ? ret : count; +} +static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child); + +static ssize_t delete_child(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int pkey; + int ret; + + if (sscanf(buf, "%i", &pkey) != 1) + return -EINVAL; + + if (pkey < 0 || pkey > 0xffff) + return -EINVAL; + + ret = ipoib_vlan_delete(to_net_dev(dev), pkey); + + return ret ? ret : count; + +} +static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child); + +int ipoib_add_pkey_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_pkey); +} + +void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) +{ + priv->hca_caps = hca->attrs.device_cap_flags; + + if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { + priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + priv->dev->hw_features |= NETIF_F_TSO; + + priv->dev->features |= priv->dev->hw_features; + } +} + +static struct net_device *ipoib_add_port(const char *format, + struct ib_device *hca, u8 port) +{ + struct ipoib_dev_priv *priv; + struct ib_port_attr attr; + struct rdma_netdev *rn; + int result = -ENOMEM; + + priv = ipoib_intf_alloc(hca, port, format); + if (!priv) { + pr_warn("%s, %d: ipoib_intf_alloc failed\n", hca->name, port); + goto alloc_mem_failed; + } + + SET_NETDEV_DEV(priv->dev, hca->dev.parent); + priv->dev->dev_id = port - 1; + + result = ib_query_port(hca, port, &attr); + if (result) { + pr_warn("%s: ib_query_port %d failed\n", hca->name, port); + goto device_init_failed; + } + + priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); + + /* MTU will be reset when mcast join happens */ + priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + priv->dev->max_mtu = IPOIB_CM_MTU; + + priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh); + + result = ib_query_pkey(hca, port, 0, &priv->pkey); + if (result) { + pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } + + ipoib_set_dev_features(priv, hca); + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. 
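/*
 * Illustrative only: where the P_Key written into dev->broadcast[8..9]
 * just below ends up inside the broadcast MGID (dev->broadcast + 4),
 * i.e. the IPoIB broadcast group ff12:401b:<P_Key>::ffff, shown here for
 * the default full-member partition key 0xffff.
 */
static const u8 bcast_mgid_example[16] = {
	0xff, 0x12, 0x40, 0x1b,			/* IPv4-over-IB multicast prefix    */
	0xff, 0xff,				/* P_Key (dev->broadcast[8..9])     */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0xff, 0xff,				/* all-ones tail of the bcast group */
};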
+ */ + priv->pkey |= 0x8000; + + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + + result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); + if (result) { + pr_warn("%s: ib_query_gid port %d failed (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } + + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, + sizeof(union ib_gid)); + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + + result = ipoib_dev_init(priv->dev, hca, port); + if (result) { + pr_warn("%s: failed to initialize port %d (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } + + INIT_IB_EVENT_HANDLER(&priv->event_handler, + priv->ca, ipoib_event); + ib_register_event_handler(&priv->event_handler); + + /* call event handler to ensure pkey in sync */ + queue_work(ipoib_workqueue, &priv->flush_heavy); + + result = register_netdev(priv->dev); + if (result) { + pr_warn("%s: couldn't register ipoib port %d; error %d\n", + hca->name, port, result); + goto register_failed; + } + + result = -ENOMEM; + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) + goto sysfs_failed; + if (device_create_file(&priv->dev->dev, &dev_attr_delete_child)) + goto sysfs_failed; + + return priv->dev; + +sysfs_failed: + unregister_netdev(priv->dev); + +register_failed: + ib_unregister_event_handler(&priv->event_handler); + flush_workqueue(ipoib_workqueue); + /* Stop GC if started before flush */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); + flush_workqueue(priv->wq); + ipoib_dev_cleanup(priv->dev); + +device_init_failed: + rn = netdev_priv(priv->dev); + rn->free_rdma_netdev(priv->dev); + kfree(priv); + +alloc_mem_failed: + return ERR_PTR(result); +} + +static void ipoib_add_one(struct ib_device *device) +{ + struct list_head *dev_list; + struct net_device *dev; + struct ipoib_dev_priv *priv; + int p; + int count = 0; + + dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); + if (!dev_list) + return; + + INIT_LIST_HEAD(dev_list); + + for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + if (!rdma_protocol_ib(device, p)) + continue; + dev = ipoib_add_port("ib%d", device, p); + if (!IS_ERR(dev)) { + priv = ipoib_priv(dev); + list_add_tail(&priv->list, dev_list); + count++; + } + } + + if (!count) { + kfree(dev_list); + return; + } + + ib_set_client_data(device, &ipoib_client, dev_list); +} + +static void ipoib_remove_one(struct ib_device *device, void *client_data) +{ + struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv; + struct list_head *dev_list = client_data; + + if (!dev_list) + return; + + list_for_each_entry_safe(priv, tmp, dev_list, list) { + struct rdma_netdev *parent_rn = netdev_priv(priv->dev); + + ib_unregister_event_handler(&priv->event_handler); + flush_workqueue(ipoib_workqueue); + + /* mark interface in the middle of destruction */ + set_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags); + + rtnl_lock(); + dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); + rtnl_unlock(); + + /* Stop GC */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); + flush_workqueue(priv->wq); + + /* Wrap rtnl_lock/unlock with mutex to protect sysfs calls */ + mutex_lock(&priv->sysfs_mutex); + unregister_netdev(priv->dev); + mutex_unlock(&priv->sysfs_mutex); + + 
parent_rn->free_rdma_netdev(priv->dev); + + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + struct rdma_netdev *child_rn; + + child_rn = netdev_priv(cpriv->dev); + child_rn->free_rdma_netdev(cpriv->dev); + kfree(cpriv); + } + + kfree(priv); + } + + kfree(dev_list); +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static struct notifier_block ipoib_netdev_notifier = { + .notifier_call = ipoib_netdev_event, +}; +#endif + +static int __init ipoib_init_module(void) +{ + int ret; + + ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); + ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); + + ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); + ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); +#ifdef CONFIG_INFINIBAND_IPOIB_CM + ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); + ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0); +#endif + + /* + * When copying small received packets, we only copy from the + * linear data part of the SKB, so we rely on this condition. + */ + BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); + + ret = ipoib_register_debugfs(); + if (ret) + return ret; + + /* + * We create a global workqueue here that is used for all flush + * operations. However, if you attempt to flush a workqueue + * from a task on that same workqueue, it deadlocks the system. + * We want to be able to flush the tasks associated with a + * specific net device, so we also create a workqueue for each + * netdevice. We queue up the tasks for that device only on + * its private workqueue, and we only queue up flush events + * on our global flush workqueue. This avoids the deadlocks. + */ + ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", + WQ_MEM_RECLAIM); + if (!ipoib_workqueue) { + ret = -ENOMEM; + goto err_fs; + } + + ib_sa_register_client(&ipoib_sa_client); + + ret = ib_register_client(&ipoib_client); + if (ret) + goto err_sa; + + ret = ipoib_netlink_init(); + if (ret) + goto err_client; + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + register_netdevice_notifier(&ipoib_netdev_notifier); +#endif + return 0; + +err_client: + ib_unregister_client(&ipoib_client); + +err_sa: + ib_sa_unregister_client(&ipoib_sa_client); + destroy_workqueue(ipoib_workqueue); + +err_fs: + ipoib_unregister_debugfs(); + + return ret; +} + +static void __exit ipoib_cleanup_module(void) +{ +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + unregister_netdevice_notifier(&ipoib_netdev_notifier); +#endif + ipoib_netlink_fini(); + ib_unregister_client(&ipoib_client); + ib_sa_unregister_client(&ipoib_sa_client); + ipoib_unregister_debugfs(); + destroy_workqueue(ipoib_workqueue); +} + +module_init(ipoib_init_module); +module_exit(ipoib_cleanup_module); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c new file mode 100644 index 0000000..9b3f47a --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -0,0 +1,1067 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ipoib.h" + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static int mcast_debug_level; + +module_param(mcast_debug_level, int, 0644); +MODULE_PARM_DESC(mcast_debug_level, + "Enable multicast debug tracing if > 0"); +#endif + +struct ipoib_mcast_iter { + struct net_device *dev; + union ib_gid mgid; + unsigned long created; + unsigned int queuelen; + unsigned int complete; + unsigned int send_only; +}; + +/* join state that allows creating mcg with sendonly member request */ +#define SENDONLY_FULLMEMBER_JOIN 8 + +/* + * This should be called with the priv->lock held + */ +static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv, + struct ipoib_mcast *mcast, + bool delay) +{ + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + + /* + * We will be scheduling *something*, so cancel whatever is + * currently scheduled first + */ + cancel_delayed_work(&priv->mcast_task); + if (mcast && delay) { + /* + * We had a failure and want to schedule a retry later + */ + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + mcast->delay_until = jiffies + (mcast->backoff * HZ); + /* + * Mark this mcast for its delay, but restart the + * task immediately. 
The join task will make sure to + * clear out all entries without delays, and then + * schedule itself to run again when the earliest + * delay expires + */ + queue_delayed_work(priv->wq, &priv->mcast_task, 0); + } else if (delay) { + /* + * Special case of retrying after a failure to + * allocate the broadcast multicast group, wait + * 1 second and try again + */ + queue_delayed_work(priv->wq, &priv->mcast_task, HZ); + } else + queue_delayed_work(priv->wq, &priv->mcast_task, 0); +} + +static void ipoib_mcast_free(struct ipoib_mcast *mcast) +{ + struct net_device *dev = mcast->dev; + int tx_dropped = 0; + + ipoib_dbg_mcast(ipoib_priv(dev), "deleting multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + /* remove all neigh connected to this mcast */ + ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw); + + if (mcast->ah) + ipoib_put_ah(mcast->ah); + + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + + netif_tx_lock_bh(dev); + dev->stats.tx_dropped += tx_dropped; + netif_tx_unlock_bh(dev); + + kfree(mcast); +} + +static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, + int can_sleep) +{ + struct ipoib_mcast *mcast; + + mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC); + if (!mcast) + return NULL; + + mcast->dev = dev; + mcast->created = jiffies; + mcast->delay_until = jiffies; + mcast->backoff = 1; + + INIT_LIST_HEAD(&mcast->list); + INIT_LIST_HEAD(&mcast->neigh_list); + skb_queue_head_init(&mcast->pkt_queue); + + return mcast; +} + +static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rb_node *n = priv->multicast_tree.rb_node; + + while (n) { + struct ipoib_mcast *mcast; + int ret; + + mcast = rb_entry(n, struct ipoib_mcast, rb_node); + + ret = memcmp(mgid, mcast->mcmember.mgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return mcast; + } + + return NULL; +} + +static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL; + + while (*n) { + struct ipoib_mcast *tmcast; + int ret; + + pn = *n; + tmcast = rb_entry(pn, struct ipoib_mcast, rb_node); + + ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = &pn->rb_left; + else if (ret > 0) + n = &pn->rb_right; + else + return -EEXIST; + } + + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &priv->multicast_tree); + + return 0; +} + +static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, + struct ib_sa_mcmember_rec *mcmember) +{ + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_ah *ah; + struct rdma_ah_attr av; + int ret; + int set_qkey = 0; + + mcast->mcmember = *mcmember; + + /* Set the multicast MTU and cached Q_Key before we attach if it's + * the broadcast group. 
+ */ + if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid))) { + spin_lock_irq(&priv->lock); + if (!priv->broadcast) { + spin_unlock_irq(&priv->lock); + return -EAGAIN; + } + /*update priv member according to the new mcast*/ + priv->broadcast->mcmember.qkey = mcmember->qkey; + priv->broadcast->mcmember.mtu = mcmember->mtu; + priv->broadcast->mcmember.traffic_class = mcmember->traffic_class; + priv->broadcast->mcmember.rate = mcmember->rate; + priv->broadcast->mcmember.sl = mcmember->sl; + priv->broadcast->mcmember.flow_label = mcmember->flow_label; + priv->broadcast->mcmember.hop_limit = mcmember->hop_limit; + /* assume if the admin and the mcast are the same both can be changed */ + if (priv->mcast_mtu == priv->admin_mtu) + priv->admin_mtu = + priv->mcast_mtu = + IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + else + priv->mcast_mtu = + IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + + priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); + spin_unlock_irq(&priv->lock); + priv->tx_wr.remote_qkey = priv->qkey; + set_qkey = 1; + } + + if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_warn(priv, "multicast group %pI6 already attached\n", + mcast->mcmember.mgid.raw); + + return 0; + } + + ret = rn->attach_mcast(dev, priv->ca, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid), + set_qkey, priv->qkey); + if (ret < 0) { + ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags); + return ret; + } + } + + memset(&av, 0, sizeof(av)); + av.type = rdma_ah_find_type(priv->ca, priv->port); + rdma_ah_set_dlid(&av, be16_to_cpu(mcast->mcmember.mlid)), + rdma_ah_set_port_num(&av, priv->port); + rdma_ah_set_sl(&av, mcast->mcmember.sl); + rdma_ah_set_static_rate(&av, mcast->mcmember.rate); + + rdma_ah_set_grh(&av, &mcast->mcmember.mgid, + be32_to_cpu(mcast->mcmember.flow_label), + 0, mcast->mcmember.hop_limit, + mcast->mcmember.traffic_class); + + ah = ipoib_create_ah(dev, priv->pd, &av); + if (IS_ERR(ah)) { + ipoib_warn(priv, "ib_address_create failed %ld\n", + -PTR_ERR(ah)); + /* use original error */ + return PTR_ERR(ah); + } + spin_lock_irq(&priv->lock); + mcast->ah = ah; + spin_unlock_irq(&priv->lock); + + ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n", + mcast->mcmember.mgid.raw, + mcast->ah->ah, + be16_to_cpu(mcast->mcmember.mlid), + mcast->mcmember.sl); + + /* actually send any queued packets */ + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); + + netif_tx_unlock_bh(dev); + + skb->dev = dev; + + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s:dev_queue_xmit failed to re-queue packet, ret:%d\n", + __func__, ret); + netif_tx_lock_bh(dev); + } + netif_tx_unlock_bh(dev); + + return 0; +} + +void ipoib_mcast_carrier_on_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + carrier_on_task); + struct ib_port_attr attr; + + if (ib_query_port(priv->ca, priv->port, &attr) || + attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); + return; + } + /* + * Check if can send sendonly MCG's with sendonly-fullmember join state. 
+ * It done here after the successfully join to the broadcast group, + * because the broadcast group must always be joined first and is always + * re-joined if the SM changes substantially. + */ + priv->sm_fullmember_sendonly_support = + ib_sa_sendonly_fullmem_support(&ipoib_sa_client, + priv->ca, priv->port); + /* + * Take rtnl_lock to avoid racing with ipoib_stop() and + * turning the carrier back on while a device is being + * removed. However, ipoib_stop() will attempt to flush + * the workqueue while holding the rtnl lock, so loop + * on trylock until either we get the lock or we see + * FLAG_OPER_UP go away as that signals that we are bailing + * and can safely ignore the carrier on work. + */ + while (!rtnl_trylock()) { + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + else + msleep(20); + } + if (!ipoib_cm_admin_enabled(priv->dev)) + dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu)); + netif_carrier_on(priv->dev); + rtnl_unlock(); +} + +static int ipoib_mcast_join_complete(int status, + struct ib_sa_multicast *multicast) +{ + struct ipoib_mcast *mcast = multicast->context; + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg_mcast(priv, "%sjoin completion for %pI6 (status %d)\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? + "sendonly " : "", + mcast->mcmember.mgid.raw, status); + + /* We trap for port events ourselves. */ + if (status == -ENETRESET) { + status = 0; + goto out; + } + + if (!status) + status = ipoib_mcast_join_finish(mcast, &multicast->rec); + + if (!status) { + mcast->backoff = 1; + mcast->delay_until = jiffies; + + /* + * Defer carrier on work to priv->wq to avoid a + * deadlock on rtnl_lock here. Requeue our multicast + * work too, which will end up happening right after + * our carrier on task work and will allow us to + * send out all of the non-broadcast joins + */ + if (mcast == priv->broadcast) { + spin_lock_irq(&priv->lock); + queue_work(priv->wq, &priv->carrier_on_task); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + goto out_locked; + } + } else { + bool silent_fail = + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + status == -EINVAL; + + if (mcast->logcount < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN || + silent_fail) { + ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", + mcast->mcmember.mgid.raw, status); + } else { + ipoib_warn(priv, "%smulticast join failed for %pI6, status %d\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", + mcast->mcmember.mgid.raw, status); + } + + if (!silent_fail) + mcast->logcount++; + } + + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + mcast->backoff >= 2) { + /* + * We only retry sendonly joins once before we drop + * the packet and quit trying to deal with the + * group. However, we leave the group in the + * mcast list as an unjoined group. If we want to + * try joining again, we simply queue up a packet + * and restart the join thread. The empty queue + * is why the join thread ignores this group. 
+ */ + mcast->backoff = 1; + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + netif_tx_unlock_bh(dev); + } else { + spin_lock_irq(&priv->lock); + /* Requeue this join task with a backoff delay */ + __ipoib_mcast_schedule_join_thread(priv, mcast, 1); + goto out_locked; + } + } +out: + spin_lock_irq(&priv->lock); +out_locked: + /* + * Make sure to set mcast->mc before we clear the busy flag to avoid + * racing with code that checks for BUSY before checking mcast->mc + */ + if (status) + mcast->mc = NULL; + else + mcast->mc = multicast; + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + spin_unlock_irq(&priv->lock); + complete(&mcast->done); + + return status; +} + +/* + * Caller must hold 'priv->lock' + */ +static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_sa_multicast *multicast; + struct ib_sa_mcmember_rec rec = { + .join_state = 1 + }; + ib_sa_comp_mask comp_mask; + int ret = 0; + + if (!priv->broadcast || + !test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return -EINVAL; + + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + + ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw); + + rec.mgid = mcast->mcmember.mgid; + rec.port_gid = priv->local_gid; + rec.pkey = cpu_to_be16(priv->pkey); + + comp_mask = + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | + IB_SA_MCMEMBER_REC_JOIN_STATE; + + if (mcast != priv->broadcast) { + /* + * RFC 4391: + * The MGID MUST use the same P_Key, Q_Key, SL, MTU, + * and HopLimit as those used in the broadcast-GID. The rest + * of attributes SHOULD follow the values used in the + * broadcast-GID as well. + */ + comp_mask |= + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT; + + rec.qkey = priv->broadcast->mcmember.qkey; + rec.mtu_selector = IB_SA_EQ; + rec.mtu = priv->broadcast->mcmember.mtu; + rec.traffic_class = priv->broadcast->mcmember.traffic_class; + rec.rate_selector = IB_SA_EQ; + rec.rate = priv->broadcast->mcmember.rate; + rec.sl = priv->broadcast->mcmember.sl; + rec.flow_label = priv->broadcast->mcmember.flow_label; + rec.hop_limit = priv->broadcast->mcmember.hop_limit; + + /* + * Send-only IB Multicast joins work at the core IB layer but + * require specific SM support. + * We can use such joins here only if the current SM supports that feature. + * However, if not, we emulate an Ethernet multicast send, + * which does not require a multicast subscription and will + * still send properly. The most appropriate thing to + * do is to create the group if it doesn't exist as that + * most closely emulates the behavior, from a user space + * application perspective, of Ethernet multicast operation. 
+ */ + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + priv->sm_fullmember_sendonly_support) + /* SM supports sendonly-fullmember, otherwise fallback to full-member */ + rec.join_state = SENDONLY_FULLMEMBER_JOIN; + } + spin_unlock_irq(&priv->lock); + + multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, + &rec, comp_mask, GFP_KERNEL, + ipoib_mcast_join_complete, mcast); + spin_lock_irq(&priv->lock); + if (IS_ERR(multicast)) { + ret = PTR_ERR(multicast); + ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); + /* Requeue this join task with a backoff delay */ + __ipoib_mcast_schedule_join_thread(priv, mcast, 1); + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + spin_unlock_irq(&priv->lock); + complete(&mcast->done); + spin_lock_irq(&priv->lock); + } + return 0; +} + +void ipoib_mcast_join_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, mcast_task.work); + struct net_device *dev = priv->dev; + struct ib_port_attr port_attr; + unsigned long delay_until = 0; + struct ipoib_mcast *mcast = NULL; + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + + if (ib_query_port(priv->ca, priv->port, &port_attr)) { + ipoib_dbg(priv, "ib_query_port() failed\n"); + return; + } + if (port_attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", + port_attr.state); + return; + } + priv->local_lid = port_attr.lid; + netif_addr_lock_bh(dev); + + if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + netif_addr_unlock_bh(dev); + return; + } + netif_addr_unlock_bh(dev); + + spin_lock_irq(&priv->lock); + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + goto out; + + if (!priv->broadcast) { + struct ipoib_mcast *broadcast; + + broadcast = ipoib_mcast_alloc(dev, 0); + if (!broadcast) { + ipoib_warn(priv, "failed to allocate broadcast group\n"); + /* + * Restart us after a 1 second delay to retry + * creating our broadcast group and attaching to + * it. Until this succeeds, this ipoib dev is + * completely stalled (multicast wise). 
+ */ + __ipoib_mcast_schedule_join_thread(priv, NULL, 1); + goto out; + } + + memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid)); + priv->broadcast = broadcast; + + __ipoib_mcast_add(dev, priv->broadcast); + } + + if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + if (IS_ERR_OR_NULL(priv->broadcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) { + mcast = priv->broadcast; + if (mcast->backoff > 1 && + time_before(jiffies, mcast->delay_until)) { + delay_until = mcast->delay_until; + mcast = NULL; + } + } + goto out; + } + + /* + * We'll never get here until the broadcast group is both allocated + * and attached + */ + list_for_each_entry(mcast, &priv->multicast_list, list) { + if (IS_ERR_OR_NULL(mcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && + (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) || + !skb_queue_empty(&mcast->pkt_queue))) { + if (mcast->backoff == 1 || + time_after_eq(jiffies, mcast->delay_until)) { + /* Found the next unjoined group */ + if (ipoib_mcast_join(dev, mcast)) { + spin_unlock_irq(&priv->lock); + return; + } + } else if (!delay_until || + time_before(mcast->delay_until, delay_until)) + delay_until = mcast->delay_until; + } + } + + mcast = NULL; + ipoib_dbg_mcast(priv, "successfully started all multicast joins\n"); + +out: + if (delay_until) { + cancel_delayed_work(&priv->mcast_task); + queue_delayed_work(priv->wq, &priv->mcast_task, + delay_until - jiffies); + } + if (mcast) + ipoib_mcast_join(dev, mcast); + + spin_unlock_irq(&priv->lock); +} + +void ipoib_mcast_start_thread(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + unsigned long flags; + + ipoib_dbg_mcast(priv, "starting multicast thread\n"); + + spin_lock_irqsave(&priv->lock, flags); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + spin_unlock_irqrestore(&priv->lock, flags); +} + +int ipoib_mcast_stop_thread(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg_mcast(priv, "stopping multicast thread\n"); + + cancel_delayed_work_sync(&priv->mcast_task); + + return 0; +} + +static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + int ret = 0; + + if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n"); + + if (!IS_ERR_OR_NULL(mcast->mc)) + ib_sa_free_multicast(mcast->mc); + + if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_dbg_mcast(priv, "leaving MGID %pI6\n", + mcast->mcmember.mgid.raw); + + /* Remove ourselves from the multicast group */ + ret = rn->detach_mcast(dev, priv->ca, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid)); + if (ret) + ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); + } else if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + ipoib_dbg(priv, "leaving with no mcmember but not a " + "SENDONLY join\n"); + + return 0; +} + +/* + * Check if the multicast group is sendonly. If so remove it from the maps + * and add to the remove list + */ +void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid, + struct list_head *remove_list) +{ + /* Is this multicast ? 
*/ + if (*mgid == 0xff) { + struct ipoib_mcast *mcast = __ipoib_mcast_find(priv->dev, mgid); + + if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + list_del(&mcast->list); + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_add_tail(&mcast->list, remove_list); + } + } +} + +void ipoib_mcast_remove_list(struct list_head *remove_list) +{ + struct ipoib_mcast *mcast, *tmcast; + + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ + list_for_each_entry_safe(mcast, tmcast, remove_list, list) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + wait_for_completion(&mcast->done); + + list_for_each_entry_safe(mcast, tmcast, remove_list, list) { + ipoib_mcast_leave(mcast->dev, mcast); + ipoib_mcast_free(mcast); + } +} + +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_mcast *mcast; + unsigned long flags; + void *mgid = daddr + 4; + + spin_lock_irqsave(&priv->lock, flags); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || + !priv->broadcast || + !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } + + mcast = __ipoib_mcast_find(dev, mgid); + if (!mcast || !mcast->ah) { + if (!mcast) { + /* Let's create a new send only group now */ + ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", + mgid); + + mcast = ipoib_mcast_alloc(dev, 0); + if (!mcast) { + ipoib_warn(priv, "unable to allocate memory " + "for multicast structure\n"); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } + + set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); + memcpy(mcast->mcmember.mgid.raw, mgid, + sizeof (union ib_gid)); + __ipoib_mcast_add(dev, mcast); + list_add_tail(&mcast->list, &priv->multicast_list); + } + if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) { + /* put pseudoheader back on for next time */ + skb_push(skb, sizeof(struct ipoib_pseudo_header)); + skb_queue_tail(&mcast->pkt_queue, skb); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + } + } else { + struct ipoib_neigh *neigh; + + spin_unlock_irqrestore(&priv->lock, flags); + neigh = ipoib_neigh_get(dev, daddr); + spin_lock_irqsave(&priv->lock, flags); + if (!neigh) { + neigh = ipoib_neigh_alloc(daddr, dev); + /* Make sure that the neigh will be added only + * once to mcast list. 
+ */ + if (neigh && list_empty(&neigh->list)) { + kref_get(&mcast->ah->ref); + neigh->ah = mcast->ah; + list_add_tail(&neigh->list, &mcast->neigh_list); + } + } + spin_unlock_irqrestore(&priv->lock, flags); + mcast->ah->last_send = rn->send(dev, skb, mcast->ah->ah, + IB_MULTICAST_QPN); + if (neigh) + ipoib_neigh_put(neigh); + return; + } + +unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +void ipoib_mcast_dev_flush(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + LIST_HEAD(remove_list); + struct ipoib_mcast *mcast, *tmcast; + unsigned long flags; + + mutex_lock(&priv->mcast_mutex); + ipoib_dbg_mcast(priv, "flushing multicast list\n"); + + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + list_del(&mcast->list); + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_add_tail(&mcast->list, &remove_list); + } + + if (priv->broadcast) { + rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); + list_add_tail(&priv->broadcast->list, &remove_list); + priv->broadcast = NULL; + } + + spin_unlock_irqrestore(&priv->lock, flags); + + ipoib_mcast_remove_list(&remove_list); + mutex_unlock(&priv->mcast_mutex); +} + +static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast) +{ + /* reserved QPN, prefix, scope */ + if (memcmp(addr, broadcast, 6)) + return 0; + /* signature lower, pkey */ + if (memcmp(addr + 7, broadcast + 7, 3)) + return 0; + return 1; +} + +void ipoib_mcast_restart_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, restart_task); + struct net_device *dev = priv->dev; + struct netdev_hw_addr *ha; + struct ipoib_mcast *mcast, *tmcast; + LIST_HEAD(remove_list); + unsigned long flags; + struct ib_sa_mcmember_rec rec; + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + /* + * shortcut...on shutdown flush is called next, just + * let it do all the work + */ + return; + + ipoib_dbg_mcast(priv, "restarting multicast task\n"); + + local_irq_save(flags); + netif_addr_lock(dev); + spin_lock(&priv->lock); + + /* + * Unfortunately, the networking core only gives us a list of all of + * the multicast hardware addresses. 
We need to figure out which ones + * are new and which ones have been removed + */ + + /* Clear out the found flag */ + list_for_each_entry(mcast, &priv->multicast_list, list) + clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + + /* Mark all of the entries that are found or don't exist */ + netdev_for_each_mc_addr(ha, dev) { + union ib_gid mgid; + + if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) + continue; + + memcpy(mgid.raw, ha->addr + 4, sizeof mgid); + + mcast = __ipoib_mcast_find(dev, &mgid); + if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + struct ipoib_mcast *nmcast; + + /* ignore group which is directly joined by userspace */ + if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && + !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) { + ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n", + mgid.raw); + continue; + } + + /* Not found or send-only group, let's add a new entry */ + ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n", + mgid.raw); + + nmcast = ipoib_mcast_alloc(dev, 0); + if (!nmcast) { + ipoib_warn(priv, "unable to allocate memory for multicast structure\n"); + continue; + } + + set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags); + + nmcast->mcmember.mgid = mgid; + + if (mcast) { + /* Destroy the send only entry */ + list_move_tail(&mcast->list, &remove_list); + + rb_replace_node(&mcast->rb_node, + &nmcast->rb_node, + &priv->multicast_tree); + } else + __ipoib_mcast_add(dev, nmcast); + + list_add_tail(&nmcast->list, &priv->multicast_list); + } + + if (mcast) + set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + } + + /* Remove all of the entries don't exist anymore */ + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) && + !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + ipoib_dbg_mcast(priv, "deleting multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + rb_erase(&mcast->rb_node, &priv->multicast_tree); + + /* Move to the remove list */ + list_move_tail(&mcast->list, &remove_list); + } + } + + spin_unlock(&priv->lock); + netif_addr_unlock(dev); + local_irq_restore(flags); + + ipoib_mcast_remove_list(&remove_list); + + /* + * Double check that we are still up + */ + if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + spin_lock_irqsave(&priv->lock, flags); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + spin_unlock_irqrestore(&priv->lock, flags); + } +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) +{ + struct ipoib_mcast_iter *iter; + + iter = kmalloc(sizeof *iter, GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->mgid.raw, 0, 16); + + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter) +{ + struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); + struct rb_node *n; + struct ipoib_mcast *mcast; + int ret = 1; + + spin_lock_irq(&priv->lock); + + n = rb_first(&priv->multicast_tree); + + while (n) { + mcast = rb_entry(n, struct ipoib_mcast, rb_node); + + if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw, + sizeof (union ib_gid)) < 0) { + iter->mgid = mcast->mcmember.mgid; + iter->created = mcast->created; + iter->queuelen = skb_queue_len(&mcast->pkt_queue); + iter->complete = !!mcast->ah; + iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY)); + + ret = 0; + + break; + } + + n = rb_next(n); + } + + 
spin_unlock_irq(&priv->lock);
+
+	return ret;
+}
+
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+			   union ib_gid *mgid,
+			   unsigned long *created,
+			   unsigned int *queuelen,
+			   unsigned int *complete,
+			   unsigned int *send_only)
+{
+	*mgid = iter->mgid;
+	*created = iter->created;
+	*queuelen = iter->queuelen;
+	*complete = iter->complete;
+	*send_only = iter->send_only;
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
new file mode 100644
index 0000000..3e44087
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. - All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include +#include /* For ARPHRD_xxx */ +#include +#include +#include "ipoib.h" + +static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = { + [IFLA_IPOIB_PKEY] = { .type = NLA_U16 }, + [IFLA_IPOIB_MODE] = { .type = NLA_U16 }, + [IFLA_IPOIB_UMCAST] = { .type = NLA_U16 }, +}; + +static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + u16 val; + + if (nla_put_u16(skb, IFLA_IPOIB_PKEY, priv->pkey)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_MODE, val)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_UMCAST, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_UMCAST, val)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int ipoib_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + u16 mode, umcast; + int ret = 0; + + if (data[IFLA_IPOIB_MODE]) { + mode = nla_get_u16(data[IFLA_IPOIB_MODE]); + if (mode == IPOIB_MODE_DATAGRAM) + ret = ipoib_set_mode(dev, "datagram\n"); + else if (mode == IPOIB_MODE_CONNECTED) + ret = ipoib_set_mode(dev, "connected\n"); + else + ret = -EINVAL; + + if (ret < 0) + goto out_err; + } + + if (data[IFLA_IPOIB_UMCAST]) { + umcast = nla_get_u16(data[IFLA_IPOIB_UMCAST]); + ipoib_set_umcast(dev, umcast); + } + +out_err: + return ret; +} + +static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct net_device *pdev; + struct ipoib_dev_priv *ppriv; + u16 child_pkey; + int err; + + if (!tb[IFLA_LINK]) + return -EINVAL; + + pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!pdev || pdev->type != ARPHRD_INFINIBAND) + return -ENODEV; + + ppriv = ipoib_priv(pdev); + + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) { + ipoib_warn(ppriv, "child creation disallowed for child devices\n"); + return -EINVAL; + } + + if (!data || !data[IFLA_IPOIB_PKEY]) { + ipoib_dbg(ppriv, "no pkey specified, using parent pkey\n"); + child_pkey = ppriv->pkey; + } else + child_pkey = nla_get_u16(data[IFLA_IPOIB_PKEY]); + + if (child_pkey == 0 || child_pkey == 0x8000) + return -EINVAL; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. 
+ */ + child_pkey |= 0x8000; + + err = __ipoib_vlan_add(ppriv, ipoib_priv(dev), + child_pkey, IPOIB_RTNL_CHILD); + + if (!err && data) + err = ipoib_changelink(dev, tb, data, extack); + return err; +} + +static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head) +{ + struct ipoib_dev_priv *priv, *ppriv; + + priv = ipoib_priv(dev); + ppriv = ipoib_priv(priv->parent); + + down_write(&ppriv->vlan_rwsem); + unregister_netdevice_queue(dev, head); + list_del(&priv->list); + up_write(&ppriv->vlan_rwsem); +} + +static size_t ipoib_get_size(const struct net_device *dev) +{ + return nla_total_size(2) + /* IFLA_IPOIB_PKEY */ + nla_total_size(2) + /* IFLA_IPOIB_MODE */ + nla_total_size(2); /* IFLA_IPOIB_UMCAST */ +} + +static struct rtnl_link_ops ipoib_link_ops __read_mostly = { + .kind = "ipoib", + .maxtype = IFLA_IPOIB_MAX, + .policy = ipoib_policy, + .priv_size = sizeof(struct ipoib_dev_priv), + .setup = ipoib_setup_common, + .newlink = ipoib_new_child_link, + .changelink = ipoib_changelink, + .dellink = ipoib_unregister_child_dev, + .get_size = ipoib_get_size, + .fill_info = ipoib_fill_info, +}; + +int __init ipoib_netlink_init(void) +{ + return rtnl_link_register(&ipoib_link_ops); +} + +void __exit ipoib_netlink_fini(void) +{ + rtnl_link_unregister(&ipoib_link_ops); +} + +MODULE_ALIAS_RTNL_LINK("ipoib"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c new file mode 100644 index 0000000..984a880 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "ipoib.h" + +int ipoib_mcast_attach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid, int set_qkey, u32 qkey) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_attr *qp_attr = NULL; + int ret; + u16 pkey_index; + + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ret = -ENXIO; + goto out; + } + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + + if (set_qkey) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + if (!qp_attr) + goto out; + + /* set correct QKey for QP */ + qp_attr->qkey = qkey; + ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); + if (ret) { + ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); + goto out; + } + } + + /* attach QP to multicast group */ + ret = ib_attach_mcast(priv->qp, mgid, mlid); + if (ret) + ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret); + +out: + kfree(qp_attr); + return ret; +} + +int ipoib_mcast_detach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + ret = ib_detach_mcast(priv->qp, mgid, mlid); + + return ret; +} + +int ipoib_init_qp(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + struct ib_qp_attr qp_attr; + int attr_mask; + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return -1; + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = 0; + qp_attr.port_num = priv->port; + qp_attr.pkey_index = priv->pkey_index; + attr_mask = + IB_QP_QKEY | + IB_QP_PORT | + IB_QP_PKEY_INDEX | + IB_QP_STATE; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret); + goto out_fail; + } + + qp_attr.qp_state = IB_QPS_RTR; + /* Can't set this in a INIT->RTR transition */ + attr_mask &= ~IB_QP_PORT; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret); + goto out_fail; + } + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + attr_mask |= IB_QP_SQ_PSN; + attr_mask &= ~IB_QP_PKEY_INDEX; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret); + goto out_fail; + } + + return 0; + +out_fail: + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to RESET state\n"); + + return ret; +} + +int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_init_attr init_attr = { + .cap = { + .max_send_wr = ipoib_sendq_size, + .max_recv_wr = ipoib_recvq_size, + .max_send_sge = min_t(u32, priv->ca->attrs.max_sge, + MAX_SKB_FRAGS + 1), + .max_recv_sge = IPOIB_UD_RX_SG + }, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_UD + }; + struct ib_cq_init_attr cq_attr = {}; + + int ret, size, req_vec; + int i; + + size = ipoib_recvq_size + 1; + ret = ipoib_cm_dev_init(dev); + if (!ret) { + size += ipoib_sendq_size; + if (ipoib_cm_has_srq(dev)) + size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ + else + size += ipoib_recvq_size * ipoib_max_conn_qp; + } else + if (ret != -ENOSYS) + return -ENODEV; + + req_vec = (priv->port - 1) * 2; + + cq_attr.cqe = size; + cq_attr.comp_vector = req_vec % priv->ca->num_comp_vectors; + priv->recv_cq = ib_create_cq(priv->ca, 
ipoib_ib_rx_completion, NULL, + priv, &cq_attr); + if (IS_ERR(priv->recv_cq)) { + pr_warn("%s: failed to create receive CQ\n", ca->name); + goto out_cm_dev_cleanup; + } + + cq_attr.cqe = ipoib_sendq_size; + cq_attr.comp_vector = (req_vec + 1) % priv->ca->num_comp_vectors; + priv->send_cq = ib_create_cq(priv->ca, ipoib_ib_tx_completion, NULL, + priv, &cq_attr); + if (IS_ERR(priv->send_cq)) { + pr_warn("%s: failed to create send CQ\n", ca->name); + goto out_free_recv_cq; + } + + if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) + goto out_free_send_cq; + + init_attr.send_cq = priv->send_cq; + init_attr.recv_cq = priv->recv_cq; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) + init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) + init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; + + priv->qp = ib_create_qp(priv->pd, &init_attr); + if (IS_ERR(priv->qp)) { + pr_warn("%s: failed to create QP\n", ca->name); + goto out_free_send_cq; + } + + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) + goto out_free_send_cq; + + for (i = 0; i < MAX_SKB_FRAGS + 1; ++i) + priv->tx_sge[i].lkey = priv->pd->local_dma_lkey; + + priv->tx_wr.wr.opcode = IB_WR_SEND; + priv->tx_wr.wr.sg_list = priv->tx_sge; + priv->tx_wr.wr.send_flags = IB_SEND_SIGNALED; + + priv->rx_sge[0].lkey = priv->pd->local_dma_lkey; + + priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + priv->rx_wr.num_sge = 1; + + priv->rx_wr.next = NULL; + priv->rx_wr.sg_list = priv->rx_sge; + + if (init_attr.cap.max_send_sge > 1) + dev->features |= NETIF_F_SG; + + priv->max_send_sge = init_attr.cap.max_send_sge; + + return 0; + +out_free_send_cq: + ib_destroy_cq(priv->send_cq); + +out_free_recv_cq: + ib_destroy_cq(priv->recv_cq); + +out_cm_dev_cleanup: + ipoib_cm_dev_cleanup(dev); + + return -ENODEV; +} + +void ipoib_transport_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (priv->qp) { + if (ib_destroy_qp(priv->qp)) + ipoib_warn(priv, "ib_qp_destroy failed\n"); + + priv->qp = NULL; + } + + if (ib_destroy_cq(priv->send_cq)) + ipoib_warn(priv, "ib_cq_destroy (send) failed\n"); + + if (ib_destroy_cq(priv->recv_cq)) + ipoib_warn(priv, "ib_cq_destroy (recv) failed\n"); +} + +void ipoib_event(struct ib_event_handler *handler, + struct ib_event *record) +{ + struct ipoib_dev_priv *priv = + container_of(handler, struct ipoib_dev_priv, event_handler); + + if (record->element.port_num != priv->port) + return; + + ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, + record->device->name, record->element.port_num); + + if (record->event == IB_EVENT_SM_CHANGE || + record->event == IB_EVENT_CLIENT_REREGISTER) { + queue_work(ipoib_workqueue, &priv->flush_light); + } else if (record->event == IB_EVENT_PORT_ERR || + record->event == IB_EVENT_PORT_ACTIVE || + record->event == IB_EVENT_LID_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_normal); + } else if (record->event == IB_EVENT_PKEY_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_heavy); + } else if (record->event == IB_EVENT_GID_CHANGE && + !test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + queue_work(ipoib_workqueue, &priv->flush_light); + } +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c new file mode 100644 index 0000000..55a9b71 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c 
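For reference, the ipoib_vlan.c hunk below, like the multicast path above, relies on the 20-byte IPoIB hardware-address layout: bytes 0-3 carry the QPN/flags and bytes 4-19 carry the GID, which is why ipoib_mcast_send() takes the MGID from daddr + 4 and why a child interface stores its P_Key (with the full-membership bit forced) in bytes 8-9. The user-space sketch that follows only illustrates that layout; it is not part of the patch, and IPOIB_HW_ADDR_LEN, addr_is_multicast() and set_child_pkey() are names invented for the example (the kernel constant is INFINIBAND_ALEN).

#include <stdint.h>
#include <stdio.h>

#define IPOIB_HW_ADDR_LEN 20	/* INFINIBAND_ALEN in the kernel headers */

/* Same test as "*mgid == 0xff" above: multicast GIDs start with 0xff,
 * and the GID begins at offset 4 of the hardware address. */
static int addr_is_multicast(const uint8_t *daddr)
{
	return daddr[4] == 0xff;
}

/* Mirrors what ipoib_new_child_link()/__ipoib_vlan_add() do for a child
 * interface: force the full-membership bit and store the P_Key in GID
 * bytes 4-5, i.e. hardware-address bytes 8-9. */
static void set_child_pkey(uint8_t *broadcast, uint16_t pkey)
{
	pkey |= 0x8000;
	broadcast[8] = pkey >> 8;
	broadcast[9] = pkey & 0xff;
}

int main(void)
{
	uint8_t brd[IPOIB_HW_ADDR_LEN] = { 0 };

	brd[4] = 0xff;			/* broadcast/multicast GID prefix */
	set_child_pkey(brd, 0x0001);
	printf("multicast=%d pkey=%02x%02x\n",
	       addr_is_multicast(brd), brd[8], brd[9]);
	return 0;
}

Running the sketch prints "multicast=1 pkey=8001", the same P_Key bytes a child device created with pkey 0x0001 would advertise in its broadcast address.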
@@ -0,0 +1,247 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include + +#include + +#include "ipoib.h" + +static ssize_t show_parent(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + return sprintf(buf, "%s\n", priv->parent->name); +} +static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); + +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int type) +{ + int result; + + priv->max_ib_mtu = ppriv->max_ib_mtu; + /* MTU will be reset when mcast join happens */ + priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + priv->parent = ppriv->dev; + set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); + + ipoib_set_dev_features(priv, ppriv->ca); + + priv->pkey = pkey; + + memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); + memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + priv->dev->broadcast[8] = pkey >> 8; + priv->dev->broadcast[9] = pkey & 0xff; + + result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port); + if (result < 0) { + ipoib_warn(ppriv, "failed to initialize subinterface: " + "device %s, port %d", + ppriv->ca->name, ppriv->port); + goto err; + } + + result = register_netdevice(priv->dev); + if (result) { + ipoib_warn(priv, "failed to initialize; error %i", result); + goto register_failed; + } + + /* RTNL childs don't need proprietary sysfs entries */ + if (type == IPOIB_LEGACY_CHILD) { + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + + if (device_create_file(&priv->dev->dev, &dev_attr_parent)) + goto sysfs_failed; + } + + priv->child_type = type; + list_add_tail(&priv->list, &ppriv->child_intfs); + + return 0; + +sysfs_failed: + result = -ENOMEM; + unregister_netdevice(priv->dev); + +register_failed: + ipoib_dev_cleanup(priv->dev); + +err: + return result; +} + +int ipoib_vlan_add(struct 
net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv; + char intf_name[IFNAMSIZ]; + struct ipoib_dev_priv *tpriv; + int result; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = ipoib_priv(pdev); + + if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) + return -EPERM; + + snprintf(intf_name, sizeof intf_name, "%s.%04x", + ppriv->dev->name, pkey); + + if (!mutex_trylock(&ppriv->sysfs_mutex)) + return restart_syscall(); + + if (!rtnl_trylock()) { + mutex_unlock(&ppriv->sysfs_mutex); + return restart_syscall(); + } + + if (!down_write_trylock(&ppriv->vlan_rwsem)) { + rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); + return restart_syscall(); + } + + priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); + if (!priv) { + result = -ENOMEM; + goto out; + } + + /* + * First ensure this isn't a duplicate. We check the parent device and + * then all of the legacy child interfaces to make sure the Pkey + * doesn't match. + */ + if (ppriv->pkey == pkey) { + result = -ENOTUNIQ; + goto out; + } + + list_for_each_entry(tpriv, &ppriv->child_intfs, list) { + if (tpriv->pkey == pkey && + tpriv->child_type == IPOIB_LEGACY_CHILD) { + result = -ENOTUNIQ; + goto out; + } + } + + result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); + +out: + up_write(&ppriv->vlan_rwsem); + rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); + + if (result && priv) { + struct rdma_netdev *rn; + + rn = netdev_priv(priv->dev); + rn->free_rdma_netdev(priv->dev); + kfree(priv); + } + + return result; +} + +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv, *tpriv; + struct net_device *dev = NULL; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = ipoib_priv(pdev); + + if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) + return -EPERM; + + if (!mutex_trylock(&ppriv->sysfs_mutex)) + return restart_syscall(); + + if (!rtnl_trylock()) { + mutex_unlock(&ppriv->sysfs_mutex); + return restart_syscall(); + } + + if (!down_write_trylock(&ppriv->vlan_rwsem)) { + rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); + return restart_syscall(); + } + + list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { + if (priv->pkey == pkey && + priv->child_type == IPOIB_LEGACY_CHILD) { + list_del(&priv->list); + dev = priv->dev; + break; + } + } + up_write(&ppriv->vlan_rwsem); + + if (dev) { + ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name); + unregister_netdevice(dev); + } + + rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); + + if (dev) { + struct rdma_netdev *rn; + + rn = netdev_priv(dev); + rn->free_rdma_netdev(priv->dev); + kfree(priv); + return 0; + } + + return -ENODEV; +} diff --git a/drivers/infiniband/ulp/iser/Kconfig b/drivers/infiniband/ulp/iser/Kconfig new file mode 100644 index 0000000..d00af71 --- /dev/null +++ b/drivers/infiniband/ulp/iser/Kconfig @@ -0,0 +1,12 @@ +config INFINIBAND_ISER + tristate "iSCSI Extensions for RDMA (iSER)" + depends on SCSI && INET && INFINIBAND_ADDR_TRANS + select SCSI_ISCSI_ATTRS + ---help--- + Support for the iSCSI Extensions for RDMA (iSER) Protocol + over InfiniBand. This allows you to access storage devices + that speak iSCSI over iSER over InfiniBand. + + The iSER protocol is defined by IETF. 
+ See + and diff --git a/drivers/infiniband/ulp/iser/Makefile b/drivers/infiniband/ulp/iser/Makefile new file mode 100644 index 0000000..fe6cd15 --- /dev/null +++ b/drivers/infiniband/ulp/iser/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_INFINIBAND_ISER) += ib_iser.o + +ib_iser-y := iser_verbs.o iser_initiator.o iser_memory.o \ + iscsi_iser.o diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c new file mode 100644 index 0000000..0336643 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -0,0 +1,1133 @@ +/* + * iSCSI Initiator over iSER Data-Path + * + * Copyright (C) 2004 Dmitry Yusupov + * Copyright (C) 2004 Alex Aizman + * Copyright (C) 2005 Mike Christie + * Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * maintained by openib-general@openib.org + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Credits: + * Christoph Hellwig + * FUJITA Tomonori + * Arne Redlich + * Zhenyu Wang + * Modified by: + * Erez Zilber + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "iscsi_iser.h" + +MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); + +static struct scsi_host_template iscsi_iser_sht; +static struct iscsi_transport iscsi_iser_transport; +static struct scsi_transport_template *iscsi_iser_scsi_transport; +static struct workqueue_struct *release_wq; +static DEFINE_MUTEX(unbind_iser_conn_mutex); +struct iser_global ig; + +int iser_debug_level = 0; +module_param_named(debug_level, iser_debug_level, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); + +static unsigned int iscsi_max_lun = 512; +module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO); +MODULE_PARM_DESC(max_lun, "Max LUNs to allow per session (default:512"); + +unsigned int iser_max_sectors = ISER_DEF_MAX_SECTORS; +module_param_named(max_sectors, iser_max_sectors, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_sectors, "Max number of sectors in a single scsi command (default:1024"); + +bool iser_always_reg = true; +module_param_named(always_register, iser_always_reg, bool, S_IRUGO); +MODULE_PARM_DESC(always_register, + "Always register memory, even for continuous memory regions (default:true)"); + +bool iser_pi_enable = false; +module_param_named(pi_enable, iser_pi_enable, bool, S_IRUGO); +MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); + +int iser_pi_guard; +module_param_named(pi_guard, iser_pi_guard, int, S_IRUGO); +MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]"); + +/* + * iscsi_iser_recv() - Process a successful recv completion + * @conn: iscsi connection + * @hdr: iscsi header + * @rx_data: buffer containing receive data payload + * @rx_data_len: length of rx_data + * + * Notes: In case of data length errors or iscsi PDU completion failures + * this routine will signal iscsi layer of connection failure. + */ +void +iscsi_iser_recv(struct iscsi_conn *conn, struct iscsi_hdr *hdr, + char *rx_data, int rx_data_len) +{ + int rc = 0; + int datalen; + + /* verify PDU length */ + datalen = ntoh24(hdr->dlength); + if (datalen > rx_data_len || (datalen + 4) < rx_data_len) { + iser_err("wrong datalen %d (hdr), %d (IB)\n", + datalen, rx_data_len); + rc = ISCSI_ERR_DATALEN; + goto error; + } + + if (datalen != rx_data_len) + iser_dbg("aligned datalen (%d) hdr, %d (IB)\n", + datalen, rx_data_len); + + rc = iscsi_complete_pdu(conn, hdr, rx_data, rx_data_len); + if (rc && rc != ISCSI_ERR_NO_SCSI_CMD) + goto error; + + return; +error: + iscsi_conn_failure(conn, rc); +} + +/** + * iscsi_iser_pdu_alloc() - allocate an iscsi-iser PDU + * @task: iscsi task + * @opcode: iscsi command opcode + * + * Netes: This routine can't fail, just assign iscsi task + * hdr and max hdr size. 
+ */ +static int +iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + task->hdr = (struct iscsi_hdr *)&iser_task->desc.iscsi_header; + task->hdr_max = sizeof(iser_task->desc.iscsi_header); + + return 0; +} + +/** + * iser_initialize_task_headers() - Initialize task headers + * @task: iscsi task + * @tx_desc: iser tx descriptor + * + * Notes: + * This routine may race with iser teardown flow for scsi + * error handling TMFs. So for TMF we should acquire the + * state mutex to avoid dereferencing the IB device which + * may have already been terminated. + */ +int +iser_initialize_task_headers(struct iscsi_task *task, + struct iser_tx_desc *tx_desc) +{ + struct iser_conn *iser_conn = task->conn->dd_data; + struct iser_device *device = iser_conn->ib_conn.device; + struct iscsi_iser_task *iser_task = task->dd_data; + u64 dma_addr; + const bool mgmt_task = !task->sc && !in_interrupt(); + int ret = 0; + + if (unlikely(mgmt_task)) + mutex_lock(&iser_conn->state_mutex); + + if (unlikely(iser_conn->state != ISER_CONN_UP)) { + ret = -ENODEV; + goto out; + } + + dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (ib_dma_mapping_error(device->ib_device, dma_addr)) { + ret = -ENOMEM; + goto out; + } + + tx_desc->wr_idx = 0; + tx_desc->mapped = true; + tx_desc->dma_addr = dma_addr; + tx_desc->tx_sg[0].addr = tx_desc->dma_addr; + tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; + tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey; + + iser_task->iser_conn = iser_conn; +out: + if (unlikely(mgmt_task)) + mutex_unlock(&iser_conn->state_mutex); + + return ret; +} + +/** + * iscsi_iser_task_init() - Initialize iscsi-iser task + * @task: iscsi task + * + * Initialize the task for the scsi command or mgmt command. + * + * Return: Returns zero on success or -ENOMEM when failing + * to init task headers (dma mapping error). + */ +static int +iscsi_iser_task_init(struct iscsi_task *task) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + int ret; + + ret = iser_initialize_task_headers(task, &iser_task->desc); + if (ret) { + iser_err("Failed to init task %p, err = %d\n", + iser_task, ret); + return ret; + } + + /* mgmt task */ + if (!task->sc) + return 0; + + iser_task->command_sent = 0; + iser_task_rdma_init(iser_task); + iser_task->sc = task->sc; + + return 0; +} + +/** + * iscsi_iser_mtask_xmit() - xmit management (immediate) task + * @conn: iscsi connection + * @task: task management task + * + * Notes: + * The function can return -EAGAIN in which case caller must + * call it again later, or recover. '0' return code means successful + * xmit. + * + **/ +static int +iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task) +{ + int error = 0; + + iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt); + + error = iser_send_control(conn, task); + + /* since iser xmits control with zero copy, tasks can not be recycled + * right after sending them. 
+ * The recycling scheme is based on whether a response is expected + * - if yes, the task is recycled at iscsi_complete_pdu + * - if no, the task is recycled at iser_snd_completion + */ + return error; +} + +static int +iscsi_iser_task_xmit_unsol_data(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + struct iscsi_r2t_info *r2t = &task->unsol_r2t; + struct iscsi_data hdr; + int error = 0; + + /* Send data-out PDUs while there's still unsolicited data to send */ + while (iscsi_task_has_unsol_data(task)) { + iscsi_prep_data_out_pdu(task, r2t, &hdr); + iser_dbg("Sending data-out: itt 0x%x, data count %d\n", + hdr.itt, r2t->data_count); + + /* the buffer description has been passed with the command */ + /* Send the command */ + error = iser_send_data_out(conn, task, &hdr); + if (error) { + r2t->datasn--; + goto iscsi_iser_task_xmit_unsol_data_exit; + } + r2t->sent += r2t->data_count; + iser_dbg("Need to send %d more as data-out PDUs\n", + r2t->data_length - r2t->sent); + } + +iscsi_iser_task_xmit_unsol_data_exit: + return error; +} + +/** + * iscsi_iser_task_xmit() - xmit iscsi-iser task + * @task: iscsi task + * + * Return: zero on success or escalates $error on failure. + */ +static int +iscsi_iser_task_xmit(struct iscsi_task *task) +{ + struct iscsi_conn *conn = task->conn; + struct iscsi_iser_task *iser_task = task->dd_data; + int error = 0; + + if (!task->sc) + return iscsi_iser_mtask_xmit(conn, task); + + if (task->sc->sc_data_direction == DMA_TO_DEVICE) { + BUG_ON(scsi_bufflen(task->sc) == 0); + + iser_dbg("cmd [itt %x total %d imm %d unsol_data %d\n", + task->itt, scsi_bufflen(task->sc), + task->imm_count, task->unsol_r2t.data_length); + } + + iser_dbg("ctask xmit [cid %d itt 0x%x]\n", + conn->id, task->itt); + + /* Send the cmd PDU */ + if (!iser_task->command_sent) { + error = iser_send_command(conn, task); + if (error) + goto iscsi_iser_task_xmit_exit; + iser_task->command_sent = 1; + } + + /* Send unsolicited data-out PDU(s) if necessary */ + if (iscsi_task_has_unsol_data(task)) + error = iscsi_iser_task_xmit_unsol_data(conn, task); + + iscsi_iser_task_xmit_exit: + return error; +} + +/** + * iscsi_iser_cleanup_task() - cleanup an iscsi-iser task + * @task: iscsi task + * + * Notes: In case the RDMA device is already NULL (might have + * been removed in DEVICE_REMOVAL CM event it will bail-out + * without doing dma unmapping. + */ +static void iscsi_iser_cleanup_task(struct iscsi_task *task) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct iser_conn *iser_conn = task->conn->dd_data; + struct iser_device *device = iser_conn->ib_conn.device; + + /* DEVICE_REMOVAL event might have already released the device */ + if (!device) + return; + + if (likely(tx_desc->mapped)) { + ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + tx_desc->mapped = false; + } + + /* mgmt tasks do not need special cleanup */ + if (!task->sc) + return; + + if (iser_task->status == ISER_TASK_STATUS_STARTED) { + iser_task->status = ISER_TASK_STATUS_COMPLETED; + iser_task_rdma_finalize(iser_task); + } +} + +/** + * iscsi_iser_check_protection() - check protection information status of task. 
+ * @task: iscsi task + * @sector: error sector if exsists (output) + * + * Return: zero if no data-integrity errors have occured + * 0x1: data-integrity error occured in the guard-block + * 0x2: data-integrity error occured in the reference tag + * 0x3: data-integrity error occured in the application tag + * + * In addition the error sector is marked. + */ +static u8 +iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + if (iser_task->dir[ISER_DIR_IN]) + return iser_check_task_pi_status(iser_task, ISER_DIR_IN, + sector); + else + return iser_check_task_pi_status(iser_task, ISER_DIR_OUT, + sector); +} + +/** + * iscsi_iser_conn_create() - create a new iscsi-iser connection + * @cls_session: iscsi class connection + * @conn_idx: connection index within the session (for MCS) + * + * Return: iscsi_cls_conn when iscsi_conn_setup succeeds or NULL + * otherwise. + */ +static struct iscsi_cls_conn * +iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, + uint32_t conn_idx) +{ + struct iscsi_conn *conn; + struct iscsi_cls_conn *cls_conn; + + cls_conn = iscsi_conn_setup(cls_session, 0, conn_idx); + if (!cls_conn) + return NULL; + conn = cls_conn->dd_data; + + /* + * due to issues with the login code re iser sematics + * this not set in iscsi_conn_setup - FIXME + */ + conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN; + + return cls_conn; +} + +/** + * iscsi_iser_conn_bind() - bind iscsi and iser connection structures + * @cls_session: iscsi class session + * @cls_conn: iscsi class connection + * @transport_eph: transport end-point handle + * @is_leading: indicate if this is the session leading connection (MCS) + * + * Return: zero on success, $error if iscsi_conn_bind fails and + * -EINVAL in case end-point doesn't exsits anymore or iser connection + * state is not UP (teardown already started). + */ +static int +iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, + struct iscsi_cls_conn *cls_conn, + uint64_t transport_eph, + int is_leading) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iser_conn *iser_conn; + struct iscsi_endpoint *ep; + int error; + + error = iscsi_conn_bind(cls_session, cls_conn, is_leading); + if (error) + return error; + + /* the transport ep handle comes from user space so it must be + * verified against the global ib connections list */ + ep = iscsi_lookup_endpoint(transport_eph); + if (!ep) { + iser_err("can't bind eph %llx\n", + (unsigned long long)transport_eph); + return -EINVAL; + } + iser_conn = ep->dd_data; + + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_UP) { + error = -EINVAL; + iser_err("iser_conn %p state is %d, teardown started\n", + iser_conn, iser_conn->state); + goto out; + } + + error = iser_alloc_rx_descriptors(iser_conn, conn->session); + if (error) + goto out; + + /* binds the iSER connection retrieved from the previously + * connected ep_handle to the iSCSI layer connection. exchanges + * connection pointers */ + iser_info("binding iscsi conn %p to iser_conn %p\n", conn, iser_conn); + + conn->dd_data = iser_conn; + iser_conn->iscsi_conn = conn; + +out: + mutex_unlock(&iser_conn->state_mutex); + return error; +} + +/** + * iscsi_iser_conn_start() - start iscsi-iser connection + * @cls_conn: iscsi class connection + * + * Notes: Here iser intialize (or re-initialize) stop_completion as + * from this point iscsi must call conn_stop in session/connection + * teardown so iser transport must wait for it. 
+ */ +static int +iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *iscsi_conn; + struct iser_conn *iser_conn; + + iscsi_conn = cls_conn->dd_data; + iser_conn = iscsi_conn->dd_data; + reinit_completion(&iser_conn->stop_completion); + + return iscsi_conn_start(cls_conn); +} + +/** + * iscsi_iser_conn_stop() - stop iscsi-iser connection + * @cls_conn: iscsi class connection + * @flag: indicate if recover or terminate (passed as is) + * + * Notes: Calling iscsi_conn_stop might theoretically race with + * DEVICE_REMOVAL event and dereference a previously freed RDMA device + * handle, so we call it under iser the state lock to protect against + * this kind of race. + */ +static void +iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iser_conn *iser_conn = conn->dd_data; + + iser_info("stopping iscsi_conn: %p, iser_conn: %p\n", conn, iser_conn); + + /* + * Userspace may have goofed up and not bound the connection or + * might have only partially setup the connection. + */ + if (iser_conn) { + mutex_lock(&iser_conn->state_mutex); + mutex_lock(&unbind_iser_conn_mutex); + iser_conn_terminate(iser_conn); + iscsi_conn_stop(cls_conn, flag); + + /* unbind */ + iser_conn->iscsi_conn = NULL; + conn->dd_data = NULL; + mutex_unlock(&unbind_iser_conn_mutex); + + complete(&iser_conn->stop_completion); + mutex_unlock(&iser_conn->state_mutex); + } else { + iscsi_conn_stop(cls_conn, flag); + } +} + +/** + * iscsi_iser_session_destroy() - destroy iscsi-iser session + * @cls_session: iscsi class session + * + * Removes and free iscsi host. + */ +static void +iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) +{ + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); + + iscsi_session_teardown(cls_session); + iscsi_host_remove(shost); + iscsi_host_free(shost); +} + +static inline unsigned int +iser_dif_prot_caps(int prot_caps) +{ + return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? + SHOST_DIF_TYPE1_PROTECTION | SHOST_DIX_TYPE0_PROTECTION | + SHOST_DIX_TYPE1_PROTECTION : 0) | + ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? + SHOST_DIF_TYPE2_PROTECTION | SHOST_DIX_TYPE2_PROTECTION : 0) | + ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? + SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE3_PROTECTION : 0); +} + +/** + * iscsi_iser_session_create() - create an iscsi-iser session + * @ep: iscsi end-point handle + * @cmds_max: maximum commands in this session + * @qdepth: session command queue depth + * @initial_cmdsn: initiator command sequnce number + * + * Allocates and adds a scsi host, expose DIF supprot if + * exists, and sets up an iscsi session. 
+ */ +static struct iscsi_cls_session * +iscsi_iser_session_create(struct iscsi_endpoint *ep, + uint16_t cmds_max, uint16_t qdepth, + uint32_t initial_cmdsn) +{ + struct iscsi_cls_session *cls_session; + struct iscsi_session *session; + struct Scsi_Host *shost; + struct iser_conn *iser_conn = NULL; + struct ib_conn *ib_conn; + u32 max_fr_sectors; + u16 max_cmds; + + shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); + if (!shost) + return NULL; + shost->transportt = iscsi_iser_scsi_transport; + shost->cmd_per_lun = qdepth; + shost->max_lun = iscsi_max_lun; + shost->max_id = 0; + shost->max_channel = 0; + shost->max_cmd_len = 16; + + /* + * older userspace tools (before 2.0-870) did not pass us + * the leading conn's ep so this will be NULL; + */ + if (ep) { + iser_conn = ep->dd_data; + max_cmds = iser_conn->max_cmds; + shost->sg_tablesize = iser_conn->scsi_sg_tablesize; + + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_UP) { + iser_err("iser conn %p already started teardown\n", + iser_conn); + mutex_unlock(&iser_conn->state_mutex); + goto free_host; + } + + ib_conn = &iser_conn->ib_conn; + if (ib_conn->pi_support) { + u32 sig_caps = ib_conn->device->ib_device->attrs.sig_prot_cap; + + scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); + scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP | + SHOST_DIX_GUARD_CRC); + } + + if (iscsi_host_add(shost, + ib_conn->device->ib_device->dev.parent)) { + mutex_unlock(&iser_conn->state_mutex); + goto free_host; + } + mutex_unlock(&iser_conn->state_mutex); + } else { + max_cmds = ISER_DEF_XMIT_CMDS_MAX; + if (iscsi_host_add(shost, NULL)) + goto free_host; + } + + /* + * FRs or FMRs can only map up to a (device) page per entry, but if the + * first entry is misaligned we'll end up using using two entries + * (head and tail) for a single page worth data, so we have to drop + * one segment from the calculation. 
+ */ + max_fr_sectors = ((shost->sg_tablesize - 1) * PAGE_SIZE) >> 9; + shost->max_sectors = min(iser_max_sectors, max_fr_sectors); + + iser_dbg("iser_conn %p, sg_tablesize %u, max_sectors %u\n", + iser_conn, shost->sg_tablesize, + shost->max_sectors); + + if (cmds_max > max_cmds) { + iser_info("cmds_max changed from %u to %u\n", + cmds_max, max_cmds); + cmds_max = max_cmds; + } + + cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, + cmds_max, 0, + sizeof(struct iscsi_iser_task), + initial_cmdsn, 0); + if (!cls_session) + goto remove_host; + session = cls_session->dd_data; + + shost->can_queue = session->scsi_cmds_max; + return cls_session; + +remove_host: + iscsi_host_remove(shost); +free_host: + iscsi_host_free(shost); + return NULL; +} + +static int +iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, char *buf, int buflen) +{ + int value; + + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + /* TBD */ + break; + case ISCSI_PARAM_HDRDGST_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("DataDigest wasn't negotiated to None\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_DATADGST_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("DataDigest wasn't negotiated to None\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_IFMARKER_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("IFMarker wasn't negotiated to No\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_OFMARKER_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("OFMarker wasn't negotiated to No\n"); + return -EPROTO; + } + break; + default: + return iscsi_set_param(cls_conn, param, buf, buflen); + } + + return 0; +} + +/** + * iscsi_iser_set_param() - set class connection parameter + * @cls_conn: iscsi class connection + * @stats: iscsi stats to output + * + * Output connection statistics. + */ +static void +iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *stats) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + + stats->txdata_octets = conn->txdata_octets; + stats->rxdata_octets = conn->rxdata_octets; + stats->scsicmd_pdus = conn->scsicmd_pdus_cnt; + stats->dataout_pdus = conn->dataout_pdus_cnt; + stats->scsirsp_pdus = conn->scsirsp_pdus_cnt; + stats->datain_pdus = conn->datain_pdus_cnt; /* always 0 */ + stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */ + stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; + stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; + stats->custom_length = 0; +} + +static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, + enum iscsi_param param, char *buf) +{ + struct iser_conn *iser_conn = ep->dd_data; + int len; + + switch (param) { + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_CONN_ADDRESS: + if (!iser_conn || !iser_conn->ib_conn.cma_id) + return -ENOTCONN; + + return iscsi_conn_get_addr_param((struct sockaddr_storage *) + &iser_conn->ib_conn.cma_id->route.addr.dst_addr, + param, buf); + break; + default: + return -ENOSYS; + } + + return len; +} + +/** + * iscsi_iser_ep_connect() - Initiate iSER connection establishment + * @shost: scsi_host + * @dst_addr: destination address + * @non-blocking: indicate if routine can block + * + * Allocate an iscsi endpoint, an iser_conn structure and bind them. + * After that start RDMA connection establishment via rdma_cm. We + * don't allocate iser_conn embedded in iscsi_endpoint since in teardown + * the endpoint will be destroyed at ep_disconnect while iser_conn will + * cleanup its resources asynchronuously. 
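+ * See iscsi_iser_ep_disconnect(): it either queues iser_release_work on
+ * release_wq or, when the connection was never bound, releases it directly
+ * before destroying the endpoint.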
+ * + * Return: iscsi_endpoint created by iscsi layer or ERR_PTR(error) + * if fails. + */ +static struct iscsi_endpoint * +iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, + int non_blocking) +{ + int err; + struct iser_conn *iser_conn; + struct iscsi_endpoint *ep; + + ep = iscsi_create_endpoint(0); + if (!ep) + return ERR_PTR(-ENOMEM); + + iser_conn = kzalloc(sizeof(*iser_conn), GFP_KERNEL); + if (!iser_conn) { + err = -ENOMEM; + goto failure; + } + + ep->dd_data = iser_conn; + iser_conn->ep = ep; + iser_conn_init(iser_conn); + + err = iser_connect(iser_conn, NULL, dst_addr, non_blocking); + if (err) + goto failure; + + return ep; +failure: + iscsi_destroy_endpoint(ep); + return ERR_PTR(err); +} + +/** + * iscsi_iser_ep_poll() - poll for iser connection establishment to complete + * @ep: iscsi endpoint (created at ep_connect) + * @timeout_ms: polling timeout allowed in ms. + * + * This routine boils down to waiting for up_completion signaling + * that cma_id got CONNECTED event. + * + * Return: 1 if succeeded in connection establishment, 0 if timeout expired + * (libiscsi will retry will kick in) or -1 if interrupted by signal + * or more likely iser connection state transitioned to TEMINATING or + * DOWN during the wait period. + */ +static int +iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) +{ + struct iser_conn *iser_conn = ep->dd_data; + int rc; + + rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion, + msecs_to_jiffies(timeout_ms)); + /* if conn establishment failed, return error code to iscsi */ + if (rc == 0) { + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state == ISER_CONN_TERMINATING || + iser_conn->state == ISER_CONN_DOWN) + rc = -1; + mutex_unlock(&iser_conn->state_mutex); + } + + iser_info("iser conn %p rc = %d\n", iser_conn, rc); + + if (rc > 0) + return 1; /* success, this is the equivalent of EPOLLOUT */ + else if (!rc) + return 0; /* timeout */ + else + return rc; /* signal */ +} + +/** + * iscsi_iser_ep_disconnect() - Initiate connection teardown process + * @ep: iscsi endpoint handle + * + * This routine is not blocked by iser and RDMA termination process + * completion as we queue a deffered work for iser/RDMA destruction + * and cleanup or actually call it immediately in case we didn't pass + * iscsi conn bind/start stage, thus it is safe. + */ +static void +iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) +{ + struct iser_conn *iser_conn = ep->dd_data; + + iser_info("ep %p iser conn %p\n", ep, iser_conn); + + mutex_lock(&iser_conn->state_mutex); + iser_conn_terminate(iser_conn); + + /* + * if iser_conn and iscsi_conn are bound, we must wait for + * iscsi_conn_stop and flush errors completion before freeing + * the iser resources. Otherwise we are safe to free resources + * immediately. 
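+ * (iscsi_iser_conn_stop() completes stop_completion once the connection
+ * has been stopped and unbound, so the deferred release work can
+ * synchronize with conn_stop before freeing the iser resources.)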
+ */ + if (iser_conn->iscsi_conn) { + INIT_WORK(&iser_conn->release_work, iser_release_work); + queue_work(release_wq, &iser_conn->release_work); + mutex_unlock(&iser_conn->state_mutex); + } else { + iser_conn->state = ISER_CONN_DOWN; + mutex_unlock(&iser_conn->state_mutex); + iser_conn_release(iser_conn); + } + + iscsi_destroy_endpoint(ep); +} + +static umode_t iser_attr_is_visible(int param_type, int param) +{ + switch (param_type) { + case ISCSI_HOST_PARAM: + switch (param) { + case ISCSI_HOST_PARAM_NETDEV_NAME: + case ISCSI_HOST_PARAM_HWADDRESS: + case ISCSI_HOST_PARAM_INITIATOR_NAME: + return S_IRUGO; + default: + return 0; + } + case ISCSI_PARAM: + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + case ISCSI_PARAM_HDRDGST_EN: + case ISCSI_PARAM_DATADGST_EN: + case ISCSI_PARAM_CONN_ADDRESS: + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_EXP_STATSN: + case ISCSI_PARAM_PERSISTENT_ADDRESS: + case ISCSI_PARAM_PERSISTENT_PORT: + case ISCSI_PARAM_PING_TMO: + case ISCSI_PARAM_RECV_TMO: + case ISCSI_PARAM_INITIAL_R2T_EN: + case ISCSI_PARAM_MAX_R2T: + case ISCSI_PARAM_IMM_DATA_EN: + case ISCSI_PARAM_FIRST_BURST: + case ISCSI_PARAM_MAX_BURST: + case ISCSI_PARAM_PDU_INORDER_EN: + case ISCSI_PARAM_DATASEQ_INORDER_EN: + case ISCSI_PARAM_TARGET_NAME: + case ISCSI_PARAM_TPGT: + case ISCSI_PARAM_USERNAME: + case ISCSI_PARAM_PASSWORD: + case ISCSI_PARAM_USERNAME_IN: + case ISCSI_PARAM_PASSWORD_IN: + case ISCSI_PARAM_FAST_ABORT: + case ISCSI_PARAM_ABORT_TMO: + case ISCSI_PARAM_LU_RESET_TMO: + case ISCSI_PARAM_TGT_RESET_TMO: + case ISCSI_PARAM_IFACE_NAME: + case ISCSI_PARAM_INITIATOR_NAME: + case ISCSI_PARAM_DISCOVERY_SESS: + return S_IRUGO; + default: + return 0; + } + } + + return 0; +} + +static int iscsi_iser_slave_alloc(struct scsi_device *sdev) +{ + struct iscsi_session *session; + struct iser_conn *iser_conn; + struct ib_device *ib_dev; + + mutex_lock(&unbind_iser_conn_mutex); + + session = starget_to_session(scsi_target(sdev))->dd_data; + iser_conn = session->leadconn->dd_data; + if (!iser_conn) { + mutex_unlock(&unbind_iser_conn_mutex); + return -ENOTCONN; + } + ib_dev = iser_conn->ib_conn.device->ib_device; + + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + blk_queue_virt_boundary(sdev->request_queue, ~MASK_4K); + + mutex_unlock(&unbind_iser_conn_mutex); + + return 0; +} + +static struct scsi_host_template iscsi_iser_sht = { + .module = THIS_MODULE, + .name = "iSCSI Initiator over iSER", + .queuecommand = iscsi_queuecommand, + .change_queue_depth = scsi_change_queue_depth, + .sg_tablesize = ISCSI_ISER_DEF_SG_TABLESIZE, + .cmd_per_lun = ISER_DEF_CMD_PER_LUN, + .eh_timed_out = iscsi_eh_cmd_timed_out, + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler= iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_recover_target, + .target_alloc = iscsi_target_alloc, + .use_clustering = ENABLE_CLUSTERING, + .slave_alloc = iscsi_iser_slave_alloc, + .proc_name = "iscsi_iser", + .this_id = -1, + .track_queue_depth = 1, +}; + +static struct iscsi_transport iscsi_iser_transport = { + .owner = THIS_MODULE, + .name = "iser", + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO, + /* session management */ + .create_session = iscsi_iser_session_create, + .destroy_session = iscsi_iser_session_destroy, + /* connection management */ + .create_conn = iscsi_iser_conn_create, + .bind_conn = iscsi_iser_conn_bind, + .destroy_conn = iscsi_conn_teardown, + .attr_is_visible = iser_attr_is_visible, + .set_param = iscsi_iser_set_param, + 
.get_conn_param = iscsi_conn_get_param, + .get_ep_param = iscsi_iser_get_ep_param, + .get_session_param = iscsi_session_get_param, + .start_conn = iscsi_iser_conn_start, + .stop_conn = iscsi_iser_conn_stop, + /* iscsi host params */ + .get_host_param = iscsi_host_get_param, + .set_host_param = iscsi_host_set_param, + /* IO */ + .send_pdu = iscsi_conn_send_pdu, + .get_stats = iscsi_iser_conn_get_stats, + .init_task = iscsi_iser_task_init, + .xmit_task = iscsi_iser_task_xmit, + .cleanup_task = iscsi_iser_cleanup_task, + .alloc_pdu = iscsi_iser_pdu_alloc, + .check_protection = iscsi_iser_check_protection, + /* recovery */ + .session_recovery_timedout = iscsi_session_recovery_timedout, + + .ep_connect = iscsi_iser_ep_connect, + .ep_poll = iscsi_iser_ep_poll, + .ep_disconnect = iscsi_iser_ep_disconnect +}; + +static int __init iser_init(void) +{ + int err; + + iser_dbg("Starting iSER datamover...\n"); + + if (iscsi_max_lun < 1) { + iser_err("Invalid max_lun value of %u\n", iscsi_max_lun); + return -EINVAL; + } + + memset(&ig, 0, sizeof(struct iser_global)); + + ig.desc_cache = kmem_cache_create("iser_descriptors", + sizeof(struct iser_tx_desc), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (ig.desc_cache == NULL) + return -ENOMEM; + + /* device init is called only after the first addr resolution */ + mutex_init(&ig.device_list_mutex); + INIT_LIST_HEAD(&ig.device_list); + mutex_init(&ig.connlist_mutex); + INIT_LIST_HEAD(&ig.connlist); + + release_wq = alloc_workqueue("release workqueue", 0, 0); + if (!release_wq) { + iser_err("failed to allocate release workqueue\n"); + err = -ENOMEM; + goto err_alloc_wq; + } + + iscsi_iser_scsi_transport = iscsi_register_transport( + &iscsi_iser_transport); + if (!iscsi_iser_scsi_transport) { + iser_err("iscsi_register_transport failed\n"); + err = -EINVAL; + goto err_reg; + } + + return 0; + +err_reg: + destroy_workqueue(release_wq); +err_alloc_wq: + kmem_cache_destroy(ig.desc_cache); + + return err; +} + +static void __exit iser_exit(void) +{ + struct iser_conn *iser_conn, *n; + int connlist_empty; + + iser_dbg("Removing iSER datamover...\n"); + destroy_workqueue(release_wq); + + mutex_lock(&ig.connlist_mutex); + connlist_empty = list_empty(&ig.connlist); + mutex_unlock(&ig.connlist_mutex); + + if (!connlist_empty) { + iser_err("Error cleanup stage completed but we still have iser " + "connections, destroying them anyway\n"); + list_for_each_entry_safe(iser_conn, n, &ig.connlist, + conn_list) { + iser_conn_release(iser_conn); + } + } + + iscsi_unregister_transport(&iscsi_iser_transport); + kmem_cache_destroy(ig.desc_cache); +} + +module_init(iser_init); +module_exit(iser_exit); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h new file mode 100644 index 0000000..c1ae4ae --- /dev/null +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -0,0 +1,718 @@ +/* + * iSER transport for the Open iSCSI Initiator & iSER transport internals + * + * Copyright (C) 2004 Dmitry Yusupov + * Copyright (C) 2004 Alex Aizman + * Copyright (C) 2005 Mike Christie + * based on code maintained by open-iscsi@googlegroups.com + * + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ISCSI_ISER_H__ +#define __ISCSI_ISER_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define DRV_NAME "iser" +#define PFX DRV_NAME ": " +#define DRV_VER "1.6" + +#define iser_dbg(fmt, arg...) \ + do { \ + if (unlikely(iser_debug_level > 2)) \ + printk(KERN_DEBUG PFX "%s: " fmt,\ + __func__ , ## arg); \ + } while (0) + +#define iser_warn(fmt, arg...) \ + do { \ + if (unlikely(iser_debug_level > 0)) \ + pr_warn(PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define iser_info(fmt, arg...) \ + do { \ + if (unlikely(iser_debug_level > 1)) \ + pr_info(PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define iser_err(fmt, arg...) \ + pr_err(PFX "%s: " fmt, __func__ , ## arg) + +#define SHIFT_4K 12 +#define SIZE_4K (1ULL << SHIFT_4K) +#define MASK_4K (~(SIZE_4K-1)) + +/* Default support is 512KB I/O size */ +#define ISER_DEF_MAX_SECTORS 1024 +#define ISCSI_ISER_DEF_SG_TABLESIZE ((ISER_DEF_MAX_SECTORS * 512) >> SHIFT_4K) +/* Maximum support is 8MB I/O size */ +#define ISCSI_ISER_MAX_SG_TABLESIZE ((16384 * 512) >> SHIFT_4K) + +#define ISER_DEF_XMIT_CMDS_DEFAULT 512 +#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT + #define ISER_DEF_XMIT_CMDS_MAX ISCSI_DEF_XMIT_CMDS_MAX +#else + #define ISER_DEF_XMIT_CMDS_MAX ISER_DEF_XMIT_CMDS_DEFAULT +#endif +#define ISER_DEF_CMD_PER_LUN ISER_DEF_XMIT_CMDS_MAX + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * + * SCSI_TMFUNC(2), LOGOUT(1) */ + +#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX) + +#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) + +/* the max TX (send) WR supported by the iSER QP is defined by * + * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * + * to have at max for SCSI command. The tx posting & completion handling code * + * supports -EAGAIN scheme where tx is suspended till the QP has room for more * + * send WR. 
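+ * E.g. with T = ISER_DEF_XMIT_CMDS_MAX = 512, D = 8 and C = *
+ * ISER_MAX_TX_MISC_PDUS + ISER_MAX_RX_MISC_PDUS = 10, ISER_QP_MAX_REQ_DTOS *
+ * below works out to 512 * 9 + 10 = 4618 send WRs per QP. *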
D=8 comes from 64K/8K */ + +#define ISER_INFLIGHT_DATAOUTS 8 + +#define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_INFLIGHT_DATAOUTS) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + +/* Max registration work requests per command */ +#define ISER_MAX_REG_WR_PER_CMD 5 + +/* For Signature we don't support DATAOUTs so no need to make room for them */ +#define ISER_QP_SIG_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_MAX_REG_WR_PER_CMD) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + +#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \ + - ISER_MAX_TX_MISC_PDUS \ + - ISER_MAX_RX_MISC_PDUS) / \ + (1 + ISER_INFLIGHT_DATAOUTS)) + +#define ISER_SIGNAL_CMD_COUNT 32 + +/* Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + sizeof(struct iscsi_hdr)) + +#define ISER_RECV_DATA_SEG_LEN 128 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* Length of an object name string */ +#define ISER_OBJECT_NAME_SIZE 64 + +enum iser_conn_state { + ISER_CONN_INIT, /* descriptor allocd, no conn */ + ISER_CONN_PENDING, /* in the process of being established */ + ISER_CONN_UP, /* up and running */ + ISER_CONN_TERMINATING, /* in the process of being terminated */ + ISER_CONN_DOWN, /* shut down */ + ISER_CONN_STATES_NUM +}; + +enum iser_task_status { + ISER_TASK_STATUS_INIT = 0, + ISER_TASK_STATUS_STARTED, + ISER_TASK_STATUS_COMPLETED +}; + +enum iser_data_dir { + ISER_DIR_IN = 0, /* to initiator */ + ISER_DIR_OUT, /* from initiator */ + ISER_DIRS_NUM +}; + +/** + * struct iser_data_buf - iSER data buffer + * + * @sg: pointer to the sg list + * @size: num entries of this sg + * @data_len: total beffer byte len + * @dma_nents: returned by dma_map_sg + */ +struct iser_data_buf { + struct scatterlist *sg; + int size; + unsigned long data_len; + unsigned int dma_nents; +}; + +/* fwd declarations */ +struct iser_device; +struct iscsi_iser_task; +struct iscsi_endpoint; +struct iser_reg_resources; + +/** + * struct iser_mem_reg - iSER memory registration info + * + * @sge: memory region sg element + * @rkey: memory region remote key + * @mem_h: pointer to registration context (FMR/Fastreg) + */ +struct iser_mem_reg { + struct ib_sge sge; + u32 rkey; + void *mem_h; +}; + +enum iser_desc_type { + ISCSI_TX_CONTROL , + ISCSI_TX_SCSI_COMMAND, + ISCSI_TX_DATAOUT +}; + +/* Maximum number of work requests per task: + * Data memory region local invalidate + fast registration + * Protection memory region local invalidate + fast registration + * Signature memory region local invalidate + fast registration + * PDU send + */ +#define ISER_MAX_WRS 7 + +/** + * struct iser_tx_desc - iSER TX descriptor + * + * @iser_header: iser header + * @iscsi_header: iscsi header + * @type: command/control/dataout + * @dam_addr: header buffer dma_address + * @tx_sg: sg[0] points to iser/iscsi headers + * sg[1] optionally points to either of immediate data + * unsolicited data-out or control + * @num_sge: number sges used on this TX task + * @mapped: Is the task header mapped + * @wr_idx: Current WR index + * @wrs: Array of WRs per task + * @data_reg: Data buffer registration details + * @prot_reg: Protection buffer registration details + * @sig_attrs: Signature attributes + */ +struct iser_tx_desc { + struct iser_ctrl iser_header; + struct iscsi_hdr iscsi_header; + enum iser_desc_type type; + u64 dma_addr; + struct ib_sge tx_sg[2]; + int num_sge; + struct ib_cqe cqe; + 
bool mapped; + u8 wr_idx; + union iser_wr { + struct ib_send_wr send; + struct ib_reg_wr fast_reg; + struct ib_sig_handover_wr sig; + } wrs[ISER_MAX_WRS]; + struct iser_mem_reg data_reg; + struct iser_mem_reg prot_reg; + struct ib_sig_attrs sig_attrs; +}; + +#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ + sizeof(u64) + sizeof(struct ib_sge) + \ + sizeof(struct ib_cqe))) +/** + * struct iser_rx_desc - iSER RX descriptor + * + * @iser_header: iser header + * @iscsi_header: iscsi header + * @data: received data segment + * @dma_addr: receive buffer dma address + * @rx_sg: ib_sge of receive buffer + * @pad: for sense data TODO: Modify to maximum sense length supported + */ +struct iser_rx_desc { + struct iser_ctrl iser_header; + struct iscsi_hdr iscsi_header; + char data[ISER_RECV_DATA_SEG_LEN]; + u64 dma_addr; + struct ib_sge rx_sg; + struct ib_cqe cqe; + char pad[ISER_RX_PAD_SIZE]; +} __packed; + +/** + * struct iser_login_desc - iSER login descriptor + * + * @req: pointer to login request buffer + * @resp: pointer to login response buffer + * @req_dma: DMA address of login request buffer + * @rsp_dma: DMA address of login response buffer + * @sge: IB sge for login post recv + * @cqe: completion handler + */ +struct iser_login_desc { + void *req; + void *rsp; + u64 req_dma; + u64 rsp_dma; + struct ib_sge sge; + struct ib_cqe cqe; +} __attribute__((packed)); + +struct iser_conn; +struct ib_conn; +struct iscsi_iser_task; + +/** + * struct iser_comp - iSER completion context + * + * @cq: completion queue + * @active_qps: Number of active QPs attached + * to completion context + */ +struct iser_comp { + struct ib_cq *cq; + int active_qps; +}; + +/** + * struct iser_device - Memory registration operations + * per-device registration schemes + * + * @alloc_reg_res: Allocate registration resources + * @free_reg_res: Free registration resources + * @fast_reg_mem: Register memory buffers + * @unreg_mem: Un-register memory buffers + * @reg_desc_get: Get a registration descriptor for pool + * @reg_desc_put: Get a registration descriptor to pool + */ +struct iser_reg_ops { + int (*alloc_reg_res)(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size); + void (*free_reg_res)(struct ib_conn *ib_conn); + int (*reg_mem)(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *reg); + void (*unreg_mem)(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); + struct iser_fr_desc * (*reg_desc_get)(struct ib_conn *ib_conn); + void (*reg_desc_put)(struct ib_conn *ib_conn, + struct iser_fr_desc *desc); +}; + +/** + * struct iser_device - iSER device handle + * + * @ib_device: RDMA device + * @pd: Protection Domain for this device + * @mr: Global DMA memory region + * @event_handler: IB events handle routine + * @ig_list: entry in devices list + * @refcount: Reference counter, dominated by open iser connections + * @comps_used: Number of completion contexts used, Min between online + * cpus and device max completion vectors + * @comps: Dinamically allocated array of completion handlers + * @reg_ops: Registration ops + * @remote_inv_sup: Remote invalidate is supported on this device + */ +struct iser_device { + struct ib_device *ib_device; + struct ib_pd *pd; + struct ib_event_handler event_handler; + struct list_head ig_list; + int refcount; + int comps_used; + struct iser_comp *comps; + const struct iser_reg_ops *reg_ops; + bool remote_inv_sup; +}; + +#define ISER_CHECK_GUARD 0xc0 +#define ISER_CHECK_REFTAG 0x0f 
+#define ISER_CHECK_APPTAG 0x30 + +/** + * struct iser_reg_resources - Fast registration recources + * + * @mr: memory region + * @fmr_pool: pool of fmrs + * @page_vec: fast reg page list used by fmr pool + * @mr_valid: is mr valid indicator + */ +struct iser_reg_resources { + union { + struct ib_mr *mr; + struct ib_fmr_pool *fmr_pool; + }; + struct iser_page_vec *page_vec; + u8 mr_valid:1; +}; + +/** + * struct iser_pi_context - Protection information context + * + * @rsc: protection buffer registration resources + * @sig_mr: signature enable memory region + * @sig_mr_valid: is sig_mr valid indicator + * @sig_protected: is region protected indicator + */ +struct iser_pi_context { + struct iser_reg_resources rsc; + struct ib_mr *sig_mr; + u8 sig_mr_valid:1; + u8 sig_protected:1; +}; + +/** + * struct iser_fr_desc - Fast registration descriptor + * + * @list: entry in connection fastreg pool + * @rsc: data buffer registration resources + * @pi_ctx: protection information context + */ +struct iser_fr_desc { + struct list_head list; + struct iser_reg_resources rsc; + struct iser_pi_context *pi_ctx; + struct list_head all_list; +}; + +/** + * struct iser_fr_pool: connection fast registration pool + * + * @list: list of fastreg descriptors + * @lock: protects fmr/fastreg pool + * @size: size of the pool + */ +struct iser_fr_pool { + struct list_head list; + spinlock_t lock; + int size; + struct list_head all_list; +}; + +/** + * struct ib_conn - Infiniband related objects + * + * @cma_id: rdma_cm connection maneger handle + * @qp: Connection Queue-pair + * @post_recv_buf_count: post receive counter + * @sig_count: send work request signal count + * @rx_wr: receive work request for batch posts + * @device: reference to iser device + * @comp: iser completion context + * @fr_pool: connection fast registration poool + * @pi_support: Indicate device T10-PI support + */ +struct ib_conn { + struct rdma_cm_id *cma_id; + struct ib_qp *qp; + int post_recv_buf_count; + u8 sig_count; + struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; + struct iser_device *device; + struct iser_comp *comp; + struct iser_fr_pool fr_pool; + bool pi_support; + struct ib_cqe reg_cqe; +}; + +/** + * struct iser_conn - iSER connection context + * + * @ib_conn: connection RDMA resources + * @iscsi_conn: link to matching iscsi connection + * @ep: transport handle + * @state: connection logical state + * @qp_max_recv_dtos: maximum number of data outs, corresponds + * to max number of post recvs + * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) + * @min_posted_rx: (qp_max_recv_dtos >> 2) + * @max_cmds: maximum cmds allowed for this connection + * @name: connection peer portal + * @release_work: deffered work for release job + * @state_mutex: protects iser onnection state + * @stop_completion: conn_stop completion + * @ib_completion: RDMA cleanup completion + * @up_completion: connection establishment completed + * (state is ISER_CONN_UP) + * @conn_list: entry in ig conn list + * @login_desc: login descriptor + * @rx_desc_head: head of rx_descs cyclic buffer + * @rx_descs: rx buffers array (cyclic buffer) + * @num_rx_descs: number of rx descriptors + * @scsi_sg_tablesize: scsi host sg_tablesize + */ +struct iser_conn { + struct ib_conn ib_conn; + struct iscsi_conn *iscsi_conn; + struct iscsi_endpoint *ep; + enum iser_conn_state state; + unsigned qp_max_recv_dtos; + unsigned qp_max_recv_dtos_mask; + unsigned min_posted_rx; + u16 max_cmds; + char name[ISER_OBJECT_NAME_SIZE]; + struct work_struct release_work; + struct mutex state_mutex; + 
struct completion stop_completion; + struct completion ib_completion; + struct completion up_completion; + struct list_head conn_list; + struct iser_login_desc login_desc; + unsigned int rx_desc_head; + struct iser_rx_desc *rx_descs; + u32 num_rx_descs; + unsigned short scsi_sg_tablesize; + bool snd_w_inv; +}; + +/** + * struct iscsi_iser_task - iser task context + * + * @desc: TX descriptor + * @iser_conn: link to iser connection + * @status: current task status + * @sc: link to scsi command + * @command_sent: indicate if command was sent + * @dir: iser data direction + * @rdma_reg: task rdma registration desc + * @data: iser data buffer desc + * @prot: iser protection buffer desc + */ +struct iscsi_iser_task { + struct iser_tx_desc desc; + struct iser_conn *iser_conn; + enum iser_task_status status; + struct scsi_cmnd *sc; + int command_sent; + int dir[ISER_DIRS_NUM]; + struct iser_mem_reg rdma_reg[ISER_DIRS_NUM]; + struct iser_data_buf data[ISER_DIRS_NUM]; + struct iser_data_buf prot[ISER_DIRS_NUM]; +}; + +struct iser_page_vec { + u64 *pages; + int npages; + struct ib_mr fake_mr; +}; + +/** + * struct iser_global: iSER global context + * + * @device_list_mutex: protects device_list + * @device_list: iser devices global list + * @connlist_mutex: protects connlist + * @connlist: iser connections global list + * @desc_cache: kmem cache for tx dataout + */ +struct iser_global { + struct mutex device_list_mutex; + struct list_head device_list; + struct mutex connlist_mutex; + struct list_head connlist; + struct kmem_cache *desc_cache; +}; + +extern struct iser_global ig; +extern int iser_debug_level; +extern bool iser_pi_enable; +extern int iser_pi_guard; +extern unsigned int iser_max_sectors; +extern bool iser_always_reg; + +int iser_assign_reg_ops(struct iser_device *device); + +int iser_send_control(struct iscsi_conn *conn, + struct iscsi_task *task); + +int iser_send_command(struct iscsi_conn *conn, + struct iscsi_task *task); + +int iser_send_data_out(struct iscsi_conn *conn, + struct iscsi_task *task, + struct iscsi_data *hdr); + +void iscsi_iser_recv(struct iscsi_conn *conn, + struct iscsi_hdr *hdr, + char *rx_data, + int rx_data_len); + +void iser_conn_init(struct iser_conn *iser_conn); + +void iser_conn_release(struct iser_conn *iser_conn); + +int iser_conn_terminate(struct iser_conn *iser_conn); + +void iser_release_work(struct work_struct *work); + +void iser_err_comp(struct ib_wc *wc, const char *type); +void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc); +void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc); +void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc); + +void iser_task_rdma_init(struct iscsi_iser_task *task); + +void iser_task_rdma_finalize(struct iscsi_iser_task *task); + +void iser_free_rx_descriptors(struct iser_conn *iser_conn); + +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + enum iser_data_dir cmd_dir); + +int iser_reg_rdma_mem(struct iscsi_iser_task *task, + enum iser_data_dir dir, + bool all_imm); +void iser_unreg_rdma_mem(struct iscsi_iser_task *task, + enum iser_data_dir dir); + +int iser_connect(struct iser_conn *iser_conn, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, + int non_blocking); + +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); +void 
iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir); + +int iser_post_recvl(struct iser_conn *iser_conn); +int iser_post_recvm(struct iser_conn *iser_conn, int count); +int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, + bool signal); + +int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir); + +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum dma_data_direction dir); + +int iser_initialize_task_headers(struct iscsi_task *task, + struct iser_tx_desc *tx_desc); +int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, + struct iscsi_session *session); +int iser_alloc_fmr_pool(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size); +void iser_free_fmr_pool(struct ib_conn *ib_conn); +int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size); +void iser_free_fastreg_pool(struct ib_conn *ib_conn); +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector); +struct iser_fr_desc * +iser_reg_desc_get_fr(struct ib_conn *ib_conn); +void +iser_reg_desc_put_fr(struct ib_conn *ib_conn, + struct iser_fr_desc *desc); +struct iser_fr_desc * +iser_reg_desc_get_fmr(struct ib_conn *ib_conn); +void +iser_reg_desc_put_fmr(struct ib_conn *ib_conn, + struct iser_fr_desc *desc); + +static inline struct ib_send_wr * +iser_tx_next_wr(struct iser_tx_desc *tx_desc) +{ + struct ib_send_wr *cur_wr = &tx_desc->wrs[tx_desc->wr_idx].send; + struct ib_send_wr *last_wr; + + if (tx_desc->wr_idx) { + last_wr = &tx_desc->wrs[tx_desc->wr_idx - 1].send; + last_wr->next = cur_wr; + } + tx_desc->wr_idx++; + + return cur_wr; +} + +static inline struct iser_conn * +to_iser_conn(struct ib_conn *ib_conn) +{ + return container_of(ib_conn, struct iser_conn, ib_conn); +} + +static inline struct iser_rx_desc * +iser_rx(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_rx_desc, cqe); +} + +static inline struct iser_tx_desc * +iser_tx(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_tx_desc, cqe); +} + +static inline struct iser_login_desc * +iser_login(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_login_desc, cqe); +} + +#endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c new file mode 100644 index 0000000..df49c4e --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -0,0 +1,775 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "iscsi_iser.h" + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_IN].data_len, Protection size + * os stored in task->prot[ISER_DIR_IN].data_len + */ +static int iser_prepare_read_cmd(struct iscsi_task *task) + +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_mem_reg *mem_reg; + int err; + struct iser_ctrl *hdr = &iser_task->desc.iser_header; + struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_task, + buf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; + + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_task, + pbuf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; + } + + err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN, false); + if (err) { + iser_err("Failed to set up Data-IN RDMA\n"); + return err; + } + mem_reg = &iser_task->rdma_reg[ISER_DIR_IN]; + + hdr->flags |= ISER_RSV; + hdr->read_stag = cpu_to_be32(mem_reg->rkey); + hdr->read_va = cpu_to_be64(mem_reg->sge.addr); + + iser_dbg("Cmd itt:%d READ tags RKEY:%#.4X VA:%#llX\n", + task->itt, mem_reg->rkey, + (unsigned long long)mem_reg->sge.addr); + + return 0; +} + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. 
Data size is stored in + * task->data[ISER_DIR_OUT].data_len, Protection size + * is stored at task->prot[ISER_DIR_OUT].data_len + */ +static int +iser_prepare_write_cmd(struct iscsi_task *task, + unsigned int imm_sz, + unsigned int unsol_sz, + unsigned int edtl) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_mem_reg *mem_reg; + int err; + struct iser_ctrl *hdr = &iser_task->desc.iser_header; + struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT]; + struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1]; + + err = iser_dma_map_task_data(iser_task, + buf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; + + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT]; + + err = iser_dma_map_task_data(iser_task, + pbuf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; + } + + err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT, + buf_out->data_len == imm_sz); + if (err != 0) { + iser_err("Failed to register write cmd RDMA mem\n"); + return err; + } + + mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT]; + + if (unsol_sz < edtl) { + hdr->flags |= ISER_WSV; + if (buf_out->data_len > imm_sz) { + hdr->write_stag = cpu_to_be32(mem_reg->rkey); + hdr->write_va = cpu_to_be64(mem_reg->sge.addr + unsol_sz); + } + + iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X VA:%#llX + unsol:%d\n", + task->itt, mem_reg->rkey, + (unsigned long long)mem_reg->sge.addr, unsol_sz); + } + + if (imm_sz > 0) { + iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n", + task->itt, imm_sz); + tx_dsg->addr = mem_reg->sge.addr; + tx_dsg->length = imm_sz; + tx_dsg->lkey = mem_reg->sge.lkey; + iser_task->desc.num_sge = 2; + } + + return 0; +} + +/* creates a new tx descriptor and adds header regd buffer */ +static void iser_create_send_desc(struct iser_conn *iser_conn, + struct iser_tx_desc *tx_desc) +{ + struct iser_device *device = iser_conn->ib_conn.device; + + ib_dma_sync_single_for_cpu(device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl)); + tx_desc->iser_header.flags = ISER_VER; + tx_desc->num_sge = 1; +} + +static void iser_free_login_buf(struct iser_conn *iser_conn) +{ + struct iser_device *device = iser_conn->ib_conn.device; + struct iser_login_desc *desc = &iser_conn->login_desc; + + if (!desc->req) + return; + + ib_dma_unmap_single(device->ib_device, desc->req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + + ib_dma_unmap_single(device->ib_device, desc->rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + + kfree(desc->req); + kfree(desc->rsp); + + /* make sure we never redo any unmapping */ + desc->req = NULL; + desc->rsp = NULL; +} + +static int iser_alloc_login_buf(struct iser_conn *iser_conn) +{ + struct iser_device *device = iser_conn->ib_conn.device; + struct iser_login_desc *desc = &iser_conn->login_desc; + + desc->req = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN, GFP_KERNEL); + if (!desc->req) + return -ENOMEM; + + desc->req_dma = ib_dma_map_single(device->ib_device, desc->req, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device->ib_device, + desc->req_dma)) + goto free_req; + + desc->rsp = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL); + if (!desc->rsp) + goto unmap_req; + + desc->rsp_dma = ib_dma_map_single(device->ib_device, desc->rsp, + ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device->ib_device, + desc->rsp_dma)) + goto free_rsp; + + return 0; + +free_rsp: + kfree(desc->rsp); +unmap_req: + 
ib_dma_unmap_single(device->ib_device, desc->req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); +free_req: + kfree(desc->req); + + return -ENOMEM; +} + +int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, + struct iscsi_session *session) +{ + int i, j; + u64 dma_addr; + struct iser_rx_desc *rx_desc; + struct ib_sge *rx_sg; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + iser_conn->qp_max_recv_dtos = session->cmds_max; + iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ + iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; + + if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max, + iser_conn->scsi_sg_tablesize)) + goto create_rdma_reg_res_failed; + + if (iser_alloc_login_buf(iser_conn)) + goto alloc_login_buf_fail; + + iser_conn->num_rx_descs = session->cmds_max; + iser_conn->rx_descs = kmalloc(iser_conn->num_rx_descs * + sizeof(struct iser_rx_desc), GFP_KERNEL); + if (!iser_conn->rx_descs) + goto rx_desc_alloc_fail; + + rx_desc = iser_conn->rx_descs; + + for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) { + dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device->ib_device, dma_addr)) + goto rx_desc_dma_map_failed; + + rx_desc->dma_addr = dma_addr; + rx_desc->cqe.done = iser_task_rsp; + rx_sg = &rx_desc->rx_sg; + rx_sg->addr = rx_desc->dma_addr; + rx_sg->length = ISER_RX_PAYLOAD_SIZE; + rx_sg->lkey = device->pd->local_dma_lkey; + } + + iser_conn->rx_desc_head = 0; + return 0; + +rx_desc_dma_map_failed: + rx_desc = iser_conn->rx_descs; + for (j = 0; j < i; j++, rx_desc++) + ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + kfree(iser_conn->rx_descs); + iser_conn->rx_descs = NULL; +rx_desc_alloc_fail: + iser_free_login_buf(iser_conn); +alloc_login_buf_fail: + device->reg_ops->free_reg_res(ib_conn); +create_rdma_reg_res_failed: + iser_err("failed allocating rx descriptors / data buffers\n"); + return -ENOMEM; +} + +void iser_free_rx_descriptors(struct iser_conn *iser_conn) +{ + int i; + struct iser_rx_desc *rx_desc; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + if (device->reg_ops->free_reg_res) + device->reg_ops->free_reg_res(ib_conn); + + rx_desc = iser_conn->rx_descs; + for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) + ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + kfree(iser_conn->rx_descs); + /* make sure we never redo any unmapping */ + iser_conn->rx_descs = NULL; + + iser_free_login_buf(iser_conn); +} + +static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iscsi_session *session = conn->session; + + iser_dbg("req op %x flags %x\n", req->opcode, req->flags); + /* check if this is the last login - going to full feature phase */ + if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) + return 0; + + /* + * Check that there is one posted recv buffer + * (for the last login response). 
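+ * That single buffer was posted by iser_post_recvl() when the previous
+ * login PDU was sent from iser_send_control().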
+ */ + WARN_ON(ib_conn->post_recv_buf_count != 1); + + if (session->discovery_sess) { + iser_info("Discovery session, re-using login RX buffer\n"); + return 0; + } else + iser_info("Normal session, posting batch of RX %d buffers\n", + iser_conn->min_posted_rx); + + /* Initial post receive buffers */ + if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx)) + return -ENOMEM; + + return 0; +} + +static inline bool iser_signal_comp(u8 sig_count) +{ + return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0); +} + +/** + * iser_send_command - send command PDU + */ +int iser_send_command(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + unsigned long edtl; + int err; + struct iser_data_buf *data_buf, *prot_buf; + struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; + struct scsi_cmnd *sc = task->sc; + struct iser_tx_desc *tx_desc = &iser_task->desc; + u8 sig_count = ++iser_conn->ib_conn.sig_count; + + edtl = ntohl(hdr->data_length); + + /* build the tx desc regd header and add it to the tx desc dto */ + tx_desc->type = ISCSI_TX_SCSI_COMMAND; + tx_desc->cqe.done = iser_cmd_comp; + iser_create_send_desc(iser_conn, tx_desc); + + if (hdr->flags & ISCSI_FLAG_CMD_READ) { + data_buf = &iser_task->data[ISER_DIR_IN]; + prot_buf = &iser_task->prot[ISER_DIR_IN]; + } else { + data_buf = &iser_task->data[ISER_DIR_OUT]; + prot_buf = &iser_task->prot[ISER_DIR_OUT]; + } + + if (scsi_sg_count(sc)) { /* using a scatter list */ + data_buf->sg = scsi_sglist(sc); + data_buf->size = scsi_sg_count(sc); + } + data_buf->data_len = scsi_bufflen(sc); + + if (scsi_prot_sg_count(sc)) { + prot_buf->sg = scsi_prot_sglist(sc); + prot_buf->size = scsi_prot_sg_count(sc); + prot_buf->data_len = (data_buf->data_len >> + ilog2(sc->device->sector_size)) * 8; + } + + if (hdr->flags & ISCSI_FLAG_CMD_READ) { + err = iser_prepare_read_cmd(task); + if (err) + goto send_command_error; + } + if (hdr->flags & ISCSI_FLAG_CMD_WRITE) { + err = iser_prepare_write_cmd(task, + task->imm_count, + task->imm_count + + task->unsol_r2t.data_length, + edtl); + if (err) + goto send_command_error; + } + + iser_task->status = ISER_TASK_STATUS_STARTED; + + err = iser_post_send(&iser_conn->ib_conn, tx_desc, + iser_signal_comp(sig_count)); + if (!err) + return 0; + +send_command_error: + iser_err("conn %p failed task->itt %d err %d\n",conn, task->itt, err); + return err; +} + +/** + * iser_send_data_out - send data out PDU + */ +int iser_send_data_out(struct iscsi_conn *conn, + struct iscsi_task *task, + struct iscsi_data *hdr) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *tx_desc; + struct iser_mem_reg *mem_reg; + unsigned long buf_offset; + unsigned long data_seg_len; + uint32_t itt; + int err; + struct ib_sge *tx_dsg; + + itt = (__force uint32_t)hdr->itt; + data_seg_len = ntoh24(hdr->dlength); + buf_offset = ntohl(hdr->offset); + + iser_dbg("%s itt %d dseg_len %d offset %d\n", + __func__,(int)itt,(int)data_seg_len,(int)buf_offset); + + tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC); + if (!tx_desc) + return -ENOMEM; + + tx_desc->type = ISCSI_TX_DATAOUT; + tx_desc->cqe.done = iser_dataout_comp; + tx_desc->iser_header.flags = ISER_VER; + memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr)); + + /* build the tx desc */ + err = iser_initialize_task_headers(task, tx_desc); + if (err) + goto send_data_out_error; + + mem_reg = 
&iser_task->rdma_reg[ISER_DIR_OUT]; + tx_dsg = &tx_desc->tx_sg[1]; + tx_dsg->addr = mem_reg->sge.addr + buf_offset; + tx_dsg->length = data_seg_len; + tx_dsg->lkey = mem_reg->sge.lkey; + tx_desc->num_sge = 2; + + if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) { + iser_err("Offset:%ld & DSL:%ld in Data-Out inconsistent with total len:%ld, itt:%d\n", + buf_offset, data_seg_len, + iser_task->data[ISER_DIR_OUT].data_len, itt); + err = -EINVAL; + goto send_data_out_error; + } + iser_dbg("data-out itt: %d, offset: %ld, sz: %ld\n", + itt, buf_offset, data_seg_len); + + + err = iser_post_send(&iser_conn->ib_conn, tx_desc, true); + if (!err) + return 0; + +send_data_out_error: + kmem_cache_free(ig.desc_cache, tx_desc); + iser_err("conn %p failed err %d\n", conn, err); + return err; +} + +int iser_send_control(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *mdesc = &iser_task->desc; + unsigned long data_seg_len; + int err = 0; + struct iser_device *device; + + /* build the tx desc regd header and add it to the tx desc dto */ + mdesc->type = ISCSI_TX_CONTROL; + mdesc->cqe.done = iser_ctrl_comp; + iser_create_send_desc(iser_conn, mdesc); + + device = iser_conn->ib_conn.device; + + data_seg_len = ntoh24(task->hdr->dlength); + + if (data_seg_len > 0) { + struct iser_login_desc *desc = &iser_conn->login_desc; + struct ib_sge *tx_dsg = &mdesc->tx_sg[1]; + + if (task != conn->login_task) { + iser_err("data present on non login task!!!\n"); + goto send_control_error; + } + + ib_dma_sync_single_for_cpu(device->ib_device, desc->req_dma, + task->data_count, DMA_TO_DEVICE); + + memcpy(desc->req, task->data, task->data_count); + + ib_dma_sync_single_for_device(device->ib_device, desc->req_dma, + task->data_count, DMA_TO_DEVICE); + + tx_dsg->addr = desc->req_dma; + tx_dsg->length = task->data_count; + tx_dsg->lkey = device->pd->local_dma_lkey; + mdesc->num_sge = 2; + } + + if (task == conn->login_task) { + iser_dbg("op %x dsl %lx, posting login rx buffer\n", + task->hdr->opcode, data_seg_len); + err = iser_post_recvl(iser_conn); + if (err) + goto send_control_error; + err = iser_post_rx_bufs(conn, task->hdr); + if (err) + goto send_control_error; + } + + err = iser_post_send(&iser_conn->ib_conn, mdesc, true); + if (!err) + return 0; + +send_control_error: + iser_err("conn %p failed err %d\n",conn, err); + return err; +} + +void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_conn *iser_conn = to_iser_conn(ib_conn); + struct iser_login_desc *desc = iser_login(wc->wr_cqe); + struct iscsi_hdr *hdr; + char *data; + int length; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "login_rsp"); + return; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, + desc->rsp_dma, ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + + hdr = desc->rsp + sizeof(struct iser_ctrl); + data = desc->rsp + ISER_HEADERS_LEN; + length = wc->byte_len - ISER_HEADERS_LEN; + + iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, + hdr->itt, length); + + iscsi_iser_recv(iser_conn->iscsi_conn, hdr, data, length); + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + desc->rsp_dma, ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + + ib_conn->post_recv_buf_count--; +} + +static inline void +iser_inv_desc(struct iser_fr_desc *desc, u32 rkey) +{ + if (likely(rkey == desc->rsc.mr->rkey)) + desc->rsc.mr_valid = 0; 
+ else if (likely(rkey == desc->pi_ctx->sig_mr->rkey)) + desc->pi_ctx->sig_mr_valid = 0; +} + +static int +iser_check_remote_inv(struct iser_conn *iser_conn, + struct ib_wc *wc, + struct iscsi_hdr *hdr) +{ + if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { + struct iscsi_task *task; + u32 rkey = wc->ex.invalidate_rkey; + + iser_dbg("conn %p: remote invalidation for rkey %#x\n", + iser_conn, rkey); + + if (unlikely(!iser_conn->snd_w_inv)) { + iser_err("conn %p: unexpected remote invalidation, terminating connection\n", + iser_conn); + return -EPROTO; + } + + task = iscsi_itt_to_ctask(iser_conn->iscsi_conn, hdr->itt); + if (likely(task)) { + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_fr_desc *desc; + + if (iser_task->dir[ISER_DIR_IN]) { + desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h; + iser_inv_desc(desc, rkey); + } + + if (iser_task->dir[ISER_DIR_OUT]) { + desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h; + iser_inv_desc(desc, rkey); + } + } else { + iser_err("failed to get task for itt=%d\n", hdr->itt); + return -EINVAL; + } + } + + return 0; +} + + +void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_conn *iser_conn = to_iser_conn(ib_conn); + struct iser_rx_desc *desc = iser_rx(wc->wr_cqe); + struct iscsi_hdr *hdr; + int length; + int outstanding, count, err; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "task_rsp"); + return; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, + desc->dma_addr, ISER_RX_PAYLOAD_SIZE, + DMA_FROM_DEVICE); + + hdr = &desc->iscsi_header; + length = wc->byte_len - ISER_HEADERS_LEN; + + iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, + hdr->itt, length); + + if (iser_check_remote_inv(iser_conn, wc, hdr)) { + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + return; + } + + iscsi_iser_recv(iser_conn->iscsi_conn, hdr, desc->data, length); + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + desc->dma_addr, ISER_RX_PAYLOAD_SIZE, + DMA_FROM_DEVICE); + + /* decrementing conn->post_recv_buf_count only --after-- freeing the * + * task eliminates the need to worry on tasks which are completed in * + * parallel to the execution of iser_conn_term. 
So the code that waits * + * for the posted rx bufs refcount to become zero handles everything */ + ib_conn->post_recv_buf_count--; + + outstanding = ib_conn->post_recv_buf_count; + if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) { + count = min(iser_conn->qp_max_recv_dtos - outstanding, + iser_conn->min_posted_rx); + err = iser_post_recvm(iser_conn, count); + if (err) + iser_err("posting %d rx bufs err %d\n", count, err); + } +} + +void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + iser_err_comp(wc, "command"); +} + +void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct iser_tx_desc *desc = iser_tx(wc->wr_cqe); + struct iscsi_task *task; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "control"); + return; + } + + /* this arithmetic is legal by libiscsi dd_data allocation */ + task = (void *)desc - sizeof(struct iscsi_task); + if (task->hdr->itt == RESERVED_ITT) + iscsi_put_task(task); +} + +void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct iser_tx_desc *desc = iser_tx(wc->wr_cqe); + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_device *device = ib_conn->device; + + if (unlikely(wc->status != IB_WC_SUCCESS)) + iser_err_comp(wc, "dataout"); + + ib_dma_unmap_single(device->ib_device, desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + kmem_cache_free(ig.desc_cache, desc); +} + +void iser_task_rdma_init(struct iscsi_iser_task *iser_task) + +{ + iser_task->status = ISER_TASK_STATUS_INIT; + + iser_task->dir[ISER_DIR_IN] = 0; + iser_task->dir[ISER_DIR_OUT] = 0; + + iser_task->data[ISER_DIR_IN].data_len = 0; + iser_task->data[ISER_DIR_OUT].data_len = 0; + + iser_task->prot[ISER_DIR_IN].data_len = 0; + iser_task->prot[ISER_DIR_OUT].data_len = 0; + + memset(&iser_task->rdma_reg[ISER_DIR_IN], 0, + sizeof(struct iser_mem_reg)); + memset(&iser_task->rdma_reg[ISER_DIR_OUT], 0, + sizeof(struct iser_mem_reg)); +} + +void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) +{ + int prot_count = scsi_prot_sg_count(iser_task->sc); + + if (iser_task->dir[ISER_DIR_IN]) { + iser_unreg_rdma_mem(iser_task, ISER_DIR_IN); + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_IN], + DMA_FROM_DEVICE); + if (prot_count) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_IN], + DMA_FROM_DEVICE); + } + + if (iser_task->dir[ISER_DIR_OUT]) { + iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT); + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_OUT], + DMA_TO_DEVICE); + if (prot_count) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_OUT], + DMA_TO_DEVICE); + } +} diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c new file mode 100644 index 0000000..322209d --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -0,0 +1,578 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include + +#include "iscsi_iser.h" +static +int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *mem_reg); +static +int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *mem_reg); + +static const struct iser_reg_ops fastreg_ops = { + .alloc_reg_res = iser_alloc_fastreg_pool, + .free_reg_res = iser_free_fastreg_pool, + .reg_mem = iser_fast_reg_mr, + .unreg_mem = iser_unreg_mem_fastreg, + .reg_desc_get = iser_reg_desc_get_fr, + .reg_desc_put = iser_reg_desc_put_fr, +}; + +static const struct iser_reg_ops fmr_ops = { + .alloc_reg_res = iser_alloc_fmr_pool, + .free_reg_res = iser_free_fmr_pool, + .reg_mem = iser_fast_reg_fmr, + .unreg_mem = iser_unreg_mem_fmr, + .reg_desc_get = iser_reg_desc_get_fmr, + .reg_desc_put = iser_reg_desc_put_fmr, +}; + +void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + iser_err_comp(wc, "memreg"); +} + +int iser_assign_reg_ops(struct iser_device *device) +{ + struct ib_device *ib_dev = device->ib_device; + + /* Assign function handles - based on FMR support */ + if (ib_dev->alloc_fmr && ib_dev->dealloc_fmr && + ib_dev->map_phys_fmr && ib_dev->unmap_fmr) { + iser_info("FMR supported, using FMR for registration\n"); + device->reg_ops = &fmr_ops; + } else if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + iser_info("FastReg supported, using FastReg for registration\n"); + device->reg_ops = &fastreg_ops; + device->remote_inv_sup = iser_always_reg; + } else { + iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); + return -1; + } + + return 0; +} + +struct iser_fr_desc * +iser_reg_desc_get_fr(struct ib_conn *ib_conn) +{ + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc; + unsigned long flags; + + spin_lock_irqsave(&fr_pool->lock, flags); + desc = list_first_entry(&fr_pool->list, + struct iser_fr_desc, list); + list_del(&desc->list); + spin_unlock_irqrestore(&fr_pool->lock, flags); + + return desc; +} + +void +iser_reg_desc_put_fr(struct ib_conn *ib_conn, + struct iser_fr_desc *desc) +{ + struct iser_fr_pool *fr_pool = 
&ib_conn->fr_pool; + unsigned long flags; + + spin_lock_irqsave(&fr_pool->lock, flags); + list_add(&desc->list, &fr_pool->list); + spin_unlock_irqrestore(&fr_pool->lock, flags); +} + +struct iser_fr_desc * +iser_reg_desc_get_fmr(struct ib_conn *ib_conn) +{ + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + + return list_first_entry(&fr_pool->list, + struct iser_fr_desc, list); +} + +void +iser_reg_desc_put_fmr(struct ib_conn *ib_conn, + struct iser_fr_desc *desc) +{ +} + +static void iser_data_buf_dump(struct iser_data_buf *data, + struct ib_device *ibdev) +{ + struct scatterlist *sg; + int i; + + for_each_sg(data->sg, sg, data->dma_nents, i) + iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p " + "off:0x%x sz:0x%x dma_len:0x%x\n", + i, (unsigned long)ib_sg_dma_address(ibdev, sg), + sg_page(sg), sg->offset, + sg->length, ib_sg_dma_len(ibdev, sg)); +} + +static void iser_dump_page_vec(struct iser_page_vec *page_vec) +{ + int i; + + iser_err("page vec npages %d data length %lld\n", + page_vec->npages, page_vec->fake_mr.length); + for (i = 0; i < page_vec->npages; i++) + iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]); +} + +int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir) +{ + struct ib_device *dev; + + iser_task->dir[iser_dir] = 1; + dev = iser_task->iser_conn->ib_conn.device->ib_device; + + data->dma_nents = ib_dma_map_sg(dev, data->sg, data->size, dma_dir); + if (data->dma_nents == 0) { + iser_err("dma_map_sg failed!!!\n"); + return -EINVAL; + } + return 0; +} + +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum dma_data_direction dir) +{ + struct ib_device *dev; + + dev = iser_task->iser_conn->ib_conn.device->ib_device; + ib_dma_unmap_sg(dev, data->sg, data->size, dir); +} + +static int +iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, + struct iser_mem_reg *reg) +{ + struct scatterlist *sg = mem->sg; + + reg->sge.lkey = device->pd->local_dma_lkey; + /* + * FIXME: rework the registration code path to differentiate + * rkey/lkey use cases + */ + + if (device->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) + reg->rkey = device->pd->unsafe_global_rkey; + else + reg->rkey = 0; + reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]); + reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]); + + iser_dbg("Single DMA entry: lkey=0x%x, rkey=0x%x, addr=0x%llx," + " length=0x%x\n", reg->sge.lkey, reg->rkey, + reg->sge.addr, reg->sge.length); + + return 0; +} + +static int iser_set_page(struct ib_mr *mr, u64 addr) +{ + struct iser_page_vec *page_vec = + container_of(mr, struct iser_page_vec, fake_mr); + + page_vec->pages[page_vec->npages++] = addr; + + return 0; +} + +static +int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *reg) +{ + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + struct iser_page_vec *page_vec = rsc->page_vec; + struct ib_fmr_pool *fmr_pool = rsc->fmr_pool; + struct ib_pool_fmr *fmr; + int ret, plen; + + page_vec->npages = 0; + page_vec->fake_mr.page_size = SIZE_4K; + plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg, + mem->size, NULL, iser_set_page); + if (unlikely(plen < mem->size)) { + iser_err("page vec too short to hold this SG\n"); + iser_data_buf_dump(mem, device->ib_device); + iser_dump_page_vec(page_vec); + return -EINVAL; + } + + fmr = 
ib_fmr_pool_map_phys(fmr_pool, page_vec->pages, + page_vec->npages, page_vec->pages[0]); + if (IS_ERR(fmr)) { + ret = PTR_ERR(fmr); + iser_err("ib_fmr_pool_map_phys failed: %d\n", ret); + return ret; + } + + reg->sge.lkey = fmr->fmr->lkey; + reg->rkey = fmr->fmr->rkey; + reg->sge.addr = page_vec->fake_mr.iova; + reg->sge.length = page_vec->fake_mr.length; + reg->mem_h = fmr; + + iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx," + " length=0x%x\n", reg->sge.lkey, reg->rkey, + reg->sge.addr, reg->sge.length); + + return 0; +} + +/** + * Unregister (previosuly registered using FMR) memory. + * If memory is non-FMR does nothing. + */ +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; + int ret; + + if (!reg->mem_h) + return; + + iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n", reg->mem_h); + + ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); + if (ret) + iser_err("ib_fmr_pool_unmap failed %d\n", ret); + + reg->mem_h = NULL; +} + +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_device *device = iser_task->iser_conn->ib_conn.device; + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; + + if (!reg->mem_h) + return; + + device->reg_ops->reg_desc_put(&iser_task->iser_conn->ib_conn, + reg->mem_h); + reg->mem_h = NULL; +} + +static void +iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, + struct ib_sig_domain *domain) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.pi_interval = scsi_prot_interval(sc); + domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc); + /* + * At the moment we hard code those, but in the future + * we will take them from sc. + */ + domain->sig.dif.apptag_check_mask = 0xffff; + domain->sig.dif.app_escape = true; + domain->sig.dif.ref_escape = true; + if (sc->prot_flags & SCSI_PROT_REF_INCREMENT) + domain->sig.dif.ref_remap = true; +}; + +static int +iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) +{ + switch (scsi_get_prot_op(sc)) { + case SCSI_PROT_WRITE_INSERT: + case SCSI_PROT_READ_STRIP: + sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + break; + case SCSI_PROT_READ_INSERT: + case SCSI_PROT_WRITE_STRIP: + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); + sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? + IB_T10DIF_CSUM : IB_T10DIF_CRC; + break; + case SCSI_PROT_READ_PASS: + case SCSI_PROT_WRITE_PASS: + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); + sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? 
+ IB_T10DIF_CSUM : IB_T10DIF_CRC; + break; + default: + iser_err("Unsupported PI operation %d\n", + scsi_get_prot_op(sc)); + return -EINVAL; + } + + return 0; +} + +static inline void +iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) +{ + *mask = 0; + if (sc->prot_flags & SCSI_PROT_REF_CHECK) + *mask |= ISER_CHECK_REFTAG; + if (sc->prot_flags & SCSI_PROT_GUARD_CHECK) + *mask |= ISER_CHECK_GUARD; +} + +static inline void +iser_inv_rkey(struct ib_send_wr *inv_wr, + struct ib_mr *mr, + struct ib_cqe *cqe) +{ + inv_wr->opcode = IB_WR_LOCAL_INV; + inv_wr->wr_cqe = cqe; + inv_wr->ex.invalidate_rkey = mr->rkey; + inv_wr->send_flags = 0; + inv_wr->num_sge = 0; +} + +static int +iser_reg_sig_mr(struct iscsi_iser_task *iser_task, + struct iser_pi_context *pi_ctx, + struct iser_mem_reg *data_reg, + struct iser_mem_reg *prot_reg, + struct iser_mem_reg *sig_reg) +{ + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs; + struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; + struct ib_sig_handover_wr *wr; + struct ib_mr *mr = pi_ctx->sig_mr; + int ret; + + memset(sig_attrs, 0, sizeof(*sig_attrs)); + ret = iser_set_sig_attrs(iser_task->sc, sig_attrs); + if (ret) + goto err; + + iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask); + + if (pi_ctx->sig_mr_valid) + iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe); + + ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); + + wr = sig_handover_wr(iser_tx_next_wr(tx_desc)); + wr->wr.opcode = IB_WR_REG_SIG_MR; + wr->wr.wr_cqe = cqe; + wr->wr.sg_list = &data_reg->sge; + wr->wr.num_sge = 1; + wr->wr.send_flags = 0; + wr->sig_attrs = sig_attrs; + wr->sig_mr = mr; + if (scsi_prot_sg_count(iser_task->sc)) + wr->prot = &prot_reg->sge; + else + wr->prot = NULL; + wr->access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + pi_ctx->sig_mr_valid = 1; + + sig_reg->sge.lkey = mr->lkey; + sig_reg->rkey = mr->rkey; + sig_reg->sge.addr = 0; + sig_reg->sge.length = scsi_transfer_length(iser_task->sc); + + iser_dbg("lkey=0x%x rkey=0x%x addr=0x%llx length=%u\n", + sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr, + sig_reg->sge.length); +err: + return ret; +} + +static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *reg) +{ + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; + struct ib_mr *mr = rsc->mr; + struct ib_reg_wr *wr; + int n; + + if (rsc->mr_valid) + iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe); + + ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); + + n = ib_map_mr_sg(mr, mem->sg, mem->size, NULL, SIZE_4K); + if (unlikely(n != mem->size)) { + iser_err("failed to map sg (%d/%d)\n", + n, mem->size); + return n < 0 ? 
n : -EINVAL; + } + + wr = reg_wr(iser_tx_next_wr(tx_desc)); + wr->wr.opcode = IB_WR_REG_MR; + wr->wr.wr_cqe = cqe; + wr->wr.send_flags = 0; + wr->wr.num_sge = 0; + wr->mr = mr; + wr->key = mr->rkey; + wr->access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + + rsc->mr_valid = 1; + + reg->sge.lkey = mr->lkey; + reg->rkey = mr->rkey; + reg->sge.addr = mr->iova; + reg->sge.length = mr->length; + + iser_dbg("lkey=0x%x rkey=0x%x addr=0x%llx length=0x%x\n", + reg->sge.lkey, reg->rkey, reg->sge.addr, reg->sge.length); + + return 0; +} + +static int +iser_reg_prot_sg(struct iscsi_iser_task *task, + struct iser_data_buf *mem, + struct iser_fr_desc *desc, + bool use_dma_key, + struct iser_mem_reg *reg) +{ + struct iser_device *device = task->iser_conn->ib_conn.device; + + if (use_dma_key) + return iser_reg_dma(device, mem, reg); + + return device->reg_ops->reg_mem(task, mem, &desc->pi_ctx->rsc, reg); +} + +static int +iser_reg_data_sg(struct iscsi_iser_task *task, + struct iser_data_buf *mem, + struct iser_fr_desc *desc, + bool use_dma_key, + struct iser_mem_reg *reg) +{ + struct iser_device *device = task->iser_conn->ib_conn.device; + + if (use_dma_key) + return iser_reg_dma(device, mem, reg); + + return device->reg_ops->reg_mem(task, mem, &desc->rsc, reg); +} + +int iser_reg_rdma_mem(struct iscsi_iser_task *task, + enum iser_data_dir dir, + bool all_imm) +{ + struct ib_conn *ib_conn = &task->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + struct iser_data_buf *mem = &task->data[dir]; + struct iser_mem_reg *reg = &task->rdma_reg[dir]; + struct iser_mem_reg *data_reg; + struct iser_fr_desc *desc = NULL; + bool use_dma_key; + int err; + + use_dma_key = mem->dma_nents == 1 && (all_imm || !iser_always_reg) && + scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL; + + if (!use_dma_key) { + desc = device->reg_ops->reg_desc_get(ib_conn); + reg->mem_h = desc; + } + + if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) + data_reg = reg; + else + data_reg = &task->desc.data_reg; + + err = iser_reg_data_sg(task, mem, desc, use_dma_key, data_reg); + if (unlikely(err)) + goto err_reg; + + if (scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) { + struct iser_mem_reg *prot_reg = &task->desc.prot_reg; + + if (scsi_prot_sg_count(task->sc)) { + mem = &task->prot[dir]; + err = iser_reg_prot_sg(task, mem, desc, + use_dma_key, prot_reg); + if (unlikely(err)) + goto err_reg; + } + + err = iser_reg_sig_mr(task, desc->pi_ctx, data_reg, + prot_reg, reg); + if (unlikely(err)) + goto err_reg; + + desc->pi_ctx->sig_protected = 1; + } + + return 0; + +err_reg: + if (desc) + device->reg_ops->reg_desc_put(ib_conn, desc); + + return err; +} + +void iser_unreg_rdma_mem(struct iscsi_iser_task *task, + enum iser_data_dir dir) +{ + struct iser_device *device = task->iser_conn->ib_conn.device; + + device->reg_ops->unreg_mem(task, dir); +} diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c new file mode 100644 index 0000000..56b7240 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -0,0 +1,1160 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include + +#include "iscsi_iser.h" + +#define ISCSI_ISER_MAX_CONN 8 +#define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \ + ISCSI_ISER_MAX_CONN) + +static void iser_qp_event_callback(struct ib_event *cause, void *context) +{ + iser_err("qp event %s (%d)\n", + ib_event_msg(cause->event), cause->event); +} + +static void iser_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + iser_err("async event %s (%d) on device %s port %d\n", + ib_event_msg(event->event), event->event, + event->device->name, event->element.port_num); +} + +/** + * iser_create_device_ib_res - creates Protection Domain (PD), Completion + * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with + * the adapator. + * + * returns 0 on success, -1 on failure + */ +static int iser_create_device_ib_res(struct iser_device *device) +{ + struct ib_device *ib_dev = device->ib_device; + int ret, i, max_cqe; + + ret = iser_assign_reg_ops(device); + if (ret) + return ret; + + device->comps_used = min_t(int, num_online_cpus(), + ib_dev->num_comp_vectors); + + device->comps = kcalloc(device->comps_used, sizeof(*device->comps), + GFP_KERNEL); + if (!device->comps) + goto comps_err; + + max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe); + + iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n", + device->comps_used, ib_dev->name, + ib_dev->num_comp_vectors, max_cqe); + + device->pd = ib_alloc_pd(ib_dev, + iser_always_reg ? 
0 : IB_PD_UNSAFE_GLOBAL_RKEY); + if (IS_ERR(device->pd)) + goto pd_err; + + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + comp->cq = ib_alloc_cq(ib_dev, comp, max_cqe, i, + IB_POLL_SOFTIRQ); + if (IS_ERR(comp->cq)) { + comp->cq = NULL; + goto cq_err; + } + } + + INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev, + iser_event_handler); + ib_register_event_handler(&device->event_handler); + return 0; + +cq_err: + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + if (comp->cq) + ib_free_cq(comp->cq); + } + ib_dealloc_pd(device->pd); +pd_err: + kfree(device->comps); +comps_err: + iser_err("failed to allocate an IB resource\n"); + return -1; +} + +/** + * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR, + * CQ and PD created with the device associated with the adapator. + */ +static void iser_free_device_ib_res(struct iser_device *device) +{ + int i; + + for (i = 0; i < device->comps_used; i++) { + struct iser_comp *comp = &device->comps[i]; + + ib_free_cq(comp->cq); + comp->cq = NULL; + } + + ib_unregister_event_handler(&device->event_handler); + ib_dealloc_pd(device->pd); + + kfree(device->comps); + device->comps = NULL; + device->pd = NULL; +} + +/** + * iser_alloc_fmr_pool - Creates FMR pool and page_vector + * + * returns 0 on success, or errno code on failure + */ +int iser_alloc_fmr_pool(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size) +{ + struct iser_device *device = ib_conn->device; + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_page_vec *page_vec; + struct iser_fr_desc *desc; + struct ib_fmr_pool *fmr_pool; + struct ib_fmr_pool_param params; + int ret; + + INIT_LIST_HEAD(&fr_pool->list); + spin_lock_init(&fr_pool->lock); + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return -ENOMEM; + + page_vec = kmalloc(sizeof(*page_vec) + (sizeof(u64) * size), + GFP_KERNEL); + if (!page_vec) { + ret = -ENOMEM; + goto err_frpl; + } + + page_vec->pages = (u64 *)(page_vec + 1); + + params.page_shift = SHIFT_4K; + params.max_pages_per_fmr = size; + /* make the pool size twice the max number of SCSI commands * + * the ML is expected to queue, watermark for unmap at 50% */ + params.pool_size = cmds_max * 2; + params.dirty_watermark = cmds_max; + params.cache = 0; + params.flush_function = NULL; + params.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + fmr_pool = ib_create_fmr_pool(device->pd, ¶ms); + if (IS_ERR(fmr_pool)) { + ret = PTR_ERR(fmr_pool); + iser_err("FMR allocation failed, err %d\n", ret); + goto err_fmr; + } + + desc->rsc.page_vec = page_vec; + desc->rsc.fmr_pool = fmr_pool; + list_add(&desc->list, &fr_pool->list); + + return 0; + +err_fmr: + kfree(page_vec); +err_frpl: + kfree(desc); + + return ret; +} + +/** + * iser_free_fmr_pool - releases the FMR pool and page vec + */ +void iser_free_fmr_pool(struct ib_conn *ib_conn) +{ + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc; + + desc = list_first_entry(&fr_pool->list, + struct iser_fr_desc, list); + list_del(&desc->list); + + iser_info("freeing conn %p fmr pool %p\n", + ib_conn, desc->rsc.fmr_pool); + + ib_destroy_fmr_pool(desc->rsc.fmr_pool); + kfree(desc->rsc.page_vec); + kfree(desc); +} + +static int +iser_alloc_reg_res(struct iser_device *device, + struct ib_pd *pd, + struct iser_reg_resources *res, + unsigned int size) +{ + struct ib_device *ib_dev = device->ib_device; + enum ib_mr_type mr_type; + int ret; + + 
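+ /* SG_GAPS MRs can register SG lists with gaps and arbitrary alignment; otherwise fall back to regular page-list (MEM_REG) MRs */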
if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + + res->mr = ib_alloc_mr(pd, mr_type, size); + if (IS_ERR(res->mr)) { + ret = PTR_ERR(res->mr); + iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); + return ret; + } + res->mr_valid = 0; + + return 0; +} + +static void +iser_free_reg_res(struct iser_reg_resources *rsc) +{ + ib_dereg_mr(rsc->mr); +} + +static int +iser_alloc_pi_ctx(struct iser_device *device, + struct ib_pd *pd, + struct iser_fr_desc *desc, + unsigned int size) +{ + struct iser_pi_context *pi_ctx = NULL; + int ret; + + desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); + if (!desc->pi_ctx) + return -ENOMEM; + + pi_ctx = desc->pi_ctx; + + ret = iser_alloc_reg_res(device, pd, &pi_ctx->rsc, size); + if (ret) { + iser_err("failed to allocate reg_resources\n"); + goto alloc_reg_res_err; + } + + pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2); + if (IS_ERR(pi_ctx->sig_mr)) { + ret = PTR_ERR(pi_ctx->sig_mr); + goto sig_mr_failure; + } + pi_ctx->sig_mr_valid = 0; + desc->pi_ctx->sig_protected = 0; + + return 0; + +sig_mr_failure: + iser_free_reg_res(&pi_ctx->rsc); +alloc_reg_res_err: + kfree(desc->pi_ctx); + + return ret; +} + +static void +iser_free_pi_ctx(struct iser_pi_context *pi_ctx) +{ + iser_free_reg_res(&pi_ctx->rsc); + ib_dereg_mr(pi_ctx->sig_mr); + kfree(pi_ctx); +} + +static struct iser_fr_desc * +iser_create_fastreg_desc(struct iser_device *device, + struct ib_pd *pd, + bool pi_enable, + unsigned int size) +{ + struct iser_fr_desc *desc; + int ret; + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return ERR_PTR(-ENOMEM); + + ret = iser_alloc_reg_res(device, pd, &desc->rsc, size); + if (ret) + goto reg_res_alloc_failure; + + if (pi_enable) { + ret = iser_alloc_pi_ctx(device, pd, desc, size); + if (ret) + goto pi_ctx_alloc_failure; + } + + return desc; + +pi_ctx_alloc_failure: + iser_free_reg_res(&desc->rsc); +reg_res_alloc_failure: + kfree(desc); + + return ERR_PTR(ret); +} + +/** + * iser_alloc_fastreg_pool - Creates pool of fast_reg descriptors + * for fast registration work requests. 
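+ * Each descriptor holds a fastreg MR and, when pi_support is enabled, an additional protection context, so up to cmds_max commands can have registrations in flight at once.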
+ * returns 0 on success, or errno code on failure + */ +int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size) +{ + struct iser_device *device = ib_conn->device; + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc; + int i, ret; + + INIT_LIST_HEAD(&fr_pool->list); + INIT_LIST_HEAD(&fr_pool->all_list); + spin_lock_init(&fr_pool->lock); + fr_pool->size = 0; + for (i = 0; i < cmds_max; i++) { + desc = iser_create_fastreg_desc(device, device->pd, + ib_conn->pi_support, size); + if (IS_ERR(desc)) { + ret = PTR_ERR(desc); + goto err; + } + + list_add_tail(&desc->list, &fr_pool->list); + list_add_tail(&desc->all_list, &fr_pool->all_list); + fr_pool->size++; + } + + return 0; + +err: + iser_free_fastreg_pool(ib_conn); + return ret; +} + +/** + * iser_free_fastreg_pool - releases the pool of fast_reg descriptors + */ +void iser_free_fastreg_pool(struct ib_conn *ib_conn) +{ + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc, *tmp; + int i = 0; + + if (list_empty(&fr_pool->all_list)) + return; + + iser_info("freeing conn %p fr pool\n", ib_conn); + + list_for_each_entry_safe(desc, tmp, &fr_pool->all_list, all_list) { + list_del(&desc->all_list); + iser_free_reg_res(&desc->rsc); + if (desc->pi_ctx) + iser_free_pi_ctx(desc->pi_ctx); + kfree(desc); + ++i; + } + + if (i < fr_pool->size) + iser_warn("pool still has %d regions registered\n", + fr_pool->size - i); +} + +/** + * iser_create_ib_conn_res - Queue-Pair (QP) + * + * returns 0 on success, -1 on failure + */ +static int iser_create_ib_conn_res(struct ib_conn *ib_conn) +{ + struct iser_conn *iser_conn = to_iser_conn(ib_conn); + struct iser_device *device; + struct ib_device *ib_dev; + struct ib_qp_init_attr init_attr; + int ret = -ENOMEM; + int index, min_index = 0; + + BUG_ON(ib_conn->device == NULL); + + device = ib_conn->device; + ib_dev = device->ib_device; + + memset(&init_attr, 0, sizeof init_attr); + + mutex_lock(&ig.connlist_mutex); + /* select the CQ with the minimal number of usages */ + for (index = 0; index < device->comps_used; index++) { + if (device->comps[index].active_qps < + device->comps[min_index].active_qps) + min_index = index; + } + ib_conn->comp = &device->comps[min_index]; + ib_conn->comp->active_qps++; + mutex_unlock(&ig.connlist_mutex); + iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn); + + init_attr.event_handler = iser_qp_event_callback; + init_attr.qp_context = (void *)ib_conn; + init_attr.send_cq = ib_conn->comp->cq; + init_attr.recv_cq = ib_conn->comp->cq; + init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; + init_attr.cap.max_send_sge = 2; + init_attr.cap.max_recv_sge = 1; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + if (ib_conn->pi_support) { + init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; + init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + iser_conn->max_cmds = + ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS); + } else { + if (ib_dev->attrs.max_qp_wr > ISER_QP_MAX_REQ_DTOS) { + init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; + iser_conn->max_cmds = + ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS); + } else { + init_attr.cap.max_send_wr = ib_dev->attrs.max_qp_wr; + iser_conn->max_cmds = + ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr); + iser_dbg("device %s supports max_send_wr %d\n", + device->ib_device->name, ib_dev->attrs.max_qp_wr); + } + } + + ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); + if (ret) + goto out_err; + + 
ib_conn->qp = ib_conn->cma_id->qp;
+        iser_info("setting conn %p cma_id %p qp %p\n",
+                  ib_conn, ib_conn->cma_id,
+                  ib_conn->cma_id->qp);
+        return ret;
+
+out_err:
+        mutex_lock(&ig.connlist_mutex);
+        ib_conn->comp->active_qps--;
+        mutex_unlock(&ig.connlist_mutex);
+        iser_err("unable to alloc mem or create resource, err %d\n", ret);
+
+        return ret;
+}
+
+/**
+ * Based on the resolved device node GUID, see if there is an already allocated
+ * device for this device. If there is no such device, create one.
+ */
+static
+struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
+{
+        struct iser_device *device;
+
+        mutex_lock(&ig.device_list_mutex);
+
+        list_for_each_entry(device, &ig.device_list, ig_list)
+                /* find if there's a match using the node GUID */
+                if (device->ib_device->node_guid == cma_id->device->node_guid)
+                        goto inc_refcnt;
+
+        device = kzalloc(sizeof *device, GFP_KERNEL);
+        if (device == NULL)
+                goto out;
+
+        /* assign the IB device to the new iser device */
+        device->ib_device = cma_id->device;
+        /* init the device and link it into ig device list */
+        if (iser_create_device_ib_res(device)) {
+                kfree(device);
+                device = NULL;
+                goto out;
+        }
+        list_add(&device->ig_list, &ig.device_list);
+
+inc_refcnt:
+        device->refcount++;
+out:
+        mutex_unlock(&ig.device_list_mutex);
+        return device;
+}
+
+/* if there's no demand for this device, release it */
+static void iser_device_try_release(struct iser_device *device)
+{
+        mutex_lock(&ig.device_list_mutex);
+        device->refcount--;
+        iser_info("device %p refcount %d\n", device, device->refcount);
+        if (!device->refcount) {
+                iser_free_device_ib_res(device);
+                list_del(&device->ig_list);
+                kfree(device);
+        }
+        mutex_unlock(&ig.device_list_mutex);
+}
+
+/**
+ * Called with state mutex held
+ **/
+static int iser_conn_state_comp_exch(struct iser_conn *iser_conn,
+                                     enum iser_conn_state comp,
+                                     enum iser_conn_state exch)
+{
+        int ret;
+
+        ret = (iser_conn->state == comp);
+        if (ret)
+                iser_conn->state = exch;
+
+        return ret;
+}
+
+void iser_release_work(struct work_struct *work)
+{
+        struct iser_conn *iser_conn;
+
+        iser_conn = container_of(work, struct iser_conn, release_work);
+
+        /* Wait for conn_stop to complete */
+        wait_for_completion(&iser_conn->stop_completion);
+        /* Wait for IB resources cleanup to complete */
+        wait_for_completion(&iser_conn->ib_completion);
+
+        mutex_lock(&iser_conn->state_mutex);
+        iser_conn->state = ISER_CONN_DOWN;
+        mutex_unlock(&iser_conn->state_mutex);
+
+        iser_conn_release(iser_conn);
+}
+
+/**
+ * iser_free_ib_conn_res - release IB related resources
+ * @iser_conn: iser connection struct
+ * @destroy: indicator if we need to try to release the
+ *           iser device and memory regions pool (only iscsi
+ *           shutdown and DEVICE_REMOVAL will use this).
+ *
+ * This routine is called with the iser state mutex held
+ * so the cm_id removal is out of here. It is safe to
+ * be invoked multiple times.
+ */
+static void iser_free_ib_conn_res(struct iser_conn *iser_conn,
+                                  bool destroy)
+{
+        struct ib_conn *ib_conn = &iser_conn->ib_conn;
+        struct iser_device *device = ib_conn->device;
+
+        iser_info("freeing conn %p cma_id %p qp %p\n",
+                  iser_conn, ib_conn->cma_id, ib_conn->qp);
+
+        if (ib_conn->qp != NULL) {
+                mutex_lock(&ig.connlist_mutex);
+                ib_conn->comp->active_qps--;
+                mutex_unlock(&ig.connlist_mutex);
+                rdma_destroy_qp(ib_conn->cma_id);
+                ib_conn->qp = NULL;
+        }
+
+        if (destroy) {
+                if (iser_conn->rx_descs)
+                        iser_free_rx_descriptors(iser_conn);
+
+                if (device != NULL) {
+                        iser_device_try_release(device);
+                        ib_conn->device = NULL;
+                }
+        }
+}
+
+/**
+ * Frees all connection objects and deallocates the connection descriptor.
+ */
+void iser_conn_release(struct iser_conn *iser_conn)
+{
+        struct ib_conn *ib_conn = &iser_conn->ib_conn;
+
+        mutex_lock(&ig.connlist_mutex);
+        list_del(&iser_conn->conn_list);
+        mutex_unlock(&ig.connlist_mutex);
+
+        mutex_lock(&iser_conn->state_mutex);
+        /* In case we end up here without ep_disconnect being invoked. */
+        if (iser_conn->state != ISER_CONN_DOWN) {
+                iser_warn("iser conn %p state %d, expected state down.\n",
+                          iser_conn, iser_conn->state);
+                iscsi_destroy_endpoint(iser_conn->ep);
+                iser_conn->state = ISER_CONN_DOWN;
+        }
+        /*
+         * In case we never got to bind stage, we still need to
+         * release IB resources (which is safe to call more than once).
+         */
+        iser_free_ib_conn_res(iser_conn, true);
+        mutex_unlock(&iser_conn->state_mutex);
+
+        if (ib_conn->cma_id != NULL) {
+                rdma_destroy_id(ib_conn->cma_id);
+                ib_conn->cma_id = NULL;
+        }
+
+        kfree(iser_conn);
+}
+
+/**
+ * Triggers the start of the disconnect procedures and waits for them to be done.
+ * Called with state mutex held
+ */
+int iser_conn_terminate(struct iser_conn *iser_conn)
+{
+        struct ib_conn *ib_conn = &iser_conn->ib_conn;
+        int err = 0;
+
+        /* terminate the iser conn only if the conn state is UP */
+        if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
+                                       ISER_CONN_TERMINATING))
+                return 0;
+
+        iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state);
+
+        /* suspend queuing of new iscsi commands */
+        if (iser_conn->iscsi_conn)
+                iscsi_suspend_queue(iser_conn->iscsi_conn);
+
+        /*
+         * In case we didn't already clean up the cma_id (peer initiated
+         * a disconnection), we need to cause the CMA to change the QP
+         * state to ERROR.
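+         * Moving the QP to the error state flushes any outstanding work
+         * requests; the ib_drain_sq() call below blocks until their flush
+         * completions have been consumed.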
+ */ + if (ib_conn->cma_id) { + err = rdma_disconnect(ib_conn->cma_id); + if (err) + iser_err("Failed to disconnect, conn: 0x%p err %d\n", + iser_conn, err); + + /* block until all flush errors are consumed */ + ib_drain_sq(ib_conn->qp); + } + + return 1; +} + +/** + * Called with state mutex held + **/ +static void iser_connect_error(struct rdma_cm_id *cma_id) +{ + struct iser_conn *iser_conn; + + iser_conn = (struct iser_conn *)cma_id->context; + iser_conn->state = ISER_CONN_TERMINATING; +} + +static void +iser_calc_scsi_params(struct iser_conn *iser_conn, + unsigned int max_sectors) +{ + struct iser_device *device = iser_conn->ib_conn.device; + unsigned short sg_tablesize, sup_sg_tablesize; + + sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K); + if (device->ib_device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS) + sup_sg_tablesize = + min_t( + uint, ISCSI_ISER_MAX_SG_TABLESIZE, + device->ib_device->attrs.max_fast_reg_page_list_len); + else + sup_sg_tablesize = ISCSI_ISER_MAX_SG_TABLESIZE; + + iser_conn->scsi_sg_tablesize = min(sg_tablesize, sup_sg_tablesize); +} + +/** + * Called with state mutex held + **/ +static void iser_addr_handler(struct rdma_cm_id *cma_id) +{ + struct iser_device *device; + struct iser_conn *iser_conn; + struct ib_conn *ib_conn; + int ret; + + iser_conn = (struct iser_conn *)cma_id->context; + if (iser_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + + ib_conn = &iser_conn->ib_conn; + device = iser_device_find_by_ib_device(cma_id); + if (!device) { + iser_err("device lookup/creation failed\n"); + iser_connect_error(cma_id); + return; + } + + ib_conn->device = device; + + /* connection T10-PI support */ + if (iser_pi_enable) { + if (!(device->ib_device->attrs.device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER)) { + iser_warn("T10-PI requested but not supported on %s, " + "continue without T10-PI\n", + ib_conn->device->ib_device->name); + ib_conn->pi_support = false; + } else { + ib_conn->pi_support = true; + } + } + + iser_calc_scsi_params(iser_conn, iser_max_sectors); + + ret = rdma_resolve_route(cma_id, 1000); + if (ret) { + iser_err("resolve route failed: %d\n", ret); + iser_connect_error(cma_id); + return; + } +} + +/** + * Called with state mutex held + **/ +static void iser_route_handler(struct rdma_cm_id *cma_id) +{ + struct rdma_conn_param conn_param; + int ret; + struct iser_cm_hdr req_hdr; + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + if (iser_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + + ret = iser_create_ib_conn_res(ib_conn); + if (ret) + goto failure; + + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = device->ib_device->attrs.max_qp_rd_atom; + conn_param.initiator_depth = 1; + conn_param.retry_count = 7; + conn_param.rnr_retry_count = 6; + + memset(&req_hdr, 0, sizeof(req_hdr)); + req_hdr.flags = ISER_ZBVA_NOT_SUP; + if (!device->remote_inv_sup) + req_hdr.flags |= ISER_SEND_W_INV_NOT_SUP; + conn_param.private_data = (void *)&req_hdr; + conn_param.private_data_len = sizeof(struct iser_cm_hdr); + + ret = rdma_connect(cma_id, &conn_param); + if (ret) { + iser_err("failure connecting: %d\n", ret); + goto failure; + } + + return; +failure: + iser_connect_error(cma_id); +} + +static void iser_connected_handler(struct rdma_cm_id *cma_id, + const void *private_data) +{ + struct iser_conn *iser_conn; + struct ib_qp_attr attr; + struct ib_qp_init_attr 
init_attr; + + iser_conn = (struct iser_conn *)cma_id->context; + if (iser_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + + (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); + iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); + + if (private_data) { + u8 flags = *(u8 *)private_data; + + iser_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP); + } + + iser_info("conn %p: negotiated %s invalidation\n", + iser_conn, iser_conn->snd_w_inv ? "remote" : "local"); + + iser_conn->state = ISER_CONN_UP; + complete(&iser_conn->up_completion); +} + +static void iser_disconnected_handler(struct rdma_cm_id *cma_id) +{ + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; + + if (iser_conn_terminate(iser_conn)) { + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + else + iser_err("iscsi_iser connection isn't bound\n"); + } +} + +static void iser_cleanup_handler(struct rdma_cm_id *cma_id, + bool destroy) +{ + struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; + + /* + * We are not guaranteed that we visited disconnected_handler + * by now, call it here to be safe that we handle CM drep + * and flush errors. + */ + iser_disconnected_handler(cma_id); + iser_free_ib_conn_res(iser_conn, destroy); + complete(&iser_conn->ib_completion); +}; + +static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct iser_conn *iser_conn; + int ret = 0; + + iser_conn = (struct iser_conn *)cma_id->context; + iser_info("%s (%d): status %d conn %p id %p\n", + rdma_event_msg(event->event), event->event, + event->status, cma_id->context, cma_id); + + mutex_lock(&iser_conn->state_mutex); + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + iser_addr_handler(cma_id); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + iser_route_handler(cma_id); + break; + case RDMA_CM_EVENT_ESTABLISHED: + iser_connected_handler(cma_id, event->param.conn.private_data); + break; + case RDMA_CM_EVENT_REJECTED: + iser_info("Connection rejected: %s\n", + rdma_reject_msg(cma_id, event->status)); + /* FALLTHROUGH */ + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + iser_connect_error(cma_id); + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + iser_cleanup_handler(cma_id, false); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* + * we *must* destroy the device as we cannot rely + * on iscsid to be around to initiate error handling. + * also if we are not in state DOWN implicitly destroy + * the cma_id. 
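+ * (returning a non-zero value from this handler tells the RDMA CM core to destroy the cma_id on our behalf)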
+ */ + iser_cleanup_handler(cma_id, true); + if (iser_conn->state != ISER_CONN_DOWN) { + iser_conn->ib_conn.cma_id = NULL; + ret = 1; + } + break; + default: + iser_err("Unexpected RDMA CM event: %s (%d)\n", + rdma_event_msg(event->event), event->event); + break; + } + mutex_unlock(&iser_conn->state_mutex); + + return ret; +} + +void iser_conn_init(struct iser_conn *iser_conn) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + + iser_conn->state = ISER_CONN_INIT; + init_completion(&iser_conn->stop_completion); + init_completion(&iser_conn->ib_completion); + init_completion(&iser_conn->up_completion); + INIT_LIST_HEAD(&iser_conn->conn_list); + mutex_init(&iser_conn->state_mutex); + + ib_conn->post_recv_buf_count = 0; + ib_conn->reg_cqe.done = iser_reg_comp; +} + + /** + * starts the process of connecting to the target + * sleeps until the connection is established or rejected + */ +int iser_connect(struct iser_conn *iser_conn, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, + int non_blocking) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + int err = 0; + + mutex_lock(&iser_conn->state_mutex); + + sprintf(iser_conn->name, "%pISp", dst_addr); + + iser_info("connecting to: %s\n", iser_conn->name); + + /* the device is known only --after-- address resolution */ + ib_conn->device = NULL; + + iser_conn->state = ISER_CONN_PENDING; + + ib_conn->cma_id = rdma_create_id(&init_net, iser_cma_handler, + (void *)iser_conn, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ib_conn->cma_id)) { + err = PTR_ERR(ib_conn->cma_id); + iser_err("rdma_create_id failed: %d\n", err); + goto id_failure; + } + + err = rdma_resolve_addr(ib_conn->cma_id, src_addr, dst_addr, 1000); + if (err) { + iser_err("rdma_resolve_addr failed: %d\n", err); + goto addr_failure; + } + + if (!non_blocking) { + wait_for_completion_interruptible(&iser_conn->up_completion); + + if (iser_conn->state != ISER_CONN_UP) { + err = -EIO; + goto connect_failure; + } + } + mutex_unlock(&iser_conn->state_mutex); + + mutex_lock(&ig.connlist_mutex); + list_add(&iser_conn->conn_list, &ig.connlist); + mutex_unlock(&ig.connlist_mutex); + return 0; + +id_failure: + ib_conn->cma_id = NULL; +addr_failure: + iser_conn->state = ISER_CONN_DOWN; +connect_failure: + mutex_unlock(&iser_conn->state_mutex); + iser_conn_release(iser_conn); + return err; +} + +int iser_post_recvl(struct iser_conn *iser_conn) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_login_desc *desc = &iser_conn->login_desc; + struct ib_recv_wr wr, *wr_failed; + int ib_ret; + + desc->sge.addr = desc->rsp_dma; + desc->sge.length = ISER_RX_LOGIN_SIZE; + desc->sge.lkey = ib_conn->device->pd->local_dma_lkey; + + desc->cqe.done = iser_login_rsp; + wr.wr_cqe = &desc->cqe; + wr.sg_list = &desc->sge; + wr.num_sge = 1; + wr.next = NULL; + + ib_conn->post_recv_buf_count++; + ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); + if (ib_ret) { + iser_err("ib_post_recv failed ret=%d\n", ib_ret); + ib_conn->post_recv_buf_count--; + } + + return ib_ret; +} + +int iser_post_recvm(struct iser_conn *iser_conn, int count) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + unsigned int my_rx_head = iser_conn->rx_desc_head; + struct iser_rx_desc *rx_desc; + struct ib_recv_wr *wr, *wr_failed; + int i, ib_ret; + + for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) { + rx_desc = &iser_conn->rx_descs[my_rx_head]; + rx_desc->cqe.done = iser_task_rsp; + wr->wr_cqe = &rx_desc->cqe; + wr->sg_list = &rx_desc->rx_sg; + wr->num_sge = 1; + wr->next = wr + 1; + my_rx_head = (my_rx_head + 1) 
& iser_conn->qp_max_recv_dtos_mask; + } + + wr--; + wr->next = NULL; /* mark end of work requests list */ + + ib_conn->post_recv_buf_count += count; + ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed); + if (ib_ret) { + iser_err("ib_post_recv failed ret=%d\n", ib_ret); + ib_conn->post_recv_buf_count -= count; + } else + iser_conn->rx_desc_head = my_rx_head; + + return ib_ret; +} + + +/** + * iser_start_send - Initiate a Send DTO operation + * + * returns 0 on success, -1 on failure + */ +int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, + bool signal) +{ + struct ib_send_wr *bad_wr, *wr = iser_tx_next_wr(tx_desc); + int ib_ret; + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, + DMA_TO_DEVICE); + + wr->next = NULL; + wr->wr_cqe = &tx_desc->cqe; + wr->sg_list = tx_desc->tx_sg; + wr->num_sge = tx_desc->num_sge; + wr->opcode = IB_WR_SEND; + wr->send_flags = signal ? IB_SEND_SIGNALED : 0; + + ib_ret = ib_post_send(ib_conn->qp, &tx_desc->wrs[0].send, &bad_wr); + if (ib_ret) + iser_err("ib_post_send failed, ret:%d opcode:%d\n", + ib_ret, bad_wr->opcode); + + return ib_ret; +} + +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector) +{ + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; + struct iser_fr_desc *desc = reg->mem_h; + unsigned long sector_size = iser_task->sc->device->sector_size; + struct ib_mr_status mr_status; + int ret; + + if (desc && desc->pi_ctx->sig_protected) { + desc->pi_ctx->sig_protected = 0; + ret = ib_check_mr_status(desc->pi_ctx->sig_mr, + IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + goto err; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + sector_t sector_off = mr_status.sig_err.sig_err_offset; + + sector_div(sector_off, sector_size + 8); + *sector = scsi_get_lba(iser_task->sc) + sector_off; + + pr_err("PI error found type %d at sector %llx " + "expected %x vs actual %x\n", + mr_status.sig_err.err_type, + (unsigned long long)*sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + return 0x1; + case IB_SIG_BAD_REFTAG: + return 0x3; + case IB_SIG_BAD_APPTAG: + return 0x2; + } + } + } + + return 0; +err: + /* Not alot we can do here, return ambiguous guard error */ + return 0x1; +} + +void iser_err_comp(struct ib_wc *wc, const char *type) +{ + if (wc->status != IB_WC_WR_FLUSH_ERR) { + struct iser_conn *iser_conn = to_iser_conn(wc->qp->qp_context); + + iser_err("%s failure: %s (%d) vend_err %#x\n", type, + ib_wc_status_msg(wc->status), wc->status, + wc->vendor_err); + + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + } else { + iser_dbg("%s failure: %s (%d)\n", type, + ib_wc_status_msg(wc->status), wc->status); + } +} diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig new file mode 100644 index 0000000..02f9759 --- /dev/null +++ b/drivers/infiniband/ulp/isert/Kconfig @@ -0,0 +1,5 @@ +config INFINIBAND_ISERT + tristate "iSCSI Extensions for RDMA (iSER) target support" + depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET + ---help--- + Support for iSCSI Extensions for RDMA (iSER) Target on Infiniband fabrics. 
diff --git a/drivers/infiniband/ulp/isert/Makefile b/drivers/infiniband/ulp/isert/Makefile new file mode 100644 index 0000000..c8bf242 --- /dev/null +++ b/drivers/infiniband/ulp/isert/Makefile @@ -0,0 +1,2 @@ +ccflags-y := -Idrivers/target -Idrivers/target/iscsi +obj-$(CONFIG_INFINIBAND_ISERT) += ib_isert.o diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c new file mode 100644 index 0000000..fff40b0 --- /dev/null +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -0,0 +1,2725 @@ +/******************************************************************************* + * This file contains iSCSI extentions for RDMA (iSER) Verbs + * + * (c) Copyright 2013 Datera, Inc. + * + * Nicholas A. Bellinger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ib_isert.h" + +#define ISERT_MAX_CONN 8 +#define ISER_MAX_RX_CQ_LEN (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN) +#define ISER_MAX_TX_CQ_LEN \ + ((ISERT_QP_MAX_REQ_DTOS + ISCSI_DEF_XMIT_CMDS_MAX) * ISERT_MAX_CONN) +#define ISER_MAX_CQ_LEN (ISER_MAX_RX_CQ_LEN + ISER_MAX_TX_CQ_LEN + \ + ISERT_MAX_CONN) + +static int isert_debug_level; +module_param_named(debug_level, isert_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)"); + +static DEFINE_MUTEX(device_list_mutex); +static LIST_HEAD(device_list); +static struct workqueue_struct *isert_comp_wq; +static struct workqueue_struct *isert_release_wq; + +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd); +static int +isert_login_post_recv(struct isert_conn *isert_conn); +static int +isert_rdma_accept(struct isert_conn *isert_conn); +struct rdma_cm_id *isert_setup_id(struct isert_np *isert_np); + +static void isert_release_work(struct work_struct *work); +static void isert_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void isert_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void isert_login_send_done(struct ib_cq *cq, struct ib_wc *wc); + +static inline bool +isert_prot_cmd(struct isert_conn *conn, struct se_cmd *cmd) +{ + return (conn->pi_support && + cmd->prot_op != TARGET_PROT_NORMAL); +} + + +static void +isert_qp_event_callback(struct ib_event *e, void *context) +{ + struct isert_conn *isert_conn = context; + + isert_err("%s (%d): conn %p\n", + ib_event_msg(e->event), e->event, isert_conn); + + switch (e->event) { + case IB_EVENT_COMM_EST: + rdma_notify(isert_conn->cm_id, IB_EVENT_COMM_EST); + break; + case IB_EVENT_QP_LAST_WQE_REACHED: + isert_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED\n"); + break; + default: + break; + } +} + +static struct isert_comp * +isert_comp_get(struct isert_conn *isert_conn) +{ + struct isert_device *device = isert_conn->device; + struct isert_comp *comp; + int i, min = 0; + + mutex_lock(&device_list_mutex); + 
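+ /* select the completion context with the minimal number of active QPs to spread the load */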
for (i = 0; i < device->comps_used; i++) + if (device->comps[i].active_qps < + device->comps[min].active_qps) + min = i; + comp = &device->comps[min]; + comp->active_qps++; + mutex_unlock(&device_list_mutex); + + isert_info("conn %p, using comp %p min_index: %d\n", + isert_conn, comp, min); + + return comp; +} + +static void +isert_comp_put(struct isert_comp *comp) +{ + mutex_lock(&device_list_mutex); + comp->active_qps--; + mutex_unlock(&device_list_mutex); +} + +static struct ib_qp * +isert_create_qp(struct isert_conn *isert_conn, + struct isert_comp *comp, + struct rdma_cm_id *cma_id) +{ + struct isert_device *device = isert_conn->device; + struct ib_qp_init_attr attr; + int ret; + + memset(&attr, 0, sizeof(struct ib_qp_init_attr)); + attr.event_handler = isert_qp_event_callback; + attr.qp_context = isert_conn; + attr.send_cq = comp->cq; + attr.recv_cq = comp->cq; + attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1; + attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; + attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX; + attr.cap.max_send_sge = device->ib_device->attrs.max_sge; + attr.cap.max_recv_sge = 1; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + if (device->pi_capable) + attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; + + ret = rdma_create_qp(cma_id, device->pd, &attr); + if (ret) { + isert_err("rdma_create_qp failed for cma_id %d\n", ret); + return ERR_PTR(ret); + } + + return cma_id->qp; +} + +static int +isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id) +{ + struct isert_comp *comp; + int ret; + + comp = isert_comp_get(isert_conn); + isert_conn->qp = isert_create_qp(isert_conn, comp, cma_id); + if (IS_ERR(isert_conn->qp)) { + ret = PTR_ERR(isert_conn->qp); + goto err; + } + + return 0; +err: + isert_comp_put(comp); + return ret; +} + +static int +isert_alloc_rx_descriptors(struct isert_conn *isert_conn) +{ + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct iser_rx_desc *rx_desc; + struct ib_sge *rx_sg; + u64 dma_addr; + int i, j; + + isert_conn->rx_descs = kzalloc(ISERT_QP_MAX_RECV_DTOS * + sizeof(struct iser_rx_desc), GFP_KERNEL); + if (!isert_conn->rx_descs) + return -ENOMEM; + + rx_desc = isert_conn->rx_descs; + + for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { + dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ib_dev, dma_addr)) + goto dma_map_fail; + + rx_desc->dma_addr = dma_addr; + + rx_sg = &rx_desc->rx_sg; + rx_sg->addr = rx_desc->dma_addr; + rx_sg->length = ISER_RX_PAYLOAD_SIZE; + rx_sg->lkey = device->pd->local_dma_lkey; + rx_desc->rx_cqe.done = isert_recv_done; + } + + return 0; + +dma_map_fail: + rx_desc = isert_conn->rx_descs; + for (j = 0; j < i; j++, rx_desc++) { + ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + } + kfree(isert_conn->rx_descs); + isert_conn->rx_descs = NULL; + isert_err("conn %p failed to allocate rx descriptors\n", isert_conn); + return -ENOMEM; +} + +static void +isert_free_rx_descriptors(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->device->ib_device; + struct iser_rx_desc *rx_desc; + int i; + + if (!isert_conn->rx_descs) + return; + + rx_desc = isert_conn->rx_descs; + for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { + ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + } + + kfree(isert_conn->rx_descs); + isert_conn->rx_descs = NULL; 
+} + +static void +isert_free_comps(struct isert_device *device) +{ + int i; + + for (i = 0; i < device->comps_used; i++) { + struct isert_comp *comp = &device->comps[i]; + + if (comp->cq) + ib_free_cq(comp->cq); + } + kfree(device->comps); +} + +static int +isert_alloc_comps(struct isert_device *device) +{ + int i, max_cqe, ret = 0; + + device->comps_used = min(ISERT_MAX_CQ, min_t(int, num_online_cpus(), + device->ib_device->num_comp_vectors)); + + isert_info("Using %d CQs, %s supports %d vectors support " + "pi_capable %d\n", + device->comps_used, device->ib_device->name, + device->ib_device->num_comp_vectors, + device->pi_capable); + + device->comps = kcalloc(device->comps_used, sizeof(struct isert_comp), + GFP_KERNEL); + if (!device->comps) + return -ENOMEM; + + max_cqe = min(ISER_MAX_CQ_LEN, device->ib_device->attrs.max_cqe); + + for (i = 0; i < device->comps_used; i++) { + struct isert_comp *comp = &device->comps[i]; + + comp->device = device; + comp->cq = ib_alloc_cq(device->ib_device, comp, max_cqe, i, + IB_POLL_WORKQUEUE); + if (IS_ERR(comp->cq)) { + isert_err("Unable to allocate cq\n"); + ret = PTR_ERR(comp->cq); + comp->cq = NULL; + goto out_cq; + } + } + + return 0; +out_cq: + isert_free_comps(device); + return ret; +} + +static int +isert_create_device_ib_res(struct isert_device *device) +{ + struct ib_device *ib_dev = device->ib_device; + int ret; + + isert_dbg("devattr->max_sge: %d\n", ib_dev->attrs.max_sge); + isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd); + + ret = isert_alloc_comps(device); + if (ret) + goto out; + + device->pd = ib_alloc_pd(ib_dev, 0); + if (IS_ERR(device->pd)) { + ret = PTR_ERR(device->pd); + isert_err("failed to allocate pd, device %p, ret=%d\n", + device, ret); + goto out_cq; + } + + /* Check signature cap */ + device->pi_capable = ib_dev->attrs.device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER ? 
true : false; + + return 0; + +out_cq: + isert_free_comps(device); +out: + if (ret > 0) + ret = -EINVAL; + return ret; +} + +static void +isert_free_device_ib_res(struct isert_device *device) +{ + isert_info("device %p\n", device); + + ib_dealloc_pd(device->pd); + isert_free_comps(device); +} + +static void +isert_device_put(struct isert_device *device) +{ + mutex_lock(&device_list_mutex); + device->refcount--; + isert_info("device %p refcount %d\n", device, device->refcount); + if (!device->refcount) { + isert_free_device_ib_res(device); + list_del(&device->dev_node); + kfree(device); + } + mutex_unlock(&device_list_mutex); +} + +static struct isert_device * +isert_device_get(struct rdma_cm_id *cma_id) +{ + struct isert_device *device; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(device, &device_list, dev_node) { + if (device->ib_device->node_guid == cma_id->device->node_guid) { + device->refcount++; + isert_info("Found iser device %p refcount %d\n", + device, device->refcount); + mutex_unlock(&device_list_mutex); + return device; + } + } + + device = kzalloc(sizeof(struct isert_device), GFP_KERNEL); + if (!device) { + mutex_unlock(&device_list_mutex); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&device->dev_node); + + device->ib_device = cma_id->device; + ret = isert_create_device_ib_res(device); + if (ret) { + kfree(device); + mutex_unlock(&device_list_mutex); + return ERR_PTR(ret); + } + + device->refcount++; + list_add_tail(&device->dev_node, &device_list); + isert_info("Created a new iser device %p refcount %d\n", + device, device->refcount); + mutex_unlock(&device_list_mutex); + + return device; +} + +static void +isert_init_conn(struct isert_conn *isert_conn) +{ + isert_conn->state = ISER_CONN_INIT; + INIT_LIST_HEAD(&isert_conn->node); + init_completion(&isert_conn->login_comp); + init_completion(&isert_conn->login_req_comp); + init_waitqueue_head(&isert_conn->rem_wait); + kref_init(&isert_conn->kref); + mutex_init(&isert_conn->mutex); + INIT_WORK(&isert_conn->release_work, isert_release_work); +} + +static void +isert_free_login_buf(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->device->ib_device; + + ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, + ISER_RX_PAYLOAD_SIZE, DMA_TO_DEVICE); + kfree(isert_conn->login_rsp_buf); + + ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, + ISER_RX_PAYLOAD_SIZE, + DMA_FROM_DEVICE); + kfree(isert_conn->login_req_buf); +} + +static int +isert_alloc_login_buf(struct isert_conn *isert_conn, + struct ib_device *ib_dev) +{ + int ret; + + isert_conn->login_req_buf = kzalloc(sizeof(*isert_conn->login_req_buf), + GFP_KERNEL); + if (!isert_conn->login_req_buf) + return -ENOMEM; + + isert_conn->login_req_dma = ib_dma_map_single(ib_dev, + isert_conn->login_req_buf, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma); + if (ret) { + isert_err("login_req_dma mapping error: %d\n", ret); + isert_conn->login_req_dma = 0; + goto out_free_login_req_buf; + } + + isert_conn->login_rsp_buf = kzalloc(ISER_RX_PAYLOAD_SIZE, GFP_KERNEL); + if (!isert_conn->login_rsp_buf) { + ret = -ENOMEM; + goto out_unmap_login_req_buf; + } + + isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev, + isert_conn->login_rsp_buf, + ISER_RX_PAYLOAD_SIZE, DMA_TO_DEVICE); + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_rsp_dma); + if (ret) { + isert_err("login_rsp_dma mapping error: %d\n", ret); + isert_conn->login_rsp_dma = 0; + goto out_free_login_rsp_buf; + } 
+ + return 0; + +out_free_login_rsp_buf: + kfree(isert_conn->login_rsp_buf); +out_unmap_login_req_buf: + ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); +out_free_login_req_buf: + kfree(isert_conn->login_req_buf); + return ret; +} + +static void +isert_set_nego_params(struct isert_conn *isert_conn, + struct rdma_conn_param *param) +{ + struct ib_device_attr *attr = &isert_conn->device->ib_device->attrs; + + /* Set max inflight RDMA READ requests */ + isert_conn->initiator_depth = min_t(u8, param->initiator_depth, + attr->max_qp_init_rd_atom); + isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth); + + if (param->private_data) { + u8 flags = *(u8 *)param->private_data; + + /* + * use remote invalidation if the both initiator + * and the HCA support it + */ + isert_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP) && + (attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (isert_conn->snd_w_inv) + isert_info("Using remote invalidation\n"); + } +} + +static int +isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct isert_np *isert_np = cma_id->context; + struct iscsi_np *np = isert_np->np; + struct isert_conn *isert_conn; + struct isert_device *device; + int ret = 0; + + spin_lock_bh(&np->np_thread_lock); + if (!np->enabled) { + spin_unlock_bh(&np->np_thread_lock); + isert_dbg("iscsi_np is not enabled, reject connect request\n"); + return rdma_reject(cma_id, NULL, 0); + } + spin_unlock_bh(&np->np_thread_lock); + + isert_dbg("cma_id: %p, portal: %p\n", + cma_id, cma_id->context); + + isert_conn = kzalloc(sizeof(struct isert_conn), GFP_KERNEL); + if (!isert_conn) + return -ENOMEM; + + isert_init_conn(isert_conn); + isert_conn->cm_id = cma_id; + + ret = isert_alloc_login_buf(isert_conn, cma_id->device); + if (ret) + goto out; + + device = isert_device_get(cma_id); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto out_rsp_dma_map; + } + isert_conn->device = device; + + isert_set_nego_params(isert_conn, &event->param.conn); + + ret = isert_conn_setup_qp(isert_conn, cma_id); + if (ret) + goto out_conn_dev; + + ret = isert_login_post_recv(isert_conn); + if (ret) + goto out_conn_dev; + + ret = isert_rdma_accept(isert_conn); + if (ret) + goto out_conn_dev; + + mutex_lock(&isert_np->mutex); + list_add_tail(&isert_conn->node, &isert_np->accepted); + mutex_unlock(&isert_np->mutex); + + return 0; + +out_conn_dev: + isert_device_put(device); +out_rsp_dma_map: + isert_free_login_buf(isert_conn); +out: + kfree(isert_conn); + rdma_reject(cma_id, NULL, 0); + return ret; +} + +static void +isert_connect_release(struct isert_conn *isert_conn) +{ + struct isert_device *device = isert_conn->device; + + isert_dbg("conn %p\n", isert_conn); + + BUG_ON(!device); + + isert_free_rx_descriptors(isert_conn); + if (isert_conn->cm_id && + !isert_conn->dev_removed) + rdma_destroy_id(isert_conn->cm_id); + + if (isert_conn->qp) { + struct isert_comp *comp = isert_conn->qp->recv_cq->cq_context; + + isert_comp_put(comp); + ib_destroy_qp(isert_conn->qp); + } + + if (isert_conn->login_req_buf) + isert_free_login_buf(isert_conn); + + isert_device_put(device); + + if (isert_conn->dev_removed) + wake_up_interruptible(&isert_conn->rem_wait); + else + kfree(isert_conn); +} + +static void +isert_connected_handler(struct rdma_cm_id *cma_id) +{ + struct isert_conn *isert_conn = cma_id->qp->qp_context; + struct isert_np *isert_np = cma_id->context; + + isert_info("conn %p\n", isert_conn); + + 
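+ /* the connection is established; mark it UP before handing it over to the accept path */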
mutex_lock(&isert_conn->mutex); + isert_conn->state = ISER_CONN_UP; + kref_get(&isert_conn->kref); + mutex_unlock(&isert_conn->mutex); + + mutex_lock(&isert_np->mutex); + list_move_tail(&isert_conn->node, &isert_np->pending); + mutex_unlock(&isert_np->mutex); + + isert_info("np %p: Allow accept_np to continue\n", isert_np); + up(&isert_np->sem); +} + +static void +isert_release_kref(struct kref *kref) +{ + struct isert_conn *isert_conn = container_of(kref, + struct isert_conn, kref); + + isert_info("conn %p final kref %s/%d\n", isert_conn, current->comm, + current->pid); + + isert_connect_release(isert_conn); +} + +static void +isert_put_conn(struct isert_conn *isert_conn) +{ + kref_put(&isert_conn->kref, isert_release_kref); +} + +static void +isert_handle_unbound_conn(struct isert_conn *isert_conn) +{ + struct isert_np *isert_np = isert_conn->cm_id->context; + + mutex_lock(&isert_np->mutex); + if (!list_empty(&isert_conn->node)) { + /* + * This means iscsi doesn't know this connection + * so schedule a cleanup ourselves + */ + list_del_init(&isert_conn->node); + isert_put_conn(isert_conn); + queue_work(isert_release_wq, &isert_conn->release_work); + } + mutex_unlock(&isert_np->mutex); +} + +/** + * isert_conn_terminate() - Initiate connection termination + * @isert_conn: isert connection struct + * + * Notes: + * In case the connection state is BOUND, move state + * to TEMINATING and start teardown sequence (rdma_disconnect). + * In case the connection state is UP, complete flush as well. + * + * This routine must be called with mutex held. Thus it is + * safe to call multiple times. + */ +static void +isert_conn_terminate(struct isert_conn *isert_conn) +{ + int err; + + if (isert_conn->state >= ISER_CONN_TERMINATING) + return; + + isert_info("Terminating conn %p state %d\n", + isert_conn, isert_conn->state); + isert_conn->state = ISER_CONN_TERMINATING; + err = rdma_disconnect(isert_conn->cm_id); + if (err) + isert_warn("Failed rdma_disconnect isert_conn %p\n", + isert_conn); +} + +static int +isert_np_cma_handler(struct isert_np *isert_np, + enum rdma_cm_event_type event) +{ + isert_dbg("%s (%d): isert np %p\n", + rdma_event_msg(event), event, isert_np); + + switch (event) { + case RDMA_CM_EVENT_DEVICE_REMOVAL: + isert_np->cm_id = NULL; + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + isert_np->cm_id = isert_setup_id(isert_np); + if (IS_ERR(isert_np->cm_id)) { + isert_err("isert np %p setup id failed: %ld\n", + isert_np, PTR_ERR(isert_np->cm_id)); + isert_np->cm_id = NULL; + } + break; + default: + isert_err("isert np %p Unexpected event %d\n", + isert_np, event); + } + + return -1; +} + +static int +isert_disconnected_handler(struct rdma_cm_id *cma_id, + enum rdma_cm_event_type event) +{ + struct isert_conn *isert_conn = cma_id->qp->qp_context; + + mutex_lock(&isert_conn->mutex); + switch (isert_conn->state) { + case ISER_CONN_TERMINATING: + break; + case ISER_CONN_UP: + isert_conn_terminate(isert_conn); + ib_drain_qp(isert_conn->qp); + isert_handle_unbound_conn(isert_conn); + break; + case ISER_CONN_BOUND: + case ISER_CONN_FULL_FEATURE: /* FALLTHRU */ + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + break; + default: + isert_warn("conn %p terminating in state %d\n", + isert_conn, isert_conn->state); + } + mutex_unlock(&isert_conn->mutex); + + return 0; +} + +static int +isert_connect_error(struct rdma_cm_id *cma_id) +{ + struct isert_conn *isert_conn = cma_id->qp->qp_context; + + ib_drain_qp(isert_conn->qp); + list_del_init(&isert_conn->node); + isert_conn->cm_id = NULL; + 
isert_put_conn(isert_conn); + + return -1; +} + +static int +isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct isert_np *isert_np = cma_id->context; + struct isert_conn *isert_conn; + int ret = 0; + + isert_info("%s (%d): status %d id %p np %p\n", + rdma_event_msg(event->event), event->event, + event->status, cma_id, cma_id->context); + + if (isert_np->cm_id == cma_id) + return isert_np_cma_handler(cma_id->context, event->event); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = isert_connect_request(cma_id, event); + if (ret) + isert_err("failed handle connect request %d\n", ret); + break; + case RDMA_CM_EVENT_ESTABLISHED: + isert_connected_handler(cma_id); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: /* FALLTHRU */ + case RDMA_CM_EVENT_DISCONNECTED: /* FALLTHRU */ + case RDMA_CM_EVENT_TIMEWAIT_EXIT: /* FALLTHRU */ + ret = isert_disconnected_handler(cma_id, event->event); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + isert_conn = cma_id->qp->qp_context; + isert_conn->dev_removed = true; + isert_disconnected_handler(cma_id, event->event); + wait_event_interruptible(isert_conn->rem_wait, + isert_conn->state == ISER_CONN_DOWN); + kfree(isert_conn); + /* + * return non-zero from the callback to destroy + * the rdma cm id + */ + return 1; + case RDMA_CM_EVENT_REJECTED: + isert_info("Connection rejected: %s\n", + rdma_reject_msg(cma_id, event->status)); + /* fall through */ + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_CONNECT_ERROR: + ret = isert_connect_error(cma_id); + break; + default: + isert_err("Unhandled RDMA CMA event: %d\n", event->event); + break; + } + + return ret; +} + +static int +isert_post_recvm(struct isert_conn *isert_conn, u32 count) +{ + struct ib_recv_wr *rx_wr, *rx_wr_failed; + int i, ret; + struct iser_rx_desc *rx_desc; + + for (rx_wr = isert_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { + rx_desc = &isert_conn->rx_descs[i]; + + rx_wr->wr_cqe = &rx_desc->rx_cqe; + rx_wr->sg_list = &rx_desc->rx_sg; + rx_wr->num_sge = 1; + rx_wr->next = rx_wr + 1; + rx_desc->in_use = false; + } + rx_wr--; + rx_wr->next = NULL; /* mark end of work requests list */ + + ret = ib_post_recv(isert_conn->qp, isert_conn->rx_wr, + &rx_wr_failed); + if (ret) + isert_err("ib_post_recv() failed with ret: %d\n", ret); + + return ret; +} + +static int +isert_post_recv(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc) +{ + struct ib_recv_wr *rx_wr_failed, rx_wr; + int ret; + + if (!rx_desc->in_use) { + /* + * if the descriptor is not in-use we already reposted it + * for recv, so just silently return + */ + return 0; + } + + rx_desc->in_use = false; + rx_wr.wr_cqe = &rx_desc->rx_cqe; + rx_wr.sg_list = &rx_desc->rx_sg; + rx_wr.num_sge = 1; + rx_wr.next = NULL; + + ret = ib_post_recv(isert_conn->qp, &rx_wr, &rx_wr_failed); + if (ret) + isert_err("ib_post_recv() failed with ret: %d\n", ret); + + return ret; +} + +static int +isert_login_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->cm_id->device; + struct ib_send_wr send_wr, *send_wr_failed; + int ret; + + ib_dma_sync_single_for_device(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + tx_desc->tx_cqe.done = isert_login_send_done; + + send_wr.next = NULL; + send_wr.wr_cqe = &tx_desc->tx_cqe; + send_wr.sg_list = tx_desc->tx_sg; + send_wr.num_sge = tx_desc->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(isert_conn->qp, &send_wr, 
&send_wr_failed); + if (ret) + isert_err("ib_post_send() failed, ret: %d\n", ret); + + return ret; +} + +static void +isert_create_send_desc(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, + struct iser_tx_desc *tx_desc) +{ + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + + ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl)); + tx_desc->iser_header.flags = ISCSI_CTRL; + + tx_desc->num_sge = 1; + + if (tx_desc->tx_sg[0].lkey != device->pd->local_dma_lkey) { + tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey; + isert_dbg("tx_desc %p lkey mismatch, fixing\n", tx_desc); + } +} + +static int +isert_init_tx_hdrs(struct isert_conn *isert_conn, + struct iser_tx_desc *tx_desc) +{ + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + u64 dma_addr; + + dma_addr = ib_dma_map_single(ib_dev, (void *)tx_desc, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, dma_addr)) { + isert_err("ib_dma_mapping_error() failed\n"); + return -ENOMEM; + } + + tx_desc->dma_addr = dma_addr; + tx_desc->tx_sg[0].addr = tx_desc->dma_addr; + tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; + tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey; + + isert_dbg("Setup tx_sg[0].addr: 0x%llx length: %u lkey: 0x%x\n", + tx_desc->tx_sg[0].addr, tx_desc->tx_sg[0].length, + tx_desc->tx_sg[0].lkey); + + return 0; +} + +static void +isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct ib_send_wr *send_wr) +{ + struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; + + tx_desc->tx_cqe.done = isert_send_done; + send_wr->wr_cqe = &tx_desc->tx_cqe; + + if (isert_conn->snd_w_inv && isert_cmd->inv_rkey) { + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = isert_cmd->inv_rkey; + } else { + send_wr->opcode = IB_WR_SEND; + } + + send_wr->sg_list = &tx_desc->tx_sg[0]; + send_wr->num_sge = isert_cmd->tx_desc.num_sge; + send_wr->send_flags = IB_SEND_SIGNALED; +} + +static int +isert_login_post_recv(struct isert_conn *isert_conn) +{ + struct ib_recv_wr rx_wr, *rx_wr_fail; + struct ib_sge sge; + int ret; + + memset(&sge, 0, sizeof(struct ib_sge)); + sge.addr = isert_conn->login_req_dma; + sge.length = ISER_RX_PAYLOAD_SIZE; + sge.lkey = isert_conn->device->pd->local_dma_lkey; + + isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n", + sge.addr, sge.length, sge.lkey); + + isert_conn->login_req_buf->rx_cqe.done = isert_login_recv_done; + + memset(&rx_wr, 0, sizeof(struct ib_recv_wr)); + rx_wr.wr_cqe = &isert_conn->login_req_buf->rx_cqe; + rx_wr.sg_list = &sge; + rx_wr.num_sge = 1; + + ret = ib_post_recv(isert_conn->qp, &rx_wr, &rx_wr_fail); + if (ret) + isert_err("ib_post_recv() failed: %d\n", ret); + + return ret; +} + +static int +isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, + u32 length) +{ + struct isert_conn *isert_conn = conn->context; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct iser_tx_desc *tx_desc = &isert_conn->login_tx_desc; + int ret; + + isert_create_send_desc(isert_conn, NULL, tx_desc); + + memcpy(&tx_desc->iscsi_header, &login->rsp[0], + sizeof(struct iscsi_hdr)); + + isert_init_tx_hdrs(isert_conn, tx_desc); + + if (length > 0) { + struct ib_sge *tx_dsg = &tx_desc->tx_sg[1]; + + ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_rsp_dma, + length, DMA_TO_DEVICE); 
+ + memcpy(isert_conn->login_rsp_buf, login->rsp_buf, length); + + ib_dma_sync_single_for_device(ib_dev, isert_conn->login_rsp_dma, + length, DMA_TO_DEVICE); + + tx_dsg->addr = isert_conn->login_rsp_dma; + tx_dsg->length = length; + tx_dsg->lkey = isert_conn->device->pd->local_dma_lkey; + tx_desc->num_sge = 2; + } + if (!login->login_failed) { + if (login->login_complete) { + ret = isert_alloc_rx_descriptors(isert_conn); + if (ret) + return ret; + + ret = isert_post_recvm(isert_conn, + ISERT_QP_MAX_RECV_DTOS); + if (ret) + return ret; + + /* Now we are in FULL_FEATURE phase */ + mutex_lock(&isert_conn->mutex); + isert_conn->state = ISER_CONN_FULL_FEATURE; + mutex_unlock(&isert_conn->mutex); + goto post_send; + } + + ret = isert_login_post_recv(isert_conn); + if (ret) + return ret; + } +post_send: + ret = isert_login_post_send(isert_conn, tx_desc); + if (ret) + return ret; + + return 0; +} + +static void +isert_rx_login_req(struct isert_conn *isert_conn) +{ + struct iser_rx_desc *rx_desc = isert_conn->login_req_buf; + int rx_buflen = isert_conn->login_req_len; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_login *login = conn->conn_login; + int size; + + isert_info("conn %p\n", isert_conn); + + WARN_ON_ONCE(!login); + + if (login->first_request) { + struct iscsi_login_req *login_req = + (struct iscsi_login_req *)&rx_desc->iscsi_header; + /* + * Setup the initial iscsi_login values from the leading + * login request PDU. + */ + login->leading_connection = (!login_req->tsih) ? 1 : 0; + login->current_stage = + (login_req->flags & ISCSI_FLAG_LOGIN_CURRENT_STAGE_MASK) + >> 2; + login->version_min = login_req->min_version; + login->version_max = login_req->max_version; + memcpy(login->isid, login_req->isid, 6); + login->cmd_sn = be32_to_cpu(login_req->cmdsn); + login->init_task_tag = login_req->itt; + login->initial_exp_statsn = be32_to_cpu(login_req->exp_statsn); + login->cid = be16_to_cpu(login_req->cid); + login->tsih = be16_to_cpu(login_req->tsih); + } + + memcpy(&login->req[0], (void *)&rx_desc->iscsi_header, ISCSI_HDR_LEN); + + size = min(rx_buflen, MAX_KEY_VALUE_PAIRS); + isert_dbg("Using login payload size: %d, rx_buflen: %d " + "MAX_KEY_VALUE_PAIRS: %d\n", size, rx_buflen, + MAX_KEY_VALUE_PAIRS); + memcpy(login->req_buf, &rx_desc->data[0], size); + + if (login->first_request) { + complete(&isert_conn->login_comp); + return; + } + schedule_delayed_work(&conn->login_work, 0); +} + +static struct iscsi_cmd +*isert_allocate_cmd(struct iscsi_conn *conn, struct iser_rx_desc *rx_desc) +{ + struct isert_conn *isert_conn = conn->context; + struct isert_cmd *isert_cmd; + struct iscsi_cmd *cmd; + + cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE); + if (!cmd) { + isert_err("Unable to allocate iscsi_cmd + isert_cmd\n"); + return NULL; + } + isert_cmd = iscsit_priv_cmd(cmd); + isert_cmd->conn = isert_conn; + isert_cmd->iscsi_cmd = cmd; + isert_cmd->rx_desc = rx_desc; + + return cmd; +} + +static int +isert_handle_scsi_cmd(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, struct iscsi_cmd *cmd, + struct iser_rx_desc *rx_desc, unsigned char *buf) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)buf; + int imm_data, imm_data_len, unsol_data, sg_nents, rc; + bool dump_payload = false; + unsigned int data_len; + + rc = iscsit_setup_scsi_cmd(conn, cmd, buf); + if (rc < 0) + return rc; + + imm_data = cmd->immediate_data; + imm_data_len = cmd->first_burst_len; + unsol_data = cmd->unsolicited_data; + data_len = 
cmd->se_cmd.data_length; + + if (imm_data && imm_data_len == data_len) + cmd->se_cmd.se_cmd_flags |= SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC; + rc = iscsit_process_scsi_cmd(conn, cmd, hdr); + if (rc < 0) { + return 0; + } else if (rc > 0) { + dump_payload = true; + goto sequence_cmd; + } + + if (!imm_data) + return 0; + + if (imm_data_len != data_len) { + sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE)); + sg_copy_from_buffer(cmd->se_cmd.t_data_sg, sg_nents, + &rx_desc->data[0], imm_data_len); + isert_dbg("Copy Immediate sg_nents: %u imm_data_len: %d\n", + sg_nents, imm_data_len); + } else { + sg_init_table(&isert_cmd->sg, 1); + cmd->se_cmd.t_data_sg = &isert_cmd->sg; + cmd->se_cmd.t_data_nents = 1; + sg_set_buf(&isert_cmd->sg, &rx_desc->data[0], imm_data_len); + isert_dbg("Transfer Immediate imm_data_len: %d\n", + imm_data_len); + } + + cmd->write_data_done += imm_data_len; + + if (cmd->write_data_done == cmd->se_cmd.data_length) { + spin_lock_bh(&cmd->istate_lock); + cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; + cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; + spin_unlock_bh(&cmd->istate_lock); + } + +sequence_cmd: + rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn); + + if (!rc && dump_payload == false && unsol_data) + iscsit_set_unsoliticed_dataout(cmd); + else if (dump_payload && imm_data) + target_put_sess_cmd(&cmd->se_cmd); + + return 0; +} + +static int +isert_handle_iscsi_dataout(struct isert_conn *isert_conn, + struct iser_rx_desc *rx_desc, unsigned char *buf) +{ + struct scatterlist *sg_start; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_cmd *cmd = NULL; + struct iscsi_data *hdr = (struct iscsi_data *)buf; + u32 unsol_data_len = ntoh24(hdr->dlength); + int rc, sg_nents, sg_off, page_off; + + rc = iscsit_check_dataout_hdr(conn, buf, &cmd); + if (rc < 0) + return rc; + else if (!cmd) + return 0; + /* + * FIXME: Unexpected unsolicited_data out + */ + if (!cmd->unsolicited_data) { + isert_err("Received unexpected solicited data payload\n"); + dump_stack(); + return -1; + } + + isert_dbg("Unsolicited DataOut unsol_data_len: %u, " + "write_data_done: %u, data_length: %u\n", + unsol_data_len, cmd->write_data_done, + cmd->se_cmd.data_length); + + sg_off = cmd->write_data_done / PAGE_SIZE; + sg_start = &cmd->se_cmd.t_data_sg[sg_off]; + sg_nents = max(1UL, DIV_ROUND_UP(unsol_data_len, PAGE_SIZE)); + page_off = cmd->write_data_done % PAGE_SIZE; + /* + * FIXME: Non page-aligned unsolicited_data out + */ + if (page_off) { + isert_err("unexpected non-page aligned data payload\n"); + dump_stack(); + return -1; + } + isert_dbg("Copying DataOut: sg_start: %p, sg_off: %u " + "sg_nents: %u from %p %u\n", sg_start, sg_off, + sg_nents, &rx_desc->data[0], unsol_data_len); + + sg_copy_from_buffer(sg_start, sg_nents, &rx_desc->data[0], + unsol_data_len); + + rc = iscsit_check_dataout_payload(cmd, hdr, false); + if (rc < 0) + return rc; + + /* + * multiple data-outs on the same command can arrive - + * so post the buffer before hand + */ + rc = isert_post_recv(isert_conn, rx_desc); + if (rc) { + isert_err("ib_post_recv failed with %d\n", rc); + return rc; + } + return 0; +} + +static int +isert_handle_nop_out(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, + unsigned char *buf) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_nopout *hdr = (struct iscsi_nopout *)buf; + int rc; + + rc = iscsit_setup_nop_out(conn, cmd, hdr); + if (rc < 0) + return rc; + /* + * FIXME: Add support for NOPOUT payload using 
unsolicited RDMA payload + */ + + return iscsit_process_nop_out(conn, cmd, hdr); +} + +static int +isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, + struct iscsi_text *hdr) +{ + struct iscsi_conn *conn = isert_conn->conn; + u32 payload_length = ntoh24(hdr->dlength); + int rc; + unsigned char *text_in = NULL; + + rc = iscsit_setup_text_cmd(conn, cmd, hdr); + if (rc < 0) + return rc; + + if (payload_length) { + text_in = kzalloc(payload_length, GFP_KERNEL); + if (!text_in) + return -ENOMEM; + } + cmd->text_in_ptr = text_in; + + memcpy(cmd->text_in_ptr, &rx_desc->data[0], payload_length); + + return iscsit_process_text_cmd(conn, cmd, hdr); +} + +static int +isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, + uint32_t read_stag, uint64_t read_va, + uint32_t write_stag, uint64_t write_va) +{ + struct iscsi_hdr *hdr = &rx_desc->iscsi_header; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_cmd *cmd; + struct isert_cmd *isert_cmd; + int ret = -EINVAL; + u8 opcode = (hdr->opcode & ISCSI_OPCODE_MASK); + + if (conn->sess->sess_ops->SessionType && + (!(opcode & ISCSI_OP_TEXT) || !(opcode & ISCSI_OP_LOGOUT))) { + isert_err("Got illegal opcode: 0x%02x in SessionType=Discovery," + " ignoring\n", opcode); + return 0; + } + + switch (opcode) { + case ISCSI_OP_SCSI_CMD: + cmd = isert_allocate_cmd(conn, rx_desc); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + isert_cmd->read_stag = read_stag; + isert_cmd->read_va = read_va; + isert_cmd->write_stag = write_stag; + isert_cmd->write_va = write_va; + isert_cmd->inv_rkey = read_stag ? read_stag : write_stag; + + ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd, + rx_desc, (unsigned char *)hdr); + break; + case ISCSI_OP_NOOP_OUT: + cmd = isert_allocate_cmd(conn, rx_desc); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + ret = isert_handle_nop_out(isert_conn, isert_cmd, cmd, + rx_desc, (unsigned char *)hdr); + break; + case ISCSI_OP_SCSI_DATA_OUT: + ret = isert_handle_iscsi_dataout(isert_conn, rx_desc, + (unsigned char *)hdr); + break; + case ISCSI_OP_SCSI_TMFUNC: + cmd = isert_allocate_cmd(conn, rx_desc); + if (!cmd) + break; + + ret = iscsit_handle_task_mgt_cmd(conn, cmd, + (unsigned char *)hdr); + break; + case ISCSI_OP_LOGOUT: + cmd = isert_allocate_cmd(conn, rx_desc); + if (!cmd) + break; + + ret = iscsit_handle_logout_cmd(conn, cmd, (unsigned char *)hdr); + break; + case ISCSI_OP_TEXT: + if (be32_to_cpu(hdr->ttt) != 0xFFFFFFFF) + cmd = iscsit_find_cmd_from_itt(conn, hdr->itt); + else + cmd = isert_allocate_cmd(conn, rx_desc); + + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + ret = isert_handle_text_cmd(isert_conn, isert_cmd, cmd, + rx_desc, (struct iscsi_text *)hdr); + break; + default: + isert_err("Got unknown iSCSI OpCode: 0x%02x\n", opcode); + dump_stack(); + break; + } + + return ret; +} + +static void +isert_print_wc(struct ib_wc *wc, const char *type) +{ + if (wc->status != IB_WC_WR_FLUSH_ERR) + isert_err("%s failure: %s (%d) vend_err %x\n", type, + ib_wc_status_msg(wc->status), wc->status, + wc->vendor_err); + else + isert_dbg("%s failure: %s (%d)\n", type, + ib_wc_status_msg(wc->status), wc->status); +} + +static void +isert_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct ib_device *ib_dev = isert_conn->cm_id->device; + struct iser_rx_desc *rx_desc = cqe_to_rx_desc(wc->wr_cqe); + struct iscsi_hdr *hdr = 
&rx_desc->iscsi_header; + struct iser_ctrl *iser_ctrl = &rx_desc->iser_header; + uint64_t read_va = 0, write_va = 0; + uint32_t read_stag = 0, write_stag = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "recv"); + if (wc->status != IB_WC_WR_FLUSH_ERR) + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + return; + } + + rx_desc->in_use = true; + + ib_dma_sync_single_for_cpu(ib_dev, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + + isert_dbg("DMA: 0x%llx, iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n", + rx_desc->dma_addr, hdr->opcode, hdr->itt, hdr->flags, + (int)(wc->byte_len - ISER_HEADERS_LEN)); + + switch (iser_ctrl->flags & 0xF0) { + case ISCSI_CTRL: + if (iser_ctrl->flags & ISER_RSV) { + read_stag = be32_to_cpu(iser_ctrl->read_stag); + read_va = be64_to_cpu(iser_ctrl->read_va); + isert_dbg("ISER_RSV: read_stag: 0x%x read_va: 0x%llx\n", + read_stag, (unsigned long long)read_va); + } + if (iser_ctrl->flags & ISER_WSV) { + write_stag = be32_to_cpu(iser_ctrl->write_stag); + write_va = be64_to_cpu(iser_ctrl->write_va); + isert_dbg("ISER_WSV: write_stag: 0x%x write_va: 0x%llx\n", + write_stag, (unsigned long long)write_va); + } + + isert_dbg("ISER ISCSI_CTRL PDU\n"); + break; + case ISER_HELLO: + isert_err("iSER Hello message\n"); + break; + default: + isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_ctrl->flags); + break; + } + + isert_rx_opcode(isert_conn, rx_desc, + read_stag, read_va, write_stag, write_va); + + ib_dma_sync_single_for_device(ib_dev, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); +} + +static void +isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct ib_device *ib_dev = isert_conn->device->ib_device; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "login recv"); + return; + } + + ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_req_dma, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + + isert_conn->login_req_len = wc->byte_len - ISER_HEADERS_LEN; + + if (isert_conn->conn) { + struct iscsi_login *login = isert_conn->conn->conn_login; + + if (login && !login->first_request) + isert_rx_login_req(isert_conn); + } + + mutex_lock(&isert_conn->mutex); + complete(&isert_conn->login_req_comp); + mutex_unlock(&isert_conn->mutex); + + ib_dma_sync_single_for_device(ib_dev, isert_conn->login_req_dma, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); +} + +static void +isert_rdma_rw_ctx_destroy(struct isert_cmd *cmd, struct isert_conn *conn) +{ + struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd; + enum dma_data_direction dir = target_reverse_dma_direction(se_cmd); + + if (!cmd->rw.nr_ops) + return; + + if (isert_prot_cmd(conn, se_cmd)) { + rdma_rw_ctx_destroy_signature(&cmd->rw, conn->qp, + conn->cm_id->port_num, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->t_prot_sg, + se_cmd->t_prot_nents, dir); + } else { + rdma_rw_ctx_destroy(&cmd->rw, conn->qp, conn->cm_id->port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, dir); + } + + cmd->rw.nr_ops = 0; +} + +static void +isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_text_rsp *hdr; + + isert_dbg("Cmd %p\n", isert_cmd); + + switch (cmd->iscsi_opcode) { + case ISCSI_OP_SCSI_CMD: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + 
spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) { + iscsit_stop_dataout_timer(cmd); + /* + * Check for special case during comp_err where + * WRITE_PENDING has been handed off from core, + * but requires an extra target_put_sess_cmd() + * before transport_generic_free_cmd() below. + */ + if (comp_err && + cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) { + struct se_cmd *se_cmd = &cmd->se_cmd; + + target_put_sess_cmd(se_cmd); + } + } + + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + case ISCSI_OP_SCSI_TMFUNC: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + case ISCSI_OP_REJECT: + case ISCSI_OP_NOOP_OUT: + case ISCSI_OP_TEXT: + hdr = (struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header; + /* If the continue bit is on, keep the command alive */ + if (hdr->flags & ISCSI_FLAG_TEXT_CONTINUE) + break; + + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + /* + * Handle special case for REJECT when iscsi_add_reject*() has + * overwritten the original iscsi_opcode assignment, and the + * associated cmd->se_cmd needs to be released. + */ + if (cmd->se_cmd.se_tfo != NULL) { + isert_dbg("Calling transport_generic_free_cmd for 0x%02x\n", + cmd->iscsi_opcode); + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + } + /* fall through */ + default: + iscsit_release_cmd(cmd); + break; + } +} + +static void +isert_unmap_tx_desc(struct iser_tx_desc *tx_desc, struct ib_device *ib_dev) +{ + if (tx_desc->dma_addr != 0) { + isert_dbg("unmap single for tx_desc->dma_addr\n"); + ib_dma_unmap_single(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + tx_desc->dma_addr = 0; + } +} + +static void +isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd, + struct ib_device *ib_dev, bool comp_err) +{ + if (isert_cmd->pdu_buf_dma != 0) { + isert_dbg("unmap single for isert_cmd->pdu_buf_dma\n"); + ib_dma_unmap_single(ib_dev, isert_cmd->pdu_buf_dma, + isert_cmd->pdu_buf_len, DMA_TO_DEVICE); + isert_cmd->pdu_buf_dma = 0; + } + + isert_unmap_tx_desc(tx_desc, ib_dev); + isert_put_cmd(isert_cmd, comp_err); +} + +static int +isert_check_pi_status(struct se_cmd *se_cmd, struct ib_mr *sig_mr) +{ + struct ib_mr_status mr_status; + int ret; + + ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + isert_err("ib_check_mr_status failed, ret %d\n", ret); + goto fail_mr_status; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + u64 sec_offset_err; + u32 block_size = se_cmd->se_dev->dev_attrib.block_size + 8; + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + break; + case IB_SIG_BAD_REFTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + break; + case IB_SIG_BAD_APPTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + break; + } + sec_offset_err = mr_status.sig_err.sig_err_offset; + do_div(sec_offset_err, block_size); + se_cmd->bad_sector = sec_offset_err + se_cmd->t_task_lba; + + isert_err("PI error found type %d at sector 0x%llx " + "expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, + (unsigned long long)se_cmd->bad_sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + ret = 1; + } + 
+fail_mr_status: + return ret; +} + +static void +isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct isert_device *device = isert_conn->device; + struct iser_tx_desc *desc = cqe_to_tx_desc(wc->wr_cqe); + struct isert_cmd *isert_cmd = tx_desc_to_cmd(desc); + struct se_cmd *cmd = &isert_cmd->iscsi_cmd->se_cmd; + int ret = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "rdma write"); + if (wc->status != IB_WC_WR_FLUSH_ERR) + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + isert_completion_put(desc, isert_cmd, device->ib_device, true); + return; + } + + isert_dbg("Cmd %p\n", isert_cmd); + + ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); + + if (ret) { + /* + * transport_generic_request_failure() expects to have + * plus two references to handle queue-full, so re-add + * one here as target-core will have already dropped + * it after the first isert_put_datain() callback. + */ + kref_get(&cmd->cmd_kref); + transport_generic_request_failure(cmd, cmd->pi_err); + } else { + /* + * XXX: isert_put_response() failure is not retried. + */ + ret = isert_put_response(isert_conn->conn, isert_cmd->iscsi_cmd); + if (ret) + pr_warn_ratelimited("isert_put_response() ret: %d\n", ret); + } +} + +static void +isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct isert_device *device = isert_conn->device; + struct iser_tx_desc *desc = cqe_to_tx_desc(wc->wr_cqe); + struct isert_cmd *isert_cmd = tx_desc_to_cmd(desc); + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct se_cmd *se_cmd = &cmd->se_cmd; + int ret = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "rdma read"); + if (wc->status != IB_WC_WR_FLUSH_ERR) + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + isert_completion_put(desc, isert_cmd, device->ib_device, true); + return; + } + + isert_dbg("Cmd %p\n", isert_cmd); + + iscsit_stop_dataout_timer(cmd); + + if (isert_prot_cmd(isert_conn, se_cmd)) + ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); + cmd->write_data_done = 0; + + isert_dbg("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd); + spin_lock_bh(&cmd->istate_lock); + cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; + cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; + spin_unlock_bh(&cmd->istate_lock); + + /* + * transport_generic_request_failure() will drop the extra + * se_cmd->cmd_kref reference after T10-PI error, and handle + * any non-zero ->queue_status() callback error retries. 
+ */ + if (ret) + transport_generic_request_failure(se_cmd, se_cmd->pi_err); + else + target_execute_cmd(se_cmd); +} + +static void +isert_do_control_comp(struct work_struct *work) +{ + struct isert_cmd *isert_cmd = container_of(work, + struct isert_cmd, comp_work); + struct isert_conn *isert_conn = isert_cmd->conn; + struct ib_device *ib_dev = isert_conn->cm_id->device; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + + isert_dbg("Cmd %p i_state %d\n", isert_cmd, cmd->i_state); + + switch (cmd->i_state) { + case ISTATE_SEND_TASKMGTRSP: + iscsit_tmr_post_handler(cmd, cmd->conn); + /* fall through */ + case ISTATE_SEND_REJECT: + case ISTATE_SEND_TEXTRSP: + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, + ib_dev, false); + break; + case ISTATE_SEND_LOGOUTRSP: + iscsit_logout_post_handler(cmd, cmd->conn); + break; + default: + isert_err("Unknown i_state %d\n", cmd->i_state); + dump_stack(); + break; + } +} + +static void +isert_login_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct ib_device *ib_dev = isert_conn->cm_id->device; + struct iser_tx_desc *tx_desc = cqe_to_tx_desc(wc->wr_cqe); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "login send"); + if (wc->status != IB_WC_WR_FLUSH_ERR) + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + } + + isert_unmap_tx_desc(tx_desc, ib_dev); +} + +static void +isert_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct ib_device *ib_dev = isert_conn->cm_id->device; + struct iser_tx_desc *tx_desc = cqe_to_tx_desc(wc->wr_cqe); + struct isert_cmd *isert_cmd = tx_desc_to_cmd(tx_desc); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "send"); + if (wc->status != IB_WC_WR_FLUSH_ERR) + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + isert_completion_put(tx_desc, isert_cmd, ib_dev, true); + return; + } + + isert_dbg("Cmd %p\n", isert_cmd); + + switch (isert_cmd->iscsi_cmd->i_state) { + case ISTATE_SEND_TASKMGTRSP: + case ISTATE_SEND_LOGOUTRSP: + case ISTATE_SEND_REJECT: + case ISTATE_SEND_TEXTRSP: + isert_unmap_tx_desc(tx_desc, ib_dev); + + INIT_WORK(&isert_cmd->comp_work, isert_do_control_comp); + queue_work(isert_comp_wq, &isert_cmd->comp_work); + return; + default: + isert_cmd->iscsi_cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(tx_desc, isert_cmd, ib_dev, false); + break; + } +} + +static int +isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd) +{ + struct ib_send_wr *wr_failed; + int ret; + + ret = isert_post_recv(isert_conn, isert_cmd->rx_desc); + if (ret) { + isert_err("ib_post_recv failed with %d\n", ret); + return ret; + } + + ret = ib_post_send(isert_conn->qp, &isert_cmd->tx_desc.send_wr, + &wr_failed); + if (ret) { + isert_err("ib_post_send failed with %d\n", ret); + return ret; + } + return ret; +} + +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, hdr); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + /* + * Attach SENSE DATA payload to iSCSI Response PDU + */ + if 
(cmd->se_cmd.sense_buffer && + ((cmd->se_cmd.se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || + (cmd->se_cmd.se_cmd_flags & SCF_EMULATED_TASK_SENSE))) { + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + u32 padding, pdu_len; + + put_unaligned_be16(cmd->se_cmd.scsi_sense_length, + cmd->sense_buffer); + cmd->se_cmd.scsi_sense_length += sizeof(__be16); + + padding = -(cmd->se_cmd.scsi_sense_length) & 3; + hton24(hdr->dlength, (u32)cmd->se_cmd.scsi_sense_length); + pdu_len = cmd->se_cmd.scsi_sense_length + padding; + + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + (void *)cmd->sense_buffer, pdu_len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, isert_cmd->pdu_buf_dma)) + return -ENOMEM; + + isert_cmd->pdu_buf_len = pdu_len; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = pdu_len; + tx_dsg->lkey = device->pd->local_dma_lkey; + isert_cmd->tx_desc.num_sge = 2; + } + + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("Posting SCSI Response\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static void +isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) + iscsit_stop_dataout_timer(cmd); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); +} + +static enum target_prot_op +isert_get_sup_prot_ops(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + struct isert_device *device = isert_conn->device; + + if (conn->tpg->tpg_attrib.t10_pi) { + if (device->pi_capable) { + isert_info("conn %p PI offload enabled\n", isert_conn); + isert_conn->pi_support = true; + return TARGET_PROT_ALL; + } + } + + isert_info("conn %p PI offload disabled\n", isert_conn); + isert_conn->pi_support = false; + + return TARGET_PROT_NORMAL; +} + +static int +isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn, + bool nopout_response) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_nopin_rsp(cmd, conn, (struct iscsi_nopin *) + &isert_cmd->tx_desc.iscsi_header, + nopout_response); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("conn %p Posting NOPIN Response\n", isert_conn); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_logout_rsp(cmd, conn, (struct iscsi_logout_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("conn %p Posting Logout Response\n", isert_conn); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + 
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_task_mgt_rsp(cmd, conn, (struct iscsi_tm_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("conn %p Posting Task Management Response\n", isert_conn); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + struct iscsi_reject *hdr = + (struct iscsi_reject *)&isert_cmd->tx_desc.iscsi_header; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_reject(cmd, conn, hdr); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + + hton24(hdr->dlength, ISCSI_HDR_LEN); + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + (void *)cmd->buf_ptr, ISCSI_HDR_LEN, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, isert_cmd->pdu_buf_dma)) + return -ENOMEM; + isert_cmd->pdu_buf_len = ISCSI_HDR_LEN; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = ISCSI_HDR_LEN; + tx_dsg->lkey = device->pd->local_dma_lkey; + isert_cmd->tx_desc.num_sge = 2; + + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("conn %p Posting Reject\n", isert_conn); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct iscsi_text_rsp *hdr = + (struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header; + u32 txt_rsp_len; + int rc; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + rc = iscsit_build_text_rsp(cmd, conn, hdr, ISCSI_INFINIBAND); + if (rc < 0) + return rc; + + txt_rsp_len = rc; + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + + if (txt_rsp_len) { + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + void *txt_rsp_buf = cmd->buf_ptr; + + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + txt_rsp_buf, txt_rsp_len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, isert_cmd->pdu_buf_dma)) + return -ENOMEM; + + isert_cmd->pdu_buf_len = txt_rsp_len; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = txt_rsp_len; + tx_dsg->lkey = device->pd->local_dma_lkey; + isert_cmd->tx_desc.num_sge = 2; + } + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("conn %p Text Response\n", isert_conn); + + return isert_post_response(isert_conn, isert_cmd); +} + +static inline void +isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs, + struct ib_sig_domain *domain) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; + domain->sig.dif.pi_interval = se_cmd->se_dev->dev_attrib.block_size; + domain->sig.dif.ref_tag = se_cmd->reftag_seed; + /* + * At the moment we 
hard code those, but if in the future + * the target core would like to use it, we will take it + * from se_cmd. + */ + domain->sig.dif.apptag_check_mask = 0xffff; + domain->sig.dif.app_escape = true; + domain->sig.dif.ref_escape = true; + if (se_cmd->prot_type == TARGET_DIF_TYPE1_PROT || + se_cmd->prot_type == TARGET_DIF_TYPE2_PROT) + domain->sig.dif.ref_remap = true; +}; + +static int +isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) +{ + memset(sig_attrs, 0, sizeof(*sig_attrs)); + + switch (se_cmd->prot_op) { + case TARGET_PROT_DIN_INSERT: + case TARGET_PROT_DOUT_STRIP: + sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire); + break; + case TARGET_PROT_DOUT_INSERT: + case TARGET_PROT_DIN_STRIP: + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem); + break; + case TARGET_PROT_DIN_PASS: + case TARGET_PROT_DOUT_PASS: + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire); + isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem); + break; + default: + isert_err("Unsupported PI operation %d\n", se_cmd->prot_op); + return -EINVAL; + } + + sig_attrs->check_mask = + (se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | + (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | + (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); + return 0; +} + +static int +isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd; + enum dma_data_direction dir = target_reverse_dma_direction(se_cmd); + u8 port_num = conn->cm_id->port_num; + u64 addr; + u32 rkey, offset; + int ret; + + if (cmd->ctx_init_done) + goto rdma_ctx_post; + + if (dir == DMA_FROM_DEVICE) { + addr = cmd->write_va; + rkey = cmd->write_stag; + offset = cmd->iscsi_cmd->write_data_done; + } else { + addr = cmd->read_va; + rkey = cmd->read_stag; + offset = 0; + } + + if (isert_prot_cmd(conn, se_cmd)) { + struct ib_sig_attrs sig_attrs; + + ret = isert_set_sig_attrs(se_cmd, &sig_attrs); + if (ret) + return ret; + + WARN_ON_ONCE(offset); + ret = rdma_rw_ctx_signature_init(&cmd->rw, conn->qp, port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, + se_cmd->t_prot_sg, se_cmd->t_prot_nents, + &sig_attrs, addr, rkey, dir); + } else { + ret = rdma_rw_ctx_init(&cmd->rw, conn->qp, port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, + offset, addr, rkey, dir); + } + + if (ret < 0) { + isert_err("Cmd: %p failed to prepare RDMA res\n", cmd); + return ret; + } + + cmd->ctx_init_done = true; + +rdma_ctx_post: + ret = rdma_rw_ctx_post(&cmd->rw, conn->qp, port_num, cqe, chain_wr); + if (ret < 0) + isert_err("Cmd: %p failed to post RDMA res\n", cmd); + return ret; +} + +static int +isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_cqe *cqe = NULL; + struct ib_send_wr *chain_wr = NULL; + int rc; + + isert_dbg("Cmd: %p RDMA_WRITE data_length: %u\n", + isert_cmd, se_cmd->data_length); + + if (isert_prot_cmd(isert_conn, se_cmd)) { + isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; + cqe = &isert_cmd->tx_desc.tx_cqe; + } else { + /* + * Build isert_conn->tx_desc for iSCSI response PDU and attach + */ + isert_create_send_desc(isert_conn, isert_cmd, + &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, (struct 
iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, + &isert_cmd->tx_desc.send_wr); + + rc = isert_post_recv(isert_conn, isert_cmd->rx_desc); + if (rc) { + isert_err("ib_post_recv failed with %d\n", rc); + return rc; + } + + chain_wr = &isert_cmd->tx_desc.send_wr; + } + + rc = isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr); + isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ rc: %d\n", + isert_cmd, rc); + return rc; +} + +static int +isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + int ret; + + isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n", + isert_cmd, cmd->se_cmd.data_length, cmd->write_data_done); + + isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; + ret = isert_rdma_rw_ctx_post(isert_cmd, conn->context, + &isert_cmd->tx_desc.tx_cqe, NULL); + + isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE rc: %d\n", + isert_cmd, ret); + return ret; +} + +static int +isert_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + int ret = 0; + + switch (state) { + case ISTATE_REMOVE: + spin_lock_bh(&conn->cmd_lock); + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + isert_put_cmd(isert_cmd, true); + break; + case ISTATE_SEND_NOPIN_WANT_RESPONSE: + ret = isert_put_nopin(cmd, conn, false); + break; + default: + isert_err("Unknown immediate state: 0x%02x\n", state); + ret = -EINVAL; + break; + } + + return ret; +} + +static int +isert_response_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + struct isert_conn *isert_conn = conn->context; + int ret; + + switch (state) { + case ISTATE_SEND_LOGOUTRSP: + ret = isert_put_logout_rsp(cmd, conn); + if (!ret) + isert_conn->logout_posted = true; + break; + case ISTATE_SEND_NOPIN: + ret = isert_put_nopin(cmd, conn, true); + break; + case ISTATE_SEND_TASKMGTRSP: + ret = isert_put_tm_rsp(cmd, conn); + break; + case ISTATE_SEND_REJECT: + ret = isert_put_reject(cmd, conn); + break; + case ISTATE_SEND_TEXTRSP: + ret = isert_put_text_rsp(cmd, conn); + break; + case ISTATE_SEND_STATUS: + /* + * Special case for sending non GOOD SCSI status from TX thread + * context during pre se_cmd excecution failure. 
+ */ + ret = isert_put_response(conn, cmd); + break; + default: + isert_err("Unknown response state: 0x%02x\n", state); + ret = -EINVAL; + break; + } + + return ret; +} + +struct rdma_cm_id * +isert_setup_id(struct isert_np *isert_np) +{ + struct iscsi_np *np = isert_np->np; + struct rdma_cm_id *id; + struct sockaddr *sa; + int ret; + + sa = (struct sockaddr *)&np->np_sockaddr; + isert_dbg("ksockaddr: %p, sa: %p\n", &np->np_sockaddr, sa); + + id = rdma_create_id(&init_net, isert_cma_handler, isert_np, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(id)) { + isert_err("rdma_create_id() failed: %ld\n", PTR_ERR(id)); + ret = PTR_ERR(id); + goto out; + } + isert_dbg("id %p context %p\n", id, id->context); + + ret = rdma_bind_addr(id, sa); + if (ret) { + isert_err("rdma_bind_addr() failed: %d\n", ret); + goto out_id; + } + + ret = rdma_listen(id, 0); + if (ret) { + isert_err("rdma_listen() failed: %d\n", ret); + goto out_id; + } + + return id; +out_id: + rdma_destroy_id(id); +out: + return ERR_PTR(ret); +} + +static int +isert_setup_np(struct iscsi_np *np, + struct sockaddr_storage *ksockaddr) +{ + struct isert_np *isert_np; + struct rdma_cm_id *isert_lid; + int ret; + + isert_np = kzalloc(sizeof(struct isert_np), GFP_KERNEL); + if (!isert_np) + return -ENOMEM; + + sema_init(&isert_np->sem, 0); + mutex_init(&isert_np->mutex); + INIT_LIST_HEAD(&isert_np->accepted); + INIT_LIST_HEAD(&isert_np->pending); + isert_np->np = np; + + /* + * Setup the np->np_sockaddr from the passed sockaddr setup + * in iscsi_target_configfs.c code.. + */ + memcpy(&np->np_sockaddr, ksockaddr, + sizeof(struct sockaddr_storage)); + + isert_lid = isert_setup_id(isert_np); + if (IS_ERR(isert_lid)) { + ret = PTR_ERR(isert_lid); + goto out; + } + + isert_np->cm_id = isert_lid; + np->np_context = isert_np; + + return 0; + +out: + kfree(isert_np); + + return ret; +} + +static int +isert_rdma_accept(struct isert_conn *isert_conn) +{ + struct rdma_cm_id *cm_id = isert_conn->cm_id; + struct rdma_conn_param cp; + int ret; + struct iser_cm_hdr rsp_hdr; + + memset(&cp, 0, sizeof(struct rdma_conn_param)); + cp.initiator_depth = isert_conn->initiator_depth; + cp.retry_count = 7; + cp.rnr_retry_count = 7; + + memset(&rsp_hdr, 0, sizeof(rsp_hdr)); + rsp_hdr.flags = ISERT_ZBVA_NOT_USED; + if (!isert_conn->snd_w_inv) + rsp_hdr.flags = rsp_hdr.flags | ISERT_SEND_W_INV_NOT_USED; + cp.private_data = (void *)&rsp_hdr; + cp.private_data_len = sizeof(rsp_hdr); + + ret = rdma_accept(cm_id, &cp); + if (ret) { + isert_err("rdma_accept() failed with: %d\n", ret); + return ret; + } + + return 0; +} + +static int +isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login) +{ + struct isert_conn *isert_conn = conn->context; + int ret; + + isert_info("before login_req comp conn: %p\n", isert_conn); + ret = wait_for_completion_interruptible(&isert_conn->login_req_comp); + if (ret) { + isert_err("isert_conn %p interrupted before got login req\n", + isert_conn); + return ret; + } + reinit_completion(&isert_conn->login_req_comp); + + /* + * For login requests after the first PDU, isert_rx_login_req() will + * kick schedule_delayed_work(&conn->login_work) as the packet is + * received, which turns this callback from iscsi_target_do_login_rx() + * into a NOP. 
+ */ + if (!login->first_request) + return 0; + + isert_rx_login_req(isert_conn); + + isert_info("before login_comp conn: %p\n", conn); + ret = wait_for_completion_interruptible(&isert_conn->login_comp); + if (ret) + return ret; + + isert_info("processing login->req: %p\n", login->req); + + return 0; +} + +static void +isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn, + struct isert_conn *isert_conn) +{ + struct rdma_cm_id *cm_id = isert_conn->cm_id; + struct rdma_route *cm_route = &cm_id->route; + + conn->login_family = np->np_sockaddr.ss_family; + + conn->login_sockaddr = cm_route->addr.dst_addr; + conn->local_sockaddr = cm_route->addr.src_addr; +} + +static int +isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) +{ + struct isert_np *isert_np = np->np_context; + struct isert_conn *isert_conn; + int ret; + +accept_wait: + ret = down_interruptible(&isert_np->sem); + if (ret) + return -ENODEV; + + spin_lock_bh(&np->np_thread_lock); + if (np->np_thread_state >= ISCSI_NP_THREAD_RESET) { + spin_unlock_bh(&np->np_thread_lock); + isert_dbg("np_thread_state %d\n", + np->np_thread_state); + /** + * No point in stalling here when np_thread + * is in state RESET/SHUTDOWN/EXIT - bail + **/ + return -ENODEV; + } + spin_unlock_bh(&np->np_thread_lock); + + mutex_lock(&isert_np->mutex); + if (list_empty(&isert_np->pending)) { + mutex_unlock(&isert_np->mutex); + goto accept_wait; + } + isert_conn = list_first_entry(&isert_np->pending, + struct isert_conn, node); + list_del_init(&isert_conn->node); + mutex_unlock(&isert_np->mutex); + + conn->context = isert_conn; + isert_conn->conn = conn; + isert_conn->state = ISER_CONN_BOUND; + + isert_set_conn_info(np, conn, isert_conn); + + isert_dbg("Processing isert_conn: %p\n", isert_conn); + + return 0; +} + +static void +isert_free_np(struct iscsi_np *np) +{ + struct isert_np *isert_np = np->np_context; + struct isert_conn *isert_conn, *n; + + if (isert_np->cm_id) + rdma_destroy_id(isert_np->cm_id); + + /* + * FIXME: At this point we don't have a good way to insure + * that at this point we don't have hanging connections that + * completed RDMA establishment but didn't start iscsi login + * process. So work-around this by cleaning up what ever piled + * up in accepted and pending lists. 
+ */ + mutex_lock(&isert_np->mutex); + if (!list_empty(&isert_np->pending)) { + isert_info("Still have isert pending connections\n"); + list_for_each_entry_safe(isert_conn, n, + &isert_np->pending, + node) { + isert_info("cleaning isert_conn %p state (%d)\n", + isert_conn, isert_conn->state); + isert_connect_release(isert_conn); + } + } + + if (!list_empty(&isert_np->accepted)) { + isert_info("Still have isert accepted connections\n"); + list_for_each_entry_safe(isert_conn, n, + &isert_np->accepted, + node) { + isert_info("cleaning isert_conn %p state (%d)\n", + isert_conn, isert_conn->state); + isert_connect_release(isert_conn); + } + } + mutex_unlock(&isert_np->mutex); + + np->np_context = NULL; + kfree(isert_np); +} + +static void isert_release_work(struct work_struct *work) +{ + struct isert_conn *isert_conn = container_of(work, + struct isert_conn, + release_work); + + isert_info("Starting release conn %p\n", isert_conn); + + mutex_lock(&isert_conn->mutex); + isert_conn->state = ISER_CONN_DOWN; + mutex_unlock(&isert_conn->mutex); + + isert_info("Destroying conn %p\n", isert_conn); + isert_put_conn(isert_conn); +} + +static void +isert_wait4logout(struct isert_conn *isert_conn) +{ + struct iscsi_conn *conn = isert_conn->conn; + + isert_info("conn %p\n", isert_conn); + + if (isert_conn->logout_posted) { + isert_info("conn %p wait for conn_logout_comp\n", isert_conn); + wait_for_completion_timeout(&conn->conn_logout_comp, + SECONDS_FOR_LOGOUT_COMP * HZ); + } +} + +static void +isert_wait4cmds(struct iscsi_conn *conn) +{ + isert_info("iscsi_conn %p\n", conn); + + if (conn->sess) { + target_sess_cmd_list_set_waiting(conn->sess->se_sess); + target_wait_for_sess_cmds(conn->sess->se_sess); + } +} + +/** + * isert_put_unsol_pending_cmds() - Drop commands waiting for + * unsolicitate dataout + * @conn: iscsi connection + * + * We might still have commands that are waiting for unsolicited + * dataouts messages. 
We must put the extra reference on those + * before blocking on the target_wait_for_session_cmds + */ +static void +isert_put_unsol_pending_cmds(struct iscsi_conn *conn) +{ + struct iscsi_cmd *cmd, *tmp; + static LIST_HEAD(drop_cmd_list); + + spin_lock_bh(&conn->cmd_lock); + list_for_each_entry_safe(cmd, tmp, &conn->conn_cmd_list, i_conn_node) { + if ((cmd->cmd_flags & ICF_NON_IMMEDIATE_UNSOLICITED_DATA) && + (cmd->write_data_done < conn->sess->sess_ops->FirstBurstLength) && + (cmd->write_data_done < cmd->se_cmd.data_length)) + list_move_tail(&cmd->i_conn_node, &drop_cmd_list); + } + spin_unlock_bh(&conn->cmd_lock); + + list_for_each_entry_safe(cmd, tmp, &drop_cmd_list, i_conn_node) { + list_del_init(&cmd->i_conn_node); + if (cmd->i_state != ISTATE_REMOVE) { + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + + isert_info("conn %p dropping cmd %p\n", conn, cmd); + isert_put_cmd(isert_cmd, true); + } + } +} + +static void isert_wait_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + + isert_info("Starting conn %p\n", isert_conn); + + mutex_lock(&isert_conn->mutex); + isert_conn_terminate(isert_conn); + mutex_unlock(&isert_conn->mutex); + + ib_drain_qp(isert_conn->qp); + isert_put_unsol_pending_cmds(conn); + isert_wait4cmds(conn); + isert_wait4logout(isert_conn); + + queue_work(isert_release_wq, &isert_conn->release_work); +} + +static void isert_free_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + + ib_drain_qp(isert_conn->qp); + isert_put_conn(isert_conn); +} + +static void isert_get_rx_pdu(struct iscsi_conn *conn) +{ + struct completion comp; + + init_completion(&comp); + + wait_for_completion_interruptible(&comp); +} + +static struct iscsit_transport iser_target_transport = { + .name = "IB/iSER", + .transport_type = ISCSI_INFINIBAND, + .rdma_shutdown = true, + .priv_size = sizeof(struct isert_cmd), + .owner = THIS_MODULE, + .iscsit_setup_np = isert_setup_np, + .iscsit_accept_np = isert_accept_np, + .iscsit_free_np = isert_free_np, + .iscsit_wait_conn = isert_wait_conn, + .iscsit_free_conn = isert_free_conn, + .iscsit_get_login_rx = isert_get_login_rx, + .iscsit_put_login_tx = isert_put_login_tx, + .iscsit_immediate_queue = isert_immediate_queue, + .iscsit_response_queue = isert_response_queue, + .iscsit_get_dataout = isert_get_dataout, + .iscsit_queue_data_in = isert_put_datain, + .iscsit_queue_status = isert_put_response, + .iscsit_aborted_task = isert_aborted_task, + .iscsit_get_rx_pdu = isert_get_rx_pdu, + .iscsit_get_sup_prot_ops = isert_get_sup_prot_ops, +}; + +static int __init isert_init(void) +{ + int ret; + + isert_comp_wq = alloc_workqueue("isert_comp_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!isert_comp_wq) { + isert_err("Unable to allocate isert_comp_wq\n"); + return -ENOMEM; + } + + isert_release_wq = alloc_workqueue("isert_release_wq", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + if (!isert_release_wq) { + isert_err("Unable to allocate isert_release_wq\n"); + ret = -ENOMEM; + goto destroy_comp_wq; + } + + iscsit_register_transport(&iser_target_transport); + isert_info("iSER_TARGET[0] - Loaded iser_target_transport\n"); + + return 0; + +destroy_comp_wq: + destroy_workqueue(isert_comp_wq); + + return ret; +} + +static void __exit isert_exit(void) +{ + flush_scheduled_work(); + destroy_workqueue(isert_release_wq); + destroy_workqueue(isert_comp_wq); + iscsit_unregister_transport(&iser_target_transport); + isert_info("iSER_TARGET[0] - Released iser_target_transport\n"); +} + +MODULE_DESCRIPTION("iSER-Target for 
mainline target infrastructure"); +MODULE_AUTHOR("nab@Linux-iSCSI.org"); +MODULE_LICENSE("GPL"); + +module_init(isert_init); +module_exit(isert_exit); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h new file mode 100644 index 0000000..3b296ba --- /dev/null +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include + + +#define DRV_NAME "isert" +#define PFX DRV_NAME ": " + +#define isert_dbg(fmt, arg...) \ + do { \ + if (unlikely(isert_debug_level > 2)) \ + printk(KERN_DEBUG PFX "%s: " fmt,\ + __func__ , ## arg); \ + } while (0) + +#define isert_warn(fmt, arg...) \ + do { \ + if (unlikely(isert_debug_level > 0)) \ + pr_warn(PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define isert_info(fmt, arg...) \ + do { \ + if (unlikely(isert_debug_level > 1)) \ + pr_info(PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define isert_err(fmt, arg...) \ + pr_err(PFX "%s: " fmt, __func__ , ## arg) + +/* Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + \ + sizeof(struct iscsi_hdr)) +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISERT_MAX_RX_MISC_PDUS 6 /* + * NOOP_OUT(2), TEXT(1), + * SCSI_TMFUNC(2), LOGOUT(1) + */ + +#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */ + +#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) + +#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) + +#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX + \ + ISERT_MAX_TX_MISC_PDUS + \ + ISERT_MAX_RX_MISC_PDUS) + +#define ISER_RX_PAD_SIZE (ISCSI_DEF_MAX_RECV_SEG_LEN + 4096 - \ + (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge) + \ + sizeof(struct ib_cqe) + sizeof(bool))) + +#define ISCSI_ISER_SG_TABLESIZE 256 + +enum isert_desc_type { + ISCSI_TX_CONTROL, + ISCSI_TX_DATAIN +}; + +enum iser_conn_state { + ISER_CONN_INIT, + ISER_CONN_UP, + ISER_CONN_BOUND, + ISER_CONN_FULL_FEATURE, + ISER_CONN_TERMINATING, + ISER_CONN_DOWN, +}; + +struct iser_rx_desc { + struct iser_ctrl iser_header; + struct iscsi_hdr iscsi_header; + char data[ISCSI_DEF_MAX_RECV_SEG_LEN]; + u64 dma_addr; + struct ib_sge rx_sg; + struct ib_cqe rx_cqe; + bool in_use; + char pad[ISER_RX_PAD_SIZE]; +} __packed; + +static inline struct iser_rx_desc *cqe_to_rx_desc(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_rx_desc, rx_cqe); +} + +struct iser_tx_desc { + struct iser_ctrl iser_header; + struct iscsi_hdr iscsi_header; + enum isert_desc_type type; + u64 dma_addr; + struct ib_sge tx_sg[2]; + struct ib_cqe tx_cqe; + int num_sge; + struct ib_send_wr send_wr; +} __packed; + +static inline struct iser_tx_desc *cqe_to_tx_desc(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_tx_desc, tx_cqe); +} + +struct isert_cmd { + uint32_t read_stag; + uint32_t write_stag; + uint64_t read_va; + uint64_t write_va; + uint32_t inv_rkey; + u64 pdu_buf_dma; + u32 pdu_buf_len; + struct isert_conn *conn; + struct iscsi_cmd *iscsi_cmd; + struct iser_tx_desc tx_desc; + struct iser_rx_desc *rx_desc; + struct rdma_rw_ctx rw; + struct work_struct comp_work; + struct scatterlist sg; + bool ctx_init_done; +}; + +static inline struct isert_cmd *tx_desc_to_cmd(struct iser_tx_desc *desc) +{ + return container_of(desc, 
struct isert_cmd, tx_desc); +} + +struct isert_device; + +struct isert_conn { + enum iser_conn_state state; + u32 responder_resources; + u32 initiator_depth; + bool pi_support; + struct iser_rx_desc *login_req_buf; + char *login_rsp_buf; + u64 login_req_dma; + int login_req_len; + u64 login_rsp_dma; + struct iser_rx_desc *rx_descs; + struct ib_recv_wr rx_wr[ISERT_QP_MAX_RECV_DTOS]; + struct iscsi_conn *conn; + struct list_head node; + struct completion login_comp; + struct completion login_req_comp; + struct iser_tx_desc login_tx_desc; + struct rdma_cm_id *cm_id; + struct ib_qp *qp; + struct isert_device *device; + struct mutex mutex; + struct kref kref; + struct work_struct release_work; + bool logout_posted; + bool snd_w_inv; + wait_queue_head_t rem_wait; + bool dev_removed; +}; + +#define ISERT_MAX_CQ 64 + +/** + * struct isert_comp - iSER completion context + * + * @device: pointer to device handle + * @cq: completion queue + * @active_qps: Number of active QPs attached + * to completion context + */ +struct isert_comp { + struct isert_device *device; + struct ib_cq *cq; + int active_qps; +}; + +struct isert_device { + bool pi_capable; + int refcount; + struct ib_device *ib_device; + struct ib_pd *pd; + struct isert_comp *comps; + int comps_used; + struct list_head dev_node; +}; + +struct isert_np { + struct iscsi_np *np; + struct semaphore sem; + struct rdma_cm_id *cm_id; + struct mutex mutex; + struct list_head accepted; + struct list_head pending; +}; diff --git a/drivers/infiniband/ulp/opa_vnic/Kconfig b/drivers/infiniband/ulp/opa_vnic/Kconfig new file mode 100644 index 0000000..48132ab --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/Kconfig @@ -0,0 +1,8 @@ +config INFINIBAND_OPA_VNIC + tristate "Intel OPA VNIC support" + depends on X86_64 && INFINIBAND + ---help--- + This is Omni-Path (OPA) Virtual Network Interface Controller (VNIC) + driver for Ethernet over Omni-Path feature. It implements the HW + independent VNIC functionality. It interfaces with Linux stack for + data path and IB MAD for the control path. diff --git a/drivers/infiniband/ulp/opa_vnic/Makefile b/drivers/infiniband/ulp/opa_vnic/Makefile new file mode 100644 index 0000000..8061b28 --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/Makefile @@ -0,0 +1,7 @@ +# Makefile - Intel Omni-Path Virtual Network Controller driver +# Copyright(c) 2017, Intel Corporation. +# +obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic.o + +opa_vnic-y := opa_vnic_netdev.o opa_vnic_encap.o opa_vnic_ethtool.o \ + opa_vnic_vema.o opa_vnic_vema_iface.o diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c new file mode 100644 index 0000000..4be3aef --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c @@ -0,0 +1,513 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
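[Editor's sketch] The cqe_to_rx_desc(), cqe_to_tx_desc() and tx_desc_to_cmd() helpers above all rely on container_of() to recover the enclosing descriptor from a pointer to an embedded member, which is how a completion callback gets back to its per-command state. Below is a minimal user-space sketch of the same idiom with a portable container_of() substitute; the struct names are stand-ins, not the driver's types.

#include <stddef.h>
#include <stdio.h>

/* Portable equivalent of the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cqe {			/* stand-in for struct ib_cqe */
	void (*done)(struct cqe *cqe);
};

struct rx_desc {		/* stand-in for struct iser_rx_desc */
	int id;
	struct cqe rx_cqe;	/* embedded completion handle */
};

/* Completion callback: recover the descriptor from the embedded cqe */
static void rx_done(struct cqe *cqe)
{
	struct rx_desc *desc = container_of(cqe, struct rx_desc, rx_cqe);

	printf("completion for rx_desc %d\n", desc->id);
}

int main(void)
{
	struct rx_desc desc = { .id = 7, .rx_cqe = { .done = rx_done } };

	/* The completion path only ever sees the cqe pointer */
	desc.rx_cqe.done(&desc.rx_cqe);
	return 0;
}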
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains OPA VNIC encapsulation/decapsulation function. + */ + +#include +#include + +#include "opa_vnic_internal.h" + +/* OPA 16B Header fields */ +#define OPA_16B_LID_MASK 0xFFFFFull +#define OPA_16B_SLID_HIGH_SHFT 8 +#define OPA_16B_SLID_MASK 0xF00ull +#define OPA_16B_DLID_MASK 0xF000ull +#define OPA_16B_DLID_HIGH_SHFT 12 +#define OPA_16B_LEN_SHFT 20 +#define OPA_16B_SC_SHFT 20 +#define OPA_16B_RC_SHFT 25 +#define OPA_16B_PKEY_SHFT 16 + +#define OPA_VNIC_L4_HDR_SHFT 16 + +/* L2+L4 hdr len is 20 bytes (5 quad words) */ +#define OPA_VNIC_HDR_QW_LEN 5 + +static inline void opa_vnic_make_header(u8 *hdr, u32 slid, u32 dlid, u16 len, + u16 pkey, u16 entropy, u8 sc, u8 rc, + u8 l4_type, u16 l4_hdr) +{ + /* h[1]: LT=1, 16B L2=10 */ + u32 h[OPA_VNIC_HDR_QW_LEN] = {0, 0xc0000000, 0, 0, 0}; + + h[2] = l4_type; + h[3] = entropy; + h[4] = l4_hdr << OPA_VNIC_L4_HDR_SHFT; + + /* Extract and set 4 upper bits and 20 lower bits of the lids */ + h[0] |= (slid & OPA_16B_LID_MASK); + h[2] |= ((slid >> (20 - OPA_16B_SLID_HIGH_SHFT)) & OPA_16B_SLID_MASK); + + h[1] |= (dlid & OPA_16B_LID_MASK); + h[2] |= ((dlid >> (20 - OPA_16B_DLID_HIGH_SHFT)) & OPA_16B_DLID_MASK); + + h[0] |= (len << OPA_16B_LEN_SHFT); + h[1] |= (rc << OPA_16B_RC_SHFT); + h[1] |= (sc << OPA_16B_SC_SHFT); + h[2] |= ((u32)pkey << OPA_16B_PKEY_SHFT); + + memcpy(hdr, h, OPA_VNIC_HDR_LEN); +} + +/* + * Using a simple hash table for mac table implementation with the last octet + * of mac address as a key. 
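[Editor's sketch] opa_vnic_make_header() above splits each 24-bit extended LID across the header words: the low 20 bits land in h[0] (SLID) or h[1] (DLID) and the top 4 bits are folded into h[2]. A small user-space sketch of just that packing, reusing the field masks from this file; the sample LID values are arbitrary.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define OPA_16B_LID_MASK	0xFFFFFull
#define OPA_16B_SLID_HIGH_SHFT	8
#define OPA_16B_SLID_MASK	0xF00ull
#define OPA_16B_DLID_MASK	0xF000ull
#define OPA_16B_DLID_HIGH_SHFT	12

int main(void)
{
	uint32_t slid = 0xABCDE5, dlid = 0x912345;	/* arbitrary 24-bit LIDs */
	uint32_t h[5] = { 0, 0xc0000000, 0, 0, 0 };	/* h[1]: LT=1, 16B L2=10 */

	/* low 20 bits of each LID */
	h[0] |= (uint32_t)(slid & OPA_16B_LID_MASK);
	h[1] |= (uint32_t)(dlid & OPA_16B_LID_MASK);

	/* top 4 bits of each LID, folded into h[2] */
	h[2] |= (uint32_t)((slid >> (20 - OPA_16B_SLID_HIGH_SHFT)) & OPA_16B_SLID_MASK);
	h[2] |= (uint32_t)((dlid >> (20 - OPA_16B_DLID_HIGH_SHFT)) & OPA_16B_DLID_MASK);

	/* prints h0=000bcde5 h1=c0012345 h2=00009a00 */
	printf("h0=%08" PRIx32 " h1=%08" PRIx32 " h2=%08" PRIx32 "\n",
	       h[0], h[1], h[2]);
	return 0;
}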
+ */ +static void opa_vnic_free_mac_tbl(struct hlist_head *mactbl) +{ + struct opa_vnic_mac_tbl_node *node; + struct hlist_node *tmp; + int bkt; + + if (!mactbl) + return; + + vnic_hash_for_each_safe(mactbl, bkt, tmp, node, hlist) { + hash_del(&node->hlist); + kfree(node); + } + kfree(mactbl); +} + +static struct hlist_head *opa_vnic_alloc_mac_tbl(void) +{ + u32 size = sizeof(struct hlist_head) * OPA_VNIC_MAC_TBL_SIZE; + struct hlist_head *mactbl; + + mactbl = kzalloc(size, GFP_KERNEL); + if (!mactbl) + return ERR_PTR(-ENOMEM); + + vnic_hash_init(mactbl); + return mactbl; +} + +/* opa_vnic_release_mac_tbl - empty and free the mac table */ +void opa_vnic_release_mac_tbl(struct opa_vnic_adapter *adapter) +{ + struct hlist_head *mactbl; + + mutex_lock(&adapter->mactbl_lock); + mactbl = rcu_access_pointer(adapter->mactbl); + rcu_assign_pointer(adapter->mactbl, NULL); + synchronize_rcu(); + opa_vnic_free_mac_tbl(mactbl); + adapter->info.vport.mac_tbl_digest = 0; + mutex_unlock(&adapter->mactbl_lock); +} + +/* + * opa_vnic_query_mac_tbl - query the mac table for a section + * + * This function implements query of specific function of the mac table. + * The function also expects the requested range to be valid. + */ +void opa_vnic_query_mac_tbl(struct opa_vnic_adapter *adapter, + struct opa_veswport_mactable *tbl) +{ + struct opa_vnic_mac_tbl_node *node; + struct hlist_head *mactbl; + int bkt; + u16 loffset, lnum_entries; + + rcu_read_lock(); + mactbl = rcu_dereference(adapter->mactbl); + if (!mactbl) + goto get_mac_done; + + loffset = be16_to_cpu(tbl->offset); + lnum_entries = be16_to_cpu(tbl->num_entries); + + vnic_hash_for_each(mactbl, bkt, node, hlist) { + struct __opa_vnic_mactable_entry *nentry = &node->entry; + struct opa_veswport_mactable_entry *entry; + + if ((node->index < loffset) || + (node->index >= (loffset + lnum_entries))) + continue; + + /* populate entry in the tbl corresponding to the index */ + entry = &tbl->tbl_entries[node->index - loffset]; + memcpy(entry->mac_addr, nentry->mac_addr, + ARRAY_SIZE(entry->mac_addr)); + memcpy(entry->mac_addr_mask, nentry->mac_addr_mask, + ARRAY_SIZE(entry->mac_addr_mask)); + entry->dlid_sd = cpu_to_be32(nentry->dlid_sd); + } + tbl->mac_tbl_digest = cpu_to_be32(adapter->info.vport.mac_tbl_digest); +get_mac_done: + rcu_read_unlock(); +} + +/* + * opa_vnic_update_mac_tbl - update mac table section + * + * This function updates the specified section of the mac table. + * The procedure includes following steps. + * - Allocate a new mac (hash) table. + * - Add the specified entries to the new table. + * (except the ones that are requested to be deleted). + * - Add all the other entries from the old mac table. + * - If there is a failure, free the new table and return. + * - Switch to the new table. + * - Free the old table and return. + * + * The function also expects the requested range to be valid. 
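[Editor's sketch] As the comment above notes, the MAC table is a plain hash table keyed by the last octet of the MAC address (OPA_VNIC_MAC_HASH_IDX), so a lookup touches only the bucket selected by mac[5] and then compares full addresses within it. A minimal user-space sketch of that lookup with a fixed 256-bucket array of singly linked nodes; the node layout is simplified from struct opa_vnic_mac_tbl_node and error handling is omitted for brevity.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ETH_ALEN	6
#define MAC_HASH_IDX	5	/* last octet is the hash key */
#define TBL_SIZE	256

struct mac_node {
	struct mac_node *next;
	uint8_t mac[ETH_ALEN];
	uint32_t dlid;
};

static struct mac_node *tbl[TBL_SIZE];

static void mac_tbl_add(const uint8_t *mac, uint32_t dlid)
{
	struct mac_node *n = calloc(1, sizeof(*n));

	memcpy(n->mac, mac, ETH_ALEN);
	n->dlid = dlid;
	n->next = tbl[mac[MAC_HASH_IDX]];
	tbl[mac[MAC_HASH_IDX]] = n;
}

static uint32_t mac_tbl_lookup(const uint8_t *mac)
{
	struct mac_node *n;

	for (n = tbl[mac[MAC_HASH_IDX]]; n; n = n->next)
		if (!memcmp(n->mac, mac, ETH_ALEN))
			return n->dlid;
	return 0;	/* 0 means "not found", as in opa_vnic_chk_mac_tbl() */
}

int main(void)
{
	uint8_t mac[ETH_ALEN] = { 0x02, 0x00, 0x11, 0x22, 0x33, 0x44 };

	mac_tbl_add(mac, 0x1234);
	printf("dlid 0x%x\n", mac_tbl_lookup(mac));	/* dlid 0x1234 */
	return 0;
}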
+ */ +int opa_vnic_update_mac_tbl(struct opa_vnic_adapter *adapter, + struct opa_veswport_mactable *tbl) +{ + struct opa_vnic_mac_tbl_node *node, *new_node; + struct hlist_head *new_mactbl, *old_mactbl; + int i, bkt, rc = 0; + u8 key; + u16 loffset, lnum_entries; + + mutex_lock(&adapter->mactbl_lock); + /* allocate new mac table */ + new_mactbl = opa_vnic_alloc_mac_tbl(); + if (IS_ERR(new_mactbl)) { + mutex_unlock(&adapter->mactbl_lock); + return PTR_ERR(new_mactbl); + } + + loffset = be16_to_cpu(tbl->offset); + lnum_entries = be16_to_cpu(tbl->num_entries); + + /* add updated entries to the new mac table */ + for (i = 0; i < lnum_entries; i++) { + struct __opa_vnic_mactable_entry *nentry; + struct opa_veswport_mactable_entry *entry = + &tbl->tbl_entries[i]; + u8 *mac_addr = entry->mac_addr; + u8 empty_mac[ETH_ALEN] = { 0 }; + + v_dbg("new mac entry %4d: %02x:%02x:%02x:%02x:%02x:%02x %x\n", + loffset + i, mac_addr[0], mac_addr[1], mac_addr[2], + mac_addr[3], mac_addr[4], mac_addr[5], + entry->dlid_sd); + + /* if the entry is being removed, do not add it */ + if (!memcmp(mac_addr, empty_mac, ARRAY_SIZE(empty_mac))) + continue; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) { + rc = -ENOMEM; + goto updt_done; + } + + node->index = loffset + i; + nentry = &node->entry; + memcpy(nentry->mac_addr, entry->mac_addr, + ARRAY_SIZE(nentry->mac_addr)); + memcpy(nentry->mac_addr_mask, entry->mac_addr_mask, + ARRAY_SIZE(nentry->mac_addr_mask)); + nentry->dlid_sd = be32_to_cpu(entry->dlid_sd); + key = node->entry.mac_addr[OPA_VNIC_MAC_HASH_IDX]; + vnic_hash_add(new_mactbl, &node->hlist, key); + } + + /* add other entries from current mac table to new mac table */ + old_mactbl = rcu_access_pointer(adapter->mactbl); + if (!old_mactbl) + goto switch_tbl; + + vnic_hash_for_each(old_mactbl, bkt, node, hlist) { + if ((node->index >= loffset) && + (node->index < (loffset + lnum_entries))) + continue; + + new_node = kzalloc(sizeof(*new_node), GFP_KERNEL); + if (!new_node) { + rc = -ENOMEM; + goto updt_done; + } + + new_node->index = node->index; + memcpy(&new_node->entry, &node->entry, sizeof(node->entry)); + key = new_node->entry.mac_addr[OPA_VNIC_MAC_HASH_IDX]; + vnic_hash_add(new_mactbl, &new_node->hlist, key); + } + +switch_tbl: + /* switch to new table */ + rcu_assign_pointer(adapter->mactbl, new_mactbl); + synchronize_rcu(); + + adapter->info.vport.mac_tbl_digest = be32_to_cpu(tbl->mac_tbl_digest); +updt_done: + /* upon failure, free the new table; otherwise, free the old table */ + if (rc) + opa_vnic_free_mac_tbl(new_mactbl); + else + opa_vnic_free_mac_tbl(old_mactbl); + + mutex_unlock(&adapter->mactbl_lock); + return rc; +} + +/* opa_vnic_chk_mac_tbl - check mac table for dlid */ +static uint32_t opa_vnic_chk_mac_tbl(struct opa_vnic_adapter *adapter, + struct ethhdr *mac_hdr) +{ + struct opa_vnic_mac_tbl_node *node; + struct hlist_head *mactbl; + u32 dlid = 0; + u8 key; + + rcu_read_lock(); + mactbl = rcu_dereference(adapter->mactbl); + if (unlikely(!mactbl)) + goto chk_done; + + key = mac_hdr->h_dest[OPA_VNIC_MAC_HASH_IDX]; + vnic_hash_for_each_possible(mactbl, node, hlist, key) { + struct __opa_vnic_mactable_entry *entry = &node->entry; + + /* if related to source mac, skip */ + if (unlikely(OPA_VNIC_DLID_SD_IS_SRC_MAC(entry->dlid_sd))) + continue; + + if (!memcmp(node->entry.mac_addr, mac_hdr->h_dest, + ARRAY_SIZE(node->entry.mac_addr))) { + /* mac address found */ + dlid = OPA_VNIC_DLID_SD_GET_DLID(node->entry.dlid_sd); + break; + } + } + +chk_done: + rcu_read_unlock(); + return dlid; +} 
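[Editor's sketch] opa_vnic_update_mac_tbl() above never edits the live table in place: it builds a complete replacement table, publishes it with rcu_assign_pointer(), waits for readers with synchronize_rcu(), and only then frees whichever table is no longer needed (the new one on failure, the old one on success). A condensed, single-threaded user-space sketch of that update flow; the RCU calls are only marked in comments, and locking, allocation-failure and delete-on-zero-MAC handling are trimmed relative to the driver.

#include <stdint.h>
#include <stdlib.h>

#define TBL_SIZE 256

struct mac_entry {
	uint16_t index;
	uint8_t  mac[6];
	uint32_t dlid_sd;
	struct mac_entry *next;
};

struct mac_tbl {
	struct mac_entry *bucket[TBL_SIZE];
};

static struct mac_tbl *live_tbl;	/* the driver keeps this behind RCU */

static void tbl_add(struct mac_tbl *t, const struct mac_entry *e)
{
	struct mac_entry *n = malloc(sizeof(*n));

	*n = *e;
	n->next = t->bucket[e->mac[5]];	/* last octet selects the bucket */
	t->bucket[e->mac[5]] = n;
}

static void tbl_free(struct mac_tbl *t)
{
	int i;

	if (!t)
		return;
	for (i = 0; i < TBL_SIZE; i++)
		while (t->bucket[i]) {
			struct mac_entry *n = t->bucket[i];

			t->bucket[i] = n->next;
			free(n);
		}
	free(t);
}

/* Replace entries [off, off + cnt) with 'upd'; keep everything else. */
static int tbl_update(const struct mac_entry *upd, int cnt, int off)
{
	struct mac_tbl *new_tbl = calloc(1, sizeof(*new_tbl));
	struct mac_tbl *old_tbl = live_tbl;
	int i, b;

	if (!new_tbl)
		return -1;

	for (i = 0; i < cnt; i++)			/* updated range first */
		tbl_add(new_tbl, &upd[i]);

	if (old_tbl)					/* then untouched entries */
		for (b = 0; b < TBL_SIZE; b++)
			for (struct mac_entry *n = old_tbl->bucket[b]; n; n = n->next)
				if (n->index < off || n->index >= off + cnt)
					tbl_add(new_tbl, n);

	live_tbl = new_tbl;	/* driver: rcu_assign_pointer() + synchronize_rcu() */
	tbl_free(old_tbl);	/* safe only once no reader can still see it */
	return 0;
}

int main(void)
{
	struct mac_entry e = { .index = 0, .mac = { 2, 0, 0, 0, 0, 1 }, .dlid_sd = 0x500 };

	return tbl_update(&e, 1, 0);
}

The copy-then-swap shape is what lets the transmit-path lookups above run under rcu_read_lock() without taking the mactbl mutex.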
+ +/* opa_vnic_get_dlid - find and return the DLID */ +static uint32_t opa_vnic_get_dlid(struct opa_vnic_adapter *adapter, + struct sk_buff *skb, u8 def_port) +{ + struct __opa_veswport_info *info = &adapter->info; + struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); + u32 dlid; + + dlid = opa_vnic_chk_mac_tbl(adapter, mac_hdr); + if (dlid) + return dlid; + + if (is_multicast_ether_addr(mac_hdr->h_dest)) { + dlid = info->vesw.u_mcast_dlid; + } else { + if (is_local_ether_addr(mac_hdr->h_dest)) { + dlid = ((uint32_t)mac_hdr->h_dest[5] << 16) | + ((uint32_t)mac_hdr->h_dest[4] << 8) | + mac_hdr->h_dest[3]; + if (unlikely(!dlid)) + v_warn("Null dlid in MAC address\n"); + } else if (def_port != OPA_VNIC_INVALID_PORT) { + dlid = info->vesw.u_ucast_dlid[def_port]; + } + } + + return dlid; +} + +/* opa_vnic_get_sc - return the service class */ +static u8 opa_vnic_get_sc(struct __opa_veswport_info *info, + struct sk_buff *skb) +{ + struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); + u16 vlan_tci; + u8 sc; + + if (!__vlan_get_tag(skb, &vlan_tci)) { + u8 pcp = OPA_VNIC_VLAN_PCP(vlan_tci); + + if (is_multicast_ether_addr(mac_hdr->h_dest)) + sc = info->vport.pcp_to_sc_mc[pcp]; + else + sc = info->vport.pcp_to_sc_uc[pcp]; + } else { + if (is_multicast_ether_addr(mac_hdr->h_dest)) + sc = info->vport.non_vlan_sc_mc; + else + sc = info->vport.non_vlan_sc_uc; + } + + return sc; +} + +u8 opa_vnic_get_vl(struct opa_vnic_adapter *adapter, struct sk_buff *skb) +{ + struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb); + struct __opa_veswport_info *info = &adapter->info; + u8 vl; + + if (skb_vlan_tag_present(skb)) { + u8 pcp = skb_vlan_tag_get(skb) >> VLAN_PRIO_SHIFT; + + if (is_multicast_ether_addr(mac_hdr->h_dest)) + vl = info->vport.pcp_to_vl_mc[pcp]; + else + vl = info->vport.pcp_to_vl_uc[pcp]; + } else { + if (is_multicast_ether_addr(mac_hdr->h_dest)) + vl = info->vport.non_vlan_vl_mc; + else + vl = info->vport.non_vlan_vl_uc; + } + + return vl; +} + +/* opa_vnic_get_rc - return the routing control */ +static u8 opa_vnic_get_rc(struct __opa_veswport_info *info, + struct sk_buff *skb) +{ + u8 proto, rout_ctrl; + + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IPV6): + proto = ipv6_hdr(skb)->nexthdr; + if (proto == IPPROTO_TCP) + rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, + IPV6_TCP); + else if (proto == IPPROTO_UDP) + rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, + IPV6_UDP); + else + rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, IPV6); + break; + case htons(ETH_P_IP): + proto = ip_hdr(skb)->protocol; + if (proto == IPPROTO_TCP) + rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, + IPV4_TCP); + else if (proto == IPPROTO_UDP) + rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, + IPV4_UDP); + else + rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, IPV4); + break; + default: + rout_ctrl = OPA_VNIC_ENCAP_RC_EXT(info->vesw.rc, DEFAULT); + } + + return rout_ctrl; +} + +/* opa_vnic_calc_entropy - calculate the packet entropy */ +u8 opa_vnic_calc_entropy(struct opa_vnic_adapter *adapter, struct sk_buff *skb) +{ + u16 hash16; + + /* + * Get flow based 16-bit hash and then XOR the upper and lower bytes + * to get the entropy. + * __skb_tx_hash limits qcount to 16 bits. Hence, get 15-bit hash. 
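[Editor's sketch] When the destination MAC misses the table and is a locally administered unicast address, opa_vnic_get_dlid() above reconstructs the DLID directly from MAC bytes 3..5, with byte 5 supplying the high bits. A one-line sketch of that extraction; the sample address is made up.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* locally administered unicast MAC (bit 0x02 set in the first octet) */
	uint8_t mac[6] = { 0x02, 0x00, 0x00, 0x56, 0x34, 0x12 };
	uint32_t dlid;

	/* same layout as opa_vnic_get_dlid(): dest[5] << 16 | dest[4] << 8 | dest[3] */
	dlid = ((uint32_t)mac[5] << 16) | ((uint32_t)mac[4] << 8) | mac[3];

	printf("dlid = 0x%06x\n", dlid);	/* prints dlid = 0x123456 */
	return 0;
}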
+ */ + hash16 = __skb_tx_hash(adapter->netdev, skb, BIT(15)); + return (u8)((hash16 >> 8) ^ (hash16 & 0xff)); +} + +/* opa_vnic_get_def_port - get default port based on entropy */ +static inline u8 opa_vnic_get_def_port(struct opa_vnic_adapter *adapter, + u8 entropy) +{ + u8 flow_id; + + /* Add the upper and lower 4-bits of entropy to get the flow id */ + flow_id = ((entropy & 0xf) + (entropy >> 4)); + return adapter->flow_tbl[flow_id & (OPA_VNIC_FLOW_TBL_SIZE - 1)]; +} + +/* Calculate packet length including OPA header, crc and padding */ +static inline int opa_vnic_wire_length(struct sk_buff *skb) +{ + u32 pad_len; + + /* padding for 8 bytes size alignment */ + pad_len = -(skb->len + OPA_VNIC_ICRC_TAIL_LEN) & 0x7; + pad_len += OPA_VNIC_ICRC_TAIL_LEN; + + return (skb->len + pad_len) >> 3; +} + +/* opa_vnic_encap_skb - encapsulate skb packet with OPA header and meta data */ +void opa_vnic_encap_skb(struct opa_vnic_adapter *adapter, struct sk_buff *skb) +{ + struct __opa_veswport_info *info = &adapter->info; + struct opa_vnic_skb_mdata *mdata; + u8 def_port, sc, rc, entropy, *hdr; + u16 len, l4_hdr; + u32 dlid; + + hdr = skb_push(skb, OPA_VNIC_HDR_LEN); + + entropy = opa_vnic_calc_entropy(adapter, skb); + def_port = opa_vnic_get_def_port(adapter, entropy); + len = opa_vnic_wire_length(skb); + dlid = opa_vnic_get_dlid(adapter, skb, def_port); + sc = opa_vnic_get_sc(info, skb); + rc = opa_vnic_get_rc(info, skb); + l4_hdr = info->vesw.vesw_id; + + mdata = skb_push(skb, sizeof(*mdata)); + mdata->vl = opa_vnic_get_vl(adapter, skb); + mdata->entropy = entropy; + mdata->flags = 0; + if (unlikely(!dlid)) { + mdata->flags = OPA_VNIC_SKB_MDATA_ENCAP_ERR; + return; + } + + opa_vnic_make_header(hdr, info->vport.encap_slid, dlid, len, + info->vesw.pkey, entropy, sc, rc, + OPA_VNIC_L4_ETHR, l4_hdr); +} diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h new file mode 100644 index 0000000..e4c9bf2 --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h @@ -0,0 +1,501 @@ +#ifndef _OPA_VNIC_ENCAP_H +#define _OPA_VNIC_ENCAP_H +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
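[Editor's sketch] opa_vnic_calc_entropy() above folds a 16-bit flow hash into 8 bits, and opa_vnic_get_def_port() adds the two nibbles of that entropy to index the 32-entry flow table when a packet has to fall back to a default port. A small sketch of that selection arithmetic; the hash value stands in for the __skb_tx_hash() result and the table contents are made up.

#include <stdint.h>
#include <stdio.h>

#define OPA_VNIC_FLOW_TBL_SIZE	32	/* from opa_vnic_internal.h */

int main(void)
{
	uint8_t flow_tbl[OPA_VNIC_FLOW_TBL_SIZE];
	uint16_t hash16 = 0x9c3a;	/* stand-in for __skb_tx_hash() */
	uint8_t entropy, flow_id, def_port;
	int i;

	for (i = 0; i < OPA_VNIC_FLOW_TBL_SIZE; i++)
		flow_tbl[i] = i % 4;	/* pretend 4 default ports are enabled */

	/* opa_vnic_calc_entropy(): XOR the two bytes of the flow hash */
	entropy = (uint8_t)((hash16 >> 8) ^ (hash16 & 0xff));

	/* opa_vnic_get_def_port(): add the nibbles, mask into the table */
	flow_id = (uint8_t)((entropy & 0xf) + (entropy >> 4));
	def_port = flow_tbl[flow_id & (OPA_VNIC_FLOW_TBL_SIZE - 1)];

	/* prints entropy=0xa6 flow_id=0x10 def_port=0 */
	printf("entropy=0x%02x flow_id=0x%02x def_port=%u\n",
	       entropy, flow_id, def_port);
	return 0;
}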
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains all OPA VNIC declaration required for encapsulation + * and decapsulation of Ethernet packets + */ + +#include +#include + +/* EMA class version */ +#define OPA_EMA_CLASS_VERSION 0x80 + +/* + * Define the Intel vendor management class for OPA + * ETHERNET MANAGEMENT + */ +#define OPA_MGMT_CLASS_INTEL_EMA 0x34 + +/* EM attribute IDs */ +#define OPA_EM_ATTR_CLASS_PORT_INFO 0x0001 +#define OPA_EM_ATTR_VESWPORT_INFO 0x0011 +#define OPA_EM_ATTR_VESWPORT_MAC_ENTRIES 0x0012 +#define OPA_EM_ATTR_IFACE_UCAST_MACS 0x0013 +#define OPA_EM_ATTR_IFACE_MCAST_MACS 0x0014 +#define OPA_EM_ATTR_DELETE_VESW 0x0015 +#define OPA_EM_ATTR_VESWPORT_SUMMARY_COUNTERS 0x0020 +#define OPA_EM_ATTR_VESWPORT_ERROR_COUNTERS 0x0022 + +/* VNIC configured and operational state values */ +#define OPA_VNIC_STATE_DROP_ALL 0x1 +#define OPA_VNIC_STATE_FORWARDING 0x3 + +#define OPA_VESW_MAX_NUM_DEF_PORT 16 +#define OPA_VNIC_MAX_NUM_PCP 8 + +#define OPA_VNIC_EMA_DATA (OPA_MGMT_MAD_SIZE - IB_MGMT_VENDOR_HDR) + +/* Defines for vendor specific notice(trap) attributes */ +#define OPA_INTEL_EMA_NOTICE_TYPE_INFO 0x04 + +/* INTEL OUI */ +#define INTEL_OUI_1 0x00 +#define INTEL_OUI_2 0x06 +#define INTEL_OUI_3 0x6a + +/* Trap opcodes sent from VNIC */ +#define OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE 0x1 +#define OPA_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE 0x2 +#define OPA_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE 0x3 + +#define OPA_VNIC_DLID_SD_IS_SRC_MAC(dlid_sd) (!!((dlid_sd) & 0x20)) +#define OPA_VNIC_DLID_SD_GET_DLID(dlid_sd) ((dlid_sd) >> 8) + +/* VNIC Ethernet link status */ +#define OPA_VNIC_ETH_LINK_UP 1 +#define OPA_VNIC_ETH_LINK_DOWN 2 + +/* routing control */ +#define OPA_VNIC_ENCAP_RC_DEFAULT 0 +#define OPA_VNIC_ENCAP_RC_IPV4 4 +#define OPA_VNIC_ENCAP_RC_IPV4_UDP 8 +#define OPA_VNIC_ENCAP_RC_IPV4_TCP 12 +#define OPA_VNIC_ENCAP_RC_IPV6 16 +#define OPA_VNIC_ENCAP_RC_IPV6_TCP 20 +#define OPA_VNIC_ENCAP_RC_IPV6_UDP 24 + +#define OPA_VNIC_ENCAP_RC_EXT(w, b) (((w) >> OPA_VNIC_ENCAP_RC_ ## b) & 0x7) + +/** + * struct opa_vesw_info - OPA vnic switch information + * @fabric_id: 10-bit fabric id + * @vesw_id: 12-bit virtual ethernet switch id + * @def_port_mask: bitmask of default ports + * @pkey: partition key + * @u_mcast_dlid: unknown multicast dlid + * @u_ucast_dlid: array of unknown unicast dlids + * @rc: routing control + * @eth_mtu: Ethernet MTU + */ +struct opa_vesw_info { + __be16 fabric_id; + __be16 vesw_id; + + u8 rsvd0[6]; + __be16 def_port_mask; + + u8 rsvd1[2]; + __be16 pkey; + + u8 rsvd2[4]; + __be32 u_mcast_dlid; + __be32 u_ucast_dlid[OPA_VESW_MAX_NUM_DEF_PORT]; + + __be32 rc; + + u8 rsvd3[56]; + __be16 eth_mtu; + u8 rsvd4[2]; +} __packed; + +/** + * struct opa_per_veswport_info - OPA vnic per port 
information + * @port_num: port number + * @eth_link_status: current ethernet link state + * @base_mac_addr: base mac address + * @config_state: configured port state + * @oper_state: operational port state + * @max_mac_tbl_ent: max number of mac table entries + * @max_smac_ent: max smac entries in mac table + * @mac_tbl_digest: mac table digest + * @encap_slid: base slid for the port + * @pcp_to_sc_uc: sc by pcp index for unicast ethernet packets + * @pcp_to_vl_uc: vl by pcp index for unicast ethernet packets + * @pcp_to_sc_mc: sc by pcp index for multicast ethernet packets + * @pcp_to_vl_mc: vl by pcp index for multicast ethernet packets + * @non_vlan_sc_uc: sc for non-vlan unicast ethernet packets + * @non_vlan_vl_uc: vl for non-vlan unicast ethernet packets + * @non_vlan_sc_mc: sc for non-vlan multicast ethernet packets + * @non_vlan_vl_mc: vl for non-vlan multicast ethernet packets + * @uc_macs_gen_count: generation count for unicast macs list + * @mc_macs_gen_count: generation count for multicast macs list + */ +struct opa_per_veswport_info { + __be32 port_num; + + u8 eth_link_status; + u8 rsvd0[3]; + + u8 base_mac_addr[ETH_ALEN]; + u8 config_state; + u8 oper_state; + + __be16 max_mac_tbl_ent; + __be16 max_smac_ent; + __be32 mac_tbl_digest; + u8 rsvd1[4]; + + __be32 encap_slid; + + u8 pcp_to_sc_uc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_vl_uc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_sc_mc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_vl_mc[OPA_VNIC_MAX_NUM_PCP]; + + u8 non_vlan_sc_uc; + u8 non_vlan_vl_uc; + u8 non_vlan_sc_mc; + u8 non_vlan_vl_mc; + + u8 rsvd2[48]; + + __be16 uc_macs_gen_count; + __be16 mc_macs_gen_count; + + u8 rsvd3[8]; +} __packed; + +/** + * struct opa_veswport_info - OPA vnic port information + * @vesw: OPA vnic switch information + * @vport: OPA vnic per port information + * + * On host, each of the virtual ethernet ports belongs + * to a different virtual ethernet switches. + */ +struct opa_veswport_info { + struct opa_vesw_info vesw; + struct opa_per_veswport_info vport; +}; + +/** + * struct opa_veswport_mactable_entry - single entry in the forwarding table + * @mac_addr: MAC address + * @mac_addr_mask: MAC address bit mask + * @dlid_sd: Matching DLID and side data + * + * On the host each virtual ethernet port will have + * a forwarding table. These tables are used to + * map a MAC to a LID and other data. For more + * details see struct opa_veswport_mactable_entries. + * This is the structure of a single mactable entry + */ +struct opa_veswport_mactable_entry { + u8 mac_addr[ETH_ALEN]; + u8 mac_addr_mask[ETH_ALEN]; + __be32 dlid_sd; +} __packed; + +/** + * struct opa_veswport_mactable - Forwarding table array + * @offset: mac table starting offset + * @num_entries: Number of entries to get or set + * @mac_tbl_digest: mac table digest + * @tbl_entries[]: Array of table entries + * + * The EM sends down this structure in a MAD indicating + * the starting offset in the forwarding table that this + * entry is to be loaded into and the number of entries + * that that this MAD instance contains + * The mac_tbl_digest has been added to this MAD structure. It will be set by + * the EM and it will be used by the EM to check if there are any + * discrepancies with this value and the value + * maintained by the EM in the case of VNIC port being deleted or unloaded + * A new instantiation of a VNIC will always have a value of zero. 
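[Editor's sketch] Each forwarding entry stores its destination LID and flags packed into the single dlid_sd word of struct opa_veswport_mactable_entry above: the DLID sits in the upper 24 bits and bit 0x20 marks a source-MAC entry, per the OPA_VNIC_DLID_SD_* macros earlier in this header. A trivial decode sketch using those macros; the sample value is arbitrary.

#include <stdint.h>
#include <stdio.h>

/* copied from opa_vnic_encap.h above */
#define OPA_VNIC_DLID_SD_IS_SRC_MAC(dlid_sd)	(!!((dlid_sd) & 0x20))
#define OPA_VNIC_DLID_SD_GET_DLID(dlid_sd)	((dlid_sd) >> 8)

int main(void)
{
	uint32_t dlid_sd = (0x00c0de << 8) | 0x20;	/* DLID 0xc0de, src-MAC flag set */

	/* prints dlid=0xc0de src_mac=1 */
	printf("dlid=0x%x src_mac=%d\n",
	       (unsigned)OPA_VNIC_DLID_SD_GET_DLID(dlid_sd),
	       OPA_VNIC_DLID_SD_IS_SRC_MAC(dlid_sd));
	return 0;
}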
+ * This value is stored as part of the vnic adapter structure and will be + * accessed by the GET and SET routines for both the mactable entries and the + * veswport info. + */ +struct opa_veswport_mactable { + __be16 offset; + __be16 num_entries; + __be32 mac_tbl_digest; + struct opa_veswport_mactable_entry tbl_entries[0]; +} __packed; + +/** + * struct opa_veswport_summary_counters - summary counters + * @vp_instance: vport instance on the OPA port + * @vesw_id: virtual ethernet switch id + * @veswport_num: virtual ethernet switch port number + * @tx_errors: transmit errors + * @rx_errors: receive errors + * @tx_packets: transmit packets + * @rx_packets: receive packets + * @tx_bytes: transmit bytes + * @rx_bytes: receive bytes + * @tx_unicast: unicast packets transmitted + * @tx_mcastbcast: multicast/broadcast packets transmitted + * @tx_untagged: non-vlan packets transmitted + * @tx_vlan: vlan packets transmitted + * @tx_64_size: transmit packet length is 64 bytes + * @tx_65_127: transmit packet length is >=65 and < 127 bytes + * @tx_128_255: transmit packet length is >=128 and < 255 bytes + * @tx_256_511: transmit packet length is >=256 and < 511 bytes + * @tx_512_1023: transmit packet length is >=512 and < 1023 bytes + * @tx_1024_1518: transmit packet length is >=1024 and < 1518 bytes + * @tx_1519_max: transmit packet length >= 1519 bytes + * @rx_unicast: unicast packets received + * @rx_mcastbcast: multicast/broadcast packets received + * @rx_untagged: non-vlan packets received + * @rx_vlan: vlan packets received + * @rx_64_size: received packet length is 64 bytes + * @rx_65_127: received packet length is >=65 and < 127 bytes + * @rx_128_255: received packet length is >=128 and < 255 bytes + * @rx_256_511: received packet length is >=256 and < 511 bytes + * @rx_512_1023: received packet length is >=512 and < 1023 bytes + * @rx_1024_1518: received packet length is >=1024 and < 1518 bytes + * @rx_1519_max: received packet length >= 1519 bytes + * + * All the above are counters of corresponding conditions. 
+ */ +struct opa_veswport_summary_counters { + __be16 vp_instance; + __be16 vesw_id; + __be32 veswport_num; + + __be64 tx_errors; + __be64 rx_errors; + __be64 tx_packets; + __be64 rx_packets; + __be64 tx_bytes; + __be64 rx_bytes; + + __be64 tx_unicast; + __be64 tx_mcastbcast; + + __be64 tx_untagged; + __be64 tx_vlan; + + __be64 tx_64_size; + __be64 tx_65_127; + __be64 tx_128_255; + __be64 tx_256_511; + __be64 tx_512_1023; + __be64 tx_1024_1518; + __be64 tx_1519_max; + + __be64 rx_unicast; + __be64 rx_mcastbcast; + + __be64 rx_untagged; + __be64 rx_vlan; + + __be64 rx_64_size; + __be64 rx_65_127; + __be64 rx_128_255; + __be64 rx_256_511; + __be64 rx_512_1023; + __be64 rx_1024_1518; + __be64 rx_1519_max; + + __be64 reserved[16]; +} __packed; + +/** + * struct opa_veswport_error_counters - error counters + * @vp_instance: vport instance on the OPA port + * @vesw_id: virtual ethernet switch id + * @veswport_num: virtual ethernet switch port number + * @tx_errors: transmit errors + * @rx_errors: receive errors + * @tx_smac_filt: smac filter errors + * @tx_dlid_zero: transmit packets with invalid dlid + * @tx_logic: other transmit errors + * @tx_drop_state: packet tansmission in non-forward port state + * @rx_bad_veswid: received packet with invalid vesw id + * @rx_runt: received ethernet packet with length < 64 bytes + * @rx_oversize: received ethernet packet with length > MTU size + * @rx_eth_down: received packets when interface is down + * @rx_drop_state: received packets in non-forwarding port state + * @rx_logic: other receive errors + * + * All the above are counters of corresponding erorr conditions. + */ +struct opa_veswport_error_counters { + __be16 vp_instance; + __be16 vesw_id; + __be32 veswport_num; + + __be64 tx_errors; + __be64 rx_errors; + + __be64 rsvd0; + __be64 tx_smac_filt; + __be64 rsvd1; + __be64 rsvd2; + __be64 rsvd3; + __be64 tx_dlid_zero; + __be64 rsvd4; + __be64 tx_logic; + __be64 rsvd5; + __be64 tx_drop_state; + + __be64 rx_bad_veswid; + __be64 rsvd6; + __be64 rx_runt; + __be64 rx_oversize; + __be64 rsvd7; + __be64 rx_eth_down; + __be64 rx_drop_state; + __be64 rx_logic; + __be64 rsvd8; + + __be64 rsvd9[16]; +} __packed; + +/** + * struct opa_veswport_trap - Trap message sent to EM by VNIC + * @fabric_id: 10 bit fabric id + * @veswid: 12 bit virtual ethernet switch id + * @veswportnum: logical port number on the Virtual switch + * @opaportnum: physical port num (redundant on host) + * @veswportindex: switch port index on opa port 0 based + * @opcode: operation + * @reserved: 32 bit for alignment + * + * The VNIC will send trap messages to the Ethernet manager to + * inform it about changes to the VNIC config, behaviour etc. + * This is the format of the trap payload. 
+ */ +struct opa_veswport_trap { + __be16 fabric_id; + __be16 veswid; + __be32 veswportnum; + __be16 opaportnum; + u8 veswportindex; + u8 opcode; + __be32 reserved; +} __packed; + +/** + * struct opa_vnic_iface_macs_entry - single entry in the mac list + * @mac_addr: MAC address + */ +struct opa_vnic_iface_mac_entry { + u8 mac_addr[ETH_ALEN]; +}; + +/** + * struct opa_veswport_iface_macs - Msg to set globally administered MAC + * @start_idx: position of first entry (0 based) + * @num_macs_in_msg: number of MACs in this message + * @tot_macs_in_lst: The total number of MACs the agent has + * @gen_count: gen_count to indicate change + * @entry: The mac list entry + * + * Same attribute IDS and attribute modifiers as in locally administered + * addresses used to set globally administered addresses + */ +struct opa_veswport_iface_macs { + __be16 start_idx; + __be16 num_macs_in_msg; + __be16 tot_macs_in_lst; + __be16 gen_count; + struct opa_vnic_iface_mac_entry entry[0]; +} __packed; + +/** + * struct opa_vnic_vema_mad - Generic VEMA MAD + * @mad_hdr: Generic MAD header + * @rmpp_hdr: RMPP header for vendor specific MADs + * @oui: Unique org identifier + * @data: MAD data + */ +struct opa_vnic_vema_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 reserved; + u8 oui[3]; + u8 data[OPA_VNIC_EMA_DATA]; +}; + +/** + * struct opa_vnic_notice_attr - Generic Notice MAD + * @gen_type: Generic/Specific bit and type of notice + * @oui_1: Vendor ID byte 1 + * @oui_2: Vendor ID byte 2 + * @oui_3: Vendor ID byte 3 + * @trap_num: Trap number + * @toggle_count: Notice toggle bit and count value + * @issuer_lid: Trap issuer's lid + * @issuer_gid: Issuer GID (only if Report method) + * @raw_data: Trap message body + */ +struct opa_vnic_notice_attr { + u8 gen_type; + u8 oui_1; + u8 oui_2; + u8 oui_3; + __be16 trap_num; + __be16 toggle_count; + __be32 issuer_lid; + __be32 reserved; + u8 issuer_gid[16]; + u8 raw_data[64]; +} __packed; + +/** + * struct opa_vnic_vema_mad_trap - Generic VEMA MAD Trap + * @mad_hdr: Generic MAD header + * @rmpp_hdr: RMPP header for vendor specific MADs + * @oui: Unique org identifier + * @notice: Notice structure + */ +struct opa_vnic_vema_mad_trap { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 reserved; + u8 oui[3]; + struct opa_vnic_notice_attr notice; +}; + +#endif /* _OPA_VNIC_ENCAP_H */ diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c new file mode 100644 index 0000000..62390e9 --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c @@ -0,0 +1,187 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains OPA VNIC ethtool functions + */ + +#include + +#include "opa_vnic_internal.h" + +enum {NETDEV_STATS, VNIC_STATS}; + +struct vnic_stats { + char stat_string[ETH_GSTRING_LEN]; + struct { + int sizeof_stat; + int stat_offset; + }; +}; + +#define VNIC_STAT(m) { FIELD_SIZEOF(struct opa_vnic_stats, m), \ + offsetof(struct opa_vnic_stats, m) } + +static struct vnic_stats vnic_gstrings_stats[] = { + /* NETDEV stats */ + {"rx_packets", VNIC_STAT(netstats.rx_packets)}, + {"tx_packets", VNIC_STAT(netstats.tx_packets)}, + {"rx_bytes", VNIC_STAT(netstats.rx_bytes)}, + {"tx_bytes", VNIC_STAT(netstats.tx_bytes)}, + {"rx_errors", VNIC_STAT(netstats.rx_errors)}, + {"tx_errors", VNIC_STAT(netstats.tx_errors)}, + {"rx_dropped", VNIC_STAT(netstats.rx_dropped)}, + {"tx_dropped", VNIC_STAT(netstats.tx_dropped)}, + + /* SUMMARY counters */ + {"tx_unicast", VNIC_STAT(tx_grp.unicast)}, + {"tx_mcastbcast", VNIC_STAT(tx_grp.mcastbcast)}, + {"tx_untagged", VNIC_STAT(tx_grp.untagged)}, + {"tx_vlan", VNIC_STAT(tx_grp.vlan)}, + + {"tx_64_size", VNIC_STAT(tx_grp.s_64)}, + {"tx_65_127", VNIC_STAT(tx_grp.s_65_127)}, + {"tx_128_255", VNIC_STAT(tx_grp.s_128_255)}, + {"tx_256_511", VNIC_STAT(tx_grp.s_256_511)}, + {"tx_512_1023", VNIC_STAT(tx_grp.s_512_1023)}, + {"tx_1024_1518", VNIC_STAT(tx_grp.s_1024_1518)}, + {"tx_1519_max", VNIC_STAT(tx_grp.s_1519_max)}, + + {"rx_unicast", VNIC_STAT(rx_grp.unicast)}, + {"rx_mcastbcast", VNIC_STAT(rx_grp.mcastbcast)}, + {"rx_untagged", VNIC_STAT(rx_grp.untagged)}, + {"rx_vlan", VNIC_STAT(rx_grp.vlan)}, + + {"rx_64_size", VNIC_STAT(rx_grp.s_64)}, + {"rx_65_127", VNIC_STAT(rx_grp.s_65_127)}, + {"rx_128_255", VNIC_STAT(rx_grp.s_128_255)}, + {"rx_256_511", VNIC_STAT(rx_grp.s_256_511)}, + {"rx_512_1023", VNIC_STAT(rx_grp.s_512_1023)}, + {"rx_1024_1518", VNIC_STAT(rx_grp.s_1024_1518)}, + {"rx_1519_max", VNIC_STAT(rx_grp.s_1519_max)}, + + /* ERROR counters */ + {"rx_fifo_errors", VNIC_STAT(netstats.rx_fifo_errors)}, + {"rx_length_errors", 
VNIC_STAT(netstats.rx_length_errors)}, + + {"tx_fifo_errors", VNIC_STAT(netstats.tx_fifo_errors)}, + {"tx_carrier_errors", VNIC_STAT(netstats.tx_carrier_errors)}, + + {"tx_dlid_zero", VNIC_STAT(tx_dlid_zero)}, + {"tx_drop_state", VNIC_STAT(tx_drop_state)}, + {"rx_drop_state", VNIC_STAT(rx_drop_state)}, + {"rx_oversize", VNIC_STAT(rx_oversize)}, + {"rx_runt", VNIC_STAT(rx_runt)}, +}; + +#define VNIC_STATS_LEN ARRAY_SIZE(vnic_gstrings_stats) + +/* vnic_get_drvinfo - get driver info */ +static void vnic_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + strlcpy(drvinfo->driver, opa_vnic_driver_name, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, opa_vnic_driver_version, + sizeof(drvinfo->version)); + strlcpy(drvinfo->bus_info, dev_name(netdev->dev.parent), + sizeof(drvinfo->bus_info)); +} + +/* vnic_get_sset_count - get string set count */ +static int vnic_get_sset_count(struct net_device *netdev, int sset) +{ + return (sset == ETH_SS_STATS) ? VNIC_STATS_LEN : -EOPNOTSUPP; +} + +/* vnic_get_ethtool_stats - get statistics */ +static void vnic_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, u64 *data) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + struct opa_vnic_stats vstats; + int i; + + memset(&vstats, 0, sizeof(vstats)); + spin_lock(&adapter->stats_lock); + adapter->rn_ops->ndo_get_stats64(netdev, &vstats.netstats); + spin_unlock(&adapter->stats_lock); + for (i = 0; i < VNIC_STATS_LEN; i++) { + char *p = (char *)&vstats + vnic_gstrings_stats[i].stat_offset; + + data[i] = (vnic_gstrings_stats[i].sizeof_stat == + sizeof(u64)) ? *(u64 *)p : *(u32 *)p; + } +} + +/* vnic_get_strings - get strings */ +static void vnic_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +{ + int i; + + if (stringset != ETH_SS_STATS) + return; + + for (i = 0; i < VNIC_STATS_LEN; i++) + memcpy(data + i * ETH_GSTRING_LEN, + vnic_gstrings_stats[i].stat_string, + ETH_GSTRING_LEN); +} + +/* ethtool ops */ +static const struct ethtool_ops opa_vnic_ethtool_ops = { + .get_drvinfo = vnic_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_strings = vnic_get_strings, + .get_sset_count = vnic_get_sset_count, + .get_ethtool_stats = vnic_get_ethtool_stats, +}; + +/* opa_vnic_set_ethtool_ops - set ethtool ops */ +void opa_vnic_set_ethtool_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &opa_vnic_ethtool_ops; +} diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h new file mode 100644 index 0000000..afd95f4 --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h @@ -0,0 +1,330 @@ +#ifndef _OPA_VNIC_INTERNAL_H +#define _OPA_VNIC_INTERNAL_H +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
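[Editor's sketch] vnic_get_ethtool_stats() above does not copy each counter by hand: every exported statistic is described once in vnic_gstrings_stats[] as a name plus a sizeof/offsetof pair (the VNIC_STAT() macro), and the dump loop reads each field through that offset, picking a u64 or u32 load by size. A minimal user-space sketch of the same table-driven pattern; the stats structure and field names here are illustrative, not the driver's.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_stats {
	uint64_t rx_packets;
	uint64_t tx_packets;
	uint32_t rx_drops;
};

struct stat_desc {
	const char *name;
	int sizeof_stat;
	int stat_offset;
};

#define DEMO_STAT(m) { #m, (int)sizeof(((struct demo_stats *)0)->m), \
		       (int)offsetof(struct demo_stats, m) }

static const struct stat_desc demo_gstrings[] = {
	DEMO_STAT(rx_packets),
	DEMO_STAT(tx_packets),
	DEMO_STAT(rx_drops),
};

int main(void)
{
	struct demo_stats s = { .rx_packets = 10, .tx_packets = 20, .rx_drops = 3 };
	size_t i;

	for (i = 0; i < sizeof(demo_gstrings) / sizeof(demo_gstrings[0]); i++) {
		const char *p = (const char *)&s + demo_gstrings[i].stat_offset;
		uint64_t val = (demo_gstrings[i].sizeof_stat == sizeof(uint64_t)) ?
			       *(const uint64_t *)p : *(const uint32_t *)p;

		printf("%-12s %llu\n", demo_gstrings[i].name,
		       (unsigned long long)val);
	}
	return 0;
}

Adding a new counter then only means adding one struct field and one table row; the dump and string callbacks need no change.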
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains OPA VNIC driver internal declarations + */ + +#include +#include +#include +#include +#include + +#include "opa_vnic_encap.h" + +#define OPA_VNIC_VLAN_PCP(vlan_tci) \ + (((vlan_tci) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) + +/* Flow to default port redirection table size */ +#define OPA_VNIC_FLOW_TBL_SIZE 32 + +/* Invalid port number */ +#define OPA_VNIC_INVALID_PORT 0xff + +struct opa_vnic_adapter; + +/** + * struct __opa_vesw_info - OPA vnic virtual switch info + * + * Same as opa_vesw_info without bitwise attribute. + */ +struct __opa_vesw_info { + u16 fabric_id; + u16 vesw_id; + + u8 rsvd0[6]; + u16 def_port_mask; + + u8 rsvd1[2]; + u16 pkey; + + u8 rsvd2[4]; + u32 u_mcast_dlid; + u32 u_ucast_dlid[OPA_VESW_MAX_NUM_DEF_PORT]; + + u32 rc; + + u8 rsvd3[56]; + u16 eth_mtu; + u8 rsvd4[2]; +} __packed; + +/** + * struct __opa_per_veswport_info - OPA vnic per port info + * + * Same as opa_per_veswport_info without bitwise attribute. + */ +struct __opa_per_veswport_info { + u32 port_num; + + u8 eth_link_status; + u8 rsvd0[3]; + + u8 base_mac_addr[ETH_ALEN]; + u8 config_state; + u8 oper_state; + + u16 max_mac_tbl_ent; + u16 max_smac_ent; + u32 mac_tbl_digest; + u8 rsvd1[4]; + + u32 encap_slid; + + u8 pcp_to_sc_uc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_vl_uc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_sc_mc[OPA_VNIC_MAX_NUM_PCP]; + u8 pcp_to_vl_mc[OPA_VNIC_MAX_NUM_PCP]; + + u8 non_vlan_sc_uc; + u8 non_vlan_vl_uc; + u8 non_vlan_sc_mc; + u8 non_vlan_vl_mc; + + u8 rsvd2[48]; + + u16 uc_macs_gen_count; + u16 mc_macs_gen_count; + + u8 rsvd3[8]; +} __packed; + +/** + * struct __opa_veswport_info - OPA vnic port info + * + * Same as opa_veswport_info without bitwise attribute. + */ +struct __opa_veswport_info { + struct __opa_vesw_info vesw; + struct __opa_per_veswport_info vport; +}; + +/** + * struct __opa_veswport_trap - OPA vnic trap info + * + * Same as opa_veswport_trap without bitwise attribute. 
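[Editor's sketch] The __opa_* structures in this header are host-byte-order shadows of the __be-typed wire structures earlier in the patch, so internal code can use plain fields and byte swapping is confined to the MAD boundary. A user-space sketch of what that conversion looks like for the trap structure, using ntohs()/ntohl() where the kernel would use be16_to_cpu()/be32_to_cpu(); both field layouts are copied from this patch.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct veswport_trap_wire {	/* mirrors opa_veswport_trap (big-endian) */
	uint16_t fabric_id;
	uint16_t veswid;
	uint32_t veswportnum;
	uint16_t opaportnum;
	uint8_t  veswportindex;
	uint8_t  opcode;
	uint32_t reserved;
} __attribute__((packed));

struct veswport_trap_host {	/* mirrors __opa_veswport_trap (host order) */
	uint16_t fabric_id;
	uint16_t veswid;
	uint32_t veswportnum;
	uint16_t opaportnum;
	uint8_t  veswportindex;
	uint8_t  opcode;
	uint32_t reserved;
};

static void trap_wire_to_host(const struct veswport_trap_wire *w,
			      struct veswport_trap_host *h)
{
	h->fabric_id     = ntohs(w->fabric_id);
	h->veswid        = ntohs(w->veswid);
	h->veswportnum   = ntohl(w->veswportnum);
	h->opaportnum    = ntohs(w->opaportnum);
	h->veswportindex = w->veswportindex;	/* single bytes need no swap */
	h->opcode        = w->opcode;
	h->reserved      = 0;
}

int main(void)
{
	struct veswport_trap_wire w = {
		.fabric_id   = htons(0x3ff),
		.veswid      = htons(0x123),
		.veswportnum = htonl(2),
		.opaportnum  = htons(1),
		.opcode      = 0x3,	/* OPA_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE */
	};
	struct veswport_trap_host h;

	trap_wire_to_host(&w, &h);
	printf("fabric 0x%x vesw 0x%x port %u opcode %u\n",
	       h.fabric_id, h.veswid, h.veswportnum, h.opcode);
	return 0;
}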
+ */ +struct __opa_veswport_trap { + u16 fabric_id; + u16 veswid; + u32 veswportnum; + u16 opaportnum; + u8 veswportindex; + u8 opcode; + u32 reserved; +} __packed; + +/** + * struct opa_vnic_ctrl_port - OPA virtual NIC control port + * @ibdev: pointer to ib device + * @ops: opa vnic control operations + * @num_ports: number of opa ports + */ +struct opa_vnic_ctrl_port { + struct ib_device *ibdev; + struct opa_vnic_ctrl_ops *ops; + u8 num_ports; +}; + +/** + * struct opa_vnic_adapter - OPA VNIC netdev private data structure + * @netdev: pointer to associated netdev + * @ibdev: ib device + * @cport: pointer to opa vnic control port + * @rn_ops: rdma netdev's net_device_ops + * @port_num: OPA port number + * @vport_num: vesw port number + * @lock: adapter lock + * @info: virtual ethernet switch port information + * @vema_mac_addr: mac address configured by vema + * @umac_hash: unicast maclist hash + * @mmac_hash: multicast maclist hash + * @mactbl: hash table of MAC entries + * @mactbl_lock: mac table lock + * @stats_lock: statistics lock + * @flow_tbl: flow to default port redirection table + * @trap_timeout: trap timeout + * @trap_count: no. of traps allowed within timeout period + */ +struct opa_vnic_adapter { + struct net_device *netdev; + struct ib_device *ibdev; + struct opa_vnic_ctrl_port *cport; + const struct net_device_ops *rn_ops; + + u8 port_num; + u8 vport_num; + + /* Lock used around concurrent updates to netdev */ + struct mutex lock; + + struct __opa_veswport_info info; + u8 vema_mac_addr[ETH_ALEN]; + u32 umac_hash; + u32 mmac_hash; + struct hlist_head __rcu *mactbl; + + /* Lock used to protect updates to mac table */ + struct mutex mactbl_lock; + + /* Lock used to protect access to vnic counters */ + spinlock_t stats_lock; + + u8 flow_tbl[OPA_VNIC_FLOW_TBL_SIZE]; + + unsigned long trap_timeout; + u8 trap_count; +}; + +/* Same as opa_veswport_mactable_entry, but without bitwise attribute */ +struct __opa_vnic_mactable_entry { + u8 mac_addr[ETH_ALEN]; + u8 mac_addr_mask[ETH_ALEN]; + u32 dlid_sd; +} __packed; + +/** + * struct opa_vnic_mac_tbl_node - OPA VNIC mac table node + * @hlist: hash list handle + * @index: index of entry in the mac table + * @entry: entry in the table + */ +struct opa_vnic_mac_tbl_node { + struct hlist_node hlist; + u16 index; + struct __opa_vnic_mactable_entry entry; +}; + +#define v_dbg(format, arg...) \ + netdev_dbg(adapter->netdev, format, ## arg) +#define v_err(format, arg...) \ + netdev_err(adapter->netdev, format, ## arg) +#define v_info(format, arg...) \ + netdev_info(adapter->netdev, format, ## arg) +#define v_warn(format, arg...) \ + netdev_warn(adapter->netdev, format, ## arg) + +#define c_err(format, arg...) \ + dev_err(&cport->ibdev->dev, format, ## arg) +#define c_info(format, arg...) \ + dev_info(&cport->ibdev->dev, format, ## arg) +#define c_dbg(format, arg...) 
\ + dev_dbg(&cport->ibdev->dev, format, ## arg) + +/* The maximum allowed entries in the mac table */ +#define OPA_VNIC_MAC_TBL_MAX_ENTRIES 2048 +/* Limit of smac entries in mac table */ +#define OPA_VNIC_MAX_SMAC_LIMIT 256 + +/* The last octet of the MAC address is used as the key to the hash table */ +#define OPA_VNIC_MAC_HASH_IDX 5 + +/* The VNIC MAC hash table is of size 2^8 */ +#define OPA_VNIC_MAC_TBL_HASH_BITS 8 +#define OPA_VNIC_MAC_TBL_SIZE BIT(OPA_VNIC_MAC_TBL_HASH_BITS) + +/* VNIC HASH MACROS */ +#define vnic_hash_init(hashtable) __hash_init(hashtable, OPA_VNIC_MAC_TBL_SIZE) + +#define vnic_hash_add(hashtable, node, key) \ + hlist_add_head(node, \ + &hashtable[hash_min(key, ilog2(OPA_VNIC_MAC_TBL_SIZE))]) + +#define vnic_hash_for_each_safe(name, bkt, tmp, obj, member) \ + for ((bkt) = 0, obj = NULL; \ + !obj && (bkt) < OPA_VNIC_MAC_TBL_SIZE; (bkt)++) \ + hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) + +#define vnic_hash_for_each_possible(name, obj, member, key) \ + hlist_for_each_entry(obj, \ + &name[hash_min(key, ilog2(OPA_VNIC_MAC_TBL_SIZE))], member) + +#define vnic_hash_for_each(name, bkt, obj, member) \ + for ((bkt) = 0, obj = NULL; \ + !obj && (bkt) < OPA_VNIC_MAC_TBL_SIZE; (bkt)++) \ + hlist_for_each_entry(obj, &name[bkt], member) + +extern char opa_vnic_driver_name[]; +extern const char opa_vnic_driver_version[]; + +struct opa_vnic_adapter *opa_vnic_add_netdev(struct ib_device *ibdev, + u8 port_num, u8 vport_num); +void opa_vnic_rem_netdev(struct opa_vnic_adapter *adapter); +void opa_vnic_encap_skb(struct opa_vnic_adapter *adapter, struct sk_buff *skb); +u8 opa_vnic_get_vl(struct opa_vnic_adapter *adapter, struct sk_buff *skb); +u8 opa_vnic_calc_entropy(struct opa_vnic_adapter *adapter, struct sk_buff *skb); +void opa_vnic_process_vema_config(struct opa_vnic_adapter *adapter); +void opa_vnic_release_mac_tbl(struct opa_vnic_adapter *adapter); +void opa_vnic_query_mac_tbl(struct opa_vnic_adapter *adapter, + struct opa_veswport_mactable *tbl); +int opa_vnic_update_mac_tbl(struct opa_vnic_adapter *adapter, + struct opa_veswport_mactable *tbl); +void opa_vnic_query_ucast_macs(struct opa_vnic_adapter *adapter, + struct opa_veswport_iface_macs *macs); +void opa_vnic_query_mcast_macs(struct opa_vnic_adapter *adapter, + struct opa_veswport_iface_macs *macs); +void opa_vnic_get_summary_counters(struct opa_vnic_adapter *adapter, + struct opa_veswport_summary_counters *cntrs); +void opa_vnic_get_error_counters(struct opa_vnic_adapter *adapter, + struct opa_veswport_error_counters *cntrs); +void opa_vnic_get_vesw_info(struct opa_vnic_adapter *adapter, + struct opa_vesw_info *info); +void opa_vnic_set_vesw_info(struct opa_vnic_adapter *adapter, + struct opa_vesw_info *info); +void opa_vnic_get_per_veswport_info(struct opa_vnic_adapter *adapter, + struct opa_per_veswport_info *info); +void opa_vnic_set_per_veswport_info(struct opa_vnic_adapter *adapter, + struct opa_per_veswport_info *info); +void opa_vnic_vema_report_event(struct opa_vnic_adapter *adapter, u8 event); +void opa_vnic_set_ethtool_ops(struct net_device *netdev); +void opa_vnic_vema_send_trap(struct opa_vnic_adapter *adapter, + struct __opa_veswport_trap *data, u32 lid); + +#endif /* _OPA_VNIC_INTERNAL_H */ diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c new file mode 100644 index 0000000..ce57e0f --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c @@ -0,0 +1,403 @@ +/* + * Copyright(c) 2017 Intel Corporation. 
+ * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains OPA Virtual Network Interface Controller (VNIC) driver + * netdev functionality. 
+ */ + +#include +#include +#include + +#include "opa_vnic_internal.h" + +#define OPA_TX_TIMEOUT_MS 1000 + +#define OPA_VNIC_SKB_HEADROOM \ + ALIGN((OPA_VNIC_HDR_LEN + OPA_VNIC_SKB_MDATA_LEN), 8) + +/* This function is overloaded for opa_vnic specific implementation */ +static void opa_vnic_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + struct opa_vnic_stats vstats; + + memset(&vstats, 0, sizeof(vstats)); + spin_lock(&adapter->stats_lock); + adapter->rn_ops->ndo_get_stats64(netdev, &vstats.netstats); + spin_unlock(&adapter->stats_lock); + memcpy(stats, &vstats.netstats, sizeof(*stats)); +} + +/* opa_netdev_start_xmit - transmit function */ +static netdev_tx_t opa_netdev_start_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + + v_dbg("xmit: queue %d skb len %d\n", skb->queue_mapping, skb->len); + /* pad to ensure mininum ethernet packet length */ + if (unlikely(skb->len < ETH_ZLEN)) { + if (skb_padto(skb, ETH_ZLEN)) + return NETDEV_TX_OK; + + skb_put(skb, ETH_ZLEN - skb->len); + } + + opa_vnic_encap_skb(adapter, skb); + return adapter->rn_ops->ndo_start_xmit(skb, netdev); +} + +static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb, + void *accel_priv, + select_queue_fallback_t fallback) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + struct opa_vnic_skb_mdata *mdata; + int rc; + + /* pass entropy and vl as metadata in skb */ + mdata = skb_push(skb, sizeof(*mdata)); + mdata->entropy = opa_vnic_calc_entropy(adapter, skb); + mdata->vl = opa_vnic_get_vl(adapter, skb); + rc = adapter->rn_ops->ndo_select_queue(netdev, skb, + accel_priv, fallback); + skb_pull(skb, sizeof(*mdata)); + return rc; +} + +static void opa_vnic_update_state(struct opa_vnic_adapter *adapter, bool up) +{ + struct __opa_veswport_info *info = &adapter->info; + + mutex_lock(&adapter->lock); + /* Operational state can only be DROP_ALL or FORWARDING */ + if ((info->vport.config_state == OPA_VNIC_STATE_FORWARDING) && up) { + info->vport.oper_state = OPA_VNIC_STATE_FORWARDING; + info->vport.eth_link_status = OPA_VNIC_ETH_LINK_UP; + } else { + info->vport.oper_state = OPA_VNIC_STATE_DROP_ALL; + info->vport.eth_link_status = OPA_VNIC_ETH_LINK_DOWN; + } + + if (info->vport.config_state == OPA_VNIC_STATE_FORWARDING) + netif_dormant_off(adapter->netdev); + else + netif_dormant_on(adapter->netdev); + mutex_unlock(&adapter->lock); +} + +/* opa_vnic_process_vema_config - process vema configuration updates */ +void opa_vnic_process_vema_config(struct opa_vnic_adapter *adapter) +{ + struct __opa_veswport_info *info = &adapter->info; + struct rdma_netdev *rn = netdev_priv(adapter->netdev); + u8 port_num[OPA_VESW_MAX_NUM_DEF_PORT] = { 0 }; + struct net_device *netdev = adapter->netdev; + u8 i, port_count = 0; + u16 port_mask; + + /* If the base_mac_addr is changed, update the interface mac address */ + if (memcmp(info->vport.base_mac_addr, adapter->vema_mac_addr, + ARRAY_SIZE(info->vport.base_mac_addr))) { + struct sockaddr saddr; + + memcpy(saddr.sa_data, info->vport.base_mac_addr, + ARRAY_SIZE(info->vport.base_mac_addr)); + mutex_lock(&adapter->lock); + eth_commit_mac_addr_change(netdev, &saddr); + memcpy(adapter->vema_mac_addr, + info->vport.base_mac_addr, ETH_ALEN); + mutex_unlock(&adapter->lock); + } + + rn->set_id(netdev, info->vesw.vesw_id); + + /* Handle MTU limit change */ + rtnl_lock(); + netdev->max_mtu = max_t(unsigned int, 
info->vesw.eth_mtu, + netdev->min_mtu); + if (netdev->mtu > netdev->max_mtu) + dev_set_mtu(netdev, netdev->max_mtu); + rtnl_unlock(); + + /* Update flow to default port redirection table */ + port_mask = info->vesw.def_port_mask; + for (i = 0; i < OPA_VESW_MAX_NUM_DEF_PORT; i++) { + if (port_mask & 1) + port_num[port_count++] = i; + port_mask >>= 1; + } + + /* + * Build the flow table. Flow table is required when destination LID + * is not available. Up to OPA_VNIC_FLOW_TBL_SIZE flows supported. + * Each flow need a default port number to get its dlid from the + * u_ucast_dlid array. + */ + for (i = 0; i < OPA_VNIC_FLOW_TBL_SIZE; i++) + adapter->flow_tbl[i] = port_count ? port_num[i % port_count] : + OPA_VNIC_INVALID_PORT; + + /* update state */ + opa_vnic_update_state(adapter, !!(netdev->flags & IFF_UP)); +} + +/* + * Set the power on default values in adapter's vema interface structure. + */ +static inline void opa_vnic_set_pod_values(struct opa_vnic_adapter *adapter) +{ + adapter->info.vport.max_mac_tbl_ent = OPA_VNIC_MAC_TBL_MAX_ENTRIES; + adapter->info.vport.max_smac_ent = OPA_VNIC_MAX_SMAC_LIMIT; + adapter->info.vport.config_state = OPA_VNIC_STATE_DROP_ALL; + adapter->info.vport.eth_link_status = OPA_VNIC_ETH_LINK_DOWN; + adapter->info.vesw.eth_mtu = ETH_DATA_LEN; +} + +/* opa_vnic_set_mac_addr - change mac address */ +static int opa_vnic_set_mac_addr(struct net_device *netdev, void *addr) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + struct sockaddr *sa = addr; + int rc; + + if (!memcmp(netdev->dev_addr, sa->sa_data, ETH_ALEN)) + return 0; + + mutex_lock(&adapter->lock); + rc = eth_mac_addr(netdev, addr); + mutex_unlock(&adapter->lock); + if (rc) + return rc; + + adapter->info.vport.uc_macs_gen_count++; + opa_vnic_vema_report_event(adapter, + OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE); + return 0; +} + +/* + * opa_vnic_mac_send_event - post event on possible mac list exchange + * Send trap when digest from uc/mc mac list differs from previous run. + * Digest is evaluated similar to how cksum does. 
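+ *
+ * For illustration only, the digest is roughly equivalent to
+ *
+ *	crc = 0;
+ *	netdev_hw_addr_list_for_each(ha, hw_list)
+ *		crc = crc32_le(crc, ha->addr, ETH_ALEN);
+ *	len = netdev_hw_addr_list_count(hw_list) * ETH_ALEN;
+ *	digest = ~crc32_le(crc, (void *)&len, sizeof(len));
+ *
+ * and a trap is reported only when this digest differs from the value
+ * cached in the adapter on the previous run.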
+ */ +static void opa_vnic_mac_send_event(struct net_device *netdev, u8 event) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + struct netdev_hw_addr *ha; + struct netdev_hw_addr_list *hw_list; + u32 *ref_crc; + u32 l, crc = 0; + + switch (event) { + case OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE: + hw_list = &netdev->uc; + adapter->info.vport.uc_macs_gen_count++; + ref_crc = &adapter->umac_hash; + break; + case OPA_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE: + hw_list = &netdev->mc; + adapter->info.vport.mc_macs_gen_count++; + ref_crc = &adapter->mmac_hash; + break; + default: + return; + } + netdev_hw_addr_list_for_each(ha, hw_list) { + crc = crc32_le(crc, ha->addr, ETH_ALEN); + } + l = netdev_hw_addr_list_count(hw_list) * ETH_ALEN; + crc = ~crc32_le(crc, (void *)&l, sizeof(l)); + + if (crc != *ref_crc) { + *ref_crc = crc; + opa_vnic_vema_report_event(adapter, event); + } +} + +/* opa_vnic_set_rx_mode - handle uc/mc mac list change */ +static void opa_vnic_set_rx_mode(struct net_device *netdev) +{ + opa_vnic_mac_send_event(netdev, + OPA_VESWPORT_TRAP_IFACE_UCAST_MAC_CHANGE); + + opa_vnic_mac_send_event(netdev, + OPA_VESWPORT_TRAP_IFACE_MCAST_MAC_CHANGE); +} + +/* opa_netdev_open - activate network interface */ +static int opa_netdev_open(struct net_device *netdev) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + int rc; + + rc = adapter->rn_ops->ndo_open(adapter->netdev); + if (rc) { + v_dbg("open failed %d\n", rc); + return rc; + } + + /* Update status and send trap */ + opa_vnic_update_state(adapter, true); + opa_vnic_vema_report_event(adapter, + OPA_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE); + return 0; +} + +/* opa_netdev_close - disable network interface */ +static int opa_netdev_close(struct net_device *netdev) +{ + struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); + int rc; + + rc = adapter->rn_ops->ndo_stop(adapter->netdev); + if (rc) { + v_dbg("close failed %d\n", rc); + return rc; + } + + /* Update status and send trap */ + opa_vnic_update_state(adapter, false); + opa_vnic_vema_report_event(adapter, + OPA_VESWPORT_TRAP_ETH_LINK_STATUS_CHANGE); + return 0; +} + +/* netdev ops */ +static const struct net_device_ops opa_netdev_ops = { + .ndo_open = opa_netdev_open, + .ndo_stop = opa_netdev_close, + .ndo_start_xmit = opa_netdev_start_xmit, + .ndo_get_stats64 = opa_vnic_get_stats64, + .ndo_set_rx_mode = opa_vnic_set_rx_mode, + .ndo_select_queue = opa_vnic_select_queue, + .ndo_set_mac_address = opa_vnic_set_mac_addr, +}; + +/* opa_vnic_add_netdev - create vnic netdev interface */ +struct opa_vnic_adapter *opa_vnic_add_netdev(struct ib_device *ibdev, + u8 port_num, u8 vport_num) +{ + struct opa_vnic_adapter *adapter; + struct net_device *netdev; + struct rdma_netdev *rn; + int rc; + + netdev = ibdev->alloc_rdma_netdev(ibdev, port_num, + RDMA_NETDEV_OPA_VNIC, + "veth%d", NET_NAME_UNKNOWN, + ether_setup); + if (!netdev) + return ERR_PTR(-ENOMEM); + else if (IS_ERR(netdev)) + return ERR_CAST(netdev); + + rn = netdev_priv(netdev); + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); + if (!adapter) { + rc = -ENOMEM; + goto adapter_err; + } + + rn->clnt_priv = adapter; + rn->hca = ibdev; + rn->port_num = port_num; + adapter->netdev = netdev; + adapter->ibdev = ibdev; + adapter->port_num = port_num; + adapter->vport_num = vport_num; + adapter->rn_ops = netdev->netdev_ops; + + netdev->netdev_ops = &opa_netdev_ops; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netdev->hard_header_len += OPA_VNIC_SKB_HEADROOM; + mutex_init(&adapter->lock); + 
mutex_init(&adapter->mactbl_lock); + spin_lock_init(&adapter->stats_lock); + + SET_NETDEV_DEV(netdev, ibdev->dev.parent); + + opa_vnic_set_ethtool_ops(netdev); + + opa_vnic_set_pod_values(adapter); + + rc = register_netdev(netdev); + if (rc) + goto netdev_err; + + netif_carrier_off(netdev); + netif_dormant_on(netdev); + v_info("initialized\n"); + + return adapter; +netdev_err: + mutex_destroy(&adapter->lock); + mutex_destroy(&adapter->mactbl_lock); + kfree(adapter); +adapter_err: + rn->free_rdma_netdev(netdev); + + return ERR_PTR(rc); +} + +/* opa_vnic_rem_netdev - remove vnic netdev interface */ +void opa_vnic_rem_netdev(struct opa_vnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct rdma_netdev *rn = netdev_priv(netdev); + + v_info("removing\n"); + unregister_netdev(netdev); + opa_vnic_release_mac_tbl(adapter); + mutex_destroy(&adapter->lock); + mutex_destroy(&adapter->mactbl_lock); + kfree(adapter); + rn->free_rdma_netdev(netdev); +} diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c new file mode 100644 index 0000000..15711dc --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -0,0 +1,1076 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +/* + * This file contains OPA Virtual Network Interface Controller (VNIC) + * Ethernet Management Agent (EMA) driver + */ + +#include +#include +#include +#include +#include + +#include "opa_vnic_internal.h" + +#define DRV_VERSION "1.0" +char opa_vnic_driver_name[] = "opa_vnic"; +const char opa_vnic_driver_version[] = DRV_VERSION; + +/* + * The trap service level is kept in bits 3 to 7 in the trap_sl_rsvd + * field in the class port info MAD. + */ +#define GET_TRAP_SL_FROM_CLASS_PORT_INFO(x) (((x) >> 3) & 0x1f) + +/* Cap trap bursts to a reasonable limit good for normal cases */ +#define OPA_VNIC_TRAP_BURST_LIMIT 4 + +/* + * VNIC trap limit timeout. + * Inverse of cap2_mask response time out (1.0737 secs) = 0.9 + * secs approx IB spec 13.4.6.2.1 PortInfoSubnetTimeout and + * 13.4.9 Traps. + */ +#define OPA_VNIC_TRAP_TIMEOUT ((4096 * (1UL << 18)) / 1000) + +#define OPA_VNIC_UNSUP_ATTR \ + cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB) + +#define OPA_VNIC_INVAL_ATTR \ + cpu_to_be16(IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE) + +#define OPA_VNIC_CLASS_CAP_TRAP 0x1 + +/* Maximum number of VNIC ports supported */ +#define OPA_VNIC_MAX_NUM_VPORT 255 + +/** + * struct opa_vnic_vema_port -- VNIC VEMA port details + * @cport: pointer to port + * @mad_agent: pointer to mad agent for port + * @class_port_info: Class port info information. + * @tid: Transaction id + * @port_num: OPA port number + * @vport_idr: vnic ports idr + * @event_handler: ib event handler + * @lock: adapter interface lock + */ +struct opa_vnic_vema_port { + struct opa_vnic_ctrl_port *cport; + struct ib_mad_agent *mad_agent; + struct opa_class_port_info class_port_info; + u64 tid; + u8 port_num; + struct idr vport_idr; + struct ib_event_handler event_handler; + + /* Lock to query/update network adapter */ + struct mutex lock; +}; + +static void opa_vnic_vema_add_one(struct ib_device *device); +static void opa_vnic_vema_rem_one(struct ib_device *device, + void *client_data); + +static struct ib_client opa_vnic_client = { + .name = opa_vnic_driver_name, + .add = opa_vnic_vema_add_one, + .remove = opa_vnic_vema_rem_one, +}; + +/** + * vema_get_vport_num -- Get the vnic from the mad + * @recvd_mad: Received mad + * + * Return: returns value of the vnic port number + */ +static inline u8 vema_get_vport_num(struct opa_vnic_vema_mad *recvd_mad) +{ + return be32_to_cpu(recvd_mad->mad_hdr.attr_mod) & 0xff; +} + +/** + * vema_get_vport_adapter -- Get vnic port adapter from recvd mad + * @recvd_mad: received mad + * @port: ptr to port struct on which MAD was recvd + * + * Return: vnic adapter + */ +static inline struct opa_vnic_adapter * +vema_get_vport_adapter(struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_port *port) +{ + u8 vport_num = vema_get_vport_num(recvd_mad); + + return idr_find(&port->vport_idr, vport_num); +} + +/** + * vema_mac_tbl_req_ok -- Check if mac request has correct values + * @mac_tbl: mac table + * + * This function checks for the validity of the offset and number of + * entries required. 
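+ *
+ * For example (illustrative numbers only): a single EMA MAD payload of
+ * OPA_VNIC_EMA_DATA bytes can carry at most
+ * (OPA_VNIC_EMA_DATA - sizeof(*mac_tbl)) / sizeof(mac_tbl->tbl_entries[0])
+ * entries, and a request with offset 2000 and num_entries 100 is rejected
+ * because 2000 + 100 exceeds OPA_VNIC_MAC_TBL_MAX_ENTRIES (2048).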
+ *
+ * Return: true if offset and num_entries are valid
+ */
+static inline bool vema_mac_tbl_req_ok(struct opa_veswport_mactable *mac_tbl)
+{
+	u16 offset, num_entries;
+	u16 req_entries = ((OPA_VNIC_EMA_DATA - sizeof(*mac_tbl)) /
+			   sizeof(mac_tbl->tbl_entries[0]));
+
+	offset = be16_to_cpu(mac_tbl->offset);
+	num_entries = be16_to_cpu(mac_tbl->num_entries);
+
+	return ((num_entries <= req_entries) &&
+		(offset + num_entries <= OPA_VNIC_MAC_TBL_MAX_ENTRIES));
+}
+
+/*
+ * Return the power on default values in the port info structure
+ * in big endian format as required by MAD.
+ */
+static inline void vema_get_pod_values(struct opa_veswport_info *port_info)
+{
+	memset(port_info, 0, sizeof(*port_info));
+	port_info->vport.max_mac_tbl_ent =
+		cpu_to_be16(OPA_VNIC_MAC_TBL_MAX_ENTRIES);
+	port_info->vport.max_smac_ent =
+		cpu_to_be16(OPA_VNIC_MAX_SMAC_LIMIT);
+	port_info->vport.oper_state = OPA_VNIC_STATE_DROP_ALL;
+	port_info->vport.config_state = OPA_VNIC_STATE_DROP_ALL;
+	port_info->vesw.eth_mtu = cpu_to_be16(ETH_DATA_LEN);
+}
+
+/**
+ * vema_add_vport -- Add a new vnic port
+ * @port: ptr to opa_vnic_vema_port struct
+ * @vport_num: vnic port number (to be added)
+ *
+ * Return: a pointer to the vnic adapter structure
+ */
+static struct opa_vnic_adapter *vema_add_vport(struct opa_vnic_vema_port *port,
+					       u8 vport_num)
+{
+	struct opa_vnic_ctrl_port *cport = port->cport;
+	struct opa_vnic_adapter *adapter;
+
+	adapter = opa_vnic_add_netdev(cport->ibdev, port->port_num, vport_num);
+	if (!IS_ERR(adapter)) {
+		int rc;
+
+		adapter->cport = cport;
+		rc = idr_alloc(&port->vport_idr, adapter, vport_num,
+			       vport_num + 1, GFP_NOWAIT);
+		if (rc < 0) {
+			opa_vnic_rem_netdev(adapter);
+			adapter = ERR_PTR(rc);
+		}
+	}
+
+	return adapter;
+}
+
+/**
+ * vema_get_class_port_info -- Get class info for port
+ * @port: Port on which MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad: pointer to response mad
+ *
+ * This function copies the latest class port info value set for the
+ * port and stores it for generating traps.
+ */
+static void vema_get_class_port_info(struct opa_vnic_vema_port *port,
+				     struct opa_vnic_vema_mad *recvd_mad,
+				     struct opa_vnic_vema_mad *rsp_mad)
+{
+	struct opa_class_port_info *port_info;
+
+	port_info = (struct opa_class_port_info *)rsp_mad->data;
+	memcpy(port_info, &port->class_port_info, sizeof(*port_info));
+	port_info->base_version = OPA_MGMT_BASE_VERSION;
+	port_info->class_version = OPA_EMA_CLASS_VERSION;
+
+	/*
+	 * Set capability mask bit indicating agent generates traps,
+	 * and set the maximum number of VNIC ports supported.
+	 */
+	port_info->cap_mask = cpu_to_be16((OPA_VNIC_CLASS_CAP_TRAP |
+					   (OPA_VNIC_MAX_NUM_VPORT << 8)));
+
+	/*
+	 * Since a get routine is always sent by the EM first we
+	 * set the expected response time to
+	 * 4.096 usec * 2^18 == 1.0737 sec here.
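+	 * That is, a value N in cap_mask2_resp_time encodes a response
+	 * time of 4.096 usec * 2^N, so the 18 programmed below corresponds
+	 * to roughly 1.07 seconds.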
+	 */
+	port_info->cap_mask2_resp_time = cpu_to_be32(18);
+}
+
+/**
+ * vema_set_class_port_info -- Set class info for port
+ * @port: Port on which MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad: pointer to response mad
+ *
+ * This function updates the port class info for the specific vnic
+ * and sets up the response mad data.
+ */
+static void vema_set_class_port_info(struct opa_vnic_vema_port *port,
+				     struct opa_vnic_vema_mad *recvd_mad,
+				     struct opa_vnic_vema_mad *rsp_mad)
+{
+	memcpy(&port->class_port_info, recvd_mad->data,
+	       sizeof(port->class_port_info));
+
+	vema_get_class_port_info(port, recvd_mad, rsp_mad);
+}
+
+/**
+ * vema_get_veswport_info -- Get veswport info
+ * @port: source port on which MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad: pointer to response mad
+ */
+static void vema_get_veswport_info(struct opa_vnic_vema_port *port,
+				   struct opa_vnic_vema_mad *recvd_mad,
+				   struct opa_vnic_vema_mad *rsp_mad)
+{
+	struct opa_veswport_info *port_info =
+				  (struct opa_veswport_info *)rsp_mad->data;
+	struct opa_vnic_adapter *adapter;
+
+	adapter = vema_get_vport_adapter(recvd_mad, port);
+	if (adapter) {
+		memset(port_info, 0, sizeof(*port_info));
+		opa_vnic_get_vesw_info(adapter, &port_info->vesw);
+		opa_vnic_get_per_veswport_info(adapter,
+					       &port_info->vport);
+	} else {
+		vema_get_pod_values(port_info);
+	}
+}
+
+/**
+ * vema_set_veswport_info -- Set veswport info
+ * @port: source port on which MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad: pointer to response mad
+ *
+ * This function updates the veswport info for the given vnic port,
+ * creating the port if it does not exist yet, and sets up the
+ * response mad data.
+ */
+static void vema_set_veswport_info(struct opa_vnic_vema_port *port,
+				   struct opa_vnic_vema_mad *recvd_mad,
+				   struct opa_vnic_vema_mad *rsp_mad)
+{
+	struct opa_vnic_ctrl_port *cport = port->cport;
+	struct opa_veswport_info *port_info;
+	struct opa_vnic_adapter *adapter;
+	u8 vport_num;
+
+	vport_num = vema_get_vport_num(recvd_mad);
+
+	adapter = vema_get_vport_adapter(recvd_mad, port);
+	if (!adapter) {
+		adapter = vema_add_vport(port, vport_num);
+		if (IS_ERR(adapter)) {
+			c_err("failed to add vport %d: %ld\n",
+			      vport_num, PTR_ERR(adapter));
+			goto err_exit;
+		}
+	}
+
+	port_info = (struct opa_veswport_info *)recvd_mad->data;
+	opa_vnic_set_vesw_info(adapter, &port_info->vesw);
+	opa_vnic_set_per_veswport_info(adapter, &port_info->vport);
+
+	/* Process the new config settings */
+	opa_vnic_process_vema_config(adapter);
+
+	vema_get_veswport_info(port, recvd_mad, rsp_mad);
+	return;
+
+err_exit:
+	rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR;
+}
+
+/**
+ * vema_get_mac_entries -- Get MAC entries in VNIC MAC table
+ * @port: source port on which MAD was received
+ * @recvd_mad: pointer to the received mad
+ * @rsp_mad: pointer to response mad
+ *
+ * This function gets the MAC entries that are programmed into
+ * the VNIC MAC forwarding table. It checks for the validity of
+ * the index into the MAC table and the number of entries that
+ * are to be retrieved.
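+ *
+ * For example (hypothetical request): an offset of 512 with num_entries
+ * of 64 returns forwarding table entries 512..575 in the response MAD,
+ * provided the request passes vema_mac_tbl_req_ok().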
+ */ +static void vema_get_mac_entries(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_veswport_mactable *mac_tbl_in, *mac_tbl_out; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (!adapter) { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + return; + } + + mac_tbl_in = (struct opa_veswport_mactable *)recvd_mad->data; + mac_tbl_out = (struct opa_veswport_mactable *)rsp_mad->data; + + if (vema_mac_tbl_req_ok(mac_tbl_in)) { + mac_tbl_out->offset = mac_tbl_in->offset; + mac_tbl_out->num_entries = mac_tbl_in->num_entries; + opa_vnic_query_mac_tbl(adapter, mac_tbl_out); + } else { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + } +} + +/** + * vema_set_mac_entries -- Set MAC entries in VNIC MAC table + * @port: source port on which MAD was received + * @recvd_mad: pointer to the received mad + * @rsp_mad: pointer to respose mad + * + * This function sets the MAC entries in the VNIC forwarding table + * It checks for the validity of the index and the number of forwarding + * table entries to be programmed. + */ +static void vema_set_mac_entries(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_veswport_mactable *mac_tbl; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (!adapter) { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + return; + } + + mac_tbl = (struct opa_veswport_mactable *)recvd_mad->data; + if (vema_mac_tbl_req_ok(mac_tbl)) { + if (opa_vnic_update_mac_tbl(adapter, mac_tbl)) + rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; + } else { + rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; + } + vema_get_mac_entries(port, recvd_mad, rsp_mad); +} + +/** + * vema_set_delete_vesw -- Reset VESW info to POD values + * @port: source port on which MAD was received + * @recvd_mad: pointer to the received mad + * @rsp_mad: pointer to respose mad + * + * This function clears all the fields of veswport info for the requested vesw + * and sets them back to the power-on default values. It does not delete the + * vesw. + */ +static void vema_set_delete_vesw(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_veswport_info *port_info = + (struct opa_veswport_info *)rsp_mad->data; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (!adapter) { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + return; + } + + vema_get_pod_values(port_info); + opa_vnic_set_vesw_info(adapter, &port_info->vesw); + opa_vnic_set_per_veswport_info(adapter, &port_info->vport); + + /* Process the new config settings */ + opa_vnic_process_vema_config(adapter); + + opa_vnic_release_mac_tbl(adapter); + + vema_get_veswport_info(port, recvd_mad, rsp_mad); +} + +/** + * vema_get_mac_list -- Get the unicast/multicast macs. 
+ * @port: source port on which MAD was received + * @recvd_mad: Received mad contains fields to set vnic parameters + * @rsp_mad: Response mad to be built + * @attr_id: Attribute ID indicating multicast or unicast mac list + */ +static void vema_get_mac_list(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad, + u16 attr_id) +{ + struct opa_veswport_iface_macs *macs_in, *macs_out; + int max_entries = (OPA_VNIC_EMA_DATA - sizeof(*macs_out)) / ETH_ALEN; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (!adapter) { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + return; + } + + macs_in = (struct opa_veswport_iface_macs *)recvd_mad->data; + macs_out = (struct opa_veswport_iface_macs *)rsp_mad->data; + + macs_out->start_idx = macs_in->start_idx; + if (macs_in->num_macs_in_msg) + macs_out->num_macs_in_msg = macs_in->num_macs_in_msg; + else + macs_out->num_macs_in_msg = cpu_to_be16(max_entries); + + if (attr_id == OPA_EM_ATTR_IFACE_MCAST_MACS) + opa_vnic_query_mcast_macs(adapter, macs_out); + else + opa_vnic_query_ucast_macs(adapter, macs_out); +} + +/** + * vema_get_summary_counters -- Gets summary counters. + * @port: source port on which MAD was received + * @recvd_mad: Received mad contains fields to set vnic parameters + * @rsp_mad: Response mad to be built + */ +static void vema_get_summary_counters(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_veswport_summary_counters *cntrs; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (adapter) { + cntrs = (struct opa_veswport_summary_counters *)rsp_mad->data; + opa_vnic_get_summary_counters(adapter, cntrs); + } else { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + } +} + +/** + * vema_get_error_counters -- Gets summary counters. 
+ * @port: source port on which MAD was received + * @recvd_mad: Received mad contains fields to set vnic parameters + * @rsp_mad: Response mad to be built + */ +static void vema_get_error_counters(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + struct opa_veswport_error_counters *cntrs; + struct opa_vnic_adapter *adapter; + + adapter = vema_get_vport_adapter(recvd_mad, port); + if (adapter) { + cntrs = (struct opa_veswport_error_counters *)rsp_mad->data; + opa_vnic_get_error_counters(adapter, cntrs); + } else { + rsp_mad->mad_hdr.status = OPA_VNIC_INVAL_ATTR; + } +} + +/** + * vema_get -- Process received get MAD + * @port: source port on which MAD was received + * @recvd_mad: Received mad + * @rsp_mad: Response mad to be built + */ +static void vema_get(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + u16 attr_id = be16_to_cpu(recvd_mad->mad_hdr.attr_id); + + switch (attr_id) { + case OPA_EM_ATTR_CLASS_PORT_INFO: + vema_get_class_port_info(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_VESWPORT_INFO: + vema_get_veswport_info(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_VESWPORT_MAC_ENTRIES: + vema_get_mac_entries(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_IFACE_UCAST_MACS: + /* fall through */ + case OPA_EM_ATTR_IFACE_MCAST_MACS: + vema_get_mac_list(port, recvd_mad, rsp_mad, attr_id); + break; + case OPA_EM_ATTR_VESWPORT_SUMMARY_COUNTERS: + vema_get_summary_counters(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_VESWPORT_ERROR_COUNTERS: + vema_get_error_counters(port, recvd_mad, rsp_mad); + break; + default: + rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; + break; + } +} + +/** + * vema_set -- Process received set MAD + * @port: source port on which MAD was received + * @recvd_mad: Received mad contains fields to set vnic parameters + * @rsp_mad: Response mad to be built + */ +static void vema_set(struct opa_vnic_vema_port *port, + struct opa_vnic_vema_mad *recvd_mad, + struct opa_vnic_vema_mad *rsp_mad) +{ + u16 attr_id = be16_to_cpu(recvd_mad->mad_hdr.attr_id); + + switch (attr_id) { + case OPA_EM_ATTR_CLASS_PORT_INFO: + vema_set_class_port_info(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_VESWPORT_INFO: + vema_set_veswport_info(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_VESWPORT_MAC_ENTRIES: + vema_set_mac_entries(port, recvd_mad, rsp_mad); + break; + case OPA_EM_ATTR_DELETE_VESW: + vema_set_delete_vesw(port, recvd_mad, rsp_mad); + break; + default: + rsp_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR; + break; + } +} + +/** + * vema_send -- Send handler for VEMA MAD agent + * @mad_agent: pointer to the mad agent + * @mad_wc: pointer to mad send work completion information + * + * Free all the data structures associated with the sent MAD + */ +static void vema_send(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_wc) +{ + rdma_destroy_ah(mad_wc->send_buf->ah); + ib_free_send_mad(mad_wc->send_buf); +} + +/** + * vema_recv -- Recv handler for VEMA MAD agent + * @mad_agent: pointer to the mad agent + * @send_buf: Send buffer if found, else NULL + * @mad_wc: pointer to mad send work completion information + * + * Handle only set and get methods and respond to other methods + * as unsupported. Allocate response buffer and address handle + * for the response MAD. 
+ */
+static void vema_recv(struct ib_mad_agent *mad_agent,
+		      struct ib_mad_send_buf *send_buf,
+		      struct ib_mad_recv_wc *mad_wc)
+{
+	struct opa_vnic_vema_port *port;
+	struct ib_ah *ah;
+	struct ib_mad_send_buf *rsp;
+	struct opa_vnic_vema_mad *vema_mad;
+
+	if (!mad_wc || !mad_wc->recv_buf.mad)
+		return;
+
+	port = mad_agent->context;
+	ah = ib_create_ah_from_wc(mad_agent->qp->pd, mad_wc->wc,
+				  mad_wc->recv_buf.grh, mad_agent->port_num);
+	if (IS_ERR(ah))
+		goto free_recv_mad;
+
+	rsp = ib_create_send_mad(mad_agent, mad_wc->wc->src_qp,
+				 mad_wc->wc->pkey_index, 0,
+				 IB_MGMT_VENDOR_HDR, OPA_VNIC_EMA_DATA,
+				 GFP_KERNEL, OPA_MGMT_BASE_VERSION);
+	if (IS_ERR(rsp))
+		goto err_rsp;
+
+	rsp->ah = ah;
+	vema_mad = rsp->mad;
+	memcpy(vema_mad, mad_wc->recv_buf.mad, IB_MGMT_VENDOR_HDR);
+	vema_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+	vema_mad->mad_hdr.status = 0;
+
+	/* Lock ensures network adapter is not removed */
+	mutex_lock(&port->lock);
+
+	switch (mad_wc->recv_buf.mad->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET:
+		vema_get(port, (struct opa_vnic_vema_mad *)mad_wc->recv_buf.mad,
+			 vema_mad);
+		break;
+	case IB_MGMT_METHOD_SET:
+		vema_set(port, (struct opa_vnic_vema_mad *)mad_wc->recv_buf.mad,
+			 vema_mad);
+		break;
+	default:
+		vema_mad->mad_hdr.status = OPA_VNIC_UNSUP_ATTR;
+		break;
+	}
+	mutex_unlock(&port->lock);
+
+	if (!ib_post_send_mad(rsp, NULL)) {
+		/*
+		 * with post send successful ah and send mad
+		 * will be destroyed in send handler
+		 */
+		goto free_recv_mad;
+	}
+
+	ib_free_send_mad(rsp);
+
+err_rsp:
+	rdma_destroy_ah(ah);
+free_recv_mad:
+	ib_free_recv_mad(mad_wc);
+}
+
+/**
+ * vema_get_port -- Gets the opa_vnic_vema_port
+ * @cport: pointer to control dev
+ * @port_num: Port number
+ *
+ * This function returns the opa_vnic_vema port structure that is
+ * associated with the given OPA port number.
+ *
+ * Return: ptr to requested opa_vnic_vema_port structure
+ *         if success, NULL if not
+ */
+static struct opa_vnic_vema_port *
+vema_get_port(struct opa_vnic_ctrl_port *cport, u8 port_num)
+{
+	struct opa_vnic_vema_port *port = (void *)cport + sizeof(*cport);
+
+	if (port_num > cport->num_ports)
+		return NULL;
+
+	return port + (port_num - 1);
+}
+
+/**
+ * opa_vnic_vema_send_trap -- This function sends a trap to the EM
+ * @adapter: pointer to vnic adapter
+ * @data: pointer to trap data filled by calling function
+ * @lid: issuer's lid (encap_slid from vesw_port_info)
+ *
+ * This function is called from the VNIC driver to send a trap if there
+ * is something the EM should be notified about. These events currently
+ * are
+ * 1) UNICAST INTERFACE MACADDRESS changes
+ * 2) MULTICAST INTERFACE MACADDRESS changes
+ * 3) ETHERNET LINK STATUS changes
+ * While allocating the send mad the remote side qpn used is 1
+ * as this is the well known QP.
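+ *
+ * A minimal (illustrative) caller fills a trap structure and passes it to
+ * this routine, for example:
+ *
+ *	struct __opa_veswport_trap trap = {
+ *		.fabric_id = info->vesw.fabric_id,
+ *		.veswid = info->vesw.vesw_id,
+ *		.veswportnum = info->vport.port_num,
+ *		.opaportnum = adapter->port_num,
+ *		.veswportindex = adapter->vport_num,
+ *		.opcode = event,
+ *	};
+ *	opa_vnic_vema_send_trap(adapter, &trap, info->vport.encap_slid);
+ *
+ * which is essentially what opa_vnic_vema_report_event() does.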
+ * + */ +void opa_vnic_vema_send_trap(struct opa_vnic_adapter *adapter, + struct __opa_veswport_trap *data, u32 lid) +{ + struct opa_vnic_ctrl_port *cport = adapter->cport; + struct ib_mad_send_buf *send_buf; + struct opa_vnic_vema_port *port; + struct ib_device *ibp; + struct opa_vnic_vema_mad_trap *trap_mad; + struct opa_class_port_info *class; + struct rdma_ah_attr ah_attr; + struct ib_ah *ah; + struct opa_veswport_trap *trap; + u32 trap_lid; + u16 pkey_idx; + + if (!cport) + goto err_exit; + ibp = cport->ibdev; + port = vema_get_port(cport, data->opaportnum); + if (!port || !port->mad_agent) + goto err_exit; + + if (time_before(jiffies, adapter->trap_timeout)) { + if (adapter->trap_count == OPA_VNIC_TRAP_BURST_LIMIT) { + v_warn("Trap rate exceeded\n"); + goto err_exit; + } else { + adapter->trap_count++; + } + } else { + adapter->trap_count = 0; + } + + class = &port->class_port_info; + /* Set up address handle */ + memset(&ah_attr, 0, sizeof(ah_attr)); + ah_attr.type = rdma_ah_find_type(ibp, port->port_num); + rdma_ah_set_sl(&ah_attr, + GET_TRAP_SL_FROM_CLASS_PORT_INFO(class->trap_sl_rsvd)); + rdma_ah_set_port_num(&ah_attr, port->port_num); + trap_lid = be32_to_cpu(class->trap_lid); + /* + * check for trap lid validity, must not be zero + * The trap sink could change after we fashion the MAD but since traps + * are not guaranteed we won't use a lock as anyway the change will take + * place even with locking. + */ + if (!trap_lid) { + c_err("%s: Invalid dlid\n", __func__); + goto err_exit; + } + + rdma_ah_set_dlid(&ah_attr, trap_lid); + ah = rdma_create_ah(port->mad_agent->qp->pd, &ah_attr); + if (IS_ERR(ah)) { + c_err("%s:Couldn't create new AH = %p\n", __func__, ah); + c_err("%s:dlid = %d, sl = %d, port = %d\n", __func__, + rdma_ah_get_dlid(&ah_attr), rdma_ah_get_sl(&ah_attr), + rdma_ah_get_port_num(&ah_attr)); + goto err_exit; + } + + if (ib_find_pkey(ibp, data->opaportnum, IB_DEFAULT_PKEY_FULL, + &pkey_idx) < 0) { + c_err("%s:full key not found, defaulting to partial\n", + __func__); + if (ib_find_pkey(ibp, data->opaportnum, IB_DEFAULT_PKEY_PARTIAL, + &pkey_idx) < 0) + pkey_idx = 1; + } + + send_buf = ib_create_send_mad(port->mad_agent, 1, pkey_idx, 0, + IB_MGMT_VENDOR_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, OPA_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) { + c_err("%s:Couldn't allocate send buf\n", __func__); + goto err_sndbuf; + } + + send_buf->ah = ah; + + /* Set up common MAD hdr */ + trap_mad = send_buf->mad; + trap_mad->mad_hdr.base_version = OPA_MGMT_BASE_VERSION; + trap_mad->mad_hdr.mgmt_class = OPA_MGMT_CLASS_INTEL_EMA; + trap_mad->mad_hdr.class_version = OPA_EMA_CLASS_VERSION; + trap_mad->mad_hdr.method = IB_MGMT_METHOD_TRAP; + port->tid++; + trap_mad->mad_hdr.tid = cpu_to_be64(port->tid); + trap_mad->mad_hdr.attr_id = IB_SMP_ATTR_NOTICE; + + /* Set up vendor OUI */ + trap_mad->oui[0] = INTEL_OUI_1; + trap_mad->oui[1] = INTEL_OUI_2; + trap_mad->oui[2] = INTEL_OUI_3; + + /* Setup notice attribute portion */ + trap_mad->notice.gen_type = OPA_INTEL_EMA_NOTICE_TYPE_INFO << 1; + trap_mad->notice.oui_1 = INTEL_OUI_1; + trap_mad->notice.oui_2 = INTEL_OUI_2; + trap_mad->notice.oui_3 = INTEL_OUI_3; + trap_mad->notice.issuer_lid = cpu_to_be32(lid); + + /* copy the actual trap data */ + trap = (struct opa_veswport_trap *)trap_mad->notice.raw_data; + trap->fabric_id = cpu_to_be16(data->fabric_id); + trap->veswid = cpu_to_be16(data->veswid); + trap->veswportnum = cpu_to_be32(data->veswportnum); + trap->opaportnum = cpu_to_be16(data->opaportnum); + trap->veswportindex = 
data->veswportindex; + trap->opcode = data->opcode; + + /* If successful send set up rate limit timeout else bail */ + if (ib_post_send_mad(send_buf, NULL)) { + ib_free_send_mad(send_buf); + } else { + if (adapter->trap_count) + return; + adapter->trap_timeout = jiffies + + usecs_to_jiffies(OPA_VNIC_TRAP_TIMEOUT); + return; + } + +err_sndbuf: + rdma_destroy_ah(ah); +err_exit: + v_err("Aborting trap\n"); +} + +static int vema_rem_vport(int id, void *p, void *data) +{ + struct opa_vnic_adapter *adapter = p; + + opa_vnic_rem_netdev(adapter); + return 0; +} + +static int vema_enable_vport(int id, void *p, void *data) +{ + struct opa_vnic_adapter *adapter = p; + + netif_carrier_on(adapter->netdev); + return 0; +} + +static int vema_disable_vport(int id, void *p, void *data) +{ + struct opa_vnic_adapter *adapter = p; + + netif_carrier_off(adapter->netdev); + return 0; +} + +static void opa_vnic_event(struct ib_event_handler *handler, + struct ib_event *record) +{ + struct opa_vnic_vema_port *port = + container_of(handler, struct opa_vnic_vema_port, event_handler); + struct opa_vnic_ctrl_port *cport = port->cport; + + if (record->element.port_num != port->port_num) + return; + + c_dbg("OPA_VNIC received event %d on device %s port %d\n", + record->event, record->device->name, record->element.port_num); + + if (record->event == IB_EVENT_PORT_ERR) + idr_for_each(&port->vport_idr, vema_disable_vport, NULL); + if (record->event == IB_EVENT_PORT_ACTIVE) + idr_for_each(&port->vport_idr, vema_enable_vport, NULL); +} + +/** + * vema_unregister -- Unregisters agent + * @cport: pointer to control port + * + * This deletes the registration by VEMA for MADs + */ +static void vema_unregister(struct opa_vnic_ctrl_port *cport) +{ + int i; + + for (i = 1; i <= cport->num_ports; i++) { + struct opa_vnic_vema_port *port = vema_get_port(cport, i); + + if (!port->mad_agent) + continue; + + /* Lock ensures no MAD is being processed */ + mutex_lock(&port->lock); + idr_for_each(&port->vport_idr, vema_rem_vport, NULL); + mutex_unlock(&port->lock); + + ib_unregister_mad_agent(port->mad_agent); + port->mad_agent = NULL; + mutex_destroy(&port->lock); + idr_destroy(&port->vport_idr); + ib_unregister_event_handler(&port->event_handler); + } +} + +/** + * vema_register -- Registers agent + * @cport: pointer to control port + * + * This function registers the handlers for the VEMA MADs + * + * Return: returns 0 on success. 
non zero otherwise + */ +static int vema_register(struct opa_vnic_ctrl_port *cport) +{ + struct ib_mad_reg_req reg_req = { + .mgmt_class = OPA_MGMT_CLASS_INTEL_EMA, + .mgmt_class_version = OPA_MGMT_BASE_VERSION, + .oui = { INTEL_OUI_1, INTEL_OUI_2, INTEL_OUI_3 } + }; + int i; + + set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask); + set_bit(IB_MGMT_METHOD_SET, reg_req.method_mask); + + /* register ib event handler and mad agent for each port on dev */ + for (i = 1; i <= cport->num_ports; i++) { + struct opa_vnic_vema_port *port = vema_get_port(cport, i); + int ret; + + port->cport = cport; + port->port_num = i; + + INIT_IB_EVENT_HANDLER(&port->event_handler, + cport->ibdev, opa_vnic_event); + ib_register_event_handler(&port->event_handler); + + idr_init(&port->vport_idr); + mutex_init(&port->lock); + port->mad_agent = ib_register_mad_agent(cport->ibdev, i, + IB_QPT_GSI, ®_req, + IB_MGMT_RMPP_VERSION, + vema_send, vema_recv, + port, 0); + if (IS_ERR(port->mad_agent)) { + ret = PTR_ERR(port->mad_agent); + port->mad_agent = NULL; + mutex_destroy(&port->lock); + idr_destroy(&port->vport_idr); + vema_unregister(cport); + return ret; + } + } + + return 0; +} + +/** + * opa_vnic_ctrl_config_dev -- This function sends a trap to the EM + * by way of ib_modify_port to indicate support for ethernet on the + * fabric. + * @cport: pointer to control port + * @en: enable or disable ethernet on fabric support + */ +static void opa_vnic_ctrl_config_dev(struct opa_vnic_ctrl_port *cport, bool en) +{ + struct ib_port_modify pm = { 0 }; + int i; + + if (en) + pm.set_port_cap_mask = OPA_CAP_MASK3_IsEthOnFabricSupported; + else + pm.clr_port_cap_mask = OPA_CAP_MASK3_IsEthOnFabricSupported; + + for (i = 1; i <= cport->num_ports; i++) + ib_modify_port(cport->ibdev, i, IB_PORT_OPA_MASK_CHG, &pm); +} + +/** + * opa_vnic_vema_add_one -- Handle new ib device + * @device: ib device pointer + * + * Allocate the vnic control port and initialize it. + */ +static void opa_vnic_vema_add_one(struct ib_device *device) +{ + struct opa_vnic_ctrl_port *cport; + int rc, size = sizeof(*cport); + + if (!rdma_cap_opa_vnic(device)) + return; + + size += device->phys_port_cnt * sizeof(struct opa_vnic_vema_port); + cport = kzalloc(size, GFP_KERNEL); + if (!cport) + return; + + cport->num_ports = device->phys_port_cnt; + cport->ibdev = device; + + /* Initialize opa vnic management agent (vema) */ + rc = vema_register(cport); + if (!rc) + c_info("VNIC client initialized\n"); + + ib_set_client_data(device, &opa_vnic_client, cport); + opa_vnic_ctrl_config_dev(cport, true); +} + +/** + * opa_vnic_vema_rem_one -- Handle ib device removal + * @device: ib device pointer + * @client_data: ib client data + * + * Uninitialize and free the vnic control port. 
+ */ +static void opa_vnic_vema_rem_one(struct ib_device *device, + void *client_data) +{ + struct opa_vnic_ctrl_port *cport = client_data; + + if (!cport) + return; + + c_info("removing VNIC client\n"); + opa_vnic_ctrl_config_dev(cport, false); + vema_unregister(cport); + kfree(cport); +} + +static int __init opa_vnic_init(void) +{ + int rc; + + pr_info("OPA Virtual Network Driver - v%s\n", + opa_vnic_driver_version); + + rc = ib_register_client(&opa_vnic_client); + if (rc) + pr_err("VNIC driver register failed %d\n", rc); + + return rc; +} +module_init(opa_vnic_init); + +static void opa_vnic_deinit(void) +{ + ib_unregister_client(&opa_vnic_client); +} +module_exit(opa_vnic_deinit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Intel OPA Virtual Network driver"); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c new file mode 100644 index 0000000..868b5ae --- /dev/null +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c @@ -0,0 +1,390 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains OPA VNIC EMA Interface functions. 
+ */
+
+#include "opa_vnic_internal.h"
+
+/**
+ * opa_vnic_vema_report_event - send trap to report the specified event
+ * @adapter: vnic port adapter
+ * @event: event to be reported
+ *
+ * This function calls the vema api to send a trap for the given event.
+ */
+void opa_vnic_vema_report_event(struct opa_vnic_adapter *adapter, u8 event)
+{
+	struct __opa_veswport_info *info = &adapter->info;
+	struct __opa_veswport_trap trap_data;
+
+	trap_data.fabric_id = info->vesw.fabric_id;
+	trap_data.veswid = info->vesw.vesw_id;
+	trap_data.veswportnum = info->vport.port_num;
+	trap_data.opaportnum = adapter->port_num;
+	trap_data.veswportindex = adapter->vport_num;
+	trap_data.opcode = event;
+
+	opa_vnic_vema_send_trap(adapter, &trap_data, info->vport.encap_slid);
+}
+
+/**
+ * opa_vnic_get_summary_counters - get summary counters
+ * @adapter: vnic port adapter
+ * @cntrs: pointer to destination summary counters structure
+ *
+ * This function populates the summary counters that are maintained by the
+ * given adapter into the destination address provided.
+ */
+void opa_vnic_get_summary_counters(struct opa_vnic_adapter *adapter,
+				   struct opa_veswport_summary_counters *cntrs)
+{
+	struct opa_vnic_stats vstats;
+	__be64 *dst;
+	u64 *src;
+
+	memset(&vstats, 0, sizeof(vstats));
+	spin_lock(&adapter->stats_lock);
+	adapter->rn_ops->ndo_get_stats64(adapter->netdev, &vstats.netstats);
+	spin_unlock(&adapter->stats_lock);
+
+	cntrs->vp_instance = cpu_to_be16(adapter->vport_num);
+	cntrs->vesw_id = cpu_to_be16(adapter->info.vesw.vesw_id);
+	cntrs->veswport_num = cpu_to_be32(adapter->port_num);
+
+	cntrs->tx_errors = cpu_to_be64(vstats.netstats.tx_errors);
+	cntrs->rx_errors = cpu_to_be64(vstats.netstats.rx_errors);
+	cntrs->tx_packets = cpu_to_be64(vstats.netstats.tx_packets);
+	cntrs->rx_packets = cpu_to_be64(vstats.netstats.rx_packets);
+	cntrs->tx_bytes = cpu_to_be64(vstats.netstats.tx_bytes);
+	cntrs->rx_bytes = cpu_to_be64(vstats.netstats.rx_bytes);
+
+	/*
+	 * This loop depends on the layout of the
+	 * opa_veswport_summary_counters and opa_vnic_stats structures.
+	 */
+	for (dst = &cntrs->tx_unicast, src = &vstats.tx_grp.unicast;
+	     dst < &cntrs->reserved[0]; dst++, src++) {
+		*dst = cpu_to_be64(*src);
+	}
+}
+
+/**
+ * opa_vnic_get_error_counters - get error counters
+ * @adapter: vnic port adapter
+ * @cntrs: pointer to destination error counters structure
+ *
+ * This function populates the error counters that are maintained by the
+ * given adapter into the destination address provided.
+ */ +void opa_vnic_get_error_counters(struct opa_vnic_adapter *adapter, + struct opa_veswport_error_counters *cntrs) +{ + struct opa_vnic_stats vstats; + + memset(&vstats, 0, sizeof(vstats)); + spin_lock(&adapter->stats_lock); + adapter->rn_ops->ndo_get_stats64(adapter->netdev, &vstats.netstats); + spin_unlock(&adapter->stats_lock); + + cntrs->vp_instance = cpu_to_be16(adapter->vport_num); + cntrs->vesw_id = cpu_to_be16(adapter->info.vesw.vesw_id); + cntrs->veswport_num = cpu_to_be32(adapter->port_num); + + cntrs->tx_errors = cpu_to_be64(vstats.netstats.tx_errors); + cntrs->rx_errors = cpu_to_be64(vstats.netstats.rx_errors); + cntrs->tx_dlid_zero = cpu_to_be64(vstats.tx_dlid_zero); + cntrs->tx_drop_state = cpu_to_be64(vstats.tx_drop_state); + cntrs->tx_logic = cpu_to_be64(vstats.netstats.tx_fifo_errors + + vstats.netstats.tx_carrier_errors); + + cntrs->rx_bad_veswid = cpu_to_be64(vstats.netstats.rx_nohandler); + cntrs->rx_runt = cpu_to_be64(vstats.rx_runt); + cntrs->rx_oversize = cpu_to_be64(vstats.rx_oversize); + cntrs->rx_drop_state = cpu_to_be64(vstats.rx_drop_state); + cntrs->rx_logic = cpu_to_be64(vstats.netstats.rx_fifo_errors); +} + +/** + * opa_vnic_get_vesw_info -- Get the vesw information + * @adapter: vnic port adapter + * @info: pointer to destination vesw info structure + * + * This function copies the vesw info that is maintained by the + * given adapter to destination address provided. + */ +void opa_vnic_get_vesw_info(struct opa_vnic_adapter *adapter, + struct opa_vesw_info *info) +{ + struct __opa_vesw_info *src = &adapter->info.vesw; + int i; + + info->fabric_id = cpu_to_be16(src->fabric_id); + info->vesw_id = cpu_to_be16(src->vesw_id); + memcpy(info->rsvd0, src->rsvd0, ARRAY_SIZE(src->rsvd0)); + info->def_port_mask = cpu_to_be16(src->def_port_mask); + memcpy(info->rsvd1, src->rsvd1, ARRAY_SIZE(src->rsvd1)); + info->pkey = cpu_to_be16(src->pkey); + + memcpy(info->rsvd2, src->rsvd2, ARRAY_SIZE(src->rsvd2)); + info->u_mcast_dlid = cpu_to_be32(src->u_mcast_dlid); + for (i = 0; i < OPA_VESW_MAX_NUM_DEF_PORT; i++) + info->u_ucast_dlid[i] = cpu_to_be32(src->u_ucast_dlid[i]); + + info->rc = cpu_to_be32(src->rc); + + memcpy(info->rsvd3, src->rsvd3, ARRAY_SIZE(src->rsvd3)); + info->eth_mtu = cpu_to_be16(src->eth_mtu); + memcpy(info->rsvd4, src->rsvd4, ARRAY_SIZE(src->rsvd4)); +} + +/** + * opa_vnic_set_vesw_info -- Set the vesw information + * @adapter: vnic port adapter + * @info: pointer to vesw info structure + * + * This function updates the vesw info that is maintained by the + * given adapter with vesw info provided. Reserved fields are stored + * and returned back to EM as is. 
+ */ +void opa_vnic_set_vesw_info(struct opa_vnic_adapter *adapter, + struct opa_vesw_info *info) +{ + struct __opa_vesw_info *dst = &adapter->info.vesw; + int i; + + dst->fabric_id = be16_to_cpu(info->fabric_id); + dst->vesw_id = be16_to_cpu(info->vesw_id); + memcpy(dst->rsvd0, info->rsvd0, ARRAY_SIZE(info->rsvd0)); + dst->def_port_mask = be16_to_cpu(info->def_port_mask); + memcpy(dst->rsvd1, info->rsvd1, ARRAY_SIZE(info->rsvd1)); + dst->pkey = be16_to_cpu(info->pkey); + + memcpy(dst->rsvd2, info->rsvd2, ARRAY_SIZE(info->rsvd2)); + dst->u_mcast_dlid = be32_to_cpu(info->u_mcast_dlid); + for (i = 0; i < OPA_VESW_MAX_NUM_DEF_PORT; i++) + dst->u_ucast_dlid[i] = be32_to_cpu(info->u_ucast_dlid[i]); + + dst->rc = be32_to_cpu(info->rc); + + memcpy(dst->rsvd3, info->rsvd3, ARRAY_SIZE(info->rsvd3)); + dst->eth_mtu = be16_to_cpu(info->eth_mtu); + memcpy(dst->rsvd4, info->rsvd4, ARRAY_SIZE(info->rsvd4)); +} + +/** + * opa_vnic_get_per_veswport_info -- Get the vesw per port information + * @adapter: vnic port adapter + * @info: pointer to destination vport info structure + * + * This function copies the vesw per port info that is maintained by the + * given adapter to destination address provided. + * Note that the read only fields are not copied. + */ +void opa_vnic_get_per_veswport_info(struct opa_vnic_adapter *adapter, + struct opa_per_veswport_info *info) +{ + struct __opa_per_veswport_info *src = &adapter->info.vport; + + info->port_num = cpu_to_be32(src->port_num); + info->eth_link_status = src->eth_link_status; + memcpy(info->rsvd0, src->rsvd0, ARRAY_SIZE(src->rsvd0)); + + memcpy(info->base_mac_addr, src->base_mac_addr, + ARRAY_SIZE(info->base_mac_addr)); + info->config_state = src->config_state; + info->oper_state = src->oper_state; + info->max_mac_tbl_ent = cpu_to_be16(src->max_mac_tbl_ent); + info->max_smac_ent = cpu_to_be16(src->max_smac_ent); + info->mac_tbl_digest = cpu_to_be32(src->mac_tbl_digest); + memcpy(info->rsvd1, src->rsvd1, ARRAY_SIZE(src->rsvd1)); + + info->encap_slid = cpu_to_be32(src->encap_slid); + memcpy(info->pcp_to_sc_uc, src->pcp_to_sc_uc, + ARRAY_SIZE(info->pcp_to_sc_uc)); + memcpy(info->pcp_to_vl_uc, src->pcp_to_vl_uc, + ARRAY_SIZE(info->pcp_to_vl_uc)); + memcpy(info->pcp_to_sc_mc, src->pcp_to_sc_mc, + ARRAY_SIZE(info->pcp_to_sc_mc)); + memcpy(info->pcp_to_vl_mc, src->pcp_to_vl_mc, + ARRAY_SIZE(info->pcp_to_vl_mc)); + info->non_vlan_sc_uc = src->non_vlan_sc_uc; + info->non_vlan_vl_uc = src->non_vlan_vl_uc; + info->non_vlan_sc_mc = src->non_vlan_sc_mc; + info->non_vlan_vl_mc = src->non_vlan_vl_mc; + memcpy(info->rsvd2, src->rsvd2, ARRAY_SIZE(src->rsvd2)); + + info->uc_macs_gen_count = cpu_to_be16(src->uc_macs_gen_count); + info->mc_macs_gen_count = cpu_to_be16(src->mc_macs_gen_count); + memcpy(info->rsvd3, src->rsvd3, ARRAY_SIZE(src->rsvd3)); +} + +/** + * opa_vnic_set_per_veswport_info -- Set vesw per port information + * @adapter: vnic port adapter + * @info: pointer to vport info structure + * + * This function updates the vesw per port info that is maintained by the + * given adapter with vesw per port info provided. Reserved fields are + * stored and returned back to EM as is. 
+ */ +void opa_vnic_set_per_veswport_info(struct opa_vnic_adapter *adapter, + struct opa_per_veswport_info *info) +{ + struct __opa_per_veswport_info *dst = &adapter->info.vport; + + dst->port_num = be32_to_cpu(info->port_num); + memcpy(dst->rsvd0, info->rsvd0, ARRAY_SIZE(info->rsvd0)); + + memcpy(dst->base_mac_addr, info->base_mac_addr, + ARRAY_SIZE(dst->base_mac_addr)); + dst->config_state = info->config_state; + memcpy(dst->rsvd1, info->rsvd1, ARRAY_SIZE(info->rsvd1)); + + dst->encap_slid = be32_to_cpu(info->encap_slid); + memcpy(dst->pcp_to_sc_uc, info->pcp_to_sc_uc, + ARRAY_SIZE(dst->pcp_to_sc_uc)); + memcpy(dst->pcp_to_vl_uc, info->pcp_to_vl_uc, + ARRAY_SIZE(dst->pcp_to_vl_uc)); + memcpy(dst->pcp_to_sc_mc, info->pcp_to_sc_mc, + ARRAY_SIZE(dst->pcp_to_sc_mc)); + memcpy(dst->pcp_to_vl_mc, info->pcp_to_vl_mc, + ARRAY_SIZE(dst->pcp_to_vl_mc)); + dst->non_vlan_sc_uc = info->non_vlan_sc_uc; + dst->non_vlan_vl_uc = info->non_vlan_vl_uc; + dst->non_vlan_sc_mc = info->non_vlan_sc_mc; + dst->non_vlan_vl_mc = info->non_vlan_vl_mc; + memcpy(dst->rsvd2, info->rsvd2, ARRAY_SIZE(info->rsvd2)); + memcpy(dst->rsvd3, info->rsvd3, ARRAY_SIZE(info->rsvd3)); +} + +/** + * opa_vnic_query_mcast_macs - query multicast mac list + * @adapter: vnic port adapter + * @macs: pointer mac list + * + * This function populates the provided mac list with the configured + * multicast addresses in the adapter. + */ +void opa_vnic_query_mcast_macs(struct opa_vnic_adapter *adapter, + struct opa_veswport_iface_macs *macs) +{ + u16 start_idx, num_macs, idx = 0, count = 0; + struct netdev_hw_addr *ha; + + start_idx = be16_to_cpu(macs->start_idx); + num_macs = be16_to_cpu(macs->num_macs_in_msg); + netdev_for_each_mc_addr(ha, adapter->netdev) { + struct opa_vnic_iface_mac_entry *entry = &macs->entry[count]; + + if (start_idx > idx++) + continue; + else if (num_macs == count) + break; + memcpy(entry, ha->addr, sizeof(*entry)); + count++; + } + + macs->tot_macs_in_lst = cpu_to_be16(netdev_mc_count(adapter->netdev)); + macs->num_macs_in_msg = cpu_to_be16(count); + macs->gen_count = cpu_to_be16(adapter->info.vport.mc_macs_gen_count); +} + +/** + * opa_vnic_query_ucast_macs - query unicast mac list + * @adapter: vnic port adapter + * @macs: pointer mac list + * + * This function populates the provided mac list with the configured + * unicast addresses in the adapter. 
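+ *
+ * For example (illustrative request): start_idx 0 with num_macs_in_msg 64
+ * returns up to the first 64 unicast addresses; the EM can then compare
+ * tot_macs_in_lst against the number returned and page through the rest
+ * by issuing further requests with a larger start_idx.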
+ */ +void opa_vnic_query_ucast_macs(struct opa_vnic_adapter *adapter, + struct opa_veswport_iface_macs *macs) +{ + u16 start_idx, tot_macs, num_macs, idx = 0, count = 0, em_macs = 0; + struct netdev_hw_addr *ha; + + start_idx = be16_to_cpu(macs->start_idx); + num_macs = be16_to_cpu(macs->num_macs_in_msg); + /* loop through dev_addrs list first */ + for_each_dev_addr(adapter->netdev, ha) { + struct opa_vnic_iface_mac_entry *entry = &macs->entry[count]; + + /* Do not include EM specified MAC address */ + if (!memcmp(adapter->info.vport.base_mac_addr, ha->addr, + ARRAY_SIZE(adapter->info.vport.base_mac_addr))) { + em_macs++; + continue; + } + + if (start_idx > idx++) + continue; + else if (num_macs == count) + break; + memcpy(entry, ha->addr, sizeof(*entry)); + count++; + } + + /* loop through uc list */ + netdev_for_each_uc_addr(ha, adapter->netdev) { + struct opa_vnic_iface_mac_entry *entry = &macs->entry[count]; + + if (start_idx > idx++) + continue; + else if (num_macs == count) + break; + memcpy(entry, ha->addr, sizeof(*entry)); + count++; + } + + tot_macs = netdev_hw_addr_list_count(&adapter->netdev->dev_addrs) + + netdev_uc_count(adapter->netdev) - em_macs; + macs->tot_macs_in_lst = cpu_to_be16(tot_macs); + macs->num_macs_in_msg = cpu_to_be16(count); + macs->gen_count = cpu_to_be16(adapter->info.vport.uc_macs_gen_count); +} diff --git a/drivers/infiniband/ulp/srp/Kbuild b/drivers/infiniband/ulp/srp/Kbuild new file mode 100644 index 0000000..a16c73c --- /dev/null +++ b/drivers/infiniband/ulp/srp/Kbuild @@ -0,0 +1 @@ +obj-$(CONFIG_INFINIBAND_SRP) += ib_srp.o diff --git a/drivers/infiniband/ulp/srp/Kconfig b/drivers/infiniband/ulp/srp/Kconfig new file mode 100644 index 0000000..99db8fe --- /dev/null +++ b/drivers/infiniband/ulp/srp/Kconfig @@ -0,0 +1,12 @@ +config INFINIBAND_SRP + tristate "InfiniBand SCSI RDMA Protocol" + depends on SCSI && INFINIBAND_ADDR_TRANS + select SCSI_SRP_ATTRS + ---help--- + Support for the SCSI RDMA Protocol over InfiniBand. This + allows you to access storage devices that speak SRP over + InfiniBand. + + The SRP protocol is defined by the INCITS T10 technical + committee. See . + diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c new file mode 100644 index 0000000..c35d2cd --- /dev/null +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -0,0 +1,4263 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "ib_srp.h" + +#define DRV_NAME "ib_srp" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0" +#define DRV_RELDATE "July 26, 2015" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_INFO(release_date, DRV_RELDATE); + +#if !defined(CONFIG_DYNAMIC_DEBUG) +#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) +#define DYNAMIC_DEBUG_BRANCH(descriptor) false +#endif + +static unsigned int srp_sg_tablesize; +static unsigned int cmd_sg_entries; +static unsigned int indirect_sg_entries; +static bool allow_ext_sg; +static bool prefer_fr = true; +static bool register_always = true; +static bool never_register; +static int topspin_workarounds = 1; + +module_param(srp_sg_tablesize, uint, 0444); +MODULE_PARM_DESC(srp_sg_tablesize, "Deprecated name for cmd_sg_entries"); + +module_param(cmd_sg_entries, uint, 0444); +MODULE_PARM_DESC(cmd_sg_entries, + "Default number of gather/scatter entries in the SRP command (default is 12, max 255)"); + +module_param(indirect_sg_entries, uint, 0444); +MODULE_PARM_DESC(indirect_sg_entries, + "Default max number of gather/scatter entries (default is 12, max is " __stringify(SG_MAX_SEGMENTS) ")"); + +module_param(allow_ext_sg, bool, 0444); +MODULE_PARM_DESC(allow_ext_sg, + "Default behavior when there are more than cmd_sg_entries S/G entries after mapping; fails the request when false (default false)"); + +module_param(topspin_workarounds, int, 0444); +MODULE_PARM_DESC(topspin_workarounds, + "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); + +module_param(prefer_fr, bool, 0444); +MODULE_PARM_DESC(prefer_fr, +"Whether to use fast registration if both FMR and fast registration are supported"); + +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +module_param(never_register, bool, 0444); +MODULE_PARM_DESC(never_register, "Never register memory"); + +static const struct kernel_param_ops srp_tmo_ops; + +static int srp_reconnect_delay = 10; +module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts"); + +static int srp_fast_io_fail_tmo = 15; +module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fast_io_fail_tmo, + "Number of seconds between the observation of a transport" + " layer error and failing all I/O. \"off\" means that this" + " functionality is disabled."); + +static int srp_dev_loss_tmo = 600; +module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dev_loss_tmo, + "Maximum number of seconds that the SRP transport should" + " insulate transport layer errors. After this time has been" + " exceeded the SCSI host is removed. Should be" + " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + " if fast_io_fail_tmo has not been set. 
\"off\" means that" + " this functionality is disabled."); + +static unsigned ch_count; +module_param(ch_count, uint, 0444); +MODULE_PARM_DESC(ch_count, + "Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA."); + +static void srp_add_one(struct ib_device *device); +static void srp_remove_one(struct ib_device *device, void *client_data); +static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, + const char *opname); +static int srp_ib_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); +static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + +static struct scsi_transport_template *ib_srp_transport_template; +static struct workqueue_struct *srp_remove_wq; + +static struct ib_client srp_client = { + .name = "srp", + .add = srp_add_one, + .remove = srp_remove_one +}; + +static struct ib_sa_client srp_sa_client; + +static int srp_tmo_get(char *buffer, const struct kernel_param *kp) +{ + int tmo = *(int *)kp->arg; + + if (tmo >= 0) + return sprintf(buffer, "%d", tmo); + else + return sprintf(buffer, "off"); +} + +static int srp_tmo_set(const char *val, const struct kernel_param *kp) +{ + int tmo, res; + + res = srp_parse_tmo(&tmo, val); + if (res) + goto out; + + if (kp->arg == &srp_reconnect_delay) + res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, + srp_dev_loss_tmo); + else if (kp->arg == &srp_fast_io_fail_tmo) + res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo); + else + res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo, + tmo); + if (res) + goto out; + *(int *)kp->arg = tmo; + +out: + return res; +} + +static const struct kernel_param_ops srp_tmo_ops = { + .get = srp_tmo_get, + .set = srp_tmo_set, +}; + +static inline struct srp_target_port *host_to_target(struct Scsi_Host *host) +{ + return (struct srp_target_port *) host->hostdata; +} + +static const char *srp_target_info(struct Scsi_Host *host) +{ + return host_to_target(host)->target_name; +} + +static int srp_target_is_topspin(struct srp_target_port *target) +{ + static const u8 topspin_oui[3] = { 0x00, 0x05, 0xad }; + static const u8 cisco_oui[3] = { 0x00, 0x1b, 0x0d }; + + return topspin_workarounds && + (!memcmp(&target->ioc_guid, topspin_oui, sizeof topspin_oui) || + !memcmp(&target->ioc_guid, cisco_oui, sizeof cisco_oui)); +} + +static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size, + gfp_t gfp_mask, + enum dma_data_direction direction) +{ + struct srp_iu *iu; + + iu = kmalloc(sizeof *iu, gfp_mask); + if (!iu) + goto out; + + iu->buf = kzalloc(size, gfp_mask); + if (!iu->buf) + goto out_free_iu; + + iu->dma = ib_dma_map_single(host->srp_dev->dev, iu->buf, size, + direction); + if (ib_dma_mapping_error(host->srp_dev->dev, iu->dma)) + goto out_free_buf; + + iu->size = size; + iu->direction = direction; + + return iu; + +out_free_buf: + kfree(iu->buf); +out_free_iu: + kfree(iu); +out: + return NULL; +} + +static void srp_free_iu(struct srp_host *host, struct srp_iu *iu) +{ + if (!iu) + return; + + ib_dma_unmap_single(host->srp_dev->dev, iu->dma, iu->size, + iu->direction); + kfree(iu->buf); + kfree(iu); +} + +static void srp_qp_event(struct ib_event *event, void *context) +{ + pr_debug("QP event %s (%d)\n", + 
ib_event_msg(event->event), event->event); +} + +static int srp_init_ib_qp(struct srp_target_port *target, + struct ib_qp *qp) +{ + struct ib_qp_attr *attr; + int ret; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = ib_find_cached_pkey(target->srp_host->srp_dev->dev, + target->srp_host->port, + be16_to_cpu(target->ib_cm.pkey), + &attr->pkey_index); + if (ret) + goto out; + + attr->qp_state = IB_QPS_INIT; + attr->qp_access_flags = (IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + attr->port_num = target->srp_host->port; + + ret = ib_modify_qp(qp, attr, + IB_QP_STATE | + IB_QP_PKEY_INDEX | + IB_QP_ACCESS_FLAGS | + IB_QP_PORT); + +out: + kfree(attr); + return ret; +} + +static int srp_new_ib_cm_id(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct ib_cm_id *new_cm_id; + + new_cm_id = ib_create_cm_id(target->srp_host->srp_dev->dev, + srp_ib_cm_handler, ch); + if (IS_ERR(new_cm_id)) + return PTR_ERR(new_cm_id); + + if (ch->ib_cm.cm_id) + ib_destroy_cm_id(ch->ib_cm.cm_id); + ch->ib_cm.cm_id = new_cm_id; + if (rdma_cap_opa_ah(target->srp_host->srp_dev->dev, + target->srp_host->port)) + ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_OPA; + else + ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_IB; + ch->ib_cm.path.sgid = target->sgid; + ch->ib_cm.path.dgid = target->ib_cm.orig_dgid; + ch->ib_cm.path.pkey = target->ib_cm.pkey; + ch->ib_cm.path.service_id = target->ib_cm.service_id; + + return 0; +} + +static int srp_new_rdma_cm_id(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct rdma_cm_id *new_cm_id; + int ret; + + new_cm_id = rdma_create_id(target->net, srp_rdma_cm_handler, ch, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(new_cm_id)) { + ret = PTR_ERR(new_cm_id); + new_cm_id = NULL; + goto out; + } + + init_completion(&ch->done); + ret = rdma_resolve_addr(new_cm_id, target->rdma_cm.src_specified ? + (struct sockaddr *)&target->rdma_cm.src : NULL, + (struct sockaddr *)&target->rdma_cm.dst, + SRP_PATH_REC_TIMEOUT_MS); + if (ret) { + pr_err("No route available from %pIS to %pIS (%d)\n", + &target->rdma_cm.src, &target->rdma_cm.dst, ret); + goto out; + } + ret = wait_for_completion_interruptible(&ch->done); + if (ret < 0) + goto out; + + ret = ch->status; + if (ret) { + pr_err("Resolving address %pIS failed (%d)\n", + &target->rdma_cm.dst, ret); + goto out; + } + + swap(ch->rdma_cm.cm_id, new_cm_id); + +out: + if (new_cm_id) + rdma_destroy_id(new_cm_id); + + return ret; +} + +static int srp_new_cm_id(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + + return target->using_rdma_cm ? srp_new_rdma_cm_id(ch) : + srp_new_ib_cm_id(ch); +} + +static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_fmr_pool_param fmr_param; + + memset(&fmr_param, 0, sizeof(fmr_param)); + fmr_param.pool_size = target->mr_pool_size; + fmr_param.dirty_watermark = fmr_param.pool_size / 4; + fmr_param.cache = 1; + fmr_param.max_pages_per_fmr = dev->max_pages_per_mr; + fmr_param.page_shift = ilog2(dev->mr_page_size); + fmr_param.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + return ib_create_fmr_pool(dev->pd, &fmr_param); +} + +/** + * srp_destroy_fr_pool() - free the resources owned by a pool + * @pool: Fast registration pool to be destroyed. 
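+ *
+ * Any MR still attached to a descriptor is deregistered before the pool
+ * memory itself is freed.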
+ */ +static void srp_destroy_fr_pool(struct srp_fr_pool *pool) +{ + int i; + struct srp_fr_desc *d; + + if (!pool) + return; + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + if (d->mr) + ib_dereg_mr(d->mr); + } + kfree(pool); +} + +/** + * srp_create_fr_pool() - allocate and initialize a pool for fast registration + * @device: IB device to allocate fast registration descriptors for. + * @pd: Protection domain associated with the FR descriptors. + * @pool_size: Number of descriptors to allocate. + * @max_page_list_len: Maximum fast registration work request page list length. + */ +static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, + struct ib_pd *pd, int pool_size, + int max_page_list_len) +{ + struct srp_fr_pool *pool; + struct srp_fr_desc *d; + struct ib_mr *mr; + int i, ret = -EINVAL; + enum ib_mr_type mr_type; + + if (pool_size <= 0) + goto err; + ret = -ENOMEM; + pool = kzalloc(sizeof(struct srp_fr_pool) + + pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL); + if (!pool) + goto err; + pool->size = pool_size; + pool->max_page_list_len = max_page_list_len; + spin_lock_init(&pool->lock); + INIT_LIST_HEAD(&pool->free_list); + + if (device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + mr = ib_alloc_mr(pd, mr_type, max_page_list_len); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + if (ret == -ENOMEM) + pr_info("%s: ib_alloc_mr() failed. Try to reduce max_cmd_per_lun, max_sect or ch_count\n", + dev_name(&device->dev)); + goto destroy_pool; + } + d->mr = mr; + list_add_tail(&d->entry, &pool->free_list); + } + +out: + return pool; + +destroy_pool: + srp_destroy_fr_pool(pool); + +err: + pool = ERR_PTR(ret); + goto out; +} + +/** + * srp_fr_pool_get() - obtain a descriptor suitable for fast registration + * @pool: Pool to obtain descriptor from. + */ +static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool) +{ + struct srp_fr_desc *d = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + if (!list_empty(&pool->free_list)) { + d = list_first_entry(&pool->free_list, typeof(*d), entry); + list_del(&d->entry); + } + spin_unlock_irqrestore(&pool->lock, flags); + + return d; +} + +/** + * srp_fr_pool_put() - put an FR descriptor back in the free list + * @pool: Pool the descriptor was allocated from. + * @desc: Pointer to an array of fast registration descriptor pointers. + * @n: Number of descriptors to put back. + * + * Note: The caller must already have queued an invalidation request for + * desc->mr->rkey before calling this function. + */ +static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc, + int n) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&pool->lock, flags); + for (i = 0; i < n; i++) + list_add(&desc[i]->entry, &pool->free_list); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + + return srp_create_fr_pool(dev->dev, dev->pd, target->mr_pool_size, + dev->max_pages_per_mr); +} + +/** + * srp_destroy_qp() - destroy an RDMA queue pair + * @ch: SRP RDMA channel. + * + * Drain the qp before destroying it. This avoids that the receive + * completion handler can access the queue pair while it is + * being destroyed. 
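+ *
+ * The send CQ is polled under ch->lock first so that pending send
+ * completions are reaped before ib_drain_qp() flushes what is left.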
+ */ +static void srp_destroy_qp(struct srp_rdma_ch *ch) +{ + spin_lock_irq(&ch->lock); + ib_process_cq_direct(ch->send_cq, -1); + spin_unlock_irq(&ch->lock); + + ib_drain_qp(ch->qp); + ib_destroy_qp(ch->qp); +} + +static int srp_create_ch_ib(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_qp_init_attr *init_attr; + struct ib_cq *recv_cq, *send_cq; + struct ib_qp *qp; + struct ib_fmr_pool *fmr_pool = NULL; + struct srp_fr_pool *fr_pool = NULL; + const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2; + int ret; + + init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); + if (!init_attr) + return -ENOMEM; + + /* queue_size + 1 for ib_drain_rq() */ + recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1, + ch->comp_vector, IB_POLL_SOFTIRQ); + if (IS_ERR(recv_cq)) { + ret = PTR_ERR(recv_cq); + goto err; + } + + send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size, + ch->comp_vector, IB_POLL_DIRECT); + if (IS_ERR(send_cq)) { + ret = PTR_ERR(send_cq); + goto err_recv_cq; + } + + init_attr->event_handler = srp_qp_event; + init_attr->cap.max_send_wr = m * target->queue_size; + init_attr->cap.max_recv_wr = target->queue_size + 1; + init_attr->cap.max_recv_sge = 1; + init_attr->cap.max_send_sge = 1; + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr->qp_type = IB_QPT_RC; + init_attr->send_cq = send_cq; + init_attr->recv_cq = recv_cq; + + if (target->using_rdma_cm) { + ret = rdma_create_qp(ch->rdma_cm.cm_id, dev->pd, init_attr); + qp = ch->rdma_cm.cm_id->qp; + } else { + qp = ib_create_qp(dev->pd, init_attr); + if (!IS_ERR(qp)) { + ret = srp_init_ib_qp(target, qp); + if (ret) + ib_destroy_qp(qp); + } else { + ret = PTR_ERR(qp); + } + } + if (ret) { + pr_err("QP creation failed for dev %s: %d\n", + dev_name(&dev->dev->dev), ret); + goto err_send_cq; + } + + if (dev->use_fast_reg) { + fr_pool = srp_alloc_fr_pool(target); + if (IS_ERR(fr_pool)) { + ret = PTR_ERR(fr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FR pool allocation failed (%d)\n", ret); + goto err_qp; + } + } else if (dev->use_fmr) { + fmr_pool = srp_alloc_fmr_pool(target); + if (IS_ERR(fmr_pool)) { + ret = PTR_ERR(fmr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FMR pool allocation failed (%d)\n", ret); + goto err_qp; + } + } + + if (ch->qp) + srp_destroy_qp(ch); + if (ch->recv_cq) + ib_free_cq(ch->recv_cq); + if (ch->send_cq) + ib_free_cq(ch->send_cq); + + ch->qp = qp; + ch->recv_cq = recv_cq; + ch->send_cq = send_cq; + + if (dev->use_fast_reg) { + if (ch->fr_pool) + srp_destroy_fr_pool(ch->fr_pool); + ch->fr_pool = fr_pool; + } else if (dev->use_fmr) { + if (ch->fmr_pool) + ib_destroy_fmr_pool(ch->fmr_pool); + ch->fmr_pool = fmr_pool; + } + + kfree(init_attr); + return 0; + +err_qp: + if (target->using_rdma_cm) + rdma_destroy_qp(ch->rdma_cm.cm_id); + else + ib_destroy_qp(qp); + +err_send_cq: + ib_free_cq(send_cq); + +err_recv_cq: + ib_free_cq(recv_cq); + +err: + kfree(init_attr); + return ret; +} + +/* + * Note: this function may be called without srp_alloc_iu_bufs() having been + * invoked. Hence the ch->[rt]x_ring checks. 
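+ *
+ * Teardown order below: the CM ID first, then the FR/FMR pool, the QP
+ * and CQs, and finally the rx/tx IU rings when they were allocated.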
+ */ +static void srp_free_ch_ib(struct srp_target_port *target, + struct srp_rdma_ch *ch) +{ + struct srp_device *dev = target->srp_host->srp_dev; + int i; + + if (!ch->target) + return; + + if (target->using_rdma_cm) { + if (ch->rdma_cm.cm_id) { + rdma_destroy_id(ch->rdma_cm.cm_id); + ch->rdma_cm.cm_id = NULL; + } + } else { + if (ch->ib_cm.cm_id) { + ib_destroy_cm_id(ch->ib_cm.cm_id); + ch->ib_cm.cm_id = NULL; + } + } + + /* If srp_new_cm_id() succeeded but srp_create_ch_ib() not, return. */ + if (!ch->qp) + return; + + if (dev->use_fast_reg) { + if (ch->fr_pool) + srp_destroy_fr_pool(ch->fr_pool); + } else if (dev->use_fmr) { + if (ch->fmr_pool) + ib_destroy_fmr_pool(ch->fmr_pool); + } + + srp_destroy_qp(ch); + ib_free_cq(ch->send_cq); + ib_free_cq(ch->recv_cq); + + /* + * Avoid that the SCSI error handler tries to use this channel after + * it has been freed. The SCSI error handler can namely continue + * trying to perform recovery actions after scsi_remove_host() + * returned. + */ + ch->target = NULL; + + ch->qp = NULL; + ch->send_cq = ch->recv_cq = NULL; + + if (ch->rx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, ch->rx_ring[i]); + kfree(ch->rx_ring); + ch->rx_ring = NULL; + } + if (ch->tx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, ch->tx_ring[i]); + kfree(ch->tx_ring); + ch->tx_ring = NULL; + } +} + +static void srp_path_rec_completion(int status, + struct sa_path_rec *pathrec, + void *ch_ptr) +{ + struct srp_rdma_ch *ch = ch_ptr; + struct srp_target_port *target = ch->target; + + ch->status = status; + if (status) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Got failed path rec status %d\n", status); + else + ch->ib_cm.path = *pathrec; + complete(&ch->done); +} + +static int srp_ib_lookup_path(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + int ret; + + ch->ib_cm.path.numb_path = 1; + + init_completion(&ch->done); + + ch->ib_cm.path_query_id = ib_sa_path_rec_get(&srp_sa_client, + target->srp_host->srp_dev->dev, + target->srp_host->port, + &ch->ib_cm.path, + IB_SA_PATH_REC_SERVICE_ID | + IB_SA_PATH_REC_DGID | + IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_PKEY, + SRP_PATH_REC_TIMEOUT_MS, + GFP_KERNEL, + srp_path_rec_completion, + ch, &ch->ib_cm.path_query); + if (ch->ib_cm.path_query_id < 0) + return ch->ib_cm.path_query_id; + + ret = wait_for_completion_interruptible(&ch->done); + if (ret < 0) + return ret; + + if (ch->status < 0) + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Path record query failed: sgid %pI6, dgid %pI6, pkey %#04x, service_id %#16llx\n", + ch->ib_cm.path.sgid.raw, ch->ib_cm.path.dgid.raw, + be16_to_cpu(target->ib_cm.pkey), + be64_to_cpu(target->ib_cm.service_id)); + + return ch->status; +} + +static int srp_rdma_lookup_path(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + int ret; + + init_completion(&ch->done); + + ret = rdma_resolve_route(ch->rdma_cm.cm_id, SRP_PATH_REC_TIMEOUT_MS); + if (ret) + return ret; + + wait_for_completion_interruptible(&ch->done); + + if (ch->status != 0) + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Path resolution failed\n"); + + return ch->status; +} + +static int srp_lookup_path(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + + return target->using_rdma_cm ? 
srp_rdma_lookup_path(ch) : + srp_ib_lookup_path(ch); +} + +static u8 srp_get_subnet_timeout(struct srp_host *host) +{ + struct ib_port_attr attr; + int ret; + u8 subnet_timeout = 18; + + ret = ib_query_port(host->srp_dev->dev, host->port, &attr); + if (ret == 0) + subnet_timeout = attr.subnet_timeout; + + if (unlikely(subnet_timeout < 15)) + pr_warn("%s: subnet timeout %d may cause SRP login to fail.\n", + dev_name(&host->srp_dev->dev->dev), subnet_timeout); + + return subnet_timeout; +} + +static int srp_send_req(struct srp_rdma_ch *ch, bool multich) +{ + struct srp_target_port *target = ch->target; + struct { + struct rdma_conn_param rdma_param; + struct srp_login_req_rdma rdma_req; + struct ib_cm_req_param ib_param; + struct srp_login_req ib_req; + } *req = NULL; + char *ipi, *tpi; + int status; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->ib_param.flow_control = 1; + req->ib_param.retry_count = target->tl_retry_count; + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. + */ + req->ib_param.responder_resources = 4; + req->ib_param.rnr_retry_count = 7; + req->ib_param.max_cm_retries = 15; + + req->ib_req.opcode = SRP_LOGIN_REQ; + req->ib_req.tag = 0; + req->ib_req.req_it_iu_len = cpu_to_be32(target->max_iu_len); + req->ib_req.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + SRP_BUF_FORMAT_INDIRECT); + req->ib_req.req_flags = (multich ? SRP_MULTICHAN_MULTI : + SRP_MULTICHAN_SINGLE); + + if (target->using_rdma_cm) { + req->rdma_param.flow_control = req->ib_param.flow_control; + req->rdma_param.responder_resources = + req->ib_param.responder_resources; + req->rdma_param.initiator_depth = req->ib_param.initiator_depth; + req->rdma_param.retry_count = req->ib_param.retry_count; + req->rdma_param.rnr_retry_count = req->ib_param.rnr_retry_count; + req->rdma_param.private_data = &req->rdma_req; + req->rdma_param.private_data_len = sizeof(req->rdma_req); + + req->rdma_req.opcode = req->ib_req.opcode; + req->rdma_req.tag = req->ib_req.tag; + req->rdma_req.req_it_iu_len = req->ib_req.req_it_iu_len; + req->rdma_req.req_buf_fmt = req->ib_req.req_buf_fmt; + req->rdma_req.req_flags = req->ib_req.req_flags; + + ipi = req->rdma_req.initiator_port_id; + tpi = req->rdma_req.target_port_id; + } else { + u8 subnet_timeout; + + subnet_timeout = srp_get_subnet_timeout(target->srp_host); + + req->ib_param.primary_path = &ch->ib_cm.path; + req->ib_param.alternate_path = NULL; + req->ib_param.service_id = target->ib_cm.service_id; + get_random_bytes(&req->ib_param.starting_psn, 4); + req->ib_param.starting_psn &= 0xffffff; + req->ib_param.qp_num = ch->qp->qp_num; + req->ib_param.qp_type = ch->qp->qp_type; + req->ib_param.local_cm_response_timeout = subnet_timeout + 2; + req->ib_param.remote_cm_response_timeout = subnet_timeout + 2; + req->ib_param.private_data = &req->ib_req; + req->ib_param.private_data_len = sizeof(req->ib_req); + + ipi = req->ib_req.initiator_port_id; + tpi = req->ib_req.target_port_id; + } + + /* + * In the published SRP specification (draft rev. 16a), the + * port identifier format is 8 bytes of ID extension followed + * by 8 bytes of GUID. Older drafts put the two halves in the + * opposite order, so that the GUID comes first. + * + * Targets conforming to these obsolete drafts can be + * recognized by the I/O Class they report. 
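+ *
+ * In other words, with the current format the 16-byte initiator port ID
+ * is <initiator extension>[8] followed by <port GUID>[8] and the target
+ * port ID is <ID extension>[8] followed by <IOC GUID>[8]; targets that
+ * report SRP_REV10_IB_IO_CLASS expect the two halves in reverse order.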
+ */ + if (target->io_class == SRP_REV10_IB_IO_CLASS) { + memcpy(ipi, &target->sgid.global.interface_id, 8); + memcpy(ipi + 8, &target->initiator_ext, 8); + memcpy(tpi, &target->ioc_guid, 8); + memcpy(tpi + 8, &target->id_ext, 8); + } else { + memcpy(ipi, &target->initiator_ext, 8); + memcpy(ipi + 8, &target->sgid.global.interface_id, 8); + memcpy(tpi, &target->id_ext, 8); + memcpy(tpi + 8, &target->ioc_guid, 8); + } + + /* + * Topspin/Cisco SRP targets will reject our login unless we + * zero out the first 8 bytes of our initiator port ID and set + * the second 8 bytes to the local node GUID. + */ + if (srp_target_is_topspin(target)) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Topspin/Cisco initiator port ID workaround " + "activated for target GUID %016llx\n", + be64_to_cpu(target->ioc_guid)); + memset(ipi, 0, 8); + memcpy(ipi + 8, &target->srp_host->srp_dev->dev->node_guid, 8); + } + + if (target->using_rdma_cm) + status = rdma_connect(ch->rdma_cm.cm_id, &req->rdma_param); + else + status = ib_send_cm_req(ch->ib_cm.cm_id, &req->ib_param); + + kfree(req); + + return status; +} + +static bool srp_queue_remove_work(struct srp_target_port *target) +{ + bool changed = false; + + spin_lock_irq(&target->lock); + if (target->state != SRP_TARGET_REMOVED) { + target->state = SRP_TARGET_REMOVED; + changed = true; + } + spin_unlock_irq(&target->lock); + + if (changed) + queue_work(srp_remove_wq, &target->remove_work); + + return changed; +} + +static void srp_disconnect_target(struct srp_target_port *target) +{ + struct srp_rdma_ch *ch; + int i, ret; + + /* XXX should send SRP_I_LOGOUT request */ + + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + ch->connected = false; + ret = 0; + if (target->using_rdma_cm) { + if (ch->rdma_cm.cm_id) + rdma_disconnect(ch->rdma_cm.cm_id); + } else { + if (ch->ib_cm.cm_id) + ret = ib_send_cm_dreq(ch->ib_cm.cm_id, + NULL, 0); + } + if (ret < 0) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM DREQ failed\n"); + } + } +} + +static void srp_free_req_data(struct srp_target_port *target, + struct srp_rdma_ch *ch) +{ + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + struct srp_request *req; + int i; + + if (!ch->req_ring) + return; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &ch->req_ring[i]; + if (dev->use_fast_reg) { + kfree(req->fr_list); + } else { + kfree(req->fmr_list); + kfree(req->map_page); + } + if (req->indirect_dma_addr) { + ib_dma_unmap_single(ibdev, req->indirect_dma_addr, + target->indirect_size, + DMA_TO_DEVICE); + } + kfree(req->indirect_desc); + } + + kfree(ch->req_ring); + ch->req_ring = NULL; +} + +static int srp_alloc_req_data(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + struct srp_request *req; + void *mr_list; + dma_addr_t dma_addr; + int i, ret = -ENOMEM; + + ch->req_ring = kcalloc(target->req_ring_size, sizeof(*ch->req_ring), + GFP_KERNEL); + if (!ch->req_ring) + goto out; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &ch->req_ring[i]; + mr_list = kmalloc(target->mr_per_cmd * sizeof(void *), + GFP_KERNEL); + if (!mr_list) + goto out; + if (srp_dev->use_fast_reg) { + req->fr_list = mr_list; + } else { + req->fmr_list = mr_list; + req->map_page = kmalloc(srp_dev->max_pages_per_mr * + sizeof(void *), GFP_KERNEL); + if (!req->map_page) + goto out; + } + req->indirect_desc = kmalloc(target->indirect_size, 
GFP_KERNEL); + if (!req->indirect_desc) + goto out; + + dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, + target->indirect_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, dma_addr)) + goto out; + + req->indirect_dma_addr = dma_addr; + } + ret = 0; + +out: + return ret; +} + +/** + * srp_del_scsi_host_attr() - Remove attributes defined in the host template. + * @shost: SCSI host whose attributes to remove from sysfs. + * + * Note: Any attributes defined in the host template and that did not exist + * before invocation of this function will be ignored. + */ +static void srp_del_scsi_host_attr(struct Scsi_Host *shost) +{ + struct device_attribute **attr; + + for (attr = shost->hostt->shost_attrs; attr && *attr; ++attr) + device_remove_file(&shost->shost_dev, *attr); +} + +static void srp_remove_target(struct srp_target_port *target) +{ + struct srp_rdma_ch *ch; + int i; + + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + + srp_del_scsi_host_attr(target->scsi_host); + srp_rport_get(target->rport); + srp_remove_host(target->scsi_host); + scsi_remove_host(target->scsi_host); + srp_stop_rport_timers(target->rport); + srp_disconnect_target(target); + kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net); + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + srp_free_ch_ib(target, ch); + } + cancel_work_sync(&target->tl_err_work); + srp_rport_put(target->rport); + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + srp_free_req_data(target, ch); + } + kfree(target->ch); + target->ch = NULL; + + spin_lock(&target->srp_host->target_lock); + list_del(&target->list); + spin_unlock(&target->srp_host->target_lock); + + scsi_host_put(target->scsi_host); +} + +static void srp_remove_work(struct work_struct *work) +{ + struct srp_target_port *target = + container_of(work, struct srp_target_port, remove_work); + + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + + srp_remove_target(target); +} + +static void srp_rport_delete(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + + srp_queue_remove_work(target); +} + +/** + * srp_connected_ch() - number of connected channels + * @target: SRP target port. + */ +static int srp_connected_ch(struct srp_target_port *target) +{ + int i, c = 0; + + for (i = 0; i < target->ch_count; i++) + c += target->ch[i].connected; + + return c; +} + +static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich) +{ + struct srp_target_port *target = ch->target; + int ret; + + WARN_ON_ONCE(!multich && srp_connected_ch(target) > 0); + + ret = srp_lookup_path(ch); + if (ret) + goto out; + + while (1) { + init_completion(&ch->done); + ret = srp_send_req(ch, multich); + if (ret) + goto out; + ret = wait_for_completion_interruptible(&ch->done); + if (ret < 0) + goto out; + + /* + * The CM event handling code will set status to + * SRP_PORT_REDIRECT if we get a port redirect REJ + * back, or SRP_DLID_REDIRECT if we get a lid/qp + * redirect REJ back. + */ + ret = ch->status; + switch (ret) { + case 0: + ch->connected = true; + goto out; + + case SRP_PORT_REDIRECT: + ret = srp_lookup_path(ch); + if (ret) + goto out; + break; + + case SRP_DLID_REDIRECT: + break; + + case SRP_STALE_CONN: + shost_printk(KERN_ERR, target->scsi_host, PFX + "giving up on stale connection\n"); + ret = -ECONNRESET; + goto out; + + default: + goto out; + } + } + +out: + return ret <= 0 ? 
ret : -ENODEV; +} + +static void srp_inv_rkey_err_done(struct ib_cq *cq, struct ib_wc *wc) +{ + srp_handle_qp_err(cq, wc, "INV RKEY"); +} + +static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch, + u32 rkey) +{ + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .next = NULL, + .num_sge = 0, + .send_flags = 0, + .ex.invalidate_rkey = rkey, + }; + + wr.wr_cqe = &req->reg_cqe; + req->reg_cqe.done = srp_inv_rkey_err_done; + return ib_post_send(ch->qp, &wr, &bad_wr); +} + +static void srp_unmap_data(struct scsi_cmnd *scmnd, + struct srp_rdma_ch *ch, + struct srp_request *req) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + int i, res; + + if (!scsi_sglist(scmnd) || + (scmnd->sc_data_direction != DMA_TO_DEVICE && + scmnd->sc_data_direction != DMA_FROM_DEVICE)) + return; + + if (dev->use_fast_reg) { + struct srp_fr_desc **pfr; + + for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) { + res = srp_inv_rkey(req, ch, (*pfr)->mr->rkey); + if (res < 0) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "Queueing INV WR for rkey %#x failed (%d)\n", + (*pfr)->mr->rkey, res); + queue_work(system_long_wq, + &target->tl_err_work); + } + } + if (req->nmdesc) + srp_fr_pool_put(ch->fr_pool, req->fr_list, + req->nmdesc); + } else if (dev->use_fmr) { + struct ib_pool_fmr **pfmr; + + for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++) + ib_fmr_pool_unmap(*pfmr); + } + + ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), + scmnd->sc_data_direction); +} + +/** + * srp_claim_req - Take ownership of the scmnd associated with a request. + * @ch: SRP RDMA channel. + * @req: SRP request. + * @sdev: If not NULL, only take ownership for this SCSI device. + * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take + * ownership of @req->scmnd if it equals @scmnd. + * + * Return value: + * Either NULL or a pointer to the SCSI command the caller became owner of. + */ +static struct scsi_cmnd *srp_claim_req(struct srp_rdma_ch *ch, + struct srp_request *req, + struct scsi_device *sdev, + struct scsi_cmnd *scmnd) +{ + unsigned long flags; + + spin_lock_irqsave(&ch->lock, flags); + if (req->scmnd && + (!sdev || req->scmnd->device == sdev) && + (!scmnd || req->scmnd == scmnd)) { + scmnd = req->scmnd; + req->scmnd = NULL; + } else { + scmnd = NULL; + } + spin_unlock_irqrestore(&ch->lock, flags); + + return scmnd; +} + +/** + * srp_free_req() - Unmap data and adjust ch->req_lim. + * @ch: SRP RDMA channel. + * @req: Request to be freed. + * @scmnd: SCSI command associated with @req. + * @req_lim_delta: Amount to be added to @target->req_lim. 
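+ *
+ * The delta is normally the credit count granted by the target in the
+ * SRP_RSP that completed this request; it is applied under ch->lock.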
+ */ +static void srp_free_req(struct srp_rdma_ch *ch, struct srp_request *req, + struct scsi_cmnd *scmnd, s32 req_lim_delta) +{ + unsigned long flags; + + srp_unmap_data(scmnd, ch, req); + + spin_lock_irqsave(&ch->lock, flags); + ch->req_lim += req_lim_delta; + spin_unlock_irqrestore(&ch->lock, flags); +} + +static void srp_finish_req(struct srp_rdma_ch *ch, struct srp_request *req, + struct scsi_device *sdev, int result) +{ + struct scsi_cmnd *scmnd = srp_claim_req(ch, req, sdev, NULL); + + if (scmnd) { + srp_free_req(ch, req, scmnd, 0); + scmnd->result = result; + scmnd->scsi_done(scmnd); + } +} + +static void srp_terminate_io(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + struct srp_rdma_ch *ch; + struct Scsi_Host *shost = target->scsi_host; + struct scsi_device *sdev; + int i, j; + + /* + * Invoking srp_terminate_io() while srp_queuecommand() is running + * is not safe. Hence the warning statement below. + */ + shost_for_each_device(sdev, shost) + WARN_ON_ONCE(sdev->request_queue->request_fn_active); + + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + + for (j = 0; j < target->req_ring_size; ++j) { + struct srp_request *req = &ch->req_ring[j]; + + srp_finish_req(ch, req, NULL, + DID_TRANSPORT_FAILFAST << 16); + } + } +} + +/* + * It is up to the caller to ensure that srp_rport_reconnect() calls are + * serialized and that no concurrent srp_queuecommand(), srp_abort(), + * srp_reset_device() or srp_reset_host() calls will occur while this function + * is in progress. One way to realize that is not to call this function + * directly but to call srp_reconnect_rport() instead since that last function + * serializes calls of this function via rport->mutex and also blocks + * srp_queuecommand() calls before invoking this function. + */ +static int srp_rport_reconnect(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + struct srp_rdma_ch *ch; + int i, j, ret = 0; + bool multich = false; + + srp_disconnect_target(target); + + if (target->state == SRP_TARGET_SCANNING) + return -ENODEV; + + /* + * Now get a new local CM ID so that we avoid confusing the target in + * case things are really fouled up. Doing so also ensures that all CM + * callbacks will have finished before a new QP is allocated. + */ + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + ret += srp_new_cm_id(ch); + } + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + for (j = 0; j < target->req_ring_size; ++j) { + struct srp_request *req = &ch->req_ring[j]; + + srp_finish_req(ch, req, NULL, DID_RESET << 16); + } + } + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + /* + * Whether or not creating a new CM ID succeeded, create a new + * QP. This guarantees that all completion callback function + * invocations have finished before request resetting starts. 
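+ *
+ * Errors from the per-channel steps are accumulated in 'ret', so the
+ * "reconnect succeeded" message is only printed when every channel came
+ * back up.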
+ */ + ret += srp_create_ch_ib(ch); + + INIT_LIST_HEAD(&ch->free_tx); + for (j = 0; j < target->queue_size; ++j) + list_add(&ch->tx_ring[j]->list, &ch->free_tx); + } + + target->qp_in_error = false; + + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + if (ret) + break; + ret = srp_connect_ch(ch, multich); + multich = true; + } + + if (ret == 0) + shost_printk(KERN_INFO, target->scsi_host, + PFX "reconnect succeeded\n"); + + return ret; +} + +static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, + unsigned int dma_len, u32 rkey) +{ + struct srp_direct_buf *desc = state->desc; + + WARN_ON_ONCE(!dma_len); + + desc->va = cpu_to_be64(dma_addr); + desc->key = cpu_to_be32(rkey); + desc->len = cpu_to_be32(dma_len); + + state->total_len += dma_len; + state->desc++; + state->ndesc++; +} + +static int srp_map_finish_fmr(struct srp_map_state *state, + struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_pool_fmr *fmr; + u64 io_addr = 0; + + if (state->fmr.next >= state->fmr.end) { + shost_printk(KERN_ERR, ch->target->scsi_host, + PFX "Out of MRs (mr_per_cmd = %d)\n", + ch->target->mr_per_cmd); + return -ENOMEM; + } + + WARN_ON_ONCE(!dev->use_fmr); + + if (state->npages == 0) + return 0; + + if (state->npages == 1 && target->global_rkey) { + srp_map_desc(state, state->base_dma_addr, state->dma_len, + target->global_rkey); + goto reset_state; + } + + fmr = ib_fmr_pool_map_phys(ch->fmr_pool, state->pages, + state->npages, io_addr); + if (IS_ERR(fmr)) + return PTR_ERR(fmr); + + *state->fmr.next++ = fmr; + state->nmdesc++; + + srp_map_desc(state, state->base_dma_addr & ~dev->mr_page_mask, + state->dma_len, fmr->fmr->rkey); + +reset_state: + state->npages = 0; + state->dma_len = 0; + + return 0; +} + +static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc) +{ + srp_handle_qp_err(cq, wc, "FAST REG"); +} + +/* + * Map up to sg_nents elements of state->sg where *sg_offset_p is the offset + * where to start in the first element. If sg_offset_p != NULL then + * *sg_offset_p is updated to the offset in state->sg[retval] of the first + * byte that has not yet been mapped. + */ +static int srp_map_finish_fr(struct srp_map_state *state, + struct srp_request *req, + struct srp_rdma_ch *ch, int sg_nents, + unsigned int *sg_offset_p) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_send_wr *bad_wr; + struct ib_reg_wr wr; + struct srp_fr_desc *desc; + u32 rkey; + int n, err; + + if (state->fr.next >= state->fr.end) { + shost_printk(KERN_ERR, ch->target->scsi_host, + PFX "Out of MRs (mr_per_cmd = %d)\n", + ch->target->mr_per_cmd); + return -ENOMEM; + } + + WARN_ON_ONCE(!dev->use_fast_reg); + + if (sg_nents == 1 && target->global_rkey) { + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; + + srp_map_desc(state, sg_dma_address(state->sg) + sg_offset, + sg_dma_len(state->sg) - sg_offset, + target->global_rkey); + if (sg_offset_p) + *sg_offset_p = 0; + return 1; + } + + desc = srp_fr_pool_get(ch->fr_pool); + if (!desc) + return -ENOMEM; + + rkey = ib_inc_rkey(desc->mr->rkey); + ib_update_fast_reg_key(desc->mr, rkey); + + n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p, + dev->mr_page_size); + if (unlikely(n < 0)) { + srp_fr_pool_put(ch->fr_pool, &desc, 1); + pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n", + dev_name(&req->scmnd->device->sdev_gendev), sg_nents, + sg_offset_p ? 
*sg_offset_p : -1, n); + return n; + } + + WARN_ON_ONCE(desc->mr->length == 0); + + req->reg_cqe.done = srp_reg_mr_err_done; + + wr.wr.next = NULL; + wr.wr.opcode = IB_WR_REG_MR; + wr.wr.wr_cqe = &req->reg_cqe; + wr.wr.num_sge = 0; + wr.wr.send_flags = 0; + wr.mr = desc->mr; + wr.key = desc->mr->rkey; + wr.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + + *state->fr.next++ = desc; + state->nmdesc++; + + srp_map_desc(state, desc->mr->iova, + desc->mr->length, desc->mr->rkey); + + err = ib_post_send(ch->qp, &wr.wr, &bad_wr); + if (unlikely(err)) { + WARN_ON_ONCE(err == -ENOMEM); + return err; + } + + return n; +} + +static int srp_map_sg_entry(struct srp_map_state *state, + struct srp_rdma_ch *ch, + struct scatterlist *sg) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg); + unsigned int dma_len = ib_sg_dma_len(ibdev, sg); + unsigned int len = 0; + int ret; + + WARN_ON_ONCE(!dma_len); + + while (dma_len) { + unsigned offset = dma_addr & ~dev->mr_page_mask; + + if (state->npages == dev->max_pages_per_mr || + (state->npages > 0 && offset != 0)) { + ret = srp_map_finish_fmr(state, ch); + if (ret) + return ret; + } + + len = min_t(unsigned int, dma_len, dev->mr_page_size - offset); + + if (!state->npages) + state->base_dma_addr = dma_addr; + state->pages[state->npages++] = dma_addr & dev->mr_page_mask; + state->dma_len += len; + dma_addr += len; + dma_len -= len; + } + + /* + * If the end of the MR is not on a page boundary then we need to + * close it out and start a new one -- we can only merge at page + * boundaries. + */ + ret = 0; + if ((dma_addr & ~dev->mr_page_mask) != 0) + ret = srp_map_finish_fmr(state, ch); + return ret; +} + +static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch, + struct srp_request *req, struct scatterlist *scat, + int count) +{ + struct scatterlist *sg; + int i, ret; + + state->pages = req->map_page; + state->fmr.next = req->fmr_list; + state->fmr.end = req->fmr_list + ch->target->mr_per_cmd; + + for_each_sg(scat, sg, count, i) { + ret = srp_map_sg_entry(state, ch, sg); + if (ret) + return ret; + } + + ret = srp_map_finish_fmr(state, ch); + if (ret) + return ret; + + return 0; +} + +static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, + struct srp_request *req, struct scatterlist *scat, + int count) +{ + unsigned int sg_offset = 0; + + state->fr.next = req->fr_list; + state->fr.end = req->fr_list + ch->target->mr_per_cmd; + state->sg = scat; + + if (count == 0) + return 0; + + while (count) { + int i, n; + + n = srp_map_finish_fr(state, req, ch, count, &sg_offset); + if (unlikely(n < 0)) + return n; + + count -= n; + for (i = 0; i < n; i++) + state->sg = sg_next(state->sg); + } + + return 0; +} + +static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch, + struct srp_request *req, struct scatterlist *scat, + int count) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct scatterlist *sg; + int i; + + for_each_sg(scat, sg, count, i) { + srp_map_desc(state, ib_sg_dma_address(dev->dev, sg), + ib_sg_dma_len(dev->dev, sg), + target->global_rkey); + } + + return 0; +} + +/* + * Register the indirect data buffer descriptor with the HCA. 
+ * + * Note: since the indirect data buffer descriptor has been allocated with + * kmalloc() it is guaranteed that this buffer is a physically contiguous + * memory buffer. + */ +static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, + void **next_mr, void **end_mr, u32 idb_len, + __be32 *idb_rkey) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct srp_map_state state; + struct srp_direct_buf idb_desc; + u64 idb_pages[1]; + struct scatterlist idb_sg[1]; + int ret; + + memset(&state, 0, sizeof(state)); + memset(&idb_desc, 0, sizeof(idb_desc)); + state.gen.next = next_mr; + state.gen.end = end_mr; + state.desc = &idb_desc; + state.base_dma_addr = req->indirect_dma_addr; + state.dma_len = idb_len; + + if (dev->use_fast_reg) { + state.sg = idb_sg; + sg_init_one(idb_sg, req->indirect_desc, idb_len); + idb_sg->dma_address = req->indirect_dma_addr; /* hack! */ +#ifdef CONFIG_NEED_SG_DMA_LENGTH + idb_sg->dma_length = idb_sg->length; /* hack^2 */ +#endif + ret = srp_map_finish_fr(&state, req, ch, 1, NULL); + if (ret < 0) + return ret; + WARN_ON_ONCE(ret < 1); + } else if (dev->use_fmr) { + state.pages = idb_pages; + state.pages[0] = (req->indirect_dma_addr & + dev->mr_page_mask); + state.npages = 1; + ret = srp_map_finish_fmr(&state, ch); + if (ret < 0) + return ret; + } else { + return -EINVAL; + } + + *idb_rkey = idb_desc.key; + + return 0; +} + +static void srp_check_mapping(struct srp_map_state *state, + struct srp_rdma_ch *ch, struct srp_request *req, + struct scatterlist *scat, int count) +{ + struct srp_device *dev = ch->target->srp_host->srp_dev; + struct srp_fr_desc **pfr; + u64 desc_len = 0, mr_len = 0; + int i; + + for (i = 0; i < state->ndesc; i++) + desc_len += be32_to_cpu(req->indirect_desc[i].len); + if (dev->use_fast_reg) + for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++) + mr_len += (*pfr)->mr->length; + else if (dev->use_fmr) + for (i = 0; i < state->nmdesc; i++) + mr_len += be32_to_cpu(req->indirect_desc[i].len); + if (desc_len != scsi_bufflen(req->scmnd) || + mr_len > scsi_bufflen(req->scmnd)) + pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n", + scsi_bufflen(req->scmnd), desc_len, mr_len, + state->ndesc, state->nmdesc); +} + +/** + * srp_map_data() - map SCSI data buffer onto an SRP request + * @scmnd: SCSI command to map + * @ch: SRP RDMA channel + * @req: SRP request + * + * Returns the length in bytes of the SRP_CMD IU or a negative value if + * mapping failed. 
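+ *
+ * A single mapped entry combined with a global rkey is described by a
+ * direct descriptor; everything else goes through an indirect descriptor
+ * table built via fast registration, FMR or the global DMA key.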
+ */ +static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, + struct srp_request *req) +{ + struct srp_target_port *target = ch->target; + struct scatterlist *scat; + struct srp_cmd *cmd = req->cmd->buf; + int len, nents, count, ret; + struct srp_device *dev; + struct ib_device *ibdev; + struct srp_map_state state; + struct srp_indirect_buf *indirect_hdr; + u32 idb_len, table_len; + __be32 idb_rkey; + u8 fmt; + + if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE) + return sizeof (struct srp_cmd); + + if (scmnd->sc_data_direction != DMA_FROM_DEVICE && + scmnd->sc_data_direction != DMA_TO_DEVICE) { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled data direction %d\n", + scmnd->sc_data_direction); + return -EINVAL; + } + + nents = scsi_sg_count(scmnd); + scat = scsi_sglist(scmnd); + + dev = target->srp_host->srp_dev; + ibdev = dev->dev; + + count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction); + if (unlikely(count == 0)) + return -EIO; + + fmt = SRP_DATA_DESC_DIRECT; + len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); + + if (count == 1 && target->global_rkey) { + /* + * The midlayer only generated a single gather/scatter + * entry, or DMA mapping coalesced everything to a + * single entry. So a direct descriptor along with + * the DMA MR suffices. + */ + struct srp_direct_buf *buf = (void *) cmd->add_data; + + buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat)); + buf->key = cpu_to_be32(target->global_rkey); + buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); + + req->nmdesc = 0; + goto map_complete; + } + + /* + * We have more than one scatter/gather entry, so build our indirect + * descriptor table, trying to merge as many entries as we can. + */ + indirect_hdr = (void *) cmd->add_data; + + ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr, + target->indirect_size, DMA_TO_DEVICE); + + memset(&state, 0, sizeof(state)); + state.desc = req->indirect_desc; + if (dev->use_fast_reg) + ret = srp_map_sg_fr(&state, ch, req, scat, count); + else if (dev->use_fmr) + ret = srp_map_sg_fmr(&state, ch, req, scat, count); + else + ret = srp_map_sg_dma(&state, ch, req, scat, count); + req->nmdesc = state.nmdesc; + if (ret < 0) + goto unmap; + + { + DEFINE_DYNAMIC_DEBUG_METADATA(ddm, + "Memory mapping consistency check"); + if (DYNAMIC_DEBUG_BRANCH(ddm)) + srp_check_mapping(&state, ch, req, scat, count); + } + + /* We've mapped the request, now pull as much of the indirect + * descriptor table as we can into the command buffer. If this + * target is not using an external indirect table, we are + * guaranteed to fit into the command, as the SCSI layer won't + * give us more S/G entries than we allow. + */ + if (state.ndesc == 1) { + /* + * Memory registration collapsed the sg-list into one entry, + * so use a direct descriptor. 
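+ *
+ * The single entry in req->indirect_desc already carries the proper
+ * rkey (either from the MR or the global key), so it can be copied
+ * into the SRP_CMD IU as-is.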
+ */ + struct srp_direct_buf *buf = (void *) cmd->add_data; + + *buf = req->indirect_desc[0]; + goto map_complete; + } + + if (unlikely(target->cmd_sg_cnt < state.ndesc && + !target->allow_ext_sg)) { + shost_printk(KERN_ERR, target->scsi_host, + "Could not fit S/G list into SRP_CMD\n"); + ret = -EIO; + goto unmap; + } + + count = min(state.ndesc, target->cmd_sg_cnt); + table_len = state.ndesc * sizeof (struct srp_direct_buf); + idb_len = sizeof(struct srp_indirect_buf) + table_len; + + fmt = SRP_DATA_DESC_INDIRECT; + len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf); + len += count * sizeof (struct srp_direct_buf); + + memcpy(indirect_hdr->desc_list, req->indirect_desc, + count * sizeof (struct srp_direct_buf)); + + if (!target->global_rkey) { + ret = srp_map_idb(ch, req, state.gen.next, state.gen.end, + idb_len, &idb_rkey); + if (ret < 0) + goto unmap; + req->nmdesc++; + } else { + idb_rkey = cpu_to_be32(target->global_rkey); + } + + indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr); + indirect_hdr->table_desc.key = idb_rkey; + indirect_hdr->table_desc.len = cpu_to_be32(table_len); + indirect_hdr->len = cpu_to_be32(state.total_len); + + if (scmnd->sc_data_direction == DMA_TO_DEVICE) + cmd->data_out_desc_cnt = count; + else + cmd->data_in_desc_cnt = count; + + ib_dma_sync_single_for_device(ibdev, req->indirect_dma_addr, table_len, + DMA_TO_DEVICE); + +map_complete: + if (scmnd->sc_data_direction == DMA_TO_DEVICE) + cmd->buf_fmt = fmt << 4; + else + cmd->buf_fmt = fmt; + + return len; + +unmap: + srp_unmap_data(scmnd, ch, req); + if (ret == -ENOMEM && req->nmdesc >= target->mr_pool_size) + ret = -E2BIG; + return ret; +} + +/* + * Return an IU and possible credit to the free pool + */ +static void srp_put_tx_iu(struct srp_rdma_ch *ch, struct srp_iu *iu, + enum srp_iu_type iu_type) +{ + unsigned long flags; + + spin_lock_irqsave(&ch->lock, flags); + list_add(&iu->list, &ch->free_tx); + if (iu_type != SRP_IU_RSP) + ++ch->req_lim; + spin_unlock_irqrestore(&ch->lock, flags); +} + +/* + * Must be called with ch->lock held to protect req_lim and free_tx. + * If IU is not sent, it must be returned using srp_put_tx_iu(). + * + * Note: + * An upper limit for the number of allocated information units for each + * request type is: + * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues + * more than Scsi_Host.can_queue requests. + * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE. + * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than + * one unanswered SRP request to an initiator. + */ +static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, + enum srp_iu_type iu_type) +{ + struct srp_target_port *target = ch->target; + s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; + struct srp_iu *iu; + + lockdep_assert_held(&ch->lock); + + ib_process_cq_direct(ch->send_cq, -1); + + if (list_empty(&ch->free_tx)) + return NULL; + + /* Initiator responses to target requests do not consume credits */ + if (iu_type != SRP_IU_RSP) { + if (ch->req_lim <= rsv) { + ++target->zero_req_lim; + return NULL; + } + + --ch->req_lim; + } + + iu = list_first_entry(&ch->free_tx, struct srp_iu, list); + list_del(&iu->list); + return iu; +} + +/* + * Note: if this function is called from inside ib_drain_sq() then it will + * be called without ch->lock being held. If ib_drain_sq() dequeues a WQE + * with status IB_WC_SUCCESS then that's a bug. 
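+ *
+ * Successful send completions are normally reaped by the
+ * ib_process_cq_direct() calls in __srp_get_tx_iu() and
+ * srp_destroy_qp(), both of which run with ch->lock held.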
+ */ +static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe); + struct srp_rdma_ch *ch = cq->cq_context; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + srp_handle_qp_err(cq, wc, "SEND"); + return; + } + + lockdep_assert_held(&ch->lock); + + list_add(&iu->list, &ch->free_tx); +} + +static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len) +{ + struct srp_target_port *target = ch->target; + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + + list.addr = iu->dma; + list.length = len; + list.lkey = target->lkey; + + iu->cqe.done = srp_send_done; + + wr.next = NULL; + wr.wr_cqe = &iu->cqe; + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + return ib_post_send(ch->qp, &wr, &bad_wr); +} + +static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu) +{ + struct srp_target_port *target = ch->target; + struct ib_recv_wr wr, *bad_wr; + struct ib_sge list; + + list.addr = iu->dma; + list.length = iu->size; + list.lkey = target->lkey; + + iu->cqe.done = srp_recv_done; + + wr.next = NULL; + wr.wr_cqe = &iu->cqe; + wr.sg_list = &list; + wr.num_sge = 1; + + return ib_post_recv(ch->qp, &wr, &bad_wr); +} + +static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) +{ + struct srp_target_port *target = ch->target; + struct srp_request *req; + struct scsi_cmnd *scmnd; + unsigned long flags; + + if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { + spin_lock_irqsave(&ch->lock, flags); + ch->req_lim += be32_to_cpu(rsp->req_lim_delta); + if (rsp->tag == ch->tsk_mgmt_tag) { + ch->tsk_mgmt_status = -1; + if (be32_to_cpu(rsp->resp_data_len) >= 4) + ch->tsk_mgmt_status = rsp->data[3]; + complete(&ch->tsk_mgmt_done); + } else { + shost_printk(KERN_ERR, target->scsi_host, + "Received tsk mgmt response too late for tag %#llx\n", + rsp->tag); + } + spin_unlock_irqrestore(&ch->lock, flags); + } else { + scmnd = scsi_host_find_tag(target->scsi_host, rsp->tag); + if (scmnd && scmnd->host_scribble) { + req = (void *)scmnd->host_scribble; + scmnd = srp_claim_req(ch, req, NULL, scmnd); + } else { + scmnd = NULL; + } + if (!scmnd) { + shost_printk(KERN_ERR, target->scsi_host, + "Null scmnd for RSP w/tag %#016llx received on ch %td / QP %#x\n", + rsp->tag, ch - target->ch, ch->qp->qp_num); + + spin_lock_irqsave(&ch->lock, flags); + ch->req_lim += be32_to_cpu(rsp->req_lim_delta); + spin_unlock_irqrestore(&ch->lock, flags); + + return; + } + scmnd->result = rsp->status; + + if (rsp->flags & SRP_RSP_FLAG_SNSVALID) { + memcpy(scmnd->sense_buffer, rsp->data + + be32_to_cpu(rsp->resp_data_len), + min_t(int, be32_to_cpu(rsp->sense_data_len), + SCSI_SENSE_BUFFERSIZE)); + } + + if (unlikely(rsp->flags & SRP_RSP_FLAG_DIUNDER)) + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DIOVER)) + scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_in_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOUNDER)) + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOOVER)) + scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_out_res_cnt)); + + srp_free_req(ch, req, scmnd, + be32_to_cpu(rsp->req_lim_delta)); + + scmnd->host_scribble = NULL; + scmnd->scsi_done(scmnd); + } +} + +static int srp_response_common(struct srp_rdma_ch *ch, s32 req_delta, + void *rsp, int len) +{ + struct srp_target_port *target = ch->target; + struct ib_device *dev = target->srp_host->srp_dev->dev; + 
unsigned long flags; + struct srp_iu *iu; + int err; + + spin_lock_irqsave(&ch->lock, flags); + ch->req_lim += req_delta; + iu = __srp_get_tx_iu(ch, SRP_IU_RSP); + spin_unlock_irqrestore(&ch->lock, flags); + + if (!iu) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "no IU available to send response\n"); + return 1; + } + + ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE); + memcpy(iu->buf, rsp, len); + ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE); + + err = srp_post_send(ch, iu, len); + if (err) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "unable to post response: %d\n", err); + srp_put_tx_iu(ch, iu, SRP_IU_RSP); + } + + return err; +} + +static void srp_process_cred_req(struct srp_rdma_ch *ch, + struct srp_cred_req *req) +{ + struct srp_cred_rsp rsp = { + .opcode = SRP_CRED_RSP, + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); + + if (srp_response_common(ch, delta, &rsp, sizeof(rsp))) + shost_printk(KERN_ERR, ch->target->scsi_host, PFX + "problems processing SRP_CRED_REQ\n"); +} + +static void srp_process_aer_req(struct srp_rdma_ch *ch, + struct srp_aer_req *req) +{ + struct srp_target_port *target = ch->target; + struct srp_aer_rsp rsp = { + .opcode = SRP_AER_RSP, + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); + + shost_printk(KERN_ERR, target->scsi_host, PFX + "ignoring AER for LUN %llu\n", scsilun_to_int(&req->lun)); + + if (srp_response_common(ch, delta, &rsp, sizeof(rsp))) + shost_printk(KERN_ERR, target->scsi_host, PFX + "problems processing SRP_AER_REQ\n"); +} + +static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe); + struct srp_rdma_ch *ch = cq->cq_context; + struct srp_target_port *target = ch->target; + struct ib_device *dev = target->srp_host->srp_dev->dev; + int res; + u8 opcode; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + srp_handle_qp_err(cq, wc, "RECV"); + return; + } + + ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_ti_iu_len, + DMA_FROM_DEVICE); + + opcode = *(u8 *) iu->buf; + + if (0) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "recv completion, opcode 0x%02x\n", opcode); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 8, 1, + iu->buf, wc->byte_len, true); + } + + switch (opcode) { + case SRP_RSP: + srp_process_rsp(ch, iu->buf); + break; + + case SRP_CRED_REQ: + srp_process_cred_req(ch, iu->buf); + break; + + case SRP_AER_REQ: + srp_process_aer_req(ch, iu->buf); + break; + + case SRP_T_LOGOUT: + /* XXX Handle target logout */ + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Got target logout request\n"); + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled SRP opcode 0x%02x\n", opcode); + break; + } + + ib_dma_sync_single_for_device(dev, iu->dma, ch->max_ti_iu_len, + DMA_FROM_DEVICE); + + res = srp_post_recv(ch, iu); + if (res != 0) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Recv failed with error code %d\n", res); +} + +/** + * srp_tl_err_work() - handle a transport layer error + * @work: Work structure embedded in an SRP target port. + * + * Note: This function may get invoked before the rport has been created, + * hence the target->rport test. 
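+ *
+ * The work is queued on system_long_wq by srp_handle_qp_err() and by
+ * srp_unmap_data() when posting an invalidation WR fails.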
+ */ +static void srp_tl_err_work(struct work_struct *work) +{ + struct srp_target_port *target; + + target = container_of(work, struct srp_target_port, tl_err_work); + if (target->rport) + srp_start_tl_fail_timers(target->rport); +} + +static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, + const char *opname) +{ + struct srp_rdma_ch *ch = cq->cq_context; + struct srp_target_port *target = ch->target; + + if (ch->connected && !target->qp_in_error) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %s (%d) for CQE %p\n", + opname, ib_wc_status_msg(wc->status), wc->status, + wc->wr_cqe); + queue_work(system_long_wq, &target->tl_err_work); + } + target->qp_in_error = true; +} + +static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(shost); + struct srp_rport *rport = target->rport; + struct srp_rdma_ch *ch; + struct srp_request *req; + struct srp_iu *iu; + struct srp_cmd *cmd; + struct ib_device *dev; + unsigned long flags; + u32 tag; + u16 idx; + int len, ret; + const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler; + + /* + * The SCSI EH thread is the only context from which srp_queuecommand() + * can get invoked for blocked devices (SDEV_BLOCK / + * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by + * locking the rport mutex if invoked from inside the SCSI EH. + */ + if (in_scsi_eh) + mutex_lock(&rport->mutex); + + scmnd->result = srp_chkready(target->rport); + if (unlikely(scmnd->result)) + goto err; + + WARN_ON_ONCE(scmnd->request->tag < 0); + tag = blk_mq_unique_tag(scmnd->request); + ch = &target->ch[blk_mq_unique_tag_to_hwq(tag)]; + idx = blk_mq_unique_tag_to_tag(tag); + WARN_ONCE(idx >= target->req_ring_size, "%s: tag %#x: idx %d >= %d\n", + dev_name(&shost->shost_gendev), tag, idx, + target->req_ring_size); + + spin_lock_irqsave(&ch->lock, flags); + iu = __srp_get_tx_iu(ch, SRP_IU_CMD); + spin_unlock_irqrestore(&ch->lock, flags); + + if (!iu) + goto err; + + req = &ch->req_ring[idx]; + dev = target->srp_host->srp_dev->dev; + ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len, + DMA_TO_DEVICE); + + scmnd->host_scribble = (void *) req; + + cmd = iu->buf; + memset(cmd, 0, sizeof *cmd); + + cmd->opcode = SRP_CMD; + int_to_scsilun(scmnd->device->lun, &cmd->lun); + cmd->tag = tag; + memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len); + + req->scmnd = scmnd; + req->cmd = iu; + + len = srp_map_data(scmnd, ch, req); + if (len < 0) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "Failed to map data (%d)\n", len); + /* + * If we ran out of memory descriptors (-ENOMEM) because an + * application is queuing many requests with more than + * max_pages_per_mr sg-list elements, tell the SCSI mid-layer + * to reduce queue depth temporarily. + */ + scmnd->result = len == -ENOMEM ? + DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16; + goto err_iu; + } + + ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len, + DMA_TO_DEVICE); + + if (srp_post_send(ch, iu, len)) { + shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n"); + goto err_unmap; + } + + ret = 0; + +unlock_rport: + if (in_scsi_eh) + mutex_unlock(&rport->mutex); + + return ret; + +err_unmap: + srp_unmap_data(scmnd, ch, req); + +err_iu: + srp_put_tx_iu(ch, iu, SRP_IU_CMD); + + /* + * Avoid that the loops that iterate over the request ring can + * encounter a dangling SCSI command pointer. 
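+ * Such loops run for instance from srp_terminate_io() and
+ * srp_reset_device().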
+ */ + req->scmnd = NULL; + +err: + if (scmnd->result) { + scmnd->scsi_done(scmnd); + ret = 0; + } else { + ret = SCSI_MLQUEUE_HOST_BUSY; + } + + goto unlock_rport; +} + +/* + * Note: the resources allocated in this function are freed in + * srp_free_ch_ib(). + */ +static int srp_alloc_iu_bufs(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + int i; + + ch->rx_ring = kcalloc(target->queue_size, sizeof(*ch->rx_ring), + GFP_KERNEL); + if (!ch->rx_ring) + goto err_no_ring; + ch->tx_ring = kcalloc(target->queue_size, sizeof(*ch->tx_ring), + GFP_KERNEL); + if (!ch->tx_ring) + goto err_no_ring; + + for (i = 0; i < target->queue_size; ++i) { + ch->rx_ring[i] = srp_alloc_iu(target->srp_host, + ch->max_ti_iu_len, + GFP_KERNEL, DMA_FROM_DEVICE); + if (!ch->rx_ring[i]) + goto err; + } + + for (i = 0; i < target->queue_size; ++i) { + ch->tx_ring[i] = srp_alloc_iu(target->srp_host, + target->max_iu_len, + GFP_KERNEL, DMA_TO_DEVICE); + if (!ch->tx_ring[i]) + goto err; + + list_add(&ch->tx_ring[i]->list, &ch->free_tx); + } + + return 0; + +err: + for (i = 0; i < target->queue_size; ++i) { + srp_free_iu(target->srp_host, ch->rx_ring[i]); + srp_free_iu(target->srp_host, ch->tx_ring[i]); + } + + +err_no_ring: + kfree(ch->tx_ring); + ch->tx_ring = NULL; + kfree(ch->rx_ring); + ch->rx_ring = NULL; + + return -ENOMEM; +} + +static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask) +{ + uint64_t T_tr_ns, max_compl_time_ms; + uint32_t rq_tmo_jiffies; + + /* + * According to section 11.2.4.2 in the IBTA spec (Modify Queue Pair, + * table 91), both the QP timeout and the retry count have to be set + * for RC QP's during the RTR to RTS transition. + */ + WARN_ON_ONCE((attr_mask & (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)) != + (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)); + + /* + * Set target->rq_tmo_jiffies to one second more than the largest time + * it can take before an error completion is generated. See also + * C9-140..142 in the IBTA spec for more information about how to + * convert the QP Local ACK Timeout value to nanoseconds. + */ + T_tr_ns = 4096 * (1ULL << qp_attr->timeout); + max_compl_time_ms = qp_attr->retry_cnt * 4 * T_tr_ns; + do_div(max_compl_time_ms, NSEC_PER_MSEC); + rq_tmo_jiffies = msecs_to_jiffies(max_compl_time_ms + 1000); + + return rq_tmo_jiffies; +} + +static void srp_cm_rep_handler(struct ib_cm_id *cm_id, + const struct srp_login_rsp *lrsp, + struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct ib_qp_attr *qp_attr = NULL; + int attr_mask = 0; + int ret = 0; + int i; + + if (lrsp->opcode == SRP_LOGIN_RSP) { + ch->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len); + ch->req_lim = be32_to_cpu(lrsp->req_lim_delta); + + /* + * Reserve credits for task management so we don't + * bounce requests back to the SCSI mid-layer. 
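+ * can_queue is therefore capped at req_lim - SRP_TSK_MGMT_SQ_SIZE below.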
+ */ + target->scsi_host->can_queue + = min(ch->req_lim - SRP_TSK_MGMT_SQ_SIZE, + target->scsi_host->can_queue); + target->scsi_host->cmd_per_lun + = min_t(int, target->scsi_host->can_queue, + target->scsi_host->cmd_per_lun); + } else { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); + ret = -ECONNRESET; + goto error; + } + + if (!ch->rx_ring) { + ret = srp_alloc_iu_bufs(ch); + if (ret) + goto error; + } + + for (i = 0; i < target->queue_size; i++) { + struct srp_iu *iu = ch->rx_ring[i]; + + ret = srp_post_recv(ch, iu); + if (ret) + goto error; + } + + if (!target->using_rdma_cm) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof(*qp_attr), GFP_KERNEL); + if (!qp_attr) + goto error; + + qp_attr->qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + qp_attr->qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + + ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + } + +error_free: + kfree(qp_attr); + +error: + ch->status = ret; +} + +static void srp_ib_cm_rej_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event, + struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct Scsi_Host *shost = target->scsi_host; + struct ib_class_port_info *cpi; + int opcode; + u16 dlid; + + switch (event->param.rej_rcvd.reason) { + case IB_CM_REJ_PORT_CM_REDIRECT: + cpi = event->param.rej_rcvd.ari; + dlid = be16_to_cpu(cpi->redirect_lid); + sa_path_set_dlid(&ch->ib_cm.path, dlid); + ch->ib_cm.path.pkey = cpi->redirect_pkey; + cm_id->remote_cm_qpn = be32_to_cpu(cpi->redirect_qp) & 0x00ffffff; + memcpy(ch->ib_cm.path.dgid.raw, cpi->redirect_gid, 16); + + ch->status = dlid ? SRP_DLID_REDIRECT : SRP_PORT_REDIRECT; + break; + + case IB_CM_REJ_PORT_REDIRECT: + if (srp_target_is_topspin(target)) { + union ib_gid *dgid = &ch->ib_cm.path.dgid; + + /* + * Topspin/Cisco SRP gateways incorrectly send + * reject reason code 25 when they mean 24 + * (port redirect). 
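+ * Treat such rejects as a port redirect and retry with the target
+ * port GID carried in the ARI field.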
+ */ + memcpy(dgid->raw, event->param.rej_rcvd.ari, 16); + + shost_printk(KERN_DEBUG, shost, + PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n", + be64_to_cpu(dgid->global.subnet_prefix), + be64_to_cpu(dgid->global.interface_id)); + + ch->status = SRP_PORT_REDIRECT; + } else { + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_PORT_REDIRECT\n"); + ch->status = -ECONNRESET; + } + break; + + case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + opcode = *(u8 *) event->private_data; + if (opcode == SRP_LOGIN_REJ) { + struct srp_login_rej *rej = event->private_data; + u32 reason = be32_to_cpu(rej->reason); + + if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) + shost_printk(KERN_WARNING, shost, + PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); + else + shost_printk(KERN_WARNING, shost, PFX + "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n", + target->sgid.raw, + target->ib_cm.orig_dgid.raw, + reason); + } else + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," + " opcode 0x%02x\n", opcode); + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_STALE_CONN: + shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); + ch->status = SRP_STALE_CONN; + break; + + default: + shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", + event->param.rej_rcvd.reason); + ch->status = -ECONNRESET; + } +} + +static int srp_ib_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct srp_rdma_ch *ch = cm_id->context; + struct srp_target_port *target = ch->target; + int comp = 0; + + switch (event->event) { + case IB_CM_REQ_ERROR: + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM REQ failed\n"); + comp = 1; + ch->status = -ECONNRESET; + break; + + case IB_CM_REP_RECEIVED: + comp = 1; + srp_cm_rep_handler(cm_id, event->private_data, ch); + break; + + case IB_CM_REJ_RECEIVED: + shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); + comp = 1; + + srp_ib_cm_rej_handler(cm_id, event, ch); + break; + + case IB_CM_DREQ_RECEIVED: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "DREQ received - connection closed\n"); + ch->connected = false; + if (ib_send_cm_drep(cm_id, NULL, 0)) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Sending CM DREP failed\n"); + queue_work(system_long_wq, &target->tl_err_work); + break; + + case IB_CM_TIMEWAIT_EXIT: + shost_printk(KERN_ERR, target->scsi_host, + PFX "connection closed\n"); + comp = 1; + + ch->status = 0; + break; + + case IB_CM_MRA_RECEIVED: + case IB_CM_DREQ_ERROR: + case IB_CM_DREP_RECEIVED: + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled CM event %d\n", event->event); + break; + } + + if (comp) + complete(&ch->done); + + return 0; +} + +static void srp_rdma_cm_rej_handler(struct srp_rdma_ch *ch, + struct rdma_cm_event *event) +{ + struct srp_target_port *target = ch->target; + struct Scsi_Host *shost = target->scsi_host; + int opcode; + + switch (event->status) { + case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + opcode = *(u8 *) event->param.conn.private_data; + if (opcode == SRP_LOGIN_REJ) { + struct srp_login_rej *rej = + (struct srp_login_rej *) + event->param.conn.private_data; + 
u32 reason = be32_to_cpu(rej->reason); + + if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) + shost_printk(KERN_WARNING, shost, + PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); + else + shost_printk(KERN_WARNING, shost, + PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason); + } else { + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_CONSUMER_DEFINED, opcode 0x%02x\n", + opcode); + } + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_STALE_CONN: + shost_printk(KERN_WARNING, shost, + " REJ reason: stale connection\n"); + ch->status = SRP_STALE_CONN; + break; + + default: + shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", + event->status); + ch->status = -ECONNRESET; + break; + } +} + +static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct srp_rdma_ch *ch = cm_id->context; + struct srp_target_port *target = ch->target; + int comp = 0; + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ch->status = 0; + comp = 1; + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + ch->status = -ENXIO; + comp = 1; + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ch->status = 0; + comp = 1; + break; + + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + ch->status = -EHOSTUNREACH; + comp = 1; + break; + + case RDMA_CM_EVENT_CONNECT_ERROR: + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM REQ failed\n"); + comp = 1; + ch->status = -ECONNRESET; + break; + + case RDMA_CM_EVENT_ESTABLISHED: + comp = 1; + srp_cm_rep_handler(NULL, event->param.conn.private_data, ch); + break; + + case RDMA_CM_EVENT_REJECTED: + shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); + comp = 1; + + srp_rdma_cm_rej_handler(ch, event); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + if (ch->connected) { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "received DREQ\n"); + rdma_disconnect(ch->rdma_cm.cm_id); + comp = 1; + ch->status = 0; + queue_work(system_long_wq, &target->tl_err_work); + } + break; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + shost_printk(KERN_ERR, target->scsi_host, + PFX "connection closed\n"); + + comp = 1; + ch->status = 0; + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled CM event %d\n", event->event); + break; + } + + if (comp) + complete(&ch->done); + + return 0; +} + +/** + * srp_change_queue_depth - setting device queue depth + * @sdev: scsi device struct + * @qdepth: requested queue depth + * + * Returns queue depth. + */ +static int +srp_change_queue_depth(struct scsi_device *sdev, int qdepth) +{ + if (!sdev->tagged_supported) + qdepth = 1; + return scsi_change_queue_depth(sdev, qdepth); +} + +static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun, + u8 func, u8 *status) +{ + struct srp_target_port *target = ch->target; + struct srp_rport *rport = target->rport; + struct ib_device *dev = target->srp_host->srp_dev->dev; + struct srp_iu *iu; + struct srp_tsk_mgmt *tsk_mgmt; + int res; + + if (!ch->connected || target->qp_in_error) + return -1; + + /* + * Lock the rport mutex to avoid that srp_create_ch_ib() is + * invoked while a task management function is being sent. 
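+ * srp_create_ch_ib() is called from srp_rport_reconnect(), which the
+ * transport layer invokes with the rport mutex held.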
+ */ + mutex_lock(&rport->mutex); + spin_lock_irq(&ch->lock); + iu = __srp_get_tx_iu(ch, SRP_IU_TSK_MGMT); + spin_unlock_irq(&ch->lock); + + if (!iu) { + mutex_unlock(&rport->mutex); + + return -1; + } + + ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, + DMA_TO_DEVICE); + tsk_mgmt = iu->buf; + memset(tsk_mgmt, 0, sizeof *tsk_mgmt); + + tsk_mgmt->opcode = SRP_TSK_MGMT; + int_to_scsilun(lun, &tsk_mgmt->lun); + tsk_mgmt->tsk_mgmt_func = func; + tsk_mgmt->task_tag = req_tag; + + spin_lock_irq(&ch->lock); + ch->tsk_mgmt_tag = (ch->tsk_mgmt_tag + 1) | SRP_TAG_TSK_MGMT; + tsk_mgmt->tag = ch->tsk_mgmt_tag; + spin_unlock_irq(&ch->lock); + + init_completion(&ch->tsk_mgmt_done); + + ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt, + DMA_TO_DEVICE); + if (srp_post_send(ch, iu, sizeof(*tsk_mgmt))) { + srp_put_tx_iu(ch, iu, SRP_IU_TSK_MGMT); + mutex_unlock(&rport->mutex); + + return -1; + } + res = wait_for_completion_timeout(&ch->tsk_mgmt_done, + msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)); + if (res > 0 && status) + *status = ch->tsk_mgmt_status; + mutex_unlock(&rport->mutex); + + WARN_ON_ONCE(res < 0); + + return res > 0 ? 0 : -1; +} + +static int srp_abort(struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_request *req = (struct srp_request *) scmnd->host_scribble; + u32 tag; + u16 ch_idx; + struct srp_rdma_ch *ch; + int ret; + + shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); + + if (!req) + return SUCCESS; + tag = blk_mq_unique_tag(scmnd->request); + ch_idx = blk_mq_unique_tag_to_hwq(tag); + if (WARN_ON_ONCE(ch_idx >= target->ch_count)) + return SUCCESS; + ch = &target->ch[ch_idx]; + if (!srp_claim_req(ch, req, NULL, scmnd)) + return SUCCESS; + shost_printk(KERN_ERR, target->scsi_host, + "Sending SRP abort for tag %#x\n", tag); + if (srp_send_tsk_mgmt(ch, tag, scmnd->device->lun, + SRP_TSK_ABORT_TASK, NULL) == 0) + ret = SUCCESS; + else if (target->rport->state == SRP_RPORT_LOST) + ret = FAST_IO_FAIL; + else + ret = FAILED; + if (ret == SUCCESS) { + srp_free_req(ch, req, scmnd, 0); + scmnd->result = DID_ABORT << 16; + scmnd->scsi_done(scmnd); + } + + return ret; +} + +static int srp_reset_device(struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_rdma_ch *ch; + int i; + u8 status; + + shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); + + ch = &target->ch[0]; + if (srp_send_tsk_mgmt(ch, SRP_TAG_NO_REQ, scmnd->device->lun, + SRP_TSK_LUN_RESET, &status)) + return FAILED; + if (status) + return FAILED; + + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &ch->req_ring[i]; + + srp_finish_req(ch, req, scmnd->device, DID_RESET << 16); + } + } + + return SUCCESS; +} + +static int srp_reset_host(struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(scmnd->device->host); + + shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); + + return srp_reconnect_rport(target->rport) == 0 ? 
SUCCESS : FAILED; +} + +static int srp_target_alloc(struct scsi_target *starget) +{ + struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); + struct srp_target_port *target = host_to_target(shost); + + if (target->target_can_queue) + starget->can_queue = target->target_can_queue; + return 0; +} + +static int srp_slave_alloc(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct srp_target_port *target = host_to_target(shost); + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + + if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + blk_queue_virt_boundary(sdev->request_queue, + ~srp_dev->mr_page_mask); + + return 0; +} + +static int srp_slave_configure(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct srp_target_port *target = host_to_target(shost); + struct request_queue *q = sdev->request_queue; + unsigned long timeout; + + if (sdev->type == TYPE_DISK) { + timeout = max_t(unsigned, 30 * HZ, target->rq_tmo_jiffies); + blk_queue_rq_timeout(q, timeout); + } + + return 0; +} + +static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "0x%016llx\n", be64_to_cpu(target->id_ext)); +} + +static ssize_t show_ioc_guid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "0x%016llx\n", be64_to_cpu(target->ioc_guid)); +} + +static ssize_t show_service_id(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "0x%016llx\n", + be64_to_cpu(target->ib_cm.service_id)); +} + +static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "0x%04x\n", be16_to_cpu(target->ib_cm.pkey)); +} + +static ssize_t show_sgid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->sgid.raw); +} + +static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + struct srp_rdma_ch *ch = &target->ch[0]; + + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "%pI6\n", ch->ib_cm.path.dgid.raw); +} + +static ssize_t show_orig_dgid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "%pI6\n", target->ib_cm.orig_dgid.raw); +} + +static ssize_t show_req_lim(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + struct srp_rdma_ch *ch; + int i, req_lim = INT_MAX; + + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + req_lim = min(req_lim, ch->req_lim); + } + return sprintf(buf, "%d\n", req_lim); +} + +static ssize_t show_zero_req_lim(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = 
host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->zero_req_lim); +} + +static ssize_t show_local_ib_port(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->srp_host->port); +} + +static ssize_t show_local_ib_device(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); +} + +static ssize_t show_ch_count(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->ch_count); +} + +static ssize_t show_comp_vector(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->comp_vector); +} + +static ssize_t show_tl_retry_count(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->tl_retry_count); +} + +static ssize_t show_cmd_sg_entries(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%u\n", target->cmd_sg_cnt); +} + +static ssize_t show_allow_ext_sg(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%s\n", target->allow_ext_sg ? "true" : "false"); +} + +static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); +static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL); +static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); +static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); +static DEVICE_ATTR(sgid, S_IRUGO, show_sgid, NULL); +static DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL); +static DEVICE_ATTR(orig_dgid, S_IRUGO, show_orig_dgid, NULL); +static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); +static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); +static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); +static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); +static DEVICE_ATTR(ch_count, S_IRUGO, show_ch_count, NULL); +static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL); +static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL); +static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); +static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL); + +static struct device_attribute *srp_host_attrs[] = { + &dev_attr_id_ext, + &dev_attr_ioc_guid, + &dev_attr_service_id, + &dev_attr_pkey, + &dev_attr_sgid, + &dev_attr_dgid, + &dev_attr_orig_dgid, + &dev_attr_req_lim, + &dev_attr_zero_req_lim, + &dev_attr_local_ib_port, + &dev_attr_local_ib_device, + &dev_attr_ch_count, + &dev_attr_comp_vector, + &dev_attr_tl_retry_count, + &dev_attr_cmd_sg_entries, + &dev_attr_allow_ext_sg, + NULL +}; + +static struct scsi_host_template srp_template = { + .module = THIS_MODULE, + .name = "InfiniBand SRP initiator", + .proc_name = DRV_NAME, + .target_alloc = srp_target_alloc, + .slave_alloc = srp_slave_alloc, + .slave_configure = srp_slave_configure, + .info = srp_target_info, + .queuecommand = srp_queuecommand, + 
.change_queue_depth = srp_change_queue_depth, + .eh_timed_out = srp_timed_out, + .eh_abort_handler = srp_abort, + .eh_device_reset_handler = srp_reset_device, + .eh_host_reset_handler = srp_reset_host, + .skip_settle_delay = true, + .sg_tablesize = SRP_DEF_SG_TABLESIZE, + .can_queue = SRP_DEFAULT_CMD_SQ_SIZE, + .this_id = -1, + .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE, + .use_clustering = ENABLE_CLUSTERING, + .shost_attrs = srp_host_attrs, + .track_queue_depth = 1, +}; + +static int srp_sdev_count(struct Scsi_Host *host) +{ + struct scsi_device *sdev; + int c = 0; + + shost_for_each_device(sdev, host) + c++; + + return c; +} + +/* + * Return values: + * < 0 upon failure. Caller is responsible for SRP target port cleanup. + * 0 and target->state == SRP_TARGET_REMOVED if asynchronous target port + * removal has been scheduled. + * 0 and target->state != SRP_TARGET_REMOVED upon success. + */ +static int srp_add_target(struct srp_host *host, struct srp_target_port *target) +{ + struct srp_rport_identifiers ids; + struct srp_rport *rport; + + target->state = SRP_TARGET_SCANNING; + sprintf(target->target_name, "SRP.T10:%016llX", + be64_to_cpu(target->id_ext)); + + if (scsi_add_host(target->scsi_host, host->srp_dev->dev->dev.parent)) + return -ENODEV; + + memcpy(ids.port_id, &target->id_ext, 8); + memcpy(ids.port_id + 8, &target->ioc_guid, 8); + ids.roles = SRP_RPORT_ROLE_TARGET; + rport = srp_rport_add(target->scsi_host, &ids); + if (IS_ERR(rport)) { + scsi_remove_host(target->scsi_host); + return PTR_ERR(rport); + } + + rport->lld_data = target; + target->rport = rport; + + spin_lock(&host->target_lock); + list_add_tail(&target->list, &host->target_list); + spin_unlock(&host->target_lock); + + scsi_scan_target(&target->scsi_host->shost_gendev, + 0, target->scsi_id, SCAN_WILD_CARD, SCSI_SCAN_INITIAL); + + if (srp_connected_ch(target) < target->ch_count || + target->qp_in_error) { + shost_printk(KERN_INFO, target->scsi_host, + PFX "SCSI scan failed - removing SCSI host\n"); + srp_queue_remove_work(target); + goto out; + } + + pr_debug("%s: SCSI scan succeeded - detected %d LUNs\n", + dev_name(&target->scsi_host->shost_gendev), + srp_sdev_count(target->scsi_host)); + + spin_lock_irq(&target->lock); + if (target->state == SRP_TARGET_SCANNING) + target->state = SRP_TARGET_LIVE; + spin_unlock_irq(&target->lock); + +out: + return 0; +} + +static void srp_release_dev(struct device *dev) +{ + struct srp_host *host = + container_of(dev, struct srp_host, dev); + + complete(&host->released); +} + +static struct class srp_class = { + .name = "infiniband_srp", + .dev_release = srp_release_dev +}; + +/** + * srp_conn_unique() - check whether the connection to a target is unique + * @host: SRP host. + * @target: SRP target port. + */ +static bool srp_conn_unique(struct srp_host *host, + struct srp_target_port *target) +{ + struct srp_target_port *t; + bool ret = false; + + if (target->state == SRP_TARGET_REMOVED) + goto out; + + ret = true; + + spin_lock(&host->target_lock); + list_for_each_entry(t, &host->target_list, list) { + if (t != target && + target->id_ext == t->id_ext && + target->ioc_guid == t->ioc_guid && + target->initiator_ext == t->initiator_ext) { + ret = false; + break; + } + } + spin_unlock(&host->target_lock); + +out: + return ret; +} + +/* + * Target ports are added by writing + * + * id_ext=,ioc_guid=,dgid=, + * pkey=,service_id= + * or + * id_ext=,ioc_guid=, + * [src=,]dest=: + * + * to the add_target sysfs attribute. 
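+ *
+ * The IB/CM form takes the SRP identifier extension, the I/O controller
+ * GUID, the destination GID, the P_Key and the SRP service ID as values;
+ * the RDMA/CM form takes the identifier extension, the I/O controller
+ * GUID, an optional source address and a destination address:port pair.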
+ */
+enum {
+	SRP_OPT_ERR		= 0,
+	SRP_OPT_ID_EXT		= 1 << 0,
+	SRP_OPT_IOC_GUID	= 1 << 1,
+	SRP_OPT_DGID		= 1 << 2,
+	SRP_OPT_PKEY		= 1 << 3,
+	SRP_OPT_SERVICE_ID	= 1 << 4,
+	SRP_OPT_MAX_SECT	= 1 << 5,
+	SRP_OPT_MAX_CMD_PER_LUN	= 1 << 6,
+	SRP_OPT_IO_CLASS	= 1 << 7,
+	SRP_OPT_INITIATOR_EXT	= 1 << 8,
+	SRP_OPT_CMD_SG_ENTRIES	= 1 << 9,
+	SRP_OPT_ALLOW_EXT_SG	= 1 << 10,
+	SRP_OPT_SG_TABLESIZE	= 1 << 11,
+	SRP_OPT_COMP_VECTOR	= 1 << 12,
+	SRP_OPT_TL_RETRY_COUNT	= 1 << 13,
+	SRP_OPT_QUEUE_SIZE	= 1 << 14,
+	SRP_OPT_IP_SRC		= 1 << 15,
+	SRP_OPT_IP_DEST		= 1 << 16,
+	SRP_OPT_TARGET_CAN_QUEUE= 1 << 17,
+};
+
+static unsigned int srp_opt_mandatory[] = {
+	SRP_OPT_ID_EXT		|
+	SRP_OPT_IOC_GUID	|
+	SRP_OPT_DGID		|
+	SRP_OPT_PKEY		|
+	SRP_OPT_SERVICE_ID,
+	SRP_OPT_ID_EXT		|
+	SRP_OPT_IOC_GUID	|
+	SRP_OPT_IP_DEST,
+};
+
+static const match_table_t srp_opt_tokens = {
+	{ SRP_OPT_ID_EXT,		"id_ext=%s"		},
+	{ SRP_OPT_IOC_GUID,		"ioc_guid=%s"		},
+	{ SRP_OPT_DGID,			"dgid=%s"		},
+	{ SRP_OPT_PKEY,			"pkey=%x"		},
+	{ SRP_OPT_SERVICE_ID,		"service_id=%s"		},
+	{ SRP_OPT_MAX_SECT,		"max_sect=%d"		},
+	{ SRP_OPT_MAX_CMD_PER_LUN,	"max_cmd_per_lun=%d"	},
+	{ SRP_OPT_TARGET_CAN_QUEUE,	"target_can_queue=%d"	},
+	{ SRP_OPT_IO_CLASS,		"io_class=%x"		},
+	{ SRP_OPT_INITIATOR_EXT,	"initiator_ext=%s"	},
+	{ SRP_OPT_CMD_SG_ENTRIES,	"cmd_sg_entries=%u"	},
+	{ SRP_OPT_ALLOW_EXT_SG,		"allow_ext_sg=%u"	},
+	{ SRP_OPT_SG_TABLESIZE,		"sg_tablesize=%u"	},
+	{ SRP_OPT_COMP_VECTOR,		"comp_vector=%u"	},
+	{ SRP_OPT_TL_RETRY_COUNT,	"tl_retry_count=%u"	},
+	{ SRP_OPT_QUEUE_SIZE,		"queue_size=%d"		},
+	{ SRP_OPT_IP_SRC,		"src=%s"		},
+	{ SRP_OPT_IP_DEST,		"dest=%s"		},
+	{ SRP_OPT_ERR,			NULL			}
+};
+
+/**
+ * srp_parse_in - parse an IP address and port number combination
+ *
+ * Parse the following address formats:
+ * - IPv4: <ip4>:<port>, e.g. 1.2.3.4:5.
+ * - IPv6: \[<ipv6>\]:<port>, e.g. [1::2:3%4]:5.
+ */ +static int srp_parse_in(struct net *net, struct sockaddr_storage *sa, + const char *addr_port_str) +{ + char *addr_end, *addr = kstrdup(addr_port_str, GFP_KERNEL); + char *port_str; + int ret; + + if (!addr) + return -ENOMEM; + port_str = strrchr(addr, ':'); + if (!port_str) + return -EINVAL; + *port_str++ = '\0'; + ret = inet_pton_with_scope(net, AF_INET, addr, port_str, sa); + if (ret && addr[0]) { + addr_end = addr + strlen(addr) - 1; + if (addr[0] == '[' && *addr_end == ']') { + *addr_end = '\0'; + ret = inet_pton_with_scope(net, AF_INET6, addr + 1, + port_str, sa); + } + } + kfree(addr); + pr_debug("%s -> %pISpfsc\n", addr_port_str, sa); + return ret; +} + +static int srp_parse_options(struct net *net, const char *buf, + struct srp_target_port *target) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + unsigned long long ull; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = options; + while ((p = strsep(&sep_opt, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, srp_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case SRP_OPT_ID_EXT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("invalid id_ext parameter '%s'\n", p); + kfree(p); + goto out; + } + target->id_ext = cpu_to_be64(ull); + kfree(p); + break; + + case SRP_OPT_IOC_GUID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("invalid ioc_guid parameter '%s'\n", p); + kfree(p); + goto out; + } + target->ioc_guid = cpu_to_be64(ull); + kfree(p); + break; + + case SRP_OPT_DGID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) != 32) { + pr_warn("bad dest GID parameter '%s'\n", p); + kfree(p); + goto out; + } + + ret = hex2bin(target->ib_cm.orig_dgid.raw, p, 16); + kfree(p); + if (ret < 0) + goto out; + break; + + case SRP_OPT_PKEY: + if (match_hex(args, &token)) { + pr_warn("bad P_Key parameter '%s'\n", p); + goto out; + } + target->ib_cm.pkey = cpu_to_be16(token); + break; + + case SRP_OPT_SERVICE_ID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("bad service_id parameter '%s'\n", p); + kfree(p); + goto out; + } + target->ib_cm.service_id = cpu_to_be64(ull); + kfree(p); + break; + + case SRP_OPT_IP_SRC: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = srp_parse_in(net, &target->rdma_cm.src.ss, p); + if (ret < 0) { + pr_warn("bad source parameter '%s'\n", p); + kfree(p); + goto out; + } + target->rdma_cm.src_specified = true; + kfree(p); + break; + + case SRP_OPT_IP_DEST: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p); + if (ret < 0) { + pr_warn("bad dest parameter '%s'\n", p); + kfree(p); + goto out; + } + target->using_rdma_cm = true; + kfree(p); + break; + + case SRP_OPT_MAX_SECT: + if (match_int(args, &token)) { + pr_warn("bad max sect parameter '%s'\n", p); + goto out; + } + target->scsi_host->max_sectors = token; + break; + + case SRP_OPT_QUEUE_SIZE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad queue_size parameter '%s'\n", p); + goto out; + } + target->scsi_host->can_queue = token; + target->queue_size = token + SRP_RSP_SQ_SIZE + + SRP_TSK_MGMT_SQ_SIZE; + if (!(opt_mask & 
SRP_OPT_MAX_CMD_PER_LUN)) + target->scsi_host->cmd_per_lun = token; + break; + + case SRP_OPT_MAX_CMD_PER_LUN: + if (match_int(args, &token) || token < 1) { + pr_warn("bad max cmd_per_lun parameter '%s'\n", + p); + goto out; + } + target->scsi_host->cmd_per_lun = token; + break; + + case SRP_OPT_TARGET_CAN_QUEUE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad max target_can_queue parameter '%s'\n", + p); + goto out; + } + target->target_can_queue = token; + break; + + case SRP_OPT_IO_CLASS: + if (match_hex(args, &token)) { + pr_warn("bad IO class parameter '%s'\n", p); + goto out; + } + if (token != SRP_REV10_IB_IO_CLASS && + token != SRP_REV16A_IB_IO_CLASS) { + pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n", + token, SRP_REV10_IB_IO_CLASS, + SRP_REV16A_IB_IO_CLASS); + goto out; + } + target->io_class = token; + break; + + case SRP_OPT_INITIATOR_EXT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("bad initiator_ext value '%s'\n", p); + kfree(p); + goto out; + } + target->initiator_ext = cpu_to_be64(ull); + kfree(p); + break; + + case SRP_OPT_CMD_SG_ENTRIES: + if (match_int(args, &token) || token < 1 || token > 255) { + pr_warn("bad max cmd_sg_entries parameter '%s'\n", + p); + goto out; + } + target->cmd_sg_cnt = token; + break; + + case SRP_OPT_ALLOW_EXT_SG: + if (match_int(args, &token)) { + pr_warn("bad allow_ext_sg parameter '%s'\n", p); + goto out; + } + target->allow_ext_sg = !!token; + break; + + case SRP_OPT_SG_TABLESIZE: + if (match_int(args, &token) || token < 1 || + token > SG_MAX_SEGMENTS) { + pr_warn("bad max sg_tablesize parameter '%s'\n", + p); + goto out; + } + target->sg_tablesize = token; + break; + + case SRP_OPT_COMP_VECTOR: + if (match_int(args, &token) || token < 0) { + pr_warn("bad comp_vector parameter '%s'\n", p); + goto out; + } + target->comp_vector = token; + break; + + case SRP_OPT_TL_RETRY_COUNT: + if (match_int(args, &token) || token < 2 || token > 7) { + pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", + p); + goto out; + } + target->tl_retry_count = token; + break; + + default: + pr_warn("unknown parameter or missing value '%s' in target creation request\n", + p); + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(srp_opt_mandatory); i++) { + if ((opt_mask & srp_opt_mandatory[i]) == srp_opt_mandatory[i]) { + ret = 0; + break; + } + } + if (ret) + pr_warn("target creation request is missing one or more parameters\n"); + + if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue + && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + pr_warn("cmd_per_lun = %d > queue_size = %d\n", + target->scsi_host->cmd_per_lun, + target->scsi_host->can_queue); + +out: + kfree(options); + return ret; +} + +static ssize_t srp_create_target(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_host *host = + container_of(dev, struct srp_host, dev); + struct Scsi_Host *target_host; + struct srp_target_port *target; + struct srp_rdma_ch *ch; + struct srp_device *srp_dev = host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + int ret, node_idx, node, cpu, i; + unsigned int max_sectors_per_mr, mr_per_cmd = 0; + bool multich = false; + + target_host = scsi_host_alloc(&srp_template, + sizeof (struct srp_target_port)); + if (!target_host) + return -ENOMEM; + + target_host->transportt = ib_srp_transport_template; + target_host->max_channel = 0; + target_host->max_id = 1; + 
target_host->max_lun = -1LL; + target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb; + + target = host_to_target(target_host); + + target->net = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + target->io_class = SRP_REV16A_IB_IO_CLASS; + target->scsi_host = target_host; + target->srp_host = host; + target->lkey = host->srp_dev->pd->local_dma_lkey; + target->global_rkey = host->srp_dev->global_rkey; + target->cmd_sg_cnt = cmd_sg_entries; + target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; + target->allow_ext_sg = allow_ext_sg; + target->tl_retry_count = 7; + target->queue_size = SRP_DEFAULT_QUEUE_SIZE; + + /* + * Avoid that the SCSI host can be removed by srp_remove_target() + * before this function returns. + */ + scsi_host_get(target->scsi_host); + + ret = mutex_lock_interruptible(&host->add_target_mutex); + if (ret < 0) + goto put; + + ret = srp_parse_options(target->net, buf, target); + if (ret) + goto out; + + target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; + + if (!srp_conn_unique(target->srp_host, target)) { + if (target->using_rdma_cm) { + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;dest=%pIS\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + &target->rdma_cm.dst); + } else { + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be64_to_cpu(target->initiator_ext)); + } + ret = -EEXIST; + goto out; + } + + if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg && + target->cmd_sg_cnt < target->sg_tablesize) { + pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); + target->sg_tablesize = target->cmd_sg_cnt; + } + + if (srp_dev->use_fast_reg || srp_dev->use_fmr) { + bool gaps_reg = (ibdev->attrs.device_cap_flags & + IB_DEVICE_SG_GAPS_REG); + + max_sectors_per_mr = srp_dev->max_pages_per_mr << + (ilog2(srp_dev->mr_page_size) - 9); + if (!gaps_reg) { + /* + * FR and FMR can only map one HCA page per entry. If + * the start address is not aligned on a HCA page + * boundary two entries will be used for the head and + * the tail although these two entries combined + * contain at most one HCA page of data. Hence the "+ + * 1" in the calculation below. + * + * The indirect data buffer descriptor is contiguous + * so the memory for that buffer will only be + * registered if register_always is true. Hence add + * one to mr_per_cmd if register_always has been set. 
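+ *
+ * For example, with a 4 KiB mr_page_size, max_pages_per_mr = 512 and
+ * max_sectors = 1024: max_sectors_per_mr = 512 << 3 = 4096 and
+ * mr_per_cmd = register_always + (1024 + 1 + 4095) / 4096 =
+ * register_always + 1.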
+ */ + mr_per_cmd = register_always + + (target->scsi_host->max_sectors + 1 + + max_sectors_per_mr - 1) / max_sectors_per_mr; + } else { + mr_per_cmd = register_always + + (target->sg_tablesize + + srp_dev->max_pages_per_mr - 1) / + srp_dev->max_pages_per_mr; + } + pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n", + target->scsi_host->max_sectors, srp_dev->max_pages_per_mr, srp_dev->mr_page_size, + max_sectors_per_mr, mr_per_cmd); + } + + target_host->sg_tablesize = target->sg_tablesize; + target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd; + target->mr_per_cmd = mr_per_cmd; + target->indirect_size = target->sg_tablesize * + sizeof (struct srp_direct_buf); + target->max_iu_len = sizeof (struct srp_cmd) + + sizeof (struct srp_indirect_buf) + + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + + INIT_WORK(&target->tl_err_work, srp_tl_err_work); + INIT_WORK(&target->remove_work, srp_remove_work); + spin_lock_init(&target->lock); + ret = ib_query_gid(ibdev, host->port, 0, &target->sgid, NULL); + if (ret) + goto out; + + ret = -ENOMEM; + target->ch_count = max_t(unsigned, num_online_nodes(), + min(ch_count ? : + min(4 * num_online_nodes(), + ibdev->num_comp_vectors), + num_online_cpus())); + target->ch = kcalloc(target->ch_count, sizeof(*target->ch), + GFP_KERNEL); + if (!target->ch) + goto out; + + node_idx = 0; + for_each_online_node(node) { + const int ch_start = (node_idx * target->ch_count / + num_online_nodes()); + const int ch_end = ((node_idx + 1) * target->ch_count / + num_online_nodes()); + const int cv_start = node_idx * ibdev->num_comp_vectors / + num_online_nodes(); + const int cv_end = (node_idx + 1) * ibdev->num_comp_vectors / + num_online_nodes(); + int cpu_idx = 0; + + for_each_online_cpu(cpu) { + if (cpu_to_node(cpu) != node) + continue; + if (ch_start + cpu_idx >= ch_end) + continue; + ch = &target->ch[ch_start + cpu_idx]; + ch->target = target; + ch->comp_vector = cv_start == cv_end ? 
cv_start : + cv_start + cpu_idx % (cv_end - cv_start); + spin_lock_init(&ch->lock); + INIT_LIST_HEAD(&ch->free_tx); + ret = srp_new_cm_id(ch); + if (ret) + goto err_disconnect; + + ret = srp_create_ch_ib(ch); + if (ret) + goto err_disconnect; + + ret = srp_alloc_req_data(ch); + if (ret) + goto err_disconnect; + + ret = srp_connect_ch(ch, multich); + if (ret) { + char dst[64]; + + if (target->using_rdma_cm) + snprintf(dst, sizeof(dst), "%pIS", + &target->rdma_cm.dst); + else + snprintf(dst, sizeof(dst), "%pI6", + target->ib_cm.orig_dgid.raw); + shost_printk(KERN_ERR, target->scsi_host, + PFX "Connection %d/%d to %s failed\n", + ch_start + cpu_idx, + target->ch_count, dst); + if (node_idx == 0 && cpu_idx == 0) { + goto free_ch; + } else { + srp_free_ch_ib(target, ch); + srp_free_req_data(target, ch); + target->ch_count = ch - target->ch; + goto connected; + } + } + + multich = true; + cpu_idx++; + } + node_idx++; + } + +connected: + target->scsi_host->nr_hw_queues = target->ch_count; + + ret = srp_add_target(host, target); + if (ret) + goto err_disconnect; + + if (target->state != SRP_TARGET_REMOVED) { + if (target->using_rdma_cm) { + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx sgid %pI6 dest %pIS\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + target->sgid.raw, &target->rdma_cm.dst); + } else { + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be16_to_cpu(target->ib_cm.pkey), + be64_to_cpu(target->ib_cm.service_id), + target->sgid.raw, + target->ib_cm.orig_dgid.raw); + } + } + + ret = count; + +out: + mutex_unlock(&host->add_target_mutex); + +put: + scsi_host_put(target->scsi_host); + if (ret < 0) { + /* + * If a call to srp_remove_target() has not been scheduled, + * drop the network namespace reference now that was obtained + * earlier in this function. 
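+ * (the reference obtained via kobj_ns_grab_current(); once removal has
+ * been scheduled, srp_remove_target() drops it instead).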
+ */ + if (target->state != SRP_TARGET_REMOVED) + kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net); + scsi_host_put(target->scsi_host); + } + + return ret; + +err_disconnect: + srp_disconnect_target(target); + +free_ch: + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + srp_free_ch_ib(target, ch); + srp_free_req_data(target, ch); + } + + kfree(target->ch); + goto out; +} + +static DEVICE_ATTR(add_target, S_IWUSR, NULL, srp_create_target); + +static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_host *host = container_of(dev, struct srp_host, dev); + + return sprintf(buf, "%s\n", host->srp_dev->dev->name); +} + +static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); + +static ssize_t show_port(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_host *host = container_of(dev, struct srp_host, dev); + + return sprintf(buf, "%d\n", host->port); +} + +static DEVICE_ATTR(port, S_IRUGO, show_port, NULL); + +static struct srp_host *srp_add_port(struct srp_device *device, u8 port) +{ + struct srp_host *host; + + host = kzalloc(sizeof *host, GFP_KERNEL); + if (!host) + return NULL; + + INIT_LIST_HEAD(&host->target_list); + spin_lock_init(&host->target_lock); + init_completion(&host->released); + mutex_init(&host->add_target_mutex); + host->srp_dev = device; + host->port = port; + + host->dev.class = &srp_class; + host->dev.parent = device->dev->dev.parent; + dev_set_name(&host->dev, "srp-%s-%d", device->dev->name, port); + + if (device_register(&host->dev)) + goto free_host; + if (device_create_file(&host->dev, &dev_attr_add_target)) + goto err_class; + if (device_create_file(&host->dev, &dev_attr_ibdev)) + goto err_class; + if (device_create_file(&host->dev, &dev_attr_port)) + goto err_class; + + return host; + +err_class: + device_unregister(&host->dev); + +free_host: + kfree(host); + + return NULL; +} + +static void srp_add_one(struct ib_device *device) +{ + struct srp_device *srp_dev; + struct ib_device_attr *attr = &device->attrs; + struct srp_host *host; + int mr_page_shift, p; + u64 max_pages_per_mr; + unsigned int flags = 0; + + srp_dev = kzalloc(sizeof(*srp_dev), GFP_KERNEL); + if (!srp_dev) + return; + + /* + * Use the smallest page size supported by the HCA, down to a + * minimum of 4096 bytes. We're unlikely to build large sglists + * out of smaller entries. 
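+ * ffs(attr->page_size_cap) - 1 yields the smallest supported page shift
+ * and the max() below clamps it to at least 12 (4096 bytes).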
+ */ + mr_page_shift = max(12, ffs(attr->page_size_cap) - 1); + srp_dev->mr_page_size = 1 << mr_page_shift; + srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); + max_pages_per_mr = attr->max_mr_size; + do_div(max_pages_per_mr, srp_dev->mr_page_size); + pr_debug("%s: %llu / %u = %llu <> %u\n", __func__, + attr->max_mr_size, srp_dev->mr_page_size, + max_pages_per_mr, SRP_MAX_PAGES_PER_MR); + srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, + max_pages_per_mr); + + srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + srp_dev->has_fr = (attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) { + dev_warn(&device->dev, "neither FMR nor FR is supported\n"); + } else if (!never_register && + attr->max_mr_size >= 2 * srp_dev->mr_page_size) { + srp_dev->use_fast_reg = (srp_dev->has_fr && + (!srp_dev->has_fmr || prefer_fr)); + srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr; + } + + if (never_register || !register_always || + (!srp_dev->has_fmr && !srp_dev->has_fr)) + flags |= IB_PD_UNSAFE_GLOBAL_RKEY; + + if (srp_dev->use_fast_reg) { + srp_dev->max_pages_per_mr = + min_t(u32, srp_dev->max_pages_per_mr, + attr->max_fast_reg_page_list_len); + } + srp_dev->mr_max_size = srp_dev->mr_page_size * + srp_dev->max_pages_per_mr; + pr_debug("%s: mr_page_shift = %d, device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", + device->name, mr_page_shift, attr->max_mr_size, + attr->max_fast_reg_page_list_len, + srp_dev->max_pages_per_mr, srp_dev->mr_max_size); + + INIT_LIST_HEAD(&srp_dev->dev_list); + + srp_dev->dev = device; + srp_dev->pd = ib_alloc_pd(device, flags); + if (IS_ERR(srp_dev->pd)) + goto free_dev; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + srp_dev->global_rkey = srp_dev->pd->unsafe_global_rkey; + WARN_ON_ONCE(srp_dev->global_rkey == 0); + } + + for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + host = srp_add_port(srp_dev, p); + if (host) + list_add_tail(&host->list, &srp_dev->dev_list); + } + + ib_set_client_data(device, &srp_client, srp_dev); + return; + +free_dev: + kfree(srp_dev); +} + +static void srp_remove_one(struct ib_device *device, void *client_data) +{ + struct srp_device *srp_dev; + struct srp_host *host, *tmp_host; + struct srp_target_port *target; + + srp_dev = client_data; + if (!srp_dev) + return; + + list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { + device_unregister(&host->dev); + /* + * Wait for the sysfs entry to go away, so that no new + * target ports can be created. + */ + wait_for_completion(&host->released); + + /* + * Remove all target ports. + */ + spin_lock(&host->target_lock); + list_for_each_entry(target, &host->target_list, list) + srp_queue_remove_work(target); + spin_unlock(&host->target_lock); + + /* + * Wait for tl_err and target port removal tasks. 
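+ * tl_err work is queued on system_long_wq and removal work on
+ * srp_remove_wq, hence both workqueues are flushed.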
+ */ + flush_workqueue(system_long_wq); + flush_workqueue(srp_remove_wq); + + kfree(host); + } + + ib_dealloc_pd(srp_dev->pd); + + kfree(srp_dev); +} + +static struct srp_function_template ib_srp_transport_functions = { + .has_rport_state = true, + .reset_timer_if_blocked = true, + .reconnect_delay = &srp_reconnect_delay, + .fast_io_fail_tmo = &srp_fast_io_fail_tmo, + .dev_loss_tmo = &srp_dev_loss_tmo, + .reconnect = srp_rport_reconnect, + .rport_delete = srp_rport_delete, + .terminate_rport_io = srp_terminate_io, +}; + +static int __init srp_init_module(void) +{ + int ret; + + if (srp_sg_tablesize) { + pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); + if (!cmd_sg_entries) + cmd_sg_entries = srp_sg_tablesize; + } + + if (!cmd_sg_entries) + cmd_sg_entries = SRP_DEF_SG_TABLESIZE; + + if (cmd_sg_entries > 255) { + pr_warn("Clamping cmd_sg_entries to 255\n"); + cmd_sg_entries = 255; + } + + if (!indirect_sg_entries) + indirect_sg_entries = cmd_sg_entries; + else if (indirect_sg_entries < cmd_sg_entries) { + pr_warn("Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n", + cmd_sg_entries); + indirect_sg_entries = cmd_sg_entries; + } + + if (indirect_sg_entries > SG_MAX_SEGMENTS) { + pr_warn("Clamping indirect_sg_entries to %u\n", + SG_MAX_SEGMENTS); + indirect_sg_entries = SG_MAX_SEGMENTS; + } + + srp_remove_wq = create_workqueue("srp_remove"); + if (!srp_remove_wq) { + ret = -ENOMEM; + goto out; + } + + ret = -ENOMEM; + ib_srp_transport_template = + srp_attach_transport(&ib_srp_transport_functions); + if (!ib_srp_transport_template) + goto destroy_wq; + + ret = class_register(&srp_class); + if (ret) { + pr_err("couldn't register class infiniband_srp\n"); + goto release_tr; + } + + ib_sa_register_client(&srp_sa_client); + + ret = ib_register_client(&srp_client); + if (ret) { + pr_err("couldn't register IB client\n"); + goto unreg_sa; + } + +out: + return ret; + +unreg_sa: + ib_sa_unregister_client(&srp_sa_client); + class_unregister(&srp_class); + +release_tr: + srp_release_transport(ib_srp_transport_template); + +destroy_wq: + destroy_workqueue(srp_remove_wq); + goto out; +} + +static void __exit srp_cleanup_module(void) +{ + ib_unregister_client(&srp_client); + ib_sa_unregister_client(&srp_sa_client); + class_unregister(&srp_class); + srp_release_transport(ib_srp_transport_template); + destroy_workqueue(srp_remove_wq); +} + +module_init(srp_init_module); +module_exit(srp_cleanup_module); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h new file mode 100644 index 0000000..a270608 --- /dev/null +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_SRP_H +#define IB_SRP_H + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +enum { + SRP_PATH_REC_TIMEOUT_MS = 1000, + SRP_ABORT_TIMEOUT_MS = 5000, + + SRP_PORT_REDIRECT = 1, + SRP_DLID_REDIRECT = 2, + SRP_STALE_CONN = 3, + + SRP_DEF_SG_TABLESIZE = 12, + + SRP_DEFAULT_QUEUE_SIZE = 1 << 6, + SRP_RSP_SQ_SIZE = 1, + SRP_TSK_MGMT_SQ_SIZE = 1, + SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - + SRP_TSK_MGMT_SQ_SIZE, + + SRP_TAG_NO_REQ = ~0U, + SRP_TAG_TSK_MGMT = 1U << 31, + + SRP_MAX_PAGES_PER_MR = 512, +}; + +enum srp_target_state { + SRP_TARGET_SCANNING, + SRP_TARGET_LIVE, + SRP_TARGET_REMOVED, +}; + +enum srp_iu_type { + SRP_IU_CMD, + SRP_IU_TSK_MGMT, + SRP_IU_RSP, +}; + +/* + * @mr_page_mask: HCA memory registration page mask. + * @mr_page_size: HCA memory registration page size. + * @mr_max_size: Maximum size in bytes of a single FMR / FR registration + * request. + */ +struct srp_device { + struct list_head dev_list; + struct ib_device *dev; + struct ib_pd *pd; + u32 global_rkey; + u64 mr_page_mask; + int mr_page_size; + int mr_max_size; + int max_pages_per_mr; + bool has_fmr; + bool has_fr; + bool use_fmr; + bool use_fast_reg; +}; + +struct srp_host { + struct srp_device *srp_dev; + u8 port; + struct device dev; + struct list_head target_list; + spinlock_t target_lock; + struct completion released; + struct list_head list; + struct mutex add_target_mutex; +}; + +struct srp_request { + struct scsi_cmnd *scmnd; + struct srp_iu *cmd; + union { + struct ib_pool_fmr **fmr_list; + struct srp_fr_desc **fr_list; + }; + u64 *map_page; + struct srp_direct_buf *indirect_desc; + dma_addr_t indirect_dma_addr; + short nmdesc; + struct ib_cqe reg_cqe; +}; + +/** + * struct srp_rdma_ch + * @comp_vector: Completion vector used by this RDMA channel. + */ +struct srp_rdma_ch { + /* These are RW in the hot path, and commonly used together */ + struct list_head free_tx; + spinlock_t lock; + s32 req_lim; + + /* These are read-only in the hot path */ + struct srp_target_port *target ____cacheline_aligned_in_smp; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + struct ib_qp *qp; + union { + struct ib_fmr_pool *fmr_pool; + struct srp_fr_pool *fr_pool; + }; + + /* Everything above this point is used in the hot path of + * command processing. Try to keep them packed into cachelines. 
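+ * The ____cacheline_aligned_in_smp annotation on the target pointer
+ * above starts the read-only members on a separate cacheline.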
+ */ + + struct completion done; + int status; + + union { + struct ib_cm { + struct sa_path_rec path; + struct ib_sa_query *path_query; + int path_query_id; + struct ib_cm_id *cm_id; + } ib_cm; + struct rdma_cm { + struct rdma_cm_id *cm_id; + } rdma_cm; + }; + + struct srp_iu **tx_ring; + struct srp_iu **rx_ring; + struct srp_request *req_ring; + int max_ti_iu_len; + int comp_vector; + + u64 tsk_mgmt_tag; + struct completion tsk_mgmt_done; + u8 tsk_mgmt_status; + bool connected; +}; + +/** + * struct srp_target_port + * @comp_vector: Completion vector used by the first RDMA channel created for + * this target port. + */ +struct srp_target_port { + /* read and written in the hot path */ + spinlock_t lock; + + /* read only in the hot path */ + u32 global_rkey; + struct srp_rdma_ch *ch; + struct net *net; + u32 ch_count; + u32 lkey; + enum srp_target_state state; + unsigned int max_iu_len; + unsigned int cmd_sg_cnt; + unsigned int indirect_size; + bool allow_ext_sg; + + /* other member variables */ + union ib_gid sgid; + __be64 id_ext; + __be64 ioc_guid; + __be64 initiator_ext; + u16 io_class; + struct srp_host *srp_host; + struct Scsi_Host *scsi_host; + struct srp_rport *rport; + char target_name[32]; + unsigned int scsi_id; + unsigned int sg_tablesize; + unsigned int target_can_queue; + int mr_pool_size; + int mr_per_cmd; + int queue_size; + int req_ring_size; + int comp_vector; + int tl_retry_count; + + bool using_rdma_cm; + + union { + struct { + __be64 service_id; + union ib_gid orig_dgid; + __be16 pkey; + } ib_cm; + struct { + union { + struct sockaddr_in ip4; + struct sockaddr_in6 ip6; + struct sockaddr_storage ss; + } src; + union { + struct sockaddr_in ip4; + struct sockaddr_in6 ip6; + struct sockaddr_storage ss; + } dst; + bool src_specified; + } rdma_cm; + }; + + u32 rq_tmo_jiffies; + + int zero_req_lim; + + struct work_struct tl_err_work; + struct work_struct remove_work; + + struct list_head list; + bool qp_in_error; +}; + +struct srp_iu { + struct list_head list; + u64 dma; + void *buf; + size_t size; + enum dma_data_direction direction; + struct ib_cqe cqe; +}; + +/** + * struct srp_fr_desc - fast registration work request arguments + * @entry: Entry in srp_fr_pool.free_list. + * @mr: Memory region. + * @frpl: Fast registration page list. + */ +struct srp_fr_desc { + struct list_head entry; + struct ib_mr *mr; +}; + +/** + * struct srp_fr_pool - pool of fast registration descriptors + * + * An entry is available for allocation if and only if it occurs in @free_list. + * + * @size: Number of descriptors in this pool. + * @max_page_list_len: Maximum fast registration work request page list length. + * @lock: Protects free_list. + * @free_list: List of free descriptors. + * @desc: Fast registration descriptor pool. + */ +struct srp_fr_pool { + int size; + int max_page_list_len; + spinlock_t lock; + struct list_head free_list; + struct srp_fr_desc desc[0]; +}; + +/** + * struct srp_map_state - per-request DMA memory mapping state + * @desc: Pointer to the element of the SRP buffer descriptor array + * that is being filled in. + * @pages: Array with DMA addresses of pages being considered for + * memory registration. + * @base_dma_addr: DMA address of the first page that has not yet been mapped. + * @dma_len: Number of bytes that will be registered with the next + * FMR or FR memory registration call. + * @total_len: Total number of bytes in the sg-list being mapped. + * @npages: Number of page addresses in the pages[] array. 
+ * @nmdesc: Number of FMR or FR memory descriptors used for mapping. + * @ndesc: Number of SRP buffer descriptors that have been filled in. + */ +struct srp_map_state { + union { + struct { + struct ib_pool_fmr **next; + struct ib_pool_fmr **end; + } fmr; + struct { + struct srp_fr_desc **next; + struct srp_fr_desc **end; + } fr; + struct { + void **next; + void **end; + } gen; + }; + struct srp_direct_buf *desc; + union { + u64 *pages; + struct scatterlist *sg; + }; + dma_addr_t base_dma_addr; + u32 dma_len; + u32 total_len; + unsigned int npages; + unsigned int nmdesc; + unsigned int ndesc; +}; + +#endif /* IB_SRP_H */ diff --git a/drivers/infiniband/ulp/srpt/Kconfig b/drivers/infiniband/ulp/srpt/Kconfig new file mode 100644 index 0000000..fb8b718 --- /dev/null +++ b/drivers/infiniband/ulp/srpt/Kconfig @@ -0,0 +1,12 @@ +config INFINIBAND_SRPT + tristate "InfiniBand SCSI RDMA Protocol target support" + depends on INFINIBAND && INFINIBAND_ADDR_TRANS && TARGET_CORE + ---help--- + + Support for the SCSI RDMA Protocol (SRP) Target driver. The + SRP protocol is a protocol that allows an initiator to access + a block storage device on another host (target) over a network + that supports the RDMA protocol. Currently the RDMA protocol is + supported by InfiniBand and by iWarp network hardware. More + information about the SRP protocol can be found on the website + of the INCITS T10 technical committee (http://www.t10.org/). diff --git a/drivers/infiniband/ulp/srpt/Makefile b/drivers/infiniband/ulp/srpt/Makefile new file mode 100644 index 0000000..e3ee4bd --- /dev/null +++ b/drivers/infiniband/ulp/srpt/Makefile @@ -0,0 +1,2 @@ +ccflags-y := -Idrivers/target +obj-$(CONFIG_INFINIBAND_SRPT) += ib_srpt.o diff --git a/drivers/infiniband/ulp/srpt/ib_dm_mad.h b/drivers/infiniband/ulp/srpt/ib_dm_mad.h new file mode 100644 index 0000000..fb1de1f --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_dm_mad.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef IB_DM_MAD_H +#define IB_DM_MAD_H + +#include + +#include + +enum { + /* + * See also section 13.4.7 Status Field, table 115 MAD Common Status + * Field Bit Values and also section 16.3.1.1 Status Field in the + * InfiniBand Architecture Specification. + */ + DM_MAD_STATUS_UNSUP_METHOD = 0x0008, + DM_MAD_STATUS_UNSUP_METHOD_ATTR = 0x000c, + DM_MAD_STATUS_INVALID_FIELD = 0x001c, + DM_MAD_STATUS_NO_IOC = 0x0100, + + /* + * See also the Device Management chapter, section 16.3.3 Attributes, + * table 279 Device Management Attributes in the InfiniBand + * Architecture Specification. + */ + DM_ATTR_CLASS_PORT_INFO = 0x01, + DM_ATTR_IOU_INFO = 0x10, + DM_ATTR_IOC_PROFILE = 0x11, + DM_ATTR_SVC_ENTRIES = 0x12 +}; + +struct ib_dm_hdr { + u8 reserved[28]; +}; + +/* + * Structure of management datagram sent by the SRP target implementation. + * Contains a management datagram header, reliable multi-packet transaction + * protocol (RMPP) header and ib_dm_hdr. Notes: + * - The SRP target implementation does not use RMPP or ib_dm_hdr when sending + * management datagrams. + * - The header size must be exactly 64 bytes (IB_MGMT_DEVICE_HDR), since this + * is the header size that is passed to ib_create_send_mad() in ib_srpt.c. + * - The maximum supported size for a management datagram when not using RMPP + * is 256 bytes -- 64 bytes header and 192 (IB_MGMT_DEVICE_DATA) bytes data. + */ +struct ib_dm_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + struct ib_dm_hdr dm_hdr; + u8 data[IB_MGMT_DEVICE_DATA]; +}; + +/* + * IOUnitInfo as defined in section 16.3.3.3 IOUnitInfo of the InfiniBand + * Architecture Specification. + */ +struct ib_dm_iou_info { + __be16 change_id; + u8 max_controllers; + u8 op_rom; + u8 controller_list[128]; +}; + +/* + * IOControllerprofile as defined in section 16.3.3.4 IOControllerProfile of + * the InfiniBand Architecture Specification. + */ +struct ib_dm_ioc_profile { + __be64 guid; + __be32 vendor_id; + __be32 device_id; + __be16 device_version; + __be16 reserved1; + __be32 subsys_vendor_id; + __be32 subsys_device_id; + __be16 io_class; + __be16 io_subclass; + __be16 protocol; + __be16 protocol_version; + __be16 service_conn; + __be16 initiators_supported; + __be16 send_queue_depth; + u8 reserved2; + u8 rdma_read_depth; + __be32 send_size; + __be32 rdma_size; + u8 op_cap_mask; + u8 svc_cap_mask; + u8 num_svc_entries; + u8 reserved3[9]; + u8 id_string[64]; +}; + +struct ib_dm_svc_entry { + u8 name[40]; + __be64 id; +}; + +/* + * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture + * Specification. See also section B.7, table B.8 in the T10 SRP r16a document. + */ +struct ib_dm_svc_entries { + struct ib_dm_svc_entry service_entries[4]; +}; + +#endif diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c new file mode 100644 index 0000000..dfec0e1 --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -0,0 +1,3762 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved. + * Copyright (C) 2008 - 2011 Bart Van Assche . + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ib_srpt.h" + +/* Name of this kernel module. */ +#define DRV_NAME "ib_srpt" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "2011-02-14" + +#define SRPT_ID_STRING "Linux SRP target" + +#undef pr_fmt +#define pr_fmt(fmt) DRV_NAME " " fmt + +MODULE_AUTHOR("Vu Pham and Bart Van Assche"); +MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol target " + "v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* + * Global Variables + */ + +static u64 srpt_service_guid; +static DEFINE_SPINLOCK(srpt_dev_lock); /* Protects srpt_dev_list. */ +static LIST_HEAD(srpt_dev_list); /* List of srpt_device structures. */ + +static unsigned srp_max_req_size = DEFAULT_MAX_REQ_SIZE; +module_param(srp_max_req_size, int, 0444); +MODULE_PARM_DESC(srp_max_req_size, + "Maximum size of SRP request messages in bytes."); + +static int srpt_srq_size = DEFAULT_SRPT_SRQ_SIZE; +module_param(srpt_srq_size, int, 0444); +MODULE_PARM_DESC(srpt_srq_size, + "Shared receive queue (SRQ) size."); + +static int srpt_get_u64_x(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "0x%016llx", *(u64 *)kp->arg); +} +module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid, + 0444); +MODULE_PARM_DESC(srpt_service_guid, + "Using this value for ioc_guid, id_ext, and cm_listen_id" + " instead of using the node_guid of the first HCA."); + +static struct ib_client srpt_client; +/* Protects both rdma_cm_port and rdma_cm_id. */ +static DEFINE_MUTEX(rdma_cm_mutex); +/* Port number RDMA/CM will bind to. */ +static u16 rdma_cm_port; +static struct rdma_cm_id *rdma_cm_id; +static void srpt_release_cmd(struct se_cmd *se_cmd); +static void srpt_free_ch(struct kref *kref); +static int srpt_queue_status(struct se_cmd *cmd); +static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void srpt_process_wait_list(struct srpt_rdma_ch *ch); + +/* + * The only allowed channel state changes are those that change the channel + * state into a state with a higher numerical value. Hence the new > prev test. 
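+ *
+ * Illustrative example, assuming the enum rdma_ch_state values are declared
+ * in the order listed in get_ch_state_name() below: on a channel in the
+ * CH_LIVE state, srpt_set_ch_state(ch, CH_DISCONNECTING) succeeds and
+ * returns true, while a subsequent srpt_set_ch_state(ch, CH_LIVE) call is
+ * rejected and returns false, leaving the channel in CH_DISCONNECTING.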
+ */ +static bool srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new) +{ + unsigned long flags; + enum rdma_ch_state prev; + bool changed = false; + + spin_lock_irqsave(&ch->spinlock, flags); + prev = ch->state; + if (new > prev) { + ch->state = new; + changed = true; + } + spin_unlock_irqrestore(&ch->spinlock, flags); + + return changed; +} + +/** + * srpt_event_handler - asynchronous IB event callback function + * @handler: IB event handler registered by ib_register_event_handler(). + * @event: Description of the event that occurred. + * + * Callback function called by the InfiniBand core when an asynchronous IB + * event occurs. This callback may occur in interrupt context. See also + * section 11.5.2, Set Asynchronous Event Handler in the InfiniBand + * Architecture Specification. + */ +static void srpt_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct srpt_device *sdev; + struct srpt_port *sport; + u8 port_num; + + sdev = ib_get_client_data(event->device, &srpt_client); + if (!sdev || sdev->device != event->device) + return; + + pr_debug("ASYNC event= %d on device= %s\n", event->event, + sdev->device->name); + + switch (event->event) { + case IB_EVENT_PORT_ERR: + port_num = event->element.port_num - 1; + if (port_num < sdev->device->phys_port_cnt) { + sport = &sdev->port[port_num]; + sport->lid = 0; + sport->sm_lid = 0; + } else { + WARN(true, "event %d: port_num %d out of range 1..%d\n", + event->event, port_num + 1, + sdev->device->phys_port_cnt); + } + break; + case IB_EVENT_PORT_ACTIVE: + case IB_EVENT_LID_CHANGE: + case IB_EVENT_PKEY_CHANGE: + case IB_EVENT_SM_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + case IB_EVENT_GID_CHANGE: + /* Refresh port data asynchronously. */ + port_num = event->element.port_num - 1; + if (port_num < sdev->device->phys_port_cnt) { + sport = &sdev->port[port_num]; + if (!sport->lid && !sport->sm_lid) + schedule_work(&sport->work); + } else { + WARN(true, "event %d: port_num %d out of range 1..%d\n", + event->event, port_num + 1, + sdev->device->phys_port_cnt); + } + break; + default: + pr_err("received unrecognized IB event %d\n", event->event); + break; + } +} + +/** + * srpt_srq_event - SRQ event callback function + * @event: Description of the event that occurred. + * @ctx: Context pointer specified at SRQ creation time. + */ +static void srpt_srq_event(struct ib_event *event, void *ctx) +{ + pr_debug("SRQ event %d\n", event->event); +} + +static const char *get_ch_state_name(enum rdma_ch_state s) +{ + switch (s) { + case CH_CONNECTING: + return "connecting"; + case CH_LIVE: + return "live"; + case CH_DISCONNECTING: + return "disconnecting"; + case CH_DRAINING: + return "draining"; + case CH_DISCONNECTED: + return "disconnected"; + } + return "???"; +} + +/** + * srpt_qp_event - QP event callback function + * @event: Description of the event that occurred. + * @ch: SRPT RDMA channel. 
+ */ +static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) +{ + pr_debug("QP event %d on ch=%p sess_name=%s state=%d\n", + event->event, ch, ch->sess_name, ch->state); + + switch (event->event) { + case IB_EVENT_COMM_EST: + if (ch->using_rdma_cm) + rdma_notify(ch->rdma_cm.cm_id, event->event); + else + ib_cm_notify(ch->ib_cm.cm_id, event->event); + break; + case IB_EVENT_QP_LAST_WQE_REACHED: + pr_debug("%s-%d, state %s: received Last WQE event.\n", + ch->sess_name, ch->qp->qp_num, + get_ch_state_name(ch->state)); + break; + default: + pr_err("received unrecognized IB QP event %d\n", event->event); + break; + } +} + +/** + * srpt_set_ioc - initialize a IOUnitInfo structure + * @c_list: controller list. + * @slot: one-based slot number. + * @value: four-bit value. + * + * Copies the lowest four bits of value in element slot of the array of four + * bit elements called c_list (controller list). The index slot is one-based. + */ +static void srpt_set_ioc(u8 *c_list, u32 slot, u8 value) +{ + u16 id; + u8 tmp; + + id = (slot - 1) / 2; + if (slot & 0x1) { + tmp = c_list[id] & 0xf; + c_list[id] = (value << 4) | tmp; + } else { + tmp = c_list[id] & 0xf0; + c_list[id] = (value & 0xf) | tmp; + } +} + +/** + * srpt_get_class_port_info - copy ClassPortInfo to a management datagram + * @mad: Datagram that will be sent as response to DM_ATTR_CLASS_PORT_INFO. + * + * See also section 16.3.3.1 ClassPortInfo in the InfiniBand Architecture + * Specification. + */ +static void srpt_get_class_port_info(struct ib_dm_mad *mad) +{ + struct ib_class_port_info *cif; + + cif = (struct ib_class_port_info *)mad->data; + memset(cif, 0, sizeof(*cif)); + cif->base_version = 1; + cif->class_version = 1; + + ib_set_cpi_resp_time(cif, 20); + mad->mad_hdr.status = 0; +} + +/** + * srpt_get_iou - write IOUnitInfo to a management datagram + * @mad: Datagram that will be sent as response to DM_ATTR_IOU_INFO. + * + * See also section 16.3.3.3 IOUnitInfo in the InfiniBand Architecture + * Specification. See also section B.7, table B.6 in the SRP r16a document. + */ +static void srpt_get_iou(struct ib_dm_mad *mad) +{ + struct ib_dm_iou_info *ioui; + u8 slot; + int i; + + ioui = (struct ib_dm_iou_info *)mad->data; + ioui->change_id = cpu_to_be16(1); + ioui->max_controllers = 16; + + /* set present for slot 1 and empty for the rest */ + srpt_set_ioc(ioui->controller_list, 1, 1); + for (i = 1, slot = 2; i < 16; i++, slot++) + srpt_set_ioc(ioui->controller_list, slot, 0); + + mad->mad_hdr.status = 0; +} + +/** + * srpt_get_ioc - write IOControllerprofile to a management datagram + * @sport: HCA port through which the MAD has been received. + * @slot: Slot number specified in DM_ATTR_IOC_PROFILE query. + * @mad: Datagram that will be sent as response to DM_ATTR_IOC_PROFILE. + * + * See also section 16.3.3.4 IOControllerProfile in the InfiniBand + * Architecture Specification. See also section B.7, table B.7 in the SRP + * r16a document. 
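+ *
+ * Illustrative behaviour: since srpt_get_iou() above marks only controller
+ * slot 1 as present, a DM_ATTR_IOC_PROFILE query for slot 1 is answered
+ * with the profile built by this function, a query for slot 3 is answered
+ * with DM_MAD_STATUS_NO_IOC, and a query for slot 0 or for a slot above 16
+ * is answered with DM_MAD_STATUS_INVALID_FIELD.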
+ */ +static void srpt_get_ioc(struct srpt_port *sport, u32 slot, + struct ib_dm_mad *mad) +{ + struct srpt_device *sdev = sport->sdev; + struct ib_dm_ioc_profile *iocp; + int send_queue_depth; + + iocp = (struct ib_dm_ioc_profile *)mad->data; + + if (!slot || slot > 16) { + mad->mad_hdr.status + = cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); + return; + } + + if (slot > 2) { + mad->mad_hdr.status + = cpu_to_be16(DM_MAD_STATUS_NO_IOC); + return; + } + + if (sdev->use_srq) + send_queue_depth = sdev->srq_size; + else + send_queue_depth = min(MAX_SRPT_RQ_SIZE, + sdev->device->attrs.max_qp_wr); + + memset(iocp, 0, sizeof(*iocp)); + strcpy(iocp->id_string, SRPT_ID_STRING); + iocp->guid = cpu_to_be64(srpt_service_guid); + iocp->vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id); + iocp->device_id = cpu_to_be32(sdev->device->attrs.vendor_part_id); + iocp->device_version = cpu_to_be16(sdev->device->attrs.hw_ver); + iocp->subsys_vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id); + iocp->subsys_device_id = 0x0; + iocp->io_class = cpu_to_be16(SRP_REV16A_IB_IO_CLASS); + iocp->io_subclass = cpu_to_be16(SRP_IO_SUBCLASS); + iocp->protocol = cpu_to_be16(SRP_PROTOCOL); + iocp->protocol_version = cpu_to_be16(SRP_PROTOCOL_VERSION); + iocp->send_queue_depth = cpu_to_be16(send_queue_depth); + iocp->rdma_read_depth = 4; + iocp->send_size = cpu_to_be32(srp_max_req_size); + iocp->rdma_size = cpu_to_be32(min(sport->port_attrib.srp_max_rdma_size, + 1U << 24)); + iocp->num_svc_entries = 1; + iocp->op_cap_mask = SRP_SEND_TO_IOC | SRP_SEND_FROM_IOC | + SRP_RDMA_READ_FROM_IOC | SRP_RDMA_WRITE_FROM_IOC; + + mad->mad_hdr.status = 0; +} + +/** + * srpt_get_svc_entries - write ServiceEntries to a management datagram + * @ioc_guid: I/O controller GUID to use in reply. + * @slot: I/O controller number. + * @hi: End of the range of service entries to be specified in the reply. + * @lo: Start of the range of service entries to be specified in the reply.. + * @mad: Datagram that will be sent as response to DM_ATTR_SVC_ENTRIES. + * + * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture + * Specification. See also section B.7, table B.8 in the SRP r16a document. + */ +static void srpt_get_svc_entries(u64 ioc_guid, + u16 slot, u8 hi, u8 lo, struct ib_dm_mad *mad) +{ + struct ib_dm_svc_entries *svc_entries; + + WARN_ON(!ioc_guid); + + if (!slot || slot > 16) { + mad->mad_hdr.status + = cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD); + return; + } + + if (slot > 2 || lo > hi || hi > 1) { + mad->mad_hdr.status + = cpu_to_be16(DM_MAD_STATUS_NO_IOC); + return; + } + + svc_entries = (struct ib_dm_svc_entries *)mad->data; + memset(svc_entries, 0, sizeof(*svc_entries)); + svc_entries->service_entries[0].id = cpu_to_be64(ioc_guid); + snprintf(svc_entries->service_entries[0].name, + sizeof(svc_entries->service_entries[0].name), + "%s%016llx", + SRP_SERVICE_NAME_PREFIX, + ioc_guid); + + mad->mad_hdr.status = 0; +} + +/** + * srpt_mgmt_method_get - process a received management datagram + * @sp: HCA port through which the MAD has been received. + * @rq_mad: received MAD. + * @rsp_mad: response MAD. 
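+ *
+ * For the DM_ATTR_SVC_ENTRIES case below, the 32-bit attribute modifier is
+ * decoded as follows: bits 16..31 select the controller slot, bits 8..15
+ * the highest and bits 0..7 the lowest requested service entry. For
+ * example, an attribute modifier of 0x00010000 requests service entry 0 of
+ * controller slot 1.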
+ */ +static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad, + struct ib_dm_mad *rsp_mad) +{ + u16 attr_id; + u32 slot; + u8 hi, lo; + + attr_id = be16_to_cpu(rq_mad->mad_hdr.attr_id); + switch (attr_id) { + case DM_ATTR_CLASS_PORT_INFO: + srpt_get_class_port_info(rsp_mad); + break; + case DM_ATTR_IOU_INFO: + srpt_get_iou(rsp_mad); + break; + case DM_ATTR_IOC_PROFILE: + slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod); + srpt_get_ioc(sp, slot, rsp_mad); + break; + case DM_ATTR_SVC_ENTRIES: + slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod); + hi = (u8) ((slot >> 8) & 0xff); + lo = (u8) (slot & 0xff); + slot = (u16) ((slot >> 16) & 0xffff); + srpt_get_svc_entries(srpt_service_guid, + slot, hi, lo, rsp_mad); + break; + default: + rsp_mad->mad_hdr.status = + cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); + break; + } +} + +/** + * srpt_mad_send_handler - MAD send completion callback + * @mad_agent: Return value of ib_register_mad_agent(). + * @mad_wc: Work completion reporting that the MAD has been sent. + */ +static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_wc) +{ + rdma_destroy_ah(mad_wc->send_buf->ah); + ib_free_send_mad(mad_wc->send_buf); +} + +/** + * srpt_mad_recv_handler - MAD reception callback function + * @mad_agent: Return value of ib_register_mad_agent(). + * @send_buf: Not used. + * @mad_wc: Work completion reporting that a MAD has been received. + */ +static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_recv_wc *mad_wc) +{ + struct srpt_port *sport = (struct srpt_port *)mad_agent->context; + struct ib_ah *ah; + struct ib_mad_send_buf *rsp; + struct ib_dm_mad *dm_mad; + + if (!mad_wc || !mad_wc->recv_buf.mad) + return; + + ah = ib_create_ah_from_wc(mad_agent->qp->pd, mad_wc->wc, + mad_wc->recv_buf.grh, mad_agent->port_num); + if (IS_ERR(ah)) + goto err; + + BUILD_BUG_ON(offsetof(struct ib_dm_mad, data) != IB_MGMT_DEVICE_HDR); + + rsp = ib_create_send_mad(mad_agent, mad_wc->wc->src_qp, + mad_wc->wc->pkey_index, 0, + IB_MGMT_DEVICE_HDR, IB_MGMT_DEVICE_DATA, + GFP_KERNEL, + IB_MGMT_BASE_VERSION); + if (IS_ERR(rsp)) + goto err_rsp; + + rsp->ah = ah; + + dm_mad = rsp->mad; + memcpy(dm_mad, mad_wc->recv_buf.mad, sizeof(*dm_mad)); + dm_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + dm_mad->mad_hdr.status = 0; + + switch (mad_wc->recv_buf.mad->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + srpt_mgmt_method_get(sport, mad_wc->recv_buf.mad, dm_mad); + break; + case IB_MGMT_METHOD_SET: + dm_mad->mad_hdr.status = + cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR); + break; + default: + dm_mad->mad_hdr.status = + cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD); + break; + } + + if (!ib_post_send_mad(rsp, NULL)) { + ib_free_recv_mad(mad_wc); + /* will destroy_ah & free_send_mad in send completion */ + return; + } + + ib_free_send_mad(rsp); + +err_rsp: + rdma_destroy_ah(ah); +err: + ib_free_recv_mad(mad_wc); +} + +static int srpt_format_guid(char *buf, unsigned int size, const __be64 *guid) +{ + const __be16 *g = (const __be16 *)guid; + + return snprintf(buf, size, "%04x:%04x:%04x:%04x", + be16_to_cpu(g[0]), be16_to_cpu(g[1]), + be16_to_cpu(g[2]), be16_to_cpu(g[3])); +} + +/** + * srpt_refresh_port - configure a HCA port + * @sport: SRPT HCA port. + * + * Enable InfiniBand management datagram processing, update the cached sm_lid, + * lid and gid values, and register a callback function for processing MADs + * on the specified port. 
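+ *
+ * As an example with an illustrative value: srpt_format_guid() above
+ * renders an interface_id of 0x0002c90300a0b1c2 as the string
+ * "0002:c903:00a0:b1c2", which is the value stored in sport->port_guid by
+ * this function.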
+ * + * Note: It is safe to call this function more than once for the same port. + */ +static int srpt_refresh_port(struct srpt_port *sport) +{ + struct ib_mad_reg_req reg_req; + struct ib_port_modify port_modify; + struct ib_port_attr port_attr; + int ret; + + memset(&port_modify, 0, sizeof(port_modify)); + port_modify.set_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; + port_modify.clr_port_cap_mask = 0; + + ret = ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify); + if (ret) + goto err_mod_port; + + ret = ib_query_port(sport->sdev->device, sport->port, &port_attr); + if (ret) + goto err_query_port; + + sport->sm_lid = port_attr.sm_lid; + sport->lid = port_attr.lid; + + ret = ib_query_gid(sport->sdev->device, sport->port, 0, &sport->gid, + NULL); + if (ret) + goto err_query_port; + + sport->port_guid_wwn.priv = sport; + srpt_format_guid(sport->port_guid, sizeof(sport->port_guid), + &sport->gid.global.interface_id); + sport->port_gid_wwn.priv = sport; + snprintf(sport->port_gid, sizeof(sport->port_gid), + "0x%016llx%016llx", + be64_to_cpu(sport->gid.global.subnet_prefix), + be64_to_cpu(sport->gid.global.interface_id)); + + if (!sport->mad_agent) { + memset(®_req, 0, sizeof(reg_req)); + reg_req.mgmt_class = IB_MGMT_CLASS_DEVICE_MGMT; + reg_req.mgmt_class_version = IB_MGMT_BASE_VERSION; + set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask); + set_bit(IB_MGMT_METHOD_SET, reg_req.method_mask); + + sport->mad_agent = ib_register_mad_agent(sport->sdev->device, + sport->port, + IB_QPT_GSI, + ®_req, 0, + srpt_mad_send_handler, + srpt_mad_recv_handler, + sport, 0); + if (IS_ERR(sport->mad_agent)) { + ret = PTR_ERR(sport->mad_agent); + sport->mad_agent = NULL; + goto err_query_port; + } + } + + return 0; + +err_query_port: + + port_modify.set_port_cap_mask = 0; + port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; + ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify); + +err_mod_port: + + return ret; +} + +/** + * srpt_unregister_mad_agent - unregister MAD callback functions + * @sdev: SRPT HCA pointer. + * + * Note: It is safe to call this function more than once for the same device. + */ +static void srpt_unregister_mad_agent(struct srpt_device *sdev) +{ + struct ib_port_modify port_modify = { + .clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP, + }; + struct srpt_port *sport; + int i; + + for (i = 1; i <= sdev->device->phys_port_cnt; i++) { + sport = &sdev->port[i - 1]; + WARN_ON(sport->port != i); + if (ib_modify_port(sdev->device, i, 0, &port_modify) < 0) + pr_err("disabling MAD processing failed.\n"); + if (sport->mad_agent) { + ib_unregister_mad_agent(sport->mad_agent); + sport->mad_agent = NULL; + } + } +} + +/** + * srpt_alloc_ioctx - allocate a SRPT I/O context structure + * @sdev: SRPT HCA pointer. + * @ioctx_size: I/O context size. + * @dma_size: Size of I/O context DMA buffer. + * @dir: DMA data direction. 
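+ *
+ * Usage note: as can be seen from the srpt_free_ioctx_ring() calls in
+ * srpt_release_channel_work() below, the receive contexts are allocated
+ * with dma_size == srp_max_req_size and DMA_FROM_DEVICE while the response
+ * contexts use ch->max_rsp_size and DMA_TO_DEVICE.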
+ */ +static struct srpt_ioctx *srpt_alloc_ioctx(struct srpt_device *sdev, + int ioctx_size, int dma_size, + enum dma_data_direction dir) +{ + struct srpt_ioctx *ioctx; + + ioctx = kmalloc(ioctx_size, GFP_KERNEL); + if (!ioctx) + goto err; + + ioctx->buf = kmalloc(dma_size, GFP_KERNEL); + if (!ioctx->buf) + goto err_free_ioctx; + + ioctx->dma = ib_dma_map_single(sdev->device, ioctx->buf, dma_size, dir); + if (ib_dma_mapping_error(sdev->device, ioctx->dma)) + goto err_free_buf; + + return ioctx; + +err_free_buf: + kfree(ioctx->buf); +err_free_ioctx: + kfree(ioctx); +err: + return NULL; +} + +/** + * srpt_free_ioctx - free a SRPT I/O context structure + * @sdev: SRPT HCA pointer. + * @ioctx: I/O context pointer. + * @dma_size: Size of I/O context DMA buffer. + * @dir: DMA data direction. + */ +static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx, + int dma_size, enum dma_data_direction dir) +{ + if (!ioctx) + return; + + ib_dma_unmap_single(sdev->device, ioctx->dma, dma_size, dir); + kfree(ioctx->buf); + kfree(ioctx); +} + +/** + * srpt_alloc_ioctx_ring - allocate a ring of SRPT I/O context structures + * @sdev: Device to allocate the I/O context ring for. + * @ring_size: Number of elements in the I/O context ring. + * @ioctx_size: I/O context size. + * @dma_size: DMA buffer size. + * @dir: DMA data direction. + */ +static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct srpt_device *sdev, + int ring_size, int ioctx_size, + int dma_size, enum dma_data_direction dir) +{ + struct srpt_ioctx **ring; + int i; + + WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx) + && ioctx_size != sizeof(struct srpt_send_ioctx)); + + ring = kmalloc(ring_size * sizeof(ring[0]), GFP_KERNEL); + if (!ring) + goto out; + for (i = 0; i < ring_size; ++i) { + ring[i] = srpt_alloc_ioctx(sdev, ioctx_size, dma_size, dir); + if (!ring[i]) + goto err; + ring[i]->index = i; + } + goto out; + +err: + while (--i >= 0) + srpt_free_ioctx(sdev, ring[i], dma_size, dir); + kfree(ring); + ring = NULL; +out: + return ring; +} + +/** + * srpt_free_ioctx_ring - free the ring of SRPT I/O context structures + * @ioctx_ring: I/O context ring to be freed. + * @sdev: SRPT HCA pointer. + * @ring_size: Number of ring elements. + * @dma_size: Size of I/O context DMA buffer. + * @dir: DMA data direction. + */ +static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring, + struct srpt_device *sdev, int ring_size, + int dma_size, enum dma_data_direction dir) +{ + int i; + + if (!ioctx_ring) + return; + + for (i = 0; i < ring_size; ++i) + srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir); + kfree(ioctx_ring); +} + +/** + * srpt_set_cmd_state - set the state of a SCSI command + * @ioctx: Send I/O context. + * @new: New I/O context state. + * + * Does not modify the state of aborted commands. Returns the previous command + * state. + */ +static enum srpt_command_state srpt_set_cmd_state(struct srpt_send_ioctx *ioctx, + enum srpt_command_state new) +{ + enum srpt_command_state previous; + + previous = ioctx->state; + if (previous != SRPT_STATE_DONE) + ioctx->state = new; + + return previous; +} + +/** + * srpt_test_and_set_cmd_state - test and set the state of a command + * @ioctx: Send I/O context. + * @old: Current I/O context state. + * @new: New I/O context state. + * + * Returns true if and only if the previous command state was equal to 'old'. 
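+ *
+ * For example, srpt_rdma_read_done() below uses this helper to move a
+ * command from SRPT_STATE_NEED_DATA to SRPT_STATE_DATA_IN and only invokes
+ * target_execute_cmd() if that transition actually happened; if the state
+ * had already been changed, e.g. by srpt_abort_cmd(), the command is not
+ * executed.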
+ */ +static bool srpt_test_and_set_cmd_state(struct srpt_send_ioctx *ioctx, + enum srpt_command_state old, + enum srpt_command_state new) +{ + enum srpt_command_state previous; + + WARN_ON(!ioctx); + WARN_ON(old == SRPT_STATE_DONE); + WARN_ON(new == SRPT_STATE_NEW); + + previous = ioctx->state; + if (previous == old) + ioctx->state = new; + + return previous == old; +} + +/** + * srpt_post_recv - post an IB receive request + * @sdev: SRPT HCA pointer. + * @ch: SRPT RDMA channel. + * @ioctx: Receive I/O context pointer. + */ +static int srpt_post_recv(struct srpt_device *sdev, struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *ioctx) +{ + struct ib_sge list; + struct ib_recv_wr wr, *bad_wr; + + BUG_ON(!sdev); + list.addr = ioctx->ioctx.dma; + list.length = srp_max_req_size; + list.lkey = sdev->lkey; + + ioctx->ioctx.cqe.done = srpt_recv_done; + wr.wr_cqe = &ioctx->ioctx.cqe; + wr.next = NULL; + wr.sg_list = &list; + wr.num_sge = 1; + + if (sdev->use_srq) + return ib_post_srq_recv(sdev->srq, &wr, &bad_wr); + else + return ib_post_recv(ch->qp, &wr, &bad_wr); +} + +/** + * srpt_zerolength_write - perform a zero-length RDMA write + * @ch: SRPT RDMA channel. + * + * A quote from the InfiniBand specification: C9-88: For an HCA responder + * using Reliable Connection service, for each zero-length RDMA READ or WRITE + * request, the R_Key shall not be validated, even if the request includes + * Immediate data. + */ +static int srpt_zerolength_write(struct srpt_rdma_ch *ch) +{ + struct ib_send_wr *bad_wr; + struct ib_rdma_wr wr = { + .wr = { + .next = NULL, + { .wr_cqe = &ch->zw_cqe, }, + .opcode = IB_WR_RDMA_WRITE, + .send_flags = IB_SEND_SIGNALED, + } + }; + + pr_debug("%s-%d: queued zerolength write\n", ch->sess_name, + ch->qp->qp_num); + + return ib_post_send(ch->qp, &wr.wr, &bad_wr); +} + +static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srpt_rdma_ch *ch = cq->cq_context; + + pr_debug("%s-%d wc->status %d\n", ch->sess_name, ch->qp->qp_num, + wc->status); + + if (wc->status == IB_WC_SUCCESS) { + srpt_process_wait_list(ch); + } else { + if (srpt_set_ch_state(ch, CH_DISCONNECTED)) + schedule_work(&ch->release_work); + else + pr_debug("%s-%d: already disconnected.\n", + ch->sess_name, ch->qp->qp_num); + } +} + +static int srpt_alloc_rw_ctxs(struct srpt_send_ioctx *ioctx, + struct srp_direct_buf *db, int nbufs, struct scatterlist **sg, + unsigned *sg_cnt) +{ + enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + struct scatterlist *prev = NULL; + unsigned prev_nents; + int ret, i; + + if (nbufs == 1) { + ioctx->rw_ctxs = &ioctx->s_rw_ctx; + } else { + ioctx->rw_ctxs = kmalloc_array(nbufs, sizeof(*ioctx->rw_ctxs), + GFP_KERNEL); + if (!ioctx->rw_ctxs) + return -ENOMEM; + } + + for (i = ioctx->n_rw_ctx; i < nbufs; i++, db++) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + u64 remote_addr = be64_to_cpu(db->va); + u32 size = be32_to_cpu(db->len); + u32 rkey = be32_to_cpu(db->key); + + ret = target_alloc_sgl(&ctx->sg, &ctx->nents, size, false, + i < nbufs - 1); + if (ret) + goto unwind; + + ret = rdma_rw_ctx_init(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, 0, remote_addr, rkey, dir); + if (ret < 0) { + target_free_sgl(ctx->sg, ctx->nents); + goto unwind; + } + + ioctx->n_rdma += ret; + ioctx->n_rw_ctx++; + + if (prev) { + sg_unmark_end(&prev[prev_nents - 1]); + sg_chain(prev, prev_nents + 1, ctx->sg); + } else { + *sg = ctx->sg; + } + + prev = ctx->sg; + prev_nents = ctx->nents; + + 
*sg_cnt += ctx->nents; + } + + return 0; + +unwind: + while (--i >= 0) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, dir); + target_free_sgl(ctx->sg, ctx->nents); + } + if (ioctx->rw_ctxs != &ioctx->s_rw_ctx) + kfree(ioctx->rw_ctxs); + return ret; +} + +static void srpt_free_rw_ctxs(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd); + int i; + + for (i = 0; i < ioctx->n_rw_ctx; i++) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, dir); + target_free_sgl(ctx->sg, ctx->nents); + } + + if (ioctx->rw_ctxs != &ioctx->s_rw_ctx) + kfree(ioctx->rw_ctxs); +} + +static inline void *srpt_get_desc_buf(struct srp_cmd *srp_cmd) +{ + /* + * The pointer computations below will only be compiled correctly + * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check + * whether srp_cmd::add_data has been declared as a byte pointer. + */ + BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) && + !__same_type(srp_cmd->add_data[0], (u8)0)); + + /* + * According to the SRP spec, the lower two bits of the 'ADDITIONAL + * CDB LENGTH' field are reserved and the size in bytes of this field + * is four times the value specified in bits 3..7. Hence the "& ~3". + */ + return srp_cmd->add_data + (srp_cmd->add_cdb_len & ~3); +} + +/** + * srpt_get_desc_tbl - parse the data descriptors of a SRP_CMD request + * @ioctx: Pointer to the I/O context associated with the request. + * @srp_cmd: Pointer to the SRP_CMD request data. + * @dir: Pointer to the variable to which the transfer direction will be + * written. + * @sg: [out] scatterlist allocated for the parsed SRP_CMD. + * @sg_cnt: [out] length of @sg. + * @data_len: Pointer to the variable to which the total data length of all + * descriptors in the SRP_CMD request will be written. + * + * This function initializes ioctx->nrbuf and ioctx->r_bufs. + * + * Returns -EINVAL when the SRP_CMD request contains inconsistent descriptors; + * -ENOMEM when memory allocation fails and zero upon success. + */ +static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, + struct srp_cmd *srp_cmd, enum dma_data_direction *dir, + struct scatterlist **sg, unsigned *sg_cnt, u64 *data_len) +{ + BUG_ON(!dir); + BUG_ON(!data_len); + + /* + * The lower four bits of the buffer format field contain the DATA-IN + * buffer descriptor format, and the highest four bits contain the + * DATA-OUT buffer descriptor format. + */ + if (srp_cmd->buf_fmt & 0xf) + /* DATA-IN: transfer data from target to initiator (read). */ + *dir = DMA_FROM_DEVICE; + else if (srp_cmd->buf_fmt >> 4) + /* DATA-OUT: transfer data from initiator to target (write). 
*/ + *dir = DMA_TO_DEVICE; + else + *dir = DMA_NONE; + + /* initialize data_direction early as srpt_alloc_rw_ctxs needs it */ + ioctx->cmd.data_direction = *dir; + + if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) || + ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) { + struct srp_direct_buf *db = srpt_get_desc_buf(srp_cmd); + + *data_len = be32_to_cpu(db->len); + return srpt_alloc_rw_ctxs(ioctx, db, 1, sg, sg_cnt); + } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) || + ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) { + struct srp_indirect_buf *idb = srpt_get_desc_buf(srp_cmd); + int nbufs = be32_to_cpu(idb->table_desc.len) / + sizeof(struct srp_direct_buf); + + if (nbufs > + (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) { + pr_err("received unsupported SRP_CMD request" + " type (%u out + %u in != %u / %zu)\n", + srp_cmd->data_out_desc_cnt, + srp_cmd->data_in_desc_cnt, + be32_to_cpu(idb->table_desc.len), + sizeof(struct srp_direct_buf)); + return -EINVAL; + } + + *data_len = be32_to_cpu(idb->len); + return srpt_alloc_rw_ctxs(ioctx, idb->desc_list, nbufs, + sg, sg_cnt); + } else { + *data_len = 0; + return 0; + } +} + +/** + * srpt_init_ch_qp - initialize queue pair attributes + * @ch: SRPT RDMA channel. + * @qp: Queue pair pointer. + * + * Initialized the attributes of queue pair 'qp' by allowing local write, + * remote read and remote write. Also transitions 'qp' to state IB_QPS_INIT. + */ +static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ + struct ib_qp_attr *attr; + int ret; + + WARN_ON_ONCE(ch->using_rdma_cm); + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + attr->qp_state = IB_QPS_INIT; + attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE; + attr->port_num = ch->sport->port; + + ret = ib_find_cached_pkey(ch->sport->sdev->device, ch->sport->port, + ch->pkey, &attr->pkey_index); + if (ret < 0) + pr_err("Translating pkey %#x failed (%d) - using index 0\n", + ch->pkey, ret); + + ret = ib_modify_qp(qp, attr, + IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PORT | + IB_QP_PKEY_INDEX); + + kfree(attr); + return ret; +} + +/** + * srpt_ch_qp_rtr - change the state of a channel to 'ready to receive' (RTR) + * @ch: channel of the queue pair. + * @qp: queue pair to change the state of. + * + * Returns zero upon success and a negative value upon failure. + * + * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system. + * If this structure ever becomes larger, it might be necessary to allocate + * it dynamically instead of on the stack. + */ +static int srpt_ch_qp_rtr(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int attr_mask; + int ret; + + WARN_ON_ONCE(ch->using_rdma_cm); + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(ch->ib_cm.cm_id, &qp_attr, &attr_mask); + if (ret) + goto out; + + qp_attr.max_dest_rd_atomic = 4; + + ret = ib_modify_qp(qp, &qp_attr, attr_mask); + +out: + return ret; +} + +/** + * srpt_ch_qp_rts - change the state of a channel to 'ready to send' (RTS) + * @ch: channel of the queue pair. + * @qp: queue pair to change the state of. + * + * Returns zero upon success and a negative value upon failure. + * + * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system. + * If this structure ever becomes larger, it might be necessary to allocate + * it dynamically instead of on the stack. 
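+ *
+ * For the IB/CM connection path the intended sequence is that the queue
+ * pair is moved to IB_QPS_INIT by srpt_init_ch_qp(), to IB_QPS_RTR by
+ * srpt_ch_qp_rtr() and finally to IB_QPS_RTS by this function, with the
+ * attributes for the last two transitions obtained from
+ * ib_cm_init_qp_attr().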
+ */ +static int srpt_ch_qp_rts(struct srpt_rdma_ch *ch, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int attr_mask; + int ret; + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(ch->ib_cm.cm_id, &qp_attr, &attr_mask); + if (ret) + goto out; + + qp_attr.max_rd_atomic = 4; + + ret = ib_modify_qp(qp, &qp_attr, attr_mask); + +out: + return ret; +} + +/** + * srpt_ch_qp_err - set the channel queue pair state to 'error' + * @ch: SRPT RDMA channel. + */ +static int srpt_ch_qp_err(struct srpt_rdma_ch *ch) +{ + struct ib_qp_attr qp_attr; + + qp_attr.qp_state = IB_QPS_ERR; + return ib_modify_qp(ch->qp, &qp_attr, IB_QP_STATE); +} + +/** + * srpt_get_send_ioctx - obtain an I/O context for sending to the initiator + * @ch: SRPT RDMA channel. + */ +static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) +{ + struct srpt_send_ioctx *ioctx; + unsigned long flags; + + BUG_ON(!ch); + + ioctx = NULL; + spin_lock_irqsave(&ch->spinlock, flags); + if (!list_empty(&ch->free_list)) { + ioctx = list_first_entry(&ch->free_list, + struct srpt_send_ioctx, free_list); + list_del(&ioctx->free_list); + } + spin_unlock_irqrestore(&ch->spinlock, flags); + + if (!ioctx) + return ioctx; + + BUG_ON(ioctx->ch != ch); + ioctx->state = SRPT_STATE_NEW; + ioctx->n_rdma = 0; + ioctx->n_rw_ctx = 0; + ioctx->queue_status_only = false; + /* + * transport_init_se_cmd() does not initialize all fields, so do it + * here. + */ + memset(&ioctx->cmd, 0, sizeof(ioctx->cmd)); + memset(&ioctx->sense_data, 0, sizeof(ioctx->sense_data)); + + return ioctx; +} + +/** + * srpt_abort_cmd - abort a SCSI command + * @ioctx: I/O context associated with the SCSI command. + */ +static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) +{ + enum srpt_command_state state; + + BUG_ON(!ioctx); + + /* + * If the command is in a state where the target core is waiting for + * the ib_srpt driver, change the state to the next state. + */ + + state = ioctx->state; + switch (state) { + case SRPT_STATE_NEED_DATA: + ioctx->state = SRPT_STATE_DATA_IN; + break; + case SRPT_STATE_CMD_RSP_SENT: + case SRPT_STATE_MGMT_RSP_SENT: + ioctx->state = SRPT_STATE_DONE; + break; + default: + WARN_ONCE(true, "%s: unexpected I/O context state %d\n", + __func__, state); + break; + } + + pr_debug("Aborting cmd with state %d -> %d and tag %lld\n", state, + ioctx->state, ioctx->cmd.tag); + + switch (state) { + case SRPT_STATE_NEW: + case SRPT_STATE_DATA_IN: + case SRPT_STATE_MGMT: + case SRPT_STATE_DONE: + /* + * Do nothing - defer abort processing until + * srpt_queue_response() is invoked. + */ + break; + case SRPT_STATE_NEED_DATA: + pr_debug("tag %#llx: RDMA read error\n", ioctx->cmd.tag); + transport_generic_request_failure(&ioctx->cmd, + TCM_CHECK_CONDITION_ABORT_CMD); + break; + case SRPT_STATE_CMD_RSP_SENT: + /* + * SRP_RSP sending failed or the SRP_RSP send completion has + * not been received in time. + */ + transport_generic_free_cmd(&ioctx->cmd, 0); + break; + case SRPT_STATE_MGMT_RSP_SENT: + transport_generic_free_cmd(&ioctx->cmd, 0); + break; + default: + WARN(1, "Unexpected command state (%d)", state); + break; + } + + return state; +} + +/** + * srpt_rdma_read_done - RDMA read completion callback + * @cq: Completion queue. + * @wc: Work completion. + * + * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping + * the data that has been transferred via IB RDMA had to be postponed until the + * check_stop_free() callback. None of this is necessary anymore and needs to + * be cleaned up. 
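+ *
+ * Note: the ioctx->n_rdma send queue slots that were reserved for the RDMA
+ * READ are credited back to ch->sq_wr_avail below before the command is
+ * either executed or aborted.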
+ */ +static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srpt_rdma_ch *ch = cq->cq_context; + struct srpt_send_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe); + + WARN_ON(ioctx->n_rdma <= 0); + atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + ioctx->n_rdma = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n", + ioctx, wc->status); + srpt_abort_cmd(ioctx); + return; + } + + if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA, + SRPT_STATE_DATA_IN)) + target_execute_cmd(&ioctx->cmd); + else + pr_err("%s[%d]: wrong state = %d\n", __func__, + __LINE__, ioctx->state); +} + +/** + * srpt_build_cmd_rsp - build a SRP_RSP response + * @ch: RDMA channel through which the request has been received. + * @ioctx: I/O context associated with the SRP_CMD request. The response will + * be built in the buffer ioctx->buf points at and hence this function will + * overwrite the request data. + * @tag: tag of the request for which this response is being generated. + * @status: value for the STATUS field of the SRP_RSP information unit. + * + * Returns the size in bytes of the SRP_RSP response. + * + * An SRP_RSP response contains a SCSI status or service response. See also + * section 6.9 in the SRP r16a document for the format of an SRP_RSP + * response. See also SPC-2 for more information about sense data. + */ +static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, u64 tag, + int status) +{ + struct srp_rsp *srp_rsp; + const u8 *sense_data; + int sense_data_len, max_sense_len; + + /* + * The lowest bit of all SAM-3 status codes is zero (see also + * paragraph 5.3 in SAM-3). + */ + WARN_ON(status & 1); + + srp_rsp = ioctx->ioctx.buf; + BUG_ON(!srp_rsp); + + sense_data = ioctx->sense_data; + sense_data_len = ioctx->cmd.scsi_sense_length; + WARN_ON(sense_data_len > sizeof(ioctx->sense_data)); + + memset(srp_rsp, 0, sizeof(*srp_rsp)); + srp_rsp->opcode = SRP_RSP; + srp_rsp->req_lim_delta = + cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0)); + srp_rsp->tag = tag; + srp_rsp->status = status; + + if (sense_data_len) { + BUILD_BUG_ON(MIN_MAX_RSP_SIZE <= sizeof(*srp_rsp)); + max_sense_len = ch->max_ti_iu_len - sizeof(*srp_rsp); + if (sense_data_len > max_sense_len) { + pr_warn("truncated sense data from %d to %d" + " bytes\n", sense_data_len, max_sense_len); + sense_data_len = max_sense_len; + } + + srp_rsp->flags |= SRP_RSP_FLAG_SNSVALID; + srp_rsp->sense_data_len = cpu_to_be32(sense_data_len); + memcpy(srp_rsp + 1, sense_data, sense_data_len); + } + + return sizeof(*srp_rsp) + sense_data_len; +} + +/** + * srpt_build_tskmgmt_rsp - build a task management response + * @ch: RDMA channel through which the request has been received. + * @ioctx: I/O context in which the SRP_RSP response will be built. + * @rsp_code: RSP_CODE that will be stored in the response. + * @tag: Tag of the request for which this response is being generated. + * + * Returns the size in bytes of the SRP_RSP response. + * + * An SRP_RSP response contains a SCSI status or service response. See also + * section 6.9 in the SRP r16a document for the format of an SRP_RSP + * response. 
+ */ +static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx, + u8 rsp_code, u64 tag) +{ + struct srp_rsp *srp_rsp; + int resp_data_len; + int resp_len; + + resp_data_len = 4; + resp_len = sizeof(*srp_rsp) + resp_data_len; + + srp_rsp = ioctx->ioctx.buf; + BUG_ON(!srp_rsp); + memset(srp_rsp, 0, sizeof(*srp_rsp)); + + srp_rsp->opcode = SRP_RSP; + srp_rsp->req_lim_delta = + cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0)); + srp_rsp->tag = tag; + + srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID; + srp_rsp->resp_data_len = cpu_to_be32(resp_data_len); + srp_rsp->data[3] = rsp_code; + + return resp_len; +} + +static int srpt_check_stop_free(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx = container_of(cmd, + struct srpt_send_ioctx, cmd); + + return target_put_sess_cmd(&ioctx->cmd); +} + +/** + * srpt_handle_cmd - process a SRP_CMD information unit + * @ch: SRPT RDMA channel. + * @recv_ioctx: Receive I/O context. + * @send_ioctx: Send I/O context. + */ +static void srpt_handle_cmd(struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *recv_ioctx, + struct srpt_send_ioctx *send_ioctx) +{ + struct se_cmd *cmd; + struct srp_cmd *srp_cmd; + struct scatterlist *sg = NULL; + unsigned sg_cnt = 0; + u64 data_len; + enum dma_data_direction dir; + int rc; + + BUG_ON(!send_ioctx); + + srp_cmd = recv_ioctx->ioctx.buf; + cmd = &send_ioctx->cmd; + cmd->tag = srp_cmd->tag; + + switch (srp_cmd->task_attr) { + case SRP_CMD_SIMPLE_Q: + cmd->sam_task_attr = TCM_SIMPLE_TAG; + break; + case SRP_CMD_ORDERED_Q: + default: + cmd->sam_task_attr = TCM_ORDERED_TAG; + break; + case SRP_CMD_HEAD_OF_Q: + cmd->sam_task_attr = TCM_HEAD_TAG; + break; + case SRP_CMD_ACA: + cmd->sam_task_attr = TCM_ACA_TAG; + break; + } + + rc = srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &sg, &sg_cnt, + &data_len); + if (rc) { + if (rc != -EAGAIN) { + pr_err("0x%llx: parsing SRP descriptor table failed.\n", + srp_cmd->tag); + } + goto release_ioctx; + } + + rc = target_submit_cmd_map_sgls(cmd, ch->sess, srp_cmd->cdb, + &send_ioctx->sense_data[0], + scsilun_to_int(&srp_cmd->lun), data_len, + TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF, + sg, sg_cnt, NULL, 0, NULL, 0); + if (rc != 0) { + pr_debug("target_submit_cmd() returned %d for tag %#llx\n", rc, + srp_cmd->tag); + goto release_ioctx; + } + return; + +release_ioctx: + send_ioctx->state = SRPT_STATE_DONE; + srpt_release_cmd(cmd); +} + +static int srp_tmr_to_tcm(int fn) +{ + switch (fn) { + case SRP_TSK_ABORT_TASK: + return TMR_ABORT_TASK; + case SRP_TSK_ABORT_TASK_SET: + return TMR_ABORT_TASK_SET; + case SRP_TSK_CLEAR_TASK_SET: + return TMR_CLEAR_TASK_SET; + case SRP_TSK_LUN_RESET: + return TMR_LUN_RESET; + case SRP_TSK_CLEAR_ACA: + return TMR_CLEAR_ACA; + default: + return -1; + } +} + +/** + * srpt_handle_tsk_mgmt - process a SRP_TSK_MGMT information unit + * @ch: SRPT RDMA channel. + * @recv_ioctx: Receive I/O context. + * @send_ioctx: Send I/O context. + * + * Returns 0 if and only if the request will be processed by the target core. + * + * For more information about SRP_TSK_MGMT information units, see also section + * 6.7 in the SRP r16a document. 
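+ *
+ * For example, an SRP_TSK_LUN_RESET function code is translated by
+ * srp_tmr_to_tcm() above into TMR_LUN_RESET before being passed to
+ * target_submit_tmr(); an unrecognized function code is mapped to -1, and
+ * if target_submit_tmr() then fails the response code is set to
+ * TMR_FUNCTION_REJECTED.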
+ */ +static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *recv_ioctx, + struct srpt_send_ioctx *send_ioctx) +{ + struct srp_tsk_mgmt *srp_tsk; + struct se_cmd *cmd; + struct se_session *sess = ch->sess; + int tcm_tmr; + int rc; + + BUG_ON(!send_ioctx); + + srp_tsk = recv_ioctx->ioctx.buf; + cmd = &send_ioctx->cmd; + + pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld ch %p sess %p\n", + srp_tsk->tsk_mgmt_func, srp_tsk->task_tag, srp_tsk->tag, ch, + ch->sess); + + srpt_set_cmd_state(send_ioctx, SRPT_STATE_MGMT); + send_ioctx->cmd.tag = srp_tsk->tag; + tcm_tmr = srp_tmr_to_tcm(srp_tsk->tsk_mgmt_func); + rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, + scsilun_to_int(&srp_tsk->lun), srp_tsk, tcm_tmr, + GFP_KERNEL, srp_tsk->task_tag, + TARGET_SCF_ACK_KREF); + if (rc != 0) { + send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED; + goto fail; + } + return; +fail: + transport_send_check_condition_and_sense(cmd, 0, 0); // XXX: +} + +/** + * srpt_handle_new_iu - process a newly received information unit + * @ch: RDMA channel through which the information unit has been received. + * @recv_ioctx: Receive I/O context associated with the information unit. + */ +static bool +srpt_handle_new_iu(struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *recv_ioctx) +{ + struct srpt_send_ioctx *send_ioctx = NULL; + struct srp_cmd *srp_cmd; + bool res = false; + u8 opcode; + + BUG_ON(!ch); + BUG_ON(!recv_ioctx); + + if (unlikely(ch->state == CH_CONNECTING)) + goto push; + + ib_dma_sync_single_for_cpu(ch->sport->sdev->device, + recv_ioctx->ioctx.dma, srp_max_req_size, + DMA_FROM_DEVICE); + + srp_cmd = recv_ioctx->ioctx.buf; + opcode = srp_cmd->opcode; + if (opcode == SRP_CMD || opcode == SRP_TSK_MGMT) { + send_ioctx = srpt_get_send_ioctx(ch); + if (unlikely(!send_ioctx)) + goto push; + } + + if (!list_empty(&recv_ioctx->wait_list)) { + WARN_ON_ONCE(!ch->processing_wait_list); + list_del_init(&recv_ioctx->wait_list); + } + + switch (opcode) { + case SRP_CMD: + srpt_handle_cmd(ch, recv_ioctx, send_ioctx); + break; + case SRP_TSK_MGMT: + srpt_handle_tsk_mgmt(ch, recv_ioctx, send_ioctx); + break; + case SRP_I_LOGOUT: + pr_err("Not yet implemented: SRP_I_LOGOUT\n"); + break; + case SRP_CRED_RSP: + pr_debug("received SRP_CRED_RSP\n"); + break; + case SRP_AER_RSP: + pr_debug("received SRP_AER_RSP\n"); + break; + case SRP_RSP: + pr_err("Received SRP_RSP\n"); + break; + default: + pr_err("received IU with unknown opcode 0x%x\n", opcode); + break; + } + + srpt_post_recv(ch->sport->sdev, ch, recv_ioctx); + res = true; + +out: + return res; + +push: + if (list_empty(&recv_ioctx->wait_list)) { + WARN_ON_ONCE(ch->processing_wait_list); + list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); + } + goto out; +} + +static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srpt_rdma_ch *ch = cq->cq_context; + struct srpt_recv_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_recv_ioctx, ioctx.cqe); + + if (wc->status == IB_WC_SUCCESS) { + int req_lim; + + req_lim = atomic_dec_return(&ch->req_lim); + if (unlikely(req_lim < 0)) + pr_err("req_lim = %d < 0\n", req_lim); + srpt_handle_new_iu(ch, ioctx); + } else { + pr_info_ratelimited("receiving failed for ioctx %p with status %d\n", + ioctx, wc->status); + } +} + +/* + * This function must be called from the context in which RDMA completions are + * processed because it accesses the wait list without protection against + * access from other threads. 
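+ *
+ * In this driver that context is the CQ completion workqueue: the channel
+ * CQ is allocated with IB_POLL_WORKQUEUE in srpt_create_ch_ib() below, and
+ * the callers of this function, srpt_zerolength_write_done() and
+ * srpt_send_done(), are completion handlers of that CQ.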
+ */ +static void srpt_process_wait_list(struct srpt_rdma_ch *ch) +{ + struct srpt_recv_ioctx *recv_ioctx, *tmp; + + WARN_ON_ONCE(ch->state == CH_CONNECTING); + + if (list_empty(&ch->cmd_wait_list)) + return; + + WARN_ON_ONCE(ch->processing_wait_list); + ch->processing_wait_list = true; + list_for_each_entry_safe(recv_ioctx, tmp, &ch->cmd_wait_list, + wait_list) { + if (!srpt_handle_new_iu(ch, recv_ioctx)) + break; + } + ch->processing_wait_list = false; +} + +/** + * srpt_send_done - send completion callback + * @cq: Completion queue. + * @wc: Work completion. + * + * Note: Although this has not yet been observed during tests, at least in + * theory it is possible that the srpt_get_send_ioctx() call invoked by + * srpt_handle_new_iu() fails. This is possible because the req_lim_delta + * value in each response is set to one, and it is possible that this response + * makes the initiator send a new request before the send completion for that + * response has been processed. This could e.g. happen if the call to + * srpt_put_send_iotcx() is delayed because of a higher priority interrupt or + * if IB retransmission causes generation of the send completion to be + * delayed. Incoming information units for which srpt_get_send_ioctx() fails + * are queued on cmd_wait_list. The code below processes these delayed + * requests one at a time. + */ +static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srpt_rdma_ch *ch = cq->cq_context; + struct srpt_send_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe); + enum srpt_command_state state; + + state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + + WARN_ON(state != SRPT_STATE_CMD_RSP_SENT && + state != SRPT_STATE_MGMT_RSP_SENT); + + atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail); + + if (wc->status != IB_WC_SUCCESS) + pr_info("sending response for ioctx 0x%p failed" + " with status %d\n", ioctx, wc->status); + + if (state != SRPT_STATE_DONE) { + transport_generic_free_cmd(&ioctx->cmd, 0); + } else { + pr_err("IB completion has been received too late for" + " wr_id = %u.\n", ioctx->ioctx.index); + } + + srpt_process_wait_list(ch); +} + +/** + * srpt_create_ch_ib - create receive and send completion queues + * @ch: SRPT RDMA channel. + */ +static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) +{ + struct ib_qp_init_attr *qp_init; + struct srpt_port *sport = ch->sport; + struct srpt_device *sdev = sport->sdev; + const struct ib_device_attr *attrs = &sdev->device->attrs; + int sq_size = sport->port_attrib.srp_sq_size; + int i, ret; + + WARN_ON(ch->rq_size < 1); + + ret = -ENOMEM; + qp_init = kzalloc(sizeof(*qp_init), GFP_KERNEL); + if (!qp_init) + goto out; + +retry: + ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + sq_size, + 0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE); + if (IS_ERR(ch->cq)) { + ret = PTR_ERR(ch->cq); + pr_err("failed to create CQ cqe= %d ret= %d\n", + ch->rq_size + sq_size, ret); + goto out; + } + + qp_init->qp_context = (void *)ch; + qp_init->event_handler + = (void(*)(struct ib_event *, void*))srpt_qp_event; + qp_init->send_cq = ch->cq; + qp_init->recv_cq = ch->cq; + qp_init->sq_sig_type = IB_SIGNAL_REQ_WR; + qp_init->qp_type = IB_QPT_RC; + /* + * We divide up our send queue size into half SEND WRs to send the + * completions, and half R/W contexts to actually do the RDMA + * READ/WRITE transfers. Note that we need to allocate CQ slots for + * both both, as RDMA contexts will also post completions for the + * RDMA READ case. 
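+ *
+ * Worked example with an illustrative sq_size of 4096: max_send_wr and
+ * max_rdma_ctxs each become 2048 (the former possibly capped further by the
+ * device's max_qp_wr) and the CQ allocated above is sized for
+ * rq_size + 4096 completion entries.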
+ */ + qp_init->cap.max_send_wr = min(sq_size / 2, attrs->max_qp_wr); + qp_init->cap.max_rdma_ctxs = sq_size / 2; + qp_init->cap.max_send_sge = min(attrs->max_sge, SRPT_MAX_SG_PER_WQE); + qp_init->port_num = ch->sport->port; + if (sdev->use_srq) { + qp_init->srq = sdev->srq; + } else { + qp_init->cap.max_recv_wr = ch->rq_size; + qp_init->cap.max_recv_sge = qp_init->cap.max_send_sge; + } + + if (ch->using_rdma_cm) { + ret = rdma_create_qp(ch->rdma_cm.cm_id, sdev->pd, qp_init); + ch->qp = ch->rdma_cm.cm_id->qp; + } else { + ch->qp = ib_create_qp(sdev->pd, qp_init); + if (!IS_ERR(ch->qp)) { + ret = srpt_init_ch_qp(ch, ch->qp); + if (ret) + ib_destroy_qp(ch->qp); + } else { + ret = PTR_ERR(ch->qp); + } + } + if (ret) { + bool retry = sq_size > MIN_SRPT_SQ_SIZE; + + if (retry) { + pr_debug("failed to create queue pair with sq_size = %d (%d) - retrying\n", + sq_size, ret); + ib_free_cq(ch->cq); + sq_size = max(sq_size / 2, MIN_SRPT_SQ_SIZE); + goto retry; + } else { + pr_err("failed to create queue pair with sq_size = %d (%d)\n", + sq_size, ret); + goto err_destroy_cq; + } + } + + atomic_set(&ch->sq_wr_avail, qp_init->cap.max_send_wr); + + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d ch= %p\n", + __func__, ch->cq->cqe, qp_init->cap.max_send_sge, + qp_init->cap.max_send_wr, ch); + + if (!sdev->use_srq) + for (i = 0; i < ch->rq_size; i++) + srpt_post_recv(sdev, ch, ch->ioctx_recv_ring[i]); + +out: + kfree(qp_init); + return ret; + +err_destroy_cq: + ch->qp = NULL; + ib_free_cq(ch->cq); + goto out; +} + +static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch) +{ + ib_destroy_qp(ch->qp); + ib_free_cq(ch->cq); +} + +/** + * srpt_close_ch - close a RDMA channel + * @ch: SRPT RDMA channel. + * + * Make sure all resources associated with the channel will be deallocated at + * an appropriate time. + * + * Returns true if and only if the channel state has been modified into + * CH_DRAINING. + */ +static bool srpt_close_ch(struct srpt_rdma_ch *ch) +{ + int ret; + + if (!srpt_set_ch_state(ch, CH_DRAINING)) { + pr_debug("%s-%d: already closed\n", ch->sess_name, + ch->qp->qp_num); + return false; + } + + kref_get(&ch->kref); + + ret = srpt_ch_qp_err(ch); + if (ret < 0) + pr_err("%s-%d: changing queue pair into error state failed: %d\n", + ch->sess_name, ch->qp->qp_num, ret); + + ret = srpt_zerolength_write(ch); + if (ret < 0) { + pr_err("%s-%d: queuing zero-length write failed: %d\n", + ch->sess_name, ch->qp->qp_num, ret); + if (srpt_set_ch_state(ch, CH_DISCONNECTED)) + schedule_work(&ch->release_work); + else + WARN_ON_ONCE(true); + } + + kref_put(&ch->kref, srpt_free_ch); + + return true; +} + +/* + * Change the channel state into CH_DISCONNECTING. If a channel has not yet + * reached the connected state, close it. If a channel is in the connected + * state, send a DREQ. If a DREQ has been received, send a DREP. Note: it is + * the responsibility of the caller to ensure that this function is not + * invoked concurrently with the code that accepts a connection. This means + * that this function must either be invoked from inside a CM callback + * function or that it must be invoked with the srpt_port.mutex held. 
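+ *
+ * Both calling patterns occur below: srpt_disconnect_ch_sync() takes
+ * sport->mutex around its call to this function, while __srpt_close_all_ch()
+ * relies on its caller already holding that mutex, as documented by its
+ * lockdep_assert_held() annotation.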
+ */ +static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) +{ + int ret; + + if (!srpt_set_ch_state(ch, CH_DISCONNECTING)) + return -ENOTCONN; + + if (ch->using_rdma_cm) { + ret = rdma_disconnect(ch->rdma_cm.cm_id); + } else { + ret = ib_send_cm_dreq(ch->ib_cm.cm_id, NULL, 0); + if (ret < 0) + ret = ib_send_cm_drep(ch->ib_cm.cm_id, NULL, 0); + } + + if (ret < 0 && srpt_close_ch(ch)) + ret = 0; + + return ret; +} + +static bool srpt_ch_closed(struct srpt_port *sport, struct srpt_rdma_ch *ch) +{ + struct srpt_nexus *nexus; + struct srpt_rdma_ch *ch2; + bool res = true; + + rcu_read_lock(); + list_for_each_entry(nexus, &sport->nexus_list, entry) { + list_for_each_entry(ch2, &nexus->ch_list, list) { + if (ch2 == ch) { + res = false; + goto done; + } + } + } +done: + rcu_read_unlock(); + + return res; +} + +/* Send DREQ and wait for DREP. */ +static void srpt_disconnect_ch_sync(struct srpt_rdma_ch *ch) +{ + struct srpt_port *sport = ch->sport; + + pr_debug("ch %s-%d state %d\n", ch->sess_name, ch->qp->qp_num, + ch->state); + + mutex_lock(&sport->mutex); + srpt_disconnect_ch(ch); + mutex_unlock(&sport->mutex); + + while (wait_event_timeout(sport->ch_releaseQ, srpt_ch_closed(sport, ch), + 5 * HZ) == 0) + pr_info("%s(%s-%d state %d): still waiting ...\n", __func__, + ch->sess_name, ch->qp->qp_num, ch->state); + +} + +static void __srpt_close_all_ch(struct srpt_port *sport) +{ + struct srpt_nexus *nexus; + struct srpt_rdma_ch *ch; + + lockdep_assert_held(&sport->mutex); + + list_for_each_entry(nexus, &sport->nexus_list, entry) { + list_for_each_entry(ch, &nexus->ch_list, list) { + if (srpt_disconnect_ch(ch) >= 0) + pr_info("Closing channel %s-%d because target %s_%d has been disabled\n", + ch->sess_name, ch->qp->qp_num, + sport->sdev->device->name, sport->port); + srpt_close_ch(ch); + } + } +} + +/* + * Look up (i_port_id, t_port_id) in sport->nexus_list. Create an entry if + * it does not yet exist. 
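srpt_get_nexus() below follows a lookup-or-create pattern: allocate a candidate entry outside the mutex, redo the lookup, publish the candidate only if the entry is still missing, and free it if another thread won the race. A compact user-space model of the same loop; the node type and lookup_or_create() are illustrative, not driver code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	struct node *next;
	char key[16];
};

static struct node *node_list;
static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;

static struct node *lookup_or_create(const char key[16])
{
	struct node *found = NULL, *prealloc = NULL, *n;

	for (;;) {
		pthread_mutex_lock(&node_lock);
		for (n = node_list; n; n = n->next)
			if (!memcmp(n->key, key, 16)) {
				found = n;
				break;
			}
		if (!found && prealloc) {
			/* still missing: publish the preallocated entry */
			prealloc->next = node_list;
			node_list = prealloc;
			found = prealloc;
			prealloc = NULL;
		}
		pthread_mutex_unlock(&node_lock);

		if (found)
			break;
		/* allocate outside the lock, then retry the lookup */
		prealloc = calloc(1, sizeof(*prealloc));
		if (!prealloc)
			return NULL;
		memcpy(prealloc->key, key, 16);
	}
	free(prealloc);	/* non-NULL only if another thread won the race */
	return found;
}

int main(void)
{
	const char key[16] = "example-nexus";

	printf("%s\n", lookup_or_create(key) == lookup_or_create(key) ?
	       "same entry returned" : "bug");
	return 0;
}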
+ */ +static struct srpt_nexus *srpt_get_nexus(struct srpt_port *sport, + const u8 i_port_id[16], + const u8 t_port_id[16]) +{ + struct srpt_nexus *nexus = NULL, *tmp_nexus = NULL, *n; + + for (;;) { + mutex_lock(&sport->mutex); + list_for_each_entry(n, &sport->nexus_list, entry) { + if (memcmp(n->i_port_id, i_port_id, 16) == 0 && + memcmp(n->t_port_id, t_port_id, 16) == 0) { + nexus = n; + break; + } + } + if (!nexus && tmp_nexus) { + list_add_tail_rcu(&tmp_nexus->entry, + &sport->nexus_list); + swap(nexus, tmp_nexus); + } + mutex_unlock(&sport->mutex); + + if (nexus) + break; + tmp_nexus = kzalloc(sizeof(*nexus), GFP_KERNEL); + if (!tmp_nexus) { + nexus = ERR_PTR(-ENOMEM); + break; + } + INIT_LIST_HEAD(&tmp_nexus->ch_list); + memcpy(tmp_nexus->i_port_id, i_port_id, 16); + memcpy(tmp_nexus->t_port_id, t_port_id, 16); + } + + kfree(tmp_nexus); + + return nexus; +} + +static void srpt_set_enabled(struct srpt_port *sport, bool enabled) + __must_hold(&sport->mutex) +{ + lockdep_assert_held(&sport->mutex); + + if (sport->enabled == enabled) + return; + sport->enabled = enabled; + if (!enabled) + __srpt_close_all_ch(sport); +} + +static void srpt_free_ch(struct kref *kref) +{ + struct srpt_rdma_ch *ch = container_of(kref, struct srpt_rdma_ch, kref); + + kfree_rcu(ch, rcu); +} + +static void srpt_release_channel_work(struct work_struct *w) +{ + struct srpt_rdma_ch *ch; + struct srpt_device *sdev; + struct srpt_port *sport; + struct se_session *se_sess; + + ch = container_of(w, struct srpt_rdma_ch, release_work); + pr_debug("%s-%d\n", ch->sess_name, ch->qp->qp_num); + + sdev = ch->sport->sdev; + BUG_ON(!sdev); + + se_sess = ch->sess; + BUG_ON(!se_sess); + + target_sess_cmd_list_set_waiting(se_sess); + target_wait_for_sess_cmds(se_sess); + + transport_deregister_session_configfs(se_sess); + transport_deregister_session(se_sess); + ch->sess = NULL; + + if (ch->using_rdma_cm) + rdma_destroy_id(ch->rdma_cm.cm_id); + else + ib_destroy_cm_id(ch->ib_cm.cm_id); + + srpt_destroy_ch_ib(ch); + + srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, + ch->sport->sdev, ch->rq_size, + ch->max_rsp_size, DMA_TO_DEVICE); + + srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring, + sdev, ch->rq_size, + srp_max_req_size, DMA_FROM_DEVICE); + + sport = ch->sport; + mutex_lock(&sport->mutex); + list_del_rcu(&ch->list); + mutex_unlock(&sport->mutex); + + wake_up(&sport->ch_releaseQ); + + kref_put(&ch->kref, srpt_free_ch); +} + +/** + * srpt_cm_req_recv - process the event IB_CM_REQ_RECEIVED + * @sdev: HCA through which the login request was received. + * @ib_cm_id: IB/CM connection identifier in case of IB/CM. + * @rdma_cm_id: RDMA/CM connection identifier in case of RDMA/CM. + * @port_num: Port through which the REQ message was received. + * @pkey: P_Key of the incoming connection. + * @req: SRP login request. + * @src_addr: GID (IB/CM) or IP address (RDMA/CM) of the port that submitted + * the login request. + * + * Ownership of the cm_id is transferred to the target session if this + * function returns zero. Otherwise the caller remains the owner of cm_id. 
+ */ +static int srpt_cm_req_recv(struct srpt_device *const sdev, + struct ib_cm_id *ib_cm_id, + struct rdma_cm_id *rdma_cm_id, + u8 port_num, __be16 pkey, + const struct srp_login_req *req, + const char *src_addr) +{ + struct srpt_port *sport = &sdev->port[port_num - 1]; + struct srpt_nexus *nexus; + struct srp_login_rsp *rsp = NULL; + struct srp_login_rej *rej = NULL; + union { + struct rdma_conn_param rdma_cm; + struct ib_cm_rep_param ib_cm; + } *rep_param = NULL; + struct srpt_rdma_ch *ch; + char i_port_id[36]; + u32 it_iu_len; + int i, ret; + + WARN_ON_ONCE(irqs_disabled()); + + if (WARN_ON(!sdev || !req)) + return -EINVAL; + + it_iu_len = be32_to_cpu(req->req_it_iu_len); + + pr_info("Received SRP_LOGIN_REQ with i_port_id %pI6, t_port_id %pI6 and it_iu_len %d on port %d (guid=%pI6); pkey %#04x\n", + req->initiator_port_id, req->target_port_id, it_iu_len, + port_num, &sport->gid, be16_to_cpu(pkey)); + + nexus = srpt_get_nexus(sport, req->initiator_port_id, + req->target_port_id); + if (IS_ERR(nexus)) { + ret = PTR_ERR(nexus); + goto out; + } + + ret = -ENOMEM; + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + rej = kzalloc(sizeof(*rej), GFP_KERNEL); + rep_param = kzalloc(sizeof(*rep_param), GFP_KERNEL); + if (!rsp || !rej || !rep_param) + goto out; + + ret = -EINVAL; + if (it_iu_len > srp_max_req_size || it_iu_len < 64) { + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); + pr_err("rejected SRP_LOGIN_REQ because its length (%d bytes) is out of range (%d .. %d)\n", + it_iu_len, 64, srp_max_req_size); + goto reject; + } + + if (!sport->enabled) { + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_info("rejected SRP_LOGIN_REQ because target port %s_%d has not yet been enabled\n", + sport->sdev->device->name, port_num); + goto reject; + } + + if (*(__be64 *)req->target_port_id != cpu_to_be64(srpt_service_guid) + || *(__be64 *)(req->target_port_id + 8) != + cpu_to_be64(srpt_service_guid)) { + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); + pr_err("rejected SRP_LOGIN_REQ because it has an invalid target port identifier.\n"); + goto reject; + } + + ret = -ENOMEM; + ch = kzalloc(sizeof(*ch), GFP_KERNEL); + if (!ch) { + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_err("rejected SRP_LOGIN_REQ because out of memory.\n"); + goto reject; + } + + kref_init(&ch->kref); + ch->pkey = be16_to_cpu(pkey); + ch->nexus = nexus; + ch->zw_cqe.done = srpt_zerolength_write_done; + INIT_WORK(&ch->release_work, srpt_release_channel_work); + ch->sport = sport; + if (ib_cm_id) { + ch->ib_cm.cm_id = ib_cm_id; + ib_cm_id->context = ch; + } else { + ch->using_rdma_cm = true; + ch->rdma_cm.cm_id = rdma_cm_id; + rdma_cm_id->context = ch; + } + /* + * ch->rq_size should be at least as large as the initiator queue + * depth to avoid that the initiator driver has to report QUEUE_FULL + * to the SCSI mid-layer. 
+ */ + ch->rq_size = min(MAX_SRPT_RQ_SIZE, sdev->device->attrs.max_qp_wr); + spin_lock_init(&ch->spinlock); + ch->state = CH_CONNECTING; + INIT_LIST_HEAD(&ch->cmd_wait_list); + ch->max_rsp_size = ch->sport->port_attrib.srp_max_rsp_size; + + ch->ioctx_ring = (struct srpt_send_ioctx **) + srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size, + sizeof(*ch->ioctx_ring[0]), + ch->max_rsp_size, DMA_TO_DEVICE); + if (!ch->ioctx_ring) { + pr_err("rejected SRP_LOGIN_REQ because creating a new QP SQ ring failed.\n"); + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + goto free_ch; + } + + INIT_LIST_HEAD(&ch->free_list); + for (i = 0; i < ch->rq_size; i++) { + ch->ioctx_ring[i]->ch = ch; + list_add_tail(&ch->ioctx_ring[i]->free_list, &ch->free_list); + } + if (!sdev->use_srq) { + ch->ioctx_recv_ring = (struct srpt_recv_ioctx **) + srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size, + sizeof(*ch->ioctx_recv_ring[0]), + srp_max_req_size, + DMA_FROM_DEVICE); + if (!ch->ioctx_recv_ring) { + pr_err("rejected SRP_LOGIN_REQ because creating a new QP RQ ring failed.\n"); + rej->reason = + cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + goto free_ring; + } + for (i = 0; i < ch->rq_size; i++) + INIT_LIST_HEAD(&ch->ioctx_recv_ring[i]->wait_list); + } + + ret = srpt_create_ch_ib(ch); + if (ret) { + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_err("rejected SRP_LOGIN_REQ because creating a new RDMA channel failed.\n"); + goto free_recv_ring; + } + + strlcpy(ch->sess_name, src_addr, sizeof(ch->sess_name)); + snprintf(i_port_id, sizeof(i_port_id), "0x%016llx%016llx", + be64_to_cpu(*(__be64 *)nexus->i_port_id), + be64_to_cpu(*(__be64 *)(nexus->i_port_id + 8))); + + pr_debug("registering session %s\n", ch->sess_name); + + if (sport->port_guid_tpg.se_tpg_wwn) + ch->sess = target_alloc_session(&sport->port_guid_tpg, 0, 0, + TARGET_PROT_NORMAL, + ch->sess_name, ch, NULL); + if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess)) + ch->sess = target_alloc_session(&sport->port_gid_tpg, 0, 0, + TARGET_PROT_NORMAL, i_port_id, ch, + NULL); + /* Retry without leading "0x" */ + if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess)) + ch->sess = target_alloc_session(&sport->port_gid_tpg, 0, 0, + TARGET_PROT_NORMAL, + i_port_id + 2, ch, NULL); + if (IS_ERR_OR_NULL(ch->sess)) { + ret = PTR_ERR(ch->sess); + pr_info("Rejected login for initiator %s: ret = %d.\n", + ch->sess_name, ret); + rej->reason = cpu_to_be32(ret == -ENOMEM ? + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES : + SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); + goto reject; + } + + mutex_lock(&sport->mutex); + + if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { + struct srpt_rdma_ch *ch2; + + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; + + list_for_each_entry(ch2, &nexus->ch_list, list) { + if (srpt_disconnect_ch(ch2) < 0) + continue; + pr_info("Relogin - closed existing channel %s\n", + ch2->sess_name); + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_TERMINATED; + } + } else { + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; + } + + list_add_tail_rcu(&ch->list, &nexus->ch_list); + + if (!sport->enabled) { + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_info("rejected SRP_LOGIN_REQ because target %s_%d is not enabled\n", + sdev->device->name, port_num); + mutex_unlock(&sport->mutex); + goto reject; + } + + mutex_unlock(&sport->mutex); + + ret = ch->using_rdma_cm ? 
0 : srpt_ch_qp_rtr(ch, ch->qp); + if (ret) { + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_err("rejected SRP_LOGIN_REQ because enabling RTR failed (error code = %d)\n", + ret); + goto destroy_ib; + } + + pr_debug("Establish connection sess=%p name=%s ch=%p\n", ch->sess, + ch->sess_name, ch); + + /* create srp_login_response */ + rsp->opcode = SRP_LOGIN_RSP; + rsp->tag = req->tag; + rsp->max_it_iu_len = req->req_it_iu_len; + rsp->max_ti_iu_len = req->req_it_iu_len; + ch->max_ti_iu_len = it_iu_len; + rsp->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + SRP_BUF_FORMAT_INDIRECT); + rsp->req_lim_delta = cpu_to_be32(ch->rq_size); + atomic_set(&ch->req_lim, ch->rq_size); + atomic_set(&ch->req_lim_delta, 0); + + /* create cm reply */ + if (ch->using_rdma_cm) { + rep_param->rdma_cm.private_data = (void *)rsp; + rep_param->rdma_cm.private_data_len = sizeof(*rsp); + rep_param->rdma_cm.rnr_retry_count = 7; + rep_param->rdma_cm.flow_control = 1; + rep_param->rdma_cm.responder_resources = 4; + rep_param->rdma_cm.initiator_depth = 4; + } else { + rep_param->ib_cm.qp_num = ch->qp->qp_num; + rep_param->ib_cm.private_data = (void *)rsp; + rep_param->ib_cm.private_data_len = sizeof(*rsp); + rep_param->ib_cm.rnr_retry_count = 7; + rep_param->ib_cm.flow_control = 1; + rep_param->ib_cm.failover_accepted = 0; + rep_param->ib_cm.srq = 1; + rep_param->ib_cm.responder_resources = 4; + rep_param->ib_cm.initiator_depth = 4; + } + + /* + * Hold the sport mutex while accepting a connection to avoid that + * srpt_disconnect_ch() is invoked concurrently with this code. + */ + mutex_lock(&sport->mutex); + if (sport->enabled && ch->state == CH_CONNECTING) { + if (ch->using_rdma_cm) + ret = rdma_accept(rdma_cm_id, &rep_param->rdma_cm); + else + ret = ib_send_cm_rep(ib_cm_id, &rep_param->ib_cm); + } else { + ret = -EINVAL; + } + mutex_unlock(&sport->mutex); + + switch (ret) { + case 0: + break; + case -EINVAL: + goto reject; + default: + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_err("sending SRP_LOGIN_REQ response failed (error code = %d)\n", + ret); + goto reject; + } + + goto out; + +destroy_ib: + srpt_destroy_ch_ib(ch); + +free_recv_ring: + srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring, + ch->sport->sdev, ch->rq_size, + srp_max_req_size, DMA_FROM_DEVICE); + +free_ring: + srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, + ch->sport->sdev, ch->rq_size, + ch->max_rsp_size, DMA_TO_DEVICE); +free_ch: + if (ib_cm_id) + ib_cm_id->context = NULL; + kfree(ch); + ch = NULL; + + WARN_ON_ONCE(ret == 0); + +reject: + pr_info("Rejecting login with reason %#x\n", be32_to_cpu(rej->reason)); + rej->opcode = SRP_LOGIN_REJ; + rej->tag = req->tag; + rej->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + SRP_BUF_FORMAT_INDIRECT); + + if (rdma_cm_id) + rdma_reject(rdma_cm_id, rej, sizeof(*rej)); + else + ib_send_cm_rej(ib_cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, + rej, sizeof(*rej)); + +out: + kfree(rep_param); + kfree(rsp); + kfree(rej); + + return ret; +} + +static int srpt_ib_cm_req_recv(struct ib_cm_id *cm_id, + struct ib_cm_req_event_param *param, + void *private_data) +{ + char sguid[40]; + + srpt_format_guid(sguid, sizeof(sguid), + ¶m->primary_path->dgid.global.interface_id); + + return srpt_cm_req_recv(cm_id->context, cm_id, NULL, param->port, + param->primary_path->pkey, + private_data, sguid); +} + +static int srpt_rdma_cm_req_recv(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct srpt_device *sdev; + struct srp_login_req req; + const 
struct srp_login_req_rdma *req_rdma; + char src_addr[40]; + + sdev = ib_get_client_data(cm_id->device, &srpt_client); + if (!sdev) + return -ECONNREFUSED; + + if (event->param.conn.private_data_len < sizeof(*req_rdma)) + return -EINVAL; + + /* Transform srp_login_req_rdma into srp_login_req. */ + req_rdma = event->param.conn.private_data; + memset(&req, 0, sizeof(req)); + req.opcode = req_rdma->opcode; + req.tag = req_rdma->tag; + req.req_it_iu_len = req_rdma->req_it_iu_len; + req.req_buf_fmt = req_rdma->req_buf_fmt; + req.req_flags = req_rdma->req_flags; + memcpy(req.initiator_port_id, req_rdma->initiator_port_id, 16); + memcpy(req.target_port_id, req_rdma->target_port_id, 16); + + snprintf(src_addr, sizeof(src_addr), "%pIS", + &cm_id->route.addr.src_addr); + + return srpt_cm_req_recv(sdev, NULL, cm_id, cm_id->port_num, + cm_id->route.path_rec->pkey, &req, src_addr); +} + +static void srpt_cm_rej_recv(struct srpt_rdma_ch *ch, + enum ib_cm_rej_reason reason, + const u8 *private_data, + u8 private_data_len) +{ + char *priv = NULL; + int i; + + if (private_data_len && (priv = kmalloc(private_data_len * 3 + 1, + GFP_KERNEL))) { + for (i = 0; i < private_data_len; i++) + sprintf(priv + 3 * i, " %02x", private_data[i]); + } + pr_info("Received CM REJ for ch %s-%d; reason %d%s%s.\n", + ch->sess_name, ch->qp->qp_num, reason, private_data_len ? + "; private data" : "", priv ? priv : " (?)"); + kfree(priv); +} + +/** + * srpt_cm_rtu_recv - process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event + * @ch: SRPT RDMA channel. + * + * An RTU (ready to use) message indicates that the connection has been + * established and that the recipient may begin transmitting. + */ +static void srpt_cm_rtu_recv(struct srpt_rdma_ch *ch) +{ + int ret; + + ret = ch->using_rdma_cm ? 0 : srpt_ch_qp_rts(ch, ch->qp); + if (ret < 0) { + pr_err("%s-%d: QP transition to RTS failed\n", ch->sess_name, + ch->qp->qp_num); + srpt_close_ch(ch); + return; + } + + /* + * Note: calling srpt_close_ch() if the transition to the LIVE state + * fails is not necessary since that means that that function has + * already been invoked from another thread. + */ + if (!srpt_set_ch_state(ch, CH_LIVE)) { + pr_err("%s-%d: channel transition to LIVE state failed\n", + ch->sess_name, ch->qp->qp_num); + return; + } + + /* Trigger wait list processing. */ + ret = srpt_zerolength_write(ch); + WARN_ONCE(ret < 0, "%d\n", ret); +} + +/** + * srpt_cm_handler - IB connection manager callback function + * @cm_id: IB/CM connection identifier. + * @event: IB/CM event. + * + * A non-zero return value will cause the caller destroy the CM ID. + * + * Note: srpt_cm_handler() must only return a non-zero value when transferring + * ownership of the cm_id to a channel by srpt_cm_req_recv() failed. Returning + * a non-zero value in any other case will trigger a race with the + * ib_destroy_cm_id() call in srpt_release_channel(). 
+ */ +static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct srpt_rdma_ch *ch = cm_id->context; + int ret; + + ret = 0; + switch (event->event) { + case IB_CM_REQ_RECEIVED: + ret = srpt_ib_cm_req_recv(cm_id, &event->param.req_rcvd, + event->private_data); + break; + case IB_CM_REJ_RECEIVED: + srpt_cm_rej_recv(ch, event->param.rej_rcvd.reason, + event->private_data, + IB_CM_REJ_PRIVATE_DATA_SIZE); + break; + case IB_CM_RTU_RECEIVED: + case IB_CM_USER_ESTABLISHED: + srpt_cm_rtu_recv(ch); + break; + case IB_CM_DREQ_RECEIVED: + srpt_disconnect_ch(ch); + break; + case IB_CM_DREP_RECEIVED: + pr_info("Received CM DREP message for ch %s-%d.\n", + ch->sess_name, ch->qp->qp_num); + srpt_close_ch(ch); + break; + case IB_CM_TIMEWAIT_EXIT: + pr_info("Received CM TimeWait exit for ch %s-%d.\n", + ch->sess_name, ch->qp->qp_num); + srpt_close_ch(ch); + break; + case IB_CM_REP_ERROR: + pr_info("Received CM REP error for ch %s-%d.\n", ch->sess_name, + ch->qp->qp_num); + break; + case IB_CM_DREQ_ERROR: + pr_info("Received CM DREQ ERROR event.\n"); + break; + case IB_CM_MRA_RECEIVED: + pr_info("Received CM MRA event\n"); + break; + default: + pr_err("received unrecognized CM event %d\n", event->event); + break; + } + + return ret; +} + +static int srpt_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct srpt_rdma_ch *ch = cm_id->context; + int ret = 0; + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = srpt_rdma_cm_req_recv(cm_id, event); + break; + case RDMA_CM_EVENT_REJECTED: + srpt_cm_rej_recv(ch, event->status, + event->param.conn.private_data, + event->param.conn.private_data_len); + break; + case RDMA_CM_EVENT_ESTABLISHED: + srpt_cm_rtu_recv(ch); + break; + case RDMA_CM_EVENT_DISCONNECTED: + if (ch->state < CH_DISCONNECTING) + srpt_disconnect_ch(ch); + else + srpt_close_ch(ch); + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + srpt_close_ch(ch); + break; + case RDMA_CM_EVENT_UNREACHABLE: + pr_info("Received CM REP error for ch %s-%d.\n", ch->sess_name, + ch->qp->qp_num); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: + break; + default: + pr_err("received unrecognized RDMA CM event %d\n", + event->event); + break; + } + + return ret; +} + +static int srpt_write_pending_status(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + return ioctx->state == SRPT_STATE_NEED_DATA; +} + +/* + * srpt_write_pending - Start data transfer from initiator to target (write). 
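srpt_write_pending() below walks the R/W contexts from the last one down to the first and feeds the partial chain back into rdma_rw_ctx_wrs(), so a single ib_post_send() submits all RDMA READ work requests in order, and only the final work request carries the completion CQE (the cqe pointer is cleared after the first loop iteration). A toy user-space model of that reverse-order chaining; using one WR per context is a simplification, since a real rdma_rw context may expand to several WRs.

#include <stdio.h>

/* Toy work request: a singly linked node, loosely like struct ib_send_wr. */
struct wr {
	struct wr *next;
	int ctx;		/* which R/W context this WR belongs to */
};

int main(void)
{
	struct wr wrs[3] = { { .ctx = 0 }, { .ctx = 1 }, { .ctx = 2 } };
	struct wr *first_wr = NULL, *w;
	int i;

	/* Build the chain from the last context down to the first, exactly
	 * like the loop in srpt_write_pending(): the result is one chain
	 * that can be handed to a single post call, in submission order.
	 */
	for (i = 2; i >= 0; i--) {
		wrs[i].next = first_wr;
		first_wr = &wrs[i];
	}

	for (w = first_wr; w; w = w->next)
		printf("posting WR of context %d\n", w->ctx);
	return 0;
}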
+ */ +static int srpt_write_pending(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx = + container_of(se_cmd, struct srpt_send_ioctx, cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + struct ib_send_wr *first_wr = NULL, *bad_wr; + struct ib_cqe *cqe = &ioctx->rdma_cqe; + enum srpt_command_state new_state; + int ret, i; + + new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA); + WARN_ON(new_state == SRPT_STATE_DONE); + + if (atomic_sub_return(ioctx->n_rdma, &ch->sq_wr_avail) < 0) { + pr_warn("%s: IB send queue full (needed %d)\n", + __func__, ioctx->n_rdma); + ret = -ENOMEM; + goto out_undo; + } + + cqe->done = srpt_rdma_read_done; + for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, ch->sport->port, + cqe, first_wr); + cqe = NULL; + } + + ret = ib_post_send(ch->qp, first_wr, &bad_wr); + if (ret) { + pr_err("%s: ib_post_send() returned %d for %d (avail: %d)\n", + __func__, ret, ioctx->n_rdma, + atomic_read(&ch->sq_wr_avail)); + goto out_undo; + } + + return 0; +out_undo: + atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + return ret; +} + +static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) +{ + switch (tcm_mgmt_status) { + case TMR_FUNCTION_COMPLETE: + return SRP_TSK_MGMT_SUCCESS; + case TMR_FUNCTION_REJECTED: + return SRP_TSK_MGMT_FUNC_NOT_SUPP; + } + return SRP_TSK_MGMT_FAILED; +} + +/** + * srpt_queue_response - transmit the response to a SCSI command + * @cmd: SCSI target command. + * + * Callback function called by the TCM core. Must not block since it can be + * invoked on the context of the IB completion handler. + */ +static void srpt_queue_response(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx = + container_of(cmd, struct srpt_send_ioctx, cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + struct srpt_device *sdev = ch->sport->sdev; + struct ib_send_wr send_wr, *first_wr = &send_wr, *bad_wr; + struct ib_sge sge; + enum srpt_command_state state; + int resp_len, ret, i; + u8 srp_tm_status; + + BUG_ON(!ch); + + state = ioctx->state; + switch (state) { + case SRPT_STATE_NEW: + case SRPT_STATE_DATA_IN: + ioctx->state = SRPT_STATE_CMD_RSP_SENT; + break; + case SRPT_STATE_MGMT: + ioctx->state = SRPT_STATE_MGMT_RSP_SENT; + break; + default: + WARN(true, "ch %p; cmd %d: unexpected command state %d\n", + ch, ioctx->ioctx.index, ioctx->state); + break; + } + + if (unlikely(WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) + return; + + /* For read commands, transfer the data to the initiator. 
*/ + if (ioctx->cmd.data_direction == DMA_FROM_DEVICE && + ioctx->cmd.data_length && + !ioctx->queue_status_only) { + for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, + ch->sport->port, NULL, first_wr); + } + } + + if (state != SRPT_STATE_MGMT) + resp_len = srpt_build_cmd_rsp(ch, ioctx, ioctx->cmd.tag, + cmd->scsi_status); + else { + srp_tm_status + = tcm_to_srp_tsk_mgmt_status(cmd->se_tmr_req->response); + resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status, + ioctx->cmd.tag); + } + + atomic_inc(&ch->req_lim); + + if (unlikely(atomic_sub_return(1 + ioctx->n_rdma, + &ch->sq_wr_avail) < 0)) { + pr_warn("%s: IB send queue full (needed %d)\n", + __func__, ioctx->n_rdma); + ret = -ENOMEM; + goto out; + } + + ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, resp_len, + DMA_TO_DEVICE); + + sge.addr = ioctx->ioctx.dma; + sge.length = resp_len; + sge.lkey = sdev->lkey; + + ioctx->ioctx.cqe.done = srpt_send_done; + send_wr.next = NULL; + send_wr.wr_cqe = &ioctx->ioctx.cqe; + send_wr.sg_list = &sge; + send_wr.num_sge = 1; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(ch->qp, first_wr, &bad_wr); + if (ret < 0) { + pr_err("%s: sending cmd response failed for tag %llu (%d)\n", + __func__, ioctx->cmd.tag, ret); + goto out; + } + + return; + +out: + atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail); + atomic_dec(&ch->req_lim); + srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + target_put_sess_cmd(&ioctx->cmd); +} + +static int srpt_queue_data_in(struct se_cmd *cmd) +{ + srpt_queue_response(cmd); + return 0; +} + +static void srpt_queue_tm_rsp(struct se_cmd *cmd) +{ + srpt_queue_response(cmd); +} + +static void srpt_aborted_task(struct se_cmd *cmd) +{ +} + +static int srpt_queue_status(struct se_cmd *cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); + BUG_ON(ioctx->sense_data != cmd->sense_buffer); + if (cmd->se_cmd_flags & + (SCF_TRANSPORT_TASK_SENSE | SCF_EMULATED_TASK_SENSE)) + WARN_ON(cmd->scsi_status != SAM_STAT_CHECK_CONDITION); + ioctx->queue_status_only = true; + srpt_queue_response(cmd); + return 0; +} + +static void srpt_refresh_port_work(struct work_struct *work) +{ + struct srpt_port *sport = container_of(work, struct srpt_port, work); + + srpt_refresh_port(sport); +} + +static bool srpt_ch_list_empty(struct srpt_port *sport) +{ + struct srpt_nexus *nexus; + bool res = true; + + rcu_read_lock(); + list_for_each_entry(nexus, &sport->nexus_list, entry) + if (!list_empty(&nexus->ch_list)) + res = false; + rcu_read_unlock(); + + return res; +} + +/** + * srpt_release_sport - disable login and wait for associated channels + * @sport: SRPT HCA port. 
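srpt_release_sport() below disables the port and then waits for every channel to disappear, re-arming a 5 second timeout so that progress (or a stuck teardown) gets logged instead of blocking silently. A much simplified user-space model of that bounded-wait-and-log loop; all_channels_released() is a stand-in for srpt_ch_list_empty(), and the driver uses wait_event_timeout() rather than polling.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-in for srpt_ch_list_empty(); the driver walks sport->nexus_list
 * under RCU here.
 */
static bool all_channels_released(void)
{
	return true;
}

/* Wait in bounded slices and log each time the timeout expires, mirroring
 * the 5 * HZ period used by srpt_release_sport().
 */
static void wait_for_release(void)
{
	while (!all_channels_released()) {
		fprintf(stderr, "still waiting for channel release ...\n");
		sleep(5);
	}
}

int main(void)
{
	wait_for_release();
	return 0;
}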
+ */ +static int srpt_release_sport(struct srpt_port *sport) +{ + struct srpt_nexus *nexus, *next_n; + struct srpt_rdma_ch *ch; + + WARN_ON_ONCE(irqs_disabled()); + + mutex_lock(&sport->mutex); + srpt_set_enabled(sport, false); + mutex_unlock(&sport->mutex); + + while (wait_event_timeout(sport->ch_releaseQ, + srpt_ch_list_empty(sport), 5 * HZ) <= 0) { + pr_info("%s_%d: waiting for session unregistration ...\n", + sport->sdev->device->name, sport->port); + rcu_read_lock(); + list_for_each_entry(nexus, &sport->nexus_list, entry) { + list_for_each_entry(ch, &nexus->ch_list, list) { + pr_info("%s-%d: state %s\n", + ch->sess_name, ch->qp->qp_num, + get_ch_state_name(ch->state)); + } + } + rcu_read_unlock(); + } + + mutex_lock(&sport->mutex); + list_for_each_entry_safe(nexus, next_n, &sport->nexus_list, entry) { + list_del(&nexus->entry); + kfree_rcu(nexus, rcu); + } + mutex_unlock(&sport->mutex); + + return 0; +} + +static struct se_wwn *__srpt_lookup_wwn(const char *name) +{ + struct ib_device *dev; + struct srpt_device *sdev; + struct srpt_port *sport; + int i; + + list_for_each_entry(sdev, &srpt_dev_list, list) { + dev = sdev->device; + if (!dev) + continue; + + for (i = 0; i < dev->phys_port_cnt; i++) { + sport = &sdev->port[i]; + + if (strcmp(sport->port_guid, name) == 0) + return &sport->port_guid_wwn; + if (strcmp(sport->port_gid, name) == 0) + return &sport->port_gid_wwn; + } + } + + return NULL; +} + +static struct se_wwn *srpt_lookup_wwn(const char *name) +{ + struct se_wwn *wwn; + + spin_lock(&srpt_dev_lock); + wwn = __srpt_lookup_wwn(name); + spin_unlock(&srpt_dev_lock); + + return wwn; +} + +static void srpt_free_srq(struct srpt_device *sdev) +{ + if (!sdev->srq) + return; + + ib_destroy_srq(sdev->srq); + srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev, + sdev->srq_size, srp_max_req_size, DMA_FROM_DEVICE); + sdev->srq = NULL; +} + +static int srpt_alloc_srq(struct srpt_device *sdev) +{ + struct ib_srq_init_attr srq_attr = { + .event_handler = srpt_srq_event, + .srq_context = (void *)sdev, + .attr.max_wr = sdev->srq_size, + .attr.max_sge = 1, + .srq_type = IB_SRQT_BASIC, + }; + struct ib_device *device = sdev->device; + struct ib_srq *srq; + int i; + + WARN_ON_ONCE(sdev->srq); + srq = ib_create_srq(sdev->pd, &srq_attr); + if (IS_ERR(srq)) { + pr_debug("ib_create_srq() failed: %ld\n", PTR_ERR(srq)); + return PTR_ERR(srq); + } + + pr_debug("create SRQ #wr= %d max_allow=%d dev= %s\n", sdev->srq_size, + sdev->device->attrs.max_srq_wr, device->name); + + sdev->ioctx_ring = (struct srpt_recv_ioctx **) + srpt_alloc_ioctx_ring(sdev, sdev->srq_size, + sizeof(*sdev->ioctx_ring[0]), + srp_max_req_size, DMA_FROM_DEVICE); + if (!sdev->ioctx_ring) { + ib_destroy_srq(srq); + return -ENOMEM; + } + + sdev->use_srq = true; + sdev->srq = srq; + + for (i = 0; i < sdev->srq_size; ++i) { + INIT_LIST_HEAD(&sdev->ioctx_ring[i]->wait_list); + srpt_post_recv(sdev, NULL, sdev->ioctx_ring[i]); + } + + return 0; +} + +static int srpt_use_srq(struct srpt_device *sdev, bool use_srq) +{ + struct ib_device *device = sdev->device; + int ret = 0; + + if (!use_srq) { + srpt_free_srq(sdev); + sdev->use_srq = false; + } else if (use_srq && !sdev->srq) { + ret = srpt_alloc_srq(sdev); + } + pr_debug("%s(%s): use_srq = %d; ret = %d\n", __func__, device->name, + sdev->use_srq, ret); + return ret; +} + +/** + * srpt_add_one - InfiniBand device addition callback function + * @device: Describes a HCA. 
+ */ +static void srpt_add_one(struct ib_device *device) +{ + struct srpt_device *sdev; + struct srpt_port *sport; + int i, ret; + + pr_debug("device = %p\n", device); + + sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!sdev) + goto err; + + sdev->device = device; + mutex_init(&sdev->sdev_mutex); + + sdev->pd = ib_alloc_pd(device, 0); + if (IS_ERR(sdev->pd)) + goto free_dev; + + sdev->lkey = sdev->pd->local_dma_lkey; + + sdev->srq_size = min(srpt_srq_size, sdev->device->attrs.max_srq_wr); + + srpt_use_srq(sdev, sdev->port[0].port_attrib.use_srq); + + if (!srpt_service_guid) + srpt_service_guid = be64_to_cpu(device->node_guid); + + if (rdma_port_get_link_layer(device, 1) == IB_LINK_LAYER_INFINIBAND) + sdev->cm_id = ib_create_cm_id(device, srpt_cm_handler, sdev); + if (IS_ERR(sdev->cm_id)) { + pr_info("ib_create_cm_id() failed: %ld\n", + PTR_ERR(sdev->cm_id)); + sdev->cm_id = NULL; + if (!rdma_cm_id) + goto err_ring; + } + + /* print out target login information */ + pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx," + "pkey=ffff,service_id=%016llx\n", srpt_service_guid, + srpt_service_guid, srpt_service_guid); + + /* + * We do not have a consistent service_id (ie. also id_ext of target_id) + * to identify this target. We currently use the guid of the first HCA + * in the system as service_id; therefore, the target_id will change + * if this HCA is gone bad and replaced by different HCA + */ + ret = sdev->cm_id ? + ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0) : + 0; + if (ret < 0) { + pr_err("ib_cm_listen() failed: %d (cm_id state = %d)\n", ret, + sdev->cm_id->state); + goto err_cm; + } + + INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device, + srpt_event_handler); + ib_register_event_handler(&sdev->event_handler); + + WARN_ON(sdev->device->phys_port_cnt > ARRAY_SIZE(sdev->port)); + + for (i = 1; i <= sdev->device->phys_port_cnt; i++) { + sport = &sdev->port[i - 1]; + INIT_LIST_HEAD(&sport->nexus_list); + init_waitqueue_head(&sport->ch_releaseQ); + mutex_init(&sport->mutex); + sport->sdev = sdev; + sport->port = i; + sport->port_attrib.srp_max_rdma_size = DEFAULT_MAX_RDMA_SIZE; + sport->port_attrib.srp_max_rsp_size = DEFAULT_MAX_RSP_SIZE; + sport->port_attrib.srp_sq_size = DEF_SRPT_SQ_SIZE; + sport->port_attrib.use_srq = false; + INIT_WORK(&sport->work, srpt_refresh_port_work); + + if (srpt_refresh_port(sport)) { + pr_err("MAD registration failed for %s-%d.\n", + sdev->device->name, i); + goto err_event; + } + } + + spin_lock(&srpt_dev_lock); + list_add_tail(&sdev->list, &srpt_dev_list); + spin_unlock(&srpt_dev_lock); + +out: + ib_set_client_data(device, &srpt_client, sdev); + pr_debug("added %s.\n", device->name); + return; + +err_event: + ib_unregister_event_handler(&sdev->event_handler); +err_cm: + if (sdev->cm_id) + ib_destroy_cm_id(sdev->cm_id); +err_ring: + srpt_free_srq(sdev); + ib_dealloc_pd(sdev->pd); +free_dev: + kfree(sdev); +err: + sdev = NULL; + pr_info("%s(%s) failed.\n", __func__, device->name); + goto out; +} + +/** + * srpt_remove_one - InfiniBand device removal callback function + * @device: Describes a HCA. + * @client_data: The value passed as the third argument to ib_set_client_data(). 
+ */ +static void srpt_remove_one(struct ib_device *device, void *client_data) +{ + struct srpt_device *sdev = client_data; + int i; + + if (!sdev) { + pr_info("%s(%s): nothing to do.\n", __func__, device->name); + return; + } + + srpt_unregister_mad_agent(sdev); + + ib_unregister_event_handler(&sdev->event_handler); + + /* Cancel any work queued by the just unregistered IB event handler. */ + for (i = 0; i < sdev->device->phys_port_cnt; i++) + cancel_work_sync(&sdev->port[i].work); + + if (sdev->cm_id) + ib_destroy_cm_id(sdev->cm_id); + + ib_set_client_data(device, &srpt_client, NULL); + + /* + * Unregistering a target must happen after destroying sdev->cm_id + * such that no new SRP_LOGIN_REQ information units can arrive while + * destroying the target. + */ + spin_lock(&srpt_dev_lock); + list_del(&sdev->list); + spin_unlock(&srpt_dev_lock); + + for (i = 0; i < sdev->device->phys_port_cnt; i++) + srpt_release_sport(&sdev->port[i]); + + srpt_free_srq(sdev); + + ib_dealloc_pd(sdev->pd); + + kfree(sdev); +} + +static struct ib_client srpt_client = { + .name = DRV_NAME, + .add = srpt_add_one, + .remove = srpt_remove_one +}; + +static int srpt_check_true(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int srpt_check_false(struct se_portal_group *se_tpg) +{ + return 0; +} + +static char *srpt_get_fabric_name(void) +{ + return "srpt"; +} + +static struct srpt_port *srpt_tpg_to_sport(struct se_portal_group *tpg) +{ + return tpg->se_tpg_wwn->priv; +} + +static char *srpt_get_fabric_wwn(struct se_portal_group *tpg) +{ + struct srpt_port *sport = srpt_tpg_to_sport(tpg); + + WARN_ON_ONCE(tpg != &sport->port_guid_tpg && + tpg != &sport->port_gid_tpg); + return tpg == &sport->port_guid_tpg ? sport->port_guid : + sport->port_gid; +} + +static u16 srpt_get_tag(struct se_portal_group *tpg) +{ + return 1; +} + +static u32 srpt_tpg_get_inst_index(struct se_portal_group *se_tpg) +{ + return 1; +} + +static void srpt_release_cmd(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx = container_of(se_cmd, + struct srpt_send_ioctx, cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + unsigned long flags; + + WARN_ON_ONCE(ioctx->state != SRPT_STATE_DONE && + !(ioctx->cmd.transport_state & CMD_T_ABORTED)); + + if (ioctx->n_rw_ctx) { + srpt_free_rw_ctxs(ch, ioctx); + ioctx->n_rw_ctx = 0; + } + + spin_lock_irqsave(&ch->spinlock, flags); + list_add(&ioctx->free_list, &ch->free_list); + spin_unlock_irqrestore(&ch->spinlock, flags); +} + +/** + * srpt_close_session - forcibly close a session + * @se_sess: SCSI target session. + * + * Callback function invoked by the TCM core to clean up sessions associated + * with a node ACL when the user invokes + * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + */ +static void srpt_close_session(struct se_session *se_sess) +{ + struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; + + srpt_disconnect_ch_sync(ch); +} + +/** + * srpt_sess_get_index - return the value of scsiAttIntrPortIndex (SCSI-MIB) + * @se_sess: SCSI target session. + * + * A quote from RFC 4455 (SCSI-MIB) about this MIB object: + * This object represents an arbitrary integer used to uniquely identify a + * particular attached remote initiator port to a particular SCSI target port + * within a particular SCSI target device within a particular SCSI instance. + */ +static u32 srpt_sess_get_index(struct se_session *se_sess) +{ + return 0; +} + +static void srpt_set_default_node_attrs(struct se_node_acl *nacl) +{ +} + +/* Note: only used from inside debug printk's by the TCM core. 
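The configfs ACL code further below accepts an initiator name as a GUID, as a 128-bit port ID with or without a leading "0x", or as an IP address; srpt_parse_i_port_id() left-pads short hexadecimal IDs with leading zero bytes. A stand-alone user-space rendition of that padding behaviour; parse_port_id() is illustrative, open-codes the byte conversion instead of calling the kernel's hex2bin(), and rejects over-long strings where the kernel helper truncates to 16 bytes.

#include <stdio.h>
#include <string.h>
#include <strings.h>

static int parse_port_id(unsigned char id[16], const char *name)
{
	size_t len, count, pad, i;

	if (strncasecmp(name, "0x", 2) == 0)
		name += 2;
	len = strlen(name);
	if (len % 2 || len > 32)
		return -1;
	count = len / 2;
	pad = 16 - count;
	memset(id, 0, pad);	/* short IDs get leading zero bytes */
	for (i = 0; i < count; i++) {
		unsigned int byte;

		if (sscanf(name + 2 * i, "%2x", &byte) != 1)
			return -1;
		id[pad + i] = byte;
	}
	return 0;
}

int main(void)
{
	unsigned char id[16];
	int i;

	if (parse_port_id(id, "0x505400fffe4a0b7b") == 0) {
		for (i = 0; i < 16; i++)
			printf("%02x", id[i]);
		printf("\n");	/* prints 0000000000000000505400fffe4a0b7b */
	}
	return 0;
}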
*/ +static int srpt_get_tcm_cmd_state(struct se_cmd *se_cmd) +{ + struct srpt_send_ioctx *ioctx; + + ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); + return ioctx->state; +} + +static int srpt_parse_guid(u64 *guid, const char *name) +{ + u16 w[4]; + int ret = -EINVAL; + + if (sscanf(name, "%hx:%hx:%hx:%hx", &w[0], &w[1], &w[2], &w[3]) != 4) + goto out; + *guid = get_unaligned_be64(w); + ret = 0; +out: + return ret; +} + +/** + * srpt_parse_i_port_id - parse an initiator port ID + * @name: ASCII representation of a 128-bit initiator port ID. + * @i_port_id: Binary 128-bit port ID. + */ +static int srpt_parse_i_port_id(u8 i_port_id[16], const char *name) +{ + const char *p; + unsigned len, count, leading_zero_bytes; + int ret; + + p = name; + if (strncasecmp(p, "0x", 2) == 0) + p += 2; + ret = -EINVAL; + len = strlen(p); + if (len % 2) + goto out; + count = min(len / 2, 16U); + leading_zero_bytes = 16 - count; + memset(i_port_id, 0, leading_zero_bytes); + ret = hex2bin(i_port_id + leading_zero_bytes, p, count); + +out: + return ret; +} + +/* + * configfs callback function invoked for mkdir + * /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id + * + * i_port_id must be an initiator port GUID, GID or IP address. See also the + * target_alloc_session() calls in this driver. Examples of valid initiator + * port IDs: + * 0x0000000000000000505400fffe4a0b7b + * 0000000000000000505400fffe4a0b7b + * 5054:00ff:fe4a:0b7b + * 192.168.122.76 + */ +static int srpt_init_nodeacl(struct se_node_acl *se_nacl, const char *name) +{ + struct sockaddr_storage sa; + u64 guid; + u8 i_port_id[16]; + int ret; + + ret = srpt_parse_guid(&guid, name); + if (ret < 0) + ret = srpt_parse_i_port_id(i_port_id, name); + if (ret < 0) + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, name, NULL, + &sa); + if (ret < 0) + pr_err("invalid initiator port ID %s\n", name); + return ret; +} + +static ssize_t srpt_tpg_attrib_srp_max_rdma_size_show(struct config_item *item, + char *page) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); + + return sprintf(page, "%u\n", sport->port_attrib.srp_max_rdma_size); +} + +static ssize_t srpt_tpg_attrib_srp_max_rdma_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); + unsigned long val; + int ret; + + ret = kstrtoul(page, 0, &val); + if (ret < 0) { + pr_err("kstrtoul() failed with ret: %d\n", ret); + return -EINVAL; + } + if (val > MAX_SRPT_RDMA_SIZE) { + pr_err("val: %lu exceeds MAX_SRPT_RDMA_SIZE: %d\n", val, + MAX_SRPT_RDMA_SIZE); + return -EINVAL; + } + if (val < DEFAULT_MAX_RDMA_SIZE) { + pr_err("val: %lu smaller than DEFAULT_MAX_RDMA_SIZE: %d\n", + val, DEFAULT_MAX_RDMA_SIZE); + return -EINVAL; + } + sport->port_attrib.srp_max_rdma_size = val; + + return count; +} + +static ssize_t srpt_tpg_attrib_srp_max_rsp_size_show(struct config_item *item, + char *page) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); + + return sprintf(page, "%u\n", sport->port_attrib.srp_max_rsp_size); +} + +static ssize_t srpt_tpg_attrib_srp_max_rsp_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); + unsigned long val; + int ret; + + ret = kstrtoul(page, 0, &val); + if (ret < 0) { + pr_err("kstrtoul() 
failed with ret: %d\n", ret); + return -EINVAL; + } + if (val > MAX_SRPT_RSP_SIZE) { + pr_err("val: %lu exceeds MAX_SRPT_RSP_SIZE: %d\n", val, + MAX_SRPT_RSP_SIZE); + return -EINVAL; + } + if (val < MIN_MAX_RSP_SIZE) { + pr_err("val: %lu smaller than MIN_MAX_RSP_SIZE: %d\n", val, + MIN_MAX_RSP_SIZE); + return -EINVAL; + } + sport->port_attrib.srp_max_rsp_size = val; + + return count; +} + +static ssize_t srpt_tpg_attrib_srp_sq_size_show(struct config_item *item, + char *page) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); + + return sprintf(page, "%u\n", sport->port_attrib.srp_sq_size); +} + +static ssize_t srpt_tpg_attrib_srp_sq_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); + unsigned long val; + int ret; + + ret = kstrtoul(page, 0, &val); + if (ret < 0) { + pr_err("kstrtoul() failed with ret: %d\n", ret); + return -EINVAL; + } + if (val > MAX_SRPT_SRQ_SIZE) { + pr_err("val: %lu exceeds MAX_SRPT_SRQ_SIZE: %d\n", val, + MAX_SRPT_SRQ_SIZE); + return -EINVAL; + } + if (val < MIN_SRPT_SRQ_SIZE) { + pr_err("val: %lu smaller than MIN_SRPT_SRQ_SIZE: %d\n", val, + MIN_SRPT_SRQ_SIZE); + return -EINVAL; + } + sport->port_attrib.srp_sq_size = val; + + return count; +} + +static ssize_t srpt_tpg_attrib_use_srq_show(struct config_item *item, + char *page) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); + + return sprintf(page, "%d\n", sport->port_attrib.use_srq); +} + +static ssize_t srpt_tpg_attrib_use_srq_store(struct config_item *item, + const char *page, size_t count) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); + struct srpt_device *sdev = sport->sdev; + unsigned long val; + bool enabled; + int ret; + + ret = kstrtoul(page, 0, &val); + if (ret < 0) + return ret; + if (val != !!val) + return -EINVAL; + + ret = mutex_lock_interruptible(&sdev->sdev_mutex); + if (ret < 0) + return ret; + ret = mutex_lock_interruptible(&sport->mutex); + if (ret < 0) + goto unlock_sdev; + enabled = sport->enabled; + /* Log out all initiator systems before changing 'use_srq'. 
*/ + srpt_set_enabled(sport, false); + sport->port_attrib.use_srq = val; + srpt_use_srq(sdev, sport->port_attrib.use_srq); + srpt_set_enabled(sport, enabled); + ret = count; + mutex_unlock(&sport->mutex); +unlock_sdev: + mutex_unlock(&sdev->sdev_mutex); + + return ret; +} + +CONFIGFS_ATTR(srpt_tpg_attrib_, srp_max_rdma_size); +CONFIGFS_ATTR(srpt_tpg_attrib_, srp_max_rsp_size); +CONFIGFS_ATTR(srpt_tpg_attrib_, srp_sq_size); +CONFIGFS_ATTR(srpt_tpg_attrib_, use_srq); + +static struct configfs_attribute *srpt_tpg_attrib_attrs[] = { + &srpt_tpg_attrib_attr_srp_max_rdma_size, + &srpt_tpg_attrib_attr_srp_max_rsp_size, + &srpt_tpg_attrib_attr_srp_sq_size, + &srpt_tpg_attrib_attr_use_srq, + NULL, +}; + +static struct rdma_cm_id *srpt_create_rdma_id(struct sockaddr *listen_addr) +{ + struct rdma_cm_id *rdma_cm_id; + int ret; + + rdma_cm_id = rdma_create_id(&init_net, srpt_rdma_cm_handler, + NULL, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(rdma_cm_id)) { + pr_err("RDMA/CM ID creation failed: %ld\n", + PTR_ERR(rdma_cm_id)); + goto out; + } + + ret = rdma_bind_addr(rdma_cm_id, listen_addr); + if (ret) { + char addr_str[64]; + + snprintf(addr_str, sizeof(addr_str), "%pISp", listen_addr); + pr_err("Binding RDMA/CM ID to address %s failed: %d\n", + addr_str, ret); + rdma_destroy_id(rdma_cm_id); + rdma_cm_id = ERR_PTR(ret); + goto out; + } + + ret = rdma_listen(rdma_cm_id, 128); + if (ret) { + pr_err("rdma_listen() failed: %d\n", ret); + rdma_destroy_id(rdma_cm_id); + rdma_cm_id = ERR_PTR(ret); + } + +out: + return rdma_cm_id; +} + +static ssize_t srpt_rdma_cm_port_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", rdma_cm_port); +} + +static ssize_t srpt_rdma_cm_port_store(struct config_item *item, + const char *page, size_t count) +{ + struct sockaddr_in addr4 = { .sin_family = AF_INET }; + struct sockaddr_in6 addr6 = { .sin6_family = AF_INET6 }; + struct rdma_cm_id *new_id = NULL; + u16 val; + int ret; + + ret = kstrtou16(page, 0, &val); + if (ret < 0) + return ret; + ret = count; + if (rdma_cm_port == val) + goto out; + + if (val) { + addr6.sin6_port = cpu_to_be16(val); + new_id = srpt_create_rdma_id((struct sockaddr *)&addr6); + if (IS_ERR(new_id)) { + addr4.sin_port = cpu_to_be16(val); + new_id = srpt_create_rdma_id((struct sockaddr *)&addr4); + if (IS_ERR(new_id)) { + ret = PTR_ERR(new_id); + goto out; + } + } + } + + mutex_lock(&rdma_cm_mutex); + rdma_cm_port = val; + swap(rdma_cm_id, new_id); + mutex_unlock(&rdma_cm_mutex); + + if (new_id) + rdma_destroy_id(new_id); + ret = count; +out: + return ret; +} + +CONFIGFS_ATTR(srpt_, rdma_cm_port); + +static struct configfs_attribute *srpt_da_attrs[] = { + &srpt_attr_rdma_cm_port, + NULL, +}; + +static ssize_t srpt_tpg_enable_show(struct config_item *item, char *page) +{ + struct se_portal_group *se_tpg = to_tpg(item); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); + + return snprintf(page, PAGE_SIZE, "%d\n", (sport->enabled) ? 
1: 0); +} + +static ssize_t srpt_tpg_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct se_portal_group *se_tpg = to_tpg(item); + struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); + unsigned long tmp; + int ret; + + ret = kstrtoul(page, 0, &tmp); + if (ret < 0) { + pr_err("Unable to extract srpt_tpg_store_enable\n"); + return -EINVAL; + } + + if ((tmp != 0) && (tmp != 1)) { + pr_err("Illegal value for srpt_tpg_store_enable: %lu\n", tmp); + return -EINVAL; + } + + mutex_lock(&sport->mutex); + srpt_set_enabled(sport, tmp); + mutex_unlock(&sport->mutex); + + return count; +} + +CONFIGFS_ATTR(srpt_tpg_, enable); + +static struct configfs_attribute *srpt_tpg_attrs[] = { + &srpt_tpg_attr_enable, + NULL, +}; + +/** + * srpt_make_tpg - configfs callback invoked for mkdir /sys/kernel/config/target/$driver/$port/$tpg + * @wwn: Corresponds to $driver/$port. + * @group: Not used. + * @name: $tpg. + */ +static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, + struct config_group *group, + const char *name) +{ + struct srpt_port *sport = wwn->priv; + static struct se_portal_group *tpg; + int res; + + WARN_ON_ONCE(wwn != &sport->port_guid_wwn && + wwn != &sport->port_gid_wwn); + tpg = wwn == &sport->port_guid_wwn ? &sport->port_guid_tpg : + &sport->port_gid_tpg; + res = core_tpg_register(wwn, tpg, SCSI_PROTOCOL_SRP); + if (res) + return ERR_PTR(res); + + return tpg; +} + +/** + * srpt_drop_tpg - configfs callback invoked for rmdir /sys/kernel/config/target/$driver/$port/$tpg + * @tpg: Target portal group to deregister. + */ +static void srpt_drop_tpg(struct se_portal_group *tpg) +{ + struct srpt_port *sport = srpt_tpg_to_sport(tpg); + + sport->enabled = false; + core_tpg_deregister(tpg); +} + +/** + * srpt_make_tport - configfs callback invoked for mkdir /sys/kernel/config/target/$driver/$port + * @tf: Not used. + * @group: Not used. + * @name: $port. + */ +static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf, + struct config_group *group, + const char *name) +{ + return srpt_lookup_wwn(name) ? : ERR_PTR(-EINVAL); +} + +/** + * srpt_drop_tport - configfs callback invoked for rmdir /sys/kernel/config/target/$driver/$port + * @wwn: $port. 
+ */ +static void srpt_drop_tport(struct se_wwn *wwn) +{ +} + +static ssize_t srpt_wwn_version_show(struct config_item *item, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%s\n", DRV_VERSION); +} + +CONFIGFS_ATTR_RO(srpt_wwn_, version); + +static struct configfs_attribute *srpt_wwn_attrs[] = { + &srpt_wwn_attr_version, + NULL, +}; + +static const struct target_core_fabric_ops srpt_template = { + .module = THIS_MODULE, + .name = "srpt", + .get_fabric_name = srpt_get_fabric_name, + .tpg_get_wwn = srpt_get_fabric_wwn, + .tpg_get_tag = srpt_get_tag, + .tpg_check_demo_mode = srpt_check_false, + .tpg_check_demo_mode_cache = srpt_check_true, + .tpg_check_demo_mode_write_protect = srpt_check_true, + .tpg_check_prod_mode_write_protect = srpt_check_false, + .tpg_get_inst_index = srpt_tpg_get_inst_index, + .release_cmd = srpt_release_cmd, + .check_stop_free = srpt_check_stop_free, + .close_session = srpt_close_session, + .sess_get_index = srpt_sess_get_index, + .sess_get_initiator_sid = NULL, + .write_pending = srpt_write_pending, + .write_pending_status = srpt_write_pending_status, + .set_default_node_attributes = srpt_set_default_node_attrs, + .get_cmd_state = srpt_get_tcm_cmd_state, + .queue_data_in = srpt_queue_data_in, + .queue_status = srpt_queue_status, + .queue_tm_rsp = srpt_queue_tm_rsp, + .aborted_task = srpt_aborted_task, + /* + * Setup function pointers for generic logic in + * target_core_fabric_configfs.c + */ + .fabric_make_wwn = srpt_make_tport, + .fabric_drop_wwn = srpt_drop_tport, + .fabric_make_tpg = srpt_make_tpg, + .fabric_drop_tpg = srpt_drop_tpg, + .fabric_init_nodeacl = srpt_init_nodeacl, + + .tfc_discovery_attrs = srpt_da_attrs, + .tfc_wwn_attrs = srpt_wwn_attrs, + .tfc_tpg_base_attrs = srpt_tpg_attrs, + .tfc_tpg_attrib_attrs = srpt_tpg_attrib_attrs, +}; + +/** + * srpt_init_module - kernel module initialization + * + * Note: Since ib_register_client() registers callback functions, and since at + * least one of these callback functions (srpt_add_one()) calls target core + * functions, this driver must be registered with the target core before + * ib_register_client() is called. + */ +static int __init srpt_init_module(void) +{ + int ret; + + ret = -EINVAL; + if (srp_max_req_size < MIN_MAX_REQ_SIZE) { + pr_err("invalid value %d for kernel module parameter" + " srp_max_req_size -- must be at least %d.\n", + srp_max_req_size, MIN_MAX_REQ_SIZE); + goto out; + } + + if (srpt_srq_size < MIN_SRPT_SRQ_SIZE + || srpt_srq_size > MAX_SRPT_SRQ_SIZE) { + pr_err("invalid value %d for kernel module parameter" + " srpt_srq_size -- must be in the range [%d..%d].\n", + srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE); + goto out; + } + + ret = target_register_template(&srpt_template); + if (ret) + goto out; + + ret = ib_register_client(&srpt_client); + if (ret) { + pr_err("couldn't register IB client\n"); + goto out_unregister_target; + } + + return 0; + +out_unregister_target: + target_unregister_template(&srpt_template); +out: + return ret; +} + +static void __exit srpt_cleanup_module(void) +{ + if (rdma_cm_id) + rdma_destroy_id(rdma_cm_id); + ib_unregister_client(&srpt_client); + target_unregister_template(&srpt_template); +} + +module_init(srpt_init_module); +module_exit(srpt_cleanup_module); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h new file mode 100644 index 0000000..2361483 --- /dev/null +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved. 
+ * Copyright (C) 2009 - 2010 Bart Van Assche . + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef IB_SRPT_H +#define IB_SRPT_H + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "ib_dm_mad.h" + +/* + * The prefix the ServiceName field must start with in the device management + * ServiceEntries attribute pair. See also the SRP specification. + */ +#define SRP_SERVICE_NAME_PREFIX "SRP.T10:" + +struct srpt_nexus; + +enum { + /* + * SRP IOControllerProfile attributes for SRP target ports that have + * not been defined in . Source: section B.7, table B.7 + * in the SRP specification. + */ + SRP_PROTOCOL = 0x0108, + SRP_PROTOCOL_VERSION = 0x0001, + SRP_IO_SUBCLASS = 0x609e, + SRP_SEND_TO_IOC = 0x01, + SRP_SEND_FROM_IOC = 0x02, + SRP_RDMA_READ_FROM_IOC = 0x08, + SRP_RDMA_WRITE_FROM_IOC = 0x20, + + /* + * srp_login_cmd.req_flags bitmasks. See also table 9 in the SRP + * specification. + */ + SRP_MTCH_ACTION = 0x03, /* MULTI-CHANNEL ACTION */ + SRP_LOSOLNT = 0x10, /* logout solicited notification */ + SRP_CRSOLNT = 0x20, /* credit request solicited notification */ + SRP_AESOLNT = 0x40, /* asynchronous event solicited notification */ + + /* + * srp_cmd.sol_nt / srp_tsk_mgmt.sol_not bitmasks. See also tables + * 18 and 20 in the SRP specification. + */ + SRP_SCSOLNT = 0x02, /* SCSOLNT = successful solicited notification */ + SRP_UCSOLNT = 0x04, /* UCSOLNT = unsuccessful solicited notification */ + + /* + * srp_rsp.sol_not / srp_t_logout.sol_not bitmasks. See also tables + * 16 and 22 in the SRP specification. + */ + SRP_SOLNT = 0x01, /* SOLNT = solicited notification */ + + /* See also table 24 in the SRP specification. */ + SRP_TSK_MGMT_SUCCESS = 0x00, + SRP_TSK_MGMT_FUNC_NOT_SUPP = 0x04, + SRP_TSK_MGMT_FAILED = 0x05, + + /* See also table 21 in the SRP specification. 
*/ + SRP_CMD_SIMPLE_Q = 0x0, + SRP_CMD_HEAD_OF_Q = 0x1, + SRP_CMD_ORDERED_Q = 0x2, + SRP_CMD_ACA = 0x4, + + SRP_LOGIN_RSP_MULTICHAN_NO_CHAN = 0x0, + SRP_LOGIN_RSP_MULTICHAN_TERMINATED = 0x1, + SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2, + + SRPT_DEF_SG_TABLESIZE = 128, + /* + * An experimentally determined value that avoids that QP creation + * fails due to "swiotlb buffer is full" on systems using the swiotlb. + */ + SRPT_MAX_SG_PER_WQE = 16, + + MIN_SRPT_SQ_SIZE = 16, + DEF_SRPT_SQ_SIZE = 4096, + MAX_SRPT_RQ_SIZE = 128, + MIN_SRPT_SRQ_SIZE = 4, + DEFAULT_SRPT_SRQ_SIZE = 4095, + MAX_SRPT_SRQ_SIZE = 65535, + MAX_SRPT_RDMA_SIZE = 1U << 24, + MAX_SRPT_RSP_SIZE = 1024, + + MIN_MAX_REQ_SIZE = 996, + DEFAULT_MAX_REQ_SIZE + = sizeof(struct srp_cmd)/*48*/ + + sizeof(struct srp_indirect_buf)/*20*/ + + 128 * sizeof(struct srp_direct_buf)/*16*/, + + MIN_MAX_RSP_SIZE = sizeof(struct srp_rsp)/*36*/ + 4, + DEFAULT_MAX_RSP_SIZE = 256, /* leaves 220 bytes for sense data */ + + DEFAULT_MAX_RDMA_SIZE = 65536, +}; + +/** + * enum srpt_command_state - SCSI command state managed by SRPT + * @SRPT_STATE_NEW: New command arrived and is being processed. + * @SRPT_STATE_NEED_DATA: Processing a write or bidir command and waiting + * for data arrival. + * @SRPT_STATE_DATA_IN: Data for the write or bidir command arrived and is + * being processed. + * @SRPT_STATE_CMD_RSP_SENT: SRP_RSP for SRP_CMD has been sent. + * @SRPT_STATE_MGMT: Processing a SCSI task management command. + * @SRPT_STATE_MGMT_RSP_SENT: SRP_RSP for SRP_TSK_MGMT has been sent. + * @SRPT_STATE_DONE: Command processing finished successfully, command + * processing has been aborted or command processing + * failed. + */ +enum srpt_command_state { + SRPT_STATE_NEW = 0, + SRPT_STATE_NEED_DATA = 1, + SRPT_STATE_DATA_IN = 2, + SRPT_STATE_CMD_RSP_SENT = 3, + SRPT_STATE_MGMT = 4, + SRPT_STATE_MGMT_RSP_SENT = 5, + SRPT_STATE_DONE = 6, +}; + +/** + * struct srpt_ioctx - shared SRPT I/O context information + * @cqe: Completion queue element. + * @buf: Pointer to the buffer. + * @dma: DMA address of the buffer. + * @index: Index of the I/O context in its ioctx_ring array. + */ +struct srpt_ioctx { + struct ib_cqe cqe; + void *buf; + dma_addr_t dma; + uint32_t index; +}; + +/** + * struct srpt_recv_ioctx - SRPT receive I/O context + * @ioctx: See above. + * @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list. + */ +struct srpt_recv_ioctx { + struct srpt_ioctx ioctx; + struct list_head wait_list; +}; + +struct srpt_rw_ctx { + struct rdma_rw_ctx rw; + struct scatterlist *sg; + unsigned int nents; +}; + +/** + * struct srpt_send_ioctx - SRPT send I/O context + * @ioctx: See above. + * @ch: Channel pointer. + * @s_rw_ctx: @rw_ctxs points here if only a single rw_ctx is needed. + * @rw_ctxs: RDMA read/write contexts. + * @rdma_cqe: RDMA completion queue element. + * @free_list: Node in srpt_rdma_ch.free_list. + * @state: I/O context state. + * @cmd: Target core command data structure. + * @sense_data: SCSI sense data. + * @n_rdma: Number of work requests needed to transfer this ioctx. + * @n_rw_ctx: Size of rw_ctxs array. + * @queue_status_only: Send a SCSI status back to the initiator but no data. + * @sense_data: Sense data to be sent to the initiator. 
+ */ +struct srpt_send_ioctx { + struct srpt_ioctx ioctx; + struct srpt_rdma_ch *ch; + + struct srpt_rw_ctx s_rw_ctx; + struct srpt_rw_ctx *rw_ctxs; + + struct ib_cqe rdma_cqe; + struct list_head free_list; + enum srpt_command_state state; + struct se_cmd cmd; + u8 n_rdma; + u8 n_rw_ctx; + bool queue_status_only; + u8 sense_data[TRANSPORT_SENSE_BUFFER]; +}; + +/** + * enum rdma_ch_state - SRP channel state + * @CH_CONNECTING: QP is in RTR state; waiting for RTU. + * @CH_LIVE: QP is in RTS state. + * @CH_DISCONNECTING: DREQ has been sent and waiting for DREP or DREQ has + * been received. + * @CH_DRAINING: DREP has been received or waiting for DREP timed out + * and last work request has been queued. + * @CH_DISCONNECTED: Last completion has been received. + */ +enum rdma_ch_state { + CH_CONNECTING, + CH_LIVE, + CH_DISCONNECTING, + CH_DRAINING, + CH_DISCONNECTED, +}; + +/** + * struct srpt_rdma_ch - RDMA channel + * @nexus: I_T nexus this channel is associated with. + * @qp: IB queue pair used for communicating over this channel. + * @cm_id: IB CM ID associated with the channel. + * @cq: IB completion queue for this channel. + * @zw_cqe: Zero-length write CQE. + * @rcu: RCU head. + * @kref: kref for this channel. + * @rq_size: IB receive queue size. + * @max_rsp_size: Maximum size of an RSP response message in bytes. + * @sq_wr_avail: number of work requests available in the send queue. + * @sport: pointer to the information of the HCA port used by this + * channel. + * @max_ti_iu_len: maximum target-to-initiator information unit length. + * @req_lim: request limit: maximum number of requests that may be sent + * by the initiator without having received a response. + * @req_lim_delta: Number of credits not yet sent back to the initiator. + * @spinlock: Protects free_list and state. + * @free_list: Head of list with free send I/O contexts. + * @state: channel state. See also enum rdma_ch_state. + * @using_rdma_cm: Whether the RDMA/CM or IB/CM is used for this channel. + * @processing_wait_list: Whether or not cmd_wait_list is being processed. + * @ioctx_ring: Send ring. + * @ioctx_recv_ring: Receive I/O context ring. + * @list: Node in srpt_nexus.ch_list. + * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This + * list contains struct srpt_ioctx elements and is protected + * against concurrent modification by the cm_id spinlock. + * @pkey: P_Key of the IB partition for this SRP channel. + * @sess: Session information associated with this SRP channel. + * @sess_name: Session name. + * @release_work: Allows scheduling of srpt_release_channel(). + */ +struct srpt_rdma_ch { + struct srpt_nexus *nexus; + struct ib_qp *qp; + union { + struct { + struct ib_cm_id *cm_id; + } ib_cm; + struct { + struct rdma_cm_id *cm_id; + } rdma_cm; + }; + struct ib_cq *cq; + struct ib_cqe zw_cqe; + struct rcu_head rcu; + struct kref kref; + int rq_size; + u32 max_rsp_size; + atomic_t sq_wr_avail; + struct srpt_port *sport; + int max_ti_iu_len; + atomic_t req_lim; + atomic_t req_lim_delta; + spinlock_t spinlock; + struct list_head free_list; + enum rdma_ch_state state; + struct srpt_send_ioctx **ioctx_ring; + struct srpt_recv_ioctx **ioctx_recv_ring; + struct list_head list; + struct list_head cmd_wait_list; + uint16_t pkey; + bool using_rdma_cm; + bool processing_wait_list; + struct se_session *sess; + u8 sess_name[40]; + struct work_struct release_work; +}; + +/** + * struct srpt_nexus - I_T nexus + * @rcu: RCU head for this data structure. + * @entry: srpt_port.nexus_list list node. 
+ * @ch_list: struct srpt_rdma_ch list. Protected by srpt_port.mutex. + * @i_port_id: 128-bit initiator port identifier copied from SRP_LOGIN_REQ. + * @t_port_id: 128-bit target port identifier copied from SRP_LOGIN_REQ. + */ +struct srpt_nexus { + struct rcu_head rcu; + struct list_head entry; + struct list_head ch_list; + u8 i_port_id[16]; + u8 t_port_id[16]; +}; + +/** + * struct srpt_port_attib - attributes for SRPT port + * @srp_max_rdma_size: Maximum size of SRP RDMA transfers for new connections. + * @srp_max_rsp_size: Maximum size of SRP response messages in bytes. + * @srp_sq_size: Shared receive queue (SRQ) size. + * @use_srq: Whether or not to use SRQ. + */ +struct srpt_port_attrib { + u32 srp_max_rdma_size; + u32 srp_max_rsp_size; + u32 srp_sq_size; + bool use_srq; +}; + +/** + * struct srpt_port - information associated by SRPT with a single IB port + * @sdev: backpointer to the HCA information. + * @mad_agent: per-port management datagram processing information. + * @enabled: Whether or not this target port is enabled. + * @port_guid: ASCII representation of Port GUID + * @port_gid: ASCII representation of Port GID + * @port: one-based port number. + * @sm_lid: cached value of the port's sm_lid. + * @lid: cached value of the port's lid. + * @gid: cached value of the port's gid. + * @port_acl_lock spinlock for port_acl_list: + * @work: work structure for refreshing the aforementioned cached values. + * @port_guid_tpg: TPG associated with target port GUID. + * @port_guid_wwn: WWN associated with target port GUID. + * @port_gid_tpg: TPG associated with target port GID. + * @port_gid_wwn: WWN associated with target port GID. + * @port_attrib: Port attributes that can be accessed through configfs. + * @ch_releaseQ: Enables waiting for removal from nexus_list. + * @mutex: Protects nexus_list. + * @nexus_list: Nexus list. See also srpt_nexus.entry. + */ +struct srpt_port { + struct srpt_device *sdev; + struct ib_mad_agent *mad_agent; + bool enabled; + u8 port_guid[24]; + u8 port_gid[64]; + u8 port; + u32 sm_lid; + u32 lid; + union ib_gid gid; + struct work_struct work; + struct se_portal_group port_guid_tpg; + struct se_wwn port_guid_wwn; + struct se_portal_group port_gid_tpg; + struct se_wwn port_gid_wwn; + struct srpt_port_attrib port_attrib; + wait_queue_head_t ch_releaseQ; + struct mutex mutex; + struct list_head nexus_list; +}; + +/** + * struct srpt_device - information associated by SRPT with a single HCA + * @device: Backpointer to the struct ib_device managed by the IB core. + * @pd: IB protection domain. + * @lkey: L_Key (local key) with write access to all local memory. + * @srq: Per-HCA SRQ (shared receive queue). + * @cm_id: Connection identifier. + * @srq_size: SRQ size. + * @sdev_mutex: Serializes use_srq changes. + * @use_srq: Whether or not to use SRQ. + * @ioctx_ring: Per-HCA SRQ. + * @port: Information about the ports owned by this HCA. + * @event_handler: Per-HCA asynchronous IB event handler. + * @list: Node in srpt_dev_list. 
+ */ +struct srpt_device { + struct ib_device *device; + struct ib_pd *pd; + u32 lkey; + struct ib_srq *srq; + struct ib_cm_id *cm_id; + int srq_size; + struct mutex sdev_mutex; + bool use_srq; + struct srpt_recv_ioctx **ioctx_ring; + struct srpt_port port[2]; + struct ib_event_handler event_handler; + struct list_head list; +}; + +#endif /* IB_SRPT_H */ diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig new file mode 100644 index 0000000..af75156 --- /dev/null +++ b/drivers/net/ethernet/broadcom/Kconfig @@ -0,0 +1,235 @@ +# +# Broadcom device configuration +# + +config NET_VENDOR_BROADCOM + bool "Broadcom devices" + default y + depends on (SSB_POSSIBLE && HAS_DMA) || PCI || BCM63XX || \ + SIBYTE_SB1xxx_SOC + ---help--- + If you have a network (Ethernet) chipset belonging to this class, + say Y. + + Note that the answer to this question does not directly affect + the kernel: saying N will just case the configurator to skip all + the questions regarding AMD chipsets. If you say Y, you will be asked + for your specific chipset/driver in the following questions. + +if NET_VENDOR_BROADCOM + +config B44 + tristate "Broadcom 440x/47xx ethernet support" + depends on SSB_POSSIBLE && HAS_DMA + select SSB + select MII + select PHYLIB + ---help--- + If you have a network (Ethernet) controller of this type, say Y + or M here. + + To compile this driver as a module, choose M here. The module + will be called b44. + +# Auto-select SSB PCI-HOST support, if possible +config B44_PCI_AUTOSELECT + bool + depends on B44 && SSB_PCIHOST_POSSIBLE + select SSB_PCIHOST + default y + +# Auto-select SSB PCICORE driver, if possible +config B44_PCICORE_AUTOSELECT + bool + depends on B44 && SSB_DRIVER_PCICORE_POSSIBLE + select SSB_DRIVER_PCICORE + default y + +config B44_PCI + bool + depends on B44_PCI_AUTOSELECT && B44_PCICORE_AUTOSELECT + default y + +config BCM63XX_ENET + tristate "Broadcom 63xx internal mac support" + depends on BCM63XX + select MII + select PHYLIB + help + This driver supports the ethernet MACs in the Broadcom 63xx + MIPS chipset family (BCM63XX). + +config BCMGENET + tristate "Broadcom GENET internal MAC support" + depends on OF && HAS_IOMEM + select MII + select PHYLIB + select FIXED_PHY + select BCM7XXX_PHY + select MDIO_BCM_UNIMAC + help + This driver supports the built-in Ethernet MACs found in the + Broadcom BCM7xxx Set Top Box family chipset. + +config BNX2 + tristate "QLogic bnx2 support" + depends on PCI + select CRC32 + select FW_LOADER + ---help--- + This driver supports QLogic bnx2 gigabit Ethernet cards. + + To compile this driver as a module, choose M here: the module + will be called bnx2. This is recommended. + +config CNIC + tristate "QLogic CNIC support" + depends on PCI && (IPV6 || IPV6=n) + select BNX2 + select UIO + ---help--- + This driver supports offload features of QLogic bnx2 gigabit + Ethernet cards. + + To compile this driver as a module, choose M here: the module + will be called cnic. This is recommended. + +config SB1250_MAC + tristate "SB1250 Gigabit Ethernet support" + depends on SIBYTE_SB1xxx_SOC + select PHYLIB + ---help--- + This driver supports Gigabit Ethernet interfaces based on the + Broadcom SiByte family of System-On-a-Chip parts. They include + the BCM1120, BCM1125, BCM1125H, BCM1250, BCM1255, BCM1280, BCM1455 + and BCM1480 chips. + + To compile this driver as a module, choose M here: the module + will be called sb1250-mac. 
+ +config TIGON3 + tristate "Broadcom Tigon3 support" + depends on PCI + select PHYLIB + imply PTP_1588_CLOCK + ---help--- + This driver supports Broadcom Tigon3 based gigabit Ethernet cards. + + To compile this driver as a module, choose M here: the module + will be called tg3. This is recommended. + +config TIGON3_HWMON + bool "Broadcom Tigon3 HWMON support" + default y + depends on TIGON3 && HWMON && !(TIGON3=y && HWMON=m) + ---help--- + Say Y if you want to expose the thermal sensor on Tigon3 devices. + +config BNX2X + tristate "Broadcom NetXtremeII 10Gb support" + depends on PCI + imply PTP_1588_CLOCK + select FW_LOADER + select ZLIB_INFLATE + select LIBCRC32C + select MDIO + ---help--- + This driver supports Broadcom NetXtremeII 10 gigabit Ethernet cards. + To compile this driver as a module, choose M here: the module + will be called bnx2x. This is recommended. + +config BNX2X_SRIOV + bool "Broadcom 578xx and 57712 SR-IOV support" + depends on BNX2X && PCI_IOV + default y + ---help--- + This configuration parameter enables Single Root Input Output + Virtualization support in the 578xx and 57712 products. This + allows for virtual function acceleration in virtual environments. + +config BGMAC + tristate + help + This enables the integrated ethernet controller support for many + Broadcom (mostly iProc) SoCs. An appropriate bus interface driver + needs to be enabled to select this. + +config BGMAC_BCMA + tristate "Broadcom iProc GBit BCMA support" + depends on BCMA && BCMA_HOST_SOC + depends on HAS_DMA + depends on BCM47XX || ARCH_BCM_5301X || COMPILE_TEST + select BGMAC + select PHYLIB + select FIXED_PHY + ---help--- + This driver supports GBit MAC and BCM4706 GBit MAC cores on BCMA bus. + They can be found on BCM47xx SoCs and provide gigabit ethernet. + In case of using this driver on BCM4706 it's also requires to enable + BCMA_DRIVER_GMAC_CMN to make it work. + +config BGMAC_PLATFORM + tristate "Broadcom iProc GBit platform support" + depends on HAS_DMA + depends on ARCH_BCM_IPROC || COMPILE_TEST + depends on OF + select BGMAC + select PHYLIB + select FIXED_PHY + default ARCH_BCM_IPROC + ---help--- + Say Y here if you want to use the Broadcom iProc Gigabit Ethernet + controller through the generic platform interface + +config SYSTEMPORT + tristate "Broadcom SYSTEMPORT internal MAC support" + depends on OF + depends on NET_DSA || !NET_DSA + select MII + select PHYLIB + select FIXED_PHY + help + This driver supports the built-in Ethernet MACs found in the + Broadcom BCM7xxx Set Top Box family chipset using an internal + Ethernet switch. + +config BNXT + tristate "Broadcom NetXtreme-C/E support" + depends on PCI + depends on MAY_USE_DEVLINK + select FW_LOADER + select LIBCRC32C + ---help--- + This driver supports Broadcom NetXtreme-C/E 10/25/40/50 gigabit + Ethernet cards. To compile this driver as a module, choose M here: + the module will be called bnxt_en. This is recommended. + +config BNXT_SRIOV + bool "Broadcom NetXtreme-C/E SR-IOV support" + depends on BNXT && PCI_IOV + default y + ---help--- + This configuration parameter enables Single Root Input Output + Virtualization support in the NetXtreme-C/E products. This + allows for virtual function acceleration in virtual environments. + +config BNXT_FLOWER_OFFLOAD + bool "TC Flower offload support for NetXtreme-C/E" + depends on BNXT + default y + ---help--- + This configuration parameter enables TC Flower packet classifier + offload for eswitch. This option enables SR-IOV switchdev eswitch + offload. 
+ +config BNXT_DCB + bool "Data Center Bridging (DCB) Support" + default n + depends on BNXT && DCB + ---help--- + Say Y here if you want to use Data Center Bridging (DCB) in the + driver. + + If unsure, say N. + +endif # NET_VENDOR_BROADCOM diff --git a/drivers/net/ethernet/broadcom/Makefile b/drivers/net/ethernet/broadcom/Makefile new file mode 100644 index 0000000..7046ad6 --- /dev/null +++ b/drivers/net/ethernet/broadcom/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Broadcom network device drivers. +# + +obj-$(CONFIG_B44) += b44.o +obj-$(CONFIG_BCM63XX_ENET) += bcm63xx_enet.o +obj-$(CONFIG_BCMGENET) += genet/ +obj-$(CONFIG_BNX2) += bnx2.o +obj-$(CONFIG_CNIC) += cnic.o +obj-$(CONFIG_BNX2X) += bnx2x/ +obj-$(CONFIG_SB1250_MAC) += sb1250-mac.o +obj-$(CONFIG_TIGON3) += tg3.o +obj-$(CONFIG_BGMAC) += bgmac.o +obj-$(CONFIG_BGMAC_BCMA) += bgmac-bcma.o bgmac-bcma-mdio.o +obj-$(CONFIG_BGMAC_PLATFORM) += bgmac-platform.o +obj-$(CONFIG_SYSTEMPORT) += bcmsysport.o +obj-$(CONFIG_BNXT) += bnxt/ diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile new file mode 100644 index 0000000..7c560d5 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_BNXT) += bnxt_en.o + +bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o +bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c new file mode 100644 index 0000000..f83769d --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -0,0 +1,9092 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * Copyright (c) 2016-2018 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bnxt_hsi.h" +#include "bnxt.h" +#include "bnxt_ulp.h" +#include "bnxt_sriov.h" +#include "bnxt_ethtool.h" +#include "bnxt_dcb.h" +#include "bnxt_xdp.h" +#include "bnxt_vfr.h" +#include "bnxt_tc.h" +#include "bnxt_devlink.h" + +#define BNXT_TX_TIMEOUT (5 * HZ) + +static const char version[] = + "Broadcom NetXtreme-C/E driver " DRV_MODULE_NAME " v" DRV_MODULE_VERSION "\n"; + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Broadcom BCM573xx network driver"); +MODULE_VERSION(DRV_MODULE_VERSION); + +#define BNXT_RX_OFFSET (NET_SKB_PAD + NET_IP_ALIGN) +#define BNXT_RX_DMA_OFFSET NET_SKB_PAD +#define BNXT_RX_COPY_THRESH 256 + +#define BNXT_TX_PUSH_THRESH 164 + +enum board_idx { + BCM57301, + BCM57302, + BCM57304, + BCM57417_NPAR, + BCM58700, + BCM57311, + BCM57312, + BCM57402, + BCM57404, + BCM57406, + BCM57402_NPAR, + BCM57407, + BCM57412, + BCM57414, + BCM57416, + BCM57417, + BCM57412_NPAR, + BCM57314, + BCM57417_SFP, + BCM57416_SFP, + BCM57404_NPAR, + BCM57406_NPAR, + BCM57407_SFP, + BCM57407_NPAR, + BCM57414_NPAR, + BCM57416_NPAR, + BCM57452, + BCM57454, + BCM5745x_NPAR, + BCM58802, + BCM58804, + BCM58808, + NETXTREME_E_VF, + NETXTREME_C_VF, + NETXTREME_S_VF, +}; + +/* indexed by enum above */ +static const struct { + char *name; +} board_info[] = { + [BCM57301] = { "Broadcom BCM57301 NetXtreme-C 10Gb Ethernet" }, + [BCM57302] = { "Broadcom BCM57302 NetXtreme-C 10Gb/25Gb Ethernet" }, + [BCM57304] = { "Broadcom BCM57304 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet" }, + [BCM57417_NPAR] = { "Broadcom BCM57417 NetXtreme-E Ethernet Partition" }, + [BCM58700] = { "Broadcom BCM58700 Nitro 1Gb/2.5Gb/10Gb Ethernet" }, + [BCM57311] = { "Broadcom BCM57311 NetXtreme-C 10Gb Ethernet" }, + [BCM57312] = { "Broadcom BCM57312 NetXtreme-C 10Gb/25Gb Ethernet" }, + [BCM57402] = { "Broadcom BCM57402 NetXtreme-E 10Gb Ethernet" }, + [BCM57404] = { "Broadcom BCM57404 NetXtreme-E 10Gb/25Gb Ethernet" }, + [BCM57406] = { "Broadcom BCM57406 NetXtreme-E 10GBase-T Ethernet" }, + [BCM57402_NPAR] = { "Broadcom BCM57402 NetXtreme-E Ethernet Partition" }, + [BCM57407] = { "Broadcom BCM57407 NetXtreme-E 10GBase-T Ethernet" }, + [BCM57412] = { "Broadcom BCM57412 NetXtreme-E 10Gb Ethernet" }, + [BCM57414] = { "Broadcom BCM57414 NetXtreme-E 10Gb/25Gb Ethernet" }, + [BCM57416] = { "Broadcom BCM57416 NetXtreme-E 10GBase-T Ethernet" }, + [BCM57417] = { "Broadcom BCM57417 NetXtreme-E 10GBase-T Ethernet" }, + [BCM57412_NPAR] = { "Broadcom BCM57412 NetXtreme-E Ethernet Partition" }, + [BCM57314] = { "Broadcom BCM57314 NetXtreme-C 10Gb/25Gb/40Gb/50Gb Ethernet" }, + [BCM57417_SFP] = { "Broadcom BCM57417 NetXtreme-E 10Gb/25Gb Ethernet" }, + [BCM57416_SFP] = { "Broadcom BCM57416 NetXtreme-E 10Gb Ethernet" }, + [BCM57404_NPAR] = { "Broadcom BCM57404 NetXtreme-E Ethernet Partition" }, + [BCM57406_NPAR] = { "Broadcom BCM57406 NetXtreme-E Ethernet Partition" }, + [BCM57407_SFP] = { "Broadcom BCM57407 NetXtreme-E 25Gb Ethernet" }, + [BCM57407_NPAR] = { "Broadcom BCM57407 NetXtreme-E Ethernet Partition" }, + [BCM57414_NPAR] = { "Broadcom BCM57414 NetXtreme-E Ethernet Partition" }, + [BCM57416_NPAR] = { "Broadcom 
BCM57416 NetXtreme-E Ethernet Partition" }, + [BCM57452] = { "Broadcom BCM57452 NetXtreme-E 10Gb/25Gb/40Gb/50Gb Ethernet" }, + [BCM57454] = { "Broadcom BCM57454 NetXtreme-E 10Gb/25Gb/40Gb/50Gb/100Gb Ethernet" }, + [BCM5745x_NPAR] = { "Broadcom BCM5745x NetXtreme-E Ethernet Partition" }, + [BCM58802] = { "Broadcom BCM58802 NetXtreme-S 10Gb/25Gb/40Gb/50Gb Ethernet" }, + [BCM58804] = { "Broadcom BCM58804 NetXtreme-S 10Gb/25Gb/40Gb/50Gb/100Gb Ethernet" }, + [BCM58808] = { "Broadcom BCM58808 NetXtreme-S 10Gb/25Gb/40Gb/50Gb/100Gb Ethernet" }, + [NETXTREME_E_VF] = { "Broadcom NetXtreme-E Ethernet Virtual Function" }, + [NETXTREME_C_VF] = { "Broadcom NetXtreme-C Ethernet Virtual Function" }, + [NETXTREME_S_VF] = { "Broadcom NetXtreme-S Ethernet Virtual Function" }, +}; + +static const struct pci_device_id bnxt_pci_tbl[] = { + { PCI_VDEVICE(BROADCOM, 0x1604), .driver_data = BCM5745x_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x1605), .driver_data = BCM5745x_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x1614), .driver_data = BCM57454 }, + { PCI_VDEVICE(BROADCOM, 0x16c0), .driver_data = BCM57417_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16c8), .driver_data = BCM57301 }, + { PCI_VDEVICE(BROADCOM, 0x16c9), .driver_data = BCM57302 }, + { PCI_VDEVICE(BROADCOM, 0x16ca), .driver_data = BCM57304 }, + { PCI_VDEVICE(BROADCOM, 0x16cc), .driver_data = BCM57417_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16cd), .driver_data = BCM58700 }, + { PCI_VDEVICE(BROADCOM, 0x16ce), .driver_data = BCM57311 }, + { PCI_VDEVICE(BROADCOM, 0x16cf), .driver_data = BCM57312 }, + { PCI_VDEVICE(BROADCOM, 0x16d0), .driver_data = BCM57402 }, + { PCI_VDEVICE(BROADCOM, 0x16d1), .driver_data = BCM57404 }, + { PCI_VDEVICE(BROADCOM, 0x16d2), .driver_data = BCM57406 }, + { PCI_VDEVICE(BROADCOM, 0x16d4), .driver_data = BCM57402_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16d5), .driver_data = BCM57407 }, + { PCI_VDEVICE(BROADCOM, 0x16d6), .driver_data = BCM57412 }, + { PCI_VDEVICE(BROADCOM, 0x16d7), .driver_data = BCM57414 }, + { PCI_VDEVICE(BROADCOM, 0x16d8), .driver_data = BCM57416 }, + { PCI_VDEVICE(BROADCOM, 0x16d9), .driver_data = BCM57417 }, + { PCI_VDEVICE(BROADCOM, 0x16de), .driver_data = BCM57412_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16df), .driver_data = BCM57314 }, + { PCI_VDEVICE(BROADCOM, 0x16e2), .driver_data = BCM57417_SFP }, + { PCI_VDEVICE(BROADCOM, 0x16e3), .driver_data = BCM57416_SFP }, + { PCI_VDEVICE(BROADCOM, 0x16e7), .driver_data = BCM57404_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16e8), .driver_data = BCM57406_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16e9), .driver_data = BCM57407_SFP }, + { PCI_VDEVICE(BROADCOM, 0x16ea), .driver_data = BCM57407_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16eb), .driver_data = BCM57412_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16ec), .driver_data = BCM57414_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16ed), .driver_data = BCM57414_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16ee), .driver_data = BCM57416_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16ef), .driver_data = BCM57416_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x16f0), .driver_data = BCM58808 }, + { PCI_VDEVICE(BROADCOM, 0x16f1), .driver_data = BCM57452 }, + { PCI_VDEVICE(BROADCOM, 0xd802), .driver_data = BCM58802 }, + { PCI_VDEVICE(BROADCOM, 0xd804), .driver_data = BCM58804 }, +#ifdef CONFIG_BNXT_SRIOV + { PCI_VDEVICE(BROADCOM, 0x1606), .driver_data = NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x1609), .driver_data = NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x16c1), .driver_data = NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x16cb), .driver_data = NETXTREME_C_VF }, + { PCI_VDEVICE(BROADCOM, 0x16d3), .driver_data = 
NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x16dc), .driver_data = NETXTREME_E_VF }, + { PCI_VDEVICE(BROADCOM, 0x16e1), .driver_data = NETXTREME_C_VF }, + { PCI_VDEVICE(BROADCOM, 0x16e5), .driver_data = NETXTREME_C_VF }, + { PCI_VDEVICE(BROADCOM, 0xd800), .driver_data = NETXTREME_S_VF }, +#endif + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, bnxt_pci_tbl); + +static const u16 bnxt_vf_req_snif[] = { + HWRM_FUNC_CFG, + HWRM_FUNC_VF_CFG, + HWRM_PORT_PHY_QCFG, + HWRM_CFA_L2_FILTER_ALLOC, +}; + +static const u16 bnxt_async_events_arr[] = { + ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE, + ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD, + ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED, + ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE, + ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE, +}; + +static struct workqueue_struct *bnxt_pf_wq; + +static bool bnxt_vf_pciid(enum board_idx idx) +{ + return (idx == NETXTREME_C_VF || idx == NETXTREME_E_VF || + idx == NETXTREME_S_VF); +} + +#define DB_CP_REARM_FLAGS (DB_KEY_CP | DB_IDX_VALID) +#define DB_CP_FLAGS (DB_KEY_CP | DB_IDX_VALID | DB_IRQ_DIS) +#define DB_CP_IRQ_DIS_FLAGS (DB_KEY_CP | DB_IRQ_DIS) + +#define BNXT_CP_DB_REARM(db, raw_cons) \ + writel(DB_CP_REARM_FLAGS | RING_CMP(raw_cons), db) + +#define BNXT_CP_DB(db, raw_cons) \ + writel(DB_CP_FLAGS | RING_CMP(raw_cons), db) + +#define BNXT_CP_DB_IRQ_DIS(db) \ + writel(DB_CP_IRQ_DIS_FLAGS, db) + +const u16 bnxt_lhint_arr[] = { + TX_BD_FLAGS_LHINT_512_AND_SMALLER, + TX_BD_FLAGS_LHINT_512_TO_1023, + TX_BD_FLAGS_LHINT_1024_TO_2047, + TX_BD_FLAGS_LHINT_1024_TO_2047, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, + TX_BD_FLAGS_LHINT_2048_AND_LARGER, +}; + +static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + + if (!md_dst || md_dst->type != METADATA_HW_PORT_MUX) + return 0; + + return md_dst->u.port_info.port_id; +} + +static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + struct tx_bd *txbd; + struct tx_bd_ext *txbd1; + struct netdev_queue *txq; + int i; + dma_addr_t mapping; + unsigned int length, pad = 0; + u32 len, free_size, vlan_tag_flags, cfa_action, flags; + u16 prod, last_frag; + struct pci_dev *pdev = bp->pdev; + struct bnxt_tx_ring_info *txr; + struct bnxt_sw_tx_bd *tx_buf; + + i = skb_get_queue_mapping(skb); + if (unlikely(i >= bp->tx_nr_rings)) { + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + + txq = netdev_get_tx_queue(dev, i); + txr = &bp->tx_ring[bp->tx_ring_map[i]]; + prod = txr->tx_prod; + + free_size = bnxt_tx_avail(bp, txr); + if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) { + netif_tx_stop_queue(txq); + return NETDEV_TX_BUSY; + } + + length = skb->len; + len = skb_headlen(skb); + last_frag = skb_shinfo(skb)->nr_frags; + + txbd = &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)]; + + txbd->tx_bd_opaque = prod; + + tx_buf = &txr->tx_buf_ring[prod]; + tx_buf->skb = skb; + tx_buf->nr_frags = last_frag; + + vlan_tag_flags = 0; + cfa_action = bnxt_xmit_get_cfa_action(skb); + if 
(skb_vlan_tag_present(skb)) { + vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN | + skb_vlan_tag_get(skb); + /* Currently supports 8021Q, 8021AD vlan offloads + * QINQ1, QINQ2, QINQ3 vlan headers are deprecated + */ + if (skb->vlan_proto == htons(ETH_P_8021Q)) + vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT; + } + + if (free_size == bp->tx_ring_size && length <= bp->tx_push_thresh) { + struct tx_push_buffer *tx_push_buf = txr->tx_push; + struct tx_push_bd *tx_push = &tx_push_buf->push_bd; + struct tx_bd_ext *tx_push1 = &tx_push->txbd2; + void *pdata = tx_push_buf->data; + u64 *end; + int j, push_len; + + /* Set COAL_NOW to be ready quickly for the next push */ + tx_push->tx_bd_len_flags_type = + cpu_to_le32((length << TX_BD_LEN_SHIFT) | + TX_BD_TYPE_LONG_TX_BD | + TX_BD_FLAGS_LHINT_512_AND_SMALLER | + TX_BD_FLAGS_COAL_NOW | + TX_BD_FLAGS_PACKET_END | + (2 << TX_BD_FLAGS_BD_CNT_SHIFT)); + + if (skb->ip_summed == CHECKSUM_PARTIAL) + tx_push1->tx_bd_hsize_lflags = + cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); + else + tx_push1->tx_bd_hsize_lflags = 0; + + tx_push1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags); + tx_push1->tx_bd_cfa_action = + cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT); + + end = pdata + length; + end = PTR_ALIGN(end, 8) - 1; + *end = 0; + + skb_copy_from_linear_data(skb, pdata, len); + pdata += len; + for (j = 0; j < last_frag; j++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[j]; + void *fptr; + + fptr = skb_frag_address_safe(frag); + if (!fptr) + goto normal_tx; + + memcpy(pdata, fptr, skb_frag_size(frag)); + pdata += skb_frag_size(frag); + } + + txbd->tx_bd_len_flags_type = tx_push->tx_bd_len_flags_type; + txbd->tx_bd_haddr = txr->data_mapping; + prod = NEXT_TX(prod); + txbd = &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)]; + memcpy(txbd, tx_push1, sizeof(*txbd)); + prod = NEXT_TX(prod); + tx_push->doorbell = + cpu_to_le32(DB_KEY_TX_PUSH | DB_LONG_TX_PUSH | prod); + txr->tx_prod = prod; + + tx_buf->is_push = 1; + netdev_tx_sent_queue(txq, skb->len); + wmb(); /* Sync is_push and byte queue before pushing data */ + + push_len = (length + sizeof(*tx_push) + 7) / 8; + if (push_len > 16) { + __iowrite64_copy(txr->tx_doorbell, tx_push_buf, 16); + __iowrite32_copy(txr->tx_doorbell + 4, tx_push_buf + 1, + (push_len - 16) << 1); + } else { + __iowrite64_copy(txr->tx_doorbell, tx_push_buf, + push_len); + } + + goto tx_done; + } + +normal_tx: + if (length < BNXT_MIN_PKT_SIZE) { + pad = BNXT_MIN_PKT_SIZE - length; + if (skb_pad(skb, pad)) { + /* SKB already freed. 
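+ * skb_pad() frees the skb on failure, so only clear the stale
+ * tx_buf pointer and report the packet as handled.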
*/ + tx_buf->skb = NULL; + return NETDEV_TX_OK; + } + length = BNXT_MIN_PKT_SIZE; + } + + mapping = dma_map_single(&pdev->dev, skb->data, len, DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&pdev->dev, mapping))) { + dev_kfree_skb_any(skb); + tx_buf->skb = NULL; + return NETDEV_TX_OK; + } + + dma_unmap_addr_set(tx_buf, mapping, mapping); + flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD | + ((last_frag + 2) << TX_BD_FLAGS_BD_CNT_SHIFT); + + txbd->tx_bd_haddr = cpu_to_le64(mapping); + + prod = NEXT_TX(prod); + txbd1 = (struct tx_bd_ext *) + &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)]; + + txbd1->tx_bd_hsize_lflags = 0; + if (skb_is_gso(skb)) { + u32 hdr_len; + + if (skb->encapsulation) + hdr_len = skb_inner_network_offset(skb) + + skb_inner_network_header_len(skb) + + inner_tcp_hdrlen(skb); + else + hdr_len = skb_transport_offset(skb) + + tcp_hdrlen(skb); + + txbd1->tx_bd_hsize_lflags = cpu_to_le32(TX_BD_FLAGS_LSO | + TX_BD_FLAGS_T_IPID | + (hdr_len << (TX_BD_HSIZE_SHIFT - 1))); + length = skb_shinfo(skb)->gso_size; + txbd1->tx_bd_mss = cpu_to_le32(length); + length += hdr_len; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + txbd1->tx_bd_hsize_lflags = + cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); + txbd1->tx_bd_mss = 0; + } + + length >>= 9; + flags |= bnxt_lhint_arr[length]; + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); + + txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags); + txbd1->tx_bd_cfa_action = + cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT); + for (i = 0; i < last_frag; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + prod = NEXT_TX(prod); + txbd = &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)]; + + len = skb_frag_size(frag); + mapping = skb_frag_dma_map(&pdev->dev, frag, 0, len, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&pdev->dev, mapping))) + goto tx_dma_error; + + tx_buf = &txr->tx_buf_ring[prod]; + dma_unmap_addr_set(tx_buf, mapping, mapping); + + txbd->tx_bd_haddr = cpu_to_le64(mapping); + + flags = len << TX_BD_LEN_SHIFT; + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); + } + + flags &= ~TX_BD_LEN; + txbd->tx_bd_len_flags_type = + cpu_to_le32(((len + pad) << TX_BD_LEN_SHIFT) | flags | + TX_BD_FLAGS_PACKET_END); + + netdev_tx_sent_queue(txq, skb->len); + + /* Sync BD data before updating doorbell */ + wmb(); + + prod = NEXT_TX(prod); + txr->tx_prod = prod; + + if (!skb->xmit_more || netif_xmit_stopped(txq)) + bnxt_db_write(bp, txr->tx_doorbell, DB_KEY_TX | prod); + +tx_done: + + mmiowb(); + + if (unlikely(bnxt_tx_avail(bp, txr) <= MAX_SKB_FRAGS + 1)) { + if (skb->xmit_more && !tx_buf->is_push) + bnxt_db_write(bp, txr->tx_doorbell, DB_KEY_TX | prod); + + netif_tx_stop_queue(txq); + + /* netif_tx_stop_queue() must be done before checking + * tx index in bnxt_tx_avail() below, because in + * bnxt_tx_int(), we update tx index before checking for + * netif_tx_queue_stopped(). 
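+ * The smp_mb() below pairs with the barrier in bnxt_tx_int().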
+ */ + smp_mb(); + if (bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh) + netif_tx_wake_queue(txq); + } + return NETDEV_TX_OK; + +tx_dma_error: + last_frag = i; + + /* start back at beginning and unmap skb */ + prod = txr->tx_prod; + tx_buf = &txr->tx_buf_ring[prod]; + tx_buf->skb = NULL; + dma_unmap_single(&pdev->dev, dma_unmap_addr(tx_buf, mapping), + skb_headlen(skb), PCI_DMA_TODEVICE); + prod = NEXT_TX(prod); + + /* unmap remaining mapped pages */ + for (i = 0; i < last_frag; i++) { + prod = NEXT_TX(prod); + tx_buf = &txr->tx_buf_ring[prod]; + dma_unmap_page(&pdev->dev, dma_unmap_addr(tx_buf, mapping), + skb_frag_size(&skb_shinfo(skb)->frags[i]), + PCI_DMA_TODEVICE); + } + + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; +} + +static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts) +{ + struct bnxt_tx_ring_info *txr = bnapi->tx_ring; + struct netdev_queue *txq = netdev_get_tx_queue(bp->dev, txr->txq_index); + u16 cons = txr->tx_cons; + struct pci_dev *pdev = bp->pdev; + int i; + unsigned int tx_bytes = 0; + + for (i = 0; i < nr_pkts; i++) { + struct bnxt_sw_tx_bd *tx_buf; + struct sk_buff *skb; + int j, last; + + tx_buf = &txr->tx_buf_ring[cons]; + cons = NEXT_TX(cons); + skb = tx_buf->skb; + tx_buf->skb = NULL; + + if (tx_buf->is_push) { + tx_buf->is_push = 0; + goto next_tx_int; + } + + dma_unmap_single(&pdev->dev, dma_unmap_addr(tx_buf, mapping), + skb_headlen(skb), PCI_DMA_TODEVICE); + last = tx_buf->nr_frags; + + for (j = 0; j < last; j++) { + cons = NEXT_TX(cons); + tx_buf = &txr->tx_buf_ring[cons]; + dma_unmap_page( + &pdev->dev, + dma_unmap_addr(tx_buf, mapping), + skb_frag_size(&skb_shinfo(skb)->frags[j]), + PCI_DMA_TODEVICE); + } + +next_tx_int: + cons = NEXT_TX(cons); + + tx_bytes += skb->len; + dev_kfree_skb_any(skb); + } + + netdev_tx_completed_queue(txq, nr_pkts, tx_bytes); + txr->tx_cons = cons; + + /* Need to make the tx_cons update visible to bnxt_start_xmit() + * before checking for netif_tx_queue_stopped(). Without the + * memory barrier, there is a small possibility that bnxt_start_xmit() + * will miss it and cause the queue to be stopped forever. 
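+ * This barrier pairs with the smp_mb() in bnxt_start_xmit(). In
+ * condensed form the two sides look like:
+ *
+ *   bnxt_start_xmit()                   bnxt_tx_int()
+ *     netif_tx_stop_queue(txq);           txr->tx_cons = cons;
+ *     smp_mb();                           smp_mb();
+ *     if (bnxt_tx_avail() > thresh)       if (netif_tx_queue_stopped(txq) &&
+ *             netif_tx_wake_queue(txq);       bnxt_tx_avail() > thresh)
+ *                                                 netif_tx_wake_queue(txq);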
+ */ + smp_mb(); + + if (unlikely(netif_tx_queue_stopped(txq)) && + (bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh)) { + __netif_tx_lock(txq, smp_processor_id()); + if (netif_tx_queue_stopped(txq) && + bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh && + txr->dev_state != BNXT_DEV_STATE_CLOSING) + netif_tx_wake_queue(txq); + __netif_tx_unlock(txq); + } +} + +static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, + gfp_t gfp) +{ + struct device *dev = &bp->pdev->dev; + struct page *page; + + page = alloc_page(gfp); + if (!page) + return NULL; + + *mapping = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, bp->rx_dir, + DMA_ATTR_WEAK_ORDERING); + if (dma_mapping_error(dev, *mapping)) { + __free_page(page); + return NULL; + } + *mapping += bp->rx_dma_offset; + return page; +} + +static inline u8 *__bnxt_alloc_rx_data(struct bnxt *bp, dma_addr_t *mapping, + gfp_t gfp) +{ + u8 *data; + struct pci_dev *pdev = bp->pdev; + + data = kmalloc(bp->rx_buf_size, gfp); + if (!data) + return NULL; + + *mapping = dma_map_single_attrs(&pdev->dev, data + bp->rx_dma_offset, + bp->rx_buf_use_size, bp->rx_dir, + DMA_ATTR_WEAK_ORDERING); + + if (dma_mapping_error(&pdev->dev, *mapping)) { + kfree(data); + data = NULL; + } + return data; +} + +int bnxt_alloc_rx_data(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, + u16 prod, gfp_t gfp) +{ + struct rx_bd *rxbd = &rxr->rx_desc_ring[RX_RING(prod)][RX_IDX(prod)]; + struct bnxt_sw_rx_bd *rx_buf = &rxr->rx_buf_ring[prod]; + dma_addr_t mapping; + + if (BNXT_RX_PAGE_MODE(bp)) { + struct page *page = __bnxt_alloc_rx_page(bp, &mapping, gfp); + + if (!page) + return -ENOMEM; + + rx_buf->data = page; + rx_buf->data_ptr = page_address(page) + bp->rx_offset; + } else { + u8 *data = __bnxt_alloc_rx_data(bp, &mapping, gfp); + + if (!data) + return -ENOMEM; + + rx_buf->data = data; + rx_buf->data_ptr = data + bp->rx_offset; + } + rx_buf->mapping = mapping; + + rxbd->rx_bd_haddr = cpu_to_le64(mapping); + return 0; +} + +void bnxt_reuse_rx_data(struct bnxt_rx_ring_info *rxr, u16 cons, void *data) +{ + u16 prod = rxr->rx_prod; + struct bnxt_sw_rx_bd *cons_rx_buf, *prod_rx_buf; + struct rx_bd *cons_bd, *prod_bd; + + prod_rx_buf = &rxr->rx_buf_ring[prod]; + cons_rx_buf = &rxr->rx_buf_ring[cons]; + + prod_rx_buf->data = data; + prod_rx_buf->data_ptr = cons_rx_buf->data_ptr; + + prod_rx_buf->mapping = cons_rx_buf->mapping; + + prod_bd = &rxr->rx_desc_ring[RX_RING(prod)][RX_IDX(prod)]; + cons_bd = &rxr->rx_desc_ring[RX_RING(cons)][RX_IDX(cons)]; + + prod_bd->rx_bd_haddr = cons_bd->rx_bd_haddr; +} + +static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx) +{ + u16 next, max = rxr->rx_agg_bmap_size; + + next = find_next_zero_bit(rxr->rx_agg_bmap, max, idx); + if (next >= max) + next = find_first_zero_bit(rxr->rx_agg_bmap, max); + return next; +} + +static inline int bnxt_alloc_rx_page(struct bnxt *bp, + struct bnxt_rx_ring_info *rxr, + u16 prod, gfp_t gfp) +{ + struct rx_bd *rxbd = + &rxr->rx_agg_desc_ring[RX_RING(prod)][RX_IDX(prod)]; + struct bnxt_sw_rx_agg_bd *rx_agg_buf; + struct pci_dev *pdev = bp->pdev; + struct page *page; + dma_addr_t mapping; + u16 sw_prod = rxr->rx_sw_agg_prod; + unsigned int offset = 0; + + if (PAGE_SIZE > BNXT_RX_PAGE_SIZE) { + page = rxr->rx_page; + if (!page) { + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + rxr->rx_page = page; + rxr->rx_page_offset = 0; + } + offset = rxr->rx_page_offset; + rxr->rx_page_offset += BNXT_RX_PAGE_SIZE; + if (rxr->rx_page_offset == PAGE_SIZE) + rxr->rx_page = NULL; + else + 
get_page(page); + } else { + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + } + + mapping = dma_map_page_attrs(&pdev->dev, page, offset, + BNXT_RX_PAGE_SIZE, PCI_DMA_FROMDEVICE, + DMA_ATTR_WEAK_ORDERING); + if (dma_mapping_error(&pdev->dev, mapping)) { + __free_page(page); + return -EIO; + } + + if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap))) + sw_prod = bnxt_find_next_agg_idx(rxr, sw_prod); + + __set_bit(sw_prod, rxr->rx_agg_bmap); + rx_agg_buf = &rxr->rx_agg_ring[sw_prod]; + rxr->rx_sw_agg_prod = NEXT_RX_AGG(sw_prod); + + rx_agg_buf->page = page; + rx_agg_buf->offset = offset; + rx_agg_buf->mapping = mapping; + rxbd->rx_bd_haddr = cpu_to_le64(mapping); + rxbd->rx_bd_opaque = sw_prod; + return 0; +} + +static void bnxt_reuse_rx_agg_bufs(struct bnxt_napi *bnapi, u16 cp_cons, + u32 agg_bufs) +{ + struct bnxt *bp = bnapi->bp; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; + u16 prod = rxr->rx_agg_prod; + u16 sw_prod = rxr->rx_sw_agg_prod; + u32 i; + + for (i = 0; i < agg_bufs; i++) { + u16 cons; + struct rx_agg_cmp *agg; + struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf; + struct rx_bd *prod_bd; + struct page *page; + + agg = (struct rx_agg_cmp *) + &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + cons = agg->rx_agg_cmp_opaque; + __clear_bit(cons, rxr->rx_agg_bmap); + + if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap))) + sw_prod = bnxt_find_next_agg_idx(rxr, sw_prod); + + __set_bit(sw_prod, rxr->rx_agg_bmap); + prod_rx_buf = &rxr->rx_agg_ring[sw_prod]; + cons_rx_buf = &rxr->rx_agg_ring[cons]; + + /* It is possible for sw_prod to be equal to cons, so + * set cons_rx_buf->page to NULL first. + */ + page = cons_rx_buf->page; + cons_rx_buf->page = NULL; + prod_rx_buf->page = page; + prod_rx_buf->offset = cons_rx_buf->offset; + + prod_rx_buf->mapping = cons_rx_buf->mapping; + + prod_bd = &rxr->rx_agg_desc_ring[RX_RING(prod)][RX_IDX(prod)]; + + prod_bd->rx_bd_haddr = cpu_to_le64(cons_rx_buf->mapping); + prod_bd->rx_bd_opaque = sw_prod; + + prod = NEXT_RX_AGG(prod); + sw_prod = NEXT_RX_AGG(sw_prod); + cp_cons = NEXT_CMP(cp_cons); + } + rxr->rx_agg_prod = prod; + rxr->rx_sw_agg_prod = sw_prod; +} + +static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp, + struct bnxt_rx_ring_info *rxr, + u16 cons, void *data, u8 *data_ptr, + dma_addr_t dma_addr, + unsigned int offset_and_len) +{ + unsigned int payload = offset_and_len >> 16; + unsigned int len = offset_and_len & 0xffff; + struct skb_frag_struct *frag; + struct page *page = data; + u16 prod = rxr->rx_prod; + struct sk_buff *skb; + int off, err; + + err = bnxt_alloc_rx_data(bp, rxr, prod, GFP_ATOMIC); + if (unlikely(err)) { + bnxt_reuse_rx_data(rxr, cons, data); + return NULL; + } + dma_addr -= bp->rx_dma_offset; + dma_unmap_page_attrs(&bp->pdev->dev, dma_addr, PAGE_SIZE, bp->rx_dir, + DMA_ATTR_WEAK_ORDERING); + + if (unlikely(!payload)) + payload = eth_get_headlen(data_ptr, len); + + skb = napi_alloc_skb(&rxr->bnapi->napi, payload); + if (!skb) { + __free_page(page); + return NULL; + } + + off = (void *)data_ptr - page_address(page); + skb_add_rx_frag(skb, 0, page, off, len, PAGE_SIZE); + memcpy(skb->data - NET_IP_ALIGN, data_ptr - NET_IP_ALIGN, + payload + NET_IP_ALIGN); + + frag = &skb_shinfo(skb)->frags[0]; + skb_frag_size_sub(frag, payload); + frag->page_offset += payload; + skb->data_len -= payload; + skb->tail += payload; + + return skb; +} + +static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, + struct bnxt_rx_ring_info *rxr, u16 cons, + void *data, u8 *data_ptr, 
+ dma_addr_t dma_addr, + unsigned int offset_and_len) +{ + u16 prod = rxr->rx_prod; + struct sk_buff *skb; + int err; + + err = bnxt_alloc_rx_data(bp, rxr, prod, GFP_ATOMIC); + if (unlikely(err)) { + bnxt_reuse_rx_data(rxr, cons, data); + return NULL; + } + + skb = build_skb(data, 0); + dma_unmap_single_attrs(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size, + bp->rx_dir, DMA_ATTR_WEAK_ORDERING); + if (!skb) { + kfree(data); + return NULL; + } + + skb_reserve(skb, bp->rx_offset); + skb_put(skb, offset_and_len & 0xffff); + return skb; +} + +static struct sk_buff *bnxt_rx_pages(struct bnxt *bp, struct bnxt_napi *bnapi, + struct sk_buff *skb, u16 cp_cons, + u32 agg_bufs) +{ + struct pci_dev *pdev = bp->pdev; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; + u16 prod = rxr->rx_agg_prod; + u32 i; + + for (i = 0; i < agg_bufs; i++) { + u16 cons, frag_len; + struct rx_agg_cmp *agg; + struct bnxt_sw_rx_agg_bd *cons_rx_buf; + struct page *page; + dma_addr_t mapping; + + agg = (struct rx_agg_cmp *) + &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + cons = agg->rx_agg_cmp_opaque; + frag_len = (le32_to_cpu(agg->rx_agg_cmp_len_flags_type) & + RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; + + cons_rx_buf = &rxr->rx_agg_ring[cons]; + skb_fill_page_desc(skb, i, cons_rx_buf->page, + cons_rx_buf->offset, frag_len); + __clear_bit(cons, rxr->rx_agg_bmap); + + /* It is possible for bnxt_alloc_rx_page() to allocate + * a sw_prod index that equals the cons index, so we + * need to clear the cons entry now. + */ + mapping = cons_rx_buf->mapping; + page = cons_rx_buf->page; + cons_rx_buf->page = NULL; + + if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) { + struct skb_shared_info *shinfo; + unsigned int nr_frags; + + shinfo = skb_shinfo(skb); + nr_frags = --shinfo->nr_frags; + __skb_frag_set_page(&shinfo->frags[nr_frags], NULL); + + dev_kfree_skb(skb); + + cons_rx_buf->page = page; + + /* Update prod since possibly some pages have been + * allocated already. 
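+ * bnxt_reuse_rx_agg_bufs() below re-reads rxr->rx_agg_prod, so the
+ * producer index must be advanced past the pages consumed so far.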
+ */ + rxr->rx_agg_prod = prod; + bnxt_reuse_rx_agg_bufs(bnapi, cp_cons, agg_bufs - i); + return NULL; + } + + dma_unmap_page_attrs(&pdev->dev, mapping, BNXT_RX_PAGE_SIZE, + PCI_DMA_FROMDEVICE, + DMA_ATTR_WEAK_ORDERING); + + skb->data_len += frag_len; + skb->len += frag_len; + skb->truesize += PAGE_SIZE; + + prod = NEXT_RX_AGG(prod); + cp_cons = NEXT_CMP(cp_cons); + } + rxr->rx_agg_prod = prod; + return skb; +} + +static int bnxt_agg_bufs_valid(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, + u8 agg_bufs, u32 *raw_cons) +{ + u16 last; + struct rx_agg_cmp *agg; + + *raw_cons = ADV_RAW_CMP(*raw_cons, agg_bufs); + last = RING_CMP(*raw_cons); + agg = (struct rx_agg_cmp *) + &cpr->cp_desc_ring[CP_RING(last)][CP_IDX(last)]; + return RX_AGG_CMP_VALID(agg, *raw_cons); +} + +static inline struct sk_buff *bnxt_copy_skb(struct bnxt_napi *bnapi, u8 *data, + unsigned int len, + dma_addr_t mapping) +{ + struct bnxt *bp = bnapi->bp; + struct pci_dev *pdev = bp->pdev; + struct sk_buff *skb; + + skb = napi_alloc_skb(&bnapi->napi, len); + if (!skb) + return NULL; + + dma_sync_single_for_cpu(&pdev->dev, mapping, bp->rx_copy_thresh, + bp->rx_dir); + + memcpy(skb->data - NET_IP_ALIGN, data - NET_IP_ALIGN, + len + NET_IP_ALIGN); + + dma_sync_single_for_device(&pdev->dev, mapping, bp->rx_copy_thresh, + bp->rx_dir); + + skb_put(skb, len); + return skb; +} + +static int bnxt_discard_rx(struct bnxt *bp, struct bnxt_napi *bnapi, + u32 *raw_cons, void *cmp) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct rx_cmp *rxcmp = cmp; + u32 tmp_raw_cons = *raw_cons; + u8 cmp_type, agg_bufs = 0; + + cmp_type = RX_CMP_TYPE(rxcmp); + + if (cmp_type == CMP_TYPE_RX_L2_CMP) { + agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & + RX_CMP_AGG_BUFS) >> + RX_CMP_AGG_BUFS_SHIFT; + } else if (cmp_type == CMP_TYPE_RX_L2_TPA_END_CMP) { + struct rx_tpa_end_cmp *tpa_end = cmp; + + agg_bufs = (le32_to_cpu(tpa_end->rx_tpa_end_cmp_misc_v1) & + RX_TPA_END_CMP_AGG_BUFS) >> + RX_TPA_END_CMP_AGG_BUFS_SHIFT; + } + + if (agg_bufs) { + if (!bnxt_agg_bufs_valid(bp, cpr, agg_bufs, &tmp_raw_cons)) + return -EBUSY; + } + *raw_cons = tmp_raw_cons; + return 0; +} + +static void bnxt_queue_sp_work(struct bnxt *bp) +{ + if (BNXT_PF(bp)) + queue_work(bnxt_pf_wq, &bp->sp_task); + else + schedule_work(&bp->sp_task); +} + +static void bnxt_cancel_sp_work(struct bnxt *bp) +{ + if (BNXT_PF(bp)) + flush_workqueue(bnxt_pf_wq); + else + cancel_work_sync(&bp->sp_task); +} + +static void bnxt_sched_reset(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) +{ + if (!rxr->bnapi->in_reset) { + rxr->bnapi->in_reset = true; + set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event); + bnxt_queue_sp_work(bp); + } + rxr->rx_next_cons = 0xffff; +} + +static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, + struct rx_tpa_start_cmp *tpa_start, + struct rx_tpa_start_cmp_ext *tpa_start1) +{ + u8 agg_id = TPA_START_AGG_ID(tpa_start); + u16 cons, prod; + struct bnxt_tpa_info *tpa_info; + struct bnxt_sw_rx_bd *cons_rx_buf, *prod_rx_buf; + struct rx_bd *prod_bd; + dma_addr_t mapping; + + cons = tpa_start->rx_tpa_start_cmp_opaque; + prod = rxr->rx_prod; + cons_rx_buf = &rxr->rx_buf_ring[cons]; + prod_rx_buf = &rxr->rx_buf_ring[prod]; + tpa_info = &rxr->rx_tpa[agg_id]; + + if (unlikely(cons != rxr->rx_next_cons)) { + bnxt_sched_reset(bp, rxr); + return; + } + /* Store cfa_code in tpa_info to use in tpa_end + * completion processing. 
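+ * bnxt_tpa_end() passes the saved cfa_code to bnxt_get_pkt_dev() to
+ * steer the completed skb to the PF netdev or a VF representor.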
+ */ + tpa_info->cfa_code = TPA_START_CFA_CODE(tpa_start1); + prod_rx_buf->data = tpa_info->data; + prod_rx_buf->data_ptr = tpa_info->data_ptr; + + mapping = tpa_info->mapping; + prod_rx_buf->mapping = mapping; + + prod_bd = &rxr->rx_desc_ring[RX_RING(prod)][RX_IDX(prod)]; + + prod_bd->rx_bd_haddr = cpu_to_le64(mapping); + + tpa_info->data = cons_rx_buf->data; + tpa_info->data_ptr = cons_rx_buf->data_ptr; + cons_rx_buf->data = NULL; + tpa_info->mapping = cons_rx_buf->mapping; + + tpa_info->len = + le32_to_cpu(tpa_start->rx_tpa_start_cmp_len_flags_type) >> + RX_TPA_START_CMP_LEN_SHIFT; + if (likely(TPA_START_HASH_VALID(tpa_start))) { + u32 hash_type = TPA_START_HASH_TYPE(tpa_start); + + tpa_info->hash_type = PKT_HASH_TYPE_L4; + tpa_info->gso_type = SKB_GSO_TCPV4; + /* RSS profiles 1 and 3 with extract code 0 for inner 4-tuple */ + if (hash_type == 3) + tpa_info->gso_type = SKB_GSO_TCPV6; + tpa_info->rss_hash = + le32_to_cpu(tpa_start->rx_tpa_start_cmp_rss_hash); + } else { + tpa_info->hash_type = PKT_HASH_TYPE_NONE; + tpa_info->gso_type = 0; + if (netif_msg_rx_err(bp)) + netdev_warn(bp->dev, "TPA packet without valid hash\n"); + } + tpa_info->flags2 = le32_to_cpu(tpa_start1->rx_tpa_start_cmp_flags2); + tpa_info->metadata = le32_to_cpu(tpa_start1->rx_tpa_start_cmp_metadata); + tpa_info->hdr_info = le32_to_cpu(tpa_start1->rx_tpa_start_cmp_hdr_info); + + rxr->rx_prod = NEXT_RX(prod); + cons = NEXT_RX(cons); + rxr->rx_next_cons = NEXT_RX(cons); + cons_rx_buf = &rxr->rx_buf_ring[cons]; + + bnxt_reuse_rx_data(rxr, cons, cons_rx_buf->data); + rxr->rx_prod = NEXT_RX(rxr->rx_prod); + cons_rx_buf->data = NULL; +} + +static void bnxt_abort_tpa(struct bnxt *bp, struct bnxt_napi *bnapi, + u16 cp_cons, u32 agg_bufs) +{ + if (agg_bufs) + bnxt_reuse_rx_agg_bufs(bnapi, cp_cons, agg_bufs); +} + +static struct sk_buff *bnxt_gro_func_5731x(struct bnxt_tpa_info *tpa_info, + int payload_off, int tcp_ts, + struct sk_buff *skb) +{ +#ifdef CONFIG_INET + struct tcphdr *th; + int len, nw_off; + u16 outer_ip_off, inner_ip_off, inner_mac_off; + u32 hdr_info = tpa_info->hdr_info; + bool loopback = false; + + inner_ip_off = BNXT_TPA_INNER_L3_OFF(hdr_info); + inner_mac_off = BNXT_TPA_INNER_L2_OFF(hdr_info); + outer_ip_off = BNXT_TPA_OUTER_L3_OFF(hdr_info); + + /* If the packet is an internal loopback packet, the offsets will + * have an extra 4 bytes. + */ + if (inner_mac_off == 4) { + loopback = true; + } else if (inner_mac_off > 4) { + __be16 proto = *((__be16 *)(skb->data + inner_ip_off - + ETH_HLEN - 2)); + + /* We only support inner iPv4/ipv6. If we don't see the + * correct protocol ID, it must be a loopback packet where + * the offsets are off by 4. 
+ */ + if (proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6)) + loopback = true; + } + if (loopback) { + /* internal loopback packet, subtract all offsets by 4 */ + inner_ip_off -= 4; + inner_mac_off -= 4; + outer_ip_off -= 4; + } + + nw_off = inner_ip_off - ETH_HLEN; + skb_set_network_header(skb, nw_off); + if (tpa_info->flags2 & RX_TPA_START_CMP_FLAGS2_IP_TYPE) { + struct ipv6hdr *iph = ipv6_hdr(skb); + + skb_set_transport_header(skb, nw_off + sizeof(struct ipv6hdr)); + len = skb->len - skb_transport_offset(skb); + th = tcp_hdr(skb); + th->check = ~tcp_v6_check(len, &iph->saddr, &iph->daddr, 0); + } else { + struct iphdr *iph = ip_hdr(skb); + + skb_set_transport_header(skb, nw_off + sizeof(struct iphdr)); + len = skb->len - skb_transport_offset(skb); + th = tcp_hdr(skb); + th->check = ~tcp_v4_check(len, iph->saddr, iph->daddr, 0); + } + + if (inner_mac_off) { /* tunnel */ + struct udphdr *uh = NULL; + __be16 proto = *((__be16 *)(skb->data + outer_ip_off - + ETH_HLEN - 2)); + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)skb->data; + + if (iph->protocol == IPPROTO_UDP) + uh = (struct udphdr *)(iph + 1); + } else { + struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; + + if (iph->nexthdr == IPPROTO_UDP) + uh = (struct udphdr *)(iph + 1); + } + if (uh) { + if (uh->check) + skb_shinfo(skb)->gso_type |= + SKB_GSO_UDP_TUNNEL_CSUM; + else + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; + } + } +#endif + return skb; +} + +#define BNXT_IPV4_HDR_SIZE (sizeof(struct iphdr) + sizeof(struct tcphdr)) +#define BNXT_IPV6_HDR_SIZE (sizeof(struct ipv6hdr) + sizeof(struct tcphdr)) + +static struct sk_buff *bnxt_gro_func_5730x(struct bnxt_tpa_info *tpa_info, + int payload_off, int tcp_ts, + struct sk_buff *skb) +{ +#ifdef CONFIG_INET + struct tcphdr *th; + int len, nw_off, tcp_opt_len = 0; + + if (tcp_ts) + tcp_opt_len = 12; + + if (tpa_info->gso_type == SKB_GSO_TCPV4) { + struct iphdr *iph; + + nw_off = payload_off - BNXT_IPV4_HDR_SIZE - tcp_opt_len - + ETH_HLEN; + skb_set_network_header(skb, nw_off); + iph = ip_hdr(skb); + skb_set_transport_header(skb, nw_off + sizeof(struct iphdr)); + len = skb->len - skb_transport_offset(skb); + th = tcp_hdr(skb); + th->check = ~tcp_v4_check(len, iph->saddr, iph->daddr, 0); + } else if (tpa_info->gso_type == SKB_GSO_TCPV6) { + struct ipv6hdr *iph; + + nw_off = payload_off - BNXT_IPV6_HDR_SIZE - tcp_opt_len - + ETH_HLEN; + skb_set_network_header(skb, nw_off); + iph = ipv6_hdr(skb); + skb_set_transport_header(skb, nw_off + sizeof(struct ipv6hdr)); + len = skb->len - skb_transport_offset(skb); + th = tcp_hdr(skb); + th->check = ~tcp_v6_check(len, &iph->saddr, &iph->daddr, 0); + } else { + dev_kfree_skb_any(skb); + return NULL; + } + + if (nw_off) { /* tunnel */ + struct udphdr *uh = NULL; + + if (skb->protocol == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)skb->data; + + if (iph->protocol == IPPROTO_UDP) + uh = (struct udphdr *)(iph + 1); + } else { + struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; + + if (iph->nexthdr == IPPROTO_UDP) + uh = (struct udphdr *)(iph + 1); + } + if (uh) { + if (uh->check) + skb_shinfo(skb)->gso_type |= + SKB_GSO_UDP_TUNNEL_CSUM; + else + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; + } + } +#endif + return skb; +} + +static inline struct sk_buff *bnxt_gro_skb(struct bnxt *bp, + struct bnxt_tpa_info *tpa_info, + struct rx_tpa_end_cmp *tpa_end, + struct rx_tpa_end_cmp_ext *tpa_end1, + struct sk_buff *skb) +{ +#ifdef CONFIG_INET + int payload_off; + u16 segs; + + segs = 
TPA_END_TPA_SEGS(tpa_end); + if (segs == 1) + return skb; + + NAPI_GRO_CB(skb)->count = segs; + skb_shinfo(skb)->gso_size = + le32_to_cpu(tpa_end1->rx_tpa_end_cmp_seg_len); + skb_shinfo(skb)->gso_type = tpa_info->gso_type; + payload_off = (le32_to_cpu(tpa_end->rx_tpa_end_cmp_misc_v1) & + RX_TPA_END_CMP_PAYLOAD_OFFSET) >> + RX_TPA_END_CMP_PAYLOAD_OFFSET_SHIFT; + skb = bp->gro_func(tpa_info, payload_off, TPA_END_GRO_TS(tpa_end), skb); + if (likely(skb)) + tcp_gro_complete(skb); +#endif + return skb; +} + +/* Given the cfa_code of a received packet determine which + * netdev (vf-rep or PF) the packet is destined to. + */ +static struct net_device *bnxt_get_pkt_dev(struct bnxt *bp, u16 cfa_code) +{ + struct net_device *dev = bnxt_get_vf_rep(bp, cfa_code); + + /* if vf-rep dev is NULL, the must belongs to the PF */ + return dev ? dev : bp->dev; +} + +static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, + struct bnxt_napi *bnapi, + u32 *raw_cons, + struct rx_tpa_end_cmp *tpa_end, + struct rx_tpa_end_cmp_ext *tpa_end1, + u8 *event) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; + u8 agg_id = TPA_END_AGG_ID(tpa_end); + u8 *data_ptr, agg_bufs; + u16 cp_cons = RING_CMP(*raw_cons); + unsigned int len; + struct bnxt_tpa_info *tpa_info; + dma_addr_t mapping; + struct sk_buff *skb; + void *data; + + if (unlikely(bnapi->in_reset)) { + int rc = bnxt_discard_rx(bp, bnapi, raw_cons, tpa_end); + + if (rc < 0) + return ERR_PTR(-EBUSY); + return NULL; + } + + tpa_info = &rxr->rx_tpa[agg_id]; + data = tpa_info->data; + data_ptr = tpa_info->data_ptr; + prefetch(data_ptr); + len = tpa_info->len; + mapping = tpa_info->mapping; + + agg_bufs = (le32_to_cpu(tpa_end->rx_tpa_end_cmp_misc_v1) & + RX_TPA_END_CMP_AGG_BUFS) >> RX_TPA_END_CMP_AGG_BUFS_SHIFT; + + if (agg_bufs) { + if (!bnxt_agg_bufs_valid(bp, cpr, agg_bufs, raw_cons)) + return ERR_PTR(-EBUSY); + + *event |= BNXT_AGG_EVENT; + cp_cons = NEXT_CMP(cp_cons); + } + + if (unlikely(agg_bufs > MAX_SKB_FRAGS || TPA_END_ERRORS(tpa_end1))) { + bnxt_abort_tpa(bp, bnapi, cp_cons, agg_bufs); + if (agg_bufs > MAX_SKB_FRAGS) + netdev_warn(bp->dev, "TPA frags %d exceeded MAX_SKB_FRAGS %d\n", + agg_bufs, (int)MAX_SKB_FRAGS); + return NULL; + } + + if (len <= bp->rx_copy_thresh) { + skb = bnxt_copy_skb(bnapi, data_ptr, len, mapping); + if (!skb) { + bnxt_abort_tpa(bp, bnapi, cp_cons, agg_bufs); + return NULL; + } + } else { + u8 *new_data; + dma_addr_t new_mapping; + + new_data = __bnxt_alloc_rx_data(bp, &new_mapping, GFP_ATOMIC); + if (!new_data) { + bnxt_abort_tpa(bp, bnapi, cp_cons, agg_bufs); + return NULL; + } + + tpa_info->data = new_data; + tpa_info->data_ptr = new_data + bp->rx_offset; + tpa_info->mapping = new_mapping; + + skb = build_skb(data, 0); + dma_unmap_single_attrs(&bp->pdev->dev, mapping, + bp->rx_buf_use_size, bp->rx_dir, + DMA_ATTR_WEAK_ORDERING); + + if (!skb) { + kfree(data); + bnxt_abort_tpa(bp, bnapi, cp_cons, agg_bufs); + return NULL; + } + skb_reserve(skb, bp->rx_offset); + skb_put(skb, len); + } + + if (agg_bufs) { + skb = bnxt_rx_pages(bp, bnapi, skb, cp_cons, agg_bufs); + if (!skb) { + /* Page reuse already handled by bnxt_rx_pages(). 
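+ * The skb has been freed and the aggregation buffers recycled,
+ * so simply return NULL to the caller.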
*/ + return NULL; + } + } + + skb->protocol = + eth_type_trans(skb, bnxt_get_pkt_dev(bp, tpa_info->cfa_code)); + + if (tpa_info->hash_type != PKT_HASH_TYPE_NONE) + skb_set_hash(skb, tpa_info->rss_hash, tpa_info->hash_type); + + if ((tpa_info->flags2 & RX_CMP_FLAGS2_META_FORMAT_VLAN) && + (skb->dev->features & NETIF_F_HW_VLAN_CTAG_RX)) { + u16 vlan_proto = tpa_info->metadata >> + RX_CMP_FLAGS2_METADATA_TPID_SFT; + u16 vtag = tpa_info->metadata & RX_CMP_FLAGS2_METADATA_TCI_MASK; + + __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vtag); + } + + skb_checksum_none_assert(skb); + if (likely(tpa_info->flags2 & RX_TPA_START_CMP_FLAGS2_L4_CS_CALC)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = + (tpa_info->flags2 & RX_CMP_FLAGS2_T_L4_CS_CALC) >> 3; + } + + if (TPA_END_GRO(tpa_end)) + skb = bnxt_gro_skb(bp, tpa_info, tpa_end, tpa_end1, skb); + + return skb; +} + +static void bnxt_deliver_skb(struct bnxt *bp, struct bnxt_napi *bnapi, + struct sk_buff *skb) +{ + if (skb->dev != bp->dev) { + /* this packet belongs to a vf-rep */ + bnxt_vf_rep_rx(bp, skb); + return; + } + skb_record_rx_queue(skb, bnapi->index); + napi_gro_receive(&bnapi->napi, skb); +} + +/* returns the following: + * 1 - 1 packet successfully received + * 0 - successful TPA_START, packet not completed yet + * -EBUSY - completion ring does not have all the agg buffers yet + * -ENOMEM - packet aborted due to out of memory + * -EIO - packet aborted due to hw error indicated in BD + */ +static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, + u8 *event) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; + struct net_device *dev = bp->dev; + struct rx_cmp *rxcmp; + struct rx_cmp_ext *rxcmp1; + u32 tmp_raw_cons = *raw_cons; + u16 cfa_code, cons, prod, cp_cons = RING_CMP(tmp_raw_cons); + struct bnxt_sw_rx_bd *rx_buf; + unsigned int len; + u8 *data_ptr, agg_bufs, cmp_type; + dma_addr_t dma_addr; + struct sk_buff *skb; + void *data; + int rc = 0; + u32 misc; + + rxcmp = (struct rx_cmp *) + &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + + tmp_raw_cons = NEXT_RAW_CMP(tmp_raw_cons); + cp_cons = RING_CMP(tmp_raw_cons); + rxcmp1 = (struct rx_cmp_ext *) + &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + + if (!RX_CMP_VALID(rxcmp1, tmp_raw_cons)) + return -EBUSY; + + cmp_type = RX_CMP_TYPE(rxcmp); + + prod = rxr->rx_prod; + + if (cmp_type == CMP_TYPE_RX_L2_TPA_START_CMP) { + bnxt_tpa_start(bp, rxr, (struct rx_tpa_start_cmp *)rxcmp, + (struct rx_tpa_start_cmp_ext *)rxcmp1); + + *event |= BNXT_RX_EVENT; + goto next_rx_no_prod_no_len; + + } else if (cmp_type == CMP_TYPE_RX_L2_TPA_END_CMP) { + skb = bnxt_tpa_end(bp, bnapi, &tmp_raw_cons, + (struct rx_tpa_end_cmp *)rxcmp, + (struct rx_tpa_end_cmp_ext *)rxcmp1, event); + + if (IS_ERR(skb)) + return -EBUSY; + + rc = -ENOMEM; + if (likely(skb)) { + bnxt_deliver_skb(bp, bnapi, skb); + rc = 1; + } + *event |= BNXT_RX_EVENT; + goto next_rx_no_prod_no_len; + } + + cons = rxcmp->rx_cmp_opaque; + rx_buf = &rxr->rx_buf_ring[cons]; + data = rx_buf->data; + data_ptr = rx_buf->data_ptr; + if (unlikely(cons != rxr->rx_next_cons)) { + int rc1 = bnxt_discard_rx(bp, bnapi, raw_cons, rxcmp); + + bnxt_sched_reset(bp, rxr); + return rc1; + } + prefetch(data_ptr); + + misc = le32_to_cpu(rxcmp->rx_cmp_misc_v1); + agg_bufs = (misc & RX_CMP_AGG_BUFS) >> RX_CMP_AGG_BUFS_SHIFT; + + if (agg_bufs) { + if (!bnxt_agg_bufs_valid(bp, cpr, agg_bufs, &tmp_raw_cons)) + return -EBUSY; + + cp_cons = NEXT_CMP(cp_cons); + *event |= 
BNXT_AGG_EVENT; + } + *event |= BNXT_RX_EVENT; + + rx_buf->data = NULL; + if (rxcmp1->rx_cmp_cfa_code_errors_v2 & RX_CMP_L2_ERRORS) { + bnxt_reuse_rx_data(rxr, cons, data); + if (agg_bufs) + bnxt_reuse_rx_agg_bufs(bnapi, cp_cons, agg_bufs); + + rc = -EIO; + goto next_rx; + } + + len = le32_to_cpu(rxcmp->rx_cmp_len_flags_type) >> RX_CMP_LEN_SHIFT; + dma_addr = rx_buf->mapping; + + if (bnxt_rx_xdp(bp, rxr, cons, data, &data_ptr, &len, event)) { + rc = 1; + goto next_rx; + } + + if (len <= bp->rx_copy_thresh) { + skb = bnxt_copy_skb(bnapi, data_ptr, len, dma_addr); + bnxt_reuse_rx_data(rxr, cons, data); + if (!skb) { + rc = -ENOMEM; + goto next_rx; + } + } else { + u32 payload; + + if (rx_buf->data_ptr == data_ptr) + payload = misc & RX_CMP_PAYLOAD_OFFSET; + else + payload = 0; + skb = bp->rx_skb_func(bp, rxr, cons, data, data_ptr, dma_addr, + payload | len); + if (!skb) { + rc = -ENOMEM; + goto next_rx; + } + } + + if (agg_bufs) { + skb = bnxt_rx_pages(bp, bnapi, skb, cp_cons, agg_bufs); + if (!skb) { + rc = -ENOMEM; + goto next_rx; + } + } + + if (RX_CMP_HASH_VALID(rxcmp)) { + u32 hash_type = RX_CMP_HASH_TYPE(rxcmp); + enum pkt_hash_types type = PKT_HASH_TYPE_L4; + + /* RSS profiles 1 and 3 with extract code 0 for inner 4-tuple */ + if (hash_type != 1 && hash_type != 3) + type = PKT_HASH_TYPE_L3; + skb_set_hash(skb, le32_to_cpu(rxcmp->rx_cmp_rss_hash), type); + } + + cfa_code = RX_CMP_CFA_CODE(rxcmp1); + skb->protocol = eth_type_trans(skb, bnxt_get_pkt_dev(bp, cfa_code)); + + if ((rxcmp1->rx_cmp_flags2 & + cpu_to_le32(RX_CMP_FLAGS2_META_FORMAT_VLAN)) && + (skb->dev->features & NETIF_F_HW_VLAN_CTAG_RX)) { + u32 meta_data = le32_to_cpu(rxcmp1->rx_cmp_meta_data); + u16 vtag = meta_data & RX_CMP_FLAGS2_METADATA_TCI_MASK; + u16 vlan_proto = meta_data >> RX_CMP_FLAGS2_METADATA_TPID_SFT; + + __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vtag); + } + + skb_checksum_none_assert(skb); + if (RX_CMP_L4_CS_OK(rxcmp1)) { + if (dev->features & NETIF_F_RXCSUM) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = RX_CMP_ENCAP(rxcmp1); + } + } else { + if (rxcmp1->rx_cmp_cfa_code_errors_v2 & RX_CMP_L4_CS_ERR_BITS) { + if (dev->features & NETIF_F_RXCSUM) + cpr->rx_l4_csum_errors++; + } + } + + bnxt_deliver_skb(bp, bnapi, skb); + rc = 1; + +next_rx: + rxr->rx_prod = NEXT_RX(prod); + rxr->rx_next_cons = NEXT_RX(cons); + + cpr->rx_packets += 1; + cpr->rx_bytes += len; + +next_rx_no_prod_no_len: + *raw_cons = tmp_raw_cons; + + return rc; +} + +/* In netpoll mode, if we are using a combined completion ring, we need to + * discard the rx packets and recycle the buffers. 
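+ * This is done by marking the completion with an error before calling
+ * bnxt_rx_pkt(), which then takes its buffer-reuse path instead of
+ * building an skb.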
+ */ +static int bnxt_force_rx_discard(struct bnxt *bp, struct bnxt_napi *bnapi, + u32 *raw_cons, u8 *event) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + u32 tmp_raw_cons = *raw_cons; + struct rx_cmp_ext *rxcmp1; + struct rx_cmp *rxcmp; + u16 cp_cons; + u8 cmp_type; + + cp_cons = RING_CMP(tmp_raw_cons); + rxcmp = (struct rx_cmp *) + &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + + tmp_raw_cons = NEXT_RAW_CMP(tmp_raw_cons); + cp_cons = RING_CMP(tmp_raw_cons); + rxcmp1 = (struct rx_cmp_ext *) + &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + + if (!RX_CMP_VALID(rxcmp1, tmp_raw_cons)) + return -EBUSY; + + cmp_type = RX_CMP_TYPE(rxcmp); + if (cmp_type == CMP_TYPE_RX_L2_CMP) { + rxcmp1->rx_cmp_cfa_code_errors_v2 |= + cpu_to_le32(RX_CMPL_ERRORS_CRC_ERROR); + } else if (cmp_type == CMP_TYPE_RX_L2_TPA_END_CMP) { + struct rx_tpa_end_cmp_ext *tpa_end1; + + tpa_end1 = (struct rx_tpa_end_cmp_ext *)rxcmp1; + tpa_end1->rx_tpa_end_cmp_errors_v2 |= + cpu_to_le32(RX_TPA_END_CMP_ERRORS); + } + return bnxt_rx_pkt(bp, bnapi, raw_cons, event); +} + +#define BNXT_GET_EVENT_PORT(data) \ + ((data) & \ + ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_MASK) + +static int bnxt_async_event_process(struct bnxt *bp, + struct hwrm_async_event_cmpl *cmpl) +{ + u16 event_id = le16_to_cpu(cmpl->event_id); + + /* TODO CHIMP_FW: Define event id's for link change, error etc */ + switch (event_id) { + case ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE: { + u32 data1 = le32_to_cpu(cmpl->event_data1); + struct bnxt_link_info *link_info = &bp->link_info; + + if (BNXT_VF(bp)) + goto async_event_process_exit; + + /* print unsupported speed warning in forced speed mode only */ + if (!(link_info->autoneg & BNXT_AUTONEG_SPEED) && + (data1 & 0x20000)) { + u16 fw_speed = link_info->force_link_speed; + u32 speed = bnxt_fw_to_ethtool_speed(fw_speed); + + if (speed != SPEED_UNKNOWN) + netdev_warn(bp->dev, "Link speed %d no longer supported\n", + speed); + } + set_bit(BNXT_LINK_SPEED_CHNG_SP_EVENT, &bp->sp_event); + /* fall thru */ + } + case ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE: + set_bit(BNXT_LINK_CHNG_SP_EVENT, &bp->sp_event); + break; + case ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD: + set_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event); + break; + case ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED: { + u32 data1 = le32_to_cpu(cmpl->event_data1); + u16 port_id = BNXT_GET_EVENT_PORT(data1); + + if (BNXT_VF(bp)) + break; + + if (bp->pf.port_id != port_id) + break; + + set_bit(BNXT_HWRM_PORT_MODULE_SP_EVENT, &bp->sp_event); + break; + } + case ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE: + if (BNXT_PF(bp)) + goto async_event_process_exit; + set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event); + break; + default: + goto async_event_process_exit; + } + bnxt_queue_sp_work(bp); +async_event_process_exit: + bnxt_ulp_async_events(bp, cmpl); + return 0; +} + +static int bnxt_hwrm_handler(struct bnxt *bp, struct tx_cmp *txcmp) +{ + u16 cmpl_type = TX_CMP_TYPE(txcmp), vf_id, seq_id; + struct hwrm_cmpl *h_cmpl = (struct hwrm_cmpl *)txcmp; + struct hwrm_fwd_req_cmpl *fwd_req_cmpl = + (struct hwrm_fwd_req_cmpl *)txcmp; + + switch (cmpl_type) { + case CMPL_BASE_TYPE_HWRM_DONE: + seq_id = le16_to_cpu(h_cmpl->sequence_id); + if (seq_id == bp->hwrm_intr_seq_id) + bp->hwrm_intr_seq_id = HWRM_SEQ_ID_INVALID; + else + netdev_err(bp->dev, "Invalid hwrm seq id %d\n", seq_id); + break; + + case CMPL_BASE_TYPE_HWRM_FWD_REQ: + vf_id = le16_to_cpu(fwd_req_cmpl->source_id); + + if ((vf_id < bp->pf.first_vf_id) 
|| + (vf_id >= bp->pf.first_vf_id + bp->pf.active_vfs)) { + netdev_err(bp->dev, "Msg contains invalid VF id %x\n", + vf_id); + return -EINVAL; + } + + set_bit(vf_id - bp->pf.first_vf_id, bp->pf.vf_event_bmap); + set_bit(BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT, &bp->sp_event); + bnxt_queue_sp_work(bp); + break; + + case CMPL_BASE_TYPE_HWRM_ASYNC_EVENT: + bnxt_async_event_process(bp, + (struct hwrm_async_event_cmpl *)txcmp); + + default: + break; + } + + return 0; +} + +static irqreturn_t bnxt_msix(int irq, void *dev_instance) +{ + struct bnxt_napi *bnapi = dev_instance; + struct bnxt *bp = bnapi->bp; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + u32 cons = RING_CMP(cpr->cp_raw_cons); + + cpr->event_ctr++; + prefetch(&cpr->cp_desc_ring[CP_RING(cons)][CP_IDX(cons)]); + napi_schedule(&bnapi->napi); + return IRQ_HANDLED; +} + +static inline int bnxt_has_work(struct bnxt *bp, struct bnxt_cp_ring_info *cpr) +{ + u32 raw_cons = cpr->cp_raw_cons; + u16 cons = RING_CMP(raw_cons); + struct tx_cmp *txcmp; + + txcmp = &cpr->cp_desc_ring[CP_RING(cons)][CP_IDX(cons)]; + + return TX_CMP_VALID(txcmp, raw_cons); +} + +static irqreturn_t bnxt_inta(int irq, void *dev_instance) +{ + struct bnxt_napi *bnapi = dev_instance; + struct bnxt *bp = bnapi->bp; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + u32 cons = RING_CMP(cpr->cp_raw_cons); + u32 int_status; + + prefetch(&cpr->cp_desc_ring[CP_RING(cons)][CP_IDX(cons)]); + + if (!bnxt_has_work(bp, cpr)) { + int_status = readl(bp->bar0 + BNXT_CAG_REG_LEGACY_INT_STATUS); + /* return if erroneous interrupt */ + if (!(int_status & (0x10000 << cpr->cp_ring_struct.fw_ring_id))) + return IRQ_NONE; + } + + /* disable ring IRQ */ + BNXT_CP_DB_IRQ_DIS(cpr->cp_doorbell); + + /* Return here if interrupt is shared and is disabled. */ + if (unlikely(atomic_read(&bp->intr_sem) != 0)) + return IRQ_HANDLED; + + napi_schedule(&bnapi->napi); + return IRQ_HANDLED; +} + +static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + u32 raw_cons = cpr->cp_raw_cons; + u32 cons; + int tx_pkts = 0; + int rx_pkts = 0; + u8 event = 0; + struct tx_cmp *txcmp; + + while (1) { + int rc; + + cons = RING_CMP(raw_cons); + txcmp = &cpr->cp_desc_ring[CP_RING(cons)][CP_IDX(cons)]; + + if (!TX_CMP_VALID(txcmp, raw_cons)) + break; + + /* The valid test of the entry must be done first before + * reading any further. + */ + dma_rmb(); + if (TX_CMP_TYPE(txcmp) == CMP_TYPE_TX_L2_CMP) { + tx_pkts++; + /* return full budget so NAPI will complete. */ + if (unlikely(tx_pkts > bp->tx_wake_thresh)) + rx_pkts = budget; + } else if ((TX_CMP_TYPE(txcmp) & 0x30) == 0x10) { + if (likely(budget)) + rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event); + else + rc = bnxt_force_rx_discard(bp, bnapi, &raw_cons, + &event); + if (likely(rc >= 0)) + rx_pkts += rc; + /* Increment rx_pkts when rc is -ENOMEM to count towards + * the NAPI budget. Otherwise, we may potentially loop + * here forever if we consistently cannot allocate + * buffers. 
+ */ + else if (rc == -ENOMEM && budget) + rx_pkts++; + else if (rc == -EBUSY) /* partial completion */ + break; + } else if (unlikely((TX_CMP_TYPE(txcmp) == + CMPL_BASE_TYPE_HWRM_DONE) || + (TX_CMP_TYPE(txcmp) == + CMPL_BASE_TYPE_HWRM_FWD_REQ) || + (TX_CMP_TYPE(txcmp) == + CMPL_BASE_TYPE_HWRM_ASYNC_EVENT))) { + bnxt_hwrm_handler(bp, txcmp); + } + raw_cons = NEXT_RAW_CMP(raw_cons); + + if (rx_pkts == budget) + break; + } + + if (event & BNXT_TX_EVENT) { + struct bnxt_tx_ring_info *txr = bnapi->tx_ring; + void __iomem *db = txr->tx_doorbell; + u16 prod = txr->tx_prod; + + /* Sync BD data before updating doorbell */ + wmb(); + + bnxt_db_write_relaxed(bp, db, DB_KEY_TX | prod); + } + + cpr->cp_raw_cons = raw_cons; + /* ACK completion ring before freeing tx ring and producing new + * buffers in rx/agg rings to prevent overflowing the completion + * ring. + */ + BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons); + + if (tx_pkts) + bnapi->tx_int(bp, bnapi, tx_pkts); + + if (event & BNXT_RX_EVENT) { + struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; + + bnxt_db_write(bp, rxr->rx_doorbell, DB_KEY_RX | rxr->rx_prod); + if (event & BNXT_AGG_EVENT) + bnxt_db_write(bp, rxr->rx_agg_doorbell, + DB_KEY_RX | rxr->rx_agg_prod); + } + return rx_pkts; +} + +static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget) +{ + struct bnxt_napi *bnapi = container_of(napi, struct bnxt_napi, napi); + struct bnxt *bp = bnapi->bp; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; + struct tx_cmp *txcmp; + struct rx_cmp_ext *rxcmp1; + u32 cp_cons, tmp_raw_cons; + u32 raw_cons = cpr->cp_raw_cons; + u32 rx_pkts = 0; + u8 event = 0; + + while (1) { + int rc; + + cp_cons = RING_CMP(raw_cons); + txcmp = &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + + if (!TX_CMP_VALID(txcmp, raw_cons)) + break; + + if ((TX_CMP_TYPE(txcmp) & 0x30) == 0x10) { + tmp_raw_cons = NEXT_RAW_CMP(raw_cons); + cp_cons = RING_CMP(tmp_raw_cons); + rxcmp1 = (struct rx_cmp_ext *) + &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + + if (!RX_CMP_VALID(rxcmp1, tmp_raw_cons)) + break; + + /* force an error to recycle the buffer */ + rxcmp1->rx_cmp_cfa_code_errors_v2 |= + cpu_to_le32(RX_CMPL_ERRORS_CRC_ERROR); + + rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event); + if (likely(rc == -EIO) && budget) + rx_pkts++; + else if (rc == -EBUSY) /* partial completion */ + break; + } else if (unlikely(TX_CMP_TYPE(txcmp) == + CMPL_BASE_TYPE_HWRM_DONE)) { + bnxt_hwrm_handler(bp, txcmp); + } else { + netdev_err(bp->dev, + "Invalid completion received on special ring\n"); + } + raw_cons = NEXT_RAW_CMP(raw_cons); + + if (rx_pkts == budget) + break; + } + + cpr->cp_raw_cons = raw_cons; + BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons); + bnxt_db_write(bp, rxr->rx_doorbell, DB_KEY_RX | rxr->rx_prod); + + if (event & BNXT_AGG_EVENT) + bnxt_db_write(bp, rxr->rx_agg_doorbell, + DB_KEY_RX | rxr->rx_agg_prod); + + if (!bnxt_has_work(bp, cpr) && rx_pkts < budget) { + napi_complete_done(napi, rx_pkts); + BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons); + } + return rx_pkts; +} + +static int bnxt_poll(struct napi_struct *napi, int budget) +{ + struct bnxt_napi *bnapi = container_of(napi, struct bnxt_napi, napi); + struct bnxt *bp = bnapi->bp; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + int work_done = 0; + + while (1) { + work_done += bnxt_poll_work(bp, bnapi, budget - work_done); + + if (work_done >= budget) + break; + + if (!bnxt_has_work(bp, cpr)) { + if (napi_complete_done(napi, work_done)) + 
BNXT_CP_DB_REARM(cpr->cp_doorbell, + cpr->cp_raw_cons); + break; + } + } + if (bp->flags & BNXT_FLAG_DIM) { + struct net_dim_sample dim_sample; + + net_dim_sample(cpr->event_ctr, + cpr->rx_packets, + cpr->rx_bytes, + &dim_sample); + net_dim(&cpr->dim, dim_sample); + } + mmiowb(); + return work_done; +} + +static void bnxt_free_tx_skbs(struct bnxt *bp) +{ + int i, max_idx; + struct pci_dev *pdev = bp->pdev; + + if (!bp->tx_ring) + return; + + max_idx = bp->tx_nr_pages * TX_DESC_CNT; + for (i = 0; i < bp->tx_nr_rings; i++) { + struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; + int j; + + for (j = 0; j < max_idx;) { + struct bnxt_sw_tx_bd *tx_buf = &txr->tx_buf_ring[j]; + struct sk_buff *skb = tx_buf->skb; + int k, last; + + if (!skb) { + j++; + continue; + } + + tx_buf->skb = NULL; + + if (tx_buf->is_push) { + dev_kfree_skb(skb); + j += 2; + continue; + } + + dma_unmap_single(&pdev->dev, + dma_unmap_addr(tx_buf, mapping), + skb_headlen(skb), + PCI_DMA_TODEVICE); + + last = tx_buf->nr_frags; + j += 2; + for (k = 0; k < last; k++, j++) { + int ring_idx = j & bp->tx_ring_mask; + skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; + + tx_buf = &txr->tx_buf_ring[ring_idx]; + dma_unmap_page( + &pdev->dev, + dma_unmap_addr(tx_buf, mapping), + skb_frag_size(frag), PCI_DMA_TODEVICE); + } + dev_kfree_skb(skb); + } + netdev_tx_reset_queue(netdev_get_tx_queue(bp->dev, i)); + } +} + +static void bnxt_free_rx_skbs(struct bnxt *bp) +{ + int i, max_idx, max_agg_idx; + struct pci_dev *pdev = bp->pdev; + + if (!bp->rx_ring) + return; + + max_idx = bp->rx_nr_pages * RX_DESC_CNT; + max_agg_idx = bp->rx_agg_nr_pages * RX_DESC_CNT; + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + int j; + + if (rxr->rx_tpa) { + for (j = 0; j < MAX_TPA; j++) { + struct bnxt_tpa_info *tpa_info = + &rxr->rx_tpa[j]; + u8 *data = tpa_info->data; + + if (!data) + continue; + + dma_unmap_single_attrs(&pdev->dev, + tpa_info->mapping, + bp->rx_buf_use_size, + bp->rx_dir, + DMA_ATTR_WEAK_ORDERING); + + tpa_info->data = NULL; + + kfree(data); + } + } + + for (j = 0; j < max_idx; j++) { + struct bnxt_sw_rx_bd *rx_buf = &rxr->rx_buf_ring[j]; + dma_addr_t mapping = rx_buf->mapping; + void *data = rx_buf->data; + + if (!data) + continue; + + rx_buf->data = NULL; + + if (BNXT_RX_PAGE_MODE(bp)) { + mapping -= bp->rx_dma_offset; + dma_unmap_page_attrs(&pdev->dev, mapping, + PAGE_SIZE, bp->rx_dir, + DMA_ATTR_WEAK_ORDERING); + __free_page(data); + } else { + dma_unmap_single_attrs(&pdev->dev, mapping, + bp->rx_buf_use_size, + bp->rx_dir, + DMA_ATTR_WEAK_ORDERING); + kfree(data); + } + } + + for (j = 0; j < max_agg_idx; j++) { + struct bnxt_sw_rx_agg_bd *rx_agg_buf = + &rxr->rx_agg_ring[j]; + struct page *page = rx_agg_buf->page; + + if (!page) + continue; + + dma_unmap_page_attrs(&pdev->dev, rx_agg_buf->mapping, + BNXT_RX_PAGE_SIZE, + PCI_DMA_FROMDEVICE, + DMA_ATTR_WEAK_ORDERING); + + rx_agg_buf->page = NULL; + __clear_bit(j, rxr->rx_agg_bmap); + + __free_page(page); + } + if (rxr->rx_page) { + __free_page(rxr->rx_page); + rxr->rx_page = NULL; + } + } +} + +static void bnxt_free_skbs(struct bnxt *bp) +{ + bnxt_free_tx_skbs(bp); + bnxt_free_rx_skbs(bp); +} + +static void bnxt_free_ring(struct bnxt *bp, struct bnxt_ring_struct *ring) +{ + struct pci_dev *pdev = bp->pdev; + int i; + + for (i = 0; i < ring->nr_pages; i++) { + if (!ring->pg_arr[i]) + continue; + + dma_free_coherent(&pdev->dev, ring->page_size, + ring->pg_arr[i], ring->dma_arr[i]); + + ring->pg_arr[i] = NULL; + } + if (ring->pg_tbl) { + 
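+ /* The page table is only allocated when the ring spans multiple pages */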
dma_free_coherent(&pdev->dev, ring->nr_pages * 8, + ring->pg_tbl, ring->pg_tbl_map); + ring->pg_tbl = NULL; + } + if (ring->vmem_size && *ring->vmem) { + vfree(*ring->vmem); + *ring->vmem = NULL; + } +} + +static int bnxt_alloc_ring(struct bnxt *bp, struct bnxt_ring_struct *ring) +{ + int i; + struct pci_dev *pdev = bp->pdev; + + if (ring->nr_pages > 1) { + ring->pg_tbl = dma_alloc_coherent(&pdev->dev, + ring->nr_pages * 8, + &ring->pg_tbl_map, + GFP_KERNEL); + if (!ring->pg_tbl) + return -ENOMEM; + } + + for (i = 0; i < ring->nr_pages; i++) { + ring->pg_arr[i] = dma_alloc_coherent(&pdev->dev, + ring->page_size, + &ring->dma_arr[i], + GFP_KERNEL); + if (!ring->pg_arr[i]) + return -ENOMEM; + + if (ring->nr_pages > 1) + ring->pg_tbl[i] = cpu_to_le64(ring->dma_arr[i]); + } + + if (ring->vmem_size) { + *ring->vmem = vzalloc(ring->vmem_size); + if (!(*ring->vmem)) + return -ENOMEM; + } + return 0; +} + +static void bnxt_free_rx_rings(struct bnxt *bp) +{ + int i; + + if (!bp->rx_ring) + return; + + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + struct bnxt_ring_struct *ring; + + if (rxr->xdp_prog) + bpf_prog_put(rxr->xdp_prog); + + if (xdp_rxq_info_is_reg(&rxr->xdp_rxq)) + xdp_rxq_info_unreg(&rxr->xdp_rxq); + + kfree(rxr->rx_tpa); + rxr->rx_tpa = NULL; + + kfree(rxr->rx_agg_bmap); + rxr->rx_agg_bmap = NULL; + + ring = &rxr->rx_ring_struct; + bnxt_free_ring(bp, ring); + + ring = &rxr->rx_agg_ring_struct; + bnxt_free_ring(bp, ring); + } +} + +static int bnxt_alloc_rx_rings(struct bnxt *bp) +{ + int i, rc, agg_rings = 0, tpa_rings = 0; + + if (!bp->rx_ring) + return -ENOMEM; + + if (bp->flags & BNXT_FLAG_AGG_RINGS) + agg_rings = 1; + + if (bp->flags & BNXT_FLAG_TPA) + tpa_rings = 1; + + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + struct bnxt_ring_struct *ring; + + ring = &rxr->rx_ring_struct; + + rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i); + if (rc < 0) + return rc; + + rc = bnxt_alloc_ring(bp, ring); + if (rc) + return rc; + + if (agg_rings) { + u16 mem_size; + + ring = &rxr->rx_agg_ring_struct; + rc = bnxt_alloc_ring(bp, ring); + if (rc) + return rc; + + ring->grp_idx = i; + rxr->rx_agg_bmap_size = bp->rx_agg_ring_mask + 1; + mem_size = rxr->rx_agg_bmap_size / 8; + rxr->rx_agg_bmap = kzalloc(mem_size, GFP_KERNEL); + if (!rxr->rx_agg_bmap) + return -ENOMEM; + + if (tpa_rings) { + rxr->rx_tpa = kcalloc(MAX_TPA, + sizeof(struct bnxt_tpa_info), + GFP_KERNEL); + if (!rxr->rx_tpa) + return -ENOMEM; + } + } + } + return 0; +} + +static void bnxt_free_tx_rings(struct bnxt *bp) +{ + int i; + struct pci_dev *pdev = bp->pdev; + + if (!bp->tx_ring) + return; + + for (i = 0; i < bp->tx_nr_rings; i++) { + struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; + struct bnxt_ring_struct *ring; + + if (txr->tx_push) { + dma_free_coherent(&pdev->dev, bp->tx_push_size, + txr->tx_push, txr->tx_push_mapping); + txr->tx_push = NULL; + } + + ring = &txr->tx_ring_struct; + + bnxt_free_ring(bp, ring); + } +} + +static int bnxt_alloc_tx_rings(struct bnxt *bp) +{ + int i, j, rc; + struct pci_dev *pdev = bp->pdev; + + bp->tx_push_size = 0; + if (bp->tx_push_thresh) { + int push_size; + + push_size = L1_CACHE_ALIGN(sizeof(struct tx_push_bd) + + bp->tx_push_thresh); + + if (push_size > 256) { + push_size = 0; + bp->tx_push_thresh = 0; + } + + bp->tx_push_size = push_size; + } + + for (i = 0, j = 0; i < bp->tx_nr_rings; i++) { + struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; + struct bnxt_ring_struct *ring; + + ring = 
&txr->tx_ring_struct; + + rc = bnxt_alloc_ring(bp, ring); + if (rc) + return rc; + + ring->grp_idx = txr->bnapi->index; + if (bp->tx_push_size) { + dma_addr_t mapping; + + /* One pre-allocated DMA buffer to backup + * TX push operation + */ + txr->tx_push = dma_alloc_coherent(&pdev->dev, + bp->tx_push_size, + &txr->tx_push_mapping, + GFP_KERNEL); + + if (!txr->tx_push) + return -ENOMEM; + + mapping = txr->tx_push_mapping + + sizeof(struct tx_push_bd); + txr->data_mapping = cpu_to_le64(mapping); + + memset(txr->tx_push, 0, sizeof(struct tx_push_bd)); + } + ring->queue_id = bp->q_info[j].queue_id; + if (i < bp->tx_nr_rings_xdp) + continue; + if (i % bp->tx_nr_rings_per_tc == (bp->tx_nr_rings_per_tc - 1)) + j++; + } + return 0; +} + +static void bnxt_free_cp_rings(struct bnxt *bp) +{ + int i; + + if (!bp->bnapi) + return; + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr; + struct bnxt_ring_struct *ring; + + if (!bnapi) + continue; + + cpr = &bnapi->cp_ring; + ring = &cpr->cp_ring_struct; + + bnxt_free_ring(bp, ring); + } +} + +static int bnxt_alloc_cp_rings(struct bnxt *bp) +{ + int i, rc, ulp_base_vec, ulp_msix; + + ulp_msix = bnxt_get_ulp_msix_num(bp); + ulp_base_vec = bnxt_get_ulp_msix_base(bp); + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr; + struct bnxt_ring_struct *ring; + + if (!bnapi) + continue; + + cpr = &bnapi->cp_ring; + ring = &cpr->cp_ring_struct; + + rc = bnxt_alloc_ring(bp, ring); + if (rc) + return rc; + + if (ulp_msix && i >= ulp_base_vec) + ring->map_idx = i + ulp_msix; + else + ring->map_idx = i; + } + return 0; +} + +static void bnxt_init_ring_struct(struct bnxt *bp) +{ + int i; + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr; + struct bnxt_rx_ring_info *rxr; + struct bnxt_tx_ring_info *txr; + struct bnxt_ring_struct *ring; + + if (!bnapi) + continue; + + cpr = &bnapi->cp_ring; + ring = &cpr->cp_ring_struct; + ring->nr_pages = bp->cp_nr_pages; + ring->page_size = HW_CMPD_RING_SIZE; + ring->pg_arr = (void **)cpr->cp_desc_ring; + ring->dma_arr = cpr->cp_desc_mapping; + ring->vmem_size = 0; + + rxr = bnapi->rx_ring; + if (!rxr) + goto skip_rx; + + ring = &rxr->rx_ring_struct; + ring->nr_pages = bp->rx_nr_pages; + ring->page_size = HW_RXBD_RING_SIZE; + ring->pg_arr = (void **)rxr->rx_desc_ring; + ring->dma_arr = rxr->rx_desc_mapping; + ring->vmem_size = SW_RXBD_RING_SIZE * bp->rx_nr_pages; + ring->vmem = (void **)&rxr->rx_buf_ring; + + ring = &rxr->rx_agg_ring_struct; + ring->nr_pages = bp->rx_agg_nr_pages; + ring->page_size = HW_RXBD_RING_SIZE; + ring->pg_arr = (void **)rxr->rx_agg_desc_ring; + ring->dma_arr = rxr->rx_agg_desc_mapping; + ring->vmem_size = SW_RXBD_AGG_RING_SIZE * bp->rx_agg_nr_pages; + ring->vmem = (void **)&rxr->rx_agg_ring; + +skip_rx: + txr = bnapi->tx_ring; + if (!txr) + continue; + + ring = &txr->tx_ring_struct; + ring->nr_pages = bp->tx_nr_pages; + ring->page_size = HW_RXBD_RING_SIZE; + ring->pg_arr = (void **)txr->tx_desc_ring; + ring->dma_arr = txr->tx_desc_mapping; + ring->vmem_size = SW_TXBD_RING_SIZE * bp->tx_nr_pages; + ring->vmem = (void **)&txr->tx_buf_ring; + } +} + +static void bnxt_init_rxbd_pages(struct bnxt_ring_struct *ring, u32 type) +{ + int i; + u32 prod; + struct rx_bd **rx_buf_ring; + + rx_buf_ring = (struct rx_bd **)ring->pg_arr; + for (i = 0, prod = 0; i < ring->nr_pages; i++) { + int j; + struct rx_bd *rxbd; + + rxbd = 
rx_buf_ring[i]; + if (!rxbd) + continue; + + for (j = 0; j < RX_DESC_CNT; j++, rxbd++, prod++) { + rxbd->rx_bd_len_flags_type = cpu_to_le32(type); + rxbd->rx_bd_opaque = prod; + } + } +} + +static int bnxt_init_one_rx_ring(struct bnxt *bp, int ring_nr) +{ + struct net_device *dev = bp->dev; + struct bnxt_rx_ring_info *rxr; + struct bnxt_ring_struct *ring; + u32 prod, type; + int i; + + type = (bp->rx_buf_use_size << RX_BD_LEN_SHIFT) | + RX_BD_TYPE_RX_PACKET_BD | RX_BD_FLAGS_EOP; + + if (NET_IP_ALIGN == 2) + type |= RX_BD_FLAGS_SOP; + + rxr = &bp->rx_ring[ring_nr]; + ring = &rxr->rx_ring_struct; + bnxt_init_rxbd_pages(ring, type); + + if (BNXT_RX_PAGE_MODE(bp) && bp->xdp_prog) { + rxr->xdp_prog = bpf_prog_add(bp->xdp_prog, 1); + if (IS_ERR(rxr->xdp_prog)) { + int rc = PTR_ERR(rxr->xdp_prog); + + rxr->xdp_prog = NULL; + return rc; + } + } + prod = rxr->rx_prod; + for (i = 0; i < bp->rx_ring_size; i++) { + if (bnxt_alloc_rx_data(bp, rxr, prod, GFP_KERNEL) != 0) { + netdev_warn(dev, "init'ed rx ring %d with %d/%d skbs only\n", + ring_nr, i, bp->rx_ring_size); + break; + } + prod = NEXT_RX(prod); + } + rxr->rx_prod = prod; + ring->fw_ring_id = INVALID_HW_RING_ID; + + ring = &rxr->rx_agg_ring_struct; + ring->fw_ring_id = INVALID_HW_RING_ID; + + if (!(bp->flags & BNXT_FLAG_AGG_RINGS)) + return 0; + + type = ((u32)BNXT_RX_PAGE_SIZE << RX_BD_LEN_SHIFT) | + RX_BD_TYPE_RX_AGG_BD | RX_BD_FLAGS_SOP; + + bnxt_init_rxbd_pages(ring, type); + + prod = rxr->rx_agg_prod; + for (i = 0; i < bp->rx_agg_ring_size; i++) { + if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL) != 0) { + netdev_warn(dev, "init'ed rx ring %d with %d/%d pages only\n", + ring_nr, i, bp->rx_ring_size); + break; + } + prod = NEXT_RX_AGG(prod); + } + rxr->rx_agg_prod = prod; + + if (bp->flags & BNXT_FLAG_TPA) { + if (rxr->rx_tpa) { + u8 *data; + dma_addr_t mapping; + + for (i = 0; i < MAX_TPA; i++) { + data = __bnxt_alloc_rx_data(bp, &mapping, + GFP_KERNEL); + if (!data) + return -ENOMEM; + + rxr->rx_tpa[i].data = data; + rxr->rx_tpa[i].data_ptr = data + bp->rx_offset; + rxr->rx_tpa[i].mapping = mapping; + } + } else { + netdev_err(bp->dev, "No resource allocated for LRO/GRO\n"); + return -ENOMEM; + } + } + + return 0; +} + +static void bnxt_init_cp_rings(struct bnxt *bp) +{ + int i; + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring; + struct bnxt_ring_struct *ring = &cpr->cp_ring_struct; + + ring->fw_ring_id = INVALID_HW_RING_ID; + cpr->rx_ring_coal.coal_ticks = bp->rx_coal.coal_ticks; + cpr->rx_ring_coal.coal_bufs = bp->rx_coal.coal_bufs; + } +} + +static int bnxt_init_rx_rings(struct bnxt *bp) +{ + int i, rc = 0; + + if (BNXT_RX_PAGE_MODE(bp)) { + bp->rx_offset = NET_IP_ALIGN + XDP_PACKET_HEADROOM; + bp->rx_dma_offset = XDP_PACKET_HEADROOM; + } else { + bp->rx_offset = BNXT_RX_OFFSET; + bp->rx_dma_offset = BNXT_RX_DMA_OFFSET; + } + + for (i = 0; i < bp->rx_nr_rings; i++) { + rc = bnxt_init_one_rx_ring(bp, i); + if (rc) + break; + } + + return rc; +} + +static int bnxt_init_tx_rings(struct bnxt *bp) +{ + u16 i; + + bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2, + MAX_SKB_FRAGS + 1); + + for (i = 0; i < bp->tx_nr_rings; i++) { + struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; + struct bnxt_ring_struct *ring = &txr->tx_ring_struct; + + ring->fw_ring_id = INVALID_HW_RING_ID; + } + + return 0; +} + +static void bnxt_free_ring_grps(struct bnxt *bp) +{ + kfree(bp->grp_info); + bp->grp_info = NULL; +} + +static int bnxt_init_ring_grps(struct bnxt *bp, bool irq_re_init) +{ + int i; + + if 
(irq_re_init) { + bp->grp_info = kcalloc(bp->cp_nr_rings, + sizeof(struct bnxt_ring_grp_info), + GFP_KERNEL); + if (!bp->grp_info) + return -ENOMEM; + } + for (i = 0; i < bp->cp_nr_rings; i++) { + if (irq_re_init) + bp->grp_info[i].fw_stats_ctx = INVALID_HW_RING_ID; + bp->grp_info[i].fw_grp_id = INVALID_HW_RING_ID; + bp->grp_info[i].rx_fw_ring_id = INVALID_HW_RING_ID; + bp->grp_info[i].agg_fw_ring_id = INVALID_HW_RING_ID; + bp->grp_info[i].cp_fw_ring_id = INVALID_HW_RING_ID; + } + return 0; +} + +static void bnxt_free_vnics(struct bnxt *bp) +{ + kfree(bp->vnic_info); + bp->vnic_info = NULL; + bp->nr_vnics = 0; +} + +static int bnxt_alloc_vnics(struct bnxt *bp) +{ + int num_vnics = 1; + +#ifdef CONFIG_RFS_ACCEL + if (bp->flags & BNXT_FLAG_RFS) + num_vnics += bp->rx_nr_rings; +#endif + + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) + num_vnics++; + + bp->vnic_info = kcalloc(num_vnics, sizeof(struct bnxt_vnic_info), + GFP_KERNEL); + if (!bp->vnic_info) + return -ENOMEM; + + bp->nr_vnics = num_vnics; + return 0; +} + +static void bnxt_init_vnics(struct bnxt *bp) +{ + int i; + + for (i = 0; i < bp->nr_vnics; i++) { + struct bnxt_vnic_info *vnic = &bp->vnic_info[i]; + + vnic->fw_vnic_id = INVALID_HW_RING_ID; + vnic->fw_rss_cos_lb_ctx[0] = INVALID_HW_RING_ID; + vnic->fw_rss_cos_lb_ctx[1] = INVALID_HW_RING_ID; + vnic->fw_l2_ctx_id = INVALID_HW_RING_ID; + + if (bp->vnic_info[i].rss_hash_key) { + if (i == 0) + prandom_bytes(vnic->rss_hash_key, + HW_HASH_KEY_SIZE); + else + memcpy(vnic->rss_hash_key, + bp->vnic_info[0].rss_hash_key, + HW_HASH_KEY_SIZE); + } + } +} + +static int bnxt_calc_nr_ring_pages(u32 ring_size, int desc_per_pg) +{ + int pages; + + pages = ring_size / desc_per_pg; + + if (!pages) + return 1; + + pages++; + + while (pages & (pages - 1)) + pages++; + + return pages; +} + +void bnxt_set_tpa_flags(struct bnxt *bp) +{ + bp->flags &= ~BNXT_FLAG_TPA; + if (bp->flags & BNXT_FLAG_NO_AGG_RINGS) + return; + if (bp->dev->features & NETIF_F_LRO) + bp->flags |= BNXT_FLAG_LRO; + else if (bp->dev->features & NETIF_F_GRO_HW) + bp->flags |= BNXT_FLAG_GRO; +} + +/* bp->rx_ring_size, bp->tx_ring_size, dev->mtu, BNXT_FLAG_{G|L}RO flags must + * be set on entry. 
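+ * This function derives the RX/TX/aggregation/completion ring page
+ * counts and masks as well as the RX buffer sizes from those values.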
+ */ +void bnxt_set_ring_params(struct bnxt *bp) +{ + u32 ring_size, rx_size, rx_space; + u32 agg_factor = 0, agg_ring_size = 0; + + /* 8 for CRC and VLAN */ + rx_size = SKB_DATA_ALIGN(bp->dev->mtu + ETH_HLEN + NET_IP_ALIGN + 8); + + rx_space = rx_size + NET_SKB_PAD + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + bp->rx_copy_thresh = BNXT_RX_COPY_THRESH; + ring_size = bp->rx_ring_size; + bp->rx_agg_ring_size = 0; + bp->rx_agg_nr_pages = 0; + + if (bp->flags & BNXT_FLAG_TPA) + agg_factor = min_t(u32, 4, 65536 / BNXT_RX_PAGE_SIZE); + + bp->flags &= ~BNXT_FLAG_JUMBO; + if (rx_space > PAGE_SIZE && !(bp->flags & BNXT_FLAG_NO_AGG_RINGS)) { + u32 jumbo_factor; + + bp->flags |= BNXT_FLAG_JUMBO; + jumbo_factor = PAGE_ALIGN(bp->dev->mtu - 40) >> PAGE_SHIFT; + if (jumbo_factor > agg_factor) + agg_factor = jumbo_factor; + } + agg_ring_size = ring_size * agg_factor; + + if (agg_ring_size) { + bp->rx_agg_nr_pages = bnxt_calc_nr_ring_pages(agg_ring_size, + RX_DESC_CNT); + if (bp->rx_agg_nr_pages > MAX_RX_AGG_PAGES) { + u32 tmp = agg_ring_size; + + bp->rx_agg_nr_pages = MAX_RX_AGG_PAGES; + agg_ring_size = MAX_RX_AGG_PAGES * RX_DESC_CNT - 1; + netdev_warn(bp->dev, "rx agg ring size %d reduced to %d.\n", + tmp, agg_ring_size); + } + bp->rx_agg_ring_size = agg_ring_size; + bp->rx_agg_ring_mask = (bp->rx_agg_nr_pages * RX_DESC_CNT) - 1; + rx_size = SKB_DATA_ALIGN(BNXT_RX_COPY_THRESH + NET_IP_ALIGN); + rx_space = rx_size + NET_SKB_PAD + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + } + + bp->rx_buf_use_size = rx_size; + bp->rx_buf_size = rx_space; + + bp->rx_nr_pages = bnxt_calc_nr_ring_pages(ring_size, RX_DESC_CNT); + bp->rx_ring_mask = (bp->rx_nr_pages * RX_DESC_CNT) - 1; + + ring_size = bp->tx_ring_size; + bp->tx_nr_pages = bnxt_calc_nr_ring_pages(ring_size, TX_DESC_CNT); + bp->tx_ring_mask = (bp->tx_nr_pages * TX_DESC_CNT) - 1; + + ring_size = bp->rx_ring_size * (2 + agg_factor) + bp->tx_ring_size; + bp->cp_ring_size = ring_size; + + bp->cp_nr_pages = bnxt_calc_nr_ring_pages(ring_size, CP_DESC_CNT); + if (bp->cp_nr_pages > MAX_CP_PAGES) { + bp->cp_nr_pages = MAX_CP_PAGES; + bp->cp_ring_size = MAX_CP_PAGES * CP_DESC_CNT - 1; + netdev_warn(bp->dev, "completion ring size %d reduced to %d.\n", + ring_size, bp->cp_ring_size); + } + bp->cp_bit = bp->cp_nr_pages * CP_DESC_CNT; + bp->cp_ring_mask = bp->cp_bit - 1; +} + +/* Changing allocation mode of RX rings. + * TODO: Update when extending xdp_rxq_info to support allocation modes. 
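+ * In page mode (used when an XDP program is attached) each RX buffer is
+ * a full page mapped DMA_BIDIRECTIONAL and bnxt_rx_page_skb() builds the
+ * skb; the default mode maps buffers DMA_FROM_DEVICE and uses
+ * bnxt_rx_skb().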
+ */ +int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode) +{ + if (page_mode) { + if (bp->dev->mtu > BNXT_MAX_PAGE_MODE_MTU) + return -EOPNOTSUPP; + bp->dev->max_mtu = + min_t(u16, bp->max_mtu, BNXT_MAX_PAGE_MODE_MTU); + bp->flags &= ~BNXT_FLAG_AGG_RINGS; + bp->flags |= BNXT_FLAG_NO_AGG_RINGS | BNXT_FLAG_RX_PAGE_MODE; + bp->rx_dir = DMA_BIDIRECTIONAL; + bp->rx_skb_func = bnxt_rx_page_skb; + /* Disable LRO or GRO_HW */ + netdev_update_features(bp->dev); + } else { + bp->dev->max_mtu = bp->max_mtu; + bp->flags &= ~BNXT_FLAG_RX_PAGE_MODE; + bp->rx_dir = DMA_FROM_DEVICE; + bp->rx_skb_func = bnxt_rx_skb; + } + return 0; +} + +static void bnxt_free_vnic_attributes(struct bnxt *bp) +{ + int i; + struct bnxt_vnic_info *vnic; + struct pci_dev *pdev = bp->pdev; + + if (!bp->vnic_info) + return; + + for (i = 0; i < bp->nr_vnics; i++) { + vnic = &bp->vnic_info[i]; + + kfree(vnic->fw_grp_ids); + vnic->fw_grp_ids = NULL; + + kfree(vnic->uc_list); + vnic->uc_list = NULL; + + if (vnic->mc_list) { + dma_free_coherent(&pdev->dev, vnic->mc_list_size, + vnic->mc_list, vnic->mc_list_mapping); + vnic->mc_list = NULL; + } + + if (vnic->rss_table) { + dma_free_coherent(&pdev->dev, PAGE_SIZE, + vnic->rss_table, + vnic->rss_table_dma_addr); + vnic->rss_table = NULL; + } + + vnic->rss_hash_key = NULL; + vnic->flags = 0; + } +} + +static int bnxt_alloc_vnic_attributes(struct bnxt *bp) +{ + int i, rc = 0, size; + struct bnxt_vnic_info *vnic; + struct pci_dev *pdev = bp->pdev; + int max_rings; + + for (i = 0; i < bp->nr_vnics; i++) { + vnic = &bp->vnic_info[i]; + + if (vnic->flags & BNXT_VNIC_UCAST_FLAG) { + int mem_size = (BNXT_MAX_UC_ADDRS - 1) * ETH_ALEN; + + if (mem_size > 0) { + vnic->uc_list = kmalloc(mem_size, GFP_KERNEL); + if (!vnic->uc_list) { + rc = -ENOMEM; + goto out; + } + } + } + + if (vnic->flags & BNXT_VNIC_MCAST_FLAG) { + vnic->mc_list_size = BNXT_MAX_MC_ADDRS * ETH_ALEN; + vnic->mc_list = + dma_alloc_coherent(&pdev->dev, + vnic->mc_list_size, + &vnic->mc_list_mapping, + GFP_KERNEL); + if (!vnic->mc_list) { + rc = -ENOMEM; + goto out; + } + } + + if (vnic->flags & BNXT_VNIC_RSS_FLAG) + max_rings = bp->rx_nr_rings; + else + max_rings = 1; + + vnic->fw_grp_ids = kcalloc(max_rings, sizeof(u16), GFP_KERNEL); + if (!vnic->fw_grp_ids) { + rc = -ENOMEM; + goto out; + } + + if ((bp->flags & BNXT_FLAG_NEW_RSS_CAP) && + !(vnic->flags & BNXT_VNIC_RSS_FLAG)) + continue; + + /* Allocate rss table and hash key */ + vnic->rss_table = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &vnic->rss_table_dma_addr, + GFP_KERNEL); + if (!vnic->rss_table) { + rc = -ENOMEM; + goto out; + } + + size = L1_CACHE_ALIGN(HW_HASH_INDEX_SIZE * sizeof(u16)); + + vnic->rss_hash_key = ((void *)vnic->rss_table) + size; + vnic->rss_hash_key_dma_addr = vnic->rss_table_dma_addr + size; + } + return 0; + +out: + return rc; +} + +static void bnxt_free_hwrm_resources(struct bnxt *bp) +{ + struct pci_dev *pdev = bp->pdev; + + dma_free_coherent(&pdev->dev, PAGE_SIZE, bp->hwrm_cmd_resp_addr, + bp->hwrm_cmd_resp_dma_addr); + + bp->hwrm_cmd_resp_addr = NULL; + if (bp->hwrm_dbg_resp_addr) { + dma_free_coherent(&pdev->dev, HWRM_DBG_REG_BUF_SIZE, + bp->hwrm_dbg_resp_addr, + bp->hwrm_dbg_resp_dma_addr); + + bp->hwrm_dbg_resp_addr = NULL; + } +} + +static int bnxt_alloc_hwrm_resources(struct bnxt *bp) +{ + struct pci_dev *pdev = bp->pdev; + + bp->hwrm_cmd_resp_addr = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &bp->hwrm_cmd_resp_dma_addr, + GFP_KERNEL); + if (!bp->hwrm_cmd_resp_addr) + return -ENOMEM; + bp->hwrm_dbg_resp_addr = 
dma_alloc_coherent(&pdev->dev, + HWRM_DBG_REG_BUF_SIZE, + &bp->hwrm_dbg_resp_dma_addr, + GFP_KERNEL); + if (!bp->hwrm_dbg_resp_addr) + netdev_warn(bp->dev, "fail to alloc debug register dma mem\n"); + + return 0; +} + +static void bnxt_free_hwrm_short_cmd_req(struct bnxt *bp) +{ + if (bp->hwrm_short_cmd_req_addr) { + struct pci_dev *pdev = bp->pdev; + + dma_free_coherent(&pdev->dev, BNXT_HWRM_MAX_REQ_LEN, + bp->hwrm_short_cmd_req_addr, + bp->hwrm_short_cmd_req_dma_addr); + bp->hwrm_short_cmd_req_addr = NULL; + } +} + +static int bnxt_alloc_hwrm_short_cmd_req(struct bnxt *bp) +{ + struct pci_dev *pdev = bp->pdev; + + bp->hwrm_short_cmd_req_addr = + dma_alloc_coherent(&pdev->dev, BNXT_HWRM_MAX_REQ_LEN, + &bp->hwrm_short_cmd_req_dma_addr, + GFP_KERNEL); + if (!bp->hwrm_short_cmd_req_addr) + return -ENOMEM; + + return 0; +} + +static void bnxt_free_stats(struct bnxt *bp) +{ + u32 size, i; + struct pci_dev *pdev = bp->pdev; + + bp->flags &= ~BNXT_FLAG_PORT_STATS; + bp->flags &= ~BNXT_FLAG_PORT_STATS_EXT; + + if (bp->hw_rx_port_stats) { + dma_free_coherent(&pdev->dev, bp->hw_port_stats_size, + bp->hw_rx_port_stats, + bp->hw_rx_port_stats_map); + bp->hw_rx_port_stats = NULL; + } + + if (bp->hw_rx_port_stats_ext) { + dma_free_coherent(&pdev->dev, sizeof(struct rx_port_stats_ext), + bp->hw_rx_port_stats_ext, + bp->hw_rx_port_stats_ext_map); + bp->hw_rx_port_stats_ext = NULL; + } + + if (!bp->bnapi) + return; + + size = sizeof(struct ctx_hw_stats); + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + + if (cpr->hw_stats) { + dma_free_coherent(&pdev->dev, size, cpr->hw_stats, + cpr->hw_stats_map); + cpr->hw_stats = NULL; + } + } +} + +static int bnxt_alloc_stats(struct bnxt *bp) +{ + u32 size, i; + struct pci_dev *pdev = bp->pdev; + + size = sizeof(struct ctx_hw_stats); + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + + cpr->hw_stats = dma_alloc_coherent(&pdev->dev, size, + &cpr->hw_stats_map, + GFP_KERNEL); + if (!cpr->hw_stats) + return -ENOMEM; + + cpr->hw_stats_ctx_id = INVALID_STATS_CTX_ID; + } + + if (BNXT_PF(bp) && bp->chip_num != CHIP_NUM_58700) { + bp->hw_port_stats_size = sizeof(struct rx_port_stats) + + sizeof(struct tx_port_stats) + 1024; + + bp->hw_rx_port_stats = + dma_alloc_coherent(&pdev->dev, bp->hw_port_stats_size, + &bp->hw_rx_port_stats_map, + GFP_KERNEL); + if (!bp->hw_rx_port_stats) + return -ENOMEM; + + bp->hw_tx_port_stats = (void *)(bp->hw_rx_port_stats + 1) + + 512; + bp->hw_tx_port_stats_map = bp->hw_rx_port_stats_map + + sizeof(struct rx_port_stats) + 512; + bp->flags |= BNXT_FLAG_PORT_STATS; + + /* Display extended statistics only if FW supports it */ + if (bp->hwrm_spec_code < 0x10804 || + bp->hwrm_spec_code == 0x10900) + return 0; + + bp->hw_rx_port_stats_ext = + dma_zalloc_coherent(&pdev->dev, + sizeof(struct rx_port_stats_ext), + &bp->hw_rx_port_stats_ext_map, + GFP_KERNEL); + if (!bp->hw_rx_port_stats_ext) + return 0; + + bp->flags |= BNXT_FLAG_PORT_STATS_EXT; + } + return 0; +} + +static void bnxt_clear_ring_indices(struct bnxt *bp) +{ + int i; + + if (!bp->bnapi) + return; + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr; + struct bnxt_rx_ring_info *rxr; + struct bnxt_tx_ring_info *txr; + + if (!bnapi) + continue; + + cpr = &bnapi->cp_ring; + cpr->cp_raw_cons = 0; + + txr = bnapi->tx_ring; + if (txr) { + txr->tx_prod = 0; + 
txr->tx_cons = 0; + } + + rxr = bnapi->rx_ring; + if (rxr) { + rxr->rx_prod = 0; + rxr->rx_agg_prod = 0; + rxr->rx_sw_agg_prod = 0; + rxr->rx_next_cons = 0; + } + } +} + +static void bnxt_free_ntp_fltrs(struct bnxt *bp, bool irq_reinit) +{ +#ifdef CONFIG_RFS_ACCEL + int i; + + /* Under rtnl_lock and all our NAPIs have been disabled. It's + * safe to delete the hash table. + */ + for (i = 0; i < BNXT_NTP_FLTR_HASH_SIZE; i++) { + struct hlist_head *head; + struct hlist_node *tmp; + struct bnxt_ntuple_filter *fltr; + + head = &bp->ntp_fltr_hash_tbl[i]; + hlist_for_each_entry_safe(fltr, tmp, head, hash) { + hlist_del(&fltr->hash); + kfree(fltr); + } + } + if (irq_reinit) { + kfree(bp->ntp_fltr_bmap); + bp->ntp_fltr_bmap = NULL; + } + bp->ntp_fltr_count = 0; +#endif +} + +static int bnxt_alloc_ntp_fltrs(struct bnxt *bp) +{ +#ifdef CONFIG_RFS_ACCEL + int i, rc = 0; + + if (!(bp->flags & BNXT_FLAG_RFS)) + return 0; + + for (i = 0; i < BNXT_NTP_FLTR_HASH_SIZE; i++) + INIT_HLIST_HEAD(&bp->ntp_fltr_hash_tbl[i]); + + bp->ntp_fltr_count = 0; + bp->ntp_fltr_bmap = kcalloc(BITS_TO_LONGS(BNXT_NTP_FLTR_MAX_FLTR), + sizeof(long), + GFP_KERNEL); + + if (!bp->ntp_fltr_bmap) + rc = -ENOMEM; + + return rc; +#else + return 0; +#endif +} + +static void bnxt_free_mem(struct bnxt *bp, bool irq_re_init) +{ + bnxt_free_vnic_attributes(bp); + bnxt_free_tx_rings(bp); + bnxt_free_rx_rings(bp); + bnxt_free_cp_rings(bp); + bnxt_free_ntp_fltrs(bp, irq_re_init); + if (irq_re_init) { + bnxt_free_stats(bp); + bnxt_free_ring_grps(bp); + bnxt_free_vnics(bp); + kfree(bp->tx_ring_map); + bp->tx_ring_map = NULL; + kfree(bp->tx_ring); + bp->tx_ring = NULL; + kfree(bp->rx_ring); + bp->rx_ring = NULL; + kfree(bp->bnapi); + bp->bnapi = NULL; + } else { + bnxt_clear_ring_indices(bp); + } +} + +static int bnxt_alloc_mem(struct bnxt *bp, bool irq_re_init) +{ + int i, j, rc, size, arr_size; + void *bnapi; + + if (irq_re_init) { + /* Allocate bnapi mem pointer array and mem block for + * all queues + */ + arr_size = L1_CACHE_ALIGN(sizeof(struct bnxt_napi *) * + bp->cp_nr_rings); + size = L1_CACHE_ALIGN(sizeof(struct bnxt_napi)); + bnapi = kzalloc(arr_size + size * bp->cp_nr_rings, GFP_KERNEL); + if (!bnapi) + return -ENOMEM; + + bp->bnapi = bnapi; + bnapi += arr_size; + for (i = 0; i < bp->cp_nr_rings; i++, bnapi += size) { + bp->bnapi[i] = bnapi; + bp->bnapi[i]->index = i; + bp->bnapi[i]->bp = bp; + } + + bp->rx_ring = kcalloc(bp->rx_nr_rings, + sizeof(struct bnxt_rx_ring_info), + GFP_KERNEL); + if (!bp->rx_ring) + return -ENOMEM; + + for (i = 0; i < bp->rx_nr_rings; i++) { + bp->rx_ring[i].bnapi = bp->bnapi[i]; + bp->bnapi[i]->rx_ring = &bp->rx_ring[i]; + } + + bp->tx_ring = kcalloc(bp->tx_nr_rings, + sizeof(struct bnxt_tx_ring_info), + GFP_KERNEL); + if (!bp->tx_ring) + return -ENOMEM; + + bp->tx_ring_map = kcalloc(bp->tx_nr_rings, sizeof(u16), + GFP_KERNEL); + + if (!bp->tx_ring_map) + return -ENOMEM; + + if (bp->flags & BNXT_FLAG_SHARED_RINGS) + j = 0; + else + j = bp->rx_nr_rings; + + for (i = 0; i < bp->tx_nr_rings; i++, j++) { + bp->tx_ring[i].bnapi = bp->bnapi[j]; + bp->bnapi[j]->tx_ring = &bp->tx_ring[i]; + bp->tx_ring_map[i] = bp->tx_nr_rings_xdp + i; + if (i >= bp->tx_nr_rings_xdp) { + bp->tx_ring[i].txq_index = i - + bp->tx_nr_rings_xdp; + bp->bnapi[j]->tx_int = bnxt_tx_int; + } else { + bp->bnapi[j]->flags |= BNXT_NAPI_FLAG_XDP; + bp->bnapi[j]->tx_int = bnxt_tx_int_xdp; + } + } + + rc = bnxt_alloc_stats(bp); + if (rc) + goto alloc_mem_err; + + rc = bnxt_alloc_ntp_fltrs(bp); + if (rc) + goto alloc_mem_err; + + rc = 
bnxt_alloc_vnics(bp); + if (rc) + goto alloc_mem_err; + } + + bnxt_init_ring_struct(bp); + + rc = bnxt_alloc_rx_rings(bp); + if (rc) + goto alloc_mem_err; + + rc = bnxt_alloc_tx_rings(bp); + if (rc) + goto alloc_mem_err; + + rc = bnxt_alloc_cp_rings(bp); + if (rc) + goto alloc_mem_err; + + bp->vnic_info[0].flags |= BNXT_VNIC_RSS_FLAG | BNXT_VNIC_MCAST_FLAG | + BNXT_VNIC_UCAST_FLAG; + rc = bnxt_alloc_vnic_attributes(bp); + if (rc) + goto alloc_mem_err; + return 0; + +alloc_mem_err: + bnxt_free_mem(bp, true); + return rc; +} + +static void bnxt_disable_int(struct bnxt *bp) +{ + int i; + + if (!bp->bnapi) + return; + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_ring_struct *ring = &cpr->cp_ring_struct; + + if (ring->fw_ring_id != INVALID_HW_RING_ID) + BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons); + } +} + +static int bnxt_cp_num_to_irq_num(struct bnxt *bp, int n) +{ + struct bnxt_napi *bnapi = bp->bnapi[n]; + struct bnxt_cp_ring_info *cpr; + + cpr = &bnapi->cp_ring; + return cpr->cp_ring_struct.map_idx; +} + +static void bnxt_disable_int_sync(struct bnxt *bp) +{ + int i; + + atomic_inc(&bp->intr_sem); + + bnxt_disable_int(bp); + for (i = 0; i < bp->cp_nr_rings; i++) { + int map_idx = bnxt_cp_num_to_irq_num(bp, i); + + synchronize_irq(bp->irq_tbl[map_idx].vector); + } +} + +static void bnxt_enable_int(struct bnxt *bp) +{ + int i; + + atomic_set(&bp->intr_sem, 0); + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + + BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons); + } +} + +void bnxt_hwrm_cmd_hdr_init(struct bnxt *bp, void *request, u16 req_type, + u16 cmpl_ring, u16 target_id) +{ + struct input *req = request; + + req->req_type = cpu_to_le16(req_type); + req->cmpl_ring = cpu_to_le16(cmpl_ring); + req->target_id = cpu_to_le16(target_id); + req->resp_addr = cpu_to_le64(bp->hwrm_cmd_resp_dma_addr); +} + +static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, + int timeout, bool silent) +{ + int i, intr_process, rc, tmo_count; + struct input *req = msg; + u32 *data = msg; + __le32 *resp_len; + u8 *valid; + u16 cp_ring_id, len = 0; + struct hwrm_err_output *resp = bp->hwrm_cmd_resp_addr; + u16 max_req_len = BNXT_HWRM_MAX_REQ_LEN; + struct hwrm_short_input short_input = {0}; + + req->seq_id = cpu_to_le16(bp->hwrm_cmd_seq++); + memset(resp, 0, PAGE_SIZE); + cp_ring_id = le16_to_cpu(req->cmpl_ring); + intr_process = (cp_ring_id == INVALID_HW_RING_ID) ? 
0 : 1; + + if (bp->flags & BNXT_FLAG_SHORT_CMD) { + void *short_cmd_req = bp->hwrm_short_cmd_req_addr; + + memcpy(short_cmd_req, req, msg_len); + memset(short_cmd_req + msg_len, 0, BNXT_HWRM_MAX_REQ_LEN - + msg_len); + + short_input.req_type = req->req_type; + short_input.signature = + cpu_to_le16(SHORT_REQ_SIGNATURE_SHORT_CMD); + short_input.size = cpu_to_le16(msg_len); + short_input.req_addr = + cpu_to_le64(bp->hwrm_short_cmd_req_dma_addr); + + data = (u32 *)&short_input; + msg_len = sizeof(short_input); + + /* Sync memory write before updating doorbell */ + wmb(); + + max_req_len = BNXT_HWRM_SHORT_REQ_LEN; + } + + /* Write request msg to hwrm channel */ + __iowrite32_copy(bp->bar0, data, msg_len / 4); + + for (i = msg_len; i < max_req_len; i += 4) + writel(0, bp->bar0 + i); + + /* currently supports only one outstanding message */ + if (intr_process) + bp->hwrm_intr_seq_id = le16_to_cpu(req->seq_id); + + /* Ring channel doorbell */ + writel(1, bp->bar0 + 0x100); + + if (!timeout) + timeout = DFLT_HWRM_CMD_TIMEOUT; + + i = 0; + tmo_count = timeout * 40; + resp_len = bp->hwrm_cmd_resp_addr + HWRM_RESP_LEN_OFFSET; + if (intr_process) { + /* Wait until hwrm response cmpl interrupt is processed */ + while (bp->hwrm_intr_seq_id != HWRM_SEQ_ID_INVALID && + i++ < tmo_count) { + usleep_range(25, 40); + } + + if (bp->hwrm_intr_seq_id != HWRM_SEQ_ID_INVALID) { + netdev_err(bp->dev, "Resp cmpl intr err msg: 0x%x\n", + le16_to_cpu(req->req_type)); + return -1; + } + len = (le32_to_cpu(*resp_len) & HWRM_RESP_LEN_MASK) >> + HWRM_RESP_LEN_SFT; + valid = bp->hwrm_cmd_resp_addr + len - 1; + } else { + /* Check if response len is updated */ + for (i = 0; i < tmo_count; i++) { + len = (le32_to_cpu(*resp_len) & HWRM_RESP_LEN_MASK) >> + HWRM_RESP_LEN_SFT; + if (len) + break; + usleep_range(25, 40); + } + + if (i >= tmo_count) { + netdev_err(bp->dev, "Error (timeout: %d) msg {0x%x 0x%x} len:%d\n", + timeout, le16_to_cpu(req->req_type), + le16_to_cpu(req->seq_id), len); + return -1; + } + + /* Last byte of resp contains valid bit */ + valid = bp->hwrm_cmd_resp_addr + len - 1; + for (i = 0; i < 5; i++) { + /* make sure we read from updated DMA memory */ + dma_rmb(); + if (*valid) + break; + udelay(1); + } + + if (i >= 5) { + netdev_err(bp->dev, "Error (timeout: %d) msg {0x%x 0x%x} len:%d v:%d\n", + timeout, le16_to_cpu(req->req_type), + le16_to_cpu(req->seq_id), len, *valid); + return -1; + } + } + + /* Zero valid bit for compatibility. Valid bit in an older spec + * may become a new field in a newer spec. We must make sure that + * a new field not implemented by old spec will read zero. 
+ */ + *valid = 0; + rc = le16_to_cpu(resp->error_code); + if (rc && !silent) + netdev_err(bp->dev, "hwrm req_type 0x%x seq id 0x%x error 0x%x\n", + le16_to_cpu(resp->req_type), + le16_to_cpu(resp->seq_id), rc); + return rc; +} + +int _hwrm_send_message(struct bnxt *bp, void *msg, u32 msg_len, int timeout) +{ + return bnxt_hwrm_do_send_msg(bp, msg, msg_len, timeout, false); +} + +int _hwrm_send_message_silent(struct bnxt *bp, void *msg, u32 msg_len, + int timeout) +{ + return bnxt_hwrm_do_send_msg(bp, msg, msg_len, timeout, true); +} + +int hwrm_send_message(struct bnxt *bp, void *msg, u32 msg_len, int timeout) +{ + int rc; + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, msg, msg_len, timeout); + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +int hwrm_send_message_silent(struct bnxt *bp, void *msg, u32 msg_len, + int timeout) +{ + int rc; + + mutex_lock(&bp->hwrm_cmd_lock); + rc = bnxt_hwrm_do_send_msg(bp, msg, msg_len, timeout, true); + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap, + int bmap_size) +{ + struct hwrm_func_drv_rgtr_input req = {0}; + DECLARE_BITMAP(async_events_bmap, 256); + u32 *events = (u32 *)async_events_bmap; + int i; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1); + + req.enables = + cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_ASYNC_EVENT_FWD); + + memset(async_events_bmap, 0, sizeof(async_events_bmap)); + for (i = 0; i < ARRAY_SIZE(bnxt_async_events_arr); i++) + __set_bit(bnxt_async_events_arr[i], async_events_bmap); + + if (bmap && bmap_size) { + for (i = 0; i < bmap_size; i++) { + if (test_bit(i, bmap)) + __set_bit(i, async_events_bmap); + } + } + + for (i = 0; i < 8; i++) + req.async_event_fwd[i] |= cpu_to_le32(events[i]); + + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp) +{ + struct hwrm_func_drv_rgtr_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1); + + req.enables = + cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE | + FUNC_DRV_RGTR_REQ_ENABLES_VER); + + req.os_type = cpu_to_le16(FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX); + req.flags = cpu_to_le32(FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE); + req.ver_maj_8b = DRV_VER_MAJ; + req.ver_min_8b = DRV_VER_MIN; + req.ver_upd_8b = DRV_VER_UPD; + req.ver_maj = cpu_to_le16(DRV_VER_MAJ); + req.ver_min = cpu_to_le16(DRV_VER_MIN); + req.ver_upd = cpu_to_le16(DRV_VER_UPD); + + if (BNXT_PF(bp)) { + u32 data[8]; + int i; + + memset(data, 0, sizeof(data)); + for (i = 0; i < ARRAY_SIZE(bnxt_vf_req_snif); i++) { + u16 cmd = bnxt_vf_req_snif[i]; + unsigned int bit, idx; + + idx = cmd / 32; + bit = cmd % 32; + data[idx] |= 1 << bit; + } + + for (i = 0; i < 8; i++) + req.vf_req_fwd[i] = cpu_to_le32(data[i]); + + req.enables |= + cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_VF_REQ_FWD); + } + + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_hwrm_func_drv_unrgtr(struct bnxt *bp) +{ + struct hwrm_func_drv_unrgtr_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_UNRGTR, -1, -1); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_hwrm_tunnel_dst_port_free(struct bnxt *bp, u8 tunnel_type) +{ + u32 rc = 0; + struct hwrm_tunnel_dst_port_free_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_TUNNEL_DST_PORT_FREE, -1, -1); + req.tunnel_type = tunnel_type; + + switch (tunnel_type) { + case TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN: 
+ req.tunnel_dst_port_id = bp->vxlan_fw_dst_port_id; + break; + case TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE: + req.tunnel_dst_port_id = bp->nge_fw_dst_port_id; + break; + default: + break; + } + + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + netdev_err(bp->dev, "hwrm_tunnel_dst_port_free failed. rc:%d\n", + rc); + return rc; +} + +static int bnxt_hwrm_tunnel_dst_port_alloc(struct bnxt *bp, __be16 port, + u8 tunnel_type) +{ + u32 rc = 0; + struct hwrm_tunnel_dst_port_alloc_input req = {0}; + struct hwrm_tunnel_dst_port_alloc_output *resp = bp->hwrm_cmd_resp_addr; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_TUNNEL_DST_PORT_ALLOC, -1, -1); + + req.tunnel_type = tunnel_type; + req.tunnel_dst_port_val = port; + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) { + netdev_err(bp->dev, "hwrm_tunnel_dst_port_alloc failed. rc:%d\n", + rc); + goto err_out; + } + + switch (tunnel_type) { + case TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN: + bp->vxlan_fw_dst_port_id = resp->tunnel_dst_port_id; + break; + case TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GENEVE: + bp->nge_fw_dst_port_id = resp->tunnel_dst_port_id; + break; + default: + break; + } + +err_out: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_cfa_l2_set_rx_mask(struct bnxt *bp, u16 vnic_id) +{ + struct hwrm_cfa_l2_set_rx_mask_input req = {0}; + struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_L2_SET_RX_MASK, -1, -1); + req.vnic_id = cpu_to_le32(vnic->fw_vnic_id); + + req.num_mc_entries = cpu_to_le32(vnic->mc_list_count); + req.mc_tbl_addr = cpu_to_le64(vnic->mc_list_mapping); + req.mask = cpu_to_le32(vnic->rx_mask); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +#ifdef CONFIG_RFS_ACCEL +static int bnxt_hwrm_cfa_ntuple_filter_free(struct bnxt *bp, + struct bnxt_ntuple_filter *fltr) +{ + struct hwrm_cfa_ntuple_filter_free_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_NTUPLE_FILTER_FREE, -1, -1); + req.ntuple_filter_id = fltr->filter_id; + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +#define BNXT_NTP_FLTR_FLAGS \ + (CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_L2_FILTER_ID | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_ETHERTYPE | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_MACADDR | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_IPADDR_TYPE | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR_MASK | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR_MASK | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_IP_PROTOCOL | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_PORT | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_PORT_MASK | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_PORT | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_PORT_MASK | \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_ID) + +#define BNXT_NTP_TUNNEL_FLTR_FLAG \ + CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE + +static int bnxt_hwrm_cfa_ntuple_filter_alloc(struct bnxt *bp, + struct bnxt_ntuple_filter *fltr) +{ + int rc = 0; + struct hwrm_cfa_ntuple_filter_alloc_input req = {0}; + struct hwrm_cfa_ntuple_filter_alloc_output *resp = + bp->hwrm_cmd_resp_addr; + struct flow_keys *keys = &fltr->fkeys; + struct bnxt_vnic_info *vnic = &bp->vnic_info[fltr->rxq + 1]; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_NTUPLE_FILTER_ALLOC, -1, -1); + req.l2_filter_id = 
bp->vnic_info[0].fw_l2_filter_id[fltr->l2_fltr_idx]; + + req.enables = cpu_to_le32(BNXT_NTP_FLTR_FLAGS); + + req.ethertype = htons(ETH_P_IP); + memcpy(req.src_macaddr, fltr->src_mac_addr, ETH_ALEN); + req.ip_addr_type = CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4; + req.ip_protocol = keys->basic.ip_proto; + + if (keys->basic.n_proto == htons(ETH_P_IPV6)) { + int i; + + req.ethertype = htons(ETH_P_IPV6); + req.ip_addr_type = + CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6; + *(struct in6_addr *)&req.src_ipaddr[0] = + keys->addrs.v6addrs.src; + *(struct in6_addr *)&req.dst_ipaddr[0] = + keys->addrs.v6addrs.dst; + for (i = 0; i < 4; i++) { + req.src_ipaddr_mask[i] = cpu_to_be32(0xffffffff); + req.dst_ipaddr_mask[i] = cpu_to_be32(0xffffffff); + } + } else { + req.src_ipaddr[0] = keys->addrs.v4addrs.src; + req.src_ipaddr_mask[0] = cpu_to_be32(0xffffffff); + req.dst_ipaddr[0] = keys->addrs.v4addrs.dst; + req.dst_ipaddr_mask[0] = cpu_to_be32(0xffffffff); + } + if (keys->control.flags & FLOW_DIS_ENCAPSULATION) { + req.enables |= cpu_to_le32(BNXT_NTP_TUNNEL_FLTR_FLAG); + req.tunnel_type = + CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL; + } + + req.src_port = keys->ports.src; + req.src_port_mask = cpu_to_be16(0xffff); + req.dst_port = keys->ports.dst; + req.dst_port_mask = cpu_to_be16(0xffff); + + req.dst_id = cpu_to_le16(vnic->fw_vnic_id); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) + fltr->filter_id = resp->ntuple_filter_id; + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} +#endif + +static int bnxt_hwrm_set_vnic_filter(struct bnxt *bp, u16 vnic_id, u16 idx, + u8 *mac_addr) +{ + u32 rc = 0; + struct hwrm_cfa_l2_filter_alloc_input req = {0}; + struct hwrm_cfa_l2_filter_alloc_output *resp = bp->hwrm_cmd_resp_addr; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_L2_FILTER_ALLOC, -1, -1); + req.flags = cpu_to_le32(CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_RX); + if (!BNXT_CHIP_TYPE_NITRO_A0(bp)) + req.flags |= + cpu_to_le32(CFA_L2_FILTER_ALLOC_REQ_FLAGS_OUTERMOST); + req.dst_id = cpu_to_le16(bp->vnic_info[vnic_id].fw_vnic_id); + req.enables = + cpu_to_le32(CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_ADDR | + CFA_L2_FILTER_ALLOC_REQ_ENABLES_DST_ID | + CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_ADDR_MASK); + memcpy(req.l2_addr, mac_addr, ETH_ALEN); + req.l2_addr_mask[0] = 0xff; + req.l2_addr_mask[1] = 0xff; + req.l2_addr_mask[2] = 0xff; + req.l2_addr_mask[3] = 0xff; + req.l2_addr_mask[4] = 0xff; + req.l2_addr_mask[5] = 0xff; + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) + bp->vnic_info[vnic_id].fw_l2_filter_id[idx] = + resp->l2_filter_id; + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_clear_vnic_filter(struct bnxt *bp) +{ + u16 i, j, num_of_vnics = 1; /* only vnic 0 supported */ + int rc = 0; + + /* Any associated ntuple filters will also be cleared by firmware. 
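+ * so only the L2 filters need to be freed explicitly here.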
*/ + mutex_lock(&bp->hwrm_cmd_lock); + for (i = 0; i < num_of_vnics; i++) { + struct bnxt_vnic_info *vnic = &bp->vnic_info[i]; + + for (j = 0; j < vnic->uc_filter_count; j++) { + struct hwrm_cfa_l2_filter_free_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, + HWRM_CFA_L2_FILTER_FREE, -1, -1); + + req.l2_filter_id = vnic->fw_l2_filter_id[j]; + + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + } + vnic->uc_filter_count = 0; + } + mutex_unlock(&bp->hwrm_cmd_lock); + + return rc; +} + +static int bnxt_hwrm_vnic_set_tpa(struct bnxt *bp, u16 vnic_id, u32 tpa_flags) +{ + struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; + struct hwrm_vnic_tpa_cfg_input req = {0}; + + if (vnic->fw_vnic_id == INVALID_HW_RING_ID) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_TPA_CFG, -1, -1); + + if (tpa_flags) { + u16 mss = bp->dev->mtu - 40; + u32 nsegs, n, segs = 0, flags; + + flags = VNIC_TPA_CFG_REQ_FLAGS_TPA | + VNIC_TPA_CFG_REQ_FLAGS_ENCAP_TPA | + VNIC_TPA_CFG_REQ_FLAGS_RSC_WND_UPDATE | + VNIC_TPA_CFG_REQ_FLAGS_AGG_WITH_ECN | + VNIC_TPA_CFG_REQ_FLAGS_AGG_WITH_SAME_GRE_SEQ; + if (tpa_flags & BNXT_FLAG_GRO) + flags |= VNIC_TPA_CFG_REQ_FLAGS_GRO; + + req.flags = cpu_to_le32(flags); + + req.enables = + cpu_to_le32(VNIC_TPA_CFG_REQ_ENABLES_MAX_AGG_SEGS | + VNIC_TPA_CFG_REQ_ENABLES_MAX_AGGS | + VNIC_TPA_CFG_REQ_ENABLES_MIN_AGG_LEN); + + /* Number of segs are log2 units, and first packet is not + * included as part of this units. + */ + if (mss <= BNXT_RX_PAGE_SIZE) { + n = BNXT_RX_PAGE_SIZE / mss; + nsegs = (MAX_SKB_FRAGS - 1) * n; + } else { + n = mss / BNXT_RX_PAGE_SIZE; + if (mss & (BNXT_RX_PAGE_SIZE - 1)) + n++; + nsegs = (MAX_SKB_FRAGS - n) / n; + } + + segs = ilog2(nsegs); + req.max_agg_segs = cpu_to_le16(segs); + req.max_aggs = cpu_to_le16(VNIC_TPA_CFG_REQ_MAX_AGGS_MAX); + + req.min_agg_len = cpu_to_le32(512); + } + req.vnic_id = cpu_to_le16(vnic->fw_vnic_id); + + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_hwrm_vnic_set_rss(struct bnxt *bp, u16 vnic_id, bool set_rss) +{ + u32 i, j, max_rings; + struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; + struct hwrm_vnic_rss_cfg_input req = {0}; + + if (vnic->fw_rss_cos_lb_ctx[0] == INVALID_HW_RING_ID) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_RSS_CFG, -1, -1); + if (set_rss) { + req.hash_type = cpu_to_le32(bp->rss_hash_cfg); + if (vnic->flags & BNXT_VNIC_RSS_FLAG) { + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) + max_rings = bp->rx_nr_rings - 1; + else + max_rings = bp->rx_nr_rings; + } else { + max_rings = 1; + } + + /* Fill the RSS indirection table with ring group ids */ + for (i = 0, j = 0; i < HW_HASH_INDEX_SIZE; i++, j++) { + if (j == max_rings) + j = 0; + vnic->rss_table[i] = cpu_to_le16(vnic->fw_grp_ids[j]); + } + + req.ring_grp_tbl_addr = cpu_to_le64(vnic->rss_table_dma_addr); + req.hash_key_tbl_addr = + cpu_to_le64(vnic->rss_hash_key_dma_addr); + } + req.rss_ctx_idx = cpu_to_le16(vnic->fw_rss_cos_lb_ctx[0]); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_hwrm_vnic_set_hds(struct bnxt *bp, u16 vnic_id) +{ + struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; + struct hwrm_vnic_plcmodes_cfg_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_PLCMODES_CFG, -1, -1); + req.flags = cpu_to_le32(VNIC_PLCMODES_CFG_REQ_FLAGS_JUMBO_PLACEMENT | + VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV4 | + VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV6); + req.enables = + cpu_to_le32(VNIC_PLCMODES_CFG_REQ_ENABLES_JUMBO_THRESH_VALID | + 
VNIC_PLCMODES_CFG_REQ_ENABLES_HDS_THRESHOLD_VALID); + /* thresholds not implemented in firmware yet */ + req.jumbo_thresh = cpu_to_le16(bp->rx_copy_thresh); + req.hds_threshold = cpu_to_le16(bp->rx_copy_thresh); + req.vnic_id = cpu_to_le32(vnic->fw_vnic_id); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static void bnxt_hwrm_vnic_ctx_free_one(struct bnxt *bp, u16 vnic_id, + u16 ctx_idx) +{ + struct hwrm_vnic_rss_cos_lb_ctx_free_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_RSS_COS_LB_CTX_FREE, -1, -1); + req.rss_cos_lb_ctx_id = + cpu_to_le16(bp->vnic_info[vnic_id].fw_rss_cos_lb_ctx[ctx_idx]); + + hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + bp->vnic_info[vnic_id].fw_rss_cos_lb_ctx[ctx_idx] = INVALID_HW_RING_ID; +} + +static void bnxt_hwrm_vnic_ctx_free(struct bnxt *bp) +{ + int i, j; + + for (i = 0; i < bp->nr_vnics; i++) { + struct bnxt_vnic_info *vnic = &bp->vnic_info[i]; + + for (j = 0; j < BNXT_MAX_CTX_PER_VNIC; j++) { + if (vnic->fw_rss_cos_lb_ctx[j] != INVALID_HW_RING_ID) + bnxt_hwrm_vnic_ctx_free_one(bp, i, j); + } + } + bp->rsscos_nr_ctxs = 0; +} + +static int bnxt_hwrm_vnic_ctx_alloc(struct bnxt *bp, u16 vnic_id, u16 ctx_idx) +{ + int rc; + struct hwrm_vnic_rss_cos_lb_ctx_alloc_input req = {0}; + struct hwrm_vnic_rss_cos_lb_ctx_alloc_output *resp = + bp->hwrm_cmd_resp_addr; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_RSS_COS_LB_CTX_ALLOC, -1, + -1); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) + bp->vnic_info[vnic_id].fw_rss_cos_lb_ctx[ctx_idx] = + le16_to_cpu(resp->rss_cos_lb_ctx_id); + mutex_unlock(&bp->hwrm_cmd_lock); + + return rc; +} + +static u32 bnxt_get_roce_vnic_mode(struct bnxt *bp) +{ + if (bp->flags & BNXT_FLAG_ROCE_MIRROR_CAP) + return VNIC_CFG_REQ_FLAGS_ROCE_MIRRORING_CAPABLE_VNIC_MODE; + return VNIC_CFG_REQ_FLAGS_ROCE_DUAL_VNIC_MODE; +} + +int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id) +{ + unsigned int ring = 0, grp_idx; + struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; + struct hwrm_vnic_cfg_input req = {0}; + u16 def_vlan = 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_CFG, -1, -1); + + req.enables = cpu_to_le32(VNIC_CFG_REQ_ENABLES_DFLT_RING_GRP); + /* Only RSS support for now TBD: COS & LB */ + if (vnic->fw_rss_cos_lb_ctx[0] != INVALID_HW_RING_ID) { + req.rss_rule = cpu_to_le16(vnic->fw_rss_cos_lb_ctx[0]); + req.enables |= cpu_to_le32(VNIC_CFG_REQ_ENABLES_RSS_RULE | + VNIC_CFG_REQ_ENABLES_MRU); + } else if (vnic->flags & BNXT_VNIC_RFS_NEW_RSS_FLAG) { + req.rss_rule = + cpu_to_le16(bp->vnic_info[0].fw_rss_cos_lb_ctx[0]); + req.enables |= cpu_to_le32(VNIC_CFG_REQ_ENABLES_RSS_RULE | + VNIC_CFG_REQ_ENABLES_MRU); + req.flags |= cpu_to_le32(VNIC_CFG_REQ_FLAGS_RSS_DFLT_CR_MODE); + } else { + req.rss_rule = cpu_to_le16(0xffff); + } + + if (BNXT_CHIP_TYPE_NITRO_A0(bp) && + (vnic->fw_rss_cos_lb_ctx[0] != INVALID_HW_RING_ID)) { + req.cos_rule = cpu_to_le16(vnic->fw_rss_cos_lb_ctx[1]); + req.enables |= cpu_to_le32(VNIC_CFG_REQ_ENABLES_COS_RULE); + } else { + req.cos_rule = cpu_to_le16(0xffff); + } + + if (vnic->flags & BNXT_VNIC_RSS_FLAG) + ring = 0; + else if (vnic->flags & BNXT_VNIC_RFS_FLAG) + ring = vnic_id - 1; + else if ((vnic_id == 1) && BNXT_CHIP_TYPE_NITRO_A0(bp)) + ring = bp->rx_nr_rings - 1; + + grp_idx = bp->rx_ring[ring].bnapi->index; + req.vnic_id = cpu_to_le16(vnic->fw_vnic_id); + req.dflt_ring_grp = cpu_to_le16(bp->grp_info[grp_idx].fw_grp_id); + + req.lb_rule = cpu_to_le16(0xffff); + req.mru = 
cpu_to_le16(bp->dev->mtu + ETH_HLEN + ETH_FCS_LEN + + VLAN_HLEN); + +#ifdef CONFIG_BNXT_SRIOV + if (BNXT_VF(bp)) + def_vlan = bp->vf.vlan; +#endif + if ((bp->flags & BNXT_FLAG_STRIP_VLAN) || def_vlan) + req.flags |= cpu_to_le32(VNIC_CFG_REQ_FLAGS_VLAN_STRIP_MODE); + if (!vnic_id && bnxt_ulp_registered(bp->edev, BNXT_ROCE_ULP)) + req.flags |= cpu_to_le32(bnxt_get_roce_vnic_mode(bp)); + + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_hwrm_vnic_free_one(struct bnxt *bp, u16 vnic_id) +{ + u32 rc = 0; + + if (bp->vnic_info[vnic_id].fw_vnic_id != INVALID_HW_RING_ID) { + struct hwrm_vnic_free_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_FREE, -1, -1); + req.vnic_id = + cpu_to_le32(bp->vnic_info[vnic_id].fw_vnic_id); + + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + return rc; + bp->vnic_info[vnic_id].fw_vnic_id = INVALID_HW_RING_ID; + } + return rc; +} + +static void bnxt_hwrm_vnic_free(struct bnxt *bp) +{ + u16 i; + + for (i = 0; i < bp->nr_vnics; i++) + bnxt_hwrm_vnic_free_one(bp, i); +} + +static int bnxt_hwrm_vnic_alloc(struct bnxt *bp, u16 vnic_id, + unsigned int start_rx_ring_idx, + unsigned int nr_rings) +{ + int rc = 0; + unsigned int i, j, grp_idx, end_idx = start_rx_ring_idx + nr_rings; + struct hwrm_vnic_alloc_input req = {0}; + struct hwrm_vnic_alloc_output *resp = bp->hwrm_cmd_resp_addr; + + /* map ring groups to this vnic */ + for (i = start_rx_ring_idx, j = 0; i < end_idx; i++, j++) { + grp_idx = bp->rx_ring[i].bnapi->index; + if (bp->grp_info[grp_idx].fw_grp_id == INVALID_HW_RING_ID) { + netdev_err(bp->dev, "Not enough ring groups avail:%x req:%x\n", + j, nr_rings); + break; + } + bp->vnic_info[vnic_id].fw_grp_ids[j] = + bp->grp_info[grp_idx].fw_grp_id; + } + + bp->vnic_info[vnic_id].fw_rss_cos_lb_ctx[0] = INVALID_HW_RING_ID; + bp->vnic_info[vnic_id].fw_rss_cos_lb_ctx[1] = INVALID_HW_RING_ID; + if (vnic_id == 0) + req.flags = cpu_to_le32(VNIC_ALLOC_REQ_FLAGS_DEFAULT); + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_ALLOC, -1, -1); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) + bp->vnic_info[vnic_id].fw_vnic_id = le32_to_cpu(resp->vnic_id); + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_vnic_qcaps(struct bnxt *bp) +{ + struct hwrm_vnic_qcaps_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_vnic_qcaps_input req = {0}; + int rc; + + if (bp->hwrm_spec_code < 0x10600) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_QCAPS, -1, -1); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + u32 flags = le32_to_cpu(resp->flags); + + if (flags & VNIC_QCAPS_RESP_FLAGS_RSS_DFLT_CR_CAP) + bp->flags |= BNXT_FLAG_NEW_RSS_CAP; + if (flags & + VNIC_QCAPS_RESP_FLAGS_ROCE_MIRRORING_CAPABLE_VNIC_CAP) + bp->flags |= BNXT_FLAG_ROCE_MIRROR_CAP; + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_ring_grp_alloc(struct bnxt *bp) +{ + u16 i; + u32 rc = 0; + + mutex_lock(&bp->hwrm_cmd_lock); + for (i = 0; i < bp->rx_nr_rings; i++) { + struct hwrm_ring_grp_alloc_input req = {0}; + struct hwrm_ring_grp_alloc_output *resp = + bp->hwrm_cmd_resp_addr; + unsigned int grp_idx = bp->rx_ring[i].bnapi->index; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_RING_GRP_ALLOC, -1, -1); + + req.cr = cpu_to_le16(bp->grp_info[grp_idx].cp_fw_ring_id); + req.rr = cpu_to_le16(bp->grp_info[grp_idx].rx_fw_ring_id); + req.ar = 
cpu_to_le16(bp->grp_info[grp_idx].agg_fw_ring_id); + req.sc = cpu_to_le16(bp->grp_info[grp_idx].fw_stats_ctx); + + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + if (rc) + break; + + bp->grp_info[grp_idx].fw_grp_id = + le32_to_cpu(resp->ring_group_id); + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_ring_grp_free(struct bnxt *bp) +{ + u16 i; + u32 rc = 0; + struct hwrm_ring_grp_free_input req = {0}; + + if (!bp->grp_info) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_RING_GRP_FREE, -1, -1); + + mutex_lock(&bp->hwrm_cmd_lock); + for (i = 0; i < bp->cp_nr_rings; i++) { + if (bp->grp_info[i].fw_grp_id == INVALID_HW_RING_ID) + continue; + req.ring_group_id = + cpu_to_le32(bp->grp_info[i].fw_grp_id); + + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + if (rc) + break; + bp->grp_info[i].fw_grp_id = INVALID_HW_RING_ID; + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int hwrm_ring_alloc_send_msg(struct bnxt *bp, + struct bnxt_ring_struct *ring, + u32 ring_type, u32 map_index) +{ + int rc = 0, err = 0; + struct hwrm_ring_alloc_input req = {0}; + struct hwrm_ring_alloc_output *resp = bp->hwrm_cmd_resp_addr; + struct bnxt_ring_grp_info *grp_info; + u16 ring_id; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_RING_ALLOC, -1, -1); + + req.enables = 0; + if (ring->nr_pages > 1) { + req.page_tbl_addr = cpu_to_le64(ring->pg_tbl_map); + /* Page size is in log2 units */ + req.page_size = BNXT_PAGE_SHIFT; + req.page_tbl_depth = 1; + } else { + req.page_tbl_addr = cpu_to_le64(ring->dma_arr[0]); + } + req.fbo = 0; + /* Association of ring index with doorbell index and MSIX number */ + req.logical_id = cpu_to_le16(map_index); + + switch (ring_type) { + case HWRM_RING_ALLOC_TX: + req.ring_type = RING_ALLOC_REQ_RING_TYPE_TX; + /* Association of transmit ring with completion ring */ + grp_info = &bp->grp_info[ring->grp_idx]; + req.cmpl_ring_id = cpu_to_le16(grp_info->cp_fw_ring_id); + req.length = cpu_to_le32(bp->tx_ring_mask + 1); + req.stat_ctx_id = cpu_to_le32(grp_info->fw_stats_ctx); + req.queue_id = cpu_to_le16(ring->queue_id); + break; + case HWRM_RING_ALLOC_RX: + req.ring_type = RING_ALLOC_REQ_RING_TYPE_RX; + req.length = cpu_to_le32(bp->rx_ring_mask + 1); + break; + case HWRM_RING_ALLOC_AGG: + req.ring_type = RING_ALLOC_REQ_RING_TYPE_RX; + req.length = cpu_to_le32(bp->rx_agg_ring_mask + 1); + break; + case HWRM_RING_ALLOC_CMPL: + req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL; + req.length = cpu_to_le32(bp->cp_ring_mask + 1); + if (bp->flags & BNXT_FLAG_USING_MSIX) + req.int_mode = RING_ALLOC_REQ_INT_MODE_MSIX; + break; + default: + netdev_err(bp->dev, "hwrm alloc invalid ring type %d\n", + ring_type); + return -1; + } + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + err = le16_to_cpu(resp->error_code); + ring_id = le16_to_cpu(resp->ring_id); + mutex_unlock(&bp->hwrm_cmd_lock); + + if (rc || err) { + switch (ring_type) { + case RING_FREE_REQ_RING_TYPE_L2_CMPL: + netdev_err(bp->dev, "hwrm_ring_alloc cp failed. rc:%x err:%x\n", + rc, err); + return -1; + + case RING_FREE_REQ_RING_TYPE_RX: + netdev_err(bp->dev, "hwrm_ring_alloc rx failed. rc:%x err:%x\n", + rc, err); + return -1; + + case RING_FREE_REQ_RING_TYPE_TX: + netdev_err(bp->dev, "hwrm_ring_alloc tx failed. 
rc:%x err:%x\n", + rc, err); + return -1; + + default: + netdev_err(bp->dev, "Invalid ring\n"); + return -1; + } + } + ring->fw_ring_id = ring_id; + return rc; +} + +static int bnxt_hwrm_set_async_event_cr(struct bnxt *bp, int idx) +{ + int rc; + + if (BNXT_PF(bp)) { + struct hwrm_func_cfg_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); + req.fid = cpu_to_le16(0xffff); + req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_ASYNC_EVENT_CR); + req.async_event_cr = cpu_to_le16(idx); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + } else { + struct hwrm_func_vf_cfg_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_CFG, -1, -1); + req.enables = + cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_ASYNC_EVENT_CR); + req.async_event_cr = cpu_to_le16(idx); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + } + return rc; +} + +static int bnxt_hwrm_ring_alloc(struct bnxt *bp) +{ + int i, rc = 0; + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_ring_struct *ring = &cpr->cp_ring_struct; + u32 map_idx = ring->map_idx; + + cpr->cp_doorbell = bp->bar1 + map_idx * 0x80; + rc = hwrm_ring_alloc_send_msg(bp, ring, HWRM_RING_ALLOC_CMPL, + map_idx); + if (rc) + goto err_out; + BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons); + bp->grp_info[i].cp_fw_ring_id = ring->fw_ring_id; + + if (!i) { + rc = bnxt_hwrm_set_async_event_cr(bp, ring->fw_ring_id); + if (rc) + netdev_warn(bp->dev, "Failed to set async event completion ring.\n"); + } + } + + for (i = 0; i < bp->tx_nr_rings; i++) { + struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; + struct bnxt_ring_struct *ring = &txr->tx_ring_struct; + u32 map_idx = i; + + rc = hwrm_ring_alloc_send_msg(bp, ring, HWRM_RING_ALLOC_TX, + map_idx); + if (rc) + goto err_out; + txr->tx_doorbell = bp->bar1 + map_idx * 0x80; + } + + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + struct bnxt_ring_struct *ring = &rxr->rx_ring_struct; + u32 map_idx = rxr->bnapi->index; + + rc = hwrm_ring_alloc_send_msg(bp, ring, HWRM_RING_ALLOC_RX, + map_idx); + if (rc) + goto err_out; + rxr->rx_doorbell = bp->bar1 + map_idx * 0x80; + writel(DB_KEY_RX | rxr->rx_prod, rxr->rx_doorbell); + bp->grp_info[map_idx].rx_fw_ring_id = ring->fw_ring_id; + } + + if (bp->flags & BNXT_FLAG_AGG_RINGS) { + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + struct bnxt_ring_struct *ring = + &rxr->rx_agg_ring_struct; + u32 grp_idx = ring->grp_idx; + u32 map_idx = grp_idx + bp->rx_nr_rings; + + rc = hwrm_ring_alloc_send_msg(bp, ring, + HWRM_RING_ALLOC_AGG, + map_idx); + if (rc) + goto err_out; + + rxr->rx_agg_doorbell = bp->bar1 + map_idx * 0x80; + writel(DB_KEY_RX | rxr->rx_agg_prod, + rxr->rx_agg_doorbell); + bp->grp_info[grp_idx].agg_fw_ring_id = ring->fw_ring_id; + } + } +err_out: + return rc; +} + +static int hwrm_ring_free_send_msg(struct bnxt *bp, + struct bnxt_ring_struct *ring, + u32 ring_type, int cmpl_ring_id) +{ + int rc; + struct hwrm_ring_free_input req = {0}; + struct hwrm_ring_free_output *resp = bp->hwrm_cmd_resp_addr; + u16 error_code; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_RING_FREE, cmpl_ring_id, -1); + req.ring_type = ring_type; + req.ring_id = cpu_to_le16(ring->fw_ring_id); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + error_code = le16_to_cpu(resp->error_code); + 
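+ /* The response buffer is shared across HWRM commands; error_code is captured before hwrm_cmd_lock is released. */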
mutex_unlock(&bp->hwrm_cmd_lock); + + if (rc || error_code) { + switch (ring_type) { + case RING_FREE_REQ_RING_TYPE_L2_CMPL: + netdev_err(bp->dev, "hwrm_ring_free cp failed. rc:%d\n", + rc); + return rc; + case RING_FREE_REQ_RING_TYPE_RX: + netdev_err(bp->dev, "hwrm_ring_free rx failed. rc:%d\n", + rc); + return rc; + case RING_FREE_REQ_RING_TYPE_TX: + netdev_err(bp->dev, "hwrm_ring_free tx failed. rc:%d\n", + rc); + return rc; + default: + netdev_err(bp->dev, "Invalid ring\n"); + return -1; + } + } + return 0; +} + +static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path) +{ + int i; + + if (!bp->bnapi) + return; + + for (i = 0; i < bp->tx_nr_rings; i++) { + struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; + struct bnxt_ring_struct *ring = &txr->tx_ring_struct; + u32 grp_idx = txr->bnapi->index; + u32 cmpl_ring_id = bp->grp_info[grp_idx].cp_fw_ring_id; + + if (ring->fw_ring_id != INVALID_HW_RING_ID) { + hwrm_ring_free_send_msg(bp, ring, + RING_FREE_REQ_RING_TYPE_TX, + close_path ? cmpl_ring_id : + INVALID_HW_RING_ID); + ring->fw_ring_id = INVALID_HW_RING_ID; + } + } + + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + struct bnxt_ring_struct *ring = &rxr->rx_ring_struct; + u32 grp_idx = rxr->bnapi->index; + u32 cmpl_ring_id = bp->grp_info[grp_idx].cp_fw_ring_id; + + if (ring->fw_ring_id != INVALID_HW_RING_ID) { + hwrm_ring_free_send_msg(bp, ring, + RING_FREE_REQ_RING_TYPE_RX, + close_path ? cmpl_ring_id : + INVALID_HW_RING_ID); + ring->fw_ring_id = INVALID_HW_RING_ID; + bp->grp_info[grp_idx].rx_fw_ring_id = + INVALID_HW_RING_ID; + } + } + + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_rx_ring_info *rxr = &bp->rx_ring[i]; + struct bnxt_ring_struct *ring = &rxr->rx_agg_ring_struct; + u32 grp_idx = rxr->bnapi->index; + u32 cmpl_ring_id = bp->grp_info[grp_idx].cp_fw_ring_id; + + if (ring->fw_ring_id != INVALID_HW_RING_ID) { + hwrm_ring_free_send_msg(bp, ring, + RING_FREE_REQ_RING_TYPE_RX, + close_path ? cmpl_ring_id : + INVALID_HW_RING_ID); + ring->fw_ring_id = INVALID_HW_RING_ID; + bp->grp_info[grp_idx].agg_fw_ring_id = + INVALID_HW_RING_ID; + } + } + + /* The completion rings are about to be freed. After that the + * IRQ doorbell will not work anymore. So we need to disable + * IRQ here. 
+ */ + bnxt_disable_int_sync(bp); + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_ring_struct *ring = &cpr->cp_ring_struct; + + if (ring->fw_ring_id != INVALID_HW_RING_ID) { + hwrm_ring_free_send_msg(bp, ring, + RING_FREE_REQ_RING_TYPE_L2_CMPL, + INVALID_HW_RING_ID); + ring->fw_ring_id = INVALID_HW_RING_ID; + bp->grp_info[i].cp_fw_ring_id = INVALID_HW_RING_ID; + } + } +} + +static int bnxt_hwrm_get_rings(struct bnxt *bp) +{ + struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + struct hwrm_func_qcfg_input req = {0}; + int rc; + + if (bp->hwrm_spec_code < 0x10601) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1); + req.fid = cpu_to_le16(0xffff); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) { + mutex_unlock(&bp->hwrm_cmd_lock); + return -EIO; + } + + hw_resc->resv_tx_rings = le16_to_cpu(resp->alloc_tx_rings); + if (bp->flags & BNXT_FLAG_NEW_RM) { + u16 cp, stats; + + hw_resc->resv_rx_rings = le16_to_cpu(resp->alloc_rx_rings); + hw_resc->resv_hw_ring_grps = + le32_to_cpu(resp->alloc_hw_ring_grps); + hw_resc->resv_vnics = le16_to_cpu(resp->alloc_vnics); + cp = le16_to_cpu(resp->alloc_cmpl_rings); + stats = le16_to_cpu(resp->alloc_stat_ctx); + cp = min_t(u16, cp, stats); + hw_resc->resv_cp_rings = cp; + } + mutex_unlock(&bp->hwrm_cmd_lock); + return 0; +} + +/* Caller must hold bp->hwrm_cmd_lock */ +int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings) +{ + struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_func_qcfg_input req = {0}; + int rc; + + if (bp->hwrm_spec_code < 0x10601) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1); + req.fid = cpu_to_le16(fid); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) + *tx_rings = le16_to_cpu(resp->alloc_tx_rings); + + return rc; +} + +static void +__bnxt_hwrm_reserve_pf_rings(struct bnxt *bp, struct hwrm_func_cfg_input *req, + int tx_rings, int rx_rings, int ring_grps, + int cp_rings, int vnics) +{ + u32 enables = 0; + + bnxt_hwrm_cmd_hdr_init(bp, req, HWRM_FUNC_CFG, -1, -1); + req->fid = cpu_to_le16(0xffff); + enables |= tx_rings ? FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS : 0; + req->num_tx_rings = cpu_to_le16(tx_rings); + if (bp->flags & BNXT_FLAG_NEW_RM) { + enables |= rx_rings ? FUNC_CFG_REQ_ENABLES_NUM_RX_RINGS : 0; + enables |= cp_rings ? FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS | + FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; + enables |= ring_grps ? + FUNC_CFG_REQ_ENABLES_NUM_HW_RING_GRPS : 0; + enables |= vnics ? FUNC_VF_CFG_REQ_ENABLES_NUM_VNICS : 0; + + req->num_rx_rings = cpu_to_le16(rx_rings); + req->num_hw_ring_grps = cpu_to_le16(ring_grps); + req->num_cmpl_rings = cpu_to_le16(cp_rings); + req->num_stat_ctxs = req->num_cmpl_rings; + req->num_vnics = cpu_to_le16(vnics); + } + req->enables = cpu_to_le32(enables); +} + +static void +__bnxt_hwrm_reserve_vf_rings(struct bnxt *bp, + struct hwrm_func_vf_cfg_input *req, int tx_rings, + int rx_rings, int ring_grps, int cp_rings, + int vnics) +{ + u32 enables = 0; + + bnxt_hwrm_cmd_hdr_init(bp, req, HWRM_FUNC_VF_CFG, -1, -1); + enables |= tx_rings ? FUNC_VF_CFG_REQ_ENABLES_NUM_TX_RINGS : 0; + enables |= rx_rings ? FUNC_VF_CFG_REQ_ENABLES_NUM_RX_RINGS : 0; + enables |= cp_rings ? 
FUNC_VF_CFG_REQ_ENABLES_NUM_CMPL_RINGS | + FUNC_VF_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; + enables |= ring_grps ? FUNC_VF_CFG_REQ_ENABLES_NUM_HW_RING_GRPS : 0; + enables |= vnics ? FUNC_VF_CFG_REQ_ENABLES_NUM_VNICS : 0; + + req->num_tx_rings = cpu_to_le16(tx_rings); + req->num_rx_rings = cpu_to_le16(rx_rings); + req->num_hw_ring_grps = cpu_to_le16(ring_grps); + req->num_cmpl_rings = cpu_to_le16(cp_rings); + req->num_stat_ctxs = req->num_cmpl_rings; + req->num_vnics = cpu_to_le16(vnics); + + req->enables = cpu_to_le32(enables); +} + +static int +bnxt_hwrm_reserve_pf_rings(struct bnxt *bp, int tx_rings, int rx_rings, + int ring_grps, int cp_rings, int vnics) +{ + struct hwrm_func_cfg_input req = {0}; + int rc; + + __bnxt_hwrm_reserve_pf_rings(bp, &req, tx_rings, rx_rings, ring_grps, + cp_rings, vnics); + if (!req.enables) + return 0; + + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + return -ENOMEM; + + if (bp->hwrm_spec_code < 0x10601) + bp->hw_resc.resv_tx_rings = tx_rings; + + rc = bnxt_hwrm_get_rings(bp); + return rc; +} + +static int +bnxt_hwrm_reserve_vf_rings(struct bnxt *bp, int tx_rings, int rx_rings, + int ring_grps, int cp_rings, int vnics) +{ + struct hwrm_func_vf_cfg_input req = {0}; + int rc; + + if (!(bp->flags & BNXT_FLAG_NEW_RM)) { + bp->hw_resc.resv_tx_rings = tx_rings; + return 0; + } + + __bnxt_hwrm_reserve_vf_rings(bp, &req, tx_rings, rx_rings, ring_grps, + cp_rings, vnics); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + return -ENOMEM; + + rc = bnxt_hwrm_get_rings(bp); + return rc; +} + +static int bnxt_hwrm_reserve_rings(struct bnxt *bp, int tx, int rx, int grp, + int cp, int vnic) +{ + if (BNXT_PF(bp)) + return bnxt_hwrm_reserve_pf_rings(bp, tx, rx, grp, cp, vnic); + else + return bnxt_hwrm_reserve_vf_rings(bp, tx, rx, grp, cp, vnic); +} + +static int bnxt_cp_rings_in_use(struct bnxt *bp) +{ + int cp = bp->cp_nr_rings; + int ulp_msix, ulp_base; + + ulp_msix = bnxt_get_ulp_msix_num(bp); + if (ulp_msix) { + ulp_base = bnxt_get_ulp_msix_base(bp); + cp += ulp_msix; + if ((ulp_base + ulp_msix) > cp) + cp = ulp_base + ulp_msix; + } + return cp; +} + +static bool bnxt_need_reserve_rings(struct bnxt *bp) +{ + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + int cp = bnxt_cp_rings_in_use(bp); + int rx = bp->rx_nr_rings; + int vnic = 1, grp = rx; + + if (bp->hwrm_spec_code < 0x10601) + return false; + + if (hw_resc->resv_tx_rings != bp->tx_nr_rings) + return true; + + if (bp->flags & BNXT_FLAG_RFS) + vnic = rx + 1; + if (bp->flags & BNXT_FLAG_AGG_RINGS) + rx <<= 1; + if ((bp->flags & BNXT_FLAG_NEW_RM) && + (hw_resc->resv_rx_rings != rx || hw_resc->resv_cp_rings != cp || + hw_resc->resv_hw_ring_grps != grp || hw_resc->resv_vnics != vnic)) + return true; + return false; +} + +static int bnxt_trim_rings(struct bnxt *bp, int *rx, int *tx, int max, + bool shared); + +static int __bnxt_reserve_rings(struct bnxt *bp) +{ + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + int cp = bnxt_cp_rings_in_use(bp); + int tx = bp->tx_nr_rings; + int rx = bp->rx_nr_rings; + int grp, rx_rings, rc; + bool sh = false; + int vnic = 1; + + if (!bnxt_need_reserve_rings(bp)) + return 0; + + if (bp->flags & BNXT_FLAG_SHARED_RINGS) + sh = true; + if (bp->flags & BNXT_FLAG_RFS) + vnic = rx + 1; + if (bp->flags & BNXT_FLAG_AGG_RINGS) + rx <<= 1; + grp = bp->rx_nr_rings; + + rc = bnxt_hwrm_reserve_rings(bp, tx, rx, grp, cp, vnic); + if (rc) + return rc; + + tx = hw_resc->resv_tx_rings; + if (bp->flags & BNXT_FLAG_NEW_RM) { + rx = hw_resc->resv_rx_rings; + 
cp = hw_resc->resv_cp_rings; + grp = hw_resc->resv_hw_ring_grps; + vnic = hw_resc->resv_vnics; + } + + rx_rings = rx; + if (bp->flags & BNXT_FLAG_AGG_RINGS) { + if (rx >= 2) { + rx_rings = rx >> 1; + } else { + if (netif_running(bp->dev)) + return -ENOMEM; + + bp->flags &= ~BNXT_FLAG_AGG_RINGS; + bp->flags |= BNXT_FLAG_NO_AGG_RINGS; + bp->dev->hw_features &= ~NETIF_F_LRO; + bp->dev->features &= ~NETIF_F_LRO; + bnxt_set_ring_params(bp); + } + } + rx_rings = min_t(int, rx_rings, grp); + rc = bnxt_trim_rings(bp, &rx_rings, &tx, cp, sh); + if (bp->flags & BNXT_FLAG_AGG_RINGS) + rx = rx_rings << 1; + cp = sh ? max_t(int, tx, rx_rings) : tx + rx_rings; + bp->tx_nr_rings = tx; + bp->rx_nr_rings = rx_rings; + bp->cp_nr_rings = cp; + + if (!tx || !rx || !cp || !grp || !vnic) + return -ENOMEM; + + return rc; +} + +static int bnxt_hwrm_check_vf_rings(struct bnxt *bp, int tx_rings, int rx_rings, + int ring_grps, int cp_rings, int vnics) +{ + struct hwrm_func_vf_cfg_input req = {0}; + u32 flags; + int rc; + + if (!(bp->flags & BNXT_FLAG_NEW_RM)) + return 0; + + __bnxt_hwrm_reserve_vf_rings(bp, &req, tx_rings, rx_rings, ring_grps, + cp_rings, vnics); + flags = FUNC_VF_CFG_REQ_FLAGS_TX_ASSETS_TEST | + FUNC_VF_CFG_REQ_FLAGS_RX_ASSETS_TEST | + FUNC_VF_CFG_REQ_FLAGS_CMPL_ASSETS_TEST | + FUNC_VF_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST | + FUNC_VF_CFG_REQ_FLAGS_STAT_CTX_ASSETS_TEST | + FUNC_VF_CFG_REQ_FLAGS_VNIC_ASSETS_TEST; + + req.flags = cpu_to_le32(flags); + rc = hwrm_send_message_silent(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + return -ENOMEM; + return 0; +} + +static int bnxt_hwrm_check_pf_rings(struct bnxt *bp, int tx_rings, int rx_rings, + int ring_grps, int cp_rings, int vnics) +{ + struct hwrm_func_cfg_input req = {0}; + u32 flags; + int rc; + + __bnxt_hwrm_reserve_pf_rings(bp, &req, tx_rings, rx_rings, ring_grps, + cp_rings, vnics); + flags = FUNC_CFG_REQ_FLAGS_TX_ASSETS_TEST; + if (bp->flags & BNXT_FLAG_NEW_RM) + flags |= FUNC_CFG_REQ_FLAGS_RX_ASSETS_TEST | + FUNC_CFG_REQ_FLAGS_CMPL_ASSETS_TEST | + FUNC_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST | + FUNC_CFG_REQ_FLAGS_STAT_CTX_ASSETS_TEST | + FUNC_CFG_REQ_FLAGS_VNIC_ASSETS_TEST; + + req.flags = cpu_to_le32(flags); + rc = hwrm_send_message_silent(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + return -ENOMEM; + return 0; +} + +static int bnxt_hwrm_check_rings(struct bnxt *bp, int tx_rings, int rx_rings, + int ring_grps, int cp_rings, int vnics) +{ + if (bp->hwrm_spec_code < 0x10801) + return 0; + + if (BNXT_PF(bp)) + return bnxt_hwrm_check_pf_rings(bp, tx_rings, rx_rings, + ring_grps, cp_rings, vnics); + + return bnxt_hwrm_check_vf_rings(bp, tx_rings, rx_rings, ring_grps, + cp_rings, vnics); +} + +static void bnxt_hwrm_set_coal_params(struct bnxt_coal *hw_coal, + struct hwrm_ring_cmpl_ring_cfg_aggint_params_input *req) +{ + u16 val, tmr, max, flags; + + max = hw_coal->bufs_per_record * 128; + if (hw_coal->budget) + max = hw_coal->bufs_per_record * hw_coal->budget; + + val = clamp_t(u16, hw_coal->coal_bufs, 1, max); + req->num_cmpl_aggr_int = cpu_to_le16(val); + + /* This is a 6-bit value and must not be 0, or we'll get non stop IRQ */ + val = min_t(u16, val, 63); + req->num_cmpl_dma_aggr = cpu_to_le16(val); + + /* This is a 6-bit value and must not be 0, or we'll get non stop IRQ */ + val = clamp_t(u16, hw_coal->coal_bufs_irq, 1, 63); + req->num_cmpl_dma_aggr_during_int = cpu_to_le16(val); + + tmr = BNXT_USEC_TO_COAL_TIMER(hw_coal->coal_ticks); + tmr = max_t(u16, tmr, 1); + req->int_lat_tmr_max = cpu_to_le16(tmr); + + /* min timer set to 1/2 
of interrupt timer */ + val = tmr / 2; + req->int_lat_tmr_min = cpu_to_le16(val); + + /* buf timer set to 1/4 of interrupt timer */ + val = max_t(u16, tmr / 4, 1); + req->cmpl_aggr_dma_tmr = cpu_to_le16(val); + + tmr = BNXT_USEC_TO_COAL_TIMER(hw_coal->coal_ticks_irq); + tmr = max_t(u16, tmr, 1); + req->cmpl_aggr_dma_tmr_during_int = cpu_to_le16(tmr); + + flags = RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_FLAGS_TIMER_RESET; + if (hw_coal->idle_thresh && hw_coal->coal_ticks < hw_coal->idle_thresh) + flags |= RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_FLAGS_RING_IDLE; + req->flags = cpu_to_le16(flags); +} + +int bnxt_hwrm_set_ring_coal(struct bnxt *bp, struct bnxt_napi *bnapi) +{ + struct hwrm_ring_cmpl_ring_cfg_aggint_params_input req_rx = {0}; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_coal coal; + unsigned int grp_idx; + + /* Tick values in micro seconds. + * 1 coal_buf x bufs_per_record = 1 completion record. + */ + memcpy(&coal, &bp->rx_coal, sizeof(struct bnxt_coal)); + + coal.coal_ticks = cpr->rx_ring_coal.coal_ticks; + coal.coal_bufs = cpr->rx_ring_coal.coal_bufs; + + if (!bnapi->rx_ring) + return -ENODEV; + + bnxt_hwrm_cmd_hdr_init(bp, &req_rx, + HWRM_RING_CMPL_RING_CFG_AGGINT_PARAMS, -1, -1); + + bnxt_hwrm_set_coal_params(&coal, &req_rx); + + grp_idx = bnapi->index; + req_rx.ring_id = cpu_to_le16(bp->grp_info[grp_idx].cp_fw_ring_id); + + return hwrm_send_message(bp, &req_rx, sizeof(req_rx), + HWRM_CMD_TIMEOUT); +} + +int bnxt_hwrm_set_coal(struct bnxt *bp) +{ + int i, rc = 0; + struct hwrm_ring_cmpl_ring_cfg_aggint_params_input req_rx = {0}, + req_tx = {0}, *req; + + bnxt_hwrm_cmd_hdr_init(bp, &req_rx, + HWRM_RING_CMPL_RING_CFG_AGGINT_PARAMS, -1, -1); + bnxt_hwrm_cmd_hdr_init(bp, &req_tx, + HWRM_RING_CMPL_RING_CFG_AGGINT_PARAMS, -1, -1); + + bnxt_hwrm_set_coal_params(&bp->rx_coal, &req_rx); + bnxt_hwrm_set_coal_params(&bp->tx_coal, &req_tx); + + mutex_lock(&bp->hwrm_cmd_lock); + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + + req = &req_rx; + if (!bnapi->rx_ring) + req = &req_tx; + req->ring_id = cpu_to_le16(bp->grp_info[i].cp_fw_ring_id); + + rc = _hwrm_send_message(bp, req, sizeof(*req), + HWRM_CMD_TIMEOUT); + if (rc) + break; + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_stat_ctx_free(struct bnxt *bp) +{ + int rc = 0, i; + struct hwrm_stat_ctx_free_input req = {0}; + + if (!bp->bnapi) + return 0; + + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_STAT_CTX_FREE, -1, -1); + + mutex_lock(&bp->hwrm_cmd_lock); + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + + if (cpr->hw_stats_ctx_id != INVALID_STATS_CTX_ID) { + req.stat_ctx_id = cpu_to_le32(cpr->hw_stats_ctx_id); + + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + if (rc) + break; + + cpr->hw_stats_ctx_id = INVALID_STATS_CTX_ID; + } + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_stat_ctx_alloc(struct bnxt *bp) +{ + int rc = 0, i; + struct hwrm_stat_ctx_alloc_input req = {0}; + struct hwrm_stat_ctx_alloc_output *resp = bp->hwrm_cmd_resp_addr; + + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_STAT_CTX_ALLOC, -1, -1); + + req.update_period_ms = cpu_to_le32(bp->stats_coal_ticks / 1000); + + mutex_lock(&bp->hwrm_cmd_lock); + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr 
= &bnapi->cp_ring; + + req.stats_dma_addr = cpu_to_le64(cpr->hw_stats_map); + + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + if (rc) + break; + + cpr->hw_stats_ctx_id = le32_to_cpu(resp->stat_ctx_id); + + bp->grp_info[i].fw_stats_ctx = cpr->hw_stats_ctx_id; + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_func_qcfg(struct bnxt *bp) +{ + struct hwrm_func_qcfg_input req = {0}; + struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + u16 flags; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1); + req.fid = cpu_to_le16(0xffff); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + goto func_qcfg_exit; + +#ifdef CONFIG_BNXT_SRIOV + if (BNXT_VF(bp)) { + struct bnxt_vf_info *vf = &bp->vf; + + vf->vlan = le16_to_cpu(resp->vlan) & VLAN_VID_MASK; + } +#endif + flags = le16_to_cpu(resp->flags); + if (flags & (FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED | + FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED)) { + bp->flags |= BNXT_FLAG_FW_LLDP_AGENT; + if (flags & FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED) + bp->flags |= BNXT_FLAG_FW_DCBX_AGENT; + } + if (BNXT_PF(bp) && (flags & FUNC_QCFG_RESP_FLAGS_MULTI_HOST)) + bp->flags |= BNXT_FLAG_MULTI_HOST; + + switch (resp->port_partition_type) { + case FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_0: + case FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_5: + case FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR2_0: + bp->port_partition_type = resp->port_partition_type; + break; + } + if (bp->hwrm_spec_code < 0x10707 || + resp->evb_mode == FUNC_QCFG_RESP_EVB_MODE_VEB) + bp->br_mode = BRIDGE_MODE_VEB; + else if (resp->evb_mode == FUNC_QCFG_RESP_EVB_MODE_VEPA) + bp->br_mode = BRIDGE_MODE_VEPA; + else + bp->br_mode = BRIDGE_MODE_UNDEF; + + bp->max_mtu = le16_to_cpu(resp->max_mtu_configured); + if (!bp->max_mtu) + bp->max_mtu = BNXT_MAX_MTU; + +func_qcfg_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp, bool all) +{ + struct hwrm_func_resource_qcaps_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_func_resource_qcaps_input req = {0}; + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_RESOURCE_QCAPS, -1, -1); + req.fid = cpu_to_le16(0xffff); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) { + rc = -EIO; + goto hwrm_func_resc_qcaps_exit; + } + + hw_resc->max_tx_sch_inputs = le16_to_cpu(resp->max_tx_scheduler_inputs); + if (!all) + goto hwrm_func_resc_qcaps_exit; + + hw_resc->min_rsscos_ctxs = le16_to_cpu(resp->min_rsscos_ctx); + hw_resc->max_rsscos_ctxs = le16_to_cpu(resp->max_rsscos_ctx); + hw_resc->min_cp_rings = le16_to_cpu(resp->min_cmpl_rings); + hw_resc->max_cp_rings = le16_to_cpu(resp->max_cmpl_rings); + hw_resc->min_tx_rings = le16_to_cpu(resp->min_tx_rings); + hw_resc->max_tx_rings = le16_to_cpu(resp->max_tx_rings); + hw_resc->min_rx_rings = le16_to_cpu(resp->min_rx_rings); + hw_resc->max_rx_rings = le16_to_cpu(resp->max_rx_rings); + hw_resc->min_hw_ring_grps = le16_to_cpu(resp->min_hw_ring_grps); + hw_resc->max_hw_ring_grps = le16_to_cpu(resp->max_hw_ring_grps); + hw_resc->min_l2_ctxs = le16_to_cpu(resp->min_l2_ctxs); + hw_resc->max_l2_ctxs = le16_to_cpu(resp->max_l2_ctxs); + hw_resc->min_vnics = le16_to_cpu(resp->min_vnics); + hw_resc->max_vnics = le16_to_cpu(resp->max_vnics); + hw_resc->min_stat_ctxs = le16_to_cpu(resp->min_stat_ctx); + hw_resc->max_stat_ctxs = 
le16_to_cpu(resp->max_stat_ctx); + + if (BNXT_PF(bp)) { + struct bnxt_pf_info *pf = &bp->pf; + + pf->vf_resv_strategy = + le16_to_cpu(resp->vf_reservation_strategy); + if (pf->vf_resv_strategy > BNXT_VF_RESV_STRATEGY_MINIMAL) + pf->vf_resv_strategy = BNXT_VF_RESV_STRATEGY_MAXIMAL; + } +hwrm_func_resc_qcaps_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int __bnxt_hwrm_func_qcaps(struct bnxt *bp) +{ + int rc = 0; + struct hwrm_func_qcaps_input req = {0}; + struct hwrm_func_qcaps_output *resp = bp->hwrm_cmd_resp_addr; + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + u32 flags; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCAPS, -1, -1); + req.fid = cpu_to_le16(0xffff); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + goto hwrm_func_qcaps_exit; + + flags = le32_to_cpu(resp->flags); + if (flags & FUNC_QCAPS_RESP_FLAGS_ROCE_V1_SUPPORTED) + bp->flags |= BNXT_FLAG_ROCEV1_CAP; + if (flags & FUNC_QCAPS_RESP_FLAGS_ROCE_V2_SUPPORTED) + bp->flags |= BNXT_FLAG_ROCEV2_CAP; + + bp->tx_push_thresh = 0; + if (flags & FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED) + bp->tx_push_thresh = BNXT_TX_PUSH_THRESH; + + hw_resc->max_rsscos_ctxs = le16_to_cpu(resp->max_rsscos_ctx); + hw_resc->max_cp_rings = le16_to_cpu(resp->max_cmpl_rings); + hw_resc->max_tx_rings = le16_to_cpu(resp->max_tx_rings); + hw_resc->max_rx_rings = le16_to_cpu(resp->max_rx_rings); + hw_resc->max_hw_ring_grps = le32_to_cpu(resp->max_hw_ring_grps); + if (!hw_resc->max_hw_ring_grps) + hw_resc->max_hw_ring_grps = hw_resc->max_tx_rings; + hw_resc->max_l2_ctxs = le16_to_cpu(resp->max_l2_ctxs); + hw_resc->max_vnics = le16_to_cpu(resp->max_vnics); + hw_resc->max_stat_ctxs = le16_to_cpu(resp->max_stat_ctx); + + if (BNXT_PF(bp)) { + struct bnxt_pf_info *pf = &bp->pf; + + pf->fw_fid = le16_to_cpu(resp->fid); + pf->port_id = le16_to_cpu(resp->port_id); + bp->dev->dev_port = pf->port_id; + memcpy(pf->mac_addr, resp->mac_address, ETH_ALEN); + pf->first_vf_id = le16_to_cpu(resp->first_vf_id); + pf->max_vfs = le16_to_cpu(resp->max_vfs); + pf->max_encap_records = le32_to_cpu(resp->max_encap_records); + pf->max_decap_records = le32_to_cpu(resp->max_decap_records); + pf->max_tx_em_flows = le32_to_cpu(resp->max_tx_em_flows); + pf->max_tx_wm_flows = le32_to_cpu(resp->max_tx_wm_flows); + pf->max_rx_em_flows = le32_to_cpu(resp->max_rx_em_flows); + pf->max_rx_wm_flows = le32_to_cpu(resp->max_rx_wm_flows); + if (flags & FUNC_QCAPS_RESP_FLAGS_WOL_MAGICPKT_SUPPORTED) + bp->flags |= BNXT_FLAG_WOL_CAP; + } else { +#ifdef CONFIG_BNXT_SRIOV + struct bnxt_vf_info *vf = &bp->vf; + + vf->fw_fid = le16_to_cpu(resp->fid); + memcpy(vf->mac_addr, resp->mac_address, ETH_ALEN); +#endif + } + +hwrm_func_qcaps_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_func_qcaps(struct bnxt *bp) +{ + int rc; + + rc = __bnxt_hwrm_func_qcaps(bp); + if (rc) + return rc; + if (bp->hwrm_spec_code >= 0x10803) { + rc = bnxt_hwrm_func_resc_qcaps(bp, true); + if (!rc) + bp->flags |= BNXT_FLAG_NEW_RM; + } + return 0; +} + +static int bnxt_hwrm_func_reset(struct bnxt *bp) +{ + struct hwrm_func_reset_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_RESET, -1, -1); + req.enables = 0; + + return hwrm_send_message(bp, &req, sizeof(req), HWRM_RESET_TIMEOUT); +} + +static int bnxt_hwrm_queue_qportcfg(struct bnxt *bp) +{ + int rc = 0; + struct hwrm_queue_qportcfg_input req = {0}; + struct hwrm_queue_qportcfg_output *resp = bp->hwrm_cmd_resp_addr; + u8 i, *qptr; + + 
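+ /* Query and cache the firmware's configurable (and lossless) queue counts and the per-queue ids/profiles. */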
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_QPORTCFG, -1, -1); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + goto qportcfg_exit; + + if (!resp->max_configurable_queues) { + rc = -EINVAL; + goto qportcfg_exit; + } + bp->max_tc = resp->max_configurable_queues; + bp->max_lltc = resp->max_configurable_lossless_queues; + if (bp->max_tc > BNXT_MAX_QUEUE) + bp->max_tc = BNXT_MAX_QUEUE; + + if (resp->queue_cfg_info & QUEUE_QPORTCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG) + bp->max_tc = 1; + + if (bp->max_lltc > bp->max_tc) + bp->max_lltc = bp->max_tc; + + qptr = &resp->queue_id0; + for (i = 0; i < bp->max_tc; i++) { + bp->q_info[i].queue_id = *qptr++; + bp->q_info[i].queue_profile = *qptr++; + } + +qportcfg_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_ver_get(struct bnxt *bp) +{ + int rc; + struct hwrm_ver_get_input req = {0}; + struct hwrm_ver_get_output *resp = bp->hwrm_cmd_resp_addr; + u32 dev_caps_cfg; + + bp->hwrm_max_req_len = HWRM_MAX_REQ_LEN; + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VER_GET, -1, -1); + req.hwrm_intf_maj = HWRM_VERSION_MAJOR; + req.hwrm_intf_min = HWRM_VERSION_MINOR; + req.hwrm_intf_upd = HWRM_VERSION_UPDATE; + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + goto hwrm_ver_get_exit; + + memcpy(&bp->ver_resp, resp, sizeof(struct hwrm_ver_get_output)); + + bp->hwrm_spec_code = resp->hwrm_intf_maj_8b << 16 | + resp->hwrm_intf_min_8b << 8 | + resp->hwrm_intf_upd_8b; + if (resp->hwrm_intf_maj_8b < 1) { + netdev_warn(bp->dev, "HWRM interface %d.%d.%d is older than 1.0.0.\n", + resp->hwrm_intf_maj_8b, resp->hwrm_intf_min_8b, + resp->hwrm_intf_upd_8b); + netdev_warn(bp->dev, "Please update firmware with HWRM interface 1.0.0 or newer.\n"); + } + snprintf(bp->fw_ver_str, BC_HWRM_STR_LEN, "%d.%d.%d.%d", + resp->hwrm_fw_maj_8b, resp->hwrm_fw_min_8b, + resp->hwrm_fw_bld_8b, resp->hwrm_fw_rsvd_8b); + + bp->hwrm_cmd_timeout = le16_to_cpu(resp->def_req_timeout); + if (!bp->hwrm_cmd_timeout) + bp->hwrm_cmd_timeout = DFLT_HWRM_CMD_TIMEOUT; + + if (resp->hwrm_intf_maj_8b >= 1) + bp->hwrm_max_req_len = le16_to_cpu(resp->max_req_win_len); + + bp->chip_num = le16_to_cpu(resp->chip_num); + if (bp->chip_num == CHIP_NUM_58700 && !resp->chip_rev && + !resp->chip_metal) + bp->flags |= BNXT_FLAG_CHIP_NITRO_A0; + + dev_caps_cfg = le32_to_cpu(resp->dev_caps_cfg); + if ((dev_caps_cfg & VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_SUPPORTED) && + (dev_caps_cfg & VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_REQUIRED)) + bp->flags |= BNXT_FLAG_SHORT_CMD; + +hwrm_ver_get_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +int bnxt_hwrm_fw_set_time(struct bnxt *bp) +{ + struct hwrm_fw_set_time_input req = {0}; + struct tm tm; + time64_t now = ktime_get_real_seconds(); + + if (bp->hwrm_spec_code < 0x10400) + return -EOPNOTSUPP; + + time64_to_tm(now, 0, &tm); + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FW_SET_TIME, -1, -1); + req.year = cpu_to_le16(1900 + tm.tm_year); + req.month = 1 + tm.tm_mon; + req.day = tm.tm_mday; + req.hour = tm.tm_hour; + req.minute = tm.tm_min; + req.second = tm.tm_sec; + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_hwrm_port_qstats(struct bnxt *bp) +{ + int rc; + struct bnxt_pf_info *pf = &bp->pf; + struct hwrm_port_qstats_input req = {0}; + + if (!(bp->flags & BNXT_FLAG_PORT_STATS)) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_QSTATS, -1, -1); + req.port_id = 
cpu_to_le16(pf->port_id); + req.tx_stat_host_addr = cpu_to_le64(bp->hw_tx_port_stats_map); + req.rx_stat_host_addr = cpu_to_le64(bp->hw_rx_port_stats_map); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + return rc; +} + +static int bnxt_hwrm_port_qstats_ext(struct bnxt *bp) +{ + struct hwrm_port_qstats_ext_input req = {0}; + struct bnxt_pf_info *pf = &bp->pf; + + if (!(bp->flags & BNXT_FLAG_PORT_STATS_EXT)) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_QSTATS_EXT, -1, -1); + req.port_id = cpu_to_le16(pf->port_id); + req.rx_stat_size = cpu_to_le16(sizeof(struct rx_port_stats_ext)); + req.rx_stat_host_addr = cpu_to_le64(bp->hw_rx_port_stats_ext_map); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static void bnxt_hwrm_free_tunnel_ports(struct bnxt *bp) +{ + if (bp->vxlan_port_cnt) { + bnxt_hwrm_tunnel_dst_port_free( + bp, TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN); + } + bp->vxlan_port_cnt = 0; + if (bp->nge_port_cnt) { + bnxt_hwrm_tunnel_dst_port_free( + bp, TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE); + } + bp->nge_port_cnt = 0; +} + +static int bnxt_set_tpa(struct bnxt *bp, bool set_tpa) +{ + int rc, i; + u32 tpa_flags = 0; + + if (set_tpa) + tpa_flags = bp->flags & BNXT_FLAG_TPA; + for (i = 0; i < bp->nr_vnics; i++) { + rc = bnxt_hwrm_vnic_set_tpa(bp, i, tpa_flags); + if (rc) { + netdev_err(bp->dev, "hwrm vnic set tpa failure rc for vnic %d: %x\n", + i, rc); + return rc; + } + } + return 0; +} + +static void bnxt_hwrm_clear_vnic_rss(struct bnxt *bp) +{ + int i; + + for (i = 0; i < bp->nr_vnics; i++) + bnxt_hwrm_vnic_set_rss(bp, i, false); +} + +static void bnxt_hwrm_resource_free(struct bnxt *bp, bool close_path, + bool irq_re_init) +{ + if (bp->vnic_info) { + bnxt_hwrm_clear_vnic_filter(bp); + /* clear all RSS setting before free vnic ctx */ + bnxt_hwrm_clear_vnic_rss(bp); + bnxt_hwrm_vnic_ctx_free(bp); + /* before free the vnic, undo the vnic tpa settings */ + if (bp->flags & BNXT_FLAG_TPA) + bnxt_set_tpa(bp, false); + bnxt_hwrm_vnic_free(bp); + } + bnxt_hwrm_ring_free(bp, close_path); + bnxt_hwrm_ring_grp_free(bp); + if (irq_re_init) { + bnxt_hwrm_stat_ctx_free(bp); + bnxt_hwrm_free_tunnel_ports(bp); + } +} + +static int bnxt_hwrm_set_br_mode(struct bnxt *bp, u16 br_mode) +{ + struct hwrm_func_cfg_input req = {0}; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); + req.fid = cpu_to_le16(0xffff); + req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_EVB_MODE); + if (br_mode == BRIDGE_MODE_VEB) + req.evb_mode = FUNC_CFG_REQ_EVB_MODE_VEB; + else if (br_mode == BRIDGE_MODE_VEPA) + req.evb_mode = FUNC_CFG_REQ_EVB_MODE_VEPA; + else + return -EINVAL; + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + rc = -EIO; + return rc; +} + +static int bnxt_hwrm_set_cache_line_size(struct bnxt *bp, int size) +{ + struct hwrm_func_cfg_input req = {0}; + int rc; + + if (BNXT_VF(bp) || bp->hwrm_spec_code < 0x10803) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); + req.fid = cpu_to_le16(0xffff); + req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_CACHE_LINESIZE); + req.options = FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_64; + if (size == 128) + req.options = FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_128; + + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + rc = -EIO; + return rc; +} + +static int bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id) +{ + struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; + int rc; + + if (vnic->flags & 
BNXT_VNIC_RFS_NEW_RSS_FLAG) + goto skip_rss_ctx; + + /* allocate context for vnic */ + rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic_id, 0); + if (rc) { + netdev_err(bp->dev, "hwrm vnic %d alloc failure rc: %x\n", + vnic_id, rc); + goto vnic_setup_err; + } + bp->rsscos_nr_ctxs++; + + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) { + rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic_id, 1); + if (rc) { + netdev_err(bp->dev, "hwrm vnic %d cos ctx alloc failure rc: %x\n", + vnic_id, rc); + goto vnic_setup_err; + } + bp->rsscos_nr_ctxs++; + } + +skip_rss_ctx: + /* configure default vnic, ring grp */ + rc = bnxt_hwrm_vnic_cfg(bp, vnic_id); + if (rc) { + netdev_err(bp->dev, "hwrm vnic %d cfg failure rc: %x\n", + vnic_id, rc); + goto vnic_setup_err; + } + + /* Enable RSS hashing on vnic */ + rc = bnxt_hwrm_vnic_set_rss(bp, vnic_id, true); + if (rc) { + netdev_err(bp->dev, "hwrm vnic %d set rss failure rc: %x\n", + vnic_id, rc); + goto vnic_setup_err; + } + + if (bp->flags & BNXT_FLAG_AGG_RINGS) { + rc = bnxt_hwrm_vnic_set_hds(bp, vnic_id); + if (rc) { + netdev_err(bp->dev, "hwrm vnic %d set hds failure rc: %x\n", + vnic_id, rc); + } + } + +vnic_setup_err: + return rc; +} + +static int bnxt_alloc_rfs_vnics(struct bnxt *bp) +{ +#ifdef CONFIG_RFS_ACCEL + int i, rc = 0; + + for (i = 0; i < bp->rx_nr_rings; i++) { + struct bnxt_vnic_info *vnic; + u16 vnic_id = i + 1; + u16 ring_id = i; + + if (vnic_id >= bp->nr_vnics) + break; + + vnic = &bp->vnic_info[vnic_id]; + vnic->flags |= BNXT_VNIC_RFS_FLAG; + if (bp->flags & BNXT_FLAG_NEW_RSS_CAP) + vnic->flags |= BNXT_VNIC_RFS_NEW_RSS_FLAG; + rc = bnxt_hwrm_vnic_alloc(bp, vnic_id, ring_id, 1); + if (rc) { + netdev_err(bp->dev, "hwrm vnic %d alloc failure rc: %x\n", + vnic_id, rc); + break; + } + rc = bnxt_setup_vnic(bp, vnic_id); + if (rc) + break; + } + return rc; +#else + return 0; +#endif +} + +/* Allow PF and VF with default VLAN to be in promiscuous mode */ +static bool bnxt_promisc_ok(struct bnxt *bp) +{ +#ifdef CONFIG_BNXT_SRIOV + if (BNXT_VF(bp) && !bp->vf.vlan) + return false; +#endif + return true; +} + +static int bnxt_setup_nitroa0_vnic(struct bnxt *bp) +{ + unsigned int rc = 0; + + rc = bnxt_hwrm_vnic_alloc(bp, 1, bp->rx_nr_rings - 1, 1); + if (rc) { + netdev_err(bp->dev, "Cannot allocate special vnic for NS2 A0: %x\n", + rc); + return rc; + } + + rc = bnxt_hwrm_vnic_cfg(bp, 1); + if (rc) { + netdev_err(bp->dev, "Cannot allocate special vnic for NS2 A0: %x\n", + rc); + return rc; + } + return rc; +} + +static int bnxt_cfg_rx_mode(struct bnxt *); +static bool bnxt_mc_list_updated(struct bnxt *, u32 *); + +static int bnxt_init_chip(struct bnxt *bp, bool irq_re_init) +{ + struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + int rc = 0; + unsigned int rx_nr_rings = bp->rx_nr_rings; + + if (irq_re_init) { + rc = bnxt_hwrm_stat_ctx_alloc(bp); + if (rc) { + netdev_err(bp->dev, "hwrm stat ctx alloc failure rc: %x\n", + rc); + goto err_out; + } + } + + rc = bnxt_hwrm_ring_alloc(bp); + if (rc) { + netdev_err(bp->dev, "hwrm ring alloc failure rc: %x\n", rc); + goto err_out; + } + + rc = bnxt_hwrm_ring_grp_alloc(bp); + if (rc) { + netdev_err(bp->dev, "hwrm_ring_grp alloc failure: %x\n", rc); + goto err_out; + } + + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) + rx_nr_rings--; + + /* default vnic 0 */ + rc = bnxt_hwrm_vnic_alloc(bp, 0, 0, rx_nr_rings); + if (rc) { + netdev_err(bp->dev, "hwrm vnic alloc failure rc: %x\n", rc); + goto err_out; + } + + rc = bnxt_setup_vnic(bp, 0); + if (rc) + goto err_out; + + if (bp->flags & BNXT_FLAG_RFS) { + rc = bnxt_alloc_rfs_vnics(bp); + if (rc) + goto err_out; + } + 
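+ /* TPA (packet aggregation) is configured per vnic, so it is applied only after all vnics have been allocated and set up above. */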
+ if (bp->flags & BNXT_FLAG_TPA) { + rc = bnxt_set_tpa(bp, true); + if (rc) + goto err_out; + } + + if (BNXT_VF(bp)) + bnxt_update_vf_mac(bp); + + /* Filter for default vnic 0 */ + rc = bnxt_hwrm_set_vnic_filter(bp, 0, 0, bp->dev->dev_addr); + if (rc) { + netdev_err(bp->dev, "HWRM vnic filter failure rc: %x\n", rc); + goto err_out; + } + vnic->uc_filter_count = 1; + + vnic->rx_mask = CFA_L2_SET_RX_MASK_REQ_MASK_BCAST; + + if ((bp->dev->flags & IFF_PROMISC) && bnxt_promisc_ok(bp)) + vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS; + + if (bp->dev->flags & IFF_ALLMULTI) { + vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST; + vnic->mc_list_count = 0; + } else { + u32 mask = 0; + + bnxt_mc_list_updated(bp, &mask); + vnic->rx_mask |= mask; + } + + rc = bnxt_cfg_rx_mode(bp); + if (rc) + goto err_out; + + rc = bnxt_hwrm_set_coal(bp); + if (rc) + netdev_warn(bp->dev, "HWRM set coalescing failure rc: %x\n", + rc); + + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) { + rc = bnxt_setup_nitroa0_vnic(bp); + if (rc) + netdev_err(bp->dev, "Special vnic setup failure for NS2 A0 rc: %x\n", + rc); + } + + if (BNXT_VF(bp)) { + bnxt_hwrm_func_qcfg(bp); + netdev_update_features(bp->dev); + } + + return 0; + +err_out: + bnxt_hwrm_resource_free(bp, 0, true); + + return rc; +} + +static int bnxt_shutdown_nic(struct bnxt *bp, bool irq_re_init) +{ + bnxt_hwrm_resource_free(bp, 1, irq_re_init); + return 0; +} + +static int bnxt_init_nic(struct bnxt *bp, bool irq_re_init) +{ + bnxt_init_cp_rings(bp); + bnxt_init_rx_rings(bp); + bnxt_init_tx_rings(bp); + bnxt_init_ring_grps(bp, irq_re_init); + bnxt_init_vnics(bp); + + return bnxt_init_chip(bp, irq_re_init); +} + +static int bnxt_set_real_num_queues(struct bnxt *bp) +{ + int rc; + struct net_device *dev = bp->dev; + + rc = netif_set_real_num_tx_queues(dev, bp->tx_nr_rings - + bp->tx_nr_rings_xdp); + if (rc) + return rc; + + rc = netif_set_real_num_rx_queues(dev, bp->rx_nr_rings); + if (rc) + return rc; + +#ifdef CONFIG_RFS_ACCEL + if (bp->flags & BNXT_FLAG_RFS) + dev->rx_cpu_rmap = alloc_irq_cpu_rmap(bp->rx_nr_rings); +#endif + + return rc; +} + +static int bnxt_trim_rings(struct bnxt *bp, int *rx, int *tx, int max, + bool shared) +{ + int _rx = *rx, _tx = *tx; + + if (shared) { + *rx = min_t(int, _rx, max); + *tx = min_t(int, _tx, max); + } else { + if (max < 2) + return -ENOMEM; + + while (_rx + _tx > max) { + if (_rx > _tx && _rx > 1) + _rx--; + else if (_tx > 1) + _tx--; + } + *rx = _rx; + *tx = _tx; + } + return 0; +} + +static void bnxt_setup_msix(struct bnxt *bp) +{ + const int len = sizeof(bp->irq_tbl[0].name); + struct net_device *dev = bp->dev; + int tcs, i; + + tcs = netdev_get_num_tc(dev); + if (tcs > 1) { + int i, off, count; + + for (i = 0; i < tcs; i++) { + count = bp->tx_nr_rings_per_tc; + off = i * count; + netdev_set_tc_queue(dev, i, count, off); + } + } + + for (i = 0; i < bp->cp_nr_rings; i++) { + int map_idx = bnxt_cp_num_to_irq_num(bp, i); + char *attr; + + if (bp->flags & BNXT_FLAG_SHARED_RINGS) + attr = "TxRx"; + else if (i < bp->rx_nr_rings) + attr = "rx"; + else + attr = "tx"; + + snprintf(bp->irq_tbl[map_idx].name, len, "%s-%s-%d", dev->name, + attr, i); + bp->irq_tbl[map_idx].handler = bnxt_msix; + } +} + +static void bnxt_setup_inta(struct bnxt *bp) +{ + const int len = sizeof(bp->irq_tbl[0].name); + + if (netdev_get_num_tc(bp->dev)) + netdev_reset_tc(bp->dev); + + snprintf(bp->irq_tbl[0].name, len, "%s-%s-%d", bp->dev->name, "TxRx", + 0); + bp->irq_tbl[0].handler = bnxt_inta; +} + +static int bnxt_setup_int_mode(struct bnxt *bp) +{ + int 
rc; + + if (bp->flags & BNXT_FLAG_USING_MSIX) + bnxt_setup_msix(bp); + else + bnxt_setup_inta(bp); + + rc = bnxt_set_real_num_queues(bp); + return rc; +} + +#ifdef CONFIG_RFS_ACCEL +static unsigned int bnxt_get_max_func_rss_ctxs(struct bnxt *bp) +{ + return bp->hw_resc.max_rsscos_ctxs; +} + +static unsigned int bnxt_get_max_func_vnics(struct bnxt *bp) +{ + return bp->hw_resc.max_vnics; +} +#endif + +unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp) +{ + return bp->hw_resc.max_stat_ctxs; +} + +void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max) +{ + bp->hw_resc.max_stat_ctxs = max; +} + +unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp) +{ + return bp->hw_resc.max_cp_rings; +} + +void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max) +{ + bp->hw_resc.max_cp_rings = max; +} + +unsigned int bnxt_get_max_func_irqs(struct bnxt *bp) +{ + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + + return min_t(unsigned int, hw_resc->max_irqs, hw_resc->max_cp_rings); +} + +void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs) +{ + bp->hw_resc.max_irqs = max_irqs; +} + +int bnxt_get_avail_msix(struct bnxt *bp, int num) +{ + int max_cp = bnxt_get_max_func_cp_rings(bp); + int max_irq = bnxt_get_max_func_irqs(bp); + int total_req = bp->cp_nr_rings + num; + int max_idx, avail_msix; + + max_idx = min_t(int, bp->total_irqs, max_cp); + avail_msix = max_idx - bp->cp_nr_rings; + if (!(bp->flags & BNXT_FLAG_NEW_RM) || avail_msix >= num) + return avail_msix; + + if (max_irq < total_req) { + num = max_irq - bp->cp_nr_rings; + if (num <= 0) + return 0; + } + return num; +} + +static int bnxt_get_num_msix(struct bnxt *bp) +{ + if (!(bp->flags & BNXT_FLAG_NEW_RM)) + return bnxt_get_max_func_irqs(bp); + + return bnxt_cp_rings_in_use(bp); +} + +static int bnxt_init_msix(struct bnxt *bp) +{ + int i, total_vecs, max, rc = 0, min = 1, ulp_msix; + struct msix_entry *msix_ent; + + total_vecs = bnxt_get_num_msix(bp); + max = bnxt_get_max_func_irqs(bp); + if (total_vecs > max) + total_vecs = max; + + msix_ent = kcalloc(total_vecs, sizeof(struct msix_entry), GFP_KERNEL); + if (!msix_ent) + return -ENOMEM; + + for (i = 0; i < total_vecs; i++) { + msix_ent[i].entry = i; + msix_ent[i].vector = 0; + } + + if (!(bp->flags & BNXT_FLAG_SHARED_RINGS)) + min = 2; + + total_vecs = pci_enable_msix_range(bp->pdev, msix_ent, min, total_vecs); + ulp_msix = bnxt_get_ulp_msix_num(bp); + if (total_vecs < 0 || total_vecs < ulp_msix) { + rc = -ENODEV; + goto msix_setup_exit; + } + + bp->irq_tbl = kcalloc(total_vecs, sizeof(struct bnxt_irq), GFP_KERNEL); + if (bp->irq_tbl) { + for (i = 0; i < total_vecs; i++) + bp->irq_tbl[i].vector = msix_ent[i].vector; + + bp->total_irqs = total_vecs; + /* Trim rings based upon num of vectors allocated */ + rc = bnxt_trim_rings(bp, &bp->rx_nr_rings, &bp->tx_nr_rings, + total_vecs - ulp_msix, min == 1); + if (rc) + goto msix_setup_exit; + + bp->cp_nr_rings = (min == 1) ? 
+ max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) : + bp->tx_nr_rings + bp->rx_nr_rings; + + } else { + rc = -ENOMEM; + goto msix_setup_exit; + } + bp->flags |= BNXT_FLAG_USING_MSIX; + kfree(msix_ent); + return 0; + +msix_setup_exit: + netdev_err(bp->dev, "bnxt_init_msix err: %x\n", rc); + kfree(bp->irq_tbl); + bp->irq_tbl = NULL; + pci_disable_msix(bp->pdev); + kfree(msix_ent); + return rc; +} + +static int bnxt_init_inta(struct bnxt *bp) +{ + bp->irq_tbl = kcalloc(1, sizeof(struct bnxt_irq), GFP_KERNEL); + if (!bp->irq_tbl) + return -ENOMEM; + + bp->total_irqs = 1; + bp->rx_nr_rings = 1; + bp->tx_nr_rings = 1; + bp->cp_nr_rings = 1; + bp->flags |= BNXT_FLAG_SHARED_RINGS; + bp->irq_tbl[0].vector = bp->pdev->irq; + return 0; +} + +static int bnxt_init_int_mode(struct bnxt *bp) +{ + int rc = 0; + + if (bp->flags & BNXT_FLAG_MSIX_CAP) + rc = bnxt_init_msix(bp); + + if (!(bp->flags & BNXT_FLAG_USING_MSIX) && BNXT_PF(bp)) { + /* fallback to INTA */ + rc = bnxt_init_inta(bp); + } + return rc; +} + +static void bnxt_clear_int_mode(struct bnxt *bp) +{ + if (bp->flags & BNXT_FLAG_USING_MSIX) + pci_disable_msix(bp->pdev); + + kfree(bp->irq_tbl); + bp->irq_tbl = NULL; + bp->flags &= ~BNXT_FLAG_USING_MSIX; +} + +int bnxt_reserve_rings(struct bnxt *bp) +{ + int tcs = netdev_get_num_tc(bp->dev); + int rc; + + if (!bnxt_need_reserve_rings(bp)) + return 0; + + rc = __bnxt_reserve_rings(bp); + if (rc) { + netdev_err(bp->dev, "ring reservation failure rc: %d\n", rc); + return rc; + } + if ((bp->flags & BNXT_FLAG_NEW_RM) && + (bnxt_get_num_msix(bp) != bp->total_irqs)) { + bnxt_ulp_irq_stop(bp); + bnxt_clear_int_mode(bp); + rc = bnxt_init_int_mode(bp); + bnxt_ulp_irq_restart(bp, rc); + if (rc) + return rc; + } + if (tcs && (bp->tx_nr_rings_per_tc * tcs != bp->tx_nr_rings)) { + netdev_err(bp->dev, "tx ring reservation failure\n"); + netdev_reset_tc(bp->dev); + bp->tx_nr_rings_per_tc = bp->tx_nr_rings; + return -ENOMEM; + } + bp->num_stat_ctxs = bp->cp_nr_rings; + return 0; +} + +static void bnxt_free_irq(struct bnxt *bp) +{ + struct bnxt_irq *irq; + int i; + +#ifdef CONFIG_RFS_ACCEL + free_irq_cpu_rmap(bp->dev->rx_cpu_rmap); + bp->dev->rx_cpu_rmap = NULL; +#endif + if (!bp->irq_tbl || !bp->bnapi) + return; + + for (i = 0; i < bp->cp_nr_rings; i++) { + int map_idx = bnxt_cp_num_to_irq_num(bp, i); + + irq = &bp->irq_tbl[map_idx]; + if (irq->requested) { + if (irq->have_cpumask) { + irq_set_affinity_hint(irq->vector, NULL); + free_cpumask_var(irq->cpu_mask); + irq->have_cpumask = 0; + } + free_irq(irq->vector, bp->bnapi[i]); + } + + irq->requested = 0; + } +} + +static int bnxt_request_irq(struct bnxt *bp) +{ + int i, j, rc = 0; + unsigned long flags = 0; +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap; +#endif + + rc = bnxt_setup_int_mode(bp); + if (rc) { + netdev_err(bp->dev, "bnxt_setup_int_mode err: %x\n", + rc); + return rc; + } +#ifdef CONFIG_RFS_ACCEL + rmap = bp->dev->rx_cpu_rmap; +#endif + if (!(bp->flags & BNXT_FLAG_USING_MSIX)) + flags = IRQF_SHARED; + + for (i = 0, j = 0; i < bp->cp_nr_rings; i++) { + int map_idx = bnxt_cp_num_to_irq_num(bp, i); + struct bnxt_irq *irq = &bp->irq_tbl[map_idx]; + +#ifdef CONFIG_RFS_ACCEL + if (rmap && bp->bnapi[i]->rx_ring) { + rc = irq_cpu_rmap_add(rmap, irq->vector); + if (rc) + netdev_warn(bp->dev, "failed adding irq rmap for ring %d\n", + j); + j++; + } +#endif + rc = request_irq(irq->vector, irq->handler, flags, irq->name, + bp->bnapi[i]); + if (rc) + break; + + irq->requested = 1; + + if (zalloc_cpumask_var(&irq->cpu_mask, GFP_KERNEL)) { + int numa_node = 
dev_to_node(&bp->pdev->dev); + + irq->have_cpumask = 1; + cpumask_set_cpu(cpumask_local_spread(i, numa_node), + irq->cpu_mask); + rc = irq_set_affinity_hint(irq->vector, irq->cpu_mask); + if (rc) { + netdev_warn(bp->dev, + "Set affinity failed, IRQ = %d\n", + irq->vector); + break; + } + } + } + return rc; +} + +static void bnxt_del_napi(struct bnxt *bp) +{ + int i; + + if (!bp->bnapi) + return; + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + + napi_hash_del(&bnapi->napi); + netif_napi_del(&bnapi->napi); + } + /* We called napi_hash_del() before netif_napi_del(), we need + * to respect an RCU grace period before freeing napi structures. + */ + synchronize_net(); +} + +static void bnxt_init_napi(struct bnxt *bp) +{ + int i; + unsigned int cp_nr_rings = bp->cp_nr_rings; + struct bnxt_napi *bnapi; + + if (bp->flags & BNXT_FLAG_USING_MSIX) { + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) + cp_nr_rings--; + for (i = 0; i < cp_nr_rings; i++) { + bnapi = bp->bnapi[i]; + netif_napi_add(bp->dev, &bnapi->napi, + bnxt_poll, 64); + } + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) { + bnapi = bp->bnapi[cp_nr_rings]; + netif_napi_add(bp->dev, &bnapi->napi, + bnxt_poll_nitroa0, 64); + } + } else { + bnapi = bp->bnapi[0]; + netif_napi_add(bp->dev, &bnapi->napi, bnxt_poll, 64); + } +} + +static void bnxt_disable_napi(struct bnxt *bp) +{ + int i; + + if (!bp->bnapi) + return; + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring; + + if (bp->bnapi[i]->rx_ring) + cancel_work_sync(&cpr->dim.work); + + napi_disable(&bp->bnapi[i]->napi); + } +} + +static void bnxt_enable_napi(struct bnxt *bp) +{ + int i; + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring; + bp->bnapi[i]->in_reset = false; + + if (bp->bnapi[i]->rx_ring) { + INIT_WORK(&cpr->dim.work, bnxt_dim_work); + cpr->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE; + } + napi_enable(&bp->bnapi[i]->napi); + } +} + +void bnxt_tx_disable(struct bnxt *bp) +{ + int i; + struct bnxt_tx_ring_info *txr; + + if (bp->tx_ring) { + for (i = 0; i < bp->tx_nr_rings; i++) { + txr = &bp->tx_ring[i]; + txr->dev_state = BNXT_DEV_STATE_CLOSING; + } + } + /* Stop all TX queues */ + netif_tx_disable(bp->dev); + netif_carrier_off(bp->dev); +} + +void bnxt_tx_enable(struct bnxt *bp) +{ + int i; + struct bnxt_tx_ring_info *txr; + + for (i = 0; i < bp->tx_nr_rings; i++) { + txr = &bp->tx_ring[i]; + txr->dev_state = 0; + } + netif_tx_wake_all_queues(bp->dev); + if (bp->link_info.link_up) + netif_carrier_on(bp->dev); +} + +static void bnxt_report_link(struct bnxt *bp) +{ + if (bp->link_info.link_up) { + const char *duplex; + const char *flow_ctrl; + u32 speed; + u16 fec; + + netif_carrier_on(bp->dev); + if (bp->link_info.duplex == BNXT_LINK_DUPLEX_FULL) + duplex = "full"; + else + duplex = "half"; + if (bp->link_info.pause == BNXT_LINK_PAUSE_BOTH) + flow_ctrl = "ON - receive & transmit"; + else if (bp->link_info.pause == BNXT_LINK_PAUSE_TX) + flow_ctrl = "ON - transmit"; + else if (bp->link_info.pause == BNXT_LINK_PAUSE_RX) + flow_ctrl = "ON - receive"; + else + flow_ctrl = "none"; + speed = bnxt_fw_to_ethtool_speed(bp->link_info.link_speed); + netdev_info(bp->dev, "NIC Link is Up, %u Mbps %s duplex, Flow control: %s\n", + speed, duplex, flow_ctrl); + if (bp->flags & BNXT_FLAG_EEE_CAP) + netdev_info(bp->dev, "EEE is %s\n", + bp->eee.eee_active ? 
"active" : + "not active"); + fec = bp->link_info.fec_cfg; + if (!(fec & PORT_PHY_QCFG_RESP_FEC_CFG_FEC_NONE_SUPPORTED)) + netdev_info(bp->dev, "FEC autoneg %s encodings: %s\n", + (fec & BNXT_FEC_AUTONEG) ? "on" : "off", + (fec & BNXT_FEC_ENC_BASE_R) ? "BaseR" : + (fec & BNXT_FEC_ENC_RS) ? "RS" : "None"); + } else { + netif_carrier_off(bp->dev); + netdev_err(bp->dev, "NIC Link is Down\n"); + } +} + +static int bnxt_hwrm_phy_qcaps(struct bnxt *bp) +{ + int rc = 0; + struct hwrm_port_phy_qcaps_input req = {0}; + struct hwrm_port_phy_qcaps_output *resp = bp->hwrm_cmd_resp_addr; + struct bnxt_link_info *link_info = &bp->link_info; + + if (bp->hwrm_spec_code < 0x10201) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_QCAPS, -1, -1); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + goto hwrm_phy_qcaps_exit; + + if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED) { + struct ethtool_eee *eee = &bp->eee; + u16 fw_speeds = le16_to_cpu(resp->supported_speeds_eee_mode); + + bp->flags |= BNXT_FLAG_EEE_CAP; + eee->supported = _bnxt_fw_to_ethtool_adv_spds(fw_speeds, 0); + bp->lpi_tmr_lo = le32_to_cpu(resp->tx_lpi_timer_low) & + PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_LOW_MASK; + bp->lpi_tmr_hi = le32_to_cpu(resp->valid_tx_lpi_timer_high) & + PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_MASK; + } + if (resp->supported_speeds_auto_mode) + link_info->support_auto_speeds = + le16_to_cpu(resp->supported_speeds_auto_mode); + + bp->port_count = resp->port_cnt; + +hwrm_phy_qcaps_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_update_link(struct bnxt *bp, bool chng_link_state) +{ + int rc = 0; + struct bnxt_link_info *link_info = &bp->link_info; + struct hwrm_port_phy_qcfg_input req = {0}; + struct hwrm_port_phy_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + u8 link_up = link_info->link_up; + u16 diff; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_QCFG, -1, -1); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) { + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; + } + + memcpy(&link_info->phy_qcfg_resp, resp, sizeof(*resp)); + link_info->phy_link_status = resp->link; + link_info->duplex = resp->duplex_cfg; + if (bp->hwrm_spec_code >= 0x10800) + link_info->duplex = resp->duplex_state; + link_info->pause = resp->pause; + link_info->auto_mode = resp->auto_mode; + link_info->auto_pause_setting = resp->auto_pause; + link_info->lp_pause = resp->link_partner_adv_pause; + link_info->force_pause_setting = resp->force_pause; + link_info->duplex_setting = resp->duplex_cfg; + if (link_info->phy_link_status == BNXT_LINK_LINK) + link_info->link_speed = le16_to_cpu(resp->link_speed); + else + link_info->link_speed = 0; + link_info->force_link_speed = le16_to_cpu(resp->force_link_speed); + link_info->support_speeds = le16_to_cpu(resp->support_speeds); + link_info->auto_link_speeds = le16_to_cpu(resp->auto_link_speed_mask); + link_info->lp_auto_link_speeds = + le16_to_cpu(resp->link_partner_adv_speeds); + link_info->preemphasis = le32_to_cpu(resp->preemphasis); + link_info->phy_ver[0] = resp->phy_maj; + link_info->phy_ver[1] = resp->phy_min; + link_info->phy_ver[2] = resp->phy_bld; + link_info->media_type = resp->media_type; + link_info->phy_type = resp->phy_type; + link_info->transceiver = resp->xcvr_pkg_type; + link_info->phy_addr = resp->eee_config_phy_addr & + PORT_PHY_QCFG_RESP_PHY_ADDR_MASK; + link_info->module_status = resp->module_status; + + if 
(bp->flags & BNXT_FLAG_EEE_CAP) { + struct ethtool_eee *eee = &bp->eee; + u16 fw_speeds; + + eee->eee_active = 0; + if (resp->eee_config_phy_addr & + PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_ACTIVE) { + eee->eee_active = 1; + fw_speeds = le16_to_cpu( + resp->link_partner_adv_eee_link_speed_mask); + eee->lp_advertised = + _bnxt_fw_to_ethtool_adv_spds(fw_speeds, 0); + } + + /* Pull initial EEE config */ + if (!chng_link_state) { + if (resp->eee_config_phy_addr & + PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_ENABLED) + eee->eee_enabled = 1; + + fw_speeds = le16_to_cpu(resp->adv_eee_link_speed_mask); + eee->advertised = + _bnxt_fw_to_ethtool_adv_spds(fw_speeds, 0); + + if (resp->eee_config_phy_addr & + PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_TX_LPI) { + __le32 tmr; + + eee->tx_lpi_enabled = 1; + tmr = resp->xcvr_identifier_type_tx_lpi_timer; + eee->tx_lpi_timer = le32_to_cpu(tmr) & + PORT_PHY_QCFG_RESP_TX_LPI_TIMER_MASK; + } + } + } + + link_info->fec_cfg = PORT_PHY_QCFG_RESP_FEC_CFG_FEC_NONE_SUPPORTED; + if (bp->hwrm_spec_code >= 0x10504) + link_info->fec_cfg = le16_to_cpu(resp->fec_cfg); + + /* TODO: need to add more logic to report VF link */ + if (chng_link_state) { + if (link_info->phy_link_status == BNXT_LINK_LINK) + link_info->link_up = 1; + else + link_info->link_up = 0; + if (link_up != link_info->link_up) + bnxt_report_link(bp); + } else { + /* always link down if not required to update link state */ + link_info->link_up = 0; + } + mutex_unlock(&bp->hwrm_cmd_lock); + + diff = link_info->support_auto_speeds ^ link_info->advertising; + if ((link_info->support_auto_speeds | diff) != + link_info->support_auto_speeds) { + /* An advertised speed is no longer supported, so we need to + * update the advertisement settings. Caller holds RTNL + * so we can modify link settings.
+ */ + link_info->advertising = link_info->support_auto_speeds; + if (link_info->autoneg & BNXT_AUTONEG_SPEED) + bnxt_hwrm_set_link_setting(bp, true, false); + } + return 0; +} + +static void bnxt_get_port_module_status(struct bnxt *bp) +{ + struct bnxt_link_info *link_info = &bp->link_info; + struct hwrm_port_phy_qcfg_output *resp = &link_info->phy_qcfg_resp; + u8 module_status; + + if (bnxt_update_link(bp, true)) + return; + + module_status = link_info->module_status; + switch (module_status) { + case PORT_PHY_QCFG_RESP_MODULE_STATUS_DISABLETX: + case PORT_PHY_QCFG_RESP_MODULE_STATUS_PWRDOWN: + case PORT_PHY_QCFG_RESP_MODULE_STATUS_WARNINGMSG: + netdev_warn(bp->dev, "Unqualified SFP+ module detected on port %d\n", + bp->pf.port_id); + if (bp->hwrm_spec_code >= 0x10201) { + netdev_warn(bp->dev, "Module part number %s\n", + resp->phy_vendor_partnumber); + } + if (module_status == PORT_PHY_QCFG_RESP_MODULE_STATUS_DISABLETX) + netdev_warn(bp->dev, "TX is disabled\n"); + if (module_status == PORT_PHY_QCFG_RESP_MODULE_STATUS_PWRDOWN) + netdev_warn(bp->dev, "SFP+ module is shutdown\n"); + } +} + +static void +bnxt_hwrm_set_pause_common(struct bnxt *bp, struct hwrm_port_phy_cfg_input *req) +{ + if (bp->link_info.autoneg & BNXT_AUTONEG_FLOW_CTRL) { + if (bp->hwrm_spec_code >= 0x10201) + req->auto_pause = + PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE; + if (bp->link_info.req_flow_ctrl & BNXT_LINK_PAUSE_RX) + req->auto_pause |= PORT_PHY_CFG_REQ_AUTO_PAUSE_RX; + if (bp->link_info.req_flow_ctrl & BNXT_LINK_PAUSE_TX) + req->auto_pause |= PORT_PHY_CFG_REQ_AUTO_PAUSE_TX; + req->enables |= + cpu_to_le32(PORT_PHY_CFG_REQ_ENABLES_AUTO_PAUSE); + } else { + if (bp->link_info.req_flow_ctrl & BNXT_LINK_PAUSE_RX) + req->force_pause |= PORT_PHY_CFG_REQ_FORCE_PAUSE_RX; + if (bp->link_info.req_flow_ctrl & BNXT_LINK_PAUSE_TX) + req->force_pause |= PORT_PHY_CFG_REQ_FORCE_PAUSE_TX; + req->enables |= + cpu_to_le32(PORT_PHY_CFG_REQ_ENABLES_FORCE_PAUSE); + if (bp->hwrm_spec_code >= 0x10201) { + req->auto_pause = req->force_pause; + req->enables |= cpu_to_le32( + PORT_PHY_CFG_REQ_ENABLES_AUTO_PAUSE); + } + } +} + +static void bnxt_hwrm_set_link_common(struct bnxt *bp, + struct hwrm_port_phy_cfg_input *req) +{ + u8 autoneg = bp->link_info.autoneg; + u16 fw_link_speed = bp->link_info.req_link_speed; + u16 advertising = bp->link_info.advertising; + + if (autoneg & BNXT_AUTONEG_SPEED) { + req->auto_mode |= + PORT_PHY_CFG_REQ_AUTO_MODE_SPEED_MASK; + + req->enables |= cpu_to_le32( + PORT_PHY_CFG_REQ_ENABLES_AUTO_LINK_SPEED_MASK); + req->auto_link_speed_mask = cpu_to_le16(advertising); + + req->enables |= cpu_to_le32(PORT_PHY_CFG_REQ_ENABLES_AUTO_MODE); + req->flags |= + cpu_to_le32(PORT_PHY_CFG_REQ_FLAGS_RESTART_AUTONEG); + } else { + req->force_link_speed = cpu_to_le16(fw_link_speed); + req->flags |= cpu_to_le32(PORT_PHY_CFG_REQ_FLAGS_FORCE); + } + + /* tell chimp that the setting takes effect immediately */ + req->flags |= cpu_to_le32(PORT_PHY_CFG_REQ_FLAGS_RESET_PHY); +} + +int bnxt_hwrm_set_pause(struct bnxt *bp) +{ + struct hwrm_port_phy_cfg_input req = {0}; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_CFG, -1, -1); + bnxt_hwrm_set_pause_common(bp, &req); + + if ((bp->link_info.autoneg & BNXT_AUTONEG_FLOW_CTRL) || + bp->link_info.force_link_chng) + bnxt_hwrm_set_link_common(bp, &req); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc && !(bp->link_info.autoneg & BNXT_AUTONEG_FLOW_CTRL)) { + /* since changing of pause setting doesn't trigger any 
link + * change event, the driver needs to update the current pause + * result upon successfully return of the phy_cfg command + */ + bp->link_info.pause = + bp->link_info.force_pause_setting = bp->link_info.req_flow_ctrl; + bp->link_info.auto_pause_setting = 0; + if (!bp->link_info.force_link_chng) + bnxt_report_link(bp); + } + bp->link_info.force_link_chng = false; + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static void bnxt_hwrm_set_eee(struct bnxt *bp, + struct hwrm_port_phy_cfg_input *req) +{ + struct ethtool_eee *eee = &bp->eee; + + if (eee->eee_enabled) { + u16 eee_speeds; + u32 flags = PORT_PHY_CFG_REQ_FLAGS_EEE_ENABLE; + + if (eee->tx_lpi_enabled) + flags |= PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_ENABLE; + else + flags |= PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_DISABLE; + + req->flags |= cpu_to_le32(flags); + eee_speeds = bnxt_get_fw_auto_link_speeds(eee->advertised); + req->eee_link_speed_mask = cpu_to_le16(eee_speeds); + req->tx_lpi_timer = cpu_to_le32(eee->tx_lpi_timer); + } else { + req->flags |= cpu_to_le32(PORT_PHY_CFG_REQ_FLAGS_EEE_DISABLE); + } +} + +int bnxt_hwrm_set_link_setting(struct bnxt *bp, bool set_pause, bool set_eee) +{ + struct hwrm_port_phy_cfg_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_CFG, -1, -1); + if (set_pause) + bnxt_hwrm_set_pause_common(bp, &req); + + bnxt_hwrm_set_link_common(bp, &req); + + if (set_eee) + bnxt_hwrm_set_eee(bp, &req); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_hwrm_shutdown_link(struct bnxt *bp) +{ + struct hwrm_port_phy_cfg_input req = {0}; + + if (!BNXT_SINGLE_PF(bp)) + return 0; + + if (pci_num_vf(bp->pdev)) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_CFG, -1, -1); + req.flags = cpu_to_le32(PORT_PHY_CFG_REQ_FLAGS_FORCE_LINK_DWN); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_hwrm_port_led_qcaps(struct bnxt *bp) +{ + struct hwrm_port_led_qcaps_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_port_led_qcaps_input req = {0}; + struct bnxt_pf_info *pf = &bp->pf; + int rc; + + if (BNXT_VF(bp) || bp->hwrm_spec_code < 0x10601) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_LED_QCAPS, -1, -1); + req.port_id = cpu_to_le16(pf->port_id); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) { + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; + } + if (resp->num_leds > 0 && resp->num_leds < BNXT_MAX_LED) { + int i; + + bp->num_leds = resp->num_leds; + memcpy(bp->leds, &resp->led0_id, sizeof(bp->leds[0]) * + bp->num_leds); + for (i = 0; i < bp->num_leds; i++) { + struct bnxt_led_info *led = &bp->leds[i]; + __le16 caps = led->led_state_caps; + + if (!led->led_group_id || + !BNXT_LED_ALT_BLINK_CAP(caps)) { + bp->num_leds = 0; + break; + } + } + } + mutex_unlock(&bp->hwrm_cmd_lock); + return 0; +} + +int bnxt_hwrm_alloc_wol_fltr(struct bnxt *bp) +{ + struct hwrm_wol_filter_alloc_input req = {0}; + struct hwrm_wol_filter_alloc_output *resp = bp->hwrm_cmd_resp_addr; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_WOL_FILTER_ALLOC, -1, -1); + req.port_id = cpu_to_le16(bp->pf.port_id); + req.wol_type = WOL_FILTER_ALLOC_REQ_WOL_TYPE_MAGICPKT; + req.enables = cpu_to_le32(WOL_FILTER_ALLOC_REQ_ENABLES_MAC_ADDRESS); + memcpy(req.mac_address, bp->dev->dev_addr, ETH_ALEN); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) + bp->wol_filter_id = resp->wol_filter_id; + 
mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +int bnxt_hwrm_free_wol_fltr(struct bnxt *bp) +{ + struct hwrm_wol_filter_free_input req = {0}; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_WOL_FILTER_FREE, -1, -1); + req.port_id = cpu_to_le16(bp->pf.port_id); + req.enables = cpu_to_le32(WOL_FILTER_FREE_REQ_ENABLES_WOL_FILTER_ID); + req.wol_filter_id = bp->wol_filter_id; + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + return rc; +} + +static u16 bnxt_hwrm_get_wol_fltrs(struct bnxt *bp, u16 handle) +{ + struct hwrm_wol_filter_qcfg_input req = {0}; + struct hwrm_wol_filter_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + u16 next_handle = 0; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_WOL_FILTER_QCFG, -1, -1); + req.port_id = cpu_to_le16(bp->pf.port_id); + req.handle = cpu_to_le16(handle); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + next_handle = le16_to_cpu(resp->next_handle); + if (next_handle != 0) { + if (resp->wol_type == + WOL_FILTER_ALLOC_REQ_WOL_TYPE_MAGICPKT) { + bp->wol = 1; + bp->wol_filter_id = resp->wol_filter_id; + } + } + } + mutex_unlock(&bp->hwrm_cmd_lock); + return next_handle; +} + +static void bnxt_get_wol_settings(struct bnxt *bp) +{ + u16 handle = 0; + + if (!BNXT_PF(bp) || !(bp->flags & BNXT_FLAG_WOL_CAP)) + return; + + do { + handle = bnxt_hwrm_get_wol_fltrs(bp, handle); + } while (handle && handle != 0xffff); +} + +static bool bnxt_eee_config_ok(struct bnxt *bp) +{ + struct ethtool_eee *eee = &bp->eee; + struct bnxt_link_info *link_info = &bp->link_info; + + if (!(bp->flags & BNXT_FLAG_EEE_CAP)) + return true; + + if (eee->eee_enabled) { + u32 advertising = + _bnxt_fw_to_ethtool_adv_spds(link_info->advertising, 0); + + if (!(link_info->autoneg & BNXT_AUTONEG_SPEED)) { + eee->eee_enabled = 0; + return false; + } + if (eee->advertised & ~advertising) { + eee->advertised = advertising & eee->supported; + return false; + } + } + return true; +} + +static int bnxt_update_phy_setting(struct bnxt *bp) +{ + int rc; + bool update_link = false; + bool update_pause = false; + bool update_eee = false; + struct bnxt_link_info *link_info = &bp->link_info; + + rc = bnxt_update_link(bp, true); + if (rc) { + netdev_err(bp->dev, "failed to update link (rc: %x)\n", + rc); + return rc; + } + if (!BNXT_SINGLE_PF(bp)) + return 0; + + if ((link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) && + (link_info->auto_pause_setting & BNXT_LINK_PAUSE_BOTH) != + link_info->req_flow_ctrl) + update_pause = true; + if (!(link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) && + link_info->force_pause_setting != link_info->req_flow_ctrl) + update_pause = true; + if (!(link_info->autoneg & BNXT_AUTONEG_SPEED)) { + if (BNXT_AUTO_MODE(link_info->auto_mode)) + update_link = true; + if (link_info->req_link_speed != link_info->force_link_speed) + update_link = true; + if (link_info->req_duplex != link_info->duplex_setting) + update_link = true; + } else { + if (link_info->auto_mode == BNXT_LINK_AUTO_NONE) + update_link = true; + if (link_info->advertising != link_info->auto_link_speeds) + update_link = true; + } + + /* The last close may have shutdown the link, so need to call + * PHY_CFG to bring it back up. 
+ */ + if (!netif_carrier_ok(bp->dev)) + update_link = true; + + if (!bnxt_eee_config_ok(bp)) + update_eee = true; + + if (update_link) + rc = bnxt_hwrm_set_link_setting(bp, update_pause, update_eee); + else if (update_pause) + rc = bnxt_hwrm_set_pause(bp); + if (rc) { + netdev_err(bp->dev, "failed to update phy setting (rc: %x)\n", + rc); + return rc; + } + + return rc; +} + +/* Common routine to pre-map certain register block to different GRC window. + * A PF has 16 4K windows and a VF has 4 4K windows. However, only 15 windows + * in PF and 3 windows in VF that can be customized to map in different + * register blocks. + */ +static void bnxt_preset_reg_win(struct bnxt *bp) +{ + if (BNXT_PF(bp)) { + /* CAG registers map to GRC window #4 */ + writel(BNXT_CAG_REG_BASE, + bp->bar0 + BNXT_GRCPF_REG_WINDOW_BASE_OUT + 12); + } +} + +static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) +{ + int rc = 0; + + bnxt_preset_reg_win(bp); + netif_carrier_off(bp->dev); + if (irq_re_init) { + rc = bnxt_reserve_rings(bp); + if (rc) + return rc; + } + if ((bp->flags & BNXT_FLAG_RFS) && + !(bp->flags & BNXT_FLAG_USING_MSIX)) { + /* disable RFS if falling back to INTA */ + bp->dev->hw_features &= ~NETIF_F_NTUPLE; + bp->flags &= ~BNXT_FLAG_RFS; + } + + rc = bnxt_alloc_mem(bp, irq_re_init); + if (rc) { + netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc); + goto open_err_free_mem; + } + + if (irq_re_init) { + bnxt_init_napi(bp); + rc = bnxt_request_irq(bp); + if (rc) { + netdev_err(bp->dev, "bnxt_request_irq err: %x\n", rc); + goto open_err; + } + } + + bnxt_enable_napi(bp); + + rc = bnxt_init_nic(bp, irq_re_init); + if (rc) { + netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc); + goto open_err; + } + + if (link_re_init) { + mutex_lock(&bp->link_lock); + rc = bnxt_update_phy_setting(bp); + mutex_unlock(&bp->link_lock); + if (rc) + netdev_warn(bp->dev, "failed to update phy settings\n"); + } + + if (irq_re_init) + udp_tunnel_get_rx_info(bp->dev); + + set_bit(BNXT_STATE_OPEN, &bp->state); + bnxt_enable_int(bp); + /* Enable TX queues */ + bnxt_tx_enable(bp); + mod_timer(&bp->timer, jiffies + bp->current_interval); + /* Poll link status and check for SFP+ module status */ + bnxt_get_port_module_status(bp); + + /* VF-reps may need to be re-opened after the PF is re-opened */ + if (BNXT_PF(bp)) + bnxt_vf_reps_open(bp); + return 0; + +open_err: + bnxt_disable_napi(bp); + bnxt_del_napi(bp); + +open_err_free_mem: + bnxt_free_skbs(bp); + bnxt_free_irq(bp); + bnxt_free_mem(bp, true); + return rc; +} + +/* rtnl_lock held */ +int bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) +{ + int rc = 0; + + rc = __bnxt_open_nic(bp, irq_re_init, link_re_init); + if (rc) { + netdev_err(bp->dev, "nic open fail (rc: %x)\n", rc); + dev_close(bp->dev); + } + return rc; +} + +/* rtnl_lock held, open the NIC half way by allocating all resources, but + * NAPI, IRQ, and TX are not enabled. This is mainly used for offline + * self tests. + */ +int bnxt_half_open_nic(struct bnxt *bp) +{ + int rc = 0; + + rc = bnxt_alloc_mem(bp, false); + if (rc) { + netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc); + goto half_open_err; + } + rc = bnxt_init_nic(bp, false); + if (rc) { + netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc); + goto half_open_err; + } + return 0; + +half_open_err: + bnxt_free_skbs(bp); + bnxt_free_mem(bp, false); + dev_close(bp->dev); + return rc; +} + +/* rtnl_lock held, this call can only be made after a previous successful + * call to bnxt_half_open_nic(). 
+ */ +void bnxt_half_close_nic(struct bnxt *bp) +{ + bnxt_hwrm_resource_free(bp, false, false); + bnxt_free_skbs(bp); + bnxt_free_mem(bp, false); +} + +static int bnxt_open(struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + + return __bnxt_open_nic(bp, true, true); +} + +static bool bnxt_drv_busy(struct bnxt *bp) +{ + return (test_bit(BNXT_STATE_IN_SP_TASK, &bp->state) || + test_bit(BNXT_STATE_READ_STATS, &bp->state)); +} + +static void __bnxt_close_nic(struct bnxt *bp, bool irq_re_init, + bool link_re_init) +{ + /* Close the VF-reps before closing PF */ + if (BNXT_PF(bp)) + bnxt_vf_reps_close(bp); + + /* Change device state to avoid TX queue wake-ups */ + bnxt_tx_disable(bp); + + clear_bit(BNXT_STATE_OPEN, &bp->state); + smp_mb__after_atomic(); + while (bnxt_drv_busy(bp)) + msleep(20); + + /* Flush rings and disable interrupts */ + bnxt_shutdown_nic(bp, irq_re_init); + + /* TODO CHIMP_FW: Link/PHY related cleanup if (link_re_init) */ + + bnxt_disable_napi(bp); + del_timer_sync(&bp->timer); + bnxt_free_skbs(bp); + + if (irq_re_init) { + bnxt_free_irq(bp); + bnxt_del_napi(bp); + } + bnxt_free_mem(bp, irq_re_init); +} + +int bnxt_close_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) +{ + int rc = 0; + +#ifdef CONFIG_BNXT_SRIOV + if (bp->sriov_cfg) { + rc = wait_event_interruptible_timeout(bp->sriov_cfg_wait, + !bp->sriov_cfg, + BNXT_SRIOV_CFG_WAIT_TMO); + if (rc) + netdev_warn(bp->dev, "timeout waiting for SRIOV config operation to complete!\n"); + } +#endif + __bnxt_close_nic(bp, irq_re_init, link_re_init); + return rc; +} + +static int bnxt_close(struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + + bnxt_close_nic(bp, true, true); + bnxt_hwrm_shutdown_link(bp); + return 0; +} + +/* rtnl_lock held */ +static int bnxt_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + switch (cmd) { + case SIOCGMIIPHY: + /* fallthru */ + case SIOCGMIIREG: { + if (!netif_running(dev)) + return -EAGAIN; + + return 0; + } + + case SIOCSMIIREG: + if (!netif_running(dev)) + return -EAGAIN; + + return 0; + + default: + /* do nothing */ + break; + } + return -EOPNOTSUPP; +} + +static void +bnxt_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + u32 i; + struct bnxt *bp = netdev_priv(dev); + + set_bit(BNXT_STATE_READ_STATS, &bp->state); + /* Make sure bnxt_close_nic() sees that we are reading stats before + * we check the BNXT_STATE_OPEN flag.
+ */ + smp_mb__after_atomic(); + if (!test_bit(BNXT_STATE_OPEN, &bp->state)) { + clear_bit(BNXT_STATE_READ_STATS, &bp->state); + return; + } + + /* TODO check if we need to synchronize with bnxt_close path */ + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct ctx_hw_stats *hw_stats = cpr->hw_stats; + + stats->rx_packets += le64_to_cpu(hw_stats->rx_ucast_pkts); + stats->rx_packets += le64_to_cpu(hw_stats->rx_mcast_pkts); + stats->rx_packets += le64_to_cpu(hw_stats->rx_bcast_pkts); + + stats->tx_packets += le64_to_cpu(hw_stats->tx_ucast_pkts); + stats->tx_packets += le64_to_cpu(hw_stats->tx_mcast_pkts); + stats->tx_packets += le64_to_cpu(hw_stats->tx_bcast_pkts); + + stats->rx_bytes += le64_to_cpu(hw_stats->rx_ucast_bytes); + stats->rx_bytes += le64_to_cpu(hw_stats->rx_mcast_bytes); + stats->rx_bytes += le64_to_cpu(hw_stats->rx_bcast_bytes); + + stats->tx_bytes += le64_to_cpu(hw_stats->tx_ucast_bytes); + stats->tx_bytes += le64_to_cpu(hw_stats->tx_mcast_bytes); + stats->tx_bytes += le64_to_cpu(hw_stats->tx_bcast_bytes); + + stats->rx_missed_errors += + le64_to_cpu(hw_stats->rx_discard_pkts); + + stats->multicast += le64_to_cpu(hw_stats->rx_mcast_pkts); + + stats->tx_dropped += le64_to_cpu(hw_stats->tx_drop_pkts); + } + + if (bp->flags & BNXT_FLAG_PORT_STATS) { + struct rx_port_stats *rx = bp->hw_rx_port_stats; + struct tx_port_stats *tx = bp->hw_tx_port_stats; + + stats->rx_crc_errors = le64_to_cpu(rx->rx_fcs_err_frames); + stats->rx_frame_errors = le64_to_cpu(rx->rx_align_err_frames); + stats->rx_length_errors = le64_to_cpu(rx->rx_undrsz_frames) + + le64_to_cpu(rx->rx_ovrsz_frames) + + le64_to_cpu(rx->rx_runt_frames); + stats->rx_errors = le64_to_cpu(rx->rx_false_carrier_frames) + + le64_to_cpu(rx->rx_jbr_frames); + stats->collisions = le64_to_cpu(tx->tx_total_collisions); + stats->tx_fifo_errors = le64_to_cpu(tx->tx_fifo_underruns); + stats->tx_errors = le64_to_cpu(tx->tx_err); + } + clear_bit(BNXT_STATE_READ_STATS, &bp->state); +} + +static bool bnxt_mc_list_updated(struct bnxt *bp, u32 *rx_mask) +{ + struct net_device *dev = bp->dev; + struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + struct netdev_hw_addr *ha; + u8 *haddr; + int mc_count = 0; + bool update = false; + int off = 0; + + netdev_for_each_mc_addr(ha, dev) { + if (mc_count >= BNXT_MAX_MC_ADDRS) { + *rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST; + vnic->mc_list_count = 0; + return false; + } + haddr = ha->addr; + if (!ether_addr_equal(haddr, vnic->mc_list + off)) { + memcpy(vnic->mc_list + off, haddr, ETH_ALEN); + update = true; + } + off += ETH_ALEN; + mc_count++; + } + if (mc_count) + *rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_MCAST; + + if (mc_count != vnic->mc_list_count) { + vnic->mc_list_count = mc_count; + update = true; + } + return update; +} + +static bool bnxt_uc_list_updated(struct bnxt *bp) +{ + struct net_device *dev = bp->dev; + struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + struct netdev_hw_addr *ha; + int off = 0; + + if (netdev_uc_count(dev) != (vnic->uc_filter_count - 1)) + return true; + + netdev_for_each_uc_addr(ha, dev) { + if (!ether_addr_equal(ha->addr, vnic->uc_list + off)) + return true; + + off += ETH_ALEN; + } + return false; +} + +static void bnxt_set_rx_mode(struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + u32 mask = vnic->rx_mask; + bool mc_update = false; + bool uc_update; + + if (!netif_running(dev)) + return; + + mask &= 
~(CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS | + CFA_L2_SET_RX_MASK_REQ_MASK_MCAST | + CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST); + + if ((dev->flags & IFF_PROMISC) && bnxt_promisc_ok(bp)) + mask |= CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS; + + uc_update = bnxt_uc_list_updated(bp); + + if (dev->flags & IFF_ALLMULTI) { + mask |= CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST; + vnic->mc_list_count = 0; + } else { + mc_update = bnxt_mc_list_updated(bp, &mask); + } + + if (mask != vnic->rx_mask || uc_update || mc_update) { + vnic->rx_mask = mask; + + set_bit(BNXT_RX_MASK_SP_EVENT, &bp->sp_event); + bnxt_queue_sp_work(bp); + } +} + +static int bnxt_cfg_rx_mode(struct bnxt *bp) +{ + struct net_device *dev = bp->dev; + struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + struct netdev_hw_addr *ha; + int i, off = 0, rc; + bool uc_update; + + netif_addr_lock_bh(dev); + uc_update = bnxt_uc_list_updated(bp); + netif_addr_unlock_bh(dev); + + if (!uc_update) + goto skip_uc; + + mutex_lock(&bp->hwrm_cmd_lock); + for (i = 1; i < vnic->uc_filter_count; i++) { + struct hwrm_cfa_l2_filter_free_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_L2_FILTER_FREE, -1, + -1); + + req.l2_filter_id = vnic->fw_l2_filter_id[i]; + + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + } + mutex_unlock(&bp->hwrm_cmd_lock); + + vnic->uc_filter_count = 1; + + netif_addr_lock_bh(dev); + if (netdev_uc_count(dev) > (BNXT_MAX_UC_ADDRS - 1)) { + vnic->rx_mask |= CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS; + } else { + netdev_for_each_uc_addr(ha, dev) { + memcpy(vnic->uc_list + off, ha->addr, ETH_ALEN); + off += ETH_ALEN; + vnic->uc_filter_count++; + } + } + netif_addr_unlock_bh(dev); + + for (i = 1, off = 0; i < vnic->uc_filter_count; i++, off += ETH_ALEN) { + rc = bnxt_hwrm_set_vnic_filter(bp, 0, i, vnic->uc_list + off); + if (rc) { + netdev_err(bp->dev, "HWRM vnic filter failure rc: %x\n", + rc); + vnic->uc_filter_count = i; + return rc; + } + } + +skip_uc: + rc = bnxt_hwrm_cfa_l2_set_rx_mask(bp, 0); + if (rc) + netdev_err(bp->dev, "HWRM cfa l2 rx mask failure rc: %x\n", + rc); + + return rc; +} + +/* If the chip and firmware supports RFS */ +static bool bnxt_rfs_supported(struct bnxt *bp) +{ + if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp)) + return true; + if (bp->flags & BNXT_FLAG_NEW_RSS_CAP) + return true; + return false; +} + +/* If runtime conditions support RFS */ +static bool bnxt_rfs_capable(struct bnxt *bp) +{ +#ifdef CONFIG_RFS_ACCEL + int vnics, max_vnics, max_rss_ctxs; + + if (!(bp->flags & BNXT_FLAG_MSIX_CAP)) + return false; + + vnics = 1 + bp->rx_nr_rings; + max_vnics = bnxt_get_max_func_vnics(bp); + max_rss_ctxs = bnxt_get_max_func_rss_ctxs(bp); + + /* RSS contexts not a limiting factor */ + if (bp->flags & BNXT_FLAG_NEW_RSS_CAP) + max_rss_ctxs = max_vnics; + if (vnics > max_vnics || vnics > max_rss_ctxs) { + if (bp->rx_nr_rings > 1) + netdev_warn(bp->dev, + "Not enough resources to support NTUPLE filters, enough resources for up to %d rx rings\n", + min(max_rss_ctxs - 1, max_vnics - 1)); + return false; + } + + if (!(bp->flags & BNXT_FLAG_NEW_RM)) + return true; + + if (vnics == bp->hw_resc.resv_vnics) + return true; + + bnxt_hwrm_reserve_rings(bp, 0, 0, 0, 0, vnics); + if (vnics <= bp->hw_resc.resv_vnics) + return true; + + netdev_warn(bp->dev, "Unable to reserve resources to support NTUPLE filters.\n"); + bnxt_hwrm_reserve_rings(bp, 0, 0, 0, 0, 1); + return false; +#else + return false; +#endif +} + +static netdev_features_t bnxt_fix_features(struct net_device *dev, + netdev_features_t 
features) +{ + struct bnxt *bp = netdev_priv(dev); + + if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp)) + features &= ~NETIF_F_NTUPLE; + + if (bp->flags & BNXT_FLAG_NO_AGG_RINGS) + features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW); + + if (!(features & NETIF_F_GRO)) + features &= ~NETIF_F_GRO_HW; + + if (features & NETIF_F_GRO_HW) + features &= ~NETIF_F_LRO; + + /* Both CTAG and STAG VLAN acceleration on the RX side have to be + * turned on or off together. + */ + if ((features & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX)) != + (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX)) { + if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) + features &= ~(NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_STAG_RX); + else + features |= NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_STAG_RX; + } +#ifdef CONFIG_BNXT_SRIOV + if (BNXT_VF(bp)) { + if (bp->vf.vlan) { + features &= ~(NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_STAG_RX); + } + } +#endif + return features; +} + +static int bnxt_set_features(struct net_device *dev, netdev_features_t features) +{ + struct bnxt *bp = netdev_priv(dev); + u32 flags = bp->flags; + u32 changes; + int rc = 0; + bool re_init = false; + bool update_tpa = false; + + flags &= ~BNXT_FLAG_ALL_CONFIG_FEATS; + if (features & NETIF_F_GRO_HW) + flags |= BNXT_FLAG_GRO; + else if (features & NETIF_F_LRO) + flags |= BNXT_FLAG_LRO; + + if (bp->flags & BNXT_FLAG_NO_AGG_RINGS) + flags &= ~BNXT_FLAG_TPA; + + if (features & NETIF_F_HW_VLAN_CTAG_RX) + flags |= BNXT_FLAG_STRIP_VLAN; + + if (features & NETIF_F_NTUPLE) + flags |= BNXT_FLAG_RFS; + + changes = flags ^ bp->flags; + if (changes & BNXT_FLAG_TPA) { + update_tpa = true; + if ((bp->flags & BNXT_FLAG_TPA) == 0 || + (flags & BNXT_FLAG_TPA) == 0) + re_init = true; + } + + if (changes & ~BNXT_FLAG_TPA) + re_init = true; + + if (flags != bp->flags) { + u32 old_flags = bp->flags; + + bp->flags = flags; + + if (!test_bit(BNXT_STATE_OPEN, &bp->state)) { + if (update_tpa) + bnxt_set_ring_params(bp); + return rc; + } + + if (re_init) { + bnxt_close_nic(bp, false, false); + if (update_tpa) + bnxt_set_ring_params(bp); + + return bnxt_open_nic(bp, false, false); + } + if (update_tpa) { + rc = bnxt_set_tpa(bp, + (flags & BNXT_FLAG_TPA) ?
+ true : false); + if (rc) + bp->flags = old_flags; + } + } + return rc; +} + +static void bnxt_dump_tx_sw_state(struct bnxt_napi *bnapi) +{ + struct bnxt_tx_ring_info *txr = bnapi->tx_ring; + int i = bnapi->index; + + if (!txr) + return; + + netdev_info(bnapi->bp->dev, "[%d]: tx{fw_ring: %d prod: %x cons: %x}\n", + i, txr->tx_ring_struct.fw_ring_id, txr->tx_prod, + txr->tx_cons); +} + +static void bnxt_dump_rx_sw_state(struct bnxt_napi *bnapi) +{ + struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; + int i = bnapi->index; + + if (!rxr) + return; + + netdev_info(bnapi->bp->dev, "[%d]: rx{fw_ring: %d prod: %x} rx_agg{fw_ring: %d agg_prod: %x sw_agg_prod: %x}\n", + i, rxr->rx_ring_struct.fw_ring_id, rxr->rx_prod, + rxr->rx_agg_ring_struct.fw_ring_id, rxr->rx_agg_prod, + rxr->rx_sw_agg_prod); +} + +static void bnxt_dump_cp_sw_state(struct bnxt_napi *bnapi) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + int i = bnapi->index; + + netdev_info(bnapi->bp->dev, "[%d]: cp{fw_ring: %d raw_cons: %x}\n", + i, cpr->cp_ring_struct.fw_ring_id, cpr->cp_raw_cons); +} + +static void bnxt_dbg_dump_states(struct bnxt *bp) +{ + int i; + struct bnxt_napi *bnapi; + + for (i = 0; i < bp->cp_nr_rings; i++) { + bnapi = bp->bnapi[i]; + if (netif_msg_drv(bp)) { + bnxt_dump_tx_sw_state(bnapi); + bnxt_dump_rx_sw_state(bnapi); + bnxt_dump_cp_sw_state(bnapi); + } + } +} + +static void bnxt_reset_task(struct bnxt *bp, bool silent) +{ + if (!silent) + bnxt_dbg_dump_states(bp); + if (netif_running(bp->dev)) { + int rc; + + if (!silent) + bnxt_ulp_stop(bp); + bnxt_close_nic(bp, false, false); + rc = bnxt_open_nic(bp, false, false); + if (!silent && !rc) + bnxt_ulp_start(bp); + } +} + +static void bnxt_tx_timeout(struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + + netdev_err(bp->dev, "TX timeout detected, starting reset task!\n"); + set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event); + bnxt_queue_sp_work(bp); +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void bnxt_poll_controller(struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + int i; + + /* Only process tx rings/combined rings in netpoll mode. */ + for (i = 0; i < bp->tx_nr_rings; i++) { + struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; + + napi_schedule(&txr->bnapi->napi); + } +} +#endif + +static void bnxt_timer(struct timer_list *t) +{ + struct bnxt *bp = from_timer(bp, t, timer); + struct net_device *dev = bp->dev; + + if (!netif_running(dev)) + return; + + if (atomic_read(&bp->intr_sem) != 0) + goto bnxt_restart_timer; + + if (bp->link_info.link_up && (bp->flags & BNXT_FLAG_PORT_STATS) && + bp->stats_coal_ticks) { + set_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event); + bnxt_queue_sp_work(bp); + } + + if (bnxt_tc_flower_enabled(bp)) { + set_bit(BNXT_FLOW_STATS_SP_EVENT, &bp->sp_event); + bnxt_queue_sp_work(bp); + } +bnxt_restart_timer: + mod_timer(&bp->timer, jiffies + bp->current_interval); +} + +static void bnxt_rtnl_lock_sp(struct bnxt *bp) +{ + /* We are called from bnxt_sp_task which has BNXT_STATE_IN_SP_TASK + * set. If the device is being closed, bnxt_close() may be holding + * rtnl() and waiting for BNXT_STATE_IN_SP_TASK to clear. So we + * must clear BNXT_STATE_IN_SP_TASK before holding rtnl(). 
+ */ + clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state); + rtnl_lock(); +} + +static void bnxt_rtnl_unlock_sp(struct bnxt *bp) +{ + set_bit(BNXT_STATE_IN_SP_TASK, &bp->state); + rtnl_unlock(); +} + +/* Only called from bnxt_sp_task() */ +static void bnxt_reset(struct bnxt *bp, bool silent) +{ + bnxt_rtnl_lock_sp(bp); + if (test_bit(BNXT_STATE_OPEN, &bp->state)) + bnxt_reset_task(bp, silent); + bnxt_rtnl_unlock_sp(bp); +} + +static void bnxt_cfg_ntp_filters(struct bnxt *); + +static void bnxt_sp_task(struct work_struct *work) +{ + struct bnxt *bp = container_of(work, struct bnxt, sp_task); + + set_bit(BNXT_STATE_IN_SP_TASK, &bp->state); + smp_mb__after_atomic(); + if (!test_bit(BNXT_STATE_OPEN, &bp->state)) { + clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state); + return; + } + + if (test_and_clear_bit(BNXT_RX_MASK_SP_EVENT, &bp->sp_event)) + bnxt_cfg_rx_mode(bp); + + if (test_and_clear_bit(BNXT_RX_NTP_FLTR_SP_EVENT, &bp->sp_event)) + bnxt_cfg_ntp_filters(bp); + if (test_and_clear_bit(BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT, &bp->sp_event)) + bnxt_hwrm_exec_fwd_req(bp); + if (test_and_clear_bit(BNXT_VXLAN_ADD_PORT_SP_EVENT, &bp->sp_event)) { + bnxt_hwrm_tunnel_dst_port_alloc( + bp, bp->vxlan_port, + TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN); + } + if (test_and_clear_bit(BNXT_VXLAN_DEL_PORT_SP_EVENT, &bp->sp_event)) { + bnxt_hwrm_tunnel_dst_port_free( + bp, TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN); + } + if (test_and_clear_bit(BNXT_GENEVE_ADD_PORT_SP_EVENT, &bp->sp_event)) { + bnxt_hwrm_tunnel_dst_port_alloc( + bp, bp->nge_port, + TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE); + } + if (test_and_clear_bit(BNXT_GENEVE_DEL_PORT_SP_EVENT, &bp->sp_event)) { + bnxt_hwrm_tunnel_dst_port_free( + bp, TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE); + } + if (test_and_clear_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event)) { + bnxt_hwrm_port_qstats(bp); + bnxt_hwrm_port_qstats_ext(bp); + } + + if (test_and_clear_bit(BNXT_LINK_CHNG_SP_EVENT, &bp->sp_event)) { + int rc; + + mutex_lock(&bp->link_lock); + if (test_and_clear_bit(BNXT_LINK_SPEED_CHNG_SP_EVENT, + &bp->sp_event)) + bnxt_hwrm_phy_qcaps(bp); + + rc = bnxt_update_link(bp, true); + mutex_unlock(&bp->link_lock); + if (rc) + netdev_err(bp->dev, "SP task can't update link (rc: %x)\n", + rc); + } + if (test_and_clear_bit(BNXT_HWRM_PORT_MODULE_SP_EVENT, &bp->sp_event)) { + mutex_lock(&bp->link_lock); + bnxt_get_port_module_status(bp); + mutex_unlock(&bp->link_lock); + } + + if (test_and_clear_bit(BNXT_FLOW_STATS_SP_EVENT, &bp->sp_event)) + bnxt_tc_flow_stats_work(bp); + + /* These functions below will clear BNXT_STATE_IN_SP_TASK. They + * must be the last functions to be called before exiting. + */ + if (test_and_clear_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event)) + bnxt_reset(bp, false); + + if (test_and_clear_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event)) + bnxt_reset(bp, true); + + smp_mb__before_atomic(); + clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state); +} + +/* Under rtnl_lock */ +int bnxt_check_rings(struct bnxt *bp, int tx, int rx, bool sh, int tcs, + int tx_xdp) +{ + int max_rx, max_tx, tx_sets = 1; + int tx_rings_needed; + int rx_rings = rx; + int cp, vnics, rc; + + if (tcs) + tx_sets = tcs; + + rc = bnxt_get_max_rings(bp, &max_rx, &max_tx, sh); + if (rc) + return rc; + + if (max_rx < rx) + return -ENOMEM; + + tx_rings_needed = tx * tx_sets + tx_xdp; + if (max_tx < tx_rings_needed) + return -ENOMEM; + + vnics = 1; + if (bp->flags & BNXT_FLAG_RFS) + vnics += rx_rings; + + if (bp->flags & BNXT_FLAG_AGG_RINGS) + rx_rings <<= 1; + cp = sh ? 
max_t(int, tx_rings_needed, rx) : tx_rings_needed + rx; + if (bp->flags & BNXT_FLAG_NEW_RM) + cp += bnxt_get_ulp_msix_num(bp); + return bnxt_hwrm_check_rings(bp, tx_rings_needed, rx_rings, rx, cp, + vnics); +} + +static void bnxt_unmap_bars(struct bnxt *bp, struct pci_dev *pdev) +{ + if (bp->bar2) { + pci_iounmap(pdev, bp->bar2); + bp->bar2 = NULL; + } + + if (bp->bar1) { + pci_iounmap(pdev, bp->bar1); + bp->bar1 = NULL; + } + + if (bp->bar0) { + pci_iounmap(pdev, bp->bar0); + bp->bar0 = NULL; + } +} + +static void bnxt_cleanup_pci(struct bnxt *bp) +{ + bnxt_unmap_bars(bp, bp->pdev); + pci_release_regions(bp->pdev); + pci_disable_device(bp->pdev); +} + +static void bnxt_init_dflt_coal(struct bnxt *bp) +{ + struct bnxt_coal *coal; + + /* Tick values in micro seconds. + * 1 coal_buf x bufs_per_record = 1 completion record. + */ + coal = &bp->rx_coal; + coal->coal_ticks = 14; + coal->coal_bufs = 30; + coal->coal_ticks_irq = 1; + coal->coal_bufs_irq = 2; + coal->idle_thresh = 25; + coal->bufs_per_record = 2; + coal->budget = 64; /* NAPI budget */ + + coal = &bp->tx_coal; + coal->coal_ticks = 28; + coal->coal_bufs = 30; + coal->coal_ticks_irq = 2; + coal->coal_bufs_irq = 2; + coal->bufs_per_record = 1; + + bp->stats_coal_ticks = BNXT_DEF_STATS_COAL_TICKS; +} + +static int bnxt_init_board(struct pci_dev *pdev, struct net_device *dev) +{ + int rc; + struct bnxt *bp = netdev_priv(dev); + + SET_NETDEV_DEV(dev, &pdev->dev); + + /* enable device (incl. PCI PM wakeup), and bus-mastering */ + rc = pci_enable_device(pdev); + if (rc) { + dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); + goto init_err; + } + + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, + "Cannot find PCI device base address, aborting\n"); + rc = -ENODEV; + goto init_err_disable; + } + + rc = pci_request_regions(pdev, DRV_MODULE_NAME); + if (rc) { + dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n"); + goto init_err_disable; + } + + if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) != 0 && + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)) != 0) { + dev_err(&pdev->dev, "System does not support DMA, aborting\n"); + goto init_err_disable; + } + + pci_set_master(pdev); + + bp->dev = dev; + bp->pdev = pdev; + + bp->bar0 = pci_ioremap_bar(pdev, 0); + if (!bp->bar0) { + dev_err(&pdev->dev, "Cannot map device registers, aborting\n"); + rc = -ENOMEM; + goto init_err_release; + } + + bp->bar1 = pci_ioremap_bar(pdev, 2); + if (!bp->bar1) { + dev_err(&pdev->dev, "Cannot map doorbell registers, aborting\n"); + rc = -ENOMEM; + goto init_err_release; + } + + bp->bar2 = pci_ioremap_bar(pdev, 4); + if (!bp->bar2) { + dev_err(&pdev->dev, "Cannot map bar4 registers, aborting\n"); + rc = -ENOMEM; + goto init_err_release; + } + + pci_enable_pcie_error_reporting(pdev); + + INIT_WORK(&bp->sp_task, bnxt_sp_task); + + spin_lock_init(&bp->ntp_fltr_lock); + + bp->rx_ring_size = BNXT_DEFAULT_RX_RING_SIZE; + bp->tx_ring_size = BNXT_DEFAULT_TX_RING_SIZE; + + bnxt_init_dflt_coal(bp); + + timer_setup(&bp->timer, bnxt_timer, 0); + bp->current_interval = BNXT_TIMER_INTERVAL; + + clear_bit(BNXT_STATE_OPEN, &bp->state); + return 0; + +init_err_release: + bnxt_unmap_bars(bp, pdev); + pci_release_regions(pdev); + +init_err_disable: + pci_disable_device(pdev); + +init_err: + return rc; +} + +/* rtnl_lock held */ +static int bnxt_change_mac_addr(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + struct bnxt *bp = netdev_priv(dev); + int rc = 0; + + if 
(!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + if (ether_addr_equal(addr->sa_data, dev->dev_addr)) + return 0; + + rc = bnxt_approve_mac(bp, addr->sa_data); + if (rc) + return rc; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + if (netif_running(dev)) { + bnxt_close_nic(bp, false, false); + rc = bnxt_open_nic(bp, false, false); + } + + return rc; +} + +/* rtnl_lock held */ +static int bnxt_change_mtu(struct net_device *dev, int new_mtu) +{ + struct bnxt *bp = netdev_priv(dev); + + if (netif_running(dev)) + bnxt_close_nic(bp, false, false); + + dev->mtu = new_mtu; + bnxt_set_ring_params(bp); + + if (netif_running(dev)) + return bnxt_open_nic(bp, false, false); + + return 0; +} + +int bnxt_setup_mq_tc(struct net_device *dev, u8 tc) +{ + struct bnxt *bp = netdev_priv(dev); + bool sh = false; + int rc; + + if (tc > bp->max_tc) { + netdev_err(dev, "Too many traffic classes requested: %d. Max supported is %d.\n", + tc, bp->max_tc); + return -EINVAL; + } + + if (netdev_get_num_tc(dev) == tc) + return 0; + + if (bp->flags & BNXT_FLAG_SHARED_RINGS) + sh = true; + + rc = bnxt_check_rings(bp, bp->tx_nr_rings_per_tc, bp->rx_nr_rings, + sh, tc, bp->tx_nr_rings_xdp); + if (rc) + return rc; + + /* Needs to close the device and do hw resource re-allocations */ + if (netif_running(bp->dev)) + bnxt_close_nic(bp, true, false); + + if (tc) { + bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tc; + netdev_set_num_tc(dev, tc); + } else { + bp->tx_nr_rings = bp->tx_nr_rings_per_tc; + netdev_reset_tc(dev); + } + bp->tx_nr_rings += bp->tx_nr_rings_xdp; + bp->cp_nr_rings = sh ? max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) : + bp->tx_nr_rings + bp->rx_nr_rings; + bp->num_stat_ctxs = bp->cp_nr_rings; + + if (netif_running(bp->dev)) + return bnxt_open_nic(bp, true, false); + + return 0; +} + +static int bnxt_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct bnxt *bp = cb_priv; + + if (!bnxt_tc_flower_enabled(bp) || + !tc_cls_can_offload_and_chain0(bp->dev, type_data)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return bnxt_tc_setup_flower(bp, bp->pf.fw_fid, type_data); + default: + return -EOPNOTSUPP; + } +} + +static int bnxt_setup_tc_block(struct net_device *dev, + struct tc_block_offload *f) +{ + struct bnxt *bp = netdev_priv(dev); + + if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + switch (f->command) { + case TC_BLOCK_BIND: + return tcf_block_cb_register(f->block, bnxt_setup_tc_block_cb, + bp, bp); + case TC_BLOCK_UNBIND: + tcf_block_cb_unregister(f->block, bnxt_setup_tc_block_cb, bp); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + switch (type) { + case TC_SETUP_BLOCK: + return bnxt_setup_tc_block(dev, type_data); + case TC_SETUP_QDISC_MQPRIO: { + struct tc_mqprio_qopt *mqprio = type_data; + + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + + return bnxt_setup_mq_tc(dev, mqprio->num_tc); + } + default: + return -EOPNOTSUPP; + } +} + +#ifdef CONFIG_RFS_ACCEL +static bool bnxt_fltr_match(struct bnxt_ntuple_filter *f1, + struct bnxt_ntuple_filter *f2) +{ + struct flow_keys *keys1 = &f1->fkeys; + struct flow_keys *keys2 = &f2->fkeys; + + if (keys1->addrs.v4addrs.src == keys2->addrs.v4addrs.src && + keys1->addrs.v4addrs.dst == keys2->addrs.v4addrs.dst && + keys1->ports.ports == keys2->ports.ports && + keys1->basic.ip_proto == keys2->basic.ip_proto && + keys1->basic.n_proto == 
keys2->basic.n_proto && + keys1->control.flags == keys2->control.flags && + ether_addr_equal(f1->src_mac_addr, f2->src_mac_addr) && + ether_addr_equal(f1->dst_mac_addr, f2->dst_mac_addr)) + return true; + + return false; +} + +static int bnxt_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, + u16 rxq_index, u32 flow_id) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_ntuple_filter *fltr, *new_fltr; + struct flow_keys *fkeys; + struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb); + int rc = 0, idx, bit_id, l2_idx = 0; + struct hlist_head *head; + + if (!ether_addr_equal(dev->dev_addr, eth->h_dest)) { + struct bnxt_vnic_info *vnic = &bp->vnic_info[0]; + int off = 0, j; + + netif_addr_lock_bh(dev); + for (j = 0; j < vnic->uc_filter_count; j++, off += ETH_ALEN) { + if (ether_addr_equal(eth->h_dest, + vnic->uc_list + off)) { + l2_idx = j + 1; + break; + } + } + netif_addr_unlock_bh(dev); + if (!l2_idx) + return -EINVAL; + } + new_fltr = kzalloc(sizeof(*new_fltr), GFP_ATOMIC); + if (!new_fltr) + return -ENOMEM; + + fkeys = &new_fltr->fkeys; + if (!skb_flow_dissect_flow_keys(skb, fkeys, 0)) { + rc = -EPROTONOSUPPORT; + goto err_free; + } + + if ((fkeys->basic.n_proto != htons(ETH_P_IP) && + fkeys->basic.n_proto != htons(ETH_P_IPV6)) || + ((fkeys->basic.ip_proto != IPPROTO_TCP) && + (fkeys->basic.ip_proto != IPPROTO_UDP))) { + rc = -EPROTONOSUPPORT; + goto err_free; + } + if (fkeys->basic.n_proto == htons(ETH_P_IPV6) && + bp->hwrm_spec_code < 0x10601) { + rc = -EPROTONOSUPPORT; + goto err_free; + } + if ((fkeys->control.flags & FLOW_DIS_ENCAPSULATION) && + bp->hwrm_spec_code < 0x10601) { + rc = -EPROTONOSUPPORT; + goto err_free; + } + + memcpy(new_fltr->dst_mac_addr, eth->h_dest, ETH_ALEN); + memcpy(new_fltr->src_mac_addr, eth->h_source, ETH_ALEN); + + idx = skb_get_hash_raw(skb) & BNXT_NTP_FLTR_HASH_MASK; + head = &bp->ntp_fltr_hash_tbl[idx]; + rcu_read_lock(); + hlist_for_each_entry_rcu(fltr, head, hash) { + if (bnxt_fltr_match(fltr, new_fltr)) { + rcu_read_unlock(); + rc = 0; + goto err_free; + } + } + rcu_read_unlock(); + + spin_lock_bh(&bp->ntp_fltr_lock); + bit_id = bitmap_find_free_region(bp->ntp_fltr_bmap, + BNXT_NTP_FLTR_MAX_FLTR, 0); + if (bit_id < 0) { + spin_unlock_bh(&bp->ntp_fltr_lock); + rc = -ENOMEM; + goto err_free; + } + + new_fltr->sw_id = (u16)bit_id; + new_fltr->flow_id = flow_id; + new_fltr->l2_fltr_idx = l2_idx; + new_fltr->rxq = rxq_index; + hlist_add_head_rcu(&new_fltr->hash, head); + bp->ntp_fltr_count++; + spin_unlock_bh(&bp->ntp_fltr_lock); + + set_bit(BNXT_RX_NTP_FLTR_SP_EVENT, &bp->sp_event); + bnxt_queue_sp_work(bp); + + return new_fltr->sw_id; + +err_free: + kfree(new_fltr); + return rc; +} + +static void bnxt_cfg_ntp_filters(struct bnxt *bp) +{ + int i; + + for (i = 0; i < BNXT_NTP_FLTR_HASH_SIZE; i++) { + struct hlist_head *head; + struct hlist_node *tmp; + struct bnxt_ntuple_filter *fltr; + int rc; + + head = &bp->ntp_fltr_hash_tbl[i]; + hlist_for_each_entry_safe(fltr, tmp, head, hash) { + bool del = false; + + if (test_bit(BNXT_FLTR_VALID, &fltr->state)) { + if (rps_may_expire_flow(bp->dev, fltr->rxq, + fltr->flow_id, + fltr->sw_id)) { + bnxt_hwrm_cfa_ntuple_filter_free(bp, + fltr); + del = true; + } + } else { + rc = bnxt_hwrm_cfa_ntuple_filter_alloc(bp, + fltr); + if (rc) + del = true; + else + set_bit(BNXT_FLTR_VALID, &fltr->state); + } + + if (del) { + spin_lock_bh(&bp->ntp_fltr_lock); + hlist_del_rcu(&fltr->hash); + bp->ntp_fltr_count--; + spin_unlock_bh(&bp->ntp_fltr_lock); + synchronize_rcu(); + clear_bit(fltr->sw_id, 
bp->ntp_fltr_bmap); + kfree(fltr); + } + } + } + if (test_and_clear_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event)) + netdev_info(bp->dev, "Receive PF driver unload event!"); +} + +#else + +static void bnxt_cfg_ntp_filters(struct bnxt *bp) +{ +} + +#endif /* CONFIG_RFS_ACCEL */ + +static void bnxt_udp_tunnel_add(struct net_device *dev, + struct udp_tunnel_info *ti) +{ + struct bnxt *bp = netdev_priv(dev); + + if (ti->sa_family != AF_INET6 && ti->sa_family != AF_INET) + return; + + if (!netif_running(dev)) + return; + + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + if (bp->vxlan_port_cnt && bp->vxlan_port != ti->port) + return; + + bp->vxlan_port_cnt++; + if (bp->vxlan_port_cnt == 1) { + bp->vxlan_port = ti->port; + set_bit(BNXT_VXLAN_ADD_PORT_SP_EVENT, &bp->sp_event); + bnxt_queue_sp_work(bp); + } + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (bp->nge_port_cnt && bp->nge_port != ti->port) + return; + + bp->nge_port_cnt++; + if (bp->nge_port_cnt == 1) { + bp->nge_port = ti->port; + set_bit(BNXT_GENEVE_ADD_PORT_SP_EVENT, &bp->sp_event); + } + break; + default: + return; + } + + bnxt_queue_sp_work(bp); +} + +static void bnxt_udp_tunnel_del(struct net_device *dev, + struct udp_tunnel_info *ti) +{ + struct bnxt *bp = netdev_priv(dev); + + if (ti->sa_family != AF_INET6 && ti->sa_family != AF_INET) + return; + + if (!netif_running(dev)) + return; + + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + if (!bp->vxlan_port_cnt || bp->vxlan_port != ti->port) + return; + bp->vxlan_port_cnt--; + + if (bp->vxlan_port_cnt != 0) + return; + + set_bit(BNXT_VXLAN_DEL_PORT_SP_EVENT, &bp->sp_event); + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (!bp->nge_port_cnt || bp->nge_port != ti->port) + return; + bp->nge_port_cnt--; + + if (bp->nge_port_cnt != 0) + return; + + set_bit(BNXT_GENEVE_DEL_PORT_SP_EVENT, &bp->sp_event); + break; + default: + return; + } + + bnxt_queue_sp_work(bp); +} + +static int bnxt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask, + int nlflags) +{ + struct bnxt *bp = netdev_priv(dev); + + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bp->br_mode, 0, 0, + nlflags, filter_mask, NULL); +} + +static int bnxt_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 flags) +{ + struct bnxt *bp = netdev_priv(dev); + struct nlattr *attr, *br_spec; + int rem, rc = 0; + + if (bp->hwrm_spec_code < 0x10708 || !BNXT_SINGLE_PF(bp)) + return -EOPNOTSUPP; + + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (!br_spec) + return -EINVAL; + + nla_for_each_nested(attr, br_spec, rem) { + u16 mode; + + if (nla_type(attr) != IFLA_BRIDGE_MODE) + continue; + + if (nla_len(attr) < sizeof(mode)) + return -EINVAL; + + mode = nla_get_u16(attr); + if (mode == bp->br_mode) + break; + + rc = bnxt_hwrm_set_br_mode(bp, mode); + if (!rc) + bp->br_mode = mode; + break; + } + return rc; +} + +static int bnxt_get_phys_port_name(struct net_device *dev, char *buf, + size_t len) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + + /* The PF and it's VF-reps only support the switchdev framework */ + if (!BNXT_PF(bp)) + return -EOPNOTSUPP; + + rc = snprintf(buf, len, "p%d", bp->pf.port_id); + + if (rc >= len) + return -EOPNOTSUPP; + return 0; +} + +int bnxt_port_attr_get(struct bnxt *bp, struct switchdev_attr *attr) +{ + if (bp->eswitch_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV) + return -EOPNOTSUPP; + + /* The PF and it's VF-reps only support the switchdev framework */ + if (!BNXT_PF(bp)) + return -EOPNOTSUPP; + + switch (attr->id) { 
+ case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: + attr->u.ppid.id_len = sizeof(bp->switch_id); + memcpy(attr->u.ppid.id, bp->switch_id, attr->u.ppid.id_len); + break; + default: + return -EOPNOTSUPP; + } + return 0; +} + +static int bnxt_swdev_port_attr_get(struct net_device *dev, + struct switchdev_attr *attr) +{ + return bnxt_port_attr_get(netdev_priv(dev), attr); +} + +static const struct switchdev_ops bnxt_switchdev_ops = { + .switchdev_port_attr_get = bnxt_swdev_port_attr_get +}; + +static const struct net_device_ops bnxt_netdev_ops = { + .ndo_open = bnxt_open, + .ndo_start_xmit = bnxt_start_xmit, + .ndo_stop = bnxt_close, + .ndo_get_stats64 = bnxt_get_stats64, + .ndo_set_rx_mode = bnxt_set_rx_mode, + .ndo_do_ioctl = bnxt_ioctl, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_mac_address = bnxt_change_mac_addr, + .ndo_change_mtu = bnxt_change_mtu, + .ndo_fix_features = bnxt_fix_features, + .ndo_set_features = bnxt_set_features, + .ndo_tx_timeout = bnxt_tx_timeout, +#ifdef CONFIG_BNXT_SRIOV + .ndo_get_vf_config = bnxt_get_vf_config, + .ndo_set_vf_mac = bnxt_set_vf_mac, + .ndo_set_vf_vlan = bnxt_set_vf_vlan, + .ndo_set_vf_rate = bnxt_set_vf_bw, + .ndo_set_vf_link_state = bnxt_set_vf_link_state, + .ndo_set_vf_spoofchk = bnxt_set_vf_spoofchk, + .ndo_set_vf_trust = bnxt_set_vf_trust, +#endif +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = bnxt_poll_controller, +#endif + .ndo_setup_tc = bnxt_setup_tc, +#ifdef CONFIG_RFS_ACCEL + .ndo_rx_flow_steer = bnxt_rx_flow_steer, +#endif + .ndo_udp_tunnel_add = bnxt_udp_tunnel_add, + .ndo_udp_tunnel_del = bnxt_udp_tunnel_del, + .ndo_bpf = bnxt_xdp, + .ndo_bridge_getlink = bnxt_bridge_getlink, + .ndo_bridge_setlink = bnxt_bridge_setlink, + .ndo_get_phys_port_name = bnxt_get_phys_port_name +}; + +static void bnxt_remove_one(struct pci_dev *pdev) +{ + struct net_device *dev = pci_get_drvdata(pdev); + struct bnxt *bp = netdev_priv(dev); + + if (BNXT_PF(bp)) { + bnxt_sriov_disable(bp); + bnxt_dl_unregister(bp); + } + + pci_disable_pcie_error_reporting(pdev); + unregister_netdev(dev); + bnxt_shutdown_tc(bp); + bnxt_cancel_sp_work(bp); + bp->sp_event = 0; + + bnxt_clear_int_mode(bp); + bnxt_hwrm_func_drv_unrgtr(bp); + bnxt_free_hwrm_resources(bp); + bnxt_free_hwrm_short_cmd_req(bp); + bnxt_ethtool_free(bp); + bnxt_dcb_free(bp); + kfree(bp->edev); + bp->edev = NULL; + bnxt_cleanup_pci(bp); + free_netdev(dev); +} + +static int bnxt_probe_phy(struct bnxt *bp) +{ + int rc = 0; + struct bnxt_link_info *link_info = &bp->link_info; + + rc = bnxt_hwrm_phy_qcaps(bp); + if (rc) { + netdev_err(bp->dev, "Probe phy can't get phy capabilities (rc: %x)\n", + rc); + return rc; + } + mutex_init(&bp->link_lock); + + rc = bnxt_update_link(bp, false); + if (rc) { + netdev_err(bp->dev, "Probe phy can't update link (rc: %x)\n", + rc); + return rc; + } + + /* Older firmware does not have supported_auto_speeds, so assume + * that all supported speeds can be autonegotiated. 
+ */ + if (link_info->auto_link_speeds && !link_info->support_auto_speeds) + link_info->support_auto_speeds = link_info->support_speeds; + + /*initialize the ethool setting copy with NVM settings */ + if (BNXT_AUTO_MODE(link_info->auto_mode)) { + link_info->autoneg = BNXT_AUTONEG_SPEED; + if (bp->hwrm_spec_code >= 0x10201) { + if (link_info->auto_pause_setting & + PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE) + link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL; + } else { + link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL; + } + link_info->advertising = link_info->auto_link_speeds; + } else { + link_info->req_link_speed = link_info->force_link_speed; + link_info->req_duplex = link_info->duplex_setting; + } + if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) + link_info->req_flow_ctrl = + link_info->auto_pause_setting & BNXT_LINK_PAUSE_BOTH; + else + link_info->req_flow_ctrl = link_info->force_pause_setting; + return rc; +} + +static int bnxt_get_max_irq(struct pci_dev *pdev) +{ + u16 ctrl; + + if (!pdev->msix_cap) + return 1; + + pci_read_config_word(pdev, pdev->msix_cap + PCI_MSIX_FLAGS, &ctrl); + return (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; +} + +static void _bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, + int *max_cp) +{ + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + int max_ring_grps = 0; + + *max_tx = hw_resc->max_tx_rings; + *max_rx = hw_resc->max_rx_rings; + *max_cp = min_t(int, hw_resc->max_irqs, hw_resc->max_cp_rings); + *max_cp = min_t(int, *max_cp, hw_resc->max_stat_ctxs); + max_ring_grps = hw_resc->max_hw_ring_grps; + if (BNXT_CHIP_TYPE_NITRO_A0(bp) && BNXT_PF(bp)) { + *max_cp -= 1; + *max_rx -= 2; + } + if (bp->flags & BNXT_FLAG_AGG_RINGS) + *max_rx >>= 1; + *max_rx = min_t(int, *max_rx, max_ring_grps); +} + +int bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, bool shared) +{ + int rx, tx, cp; + + _bnxt_get_max_rings(bp, &rx, &tx, &cp); + if (!rx || !tx || !cp) + return -ENOMEM; + + *max_rx = rx; + *max_tx = tx; + return bnxt_trim_rings(bp, max_rx, max_tx, cp, shared); +} + +static int bnxt_get_dflt_rings(struct bnxt *bp, int *max_rx, int *max_tx, + bool shared) +{ + int rc; + + rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared); + if (rc && (bp->flags & BNXT_FLAG_AGG_RINGS)) { + /* Not enough rings, try disabling agg rings. */ + bp->flags &= ~BNXT_FLAG_AGG_RINGS; + rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared); + if (rc) + return rc; + bp->flags |= BNXT_FLAG_NO_AGG_RINGS; + bp->dev->hw_features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW); + bp->dev->features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW); + bnxt_set_ring_params(bp); + } + + if (bp->flags & BNXT_FLAG_ROCE_CAP) { + int max_cp, max_stat, max_irq; + + /* Reserve minimum resources for RoCE */ + max_cp = bnxt_get_max_func_cp_rings(bp); + max_stat = bnxt_get_max_func_stat_ctxs(bp); + max_irq = bnxt_get_max_func_irqs(bp); + if (max_cp <= BNXT_MIN_ROCE_CP_RINGS || + max_irq <= BNXT_MIN_ROCE_CP_RINGS || + max_stat <= BNXT_MIN_ROCE_STAT_CTXS) + return 0; + + max_cp -= BNXT_MIN_ROCE_CP_RINGS; + max_irq -= BNXT_MIN_ROCE_CP_RINGS; + max_stat -= BNXT_MIN_ROCE_STAT_CTXS; + max_cp = min_t(int, max_cp, max_irq); + max_cp = min_t(int, max_cp, max_stat); + rc = bnxt_trim_rings(bp, max_rx, max_tx, max_cp, shared); + if (rc) + rc = 0; + } + return rc; +} + +/* In initial default shared ring setting, each shared ring must have a + * RX/TX ring pair. 
+ */ +static void bnxt_trim_dflt_sh_rings(struct bnxt *bp) +{ + bp->cp_nr_rings = min_t(int, bp->tx_nr_rings_per_tc, bp->rx_nr_rings); + bp->rx_nr_rings = bp->cp_nr_rings; + bp->tx_nr_rings_per_tc = bp->cp_nr_rings; + bp->tx_nr_rings = bp->tx_nr_rings_per_tc; +} + +static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh) +{ + int dflt_rings, max_rx_rings, max_tx_rings, rc; + + if (sh) + bp->flags |= BNXT_FLAG_SHARED_RINGS; + dflt_rings = netif_get_num_default_rss_queues(); + /* Reduce default rings on multi-port cards so that total default + * rings do not exceed CPU count. + */ + if (bp->port_count > 1) { + int max_rings = + max_t(int, num_online_cpus() / bp->port_count, 1); + + dflt_rings = min_t(int, dflt_rings, max_rings); + } + rc = bnxt_get_dflt_rings(bp, &max_rx_rings, &max_tx_rings, sh); + if (rc) + return rc; + bp->rx_nr_rings = min_t(int, dflt_rings, max_rx_rings); + bp->tx_nr_rings_per_tc = min_t(int, dflt_rings, max_tx_rings); + if (sh) + bnxt_trim_dflt_sh_rings(bp); + else + bp->cp_nr_rings = bp->tx_nr_rings_per_tc + bp->rx_nr_rings; + bp->tx_nr_rings = bp->tx_nr_rings_per_tc; + + rc = __bnxt_reserve_rings(bp); + if (rc) + netdev_warn(bp->dev, "Unable to reserve tx rings\n"); + bp->tx_nr_rings_per_tc = bp->tx_nr_rings; + if (sh) + bnxt_trim_dflt_sh_rings(bp); + + /* Rings may have been trimmed, re-reserve the trimmed rings. */ + if (bnxt_need_reserve_rings(bp)) { + rc = __bnxt_reserve_rings(bp); + if (rc) + netdev_warn(bp->dev, "2nd rings reservation failed.\n"); + bp->tx_nr_rings_per_tc = bp->tx_nr_rings; + } + bp->num_stat_ctxs = bp->cp_nr_rings; + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) { + bp->rx_nr_rings++; + bp->cp_nr_rings++; + } + return rc; +} + +int bnxt_restore_pf_fw_resources(struct bnxt *bp) +{ + int rc; + + ASSERT_RTNL(); + bnxt_hwrm_func_qcaps(bp); + + if (netif_running(bp->dev)) + __bnxt_close_nic(bp, true, false); + + bnxt_ulp_irq_stop(bp); + bnxt_clear_int_mode(bp); + rc = bnxt_init_int_mode(bp); + bnxt_ulp_irq_restart(bp, rc); + + if (netif_running(bp->dev)) { + if (rc) + dev_close(bp->dev); + else + rc = bnxt_open_nic(bp, true, false); + } + + return rc; +} + +static int bnxt_init_mac_addr(struct bnxt *bp) +{ + int rc = 0; + + if (BNXT_PF(bp)) { + memcpy(bp->dev->dev_addr, bp->pf.mac_addr, ETH_ALEN); + } else { +#ifdef CONFIG_BNXT_SRIOV + struct bnxt_vf_info *vf = &bp->vf; + + if (is_valid_ether_addr(vf->mac_addr)) { + /* overwrite netdev dev_addr with admin VF MAC */ + memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN); + } else { + eth_hw_addr_random(bp->dev); + rc = bnxt_approve_mac(bp, bp->dev->dev_addr); + } +#endif + } + return rc; +} + +static void bnxt_parse_log_pcie_link(struct bnxt *bp) +{ + enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN; + enum pci_bus_speed speed = PCI_SPEED_UNKNOWN; + + if (pcie_get_minimum_link(pci_physfn(bp->pdev), &speed, &width) || + speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN) + netdev_info(bp->dev, "Failed to determine PCIe Link Info\n"); + else + netdev_info(bp->dev, "PCIe: Speed %s Width x%d\n", + speed == PCIE_SPEED_2_5GT ? "2.5GT/s" : + speed == PCIE_SPEED_5_0GT ? "5.0GT/s" : + speed == PCIE_SPEED_8_0GT ? 
"8.0GT/s" : + "Unknown", width); +} + +static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + static int version_printed; + struct net_device *dev; + struct bnxt *bp; + int rc, max_irqs; + + if (pci_is_bridge(pdev)) + return -ENODEV; + + if (version_printed++ == 0) + pr_info("%s", version); + + max_irqs = bnxt_get_max_irq(pdev); + dev = alloc_etherdev_mq(sizeof(*bp), max_irqs); + if (!dev) + return -ENOMEM; + + bp = netdev_priv(dev); + + if (bnxt_vf_pciid(ent->driver_data)) + bp->flags |= BNXT_FLAG_VF; + + if (pdev->msix_cap) + bp->flags |= BNXT_FLAG_MSIX_CAP; + + rc = bnxt_init_board(pdev, dev); + if (rc < 0) + goto init_err_free; + + dev->netdev_ops = &bnxt_netdev_ops; + dev->watchdog_timeo = BNXT_TX_TIMEOUT; + dev->ethtool_ops = &bnxt_ethtool_ops; + SWITCHDEV_SET_OPS(dev, &bnxt_switchdev_ops); + pci_set_drvdata(pdev, dev); + + rc = bnxt_alloc_hwrm_resources(bp); + if (rc) + goto init_err_pci_clean; + + mutex_init(&bp->hwrm_cmd_lock); + rc = bnxt_hwrm_ver_get(bp); + if (rc) + goto init_err_pci_clean; + + if (bp->flags & BNXT_FLAG_SHORT_CMD) { + rc = bnxt_alloc_hwrm_short_cmd_req(bp); + if (rc) + goto init_err_pci_clean; + } + + rc = bnxt_hwrm_func_reset(bp); + if (rc) + goto init_err_pci_clean; + + bnxt_hwrm_fw_set_time(bp); + + dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG | + NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE | + NETIF_F_GSO_IPXIP4 | + NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM | + NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH | + NETIF_F_RXCSUM | NETIF_F_GRO; + + if (!BNXT_CHIP_TYPE_NITRO_A0(bp)) + dev->hw_features |= NETIF_F_LRO; + + dev->hw_enc_features = + NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG | + NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE | + NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM | + NETIF_F_GSO_IPXIP4 | NETIF_F_GSO_PARTIAL; + dev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM | + NETIF_F_GSO_GRE_CSUM; + dev->vlan_features = dev->hw_features | NETIF_F_HIGHDMA; + dev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_TX; + if (!BNXT_CHIP_TYPE_NITRO_A0(bp)) + dev->hw_features |= NETIF_F_GRO_HW; + dev->features |= dev->hw_features | NETIF_F_HIGHDMA; + if (dev->features & NETIF_F_GRO_HW) + dev->features &= ~NETIF_F_LRO; + dev->priv_flags |= IFF_UNICAST_FLT; + +#ifdef CONFIG_BNXT_SRIOV + init_waitqueue_head(&bp->sriov_cfg_wait); + mutex_init(&bp->sriov_lock); +#endif + bp->gro_func = bnxt_gro_func_5730x; + if (BNXT_CHIP_P4_PLUS(bp)) + bp->gro_func = bnxt_gro_func_5731x; + else + bp->flags |= BNXT_FLAG_DOUBLE_DB; + + rc = bnxt_hwrm_func_drv_rgtr(bp); + if (rc) + goto init_err_pci_clean; + + rc = bnxt_hwrm_func_rgtr_async_events(bp, NULL, 0); + if (rc) + goto init_err_pci_clean; + + bp->ulp_probe = bnxt_ulp_probe; + + /* Get the MAX capabilities for this function */ + rc = bnxt_hwrm_func_qcaps(bp); + if (rc) { + netdev_err(bp->dev, "hwrm query capability failure rc: %x\n", + rc); + rc = -1; + goto init_err_pci_clean; + } + rc = bnxt_init_mac_addr(bp); + if (rc) { + dev_err(&pdev->dev, "Unable to initialize mac address.\n"); + rc = -EADDRNOTAVAIL; + goto init_err_pci_clean; + } + rc = bnxt_hwrm_queue_qportcfg(bp); + if (rc) { + netdev_err(bp->dev, "hwrm query qportcfg failure rc: %x\n", + rc); + rc = -1; + goto init_err_pci_clean; + } + + bnxt_hwrm_func_qcfg(bp); + bnxt_hwrm_port_led_qcaps(bp); + bnxt_ethtool_init(bp); + bnxt_dcb_init(bp); + + /* MTU range: 60 - FW defined max */ 
+ dev->min_mtu = ETH_ZLEN; + dev->max_mtu = bp->max_mtu; + + rc = bnxt_probe_phy(bp); + if (rc) + goto init_err_pci_clean; + + bnxt_set_rx_skb_mode(bp, false); + bnxt_set_tpa_flags(bp); + bnxt_set_ring_params(bp); + bnxt_set_max_func_irqs(bp, max_irqs); + rc = bnxt_set_dflt_rings(bp, true); + if (rc) { + netdev_err(bp->dev, "Not enough rings available.\n"); + rc = -ENOMEM; + goto init_err_pci_clean; + } + + /* Default RSS hash cfg. */ + bp->rss_hash_cfg = VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4 | + VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV4 | + VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6 | + VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV6; + if (BNXT_CHIP_P4_PLUS(bp) && bp->hwrm_spec_code >= 0x10501) { + bp->flags |= BNXT_FLAG_UDP_RSS_CAP; + bp->rss_hash_cfg |= VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV4 | + VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6; + } + + bnxt_hwrm_vnic_qcaps(bp); + if (bnxt_rfs_supported(bp)) { + dev->hw_features |= NETIF_F_NTUPLE; + if (bnxt_rfs_capable(bp)) { + bp->flags |= BNXT_FLAG_RFS; + dev->features |= NETIF_F_NTUPLE; + } + } + + if (dev->hw_features & NETIF_F_HW_VLAN_CTAG_RX) + bp->flags |= BNXT_FLAG_STRIP_VLAN; + + rc = bnxt_init_int_mode(bp); + if (rc) + goto init_err_pci_clean; + + /* No TC has been set yet and rings may have been trimmed due to + * limited MSIX, so we re-initialize the TX rings per TC. + */ + bp->tx_nr_rings_per_tc = bp->tx_nr_rings; + + bnxt_get_wol_settings(bp); + if (bp->flags & BNXT_FLAG_WOL_CAP) + device_set_wakeup_enable(&pdev->dev, bp->wol); + else + device_set_wakeup_capable(&pdev->dev, false); + + bnxt_hwrm_set_cache_line_size(bp, cache_line_size()); + + if (BNXT_PF(bp)) { + if (!bnxt_pf_wq) { + bnxt_pf_wq = + create_singlethread_workqueue("bnxt_pf_wq"); + if (!bnxt_pf_wq) { + dev_err(&pdev->dev, "Unable to create workqueue.\n"); + goto init_err_pci_clean; + } + } + bnxt_init_tc(bp); + } + + rc = register_netdev(dev); + if (rc) + goto init_err_cleanup_tc; + + if (BNXT_PF(bp)) + bnxt_dl_register(bp); + + netdev_info(dev, "%s found at mem %lx, node addr %pM\n", + board_info[ent->driver_data].name, + (long)pci_resource_start(pdev, 0), dev->dev_addr); + + bnxt_parse_log_pcie_link(bp); + + return 0; + +init_err_cleanup_tc: + bnxt_shutdown_tc(bp); + bnxt_clear_int_mode(bp); + +init_err_pci_clean: + bnxt_cleanup_pci(bp); + +init_err_free: + free_netdev(dev); + return rc; +} + +static void bnxt_shutdown(struct pci_dev *pdev) +{ + struct net_device *dev = pci_get_drvdata(pdev); + struct bnxt *bp; + + if (!dev) + return; + + rtnl_lock(); + bp = netdev_priv(dev); + if (!bp) + goto shutdown_exit; + + if (netif_running(dev)) + dev_close(dev); + + bnxt_ulp_shutdown(bp); + + if (system_state == SYSTEM_POWER_OFF) { + bnxt_clear_int_mode(bp); + pci_wake_from_d3(pdev, bp->wol); + pci_set_power_state(pdev, PCI_D3hot); + } + +shutdown_exit: + rtnl_unlock(); +} + +#ifdef CONFIG_PM_SLEEP +static int bnxt_suspend(struct device *device) +{ + struct pci_dev *pdev = to_pci_dev(device); + struct net_device *dev = pci_get_drvdata(pdev); + struct bnxt *bp = netdev_priv(dev); + int rc = 0; + + rtnl_lock(); + if (netif_running(dev)) { + netif_device_detach(dev); + rc = bnxt_close(dev); + } + bnxt_hwrm_func_drv_unrgtr(bp); + rtnl_unlock(); + return rc; +} + +static int bnxt_resume(struct device *device) +{ + struct pci_dev *pdev = to_pci_dev(device); + struct net_device *dev = pci_get_drvdata(pdev); + struct bnxt *bp = netdev_priv(dev); + int rc = 0; + + rtnl_lock(); + if (bnxt_hwrm_ver_get(bp) || bnxt_hwrm_func_drv_rgtr(bp)) { + rc = -ENODEV; + goto resume_exit; + } + rc = bnxt_hwrm_func_reset(bp); + if (rc) { + 
rc = -EBUSY; + goto resume_exit; + } + bnxt_get_wol_settings(bp); + if (netif_running(dev)) { + rc = bnxt_open(dev); + if (!rc) + netif_device_attach(dev); + } + +resume_exit: + rtnl_unlock(); + return rc; +} + +static SIMPLE_DEV_PM_OPS(bnxt_pm_ops, bnxt_suspend, bnxt_resume); +#define BNXT_PM_OPS (&bnxt_pm_ops) + +#else + +#define BNXT_PM_OPS NULL + +#endif /* CONFIG_PM_SLEEP */ + +/** + * bnxt_io_error_detected - called when PCI error is detected + * @pdev: Pointer to PCI device + * @state: The current pci connection state + * + * This function is called after a PCI bus error affecting + * this device has been detected. + */ +static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct bnxt *bp = netdev_priv(netdev); + + netdev_info(netdev, "PCI I/O error detected\n"); + + rtnl_lock(); + netif_device_detach(netdev); + + bnxt_ulp_stop(bp); + + if (state == pci_channel_io_perm_failure) { + rtnl_unlock(); + return PCI_ERS_RESULT_DISCONNECT; + } + + if (netif_running(netdev)) + bnxt_close(netdev); + + pci_disable_device(pdev); + rtnl_unlock(); + + /* Request a slot slot reset. */ + return PCI_ERS_RESULT_NEED_RESET; +} + +/** + * bnxt_io_slot_reset - called after the pci bus has been reset. + * @pdev: Pointer to PCI device + * + * Restart the card from scratch, as if from a cold-boot. + * At this point, the card has exprienced a hard reset, + * followed by fixups by BIOS, and has its config space + * set up identically to what it was at cold boot. + */ +static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + struct bnxt *bp = netdev_priv(netdev); + int err = 0; + pci_ers_result_t result = PCI_ERS_RESULT_DISCONNECT; + + netdev_info(bp->dev, "PCI Slot Reset\n"); + + rtnl_lock(); + + if (pci_enable_device(pdev)) { + dev_err(&pdev->dev, + "Cannot re-enable PCI device after reset.\n"); + } else { + pci_set_master(pdev); + + err = bnxt_hwrm_func_reset(bp); + if (!err && netif_running(netdev)) + err = bnxt_open(netdev); + + if (!err) { + result = PCI_ERS_RESULT_RECOVERED; + bnxt_ulp_start(bp); + } + } + + if (result != PCI_ERS_RESULT_RECOVERED && netif_running(netdev)) + dev_close(netdev); + + rtnl_unlock(); + + err = pci_cleanup_aer_uncorrect_error_status(pdev); + if (err) { + dev_err(&pdev->dev, + "pci_cleanup_aer_uncorrect_error_status failed 0x%0x\n", + err); /* non-fatal, continue */ + } + + return PCI_ERS_RESULT_RECOVERED; +} + +/** + * bnxt_io_resume - called when traffic can start flowing again. + * @pdev: Pointer to PCI device + * + * This callback is called when the error recovery driver tells + * us that its OK to resume normal operation. 
+ */ +static void bnxt_io_resume(struct pci_dev *pdev) +{ + struct net_device *netdev = pci_get_drvdata(pdev); + + rtnl_lock(); + + netif_device_attach(netdev); + + rtnl_unlock(); +} + +static const struct pci_error_handlers bnxt_err_handler = { + .error_detected = bnxt_io_error_detected, + .slot_reset = bnxt_io_slot_reset, + .resume = bnxt_io_resume +}; + +static struct pci_driver bnxt_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = bnxt_pci_tbl, + .probe = bnxt_init_one, + .remove = bnxt_remove_one, + .shutdown = bnxt_shutdown, + .driver.pm = BNXT_PM_OPS, + .err_handler = &bnxt_err_handler, +#if defined(CONFIG_BNXT_SRIOV) + .sriov_configure = bnxt_sriov_configure, +#endif +}; + +static int __init bnxt_init(void) +{ + return pci_register_driver(&bnxt_pci_driver); +} + +static void __exit bnxt_exit(void) +{ + pci_unregister_driver(&bnxt_pci_driver); + if (bnxt_pf_wq) + destroy_workqueue(bnxt_pf_wq); +} + +module_init(bnxt_init); +module_exit(bnxt_exit); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h new file mode 100644 index 0000000..3d55d3b --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -0,0 +1,1482 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * Copyright (c) 2016-2018 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#ifndef BNXT_H +#define BNXT_H + +#define DRV_MODULE_NAME "bnxt_en" +#define DRV_MODULE_VERSION "1.9.1" + +#define DRV_VER_MAJ 1 +#define DRV_VER_MIN 9 +#define DRV_VER_UPD 1 + +#include +#include +#include +#include +#include +#include +#include + +struct tx_bd { + __le32 tx_bd_len_flags_type; + #define TX_BD_TYPE (0x3f << 0) + #define TX_BD_TYPE_SHORT_TX_BD (0x00 << 0) + #define TX_BD_TYPE_LONG_TX_BD (0x10 << 0) + #define TX_BD_FLAGS_PACKET_END (1 << 6) + #define TX_BD_FLAGS_NO_CMPL (1 << 7) + #define TX_BD_FLAGS_BD_CNT (0x1f << 8) + #define TX_BD_FLAGS_BD_CNT_SHIFT 8 + #define TX_BD_FLAGS_LHINT (3 << 13) + #define TX_BD_FLAGS_LHINT_SHIFT 13 + #define TX_BD_FLAGS_LHINT_512_AND_SMALLER (0 << 13) + #define TX_BD_FLAGS_LHINT_512_TO_1023 (1 << 13) + #define TX_BD_FLAGS_LHINT_1024_TO_2047 (2 << 13) + #define TX_BD_FLAGS_LHINT_2048_AND_LARGER (3 << 13) + #define TX_BD_FLAGS_COAL_NOW (1 << 15) + #define TX_BD_LEN (0xffff << 16) + #define TX_BD_LEN_SHIFT 16 + + u32 tx_bd_opaque; + __le64 tx_bd_haddr; +} __packed; + +struct tx_bd_ext { + __le32 tx_bd_hsize_lflags; + #define TX_BD_FLAGS_TCP_UDP_CHKSUM (1 << 0) + #define TX_BD_FLAGS_IP_CKSUM (1 << 1) + #define TX_BD_FLAGS_NO_CRC (1 << 2) + #define TX_BD_FLAGS_STAMP (1 << 3) + #define TX_BD_FLAGS_T_IP_CHKSUM (1 << 4) + #define TX_BD_FLAGS_LSO (1 << 5) + #define TX_BD_FLAGS_IPID_FMT (1 << 6) + #define TX_BD_FLAGS_T_IPID (1 << 7) + #define TX_BD_HSIZE (0xff << 16) + #define TX_BD_HSIZE_SHIFT 16 + + __le32 tx_bd_mss; + __le32 tx_bd_cfa_action; + #define TX_BD_CFA_ACTION (0xffff << 16) + #define TX_BD_CFA_ACTION_SHIFT 16 + + __le32 tx_bd_cfa_meta; + #define TX_BD_CFA_META_MASK 0xfffffff + #define TX_BD_CFA_META_VID_MASK 0xfff + #define TX_BD_CFA_META_PRI_MASK (0xf << 12) + #define TX_BD_CFA_META_PRI_SHIFT 12 + #define TX_BD_CFA_META_TPID_MASK (3 << 16) + #define TX_BD_CFA_META_TPID_SHIFT 16 + #define TX_BD_CFA_META_KEY (0xf << 28) + #define TX_BD_CFA_META_KEY_SHIFT 28 + #define TX_BD_CFA_META_KEY_VLAN (1 << 28) +}; + +struct rx_bd { + __le32 
rx_bd_len_flags_type; + #define RX_BD_TYPE (0x3f << 0) + #define RX_BD_TYPE_RX_PACKET_BD 0x4 + #define RX_BD_TYPE_RX_BUFFER_BD 0x5 + #define RX_BD_TYPE_RX_AGG_BD 0x6 + #define RX_BD_TYPE_16B_BD_SIZE (0 << 4) + #define RX_BD_TYPE_32B_BD_SIZE (1 << 4) + #define RX_BD_TYPE_48B_BD_SIZE (2 << 4) + #define RX_BD_TYPE_64B_BD_SIZE (3 << 4) + #define RX_BD_FLAGS_SOP (1 << 6) + #define RX_BD_FLAGS_EOP (1 << 7) + #define RX_BD_FLAGS_BUFFERS (3 << 8) + #define RX_BD_FLAGS_1_BUFFER_PACKET (0 << 8) + #define RX_BD_FLAGS_2_BUFFER_PACKET (1 << 8) + #define RX_BD_FLAGS_3_BUFFER_PACKET (2 << 8) + #define RX_BD_FLAGS_4_BUFFER_PACKET (3 << 8) + #define RX_BD_LEN (0xffff << 16) + #define RX_BD_LEN_SHIFT 16 + + u32 rx_bd_opaque; + __le64 rx_bd_haddr; +}; + +struct tx_cmp { + __le32 tx_cmp_flags_type; + #define CMP_TYPE (0x3f << 0) + #define CMP_TYPE_TX_L2_CMP 0 + #define CMP_TYPE_RX_L2_CMP 17 + #define CMP_TYPE_RX_AGG_CMP 18 + #define CMP_TYPE_RX_L2_TPA_START_CMP 19 + #define CMP_TYPE_RX_L2_TPA_END_CMP 21 + #define CMP_TYPE_STATUS_CMP 32 + #define CMP_TYPE_REMOTE_DRIVER_REQ 34 + #define CMP_TYPE_REMOTE_DRIVER_RESP 36 + #define CMP_TYPE_ERROR_STATUS 48 + #define CMPL_BASE_TYPE_STAT_EJECT 0x1aUL + #define CMPL_BASE_TYPE_HWRM_DONE 0x20UL + #define CMPL_BASE_TYPE_HWRM_FWD_REQ 0x22UL + #define CMPL_BASE_TYPE_HWRM_FWD_RESP 0x24UL + #define CMPL_BASE_TYPE_HWRM_ASYNC_EVENT 0x2eUL + + #define TX_CMP_FLAGS_ERROR (1 << 6) + #define TX_CMP_FLAGS_PUSH (1 << 7) + + u32 tx_cmp_opaque; + __le32 tx_cmp_errors_v; + #define TX_CMP_V (1 << 0) + #define TX_CMP_ERRORS_BUFFER_ERROR (7 << 1) + #define TX_CMP_ERRORS_BUFFER_ERROR_NO_ERROR 0 + #define TX_CMP_ERRORS_BUFFER_ERROR_BAD_FORMAT 2 + #define TX_CMP_ERRORS_BUFFER_ERROR_INVALID_STAG 4 + #define TX_CMP_ERRORS_BUFFER_ERROR_STAG_BOUNDS 5 + #define TX_CMP_ERRORS_ZERO_LENGTH_PKT (1 << 4) + #define TX_CMP_ERRORS_EXCESSIVE_BD_LEN (1 << 5) + #define TX_CMP_ERRORS_DMA_ERROR (1 << 6) + #define TX_CMP_ERRORS_HINT_TOO_SHORT (1 << 7) + + __le32 tx_cmp_unsed_3; +}; + +struct rx_cmp { + __le32 rx_cmp_len_flags_type; + #define RX_CMP_CMP_TYPE (0x3f << 0) + #define RX_CMP_FLAGS_ERROR (1 << 6) + #define RX_CMP_FLAGS_PLACEMENT (7 << 7) + #define RX_CMP_FLAGS_RSS_VALID (1 << 10) + #define RX_CMP_FLAGS_UNUSED (1 << 11) + #define RX_CMP_FLAGS_ITYPES_SHIFT 12 + #define RX_CMP_FLAGS_ITYPE_UNKNOWN (0 << 12) + #define RX_CMP_FLAGS_ITYPE_IP (1 << 12) + #define RX_CMP_FLAGS_ITYPE_TCP (2 << 12) + #define RX_CMP_FLAGS_ITYPE_UDP (3 << 12) + #define RX_CMP_FLAGS_ITYPE_FCOE (4 << 12) + #define RX_CMP_FLAGS_ITYPE_ROCE (5 << 12) + #define RX_CMP_FLAGS_ITYPE_PTP_WO_TS (8 << 12) + #define RX_CMP_FLAGS_ITYPE_PTP_W_TS (9 << 12) + #define RX_CMP_LEN (0xffff << 16) + #define RX_CMP_LEN_SHIFT 16 + + u32 rx_cmp_opaque; + __le32 rx_cmp_misc_v1; + #define RX_CMP_V1 (1 << 0) + #define RX_CMP_AGG_BUFS (0x1f << 1) + #define RX_CMP_AGG_BUFS_SHIFT 1 + #define RX_CMP_RSS_HASH_TYPE (0x7f << 9) + #define RX_CMP_RSS_HASH_TYPE_SHIFT 9 + #define RX_CMP_PAYLOAD_OFFSET (0xff << 16) + #define RX_CMP_PAYLOAD_OFFSET_SHIFT 16 + + __le32 rx_cmp_rss_hash; +}; + +#define RX_CMP_HASH_VALID(rxcmp) \ + ((rxcmp)->rx_cmp_len_flags_type & cpu_to_le32(RX_CMP_FLAGS_RSS_VALID)) + +#define RSS_PROFILE_ID_MASK 0x1f + +#define RX_CMP_HASH_TYPE(rxcmp) \ + (((le32_to_cpu((rxcmp)->rx_cmp_misc_v1) & RX_CMP_RSS_HASH_TYPE) >>\ + RX_CMP_RSS_HASH_TYPE_SHIFT) & RSS_PROFILE_ID_MASK) + +struct rx_cmp_ext { + __le32 rx_cmp_flags2; + #define RX_CMP_FLAGS2_IP_CS_CALC 0x1 + #define RX_CMP_FLAGS2_L4_CS_CALC (0x1 << 1) + #define RX_CMP_FLAGS2_T_IP_CS_CALC (0x1 << 2) + 
#define RX_CMP_FLAGS2_T_L4_CS_CALC (0x1 << 3) + #define RX_CMP_FLAGS2_META_FORMAT_VLAN (0x1 << 4) + __le32 rx_cmp_meta_data; + #define RX_CMP_FLAGS2_METADATA_TCI_MASK 0xffff + #define RX_CMP_FLAGS2_METADATA_VID_MASK 0xfff + #define RX_CMP_FLAGS2_METADATA_TPID_MASK 0xffff0000 + #define RX_CMP_FLAGS2_METADATA_TPID_SFT 16 + __le32 rx_cmp_cfa_code_errors_v2; + #define RX_CMP_V (1 << 0) + #define RX_CMPL_ERRORS_MASK (0x7fff << 1) + #define RX_CMPL_ERRORS_SFT 1 + #define RX_CMPL_ERRORS_BUFFER_ERROR_MASK (0x7 << 1) + #define RX_CMPL_ERRORS_BUFFER_ERROR_NO_BUFFER (0x0 << 1) + #define RX_CMPL_ERRORS_BUFFER_ERROR_DID_NOT_FIT (0x1 << 1) + #define RX_CMPL_ERRORS_BUFFER_ERROR_NOT_ON_CHIP (0x2 << 1) + #define RX_CMPL_ERRORS_BUFFER_ERROR_BAD_FORMAT (0x3 << 1) + #define RX_CMPL_ERRORS_IP_CS_ERROR (0x1 << 4) + #define RX_CMPL_ERRORS_L4_CS_ERROR (0x1 << 5) + #define RX_CMPL_ERRORS_T_IP_CS_ERROR (0x1 << 6) + #define RX_CMPL_ERRORS_T_L4_CS_ERROR (0x1 << 7) + #define RX_CMPL_ERRORS_CRC_ERROR (0x1 << 8) + #define RX_CMPL_ERRORS_T_PKT_ERROR_MASK (0x7 << 9) + #define RX_CMPL_ERRORS_T_PKT_ERROR_NO_ERROR (0x0 << 9) + #define RX_CMPL_ERRORS_T_PKT_ERROR_T_L3_BAD_VERSION (0x1 << 9) + #define RX_CMPL_ERRORS_T_PKT_ERROR_T_L3_BAD_HDR_LEN (0x2 << 9) + #define RX_CMPL_ERRORS_T_PKT_ERROR_TUNNEL_TOTAL_ERROR (0x3 << 9) + #define RX_CMPL_ERRORS_T_PKT_ERROR_T_IP_TOTAL_ERROR (0x4 << 9) + #define RX_CMPL_ERRORS_T_PKT_ERROR_T_UDP_TOTAL_ERROR (0x5 << 9) + #define RX_CMPL_ERRORS_T_PKT_ERROR_T_L3_BAD_TTL (0x6 << 9) + #define RX_CMPL_ERRORS_PKT_ERROR_MASK (0xf << 12) + #define RX_CMPL_ERRORS_PKT_ERROR_NO_ERROR (0x0 << 12) + #define RX_CMPL_ERRORS_PKT_ERROR_L3_BAD_VERSION (0x1 << 12) + #define RX_CMPL_ERRORS_PKT_ERROR_L3_BAD_HDR_LEN (0x2 << 12) + #define RX_CMPL_ERRORS_PKT_ERROR_L3_BAD_TTL (0x3 << 12) + #define RX_CMPL_ERRORS_PKT_ERROR_IP_TOTAL_ERROR (0x4 << 12) + #define RX_CMPL_ERRORS_PKT_ERROR_UDP_TOTAL_ERROR (0x5 << 12) + #define RX_CMPL_ERRORS_PKT_ERROR_L4_BAD_HDR_LEN (0x6 << 12) + #define RX_CMPL_ERRORS_PKT_ERROR_L4_BAD_HDR_LEN_TOO_SMALL (0x7 << 12) + #define RX_CMPL_ERRORS_PKT_ERROR_L4_BAD_OPT_LEN (0x8 << 12) + + #define RX_CMPL_CFA_CODE_MASK (0xffff << 16) + #define RX_CMPL_CFA_CODE_SFT 16 + + __le32 rx_cmp_unused3; +}; + +#define RX_CMP_L2_ERRORS \ + cpu_to_le32(RX_CMPL_ERRORS_BUFFER_ERROR_MASK | RX_CMPL_ERRORS_CRC_ERROR) + +#define RX_CMP_L4_CS_BITS \ + (cpu_to_le32(RX_CMP_FLAGS2_L4_CS_CALC | RX_CMP_FLAGS2_T_L4_CS_CALC)) + +#define RX_CMP_L4_CS_ERR_BITS \ + (cpu_to_le32(RX_CMPL_ERRORS_L4_CS_ERROR | RX_CMPL_ERRORS_T_L4_CS_ERROR)) + +#define RX_CMP_L4_CS_OK(rxcmp1) \ + (((rxcmp1)->rx_cmp_flags2 & RX_CMP_L4_CS_BITS) && \ + !((rxcmp1)->rx_cmp_cfa_code_errors_v2 & RX_CMP_L4_CS_ERR_BITS)) + +#define RX_CMP_ENCAP(rxcmp1) \ + ((le32_to_cpu((rxcmp1)->rx_cmp_flags2) & \ + RX_CMP_FLAGS2_T_L4_CS_CALC) >> 3) + +#define RX_CMP_CFA_CODE(rxcmpl1) \ + ((le32_to_cpu((rxcmpl1)->rx_cmp_cfa_code_errors_v2) & \ + RX_CMPL_CFA_CODE_MASK) >> RX_CMPL_CFA_CODE_SFT) + +struct rx_agg_cmp { + __le32 rx_agg_cmp_len_flags_type; + #define RX_AGG_CMP_TYPE (0x3f << 0) + #define RX_AGG_CMP_LEN (0xffff << 16) + #define RX_AGG_CMP_LEN_SHIFT 16 + u32 rx_agg_cmp_opaque; + __le32 rx_agg_cmp_v; + #define RX_AGG_CMP_V (1 << 0) + __le32 rx_agg_cmp_unused; +}; + +struct rx_tpa_start_cmp { + __le32 rx_tpa_start_cmp_len_flags_type; + #define RX_TPA_START_CMP_TYPE (0x3f << 0) + #define RX_TPA_START_CMP_FLAGS (0x3ff << 6) + #define RX_TPA_START_CMP_FLAGS_SHIFT 6 + #define RX_TPA_START_CMP_FLAGS_PLACEMENT (0x7 << 7) + #define RX_TPA_START_CMP_FLAGS_PLACEMENT_SHIFT 7 + 
#define RX_TPA_START_CMP_FLAGS_PLACEMENT_JUMBO (0x1 << 7) + #define RX_TPA_START_CMP_FLAGS_PLACEMENT_HDS (0x2 << 7) + #define RX_TPA_START_CMP_FLAGS_PLACEMENT_GRO_JUMBO (0x5 << 7) + #define RX_TPA_START_CMP_FLAGS_PLACEMENT_GRO_HDS (0x6 << 7) + #define RX_TPA_START_CMP_FLAGS_RSS_VALID (0x1 << 10) + #define RX_TPA_START_CMP_FLAGS_ITYPES (0xf << 12) + #define RX_TPA_START_CMP_FLAGS_ITYPES_SHIFT 12 + #define RX_TPA_START_CMP_FLAGS_ITYPE_TCP (0x2 << 12) + #define RX_TPA_START_CMP_LEN (0xffff << 16) + #define RX_TPA_START_CMP_LEN_SHIFT 16 + + u32 rx_tpa_start_cmp_opaque; + __le32 rx_tpa_start_cmp_misc_v1; + #define RX_TPA_START_CMP_V1 (0x1 << 0) + #define RX_TPA_START_CMP_RSS_HASH_TYPE (0x7f << 9) + #define RX_TPA_START_CMP_RSS_HASH_TYPE_SHIFT 9 + #define RX_TPA_START_CMP_AGG_ID (0x7f << 25) + #define RX_TPA_START_CMP_AGG_ID_SHIFT 25 + + __le32 rx_tpa_start_cmp_rss_hash; +}; + +#define TPA_START_HASH_VALID(rx_tpa_start) \ + ((rx_tpa_start)->rx_tpa_start_cmp_len_flags_type & \ + cpu_to_le32(RX_TPA_START_CMP_FLAGS_RSS_VALID)) + +#define TPA_START_HASH_TYPE(rx_tpa_start) \ + (((le32_to_cpu((rx_tpa_start)->rx_tpa_start_cmp_misc_v1) & \ + RX_TPA_START_CMP_RSS_HASH_TYPE) >> \ + RX_TPA_START_CMP_RSS_HASH_TYPE_SHIFT) & RSS_PROFILE_ID_MASK) + +#define TPA_START_AGG_ID(rx_tpa_start) \ + ((le32_to_cpu((rx_tpa_start)->rx_tpa_start_cmp_misc_v1) & \ + RX_TPA_START_CMP_AGG_ID) >> RX_TPA_START_CMP_AGG_ID_SHIFT) + +struct rx_tpa_start_cmp_ext { + __le32 rx_tpa_start_cmp_flags2; + #define RX_TPA_START_CMP_FLAGS2_IP_CS_CALC (0x1 << 0) + #define RX_TPA_START_CMP_FLAGS2_L4_CS_CALC (0x1 << 1) + #define RX_TPA_START_CMP_FLAGS2_T_IP_CS_CALC (0x1 << 2) + #define RX_TPA_START_CMP_FLAGS2_T_L4_CS_CALC (0x1 << 3) + #define RX_TPA_START_CMP_FLAGS2_IP_TYPE (0x1 << 8) + + __le32 rx_tpa_start_cmp_metadata; + __le32 rx_tpa_start_cmp_cfa_code_v2; + #define RX_TPA_START_CMP_V2 (0x1 << 0) + #define RX_TPA_START_CMP_CFA_CODE (0xffff << 16) + #define RX_TPA_START_CMPL_CFA_CODE_SHIFT 16 + __le32 rx_tpa_start_cmp_hdr_info; +}; + +#define TPA_START_CFA_CODE(rx_tpa_start) \ + ((le32_to_cpu((rx_tpa_start)->rx_tpa_start_cmp_cfa_code_v2) & \ + RX_TPA_START_CMP_CFA_CODE) >> RX_TPA_START_CMPL_CFA_CODE_SHIFT) + +struct rx_tpa_end_cmp { + __le32 rx_tpa_end_cmp_len_flags_type; + #define RX_TPA_END_CMP_TYPE (0x3f << 0) + #define RX_TPA_END_CMP_FLAGS (0x3ff << 6) + #define RX_TPA_END_CMP_FLAGS_SHIFT 6 + #define RX_TPA_END_CMP_FLAGS_PLACEMENT (0x7 << 7) + #define RX_TPA_END_CMP_FLAGS_PLACEMENT_SHIFT 7 + #define RX_TPA_END_CMP_FLAGS_PLACEMENT_JUMBO (0x1 << 7) + #define RX_TPA_END_CMP_FLAGS_PLACEMENT_HDS (0x2 << 7) + #define RX_TPA_END_CMP_FLAGS_PLACEMENT_GRO_JUMBO (0x5 << 7) + #define RX_TPA_END_CMP_FLAGS_PLACEMENT_GRO_HDS (0x6 << 7) + #define RX_TPA_END_CMP_FLAGS_RSS_VALID (0x1 << 10) + #define RX_TPA_END_CMP_FLAGS_ITYPES (0xf << 12) + #define RX_TPA_END_CMP_FLAGS_ITYPES_SHIFT 12 + #define RX_TPA_END_CMP_FLAGS_ITYPE_TCP (0x2 << 12) + #define RX_TPA_END_CMP_LEN (0xffff << 16) + #define RX_TPA_END_CMP_LEN_SHIFT 16 + + u32 rx_tpa_end_cmp_opaque; + __le32 rx_tpa_end_cmp_misc_v1; + #define RX_TPA_END_CMP_V1 (0x1 << 0) + #define RX_TPA_END_CMP_AGG_BUFS (0x3f << 1) + #define RX_TPA_END_CMP_AGG_BUFS_SHIFT 1 + #define RX_TPA_END_CMP_TPA_SEGS (0xff << 8) + #define RX_TPA_END_CMP_TPA_SEGS_SHIFT 8 + #define RX_TPA_END_CMP_PAYLOAD_OFFSET (0xff << 16) + #define RX_TPA_END_CMP_PAYLOAD_OFFSET_SHIFT 16 + #define RX_TPA_END_CMP_AGG_ID (0x7f << 25) + #define RX_TPA_END_CMP_AGG_ID_SHIFT 25 + + __le32 rx_tpa_end_cmp_tsdelta; + #define RX_TPA_END_GRO_TS (0x1 << 31) +}; 
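The TPA and RX completion accessors defined below (TPA_END_AGG_ID(), TPA_END_TPA_SEGS(), and the earlier RX_CMP_*/TPA_START_* macros) all read a descriptor field the same way: convert the little-endian completion word to CPU byte order, mask out the field, then shift it down. A minimal sketch of that pattern, assuming the usual kernel __le32/u32 types and the le32_to_cpu() helper; bnxt_le32_get_field() is a hypothetical name used only for illustration and is not part of this driver:

static inline u32 bnxt_le32_get_field(__le32 raw, u32 mask, u32 shift)
{
	/* Illustrative helper only, not part of bnxt_en. Example arguments:
	 * raw   = rx_tpa_end_cmp_misc_v1,
	 * mask  = RX_TPA_END_CMP_AGG_ID,
	 * shift = RX_TPA_END_CMP_AGG_ID_SHIFT,
	 * which makes the call equivalent to the TPA_END_AGG_ID() macro below.
	 */
	return (le32_to_cpu(raw) & mask) >> shift;
}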
+ +#define TPA_END_AGG_ID(rx_tpa_end) \ + ((le32_to_cpu((rx_tpa_end)->rx_tpa_end_cmp_misc_v1) & \ + RX_TPA_END_CMP_AGG_ID) >> RX_TPA_END_CMP_AGG_ID_SHIFT) + +#define TPA_END_TPA_SEGS(rx_tpa_end) \ + ((le32_to_cpu((rx_tpa_end)->rx_tpa_end_cmp_misc_v1) & \ + RX_TPA_END_CMP_TPA_SEGS) >> RX_TPA_END_CMP_TPA_SEGS_SHIFT) + +#define RX_TPA_END_CMP_FLAGS_PLACEMENT_ANY_GRO \ + cpu_to_le32(RX_TPA_END_CMP_FLAGS_PLACEMENT_GRO_JUMBO & \ + RX_TPA_END_CMP_FLAGS_PLACEMENT_GRO_HDS) + +#define TPA_END_GRO(rx_tpa_end) \ + ((rx_tpa_end)->rx_tpa_end_cmp_len_flags_type & \ + RX_TPA_END_CMP_FLAGS_PLACEMENT_ANY_GRO) + +#define TPA_END_GRO_TS(rx_tpa_end) \ + (!!((rx_tpa_end)->rx_tpa_end_cmp_tsdelta & \ + cpu_to_le32(RX_TPA_END_GRO_TS))) + +struct rx_tpa_end_cmp_ext { + __le32 rx_tpa_end_cmp_dup_acks; + #define RX_TPA_END_CMP_TPA_DUP_ACKS (0xf << 0) + + __le32 rx_tpa_end_cmp_seg_len; + #define RX_TPA_END_CMP_TPA_SEG_LEN (0xffff << 0) + + __le32 rx_tpa_end_cmp_errors_v2; + #define RX_TPA_END_CMP_V2 (0x1 << 0) + #define RX_TPA_END_CMP_ERRORS (0x3 << 1) + #define RX_TPA_END_CMPL_ERRORS_SHIFT 1 + + u32 rx_tpa_end_cmp_start_opaque; +}; + +#define TPA_END_ERRORS(rx_tpa_end_ext) \ + ((rx_tpa_end_ext)->rx_tpa_end_cmp_errors_v2 & \ + cpu_to_le32(RX_TPA_END_CMP_ERRORS)) + +#define DB_IDX_MASK 0xffffff +#define DB_IDX_VALID (0x1 << 26) +#define DB_IRQ_DIS (0x1 << 27) +#define DB_KEY_TX (0x0 << 28) +#define DB_KEY_RX (0x1 << 28) +#define DB_KEY_CP (0x2 << 28) +#define DB_KEY_ST (0x3 << 28) +#define DB_KEY_TX_PUSH (0x4 << 28) +#define DB_LONG_TX_PUSH (0x2 << 24) + +#define BNXT_MIN_ROCE_CP_RINGS 2 +#define BNXT_MIN_ROCE_STAT_CTXS 1 + +#define INVALID_HW_RING_ID ((u16)-1) + +/* The hardware supports certain page sizes. Use the supported page sizes + * to allocate the rings. + */ +#if (PAGE_SHIFT < 12) +#define BNXT_PAGE_SHIFT 12 +#elif (PAGE_SHIFT <= 13) +#define BNXT_PAGE_SHIFT PAGE_SHIFT +#elif (PAGE_SHIFT < 16) +#define BNXT_PAGE_SHIFT 13 +#else +#define BNXT_PAGE_SHIFT 16 +#endif + +#define BNXT_PAGE_SIZE (1 << BNXT_PAGE_SHIFT) + +/* The RXBD length is 16-bit so we can only support page sizes < 64K */ +#if (PAGE_SHIFT > 15) +#define BNXT_RX_PAGE_SHIFT 15 +#else +#define BNXT_RX_PAGE_SHIFT PAGE_SHIFT +#endif + +#define BNXT_RX_PAGE_SIZE (1 << BNXT_RX_PAGE_SHIFT) + +#define BNXT_MAX_MTU 9500 +#define BNXT_MAX_PAGE_MODE_MTU \ + ((unsigned int)PAGE_SIZE - VLAN_ETH_HLEN - NET_IP_ALIGN - \ + XDP_PACKET_HEADROOM) + +#define BNXT_MIN_PKT_SIZE 52 + +#define BNXT_DEFAULT_RX_RING_SIZE 511 +#define BNXT_DEFAULT_TX_RING_SIZE 511 + +#define MAX_TPA 64 + +#if (BNXT_PAGE_SHIFT == 16) +#define MAX_RX_PAGES 1 +#define MAX_RX_AGG_PAGES 4 +#define MAX_TX_PAGES 1 +#define MAX_CP_PAGES 8 +#else +#define MAX_RX_PAGES 8 +#define MAX_RX_AGG_PAGES 32 +#define MAX_TX_PAGES 8 +#define MAX_CP_PAGES 64 +#endif + +#define RX_DESC_CNT (BNXT_PAGE_SIZE / sizeof(struct rx_bd)) +#define TX_DESC_CNT (BNXT_PAGE_SIZE / sizeof(struct tx_bd)) +#define CP_DESC_CNT (BNXT_PAGE_SIZE / sizeof(struct tx_cmp)) + +#define SW_RXBD_RING_SIZE (sizeof(struct bnxt_sw_rx_bd) * RX_DESC_CNT) +#define HW_RXBD_RING_SIZE (sizeof(struct rx_bd) * RX_DESC_CNT) + +#define SW_RXBD_AGG_RING_SIZE (sizeof(struct bnxt_sw_rx_agg_bd) * RX_DESC_CNT) + +#define SW_TXBD_RING_SIZE (sizeof(struct bnxt_sw_tx_bd) * TX_DESC_CNT) +#define HW_TXBD_RING_SIZE (sizeof(struct tx_bd) * TX_DESC_CNT) + +#define HW_CMPD_RING_SIZE (sizeof(struct tx_cmp) * CP_DESC_CNT) + +#define BNXT_MAX_RX_DESC_CNT (RX_DESC_CNT * MAX_RX_PAGES - 1) +#define BNXT_MAX_RX_JUM_DESC_CNT (RX_DESC_CNT * MAX_RX_AGG_PAGES - 1) +#define 
BNXT_MAX_TX_DESC_CNT (TX_DESC_CNT * MAX_TX_PAGES - 1) + +#define RX_RING(x) (((x) & ~(RX_DESC_CNT - 1)) >> (BNXT_PAGE_SHIFT - 4)) +#define RX_IDX(x) ((x) & (RX_DESC_CNT - 1)) + +#define TX_RING(x) (((x) & ~(TX_DESC_CNT - 1)) >> (BNXT_PAGE_SHIFT - 4)) +#define TX_IDX(x) ((x) & (TX_DESC_CNT - 1)) + +#define CP_RING(x) (((x) & ~(CP_DESC_CNT - 1)) >> (BNXT_PAGE_SHIFT - 4)) +#define CP_IDX(x) ((x) & (CP_DESC_CNT - 1)) + +#define TX_CMP_VALID(txcmp, raw_cons) \ + (!!((txcmp)->tx_cmp_errors_v & cpu_to_le32(TX_CMP_V)) == \ + !((raw_cons) & bp->cp_bit)) + +#define RX_CMP_VALID(rxcmp1, raw_cons) \ + (!!((rxcmp1)->rx_cmp_cfa_code_errors_v2 & cpu_to_le32(RX_CMP_V)) ==\ + !((raw_cons) & bp->cp_bit)) + +#define RX_AGG_CMP_VALID(agg, raw_cons) \ + (!!((agg)->rx_agg_cmp_v & cpu_to_le32(RX_AGG_CMP_V)) == \ + !((raw_cons) & bp->cp_bit)) + +#define TX_CMP_TYPE(txcmp) \ + (le32_to_cpu((txcmp)->tx_cmp_flags_type) & CMP_TYPE) + +#define RX_CMP_TYPE(rxcmp) \ + (le32_to_cpu((rxcmp)->rx_cmp_len_flags_type) & RX_CMP_CMP_TYPE) + +#define NEXT_RX(idx) (((idx) + 1) & bp->rx_ring_mask) + +#define NEXT_RX_AGG(idx) (((idx) + 1) & bp->rx_agg_ring_mask) + +#define NEXT_TX(idx) (((idx) + 1) & bp->tx_ring_mask) + +#define ADV_RAW_CMP(idx, n) ((idx) + (n)) +#define NEXT_RAW_CMP(idx) ADV_RAW_CMP(idx, 1) +#define RING_CMP(idx) ((idx) & bp->cp_ring_mask) +#define NEXT_CMP(idx) RING_CMP(ADV_RAW_CMP(idx, 1)) + +#define BNXT_HWRM_MAX_REQ_LEN (bp->hwrm_max_req_len) +#define BNXT_HWRM_SHORT_REQ_LEN sizeof(struct hwrm_short_input) +#define DFLT_HWRM_CMD_TIMEOUT 500 +#define HWRM_CMD_TIMEOUT (bp->hwrm_cmd_timeout) +#define HWRM_RESET_TIMEOUT ((HWRM_CMD_TIMEOUT) * 4) +#define HWRM_RESP_ERR_CODE_MASK 0xffff +#define HWRM_RESP_LEN_OFFSET 4 +#define HWRM_RESP_LEN_MASK 0xffff0000 +#define HWRM_RESP_LEN_SFT 16 +#define HWRM_RESP_VALID_MASK 0xff000000 +#define HWRM_SEQ_ID_INVALID -1 +#define BNXT_HWRM_REQ_MAX_SIZE 128 +#define BNXT_HWRM_REQS_PER_PAGE (BNXT_PAGE_SIZE / \ + BNXT_HWRM_REQ_MAX_SIZE) + +#define BNXT_RX_EVENT 1 +#define BNXT_AGG_EVENT 2 +#define BNXT_TX_EVENT 4 + +struct bnxt_sw_tx_bd { + struct sk_buff *skb; + DEFINE_DMA_UNMAP_ADDR(mapping); + u8 is_gso; + u8 is_push; + union { + unsigned short nr_frags; + u16 rx_prod; + }; +}; + +struct bnxt_sw_rx_bd { + void *data; + u8 *data_ptr; + dma_addr_t mapping; +}; + +struct bnxt_sw_rx_agg_bd { + struct page *page; + unsigned int offset; + dma_addr_t mapping; +}; + +struct bnxt_ring_struct { + int nr_pages; + int page_size; + void **pg_arr; + dma_addr_t *dma_arr; + + __le64 *pg_tbl; + dma_addr_t pg_tbl_map; + + int vmem_size; + void **vmem; + + u16 fw_ring_id; /* Ring id filled by Chimp FW */ + union { + u16 grp_idx; + u16 map_idx; /* Used by cmpl rings */ + }; + u8 queue_id; +}; + +struct tx_push_bd { + __le32 doorbell; + __le32 tx_bd_len_flags_type; + u32 tx_bd_opaque; + struct tx_bd_ext txbd2; +}; + +struct tx_push_buffer { + struct tx_push_bd push_bd; + u32 data[25]; +}; + +struct bnxt_tx_ring_info { + struct bnxt_napi *bnapi; + u16 tx_prod; + u16 tx_cons; + u16 txq_index; + void __iomem *tx_doorbell; + + struct tx_bd *tx_desc_ring[MAX_TX_PAGES]; + struct bnxt_sw_tx_bd *tx_buf_ring; + + dma_addr_t tx_desc_mapping[MAX_TX_PAGES]; + + struct tx_push_buffer *tx_push; + dma_addr_t tx_push_mapping; + __le64 data_mapping; + +#define BNXT_DEV_STATE_CLOSING 0x1 + u32 dev_state; + + struct bnxt_ring_struct tx_ring_struct; +}; + +struct bnxt_coal { + u16 coal_ticks; + u16 coal_ticks_irq; + u16 coal_bufs; + u16 coal_bufs_irq; + /* RING_IDLE enabled when coal ticks < idle_thresh */ + u16 
idle_thresh; + u8 bufs_per_record; + u8 budget; +}; + +struct bnxt_tpa_info { + void *data; + u8 *data_ptr; + dma_addr_t mapping; + u16 len; + unsigned short gso_type; + u32 flags2; + u32 metadata; + enum pkt_hash_types hash_type; + u32 rss_hash; + u32 hdr_info; + +#define BNXT_TPA_L4_SIZE(hdr_info) \ + (((hdr_info) & 0xf8000000) ? ((hdr_info) >> 27) : 32) + +#define BNXT_TPA_INNER_L3_OFF(hdr_info) \ + (((hdr_info) >> 18) & 0x1ff) + +#define BNXT_TPA_INNER_L2_OFF(hdr_info) \ + (((hdr_info) >> 9) & 0x1ff) + +#define BNXT_TPA_OUTER_L3_OFF(hdr_info) \ + ((hdr_info) & 0x1ff) + + u16 cfa_code; /* cfa_code in TPA start compl */ +}; + +struct bnxt_rx_ring_info { + struct bnxt_napi *bnapi; + u16 rx_prod; + u16 rx_agg_prod; + u16 rx_sw_agg_prod; + u16 rx_next_cons; + void __iomem *rx_doorbell; + void __iomem *rx_agg_doorbell; + + struct bpf_prog *xdp_prog; + + struct rx_bd *rx_desc_ring[MAX_RX_PAGES]; + struct bnxt_sw_rx_bd *rx_buf_ring; + + struct rx_bd *rx_agg_desc_ring[MAX_RX_AGG_PAGES]; + struct bnxt_sw_rx_agg_bd *rx_agg_ring; + + unsigned long *rx_agg_bmap; + u16 rx_agg_bmap_size; + + struct page *rx_page; + unsigned int rx_page_offset; + + dma_addr_t rx_desc_mapping[MAX_RX_PAGES]; + dma_addr_t rx_agg_desc_mapping[MAX_RX_AGG_PAGES]; + + struct bnxt_tpa_info *rx_tpa; + + struct bnxt_ring_struct rx_ring_struct; + struct bnxt_ring_struct rx_agg_ring_struct; + struct xdp_rxq_info xdp_rxq; +}; + +struct bnxt_cp_ring_info { + u32 cp_raw_cons; + void __iomem *cp_doorbell; + + struct bnxt_coal rx_ring_coal; + u64 rx_packets; + u64 rx_bytes; + u64 event_ctr; + + struct net_dim dim; + + struct tx_cmp *cp_desc_ring[MAX_CP_PAGES]; + + dma_addr_t cp_desc_mapping[MAX_CP_PAGES]; + + struct ctx_hw_stats *hw_stats; + dma_addr_t hw_stats_map; + u32 hw_stats_ctx_id; + u64 rx_l4_csum_errors; + + struct bnxt_ring_struct cp_ring_struct; +}; + +struct bnxt_napi { + struct napi_struct napi; + struct bnxt *bp; + + int index; + struct bnxt_cp_ring_info cp_ring; + struct bnxt_rx_ring_info *rx_ring; + struct bnxt_tx_ring_info *tx_ring; + + void (*tx_int)(struct bnxt *, struct bnxt_napi *, + int); + u32 flags; +#define BNXT_NAPI_FLAG_XDP 0x1 + + bool in_reset; +}; + +struct bnxt_irq { + irq_handler_t handler; + unsigned int vector; + u8 requested:1; + u8 have_cpumask:1; + char name[IFNAMSIZ + 2]; + cpumask_var_t cpu_mask; +}; + +#define HWRM_RING_ALLOC_TX 0x1 +#define HWRM_RING_ALLOC_RX 0x2 +#define HWRM_RING_ALLOC_AGG 0x4 +#define HWRM_RING_ALLOC_CMPL 0x8 + +#define INVALID_STATS_CTX_ID -1 + +struct bnxt_ring_grp_info { + u16 fw_stats_ctx; + u16 fw_grp_id; + u16 rx_fw_ring_id; + u16 agg_fw_ring_id; + u16 cp_fw_ring_id; +}; + +struct bnxt_vnic_info { + u16 fw_vnic_id; /* returned by Chimp during alloc */ +#define BNXT_MAX_CTX_PER_VNIC 2 + u16 fw_rss_cos_lb_ctx[BNXT_MAX_CTX_PER_VNIC]; + u16 fw_l2_ctx_id; +#define BNXT_MAX_UC_ADDRS 4 + __le64 fw_l2_filter_id[BNXT_MAX_UC_ADDRS]; + /* index 0 always dev_addr */ + u16 uc_filter_count; + u8 *uc_list; + + u16 *fw_grp_ids; + dma_addr_t rss_table_dma_addr; + __le16 *rss_table; + dma_addr_t rss_hash_key_dma_addr; + u64 *rss_hash_key; + u32 rx_mask; + + u8 *mc_list; + int mc_list_size; + int mc_list_count; + dma_addr_t mc_list_mapping; +#define BNXT_MAX_MC_ADDRS 16 + + u32 flags; +#define BNXT_VNIC_RSS_FLAG 1 +#define BNXT_VNIC_RFS_FLAG 2 +#define BNXT_VNIC_MCAST_FLAG 4 +#define BNXT_VNIC_UCAST_FLAG 8 +#define BNXT_VNIC_RFS_NEW_RSS_FLAG 0x10 +}; + +struct bnxt_hw_resc { + u16 min_rsscos_ctxs; + u16 max_rsscos_ctxs; + u16 min_cp_rings; + u16 max_cp_rings; + u16 resv_cp_rings; + u16 
min_tx_rings; + u16 max_tx_rings; + u16 resv_tx_rings; + u16 max_tx_sch_inputs; + u16 min_rx_rings; + u16 max_rx_rings; + u16 resv_rx_rings; + u16 min_hw_ring_grps; + u16 max_hw_ring_grps; + u16 resv_hw_ring_grps; + u16 min_l2_ctxs; + u16 max_l2_ctxs; + u16 min_vnics; + u16 max_vnics; + u16 resv_vnics; + u16 min_stat_ctxs; + u16 max_stat_ctxs; + u16 max_irqs; +}; + +#if defined(CONFIG_BNXT_SRIOV) +struct bnxt_vf_info { + u16 fw_fid; + u8 mac_addr[ETH_ALEN]; /* PF assigned MAC Address */ + u8 vf_mac_addr[ETH_ALEN]; /* VF assigned MAC address, only + * stored by PF. + */ + u16 vlan; + u32 flags; +#define BNXT_VF_QOS 0x1 +#define BNXT_VF_SPOOFCHK 0x2 +#define BNXT_VF_LINK_FORCED 0x4 +#define BNXT_VF_LINK_UP 0x8 +#define BNXT_VF_TRUST 0x10 + u32 func_flags; /* func cfg flags */ + u32 min_tx_rate; + u32 max_tx_rate; + void *hwrm_cmd_req_addr; + dma_addr_t hwrm_cmd_req_dma_addr; +}; +#endif + +struct bnxt_pf_info { +#define BNXT_FIRST_PF_FID 1 +#define BNXT_FIRST_VF_FID 128 + u16 fw_fid; + u16 port_id; + u8 mac_addr[ETH_ALEN]; + u32 first_vf_id; + u16 active_vfs; + u16 max_vfs; + u32 max_encap_records; + u32 max_decap_records; + u32 max_tx_em_flows; + u32 max_tx_wm_flows; + u32 max_rx_em_flows; + u32 max_rx_wm_flows; + unsigned long *vf_event_bmap; + u16 hwrm_cmd_req_pages; + u8 vf_resv_strategy; +#define BNXT_VF_RESV_STRATEGY_MAXIMAL 0 +#define BNXT_VF_RESV_STRATEGY_MINIMAL 1 + void *hwrm_cmd_req_addr[4]; + dma_addr_t hwrm_cmd_req_dma_addr[4]; + struct bnxt_vf_info *vf; +}; + +struct bnxt_ntuple_filter { + struct hlist_node hash; + u8 dst_mac_addr[ETH_ALEN]; + u8 src_mac_addr[ETH_ALEN]; + struct flow_keys fkeys; + __le64 filter_id; + u16 sw_id; + u8 l2_fltr_idx; + u16 rxq; + u32 flow_id; + unsigned long state; +#define BNXT_FLTR_VALID 0 +#define BNXT_FLTR_UPDATE 1 +}; + +struct bnxt_link_info { + u8 phy_type; + u8 media_type; + u8 transceiver; + u8 phy_addr; + u8 phy_link_status; +#define BNXT_LINK_NO_LINK PORT_PHY_QCFG_RESP_LINK_NO_LINK +#define BNXT_LINK_SIGNAL PORT_PHY_QCFG_RESP_LINK_SIGNAL +#define BNXT_LINK_LINK PORT_PHY_QCFG_RESP_LINK_LINK + u8 wire_speed; + u8 loop_back; + u8 link_up; + u8 duplex; +#define BNXT_LINK_DUPLEX_HALF PORT_PHY_QCFG_RESP_DUPLEX_STATE_HALF +#define BNXT_LINK_DUPLEX_FULL PORT_PHY_QCFG_RESP_DUPLEX_STATE_FULL + u8 pause; +#define BNXT_LINK_PAUSE_TX PORT_PHY_QCFG_RESP_PAUSE_TX +#define BNXT_LINK_PAUSE_RX PORT_PHY_QCFG_RESP_PAUSE_RX +#define BNXT_LINK_PAUSE_BOTH (PORT_PHY_QCFG_RESP_PAUSE_RX | \ + PORT_PHY_QCFG_RESP_PAUSE_TX) + u8 lp_pause; + u8 auto_pause_setting; + u8 force_pause_setting; + u8 duplex_setting; + u8 auto_mode; +#define BNXT_AUTO_MODE(mode) ((mode) > BNXT_LINK_AUTO_NONE && \ + (mode) <= BNXT_LINK_AUTO_MSK) +#define BNXT_LINK_AUTO_NONE PORT_PHY_QCFG_RESP_AUTO_MODE_NONE +#define BNXT_LINK_AUTO_ALLSPDS PORT_PHY_QCFG_RESP_AUTO_MODE_ALL_SPEEDS +#define BNXT_LINK_AUTO_ONESPD PORT_PHY_QCFG_RESP_AUTO_MODE_ONE_SPEED +#define BNXT_LINK_AUTO_ONEORBELOW PORT_PHY_QCFG_RESP_AUTO_MODE_ONE_OR_BELOW +#define BNXT_LINK_AUTO_MSK PORT_PHY_QCFG_RESP_AUTO_MODE_SPEED_MASK +#define PHY_VER_LEN 3 + u8 phy_ver[PHY_VER_LEN]; + u16 link_speed; +#define BNXT_LINK_SPEED_100MB PORT_PHY_QCFG_RESP_LINK_SPEED_100MB +#define BNXT_LINK_SPEED_1GB PORT_PHY_QCFG_RESP_LINK_SPEED_1GB +#define BNXT_LINK_SPEED_2GB PORT_PHY_QCFG_RESP_LINK_SPEED_2GB +#define BNXT_LINK_SPEED_2_5GB PORT_PHY_QCFG_RESP_LINK_SPEED_2_5GB +#define BNXT_LINK_SPEED_10GB PORT_PHY_QCFG_RESP_LINK_SPEED_10GB +#define BNXT_LINK_SPEED_20GB PORT_PHY_QCFG_RESP_LINK_SPEED_20GB +#define BNXT_LINK_SPEED_25GB 
PORT_PHY_QCFG_RESP_LINK_SPEED_25GB +#define BNXT_LINK_SPEED_40GB PORT_PHY_QCFG_RESP_LINK_SPEED_40GB +#define BNXT_LINK_SPEED_50GB PORT_PHY_QCFG_RESP_LINK_SPEED_50GB +#define BNXT_LINK_SPEED_100GB PORT_PHY_QCFG_RESP_LINK_SPEED_100GB + u16 support_speeds; + u16 auto_link_speeds; /* fw adv setting */ +#define BNXT_LINK_SPEED_MSK_100MB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100MB +#define BNXT_LINK_SPEED_MSK_1GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_1GB +#define BNXT_LINK_SPEED_MSK_2GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2GB +#define BNXT_LINK_SPEED_MSK_10GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_10GB +#define BNXT_LINK_SPEED_MSK_2_5GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2_5GB +#define BNXT_LINK_SPEED_MSK_20GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_20GB +#define BNXT_LINK_SPEED_MSK_25GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_25GB +#define BNXT_LINK_SPEED_MSK_40GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_40GB +#define BNXT_LINK_SPEED_MSK_50GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_50GB +#define BNXT_LINK_SPEED_MSK_100GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100GB + u16 support_auto_speeds; + u16 lp_auto_link_speeds; + u16 force_link_speed; + u32 preemphasis; + u8 module_status; + u16 fec_cfg; +#define BNXT_FEC_AUTONEG PORT_PHY_QCFG_RESP_FEC_CFG_FEC_AUTONEG_ENABLED +#define BNXT_FEC_ENC_BASE_R PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE74_ENABLED +#define BNXT_FEC_ENC_RS PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE91_ENABLED + + /* copy of requested setting from ethtool cmd */ + u8 autoneg; +#define BNXT_AUTONEG_SPEED 1 +#define BNXT_AUTONEG_FLOW_CTRL 2 + u8 req_duplex; + u8 req_flow_ctrl; + u16 req_link_speed; + u16 advertising; /* user adv setting */ + bool force_link_chng; + + /* a copy of phy_qcfg output used to report link + * info to VF + */ + struct hwrm_port_phy_qcfg_output phy_qcfg_resp; +}; + +#define BNXT_MAX_QUEUE 8 + +struct bnxt_queue_info { + u8 queue_id; + u8 queue_profile; +}; + +#define BNXT_MAX_LED 4 + +struct bnxt_led_info { + u8 led_id; + u8 led_type; + u8 led_group_id; + u8 unused; + __le16 led_state_caps; +#define BNXT_LED_ALT_BLINK_CAP(x) ((x) & \ + cpu_to_le16(PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_ALT_SUPPORTED)) + + __le16 led_color_caps; +}; + +#define BNXT_MAX_TEST 8 + +struct bnxt_test_info { + u8 offline_mask; + u16 timeout; + char string[BNXT_MAX_TEST][ETH_GSTRING_LEN]; +}; + +#define BNXT_GRCPF_REG_WINDOW_BASE_OUT 0x400 +#define BNXT_CAG_REG_LEGACY_INT_STATUS 0x4014 +#define BNXT_CAG_REG_BASE 0x300000 + +struct bnxt_tc_flow_stats { + u64 packets; + u64 bytes; +}; + +struct bnxt_tc_info { + bool enabled; + + /* hash table to store TC offloaded flows */ + struct rhashtable flow_table; + struct rhashtable_params flow_ht_params; + + /* hash table to store L2 keys of TC flows */ + struct rhashtable l2_table; + struct rhashtable_params l2_ht_params; + /* hash table to store L2 keys for TC tunnel decap */ + struct rhashtable decap_l2_table; + struct rhashtable_params decap_l2_ht_params; + /* hash table to store tunnel decap entries */ + struct rhashtable decap_table; + struct rhashtable_params decap_ht_params; + /* hash table to store tunnel encap entries */ + struct rhashtable encap_table; + struct rhashtable_params encap_ht_params; + + /* lock to atomically add/del an l2 node when a flow is + * added or deleted. 
+ */ + struct mutex lock; + + /* Fields used for batching stats query */ + struct rhashtable_iter iter; +#define BNXT_FLOW_STATS_BATCH_MAX 10 + struct bnxt_tc_stats_batch { + void *flow_node; + struct bnxt_tc_flow_stats hw_stats; + } stats_batch[BNXT_FLOW_STATS_BATCH_MAX]; + + /* Stat counter mask (width) */ + u64 bytes_mask; + u64 packets_mask; +}; + +struct bnxt_vf_rep_stats { + u64 packets; + u64 bytes; + u64 dropped; +}; + +struct bnxt_vf_rep { + struct bnxt *bp; + struct net_device *dev; + struct metadata_dst *dst; + u16 vf_idx; + u16 tx_cfa_action; + u16 rx_cfa_code; + + struct bnxt_vf_rep_stats rx_stats; + struct bnxt_vf_rep_stats tx_stats; +}; + +struct bnxt { + void __iomem *bar0; + void __iomem *bar1; + void __iomem *bar2; + + u32 reg_base; + u16 chip_num; +#define CHIP_NUM_57301 0x16c8 +#define CHIP_NUM_57302 0x16c9 +#define CHIP_NUM_57304 0x16ca +#define CHIP_NUM_58700 0x16cd +#define CHIP_NUM_57402 0x16d0 +#define CHIP_NUM_57404 0x16d1 +#define CHIP_NUM_57406 0x16d2 +#define CHIP_NUM_57407 0x16d5 + +#define CHIP_NUM_57311 0x16ce +#define CHIP_NUM_57312 0x16cf +#define CHIP_NUM_57314 0x16df +#define CHIP_NUM_57317 0x16e0 +#define CHIP_NUM_57412 0x16d6 +#define CHIP_NUM_57414 0x16d7 +#define CHIP_NUM_57416 0x16d8 +#define CHIP_NUM_57417 0x16d9 +#define CHIP_NUM_57412L 0x16da +#define CHIP_NUM_57414L 0x16db + +#define CHIP_NUM_5745X 0xd730 + +#define CHIP_NUM_58802 0xd802 +#define CHIP_NUM_58804 0xd804 +#define CHIP_NUM_58808 0xd808 + +#define BNXT_CHIP_NUM_5730X(chip_num) \ + ((chip_num) >= CHIP_NUM_57301 && \ + (chip_num) <= CHIP_NUM_57304) + +#define BNXT_CHIP_NUM_5740X(chip_num) \ + (((chip_num) >= CHIP_NUM_57402 && \ + (chip_num) <= CHIP_NUM_57406) || \ + (chip_num) == CHIP_NUM_57407) + +#define BNXT_CHIP_NUM_5731X(chip_num) \ + ((chip_num) == CHIP_NUM_57311 || \ + (chip_num) == CHIP_NUM_57312 || \ + (chip_num) == CHIP_NUM_57314 || \ + (chip_num) == CHIP_NUM_57317) + +#define BNXT_CHIP_NUM_5741X(chip_num) \ + ((chip_num) >= CHIP_NUM_57412 && \ + (chip_num) <= CHIP_NUM_57414L) + +#define BNXT_CHIP_NUM_58700(chip_num) \ + ((chip_num) == CHIP_NUM_58700) + +#define BNXT_CHIP_NUM_5745X(chip_num) \ + ((chip_num) == CHIP_NUM_5745X) + +#define BNXT_CHIP_NUM_57X0X(chip_num) \ + (BNXT_CHIP_NUM_5730X(chip_num) || BNXT_CHIP_NUM_5740X(chip_num)) + +#define BNXT_CHIP_NUM_57X1X(chip_num) \ + (BNXT_CHIP_NUM_5731X(chip_num) || BNXT_CHIP_NUM_5741X(chip_num)) + +#define BNXT_CHIP_NUM_588XX(chip_num) \ + ((chip_num) == CHIP_NUM_58802 || \ + (chip_num) == CHIP_NUM_58804 || \ + (chip_num) == CHIP_NUM_58808) + + struct net_device *dev; + struct pci_dev *pdev; + + atomic_t intr_sem; + + u32 flags; + #define BNXT_FLAG_DCB_ENABLED 0x1 + #define BNXT_FLAG_VF 0x2 + #define BNXT_FLAG_LRO 0x4 +#ifdef CONFIG_INET + #define BNXT_FLAG_GRO 0x8 +#else + /* Cannot support hardware GRO if CONFIG_INET is not set */ + #define BNXT_FLAG_GRO 0x0 +#endif + #define BNXT_FLAG_TPA (BNXT_FLAG_LRO | BNXT_FLAG_GRO) + #define BNXT_FLAG_JUMBO 0x10 + #define BNXT_FLAG_STRIP_VLAN 0x20 + #define BNXT_FLAG_AGG_RINGS (BNXT_FLAG_JUMBO | BNXT_FLAG_GRO | \ + BNXT_FLAG_LRO) + #define BNXT_FLAG_USING_MSIX 0x40 + #define BNXT_FLAG_MSIX_CAP 0x80 + #define BNXT_FLAG_RFS 0x100 + #define BNXT_FLAG_SHARED_RINGS 0x200 + #define BNXT_FLAG_PORT_STATS 0x400 + #define BNXT_FLAG_UDP_RSS_CAP 0x800 + #define BNXT_FLAG_EEE_CAP 0x1000 + #define BNXT_FLAG_NEW_RSS_CAP 0x2000 + #define BNXT_FLAG_WOL_CAP 0x4000 + #define BNXT_FLAG_ROCEV1_CAP 0x8000 + #define BNXT_FLAG_ROCEV2_CAP 0x10000 + #define BNXT_FLAG_ROCE_CAP (BNXT_FLAG_ROCEV1_CAP | \ + 
BNXT_FLAG_ROCEV2_CAP) + #define BNXT_FLAG_NO_AGG_RINGS 0x20000 + #define BNXT_FLAG_RX_PAGE_MODE 0x40000 + #define BNXT_FLAG_FW_LLDP_AGENT 0x80000 + #define BNXT_FLAG_MULTI_HOST 0x100000 + #define BNXT_FLAG_SHORT_CMD 0x200000 + #define BNXT_FLAG_DOUBLE_DB 0x400000 + #define BNXT_FLAG_FW_DCBX_AGENT 0x800000 + #define BNXT_FLAG_CHIP_NITRO_A0 0x1000000 + #define BNXT_FLAG_DIM 0x2000000 + #define BNXT_FLAG_ROCE_MIRROR_CAP 0x4000000 + #define BNXT_FLAG_NEW_RM 0x8000000 + #define BNXT_FLAG_PORT_STATS_EXT 0x10000000 + + #define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA | \ + BNXT_FLAG_RFS | \ + BNXT_FLAG_STRIP_VLAN) + +#define BNXT_PF(bp) (!((bp)->flags & BNXT_FLAG_VF)) +#define BNXT_VF(bp) ((bp)->flags & BNXT_FLAG_VF) +#define BNXT_NPAR(bp) ((bp)->port_partition_type) +#define BNXT_MH(bp) ((bp)->flags & BNXT_FLAG_MULTI_HOST) +#define BNXT_SINGLE_PF(bp) (BNXT_PF(bp) && !BNXT_NPAR(bp) && !BNXT_MH(bp)) +#define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0) +#define BNXT_RX_PAGE_MODE(bp) ((bp)->flags & BNXT_FLAG_RX_PAGE_MODE) + +/* Chip class phase 4 and later */ +#define BNXT_CHIP_P4_PLUS(bp) \ + (BNXT_CHIP_NUM_57X1X((bp)->chip_num) || \ + BNXT_CHIP_NUM_5745X((bp)->chip_num) || \ + BNXT_CHIP_NUM_588XX((bp)->chip_num) || \ + (BNXT_CHIP_NUM_58700((bp)->chip_num) && \ + !BNXT_CHIP_TYPE_NITRO_A0(bp))) + + struct bnxt_en_dev *edev; + struct bnxt_en_dev * (*ulp_probe)(struct net_device *); + + struct bnxt_napi **bnapi; + + struct bnxt_rx_ring_info *rx_ring; + struct bnxt_tx_ring_info *tx_ring; + u16 *tx_ring_map; + + struct sk_buff * (*gro_func)(struct bnxt_tpa_info *, int, int, + struct sk_buff *); + + struct sk_buff * (*rx_skb_func)(struct bnxt *, + struct bnxt_rx_ring_info *, + u16, void *, u8 *, dma_addr_t, + unsigned int); + + u32 rx_buf_size; + u32 rx_buf_use_size; /* useable size */ + u16 rx_offset; + u16 rx_dma_offset; + enum dma_data_direction rx_dir; + u32 rx_ring_size; + u32 rx_agg_ring_size; + u32 rx_copy_thresh; + u32 rx_ring_mask; + u32 rx_agg_ring_mask; + int rx_nr_pages; + int rx_agg_nr_pages; + int rx_nr_rings; + int rsscos_nr_ctxs; + + u32 tx_ring_size; + u32 tx_ring_mask; + int tx_nr_pages; + int tx_nr_rings; + int tx_nr_rings_per_tc; + int tx_nr_rings_xdp; + + int tx_wake_thresh; + int tx_push_thresh; + int tx_push_size; + + u32 cp_ring_size; + u32 cp_ring_mask; + u32 cp_bit; + int cp_nr_pages; + int cp_nr_rings; + + int num_stat_ctxs; + + /* grp_info indexed by completion ring index */ + struct bnxt_ring_grp_info *grp_info; + struct bnxt_vnic_info *vnic_info; + int nr_vnics; + u32 rss_hash_cfg; + + u16 max_mtu; + u8 max_tc; + u8 max_lltc; /* lossless TCs */ + struct bnxt_queue_info q_info[BNXT_MAX_QUEUE]; + + unsigned int current_interval; +#define BNXT_TIMER_INTERVAL HZ + + struct timer_list timer; + + unsigned long state; +#define BNXT_STATE_OPEN 0 +#define BNXT_STATE_IN_SP_TASK 1 +#define BNXT_STATE_READ_STATS 2 + + struct bnxt_irq *irq_tbl; + int total_irqs; + u8 mac_addr[ETH_ALEN]; + +#ifdef CONFIG_BNXT_DCB + struct ieee_pfc *ieee_pfc; + struct ieee_ets *ieee_ets; + u8 dcbx_cap; + u8 default_pri; +#endif /* CONFIG_BNXT_DCB */ + + u32 msg_enable; + + u32 hwrm_spec_code; + u16 hwrm_cmd_seq; + u32 hwrm_intr_seq_id; + void *hwrm_short_cmd_req_addr; + dma_addr_t hwrm_short_cmd_req_dma_addr; + void *hwrm_cmd_resp_addr; + dma_addr_t hwrm_cmd_resp_dma_addr; + void *hwrm_dbg_resp_addr; + dma_addr_t hwrm_dbg_resp_dma_addr; +#define HWRM_DBG_REG_BUF_SIZE 128 + + struct rx_port_stats *hw_rx_port_stats; + struct tx_port_stats *hw_tx_port_stats; + struct rx_port_stats_ext 
*hw_rx_port_stats_ext; + dma_addr_t hw_rx_port_stats_map; + dma_addr_t hw_tx_port_stats_map; + dma_addr_t hw_rx_port_stats_ext_map; + int hw_port_stats_size; + + u16 hwrm_max_req_len; + int hwrm_cmd_timeout; + struct mutex hwrm_cmd_lock; /* serialize hwrm messages */ + struct hwrm_ver_get_output ver_resp; +#define FW_VER_STR_LEN 32 +#define BC_HWRM_STR_LEN 21 +#define PHY_VER_STR_LEN (FW_VER_STR_LEN - BC_HWRM_STR_LEN) + char fw_ver_str[FW_VER_STR_LEN]; + __be16 vxlan_port; + u8 vxlan_port_cnt; + __le16 vxlan_fw_dst_port_id; + __be16 nge_port; + u8 nge_port_cnt; + __le16 nge_fw_dst_port_id; + u8 port_partition_type; + u8 port_count; + u16 br_mode; + + struct bnxt_coal rx_coal; + struct bnxt_coal tx_coal; + +#define BNXT_USEC_TO_COAL_TIMER(x) ((x) * 25 / 2) + + u32 stats_coal_ticks; +#define BNXT_DEF_STATS_COAL_TICKS 1000000 +#define BNXT_MIN_STATS_COAL_TICKS 250000 +#define BNXT_MAX_STATS_COAL_TICKS 1000000 + + struct work_struct sp_task; + unsigned long sp_event; +#define BNXT_RX_MASK_SP_EVENT 0 +#define BNXT_RX_NTP_FLTR_SP_EVENT 1 +#define BNXT_LINK_CHNG_SP_EVENT 2 +#define BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT 3 +#define BNXT_VXLAN_ADD_PORT_SP_EVENT 4 +#define BNXT_VXLAN_DEL_PORT_SP_EVENT 5 +#define BNXT_RESET_TASK_SP_EVENT 6 +#define BNXT_RST_RING_SP_EVENT 7 +#define BNXT_HWRM_PF_UNLOAD_SP_EVENT 8 +#define BNXT_PERIODIC_STATS_SP_EVENT 9 +#define BNXT_HWRM_PORT_MODULE_SP_EVENT 10 +#define BNXT_RESET_TASK_SILENT_SP_EVENT 11 +#define BNXT_GENEVE_ADD_PORT_SP_EVENT 12 +#define BNXT_GENEVE_DEL_PORT_SP_EVENT 13 +#define BNXT_LINK_SPEED_CHNG_SP_EVENT 14 +#define BNXT_FLOW_STATS_SP_EVENT 15 + + struct bnxt_hw_resc hw_resc; + struct bnxt_pf_info pf; +#ifdef CONFIG_BNXT_SRIOV + int nr_vfs; + struct bnxt_vf_info vf; + wait_queue_head_t sriov_cfg_wait; + bool sriov_cfg; +#define BNXT_SRIOV_CFG_WAIT_TMO msecs_to_jiffies(10000) + + /* lock to protect VF-rep creation/cleanup via + * multiple paths such as ->sriov_configure() and + * devlink ->eswitch_mode_set() + */ + struct mutex sriov_lock; +#endif + +#define BNXT_NTP_FLTR_MAX_FLTR 4096 +#define BNXT_NTP_FLTR_HASH_SIZE 512 +#define BNXT_NTP_FLTR_HASH_MASK (BNXT_NTP_FLTR_HASH_SIZE - 1) + struct hlist_head ntp_fltr_hash_tbl[BNXT_NTP_FLTR_HASH_SIZE]; + spinlock_t ntp_fltr_lock; /* for hash table add, del */ + + unsigned long *ntp_fltr_bmap; + int ntp_fltr_count; + + /* To protect link related settings during link changes and + * ethtool settings changes. 
+ */ + struct mutex link_lock; + struct bnxt_link_info link_info; + struct ethtool_eee eee; + u32 lpi_tmr_lo; + u32 lpi_tmr_hi; + + u8 num_tests; + struct bnxt_test_info *test_info; + + u8 wol_filter_id; + u8 wol; + + u8 num_leds; + struct bnxt_led_info leds[BNXT_MAX_LED]; + + struct bpf_prog *xdp_prog; + + /* devlink interface and vf-rep structs */ + struct devlink *dl; + enum devlink_eswitch_mode eswitch_mode; + struct bnxt_vf_rep **vf_reps; /* array of vf-rep ptrs */ + u16 *cfa_code_map; /* cfa_code -> vf_idx map */ + u8 switch_id[8]; + struct bnxt_tc_info *tc_info; +}; + +#define BNXT_RX_STATS_OFFSET(counter) \ + (offsetof(struct rx_port_stats, counter) / 8) + +#define BNXT_TX_STATS_OFFSET(counter) \ + ((offsetof(struct tx_port_stats, counter) + \ + sizeof(struct rx_port_stats) + 512) / 8) + +#define BNXT_RX_STATS_EXT_OFFSET(counter) \ + (offsetof(struct rx_port_stats_ext, counter) / 8) + +#define I2C_DEV_ADDR_A0 0xa0 +#define I2C_DEV_ADDR_A2 0xa2 +#define SFP_EEPROM_SFF_8472_COMP_ADDR 0x5e +#define SFP_EEPROM_SFF_8472_COMP_SIZE 1 +#define SFF_MODULE_ID_SFP 0x3 +#define SFF_MODULE_ID_QSFP 0xc +#define SFF_MODULE_ID_QSFP_PLUS 0xd +#define SFF_MODULE_ID_QSFP28 0x11 +#define BNXT_MAX_PHY_I2C_RESP_SIZE 64 + +static inline u32 bnxt_tx_avail(struct bnxt *bp, struct bnxt_tx_ring_info *txr) +{ + /* Tell compiler to fetch tx indices from memory. */ + barrier(); + + return bp->tx_ring_size - + ((txr->tx_prod - txr->tx_cons) & bp->tx_ring_mask); +} + +/* For TX and RX ring doorbells with no ordering guarantee*/ +static inline void bnxt_db_write_relaxed(struct bnxt *bp, void __iomem *db, + u32 val) +{ + writel_relaxed(val, db); + if (bp->flags & BNXT_FLAG_DOUBLE_DB) + writel_relaxed(val, db); +} + +/* For TX and RX ring doorbells */ +static inline void bnxt_db_write(struct bnxt *bp, void __iomem *db, u32 val) +{ + writel(val, db); + if (bp->flags & BNXT_FLAG_DOUBLE_DB) + writel(val, db); +} + +extern const u16 bnxt_lhint_arr[]; + +int bnxt_alloc_rx_data(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, + u16 prod, gfp_t gfp); +void bnxt_reuse_rx_data(struct bnxt_rx_ring_info *rxr, u16 cons, void *data); +void bnxt_set_tpa_flags(struct bnxt *bp); +void bnxt_set_ring_params(struct bnxt *); +int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode); +void bnxt_hwrm_cmd_hdr_init(struct bnxt *, void *, u16, u16, u16); +int _hwrm_send_message(struct bnxt *, void *, u32, int); +int _hwrm_send_message_silent(struct bnxt *bp, void *msg, u32 len, int timeout); +int hwrm_send_message(struct bnxt *, void *, u32, int); +int hwrm_send_message_silent(struct bnxt *, void *, u32, int); +int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap, + int bmap_size); +int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id); +int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings); +int bnxt_hwrm_set_coal(struct bnxt *); +unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp); +void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max); +unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp); +void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max); +unsigned int bnxt_get_max_func_irqs(struct bnxt *bp); +void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max); +int bnxt_get_avail_msix(struct bnxt *bp, int num); +int bnxt_reserve_rings(struct bnxt *bp); +void bnxt_tx_disable(struct bnxt *bp); +void bnxt_tx_enable(struct bnxt *bp); +int bnxt_hwrm_set_pause(struct bnxt *); +int bnxt_hwrm_set_link_setting(struct bnxt *, bool, bool); +int 
bnxt_hwrm_alloc_wol_fltr(struct bnxt *bp); +int bnxt_hwrm_free_wol_fltr(struct bnxt *bp); +int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp, bool all); +int bnxt_hwrm_fw_set_time(struct bnxt *); +int bnxt_open_nic(struct bnxt *, bool, bool); +int bnxt_half_open_nic(struct bnxt *bp); +void bnxt_half_close_nic(struct bnxt *bp); +int bnxt_close_nic(struct bnxt *, bool, bool); +int bnxt_check_rings(struct bnxt *bp, int tx, int rx, bool sh, int tcs, + int tx_xdp); +int bnxt_setup_mq_tc(struct net_device *dev, u8 tc); +int bnxt_get_max_rings(struct bnxt *, int *, int *, bool); +int bnxt_restore_pf_fw_resources(struct bnxt *bp); +int bnxt_port_attr_get(struct bnxt *bp, struct switchdev_attr *attr); +void bnxt_dim_work(struct work_struct *work); +int bnxt_hwrm_set_ring_coal(struct bnxt *bp, struct bnxt_napi *bnapi); + +#endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c new file mode 100644 index 0000000..3c746f2 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c @@ -0,0 +1,633 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * Copyright (c) 2016-2017 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "bnxt_hsi.h" +#include "bnxt.h" +#include "bnxt_dcb.h" + +#ifdef CONFIG_BNXT_DCB +static int bnxt_hwrm_queue_pri2cos_cfg(struct bnxt *bp, struct ieee_ets *ets) +{ + struct hwrm_queue_pri2cos_cfg_input req = {0}; + int rc = 0, i; + u8 *pri2cos; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_PRI2COS_CFG, -1, -1); + req.flags = cpu_to_le32(QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_BIDIR | + QUEUE_PRI2COS_CFG_REQ_FLAGS_IVLAN); + + pri2cos = &req.pri0_cos_queue_id; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + req.enables |= cpu_to_le32( + QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI0_COS_QUEUE_ID << i); + + pri2cos[i] = bp->q_info[ets->prio_tc[i]].queue_id; + } + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + return rc; +} + +static int bnxt_hwrm_queue_pri2cos_qcfg(struct bnxt *bp, struct ieee_ets *ets) +{ + struct hwrm_queue_pri2cos_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_queue_pri2cos_qcfg_input req = {0}; + int rc = 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_PRI2COS_QCFG, -1, -1); + req.flags = cpu_to_le32(QUEUE_PRI2COS_QCFG_REQ_FLAGS_IVLAN); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + u8 *pri2cos = &resp->pri0_cos_queue_id; + int i, j; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + u8 queue_id = pri2cos[i]; + + for (j = 0; j < bp->max_tc; j++) { + if (bp->q_info[j].queue_id == queue_id) { + ets->prio_tc[i] = j; + break; + } + } + } + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_queue_cos2bw_cfg(struct bnxt *bp, struct ieee_ets *ets, + u8 max_tc) +{ + struct hwrm_queue_cos2bw_cfg_input req = {0}; + struct bnxt_cos2bw_cfg cos2bw; + int rc = 0, i; + void *data; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_COS2BW_CFG, -1, -1); + data = &req.unused_0; + for (i = 0; i < max_tc; i++, data += sizeof(cos2bw) - 4) { + req.enables |= cpu_to_le32( + QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID0_VALID << i); + + memset(&cos2bw, 0, sizeof(cos2bw)); + cos2bw.queue_id = bp->q_info[i].queue_id; + if 
(ets->tc_tsa[i] == IEEE_8021QAZ_TSA_STRICT) { + cos2bw.tsa = + QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_SP; + cos2bw.pri_lvl = i; + } else { + cos2bw.tsa = + QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_ETS; + cos2bw.bw_weight = ets->tc_tx_bw[i]; + /* older firmware requires min_bw to be set to the + * same weight value in percent. + */ + cos2bw.min_bw = + cpu_to_le32((ets->tc_tx_bw[i] * 100) | + BW_VALUE_UNIT_PERCENT1_100); + } + memcpy(data, &cos2bw.queue_id, sizeof(cos2bw) - 4); + if (i == 0) { + req.queue_id0 = cos2bw.queue_id; + req.unused_0 = 0; + } + } + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + return rc; +} + +static int bnxt_hwrm_queue_cos2bw_qcfg(struct bnxt *bp, struct ieee_ets *ets) +{ + struct hwrm_queue_cos2bw_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_queue_cos2bw_qcfg_input req = {0}; + struct bnxt_cos2bw_cfg cos2bw; + void *data; + int rc, i; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_COS2BW_QCFG, -1, -1); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) { + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; + } + + data = &resp->queue_id0 + offsetof(struct bnxt_cos2bw_cfg, queue_id); + for (i = 0; i < bp->max_tc; i++, data += sizeof(cos2bw) - 4) { + int j; + + memcpy(&cos2bw.queue_id, data, sizeof(cos2bw) - 4); + if (i == 0) + cos2bw.queue_id = resp->queue_id0; + + for (j = 0; j < bp->max_tc; j++) { + if (bp->q_info[j].queue_id != cos2bw.queue_id) + continue; + if (cos2bw.tsa == + QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_SP) { + ets->tc_tsa[j] = IEEE_8021QAZ_TSA_STRICT; + } else { + ets->tc_tsa[j] = IEEE_8021QAZ_TSA_ETS; + ets->tc_tx_bw[j] = cos2bw.bw_weight; + } + } + } + mutex_unlock(&bp->hwrm_cmd_lock); + return 0; +} + +static int bnxt_hwrm_queue_cfg(struct bnxt *bp, unsigned int lltc_mask) +{ + struct hwrm_queue_cfg_input req = {0}; + int i; + + if (netif_running(bp->dev)) + bnxt_tx_disable(bp); + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_CFG, -1, -1); + req.flags = cpu_to_le32(QUEUE_CFG_REQ_FLAGS_PATH_BIDIR); + req.enables = cpu_to_le32(QUEUE_CFG_REQ_ENABLES_SERVICE_PROFILE); + + /* Configure lossless queues to lossy first */ + req.service_profile = QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSY; + for (i = 0; i < bp->max_tc; i++) { + if (BNXT_LLQ(bp->q_info[i].queue_profile)) { + req.queue_id = cpu_to_le32(bp->q_info[i].queue_id); + hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + bp->q_info[i].queue_profile = + QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSY; + } + } + + /* Now configure desired queues to lossless */ + req.service_profile = QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSLESS; + for (i = 0; i < bp->max_tc; i++) { + if (lltc_mask & (1 << i)) { + req.queue_id = cpu_to_le32(bp->q_info[i].queue_id); + hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + bp->q_info[i].queue_profile = + QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSLESS; + } + } + if (netif_running(bp->dev)) + bnxt_tx_enable(bp); + + return 0; +} + +static int bnxt_hwrm_queue_pfc_cfg(struct bnxt *bp, struct ieee_pfc *pfc) +{ + struct hwrm_queue_pfcenable_cfg_input req = {0}; + struct ieee_ets *my_ets = bp->ieee_ets; + unsigned int tc_mask = 0, pri_mask = 0; + u8 i, pri, lltc_count = 0; + bool need_q_recfg = false; + int rc; + + if (!my_ets) + return -EINVAL; + + for (i = 0; i < bp->max_tc; i++) { + for (pri = 0; pri < IEEE_8021QAZ_MAX_TCS; pri++) { + if ((pfc->pfc_en & (1 << pri)) && + (my_ets->prio_tc[pri] == i)) { + pri_mask |= 1 << pri; + tc_mask |= 1 << i; + } + } + if (tc_mask & (1 << i)) + 
lltc_count++; + } + if (lltc_count > bp->max_lltc) + return -EINVAL; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_PFCENABLE_CFG, -1, -1); + req.flags = cpu_to_le32(pri_mask); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + return rc; + + for (i = 0; i < bp->max_tc; i++) { + if (tc_mask & (1 << i)) { + if (!BNXT_LLQ(bp->q_info[i].queue_profile)) + need_q_recfg = true; + } + } + + if (need_q_recfg) + rc = bnxt_hwrm_queue_cfg(bp, tc_mask); + + return rc; +} + +static int bnxt_hwrm_queue_pfc_qcfg(struct bnxt *bp, struct ieee_pfc *pfc) +{ + struct hwrm_queue_pfcenable_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_queue_pfcenable_qcfg_input req = {0}; + u8 pri_mask; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_PFCENABLE_QCFG, -1, -1); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) { + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; + } + + pri_mask = le32_to_cpu(resp->flags); + pfc->pfc_en = pri_mask; + mutex_unlock(&bp->hwrm_cmd_lock); + return 0; +} + +static int bnxt_hwrm_set_dcbx_app(struct bnxt *bp, struct dcb_app *app, + bool add) +{ + struct hwrm_fw_set_structured_data_input set = {0}; + struct hwrm_fw_get_structured_data_input get = {0}; + struct hwrm_struct_data_dcbx_app *fw_app; + struct hwrm_struct_hdr *data; + dma_addr_t mapping; + size_t data_len; + int rc, n, i; + + if (bp->hwrm_spec_code < 0x10601) + return 0; + + n = IEEE_8021QAZ_MAX_TCS; + data_len = sizeof(*data) + sizeof(*fw_app) * n; + data = dma_zalloc_coherent(&bp->pdev->dev, data_len, &mapping, + GFP_KERNEL); + if (!data) + return -ENOMEM; + + bnxt_hwrm_cmd_hdr_init(bp, &get, HWRM_FW_GET_STRUCTURED_DATA, -1, -1); + get.dest_data_addr = cpu_to_le64(mapping); + get.structure_id = cpu_to_le16(STRUCT_HDR_STRUCT_ID_DCBX_APP); + get.subtype = cpu_to_le16(HWRM_STRUCT_DATA_SUBTYPE_HOST_OPERATIONAL); + get.count = 0; + rc = hwrm_send_message(bp, &get, sizeof(get), HWRM_CMD_TIMEOUT); + if (rc) + goto set_app_exit; + + fw_app = (struct hwrm_struct_data_dcbx_app *)(data + 1); + + if (data->struct_id != cpu_to_le16(STRUCT_HDR_STRUCT_ID_DCBX_APP)) { + rc = -ENODEV; + goto set_app_exit; + } + + n = data->count; + for (i = 0; i < n; i++, fw_app++) { + if (fw_app->protocol_id == cpu_to_be16(app->protocol) && + fw_app->protocol_selector == app->selector && + fw_app->priority == app->priority) { + if (add) + goto set_app_exit; + else + break; + } + } + if (add) { + /* append */ + n++; + fw_app->protocol_id = cpu_to_be16(app->protocol); + fw_app->protocol_selector = app->selector; + fw_app->priority = app->priority; + fw_app->valid = 1; + } else { + size_t len = 0; + + /* not found, nothing to delete */ + if (n == i) + goto set_app_exit; + + len = (n - 1 - i) * sizeof(*fw_app); + if (len) + memmove(fw_app, fw_app + 1, len); + n--; + memset(fw_app + n, 0, sizeof(*fw_app)); + } + data->count = n; + data->len = cpu_to_le16(sizeof(*fw_app) * n); + data->subtype = cpu_to_le16(HWRM_STRUCT_DATA_SUBTYPE_HOST_OPERATIONAL); + + bnxt_hwrm_cmd_hdr_init(bp, &set, HWRM_FW_SET_STRUCTURED_DATA, -1, -1); + set.src_data_addr = cpu_to_le64(mapping); + set.data_len = cpu_to_le16(sizeof(*data) + sizeof(*fw_app) * n); + set.hdr_cnt = 1; + rc = hwrm_send_message(bp, &set, sizeof(set), HWRM_CMD_TIMEOUT); + if (rc) + rc = -EIO; + +set_app_exit: + dma_free_coherent(&bp->pdev->dev, data_len, data, mapping); + return rc; +} + +static int bnxt_ets_validate(struct bnxt *bp, struct ieee_ets *ets, u8 *tc) +{ + int total_ets_bw = 0; + u8 max_tc = 0; + 
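As an aside on the mask building in bnxt_hwrm_queue_pfc_cfg() above: a TC becomes lossless when any PFC-enabled priority maps to it, and the number of such TCs is then checked against bp->max_lltc. The stand-alone sketch below reproduces only that arithmetic; the sample prio_tc mapping, pfc_en bitmap and limits are made up, and the sketch is not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

#define MAX_PRIS 8	/* stands in for IEEE_8021QAZ_MAX_TCS */

int main(void)
{
	/* Hypothetical sample data: priority -> TC mapping and PFC enable bitmap */
	unsigned char prio_tc[MAX_PRIS] = { 0, 0, 1, 1, 2, 2, 3, 3 };
	unsigned char pfc_en = 0x30;	/* PFC requested on priorities 4 and 5 */
	unsigned int max_tc = 4, max_lltc = 2;
	unsigned int tc_mask = 0, pri_mask = 0, lltc_count = 0;
	unsigned int i, pri;

	/* Same double loop as bnxt_hwrm_queue_pfc_cfg(): a TC becomes lossless
	 * if any PFC-enabled priority maps to it.
	 */
	for (i = 0; i < max_tc; i++) {
		for (pri = 0; pri < MAX_PRIS; pri++) {
			if ((pfc_en & (1 << pri)) && prio_tc[pri] == i) {
				pri_mask |= 1 << pri;
				tc_mask |= 1 << i;
			}
		}
		if (tc_mask & (1 << i))
			lltc_count++;
	}

	printf("pri_mask=0x%x tc_mask=0x%x lltc_count=%u (limit %u)\n",
	       pri_mask, tc_mask, lltc_count, max_lltc);
	/* Priorities 4 and 5 both map to TC 2, so only one lossless TC is needed. */
	return 0;
}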
int i; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + if (ets->prio_tc[i] > bp->max_tc) { + netdev_err(bp->dev, "priority to TC mapping exceeds TC count %d\n", + ets->prio_tc[i]); + return -EINVAL; + } + if (ets->prio_tc[i] > max_tc) + max_tc = ets->prio_tc[i]; + + if ((ets->tc_tx_bw[i] || ets->tc_tsa[i]) && i > bp->max_tc) + return -EINVAL; + + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_STRICT: + break; + case IEEE_8021QAZ_TSA_ETS: + total_ets_bw += ets->tc_tx_bw[i]; + break; + default: + return -ENOTSUPP; + } + } + if (total_ets_bw > 100) + return -EINVAL; + + *tc = max_tc + 1; + return 0; +} + +static int bnxt_dcbnl_ieee_getets(struct net_device *dev, struct ieee_ets *ets) +{ + struct bnxt *bp = netdev_priv(dev); + struct ieee_ets *my_ets = bp->ieee_ets; + + ets->ets_cap = bp->max_tc; + + if (!my_ets) { + int rc; + + if (bp->dcbx_cap & DCB_CAP_DCBX_HOST) + return 0; + + my_ets = kzalloc(sizeof(*my_ets), GFP_KERNEL); + if (!my_ets) + return 0; + rc = bnxt_hwrm_queue_cos2bw_qcfg(bp, my_ets); + if (rc) + return 0; + rc = bnxt_hwrm_queue_pri2cos_qcfg(bp, my_ets); + if (rc) + return 0; + } + + ets->cbs = my_ets->cbs; + memcpy(ets->tc_tx_bw, my_ets->tc_tx_bw, sizeof(ets->tc_tx_bw)); + memcpy(ets->tc_rx_bw, my_ets->tc_rx_bw, sizeof(ets->tc_rx_bw)); + memcpy(ets->tc_tsa, my_ets->tc_tsa, sizeof(ets->tc_tsa)); + memcpy(ets->prio_tc, my_ets->prio_tc, sizeof(ets->prio_tc)); + return 0; +} + +static int bnxt_dcbnl_ieee_setets(struct net_device *dev, struct ieee_ets *ets) +{ + struct bnxt *bp = netdev_priv(dev); + struct ieee_ets *my_ets = bp->ieee_ets; + u8 max_tc = 0; + int rc, i; + + if (!(bp->dcbx_cap & DCB_CAP_DCBX_VER_IEEE) || + !(bp->dcbx_cap & DCB_CAP_DCBX_HOST)) + return -EINVAL; + + rc = bnxt_ets_validate(bp, ets, &max_tc); + if (!rc) { + if (!my_ets) { + my_ets = kzalloc(sizeof(*my_ets), GFP_KERNEL); + if (!my_ets) + return -ENOMEM; + /* initialize PRI2TC mappings to invalid value */ + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + my_ets->prio_tc[i] = IEEE_8021QAZ_MAX_TCS; + bp->ieee_ets = my_ets; + } + rc = bnxt_setup_mq_tc(dev, max_tc); + if (rc) + return rc; + rc = bnxt_hwrm_queue_cos2bw_cfg(bp, ets, max_tc); + if (rc) + return rc; + rc = bnxt_hwrm_queue_pri2cos_cfg(bp, ets); + if (rc) + return rc; + memcpy(my_ets, ets, sizeof(*my_ets)); + } + return rc; +} + +static int bnxt_dcbnl_ieee_getpfc(struct net_device *dev, struct ieee_pfc *pfc) +{ + struct bnxt *bp = netdev_priv(dev); + __le64 *stats = (__le64 *)bp->hw_rx_port_stats; + struct ieee_pfc *my_pfc = bp->ieee_pfc; + long rx_off, tx_off; + int i, rc; + + pfc->pfc_cap = bp->max_lltc; + + if (!my_pfc) { + if (bp->dcbx_cap & DCB_CAP_DCBX_HOST) + return 0; + + my_pfc = kzalloc(sizeof(*my_pfc), GFP_KERNEL); + if (!my_pfc) + return 0; + bp->ieee_pfc = my_pfc; + rc = bnxt_hwrm_queue_pfc_qcfg(bp, my_pfc); + if (rc) + return 0; + } + + pfc->pfc_en = my_pfc->pfc_en; + pfc->mbc = my_pfc->mbc; + pfc->delay = my_pfc->delay; + + if (!stats) + return 0; + + rx_off = BNXT_RX_STATS_OFFSET(rx_pfc_ena_frames_pri0); + tx_off = BNXT_TX_STATS_OFFSET(tx_pfc_ena_frames_pri0); + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++, rx_off++, tx_off++) { + pfc->requests[i] = le64_to_cpu(*(stats + tx_off)); + pfc->indications[i] = le64_to_cpu(*(stats + rx_off)); + } + + return 0; +} + +static int bnxt_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) +{ + struct bnxt *bp = netdev_priv(dev); + struct ieee_pfc *my_pfc = bp->ieee_pfc; + int rc; + + if (!(bp->dcbx_cap & DCB_CAP_DCBX_VER_IEEE) || + !(bp->dcbx_cap & DCB_CAP_DCBX_HOST)) + return 
-EINVAL; + + if (!my_pfc) { + my_pfc = kzalloc(sizeof(*my_pfc), GFP_KERNEL); + if (!my_pfc) + return -ENOMEM; + bp->ieee_pfc = my_pfc; + } + rc = bnxt_hwrm_queue_pfc_cfg(bp, pfc); + if (!rc) + memcpy(my_pfc, pfc, sizeof(*my_pfc)); + + return rc; +} + +static int bnxt_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) +{ + struct bnxt *bp = netdev_priv(dev); + int rc = -EINVAL; + + if (!(bp->dcbx_cap & DCB_CAP_DCBX_VER_IEEE) || + !(bp->dcbx_cap & DCB_CAP_DCBX_HOST)) + return -EINVAL; + + rc = dcb_ieee_setapp(dev, app); + if (rc) + return rc; + + if ((app->selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && + app->protocol == ETH_P_IBOE) || + (app->selector == IEEE_8021QAZ_APP_SEL_DGRAM && + app->protocol == ROCE_V2_UDP_DPORT)) + rc = bnxt_hwrm_set_dcbx_app(bp, app, true); + + return rc; +} + +static int bnxt_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + + if (!(bp->dcbx_cap & DCB_CAP_DCBX_VER_IEEE) || + !(bp->dcbx_cap & DCB_CAP_DCBX_HOST)) + return -EINVAL; + + rc = dcb_ieee_delapp(dev, app); + if (rc) + return rc; + if ((app->selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && + app->protocol == ETH_P_IBOE) || + (app->selector == IEEE_8021QAZ_APP_SEL_DGRAM && + app->protocol == ROCE_V2_UDP_DPORT)) + rc = bnxt_hwrm_set_dcbx_app(bp, app, false); + + return rc; +} + +static u8 bnxt_dcbnl_getdcbx(struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + + return bp->dcbx_cap; +} + +static u8 bnxt_dcbnl_setdcbx(struct net_device *dev, u8 mode) +{ + struct bnxt *bp = netdev_priv(dev); + + /* All firmware DCBX settings are set in NVRAM */ + if (bp->dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED) + return 1; + + if (mode & DCB_CAP_DCBX_HOST) { + if (BNXT_VF(bp) || (bp->flags & BNXT_FLAG_FW_LLDP_AGENT)) + return 1; + + /* only support IEEE */ + if ((mode & DCB_CAP_DCBX_VER_CEE) || + !(mode & DCB_CAP_DCBX_VER_IEEE)) + return 1; + } + + if (mode == bp->dcbx_cap) + return 0; + + bp->dcbx_cap = mode; + return 0; +} + +static const struct dcbnl_rtnl_ops dcbnl_ops = { + .ieee_getets = bnxt_dcbnl_ieee_getets, + .ieee_setets = bnxt_dcbnl_ieee_setets, + .ieee_getpfc = bnxt_dcbnl_ieee_getpfc, + .ieee_setpfc = bnxt_dcbnl_ieee_setpfc, + .ieee_setapp = bnxt_dcbnl_ieee_setapp, + .ieee_delapp = bnxt_dcbnl_ieee_delapp, + .getdcbx = bnxt_dcbnl_getdcbx, + .setdcbx = bnxt_dcbnl_setdcbx, +}; + +void bnxt_dcb_init(struct bnxt *bp) +{ + if (bp->hwrm_spec_code < 0x10501) + return; + + bp->dcbx_cap = DCB_CAP_DCBX_VER_IEEE; + if (BNXT_PF(bp) && !(bp->flags & BNXT_FLAG_FW_LLDP_AGENT)) + bp->dcbx_cap |= DCB_CAP_DCBX_HOST; + else if (bp->flags & BNXT_FLAG_FW_DCBX_AGENT) + bp->dcbx_cap |= DCB_CAP_DCBX_LLD_MANAGED; + bp->dev->dcbnl_ops = &dcbnl_ops; +} + +void bnxt_dcb_free(struct bnxt *bp) +{ + kfree(bp->ieee_pfc); + kfree(bp->ieee_ets); + bp->ieee_pfc = NULL; + bp->ieee_ets = NULL; +} + +#else + +void bnxt_dcb_init(struct bnxt *bp) +{ +} + +void bnxt_dcb_free(struct bnxt *bp) +{ +} + +#endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h new file mode 100644 index 0000000..69efde7 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.h @@ -0,0 +1,44 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * Copyright (c) 2016-2018 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
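The mode checks in bnxt_dcbnl_setdcbx() above are worth restating: firmware-managed (LLD_MANAGED) configurations are never overridden, and host-managed mode is accepted only on a PF without a firmware LLDP agent and only for the IEEE flavour, never CEE. The stand-alone sketch below restates that predicate; the DCB_CAP_DCBX_* values mirror the kernel's dcbnl uapi, while struct fake_bp and the sample state are invented for illustration and are not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <stdbool.h>
#include <stdio.h>

/* Mirror of the uapi dcbnl capability bits used by bnxt_dcbnl_setdcbx() */
#define DCB_CAP_DCBX_HOST	 0x01
#define DCB_CAP_DCBX_LLD_MANAGED 0x02
#define DCB_CAP_DCBX_VER_CEE	 0x04
#define DCB_CAP_DCBX_VER_IEEE	 0x08

/* Hypothetical stand-ins for the driver state consulted by setdcbx */
struct fake_bp {
	unsigned int cur_dcbx_cap;
	bool is_vf;
	bool fw_lldp_agent;
};

/* Returns true if the requested mode would be accepted (setdcbx returns 0) */
static bool dcbx_mode_ok(const struct fake_bp *bp, unsigned char mode)
{
	/* Firmware-managed DCBX cannot be overridden from the host */
	if (bp->cur_dcbx_cap & DCB_CAP_DCBX_LLD_MANAGED)
		return false;

	if (mode & DCB_CAP_DCBX_HOST) {
		/* Host mode is refused on VFs and when the FW LLDP agent runs */
		if (bp->is_vf || bp->fw_lldp_agent)
			return false;
		/* Only the IEEE flavour is supported, never CEE */
		if ((mode & DCB_CAP_DCBX_VER_CEE) ||
		    !(mode & DCB_CAP_DCBX_VER_IEEE))
			return false;
	}
	return true;
}

int main(void)
{
	struct fake_bp bp = { .cur_dcbx_cap = DCB_CAP_DCBX_VER_IEEE |
					      DCB_CAP_DCBX_HOST };

	printf("IEEE+HOST: %s\n",
	       dcbx_mode_ok(&bp, DCB_CAP_DCBX_VER_IEEE | DCB_CAP_DCBX_HOST) ?
	       "accepted" : "rejected");
	printf("CEE+HOST:  %s\n",
	       dcbx_mode_ok(&bp, DCB_CAP_DCBX_VER_CEE | DCB_CAP_DCBX_HOST) ?
	       "accepted" : "rejected");
	return 0;
}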
+ */ + +#ifndef BNXT_DCB_H +#define BNXT_DCB_H + +#include + +struct bnxt_dcb { + u8 max_tc; + struct ieee_pfc *ieee_pfc; + struct ieee_ets *ieee_ets; + u8 dcbx_cap; + u8 default_pri; +}; + +struct bnxt_cos2bw_cfg { + u8 pad[3]; + u8 queue_id; + __le32 min_bw; + __le32 max_bw; +#define BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + u8 tsa; + u8 pri_lvl; + u8 bw_weight; + u8 unused; +}; + +#define BNXT_LLQ(q_profile) \ + ((q_profile) == \ + QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS_ROCE) + +#define HWRM_STRUCT_DATA_SUBTYPE_HOST_OPERATIONAL 0x0300 + +void bnxt_dcb_init(struct bnxt *bp); +void bnxt_dcb_free(struct bnxt *bp); +#endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c new file mode 100644 index 0000000..402fa32 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -0,0 +1,65 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2017 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#include +#include +#include "bnxt_hsi.h" +#include "bnxt.h" +#include "bnxt_vfr.h" +#include "bnxt_devlink.h" + +static const struct devlink_ops bnxt_dl_ops = { +#ifdef CONFIG_BNXT_SRIOV + .eswitch_mode_set = bnxt_dl_eswitch_mode_set, + .eswitch_mode_get = bnxt_dl_eswitch_mode_get, +#endif /* CONFIG_BNXT_SRIOV */ +}; + +int bnxt_dl_register(struct bnxt *bp) +{ + struct devlink *dl; + int rc; + + if (!pci_find_ext_capability(bp->pdev, PCI_EXT_CAP_ID_SRIOV)) + return 0; + + if (bp->hwrm_spec_code < 0x10803) { + netdev_warn(bp->dev, "Firmware does not support SR-IOV E-Switch SWITCHDEV mode.\n"); + return -ENOTSUPP; + } + + dl = devlink_alloc(&bnxt_dl_ops, sizeof(struct bnxt_dl)); + if (!dl) { + netdev_warn(bp->dev, "devlink_alloc failed"); + return -ENOMEM; + } + + bnxt_link_bp_to_dl(bp, dl); + bp->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY; + rc = devlink_register(dl, &bp->pdev->dev); + if (rc) { + bnxt_link_bp_to_dl(bp, NULL); + devlink_free(dl); + netdev_warn(bp->dev, "devlink_register failed. rc=%d", rc); + return rc; + } + + return 0; +} + +void bnxt_dl_unregister(struct bnxt *bp) +{ + struct devlink *dl = bp->dl; + + if (!dl) + return; + + devlink_unregister(dl); + devlink_free(dl); +} diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h new file mode 100644 index 0000000..e92a35d --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h @@ -0,0 +1,39 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2017 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
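The BW_VALUE_UNIT_PERCENT1_100 flag defined just above is what bnxt_hwrm_queue_cos2bw_cfg() earlier ORs into min_bw after multiplying the ETS weight by 100, i.e. the value travels in hundredths of a percent with the scale selector in the upper bits. A stand-alone sketch of that packing follows; the helper name and sample weight are ours, the cpu_to_le32() step is left out, and the sketch is not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define BW_VALUE_UNIT_PERCENT1_100	(0x1UL << 29)

/* Pack an ETS weight (0-100 percent) the way cos2bw.min_bw is built:
 * the value is expressed in hundredths of a percent and the scale
 * selector lives in the top bits (endianness conversion omitted here).
 */
static uint32_t pack_min_bw_percent(uint8_t weight_percent)
{
	return (uint32_t)(weight_percent * 100) | BW_VALUE_UNIT_PERCENT1_100;
}

int main(void)
{
	uint8_t weight = 25;	/* hypothetical 25% ETS share */
	uint32_t min_bw = pack_min_bw_percent(weight);

	/* Prints 0x200009c4: 0x9c4 == 2500 hundredths of a percent,
	 * bit 29 selects the percent scale.
	 */
	printf("weight %u%% -> min_bw 0x%08" PRIx32 "\n", weight, min_bw);
	return 0;
}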
+ */
+
+#ifndef BNXT_DEVLINK_H
+#define BNXT_DEVLINK_H
+
+/* Struct to hold housekeeping info needed by devlink interface */
+struct bnxt_dl {
+	struct bnxt *bp;	/* back ptr to the controlling dev */
+};
+
+static inline struct bnxt *bnxt_get_bp_from_dl(struct devlink *dl)
+{
+	return ((struct bnxt_dl *)devlink_priv(dl))->bp;
+}
+
+/* To clear devlink pointer from bp, pass NULL dl */
+static inline void bnxt_link_bp_to_dl(struct bnxt *bp, struct devlink *dl)
+{
+	bp->dl = dl;
+
+	/* add a back pointer in dl to bp */
+	if (dl) {
+		struct bnxt_dl *bp_dl = devlink_priv(dl);
+
+		bp_dl->bp = bp;
+	}
+}
+
+int bnxt_dl_register(struct bnxt *bp);
+void bnxt_dl_unregister(struct bnxt *bp);
+
+#endif /* BNXT_DEVLINK_H */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
new file mode 100644
index 0000000..408dd19
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
@@ -0,0 +1,32 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2017-2018 Broadcom Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include
+#include "bnxt_hsi.h"
+#include "bnxt.h"
+
+void bnxt_dim_work(struct work_struct *work)
+{
+	struct net_dim *dim = container_of(work, struct net_dim,
+					   work);
+	struct bnxt_cp_ring_info *cpr = container_of(dim,
+						     struct bnxt_cp_ring_info,
+						     dim);
+	struct bnxt_napi *bnapi = container_of(cpr,
+					       struct bnxt_napi,
+					       cp_ring);
+	struct net_dim_cq_moder cur_profile = net_dim_get_profile(dim->mode,
+								   dim->profile_ix);
+
+	cpr->rx_ring_coal.coal_ticks = cur_profile.usec;
+	cpr->rx_ring_coal.coal_bufs = cur_profile.pkts;
+
+	bnxt_hwrm_set_ring_coal(bnapi->bp, bnapi);
+	dim->state = NET_DIM_START_MEASURE;
+}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
new file mode 100644
index 0000000..8ba14ae
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -0,0 +1,2730 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2014-2016 Broadcom Corporation
+ * Copyright (c) 2016-2017 Broadcom Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
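bnxt_dim_work() above only translates the profile index chosen by the kernel's net_dim framework into a (usec, packets) pair and hands it to firmware through bnxt_hwrm_set_ring_coal(). The stand-alone sketch below mimics that table lookup; the profile numbers and the cq_moder stand-in type are invented for illustration and are not part of the patch.

/* Illustrative sketch only -- not part of the patch; profile numbers are made up. */
#include <stdio.h>

struct cq_moder {		/* stands in for struct net_dim_cq_moder */
	unsigned int usec;
	unsigned int pkts;
};

/* Hypothetical moderation ladder: higher index = more aggressive batching */
static const struct cq_moder demo_profiles[] = {
	{ 2,  16 },
	{ 8,  32 },
	{ 16, 64 },
	{ 32, 128 },
	{ 64, 256 },
};

int main(void)
{
	unsigned int profile_ix = 3;	/* index net_dim would have picked */
	struct cq_moder cur = demo_profiles[profile_ix];

	/* The driver copies these into rx_ring_coal and then issues the
	 * corresponding HWRM coalescing command via bnxt_hwrm_set_ring_coal().
	 */
	printf("profile %u -> coal_ticks=%u usec, coal_bufs=%u pkts\n",
	       profile_ix, cur.usec, cur.pkts);
	return 0;
}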
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "bnxt_hsi.h" +#include "bnxt.h" +#include "bnxt_xdp.h" +#include "bnxt_ethtool.h" +#include "bnxt_nvm_defs.h" /* NVRAM content constant and structure defs */ +#include "bnxt_fw_hdr.h" /* Firmware hdr constant and structure defs */ +#define FLASH_NVRAM_TIMEOUT ((HWRM_CMD_TIMEOUT) * 100) +#define FLASH_PACKAGE_TIMEOUT ((HWRM_CMD_TIMEOUT) * 200) +#define INSTALL_PACKAGE_TIMEOUT ((HWRM_CMD_TIMEOUT) * 200) + +static u32 bnxt_get_msglevel(struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + + return bp->msg_enable; +} + +static void bnxt_set_msglevel(struct net_device *dev, u32 value) +{ + struct bnxt *bp = netdev_priv(dev); + + bp->msg_enable = value; +} + +static int bnxt_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_coal *hw_coal; + u16 mult; + + memset(coal, 0, sizeof(*coal)); + + coal->use_adaptive_rx_coalesce = bp->flags & BNXT_FLAG_DIM; + + hw_coal = &bp->rx_coal; + mult = hw_coal->bufs_per_record; + coal->rx_coalesce_usecs = hw_coal->coal_ticks; + coal->rx_max_coalesced_frames = hw_coal->coal_bufs / mult; + coal->rx_coalesce_usecs_irq = hw_coal->coal_ticks_irq; + coal->rx_max_coalesced_frames_irq = hw_coal->coal_bufs_irq / mult; + + hw_coal = &bp->tx_coal; + mult = hw_coal->bufs_per_record; + coal->tx_coalesce_usecs = hw_coal->coal_ticks; + coal->tx_max_coalesced_frames = hw_coal->coal_bufs / mult; + coal->tx_coalesce_usecs_irq = hw_coal->coal_ticks_irq; + coal->tx_max_coalesced_frames_irq = hw_coal->coal_bufs_irq / mult; + + coal->stats_block_coalesce_usecs = bp->stats_coal_ticks; + + return 0; +} + +static int bnxt_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct bnxt *bp = netdev_priv(dev); + bool update_stats = false; + struct bnxt_coal *hw_coal; + int rc = 0; + u16 mult; + + if (coal->use_adaptive_rx_coalesce) { + bp->flags |= BNXT_FLAG_DIM; + } else { + if (bp->flags & BNXT_FLAG_DIM) { + bp->flags &= ~(BNXT_FLAG_DIM); + goto reset_coalesce; + } + } + + hw_coal = &bp->rx_coal; + mult = hw_coal->bufs_per_record; + hw_coal->coal_ticks = coal->rx_coalesce_usecs; + hw_coal->coal_bufs = coal->rx_max_coalesced_frames * mult; + hw_coal->coal_ticks_irq = coal->rx_coalesce_usecs_irq; + hw_coal->coal_bufs_irq = coal->rx_max_coalesced_frames_irq * mult; + + hw_coal = &bp->tx_coal; + mult = hw_coal->bufs_per_record; + hw_coal->coal_ticks = coal->tx_coalesce_usecs; + hw_coal->coal_bufs = coal->tx_max_coalesced_frames * mult; + hw_coal->coal_ticks_irq = coal->tx_coalesce_usecs_irq; + hw_coal->coal_bufs_irq = coal->tx_max_coalesced_frames_irq * mult; + + if (bp->stats_coal_ticks != coal->stats_block_coalesce_usecs) { + u32 stats_ticks = coal->stats_block_coalesce_usecs; + + /* Allow 0, which means disable. 
*/ + if (stats_ticks) + stats_ticks = clamp_t(u32, stats_ticks, + BNXT_MIN_STATS_COAL_TICKS, + BNXT_MAX_STATS_COAL_TICKS); + stats_ticks = rounddown(stats_ticks, BNXT_MIN_STATS_COAL_TICKS); + bp->stats_coal_ticks = stats_ticks; + update_stats = true; + } + +reset_coalesce: + if (netif_running(dev)) { + if (update_stats) { + rc = bnxt_close_nic(bp, true, false); + if (!rc) + rc = bnxt_open_nic(bp, true, false); + } else { + rc = bnxt_hwrm_set_coal(bp); + } + } + + return rc; +} + +#define BNXT_NUM_STATS 21 + +#define BNXT_RX_STATS_ENTRY(counter) \ + { BNXT_RX_STATS_OFFSET(counter), __stringify(counter) } + +#define BNXT_TX_STATS_ENTRY(counter) \ + { BNXT_TX_STATS_OFFSET(counter), __stringify(counter) } + +#define BNXT_RX_STATS_EXT_ENTRY(counter) \ + { BNXT_RX_STATS_EXT_OFFSET(counter), __stringify(counter) } + +static const struct { + long offset; + char string[ETH_GSTRING_LEN]; +} bnxt_port_stats_arr[] = { + BNXT_RX_STATS_ENTRY(rx_64b_frames), + BNXT_RX_STATS_ENTRY(rx_65b_127b_frames), + BNXT_RX_STATS_ENTRY(rx_128b_255b_frames), + BNXT_RX_STATS_ENTRY(rx_256b_511b_frames), + BNXT_RX_STATS_ENTRY(rx_512b_1023b_frames), + BNXT_RX_STATS_ENTRY(rx_1024b_1518_frames), + BNXT_RX_STATS_ENTRY(rx_good_vlan_frames), + BNXT_RX_STATS_ENTRY(rx_1519b_2047b_frames), + BNXT_RX_STATS_ENTRY(rx_2048b_4095b_frames), + BNXT_RX_STATS_ENTRY(rx_4096b_9216b_frames), + BNXT_RX_STATS_ENTRY(rx_9217b_16383b_frames), + BNXT_RX_STATS_ENTRY(rx_total_frames), + BNXT_RX_STATS_ENTRY(rx_ucast_frames), + BNXT_RX_STATS_ENTRY(rx_mcast_frames), + BNXT_RX_STATS_ENTRY(rx_bcast_frames), + BNXT_RX_STATS_ENTRY(rx_fcs_err_frames), + BNXT_RX_STATS_ENTRY(rx_ctrl_frames), + BNXT_RX_STATS_ENTRY(rx_pause_frames), + BNXT_RX_STATS_ENTRY(rx_pfc_frames), + BNXT_RX_STATS_ENTRY(rx_align_err_frames), + BNXT_RX_STATS_ENTRY(rx_ovrsz_frames), + BNXT_RX_STATS_ENTRY(rx_jbr_frames), + BNXT_RX_STATS_ENTRY(rx_mtu_err_frames), + BNXT_RX_STATS_ENTRY(rx_tagged_frames), + BNXT_RX_STATS_ENTRY(rx_double_tagged_frames), + BNXT_RX_STATS_ENTRY(rx_good_frames), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri0), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri1), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri2), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri3), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri4), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri5), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri6), + BNXT_RX_STATS_ENTRY(rx_pfc_ena_frames_pri7), + BNXT_RX_STATS_ENTRY(rx_undrsz_frames), + BNXT_RX_STATS_ENTRY(rx_eee_lpi_events), + BNXT_RX_STATS_ENTRY(rx_eee_lpi_duration), + BNXT_RX_STATS_ENTRY(rx_bytes), + BNXT_RX_STATS_ENTRY(rx_runt_bytes), + BNXT_RX_STATS_ENTRY(rx_runt_frames), + BNXT_RX_STATS_ENTRY(rx_stat_discard), + BNXT_RX_STATS_ENTRY(rx_stat_err), + + BNXT_TX_STATS_ENTRY(tx_64b_frames), + BNXT_TX_STATS_ENTRY(tx_65b_127b_frames), + BNXT_TX_STATS_ENTRY(tx_128b_255b_frames), + BNXT_TX_STATS_ENTRY(tx_256b_511b_frames), + BNXT_TX_STATS_ENTRY(tx_512b_1023b_frames), + BNXT_TX_STATS_ENTRY(tx_1024b_1518_frames), + BNXT_TX_STATS_ENTRY(tx_good_vlan_frames), + BNXT_TX_STATS_ENTRY(tx_1519b_2047_frames), + BNXT_TX_STATS_ENTRY(tx_2048b_4095b_frames), + BNXT_TX_STATS_ENTRY(tx_4096b_9216b_frames), + BNXT_TX_STATS_ENTRY(tx_9217b_16383b_frames), + BNXT_TX_STATS_ENTRY(tx_good_frames), + BNXT_TX_STATS_ENTRY(tx_total_frames), + BNXT_TX_STATS_ENTRY(tx_ucast_frames), + BNXT_TX_STATS_ENTRY(tx_mcast_frames), + BNXT_TX_STATS_ENTRY(tx_bcast_frames), + BNXT_TX_STATS_ENTRY(tx_pause_frames), + BNXT_TX_STATS_ENTRY(tx_pfc_frames), + BNXT_TX_STATS_ENTRY(tx_jabber_frames), + 
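The offset column in these tables comes from BNXT_RX_STATS_OFFSET()/BNXT_TX_STATS_OFFSET(), which turn an offsetof() in the firmware statistics block into an index into an array of 64-bit counters; the TX variant additionally skips the whole RX block plus 512 bytes of padding. A stand-alone miniature of that arithmetic follows, using toy stand-in structs instead of the real rx_port_stats/tx_port_stats; it is not part of the patch.

/* Illustrative sketch only -- not part of the patch; the structs are toy stand-ins. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_rx_stats {		/* toy stand-in for struct rx_port_stats */
	uint64_t rx_total_frames;
	uint64_t rx_pause_frames;
	uint64_t rx_bytes;
};

struct demo_tx_stats {		/* toy stand-in for struct tx_port_stats */
	uint64_t tx_total_frames;
	uint64_t tx_pause_frames;
};

/* Same shape as BNXT_RX_STATS_OFFSET()/BNXT_TX_STATS_OFFSET(): a byte offset
 * becomes an index into an array of 64-bit counters; TX counters live after
 * the whole RX block plus 512 bytes of padding in the DMA buffer.
 */
#define DEMO_RX_OFF(c)	(offsetof(struct demo_rx_stats, c) / 8)
#define DEMO_TX_OFF(c)	((offsetof(struct demo_tx_stats, c) + \
			  sizeof(struct demo_rx_stats) + 512) / 8)

int main(void)
{
	/* Pretend DMA buffer: RX block, 512 bytes of padding, TX block */
	uint64_t buf[(sizeof(struct demo_rx_stats) + 512 +
		      sizeof(struct demo_tx_stats)) / 8] = { 0 };

	buf[DEMO_RX_OFF(rx_pause_frames)] = 7;
	buf[DEMO_TX_OFF(tx_pause_frames)] = 3;

	/* bnxt_get_ethtool_stats() reads counters the same way, with a
	 * le64_to_cpu() conversion that this host-only demo skips.
	 */
	printf("rx_pause_frames index %zu -> %llu\n",
	       DEMO_RX_OFF(rx_pause_frames),
	       (unsigned long long)buf[DEMO_RX_OFF(rx_pause_frames)]);
	printf("tx_pause_frames index %zu -> %llu\n",
	       DEMO_TX_OFF(tx_pause_frames),
	       (unsigned long long)buf[DEMO_TX_OFF(tx_pause_frames)]);
	return 0;
}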
BNXT_TX_STATS_ENTRY(tx_fcs_err_frames), + BNXT_TX_STATS_ENTRY(tx_err), + BNXT_TX_STATS_ENTRY(tx_fifo_underruns), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri0), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri1), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri2), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri3), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri4), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri5), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri6), + BNXT_TX_STATS_ENTRY(tx_pfc_ena_frames_pri7), + BNXT_TX_STATS_ENTRY(tx_eee_lpi_events), + BNXT_TX_STATS_ENTRY(tx_eee_lpi_duration), + BNXT_TX_STATS_ENTRY(tx_total_collisions), + BNXT_TX_STATS_ENTRY(tx_bytes), + BNXT_TX_STATS_ENTRY(tx_xthol_frames), + BNXT_TX_STATS_ENTRY(tx_stat_discard), + BNXT_TX_STATS_ENTRY(tx_stat_error), +}; + +static const struct { + long offset; + char string[ETH_GSTRING_LEN]; +} bnxt_port_stats_ext_arr[] = { + BNXT_RX_STATS_EXT_ENTRY(link_down_events), + BNXT_RX_STATS_EXT_ENTRY(continuous_pause_events), + BNXT_RX_STATS_EXT_ENTRY(resume_pause_events), + BNXT_RX_STATS_EXT_ENTRY(continuous_roce_pause_events), + BNXT_RX_STATS_EXT_ENTRY(resume_roce_pause_events), +}; + +#define BNXT_NUM_PORT_STATS ARRAY_SIZE(bnxt_port_stats_arr) +#define BNXT_NUM_PORT_STATS_EXT ARRAY_SIZE(bnxt_port_stats_ext_arr) + +static int bnxt_get_num_stats(struct bnxt *bp) +{ + int num_stats = BNXT_NUM_STATS * bp->cp_nr_rings; + + if (bp->flags & BNXT_FLAG_PORT_STATS) + num_stats += BNXT_NUM_PORT_STATS; + + if (bp->flags & BNXT_FLAG_PORT_STATS_EXT) + num_stats += BNXT_NUM_PORT_STATS_EXT; + + return num_stats; +} + +static int bnxt_get_sset_count(struct net_device *dev, int sset) +{ + struct bnxt *bp = netdev_priv(dev); + + switch (sset) { + case ETH_SS_STATS: + return bnxt_get_num_stats(bp); + case ETH_SS_TEST: + if (!bp->num_tests) + return -EOPNOTSUPP; + return bp->num_tests; + default: + return -EOPNOTSUPP; + } +} + +static void bnxt_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *buf) +{ + u32 i, j = 0; + struct bnxt *bp = netdev_priv(dev); + u32 stat_fields = sizeof(struct ctx_hw_stats) / 8; + + if (!bp->bnapi) + return; + + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_napi *bnapi = bp->bnapi[i]; + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + __le64 *hw_stats = (__le64 *)cpr->hw_stats; + int k; + + for (k = 0; k < stat_fields; j++, k++) + buf[j] = le64_to_cpu(hw_stats[k]); + buf[j++] = cpr->rx_l4_csum_errors; + } + if (bp->flags & BNXT_FLAG_PORT_STATS) { + __le64 *port_stats = (__le64 *)bp->hw_rx_port_stats; + + for (i = 0; i < BNXT_NUM_PORT_STATS; i++, j++) { + buf[j] = le64_to_cpu(*(port_stats + + bnxt_port_stats_arr[i].offset)); + } + } + if (bp->flags & BNXT_FLAG_PORT_STATS_EXT) { + __le64 *port_stats_ext = (__le64 *)bp->hw_rx_port_stats_ext; + + for (i = 0; i < BNXT_NUM_PORT_STATS_EXT; i++, j++) { + buf[j] = le64_to_cpu(*(port_stats_ext + + bnxt_port_stats_ext_arr[i].offset)); + } + } +} + +static void bnxt_get_strings(struct net_device *dev, u32 stringset, u8 *buf) +{ + struct bnxt *bp = netdev_priv(dev); + u32 i; + + switch (stringset) { + /* The number of strings must match BNXT_NUM_STATS defined above. 
*/ + case ETH_SS_STATS: + for (i = 0; i < bp->cp_nr_rings; i++) { + sprintf(buf, "[%d]: rx_ucast_packets", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: rx_mcast_packets", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: rx_bcast_packets", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: rx_discards", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: rx_drops", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: rx_ucast_bytes", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: rx_mcast_bytes", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: rx_bcast_bytes", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tx_ucast_packets", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tx_mcast_packets", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tx_bcast_packets", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tx_discards", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tx_drops", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tx_ucast_bytes", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tx_mcast_bytes", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tx_bcast_bytes", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tpa_packets", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tpa_bytes", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tpa_events", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: tpa_aborts", i); + buf += ETH_GSTRING_LEN; + sprintf(buf, "[%d]: rx_l4_csum_errors", i); + buf += ETH_GSTRING_LEN; + } + if (bp->flags & BNXT_FLAG_PORT_STATS) { + for (i = 0; i < BNXT_NUM_PORT_STATS; i++) { + strcpy(buf, bnxt_port_stats_arr[i].string); + buf += ETH_GSTRING_LEN; + } + } + if (bp->flags & BNXT_FLAG_PORT_STATS_EXT) { + for (i = 0; i < BNXT_NUM_PORT_STATS_EXT; i++) { + strcpy(buf, bnxt_port_stats_ext_arr[i].string); + buf += ETH_GSTRING_LEN; + } + } + break; + case ETH_SS_TEST: + if (bp->num_tests) + memcpy(buf, bp->test_info->string, + bp->num_tests * ETH_GSTRING_LEN); + break; + default: + netdev_err(bp->dev, "bnxt_get_strings invalid request %x\n", + stringset); + break; + } +} + +static void bnxt_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering) +{ + struct bnxt *bp = netdev_priv(dev); + + ering->rx_max_pending = BNXT_MAX_RX_DESC_CNT; + ering->rx_jumbo_max_pending = BNXT_MAX_RX_JUM_DESC_CNT; + ering->tx_max_pending = BNXT_MAX_TX_DESC_CNT; + + ering->rx_pending = bp->rx_ring_size; + ering->rx_jumbo_pending = bp->rx_agg_ring_size; + ering->tx_pending = bp->tx_ring_size; +} + +static int bnxt_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering) +{ + struct bnxt *bp = netdev_priv(dev); + + if ((ering->rx_pending > BNXT_MAX_RX_DESC_CNT) || + (ering->tx_pending > BNXT_MAX_TX_DESC_CNT) || + (ering->tx_pending <= MAX_SKB_FRAGS)) + return -EINVAL; + + if (netif_running(dev)) + bnxt_close_nic(bp, false, false); + + bp->rx_ring_size = ering->rx_pending; + bp->tx_ring_size = ering->tx_pending; + bnxt_set_ring_params(bp); + + if (netif_running(dev)) + return bnxt_open_nic(bp, false, false); + + return 0; +} + +static void bnxt_get_channels(struct net_device *dev, + struct ethtool_channels *channel) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + int max_rx_rings, max_tx_rings, tcs; + int max_tx_sch_inputs; + + /* Get the most up-to-date max_tx_sch_inputs. 
*/ + if (bp->flags & BNXT_FLAG_NEW_RM) + bnxt_hwrm_func_resc_qcaps(bp, false); + max_tx_sch_inputs = hw_resc->max_tx_sch_inputs; + + bnxt_get_max_rings(bp, &max_rx_rings, &max_tx_rings, true); + if (max_tx_sch_inputs) + max_tx_rings = min_t(int, max_tx_rings, max_tx_sch_inputs); + channel->max_combined = min_t(int, max_rx_rings, max_tx_rings); + + if (bnxt_get_max_rings(bp, &max_rx_rings, &max_tx_rings, false)) { + max_rx_rings = 0; + max_tx_rings = 0; + } + if (max_tx_sch_inputs) + max_tx_rings = min_t(int, max_tx_rings, max_tx_sch_inputs); + + tcs = netdev_get_num_tc(dev); + if (tcs > 1) + max_tx_rings /= tcs; + + channel->max_rx = max_rx_rings; + channel->max_tx = max_tx_rings; + channel->max_other = 0; + if (bp->flags & BNXT_FLAG_SHARED_RINGS) { + channel->combined_count = bp->rx_nr_rings; + if (BNXT_CHIP_TYPE_NITRO_A0(bp)) + channel->combined_count--; + } else { + if (!BNXT_CHIP_TYPE_NITRO_A0(bp)) { + channel->rx_count = bp->rx_nr_rings; + channel->tx_count = bp->tx_nr_rings_per_tc; + } + } +} + +static int bnxt_set_channels(struct net_device *dev, + struct ethtool_channels *channel) +{ + struct bnxt *bp = netdev_priv(dev); + int req_tx_rings, req_rx_rings, tcs; + bool sh = false; + int tx_xdp = 0; + int rc = 0; + + if (channel->other_count) + return -EINVAL; + + if (!channel->combined_count && + (!channel->rx_count || !channel->tx_count)) + return -EINVAL; + + if (channel->combined_count && + (channel->rx_count || channel->tx_count)) + return -EINVAL; + + if (BNXT_CHIP_TYPE_NITRO_A0(bp) && (channel->rx_count || + channel->tx_count)) + return -EINVAL; + + if (channel->combined_count) + sh = true; + + tcs = netdev_get_num_tc(dev); + + req_tx_rings = sh ? channel->combined_count : channel->tx_count; + req_rx_rings = sh ? channel->combined_count : channel->rx_count; + if (bp->tx_nr_rings_xdp) { + if (!sh) { + netdev_err(dev, "Only combined mode supported when XDP is enabled.\n"); + return -EINVAL; + } + tx_xdp = req_rx_rings; + } + rc = bnxt_check_rings(bp, req_tx_rings, req_rx_rings, sh, tcs, tx_xdp); + if (rc) { + netdev_warn(dev, "Unable to allocate the requested rings\n"); + return rc; + } + + if (netif_running(dev)) { + if (BNXT_PF(bp)) { + /* TODO CHIMP_FW: Send message to all VF's + * before PF unload + */ + } + rc = bnxt_close_nic(bp, true, false); + if (rc) { + netdev_err(bp->dev, "Set channel failure rc :%x\n", + rc); + return rc; + } + } + + if (sh) { + bp->flags |= BNXT_FLAG_SHARED_RINGS; + bp->rx_nr_rings = channel->combined_count; + bp->tx_nr_rings_per_tc = channel->combined_count; + } else { + bp->flags &= ~BNXT_FLAG_SHARED_RINGS; + bp->rx_nr_rings = channel->rx_count; + bp->tx_nr_rings_per_tc = channel->tx_count; + } + bp->tx_nr_rings_xdp = tx_xdp; + bp->tx_nr_rings = bp->tx_nr_rings_per_tc + tx_xdp; + if (tcs > 1) + bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tcs + tx_xdp; + + bp->cp_nr_rings = sh ? max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) : + bp->tx_nr_rings + bp->rx_nr_rings; + + bp->num_stat_ctxs = bp->cp_nr_rings; + + /* After changing number of rx channels, update NTUPLE feature. 
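The ring bookkeeping at the end of bnxt_set_channels() above is compact, so the stand-alone sketch below walks the same arithmetic for one hypothetical request (8 combined channels, two traffic classes, XDP enabled). All inputs are example values and the sketch is not part of the patch.

/* Illustrative sketch only -- not part of the patch; all inputs are hypothetical. */
#include <stdio.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))

int main(void)
{
	/* Hypothetical ethtool -L request: 8 combined channels */
	int combined_count = 8;
	int sh = 1;			/* combined => shared completion rings */
	int tcs = 2;			/* two traffic classes configured */
	int tx_xdp;
	int rx_nr_rings, tx_nr_rings_per_tc, tx_nr_rings, cp_nr_rings;

	rx_nr_rings = combined_count;
	tx_nr_rings_per_tc = combined_count;

	/* With XDP enabled, one extra TX ring per RX ring is reserved */
	tx_xdp = rx_nr_rings;

	tx_nr_rings = tx_nr_rings_per_tc + tx_xdp;
	if (tcs > 1)
		tx_nr_rings = tx_nr_rings_per_tc * tcs + tx_xdp;

	cp_nr_rings = sh ? MAX(tx_nr_rings, rx_nr_rings)
			 : tx_nr_rings + rx_nr_rings;

	/* num_stat_ctxs follows cp_nr_rings, as in the driver */
	printf("rx=%d tx=%d (per tc %d, xdp %d) cp=%d stat_ctxs=%d\n",
	       rx_nr_rings, tx_nr_rings, tx_nr_rings_per_tc, tx_xdp,
	       cp_nr_rings, cp_nr_rings);
	return 0;
}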
*/ + netdev_update_features(dev); + if (netif_running(dev)) { + rc = bnxt_open_nic(bp, true, false); + if ((!rc) && BNXT_PF(bp)) { + /* TODO CHIMP_FW: Send message to all VF's + * to renable + */ + } + } + + return rc; +} + +#ifdef CONFIG_RFS_ACCEL +static int bnxt_grxclsrlall(struct bnxt *bp, struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + int i, j = 0; + + cmd->data = bp->ntp_fltr_count; + for (i = 0; i < BNXT_NTP_FLTR_HASH_SIZE; i++) { + struct hlist_head *head; + struct bnxt_ntuple_filter *fltr; + + head = &bp->ntp_fltr_hash_tbl[i]; + rcu_read_lock(); + hlist_for_each_entry_rcu(fltr, head, hash) { + if (j == cmd->rule_cnt) + break; + rule_locs[j++] = fltr->sw_id; + } + rcu_read_unlock(); + if (j == cmd->rule_cnt) + break; + } + cmd->rule_cnt = j; + return 0; +} + +static int bnxt_grxclsrule(struct bnxt *bp, struct ethtool_rxnfc *cmd) +{ + struct ethtool_rx_flow_spec *fs = + (struct ethtool_rx_flow_spec *)&cmd->fs; + struct bnxt_ntuple_filter *fltr; + struct flow_keys *fkeys; + int i, rc = -EINVAL; + + if (fs->location >= BNXT_NTP_FLTR_MAX_FLTR) + return rc; + + for (i = 0; i < BNXT_NTP_FLTR_HASH_SIZE; i++) { + struct hlist_head *head; + + head = &bp->ntp_fltr_hash_tbl[i]; + rcu_read_lock(); + hlist_for_each_entry_rcu(fltr, head, hash) { + if (fltr->sw_id == fs->location) + goto fltr_found; + } + rcu_read_unlock(); + } + return rc; + +fltr_found: + fkeys = &fltr->fkeys; + if (fkeys->basic.n_proto == htons(ETH_P_IP)) { + if (fkeys->basic.ip_proto == IPPROTO_TCP) + fs->flow_type = TCP_V4_FLOW; + else if (fkeys->basic.ip_proto == IPPROTO_UDP) + fs->flow_type = UDP_V4_FLOW; + else + goto fltr_err; + + fs->h_u.tcp_ip4_spec.ip4src = fkeys->addrs.v4addrs.src; + fs->m_u.tcp_ip4_spec.ip4src = cpu_to_be32(~0); + + fs->h_u.tcp_ip4_spec.ip4dst = fkeys->addrs.v4addrs.dst; + fs->m_u.tcp_ip4_spec.ip4dst = cpu_to_be32(~0); + + fs->h_u.tcp_ip4_spec.psrc = fkeys->ports.src; + fs->m_u.tcp_ip4_spec.psrc = cpu_to_be16(~0); + + fs->h_u.tcp_ip4_spec.pdst = fkeys->ports.dst; + fs->m_u.tcp_ip4_spec.pdst = cpu_to_be16(~0); + } else { + int i; + + if (fkeys->basic.ip_proto == IPPROTO_TCP) + fs->flow_type = TCP_V6_FLOW; + else if (fkeys->basic.ip_proto == IPPROTO_UDP) + fs->flow_type = UDP_V6_FLOW; + else + goto fltr_err; + + *(struct in6_addr *)&fs->h_u.tcp_ip6_spec.ip6src[0] = + fkeys->addrs.v6addrs.src; + *(struct in6_addr *)&fs->h_u.tcp_ip6_spec.ip6dst[0] = + fkeys->addrs.v6addrs.dst; + for (i = 0; i < 4; i++) { + fs->m_u.tcp_ip6_spec.ip6src[i] = cpu_to_be32(~0); + fs->m_u.tcp_ip6_spec.ip6dst[i] = cpu_to_be32(~0); + } + fs->h_u.tcp_ip6_spec.psrc = fkeys->ports.src; + fs->m_u.tcp_ip6_spec.psrc = cpu_to_be16(~0); + + fs->h_u.tcp_ip6_spec.pdst = fkeys->ports.dst; + fs->m_u.tcp_ip6_spec.pdst = cpu_to_be16(~0); + } + + fs->ring_cookie = fltr->rxq; + rc = 0; + +fltr_err: + rcu_read_unlock(); + + return rc; +} +#endif + +static u64 get_ethtool_ipv4_rss(struct bnxt *bp) +{ + if (bp->rss_hash_cfg & VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4) + return RXH_IP_SRC | RXH_IP_DST; + return 0; +} + +static u64 get_ethtool_ipv6_rss(struct bnxt *bp) +{ + if (bp->rss_hash_cfg & VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6) + return RXH_IP_SRC | RXH_IP_DST; + return 0; +} + +static int bnxt_grxfh(struct bnxt *bp, struct ethtool_rxnfc *cmd) +{ + cmd->data = 0; + switch (cmd->flow_type) { + case TCP_V4_FLOW: + if (bp->rss_hash_cfg & VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV4) + cmd->data |= RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + cmd->data |= get_ethtool_ipv4_rss(bp); + break; + case UDP_V4_FLOW: + if (bp->rss_hash_cfg & 
VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV4) + cmd->data |= RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + /* fall through */ + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + case AH_V4_FLOW: + case ESP_V4_FLOW: + case IPV4_FLOW: + cmd->data |= get_ethtool_ipv4_rss(bp); + break; + + case TCP_V6_FLOW: + if (bp->rss_hash_cfg & VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV6) + cmd->data |= RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + cmd->data |= get_ethtool_ipv6_rss(bp); + break; + case UDP_V6_FLOW: + if (bp->rss_hash_cfg & VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6) + cmd->data |= RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + /* fall through */ + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV6_FLOW: + cmd->data |= get_ethtool_ipv6_rss(bp); + break; + } + return 0; +} + +#define RXH_4TUPLE (RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3) +#define RXH_2TUPLE (RXH_IP_SRC | RXH_IP_DST) + +static int bnxt_srxfh(struct bnxt *bp, struct ethtool_rxnfc *cmd) +{ + u32 rss_hash_cfg = bp->rss_hash_cfg; + int tuple, rc = 0; + + if (cmd->data == RXH_4TUPLE) + tuple = 4; + else if (cmd->data == RXH_2TUPLE) + tuple = 2; + else if (!cmd->data) + tuple = 0; + else + return -EINVAL; + + if (cmd->flow_type == TCP_V4_FLOW) { + rss_hash_cfg &= ~VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV4; + if (tuple == 4) + rss_hash_cfg |= VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV4; + } else if (cmd->flow_type == UDP_V4_FLOW) { + if (tuple == 4 && !(bp->flags & BNXT_FLAG_UDP_RSS_CAP)) + return -EINVAL; + rss_hash_cfg &= ~VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV4; + if (tuple == 4) + rss_hash_cfg |= VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV4; + } else if (cmd->flow_type == TCP_V6_FLOW) { + rss_hash_cfg &= ~VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV6; + if (tuple == 4) + rss_hash_cfg |= VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV6; + } else if (cmd->flow_type == UDP_V6_FLOW) { + if (tuple == 4 && !(bp->flags & BNXT_FLAG_UDP_RSS_CAP)) + return -EINVAL; + rss_hash_cfg &= ~VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6; + if (tuple == 4) + rss_hash_cfg |= VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6; + } else if (tuple == 4) { + return -EINVAL; + } + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + case AH_V4_FLOW: + case ESP_V4_FLOW: + case IPV4_FLOW: + if (tuple == 2) + rss_hash_cfg |= VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4; + else if (!tuple) + rss_hash_cfg &= ~VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4; + break; + + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV6_FLOW: + if (tuple == 2) + rss_hash_cfg |= VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6; + else if (!tuple) + rss_hash_cfg &= ~VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6; + break; + } + + if (bp->rss_hash_cfg == rss_hash_cfg) + return 0; + + bp->rss_hash_cfg = rss_hash_cfg; + if (netif_running(bp->dev)) { + bnxt_close_nic(bp, false, false); + rc = bnxt_open_nic(bp, false, false); + } + return rc; +} + +static int bnxt_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct bnxt *bp = netdev_priv(dev); + int rc = 0; + + switch (cmd->cmd) { +#ifdef CONFIG_RFS_ACCEL + case ETHTOOL_GRXRINGS: + cmd->data = bp->rx_nr_rings; + break; + + case ETHTOOL_GRXCLSRLCNT: + cmd->rule_cnt = bp->ntp_fltr_count; + cmd->data = BNXT_NTP_FLTR_MAX_FLTR; + break; + + case ETHTOOL_GRXCLSRLALL: + rc = bnxt_grxclsrlall(bp, cmd, (u32 *)rule_locs); + break; + + case ETHTOOL_GRXCLSRULE: + rc = bnxt_grxclsrule(bp, cmd); + break; +#endif + + case ETHTOOL_GRXFH: + 
rc = bnxt_grxfh(bp, cmd); + break; + + default: + rc = -EOPNOTSUPP; + break; + } + + return rc; +} + +static int bnxt_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + + switch (cmd->cmd) { + case ETHTOOL_SRXFH: + rc = bnxt_srxfh(bp, cmd); + break; + + default: + rc = -EOPNOTSUPP; + break; + } + return rc; +} + +static u32 bnxt_get_rxfh_indir_size(struct net_device *dev) +{ + return HW_HASH_INDEX_SIZE; +} + +static u32 bnxt_get_rxfh_key_size(struct net_device *dev) +{ + return HW_HASH_KEY_SIZE; +} + +static int bnxt_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, + u8 *hfunc) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_vnic_info *vnic; + int i = 0; + + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + + if (!bp->vnic_info) + return 0; + + vnic = &bp->vnic_info[0]; + if (indir && vnic->rss_table) { + for (i = 0; i < HW_HASH_INDEX_SIZE; i++) + indir[i] = le16_to_cpu(vnic->rss_table[i]); + } + + if (key && vnic->rss_hash_key) + memcpy(key, vnic->rss_hash_key, HW_HASH_KEY_SIZE); + + return 0; +} + +static void bnxt_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + struct bnxt *bp = netdev_priv(dev); + + strlcpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); + strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version)); + strlcpy(info->fw_version, bp->fw_ver_str, sizeof(info->fw_version)); + strlcpy(info->bus_info, pci_name(bp->pdev), sizeof(info->bus_info)); + info->n_stats = bnxt_get_num_stats(bp); + info->testinfo_len = bp->num_tests; + /* TODO CHIMP_FW: eeprom dump details */ + info->eedump_len = 0; + /* TODO CHIMP FW: reg dump details */ + info->regdump_len = 0; +} + +static void bnxt_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + struct bnxt *bp = netdev_priv(dev); + + wol->supported = 0; + wol->wolopts = 0; + memset(&wol->sopass, 0, sizeof(wol->sopass)); + if (bp->flags & BNXT_FLAG_WOL_CAP) { + wol->supported = WAKE_MAGIC; + if (bp->wol) + wol->wolopts = WAKE_MAGIC; + } +} + +static int bnxt_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + struct bnxt *bp = netdev_priv(dev); + + if (wol->wolopts & ~WAKE_MAGIC) + return -EINVAL; + + if (wol->wolopts & WAKE_MAGIC) { + if (!(bp->flags & BNXT_FLAG_WOL_CAP)) + return -EINVAL; + if (!bp->wol) { + if (bnxt_hwrm_alloc_wol_fltr(bp)) + return -EBUSY; + bp->wol = 1; + } + } else { + if (bp->wol) { + if (bnxt_hwrm_free_wol_fltr(bp)) + return -EBUSY; + bp->wol = 0; + } + } + return 0; +} + +u32 _bnxt_fw_to_ethtool_adv_spds(u16 fw_speeds, u8 fw_pause) +{ + u32 speed_mask = 0; + + /* TODO: support 25GB, 40GB, 50GB with different cable type */ + /* set the advertised speeds */ + if (fw_speeds & BNXT_LINK_SPEED_MSK_100MB) + speed_mask |= ADVERTISED_100baseT_Full; + if (fw_speeds & BNXT_LINK_SPEED_MSK_1GB) + speed_mask |= ADVERTISED_1000baseT_Full; + if (fw_speeds & BNXT_LINK_SPEED_MSK_2_5GB) + speed_mask |= ADVERTISED_2500baseX_Full; + if (fw_speeds & BNXT_LINK_SPEED_MSK_10GB) + speed_mask |= ADVERTISED_10000baseT_Full; + if (fw_speeds & BNXT_LINK_SPEED_MSK_40GB) + speed_mask |= ADVERTISED_40000baseCR4_Full; + + if ((fw_pause & BNXT_LINK_PAUSE_BOTH) == BNXT_LINK_PAUSE_BOTH) + speed_mask |= ADVERTISED_Pause; + else if (fw_pause & BNXT_LINK_PAUSE_TX) + speed_mask |= ADVERTISED_Asym_Pause; + else if (fw_pause & BNXT_LINK_PAUSE_RX) + speed_mask |= ADVERTISED_Pause | ADVERTISED_Asym_Pause; + + return speed_mask; +} + +#define BNXT_FW_TO_ETHTOOL_SPDS(fw_speeds, fw_pause, lk_ksettings, name)\ +{ \ + if ((fw_speeds) & 
BNXT_LINK_SPEED_MSK_100MB) \ + ethtool_link_ksettings_add_link_mode(lk_ksettings, name,\ + 100baseT_Full); \ + if ((fw_speeds) & BNXT_LINK_SPEED_MSK_1GB) \ + ethtool_link_ksettings_add_link_mode(lk_ksettings, name,\ + 1000baseT_Full); \ + if ((fw_speeds) & BNXT_LINK_SPEED_MSK_10GB) \ + ethtool_link_ksettings_add_link_mode(lk_ksettings, name,\ + 10000baseT_Full); \ + if ((fw_speeds) & BNXT_LINK_SPEED_MSK_25GB) \ + ethtool_link_ksettings_add_link_mode(lk_ksettings, name,\ + 25000baseCR_Full); \ + if ((fw_speeds) & BNXT_LINK_SPEED_MSK_40GB) \ + ethtool_link_ksettings_add_link_mode(lk_ksettings, name,\ + 40000baseCR4_Full);\ + if ((fw_speeds) & BNXT_LINK_SPEED_MSK_50GB) \ + ethtool_link_ksettings_add_link_mode(lk_ksettings, name,\ + 50000baseCR2_Full);\ + if ((fw_speeds) & BNXT_LINK_SPEED_MSK_100GB) \ + ethtool_link_ksettings_add_link_mode(lk_ksettings, name,\ + 100000baseCR4_Full);\ + if ((fw_pause) & BNXT_LINK_PAUSE_RX) { \ + ethtool_link_ksettings_add_link_mode(lk_ksettings, name,\ + Pause); \ + if (!((fw_pause) & BNXT_LINK_PAUSE_TX)) \ + ethtool_link_ksettings_add_link_mode( \ + lk_ksettings, name, Asym_Pause);\ + } else if ((fw_pause) & BNXT_LINK_PAUSE_TX) { \ + ethtool_link_ksettings_add_link_mode(lk_ksettings, name,\ + Asym_Pause); \ + } \ +} + +#define BNXT_ETHTOOL_TO_FW_SPDS(fw_speeds, lk_ksettings, name) \ +{ \ + if (ethtool_link_ksettings_test_link_mode(lk_ksettings, name, \ + 100baseT_Full) || \ + ethtool_link_ksettings_test_link_mode(lk_ksettings, name, \ + 100baseT_Half)) \ + (fw_speeds) |= BNXT_LINK_SPEED_MSK_100MB; \ + if (ethtool_link_ksettings_test_link_mode(lk_ksettings, name, \ + 1000baseT_Full) || \ + ethtool_link_ksettings_test_link_mode(lk_ksettings, name, \ + 1000baseT_Half)) \ + (fw_speeds) |= BNXT_LINK_SPEED_MSK_1GB; \ + if (ethtool_link_ksettings_test_link_mode(lk_ksettings, name, \ + 10000baseT_Full)) \ + (fw_speeds) |= BNXT_LINK_SPEED_MSK_10GB; \ + if (ethtool_link_ksettings_test_link_mode(lk_ksettings, name, \ + 25000baseCR_Full)) \ + (fw_speeds) |= BNXT_LINK_SPEED_MSK_25GB; \ + if (ethtool_link_ksettings_test_link_mode(lk_ksettings, name, \ + 40000baseCR4_Full)) \ + (fw_speeds) |= BNXT_LINK_SPEED_MSK_40GB; \ + if (ethtool_link_ksettings_test_link_mode(lk_ksettings, name, \ + 50000baseCR2_Full)) \ + (fw_speeds) |= BNXT_LINK_SPEED_MSK_50GB; \ + if (ethtool_link_ksettings_test_link_mode(lk_ksettings, name, \ + 100000baseCR4_Full)) \ + (fw_speeds) |= BNXT_LINK_SPEED_MSK_100GB; \ +} + +static void bnxt_fw_to_ethtool_advertised_spds(struct bnxt_link_info *link_info, + struct ethtool_link_ksettings *lk_ksettings) +{ + u16 fw_speeds = link_info->advertising; + u8 fw_pause = 0; + + if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) + fw_pause = link_info->auto_pause_setting; + + BNXT_FW_TO_ETHTOOL_SPDS(fw_speeds, fw_pause, lk_ksettings, advertising); +} + +static void bnxt_fw_to_ethtool_lp_adv(struct bnxt_link_info *link_info, + struct ethtool_link_ksettings *lk_ksettings) +{ + u16 fw_speeds = link_info->lp_auto_link_speeds; + u8 fw_pause = 0; + + if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) + fw_pause = link_info->lp_pause; + + BNXT_FW_TO_ETHTOOL_SPDS(fw_speeds, fw_pause, lk_ksettings, + lp_advertising); +} + +static void bnxt_fw_to_ethtool_support_spds(struct bnxt_link_info *link_info, + struct ethtool_link_ksettings *lk_ksettings) +{ + u16 fw_speeds = link_info->support_speeds; + + BNXT_FW_TO_ETHTOOL_SPDS(fw_speeds, 0, lk_ksettings, supported); + + ethtool_link_ksettings_add_link_mode(lk_ksettings, supported, Pause); + 
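The pause handling in _bnxt_fw_to_ethtool_adv_spds() and in the BNXT_FW_TO_ETHTOOL_SPDS() macro above follows the usual pause/asym-pause advertisement convention: both directions advertise Pause, TX-only advertises Asym_Pause, RX-only advertises both bits. The stand-alone sketch below restates that ladder; the DEMO_* flag values and ADV_* bits are stand-ins, not the driver's real constants, and the sketch is not part of the patch.

/* Illustrative sketch only -- not part of the patch; flag values are stand-ins. */
#include <stdio.h>

#define DEMO_PAUSE_RX	0x1	/* stand-in for BNXT_LINK_PAUSE_RX */
#define DEMO_PAUSE_TX	0x2	/* stand-in for BNXT_LINK_PAUSE_TX */
#define DEMO_PAUSE_BOTH	(DEMO_PAUSE_RX | DEMO_PAUSE_TX)

#define ADV_PAUSE	0x1	/* "Pause" link mode for this demo */
#define ADV_ASYM_PAUSE	0x2	/* "Asym_Pause" link mode for this demo */

/* Same decision ladder as _bnxt_fw_to_ethtool_adv_spds() */
static unsigned int pause_to_adv(unsigned int fw_pause)
{
	if ((fw_pause & DEMO_PAUSE_BOTH) == DEMO_PAUSE_BOTH)
		return ADV_PAUSE;
	if (fw_pause & DEMO_PAUSE_TX)
		return ADV_ASYM_PAUSE;
	if (fw_pause & DEMO_PAUSE_RX)
		return ADV_PAUSE | ADV_ASYM_PAUSE;
	return 0;
}

int main(void)
{
	unsigned int cases[] = { 0, DEMO_PAUSE_RX, DEMO_PAUSE_TX, DEMO_PAUSE_BOTH };
	unsigned int i;

	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
		unsigned int adv = pause_to_adv(cases[i]);

		printf("fw_pause=0x%x -> %s%s%s\n", cases[i],
		       adv & ADV_PAUSE ? "Pause " : "",
		       adv & ADV_ASYM_PAUSE ? "Asym_Pause" : "",
		       adv ? "" : "(none)");
	}
	return 0;
}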
ethtool_link_ksettings_add_link_mode(lk_ksettings, supported, + Asym_Pause); + + if (link_info->support_auto_speeds) + ethtool_link_ksettings_add_link_mode(lk_ksettings, supported, + Autoneg); +} + +u32 bnxt_fw_to_ethtool_speed(u16 fw_link_speed) +{ + switch (fw_link_speed) { + case BNXT_LINK_SPEED_100MB: + return SPEED_100; + case BNXT_LINK_SPEED_1GB: + return SPEED_1000; + case BNXT_LINK_SPEED_2_5GB: + return SPEED_2500; + case BNXT_LINK_SPEED_10GB: + return SPEED_10000; + case BNXT_LINK_SPEED_20GB: + return SPEED_20000; + case BNXT_LINK_SPEED_25GB: + return SPEED_25000; + case BNXT_LINK_SPEED_40GB: + return SPEED_40000; + case BNXT_LINK_SPEED_50GB: + return SPEED_50000; + case BNXT_LINK_SPEED_100GB: + return SPEED_100000; + default: + return SPEED_UNKNOWN; + } +} + +static int bnxt_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *lk_ksettings) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_link_info *link_info = &bp->link_info; + struct ethtool_link_settings *base = &lk_ksettings->base; + u32 ethtool_speed; + + ethtool_link_ksettings_zero_link_mode(lk_ksettings, supported); + mutex_lock(&bp->link_lock); + bnxt_fw_to_ethtool_support_spds(link_info, lk_ksettings); + + ethtool_link_ksettings_zero_link_mode(lk_ksettings, advertising); + if (link_info->autoneg) { + bnxt_fw_to_ethtool_advertised_spds(link_info, lk_ksettings); + ethtool_link_ksettings_add_link_mode(lk_ksettings, + advertising, Autoneg); + base->autoneg = AUTONEG_ENABLE; + if (link_info->phy_link_status == BNXT_LINK_LINK) + bnxt_fw_to_ethtool_lp_adv(link_info, lk_ksettings); + ethtool_speed = bnxt_fw_to_ethtool_speed(link_info->link_speed); + if (!netif_carrier_ok(dev)) + base->duplex = DUPLEX_UNKNOWN; + else if (link_info->duplex & BNXT_LINK_DUPLEX_FULL) + base->duplex = DUPLEX_FULL; + else + base->duplex = DUPLEX_HALF; + } else { + base->autoneg = AUTONEG_DISABLE; + ethtool_speed = + bnxt_fw_to_ethtool_speed(link_info->req_link_speed); + base->duplex = DUPLEX_HALF; + if (link_info->req_duplex == BNXT_LINK_DUPLEX_FULL) + base->duplex = DUPLEX_FULL; + } + base->speed = ethtool_speed; + + base->port = PORT_NONE; + if (link_info->media_type == PORT_PHY_QCFG_RESP_MEDIA_TYPE_TP) { + base->port = PORT_TP; + ethtool_link_ksettings_add_link_mode(lk_ksettings, supported, + TP); + ethtool_link_ksettings_add_link_mode(lk_ksettings, advertising, + TP); + } else { + ethtool_link_ksettings_add_link_mode(lk_ksettings, supported, + FIBRE); + ethtool_link_ksettings_add_link_mode(lk_ksettings, advertising, + FIBRE); + + if (link_info->media_type == PORT_PHY_QCFG_RESP_MEDIA_TYPE_DAC) + base->port = PORT_DA; + else if (link_info->media_type == + PORT_PHY_QCFG_RESP_MEDIA_TYPE_FIBRE) + base->port = PORT_FIBRE; + } + base->phy_address = link_info->phy_addr; + mutex_unlock(&bp->link_lock); + + return 0; +} + +static u32 bnxt_get_fw_speed(struct net_device *dev, u32 ethtool_speed) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_link_info *link_info = &bp->link_info; + u16 support_spds = link_info->support_speeds; + u32 fw_speed = 0; + + switch (ethtool_speed) { + case SPEED_100: + if (support_spds & BNXT_LINK_SPEED_MSK_100MB) + fw_speed = PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_100MB; + break; + case SPEED_1000: + if (support_spds & BNXT_LINK_SPEED_MSK_1GB) + fw_speed = PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_1GB; + break; + case SPEED_2500: + if (support_spds & BNXT_LINK_SPEED_MSK_2_5GB) + fw_speed = PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_2_5GB; + break; + case SPEED_10000: + if (support_spds & BNXT_LINK_SPEED_MSK_10GB) + 
fw_speed = PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_10GB; + break; + case SPEED_20000: + if (support_spds & BNXT_LINK_SPEED_MSK_20GB) + fw_speed = PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_20GB; + break; + case SPEED_25000: + if (support_spds & BNXT_LINK_SPEED_MSK_25GB) + fw_speed = PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_25GB; + break; + case SPEED_40000: + if (support_spds & BNXT_LINK_SPEED_MSK_40GB) + fw_speed = PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_40GB; + break; + case SPEED_50000: + if (support_spds & BNXT_LINK_SPEED_MSK_50GB) + fw_speed = PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_50GB; + break; + case SPEED_100000: + if (support_spds & BNXT_LINK_SPEED_MSK_100GB) + fw_speed = PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_100GB; + break; + default: + netdev_err(dev, "unsupported speed!\n"); + break; + } + return fw_speed; +} + +u16 bnxt_get_fw_auto_link_speeds(u32 advertising) +{ + u16 fw_speed_mask = 0; + + /* only support autoneg at speed 100, 1000, and 10000 */ + if (advertising & (ADVERTISED_100baseT_Full | + ADVERTISED_100baseT_Half)) { + fw_speed_mask |= BNXT_LINK_SPEED_MSK_100MB; + } + if (advertising & (ADVERTISED_1000baseT_Full | + ADVERTISED_1000baseT_Half)) { + fw_speed_mask |= BNXT_LINK_SPEED_MSK_1GB; + } + if (advertising & ADVERTISED_10000baseT_Full) + fw_speed_mask |= BNXT_LINK_SPEED_MSK_10GB; + + if (advertising & ADVERTISED_40000baseCR4_Full) + fw_speed_mask |= BNXT_LINK_SPEED_MSK_40GB; + + return fw_speed_mask; +} + +static int bnxt_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *lk_ksettings) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_link_info *link_info = &bp->link_info; + const struct ethtool_link_settings *base = &lk_ksettings->base; + bool set_pause = false; + u16 fw_advertising = 0; + u32 speed; + int rc = 0; + + if (!BNXT_SINGLE_PF(bp)) + return -EOPNOTSUPP; + + mutex_lock(&bp->link_lock); + if (base->autoneg == AUTONEG_ENABLE) { + BNXT_ETHTOOL_TO_FW_SPDS(fw_advertising, lk_ksettings, + advertising); + link_info->autoneg |= BNXT_AUTONEG_SPEED; + if (!fw_advertising) + link_info->advertising = link_info->support_auto_speeds; + else + link_info->advertising = fw_advertising; + /* any change to autoneg will cause link change, therefore the + * driver should put back the original pause setting in autoneg + */ + set_pause = true; + } else { + u16 fw_speed; + u8 phy_type = link_info->phy_type; + + if (phy_type == PORT_PHY_QCFG_RESP_PHY_TYPE_BASET || + phy_type == PORT_PHY_QCFG_RESP_PHY_TYPE_BASETE || + link_info->media_type == PORT_PHY_QCFG_RESP_MEDIA_TYPE_TP) { + netdev_err(dev, "10GBase-T devices must autoneg\n"); + rc = -EINVAL; + goto set_setting_exit; + } + if (base->duplex == DUPLEX_HALF) { + netdev_err(dev, "HALF DUPLEX is not supported!\n"); + rc = -EINVAL; + goto set_setting_exit; + } + speed = base->speed; + fw_speed = bnxt_get_fw_speed(dev, speed); + if (!fw_speed) { + rc = -EINVAL; + goto set_setting_exit; + } + link_info->req_link_speed = fw_speed; + link_info->req_duplex = BNXT_LINK_DUPLEX_FULL; + link_info->autoneg = 0; + link_info->advertising = 0; + } + + if (netif_running(dev)) + rc = bnxt_hwrm_set_link_setting(bp, set_pause, false); + +set_setting_exit: + mutex_unlock(&bp->link_lock); + return rc; +} + +static void bnxt_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_link_info *link_info = &bp->link_info; + + if (BNXT_VF(bp)) + return; + epause->autoneg = !!(link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL); + epause->rx_pause = !!(link_info->req_flow_ctrl & 
BNXT_LINK_PAUSE_RX); + epause->tx_pause = !!(link_info->req_flow_ctrl & BNXT_LINK_PAUSE_TX); +} + +static int bnxt_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + int rc = 0; + struct bnxt *bp = netdev_priv(dev); + struct bnxt_link_info *link_info = &bp->link_info; + + if (!BNXT_SINGLE_PF(bp)) + return -EOPNOTSUPP; + + if (epause->autoneg) { + if (!(link_info->autoneg & BNXT_AUTONEG_SPEED)) + return -EINVAL; + + link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL; + if (bp->hwrm_spec_code >= 0x10201) + link_info->req_flow_ctrl = + PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE; + } else { + /* when transition from auto pause to force pause, + * force a link change + */ + if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) + link_info->force_link_chng = true; + link_info->autoneg &= ~BNXT_AUTONEG_FLOW_CTRL; + link_info->req_flow_ctrl = 0; + } + if (epause->rx_pause) + link_info->req_flow_ctrl |= BNXT_LINK_PAUSE_RX; + + if (epause->tx_pause) + link_info->req_flow_ctrl |= BNXT_LINK_PAUSE_TX; + + if (netif_running(dev)) + rc = bnxt_hwrm_set_pause(bp); + return rc; +} + +static u32 bnxt_get_link(struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + + /* TODO: handle MF, VF, driver close case */ + return bp->link_info.link_up; +} + +static int bnxt_find_nvram_item(struct net_device *dev, u16 type, u16 ordinal, + u16 ext, u16 *index, u32 *item_length, + u32 *data_length); + +static int bnxt_flash_nvram(struct net_device *dev, + u16 dir_type, + u16 dir_ordinal, + u16 dir_ext, + u16 dir_attr, + const u8 *data, + size_t data_len) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + struct hwrm_nvm_write_input req = {0}; + dma_addr_t dma_handle; + u8 *kmem; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_WRITE, -1, -1); + + req.dir_type = cpu_to_le16(dir_type); + req.dir_ordinal = cpu_to_le16(dir_ordinal); + req.dir_ext = cpu_to_le16(dir_ext); + req.dir_attr = cpu_to_le16(dir_attr); + req.dir_data_length = cpu_to_le32(data_len); + + kmem = dma_alloc_coherent(&bp->pdev->dev, data_len, &dma_handle, + GFP_KERNEL); + if (!kmem) { + netdev_err(dev, "dma_alloc_coherent failure, length = %u\n", + (unsigned)data_len); + return -ENOMEM; + } + memcpy(kmem, data, data_len); + req.host_src_addr = cpu_to_le64(dma_handle); + + rc = hwrm_send_message(bp, &req, sizeof(req), FLASH_NVRAM_TIMEOUT); + dma_free_coherent(&bp->pdev->dev, data_len, kmem, dma_handle); + + return rc; +} + +static int bnxt_firmware_reset(struct net_device *dev, + u16 dir_type) +{ + struct bnxt *bp = netdev_priv(dev); + struct hwrm_fw_reset_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FW_RESET, -1, -1); + + /* TODO: Address self-reset of APE/KONG/BONO/TANG or ungraceful reset */ + /* (e.g. 
when firmware isn't already running) */ + switch (dir_type) { + case BNX_DIR_TYPE_CHIMP_PATCH: + case BNX_DIR_TYPE_BOOTCODE: + case BNX_DIR_TYPE_BOOTCODE_2: + req.embedded_proc_type = FW_RESET_REQ_EMBEDDED_PROC_TYPE_BOOT; + /* Self-reset ChiMP upon next PCIe reset: */ + req.selfrst_status = FW_RESET_REQ_SELFRST_STATUS_SELFRSTPCIERST; + break; + case BNX_DIR_TYPE_APE_FW: + case BNX_DIR_TYPE_APE_PATCH: + req.embedded_proc_type = FW_RESET_REQ_EMBEDDED_PROC_TYPE_MGMT; + /* Self-reset APE upon next PCIe reset: */ + req.selfrst_status = FW_RESET_REQ_SELFRST_STATUS_SELFRSTPCIERST; + break; + case BNX_DIR_TYPE_KONG_FW: + case BNX_DIR_TYPE_KONG_PATCH: + req.embedded_proc_type = + FW_RESET_REQ_EMBEDDED_PROC_TYPE_NETCTRL; + break; + case BNX_DIR_TYPE_BONO_FW: + case BNX_DIR_TYPE_BONO_PATCH: + req.embedded_proc_type = FW_RESET_REQ_EMBEDDED_PROC_TYPE_ROCE; + break; + case BNXT_FW_RESET_CHIP: + req.embedded_proc_type = FW_RESET_REQ_EMBEDDED_PROC_TYPE_CHIP; + req.selfrst_status = FW_RESET_REQ_SELFRST_STATUS_SELFRSTASAP; + break; + case BNXT_FW_RESET_AP: + req.embedded_proc_type = FW_RESET_REQ_EMBEDDED_PROC_TYPE_AP; + break; + default: + return -EINVAL; + } + + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_flash_firmware(struct net_device *dev, + u16 dir_type, + const u8 *fw_data, + size_t fw_size) +{ + int rc = 0; + u16 code_type; + u32 stored_crc; + u32 calculated_crc; + struct bnxt_fw_header *header = (struct bnxt_fw_header *)fw_data; + + switch (dir_type) { + case BNX_DIR_TYPE_BOOTCODE: + case BNX_DIR_TYPE_BOOTCODE_2: + code_type = CODE_BOOT; + break; + case BNX_DIR_TYPE_CHIMP_PATCH: + code_type = CODE_CHIMP_PATCH; + break; + case BNX_DIR_TYPE_APE_FW: + code_type = CODE_MCTP_PASSTHRU; + break; + case BNX_DIR_TYPE_APE_PATCH: + code_type = CODE_APE_PATCH; + break; + case BNX_DIR_TYPE_KONG_FW: + code_type = CODE_KONG_FW; + break; + case BNX_DIR_TYPE_KONG_PATCH: + code_type = CODE_KONG_PATCH; + break; + case BNX_DIR_TYPE_BONO_FW: + code_type = CODE_BONO_FW; + break; + case BNX_DIR_TYPE_BONO_PATCH: + code_type = CODE_BONO_PATCH; + break; + default: + netdev_err(dev, "Unsupported directory entry type: %u\n", + dir_type); + return -EINVAL; + } + if (fw_size < sizeof(struct bnxt_fw_header)) { + netdev_err(dev, "Invalid firmware file size: %u\n", + (unsigned int)fw_size); + return -EINVAL; + } + if (header->signature != cpu_to_le32(BNXT_FIRMWARE_BIN_SIGNATURE)) { + netdev_err(dev, "Invalid firmware signature: %08X\n", + le32_to_cpu(header->signature)); + return -EINVAL; + } + if (header->code_type != code_type) { + netdev_err(dev, "Expected firmware type: %d, read: %d\n", + code_type, header->code_type); + return -EINVAL; + } + if (header->device != DEVICE_CUMULUS_FAMILY) { + netdev_err(dev, "Expected firmware device family %d, read: %d\n", + DEVICE_CUMULUS_FAMILY, header->device); + return -EINVAL; + } + /* Confirm the CRC32 checksum of the file: */ + stored_crc = le32_to_cpu(*(__le32 *)(fw_data + fw_size - + sizeof(stored_crc))); + calculated_crc = ~crc32(~0, fw_data, fw_size - sizeof(stored_crc)); + if (calculated_crc != stored_crc) { + netdev_err(dev, "Firmware file CRC32 checksum (%08lX) does not match calculated checksum (%08lX)\n", + (unsigned long)stored_crc, + (unsigned long)calculated_crc); + return -EINVAL; + } + rc = bnxt_flash_nvram(dev, dir_type, BNX_DIR_ORDINAL_FIRST, + 0, 0, fw_data, fw_size); + if (rc == 0) /* Firmware update successful */ + rc = bnxt_firmware_reset(dev, dir_type); + + return rc; +} + +static int bnxt_flash_microcode(struct net_device 
*dev, + u16 dir_type, + const u8 *fw_data, + size_t fw_size) +{ + struct bnxt_ucode_trailer *trailer; + u32 calculated_crc; + u32 stored_crc; + int rc = 0; + + if (fw_size < sizeof(struct bnxt_ucode_trailer)) { + netdev_err(dev, "Invalid microcode file size: %u\n", + (unsigned int)fw_size); + return -EINVAL; + } + trailer = (struct bnxt_ucode_trailer *)(fw_data + (fw_size - + sizeof(*trailer))); + if (trailer->sig != cpu_to_le32(BNXT_UCODE_TRAILER_SIGNATURE)) { + netdev_err(dev, "Invalid microcode trailer signature: %08X\n", + le32_to_cpu(trailer->sig)); + return -EINVAL; + } + if (le16_to_cpu(trailer->dir_type) != dir_type) { + netdev_err(dev, "Expected microcode type: %d, read: %d\n", + dir_type, le16_to_cpu(trailer->dir_type)); + return -EINVAL; + } + if (le16_to_cpu(trailer->trailer_length) < + sizeof(struct bnxt_ucode_trailer)) { + netdev_err(dev, "Invalid microcode trailer length: %d\n", + le16_to_cpu(trailer->trailer_length)); + return -EINVAL; + } + + /* Confirm the CRC32 checksum of the file: */ + stored_crc = le32_to_cpu(*(__le32 *)(fw_data + fw_size - + sizeof(stored_crc))); + calculated_crc = ~crc32(~0, fw_data, fw_size - sizeof(stored_crc)); + if (calculated_crc != stored_crc) { + netdev_err(dev, + "CRC32 (%08lX) does not match calculated: %08lX\n", + (unsigned long)stored_crc, + (unsigned long)calculated_crc); + return -EINVAL; + } + rc = bnxt_flash_nvram(dev, dir_type, BNX_DIR_ORDINAL_FIRST, + 0, 0, fw_data, fw_size); + + return rc; +} + +static bool bnxt_dir_type_is_ape_bin_format(u16 dir_type) +{ + switch (dir_type) { + case BNX_DIR_TYPE_CHIMP_PATCH: + case BNX_DIR_TYPE_BOOTCODE: + case BNX_DIR_TYPE_BOOTCODE_2: + case BNX_DIR_TYPE_APE_FW: + case BNX_DIR_TYPE_APE_PATCH: + case BNX_DIR_TYPE_KONG_FW: + case BNX_DIR_TYPE_KONG_PATCH: + case BNX_DIR_TYPE_BONO_FW: + case BNX_DIR_TYPE_BONO_PATCH: + return true; + } + + return false; +} + +static bool bnxt_dir_type_is_other_exec_format(u16 dir_type) +{ + switch (dir_type) { + case BNX_DIR_TYPE_AVS: + case BNX_DIR_TYPE_EXP_ROM_MBA: + case BNX_DIR_TYPE_PCIE: + case BNX_DIR_TYPE_TSCF_UCODE: + case BNX_DIR_TYPE_EXT_PHY: + case BNX_DIR_TYPE_CCM: + case BNX_DIR_TYPE_ISCSI_BOOT: + case BNX_DIR_TYPE_ISCSI_BOOT_IPV6: + case BNX_DIR_TYPE_ISCSI_BOOT_IPV4N6: + return true; + } + + return false; +} + +static bool bnxt_dir_type_is_executable(u16 dir_type) +{ + return bnxt_dir_type_is_ape_bin_format(dir_type) || + bnxt_dir_type_is_other_exec_format(dir_type); +} + +static int bnxt_flash_firmware_from_file(struct net_device *dev, + u16 dir_type, + const char *filename) +{ + const struct firmware *fw; + int rc; + + rc = request_firmware(&fw, filename, &dev->dev); + if (rc != 0) { + netdev_err(dev, "Error %d requesting firmware file: %s\n", + rc, filename); + return rc; + } + if (bnxt_dir_type_is_ape_bin_format(dir_type) == true) + rc = bnxt_flash_firmware(dev, dir_type, fw->data, fw->size); + else if (bnxt_dir_type_is_other_exec_format(dir_type) == true) + rc = bnxt_flash_microcode(dev, dir_type, fw->data, fw->size); + else + rc = bnxt_flash_nvram(dev, dir_type, BNX_DIR_ORDINAL_FIRST, + 0, 0, fw->data, fw->size); + release_firmware(fw); + return rc; +} + +static int bnxt_flash_package_from_file(struct net_device *dev, + char *filename, u32 install_type) +{ + struct bnxt *bp = netdev_priv(dev); + struct hwrm_nvm_install_update_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_nvm_install_update_input install = {0}; + const struct firmware *fw; + u32 item_len; + u16 index; + int rc; + + bnxt_hwrm_fw_set_time(bp); + + if (bnxt_find_nvram_item(dev, 
BNX_DIR_TYPE_UPDATE, + BNX_DIR_ORDINAL_FIRST, BNX_DIR_EXT_NONE, + &index, &item_len, NULL) != 0) { + netdev_err(dev, "PKG update area not created in nvram\n"); + return -ENOBUFS; + } + + rc = request_firmware(&fw, filename, &dev->dev); + if (rc != 0) { + netdev_err(dev, "PKG error %d requesting file: %s\n", + rc, filename); + return rc; + } + + if (fw->size > item_len) { + netdev_err(dev, "PKG insufficient update area in nvram: %lu", + (unsigned long)fw->size); + rc = -EFBIG; + } else { + dma_addr_t dma_handle; + u8 *kmem; + struct hwrm_nvm_modify_input modify = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &modify, HWRM_NVM_MODIFY, -1, -1); + + modify.dir_idx = cpu_to_le16(index); + modify.len = cpu_to_le32(fw->size); + + kmem = dma_alloc_coherent(&bp->pdev->dev, fw->size, + &dma_handle, GFP_KERNEL); + if (!kmem) { + netdev_err(dev, + "dma_alloc_coherent failure, length = %u\n", + (unsigned int)fw->size); + rc = -ENOMEM; + } else { + memcpy(kmem, fw->data, fw->size); + modify.host_src_addr = cpu_to_le64(dma_handle); + + rc = hwrm_send_message(bp, &modify, sizeof(modify), + FLASH_PACKAGE_TIMEOUT); + dma_free_coherent(&bp->pdev->dev, fw->size, kmem, + dma_handle); + } + } + release_firmware(fw); + if (rc) + return rc; + + if ((install_type & 0xffff) == 0) + install_type >>= 16; + bnxt_hwrm_cmd_hdr_init(bp, &install, HWRM_NVM_INSTALL_UPDATE, -1, -1); + install.install_type = cpu_to_le32(install_type); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &install, sizeof(install), + INSTALL_PACKAGE_TIMEOUT); + if (rc) { + rc = -EOPNOTSUPP; + goto flash_pkg_exit; + } + + if (resp->error_code) { + u8 error_code = ((struct hwrm_err_output *)resp)->cmd_err; + + if (error_code == NVM_INSTALL_UPDATE_CMD_ERR_CODE_FRAG_ERR) { + install.flags |= cpu_to_le16( + NVM_INSTALL_UPDATE_REQ_FLAGS_ALLOWED_TO_DEFRAG); + rc = _hwrm_send_message(bp, &install, sizeof(install), + INSTALL_PACKAGE_TIMEOUT); + if (rc) { + rc = -EOPNOTSUPP; + goto flash_pkg_exit; + } + } + } + + if (resp->result) { + netdev_err(dev, "PKG install error = %d, problem_item = %d\n", + (s8)resp->result, (int)resp->problem_item); + rc = -ENOPKG; + } +flash_pkg_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_flash_device(struct net_device *dev, + struct ethtool_flash *flash) +{ + if (!BNXT_PF((struct bnxt *)netdev_priv(dev))) { + netdev_err(dev, "flashdev not supported from a virtual function\n"); + return -EINVAL; + } + + if (flash->region == ETHTOOL_FLASH_ALL_REGIONS || + flash->region > 0xffff) + return bnxt_flash_package_from_file(dev, flash->data, + flash->region); + + return bnxt_flash_firmware_from_file(dev, flash->region, flash->data); +} + +static int nvm_get_dir_info(struct net_device *dev, u32 *entries, u32 *length) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + struct hwrm_nvm_get_dir_info_input req = {0}; + struct hwrm_nvm_get_dir_info_output *output = bp->hwrm_cmd_resp_addr; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_GET_DIR_INFO, -1, -1); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + *entries = le32_to_cpu(output->entries); + *length = le32_to_cpu(output->entry_length); + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_get_eeprom_len(struct net_device *dev) +{ + /* The -1 return value allows the entire 32-bit range of offsets to be + * passed via the ethtool command-line utility. 
+ */ + return -1; +} + +static int bnxt_get_nvram_directory(struct net_device *dev, u32 len, u8 *data) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + u32 dir_entries; + u32 entry_length; + u8 *buf; + size_t buflen; + dma_addr_t dma_handle; + struct hwrm_nvm_get_dir_entries_input req = {0}; + + rc = nvm_get_dir_info(dev, &dir_entries, &entry_length); + if (rc != 0) + return rc; + + /* Insert 2 bytes of directory info (count and size of entries) */ + if (len < 2) + return -EINVAL; + + *data++ = dir_entries; + *data++ = entry_length; + len -= 2; + memset(data, 0xff, len); + + buflen = dir_entries * entry_length; + buf = dma_alloc_coherent(&bp->pdev->dev, buflen, &dma_handle, + GFP_KERNEL); + if (!buf) { + netdev_err(dev, "dma_alloc_coherent failure, length = %u\n", + (unsigned)buflen); + return -ENOMEM; + } + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_GET_DIR_ENTRIES, -1, -1); + req.host_dest_addr = cpu_to_le64(dma_handle); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc == 0) + memcpy(data, buf, len > buflen ? buflen : len); + dma_free_coherent(&bp->pdev->dev, buflen, buf, dma_handle); + return rc; +} + +static int bnxt_get_nvram_item(struct net_device *dev, u32 index, u32 offset, + u32 length, u8 *data) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + u8 *buf; + dma_addr_t dma_handle; + struct hwrm_nvm_read_input req = {0}; + + if (!length) + return -EINVAL; + + buf = dma_alloc_coherent(&bp->pdev->dev, length, &dma_handle, + GFP_KERNEL); + if (!buf) { + netdev_err(dev, "dma_alloc_coherent failure, length = %u\n", + (unsigned)length); + return -ENOMEM; + } + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_READ, -1, -1); + req.host_dest_addr = cpu_to_le64(dma_handle); + req.dir_idx = cpu_to_le16(index); + req.offset = cpu_to_le32(offset); + req.len = cpu_to_le32(length); + + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc == 0) + memcpy(data, buf, length); + dma_free_coherent(&bp->pdev->dev, length, buf, dma_handle); + return rc; +} + +static int bnxt_find_nvram_item(struct net_device *dev, u16 type, u16 ordinal, + u16 ext, u16 *index, u32 *item_length, + u32 *data_length) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + struct hwrm_nvm_find_dir_entry_input req = {0}; + struct hwrm_nvm_find_dir_entry_output *output = bp->hwrm_cmd_resp_addr; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_FIND_DIR_ENTRY, -1, -1); + req.enables = 0; + req.dir_idx = 0; + req.dir_type = cpu_to_le16(type); + req.dir_ordinal = cpu_to_le16(ordinal); + req.dir_ext = cpu_to_le16(ext); + req.opt_ordinal = NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_EQ; + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message_silent(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc == 0) { + if (index) + *index = le16_to_cpu(output->dir_idx); + if (item_length) + *item_length = le32_to_cpu(output->dir_item_length); + if (data_length) + *data_length = le32_to_cpu(output->dir_data_length); + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static char *bnxt_parse_pkglog(int desired_field, u8 *data, size_t datalen) +{ + char *retval = NULL; + char *p; + char *value; + int field = 0; + + if (datalen < 1) + return NULL; + /* null-terminate the log data (removing last '\n'): */ + data[datalen - 1] = 0; + for (p = data; *p != 0; p++) { + field = 0; + retval = NULL; + while (*p != 0 && *p != '\n') { + value = p; + while (*p != 0 && *p != '\t' && *p != '\n') + p++; + if (field == desired_field) + retval = value; + if (*p != '\t') + break; + *p = 0; + field++; + p++; + } + if (*p == 0) 
+ break; + *p = 0; + } + return retval; +} + +static void bnxt_get_pkgver(struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + u16 index = 0; + char *pkgver; + u32 pkglen; + u8 *pkgbuf; + int len; + + if (bnxt_find_nvram_item(dev, BNX_DIR_TYPE_PKG_LOG, + BNX_DIR_ORDINAL_FIRST, BNX_DIR_EXT_NONE, + &index, NULL, &pkglen) != 0) + return; + + pkgbuf = kzalloc(pkglen, GFP_KERNEL); + if (!pkgbuf) { + dev_err(&bp->pdev->dev, "Unable to allocate memory for pkg version, length = %u\n", + pkglen); + return; + } + + if (bnxt_get_nvram_item(dev, index, 0, pkglen, pkgbuf)) + goto err; + + pkgver = bnxt_parse_pkglog(BNX_PKG_LOG_FIELD_IDX_PKG_VERSION, pkgbuf, + pkglen); + if (pkgver && *pkgver != 0 && isdigit(*pkgver)) { + len = strlen(bp->fw_ver_str); + snprintf(bp->fw_ver_str + len, FW_VER_STR_LEN - len - 1, + "/pkg %s", pkgver); + } +err: + kfree(pkgbuf); +} + +static int bnxt_get_eeprom(struct net_device *dev, + struct ethtool_eeprom *eeprom, + u8 *data) +{ + u32 index; + u32 offset; + + if (eeprom->offset == 0) /* special offset value to get directory */ + return bnxt_get_nvram_directory(dev, eeprom->len, data); + + index = eeprom->offset >> 24; + offset = eeprom->offset & 0xffffff; + + if (index == 0) { + netdev_err(dev, "unsupported index value: %d\n", index); + return -EINVAL; + } + + return bnxt_get_nvram_item(dev, index - 1, offset, eeprom->len, data); +} + +static int bnxt_erase_nvram_directory(struct net_device *dev, u8 index) +{ + struct bnxt *bp = netdev_priv(dev); + struct hwrm_nvm_erase_dir_entry_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_ERASE_DIR_ENTRY, -1, -1); + req.dir_idx = cpu_to_le16(index); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_set_eeprom(struct net_device *dev, + struct ethtool_eeprom *eeprom, + u8 *data) +{ + struct bnxt *bp = netdev_priv(dev); + u8 index, dir_op; + u16 type, ext, ordinal, attr; + + if (!BNXT_PF(bp)) { + netdev_err(dev, "NVM write not supported from a virtual function\n"); + return -EINVAL; + } + + type = eeprom->magic >> 16; + + if (type == 0xffff) { /* special value for directory operations */ + index = eeprom->magic & 0xff; + dir_op = eeprom->magic >> 8; + if (index == 0) + return -EINVAL; + switch (dir_op) { + case 0x0e: /* erase */ + if (eeprom->offset != ~eeprom->magic) + return -EINVAL; + return bnxt_erase_nvram_directory(dev, index - 1); + default: + return -EINVAL; + } + } + + /* Create or re-write an NVM item: */ + if (bnxt_dir_type_is_executable(type) == true) + return -EOPNOTSUPP; + ext = eeprom->magic & 0xffff; + ordinal = eeprom->offset >> 16; + attr = eeprom->offset & 0xffff; + + return bnxt_flash_nvram(dev, type, ordinal, ext, attr, data, + eeprom->len); +} + +static int bnxt_set_eee(struct net_device *dev, struct ethtool_eee *edata) +{ + struct bnxt *bp = netdev_priv(dev); + struct ethtool_eee *eee = &bp->eee; + struct bnxt_link_info *link_info = &bp->link_info; + u32 advertising = + _bnxt_fw_to_ethtool_adv_spds(link_info->advertising, 0); + int rc = 0; + + if (!BNXT_SINGLE_PF(bp)) + return -EOPNOTSUPP; + + if (!(bp->flags & BNXT_FLAG_EEE_CAP)) + return -EOPNOTSUPP; + + if (!edata->eee_enabled) + goto eee_ok; + + if (!(link_info->autoneg & BNXT_AUTONEG_SPEED)) { + netdev_warn(dev, "EEE requires autoneg\n"); + return -EINVAL; + } + if (edata->tx_lpi_enabled) { + if (bp->lpi_tmr_hi && (edata->tx_lpi_timer > bp->lpi_tmr_hi || + edata->tx_lpi_timer < bp->lpi_tmr_lo)) { + netdev_warn(dev, "Valid LPI timer range is %d and %d microsecs\n", + bp->lpi_tmr_lo, 
bp->lpi_tmr_hi); + return -EINVAL; + } else if (!bp->lpi_tmr_hi) { + edata->tx_lpi_timer = eee->tx_lpi_timer; + } + } + if (!edata->advertised) { + edata->advertised = advertising & eee->supported; + } else if (edata->advertised & ~advertising) { + netdev_warn(dev, "EEE advertised %x must be a subset of autoneg advertised speeds %x\n", + edata->advertised, advertising); + return -EINVAL; + } + + eee->advertised = edata->advertised; + eee->tx_lpi_enabled = edata->tx_lpi_enabled; + eee->tx_lpi_timer = edata->tx_lpi_timer; +eee_ok: + eee->eee_enabled = edata->eee_enabled; + + if (netif_running(dev)) + rc = bnxt_hwrm_set_link_setting(bp, false, true); + + return rc; +} + +static int bnxt_get_eee(struct net_device *dev, struct ethtool_eee *edata) +{ + struct bnxt *bp = netdev_priv(dev); + + if (!(bp->flags & BNXT_FLAG_EEE_CAP)) + return -EOPNOTSUPP; + + *edata = bp->eee; + if (!bp->eee.eee_enabled) { + /* Preserve tx_lpi_timer so that the last value will be used + * by default when it is re-enabled. + */ + edata->advertised = 0; + edata->tx_lpi_enabled = 0; + } + + if (!bp->eee.eee_active) + edata->lp_advertised = 0; + + return 0; +} + +static int bnxt_read_sfp_module_eeprom_info(struct bnxt *bp, u16 i2c_addr, + u16 page_number, u16 start_addr, + u16 data_length, u8 *buf) +{ + struct hwrm_port_phy_i2c_read_input req = {0}; + struct hwrm_port_phy_i2c_read_output *output = bp->hwrm_cmd_resp_addr; + int rc, byte_offset = 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_I2C_READ, -1, -1); + req.i2c_slave_addr = i2c_addr; + req.page_number = cpu_to_le16(page_number); + req.port_id = cpu_to_le16(bp->pf.port_id); + do { + u16 xfer_size; + + xfer_size = min_t(u16, data_length, BNXT_MAX_PHY_I2C_RESP_SIZE); + data_length -= xfer_size; + req.page_offset = cpu_to_le16(start_addr + byte_offset); + req.data_length = xfer_size; + req.enables = cpu_to_le32(start_addr + byte_offset ? 
+ PORT_PHY_I2C_READ_REQ_ENABLES_PAGE_OFFSET : 0); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + if (!rc) + memcpy(buf + byte_offset, output->data, xfer_size); + mutex_unlock(&bp->hwrm_cmd_lock); + byte_offset += xfer_size; + } while (!rc && data_length > 0); + + return rc; +} + +static int bnxt_get_module_info(struct net_device *dev, + struct ethtool_modinfo *modinfo) +{ + struct bnxt *bp = netdev_priv(dev); + struct hwrm_port_phy_i2c_read_input req = {0}; + struct hwrm_port_phy_i2c_read_output *output = bp->hwrm_cmd_resp_addr; + int rc; + + /* No point in going further if phy status indicates + * module is not inserted or if it is powered down or + * if it is of type 10GBase-T + */ + if (bp->link_info.module_status > + PORT_PHY_QCFG_RESP_MODULE_STATUS_WARNINGMSG) + return -EOPNOTSUPP; + + /* This feature is not supported in older firmware versions */ + if (bp->hwrm_spec_code < 0x10202) + return -EOPNOTSUPP; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_I2C_READ, -1, -1); + req.i2c_slave_addr = I2C_DEV_ADDR_A0; + req.page_number = 0; + req.page_offset = cpu_to_le16(SFP_EEPROM_SFF_8472_COMP_ADDR); + req.data_length = SFP_EEPROM_SFF_8472_COMP_SIZE; + req.port_id = cpu_to_le16(bp->pf.port_id); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + u32 module_id = le32_to_cpu(output->data[0]); + + switch (module_id) { + case SFF_MODULE_ID_SFP: + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + break; + case SFF_MODULE_ID_QSFP: + case SFF_MODULE_ID_QSFP_PLUS: + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; + break; + case SFF_MODULE_ID_QSFP28: + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN; + break; + default: + rc = -EOPNOTSUPP; + break; + } + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_get_module_eeprom(struct net_device *dev, + struct ethtool_eeprom *eeprom, + u8 *data) +{ + struct bnxt *bp = netdev_priv(dev); + u16 start = eeprom->offset, length = eeprom->len; + int rc = 0; + + memset(data, 0, eeprom->len); + + /* Read A0 portion of the EEPROM */ + if (start < ETH_MODULE_SFF_8436_LEN) { + if (start + eeprom->len > ETH_MODULE_SFF_8436_LEN) + length = ETH_MODULE_SFF_8436_LEN - start; + rc = bnxt_read_sfp_module_eeprom_info(bp, I2C_DEV_ADDR_A0, 0, + start, length, data); + if (rc) + return rc; + start += length; + data += length; + length = eeprom->len - length; + } + + /* Read A2 portion of the EEPROM */ + if (length) { + start -= ETH_MODULE_SFF_8436_LEN; + rc = bnxt_read_sfp_module_eeprom_info(bp, I2C_DEV_ADDR_A2, 1, + start, length, data); + } + return rc; +} + +static int bnxt_nway_reset(struct net_device *dev) +{ + int rc = 0; + + struct bnxt *bp = netdev_priv(dev); + struct bnxt_link_info *link_info = &bp->link_info; + + if (!BNXT_SINGLE_PF(bp)) + return -EOPNOTSUPP; + + if (!(link_info->autoneg & BNXT_AUTONEG_SPEED)) + return -EINVAL; + + if (netif_running(dev)) + rc = bnxt_hwrm_set_link_setting(bp, true, false); + + return rc; +} + +static int bnxt_set_phys_id(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + struct hwrm_port_led_cfg_input req = {0}; + struct bnxt *bp = netdev_priv(dev); + struct bnxt_pf_info *pf = &bp->pf; + struct bnxt_led_cfg *led_cfg; + u8 led_state; + __le16 duration; + int i, rc; + + if (!bp->num_leds || BNXT_VF(bp)) + return -EOPNOTSUPP; + + if (state == ETHTOOL_ID_ACTIVE) { + 
led_state = PORT_LED_CFG_REQ_LED0_STATE_BLINKALT; + duration = cpu_to_le16(500); + } else if (state == ETHTOOL_ID_INACTIVE) { + led_state = PORT_LED_CFG_REQ_LED1_STATE_DEFAULT; + duration = cpu_to_le16(0); + } else { + return -EINVAL; + } + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_LED_CFG, -1, -1); + req.port_id = cpu_to_le16(pf->port_id); + req.num_leds = bp->num_leds; + led_cfg = (struct bnxt_led_cfg *)&req.led0_id; + for (i = 0; i < bp->num_leds; i++, led_cfg++) { + req.enables |= BNXT_LED_DFLT_ENABLES(i); + led_cfg->led_id = bp->leds[i].led_id; + led_cfg->led_state = led_state; + led_cfg->led_blink_on = duration; + led_cfg->led_blink_off = duration; + led_cfg->led_group_id = bp->leds[i].led_group_id; + } + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + rc = -EIO; + return rc; +} + +static int bnxt_hwrm_selftest_irq(struct bnxt *bp, u16 cmpl_ring) +{ + struct hwrm_selftest_irq_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_SELFTEST_IRQ, cmpl_ring, -1); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_test_irq(struct bnxt *bp) +{ + int i; + + for (i = 0; i < bp->cp_nr_rings; i++) { + u16 cmpl_ring = bp->grp_info[i].cp_fw_ring_id; + int rc; + + rc = bnxt_hwrm_selftest_irq(bp, cmpl_ring); + if (rc) + return rc; + } + return 0; +} + +static int bnxt_hwrm_mac_loopback(struct bnxt *bp, bool enable) +{ + struct hwrm_port_mac_cfg_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_MAC_CFG, -1, -1); + + req.enables = cpu_to_le32(PORT_MAC_CFG_REQ_ENABLES_LPBK); + if (enable) + req.lpbk = PORT_MAC_CFG_REQ_LPBK_LOCAL; + else + req.lpbk = PORT_MAC_CFG_REQ_LPBK_NONE; + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_disable_an_for_lpbk(struct bnxt *bp, + struct hwrm_port_phy_cfg_input *req) +{ + struct bnxt_link_info *link_info = &bp->link_info; + u16 fw_advertising = link_info->advertising; + u16 fw_speed; + int rc; + + if (!link_info->autoneg) + return 0; + + fw_speed = PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_1GB; + if (netif_carrier_ok(bp->dev)) + fw_speed = bp->link_info.link_speed; + else if (fw_advertising & BNXT_LINK_SPEED_MSK_10GB) + fw_speed = PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_10GB; + else if (fw_advertising & BNXT_LINK_SPEED_MSK_25GB) + fw_speed = PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_25GB; + else if (fw_advertising & BNXT_LINK_SPEED_MSK_40GB) + fw_speed = PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_40GB; + else if (fw_advertising & BNXT_LINK_SPEED_MSK_50GB) + fw_speed = PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_50GB; + + req->force_link_speed = cpu_to_le16(fw_speed); + req->flags |= cpu_to_le32(PORT_PHY_CFG_REQ_FLAGS_FORCE | + PORT_PHY_CFG_REQ_FLAGS_RESET_PHY); + rc = hwrm_send_message(bp, req, sizeof(*req), HWRM_CMD_TIMEOUT); + req->flags = 0; + req->force_link_speed = cpu_to_le16(0); + return rc; +} + +static int bnxt_hwrm_phy_loopback(struct bnxt *bp, bool enable) +{ + struct hwrm_port_phy_cfg_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_CFG, -1, -1); + + if (enable) { + bnxt_disable_an_for_lpbk(bp, &req); + req.lpbk = PORT_PHY_CFG_REQ_LPBK_LOCAL; + } else { + req.lpbk = PORT_PHY_CFG_REQ_LPBK_NONE; + } + req.enables = cpu_to_le32(PORT_PHY_CFG_REQ_ENABLES_LPBK); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +static int bnxt_rx_loopback(struct bnxt *bp, struct bnxt_napi *bnapi, + u32 raw_cons, int pkt_size) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; + struct 
bnxt_sw_rx_bd *rx_buf; + struct rx_cmp *rxcmp; + u16 cp_cons, cons; + u8 *data; + u32 len; + int i; + + cp_cons = RING_CMP(raw_cons); + rxcmp = (struct rx_cmp *) + &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + cons = rxcmp->rx_cmp_opaque; + rx_buf = &rxr->rx_buf_ring[cons]; + data = rx_buf->data_ptr; + len = le32_to_cpu(rxcmp->rx_cmp_len_flags_type) >> RX_CMP_LEN_SHIFT; + if (len != pkt_size) + return -EIO; + i = ETH_ALEN; + if (!ether_addr_equal(data + i, bnapi->bp->dev->dev_addr)) + return -EIO; + i += ETH_ALEN; + for ( ; i < pkt_size; i++) { + if (data[i] != (u8)(i & 0xff)) + return -EIO; + } + return 0; +} + +static int bnxt_poll_loopback(struct bnxt *bp, int pkt_size) +{ + struct bnxt_napi *bnapi = bp->bnapi[0]; + struct bnxt_cp_ring_info *cpr; + struct tx_cmp *txcmp; + int rc = -EIO; + u32 raw_cons; + u32 cons; + int i; + + cpr = &bnapi->cp_ring; + raw_cons = cpr->cp_raw_cons; + for (i = 0; i < 200; i++) { + cons = RING_CMP(raw_cons); + txcmp = &cpr->cp_desc_ring[CP_RING(cons)][CP_IDX(cons)]; + + if (!TX_CMP_VALID(txcmp, raw_cons)) { + udelay(5); + continue; + } + + /* The valid test of the entry must be done first before + * reading any further. + */ + dma_rmb(); + if (TX_CMP_TYPE(txcmp) == CMP_TYPE_RX_L2_CMP) { + rc = bnxt_rx_loopback(bp, bnapi, raw_cons, pkt_size); + raw_cons = NEXT_RAW_CMP(raw_cons); + raw_cons = NEXT_RAW_CMP(raw_cons); + break; + } + raw_cons = NEXT_RAW_CMP(raw_cons); + } + cpr->cp_raw_cons = raw_cons; + return rc; +} + +static int bnxt_run_loopback(struct bnxt *bp) +{ + struct bnxt_tx_ring_info *txr = &bp->tx_ring[0]; + int pkt_size, i = 0; + struct sk_buff *skb; + dma_addr_t map; + u8 *data; + int rc; + + pkt_size = min(bp->dev->mtu + ETH_HLEN, bp->rx_copy_thresh); + skb = netdev_alloc_skb(bp->dev, pkt_size); + if (!skb) + return -ENOMEM; + data = skb_put(skb, pkt_size); + eth_broadcast_addr(data); + i += ETH_ALEN; + ether_addr_copy(&data[i], bp->dev->dev_addr); + i += ETH_ALEN; + for ( ; i < pkt_size; i++) + data[i] = (u8)(i & 0xff); + + map = dma_map_single(&bp->pdev->dev, skb->data, pkt_size, + PCI_DMA_TODEVICE); + if (dma_mapping_error(&bp->pdev->dev, map)) { + dev_kfree_skb(skb); + return -EIO; + } + bnxt_xmit_xdp(bp, txr, map, pkt_size, 0); + + /* Sync BD data before updating doorbell */ + wmb(); + + bnxt_db_write(bp, txr->tx_doorbell, DB_KEY_TX | txr->tx_prod); + rc = bnxt_poll_loopback(bp, pkt_size); + + dma_unmap_single(&bp->pdev->dev, map, pkt_size, PCI_DMA_TODEVICE); + dev_kfree_skb(skb); + return rc; +} + +static int bnxt_run_fw_tests(struct bnxt *bp, u8 test_mask, u8 *test_results) +{ + struct hwrm_selftest_exec_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_selftest_exec_input req = {0}; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_SELFTEST_EXEC, -1, -1); + mutex_lock(&bp->hwrm_cmd_lock); + resp->test_success = 0; + req.flags = test_mask; + rc = _hwrm_send_message(bp, &req, sizeof(req), bp->test_info->timeout); + *test_results = resp->test_success; + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +#define BNXT_DRV_TESTS 3 +#define BNXT_MACLPBK_TEST_IDX (bp->num_tests - BNXT_DRV_TESTS) +#define BNXT_PHYLPBK_TEST_IDX (BNXT_MACLPBK_TEST_IDX + 1) +#define BNXT_IRQ_TEST_IDX (BNXT_MACLPBK_TEST_IDX + 2) + +static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, + u64 *buf) +{ + struct bnxt *bp = netdev_priv(dev); + bool offline = false; + u8 test_results = 0; + u8 test_mask = 0; + int rc, i; + + if (!bp->num_tests || !BNXT_SINGLE_PF(bp)) + return; + memset(buf, 0, sizeof(u64) * bp->num_tests); + if 
(!netif_running(dev)) { + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + if (etest->flags & ETH_TEST_FL_OFFLINE) { + if (bp->pf.active_vfs) { + etest->flags |= ETH_TEST_FL_FAILED; + netdev_warn(dev, "Offline tests cannot be run with active VFs\n"); + return; + } + offline = true; + } + + for (i = 0; i < bp->num_tests - BNXT_DRV_TESTS; i++) { + u8 bit_val = 1 << i; + + if (!(bp->test_info->offline_mask & bit_val)) + test_mask |= bit_val; + else if (offline) + test_mask |= bit_val; + } + if (!offline) { + bnxt_run_fw_tests(bp, test_mask, &test_results); + } else { + rc = bnxt_close_nic(bp, false, false); + if (rc) + return; + bnxt_run_fw_tests(bp, test_mask, &test_results); + + buf[BNXT_MACLPBK_TEST_IDX] = 1; + bnxt_hwrm_mac_loopback(bp, true); + msleep(250); + rc = bnxt_half_open_nic(bp); + if (rc) { + bnxt_hwrm_mac_loopback(bp, false); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + if (bnxt_run_loopback(bp)) + etest->flags |= ETH_TEST_FL_FAILED; + else + buf[BNXT_MACLPBK_TEST_IDX] = 0; + + bnxt_hwrm_mac_loopback(bp, false); + bnxt_hwrm_phy_loopback(bp, true); + msleep(1000); + if (bnxt_run_loopback(bp)) { + buf[BNXT_PHYLPBK_TEST_IDX] = 1; + etest->flags |= ETH_TEST_FL_FAILED; + } + bnxt_hwrm_phy_loopback(bp, false); + bnxt_half_close_nic(bp); + bnxt_open_nic(bp, false, true); + } + if (bnxt_test_irq(bp)) { + buf[BNXT_IRQ_TEST_IDX] = 1; + etest->flags |= ETH_TEST_FL_FAILED; + } + for (i = 0; i < bp->num_tests - BNXT_DRV_TESTS; i++) { + u8 bit_val = 1 << i; + + if ((test_mask & bit_val) && !(test_results & bit_val)) { + buf[i] = 1; + etest->flags |= ETH_TEST_FL_FAILED; + } + } +} + +static int bnxt_reset(struct net_device *dev, u32 *flags) +{ + struct bnxt *bp = netdev_priv(dev); + int rc = 0; + + if (!BNXT_PF(bp)) { + netdev_err(dev, "Reset is not supported from a VF\n"); + return -EOPNOTSUPP; + } + + if (pci_vfs_assigned(bp->pdev)) { + netdev_err(dev, + "Reset not allowed when VFs are assigned to VMs\n"); + return -EBUSY; + } + + if (*flags == ETH_RESET_ALL) { + /* This feature is not supported in older firmware versions */ + if (bp->hwrm_spec_code < 0x10803) + return -EOPNOTSUPP; + + rc = bnxt_firmware_reset(dev, BNXT_FW_RESET_CHIP); + if (!rc) { + netdev_info(dev, "Reset request successful. 
Reload driver to complete reset\n"); + *flags = 0; + } + } else if (*flags == ETH_RESET_AP) { + /* This feature is not supported in older firmware versions */ + if (bp->hwrm_spec_code < 0x10803) + return -EOPNOTSUPP; + + rc = bnxt_firmware_reset(dev, BNXT_FW_RESET_AP); + if (!rc) { + netdev_info(dev, "Reset Application Processor request successful.\n"); + *flags = 0; + } + } else { + rc = -EINVAL; + } + + return rc; +} + +void bnxt_ethtool_init(struct bnxt *bp) +{ + struct hwrm_selftest_qlist_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_selftest_qlist_input req = {0}; + struct bnxt_test_info *test_info; + struct net_device *dev = bp->dev; + int i, rc; + + bnxt_get_pkgver(dev); + + if (bp->hwrm_spec_code < 0x10704 || !BNXT_SINGLE_PF(bp)) + return; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_SELFTEST_QLIST, -1, -1); + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + goto ethtool_init_exit; + + test_info = kzalloc(sizeof(*bp->test_info), GFP_KERNEL); + if (!test_info) + goto ethtool_init_exit; + + bp->test_info = test_info; + bp->num_tests = resp->num_tests + BNXT_DRV_TESTS; + if (bp->num_tests > BNXT_MAX_TEST) + bp->num_tests = BNXT_MAX_TEST; + + test_info->offline_mask = resp->offline_tests; + test_info->timeout = le16_to_cpu(resp->test_timeout); + if (!test_info->timeout) + test_info->timeout = HWRM_CMD_TIMEOUT; + for (i = 0; i < bp->num_tests; i++) { + char *str = test_info->string[i]; + char *fw_str = resp->test0_name + i * 32; + + if (i == BNXT_MACLPBK_TEST_IDX) { + strcpy(str, "Mac loopback test (offline)"); + } else if (i == BNXT_PHYLPBK_TEST_IDX) { + strcpy(str, "Phy loopback test (offline)"); + } else if (i == BNXT_IRQ_TEST_IDX) { + strcpy(str, "Interrupt_test (offline)"); + } else { + strlcpy(str, fw_str, ETH_GSTRING_LEN); + strncat(str, " test", ETH_GSTRING_LEN - strlen(str)); + if (test_info->offline_mask & (1 << i)) + strncat(str, " (offline)", + ETH_GSTRING_LEN - strlen(str)); + else + strncat(str, " (online)", + ETH_GSTRING_LEN - strlen(str)); + } + } + +ethtool_init_exit: + mutex_unlock(&bp->hwrm_cmd_lock); +} + +void bnxt_ethtool_free(struct bnxt *bp) +{ + kfree(bp->test_info); + bp->test_info = NULL; +} + +const struct ethtool_ops bnxt_ethtool_ops = { + .get_link_ksettings = bnxt_get_link_ksettings, + .set_link_ksettings = bnxt_set_link_ksettings, + .get_pauseparam = bnxt_get_pauseparam, + .set_pauseparam = bnxt_set_pauseparam, + .get_drvinfo = bnxt_get_drvinfo, + .get_wol = bnxt_get_wol, + .set_wol = bnxt_set_wol, + .get_coalesce = bnxt_get_coalesce, + .set_coalesce = bnxt_set_coalesce, + .get_msglevel = bnxt_get_msglevel, + .set_msglevel = bnxt_set_msglevel, + .get_sset_count = bnxt_get_sset_count, + .get_strings = bnxt_get_strings, + .get_ethtool_stats = bnxt_get_ethtool_stats, + .set_ringparam = bnxt_set_ringparam, + .get_ringparam = bnxt_get_ringparam, + .get_channels = bnxt_get_channels, + .set_channels = bnxt_set_channels, + .get_rxnfc = bnxt_get_rxnfc, + .set_rxnfc = bnxt_set_rxnfc, + .get_rxfh_indir_size = bnxt_get_rxfh_indir_size, + .get_rxfh_key_size = bnxt_get_rxfh_key_size, + .get_rxfh = bnxt_get_rxfh, + .flash_device = bnxt_flash_device, + .get_eeprom_len = bnxt_get_eeprom_len, + .get_eeprom = bnxt_get_eeprom, + .set_eeprom = bnxt_set_eeprom, + .get_link = bnxt_get_link, + .get_eee = bnxt_get_eee, + .set_eee = bnxt_set_eee, + .get_module_info = bnxt_get_module_info, + .get_module_eeprom = bnxt_get_module_eeprom, + .nway_reset = bnxt_nway_reset, + .set_phys_id = bnxt_set_phys_id, + .self_test 
= bnxt_self_test, + .reset = bnxt_reset, +}; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h new file mode 100644 index 0000000..836ef68 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h @@ -0,0 +1,48 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * Copyright (c) 2016-2017 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#ifndef BNXT_ETHTOOL_H +#define BNXT_ETHTOOL_H + +struct bnxt_led_cfg { + u8 led_id; + u8 led_state; + u8 led_color; + u8 unused; + __le16 led_blink_on; + __le16 led_blink_off; + u8 led_group_id; + u8 rsvd; +}; + +#define BNXT_LED_DFLT_ENA \ + (PORT_LED_CFG_REQ_ENABLES_LED0_ID | \ + PORT_LED_CFG_REQ_ENABLES_LED0_STATE | \ + PORT_LED_CFG_REQ_ENABLES_LED0_BLINK_ON | \ + PORT_LED_CFG_REQ_ENABLES_LED0_BLINK_OFF | \ + PORT_LED_CFG_REQ_ENABLES_LED0_GROUP_ID) + +#define BNXT_LED_DFLT_ENA_SHIFT 6 + +#define BNXT_LED_DFLT_ENABLES(x) \ + cpu_to_le32(BNXT_LED_DFLT_ENA << (BNXT_LED_DFLT_ENA_SHIFT * (x))) + +#define BNXT_FW_RESET_AP 0xfffe +#define BNXT_FW_RESET_CHIP 0xffff + +extern const struct ethtool_ops bnxt_ethtool_ops; + +u32 _bnxt_fw_to_ethtool_adv_spds(u16, u8); +u32 bnxt_fw_to_ethtool_speed(u16); +u16 bnxt_get_fw_auto_link_speeds(u32); +void bnxt_ethtool_init(struct bnxt *bp); +void bnxt_ethtool_free(struct bnxt *bp); + +#endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_fw_hdr.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_fw_hdr.h new file mode 100644 index 0000000..cad30dd --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_fw_hdr.h @@ -0,0 +1,119 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ */ + +#ifndef __BNXT_FW_HDR_H__ +#define __BNXT_FW_HDR_H__ + +#define BNXT_FIRMWARE_BIN_SIGNATURE 0x1a4d4342 /* "BCM"+0x1a */ +#define BNXT_UCODE_TRAILER_SIGNATURE 0x726c7254 /* "Trlr" */ + +enum SUPPORTED_FAMILY { + DEVICE_5702_3_4_FAMILY, /* 0 - Denali, Vinson, K2 */ + DEVICE_5705_FAMILY, /* 1 - Bachelor */ + DEVICE_SHASTA_FAMILY, /* 2 - 5751 */ + DEVICE_5706_FAMILY, /* 3 - Teton */ + DEVICE_5714_FAMILY, /* 4 - Hamilton */ + DEVICE_STANFORD_FAMILY, /* 5 - 5755 */ + DEVICE_STANFORD_ME_FAMILY, /* 6 - 5756 */ + DEVICE_SOLEDAD_FAMILY, /* 7 - 5761[E] */ + DEVICE_CILAI_FAMILY, /* 8 - 57780/60/90/91 */ + DEVICE_ASPEN_FAMILY, /* 9 - 57781/85/61/65/91/95 */ + DEVICE_ASPEN_PLUS_FAMILY, /* 10 - 57786 */ + DEVICE_LOGAN_FAMILY, /* 11 - Any device in the Logan family + */ + DEVICE_LOGAN_5762, /* 12 - Logan Enterprise (aka Columbia) + */ + DEVICE_LOGAN_57767, /* 13 - Logan Client */ + DEVICE_LOGAN_57787, /* 14 - Logan Consumer */ + DEVICE_LOGAN_5725, /* 15 - Logan Server (TruManage-enabled) + */ + DEVICE_SAWTOOTH_FAMILY, /* 16 - 5717/18 */ + DEVICE_COTOPAXI_FAMILY, /* 17 - 5719 */ + DEVICE_SNAGGLETOOTH_FAMILY, /* 18 - 5720 */ + DEVICE_CUMULUS_FAMILY, /* 19 - Cumulus/Whitney */ + MAX_DEVICE_FAMILY +}; + +enum SUPPORTED_CODE { + CODE_ASF1, /* 0 - ASF VERSION 1.03 */ + CODE_ASF2, /* 1 - ASF VERSION 2.00 */ + CODE_PASSTHRU, /* 2 - PassThru */ + CODE_PT_SEC, /* 3 - PassThru with security */ + CODE_UMP, /* 4 - UMP */ + CODE_BOOT, /* 5 - Bootcode */ + CODE_DASH, /* 6 - TruManage (DASH + ASF + PMCI) + * Management firmwares + */ + CODE_MCTP_PASSTHRU, /* 7 - NCSI / MCTP Passt-hrough firmware */ + CODE_PM_OFFLOAD, /* 8 - Power-Management Proxy Offload firmwares + */ + CODE_MDNS_SD_OFFLOAD, /* 9 - Multicast DNS Service Discovery Proxys + * Offload firmware + */ + CODE_DISC_OFFLOAD, /* 10 - Discovery Offload firmware */ + CODE_MUSTANG, /* 11 - I2C Error reporting APE firmwares + * + */ + CODE_ARP_BATCH, /* 12 - ARP Batch firmware */ + CODE_SMASH, /* 13 - TruManage (SMASH + DCMI/IPMI + PMCI) + * Management firmware + */ + CODE_APE_DIAG, /* 14 - APE Test Diag firmware */ + CODE_APE_PATCH, /* 15 - APE Patch firmware */ + CODE_TANG_PATCH, /* 16 - TANG Patch firmware */ + CODE_KONG_FW, /* 17 - KONG firmware */ + CODE_KONG_PATCH, /* 18 - KONG Patch firmware */ + CODE_BONO_FW, /* 19 - BONO firmware */ + CODE_BONO_PATCH, /* 20 - BONO Patch firmware */ + CODE_CHIMP_PATCH, /* 21 - ChiMP Patch firmware */ + + MAX_CODE_TYPE, +}; + +enum SUPPORTED_MEDIA { + MEDIA_COPPER, /* 0 */ + MEDIA_FIBER, /* 1 */ + MEDIA_NONE, /* 2 */ + MEDIA_COPPER_FIBER, /* 3 */ + MAX_MEDIA_TYPE, +}; + +struct bnxt_fw_header { + __le32 signature; /* constains the constant value of + * BNXT_FIRMWARE_BIN_SIGNATURE + */ + u8 flags; /* reserved for ChiMP use */ + u8 code_type; /* enum SUPPORTED_CODE */ + u8 device; /* enum SUPPORTED_FAMILY */ + u8 media; /* enum SUPPORTED_MEDIA */ + u8 version[16]; /* the null terminated version string to + * indicate the version of the + * file, this will be copied from the binary + * file version string + */ + u8 build; + u8 revision; + u8 minor_ver; + u8 major_ver; +}; + +/* Microcode and pre-boot software/firmware trailer: */ +struct bnxt_ucode_trailer { + u8 rsa_sig[256]; + __le16 flags; + u8 version_format; + u8 version_length; + u8 version[16]; + __le16 dir_type; + __le16 trailer_length; + __le32 sig; /* BNXT_UCODE_TRAILER_SIGNATURE */ + __le32 chksum; /* CRC-32 */ +}; + +#endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h new file mode 100644 index 
0000000..0fe0ea8 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h @@ -0,0 +1,6406 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * Copyright (c) 2016-2018 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + * + * DO NOT MODIFY!!! This file is automatically generated. + */ + +#ifndef _BNXT_HSI_H_ +#define _BNXT_HSI_H_ + +/* hwrm_cmd_hdr (size:128b/16B) */ +struct hwrm_cmd_hdr { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_resp_hdr (size:64b/8B) */ +struct hwrm_resp_hdr { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; +}; + +#define CMD_DISCR_TLV_ENCAP 0x8000UL +#define CMD_DISCR_LAST CMD_DISCR_TLV_ENCAP + + +#define TLV_TYPE_HWRM_REQUEST 0x1UL +#define TLV_TYPE_HWRM_RESPONSE 0x2UL +#define TLV_TYPE_ROCE_SP_COMMAND 0x3UL +#define TLV_TYPE_ENGINE_CKV_DEVICE_SERIAL_NUMBER 0x8001UL +#define TLV_TYPE_ENGINE_CKV_NONCE 0x8002UL +#define TLV_TYPE_ENGINE_CKV_IV 0x8003UL +#define TLV_TYPE_ENGINE_CKV_AUTH_TAG 0x8004UL +#define TLV_TYPE_ENGINE_CKV_CIPHERTEXT 0x8005UL +#define TLV_TYPE_ENGINE_CKV_ALGORITHMS 0x8006UL +#define TLV_TYPE_ENGINE_CKV_ECC_PUBLIC_KEY 0x8007UL +#define TLV_TYPE_ENGINE_CKV_ECDSA_SIGNATURE 0x8008UL +#define TLV_TYPE_LAST TLV_TYPE_ENGINE_CKV_ECDSA_SIGNATURE + + +/* tlv (size:64b/8B) */ +struct tlv { + __le16 cmd_discr; + u8 reserved_8b; + u8 flags; + #define TLV_FLAGS_MORE 0x1UL + #define TLV_FLAGS_MORE_LAST 0x0UL + #define TLV_FLAGS_MORE_NOT_LAST 0x1UL + #define TLV_FLAGS_REQUIRED 0x2UL + #define TLV_FLAGS_REQUIRED_NO (0x0UL << 1) + #define TLV_FLAGS_REQUIRED_YES (0x1UL << 1) + #define TLV_FLAGS_REQUIRED_LAST TLV_FLAGS_REQUIRED_YES + __le16 tlv_type; + __le16 length; +}; + +/* input (size:128b/16B) */ +struct input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* output (size:64b/8B) */ +struct output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; +}; + +/* hwrm_short_input (size:128b/16B) */ +struct hwrm_short_input { + __le16 req_type; + __le16 signature; + #define SHORT_REQ_SIGNATURE_SHORT_CMD 0x4321UL + #define SHORT_REQ_SIGNATURE_LAST SHORT_REQ_SIGNATURE_SHORT_CMD + __le16 unused_0; + __le16 size; + __le64 req_addr; +}; + +/* cmd_nums (size:64b/8B) */ +struct cmd_nums { + __le16 req_type; + #define HWRM_VER_GET 0x0UL + #define HWRM_FUNC_BUF_UNRGTR 0xeUL + #define HWRM_FUNC_VF_CFG 0xfUL + #define HWRM_RESERVED1 0x10UL + #define HWRM_FUNC_RESET 0x11UL + #define HWRM_FUNC_GETFID 0x12UL + #define HWRM_FUNC_VF_ALLOC 0x13UL + #define HWRM_FUNC_VF_FREE 0x14UL + #define HWRM_FUNC_QCAPS 0x15UL + #define HWRM_FUNC_QCFG 0x16UL + #define HWRM_FUNC_CFG 0x17UL + #define HWRM_FUNC_QSTATS 0x18UL + #define HWRM_FUNC_CLR_STATS 0x19UL + #define HWRM_FUNC_DRV_UNRGTR 0x1aUL + #define HWRM_FUNC_VF_RESC_FREE 0x1bUL + #define HWRM_FUNC_VF_VNIC_IDS_QUERY 0x1cUL + #define HWRM_FUNC_DRV_RGTR 0x1dUL + #define HWRM_FUNC_DRV_QVER 0x1eUL + #define HWRM_FUNC_BUF_RGTR 0x1fUL + #define HWRM_PORT_PHY_CFG 0x20UL + #define HWRM_PORT_MAC_CFG 0x21UL + #define HWRM_PORT_TS_QUERY 0x22UL + #define HWRM_PORT_QSTATS 0x23UL + #define HWRM_PORT_LPBK_QSTATS 0x24UL + #define HWRM_PORT_CLR_STATS 0x25UL + #define HWRM_PORT_LPBK_CLR_STATS 0x26UL + #define HWRM_PORT_PHY_QCFG 0x27UL + #define HWRM_PORT_MAC_QCFG 0x28UL + #define 
HWRM_PORT_MAC_PTP_QCFG 0x29UL + #define HWRM_PORT_PHY_QCAPS 0x2aUL + #define HWRM_PORT_PHY_I2C_WRITE 0x2bUL + #define HWRM_PORT_PHY_I2C_READ 0x2cUL + #define HWRM_PORT_LED_CFG 0x2dUL + #define HWRM_PORT_LED_QCFG 0x2eUL + #define HWRM_PORT_LED_QCAPS 0x2fUL + #define HWRM_QUEUE_QPORTCFG 0x30UL + #define HWRM_QUEUE_QCFG 0x31UL + #define HWRM_QUEUE_CFG 0x32UL + #define HWRM_FUNC_VLAN_CFG 0x33UL + #define HWRM_FUNC_VLAN_QCFG 0x34UL + #define HWRM_QUEUE_PFCENABLE_QCFG 0x35UL + #define HWRM_QUEUE_PFCENABLE_CFG 0x36UL + #define HWRM_QUEUE_PRI2COS_QCFG 0x37UL + #define HWRM_QUEUE_PRI2COS_CFG 0x38UL + #define HWRM_QUEUE_COS2BW_QCFG 0x39UL + #define HWRM_QUEUE_COS2BW_CFG 0x3aUL + #define HWRM_QUEUE_DSCP_QCAPS 0x3bUL + #define HWRM_QUEUE_DSCP2PRI_QCFG 0x3cUL + #define HWRM_QUEUE_DSCP2PRI_CFG 0x3dUL + #define HWRM_VNIC_ALLOC 0x40UL + #define HWRM_VNIC_FREE 0x41UL + #define HWRM_VNIC_CFG 0x42UL + #define HWRM_VNIC_QCFG 0x43UL + #define HWRM_VNIC_TPA_CFG 0x44UL + #define HWRM_VNIC_TPA_QCFG 0x45UL + #define HWRM_VNIC_RSS_CFG 0x46UL + #define HWRM_VNIC_RSS_QCFG 0x47UL + #define HWRM_VNIC_PLCMODES_CFG 0x48UL + #define HWRM_VNIC_PLCMODES_QCFG 0x49UL + #define HWRM_VNIC_QCAPS 0x4aUL + #define HWRM_RING_ALLOC 0x50UL + #define HWRM_RING_FREE 0x51UL + #define HWRM_RING_CMPL_RING_QAGGINT_PARAMS 0x52UL + #define HWRM_RING_CMPL_RING_CFG_AGGINT_PARAMS 0x53UL + #define HWRM_RING_RESET 0x5eUL + #define HWRM_RING_GRP_ALLOC 0x60UL + #define HWRM_RING_GRP_FREE 0x61UL + #define HWRM_RESERVED5 0x64UL + #define HWRM_RESERVED6 0x65UL + #define HWRM_VNIC_RSS_COS_LB_CTX_ALLOC 0x70UL + #define HWRM_VNIC_RSS_COS_LB_CTX_FREE 0x71UL + #define HWRM_CFA_L2_FILTER_ALLOC 0x90UL + #define HWRM_CFA_L2_FILTER_FREE 0x91UL + #define HWRM_CFA_L2_FILTER_CFG 0x92UL + #define HWRM_CFA_L2_SET_RX_MASK 0x93UL + #define HWRM_CFA_VLAN_ANTISPOOF_CFG 0x94UL + #define HWRM_CFA_TUNNEL_FILTER_ALLOC 0x95UL + #define HWRM_CFA_TUNNEL_FILTER_FREE 0x96UL + #define HWRM_CFA_ENCAP_RECORD_ALLOC 0x97UL + #define HWRM_CFA_ENCAP_RECORD_FREE 0x98UL + #define HWRM_CFA_NTUPLE_FILTER_ALLOC 0x99UL + #define HWRM_CFA_NTUPLE_FILTER_FREE 0x9aUL + #define HWRM_CFA_NTUPLE_FILTER_CFG 0x9bUL + #define HWRM_CFA_EM_FLOW_ALLOC 0x9cUL + #define HWRM_CFA_EM_FLOW_FREE 0x9dUL + #define HWRM_CFA_EM_FLOW_CFG 0x9eUL + #define HWRM_TUNNEL_DST_PORT_QUERY 0xa0UL + #define HWRM_TUNNEL_DST_PORT_ALLOC 0xa1UL + #define HWRM_TUNNEL_DST_PORT_FREE 0xa2UL + #define HWRM_STAT_CTX_ALLOC 0xb0UL + #define HWRM_STAT_CTX_FREE 0xb1UL + #define HWRM_STAT_CTX_QUERY 0xb2UL + #define HWRM_STAT_CTX_CLR_STATS 0xb3UL + #define HWRM_PORT_QSTATS_EXT 0xb4UL + #define HWRM_FW_RESET 0xc0UL + #define HWRM_FW_QSTATUS 0xc1UL + #define HWRM_FW_SET_TIME 0xc8UL + #define HWRM_FW_GET_TIME 0xc9UL + #define HWRM_FW_SET_STRUCTURED_DATA 0xcaUL + #define HWRM_FW_GET_STRUCTURED_DATA 0xcbUL + #define HWRM_FW_IPC_MAILBOX 0xccUL + #define HWRM_EXEC_FWD_RESP 0xd0UL + #define HWRM_REJECT_FWD_RESP 0xd1UL + #define HWRM_FWD_RESP 0xd2UL + #define HWRM_FWD_ASYNC_EVENT_CMPL 0xd3UL + #define HWRM_OEM_CMD 0xd4UL + #define HWRM_TEMP_MONITOR_QUERY 0xe0UL + #define HWRM_WOL_FILTER_ALLOC 0xf0UL + #define HWRM_WOL_FILTER_FREE 0xf1UL + #define HWRM_WOL_FILTER_QCFG 0xf2UL + #define HWRM_WOL_REASON_QCFG 0xf3UL + #define HWRM_CFA_METER_PROFILE_ALLOC 0xf5UL + #define HWRM_CFA_METER_PROFILE_FREE 0xf6UL + #define HWRM_CFA_METER_PROFILE_CFG 0xf7UL + #define HWRM_CFA_METER_INSTANCE_ALLOC 0xf8UL + #define HWRM_CFA_METER_INSTANCE_FREE 0xf9UL + #define HWRM_CFA_VFR_ALLOC 0xfdUL + #define HWRM_CFA_VFR_FREE 0xfeUL + #define HWRM_CFA_VF_PAIR_ALLOC 0x100UL + 
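/*
 * Editor's aside (illustrative sketch, not part of the patch): every HWRM
 * request issued by the ethtool code earlier in this patch starts with the
 * 16-byte header described by struct input above (req_type, cmpl_ring,
 * seq_id, target_id, resp_addr), and every response starts with the 8-byte
 * struct output; on the wire the fields are little-endian (__le16/__le64).
 * The stand-alone snippet below mirrors those layouts with plain stdint
 * types and checks the documented sizes; hwrm_build_hdr() is a hypothetical
 * helper, not the driver's bnxt_hwrm_cmd_hdr_init(), but it fills the same
 * fields.  Most ethtool callers pass -1 (0xffff) for both cmpl_ring and
 * target_id; the IRQ self-test passes a real completion ring id instead.
 */
#include <stdint.h>
#include <string.h>

struct hwrm_req_hdr {                   /* mirrors struct input, 128b/16B  */
        uint16_t req_type;              /* HWRM_* opcode                   */
        uint16_t cmpl_ring;
        uint16_t seq_id;
        uint16_t target_id;
        uint64_t resp_addr;             /* DMA address of the response buf */
};

struct hwrm_resp_hdr {                  /* mirrors struct output, 64b/8B   */
        uint16_t error_code;
        uint16_t req_type;
        uint16_t seq_id;
        uint16_t resp_len;
};

/* Natural alignment already gives the documented sizes on common ABIs. */
_Static_assert(sizeof(struct hwrm_req_hdr) == 16, "input is 128b/16B");
_Static_assert(sizeof(struct hwrm_resp_hdr) == 8, "output is 64b/8B");

static void hwrm_build_hdr(struct hwrm_req_hdr *hdr, uint16_t req_type,
                           uint16_t seq_id, uint64_t resp_dma)
{
        memset(hdr, 0, sizeof(*hdr));
        hdr->req_type = req_type;       /* e.g. HWRM_NVM_READ, 0xfffd      */
        hdr->cmpl_ring = 0xffff;        /* -1, as in the ethtool paths     */
        hdr->seq_id = seq_id;
        hdr->target_id = 0xffff;        /* -1, as in the ethtool paths     */
        hdr->resp_addr = resp_dma;      /* driver would use cpu_to_le64()  */
}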
#define HWRM_CFA_VF_PAIR_FREE 0x101UL + #define HWRM_CFA_VF_PAIR_INFO 0x102UL + #define HWRM_CFA_FLOW_ALLOC 0x103UL + #define HWRM_CFA_FLOW_FREE 0x104UL + #define HWRM_CFA_FLOW_FLUSH 0x105UL + #define HWRM_CFA_FLOW_STATS 0x106UL + #define HWRM_CFA_FLOW_INFO 0x107UL + #define HWRM_CFA_DECAP_FILTER_ALLOC 0x108UL + #define HWRM_CFA_DECAP_FILTER_FREE 0x109UL + #define HWRM_CFA_VLAN_ANTISPOOF_QCFG 0x10aUL + #define HWRM_CFA_REDIRECT_TUNNEL_TYPE_ALLOC 0x10bUL + #define HWRM_CFA_REDIRECT_TUNNEL_TYPE_FREE 0x10cUL + #define HWRM_CFA_PAIR_ALLOC 0x10dUL + #define HWRM_CFA_PAIR_FREE 0x10eUL + #define HWRM_CFA_PAIR_INFO 0x10fUL + #define HWRM_FW_IPC_MSG 0x110UL + #define HWRM_CFA_REDIRECT_TUNNEL_TYPE_INFO 0x111UL + #define HWRM_ENGINE_CKV_HELLO 0x12dUL + #define HWRM_ENGINE_CKV_STATUS 0x12eUL + #define HWRM_ENGINE_CKV_CKEK_ADD 0x12fUL + #define HWRM_ENGINE_CKV_CKEK_DELETE 0x130UL + #define HWRM_ENGINE_CKV_KEY_ADD 0x131UL + #define HWRM_ENGINE_CKV_KEY_DELETE 0x132UL + #define HWRM_ENGINE_CKV_FLUSH 0x133UL + #define HWRM_ENGINE_CKV_RNG_GET 0x134UL + #define HWRM_ENGINE_CKV_KEY_GEN 0x135UL + #define HWRM_ENGINE_QG_CONFIG_QUERY 0x13cUL + #define HWRM_ENGINE_QG_QUERY 0x13dUL + #define HWRM_ENGINE_QG_METER_PROFILE_CONFIG_QUERY 0x13eUL + #define HWRM_ENGINE_QG_METER_PROFILE_QUERY 0x13fUL + #define HWRM_ENGINE_QG_METER_PROFILE_ALLOC 0x140UL + #define HWRM_ENGINE_QG_METER_PROFILE_FREE 0x141UL + #define HWRM_ENGINE_QG_METER_QUERY 0x142UL + #define HWRM_ENGINE_QG_METER_BIND 0x143UL + #define HWRM_ENGINE_QG_METER_UNBIND 0x144UL + #define HWRM_ENGINE_QG_FUNC_BIND 0x145UL + #define HWRM_ENGINE_SG_CONFIG_QUERY 0x146UL + #define HWRM_ENGINE_SG_QUERY 0x147UL + #define HWRM_ENGINE_SG_METER_QUERY 0x148UL + #define HWRM_ENGINE_SG_METER_CONFIG 0x149UL + #define HWRM_ENGINE_SG_QG_BIND 0x14aUL + #define HWRM_ENGINE_QG_SG_UNBIND 0x14bUL + #define HWRM_ENGINE_CONFIG_QUERY 0x154UL + #define HWRM_ENGINE_STATS_CONFIG 0x155UL + #define HWRM_ENGINE_STATS_CLEAR 0x156UL + #define HWRM_ENGINE_STATS_QUERY 0x157UL + #define HWRM_ENGINE_RQ_ALLOC 0x15eUL + #define HWRM_ENGINE_RQ_FREE 0x15fUL + #define HWRM_ENGINE_CQ_ALLOC 0x160UL + #define HWRM_ENGINE_CQ_FREE 0x161UL + #define HWRM_ENGINE_NQ_ALLOC 0x162UL + #define HWRM_ENGINE_NQ_FREE 0x163UL + #define HWRM_ENGINE_ON_DIE_RQE_CREDITS 0x164UL + #define HWRM_FUNC_RESOURCE_QCAPS 0x190UL + #define HWRM_FUNC_VF_RESOURCE_CFG 0x191UL + #define HWRM_SELFTEST_QLIST 0x200UL + #define HWRM_SELFTEST_EXEC 0x201UL + #define HWRM_SELFTEST_IRQ 0x202UL + #define HWRM_SELFTEST_RETRIEVE_SERDES_DATA 0x203UL + #define HWRM_PCIE_QSTATS 0x204UL + #define HWRM_DBG_READ_DIRECT 0xff10UL + #define HWRM_DBG_READ_INDIRECT 0xff11UL + #define HWRM_DBG_WRITE_DIRECT 0xff12UL + #define HWRM_DBG_WRITE_INDIRECT 0xff13UL + #define HWRM_DBG_DUMP 0xff14UL + #define HWRM_DBG_ERASE_NVM 0xff15UL + #define HWRM_DBG_CFG 0xff16UL + #define HWRM_DBG_COREDUMP_LIST 0xff17UL + #define HWRM_DBG_COREDUMP_INITIATE 0xff18UL + #define HWRM_DBG_COREDUMP_RETRIEVE 0xff19UL + #define HWRM_NVM_FACTORY_DEFAULTS 0xffeeUL + #define HWRM_NVM_VALIDATE_OPTION 0xffefUL + #define HWRM_NVM_FLUSH 0xfff0UL + #define HWRM_NVM_GET_VARIABLE 0xfff1UL + #define HWRM_NVM_SET_VARIABLE 0xfff2UL + #define HWRM_NVM_INSTALL_UPDATE 0xfff3UL + #define HWRM_NVM_MODIFY 0xfff4UL + #define HWRM_NVM_VERIFY_UPDATE 0xfff5UL + #define HWRM_NVM_GET_DEV_INFO 0xfff6UL + #define HWRM_NVM_ERASE_DIR_ENTRY 0xfff7UL + #define HWRM_NVM_MOD_DIR_ENTRY 0xfff8UL + #define HWRM_NVM_FIND_DIR_ENTRY 0xfff9UL + #define HWRM_NVM_GET_DIR_ENTRIES 0xfffaUL + #define HWRM_NVM_GET_DIR_INFO 0xfffbUL + 
#define HWRM_NVM_RAW_DUMP 0xfffcUL + #define HWRM_NVM_READ 0xfffdUL + #define HWRM_NVM_WRITE 0xfffeUL + #define HWRM_NVM_RAW_WRITE_BLK 0xffffUL + #define HWRM_LAST HWRM_NVM_RAW_WRITE_BLK + __le16 unused_0[3]; +}; + +/* ret_codes (size:64b/8B) */ +struct ret_codes { + __le16 error_code; + #define HWRM_ERR_CODE_SUCCESS 0x0UL + #define HWRM_ERR_CODE_FAIL 0x1UL + #define HWRM_ERR_CODE_INVALID_PARAMS 0x2UL + #define HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED 0x3UL + #define HWRM_ERR_CODE_RESOURCE_ALLOC_ERROR 0x4UL + #define HWRM_ERR_CODE_INVALID_FLAGS 0x5UL + #define HWRM_ERR_CODE_INVALID_ENABLES 0x6UL + #define HWRM_ERR_CODE_UNSUPPORTED_TLV 0x7UL + #define HWRM_ERR_CODE_NO_BUFFER 0x8UL + #define HWRM_ERR_CODE_HWRM_ERROR 0xfUL + #define HWRM_ERR_CODE_UNKNOWN_ERR 0xfffeUL + #define HWRM_ERR_CODE_CMD_NOT_SUPPORTED 0xffffUL + #define HWRM_ERR_CODE_LAST HWRM_ERR_CODE_CMD_NOT_SUPPORTED + __le16 unused_0[3]; +}; + +/* hwrm_err_output (size:128b/16B) */ +struct hwrm_err_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 opaque_0; + __le16 opaque_1; + u8 cmd_err; + u8 valid; +}; +#define HWRM_NA_SIGNATURE ((__le32)(-1)) +#define HWRM_MAX_REQ_LEN 128 +#define HWRM_MAX_RESP_LEN 280 +#define HW_HASH_INDEX_SIZE 0x80 +#define HW_HASH_KEY_SIZE 40 +#define HWRM_RESP_VALID_KEY 1 +#define HWRM_VERSION_MAJOR 1 +#define HWRM_VERSION_MINOR 9 +#define HWRM_VERSION_UPDATE 1 +#define HWRM_VERSION_RSVD 15 +#define HWRM_VERSION_STR "1.9.1.15" + +/* hwrm_ver_get_input (size:192b/24B) */ +struct hwrm_ver_get_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + u8 hwrm_intf_maj; + u8 hwrm_intf_min; + u8 hwrm_intf_upd; + u8 unused_0[5]; +}; + +/* hwrm_ver_get_output (size:1408b/176B) */ +struct hwrm_ver_get_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 hwrm_intf_maj_8b; + u8 hwrm_intf_min_8b; + u8 hwrm_intf_upd_8b; + u8 hwrm_intf_rsvd_8b; + u8 hwrm_fw_maj_8b; + u8 hwrm_fw_min_8b; + u8 hwrm_fw_bld_8b; + u8 hwrm_fw_rsvd_8b; + u8 mgmt_fw_maj_8b; + u8 mgmt_fw_min_8b; + u8 mgmt_fw_bld_8b; + u8 mgmt_fw_rsvd_8b; + u8 netctrl_fw_maj_8b; + u8 netctrl_fw_min_8b; + u8 netctrl_fw_bld_8b; + u8 netctrl_fw_rsvd_8b; + __le32 dev_caps_cfg; + #define VER_GET_RESP_DEV_CAPS_CFG_SECURE_FW_UPD_SUPPORTED 0x1UL + #define VER_GET_RESP_DEV_CAPS_CFG_FW_DCBX_AGENT_SUPPORTED 0x2UL + #define VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_SUPPORTED 0x4UL + #define VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_REQUIRED 0x8UL + u8 roce_fw_maj_8b; + u8 roce_fw_min_8b; + u8 roce_fw_bld_8b; + u8 roce_fw_rsvd_8b; + char hwrm_fw_name[16]; + char mgmt_fw_name[16]; + char netctrl_fw_name[16]; + u8 reserved2[16]; + char roce_fw_name[16]; + __le16 chip_num; + u8 chip_rev; + u8 chip_metal; + u8 chip_bond_id; + u8 chip_platform_type; + #define VER_GET_RESP_CHIP_PLATFORM_TYPE_ASIC 0x0UL + #define VER_GET_RESP_CHIP_PLATFORM_TYPE_FPGA 0x1UL + #define VER_GET_RESP_CHIP_PLATFORM_TYPE_PALLADIUM 0x2UL + #define VER_GET_RESP_CHIP_PLATFORM_TYPE_LAST VER_GET_RESP_CHIP_PLATFORM_TYPE_PALLADIUM + __le16 max_req_win_len; + __le16 max_resp_len; + __le16 def_req_timeout; + u8 flags; + #define VER_GET_RESP_FLAGS_DEV_NOT_RDY 0x1UL + #define VER_GET_RESP_FLAGS_EXT_VER_AVAIL 0x2UL + u8 unused_0[2]; + u8 always_1; + __le16 hwrm_intf_major; + __le16 hwrm_intf_minor; + __le16 hwrm_intf_build; + __le16 hwrm_intf_patch; + __le16 hwrm_fw_major; + __le16 hwrm_fw_minor; + __le16 hwrm_fw_build; + __le16 hwrm_fw_patch; + __le16 mgmt_fw_major; + __le16 mgmt_fw_minor; + __le16 
mgmt_fw_build; + __le16 mgmt_fw_patch; + __le16 netctrl_fw_major; + __le16 netctrl_fw_minor; + __le16 netctrl_fw_build; + __le16 netctrl_fw_patch; + __le16 roce_fw_major; + __le16 roce_fw_minor; + __le16 roce_fw_build; + __le16 roce_fw_patch; + __le16 max_ext_req_len; + u8 unused_1[5]; + u8 valid; +}; + +/* eject_cmpl (size:128b/16B) */ +struct eject_cmpl { + __le16 type; + #define EJECT_CMPL_TYPE_MASK 0x3fUL + #define EJECT_CMPL_TYPE_SFT 0 + #define EJECT_CMPL_TYPE_STAT_EJECT 0x1aUL + #define EJECT_CMPL_TYPE_LAST EJECT_CMPL_TYPE_STAT_EJECT + __le16 len; + __le32 opaque; + __le32 v; + #define EJECT_CMPL_V 0x1UL + __le32 unused_2; +}; + +/* hwrm_cmpl (size:128b/16B) */ +struct hwrm_cmpl { + __le16 type; + #define CMPL_TYPE_MASK 0x3fUL + #define CMPL_TYPE_SFT 0 + #define CMPL_TYPE_HWRM_DONE 0x20UL + #define CMPL_TYPE_LAST CMPL_TYPE_HWRM_DONE + __le16 sequence_id; + __le32 unused_1; + __le32 v; + #define CMPL_V 0x1UL + __le32 unused_3; +}; + +/* hwrm_fwd_req_cmpl (size:128b/16B) */ +struct hwrm_fwd_req_cmpl { + __le16 req_len_type; + #define FWD_REQ_CMPL_TYPE_MASK 0x3fUL + #define FWD_REQ_CMPL_TYPE_SFT 0 + #define FWD_REQ_CMPL_TYPE_HWRM_FWD_REQ 0x22UL + #define FWD_REQ_CMPL_TYPE_LAST FWD_REQ_CMPL_TYPE_HWRM_FWD_REQ + #define FWD_REQ_CMPL_REQ_LEN_MASK 0xffc0UL + #define FWD_REQ_CMPL_REQ_LEN_SFT 6 + __le16 source_id; + __le32 unused0; + __le32 req_buf_addr_v[2]; + #define FWD_REQ_CMPL_V 0x1UL + #define FWD_REQ_CMPL_REQ_BUF_ADDR_MASK 0xfffffffeUL + #define FWD_REQ_CMPL_REQ_BUF_ADDR_SFT 1 +}; + +/* hwrm_fwd_resp_cmpl (size:128b/16B) */ +struct hwrm_fwd_resp_cmpl { + __le16 type; + #define FWD_RESP_CMPL_TYPE_MASK 0x3fUL + #define FWD_RESP_CMPL_TYPE_SFT 0 + #define FWD_RESP_CMPL_TYPE_HWRM_FWD_RESP 0x24UL + #define FWD_RESP_CMPL_TYPE_LAST FWD_RESP_CMPL_TYPE_HWRM_FWD_RESP + __le16 source_id; + __le16 resp_len; + __le16 unused_1; + __le32 resp_buf_addr_v[2]; + #define FWD_RESP_CMPL_V 0x1UL + #define FWD_RESP_CMPL_RESP_BUF_ADDR_MASK 0xfffffffeUL + #define FWD_RESP_CMPL_RESP_BUF_ADDR_SFT 1 +}; + +/* hwrm_async_event_cmpl (size:128b/16B) */ +struct hwrm_async_event_cmpl { + __le16 type; + #define ASYNC_EVENT_CMPL_TYPE_MASK 0x3fUL + #define ASYNC_EVENT_CMPL_TYPE_SFT 0 + #define ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT 0x2eUL + #define ASYNC_EVENT_CMPL_TYPE_LAST ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT + __le16 event_id; + #define ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE 0x0UL + #define ASYNC_EVENT_CMPL_EVENT_ID_LINK_MTU_CHANGE 0x1UL + #define ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CHANGE 0x2UL + #define ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE 0x3UL + #define ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED 0x4UL + #define ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_NOT_ALLOWED 0x5UL + #define ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE 0x6UL + #define ASYNC_EVENT_CMPL_EVENT_ID_PORT_PHY_CFG_CHANGE 0x7UL + #define ASYNC_EVENT_CMPL_EVENT_ID_FUNC_DRVR_UNLOAD 0x10UL + #define ASYNC_EVENT_CMPL_EVENT_ID_FUNC_DRVR_LOAD 0x11UL + #define ASYNC_EVENT_CMPL_EVENT_ID_FUNC_FLR_PROC_CMPLT 0x12UL + #define ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD 0x20UL + #define ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_LOAD 0x21UL + #define ASYNC_EVENT_CMPL_EVENT_ID_VF_FLR 0x30UL + #define ASYNC_EVENT_CMPL_EVENT_ID_VF_MAC_ADDR_CHANGE 0x31UL + #define ASYNC_EVENT_CMPL_EVENT_ID_PF_VF_COMM_STATUS_CHANGE 0x32UL + #define ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE 0x33UL + #define ASYNC_EVENT_CMPL_EVENT_ID_LLFC_PFC_CHANGE 0x34UL + #define ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR 0xffUL + #define ASYNC_EVENT_CMPL_EVENT_ID_LAST 
ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR + __le32 event_data2; + u8 opaque_v; + #define ASYNC_EVENT_CMPL_V 0x1UL + #define ASYNC_EVENT_CMPL_OPAQUE_MASK 0xfeUL + #define ASYNC_EVENT_CMPL_OPAQUE_SFT 1 + u8 timestamp_lo; + __le16 timestamp_hi; + __le32 event_data1; +}; + +/* hwrm_async_event_cmpl_link_status_change (size:128b/16B) */ +struct hwrm_async_event_cmpl_link_status_change { + __le16 type; + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_MASK 0x3fUL + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_SFT 0 + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_HWRM_ASYNC_EVENT 0x2eUL + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_LAST ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_HWRM_ASYNC_EVENT + __le16 event_id; + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_ID_LINK_STATUS_CHANGE 0x0UL + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_ID_LAST ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_ID_LINK_STATUS_CHANGE + __le32 event_data2; + u8 opaque_v; + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_V 0x1UL + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_OPAQUE_MASK 0xfeUL + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_OPAQUE_SFT 1 + u8 timestamp_lo; + __le16 timestamp_hi; + __le32 event_data1; + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE 0x1UL + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_DOWN 0x0UL + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_UP 0x1UL + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_LAST ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_UP + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_MASK 0xeUL + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_SFT 1 + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_ID_MASK 0xffff0UL + #define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_ID_SFT 4 +}; + +/* hwrm_async_event_cmpl_port_conn_not_allowed (size:128b/16B) */ +struct hwrm_async_event_cmpl_port_conn_not_allowed { + __le16 type; + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_MASK 0x3fUL + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_SFT 0 + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_HWRM_ASYNC_EVENT 0x2eUL + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_LAST ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_HWRM_ASYNC_EVENT + __le16 event_id; + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_ID_PORT_CONN_NOT_ALLOWED 0x4UL + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_ID_LAST ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_ID_PORT_CONN_NOT_ALLOWED + __le32 event_data2; + u8 opaque_v; + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_V 0x1UL + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_OPAQUE_MASK 0xfeUL + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_OPAQUE_SFT 1 + u8 timestamp_lo; + __le16 timestamp_hi; + __le32 event_data1; + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_MASK 0xffffUL + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_SFT 0 + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_MASK 0xff0000UL + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_SFT 16 + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_NONE (0x0UL << 16) + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_DISABLETX (0x1UL << 16) + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_WARNINGMSG (0x2UL << 16) + #define 
ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_PWRDOWN (0x3UL << 16) + #define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_LAST ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_PWRDOWN +}; + +/* hwrm_async_event_cmpl_link_speed_cfg_change (size:128b/16B) */ +struct hwrm_async_event_cmpl_link_speed_cfg_change { + __le16 type; + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_MASK 0x3fUL + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_SFT 0 + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT 0x2eUL + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_LAST ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT + __le16 event_id; + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_ID_LINK_SPEED_CFG_CHANGE 0x6UL + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_ID_LAST ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_ID_LINK_SPEED_CFG_CHANGE + __le32 event_data2; + u8 opaque_v; + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_V 0x1UL + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_OPAQUE_MASK 0xfeUL + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_OPAQUE_SFT 1 + u8 timestamp_lo; + __le16 timestamp_hi; + __le32 event_data1; + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_PORT_ID_MASK 0xffffUL + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_PORT_ID_SFT 0 + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_SUPPORTED_LINK_SPEEDS_CHANGE 0x10000UL + #define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_ILLEGAL_LINK_SPEED_CFG 0x20000UL +}; + +/* hwrm_async_event_cmpl_vf_cfg_change (size:128b/16B) */ +struct hwrm_async_event_cmpl_vf_cfg_change { + __le16 type; + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_MASK 0x3fUL + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_SFT 0 + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT 0x2eUL + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_LAST ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT + __le16 event_id; + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_ID_VF_CFG_CHANGE 0x33UL + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_ID_LAST ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_ID_VF_CFG_CHANGE + __le32 event_data2; + u8 opaque_v; + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_V 0x1UL + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_OPAQUE_MASK 0xfeUL + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_OPAQUE_SFT 1 + u8 timestamp_lo; + __le16 timestamp_hi; + __le32 event_data1; + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_MTU_CHANGE 0x1UL + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_MRU_CHANGE 0x2UL + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_DFLT_MAC_ADDR_CHANGE 0x4UL + #define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_DFLT_VLAN_CHANGE 0x8UL +}; + +/* hwrm_func_reset_input (size:192b/24B) */ +struct hwrm_func_reset_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define FUNC_RESET_REQ_ENABLES_VF_ID_VALID 0x1UL + __le16 vf_id; + u8 func_reset_level; + #define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETALL 0x0UL + #define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETME 0x1UL + #define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETCHILDREN 0x2UL + #define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETVF 0x3UL + #define FUNC_RESET_REQ_FUNC_RESET_LEVEL_LAST FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETVF + u8 unused_0; +}; + +/* hwrm_func_reset_output (size:128b/16B) */ +struct hwrm_func_reset_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 
unused_0[7]; + u8 valid; +}; + +/* hwrm_func_getfid_input (size:192b/24B) */ +struct hwrm_func_getfid_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define FUNC_GETFID_REQ_ENABLES_PCI_ID 0x1UL + __le16 pci_id; + u8 unused_0[2]; +}; + +/* hwrm_func_getfid_output (size:128b/16B) */ +struct hwrm_func_getfid_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 fid; + u8 unused_0[5]; + u8 valid; +}; + +/* hwrm_func_vf_alloc_input (size:192b/24B) */ +struct hwrm_func_vf_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define FUNC_VF_ALLOC_REQ_ENABLES_FIRST_VF_ID 0x1UL + __le16 first_vf_id; + __le16 num_vfs; +}; + +/* hwrm_func_vf_alloc_output (size:128b/16B) */ +struct hwrm_func_vf_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 first_vf_id; + u8 unused_0[5]; + u8 valid; +}; + +/* hwrm_func_vf_free_input (size:192b/24B) */ +struct hwrm_func_vf_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define FUNC_VF_FREE_REQ_ENABLES_FIRST_VF_ID 0x1UL + __le16 first_vf_id; + __le16 num_vfs; +}; + +/* hwrm_func_vf_free_output (size:128b/16B) */ +struct hwrm_func_vf_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_func_vf_cfg_input (size:448b/56B) */ +struct hwrm_func_vf_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define FUNC_VF_CFG_REQ_ENABLES_MTU 0x1UL + #define FUNC_VF_CFG_REQ_ENABLES_GUEST_VLAN 0x2UL + #define FUNC_VF_CFG_REQ_ENABLES_ASYNC_EVENT_CR 0x4UL + #define FUNC_VF_CFG_REQ_ENABLES_DFLT_MAC_ADDR 0x8UL + #define FUNC_VF_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS 0x10UL + #define FUNC_VF_CFG_REQ_ENABLES_NUM_CMPL_RINGS 0x20UL + #define FUNC_VF_CFG_REQ_ENABLES_NUM_TX_RINGS 0x40UL + #define FUNC_VF_CFG_REQ_ENABLES_NUM_RX_RINGS 0x80UL + #define FUNC_VF_CFG_REQ_ENABLES_NUM_L2_CTXS 0x100UL + #define FUNC_VF_CFG_REQ_ENABLES_NUM_VNICS 0x200UL + #define FUNC_VF_CFG_REQ_ENABLES_NUM_STAT_CTXS 0x400UL + #define FUNC_VF_CFG_REQ_ENABLES_NUM_HW_RING_GRPS 0x800UL + __le16 mtu; + __le16 guest_vlan; + __le16 async_event_cr; + u8 dflt_mac_addr[6]; + __le32 flags; + #define FUNC_VF_CFG_REQ_FLAGS_TX_ASSETS_TEST 0x1UL + #define FUNC_VF_CFG_REQ_FLAGS_RX_ASSETS_TEST 0x2UL + #define FUNC_VF_CFG_REQ_FLAGS_CMPL_ASSETS_TEST 0x4UL + #define FUNC_VF_CFG_REQ_FLAGS_RSSCOS_CTX_ASSETS_TEST 0x8UL + #define FUNC_VF_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST 0x10UL + #define FUNC_VF_CFG_REQ_FLAGS_STAT_CTX_ASSETS_TEST 0x20UL + #define FUNC_VF_CFG_REQ_FLAGS_VNIC_ASSETS_TEST 0x40UL + #define FUNC_VF_CFG_REQ_FLAGS_L2_CTX_ASSETS_TEST 0x80UL + __le16 num_rsscos_ctxs; + __le16 num_cmpl_rings; + __le16 num_tx_rings; + __le16 num_rx_rings; + __le16 num_l2_ctxs; + __le16 num_vnics; + __le16 num_stat_ctxs; + __le16 num_hw_ring_grps; + u8 unused_0[4]; +}; + +/* hwrm_func_vf_cfg_output (size:128b/16B) */ +struct hwrm_func_vf_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_func_qcaps_input (size:192b/24B) */ +struct hwrm_func_qcaps_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 fid; + u8 unused_0[6]; +}; + +/* 
hwrm_func_qcaps_output (size:640b/80B) */ +struct hwrm_func_qcaps_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 fid; + __le16 port_id; + __le32 flags; + #define FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED 0x1UL + #define FUNC_QCAPS_RESP_FLAGS_GLOBAL_MSIX_AUTOMASKING 0x2UL + #define FUNC_QCAPS_RESP_FLAGS_PTP_SUPPORTED 0x4UL + #define FUNC_QCAPS_RESP_FLAGS_ROCE_V1_SUPPORTED 0x8UL + #define FUNC_QCAPS_RESP_FLAGS_ROCE_V2_SUPPORTED 0x10UL + #define FUNC_QCAPS_RESP_FLAGS_WOL_MAGICPKT_SUPPORTED 0x20UL + #define FUNC_QCAPS_RESP_FLAGS_WOL_BMP_SUPPORTED 0x40UL + #define FUNC_QCAPS_RESP_FLAGS_TX_RING_RL_SUPPORTED 0x80UL + #define FUNC_QCAPS_RESP_FLAGS_TX_BW_CFG_SUPPORTED 0x100UL + #define FUNC_QCAPS_RESP_FLAGS_VF_TX_RING_RL_SUPPORTED 0x200UL + #define FUNC_QCAPS_RESP_FLAGS_VF_BW_CFG_SUPPORTED 0x400UL + #define FUNC_QCAPS_RESP_FLAGS_STD_TX_RING_MODE_SUPPORTED 0x800UL + #define FUNC_QCAPS_RESP_FLAGS_GENEVE_TUN_FLAGS_SUPPORTED 0x1000UL + #define FUNC_QCAPS_RESP_FLAGS_NVGRE_TUN_FLAGS_SUPPORTED 0x2000UL + #define FUNC_QCAPS_RESP_FLAGS_GRE_TUN_FLAGS_SUPPORTED 0x4000UL + #define FUNC_QCAPS_RESP_FLAGS_MPLS_TUN_FLAGS_SUPPORTED 0x8000UL + #define FUNC_QCAPS_RESP_FLAGS_PCIE_STATS_SUPPORTED 0x10000UL + u8 mac_address[6]; + __le16 max_rsscos_ctx; + __le16 max_cmpl_rings; + __le16 max_tx_rings; + __le16 max_rx_rings; + __le16 max_l2_ctxs; + __le16 max_vnics; + __le16 first_vf_id; + __le16 max_vfs; + __le16 max_stat_ctx; + __le32 max_encap_records; + __le32 max_decap_records; + __le32 max_tx_em_flows; + __le32 max_tx_wm_flows; + __le32 max_rx_em_flows; + __le32 max_rx_wm_flows; + __le32 max_mcast_filters; + __le32 max_flow_id; + __le32 max_hw_ring_grps; + __le16 max_sp_tx_rings; + u8 unused_0; + u8 valid; +}; + +/* hwrm_func_qcfg_input (size:192b/24B) */ +struct hwrm_func_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 fid; + u8 unused_0[6]; +}; + +/* hwrm_func_qcfg_output (size:640b/80B) */ +struct hwrm_func_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 fid; + __le16 port_id; + __le16 vlan; + __le16 flags; + #define FUNC_QCFG_RESP_FLAGS_OOB_WOL_MAGICPKT_ENABLED 0x1UL + #define FUNC_QCFG_RESP_FLAGS_OOB_WOL_BMP_ENABLED 0x2UL + #define FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED 0x4UL + #define FUNC_QCFG_RESP_FLAGS_STD_TX_RING_MODE_ENABLED 0x8UL + #define FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED 0x10UL + #define FUNC_QCFG_RESP_FLAGS_MULTI_HOST 0x20UL + u8 mac_address[6]; + __le16 pci_id; + __le16 alloc_rsscos_ctx; + __le16 alloc_cmpl_rings; + __le16 alloc_tx_rings; + __le16 alloc_rx_rings; + __le16 alloc_l2_ctx; + __le16 alloc_vnics; + __le16 mtu; + __le16 mru; + __le16 stat_ctx_id; + u8 port_partition_type; + #define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_SPF 0x0UL + #define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_MPFS 0x1UL + #define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_0 0x2UL + #define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_5 0x3UL + #define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR2_0 0x4UL + #define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_UNKNOWN 0xffUL + #define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_LAST FUNC_QCFG_RESP_PORT_PARTITION_TYPE_UNKNOWN + u8 port_pf_cnt; + #define FUNC_QCFG_RESP_PORT_PF_CNT_UNAVAIL 0x0UL + #define FUNC_QCFG_RESP_PORT_PF_CNT_LAST FUNC_QCFG_RESP_PORT_PF_CNT_UNAVAIL + __le16 dflt_vnic_id; + __le16 max_mtu_configured; + __le32 min_bw; + #define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define 
FUNC_QCFG_RESP_MIN_BW_BW_VALUE_SFT 0 + #define FUNC_QCFG_RESP_MIN_BW_SCALE 0x10000000UL + #define FUNC_QCFG_RESP_MIN_BW_SCALE_BITS (0x0UL << 28) + #define FUNC_QCFG_RESP_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define FUNC_QCFG_RESP_MIN_BW_SCALE_LAST FUNC_QCFG_RESP_MIN_BW_SCALE_BYTES + #define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_LAST FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 max_bw; + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_SFT 0 + #define FUNC_QCFG_RESP_MAX_BW_SCALE 0x10000000UL + #define FUNC_QCFG_RESP_MAX_BW_SCALE_BITS (0x0UL << 28) + #define FUNC_QCFG_RESP_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define FUNC_QCFG_RESP_MAX_BW_SCALE_LAST FUNC_QCFG_RESP_MAX_BW_SCALE_BYTES + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_LAST FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_INVALID + u8 evb_mode; + #define FUNC_QCFG_RESP_EVB_MODE_NO_EVB 0x0UL + #define FUNC_QCFG_RESP_EVB_MODE_VEB 0x1UL + #define FUNC_QCFG_RESP_EVB_MODE_VEPA 0x2UL + #define FUNC_QCFG_RESP_EVB_MODE_LAST FUNC_QCFG_RESP_EVB_MODE_VEPA + u8 options; + #define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_MASK 0x3UL + #define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SFT 0 + #define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SIZE_64 0x0UL + #define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SIZE_128 0x1UL + #define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_LAST FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SIZE_128 + #define FUNC_QCFG_RESP_OPTIONS_RSVD_MASK 0xfcUL + #define FUNC_QCFG_RESP_OPTIONS_RSVD_SFT 2 + __le16 alloc_vfs; + __le32 alloc_mcast_filters; + __le32 alloc_hw_ring_grps; + __le16 alloc_sp_tx_rings; + __le16 alloc_stat_ctx; + u8 unused_2[7]; + u8 valid; +}; + +/* hwrm_func_vlan_cfg_input (size:384b/48B) */ +struct hwrm_func_vlan_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 fid; + u8 unused_0[2]; + __le32 enables; + #define FUNC_VLAN_CFG_REQ_ENABLES_STAG_VID 0x1UL + #define FUNC_VLAN_CFG_REQ_ENABLES_CTAG_VID 0x2UL + #define FUNC_VLAN_CFG_REQ_ENABLES_STAG_PCP 0x4UL + #define FUNC_VLAN_CFG_REQ_ENABLES_CTAG_PCP 0x8UL + #define FUNC_VLAN_CFG_REQ_ENABLES_STAG_TPID 0x10UL + #define FUNC_VLAN_CFG_REQ_ENABLES_CTAG_TPID 0x20UL + __le16 stag_vid; + u8 stag_pcp; + u8 unused_1; + __be16 stag_tpid; + __le16 ctag_vid; + u8 ctag_pcp; + u8 unused_2; + __be16 ctag_tpid; + __le32 rsvd1; + __le32 rsvd2; + u8 unused_3[4]; +}; + +/* hwrm_func_vlan_cfg_output (size:128b/16B) */ +struct hwrm_func_vlan_cfg_output { + __le16 error_code; + __le16 
req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_func_cfg_input (size:704b/88B) */ +struct hwrm_func_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 fid; + u8 unused_0[2]; + __le32 flags; + #define FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_DISABLE 0x1UL + #define FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_ENABLE 0x2UL + #define FUNC_CFG_REQ_FLAGS_RSVD_MASK 0x1fcUL + #define FUNC_CFG_REQ_FLAGS_RSVD_SFT 2 + #define FUNC_CFG_REQ_FLAGS_STD_TX_RING_MODE_ENABLE 0x200UL + #define FUNC_CFG_REQ_FLAGS_STD_TX_RING_MODE_DISABLE 0x400UL + #define FUNC_CFG_REQ_FLAGS_VIRT_MAC_PERSIST 0x800UL + #define FUNC_CFG_REQ_FLAGS_NO_AUTOCLEAR_STATISTIC 0x1000UL + #define FUNC_CFG_REQ_FLAGS_TX_ASSETS_TEST 0x2000UL + #define FUNC_CFG_REQ_FLAGS_RX_ASSETS_TEST 0x4000UL + #define FUNC_CFG_REQ_FLAGS_CMPL_ASSETS_TEST 0x8000UL + #define FUNC_CFG_REQ_FLAGS_RSSCOS_CTX_ASSETS_TEST 0x10000UL + #define FUNC_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST 0x20000UL + #define FUNC_CFG_REQ_FLAGS_STAT_CTX_ASSETS_TEST 0x40000UL + #define FUNC_CFG_REQ_FLAGS_VNIC_ASSETS_TEST 0x80000UL + #define FUNC_CFG_REQ_FLAGS_L2_CTX_ASSETS_TEST 0x100000UL + __le32 enables; + #define FUNC_CFG_REQ_ENABLES_MTU 0x1UL + #define FUNC_CFG_REQ_ENABLES_MRU 0x2UL + #define FUNC_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS 0x4UL + #define FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS 0x8UL + #define FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS 0x10UL + #define FUNC_CFG_REQ_ENABLES_NUM_RX_RINGS 0x20UL + #define FUNC_CFG_REQ_ENABLES_NUM_L2_CTXS 0x40UL + #define FUNC_CFG_REQ_ENABLES_NUM_VNICS 0x80UL + #define FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS 0x100UL + #define FUNC_CFG_REQ_ENABLES_DFLT_MAC_ADDR 0x200UL + #define FUNC_CFG_REQ_ENABLES_DFLT_VLAN 0x400UL + #define FUNC_CFG_REQ_ENABLES_DFLT_IP_ADDR 0x800UL + #define FUNC_CFG_REQ_ENABLES_MIN_BW 0x1000UL + #define FUNC_CFG_REQ_ENABLES_MAX_BW 0x2000UL + #define FUNC_CFG_REQ_ENABLES_ASYNC_EVENT_CR 0x4000UL + #define FUNC_CFG_REQ_ENABLES_VLAN_ANTISPOOF_MODE 0x8000UL + #define FUNC_CFG_REQ_ENABLES_ALLOWED_VLAN_PRIS 0x10000UL + #define FUNC_CFG_REQ_ENABLES_EVB_MODE 0x20000UL + #define FUNC_CFG_REQ_ENABLES_NUM_MCAST_FILTERS 0x40000UL + #define FUNC_CFG_REQ_ENABLES_NUM_HW_RING_GRPS 0x80000UL + #define FUNC_CFG_REQ_ENABLES_CACHE_LINESIZE 0x100000UL + __le16 mtu; + __le16 mru; + __le16 num_rsscos_ctxs; + __le16 num_cmpl_rings; + __le16 num_tx_rings; + __le16 num_rx_rings; + __le16 num_l2_ctxs; + __le16 num_vnics; + __le16 num_stat_ctxs; + __le16 num_hw_ring_grps; + u8 dflt_mac_addr[6]; + __le16 dflt_vlan; + __be32 dflt_ip_addr[4]; + __le32 min_bw; + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_SFT 0 + #define FUNC_CFG_REQ_MIN_BW_SCALE 0x10000000UL + #define FUNC_CFG_REQ_MIN_BW_SCALE_BITS (0x0UL << 28) + #define FUNC_CFG_REQ_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define FUNC_CFG_REQ_MIN_BW_SCALE_LAST FUNC_CFG_REQ_MIN_BW_SCALE_BYTES + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_LAST FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 
max_bw; + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_SFT 0 + #define FUNC_CFG_REQ_MAX_BW_SCALE 0x10000000UL + #define FUNC_CFG_REQ_MAX_BW_SCALE_BITS (0x0UL << 28) + #define FUNC_CFG_REQ_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define FUNC_CFG_REQ_MAX_BW_SCALE_LAST FUNC_CFG_REQ_MAX_BW_SCALE_BYTES + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_LAST FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_INVALID + __le16 async_event_cr; + u8 vlan_antispoof_mode; + #define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_NOCHECK 0x0UL + #define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_VALIDATE_VLAN 0x1UL + #define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_INSERT_IF_VLANDNE 0x2UL + #define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_INSERT_OR_OVERRIDE_VLAN 0x3UL + #define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_LAST FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_INSERT_OR_OVERRIDE_VLAN + u8 allowed_vlan_pris; + u8 evb_mode; + #define FUNC_CFG_REQ_EVB_MODE_NO_EVB 0x0UL + #define FUNC_CFG_REQ_EVB_MODE_VEB 0x1UL + #define FUNC_CFG_REQ_EVB_MODE_VEPA 0x2UL + #define FUNC_CFG_REQ_EVB_MODE_LAST FUNC_CFG_REQ_EVB_MODE_VEPA + u8 options; + #define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_MASK 0x3UL + #define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SFT 0 + #define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_64 0x0UL + #define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_128 0x1UL + #define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_LAST FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_128 + #define FUNC_CFG_REQ_OPTIONS_RSVD_MASK 0xfcUL + #define FUNC_CFG_REQ_OPTIONS_RSVD_SFT 2 + __le16 num_mcast_filters; +}; + +/* hwrm_func_cfg_output (size:128b/16B) */ +struct hwrm_func_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_func_qstats_input (size:192b/24B) */ +struct hwrm_func_qstats_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 fid; + u8 unused_0[6]; +}; + +/* hwrm_func_qstats_output (size:1408b/176B) */ +struct hwrm_func_qstats_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le64 tx_ucast_pkts; + __le64 tx_mcast_pkts; + __le64 tx_bcast_pkts; + __le64 tx_discard_pkts; + __le64 tx_drop_pkts; + __le64 tx_ucast_bytes; + __le64 tx_mcast_bytes; + __le64 tx_bcast_bytes; + __le64 rx_ucast_pkts; + __le64 rx_mcast_pkts; + __le64 rx_bcast_pkts; + __le64 rx_discard_pkts; + __le64 rx_drop_pkts; + __le64 rx_ucast_bytes; + __le64 rx_mcast_bytes; + __le64 rx_bcast_bytes; + __le64 rx_agg_pkts; + __le64 rx_agg_bytes; + __le64 rx_agg_events; + __le64 rx_agg_aborts; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_func_clr_stats_input (size:192b/24B) */ +struct hwrm_func_clr_stats_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 fid; + u8 unused_0[6]; +}; + +/* hwrm_func_clr_stats_output (size:128b/16B) */ +struct hwrm_func_clr_stats_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* 
hwrm_func_vf_resc_free_input (size:192b/24B) */ +struct hwrm_func_vf_resc_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 vf_id; + u8 unused_0[6]; +}; + +/* hwrm_func_vf_resc_free_output (size:128b/16B) */ +struct hwrm_func_vf_resc_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_func_vf_vnic_ids_query_input (size:256b/32B) */ +struct hwrm_func_vf_vnic_ids_query_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 vf_id; + u8 unused_0[2]; + __le32 max_vnic_id_cnt; + __le64 vnic_id_tbl_addr; +}; + +/* hwrm_func_vf_vnic_ids_query_output (size:128b/16B) */ +struct hwrm_func_vf_vnic_ids_query_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 vnic_id_cnt; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_func_drv_rgtr_input (size:896b/112B) */ +struct hwrm_func_drv_rgtr_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define FUNC_DRV_RGTR_REQ_FLAGS_FWD_ALL_MODE 0x1UL + #define FUNC_DRV_RGTR_REQ_FLAGS_FWD_NONE_MODE 0x2UL + #define FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE 0x4UL + __le32 enables; + #define FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE 0x1UL + #define FUNC_DRV_RGTR_REQ_ENABLES_VER 0x2UL + #define FUNC_DRV_RGTR_REQ_ENABLES_TIMESTAMP 0x4UL + #define FUNC_DRV_RGTR_REQ_ENABLES_VF_REQ_FWD 0x8UL + #define FUNC_DRV_RGTR_REQ_ENABLES_ASYNC_EVENT_FWD 0x10UL + __le16 os_type; + #define FUNC_DRV_RGTR_REQ_OS_TYPE_UNKNOWN 0x0UL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_OTHER 0x1UL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_MSDOS 0xeUL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_WINDOWS 0x12UL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_SOLARIS 0x1dUL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX 0x24UL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_FREEBSD 0x2aUL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_ESXI 0x68UL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_WIN864 0x73UL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_WIN2012R2 0x74UL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_UEFI 0x8000UL + #define FUNC_DRV_RGTR_REQ_OS_TYPE_LAST FUNC_DRV_RGTR_REQ_OS_TYPE_UEFI + u8 ver_maj_8b; + u8 ver_min_8b; + u8 ver_upd_8b; + u8 unused_0[3]; + __le32 timestamp; + u8 unused_1[4]; + __le32 vf_req_fwd[8]; + __le32 async_event_fwd[8]; + __le16 ver_maj; + __le16 ver_min; + __le16 ver_upd; + __le16 ver_patch; +}; + +/* hwrm_func_drv_rgtr_output (size:128b/16B) */ +struct hwrm_func_drv_rgtr_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_func_drv_unrgtr_input (size:192b/24B) */ +struct hwrm_func_drv_unrgtr_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define FUNC_DRV_UNRGTR_REQ_FLAGS_PREPARE_FOR_SHUTDOWN 0x1UL + u8 unused_0[4]; +}; + +/* hwrm_func_drv_unrgtr_output (size:128b/16B) */ +struct hwrm_func_drv_unrgtr_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_func_buf_rgtr_input (size:1024b/128B) */ +struct hwrm_func_buf_rgtr_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define FUNC_BUF_RGTR_REQ_ENABLES_VF_ID 0x1UL + #define FUNC_BUF_RGTR_REQ_ENABLES_ERR_BUF_ADDR 0x2UL + __le16 vf_id; + __le16 req_buf_num_pages; + __le16 req_buf_page_size; + #define 
FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_16B 0x4UL + #define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_4K 0xcUL + #define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_8K 0xdUL + #define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_64K 0x10UL + #define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_2M 0x15UL + #define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_4M 0x16UL + #define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_1G 0x1eUL + #define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_LAST FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_1G + __le16 req_buf_len; + __le16 resp_buf_len; + u8 unused_0[2]; + __le64 req_buf_page_addr0; + __le64 req_buf_page_addr1; + __le64 req_buf_page_addr2; + __le64 req_buf_page_addr3; + __le64 req_buf_page_addr4; + __le64 req_buf_page_addr5; + __le64 req_buf_page_addr6; + __le64 req_buf_page_addr7; + __le64 req_buf_page_addr8; + __le64 req_buf_page_addr9; + __le64 error_buf_addr; + __le64 resp_buf_addr; +}; + +/* hwrm_func_buf_rgtr_output (size:128b/16B) */ +struct hwrm_func_buf_rgtr_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_func_drv_qver_input (size:192b/24B) */ +struct hwrm_func_drv_qver_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 reserved; + __le16 fid; + u8 unused_0[2]; +}; + +/* hwrm_func_drv_qver_output (size:192b/24B) */ +struct hwrm_func_drv_qver_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 os_type; + #define FUNC_DRV_QVER_RESP_OS_TYPE_UNKNOWN 0x0UL + #define FUNC_DRV_QVER_RESP_OS_TYPE_OTHER 0x1UL + #define FUNC_DRV_QVER_RESP_OS_TYPE_MSDOS 0xeUL + #define FUNC_DRV_QVER_RESP_OS_TYPE_WINDOWS 0x12UL + #define FUNC_DRV_QVER_RESP_OS_TYPE_SOLARIS 0x1dUL + #define FUNC_DRV_QVER_RESP_OS_TYPE_LINUX 0x24UL + #define FUNC_DRV_QVER_RESP_OS_TYPE_FREEBSD 0x2aUL + #define FUNC_DRV_QVER_RESP_OS_TYPE_ESXI 0x68UL + #define FUNC_DRV_QVER_RESP_OS_TYPE_WIN864 0x73UL + #define FUNC_DRV_QVER_RESP_OS_TYPE_WIN2012R2 0x74UL + #define FUNC_DRV_QVER_RESP_OS_TYPE_UEFI 0x8000UL + #define FUNC_DRV_QVER_RESP_OS_TYPE_LAST FUNC_DRV_QVER_RESP_OS_TYPE_UEFI + u8 ver_maj_8b; + u8 ver_min_8b; + u8 ver_upd_8b; + u8 unused_0[2]; + u8 valid; + __le16 ver_maj; + __le16 ver_min; + __le16 ver_upd; + __le16 ver_patch; +}; + +/* hwrm_func_resource_qcaps_input (size:192b/24B) */ +struct hwrm_func_resource_qcaps_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 fid; + u8 unused_0[6]; +}; + +/* hwrm_func_resource_qcaps_output (size:448b/56B) */ +struct hwrm_func_resource_qcaps_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 max_vfs; + __le16 max_msix; + __le16 vf_reservation_strategy; + #define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MAXIMAL 0x0UL + #define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MINIMAL 0x1UL + #define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MINIMAL_STATIC 0x2UL + #define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_LAST FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MINIMAL_STATIC + __le16 min_rsscos_ctx; + __le16 max_rsscos_ctx; + __le16 min_cmpl_rings; + __le16 max_cmpl_rings; + __le16 min_tx_rings; + __le16 max_tx_rings; + __le16 min_rx_rings; + __le16 max_rx_rings; + __le16 min_l2_ctxs; + __le16 max_l2_ctxs; + __le16 min_vnics; + __le16 max_vnics; + __le16 min_stat_ctx; + __le16 max_stat_ctx; + __le16 min_hw_ring_grps; + __le16 max_hw_ring_grps; + __le16 max_tx_scheduler_inputs; + u8 unused_0[7]; + u8 valid; 
+}; + +/* hwrm_func_vf_resource_cfg_input (size:448b/56B) */ +struct hwrm_func_vf_resource_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 vf_id; + __le16 max_msix; + __le16 min_rsscos_ctx; + __le16 max_rsscos_ctx; + __le16 min_cmpl_rings; + __le16 max_cmpl_rings; + __le16 min_tx_rings; + __le16 max_tx_rings; + __le16 min_rx_rings; + __le16 max_rx_rings; + __le16 min_l2_ctxs; + __le16 max_l2_ctxs; + __le16 min_vnics; + __le16 max_vnics; + __le16 min_stat_ctx; + __le16 max_stat_ctx; + __le16 min_hw_ring_grps; + __le16 max_hw_ring_grps; + u8 unused_0[4]; +}; + +/* hwrm_func_vf_resource_cfg_output (size:256b/32B) */ +struct hwrm_func_vf_resource_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 reserved_rsscos_ctx; + __le16 reserved_cmpl_rings; + __le16 reserved_tx_rings; + __le16 reserved_rx_rings; + __le16 reserved_l2_ctxs; + __le16 reserved_vnics; + __le16 reserved_stat_ctx; + __le16 reserved_hw_ring_grps; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_port_phy_cfg_input (size:448b/56B) */ +struct hwrm_port_phy_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define PORT_PHY_CFG_REQ_FLAGS_RESET_PHY 0x1UL + #define PORT_PHY_CFG_REQ_FLAGS_DEPRECATED 0x2UL + #define PORT_PHY_CFG_REQ_FLAGS_FORCE 0x4UL + #define PORT_PHY_CFG_REQ_FLAGS_RESTART_AUTONEG 0x8UL + #define PORT_PHY_CFG_REQ_FLAGS_EEE_ENABLE 0x10UL + #define PORT_PHY_CFG_REQ_FLAGS_EEE_DISABLE 0x20UL + #define PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_ENABLE 0x40UL + #define PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_DISABLE 0x80UL + #define PORT_PHY_CFG_REQ_FLAGS_FEC_AUTONEG_ENABLE 0x100UL + #define PORT_PHY_CFG_REQ_FLAGS_FEC_AUTONEG_DISABLE 0x200UL + #define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE74_ENABLE 0x400UL + #define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE74_DISABLE 0x800UL + #define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE91_ENABLE 0x1000UL + #define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE91_DISABLE 0x2000UL + #define PORT_PHY_CFG_REQ_FLAGS_FORCE_LINK_DWN 0x4000UL + __le32 enables; + #define PORT_PHY_CFG_REQ_ENABLES_AUTO_MODE 0x1UL + #define PORT_PHY_CFG_REQ_ENABLES_AUTO_DUPLEX 0x2UL + #define PORT_PHY_CFG_REQ_ENABLES_AUTO_PAUSE 0x4UL + #define PORT_PHY_CFG_REQ_ENABLES_AUTO_LINK_SPEED 0x8UL + #define PORT_PHY_CFG_REQ_ENABLES_AUTO_LINK_SPEED_MASK 0x10UL + #define PORT_PHY_CFG_REQ_ENABLES_WIRESPEED 0x20UL + #define PORT_PHY_CFG_REQ_ENABLES_LPBK 0x40UL + #define PORT_PHY_CFG_REQ_ENABLES_PREEMPHASIS 0x80UL + #define PORT_PHY_CFG_REQ_ENABLES_FORCE_PAUSE 0x100UL + #define PORT_PHY_CFG_REQ_ENABLES_EEE_LINK_SPEED_MASK 0x200UL + #define PORT_PHY_CFG_REQ_ENABLES_TX_LPI_TIMER 0x400UL + __le16 port_id; + __le16 force_link_speed; + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_100MB 0x1UL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_1GB 0xaUL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_2GB 0x14UL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_2_5GB 0x19UL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_10GB 0x64UL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_20GB 0xc8UL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_25GB 0xfaUL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_40GB 0x190UL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_50GB 0x1f4UL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_100GB 0x3e8UL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_10MB 0xffffUL + #define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_LAST PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_10MB + u8 auto_mode; + #define PORT_PHY_CFG_REQ_AUTO_MODE_NONE 0x0UL + 
#define PORT_PHY_CFG_REQ_AUTO_MODE_ALL_SPEEDS 0x1UL + #define PORT_PHY_CFG_REQ_AUTO_MODE_ONE_SPEED 0x2UL + #define PORT_PHY_CFG_REQ_AUTO_MODE_ONE_OR_BELOW 0x3UL + #define PORT_PHY_CFG_REQ_AUTO_MODE_SPEED_MASK 0x4UL + #define PORT_PHY_CFG_REQ_AUTO_MODE_LAST PORT_PHY_CFG_REQ_AUTO_MODE_SPEED_MASK + u8 auto_duplex; + #define PORT_PHY_CFG_REQ_AUTO_DUPLEX_HALF 0x0UL + #define PORT_PHY_CFG_REQ_AUTO_DUPLEX_FULL 0x1UL + #define PORT_PHY_CFG_REQ_AUTO_DUPLEX_BOTH 0x2UL + #define PORT_PHY_CFG_REQ_AUTO_DUPLEX_LAST PORT_PHY_CFG_REQ_AUTO_DUPLEX_BOTH + u8 auto_pause; + #define PORT_PHY_CFG_REQ_AUTO_PAUSE_TX 0x1UL + #define PORT_PHY_CFG_REQ_AUTO_PAUSE_RX 0x2UL + #define PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE 0x4UL + u8 unused_0; + __le16 auto_link_speed; + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_100MB 0x1UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_1GB 0xaUL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_2GB 0x14UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_2_5GB 0x19UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_10GB 0x64UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_20GB 0xc8UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_25GB 0xfaUL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_40GB 0x190UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_50GB 0x1f4UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_100GB 0x3e8UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_10MB 0xffffUL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_LAST PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_10MB + __le16 auto_link_speed_mask; + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_100MBHD 0x1UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_100MB 0x2UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_1GBHD 0x4UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_1GB 0x8UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_2GB 0x10UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_2_5GB 0x20UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_10GB 0x40UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_20GB 0x80UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_25GB 0x100UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_40GB 0x200UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_50GB 0x400UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_100GB 0x800UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_10MBHD 0x1000UL + #define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_10MB 0x2000UL + u8 wirespeed; + #define PORT_PHY_CFG_REQ_WIRESPEED_OFF 0x0UL + #define PORT_PHY_CFG_REQ_WIRESPEED_ON 0x1UL + #define PORT_PHY_CFG_REQ_WIRESPEED_LAST PORT_PHY_CFG_REQ_WIRESPEED_ON + u8 lpbk; + #define PORT_PHY_CFG_REQ_LPBK_NONE 0x0UL + #define PORT_PHY_CFG_REQ_LPBK_LOCAL 0x1UL + #define PORT_PHY_CFG_REQ_LPBK_REMOTE 0x2UL + #define PORT_PHY_CFG_REQ_LPBK_LAST PORT_PHY_CFG_REQ_LPBK_REMOTE + u8 force_pause; + #define PORT_PHY_CFG_REQ_FORCE_PAUSE_TX 0x1UL + #define PORT_PHY_CFG_REQ_FORCE_PAUSE_RX 0x2UL + u8 unused_1; + __le32 preemphasis; + __le16 eee_link_speed_mask; + #define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD1 0x1UL + #define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_100MB 0x2UL + #define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD2 0x4UL + #define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_1GB 0x8UL + #define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD3 0x10UL + #define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD4 0x20UL + #define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_10GB 0x40UL + u8 unused_2[2]; + __le32 tx_lpi_timer; + #define PORT_PHY_CFG_REQ_TX_LPI_TIMER_MASK 0xffffffUL + #define PORT_PHY_CFG_REQ_TX_LPI_TIMER_SFT 0 + __le32 unused_3; +}; + +/* hwrm_port_phy_cfg_output (size:128b/16B) */ +struct 
hwrm_port_phy_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_port_phy_cfg_cmd_err (size:64b/8B) */ +struct hwrm_port_phy_cfg_cmd_err { + u8 code; + #define PORT_PHY_CFG_CMD_ERR_CODE_UNKNOWN 0x0UL + #define PORT_PHY_CFG_CMD_ERR_CODE_ILLEGAL_SPEED 0x1UL + #define PORT_PHY_CFG_CMD_ERR_CODE_RETRY 0x2UL + #define PORT_PHY_CFG_CMD_ERR_CODE_LAST PORT_PHY_CFG_CMD_ERR_CODE_RETRY + u8 unused_0[7]; +}; + +/* hwrm_port_phy_qcfg_input (size:192b/24B) */ +struct hwrm_port_phy_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; +}; + +/* hwrm_port_phy_qcfg_output (size:768b/96B) */ +struct hwrm_port_phy_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 link; + #define PORT_PHY_QCFG_RESP_LINK_NO_LINK 0x0UL + #define PORT_PHY_QCFG_RESP_LINK_SIGNAL 0x1UL + #define PORT_PHY_QCFG_RESP_LINK_LINK 0x2UL + #define PORT_PHY_QCFG_RESP_LINK_LAST PORT_PHY_QCFG_RESP_LINK_LINK + u8 unused_0; + __le16 link_speed; + #define PORT_PHY_QCFG_RESP_LINK_SPEED_100MB 0x1UL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_1GB 0xaUL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_2GB 0x14UL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_2_5GB 0x19UL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_10GB 0x64UL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_20GB 0xc8UL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_25GB 0xfaUL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_40GB 0x190UL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_50GB 0x1f4UL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_100GB 0x3e8UL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_10MB 0xffffUL + #define PORT_PHY_QCFG_RESP_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_LINK_SPEED_10MB + u8 duplex_cfg; + #define PORT_PHY_QCFG_RESP_DUPLEX_CFG_HALF 0x0UL + #define PORT_PHY_QCFG_RESP_DUPLEX_CFG_FULL 0x1UL + #define PORT_PHY_QCFG_RESP_DUPLEX_CFG_LAST PORT_PHY_QCFG_RESP_DUPLEX_CFG_FULL + u8 pause; + #define PORT_PHY_QCFG_RESP_PAUSE_TX 0x1UL + #define PORT_PHY_QCFG_RESP_PAUSE_RX 0x2UL + __le16 support_speeds; + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100MBHD 0x1UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100MB 0x2UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_1GBHD 0x4UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_1GB 0x8UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2GB 0x10UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2_5GB 0x20UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_10GB 0x40UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_20GB 0x80UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_25GB 0x100UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_40GB 0x200UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_50GB 0x400UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100GB 0x800UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_10MBHD 0x1000UL + #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_10MB 0x2000UL + __le16 force_link_speed; + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_100MB 0x1UL + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_1GB 0xaUL + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_2GB 0x14UL + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_2_5GB 0x19UL + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_10GB 0x64UL + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_20GB 0xc8UL + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_25GB 0xfaUL + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_40GB 0x190UL + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_50GB 0x1f4UL + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_100GB 0x3e8UL + #define 
PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_10MB 0xffffUL + #define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_10MB + u8 auto_mode; + #define PORT_PHY_QCFG_RESP_AUTO_MODE_NONE 0x0UL + #define PORT_PHY_QCFG_RESP_AUTO_MODE_ALL_SPEEDS 0x1UL + #define PORT_PHY_QCFG_RESP_AUTO_MODE_ONE_SPEED 0x2UL + #define PORT_PHY_QCFG_RESP_AUTO_MODE_ONE_OR_BELOW 0x3UL + #define PORT_PHY_QCFG_RESP_AUTO_MODE_SPEED_MASK 0x4UL + #define PORT_PHY_QCFG_RESP_AUTO_MODE_LAST PORT_PHY_QCFG_RESP_AUTO_MODE_SPEED_MASK + u8 auto_pause; + #define PORT_PHY_QCFG_RESP_AUTO_PAUSE_TX 0x1UL + #define PORT_PHY_QCFG_RESP_AUTO_PAUSE_RX 0x2UL + #define PORT_PHY_QCFG_RESP_AUTO_PAUSE_AUTONEG_PAUSE 0x4UL + __le16 auto_link_speed; + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_100MB 0x1UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_1GB 0xaUL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_2GB 0x14UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_2_5GB 0x19UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_10GB 0x64UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_20GB 0xc8UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_25GB 0xfaUL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_40GB 0x190UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_50GB 0x1f4UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_100GB 0x3e8UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_10MB 0xffffUL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_10MB + __le16 auto_link_speed_mask; + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_100MBHD 0x1UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_100MB 0x2UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_1GBHD 0x4UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_1GB 0x8UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_2GB 0x10UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_2_5GB 0x20UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_10GB 0x40UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_20GB 0x80UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_25GB 0x100UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_40GB 0x200UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_50GB 0x400UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_100GB 0x800UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_10MBHD 0x1000UL + #define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_10MB 0x2000UL + u8 wirespeed; + #define PORT_PHY_QCFG_RESP_WIRESPEED_OFF 0x0UL + #define PORT_PHY_QCFG_RESP_WIRESPEED_ON 0x1UL + #define PORT_PHY_QCFG_RESP_WIRESPEED_LAST PORT_PHY_QCFG_RESP_WIRESPEED_ON + u8 lpbk; + #define PORT_PHY_QCFG_RESP_LPBK_NONE 0x0UL + #define PORT_PHY_QCFG_RESP_LPBK_LOCAL 0x1UL + #define PORT_PHY_QCFG_RESP_LPBK_REMOTE 0x2UL + #define PORT_PHY_QCFG_RESP_LPBK_LAST PORT_PHY_QCFG_RESP_LPBK_REMOTE + u8 force_pause; + #define PORT_PHY_QCFG_RESP_FORCE_PAUSE_TX 0x1UL + #define PORT_PHY_QCFG_RESP_FORCE_PAUSE_RX 0x2UL + u8 module_status; + #define PORT_PHY_QCFG_RESP_MODULE_STATUS_NONE 0x0UL + #define PORT_PHY_QCFG_RESP_MODULE_STATUS_DISABLETX 0x1UL + #define PORT_PHY_QCFG_RESP_MODULE_STATUS_WARNINGMSG 0x2UL + #define PORT_PHY_QCFG_RESP_MODULE_STATUS_PWRDOWN 0x3UL + #define PORT_PHY_QCFG_RESP_MODULE_STATUS_NOTINSERTED 0x4UL + #define PORT_PHY_QCFG_RESP_MODULE_STATUS_NOTAPPLICABLE 0xffUL + #define PORT_PHY_QCFG_RESP_MODULE_STATUS_LAST PORT_PHY_QCFG_RESP_MODULE_STATUS_NOTAPPLICABLE + __le32 preemphasis; + u8 phy_maj; + u8 phy_min; + u8 phy_bld; + u8 phy_type; + #define PORT_PHY_QCFG_RESP_PHY_TYPE_UNKNOWN 0x0UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_BASECR 0x1UL + #define 
PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKR4 0x2UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_BASELR 0x3UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_BASESR 0x4UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKR2 0x5UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKX 0x6UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKR 0x7UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_BASET 0x8UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_BASETE 0x9UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_SGMIIEXTPHY 0xaUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASECR_CA_L 0xbUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASECR_CA_S 0xcUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASECR_CA_N 0xdUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASESR 0xeUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASECR4 0xfUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASESR4 0x10UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASELR4 0x11UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASEER4 0x12UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASESR10 0x13UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASECR4 0x14UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASESR4 0x15UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASELR4 0x16UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASEER4 0x17UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_ACTIVE_CABLE 0x18UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_1G_BASET 0x19UL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_1G_BASESX 0x1aUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_1G_BASECX 0x1bUL + #define PORT_PHY_QCFG_RESP_PHY_TYPE_LAST PORT_PHY_QCFG_RESP_PHY_TYPE_1G_BASECX + u8 media_type; + #define PORT_PHY_QCFG_RESP_MEDIA_TYPE_UNKNOWN 0x0UL + #define PORT_PHY_QCFG_RESP_MEDIA_TYPE_TP 0x1UL + #define PORT_PHY_QCFG_RESP_MEDIA_TYPE_DAC 0x2UL + #define PORT_PHY_QCFG_RESP_MEDIA_TYPE_FIBRE 0x3UL + #define PORT_PHY_QCFG_RESP_MEDIA_TYPE_LAST PORT_PHY_QCFG_RESP_MEDIA_TYPE_FIBRE + u8 xcvr_pkg_type; + #define PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_XCVR_INTERNAL 0x1UL + #define PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_XCVR_EXTERNAL 0x2UL + #define PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_LAST PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_XCVR_EXTERNAL + u8 eee_config_phy_addr; + #define PORT_PHY_QCFG_RESP_PHY_ADDR_MASK 0x1fUL + #define PORT_PHY_QCFG_RESP_PHY_ADDR_SFT 0 + #define PORT_PHY_QCFG_RESP_EEE_CONFIG_MASK 0xe0UL + #define PORT_PHY_QCFG_RESP_EEE_CONFIG_SFT 5 + #define PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_ENABLED 0x20UL + #define PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_ACTIVE 0x40UL + #define PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_TX_LPI 0x80UL + u8 parallel_detect; + #define PORT_PHY_QCFG_RESP_PARALLEL_DETECT 0x1UL + __le16 link_partner_adv_speeds; + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_100MBHD 0x1UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_100MB 0x2UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_1GBHD 0x4UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_1GB 0x8UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_2GB 0x10UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_2_5GB 0x20UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_10GB 0x40UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_20GB 0x80UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_25GB 0x100UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_40GB 0x200UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_50GB 0x400UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_100GB 0x800UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_10MBHD 0x1000UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_10MB 0x2000UL + u8 link_partner_adv_auto_mode; + #define 
PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_NONE 0x0UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_ALL_SPEEDS 0x1UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_ONE_SPEED 0x2UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_ONE_OR_BELOW 0x3UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_SPEED_MASK 0x4UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_LAST PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_SPEED_MASK + u8 link_partner_adv_pause; + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_PAUSE_TX 0x1UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_PAUSE_RX 0x2UL + __le16 adv_eee_link_speed_mask; + #define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD1 0x1UL + #define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_100MB 0x2UL + #define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD2 0x4UL + #define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_1GB 0x8UL + #define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD3 0x10UL + #define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD4 0x20UL + #define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_10GB 0x40UL + __le16 link_partner_adv_eee_link_speed_mask; + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD1 0x1UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_100MB 0x2UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD2 0x4UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_1GB 0x8UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD3 0x10UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD4 0x20UL + #define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_10GB 0x40UL + __le32 xcvr_identifier_type_tx_lpi_timer; + #define PORT_PHY_QCFG_RESP_TX_LPI_TIMER_MASK 0xffffffUL + #define PORT_PHY_QCFG_RESP_TX_LPI_TIMER_SFT 0 + #define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_MASK 0xff000000UL + #define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_SFT 24 + #define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_UNKNOWN (0x0UL << 24) + #define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_SFP (0x3UL << 24) + #define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFP (0xcUL << 24) + #define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFPPLUS (0xdUL << 24) + #define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFP28 (0x11UL << 24) + #define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_LAST PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFP28 + __le16 fec_cfg; + #define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_NONE_SUPPORTED 0x1UL + #define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_AUTONEG_SUPPORTED 0x2UL + #define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_AUTONEG_ENABLED 0x4UL + #define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE74_SUPPORTED 0x8UL + #define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE74_ENABLED 0x10UL + #define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE91_SUPPORTED 0x20UL + #define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE91_ENABLED 0x40UL + u8 duplex_state; + #define PORT_PHY_QCFG_RESP_DUPLEX_STATE_HALF 0x0UL + #define PORT_PHY_QCFG_RESP_DUPLEX_STATE_FULL 0x1UL + #define PORT_PHY_QCFG_RESP_DUPLEX_STATE_LAST PORT_PHY_QCFG_RESP_DUPLEX_STATE_FULL + u8 option_flags; + #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_MEDIA_AUTO_DETECT 0x1UL + char phy_vendor_name[16]; + char phy_vendor_partnumber[16]; + u8 unused_2[7]; + u8 valid; +}; + +/* hwrm_port_mac_cfg_input (size:320b/40B) */ +struct hwrm_port_mac_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define PORT_MAC_CFG_REQ_FLAGS_MATCH_LINK 0x1UL + #define 
PORT_MAC_CFG_REQ_FLAGS_VLAN_PRI2COS_ENABLE 0x2UL + #define PORT_MAC_CFG_REQ_FLAGS_TUNNEL_PRI2COS_ENABLE 0x4UL + #define PORT_MAC_CFG_REQ_FLAGS_IP_DSCP2COS_ENABLE 0x8UL + #define PORT_MAC_CFG_REQ_FLAGS_PTP_RX_TS_CAPTURE_ENABLE 0x10UL + #define PORT_MAC_CFG_REQ_FLAGS_PTP_RX_TS_CAPTURE_DISABLE 0x20UL + #define PORT_MAC_CFG_REQ_FLAGS_PTP_TX_TS_CAPTURE_ENABLE 0x40UL + #define PORT_MAC_CFG_REQ_FLAGS_PTP_TX_TS_CAPTURE_DISABLE 0x80UL + #define PORT_MAC_CFG_REQ_FLAGS_OOB_WOL_ENABLE 0x100UL + #define PORT_MAC_CFG_REQ_FLAGS_OOB_WOL_DISABLE 0x200UL + #define PORT_MAC_CFG_REQ_FLAGS_VLAN_PRI2COS_DISABLE 0x400UL + #define PORT_MAC_CFG_REQ_FLAGS_TUNNEL_PRI2COS_DISABLE 0x800UL + #define PORT_MAC_CFG_REQ_FLAGS_IP_DSCP2COS_DISABLE 0x1000UL + __le32 enables; + #define PORT_MAC_CFG_REQ_ENABLES_IPG 0x1UL + #define PORT_MAC_CFG_REQ_ENABLES_LPBK 0x2UL + #define PORT_MAC_CFG_REQ_ENABLES_VLAN_PRI2COS_MAP_PRI 0x4UL + #define PORT_MAC_CFG_REQ_ENABLES_TUNNEL_PRI2COS_MAP_PRI 0x10UL + #define PORT_MAC_CFG_REQ_ENABLES_DSCP2COS_MAP_PRI 0x20UL + #define PORT_MAC_CFG_REQ_ENABLES_RX_TS_CAPTURE_PTP_MSG_TYPE 0x40UL + #define PORT_MAC_CFG_REQ_ENABLES_TX_TS_CAPTURE_PTP_MSG_TYPE 0x80UL + #define PORT_MAC_CFG_REQ_ENABLES_COS_FIELD_CFG 0x100UL + __le16 port_id; + u8 ipg; + u8 lpbk; + #define PORT_MAC_CFG_REQ_LPBK_NONE 0x0UL + #define PORT_MAC_CFG_REQ_LPBK_LOCAL 0x1UL + #define PORT_MAC_CFG_REQ_LPBK_REMOTE 0x2UL + #define PORT_MAC_CFG_REQ_LPBK_LAST PORT_MAC_CFG_REQ_LPBK_REMOTE + u8 vlan_pri2cos_map_pri; + u8 reserved1; + u8 tunnel_pri2cos_map_pri; + u8 dscp2pri_map_pri; + __le16 rx_ts_capture_ptp_msg_type; + __le16 tx_ts_capture_ptp_msg_type; + u8 cos_field_cfg; + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_RSVD1 0x1UL + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_MASK 0x6UL + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_SFT 1 + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_INNERMOST (0x0UL << 1) + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_OUTER (0x1UL << 1) + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_OUTERMOST (0x2UL << 1) + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_UNSPECIFIED (0x3UL << 1) + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_LAST PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_UNSPECIFIED + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_MASK 0x18UL + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_SFT 3 + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_INNERMOST (0x0UL << 3) + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_OUTER (0x1UL << 3) + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_OUTERMOST (0x2UL << 3) + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_UNSPECIFIED (0x3UL << 3) + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_LAST PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_UNSPECIFIED + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_DEFAULT_COS_MASK 0xe0UL + #define PORT_MAC_CFG_REQ_COS_FIELD_CFG_DEFAULT_COS_SFT 5 + u8 unused_0[3]; +}; + +/* hwrm_port_mac_cfg_output (size:128b/16B) */ +struct hwrm_port_mac_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 mru; + __le16 mtu; + u8 ipg; + u8 lpbk; + #define PORT_MAC_CFG_RESP_LPBK_NONE 0x0UL + #define PORT_MAC_CFG_RESP_LPBK_LOCAL 0x1UL + #define PORT_MAC_CFG_RESP_LPBK_REMOTE 0x2UL + #define PORT_MAC_CFG_RESP_LPBK_LAST PORT_MAC_CFG_RESP_LPBK_REMOTE + u8 unused_0; + u8 valid; +}; + +/* hwrm_port_mac_ptp_qcfg_input (size:192b/24B) */ +struct hwrm_port_mac_ptp_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + 
__le16 target_id; + __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; +}; + +/* hwrm_port_mac_ptp_qcfg_output (size:640b/80B) */ +struct hwrm_port_mac_ptp_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 flags; + #define PORT_MAC_PTP_QCFG_RESP_FLAGS_DIRECT_ACCESS 0x1UL + #define PORT_MAC_PTP_QCFG_RESP_FLAGS_HWRM_ACCESS 0x2UL + u8 unused_0[3]; + __le32 rx_ts_reg_off_lower; + __le32 rx_ts_reg_off_upper; + __le32 rx_ts_reg_off_seq_id; + __le32 rx_ts_reg_off_src_id_0; + __le32 rx_ts_reg_off_src_id_1; + __le32 rx_ts_reg_off_src_id_2; + __le32 rx_ts_reg_off_domain_id; + __le32 rx_ts_reg_off_fifo; + __le32 rx_ts_reg_off_fifo_adv; + __le32 rx_ts_reg_off_granularity; + __le32 tx_ts_reg_off_lower; + __le32 tx_ts_reg_off_upper; + __le32 tx_ts_reg_off_seq_id; + __le32 tx_ts_reg_off_fifo; + __le32 tx_ts_reg_off_granularity; + u8 unused_1[7]; + u8 valid; +}; + +/* hwrm_port_qstats_input (size:320b/40B) */ +struct hwrm_port_qstats_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; + __le64 tx_stat_host_addr; + __le64 rx_stat_host_addr; +}; + +/* hwrm_port_qstats_output (size:128b/16B) */ +struct hwrm_port_qstats_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 tx_stat_size; + __le16 rx_stat_size; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_port_qstats_ext_input (size:320b/40B) */ +struct hwrm_port_qstats_ext_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + __le16 tx_stat_size; + __le16 rx_stat_size; + u8 unused_0[2]; + __le64 tx_stat_host_addr; + __le64 rx_stat_host_addr; +}; + +/* hwrm_port_qstats_ext_output (size:128b/16B) */ +struct hwrm_port_qstats_ext_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 tx_stat_size; + __le16 rx_stat_size; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_port_lpbk_qstats_input (size:128b/16B) */ +struct hwrm_port_lpbk_qstats_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_port_lpbk_qstats_output (size:768b/96B) */ +struct hwrm_port_lpbk_qstats_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le64 lpbk_ucast_frames; + __le64 lpbk_mcast_frames; + __le64 lpbk_bcast_frames; + __le64 lpbk_ucast_bytes; + __le64 lpbk_mcast_bytes; + __le64 lpbk_bcast_bytes; + __le64 tx_stat_discard; + __le64 tx_stat_error; + __le64 rx_stat_discard; + __le64 rx_stat_error; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_port_clr_stats_input (size:192b/24B) */ +struct hwrm_port_clr_stats_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; +}; + +/* hwrm_port_clr_stats_output (size:128b/16B) */ +struct hwrm_port_clr_stats_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_port_lpbk_clr_stats_input (size:128b/16B) */ +struct hwrm_port_lpbk_clr_stats_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_port_lpbk_clr_stats_output (size:128b/16B) */ +struct hwrm_port_lpbk_clr_stats_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_port_phy_qcaps_input (size:192b/24B) */ +struct 
hwrm_port_phy_qcaps_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; +}; + +/* hwrm_port_phy_qcaps_output (size:192b/24B) */ +struct hwrm_port_phy_qcaps_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 flags; + #define PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED 0x1UL + #define PORT_PHY_QCAPS_RESP_FLAGS_RSVD1_MASK 0xfeUL + #define PORT_PHY_QCAPS_RESP_FLAGS_RSVD1_SFT 1 + u8 port_cnt; + #define PORT_PHY_QCAPS_RESP_PORT_CNT_UNKNOWN 0x0UL + #define PORT_PHY_QCAPS_RESP_PORT_CNT_1 0x1UL + #define PORT_PHY_QCAPS_RESP_PORT_CNT_2 0x2UL + #define PORT_PHY_QCAPS_RESP_PORT_CNT_3 0x3UL + #define PORT_PHY_QCAPS_RESP_PORT_CNT_4 0x4UL + #define PORT_PHY_QCAPS_RESP_PORT_CNT_LAST PORT_PHY_QCAPS_RESP_PORT_CNT_4 + __le16 supported_speeds_force_mode; + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100MBHD 0x1UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100MB 0x2UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_1GBHD 0x4UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_1GB 0x8UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_2GB 0x10UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_2_5GB 0x20UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_10GB 0x40UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_20GB 0x80UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_25GB 0x100UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_40GB 0x200UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_50GB 0x400UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100GB 0x800UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_10MBHD 0x1000UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_10MB 0x2000UL + __le16 supported_speeds_auto_mode; + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_100MBHD 0x1UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_100MB 0x2UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_1GBHD 0x4UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_1GB 0x8UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_2GB 0x10UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_2_5GB 0x20UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_10GB 0x40UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_20GB 0x80UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_25GB 0x100UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_40GB 0x200UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_50GB 0x400UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_100GB 0x800UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_10MBHD 0x1000UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_10MB 0x2000UL + __le16 supported_speeds_eee_mode; + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD1 0x1UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_100MB 0x2UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD2 0x4UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_1GB 0x8UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD3 0x10UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD4 0x20UL + #define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_10GB 0x40UL + __le32 tx_lpi_timer_low; + #define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_LOW_MASK 0xffffffUL + #define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_LOW_SFT 0 + #define 
PORT_PHY_QCAPS_RESP_RSVD2_MASK 0xff000000UL + #define PORT_PHY_QCAPS_RESP_RSVD2_SFT 24 + __le32 valid_tx_lpi_timer_high; + #define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_MASK 0xffffffUL + #define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_SFT 0 + #define PORT_PHY_QCAPS_RESP_VALID_MASK 0xff000000UL + #define PORT_PHY_QCAPS_RESP_VALID_SFT 24 +}; + +/* hwrm_port_phy_i2c_read_input (size:320b/40B) */ +struct hwrm_port_phy_i2c_read_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + __le32 enables; + #define PORT_PHY_I2C_READ_REQ_ENABLES_PAGE_OFFSET 0x1UL + __le16 port_id; + u8 i2c_slave_addr; + u8 unused_0; + __le16 page_number; + __le16 page_offset; + u8 data_length; + u8 unused_1[7]; +}; + +/* hwrm_port_phy_i2c_read_output (size:640b/80B) */ +struct hwrm_port_phy_i2c_read_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 data[16]; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_port_led_cfg_input (size:512b/64B) */ +struct hwrm_port_led_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define PORT_LED_CFG_REQ_ENABLES_LED0_ID 0x1UL + #define PORT_LED_CFG_REQ_ENABLES_LED0_STATE 0x2UL + #define PORT_LED_CFG_REQ_ENABLES_LED0_COLOR 0x4UL + #define PORT_LED_CFG_REQ_ENABLES_LED0_BLINK_ON 0x8UL + #define PORT_LED_CFG_REQ_ENABLES_LED0_BLINK_OFF 0x10UL + #define PORT_LED_CFG_REQ_ENABLES_LED0_GROUP_ID 0x20UL + #define PORT_LED_CFG_REQ_ENABLES_LED1_ID 0x40UL + #define PORT_LED_CFG_REQ_ENABLES_LED1_STATE 0x80UL + #define PORT_LED_CFG_REQ_ENABLES_LED1_COLOR 0x100UL + #define PORT_LED_CFG_REQ_ENABLES_LED1_BLINK_ON 0x200UL + #define PORT_LED_CFG_REQ_ENABLES_LED1_BLINK_OFF 0x400UL + #define PORT_LED_CFG_REQ_ENABLES_LED1_GROUP_ID 0x800UL + #define PORT_LED_CFG_REQ_ENABLES_LED2_ID 0x1000UL + #define PORT_LED_CFG_REQ_ENABLES_LED2_STATE 0x2000UL + #define PORT_LED_CFG_REQ_ENABLES_LED2_COLOR 0x4000UL + #define PORT_LED_CFG_REQ_ENABLES_LED2_BLINK_ON 0x8000UL + #define PORT_LED_CFG_REQ_ENABLES_LED2_BLINK_OFF 0x10000UL + #define PORT_LED_CFG_REQ_ENABLES_LED2_GROUP_ID 0x20000UL + #define PORT_LED_CFG_REQ_ENABLES_LED3_ID 0x40000UL + #define PORT_LED_CFG_REQ_ENABLES_LED3_STATE 0x80000UL + #define PORT_LED_CFG_REQ_ENABLES_LED3_COLOR 0x100000UL + #define PORT_LED_CFG_REQ_ENABLES_LED3_BLINK_ON 0x200000UL + #define PORT_LED_CFG_REQ_ENABLES_LED3_BLINK_OFF 0x400000UL + #define PORT_LED_CFG_REQ_ENABLES_LED3_GROUP_ID 0x800000UL + __le16 port_id; + u8 num_leds; + u8 rsvd; + u8 led0_id; + u8 led0_state; + #define PORT_LED_CFG_REQ_LED0_STATE_DEFAULT 0x0UL + #define PORT_LED_CFG_REQ_LED0_STATE_OFF 0x1UL + #define PORT_LED_CFG_REQ_LED0_STATE_ON 0x2UL + #define PORT_LED_CFG_REQ_LED0_STATE_BLINK 0x3UL + #define PORT_LED_CFG_REQ_LED0_STATE_BLINKALT 0x4UL + #define PORT_LED_CFG_REQ_LED0_STATE_LAST PORT_LED_CFG_REQ_LED0_STATE_BLINKALT + u8 led0_color; + #define PORT_LED_CFG_REQ_LED0_COLOR_DEFAULT 0x0UL + #define PORT_LED_CFG_REQ_LED0_COLOR_AMBER 0x1UL + #define PORT_LED_CFG_REQ_LED0_COLOR_GREEN 0x2UL + #define PORT_LED_CFG_REQ_LED0_COLOR_GREENAMBER 0x3UL + #define PORT_LED_CFG_REQ_LED0_COLOR_LAST PORT_LED_CFG_REQ_LED0_COLOR_GREENAMBER + u8 unused_0; + __le16 led0_blink_on; + __le16 led0_blink_off; + u8 led0_group_id; + u8 rsvd0; + u8 led1_id; + u8 led1_state; + #define PORT_LED_CFG_REQ_LED1_STATE_DEFAULT 0x0UL + #define PORT_LED_CFG_REQ_LED1_STATE_OFF 0x1UL + #define PORT_LED_CFG_REQ_LED1_STATE_ON 0x2UL + #define PORT_LED_CFG_REQ_LED1_STATE_BLINK 0x3UL + 
#define PORT_LED_CFG_REQ_LED1_STATE_BLINKALT 0x4UL + #define PORT_LED_CFG_REQ_LED1_STATE_LAST PORT_LED_CFG_REQ_LED1_STATE_BLINKALT + u8 led1_color; + #define PORT_LED_CFG_REQ_LED1_COLOR_DEFAULT 0x0UL + #define PORT_LED_CFG_REQ_LED1_COLOR_AMBER 0x1UL + #define PORT_LED_CFG_REQ_LED1_COLOR_GREEN 0x2UL + #define PORT_LED_CFG_REQ_LED1_COLOR_GREENAMBER 0x3UL + #define PORT_LED_CFG_REQ_LED1_COLOR_LAST PORT_LED_CFG_REQ_LED1_COLOR_GREENAMBER + u8 unused_1; + __le16 led1_blink_on; + __le16 led1_blink_off; + u8 led1_group_id; + u8 rsvd1; + u8 led2_id; + u8 led2_state; + #define PORT_LED_CFG_REQ_LED2_STATE_DEFAULT 0x0UL + #define PORT_LED_CFG_REQ_LED2_STATE_OFF 0x1UL + #define PORT_LED_CFG_REQ_LED2_STATE_ON 0x2UL + #define PORT_LED_CFG_REQ_LED2_STATE_BLINK 0x3UL + #define PORT_LED_CFG_REQ_LED2_STATE_BLINKALT 0x4UL + #define PORT_LED_CFG_REQ_LED2_STATE_LAST PORT_LED_CFG_REQ_LED2_STATE_BLINKALT + u8 led2_color; + #define PORT_LED_CFG_REQ_LED2_COLOR_DEFAULT 0x0UL + #define PORT_LED_CFG_REQ_LED2_COLOR_AMBER 0x1UL + #define PORT_LED_CFG_REQ_LED2_COLOR_GREEN 0x2UL + #define PORT_LED_CFG_REQ_LED2_COLOR_GREENAMBER 0x3UL + #define PORT_LED_CFG_REQ_LED2_COLOR_LAST PORT_LED_CFG_REQ_LED2_COLOR_GREENAMBER + u8 unused_2; + __le16 led2_blink_on; + __le16 led2_blink_off; + u8 led2_group_id; + u8 rsvd2; + u8 led3_id; + u8 led3_state; + #define PORT_LED_CFG_REQ_LED3_STATE_DEFAULT 0x0UL + #define PORT_LED_CFG_REQ_LED3_STATE_OFF 0x1UL + #define PORT_LED_CFG_REQ_LED3_STATE_ON 0x2UL + #define PORT_LED_CFG_REQ_LED3_STATE_BLINK 0x3UL + #define PORT_LED_CFG_REQ_LED3_STATE_BLINKALT 0x4UL + #define PORT_LED_CFG_REQ_LED3_STATE_LAST PORT_LED_CFG_REQ_LED3_STATE_BLINKALT + u8 led3_color; + #define PORT_LED_CFG_REQ_LED3_COLOR_DEFAULT 0x0UL + #define PORT_LED_CFG_REQ_LED3_COLOR_AMBER 0x1UL + #define PORT_LED_CFG_REQ_LED3_COLOR_GREEN 0x2UL + #define PORT_LED_CFG_REQ_LED3_COLOR_GREENAMBER 0x3UL + #define PORT_LED_CFG_REQ_LED3_COLOR_LAST PORT_LED_CFG_REQ_LED3_COLOR_GREENAMBER + u8 unused_3; + __le16 led3_blink_on; + __le16 led3_blink_off; + u8 led3_group_id; + u8 rsvd3; +}; + +/* hwrm_port_led_cfg_output (size:128b/16B) */ +struct hwrm_port_led_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_port_led_qcfg_input (size:192b/24B) */ +struct hwrm_port_led_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; +}; + +/* hwrm_port_led_qcfg_output (size:448b/56B) */ +struct hwrm_port_led_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 num_leds; + u8 led0_id; + u8 led0_type; + #define PORT_LED_QCFG_RESP_LED0_TYPE_SPEED 0x0UL + #define PORT_LED_QCFG_RESP_LED0_TYPE_ACTIVITY 0x1UL + #define PORT_LED_QCFG_RESP_LED0_TYPE_INVALID 0xffUL + #define PORT_LED_QCFG_RESP_LED0_TYPE_LAST PORT_LED_QCFG_RESP_LED0_TYPE_INVALID + u8 led0_state; + #define PORT_LED_QCFG_RESP_LED0_STATE_DEFAULT 0x0UL + #define PORT_LED_QCFG_RESP_LED0_STATE_OFF 0x1UL + #define PORT_LED_QCFG_RESP_LED0_STATE_ON 0x2UL + #define PORT_LED_QCFG_RESP_LED0_STATE_BLINK 0x3UL + #define PORT_LED_QCFG_RESP_LED0_STATE_BLINKALT 0x4UL + #define PORT_LED_QCFG_RESP_LED0_STATE_LAST PORT_LED_QCFG_RESP_LED0_STATE_BLINKALT + u8 led0_color; + #define PORT_LED_QCFG_RESP_LED0_COLOR_DEFAULT 0x0UL + #define PORT_LED_QCFG_RESP_LED0_COLOR_AMBER 0x1UL + #define PORT_LED_QCFG_RESP_LED0_COLOR_GREEN 0x2UL + #define PORT_LED_QCFG_RESP_LED0_COLOR_GREENAMBER 0x3UL + #define 
PORT_LED_QCFG_RESP_LED0_COLOR_LAST PORT_LED_QCFG_RESP_LED0_COLOR_GREENAMBER + u8 unused_0; + __le16 led0_blink_on; + __le16 led0_blink_off; + u8 led0_group_id; + u8 led1_id; + u8 led1_type; + #define PORT_LED_QCFG_RESP_LED1_TYPE_SPEED 0x0UL + #define PORT_LED_QCFG_RESP_LED1_TYPE_ACTIVITY 0x1UL + #define PORT_LED_QCFG_RESP_LED1_TYPE_INVALID 0xffUL + #define PORT_LED_QCFG_RESP_LED1_TYPE_LAST PORT_LED_QCFG_RESP_LED1_TYPE_INVALID + u8 led1_state; + #define PORT_LED_QCFG_RESP_LED1_STATE_DEFAULT 0x0UL + #define PORT_LED_QCFG_RESP_LED1_STATE_OFF 0x1UL + #define PORT_LED_QCFG_RESP_LED1_STATE_ON 0x2UL + #define PORT_LED_QCFG_RESP_LED1_STATE_BLINK 0x3UL + #define PORT_LED_QCFG_RESP_LED1_STATE_BLINKALT 0x4UL + #define PORT_LED_QCFG_RESP_LED1_STATE_LAST PORT_LED_QCFG_RESP_LED1_STATE_BLINKALT + u8 led1_color; + #define PORT_LED_QCFG_RESP_LED1_COLOR_DEFAULT 0x0UL + #define PORT_LED_QCFG_RESP_LED1_COLOR_AMBER 0x1UL + #define PORT_LED_QCFG_RESP_LED1_COLOR_GREEN 0x2UL + #define PORT_LED_QCFG_RESP_LED1_COLOR_GREENAMBER 0x3UL + #define PORT_LED_QCFG_RESP_LED1_COLOR_LAST PORT_LED_QCFG_RESP_LED1_COLOR_GREENAMBER + u8 unused_1; + __le16 led1_blink_on; + __le16 led1_blink_off; + u8 led1_group_id; + u8 led2_id; + u8 led2_type; + #define PORT_LED_QCFG_RESP_LED2_TYPE_SPEED 0x0UL + #define PORT_LED_QCFG_RESP_LED2_TYPE_ACTIVITY 0x1UL + #define PORT_LED_QCFG_RESP_LED2_TYPE_INVALID 0xffUL + #define PORT_LED_QCFG_RESP_LED2_TYPE_LAST PORT_LED_QCFG_RESP_LED2_TYPE_INVALID + u8 led2_state; + #define PORT_LED_QCFG_RESP_LED2_STATE_DEFAULT 0x0UL + #define PORT_LED_QCFG_RESP_LED2_STATE_OFF 0x1UL + #define PORT_LED_QCFG_RESP_LED2_STATE_ON 0x2UL + #define PORT_LED_QCFG_RESP_LED2_STATE_BLINK 0x3UL + #define PORT_LED_QCFG_RESP_LED2_STATE_BLINKALT 0x4UL + #define PORT_LED_QCFG_RESP_LED2_STATE_LAST PORT_LED_QCFG_RESP_LED2_STATE_BLINKALT + u8 led2_color; + #define PORT_LED_QCFG_RESP_LED2_COLOR_DEFAULT 0x0UL + #define PORT_LED_QCFG_RESP_LED2_COLOR_AMBER 0x1UL + #define PORT_LED_QCFG_RESP_LED2_COLOR_GREEN 0x2UL + #define PORT_LED_QCFG_RESP_LED2_COLOR_GREENAMBER 0x3UL + #define PORT_LED_QCFG_RESP_LED2_COLOR_LAST PORT_LED_QCFG_RESP_LED2_COLOR_GREENAMBER + u8 unused_2; + __le16 led2_blink_on; + __le16 led2_blink_off; + u8 led2_group_id; + u8 led3_id; + u8 led3_type; + #define PORT_LED_QCFG_RESP_LED3_TYPE_SPEED 0x0UL + #define PORT_LED_QCFG_RESP_LED3_TYPE_ACTIVITY 0x1UL + #define PORT_LED_QCFG_RESP_LED3_TYPE_INVALID 0xffUL + #define PORT_LED_QCFG_RESP_LED3_TYPE_LAST PORT_LED_QCFG_RESP_LED3_TYPE_INVALID + u8 led3_state; + #define PORT_LED_QCFG_RESP_LED3_STATE_DEFAULT 0x0UL + #define PORT_LED_QCFG_RESP_LED3_STATE_OFF 0x1UL + #define PORT_LED_QCFG_RESP_LED3_STATE_ON 0x2UL + #define PORT_LED_QCFG_RESP_LED3_STATE_BLINK 0x3UL + #define PORT_LED_QCFG_RESP_LED3_STATE_BLINKALT 0x4UL + #define PORT_LED_QCFG_RESP_LED3_STATE_LAST PORT_LED_QCFG_RESP_LED3_STATE_BLINKALT + u8 led3_color; + #define PORT_LED_QCFG_RESP_LED3_COLOR_DEFAULT 0x0UL + #define PORT_LED_QCFG_RESP_LED3_COLOR_AMBER 0x1UL + #define PORT_LED_QCFG_RESP_LED3_COLOR_GREEN 0x2UL + #define PORT_LED_QCFG_RESP_LED3_COLOR_GREENAMBER 0x3UL + #define PORT_LED_QCFG_RESP_LED3_COLOR_LAST PORT_LED_QCFG_RESP_LED3_COLOR_GREENAMBER + u8 unused_3; + __le16 led3_blink_on; + __le16 led3_blink_off; + u8 led3_group_id; + u8 unused_4[6]; + u8 valid; +}; + +/* hwrm_port_led_qcaps_input (size:192b/24B) */ +struct hwrm_port_led_qcaps_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; +}; + +/* hwrm_port_led_qcaps_output 
(size:384b/48B) */ +struct hwrm_port_led_qcaps_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 num_leds; + u8 unused[3]; + u8 led0_id; + u8 led0_type; + #define PORT_LED_QCAPS_RESP_LED0_TYPE_SPEED 0x0UL + #define PORT_LED_QCAPS_RESP_LED0_TYPE_ACTIVITY 0x1UL + #define PORT_LED_QCAPS_RESP_LED0_TYPE_INVALID 0xffUL + #define PORT_LED_QCAPS_RESP_LED0_TYPE_LAST PORT_LED_QCAPS_RESP_LED0_TYPE_INVALID + u8 led0_group_id; + u8 unused_0; + __le16 led0_state_caps; + #define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_ENABLED 0x1UL + #define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_OFF_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_ON_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_SUPPORTED 0x8UL + #define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL + __le16 led0_color_caps; + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + u8 led1_id; + u8 led1_type; + #define PORT_LED_QCAPS_RESP_LED1_TYPE_SPEED 0x0UL + #define PORT_LED_QCAPS_RESP_LED1_TYPE_ACTIVITY 0x1UL + #define PORT_LED_QCAPS_RESP_LED1_TYPE_INVALID 0xffUL + #define PORT_LED_QCAPS_RESP_LED1_TYPE_LAST PORT_LED_QCAPS_RESP_LED1_TYPE_INVALID + u8 led1_group_id; + u8 unused_1; + __le16 led1_state_caps; + #define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_ENABLED 0x1UL + #define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_OFF_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_ON_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_BLINK_SUPPORTED 0x8UL + #define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL + __le16 led1_color_caps; + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + u8 led2_id; + u8 led2_type; + #define PORT_LED_QCAPS_RESP_LED2_TYPE_SPEED 0x0UL + #define PORT_LED_QCAPS_RESP_LED2_TYPE_ACTIVITY 0x1UL + #define PORT_LED_QCAPS_RESP_LED2_TYPE_INVALID 0xffUL + #define PORT_LED_QCAPS_RESP_LED2_TYPE_LAST PORT_LED_QCAPS_RESP_LED2_TYPE_INVALID + u8 led2_group_id; + u8 unused_2; + __le16 led2_state_caps; + #define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_ENABLED 0x1UL + #define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_OFF_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_ON_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_BLINK_SUPPORTED 0x8UL + #define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL + __le16 led2_color_caps; + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + u8 led3_id; + u8 led3_type; + #define PORT_LED_QCAPS_RESP_LED3_TYPE_SPEED 0x0UL + #define PORT_LED_QCAPS_RESP_LED3_TYPE_ACTIVITY 0x1UL + #define PORT_LED_QCAPS_RESP_LED3_TYPE_INVALID 0xffUL + #define PORT_LED_QCAPS_RESP_LED3_TYPE_LAST PORT_LED_QCAPS_RESP_LED3_TYPE_INVALID + u8 led3_group_id; + u8 unused_3; + __le16 led3_state_caps; + #define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_ENABLED 0x1UL + #define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_OFF_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_ON_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_BLINK_SUPPORTED 0x8UL + #define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL + __le16 led3_color_caps; + #define 
PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + u8 unused_4[3]; + u8 valid; +}; + +/* hwrm_queue_qportcfg_input (size:192b/24B) */ +struct hwrm_queue_qportcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define QUEUE_QPORTCFG_REQ_FLAGS_PATH 0x1UL + #define QUEUE_QPORTCFG_REQ_FLAGS_PATH_TX 0x0UL + #define QUEUE_QPORTCFG_REQ_FLAGS_PATH_RX 0x1UL + #define QUEUE_QPORTCFG_REQ_FLAGS_PATH_LAST QUEUE_QPORTCFG_REQ_FLAGS_PATH_RX + __le16 port_id; + u8 drv_qmap_cap; + #define QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_DISABLED 0x0UL + #define QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_ENABLED 0x1UL + #define QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_LAST QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_ENABLED + u8 unused_0; +}; + +/* hwrm_queue_qportcfg_output (size:256b/32B) */ +struct hwrm_queue_qportcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 max_configurable_queues; + u8 max_configurable_lossless_queues; + u8 queue_cfg_allowed; + u8 queue_cfg_info; + #define QUEUE_QPORTCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG 0x1UL + u8 queue_pfcenable_cfg_allowed; + u8 queue_pri2cos_cfg_allowed; + u8 queue_cos2bw_cfg_allowed; + u8 queue_id0; + u8 queue_id0_service_profile; + #define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSY 0x0UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS_ROCE 0x1UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS_NIC 0x3UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_UNKNOWN 0xffUL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LAST QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_UNKNOWN + u8 queue_id1; + u8 queue_id1_service_profile; + #define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSY 0x0UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSLESS_ROCE 0x1UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSLESS_NIC 0x3UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_UNKNOWN 0xffUL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LAST QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_UNKNOWN + u8 queue_id2; + u8 queue_id2_service_profile; + #define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSY 0x0UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSLESS_ROCE 0x1UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSLESS_NIC 0x3UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_UNKNOWN 0xffUL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LAST QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_UNKNOWN + u8 queue_id3; + u8 queue_id3_service_profile; + #define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSY 0x0UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSLESS_ROCE 0x1UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSLESS_NIC 0x3UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_UNKNOWN 0xffUL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LAST QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_UNKNOWN + u8 queue_id4; + u8 queue_id4_service_profile; + #define 
QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSY 0x0UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSLESS_ROCE 0x1UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSLESS_NIC 0x3UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_UNKNOWN 0xffUL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LAST QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_UNKNOWN + u8 queue_id5; + u8 queue_id5_service_profile; + #define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSY 0x0UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSLESS_ROCE 0x1UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSLESS_NIC 0x3UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_UNKNOWN 0xffUL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LAST QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_UNKNOWN + u8 queue_id6; + u8 queue_id6_service_profile; + #define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSY 0x0UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSLESS_ROCE 0x1UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSLESS_NIC 0x3UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_UNKNOWN 0xffUL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LAST QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_UNKNOWN + u8 queue_id7; + u8 queue_id7_service_profile; + #define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSY 0x0UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSLESS_ROCE 0x1UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSLESS_NIC 0x3UL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_UNKNOWN 0xffUL + #define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LAST QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_UNKNOWN + u8 valid; +}; + +/* hwrm_queue_cfg_input (size:320b/40B) */ +struct hwrm_queue_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define QUEUE_CFG_REQ_FLAGS_PATH_MASK 0x3UL + #define QUEUE_CFG_REQ_FLAGS_PATH_SFT 0 + #define QUEUE_CFG_REQ_FLAGS_PATH_TX 0x0UL + #define QUEUE_CFG_REQ_FLAGS_PATH_RX 0x1UL + #define QUEUE_CFG_REQ_FLAGS_PATH_BIDIR 0x2UL + #define QUEUE_CFG_REQ_FLAGS_PATH_LAST QUEUE_CFG_REQ_FLAGS_PATH_BIDIR + __le32 enables; + #define QUEUE_CFG_REQ_ENABLES_DFLT_LEN 0x1UL + #define QUEUE_CFG_REQ_ENABLES_SERVICE_PROFILE 0x2UL + __le32 queue_id; + __le32 dflt_len; + u8 service_profile; + #define QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSY 0x0UL + #define QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSLESS 0x1UL + #define QUEUE_CFG_REQ_SERVICE_PROFILE_UNKNOWN 0xffUL + #define QUEUE_CFG_REQ_SERVICE_PROFILE_LAST QUEUE_CFG_REQ_SERVICE_PROFILE_UNKNOWN + u8 unused_0[7]; +}; + +/* hwrm_queue_cfg_output (size:128b/16B) */ +struct hwrm_queue_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_queue_pfcenable_qcfg_input (size:192b/24B) */ +struct hwrm_queue_pfcenable_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; +}; + +/* hwrm_queue_pfcenable_qcfg_output (size:128b/16B) */ +struct hwrm_queue_pfcenable_qcfg_output { + __le16 error_code; + 
__le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 flags; + #define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI0_PFC_ENABLED 0x1UL + #define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI1_PFC_ENABLED 0x2UL + #define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI2_PFC_ENABLED 0x4UL + #define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI3_PFC_ENABLED 0x8UL + #define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI4_PFC_ENABLED 0x10UL + #define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI5_PFC_ENABLED 0x20UL + #define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI6_PFC_ENABLED 0x40UL + #define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI7_PFC_ENABLED 0x80UL + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_queue_pfcenable_cfg_input (size:192b/24B) */ +struct hwrm_queue_pfcenable_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI0_PFC_ENABLED 0x1UL + #define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI1_PFC_ENABLED 0x2UL + #define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI2_PFC_ENABLED 0x4UL + #define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI3_PFC_ENABLED 0x8UL + #define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI4_PFC_ENABLED 0x10UL + #define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI5_PFC_ENABLED 0x20UL + #define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI6_PFC_ENABLED 0x40UL + #define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI7_PFC_ENABLED 0x80UL + __le16 port_id; + u8 unused_0[2]; +}; + +/* hwrm_queue_pfcenable_cfg_output (size:128b/16B) */ +struct hwrm_queue_pfcenable_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_queue_pri2cos_qcfg_input (size:192b/24B) */ +struct hwrm_queue_pri2cos_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH 0x1UL + #define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_TX 0x0UL + #define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_RX 0x1UL + #define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_LAST QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_RX + #define QUEUE_PRI2COS_QCFG_REQ_FLAGS_IVLAN 0x2UL + u8 port_id; + u8 unused_0[3]; +}; + +/* hwrm_queue_pri2cos_qcfg_output (size:192b/24B) */ +struct hwrm_queue_pri2cos_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 pri0_cos_queue_id; + u8 pri1_cos_queue_id; + u8 pri2_cos_queue_id; + u8 pri3_cos_queue_id; + u8 pri4_cos_queue_id; + u8 pri5_cos_queue_id; + u8 pri6_cos_queue_id; + u8 pri7_cos_queue_id; + u8 queue_cfg_info; + #define QUEUE_PRI2COS_QCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG 0x1UL + u8 unused_0[6]; + u8 valid; +}; + +/* hwrm_queue_pri2cos_cfg_input (size:320b/40B) */ +struct hwrm_queue_pri2cos_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_MASK 0x3UL + #define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_SFT 0 + #define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_TX 0x0UL + #define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_RX 0x1UL + #define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_BIDIR 0x2UL + #define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_LAST QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_BIDIR + #define QUEUE_PRI2COS_CFG_REQ_FLAGS_IVLAN 0x4UL + __le32 enables; + #define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI0_COS_QUEUE_ID 0x1UL + #define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI1_COS_QUEUE_ID 0x2UL + #define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI2_COS_QUEUE_ID 0x4UL + #define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI3_COS_QUEUE_ID 0x8UL + #define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI4_COS_QUEUE_ID 
0x10UL + #define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI5_COS_QUEUE_ID 0x20UL + #define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI6_COS_QUEUE_ID 0x40UL + #define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI7_COS_QUEUE_ID 0x80UL + u8 port_id; + u8 pri0_cos_queue_id; + u8 pri1_cos_queue_id; + u8 pri2_cos_queue_id; + u8 pri3_cos_queue_id; + u8 pri4_cos_queue_id; + u8 pri5_cos_queue_id; + u8 pri6_cos_queue_id; + u8 pri7_cos_queue_id; + u8 unused_0[7]; +}; + +/* hwrm_queue_pri2cos_cfg_output (size:128b/16B) */ +struct hwrm_queue_pri2cos_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_queue_cos2bw_qcfg_input (size:192b/24B) */ +struct hwrm_queue_cos2bw_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; +}; + +/* hwrm_queue_cos2bw_qcfg_output (size:896b/112B) */ +struct hwrm_queue_cos2bw_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 queue_id0; + u8 unused_0; + __le16 unused_1; + __le32 queue_id0_min_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id0_max_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define 
QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id0_tsa_assign; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id0_pri_lvl; + u8 queue_id0_bw_weight; + u8 queue_id1; + __le32 queue_id1_min_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id1_max_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id1_tsa_assign; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID1_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id1_pri_lvl; + u8 queue_id1_bw_weight; + u8 queue_id2; + __le32 queue_id2_min_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_MASK 0xfffffffUL + 
#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id2_max_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id2_tsa_assign; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID2_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id2_pri_lvl; + u8 queue_id2_bw_weight; + u8 queue_id3; + __le32 queue_id3_min_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define 
QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id3_max_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id3_tsa_assign; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID3_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id3_pri_lvl; + u8 queue_id3_bw_weight; + u8 queue_id4; + __le32 queue_id4_min_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_LAST 
QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id4_max_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id4_tsa_assign; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID4_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id4_pri_lvl; + u8 queue_id4_bw_weight; + u8 queue_id5; + __le32 queue_id5_min_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id5_max_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_SCALE_BYTES + #define 
QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id5_tsa_assign; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID5_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id5_pri_lvl; + u8 queue_id5_bw_weight; + u8 queue_id6; + __le32 queue_id6_min_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id6_max_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define 
QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id6_tsa_assign; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID6_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id6_pri_lvl; + u8 queue_id6_bw_weight; + u8 queue_id7; + __le32 queue_id7_min_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id7_max_bw; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_SCALE_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id7_tsa_assign; + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID7_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id7_pri_lvl; + u8 queue_id7_bw_weight; + u8 unused_2[4]; + u8 valid; +}; + 
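[Editor's note, not part of the patch] The repeated *_MIN_BW/*_MAX_BW words in these COS2BW structures all appear to share one layout: the bandwidth value in bits 0-27, a SCALE selector (bits vs. bytes) in bit 28, and a unit selector in bits 29-31. As an illustration only, a caller might compose such a word as sketched below, using the QUEUE_ID0 macros from the hwrm_queue_cos2bw_cfg_input structure that follows; the helper name and the choice of byte scale and 1/100th-percent units are hypothetical, and the bit semantics are inferred from the macro names rather than stated by the patch.

/*
 * Illustrative sketch only (not part of this patch): build a COS2BW
 * minimum-bandwidth word.  Bits 0-27 carry the bandwidth value, bit 28
 * selects the scale, bits 29-31 select the unit.  The helper name and
 * the chosen scale/unit are hypothetical.
 */
static u32 bnxt_example_cos2bw_min_bw(u32 bw_value)
{
	u32 val;

	/* Mask the value into bits 0-27. */
	val  = bw_value & QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_MASK;
	/* Example scale and unit selection. */
	val |= QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_BYTES;
	val |= QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_PERCENT1_100;

	return val;	/* caller would store cpu_to_le32(val) in queue_id0_min_bw */
}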
+/* hwrm_queue_cos2bw_cfg_input (size:1024b/128B) */ +struct hwrm_queue_cos2bw_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + __le32 enables; + #define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID0_VALID 0x1UL + #define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID1_VALID 0x2UL + #define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID2_VALID 0x4UL + #define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID3_VALID 0x8UL + #define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID4_VALID 0x10UL + #define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID5_VALID 0x20UL + #define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID6_VALID 0x40UL + #define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID7_VALID 0x80UL + __le16 port_id; + u8 queue_id0; + u8 unused_0; + __le32 queue_id0_min_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id0_max_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id0_tsa_assign; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define 
QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id0_pri_lvl; + u8 queue_id0_bw_weight; + u8 queue_id1; + __le32 queue_id1_min_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id1_max_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id1_tsa_assign; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID1_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id1_pri_lvl; + u8 queue_id1_bw_weight; + u8 queue_id2; + __le32 queue_id2_min_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_SCALE_BYTES + #define 
QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id2_max_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id2_tsa_assign; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID2_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id2_pri_lvl; + u8 queue_id2_bw_weight; + u8 queue_id3; + __le32 queue_id3_min_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define 
QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id3_max_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id3_tsa_assign; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID3_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id3_pri_lvl; + u8 queue_id3_bw_weight; + u8 queue_id4; + __le32 queue_id4_min_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id4_max_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_MASK 
0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id4_tsa_assign; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID4_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id4_pri_lvl; + u8 queue_id4_bw_weight; + u8 queue_id5; + __le32 queue_id5_min_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id5_max_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_LAST 
QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id5_tsa_assign; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID5_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id5_pri_lvl; + u8 queue_id5_bw_weight; + u8 queue_id6; + __le32 queue_id6_min_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id6_max_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id6_tsa_assign; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID6_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id6_pri_lvl; + u8 queue_id6_bw_weight; + u8 queue_id7; + __le32 queue_id7_min_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_SCALE 0x10000000UL + #define 
QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MIN_BW_BW_VALUE_UNIT_INVALID + __le32 queue_id7_max_bw; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_SFT 0 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_SCALE 0x10000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_SCALE_BITS (0x0UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_SCALE_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_SCALE_BYTES + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_LAST QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_MAX_BW_BW_VALUE_UNIT_INVALID + u8 queue_id7_tsa_assign; + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_TSA_ASSIGN_SP 0x0UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_TSA_ASSIGN_ETS 0x1UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_TSA_ASSIGN_RESERVED_FIRST 0x2UL + #define QUEUE_COS2BW_CFG_REQ_QUEUE_ID7_TSA_ASSIGN_RESERVED_LAST 0xffUL + u8 queue_id7_pri_lvl; + u8 queue_id7_bw_weight; + u8 unused_1[5]; +}; + +/* hwrm_queue_cos2bw_cfg_output (size:128b/16B) */ +struct hwrm_queue_cos2bw_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_queue_dscp_qcaps_input (size:192b/24B) */ +struct hwrm_queue_dscp_qcaps_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + u8 port_id; + u8 unused_0[7]; +}; + +/* hwrm_queue_dscp_qcaps_output (size:128b/16B) */ +struct hwrm_queue_dscp_qcaps_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 num_dscp_bits; + u8 unused_0; + __le16 max_entries; + u8 unused_1[3]; + u8 valid; +}; + +/* hwrm_queue_dscp2pri_qcfg_input (size:256b/32B) */ +struct hwrm_queue_dscp2pri_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 dest_data_addr; + 
u8 port_id; + u8 unused_0; + __le16 dest_data_buffer_size; + u8 unused_1[4]; +}; + +/* hwrm_queue_dscp2pri_qcfg_output (size:128b/16B) */ +struct hwrm_queue_dscp2pri_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 entry_cnt; + u8 default_pri; + u8 unused_0[4]; + u8 valid; +}; + +/* hwrm_queue_dscp2pri_cfg_input (size:320b/40B) */ +struct hwrm_queue_dscp2pri_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 src_data_addr; + __le32 flags; + #define QUEUE_DSCP2PRI_CFG_REQ_FLAGS_USE_HW_DEFAULT_PRI 0x1UL + __le32 enables; + #define QUEUE_DSCP2PRI_CFG_REQ_ENABLES_DEFAULT_PRI 0x1UL + u8 port_id; + u8 default_pri; + __le16 entry_cnt; + u8 unused_0[4]; +}; + +/* hwrm_queue_dscp2pri_cfg_output (size:128b/16B) */ +struct hwrm_queue_dscp2pri_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_vnic_alloc_input (size:192b/24B) */ +struct hwrm_vnic_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define VNIC_ALLOC_REQ_FLAGS_DEFAULT 0x1UL + u8 unused_0[4]; +}; + +/* hwrm_vnic_alloc_output (size:128b/16B) */ +struct hwrm_vnic_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 vnic_id; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_vnic_free_input (size:192b/24B) */ +struct hwrm_vnic_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 vnic_id; + u8 unused_0[4]; +}; + +/* hwrm_vnic_free_output (size:128b/16B) */ +struct hwrm_vnic_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_vnic_cfg_input (size:320b/40B) */ +struct hwrm_vnic_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define VNIC_CFG_REQ_FLAGS_DEFAULT 0x1UL + #define VNIC_CFG_REQ_FLAGS_VLAN_STRIP_MODE 0x2UL + #define VNIC_CFG_REQ_FLAGS_BD_STALL_MODE 0x4UL + #define VNIC_CFG_REQ_FLAGS_ROCE_DUAL_VNIC_MODE 0x8UL + #define VNIC_CFG_REQ_FLAGS_ROCE_ONLY_VNIC_MODE 0x10UL + #define VNIC_CFG_REQ_FLAGS_RSS_DFLT_CR_MODE 0x20UL + #define VNIC_CFG_REQ_FLAGS_ROCE_MIRRORING_CAPABLE_VNIC_MODE 0x40UL + __le32 enables; + #define VNIC_CFG_REQ_ENABLES_DFLT_RING_GRP 0x1UL + #define VNIC_CFG_REQ_ENABLES_RSS_RULE 0x2UL + #define VNIC_CFG_REQ_ENABLES_COS_RULE 0x4UL + #define VNIC_CFG_REQ_ENABLES_LB_RULE 0x8UL + #define VNIC_CFG_REQ_ENABLES_MRU 0x10UL + __le16 vnic_id; + __le16 dflt_ring_grp; + __le16 rss_rule; + __le16 cos_rule; + __le16 lb_rule; + __le16 mru; + u8 unused_0[4]; +}; + +/* hwrm_vnic_cfg_output (size:128b/16B) */ +struct hwrm_vnic_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_vnic_qcaps_input (size:192b/24B) */ +struct hwrm_vnic_qcaps_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + u8 unused_0[4]; +}; + +/* hwrm_vnic_qcaps_output (size:192b/24B) */ +struct hwrm_vnic_qcaps_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 mru; + u8 unused_0[2]; + __le32 flags; + #define VNIC_QCAPS_RESP_FLAGS_UNUSED 0x1UL + #define VNIC_QCAPS_RESP_FLAGS_VLAN_STRIP_CAP 0x2UL + #define VNIC_QCAPS_RESP_FLAGS_BD_STALL_CAP 0x4UL + 
#define VNIC_QCAPS_RESP_FLAGS_ROCE_DUAL_VNIC_CAP 0x8UL + #define VNIC_QCAPS_RESP_FLAGS_ROCE_ONLY_VNIC_CAP 0x10UL + #define VNIC_QCAPS_RESP_FLAGS_RSS_DFLT_CR_CAP 0x20UL + #define VNIC_QCAPS_RESP_FLAGS_ROCE_MIRRORING_CAPABLE_VNIC_CAP 0x40UL + u8 unused_1[7]; + u8 valid; +}; + +/* hwrm_vnic_tpa_cfg_input (size:320b/40B) */ +struct hwrm_vnic_tpa_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define VNIC_TPA_CFG_REQ_FLAGS_TPA 0x1UL + #define VNIC_TPA_CFG_REQ_FLAGS_ENCAP_TPA 0x2UL + #define VNIC_TPA_CFG_REQ_FLAGS_RSC_WND_UPDATE 0x4UL + #define VNIC_TPA_CFG_REQ_FLAGS_GRO 0x8UL + #define VNIC_TPA_CFG_REQ_FLAGS_AGG_WITH_ECN 0x10UL + #define VNIC_TPA_CFG_REQ_FLAGS_AGG_WITH_SAME_GRE_SEQ 0x20UL + #define VNIC_TPA_CFG_REQ_FLAGS_GRO_IPID_CHECK 0x40UL + #define VNIC_TPA_CFG_REQ_FLAGS_GRO_TTL_CHECK 0x80UL + __le32 enables; + #define VNIC_TPA_CFG_REQ_ENABLES_MAX_AGG_SEGS 0x1UL + #define VNIC_TPA_CFG_REQ_ENABLES_MAX_AGGS 0x2UL + #define VNIC_TPA_CFG_REQ_ENABLES_MAX_AGG_TIMER 0x4UL + #define VNIC_TPA_CFG_REQ_ENABLES_MIN_AGG_LEN 0x8UL + __le16 vnic_id; + __le16 max_agg_segs; + #define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_1 0x0UL + #define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_2 0x1UL + #define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_4 0x2UL + #define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_8 0x3UL + #define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_MAX 0x1fUL + #define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_LAST VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_MAX + __le16 max_aggs; + #define VNIC_TPA_CFG_REQ_MAX_AGGS_1 0x0UL + #define VNIC_TPA_CFG_REQ_MAX_AGGS_2 0x1UL + #define VNIC_TPA_CFG_REQ_MAX_AGGS_4 0x2UL + #define VNIC_TPA_CFG_REQ_MAX_AGGS_8 0x3UL + #define VNIC_TPA_CFG_REQ_MAX_AGGS_16 0x4UL + #define VNIC_TPA_CFG_REQ_MAX_AGGS_MAX 0x7UL + #define VNIC_TPA_CFG_REQ_MAX_AGGS_LAST VNIC_TPA_CFG_REQ_MAX_AGGS_MAX + u8 unused_0[2]; + __le32 max_agg_timer; + __le32 min_agg_len; +}; + +/* hwrm_vnic_tpa_cfg_output (size:128b/16B) */ +struct hwrm_vnic_tpa_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_vnic_tpa_qcfg_input (size:192b/24B) */ +struct hwrm_vnic_tpa_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 vnic_id; + u8 unused_0[6]; +}; + +/* hwrm_vnic_tpa_qcfg_output (size:256b/32B) */ +struct hwrm_vnic_tpa_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 flags; + #define VNIC_TPA_QCFG_RESP_FLAGS_TPA 0x1UL + #define VNIC_TPA_QCFG_RESP_FLAGS_ENCAP_TPA 0x2UL + #define VNIC_TPA_QCFG_RESP_FLAGS_RSC_WND_UPDATE 0x4UL + #define VNIC_TPA_QCFG_RESP_FLAGS_GRO 0x8UL + #define VNIC_TPA_QCFG_RESP_FLAGS_AGG_WITH_ECN 0x10UL + #define VNIC_TPA_QCFG_RESP_FLAGS_AGG_WITH_SAME_GRE_SEQ 0x20UL + #define VNIC_TPA_QCFG_RESP_FLAGS_GRO_IPID_CHECK 0x40UL + #define VNIC_TPA_QCFG_RESP_FLAGS_GRO_TTL_CHECK 0x80UL + __le16 max_agg_segs; + #define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_1 0x0UL + #define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_2 0x1UL + #define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_4 0x2UL + #define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_8 0x3UL + #define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_MAX 0x1fUL + #define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_LAST VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_MAX + __le16 max_aggs; + #define VNIC_TPA_QCFG_RESP_MAX_AGGS_1 0x0UL + #define VNIC_TPA_QCFG_RESP_MAX_AGGS_2 0x1UL + #define VNIC_TPA_QCFG_RESP_MAX_AGGS_4 0x2UL + #define VNIC_TPA_QCFG_RESP_MAX_AGGS_8 0x3UL + #define VNIC_TPA_QCFG_RESP_MAX_AGGS_16 0x4UL + #define 
VNIC_TPA_QCFG_RESP_MAX_AGGS_MAX 0x7UL + #define VNIC_TPA_QCFG_RESP_MAX_AGGS_LAST VNIC_TPA_QCFG_RESP_MAX_AGGS_MAX + __le32 max_agg_timer; + __le32 min_agg_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_vnic_rss_cfg_input (size:384b/48B) */ +struct hwrm_vnic_rss_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 hash_type; + #define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4 0x1UL + #define VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV4 0x2UL + #define VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV4 0x4UL + #define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6 0x8UL + #define VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV6 0x10UL + #define VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6 0x20UL + u8 unused_0[4]; + __le64 ring_grp_tbl_addr; + __le64 hash_key_tbl_addr; + __le16 rss_ctx_idx; + u8 unused_1[6]; +}; + +/* hwrm_vnic_rss_cfg_output (size:128b/16B) */ +struct hwrm_vnic_rss_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_vnic_plcmodes_cfg_input (size:320b/40B) */ +struct hwrm_vnic_plcmodes_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define VNIC_PLCMODES_CFG_REQ_FLAGS_REGULAR_PLACEMENT 0x1UL + #define VNIC_PLCMODES_CFG_REQ_FLAGS_JUMBO_PLACEMENT 0x2UL + #define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV4 0x4UL + #define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV6 0x8UL + #define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_FCOE 0x10UL + #define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_ROCE 0x20UL + __le32 enables; + #define VNIC_PLCMODES_CFG_REQ_ENABLES_JUMBO_THRESH_VALID 0x1UL + #define VNIC_PLCMODES_CFG_REQ_ENABLES_HDS_OFFSET_VALID 0x2UL + #define VNIC_PLCMODES_CFG_REQ_ENABLES_HDS_THRESHOLD_VALID 0x4UL + __le32 vnic_id; + __le16 jumbo_thresh; + __le16 hds_offset; + __le16 hds_threshold; + u8 unused_0[6]; +}; + +/* hwrm_vnic_plcmodes_cfg_output (size:128b/16B) */ +struct hwrm_vnic_plcmodes_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_vnic_rss_cos_lb_ctx_alloc_input (size:128b/16B) */ +struct hwrm_vnic_rss_cos_lb_ctx_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_vnic_rss_cos_lb_ctx_alloc_output (size:128b/16B) */ +struct hwrm_vnic_rss_cos_lb_ctx_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 rss_cos_lb_ctx_id; + u8 unused_0[5]; + u8 valid; +}; + +/* hwrm_vnic_rss_cos_lb_ctx_free_input (size:192b/24B) */ +struct hwrm_vnic_rss_cos_lb_ctx_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 rss_cos_lb_ctx_id; + u8 unused_0[6]; +}; + +/* hwrm_vnic_rss_cos_lb_ctx_free_output (size:128b/16B) */ +struct hwrm_vnic_rss_cos_lb_ctx_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_ring_alloc_input (size:640b/80B) */ +struct hwrm_ring_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define RING_ALLOC_REQ_ENABLES_RING_ARB_CFG 0x2UL + #define RING_ALLOC_REQ_ENABLES_STAT_CTX_ID_VALID 0x8UL + #define RING_ALLOC_REQ_ENABLES_MAX_BW_VALID 0x20UL + u8 ring_type; + #define RING_ALLOC_REQ_RING_TYPE_L2_CMPL 0x0UL + #define RING_ALLOC_REQ_RING_TYPE_TX 0x1UL + #define RING_ALLOC_REQ_RING_TYPE_RX 0x2UL + #define 
RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL 0x3UL + #define RING_ALLOC_REQ_RING_TYPE_LAST RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL + u8 unused_0[3]; + __le64 page_tbl_addr; + __le32 fbo; + u8 page_size; + u8 page_tbl_depth; + u8 unused_1[2]; + __le32 length; + __le16 logical_id; + __le16 cmpl_ring_id; + __le16 queue_id; + u8 unused_2[2]; + __le32 reserved1; + __le16 ring_arb_cfg; + #define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_MASK 0xfUL + #define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_SFT 0 + #define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_SP 0x1UL + #define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_WFQ 0x2UL + #define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_LAST RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_WFQ + #define RING_ALLOC_REQ_RING_ARB_CFG_RSVD_MASK 0xf0UL + #define RING_ALLOC_REQ_RING_ARB_CFG_RSVD_SFT 4 + #define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_PARAM_MASK 0xff00UL + #define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_PARAM_SFT 8 + __le16 unused_3; + __le32 reserved3; + __le32 stat_ctx_id; + __le32 reserved4; + __le32 max_bw; + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_MASK 0xfffffffUL + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_SFT 0 + #define RING_ALLOC_REQ_MAX_BW_SCALE 0x10000000UL + #define RING_ALLOC_REQ_MAX_BW_SCALE_BITS (0x0UL << 28) + #define RING_ALLOC_REQ_MAX_BW_SCALE_BYTES (0x1UL << 28) + #define RING_ALLOC_REQ_MAX_BW_SCALE_LAST RING_ALLOC_REQ_MAX_BW_SCALE_BYTES + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_MASK 0xe0000000UL + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_SFT 29 + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_MEGA (0x0UL << 29) + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_KILO (0x2UL << 29) + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_BASE (0x4UL << 29) + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_GIGA (0x6UL << 29) + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_INVALID (0x7UL << 29) + #define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_LAST RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_INVALID + u8 int_mode; + #define RING_ALLOC_REQ_INT_MODE_LEGACY 0x0UL + #define RING_ALLOC_REQ_INT_MODE_RSVD 0x1UL + #define RING_ALLOC_REQ_INT_MODE_MSIX 0x2UL + #define RING_ALLOC_REQ_INT_MODE_POLL 0x3UL + #define RING_ALLOC_REQ_INT_MODE_LAST RING_ALLOC_REQ_INT_MODE_POLL + u8 unused_4[3]; +}; + +/* hwrm_ring_alloc_output (size:128b/16B) */ +struct hwrm_ring_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 ring_id; + __le16 logical_ring_id; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_ring_free_input (size:192b/24B) */ +struct hwrm_ring_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + u8 ring_type; + #define RING_FREE_REQ_RING_TYPE_L2_CMPL 0x0UL + #define RING_FREE_REQ_RING_TYPE_TX 0x1UL + #define RING_FREE_REQ_RING_TYPE_RX 0x2UL + #define RING_FREE_REQ_RING_TYPE_ROCE_CMPL 0x3UL + #define RING_FREE_REQ_RING_TYPE_LAST RING_FREE_REQ_RING_TYPE_ROCE_CMPL + u8 unused_0; + __le16 ring_id; + u8 unused_1[4]; +}; + +/* hwrm_ring_free_output (size:128b/16B) */ +struct hwrm_ring_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_ring_cmpl_ring_qaggint_params_input (size:192b/24B) */ +struct hwrm_ring_cmpl_ring_qaggint_params_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 ring_id; + u8 unused_0[6]; +}; + +/* hwrm_ring_cmpl_ring_qaggint_params_output (size:256b/32B) */ +struct 
hwrm_ring_cmpl_ring_qaggint_params_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 flags; + #define RING_CMPL_RING_QAGGINT_PARAMS_RESP_FLAGS_TIMER_RESET 0x1UL + #define RING_CMPL_RING_QAGGINT_PARAMS_RESP_FLAGS_RING_IDLE 0x2UL + __le16 num_cmpl_dma_aggr; + __le16 num_cmpl_dma_aggr_during_int; + __le16 cmpl_aggr_dma_tmr; + __le16 cmpl_aggr_dma_tmr_during_int; + __le16 int_lat_tmr_min; + __le16 int_lat_tmr_max; + __le16 num_cmpl_aggr_int; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_ring_cmpl_ring_cfg_aggint_params_input (size:320b/40B) */ +struct hwrm_ring_cmpl_ring_cfg_aggint_params_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 ring_id; + __le16 flags; + #define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_FLAGS_TIMER_RESET 0x1UL + #define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_FLAGS_RING_IDLE 0x2UL + __le16 num_cmpl_dma_aggr; + __le16 num_cmpl_dma_aggr_during_int; + __le16 cmpl_aggr_dma_tmr; + __le16 cmpl_aggr_dma_tmr_during_int; + __le16 int_lat_tmr_min; + __le16 int_lat_tmr_max; + __le16 num_cmpl_aggr_int; + u8 unused_0[6]; +}; + +/* hwrm_ring_cmpl_ring_cfg_aggint_params_output (size:128b/16B) */ +struct hwrm_ring_cmpl_ring_cfg_aggint_params_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_ring_reset_input (size:192b/24B) */ +struct hwrm_ring_reset_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + u8 ring_type; + #define RING_RESET_REQ_RING_TYPE_L2_CMPL 0x0UL + #define RING_RESET_REQ_RING_TYPE_TX 0x1UL + #define RING_RESET_REQ_RING_TYPE_RX 0x2UL + #define RING_RESET_REQ_RING_TYPE_ROCE_CMPL 0x3UL + #define RING_RESET_REQ_RING_TYPE_LAST RING_RESET_REQ_RING_TYPE_ROCE_CMPL + u8 unused_0; + __le16 ring_id; + u8 unused_1[4]; +}; + +/* hwrm_ring_reset_output (size:128b/16B) */ +struct hwrm_ring_reset_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_ring_grp_alloc_input (size:192b/24B) */ +struct hwrm_ring_grp_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 cr; + __le16 rr; + __le16 ar; + __le16 sc; +}; + +/* hwrm_ring_grp_alloc_output (size:128b/16B) */ +struct hwrm_ring_grp_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 ring_group_id; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_ring_grp_free_input (size:192b/24B) */ +struct hwrm_ring_grp_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 ring_group_id; + u8 unused_0[4]; +}; + +/* hwrm_ring_grp_free_output (size:128b/16B) */ +struct hwrm_ring_grp_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_cfa_l2_filter_alloc_input (size:768b/96B) */ +struct hwrm_cfa_l2_filter_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH 0x1UL + #define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_TX 0x0UL + #define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_RX 0x1UL + #define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_LAST CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_RX + #define CFA_L2_FILTER_ALLOC_REQ_FLAGS_LOOPBACK 0x2UL + #define CFA_L2_FILTER_ALLOC_REQ_FLAGS_DROP 0x4UL + #define 
CFA_L2_FILTER_ALLOC_REQ_FLAGS_OUTERMOST 0x8UL + __le32 enables; + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_ADDR 0x1UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_ADDR_MASK 0x2UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_OVLAN 0x4UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_OVLAN_MASK 0x8UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_IVLAN 0x10UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_IVLAN_MASK 0x20UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_ADDR 0x40UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_ADDR_MASK 0x80UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_OVLAN 0x100UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_OVLAN_MASK 0x200UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_IVLAN 0x400UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_IVLAN_MASK 0x800UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_SRC_TYPE 0x1000UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_SRC_ID 0x2000UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE 0x4000UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_DST_ID 0x8000UL + #define CFA_L2_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID 0x10000UL + u8 l2_addr[6]; + u8 unused_0[2]; + u8 l2_addr_mask[6]; + __le16 l2_ovlan; + __le16 l2_ovlan_mask; + __le16 l2_ivlan; + __le16 l2_ivlan_mask; + u8 unused_1[2]; + u8 t_l2_addr[6]; + u8 unused_2[2]; + u8 t_l2_addr_mask[6]; + __le16 t_l2_ovlan; + __le16 t_l2_ovlan_mask; + __le16 t_l2_ivlan; + __le16 t_l2_ivlan_mask; + u8 src_type; + #define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_NPORT 0x0UL + #define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_PF 0x1UL + #define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_VF 0x2UL + #define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_VNIC 0x3UL + #define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_KONG 0x4UL + #define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_APE 0x5UL + #define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_BONO 0x6UL + #define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_TANG 0x7UL + #define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_LAST CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_TANG + u8 unused_3; + __le32 src_id; + u8 tunnel_type; + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL 0x0UL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE 0x2UL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE 0x3UL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP 0x4UL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS 0x6UL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT 0x7UL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE 0x8UL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL 0xffUL + #define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL + u8 unused_4; + __le16 dst_id; + __le16 mirror_vnic_id; + u8 pri_hint; + #define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_NO_PREFER 0x0UL + #define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_ABOVE_FILTER 0x1UL + #define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_BELOW_FILTER 0x2UL + #define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_MAX 0x3UL + #define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_MIN 0x4UL + #define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_LAST CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_MIN + u8 unused_5; + __le32 unused_6; + __le64 l2_filter_id_hint; +}; + +/* hwrm_cfa_l2_filter_alloc_output (size:192b/24B) */ +struct hwrm_cfa_l2_filter_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le64 l2_filter_id; + __le32 flow_id; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_cfa_l2_filter_free_input 
(size:192b/24B) */ +struct hwrm_cfa_l2_filter_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 l2_filter_id; +}; + +/* hwrm_cfa_l2_filter_free_output (size:128b/16B) */ +struct hwrm_cfa_l2_filter_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_cfa_l2_filter_cfg_input (size:320b/40B) */ +struct hwrm_cfa_l2_filter_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH 0x1UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_TX 0x0UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX 0x1UL + #define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_LAST CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX + #define CFA_L2_FILTER_CFG_REQ_FLAGS_DROP 0x2UL + __le32 enables; + #define CFA_L2_FILTER_CFG_REQ_ENABLES_DST_ID 0x1UL + #define CFA_L2_FILTER_CFG_REQ_ENABLES_NEW_MIRROR_VNIC_ID 0x2UL + __le64 l2_filter_id; + __le32 dst_id; + __le32 new_mirror_vnic_id; +}; + +/* hwrm_cfa_l2_filter_cfg_output (size:128b/16B) */ +struct hwrm_cfa_l2_filter_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_cfa_l2_set_rx_mask_input (size:448b/56B) */ +struct hwrm_cfa_l2_set_rx_mask_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 vnic_id; + __le32 mask; + #define CFA_L2_SET_RX_MASK_REQ_MASK_MCAST 0x2UL + #define CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST 0x4UL + #define CFA_L2_SET_RX_MASK_REQ_MASK_BCAST 0x8UL + #define CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS 0x10UL + #define CFA_L2_SET_RX_MASK_REQ_MASK_OUTERMOST 0x20UL + #define CFA_L2_SET_RX_MASK_REQ_MASK_VLANONLY 0x40UL + #define CFA_L2_SET_RX_MASK_REQ_MASK_VLAN_NONVLAN 0x80UL + #define CFA_L2_SET_RX_MASK_REQ_MASK_ANYVLAN_NONVLAN 0x100UL + __le64 mc_tbl_addr; + __le32 num_mc_entries; + u8 unused_0[4]; + __le64 vlan_tag_tbl_addr; + __le32 num_vlan_tags; + u8 unused_1[4]; +}; + +/* hwrm_cfa_l2_set_rx_mask_output (size:128b/16B) */ +struct hwrm_cfa_l2_set_rx_mask_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_cfa_l2_set_rx_mask_cmd_err (size:64b/8B) */ +struct hwrm_cfa_l2_set_rx_mask_cmd_err { + u8 code; + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_UNKNOWN 0x0UL + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_NTUPLE_FILTER_CONFLICT_ERR 0x1UL + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_LAST CFA_L2_SET_RX_MASK_CMD_ERR_CODE_NTUPLE_FILTER_CONFLICT_ERR + u8 unused_0[7]; +}; + +/* hwrm_cfa_tunnel_filter_alloc_input (size:704b/88B) */ +struct hwrm_cfa_tunnel_filter_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define CFA_TUNNEL_FILTER_ALLOC_REQ_FLAGS_LOOPBACK 0x1UL + __le32 enables; + #define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L2_FILTER_ID 0x1UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L2_ADDR 0x2UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L2_IVLAN 0x4UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L3_ADDR 0x8UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L3_ADDR_TYPE 0x10UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_T_L3_ADDR_TYPE 0x20UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_T_L3_ADDR 0x40UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE 0x80UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_VNI 0x100UL + #define 
CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_DST_VNIC_ID 0x200UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID 0x400UL + __le64 l2_filter_id; + u8 l2_addr[6]; + __le16 l2_ivlan; + __le32 l3_addr[4]; + __le32 t_l3_addr[4]; + u8 l3_addr_type; + u8 t_l3_addr_type; + u8 tunnel_type; + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL 0x0UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE 0x2UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE 0x3UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP 0x4UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS 0x6UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT 0x7UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE 0x8UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL 0xffUL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL + u8 tunnel_flags; + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_FLAGS_TUN_FLAGS_OAM_CHECKSUM_EXPLHDR 0x1UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_FLAGS_TUN_FLAGS_CRITICAL_OPT_S1 0x2UL + #define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_FLAGS_TUN_FLAGS_EXTHDR_SEQNUM_S0 0x4UL + __le32 vni; + __le32 dst_vnic_id; + __le32 mirror_vnic_id; +}; + +/* hwrm_cfa_tunnel_filter_alloc_output (size:192b/24B) */ +struct hwrm_cfa_tunnel_filter_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le64 tunnel_filter_id; + __le32 flow_id; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_cfa_tunnel_filter_free_input (size:192b/24B) */ +struct hwrm_cfa_tunnel_filter_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 tunnel_filter_id; +}; + +/* hwrm_cfa_tunnel_filter_free_output (size:128b/16B) */ +struct hwrm_cfa_tunnel_filter_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_vxlan_ipv4_hdr (size:128b/16B) */ +struct hwrm_vxlan_ipv4_hdr { + u8 ver_hlen; + #define VXLAN_IPV4_HDR_VER_HLEN_HEADER_LENGTH_MASK 0xfUL + #define VXLAN_IPV4_HDR_VER_HLEN_HEADER_LENGTH_SFT 0 + #define VXLAN_IPV4_HDR_VER_HLEN_VERSION_MASK 0xf0UL + #define VXLAN_IPV4_HDR_VER_HLEN_VERSION_SFT 4 + u8 tos; + __be16 ip_id; + __be16 flags_frag_offset; + u8 ttl; + u8 protocol; + __be32 src_ip_addr; + __be32 dest_ip_addr; +}; + +/* hwrm_vxlan_ipv6_hdr (size:320b/40B) */ +struct hwrm_vxlan_ipv6_hdr { + __be32 ver_tc_flow_label; + #define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_VER_SFT 0x1cUL + #define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_VER_MASK 0xf0000000UL + #define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_TC_SFT 0x14UL + #define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_TC_MASK 0xff00000UL + #define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_FLOW_LABEL_SFT 0x0UL + #define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_FLOW_LABEL_MASK 0xfffffUL + #define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_LAST VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_FLOW_LABEL_MASK + __be16 payload_len; + u8 next_hdr; + u8 ttl; + __be32 src_ip_addr[4]; + __be32 dest_ip_addr[4]; +}; + +/* hwrm_cfa_encap_data_vxlan (size:576b/72B) */ +struct hwrm_cfa_encap_data_vxlan { + u8 src_mac_addr[6]; + __le16 unused_0; + u8 dst_mac_addr[6]; + u8 num_vlan_tags; + u8 unused_1; + __be16 ovlan_tpid; + __be16 ovlan_tci; + __be16 ivlan_tpid; + __be16 ivlan_tci; + __le32 l3[10]; + #define 
CFA_ENCAP_DATA_VXLAN_L3_VER_MASK 0xfUL + #define CFA_ENCAP_DATA_VXLAN_L3_VER_IPV4 0x4UL + #define CFA_ENCAP_DATA_VXLAN_L3_VER_IPV6 0x6UL + #define CFA_ENCAP_DATA_VXLAN_L3_LAST CFA_ENCAP_DATA_VXLAN_L3_VER_IPV6 + __be16 src_port; + __be16 dst_port; + __be32 vni; +}; + +/* hwrm_cfa_encap_record_alloc_input (size:832b/104B) */ +struct hwrm_cfa_encap_record_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define CFA_ENCAP_RECORD_ALLOC_REQ_FLAGS_LOOPBACK 0x1UL + u8 encap_type; + #define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN 0x1UL + #define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_NVGRE 0x2UL + #define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_L2GRE 0x3UL + #define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_IPIP 0x4UL + #define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_GENEVE 0x5UL + #define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_MPLS 0x6UL + #define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VLAN 0x7UL + #define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_IPGRE 0x8UL + #define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_LAST CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_IPGRE + u8 unused_0[3]; + __le32 encap_data[20]; +}; + +/* hwrm_cfa_encap_record_alloc_output (size:128b/16B) */ +struct hwrm_cfa_encap_record_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 encap_record_id; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_cfa_encap_record_free_input (size:192b/24B) */ +struct hwrm_cfa_encap_record_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 encap_record_id; + u8 unused_0[4]; +}; + +/* hwrm_cfa_encap_record_free_output (size:128b/16B) */ +struct hwrm_cfa_encap_record_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_cfa_ntuple_filter_alloc_input (size:1024b/128B) */ +struct hwrm_cfa_ntuple_filter_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_LOOPBACK 0x1UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_DROP 0x2UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_METER 0x4UL + __le32 enables; + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_L2_FILTER_ID 0x1UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_ETHERTYPE 0x2UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE 0x4UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_MACADDR 0x8UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_IPADDR_TYPE 0x10UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR 0x20UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR_MASK 0x40UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR 0x80UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR_MASK 0x100UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_IP_PROTOCOL 0x200UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_PORT 0x400UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_PORT_MASK 0x800UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_PORT 0x1000UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_PORT_MASK 0x2000UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_PRI_HINT 0x4000UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_NTUPLE_FILTER_ID 0x8000UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_ID 0x10000UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID 0x20000UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_MACADDR 0x40000UL + __le64 
l2_filter_id; + u8 src_macaddr[6]; + __be16 ethertype; + u8 ip_addr_type; + #define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_UNKNOWN 0x0UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4 0x4UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6 0x6UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_LAST CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6 + u8 ip_protocol; + #define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_UNKNOWN 0x0UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_TCP 0x6UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP 0x11UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_LAST CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP + __le16 dst_id; + __le16 mirror_vnic_id; + u8 tunnel_type; + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL 0x0UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE 0x2UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE 0x3UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP 0x4UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS 0x6UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT 0x7UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE 0x8UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL 0xffUL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL + u8 pri_hint; + #define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_NO_PREFER 0x0UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_ABOVE 0x1UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_BELOW 0x2UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_HIGHEST 0x3UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_LOWEST 0x4UL + #define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_LAST CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_LOWEST + __be32 src_ipaddr[4]; + __be32 src_ipaddr_mask[4]; + __be32 dst_ipaddr[4]; + __be32 dst_ipaddr_mask[4]; + __be16 src_port; + __be16 src_port_mask; + __be16 dst_port; + __be16 dst_port_mask; + __le64 ntuple_filter_id_hint; +}; + +/* hwrm_cfa_ntuple_filter_alloc_output (size:192b/24B) */ +struct hwrm_cfa_ntuple_filter_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le64 ntuple_filter_id; + __le32 flow_id; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_cfa_ntuple_filter_alloc_cmd_err (size:64b/8B) */ +struct hwrm_cfa_ntuple_filter_alloc_cmd_err { + u8 code; + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_UNKNOWN 0x0UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_RX_MASK_VLAN_CONFLICT_ERR 0x1UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_LAST CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_RX_MASK_VLAN_CONFLICT_ERR + u8 unused_0[7]; +}; + +/* hwrm_cfa_ntuple_filter_free_input (size:192b/24B) */ +struct hwrm_cfa_ntuple_filter_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 ntuple_filter_id; +}; + +/* hwrm_cfa_ntuple_filter_free_output (size:128b/16B) */ +struct hwrm_cfa_ntuple_filter_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_cfa_ntuple_filter_cfg_input (size:384b/48B) */ +struct hwrm_cfa_ntuple_filter_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define CFA_NTUPLE_FILTER_CFG_REQ_ENABLES_NEW_DST_ID 0x1UL + #define 
CFA_NTUPLE_FILTER_CFG_REQ_ENABLES_NEW_MIRROR_VNIC_ID 0x2UL + #define CFA_NTUPLE_FILTER_CFG_REQ_ENABLES_NEW_METER_INSTANCE_ID 0x4UL + u8 unused_0[4]; + __le64 ntuple_filter_id; + __le32 new_dst_id; + __le32 new_mirror_vnic_id; + __le16 new_meter_instance_id; + #define CFA_NTUPLE_FILTER_CFG_REQ_NEW_METER_INSTANCE_ID_INVALID 0xffffUL + #define CFA_NTUPLE_FILTER_CFG_REQ_NEW_METER_INSTANCE_ID_LAST CFA_NTUPLE_FILTER_CFG_REQ_NEW_METER_INSTANCE_ID_INVALID + u8 unused_1[6]; +}; + +/* hwrm_cfa_ntuple_filter_cfg_output (size:128b/16B) */ +struct hwrm_cfa_ntuple_filter_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_cfa_decap_filter_alloc_input (size:832b/104B) */ +struct hwrm_cfa_decap_filter_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define CFA_DECAP_FILTER_ALLOC_REQ_FLAGS_OVS_TUNNEL 0x1UL + __le32 enables; + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE 0x1UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_TUNNEL_ID 0x2UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_SRC_MACADDR 0x4UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_MACADDR 0x8UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_OVLAN_VID 0x10UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IVLAN_VID 0x20UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_T_OVLAN_VID 0x40UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_T_IVLAN_VID 0x80UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_ETHERTYPE 0x100UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR 0x200UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR 0x400UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IPADDR_TYPE 0x800UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IP_PROTOCOL 0x1000UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_SRC_PORT 0x2000UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_PORT 0x4000UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_ID 0x8000UL + #define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID 0x10000UL + __be32 tunnel_id; + u8 tunnel_type; + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL 0x0UL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE 0x2UL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE 0x3UL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP 0x4UL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS 0x6UL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT 0x7UL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE 0x8UL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL 0xffUL + #define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL + u8 unused_0; + __le16 unused_1; + u8 src_macaddr[6]; + u8 unused_2[2]; + u8 dst_macaddr[6]; + __be16 ovlan_vid; + __be16 ivlan_vid; + __be16 t_ovlan_vid; + __be16 t_ivlan_vid; + __be16 ethertype; + u8 ip_addr_type; + #define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_UNKNOWN 0x0UL + #define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4 0x4UL + #define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6 0x6UL + #define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_LAST CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6 + u8 ip_protocol; + #define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_UNKNOWN 0x0UL + #define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_TCP 0x6UL + #define 
CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP 0x11UL + #define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_LAST CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP + __le16 unused_3; + __le32 unused_4; + __be32 src_ipaddr[4]; + __be32 dst_ipaddr[4]; + __be16 src_port; + __be16 dst_port; + __le16 dst_id; + __le16 l2_ctxt_ref_id; +}; + +/* hwrm_cfa_decap_filter_alloc_output (size:128b/16B) */ +struct hwrm_cfa_decap_filter_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 decap_filter_id; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_cfa_decap_filter_free_input (size:192b/24B) */ +struct hwrm_cfa_decap_filter_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 decap_filter_id; + u8 unused_0[4]; +}; + +/* hwrm_cfa_decap_filter_free_output (size:128b/16B) */ +struct hwrm_cfa_decap_filter_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_cfa_flow_alloc_input (size:1024b/128B) */ +struct hwrm_cfa_flow_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 flags; + #define CFA_FLOW_ALLOC_REQ_FLAGS_TUNNEL 0x1UL + #define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_MASK 0x6UL + #define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_SFT 1 + #define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_NONE (0x0UL << 1) + #define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_ONE (0x1UL << 1) + #define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_TWO (0x2UL << 1) + #define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_LAST CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_TWO + #define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_MASK 0x38UL + #define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_SFT 3 + #define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_L2 (0x0UL << 3) + #define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV4 (0x1UL << 3) + #define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV6 (0x2UL << 3) + #define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_LAST CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV6 + __le16 src_fid; + __le32 tunnel_handle; + __le16 action_flags; + #define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_FWD 0x1UL + #define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_RECYCLE 0x2UL + #define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_DROP 0x4UL + #define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_METER 0x8UL + #define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_TUNNEL 0x10UL + #define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NAT_SRC 0x20UL + #define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NAT_DEST 0x40UL + #define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NAT_IPV4_ADDRESS 0x80UL + #define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_L2_HEADER_REWRITE 0x100UL + #define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_TTL_DECREMENT 0x200UL + __le16 dst_fid; + __be16 l2_rewrite_vlan_tpid; + __be16 l2_rewrite_vlan_tci; + __le16 act_meter_id; + __le16 ref_flow_handle; + __be16 ethertype; + __be16 outer_vlan_tci; + __be16 dmac[3]; + __be16 inner_vlan_tci; + __be16 smac[3]; + u8 ip_dst_mask_len; + u8 ip_src_mask_len; + __be32 ip_dst[4]; + __be32 ip_src[4]; + __be16 l4_src_port; + __be16 l4_src_port_mask; + __be16 l4_dst_port; + __be16 l4_dst_port_mask; + __be32 nat_ip_address[4]; + __be16 l2_rewrite_dmac[3]; + __be16 nat_port; + __be16 l2_rewrite_smac[3]; + u8 ip_proto; + u8 unused_0; +}; + +/* hwrm_cfa_flow_alloc_output (size:128b/16B) */ +struct hwrm_cfa_flow_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 flow_handle; + u8 unused_0[5]; + u8 valid; +}; + +/* hwrm_cfa_flow_free_input (size:192b/24B) */ +struct hwrm_cfa_flow_free_input { + __le16 req_type; + 
__le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 flow_handle; + u8 unused_0[6]; +}; + +/* hwrm_cfa_flow_free_output (size:256b/32B) */ +struct hwrm_cfa_flow_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le64 packet; + __le64 byte; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_cfa_flow_stats_input (size:320b/40B) */ +struct hwrm_cfa_flow_stats_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 num_flows; + __le16 flow_handle_0; + __le16 flow_handle_1; + __le16 flow_handle_2; + __le16 flow_handle_3; + __le16 flow_handle_4; + __le16 flow_handle_5; + __le16 flow_handle_6; + __le16 flow_handle_7; + __le16 flow_handle_8; + __le16 flow_handle_9; + u8 unused_0[2]; +}; + +/* hwrm_cfa_flow_stats_output (size:1408b/176B) */ +struct hwrm_cfa_flow_stats_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le64 packet_0; + __le64 packet_1; + __le64 packet_2; + __le64 packet_3; + __le64 packet_4; + __le64 packet_5; + __le64 packet_6; + __le64 packet_7; + __le64 packet_8; + __le64 packet_9; + __le64 byte_0; + __le64 byte_1; + __le64 byte_2; + __le64 byte_3; + __le64 byte_4; + __le64 byte_5; + __le64 byte_6; + __le64 byte_7; + __le64 byte_8; + __le64 byte_9; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_cfa_vfr_alloc_input (size:448b/56B) */ +struct hwrm_cfa_vfr_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 vf_id; + __le16 reserved; + u8 unused_0[4]; + char vfr_name[32]; +}; + +/* hwrm_cfa_vfr_alloc_output (size:128b/16B) */ +struct hwrm_cfa_vfr_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 rx_cfa_code; + __le16 tx_cfa_action; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_cfa_vfr_free_input (size:384b/48B) */ +struct hwrm_cfa_vfr_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + char vfr_name[32]; +}; + +/* hwrm_cfa_vfr_free_output (size:128b/16B) */ +struct hwrm_cfa_vfr_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_tunnel_dst_port_query_input (size:192b/24B) */ +struct hwrm_tunnel_dst_port_query_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + u8 tunnel_type; + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_LAST TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_V4 + u8 unused_0[7]; +}; + +/* hwrm_tunnel_dst_port_query_output (size:128b/16B) */ +struct hwrm_tunnel_dst_port_query_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 tunnel_dst_port_id; + __be16 tunnel_dst_port_val; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_tunnel_dst_port_alloc_input (size:192b/24B) */ +struct hwrm_tunnel_dst_port_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + u8 tunnel_type; + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_LAST 
TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4 + u8 unused_0; + __be16 tunnel_dst_port_val; + u8 unused_1[4]; +}; + +/* hwrm_tunnel_dst_port_alloc_output (size:128b/16B) */ +struct hwrm_tunnel_dst_port_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 tunnel_dst_port_id; + u8 unused_0[5]; + u8 valid; +}; + +/* hwrm_tunnel_dst_port_free_input (size:192b/24B) */ +struct hwrm_tunnel_dst_port_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + u8 tunnel_type; + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN 0x1UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE 0x5UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_V4 0x9UL + #define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_LAST TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_V4 + u8 unused_0; + __le16 tunnel_dst_port_id; + u8 unused_1[4]; +}; + +/* hwrm_tunnel_dst_port_free_output (size:128b/16B) */ +struct hwrm_tunnel_dst_port_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_1[7]; + u8 valid; +}; + +/* ctx_hw_stats (size:1280b/160B) */ +struct ctx_hw_stats { + __le64 rx_ucast_pkts; + __le64 rx_mcast_pkts; + __le64 rx_bcast_pkts; + __le64 rx_discard_pkts; + __le64 rx_drop_pkts; + __le64 rx_ucast_bytes; + __le64 rx_mcast_bytes; + __le64 rx_bcast_bytes; + __le64 tx_ucast_pkts; + __le64 tx_mcast_pkts; + __le64 tx_bcast_pkts; + __le64 tx_discard_pkts; + __le64 tx_drop_pkts; + __le64 tx_ucast_bytes; + __le64 tx_mcast_bytes; + __le64 tx_bcast_bytes; + __le64 tpa_pkts; + __le64 tpa_bytes; + __le64 tpa_events; + __le64 tpa_aborts; +}; + +/* hwrm_stat_ctx_alloc_input (size:256b/32B) */ +struct hwrm_stat_ctx_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 stats_dma_addr; + __le32 update_period_ms; + u8 stat_ctx_flags; + #define STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE 0x1UL + u8 unused_0[3]; +}; + +/* hwrm_stat_ctx_alloc_output (size:128b/16B) */ +struct hwrm_stat_ctx_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 stat_ctx_id; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_stat_ctx_free_input (size:192b/24B) */ +struct hwrm_stat_ctx_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 stat_ctx_id; + u8 unused_0[4]; +}; + +/* hwrm_stat_ctx_free_output (size:128b/16B) */ +struct hwrm_stat_ctx_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 stat_ctx_id; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_stat_ctx_query_input (size:192b/24B) */ +struct hwrm_stat_ctx_query_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 stat_ctx_id; + u8 unused_0[4]; +}; + +/* hwrm_stat_ctx_query_output (size:1408b/176B) */ +struct hwrm_stat_ctx_query_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le64 tx_ucast_pkts; + __le64 tx_mcast_pkts; + __le64 tx_bcast_pkts; + __le64 tx_err_pkts; + __le64 tx_drop_pkts; + __le64 tx_ucast_bytes; + __le64 tx_mcast_bytes; + __le64 tx_bcast_bytes; + __le64 rx_ucast_pkts; + __le64 rx_mcast_pkts; + __le64 rx_bcast_pkts; + __le64 rx_err_pkts; + __le64 rx_drop_pkts; + __le64 rx_ucast_bytes; + __le64 rx_mcast_bytes; + __le64 rx_bcast_bytes; + __le64 rx_agg_pkts; + __le64 rx_agg_bytes; + __le64 rx_agg_events; + __le64 rx_agg_aborts; + u8 unused_0[7]; + 
u8 valid; +}; + +/* hwrm_stat_ctx_clr_stats_input (size:192b/24B) */ +struct hwrm_stat_ctx_clr_stats_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 stat_ctx_id; + u8 unused_0[4]; +}; + +/* hwrm_stat_ctx_clr_stats_output (size:128b/16B) */ +struct hwrm_stat_ctx_clr_stats_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_pcie_qstats_input (size:256b/32B) */ +struct hwrm_pcie_qstats_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 pcie_stat_size; + u8 unused_0[6]; + __le64 pcie_stat_host_addr; +}; + +/* hwrm_pcie_qstats_output (size:128b/16B) */ +struct hwrm_pcie_qstats_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 pcie_stat_size; + u8 unused_0[5]; + u8 valid; +}; + +/* tx_port_stats (size:3264b/408B) */ +struct tx_port_stats { + __le64 tx_64b_frames; + __le64 tx_65b_127b_frames; + __le64 tx_128b_255b_frames; + __le64 tx_256b_511b_frames; + __le64 tx_512b_1023b_frames; + __le64 tx_1024b_1518_frames; + __le64 tx_good_vlan_frames; + __le64 tx_1519b_2047_frames; + __le64 tx_2048b_4095b_frames; + __le64 tx_4096b_9216b_frames; + __le64 tx_9217b_16383b_frames; + __le64 tx_good_frames; + __le64 tx_total_frames; + __le64 tx_ucast_frames; + __le64 tx_mcast_frames; + __le64 tx_bcast_frames; + __le64 tx_pause_frames; + __le64 tx_pfc_frames; + __le64 tx_jabber_frames; + __le64 tx_fcs_err_frames; + __le64 tx_control_frames; + __le64 tx_oversz_frames; + __le64 tx_single_dfrl_frames; + __le64 tx_multi_dfrl_frames; + __le64 tx_single_coll_frames; + __le64 tx_multi_coll_frames; + __le64 tx_late_coll_frames; + __le64 tx_excessive_coll_frames; + __le64 tx_frag_frames; + __le64 tx_err; + __le64 tx_tagged_frames; + __le64 tx_dbl_tagged_frames; + __le64 tx_runt_frames; + __le64 tx_fifo_underruns; + __le64 tx_pfc_ena_frames_pri0; + __le64 tx_pfc_ena_frames_pri1; + __le64 tx_pfc_ena_frames_pri2; + __le64 tx_pfc_ena_frames_pri3; + __le64 tx_pfc_ena_frames_pri4; + __le64 tx_pfc_ena_frames_pri5; + __le64 tx_pfc_ena_frames_pri6; + __le64 tx_pfc_ena_frames_pri7; + __le64 tx_eee_lpi_events; + __le64 tx_eee_lpi_duration; + __le64 tx_llfc_logical_msgs; + __le64 tx_hcfc_msgs; + __le64 tx_total_collisions; + __le64 tx_bytes; + __le64 tx_xthol_frames; + __le64 tx_stat_discard; + __le64 tx_stat_error; +}; + +/* rx_port_stats (size:4224b/528B) */ +struct rx_port_stats { + __le64 rx_64b_frames; + __le64 rx_65b_127b_frames; + __le64 rx_128b_255b_frames; + __le64 rx_256b_511b_frames; + __le64 rx_512b_1023b_frames; + __le64 rx_1024b_1518_frames; + __le64 rx_good_vlan_frames; + __le64 rx_1519b_2047b_frames; + __le64 rx_2048b_4095b_frames; + __le64 rx_4096b_9216b_frames; + __le64 rx_9217b_16383b_frames; + __le64 rx_total_frames; + __le64 rx_ucast_frames; + __le64 rx_mcast_frames; + __le64 rx_bcast_frames; + __le64 rx_fcs_err_frames; + __le64 rx_ctrl_frames; + __le64 rx_pause_frames; + __le64 rx_pfc_frames; + __le64 rx_unsupported_opcode_frames; + __le64 rx_unsupported_da_pausepfc_frames; + __le64 rx_wrong_sa_frames; + __le64 rx_align_err_frames; + __le64 rx_oor_len_frames; + __le64 rx_code_err_frames; + __le64 rx_false_carrier_frames; + __le64 rx_ovrsz_frames; + __le64 rx_jbr_frames; + __le64 rx_mtu_err_frames; + __le64 rx_match_crc_frames; + __le64 rx_promiscuous_frames; + __le64 rx_tagged_frames; + __le64 rx_double_tagged_frames; + __le64 rx_trunc_frames; + __le64 rx_good_frames; 
+ __le64 rx_pfc_xon2xoff_frames_pri0; + __le64 rx_pfc_xon2xoff_frames_pri1; + __le64 rx_pfc_xon2xoff_frames_pri2; + __le64 rx_pfc_xon2xoff_frames_pri3; + __le64 rx_pfc_xon2xoff_frames_pri4; + __le64 rx_pfc_xon2xoff_frames_pri5; + __le64 rx_pfc_xon2xoff_frames_pri6; + __le64 rx_pfc_xon2xoff_frames_pri7; + __le64 rx_pfc_ena_frames_pri0; + __le64 rx_pfc_ena_frames_pri1; + __le64 rx_pfc_ena_frames_pri2; + __le64 rx_pfc_ena_frames_pri3; + __le64 rx_pfc_ena_frames_pri4; + __le64 rx_pfc_ena_frames_pri5; + __le64 rx_pfc_ena_frames_pri6; + __le64 rx_pfc_ena_frames_pri7; + __le64 rx_sch_crc_err_frames; + __le64 rx_undrsz_frames; + __le64 rx_frag_frames; + __le64 rx_eee_lpi_events; + __le64 rx_eee_lpi_duration; + __le64 rx_llfc_physical_msgs; + __le64 rx_llfc_logical_msgs; + __le64 rx_llfc_msgs_with_crc_err; + __le64 rx_hcfc_msgs; + __le64 rx_hcfc_msgs_with_crc_err; + __le64 rx_bytes; + __le64 rx_runt_bytes; + __le64 rx_runt_frames; + __le64 rx_stat_discard; + __le64 rx_stat_err; +}; + +/* rx_port_stats_ext (size:320b/40B) */ +struct rx_port_stats_ext { + __le64 link_down_events; + __le64 continuous_pause_events; + __le64 resume_pause_events; + __le64 continuous_roce_pause_events; + __le64 resume_roce_pause_events; +}; + +/* pcie_ctx_hw_stats (size:768b/96B) */ +struct pcie_ctx_hw_stats { + __le64 pcie_pl_signal_integrity; + __le64 pcie_dl_signal_integrity; + __le64 pcie_tl_signal_integrity; + __le64 pcie_link_integrity; + __le64 pcie_tx_traffic_rate; + __le64 pcie_rx_traffic_rate; + __le64 pcie_tx_dllp_statistics; + __le64 pcie_rx_dllp_statistics; + __le64 pcie_equalization_time; + __le32 pcie_ltssm_histogram[4]; + __le64 pcie_recovery_histogram; +}; + +/* hwrm_fw_reset_input (size:192b/24B) */ +struct hwrm_fw_reset_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + u8 embedded_proc_type; + #define FW_RESET_REQ_EMBEDDED_PROC_TYPE_BOOT 0x0UL + #define FW_RESET_REQ_EMBEDDED_PROC_TYPE_MGMT 0x1UL + #define FW_RESET_REQ_EMBEDDED_PROC_TYPE_NETCTRL 0x2UL + #define FW_RESET_REQ_EMBEDDED_PROC_TYPE_ROCE 0x3UL + #define FW_RESET_REQ_EMBEDDED_PROC_TYPE_HOST 0x4UL + #define FW_RESET_REQ_EMBEDDED_PROC_TYPE_AP 0x5UL + #define FW_RESET_REQ_EMBEDDED_PROC_TYPE_CHIP 0x6UL + #define FW_RESET_REQ_EMBEDDED_PROC_TYPE_HOST_RESOURCE_REINIT 0x7UL + #define FW_RESET_REQ_EMBEDDED_PROC_TYPE_LAST FW_RESET_REQ_EMBEDDED_PROC_TYPE_HOST_RESOURCE_REINIT + u8 selfrst_status; + #define FW_RESET_REQ_SELFRST_STATUS_SELFRSTNONE 0x0UL + #define FW_RESET_REQ_SELFRST_STATUS_SELFRSTASAP 0x1UL + #define FW_RESET_REQ_SELFRST_STATUS_SELFRSTPCIERST 0x2UL + #define FW_RESET_REQ_SELFRST_STATUS_LAST FW_RESET_REQ_SELFRST_STATUS_SELFRSTPCIERST + u8 host_idx; + u8 unused_0[5]; +}; + +/* hwrm_fw_reset_output (size:128b/16B) */ +struct hwrm_fw_reset_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 selfrst_status; + #define FW_RESET_RESP_SELFRST_STATUS_SELFRSTNONE 0x0UL + #define FW_RESET_RESP_SELFRST_STATUS_SELFRSTASAP 0x1UL + #define FW_RESET_RESP_SELFRST_STATUS_SELFRSTPCIERST 0x2UL + #define FW_RESET_RESP_SELFRST_STATUS_LAST FW_RESET_RESP_SELFRST_STATUS_SELFRSTPCIERST + u8 unused_0[6]; + u8 valid; +}; + +/* hwrm_fw_qstatus_input (size:192b/24B) */ +struct hwrm_fw_qstatus_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + u8 embedded_proc_type; + #define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_BOOT 0x0UL + #define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_MGMT 0x1UL + #define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_NETCTRL 
0x2UL + #define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_ROCE 0x3UL + #define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_HOST 0x4UL + #define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_AP 0x5UL + #define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_CHIP 0x6UL + #define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_LAST FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_CHIP + u8 unused_0[7]; +}; + +/* hwrm_fw_qstatus_output (size:128b/16B) */ +struct hwrm_fw_qstatus_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 selfrst_status; + #define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTNONE 0x0UL + #define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTASAP 0x1UL + #define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPCIERST 0x2UL + #define FW_QSTATUS_RESP_SELFRST_STATUS_LAST FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPCIERST + u8 unused_0[6]; + u8 valid; +}; + +/* hwrm_fw_set_time_input (size:256b/32B) */ +struct hwrm_fw_set_time_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 year; + #define FW_SET_TIME_REQ_YEAR_UNKNOWN 0x0UL + #define FW_SET_TIME_REQ_YEAR_LAST FW_SET_TIME_REQ_YEAR_UNKNOWN + u8 month; + u8 day; + u8 hour; + u8 minute; + u8 second; + u8 unused_0; + __le16 millisecond; + __le16 zone; + #define FW_SET_TIME_REQ_ZONE_UTC 0x0UL + #define FW_SET_TIME_REQ_ZONE_UNKNOWN 0xffffUL + #define FW_SET_TIME_REQ_ZONE_LAST FW_SET_TIME_REQ_ZONE_UNKNOWN + u8 unused_1[4]; +}; + +/* hwrm_fw_set_time_output (size:128b/16B) */ +struct hwrm_fw_set_time_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_struct_hdr (size:128b/16B) */ +struct hwrm_struct_hdr { + __le16 struct_id; + #define STRUCT_HDR_STRUCT_ID_LLDP_CFG 0x41bUL + #define STRUCT_HDR_STRUCT_ID_DCBX_ETS 0x41dUL + #define STRUCT_HDR_STRUCT_ID_DCBX_PFC 0x41fUL + #define STRUCT_HDR_STRUCT_ID_DCBX_APP 0x421UL + #define STRUCT_HDR_STRUCT_ID_DCBX_FEATURE_STATE 0x422UL + #define STRUCT_HDR_STRUCT_ID_LLDP_GENERIC 0x424UL + #define STRUCT_HDR_STRUCT_ID_LLDP_DEVICE 0x426UL + #define STRUCT_HDR_STRUCT_ID_AFM_OPAQUE 0x1UL + #define STRUCT_HDR_STRUCT_ID_PORT_DESCRIPTION 0xaUL + #define STRUCT_HDR_STRUCT_ID_RSS_V2 0x64UL + #define STRUCT_HDR_STRUCT_ID_LAST STRUCT_HDR_STRUCT_ID_RSS_V2 + __le16 len; + u8 version; + u8 count; + __le16 subtype; + __le16 next_offset; + #define STRUCT_HDR_NEXT_OFFSET_LAST 0x0UL + u8 unused_0[6]; +}; + +/* hwrm_struct_data_dcbx_app (size:64b/8B) */ +struct hwrm_struct_data_dcbx_app { + __be16 protocol_id; + u8 protocol_selector; + #define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_ETHER_TYPE 0x1UL + #define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_TCP_PORT 0x2UL + #define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_UDP_PORT 0x3UL + #define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_TCP_UDP_PORT 0x4UL + #define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_LAST STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_TCP_UDP_PORT + u8 priority; + u8 valid; + u8 unused_0[3]; +}; + +/* hwrm_fw_set_structured_data_input (size:256b/32B) */ +struct hwrm_fw_set_structured_data_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 src_data_addr; + __le16 data_len; + u8 hdr_cnt; + u8 unused_0[5]; +}; + +/* hwrm_fw_set_structured_data_output (size:128b/16B) */ +struct hwrm_fw_set_structured_data_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_fw_set_structured_data_cmd_err (size:64b/8B) */ +struct hwrm_fw_set_structured_data_cmd_err { + u8 code; 
+ #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_UNKNOWN 0x0UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_HDR_CNT 0x1UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_FMT 0x2UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID 0x3UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_LAST FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID + u8 unused_0[7]; +}; + +/* hwrm_fw_get_structured_data_input (size:256b/32B) */ +struct hwrm_fw_get_structured_data_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 dest_data_addr; + __le16 data_len; + __le16 structure_id; + __le16 subtype; + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_UNUSED 0x0UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_ALL 0xffffUL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NEAR_BRIDGE_ADMIN 0x100UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NEAR_BRIDGE_PEER 0x101UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NEAR_BRIDGE_OPERATIONAL 0x102UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_ADMIN 0x200UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_PEER 0x201UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_OPERATIONAL 0x202UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_HOST_OPERATIONAL 0x300UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_LAST FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_HOST_OPERATIONAL + u8 count; + u8 unused_0; +}; + +/* hwrm_fw_get_structured_data_output (size:128b/16B) */ +struct hwrm_fw_get_structured_data_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 hdr_cnt; + u8 unused_0[6]; + u8 valid; +}; + +/* hwrm_fw_get_structured_data_cmd_err (size:64b/8B) */ +struct hwrm_fw_get_structured_data_cmd_err { + u8 code; + #define FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_UNKNOWN 0x0UL + #define FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID 0x3UL + #define FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_LAST FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID + u8 unused_0[7]; +}; + +/* hwrm_exec_fwd_resp_input (size:1024b/128B) */ +struct hwrm_exec_fwd_resp_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 encap_request[26]; + __le16 encap_resp_target_id; + u8 unused_0[6]; +}; + +/* hwrm_exec_fwd_resp_output (size:128b/16B) */ +struct hwrm_exec_fwd_resp_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_reject_fwd_resp_input (size:1024b/128B) */ +struct hwrm_reject_fwd_resp_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 encap_request[26]; + __le16 encap_resp_target_id; + u8 unused_0[6]; +}; + +/* hwrm_reject_fwd_resp_output (size:128b/16B) */ +struct hwrm_reject_fwd_resp_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_fwd_resp_input (size:1024b/128B) */ +struct hwrm_fwd_resp_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 encap_resp_target_id; + __le16 encap_resp_cmpl_ring; + __le16 encap_resp_len; + u8 unused_0; + u8 unused_1; + __le64 encap_resp_addr; + __le32 encap_resp[24]; +}; + +/* hwrm_fwd_resp_output (size:128b/16B) */ +struct hwrm_fwd_resp_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_fwd_async_event_cmpl_input (size:320b/40B) */ +struct hwrm_fwd_async_event_cmpl_input { + __le16 req_type; + 
__le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 encap_async_event_target_id; + u8 unused_0[6]; + __le32 encap_async_event_cmpl[4]; +}; + +/* hwrm_fwd_async_event_cmpl_output (size:128b/16B) */ +struct hwrm_fwd_async_event_cmpl_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_temp_monitor_query_input (size:128b/16B) */ +struct hwrm_temp_monitor_query_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_temp_monitor_query_output (size:128b/16B) */ +struct hwrm_temp_monitor_query_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 temp; + u8 unused_0[6]; + u8 valid; +}; + +/* hwrm_wol_filter_alloc_input (size:512b/64B) */ +struct hwrm_wol_filter_alloc_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + __le32 enables; + #define WOL_FILTER_ALLOC_REQ_ENABLES_MAC_ADDRESS 0x1UL + #define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_OFFSET 0x2UL + #define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_BUF_SIZE 0x4UL + #define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_BUF_ADDR 0x8UL + #define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_MASK_ADDR 0x10UL + #define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_MASK_SIZE 0x20UL + __le16 port_id; + u8 wol_type; + #define WOL_FILTER_ALLOC_REQ_WOL_TYPE_MAGICPKT 0x0UL + #define WOL_FILTER_ALLOC_REQ_WOL_TYPE_BMP 0x1UL + #define WOL_FILTER_ALLOC_REQ_WOL_TYPE_INVALID 0xffUL + #define WOL_FILTER_ALLOC_REQ_WOL_TYPE_LAST WOL_FILTER_ALLOC_REQ_WOL_TYPE_INVALID + u8 unused_0[5]; + u8 mac_address[6]; + __le16 pattern_offset; + __le16 pattern_buf_size; + __le16 pattern_mask_size; + u8 unused_1[4]; + __le64 pattern_buf_addr; + __le64 pattern_mask_addr; +}; + +/* hwrm_wol_filter_alloc_output (size:128b/16B) */ +struct hwrm_wol_filter_alloc_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 wol_filter_id; + u8 unused_0[6]; + u8 valid; +}; + +/* hwrm_wol_filter_free_input (size:256b/32B) */ +struct hwrm_wol_filter_free_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 flags; + #define WOL_FILTER_FREE_REQ_FLAGS_FREE_ALL_WOL_FILTERS 0x1UL + __le32 enables; + #define WOL_FILTER_FREE_REQ_ENABLES_WOL_FILTER_ID 0x1UL + __le16 port_id; + u8 wol_filter_id; + u8 unused_0[5]; +}; + +/* hwrm_wol_filter_free_output (size:128b/16B) */ +struct hwrm_wol_filter_free_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_wol_filter_qcfg_input (size:448b/56B) */ +struct hwrm_wol_filter_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + __le16 handle; + u8 unused_0[4]; + __le64 pattern_buf_addr; + __le16 pattern_buf_size; + u8 unused_1[6]; + __le64 pattern_mask_addr; + __le16 pattern_mask_size; + u8 unused_2[6]; +}; + +/* hwrm_wol_filter_qcfg_output (size:256b/32B) */ +struct hwrm_wol_filter_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 next_handle; + u8 wol_filter_id; + u8 wol_type; + #define WOL_FILTER_QCFG_RESP_WOL_TYPE_MAGICPKT 0x0UL + #define WOL_FILTER_QCFG_RESP_WOL_TYPE_BMP 0x1UL + #define WOL_FILTER_QCFG_RESP_WOL_TYPE_INVALID 0xffUL + #define WOL_FILTER_QCFG_RESP_WOL_TYPE_LAST WOL_FILTER_QCFG_RESP_WOL_TYPE_INVALID + __le32 unused_0; 
+ u8 mac_address[6]; + __le16 pattern_offset; + __le16 pattern_size; + __le16 pattern_mask_size; + u8 unused_1[3]; + u8 valid; +}; + +/* hwrm_wol_reason_qcfg_input (size:320b/40B) */ +struct hwrm_wol_reason_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 port_id; + u8 unused_0[6]; + __le64 wol_pkt_buf_addr; + __le16 wol_pkt_buf_size; + u8 unused_1[6]; +}; + +/* hwrm_wol_reason_qcfg_output (size:128b/16B) */ +struct hwrm_wol_reason_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 wol_filter_id; + u8 wol_reason; + #define WOL_REASON_QCFG_RESP_WOL_REASON_MAGICPKT 0x0UL + #define WOL_REASON_QCFG_RESP_WOL_REASON_BMP 0x1UL + #define WOL_REASON_QCFG_RESP_WOL_REASON_INVALID 0xffUL + #define WOL_REASON_QCFG_RESP_WOL_REASON_LAST WOL_REASON_QCFG_RESP_WOL_REASON_INVALID + u8 wol_pkt_len; + u8 unused_0[4]; + u8 valid; +}; + +/* hwrm_nvm_read_input (size:320b/40B) */ +struct hwrm_nvm_read_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 host_dest_addr; + __le16 dir_idx; + u8 unused_0[2]; + __le32 offset; + __le32 len; + u8 unused_1[4]; +}; + +/* hwrm_nvm_read_output (size:128b/16B) */ +struct hwrm_nvm_read_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_nvm_get_dir_entries_input (size:192b/24B) */ +struct hwrm_nvm_get_dir_entries_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 host_dest_addr; +}; + +/* hwrm_nvm_get_dir_entries_output (size:128b/16B) */ +struct hwrm_nvm_get_dir_entries_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_nvm_get_dir_info_input (size:128b/16B) */ +struct hwrm_nvm_get_dir_info_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_nvm_get_dir_info_output (size:192b/24B) */ +struct hwrm_nvm_get_dir_info_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 entries; + __le32 entry_length; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_nvm_write_input (size:384b/48B) */ +struct hwrm_nvm_write_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 host_src_addr; + __le16 dir_type; + __le16 dir_ordinal; + __le16 dir_ext; + __le16 dir_attr; + __le32 dir_data_length; + __le16 option; + __le16 flags; + #define NVM_WRITE_REQ_FLAGS_KEEP_ORIG_ACTIVE_IMG 0x1UL + __le32 dir_item_length; + __le32 unused_0; +}; + +/* hwrm_nvm_write_output (size:128b/16B) */ +struct hwrm_nvm_write_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 dir_item_length; + __le16 dir_idx; + u8 unused_0; + u8 valid; +}; + +/* hwrm_nvm_write_cmd_err (size:64b/8B) */ +struct hwrm_nvm_write_cmd_err { + u8 code; + #define NVM_WRITE_CMD_ERR_CODE_UNKNOWN 0x0UL + #define NVM_WRITE_CMD_ERR_CODE_FRAG_ERR 0x1UL + #define NVM_WRITE_CMD_ERR_CODE_NO_SPACE 0x2UL + #define NVM_WRITE_CMD_ERR_CODE_LAST NVM_WRITE_CMD_ERR_CODE_NO_SPACE + u8 unused_0[7]; +}; + +/* hwrm_nvm_modify_input (size:320b/40B) */ +struct hwrm_nvm_modify_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 host_src_addr; + __le16 dir_idx; + u8 unused_0[2]; + __le32 offset; + __le32 len; + u8 unused_1[4]; 
+}; + +/* hwrm_nvm_modify_output (size:128b/16B) */ +struct hwrm_nvm_modify_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_nvm_find_dir_entry_input (size:256b/32B) */ +struct hwrm_nvm_find_dir_entry_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define NVM_FIND_DIR_ENTRY_REQ_ENABLES_DIR_IDX_VALID 0x1UL + __le16 dir_idx; + __le16 dir_type; + __le16 dir_ordinal; + __le16 dir_ext; + u8 opt_ordinal; + #define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_MASK 0x3UL + #define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_SFT 0 + #define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_EQ 0x0UL + #define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_GE 0x1UL + #define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_GT 0x2UL + #define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_LAST NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_GT + u8 unused_0[3]; +}; + +/* hwrm_nvm_find_dir_entry_output (size:256b/32B) */ +struct hwrm_nvm_find_dir_entry_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le32 dir_item_length; + __le32 dir_data_length; + __le32 fw_ver; + __le16 dir_ordinal; + __le16 dir_idx; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_nvm_erase_dir_entry_input (size:192b/24B) */ +struct hwrm_nvm_erase_dir_entry_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 dir_idx; + u8 unused_0[6]; +}; + +/* hwrm_nvm_erase_dir_entry_output (size:128b/16B) */ +struct hwrm_nvm_erase_dir_entry_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_nvm_get_dev_info_input (size:128b/16B) */ +struct hwrm_nvm_get_dev_info_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_nvm_get_dev_info_output (size:256b/32B) */ +struct hwrm_nvm_get_dev_info_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 manufacturer_id; + __le16 device_id; + __le32 sector_size; + __le32 nvram_size; + __le32 reserved_size; + __le32 available_size; + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_nvm_mod_dir_entry_input (size:256b/32B) */ +struct hwrm_nvm_mod_dir_entry_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 enables; + #define NVM_MOD_DIR_ENTRY_REQ_ENABLES_CHECKSUM 0x1UL + __le16 dir_idx; + __le16 dir_ordinal; + __le16 dir_ext; + __le16 dir_attr; + __le32 checksum; +}; + +/* hwrm_nvm_mod_dir_entry_output (size:128b/16B) */ +struct hwrm_nvm_mod_dir_entry_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_nvm_verify_update_input (size:192b/24B) */ +struct hwrm_nvm_verify_update_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 dir_type; + __le16 dir_ordinal; + __le16 dir_ext; + u8 unused_0[2]; +}; + +/* hwrm_nvm_verify_update_output (size:128b/16B) */ +struct hwrm_nvm_verify_update_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_nvm_install_update_input (size:192b/24B) */ +struct hwrm_nvm_install_update_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le32 install_type; + #define NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_NORMAL 0x0UL + #define 
NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_ALL 0xffffffffUL + #define NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_LAST NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_ALL + __le16 flags; + #define NVM_INSTALL_UPDATE_REQ_FLAGS_ERASE_UNUSED_SPACE 0x1UL + #define NVM_INSTALL_UPDATE_REQ_FLAGS_REMOVE_UNUSED_PKG 0x2UL + #define NVM_INSTALL_UPDATE_REQ_FLAGS_ALLOWED_TO_DEFRAG 0x4UL + u8 unused_0[2]; +}; + +/* hwrm_nvm_install_update_output (size:192b/24B) */ +struct hwrm_nvm_install_update_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le64 installed_items; + u8 result; + #define NVM_INSTALL_UPDATE_RESP_RESULT_SUCCESS 0x0UL + #define NVM_INSTALL_UPDATE_RESP_RESULT_LAST NVM_INSTALL_UPDATE_RESP_RESULT_SUCCESS + u8 problem_item; + #define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_NONE 0x0UL + #define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_PACKAGE 0xffUL + #define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_LAST NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_PACKAGE + u8 reset_required; + #define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_NONE 0x0UL + #define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_PCI 0x1UL + #define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_POWER 0x2UL + #define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_LAST NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_POWER + u8 unused_0[4]; + u8 valid; +}; + +/* hwrm_nvm_install_update_cmd_err (size:64b/8B) */ +struct hwrm_nvm_install_update_cmd_err { + u8 code; + #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_UNKNOWN 0x0UL + #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_FRAG_ERR 0x1UL + #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_SPACE 0x2UL + #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_LAST NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_SPACE + u8 unused_0[7]; +}; + +/* hwrm_nvm_get_variable_input (size:320b/40B) */ +struct hwrm_nvm_get_variable_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 dest_data_addr; + __le16 data_len; + __le16 option_num; + #define NVM_GET_VARIABLE_REQ_OPTION_NUM_RSVD_0 0x0UL + #define NVM_GET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF 0xffffUL + #define NVM_GET_VARIABLE_REQ_OPTION_NUM_LAST NVM_GET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF + __le16 dimensions; + __le16 index_0; + __le16 index_1; + __le16 index_2; + __le16 index_3; + u8 flags; + #define NVM_GET_VARIABLE_REQ_FLAGS_FACTORY_DFLT 0x1UL + u8 unused_0; +}; + +/* hwrm_nvm_get_variable_output (size:128b/16B) */ +struct hwrm_nvm_get_variable_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 data_len; + __le16 option_num; + #define NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_0 0x0UL + #define NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_FFFF 0xffffUL + #define NVM_GET_VARIABLE_RESP_OPTION_NUM_LAST NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_FFFF + u8 unused_0[3]; + u8 valid; +}; + +/* hwrm_nvm_get_variable_cmd_err (size:64b/8B) */ +struct hwrm_nvm_get_variable_cmd_err { + u8 code; + #define NVM_GET_VARIABLE_CMD_ERR_CODE_UNKNOWN 0x0UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR 0x2UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT 0x3UL + #define NVM_GET_VARIABLE_CMD_ERR_CODE_LAST NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT + u8 unused_0[7]; +}; + +/* hwrm_nvm_set_variable_input (size:320b/40B) */ +struct hwrm_nvm_set_variable_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le64 src_data_addr; + __le16 data_len; + __le16 option_num; + #define NVM_SET_VARIABLE_REQ_OPTION_NUM_RSVD_0 0x0UL + #define 
NVM_SET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF 0xffffUL + #define NVM_SET_VARIABLE_REQ_OPTION_NUM_LAST NVM_SET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF + __le16 dimensions; + __le16 index_0; + __le16 index_1; + __le16 index_2; + __le16 index_3; + u8 flags; + #define NVM_SET_VARIABLE_REQ_FLAGS_FORCE_FLUSH 0x1UL + #define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_MASK 0xeUL + #define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_SFT 1 + #define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_NONE (0x0UL << 1) + #define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_HMAC_SHA1 (0x1UL << 1) + #define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_LAST NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_HMAC_SHA1 + u8 unused_0; +}; + +/* hwrm_nvm_set_variable_output (size:128b/16B) */ +struct hwrm_nvm_set_variable_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +/* hwrm_nvm_set_variable_cmd_err (size:64b/8B) */ +struct hwrm_nvm_set_variable_cmd_err { + u8 code; + #define NVM_SET_VARIABLE_CMD_ERR_CODE_UNKNOWN 0x0UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR 0x2UL + #define NVM_SET_VARIABLE_CMD_ERR_CODE_LAST NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR + u8 unused_0[7]; +}; + +/* hwrm_selftest_qlist_input (size:128b/16B) */ +struct hwrm_selftest_qlist_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_selftest_qlist_output (size:2240b/280B) */ +struct hwrm_selftest_qlist_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 num_tests; + u8 available_tests; + #define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_NVM_TEST 0x1UL + #define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_LINK_TEST 0x2UL + #define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_REGISTER_TEST 0x4UL + #define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_MEMORY_TEST 0x8UL + #define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_PCIE_SERDES_TEST 0x10UL + #define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_ETHERNET_SERDES_TEST 0x20UL + u8 offline_tests; + #define SELFTEST_QLIST_RESP_OFFLINE_TESTS_NVM_TEST 0x1UL + #define SELFTEST_QLIST_RESP_OFFLINE_TESTS_LINK_TEST 0x2UL + #define SELFTEST_QLIST_RESP_OFFLINE_TESTS_REGISTER_TEST 0x4UL + #define SELFTEST_QLIST_RESP_OFFLINE_TESTS_MEMORY_TEST 0x8UL + #define SELFTEST_QLIST_RESP_OFFLINE_TESTS_PCIE_SERDES_TEST 0x10UL + #define SELFTEST_QLIST_RESP_OFFLINE_TESTS_ETHERNET_SERDES_TEST 0x20UL + u8 unused_0; + __le16 test_timeout; + u8 unused_1[2]; + char test0_name[32]; + char test1_name[32]; + char test2_name[32]; + char test3_name[32]; + char test4_name[32]; + char test5_name[32]; + char test6_name[32]; + char test7_name[32]; + u8 unused_2[7]; + u8 valid; +}; + +/* hwrm_selftest_exec_input (size:192b/24B) */ +struct hwrm_selftest_exec_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + u8 flags; + #define SELFTEST_EXEC_REQ_FLAGS_NVM_TEST 0x1UL + #define SELFTEST_EXEC_REQ_FLAGS_LINK_TEST 0x2UL + #define SELFTEST_EXEC_REQ_FLAGS_REGISTER_TEST 0x4UL + #define SELFTEST_EXEC_REQ_FLAGS_MEMORY_TEST 0x8UL + #define SELFTEST_EXEC_REQ_FLAGS_PCIE_SERDES_TEST 0x10UL + #define SELFTEST_EXEC_REQ_FLAGS_ETHERNET_SERDES_TEST 0x20UL + u8 unused_0[7]; +}; + +/* hwrm_selftest_exec_output (size:128b/16B) */ +struct hwrm_selftest_exec_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 requested_tests; + #define SELFTEST_EXEC_RESP_REQUESTED_TESTS_NVM_TEST 0x1UL + #define 
SELFTEST_EXEC_RESP_REQUESTED_TESTS_LINK_TEST 0x2UL + #define SELFTEST_EXEC_RESP_REQUESTED_TESTS_REGISTER_TEST 0x4UL + #define SELFTEST_EXEC_RESP_REQUESTED_TESTS_MEMORY_TEST 0x8UL + #define SELFTEST_EXEC_RESP_REQUESTED_TESTS_PCIE_SERDES_TEST 0x10UL + #define SELFTEST_EXEC_RESP_REQUESTED_TESTS_ETHERNET_SERDES_TEST 0x20UL + u8 test_success; + #define SELFTEST_EXEC_RESP_TEST_SUCCESS_NVM_TEST 0x1UL + #define SELFTEST_EXEC_RESP_TEST_SUCCESS_LINK_TEST 0x2UL + #define SELFTEST_EXEC_RESP_TEST_SUCCESS_REGISTER_TEST 0x4UL + #define SELFTEST_EXEC_RESP_TEST_SUCCESS_MEMORY_TEST 0x8UL + #define SELFTEST_EXEC_RESP_TEST_SUCCESS_PCIE_SERDES_TEST 0x10UL + #define SELFTEST_EXEC_RESP_TEST_SUCCESS_ETHERNET_SERDES_TEST 0x20UL + u8 unused_0[5]; + u8 valid; +}; + +/* hwrm_selftest_irq_input (size:128b/16B) */ +struct hwrm_selftest_irq_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_selftest_irq_output (size:128b/16B) */ +struct hwrm_selftest_irq_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + +#endif /* _BNXT_HSI_H_ */ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_nvm_defs.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_nvm_defs.h new file mode 100644 index 0000000..8344481 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_nvm_defs.h @@ -0,0 +1,72 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#ifndef _BNXT_NVM_DEFS_H_ +#define _BNXT_NVM_DEFS_H_ + +enum bnxt_nvm_directory_type { + BNX_DIR_TYPE_UNUSED = 0, + BNX_DIR_TYPE_PKG_LOG = 1, + BNX_DIR_TYPE_UPDATE = 2, + BNX_DIR_TYPE_CHIMP_PATCH = 3, + BNX_DIR_TYPE_BOOTCODE = 4, + BNX_DIR_TYPE_VPD = 5, + BNX_DIR_TYPE_EXP_ROM_MBA = 6, + BNX_DIR_TYPE_AVS = 7, + BNX_DIR_TYPE_PCIE = 8, + BNX_DIR_TYPE_PORT_MACRO = 9, + BNX_DIR_TYPE_APE_FW = 10, + BNX_DIR_TYPE_APE_PATCH = 11, + BNX_DIR_TYPE_KONG_FW = 12, + BNX_DIR_TYPE_KONG_PATCH = 13, + BNX_DIR_TYPE_BONO_FW = 14, + BNX_DIR_TYPE_BONO_PATCH = 15, + BNX_DIR_TYPE_TANG_FW = 16, + BNX_DIR_TYPE_TANG_PATCH = 17, + BNX_DIR_TYPE_BOOTCODE_2 = 18, + BNX_DIR_TYPE_CCM = 19, + BNX_DIR_TYPE_PCI_CFG = 20, + BNX_DIR_TYPE_TSCF_UCODE = 21, + BNX_DIR_TYPE_ISCSI_BOOT = 22, + BNX_DIR_TYPE_ISCSI_BOOT_IPV6 = 24, + BNX_DIR_TYPE_ISCSI_BOOT_IPV4N6 = 25, + BNX_DIR_TYPE_ISCSI_BOOT_CFG6 = 26, + BNX_DIR_TYPE_EXT_PHY = 27, + BNX_DIR_TYPE_SHARED_CFG = 40, + BNX_DIR_TYPE_PORT_CFG = 41, + BNX_DIR_TYPE_FUNC_CFG = 42, + BNX_DIR_TYPE_MGMT_CFG = 48, + BNX_DIR_TYPE_MGMT_DATA = 49, + BNX_DIR_TYPE_MGMT_WEB_DATA = 50, + BNX_DIR_TYPE_MGMT_WEB_META = 51, + BNX_DIR_TYPE_MGMT_EVENT_LOG = 52, + BNX_DIR_TYPE_MGMT_AUDIT_LOG = 53 +}; + +#define BNX_DIR_ORDINAL_FIRST 0 + +#define BNX_DIR_EXT_NONE 0 +#define BNX_DIR_EXT_INACTIVE (1 << 0) +#define BNX_DIR_EXT_UPDATE (1 << 1) + +#define BNX_DIR_ATTR_NONE 0 +#define BNX_DIR_ATTR_NO_CHKSUM (1 << 0) +#define BNX_DIR_ATTR_PROP_STREAM (1 << 1) + +enum bnxnvm_pkglog_field_index { + BNX_PKG_LOG_FIELD_IDX_INSTALLED_TIMESTAMP = 0, + BNX_PKG_LOG_FIELD_IDX_PKG_DESCRIPTION = 1, + BNX_PKG_LOG_FIELD_IDX_PKG_VERSION = 2, + BNX_PKG_LOG_FIELD_IDX_PKG_TIMESTAMP = 3, + BNX_PKG_LOG_FIELD_IDX_PKG_CHECKSUM = 4, + BNX_PKG_LOG_FIELD_IDX_INSTALLED_ITEMS = 5, + BNX_PKG_LOG_FIELD_IDX_INSTALLED_MASK = 6 +}; + +#endif /* Don't add anything after this line */ diff --git 
a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c new file mode 100644 index 0000000..f952963 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -0,0 +1,1131 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * Copyright (c) 2016-2018 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/netdevice.h> +#include <linux/if_vlan.h> +#include <linux/interrupt.h> +#include <linux/etherdevice.h> +#include "bnxt_hsi.h" +#include "bnxt.h" +#include "bnxt_ulp.h" +#include "bnxt_sriov.h" +#include "bnxt_vfr.h" +#include "bnxt_ethtool.h" + +#ifdef CONFIG_BNXT_SRIOV +static int bnxt_hwrm_fwd_async_event_cmpl(struct bnxt *bp, + struct bnxt_vf_info *vf, u16 event_id) +{ + struct hwrm_fwd_async_event_cmpl_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_fwd_async_event_cmpl_input req = {0}; + struct hwrm_async_event_cmpl *async_cmpl; + int rc = 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FWD_ASYNC_EVENT_CMPL, -1, -1); + if (vf) + req.encap_async_event_target_id = cpu_to_le16(vf->fw_fid); + else + /* broadcast this async event to all VFs */ + req.encap_async_event_target_id = cpu_to_le16(0xffff); + async_cmpl = (struct hwrm_async_event_cmpl *)req.encap_async_event_cmpl; + async_cmpl->type = cpu_to_le16(ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT); + async_cmpl->event_id = cpu_to_le16(event_id); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + + if (rc) { + netdev_err(bp->dev, "hwrm_fwd_async_event_cmpl failed. rc:%d\n", + rc); + goto fwd_async_event_cmpl_exit; + } + + if (resp->error_code) { + netdev_err(bp->dev, "hwrm_fwd_async_event_cmpl error %d\n", + resp->error_code); + rc = -1; + } + +fwd_async_event_cmpl_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_vf_ndo_prep(struct bnxt *bp, int vf_id) +{ + if (!test_bit(BNXT_STATE_OPEN, &bp->state)) { + netdev_err(bp->dev, "vf ndo called though PF is down\n"); + return -EINVAL; + } + if (!bp->pf.active_vfs) { + netdev_err(bp->dev, "vf ndo called though sriov is disabled\n"); + return -EINVAL; + } + if (vf_id >= bp->pf.active_vfs) { + netdev_err(bp->dev, "Invalid VF id %d\n", vf_id); + return -EINVAL; + } + return 0; +} + +int bnxt_set_vf_spoofchk(struct net_device *dev, int vf_id, bool setting) +{ + struct hwrm_func_cfg_input req = {0}; + struct bnxt *bp = netdev_priv(dev); + struct bnxt_vf_info *vf; + bool old_setting = false; + u32 func_flags; + int rc; + + if (bp->hwrm_spec_code < 0x10701) + return -ENOTSUPP; + + rc = bnxt_vf_ndo_prep(bp, vf_id); + if (rc) + return rc; + + vf = &bp->pf.vf[vf_id]; + if (vf->flags & BNXT_VF_SPOOFCHK) + old_setting = true; + if (old_setting == setting) + return 0; + + func_flags = vf->func_flags; + if (setting) + func_flags |= FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_ENABLE; + else + func_flags |= FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_DISABLE; + /*TODO: if the driver supports VLAN filter on guest VLAN, + * the spoof check should also include vlan anti-spoofing + */ + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); + req.fid = cpu_to_le16(vf->fw_fid); + req.flags = cpu_to_le32(func_flags); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + vf->func_flags = func_flags; + if (setting) + vf->flags |= BNXT_VF_SPOOFCHK; + else + vf->flags &= ~BNXT_VF_SPOOFCHK; + } + return rc; +} +
+int bnxt_set_vf_trust(struct net_device *dev, int vf_id, bool trusted) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_vf_info *vf; + + if (bnxt_vf_ndo_prep(bp, vf_id)) + return -EINVAL; + + vf = &bp->pf.vf[vf_id]; + if (trusted) + vf->flags |= BNXT_VF_TRUST; + else + vf->flags &= ~BNXT_VF_TRUST; + + return 0; +} + +int bnxt_get_vf_config(struct net_device *dev, int vf_id, + struct ifla_vf_info *ivi) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_vf_info *vf; + int rc; + + rc = bnxt_vf_ndo_prep(bp, vf_id); + if (rc) + return rc; + + ivi->vf = vf_id; + vf = &bp->pf.vf[vf_id]; + + if (is_valid_ether_addr(vf->mac_addr)) + memcpy(&ivi->mac, vf->mac_addr, ETH_ALEN); + else + memcpy(&ivi->mac, vf->vf_mac_addr, ETH_ALEN); + ivi->max_tx_rate = vf->max_tx_rate; + ivi->min_tx_rate = vf->min_tx_rate; + ivi->vlan = vf->vlan; + if (vf->flags & BNXT_VF_QOS) + ivi->qos = vf->vlan >> VLAN_PRIO_SHIFT; + else + ivi->qos = 0; + ivi->spoofchk = !!(vf->flags & BNXT_VF_SPOOFCHK); + ivi->trusted = !!(vf->flags & BNXT_VF_TRUST); + if (!(vf->flags & BNXT_VF_LINK_FORCED)) + ivi->linkstate = IFLA_VF_LINK_STATE_AUTO; + else if (vf->flags & BNXT_VF_LINK_UP) + ivi->linkstate = IFLA_VF_LINK_STATE_ENABLE; + else + ivi->linkstate = IFLA_VF_LINK_STATE_DISABLE; + + return 0; +} + +int bnxt_set_vf_mac(struct net_device *dev, int vf_id, u8 *mac) +{ + struct hwrm_func_cfg_input req = {0}; + struct bnxt *bp = netdev_priv(dev); + struct bnxt_vf_info *vf; + int rc; + + rc = bnxt_vf_ndo_prep(bp, vf_id); + if (rc) + return rc; + /* reject bc or mc mac addr, zero mac addr means allow + * VF to use its own mac addr + */ + if (is_multicast_ether_addr(mac)) { + netdev_err(dev, "Invalid VF ethernet address\n"); + return -EINVAL; + } + vf = &bp->pf.vf[vf_id]; + + memcpy(vf->mac_addr, mac, ETH_ALEN); + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); + req.fid = cpu_to_le16(vf->fw_fid); + req.flags = cpu_to_le32(vf->func_flags); + req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_DFLT_MAC_ADDR); + memcpy(req.dflt_mac_addr, mac, ETH_ALEN); + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos, + __be16 vlan_proto) +{ + struct hwrm_func_cfg_input req = {0}; + struct bnxt *bp = netdev_priv(dev); + struct bnxt_vf_info *vf; + u16 vlan_tag; + int rc; + + if (bp->hwrm_spec_code < 0x10201) + return -ENOTSUPP; + + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + + rc = bnxt_vf_ndo_prep(bp, vf_id); + if (rc) + return rc; + + /* TODO: needed to implement proper handling of user priority, + * currently fail the command if there is valid priority + */ + if (vlan_id > 4095 || qos) + return -EINVAL; + + vf = &bp->pf.vf[vf_id]; + vlan_tag = vlan_id; + if (vlan_tag == vf->vlan) + return 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); + req.fid = cpu_to_le16(vf->fw_fid); + req.flags = cpu_to_le32(vf->func_flags); + req.dflt_vlan = cpu_to_le16(vlan_tag); + req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_DFLT_VLAN); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) + vf->vlan = vlan_tag; + return rc; +} + +int bnxt_set_vf_bw(struct net_device *dev, int vf_id, int min_tx_rate, + int max_tx_rate) +{ + struct hwrm_func_cfg_input req = {0}; + struct bnxt *bp = netdev_priv(dev); + struct bnxt_vf_info *vf; + u32 pf_link_speed; + int rc; + + rc = bnxt_vf_ndo_prep(bp, vf_id); + if (rc) + return rc; + + vf = &bp->pf.vf[vf_id]; + pf_link_speed = 
bnxt_fw_to_ethtool_speed(bp->link_info.link_speed); + if (max_tx_rate > pf_link_speed) { + netdev_info(bp->dev, "max tx rate %d exceed PF link speed for VF %d\n", + max_tx_rate, vf_id); + return -EINVAL; + } + + if (min_tx_rate > pf_link_speed || min_tx_rate > max_tx_rate) { + netdev_info(bp->dev, "min tx rate %d is invalid for VF %d\n", + min_tx_rate, vf_id); + return -EINVAL; + } + if (min_tx_rate == vf->min_tx_rate && max_tx_rate == vf->max_tx_rate) + return 0; + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); + req.fid = cpu_to_le16(vf->fw_fid); + req.flags = cpu_to_le32(vf->func_flags); + req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_MAX_BW); + req.max_bw = cpu_to_le32(max_tx_rate); + req.enables |= cpu_to_le32(FUNC_CFG_REQ_ENABLES_MIN_BW); + req.min_bw = cpu_to_le32(min_tx_rate); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + vf->min_tx_rate = min_tx_rate; + vf->max_tx_rate = max_tx_rate; + } + return rc; +} + +int bnxt_set_vf_link_state(struct net_device *dev, int vf_id, int link) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_vf_info *vf; + int rc; + + rc = bnxt_vf_ndo_prep(bp, vf_id); + if (rc) + return rc; + + vf = &bp->pf.vf[vf_id]; + + vf->flags &= ~(BNXT_VF_LINK_UP | BNXT_VF_LINK_FORCED); + switch (link) { + case IFLA_VF_LINK_STATE_AUTO: + vf->flags |= BNXT_VF_LINK_UP; + break; + case IFLA_VF_LINK_STATE_DISABLE: + vf->flags |= BNXT_VF_LINK_FORCED; + break; + case IFLA_VF_LINK_STATE_ENABLE: + vf->flags |= BNXT_VF_LINK_UP | BNXT_VF_LINK_FORCED; + break; + default: + netdev_err(bp->dev, "Invalid link option\n"); + rc = -EINVAL; + break; + } + if (vf->flags & (BNXT_VF_LINK_UP | BNXT_VF_LINK_FORCED)) + rc = bnxt_hwrm_fwd_async_event_cmpl(bp, vf, + ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE); + return rc; +} + +static int bnxt_set_vf_attr(struct bnxt *bp, int num_vfs) +{ + int i; + struct bnxt_vf_info *vf; + + for (i = 0; i < num_vfs; i++) { + vf = &bp->pf.vf[i]; + memset(vf, 0, sizeof(*vf)); + } + return 0; +} + +static int bnxt_hwrm_func_vf_resource_free(struct bnxt *bp, int num_vfs) +{ + int i, rc = 0; + struct bnxt_pf_info *pf = &bp->pf; + struct hwrm_func_vf_resc_free_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_RESC_FREE, -1, -1); + + mutex_lock(&bp->hwrm_cmd_lock); + for (i = pf->first_vf_id; i < pf->first_vf_id + num_vfs; i++) { + req.vf_id = cpu_to_le16(i); + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + if (rc) + break; + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static void bnxt_free_vf_resources(struct bnxt *bp) +{ + struct pci_dev *pdev = bp->pdev; + int i; + + kfree(bp->pf.vf_event_bmap); + bp->pf.vf_event_bmap = NULL; + + for (i = 0; i < 4; i++) { + if (bp->pf.hwrm_cmd_req_addr[i]) { + dma_free_coherent(&pdev->dev, BNXT_PAGE_SIZE, + bp->pf.hwrm_cmd_req_addr[i], + bp->pf.hwrm_cmd_req_dma_addr[i]); + bp->pf.hwrm_cmd_req_addr[i] = NULL; + } + } + + kfree(bp->pf.vf); + bp->pf.vf = NULL; +} + +static int bnxt_alloc_vf_resources(struct bnxt *bp, int num_vfs) +{ + struct pci_dev *pdev = bp->pdev; + u32 nr_pages, size, i, j, k = 0; + + bp->pf.vf = kcalloc(num_vfs, sizeof(struct bnxt_vf_info), GFP_KERNEL); + if (!bp->pf.vf) + return -ENOMEM; + + bnxt_set_vf_attr(bp, num_vfs); + + size = num_vfs * BNXT_HWRM_REQ_MAX_SIZE; + nr_pages = size / BNXT_PAGE_SIZE; + if (size & (BNXT_PAGE_SIZE - 1)) + nr_pages++; + + for (i = 0; i < nr_pages; i++) { + bp->pf.hwrm_cmd_req_addr[i] = + dma_alloc_coherent(&pdev->dev, BNXT_PAGE_SIZE, + &bp->pf.hwrm_cmd_req_dma_addr[i], + 
GFP_KERNEL); + + if (!bp->pf.hwrm_cmd_req_addr[i]) + return -ENOMEM; + + for (j = 0; j < BNXT_HWRM_REQS_PER_PAGE && k < num_vfs; j++) { + struct bnxt_vf_info *vf = &bp->pf.vf[k]; + + vf->hwrm_cmd_req_addr = bp->pf.hwrm_cmd_req_addr[i] + + j * BNXT_HWRM_REQ_MAX_SIZE; + vf->hwrm_cmd_req_dma_addr = + bp->pf.hwrm_cmd_req_dma_addr[i] + j * + BNXT_HWRM_REQ_MAX_SIZE; + k++; + } + } + + /* Max 128 VF's */ + bp->pf.vf_event_bmap = kzalloc(16, GFP_KERNEL); + if (!bp->pf.vf_event_bmap) + return -ENOMEM; + + bp->pf.hwrm_cmd_req_pages = nr_pages; + return 0; +} + +static int bnxt_hwrm_func_buf_rgtr(struct bnxt *bp) +{ + struct hwrm_func_buf_rgtr_input req = {0}; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_BUF_RGTR, -1, -1); + + req.req_buf_num_pages = cpu_to_le16(bp->pf.hwrm_cmd_req_pages); + req.req_buf_page_size = cpu_to_le16(BNXT_PAGE_SHIFT); + req.req_buf_len = cpu_to_le16(BNXT_HWRM_REQ_MAX_SIZE); + req.req_buf_page_addr0 = cpu_to_le64(bp->pf.hwrm_cmd_req_dma_addr[0]); + req.req_buf_page_addr1 = cpu_to_le64(bp->pf.hwrm_cmd_req_dma_addr[1]); + req.req_buf_page_addr2 = cpu_to_le64(bp->pf.hwrm_cmd_req_dma_addr[2]); + req.req_buf_page_addr3 = cpu_to_le64(bp->pf.hwrm_cmd_req_dma_addr[3]); + + return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +} + +/* Only called by PF to reserve resources for VFs, returns actual number of + * VFs configured, or < 0 on error. + */ +static int bnxt_hwrm_func_vf_resc_cfg(struct bnxt *bp, int num_vfs) +{ + struct hwrm_func_vf_resource_cfg_input req = {0}; + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + u16 vf_tx_rings, vf_rx_rings, vf_cp_rings; + u16 vf_stat_ctx, vf_vnics, vf_ring_grps; + struct bnxt_pf_info *pf = &bp->pf; + int i, rc = 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_RESOURCE_CFG, -1, -1); + + vf_cp_rings = hw_resc->max_cp_rings - bp->cp_nr_rings; + vf_stat_ctx = hw_resc->max_stat_ctxs - bp->num_stat_ctxs; + if (bp->flags & BNXT_FLAG_AGG_RINGS) + vf_rx_rings = hw_resc->max_rx_rings - bp->rx_nr_rings * 2; + else + vf_rx_rings = hw_resc->max_rx_rings - bp->rx_nr_rings; + vf_ring_grps = hw_resc->max_hw_ring_grps - bp->rx_nr_rings; + vf_tx_rings = hw_resc->max_tx_rings - bp->tx_nr_rings; + vf_vnics = hw_resc->max_vnics - bp->nr_vnics; + vf_vnics = min_t(u16, vf_vnics, vf_rx_rings); + + req.min_rsscos_ctx = cpu_to_le16(1); + req.max_rsscos_ctx = cpu_to_le16(1); + if (pf->vf_resv_strategy == BNXT_VF_RESV_STRATEGY_MINIMAL) { + req.min_cmpl_rings = cpu_to_le16(1); + req.min_tx_rings = cpu_to_le16(1); + req.min_rx_rings = cpu_to_le16(1); + req.min_l2_ctxs = cpu_to_le16(1); + req.min_vnics = cpu_to_le16(1); + req.min_stat_ctx = cpu_to_le16(1); + req.min_hw_ring_grps = cpu_to_le16(1); + } else { + vf_cp_rings /= num_vfs; + vf_tx_rings /= num_vfs; + vf_rx_rings /= num_vfs; + vf_vnics /= num_vfs; + vf_stat_ctx /= num_vfs; + vf_ring_grps /= num_vfs; + + req.min_cmpl_rings = cpu_to_le16(vf_cp_rings); + req.min_tx_rings = cpu_to_le16(vf_tx_rings); + req.min_rx_rings = cpu_to_le16(vf_rx_rings); + req.min_l2_ctxs = cpu_to_le16(4); + req.min_vnics = cpu_to_le16(vf_vnics); + req.min_stat_ctx = cpu_to_le16(vf_stat_ctx); + req.min_hw_ring_grps = cpu_to_le16(vf_ring_grps); + } + req.max_cmpl_rings = cpu_to_le16(vf_cp_rings); + req.max_tx_rings = cpu_to_le16(vf_tx_rings); + req.max_rx_rings = cpu_to_le16(vf_rx_rings); + req.max_l2_ctxs = cpu_to_le16(4); + req.max_vnics = cpu_to_le16(vf_vnics); + req.max_stat_ctx = cpu_to_le16(vf_stat_ctx); + req.max_hw_ring_grps = cpu_to_le16(vf_ring_grps); + + mutex_lock(&bp->hwrm_cmd_lock); + for (i = 0; i < num_vfs; 
i++) { + req.vf_id = cpu_to_le16(pf->first_vf_id + i); + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + if (rc) { + rc = -ENOMEM; + break; + } + pf->active_vfs = i + 1; + pf->vf[i].fw_fid = pf->first_vf_id + i; + } + mutex_unlock(&bp->hwrm_cmd_lock); + if (pf->active_vfs) { + u16 n = pf->active_vfs; + + hw_resc->max_tx_rings -= le16_to_cpu(req.min_tx_rings) * n; + hw_resc->max_rx_rings -= le16_to_cpu(req.min_rx_rings) * n; + hw_resc->max_hw_ring_grps -= le16_to_cpu(req.min_hw_ring_grps) * + n; + hw_resc->max_cp_rings -= le16_to_cpu(req.min_cmpl_rings) * n; + hw_resc->max_rsscos_ctxs -= pf->active_vfs; + hw_resc->max_stat_ctxs -= le16_to_cpu(req.min_stat_ctx) * n; + hw_resc->max_vnics -= le16_to_cpu(req.min_vnics) * n; + + rc = pf->active_vfs; + } + return rc; +} + +/* Only called by PF to reserve resources for VFs, returns actual number of + * VFs configured, or < 0 on error. + */ +static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs) +{ + u32 rc = 0, mtu, i; + u16 vf_tx_rings, vf_rx_rings, vf_cp_rings, vf_stat_ctx, vf_vnics; + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + u16 vf_ring_grps, max_stat_ctxs; + struct hwrm_func_cfg_input req = {0}; + struct bnxt_pf_info *pf = &bp->pf; + int total_vf_tx_rings = 0; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); + + max_stat_ctxs = hw_resc->max_stat_ctxs; + + /* Remaining rings are distributed equally amongs VF's for now */ + vf_cp_rings = (hw_resc->max_cp_rings - bp->cp_nr_rings) / num_vfs; + vf_stat_ctx = (max_stat_ctxs - bp->num_stat_ctxs) / num_vfs; + if (bp->flags & BNXT_FLAG_AGG_RINGS) + vf_rx_rings = (hw_resc->max_rx_rings - bp->rx_nr_rings * 2) / + num_vfs; + else + vf_rx_rings = (hw_resc->max_rx_rings - bp->rx_nr_rings) / + num_vfs; + vf_ring_grps = (hw_resc->max_hw_ring_grps - bp->rx_nr_rings) / num_vfs; + vf_tx_rings = (hw_resc->max_tx_rings - bp->tx_nr_rings) / num_vfs; + vf_vnics = (hw_resc->max_vnics - bp->nr_vnics) / num_vfs; + vf_vnics = min_t(u16, vf_vnics, vf_rx_rings); + + req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_MTU | + FUNC_CFG_REQ_ENABLES_MRU | + FUNC_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS | + FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS | + FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS | + FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS | + FUNC_CFG_REQ_ENABLES_NUM_RX_RINGS | + FUNC_CFG_REQ_ENABLES_NUM_L2_CTXS | + FUNC_CFG_REQ_ENABLES_NUM_VNICS | + FUNC_CFG_REQ_ENABLES_NUM_HW_RING_GRPS); + + mtu = bp->dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; + req.mru = cpu_to_le16(mtu); + req.mtu = cpu_to_le16(mtu); + + req.num_rsscos_ctxs = cpu_to_le16(1); + req.num_cmpl_rings = cpu_to_le16(vf_cp_rings); + req.num_tx_rings = cpu_to_le16(vf_tx_rings); + req.num_rx_rings = cpu_to_le16(vf_rx_rings); + req.num_hw_ring_grps = cpu_to_le16(vf_ring_grps); + req.num_l2_ctxs = cpu_to_le16(4); + + req.num_vnics = cpu_to_le16(vf_vnics); + /* FIXME spec currently uses 1 bit for stats ctx */ + req.num_stat_ctxs = cpu_to_le16(vf_stat_ctx); + + mutex_lock(&bp->hwrm_cmd_lock); + for (i = 0; i < num_vfs; i++) { + int vf_tx_rsvd = vf_tx_rings; + + req.fid = cpu_to_le16(pf->first_vf_id + i); + rc = _hwrm_send_message(bp, &req, sizeof(req), + HWRM_CMD_TIMEOUT); + if (rc) + break; + pf->active_vfs = i + 1; + pf->vf[i].fw_fid = le16_to_cpu(req.fid); + rc = __bnxt_hwrm_get_tx_rings(bp, pf->vf[i].fw_fid, + &vf_tx_rsvd); + if (rc) + break; + total_vf_tx_rings += vf_tx_rsvd; + } + mutex_unlock(&bp->hwrm_cmd_lock); + if (rc) + rc = -ENOMEM; + if (pf->active_vfs) { + hw_resc->max_tx_rings -= total_vf_tx_rings; + hw_resc->max_rx_rings -= vf_rx_rings * 
num_vfs; + hw_resc->max_hw_ring_grps -= vf_ring_grps * num_vfs; + hw_resc->max_cp_rings -= vf_cp_rings * num_vfs; + hw_resc->max_rsscos_ctxs -= num_vfs; + hw_resc->max_stat_ctxs -= vf_stat_ctx * num_vfs; + hw_resc->max_vnics -= vf_vnics * num_vfs; + rc = pf->active_vfs; + } + return rc; +} + +static int bnxt_func_cfg(struct bnxt *bp, int num_vfs) +{ + if (bp->flags & BNXT_FLAG_NEW_RM) + return bnxt_hwrm_func_vf_resc_cfg(bp, num_vfs); + else + return bnxt_hwrm_func_cfg(bp, num_vfs); +} + +static int bnxt_sriov_enable(struct bnxt *bp, int *num_vfs) +{ + int rc = 0, vfs_supported; + int min_rx_rings, min_tx_rings, min_rss_ctxs; + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + int tx_ok = 0, rx_ok = 0, rss_ok = 0; + int avail_cp, avail_stat; + + /* Check if we can enable requested num of vf's. At a mininum + * we require 1 RX 1 TX rings for each VF. In this minimum conf + * features like TPA will not be available. + */ + vfs_supported = *num_vfs; + + avail_cp = hw_resc->max_cp_rings - bp->cp_nr_rings; + avail_stat = hw_resc->max_stat_ctxs - bp->num_stat_ctxs; + avail_cp = min_t(int, avail_cp, avail_stat); + + while (vfs_supported) { + min_rx_rings = vfs_supported; + min_tx_rings = vfs_supported; + min_rss_ctxs = vfs_supported; + + if (bp->flags & BNXT_FLAG_AGG_RINGS) { + if (hw_resc->max_rx_rings - bp->rx_nr_rings * 2 >= + min_rx_rings) + rx_ok = 1; + } else { + if (hw_resc->max_rx_rings - bp->rx_nr_rings >= + min_rx_rings) + rx_ok = 1; + } + if (hw_resc->max_vnics - bp->nr_vnics < min_rx_rings || + avail_cp < min_rx_rings) + rx_ok = 0; + + if (hw_resc->max_tx_rings - bp->tx_nr_rings >= min_tx_rings && + avail_cp >= min_tx_rings) + tx_ok = 1; + + if (hw_resc->max_rsscos_ctxs - bp->rsscos_nr_ctxs >= + min_rss_ctxs) + rss_ok = 1; + + if (tx_ok && rx_ok && rss_ok) + break; + + vfs_supported--; + } + + if (!vfs_supported) { + netdev_err(bp->dev, "Cannot enable VF's as all resources are used by PF\n"); + return -EINVAL; + } + + if (vfs_supported != *num_vfs) { + netdev_info(bp->dev, "Requested VFs %d, can enable %d\n", + *num_vfs, vfs_supported); + *num_vfs = vfs_supported; + } + + rc = bnxt_alloc_vf_resources(bp, *num_vfs); + if (rc) + goto err_out1; + + /* Reserve resources for VFs */ + rc = bnxt_func_cfg(bp, *num_vfs); + if (rc != *num_vfs) { + if (rc <= 0) { + netdev_warn(bp->dev, "Unable to reserve resources for SRIOV.\n"); + *num_vfs = 0; + goto err_out2; + } + netdev_warn(bp->dev, "Only able to reserve resources for %d VFs.\n", rc); + *num_vfs = rc; + } + + /* Register buffers for VFs */ + rc = bnxt_hwrm_func_buf_rgtr(bp); + if (rc) + goto err_out2; + + bnxt_ulp_sriov_cfg(bp, *num_vfs); + + rc = pci_enable_sriov(bp->pdev, *num_vfs); + if (rc) + goto err_out2; + + return 0; + +err_out2: + /* Free the resources reserved for various VF's */ + bnxt_hwrm_func_vf_resource_free(bp, *num_vfs); + +err_out1: + bnxt_free_vf_resources(bp); + + return rc; +} + +void bnxt_sriov_disable(struct bnxt *bp) +{ + u16 num_vfs = pci_num_vf(bp->pdev); + + if (!num_vfs) + return; + + /* synchronize VF and VF-rep create and destroy */ + mutex_lock(&bp->sriov_lock); + bnxt_vf_reps_destroy(bp); + + if (pci_vfs_assigned(bp->pdev)) { + bnxt_hwrm_fwd_async_event_cmpl( + bp, NULL, ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD); + netdev_warn(bp->dev, "Unable to free %d VFs because some are assigned to VMs.\n", + num_vfs); + } else { + pci_disable_sriov(bp->pdev); + /* Free the HW resources reserved for various VF's */ + bnxt_hwrm_func_vf_resource_free(bp, num_vfs); + } + mutex_unlock(&bp->sriov_lock); + + 
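+ /* Release the per-VF state allocated in bnxt_alloc_vf_resources():
+ * the vf array, the VF async event bitmap and the forwarded-HWRM
+ * request pages.
+ */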
bnxt_free_vf_resources(bp); + + bp->pf.active_vfs = 0; + /* Reclaim all resources for the PF. */ + rtnl_lock(); + bnxt_restore_pf_fw_resources(bp); + rtnl_unlock(); + + bnxt_ulp_sriov_cfg(bp, 0); +} + +int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct net_device *dev = pci_get_drvdata(pdev); + struct bnxt *bp = netdev_priv(dev); + + if (!(bp->flags & BNXT_FLAG_USING_MSIX)) { + netdev_warn(dev, "Not allow SRIOV if the irq mode is not MSIX\n"); + return 0; + } + + rtnl_lock(); + if (!netif_running(dev)) { + netdev_warn(dev, "Reject SRIOV config request since if is down!\n"); + rtnl_unlock(); + return 0; + } + bp->sriov_cfg = true; + rtnl_unlock(); + + if (pci_vfs_assigned(bp->pdev)) { + netdev_warn(dev, "Unable to configure SRIOV since some VFs are assigned to VMs.\n"); + num_vfs = 0; + goto sriov_cfg_exit; + } + + /* Check if enabled VFs is same as requested */ + if (num_vfs && num_vfs == bp->pf.active_vfs) + goto sriov_cfg_exit; + + /* if there are previous existing VFs, clean them up */ + bnxt_sriov_disable(bp); + if (!num_vfs) + goto sriov_cfg_exit; + + bnxt_sriov_enable(bp, &num_vfs); + +sriov_cfg_exit: + bp->sriov_cfg = false; + wake_up(&bp->sriov_cfg_wait); + + return num_vfs; +} + +static int bnxt_hwrm_fwd_resp(struct bnxt *bp, struct bnxt_vf_info *vf, + void *encap_resp, __le64 encap_resp_addr, + __le16 encap_resp_cpr, u32 msg_size) +{ + int rc = 0; + struct hwrm_fwd_resp_input req = {0}; + struct hwrm_fwd_resp_output *resp = bp->hwrm_cmd_resp_addr; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FWD_RESP, -1, -1); + + /* Set the new target id */ + req.target_id = cpu_to_le16(vf->fw_fid); + req.encap_resp_target_id = cpu_to_le16(vf->fw_fid); + req.encap_resp_len = cpu_to_le16(msg_size); + req.encap_resp_addr = encap_resp_addr; + req.encap_resp_cmpl_ring = encap_resp_cpr; + memcpy(req.encap_resp, encap_resp, msg_size); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + + if (rc) { + netdev_err(bp->dev, "hwrm_fwd_resp failed. rc:%d\n", rc); + goto fwd_resp_exit; + } + + if (resp->error_code) { + netdev_err(bp->dev, "hwrm_fwd_resp error %d\n", + resp->error_code); + rc = -1; + } + +fwd_resp_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_fwd_err_resp(struct bnxt *bp, struct bnxt_vf_info *vf, + u32 msg_size) +{ + int rc = 0; + struct hwrm_reject_fwd_resp_input req = {0}; + struct hwrm_reject_fwd_resp_output *resp = bp->hwrm_cmd_resp_addr; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_REJECT_FWD_RESP, -1, -1); + /* Set the new target id */ + req.target_id = cpu_to_le16(vf->fw_fid); + req.encap_resp_target_id = cpu_to_le16(vf->fw_fid); + memcpy(req.encap_request, vf->hwrm_cmd_req_addr, msg_size); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + + if (rc) { + netdev_err(bp->dev, "hwrm_fwd_err_resp failed. 
rc:%d\n", rc); + goto fwd_err_resp_exit; + } + + if (resp->error_code) { + netdev_err(bp->dev, "hwrm_fwd_err_resp error %d\n", + resp->error_code); + rc = -1; + } + +fwd_err_resp_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_hwrm_exec_fwd_resp(struct bnxt *bp, struct bnxt_vf_info *vf, + u32 msg_size) +{ + int rc = 0; + struct hwrm_exec_fwd_resp_input req = {0}; + struct hwrm_exec_fwd_resp_output *resp = bp->hwrm_cmd_resp_addr; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_EXEC_FWD_RESP, -1, -1); + /* Set the new target id */ + req.target_id = cpu_to_le16(vf->fw_fid); + req.encap_resp_target_id = cpu_to_le16(vf->fw_fid); + memcpy(req.encap_request, vf->hwrm_cmd_req_addr, msg_size); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + + if (rc) { + netdev_err(bp->dev, "hwrm_exec_fw_resp failed. rc:%d\n", rc); + goto exec_fwd_resp_exit; + } + + if (resp->error_code) { + netdev_err(bp->dev, "hwrm_exec_fw_resp error %d\n", + resp->error_code); + rc = -1; + } + +exec_fwd_resp_exit: + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_vf_configure_mac(struct bnxt *bp, struct bnxt_vf_info *vf) +{ + u32 msg_size = sizeof(struct hwrm_func_vf_cfg_input); + struct hwrm_func_vf_cfg_input *req = + (struct hwrm_func_vf_cfg_input *)vf->hwrm_cmd_req_addr; + + /* Allow VF to set a valid MAC address, if trust is set to on or + * if the PF assigned MAC address is zero + */ + if (req->enables & cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_DFLT_MAC_ADDR)) { + if (is_valid_ether_addr(req->dflt_mac_addr) && + ((vf->flags & BNXT_VF_TRUST) || + (!is_valid_ether_addr(vf->mac_addr)))) { + ether_addr_copy(vf->vf_mac_addr, req->dflt_mac_addr); + return bnxt_hwrm_exec_fwd_resp(bp, vf, msg_size); + } + return bnxt_hwrm_fwd_err_resp(bp, vf, msg_size); + } + return bnxt_hwrm_exec_fwd_resp(bp, vf, msg_size); +} + +static int bnxt_vf_validate_set_mac(struct bnxt *bp, struct bnxt_vf_info *vf) +{ + u32 msg_size = sizeof(struct hwrm_cfa_l2_filter_alloc_input); + struct hwrm_cfa_l2_filter_alloc_input *req = + (struct hwrm_cfa_l2_filter_alloc_input *)vf->hwrm_cmd_req_addr; + bool mac_ok = false; + + if (!is_valid_ether_addr((const u8 *)req->l2_addr)) + return bnxt_hwrm_fwd_err_resp(bp, vf, msg_size); + + /* Allow VF to set a valid MAC address, if trust is set to on. + * Or VF MAC address must first match MAC address in PF's context. 
+ * Otherwise, it must match the VF MAC address if firmware spec >= + * 1.2.2 + */ + if (vf->flags & BNXT_VF_TRUST) { + mac_ok = true; + } else if (is_valid_ether_addr(vf->mac_addr)) { + if (ether_addr_equal((const u8 *)req->l2_addr, vf->mac_addr)) + mac_ok = true; + } else if (is_valid_ether_addr(vf->vf_mac_addr)) { + if (ether_addr_equal((const u8 *)req->l2_addr, vf->vf_mac_addr)) + mac_ok = true; + } else if (bp->hwrm_spec_code < 0x10202) { + mac_ok = true; + } else { + mac_ok = true; + } + if (mac_ok) + return bnxt_hwrm_exec_fwd_resp(bp, vf, msg_size); + return bnxt_hwrm_fwd_err_resp(bp, vf, msg_size); +} + +static int bnxt_vf_set_link(struct bnxt *bp, struct bnxt_vf_info *vf) +{ + int rc = 0; + + if (!(vf->flags & BNXT_VF_LINK_FORCED)) { + /* real link */ + rc = bnxt_hwrm_exec_fwd_resp( + bp, vf, sizeof(struct hwrm_port_phy_qcfg_input)); + } else { + struct hwrm_port_phy_qcfg_output phy_qcfg_resp; + struct hwrm_port_phy_qcfg_input *phy_qcfg_req; + + phy_qcfg_req = + (struct hwrm_port_phy_qcfg_input *)vf->hwrm_cmd_req_addr; + mutex_lock(&bp->hwrm_cmd_lock); + memcpy(&phy_qcfg_resp, &bp->link_info.phy_qcfg_resp, + sizeof(phy_qcfg_resp)); + mutex_unlock(&bp->hwrm_cmd_lock); + phy_qcfg_resp.resp_len = cpu_to_le16(sizeof(phy_qcfg_resp)); + phy_qcfg_resp.seq_id = phy_qcfg_req->seq_id; + phy_qcfg_resp.valid = 1; + + if (vf->flags & BNXT_VF_LINK_UP) { + /* if physical link is down, force link up on VF */ + if (phy_qcfg_resp.link != + PORT_PHY_QCFG_RESP_LINK_LINK) { + phy_qcfg_resp.link = + PORT_PHY_QCFG_RESP_LINK_LINK; + phy_qcfg_resp.link_speed = cpu_to_le16( + PORT_PHY_QCFG_RESP_LINK_SPEED_10GB); + phy_qcfg_resp.duplex_cfg = + PORT_PHY_QCFG_RESP_DUPLEX_CFG_FULL; + phy_qcfg_resp.duplex_state = + PORT_PHY_QCFG_RESP_DUPLEX_STATE_FULL; + phy_qcfg_resp.pause = + (PORT_PHY_QCFG_RESP_PAUSE_TX | + PORT_PHY_QCFG_RESP_PAUSE_RX); + } + } else { + /* force link down */ + phy_qcfg_resp.link = PORT_PHY_QCFG_RESP_LINK_NO_LINK; + phy_qcfg_resp.link_speed = 0; + phy_qcfg_resp.duplex_state = + PORT_PHY_QCFG_RESP_DUPLEX_STATE_HALF; + phy_qcfg_resp.pause = 0; + } + rc = bnxt_hwrm_fwd_resp(bp, vf, &phy_qcfg_resp, + phy_qcfg_req->resp_addr, + phy_qcfg_req->cmpl_ring, + sizeof(phy_qcfg_resp)); + } + return rc; +} + +static int bnxt_vf_req_validate_snd(struct bnxt *bp, struct bnxt_vf_info *vf) +{ + int rc = 0; + struct input *encap_req = vf->hwrm_cmd_req_addr; + u32 req_type = le16_to_cpu(encap_req->req_type); + + switch (req_type) { + case HWRM_FUNC_VF_CFG: + rc = bnxt_vf_configure_mac(bp, vf); + break; + case HWRM_CFA_L2_FILTER_ALLOC: + rc = bnxt_vf_validate_set_mac(bp, vf); + break; + case HWRM_FUNC_CFG: + /* TODO Validate if VF is allowed to change mac address, + * mtu, num of rings etc + */ + rc = bnxt_hwrm_exec_fwd_resp( + bp, vf, sizeof(struct hwrm_func_cfg_input)); + break; + case HWRM_PORT_PHY_QCFG: + rc = bnxt_vf_set_link(bp, vf); + break; + default: + break; + } + return rc; +} + +void bnxt_hwrm_exec_fwd_req(struct bnxt *bp) +{ + u32 i = 0, active_vfs = bp->pf.active_vfs, vf_id; + + /* Scan through VF's and process commands */ + while (1) { + vf_id = find_next_bit(bp->pf.vf_event_bmap, active_vfs, i); + if (vf_id >= active_vfs) + break; + + clear_bit(vf_id, bp->pf.vf_event_bmap); + bnxt_vf_req_validate_snd(bp, &bp->pf.vf[vf_id]); + i = vf_id + 1; + } +} + +void bnxt_update_vf_mac(struct bnxt *bp) +{ + struct hwrm_func_qcaps_input req = {0}; + struct hwrm_func_qcaps_output *resp = bp->hwrm_cmd_resp_addr; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCAPS, -1, -1); + req.fid = cpu_to_le16(0xffff); + + 
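+ /* hwrm_cmd_lock is held across the request and the copies below
+ * because resp points at the shared HWRM response buffer.
+ */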
mutex_lock(&bp->hwrm_cmd_lock); + if (_hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT)) + goto update_vf_mac_exit; + + /* Store MAC address from the firmware. There are 2 cases: + * 1. MAC address is valid. It is assigned from the PF and we + * need to override the current VF MAC address with it. + * 2. MAC address is zero. The VF will use a random MAC address by + * default but the stored zero MAC will allow the VF user to change + * the random MAC address using ndo_set_mac_address() if he wants. + */ + if (!ether_addr_equal(resp->mac_address, bp->vf.mac_addr)) + memcpy(bp->vf.mac_addr, resp->mac_address, ETH_ALEN); + + /* overwrite netdev dev_addr with admin VF MAC */ + if (is_valid_ether_addr(bp->vf.mac_addr)) + memcpy(bp->dev->dev_addr, bp->vf.mac_addr, ETH_ALEN); +update_vf_mac_exit: + mutex_unlock(&bp->hwrm_cmd_lock); +} + +int bnxt_approve_mac(struct bnxt *bp, u8 *mac) +{ + struct hwrm_func_vf_cfg_input req = {0}; + int rc = 0; + + if (!BNXT_VF(bp)) + return 0; + + if (bp->hwrm_spec_code < 0x10202) { + if (is_valid_ether_addr(bp->vf.mac_addr)) + rc = -EADDRNOTAVAIL; + goto mac_done; + } + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_CFG, -1, -1); + req.enables = cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_DFLT_MAC_ADDR); + memcpy(req.dflt_mac_addr, mac, ETH_ALEN); + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); +mac_done: + if (rc) { + rc = -EADDRNOTAVAIL; + netdev_warn(bp->dev, "VF MAC address %pM not approved by the PF\n", + mac); + } + return rc; +} +#else + +void bnxt_sriov_disable(struct bnxt *bp) +{ +} + +void bnxt_hwrm_exec_fwd_req(struct bnxt *bp) +{ + netdev_err(bp->dev, "Invalid VF message received when SRIOV is not enable\n"); +} + +void bnxt_update_vf_mac(struct bnxt *bp) +{ +} + +int bnxt_approve_mac(struct bnxt *bp, u8 *mac) +{ + return 0; +} +#endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h new file mode 100644 index 0000000..d10f6f6 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h @@ -0,0 +1,26 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2014-2016 Broadcom Corporation + * Copyright (c) 2016-2018 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#ifndef BNXT_SRIOV_H +#define BNXT_SRIOV_H + +int bnxt_get_vf_config(struct net_device *, int, struct ifla_vf_info *); +int bnxt_set_vf_mac(struct net_device *, int, u8 *); +int bnxt_set_vf_vlan(struct net_device *, int, u16, u8, __be16); +int bnxt_set_vf_bw(struct net_device *, int, int, int); +int bnxt_set_vf_link_state(struct net_device *, int, int); +int bnxt_set_vf_spoofchk(struct net_device *, int, bool); +int bnxt_set_vf_trust(struct net_device *dev, int vf_id, bool trust); +int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs); +void bnxt_sriov_disable(struct bnxt *); +void bnxt_hwrm_exec_fwd_req(struct bnxt *); +void bnxt_update_vf_mac(struct bnxt *); +int bnxt_approve_mac(struct bnxt *, u8 *); +#endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c new file mode 100644 index 0000000..795f450 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -0,0 +1,1677 @@ +/* Broadcom NetXtreme-C/E network driver. 
+ * + * Copyright (c) 2017 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bnxt_hsi.h" +#include "bnxt.h" +#include "bnxt_sriov.h" +#include "bnxt_tc.h" +#include "bnxt_vfr.h" + +#define BNXT_FID_INVALID 0xffff +#define VLAN_TCI(vid, prio) ((vid) | ((prio) << VLAN_PRIO_SHIFT)) + +/* Return the dst fid of the func for flow forwarding + * For PFs: src_fid is the fid of the PF + * For VF-reps: src_fid the fid of the VF + */ +static u16 bnxt_flow_get_dst_fid(struct bnxt *pf_bp, struct net_device *dev) +{ + struct bnxt *bp; + + /* check if dev belongs to the same switch */ + if (!switchdev_port_same_parent_id(pf_bp->dev, dev)) { + netdev_info(pf_bp->dev, "dev(ifindex=%d) not on same switch", + dev->ifindex); + return BNXT_FID_INVALID; + } + + /* Is dev a VF-rep? */ + if (bnxt_dev_is_vf_rep(dev)) + return bnxt_vf_rep_get_fid(dev); + + bp = netdev_priv(dev); + return bp->pf.fw_fid; +} + +static int bnxt_tc_parse_redir(struct bnxt *bp, + struct bnxt_tc_actions *actions, + const struct tc_action *tc_act) +{ + struct net_device *dev = tcf_mirred_dev(tc_act); + + if (!dev) { + netdev_info(bp->dev, "no dev in mirred action"); + return -EINVAL; + } + + actions->flags |= BNXT_TC_ACTION_FLAG_FWD; + actions->dst_dev = dev; + return 0; +} + +static void bnxt_tc_parse_vlan(struct bnxt *bp, + struct bnxt_tc_actions *actions, + const struct tc_action *tc_act) +{ + if (tcf_vlan_action(tc_act) == TCA_VLAN_ACT_POP) { + actions->flags |= BNXT_TC_ACTION_FLAG_POP_VLAN; + } else if (tcf_vlan_action(tc_act) == TCA_VLAN_ACT_PUSH) { + actions->flags |= BNXT_TC_ACTION_FLAG_PUSH_VLAN; + actions->push_vlan_tci = htons(tcf_vlan_push_vid(tc_act)); + actions->push_vlan_tpid = tcf_vlan_push_proto(tc_act); + } +} + +static int bnxt_tc_parse_tunnel_set(struct bnxt *bp, + struct bnxt_tc_actions *actions, + const struct tc_action *tc_act) +{ + struct ip_tunnel_info *tun_info = tcf_tunnel_info(tc_act); + struct ip_tunnel_key *tun_key = &tun_info->key; + + if (ip_tunnel_info_af(tun_info) != AF_INET) { + netdev_info(bp->dev, "only IPv4 tunnel-encap is supported"); + return -EOPNOTSUPP; + } + + actions->tun_encap_key = *tun_key; + actions->flags |= BNXT_TC_ACTION_FLAG_TUNNEL_ENCAP; + return 0; +} + +static int bnxt_tc_parse_actions(struct bnxt *bp, + struct bnxt_tc_actions *actions, + struct tcf_exts *tc_exts) +{ + const struct tc_action *tc_act; + LIST_HEAD(tc_actions); + int rc; + + if (!tcf_exts_has_actions(tc_exts)) { + netdev_info(bp->dev, "no actions"); + return -EINVAL; + } + + tcf_exts_to_list(tc_exts, &tc_actions); + list_for_each_entry(tc_act, &tc_actions, list) { + /* Drop action */ + if (is_tcf_gact_shot(tc_act)) { + actions->flags |= BNXT_TC_ACTION_FLAG_DROP; + return 0; /* don't bother with other actions */ + } + + /* Redirect action */ + if (is_tcf_mirred_egress_redirect(tc_act)) { + rc = bnxt_tc_parse_redir(bp, actions, tc_act); + if (rc) + return rc; + continue; + } + + /* Push/pop VLAN */ + if (is_tcf_vlan(tc_act)) { + bnxt_tc_parse_vlan(bp, actions, tc_act); + continue; + } + + /* Tunnel encap */ + if (is_tcf_tunnel_set(tc_act)) { + rc = bnxt_tc_parse_tunnel_set(bp, actions, tc_act); + if (rc) + return rc; + continue; + } + + /* Tunnel decap */ + if (is_tcf_tunnel_release(tc_act)) { + actions->flags |= BNXT_TC_ACTION_FLAG_TUNNEL_DECAP; + continue; 
+ } + } + + if (actions->flags & BNXT_TC_ACTION_FLAG_FWD) { + if (actions->flags & BNXT_TC_ACTION_FLAG_TUNNEL_ENCAP) { + /* dst_fid is PF's fid */ + actions->dst_fid = bp->pf.fw_fid; + } else { + /* find the FID from dst_dev */ + actions->dst_fid = + bnxt_flow_get_dst_fid(bp, actions->dst_dev); + if (actions->dst_fid == BNXT_FID_INVALID) + return -EINVAL; + } + } + + return 0; +} + +#define GET_KEY(flow_cmd, key_type) \ + skb_flow_dissector_target((flow_cmd)->dissector, key_type,\ + (flow_cmd)->key) +#define GET_MASK(flow_cmd, key_type) \ + skb_flow_dissector_target((flow_cmd)->dissector, key_type,\ + (flow_cmd)->mask) + +static int bnxt_tc_parse_flow(struct bnxt *bp, + struct tc_cls_flower_offload *tc_flow_cmd, + struct bnxt_tc_flow *flow) +{ + struct flow_dissector *dissector = tc_flow_cmd->dissector; + u16 addr_type = 0; + + /* KEY_CONTROL and KEY_BASIC are needed for forming a meaningful key */ + if ((dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) == 0 || + (dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC)) == 0) { + netdev_info(bp->dev, "cannot form TC key: used_keys = 0x%x", + dissector->used_keys); + return -EOPNOTSUPP; + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_CONTROL)) { + struct flow_dissector_key_control *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_CONTROL); + + addr_type = key->addr_type; + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_dissector_key_basic *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_BASIC); + struct flow_dissector_key_basic *mask = + GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_BASIC); + + flow->l2_key.ether_type = key->n_proto; + flow->l2_mask.ether_type = mask->n_proto; + + if (key->n_proto == htons(ETH_P_IP) || + key->n_proto == htons(ETH_P_IPV6)) { + flow->l4_key.ip_proto = key->ip_proto; + flow->l4_mask.ip_proto = mask->ip_proto; + } + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct flow_dissector_key_eth_addrs *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_ETH_ADDRS); + struct flow_dissector_key_eth_addrs *mask = + GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_ETH_ADDRS); + + flow->flags |= BNXT_TC_FLOW_FLAGS_ETH_ADDRS; + ether_addr_copy(flow->l2_key.dmac, key->dst); + ether_addr_copy(flow->l2_mask.dmac, mask->dst); + ether_addr_copy(flow->l2_key.smac, key->src); + ether_addr_copy(flow->l2_mask.smac, mask->src); + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_VLAN)) { + struct flow_dissector_key_vlan *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_VLAN); + struct flow_dissector_key_vlan *mask = + GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_VLAN); + + flow->l2_key.inner_vlan_tci = + cpu_to_be16(VLAN_TCI(key->vlan_id, key->vlan_priority)); + flow->l2_mask.inner_vlan_tci = + cpu_to_be16((VLAN_TCI(mask->vlan_id, mask->vlan_priority))); + flow->l2_key.inner_vlan_tpid = htons(ETH_P_8021Q); + flow->l2_mask.inner_vlan_tpid = htons(0xffff); + flow->l2_key.num_vlans = 1; + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + struct flow_dissector_key_ipv4_addrs *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_IPV4_ADDRS); + struct flow_dissector_key_ipv4_addrs *mask = + GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_IPV4_ADDRS); + + flow->flags |= BNXT_TC_FLOW_FLAGS_IPV4_ADDRS; + flow->l3_key.ipv4.daddr.s_addr = key->dst; + flow->l3_mask.ipv4.daddr.s_addr = mask->dst; + flow->l3_key.ipv4.saddr.s_addr = key->src; + flow->l3_mask.ipv4.saddr.s_addr = mask->src; + } else if (dissector_uses_key(dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { 
+ struct flow_dissector_key_ipv6_addrs *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_IPV6_ADDRS); + struct flow_dissector_key_ipv6_addrs *mask = + GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_IPV6_ADDRS); + + flow->flags |= BNXT_TC_FLOW_FLAGS_IPV6_ADDRS; + flow->l3_key.ipv6.daddr = key->dst; + flow->l3_mask.ipv6.daddr = mask->dst; + flow->l3_key.ipv6.saddr = key->src; + flow->l3_mask.ipv6.saddr = mask->src; + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_PORTS)) { + struct flow_dissector_key_ports *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_PORTS); + struct flow_dissector_key_ports *mask = + GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_PORTS); + + flow->flags |= BNXT_TC_FLOW_FLAGS_PORTS; + flow->l4_key.ports.dport = key->dst; + flow->l4_mask.ports.dport = mask->dst; + flow->l4_key.ports.sport = key->src; + flow->l4_mask.ports.sport = mask->src; + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_ICMP)) { + struct flow_dissector_key_icmp *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_ICMP); + struct flow_dissector_key_icmp *mask = + GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_ICMP); + + flow->flags |= BNXT_TC_FLOW_FLAGS_ICMP; + flow->l4_key.icmp.type = key->type; + flow->l4_key.icmp.code = key->code; + flow->l4_mask.icmp.type = mask->type; + flow->l4_mask.icmp.code = mask->code; + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) { + struct flow_dissector_key_control *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_ENC_CONTROL); + + addr_type = key->addr_type; + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + struct flow_dissector_key_ipv4_addrs *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); + struct flow_dissector_key_ipv4_addrs *mask = + GET_MASK(tc_flow_cmd, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); + + flow->flags |= BNXT_TC_FLOW_FLAGS_TUNL_IPV4_ADDRS; + flow->tun_key.u.ipv4.dst = key->dst; + flow->tun_mask.u.ipv4.dst = mask->dst; + flow->tun_key.u.ipv4.src = key->src; + flow->tun_mask.u.ipv4.src = mask->src; + } else if (dissector_uses_key(dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { + return -EOPNOTSUPP; + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_dissector_key_keyid *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_ENC_KEYID); + struct flow_dissector_key_keyid *mask = + GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_ENC_KEYID); + + flow->flags |= BNXT_TC_FLOW_FLAGS_TUNL_ID; + flow->tun_key.tun_id = key32_to_tunnel_id(key->keyid); + flow->tun_mask.tun_id = key32_to_tunnel_id(mask->keyid); + } + + if (dissector_uses_key(dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_dissector_key_ports *key = + GET_KEY(tc_flow_cmd, FLOW_DISSECTOR_KEY_ENC_PORTS); + struct flow_dissector_key_ports *mask = + GET_MASK(tc_flow_cmd, FLOW_DISSECTOR_KEY_ENC_PORTS); + + flow->flags |= BNXT_TC_FLOW_FLAGS_TUNL_PORTS; + flow->tun_key.tp_dst = key->dst; + flow->tun_mask.tp_dst = mask->dst; + flow->tun_key.tp_src = key->src; + flow->tun_mask.tp_src = mask->src; + } + + return bnxt_tc_parse_actions(bp, &flow->actions, tc_flow_cmd->exts); +} + +static int bnxt_hwrm_cfa_flow_free(struct bnxt *bp, __le16 flow_handle) +{ + struct hwrm_cfa_flow_free_input req = { 0 }; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_FLOW_FREE, -1, -1); + req.flow_handle = flow_handle; + + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + netdev_info(bp->dev, "Error: %s: flow_handle=0x%x rc=%d", + __func__, flow_handle, rc); + + if (rc) + rc = -EIO; + return rc; 
+} + +static int ipv6_mask_len(struct in6_addr *mask) +{ + int mask_len = 0, i; + + for (i = 0; i < 4; i++) + mask_len += inet_mask_len(mask->s6_addr32[i]); + + return mask_len; +} + +static bool is_wildcard(void *mask, int len) +{ + const u8 *p = mask; + int i; + + for (i = 0; i < len; i++) { + if (p[i] != 0) + return false; + } + return true; +} + +static bool is_exactmatch(void *mask, int len) +{ + const u8 *p = mask; + int i; + + for (i = 0; i < len; i++) + if (p[i] != 0xff) + return false; + + return true; +} + +static bool bits_set(void *key, int len) +{ + const u8 *p = key; + int i; + + for (i = 0; i < len; i++) + if (p[i] != 0) + return true; + + return false; +} + +static int bnxt_hwrm_cfa_flow_alloc(struct bnxt *bp, struct bnxt_tc_flow *flow, + __le16 ref_flow_handle, + __le32 tunnel_handle, __le16 *flow_handle) +{ + struct hwrm_cfa_flow_alloc_output *resp = bp->hwrm_cmd_resp_addr; + struct bnxt_tc_actions *actions = &flow->actions; + struct bnxt_tc_l3_key *l3_mask = &flow->l3_mask; + struct bnxt_tc_l3_key *l3_key = &flow->l3_key; + struct hwrm_cfa_flow_alloc_input req = { 0 }; + u16 flow_flags = 0, action_flags = 0; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_FLOW_ALLOC, -1, -1); + + req.src_fid = cpu_to_le16(flow->src_fid); + req.ref_flow_handle = ref_flow_handle; + + if (actions->flags & BNXT_TC_ACTION_FLAG_TUNNEL_DECAP || + actions->flags & BNXT_TC_ACTION_FLAG_TUNNEL_ENCAP) { + req.tunnel_handle = tunnel_handle; + flow_flags |= CFA_FLOW_ALLOC_REQ_FLAGS_TUNNEL; + action_flags |= CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_TUNNEL; + } + + req.ethertype = flow->l2_key.ether_type; + req.ip_proto = flow->l4_key.ip_proto; + + if (flow->flags & BNXT_TC_FLOW_FLAGS_ETH_ADDRS) { + memcpy(req.dmac, flow->l2_key.dmac, ETH_ALEN); + memcpy(req.smac, flow->l2_key.smac, ETH_ALEN); + } + + if (flow->l2_key.num_vlans > 0) { + flow_flags |= CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_ONE; + /* FW expects the inner_vlan_tci value to be set + * in outer_vlan_tci when num_vlans is 1 (which is + * always the case in TC.) + */ + req.outer_vlan_tci = flow->l2_key.inner_vlan_tci; + } + + /* If all IP and L4 fields are wildcarded then this is an L2 flow */ + if (is_wildcard(l3_mask, sizeof(*l3_mask)) && + is_wildcard(&flow->l4_mask, sizeof(flow->l4_mask))) { + flow_flags |= CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_L2; + } else { + flow_flags |= flow->l2_key.ether_type == htons(ETH_P_IP) ? 
+ CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV4 : + CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV6; + + if (flow->flags & BNXT_TC_FLOW_FLAGS_IPV4_ADDRS) { + req.ip_dst[0] = l3_key->ipv4.daddr.s_addr; + req.ip_dst_mask_len = + inet_mask_len(l3_mask->ipv4.daddr.s_addr); + req.ip_src[0] = l3_key->ipv4.saddr.s_addr; + req.ip_src_mask_len = + inet_mask_len(l3_mask->ipv4.saddr.s_addr); + } else if (flow->flags & BNXT_TC_FLOW_FLAGS_IPV6_ADDRS) { + memcpy(req.ip_dst, l3_key->ipv6.daddr.s6_addr32, + sizeof(req.ip_dst)); + req.ip_dst_mask_len = + ipv6_mask_len(&l3_mask->ipv6.daddr); + memcpy(req.ip_src, l3_key->ipv6.saddr.s6_addr32, + sizeof(req.ip_src)); + req.ip_src_mask_len = + ipv6_mask_len(&l3_mask->ipv6.saddr); + } + } + + if (flow->flags & BNXT_TC_FLOW_FLAGS_PORTS) { + req.l4_src_port = flow->l4_key.ports.sport; + req.l4_src_port_mask = flow->l4_mask.ports.sport; + req.l4_dst_port = flow->l4_key.ports.dport; + req.l4_dst_port_mask = flow->l4_mask.ports.dport; + } else if (flow->flags & BNXT_TC_FLOW_FLAGS_ICMP) { + /* l4 ports serve as type/code when ip_proto is ICMP */ + req.l4_src_port = htons(flow->l4_key.icmp.type); + req.l4_src_port_mask = htons(flow->l4_mask.icmp.type); + req.l4_dst_port = htons(flow->l4_key.icmp.code); + req.l4_dst_port_mask = htons(flow->l4_mask.icmp.code); + } + req.flags = cpu_to_le16(flow_flags); + + if (actions->flags & BNXT_TC_ACTION_FLAG_DROP) { + action_flags |= CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_DROP; + } else { + if (actions->flags & BNXT_TC_ACTION_FLAG_FWD) { + action_flags |= CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_FWD; + req.dst_fid = cpu_to_le16(actions->dst_fid); + } + if (actions->flags & BNXT_TC_ACTION_FLAG_PUSH_VLAN) { + action_flags |= + CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_L2_HEADER_REWRITE; + req.l2_rewrite_vlan_tpid = actions->push_vlan_tpid; + req.l2_rewrite_vlan_tci = actions->push_vlan_tci; + memcpy(&req.l2_rewrite_dmac, &req.dmac, ETH_ALEN); + memcpy(&req.l2_rewrite_smac, &req.smac, ETH_ALEN); + } + if (actions->flags & BNXT_TC_ACTION_FLAG_POP_VLAN) { + action_flags |= + CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_L2_HEADER_REWRITE; + /* Rewrite config with tpid = 0 implies vlan pop */ + req.l2_rewrite_vlan_tpid = 0; + memcpy(&req.l2_rewrite_dmac, &req.dmac, ETH_ALEN); + memcpy(&req.l2_rewrite_smac, &req.smac, ETH_ALEN); + } + } + req.action_flags = cpu_to_le16(action_flags); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) + *flow_handle = resp->flow_handle; + mutex_unlock(&bp->hwrm_cmd_lock); + + if (rc == HWRM_ERR_CODE_RESOURCE_ALLOC_ERROR) + rc = -ENOSPC; + else if (rc) + rc = -EIO; + return rc; +} + +static int hwrm_cfa_decap_filter_alloc(struct bnxt *bp, + struct bnxt_tc_flow *flow, + struct bnxt_tc_l2_key *l2_info, + __le32 ref_decap_handle, + __le32 *decap_filter_handle) +{ + struct hwrm_cfa_decap_filter_alloc_output *resp = + bp->hwrm_cmd_resp_addr; + struct hwrm_cfa_decap_filter_alloc_input req = { 0 }; + struct ip_tunnel_key *tun_key = &flow->tun_key; + u32 enables = 0; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_DECAP_FILTER_ALLOC, -1, -1); + + req.flags = cpu_to_le32(CFA_DECAP_FILTER_ALLOC_REQ_FLAGS_OVS_TUNNEL); + enables |= CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE | + CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IP_PROTOCOL; + req.tunnel_type = CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN; + req.ip_protocol = CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP; + + if (flow->flags & BNXT_TC_FLOW_FLAGS_TUNL_ID) { + enables |= CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_TUNNEL_ID; + /* tunnel_id is wrongly defined in 
hsi defn. as __le32 */ + req.tunnel_id = tunnel_id_to_key32(tun_key->tun_id); + } + + if (flow->flags & BNXT_TC_FLOW_FLAGS_TUNL_ETH_ADDRS) { + enables |= CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_MACADDR; + ether_addr_copy(req.dst_macaddr, l2_info->dmac); + } + if (l2_info->num_vlans) { + enables |= CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_T_IVLAN_VID; + req.t_ivlan_vid = l2_info->inner_vlan_tci; + } + + enables |= CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_ETHERTYPE; + req.ethertype = htons(ETH_P_IP); + + if (flow->flags & BNXT_TC_FLOW_FLAGS_TUNL_IPV4_ADDRS) { + enables |= CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR | + CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR | + CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IPADDR_TYPE; + req.ip_addr_type = CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4; + req.dst_ipaddr[0] = tun_key->u.ipv4.dst; + req.src_ipaddr[0] = tun_key->u.ipv4.src; + } + + if (flow->flags & BNXT_TC_FLOW_FLAGS_TUNL_PORTS) { + enables |= CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_PORT; + req.dst_port = tun_key->tp_dst; + } + + /* Eventhough the decap_handle returned by hwrm_cfa_decap_filter_alloc + * is defined as __le32, l2_ctxt_ref_id is defined in HSI as __le16. + */ + req.l2_ctxt_ref_id = (__force __le16)ref_decap_handle; + req.enables = cpu_to_le32(enables); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) + *decap_filter_handle = resp->decap_filter_id; + else + netdev_info(bp->dev, "%s: Error rc=%d", __func__, rc); + mutex_unlock(&bp->hwrm_cmd_lock); + + if (rc) + rc = -EIO; + return rc; +} + +static int hwrm_cfa_decap_filter_free(struct bnxt *bp, + __le32 decap_filter_handle) +{ + struct hwrm_cfa_decap_filter_free_input req = { 0 }; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_DECAP_FILTER_FREE, -1, -1); + req.decap_filter_id = decap_filter_handle; + + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + netdev_info(bp->dev, "%s: Error rc=%d", __func__, rc); + + if (rc) + rc = -EIO; + return rc; +} + +static int hwrm_cfa_encap_record_alloc(struct bnxt *bp, + struct ip_tunnel_key *encap_key, + struct bnxt_tc_l2_key *l2_info, + __le32 *encap_record_handle) +{ + struct hwrm_cfa_encap_record_alloc_output *resp = + bp->hwrm_cmd_resp_addr; + struct hwrm_cfa_encap_record_alloc_input req = { 0 }; + struct hwrm_cfa_encap_data_vxlan *encap = + (struct hwrm_cfa_encap_data_vxlan *)&req.encap_data; + struct hwrm_vxlan_ipv4_hdr *encap_ipv4 = + (struct hwrm_vxlan_ipv4_hdr *)encap->l3; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_ENCAP_RECORD_ALLOC, -1, -1); + + req.encap_type = CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN; + + ether_addr_copy(encap->dst_mac_addr, l2_info->dmac); + ether_addr_copy(encap->src_mac_addr, l2_info->smac); + if (l2_info->num_vlans) { + encap->num_vlan_tags = l2_info->num_vlans; + encap->ovlan_tci = l2_info->inner_vlan_tci; + encap->ovlan_tpid = l2_info->inner_vlan_tpid; + } + + encap_ipv4->ver_hlen = 4 << VXLAN_IPV4_HDR_VER_HLEN_VERSION_SFT; + encap_ipv4->ver_hlen |= 5 << VXLAN_IPV4_HDR_VER_HLEN_HEADER_LENGTH_SFT; + encap_ipv4->ttl = encap_key->ttl; + + encap_ipv4->dest_ip_addr = encap_key->u.ipv4.dst; + encap_ipv4->src_ip_addr = encap_key->u.ipv4.src; + encap_ipv4->protocol = IPPROTO_UDP; + + encap->dst_port = encap_key->tp_dst; + encap->vni = tunnel_id_to_key32(encap_key->tun_id); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) + *encap_record_handle = resp->encap_record_id; + else + netdev_info(bp->dev, "%s: 
Error rc=%d", __func__, rc); + mutex_unlock(&bp->hwrm_cmd_lock); + + if (rc) + rc = -EIO; + return rc; +} + +static int hwrm_cfa_encap_record_free(struct bnxt *bp, + __le32 encap_record_handle) +{ + struct hwrm_cfa_encap_record_free_input req = { 0 }; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_ENCAP_RECORD_FREE, -1, -1); + req.encap_record_id = encap_record_handle; + + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + netdev_info(bp->dev, "%s: Error rc=%d", __func__, rc); + + if (rc) + rc = -EIO; + return rc; +} + +static int bnxt_tc_put_l2_node(struct bnxt *bp, + struct bnxt_tc_flow_node *flow_node) +{ + struct bnxt_tc_l2_node *l2_node = flow_node->l2_node; + struct bnxt_tc_info *tc_info = bp->tc_info; + int rc; + + /* remove flow_node from the L2 shared flow list */ + list_del(&flow_node->l2_list_node); + if (--l2_node->refcount == 0) { + rc = rhashtable_remove_fast(&tc_info->l2_table, &l2_node->node, + tc_info->l2_ht_params); + if (rc) + netdev_err(bp->dev, + "Error: %s: rhashtable_remove_fast: %d", + __func__, rc); + kfree_rcu(l2_node, rcu); + } + return 0; +} + +static struct bnxt_tc_l2_node * +bnxt_tc_get_l2_node(struct bnxt *bp, struct rhashtable *l2_table, + struct rhashtable_params ht_params, + struct bnxt_tc_l2_key *l2_key) +{ + struct bnxt_tc_l2_node *l2_node; + int rc; + + l2_node = rhashtable_lookup_fast(l2_table, l2_key, ht_params); + if (!l2_node) { + l2_node = kzalloc(sizeof(*l2_node), GFP_KERNEL); + if (!l2_node) { + rc = -ENOMEM; + return NULL; + } + + l2_node->key = *l2_key; + rc = rhashtable_insert_fast(l2_table, &l2_node->node, + ht_params); + if (rc) { + kfree_rcu(l2_node, rcu); + netdev_err(bp->dev, + "Error: %s: rhashtable_insert_fast: %d", + __func__, rc); + return NULL; + } + INIT_LIST_HEAD(&l2_node->common_l2_flows); + } + return l2_node; +} + +/* Get the ref_flow_handle for a flow by checking if there are any other + * flows that share the same L2 key as this flow. + */ +static int +bnxt_tc_get_ref_flow_handle(struct bnxt *bp, struct bnxt_tc_flow *flow, + struct bnxt_tc_flow_node *flow_node, + __le16 *ref_flow_handle) +{ + struct bnxt_tc_info *tc_info = bp->tc_info; + struct bnxt_tc_flow_node *ref_flow_node; + struct bnxt_tc_l2_node *l2_node; + + l2_node = bnxt_tc_get_l2_node(bp, &tc_info->l2_table, + tc_info->l2_ht_params, + &flow->l2_key); + if (!l2_node) + return -1; + + /* If any other flow is using this l2_node, use it's flow_handle + * as the ref_flow_handle + */ + if (l2_node->refcount > 0) { + ref_flow_node = list_first_entry(&l2_node->common_l2_flows, + struct bnxt_tc_flow_node, + l2_list_node); + *ref_flow_handle = ref_flow_node->flow_handle; + } else { + *ref_flow_handle = cpu_to_le16(0xffff); + } + + /* Insert the l2_node into the flow_node so that subsequent flows + * with a matching l2 key can use the flow_handle of this flow + * as their ref_flow_handle + */ + flow_node->l2_node = l2_node; + list_add(&flow_node->l2_list_node, &l2_node->common_l2_flows); + l2_node->refcount++; + return 0; +} + +/* After the flow parsing is done, this routine is used for checking + * if there are any aspects of the flow that prevent it from being + * offloaded. 
+ */ +static bool bnxt_tc_can_offload(struct bnxt *bp, struct bnxt_tc_flow *flow) +{ + /* If L4 ports are specified then ip_proto must be TCP or UDP */ + if ((flow->flags & BNXT_TC_FLOW_FLAGS_PORTS) && + (flow->l4_key.ip_proto != IPPROTO_TCP && + flow->l4_key.ip_proto != IPPROTO_UDP)) { + netdev_info(bp->dev, "Cannot offload non-TCP/UDP (%d) ports", + flow->l4_key.ip_proto); + return false; + } + + /* Currently source/dest MAC cannot be partial wildcard */ + if (bits_set(&flow->l2_key.smac, sizeof(flow->l2_key.smac)) && + !is_exactmatch(flow->l2_mask.smac, sizeof(flow->l2_mask.smac))) { + netdev_info(bp->dev, "Wildcard match unsupported for Source MAC\n"); + return false; + } + if (bits_set(&flow->l2_key.dmac, sizeof(flow->l2_key.dmac)) && + !is_exactmatch(&flow->l2_mask.dmac, sizeof(flow->l2_mask.dmac))) { + netdev_info(bp->dev, "Wildcard match unsupported for Dest MAC\n"); + return false; + } + + /* Currently VLAN fields cannot be partial wildcard */ + if (bits_set(&flow->l2_key.inner_vlan_tci, + sizeof(flow->l2_key.inner_vlan_tci)) && + !is_exactmatch(&flow->l2_mask.inner_vlan_tci, + sizeof(flow->l2_mask.inner_vlan_tci))) { + netdev_info(bp->dev, "Wildcard match unsupported for VLAN TCI\n"); + return false; + } + if (bits_set(&flow->l2_key.inner_vlan_tpid, + sizeof(flow->l2_key.inner_vlan_tpid)) && + !is_exactmatch(&flow->l2_mask.inner_vlan_tpid, + sizeof(flow->l2_mask.inner_vlan_tpid))) { + netdev_info(bp->dev, "Wildcard match unsupported for VLAN TPID\n"); + return false; + } + + /* Currently Ethertype must be set */ + if (!is_exactmatch(&flow->l2_mask.ether_type, + sizeof(flow->l2_mask.ether_type))) { + netdev_info(bp->dev, "Wildcard match unsupported for Ethertype\n"); + return false; + } + + return true; +} + +/* Returns the final refcount of the node on success + * or a -ve error code on failure + */ +static int bnxt_tc_put_tunnel_node(struct bnxt *bp, + struct rhashtable *tunnel_table, + struct rhashtable_params *ht_params, + struct bnxt_tc_tunnel_node *tunnel_node) +{ + int rc; + + if (--tunnel_node->refcount == 0) { + rc = rhashtable_remove_fast(tunnel_table, &tunnel_node->node, + *ht_params); + if (rc) { + netdev_err(bp->dev, "rhashtable_remove_fast rc=%d", rc); + rc = -1; + } + kfree_rcu(tunnel_node, rcu); + return rc; + } else { + return tunnel_node->refcount; + } +} + +/* Get (or add) either encap or decap tunnel node from/to the supplied + * hash table. 
+ */ +static struct bnxt_tc_tunnel_node * +bnxt_tc_get_tunnel_node(struct bnxt *bp, struct rhashtable *tunnel_table, + struct rhashtable_params *ht_params, + struct ip_tunnel_key *tun_key) +{ + struct bnxt_tc_tunnel_node *tunnel_node; + int rc; + + tunnel_node = rhashtable_lookup_fast(tunnel_table, tun_key, *ht_params); + if (!tunnel_node) { + tunnel_node = kzalloc(sizeof(*tunnel_node), GFP_KERNEL); + if (!tunnel_node) { + rc = -ENOMEM; + goto err; + } + + tunnel_node->key = *tun_key; + tunnel_node->tunnel_handle = INVALID_TUNNEL_HANDLE; + rc = rhashtable_insert_fast(tunnel_table, &tunnel_node->node, + *ht_params); + if (rc) { + kfree_rcu(tunnel_node, rcu); + goto err; + } + } + tunnel_node->refcount++; + return tunnel_node; +err: + netdev_info(bp->dev, "error rc=%d", rc); + return NULL; +} + +static int bnxt_tc_get_ref_decap_handle(struct bnxt *bp, + struct bnxt_tc_flow *flow, + struct bnxt_tc_l2_key *l2_key, + struct bnxt_tc_flow_node *flow_node, + __le32 *ref_decap_handle) +{ + struct bnxt_tc_info *tc_info = bp->tc_info; + struct bnxt_tc_flow_node *ref_flow_node; + struct bnxt_tc_l2_node *decap_l2_node; + + decap_l2_node = bnxt_tc_get_l2_node(bp, &tc_info->decap_l2_table, + tc_info->decap_l2_ht_params, + l2_key); + if (!decap_l2_node) + return -1; + + /* If any other flow is using this decap_l2_node, use it's decap_handle + * as the ref_decap_handle + */ + if (decap_l2_node->refcount > 0) { + ref_flow_node = + list_first_entry(&decap_l2_node->common_l2_flows, + struct bnxt_tc_flow_node, + decap_l2_list_node); + *ref_decap_handle = ref_flow_node->decap_node->tunnel_handle; + } else { + *ref_decap_handle = INVALID_TUNNEL_HANDLE; + } + + /* Insert the l2_node into the flow_node so that subsequent flows + * with a matching decap l2 key can use the decap_filter_handle of + * this flow as their ref_decap_handle + */ + flow_node->decap_l2_node = decap_l2_node; + list_add(&flow_node->decap_l2_list_node, + &decap_l2_node->common_l2_flows); + decap_l2_node->refcount++; + return 0; +} + +static void bnxt_tc_put_decap_l2_node(struct bnxt *bp, + struct bnxt_tc_flow_node *flow_node) +{ + struct bnxt_tc_l2_node *decap_l2_node = flow_node->decap_l2_node; + struct bnxt_tc_info *tc_info = bp->tc_info; + int rc; + + /* remove flow_node from the decap L2 sharing flow list */ + list_del(&flow_node->decap_l2_list_node); + if (--decap_l2_node->refcount == 0) { + rc = rhashtable_remove_fast(&tc_info->decap_l2_table, + &decap_l2_node->node, + tc_info->decap_l2_ht_params); + if (rc) + netdev_err(bp->dev, "rhashtable_remove_fast rc=%d", rc); + kfree_rcu(decap_l2_node, rcu); + } +} + +static void bnxt_tc_put_decap_handle(struct bnxt *bp, + struct bnxt_tc_flow_node *flow_node) +{ + __le32 decap_handle = flow_node->decap_node->tunnel_handle; + struct bnxt_tc_info *tc_info = bp->tc_info; + int rc; + + if (flow_node->decap_l2_node) + bnxt_tc_put_decap_l2_node(bp, flow_node); + + rc = bnxt_tc_put_tunnel_node(bp, &tc_info->decap_table, + &tc_info->decap_ht_params, + flow_node->decap_node); + if (!rc && decap_handle != INVALID_TUNNEL_HANDLE) + hwrm_cfa_decap_filter_free(bp, decap_handle); +} + +static int bnxt_tc_resolve_tunnel_hdrs(struct bnxt *bp, + struct ip_tunnel_key *tun_key, + struct bnxt_tc_l2_key *l2_info) +{ +#ifdef CONFIG_INET + struct net_device *real_dst_dev = bp->dev; + struct flowi4 flow = { {0} }; + struct net_device *dst_dev; + struct neighbour *nbr; + struct rtable *rt; + int rc; + + flow.flowi4_proto = IPPROTO_UDP; + flow.fl4_dport = tun_key->tp_dst; + flow.daddr = tun_key->u.ipv4.dst; + + rt = 
ip_route_output_key(dev_net(real_dst_dev), &flow); + if (IS_ERR(rt)) { + netdev_info(bp->dev, "no route to %pI4b", &flow.daddr); + return -EOPNOTSUPP; + } + + /* The route must either point to the real_dst_dev or a dst_dev that + * uses the real_dst_dev. + */ + dst_dev = rt->dst.dev; + if (is_vlan_dev(dst_dev)) { +#if IS_ENABLED(CONFIG_VLAN_8021Q) + struct vlan_dev_priv *vlan = vlan_dev_priv(dst_dev); + + if (vlan->real_dev != real_dst_dev) { + netdev_info(bp->dev, + "dst_dev(%s) doesn't use PF-if(%s)", + netdev_name(dst_dev), + netdev_name(real_dst_dev)); + rc = -EOPNOTSUPP; + goto put_rt; + } + l2_info->inner_vlan_tci = htons(vlan->vlan_id); + l2_info->inner_vlan_tpid = vlan->vlan_proto; + l2_info->num_vlans = 1; +#endif + } else if (dst_dev != real_dst_dev) { + netdev_info(bp->dev, + "dst_dev(%s) for %pI4b is not PF-if(%s)", + netdev_name(dst_dev), &flow.daddr, + netdev_name(real_dst_dev)); + rc = -EOPNOTSUPP; + goto put_rt; + } + + nbr = dst_neigh_lookup(&rt->dst, &flow.daddr); + if (!nbr) { + netdev_info(bp->dev, "can't lookup neighbor for %pI4b", + &flow.daddr); + rc = -EOPNOTSUPP; + goto put_rt; + } + + tun_key->u.ipv4.src = flow.saddr; + tun_key->ttl = ip4_dst_hoplimit(&rt->dst); + neigh_ha_snapshot(l2_info->dmac, nbr, dst_dev); + ether_addr_copy(l2_info->smac, dst_dev->dev_addr); + neigh_release(nbr); + ip_rt_put(rt); + + return 0; +put_rt: + ip_rt_put(rt); + return rc; +#else + return -EOPNOTSUPP; +#endif +} + +static int bnxt_tc_get_decap_handle(struct bnxt *bp, struct bnxt_tc_flow *flow, + struct bnxt_tc_flow_node *flow_node, + __le32 *decap_filter_handle) +{ + struct ip_tunnel_key *decap_key = &flow->tun_key; + struct bnxt_tc_info *tc_info = bp->tc_info; + struct bnxt_tc_l2_key l2_info = { {0} }; + struct bnxt_tc_tunnel_node *decap_node; + struct ip_tunnel_key tun_key = { 0 }; + struct bnxt_tc_l2_key *decap_l2_info; + __le32 ref_decap_handle; + int rc; + + /* Check if there's another flow using the same tunnel decap. + * If not, add this tunnel to the table and resolve the other + * tunnel header fileds. Ignore src_port in the tunnel_key, + * since it is not required for decap filters. + */ + decap_key->tp_src = 0; + decap_node = bnxt_tc_get_tunnel_node(bp, &tc_info->decap_table, + &tc_info->decap_ht_params, + decap_key); + if (!decap_node) + return -ENOMEM; + + flow_node->decap_node = decap_node; + + if (decap_node->tunnel_handle != INVALID_TUNNEL_HANDLE) + goto done; + + /* Resolve the L2 fields for tunnel decap + * Resolve the route for remote vtep (saddr) of the decap key + * Find it's next-hop mac addrs + */ + tun_key.u.ipv4.dst = flow->tun_key.u.ipv4.src; + tun_key.tp_dst = flow->tun_key.tp_dst; + rc = bnxt_tc_resolve_tunnel_hdrs(bp, &tun_key, &l2_info); + if (rc) + goto put_decap; + + decap_l2_info = &decap_node->l2_info; + /* decap smac is wildcarded */ + ether_addr_copy(decap_l2_info->dmac, l2_info.smac); + if (l2_info.num_vlans) { + decap_l2_info->num_vlans = l2_info.num_vlans; + decap_l2_info->inner_vlan_tpid = l2_info.inner_vlan_tpid; + decap_l2_info->inner_vlan_tci = l2_info.inner_vlan_tci; + } + flow->flags |= BNXT_TC_FLOW_FLAGS_TUNL_ETH_ADDRS; + + /* For getting a decap_filter_handle we first need to check if + * there are any other decap flows that share the same tunnel L2 + * key and if so, pass that flow's decap_filter_handle as the + * ref_decap_handle for this flow. 
+ */ + rc = bnxt_tc_get_ref_decap_handle(bp, flow, decap_l2_info, flow_node, + &ref_decap_handle); + if (rc) + goto put_decap; + + /* Issue the hwrm cmd to allocate a decap filter handle */ + rc = hwrm_cfa_decap_filter_alloc(bp, flow, decap_l2_info, + ref_decap_handle, + &decap_node->tunnel_handle); + if (rc) + goto put_decap_l2; + +done: + *decap_filter_handle = decap_node->tunnel_handle; + return 0; + +put_decap_l2: + bnxt_tc_put_decap_l2_node(bp, flow_node); +put_decap: + bnxt_tc_put_tunnel_node(bp, &tc_info->decap_table, + &tc_info->decap_ht_params, + flow_node->decap_node); + return rc; +} + +static void bnxt_tc_put_encap_handle(struct bnxt *bp, + struct bnxt_tc_tunnel_node *encap_node) +{ + __le32 encap_handle = encap_node->tunnel_handle; + struct bnxt_tc_info *tc_info = bp->tc_info; + int rc; + + rc = bnxt_tc_put_tunnel_node(bp, &tc_info->encap_table, + &tc_info->encap_ht_params, encap_node); + if (!rc && encap_handle != INVALID_TUNNEL_HANDLE) + hwrm_cfa_encap_record_free(bp, encap_handle); +} + +/* Lookup the tunnel encap table and check if there's an encap_handle + * alloc'd already. + * If not, query L2 info via a route lookup and issue an encap_record_alloc + * cmd to FW. + */ +static int bnxt_tc_get_encap_handle(struct bnxt *bp, struct bnxt_tc_flow *flow, + struct bnxt_tc_flow_node *flow_node, + __le32 *encap_handle) +{ + struct ip_tunnel_key *encap_key = &flow->actions.tun_encap_key; + struct bnxt_tc_info *tc_info = bp->tc_info; + struct bnxt_tc_tunnel_node *encap_node; + int rc; + + /* Check if there's another flow using the same tunnel encap. + * If not, add this tunnel to the table and resolve the other + * tunnel header fileds + */ + encap_node = bnxt_tc_get_tunnel_node(bp, &tc_info->encap_table, + &tc_info->encap_ht_params, + encap_key); + if (!encap_node) + return -ENOMEM; + + flow_node->encap_node = encap_node; + + if (encap_node->tunnel_handle != INVALID_TUNNEL_HANDLE) + goto done; + + rc = bnxt_tc_resolve_tunnel_hdrs(bp, encap_key, &encap_node->l2_info); + if (rc) + goto put_encap; + + /* Allocate a new tunnel encap record */ + rc = hwrm_cfa_encap_record_alloc(bp, encap_key, &encap_node->l2_info, + &encap_node->tunnel_handle); + if (rc) + goto put_encap; + +done: + *encap_handle = encap_node->tunnel_handle; + return 0; + +put_encap: + bnxt_tc_put_tunnel_node(bp, &tc_info->encap_table, + &tc_info->encap_ht_params, encap_node); + return rc; +} + +static void bnxt_tc_put_tunnel_handle(struct bnxt *bp, + struct bnxt_tc_flow *flow, + struct bnxt_tc_flow_node *flow_node) +{ + if (flow->actions.flags & BNXT_TC_ACTION_FLAG_TUNNEL_DECAP) + bnxt_tc_put_decap_handle(bp, flow_node); + else if (flow->actions.flags & BNXT_TC_ACTION_FLAG_TUNNEL_ENCAP) + bnxt_tc_put_encap_handle(bp, flow_node->encap_node); +} + +static int bnxt_tc_get_tunnel_handle(struct bnxt *bp, + struct bnxt_tc_flow *flow, + struct bnxt_tc_flow_node *flow_node, + __le32 *tunnel_handle) +{ + if (flow->actions.flags & BNXT_TC_ACTION_FLAG_TUNNEL_DECAP) + return bnxt_tc_get_decap_handle(bp, flow, flow_node, + tunnel_handle); + else if (flow->actions.flags & BNXT_TC_ACTION_FLAG_TUNNEL_ENCAP) + return bnxt_tc_get_encap_handle(bp, flow, flow_node, + tunnel_handle); + else + return 0; +} +static int __bnxt_tc_del_flow(struct bnxt *bp, + struct bnxt_tc_flow_node *flow_node) +{ + struct bnxt_tc_info *tc_info = bp->tc_info; + int rc; + + /* send HWRM cmd to free the flow-id */ + bnxt_hwrm_cfa_flow_free(bp, flow_node->flow_handle); + + mutex_lock(&tc_info->lock); + + /* release references to any tunnel encap/decap nodes */ + 
bnxt_tc_put_tunnel_handle(bp, &flow_node->flow, flow_node); + + /* release reference to l2 node */ + bnxt_tc_put_l2_node(bp, flow_node); + + mutex_unlock(&tc_info->lock); + + rc = rhashtable_remove_fast(&tc_info->flow_table, &flow_node->node, + tc_info->flow_ht_params); + if (rc) + netdev_err(bp->dev, "Error: %s: rhashtable_remove_fast rc=%d", + __func__, rc); + + kfree_rcu(flow_node, rcu); + return 0; +} + +static void bnxt_tc_set_src_fid(struct bnxt *bp, struct bnxt_tc_flow *flow, + u16 src_fid) +{ + if (flow->actions.flags & BNXT_TC_ACTION_FLAG_TUNNEL_DECAP) + flow->src_fid = bp->pf.fw_fid; + else + flow->src_fid = src_fid; +} + +/* Add a new flow or replace an existing flow. + * Notes on locking: + * There are essentially two critical sections here. + * 1. while adding a new flow + * a) lookup l2-key + * b) issue HWRM cmd and get flow_handle + * c) link l2-key with flow + * 2. while deleting a flow + * a) unlinking l2-key from flow + * A lock is needed to protect these two critical sections. + * + * The hash-tables are already protected by the rhashtable API. + */ +static int bnxt_tc_add_flow(struct bnxt *bp, u16 src_fid, + struct tc_cls_flower_offload *tc_flow_cmd) +{ + struct bnxt_tc_flow_node *new_node, *old_node; + struct bnxt_tc_info *tc_info = bp->tc_info; + struct bnxt_tc_flow *flow; + __le32 tunnel_handle = 0; + __le16 ref_flow_handle; + int rc; + + /* allocate memory for the new flow and its node */ + new_node = kzalloc(sizeof(*new_node), GFP_KERNEL); + if (!new_node) { + rc = -ENOMEM; + goto done; + } + new_node->cookie = tc_flow_cmd->cookie; + flow = &new_node->flow; + + rc = bnxt_tc_parse_flow(bp, tc_flow_cmd, flow); + if (rc) + goto free_node; + + bnxt_tc_set_src_fid(bp, flow, src_fid); + + if (!bnxt_tc_can_offload(bp, flow)) { + rc = -ENOSPC; + goto free_node; + } + + /* If a flow exists with the same cookie, delete it */ + old_node = rhashtable_lookup_fast(&tc_info->flow_table, + &tc_flow_cmd->cookie, + tc_info->flow_ht_params); + if (old_node) + __bnxt_tc_del_flow(bp, old_node); + + /* Check if the L2 part of the flow has been offloaded already. + * If so, bump up its refcnt and get its reference handle. 
+ */ + mutex_lock(&tc_info->lock); + rc = bnxt_tc_get_ref_flow_handle(bp, flow, new_node, &ref_flow_handle); + if (rc) + goto unlock; + + /* If the flow involves tunnel encap/decap, get tunnel_handle */ + rc = bnxt_tc_get_tunnel_handle(bp, flow, new_node, &tunnel_handle); + if (rc) + goto put_l2; + + /* send HWRM cmd to alloc the flow */ + rc = bnxt_hwrm_cfa_flow_alloc(bp, flow, ref_flow_handle, + tunnel_handle, &new_node->flow_handle); + if (rc) + goto put_tunnel; + + flow->lastused = jiffies; + spin_lock_init(&flow->stats_lock); + /* add new flow to flow-table */ + rc = rhashtable_insert_fast(&tc_info->flow_table, &new_node->node, + tc_info->flow_ht_params); + if (rc) + goto hwrm_flow_free; + + mutex_unlock(&tc_info->lock); + return 0; + +hwrm_flow_free: + bnxt_hwrm_cfa_flow_free(bp, new_node->flow_handle); +put_tunnel: + bnxt_tc_put_tunnel_handle(bp, flow, new_node); +put_l2: + bnxt_tc_put_l2_node(bp, new_node); +unlock: + mutex_unlock(&tc_info->lock); +free_node: + kfree_rcu(new_node, rcu); +done: + netdev_err(bp->dev, "Error: %s: cookie=0x%lx error=%d", + __func__, tc_flow_cmd->cookie, rc); + return rc; +} + +static int bnxt_tc_del_flow(struct bnxt *bp, + struct tc_cls_flower_offload *tc_flow_cmd) +{ + struct bnxt_tc_info *tc_info = bp->tc_info; + struct bnxt_tc_flow_node *flow_node; + + flow_node = rhashtable_lookup_fast(&tc_info->flow_table, + &tc_flow_cmd->cookie, + tc_info->flow_ht_params); + if (!flow_node) + return -EINVAL; + + return __bnxt_tc_del_flow(bp, flow_node); +} + +static int bnxt_tc_get_flow_stats(struct bnxt *bp, + struct tc_cls_flower_offload *tc_flow_cmd) +{ + struct bnxt_tc_flow_stats stats, *curr_stats, *prev_stats; + struct bnxt_tc_info *tc_info = bp->tc_info; + struct bnxt_tc_flow_node *flow_node; + struct bnxt_tc_flow *flow; + unsigned long lastused; + + flow_node = rhashtable_lookup_fast(&tc_info->flow_table, + &tc_flow_cmd->cookie, + tc_info->flow_ht_params); + if (!flow_node) + return -1; + + flow = &flow_node->flow; + curr_stats = &flow->stats; + prev_stats = &flow->prev_stats; + + spin_lock(&flow->stats_lock); + stats.packets = curr_stats->packets - prev_stats->packets; + stats.bytes = curr_stats->bytes - prev_stats->bytes; + *prev_stats = *curr_stats; + lastused = flow->lastused; + spin_unlock(&flow->stats_lock); + + tcf_exts_stats_update(tc_flow_cmd->exts, stats.bytes, stats.packets, + lastused); + return 0; +} + +static int +bnxt_hwrm_cfa_flow_stats_get(struct bnxt *bp, int num_flows, + struct bnxt_tc_stats_batch stats_batch[]) +{ + struct hwrm_cfa_flow_stats_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_cfa_flow_stats_input req = { 0 }; + __le16 *req_flow_handles = &req.flow_handle_0; + int rc, i; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_FLOW_STATS, -1, -1); + req.num_flows = cpu_to_le16(num_flows); + for (i = 0; i < num_flows; i++) { + struct bnxt_tc_flow_node *flow_node = stats_batch[i].flow_node; + + req_flow_handles[i] = flow_node->flow_handle; + } + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + __le64 *resp_packets = &resp->packet_0; + __le64 *resp_bytes = &resp->byte_0; + + for (i = 0; i < num_flows; i++) { + stats_batch[i].hw_stats.packets = + le64_to_cpu(resp_packets[i]); + stats_batch[i].hw_stats.bytes = + le64_to_cpu(resp_bytes[i]); + } + } else { + netdev_info(bp->dev, "error rc=%d", rc); + } + mutex_unlock(&bp->hwrm_cmd_lock); + + if (rc) + rc = -EIO; + return rc; +} + +/* Add val to accum while handling a possible wraparound + * of val. 
Eventhough val is of type u64, its actual width + * is denoted by mask and will wrap-around beyond that width. + */ +static void accumulate_val(u64 *accum, u64 val, u64 mask) +{ +#define low_bits(x, mask) ((x) & (mask)) +#define high_bits(x, mask) ((x) & ~(mask)) + bool wrapped = val < low_bits(*accum, mask); + + *accum = high_bits(*accum, mask) + val; + if (wrapped) + *accum += (mask + 1); +} + +/* The HW counters' width is much less than 64bits. + * Handle possible wrap-around while updating the stat counters + */ +static void bnxt_flow_stats_accum(struct bnxt_tc_info *tc_info, + struct bnxt_tc_flow_stats *acc_stats, + struct bnxt_tc_flow_stats *hw_stats) +{ + accumulate_val(&acc_stats->bytes, hw_stats->bytes, tc_info->bytes_mask); + accumulate_val(&acc_stats->packets, hw_stats->packets, + tc_info->packets_mask); +} + +static int +bnxt_tc_flow_stats_batch_update(struct bnxt *bp, int num_flows, + struct bnxt_tc_stats_batch stats_batch[]) +{ + struct bnxt_tc_info *tc_info = bp->tc_info; + int rc, i; + + rc = bnxt_hwrm_cfa_flow_stats_get(bp, num_flows, stats_batch); + if (rc) + return rc; + + for (i = 0; i < num_flows; i++) { + struct bnxt_tc_flow_node *flow_node = stats_batch[i].flow_node; + struct bnxt_tc_flow *flow = &flow_node->flow; + + spin_lock(&flow->stats_lock); + bnxt_flow_stats_accum(tc_info, &flow->stats, + &stats_batch[i].hw_stats); + if (flow->stats.packets != flow->prev_stats.packets) + flow->lastused = jiffies; + spin_unlock(&flow->stats_lock); + } + + return 0; +} + +static int +bnxt_tc_flow_stats_batch_prep(struct bnxt *bp, + struct bnxt_tc_stats_batch stats_batch[], + int *num_flows) +{ + struct bnxt_tc_info *tc_info = bp->tc_info; + struct rhashtable_iter *iter = &tc_info->iter; + void *flow_node; + int rc, i; + + rhashtable_walk_start(iter); + + rc = 0; + for (i = 0; i < BNXT_FLOW_STATS_BATCH_MAX; i++) { + flow_node = rhashtable_walk_next(iter); + if (IS_ERR(flow_node)) { + i = 0; + if (PTR_ERR(flow_node) == -EAGAIN) { + continue; + } else { + rc = PTR_ERR(flow_node); + goto done; + } + } + + /* No more flows */ + if (!flow_node) + goto done; + + stats_batch[i].flow_node = flow_node; + } +done: + rhashtable_walk_stop(iter); + *num_flows = i; + return rc; +} + +void bnxt_tc_flow_stats_work(struct bnxt *bp) +{ + struct bnxt_tc_info *tc_info = bp->tc_info; + int num_flows, rc; + + num_flows = atomic_read(&tc_info->flow_table.nelems); + if (!num_flows) + return; + + rhashtable_walk_enter(&tc_info->flow_table, &tc_info->iter); + + for (;;) { + rc = bnxt_tc_flow_stats_batch_prep(bp, tc_info->stats_batch, + &num_flows); + if (rc) { + if (rc == -EAGAIN) + continue; + break; + } + + if (!num_flows) + break; + + bnxt_tc_flow_stats_batch_update(bp, num_flows, + tc_info->stats_batch); + } + + rhashtable_walk_exit(&tc_info->iter); +} + +int bnxt_tc_setup_flower(struct bnxt *bp, u16 src_fid, + struct tc_cls_flower_offload *cls_flower) +{ + int rc = 0; + + switch (cls_flower->command) { + case TC_CLSFLOWER_REPLACE: + rc = bnxt_tc_add_flow(bp, src_fid, cls_flower); + break; + + case TC_CLSFLOWER_DESTROY: + rc = bnxt_tc_del_flow(bp, cls_flower); + break; + + case TC_CLSFLOWER_STATS: + rc = bnxt_tc_get_flow_stats(bp, cls_flower); + break; + } + return rc; +} + +static const struct rhashtable_params bnxt_tc_flow_ht_params = { + .head_offset = offsetof(struct bnxt_tc_flow_node, node), + .key_offset = offsetof(struct bnxt_tc_flow_node, cookie), + .key_len = sizeof(((struct bnxt_tc_flow_node *)0)->cookie), + .automatic_shrinking = true +}; + +static const struct rhashtable_params 
bnxt_tc_l2_ht_params = { + .head_offset = offsetof(struct bnxt_tc_l2_node, node), + .key_offset = offsetof(struct bnxt_tc_l2_node, key), + .key_len = BNXT_TC_L2_KEY_LEN, + .automatic_shrinking = true +}; + +static const struct rhashtable_params bnxt_tc_decap_l2_ht_params = { + .head_offset = offsetof(struct bnxt_tc_l2_node, node), + .key_offset = offsetof(struct bnxt_tc_l2_node, key), + .key_len = BNXT_TC_L2_KEY_LEN, + .automatic_shrinking = true +}; + +static const struct rhashtable_params bnxt_tc_tunnel_ht_params = { + .head_offset = offsetof(struct bnxt_tc_tunnel_node, node), + .key_offset = offsetof(struct bnxt_tc_tunnel_node, key), + .key_len = sizeof(struct ip_tunnel_key), + .automatic_shrinking = true +}; + +/* convert counter width in bits to a mask */ +#define mask(width) ((u64)~0 >> (64 - (width))) + +int bnxt_init_tc(struct bnxt *bp) +{ + struct bnxt_tc_info *tc_info; + int rc; + + if (bp->hwrm_spec_code < 0x10803) { + netdev_warn(bp->dev, + "Firmware does not support TC flower offload.\n"); + return -ENOTSUPP; + } + + tc_info = kzalloc(sizeof(*tc_info), GFP_KERNEL); + if (!tc_info) + return -ENOMEM; + mutex_init(&tc_info->lock); + + /* Counter widths are programmed by FW */ + tc_info->bytes_mask = mask(36); + tc_info->packets_mask = mask(28); + + tc_info->flow_ht_params = bnxt_tc_flow_ht_params; + rc = rhashtable_init(&tc_info->flow_table, &tc_info->flow_ht_params); + if (rc) + goto free_tc_info; + + tc_info->l2_ht_params = bnxt_tc_l2_ht_params; + rc = rhashtable_init(&tc_info->l2_table, &tc_info->l2_ht_params); + if (rc) + goto destroy_flow_table; + + tc_info->decap_l2_ht_params = bnxt_tc_decap_l2_ht_params; + rc = rhashtable_init(&tc_info->decap_l2_table, + &tc_info->decap_l2_ht_params); + if (rc) + goto destroy_l2_table; + + tc_info->decap_ht_params = bnxt_tc_tunnel_ht_params; + rc = rhashtable_init(&tc_info->decap_table, + &tc_info->decap_ht_params); + if (rc) + goto destroy_decap_l2_table; + + tc_info->encap_ht_params = bnxt_tc_tunnel_ht_params; + rc = rhashtable_init(&tc_info->encap_table, + &tc_info->encap_ht_params); + if (rc) + goto destroy_decap_table; + + tc_info->enabled = true; + bp->dev->hw_features |= NETIF_F_HW_TC; + bp->dev->features |= NETIF_F_HW_TC; + bp->tc_info = tc_info; + return 0; + +destroy_decap_table: + rhashtable_destroy(&tc_info->decap_table); +destroy_decap_l2_table: + rhashtable_destroy(&tc_info->decap_l2_table); +destroy_l2_table: + rhashtable_destroy(&tc_info->l2_table); +destroy_flow_table: + rhashtable_destroy(&tc_info->flow_table); +free_tc_info: + kfree(tc_info); + return rc; +} + +void bnxt_shutdown_tc(struct bnxt *bp) +{ + struct bnxt_tc_info *tc_info = bp->tc_info; + + if (!bnxt_tc_flower_enabled(bp)) + return; + + rhashtable_destroy(&tc_info->flow_table); + rhashtable_destroy(&tc_info->l2_table); + rhashtable_destroy(&tc_info->decap_l2_table); + rhashtable_destroy(&tc_info->decap_table); + rhashtable_destroy(&tc_info->encap_table); + kfree(tc_info); + bp->tc_info = NULL; +} diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.h new file mode 100644 index 0000000..97e09a8 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.h @@ -0,0 +1,230 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2017 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ */ + +#ifndef BNXT_TC_H +#define BNXT_TC_H + +#ifdef CONFIG_BNXT_FLOWER_OFFLOAD + +#include + +/* Structs used for storing the filter/actions of the TC cmd. + */ +struct bnxt_tc_l2_key { + u8 dmac[ETH_ALEN]; + u8 smac[ETH_ALEN]; + __be16 inner_vlan_tpid; + __be16 inner_vlan_tci; + __be16 ether_type; + u8 num_vlans; +}; + +struct bnxt_tc_l3_key { + union { + struct { + struct in_addr daddr; + struct in_addr saddr; + } ipv4; + struct { + struct in6_addr daddr; + struct in6_addr saddr; + } ipv6; + }; +}; + +struct bnxt_tc_l4_key { + u8 ip_proto; + union { + struct { + __be16 sport; + __be16 dport; + } ports; + struct { + u8 type; + u8 code; + } icmp; + }; +}; + +struct bnxt_tc_tunnel_key { + struct bnxt_tc_l2_key l2; + struct bnxt_tc_l3_key l3; + struct bnxt_tc_l4_key l4; + __be32 id; +}; + +struct bnxt_tc_actions { + u32 flags; +#define BNXT_TC_ACTION_FLAG_FWD BIT(0) +#define BNXT_TC_ACTION_FLAG_FWD_VXLAN BIT(1) +#define BNXT_TC_ACTION_FLAG_PUSH_VLAN BIT(3) +#define BNXT_TC_ACTION_FLAG_POP_VLAN BIT(4) +#define BNXT_TC_ACTION_FLAG_DROP BIT(5) +#define BNXT_TC_ACTION_FLAG_TUNNEL_ENCAP BIT(6) +#define BNXT_TC_ACTION_FLAG_TUNNEL_DECAP BIT(7) + + u16 dst_fid; + struct net_device *dst_dev; + __be16 push_vlan_tpid; + __be16 push_vlan_tci; + + /* tunnel encap */ + struct ip_tunnel_key tun_encap_key; +}; + +struct bnxt_tc_flow { + u32 flags; +#define BNXT_TC_FLOW_FLAGS_ETH_ADDRS BIT(1) +#define BNXT_TC_FLOW_FLAGS_IPV4_ADDRS BIT(2) +#define BNXT_TC_FLOW_FLAGS_IPV6_ADDRS BIT(3) +#define BNXT_TC_FLOW_FLAGS_PORTS BIT(4) +#define BNXT_TC_FLOW_FLAGS_ICMP BIT(5) +#define BNXT_TC_FLOW_FLAGS_TUNL_ETH_ADDRS BIT(6) +#define BNXT_TC_FLOW_FLAGS_TUNL_IPV4_ADDRS BIT(7) +#define BNXT_TC_FLOW_FLAGS_TUNL_IPV6_ADDRS BIT(8) +#define BNXT_TC_FLOW_FLAGS_TUNL_PORTS BIT(9) +#define BNXT_TC_FLOW_FLAGS_TUNL_ID BIT(10) +#define BNXT_TC_FLOW_FLAGS_TUNNEL (BNXT_TC_FLOW_FLAGS_TUNL_ETH_ADDRS | \ + BNXT_TC_FLOW_FLAGS_TUNL_IPV4_ADDRS | \ + BNXT_TC_FLOW_FLAGS_TUNL_IPV6_ADDRS |\ + BNXT_TC_FLOW_FLAGS_TUNL_PORTS |\ + BNXT_TC_FLOW_FLAGS_TUNL_ID) + + /* flow applicable to pkts ingressing on this fid */ + u16 src_fid; + struct bnxt_tc_l2_key l2_key; + struct bnxt_tc_l2_key l2_mask; + struct bnxt_tc_l3_key l3_key; + struct bnxt_tc_l3_key l3_mask; + struct bnxt_tc_l4_key l4_key; + struct bnxt_tc_l4_key l4_mask; + struct ip_tunnel_key tun_key; + struct ip_tunnel_key tun_mask; + + struct bnxt_tc_actions actions; + + /* updated stats accounting for hw-counter wrap-around */ + struct bnxt_tc_flow_stats stats; + /* previous snap-shot of stats */ + struct bnxt_tc_flow_stats prev_stats; + unsigned long lastused; /* jiffies */ + /* for calculating delta from prev_stats and + * updating prev_stats atomically. + */ + spinlock_t stats_lock; +}; + +/* Tunnel encap/decap hash table + * This table is used to maintain a list of flows that use + * the same tunnel encap/decap params (ip_daddrs, vni, udp_dport) + * and the FW returned handle. + * A separate table is maintained for encap and decap + */ +struct bnxt_tc_tunnel_node { + struct ip_tunnel_key key; + struct rhash_head node; + + /* tunnel l2 info */ + struct bnxt_tc_l2_key l2_info; + +#define INVALID_TUNNEL_HANDLE cpu_to_le32(0xffffffff) + /* tunnel handle returned by FW */ + __le32 tunnel_handle; + + u32 refcount; + struct rcu_head rcu; +}; + +/* L2 hash table + * The same data-struct is used for L2-flow table and L2-tunnel table. + * The L2 part of a flow or tunnel is stored in a hash table. 
+ * A flow that shares the same L2 key/mask with an + * already existing flow/tunnel must refer to it's flow handle or + * decap_filter_id respectively. + */ +struct bnxt_tc_l2_node { + /* hash key: first 16b of key */ +#define BNXT_TC_L2_KEY_LEN 16 + struct bnxt_tc_l2_key key; + struct rhash_head node; + + /* a linked list of flows that share the same l2 key */ + struct list_head common_l2_flows; + + /* number of flows/tunnels sharing the l2 key */ + u16 refcount; + + struct rcu_head rcu; +}; + +struct bnxt_tc_flow_node { + /* hash key: provided by TC */ + unsigned long cookie; + struct rhash_head node; + + struct bnxt_tc_flow flow; + + __le16 flow_handle; + + /* L2 node in l2 hashtable that shares flow's l2 key */ + struct bnxt_tc_l2_node *l2_node; + /* for the shared_flows list maintained in l2_node */ + struct list_head l2_list_node; + + /* tunnel encap related */ + struct bnxt_tc_tunnel_node *encap_node; + + /* tunnel decap related */ + struct bnxt_tc_tunnel_node *decap_node; + /* L2 node in tunnel-l2 hashtable that shares flow's tunnel l2 key */ + struct bnxt_tc_l2_node *decap_l2_node; + /* for the shared_flows list maintained in tunnel decap l2_node */ + struct list_head decap_l2_list_node; + + struct rcu_head rcu; +}; + +int bnxt_tc_setup_flower(struct bnxt *bp, u16 src_fid, + struct tc_cls_flower_offload *cls_flower); +int bnxt_init_tc(struct bnxt *bp); +void bnxt_shutdown_tc(struct bnxt *bp); +void bnxt_tc_flow_stats_work(struct bnxt *bp); + +static inline bool bnxt_tc_flower_enabled(struct bnxt *bp) +{ + return bp->tc_info && bp->tc_info->enabled; +} + +#else /* CONFIG_BNXT_FLOWER_OFFLOAD */ + +static inline int bnxt_tc_setup_flower(struct bnxt *bp, u16 src_fid, + struct tc_cls_flower_offload *cls_flower) +{ + return -EOPNOTSUPP; +} + +static inline int bnxt_init_tc(struct bnxt *bp) +{ + return 0; +} + +static inline void bnxt_shutdown_tc(struct bnxt *bp) +{ +} + +static inline void bnxt_tc_flow_stats_work(struct bnxt *bp) +{ +} + +static inline bool bnxt_tc_flower_enabled(struct bnxt *bp) +{ + return false; +} +#endif /* CONFIG_BNXT_FLOWER_OFFLOAD */ +#endif /* BNXT_TC_H */ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c new file mode 100644 index 0000000..347e4f9 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -0,0 +1,487 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2016-2018 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bnxt_hsi.h" +#include "bnxt.h" +#include "bnxt_ulp.h" + +static int bnxt_register_dev(struct bnxt_en_dev *edev, int ulp_id, + struct bnxt_ulp_ops *ulp_ops, void *handle) +{ + struct net_device *dev = edev->net; + struct bnxt *bp = netdev_priv(dev); + struct bnxt_ulp *ulp; + + ASSERT_RTNL(); + if (ulp_id >= BNXT_MAX_ULP) + return -EINVAL; + + ulp = &edev->ulp_tbl[ulp_id]; + if (rcu_access_pointer(ulp->ulp_ops)) { + netdev_err(bp->dev, "ulp id %d already registered\n", ulp_id); + return -EBUSY; + } + if (ulp_id == BNXT_ROCE_ULP) { + unsigned int max_stat_ctxs; + + max_stat_ctxs = bnxt_get_max_func_stat_ctxs(bp); + if (max_stat_ctxs <= BNXT_MIN_ROCE_STAT_CTXS || + bp->num_stat_ctxs == max_stat_ctxs) + return -ENOMEM; + bnxt_set_max_func_stat_ctxs(bp, max_stat_ctxs - + BNXT_MIN_ROCE_STAT_CTXS); + } + + atomic_set(&ulp->ref_count, 0); + ulp->handle = handle; + rcu_assign_pointer(ulp->ulp_ops, ulp_ops); + + if (ulp_id == BNXT_ROCE_ULP) { + if (test_bit(BNXT_STATE_OPEN, &bp->state)) + bnxt_hwrm_vnic_cfg(bp, 0); + } + + return 0; +} + +static int bnxt_unregister_dev(struct bnxt_en_dev *edev, int ulp_id) +{ + struct net_device *dev = edev->net; + struct bnxt *bp = netdev_priv(dev); + struct bnxt_ulp *ulp; + int i = 0; + + ASSERT_RTNL(); + if (ulp_id >= BNXT_MAX_ULP) + return -EINVAL; + + ulp = &edev->ulp_tbl[ulp_id]; + if (!rcu_access_pointer(ulp->ulp_ops)) { + netdev_err(bp->dev, "ulp id %d not registered\n", ulp_id); + return -EINVAL; + } + if (ulp_id == BNXT_ROCE_ULP) { + unsigned int max_stat_ctxs; + + max_stat_ctxs = bnxt_get_max_func_stat_ctxs(bp); + bnxt_set_max_func_stat_ctxs(bp, max_stat_ctxs + 1); + if (ulp->msix_requested) + edev->en_ops->bnxt_free_msix(edev, ulp_id); + } + if (ulp->max_async_event_id) + bnxt_hwrm_func_rgtr_async_events(bp, NULL, 0); + + RCU_INIT_POINTER(ulp->ulp_ops, NULL); + synchronize_rcu(); + ulp->max_async_event_id = 0; + ulp->async_events_bmap = NULL; + while (atomic_read(&ulp->ref_count) != 0 && i < 10) { + msleep(100); + i++; + } + return 0; +} + +static void bnxt_fill_msix_vecs(struct bnxt *bp, struct bnxt_msix_entry *ent) +{ + struct bnxt_en_dev *edev = bp->edev; + int num_msix, idx, i; + + num_msix = edev->ulp_tbl[BNXT_ROCE_ULP].msix_requested; + idx = edev->ulp_tbl[BNXT_ROCE_ULP].msix_base; + for (i = 0; i < num_msix; i++) { + ent[i].vector = bp->irq_tbl[idx + i].vector; + ent[i].ring_idx = idx + i; + ent[i].db_offset = (idx + i) * 0x80; + } +} + +static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id, + struct bnxt_msix_entry *ent, int num_msix) +{ + struct net_device *dev = edev->net; + struct bnxt *bp = netdev_priv(dev); + int max_idx, max_cp_rings; + int avail_msix, idx; + int rc = 0; + + ASSERT_RTNL(); + if (ulp_id != BNXT_ROCE_ULP) + return -EINVAL; + + if (!(bp->flags & BNXT_FLAG_USING_MSIX)) + return -ENODEV; + + if (edev->ulp_tbl[ulp_id].msix_requested) + return -EAGAIN; + + max_cp_rings = bnxt_get_max_func_cp_rings(bp); + avail_msix = bnxt_get_avail_msix(bp, num_msix); + if (!avail_msix) + return -ENOMEM; + if (avail_msix > num_msix) + avail_msix = num_msix; + + if (bp->flags & BNXT_FLAG_NEW_RM) { + idx = bp->cp_nr_rings; + } else { + max_idx = min_t(int, bp->total_irqs, max_cp_rings); + idx = max_idx - avail_msix; + } + edev->ulp_tbl[ulp_id].msix_base = idx; + edev->ulp_tbl[ulp_id].msix_requested = avail_msix; + if (bp->total_irqs < (idx + avail_msix)) { + if (netif_running(dev)) { + bnxt_close_nic(bp, 
true, false); + rc = bnxt_open_nic(bp, true, false); + } else { + rc = bnxt_reserve_rings(bp); + } + } + if (rc) { + edev->ulp_tbl[ulp_id].msix_requested = 0; + return -EAGAIN; + } + + if (bp->flags & BNXT_FLAG_NEW_RM) { + struct bnxt_hw_resc *hw_resc = &bp->hw_resc; + + avail_msix = hw_resc->resv_cp_rings - bp->cp_nr_rings; + edev->ulp_tbl[ulp_id].msix_requested = avail_msix; + } + bnxt_fill_msix_vecs(bp, ent); + bnxt_set_max_func_irqs(bp, bnxt_get_max_func_irqs(bp) - avail_msix); + bnxt_set_max_func_cp_rings(bp, max_cp_rings - avail_msix); + edev->flags |= BNXT_EN_FLAG_MSIX_REQUESTED; + return avail_msix; +} + +static int bnxt_free_msix_vecs(struct bnxt_en_dev *edev, int ulp_id) +{ + struct net_device *dev = edev->net; + struct bnxt *bp = netdev_priv(dev); + int max_cp_rings, msix_requested; + + ASSERT_RTNL(); + if (ulp_id != BNXT_ROCE_ULP) + return -EINVAL; + + if (!(edev->flags & BNXT_EN_FLAG_MSIX_REQUESTED)) + return 0; + + max_cp_rings = bnxt_get_max_func_cp_rings(bp); + msix_requested = edev->ulp_tbl[ulp_id].msix_requested; + bnxt_set_max_func_cp_rings(bp, max_cp_rings + msix_requested); + edev->ulp_tbl[ulp_id].msix_requested = 0; + bnxt_set_max_func_irqs(bp, bnxt_get_max_func_irqs(bp) + msix_requested); + edev->flags &= ~BNXT_EN_FLAG_MSIX_REQUESTED; + if (netif_running(dev)) { + bnxt_close_nic(bp, true, false); + bnxt_open_nic(bp, true, false); + } + return 0; +} + +int bnxt_get_ulp_msix_num(struct bnxt *bp) +{ + if (bnxt_ulp_registered(bp->edev, BNXT_ROCE_ULP)) { + struct bnxt_en_dev *edev = bp->edev; + + return edev->ulp_tbl[BNXT_ROCE_ULP].msix_requested; + } + return 0; +} + +int bnxt_get_ulp_msix_base(struct bnxt *bp) +{ + if (bnxt_ulp_registered(bp->edev, BNXT_ROCE_ULP)) { + struct bnxt_en_dev *edev = bp->edev; + + if (edev->ulp_tbl[BNXT_ROCE_ULP].msix_requested) + return edev->ulp_tbl[BNXT_ROCE_ULP].msix_base; + } + return 0; +} + +void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id) +{ + ASSERT_RTNL(); + if (bnxt_ulp_registered(bp->edev, ulp_id)) { + struct bnxt_en_dev *edev = bp->edev; + unsigned int msix_req, max; + + msix_req = edev->ulp_tbl[ulp_id].msix_requested; + max = bnxt_get_max_func_cp_rings(bp); + bnxt_set_max_func_cp_rings(bp, max - msix_req); + max = bnxt_get_max_func_stat_ctxs(bp); + bnxt_set_max_func_stat_ctxs(bp, max - 1); + } +} + +static int bnxt_send_msg(struct bnxt_en_dev *edev, int ulp_id, + struct bnxt_fw_msg *fw_msg) +{ + struct net_device *dev = edev->net; + struct bnxt *bp = netdev_priv(dev); + struct input *req; + int rc; + + mutex_lock(&bp->hwrm_cmd_lock); + req = fw_msg->msg; + req->resp_addr = cpu_to_le64(bp->hwrm_cmd_resp_dma_addr); + rc = _hwrm_send_message(bp, fw_msg->msg, fw_msg->msg_len, + fw_msg->timeout); + if (!rc) { + struct output *resp = bp->hwrm_cmd_resp_addr; + u32 len = le16_to_cpu(resp->resp_len); + + if (fw_msg->resp_max_len < len) + len = fw_msg->resp_max_len; + + memcpy(fw_msg->resp, resp, len); + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static void bnxt_ulp_get(struct bnxt_ulp *ulp) +{ + atomic_inc(&ulp->ref_count); +} + +static void bnxt_ulp_put(struct bnxt_ulp *ulp) +{ + atomic_dec(&ulp->ref_count); +} + +void bnxt_ulp_stop(struct bnxt *bp) +{ + struct bnxt_en_dev *edev = bp->edev; + struct bnxt_ulp_ops *ops; + int i; + + if (!edev) + return; + + for (i = 0; i < BNXT_MAX_ULP; i++) { + struct bnxt_ulp *ulp = &edev->ulp_tbl[i]; + + ops = rtnl_dereference(ulp->ulp_ops); + if (!ops || !ops->ulp_stop) + continue; + ops->ulp_stop(ulp->handle); + } +} + +void bnxt_ulp_start(struct bnxt *bp) +{ + struct 
bnxt_en_dev *edev = bp->edev; + struct bnxt_ulp_ops *ops; + int i; + + if (!edev) + return; + + for (i = 0; i < BNXT_MAX_ULP; i++) { + struct bnxt_ulp *ulp = &edev->ulp_tbl[i]; + + ops = rtnl_dereference(ulp->ulp_ops); + if (!ops || !ops->ulp_start) + continue; + ops->ulp_start(ulp->handle); + } +} + +void bnxt_ulp_sriov_cfg(struct bnxt *bp, int num_vfs) +{ + struct bnxt_en_dev *edev = bp->edev; + struct bnxt_ulp_ops *ops; + int i; + + if (!edev) + return; + + for (i = 0; i < BNXT_MAX_ULP; i++) { + struct bnxt_ulp *ulp = &edev->ulp_tbl[i]; + + rcu_read_lock(); + ops = rcu_dereference(ulp->ulp_ops); + if (!ops || !ops->ulp_sriov_config) { + rcu_read_unlock(); + continue; + } + bnxt_ulp_get(ulp); + rcu_read_unlock(); + ops->ulp_sriov_config(ulp->handle, num_vfs); + bnxt_ulp_put(ulp); + } +} + +void bnxt_ulp_shutdown(struct bnxt *bp) +{ + struct bnxt_en_dev *edev = bp->edev; + struct bnxt_ulp_ops *ops; + int i; + + if (!edev) + return; + + for (i = 0; i < BNXT_MAX_ULP; i++) { + struct bnxt_ulp *ulp = &edev->ulp_tbl[i]; + + ops = rtnl_dereference(ulp->ulp_ops); + if (!ops || !ops->ulp_shutdown) + continue; + ops->ulp_shutdown(ulp->handle); + } +} + +void bnxt_ulp_irq_stop(struct bnxt *bp) +{ + struct bnxt_en_dev *edev = bp->edev; + struct bnxt_ulp_ops *ops; + + if (!edev || !(edev->flags & BNXT_EN_FLAG_MSIX_REQUESTED)) + return; + + if (bnxt_ulp_registered(bp->edev, BNXT_ROCE_ULP)) { + struct bnxt_ulp *ulp = &edev->ulp_tbl[BNXT_ROCE_ULP]; + + if (!ulp->msix_requested) + return; + + ops = rtnl_dereference(ulp->ulp_ops); + if (!ops || !ops->ulp_irq_stop) + return; + ops->ulp_irq_stop(ulp->handle); + } +} + +void bnxt_ulp_irq_restart(struct bnxt *bp, int err) +{ + struct bnxt_en_dev *edev = bp->edev; + struct bnxt_ulp_ops *ops; + + if (!edev || !(edev->flags & BNXT_EN_FLAG_MSIX_REQUESTED)) + return; + + if (bnxt_ulp_registered(bp->edev, BNXT_ROCE_ULP)) { + struct bnxt_ulp *ulp = &edev->ulp_tbl[BNXT_ROCE_ULP]; + struct bnxt_msix_entry *ent = NULL; + + if (!ulp->msix_requested) + return; + + ops = rtnl_dereference(ulp->ulp_ops); + if (!ops || !ops->ulp_irq_restart) + return; + + if (!err) { + ent = kcalloc(ulp->msix_requested, sizeof(*ent), + GFP_KERNEL); + if (!ent) + return; + bnxt_fill_msix_vecs(bp, ent); + } + ops->ulp_irq_restart(ulp->handle, ent); + kfree(ent); + } +} + +void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl) +{ + u16 event_id = le16_to_cpu(cmpl->event_id); + struct bnxt_en_dev *edev = bp->edev; + struct bnxt_ulp_ops *ops; + int i; + + if (!edev) + return; + + rcu_read_lock(); + for (i = 0; i < BNXT_MAX_ULP; i++) { + struct bnxt_ulp *ulp = &edev->ulp_tbl[i]; + + ops = rcu_dereference(ulp->ulp_ops); + if (!ops || !ops->ulp_async_notifier) + continue; + if (!ulp->async_events_bmap || + event_id > ulp->max_async_event_id) + continue; + + /* Read max_async_event_id first before testing the bitmap. 
*/ + smp_rmb(); + if (test_bit(event_id, ulp->async_events_bmap)) + ops->ulp_async_notifier(ulp->handle, cmpl); + } + rcu_read_unlock(); +} + +static int bnxt_register_async_events(struct bnxt_en_dev *edev, int ulp_id, + unsigned long *events_bmap, u16 max_id) +{ + struct net_device *dev = edev->net; + struct bnxt *bp = netdev_priv(dev); + struct bnxt_ulp *ulp; + + if (ulp_id >= BNXT_MAX_ULP) + return -EINVAL; + + ulp = &edev->ulp_tbl[ulp_id]; + ulp->async_events_bmap = events_bmap; + /* Make sure bnxt_ulp_async_events() sees this order */ + smp_wmb(); + ulp->max_async_event_id = max_id; + bnxt_hwrm_func_rgtr_async_events(bp, events_bmap, max_id + 1); + return 0; +} + +static const struct bnxt_en_ops bnxt_en_ops_tbl = { + .bnxt_register_device = bnxt_register_dev, + .bnxt_unregister_device = bnxt_unregister_dev, + .bnxt_request_msix = bnxt_req_msix_vecs, + .bnxt_free_msix = bnxt_free_msix_vecs, + .bnxt_send_fw_msg = bnxt_send_msg, + .bnxt_register_fw_async_events = bnxt_register_async_events, +}; + +struct bnxt_en_dev *bnxt_ulp_probe(struct net_device *dev) +{ + struct bnxt *bp = netdev_priv(dev); + struct bnxt_en_dev *edev; + + edev = bp->edev; + if (!edev) { + edev = kzalloc(sizeof(*edev), GFP_KERNEL); + if (!edev) + return ERR_PTR(-ENOMEM); + edev->en_ops = &bnxt_en_ops_tbl; + if (bp->flags & BNXT_FLAG_ROCEV1_CAP) + edev->flags |= BNXT_EN_FLAG_ROCEV1_CAP; + if (bp->flags & BNXT_FLAG_ROCEV2_CAP) + edev->flags |= BNXT_EN_FLAG_ROCEV2_CAP; + edev->net = dev; + edev->pdev = bp->pdev; + bp->edev = edev; + } + return bp->edev; +} diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h new file mode 100644 index 0000000..df48ac7 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h @@ -0,0 +1,103 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2016-2018 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ */ + +#ifndef BNXT_ULP_H +#define BNXT_ULP_H + +#define BNXT_ROCE_ULP 0 +#define BNXT_OTHER_ULP 1 +#define BNXT_MAX_ULP 2 + +#define BNXT_MIN_ROCE_CP_RINGS 2 +#define BNXT_MIN_ROCE_STAT_CTXS 1 + +struct hwrm_async_event_cmpl; +struct bnxt; + +struct bnxt_msix_entry { + u32 vector; + u32 ring_idx; + u32 db_offset; +}; + +struct bnxt_ulp_ops { + /* async_notifier() cannot sleep (in BH context) */ + void (*ulp_async_notifier)(void *, struct hwrm_async_event_cmpl *); + void (*ulp_stop)(void *); + void (*ulp_start)(void *); + void (*ulp_sriov_config)(void *, int); + void (*ulp_shutdown)(void *); + void (*ulp_irq_stop)(void *); + void (*ulp_irq_restart)(void *, struct bnxt_msix_entry *); +}; + +struct bnxt_fw_msg { + void *msg; + int msg_len; + void *resp; + int resp_max_len; + int timeout; +}; + +struct bnxt_ulp { + void *handle; + struct bnxt_ulp_ops __rcu *ulp_ops; + unsigned long *async_events_bmap; + u16 max_async_event_id; + u16 msix_requested; + u16 msix_base; + atomic_t ref_count; +}; + +struct bnxt_en_dev { + struct net_device *net; + struct pci_dev *pdev; + u32 flags; + #define BNXT_EN_FLAG_ROCEV1_CAP 0x1 + #define BNXT_EN_FLAG_ROCEV2_CAP 0x2 + #define BNXT_EN_FLAG_ROCE_CAP (BNXT_EN_FLAG_ROCEV1_CAP | \ + BNXT_EN_FLAG_ROCEV2_CAP) + #define BNXT_EN_FLAG_MSIX_REQUESTED 0x4 + const struct bnxt_en_ops *en_ops; + struct bnxt_ulp ulp_tbl[BNXT_MAX_ULP]; +}; + +struct bnxt_en_ops { + int (*bnxt_register_device)(struct bnxt_en_dev *, int, + struct bnxt_ulp_ops *, void *); + int (*bnxt_unregister_device)(struct bnxt_en_dev *, int); + int (*bnxt_request_msix)(struct bnxt_en_dev *, int, + struct bnxt_msix_entry *, int); + int (*bnxt_free_msix)(struct bnxt_en_dev *, int); + int (*bnxt_send_fw_msg)(struct bnxt_en_dev *, int, + struct bnxt_fw_msg *); + int (*bnxt_register_fw_async_events)(struct bnxt_en_dev *, int, + unsigned long *, u16); +}; + +static inline bool bnxt_ulp_registered(struct bnxt_en_dev *edev, int ulp_id) +{ + if (edev && rcu_access_pointer(edev->ulp_tbl[ulp_id].ulp_ops)) + return true; + return false; +} + +int bnxt_get_ulp_msix_num(struct bnxt *bp); +int bnxt_get_ulp_msix_base(struct bnxt *bp); +void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id); +void bnxt_ulp_stop(struct bnxt *bp); +void bnxt_ulp_start(struct bnxt *bp); +void bnxt_ulp_sriov_cfg(struct bnxt *bp, int num_vfs); +void bnxt_ulp_shutdown(struct bnxt *bp); +void bnxt_ulp_irq_stop(struct bnxt *bp); +void bnxt_ulp_irq_restart(struct bnxt *bp, int err); +void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl); +struct bnxt_en_dev *bnxt_ulp_probe(struct net_device *dev); + +#endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c new file mode 100644 index 0000000..38f635c --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c @@ -0,0 +1,564 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2016-2017 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include + +#include "bnxt_hsi.h" +#include "bnxt.h" +#include "bnxt_vfr.h" +#include "bnxt_devlink.h" +#include "bnxt_tc.h" + +#ifdef CONFIG_BNXT_SRIOV + +#define CFA_HANDLE_INVALID 0xffff +#define VF_IDX_INVALID 0xffff + +static int hwrm_cfa_vfr_alloc(struct bnxt *bp, u16 vf_idx, + u16 *tx_cfa_action, u16 *rx_cfa_code) +{ + struct hwrm_cfa_vfr_alloc_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_cfa_vfr_alloc_input req = { 0 }; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_VFR_ALLOC, -1, -1); + req.vf_id = cpu_to_le16(vf_idx); + sprintf(req.vfr_name, "vfr%d", vf_idx); + + mutex_lock(&bp->hwrm_cmd_lock); + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + *tx_cfa_action = le16_to_cpu(resp->tx_cfa_action); + *rx_cfa_code = le16_to_cpu(resp->rx_cfa_code); + netdev_dbg(bp->dev, "tx_cfa_action=0x%x, rx_cfa_code=0x%x", + *tx_cfa_action, *rx_cfa_code); + } else { + netdev_info(bp->dev, "%s error rc=%d", __func__, rc); + } + + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int hwrm_cfa_vfr_free(struct bnxt *bp, u16 vf_idx) +{ + struct hwrm_cfa_vfr_free_input req = { 0 }; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_CFA_VFR_FREE, -1, -1); + sprintf(req.vfr_name, "vfr%d", vf_idx); + + rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (rc) + netdev_info(bp->dev, "%s error rc=%d", __func__, rc); + return rc; +} + +static int bnxt_hwrm_vfr_qcfg(struct bnxt *bp, struct bnxt_vf_rep *vf_rep, + u16 *max_mtu) +{ + struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr; + struct hwrm_func_qcfg_input req = {0}; + u16 mtu; + int rc; + + bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1); + req.fid = cpu_to_le16(bp->pf.vf[vf_rep->vf_idx].fw_fid); + + mutex_lock(&bp->hwrm_cmd_lock); + + rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); + if (!rc) { + mtu = le16_to_cpu(resp->max_mtu_configured); + if (!mtu) + *max_mtu = BNXT_MAX_MTU; + else + *max_mtu = mtu; + } + mutex_unlock(&bp->hwrm_cmd_lock); + return rc; +} + +static int bnxt_vf_rep_open(struct net_device *dev) +{ + struct bnxt_vf_rep *vf_rep = netdev_priv(dev); + struct bnxt *bp = vf_rep->bp; + + /* Enable link and TX only if the parent PF is open. 
*/ + if (netif_running(bp->dev)) { + netif_carrier_on(dev); + netif_tx_start_all_queues(dev); + } + return 0; +} + +static int bnxt_vf_rep_close(struct net_device *dev) +{ + netif_carrier_off(dev); + netif_tx_disable(dev); + + return 0; +} + +static netdev_tx_t bnxt_vf_rep_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct bnxt_vf_rep *vf_rep = netdev_priv(dev); + int rc, len = skb->len; + + skb_dst_drop(skb); + dst_hold((struct dst_entry *)vf_rep->dst); + skb_dst_set(skb, (struct dst_entry *)vf_rep->dst); + skb->dev = vf_rep->dst->u.port_info.lower_dev; + + rc = dev_queue_xmit(skb); + if (!rc) { + vf_rep->tx_stats.packets++; + vf_rep->tx_stats.bytes += len; + } + return rc; +} + +static void +bnxt_vf_rep_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct bnxt_vf_rep *vf_rep = netdev_priv(dev); + + stats->rx_packets = vf_rep->rx_stats.packets; + stats->rx_bytes = vf_rep->rx_stats.bytes; + stats->tx_packets = vf_rep->tx_stats.packets; + stats->tx_bytes = vf_rep->tx_stats.bytes; +} + +static int bnxt_vf_rep_setup_tc_block_cb(enum tc_setup_type type, + void *type_data, + void *cb_priv) +{ + struct bnxt_vf_rep *vf_rep = cb_priv; + struct bnxt *bp = vf_rep->bp; + int vf_fid = bp->pf.vf[vf_rep->vf_idx].fw_fid; + + if (!bnxt_tc_flower_enabled(vf_rep->bp) || + !tc_cls_can_offload_and_chain0(bp->dev, type_data)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return bnxt_tc_setup_flower(bp, vf_fid, type_data); + default: + return -EOPNOTSUPP; + } +} + +static int bnxt_vf_rep_setup_tc_block(struct net_device *dev, + struct tc_block_offload *f) +{ + struct bnxt_vf_rep *vf_rep = netdev_priv(dev); + + if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + switch (f->command) { + case TC_BLOCK_BIND: + return tcf_block_cb_register(f->block, + bnxt_vf_rep_setup_tc_block_cb, + vf_rep, vf_rep); + case TC_BLOCK_UNBIND: + tcf_block_cb_unregister(f->block, + bnxt_vf_rep_setup_tc_block_cb, vf_rep); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int bnxt_vf_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + switch (type) { + case TC_SETUP_BLOCK: + return bnxt_vf_rep_setup_tc_block(dev, type_data); + default: + return -EOPNOTSUPP; + } +} + +struct net_device *bnxt_get_vf_rep(struct bnxt *bp, u16 cfa_code) +{ + u16 vf_idx; + + if (cfa_code && bp->cfa_code_map && BNXT_PF(bp)) { + vf_idx = bp->cfa_code_map[cfa_code]; + if (vf_idx != VF_IDX_INVALID) + return bp->vf_reps[vf_idx]->dev; + } + return NULL; +} + +void bnxt_vf_rep_rx(struct bnxt *bp, struct sk_buff *skb) +{ + struct bnxt_vf_rep *vf_rep = netdev_priv(skb->dev); + struct bnxt_vf_rep_stats *rx_stats; + + rx_stats = &vf_rep->rx_stats; + vf_rep->rx_stats.bytes += skb->len; + vf_rep->rx_stats.packets++; + + netif_receive_skb(skb); +} + +static int bnxt_vf_rep_get_phys_port_name(struct net_device *dev, char *buf, + size_t len) +{ + struct bnxt_vf_rep *vf_rep = netdev_priv(dev); + struct pci_dev *pf_pdev = vf_rep->bp->pdev; + int rc; + + rc = snprintf(buf, len, "pf%dvf%d", PCI_FUNC(pf_pdev->devfn), + vf_rep->vf_idx); + if (rc >= len) + return -EOPNOTSUPP; + return 0; +} + +static void bnxt_vf_rep_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strlcpy(info->driver, DRV_MODULE_NAME, sizeof(info->driver)); + strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version)); +} + +static int bnxt_vf_rep_port_attr_get(struct net_device *dev, + struct switchdev_attr *attr) +{ + struct bnxt_vf_rep 
*vf_rep = netdev_priv(dev); + + /* as only PORT_PARENT_ID is supported currently use common code + * between PF and VF-rep for now. + */ + return bnxt_port_attr_get(vf_rep->bp, attr); +} + +static const struct switchdev_ops bnxt_vf_rep_switchdev_ops = { + .switchdev_port_attr_get = bnxt_vf_rep_port_attr_get +}; + +static const struct ethtool_ops bnxt_vf_rep_ethtool_ops = { + .get_drvinfo = bnxt_vf_rep_get_drvinfo +}; + +static const struct net_device_ops bnxt_vf_rep_netdev_ops = { + .ndo_open = bnxt_vf_rep_open, + .ndo_stop = bnxt_vf_rep_close, + .ndo_start_xmit = bnxt_vf_rep_xmit, + .ndo_get_stats64 = bnxt_vf_rep_get_stats64, + .ndo_setup_tc = bnxt_vf_rep_setup_tc, + .ndo_get_phys_port_name = bnxt_vf_rep_get_phys_port_name +}; + +bool bnxt_dev_is_vf_rep(struct net_device *dev) +{ + return dev->netdev_ops == &bnxt_vf_rep_netdev_ops; +} + +/* Called when the parent PF interface is closed: + * As the mode transition from SWITCHDEV to LEGACY + * happens under the rtnl_lock() this routine is safe + * under the rtnl_lock() + */ +void bnxt_vf_reps_close(struct bnxt *bp) +{ + struct bnxt_vf_rep *vf_rep; + u16 num_vfs, i; + + if (bp->eswitch_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV) + return; + + num_vfs = pci_num_vf(bp->pdev); + for (i = 0; i < num_vfs; i++) { + vf_rep = bp->vf_reps[i]; + if (netif_running(vf_rep->dev)) + bnxt_vf_rep_close(vf_rep->dev); + } +} + +/* Called when the parent PF interface is opened (re-opened): + * As the mode transition from SWITCHDEV to LEGACY + * happen under the rtnl_lock() this routine is safe + * under the rtnl_lock() + */ +void bnxt_vf_reps_open(struct bnxt *bp) +{ + int i; + + if (bp->eswitch_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV) + return; + + for (i = 0; i < pci_num_vf(bp->pdev); i++) + bnxt_vf_rep_open(bp->vf_reps[i]->dev); +} + +static void __bnxt_vf_reps_destroy(struct bnxt *bp) +{ + u16 num_vfs = pci_num_vf(bp->pdev); + struct bnxt_vf_rep *vf_rep; + int i; + + for (i = 0; i < num_vfs; i++) { + vf_rep = bp->vf_reps[i]; + if (vf_rep) { + dst_release((struct dst_entry *)vf_rep->dst); + + if (vf_rep->tx_cfa_action != CFA_HANDLE_INVALID) + hwrm_cfa_vfr_free(bp, vf_rep->vf_idx); + + if (vf_rep->dev) { + /* if register_netdev failed, then netdev_ops + * would have been set to NULL + */ + if (vf_rep->dev->netdev_ops) + unregister_netdev(vf_rep->dev); + free_netdev(vf_rep->dev); + } + } + } + + kfree(bp->vf_reps); + bp->vf_reps = NULL; +} + +void bnxt_vf_reps_destroy(struct bnxt *bp) +{ + bool closed = false; + + if (bp->eswitch_mode != DEVLINK_ESWITCH_MODE_SWITCHDEV) + return; + + if (!bp->vf_reps) + return; + + /* Ensure that parent PF's and VF-reps' RX/TX has been quiesced + * before proceeding with VF-rep cleanup. 
+ */ + rtnl_lock(); + if (netif_running(bp->dev)) { + bnxt_close_nic(bp, false, false); + closed = true; + } + /* un-publish cfa_code_map so that RX path can't see it anymore */ + kfree(bp->cfa_code_map); + bp->cfa_code_map = NULL; + bp->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY; + + if (closed) + bnxt_open_nic(bp, false, false); + rtnl_unlock(); + + /* Need to call vf_reps_destroy() outside of rtnl_lock + * as unregister_netdev takes rtnl_lock + */ + __bnxt_vf_reps_destroy(bp); +} + +/* Use the OUI of the PF's perm addr and report the same mac addr + * for the same VF-rep each time + */ +static void bnxt_vf_rep_eth_addr_gen(u8 *src_mac, u16 vf_idx, u8 *mac) +{ + u32 addr; + + ether_addr_copy(mac, src_mac); + + addr = jhash(src_mac, ETH_ALEN, 0) + vf_idx; + mac[3] = (u8)(addr & 0xFF); + mac[4] = (u8)((addr >> 8) & 0xFF); + mac[5] = (u8)((addr >> 16) & 0xFF); +} + +static void bnxt_vf_rep_netdev_init(struct bnxt *bp, struct bnxt_vf_rep *vf_rep, + struct net_device *dev) +{ + struct net_device *pf_dev = bp->dev; + u16 max_mtu; + + dev->netdev_ops = &bnxt_vf_rep_netdev_ops; + dev->ethtool_ops = &bnxt_vf_rep_ethtool_ops; + SWITCHDEV_SET_OPS(dev, &bnxt_vf_rep_switchdev_ops); + /* Just inherit all the features of the parent PF as the VF-R + * uses the RX/TX rings of the parent PF + */ + dev->hw_features = pf_dev->hw_features; + dev->gso_partial_features = pf_dev->gso_partial_features; + dev->vlan_features = pf_dev->vlan_features; + dev->hw_enc_features = pf_dev->hw_enc_features; + dev->features |= pf_dev->features; + bnxt_vf_rep_eth_addr_gen(bp->pf.mac_addr, vf_rep->vf_idx, + dev->perm_addr); + ether_addr_copy(dev->dev_addr, dev->perm_addr); + /* Set VF-Rep's max-mtu to the corresponding VF's max-mtu */ + if (!bnxt_hwrm_vfr_qcfg(bp, vf_rep, &max_mtu)) + dev->max_mtu = max_mtu; + dev->min_mtu = ETH_ZLEN; +} + +static int bnxt_pcie_dsn_get(struct bnxt *bp, u8 dsn[]) +{ + struct pci_dev *pdev = bp->pdev; + int pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DSN); + u32 dw; + + if (!pos) { + netdev_info(bp->dev, "Unable to read adapter's DSN"); + return -EOPNOTSUPP; + } + + /* DSN (two dw) is at an offset of 4 from the cap pos */ + pos += 4; + pci_read_config_dword(pdev, pos, &dw); + put_unaligned_le32(dw, &dsn[0]); + pci_read_config_dword(pdev, pos + 4, &dw); + put_unaligned_le32(dw, &dsn[4]); + return 0; +} + +static int bnxt_vf_reps_create(struct bnxt *bp) +{ + u16 *cfa_code_map = NULL, num_vfs = pci_num_vf(bp->pdev); + struct bnxt_vf_rep *vf_rep; + struct net_device *dev; + int rc, i; + + bp->vf_reps = kcalloc(num_vfs, sizeof(vf_rep), GFP_KERNEL); + if (!bp->vf_reps) + return -ENOMEM; + + /* storage for cfa_code to vf-idx mapping */ + cfa_code_map = kmalloc(sizeof(*bp->cfa_code_map) * MAX_CFA_CODE, + GFP_KERNEL); + if (!cfa_code_map) { + rc = -ENOMEM; + goto err; + } + for (i = 0; i < MAX_CFA_CODE; i++) + cfa_code_map[i] = VF_IDX_INVALID; + + for (i = 0; i < num_vfs; i++) { + dev = alloc_etherdev(sizeof(*vf_rep)); + if (!dev) { + rc = -ENOMEM; + goto err; + } + + vf_rep = netdev_priv(dev); + bp->vf_reps[i] = vf_rep; + vf_rep->dev = dev; + vf_rep->bp = bp; + vf_rep->vf_idx = i; + vf_rep->tx_cfa_action = CFA_HANDLE_INVALID; + + /* get cfa handles from FW */ + rc = hwrm_cfa_vfr_alloc(bp, vf_rep->vf_idx, + &vf_rep->tx_cfa_action, + &vf_rep->rx_cfa_code); + if (rc) { + rc = -ENOLINK; + goto err; + } + cfa_code_map[vf_rep->rx_cfa_code] = vf_rep->vf_idx; + + vf_rep->dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, + GFP_KERNEL); + if (!vf_rep->dst) { + rc = -ENOMEM; + goto err; + } + /* only 
cfa_action is needed to mux a packet while TXing */ + vf_rep->dst->u.port_info.port_id = vf_rep->tx_cfa_action; + vf_rep->dst->u.port_info.lower_dev = bp->dev; + + bnxt_vf_rep_netdev_init(bp, vf_rep, dev); + rc = register_netdev(dev); + if (rc) { + /* no need for unregister_netdev in cleanup */ + dev->netdev_ops = NULL; + goto err; + } + } + + /* Read the adapter's DSN to use as the eswitch switch_id */ + rc = bnxt_pcie_dsn_get(bp, bp->switch_id); + if (rc) + goto err; + + /* publish cfa_code_map only after all VF-reps have been initialized */ + bp->cfa_code_map = cfa_code_map; + bp->eswitch_mode = DEVLINK_ESWITCH_MODE_SWITCHDEV; + netif_keep_dst(bp->dev); + return 0; + +err: + netdev_info(bp->dev, "%s error=%d", __func__, rc); + kfree(cfa_code_map); + __bnxt_vf_reps_destroy(bp); + return rc; +} + +/* Devlink related routines */ +int bnxt_dl_eswitch_mode_get(struct devlink *devlink, u16 *mode) +{ + struct bnxt *bp = bnxt_get_bp_from_dl(devlink); + + *mode = bp->eswitch_mode; + return 0; +} + +int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode) +{ + struct bnxt *bp = bnxt_get_bp_from_dl(devlink); + int rc = 0; + + mutex_lock(&bp->sriov_lock); + if (bp->eswitch_mode == mode) { + netdev_info(bp->dev, "already in %s eswitch mode", + mode == DEVLINK_ESWITCH_MODE_LEGACY ? + "legacy" : "switchdev"); + rc = -EINVAL; + goto done; + } + + switch (mode) { + case DEVLINK_ESWITCH_MODE_LEGACY: + bnxt_vf_reps_destroy(bp); + break; + + case DEVLINK_ESWITCH_MODE_SWITCHDEV: + if (pci_num_vf(bp->pdev) == 0) { + netdev_info(bp->dev, + "Enable VFs before setting switchdev mode"); + rc = -EPERM; + goto done; + } + rc = bnxt_vf_reps_create(bp); + break; + + default: + rc = -EINVAL; + goto done; + } +done: + mutex_unlock(&bp->sriov_lock); + return rc; +} + +#endif diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h new file mode 100644 index 0000000..38b9a75 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.h @@ -0,0 +1,64 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2016-2017 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ */ + +#ifndef BNXT_VFR_H +#define BNXT_VFR_H + +#ifdef CONFIG_BNXT_SRIOV + +#define MAX_CFA_CODE 65536 + +void bnxt_vf_reps_destroy(struct bnxt *bp); +void bnxt_vf_reps_close(struct bnxt *bp); +void bnxt_vf_reps_open(struct bnxt *bp); +void bnxt_vf_rep_rx(struct bnxt *bp, struct sk_buff *skb); +struct net_device *bnxt_get_vf_rep(struct bnxt *bp, u16 cfa_code); + +static inline u16 bnxt_vf_rep_get_fid(struct net_device *dev) +{ + struct bnxt_vf_rep *vf_rep = netdev_priv(dev); + struct bnxt *bp = vf_rep->bp; + + return bp->pf.vf[vf_rep->vf_idx].fw_fid; +} + +bool bnxt_dev_is_vf_rep(struct net_device *dev); +int bnxt_dl_eswitch_mode_get(struct devlink *devlink, u16 *mode); +int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode); + +#else + +static inline void bnxt_vf_reps_close(struct bnxt *bp) +{ +} + +static inline void bnxt_vf_reps_open(struct bnxt *bp) +{ +} + +static inline void bnxt_vf_rep_rx(struct bnxt *bp, struct sk_buff *skb) +{ +} + +static inline struct net_device *bnxt_get_vf_rep(struct bnxt *bp, u16 cfa_code) +{ + return NULL; +} + +static inline u16 bnxt_vf_rep_get_fid(struct net_device *dev) +{ + return 0; +} + +static inline bool bnxt_dev_is_vf_rep(struct net_device *dev) +{ + return false; +} +#endif /* CONFIG_BNXT_SRIOV */ +#endif /* BNXT_VFR_H */ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c new file mode 100644 index 0000000..1389ab5 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -0,0 +1,231 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2016-2017 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bnxt_hsi.h" +#include "bnxt.h" +#include "bnxt_xdp.h" + +void bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr, + dma_addr_t mapping, u32 len, u16 rx_prod) +{ + struct bnxt_sw_tx_bd *tx_buf; + struct tx_bd *txbd; + u32 flags; + u16 prod; + + prod = txr->tx_prod; + tx_buf = &txr->tx_buf_ring[prod]; + tx_buf->rx_prod = rx_prod; + + txbd = &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)]; + flags = (len << TX_BD_LEN_SHIFT) | (1 << TX_BD_FLAGS_BD_CNT_SHIFT) | + TX_BD_FLAGS_PACKET_END | bnxt_lhint_arr[len >> 9]; + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); + txbd->tx_bd_opaque = prod; + txbd->tx_bd_haddr = cpu_to_le64(mapping); + + prod = NEXT_TX(prod); + txr->tx_prod = prod; +} + +void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts) +{ + struct bnxt_tx_ring_info *txr = bnapi->tx_ring; + struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; + struct bnxt_sw_tx_bd *tx_buf; + u16 tx_cons = txr->tx_cons; + u16 last_tx_cons = tx_cons; + u16 rx_prod; + int i; + + for (i = 0; i < nr_pkts; i++) { + last_tx_cons = tx_cons; + tx_cons = NEXT_TX(tx_cons); + } + txr->tx_cons = tx_cons; + if (bnxt_tx_avail(bp, txr) == bp->tx_ring_size) { + rx_prod = rxr->rx_prod; + } else { + tx_buf = &txr->tx_buf_ring[last_tx_cons]; + rx_prod = tx_buf->rx_prod; + } + bnxt_db_write(bp, rxr->rx_doorbell, DB_KEY_RX | rx_prod); +} + +/* returns the following: + * true - packet consumed by XDP and new buffer is allocated. + * false - packet should be passed to the stack. 
+ */ +bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, + struct page *page, u8 **data_ptr, unsigned int *len, u8 *event) +{ + struct bpf_prog *xdp_prog = READ_ONCE(rxr->xdp_prog); + struct bnxt_tx_ring_info *txr; + struct bnxt_sw_rx_bd *rx_buf; + struct pci_dev *pdev; + struct xdp_buff xdp; + dma_addr_t mapping; + void *orig_data; + u32 tx_avail; + u32 offset; + u32 act; + + if (!xdp_prog) + return false; + + pdev = bp->pdev; + txr = rxr->bnapi->tx_ring; + rx_buf = &rxr->rx_buf_ring[cons]; + offset = bp->rx_offset; + + xdp.data_hard_start = *data_ptr - offset; + xdp.data = *data_ptr; + xdp_set_data_meta_invalid(&xdp); + xdp.data_end = *data_ptr + *len; + xdp.rxq = &rxr->xdp_rxq; + orig_data = xdp.data; + mapping = rx_buf->mapping - bp->rx_dma_offset; + + dma_sync_single_for_cpu(&pdev->dev, mapping + offset, *len, bp->rx_dir); + + rcu_read_lock(); + act = bpf_prog_run_xdp(xdp_prog, &xdp); + rcu_read_unlock(); + + tx_avail = bnxt_tx_avail(bp, txr); + /* If the tx ring is not full, we must not update the rx producer yet + * because we may still be transmitting on some BDs. + */ + if (tx_avail != bp->tx_ring_size) + *event &= ~BNXT_RX_EVENT; + + if (orig_data != xdp.data) { + offset = xdp.data - xdp.data_hard_start; + *data_ptr = xdp.data_hard_start + offset; + *len = xdp.data_end - xdp.data; + } + switch (act) { + case XDP_PASS: + return false; + + case XDP_TX: + if (tx_avail < 1) { + trace_xdp_exception(bp->dev, xdp_prog, act); + bnxt_reuse_rx_data(rxr, cons, page); + return true; + } + + *event = BNXT_TX_EVENT; + dma_sync_single_for_device(&pdev->dev, mapping + offset, *len, + bp->rx_dir); + bnxt_xmit_xdp(bp, txr, mapping + offset, *len, + NEXT_RX(rxr->rx_prod)); + bnxt_reuse_rx_data(rxr, cons, page); + return true; + default: + bpf_warn_invalid_xdp_action(act); + /* Fall thru */ + case XDP_ABORTED: + trace_xdp_exception(bp->dev, xdp_prog, act); + /* Fall thru */ + case XDP_DROP: + bnxt_reuse_rx_data(rxr, cons, page); + break; + } + return true; +} + +/* Under rtnl_lock */ +static int bnxt_xdp_set(struct bnxt *bp, struct bpf_prog *prog) +{ + struct net_device *dev = bp->dev; + int tx_xdp = 0, rc, tc; + struct bpf_prog *old; + + if (prog && bp->dev->mtu > BNXT_MAX_PAGE_MODE_MTU) { + netdev_warn(dev, "MTU %d larger than largest XDP supported MTU %d.\n", + bp->dev->mtu, BNXT_MAX_PAGE_MODE_MTU); + return -EOPNOTSUPP; + } + if (!(bp->flags & BNXT_FLAG_SHARED_RINGS)) { + netdev_warn(dev, "ethtool rx/tx channels must be combined to support XDP.\n"); + return -EOPNOTSUPP; + } + if (prog) + tx_xdp = bp->rx_nr_rings; + + tc = netdev_get_num_tc(dev); + if (!tc) + tc = 1; + rc = bnxt_check_rings(bp, bp->tx_nr_rings_per_tc, bp->rx_nr_rings, + true, tc, tx_xdp); + if (rc) { + netdev_warn(dev, "Unable to reserve enough TX rings to support XDP.\n"); + return rc; + } + if (netif_running(dev)) + bnxt_close_nic(bp, true, false); + + old = xchg(&bp->xdp_prog, prog); + if (old) + bpf_prog_put(old); + + if (prog) { + bnxt_set_rx_skb_mode(bp, true); + } else { + int rx, tx; + + bnxt_set_rx_skb_mode(bp, false); + bnxt_get_max_rings(bp, &rx, &tx, true); + if (rx > 1) { + bp->flags &= ~BNXT_FLAG_NO_AGG_RINGS; + bp->dev->hw_features |= NETIF_F_LRO; + } + } + bp->tx_nr_rings_xdp = tx_xdp; + bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tc + tx_xdp; + bp->cp_nr_rings = max_t(int, bp->tx_nr_rings, bp->rx_nr_rings); + bp->num_stat_ctxs = bp->cp_nr_rings; + bnxt_set_tpa_flags(bp); + bnxt_set_ring_params(bp); + + if (netif_running(dev)) + return bnxt_open_nic(bp, true, false); + + return 0; +} + +int 
bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) +{ + struct bnxt *bp = netdev_priv(dev); + int rc; + + switch (xdp->command) { + case XDP_SETUP_PROG: + rc = bnxt_xdp_set(bp, xdp->prog); + break; + case XDP_QUERY_PROG: + xdp->prog_attached = !!bp->xdp_prog; + xdp->prog_id = bp->xdp_prog ? bp->xdp_prog->aux->id : 0; + rc = 0; + break; + default: + rc = -EINVAL; + break; + } + return rc; +} diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h new file mode 100644 index 0000000..414b748 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -0,0 +1,21 @@ +/* Broadcom NetXtreme-C/E network driver. + * + * Copyright (c) 2016-2017 Broadcom Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#ifndef BNXT_XDP_H +#define BNXT_XDP_H + +void bnxt_xmit_xdp(struct bnxt *bp, struct bnxt_tx_ring_info *txr, + dma_addr_t mapping, u32 len, u16 rx_prod); +void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts); +bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, + struct page *page, u8 **data_ptr, unsigned int *len, + u8 *event); +int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp); + +#endif diff --git a/drivers/net/ethernet/chelsio/Kconfig b/drivers/net/ethernet/chelsio/Kconfig new file mode 100644 index 0000000..e2cdfa7 --- /dev/null +++ b/drivers/net/ethernet/chelsio/Kconfig @@ -0,0 +1,135 @@ +# +# Chelsio device configuration +# + +config NET_VENDOR_CHELSIO + bool "Chelsio devices" + default y + depends on PCI + ---help--- + If you have a network (Ethernet) card belonging to this class, say Y. + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about Chelsio devices. If you say Y, you will be asked for + your specific card in the following questions. + +if NET_VENDOR_CHELSIO + +config CHELSIO_T1 + tristate "Chelsio 10Gb Ethernet support" + depends on PCI + select CRC32 + select MDIO + ---help--- + This driver supports Chelsio gigabit and 10-gigabit + Ethernet cards. More information about adapter features and + performance tuning is in . + + For general information about Chelsio and our products, visit + our website at . + + For customer support, please visit our customer support page at + . + + Please send feedback to . + + To compile this driver as a module, choose M here: the module + will be called cxgb. + +config CHELSIO_T1_1G + bool "Chelsio gigabit Ethernet support" + depends on CHELSIO_T1 + ---help--- + Enables support for Chelsio's gigabit Ethernet PCI cards. If you + are using only 10G cards say 'N' here. + +config CHELSIO_T3 + tristate "Chelsio Communications T3 10Gb Ethernet support" + depends on PCI && INET + select FW_LOADER + select MDIO + ---help--- + This driver supports Chelsio T3-based gigabit and 10Gb Ethernet + adapters. + + For general information about Chelsio and our products, visit + our website at . + + For customer support, please visit our customer support page at + . + + Please send feedback to . + + To compile this driver as a module, choose M here: the module + will be called cxgb3. 
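For illustration only, a minimal .config fragment matching the entry above (assuming PCI and INET are already enabled, as the "depends on" line requires; FW_LOADER and MDIO are pulled in automatically by the "select" lines) would build the T3 driver as the cxgb3 module named in the help text:

    CONFIG_NET_VENDOR_CHELSIO=y
    CONFIG_CHELSIO_T3=m

The resulting module is then loaded with "modprobe cxgb3".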
+ +config CHELSIO_T4 + tristate "Chelsio Communications T4/T5/T6 Ethernet support" + depends on PCI && (IPV6 || IPV6=n) + select FW_LOADER + select MDIO + select ZLIB_DEFLATE + ---help--- + This driver supports Chelsio T4, T5 & T6 based gigabit, 10Gb Ethernet + adapter and T5/T6 based 40Gb and T6 based 25Gb, 50Gb and 100Gb + Ethernet adapters. + + For general information about Chelsio and our products, visit + our website at . + + For customer support, please visit our customer support page at + . + + Please send feedback to . + + To compile this driver as a module choose M here; the module + will be called cxgb4. + +config CHELSIO_T4_DCB + bool "Data Center Bridging (DCB) Support for Chelsio T4/T5/T6 cards" + default n + depends on CHELSIO_T4 && DCB + ---help--- + Enable DCB support through rtNetlink interface. + Say Y here if you want to enable Data Center Bridging (DCB) support + in the driver. + + If unsure, say N. + +config CHELSIO_T4_FCOE + bool "Fibre Channel over Ethernet (FCoE) Support for Chelsio T5 cards" + default n + depends on CHELSIO_T4 && CHELSIO_T4_DCB && FCOE + ---help--- + Enable FCoE offload features. + Say Y here if you want to enable Fibre Channel over Ethernet (FCoE) support + in the driver. + + If unsure, say N. + +config CHELSIO_T4VF + tristate "Chelsio Communications T4/T5/T6 Virtual Function Ethernet support" + depends on PCI + ---help--- + This driver supports Chelsio T4, T5 & T6 based gigabit, 10Gb Ethernet + adapters and T5/T6 based 40Gb and T6 based 25Gb, 50Gb and 100Gb + Ethernet adapters with PCI-E SR-IOV Virtual Functions. + + For general information about Chelsio and our products, visit + our website at . + + For customer support, please visit our customer support page at + . + + Please send feedback to . + + To compile this driver as a module choose M here; the module + will be called cxgb4vf. + +config CHELSIO_LIB + tristate + ---help--- + Common library for Chelsio drivers. + +endif # NET_VENDOR_CHELSIO diff --git a/drivers/net/ethernet/chelsio/cxgb3/Makefile b/drivers/net/ethernet/chelsio/cxgb3/Makefile new file mode 100644 index 0000000..29aff78 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/Makefile @@ -0,0 +1,8 @@ +# +# Chelsio T3 driver +# + +obj-$(CONFIG_CHELSIO_T3) += cxgb3.o + +cxgb3-objs := cxgb3_main.o ael1002.o vsc8211.o t3_hw.o mc5.o \ + xgmac.o sge.o l2t.o cxgb3_offload.o aq100x.o diff --git a/drivers/net/ethernet/chelsio/cxgb3/adapter.h b/drivers/net/ethernet/chelsio/cxgb3/adapter.h new file mode 100644 index 0000000..087ff0f --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/adapter.h @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* This file should not be included directly. Include common.h instead. */ + +#ifndef __T3_ADAPTER_H__ +#define __T3_ADAPTER_H__ + +#include +#include +#include +#include +#include +#include +#include +#include "t3cdev.h" +#include + +struct adapter; +struct sge_qset; +struct port_info; + +enum mac_idx_types { + LAN_MAC_IDX = 0, + SAN_MAC_IDX, + + MAX_MAC_IDX +}; + +struct iscsi_config { + __u8 mac_addr[ETH_ALEN]; + __u32 flags; + int (*send)(struct port_info *pi, struct sk_buff **skb); + int (*recv)(struct port_info *pi, struct sk_buff *skb); +}; + +struct port_info { + struct adapter *adapter; + struct sge_qset *qs; + u8 port_id; + u8 nqsets; + u8 first_qset; + struct cphy phy; + struct cmac mac; + struct link_config link_config; + int activity; + __be32 iscsi_ipv4addr; + struct iscsi_config iscsic; + + int link_fault; /* link fault was detected */ +}; + +enum { /* adapter flags */ + FULL_INIT_DONE = (1 << 0), + USING_MSI = (1 << 1), + USING_MSIX = (1 << 2), + QUEUES_BOUND = (1 << 3), + TP_PARITY_INIT = (1 << 4), + NAPI_INIT = (1 << 5), +}; + +struct fl_pg_chunk { + struct page *page; + void *va; + unsigned int offset; + unsigned long *p_cnt; + dma_addr_t mapping; +}; + +struct rx_desc; +struct rx_sw_desc; + +struct sge_fl { /* SGE per free-buffer list state */ + unsigned int buf_size; /* size of each Rx buffer */ + unsigned int credits; /* # of available Rx buffers */ + unsigned int pend_cred; /* new buffers since last FL DB ring */ + unsigned int size; /* capacity of free list */ + unsigned int cidx; /* consumer index */ + unsigned int pidx; /* producer index */ + unsigned int gen; /* free list generation */ + struct fl_pg_chunk pg_chunk;/* page chunk cache */ + unsigned int use_pages; /* whether FL uses pages or sk_buffs */ + unsigned int order; /* order of page allocations */ + unsigned int alloc_size; /* size of allocated buffer */ + struct rx_desc *desc; /* address of HW Rx descriptor ring */ + struct rx_sw_desc *sdesc; /* address of SW Rx descriptor ring */ + dma_addr_t phys_addr; /* physical address of HW ring start */ + unsigned int cntxt_id; /* SGE context id for the free list */ + unsigned long empty; /* # of times queue ran out of buffers */ + unsigned long alloc_failed; /* # of times buffer allocation failed */ +}; + +/* + * Bundle size for grouping offload RX packets for delivery to the stack. + * Don't make this too big as we do prefetch on each packet in a bundle. + */ +# define RX_BUNDLE_SIZE 8 + +struct rsp_desc; + +struct sge_rspq { /* state for an SGE response queue */ + unsigned int credits; /* # of pending response credits */ + unsigned int size; /* capacity of response queue */ + unsigned int cidx; /* consumer index */ + unsigned int gen; /* current generation bit */ + unsigned int polling; /* is the queue serviced through NAPI? 
*/ + unsigned int holdoff_tmr; /* interrupt holdoff timer in 100ns */ + unsigned int next_holdoff; /* holdoff time for next interrupt */ + unsigned int rx_recycle_buf; /* whether recycling occurred + within current sop-eop */ + struct rsp_desc *desc; /* address of HW response ring */ + dma_addr_t phys_addr; /* physical address of the ring */ + unsigned int cntxt_id; /* SGE context id for the response q */ + spinlock_t lock; /* guards response processing */ + struct sk_buff_head rx_queue; /* offload packet receive queue */ + struct sk_buff *pg_skb; /* used to build frag list in napi handler */ + + unsigned long offload_pkts; + unsigned long offload_bundles; + unsigned long eth_pkts; /* # of ethernet packets */ + unsigned long pure_rsps; /* # of pure (non-data) responses */ + unsigned long imm_data; /* responses with immediate data */ + unsigned long rx_drops; /* # of packets dropped due to no mem */ + unsigned long async_notif; /* # of asynchronous notification events */ + unsigned long empty; /* # of times queue ran out of credits */ + unsigned long nomem; /* # of responses deferred due to no mem */ + unsigned long unhandled_irqs; /* # of spurious intrs */ + unsigned long starved; + unsigned long restarted; +}; + +struct tx_desc; +struct tx_sw_desc; + +struct sge_txq { /* state for an SGE Tx queue */ + unsigned long flags; /* HW DMA fetch status */ + unsigned int in_use; /* # of in-use Tx descriptors */ + unsigned int size; /* # of descriptors */ + unsigned int processed; /* total # of descs HW has processed */ + unsigned int cleaned; /* total # of descs SW has reclaimed */ + unsigned int stop_thres; /* SW TX queue suspend threshold */ + unsigned int cidx; /* consumer index */ + unsigned int pidx; /* producer index */ + unsigned int gen; /* current value of generation bit */ + unsigned int unacked; /* Tx descriptors used since last COMPL */ + struct tx_desc *desc; /* address of HW Tx descriptor ring */ + struct tx_sw_desc *sdesc; /* address of SW Tx descriptor ring */ + spinlock_t lock; /* guards enqueueing of new packets */ + unsigned int token; /* WR token */ + dma_addr_t phys_addr; /* physical address of the ring */ + struct sk_buff_head sendq; /* List of backpressured offload packets */ + struct tasklet_struct qresume_tsk; /* restarts the queue */ + unsigned int cntxt_id; /* SGE context id for the Tx q */ + unsigned long stops; /* # of times q has been stopped */ + unsigned long restarts; /* # of queue restarts */ +}; + +enum { /* per port SGE statistics */ + SGE_PSTAT_TSO, /* # of TSO requests */ + SGE_PSTAT_RX_CSUM_GOOD, /* # of successful RX csum offloads */ + SGE_PSTAT_TX_CSUM, /* # of TX checksum offloads */ + SGE_PSTAT_VLANEX, /* # of VLAN tag extractions */ + SGE_PSTAT_VLANINS, /* # of VLAN tag insertions */ + + SGE_PSTAT_MAX /* must be last */ +}; + +struct napi_gro_fraginfo; + +struct sge_qset { /* an SGE queue set */ + struct adapter *adap; + struct napi_struct napi; + struct sge_rspq rspq; + struct sge_fl fl[SGE_RXQ_PER_SET]; + struct sge_txq txq[SGE_TXQ_PER_SET]; + int nomem; + void *lro_va; + struct net_device *netdev; + struct netdev_queue *tx_q; /* associated netdev TX queue */ + unsigned long txq_stopped; /* which Tx queues are stopped */ + struct timer_list tx_reclaim_timer; /* reclaims TX buffers */ + struct timer_list rx_reclaim_timer; /* reclaims RX buffers */ + unsigned long port_stats[SGE_PSTAT_MAX]; +} ____cacheline_aligned; + +struct sge { + struct sge_qset qs[SGE_QSETS]; + spinlock_t reg_lock; /* guards non-atomic SGE registers (eg context) */ +}; + +struct 
adapter { + struct t3cdev tdev; + struct list_head adapter_list; + void __iomem *regs; + struct pci_dev *pdev; + unsigned long registered_device_map; + unsigned long open_device_map; + unsigned long flags; + + const char *name; + int msg_enable; + unsigned int mmio_len; + + struct adapter_params params; + unsigned int slow_intr_mask; + unsigned long irq_stats[IRQ_NUM_STATS]; + + int msix_nvectors; + struct { + unsigned short vec; + char desc[22]; + } msix_info[SGE_QSETS + 1]; + + /* T3 modules */ + struct sge sge; + struct mc7 pmrx; + struct mc7 pmtx; + struct mc7 cm; + struct mc5 mc5; + + struct net_device *port[MAX_NPORTS]; + unsigned int check_task_cnt; + struct delayed_work adap_check_task; + struct work_struct ext_intr_handler_task; + struct work_struct fatal_error_handler_task; + struct work_struct link_fault_handler_task; + + struct work_struct db_full_task; + struct work_struct db_empty_task; + struct work_struct db_drop_task; + + struct dentry *debugfs_root; + + struct mutex mdio_lock; + spinlock_t stats_lock; + spinlock_t work_lock; + + struct sk_buff *nofail_skb; +}; + +static inline u32 t3_read_reg(struct adapter *adapter, u32 reg_addr) +{ + u32 val = readl(adapter->regs + reg_addr); + + CH_DBG(adapter, MMIO, "read register 0x%x value 0x%x\n", reg_addr, val); + return val; +} + +static inline void t3_write_reg(struct adapter *adapter, u32 reg_addr, u32 val) +{ + CH_DBG(adapter, MMIO, "setting register 0x%x to 0x%x\n", reg_addr, val); + writel(val, adapter->regs + reg_addr); +} + +static inline struct port_info *adap2pinfo(struct adapter *adap, int idx) +{ + return netdev_priv(adap->port[idx]); +} + +static inline int phy2portid(struct cphy *phy) +{ + struct adapter *adap = phy->adapter; + struct port_info *port0 = adap2pinfo(adap, 0); + + return &port0->phy == phy ? 
0 : 1; +} + +#define OFFLOAD_DEVMAP_BIT 15 + +#define tdev2adap(d) container_of(d, struct adapter, tdev) + +static inline int offload_running(struct adapter *adapter) +{ + return test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map); +} + +int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb); + +void t3_os_ext_intr_handler(struct adapter *adapter); +void t3_os_link_changed(struct adapter *adapter, int port_id, int link_status, + int speed, int duplex, int fc); +void t3_os_phymod_changed(struct adapter *adap, int port_id); +void t3_os_link_fault(struct adapter *adapter, int port_id, int state); +void t3_os_link_fault_handler(struct adapter *adapter, int port_id); + +void t3_sge_start(struct adapter *adap); +void t3_sge_stop(struct adapter *adap); +void t3_start_sge_timers(struct adapter *adap); +void t3_stop_sge_timers(struct adapter *adap); +void t3_free_sge_resources(struct adapter *adap); +void t3_sge_err_intr_handler(struct adapter *adapter); +irq_handler_t t3_intr_handler(struct adapter *adap, int polling); +netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev); +int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb); +void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p); +int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports, + int irq_vec_idx, const struct qset_params *p, + int ntxq, struct net_device *dev, + struct netdev_queue *netdevq); +extern struct workqueue_struct *cxgb3_wq; + +int t3_get_edc_fw(struct cphy *phy, int edc_idx, int size); + +#endif /* __T3_ADAPTER_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/ael1002.c b/drivers/net/ethernet/chelsio/cxgb3/ael1002.c new file mode 100644 index 0000000..dadf11e --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/ael1002.c @@ -0,0 +1,941 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "common.h" +#include "regs.h" + +enum { + AEL100X_TX_CONFIG1 = 0xc002, + AEL1002_PWR_DOWN_HI = 0xc011, + AEL1002_PWR_DOWN_LO = 0xc012, + AEL1002_XFI_EQL = 0xc015, + AEL1002_LB_EN = 0xc017, + AEL_OPT_SETTINGS = 0xc017, + AEL_I2C_CTRL = 0xc30a, + AEL_I2C_DATA = 0xc30b, + AEL_I2C_STAT = 0xc30c, + AEL2005_GPIO_CTRL = 0xc214, + AEL2005_GPIO_STAT = 0xc215, + + AEL2020_GPIO_INTR = 0xc103, /* Latch High (LH) */ + AEL2020_GPIO_CTRL = 0xc108, /* Store Clear (SC) */ + AEL2020_GPIO_STAT = 0xc10c, /* Read Only (RO) */ + AEL2020_GPIO_CFG = 0xc110, /* Read Write (RW) */ + + AEL2020_GPIO_SDA = 0, /* IN: i2c serial data */ + AEL2020_GPIO_MODDET = 1, /* IN: Module Detect */ + AEL2020_GPIO_0 = 3, /* IN: unassigned */ + AEL2020_GPIO_1 = 2, /* OUT: unassigned */ + AEL2020_GPIO_LSTAT = AEL2020_GPIO_1, /* wired to link status LED */ +}; + +enum { edc_none, edc_sr, edc_twinax }; + +/* PHY module I2C device address */ +enum { + MODULE_DEV_ADDR = 0xa0, + SFF_DEV_ADDR = 0xa2, +}; + +/* PHY transceiver type */ +enum { + phy_transtype_unknown = 0, + phy_transtype_sfp = 3, + phy_transtype_xfp = 6, +}; + +#define AEL2005_MODDET_IRQ 4 + +struct reg_val { + unsigned short mmd_addr; + unsigned short reg_addr; + unsigned short clear_bits; + unsigned short set_bits; +}; + +static int set_phy_regs(struct cphy *phy, const struct reg_val *rv) +{ + int err; + + for (err = 0; rv->mmd_addr && !err; rv++) { + if (rv->clear_bits == 0xffff) + err = t3_mdio_write(phy, rv->mmd_addr, rv->reg_addr, + rv->set_bits); + else + err = t3_mdio_change_bits(phy, rv->mmd_addr, + rv->reg_addr, rv->clear_bits, + rv->set_bits); + } + return err; +} + +static void ael100x_txon(struct cphy *phy) +{ + int tx_on_gpio = + phy->mdio.prtad == 0 ? F_GPIO7_OUT_VAL : F_GPIO2_OUT_VAL; + + msleep(100); + t3_set_reg_field(phy->adapter, A_T3DBG_GPIO_EN, 0, tx_on_gpio); + msleep(30); +} + +/* + * Read an 8-bit word from a device attached to the PHY's i2c bus. + */ +static int ael_i2c_rd(struct cphy *phy, int dev_addr, int word_addr) +{ + int i, err; + unsigned int stat, data; + + err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL_I2C_CTRL, + (dev_addr << 8) | (1 << 8) | word_addr); + if (err) + return err; + + for (i = 0; i < 200; i++) { + msleep(1); + err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL_I2C_STAT, &stat); + if (err) + return err; + if ((stat & 3) == 1) { + err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL_I2C_DATA, + &data); + if (err) + return err; + return data >> 8; + } + } + CH_WARN(phy->adapter, "PHY %u i2c read of dev.addr %#x.%#x timed out\n", + phy->mdio.prtad, dev_addr, word_addr); + return -ETIMEDOUT; +} + +static int ael1002_power_down(struct cphy *phy, int enable) +{ + int err; + + err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, MDIO_PMA_TXDIS, !!enable); + if (!err) + err = mdio_set_flag(&phy->mdio, phy->mdio.prtad, + MDIO_MMD_PMAPMD, MDIO_CTRL1, + MDIO_CTRL1_LPOWER, enable); + return err; +} + +static int ael1002_reset(struct cphy *phy, int wait) +{ + int err; + + if ((err = ael1002_power_down(phy, 0)) || + (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL100X_TX_CONFIG1, 1)) || + (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL1002_PWR_DOWN_HI, 0)) || + (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL1002_PWR_DOWN_LO, 0)) || + (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL1002_XFI_EQL, 0x18)) || + (err = t3_mdio_change_bits(phy, MDIO_MMD_PMAPMD, AEL1002_LB_EN, + 0, 1 << 5))) + return err; + return 0; +} + +static int ael1002_intr_noop(struct cphy *phy) +{ + return 0; +} + +/* + * Get link status for a 10GBASE-R device. 
+ */ +static int get_link_status_r(struct cphy *phy, int *link_ok, int *speed, + int *duplex, int *fc) +{ + if (link_ok) { + unsigned int stat0, stat1, stat2; + int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, + MDIO_PMA_RXDET, &stat0); + + if (!err) + err = t3_mdio_read(phy, MDIO_MMD_PCS, + MDIO_PCS_10GBRT_STAT1, &stat1); + if (!err) + err = t3_mdio_read(phy, MDIO_MMD_PHYXS, + MDIO_PHYXS_LNSTAT, &stat2); + if (err) + return err; + *link_ok = (stat0 & stat1 & (stat2 >> 12)) & 1; + } + if (speed) + *speed = SPEED_10000; + if (duplex) + *duplex = DUPLEX_FULL; + return 0; +} + +static const struct cphy_ops ael1002_ops = { + .reset = ael1002_reset, + .intr_enable = ael1002_intr_noop, + .intr_disable = ael1002_intr_noop, + .intr_clear = ael1002_intr_noop, + .intr_handler = ael1002_intr_noop, + .get_link_status = get_link_status_r, + .power_down = ael1002_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_ael1002_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &ael1002_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE, + "10GBASE-R"); + ael100x_txon(phy); + return 0; +} + +static int ael1006_reset(struct cphy *phy, int wait) +{ + return t3_phy_reset(phy, MDIO_MMD_PMAPMD, wait); +} + +static const struct cphy_ops ael1006_ops = { + .reset = ael1006_reset, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, + .get_link_status = get_link_status_r, + .power_down = ael1002_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_ael1006_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &ael1006_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE, + "10GBASE-SR"); + ael100x_txon(phy); + return 0; +} + +/* + * Decode our module type. + */ +static int ael2xxx_get_module_type(struct cphy *phy, int delay_ms) +{ + int v; + + if (delay_ms) + msleep(delay_ms); + + /* see SFF-8472 for below */ + v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 3); + if (v < 0) + return v; + + if (v == 0x10) + return phy_modtype_sr; + if (v == 0x20) + return phy_modtype_lr; + if (v == 0x40) + return phy_modtype_lrm; + + v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 6); + if (v < 0) + return v; + if (v != 4) + goto unknown; + + v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 10); + if (v < 0) + return v; + + if (v & 0x80) { + v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 0x12); + if (v < 0) + return v; + return v > 10 ? phy_modtype_twinax_long : phy_modtype_twinax; + } +unknown: + return phy_modtype_unknown; +} + +/* + * Code to support the Aeluros/NetLogic 2005 10Gb PHY. 
+ */ +static int ael2005_setup_sr_edc(struct cphy *phy) +{ + static const struct reg_val regs[] = { + { MDIO_MMD_PMAPMD, 0xc003, 0xffff, 0x181 }, + { MDIO_MMD_PMAPMD, 0xc010, 0xffff, 0x448a }, + { MDIO_MMD_PMAPMD, 0xc04a, 0xffff, 0x5200 }, + { 0, 0, 0, 0 } + }; + + int i, err; + + err = set_phy_regs(phy, regs); + if (err) + return err; + + msleep(50); + + if (phy->priv != edc_sr) + err = t3_get_edc_fw(phy, EDC_OPT_AEL2005, + EDC_OPT_AEL2005_SIZE); + if (err) + return err; + + for (i = 0; i < EDC_OPT_AEL2005_SIZE / sizeof(u16) && !err; i += 2) + err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, + phy->phy_cache[i], + phy->phy_cache[i + 1]); + if (!err) + phy->priv = edc_sr; + return err; +} + +static int ael2005_setup_twinax_edc(struct cphy *phy, int modtype) +{ + static const struct reg_val regs[] = { + { MDIO_MMD_PMAPMD, 0xc04a, 0xffff, 0x5a00 }, + { 0, 0, 0, 0 } + }; + static const struct reg_val preemphasis[] = { + { MDIO_MMD_PMAPMD, 0xc014, 0xffff, 0xfe16 }, + { MDIO_MMD_PMAPMD, 0xc015, 0xffff, 0xa000 }, + { 0, 0, 0, 0 } + }; + int i, err; + + err = set_phy_regs(phy, regs); + if (!err && modtype == phy_modtype_twinax_long) + err = set_phy_regs(phy, preemphasis); + if (err) + return err; + + msleep(50); + + if (phy->priv != edc_twinax) + err = t3_get_edc_fw(phy, EDC_TWX_AEL2005, + EDC_TWX_AEL2005_SIZE); + if (err) + return err; + + for (i = 0; i < EDC_TWX_AEL2005_SIZE / sizeof(u16) && !err; i += 2) + err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, + phy->phy_cache[i], + phy->phy_cache[i + 1]); + if (!err) + phy->priv = edc_twinax; + return err; +} + +static int ael2005_get_module_type(struct cphy *phy, int delay_ms) +{ + int v; + unsigned int stat; + + v = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, &stat); + if (v) + return v; + + if (stat & (1 << 8)) /* module absent */ + return phy_modtype_none; + + return ael2xxx_get_module_type(phy, delay_ms); +} + +static int ael2005_intr_enable(struct cphy *phy) +{ + int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, 0x200); + return err ? err : t3_phy_lasi_intr_enable(phy); +} + +static int ael2005_intr_disable(struct cphy *phy) +{ + int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, 0x100); + return err ? err : t3_phy_lasi_intr_disable(phy); +} + +static int ael2005_intr_clear(struct cphy *phy) +{ + int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, 0xd00); + return err ? 
err : t3_phy_lasi_intr_clear(phy); +} + +static int ael2005_reset(struct cphy *phy, int wait) +{ + static const struct reg_val regs0[] = { + { MDIO_MMD_PMAPMD, 0xc001, 0, 1 << 5 }, + { MDIO_MMD_PMAPMD, 0xc017, 0, 1 << 5 }, + { MDIO_MMD_PMAPMD, 0xc013, 0xffff, 0xf341 }, + { MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0x8000 }, + { MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0x8100 }, + { MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0x8000 }, + { MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0 }, + { 0, 0, 0, 0 } + }; + static const struct reg_val regs1[] = { + { MDIO_MMD_PMAPMD, 0xca00, 0xffff, 0x0080 }, + { MDIO_MMD_PMAPMD, 0xca12, 0xffff, 0 }, + { 0, 0, 0, 0 } + }; + + int err; + unsigned int lasi_ctrl; + + err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL, + &lasi_ctrl); + if (err) + return err; + + err = t3_phy_reset(phy, MDIO_MMD_PMAPMD, 0); + if (err) + return err; + + msleep(125); + phy->priv = edc_none; + err = set_phy_regs(phy, regs0); + if (err) + return err; + + msleep(50); + + err = ael2005_get_module_type(phy, 0); + if (err < 0) + return err; + phy->modtype = err; + + if (err == phy_modtype_twinax || err == phy_modtype_twinax_long) + err = ael2005_setup_twinax_edc(phy, err); + else + err = ael2005_setup_sr_edc(phy); + if (err) + return err; + + err = set_phy_regs(phy, regs1); + if (err) + return err; + + /* reset wipes out interrupts, reenable them if they were on */ + if (lasi_ctrl & 1) + err = ael2005_intr_enable(phy); + return err; +} + +static int ael2005_intr_handler(struct cphy *phy) +{ + unsigned int stat; + int ret, edc_needed, cause = 0; + + ret = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_STAT, &stat); + if (ret) + return ret; + + if (stat & AEL2005_MODDET_IRQ) { + ret = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, + 0xd00); + if (ret) + return ret; + + /* modules have max 300 ms init time after hot plug */ + ret = ael2005_get_module_type(phy, 300); + if (ret < 0) + return ret; + + phy->modtype = ret; + if (ret == phy_modtype_none) + edc_needed = phy->priv; /* on unplug retain EDC */ + else if (ret == phy_modtype_twinax || + ret == phy_modtype_twinax_long) + edc_needed = edc_twinax; + else + edc_needed = edc_sr; + + if (edc_needed != phy->priv) { + ret = ael2005_reset(phy, 0); + return ret ? ret : cphy_cause_module_change; + } + cause = cphy_cause_module_change; + } + + ret = t3_phy_lasi_intr_handler(phy); + if (ret < 0) + return ret; + + ret |= cause; + return ret ? ret : cphy_cause_link_change; +} + +static const struct cphy_ops ael2005_ops = { + .reset = ael2005_reset, + .intr_enable = ael2005_intr_enable, + .intr_disable = ael2005_intr_disable, + .intr_clear = ael2005_intr_clear, + .intr_handler = ael2005_intr_handler, + .get_link_status = get_link_status_r, + .power_down = ael1002_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_ael2005_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &ael2005_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE | + SUPPORTED_IRQ, "10GBASE-R"); + msleep(125); + return t3_mdio_change_bits(phy, MDIO_MMD_PMAPMD, AEL_OPT_SETTINGS, 0, + 1 << 5); +} + +/* + * Setup EDC and other parameters for operation with an optical module. 
+ */ +static int ael2020_setup_sr_edc(struct cphy *phy) +{ + static const struct reg_val regs[] = { + /* set CDR offset to 10 */ + { MDIO_MMD_PMAPMD, 0xcc01, 0xffff, 0x488a }, + + /* adjust 10G RX bias current */ + { MDIO_MMD_PMAPMD, 0xcb1b, 0xffff, 0x0200 }, + { MDIO_MMD_PMAPMD, 0xcb1c, 0xffff, 0x00f0 }, + { MDIO_MMD_PMAPMD, 0xcc06, 0xffff, 0x00e0 }, + + /* end */ + { 0, 0, 0, 0 } + }; + int err; + + err = set_phy_regs(phy, regs); + msleep(50); + if (err) + return err; + + phy->priv = edc_sr; + return 0; +} + +/* + * Setup EDC and other parameters for operation with an TWINAX module. + */ +static int ael2020_setup_twinax_edc(struct cphy *phy, int modtype) +{ + /* set uC to 40MHz */ + static const struct reg_val uCclock40MHz[] = { + { MDIO_MMD_PMAPMD, 0xff28, 0xffff, 0x4001 }, + { MDIO_MMD_PMAPMD, 0xff2a, 0xffff, 0x0002 }, + { 0, 0, 0, 0 } + }; + + /* activate uC clock */ + static const struct reg_val uCclockActivate[] = { + { MDIO_MMD_PMAPMD, 0xd000, 0xffff, 0x5200 }, + { 0, 0, 0, 0 } + }; + + /* set PC to start of SRAM and activate uC */ + static const struct reg_val uCactivate[] = { + { MDIO_MMD_PMAPMD, 0xd080, 0xffff, 0x0100 }, + { MDIO_MMD_PMAPMD, 0xd092, 0xffff, 0x0000 }, + { 0, 0, 0, 0 } + }; + int i, err; + + /* set uC clock and activate it */ + err = set_phy_regs(phy, uCclock40MHz); + msleep(500); + if (err) + return err; + err = set_phy_regs(phy, uCclockActivate); + msleep(500); + if (err) + return err; + + if (phy->priv != edc_twinax) + err = t3_get_edc_fw(phy, EDC_TWX_AEL2020, + EDC_TWX_AEL2020_SIZE); + if (err) + return err; + + for (i = 0; i < EDC_TWX_AEL2020_SIZE / sizeof(u16) && !err; i += 2) + err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, + phy->phy_cache[i], + phy->phy_cache[i + 1]); + /* activate uC */ + err = set_phy_regs(phy, uCactivate); + if (!err) + phy->priv = edc_twinax; + return err; +} + +/* + * Return Module Type. + */ +static int ael2020_get_module_type(struct cphy *phy, int delay_ms) +{ + int v; + unsigned int stat; + + v = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2020_GPIO_STAT, &stat); + if (v) + return v; + + if (stat & (0x1 << (AEL2020_GPIO_MODDET*4))) { + /* module absent */ + return phy_modtype_none; + } + + return ael2xxx_get_module_type(phy, delay_ms); +} + +/* + * Enable PHY interrupts. We enable "Module Detection" interrupts (on any + * state transition) and then generic Link Alarm Status Interrupt (LASI). + */ +static int ael2020_intr_enable(struct cphy *phy) +{ + static const struct reg_val regs[] = { + /* output Module's Loss Of Signal (LOS) to LED */ + { MDIO_MMD_PMAPMD, AEL2020_GPIO_CFG+AEL2020_GPIO_LSTAT, + 0xffff, 0x4 }, + { MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL, + 0xffff, 0x8 << (AEL2020_GPIO_LSTAT*4) }, + + /* enable module detect status change interrupts */ + { MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL, + 0xffff, 0x2 << (AEL2020_GPIO_MODDET*4) }, + + /* end */ + { 0, 0, 0, 0 } + }; + int err, link_ok = 0; + + /* set up "link status" LED and enable module change interrupts */ + err = set_phy_regs(phy, regs); + if (err) + return err; + + err = get_link_status_r(phy, &link_ok, NULL, NULL, NULL); + if (err) + return err; + if (link_ok) + t3_link_changed(phy->adapter, + phy2portid(phy)); + + err = t3_phy_lasi_intr_enable(phy); + if (err) + return err; + + return 0; +} + +/* + * Disable PHY interrupts. The mirror of the above ... 
+ */ +static int ael2020_intr_disable(struct cphy *phy) +{ + static const struct reg_val regs[] = { + /* reset "link status" LED to "off" */ + { MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL, + 0xffff, 0xb << (AEL2020_GPIO_LSTAT*4) }, + + /* disable module detect status change interrupts */ + { MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL, + 0xffff, 0x1 << (AEL2020_GPIO_MODDET*4) }, + + /* end */ + { 0, 0, 0, 0 } + }; + int err; + + /* turn off "link status" LED and disable module change interrupts */ + err = set_phy_regs(phy, regs); + if (err) + return err; + + return t3_phy_lasi_intr_disable(phy); +} + +/* + * Clear PHY interrupt state. + */ +static int ael2020_intr_clear(struct cphy *phy) +{ + /* + * The GPIO Interrupt register on the AEL2020 is a "Latching High" + * (LH) register which is cleared to the current state when it's read. + * Thus, we simply read the register and discard the result. + */ + unsigned int stat; + int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2020_GPIO_INTR, &stat); + return err ? err : t3_phy_lasi_intr_clear(phy); +} + +static const struct reg_val ael2020_reset_regs[] = { + /* Erratum #2: CDRLOL asserted, causing PMA link down status */ + { MDIO_MMD_PMAPMD, 0xc003, 0xffff, 0x3101 }, + + /* force XAUI to send LF when RX_LOS is asserted */ + { MDIO_MMD_PMAPMD, 0xcd40, 0xffff, 0x0001 }, + + /* allow writes to transceiver module EEPROM on i2c bus */ + { MDIO_MMD_PMAPMD, 0xff02, 0xffff, 0x0023 }, + { MDIO_MMD_PMAPMD, 0xff03, 0xffff, 0x0000 }, + { MDIO_MMD_PMAPMD, 0xff04, 0xffff, 0x0000 }, + + /* end */ + { 0, 0, 0, 0 } +}; +/* + * Reset the PHY and put it into a canonical operating state. + */ +static int ael2020_reset(struct cphy *phy, int wait) +{ + int err; + unsigned int lasi_ctrl; + + /* grab current interrupt state */ + err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL, + &lasi_ctrl); + if (err) + return err; + + err = t3_phy_reset(phy, MDIO_MMD_PMAPMD, 125); + if (err) + return err; + msleep(100); + + /* basic initialization for all module types */ + phy->priv = edc_none; + err = set_phy_regs(phy, ael2020_reset_regs); + if (err) + return err; + + /* determine module type and perform appropriate initialization */ + err = ael2020_get_module_type(phy, 0); + if (err < 0) + return err; + phy->modtype = (u8)err; + if (err == phy_modtype_twinax || err == phy_modtype_twinax_long) + err = ael2020_setup_twinax_edc(phy, err); + else + err = ael2020_setup_sr_edc(phy); + if (err) + return err; + + /* reset wipes out interrupts, reenable them if they were on */ + if (lasi_ctrl & 1) + err = ael2005_intr_enable(phy); + return err; +} + +/* + * Handle a PHY interrupt. + */ +static int ael2020_intr_handler(struct cphy *phy) +{ + unsigned int stat; + int ret, edc_needed, cause = 0; + + ret = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2020_GPIO_INTR, &stat); + if (ret) + return ret; + + if (stat & (0x1 << AEL2020_GPIO_MODDET)) { + /* modules have max 300 ms init time after hot plug */ + ret = ael2020_get_module_type(phy, 300); + if (ret < 0) + return ret; + + phy->modtype = (u8)ret; + if (ret == phy_modtype_none) + edc_needed = phy->priv; /* on unplug retain EDC */ + else if (ret == phy_modtype_twinax || + ret == phy_modtype_twinax_long) + edc_needed = edc_twinax; + else + edc_needed = edc_sr; + + if (edc_needed != phy->priv) { + ret = ael2020_reset(phy, 0); + return ret ? ret : cphy_cause_module_change; + } + cause = cphy_cause_module_change; + } + + ret = t3_phy_lasi_intr_handler(phy); + if (ret < 0) + return ret; + + ret |= cause; + return ret ? 
ret : cphy_cause_link_change; +} + +static const struct cphy_ops ael2020_ops = { + .reset = ael2020_reset, + .intr_enable = ael2020_intr_enable, + .intr_disable = ael2020_intr_disable, + .intr_clear = ael2020_intr_clear, + .intr_handler = ael2020_intr_handler, + .get_link_status = get_link_status_r, + .power_down = ael1002_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_ael2020_phy_prep(struct cphy *phy, struct adapter *adapter, int phy_addr, + const struct mdio_ops *mdio_ops) +{ + int err; + + cphy_init(phy, adapter, phy_addr, &ael2020_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE | + SUPPORTED_IRQ, "10GBASE-R"); + msleep(125); + + err = set_phy_regs(phy, ael2020_reset_regs); + if (err) + return err; + return 0; +} + +/* + * Get link status for a 10GBASE-X device. + */ +static int get_link_status_x(struct cphy *phy, int *link_ok, int *speed, + int *duplex, int *fc) +{ + if (link_ok) { + unsigned int stat0, stat1, stat2; + int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, + MDIO_PMA_RXDET, &stat0); + + if (!err) + err = t3_mdio_read(phy, MDIO_MMD_PCS, + MDIO_PCS_10GBX_STAT1, &stat1); + if (!err) + err = t3_mdio_read(phy, MDIO_MMD_PHYXS, + MDIO_PHYXS_LNSTAT, &stat2); + if (err) + return err; + *link_ok = (stat0 & (stat1 >> 12) & (stat2 >> 12)) & 1; + } + if (speed) + *speed = SPEED_10000; + if (duplex) + *duplex = DUPLEX_FULL; + return 0; +} + +static const struct cphy_ops qt2045_ops = { + .reset = ael1006_reset, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, + .get_link_status = get_link_status_x, + .power_down = ael1002_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_qt2045_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + unsigned int stat; + + cphy_init(phy, adapter, phy_addr, &qt2045_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_TP, + "10GBASE-CX4"); + + /* + * Some cards where the PHY is supposed to be at address 0 actually + * have it at 1. 
+ */ + if (!phy_addr && + !t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_STAT1, &stat) && + stat == 0xffff) + phy->mdio.prtad = 1; + return 0; +} + +static int xaui_direct_reset(struct cphy *phy, int wait) +{ + return 0; +} + +static int xaui_direct_get_link_status(struct cphy *phy, int *link_ok, + int *speed, int *duplex, int *fc) +{ + if (link_ok) { + unsigned int status; + int prtad = phy->mdio.prtad; + + status = t3_read_reg(phy->adapter, + XGM_REG(A_XGM_SERDES_STAT0, prtad)) | + t3_read_reg(phy->adapter, + XGM_REG(A_XGM_SERDES_STAT1, prtad)) | + t3_read_reg(phy->adapter, + XGM_REG(A_XGM_SERDES_STAT2, prtad)) | + t3_read_reg(phy->adapter, + XGM_REG(A_XGM_SERDES_STAT3, prtad)); + *link_ok = !(status & F_LOWSIG0); + } + if (speed) + *speed = SPEED_10000; + if (duplex) + *duplex = DUPLEX_FULL; + return 0; +} + +static int xaui_direct_power_down(struct cphy *phy, int enable) +{ + return 0; +} + +static const struct cphy_ops xaui_direct_ops = { + .reset = xaui_direct_reset, + .intr_enable = ael1002_intr_noop, + .intr_disable = ael1002_intr_noop, + .intr_clear = ael1002_intr_noop, + .intr_handler = ael1002_intr_noop, + .get_link_status = xaui_direct_get_link_status, + .power_down = xaui_direct_power_down, +}; + +int t3_xaui_direct_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &xaui_direct_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_TP, + "10GBASE-CX4"); + return 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/aq100x.c b/drivers/net/ethernet/chelsio/cxgb3/aq100x.c new file mode 100644 index 0000000..6af5d20 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/aq100x.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common.h" +#include "regs.h" + +enum { + /* MDIO_DEV_PMA_PMD registers */ + AQ_LINK_STAT = 0xe800, + AQ_IMASK_PMA = 0xf000, + + /* MDIO_DEV_XGXS registers */ + AQ_XAUI_RX_CFG = 0xc400, + AQ_XAUI_TX_CFG = 0xe400, + + /* MDIO_DEV_ANEG registers */ + AQ_1G_CTRL = 0xc400, + AQ_ANEG_STAT = 0xc800, + + /* MDIO_DEV_VEND1 registers */ + AQ_FW_VERSION = 0x0020, + AQ_IFLAG_GLOBAL = 0xfc00, + AQ_IMASK_GLOBAL = 0xff00, +}; + +enum { + IMASK_PMA = 1 << 2, + IMASK_GLOBAL = 1 << 15, + ADV_1G_FULL = 1 << 15, + ADV_1G_HALF = 1 << 14, + ADV_10G_FULL = 1 << 12, + AQ_RESET = (1 << 14) | (1 << 15), + AQ_LOWPOWER = 1 << 12, +}; + +static int aq100x_reset(struct cphy *phy, int wait) +{ + /* + * Ignore the caller specified wait time; always wait for the reset to + * complete. Can take up to 3s. + */ + int err = t3_phy_reset(phy, MDIO_MMD_VEND1, 3000); + + if (err) + CH_WARN(phy->adapter, "PHY%d: reset failed (0x%x).\n", + phy->mdio.prtad, err); + + return err; +} + +static int aq100x_intr_enable(struct cphy *phy) +{ + int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AQ_IMASK_PMA, IMASK_PMA); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_MMD_VEND1, AQ_IMASK_GLOBAL, IMASK_GLOBAL); + return err; +} + +static int aq100x_intr_disable(struct cphy *phy) +{ + return t3_mdio_write(phy, MDIO_MMD_VEND1, AQ_IMASK_GLOBAL, 0); +} + +static int aq100x_intr_clear(struct cphy *phy) +{ + unsigned int v; + + t3_mdio_read(phy, MDIO_MMD_VEND1, AQ_IFLAG_GLOBAL, &v); + t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_STAT1, &v); + + return 0; +} + +static int aq100x_intr_handler(struct cphy *phy) +{ + int err; + unsigned int cause, v; + + err = t3_mdio_read(phy, MDIO_MMD_VEND1, AQ_IFLAG_GLOBAL, &cause); + if (err) + return err; + + /* Read (and reset) the latching version of the status */ + t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_STAT1, &v); + + return cphy_cause_link_change; +} + +static int aq100x_power_down(struct cphy *phy, int off) +{ + return mdio_set_flag(&phy->mdio, phy->mdio.prtad, + MDIO_MMD_PMAPMD, MDIO_CTRL1, + MDIO_CTRL1_LPOWER, off); +} + +static int aq100x_autoneg_enable(struct cphy *phy) +{ + int err; + + err = aq100x_power_down(phy, 0); + if (!err) + err = mdio_set_flag(&phy->mdio, phy->mdio.prtad, + MDIO_MMD_AN, MDIO_CTRL1, + BMCR_ANENABLE | BMCR_ANRESTART, 1); + + return err; +} + +static int aq100x_autoneg_restart(struct cphy *phy) +{ + int err; + + err = aq100x_power_down(phy, 0); + if (!err) + err = mdio_set_flag(&phy->mdio, phy->mdio.prtad, + MDIO_MMD_AN, MDIO_CTRL1, + BMCR_ANENABLE | BMCR_ANRESTART, 1); + + return err; +} + +static int aq100x_advertise(struct cphy *phy, unsigned int advertise_map) +{ + unsigned int adv; + int err; + + /* 10G advertisement */ + adv = 0; + if (advertise_map & ADVERTISED_10000baseT_Full) + adv |= ADV_10G_FULL; + err = t3_mdio_change_bits(phy, MDIO_MMD_AN, MDIO_AN_10GBT_CTRL, + ADV_10G_FULL, adv); + if (err) + return err; + + /* 1G advertisement */ + adv = 0; + if (advertise_map & ADVERTISED_1000baseT_Full) + adv |= ADV_1G_FULL; + if (advertise_map & ADVERTISED_1000baseT_Half) + adv |= ADV_1G_HALF; + err = t3_mdio_change_bits(phy, MDIO_MMD_AN, AQ_1G_CTRL, + ADV_1G_FULL | ADV_1G_HALF, adv); + if (err) + return err; + + /* 100M, pause advertisement */ + adv = 0; + if (advertise_map & ADVERTISED_100baseT_Half) + adv |= ADVERTISE_100HALF; + if (advertise_map & ADVERTISED_100baseT_Full) + adv |= ADVERTISE_100FULL; + if (advertise_map & ADVERTISED_Pause) + adv |= ADVERTISE_PAUSE_CAP; + if (advertise_map & ADVERTISED_Asym_Pause) + adv |= ADVERTISE_PAUSE_ASYM; + err = 
t3_mdio_change_bits(phy, MDIO_MMD_AN, MDIO_AN_ADVERTISE, + 0xfe0, adv); + + return err; +} + +static int aq100x_set_loopback(struct cphy *phy, int mmd, int dir, int enable) +{ + return mdio_set_flag(&phy->mdio, phy->mdio.prtad, + MDIO_MMD_PMAPMD, MDIO_CTRL1, + BMCR_LOOPBACK, enable); +} + +static int aq100x_set_speed_duplex(struct cphy *phy, int speed, int duplex) +{ + /* no can do */ + return -1; +} + +static int aq100x_get_link_status(struct cphy *phy, int *link_ok, + int *speed, int *duplex, int *fc) +{ + int err; + unsigned int v; + + if (link_ok) { + err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AQ_LINK_STAT, &v); + if (err) + return err; + + *link_ok = v & 1; + if (!*link_ok) + return 0; + } + + err = t3_mdio_read(phy, MDIO_MMD_AN, AQ_ANEG_STAT, &v); + if (err) + return err; + + if (speed) { + switch (v & 0x6) { + case 0x6: + *speed = SPEED_10000; + break; + case 0x4: + *speed = SPEED_1000; + break; + case 0x2: + *speed = SPEED_100; + break; + case 0x0: + *speed = SPEED_10; + break; + } + } + + if (duplex) + *duplex = v & 1 ? DUPLEX_FULL : DUPLEX_HALF; + + return 0; +} + +static const struct cphy_ops aq100x_ops = { + .reset = aq100x_reset, + .intr_enable = aq100x_intr_enable, + .intr_disable = aq100x_intr_disable, + .intr_clear = aq100x_intr_clear, + .intr_handler = aq100x_intr_handler, + .autoneg_enable = aq100x_autoneg_enable, + .autoneg_restart = aq100x_autoneg_restart, + .advertise = aq100x_advertise, + .set_loopback = aq100x_set_loopback, + .set_speed_duplex = aq100x_set_speed_duplex, + .get_link_status = aq100x_get_link_status, + .power_down = aq100x_power_down, + .mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS, +}; + +int t3_aq100x_phy_prep(struct cphy *phy, struct adapter *adapter, int phy_addr, + const struct mdio_ops *mdio_ops) +{ + unsigned int v, v2, gpio, wait; + int err; + + cphy_init(phy, adapter, phy_addr, &aq100x_ops, mdio_ops, + SUPPORTED_1000baseT_Full | SUPPORTED_10000baseT_Full | + SUPPORTED_TP | SUPPORTED_Autoneg | SUPPORTED_AUI, + "1000/10GBASE-T"); + + /* + * The PHY has been out of reset ever since the system powered up. So + * we do a hard reset over here. + */ + gpio = phy_addr ? F_GPIO10_OUT_VAL : F_GPIO6_OUT_VAL; + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, gpio, 0); + msleep(1); + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, gpio, gpio); + + /* + * Give it enough time to load the firmware and get ready for mdio. + */ + msleep(1000); + wait = 500; /* in 10ms increments */ + do { + err = t3_mdio_read(phy, MDIO_MMD_VEND1, MDIO_CTRL1, &v); + if (err || v == 0xffff) { + + /* Allow prep_adapter to succeed when ffff is read */ + + CH_WARN(adapter, "PHY%d: reset failed (0x%x, 0x%x).\n", + phy_addr, err, v); + goto done; + } + + v &= AQ_RESET; + if (v) + msleep(10); + } while (v && --wait); + if (v) { + CH_WARN(adapter, "PHY%d: reset timed out (0x%x).\n", + phy_addr, v); + + goto done; /* let prep_adapter succeed */ + } + + /* Datasheet says 3s max but this has been observed */ + wait = (500 - wait) * 10 + 1000; + if (wait > 3000) + CH_WARN(adapter, "PHY%d: reset took %ums\n", phy_addr, wait); + + /* Firmware version check. */ + t3_mdio_read(phy, MDIO_MMD_VEND1, AQ_FW_VERSION, &v); + if (v != 101) + CH_WARN(adapter, "PHY%d: unsupported firmware %d\n", + phy_addr, v); + + /* + * The PHY should start in really-low-power mode. Prepare it for normal + * operations. 
+ */ + err = t3_mdio_read(phy, MDIO_MMD_VEND1, MDIO_CTRL1, &v); + if (err) + return err; + if (v & AQ_LOWPOWER) { + err = t3_mdio_change_bits(phy, MDIO_MMD_VEND1, MDIO_CTRL1, + AQ_LOWPOWER, 0); + if (err) + return err; + msleep(10); + } else + CH_WARN(adapter, "PHY%d does not start in low power mode.\n", + phy_addr); + + /* + * Verify XAUI settings, but let prep succeed no matter what. + */ + v = v2 = 0; + t3_mdio_read(phy, MDIO_MMD_PHYXS, AQ_XAUI_RX_CFG, &v); + t3_mdio_read(phy, MDIO_MMD_PHYXS, AQ_XAUI_TX_CFG, &v2); + if (v != 0x1b || v2 != 0x1b) + CH_WARN(adapter, + "PHY%d: incorrect XAUI settings (0x%x, 0x%x).\n", + phy_addr, v, v2); + +done: + return err; +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/common.h b/drivers/net/ethernet/chelsio/cxgb3/common.h new file mode 100644 index 0000000..1bd7d89 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/common.h @@ -0,0 +1,773 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CHELSIO_COMMON_H +#define __CHELSIO_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include "version.h" + +#define CH_ERR(adap, fmt, ...) dev_err(&adap->pdev->dev, fmt, ##__VA_ARGS__) +#define CH_WARN(adap, fmt, ...) dev_warn(&adap->pdev->dev, fmt, ##__VA_ARGS__) +#define CH_ALERT(adap, fmt, ...) dev_alert(&adap->pdev->dev, fmt, ##__VA_ARGS__) + +/* + * More powerful macro that selectively prints messages based on msg_enable. + * For info and debugging messages. + */ +#define CH_MSG(adapter, level, category, fmt, ...) do { \ + if ((adapter)->msg_enable & NETIF_MSG_##category) \ + dev_printk(KERN_##level, &adapter->pdev->dev, fmt, \ + ## __VA_ARGS__); \ +} while (0) + +#ifdef DEBUG +# define CH_DBG(adapter, category, fmt, ...) \ + CH_MSG(adapter, DEBUG, category, fmt, ## __VA_ARGS__) +#else +# define CH_DBG(adapter, category, fmt, ...) 
+#endif + +/* Additional NETIF_MSG_* categories */ +#define NETIF_MSG_MMIO 0x8000000 + +enum { + MAX_NPORTS = 2, /* max # of ports */ + MAX_FRAME_SIZE = 10240, /* max MAC frame size, including header + FCS */ + EEPROMSIZE = 8192, /* Serial EEPROM size */ + SERNUM_LEN = 16, /* Serial # length */ + RSS_TABLE_SIZE = 64, /* size of RSS lookup and mapping tables */ + TCB_SIZE = 128, /* TCB size */ + NMTUS = 16, /* size of MTU table */ + NCCTRL_WIN = 32, /* # of congestion control windows */ + PROTO_SRAM_LINES = 128, /* size of TP sram */ +}; + +#define MAX_RX_COALESCING_LEN 12288U + +enum { + PAUSE_RX = 1 << 0, + PAUSE_TX = 1 << 1, + PAUSE_AUTONEG = 1 << 2 +}; + +enum { + SUPPORTED_IRQ = 1 << 24 +}; + +enum { /* adapter interrupt-maintained statistics */ + STAT_ULP_CH0_PBL_OOB, + STAT_ULP_CH1_PBL_OOB, + STAT_PCI_CORR_ECC, + + IRQ_NUM_STATS /* keep last */ +}; + +#define TP_VERSION_MAJOR 1 +#define TP_VERSION_MINOR 1 +#define TP_VERSION_MICRO 0 + +#define S_TP_VERSION_MAJOR 16 +#define M_TP_VERSION_MAJOR 0xFF +#define V_TP_VERSION_MAJOR(x) ((x) << S_TP_VERSION_MAJOR) +#define G_TP_VERSION_MAJOR(x) \ + (((x) >> S_TP_VERSION_MAJOR) & M_TP_VERSION_MAJOR) + +#define S_TP_VERSION_MINOR 8 +#define M_TP_VERSION_MINOR 0xFF +#define V_TP_VERSION_MINOR(x) ((x) << S_TP_VERSION_MINOR) +#define G_TP_VERSION_MINOR(x) \ + (((x) >> S_TP_VERSION_MINOR) & M_TP_VERSION_MINOR) + +#define S_TP_VERSION_MICRO 0 +#define M_TP_VERSION_MICRO 0xFF +#define V_TP_VERSION_MICRO(x) ((x) << S_TP_VERSION_MICRO) +#define G_TP_VERSION_MICRO(x) \ + (((x) >> S_TP_VERSION_MICRO) & M_TP_VERSION_MICRO) + +enum { + SGE_QSETS = 8, /* # of SGE Tx/Rx/RspQ sets */ + SGE_RXQ_PER_SET = 2, /* # of Rx queues per set */ + SGE_TXQ_PER_SET = 3 /* # of Tx queues per set */ +}; + +enum sge_context_type { /* SGE egress context types */ + SGE_CNTXT_RDMA = 0, + SGE_CNTXT_ETH = 2, + SGE_CNTXT_OFLD = 4, + SGE_CNTXT_CTRL = 5 +}; + +enum { + AN_PKT_SIZE = 32, /* async notification packet size */ + IMMED_PKT_SIZE = 48 /* packet size for immediate data */ +}; + +struct sg_ent { /* SGE scatter/gather entry */ + __be32 len[2]; + __be64 addr[2]; +}; + +#ifndef SGE_NUM_GENBITS +/* Must be 1 or 2 */ +# define SGE_NUM_GENBITS 2 +#endif + +#define TX_DESC_FLITS 16U +#define WR_FLITS (TX_DESC_FLITS + 1 - SGE_NUM_GENBITS) + +struct cphy; +struct adapter; + +struct mdio_ops { + int (*read)(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr); + int (*write)(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr, u16 val); + unsigned mode_support; +}; + +struct adapter_info { + unsigned char nports0; /* # of ports on channel 0 */ + unsigned char nports1; /* # of ports on channel 1 */ + unsigned char phy_base_addr; /* MDIO PHY base address */ + unsigned int gpio_out; /* GPIO output settings */ + unsigned char gpio_intr[MAX_NPORTS]; /* GPIO PHY IRQ pins */ + unsigned long caps; /* adapter capabilities */ + const struct mdio_ops *mdio_ops; /* MDIO operations */ + const char *desc; /* product description */ +}; + +struct mc5_stats { + unsigned long parity_err; + unsigned long active_rgn_full; + unsigned long nfa_srch_err; + unsigned long unknown_cmd; + unsigned long reqq_parity_err; + unsigned long dispq_parity_err; + unsigned long del_act_empty; +}; + +struct mc7_stats { + unsigned long corr_err; + unsigned long uncorr_err; + unsigned long parity_err; + unsigned long addr_err; +}; + +struct mac_stats { + u64 tx_octets; /* total # of octets in good frames */ + u64 tx_octets_bad; /* total # of octets in error frames */ + u64 tx_frames; /* all 
good frames */ + u64 tx_mcast_frames; /* good multicast frames */ + u64 tx_bcast_frames; /* good broadcast frames */ + u64 tx_pause; /* # of transmitted pause frames */ + u64 tx_deferred; /* frames with deferred transmissions */ + u64 tx_late_collisions; /* # of late collisions */ + u64 tx_total_collisions; /* # of total collisions */ + u64 tx_excess_collisions; /* frame errors from excessive collissions */ + u64 tx_underrun; /* # of Tx FIFO underruns */ + u64 tx_len_errs; /* # of Tx length errors */ + u64 tx_mac_internal_errs; /* # of internal MAC errors on Tx */ + u64 tx_excess_deferral; /* # of frames with excessive deferral */ + u64 tx_fcs_errs; /* # of frames with bad FCS */ + + u64 tx_frames_64; /* # of Tx frames in a particular range */ + u64 tx_frames_65_127; + u64 tx_frames_128_255; + u64 tx_frames_256_511; + u64 tx_frames_512_1023; + u64 tx_frames_1024_1518; + u64 tx_frames_1519_max; + + u64 rx_octets; /* total # of octets in good frames */ + u64 rx_octets_bad; /* total # of octets in error frames */ + u64 rx_frames; /* all good frames */ + u64 rx_mcast_frames; /* good multicast frames */ + u64 rx_bcast_frames; /* good broadcast frames */ + u64 rx_pause; /* # of received pause frames */ + u64 rx_fcs_errs; /* # of received frames with bad FCS */ + u64 rx_align_errs; /* alignment errors */ + u64 rx_symbol_errs; /* symbol errors */ + u64 rx_data_errs; /* data errors */ + u64 rx_sequence_errs; /* sequence errors */ + u64 rx_runt; /* # of runt frames */ + u64 rx_jabber; /* # of jabber frames */ + u64 rx_short; /* # of short frames */ + u64 rx_too_long; /* # of oversized frames */ + u64 rx_mac_internal_errs; /* # of internal MAC errors on Rx */ + + u64 rx_frames_64; /* # of Rx frames in a particular range */ + u64 rx_frames_65_127; + u64 rx_frames_128_255; + u64 rx_frames_256_511; + u64 rx_frames_512_1023; + u64 rx_frames_1024_1518; + u64 rx_frames_1519_max; + + u64 rx_cong_drops; /* # of Rx drops due to SGE congestion */ + + unsigned long tx_fifo_parity_err; + unsigned long rx_fifo_parity_err; + unsigned long tx_fifo_urun; + unsigned long rx_fifo_ovfl; + unsigned long serdes_signal_loss; + unsigned long xaui_pcs_ctc_err; + unsigned long xaui_pcs_align_change; + + unsigned long num_toggled; /* # times toggled TxEn due to stuck TX */ + unsigned long num_resets; /* # times reset due to stuck TX */ + + unsigned long link_faults; /* # detected link faults */ +}; + +struct tp_mib_stats { + u32 ipInReceive_hi; + u32 ipInReceive_lo; + u32 ipInHdrErrors_hi; + u32 ipInHdrErrors_lo; + u32 ipInAddrErrors_hi; + u32 ipInAddrErrors_lo; + u32 ipInUnknownProtos_hi; + u32 ipInUnknownProtos_lo; + u32 ipInDiscards_hi; + u32 ipInDiscards_lo; + u32 ipInDelivers_hi; + u32 ipInDelivers_lo; + u32 ipOutRequests_hi; + u32 ipOutRequests_lo; + u32 ipOutDiscards_hi; + u32 ipOutDiscards_lo; + u32 ipOutNoRoutes_hi; + u32 ipOutNoRoutes_lo; + u32 ipReasmTimeout; + u32 ipReasmReqds; + u32 ipReasmOKs; + u32 ipReasmFails; + + u32 reserved[8]; + + u32 tcpActiveOpens; + u32 tcpPassiveOpens; + u32 tcpAttemptFails; + u32 tcpEstabResets; + u32 tcpOutRsts; + u32 tcpCurrEstab; + u32 tcpInSegs_hi; + u32 tcpInSegs_lo; + u32 tcpOutSegs_hi; + u32 tcpOutSegs_lo; + u32 tcpRetransSeg_hi; + u32 tcpRetransSeg_lo; + u32 tcpInErrs_hi; + u32 tcpInErrs_lo; + u32 tcpRtoMin; + u32 tcpRtoMax; +}; + +struct tp_params { + unsigned int nchan; /* # of channels */ + unsigned int pmrx_size; /* total PMRX capacity */ + unsigned int pmtx_size; /* total PMTX capacity */ + unsigned int cm_size; /* total CM capacity */ + unsigned int chan_rx_size; /* per 
channel Rx size */ + unsigned int chan_tx_size; /* per channel Tx size */ + unsigned int rx_pg_size; /* Rx page size */ + unsigned int tx_pg_size; /* Tx page size */ + unsigned int rx_num_pgs; /* # of Rx pages */ + unsigned int tx_num_pgs; /* # of Tx pages */ + unsigned int ntimer_qs; /* # of timer queues */ +}; + +struct qset_params { /* SGE queue set parameters */ + unsigned int polling; /* polling/interrupt service for rspq */ + unsigned int coalesce_usecs; /* irq coalescing timer */ + unsigned int rspq_size; /* # of entries in response queue */ + unsigned int fl_size; /* # of entries in regular free list */ + unsigned int jumbo_size; /* # of entries in jumbo free list */ + unsigned int txq_size[SGE_TXQ_PER_SET]; /* Tx queue sizes */ + unsigned int cong_thres; /* FL congestion threshold */ + unsigned int vector; /* Interrupt (line or vector) number */ +}; + +struct sge_params { + unsigned int max_pkt_size; /* max offload pkt size */ + struct qset_params qset[SGE_QSETS]; +}; + +struct mc5_params { + unsigned int mode; /* selects MC5 width */ + unsigned int nservers; /* size of server region */ + unsigned int nfilters; /* size of filter region */ + unsigned int nroutes; /* size of routing region */ +}; + +/* Default MC5 region sizes */ +enum { + DEFAULT_NSERVERS = 512, + DEFAULT_NFILTERS = 128 +}; + +/* MC5 modes, these must be non-0 */ +enum { + MC5_MODE_144_BIT = 1, + MC5_MODE_72_BIT = 2 +}; + +/* MC5 min active region size */ +enum { MC5_MIN_TIDS = 16 }; + +struct vpd_params { + unsigned int cclk; + unsigned int mclk; + unsigned int uclk; + unsigned int mdc; + unsigned int mem_timing; + u8 sn[SERNUM_LEN + 1]; + u8 eth_base[6]; + u8 port_type[MAX_NPORTS]; + unsigned short xauicfg[2]; +}; + +struct pci_params { + unsigned int vpd_cap_addr; + unsigned short speed; + unsigned char width; + unsigned char variant; +}; + +enum { + PCI_VARIANT_PCI, + PCI_VARIANT_PCIX_MODE1_PARITY, + PCI_VARIANT_PCIX_MODE1_ECC, + PCI_VARIANT_PCIX_266_MODE2, + PCI_VARIANT_PCIE +}; + +struct adapter_params { + struct sge_params sge; + struct mc5_params mc5; + struct tp_params tp; + struct vpd_params vpd; + struct pci_params pci; + + const struct adapter_info *info; + + unsigned short mtus[NMTUS]; + unsigned short a_wnd[NCCTRL_WIN]; + unsigned short b_wnd[NCCTRL_WIN]; + + unsigned int nports; /* # of ethernet ports */ + unsigned int chan_map; /* bitmap of in-use Tx channels */ + unsigned int stats_update_period; /* MAC stats accumulation period */ + unsigned int linkpoll_period; /* link poll period in 0.1s */ + unsigned int rev; /* chip revision */ + unsigned int offload; +}; + +enum { /* chip revisions */ + T3_REV_A = 0, + T3_REV_B = 2, + T3_REV_B2 = 3, + T3_REV_C = 4, +}; + +struct trace_params { + u32 sip; + u32 sip_mask; + u32 dip; + u32 dip_mask; + u16 sport; + u16 sport_mask; + u16 dport; + u16 dport_mask; + u32 vlan:12; + u32 vlan_mask:12; + u32 intf:4; + u32 intf_mask:4; + u8 proto; + u8 proto_mask; +}; + +struct link_config { + unsigned int supported; /* link capabilities */ + unsigned int advertising; /* advertised capabilities */ + unsigned short requested_speed; /* speed user has requested */ + unsigned short speed; /* actual link speed */ + unsigned char requested_duplex; /* duplex user has requested */ + unsigned char duplex; /* actual link duplex */ + unsigned char requested_fc; /* flow control user has requested */ + unsigned char fc; /* actual link flow control */ + unsigned char autoneg; /* autonegotiating? */ + unsigned int link_ok; /* link up? 
*/ +}; + +#define SPEED_INVALID 0xffff +#define DUPLEX_INVALID 0xff + +struct mc5 { + struct adapter *adapter; + unsigned int tcam_size; + unsigned char part_type; + unsigned char parity_enabled; + unsigned char mode; + struct mc5_stats stats; +}; + +static inline unsigned int t3_mc5_size(const struct mc5 *p) +{ + return p->tcam_size; +} + +struct mc7 { + struct adapter *adapter; /* backpointer to adapter */ + unsigned int size; /* memory size in bytes */ + unsigned int width; /* MC7 interface width */ + unsigned int offset; /* register address offset for MC7 instance */ + const char *name; /* name of MC7 instance */ + struct mc7_stats stats; /* MC7 statistics */ +}; + +static inline unsigned int t3_mc7_size(const struct mc7 *p) +{ + return p->size; +} + +struct cmac { + struct adapter *adapter; + unsigned int offset; + unsigned int nucast; /* # of address filters for unicast MACs */ + unsigned int tx_tcnt; + unsigned int tx_xcnt; + u64 tx_mcnt; + unsigned int rx_xcnt; + unsigned int rx_ocnt; + u64 rx_mcnt; + unsigned int toggle_cnt; + unsigned int txen; + u64 rx_pause; + struct mac_stats stats; +}; + +enum { + MAC_DIRECTION_RX = 1, + MAC_DIRECTION_TX = 2, + MAC_RXFIFO_SIZE = 32768 +}; + +/* PHY loopback direction */ +enum { + PHY_LOOPBACK_TX = 1, + PHY_LOOPBACK_RX = 2 +}; + +/* PHY interrupt types */ +enum { + cphy_cause_link_change = 1, + cphy_cause_fifo_error = 2, + cphy_cause_module_change = 4, +}; + +/* PHY module types */ +enum { + phy_modtype_none, + phy_modtype_sr, + phy_modtype_lr, + phy_modtype_lrm, + phy_modtype_twinax, + phy_modtype_twinax_long, + phy_modtype_unknown +}; + +/* PHY operations */ +struct cphy_ops { + int (*reset)(struct cphy *phy, int wait); + + int (*intr_enable)(struct cphy *phy); + int (*intr_disable)(struct cphy *phy); + int (*intr_clear)(struct cphy *phy); + int (*intr_handler)(struct cphy *phy); + + int (*autoneg_enable)(struct cphy *phy); + int (*autoneg_restart)(struct cphy *phy); + + int (*advertise)(struct cphy *phy, unsigned int advertise_map); + int (*set_loopback)(struct cphy *phy, int mmd, int dir, int enable); + int (*set_speed_duplex)(struct cphy *phy, int speed, int duplex); + int (*get_link_status)(struct cphy *phy, int *link_ok, int *speed, + int *duplex, int *fc); + int (*power_down)(struct cphy *phy, int enable); + + u32 mmds; +}; +enum { + EDC_OPT_AEL2005 = 0, + EDC_OPT_AEL2005_SIZE = 1084, + EDC_TWX_AEL2005 = 1, + EDC_TWX_AEL2005_SIZE = 1464, + EDC_TWX_AEL2020 = 2, + EDC_TWX_AEL2020_SIZE = 1628, + EDC_MAX_SIZE = EDC_TWX_AEL2020_SIZE, /* Max cache size */ +}; + +/* A PHY instance */ +struct cphy { + u8 modtype; /* PHY module type */ + short priv; /* scratch pad */ + unsigned int caps; /* PHY capabilities */ + struct adapter *adapter; /* associated adapter */ + const char *desc; /* PHY description */ + unsigned long fifo_errors; /* FIFO over/under-flows */ + const struct cphy_ops *ops; /* PHY operations */ + struct mdio_if_info mdio; + u16 phy_cache[EDC_MAX_SIZE]; /* EDC cache */ +}; + +/* Convenience MDIO read/write wrappers */ +static inline int t3_mdio_read(struct cphy *phy, int mmd, int reg, + unsigned int *valp) +{ + int rc = phy->mdio.mdio_read(phy->mdio.dev, phy->mdio.prtad, mmd, reg); + *valp = (rc >= 0) ? rc : -1; + return (rc >= 0) ? 
0 : rc; +} + +static inline int t3_mdio_write(struct cphy *phy, int mmd, int reg, + unsigned int val) +{ + return phy->mdio.mdio_write(phy->mdio.dev, phy->mdio.prtad, mmd, + reg, val); +} + +/* Convenience initializer */ +static inline void cphy_init(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct cphy_ops *phy_ops, + const struct mdio_ops *mdio_ops, + unsigned int caps, const char *desc) +{ + phy->caps = caps; + phy->adapter = adapter; + phy->desc = desc; + phy->ops = phy_ops; + if (mdio_ops) { + phy->mdio.prtad = phy_addr; + phy->mdio.mmds = phy_ops->mmds; + phy->mdio.mode_support = mdio_ops->mode_support; + phy->mdio.mdio_read = mdio_ops->read; + phy->mdio.mdio_write = mdio_ops->write; + } +} + +/* Accumulate MAC statistics every 180 seconds. For 1G we multiply by 10. */ +#define MAC_STATS_ACCUM_SECS 180 + +#define XGM_REG(reg_addr, idx) \ + ((reg_addr) + (idx) * (XGMAC0_1_BASE_ADDR - XGMAC0_0_BASE_ADDR)) + +struct addr_val_pair { + unsigned int reg_addr; + unsigned int val; +}; + +#include "adapter.h" + +#ifndef PCI_VENDOR_ID_CHELSIO +# define PCI_VENDOR_ID_CHELSIO 0x1425 +#endif + +#define for_each_port(adapter, iter) \ + for (iter = 0; iter < (adapter)->params.nports; ++iter) + +#define adapter_info(adap) ((adap)->params.info) + +static inline int uses_xaui(const struct adapter *adap) +{ + return adapter_info(adap)->caps & SUPPORTED_AUI; +} + +static inline int is_10G(const struct adapter *adap) +{ + return adapter_info(adap)->caps & SUPPORTED_10000baseT_Full; +} + +static inline int is_offload(const struct adapter *adap) +{ + return adap->params.offload; +} + +static inline unsigned int core_ticks_per_usec(const struct adapter *adap) +{ + return adap->params.vpd.cclk / 1000; +} + +static inline unsigned int is_pcie(const struct adapter *adap) +{ + return adap->params.pci.variant == PCI_VARIANT_PCIE; +} + +void t3_set_reg_field(struct adapter *adap, unsigned int addr, u32 mask, + u32 val); +void t3_write_regs(struct adapter *adapter, const struct addr_val_pair *p, + int n, unsigned int offset); +int t3_wait_op_done_val(struct adapter *adapter, int reg, u32 mask, + int polarity, int attempts, int delay, u32 *valp); +static inline int t3_wait_op_done(struct adapter *adapter, int reg, u32 mask, + int polarity, int attempts, int delay) +{ + return t3_wait_op_done_val(adapter, reg, mask, polarity, attempts, + delay, NULL); +} +int t3_mdio_change_bits(struct cphy *phy, int mmd, int reg, unsigned int clear, + unsigned int set); +int t3_phy_reset(struct cphy *phy, int mmd, int wait); +int t3_phy_advertise(struct cphy *phy, unsigned int advert); +int t3_phy_advertise_fiber(struct cphy *phy, unsigned int advert); +int t3_set_phy_speed_duplex(struct cphy *phy, int speed, int duplex); +int t3_phy_lasi_intr_enable(struct cphy *phy); +int t3_phy_lasi_intr_disable(struct cphy *phy); +int t3_phy_lasi_intr_clear(struct cphy *phy); +int t3_phy_lasi_intr_handler(struct cphy *phy); + +void t3_intr_enable(struct adapter *adapter); +void t3_intr_disable(struct adapter *adapter); +void t3_intr_clear(struct adapter *adapter); +void t3_xgm_intr_enable(struct adapter *adapter, int idx); +void t3_xgm_intr_disable(struct adapter *adapter, int idx); +void t3_port_intr_enable(struct adapter *adapter, int idx); +void t3_port_intr_disable(struct adapter *adapter, int idx); +int t3_slow_intr_handler(struct adapter *adapter); +int t3_phy_intr_handler(struct adapter *adapter); + +void t3_link_changed(struct adapter *adapter, int port_id); +void t3_link_fault(struct adapter *adapter, int 
port_id); +int t3_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc); +const struct adapter_info *t3_get_adapter_info(unsigned int board_id); +int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data); +int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data); +int t3_seeprom_wp(struct adapter *adapter, int enable); +int t3_get_tp_version(struct adapter *adapter, u32 *vers); +int t3_check_tpsram_version(struct adapter *adapter); +int t3_check_tpsram(struct adapter *adapter, const u8 *tp_ram, + unsigned int size); +int t3_set_proto_sram(struct adapter *adap, const u8 *data); +int t3_load_fw(struct adapter *adapter, const u8 * fw_data, unsigned int size); +int t3_get_fw_version(struct adapter *adapter, u32 *vers); +int t3_check_fw_version(struct adapter *adapter); +int t3_init_hw(struct adapter *adapter, u32 fw_params); +int t3_reset_adapter(struct adapter *adapter); +int t3_prep_adapter(struct adapter *adapter, const struct adapter_info *ai, + int reset); +int t3_replay_prep_adapter(struct adapter *adapter); +void t3_led_ready(struct adapter *adapter); +void t3_fatal_err(struct adapter *adapter); +void t3_set_vlan_accel(struct adapter *adapter, unsigned int ports, int on); +void t3_config_rss(struct adapter *adapter, unsigned int rss_config, + const u8 * cpus, const u16 *rspq); +int t3_cim_ctl_blk_read(struct adapter *adap, unsigned int addr, + unsigned int n, unsigned int *valp); +int t3_mc7_bd_read(struct mc7 *mc7, unsigned int start, unsigned int n, + u64 *buf); + +int t3_mac_reset(struct cmac *mac); +void t3b_pcs_reset(struct cmac *mac); +void t3_mac_disable_exact_filters(struct cmac *mac); +void t3_mac_enable_exact_filters(struct cmac *mac); +int t3_mac_enable(struct cmac *mac, int which); +int t3_mac_disable(struct cmac *mac, int which); +int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu); +int t3_mac_set_rx_mode(struct cmac *mac, struct net_device *dev); +int t3_mac_set_address(struct cmac *mac, unsigned int idx, u8 addr[6]); +int t3_mac_set_num_ucast(struct cmac *mac, int n); +const struct mac_stats *t3_mac_update_stats(struct cmac *mac); +int t3_mac_set_speed_duplex_fc(struct cmac *mac, int speed, int duplex, int fc); +int t3b2_mac_watchdog_task(struct cmac *mac); + +void t3_mc5_prep(struct adapter *adapter, struct mc5 *mc5, int mode); +int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters, + unsigned int nroutes); +void t3_mc5_intr_handler(struct mc5 *mc5); + +void t3_tp_set_offload_mode(struct adapter *adap, int enable); +void t3_tp_get_mib_stats(struct adapter *adap, struct tp_mib_stats *tps); +void t3_load_mtus(struct adapter *adap, unsigned short mtus[NMTUS], + unsigned short alpha[NCCTRL_WIN], + unsigned short beta[NCCTRL_WIN], unsigned short mtu_cap); +void t3_config_trace_filter(struct adapter *adapter, + const struct trace_params *tp, int filter_index, + int invert, int enable); +int t3_config_sched(struct adapter *adap, unsigned int kbps, int sched); + +void t3_sge_prep(struct adapter *adap, struct sge_params *p); +void t3_sge_init(struct adapter *adap, struct sge_params *p); +int t3_sge_init_ecntxt(struct adapter *adapter, unsigned int id, int gts_enable, + enum sge_context_type type, int respq, u64 base_addr, + unsigned int size, unsigned int token, int gen, + unsigned int cidx); +int t3_sge_init_flcntxt(struct adapter *adapter, unsigned int id, + int gts_enable, u64 base_addr, unsigned int size, + unsigned int esize, unsigned int cong_thres, int gen, + unsigned int cidx); +int 
t3_sge_init_rspcntxt(struct adapter *adapter, unsigned int id, + int irq_vec_idx, u64 base_addr, unsigned int size, + unsigned int fl_thres, int gen, unsigned int cidx); +int t3_sge_init_cqcntxt(struct adapter *adapter, unsigned int id, u64 base_addr, + unsigned int size, int rspq, int ovfl_mode, + unsigned int credits, unsigned int credit_thres); +int t3_sge_enable_ecntxt(struct adapter *adapter, unsigned int id, int enable); +int t3_sge_disable_fl(struct adapter *adapter, unsigned int id); +int t3_sge_disable_rspcntxt(struct adapter *adapter, unsigned int id); +int t3_sge_disable_cqcntxt(struct adapter *adapter, unsigned int id); +int t3_sge_cqcntxt_op(struct adapter *adapter, unsigned int id, unsigned int op, + unsigned int credits); + +int t3_vsc8211_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_ael1002_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_ael1006_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_ael2005_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_ael2020_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_qt2045_phy_prep(struct cphy *phy, struct adapter *adapter, int phy_addr, + const struct mdio_ops *mdio_ops); +int t3_xaui_direct_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +int t3_aq100x_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops); +#endif /* __CHELSIO_COMMON_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h new file mode 100644 index 0000000..369fe71 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _CXGB3_OFFLOAD_CTL_DEFS_H +#define _CXGB3_OFFLOAD_CTL_DEFS_H + +enum { + GET_MAX_OUTSTANDING_WR = 0, + GET_TX_MAX_CHUNK = 1, + GET_TID_RANGE = 2, + GET_STID_RANGE = 3, + GET_RTBL_RANGE = 4, + GET_L2T_CAPACITY = 5, + GET_MTUS = 6, + GET_WR_LEN = 7, + GET_IFF_FROM_MAC = 8, + GET_DDP_PARAMS = 9, + GET_PORTS = 10, + + ULP_ISCSI_GET_PARAMS = 11, + ULP_ISCSI_SET_PARAMS = 12, + + RDMA_GET_PARAMS = 13, + RDMA_CQ_OP = 14, + RDMA_CQ_SETUP = 15, + RDMA_CQ_DISABLE = 16, + RDMA_CTRL_QP_SETUP = 17, + RDMA_GET_MEM = 18, + RDMA_GET_MIB = 19, + + GET_RX_PAGE_INFO = 50, + GET_ISCSI_IPV4ADDR = 51, + + GET_EMBEDDED_INFO = 70, +}; + +/* + * Structure used to describe a TID range. Valid TIDs are [base, base+num). + */ +struct tid_range { + unsigned int base; /* first TID */ + unsigned int num; /* number of TIDs in range */ +}; + +/* + * Structure used to request the size and contents of the MTU table. + */ +struct mtutab { + unsigned int size; /* # of entries in the MTU table */ + const unsigned short *mtus; /* the MTU table values */ +}; + +struct net_device; + +/* + * Structure used to request the adapter net_device owning a given MAC address. + */ +struct iff_mac { + struct net_device *dev; /* the net_device */ + const unsigned char *mac_addr; /* MAC address to lookup */ + u16 vlan_tag; +}; + +/* Structure used to request a port's iSCSI IPv4 address */ +struct iscsi_ipv4addr { + struct net_device *dev; /* the net_device */ + __be32 ipv4addr; /* the return iSCSI IPv4 address */ +}; + +struct pci_dev; + +/* + * Structure used to request the TCP DDP parameters. + */ +struct ddp_params { + unsigned int llimit; /* TDDP region start address */ + unsigned int ulimit; /* TDDP region end address */ + unsigned int tag_mask; /* TDDP tag mask */ + struct pci_dev *pdev; +}; + +struct adap_ports { + unsigned int nports; /* number of ports on this adapter */ + struct net_device *lldevs[2]; +}; + +/* + * Structure used to return information to the iscsi layer. + */ +struct ulp_iscsi_info { + unsigned int offset; + unsigned int llimit; + unsigned int ulimit; + unsigned int tagmask; + u8 pgsz_factor[4]; + unsigned int max_rxsz; + unsigned int max_txsz; + struct pci_dev *pdev; +}; + +/* + * Structure used to return information to the RDMA layer. + */ +struct rdma_info { + unsigned int tpt_base; /* TPT base address */ + unsigned int tpt_top; /* TPT last entry address */ + unsigned int pbl_base; /* PBL base address */ + unsigned int pbl_top; /* PBL last entry address */ + unsigned int rqt_base; /* RQT base address */ + unsigned int rqt_top; /* RQT last entry address */ + unsigned int udbell_len; /* user doorbell region length */ + unsigned long udbell_physbase; /* user doorbell physical start addr */ + void __iomem *kdb_addr; /* kernel doorbell register address */ + struct pci_dev *pdev; /* associated PCI device */ +}; + +/* + * Structure used to request an operation on an RDMA completion queue. + */ +struct rdma_cq_op { + unsigned int id; + unsigned int op; + unsigned int credits; +}; + +/* + * Structure used to setup RDMA completion queues. + */ +struct rdma_cq_setup { + unsigned int id; + unsigned long long base_addr; + unsigned int size; + unsigned int credits; + unsigned int credit_thres; + unsigned int ovfl_mode; +}; + +/* + * Structure used to setup the RDMA control egress context. + */ +struct rdma_ctrlqp_setup { + unsigned long long base_addr; + unsigned int size; +}; + +/* + * Offload TX/RX page information. 
+ */ +struct ofld_page_info { + unsigned int page_size; /* Page size, should be a power of 2 */ + unsigned int num; /* Number of pages */ +}; + +/* + * Structure used to get firmware and protocol engine versions. + */ +struct ch_embedded_info { + u32 fw_vers; + u32 tp_vers; +}; +#endif /* _CXGB3_OFFLOAD_CTL_DEFS_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h new file mode 100644 index 0000000..f04e81f --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2006-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _CHELSIO_DEFS_H +#define _CHELSIO_DEFS_H + +#include +#include + +#include "t3cdev.h" + +#include "cxgb3_offload.h" + +#define VALIDATE_TID 1 + +/* + * Map an ATID or STID to their entries in the corresponding TID tables. + */ +static inline union active_open_entry *atid2entry(const struct tid_info *t, + unsigned int atid) +{ + return &t->atid_tab[atid - t->atid_base]; +} + +static inline union listen_entry *stid2entry(const struct tid_info *t, + unsigned int stid) +{ + return &t->stid_tab[stid - t->stid_base]; +} + +/* + * Find the connection corresponding to a TID. + */ +static inline struct t3c_tid_entry *lookup_tid(const struct tid_info *t, + unsigned int tid) +{ + struct t3c_tid_entry *t3c_tid = tid < t->ntids ? + &(t->tid_tab[tid]) : NULL; + + return (t3c_tid && t3c_tid->client) ? t3c_tid : NULL; +} + +/* + * Find the connection corresponding to a server TID. + */ +static inline struct t3c_tid_entry *lookup_stid(const struct tid_info *t, + unsigned int tid) +{ + union listen_entry *e; + + if (tid < t->stid_base || tid >= t->stid_base + t->nstids) + return NULL; + + e = stid2entry(t, tid); + if ((void *)e->next >= (void *)t->tid_tab && + (void *)e->next < (void *)&t->atid_tab[t->natids]) + return NULL; + + return &e->t3c_tid; +} + +/* + * Find the connection corresponding to an active-open TID. 
+ */ +static inline struct t3c_tid_entry *lookup_atid(const struct tid_info *t, + unsigned int tid) +{ + union active_open_entry *e; + + if (tid < t->atid_base || tid >= t->atid_base + t->natids) + return NULL; + + e = atid2entry(t, tid); + if ((void *)e->next >= (void *)t->tid_tab && + (void *)e->next < (void *)&t->atid_tab[t->natids]) + return NULL; + + return &e->t3c_tid; +} + +int attach_t3cdev(struct t3cdev *dev); +void detach_t3cdev(struct t3cdev *dev); +#endif diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h new file mode 100644 index 0000000..b19e437 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CHIOCTL_H__ +#define __CHIOCTL_H__ + +/* + * Ioctl commands specific to this driver. 
+ */ +enum { + CHELSIO_GETMTUTAB = 1029, + CHELSIO_SETMTUTAB = 1030, + CHELSIO_SET_PM = 1032, + CHELSIO_GET_PM = 1033, + CHELSIO_GET_MEM = 1038, + CHELSIO_LOAD_FW = 1041, + CHELSIO_SET_TRACE_FILTER = 1044, + CHELSIO_SET_QSET_PARAMS = 1045, + CHELSIO_GET_QSET_PARAMS = 1046, + CHELSIO_SET_QSET_NUM = 1047, + CHELSIO_GET_QSET_NUM = 1048, +}; + +struct ch_reg { + uint32_t cmd; + uint32_t addr; + uint32_t val; +}; + +struct ch_cntxt { + uint32_t cmd; + uint32_t cntxt_type; + uint32_t cntxt_id; + uint32_t data[4]; +}; + +/* context types */ +enum { CNTXT_TYPE_EGRESS, CNTXT_TYPE_FL, CNTXT_TYPE_RSP, CNTXT_TYPE_CQ }; + +struct ch_desc { + uint32_t cmd; + uint32_t queue_num; + uint32_t idx; + uint32_t size; + uint8_t data[128]; +}; + +struct ch_mem_range { + uint32_t cmd; + uint32_t mem_id; + uint32_t addr; + uint32_t len; + uint32_t version; + uint8_t buf[0]; +}; + +struct ch_qset_params { + uint32_t cmd; + uint32_t qset_idx; + int32_t txq_size[3]; + int32_t rspq_size; + int32_t fl_size[2]; + int32_t intr_lat; + int32_t polling; + int32_t lro; + int32_t cong_thres; + int32_t vector; + int32_t qnum; +}; + +struct ch_pktsched_params { + uint32_t cmd; + uint8_t sched; + uint8_t idx; + uint8_t min; + uint8_t max; + uint8_t binding; +}; + +#ifndef TCB_SIZE +# define TCB_SIZE 128 +#endif + +/* TCB size in 32-bit words */ +#define TCB_WORDS (TCB_SIZE / 4) + +enum { MEM_CM, MEM_PMRX, MEM_PMTX }; /* ch_mem_range.mem_id values */ + +struct ch_mtus { + uint32_t cmd; + uint32_t nmtus; + uint16_t mtus[NMTUS]; +}; + +struct ch_pm { + uint32_t cmd; + uint32_t tx_pg_sz; + uint32_t tx_num_pg; + uint32_t rx_pg_sz; + uint32_t rx_num_pg; + uint32_t pm_total; +}; + +struct ch_tcam { + uint32_t cmd; + uint32_t tcam_size; + uint32_t nservers; + uint32_t nroutes; + uint32_t nfilters; +}; + +struct ch_tcb { + uint32_t cmd; + uint32_t tcb_index; + uint32_t tcb_data[TCB_WORDS]; +}; + +struct ch_tcam_word { + uint32_t cmd; + uint32_t addr; + uint32_t buf[3]; +}; + +struct ch_trace { + uint32_t cmd; + uint32_t sip; + uint32_t sip_mask; + uint32_t dip; + uint32_t dip_mask; + uint16_t sport; + uint16_t sport_mask; + uint16_t dport; + uint16_t dport_mask; + uint32_t vlan:12; + uint32_t vlan_mask:12; + uint32_t intf:4; + uint32_t intf_mask:4; + uint8_t proto; + uint8_t proto_mask; + uint8_t invert_match:1; + uint8_t config_tx:1; + uint8_t config_rx:1; + uint8_t trace_tx:1; + uint8_t trace_rx:1; +}; + +#define SIOCCHIOCTL SIOCDEVPRIVATE + +#endif diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c new file mode 100644 index 0000000..2edfdbd --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -0,0 +1,3451 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "cxgb3_ioctl.h" +#include "regs.h" +#include "cxgb3_offload.h" +#include "version.h" + +#include "cxgb3_ctl_defs.h" +#include "t3_cpl.h" +#include "firmware_exports.h" + +enum { + MAX_TXQ_ENTRIES = 16384, + MAX_CTRL_TXQ_ENTRIES = 1024, + MAX_RSPQ_ENTRIES = 16384, + MAX_RX_BUFFERS = 16384, + MAX_RX_JUMBO_BUFFERS = 16384, + MIN_TXQ_ENTRIES = 4, + MIN_CTRL_TXQ_ENTRIES = 4, + MIN_RSPQ_ENTRIES = 32, + MIN_FL_ENTRIES = 32 +}; + +#define PORT_MASK ((1 << MAX_NPORTS) - 1) + +#define DFLT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | \ + NETIF_MSG_TIMER | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP |\ + NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR) + +#define EEPROM_MAGIC 0x38E2F10C + +#define CH_DEVICE(devid, idx) \ + { PCI_VENDOR_ID_CHELSIO, devid, PCI_ANY_ID, PCI_ANY_ID, 0, 0, idx } + +static const struct pci_device_id cxgb3_pci_tbl[] = { + CH_DEVICE(0x20, 0), /* PE9000 */ + CH_DEVICE(0x21, 1), /* T302E */ + CH_DEVICE(0x22, 2), /* T310E */ + CH_DEVICE(0x23, 3), /* T320X */ + CH_DEVICE(0x24, 1), /* T302X */ + CH_DEVICE(0x25, 3), /* T320E */ + CH_DEVICE(0x26, 2), /* T310X */ + CH_DEVICE(0x30, 2), /* T3B10 */ + CH_DEVICE(0x31, 3), /* T3B20 */ + CH_DEVICE(0x32, 1), /* T3B02 */ + CH_DEVICE(0x35, 6), /* T3C20-derived T3C10 */ + CH_DEVICE(0x36, 3), /* S320E-CR */ + CH_DEVICE(0x37, 7), /* N320E-G2 */ + {0,} +}; + +MODULE_DESCRIPTION(DRV_DESC); +MODULE_AUTHOR("Chelsio Communications"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); +MODULE_DEVICE_TABLE(pci, cxgb3_pci_tbl); + +static int dflt_msg_enable = DFLT_MSG_ENABLE; + +module_param(dflt_msg_enable, int, 0644); +MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T3 default message enable bitmap"); + +/* + * The driver uses the best interrupt scheme available on a platform in the + * order MSI-X, MSI, legacy pin interrupts. This parameter determines which + * of these schemes the driver may consider as follows: + * + * msi = 2: choose from among all three options + * msi = 1: only consider MSI and pin interrupts + * msi = 0: force pin interrupts + */ +static int msi = 2; + +module_param(msi, int, 0644); +MODULE_PARM_DESC(msi, "whether to use MSI or MSI-X"); + +/* + * The driver enables offload as a default. + * To disable it, use ofld_disable = 1. + */ + +static int ofld_disable = 0; + +module_param(ofld_disable, int, 0644); +MODULE_PARM_DESC(ofld_disable, "whether to enable offload at init time or not"); + +/* + * We have work elements that we need to cancel when an interface is taken + * down. 
Normally the work elements would be executed by keventd but that + * can deadlock because of linkwatch. If our close method takes the rtnl + * lock and linkwatch is ahead of our work elements in keventd, linkwatch + * will block keventd as it needs the rtnl lock, and we'll deadlock waiting + * for our work to complete. Get our own work queue to solve this. + */ +struct workqueue_struct *cxgb3_wq; + +/** + * link_report - show link status and link speed/duplex + * @dev: the port whose settings are to be reported + * + * Shows the link status, speed, and duplex of a port. + */ +static void link_report(struct net_device *dev) +{ + if (!netif_carrier_ok(dev)) + netdev_info(dev, "link down\n"); + else { + const char *s = "10Mbps"; + const struct port_info *p = netdev_priv(dev); + + switch (p->link_config.speed) { + case SPEED_10000: + s = "10Gbps"; + break; + case SPEED_1000: + s = "1000Mbps"; + break; + case SPEED_100: + s = "100Mbps"; + break; + } + + netdev_info(dev, "link up, %s, %s-duplex\n", + s, p->link_config.duplex == DUPLEX_FULL + ? "full" : "half"); + } +} + +static void enable_tx_fifo_drain(struct adapter *adapter, + struct port_info *pi) +{ + t3_set_reg_field(adapter, A_XGM_TXFIFO_CFG + pi->mac.offset, 0, + F_ENDROPPKT); + t3_write_reg(adapter, A_XGM_RX_CTRL + pi->mac.offset, 0); + t3_write_reg(adapter, A_XGM_TX_CTRL + pi->mac.offset, F_TXEN); + t3_write_reg(adapter, A_XGM_RX_CTRL + pi->mac.offset, F_RXEN); +} + +static void disable_tx_fifo_drain(struct adapter *adapter, + struct port_info *pi) +{ + t3_set_reg_field(adapter, A_XGM_TXFIFO_CFG + pi->mac.offset, + F_ENDROPPKT, 0); +} + +void t3_os_link_fault(struct adapter *adap, int port_id, int state) +{ + struct net_device *dev = adap->port[port_id]; + struct port_info *pi = netdev_priv(dev); + + if (state == netif_carrier_ok(dev)) + return; + + if (state) { + struct cmac *mac = &pi->mac; + + netif_carrier_on(dev); + + disable_tx_fifo_drain(adap, pi); + + /* Clear local faults */ + t3_xgm_intr_disable(adap, pi->port_id); + t3_read_reg(adap, A_XGM_INT_STATUS + + pi->mac.offset); + t3_write_reg(adap, + A_XGM_INT_CAUSE + pi->mac.offset, + F_XGM_INT); + + t3_set_reg_field(adap, + A_XGM_INT_ENABLE + + pi->mac.offset, + F_XGM_INT, F_XGM_INT); + t3_xgm_intr_enable(adap, pi->port_id); + + t3_mac_enable(mac, MAC_DIRECTION_TX); + } else { + netif_carrier_off(dev); + + /* Flush TX FIFO */ + enable_tx_fifo_drain(adap, pi); + } + link_report(dev); +} + +/** + * t3_os_link_changed - handle link status changes + * @adapter: the adapter associated with the link change + * @port_id: the port index whose link status has changed + * @link_stat: the new status of the link + * @speed: the new speed setting + * @duplex: the new duplex setting + * @pause: the new flow-control setting + * + * This is the OS-dependent handler for link status changes. The OS + * neutral handler takes care of most of the processing for these events, + * then calls this handler for any OS-specific processing. + */ +void t3_os_link_changed(struct adapter *adapter, int port_id, int link_stat, + int speed, int duplex, int pause) +{ + struct net_device *dev = adapter->port[port_id]; + struct port_info *pi = netdev_priv(dev); + struct cmac *mac = &pi->mac; + + /* Skip changes from disabled ports.
*/ + if (!netif_running(dev)) + return; + + if (link_stat != netif_carrier_ok(dev)) { + if (link_stat) { + disable_tx_fifo_drain(adapter, pi); + + t3_mac_enable(mac, MAC_DIRECTION_RX); + + /* Clear local faults */ + t3_xgm_intr_disable(adapter, pi->port_id); + t3_read_reg(adapter, A_XGM_INT_STATUS + + pi->mac.offset); + t3_write_reg(adapter, + A_XGM_INT_CAUSE + pi->mac.offset, + F_XGM_INT); + + t3_set_reg_field(adapter, + A_XGM_INT_ENABLE + pi->mac.offset, + F_XGM_INT, F_XGM_INT); + t3_xgm_intr_enable(adapter, pi->port_id); + + netif_carrier_on(dev); + } else { + netif_carrier_off(dev); + + t3_xgm_intr_disable(adapter, pi->port_id); + t3_read_reg(adapter, A_XGM_INT_STATUS + pi->mac.offset); + t3_set_reg_field(adapter, + A_XGM_INT_ENABLE + pi->mac.offset, + F_XGM_INT, 0); + + if (is_10G(adapter)) + pi->phy.ops->power_down(&pi->phy, 1); + + t3_read_reg(adapter, A_XGM_INT_STATUS + pi->mac.offset); + t3_mac_disable(mac, MAC_DIRECTION_RX); + t3_link_start(&pi->phy, mac, &pi->link_config); + + /* Flush TX FIFO */ + enable_tx_fifo_drain(adapter, pi); + } + + link_report(dev); + } +} + +/** + * t3_os_phymod_changed - handle PHY module changes + * @adap: the adapter whose PHY module changed + * @port_id: the port index of the changed PHY module + * + * This is the OS-dependent handler for PHY module changes. It is + * invoked when a PHY module is removed or inserted for any OS-specific + * processing. + */ +void t3_os_phymod_changed(struct adapter *adap, int port_id) +{ + static const char *mod_str[] = { + NULL, "SR", "LR", "LRM", "TWINAX", "TWINAX", "unknown" + }; + + const struct net_device *dev = adap->port[port_id]; + const struct port_info *pi = netdev_priv(dev); + + if (pi->phy.modtype == phy_modtype_none) + netdev_info(dev, "PHY module unplugged\n"); + else + netdev_info(dev, "%s PHY module inserted\n", + mod_str[pi->phy.modtype]); +} + +static void cxgb_set_rxmode(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + + t3_mac_set_rx_mode(&pi->mac, dev); +} + +/** + * link_start - enable a port + * @dev: the device to enable + * + * Performs the MAC and PHY actions needed to enable a port. + */ +static void link_start(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct cmac *mac = &pi->mac; + + t3_mac_reset(mac); + t3_mac_set_num_ucast(mac, MAX_MAC_IDX); + t3_mac_set_mtu(mac, dev->mtu); + t3_mac_set_address(mac, LAN_MAC_IDX, dev->dev_addr); + t3_mac_set_address(mac, SAN_MAC_IDX, pi->iscsic.mac_addr); + t3_mac_set_rx_mode(mac, dev); + t3_link_start(&pi->phy, mac, &pi->link_config); + t3_mac_enable(mac, MAC_DIRECTION_RX | MAC_DIRECTION_TX); +} + +static inline void cxgb_disable_msi(struct adapter *adapter) +{ + if (adapter->flags & USING_MSIX) { + pci_disable_msix(adapter->pdev); + adapter->flags &= ~USING_MSIX; + } else if (adapter->flags & USING_MSI) { + pci_disable_msi(adapter->pdev); + adapter->flags &= ~USING_MSI; + } +} + +/* + * Interrupt handler for asynchronous events used with MSI-X. + */ +static irqreturn_t t3_async_intr_handler(int irq, void *cookie) +{ + t3_slow_intr_handler(cookie); + return IRQ_HANDLED; +} + +/* + * Name the MSI-X interrupts.
+ */ +static void name_msix_vecs(struct adapter *adap) +{ + int i, j, msi_idx = 1, n = sizeof(adap->msix_info[0].desc) - 1; + + snprintf(adap->msix_info[0].desc, n, "%s", adap->name); + adap->msix_info[0].desc[n] = 0; + + for_each_port(adap, j) { + struct net_device *d = adap->port[j]; + const struct port_info *pi = netdev_priv(d); + + for (i = 0; i < pi->nqsets; i++, msi_idx++) { + snprintf(adap->msix_info[msi_idx].desc, n, + "%s-%d", d->name, pi->first_qset + i); + adap->msix_info[msi_idx].desc[n] = 0; + } + } +} + +static int request_msix_data_irqs(struct adapter *adap) +{ + int i, j, err, qidx = 0; + + for_each_port(adap, i) { + int nqsets = adap2pinfo(adap, i)->nqsets; + + for (j = 0; j < nqsets; ++j) { + err = request_irq(adap->msix_info[qidx + 1].vec, + t3_intr_handler(adap, + adap->sge.qs[qidx]. + rspq.polling), 0, + adap->msix_info[qidx + 1].desc, + &adap->sge.qs[qidx]); + if (err) { + while (--qidx >= 0) + free_irq(adap->msix_info[qidx + 1].vec, + &adap->sge.qs[qidx]); + return err; + } + qidx++; + } + } + return 0; +} + +static void free_irq_resources(struct adapter *adapter) +{ + if (adapter->flags & USING_MSIX) { + int i, n = 0; + + free_irq(adapter->msix_info[0].vec, adapter); + for_each_port(adapter, i) + n += adap2pinfo(adapter, i)->nqsets; + + for (i = 0; i < n; ++i) + free_irq(adapter->msix_info[i + 1].vec, + &adapter->sge.qs[i]); + } else + free_irq(adapter->pdev->irq, adapter); +} + +static int await_mgmt_replies(struct adapter *adap, unsigned long init_cnt, + unsigned long n) +{ + int attempts = 10; + + while (adap->sge.qs[0].rspq.offload_pkts < init_cnt + n) { + if (!--attempts) + return -ETIMEDOUT; + msleep(10); + } + return 0; +} + +static int init_tp_parity(struct adapter *adap) +{ + int i; + struct sk_buff *skb; + struct cpl_set_tcb_field *greq; + unsigned long cnt = adap->sge.qs[0].rspq.offload_pkts; + + t3_tp_set_offload_mode(adap, 1); + + for (i = 0; i < 16; i++) { + struct cpl_smt_write_req *req; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + skb = adap->nofail_skb; + if (!skb) + goto alloc_skb_fail; + + req = __skb_put_zero(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, i)); + req->mtu_idx = NMTUS - 1; + req->iff = i; + t3_mgmt_tx(adap, skb); + if (skb == adap->nofail_skb) { + await_mgmt_replies(adap, cnt, i + 1); + adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL); + if (!adap->nofail_skb) + goto alloc_skb_fail; + } + } + + for (i = 0; i < 2048; i++) { + struct cpl_l2t_write_req *req; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + skb = adap->nofail_skb; + if (!skb) + goto alloc_skb_fail; + + req = __skb_put_zero(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, i)); + req->params = htonl(V_L2T_W_IDX(i)); + t3_mgmt_tx(adap, skb); + if (skb == adap->nofail_skb) { + await_mgmt_replies(adap, cnt, 16 + i + 1); + adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL); + if (!adap->nofail_skb) + goto alloc_skb_fail; + } + } + + for (i = 0; i < 2048; i++) { + struct cpl_rte_write_req *req; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + skb = adap->nofail_skb; + if (!skb) + goto alloc_skb_fail; + + req = __skb_put_zero(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RTE_WRITE_REQ, i)); + req->l2t_idx = htonl(V_L2T_W_IDX(i)); + t3_mgmt_tx(adap, skb); + if (skb == adap->nofail_skb) 
{ + await_mgmt_replies(adap, cnt, 16 + 2048 + i + 1); + adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL); + if (!adap->nofail_skb) + goto alloc_skb_fail; + } + } + + skb = alloc_skb(sizeof(*greq), GFP_KERNEL); + if (!skb) + skb = adap->nofail_skb; + if (!skb) + goto alloc_skb_fail; + + greq = __skb_put_zero(skb, sizeof(*greq)); + greq->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(greq) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, 0)); + greq->mask = cpu_to_be64(1); + t3_mgmt_tx(adap, skb); + + i = await_mgmt_replies(adap, cnt, 16 + 2048 + 2048 + 1); + if (skb == adap->nofail_skb) { + i = await_mgmt_replies(adap, cnt, 16 + 2048 + 2048 + 1); + adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL); + } + + t3_tp_set_offload_mode(adap, 0); + return i; + +alloc_skb_fail: + t3_tp_set_offload_mode(adap, 0); + return -ENOMEM; +} + +/** + * setup_rss - configure RSS + * @adap: the adapter + * + * Sets up RSS to distribute packets to multiple receive queues. We + * configure the RSS CPU lookup table to distribute to the number of HW + * receive queues, and the response queue lookup table to narrow that + * down to the response queues actually configured for each port. + * We always configure the RSS mapping for two ports since the mapping + * table has plenty of entries. + */ +static void setup_rss(struct adapter *adap) +{ + int i; + unsigned int nq0 = adap2pinfo(adap, 0)->nqsets; + unsigned int nq1 = adap->port[1] ? adap2pinfo(adap, 1)->nqsets : 1; + u8 cpus[SGE_QSETS + 1]; + u16 rspq_map[RSS_TABLE_SIZE + 1]; + + for (i = 0; i < SGE_QSETS; ++i) + cpus[i] = i; + cpus[SGE_QSETS] = 0xff; /* terminator */ + + for (i = 0; i < RSS_TABLE_SIZE / 2; ++i) { + rspq_map[i] = i % nq0; + rspq_map[i + RSS_TABLE_SIZE / 2] = (i % nq1) + nq0; + } + rspq_map[RSS_TABLE_SIZE] = 0xffff; /* terminator */ + + t3_config_rss(adap, F_RQFEEDBACKENABLE | F_TNLLKPEN | F_TNLMAPEN | + F_TNLPRTEN | F_TNL2TUPEN | F_TNL4TUPEN | + V_RRCPLCPUSIZE(6) | F_HASHTOEPLITZ, cpus, rspq_map); +} + +static void ring_dbs(struct adapter *adap) +{ + int i, j; + + for (i = 0; i < SGE_QSETS; i++) { + struct sge_qset *qs = &adap->sge.qs[i]; + + if (qs->adap) + for (j = 0; j < SGE_TXQ_PER_SET; j++) + t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(qs->txq[j].cntxt_id)); + } +} + +static void init_napi(struct adapter *adap) +{ + int i; + + for (i = 0; i < SGE_QSETS; i++) { + struct sge_qset *qs = &adap->sge.qs[i]; + + if (qs->adap) + netif_napi_add(qs->netdev, &qs->napi, qs->napi.poll, + 64); + } + + /* + * netif_napi_add() can be called only once per napi_struct because it + * adds each new napi_struct to a list. Be careful not to call it a + * second time, e.g., during EEH recovery, by making a note of it. + */ + adap->flags |= NAPI_INIT; +} + +/* + * Wait until all NAPI handlers are descheduled. This includes the handlers of + * both netdevices representing interfaces and the dummy ones for the extra + * queues. + */ +static void quiesce_rx(struct adapter *adap) +{ + int i; + + for (i = 0; i < SGE_QSETS; i++) + if (adap->sge.qs[i].adap) + napi_disable(&adap->sge.qs[i].napi); +} + +static void enable_all_napi(struct adapter *adap) +{ + int i; + for (i = 0; i < SGE_QSETS; i++) + if (adap->sge.qs[i].adap) + napi_enable(&adap->sge.qs[i].napi); +} + +/** + * setup_sge_qsets - configure SGE Tx/Rx/response queues + * @adap: the adapter + * + * Determines how many sets of SGE queues to use and initializes them. + * We support multiple queue sets per port if we have MSI-X, otherwise + * just one queue set per port. 
+ */ +static int setup_sge_qsets(struct adapter *adap) +{ + int i, j, err, irq_idx = 0, qset_idx = 0; + unsigned int ntxq = SGE_TXQ_PER_SET; + + if (adap->params.rev > 0 && !(adap->flags & USING_MSI)) + irq_idx = -1; + + for_each_port(adap, i) { + struct net_device *dev = adap->port[i]; + struct port_info *pi = netdev_priv(dev); + + pi->qs = &adap->sge.qs[pi->first_qset]; + for (j = 0; j < pi->nqsets; ++j, ++qset_idx) { + err = t3_sge_alloc_qset(adap, qset_idx, 1, + (adap->flags & USING_MSIX) ? qset_idx + 1 : + irq_idx, + &adap->params.sge.qset[qset_idx], ntxq, dev, + netdev_get_tx_queue(dev, j)); + if (err) { + t3_free_sge_resources(adap); + return err; + } + } + } + + return 0; +} + +static ssize_t attr_show(struct device *d, char *buf, + ssize_t(*format) (struct net_device *, char *)) +{ + ssize_t len; + + /* Synchronize with ioctls that may shut down the device */ + rtnl_lock(); + len = (*format) (to_net_dev(d), buf); + rtnl_unlock(); + return len; +} + +static ssize_t attr_store(struct device *d, + const char *buf, size_t len, + ssize_t(*set) (struct net_device *, unsigned int), + unsigned int min_val, unsigned int max_val) +{ + ssize_t ret; + unsigned int val; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + if (val < min_val || val > max_val) + return -EINVAL; + + rtnl_lock(); + ret = (*set) (to_net_dev(d), val); + if (!ret) + ret = len; + rtnl_unlock(); + return ret; +} + +#define CXGB3_SHOW(name, val_expr) \ +static ssize_t format_##name(struct net_device *dev, char *buf) \ +{ \ + struct port_info *pi = netdev_priv(dev); \ + struct adapter *adap = pi->adapter; \ + return sprintf(buf, "%u\n", val_expr); \ +} \ +static ssize_t show_##name(struct device *d, struct device_attribute *attr, \ + char *buf) \ +{ \ + return attr_show(d, buf, format_##name); \ +} + +static ssize_t set_nfilters(struct net_device *dev, unsigned int val) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + int min_tids = is_offload(adap) ? 
MC5_MIN_TIDS : 0; + + if (adap->flags & FULL_INIT_DONE) + return -EBUSY; + if (val && adap->params.rev == 0) + return -EINVAL; + if (val > t3_mc5_size(&adap->mc5) - adap->params.mc5.nservers - + min_tids) + return -EINVAL; + adap->params.mc5.nfilters = val; + return 0; +} + +static ssize_t store_nfilters(struct device *d, struct device_attribute *attr, + const char *buf, size_t len) +{ + return attr_store(d, buf, len, set_nfilters, 0, ~0); +} + +static ssize_t set_nservers(struct net_device *dev, unsigned int val) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + + if (adap->flags & FULL_INIT_DONE) + return -EBUSY; + if (val > t3_mc5_size(&adap->mc5) - adap->params.mc5.nfilters - + MC5_MIN_TIDS) + return -EINVAL; + adap->params.mc5.nservers = val; + return 0; +} + +static ssize_t store_nservers(struct device *d, struct device_attribute *attr, + const char *buf, size_t len) +{ + return attr_store(d, buf, len, set_nservers, 0, ~0); +} + +#define CXGB3_ATTR_R(name, val_expr) \ +CXGB3_SHOW(name, val_expr) \ +static DEVICE_ATTR(name, 0444, show_##name, NULL) + +#define CXGB3_ATTR_RW(name, val_expr, store_method) \ +CXGB3_SHOW(name, val_expr) \ +static DEVICE_ATTR(name, 0644, show_##name, store_method) + +CXGB3_ATTR_R(cam_size, t3_mc5_size(&adap->mc5)); +CXGB3_ATTR_RW(nfilters, adap->params.mc5.nfilters, store_nfilters); +CXGB3_ATTR_RW(nservers, adap->params.mc5.nservers, store_nservers); + +static struct attribute *cxgb3_attrs[] = { + &dev_attr_cam_size.attr, + &dev_attr_nfilters.attr, + &dev_attr_nservers.attr, + NULL +}; + +static const struct attribute_group cxgb3_attr_group = { + .attrs = cxgb3_attrs, +}; + +static ssize_t tm_attr_show(struct device *d, + char *buf, int sched) +{ + struct port_info *pi = netdev_priv(to_net_dev(d)); + struct adapter *adap = pi->adapter; + unsigned int v, addr, bpt, cpt; + ssize_t len; + + addr = A_TP_TX_MOD_Q1_Q0_RATE_LIMIT - sched / 2; + rtnl_lock(); + t3_write_reg(adap, A_TP_TM_PIO_ADDR, addr); + v = t3_read_reg(adap, A_TP_TM_PIO_DATA); + if (sched & 1) + v >>= 16; + bpt = (v >> 8) & 0xff; + cpt = v & 0xff; + if (!cpt) + len = sprintf(buf, "disabled\n"); + else { + v = (adap->params.vpd.cclk * 1000) / cpt; + len = sprintf(buf, "%u Kbps\n", (v * bpt) / 125); + } + rtnl_unlock(); + return len; +} + +static ssize_t tm_attr_store(struct device *d, + const char *buf, size_t len, int sched) +{ + struct port_info *pi = netdev_priv(to_net_dev(d)); + struct adapter *adap = pi->adapter; + unsigned int val; + ssize_t ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + if (val > 10000000) + return -EINVAL; + + rtnl_lock(); + ret = t3_config_sched(adap, val, sched); + if (!ret) + ret = len; + rtnl_unlock(); + return ret; +} + +#define TM_ATTR(name, sched) \ +static ssize_t show_##name(struct device *d, struct device_attribute *attr, \ + char *buf) \ +{ \ + return tm_attr_show(d, buf, sched); \ +} \ +static ssize_t store_##name(struct device *d, struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return tm_attr_store(d, buf, len, sched); \ +} \ +static DEVICE_ATTR(name, 0644, show_##name, store_##name) + +TM_ATTR(sched0, 0); +TM_ATTR(sched1, 1); +TM_ATTR(sched2, 2); +TM_ATTR(sched3, 3); +TM_ATTR(sched4, 4); +TM_ATTR(sched5, 5); +TM_ATTR(sched6, 6); +TM_ATTR(sched7, 7); + +static struct attribute *offload_attrs[] = { + &dev_attr_sched0.attr, + &dev_attr_sched1.attr, + &dev_attr_sched2.attr, + &dev_attr_sched3.attr, + &dev_attr_sched4.attr, + 
&dev_attr_sched5.attr, + &dev_attr_sched6.attr, + &dev_attr_sched7.attr, + NULL +}; + +static const struct attribute_group offload_attr_group = { + .attrs = offload_attrs, +}; + +/* + * Sends an sk_buff to an offload queue driver + * after dealing with any active network taps. + */ +static inline int offload_tx(struct t3cdev *tdev, struct sk_buff *skb) +{ + int ret; + + local_bh_disable(); + ret = t3_offload_tx(tdev, skb); + local_bh_enable(); + return ret; +} + +static int write_smt_entry(struct adapter *adapter, int idx) +{ + struct cpl_smt_write_req *req; + struct port_info *pi = netdev_priv(adapter->port[idx]); + struct sk_buff *skb = alloc_skb(sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + + req = __skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, idx)); + req->mtu_idx = NMTUS - 1; /* should be 0 but there's a T3 bug */ + req->iff = idx; + memcpy(req->src_mac0, adapter->port[idx]->dev_addr, ETH_ALEN); + memcpy(req->src_mac1, pi->iscsic.mac_addr, ETH_ALEN); + skb->priority = 1; + offload_tx(&adapter->tdev, skb); + return 0; +} + +static int init_smt(struct adapter *adapter) +{ + int i; + + for_each_port(adapter, i) + write_smt_entry(adapter, i); + return 0; +} + +static void init_port_mtus(struct adapter *adapter) +{ + unsigned int mtus = adapter->port[0]->mtu; + + if (adapter->port[1]) + mtus |= adapter->port[1]->mtu << 16; + t3_write_reg(adapter, A_TP_MTU_PORT_TABLE, mtus); +} + +static int send_pktsched_cmd(struct adapter *adap, int sched, int qidx, int lo, + int hi, int port) +{ + struct sk_buff *skb; + struct mngt_pktsched_wr *req; + int ret; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + skb = adap->nofail_skb; + if (!skb) + return -ENOMEM; + + req = skb_put(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_MNGT)); + req->mngt_opcode = FW_MNGTOPCODE_PKTSCHED_SET; + req->sched = sched; + req->idx = qidx; + req->min = lo; + req->max = hi; + req->binding = port; + ret = t3_mgmt_tx(adap, skb); + if (skb == adap->nofail_skb) { + adap->nofail_skb = alloc_skb(sizeof(struct cpl_set_tcb_field), + GFP_KERNEL); + if (!adap->nofail_skb) + ret = -ENOMEM; + } + + return ret; +} + +static int bind_qsets(struct adapter *adap) +{ + int i, j, err = 0; + + for_each_port(adap, i) { + const struct port_info *pi = adap2pinfo(adap, i); + + for (j = 0; j < pi->nqsets; ++j) { + int ret = send_pktsched_cmd(adap, 1, + pi->first_qset + j, -1, + -1, i); + if (ret) + err = ret; + } + } + + return err; +} + +#define FW_VERSION __stringify(FW_VERSION_MAJOR) "." \ + __stringify(FW_VERSION_MINOR) "." __stringify(FW_VERSION_MICRO) +#define FW_FNAME "cxgb3/t3fw-" FW_VERSION ".bin" +#define TPSRAM_VERSION __stringify(TP_VERSION_MAJOR) "." \ + __stringify(TP_VERSION_MINOR) "." 
__stringify(TP_VERSION_MICRO) +#define TPSRAM_NAME "cxgb3/t3%c_psram-" TPSRAM_VERSION ".bin" +#define AEL2005_OPT_EDC_NAME "cxgb3/ael2005_opt_edc.bin" +#define AEL2005_TWX_EDC_NAME "cxgb3/ael2005_twx_edc.bin" +#define AEL2020_TWX_EDC_NAME "cxgb3/ael2020_twx_edc.bin" +MODULE_FIRMWARE(FW_FNAME); +MODULE_FIRMWARE("cxgb3/t3b_psram-" TPSRAM_VERSION ".bin"); +MODULE_FIRMWARE("cxgb3/t3c_psram-" TPSRAM_VERSION ".bin"); +MODULE_FIRMWARE(AEL2005_OPT_EDC_NAME); +MODULE_FIRMWARE(AEL2005_TWX_EDC_NAME); +MODULE_FIRMWARE(AEL2020_TWX_EDC_NAME); + +static inline const char *get_edc_fw_name(int edc_idx) +{ + const char *fw_name = NULL; + + switch (edc_idx) { + case EDC_OPT_AEL2005: + fw_name = AEL2005_OPT_EDC_NAME; + break; + case EDC_TWX_AEL2005: + fw_name = AEL2005_TWX_EDC_NAME; + break; + case EDC_TWX_AEL2020: + fw_name = AEL2020_TWX_EDC_NAME; + break; + } + return fw_name; +} + +int t3_get_edc_fw(struct cphy *phy, int edc_idx, int size) +{ + struct adapter *adapter = phy->adapter; + const struct firmware *fw; + const char *fw_name; + u32 csum; + const __be32 *p; + u16 *cache = phy->phy_cache; + int i, ret = -EINVAL; + + fw_name = get_edc_fw_name(edc_idx); + if (fw_name) + ret = request_firmware(&fw, fw_name, &adapter->pdev->dev); + if (ret < 0) { + dev_err(&adapter->pdev->dev, + "could not upgrade firmware: unable to load %s\n", + fw_name); + return ret; + } + + /* check size, take checksum in account */ + if (fw->size > size + 4) { + CH_ERR(adapter, "firmware image too large %u, expected %d\n", + (unsigned int)fw->size, size + 4); + ret = -EINVAL; + } + + /* compute checksum */ + p = (const __be32 *)fw->data; + for (csum = 0, i = 0; i < fw->size / sizeof(csum); i++) + csum += ntohl(p[i]); + + if (csum != 0xffffffff) { + CH_ERR(adapter, "corrupted firmware image, checksum %u\n", + csum); + ret = -EINVAL; + } + + for (i = 0; i < size / 4 ; i++) { + *cache++ = (be32_to_cpu(p[i]) & 0xffff0000) >> 16; + *cache++ = be32_to_cpu(p[i]) & 0xffff; + } + + release_firmware(fw); + + return ret; +} + +static int upgrade_fw(struct adapter *adap) +{ + int ret; + const struct firmware *fw; + struct device *dev = &adap->pdev->dev; + + ret = request_firmware(&fw, FW_FNAME, dev); + if (ret < 0) { + dev_err(dev, "could not upgrade firmware: unable to load %s\n", + FW_FNAME); + return ret; + } + ret = t3_load_fw(adap, fw->data, fw->size); + release_firmware(fw); + + if (ret == 0) + dev_info(dev, "successful upgrade to firmware %d.%d.%d\n", + FW_VERSION_MAJOR, FW_VERSION_MINOR, FW_VERSION_MICRO); + else + dev_err(dev, "failed to upgrade to firmware %d.%d.%d\n", + FW_VERSION_MAJOR, FW_VERSION_MINOR, FW_VERSION_MICRO); + + return ret; +} + +static inline char t3rev2char(struct adapter *adapter) +{ + char rev = 0; + + switch(adapter->params.rev) { + case T3_REV_B: + case T3_REV_B2: + rev = 'b'; + break; + case T3_REV_C: + rev = 'c'; + break; + } + return rev; +} + +static int update_tpsram(struct adapter *adap) +{ + const struct firmware *tpsram; + char buf[64]; + struct device *dev = &adap->pdev->dev; + int ret; + char rev; + + rev = t3rev2char(adap); + if (!rev) + return 0; + + snprintf(buf, sizeof(buf), TPSRAM_NAME, rev); + + ret = request_firmware(&tpsram, buf, dev); + if (ret < 0) { + dev_err(dev, "could not load TP SRAM: unable to load %s\n", + buf); + return ret; + } + + ret = t3_check_tpsram(adap, tpsram->data, tpsram->size); + if (ret) + goto release_tpsram; + + ret = t3_set_proto_sram(adap, tpsram->data); + if (ret == 0) + dev_info(dev, + "successful update of protocol engine " + "to %d.%d.%d\n", + TP_VERSION_MAJOR, 
TP_VERSION_MINOR, TP_VERSION_MICRO); + else + dev_err(dev, "failed to update of protocol engine %d.%d.%d\n", + TP_VERSION_MAJOR, TP_VERSION_MINOR, TP_VERSION_MICRO); + if (ret) + dev_err(dev, "loading protocol SRAM failed\n"); + +release_tpsram: + release_firmware(tpsram); + + return ret; +} + +/** + * t3_synchronize_rx - wait for current Rx processing on a port to complete + * @adap: the adapter + * @p: the port + * + * Ensures that current Rx processing on any of the queues associated with + * the given port completes before returning. We do this by acquiring and + * releasing the locks of the response queues associated with the port. + */ +static void t3_synchronize_rx(struct adapter *adap, const struct port_info *p) +{ + int i; + + for (i = p->first_qset; i < p->first_qset + p->nqsets; i++) { + struct sge_rspq *q = &adap->sge.qs[i].rspq; + + spin_lock_irq(&q->lock); + spin_unlock_irq(&q->lock); + } +} + +static void cxgb_vlan_mode(struct net_device *dev, netdev_features_t features) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + if (adapter->params.rev > 0) { + t3_set_vlan_accel(adapter, 1 << pi->port_id, + features & NETIF_F_HW_VLAN_CTAG_RX); + } else { + /* single control for all ports */ + unsigned int i, have_vlans = features & NETIF_F_HW_VLAN_CTAG_RX; + + for_each_port(adapter, i) + have_vlans |= + adapter->port[i]->features & + NETIF_F_HW_VLAN_CTAG_RX; + + t3_set_vlan_accel(adapter, 1, have_vlans); + } + t3_synchronize_rx(adapter, pi); +} + +/** + * cxgb_up - enable the adapter + * @adapter: adapter being enabled + * + * Called when the first port is enabled, this function performs the + * actions necessary to make an adapter operational, such as completing + * the initialization of HW modules, and enabling interrupts. + * + * Must be called with the rtnl lock held. + */ +static int cxgb_up(struct adapter *adap) +{ + int i, err; + + if (!(adap->flags & FULL_INIT_DONE)) { + err = t3_check_fw_version(adap); + if (err == -EINVAL) { + err = upgrade_fw(adap); + CH_WARN(adap, "FW upgrade to %d.%d.%d %s\n", + FW_VERSION_MAJOR, FW_VERSION_MINOR, + FW_VERSION_MICRO, err ? "failed" : "succeeded"); + } + + err = t3_check_tpsram_version(adap); + if (err == -EINVAL) { + err = update_tpsram(adap); + CH_WARN(adap, "TP upgrade to %d.%d.%d %s\n", + TP_VERSION_MAJOR, TP_VERSION_MINOR, + TP_VERSION_MICRO, err ? "failed" : "succeeded"); + } + + /* + * Clear interrupts now to catch errors if t3_init_hw fails. + * We clear them again later as initialization may trigger + * conditions that can interrupt. + */ + t3_intr_clear(adap); + + err = t3_init_hw(adap, 0); + if (err) + goto out; + + t3_set_reg_field(adap, A_TP_PARA_REG5, 0, F_RXDDPOFFINIT); + t3_write_reg(adap, A_ULPRX_TDDP_PSZ, V_HPZ0(PAGE_SHIFT - 12)); + + err = setup_sge_qsets(adap); + if (err) + goto out; + + for_each_port(adap, i) + cxgb_vlan_mode(adap->port[i], adap->port[i]->features); + + setup_rss(adap); + if (!(adap->flags & NAPI_INIT)) + init_napi(adap); + + t3_start_sge_timers(adap); + adap->flags |= FULL_INIT_DONE; + } + + t3_intr_clear(adap); + + if (adap->flags & USING_MSIX) { + name_msix_vecs(adap); + err = request_irq(adap->msix_info[0].vec, + t3_async_intr_handler, 0, + adap->msix_info[0].desc, adap); + if (err) + goto irq_err; + + err = request_msix_data_irqs(adap); + if (err) { + free_irq(adap->msix_info[0].vec, adap); + goto irq_err; + } + } else if ((err = request_irq(adap->pdev->irq, + t3_intr_handler(adap, + adap->sge.qs[0].rspq. + polling), + (adap->flags & USING_MSI) ? 
+ 0 : IRQF_SHARED, + adap->name, adap))) + goto irq_err; + + enable_all_napi(adap); + t3_sge_start(adap); + t3_intr_enable(adap); + + if (adap->params.rev >= T3_REV_C && !(adap->flags & TP_PARITY_INIT) && + is_offload(adap) && init_tp_parity(adap) == 0) + adap->flags |= TP_PARITY_INIT; + + if (adap->flags & TP_PARITY_INIT) { + t3_write_reg(adap, A_TP_INT_CAUSE, + F_CMCACHEPERR | F_ARPLUTPERR); + t3_write_reg(adap, A_TP_INT_ENABLE, 0x7fbfffff); + } + + if (!(adap->flags & QUEUES_BOUND)) { + int ret = bind_qsets(adap); + + if (ret < 0) { + CH_ERR(adap, "failed to bind qsets, err %d\n", ret); + t3_intr_disable(adap); + free_irq_resources(adap); + err = ret; + goto out; + } + adap->flags |= QUEUES_BOUND; + } + +out: + return err; +irq_err: + CH_ERR(adap, "request_irq failed, err %d\n", err); + goto out; +} + +/* + * Release resources when all the ports and offloading have been stopped. + */ +static void cxgb_down(struct adapter *adapter, int on_wq) +{ + t3_sge_stop(adapter); + spin_lock_irq(&adapter->work_lock); /* sync with PHY intr task */ + t3_intr_disable(adapter); + spin_unlock_irq(&adapter->work_lock); + + free_irq_resources(adapter); + quiesce_rx(adapter); + t3_sge_stop(adapter); + if (!on_wq) + flush_workqueue(cxgb3_wq);/* wait for external IRQ handler */ +} + +static void schedule_chk_task(struct adapter *adap) +{ + unsigned int timeo; + + timeo = adap->params.linkpoll_period ? + (HZ * adap->params.linkpoll_period) / 10 : + adap->params.stats_update_period * HZ; + if (timeo) + queue_delayed_work(cxgb3_wq, &adap->adap_check_task, timeo); +} + +static int offload_open(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct t3cdev *tdev = dev2t3cdev(dev); + int adap_up = adapter->open_device_map & PORT_MASK; + int err; + + if (test_and_set_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) + return 0; + + if (!adap_up && (err = cxgb_up(adapter)) < 0) + goto out; + + t3_tp_set_offload_mode(adapter, 1); + tdev->lldev = adapter->port[0]; + err = cxgb3_offload_activate(adapter); + if (err) + goto out; + + init_port_mtus(adapter); + t3_load_mtus(adapter, adapter->params.mtus, adapter->params.a_wnd, + adapter->params.b_wnd, + adapter->params.rev == 0 ? 
+ adapter->port[0]->mtu : 0xffff); + init_smt(adapter); + + if (sysfs_create_group(&tdev->lldev->dev.kobj, &offload_attr_group)) + dev_dbg(&dev->dev, "cannot create sysfs group\n"); + + /* Call back all registered clients */ + cxgb3_add_clients(tdev); + +out: + /* restore them in case the offload module has changed them */ + if (err) { + t3_tp_set_offload_mode(adapter, 0); + clear_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map); + cxgb3_set_dummy_ops(tdev); + } + return err; +} + +static int offload_close(struct t3cdev *tdev) +{ + struct adapter *adapter = tdev2adap(tdev); + struct t3c_data *td = T3C_DATA(tdev); + + if (!test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) + return 0; + + /* Call back all registered clients */ + cxgb3_remove_clients(tdev); + + sysfs_remove_group(&tdev->lldev->dev.kobj, &offload_attr_group); + + /* Flush work scheduled while releasing TIDs */ + flush_work(&td->tid_release_task); + + tdev->lldev = NULL; + cxgb3_set_dummy_ops(tdev); + t3_tp_set_offload_mode(adapter, 0); + clear_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map); + + if (!adapter->open_device_map) + cxgb_down(adapter, 0); + + cxgb3_offload_deactivate(adapter); + return 0; +} + +static int cxgb_open(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int other_ports = adapter->open_device_map & PORT_MASK; + int err; + + if (!adapter->open_device_map && (err = cxgb_up(adapter)) < 0) + return err; + + set_bit(pi->port_id, &adapter->open_device_map); + if (is_offload(adapter) && !ofld_disable) { + err = offload_open(dev); + if (err) + pr_warn("Could not initialize offload capabilities\n"); + } + + netif_set_real_num_tx_queues(dev, pi->nqsets); + err = netif_set_real_num_rx_queues(dev, pi->nqsets); + if (err) + return err; + link_start(dev); + t3_port_intr_enable(adapter, pi->port_id); + netif_tx_start_all_queues(dev); + if (!other_ports) + schedule_chk_task(adapter); + + cxgb3_event_notify(&adapter->tdev, OFFLOAD_PORT_UP, pi->port_id); + return 0; +} + +static int __cxgb_close(struct net_device *dev, int on_wq) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + + if (!adapter->open_device_map) + return 0; + + /* Stop link fault interrupts */ + t3_xgm_intr_disable(adapter, pi->port_id); + t3_read_reg(adapter, A_XGM_INT_STATUS + pi->mac.offset); + + t3_port_intr_disable(adapter, pi->port_id); + netif_tx_stop_all_queues(dev); + pi->phy.ops->power_down(&pi->phy, 1); + netif_carrier_off(dev); + t3_mac_disable(&pi->mac, MAC_DIRECTION_TX | MAC_DIRECTION_RX); + + spin_lock_irq(&adapter->work_lock); /* sync with update task */ + clear_bit(pi->port_id, &adapter->open_device_map); + spin_unlock_irq(&adapter->work_lock); + + if (!(adapter->open_device_map & PORT_MASK)) + cancel_delayed_work_sync(&adapter->adap_check_task); + + if (!adapter->open_device_map) + cxgb_down(adapter, on_wq); + + cxgb3_event_notify(&adapter->tdev, OFFLOAD_PORT_DOWN, pi->port_id); + return 0; +} + +static int cxgb_close(struct net_device *dev) +{ + return __cxgb_close(dev, 0); +} + +static struct net_device_stats *cxgb_get_stats(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct net_device_stats *ns = &dev->stats; + const struct mac_stats *pstats; + + spin_lock(&adapter->stats_lock); + pstats = t3_mac_update_stats(&pi->mac); + spin_unlock(&adapter->stats_lock); + + ns->tx_bytes = pstats->tx_octets; + ns->tx_packets = pstats->tx_frames; + ns->rx_bytes = 
pstats->rx_octets; + ns->rx_packets = pstats->rx_frames; + ns->multicast = pstats->rx_mcast_frames; + + ns->tx_errors = pstats->tx_underrun; + ns->rx_errors = pstats->rx_symbol_errs + pstats->rx_fcs_errs + + pstats->rx_too_long + pstats->rx_jabber + pstats->rx_short + + pstats->rx_fifo_ovfl; + + /* detailed rx_errors */ + ns->rx_length_errors = pstats->rx_jabber + pstats->rx_too_long; + ns->rx_over_errors = 0; + ns->rx_crc_errors = pstats->rx_fcs_errs; + ns->rx_frame_errors = pstats->rx_symbol_errs; + ns->rx_fifo_errors = pstats->rx_fifo_ovfl; + ns->rx_missed_errors = pstats->rx_cong_drops; + + /* detailed tx_errors */ + ns->tx_aborted_errors = 0; + ns->tx_carrier_errors = 0; + ns->tx_fifo_errors = pstats->tx_underrun; + ns->tx_heartbeat_errors = 0; + ns->tx_window_errors = 0; + return ns; +} + +static u32 get_msglevel(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + return adapter->msg_enable; +} + +static void set_msglevel(struct net_device *dev, u32 val) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + adapter->msg_enable = val; +} + +static const char stats_strings[][ETH_GSTRING_LEN] = { + "TxOctetsOK ", + "TxFramesOK ", + "TxMulticastFramesOK", + "TxBroadcastFramesOK", + "TxPauseFrames ", + "TxUnderrun ", + "TxExtUnderrun ", + + "TxFrames64 ", + "TxFrames65To127 ", + "TxFrames128To255 ", + "TxFrames256To511 ", + "TxFrames512To1023 ", + "TxFrames1024To1518 ", + "TxFrames1519ToMax ", + + "RxOctetsOK ", + "RxFramesOK ", + "RxMulticastFramesOK", + "RxBroadcastFramesOK", + "RxPauseFrames ", + "RxFCSErrors ", + "RxSymbolErrors ", + "RxShortErrors ", + "RxJabberErrors ", + "RxLengthErrors ", + "RxFIFOoverflow ", + + "RxFrames64 ", + "RxFrames65To127 ", + "RxFrames128To255 ", + "RxFrames256To511 ", + "RxFrames512To1023 ", + "RxFrames1024To1518 ", + "RxFrames1519ToMax ", + + "PhyFIFOErrors ", + "TSO ", + "VLANextractions ", + "VLANinsertions ", + "TxCsumOffload ", + "RxCsumGood ", + "LroAggregated ", + "LroFlushed ", + "LroNoDesc ", + "RxDrops ", + + "CheckTXEnToggled ", + "CheckResets ", + + "LinkFaults ", +}; + +static int get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return ARRAY_SIZE(stats_strings); + default: + return -EOPNOTSUPP; + } +} + +#define T3_REGMAP_SIZE (3 * 1024) + +static int get_regs_len(struct net_device *dev) +{ + return T3_REGMAP_SIZE; +} + +static int get_eeprom_len(struct net_device *dev) +{ + return EEPROMSIZE; +} + +static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + u32 fw_vers = 0; + u32 tp_vers = 0; + + spin_lock(&adapter->stats_lock); + t3_get_fw_version(adapter, &fw_vers); + t3_get_tp_version(adapter, &tp_vers); + spin_unlock(&adapter->stats_lock); + + strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); + strlcpy(info->version, DRV_VERSION, sizeof(info->version)); + strlcpy(info->bus_info, pci_name(adapter->pdev), + sizeof(info->bus_info)); + if (fw_vers) + snprintf(info->fw_version, sizeof(info->fw_version), + "%s %u.%u.%u TP %u.%u.%u", + G_FW_VERSION_TYPE(fw_vers) ? 
"T" : "N", + G_FW_VERSION_MAJOR(fw_vers), + G_FW_VERSION_MINOR(fw_vers), + G_FW_VERSION_MICRO(fw_vers), + G_TP_VERSION_MAJOR(tp_vers), + G_TP_VERSION_MINOR(tp_vers), + G_TP_VERSION_MICRO(tp_vers)); +} + +static void get_strings(struct net_device *dev, u32 stringset, u8 * data) +{ + if (stringset == ETH_SS_STATS) + memcpy(data, stats_strings, sizeof(stats_strings)); +} + +static unsigned long collect_sge_port_stats(struct adapter *adapter, + struct port_info *p, int idx) +{ + int i; + unsigned long tot = 0; + + for (i = p->first_qset; i < p->first_qset + p->nqsets; ++i) + tot += adapter->sge.qs[i].port_stats[idx]; + return tot; +} + +static void get_stats(struct net_device *dev, struct ethtool_stats *stats, + u64 *data) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + const struct mac_stats *s; + + spin_lock(&adapter->stats_lock); + s = t3_mac_update_stats(&pi->mac); + spin_unlock(&adapter->stats_lock); + + *data++ = s->tx_octets; + *data++ = s->tx_frames; + *data++ = s->tx_mcast_frames; + *data++ = s->tx_bcast_frames; + *data++ = s->tx_pause; + *data++ = s->tx_underrun; + *data++ = s->tx_fifo_urun; + + *data++ = s->tx_frames_64; + *data++ = s->tx_frames_65_127; + *data++ = s->tx_frames_128_255; + *data++ = s->tx_frames_256_511; + *data++ = s->tx_frames_512_1023; + *data++ = s->tx_frames_1024_1518; + *data++ = s->tx_frames_1519_max; + + *data++ = s->rx_octets; + *data++ = s->rx_frames; + *data++ = s->rx_mcast_frames; + *data++ = s->rx_bcast_frames; + *data++ = s->rx_pause; + *data++ = s->rx_fcs_errs; + *data++ = s->rx_symbol_errs; + *data++ = s->rx_short; + *data++ = s->rx_jabber; + *data++ = s->rx_too_long; + *data++ = s->rx_fifo_ovfl; + + *data++ = s->rx_frames_64; + *data++ = s->rx_frames_65_127; + *data++ = s->rx_frames_128_255; + *data++ = s->rx_frames_256_511; + *data++ = s->rx_frames_512_1023; + *data++ = s->rx_frames_1024_1518; + *data++ = s->rx_frames_1519_max; + + *data++ = pi->phy.fifo_errors; + + *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TSO); + *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_VLANEX); + *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_VLANINS); + *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TX_CSUM); + *data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_RX_CSUM_GOOD); + *data++ = 0; + *data++ = 0; + *data++ = 0; + *data++ = s->rx_cong_drops; + + *data++ = s->num_toggled; + *data++ = s->num_resets; + + *data++ = s->link_faults; +} + +static inline void reg_block_dump(struct adapter *ap, void *buf, + unsigned int start, unsigned int end) +{ + u32 *p = buf + start; + + for (; start <= end; start += sizeof(u32)) + *p++ = t3_read_reg(ap, start); +} + +static void get_regs(struct net_device *dev, struct ethtool_regs *regs, + void *buf) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *ap = pi->adapter; + + /* + * Version scheme: + * bits 0..9: chip version + * bits 10..15: chip revision + * bit 31: set for PCIe cards + */ + regs->version = 3 | (ap->params.rev << 10) | (is_pcie(ap) << 31); + + /* + * We skip the MAC statistics registers because they are clear-on-read. + * Also reading multi-register stats would need to synchronize with the + * periodic mac stats accumulation. Hard to justify the complexity. 
+ */ + memset(buf, 0, T3_REGMAP_SIZE); + reg_block_dump(ap, buf, 0, A_SG_RSPQ_CREDIT_RETURN); + reg_block_dump(ap, buf, A_SG_HI_DRB_HI_THRSH, A_ULPRX_PBL_ULIMIT); + reg_block_dump(ap, buf, A_ULPTX_CONFIG, A_MPS_INT_CAUSE); + reg_block_dump(ap, buf, A_CPL_SWITCH_CNTRL, A_CPL_MAP_TBL_DATA); + reg_block_dump(ap, buf, A_SMB_GLOBAL_TIME_CFG, A_XGM_SERDES_STAT3); + reg_block_dump(ap, buf, A_XGM_SERDES_STATUS0, + XGM_REG(A_XGM_SERDES_STAT3, 1)); + reg_block_dump(ap, buf, XGM_REG(A_XGM_SERDES_STATUS0, 1), + XGM_REG(A_XGM_RX_SPI4_SOP_EOP_CNT, 1)); +} + +static int restart_autoneg(struct net_device *dev) +{ + struct port_info *p = netdev_priv(dev); + + if (!netif_running(dev)) + return -EAGAIN; + if (p->link_config.autoneg != AUTONEG_ENABLE) + return -EINVAL; + p->phy.ops->autoneg_restart(&p->phy); + return 0; +} + +static int set_phys_id(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + return 1; /* cycle on/off once per second */ + + case ETHTOOL_ID_OFF: + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL, 0); + break; + + case ETHTOOL_ID_ON: + case ETHTOOL_ID_INACTIVE: + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL, + F_GPIO0_OUT_VAL); + } + + return 0; +} + +static int get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct port_info *p = netdev_priv(dev); + u32 supported; + + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, + p->link_config.supported); + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, + p->link_config.advertising); + + if (netif_carrier_ok(dev)) { + cmd->base.speed = p->link_config.speed; + cmd->base.duplex = p->link_config.duplex; + } else { + cmd->base.speed = SPEED_UNKNOWN; + cmd->base.duplex = DUPLEX_UNKNOWN; + } + + ethtool_convert_link_mode_to_legacy_u32(&supported, + cmd->link_modes.supported); + + cmd->base.port = (supported & SUPPORTED_TP) ? PORT_TP : PORT_FIBRE; + cmd->base.phy_address = p->phy.mdio.prtad; + cmd->base.autoneg = p->link_config.autoneg; + return 0; +} + +static int speed_duplex_to_caps(int speed, int duplex) +{ + int cap = 0; + + switch (speed) { + case SPEED_10: + if (duplex == DUPLEX_FULL) + cap = SUPPORTED_10baseT_Full; + else + cap = SUPPORTED_10baseT_Half; + break; + case SPEED_100: + if (duplex == DUPLEX_FULL) + cap = SUPPORTED_100baseT_Full; + else + cap = SUPPORTED_100baseT_Half; + break; + case SPEED_1000: + if (duplex == DUPLEX_FULL) + cap = SUPPORTED_1000baseT_Full; + else + cap = SUPPORTED_1000baseT_Half; + break; + case SPEED_10000: + if (duplex == DUPLEX_FULL) + cap = SUPPORTED_10000baseT_Full; + } + return cap; +} + +#define ADVERTISED_MASK (ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full | \ + ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full | \ + ADVERTISED_1000baseT_Half | ADVERTISED_1000baseT_Full | \ + ADVERTISED_10000baseT_Full) + +static int set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) +{ + struct port_info *p = netdev_priv(dev); + struct link_config *lc = &p->link_config; + u32 advertising; + + ethtool_convert_link_mode_to_legacy_u32(&advertising, + cmd->link_modes.advertising); + + if (!(lc->supported & SUPPORTED_Autoneg)) { + /* + * PHY offers a single speed/duplex. See if that's what's + * being requested. 
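+ * For such PHYs the only request we can honour is a forced setting
+ * that matches the one speed/duplex pair the hardware supports (for
+ * example 10000/full on a 10G module); anything else, including a
+ * request to autonegotiate, is rejected with -EINVAL.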
+ */ + if (cmd->base.autoneg == AUTONEG_DISABLE) { + u32 speed = cmd->base.speed; + int cap = speed_duplex_to_caps(speed, cmd->base.duplex); + if (lc->supported & cap) + return 0; + } + return -EINVAL; + } + + if (cmd->base.autoneg == AUTONEG_DISABLE) { + u32 speed = cmd->base.speed; + int cap = speed_duplex_to_caps(speed, cmd->base.duplex); + + if (!(lc->supported & cap) || (speed == SPEED_1000)) + return -EINVAL; + lc->requested_speed = speed; + lc->requested_duplex = cmd->base.duplex; + lc->advertising = 0; + } else { + advertising &= ADVERTISED_MASK; + advertising &= lc->supported; + if (!advertising) + return -EINVAL; + lc->requested_speed = SPEED_INVALID; + lc->requested_duplex = DUPLEX_INVALID; + lc->advertising = advertising | ADVERTISED_Autoneg; + } + lc->autoneg = cmd->base.autoneg; + if (netif_running(dev)) + t3_link_start(&p->phy, &p->mac, lc); + return 0; +} + +static void get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct port_info *p = netdev_priv(dev); + + epause->autoneg = (p->link_config.requested_fc & PAUSE_AUTONEG) != 0; + epause->rx_pause = (p->link_config.fc & PAUSE_RX) != 0; + epause->tx_pause = (p->link_config.fc & PAUSE_TX) != 0; +} + +static int set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct port_info *p = netdev_priv(dev); + struct link_config *lc = &p->link_config; + + if (epause->autoneg == AUTONEG_DISABLE) + lc->requested_fc = 0; + else if (lc->supported & SUPPORTED_Autoneg) + lc->requested_fc = PAUSE_AUTONEG; + else + return -EINVAL; + + if (epause->rx_pause) + lc->requested_fc |= PAUSE_RX; + if (epause->tx_pause) + lc->requested_fc |= PAUSE_TX; + if (lc->autoneg == AUTONEG_ENABLE) { + if (netif_running(dev)) + t3_link_start(&p->phy, &p->mac, lc); + } else { + lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); + if (netif_running(dev)) + t3_mac_set_speed_duplex_fc(&p->mac, -1, -1, lc->fc); + } + return 0; +} + +static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + const struct qset_params *q = &adapter->params.sge.qset[pi->first_qset]; + + e->rx_max_pending = MAX_RX_BUFFERS; + e->rx_jumbo_max_pending = MAX_RX_JUMBO_BUFFERS; + e->tx_max_pending = MAX_TXQ_ENTRIES; + + e->rx_pending = q->fl_size; + e->rx_mini_pending = q->rspq_size; + e->rx_jumbo_pending = q->jumbo_size; + e->tx_pending = q->txq_size[0]; +} + +static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct qset_params *q; + int i; + + if (e->rx_pending > MAX_RX_BUFFERS || + e->rx_jumbo_pending > MAX_RX_JUMBO_BUFFERS || + e->tx_pending > MAX_TXQ_ENTRIES || + e->rx_mini_pending > MAX_RSPQ_ENTRIES || + e->rx_mini_pending < MIN_RSPQ_ENTRIES || + e->rx_pending < MIN_FL_ENTRIES || + e->rx_jumbo_pending < MIN_FL_ENTRIES || + e->tx_pending < adapter->params.nports * MIN_TXQ_ENTRIES) + return -EINVAL; + + if (adapter->flags & FULL_INIT_DONE) + return -EBUSY; + + q = &adapter->params.sge.qset[pi->first_qset]; + for (i = 0; i < pi->nqsets; ++i, ++q) { + q->rspq_size = e->rx_mini_pending; + q->fl_size = e->rx_pending; + q->jumbo_size = e->rx_jumbo_pending; + q->txq_size[0] = e->tx_pending; + q->txq_size[1] = e->tx_pending; + q->txq_size[2] = e->tx_pending; + } + return 0; +} + +static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c) +{ + struct port_info *pi = netdev_priv(dev); + struct 
adapter *adapter = pi->adapter; + struct qset_params *qsp; + struct sge_qset *qs; + int i; + + if (c->rx_coalesce_usecs * 10 > M_NEWTIMER) + return -EINVAL; + + for (i = 0; i < pi->nqsets; i++) { + qsp = &adapter->params.sge.qset[i]; + qs = &adapter->sge.qs[i]; + qsp->coalesce_usecs = c->rx_coalesce_usecs; + t3_update_qset_coalesce(qs, qsp); + } + + return 0; +} + +static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct qset_params *q = adapter->params.sge.qset; + + c->rx_coalesce_usecs = q->coalesce_usecs; + return 0; +} + +static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e, + u8 * data) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int i, err = 0; + + u8 *buf = kmalloc(EEPROMSIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + e->magic = EEPROM_MAGIC; + for (i = e->offset & ~3; !err && i < e->offset + e->len; i += 4) + err = t3_seeprom_read(adapter, i, (__le32 *) & buf[i]); + + if (!err) + memcpy(data, buf + e->offset, e->len); + kfree(buf); + return err; +} + +static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, + u8 * data) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + u32 aligned_offset, aligned_len; + __le32 *p; + u8 *buf; + int err; + + if (eeprom->magic != EEPROM_MAGIC) + return -EINVAL; + + aligned_offset = eeprom->offset & ~3; + aligned_len = (eeprom->len + (eeprom->offset & 3) + 3) & ~3; + + if (aligned_offset != eeprom->offset || aligned_len != eeprom->len) { + buf = kmalloc(aligned_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + err = t3_seeprom_read(adapter, aligned_offset, (__le32 *) buf); + if (!err && aligned_len > 4) + err = t3_seeprom_read(adapter, + aligned_offset + aligned_len - 4, + (__le32 *) & buf[aligned_len - 4]); + if (err) + goto out; + memcpy(buf + (eeprom->offset & 3), data, eeprom->len); + } else + buf = data; + + err = t3_seeprom_wp(adapter, 0); + if (err) + goto out; + + for (p = (__le32 *) buf; !err && aligned_len; aligned_len -= 4, p++) { + err = t3_seeprom_write(adapter, aligned_offset, *p); + aligned_offset += 4; + } + + if (!err) + err = t3_seeprom_wp(adapter, 1); +out: + if (buf != data) + kfree(buf); + return err; +} + +static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + wol->supported = 0; + wol->wolopts = 0; + memset(&wol->sopass, 0, sizeof(wol->sopass)); +} + +static const struct ethtool_ops cxgb_ethtool_ops = { + .get_drvinfo = get_drvinfo, + .get_msglevel = get_msglevel, + .set_msglevel = set_msglevel, + .get_ringparam = get_sge_param, + .set_ringparam = set_sge_param, + .get_coalesce = get_coalesce, + .set_coalesce = set_coalesce, + .get_eeprom_len = get_eeprom_len, + .get_eeprom = get_eeprom, + .set_eeprom = set_eeprom, + .get_pauseparam = get_pauseparam, + .set_pauseparam = set_pauseparam, + .get_link = ethtool_op_get_link, + .get_strings = get_strings, + .set_phys_id = set_phys_id, + .nway_reset = restart_autoneg, + .get_sset_count = get_sset_count, + .get_ethtool_stats = get_stats, + .get_regs_len = get_regs_len, + .get_regs = get_regs, + .get_wol = get_wol, + .get_link_ksettings = get_link_ksettings, + .set_link_ksettings = set_link_ksettings, +}; + +static int in_range(int val, int lo, int hi) +{ + return val < 0 || (val <= hi && val >= lo); +} + +static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) +{ + struct port_info *pi = netdev_priv(dev); 
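+ /*
+ * SIOCCHIOCTL hands us a pointer to a command block whose first
+ * 32-bit word selects the operation; each case below then copies in
+ * the full per-command structure and, where the command returns
+ * data, copies it back out.  For illustration only (the layouts and
+ * macros are assumed to come from cxgb3_ioctl.h, and the device name
+ * and fd are placeholders), querying the queue set count from user
+ * space looks roughly like:
+ *
+ * struct ch_reg r = { .cmd = CHELSIO_GET_QSET_NUM };
+ * struct ifreq ifr;
+ *
+ * strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
+ * ifr.ifr_data = (void *)&r;
+ * ioctl(sock_fd, SIOCCHIOCTL, &ifr);
+ *
+ * On success r.val holds the port's queue set count.
+ */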
+ struct adapter *adapter = pi->adapter; + u32 cmd; + int ret; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + switch (cmd) { + case CHELSIO_SET_QSET_PARAMS:{ + int i; + struct qset_params *q; + struct ch_qset_params t; + int q1 = pi->first_qset; + int nqsets = pi->nqsets; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&t, useraddr, sizeof(t))) + return -EFAULT; + if (t.qset_idx >= SGE_QSETS) + return -EINVAL; + if (!in_range(t.intr_lat, 0, M_NEWTIMER) || + !in_range(t.cong_thres, 0, 255) || + !in_range(t.txq_size[0], MIN_TXQ_ENTRIES, + MAX_TXQ_ENTRIES) || + !in_range(t.txq_size[1], MIN_TXQ_ENTRIES, + MAX_TXQ_ENTRIES) || + !in_range(t.txq_size[2], MIN_CTRL_TXQ_ENTRIES, + MAX_CTRL_TXQ_ENTRIES) || + !in_range(t.fl_size[0], MIN_FL_ENTRIES, + MAX_RX_BUFFERS) || + !in_range(t.fl_size[1], MIN_FL_ENTRIES, + MAX_RX_JUMBO_BUFFERS) || + !in_range(t.rspq_size, MIN_RSPQ_ENTRIES, + MAX_RSPQ_ENTRIES)) + return -EINVAL; + + if ((adapter->flags & FULL_INIT_DONE) && + (t.rspq_size >= 0 || t.fl_size[0] >= 0 || + t.fl_size[1] >= 0 || t.txq_size[0] >= 0 || + t.txq_size[1] >= 0 || t.txq_size[2] >= 0 || + t.polling >= 0 || t.cong_thres >= 0)) + return -EBUSY; + + /* Allow setting of any available qset when offload enabled */ + if (test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) { + q1 = 0; + for_each_port(adapter, i) { + pi = adap2pinfo(adapter, i); + nqsets += pi->first_qset + pi->nqsets; + } + } + + if (t.qset_idx < q1) + return -EINVAL; + if (t.qset_idx > q1 + nqsets - 1) + return -EINVAL; + + q = &adapter->params.sge.qset[t.qset_idx]; + + if (t.rspq_size >= 0) + q->rspq_size = t.rspq_size; + if (t.fl_size[0] >= 0) + q->fl_size = t.fl_size[0]; + if (t.fl_size[1] >= 0) + q->jumbo_size = t.fl_size[1]; + if (t.txq_size[0] >= 0) + q->txq_size[0] = t.txq_size[0]; + if (t.txq_size[1] >= 0) + q->txq_size[1] = t.txq_size[1]; + if (t.txq_size[2] >= 0) + q->txq_size[2] = t.txq_size[2]; + if (t.cong_thres >= 0) + q->cong_thres = t.cong_thres; + if (t.intr_lat >= 0) { + struct sge_qset *qs = + &adapter->sge.qs[t.qset_idx]; + + q->coalesce_usecs = t.intr_lat; + t3_update_qset_coalesce(qs, q); + } + if (t.polling >= 0) { + if (adapter->flags & USING_MSIX) + q->polling = t.polling; + else { + /* No polling with INTx for T3A */ + if (adapter->params.rev == 0 && + !(adapter->flags & USING_MSI)) + t.polling = 0; + + for (i = 0; i < SGE_QSETS; i++) { + q = &adapter->params.sge. 
+ qset[i]; + q->polling = t.polling; + } + } + } + + if (t.lro >= 0) { + if (t.lro) + dev->wanted_features |= NETIF_F_GRO; + else + dev->wanted_features &= ~NETIF_F_GRO; + netdev_update_features(dev); + } + + break; + } + case CHELSIO_GET_QSET_PARAMS:{ + struct qset_params *q; + struct ch_qset_params t; + int q1 = pi->first_qset; + int nqsets = pi->nqsets; + int i; + + if (copy_from_user(&t, useraddr, sizeof(t))) + return -EFAULT; + + /* Display qsets for all ports when offload enabled */ + if (test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) { + q1 = 0; + for_each_port(adapter, i) { + pi = adap2pinfo(adapter, i); + nqsets = pi->first_qset + pi->nqsets; + } + } + + if (t.qset_idx >= nqsets) + return -EINVAL; + + q = &adapter->params.sge.qset[q1 + t.qset_idx]; + t.rspq_size = q->rspq_size; + t.txq_size[0] = q->txq_size[0]; + t.txq_size[1] = q->txq_size[1]; + t.txq_size[2] = q->txq_size[2]; + t.fl_size[0] = q->fl_size; + t.fl_size[1] = q->jumbo_size; + t.polling = q->polling; + t.lro = !!(dev->features & NETIF_F_GRO); + t.intr_lat = q->coalesce_usecs; + t.cong_thres = q->cong_thres; + t.qnum = q1; + + if (adapter->flags & USING_MSIX) + t.vector = adapter->msix_info[q1 + t.qset_idx + 1].vec; + else + t.vector = adapter->pdev->irq; + + if (copy_to_user(useraddr, &t, sizeof(t))) + return -EFAULT; + break; + } + case CHELSIO_SET_QSET_NUM:{ + struct ch_reg edata; + unsigned int i, first_qset = 0, other_qsets = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (adapter->flags & FULL_INIT_DONE) + return -EBUSY; + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + if (edata.val < 1 || + (edata.val > 1 && !(adapter->flags & USING_MSIX))) + return -EINVAL; + + for_each_port(adapter, i) + if (adapter->port[i] && adapter->port[i] != dev) + other_qsets += adap2pinfo(adapter, i)->nqsets; + + if (edata.val + other_qsets > SGE_QSETS) + return -EINVAL; + + pi->nqsets = edata.val; + + for_each_port(adapter, i) + if (adapter->port[i]) { + pi = adap2pinfo(adapter, i); + pi->first_qset = first_qset; + first_qset += pi->nqsets; + } + break; + } + case CHELSIO_GET_QSET_NUM:{ + struct ch_reg edata; + + memset(&edata, 0, sizeof(struct ch_reg)); + + edata.cmd = CHELSIO_GET_QSET_NUM; + edata.val = pi->nqsets; + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + break; + } + case CHELSIO_LOAD_FW:{ + u8 *fw_data; + struct ch_mem_range t; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (copy_from_user(&t, useraddr, sizeof(t))) + return -EFAULT; + /* Check t.len sanity ? 
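+ * (t.len is currently used unchecked as the memdup_user() size, so
+ * the only practical limit is what the allocator will grant; bounding
+ * it against the largest supported firmware image first would be a
+ * tighter check.)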
*/ + fw_data = memdup_user(useraddr + sizeof(t), t.len); + if (IS_ERR(fw_data)) + return PTR_ERR(fw_data); + + ret = t3_load_fw(adapter, fw_data, t.len); + kfree(fw_data); + if (ret) + return ret; + break; + } + case CHELSIO_SETMTUTAB:{ + struct ch_mtus m; + int i; + + if (!is_offload(adapter)) + return -EOPNOTSUPP; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (offload_running(adapter)) + return -EBUSY; + if (copy_from_user(&m, useraddr, sizeof(m))) + return -EFAULT; + if (m.nmtus != NMTUS) + return -EINVAL; + if (m.mtus[0] < 81) /* accommodate SACK */ + return -EINVAL; + + /* MTUs must be in ascending order */ + for (i = 1; i < NMTUS; ++i) + if (m.mtus[i] < m.mtus[i - 1]) + return -EINVAL; + + memcpy(adapter->params.mtus, m.mtus, + sizeof(adapter->params.mtus)); + break; + } + case CHELSIO_GET_PM:{ + struct tp_params *p = &adapter->params.tp; + struct ch_pm m = {.cmd = CHELSIO_GET_PM }; + + if (!is_offload(adapter)) + return -EOPNOTSUPP; + m.tx_pg_sz = p->tx_pg_size; + m.tx_num_pg = p->tx_num_pgs; + m.rx_pg_sz = p->rx_pg_size; + m.rx_num_pg = p->rx_num_pgs; + m.pm_total = p->pmtx_size + p->chan_rx_size * p->nchan; + if (copy_to_user(useraddr, &m, sizeof(m))) + return -EFAULT; + break; + } + case CHELSIO_SET_PM:{ + struct ch_pm m; + struct tp_params *p = &adapter->params.tp; + + if (!is_offload(adapter)) + return -EOPNOTSUPP; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (adapter->flags & FULL_INIT_DONE) + return -EBUSY; + if (copy_from_user(&m, useraddr, sizeof(m))) + return -EFAULT; + if (!is_power_of_2(m.rx_pg_sz) || + !is_power_of_2(m.tx_pg_sz)) + return -EINVAL; /* not power of 2 */ + if (!(m.rx_pg_sz & 0x14000)) + return -EINVAL; /* not 16KB or 64KB */ + if (!(m.tx_pg_sz & 0x1554000)) + return -EINVAL; + if (m.tx_num_pg == -1) + m.tx_num_pg = p->tx_num_pgs; + if (m.rx_num_pg == -1) + m.rx_num_pg = p->rx_num_pgs; + if (m.tx_num_pg % 24 || m.rx_num_pg % 24) + return -EINVAL; + if (m.rx_num_pg * m.rx_pg_sz > p->chan_rx_size || + m.tx_num_pg * m.tx_pg_sz > p->chan_tx_size) + return -EINVAL; + p->rx_pg_size = m.rx_pg_sz; + p->tx_pg_size = m.tx_pg_sz; + p->rx_num_pgs = m.rx_num_pg; + p->tx_num_pgs = m.tx_num_pg; + break; + } + case CHELSIO_GET_MEM:{ + struct ch_mem_range t; + struct mc7 *mem; + u64 buf[32]; + + if (!is_offload(adapter)) + return -EOPNOTSUPP; + if (!(adapter->flags & FULL_INIT_DONE)) + return -EIO; /* need the memory controllers */ + if (copy_from_user(&t, useraddr, sizeof(t))) + return -EFAULT; + if ((t.addr & 7) || (t.len & 7)) + return -EINVAL; + if (t.mem_id == MEM_CM) + mem = &adapter->cm; + else if (t.mem_id == MEM_PMRX) + mem = &adapter->pmrx; + else if (t.mem_id == MEM_PMTX) + mem = &adapter->pmtx; + else + return -EINVAL; + + /* + * Version scheme: + * bits 0..9: chip version + * bits 10..15: chip revision + */ + t.version = 3 | (adapter->params.rev << 10); + if (copy_to_user(useraddr, &t, sizeof(t))) + return -EFAULT; + + /* + * Read 256 bytes at a time as len can be large and we don't + * want to use huge intermediate buffers. 
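+ * The bounce buffer above is 32 u64s, i.e. 256 bytes, and
+ * t3_mc7_bd_read() transfers 64-bit words, which is why the address
+ * and length are scaled down by 8 below.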
+ */ + useraddr += sizeof(t); /* advance to start of buffer */ + while (t.len) { + unsigned int chunk = + min_t(unsigned int, t.len, sizeof(buf)); + + ret = + t3_mc7_bd_read(mem, t.addr / 8, chunk / 8, + buf); + if (ret) + return ret; + if (copy_to_user(useraddr, buf, chunk)) + return -EFAULT; + useraddr += chunk; + t.addr += chunk; + t.len -= chunk; + } + break; + } + case CHELSIO_SET_TRACE_FILTER:{ + struct ch_trace t; + const struct trace_params *tp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!offload_running(adapter)) + return -EAGAIN; + if (copy_from_user(&t, useraddr, sizeof(t))) + return -EFAULT; + + tp = (const struct trace_params *)&t.sip; + if (t.config_tx) + t3_config_trace_filter(adapter, tp, 0, + t.invert_match, + t.trace_tx); + if (t.config_rx) + t3_config_trace_filter(adapter, tp, 1, + t.invert_match, + t.trace_rx); + break; + } + default: + return -EOPNOTSUPP; + } + return 0; +} + +static int cxgb_ioctl(struct net_device *dev, struct ifreq *req, int cmd) +{ + struct mii_ioctl_data *data = if_mii(req); + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + switch (cmd) { + case SIOCGMIIREG: + case SIOCSMIIREG: + /* Convert phy_id from older PRTAD/DEVAD format */ + if (is_10G(adapter) && + !mdio_phy_id_is_c45(data->phy_id) && + (data->phy_id & 0x1f00) && + !(data->phy_id & 0xe0e0)) + data->phy_id = mdio_phy_id_c45(data->phy_id >> 8, + data->phy_id & 0x1f); + /* FALLTHRU */ + case SIOCGMIIPHY: + return mdio_mii_ioctl(&pi->phy.mdio, data, cmd); + case SIOCCHIOCTL: + return cxgb_extension_ioctl(dev, req->ifr_data); + default: + return -EOPNOTSUPP; + } +} + +static int cxgb_change_mtu(struct net_device *dev, int new_mtu) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + + if ((ret = t3_mac_set_mtu(&pi->mac, new_mtu))) + return ret; + dev->mtu = new_mtu; + init_port_mtus(adapter); + if (adapter->params.rev == 0 && offload_running(adapter)) + t3_load_mtus(adapter, adapter->params.mtus, + adapter->params.a_wnd, adapter->params.b_wnd, + adapter->port[0]->mtu); + return 0; +} + +static int cxgb_set_mac_addr(struct net_device *dev, void *p) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct sockaddr *addr = p; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + t3_mac_set_address(&pi->mac, LAN_MAC_IDX, dev->dev_addr); + if (offload_running(adapter)) + write_smt_entry(adapter, pi->port_id); + return 0; +} + +static netdev_features_t cxgb_fix_features(struct net_device *dev, + netdev_features_t features) +{ + /* + * Since there is no support for separate rx/tx vlan accel + * enable/disable make sure tx flag is always in same state as rx. 
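+ * (ndo_fix_features runs before ndo_set_features, so the request is
+ * adjusted here and cxgb_set_features() below only needs to act on
+ * the RX bit.)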
+ */ + if (features & NETIF_F_HW_VLAN_CTAG_RX) + features |= NETIF_F_HW_VLAN_CTAG_TX; + else + features &= ~NETIF_F_HW_VLAN_CTAG_TX; + + return features; +} + +static int cxgb_set_features(struct net_device *dev, netdev_features_t features) +{ + netdev_features_t changed = dev->features ^ features; + + if (changed & NETIF_F_HW_VLAN_CTAG_RX) + cxgb_vlan_mode(dev, features); + + return 0; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void cxgb_netpoll(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int qidx; + + for (qidx = pi->first_qset; qidx < pi->first_qset + pi->nqsets; qidx++) { + struct sge_qset *qs = &adapter->sge.qs[qidx]; + void *source; + + if (adapter->flags & USING_MSIX) + source = qs; + else + source = adapter; + + t3_intr_handler(adapter, qs->rspq.polling) (0, source); + } +} +#endif + +/* + * Periodic accumulation of MAC statistics. + */ +static void mac_stats_update(struct adapter *adapter) +{ + int i; + + for_each_port(adapter, i) { + struct net_device *dev = adapter->port[i]; + struct port_info *p = netdev_priv(dev); + + if (netif_running(dev)) { + spin_lock(&adapter->stats_lock); + t3_mac_update_stats(&p->mac); + spin_unlock(&adapter->stats_lock); + } + } +} + +static void check_link_status(struct adapter *adapter) +{ + int i; + + for_each_port(adapter, i) { + struct net_device *dev = adapter->port[i]; + struct port_info *p = netdev_priv(dev); + int link_fault; + + spin_lock_irq(&adapter->work_lock); + link_fault = p->link_fault; + spin_unlock_irq(&adapter->work_lock); + + if (link_fault) { + t3_link_fault(adapter, i); + continue; + } + + if (!(p->phy.caps & SUPPORTED_IRQ) && netif_running(dev)) { + t3_xgm_intr_disable(adapter, i); + t3_read_reg(adapter, A_XGM_INT_STATUS + p->mac.offset); + + t3_link_changed(adapter, i); + t3_xgm_intr_enable(adapter, i); + } + } +} + +static void check_t3b2_mac(struct adapter *adapter) +{ + int i; + + if (!rtnl_trylock()) /* synchronize with ifdown */ + return; + + for_each_port(adapter, i) { + struct net_device *dev = adapter->port[i]; + struct port_info *p = netdev_priv(dev); + int status; + + if (!netif_running(dev)) + continue; + + status = 0; + if (netif_running(dev) && netif_carrier_ok(dev)) + status = t3b2_mac_watchdog_task(&p->mac); + if (status == 1) + p->mac.stats.num_toggled++; + else if (status == 2) { + struct cmac *mac = &p->mac; + + t3_mac_set_mtu(mac, dev->mtu); + t3_mac_set_address(mac, LAN_MAC_IDX, dev->dev_addr); + cxgb_set_rxmode(dev); + t3_link_start(&p->phy, mac, &p->link_config); + t3_mac_enable(mac, MAC_DIRECTION_RX | MAC_DIRECTION_TX); + t3_port_intr_enable(adapter, p->port_id); + p->mac.stats.num_resets++; + } + } + rtnl_unlock(); +} + + +static void t3_adap_check_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + adap_check_task.work); + const struct adapter_params *p = &adapter->params; + int port; + unsigned int v, status, reset; + + adapter->check_task_cnt++; + + check_link_status(adapter); + + /* Accumulate MAC stats if needed */ + if (!p->linkpoll_period || + (adapter->check_task_cnt * p->linkpoll_period) / 10 >= + p->stats_update_period) { + mac_stats_update(adapter); + adapter->check_task_cnt = 0; + } + + if (p->rev == T3_REV_B2) + check_t3b2_mac(adapter); + + /* + * Scan the XGMAC's to check for various conditions which we want to + * monitor in a periodic polling manner rather than via an interrupt + * condition. 
This is used for conditions which would otherwise flood + * the system with interrupts and we only really need to know that the + * conditions are "happening" ... For each condition we count the + * detection of the condition and reset it for the next polling loop. + */ + for_each_port(adapter, port) { + struct cmac *mac = &adap2pinfo(adapter, port)->mac; + u32 cause; + + cause = t3_read_reg(adapter, A_XGM_INT_CAUSE + mac->offset); + reset = 0; + if (cause & F_RXFIFO_OVERFLOW) { + mac->stats.rx_fifo_ovfl++; + reset |= F_RXFIFO_OVERFLOW; + } + + t3_write_reg(adapter, A_XGM_INT_CAUSE + mac->offset, reset); + } + + /* + * We do the same as above for FL_EMPTY interrupts. + */ + status = t3_read_reg(adapter, A_SG_INT_CAUSE); + reset = 0; + + if (status & F_FLEMPTY) { + struct sge_qset *qs = &adapter->sge.qs[0]; + int i = 0; + + reset |= F_FLEMPTY; + + v = (t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS) >> S_FL0EMPTY) & + 0xffff; + + while (v) { + qs->fl[i].empty += (v & 1); + if (i) + qs++; + i ^= 1; + v >>= 1; + } + } + + t3_write_reg(adapter, A_SG_INT_CAUSE, reset); + + /* Schedule the next check update if any port is active. */ + spin_lock_irq(&adapter->work_lock); + if (adapter->open_device_map & PORT_MASK) + schedule_chk_task(adapter); + spin_unlock_irq(&adapter->work_lock); +} + +static void db_full_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + db_full_task); + + cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_FULL, 0); +} + +static void db_empty_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + db_empty_task); + + cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_EMPTY, 0); +} + +static void db_drop_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + db_drop_task); + unsigned long delay = 1000; + unsigned short r; + + cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_DROP, 0); + + /* + * Sleep a while before ringing the driver qset dbs. + * The delay is between 1000-2023 usecs. + */ + get_random_bytes(&r, 2); + delay += r & 1023; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(delay)); + ring_dbs(adapter); +} + +/* + * Processes external (PHY) interrupts in process context. + */ +static void ext_intr_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + ext_intr_handler_task); + int i; + + /* Disable link fault interrupts */ + for_each_port(adapter, i) { + struct net_device *dev = adapter->port[i]; + struct port_info *p = netdev_priv(dev); + + t3_xgm_intr_disable(adapter, i); + t3_read_reg(adapter, A_XGM_INT_STATUS + p->mac.offset); + } + + /* Re-enable link fault interrupts */ + t3_phy_intr_handler(adapter); + + for_each_port(adapter, i) + t3_xgm_intr_enable(adapter, i); + + /* Now reenable external interrupts */ + spin_lock_irq(&adapter->work_lock); + if (adapter->slow_intr_mask) { + adapter->slow_intr_mask |= F_T3DBG; + t3_write_reg(adapter, A_PL_INT_CAUSE0, F_T3DBG); + t3_write_reg(adapter, A_PL_INT_ENABLE0, + adapter->slow_intr_mask); + } + spin_unlock_irq(&adapter->work_lock); +} + +/* + * Interrupt-context handler for external (PHY) interrupts. + */ +void t3_os_ext_intr_handler(struct adapter *adapter) +{ + /* + * Schedule a task to handle external interrupts as they may be slow + * and we use a mutex to protect MDIO registers. We disable PHY + * interrupts in the meantime and let the task reenable them when + * it's done. 
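+ *
+ * The handshake with ext_intr_task() is: drop F_T3DBG from
+ * slow_intr_mask and rewrite A_PL_INT_ENABLE0 so no further PHY
+ * interrupts are raised, then queue the task; once the task has
+ * serviced the PHYs it re-adds F_T3DBG, acknowledges the cause bit
+ * and restores the enable mask, again under work_lock.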
+ */ + spin_lock(&adapter->work_lock); + if (adapter->slow_intr_mask) { + adapter->slow_intr_mask &= ~F_T3DBG; + t3_write_reg(adapter, A_PL_INT_ENABLE0, + adapter->slow_intr_mask); + queue_work(cxgb3_wq, &adapter->ext_intr_handler_task); + } + spin_unlock(&adapter->work_lock); +} + +void t3_os_link_fault_handler(struct adapter *adapter, int port_id) +{ + struct net_device *netdev = adapter->port[port_id]; + struct port_info *pi = netdev_priv(netdev); + + spin_lock(&adapter->work_lock); + pi->link_fault = 1; + spin_unlock(&adapter->work_lock); +} + +static int t3_adapter_error(struct adapter *adapter, int reset, int on_wq) +{ + int i, ret = 0; + + if (is_offload(adapter) && + test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) { + cxgb3_event_notify(&adapter->tdev, OFFLOAD_STATUS_DOWN, 0); + offload_close(&adapter->tdev); + } + + /* Stop all ports */ + for_each_port(adapter, i) { + struct net_device *netdev = adapter->port[i]; + + if (netif_running(netdev)) + __cxgb_close(netdev, on_wq); + } + + /* Stop SGE timers */ + t3_stop_sge_timers(adapter); + + adapter->flags &= ~FULL_INIT_DONE; + + if (reset) + ret = t3_reset_adapter(adapter); + + pci_disable_device(adapter->pdev); + + return ret; +} + +static int t3_reenable_adapter(struct adapter *adapter) +{ + if (pci_enable_device(adapter->pdev)) { + dev_err(&adapter->pdev->dev, + "Cannot re-enable PCI device after reset.\n"); + goto err; + } + pci_set_master(adapter->pdev); + pci_restore_state(adapter->pdev); + pci_save_state(adapter->pdev); + + /* Free sge resources */ + t3_free_sge_resources(adapter); + + if (t3_replay_prep_adapter(adapter)) + goto err; + + return 0; +err: + return -1; +} + +static void t3_resume_ports(struct adapter *adapter) +{ + int i; + + /* Restart the ports */ + for_each_port(adapter, i) { + struct net_device *netdev = adapter->port[i]; + + if (netif_running(netdev)) { + if (cxgb_open(netdev)) { + dev_err(&adapter->pdev->dev, + "can't bring device back up" + " after reset\n"); + continue; + } + } + } + + if (is_offload(adapter) && !ofld_disable) + cxgb3_event_notify(&adapter->tdev, OFFLOAD_STATUS_UP, 0); +} + +/* + * processes a fatal error. + * Bring the ports down, reset the chip, bring the ports back up. + */ +static void fatal_error_task(struct work_struct *work) +{ + struct adapter *adapter = container_of(work, struct adapter, + fatal_error_handler_task); + int err = 0; + + rtnl_lock(); + err = t3_adapter_error(adapter, 1, 1); + if (!err) + err = t3_reenable_adapter(adapter); + if (!err) + t3_resume_ports(adapter); + + CH_ALERT(adapter, "adapter reset %s\n", err ? 
"failed" : "succeeded"); + rtnl_unlock(); +} + +void t3_fatal_err(struct adapter *adapter) +{ + unsigned int fw_status[4]; + + if (adapter->flags & FULL_INIT_DONE) { + t3_sge_stop(adapter); + t3_write_reg(adapter, A_XGM_TX_CTRL, 0); + t3_write_reg(adapter, A_XGM_RX_CTRL, 0); + t3_write_reg(adapter, XGM_REG(A_XGM_TX_CTRL, 1), 0); + t3_write_reg(adapter, XGM_REG(A_XGM_RX_CTRL, 1), 0); + + spin_lock(&adapter->work_lock); + t3_intr_disable(adapter); + queue_work(cxgb3_wq, &adapter->fatal_error_handler_task); + spin_unlock(&adapter->work_lock); + } + CH_ALERT(adapter, "encountered fatal error, operation suspended\n"); + if (!t3_cim_ctl_blk_read(adapter, 0xa0, 4, fw_status)) + CH_ALERT(adapter, "FW status: 0x%x, 0x%x, 0x%x, 0x%x\n", + fw_status[0], fw_status[1], + fw_status[2], fw_status[3]); +} + +/** + * t3_io_error_detected - called when PCI error is detected + * @pdev: Pointer to PCI device + * @state: The current pci connection state + * + * This function is called after a PCI bus error affecting + * this device has been detected. + */ +static pci_ers_result_t t3_io_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + + if (state == pci_channel_io_perm_failure) + return PCI_ERS_RESULT_DISCONNECT; + + t3_adapter_error(adapter, 0, 0); + + /* Request a slot reset. */ + return PCI_ERS_RESULT_NEED_RESET; +} + +/** + * t3_io_slot_reset - called after the pci bus has been reset. + * @pdev: Pointer to PCI device + * + * Restart the card from scratch, as if from a cold-boot. + */ +static pci_ers_result_t t3_io_slot_reset(struct pci_dev *pdev) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + + if (!t3_reenable_adapter(adapter)) + return PCI_ERS_RESULT_RECOVERED; + + return PCI_ERS_RESULT_DISCONNECT; +} + +/** + * t3_io_resume - called when traffic can start flowing again. + * @pdev: Pointer to PCI device + * + * This callback is called when the error recovery driver tells us that + * its OK to resume normal operation. + */ +static void t3_io_resume(struct pci_dev *pdev) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + + CH_ALERT(adapter, "adapter recovering, PEX ERR 0x%x\n", + t3_read_reg(adapter, A_PCIE_PEX_ERR)); + + rtnl_lock(); + t3_resume_ports(adapter); + rtnl_unlock(); +} + +static const struct pci_error_handlers t3_err_handler = { + .error_detected = t3_io_error_detected, + .slot_reset = t3_io_slot_reset, + .resume = t3_io_resume, +}; + +/* + * Set the number of qsets based on the number of CPUs and the number of ports, + * not to exceed the number of available qsets, assuming there are enough qsets + * per port in HW. 
+ */ +static void set_nqsets(struct adapter *adap) +{ + int i, j = 0; + int num_cpus = netif_get_num_default_rss_queues(); + int hwports = adap->params.nports; + int nqsets = adap->msix_nvectors - 1; + + if (adap->params.rev > 0 && adap->flags & USING_MSIX) { + if (hwports == 2 && + (hwports * nqsets > SGE_QSETS || + num_cpus >= nqsets / hwports)) + nqsets /= hwports; + if (nqsets > num_cpus) + nqsets = num_cpus; + if (nqsets < 1 || hwports == 4) + nqsets = 1; + } else + nqsets = 1; + + for_each_port(adap, i) { + struct port_info *pi = adap2pinfo(adap, i); + + pi->first_qset = j; + pi->nqsets = nqsets; + j = pi->first_qset + nqsets; + + dev_info(&adap->pdev->dev, + "Port %d using %d queue sets.\n", i, nqsets); + } +} + +static int cxgb_enable_msix(struct adapter *adap) +{ + struct msix_entry entries[SGE_QSETS + 1]; + int vectors; + int i; + + vectors = ARRAY_SIZE(entries); + for (i = 0; i < vectors; ++i) + entries[i].entry = i; + + vectors = pci_enable_msix_range(adap->pdev, entries, + adap->params.nports + 1, vectors); + if (vectors < 0) + return vectors; + + for (i = 0; i < vectors; ++i) + adap->msix_info[i].vec = entries[i].vector; + adap->msix_nvectors = vectors; + + return 0; +} + +static void print_port_info(struct adapter *adap, const struct adapter_info *ai) +{ + static const char *pci_variant[] = { + "PCI", "PCI-X", "PCI-X ECC", "PCI-X 266", "PCI Express" + }; + + int i; + char buf[80]; + + if (is_pcie(adap)) + snprintf(buf, sizeof(buf), "%s x%d", + pci_variant[adap->params.pci.variant], + adap->params.pci.width); + else + snprintf(buf, sizeof(buf), "%s %dMHz/%d-bit", + pci_variant[adap->params.pci.variant], + adap->params.pci.speed, adap->params.pci.width); + + for_each_port(adap, i) { + struct net_device *dev = adap->port[i]; + const struct port_info *pi = netdev_priv(dev); + + if (!test_bit(i, &adap->registered_device_map)) + continue; + netdev_info(dev, "%s %s %sNIC (rev %d) %s%s\n", + ai->desc, pi->phy.desc, + is_offload(adap) ? "R" : "", adap->params.rev, buf, + (adap->flags & USING_MSIX) ? " MSI-X" : + (adap->flags & USING_MSI) ? 
" MSI" : ""); + if (adap->name == dev->name && adap->params.vpd.mclk) + pr_info("%s: %uMB CM, %uMB PMTX, %uMB PMRX, S/N: %s\n", + adap->name, t3_mc7_size(&adap->cm) >> 20, + t3_mc7_size(&adap->pmtx) >> 20, + t3_mc7_size(&adap->pmrx) >> 20, + adap->params.vpd.sn); + } +} + +static const struct net_device_ops cxgb_netdev_ops = { + .ndo_open = cxgb_open, + .ndo_stop = cxgb_close, + .ndo_start_xmit = t3_eth_xmit, + .ndo_get_stats = cxgb_get_stats, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_rx_mode = cxgb_set_rxmode, + .ndo_do_ioctl = cxgb_ioctl, + .ndo_change_mtu = cxgb_change_mtu, + .ndo_set_mac_address = cxgb_set_mac_addr, + .ndo_fix_features = cxgb_fix_features, + .ndo_set_features = cxgb_set_features, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = cxgb_netpoll, +#endif +}; + +static void cxgb3_init_iscsi_mac(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + + memcpy(pi->iscsic.mac_addr, dev->dev_addr, ETH_ALEN); + pi->iscsic.mac_addr[3] |= 0x80; +} + +#define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) +#define VLAN_FEAT (NETIF_F_SG | NETIF_F_IP_CSUM | TSO_FLAGS | \ + NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA) +static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int i, err, pci_using_dac = 0; + resource_size_t mmio_start, mmio_len; + const struct adapter_info *ai; + struct adapter *adapter = NULL; + struct port_info *pi; + + pr_info_once("%s - version %s\n", DRV_DESC, DRV_VERSION); + + if (!cxgb3_wq) { + cxgb3_wq = create_singlethread_workqueue(DRV_NAME); + if (!cxgb3_wq) { + pr_err("cannot initialize work queue\n"); + return -ENOMEM; + } + } + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "cannot enable PCI device\n"); + goto out; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + /* Just info, some other driver may have claimed the device. 
*/ + dev_info(&pdev->dev, "cannot obtain PCI resources\n"); + goto out_disable_device; + } + + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + pci_using_dac = 1; + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_err(&pdev->dev, "unable to obtain 64-bit DMA for " + "coherent allocations\n"); + goto out_release_regions; + } + } else if ((err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) != 0) { + dev_err(&pdev->dev, "no usable DMA configuration\n"); + goto out_release_regions; + } + + pci_set_master(pdev); + pci_save_state(pdev); + + mmio_start = pci_resource_start(pdev, 0); + mmio_len = pci_resource_len(pdev, 0); + ai = t3_get_adapter_info(ent->driver_data); + + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); + if (!adapter) { + err = -ENOMEM; + goto out_release_regions; + } + + adapter->nofail_skb = + alloc_skb(sizeof(struct cpl_set_tcb_field), GFP_KERNEL); + if (!adapter->nofail_skb) { + dev_err(&pdev->dev, "cannot allocate nofail buffer\n"); + err = -ENOMEM; + goto out_free_adapter; + } + + adapter->regs = ioremap_nocache(mmio_start, mmio_len); + if (!adapter->regs) { + dev_err(&pdev->dev, "cannot map device registers\n"); + err = -ENOMEM; + goto out_free_adapter; + } + + adapter->pdev = pdev; + adapter->name = pci_name(pdev); + adapter->msg_enable = dflt_msg_enable; + adapter->mmio_len = mmio_len; + + mutex_init(&adapter->mdio_lock); + spin_lock_init(&adapter->work_lock); + spin_lock_init(&adapter->stats_lock); + + INIT_LIST_HEAD(&adapter->adapter_list); + INIT_WORK(&adapter->ext_intr_handler_task, ext_intr_task); + INIT_WORK(&adapter->fatal_error_handler_task, fatal_error_task); + + INIT_WORK(&adapter->db_full_task, db_full_task); + INIT_WORK(&adapter->db_empty_task, db_empty_task); + INIT_WORK(&adapter->db_drop_task, db_drop_task); + + INIT_DELAYED_WORK(&adapter->adap_check_task, t3_adap_check_task); + + for (i = 0; i < ai->nports0 + ai->nports1; ++i) { + struct net_device *netdev; + + netdev = alloc_etherdev_mq(sizeof(struct port_info), SGE_QSETS); + if (!netdev) { + err = -ENOMEM; + goto out_free_dev; + } + + SET_NETDEV_DEV(netdev, &pdev->dev); + + adapter->port[i] = netdev; + pi = netdev_priv(netdev); + pi->adapter = adapter; + pi->port_id = i; + netif_carrier_off(netdev); + netdev->irq = pdev->irq; + netdev->mem_start = mmio_start; + netdev->mem_end = mmio_start + mmio_len - 1; + netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | + NETIF_F_TSO | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX; + netdev->features |= netdev->hw_features | + NETIF_F_HW_VLAN_CTAG_TX; + netdev->vlan_features |= netdev->features & VLAN_FEAT; + if (pci_using_dac) + netdev->features |= NETIF_F_HIGHDMA; + + netdev->netdev_ops = &cxgb_netdev_ops; + netdev->ethtool_ops = &cxgb_ethtool_ops; + netdev->min_mtu = 81; + netdev->max_mtu = ETH_MAX_MTU; + netdev->dev_port = pi->port_id; + } + + pci_set_drvdata(pdev, adapter); + if (t3_prep_adapter(adapter, ai, 1) < 0) { + err = -ENODEV; + goto out_free_dev; + } + + /* + * The card is now ready to go. If any errors occur during device + * registration we do not fail the whole card but rather proceed only + * with the ports we manage to register successfully. However we must + * register at least one net device. + */ + for_each_port(adapter, i) { + err = register_netdev(adapter->port[i]); + if (err) + dev_warn(&pdev->dev, + "cannot register net device %s, skipping\n", + adapter->port[i]->name); + else { + /* + * Change the name we use for messages to the name of + * the first successfully registered interface. 
+ */ + if (!adapter->registered_device_map) + adapter->name = adapter->port[i]->name; + + __set_bit(i, &adapter->registered_device_map); + } + } + if (!adapter->registered_device_map) { + dev_err(&pdev->dev, "could not register any net devices\n"); + goto out_free_dev; + } + + for_each_port(adapter, i) + cxgb3_init_iscsi_mac(adapter->port[i]); + + /* Driver's ready. Reflect it on LEDs */ + t3_led_ready(adapter); + + if (is_offload(adapter)) { + __set_bit(OFFLOAD_DEVMAP_BIT, &adapter->registered_device_map); + cxgb3_adapter_ofld(adapter); + } + + /* See what interrupts we'll be using */ + if (msi > 1 && cxgb_enable_msix(adapter) == 0) + adapter->flags |= USING_MSIX; + else if (msi > 0 && pci_enable_msi(pdev) == 0) + adapter->flags |= USING_MSI; + + set_nqsets(adapter); + + err = sysfs_create_group(&adapter->port[0]->dev.kobj, + &cxgb3_attr_group); + + print_port_info(adapter, ai); + return 0; + +out_free_dev: + iounmap(adapter->regs); + for (i = ai->nports0 + ai->nports1 - 1; i >= 0; --i) + if (adapter->port[i]) + free_netdev(adapter->port[i]); + +out_free_adapter: + kfree(adapter); + +out_release_regions: + pci_release_regions(pdev); +out_disable_device: + pci_disable_device(pdev); +out: + return err; +} + +static void remove_one(struct pci_dev *pdev) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + + if (adapter) { + int i; + + t3_sge_stop(adapter); + sysfs_remove_group(&adapter->port[0]->dev.kobj, + &cxgb3_attr_group); + + if (is_offload(adapter)) { + cxgb3_adapter_unofld(adapter); + if (test_bit(OFFLOAD_DEVMAP_BIT, + &adapter->open_device_map)) + offload_close(&adapter->tdev); + } + + for_each_port(adapter, i) + if (test_bit(i, &adapter->registered_device_map)) + unregister_netdev(adapter->port[i]); + + t3_stop_sge_timers(adapter); + t3_free_sge_resources(adapter); + cxgb_disable_msi(adapter); + + for_each_port(adapter, i) + if (adapter->port[i]) + free_netdev(adapter->port[i]); + + iounmap(adapter->regs); + if (adapter->nofail_skb) + kfree_skb(adapter->nofail_skb); + kfree(adapter); + pci_release_regions(pdev); + pci_disable_device(pdev); + } +} + +static struct pci_driver driver = { + .name = DRV_NAME, + .id_table = cxgb3_pci_tbl, + .probe = init_one, + .remove = remove_one, + .err_handler = &t3_err_handler, +}; + +static int __init cxgb3_init_module(void) +{ + int ret; + + cxgb3_offload_init(); + + ret = pci_register_driver(&driver); + return ret; +} + +static void __exit cxgb3_cleanup_module(void) +{ + pci_unregister_driver(&driver); + if (cxgb3_wq) + destroy_workqueue(cxgb3_wq); +} + +module_init(cxgb3_init_module); +module_exit(cxgb3_cleanup_module); diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c new file mode 100644 index 0000000..50cd660 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c @@ -0,0 +1,1403 @@ +/* + * Copyright (c) 2006-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "regs.h" +#include "cxgb3_ioctl.h" +#include "cxgb3_ctl_defs.h" +#include "cxgb3_defs.h" +#include "l2t.h" +#include "firmware_exports.h" +#include "cxgb3_offload.h" + +static LIST_HEAD(client_list); +static LIST_HEAD(ofld_dev_list); +static DEFINE_MUTEX(cxgb3_db_lock); + +static DEFINE_RWLOCK(adapter_list_lock); +static LIST_HEAD(adapter_list); + +static const unsigned int MAX_ATIDS = 64 * 1024; +static const unsigned int ATID_BASE = 0x10000; + +static void cxgb_neigh_update(struct neighbour *neigh); +static void cxgb_redirect(struct dst_entry *old, struct dst_entry *new, + struct neighbour *neigh, const void *daddr); + +static inline int offload_activated(struct t3cdev *tdev) +{ + const struct adapter *adapter = tdev2adap(tdev); + + return test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map); +} + +/** + * cxgb3_register_client - register an offload client + * @client: the client + * + * Add the client to the client list, + * and call backs the client for each activated offload device + */ +void cxgb3_register_client(struct cxgb3_client *client) +{ + struct t3cdev *tdev; + + mutex_lock(&cxgb3_db_lock); + list_add_tail(&client->client_list, &client_list); + + if (client->add) { + list_for_each_entry(tdev, &ofld_dev_list, ofld_dev_list) { + if (offload_activated(tdev)) + client->add(tdev); + } + } + mutex_unlock(&cxgb3_db_lock); +} + +EXPORT_SYMBOL(cxgb3_register_client); + +/** + * cxgb3_unregister_client - unregister an offload client + * @client: the client + * + * Remove the client to the client list, + * and call backs the client for each activated offload device. 
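+ *
+ * (In code terms: the client is dropped from the global client list and
+ * its ->remove() hook is then invoked once for every currently activated
+ * offload device.)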
+ */ +void cxgb3_unregister_client(struct cxgb3_client *client) +{ + struct t3cdev *tdev; + + mutex_lock(&cxgb3_db_lock); + list_del(&client->client_list); + + if (client->remove) { + list_for_each_entry(tdev, &ofld_dev_list, ofld_dev_list) { + if (offload_activated(tdev)) + client->remove(tdev); + } + } + mutex_unlock(&cxgb3_db_lock); +} + +EXPORT_SYMBOL(cxgb3_unregister_client); + +/** + * cxgb3_add_clients - activate registered clients for an offload device + * @tdev: the offload device + * + * Call backs all registered clients once a offload device is activated + */ +void cxgb3_add_clients(struct t3cdev *tdev) +{ + struct cxgb3_client *client; + + mutex_lock(&cxgb3_db_lock); + list_for_each_entry(client, &client_list, client_list) { + if (client->add) + client->add(tdev); + } + mutex_unlock(&cxgb3_db_lock); +} + +/** + * cxgb3_remove_clients - deactivates registered clients + * for an offload device + * @tdev: the offload device + * + * Call backs all registered clients once a offload device is deactivated + */ +void cxgb3_remove_clients(struct t3cdev *tdev) +{ + struct cxgb3_client *client; + + mutex_lock(&cxgb3_db_lock); + list_for_each_entry(client, &client_list, client_list) { + if (client->remove) + client->remove(tdev); + } + mutex_unlock(&cxgb3_db_lock); +} + +void cxgb3_event_notify(struct t3cdev *tdev, u32 event, u32 port) +{ + struct cxgb3_client *client; + + mutex_lock(&cxgb3_db_lock); + list_for_each_entry(client, &client_list, client_list) { + if (client->event_handler) + client->event_handler(tdev, event, port); + } + mutex_unlock(&cxgb3_db_lock); +} + +static struct net_device *get_iff_from_mac(struct adapter *adapter, + const unsigned char *mac, + unsigned int vlan) +{ + int i; + + for_each_port(adapter, i) { + struct net_device *dev = adapter->port[i]; + + if (ether_addr_equal(dev->dev_addr, mac)) { + rcu_read_lock(); + if (vlan && vlan != VLAN_VID_MASK) { + dev = __vlan_find_dev_deep_rcu(dev, htons(ETH_P_8021Q), vlan); + } else if (netif_is_bond_slave(dev)) { + struct net_device *upper_dev; + + while ((upper_dev = + netdev_master_upper_dev_get_rcu(dev))) + dev = upper_dev; + } + rcu_read_unlock(); + return dev; + } + } + return NULL; +} + +static int cxgb_ulp_iscsi_ctl(struct adapter *adapter, unsigned int req, + void *data) +{ + int i; + int ret = 0; + unsigned int val = 0; + struct ulp_iscsi_info *uiip = data; + + switch (req) { + case ULP_ISCSI_GET_PARAMS: + uiip->pdev = adapter->pdev; + uiip->llimit = t3_read_reg(adapter, A_ULPRX_ISCSI_LLIMIT); + uiip->ulimit = t3_read_reg(adapter, A_ULPRX_ISCSI_ULIMIT); + uiip->tagmask = t3_read_reg(adapter, A_ULPRX_ISCSI_TAGMASK); + + val = t3_read_reg(adapter, A_ULPRX_ISCSI_PSZ); + for (i = 0; i < 4; i++, val >>= 8) + uiip->pgsz_factor[i] = val & 0xFF; + + val = t3_read_reg(adapter, A_TP_PARA_REG7); + uiip->max_txsz = + uiip->max_rxsz = min((val >> S_PMMAXXFERLEN0)&M_PMMAXXFERLEN0, + (val >> S_PMMAXXFERLEN1)&M_PMMAXXFERLEN1); + /* + * On tx, the iscsi pdu has to be <= tx page size and has to + * fit into the Tx PM FIFO. 
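+ * The FIFO limit used below is derived from A_PM1_TX_CFG.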
+ */ + val = min(adapter->params.tp.tx_pg_size, + t3_read_reg(adapter, A_PM1_TX_CFG) >> 17); + uiip->max_txsz = min(val, uiip->max_txsz); + + /* set MaxRxData to 16224 */ + val = t3_read_reg(adapter, A_TP_PARA_REG2); + if ((val >> S_MAXRXDATA) != 0x3f60) { + val &= (M_RXCOALESCESIZE << S_RXCOALESCESIZE); + val |= V_MAXRXDATA(0x3f60); + pr_info("%s, iscsi set MaxRxData to 16224 (0x%x)\n", + adapter->name, val); + t3_write_reg(adapter, A_TP_PARA_REG2, val); + } + + /* + * on rx, the iscsi pdu has to be < rx page size and the + * the max rx data length programmed in TP + */ + val = min(adapter->params.tp.rx_pg_size, + ((t3_read_reg(adapter, A_TP_PARA_REG2)) >> + S_MAXRXDATA) & M_MAXRXDATA); + uiip->max_rxsz = min(val, uiip->max_rxsz); + break; + case ULP_ISCSI_SET_PARAMS: + t3_write_reg(adapter, A_ULPRX_ISCSI_TAGMASK, uiip->tagmask); + /* program the ddp page sizes */ + for (i = 0; i < 4; i++) + val |= (uiip->pgsz_factor[i] & 0xF) << (8 * i); + if (val && (val != t3_read_reg(adapter, A_ULPRX_ISCSI_PSZ))) { + pr_info("%s, setting iscsi pgsz 0x%x, %u,%u,%u,%u\n", + adapter->name, val, uiip->pgsz_factor[0], + uiip->pgsz_factor[1], uiip->pgsz_factor[2], + uiip->pgsz_factor[3]); + t3_write_reg(adapter, A_ULPRX_ISCSI_PSZ, val); + } + break; + default: + ret = -EOPNOTSUPP; + } + return ret; +} + +/* Response queue used for RDMA events. */ +#define ASYNC_NOTIF_RSPQ 0 + +static int cxgb_rdma_ctl(struct adapter *adapter, unsigned int req, void *data) +{ + int ret = 0; + + switch (req) { + case RDMA_GET_PARAMS: { + struct rdma_info *rdma = data; + struct pci_dev *pdev = adapter->pdev; + + rdma->udbell_physbase = pci_resource_start(pdev, 2); + rdma->udbell_len = pci_resource_len(pdev, 2); + rdma->tpt_base = + t3_read_reg(adapter, A_ULPTX_TPT_LLIMIT); + rdma->tpt_top = t3_read_reg(adapter, A_ULPTX_TPT_ULIMIT); + rdma->pbl_base = + t3_read_reg(adapter, A_ULPTX_PBL_LLIMIT); + rdma->pbl_top = t3_read_reg(adapter, A_ULPTX_PBL_ULIMIT); + rdma->rqt_base = t3_read_reg(adapter, A_ULPRX_RQ_LLIMIT); + rdma->rqt_top = t3_read_reg(adapter, A_ULPRX_RQ_ULIMIT); + rdma->kdb_addr = adapter->regs + A_SG_KDOORBELL; + rdma->pdev = pdev; + break; + } + case RDMA_CQ_OP:{ + unsigned long flags; + struct rdma_cq_op *rdma = data; + + /* may be called in any context */ + spin_lock_irqsave(&adapter->sge.reg_lock, flags); + ret = t3_sge_cqcntxt_op(adapter, rdma->id, rdma->op, + rdma->credits); + spin_unlock_irqrestore(&adapter->sge.reg_lock, flags); + break; + } + case RDMA_GET_MEM:{ + struct ch_mem_range *t = data; + struct mc7 *mem; + + if ((t->addr & 7) || (t->len & 7)) + return -EINVAL; + if (t->mem_id == MEM_CM) + mem = &adapter->cm; + else if (t->mem_id == MEM_PMRX) + mem = &adapter->pmrx; + else if (t->mem_id == MEM_PMTX) + mem = &adapter->pmtx; + else + return -EINVAL; + + ret = + t3_mc7_bd_read(mem, t->addr / 8, t->len / 8, + (u64 *) t->buf); + if (ret) + return ret; + break; + } + case RDMA_CQ_SETUP:{ + struct rdma_cq_setup *rdma = data; + + spin_lock_irq(&adapter->sge.reg_lock); + ret = + t3_sge_init_cqcntxt(adapter, rdma->id, + rdma->base_addr, rdma->size, + ASYNC_NOTIF_RSPQ, + rdma->ovfl_mode, rdma->credits, + rdma->credit_thres); + spin_unlock_irq(&adapter->sge.reg_lock); + break; + } + case RDMA_CQ_DISABLE: + spin_lock_irq(&adapter->sge.reg_lock); + ret = t3_sge_disable_cqcntxt(adapter, *(unsigned int *)data); + spin_unlock_irq(&adapter->sge.reg_lock); + break; + case RDMA_CTRL_QP_SETUP:{ + struct rdma_ctrlqp_setup *rdma = data; + + spin_lock_irq(&adapter->sge.reg_lock); + ret = t3_sge_init_ecntxt(adapter, 
FW_RI_SGEEC_START, 0, + SGE_CNTXT_RDMA, + ASYNC_NOTIF_RSPQ, + rdma->base_addr, rdma->size, + FW_RI_TID_START, 1, 0); + spin_unlock_irq(&adapter->sge.reg_lock); + break; + } + case RDMA_GET_MIB: { + spin_lock(&adapter->stats_lock); + t3_tp_get_mib_stats(adapter, (struct tp_mib_stats *)data); + spin_unlock(&adapter->stats_lock); + break; + } + default: + ret = -EOPNOTSUPP; + } + return ret; +} + +static int cxgb_offload_ctl(struct t3cdev *tdev, unsigned int req, void *data) +{ + struct adapter *adapter = tdev2adap(tdev); + struct tid_range *tid; + struct mtutab *mtup; + struct iff_mac *iffmacp; + struct ddp_params *ddpp; + struct adap_ports *ports; + struct ofld_page_info *rx_page_info; + struct tp_params *tp = &adapter->params.tp; + int i; + + switch (req) { + case GET_MAX_OUTSTANDING_WR: + *(unsigned int *)data = FW_WR_NUM; + break; + case GET_WR_LEN: + *(unsigned int *)data = WR_FLITS; + break; + case GET_TX_MAX_CHUNK: + *(unsigned int *)data = 1 << 20; /* 1MB */ + break; + case GET_TID_RANGE: + tid = data; + tid->num = t3_mc5_size(&adapter->mc5) - + adapter->params.mc5.nroutes - + adapter->params.mc5.nfilters - adapter->params.mc5.nservers; + tid->base = 0; + break; + case GET_STID_RANGE: + tid = data; + tid->num = adapter->params.mc5.nservers; + tid->base = t3_mc5_size(&adapter->mc5) - tid->num - + adapter->params.mc5.nfilters - adapter->params.mc5.nroutes; + break; + case GET_L2T_CAPACITY: + *(unsigned int *)data = 2048; + break; + case GET_MTUS: + mtup = data; + mtup->size = NMTUS; + mtup->mtus = adapter->params.mtus; + break; + case GET_IFF_FROM_MAC: + iffmacp = data; + iffmacp->dev = get_iff_from_mac(adapter, iffmacp->mac_addr, + iffmacp->vlan_tag & + VLAN_VID_MASK); + break; + case GET_DDP_PARAMS: + ddpp = data; + ddpp->llimit = t3_read_reg(adapter, A_ULPRX_TDDP_LLIMIT); + ddpp->ulimit = t3_read_reg(adapter, A_ULPRX_TDDP_ULIMIT); + ddpp->tag_mask = t3_read_reg(adapter, A_ULPRX_TDDP_TAGMASK); + break; + case GET_PORTS: + ports = data; + ports->nports = adapter->params.nports; + for_each_port(adapter, i) + ports->lldevs[i] = adapter->port[i]; + break; + case ULP_ISCSI_GET_PARAMS: + case ULP_ISCSI_SET_PARAMS: + if (!offload_running(adapter)) + return -EAGAIN; + return cxgb_ulp_iscsi_ctl(adapter, req, data); + case RDMA_GET_PARAMS: + case RDMA_CQ_OP: + case RDMA_CQ_SETUP: + case RDMA_CQ_DISABLE: + case RDMA_CTRL_QP_SETUP: + case RDMA_GET_MEM: + case RDMA_GET_MIB: + if (!offload_running(adapter)) + return -EAGAIN; + return cxgb_rdma_ctl(adapter, req, data); + case GET_RX_PAGE_INFO: + rx_page_info = data; + rx_page_info->page_size = tp->rx_pg_size; + rx_page_info->num = tp->rx_num_pgs; + break; + case GET_ISCSI_IPV4ADDR: { + struct iscsi_ipv4addr *p = data; + struct port_info *pi = netdev_priv(p->dev); + p->ipv4addr = pi->iscsi_ipv4addr; + break; + } + case GET_EMBEDDED_INFO: { + struct ch_embedded_info *e = data; + + spin_lock(&adapter->stats_lock); + t3_get_fw_version(adapter, &e->fw_vers); + t3_get_tp_version(adapter, &e->tp_vers); + spin_unlock(&adapter->stats_lock); + break; + } + default: + return -EOPNOTSUPP; + } + return 0; +} + +/* + * Dummy handler for Rx offload packets in case we get an offload packet before + * proper processing is setup. This complains and drops the packet as it isn't + * normal to get offload packets at this stage. 
+ */ +static int rx_offload_blackhole(struct t3cdev *dev, struct sk_buff **skbs, + int n) +{ + while (n--) + dev_kfree_skb_any(skbs[n]); + return 0; +} + +static void dummy_neigh_update(struct t3cdev *dev, struct neighbour *neigh) +{ +} + +void cxgb3_set_dummy_ops(struct t3cdev *dev) +{ + dev->recv = rx_offload_blackhole; + dev->neigh_update = dummy_neigh_update; +} + +/* + * Free an active-open TID. + */ +void *cxgb3_free_atid(struct t3cdev *tdev, int atid) +{ + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + union active_open_entry *p = atid2entry(t, atid); + void *ctx = p->t3c_tid.ctx; + + spin_lock_bh(&t->atid_lock); + p->next = t->afree; + t->afree = p; + t->atids_in_use--; + spin_unlock_bh(&t->atid_lock); + + return ctx; +} + +EXPORT_SYMBOL(cxgb3_free_atid); + +/* + * Free a server TID and return it to the free pool. + */ +void cxgb3_free_stid(struct t3cdev *tdev, int stid) +{ + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + union listen_entry *p = stid2entry(t, stid); + + spin_lock_bh(&t->stid_lock); + p->next = t->sfree; + t->sfree = p; + t->stids_in_use--; + spin_unlock_bh(&t->stid_lock); +} + +EXPORT_SYMBOL(cxgb3_free_stid); + +void cxgb3_insert_tid(struct t3cdev *tdev, struct cxgb3_client *client, + void *ctx, unsigned int tid) +{ + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + + t->tid_tab[tid].client = client; + t->tid_tab[tid].ctx = ctx; + atomic_inc(&t->tids_in_use); +} + +EXPORT_SYMBOL(cxgb3_insert_tid); + +/* + * Populate a TID_RELEASE WR. The skb must be already propely sized. + */ +static inline void mk_tid_release(struct sk_buff *skb, unsigned int tid) +{ + struct cpl_tid_release *req; + + skb->priority = CPL_PRIORITY_SETUP; + req = __skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); +} + +static void t3_process_tid_release_list(struct work_struct *work) +{ + struct t3c_data *td = container_of(work, struct t3c_data, + tid_release_task); + struct sk_buff *skb; + struct t3cdev *tdev = td->dev; + + + spin_lock_bh(&td->tid_release_lock); + while (td->tid_release_list) { + struct t3c_tid_entry *p = td->tid_release_list; + + td->tid_release_list = p->ctx; + spin_unlock_bh(&td->tid_release_lock); + + skb = alloc_skb(sizeof(struct cpl_tid_release), + GFP_KERNEL); + if (!skb) + skb = td->nofail_skb; + if (!skb) { + spin_lock_bh(&td->tid_release_lock); + p->ctx = (void *)td->tid_release_list; + td->tid_release_list = p; + break; + } + mk_tid_release(skb, p - td->tid_maps.tid_tab); + cxgb3_ofld_send(tdev, skb); + p->ctx = NULL; + if (skb == td->nofail_skb) + td->nofail_skb = + alloc_skb(sizeof(struct cpl_tid_release), + GFP_KERNEL); + spin_lock_bh(&td->tid_release_lock); + } + td->release_list_incomplete = (td->tid_release_list == NULL) ? 0 : 1; + spin_unlock_bh(&td->tid_release_lock); + + if (!td->nofail_skb) + td->nofail_skb = + alloc_skb(sizeof(struct cpl_tid_release), + GFP_KERNEL); +} + +/* use ctx as a next pointer in the tid release list */ +void cxgb3_queue_tid_release(struct t3cdev *tdev, unsigned int tid) +{ + struct t3c_data *td = T3C_DATA(tdev); + struct t3c_tid_entry *p = &td->tid_maps.tid_tab[tid]; + + spin_lock_bh(&td->tid_release_lock); + p->ctx = (void *)td->tid_release_list; + p->client = NULL; + td->tid_release_list = p; + if (!p->ctx || td->release_list_incomplete) + schedule_work(&td->tid_release_task); + spin_unlock_bh(&td->tid_release_lock); +} + +EXPORT_SYMBOL(cxgb3_queue_tid_release); + +/* + * Remove a tid from the TID table. 
A client may defer processing its last + * CPL message if it is locked at the time it arrives, and while the message + * sits in the client's backlog the TID may be reused for another connection. + * To handle this we atomically switch the TID association if it still points + * to the original client context. + */ +void cxgb3_remove_tid(struct t3cdev *tdev, void *ctx, unsigned int tid) +{ + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + + BUG_ON(tid >= t->ntids); + if (tdev->type == T3A) + (void)cmpxchg(&t->tid_tab[tid].ctx, ctx, NULL); + else { + struct sk_buff *skb; + + skb = alloc_skb(sizeof(struct cpl_tid_release), GFP_ATOMIC); + if (likely(skb)) { + mk_tid_release(skb, tid); + cxgb3_ofld_send(tdev, skb); + t->tid_tab[tid].ctx = NULL; + } else + cxgb3_queue_tid_release(tdev, tid); + } + atomic_dec(&t->tids_in_use); +} + +EXPORT_SYMBOL(cxgb3_remove_tid); + +int cxgb3_alloc_atid(struct t3cdev *tdev, struct cxgb3_client *client, + void *ctx) +{ + int atid = -1; + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + + spin_lock_bh(&t->atid_lock); + if (t->afree && + t->atids_in_use + atomic_read(&t->tids_in_use) + MC5_MIN_TIDS <= + t->ntids) { + union active_open_entry *p = t->afree; + + atid = (p - t->atid_tab) + t->atid_base; + t->afree = p->next; + p->t3c_tid.ctx = ctx; + p->t3c_tid.client = client; + t->atids_in_use++; + } + spin_unlock_bh(&t->atid_lock); + return atid; +} + +EXPORT_SYMBOL(cxgb3_alloc_atid); + +int cxgb3_alloc_stid(struct t3cdev *tdev, struct cxgb3_client *client, + void *ctx) +{ + int stid = -1; + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + + spin_lock_bh(&t->stid_lock); + if (t->sfree) { + union listen_entry *p = t->sfree; + + stid = (p - t->stid_tab) + t->stid_base; + t->sfree = p->next; + p->t3c_tid.ctx = ctx; + p->t3c_tid.client = client; + t->stids_in_use++; + } + spin_unlock_bh(&t->stid_lock); + return stid; +} + +EXPORT_SYMBOL(cxgb3_alloc_stid); + +/* Get the t3cdev associated with a net_device */ +struct t3cdev *dev2t3cdev(struct net_device *dev) +{ + const struct port_info *pi = netdev_priv(dev); + + return (struct t3cdev *)pi->adapter; +} + +EXPORT_SYMBOL(dev2t3cdev); + +static int do_smt_write_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_smt_write_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) + pr_err("Unexpected SMT_WRITE_RPL status %u for entry %u\n", + rpl->status, GET_TID(rpl)); + + return CPL_RET_BUF_DONE; +} + +static int do_l2t_write_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_l2t_write_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) + pr_err("Unexpected L2T_WRITE_RPL status %u for entry %u\n", + rpl->status, GET_TID(rpl)); + + return CPL_RET_BUF_DONE; +} + +static int do_rte_write_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_rte_write_rpl *rpl = cplhdr(skb); + + if (rpl->status != CPL_ERR_NONE) + pr_err("Unexpected RTE_WRITE_RPL status %u for entry %u\n", + rpl->status, GET_TID(rpl)); + + return CPL_RET_BUF_DONE; +} + +static int do_act_open_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_act_open_rpl *rpl = cplhdr(skb); + unsigned int atid = G_TID(ntohl(rpl->atid)); + struct t3c_tid_entry *t3c_tid; + + t3c_tid = lookup_atid(&(T3C_DATA(dev))->tid_maps, atid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client && + t3c_tid->client->handlers && + t3c_tid->client->handlers[CPL_ACT_OPEN_RPL]) { + return t3c_tid->client->handlers[CPL_ACT_OPEN_RPL] (dev, skb, + t3c_tid-> + ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, 
CPL_ACT_OPEN_RPL); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int do_stid_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + union opcode_tid *p = cplhdr(skb); + unsigned int stid = G_TID(ntohl(p->opcode_tid)); + struct t3c_tid_entry *t3c_tid; + + t3c_tid = lookup_stid(&(T3C_DATA(dev))->tid_maps, stid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[p->opcode]) { + return t3c_tid->client->handlers[p->opcode] (dev, skb, + t3c_tid->ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, p->opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int do_hwtid_rpl(struct t3cdev *dev, struct sk_buff *skb) +{ + union opcode_tid *p = cplhdr(skb); + unsigned int hwtid = G_TID(ntohl(p->opcode_tid)); + struct t3c_tid_entry *t3c_tid; + + t3c_tid = lookup_tid(&(T3C_DATA(dev))->tid_maps, hwtid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[p->opcode]) { + return t3c_tid->client->handlers[p->opcode] + (dev, skb, t3c_tid->ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, p->opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int do_cr(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_pass_accept_req *req = cplhdr(skb); + unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + struct tid_info *t = &(T3C_DATA(dev))->tid_maps; + struct t3c_tid_entry *t3c_tid; + unsigned int tid = GET_TID(req); + + if (unlikely(tid >= t->ntids)) { + printk("%s: passive open TID %u too large\n", + dev->name, tid); + t3_fatal_err(tdev2adap(dev)); + return CPL_RET_BUF_DONE; + } + + t3c_tid = lookup_stid(t, stid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[CPL_PASS_ACCEPT_REQ]) { + return t3c_tid->client->handlers[CPL_PASS_ACCEPT_REQ] + (dev, skb, t3c_tid->ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, CPL_PASS_ACCEPT_REQ); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +/* + * Returns an sk_buff for a reply CPL message of size len. If the input + * sk_buff has no other users it is trimmed and reused, otherwise a new buffer + * is allocated. The input skb must be of size at least len. Note that this + * operation does not destroy the original skb data even if it decides to reuse + * the buffer. 
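+ *
+ * In the reuse path the skb's reference count is bumped via skb_get(), so
+ * the trimmed buffer stays valid for the reply even after the caller drops
+ * its own reference.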
+ */ +static struct sk_buff *cxgb3_get_cpl_reply_skb(struct sk_buff *skb, size_t len, + gfp_t gfp) +{ + if (likely(!skb_cloned(skb))) { + BUG_ON(skb->len < len); + __skb_trim(skb, len); + skb_get(skb); + } else { + skb = alloc_skb(len, gfp); + if (skb) + __skb_put(skb, len); + } + return skb; +} + +static int do_abort_req_rss(struct t3cdev *dev, struct sk_buff *skb) +{ + union opcode_tid *p = cplhdr(skb); + unsigned int hwtid = G_TID(ntohl(p->opcode_tid)); + struct t3c_tid_entry *t3c_tid; + + t3c_tid = lookup_tid(&(T3C_DATA(dev))->tid_maps, hwtid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[p->opcode]) { + return t3c_tid->client->handlers[p->opcode] + (dev, skb, t3c_tid->ctx); + } else { + struct cpl_abort_req_rss *req = cplhdr(skb); + struct cpl_abort_rpl *rpl; + struct sk_buff *reply_skb; + unsigned int tid = GET_TID(req); + u8 cmd = req->status; + + if (req->status == CPL_ERR_RTX_NEG_ADVICE || + req->status == CPL_ERR_PERSIST_NEG_ADVICE) + goto out; + + reply_skb = cxgb3_get_cpl_reply_skb(skb, + sizeof(struct + cpl_abort_rpl), + GFP_ATOMIC); + + if (!reply_skb) { + printk("do_abort_req_rss: couldn't get skb!\n"); + goto out; + } + reply_skb->priority = CPL_PRIORITY_DATA; + __skb_put(reply_skb, sizeof(struct cpl_abort_rpl)); + rpl = cplhdr(reply_skb); + rpl->wr.wr_hi = + htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); + rpl->cmd = cmd; + cxgb3_ofld_send(dev, reply_skb); +out: + return CPL_RET_BUF_DONE; + } +} + +static int do_act_establish(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_act_establish *req = cplhdr(skb); + unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + struct tid_info *t = &(T3C_DATA(dev))->tid_maps; + struct t3c_tid_entry *t3c_tid; + unsigned int tid = GET_TID(req); + + if (unlikely(tid >= t->ntids)) { + printk("%s: active establish TID %u too large\n", + dev->name, tid); + t3_fatal_err(tdev2adap(dev)); + return CPL_RET_BUF_DONE; + } + + t3c_tid = lookup_atid(t, atid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[CPL_ACT_ESTABLISH]) { + return t3c_tid->client->handlers[CPL_ACT_ESTABLISH] + (dev, skb, t3c_tid->ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, CPL_ACT_ESTABLISH); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int do_trace(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_trace_pkt *p = cplhdr(skb); + + skb->protocol = htons(0xffff); + skb->dev = dev->lldev; + skb_pull(skb, sizeof(*p)); + skb_reset_mac_header(skb); + netif_receive_skb(skb); + return 0; +} + +/* + * That skb would better have come from process_responses() where we abuse + * ->priority and ->csum to carry our data. NB: if we get to per-arch + * ->csum, the things might get really interesting here. 
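+ *
+ * get_hwtid() below recovers the TID from bits 8..27 of ->priority and
+ * get_opcode() recovers the CPL opcode from ->csum via G_OPCODE().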
+ */ + +static inline u32 get_hwtid(struct sk_buff *skb) +{ + return ntohl((__force __be32)skb->priority) >> 8 & 0xfffff; +} + +static inline u32 get_opcode(struct sk_buff *skb) +{ + return G_OPCODE(ntohl((__force __be32)skb->csum)); +} + +static int do_term(struct t3cdev *dev, struct sk_buff *skb) +{ + unsigned int hwtid = get_hwtid(skb); + unsigned int opcode = get_opcode(skb); + struct t3c_tid_entry *t3c_tid; + + t3c_tid = lookup_tid(&(T3C_DATA(dev))->tid_maps, hwtid); + if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers && + t3c_tid->client->handlers[opcode]) { + return t3c_tid->client->handlers[opcode] (dev, skb, + t3c_tid->ctx); + } else { + pr_err("%s: received clientless CPL command 0x%x\n", + dev->name, opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int nb_callback(struct notifier_block *self, unsigned long event, + void *ctx) +{ + switch (event) { + case (NETEVENT_NEIGH_UPDATE):{ + cxgb_neigh_update((struct neighbour *)ctx); + break; + } + case (NETEVENT_REDIRECT):{ + struct netevent_redirect *nr = ctx; + cxgb_redirect(nr->old, nr->new, nr->neigh, + nr->daddr); + cxgb_neigh_update(nr->neigh); + break; + } + default: + break; + } + return 0; +} + +static struct notifier_block nb = { + .notifier_call = nb_callback +}; + +/* + * Process a received packet with an unknown/unexpected CPL opcode. + */ +static int do_bad_cpl(struct t3cdev *dev, struct sk_buff *skb) +{ + pr_err("%s: received bad CPL command 0x%x\n", dev->name, *skb->data); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; +} + +/* + * Handlers for each CPL opcode + */ +static cpl_handler_func cpl_handlers[NUM_CPL_CMDS]; + +/* + * Add a new handler to the CPL dispatch table. A NULL handler may be supplied + * to unregister an existing handler. + */ +void t3_register_cpl_handler(unsigned int opcode, cpl_handler_func h) +{ + if (opcode < NUM_CPL_CMDS) + cpl_handlers[opcode] = h ? h : do_bad_cpl; + else + pr_err("T3C: handler registration for opcode %x failed\n", + opcode); +} + +EXPORT_SYMBOL(t3_register_cpl_handler); + +/* + * T3CDEV's receive method. + */ +static int process_rx(struct t3cdev *dev, struct sk_buff **skbs, int n) +{ + while (n--) { + struct sk_buff *skb = *skbs++; + unsigned int opcode = get_opcode(skb); + int ret = cpl_handlers[opcode] (dev, skb); + +#if VALIDATE_TID + if (ret & CPL_RET_UNKNOWN_TID) { + union opcode_tid *p = cplhdr(skb); + + pr_err("%s: CPL message (opcode %u) had unknown TID %u\n", + dev->name, opcode, G_TID(ntohl(p->opcode_tid))); + } +#endif + if (ret & CPL_RET_BUF_DONE) + kfree_skb(skb); + } + return 0; +} + +/* + * Sends an sk_buff to a T3C driver after dealing with any active network taps. 
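+ *
+ * Bottom halves are disabled for the duration of the ->send() call.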
+ */ +int cxgb3_ofld_send(struct t3cdev *dev, struct sk_buff *skb) +{ + int r; + + local_bh_disable(); + r = dev->send(dev, skb); + local_bh_enable(); + return r; +} + +EXPORT_SYMBOL(cxgb3_ofld_send); + +static int is_offloading(struct net_device *dev) +{ + struct adapter *adapter; + int i; + + read_lock_bh(&adapter_list_lock); + list_for_each_entry(adapter, &adapter_list, adapter_list) { + for_each_port(adapter, i) { + if (dev == adapter->port[i]) { + read_unlock_bh(&adapter_list_lock); + return 1; + } + } + } + read_unlock_bh(&adapter_list_lock); + return 0; +} + +static void cxgb_neigh_update(struct neighbour *neigh) +{ + struct net_device *dev; + + if (!neigh) + return; + dev = neigh->dev; + if (dev && (is_offloading(dev))) { + struct t3cdev *tdev = dev2t3cdev(dev); + + BUG_ON(!tdev); + t3_l2t_update(tdev, neigh); + } +} + +static void set_l2t_ix(struct t3cdev *tdev, u32 tid, struct l2t_entry *e) +{ + struct sk_buff *skb; + struct cpl_set_tcb_field *req; + + skb = alloc_skb(sizeof(*req), GFP_ATOMIC); + if (!skb) { + pr_err("%s: cannot allocate skb!\n", __func__); + return; + } + skb->priority = CPL_PRIORITY_CONTROL; + req = skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_L2T_IX); + req->mask = cpu_to_be64(V_TCB_L2T_IX(M_TCB_L2T_IX)); + req->val = cpu_to_be64(V_TCB_L2T_IX(e->idx)); + tdev->send(tdev, skb); +} + +static void cxgb_redirect(struct dst_entry *old, struct dst_entry *new, + struct neighbour *neigh, + const void *daddr) +{ + struct net_device *dev; + struct tid_info *ti; + struct t3cdev *tdev; + u32 tid; + int update_tcb; + struct l2t_entry *e; + struct t3c_tid_entry *te; + + dev = neigh->dev; + + if (!is_offloading(dev)) + return; + tdev = dev2t3cdev(dev); + BUG_ON(!tdev); + + /* Add new L2T entry */ + e = t3_l2t_get(tdev, new, dev, daddr); + if (!e) { + pr_err("%s: couldn't allocate new l2t entry!\n", __func__); + return; + } + + /* Walk tid table and notify clients of dst change. */ + ti = &(T3C_DATA(tdev))->tid_maps; + for (tid = 0; tid < ti->ntids; tid++) { + te = lookup_tid(ti, tid); + BUG_ON(!te); + if (te && te->ctx && te->client && te->client->redirect) { + update_tcb = te->client->redirect(te->ctx, old, new, e); + if (update_tcb) { + rcu_read_lock(); + l2t_hold(L2DATA(tdev), e); + rcu_read_unlock(); + set_l2t_ix(tdev, tid, e); + } + } + } + l2t_release(tdev, e); +} + +/* + * Allocate and initialize the TID tables. Returns 0 on success. + */ +static int init_tid_tabs(struct tid_info *t, unsigned int ntids, + unsigned int natids, unsigned int nstids, + unsigned int atid_base, unsigned int stid_base) +{ + unsigned long size = ntids * sizeof(*t->tid_tab) + + natids * sizeof(*t->atid_tab) + nstids * sizeof(*t->stid_tab); + + t->tid_tab = kvzalloc(size, GFP_KERNEL); + if (!t->tid_tab) + return -ENOMEM; + + t->stid_tab = (union listen_entry *)&t->tid_tab[ntids]; + t->atid_tab = (union active_open_entry *)&t->stid_tab[nstids]; + t->ntids = ntids; + t->nstids = nstids; + t->stid_base = stid_base; + t->sfree = NULL; + t->natids = natids; + t->atid_base = atid_base; + t->afree = NULL; + t->stids_in_use = t->atids_in_use = 0; + atomic_set(&t->tids_in_use, 0); + spin_lock_init(&t->stid_lock); + spin_lock_init(&t->atid_lock); + + /* + * Setup the free lists for stid_tab and atid_tab. 
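+ * Each free list is threaded through the entries' embedded next pointers;
+ * the final entry's next is left NULL by the earlier kvzalloc().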
+ */ + if (nstids) { + while (--nstids) + t->stid_tab[nstids - 1].next = &t->stid_tab[nstids]; + t->sfree = t->stid_tab; + } + if (natids) { + while (--natids) + t->atid_tab[natids - 1].next = &t->atid_tab[natids]; + t->afree = t->atid_tab; + } + return 0; +} + +static void free_tid_maps(struct tid_info *t) +{ + kvfree(t->tid_tab); +} + +static inline void add_adapter(struct adapter *adap) +{ + write_lock_bh(&adapter_list_lock); + list_add_tail(&adap->adapter_list, &adapter_list); + write_unlock_bh(&adapter_list_lock); +} + +static inline void remove_adapter(struct adapter *adap) +{ + write_lock_bh(&adapter_list_lock); + list_del(&adap->adapter_list); + write_unlock_bh(&adapter_list_lock); +} + +int cxgb3_offload_activate(struct adapter *adapter) +{ + struct t3cdev *dev = &adapter->tdev; + int natids, err; + struct t3c_data *t; + struct tid_range stid_range, tid_range; + struct mtutab mtutab; + unsigned int l2t_capacity; + struct l2t_data *l2td; + + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return -ENOMEM; + + err = -EOPNOTSUPP; + if (dev->ctl(dev, GET_TX_MAX_CHUNK, &t->tx_max_chunk) < 0 || + dev->ctl(dev, GET_MAX_OUTSTANDING_WR, &t->max_wrs) < 0 || + dev->ctl(dev, GET_L2T_CAPACITY, &l2t_capacity) < 0 || + dev->ctl(dev, GET_MTUS, &mtutab) < 0 || + dev->ctl(dev, GET_TID_RANGE, &tid_range) < 0 || + dev->ctl(dev, GET_STID_RANGE, &stid_range) < 0) + goto out_free; + + err = -ENOMEM; + l2td = t3_init_l2t(l2t_capacity); + if (!l2td) + goto out_free; + + natids = min(tid_range.num / 2, MAX_ATIDS); + err = init_tid_tabs(&t->tid_maps, tid_range.num, natids, + stid_range.num, ATID_BASE, stid_range.base); + if (err) + goto out_free_l2t; + + t->mtus = mtutab.mtus; + t->nmtus = mtutab.size; + + INIT_WORK(&t->tid_release_task, t3_process_tid_release_list); + spin_lock_init(&t->tid_release_lock); + INIT_LIST_HEAD(&t->list_node); + t->dev = dev; + + RCU_INIT_POINTER(dev->l2opt, l2td); + T3C_DATA(dev) = t; + dev->recv = process_rx; + dev->neigh_update = t3_l2t_update; + + /* Register netevent handler once */ + if (list_empty(&adapter_list)) + register_netevent_notifier(&nb); + + t->nofail_skb = alloc_skb(sizeof(struct cpl_tid_release), GFP_KERNEL); + t->release_list_incomplete = 0; + + add_adapter(adapter); + return 0; + +out_free_l2t: + kvfree(l2td); +out_free: + kfree(t); + return err; +} + +static void clean_l2_data(struct rcu_head *head) +{ + struct l2t_data *d = container_of(head, struct l2t_data, rcu_head); + kvfree(d); +} + + +void cxgb3_offload_deactivate(struct adapter *adapter) +{ + struct t3cdev *tdev = &adapter->tdev; + struct t3c_data *t = T3C_DATA(tdev); + struct l2t_data *d; + + remove_adapter(adapter); + if (list_empty(&adapter_list)) + unregister_netevent_notifier(&nb); + + free_tid_maps(&t->tid_maps); + T3C_DATA(tdev) = NULL; + rcu_read_lock(); + d = L2DATA(tdev); + rcu_read_unlock(); + RCU_INIT_POINTER(tdev->l2opt, NULL); + call_rcu(&d->rcu_head, clean_l2_data); + if (t->nofail_skb) + kfree_skb(t->nofail_skb); + kfree(t); +} + +static inline void register_tdev(struct t3cdev *tdev) +{ + static int unit; + + mutex_lock(&cxgb3_db_lock); + snprintf(tdev->name, sizeof(tdev->name), "ofld_dev%d", unit++); + list_add_tail(&tdev->ofld_dev_list, &ofld_dev_list); + mutex_unlock(&cxgb3_db_lock); +} + +static inline void unregister_tdev(struct t3cdev *tdev) +{ + mutex_lock(&cxgb3_db_lock); + list_del(&tdev->ofld_dev_list); + mutex_unlock(&cxgb3_db_lock); +} + +static inline int adap2type(struct adapter *adapter) +{ + int type = 0; + + switch (adapter->params.rev) { + case T3_REV_A: + type = 
T3A; + break; + case T3_REV_B: + case T3_REV_B2: + type = T3B; + break; + case T3_REV_C: + type = T3C; + break; + } + return type; +} + +void cxgb3_adapter_ofld(struct adapter *adapter) +{ + struct t3cdev *tdev = &adapter->tdev; + + INIT_LIST_HEAD(&tdev->ofld_dev_list); + + cxgb3_set_dummy_ops(tdev); + tdev->send = t3_offload_tx; + tdev->ctl = cxgb_offload_ctl; + tdev->type = adap2type(adapter); + + register_tdev(tdev); +} + +void cxgb3_adapter_unofld(struct adapter *adapter) +{ + struct t3cdev *tdev = &adapter->tdev; + + tdev->recv = NULL; + tdev->neigh_update = NULL; + + unregister_tdev(tdev); +} + +void __init cxgb3_offload_init(void) +{ + int i; + + for (i = 0; i < NUM_CPL_CMDS; ++i) + cpl_handlers[i] = do_bad_cpl; + + t3_register_cpl_handler(CPL_SMT_WRITE_RPL, do_smt_write_rpl); + t3_register_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl); + t3_register_cpl_handler(CPL_RTE_WRITE_RPL, do_rte_write_rpl); + t3_register_cpl_handler(CPL_PASS_OPEN_RPL, do_stid_rpl); + t3_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_stid_rpl); + t3_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_cr); + t3_register_cpl_handler(CPL_PASS_ESTABLISH, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_RPL_RSS, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_URG_NOTIFY, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_DATA, do_hwtid_rpl); + t3_register_cpl_handler(CPL_TX_DATA_ACK, do_hwtid_rpl); + t3_register_cpl_handler(CPL_TX_DMA_ACK, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); + t3_register_cpl_handler(CPL_PEER_CLOSE, do_hwtid_rpl); + t3_register_cpl_handler(CPL_CLOSE_CON_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req_rss); + t3_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); + t3_register_cpl_handler(CPL_SET_TCB_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_GET_TCB_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RDMA_TERMINATE, do_term); + t3_register_cpl_handler(CPL_RDMA_EC_STATUS, do_hwtid_rpl); + t3_register_cpl_handler(CPL_TRACE_PKT, do_trace); + t3_register_cpl_handler(CPL_RX_DATA_DDP, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ISCSI_HDR, do_hwtid_rpl); +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h new file mode 100644 index 0000000..929c298 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2006-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _CXGB3_OFFLOAD_H +#define _CXGB3_OFFLOAD_H + +#include +#include + +#include "l2t.h" + +#include "t3cdev.h" +#include "t3_cpl.h" + +struct adapter; + +void cxgb3_offload_init(void); + +void cxgb3_adapter_ofld(struct adapter *adapter); +void cxgb3_adapter_unofld(struct adapter *adapter); +int cxgb3_offload_activate(struct adapter *adapter); +void cxgb3_offload_deactivate(struct adapter *adapter); + +void cxgb3_set_dummy_ops(struct t3cdev *dev); + +struct t3cdev *dev2t3cdev(struct net_device *dev); + +/* + * Client registration. Users of T3 driver must register themselves. + * The T3 driver will call the add function of every client for each T3 + * adapter activated, passing up the t3cdev ptr. Each client fills out an + * array of callback functions to process CPL messages. + */ + +void cxgb3_register_client(struct cxgb3_client *client); +void cxgb3_unregister_client(struct cxgb3_client *client); +void cxgb3_add_clients(struct t3cdev *tdev); +void cxgb3_remove_clients(struct t3cdev *tdev); +void cxgb3_event_notify(struct t3cdev *tdev, u32 event, u32 port); + +typedef int (*cxgb3_cpl_handler_func)(struct t3cdev *dev, + struct sk_buff *skb, void *ctx); + +enum { + OFFLOAD_STATUS_UP, + OFFLOAD_STATUS_DOWN, + OFFLOAD_PORT_DOWN, + OFFLOAD_PORT_UP, + OFFLOAD_DB_FULL, + OFFLOAD_DB_EMPTY, + OFFLOAD_DB_DROP +}; + +struct cxgb3_client { + char *name; + void (*add) (struct t3cdev *); + void (*remove) (struct t3cdev *); + cxgb3_cpl_handler_func *handlers; + int (*redirect)(void *ctx, struct dst_entry *old, + struct dst_entry *new, struct l2t_entry *l2t); + struct list_head client_list; + void (*event_handler)(struct t3cdev *tdev, u32 event, u32 port); +}; + +/* + * TID allocation services. 
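+ *
+ * atids index the active-open table (outgoing connection setup), stids the
+ * listen/server table, and plain tids the main per-connection table.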
+ */ +int cxgb3_alloc_atid(struct t3cdev *dev, struct cxgb3_client *client, + void *ctx); +int cxgb3_alloc_stid(struct t3cdev *dev, struct cxgb3_client *client, + void *ctx); +void *cxgb3_free_atid(struct t3cdev *dev, int atid); +void cxgb3_free_stid(struct t3cdev *dev, int stid); +void cxgb3_insert_tid(struct t3cdev *dev, struct cxgb3_client *client, + void *ctx, unsigned int tid); +void cxgb3_queue_tid_release(struct t3cdev *dev, unsigned int tid); +void cxgb3_remove_tid(struct t3cdev *dev, void *ctx, unsigned int tid); + +struct t3c_tid_entry { + struct cxgb3_client *client; + void *ctx; +}; + +/* CPL message priority levels */ +enum { + CPL_PRIORITY_DATA = 0, /* data messages */ + CPL_PRIORITY_SETUP = 1, /* connection setup messages */ + CPL_PRIORITY_TEARDOWN = 0, /* connection teardown messages */ + CPL_PRIORITY_LISTEN = 1, /* listen start/stop messages */ + CPL_PRIORITY_ACK = 1, /* RX ACK messages */ + CPL_PRIORITY_CONTROL = 1 /* offload control messages */ +}; + +/* Flags for return value of CPL message handlers */ +enum { + CPL_RET_BUF_DONE = 1, /* buffer processing done, buffer may be freed */ + CPL_RET_BAD_MSG = 2, /* bad CPL message (e.g., unknown opcode) */ + CPL_RET_UNKNOWN_TID = 4 /* unexpected unknown TID */ +}; + +typedef int (*cpl_handler_func)(struct t3cdev *dev, struct sk_buff *skb); + +/* + * Returns a pointer to the first byte of the CPL header in an sk_buff that + * contains a CPL message. + */ +static inline void *cplhdr(struct sk_buff *skb) +{ + return skb->data; +} + +void t3_register_cpl_handler(unsigned int opcode, cpl_handler_func h); + +union listen_entry { + struct t3c_tid_entry t3c_tid; + union listen_entry *next; +}; + +union active_open_entry { + struct t3c_tid_entry t3c_tid; + union active_open_entry *next; +}; + +/* + * Holds the size, base address, free list start, etc of the TID, server TID, + * and active-open TID tables for a offload device. + * The tables themselves are allocated dynamically. + */ +struct tid_info { + struct t3c_tid_entry *tid_tab; + unsigned int ntids; + atomic_t tids_in_use; + + union listen_entry *stid_tab; + unsigned int nstids; + unsigned int stid_base; + + union active_open_entry *atid_tab; + unsigned int natids; + unsigned int atid_base; + + /* + * The following members are accessed R/W so we put them in their own + * cache lines. + * + * XXX We could combine the atid fields above with the lock here since + * atids are use once (unlike other tids). OTOH the above fields are + * usually in cache due to tid_tab. 
+ */ + spinlock_t atid_lock ____cacheline_aligned_in_smp; + union active_open_entry *afree; + unsigned int atids_in_use; + + spinlock_t stid_lock ____cacheline_aligned; + union listen_entry *sfree; + unsigned int stids_in_use; +}; + +struct t3c_data { + struct list_head list_node; + struct t3cdev *dev; + unsigned int tx_max_chunk; /* max payload for TX_DATA */ + unsigned int max_wrs; /* max in-flight WRs per connection */ + unsigned int nmtus; + const unsigned short *mtus; + struct tid_info tid_maps; + + struct t3c_tid_entry *tid_release_list; + spinlock_t tid_release_lock; + struct work_struct tid_release_task; + + struct sk_buff *nofail_skb; + unsigned int release_list_incomplete; +}; + +/* + * t3cdev -> t3c_data accessor + */ +#define T3C_DATA(dev) (*(struct t3c_data **)&(dev)->l4opt) + +#endif diff --git a/drivers/net/ethernet/chelsio/cxgb3/firmware_exports.h b/drivers/net/ethernet/chelsio/cxgb3/firmware_exports.h new file mode 100644 index 0000000..0d9b0e6 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/firmware_exports.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2004-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _FIRMWARE_EXPORTS_H_ +#define _FIRMWARE_EXPORTS_H_ + +/* WR OPCODES supported by the firmware. 
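+ *
+ * An opcode is placed in the wr_hi word of a work request via V_WR_OP(),
+ * e.g. htonl(V_WR_OP(FW_WROPCODE_FORWARD)) as in cxgb3_offload.c's
+ * mk_tid_release().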
+ */ +#define FW_WROPCODE_FORWARD 0x01 +#define FW_WROPCODE_BYPASS 0x05 + +#define FW_WROPCODE_TUNNEL_TX_PKT 0x03 + +#define FW_WROPOCDE_ULPTX_DATA_SGL 0x00 +#define FW_WROPCODE_ULPTX_MEM_READ 0x02 +#define FW_WROPCODE_ULPTX_PKT 0x04 +#define FW_WROPCODE_ULPTX_INVALIDATE 0x06 + +#define FW_WROPCODE_TUNNEL_RX_PKT 0x07 + +#define FW_WROPCODE_OFLD_GETTCB_RPL 0x08 +#define FW_WROPCODE_OFLD_CLOSE_CON 0x09 +#define FW_WROPCODE_OFLD_TP_ABORT_CON_REQ 0x0A +#define FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL 0x0F +#define FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ 0x0B +#define FW_WROPCODE_OFLD_TP_ABORT_CON_RPL 0x0C +#define FW_WROPCODE_OFLD_TX_DATA 0x0D +#define FW_WROPCODE_OFLD_TX_DATA_ACK 0x0E + +#define FW_WROPCODE_RI_RDMA_INIT 0x10 +#define FW_WROPCODE_RI_RDMA_WRITE 0x11 +#define FW_WROPCODE_RI_RDMA_READ_REQ 0x12 +#define FW_WROPCODE_RI_RDMA_READ_RESP 0x13 +#define FW_WROPCODE_RI_SEND 0x14 +#define FW_WROPCODE_RI_TERMINATE 0x15 +#define FW_WROPCODE_RI_RDMA_READ 0x16 +#define FW_WROPCODE_RI_RECEIVE 0x17 +#define FW_WROPCODE_RI_BIND_MW 0x18 +#define FW_WROPCODE_RI_FASTREGISTER_MR 0x19 +#define FW_WROPCODE_RI_LOCAL_INV 0x1A +#define FW_WROPCODE_RI_MODIFY_QP 0x1B +#define FW_WROPCODE_RI_BYPASS 0x1C + +#define FW_WROPOCDE_RSVD 0x1E + +#define FW_WROPCODE_SGE_EGRESSCONTEXT_RR 0x1F + +#define FW_WROPCODE_MNGT 0x1D +#define FW_MNGTOPCODE_PKTSCHED_SET 0x00 + +/* Maximum size of a WR sent from the host, limited by the SGE. + * + * Note: WR coming from ULP or TP are only limited by CIM. + */ +#define FW_WR_SIZE 128 + +/* Maximum number of outstanding WRs sent from the host. Value must be + * programmed in the CTRL/TUNNEL/QP SGE Egress Context and used by + * offload modules to limit the number of WRs per connection. + */ +#define FW_T3_WR_NUM 16 +#define FW_N3_WR_NUM 7 + +#ifndef N3 +# define FW_WR_NUM FW_T3_WR_NUM +#else +# define FW_WR_NUM FW_N3_WR_NUM +#endif + +/* FW_TUNNEL_NUM corresponds to the number of supported TUNNEL Queues. These + * queues must start at SGE Egress Context FW_TUNNEL_SGEEC_START and must + * start at 'TID' (or 'uP Token') FW_TUNNEL_TID_START. + * + * Ingress Traffic (e.g. DMA completion credit) for TUNNEL Queue[i] is sent + * to RESP Queue[i]. + */ +#define FW_TUNNEL_NUM 8 +#define FW_TUNNEL_SGEEC_START 8 +#define FW_TUNNEL_TID_START 65544 + +/* FW_CTRL_NUM corresponds to the number of supported CTRL Queues. These queues + * must start at SGE Egress Context FW_CTRL_SGEEC_START and must start at 'TID' + * (or 'uP Token') FW_CTRL_TID_START. + * + * Ingress Traffic for CTRL Queue[i] is sent to RESP Queue[i]. + */ +#define FW_CTRL_NUM 8 +#define FW_CTRL_SGEEC_START 65528 +#define FW_CTRL_TID_START 65536 + +/* FW_OFLD_NUM corresponds to the number of supported OFFLOAD Queues. These + * queues must start at SGE Egress Context FW_OFLD_SGEEC_START. + * + * Note: the 'uP Token' in the SGE Egress Context fields is irrelevant for + * OFFLOAD Queues, as the host is responsible for providing the correct TID in + * every WR. + * + * Ingress Trafffic for OFFLOAD Queue[i] is sent to RESP Queue[i]. + */ +#define FW_OFLD_NUM 8 +#define FW_OFLD_SGEEC_START 0 + +/* + * + */ +#define FW_RI_NUM 1 +#define FW_RI_SGEEC_START 65527 +#define FW_RI_TID_START 65552 + +/* + * The RX_PKT_TID + */ +#define FW_RX_PKT_NUM 1 +#define FW_RX_PKT_TID_START 65553 + +/* FW_WRC_NUM corresponds to the number of Work Request Context that supported + * by the firmware. + */ +#define FW_WRC_NUM \ + (65536 + FW_TUNNEL_NUM + FW_CTRL_NUM + FW_RI_NUM + FW_RX_PKT_NUM) + +/* + * FW type and version. 
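+ *
+ * The 32-bit version word packs TYPE in bits 31:28, MAJOR in bits 27:16,
+ * MINOR in bits 15:8 and MICRO in bits 7:0.  A minimal decode sketch
+ * (the local variables are illustrative only):
+ *
+ *	u32 vers;
+ *	unsigned int major, minor;
+ *
+ *	t3_get_fw_version(adapter, &vers);
+ *	major = G_FW_VERSION_MAJOR(vers);
+ *	minor = G_FW_VERSION_MINOR(vers);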
+ */ +#define S_FW_VERSION_TYPE 28 +#define M_FW_VERSION_TYPE 0xF +#define V_FW_VERSION_TYPE(x) ((x) << S_FW_VERSION_TYPE) +#define G_FW_VERSION_TYPE(x) \ + (((x) >> S_FW_VERSION_TYPE) & M_FW_VERSION_TYPE) + +#define S_FW_VERSION_MAJOR 16 +#define M_FW_VERSION_MAJOR 0xFFF +#define V_FW_VERSION_MAJOR(x) ((x) << S_FW_VERSION_MAJOR) +#define G_FW_VERSION_MAJOR(x) \ + (((x) >> S_FW_VERSION_MAJOR) & M_FW_VERSION_MAJOR) + +#define S_FW_VERSION_MINOR 8 +#define M_FW_VERSION_MINOR 0xFF +#define V_FW_VERSION_MINOR(x) ((x) << S_FW_VERSION_MINOR) +#define G_FW_VERSION_MINOR(x) \ + (((x) >> S_FW_VERSION_MINOR) & M_FW_VERSION_MINOR) + +#define S_FW_VERSION_MICRO 0 +#define M_FW_VERSION_MICRO 0xFF +#define V_FW_VERSION_MICRO(x) ((x) << S_FW_VERSION_MICRO) +#define G_FW_VERSION_MICRO(x) \ + (((x) >> S_FW_VERSION_MICRO) & M_FW_VERSION_MICRO) + +#endif /* _FIRMWARE_EXPORTS_H_ */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.c b/drivers/net/ethernet/chelsio/cxgb3/l2t.c new file mode 100644 index 0000000..248e40c --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.c @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include "t3cdev.h" +#include "cxgb3_defs.h" +#include "l2t.h" +#include "t3_cpl.h" +#include "firmware_exports.h" + +#define VLAN_NONE 0xfff + +/* + * Module locking notes: There is a RW lock protecting the L2 table as a + * whole plus a spinlock per L2T entry. Entry lookups and allocations happen + * under the protection of the table lock, individual entry changes happen + * while holding that entry's spinlock. The table lock nests outside the + * entry locks. Allocations of new entries take the table lock as writers so + * no other lookups can happen while allocating new entries. Entry updates + * take the table lock as readers so multiple entries can be updated in + * parallel. An L2T entry can be dropped by decrementing its reference count + * and therefore can happen in parallel with entry allocation but no entry + * can change state or increment its ref count during allocation as both of + * these perform lookups. 
+ */ + +static inline unsigned int vlan_prio(const struct l2t_entry *e) +{ + return e->vlan >> 13; +} + +static inline unsigned int arp_hash(u32 key, int ifindex, + const struct l2t_data *d) +{ + return jhash_2words(key, ifindex, 0) & (d->nentries - 1); +} + +static inline void neigh_replace(struct l2t_entry *e, struct neighbour *n) +{ + neigh_hold(n); + if (e->neigh) + neigh_release(e->neigh); + e->neigh = n; +} + +/* + * Set up an L2T entry and send any packets waiting in the arp queue. The + * supplied skb is used for the CPL_L2T_WRITE_REQ. Must be called with the + * entry locked. + */ +static int setup_l2e_send_pending(struct t3cdev *dev, struct sk_buff *skb, + struct l2t_entry *e) +{ + struct cpl_l2t_write_req *req; + struct sk_buff *tmp; + + if (!skb) { + skb = alloc_skb(sizeof(*req), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + } + + req = __skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx)); + req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) | + V_L2T_W_VLAN(e->vlan & VLAN_VID_MASK) | + V_L2T_W_PRIO(vlan_prio(e))); + memcpy(e->dmac, e->neigh->ha, sizeof(e->dmac)); + memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); + skb->priority = CPL_PRIORITY_CONTROL; + cxgb3_ofld_send(dev, skb); + + skb_queue_walk_safe(&e->arpq, skb, tmp) { + __skb_unlink(skb, &e->arpq); + cxgb3_ofld_send(dev, skb); + } + e->state = L2T_STATE_VALID; + + return 0; +} + +/* + * Add a packet to the an L2T entry's queue of packets awaiting resolution. + * Must be called with the entry's lock held. + */ +static inline void arpq_enqueue(struct l2t_entry *e, struct sk_buff *skb) +{ + __skb_queue_tail(&e->arpq, skb); +} + +int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb, + struct l2t_entry *e) +{ +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + neigh_event_send(e->neigh, NULL); + spin_lock_bh(&e->lock); + if (e->state == L2T_STATE_STALE) + e->state = L2T_STATE_VALID; + spin_unlock_bh(&e->lock); + case L2T_STATE_VALID: /* fast-path, send the packet on */ + return cxgb3_ofld_send(dev, skb); + case L2T_STATE_RESOLVING: + spin_lock_bh(&e->lock); + if (e->state != L2T_STATE_RESOLVING) { + /* ARP already completed */ + spin_unlock_bh(&e->lock); + goto again; + } + arpq_enqueue(e, skb); + spin_unlock_bh(&e->lock); + + /* + * Only the first packet added to the arpq should kick off + * resolution. However, because the alloc_skb below can fail, + * we allow each packet added to the arpq to retry resolution + * as a way of recovering from transient memory exhaustion. + * A better way would be to use a work request to retry L2T + * entries when there's no memory. 
+ */ + if (!neigh_event_send(e->neigh, NULL)) { + skb = alloc_skb(sizeof(struct cpl_l2t_write_req), + GFP_ATOMIC); + if (!skb) + break; + + spin_lock_bh(&e->lock); + if (!skb_queue_empty(&e->arpq)) + setup_l2e_send_pending(dev, skb, e); + else /* we lost the race */ + __kfree_skb(skb); + spin_unlock_bh(&e->lock); + } + } + return 0; +} + +EXPORT_SYMBOL(t3_l2t_send_slow); + +void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e) +{ +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + neigh_event_send(e->neigh, NULL); + spin_lock_bh(&e->lock); + if (e->state == L2T_STATE_STALE) { + e->state = L2T_STATE_VALID; + } + spin_unlock_bh(&e->lock); + return; + case L2T_STATE_VALID: /* fast-path, send the packet on */ + return; + case L2T_STATE_RESOLVING: + spin_lock_bh(&e->lock); + if (e->state != L2T_STATE_RESOLVING) { + /* ARP already completed */ + spin_unlock_bh(&e->lock); + goto again; + } + spin_unlock_bh(&e->lock); + + /* + * Only the first packet added to the arpq should kick off + * resolution. However, because the alloc_skb below can fail, + * we allow each packet added to the arpq to retry resolution + * as a way of recovering from transient memory exhaustion. + * A better way would be to use a work request to retry L2T + * entries when there's no memory. + */ + neigh_event_send(e->neigh, NULL); + } +} + +EXPORT_SYMBOL(t3_l2t_send_event); + +/* + * Allocate a free L2T entry. Must be called with l2t_data.lock held. + */ +static struct l2t_entry *alloc_l2e(struct l2t_data *d) +{ + struct l2t_entry *end, *e, **p; + + if (!atomic_read(&d->nfree)) + return NULL; + + /* there's definitely a free entry */ + for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e) + if (atomic_read(&e->refcnt) == 0) + goto found; + + for (e = &d->l2tab[1]; atomic_read(&e->refcnt); ++e) ; +found: + d->rover = e + 1; + atomic_dec(&d->nfree); + + /* + * The entry we found may be an inactive entry that is + * presently in the hash table. We need to remove it. + */ + if (e->state != L2T_STATE_UNUSED) { + int hash = arp_hash(e->addr, e->ifindex, d); + + for (p = &d->l2tab[hash].first; *p; p = &(*p)->next) + if (*p == e) { + *p = e->next; + break; + } + e->state = L2T_STATE_UNUSED; + } + return e; +} + +/* + * Called when an L2T entry has no more users. The entry is left in the hash + * table since it is likely to be reused but we also bump nfree to indicate + * that the entry can be reallocated for a different neighbor. We also drop + * the existing neighbor reference in case the neighbor is going away and is + * waiting on our reference. + * + * Because entries can be reallocated to other neighbors once their ref count + * drops to 0 we need to take the entry's lock to avoid races with a new + * incarnation. + */ +void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e) +{ + spin_lock_bh(&e->lock); + if (atomic_read(&e->refcnt) == 0) { /* hasn't been recycled */ + if (e->neigh) { + neigh_release(e->neigh); + e->neigh = NULL; + } + } + spin_unlock_bh(&e->lock); + atomic_inc(&d->nfree); +} + +EXPORT_SYMBOL(t3_l2e_free); + +/* + * Update an L2T entry that was previously used for the same next hop as neigh. + * Must be called with softirqs disabled. 
+ */ +static inline void reuse_entry(struct l2t_entry *e, struct neighbour *neigh) +{ + unsigned int nud_state; + + spin_lock(&e->lock); /* avoid race with t3_l2t_free */ + + if (neigh != e->neigh) + neigh_replace(e, neigh); + nud_state = neigh->nud_state; + if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)) || + !(nud_state & NUD_VALID)) + e->state = L2T_STATE_RESOLVING; + else if (nud_state & NUD_CONNECTED) + e->state = L2T_STATE_VALID; + else + e->state = L2T_STATE_STALE; + spin_unlock(&e->lock); +} + +struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct dst_entry *dst, + struct net_device *dev, const void *daddr) +{ + struct l2t_entry *e = NULL; + struct neighbour *neigh; + struct port_info *p; + struct l2t_data *d; + int hash; + u32 addr; + int ifidx; + int smt_idx; + + rcu_read_lock(); + neigh = dst_neigh_lookup(dst, daddr); + if (!neigh) + goto done_rcu; + + addr = *(u32 *) neigh->primary_key; + ifidx = neigh->dev->ifindex; + + if (!dev) + dev = neigh->dev; + p = netdev_priv(dev); + smt_idx = p->port_id; + + d = L2DATA(cdev); + if (!d) + goto done_rcu; + + hash = arp_hash(addr, ifidx, d); + + write_lock_bh(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (e->addr == addr && e->ifindex == ifidx && + e->smt_idx == smt_idx) { + l2t_hold(d, e); + if (atomic_read(&e->refcnt) == 1) + reuse_entry(e, neigh); + goto done_unlock; + } + + /* Need to allocate a new entry */ + e = alloc_l2e(d); + if (e) { + spin_lock(&e->lock); /* avoid race with t3_l2t_free */ + e->next = d->l2tab[hash].first; + d->l2tab[hash].first = e; + e->state = L2T_STATE_RESOLVING; + e->addr = addr; + e->ifindex = ifidx; + e->smt_idx = smt_idx; + atomic_set(&e->refcnt, 1); + neigh_replace(e, neigh); + if (is_vlan_dev(neigh->dev)) + e->vlan = vlan_dev_vlan_id(neigh->dev); + else + e->vlan = VLAN_NONE; + spin_unlock(&e->lock); + } +done_unlock: + write_unlock_bh(&d->lock); +done_rcu: + if (neigh) + neigh_release(neigh); + rcu_read_unlock(); + return e; +} + +EXPORT_SYMBOL(t3_l2t_get); + +/* + * Called when address resolution fails for an L2T entry to handle packets + * on the arpq head. If a packet specifies a failure handler it is invoked, + * otherwise the packets is sent to the offload device. + * + * XXX: maybe we should abandon the latter behavior and just require a failure + * handler. + */ +static void handle_failed_resolution(struct t3cdev *dev, struct sk_buff_head *arpq) +{ + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(arpq, skb, tmp) { + struct l2t_skb_cb *cb = L2T_SKB_CB(skb); + + __skb_unlink(skb, arpq); + if (cb->arp_failure_handler) + cb->arp_failure_handler(dev, skb); + else + cxgb3_ofld_send(dev, skb); + } +} + +/* + * Called when the host's ARP layer makes a change to some entry that is + * loaded into the HW L2 table. 
+ */ +void t3_l2t_update(struct t3cdev *dev, struct neighbour *neigh) +{ + struct sk_buff_head arpq; + struct l2t_entry *e; + struct l2t_data *d = L2DATA(dev); + u32 addr = *(u32 *) neigh->primary_key; + int ifidx = neigh->dev->ifindex; + int hash = arp_hash(addr, ifidx, d); + + read_lock_bh(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (e->addr == addr && e->ifindex == ifidx) { + spin_lock(&e->lock); + goto found; + } + read_unlock_bh(&d->lock); + return; + +found: + __skb_queue_head_init(&arpq); + + read_unlock(&d->lock); + if (atomic_read(&e->refcnt)) { + if (neigh != e->neigh) + neigh_replace(e, neigh); + + if (e->state == L2T_STATE_RESOLVING) { + if (neigh->nud_state & NUD_FAILED) { + skb_queue_splice_init(&e->arpq, &arpq); + } else if (neigh->nud_state & (NUD_CONNECTED|NUD_STALE)) + setup_l2e_send_pending(dev, NULL, e); + } else { + e->state = neigh->nud_state & NUD_CONNECTED ? + L2T_STATE_VALID : L2T_STATE_STALE; + if (!ether_addr_equal(e->dmac, neigh->ha)) + setup_l2e_send_pending(dev, NULL, e); + } + } + spin_unlock_bh(&e->lock); + + if (!skb_queue_empty(&arpq)) + handle_failed_resolution(dev, &arpq); +} + +struct l2t_data *t3_init_l2t(unsigned int l2t_capacity) +{ + struct l2t_data *d; + int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry); + + d = kvzalloc(size, GFP_KERNEL); + if (!d) + return NULL; + + d->nentries = l2t_capacity; + d->rover = &d->l2tab[1]; /* entry 0 is not used */ + atomic_set(&d->nfree, l2t_capacity - 1); + rwlock_init(&d->lock); + + for (i = 0; i < l2t_capacity; ++i) { + d->l2tab[i].idx = i; + d->l2tab[i].state = L2T_STATE_UNUSED; + __skb_queue_head_init(&d->l2tab[i].arpq); + spin_lock_init(&d->l2tab[i].lock); + atomic_set(&d->l2tab[i].refcnt, 0); + } + return d; +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.h b/drivers/net/ethernet/chelsio/cxgb3/l2t.h new file mode 100644 index 0000000..c2fd323 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef _CHELSIO_L2T_H
+#define _CHELSIO_L2T_H
+
+#include <linux/spinlock.h>
+#include "t3cdev.h"
+#include <linux/atomic.h>
+
+enum {
+ L2T_STATE_VALID, /* entry is up to date */
+ L2T_STATE_STALE, /* entry may be used but needs revalidation */
+ L2T_STATE_RESOLVING, /* entry needs address resolution */
+ L2T_STATE_UNUSED /* entry not in use */
+};
+
+struct neighbour;
+struct sk_buff;
+
+/*
+ * Each L2T entry plays multiple roles. First of all, it keeps state for the
+ * corresponding entry of the HW L2 table and maintains a queue of offload
+ * packets awaiting address resolution. Second, it is a node of a hash table
+ * chain, where the nodes of the chain are linked together through their next
+ * pointer. Finally, each node is a bucket of a hash table, pointing to the
+ * first element in its chain through its first pointer.
+ */
+struct l2t_entry {
+ u16 state; /* entry state */
+ u16 idx; /* entry index */
+ u32 addr; /* dest IP address */
+ int ifindex; /* neighbor's net_device's ifindex */
+ u16 smt_idx; /* SMT index */
+ u16 vlan; /* VLAN TCI (id: bits 0-11, prio: 13-15) */
+ struct neighbour *neigh; /* associated neighbour */
+ struct l2t_entry *first; /* start of hash chain */
+ struct l2t_entry *next; /* next l2t_entry on chain */
+ struct sk_buff_head arpq; /* queue of packets awaiting resolution */
+ spinlock_t lock;
+ atomic_t refcnt; /* entry reference count */
+ u8 dmac[6]; /* neighbour's MAC address */
+};
+
+struct l2t_data {
+ unsigned int nentries; /* number of entries */
+ struct l2t_entry *rover; /* starting point for next allocation */
+ atomic_t nfree; /* number of free entries */
+ rwlock_t lock;
+ struct l2t_entry l2tab[0];
+ struct rcu_head rcu_head; /* to handle rcu cleanup */
+};
+
+typedef void (*arp_failure_handler_func)(struct t3cdev * dev,
+ struct sk_buff * skb);
+
+/*
+ * Callback stored in an skb to handle address resolution failure.
+ */
+struct l2t_skb_cb {
+ arp_failure_handler_func arp_failure_handler;
+};
+
+#define L2T_SKB_CB(skb) ((struct l2t_skb_cb *)(skb)->cb)
+
+static inline void set_arp_failure_handler(struct sk_buff *skb,
+ arp_failure_handler_func hnd)
+{
+ L2T_SKB_CB(skb)->arp_failure_handler = hnd;
+}
+
+/*
+ * Getting to the L2 data from an offload device.
+ */ +#define L2DATA(cdev) (rcu_dereference((cdev)->l2opt)) + +#define W_TCB_L2T_IX 0 +#define S_TCB_L2T_IX 7 +#define M_TCB_L2T_IX 0x7ffULL +#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX) + +void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e); +void t3_l2t_update(struct t3cdev *dev, struct neighbour *neigh); +struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct dst_entry *dst, + struct net_device *dev, const void *daddr); +int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb, + struct l2t_entry *e); +void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e); +struct l2t_data *t3_init_l2t(unsigned int l2t_capacity); + +int cxgb3_ofld_send(struct t3cdev *dev, struct sk_buff *skb); + +static inline int l2t_send(struct t3cdev *dev, struct sk_buff *skb, + struct l2t_entry *e) +{ + if (likely(e->state == L2T_STATE_VALID)) + return cxgb3_ofld_send(dev, skb); + return t3_l2t_send_slow(dev, skb, e); +} + +static inline void l2t_release(struct t3cdev *t, struct l2t_entry *e) +{ + struct l2t_data *d; + + rcu_read_lock(); + d = L2DATA(t); + + if (atomic_dec_and_test(&e->refcnt) && d) + t3_l2e_free(d, e); + + rcu_read_unlock(); +} + +static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e) +{ + if (d && atomic_add_return(1, &e->refcnt) == 1) /* 0 -> 1 transition */ + atomic_dec(&d->nfree); +} + +#endif diff --git a/drivers/net/ethernet/chelsio/cxgb3/mc5.c b/drivers/net/ethernet/chelsio/cxgb3/mc5.c new file mode 100644 index 0000000..338301b --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/mc5.c @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "common.h" +#include "regs.h" + +enum { + IDT75P52100 = 4, + IDT75N43102 = 5 +}; + +/* DBGI command mode */ +enum { + DBGI_MODE_MBUS = 0, + DBGI_MODE_IDT52100 = 5 +}; + +/* IDT 75P52100 commands */ +#define IDT_CMD_READ 0 +#define IDT_CMD_WRITE 1 +#define IDT_CMD_SEARCH 2 +#define IDT_CMD_LEARN 3 + +/* IDT LAR register address and value for 144-bit mode (low 32 bits) */ +#define IDT_LAR_ADR0 0x180006 +#define IDT_LAR_MODE144 0xffff0000 + +/* IDT SCR and SSR addresses (low 32 bits) */ +#define IDT_SCR_ADR0 0x180000 +#define IDT_SSR0_ADR0 0x180002 +#define IDT_SSR1_ADR0 0x180004 + +/* IDT GMR base address (low 32 bits) */ +#define IDT_GMR_BASE_ADR0 0x180020 + +/* IDT data and mask array base addresses (low 32 bits) */ +#define IDT_DATARY_BASE_ADR0 0 +#define IDT_MSKARY_BASE_ADR0 0x80000 + +/* IDT 75N43102 commands */ +#define IDT4_CMD_SEARCH144 3 +#define IDT4_CMD_WRITE 4 +#define IDT4_CMD_READ 5 + +/* IDT 75N43102 SCR address (low 32 bits) */ +#define IDT4_SCR_ADR0 0x3 + +/* IDT 75N43102 GMR base addresses (low 32 bits) */ +#define IDT4_GMR_BASE0 0x10 +#define IDT4_GMR_BASE1 0x20 +#define IDT4_GMR_BASE2 0x30 + +/* IDT 75N43102 data and mask array base addresses (low 32 bits) */ +#define IDT4_DATARY_BASE_ADR0 0x1000000 +#define IDT4_MSKARY_BASE_ADR0 0x2000000 + +#define MAX_WRITE_ATTEMPTS 5 + +#define MAX_ROUTES 2048 + +/* + * Issue a command to the TCAM and wait for its completion. The address and + * any data required by the command must have been setup by the caller. + */ +static int mc5_cmd_write(struct adapter *adapter, u32 cmd) +{ + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_CMD, cmd); + return t3_wait_op_done(adapter, A_MC5_DB_DBGI_RSP_STATUS, + F_DBGIRSPVALID, 1, MAX_WRITE_ATTEMPTS, 1); +} + +static inline void dbgi_wr_data3(struct adapter *adapter, u32 v1, u32 v2, + u32 v3) +{ + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_DATA0, v1); + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_DATA1, v2); + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_DATA2, v3); +} + +/* + * Write data to the TCAM register at address (0, 0, addr_lo) using the TCAM + * command cmd. The data to be written must have been set up by the caller. + * Returns -1 on failure, 0 on success. + */ +static int mc5_write(struct adapter *adapter, u32 addr_lo, u32 cmd) +{ + t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_ADDR0, addr_lo); + if (mc5_cmd_write(adapter, cmd) == 0) + return 0; + CH_ERR(adapter, "MC5 timeout writing to TCAM address 0x%x\n", + addr_lo); + return -1; +} + +static int init_mask_data_array(struct mc5 *mc5, u32 mask_array_base, + u32 data_array_base, u32 write_cmd, + int addr_shift) +{ + unsigned int i; + struct adapter *adap = mc5->adapter; + + /* + * We need the size of the TCAM data and mask arrays in terms of + * 72-bit entries. + */ + unsigned int size72 = mc5->tcam_size; + unsigned int server_base = t3_read_reg(adap, A_MC5_DB_SERVER_INDEX); + + if (mc5->mode == MC5_MODE_144_BIT) { + size72 *= 2; /* 1 144-bit entry is 2 72-bit entries */ + server_base *= 2; + } + + /* Clear the data array */ + dbgi_wr_data3(adap, 0, 0, 0); + for (i = 0; i < size72; i++) + if (mc5_write(adap, data_array_base + (i << addr_shift), + write_cmd)) + return -1; + + /* Initialize the mask array. */ + dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0xff); + for (i = 0; i < size72; i++) { + if (i == server_base) /* entering server or routing region */ + t3_write_reg(adap, A_MC5_DB_DBGI_REQ_DATA0, + mc5->mode == MC5_MODE_144_BIT ? 
+ 0xfffffff9 : 0xfffffffd); + if (mc5_write(adap, mask_array_base + (i << addr_shift), + write_cmd)) + return -1; + } + return 0; +} + +static int init_idt52100(struct mc5 *mc5) +{ + int i; + struct adapter *adap = mc5->adapter; + + t3_write_reg(adap, A_MC5_DB_RSP_LATENCY, + V_RDLAT(0x15) | V_LRNLAT(0x15) | V_SRCHLAT(0x15)); + t3_write_reg(adap, A_MC5_DB_PART_ID_INDEX, 2); + + /* + * Use GMRs 14-15 for ELOOKUP, GMRs 12-13 for SYN lookups, and + * GMRs 8-9 for ACK- and AOPEN searches. + */ + t3_write_reg(adap, A_MC5_DB_POPEN_DATA_WR_CMD, IDT_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_POPEN_MASK_WR_CMD, IDT_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_AOPEN_SRCH_CMD, IDT_CMD_SEARCH); + t3_write_reg(adap, A_MC5_DB_AOPEN_LRN_CMD, IDT_CMD_LEARN); + t3_write_reg(adap, A_MC5_DB_SYN_SRCH_CMD, IDT_CMD_SEARCH | 0x6000); + t3_write_reg(adap, A_MC5_DB_SYN_LRN_CMD, IDT_CMD_LEARN); + t3_write_reg(adap, A_MC5_DB_ACK_SRCH_CMD, IDT_CMD_SEARCH); + t3_write_reg(adap, A_MC5_DB_ACK_LRN_CMD, IDT_CMD_LEARN); + t3_write_reg(adap, A_MC5_DB_ILOOKUP_CMD, IDT_CMD_SEARCH); + t3_write_reg(adap, A_MC5_DB_ELOOKUP_CMD, IDT_CMD_SEARCH | 0x7000); + t3_write_reg(adap, A_MC5_DB_DATA_WRITE_CMD, IDT_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_DATA_READ_CMD, IDT_CMD_READ); + + /* Set DBGI command mode for IDT TCAM. */ + t3_write_reg(adap, A_MC5_DB_DBGI_CONFIG, DBGI_MODE_IDT52100); + + /* Set up LAR */ + dbgi_wr_data3(adap, IDT_LAR_MODE144, 0, 0); + if (mc5_write(adap, IDT_LAR_ADR0, IDT_CMD_WRITE)) + goto err; + + /* Set up SSRs */ + dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0); + if (mc5_write(adap, IDT_SSR0_ADR0, IDT_CMD_WRITE) || + mc5_write(adap, IDT_SSR1_ADR0, IDT_CMD_WRITE)) + goto err; + + /* Set up GMRs */ + for (i = 0; i < 32; ++i) { + if (i >= 12 && i < 15) + dbgi_wr_data3(adap, 0xfffffff9, 0xffffffff, 0xff); + else if (i == 15) + dbgi_wr_data3(adap, 0xfffffff9, 0xffff8007, 0xff); + else + dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0xff); + + if (mc5_write(adap, IDT_GMR_BASE_ADR0 + i, IDT_CMD_WRITE)) + goto err; + } + + /* Set up SCR */ + dbgi_wr_data3(adap, 1, 0, 0); + if (mc5_write(adap, IDT_SCR_ADR0, IDT_CMD_WRITE)) + goto err; + + return init_mask_data_array(mc5, IDT_MSKARY_BASE_ADR0, + IDT_DATARY_BASE_ADR0, IDT_CMD_WRITE, 0); +err: + return -EIO; +} + +static int init_idt43102(struct mc5 *mc5) +{ + int i; + struct adapter *adap = mc5->adapter; + + t3_write_reg(adap, A_MC5_DB_RSP_LATENCY, + adap->params.rev == 0 ? V_RDLAT(0xd) | V_SRCHLAT(0x11) : + V_RDLAT(0xd) | V_SRCHLAT(0x12)); + + /* + * Use GMRs 24-25 for ELOOKUP, GMRs 20-21 for SYN lookups, and no mask + * for ACK- and AOPEN searches. + */ + t3_write_reg(adap, A_MC5_DB_POPEN_DATA_WR_CMD, IDT4_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_POPEN_MASK_WR_CMD, IDT4_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_AOPEN_SRCH_CMD, + IDT4_CMD_SEARCH144 | 0x3800); + t3_write_reg(adap, A_MC5_DB_SYN_SRCH_CMD, IDT4_CMD_SEARCH144); + t3_write_reg(adap, A_MC5_DB_ACK_SRCH_CMD, IDT4_CMD_SEARCH144 | 0x3800); + t3_write_reg(adap, A_MC5_DB_ILOOKUP_CMD, IDT4_CMD_SEARCH144 | 0x3800); + t3_write_reg(adap, A_MC5_DB_ELOOKUP_CMD, IDT4_CMD_SEARCH144 | 0x800); + t3_write_reg(adap, A_MC5_DB_DATA_WRITE_CMD, IDT4_CMD_WRITE); + t3_write_reg(adap, A_MC5_DB_DATA_READ_CMD, IDT4_CMD_READ); + + t3_write_reg(adap, A_MC5_DB_PART_ID_INDEX, 3); + + /* Set DBGI command mode for IDT TCAM. 
*/ + t3_write_reg(adap, A_MC5_DB_DBGI_CONFIG, DBGI_MODE_IDT52100); + + /* Set up GMRs */ + dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0xff); + for (i = 0; i < 7; ++i) + if (mc5_write(adap, IDT4_GMR_BASE0 + i, IDT4_CMD_WRITE)) + goto err; + + for (i = 0; i < 4; ++i) + if (mc5_write(adap, IDT4_GMR_BASE2 + i, IDT4_CMD_WRITE)) + goto err; + + dbgi_wr_data3(adap, 0xfffffff9, 0xffffffff, 0xff); + if (mc5_write(adap, IDT4_GMR_BASE1, IDT4_CMD_WRITE) || + mc5_write(adap, IDT4_GMR_BASE1 + 1, IDT4_CMD_WRITE) || + mc5_write(adap, IDT4_GMR_BASE1 + 4, IDT4_CMD_WRITE)) + goto err; + + dbgi_wr_data3(adap, 0xfffffff9, 0xffff8007, 0xff); + if (mc5_write(adap, IDT4_GMR_BASE1 + 5, IDT4_CMD_WRITE)) + goto err; + + /* Set up SCR */ + dbgi_wr_data3(adap, 0xf0000000, 0, 0); + if (mc5_write(adap, IDT4_SCR_ADR0, IDT4_CMD_WRITE)) + goto err; + + return init_mask_data_array(mc5, IDT4_MSKARY_BASE_ADR0, + IDT4_DATARY_BASE_ADR0, IDT4_CMD_WRITE, 1); +err: + return -EIO; +} + +/* Put MC5 in DBGI mode. */ +static inline void mc5_dbgi_mode_enable(const struct mc5 *mc5) +{ + t3_write_reg(mc5->adapter, A_MC5_DB_CONFIG, + V_TMMODE(mc5->mode == MC5_MODE_72_BIT) | F_DBGIEN); +} + +/* Put MC5 in M-Bus mode. */ +static void mc5_dbgi_mode_disable(const struct mc5 *mc5) +{ + t3_write_reg(mc5->adapter, A_MC5_DB_CONFIG, + V_TMMODE(mc5->mode == MC5_MODE_72_BIT) | + V_COMPEN(mc5->mode == MC5_MODE_72_BIT) | + V_PRTYEN(mc5->parity_enabled) | F_MBUSEN); +} + +/* + * Initialization that requires the OS and protocol layers to already + * be initialized goes here. + */ +int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters, + unsigned int nroutes) +{ + u32 cfg; + int err; + unsigned int tcam_size = mc5->tcam_size; + struct adapter *adap = mc5->adapter; + + if (!tcam_size) + return 0; + + if (nroutes > MAX_ROUTES || nroutes + nservers + nfilters > tcam_size) + return -EINVAL; + + /* Reset the TCAM */ + cfg = t3_read_reg(adap, A_MC5_DB_CONFIG) & ~F_TMMODE; + cfg |= V_TMMODE(mc5->mode == MC5_MODE_72_BIT) | F_TMRST; + t3_write_reg(adap, A_MC5_DB_CONFIG, cfg); + if (t3_wait_op_done(adap, A_MC5_DB_CONFIG, F_TMRDY, 1, 500, 0)) { + CH_ERR(adap, "TCAM reset timed out\n"); + return -1; + } + + t3_write_reg(adap, A_MC5_DB_ROUTING_TABLE_INDEX, tcam_size - nroutes); + t3_write_reg(adap, A_MC5_DB_FILTER_TABLE, + tcam_size - nroutes - nfilters); + t3_write_reg(adap, A_MC5_DB_SERVER_INDEX, + tcam_size - nroutes - nfilters - nservers); + + mc5->parity_enabled = 1; + + /* All the TCAM addresses we access have only the low 32 bits non 0 */ + t3_write_reg(adap, A_MC5_DB_DBGI_REQ_ADDR1, 0); + t3_write_reg(adap, A_MC5_DB_DBGI_REQ_ADDR2, 0); + + mc5_dbgi_mode_enable(mc5); + + switch (mc5->part_type) { + case IDT75P52100: + err = init_idt52100(mc5); + break; + case IDT75N43102: + err = init_idt43102(mc5); + break; + default: + CH_ERR(adap, "Unsupported TCAM type %d\n", mc5->part_type); + err = -EINVAL; + break; + } + + mc5_dbgi_mode_disable(mc5); + return err; +} + + +#define MC5_INT_FATAL (F_PARITYERR | F_REQQPARERR | F_DISPQPARERR) + +/* + * MC5 interrupt handler + */ +void t3_mc5_intr_handler(struct mc5 *mc5) +{ + struct adapter *adap = mc5->adapter; + u32 cause = t3_read_reg(adap, A_MC5_DB_INT_CAUSE); + + if ((cause & F_PARITYERR) && mc5->parity_enabled) { + CH_ALERT(adap, "MC5 parity error\n"); + mc5->stats.parity_err++; + } + + if (cause & F_REQQPARERR) { + CH_ALERT(adap, "MC5 request queue parity error\n"); + mc5->stats.reqq_parity_err++; + } + + if (cause & F_DISPQPARERR) { + CH_ALERT(adap, "MC5 dispatch queue parity error\n"); + 
mc5->stats.dispq_parity_err++; + } + + if (cause & F_ACTRGNFULL) + mc5->stats.active_rgn_full++; + if (cause & F_NFASRCHFAIL) + mc5->stats.nfa_srch_err++; + if (cause & F_UNKNOWNCMD) + mc5->stats.unknown_cmd++; + if (cause & F_DELACTEMPTY) + mc5->stats.del_act_empty++; + if (cause & MC5_INT_FATAL) + t3_fatal_err(adap); + + t3_write_reg(adap, A_MC5_DB_INT_CAUSE, cause); +} + +void t3_mc5_prep(struct adapter *adapter, struct mc5 *mc5, int mode) +{ +#define K * 1024 + + static unsigned int tcam_part_size[] = { /* in K 72-bit entries */ + 64 K, 128 K, 256 K, 32 K + }; + +#undef K + + u32 cfg = t3_read_reg(adapter, A_MC5_DB_CONFIG); + + mc5->adapter = adapter; + mc5->mode = (unsigned char)mode; + mc5->part_type = (unsigned char)G_TMTYPE(cfg); + if (cfg & F_TMTYPEHI) + mc5->part_type |= 4; + + mc5->tcam_size = tcam_part_size[G_TMPARTSIZE(cfg)]; + if (mode == MC5_MODE_144_BIT) + mc5->tcam_size /= 2; +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/regs.h b/drivers/net/ethernet/chelsio/cxgb3/regs.h new file mode 100644 index 0000000..174eb45 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/regs.h @@ -0,0 +1,2564 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define A_SG_CONTROL 0x0 + +#define S_CONGMODE 29 +#define V_CONGMODE(x) ((x) << S_CONGMODE) +#define F_CONGMODE V_CONGMODE(1U) + +#define S_TNLFLMODE 28 +#define V_TNLFLMODE(x) ((x) << S_TNLFLMODE) +#define F_TNLFLMODE V_TNLFLMODE(1U) + +#define S_FATLPERREN 27 +#define V_FATLPERREN(x) ((x) << S_FATLPERREN) +#define F_FATLPERREN V_FATLPERREN(1U) + +#define S_DROPPKT 20 +#define V_DROPPKT(x) ((x) << S_DROPPKT) +#define F_DROPPKT V_DROPPKT(1U) + +#define S_EGRGENCTRL 19 +#define V_EGRGENCTRL(x) ((x) << S_EGRGENCTRL) +#define F_EGRGENCTRL V_EGRGENCTRL(1U) + +#define S_USERSPACESIZE 14 +#define M_USERSPACESIZE 0x1f +#define V_USERSPACESIZE(x) ((x) << S_USERSPACESIZE) + +#define S_HOSTPAGESIZE 11 +#define M_HOSTPAGESIZE 0x7 +#define V_HOSTPAGESIZE(x) ((x) << S_HOSTPAGESIZE) + +#define S_FLMODE 9 +#define V_FLMODE(x) ((x) << S_FLMODE) +#define F_FLMODE V_FLMODE(1U) + +#define S_PKTSHIFT 6 +#define M_PKTSHIFT 0x7 +#define V_PKTSHIFT(x) ((x) << S_PKTSHIFT) + +#define S_ONEINTMULTQ 5 +#define V_ONEINTMULTQ(x) ((x) << S_ONEINTMULTQ) +#define F_ONEINTMULTQ V_ONEINTMULTQ(1U) + +#define S_BIGENDIANINGRESS 2 +#define V_BIGENDIANINGRESS(x) ((x) << S_BIGENDIANINGRESS) +#define F_BIGENDIANINGRESS V_BIGENDIANINGRESS(1U) + +#define S_ISCSICOALESCING 1 +#define V_ISCSICOALESCING(x) ((x) << S_ISCSICOALESCING) +#define F_ISCSICOALESCING V_ISCSICOALESCING(1U) + +#define S_GLOBALENABLE 0 +#define V_GLOBALENABLE(x) ((x) << S_GLOBALENABLE) +#define F_GLOBALENABLE V_GLOBALENABLE(1U) + +#define S_AVOIDCQOVFL 24 +#define V_AVOIDCQOVFL(x) ((x) << S_AVOIDCQOVFL) +#define F_AVOIDCQOVFL V_AVOIDCQOVFL(1U) + +#define S_OPTONEINTMULTQ 23 +#define V_OPTONEINTMULTQ(x) ((x) << S_OPTONEINTMULTQ) +#define F_OPTONEINTMULTQ V_OPTONEINTMULTQ(1U) + +#define S_CQCRDTCTRL 22 +#define V_CQCRDTCTRL(x) ((x) << S_CQCRDTCTRL) +#define F_CQCRDTCTRL V_CQCRDTCTRL(1U) + +#define A_SG_KDOORBELL 0x4 + +#define S_SELEGRCNTX 31 +#define V_SELEGRCNTX(x) ((x) << S_SELEGRCNTX) +#define F_SELEGRCNTX V_SELEGRCNTX(1U) + +#define S_EGRCNTX 0 +#define M_EGRCNTX 0xffff +#define V_EGRCNTX(x) ((x) << S_EGRCNTX) + +#define A_SG_GTS 0x8 + +#define S_RSPQ 29 +#define M_RSPQ 0x7 +#define V_RSPQ(x) ((x) << S_RSPQ) +#define G_RSPQ(x) (((x) >> S_RSPQ) & M_RSPQ) + +#define S_NEWTIMER 16 +#define M_NEWTIMER 0x1fff +#define V_NEWTIMER(x) ((x) << S_NEWTIMER) + +#define S_NEWINDEX 0 +#define M_NEWINDEX 0xffff +#define 
V_NEWINDEX(x) ((x) << S_NEWINDEX) + +#define A_SG_CONTEXT_CMD 0xc + +#define S_CONTEXT_CMD_OPCODE 28 +#define M_CONTEXT_CMD_OPCODE 0xf +#define V_CONTEXT_CMD_OPCODE(x) ((x) << S_CONTEXT_CMD_OPCODE) + +#define S_CONTEXT_CMD_BUSY 27 +#define V_CONTEXT_CMD_BUSY(x) ((x) << S_CONTEXT_CMD_BUSY) +#define F_CONTEXT_CMD_BUSY V_CONTEXT_CMD_BUSY(1U) + +#define S_CQ_CREDIT 20 + +#define M_CQ_CREDIT 0x7f + +#define V_CQ_CREDIT(x) ((x) << S_CQ_CREDIT) + +#define G_CQ_CREDIT(x) (((x) >> S_CQ_CREDIT) & M_CQ_CREDIT) + +#define S_CQ 19 + +#define V_CQ(x) ((x) << S_CQ) +#define F_CQ V_CQ(1U) + +#define S_RESPONSEQ 18 +#define V_RESPONSEQ(x) ((x) << S_RESPONSEQ) +#define F_RESPONSEQ V_RESPONSEQ(1U) + +#define S_EGRESS 17 +#define V_EGRESS(x) ((x) << S_EGRESS) +#define F_EGRESS V_EGRESS(1U) + +#define S_FREELIST 16 +#define V_FREELIST(x) ((x) << S_FREELIST) +#define F_FREELIST V_FREELIST(1U) + +#define S_CONTEXT 0 +#define M_CONTEXT 0xffff +#define V_CONTEXT(x) ((x) << S_CONTEXT) + +#define G_CONTEXT(x) (((x) >> S_CONTEXT) & M_CONTEXT) + +#define A_SG_CONTEXT_DATA0 0x10 + +#define A_SG_CONTEXT_DATA1 0x14 + +#define A_SG_CONTEXT_DATA2 0x18 + +#define A_SG_CONTEXT_DATA3 0x1c + +#define A_SG_CONTEXT_MASK0 0x20 + +#define A_SG_CONTEXT_MASK1 0x24 + +#define A_SG_CONTEXT_MASK2 0x28 + +#define A_SG_CONTEXT_MASK3 0x2c + +#define A_SG_RSPQ_CREDIT_RETURN 0x30 + +#define S_CREDITS 0 +#define M_CREDITS 0xffff +#define V_CREDITS(x) ((x) << S_CREDITS) + +#define A_SG_DATA_INTR 0x34 + +#define S_ERRINTR 31 +#define V_ERRINTR(x) ((x) << S_ERRINTR) +#define F_ERRINTR V_ERRINTR(1U) + +#define A_SG_HI_DRB_HI_THRSH 0x38 + +#define A_SG_HI_DRB_LO_THRSH 0x3c + +#define A_SG_LO_DRB_HI_THRSH 0x40 + +#define A_SG_LO_DRB_LO_THRSH 0x44 + +#define A_SG_RSPQ_FL_STATUS 0x4c + +#define S_RSPQ0DISABLED 8 + +#define S_FL0EMPTY 16 +#define V_FL0EMPTY(x) ((x) << S_FL0EMPTY) +#define F_FL0EMPTY V_FL0EMPTY(1U) + +#define A_SG_EGR_RCQ_DRB_THRSH 0x54 + +#define S_HIRCQDRBTHRSH 16 +#define M_HIRCQDRBTHRSH 0x7ff +#define V_HIRCQDRBTHRSH(x) ((x) << S_HIRCQDRBTHRSH) + +#define S_LORCQDRBTHRSH 0 +#define M_LORCQDRBTHRSH 0x7ff +#define V_LORCQDRBTHRSH(x) ((x) << S_LORCQDRBTHRSH) + +#define A_SG_EGR_CNTX_BADDR 0x58 + +#define A_SG_INT_CAUSE 0x5c + +#define S_HIRCQPARITYERROR 31 +#define V_HIRCQPARITYERROR(x) ((x) << S_HIRCQPARITYERROR) +#define F_HIRCQPARITYERROR V_HIRCQPARITYERROR(1U) + +#define S_LORCQPARITYERROR 30 +#define V_LORCQPARITYERROR(x) ((x) << S_LORCQPARITYERROR) +#define F_LORCQPARITYERROR V_LORCQPARITYERROR(1U) + +#define S_HIDRBPARITYERROR 29 +#define V_HIDRBPARITYERROR(x) ((x) << S_HIDRBPARITYERROR) +#define F_HIDRBPARITYERROR V_HIDRBPARITYERROR(1U) + +#define S_LODRBPARITYERROR 28 +#define V_LODRBPARITYERROR(x) ((x) << S_LODRBPARITYERROR) +#define F_LODRBPARITYERROR V_LODRBPARITYERROR(1U) + +#define S_FLPARITYERROR 22 +#define M_FLPARITYERROR 0x3f +#define V_FLPARITYERROR(x) ((x) << S_FLPARITYERROR) +#define G_FLPARITYERROR(x) (((x) >> S_FLPARITYERROR) & M_FLPARITYERROR) + +#define S_ITPARITYERROR 20 +#define M_ITPARITYERROR 0x3 +#define V_ITPARITYERROR(x) ((x) << S_ITPARITYERROR) +#define G_ITPARITYERROR(x) (((x) >> S_ITPARITYERROR) & M_ITPARITYERROR) + +#define S_IRPARITYERROR 19 +#define V_IRPARITYERROR(x) ((x) << S_IRPARITYERROR) +#define F_IRPARITYERROR V_IRPARITYERROR(1U) + +#define S_RCPARITYERROR 18 +#define V_RCPARITYERROR(x) ((x) << S_RCPARITYERROR) +#define F_RCPARITYERROR V_RCPARITYERROR(1U) + +#define S_OCPARITYERROR 17 +#define V_OCPARITYERROR(x) ((x) << S_OCPARITYERROR) +#define F_OCPARITYERROR V_OCPARITYERROR(1U) + 
+#define S_CPPARITYERROR 16 +#define V_CPPARITYERROR(x) ((x) << S_CPPARITYERROR) +#define F_CPPARITYERROR V_CPPARITYERROR(1U) + +#define S_R_REQ_FRAMINGERROR 15 +#define V_R_REQ_FRAMINGERROR(x) ((x) << S_R_REQ_FRAMINGERROR) +#define F_R_REQ_FRAMINGERROR V_R_REQ_FRAMINGERROR(1U) + +#define S_UC_REQ_FRAMINGERROR 14 +#define V_UC_REQ_FRAMINGERROR(x) ((x) << S_UC_REQ_FRAMINGERROR) +#define F_UC_REQ_FRAMINGERROR V_UC_REQ_FRAMINGERROR(1U) + +#define S_HICTLDRBDROPERR 13 +#define V_HICTLDRBDROPERR(x) ((x) << S_HICTLDRBDROPERR) +#define F_HICTLDRBDROPERR V_HICTLDRBDROPERR(1U) + +#define S_LOCTLDRBDROPERR 12 +#define V_LOCTLDRBDROPERR(x) ((x) << S_LOCTLDRBDROPERR) +#define F_LOCTLDRBDROPERR V_LOCTLDRBDROPERR(1U) + +#define S_HIPIODRBDROPERR 11 +#define V_HIPIODRBDROPERR(x) ((x) << S_HIPIODRBDROPERR) +#define F_HIPIODRBDROPERR V_HIPIODRBDROPERR(1U) + +#define S_LOPIODRBDROPERR 10 +#define V_LOPIODRBDROPERR(x) ((x) << S_LOPIODRBDROPERR) +#define F_LOPIODRBDROPERR V_LOPIODRBDROPERR(1U) + +#define S_HIPRIORITYDBFULL 7 +#define V_HIPRIORITYDBFULL(x) ((x) << S_HIPRIORITYDBFULL) +#define F_HIPRIORITYDBFULL V_HIPRIORITYDBFULL(1U) + +#define S_HIPRIORITYDBEMPTY 6 +#define V_HIPRIORITYDBEMPTY(x) ((x) << S_HIPRIORITYDBEMPTY) +#define F_HIPRIORITYDBEMPTY V_HIPRIORITYDBEMPTY(1U) + +#define S_LOPRIORITYDBFULL 5 +#define V_LOPRIORITYDBFULL(x) ((x) << S_LOPRIORITYDBFULL) +#define F_LOPRIORITYDBFULL V_LOPRIORITYDBFULL(1U) + +#define S_LOPRIORITYDBEMPTY 4 +#define V_LOPRIORITYDBEMPTY(x) ((x) << S_LOPRIORITYDBEMPTY) +#define F_LOPRIORITYDBEMPTY V_LOPRIORITYDBEMPTY(1U) + +#define S_RSPQDISABLED 3 +#define V_RSPQDISABLED(x) ((x) << S_RSPQDISABLED) +#define F_RSPQDISABLED V_RSPQDISABLED(1U) + +#define S_RSPQCREDITOVERFOW 2 +#define V_RSPQCREDITOVERFOW(x) ((x) << S_RSPQCREDITOVERFOW) +#define F_RSPQCREDITOVERFOW V_RSPQCREDITOVERFOW(1U) + +#define S_FLEMPTY 1 +#define V_FLEMPTY(x) ((x) << S_FLEMPTY) +#define F_FLEMPTY V_FLEMPTY(1U) + +#define A_SG_INT_ENABLE 0x60 + +#define A_SG_CMDQ_CREDIT_TH 0x64 + +#define S_TIMEOUT 8 +#define M_TIMEOUT 0xffffff +#define V_TIMEOUT(x) ((x) << S_TIMEOUT) + +#define S_THRESHOLD 0 +#define M_THRESHOLD 0xff +#define V_THRESHOLD(x) ((x) << S_THRESHOLD) + +#define A_SG_TIMER_TICK 0x68 + +#define A_SG_CQ_CONTEXT_BADDR 0x6c + +#define A_SG_OCO_BASE 0x70 + +#define S_BASE1 16 +#define M_BASE1 0xffff +#define V_BASE1(x) ((x) << S_BASE1) + +#define A_SG_DRB_PRI_THRESH 0x74 + +#define A_PCIX_INT_ENABLE 0x80 + +#define S_MSIXPARERR 22 +#define M_MSIXPARERR 0x7 + +#define V_MSIXPARERR(x) ((x) << S_MSIXPARERR) + +#define S_CFPARERR 18 +#define M_CFPARERR 0xf + +#define V_CFPARERR(x) ((x) << S_CFPARERR) + +#define S_RFPARERR 14 +#define M_RFPARERR 0xf + +#define V_RFPARERR(x) ((x) << S_RFPARERR) + +#define S_WFPARERR 12 +#define M_WFPARERR 0x3 + +#define V_WFPARERR(x) ((x) << S_WFPARERR) + +#define S_PIOPARERR 11 +#define V_PIOPARERR(x) ((x) << S_PIOPARERR) +#define F_PIOPARERR V_PIOPARERR(1U) + +#define S_DETUNCECCERR 10 +#define V_DETUNCECCERR(x) ((x) << S_DETUNCECCERR) +#define F_DETUNCECCERR V_DETUNCECCERR(1U) + +#define S_DETCORECCERR 9 +#define V_DETCORECCERR(x) ((x) << S_DETCORECCERR) +#define F_DETCORECCERR V_DETCORECCERR(1U) + +#define S_RCVSPLCMPERR 8 +#define V_RCVSPLCMPERR(x) ((x) << S_RCVSPLCMPERR) +#define F_RCVSPLCMPERR V_RCVSPLCMPERR(1U) + +#define S_UNXSPLCMP 7 +#define V_UNXSPLCMP(x) ((x) << S_UNXSPLCMP) +#define F_UNXSPLCMP V_UNXSPLCMP(1U) + +#define S_SPLCMPDIS 6 +#define V_SPLCMPDIS(x) ((x) << S_SPLCMPDIS) +#define F_SPLCMPDIS V_SPLCMPDIS(1U) + +#define S_DETPARERR 5 +#define 
V_DETPARERR(x) ((x) << S_DETPARERR) +#define F_DETPARERR V_DETPARERR(1U) + +#define S_SIGSYSERR 4 +#define V_SIGSYSERR(x) ((x) << S_SIGSYSERR) +#define F_SIGSYSERR V_SIGSYSERR(1U) + +#define S_RCVMSTABT 3 +#define V_RCVMSTABT(x) ((x) << S_RCVMSTABT) +#define F_RCVMSTABT V_RCVMSTABT(1U) + +#define S_RCVTARABT 2 +#define V_RCVTARABT(x) ((x) << S_RCVTARABT) +#define F_RCVTARABT V_RCVTARABT(1U) + +#define S_SIGTARABT 1 +#define V_SIGTARABT(x) ((x) << S_SIGTARABT) +#define F_SIGTARABT V_SIGTARABT(1U) + +#define S_MSTDETPARERR 0 +#define V_MSTDETPARERR(x) ((x) << S_MSTDETPARERR) +#define F_MSTDETPARERR V_MSTDETPARERR(1U) + +#define A_PCIX_INT_CAUSE 0x84 + +#define A_PCIX_CFG 0x88 + +#define S_DMASTOPEN 19 +#define V_DMASTOPEN(x) ((x) << S_DMASTOPEN) +#define F_DMASTOPEN V_DMASTOPEN(1U) + +#define S_CLIDECEN 18 +#define V_CLIDECEN(x) ((x) << S_CLIDECEN) +#define F_CLIDECEN V_CLIDECEN(1U) + +#define A_PCIX_MODE 0x8c + +#define S_PCLKRANGE 6 +#define M_PCLKRANGE 0x3 +#define V_PCLKRANGE(x) ((x) << S_PCLKRANGE) +#define G_PCLKRANGE(x) (((x) >> S_PCLKRANGE) & M_PCLKRANGE) + +#define S_PCIXINITPAT 2 +#define M_PCIXINITPAT 0xf +#define V_PCIXINITPAT(x) ((x) << S_PCIXINITPAT) +#define G_PCIXINITPAT(x) (((x) >> S_PCIXINITPAT) & M_PCIXINITPAT) + +#define S_64BIT 0 +#define V_64BIT(x) ((x) << S_64BIT) +#define F_64BIT V_64BIT(1U) + +#define A_PCIE_INT_ENABLE 0x80 + +#define S_BISTERR 15 +#define M_BISTERR 0xff + +#define V_BISTERR(x) ((x) << S_BISTERR) + +#define S_TXPARERR 18 +#define V_TXPARERR(x) ((x) << S_TXPARERR) +#define F_TXPARERR V_TXPARERR(1U) + +#define S_RXPARERR 17 +#define V_RXPARERR(x) ((x) << S_RXPARERR) +#define F_RXPARERR V_RXPARERR(1U) + +#define S_RETRYLUTPARERR 16 +#define V_RETRYLUTPARERR(x) ((x) << S_RETRYLUTPARERR) +#define F_RETRYLUTPARERR V_RETRYLUTPARERR(1U) + +#define S_RETRYBUFPARERR 15 +#define V_RETRYBUFPARERR(x) ((x) << S_RETRYBUFPARERR) +#define F_RETRYBUFPARERR V_RETRYBUFPARERR(1U) + +#define S_PCIE_MSIXPARERR 12 +#define M_PCIE_MSIXPARERR 0x7 + +#define V_PCIE_MSIXPARERR(x) ((x) << S_PCIE_MSIXPARERR) + +#define S_PCIE_CFPARERR 11 +#define V_PCIE_CFPARERR(x) ((x) << S_PCIE_CFPARERR) +#define F_PCIE_CFPARERR V_PCIE_CFPARERR(1U) + +#define S_PCIE_RFPARERR 10 +#define V_PCIE_RFPARERR(x) ((x) << S_PCIE_RFPARERR) +#define F_PCIE_RFPARERR V_PCIE_RFPARERR(1U) + +#define S_PCIE_WFPARERR 9 +#define V_PCIE_WFPARERR(x) ((x) << S_PCIE_WFPARERR) +#define F_PCIE_WFPARERR V_PCIE_WFPARERR(1U) + +#define S_PCIE_PIOPARERR 8 +#define V_PCIE_PIOPARERR(x) ((x) << S_PCIE_PIOPARERR) +#define F_PCIE_PIOPARERR V_PCIE_PIOPARERR(1U) + +#define S_UNXSPLCPLERRC 7 +#define V_UNXSPLCPLERRC(x) ((x) << S_UNXSPLCPLERRC) +#define F_UNXSPLCPLERRC V_UNXSPLCPLERRC(1U) + +#define S_UNXSPLCPLERRR 6 +#define V_UNXSPLCPLERRR(x) ((x) << S_UNXSPLCPLERRR) +#define F_UNXSPLCPLERRR V_UNXSPLCPLERRR(1U) + +#define S_PEXERR 0 +#define V_PEXERR(x) ((x) << S_PEXERR) +#define F_PEXERR V_PEXERR(1U) + +#define A_PCIE_INT_CAUSE 0x84 + +#define S_PCIE_DMASTOPEN 24 +#define V_PCIE_DMASTOPEN(x) ((x) << S_PCIE_DMASTOPEN) +#define F_PCIE_DMASTOPEN V_PCIE_DMASTOPEN(1U) + +#define A_PCIE_CFG 0x88 + +#define S_ENABLELINKDWNDRST 21 +#define V_ENABLELINKDWNDRST(x) ((x) << S_ENABLELINKDWNDRST) +#define F_ENABLELINKDWNDRST V_ENABLELINKDWNDRST(1U) + +#define S_ENABLELINKDOWNRST 20 +#define V_ENABLELINKDOWNRST(x) ((x) << S_ENABLELINKDOWNRST) +#define F_ENABLELINKDOWNRST V_ENABLELINKDOWNRST(1U) + +#define S_PCIE_CLIDECEN 16 +#define V_PCIE_CLIDECEN(x) ((x) << S_PCIE_CLIDECEN) +#define F_PCIE_CLIDECEN V_PCIE_CLIDECEN(1U) + +#define 
S_CRSTWRMMODE 0 +#define V_CRSTWRMMODE(x) ((x) << S_CRSTWRMMODE) +#define F_CRSTWRMMODE V_CRSTWRMMODE(1U) + +#define A_PCIE_MODE 0x8c + +#define S_NUMFSTTRNSEQRX 10 +#define M_NUMFSTTRNSEQRX 0xff +#define V_NUMFSTTRNSEQRX(x) ((x) << S_NUMFSTTRNSEQRX) +#define G_NUMFSTTRNSEQRX(x) (((x) >> S_NUMFSTTRNSEQRX) & M_NUMFSTTRNSEQRX) + +#define A_PCIE_PEX_CTRL0 0x98 + +#define S_NUMFSTTRNSEQ 22 +#define M_NUMFSTTRNSEQ 0xff +#define V_NUMFSTTRNSEQ(x) ((x) << S_NUMFSTTRNSEQ) +#define G_NUMFSTTRNSEQ(x) (((x) >> S_NUMFSTTRNSEQ) & M_NUMFSTTRNSEQ) + +#define S_REPLAYLMT 2 +#define M_REPLAYLMT 0xfffff + +#define V_REPLAYLMT(x) ((x) << S_REPLAYLMT) + +#define A_PCIE_PEX_CTRL1 0x9c + +#define S_T3A_ACKLAT 0 +#define M_T3A_ACKLAT 0x7ff + +#define V_T3A_ACKLAT(x) ((x) << S_T3A_ACKLAT) + +#define S_ACKLAT 0 +#define M_ACKLAT 0x1fff + +#define V_ACKLAT(x) ((x) << S_ACKLAT) + +#define A_PCIE_PEX_ERR 0xa4 + +#define A_T3DBG_GPIO_EN 0xd0 + +#define S_GPIO11_OEN 27 +#define V_GPIO11_OEN(x) ((x) << S_GPIO11_OEN) +#define F_GPIO11_OEN V_GPIO11_OEN(1U) + +#define S_GPIO10_OEN 26 +#define V_GPIO10_OEN(x) ((x) << S_GPIO10_OEN) +#define F_GPIO10_OEN V_GPIO10_OEN(1U) + +#define S_GPIO7_OEN 23 +#define V_GPIO7_OEN(x) ((x) << S_GPIO7_OEN) +#define F_GPIO7_OEN V_GPIO7_OEN(1U) + +#define S_GPIO6_OEN 22 +#define V_GPIO6_OEN(x) ((x) << S_GPIO6_OEN) +#define F_GPIO6_OEN V_GPIO6_OEN(1U) + +#define S_GPIO5_OEN 21 +#define V_GPIO5_OEN(x) ((x) << S_GPIO5_OEN) +#define F_GPIO5_OEN V_GPIO5_OEN(1U) + +#define S_GPIO4_OEN 20 +#define V_GPIO4_OEN(x) ((x) << S_GPIO4_OEN) +#define F_GPIO4_OEN V_GPIO4_OEN(1U) + +#define S_GPIO2_OEN 18 +#define V_GPIO2_OEN(x) ((x) << S_GPIO2_OEN) +#define F_GPIO2_OEN V_GPIO2_OEN(1U) + +#define S_GPIO1_OEN 17 +#define V_GPIO1_OEN(x) ((x) << S_GPIO1_OEN) +#define F_GPIO1_OEN V_GPIO1_OEN(1U) + +#define S_GPIO0_OEN 16 +#define V_GPIO0_OEN(x) ((x) << S_GPIO0_OEN) +#define F_GPIO0_OEN V_GPIO0_OEN(1U) + +#define S_GPIO10_OUT_VAL 10 +#define V_GPIO10_OUT_VAL(x) ((x) << S_GPIO10_OUT_VAL) +#define F_GPIO10_OUT_VAL V_GPIO10_OUT_VAL(1U) + +#define S_GPIO7_OUT_VAL 7 +#define V_GPIO7_OUT_VAL(x) ((x) << S_GPIO7_OUT_VAL) +#define F_GPIO7_OUT_VAL V_GPIO7_OUT_VAL(1U) + +#define S_GPIO6_OUT_VAL 6 +#define V_GPIO6_OUT_VAL(x) ((x) << S_GPIO6_OUT_VAL) +#define F_GPIO6_OUT_VAL V_GPIO6_OUT_VAL(1U) + +#define S_GPIO5_OUT_VAL 5 +#define V_GPIO5_OUT_VAL(x) ((x) << S_GPIO5_OUT_VAL) +#define F_GPIO5_OUT_VAL V_GPIO5_OUT_VAL(1U) + +#define S_GPIO4_OUT_VAL 4 +#define V_GPIO4_OUT_VAL(x) ((x) << S_GPIO4_OUT_VAL) +#define F_GPIO4_OUT_VAL V_GPIO4_OUT_VAL(1U) + +#define S_GPIO2_OUT_VAL 2 +#define V_GPIO2_OUT_VAL(x) ((x) << S_GPIO2_OUT_VAL) +#define F_GPIO2_OUT_VAL V_GPIO2_OUT_VAL(1U) + +#define S_GPIO1_OUT_VAL 1 +#define V_GPIO1_OUT_VAL(x) ((x) << S_GPIO1_OUT_VAL) +#define F_GPIO1_OUT_VAL V_GPIO1_OUT_VAL(1U) + +#define S_GPIO0_OUT_VAL 0 +#define V_GPIO0_OUT_VAL(x) ((x) << S_GPIO0_OUT_VAL) +#define F_GPIO0_OUT_VAL V_GPIO0_OUT_VAL(1U) + +#define A_T3DBG_INT_ENABLE 0xd8 + +#define S_GPIO11 11 +#define V_GPIO11(x) ((x) << S_GPIO11) +#define F_GPIO11 V_GPIO11(1U) + +#define S_GPIO10 10 +#define V_GPIO10(x) ((x) << S_GPIO10) +#define F_GPIO10 V_GPIO10(1U) + +#define S_GPIO9 9 +#define V_GPIO9(x) ((x) << S_GPIO9) +#define F_GPIO9 V_GPIO9(1U) + +#define S_GPIO7 7 +#define V_GPIO7(x) ((x) << S_GPIO7) +#define F_GPIO7 V_GPIO7(1U) + +#define S_GPIO6 6 +#define V_GPIO6(x) ((x) << S_GPIO6) +#define F_GPIO6 V_GPIO6(1U) + +#define S_GPIO5 5 +#define V_GPIO5(x) ((x) << S_GPIO5) +#define F_GPIO5 V_GPIO5(1U) + +#define S_GPIO4 4 +#define V_GPIO4(x) ((x) << 
S_GPIO4) +#define F_GPIO4 V_GPIO4(1U) + +#define S_GPIO3 3 +#define V_GPIO3(x) ((x) << S_GPIO3) +#define F_GPIO3 V_GPIO3(1U) + +#define S_GPIO2 2 +#define V_GPIO2(x) ((x) << S_GPIO2) +#define F_GPIO2 V_GPIO2(1U) + +#define S_GPIO1 1 +#define V_GPIO1(x) ((x) << S_GPIO1) +#define F_GPIO1 V_GPIO1(1U) + +#define S_GPIO0 0 +#define V_GPIO0(x) ((x) << S_GPIO0) +#define F_GPIO0 V_GPIO0(1U) + +#define A_T3DBG_INT_CAUSE 0xdc + +#define A_T3DBG_GPIO_ACT_LOW 0xf0 + +#define MC7_PMRX_BASE_ADDR 0x100 + +#define A_MC7_CFG 0x100 + +#define S_IFEN 13 +#define V_IFEN(x) ((x) << S_IFEN) +#define F_IFEN V_IFEN(1U) + +#define S_TERM150 11 +#define V_TERM150(x) ((x) << S_TERM150) +#define F_TERM150 V_TERM150(1U) + +#define S_SLOW 10 +#define V_SLOW(x) ((x) << S_SLOW) +#define F_SLOW V_SLOW(1U) + +#define S_WIDTH 8 +#define M_WIDTH 0x3 +#define V_WIDTH(x) ((x) << S_WIDTH) +#define G_WIDTH(x) (((x) >> S_WIDTH) & M_WIDTH) + +#define S_BKS 6 +#define V_BKS(x) ((x) << S_BKS) +#define F_BKS V_BKS(1U) + +#define S_ORG 5 +#define V_ORG(x) ((x) << S_ORG) +#define F_ORG V_ORG(1U) + +#define S_DEN 2 +#define M_DEN 0x7 +#define V_DEN(x) ((x) << S_DEN) +#define G_DEN(x) (((x) >> S_DEN) & M_DEN) + +#define S_RDY 1 +#define V_RDY(x) ((x) << S_RDY) +#define F_RDY V_RDY(1U) + +#define S_CLKEN 0 +#define V_CLKEN(x) ((x) << S_CLKEN) +#define F_CLKEN V_CLKEN(1U) + +#define A_MC7_MODE 0x104 + +#define S_BUSY 31 +#define V_BUSY(x) ((x) << S_BUSY) +#define F_BUSY V_BUSY(1U) + +#define A_MC7_EXT_MODE1 0x108 + +#define A_MC7_EXT_MODE2 0x10c + +#define A_MC7_EXT_MODE3 0x110 + +#define A_MC7_PRE 0x114 + +#define A_MC7_REF 0x118 + +#define S_PREREFDIV 1 +#define M_PREREFDIV 0x3fff +#define V_PREREFDIV(x) ((x) << S_PREREFDIV) + +#define S_PERREFEN 0 +#define V_PERREFEN(x) ((x) << S_PERREFEN) +#define F_PERREFEN V_PERREFEN(1U) + +#define A_MC7_DLL 0x11c + +#define S_DLLENB 1 +#define V_DLLENB(x) ((x) << S_DLLENB) +#define F_DLLENB V_DLLENB(1U) + +#define S_DLLRST 0 +#define V_DLLRST(x) ((x) << S_DLLRST) +#define F_DLLRST V_DLLRST(1U) + +#define A_MC7_PARM 0x120 + +#define S_ACTTOPREDLY 26 +#define M_ACTTOPREDLY 0xf +#define V_ACTTOPREDLY(x) ((x) << S_ACTTOPREDLY) + +#define S_ACTTORDWRDLY 23 +#define M_ACTTORDWRDLY 0x7 +#define V_ACTTORDWRDLY(x) ((x) << S_ACTTORDWRDLY) + +#define S_PRECYC 20 +#define M_PRECYC 0x7 +#define V_PRECYC(x) ((x) << S_PRECYC) + +#define S_REFCYC 13 +#define M_REFCYC 0x7f +#define V_REFCYC(x) ((x) << S_REFCYC) + +#define S_BKCYC 8 +#define M_BKCYC 0x1f +#define V_BKCYC(x) ((x) << S_BKCYC) + +#define S_WRTORDDLY 4 +#define M_WRTORDDLY 0xf +#define V_WRTORDDLY(x) ((x) << S_WRTORDDLY) + +#define S_RDTOWRDLY 0 +#define M_RDTOWRDLY 0xf +#define V_RDTOWRDLY(x) ((x) << S_RDTOWRDLY) + +#define A_MC7_CAL 0x128 + +#define S_CAL_FAULT 30 +#define V_CAL_FAULT(x) ((x) << S_CAL_FAULT) +#define F_CAL_FAULT V_CAL_FAULT(1U) + +#define S_SGL_CAL_EN 20 +#define V_SGL_CAL_EN(x) ((x) << S_SGL_CAL_EN) +#define F_SGL_CAL_EN V_SGL_CAL_EN(1U) + +#define A_MC7_ERR_ADDR 0x12c + +#define A_MC7_ECC 0x130 + +#define S_ECCCHKEN 1 +#define V_ECCCHKEN(x) ((x) << S_ECCCHKEN) +#define F_ECCCHKEN V_ECCCHKEN(1U) + +#define S_ECCGENEN 0 +#define V_ECCGENEN(x) ((x) << S_ECCGENEN) +#define F_ECCGENEN V_ECCGENEN(1U) + +#define A_MC7_CE_ADDR 0x134 + +#define A_MC7_CE_DATA0 0x138 + +#define A_MC7_CE_DATA1 0x13c + +#define A_MC7_CE_DATA2 0x140 + +#define S_DATA 0 +#define M_DATA 0xff + +#define G_DATA(x) (((x) >> S_DATA) & M_DATA) + +#define A_MC7_UE_ADDR 0x144 + +#define A_MC7_UE_DATA0 0x148 + +#define A_MC7_UE_DATA1 0x14c + +#define A_MC7_UE_DATA2 0x150 + 
+#define A_MC7_BD_ADDR 0x154 + +#define S_ADDR 3 + +#define M_ADDR 0x1fffffff + +#define A_MC7_BD_DATA0 0x158 + +#define A_MC7_BD_DATA1 0x15c + +#define A_MC7_BD_OP 0x164 + +#define S_OP 0 + +#define V_OP(x) ((x) << S_OP) +#define F_OP V_OP(1U) + +#define A_MC7_BIST_ADDR_BEG 0x168 + +#define A_MC7_BIST_ADDR_END 0x16c + +#define A_MC7_BIST_DATA 0x170 + +#define A_MC7_BIST_OP 0x174 + +#define S_CONT 3 +#define V_CONT(x) ((x) << S_CONT) +#define F_CONT V_CONT(1U) + +#define A_MC7_INT_ENABLE 0x178 + +#define S_AE 17 +#define V_AE(x) ((x) << S_AE) +#define F_AE V_AE(1U) + +#define S_PE 2 +#define M_PE 0x7fff + +#define V_PE(x) ((x) << S_PE) + +#define G_PE(x) (((x) >> S_PE) & M_PE) + +#define S_UE 1 +#define V_UE(x) ((x) << S_UE) +#define F_UE V_UE(1U) + +#define S_CE 0 +#define V_CE(x) ((x) << S_CE) +#define F_CE V_CE(1U) + +#define A_MC7_INT_CAUSE 0x17c + +#define MC7_PMTX_BASE_ADDR 0x180 + +#define MC7_CM_BASE_ADDR 0x200 + +#define A_CIM_BOOT_CFG 0x280 + +#define S_BOOTADDR 2 +#define M_BOOTADDR 0x3fffffff +#define V_BOOTADDR(x) ((x) << S_BOOTADDR) + +#define A_CIM_SDRAM_BASE_ADDR 0x28c + +#define A_CIM_SDRAM_ADDR_SIZE 0x290 + +#define A_CIM_HOST_INT_ENABLE 0x298 + +#define S_DTAGPARERR 28 +#define V_DTAGPARERR(x) ((x) << S_DTAGPARERR) +#define F_DTAGPARERR V_DTAGPARERR(1U) + +#define S_ITAGPARERR 27 +#define V_ITAGPARERR(x) ((x) << S_ITAGPARERR) +#define F_ITAGPARERR V_ITAGPARERR(1U) + +#define S_IBQTPPARERR 26 +#define V_IBQTPPARERR(x) ((x) << S_IBQTPPARERR) +#define F_IBQTPPARERR V_IBQTPPARERR(1U) + +#define S_IBQULPPARERR 25 +#define V_IBQULPPARERR(x) ((x) << S_IBQULPPARERR) +#define F_IBQULPPARERR V_IBQULPPARERR(1U) + +#define S_IBQSGEHIPARERR 24 +#define V_IBQSGEHIPARERR(x) ((x) << S_IBQSGEHIPARERR) +#define F_IBQSGEHIPARERR V_IBQSGEHIPARERR(1U) + +#define S_IBQSGELOPARERR 23 +#define V_IBQSGELOPARERR(x) ((x) << S_IBQSGELOPARERR) +#define F_IBQSGELOPARERR V_IBQSGELOPARERR(1U) + +#define S_OBQULPLOPARERR 22 +#define V_OBQULPLOPARERR(x) ((x) << S_OBQULPLOPARERR) +#define F_OBQULPLOPARERR V_OBQULPLOPARERR(1U) + +#define S_OBQULPHIPARERR 21 +#define V_OBQULPHIPARERR(x) ((x) << S_OBQULPHIPARERR) +#define F_OBQULPHIPARERR V_OBQULPHIPARERR(1U) + +#define S_OBQSGEPARERR 20 +#define V_OBQSGEPARERR(x) ((x) << S_OBQSGEPARERR) +#define F_OBQSGEPARERR V_OBQSGEPARERR(1U) + +#define S_DCACHEPARERR 19 +#define V_DCACHEPARERR(x) ((x) << S_DCACHEPARERR) +#define F_DCACHEPARERR V_DCACHEPARERR(1U) + +#define S_ICACHEPARERR 18 +#define V_ICACHEPARERR(x) ((x) << S_ICACHEPARERR) +#define F_ICACHEPARERR V_ICACHEPARERR(1U) + +#define S_DRAMPARERR 17 +#define V_DRAMPARERR(x) ((x) << S_DRAMPARERR) +#define F_DRAMPARERR V_DRAMPARERR(1U) + +#define A_CIM_HOST_INT_CAUSE 0x29c + +#define S_BLKWRPLINT 12 +#define V_BLKWRPLINT(x) ((x) << S_BLKWRPLINT) +#define F_BLKWRPLINT V_BLKWRPLINT(1U) + +#define S_BLKRDPLINT 11 +#define V_BLKRDPLINT(x) ((x) << S_BLKRDPLINT) +#define F_BLKRDPLINT V_BLKRDPLINT(1U) + +#define S_BLKWRCTLINT 10 +#define V_BLKWRCTLINT(x) ((x) << S_BLKWRCTLINT) +#define F_BLKWRCTLINT V_BLKWRCTLINT(1U) + +#define S_BLKRDCTLINT 9 +#define V_BLKRDCTLINT(x) ((x) << S_BLKRDCTLINT) +#define F_BLKRDCTLINT V_BLKRDCTLINT(1U) + +#define S_BLKWRFLASHINT 8 +#define V_BLKWRFLASHINT(x) ((x) << S_BLKWRFLASHINT) +#define F_BLKWRFLASHINT V_BLKWRFLASHINT(1U) + +#define S_BLKRDFLASHINT 7 +#define V_BLKRDFLASHINT(x) ((x) << S_BLKRDFLASHINT) +#define F_BLKRDFLASHINT V_BLKRDFLASHINT(1U) + +#define S_SGLWRFLASHINT 6 +#define V_SGLWRFLASHINT(x) ((x) << S_SGLWRFLASHINT) +#define F_SGLWRFLASHINT V_SGLWRFLASHINT(1U) + +#define 
S_WRBLKFLASHINT 5 +#define V_WRBLKFLASHINT(x) ((x) << S_WRBLKFLASHINT) +#define F_WRBLKFLASHINT V_WRBLKFLASHINT(1U) + +#define S_BLKWRBOOTINT 4 +#define V_BLKWRBOOTINT(x) ((x) << S_BLKWRBOOTINT) +#define F_BLKWRBOOTINT V_BLKWRBOOTINT(1U) + +#define S_FLASHRANGEINT 2 +#define V_FLASHRANGEINT(x) ((x) << S_FLASHRANGEINT) +#define F_FLASHRANGEINT V_FLASHRANGEINT(1U) + +#define S_SDRAMRANGEINT 1 +#define V_SDRAMRANGEINT(x) ((x) << S_SDRAMRANGEINT) +#define F_SDRAMRANGEINT V_SDRAMRANGEINT(1U) + +#define S_RSVDSPACEINT 0 +#define V_RSVDSPACEINT(x) ((x) << S_RSVDSPACEINT) +#define F_RSVDSPACEINT V_RSVDSPACEINT(1U) + +#define A_CIM_HOST_ACC_CTRL 0x2b0 + +#define S_HOSTBUSY 17 +#define V_HOSTBUSY(x) ((x) << S_HOSTBUSY) +#define F_HOSTBUSY V_HOSTBUSY(1U) + +#define A_CIM_HOST_ACC_DATA 0x2b4 + +#define A_CIM_IBQ_DBG_CFG 0x2c0 + +#define S_IBQDBGADDR 16 +#define M_IBQDBGADDR 0x1ff +#define V_IBQDBGADDR(x) ((x) << S_IBQDBGADDR) +#define G_IBQDBGADDR(x) (((x) >> S_IBQDBGADDR) & M_IBQDBGADDR) + +#define S_IBQDBGQID 3 +#define M_IBQDBGQID 0x3 +#define V_IBQDBGQID(x) ((x) << S_IBQDBGQID) +#define G_IBQDBGQID(x) (((x) >> S_IBQDBGQID) & M_IBQDBGQID) + +#define S_IBQDBGWR 2 +#define V_IBQDBGWR(x) ((x) << S_IBQDBGWR) +#define F_IBQDBGWR V_IBQDBGWR(1U) + +#define S_IBQDBGBUSY 1 +#define V_IBQDBGBUSY(x) ((x) << S_IBQDBGBUSY) +#define F_IBQDBGBUSY V_IBQDBGBUSY(1U) + +#define S_IBQDBGEN 0 +#define V_IBQDBGEN(x) ((x) << S_IBQDBGEN) +#define F_IBQDBGEN V_IBQDBGEN(1U) + +#define A_CIM_IBQ_DBG_DATA 0x2c8 + +#define A_TP_IN_CONFIG 0x300 + +#define S_RXFBARBPRIO 25 +#define V_RXFBARBPRIO(x) ((x) << S_RXFBARBPRIO) +#define F_RXFBARBPRIO V_RXFBARBPRIO(1U) + +#define S_TXFBARBPRIO 24 +#define V_TXFBARBPRIO(x) ((x) << S_TXFBARBPRIO) +#define F_TXFBARBPRIO V_TXFBARBPRIO(1U) + +#define S_NICMODE 14 +#define V_NICMODE(x) ((x) << S_NICMODE) +#define F_NICMODE V_NICMODE(1U) + +#define S_IPV6ENABLE 15 +#define V_IPV6ENABLE(x) ((x) << S_IPV6ENABLE) +#define F_IPV6ENABLE V_IPV6ENABLE(1U) + +#define A_TP_OUT_CONFIG 0x304 + +#define S_VLANEXTRACTIONENABLE 12 + +#define A_TP_GLOBAL_CONFIG 0x308 + +#define S_TXPACINGENABLE 24 +#define V_TXPACINGENABLE(x) ((x) << S_TXPACINGENABLE) +#define F_TXPACINGENABLE V_TXPACINGENABLE(1U) + +#define S_PATHMTU 15 +#define V_PATHMTU(x) ((x) << S_PATHMTU) +#define F_PATHMTU V_PATHMTU(1U) + +#define S_IPCHECKSUMOFFLOAD 13 +#define V_IPCHECKSUMOFFLOAD(x) ((x) << S_IPCHECKSUMOFFLOAD) +#define F_IPCHECKSUMOFFLOAD V_IPCHECKSUMOFFLOAD(1U) + +#define S_UDPCHECKSUMOFFLOAD 12 +#define V_UDPCHECKSUMOFFLOAD(x) ((x) << S_UDPCHECKSUMOFFLOAD) +#define F_UDPCHECKSUMOFFLOAD V_UDPCHECKSUMOFFLOAD(1U) + +#define S_TCPCHECKSUMOFFLOAD 11 +#define V_TCPCHECKSUMOFFLOAD(x) ((x) << S_TCPCHECKSUMOFFLOAD) +#define F_TCPCHECKSUMOFFLOAD V_TCPCHECKSUMOFFLOAD(1U) + +#define S_IPTTL 0 +#define M_IPTTL 0xff +#define V_IPTTL(x) ((x) << S_IPTTL) + +#define A_TP_CMM_MM_BASE 0x314 + +#define A_TP_CMM_TIMER_BASE 0x318 + +#define S_CMTIMERMAXNUM 28 +#define M_CMTIMERMAXNUM 0x3 +#define V_CMTIMERMAXNUM(x) ((x) << S_CMTIMERMAXNUM) + +#define A_TP_PMM_SIZE 0x31c + +#define A_TP_PMM_TX_BASE 0x320 + +#define A_TP_PMM_RX_BASE 0x328 + +#define A_TP_PMM_RX_PAGE_SIZE 0x32c + +#define A_TP_PMM_RX_MAX_PAGE 0x330 + +#define A_TP_PMM_TX_PAGE_SIZE 0x334 + +#define A_TP_PMM_TX_MAX_PAGE 0x338 + +#define A_TP_TCP_OPTIONS 0x340 + +#define S_MTUDEFAULT 16 +#define M_MTUDEFAULT 0xffff +#define V_MTUDEFAULT(x) ((x) << S_MTUDEFAULT) + +#define S_MTUENABLE 10 +#define V_MTUENABLE(x) ((x) << S_MTUENABLE) +#define F_MTUENABLE V_MTUENABLE(1U) + +#define S_SACKRX 8 
+#define V_SACKRX(x) ((x) << S_SACKRX) +#define F_SACKRX V_SACKRX(1U) + +#define S_SACKMODE 4 + +#define M_SACKMODE 0x3 + +#define V_SACKMODE(x) ((x) << S_SACKMODE) + +#define S_WINDOWSCALEMODE 2 +#define M_WINDOWSCALEMODE 0x3 +#define V_WINDOWSCALEMODE(x) ((x) << S_WINDOWSCALEMODE) + +#define S_TIMESTAMPSMODE 0 + +#define M_TIMESTAMPSMODE 0x3 + +#define V_TIMESTAMPSMODE(x) ((x) << S_TIMESTAMPSMODE) + +#define A_TP_DACK_CONFIG 0x344 + +#define S_AUTOSTATE3 30 +#define M_AUTOSTATE3 0x3 +#define V_AUTOSTATE3(x) ((x) << S_AUTOSTATE3) + +#define S_AUTOSTATE2 28 +#define M_AUTOSTATE2 0x3 +#define V_AUTOSTATE2(x) ((x) << S_AUTOSTATE2) + +#define S_AUTOSTATE1 26 +#define M_AUTOSTATE1 0x3 +#define V_AUTOSTATE1(x) ((x) << S_AUTOSTATE1) + +#define S_BYTETHRESHOLD 5 +#define M_BYTETHRESHOLD 0xfffff +#define V_BYTETHRESHOLD(x) ((x) << S_BYTETHRESHOLD) + +#define S_MSSTHRESHOLD 3 +#define M_MSSTHRESHOLD 0x3 +#define V_MSSTHRESHOLD(x) ((x) << S_MSSTHRESHOLD) + +#define S_AUTOCAREFUL 2 +#define V_AUTOCAREFUL(x) ((x) << S_AUTOCAREFUL) +#define F_AUTOCAREFUL V_AUTOCAREFUL(1U) + +#define S_AUTOENABLE 1 +#define V_AUTOENABLE(x) ((x) << S_AUTOENABLE) +#define F_AUTOENABLE V_AUTOENABLE(1U) + +#define S_DACK_MODE 0 +#define V_DACK_MODE(x) ((x) << S_DACK_MODE) +#define F_DACK_MODE V_DACK_MODE(1U) + +#define A_TP_PC_CONFIG 0x348 + +#define S_TXTOSQUEUEMAPMODE 26 +#define V_TXTOSQUEUEMAPMODE(x) ((x) << S_TXTOSQUEUEMAPMODE) +#define F_TXTOSQUEUEMAPMODE V_TXTOSQUEUEMAPMODE(1U) + +#define S_ENABLEEPCMDAFULL 23 +#define V_ENABLEEPCMDAFULL(x) ((x) << S_ENABLEEPCMDAFULL) +#define F_ENABLEEPCMDAFULL V_ENABLEEPCMDAFULL(1U) + +#define S_MODULATEUNIONMODE 22 +#define V_MODULATEUNIONMODE(x) ((x) << S_MODULATEUNIONMODE) +#define F_MODULATEUNIONMODE V_MODULATEUNIONMODE(1U) + +#define S_TXDEFERENABLE 20 +#define V_TXDEFERENABLE(x) ((x) << S_TXDEFERENABLE) +#define F_TXDEFERENABLE V_TXDEFERENABLE(1U) + +#define S_RXCONGESTIONMODE 19 +#define V_RXCONGESTIONMODE(x) ((x) << S_RXCONGESTIONMODE) +#define F_RXCONGESTIONMODE V_RXCONGESTIONMODE(1U) + +#define S_HEARBEATDACK 16 +#define V_HEARBEATDACK(x) ((x) << S_HEARBEATDACK) +#define F_HEARBEATDACK V_HEARBEATDACK(1U) + +#define S_TXCONGESTIONMODE 15 +#define V_TXCONGESTIONMODE(x) ((x) << S_TXCONGESTIONMODE) +#define F_TXCONGESTIONMODE V_TXCONGESTIONMODE(1U) + +#define S_ENABLEOCSPIFULL 30 +#define V_ENABLEOCSPIFULL(x) ((x) << S_ENABLEOCSPIFULL) +#define F_ENABLEOCSPIFULL V_ENABLEOCSPIFULL(1U) + +#define S_LOCKTID 28 +#define V_LOCKTID(x) ((x) << S_LOCKTID) +#define F_LOCKTID V_LOCKTID(1U) + +#define S_TABLELATENCYDELTA 0 +#define M_TABLELATENCYDELTA 0xf +#define V_TABLELATENCYDELTA(x) ((x) << S_TABLELATENCYDELTA) +#define G_TABLELATENCYDELTA(x) \ + (((x) >> S_TABLELATENCYDELTA) & M_TABLELATENCYDELTA) + +#define A_TP_PC_CONFIG2 0x34c + +#define S_DISBLEDAPARBIT0 15 +#define V_DISBLEDAPARBIT0(x) ((x) << S_DISBLEDAPARBIT0) +#define F_DISBLEDAPARBIT0 V_DISBLEDAPARBIT0(1U) + +#define S_ENABLEARPMISS 13 +#define V_ENABLEARPMISS(x) ((x) << S_ENABLEARPMISS) +#define F_ENABLEARPMISS V_ENABLEARPMISS(1U) + +#define S_ENABLENONOFDTNLSYN 12 +#define V_ENABLENONOFDTNLSYN(x) ((x) << S_ENABLENONOFDTNLSYN) +#define F_ENABLENONOFDTNLSYN V_ENABLENONOFDTNLSYN(1U) + +#define S_ENABLEIPV6RSS 11 +#define V_ENABLEIPV6RSS(x) ((x) << S_ENABLEIPV6RSS) +#define F_ENABLEIPV6RSS V_ENABLEIPV6RSS(1U) + +#define S_CHDRAFULL 4 +#define V_CHDRAFULL(x) ((x) << S_CHDRAFULL) +#define F_CHDRAFULL V_CHDRAFULL(1U) + +#define A_TP_TCP_BACKOFF_REG0 0x350 + +#define A_TP_TCP_BACKOFF_REG1 0x354 + +#define A_TP_TCP_BACKOFF_REG2 
0x358 + +#define A_TP_TCP_BACKOFF_REG3 0x35c + +#define A_TP_PARA_REG2 0x368 + +#define S_MAXRXDATA 16 +#define M_MAXRXDATA 0xffff +#define V_MAXRXDATA(x) ((x) << S_MAXRXDATA) + +#define S_RXCOALESCESIZE 0 +#define M_RXCOALESCESIZE 0xffff +#define V_RXCOALESCESIZE(x) ((x) << S_RXCOALESCESIZE) + +#define A_TP_PARA_REG3 0x36c + +#define S_TXDATAACKIDX 16 +#define M_TXDATAACKIDX 0xf + +#define V_TXDATAACKIDX(x) ((x) << S_TXDATAACKIDX) + +#define S_TXPACEAUTOSTRICT 10 +#define V_TXPACEAUTOSTRICT(x) ((x) << S_TXPACEAUTOSTRICT) +#define F_TXPACEAUTOSTRICT V_TXPACEAUTOSTRICT(1U) + +#define S_TXPACEFIXED 9 +#define V_TXPACEFIXED(x) ((x) << S_TXPACEFIXED) +#define F_TXPACEFIXED V_TXPACEFIXED(1U) + +#define S_TXPACEAUTO 8 +#define V_TXPACEAUTO(x) ((x) << S_TXPACEAUTO) +#define F_TXPACEAUTO V_TXPACEAUTO(1U) + +#define S_RXCOALESCEENABLE 1 +#define V_RXCOALESCEENABLE(x) ((x) << S_RXCOALESCEENABLE) +#define F_RXCOALESCEENABLE V_RXCOALESCEENABLE(1U) + +#define S_RXCOALESCEPSHEN 0 +#define V_RXCOALESCEPSHEN(x) ((x) << S_RXCOALESCEPSHEN) +#define F_RXCOALESCEPSHEN V_RXCOALESCEPSHEN(1U) + +#define A_TP_PARA_REG4 0x370 + +#define A_TP_PARA_REG5 0x374 + +#define S_RXDDPOFFINIT 3 +#define V_RXDDPOFFINIT(x) ((x) << S_RXDDPOFFINIT) +#define F_RXDDPOFFINIT V_RXDDPOFFINIT(1U) + +#define A_TP_PARA_REG6 0x378 + +#define S_T3A_ENABLEESND 13 +#define V_T3A_ENABLEESND(x) ((x) << S_T3A_ENABLEESND) +#define F_T3A_ENABLEESND V_T3A_ENABLEESND(1U) + +#define S_ENABLEESND 11 +#define V_ENABLEESND(x) ((x) << S_ENABLEESND) +#define F_ENABLEESND V_ENABLEESND(1U) + +#define A_TP_PARA_REG7 0x37c + +#define S_PMMAXXFERLEN1 16 +#define M_PMMAXXFERLEN1 0xffff +#define V_PMMAXXFERLEN1(x) ((x) << S_PMMAXXFERLEN1) + +#define S_PMMAXXFERLEN0 0 +#define M_PMMAXXFERLEN0 0xffff +#define V_PMMAXXFERLEN0(x) ((x) << S_PMMAXXFERLEN0) + +#define A_TP_TIMER_RESOLUTION 0x390 + +#define S_TIMERRESOLUTION 16 +#define M_TIMERRESOLUTION 0xff +#define V_TIMERRESOLUTION(x) ((x) << S_TIMERRESOLUTION) + +#define S_TIMESTAMPRESOLUTION 8 +#define M_TIMESTAMPRESOLUTION 0xff +#define V_TIMESTAMPRESOLUTION(x) ((x) << S_TIMESTAMPRESOLUTION) + +#define S_DELAYEDACKRESOLUTION 0 +#define M_DELAYEDACKRESOLUTION 0xff +#define V_DELAYEDACKRESOLUTION(x) ((x) << S_DELAYEDACKRESOLUTION) + +#define A_TP_MSL 0x394 + +#define A_TP_RXT_MIN 0x398 + +#define A_TP_RXT_MAX 0x39c + +#define A_TP_PERS_MIN 0x3a0 + +#define A_TP_PERS_MAX 0x3a4 + +#define A_TP_KEEP_IDLE 0x3a8 + +#define A_TP_KEEP_INTVL 0x3ac + +#define A_TP_INIT_SRTT 0x3b0 + +#define A_TP_DACK_TIMER 0x3b4 + +#define A_TP_FINWAIT2_TIMER 0x3b8 + +#define A_TP_SHIFT_CNT 0x3c0 + +#define S_SYNSHIFTMAX 24 + +#define M_SYNSHIFTMAX 0xff + +#define V_SYNSHIFTMAX(x) ((x) << S_SYNSHIFTMAX) + +#define S_RXTSHIFTMAXR1 20 + +#define M_RXTSHIFTMAXR1 0xf + +#define V_RXTSHIFTMAXR1(x) ((x) << S_RXTSHIFTMAXR1) + +#define S_RXTSHIFTMAXR2 16 + +#define M_RXTSHIFTMAXR2 0xf + +#define V_RXTSHIFTMAXR2(x) ((x) << S_RXTSHIFTMAXR2) + +#define S_PERSHIFTBACKOFFMAX 12 +#define M_PERSHIFTBACKOFFMAX 0xf +#define V_PERSHIFTBACKOFFMAX(x) ((x) << S_PERSHIFTBACKOFFMAX) + +#define S_PERSHIFTMAX 8 +#define M_PERSHIFTMAX 0xf +#define V_PERSHIFTMAX(x) ((x) << S_PERSHIFTMAX) + +#define S_KEEPALIVEMAX 0 + +#define M_KEEPALIVEMAX 0xff + +#define V_KEEPALIVEMAX(x) ((x) << S_KEEPALIVEMAX) + +#define A_TP_MTU_PORT_TABLE 0x3d0 + +#define A_TP_CCTRL_TABLE 0x3dc + +#define A_TP_MTU_TABLE 0x3e4 + +#define A_TP_RSS_MAP_TABLE 0x3e8 + +#define A_TP_RSS_LKP_TABLE 0x3ec + +#define A_TP_RSS_CONFIG 0x3f0 + +#define S_TNL4TUPEN 29 +#define V_TNL4TUPEN(x) ((x) << 
S_TNL4TUPEN) +#define F_TNL4TUPEN V_TNL4TUPEN(1U) + +#define S_TNL2TUPEN 28 +#define V_TNL2TUPEN(x) ((x) << S_TNL2TUPEN) +#define F_TNL2TUPEN V_TNL2TUPEN(1U) + +#define S_TNLPRTEN 26 +#define V_TNLPRTEN(x) ((x) << S_TNLPRTEN) +#define F_TNLPRTEN V_TNLPRTEN(1U) + +#define S_TNLMAPEN 25 +#define V_TNLMAPEN(x) ((x) << S_TNLMAPEN) +#define F_TNLMAPEN V_TNLMAPEN(1U) + +#define S_TNLLKPEN 24 +#define V_TNLLKPEN(x) ((x) << S_TNLLKPEN) +#define F_TNLLKPEN V_TNLLKPEN(1U) + +#define S_RRCPLMAPEN 7 +#define V_RRCPLMAPEN(x) ((x) << S_RRCPLMAPEN) +#define F_RRCPLMAPEN V_RRCPLMAPEN(1U) + +#define S_RRCPLCPUSIZE 4 +#define M_RRCPLCPUSIZE 0x7 +#define V_RRCPLCPUSIZE(x) ((x) << S_RRCPLCPUSIZE) + +#define S_RQFEEDBACKENABLE 3 +#define V_RQFEEDBACKENABLE(x) ((x) << S_RQFEEDBACKENABLE) +#define F_RQFEEDBACKENABLE V_RQFEEDBACKENABLE(1U) + +#define S_HASHTOEPLITZ 2 +#define V_HASHTOEPLITZ(x) ((x) << S_HASHTOEPLITZ) +#define F_HASHTOEPLITZ V_HASHTOEPLITZ(1U) + +#define S_DISABLE 0 + +#define A_TP_TM_PIO_ADDR 0x418 + +#define A_TP_TM_PIO_DATA 0x41c + +#define A_TP_TX_MOD_QUE_TABLE 0x420 + +#define A_TP_TX_RESOURCE_LIMIT 0x424 + +#define A_TP_TX_MOD_QUEUE_REQ_MAP 0x428 + +#define S_TX_MOD_QUEUE_REQ_MAP 0 +#define M_TX_MOD_QUEUE_REQ_MAP 0xff +#define V_TX_MOD_QUEUE_REQ_MAP(x) ((x) << S_TX_MOD_QUEUE_REQ_MAP) + +#define A_TP_TX_MOD_QUEUE_WEIGHT1 0x42c + +#define A_TP_TX_MOD_QUEUE_WEIGHT0 0x430 + +#define A_TP_MOD_CHANNEL_WEIGHT 0x434 + +#define A_TP_MOD_RATE_LIMIT 0x438 + +#define A_TP_PIO_ADDR 0x440 + +#define A_TP_PIO_DATA 0x444 + +#define A_TP_RESET 0x44c + +#define S_FLSTINITENABLE 1 +#define V_FLSTINITENABLE(x) ((x) << S_FLSTINITENABLE) +#define F_FLSTINITENABLE V_FLSTINITENABLE(1U) + +#define S_TPRESET 0 +#define V_TPRESET(x) ((x) << S_TPRESET) +#define F_TPRESET V_TPRESET(1U) + +#define A_TP_CMM_MM_RX_FLST_BASE 0x460 + +#define A_TP_CMM_MM_TX_FLST_BASE 0x464 + +#define A_TP_CMM_MM_PS_FLST_BASE 0x468 + +#define A_TP_MIB_INDEX 0x450 + +#define A_TP_MIB_RDATA 0x454 + +#define A_TP_CMM_MM_MAX_PSTRUCT 0x46c + +#define A_TP_INT_ENABLE 0x470 + +#define S_FLMTXFLSTEMPTY 30 +#define V_FLMTXFLSTEMPTY(x) ((x) << S_FLMTXFLSTEMPTY) +#define F_FLMTXFLSTEMPTY V_FLMTXFLSTEMPTY(1U) + +#define S_FLMRXFLSTEMPTY 29 +#define V_FLMRXFLSTEMPTY(x) ((x) << S_FLMRXFLSTEMPTY) +#define F_FLMRXFLSTEMPTY V_FLMRXFLSTEMPTY(1U) + +#define S_ARPLUTPERR 26 +#define V_ARPLUTPERR(x) ((x) << S_ARPLUTPERR) +#define F_ARPLUTPERR V_ARPLUTPERR(1U) + +#define S_CMCACHEPERR 24 +#define V_CMCACHEPERR(x) ((x) << S_CMCACHEPERR) +#define F_CMCACHEPERR V_CMCACHEPERR(1U) + +#define A_TP_INT_CAUSE 0x474 + +#define A_TP_TX_MOD_Q1_Q0_RATE_LIMIT 0x8 + +#define A_TP_TX_DROP_CFG_CH0 0x12b + +#define A_TP_TX_DROP_MODE 0x12f + +#define A_TP_EGRESS_CONFIG 0x145 + +#define S_REWRITEFORCETOSIZE 0 +#define V_REWRITEFORCETOSIZE(x) ((x) << S_REWRITEFORCETOSIZE) +#define F_REWRITEFORCETOSIZE V_REWRITEFORCETOSIZE(1U) + +#define A_TP_TX_TRC_KEY0 0x20 + +#define A_TP_RX_TRC_KEY0 0x120 + +#define A_TP_TX_DROP_CNT_CH0 0x12d + +#define S_TXDROPCNTCH0RCVD 0 +#define M_TXDROPCNTCH0RCVD 0xffff +#define V_TXDROPCNTCH0RCVD(x) ((x) << S_TXDROPCNTCH0RCVD) +#define G_TXDROPCNTCH0RCVD(x) (((x) >> S_TXDROPCNTCH0RCVD) & \ + M_TXDROPCNTCH0RCVD) + +#define A_TP_PROXY_FLOW_CNTL 0x4b0 + +#define A_TP_EMBED_OP_FIELD0 0x4e8 +#define A_TP_EMBED_OP_FIELD1 0x4ec +#define A_TP_EMBED_OP_FIELD2 0x4f0 +#define A_TP_EMBED_OP_FIELD3 0x4f4 +#define A_TP_EMBED_OP_FIELD4 0x4f8 +#define A_TP_EMBED_OP_FIELD5 0x4fc + +#define A_ULPRX_CTL 0x500 + +#define S_ROUND_ROBIN 4 +#define V_ROUND_ROBIN(x) ((x) << 
S_ROUND_ROBIN) +#define F_ROUND_ROBIN V_ROUND_ROBIN(1U) + +#define A_ULPRX_INT_ENABLE 0x504 + +#define S_DATASELFRAMEERR0 7 +#define V_DATASELFRAMEERR0(x) ((x) << S_DATASELFRAMEERR0) +#define F_DATASELFRAMEERR0 V_DATASELFRAMEERR0(1U) + +#define S_DATASELFRAMEERR1 6 +#define V_DATASELFRAMEERR1(x) ((x) << S_DATASELFRAMEERR1) +#define F_DATASELFRAMEERR1 V_DATASELFRAMEERR1(1U) + +#define S_PCMDMUXPERR 5 +#define V_PCMDMUXPERR(x) ((x) << S_PCMDMUXPERR) +#define F_PCMDMUXPERR V_PCMDMUXPERR(1U) + +#define S_ARBFPERR 4 +#define V_ARBFPERR(x) ((x) << S_ARBFPERR) +#define F_ARBFPERR V_ARBFPERR(1U) + +#define S_ARBPF0PERR 3 +#define V_ARBPF0PERR(x) ((x) << S_ARBPF0PERR) +#define F_ARBPF0PERR V_ARBPF0PERR(1U) + +#define S_ARBPF1PERR 2 +#define V_ARBPF1PERR(x) ((x) << S_ARBPF1PERR) +#define F_ARBPF1PERR V_ARBPF1PERR(1U) + +#define S_PARERRPCMD 1 +#define V_PARERRPCMD(x) ((x) << S_PARERRPCMD) +#define F_PARERRPCMD V_PARERRPCMD(1U) + +#define S_PARERRDATA 0 +#define V_PARERRDATA(x) ((x) << S_PARERRDATA) +#define F_PARERRDATA V_PARERRDATA(1U) + +#define A_ULPRX_INT_CAUSE 0x508 + +#define A_ULPRX_ISCSI_LLIMIT 0x50c + +#define A_ULPRX_ISCSI_ULIMIT 0x510 + +#define A_ULPRX_ISCSI_TAGMASK 0x514 + +#define A_ULPRX_ISCSI_PSZ 0x518 + +#define A_ULPRX_TDDP_LLIMIT 0x51c + +#define A_ULPRX_TDDP_ULIMIT 0x520 +#define A_ULPRX_TDDP_PSZ 0x528 + +#define S_HPZ0 0 +#define M_HPZ0 0xf +#define V_HPZ0(x) ((x) << S_HPZ0) +#define G_HPZ0(x) (((x) >> S_HPZ0) & M_HPZ0) + +#define A_ULPRX_STAG_LLIMIT 0x52c + +#define A_ULPRX_STAG_ULIMIT 0x530 + +#define A_ULPRX_RQ_LLIMIT 0x534 + +#define A_ULPRX_RQ_ULIMIT 0x538 + +#define A_ULPRX_PBL_LLIMIT 0x53c + +#define A_ULPRX_PBL_ULIMIT 0x540 + +#define A_ULPRX_TDDP_TAGMASK 0x524 + +#define A_ULPTX_CONFIG 0x580 + +#define S_CFG_CQE_SOP_MASK 1 +#define V_CFG_CQE_SOP_MASK(x) ((x) << S_CFG_CQE_SOP_MASK) +#define F_CFG_CQE_SOP_MASK V_CFG_CQE_SOP_MASK(1U) + +#define S_CFG_RR_ARB 0 +#define V_CFG_RR_ARB(x) ((x) << S_CFG_RR_ARB) +#define F_CFG_RR_ARB V_CFG_RR_ARB(1U) + +#define A_ULPTX_INT_ENABLE 0x584 + +#define S_PBL_BOUND_ERR_CH1 1 +#define V_PBL_BOUND_ERR_CH1(x) ((x) << S_PBL_BOUND_ERR_CH1) +#define F_PBL_BOUND_ERR_CH1 V_PBL_BOUND_ERR_CH1(1U) + +#define S_PBL_BOUND_ERR_CH0 0 +#define V_PBL_BOUND_ERR_CH0(x) ((x) << S_PBL_BOUND_ERR_CH0) +#define F_PBL_BOUND_ERR_CH0 V_PBL_BOUND_ERR_CH0(1U) + +#define A_ULPTX_INT_CAUSE 0x588 + +#define A_ULPTX_TPT_LLIMIT 0x58c + +#define A_ULPTX_TPT_ULIMIT 0x590 + +#define A_ULPTX_PBL_LLIMIT 0x594 + +#define A_ULPTX_PBL_ULIMIT 0x598 + +#define A_ULPTX_DMA_WEIGHT 0x5ac + +#define S_D1_WEIGHT 16 +#define M_D1_WEIGHT 0xffff +#define V_D1_WEIGHT(x) ((x) << S_D1_WEIGHT) + +#define S_D0_WEIGHT 0 +#define M_D0_WEIGHT 0xffff +#define V_D0_WEIGHT(x) ((x) << S_D0_WEIGHT) + +#define A_PM1_RX_CFG 0x5c0 +#define A_PM1_RX_MODE 0x5c4 + +#define A_PM1_RX_INT_ENABLE 0x5d8 + +#define S_ZERO_E_CMD_ERROR 18 +#define V_ZERO_E_CMD_ERROR(x) ((x) << S_ZERO_E_CMD_ERROR) +#define F_ZERO_E_CMD_ERROR V_ZERO_E_CMD_ERROR(1U) + +#define S_IESPI0_FIFO2X_RX_FRAMING_ERROR 17 +#define V_IESPI0_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_IESPI0_FIFO2X_RX_FRAMING_ERROR) +#define F_IESPI0_FIFO2X_RX_FRAMING_ERROR V_IESPI0_FIFO2X_RX_FRAMING_ERROR(1U) + +#define S_IESPI1_FIFO2X_RX_FRAMING_ERROR 16 +#define V_IESPI1_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_IESPI1_FIFO2X_RX_FRAMING_ERROR) +#define F_IESPI1_FIFO2X_RX_FRAMING_ERROR V_IESPI1_FIFO2X_RX_FRAMING_ERROR(1U) + +#define S_IESPI0_RX_FRAMING_ERROR 15 +#define V_IESPI0_RX_FRAMING_ERROR(x) ((x) << S_IESPI0_RX_FRAMING_ERROR) +#define F_IESPI0_RX_FRAMING_ERROR 
V_IESPI0_RX_FRAMING_ERROR(1U) + +#define S_IESPI1_RX_FRAMING_ERROR 14 +#define V_IESPI1_RX_FRAMING_ERROR(x) ((x) << S_IESPI1_RX_FRAMING_ERROR) +#define F_IESPI1_RX_FRAMING_ERROR V_IESPI1_RX_FRAMING_ERROR(1U) + +#define S_IESPI0_TX_FRAMING_ERROR 13 +#define V_IESPI0_TX_FRAMING_ERROR(x) ((x) << S_IESPI0_TX_FRAMING_ERROR) +#define F_IESPI0_TX_FRAMING_ERROR V_IESPI0_TX_FRAMING_ERROR(1U) + +#define S_IESPI1_TX_FRAMING_ERROR 12 +#define V_IESPI1_TX_FRAMING_ERROR(x) ((x) << S_IESPI1_TX_FRAMING_ERROR) +#define F_IESPI1_TX_FRAMING_ERROR V_IESPI1_TX_FRAMING_ERROR(1U) + +#define S_OCSPI0_RX_FRAMING_ERROR 11 +#define V_OCSPI0_RX_FRAMING_ERROR(x) ((x) << S_OCSPI0_RX_FRAMING_ERROR) +#define F_OCSPI0_RX_FRAMING_ERROR V_OCSPI0_RX_FRAMING_ERROR(1U) + +#define S_OCSPI1_RX_FRAMING_ERROR 10 +#define V_OCSPI1_RX_FRAMING_ERROR(x) ((x) << S_OCSPI1_RX_FRAMING_ERROR) +#define F_OCSPI1_RX_FRAMING_ERROR V_OCSPI1_RX_FRAMING_ERROR(1U) + +#define S_OCSPI0_TX_FRAMING_ERROR 9 +#define V_OCSPI0_TX_FRAMING_ERROR(x) ((x) << S_OCSPI0_TX_FRAMING_ERROR) +#define F_OCSPI0_TX_FRAMING_ERROR V_OCSPI0_TX_FRAMING_ERROR(1U) + +#define S_OCSPI1_TX_FRAMING_ERROR 8 +#define V_OCSPI1_TX_FRAMING_ERROR(x) ((x) << S_OCSPI1_TX_FRAMING_ERROR) +#define F_OCSPI1_TX_FRAMING_ERROR V_OCSPI1_TX_FRAMING_ERROR(1U) + +#define S_OCSPI0_OFIFO2X_TX_FRAMING_ERROR 7 +#define V_OCSPI0_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OCSPI0_OFIFO2X_TX_FRAMING_ERROR) +#define F_OCSPI0_OFIFO2X_TX_FRAMING_ERROR V_OCSPI0_OFIFO2X_TX_FRAMING_ERROR(1U) + +#define S_OCSPI1_OFIFO2X_TX_FRAMING_ERROR 6 +#define V_OCSPI1_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OCSPI1_OFIFO2X_TX_FRAMING_ERROR) +#define F_OCSPI1_OFIFO2X_TX_FRAMING_ERROR V_OCSPI1_OFIFO2X_TX_FRAMING_ERROR(1U) + +#define S_IESPI_PAR_ERROR 3 +#define M_IESPI_PAR_ERROR 0x7 + +#define V_IESPI_PAR_ERROR(x) ((x) << S_IESPI_PAR_ERROR) + +#define S_OCSPI_PAR_ERROR 0 +#define M_OCSPI_PAR_ERROR 0x7 + +#define V_OCSPI_PAR_ERROR(x) ((x) << S_OCSPI_PAR_ERROR) + +#define A_PM1_RX_INT_CAUSE 0x5dc + +#define A_PM1_TX_CFG 0x5e0 +#define A_PM1_TX_MODE 0x5e4 + +#define A_PM1_TX_INT_ENABLE 0x5f8 + +#define S_ZERO_C_CMD_ERROR 18 +#define V_ZERO_C_CMD_ERROR(x) ((x) << S_ZERO_C_CMD_ERROR) +#define F_ZERO_C_CMD_ERROR V_ZERO_C_CMD_ERROR(1U) + +#define S_ICSPI0_FIFO2X_RX_FRAMING_ERROR 17 +#define V_ICSPI0_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_ICSPI0_FIFO2X_RX_FRAMING_ERROR) +#define F_ICSPI0_FIFO2X_RX_FRAMING_ERROR V_ICSPI0_FIFO2X_RX_FRAMING_ERROR(1U) + +#define S_ICSPI1_FIFO2X_RX_FRAMING_ERROR 16 +#define V_ICSPI1_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_ICSPI1_FIFO2X_RX_FRAMING_ERROR) +#define F_ICSPI1_FIFO2X_RX_FRAMING_ERROR V_ICSPI1_FIFO2X_RX_FRAMING_ERROR(1U) + +#define S_ICSPI0_RX_FRAMING_ERROR 15 +#define V_ICSPI0_RX_FRAMING_ERROR(x) ((x) << S_ICSPI0_RX_FRAMING_ERROR) +#define F_ICSPI0_RX_FRAMING_ERROR V_ICSPI0_RX_FRAMING_ERROR(1U) + +#define S_ICSPI1_RX_FRAMING_ERROR 14 +#define V_ICSPI1_RX_FRAMING_ERROR(x) ((x) << S_ICSPI1_RX_FRAMING_ERROR) +#define F_ICSPI1_RX_FRAMING_ERROR V_ICSPI1_RX_FRAMING_ERROR(1U) + +#define S_ICSPI0_TX_FRAMING_ERROR 13 +#define V_ICSPI0_TX_FRAMING_ERROR(x) ((x) << S_ICSPI0_TX_FRAMING_ERROR) +#define F_ICSPI0_TX_FRAMING_ERROR V_ICSPI0_TX_FRAMING_ERROR(1U) + +#define S_ICSPI1_TX_FRAMING_ERROR 12 +#define V_ICSPI1_TX_FRAMING_ERROR(x) ((x) << S_ICSPI1_TX_FRAMING_ERROR) +#define F_ICSPI1_TX_FRAMING_ERROR V_ICSPI1_TX_FRAMING_ERROR(1U) + +#define S_OESPI0_RX_FRAMING_ERROR 11 +#define V_OESPI0_RX_FRAMING_ERROR(x) ((x) << S_OESPI0_RX_FRAMING_ERROR) +#define F_OESPI0_RX_FRAMING_ERROR V_OESPI0_RX_FRAMING_ERROR(1U) + +#define 
S_OESPI1_RX_FRAMING_ERROR 10 +#define V_OESPI1_RX_FRAMING_ERROR(x) ((x) << S_OESPI1_RX_FRAMING_ERROR) +#define F_OESPI1_RX_FRAMING_ERROR V_OESPI1_RX_FRAMING_ERROR(1U) + +#define S_OESPI0_TX_FRAMING_ERROR 9 +#define V_OESPI0_TX_FRAMING_ERROR(x) ((x) << S_OESPI0_TX_FRAMING_ERROR) +#define F_OESPI0_TX_FRAMING_ERROR V_OESPI0_TX_FRAMING_ERROR(1U) + +#define S_OESPI1_TX_FRAMING_ERROR 8 +#define V_OESPI1_TX_FRAMING_ERROR(x) ((x) << S_OESPI1_TX_FRAMING_ERROR) +#define F_OESPI1_TX_FRAMING_ERROR V_OESPI1_TX_FRAMING_ERROR(1U) + +#define S_OESPI0_OFIFO2X_TX_FRAMING_ERROR 7 +#define V_OESPI0_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OESPI0_OFIFO2X_TX_FRAMING_ERROR) +#define F_OESPI0_OFIFO2X_TX_FRAMING_ERROR V_OESPI0_OFIFO2X_TX_FRAMING_ERROR(1U) + +#define S_OESPI1_OFIFO2X_TX_FRAMING_ERROR 6 +#define V_OESPI1_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OESPI1_OFIFO2X_TX_FRAMING_ERROR) +#define F_OESPI1_OFIFO2X_TX_FRAMING_ERROR V_OESPI1_OFIFO2X_TX_FRAMING_ERROR(1U) + +#define S_ICSPI_PAR_ERROR 3 +#define M_ICSPI_PAR_ERROR 0x7 + +#define V_ICSPI_PAR_ERROR(x) ((x) << S_ICSPI_PAR_ERROR) + +#define S_OESPI_PAR_ERROR 0 +#define M_OESPI_PAR_ERROR 0x7 + +#define V_OESPI_PAR_ERROR(x) ((x) << S_OESPI_PAR_ERROR) + +#define A_PM1_TX_INT_CAUSE 0x5fc + +#define A_MPS_CFG 0x600 + +#define S_TPRXPORTEN 4 +#define V_TPRXPORTEN(x) ((x) << S_TPRXPORTEN) +#define F_TPRXPORTEN V_TPRXPORTEN(1U) + +#define S_TPTXPORT1EN 3 +#define V_TPTXPORT1EN(x) ((x) << S_TPTXPORT1EN) +#define F_TPTXPORT1EN V_TPTXPORT1EN(1U) + +#define S_TPTXPORT0EN 2 +#define V_TPTXPORT0EN(x) ((x) << S_TPTXPORT0EN) +#define F_TPTXPORT0EN V_TPTXPORT0EN(1U) + +#define S_PORT1ACTIVE 1 +#define V_PORT1ACTIVE(x) ((x) << S_PORT1ACTIVE) +#define F_PORT1ACTIVE V_PORT1ACTIVE(1U) + +#define S_PORT0ACTIVE 0 +#define V_PORT0ACTIVE(x) ((x) << S_PORT0ACTIVE) +#define F_PORT0ACTIVE V_PORT0ACTIVE(1U) + +#define S_ENFORCEPKT 11 +#define V_ENFORCEPKT(x) ((x) << S_ENFORCEPKT) +#define F_ENFORCEPKT V_ENFORCEPKT(1U) + +#define A_MPS_INT_ENABLE 0x61c + +#define S_MCAPARERRENB 6 +#define M_MCAPARERRENB 0x7 + +#define V_MCAPARERRENB(x) ((x) << S_MCAPARERRENB) + +#define S_RXTPPARERRENB 4 +#define M_RXTPPARERRENB 0x3 + +#define V_RXTPPARERRENB(x) ((x) << S_RXTPPARERRENB) + +#define S_TX1TPPARERRENB 2 +#define M_TX1TPPARERRENB 0x3 + +#define V_TX1TPPARERRENB(x) ((x) << S_TX1TPPARERRENB) + +#define S_TX0TPPARERRENB 0 +#define M_TX0TPPARERRENB 0x3 + +#define V_TX0TPPARERRENB(x) ((x) << S_TX0TPPARERRENB) + +#define A_MPS_INT_CAUSE 0x620 + +#define S_MCAPARERR 6 +#define M_MCAPARERR 0x7 + +#define V_MCAPARERR(x) ((x) << S_MCAPARERR) + +#define S_RXTPPARERR 4 +#define M_RXTPPARERR 0x3 + +#define V_RXTPPARERR(x) ((x) << S_RXTPPARERR) + +#define S_TX1TPPARERR 2 +#define M_TX1TPPARERR 0x3 + +#define V_TX1TPPARERR(x) ((x) << S_TX1TPPARERR) + +#define S_TX0TPPARERR 0 +#define M_TX0TPPARERR 0x3 + +#define V_TX0TPPARERR(x) ((x) << S_TX0TPPARERR) + +#define A_CPL_SWITCH_CNTRL 0x640 + +#define A_CPL_INTR_ENABLE 0x650 + +#define S_CIM_OP_MAP_PERR 5 +#define V_CIM_OP_MAP_PERR(x) ((x) << S_CIM_OP_MAP_PERR) +#define F_CIM_OP_MAP_PERR V_CIM_OP_MAP_PERR(1U) + +#define S_CIM_OVFL_ERROR 4 +#define V_CIM_OVFL_ERROR(x) ((x) << S_CIM_OVFL_ERROR) +#define F_CIM_OVFL_ERROR V_CIM_OVFL_ERROR(1U) + +#define S_TP_FRAMING_ERROR 3 +#define V_TP_FRAMING_ERROR(x) ((x) << S_TP_FRAMING_ERROR) +#define F_TP_FRAMING_ERROR V_TP_FRAMING_ERROR(1U) + +#define S_SGE_FRAMING_ERROR 2 +#define V_SGE_FRAMING_ERROR(x) ((x) << S_SGE_FRAMING_ERROR) +#define F_SGE_FRAMING_ERROR V_SGE_FRAMING_ERROR(1U) + +#define S_CIM_FRAMING_ERROR 1 +#define 
V_CIM_FRAMING_ERROR(x) ((x) << S_CIM_FRAMING_ERROR) +#define F_CIM_FRAMING_ERROR V_CIM_FRAMING_ERROR(1U) + +#define S_ZERO_SWITCH_ERROR 0 +#define V_ZERO_SWITCH_ERROR(x) ((x) << S_ZERO_SWITCH_ERROR) +#define F_ZERO_SWITCH_ERROR V_ZERO_SWITCH_ERROR(1U) + +#define A_CPL_INTR_CAUSE 0x654 + +#define A_CPL_MAP_TBL_DATA 0x65c + +#define A_SMB_GLOBAL_TIME_CFG 0x660 + +#define A_I2C_CFG 0x6a0 + +#define S_I2C_CLKDIV 0 +#define M_I2C_CLKDIV 0xfff +#define V_I2C_CLKDIV(x) ((x) << S_I2C_CLKDIV) + +#define A_MI1_CFG 0x6b0 + +#define S_CLKDIV 5 +#define M_CLKDIV 0xff +#define V_CLKDIV(x) ((x) << S_CLKDIV) + +#define S_ST 3 + +#define M_ST 0x3 + +#define V_ST(x) ((x) << S_ST) + +#define G_ST(x) (((x) >> S_ST) & M_ST) + +#define S_PREEN 2 +#define V_PREEN(x) ((x) << S_PREEN) +#define F_PREEN V_PREEN(1U) + +#define S_MDIINV 1 +#define V_MDIINV(x) ((x) << S_MDIINV) +#define F_MDIINV V_MDIINV(1U) + +#define S_MDIEN 0 +#define V_MDIEN(x) ((x) << S_MDIEN) +#define F_MDIEN V_MDIEN(1U) + +#define A_MI1_ADDR 0x6b4 + +#define S_PHYADDR 5 +#define M_PHYADDR 0x1f +#define V_PHYADDR(x) ((x) << S_PHYADDR) + +#define S_REGADDR 0 +#define M_REGADDR 0x1f +#define V_REGADDR(x) ((x) << S_REGADDR) + +#define A_MI1_DATA 0x6b8 + +#define A_MI1_OP 0x6bc + +#define S_MDI_OP 0 +#define M_MDI_OP 0x3 +#define V_MDI_OP(x) ((x) << S_MDI_OP) + +#define A_SF_DATA 0x6d8 + +#define A_SF_OP 0x6dc + +#define S_BYTECNT 1 +#define M_BYTECNT 0x3 +#define V_BYTECNT(x) ((x) << S_BYTECNT) + +#define A_PL_INT_ENABLE0 0x6e0 + +#define S_T3DBG 23 +#define V_T3DBG(x) ((x) << S_T3DBG) +#define F_T3DBG V_T3DBG(1U) + +#define S_XGMAC0_1 20 +#define V_XGMAC0_1(x) ((x) << S_XGMAC0_1) +#define F_XGMAC0_1 V_XGMAC0_1(1U) + +#define S_XGMAC0_0 19 +#define V_XGMAC0_0(x) ((x) << S_XGMAC0_0) +#define F_XGMAC0_0 V_XGMAC0_0(1U) + +#define S_MC5A 18 +#define V_MC5A(x) ((x) << S_MC5A) +#define F_MC5A V_MC5A(1U) + +#define S_CPL_SWITCH 12 +#define V_CPL_SWITCH(x) ((x) << S_CPL_SWITCH) +#define F_CPL_SWITCH V_CPL_SWITCH(1U) + +#define S_MPS0 11 +#define V_MPS0(x) ((x) << S_MPS0) +#define F_MPS0 V_MPS0(1U) + +#define S_PM1_TX 10 +#define V_PM1_TX(x) ((x) << S_PM1_TX) +#define F_PM1_TX V_PM1_TX(1U) + +#define S_PM1_RX 9 +#define V_PM1_RX(x) ((x) << S_PM1_RX) +#define F_PM1_RX V_PM1_RX(1U) + +#define S_ULP2_TX 8 +#define V_ULP2_TX(x) ((x) << S_ULP2_TX) +#define F_ULP2_TX V_ULP2_TX(1U) + +#define S_ULP2_RX 7 +#define V_ULP2_RX(x) ((x) << S_ULP2_RX) +#define F_ULP2_RX V_ULP2_RX(1U) + +#define S_TP1 6 +#define V_TP1(x) ((x) << S_TP1) +#define F_TP1 V_TP1(1U) + +#define S_CIM 5 +#define V_CIM(x) ((x) << S_CIM) +#define F_CIM V_CIM(1U) + +#define S_MC7_CM 4 +#define V_MC7_CM(x) ((x) << S_MC7_CM) +#define F_MC7_CM V_MC7_CM(1U) + +#define S_MC7_PMTX 3 +#define V_MC7_PMTX(x) ((x) << S_MC7_PMTX) +#define F_MC7_PMTX V_MC7_PMTX(1U) + +#define S_MC7_PMRX 2 +#define V_MC7_PMRX(x) ((x) << S_MC7_PMRX) +#define F_MC7_PMRX V_MC7_PMRX(1U) + +#define S_PCIM0 1 +#define V_PCIM0(x) ((x) << S_PCIM0) +#define F_PCIM0 V_PCIM0(1U) + +#define S_SGE3 0 +#define V_SGE3(x) ((x) << S_SGE3) +#define F_SGE3 V_SGE3(1U) + +#define A_PL_INT_CAUSE0 0x6e4 + +#define A_PL_RST 0x6f0 + +#define S_FATALPERREN 4 +#define V_FATALPERREN(x) ((x) << S_FATALPERREN) +#define F_FATALPERREN V_FATALPERREN(1U) + +#define S_CRSTWRM 1 +#define V_CRSTWRM(x) ((x) << S_CRSTWRM) +#define F_CRSTWRM V_CRSTWRM(1U) + +#define A_PL_REV 0x6f4 + +#define A_PL_CLI 0x6f8 + +#define A_MC5_DB_CONFIG 0x704 + +#define S_TMTYPEHI 30 +#define V_TMTYPEHI(x) ((x) << S_TMTYPEHI) +#define F_TMTYPEHI V_TMTYPEHI(1U) + +#define S_TMPARTSIZE 28 
+#define M_TMPARTSIZE 0x3 +#define V_TMPARTSIZE(x) ((x) << S_TMPARTSIZE) +#define G_TMPARTSIZE(x) (((x) >> S_TMPARTSIZE) & M_TMPARTSIZE) + +#define S_TMTYPE 26 +#define M_TMTYPE 0x3 +#define V_TMTYPE(x) ((x) << S_TMTYPE) +#define G_TMTYPE(x) (((x) >> S_TMTYPE) & M_TMTYPE) + +#define S_COMPEN 17 +#define V_COMPEN(x) ((x) << S_COMPEN) +#define F_COMPEN V_COMPEN(1U) + +#define S_PRTYEN 6 +#define V_PRTYEN(x) ((x) << S_PRTYEN) +#define F_PRTYEN V_PRTYEN(1U) + +#define S_MBUSEN 5 +#define V_MBUSEN(x) ((x) << S_MBUSEN) +#define F_MBUSEN V_MBUSEN(1U) + +#define S_DBGIEN 4 +#define V_DBGIEN(x) ((x) << S_DBGIEN) +#define F_DBGIEN V_DBGIEN(1U) + +#define S_TMRDY 2 +#define V_TMRDY(x) ((x) << S_TMRDY) +#define F_TMRDY V_TMRDY(1U) + +#define S_TMRST 1 +#define V_TMRST(x) ((x) << S_TMRST) +#define F_TMRST V_TMRST(1U) + +#define S_TMMODE 0 +#define V_TMMODE(x) ((x) << S_TMMODE) +#define F_TMMODE V_TMMODE(1U) + +#define A_MC5_DB_ROUTING_TABLE_INDEX 0x70c + +#define A_MC5_DB_FILTER_TABLE 0x710 + +#define A_MC5_DB_SERVER_INDEX 0x714 + +#define A_MC5_DB_RSP_LATENCY 0x720 + +#define S_RDLAT 16 +#define M_RDLAT 0x1f +#define V_RDLAT(x) ((x) << S_RDLAT) + +#define S_LRNLAT 8 +#define M_LRNLAT 0x1f +#define V_LRNLAT(x) ((x) << S_LRNLAT) + +#define S_SRCHLAT 0 +#define M_SRCHLAT 0x1f +#define V_SRCHLAT(x) ((x) << S_SRCHLAT) + +#define A_MC5_DB_PART_ID_INDEX 0x72c + +#define A_MC5_DB_INT_ENABLE 0x740 + +#define S_DELACTEMPTY 18 +#define V_DELACTEMPTY(x) ((x) << S_DELACTEMPTY) +#define F_DELACTEMPTY V_DELACTEMPTY(1U) + +#define S_DISPQPARERR 17 +#define V_DISPQPARERR(x) ((x) << S_DISPQPARERR) +#define F_DISPQPARERR V_DISPQPARERR(1U) + +#define S_REQQPARERR 16 +#define V_REQQPARERR(x) ((x) << S_REQQPARERR) +#define F_REQQPARERR V_REQQPARERR(1U) + +#define S_UNKNOWNCMD 15 +#define V_UNKNOWNCMD(x) ((x) << S_UNKNOWNCMD) +#define F_UNKNOWNCMD V_UNKNOWNCMD(1U) + +#define S_NFASRCHFAIL 8 +#define V_NFASRCHFAIL(x) ((x) << S_NFASRCHFAIL) +#define F_NFASRCHFAIL V_NFASRCHFAIL(1U) + +#define S_ACTRGNFULL 7 +#define V_ACTRGNFULL(x) ((x) << S_ACTRGNFULL) +#define F_ACTRGNFULL V_ACTRGNFULL(1U) + +#define S_PARITYERR 6 +#define V_PARITYERR(x) ((x) << S_PARITYERR) +#define F_PARITYERR V_PARITYERR(1U) + +#define A_MC5_DB_INT_CAUSE 0x744 + +#define A_MC5_DB_DBGI_CONFIG 0x774 + +#define A_MC5_DB_DBGI_REQ_CMD 0x778 + +#define A_MC5_DB_DBGI_REQ_ADDR0 0x77c + +#define A_MC5_DB_DBGI_REQ_ADDR1 0x780 + +#define A_MC5_DB_DBGI_REQ_ADDR2 0x784 + +#define A_MC5_DB_DBGI_REQ_DATA0 0x788 + +#define A_MC5_DB_DBGI_REQ_DATA1 0x78c + +#define A_MC5_DB_DBGI_REQ_DATA2 0x790 + +#define A_MC5_DB_DBGI_RSP_STATUS 0x7b0 + +#define S_DBGIRSPVALID 0 +#define V_DBGIRSPVALID(x) ((x) << S_DBGIRSPVALID) +#define F_DBGIRSPVALID V_DBGIRSPVALID(1U) + +#define A_MC5_DB_DBGI_RSP_DATA0 0x7b4 + +#define A_MC5_DB_DBGI_RSP_DATA1 0x7b8 + +#define A_MC5_DB_DBGI_RSP_DATA2 0x7bc + +#define A_MC5_DB_POPEN_DATA_WR_CMD 0x7cc + +#define A_MC5_DB_POPEN_MASK_WR_CMD 0x7d0 + +#define A_MC5_DB_AOPEN_SRCH_CMD 0x7d4 + +#define A_MC5_DB_AOPEN_LRN_CMD 0x7d8 + +#define A_MC5_DB_SYN_SRCH_CMD 0x7dc + +#define A_MC5_DB_SYN_LRN_CMD 0x7e0 + +#define A_MC5_DB_ACK_SRCH_CMD 0x7e4 + +#define A_MC5_DB_ACK_LRN_CMD 0x7e8 + +#define A_MC5_DB_ILOOKUP_CMD 0x7ec + +#define A_MC5_DB_ELOOKUP_CMD 0x7f0 + +#define A_MC5_DB_DATA_WRITE_CMD 0x7f4 + +#define A_MC5_DB_DATA_READ_CMD 0x7f8 + +#define XGMAC0_0_BASE_ADDR 0x800 + +#define A_XGM_TX_CTRL 0x800 + +#define S_TXEN 0 +#define V_TXEN(x) ((x) << S_TXEN) +#define F_TXEN V_TXEN(1U) + +#define A_XGM_TX_CFG 0x804 + +#define S_TXPAUSEEN 0 +#define V_TXPAUSEEN(x) 
((x) << S_TXPAUSEEN) +#define F_TXPAUSEEN V_TXPAUSEEN(1U) + +#define A_XGM_TX_PAUSE_QUANTA 0x808 + +#define A_XGM_RX_CTRL 0x80c + +#define S_RXEN 0 +#define V_RXEN(x) ((x) << S_RXEN) +#define F_RXEN V_RXEN(1U) + +#define A_XGM_RX_CFG 0x810 + +#define S_DISPAUSEFRAMES 9 +#define V_DISPAUSEFRAMES(x) ((x) << S_DISPAUSEFRAMES) +#define F_DISPAUSEFRAMES V_DISPAUSEFRAMES(1U) + +#define S_EN1536BFRAMES 8 +#define V_EN1536BFRAMES(x) ((x) << S_EN1536BFRAMES) +#define F_EN1536BFRAMES V_EN1536BFRAMES(1U) + +#define S_ENJUMBO 7 +#define V_ENJUMBO(x) ((x) << S_ENJUMBO) +#define F_ENJUMBO V_ENJUMBO(1U) + +#define S_RMFCS 6 +#define V_RMFCS(x) ((x) << S_RMFCS) +#define F_RMFCS V_RMFCS(1U) + +#define S_ENHASHMCAST 2 +#define V_ENHASHMCAST(x) ((x) << S_ENHASHMCAST) +#define F_ENHASHMCAST V_ENHASHMCAST(1U) + +#define S_COPYALLFRAMES 0 +#define V_COPYALLFRAMES(x) ((x) << S_COPYALLFRAMES) +#define F_COPYALLFRAMES V_COPYALLFRAMES(1U) + +#define S_DISBCAST 1 +#define V_DISBCAST(x) ((x) << S_DISBCAST) +#define F_DISBCAST V_DISBCAST(1U) + +#define A_XGM_RX_HASH_LOW 0x814 + +#define A_XGM_RX_HASH_HIGH 0x818 + +#define A_XGM_RX_EXACT_MATCH_LOW_1 0x81c + +#define A_XGM_RX_EXACT_MATCH_HIGH_1 0x820 + +#define A_XGM_RX_EXACT_MATCH_LOW_2 0x824 + +#define A_XGM_RX_EXACT_MATCH_LOW_3 0x82c + +#define A_XGM_RX_EXACT_MATCH_LOW_4 0x834 + +#define A_XGM_RX_EXACT_MATCH_LOW_5 0x83c + +#define A_XGM_RX_EXACT_MATCH_LOW_6 0x844 + +#define A_XGM_RX_EXACT_MATCH_LOW_7 0x84c + +#define A_XGM_RX_EXACT_MATCH_LOW_8 0x854 + +#define A_XGM_INT_STATUS 0x86c + +#define S_LINKFAULTCHANGE 9 +#define V_LINKFAULTCHANGE(x) ((x) << S_LINKFAULTCHANGE) +#define F_LINKFAULTCHANGE V_LINKFAULTCHANGE(1U) + +#define A_XGM_XGM_INT_ENABLE 0x874 +#define A_XGM_XGM_INT_DISABLE 0x878 + +#define A_XGM_STAT_CTRL 0x880 + +#define S_CLRSTATS 2 +#define V_CLRSTATS(x) ((x) << S_CLRSTATS) +#define F_CLRSTATS V_CLRSTATS(1U) + +#define A_XGM_RXFIFO_CFG 0x884 + +#define S_RXFIFO_EMPTY 31 +#define V_RXFIFO_EMPTY(x) ((x) << S_RXFIFO_EMPTY) +#define F_RXFIFO_EMPTY V_RXFIFO_EMPTY(1U) + +#define S_RXFIFOPAUSEHWM 17 +#define M_RXFIFOPAUSEHWM 0xfff + +#define V_RXFIFOPAUSEHWM(x) ((x) << S_RXFIFOPAUSEHWM) + +#define G_RXFIFOPAUSEHWM(x) (((x) >> S_RXFIFOPAUSEHWM) & M_RXFIFOPAUSEHWM) + +#define S_RXFIFOPAUSELWM 5 +#define M_RXFIFOPAUSELWM 0xfff + +#define V_RXFIFOPAUSELWM(x) ((x) << S_RXFIFOPAUSELWM) + +#define G_RXFIFOPAUSELWM(x) (((x) >> S_RXFIFOPAUSELWM) & M_RXFIFOPAUSELWM) + +#define S_RXSTRFRWRD 1 +#define V_RXSTRFRWRD(x) ((x) << S_RXSTRFRWRD) +#define F_RXSTRFRWRD V_RXSTRFRWRD(1U) + +#define S_DISERRFRAMES 0 +#define V_DISERRFRAMES(x) ((x) << S_DISERRFRAMES) +#define F_DISERRFRAMES V_DISERRFRAMES(1U) + +#define A_XGM_TXFIFO_CFG 0x888 + +#define S_UNDERUNFIX 22 +#define V_UNDERUNFIX(x) ((x) << S_UNDERUNFIX) +#define F_UNDERUNFIX V_UNDERUNFIX(1U) + +#define S_TXIPG 13 +#define M_TXIPG 0xff +#define V_TXIPG(x) ((x) << S_TXIPG) +#define G_TXIPG(x) (((x) >> S_TXIPG) & M_TXIPG) + +#define S_TXFIFOTHRESH 4 +#define M_TXFIFOTHRESH 0x1ff + +#define V_TXFIFOTHRESH(x) ((x) << S_TXFIFOTHRESH) + +#define S_ENDROPPKT 21 +#define V_ENDROPPKT(x) ((x) << S_ENDROPPKT) +#define F_ENDROPPKT V_ENDROPPKT(1U) + +#define A_XGM_SERDES_CTRL 0x890 +#define A_XGM_SERDES_CTRL0 0x8e0 + +#define S_SERDESRESET_ 24 +#define V_SERDESRESET_(x) ((x) << S_SERDESRESET_) +#define F_SERDESRESET_ V_SERDESRESET_(1U) + +#define S_RXENABLE 4 +#define V_RXENABLE(x) ((x) << S_RXENABLE) +#define F_RXENABLE V_RXENABLE(1U) + +#define S_TXENABLE 3 +#define V_TXENABLE(x) ((x) << S_TXENABLE) +#define F_TXENABLE 
V_TXENABLE(1U) + +#define A_XGM_PAUSE_TIMER 0x890 + +#define A_XGM_RGMII_IMP 0x89c + +#define S_XGM_IMPSETUPDATE 6 +#define V_XGM_IMPSETUPDATE(x) ((x) << S_XGM_IMPSETUPDATE) +#define F_XGM_IMPSETUPDATE V_XGM_IMPSETUPDATE(1U) + +#define S_RGMIIIMPPD 3 +#define M_RGMIIIMPPD 0x7 +#define V_RGMIIIMPPD(x) ((x) << S_RGMIIIMPPD) + +#define S_RGMIIIMPPU 0 +#define M_RGMIIIMPPU 0x7 +#define V_RGMIIIMPPU(x) ((x) << S_RGMIIIMPPU) + +#define S_CALRESET 8 +#define V_CALRESET(x) ((x) << S_CALRESET) +#define F_CALRESET V_CALRESET(1U) + +#define S_CALUPDATE 7 +#define V_CALUPDATE(x) ((x) << S_CALUPDATE) +#define F_CALUPDATE V_CALUPDATE(1U) + +#define A_XGM_XAUI_IMP 0x8a0 + +#define S_CALBUSY 31 +#define V_CALBUSY(x) ((x) << S_CALBUSY) +#define F_CALBUSY V_CALBUSY(1U) + +#define S_XGM_CALFAULT 29 +#define V_XGM_CALFAULT(x) ((x) << S_XGM_CALFAULT) +#define F_XGM_CALFAULT V_XGM_CALFAULT(1U) + +#define S_CALIMP 24 +#define M_CALIMP 0x1f +#define V_CALIMP(x) ((x) << S_CALIMP) +#define G_CALIMP(x) (((x) >> S_CALIMP) & M_CALIMP) + +#define S_XAUIIMP 0 +#define M_XAUIIMP 0x7 +#define V_XAUIIMP(x) ((x) << S_XAUIIMP) + +#define A_XGM_RX_MAX_PKT_SIZE 0x8a8 + +#define S_RXMAXFRAMERSIZE 17 +#define M_RXMAXFRAMERSIZE 0x3fff +#define V_RXMAXFRAMERSIZE(x) ((x) << S_RXMAXFRAMERSIZE) +#define G_RXMAXFRAMERSIZE(x) (((x) >> S_RXMAXFRAMERSIZE) & M_RXMAXFRAMERSIZE) + +#define S_RXENFRAMER 14 +#define V_RXENFRAMER(x) ((x) << S_RXENFRAMER) +#define F_RXENFRAMER V_RXENFRAMER(1U) + +#define S_RXMAXPKTSIZE 0 +#define M_RXMAXPKTSIZE 0x3fff +#define V_RXMAXPKTSIZE(x) ((x) << S_RXMAXPKTSIZE) +#define G_RXMAXPKTSIZE(x) (((x) >> S_RXMAXPKTSIZE) & M_RXMAXPKTSIZE) + +#define A_XGM_RESET_CTRL 0x8ac + +#define S_XGMAC_STOP_EN 4 +#define V_XGMAC_STOP_EN(x) ((x) << S_XGMAC_STOP_EN) +#define F_XGMAC_STOP_EN V_XGMAC_STOP_EN(1U) + +#define S_XG2G_RESET_ 3 +#define V_XG2G_RESET_(x) ((x) << S_XG2G_RESET_) +#define F_XG2G_RESET_ V_XG2G_RESET_(1U) + +#define S_RGMII_RESET_ 2 +#define V_RGMII_RESET_(x) ((x) << S_RGMII_RESET_) +#define F_RGMII_RESET_ V_RGMII_RESET_(1U) + +#define S_PCS_RESET_ 1 +#define V_PCS_RESET_(x) ((x) << S_PCS_RESET_) +#define F_PCS_RESET_ V_PCS_RESET_(1U) + +#define S_MAC_RESET_ 0 +#define V_MAC_RESET_(x) ((x) << S_MAC_RESET_) +#define F_MAC_RESET_ V_MAC_RESET_(1U) + +#define A_XGM_PORT_CFG 0x8b8 + +#define S_CLKDIVRESET_ 3 +#define V_CLKDIVRESET_(x) ((x) << S_CLKDIVRESET_) +#define F_CLKDIVRESET_ V_CLKDIVRESET_(1U) + +#define S_PORTSPEED 1 +#define M_PORTSPEED 0x3 + +#define V_PORTSPEED(x) ((x) << S_PORTSPEED) + +#define S_ENRGMII 0 +#define V_ENRGMII(x) ((x) << S_ENRGMII) +#define F_ENRGMII V_ENRGMII(1U) + +#define A_XGM_INT_ENABLE 0x8d4 + +#define S_TXFIFO_PRTY_ERR 17 +#define M_TXFIFO_PRTY_ERR 0x7 + +#define V_TXFIFO_PRTY_ERR(x) ((x) << S_TXFIFO_PRTY_ERR) + +#define S_RXFIFO_PRTY_ERR 14 +#define M_RXFIFO_PRTY_ERR 0x7 + +#define V_RXFIFO_PRTY_ERR(x) ((x) << S_RXFIFO_PRTY_ERR) + +#define S_TXFIFO_UNDERRUN 13 +#define V_TXFIFO_UNDERRUN(x) ((x) << S_TXFIFO_UNDERRUN) +#define F_TXFIFO_UNDERRUN V_TXFIFO_UNDERRUN(1U) + +#define S_RXFIFO_OVERFLOW 12 +#define V_RXFIFO_OVERFLOW(x) ((x) << S_RXFIFO_OVERFLOW) +#define F_RXFIFO_OVERFLOW V_RXFIFO_OVERFLOW(1U) + +#define S_SERDES_LOS 4 +#define M_SERDES_LOS 0xf + +#define V_SERDES_LOS(x) ((x) << S_SERDES_LOS) + +#define S_XAUIPCSCTCERR 3 +#define V_XAUIPCSCTCERR(x) ((x) << S_XAUIPCSCTCERR) +#define F_XAUIPCSCTCERR V_XAUIPCSCTCERR(1U) + +#define S_XAUIPCSALIGNCHANGE 2 +#define V_XAUIPCSALIGNCHANGE(x) ((x) << S_XAUIPCSALIGNCHANGE) +#define F_XAUIPCSALIGNCHANGE V_XAUIPCSALIGNCHANGE(1U) + 
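/*
 * Editor's note: an illustrative sketch, not part of the patch. It shows the
 * accessor convention used throughout this header: S_<FIELD> is the bit
 * offset, M_<FIELD> the field mask, V_<FIELD>(x) places a value at the field
 * position, F_<FIELD> is the single-bit form, and G_<FIELD>(x) extracts the
 * field from a register word. The example uses the A_XGM_RXFIFO_CFG
 * watermark field defined above; t3_read_reg() is assumed to exist alongside
 * the t3_write_reg() used later in this patch, and a real per-port access
 * would also add the MAC instance offset (XGMAC0_0_BASE_ADDR vs
 * XGMAC0_1_BASE_ADDR).
 */
static void example_set_rx_pause_hwm(struct adapter *adap, unsigned int hwm)
{
	u32 v = t3_read_reg(adap, A_XGM_RXFIFO_CFG);

	v &= ~V_RXFIFOPAUSEHWM(M_RXFIFOPAUSEHWM);	/* clear the field */
	v |= V_RXFIFOPAUSEHWM(hwm & M_RXFIFOPAUSEHWM);	/* insert new value */
	t3_write_reg(adap, A_XGM_RXFIFO_CFG, v);

	WARN_ON(G_RXFIFOPAUSEHWM(v) != (hwm & M_RXFIFOPAUSEHWM));
}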
+#define S_XGM_INT 0 +#define V_XGM_INT(x) ((x) << S_XGM_INT) +#define F_XGM_INT V_XGM_INT(1U) + +#define A_XGM_INT_CAUSE 0x8d8 + +#define A_XGM_XAUI_ACT_CTRL 0x8dc + +#define S_TXACTENABLE 1 +#define V_TXACTENABLE(x) ((x) << S_TXACTENABLE) +#define F_TXACTENABLE V_TXACTENABLE(1U) + +#define S_RESET3 23 +#define V_RESET3(x) ((x) << S_RESET3) +#define F_RESET3 V_RESET3(1U) + +#define S_RESET2 22 +#define V_RESET2(x) ((x) << S_RESET2) +#define F_RESET2 V_RESET2(1U) + +#define S_RESET1 21 +#define V_RESET1(x) ((x) << S_RESET1) +#define F_RESET1 V_RESET1(1U) + +#define S_RESET0 20 +#define V_RESET0(x) ((x) << S_RESET0) +#define F_RESET0 V_RESET0(1U) + +#define S_PWRDN3 19 +#define V_PWRDN3(x) ((x) << S_PWRDN3) +#define F_PWRDN3 V_PWRDN3(1U) + +#define S_PWRDN2 18 +#define V_PWRDN2(x) ((x) << S_PWRDN2) +#define F_PWRDN2 V_PWRDN2(1U) + +#define S_PWRDN1 17 +#define V_PWRDN1(x) ((x) << S_PWRDN1) +#define F_PWRDN1 V_PWRDN1(1U) + +#define S_PWRDN0 16 +#define V_PWRDN0(x) ((x) << S_PWRDN0) +#define F_PWRDN0 V_PWRDN0(1U) + +#define S_RESETPLL23 15 +#define V_RESETPLL23(x) ((x) << S_RESETPLL23) +#define F_RESETPLL23 V_RESETPLL23(1U) + +#define S_RESETPLL01 14 +#define V_RESETPLL01(x) ((x) << S_RESETPLL01) +#define F_RESETPLL01 V_RESETPLL01(1U) + +#define A_XGM_SERDES_STAT0 0x8f0 +#define A_XGM_SERDES_STAT1 0x8f4 +#define A_XGM_SERDES_STAT2 0x8f8 + +#define S_LOWSIG0 0 +#define V_LOWSIG0(x) ((x) << S_LOWSIG0) +#define F_LOWSIG0 V_LOWSIG0(1U) + +#define A_XGM_SERDES_STAT3 0x8fc + +#define A_XGM_STAT_TX_BYTE_LOW 0x900 + +#define A_XGM_STAT_TX_BYTE_HIGH 0x904 + +#define A_XGM_STAT_TX_FRAME_LOW 0x908 + +#define A_XGM_STAT_TX_FRAME_HIGH 0x90c + +#define A_XGM_STAT_TX_BCAST 0x910 + +#define A_XGM_STAT_TX_MCAST 0x914 + +#define A_XGM_STAT_TX_PAUSE 0x918 + +#define A_XGM_STAT_TX_64B_FRAMES 0x91c + +#define A_XGM_STAT_TX_65_127B_FRAMES 0x920 + +#define A_XGM_STAT_TX_128_255B_FRAMES 0x924 + +#define A_XGM_STAT_TX_256_511B_FRAMES 0x928 + +#define A_XGM_STAT_TX_512_1023B_FRAMES 0x92c + +#define A_XGM_STAT_TX_1024_1518B_FRAMES 0x930 + +#define A_XGM_STAT_TX_1519_MAXB_FRAMES 0x934 + +#define A_XGM_STAT_TX_ERR_FRAMES 0x938 + +#define A_XGM_STAT_RX_BYTES_LOW 0x93c + +#define A_XGM_STAT_RX_BYTES_HIGH 0x940 + +#define A_XGM_STAT_RX_FRAMES_LOW 0x944 + +#define A_XGM_STAT_RX_FRAMES_HIGH 0x948 + +#define A_XGM_STAT_RX_BCAST_FRAMES 0x94c + +#define A_XGM_STAT_RX_MCAST_FRAMES 0x950 + +#define A_XGM_STAT_RX_PAUSE_FRAMES 0x954 + +#define A_XGM_STAT_RX_64B_FRAMES 0x958 + +#define A_XGM_STAT_RX_65_127B_FRAMES 0x95c + +#define A_XGM_STAT_RX_128_255B_FRAMES 0x960 + +#define A_XGM_STAT_RX_256_511B_FRAMES 0x964 + +#define A_XGM_STAT_RX_512_1023B_FRAMES 0x968 + +#define A_XGM_STAT_RX_1024_1518B_FRAMES 0x96c + +#define A_XGM_STAT_RX_1519_MAXB_FRAMES 0x970 + +#define A_XGM_STAT_RX_SHORT_FRAMES 0x974 + +#define A_XGM_STAT_RX_OVERSIZE_FRAMES 0x978 + +#define A_XGM_STAT_RX_JABBER_FRAMES 0x97c + +#define A_XGM_STAT_RX_CRC_ERR_FRAMES 0x980 + +#define A_XGM_STAT_RX_LENGTH_ERR_FRAMES 0x984 + +#define A_XGM_STAT_RX_SYM_CODE_ERR_FRAMES 0x988 + +#define A_XGM_SERDES_STATUS0 0x98c + +#define A_XGM_SERDES_STATUS1 0x990 + +#define S_CMULOCK 31 +#define V_CMULOCK(x) ((x) << S_CMULOCK) +#define F_CMULOCK V_CMULOCK(1U) + +#define A_XGM_RX_MAX_PKT_SIZE_ERR_CNT 0x9a4 + +#define A_XGM_TX_SPI4_SOP_EOP_CNT 0x9a8 + +#define S_TXSPI4SOPCNT 16 +#define M_TXSPI4SOPCNT 0xffff +#define V_TXSPI4SOPCNT(x) ((x) << S_TXSPI4SOPCNT) +#define G_TXSPI4SOPCNT(x) (((x) >> S_TXSPI4SOPCNT) & M_TXSPI4SOPCNT) + +#define A_XGM_RX_SPI4_SOP_EOP_CNT 0x9ac + +#define 
XGMAC0_1_BASE_ADDR 0xa00 diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c new file mode 100644 index 0000000..e988caa --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -0,0 +1,3371 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include "regs.h" +#include "sge_defs.h" +#include "t3_cpl.h" +#include "firmware_exports.h" +#include "cxgb3_offload.h" + +#define USE_GTS 0 + +#define SGE_RX_SM_BUF_SIZE 1536 + +#define SGE_RX_COPY_THRES 256 +#define SGE_RX_PULL_LEN 128 + +#define SGE_PG_RSVD SMP_CACHE_BYTES +/* + * Page chunk size for FL0 buffers if FL0 is to be populated with page chunks. + * It must be a divisor of PAGE_SIZE. If set to 0 FL0 will use sk_buffs + * directly. + */ +#define FL0_PG_CHUNK_SIZE 2048 +#define FL0_PG_ORDER 0 +#define FL0_PG_ALLOC_SIZE (PAGE_SIZE << FL0_PG_ORDER) +#define FL1_PG_CHUNK_SIZE (PAGE_SIZE > 8192 ? 16384 : 8192) +#define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1) +#define FL1_PG_ALLOC_SIZE (PAGE_SIZE << FL1_PG_ORDER) + +#define SGE_RX_DROP_THRES 16 +#define RX_RECLAIM_PERIOD (HZ/4) + +/* + * Max number of Rx buffers we replenish at a time. + */ +#define MAX_RX_REFILL 16U +/* + * Period of the Tx buffer reclaim timer. This timer does not need to run + * frequently as Tx buffers are usually reclaimed by new Tx packets. + */ +#define TX_RECLAIM_PERIOD (HZ / 4) +#define TX_RECLAIM_TIMER_CHUNK 64U +#define TX_RECLAIM_CHUNK 16U + +/* WR size in bytes */ +#define WR_LEN (WR_FLITS * 8) + +/* + * Types of Tx queues in each queue set. Order here matters, do not change. 
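/*
 * Editor's note: a worked example of the free-list sizing constants above,
 * assuming 4 KiB pages; it is not part of the patch. FL0 carves
 * PAGE_SIZE / FL0_PG_CHUNK_SIZE = 2 chunks out of each order-0 page, while
 * FL1 uses order-1 (8 KiB) allocations holding a single 8 KiB chunk; on
 * pages larger than 8 KiB, FL1 switches to 16 KiB chunks at order 0. The
 * checks below merely restate the divisibility requirement from the
 * FL0_PG_CHUNK_SIZE comment and its FL1 analogue.
 */
static inline void example_fl_sizing_checks(void)
{
#if FL0_PG_CHUNK_SIZE > 0
	BUILD_BUG_ON(PAGE_SIZE % FL0_PG_CHUNK_SIZE);
#endif
	BUILD_BUG_ON(FL1_PG_ALLOC_SIZE % FL1_PG_CHUNK_SIZE);
}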
+ */ +enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL }; + +/* Values for sge_txq.flags */ +enum { + TXQ_RUNNING = 1 << 0, /* fetch engine is running */ + TXQ_LAST_PKT_DB = 1 << 1, /* last packet rang the doorbell */ +}; + +struct tx_desc { + __be64 flit[TX_DESC_FLITS]; +}; + +struct rx_desc { + __be32 addr_lo; + __be32 len_gen; + __be32 gen2; + __be32 addr_hi; +}; + +struct tx_sw_desc { /* SW state per Tx descriptor */ + struct sk_buff *skb; + u8 eop; /* set if last descriptor for packet */ + u8 addr_idx; /* buffer index of first SGL entry in descriptor */ + u8 fragidx; /* first page fragment associated with descriptor */ + s8 sflit; /* start flit of first SGL entry in descriptor */ +}; + +struct rx_sw_desc { /* SW state per Rx descriptor */ + union { + struct sk_buff *skb; + struct fl_pg_chunk pg_chunk; + }; + DEFINE_DMA_UNMAP_ADDR(dma_addr); +}; + +struct rsp_desc { /* response queue descriptor */ + struct rss_header rss_hdr; + __be32 flags; + __be32 len_cq; + u8 imm_data[47]; + u8 intr_gen; +}; + +/* + * Holds unmapping information for Tx packets that need deferred unmapping. + * This structure lives at skb->head and must be allocated by callers. + */ +struct deferred_unmap_info { + struct pci_dev *pdev; + dma_addr_t addr[MAX_SKB_FRAGS + 1]; +}; + +/* + * Maps a number of flits to the number of Tx descriptors that can hold them. + * The formula is + * + * desc = 1 + (flits - 2) / (WR_FLITS - 1). + * + * HW allows up to 4 descriptors to be combined into a WR. + */ +static u8 flit_desc_map[] = { + 0, +#if SGE_NUM_GENBITS == 1 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +#elif SGE_NUM_GENBITS == 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +#else +# error "SGE_NUM_GENBITS must be 1 or 2" +#endif +}; + +static inline struct sge_qset *fl_to_qset(const struct sge_fl *q, int qidx) +{ + return container_of(q, struct sge_qset, fl[qidx]); +} + +static inline struct sge_qset *rspq_to_qset(const struct sge_rspq *q) +{ + return container_of(q, struct sge_qset, rspq); +} + +static inline struct sge_qset *txq_to_qset(const struct sge_txq *q, int qidx) +{ + return container_of(q, struct sge_qset, txq[qidx]); +} + +/** + * refill_rspq - replenish an SGE response queue + * @adapter: the adapter + * @q: the response queue to replenish + * @credits: how many new responses to make available + * + * Replenishes a response queue by making the supplied number of responses + * available to HW. + */ +static inline void refill_rspq(struct adapter *adapter, + const struct sge_rspq *q, unsigned int credits) +{ + rmb(); + t3_write_reg(adapter, A_SG_RSPQ_CREDIT_RETURN, + V_RSPQ(q->cntxt_id) | V_CREDITS(credits)); +} + +/** + * need_skb_unmap - does the platform need unmapping of sk_buffs? + * + * Returns true if the platform needs sk_buff unmapping. The compiler + * optimizes away unnecessary code if this returns true. + */ +static inline int need_skb_unmap(void) +{ +#ifdef CONFIG_NEED_DMA_MAP_STATE + return 1; +#else + return 0; +#endif +} + +/** + * unmap_skb - unmap a packet main body and its page fragments + * @skb: the packet + * @q: the Tx queue containing Tx descriptors for the packet + * @cidx: index of Tx descriptor + * @pdev: the PCI device + * + * Unmap the main body of an sk_buff and its page fragments, if any. 
+ * Because of the fairly complicated structure of our SGLs and the desire + * to conserve space for metadata, the information necessary to unmap an + * sk_buff is spread across the sk_buff itself (buffer lengths), the HW Tx + * descriptors (the physical addresses of the various data buffers), and + * the SW descriptor state (assorted indices). The send functions + * initialize the indices for the first packet descriptor so we can unmap + * the buffers held in the first Tx descriptor here, and we have enough + * information at this point to set the state for the next Tx descriptor. + * + * Note that it is possible to clean up the first descriptor of a packet + * before the send routines have written the next descriptors, but this + * race does not cause any problem. We just end up writing the unmapping + * info for the descriptor first. + */ +static inline void unmap_skb(struct sk_buff *skb, struct sge_txq *q, + unsigned int cidx, struct pci_dev *pdev) +{ + const struct sg_ent *sgp; + struct tx_sw_desc *d = &q->sdesc[cidx]; + int nfrags, frag_idx, curflit, j = d->addr_idx; + + sgp = (struct sg_ent *)&q->desc[cidx].flit[d->sflit]; + frag_idx = d->fragidx; + + if (frag_idx == 0 && skb_headlen(skb)) { + pci_unmap_single(pdev, be64_to_cpu(sgp->addr[0]), + skb_headlen(skb), PCI_DMA_TODEVICE); + j = 1; + } + + curflit = d->sflit + 1 + j; + nfrags = skb_shinfo(skb)->nr_frags; + + while (frag_idx < nfrags && curflit < WR_FLITS) { + pci_unmap_page(pdev, be64_to_cpu(sgp->addr[j]), + skb_frag_size(&skb_shinfo(skb)->frags[frag_idx]), + PCI_DMA_TODEVICE); + j ^= 1; + if (j == 0) { + sgp++; + curflit++; + } + curflit++; + frag_idx++; + } + + if (frag_idx < nfrags) { /* SGL continues into next Tx descriptor */ + d = cidx + 1 == q->size ? q->sdesc : d + 1; + d->fragidx = frag_idx; + d->addr_idx = j; + d->sflit = curflit - WR_FLITS - j; /* sflit can be -1 */ + } +} + +/** + * free_tx_desc - reclaims Tx descriptors and their buffers + * @adapter: the adapter + * @q: the Tx queue to reclaim descriptors from + * @n: the number of descriptors to reclaim + * + * Reclaims Tx descriptors from an SGE Tx queue and frees the associated + * Tx buffers. Called with the Tx queue lock held. + */ +static void free_tx_desc(struct adapter *adapter, struct sge_txq *q, + unsigned int n) +{ + struct tx_sw_desc *d; + struct pci_dev *pdev = adapter->pdev; + unsigned int cidx = q->cidx; + + const int need_unmap = need_skb_unmap() && + q->cntxt_id >= FW_TUNNEL_SGEEC_START; + + d = &q->sdesc[cidx]; + while (n--) { + if (d->skb) { /* an SGL is present */ + if (need_unmap) + unmap_skb(d->skb, q, cidx, pdev); + if (d->eop) { + dev_consume_skb_any(d->skb); + d->skb = NULL; + } + } + ++d; + if (++cidx == q->size) { + cidx = 0; + d = q->sdesc; + } + } + q->cidx = cidx; +} + +/** + * reclaim_completed_tx - reclaims completed Tx descriptors + * @adapter: the adapter + * @q: the Tx queue to reclaim completed descriptors from + * @chunk: maximum number of descriptors to reclaim + * + * Reclaims Tx descriptors that the SGE has indicated it has processed, + * and frees the associated buffers if possible. Called with the Tx + * queue's lock held. 
+ */ +static inline unsigned int reclaim_completed_tx(struct adapter *adapter, + struct sge_txq *q, + unsigned int chunk) +{ + unsigned int reclaim = q->processed - q->cleaned; + + reclaim = min(chunk, reclaim); + if (reclaim) { + free_tx_desc(adapter, q, reclaim); + q->cleaned += reclaim; + q->in_use -= reclaim; + } + return q->processed - q->cleaned; +} + +/** + * should_restart_tx - are there enough resources to restart a Tx queue? + * @q: the Tx queue + * + * Checks if there are enough descriptors to restart a suspended Tx queue. + */ +static inline int should_restart_tx(const struct sge_txq *q) +{ + unsigned int r = q->processed - q->cleaned; + + return q->in_use - r < (q->size >> 1); +} + +static void clear_rx_desc(struct pci_dev *pdev, const struct sge_fl *q, + struct rx_sw_desc *d) +{ + if (q->use_pages && d->pg_chunk.page) { + (*d->pg_chunk.p_cnt)--; + if (!*d->pg_chunk.p_cnt) + pci_unmap_page(pdev, + d->pg_chunk.mapping, + q->alloc_size, PCI_DMA_FROMDEVICE); + + put_page(d->pg_chunk.page); + d->pg_chunk.page = NULL; + } else { + pci_unmap_single(pdev, dma_unmap_addr(d, dma_addr), + q->buf_size, PCI_DMA_FROMDEVICE); + kfree_skb(d->skb); + d->skb = NULL; + } +} + +/** + * free_rx_bufs - free the Rx buffers on an SGE free list + * @pdev: the PCI device associated with the adapter + * @rxq: the SGE free list to clean up + * + * Release the buffers on an SGE free-buffer Rx queue. HW fetching from + * this queue should be stopped before calling this function. + */ +static void free_rx_bufs(struct pci_dev *pdev, struct sge_fl *q) +{ + unsigned int cidx = q->cidx; + + while (q->credits--) { + struct rx_sw_desc *d = &q->sdesc[cidx]; + + + clear_rx_desc(pdev, q, d); + if (++cidx == q->size) + cidx = 0; + } + + if (q->pg_chunk.page) { + __free_pages(q->pg_chunk.page, q->order); + q->pg_chunk.page = NULL; + } +} + +/** + * add_one_rx_buf - add a packet buffer to a free-buffer list + * @va: buffer start VA + * @len: the buffer length + * @d: the HW Rx descriptor to write + * @sd: the SW Rx descriptor to write + * @gen: the generation bit value + * @pdev: the PCI device associated with the adapter + * + * Add a buffer of the given length to the supplied HW and SW Rx + * descriptors. 
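/*
 * Editor's note: a sketch of the descriptor hand-off convention used by the
 * Rx descriptor writes below, inferred from the gen handling in refill_fl()
 * and recycle_rx_buf() later in this file; it is not a restatement of the
 * hardware documentation. The buffer address words are written first,
 * dma_wmb() orders them ahead of the generation words that pass ownership
 * to the SGE, and the generation value flips each time the producer index
 * wraps around the ring.
 */
static inline unsigned int example_gen_after_advance(unsigned int pidx,
						     unsigned int size,
						     unsigned int gen)
{
	return pidx + 1 == size ? gen ^ 1 : gen;
}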
+ */ +static inline int add_one_rx_buf(void *va, unsigned int len, + struct rx_desc *d, struct rx_sw_desc *sd, + unsigned int gen, struct pci_dev *pdev) +{ + dma_addr_t mapping; + + mapping = pci_map_single(pdev, va, len, PCI_DMA_FROMDEVICE); + if (unlikely(pci_dma_mapping_error(pdev, mapping))) + return -ENOMEM; + + dma_unmap_addr_set(sd, dma_addr, mapping); + + d->addr_lo = cpu_to_be32(mapping); + d->addr_hi = cpu_to_be32((u64) mapping >> 32); + dma_wmb(); + d->len_gen = cpu_to_be32(V_FLD_GEN1(gen)); + d->gen2 = cpu_to_be32(V_FLD_GEN2(gen)); + return 0; +} + +static inline int add_one_rx_chunk(dma_addr_t mapping, struct rx_desc *d, + unsigned int gen) +{ + d->addr_lo = cpu_to_be32(mapping); + d->addr_hi = cpu_to_be32((u64) mapping >> 32); + dma_wmb(); + d->len_gen = cpu_to_be32(V_FLD_GEN1(gen)); + d->gen2 = cpu_to_be32(V_FLD_GEN2(gen)); + return 0; +} + +static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q, + struct rx_sw_desc *sd, gfp_t gfp, + unsigned int order) +{ + if (!q->pg_chunk.page) { + dma_addr_t mapping; + + q->pg_chunk.page = alloc_pages(gfp, order); + if (unlikely(!q->pg_chunk.page)) + return -ENOMEM; + q->pg_chunk.va = page_address(q->pg_chunk.page); + q->pg_chunk.p_cnt = q->pg_chunk.va + (PAGE_SIZE << order) - + SGE_PG_RSVD; + q->pg_chunk.offset = 0; + mapping = pci_map_page(adapter->pdev, q->pg_chunk.page, + 0, q->alloc_size, PCI_DMA_FROMDEVICE); + if (unlikely(pci_dma_mapping_error(adapter->pdev, mapping))) { + __free_pages(q->pg_chunk.page, order); + q->pg_chunk.page = NULL; + return -EIO; + } + q->pg_chunk.mapping = mapping; + } + sd->pg_chunk = q->pg_chunk; + + prefetch(sd->pg_chunk.p_cnt); + + q->pg_chunk.offset += q->buf_size; + if (q->pg_chunk.offset == (PAGE_SIZE << order)) + q->pg_chunk.page = NULL; + else { + q->pg_chunk.va += q->buf_size; + get_page(q->pg_chunk.page); + } + + if (sd->pg_chunk.offset == 0) + *sd->pg_chunk.p_cnt = 1; + else + *sd->pg_chunk.p_cnt += 1; + + return 0; +} + +static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q) +{ + if (q->pend_cred >= q->credits / 4) { + q->pend_cred = 0; + wmb(); + t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id)); + } +} + +/** + * refill_fl - refill an SGE free-buffer list + * @adapter: the adapter + * @q: the free-list to refill + * @n: the number of new buffers to allocate + * @gfp: the gfp flags for allocating new buffers + * + * (Re)populate an SGE free-buffer list with up to @n new packet buffers, + * allocated with the supplied gfp flags. The caller must assure that + * @n does not exceed the queue's capacity. 
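/*
 * Editor's note: a hypothetical usage sketch, not part of the patch. At ring
 * setup time the whole free list can be filled with GFP_KERNEL allocations,
 * while the fast path tops it up in small GFP_ATOMIC | __GFP_COMP chunks
 * (see __refill_fl() below); in both cases @n stays within the remaining
 * capacity, q->size - q->credits, as required above.
 */
static int example_initial_fill(struct adapter *adap, struct sge_fl *fl)
{
	return refill_fl(adap, fl, fl->size - fl->credits,
			 GFP_KERNEL | __GFP_COMP);
}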
+ */ +static int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp) +{ + struct rx_sw_desc *sd = &q->sdesc[q->pidx]; + struct rx_desc *d = &q->desc[q->pidx]; + unsigned int count = 0; + + while (n--) { + dma_addr_t mapping; + int err; + + if (q->use_pages) { + if (unlikely(alloc_pg_chunk(adap, q, sd, gfp, + q->order))) { +nomem: q->alloc_failed++; + break; + } + mapping = sd->pg_chunk.mapping + sd->pg_chunk.offset; + dma_unmap_addr_set(sd, dma_addr, mapping); + + add_one_rx_chunk(mapping, d, q->gen); + pci_dma_sync_single_for_device(adap->pdev, mapping, + q->buf_size - SGE_PG_RSVD, + PCI_DMA_FROMDEVICE); + } else { + void *buf_start; + + struct sk_buff *skb = alloc_skb(q->buf_size, gfp); + if (!skb) + goto nomem; + + sd->skb = skb; + buf_start = skb->data; + err = add_one_rx_buf(buf_start, q->buf_size, d, sd, + q->gen, adap->pdev); + if (unlikely(err)) { + clear_rx_desc(adap->pdev, q, sd); + break; + } + } + + d++; + sd++; + if (++q->pidx == q->size) { + q->pidx = 0; + q->gen ^= 1; + sd = q->sdesc; + d = q->desc; + } + count++; + } + + q->credits += count; + q->pend_cred += count; + ring_fl_db(adap, q); + + return count; +} + +static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl) +{ + refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits), + GFP_ATOMIC | __GFP_COMP); +} + +/** + * recycle_rx_buf - recycle a receive buffer + * @adapter: the adapter + * @q: the SGE free list + * @idx: index of buffer to recycle + * + * Recycles the specified buffer on the given free list by adding it at + * the next available slot on the list. + */ +static void recycle_rx_buf(struct adapter *adap, struct sge_fl *q, + unsigned int idx) +{ + struct rx_desc *from = &q->desc[idx]; + struct rx_desc *to = &q->desc[q->pidx]; + + q->sdesc[q->pidx] = q->sdesc[idx]; + to->addr_lo = from->addr_lo; /* already big endian */ + to->addr_hi = from->addr_hi; /* likewise */ + dma_wmb(); + to->len_gen = cpu_to_be32(V_FLD_GEN1(q->gen)); + to->gen2 = cpu_to_be32(V_FLD_GEN2(q->gen)); + + if (++q->pidx == q->size) { + q->pidx = 0; + q->gen ^= 1; + } + + q->credits++; + q->pend_cred++; + ring_fl_db(adap, q); +} + +/** + * alloc_ring - allocate resources for an SGE descriptor ring + * @pdev: the PCI device + * @nelem: the number of descriptors + * @elem_size: the size of each descriptor + * @sw_size: the size of the SW state associated with each ring element + * @phys: the physical address of the allocated ring + * @metadata: address of the array holding the SW state for the ring + * + * Allocates resources for an SGE descriptor ring, such as Tx queues, + * free buffer lists, or response queues. Each SGE ring requires + * space for its HW descriptors plus, optionally, space for the SW state + * associated with each HW entry (the metadata). The function returns + * three values: the virtual address for the HW ring (the return value + * of the function), the physical address of the HW ring, and the address + * of the SW ring. 
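/*
 * Editor's note: a hypothetical usage sketch, not part of the patch (the
 * real call sites are presumably in the queue-set setup code later in this
 * file). It shows the three results described above: the returned HW ring
 * address, the DMA address written through @phys, and the SW metadata array
 * written through @metadata.
 */
static int example_alloc_txq_ring(struct pci_dev *pdev, struct sge_txq *q,
				  unsigned int ndesc)
{
	q->size = ndesc;
	q->desc = alloc_ring(pdev, ndesc, sizeof(struct tx_desc),
			     sizeof(struct tx_sw_desc), &q->phys_addr,
			     &q->sdesc);
	return q->desc ? 0 : -ENOMEM;
}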
+ */ +static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size, + size_t sw_size, dma_addr_t * phys, void *metadata) +{ + size_t len = nelem * elem_size; + void *s = NULL; + void *p = dma_alloc_coherent(&pdev->dev, len, phys, GFP_KERNEL); + + if (!p) + return NULL; + if (sw_size && metadata) { + s = kcalloc(nelem, sw_size, GFP_KERNEL); + + if (!s) { + dma_free_coherent(&pdev->dev, len, p, *phys); + return NULL; + } + *(void **)metadata = s; + } + memset(p, 0, len); + return p; +} + +/** + * t3_reset_qset - reset a sge qset + * @q: the queue set + * + * Reset the qset structure. + * the NAPI structure is preserved in the event of + * the qset's reincarnation, for example during EEH recovery. + */ +static void t3_reset_qset(struct sge_qset *q) +{ + if (q->adap && + !(q->adap->flags & NAPI_INIT)) { + memset(q, 0, sizeof(*q)); + return; + } + + q->adap = NULL; + memset(&q->rspq, 0, sizeof(q->rspq)); + memset(q->fl, 0, sizeof(struct sge_fl) * SGE_RXQ_PER_SET); + memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET); + q->txq_stopped = 0; + q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */ + q->rx_reclaim_timer.function = NULL; + q->nomem = 0; + napi_free_frags(&q->napi); +} + + +/** + * free_qset - free the resources of an SGE queue set + * @adapter: the adapter owning the queue set + * @q: the queue set + * + * Release the HW and SW resources associated with an SGE queue set, such + * as HW contexts, packet buffers, and descriptor rings. Traffic to the + * queue set must be quiesced prior to calling this. + */ +static void t3_free_qset(struct adapter *adapter, struct sge_qset *q) +{ + int i; + struct pci_dev *pdev = adapter->pdev; + + for (i = 0; i < SGE_RXQ_PER_SET; ++i) + if (q->fl[i].desc) { + spin_lock_irq(&adapter->sge.reg_lock); + t3_sge_disable_fl(adapter, q->fl[i].cntxt_id); + spin_unlock_irq(&adapter->sge.reg_lock); + free_rx_bufs(pdev, &q->fl[i]); + kfree(q->fl[i].sdesc); + dma_free_coherent(&pdev->dev, + q->fl[i].size * + sizeof(struct rx_desc), q->fl[i].desc, + q->fl[i].phys_addr); + } + + for (i = 0; i < SGE_TXQ_PER_SET; ++i) + if (q->txq[i].desc) { + spin_lock_irq(&adapter->sge.reg_lock); + t3_sge_enable_ecntxt(adapter, q->txq[i].cntxt_id, 0); + spin_unlock_irq(&adapter->sge.reg_lock); + if (q->txq[i].sdesc) { + free_tx_desc(adapter, &q->txq[i], + q->txq[i].in_use); + kfree(q->txq[i].sdesc); + } + dma_free_coherent(&pdev->dev, + q->txq[i].size * + sizeof(struct tx_desc), + q->txq[i].desc, q->txq[i].phys_addr); + __skb_queue_purge(&q->txq[i].sendq); + } + + if (q->rspq.desc) { + spin_lock_irq(&adapter->sge.reg_lock); + t3_sge_disable_rspcntxt(adapter, q->rspq.cntxt_id); + spin_unlock_irq(&adapter->sge.reg_lock); + dma_free_coherent(&pdev->dev, + q->rspq.size * sizeof(struct rsp_desc), + q->rspq.desc, q->rspq.phys_addr); + } + + t3_reset_qset(q); +} + +/** + * init_qset_cntxt - initialize an SGE queue set context info + * @qs: the queue set + * @id: the queue set id + * + * Initializes the TIDs and context ids for the queues of a queue set. 
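/*
 * Editor's note: a worked example of the numbering set up by
 * init_qset_cntxt() below; it is not part of the patch. Each queue set id
 * maps to one response-queue context, two consecutive free-list contexts,
 * and one Tx context per queue type drawn from its firmware-assigned base
 * (FW_TUNNEL/OFLD/CTRL_SGEEC_START). For example, queue set 2 owns
 * free-list contexts 4 and 5.
 */
static inline unsigned int example_fl_cntxt_id(unsigned int qset_id,
					       unsigned int fl_idx)
{
	return 2 * qset_id + fl_idx;	/* fl_idx is 0 or 1 */
}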
+ */ +static void init_qset_cntxt(struct sge_qset *qs, unsigned int id) +{ + qs->rspq.cntxt_id = id; + qs->fl[0].cntxt_id = 2 * id; + qs->fl[1].cntxt_id = 2 * id + 1; + qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id; + qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id; + qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id; + qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id; + qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id; +} + +/** + * sgl_len - calculates the size of an SGL of the given capacity + * @n: the number of SGL entries + * + * Calculates the number of flits needed for a scatter/gather list that + * can hold the given number of entries. + */ +static inline unsigned int sgl_len(unsigned int n) +{ + /* alternatively: 3 * (n / 2) + 2 * (n & 1) */ + return (3 * n) / 2 + (n & 1); +} + +/** + * flits_to_desc - returns the num of Tx descriptors for the given flits + * @n: the number of flits + * + * Calculates the number of Tx descriptors needed for the supplied number + * of flits. + */ +static inline unsigned int flits_to_desc(unsigned int n) +{ + BUG_ON(n >= ARRAY_SIZE(flit_desc_map)); + return flit_desc_map[n]; +} + +/** + * get_packet - return the next ingress packet buffer from a free list + * @adap: the adapter that received the packet + * @fl: the SGE free list holding the packet + * @len: the packet length including any SGE padding + * @drop_thres: # of remaining buffers before we start dropping packets + * + * Get the next packet from a free list and complete setup of the + * sk_buff. If the packet is small we make a copy and recycle the + * original buffer, otherwise we use the original buffer itself. If a + * positive drop threshold is supplied packets are dropped and their + * buffers recycled if (a) the number of remaining buffers is under the + * threshold and the packet is too big to copy, or (b) the packet should + * be copied but there is no memory for the copy. + */ +static struct sk_buff *get_packet(struct adapter *adap, struct sge_fl *fl, + unsigned int len, unsigned int drop_thres) +{ + struct sk_buff *skb = NULL; + struct rx_sw_desc *sd = &fl->sdesc[fl->cidx]; + + prefetch(sd->skb->data); + fl->credits--; + + if (len <= SGE_RX_COPY_THRES) { + skb = alloc_skb(len, GFP_ATOMIC); + if (likely(skb != NULL)) { + __skb_put(skb, len); + pci_dma_sync_single_for_cpu(adap->pdev, + dma_unmap_addr(sd, dma_addr), len, + PCI_DMA_FROMDEVICE); + memcpy(skb->data, sd->skb->data, len); + pci_dma_sync_single_for_device(adap->pdev, + dma_unmap_addr(sd, dma_addr), len, + PCI_DMA_FROMDEVICE); + } else if (!drop_thres) + goto use_orig_buf; +recycle: + recycle_rx_buf(adap, fl, fl->cidx); + return skb; + } + + if (unlikely(fl->credits < drop_thres) && + refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits - 1), + GFP_ATOMIC | __GFP_COMP) == 0) + goto recycle; + +use_orig_buf: + pci_unmap_single(adap->pdev, dma_unmap_addr(sd, dma_addr), + fl->buf_size, PCI_DMA_FROMDEVICE); + skb = sd->skb; + skb_put(skb, len); + __refill_fl(adap, fl); + return skb; +} + +/** + * get_packet_pg - return the next ingress packet buffer from a free list + * @adap: the adapter that received the packet + * @fl: the SGE free list holding the packet + * @len: the packet length including any SGE padding + * @drop_thres: # of remaining buffers before we start dropping packets + * + * Get the next packet from a free list populated with page chunks. 
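/*
 * Editor's note: a condensed, lossy summary of the Rx buffer policy
 * described above; it is not a separate code path in the driver, only an
 * illustration of the decisions get_packet()/get_packet_pg() make. Small
 * packets are copied into a fresh skb so the large Rx buffer can be
 * recycled; big packets hand the buffer itself up the stack unless the
 * free list is running low, in which case they may be dropped and the
 * buffer recycled.
 */
enum example_rx_action { EX_RX_COPY, EX_RX_USE_BUF, EX_RX_DROP };

static enum example_rx_action example_rx_policy(unsigned int len,
						unsigned int credits,
						unsigned int drop_thres)
{
	if (len <= SGE_RX_COPY_THRES)
		return EX_RX_COPY;		/* copy, recycle the original */
	if (drop_thres && credits < drop_thres)
		return EX_RX_DROP;		/* under pressure: recycle */
	return EX_RX_USE_BUF;			/* zero-copy: pass buffer up */
}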
+ * If the packet is small we make a copy and recycle the original buffer, + * otherwise we attach the original buffer as a page fragment to a fresh + * sk_buff. If a positive drop threshold is supplied packets are dropped + * and their buffers recycled if (a) the number of remaining buffers is + * under the threshold and the packet is too big to copy, or (b) there's + * no system memory. + * + * Note: this function is similar to @get_packet but deals with Rx buffers + * that are page chunks rather than sk_buffs. + */ +static struct sk_buff *get_packet_pg(struct adapter *adap, struct sge_fl *fl, + struct sge_rspq *q, unsigned int len, + unsigned int drop_thres) +{ + struct sk_buff *newskb, *skb; + struct rx_sw_desc *sd = &fl->sdesc[fl->cidx]; + + dma_addr_t dma_addr = dma_unmap_addr(sd, dma_addr); + + newskb = skb = q->pg_skb; + if (!skb && (len <= SGE_RX_COPY_THRES)) { + newskb = alloc_skb(len, GFP_ATOMIC); + if (likely(newskb != NULL)) { + __skb_put(newskb, len); + pci_dma_sync_single_for_cpu(adap->pdev, dma_addr, len, + PCI_DMA_FROMDEVICE); + memcpy(newskb->data, sd->pg_chunk.va, len); + pci_dma_sync_single_for_device(adap->pdev, dma_addr, + len, + PCI_DMA_FROMDEVICE); + } else if (!drop_thres) + return NULL; +recycle: + fl->credits--; + recycle_rx_buf(adap, fl, fl->cidx); + q->rx_recycle_buf++; + return newskb; + } + + if (unlikely(q->rx_recycle_buf || (!skb && fl->credits <= drop_thres))) + goto recycle; + + prefetch(sd->pg_chunk.p_cnt); + + if (!skb) + newskb = alloc_skb(SGE_RX_PULL_LEN, GFP_ATOMIC); + + if (unlikely(!newskb)) { + if (!drop_thres) + return NULL; + goto recycle; + } + + pci_dma_sync_single_for_cpu(adap->pdev, dma_addr, len, + PCI_DMA_FROMDEVICE); + (*sd->pg_chunk.p_cnt)--; + if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page) + pci_unmap_page(adap->pdev, + sd->pg_chunk.mapping, + fl->alloc_size, + PCI_DMA_FROMDEVICE); + if (!skb) { + __skb_put(newskb, SGE_RX_PULL_LEN); + memcpy(newskb->data, sd->pg_chunk.va, SGE_RX_PULL_LEN); + skb_fill_page_desc(newskb, 0, sd->pg_chunk.page, + sd->pg_chunk.offset + SGE_RX_PULL_LEN, + len - SGE_RX_PULL_LEN); + newskb->len = len; + newskb->data_len = len - SGE_RX_PULL_LEN; + newskb->truesize += newskb->data_len; + } else { + skb_fill_page_desc(newskb, skb_shinfo(newskb)->nr_frags, + sd->pg_chunk.page, + sd->pg_chunk.offset, len); + newskb->len += len; + newskb->data_len += len; + newskb->truesize += len; + } + + fl->credits--; + /* + * We do not refill FLs here, we let the caller do it to overlap a + * prefetch. + */ + return newskb; +} + +/** + * get_imm_packet - return the next ingress packet buffer from a response + * @resp: the response descriptor containing the packet data + * + * Return a packet containing the immediate data of the given response. + */ +static inline struct sk_buff *get_imm_packet(const struct rsp_desc *resp) +{ + struct sk_buff *skb = alloc_skb(IMMED_PKT_SIZE, GFP_ATOMIC); + + if (skb) { + __skb_put(skb, IMMED_PKT_SIZE); + skb_copy_to_linear_data(skb, resp->imm_data, IMMED_PKT_SIZE); + } + return skb; +} + +/** + * calc_tx_descs - calculate the number of Tx descriptors for a packet + * @skb: the packet + * + * Returns the number of Tx descriptors needed for the given Ethernet + * packet. Ethernet packets require addition of WR and CPL headers. 
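/*
 * Editor's note: a worked example of the descriptor arithmetic above and of
 * sgl_len()/flits_to_desc() earlier in this file; it is not part of the
 * patch. A non-TSO packet with linear data plus four page fragments needs
 * an SGL of 5 entries, i.e. sgl_len(5) = 8 flits, plus 2 flits for the WR
 * and CPL headers, 10 flits in total, which flits_to_desc() maps to a
 * single Tx descriptor. A gso_size packet adds one more flit for the LSO
 * CPL before the same lookup.
 */
static inline unsigned int example_descs_for_four_frags(void)
{
	return flits_to_desc(sgl_len(4 + 1) + 2);	/* == 1 */
}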
+ */ +static inline unsigned int calc_tx_descs(const struct sk_buff *skb) +{ + unsigned int flits; + + if (skb->len <= WR_LEN - sizeof(struct cpl_tx_pkt)) + return 1; + + flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 2; + if (skb_shinfo(skb)->gso_size) + flits++; + return flits_to_desc(flits); +} + +/* map_skb - map a packet main body and its page fragments + * @pdev: the PCI device + * @skb: the packet + * @addr: placeholder to save the mapped addresses + * + * map the main body of an sk_buff and its page fragments, if any. + */ +static int map_skb(struct pci_dev *pdev, const struct sk_buff *skb, + dma_addr_t *addr) +{ + const skb_frag_t *fp, *end; + const struct skb_shared_info *si; + + if (skb_headlen(skb)) { + *addr = pci_map_single(pdev, skb->data, skb_headlen(skb), + PCI_DMA_TODEVICE); + if (pci_dma_mapping_error(pdev, *addr)) + goto out_err; + addr++; + } + + si = skb_shinfo(skb); + end = &si->frags[si->nr_frags]; + + for (fp = si->frags; fp < end; fp++) { + *addr = skb_frag_dma_map(&pdev->dev, fp, 0, skb_frag_size(fp), + DMA_TO_DEVICE); + if (pci_dma_mapping_error(pdev, *addr)) + goto unwind; + addr++; + } + return 0; + +unwind: + while (fp-- > si->frags) + dma_unmap_page(&pdev->dev, *--addr, skb_frag_size(fp), + DMA_TO_DEVICE); + + pci_unmap_single(pdev, addr[-1], skb_headlen(skb), PCI_DMA_TODEVICE); +out_err: + return -ENOMEM; +} + +/** + * write_sgl - populate a scatter/gather list for a packet + * @skb: the packet + * @sgp: the SGL to populate + * @start: start address of skb main body data to include in the SGL + * @len: length of skb main body data to include in the SGL + * @addr: the list of the mapped addresses + * + * Copies the scatter/gather list for the buffers that make up a packet + * and returns the SGL size in 8-byte words. The caller must size the SGL + * appropriately. + */ +static inline unsigned int write_sgl(const struct sk_buff *skb, + struct sg_ent *sgp, unsigned char *start, + unsigned int len, const dma_addr_t *addr) +{ + unsigned int i, j = 0, k = 0, nfrags; + + if (len) { + sgp->len[0] = cpu_to_be32(len); + sgp->addr[j++] = cpu_to_be64(addr[k++]); + } + + nfrags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < nfrags; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + sgp->len[j] = cpu_to_be32(skb_frag_size(frag)); + sgp->addr[j] = cpu_to_be64(addr[k++]); + j ^= 1; + if (j == 0) + ++sgp; + } + if (j) + sgp->len[j] = 0; + return ((nfrags + (len != 0)) * 3) / 2 + j; +} + +/** + * check_ring_tx_db - check and potentially ring a Tx queue's doorbell + * @adap: the adapter + * @q: the Tx queue + * + * Ring the doorbel if a Tx queue is asleep. There is a natural race, + * where the HW is going to sleep just after we checked, however, + * then the interrupt handler will detect the outstanding TX packet + * and ring the doorbell for us. + * + * When GTS is disabled we unconditionally ring the doorbell. 
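+ *
+ * For example, t3_eth_xmit() below posts its descriptors first and
+ * only then rings the doorbell:
+ *
+ *     write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl, addr);
+ *     check_ring_tx_db(adap, q);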
+ */ +static inline void check_ring_tx_db(struct adapter *adap, struct sge_txq *q) +{ +#if USE_GTS + clear_bit(TXQ_LAST_PKT_DB, &q->flags); + if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) { + set_bit(TXQ_LAST_PKT_DB, &q->flags); + t3_write_reg(adap, A_SG_KDOORBELL, + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); + } +#else + wmb(); /* write descriptors before telling HW */ + t3_write_reg(adap, A_SG_KDOORBELL, + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); +#endif +} + +static inline void wr_gen2(struct tx_desc *d, unsigned int gen) +{ +#if SGE_NUM_GENBITS == 2 + d->flit[TX_DESC_FLITS - 1] = cpu_to_be64(gen); +#endif +} + +/** + * write_wr_hdr_sgl - write a WR header and, optionally, SGL + * @ndesc: number of Tx descriptors spanned by the SGL + * @skb: the packet corresponding to the WR + * @d: first Tx descriptor to be written + * @pidx: index of above descriptors + * @q: the SGE Tx queue + * @sgl: the SGL + * @flits: number of flits to the start of the SGL in the first descriptor + * @sgl_flits: the SGL size in flits + * @gen: the Tx descriptor generation + * @wr_hi: top 32 bits of WR header based on WR type (big endian) + * @wr_lo: low 32 bits of WR header based on WR type (big endian) + * + * Write a work request header and an associated SGL. If the SGL is + * small enough to fit into one Tx descriptor it has already been written + * and we just need to write the WR header. Otherwise we distribute the + * SGL across the number of descriptors it spans. + */ +static void write_wr_hdr_sgl(unsigned int ndesc, struct sk_buff *skb, + struct tx_desc *d, unsigned int pidx, + const struct sge_txq *q, + const struct sg_ent *sgl, + unsigned int flits, unsigned int sgl_flits, + unsigned int gen, __be32 wr_hi, + __be32 wr_lo) +{ + struct work_request_hdr *wrp = (struct work_request_hdr *)d; + struct tx_sw_desc *sd = &q->sdesc[pidx]; + + sd->skb = skb; + if (need_skb_unmap()) { + sd->fragidx = 0; + sd->addr_idx = 0; + sd->sflit = flits; + } + + if (likely(ndesc == 1)) { + sd->eop = 1; + wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | + V_WR_SGLSFLT(flits)) | wr_hi; + dma_wmb(); + wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) | + V_WR_GEN(gen)) | wr_lo; + wr_gen2(d, gen); + } else { + unsigned int ogen = gen; + const u64 *fp = (const u64 *)sgl; + struct work_request_hdr *wp = wrp; + + wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) | + V_WR_SGLSFLT(flits)) | wr_hi; + + while (sgl_flits) { + unsigned int avail = WR_FLITS - flits; + + if (avail > sgl_flits) + avail = sgl_flits; + memcpy(&d->flit[flits], fp, avail * sizeof(*fp)); + sgl_flits -= avail; + ndesc--; + if (!sgl_flits) + break; + + fp += avail; + d++; + sd->eop = 0; + sd++; + if (++pidx == q->size) { + pidx = 0; + gen ^= 1; + d = q->desc; + sd = q->sdesc; + } + + sd->skb = skb; + wrp = (struct work_request_hdr *)d; + wrp->wr_hi = htonl(V_WR_DATATYPE(1) | + V_WR_SGLSFLT(1)) | wr_hi; + wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS, + sgl_flits + 1)) | + V_WR_GEN(gen)) | wr_lo; + wr_gen2(d, gen); + flits = 1; + } + sd->eop = 1; + wrp->wr_hi |= htonl(F_WR_EOP); + dma_wmb(); + wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo; + wr_gen2((struct tx_desc *)wp, ogen); + WARN_ON(ndesc != 0); + } +} + +/** + * write_tx_pkt_wr - write a TX_PKT work request + * @adap: the adapter + * @skb: the packet to send + * @pi: the egress interface + * @pidx: index of the first Tx descriptor to write + * @gen: the generation value to use + * @q: the Tx queue + * @ndesc: number of descriptors the packet will occupy + * @compl: the value of the COMPL bit 
to use + * + * Generate a TX_PKT work request to send the supplied packet. + */ +static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb, + const struct port_info *pi, + unsigned int pidx, unsigned int gen, + struct sge_txq *q, unsigned int ndesc, + unsigned int compl, const dma_addr_t *addr) +{ + unsigned int flits, sgl_flits, cntrl, tso_info; + struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1]; + struct tx_desc *d = &q->desc[pidx]; + struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)d; + + cpl->len = htonl(skb->len); + cntrl = V_TXPKT_INTF(pi->port_id); + + if (skb_vlan_tag_present(skb)) + cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(skb_vlan_tag_get(skb)); + + tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size); + if (tso_info) { + int eth_type; + struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)cpl; + + d->flit[2] = 0; + cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO); + hdr->cntrl = htonl(cntrl); + eth_type = skb_network_offset(skb) == ETH_HLEN ? + CPL_ETH_II : CPL_ETH_II_VLAN; + tso_info |= V_LSO_ETH_TYPE(eth_type) | + V_LSO_IPHDR_WORDS(ip_hdr(skb)->ihl) | + V_LSO_TCPHDR_WORDS(tcp_hdr(skb)->doff); + hdr->lso_info = htonl(tso_info); + flits = 3; + } else { + cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); + cntrl |= F_TXPKT_IPCSUM_DIS; /* SW calculates IP csum */ + cntrl |= V_TXPKT_L4CSUM_DIS(skb->ip_summed != CHECKSUM_PARTIAL); + cpl->cntrl = htonl(cntrl); + + if (skb->len <= WR_LEN - sizeof(*cpl)) { + q->sdesc[pidx].skb = NULL; + if (!skb->data_len) + skb_copy_from_linear_data(skb, &d->flit[2], + skb->len); + else + skb_copy_bits(skb, 0, &d->flit[2], skb->len); + + flits = (skb->len + 7) / 8 + 2; + cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(skb->len & 7) | + V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) + | F_WR_SOP | F_WR_EOP | compl); + dma_wmb(); + cpl->wr.wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(gen) | + V_WR_TID(q->token)); + wr_gen2(d, gen); + dev_consume_skb_any(skb); + return; + } + + flits = 2; + } + + sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl; + sgl_flits = write_sgl(skb, sgp, skb->data, skb_headlen(skb), addr); + + write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen, + htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl), + htonl(V_WR_TID(q->token))); +} + +static inline void t3_stop_tx_queue(struct netdev_queue *txq, + struct sge_qset *qs, struct sge_txq *q) +{ + netif_tx_stop_queue(txq); + set_bit(TXQ_ETH, &qs->txq_stopped); + q->stops++; +} + +/** + * eth_xmit - add a packet to the Ethernet Tx queue + * @skb: the packet + * @dev: the egress net device + * + * Add a packet to an SGE Tx queue. Runs with softirqs disabled. + */ +netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + int qidx; + unsigned int ndesc, pidx, credits, gen, compl; + const struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct netdev_queue *txq; + struct sge_qset *qs; + struct sge_txq *q; + dma_addr_t addr[MAX_SKB_FRAGS + 1]; + + /* + * The chip min packet length is 9 octets but play safe and reject + * anything shorter than an Ethernet header. 
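+ * (ETH_HLEN is 14 bytes, so this check is intentionally stricter than
+ * the 9-octet hardware minimum.)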
+ */ + if (unlikely(skb->len < ETH_HLEN)) { + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + + qidx = skb_get_queue_mapping(skb); + qs = &pi->qs[qidx]; + q = &qs->txq[TXQ_ETH]; + txq = netdev_get_tx_queue(dev, qidx); + + reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK); + + credits = q->size - q->in_use; + ndesc = calc_tx_descs(skb); + + if (unlikely(credits < ndesc)) { + t3_stop_tx_queue(txq, qs, q); + dev_err(&adap->pdev->dev, + "%s: Tx ring %u full while queue awake!\n", + dev->name, q->cntxt_id & 7); + return NETDEV_TX_BUSY; + } + + /* Check if ethernet packet can't be sent as immediate data */ + if (skb->len > (WR_LEN - sizeof(struct cpl_tx_pkt))) { + if (unlikely(map_skb(adap->pdev, skb, addr) < 0)) { + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + } + + q->in_use += ndesc; + if (unlikely(credits - ndesc < q->stop_thres)) { + t3_stop_tx_queue(txq, qs, q); + + if (should_restart_tx(q) && + test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) { + q->restarts++; + netif_tx_start_queue(txq); + } + } + + gen = q->gen; + q->unacked += ndesc; + compl = (q->unacked & 8) << (S_WR_COMPL - 3); + q->unacked &= 7; + pidx = q->pidx; + q->pidx += ndesc; + if (q->pidx >= q->size) { + q->pidx -= q->size; + q->gen ^= 1; + } + + /* update port statistics */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + qs->port_stats[SGE_PSTAT_TX_CSUM]++; + if (skb_shinfo(skb)->gso_size) + qs->port_stats[SGE_PSTAT_TSO]++; + if (skb_vlan_tag_present(skb)) + qs->port_stats[SGE_PSTAT_VLANINS]++; + + /* + * We do not use Tx completion interrupts to free DMAd Tx packets. + * This is good for performance but means that we rely on new Tx + * packets arriving to run the destructors of completed packets, + * which open up space in their sockets' send queues. Sometimes + * we do not get such new packets causing Tx to stall. A single + * UDP transmitter is a good example of this situation. We have + * a clean up timer that periodically reclaims completed packets + * but it doesn't run often enough (nor do we want it to) to prevent + * lengthy stalls. A solution to this problem is to run the + * destructor early, after the packet is queued but before it's DMAd. + * A cons is that we lie to socket memory accounting, but the amount + * of extra memory is reasonable (limited by the number of Tx + * descriptors), the packets do actually get freed quickly by new + * packets almost always, and for protocols like TCP that wait for + * acks to really free up the data the extra memory is even less. + * On the positive side we run the destructors on the sending CPU + * rather than on a potentially different completing CPU, usually a + * good thing. We also run them without holding our Tx queue lock, + * unlike what reclaim_completed_tx() would otherwise do. + * + * Run the destructor before telling the DMA engine about the packet + * to make sure it doesn't complete and get freed prematurely. + */ + if (likely(!skb_shared(skb))) + skb_orphan(skb); + + write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl, addr); + check_ring_tx_db(adap, q); + return NETDEV_TX_OK; +} + +/** + * write_imm - write a packet into a Tx descriptor as immediate data + * @d: the Tx descriptor to write + * @skb: the packet + * @len: the length of packet data to write as immediate data + * @gen: the generation bit value to write + * + * Writes a packet as immediate data into a Tx descriptor. The packet + * contains a work request at its beginning. 
We must write the packet + * carefully so the SGE doesn't read it accidentally before it's written + * in its entirety. + */ +static inline void write_imm(struct tx_desc *d, struct sk_buff *skb, + unsigned int len, unsigned int gen) +{ + struct work_request_hdr *from = (struct work_request_hdr *)skb->data; + struct work_request_hdr *to = (struct work_request_hdr *)d; + + if (likely(!skb->data_len)) + memcpy(&to[1], &from[1], len - sizeof(*from)); + else + skb_copy_bits(skb, sizeof(*from), &to[1], len - sizeof(*from)); + + to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP | + V_WR_BCNTLFLT(len & 7)); + dma_wmb(); + to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) | + V_WR_LEN((len + 7) / 8)); + wr_gen2(d, gen); + kfree_skb(skb); +} + +/** + * check_desc_avail - check descriptor availability on a send queue + * @adap: the adapter + * @q: the send queue + * @skb: the packet needing the descriptors + * @ndesc: the number of Tx descriptors needed + * @qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL) + * + * Checks if the requested number of Tx descriptors is available on an + * SGE send queue. If the queue is already suspended or not enough + * descriptors are available the packet is queued for later transmission. + * Must be called with the Tx queue locked. + * + * Returns 0 if enough descriptors are available, 1 if there aren't + * enough descriptors and the packet has been queued, and 2 if the caller + * needs to retry because there weren't enough descriptors at the + * beginning of the call but some freed up in the mean time. + */ +static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q, + struct sk_buff *skb, unsigned int ndesc, + unsigned int qid) +{ + if (unlikely(!skb_queue_empty(&q->sendq))) { + addq_exit:__skb_queue_tail(&q->sendq, skb); + return 1; + } + if (unlikely(q->size - q->in_use < ndesc)) { + struct sge_qset *qs = txq_to_qset(q, qid); + + set_bit(qid, &qs->txq_stopped); + smp_mb__after_atomic(); + + if (should_restart_tx(q) && + test_and_clear_bit(qid, &qs->txq_stopped)) + return 2; + + q->stops++; + goto addq_exit; + } + return 0; +} + +/** + * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs + * @q: the SGE control Tx queue + * + * This is a variant of reclaim_completed_tx() that is used for Tx queues + * that send only immediate data (presently just the control queues) and + * thus do not have any sk_buffs to release. + */ +static inline void reclaim_completed_tx_imm(struct sge_txq *q) +{ + unsigned int reclaim = q->processed - q->cleaned; + + q->in_use -= reclaim; + q->cleaned += reclaim; +} + +static inline int immediate(const struct sk_buff *skb) +{ + return skb->len <= WR_LEN; +} + +/** + * ctrl_xmit - send a packet through an SGE control Tx queue + * @adap: the adapter + * @q: the control queue + * @skb: the packet + * + * Send a packet through an SGE control Tx queue. Packets sent through + * a control queue must fit entirely as immediate data in a single Tx + * descriptor and have no page fragments. 
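+ *
+ * The "fits as immediate data" requirement is the immediate() test
+ * below, i.e. skb->len <= WR_LEN; larger packets are dropped with a
+ * WARN_ON(1) rather than queued.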
+ */ +static int ctrl_xmit(struct adapter *adap, struct sge_txq *q, + struct sk_buff *skb) +{ + int ret; + struct work_request_hdr *wrp = (struct work_request_hdr *)skb->data; + + if (unlikely(!immediate(skb))) { + WARN_ON(1); + dev_kfree_skb(skb); + return NET_XMIT_SUCCESS; + } + + wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP); + wrp->wr_lo = htonl(V_WR_TID(q->token)); + + spin_lock(&q->lock); + again:reclaim_completed_tx_imm(q); + + ret = check_desc_avail(adap, q, skb, 1, TXQ_CTRL); + if (unlikely(ret)) { + if (ret == 1) { + spin_unlock(&q->lock); + return NET_XMIT_CN; + } + goto again; + } + + write_imm(&q->desc[q->pidx], skb, skb->len, q->gen); + + q->in_use++; + if (++q->pidx >= q->size) { + q->pidx = 0; + q->gen ^= 1; + } + spin_unlock(&q->lock); + wmb(); + t3_write_reg(adap, A_SG_KDOORBELL, + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); + return NET_XMIT_SUCCESS; +} + +/** + * restart_ctrlq - restart a suspended control queue + * @qs: the queue set cotaining the control queue + * + * Resumes transmission on a suspended Tx control queue. + */ +static void restart_ctrlq(unsigned long data) +{ + struct sk_buff *skb; + struct sge_qset *qs = (struct sge_qset *)data; + struct sge_txq *q = &qs->txq[TXQ_CTRL]; + + spin_lock(&q->lock); + again:reclaim_completed_tx_imm(q); + + while (q->in_use < q->size && + (skb = __skb_dequeue(&q->sendq)) != NULL) { + + write_imm(&q->desc[q->pidx], skb, skb->len, q->gen); + + if (++q->pidx >= q->size) { + q->pidx = 0; + q->gen ^= 1; + } + q->in_use++; + } + + if (!skb_queue_empty(&q->sendq)) { + set_bit(TXQ_CTRL, &qs->txq_stopped); + smp_mb__after_atomic(); + + if (should_restart_tx(q) && + test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) + goto again; + q->stops++; + } + + spin_unlock(&q->lock); + wmb(); + t3_write_reg(qs->adap, A_SG_KDOORBELL, + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); +} + +/* + * Send a management message through control queue 0 + */ +int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb) +{ + int ret; + local_bh_disable(); + ret = ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], skb); + local_bh_enable(); + + return ret; +} + +/** + * deferred_unmap_destructor - unmap a packet when it is freed + * @skb: the packet + * + * This is the packet destructor used for Tx packets that need to remain + * mapped until they are freed rather than until their Tx descriptors are + * freed. 
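+ *
+ * write_ofld_wr() below arms this destructor after stashing the DMA
+ * addresses at the start of the skb head, roughly:
+ *
+ *     setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits);
+ *     skb->destructor = deferred_unmap_destructor;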
+ */ +static void deferred_unmap_destructor(struct sk_buff *skb) +{ + int i; + const dma_addr_t *p; + const struct skb_shared_info *si; + const struct deferred_unmap_info *dui; + + dui = (struct deferred_unmap_info *)skb->head; + p = dui->addr; + + if (skb_tail_pointer(skb) - skb_transport_header(skb)) + pci_unmap_single(dui->pdev, *p++, skb_tail_pointer(skb) - + skb_transport_header(skb), PCI_DMA_TODEVICE); + + si = skb_shinfo(skb); + for (i = 0; i < si->nr_frags; i++) + pci_unmap_page(dui->pdev, *p++, skb_frag_size(&si->frags[i]), + PCI_DMA_TODEVICE); +} + +static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev, + const struct sg_ent *sgl, int sgl_flits) +{ + dma_addr_t *p; + struct deferred_unmap_info *dui; + + dui = (struct deferred_unmap_info *)skb->head; + dui->pdev = pdev; + for (p = dui->addr; sgl_flits >= 3; sgl++, sgl_flits -= 3) { + *p++ = be64_to_cpu(sgl->addr[0]); + *p++ = be64_to_cpu(sgl->addr[1]); + } + if (sgl_flits) + *p = be64_to_cpu(sgl->addr[0]); +} + +/** + * write_ofld_wr - write an offload work request + * @adap: the adapter + * @skb: the packet to send + * @q: the Tx queue + * @pidx: index of the first Tx descriptor to write + * @gen: the generation value to use + * @ndesc: number of descriptors the packet will occupy + * + * Write an offload work request to send the supplied packet. The packet + * data already carry the work request with most fields populated. + */ +static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb, + struct sge_txq *q, unsigned int pidx, + unsigned int gen, unsigned int ndesc, + const dma_addr_t *addr) +{ + unsigned int sgl_flits, flits; + struct work_request_hdr *from; + struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1]; + struct tx_desc *d = &q->desc[pidx]; + + if (immediate(skb)) { + q->sdesc[pidx].skb = NULL; + write_imm(d, skb, skb->len, gen); + return; + } + + /* Only TX_DATA builds SGLs */ + + from = (struct work_request_hdr *)skb->data; + memcpy(&d->flit[1], &from[1], + skb_transport_offset(skb) - sizeof(*from)); + + flits = skb_transport_offset(skb) / 8; + sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl; + sgl_flits = write_sgl(skb, sgp, skb_transport_header(skb), + skb_tail_pointer(skb) - skb_transport_header(skb), + addr); + if (need_skb_unmap()) { + setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits); + skb->destructor = deferred_unmap_destructor; + } + + write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, + gen, from->wr_hi, from->wr_lo); +} + +/** + * calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet + * @skb: the packet + * + * Returns the number of Tx descriptors needed for the given offload + * packet. These packets are already fully constructed. + */ +static inline unsigned int calc_tx_descs_ofld(const struct sk_buff *skb) +{ + unsigned int flits, cnt; + + if (skb->len <= WR_LEN) + return 1; /* packet fits as immediate data */ + + flits = skb_transport_offset(skb) / 8; /* headers */ + cnt = skb_shinfo(skb)->nr_frags; + if (skb_tail_pointer(skb) != skb_transport_header(skb)) + cnt++; + return flits_to_desc(flits + sgl_len(cnt)); +} + +/** + * ofld_xmit - send a packet through an offload queue + * @adap: the adapter + * @q: the Tx offload queue + * @skb: the packet + * + * Send an offload packet through an SGE offload queue. 
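+ *
+ * When the queue is out of descriptors the packet stays on q->sendq
+ * with its descriptor count cached in skb->priority, which
+ * restart_offloadq() re-reads when transmission resumes:
+ *
+ *     skb->priority = ndesc;          /* saved here */
+ *     ndesc = skb->priority;          /* re-read on restart */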
+ */ +static int ofld_xmit(struct adapter *adap, struct sge_txq *q, + struct sk_buff *skb) +{ + int ret; + unsigned int ndesc = calc_tx_descs_ofld(skb), pidx, gen; + + spin_lock(&q->lock); +again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK); + + ret = check_desc_avail(adap, q, skb, ndesc, TXQ_OFLD); + if (unlikely(ret)) { + if (ret == 1) { + skb->priority = ndesc; /* save for restart */ + spin_unlock(&q->lock); + return NET_XMIT_CN; + } + goto again; + } + + if (!immediate(skb) && + map_skb(adap->pdev, skb, (dma_addr_t *)skb->head)) { + spin_unlock(&q->lock); + return NET_XMIT_SUCCESS; + } + + gen = q->gen; + q->in_use += ndesc; + pidx = q->pidx; + q->pidx += ndesc; + if (q->pidx >= q->size) { + q->pidx -= q->size; + q->gen ^= 1; + } + spin_unlock(&q->lock); + + write_ofld_wr(adap, skb, q, pidx, gen, ndesc, (dma_addr_t *)skb->head); + check_ring_tx_db(adap, q); + return NET_XMIT_SUCCESS; +} + +/** + * restart_offloadq - restart a suspended offload queue + * @qs: the queue set cotaining the offload queue + * + * Resumes transmission on a suspended Tx offload queue. + */ +static void restart_offloadq(unsigned long data) +{ + struct sk_buff *skb; + struct sge_qset *qs = (struct sge_qset *)data; + struct sge_txq *q = &qs->txq[TXQ_OFLD]; + const struct port_info *pi = netdev_priv(qs->netdev); + struct adapter *adap = pi->adapter; + unsigned int written = 0; + + spin_lock(&q->lock); +again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK); + + while ((skb = skb_peek(&q->sendq)) != NULL) { + unsigned int gen, pidx; + unsigned int ndesc = skb->priority; + + if (unlikely(q->size - q->in_use < ndesc)) { + set_bit(TXQ_OFLD, &qs->txq_stopped); + smp_mb__after_atomic(); + + if (should_restart_tx(q) && + test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) + goto again; + q->stops++; + break; + } + + if (!immediate(skb) && + map_skb(adap->pdev, skb, (dma_addr_t *)skb->head)) + break; + + gen = q->gen; + q->in_use += ndesc; + pidx = q->pidx; + q->pidx += ndesc; + written += ndesc; + if (q->pidx >= q->size) { + q->pidx -= q->size; + q->gen ^= 1; + } + __skb_unlink(skb, &q->sendq); + spin_unlock(&q->lock); + + write_ofld_wr(adap, skb, q, pidx, gen, ndesc, + (dma_addr_t *)skb->head); + spin_lock(&q->lock); + } + spin_unlock(&q->lock); + +#if USE_GTS + set_bit(TXQ_RUNNING, &q->flags); + set_bit(TXQ_LAST_PKT_DB, &q->flags); +#endif + wmb(); + if (likely(written)) + t3_write_reg(adap, A_SG_KDOORBELL, + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); +} + +/** + * queue_set - return the queue set a packet should use + * @skb: the packet + * + * Maps a packet to the SGE queue set it should use. The desired queue + * set is carried in bits 1-3 in the packet's priority. + */ +static inline int queue_set(const struct sk_buff *skb) +{ + return skb->priority >> 1; +} + +/** + * is_ctrl_pkt - return whether an offload packet is a control packet + * @skb: the packet + * + * Determines whether an offload packet should use an OFLD or a CTRL + * Tx queue. This is indicated by bit 0 in the packet's priority. + */ +static inline int is_ctrl_pkt(const struct sk_buff *skb) +{ + return skb->priority & 1; +} + +/** + * t3_offload_tx - send an offload packet + * @tdev: the offload device to send to + * @skb: the packet + * + * Sends an offload packet. We use the packet priority to select the + * appropriate Tx queue as follows: bit 0 indicates whether the packet + * should be sent as regular or control, bits 1-3 select the queue set. 
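+ *
+ * For instance (illustrative values), a caller targeting queue set 2
+ * and the regular offload queue would encode the priority as
+ *
+ *     skb->priority = (2 << 1) | 0;   /* queue_set() == 2, !is_ctrl_pkt() */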
+ */ +int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb) +{ + struct adapter *adap = tdev2adap(tdev); + struct sge_qset *qs = &adap->sge.qs[queue_set(skb)]; + + if (unlikely(is_ctrl_pkt(skb))) + return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], skb); + + return ofld_xmit(adap, &qs->txq[TXQ_OFLD], skb); +} + +/** + * offload_enqueue - add an offload packet to an SGE offload receive queue + * @q: the SGE response queue + * @skb: the packet + * + * Add a new offload packet to an SGE response queue's offload packet + * queue. If the packet is the first on the queue it schedules the RX + * softirq to process the queue. + */ +static inline void offload_enqueue(struct sge_rspq *q, struct sk_buff *skb) +{ + int was_empty = skb_queue_empty(&q->rx_queue); + + __skb_queue_tail(&q->rx_queue, skb); + + if (was_empty) { + struct sge_qset *qs = rspq_to_qset(q); + + napi_schedule(&qs->napi); + } +} + +/** + * deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts + * @tdev: the offload device that will be receiving the packets + * @q: the SGE response queue that assembled the bundle + * @skbs: the partial bundle + * @n: the number of packets in the bundle + * + * Delivers a (partial) bundle of Rx offload packets to an offload device. + */ +static inline void deliver_partial_bundle(struct t3cdev *tdev, + struct sge_rspq *q, + struct sk_buff *skbs[], int n) +{ + if (n) { + q->offload_bundles++; + tdev->recv(tdev, skbs, n); + } +} + +/** + * ofld_poll - NAPI handler for offload packets in interrupt mode + * @dev: the network device doing the polling + * @budget: polling budget + * + * The NAPI handler for offload packets when a response queue is serviced + * by the hard interrupt handler, i.e., when it's operating in non-polling + * mode. Creates small packet batches and sends them through the offload + * receive handler. Batches need to be of modest size as we do prefetches + * on the packets in each. 
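+ *
+ * Each bundle holds at most RX_BUNDLE_SIZE packets; whatever has been
+ * gathered when the budget runs out is flushed through
+ * deliver_partial_bundle().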
+ */ +static int ofld_poll(struct napi_struct *napi, int budget) +{ + struct sge_qset *qs = container_of(napi, struct sge_qset, napi); + struct sge_rspq *q = &qs->rspq; + struct adapter *adapter = qs->adap; + int work_done = 0; + + while (work_done < budget) { + struct sk_buff *skb, *tmp, *skbs[RX_BUNDLE_SIZE]; + struct sk_buff_head queue; + int ngathered; + + spin_lock_irq(&q->lock); + __skb_queue_head_init(&queue); + skb_queue_splice_init(&q->rx_queue, &queue); + if (skb_queue_empty(&queue)) { + napi_complete_done(napi, work_done); + spin_unlock_irq(&q->lock); + return work_done; + } + spin_unlock_irq(&q->lock); + + ngathered = 0; + skb_queue_walk_safe(&queue, skb, tmp) { + if (work_done >= budget) + break; + work_done++; + + __skb_unlink(skb, &queue); + prefetch(skb->data); + skbs[ngathered] = skb; + if (++ngathered == RX_BUNDLE_SIZE) { + q->offload_bundles++; + adapter->tdev.recv(&adapter->tdev, skbs, + ngathered); + ngathered = 0; + } + } + if (!skb_queue_empty(&queue)) { + /* splice remaining packets back onto Rx queue */ + spin_lock_irq(&q->lock); + skb_queue_splice(&queue, &q->rx_queue); + spin_unlock_irq(&q->lock); + } + deliver_partial_bundle(&adapter->tdev, q, skbs, ngathered); + } + + return work_done; +} + +/** + * rx_offload - process a received offload packet + * @tdev: the offload device receiving the packet + * @rq: the response queue that received the packet + * @skb: the packet + * @rx_gather: a gather list of packets if we are building a bundle + * @gather_idx: index of the next available slot in the bundle + * + * Process an ingress offload pakcet and add it to the offload ingress + * queue. Returns the index of the next available slot in the bundle. + */ +static inline int rx_offload(struct t3cdev *tdev, struct sge_rspq *rq, + struct sk_buff *skb, struct sk_buff *rx_gather[], + unsigned int gather_idx) +{ + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + if (rq->polling) { + rx_gather[gather_idx++] = skb; + if (gather_idx == RX_BUNDLE_SIZE) { + tdev->recv(tdev, rx_gather, RX_BUNDLE_SIZE); + gather_idx = 0; + rq->offload_bundles++; + } + } else + offload_enqueue(rq, skb); + + return gather_idx; +} + +/** + * restart_tx - check whether to restart suspended Tx queues + * @qs: the queue set to resume + * + * Restarts suspended Tx queues of an SGE queue set if they have enough + * free resources to resume operation. + */ +static void restart_tx(struct sge_qset *qs) +{ + if (test_bit(TXQ_ETH, &qs->txq_stopped) && + should_restart_tx(&qs->txq[TXQ_ETH]) && + test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) { + qs->txq[TXQ_ETH].restarts++; + if (netif_running(qs->netdev)) + netif_tx_wake_queue(qs->tx_q); + } + + if (test_bit(TXQ_OFLD, &qs->txq_stopped) && + should_restart_tx(&qs->txq[TXQ_OFLD]) && + test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) { + qs->txq[TXQ_OFLD].restarts++; + tasklet_schedule(&qs->txq[TXQ_OFLD].qresume_tsk); + } + if (test_bit(TXQ_CTRL, &qs->txq_stopped) && + should_restart_tx(&qs->txq[TXQ_CTRL]) && + test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) { + qs->txq[TXQ_CTRL].restarts++; + tasklet_schedule(&qs->txq[TXQ_CTRL].qresume_tsk); + } +} + +/** + * cxgb3_arp_process - process an ARP request probing a private IP address + * @adapter: the adapter + * @skb: the skbuff containing the ARP request + * + * Check if the ARP request is probing the private IP address + * dedicated to iSCSI, generate an ARP reply if so. 
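+ *
+ * The reply is sent with arp_send(ARPOP_REPLY, ...), using the port's
+ * iSCSI MAC address (pi->iscsic.mac_addr) as the source hardware
+ * address.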
+ */ +static void cxgb3_arp_process(struct port_info *pi, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct arphdr *arp; + unsigned char *arp_ptr; + unsigned char *sha; + __be32 sip, tip; + + if (!dev) + return; + + skb_reset_network_header(skb); + arp = arp_hdr(skb); + + if (arp->ar_op != htons(ARPOP_REQUEST)) + return; + + arp_ptr = (unsigned char *)(arp + 1); + sha = arp_ptr; + arp_ptr += dev->addr_len; + memcpy(&sip, arp_ptr, sizeof(sip)); + arp_ptr += sizeof(sip); + arp_ptr += dev->addr_len; + memcpy(&tip, arp_ptr, sizeof(tip)); + + if (tip != pi->iscsi_ipv4addr) + return; + + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, + pi->iscsic.mac_addr, sha); + +} + +static inline int is_arp(struct sk_buff *skb) +{ + return skb->protocol == htons(ETH_P_ARP); +} + +static void cxgb3_process_iscsi_prov_pack(struct port_info *pi, + struct sk_buff *skb) +{ + if (is_arp(skb)) { + cxgb3_arp_process(pi, skb); + return; + } + + if (pi->iscsic.recv) + pi->iscsic.recv(pi, skb); + +} + +/** + * rx_eth - process an ingress ethernet packet + * @adap: the adapter + * @rq: the response queue that received the packet + * @skb: the packet + * @pad: amount of padding at the start of the buffer + * + * Process an ingress ethernet pakcet and deliver it to the stack. + * The padding is 2 if the packet was delivered in an Rx buffer and 0 + * if it was immediate data in a response. + */ +static void rx_eth(struct adapter *adap, struct sge_rspq *rq, + struct sk_buff *skb, int pad, int lro) +{ + struct cpl_rx_pkt *p = (struct cpl_rx_pkt *)(skb->data + pad); + struct sge_qset *qs = rspq_to_qset(rq); + struct port_info *pi; + + skb_pull(skb, sizeof(*p) + pad); + skb->protocol = eth_type_trans(skb, adap->port[p->iff]); + pi = netdev_priv(skb->dev); + if ((skb->dev->features & NETIF_F_RXCSUM) && p->csum_valid && + p->csum == htons(0xffff) && !p->fragment) { + qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else + skb_checksum_none_assert(skb); + skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]); + + if (p->vlan_valid) { + qs->port_stats[SGE_PSTAT_VLANEX]++; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(p->vlan)); + } + if (rq->polling) { + if (lro) + napi_gro_receive(&qs->napi, skb); + else { + if (unlikely(pi->iscsic.flags)) + cxgb3_process_iscsi_prov_pack(pi, skb); + netif_receive_skb(skb); + } + } else + netif_rx(skb); +} + +static inline int is_eth_tcp(u32 rss) +{ + return G_HASHTYPE(ntohl(rss)) == RSS_HASH_4_TUPLE; +} + +/** + * lro_add_page - add a page chunk to an LRO session + * @adap: the adapter + * @qs: the associated queue set + * @fl: the free list containing the page chunk to add + * @len: packet length + * @complete: Indicates the last fragment of a frame + * + * Add a received packet contained in a page chunk to an existing LRO + * session. 
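+ *
+ * The sk_buff used here is the per-NAPI frag skb obtained from
+ * napi_get_frags(&qs->napi); it is only handed up the stack, via
+ * napi_gro_frags(), once the fragment flagged @complete arrives.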
+ */ +static void lro_add_page(struct adapter *adap, struct sge_qset *qs, + struct sge_fl *fl, int len, int complete) +{ + struct rx_sw_desc *sd = &fl->sdesc[fl->cidx]; + struct port_info *pi = netdev_priv(qs->netdev); + struct sk_buff *skb = NULL; + struct cpl_rx_pkt *cpl; + struct skb_frag_struct *rx_frag; + int nr_frags; + int offset = 0; + + if (!qs->nomem) { + skb = napi_get_frags(&qs->napi); + qs->nomem = !skb; + } + + fl->credits--; + + pci_dma_sync_single_for_cpu(adap->pdev, + dma_unmap_addr(sd, dma_addr), + fl->buf_size - SGE_PG_RSVD, + PCI_DMA_FROMDEVICE); + + (*sd->pg_chunk.p_cnt)--; + if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page) + pci_unmap_page(adap->pdev, + sd->pg_chunk.mapping, + fl->alloc_size, + PCI_DMA_FROMDEVICE); + + if (!skb) { + put_page(sd->pg_chunk.page); + if (complete) + qs->nomem = 0; + return; + } + + rx_frag = skb_shinfo(skb)->frags; + nr_frags = skb_shinfo(skb)->nr_frags; + + if (!nr_frags) { + offset = 2 + sizeof(struct cpl_rx_pkt); + cpl = qs->lro_va = sd->pg_chunk.va + 2; + + if ((qs->netdev->features & NETIF_F_RXCSUM) && + cpl->csum_valid && cpl->csum == htons(0xffff)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++; + } else + skb->ip_summed = CHECKSUM_NONE; + } else + cpl = qs->lro_va; + + len -= offset; + + rx_frag += nr_frags; + __skb_frag_set_page(rx_frag, sd->pg_chunk.page); + rx_frag->page_offset = sd->pg_chunk.offset + offset; + skb_frag_size_set(rx_frag, len); + + skb->len += len; + skb->data_len += len; + skb->truesize += len; + skb_shinfo(skb)->nr_frags++; + + if (!complete) + return; + + skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]); + + if (cpl->vlan_valid) { + qs->port_stats[SGE_PSTAT_VLANEX]++; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(cpl->vlan)); + } + napi_gro_frags(&qs->napi); +} + +/** + * handle_rsp_cntrl_info - handles control information in a response + * @qs: the queue set corresponding to the response + * @flags: the response control flags + * + * Handles the control information of an SGE response, such as GTS + * indications and completion credits for the queue set's Tx queues. + * HW coalesces credits, we don't do any extra SW coalescing. + */ +static inline void handle_rsp_cntrl_info(struct sge_qset *qs, u32 flags) +{ + unsigned int credits; + +#if USE_GTS + if (flags & F_RSPD_TXQ0_GTS) + clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags); +#endif + + credits = G_RSPD_TXQ0_CR(flags); + if (credits) + qs->txq[TXQ_ETH].processed += credits; + + credits = G_RSPD_TXQ2_CR(flags); + if (credits) + qs->txq[TXQ_CTRL].processed += credits; + +# if USE_GTS + if (flags & F_RSPD_TXQ1_GTS) + clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags); +# endif + credits = G_RSPD_TXQ1_CR(flags); + if (credits) + qs->txq[TXQ_OFLD].processed += credits; +} + +/** + * check_ring_db - check if we need to ring any doorbells + * @adapter: the adapter + * @qs: the queue set whose Tx queues are to be examined + * @sleeping: indicates which Tx queue sent GTS + * + * Checks if some of a queue set's Tx queues need to ring their doorbells + * to resume transmission after idling while they still have unprocessed + * descriptors. 
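+ *
+ * The txq->cleaned + txq->in_use != txq->processed test below detects
+ * this case: descriptors have been posted that the hardware has not
+ * yet completed, so the queue is kicked with another doorbell write.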
+ */ +static void check_ring_db(struct adapter *adap, struct sge_qset *qs, + unsigned int sleeping) +{ + if (sleeping & F_RSPD_TXQ0_GTS) { + struct sge_txq *txq = &qs->txq[TXQ_ETH]; + + if (txq->cleaned + txq->in_use != txq->processed && + !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) { + set_bit(TXQ_RUNNING, &txq->flags); + t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | + V_EGRCNTX(txq->cntxt_id)); + } + } + + if (sleeping & F_RSPD_TXQ1_GTS) { + struct sge_txq *txq = &qs->txq[TXQ_OFLD]; + + if (txq->cleaned + txq->in_use != txq->processed && + !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) { + set_bit(TXQ_RUNNING, &txq->flags); + t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | + V_EGRCNTX(txq->cntxt_id)); + } + } +} + +/** + * is_new_response - check if a response is newly written + * @r: the response descriptor + * @q: the response queue + * + * Returns true if a response descriptor contains a yet unprocessed + * response. + */ +static inline int is_new_response(const struct rsp_desc *r, + const struct sge_rspq *q) +{ + return (r->intr_gen & F_RSPD_GEN2) == q->gen; +} + +static inline void clear_rspq_bufstate(struct sge_rspq * const q) +{ + q->pg_skb = NULL; + q->rx_recycle_buf = 0; +} + +#define RSPD_GTS_MASK (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS) +#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \ + V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \ + V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \ + V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR)) + +/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */ +#define NOMEM_INTR_DELAY 2500 + +/** + * process_responses - process responses from an SGE response queue + * @adap: the adapter + * @qs: the queue set to which the response queue belongs + * @budget: how many responses can be processed in this round + * + * Process responses from an SGE response queue up to the supplied budget. + * Responses include received packets as well as credits and other events + * for the queues that belong to the response queue's queue set. + * A negative budget is effectively unlimited. + * + * Additionally choose the interrupt holdoff time for the next interrupt + * on this queue. If the system is under memory shortage use a fairly + * long delay to help recovery. 
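+ *
+ * The return value is the number of responses consumed (budget minus
+ * what was left).  napi_rx_handler() below drives this function and
+ * completes NAPI when the budget was not exhausted:
+ *
+ *     work_done = process_responses(adap, qs, budget);
+ *     if (work_done < budget)
+ *             napi_complete_done(napi, work_done);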
+ */ +static int process_responses(struct adapter *adap, struct sge_qset *qs, + int budget) +{ + struct sge_rspq *q = &qs->rspq; + struct rsp_desc *r = &q->desc[q->cidx]; + int budget_left = budget; + unsigned int sleeping = 0; + struct sk_buff *offload_skbs[RX_BUNDLE_SIZE]; + int ngathered = 0; + + q->next_holdoff = q->holdoff_tmr; + + while (likely(budget_left && is_new_response(r, q))) { + int packet_complete, eth, ethpad = 2; + int lro = !!(qs->netdev->features & NETIF_F_GRO); + struct sk_buff *skb = NULL; + u32 len, flags; + __be32 rss_hi, rss_lo; + + dma_rmb(); + eth = r->rss_hdr.opcode == CPL_RX_PKT; + rss_hi = *(const __be32 *)r; + rss_lo = r->rss_hdr.rss_hash_val; + flags = ntohl(r->flags); + + if (unlikely(flags & F_RSPD_ASYNC_NOTIF)) { + skb = alloc_skb(AN_PKT_SIZE, GFP_ATOMIC); + if (!skb) + goto no_mem; + + __skb_put_data(skb, r, AN_PKT_SIZE); + skb->data[0] = CPL_ASYNC_NOTIF; + rss_hi = htonl(CPL_ASYNC_NOTIF << 24); + q->async_notif++; + } else if (flags & F_RSPD_IMM_DATA_VALID) { + skb = get_imm_packet(r); + if (unlikely(!skb)) { +no_mem: + q->next_holdoff = NOMEM_INTR_DELAY; + q->nomem++; + /* consume one credit since we tried */ + budget_left--; + break; + } + q->imm_data++; + ethpad = 0; + } else if ((len = ntohl(r->len_cq)) != 0) { + struct sge_fl *fl; + + lro &= eth && is_eth_tcp(rss_hi); + + fl = (len & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0]; + if (fl->use_pages) { + void *addr = fl->sdesc[fl->cidx].pg_chunk.va; + + prefetch(addr); +#if L1_CACHE_BYTES < 128 + prefetch(addr + L1_CACHE_BYTES); +#endif + __refill_fl(adap, fl); + if (lro > 0) { + lro_add_page(adap, qs, fl, + G_RSPD_LEN(len), + flags & F_RSPD_EOP); + goto next_fl; + } + + skb = get_packet_pg(adap, fl, q, + G_RSPD_LEN(len), + eth ? + SGE_RX_DROP_THRES : 0); + q->pg_skb = skb; + } else + skb = get_packet(adap, fl, G_RSPD_LEN(len), + eth ? 
SGE_RX_DROP_THRES : 0); + if (unlikely(!skb)) { + if (!eth) + goto no_mem; + q->rx_drops++; + } else if (unlikely(r->rss_hdr.opcode == CPL_TRACE_PKT)) + __skb_pull(skb, 2); +next_fl: + if (++fl->cidx == fl->size) + fl->cidx = 0; + } else + q->pure_rsps++; + + if (flags & RSPD_CTRL_MASK) { + sleeping |= flags & RSPD_GTS_MASK; + handle_rsp_cntrl_info(qs, flags); + } + + r++; + if (unlikely(++q->cidx == q->size)) { + q->cidx = 0; + q->gen ^= 1; + r = q->desc; + } + prefetch(r); + + if (++q->credits >= (q->size / 4)) { + refill_rspq(adap, q, q->credits); + q->credits = 0; + } + + packet_complete = flags & + (F_RSPD_EOP | F_RSPD_IMM_DATA_VALID | + F_RSPD_ASYNC_NOTIF); + + if (skb != NULL && packet_complete) { + if (eth) + rx_eth(adap, q, skb, ethpad, lro); + else { + q->offload_pkts++; + /* Preserve the RSS info in csum & priority */ + skb->csum = rss_hi; + skb->priority = rss_lo; + ngathered = rx_offload(&adap->tdev, q, skb, + offload_skbs, + ngathered); + } + + if (flags & F_RSPD_EOP) + clear_rspq_bufstate(q); + } + --budget_left; + } + + deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered); + + if (sleeping) + check_ring_db(adap, qs, sleeping); + + smp_mb(); /* commit Tx queue .processed updates */ + if (unlikely(qs->txq_stopped != 0)) + restart_tx(qs); + + budget -= budget_left; + return budget; +} + +static inline int is_pure_response(const struct rsp_desc *r) +{ + __be32 n = r->flags & htonl(F_RSPD_ASYNC_NOTIF | F_RSPD_IMM_DATA_VALID); + + return (n | r->len_cq) == 0; +} + +/** + * napi_rx_handler - the NAPI handler for Rx processing + * @napi: the napi instance + * @budget: how many packets we can process in this round + * + * Handler for new data events when using NAPI. + */ +static int napi_rx_handler(struct napi_struct *napi, int budget) +{ + struct sge_qset *qs = container_of(napi, struct sge_qset, napi); + struct adapter *adap = qs->adap; + int work_done = process_responses(adap, qs, budget); + + if (likely(work_done < budget)) { + napi_complete_done(napi, work_done); + + /* + * Because we don't atomically flush the following + * write it is possible that in very rare cases it can + * reach the device in a way that races with a new + * response being written plus an error interrupt + * causing the NAPI interrupt handler below to return + * unhandled status to the OS. To protect against + * this would require flushing the write and doing + * both the write and the flush with interrupts off. + * Way too expensive and unjustifiable given the + * rarity of the race. + * + * The race cannot happen at all with MSI-X. + */ + t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) | + V_NEWTIMER(qs->rspq.next_holdoff) | + V_NEWINDEX(qs->rspq.cidx)); + } + return work_done; +} + +/* + * Returns true if the device is already scheduled for polling. + */ +static inline int napi_is_scheduled(struct napi_struct *napi) +{ + return test_bit(NAPI_STATE_SCHED, &napi->state); +} + +/** + * process_pure_responses - process pure responses from a response queue + * @adap: the adapter + * @qs: the queue set owning the response queue + * @r: the first pure response to process + * + * A simpler version of process_responses() that handles only pure (i.e., + * non data-carrying) responses. Such respones are too light-weight to + * justify calling a softirq under NAPI, so we handle them specially in + * the interrupt handler. The function is called with a pointer to a + * response, which the caller must ensure is a valid pure response. 
+ * + * Returns 1 if it encounters a valid data-carrying response, 0 otherwise. + */ +static int process_pure_responses(struct adapter *adap, struct sge_qset *qs, + struct rsp_desc *r) +{ + struct sge_rspq *q = &qs->rspq; + unsigned int sleeping = 0; + + do { + u32 flags = ntohl(r->flags); + + r++; + if (unlikely(++q->cidx == q->size)) { + q->cidx = 0; + q->gen ^= 1; + r = q->desc; + } + prefetch(r); + + if (flags & RSPD_CTRL_MASK) { + sleeping |= flags & RSPD_GTS_MASK; + handle_rsp_cntrl_info(qs, flags); + } + + q->pure_rsps++; + if (++q->credits >= (q->size / 4)) { + refill_rspq(adap, q, q->credits); + q->credits = 0; + } + if (!is_new_response(r, q)) + break; + dma_rmb(); + } while (is_pure_response(r)); + + if (sleeping) + check_ring_db(adap, qs, sleeping); + + smp_mb(); /* commit Tx queue .processed updates */ + if (unlikely(qs->txq_stopped != 0)) + restart_tx(qs); + + return is_new_response(r, q); +} + +/** + * handle_responses - decide what to do with new responses in NAPI mode + * @adap: the adapter + * @q: the response queue + * + * This is used by the NAPI interrupt handlers to decide what to do with + * new SGE responses. If there are no new responses it returns -1. If + * there are new responses and they are pure (i.e., non-data carrying) + * it handles them straight in hard interrupt context as they are very + * cheap and don't deliver any packets. Finally, if there are any data + * signaling responses it schedules the NAPI handler. Returns 1 if it + * schedules NAPI, 0 if all new responses were pure. + * + * The caller must ascertain NAPI is not already running. + */ +static inline int handle_responses(struct adapter *adap, struct sge_rspq *q) +{ + struct sge_qset *qs = rspq_to_qset(q); + struct rsp_desc *r = &q->desc[q->cidx]; + + if (!is_new_response(r, q)) + return -1; + dma_rmb(); + if (is_pure_response(r) && process_pure_responses(adap, qs, r) == 0) { + t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) | + V_NEWTIMER(q->holdoff_tmr) | V_NEWINDEX(q->cidx)); + return 0; + } + napi_schedule(&qs->napi); + return 1; +} + +/* + * The MSI-X interrupt handler for an SGE response queue for the non-NAPI case + * (i.e., response queue serviced in hard interrupt). + */ +static irqreturn_t t3_sge_intr_msix(int irq, void *cookie) +{ + struct sge_qset *qs = cookie; + struct adapter *adap = qs->adap; + struct sge_rspq *q = &qs->rspq; + + spin_lock(&q->lock); + if (process_responses(adap, qs, -1) == 0) + q->unhandled_irqs++; + t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) | + V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx)); + spin_unlock(&q->lock); + return IRQ_HANDLED; +} + +/* + * The MSI-X interrupt handler for an SGE response queue for the NAPI case + * (i.e., response queue serviced by NAPI polling). + */ +static irqreturn_t t3_sge_intr_msix_napi(int irq, void *cookie) +{ + struct sge_qset *qs = cookie; + struct sge_rspq *q = &qs->rspq; + + spin_lock(&q->lock); + + if (handle_responses(qs->adap, q) < 0) + q->unhandled_irqs++; + spin_unlock(&q->lock); + return IRQ_HANDLED; +} + +/* + * The non-NAPI MSI interrupt handler. This needs to handle data events from + * SGE response queues as well as error and other async events as they all use + * the same MSI vector. We use one SGE response queue per port in this mode + * and protect all response queues with queue 0's lock. 
+ */ +static irqreturn_t t3_intr_msi(int irq, void *cookie) +{ + int new_packets = 0; + struct adapter *adap = cookie; + struct sge_rspq *q = &adap->sge.qs[0].rspq; + + spin_lock(&q->lock); + + if (process_responses(adap, &adap->sge.qs[0], -1)) { + t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) | + V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx)); + new_packets = 1; + } + + if (adap->params.nports == 2 && + process_responses(adap, &adap->sge.qs[1], -1)) { + struct sge_rspq *q1 = &adap->sge.qs[1].rspq; + + t3_write_reg(adap, A_SG_GTS, V_RSPQ(q1->cntxt_id) | + V_NEWTIMER(q1->next_holdoff) | + V_NEWINDEX(q1->cidx)); + new_packets = 1; + } + + if (!new_packets && t3_slow_intr_handler(adap) == 0) + q->unhandled_irqs++; + + spin_unlock(&q->lock); + return IRQ_HANDLED; +} + +static int rspq_check_napi(struct sge_qset *qs) +{ + struct sge_rspq *q = &qs->rspq; + + if (!napi_is_scheduled(&qs->napi) && + is_new_response(&q->desc[q->cidx], q)) { + napi_schedule(&qs->napi); + return 1; + } + return 0; +} + +/* + * The MSI interrupt handler for the NAPI case (i.e., response queues serviced + * by NAPI polling). Handles data events from SGE response queues as well as + * error and other async events as they all use the same MSI vector. We use + * one SGE response queue per port in this mode and protect all response + * queues with queue 0's lock. + */ +static irqreturn_t t3_intr_msi_napi(int irq, void *cookie) +{ + int new_packets; + struct adapter *adap = cookie; + struct sge_rspq *q = &adap->sge.qs[0].rspq; + + spin_lock(&q->lock); + + new_packets = rspq_check_napi(&adap->sge.qs[0]); + if (adap->params.nports == 2) + new_packets += rspq_check_napi(&adap->sge.qs[1]); + if (!new_packets && t3_slow_intr_handler(adap) == 0) + q->unhandled_irqs++; + + spin_unlock(&q->lock); + return IRQ_HANDLED; +} + +/* + * A helper function that processes responses and issues GTS. + */ +static inline int process_responses_gts(struct adapter *adap, + struct sge_rspq *rq) +{ + int work; + + work = process_responses(adap, rspq_to_qset(rq), -1); + t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) | + V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx)); + return work; +} + +/* + * The legacy INTx interrupt handler. This needs to handle data events from + * SGE response queues as well as error and other async events as they all use + * the same interrupt pin. We use one SGE response queue per port in this mode + * and protect all response queues with queue 0's lock. + */ +static irqreturn_t t3_intr(int irq, void *cookie) +{ + int work_done, w0, w1; + struct adapter *adap = cookie; + struct sge_rspq *q0 = &adap->sge.qs[0].rspq; + struct sge_rspq *q1 = &adap->sge.qs[1].rspq; + + spin_lock(&q0->lock); + + w0 = is_new_response(&q0->desc[q0->cidx], q0); + w1 = adap->params.nports == 2 && + is_new_response(&q1->desc[q1->cidx], q1); + + if (likely(w0 | w1)) { + t3_write_reg(adap, A_PL_CLI, 0); + t3_read_reg(adap, A_PL_CLI); /* flush */ + + if (likely(w0)) + process_responses_gts(adap, q0); + + if (w1) + process_responses_gts(adap, q1); + + work_done = w0 | w1; + } else + work_done = t3_slow_intr_handler(adap); + + spin_unlock(&q0->lock); + return IRQ_RETVAL(work_done != 0); +} + +/* + * Interrupt handler for legacy INTx interrupts for T3B-based cards. + * Handles data events from SGE response queues as well as error and other + * async events as they all use the same interrupt pin. We use one SGE + * response queue per port in this mode and protect all response queues with + * queue 0's lock. 
+ */ +static irqreturn_t t3b_intr(int irq, void *cookie) +{ + u32 map; + struct adapter *adap = cookie; + struct sge_rspq *q0 = &adap->sge.qs[0].rspq; + + t3_write_reg(adap, A_PL_CLI, 0); + map = t3_read_reg(adap, A_SG_DATA_INTR); + + if (unlikely(!map)) /* shared interrupt, most likely */ + return IRQ_NONE; + + spin_lock(&q0->lock); + + if (unlikely(map & F_ERRINTR)) + t3_slow_intr_handler(adap); + + if (likely(map & 1)) + process_responses_gts(adap, q0); + + if (map & 2) + process_responses_gts(adap, &adap->sge.qs[1].rspq); + + spin_unlock(&q0->lock); + return IRQ_HANDLED; +} + +/* + * NAPI interrupt handler for legacy INTx interrupts for T3B-based cards. + * Handles data events from SGE response queues as well as error and other + * async events as they all use the same interrupt pin. We use one SGE + * response queue per port in this mode and protect all response queues with + * queue 0's lock. + */ +static irqreturn_t t3b_intr_napi(int irq, void *cookie) +{ + u32 map; + struct adapter *adap = cookie; + struct sge_qset *qs0 = &adap->sge.qs[0]; + struct sge_rspq *q0 = &qs0->rspq; + + t3_write_reg(adap, A_PL_CLI, 0); + map = t3_read_reg(adap, A_SG_DATA_INTR); + + if (unlikely(!map)) /* shared interrupt, most likely */ + return IRQ_NONE; + + spin_lock(&q0->lock); + + if (unlikely(map & F_ERRINTR)) + t3_slow_intr_handler(adap); + + if (likely(map & 1)) + napi_schedule(&qs0->napi); + + if (map & 2) + napi_schedule(&adap->sge.qs[1].napi); + + spin_unlock(&q0->lock); + return IRQ_HANDLED; +} + +/** + * t3_intr_handler - select the top-level interrupt handler + * @adap: the adapter + * @polling: whether using NAPI to service response queues + * + * Selects the top-level interrupt handler based on the type of interrupts + * (MSI-X, MSI, or legacy) and whether NAPI will be used to service the + * response queues. + */ +irq_handler_t t3_intr_handler(struct adapter *adap, int polling) +{ + if (adap->flags & USING_MSIX) + return polling ? t3_sge_intr_msix_napi : t3_sge_intr_msix; + if (adap->flags & USING_MSI) + return polling ? t3_intr_msi_napi : t3_intr_msi; + if (adap->params.rev > 0) + return polling ? t3b_intr_napi : t3b_intr; + return t3_intr; +} + +#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \ + F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \ + V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \ + F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \ + F_HIRCQPARITYERROR) +#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR) +#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \ + F_RSPQDISABLED) + +/** + * t3_sge_err_intr_handler - SGE async event interrupt handler + * @adapter: the adapter + * + * Interrupt handler for SGE asynchronous (non-data) events. 
+ */ +void t3_sge_err_intr_handler(struct adapter *adapter) +{ + unsigned int v, status = t3_read_reg(adapter, A_SG_INT_CAUSE) & + ~F_FLEMPTY; + + if (status & SGE_PARERR) + CH_ALERT(adapter, "SGE parity error (0x%x)\n", + status & SGE_PARERR); + if (status & SGE_FRAMINGERR) + CH_ALERT(adapter, "SGE framing error (0x%x)\n", + status & SGE_FRAMINGERR); + + if (status & F_RSPQCREDITOVERFOW) + CH_ALERT(adapter, "SGE response queue credit overflow\n"); + + if (status & F_RSPQDISABLED) { + v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS); + + CH_ALERT(adapter, + "packet delivered to disabled response queue " + "(0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff); + } + + if (status & (F_HIPIODRBDROPERR | F_LOPIODRBDROPERR)) + queue_work(cxgb3_wq, &adapter->db_drop_task); + + if (status & (F_HIPRIORITYDBFULL | F_LOPRIORITYDBFULL)) + queue_work(cxgb3_wq, &adapter->db_full_task); + + if (status & (F_HIPRIORITYDBEMPTY | F_LOPRIORITYDBEMPTY)) + queue_work(cxgb3_wq, &adapter->db_empty_task); + + t3_write_reg(adapter, A_SG_INT_CAUSE, status); + if (status & SGE_FATALERR) + t3_fatal_err(adapter); +} + +/** + * sge_timer_tx - perform periodic maintenance of an SGE qset + * @data: the SGE queue set to maintain + * + * Runs periodically from a timer to perform maintenance of an SGE queue + * set. It performs two tasks: + * + * Cleans up any completed Tx descriptors that may still be pending. + * Normal descriptor cleanup happens when new packets are added to a Tx + * queue so this timer is relatively infrequent and does any cleanup only + * if the Tx queue has not seen any new packets in a while. We make a + * best effort attempt to reclaim descriptors, in that we don't wait + * around if we cannot get a queue's lock (which most likely is because + * someone else is queueing new packets and so will also handle the clean + * up). Since control queues use immediate data exclusively we don't + * bother cleaning them up here. + * + */ +static void sge_timer_tx(struct timer_list *t) +{ + struct sge_qset *qs = from_timer(qs, t, tx_reclaim_timer); + struct port_info *pi = netdev_priv(qs->netdev); + struct adapter *adap = pi->adapter; + unsigned int tbd[SGE_TXQ_PER_SET] = {0, 0}; + unsigned long next_period; + + if (__netif_tx_trylock(qs->tx_q)) { + tbd[TXQ_ETH] = reclaim_completed_tx(adap, &qs->txq[TXQ_ETH], + TX_RECLAIM_TIMER_CHUNK); + __netif_tx_unlock(qs->tx_q); + } + + if (spin_trylock(&qs->txq[TXQ_OFLD].lock)) { + tbd[TXQ_OFLD] = reclaim_completed_tx(adap, &qs->txq[TXQ_OFLD], + TX_RECLAIM_TIMER_CHUNK); + spin_unlock(&qs->txq[TXQ_OFLD].lock); + } + + next_period = TX_RECLAIM_PERIOD >> + (max(tbd[TXQ_ETH], tbd[TXQ_OFLD]) / + TX_RECLAIM_TIMER_CHUNK); + mod_timer(&qs->tx_reclaim_timer, jiffies + next_period); +} + +/** + * sge_timer_rx - perform periodic maintenance of an SGE qset + * @data: the SGE queue set to maintain + * + * a) Replenishes Rx queues that have run out due to memory shortage. + * Normally new Rx buffers are added when existing ones are consumed but + * when out of memory a queue can become empty. We try to add only a few + * buffers here, the queue will be replenished fully as these new buffers + * are used up if memory shortage has subsided. + * + * b) Return coalesced response queue credits in case a response queue is + * starved. + * + */ +static void sge_timer_rx(struct timer_list *t) +{ + spinlock_t *lock; + struct sge_qset *qs = from_timer(qs, t, rx_reclaim_timer); + struct port_info *pi = netdev_priv(qs->netdev); + struct adapter *adap = pi->adapter; + u32 status; + + lock = adap->params.rev > 0 ? 
+ &qs->rspq.lock : &adap->sge.qs[0].rspq.lock; + + if (!spin_trylock_irq(lock)) + goto out; + + if (napi_is_scheduled(&qs->napi)) + goto unlock; + + if (adap->params.rev < 4) { + status = t3_read_reg(adap, A_SG_RSPQ_FL_STATUS); + + if (status & (1 << qs->rspq.cntxt_id)) { + qs->rspq.starved++; + if (qs->rspq.credits) { + qs->rspq.credits--; + refill_rspq(adap, &qs->rspq, 1); + qs->rspq.restarted++; + t3_write_reg(adap, A_SG_RSPQ_FL_STATUS, + 1 << qs->rspq.cntxt_id); + } + } + } + + if (qs->fl[0].credits < qs->fl[0].size) + __refill_fl(adap, &qs->fl[0]); + if (qs->fl[1].credits < qs->fl[1].size) + __refill_fl(adap, &qs->fl[1]); + +unlock: + spin_unlock_irq(lock); +out: + mod_timer(&qs->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD); +} + +/** + * t3_update_qset_coalesce - update coalescing settings for a queue set + * @qs: the SGE queue set + * @p: new queue set parameters + * + * Update the coalescing settings for an SGE queue set. Nothing is done + * if the queue set is not initialized yet. + */ +void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p) +{ + qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);/* can't be 0 */ + qs->rspq.polling = p->polling; + qs->napi.poll = p->polling ? napi_rx_handler : ofld_poll; +} + +/** + * t3_sge_alloc_qset - initialize an SGE queue set + * @adapter: the adapter + * @id: the queue set id + * @nports: how many Ethernet ports will be using this queue set + * @irq_vec_idx: the IRQ vector index for response queue interrupts + * @p: configuration parameters for this queue set + * @ntxq: number of Tx queues for the queue set + * @netdev: net device associated with this queue set + * @netdevq: net device TX queue associated with this queue set + * + * Allocate resources and initialize an SGE queue set. A queue set + * comprises a response queue, two Rx free-buffer queues, and up to 3 + * Tx queues. The Tx queues are assigned roles in the order Ethernet + * queue, offload queue, and control queue. + */ +int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports, + int irq_vec_idx, const struct qset_params *p, + int ntxq, struct net_device *dev, + struct netdev_queue *netdevq) +{ + int i, avail, ret = -ENOMEM; + struct sge_qset *q = &adapter->sge.qs[id]; + + init_qset_cntxt(q, id); + timer_setup(&q->tx_reclaim_timer, sge_timer_tx, 0); + timer_setup(&q->rx_reclaim_timer, sge_timer_rx, 0); + + q->fl[0].desc = alloc_ring(adapter->pdev, p->fl_size, + sizeof(struct rx_desc), + sizeof(struct rx_sw_desc), + &q->fl[0].phys_addr, &q->fl[0].sdesc); + if (!q->fl[0].desc) + goto err; + + q->fl[1].desc = alloc_ring(adapter->pdev, p->jumbo_size, + sizeof(struct rx_desc), + sizeof(struct rx_sw_desc), + &q->fl[1].phys_addr, &q->fl[1].sdesc); + if (!q->fl[1].desc) + goto err; + + q->rspq.desc = alloc_ring(adapter->pdev, p->rspq_size, + sizeof(struct rsp_desc), 0, + &q->rspq.phys_addr, NULL); + if (!q->rspq.desc) + goto err; + + for (i = 0; i < ntxq; ++i) { + /* + * The control queue always uses immediate data so does not + * need to keep track of any sk_buffs. + */ + size_t sz = i == TXQ_CTRL ? 
0 : sizeof(struct tx_sw_desc); + + q->txq[i].desc = alloc_ring(adapter->pdev, p->txq_size[i], + sizeof(struct tx_desc), sz, + &q->txq[i].phys_addr, + &q->txq[i].sdesc); + if (!q->txq[i].desc) + goto err; + + q->txq[i].gen = 1; + q->txq[i].size = p->txq_size[i]; + spin_lock_init(&q->txq[i].lock); + skb_queue_head_init(&q->txq[i].sendq); + } + + tasklet_init(&q->txq[TXQ_OFLD].qresume_tsk, restart_offloadq, + (unsigned long)q); + tasklet_init(&q->txq[TXQ_CTRL].qresume_tsk, restart_ctrlq, + (unsigned long)q); + + q->fl[0].gen = q->fl[1].gen = 1; + q->fl[0].size = p->fl_size; + q->fl[1].size = p->jumbo_size; + + q->rspq.gen = 1; + q->rspq.size = p->rspq_size; + spin_lock_init(&q->rspq.lock); + skb_queue_head_init(&q->rspq.rx_queue); + + q->txq[TXQ_ETH].stop_thres = nports * + flits_to_desc(sgl_len(MAX_SKB_FRAGS + 1) + 3); + +#if FL0_PG_CHUNK_SIZE > 0 + q->fl[0].buf_size = FL0_PG_CHUNK_SIZE; +#else + q->fl[0].buf_size = SGE_RX_SM_BUF_SIZE + sizeof(struct cpl_rx_data); +#endif +#if FL1_PG_CHUNK_SIZE > 0 + q->fl[1].buf_size = FL1_PG_CHUNK_SIZE; +#else + q->fl[1].buf_size = is_offload(adapter) ? + (16 * 1024) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + MAX_FRAME_SIZE + 2 + sizeof(struct cpl_rx_pkt); +#endif + + q->fl[0].use_pages = FL0_PG_CHUNK_SIZE > 0; + q->fl[1].use_pages = FL1_PG_CHUNK_SIZE > 0; + q->fl[0].order = FL0_PG_ORDER; + q->fl[1].order = FL1_PG_ORDER; + q->fl[0].alloc_size = FL0_PG_ALLOC_SIZE; + q->fl[1].alloc_size = FL1_PG_ALLOC_SIZE; + + spin_lock_irq(&adapter->sge.reg_lock); + + /* FL threshold comparison uses < */ + ret = t3_sge_init_rspcntxt(adapter, q->rspq.cntxt_id, irq_vec_idx, + q->rspq.phys_addr, q->rspq.size, + q->fl[0].buf_size - SGE_PG_RSVD, 1, 0); + if (ret) + goto err_unlock; + + for (i = 0; i < SGE_RXQ_PER_SET; ++i) { + ret = t3_sge_init_flcntxt(adapter, q->fl[i].cntxt_id, 0, + q->fl[i].phys_addr, q->fl[i].size, + q->fl[i].buf_size - SGE_PG_RSVD, + p->cong_thres, 1, 0); + if (ret) + goto err_unlock; + } + + ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_ETH].cntxt_id, USE_GTS, + SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr, + q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token, + 1, 0); + if (ret) + goto err_unlock; + + if (ntxq > 1) { + ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_OFLD].cntxt_id, + USE_GTS, SGE_CNTXT_OFLD, id, + q->txq[TXQ_OFLD].phys_addr, + q->txq[TXQ_OFLD].size, 0, 1, 0); + if (ret) + goto err_unlock; + } + + if (ntxq > 2) { + ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_CTRL].cntxt_id, 0, + SGE_CNTXT_CTRL, id, + q->txq[TXQ_CTRL].phys_addr, + q->txq[TXQ_CTRL].size, + q->txq[TXQ_CTRL].token, 1, 0); + if (ret) + goto err_unlock; + } + + spin_unlock_irq(&adapter->sge.reg_lock); + + q->adap = adapter; + q->netdev = dev; + q->tx_q = netdevq; + t3_update_qset_coalesce(q, p); + + avail = refill_fl(adapter, &q->fl[0], q->fl[0].size, + GFP_KERNEL | __GFP_COMP); + if (!avail) { + CH_ALERT(adapter, "free list queue 0 initialization failed\n"); + goto err; + } + if (avail < q->fl[0].size) + CH_WARN(adapter, "free list queue 0 enabled with %d credits\n", + avail); + + avail = refill_fl(adapter, &q->fl[1], q->fl[1].size, + GFP_KERNEL | __GFP_COMP); + if (avail < q->fl[1].size) + CH_WARN(adapter, "free list queue 1 enabled with %d credits\n", + avail); + refill_rspq(adapter, &q->rspq, q->rspq.size - 1); + + t3_write_reg(adapter, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) | + V_NEWTIMER(q->rspq.holdoff_tmr)); + + return 0; + +err_unlock: + spin_unlock_irq(&adapter->sge.reg_lock); +err: + t3_free_qset(adapter, q); + return ret; +} + +/** + * t3_start_sge_timers - start SGE 
timer call backs + * @adap: the adapter + * + * Starts each SGE queue set's timer call back + */ +void t3_start_sge_timers(struct adapter *adap) +{ + int i; + + for (i = 0; i < SGE_QSETS; ++i) { + struct sge_qset *q = &adap->sge.qs[i]; + + if (q->tx_reclaim_timer.function) + mod_timer(&q->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD); + + if (q->rx_reclaim_timer.function) + mod_timer(&q->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD); + } +} + +/** + * t3_stop_sge_timers - stop SGE timer call backs + * @adap: the adapter + * + * Stops each SGE queue set's timer call back + */ +void t3_stop_sge_timers(struct adapter *adap) +{ + int i; + + for (i = 0; i < SGE_QSETS; ++i) { + struct sge_qset *q = &adap->sge.qs[i]; + + if (q->tx_reclaim_timer.function) + del_timer_sync(&q->tx_reclaim_timer); + if (q->rx_reclaim_timer.function) + del_timer_sync(&q->rx_reclaim_timer); + } +} + +/** + * t3_free_sge_resources - free SGE resources + * @adap: the adapter + * + * Frees resources used by the SGE queue sets. + */ +void t3_free_sge_resources(struct adapter *adap) +{ + int i; + + for (i = 0; i < SGE_QSETS; ++i) + t3_free_qset(adap, &adap->sge.qs[i]); +} + +/** + * t3_sge_start - enable SGE + * @adap: the adapter + * + * Enables the SGE for DMAs. This is the last step in starting packet + * transfers. + */ +void t3_sge_start(struct adapter *adap) +{ + t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE); +} + +/** + * t3_sge_stop - disable SGE operation + * @adap: the adapter + * + * Disables the DMA engine. This can be called in emeregencies (e.g., + * from error interrupts) or from normal process context. In the latter + * case it also disables any pending queue restart tasklets. Note that + * if it is called in interrupt context it cannot disable the restart + * tasklets as it cannot wait, however the tasklets will have no effect + * since the doorbells are disabled and the driver will call this again + * later from process context, at which time the tasklets will be stopped + * if they are still running. + */ +void t3_sge_stop(struct adapter *adap) +{ + t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, 0); + if (!in_interrupt()) { + int i; + + for (i = 0; i < SGE_QSETS; ++i) { + struct sge_qset *qs = &adap->sge.qs[i]; + + tasklet_kill(&qs->txq[TXQ_OFLD].qresume_tsk); + tasklet_kill(&qs->txq[TXQ_CTRL].qresume_tsk); + } + } +} + +/** + * t3_sge_init - initialize SGE + * @adap: the adapter + * @p: the SGE parameters + * + * Performs SGE initialization needed every time after a chip reset. + * We do not initialize any of the queue sets here, instead the driver + * top-level must request those individually. We also do not enable DMA + * here, that should be done after the queues have been set up. + */ +void t3_sge_init(struct adapter *adap, struct sge_params *p) +{ + unsigned int ctrl, ups = ffs(pci_resource_len(adap->pdev, 2) >> 12); + + ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL | + F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN | + V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS | + V_USERSPACESIZE(ups ? 
ups - 1 : 0) | F_ISCSICOALESCING; +#if SGE_NUM_GENBITS == 1 + ctrl |= F_EGRGENCTRL; +#endif + if (adap->params.rev > 0) { + if (!(adap->flags & (USING_MSIX | USING_MSI))) + ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ; + } + t3_write_reg(adap, A_SG_CONTROL, ctrl); + t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) | + V_LORCQDRBTHRSH(512)); + t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10); + t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) | + V_TIMEOUT(200 * core_ticks_per_usec(adap))); + t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, + adap->params.rev < T3_REV_C ? 1000 : 500); + t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256); + t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000); + t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256); + t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff)); + t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024); +} + +/** + * t3_sge_prep - one-time SGE initialization + * @adap: the associated adapter + * @p: SGE parameters + * + * Performs one-time initialization of SGE SW state. Includes determining + * defaults for the assorted SGE parameters, which admins can change until + * they are used to initialize the SGE. + */ +void t3_sge_prep(struct adapter *adap, struct sge_params *p) +{ + int i; + + p->max_pkt_size = (16 * 1024) - sizeof(struct cpl_rx_data) - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + for (i = 0; i < SGE_QSETS; ++i) { + struct qset_params *q = p->qset + i; + + q->polling = adap->params.rev > 0; + q->coalesce_usecs = 5; + q->rspq_size = 1024; + q->fl_size = 1024; + q->jumbo_size = 512; + q->txq_size[TXQ_ETH] = 1024; + q->txq_size[TXQ_OFLD] = 1024; + q->txq_size[TXQ_CTRL] = 256; + q->cong_thres = 0; + } + + spin_lock_init(&adap->sge.reg_lock); +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge_defs.h b/drivers/net/ethernet/chelsio/cxgb3/sge_defs.h new file mode 100644 index 0000000..c31ce8d --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/sge_defs.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This file is automatically generated --- any changes will be lost. 
+ */ + +#ifndef _SGE_DEFS_H +#define _SGE_DEFS_H + +#define S_EC_CREDITS 0 +#define M_EC_CREDITS 0x7FFF +#define V_EC_CREDITS(x) ((x) << S_EC_CREDITS) +#define G_EC_CREDITS(x) (((x) >> S_EC_CREDITS) & M_EC_CREDITS) + +#define S_EC_GTS 15 +#define V_EC_GTS(x) ((x) << S_EC_GTS) +#define F_EC_GTS V_EC_GTS(1U) + +#define S_EC_INDEX 16 +#define M_EC_INDEX 0xFFFF +#define V_EC_INDEX(x) ((x) << S_EC_INDEX) +#define G_EC_INDEX(x) (((x) >> S_EC_INDEX) & M_EC_INDEX) + +#define S_EC_SIZE 0 +#define M_EC_SIZE 0xFFFF +#define V_EC_SIZE(x) ((x) << S_EC_SIZE) +#define G_EC_SIZE(x) (((x) >> S_EC_SIZE) & M_EC_SIZE) + +#define S_EC_BASE_LO 16 +#define M_EC_BASE_LO 0xFFFF +#define V_EC_BASE_LO(x) ((x) << S_EC_BASE_LO) +#define G_EC_BASE_LO(x) (((x) >> S_EC_BASE_LO) & M_EC_BASE_LO) + +#define S_EC_BASE_HI 0 +#define M_EC_BASE_HI 0xF +#define V_EC_BASE_HI(x) ((x) << S_EC_BASE_HI) +#define G_EC_BASE_HI(x) (((x) >> S_EC_BASE_HI) & M_EC_BASE_HI) + +#define S_EC_RESPQ 4 +#define M_EC_RESPQ 0x7 +#define V_EC_RESPQ(x) ((x) << S_EC_RESPQ) +#define G_EC_RESPQ(x) (((x) >> S_EC_RESPQ) & M_EC_RESPQ) + +#define S_EC_TYPE 7 +#define M_EC_TYPE 0x7 +#define V_EC_TYPE(x) ((x) << S_EC_TYPE) +#define G_EC_TYPE(x) (((x) >> S_EC_TYPE) & M_EC_TYPE) + +#define S_EC_GEN 10 +#define V_EC_GEN(x) ((x) << S_EC_GEN) +#define F_EC_GEN V_EC_GEN(1U) + +#define S_EC_UP_TOKEN 11 +#define M_EC_UP_TOKEN 0xFFFFF +#define V_EC_UP_TOKEN(x) ((x) << S_EC_UP_TOKEN) +#define G_EC_UP_TOKEN(x) (((x) >> S_EC_UP_TOKEN) & M_EC_UP_TOKEN) + +#define S_EC_VALID 31 +#define V_EC_VALID(x) ((x) << S_EC_VALID) +#define F_EC_VALID V_EC_VALID(1U) + +#define S_RQ_MSI_VEC 20 +#define M_RQ_MSI_VEC 0x3F +#define V_RQ_MSI_VEC(x) ((x) << S_RQ_MSI_VEC) +#define G_RQ_MSI_VEC(x) (((x) >> S_RQ_MSI_VEC) & M_RQ_MSI_VEC) + +#define S_RQ_INTR_EN 26 +#define V_RQ_INTR_EN(x) ((x) << S_RQ_INTR_EN) +#define F_RQ_INTR_EN V_RQ_INTR_EN(1U) + +#define S_RQ_GEN 28 +#define V_RQ_GEN(x) ((x) << S_RQ_GEN) +#define F_RQ_GEN V_RQ_GEN(1U) + +#define S_CQ_INDEX 0 +#define M_CQ_INDEX 0xFFFF +#define V_CQ_INDEX(x) ((x) << S_CQ_INDEX) +#define G_CQ_INDEX(x) (((x) >> S_CQ_INDEX) & M_CQ_INDEX) + +#define S_CQ_SIZE 16 +#define M_CQ_SIZE 0xFFFF +#define V_CQ_SIZE(x) ((x) << S_CQ_SIZE) +#define G_CQ_SIZE(x) (((x) >> S_CQ_SIZE) & M_CQ_SIZE) + +#define S_CQ_BASE_HI 0 +#define M_CQ_BASE_HI 0xFFFFF +#define V_CQ_BASE_HI(x) ((x) << S_CQ_BASE_HI) +#define G_CQ_BASE_HI(x) (((x) >> S_CQ_BASE_HI) & M_CQ_BASE_HI) + +#define S_CQ_RSPQ 20 +#define M_CQ_RSPQ 0x3F +#define V_CQ_RSPQ(x) ((x) << S_CQ_RSPQ) +#define G_CQ_RSPQ(x) (((x) >> S_CQ_RSPQ) & M_CQ_RSPQ) + +#define S_CQ_ASYNC_NOTIF 26 +#define V_CQ_ASYNC_NOTIF(x) ((x) << S_CQ_ASYNC_NOTIF) +#define F_CQ_ASYNC_NOTIF V_CQ_ASYNC_NOTIF(1U) + +#define S_CQ_ARMED 27 +#define V_CQ_ARMED(x) ((x) << S_CQ_ARMED) +#define F_CQ_ARMED V_CQ_ARMED(1U) + +#define S_CQ_ASYNC_NOTIF_SOL 28 +#define V_CQ_ASYNC_NOTIF_SOL(x) ((x) << S_CQ_ASYNC_NOTIF_SOL) +#define F_CQ_ASYNC_NOTIF_SOL V_CQ_ASYNC_NOTIF_SOL(1U) + +#define S_CQ_GEN 29 +#define V_CQ_GEN(x) ((x) << S_CQ_GEN) +#define F_CQ_GEN V_CQ_GEN(1U) + +#define S_CQ_ERR 30 +#define V_CQ_ERR(x) ((x) << S_CQ_ERR) +#define F_CQ_ERR V_CQ_ERR(1U) + +#define S_CQ_OVERFLOW_MODE 31 +#define V_CQ_OVERFLOW_MODE(x) ((x) << S_CQ_OVERFLOW_MODE) +#define F_CQ_OVERFLOW_MODE V_CQ_OVERFLOW_MODE(1U) + +#define S_CQ_CREDITS 0 +#define M_CQ_CREDITS 0xFFFF +#define V_CQ_CREDITS(x) ((x) << S_CQ_CREDITS) +#define G_CQ_CREDITS(x) (((x) >> S_CQ_CREDITS) & M_CQ_CREDITS) + +#define S_CQ_CREDIT_THRES 16 +#define M_CQ_CREDIT_THRES 0x1FFF +#define 
V_CQ_CREDIT_THRES(x) ((x) << S_CQ_CREDIT_THRES) +#define G_CQ_CREDIT_THRES(x) (((x) >> S_CQ_CREDIT_THRES) & M_CQ_CREDIT_THRES) + +#define S_FL_BASE_HI 0 +#define M_FL_BASE_HI 0xFFFFF +#define V_FL_BASE_HI(x) ((x) << S_FL_BASE_HI) +#define G_FL_BASE_HI(x) (((x) >> S_FL_BASE_HI) & M_FL_BASE_HI) + +#define S_FL_INDEX_LO 20 +#define M_FL_INDEX_LO 0xFFF +#define V_FL_INDEX_LO(x) ((x) << S_FL_INDEX_LO) +#define G_FL_INDEX_LO(x) (((x) >> S_FL_INDEX_LO) & M_FL_INDEX_LO) + +#define S_FL_INDEX_HI 0 +#define M_FL_INDEX_HI 0xF +#define V_FL_INDEX_HI(x) ((x) << S_FL_INDEX_HI) +#define G_FL_INDEX_HI(x) (((x) >> S_FL_INDEX_HI) & M_FL_INDEX_HI) + +#define S_FL_SIZE 4 +#define M_FL_SIZE 0xFFFF +#define V_FL_SIZE(x) ((x) << S_FL_SIZE) +#define G_FL_SIZE(x) (((x) >> S_FL_SIZE) & M_FL_SIZE) + +#define S_FL_GEN 20 +#define V_FL_GEN(x) ((x) << S_FL_GEN) +#define F_FL_GEN V_FL_GEN(1U) + +#define S_FL_ENTRY_SIZE_LO 21 +#define M_FL_ENTRY_SIZE_LO 0x7FF +#define V_FL_ENTRY_SIZE_LO(x) ((x) << S_FL_ENTRY_SIZE_LO) +#define G_FL_ENTRY_SIZE_LO(x) (((x) >> S_FL_ENTRY_SIZE_LO) & M_FL_ENTRY_SIZE_LO) + +#define S_FL_ENTRY_SIZE_HI 0 +#define M_FL_ENTRY_SIZE_HI 0x1FFFFF +#define V_FL_ENTRY_SIZE_HI(x) ((x) << S_FL_ENTRY_SIZE_HI) +#define G_FL_ENTRY_SIZE_HI(x) (((x) >> S_FL_ENTRY_SIZE_HI) & M_FL_ENTRY_SIZE_HI) + +#define S_FL_CONG_THRES 21 +#define M_FL_CONG_THRES 0x3FF +#define V_FL_CONG_THRES(x) ((x) << S_FL_CONG_THRES) +#define G_FL_CONG_THRES(x) (((x) >> S_FL_CONG_THRES) & M_FL_CONG_THRES) + +#define S_FL_GTS 31 +#define V_FL_GTS(x) ((x) << S_FL_GTS) +#define F_FL_GTS V_FL_GTS(1U) + +#define S_FLD_GEN1 31 +#define V_FLD_GEN1(x) ((x) << S_FLD_GEN1) +#define F_FLD_GEN1 V_FLD_GEN1(1U) + +#define S_FLD_GEN2 0 +#define V_FLD_GEN2(x) ((x) << S_FLD_GEN2) +#define F_FLD_GEN2 V_FLD_GEN2(1U) + +#define S_RSPD_TXQ1_CR 0 +#define M_RSPD_TXQ1_CR 0x7F +#define V_RSPD_TXQ1_CR(x) ((x) << S_RSPD_TXQ1_CR) +#define G_RSPD_TXQ1_CR(x) (((x) >> S_RSPD_TXQ1_CR) & M_RSPD_TXQ1_CR) + +#define S_RSPD_TXQ1_GTS 7 +#define V_RSPD_TXQ1_GTS(x) ((x) << S_RSPD_TXQ1_GTS) +#define F_RSPD_TXQ1_GTS V_RSPD_TXQ1_GTS(1U) + +#define S_RSPD_TXQ2_CR 8 +#define M_RSPD_TXQ2_CR 0x7F +#define V_RSPD_TXQ2_CR(x) ((x) << S_RSPD_TXQ2_CR) +#define G_RSPD_TXQ2_CR(x) (((x) >> S_RSPD_TXQ2_CR) & M_RSPD_TXQ2_CR) + +#define S_RSPD_TXQ2_GTS 15 +#define V_RSPD_TXQ2_GTS(x) ((x) << S_RSPD_TXQ2_GTS) +#define F_RSPD_TXQ2_GTS V_RSPD_TXQ2_GTS(1U) + +#define S_RSPD_TXQ0_CR 16 +#define M_RSPD_TXQ0_CR 0x7F +#define V_RSPD_TXQ0_CR(x) ((x) << S_RSPD_TXQ0_CR) +#define G_RSPD_TXQ0_CR(x) (((x) >> S_RSPD_TXQ0_CR) & M_RSPD_TXQ0_CR) + +#define S_RSPD_TXQ0_GTS 23 +#define V_RSPD_TXQ0_GTS(x) ((x) << S_RSPD_TXQ0_GTS) +#define F_RSPD_TXQ0_GTS V_RSPD_TXQ0_GTS(1U) + +#define S_RSPD_EOP 24 +#define V_RSPD_EOP(x) ((x) << S_RSPD_EOP) +#define F_RSPD_EOP V_RSPD_EOP(1U) + +#define S_RSPD_SOP 25 +#define V_RSPD_SOP(x) ((x) << S_RSPD_SOP) +#define F_RSPD_SOP V_RSPD_SOP(1U) + +#define S_RSPD_ASYNC_NOTIF 26 +#define V_RSPD_ASYNC_NOTIF(x) ((x) << S_RSPD_ASYNC_NOTIF) +#define F_RSPD_ASYNC_NOTIF V_RSPD_ASYNC_NOTIF(1U) + +#define S_RSPD_FL0_GTS 27 +#define V_RSPD_FL0_GTS(x) ((x) << S_RSPD_FL0_GTS) +#define F_RSPD_FL0_GTS V_RSPD_FL0_GTS(1U) + +#define S_RSPD_FL1_GTS 28 +#define V_RSPD_FL1_GTS(x) ((x) << S_RSPD_FL1_GTS) +#define F_RSPD_FL1_GTS V_RSPD_FL1_GTS(1U) + +#define S_RSPD_IMM_DATA_VALID 29 +#define V_RSPD_IMM_DATA_VALID(x) ((x) << S_RSPD_IMM_DATA_VALID) +#define F_RSPD_IMM_DATA_VALID V_RSPD_IMM_DATA_VALID(1U) + +#define S_RSPD_OFFLOAD 30 +#define V_RSPD_OFFLOAD(x) ((x) << S_RSPD_OFFLOAD) +#define F_RSPD_OFFLOAD 
V_RSPD_OFFLOAD(1U) + +#define S_RSPD_GEN1 31 +#define V_RSPD_GEN1(x) ((x) << S_RSPD_GEN1) +#define F_RSPD_GEN1 V_RSPD_GEN1(1U) + +#define S_RSPD_LEN 0 +#define M_RSPD_LEN 0x7FFFFFFF +#define V_RSPD_LEN(x) ((x) << S_RSPD_LEN) +#define G_RSPD_LEN(x) (((x) >> S_RSPD_LEN) & M_RSPD_LEN) + +#define S_RSPD_FLQ 31 +#define V_RSPD_FLQ(x) ((x) << S_RSPD_FLQ) +#define F_RSPD_FLQ V_RSPD_FLQ(1U) + +#define S_RSPD_GEN2 0 +#define V_RSPD_GEN2(x) ((x) << S_RSPD_GEN2) +#define F_RSPD_GEN2 V_RSPD_GEN2(1U) + +#define S_RSPD_INR_VEC 1 +#define M_RSPD_INR_VEC 0x7F +#define V_RSPD_INR_VEC(x) ((x) << S_RSPD_INR_VEC) +#define G_RSPD_INR_VEC(x) (((x) >> S_RSPD_INR_VEC) & M_RSPD_INR_VEC) + +#endif /* _SGE_DEFS_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h b/drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h new file mode 100644 index 0000000..852c399 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h @@ -0,0 +1,1495 @@ +/* + * Copyright (c) 2004-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef T3_CPL_H +#define T3_CPL_H + +#if !defined(__LITTLE_ENDIAN_BITFIELD) && !defined(__BIG_ENDIAN_BITFIELD) +# include +#endif + +enum CPL_opcode { + CPL_PASS_OPEN_REQ = 0x1, + CPL_PASS_ACCEPT_RPL = 0x2, + CPL_ACT_OPEN_REQ = 0x3, + CPL_SET_TCB = 0x4, + CPL_SET_TCB_FIELD = 0x5, + CPL_GET_TCB = 0x6, + CPL_PCMD = 0x7, + CPL_CLOSE_CON_REQ = 0x8, + CPL_CLOSE_LISTSRV_REQ = 0x9, + CPL_ABORT_REQ = 0xA, + CPL_ABORT_RPL = 0xB, + CPL_TX_DATA = 0xC, + CPL_RX_DATA_ACK = 0xD, + CPL_TX_PKT = 0xE, + CPL_RTE_DELETE_REQ = 0xF, + CPL_RTE_WRITE_REQ = 0x10, + CPL_RTE_READ_REQ = 0x11, + CPL_L2T_WRITE_REQ = 0x12, + CPL_L2T_READ_REQ = 0x13, + CPL_SMT_WRITE_REQ = 0x14, + CPL_SMT_READ_REQ = 0x15, + CPL_TX_PKT_LSO = 0x16, + CPL_PCMD_READ = 0x17, + CPL_BARRIER = 0x18, + CPL_TID_RELEASE = 0x1A, + + CPL_CLOSE_LISTSRV_RPL = 0x20, + CPL_ERROR = 0x21, + CPL_GET_TCB_RPL = 0x22, + CPL_L2T_WRITE_RPL = 0x23, + CPL_PCMD_READ_RPL = 0x24, + CPL_PCMD_RPL = 0x25, + CPL_PEER_CLOSE = 0x26, + CPL_RTE_DELETE_RPL = 0x27, + CPL_RTE_WRITE_RPL = 0x28, + CPL_RX_DDP_COMPLETE = 0x29, + CPL_RX_PHYS_ADDR = 0x2A, + CPL_RX_PKT = 0x2B, + CPL_RX_URG_NOTIFY = 0x2C, + CPL_SET_TCB_RPL = 0x2D, + CPL_SMT_WRITE_RPL = 0x2E, + CPL_TX_DATA_ACK = 0x2F, + + CPL_ABORT_REQ_RSS = 0x30, + CPL_ABORT_RPL_RSS = 0x31, + CPL_CLOSE_CON_RPL = 0x32, + CPL_ISCSI_HDR = 0x33, + CPL_L2T_READ_RPL = 0x34, + CPL_RDMA_CQE = 0x35, + CPL_RDMA_CQE_READ_RSP = 0x36, + CPL_RDMA_CQE_ERR = 0x37, + CPL_RTE_READ_RPL = 0x38, + CPL_RX_DATA = 0x39, + + CPL_ACT_OPEN_RPL = 0x40, + CPL_PASS_OPEN_RPL = 0x41, + CPL_RX_DATA_DDP = 0x42, + CPL_SMT_READ_RPL = 0x43, + + CPL_ACT_ESTABLISH = 0x50, + CPL_PASS_ESTABLISH = 0x51, + + CPL_PASS_ACCEPT_REQ = 0x70, + + CPL_ASYNC_NOTIF = 0x80, /* fake opcode for async notifications */ + + CPL_TX_DMA_ACK = 0xA0, + CPL_RDMA_READ_REQ = 0xA1, + CPL_RDMA_TERMINATE = 0xA2, + CPL_TRACE_PKT = 0xA3, + CPL_RDMA_EC_STATUS = 0xA5, + + NUM_CPL_CMDS /* must be last and previous entries must be sorted */ +}; + +enum CPL_error { + CPL_ERR_NONE = 0, + CPL_ERR_TCAM_PARITY = 1, + CPL_ERR_TCAM_FULL = 3, + CPL_ERR_CONN_RESET = 20, + CPL_ERR_CONN_EXIST = 22, + CPL_ERR_ARP_MISS = 23, + CPL_ERR_BAD_SYN = 24, + CPL_ERR_CONN_TIMEDOUT = 30, + CPL_ERR_XMIT_TIMEDOUT = 31, + CPL_ERR_PERSIST_TIMEDOUT = 32, + CPL_ERR_FINWAIT2_TIMEDOUT = 33, + CPL_ERR_KEEPALIVE_TIMEDOUT = 34, + CPL_ERR_RTX_NEG_ADVICE = 35, + CPL_ERR_PERSIST_NEG_ADVICE = 36, + CPL_ERR_ABORT_FAILED = 42, + CPL_ERR_GENERAL = 99 +}; + +enum { + CPL_CONN_POLICY_AUTO = 0, + CPL_CONN_POLICY_ASK = 1, + CPL_CONN_POLICY_DENY = 3 +}; + +enum { + ULP_MODE_NONE = 0, + ULP_MODE_ISCSI = 2, + ULP_MODE_RDMA = 4, + ULP_MODE_TCPDDP = 5 +}; + +enum { + ULP_CRC_HEADER = 1 << 0, + ULP_CRC_DATA = 1 << 1 +}; + +enum { + CPL_PASS_OPEN_ACCEPT, + CPL_PASS_OPEN_REJECT +}; + +enum { + CPL_ABORT_SEND_RST = 0, + CPL_ABORT_NO_RST, + CPL_ABORT_POST_CLOSE_REQ = 2 +}; + +enum { /* TX_PKT_LSO ethernet types */ + CPL_ETH_II, + CPL_ETH_II_VLAN, + CPL_ETH_802_3, + CPL_ETH_802_3_VLAN +}; + +enum { /* TCP congestion control algorithms */ + CONG_ALG_RENO, + CONG_ALG_TAHOE, + CONG_ALG_NEWRENO, + CONG_ALG_HIGHSPEED +}; + +enum { /* RSS hash type */ + RSS_HASH_NONE = 0, + RSS_HASH_2_TUPLE = 1, + RSS_HASH_4_TUPLE = 2, + RSS_HASH_TCPV6 = 3 +}; + +union opcode_tid { + __be32 opcode_tid; + __u8 opcode; +}; + +#define S_OPCODE 24 +#define V_OPCODE(x) ((x) << S_OPCODE) +#define G_OPCODE(x) (((x) >> S_OPCODE) & 0xFF) +#define G_TID(x) ((x) & 0xFFFFFF) + +#define S_QNUM 0 +#define G_QNUM(x) (((x) >> S_QNUM) & 0xFFFF) + +#define S_HASHTYPE 22 +#define M_HASHTYPE 0x3 
+#define G_HASHTYPE(x) (((x) >> S_HASHTYPE) & M_HASHTYPE) + +/* tid is assumed to be 24-bits */ +#define MK_OPCODE_TID(opcode, tid) (V_OPCODE(opcode) | (tid)) + +#define OPCODE_TID(cmd) ((cmd)->ot.opcode_tid) + +/* extract the TID from a CPL command */ +#define GET_TID(cmd) (G_TID(ntohl(OPCODE_TID(cmd)))) + +struct tcp_options { + __be16 mss; + __u8 wsf; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:5; + __u8 ecn:1; + __u8 sack:1; + __u8 tstamp:1; +#else + __u8 tstamp:1; + __u8 sack:1; + __u8 ecn:1; + __u8:5; +#endif +}; + +struct rss_header { + __u8 opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 cpu_idx:6; + __u8 hash_type:2; +#else + __u8 hash_type:2; + __u8 cpu_idx:6; +#endif + __be16 cq_idx; + __be32 rss_hash_val; +}; + +#ifndef CHELSIO_FW +struct work_request_hdr { + __be32 wr_hi; + __be32 wr_lo; +}; + +/* wr_hi fields */ +#define S_WR_SGE_CREDITS 0 +#define M_WR_SGE_CREDITS 0xFF +#define V_WR_SGE_CREDITS(x) ((x) << S_WR_SGE_CREDITS) +#define G_WR_SGE_CREDITS(x) (((x) >> S_WR_SGE_CREDITS) & M_WR_SGE_CREDITS) + +#define S_WR_SGLSFLT 8 +#define M_WR_SGLSFLT 0xFF +#define V_WR_SGLSFLT(x) ((x) << S_WR_SGLSFLT) +#define G_WR_SGLSFLT(x) (((x) >> S_WR_SGLSFLT) & M_WR_SGLSFLT) + +#define S_WR_BCNTLFLT 16 +#define M_WR_BCNTLFLT 0xF +#define V_WR_BCNTLFLT(x) ((x) << S_WR_BCNTLFLT) +#define G_WR_BCNTLFLT(x) (((x) >> S_WR_BCNTLFLT) & M_WR_BCNTLFLT) + +#define S_WR_DATATYPE 20 +#define V_WR_DATATYPE(x) ((x) << S_WR_DATATYPE) +#define F_WR_DATATYPE V_WR_DATATYPE(1U) + +#define S_WR_COMPL 21 +#define V_WR_COMPL(x) ((x) << S_WR_COMPL) +#define F_WR_COMPL V_WR_COMPL(1U) + +#define S_WR_EOP 22 +#define V_WR_EOP(x) ((x) << S_WR_EOP) +#define F_WR_EOP V_WR_EOP(1U) + +#define S_WR_SOP 23 +#define V_WR_SOP(x) ((x) << S_WR_SOP) +#define F_WR_SOP V_WR_SOP(1U) + +#define S_WR_OP 24 +#define M_WR_OP 0xFF +#define V_WR_OP(x) ((x) << S_WR_OP) +#define G_WR_OP(x) (((x) >> S_WR_OP) & M_WR_OP) + +/* wr_lo fields */ +#define S_WR_LEN 0 +#define M_WR_LEN 0xFF +#define V_WR_LEN(x) ((x) << S_WR_LEN) +#define G_WR_LEN(x) (((x) >> S_WR_LEN) & M_WR_LEN) + +#define S_WR_TID 8 +#define M_WR_TID 0xFFFFF +#define V_WR_TID(x) ((x) << S_WR_TID) +#define G_WR_TID(x) (((x) >> S_WR_TID) & M_WR_TID) + +#define S_WR_CR_FLUSH 30 +#define V_WR_CR_FLUSH(x) ((x) << S_WR_CR_FLUSH) +#define F_WR_CR_FLUSH V_WR_CR_FLUSH(1U) + +#define S_WR_GEN 31 +#define V_WR_GEN(x) ((x) << S_WR_GEN) +#define F_WR_GEN V_WR_GEN(1U) + +# define WR_HDR struct work_request_hdr wr +# define RSS_HDR +#else +# define WR_HDR +# define RSS_HDR struct rss_header rss_hdr; +#endif + +/* option 0 lower-half fields */ +#define S_CPL_STATUS 0 +#define M_CPL_STATUS 0xFF +#define V_CPL_STATUS(x) ((x) << S_CPL_STATUS) +#define G_CPL_STATUS(x) (((x) >> S_CPL_STATUS) & M_CPL_STATUS) + +#define S_INJECT_TIMER 6 +#define V_INJECT_TIMER(x) ((x) << S_INJECT_TIMER) +#define F_INJECT_TIMER V_INJECT_TIMER(1U) + +#define S_NO_OFFLOAD 7 +#define V_NO_OFFLOAD(x) ((x) << S_NO_OFFLOAD) +#define F_NO_OFFLOAD V_NO_OFFLOAD(1U) + +#define S_ULP_MODE 8 +#define M_ULP_MODE 0xF +#define V_ULP_MODE(x) ((x) << S_ULP_MODE) +#define G_ULP_MODE(x) (((x) >> S_ULP_MODE) & M_ULP_MODE) + +#define S_RCV_BUFSIZ 12 +#define M_RCV_BUFSIZ 0x3FFF +#define V_RCV_BUFSIZ(x) ((x) << S_RCV_BUFSIZ) +#define G_RCV_BUFSIZ(x) (((x) >> S_RCV_BUFSIZ) & M_RCV_BUFSIZ) + +#define S_TOS 26 +#define M_TOS 0x3F +#define V_TOS(x) ((x) << S_TOS) +#define G_TOS(x) (((x) >> S_TOS) & M_TOS) + +/* option 0 upper-half fields */ +#define S_DELACK 0 +#define V_DELACK(x) ((x) << S_DELACK) +#define F_DELACK V_DELACK(1U) + 
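+
+/*
+ * Illustrative sketch only; the t3_sketch_* helpers below are made up for
+ * this example and are not part of the driver.  Every field in this file
+ * follows the same convention: S_FOO is the bit offset of field FOO, M_FOO
+ * its width mask, V_FOO(x) shifts a value into the field's position,
+ * G_FOO(x) extracts it again, and F_FOO is the single-bit form V_FOO(1U).
+ * The helpers demonstrate that convention with the option 0 lower-half
+ * fields defined above; the real driver assembles these words elsewhere
+ * and converts them with htonl() before handing them to hardware.
+ */
+static inline unsigned int t3_sketch_pack_opt0l(unsigned int ulp_mode,
+						unsigned int rcv_bufsiz,
+						unsigned int tos)
+{
+	/* OR the shifted field encodings together; single-bit flags use F_*. */
+	return V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rcv_bufsiz) | V_TOS(tos);
+}
+
+static inline unsigned int t3_sketch_opt0l_tos(unsigned int opt0l)
+{
+	/* G_FOO() reverses V_FOO(): shift right, then mask to the field width. */
+	return G_TOS(opt0l);
+}
+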
+#define S_NO_CONG 1 +#define V_NO_CONG(x) ((x) << S_NO_CONG) +#define F_NO_CONG V_NO_CONG(1U) + +#define S_SRC_MAC_SEL 2 +#define M_SRC_MAC_SEL 0x3 +#define V_SRC_MAC_SEL(x) ((x) << S_SRC_MAC_SEL) +#define G_SRC_MAC_SEL(x) (((x) >> S_SRC_MAC_SEL) & M_SRC_MAC_SEL) + +#define S_L2T_IDX 4 +#define M_L2T_IDX 0x7FF +#define V_L2T_IDX(x) ((x) << S_L2T_IDX) +#define G_L2T_IDX(x) (((x) >> S_L2T_IDX) & M_L2T_IDX) + +#define S_TX_CHANNEL 15 +#define V_TX_CHANNEL(x) ((x) << S_TX_CHANNEL) +#define F_TX_CHANNEL V_TX_CHANNEL(1U) + +#define S_TCAM_BYPASS 16 +#define V_TCAM_BYPASS(x) ((x) << S_TCAM_BYPASS) +#define F_TCAM_BYPASS V_TCAM_BYPASS(1U) + +#define S_NAGLE 17 +#define V_NAGLE(x) ((x) << S_NAGLE) +#define F_NAGLE V_NAGLE(1U) + +#define S_WND_SCALE 18 +#define M_WND_SCALE 0xF +#define V_WND_SCALE(x) ((x) << S_WND_SCALE) +#define G_WND_SCALE(x) (((x) >> S_WND_SCALE) & M_WND_SCALE) + +#define S_KEEP_ALIVE 22 +#define V_KEEP_ALIVE(x) ((x) << S_KEEP_ALIVE) +#define F_KEEP_ALIVE V_KEEP_ALIVE(1U) + +#define S_MAX_RETRANS 23 +#define M_MAX_RETRANS 0xF +#define V_MAX_RETRANS(x) ((x) << S_MAX_RETRANS) +#define G_MAX_RETRANS(x) (((x) >> S_MAX_RETRANS) & M_MAX_RETRANS) + +#define S_MAX_RETRANS_OVERRIDE 27 +#define V_MAX_RETRANS_OVERRIDE(x) ((x) << S_MAX_RETRANS_OVERRIDE) +#define F_MAX_RETRANS_OVERRIDE V_MAX_RETRANS_OVERRIDE(1U) + +#define S_MSS_IDX 28 +#define M_MSS_IDX 0xF +#define V_MSS_IDX(x) ((x) << S_MSS_IDX) +#define G_MSS_IDX(x) (((x) >> S_MSS_IDX) & M_MSS_IDX) + +/* option 1 fields */ +#define S_RSS_ENABLE 0 +#define V_RSS_ENABLE(x) ((x) << S_RSS_ENABLE) +#define F_RSS_ENABLE V_RSS_ENABLE(1U) + +#define S_RSS_MASK_LEN 1 +#define M_RSS_MASK_LEN 0x7 +#define V_RSS_MASK_LEN(x) ((x) << S_RSS_MASK_LEN) +#define G_RSS_MASK_LEN(x) (((x) >> S_RSS_MASK_LEN) & M_RSS_MASK_LEN) + +#define S_CPU_IDX 4 +#define M_CPU_IDX 0x3F +#define V_CPU_IDX(x) ((x) << S_CPU_IDX) +#define G_CPU_IDX(x) (((x) >> S_CPU_IDX) & M_CPU_IDX) + +#define S_MAC_MATCH_VALID 18 +#define V_MAC_MATCH_VALID(x) ((x) << S_MAC_MATCH_VALID) +#define F_MAC_MATCH_VALID V_MAC_MATCH_VALID(1U) + +#define S_CONN_POLICY 19 +#define M_CONN_POLICY 0x3 +#define V_CONN_POLICY(x) ((x) << S_CONN_POLICY) +#define G_CONN_POLICY(x) (((x) >> S_CONN_POLICY) & M_CONN_POLICY) + +#define S_SYN_DEFENSE 21 +#define V_SYN_DEFENSE(x) ((x) << S_SYN_DEFENSE) +#define F_SYN_DEFENSE V_SYN_DEFENSE(1U) + +#define S_VLAN_PRI 22 +#define M_VLAN_PRI 0x3 +#define V_VLAN_PRI(x) ((x) << S_VLAN_PRI) +#define G_VLAN_PRI(x) (((x) >> S_VLAN_PRI) & M_VLAN_PRI) + +#define S_VLAN_PRI_VALID 24 +#define V_VLAN_PRI_VALID(x) ((x) << S_VLAN_PRI_VALID) +#define F_VLAN_PRI_VALID V_VLAN_PRI_VALID(1U) + +#define S_PKT_TYPE 25 +#define M_PKT_TYPE 0x3 +#define V_PKT_TYPE(x) ((x) << S_PKT_TYPE) +#define G_PKT_TYPE(x) (((x) >> S_PKT_TYPE) & M_PKT_TYPE) + +#define S_MAC_MATCH 27 +#define M_MAC_MATCH 0x1F +#define V_MAC_MATCH(x) ((x) << S_MAC_MATCH) +#define G_MAC_MATCH(x) (((x) >> S_MAC_MATCH) & M_MAC_MATCH) + +/* option 2 fields */ +#define S_CPU_INDEX 0 +#define M_CPU_INDEX 0x7F +#define V_CPU_INDEX(x) ((x) << S_CPU_INDEX) +#define G_CPU_INDEX(x) (((x) >> S_CPU_INDEX) & M_CPU_INDEX) + +#define S_CPU_INDEX_VALID 7 +#define V_CPU_INDEX_VALID(x) ((x) << S_CPU_INDEX_VALID) +#define F_CPU_INDEX_VALID V_CPU_INDEX_VALID(1U) + +#define S_RX_COALESCE 8 +#define M_RX_COALESCE 0x3 +#define V_RX_COALESCE(x) ((x) << S_RX_COALESCE) +#define G_RX_COALESCE(x) (((x) >> S_RX_COALESCE) & M_RX_COALESCE) + +#define S_RX_COALESCE_VALID 10 +#define V_RX_COALESCE_VALID(x) ((x) << S_RX_COALESCE_VALID) +#define 
F_RX_COALESCE_VALID V_RX_COALESCE_VALID(1U) + +#define S_CONG_CONTROL_FLAVOR 11 +#define M_CONG_CONTROL_FLAVOR 0x3 +#define V_CONG_CONTROL_FLAVOR(x) ((x) << S_CONG_CONTROL_FLAVOR) +#define G_CONG_CONTROL_FLAVOR(x) (((x) >> S_CONG_CONTROL_FLAVOR) & M_CONG_CONTROL_FLAVOR) + +#define S_PACING_FLAVOR 13 +#define M_PACING_FLAVOR 0x3 +#define V_PACING_FLAVOR(x) ((x) << S_PACING_FLAVOR) +#define G_PACING_FLAVOR(x) (((x) >> S_PACING_FLAVOR) & M_PACING_FLAVOR) + +#define S_FLAVORS_VALID 15 +#define V_FLAVORS_VALID(x) ((x) << S_FLAVORS_VALID) +#define F_FLAVORS_VALID V_FLAVORS_VALID(1U) + +#define S_RX_FC_DISABLE 16 +#define V_RX_FC_DISABLE(x) ((x) << S_RX_FC_DISABLE) +#define F_RX_FC_DISABLE V_RX_FC_DISABLE(1U) + +#define S_RX_FC_VALID 17 +#define V_RX_FC_VALID(x) ((x) << S_RX_FC_VALID) +#define F_RX_FC_VALID V_RX_FC_VALID(1U) + +struct cpl_pass_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 opt0h; + __be32 opt0l; + __be32 peer_netmask; + __be32 opt1; +}; + +struct cpl_pass_open_rpl { + RSS_HDR union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __u8 resvd[7]; + __u8 status; +}; + +struct cpl_pass_establish { + RSS_HDR union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 tos_tid; + __be16 l2t_idx; + __be16 tcp_opt; + __be32 snd_isn; + __be32 rcv_isn; +}; + +/* cpl_pass_establish.tos_tid fields */ +#define S_PASS_OPEN_TID 0 +#define M_PASS_OPEN_TID 0xFFFFFF +#define V_PASS_OPEN_TID(x) ((x) << S_PASS_OPEN_TID) +#define G_PASS_OPEN_TID(x) (((x) >> S_PASS_OPEN_TID) & M_PASS_OPEN_TID) + +#define S_PASS_OPEN_TOS 24 +#define M_PASS_OPEN_TOS 0xFF +#define V_PASS_OPEN_TOS(x) ((x) << S_PASS_OPEN_TOS) +#define G_PASS_OPEN_TOS(x) (((x) >> S_PASS_OPEN_TOS) & M_PASS_OPEN_TOS) + +/* cpl_pass_establish.l2t_idx fields */ +#define S_L2T_IDX16 5 +#define M_L2T_IDX16 0x7FF +#define V_L2T_IDX16(x) ((x) << S_L2T_IDX16) +#define G_L2T_IDX16(x) (((x) >> S_L2T_IDX16) & M_L2T_IDX16) + +/* cpl_pass_establish.tcp_opt fields (also applies act_open_establish) */ +#define G_TCPOPT_WSCALE_OK(x) (((x) >> 5) & 1) +#define G_TCPOPT_SACK(x) (((x) >> 6) & 1) +#define G_TCPOPT_TSTAMP(x) (((x) >> 7) & 1) +#define G_TCPOPT_SND_WSCALE(x) (((x) >> 8) & 0xf) +#define G_TCPOPT_MSS(x) (((x) >> 12) & 0xf) + +struct cpl_pass_accept_req { + RSS_HDR union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 tos_tid; + struct tcp_options tcp_options; + __u8 dst_mac[6]; + __be16 vlan_tag; + __u8 src_mac[6]; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:3; + __u8 addr_idx:3; + __u8 port_idx:1; + __u8 exact_match:1; +#else + __u8 exact_match:1; + __u8 port_idx:1; + __u8 addr_idx:3; + __u8:3; +#endif + __u8 rsvd; + __be32 rcv_isn; + __be32 rsvd2; +}; + +struct cpl_pass_accept_rpl { + WR_HDR; + union opcode_tid ot; + __be32 opt2; + __be32 rsvd; + __be32 peer_ip; + __be32 opt0h; + __be32 opt0l_status; +}; + +struct cpl_act_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 opt0h; + __be32 opt0l; + __be32 params; + __be32 opt2; +}; + +/* cpl_act_open_req.params fields */ +#define S_AOPEN_VLAN_PRI 9 +#define M_AOPEN_VLAN_PRI 0x3 +#define V_AOPEN_VLAN_PRI(x) ((x) << S_AOPEN_VLAN_PRI) +#define G_AOPEN_VLAN_PRI(x) (((x) >> S_AOPEN_VLAN_PRI) & M_AOPEN_VLAN_PRI) + +#define S_AOPEN_VLAN_PRI_VALID 11 +#define V_AOPEN_VLAN_PRI_VALID(x) ((x) << 
S_AOPEN_VLAN_PRI_VALID) +#define F_AOPEN_VLAN_PRI_VALID V_AOPEN_VLAN_PRI_VALID(1U) + +#define S_AOPEN_PKT_TYPE 12 +#define M_AOPEN_PKT_TYPE 0x3 +#define V_AOPEN_PKT_TYPE(x) ((x) << S_AOPEN_PKT_TYPE) +#define G_AOPEN_PKT_TYPE(x) (((x) >> S_AOPEN_PKT_TYPE) & M_AOPEN_PKT_TYPE) + +#define S_AOPEN_MAC_MATCH 14 +#define M_AOPEN_MAC_MATCH 0x1F +#define V_AOPEN_MAC_MATCH(x) ((x) << S_AOPEN_MAC_MATCH) +#define G_AOPEN_MAC_MATCH(x) (((x) >> S_AOPEN_MAC_MATCH) & M_AOPEN_MAC_MATCH) + +#define S_AOPEN_MAC_MATCH_VALID 19 +#define V_AOPEN_MAC_MATCH_VALID(x) ((x) << S_AOPEN_MAC_MATCH_VALID) +#define F_AOPEN_MAC_MATCH_VALID V_AOPEN_MAC_MATCH_VALID(1U) + +#define S_AOPEN_IFF_VLAN 20 +#define M_AOPEN_IFF_VLAN 0xFFF +#define V_AOPEN_IFF_VLAN(x) ((x) << S_AOPEN_IFF_VLAN) +#define G_AOPEN_IFF_VLAN(x) (((x) >> S_AOPEN_IFF_VLAN) & M_AOPEN_IFF_VLAN) + +struct cpl_act_open_rpl { + RSS_HDR union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 atid; + __u8 rsvd[3]; + __u8 status; +}; + +struct cpl_act_establish { + RSS_HDR union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be32 tos_tid; + __be16 l2t_idx; + __be16 tcp_opt; + __be32 snd_isn; + __be32 rcv_isn; +}; + +struct cpl_get_tcb { + WR_HDR; + union opcode_tid ot; + __be16 cpuno; + __be16 rsvd; +}; + +struct cpl_get_tcb_rpl { + RSS_HDR union opcode_tid ot; + __u8 rsvd; + __u8 status; + __be16 len; +}; + +struct cpl_set_tcb { + WR_HDR; + union opcode_tid ot; + __u8 reply; + __u8 cpu_idx; + __be16 len; +}; + +/* cpl_set_tcb.reply fields */ +#define S_NO_REPLY 7 +#define V_NO_REPLY(x) ((x) << S_NO_REPLY) +#define F_NO_REPLY V_NO_REPLY(1U) + +struct cpl_set_tcb_field { + WR_HDR; + union opcode_tid ot; + __u8 reply; + __u8 cpu_idx; + __be16 word; + __be64 mask; + __be64 val; +}; + +struct cpl_set_tcb_rpl { + RSS_HDR union opcode_tid ot; + __u8 rsvd[3]; + __u8 status; +}; + +struct cpl_pcmd { + WR_HDR; + union opcode_tid ot; + __u8 rsvd[3]; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 src:1; + __u8 bundle:1; + __u8 channel:1; + __u8:5; +#else + __u8:5; + __u8 channel:1; + __u8 bundle:1; + __u8 src:1; +#endif + __be32 pcmd_parm[2]; +}; + +struct cpl_pcmd_reply { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd; + __be16 len; +}; + +struct cpl_close_con_req { + WR_HDR; + union opcode_tid ot; + __be32 rsvd; +}; + +struct cpl_close_con_rpl { + RSS_HDR union opcode_tid ot; + __u8 rsvd[3]; + __u8 status; + __be32 snd_nxt; + __be32 rcv_nxt; +}; + +struct cpl_close_listserv_req { + WR_HDR; + union opcode_tid ot; + __u8 rsvd0; + __u8 cpu_idx; + __be16 rsvd1; +}; + +struct cpl_close_listserv_rpl { + RSS_HDR union opcode_tid ot; + __u8 rsvd[3]; + __u8 status; +}; + +struct cpl_abort_req_rss { + RSS_HDR union opcode_tid ot; + __be32 rsvd0; + __u8 rsvd1; + __u8 status; + __u8 rsvd2[6]; +}; + +struct cpl_abort_req { + WR_HDR; + union opcode_tid ot; + __be32 rsvd0; + __u8 rsvd1; + __u8 cmd; + __u8 rsvd2[6]; +}; + +struct cpl_abort_rpl_rss { + RSS_HDR union opcode_tid ot; + __be32 rsvd0; + __u8 rsvd1; + __u8 status; + __u8 rsvd2[6]; +}; + +struct cpl_abort_rpl { + WR_HDR; + union opcode_tid ot; + __be32 rsvd0; + __u8 rsvd1; + __u8 cmd; + __u8 rsvd2[6]; +}; + +struct cpl_peer_close { + RSS_HDR union opcode_tid ot; + __be32 rcv_nxt; +}; + +struct tx_data_wr { + __be32 wr_hi; + __be32 wr_lo; + __be32 len; + __be32 flags; + __be32 sndseq; + __be32 param; +}; + +/* tx_data_wr.flags fields */ +#define S_TX_ACK_PAGES 21 +#define M_TX_ACK_PAGES 0x7 +#define V_TX_ACK_PAGES(x) ((x) 
<< S_TX_ACK_PAGES) +#define G_TX_ACK_PAGES(x) (((x) >> S_TX_ACK_PAGES) & M_TX_ACK_PAGES) + +/* tx_data_wr.param fields */ +#define S_TX_PORT 0 +#define M_TX_PORT 0x7 +#define V_TX_PORT(x) ((x) << S_TX_PORT) +#define G_TX_PORT(x) (((x) >> S_TX_PORT) & M_TX_PORT) + +#define S_TX_MSS 4 +#define M_TX_MSS 0xF +#define V_TX_MSS(x) ((x) << S_TX_MSS) +#define G_TX_MSS(x) (((x) >> S_TX_MSS) & M_TX_MSS) + +#define S_TX_QOS 8 +#define M_TX_QOS 0xFF +#define V_TX_QOS(x) ((x) << S_TX_QOS) +#define G_TX_QOS(x) (((x) >> S_TX_QOS) & M_TX_QOS) + +#define S_TX_SNDBUF 16 +#define M_TX_SNDBUF 0xFFFF +#define V_TX_SNDBUF(x) ((x) << S_TX_SNDBUF) +#define G_TX_SNDBUF(x) (((x) >> S_TX_SNDBUF) & M_TX_SNDBUF) + +struct cpl_tx_data { + union opcode_tid ot; + __be32 len; + __be32 rsvd; + __be16 urg; + __be16 flags; +}; + +/* cpl_tx_data.flags fields */ +#define S_TX_ULP_SUBMODE 6 +#define M_TX_ULP_SUBMODE 0xF +#define V_TX_ULP_SUBMODE(x) ((x) << S_TX_ULP_SUBMODE) +#define G_TX_ULP_SUBMODE(x) (((x) >> S_TX_ULP_SUBMODE) & M_TX_ULP_SUBMODE) + +#define S_TX_ULP_MODE 10 +#define M_TX_ULP_MODE 0xF +#define V_TX_ULP_MODE(x) ((x) << S_TX_ULP_MODE) +#define G_TX_ULP_MODE(x) (((x) >> S_TX_ULP_MODE) & M_TX_ULP_MODE) + +#define S_TX_SHOVE 14 +#define V_TX_SHOVE(x) ((x) << S_TX_SHOVE) +#define F_TX_SHOVE V_TX_SHOVE(1U) + +#define S_TX_MORE 15 +#define V_TX_MORE(x) ((x) << S_TX_MORE) +#define F_TX_MORE V_TX_MORE(1U) + +/* additional tx_data_wr.flags fields */ +#define S_TX_CPU_IDX 0 +#define M_TX_CPU_IDX 0x3F +#define V_TX_CPU_IDX(x) ((x) << S_TX_CPU_IDX) +#define G_TX_CPU_IDX(x) (((x) >> S_TX_CPU_IDX) & M_TX_CPU_IDX) + +#define S_TX_URG 16 +#define V_TX_URG(x) ((x) << S_TX_URG) +#define F_TX_URG V_TX_URG(1U) + +#define S_TX_CLOSE 17 +#define V_TX_CLOSE(x) ((x) << S_TX_CLOSE) +#define F_TX_CLOSE V_TX_CLOSE(1U) + +#define S_TX_INIT 18 +#define V_TX_INIT(x) ((x) << S_TX_INIT) +#define F_TX_INIT V_TX_INIT(1U) + +#define S_TX_IMM_ACK 19 +#define V_TX_IMM_ACK(x) ((x) << S_TX_IMM_ACK) +#define F_TX_IMM_ACK V_TX_IMM_ACK(1U) + +#define S_TX_IMM_DMA 20 +#define V_TX_IMM_DMA(x) ((x) << S_TX_IMM_DMA) +#define F_TX_IMM_DMA V_TX_IMM_DMA(1U) + +struct cpl_tx_data_ack { + RSS_HDR union opcode_tid ot; + __be32 ack_seq; +}; + +struct cpl_wr_ack { + RSS_HDR union opcode_tid ot; + __be16 credits; + __be16 rsvd; + __be32 snd_nxt; + __be32 snd_una; +}; + +struct cpl_rdma_ec_status { + RSS_HDR union opcode_tid ot; + __u8 rsvd[3]; + __u8 status; +}; + +struct mngt_pktsched_wr { + __be32 wr_hi; + __be32 wr_lo; + __u8 mngt_opcode; + __u8 rsvd[7]; + __u8 sched; + __u8 idx; + __u8 min; + __u8 max; + __u8 binding; + __u8 rsvd1[3]; +}; + +struct cpl_iscsi_hdr { + RSS_HDR union opcode_tid ot; + __be16 pdu_len_ddp; + __be16 len; + __be32 seq; + __be16 urg; + __u8 rsvd; + __u8 status; +}; + +/* cpl_iscsi_hdr.pdu_len_ddp fields */ +#define S_ISCSI_PDU_LEN 0 +#define M_ISCSI_PDU_LEN 0x7FFF +#define V_ISCSI_PDU_LEN(x) ((x) << S_ISCSI_PDU_LEN) +#define G_ISCSI_PDU_LEN(x) (((x) >> S_ISCSI_PDU_LEN) & M_ISCSI_PDU_LEN) + +#define S_ISCSI_DDP 15 +#define V_ISCSI_DDP(x) ((x) << S_ISCSI_DDP) +#define F_ISCSI_DDP V_ISCSI_DDP(1U) + +struct cpl_rx_data { + RSS_HDR union opcode_tid ot; + __be16 rsvd; + __be16 len; + __be32 seq; + __be16 urg; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 dack_mode:2; + __u8 psh:1; + __u8 heartbeat:1; + __u8:4; +#else + __u8:4; + __u8 heartbeat:1; + __u8 psh:1; + __u8 dack_mode:2; +#endif + __u8 status; +}; + +struct cpl_rx_data_ack { + WR_HDR; + union opcode_tid ot; + __be32 credit_dack; +}; + +/* cpl_rx_data_ack.ack_seq fields */ +#define 
S_RX_CREDITS 0 +#define M_RX_CREDITS 0x7FFFFFF +#define V_RX_CREDITS(x) ((x) << S_RX_CREDITS) +#define G_RX_CREDITS(x) (((x) >> S_RX_CREDITS) & M_RX_CREDITS) + +#define S_RX_MODULATE 27 +#define V_RX_MODULATE(x) ((x) << S_RX_MODULATE) +#define F_RX_MODULATE V_RX_MODULATE(1U) + +#define S_RX_FORCE_ACK 28 +#define V_RX_FORCE_ACK(x) ((x) << S_RX_FORCE_ACK) +#define F_RX_FORCE_ACK V_RX_FORCE_ACK(1U) + +#define S_RX_DACK_MODE 29 +#define M_RX_DACK_MODE 0x3 +#define V_RX_DACK_MODE(x) ((x) << S_RX_DACK_MODE) +#define G_RX_DACK_MODE(x) (((x) >> S_RX_DACK_MODE) & M_RX_DACK_MODE) + +#define S_RX_DACK_CHANGE 31 +#define V_RX_DACK_CHANGE(x) ((x) << S_RX_DACK_CHANGE) +#define F_RX_DACK_CHANGE V_RX_DACK_CHANGE(1U) + +struct cpl_rx_urg_notify { + RSS_HDR union opcode_tid ot; + __be32 seq; +}; + +struct cpl_rx_ddp_complete { + RSS_HDR union opcode_tid ot; + __be32 ddp_report; +}; + +struct cpl_rx_data_ddp { + RSS_HDR union opcode_tid ot; + __be16 urg; + __be16 len; + __be32 seq; + union { + __be32 nxt_seq; + __be32 ddp_report; + }; + __be32 ulp_crc; + __be32 ddpvld_status; +}; + +/* cpl_rx_data_ddp.ddpvld_status fields */ +#define S_DDP_STATUS 0 +#define M_DDP_STATUS 0xFF +#define V_DDP_STATUS(x) ((x) << S_DDP_STATUS) +#define G_DDP_STATUS(x) (((x) >> S_DDP_STATUS) & M_DDP_STATUS) + +#define S_DDP_VALID 15 +#define M_DDP_VALID 0x1FFFF +#define V_DDP_VALID(x) ((x) << S_DDP_VALID) +#define G_DDP_VALID(x) (((x) >> S_DDP_VALID) & M_DDP_VALID) + +#define S_DDP_PPOD_MISMATCH 15 +#define V_DDP_PPOD_MISMATCH(x) ((x) << S_DDP_PPOD_MISMATCH) +#define F_DDP_PPOD_MISMATCH V_DDP_PPOD_MISMATCH(1U) + +#define S_DDP_PDU 16 +#define V_DDP_PDU(x) ((x) << S_DDP_PDU) +#define F_DDP_PDU V_DDP_PDU(1U) + +#define S_DDP_LLIMIT_ERR 17 +#define V_DDP_LLIMIT_ERR(x) ((x) << S_DDP_LLIMIT_ERR) +#define F_DDP_LLIMIT_ERR V_DDP_LLIMIT_ERR(1U) + +#define S_DDP_PPOD_PARITY_ERR 18 +#define V_DDP_PPOD_PARITY_ERR(x) ((x) << S_DDP_PPOD_PARITY_ERR) +#define F_DDP_PPOD_PARITY_ERR V_DDP_PPOD_PARITY_ERR(1U) + +#define S_DDP_PADDING_ERR 19 +#define V_DDP_PADDING_ERR(x) ((x) << S_DDP_PADDING_ERR) +#define F_DDP_PADDING_ERR V_DDP_PADDING_ERR(1U) + +#define S_DDP_HDRCRC_ERR 20 +#define V_DDP_HDRCRC_ERR(x) ((x) << S_DDP_HDRCRC_ERR) +#define F_DDP_HDRCRC_ERR V_DDP_HDRCRC_ERR(1U) + +#define S_DDP_DATACRC_ERR 21 +#define V_DDP_DATACRC_ERR(x) ((x) << S_DDP_DATACRC_ERR) +#define F_DDP_DATACRC_ERR V_DDP_DATACRC_ERR(1U) + +#define S_DDP_INVALID_TAG 22 +#define V_DDP_INVALID_TAG(x) ((x) << S_DDP_INVALID_TAG) +#define F_DDP_INVALID_TAG V_DDP_INVALID_TAG(1U) + +#define S_DDP_ULIMIT_ERR 23 +#define V_DDP_ULIMIT_ERR(x) ((x) << S_DDP_ULIMIT_ERR) +#define F_DDP_ULIMIT_ERR V_DDP_ULIMIT_ERR(1U) + +#define S_DDP_OFFSET_ERR 24 +#define V_DDP_OFFSET_ERR(x) ((x) << S_DDP_OFFSET_ERR) +#define F_DDP_OFFSET_ERR V_DDP_OFFSET_ERR(1U) + +#define S_DDP_COLOR_ERR 25 +#define V_DDP_COLOR_ERR(x) ((x) << S_DDP_COLOR_ERR) +#define F_DDP_COLOR_ERR V_DDP_COLOR_ERR(1U) + +#define S_DDP_TID_MISMATCH 26 +#define V_DDP_TID_MISMATCH(x) ((x) << S_DDP_TID_MISMATCH) +#define F_DDP_TID_MISMATCH V_DDP_TID_MISMATCH(1U) + +#define S_DDP_INVALID_PPOD 27 +#define V_DDP_INVALID_PPOD(x) ((x) << S_DDP_INVALID_PPOD) +#define F_DDP_INVALID_PPOD V_DDP_INVALID_PPOD(1U) + +#define S_DDP_ULP_MODE 28 +#define M_DDP_ULP_MODE 0xF +#define V_DDP_ULP_MODE(x) ((x) << S_DDP_ULP_MODE) +#define G_DDP_ULP_MODE(x) (((x) >> S_DDP_ULP_MODE) & M_DDP_ULP_MODE) + +/* cpl_rx_data_ddp.ddp_report fields */ +#define S_DDP_OFFSET 0 +#define M_DDP_OFFSET 0x3FFFFF +#define V_DDP_OFFSET(x) ((x) << S_DDP_OFFSET) +#define 
G_DDP_OFFSET(x) (((x) >> S_DDP_OFFSET) & M_DDP_OFFSET) + +#define S_DDP_URG 24 +#define V_DDP_URG(x) ((x) << S_DDP_URG) +#define F_DDP_URG V_DDP_URG(1U) + +#define S_DDP_PSH 25 +#define V_DDP_PSH(x) ((x) << S_DDP_PSH) +#define F_DDP_PSH V_DDP_PSH(1U) + +#define S_DDP_BUF_COMPLETE 26 +#define V_DDP_BUF_COMPLETE(x) ((x) << S_DDP_BUF_COMPLETE) +#define F_DDP_BUF_COMPLETE V_DDP_BUF_COMPLETE(1U) + +#define S_DDP_BUF_TIMED_OUT 27 +#define V_DDP_BUF_TIMED_OUT(x) ((x) << S_DDP_BUF_TIMED_OUT) +#define F_DDP_BUF_TIMED_OUT V_DDP_BUF_TIMED_OUT(1U) + +#define S_DDP_BUF_IDX 28 +#define V_DDP_BUF_IDX(x) ((x) << S_DDP_BUF_IDX) +#define F_DDP_BUF_IDX V_DDP_BUF_IDX(1U) + +struct cpl_tx_pkt { + WR_HDR; + __be32 cntrl; + __be32 len; +}; + +struct cpl_tx_pkt_lso { + WR_HDR; + __be32 cntrl; + __be32 len; + + __be32 rsvd; + __be32 lso_info; +}; + +/* cpl_tx_pkt*.cntrl fields */ +#define S_TXPKT_VLAN 0 +#define M_TXPKT_VLAN 0xFFFF +#define V_TXPKT_VLAN(x) ((x) << S_TXPKT_VLAN) +#define G_TXPKT_VLAN(x) (((x) >> S_TXPKT_VLAN) & M_TXPKT_VLAN) + +#define S_TXPKT_INTF 16 +#define M_TXPKT_INTF 0xF +#define V_TXPKT_INTF(x) ((x) << S_TXPKT_INTF) +#define G_TXPKT_INTF(x) (((x) >> S_TXPKT_INTF) & M_TXPKT_INTF) + +#define S_TXPKT_IPCSUM_DIS 20 +#define V_TXPKT_IPCSUM_DIS(x) ((x) << S_TXPKT_IPCSUM_DIS) +#define F_TXPKT_IPCSUM_DIS V_TXPKT_IPCSUM_DIS(1U) + +#define S_TXPKT_L4CSUM_DIS 21 +#define V_TXPKT_L4CSUM_DIS(x) ((x) << S_TXPKT_L4CSUM_DIS) +#define F_TXPKT_L4CSUM_DIS V_TXPKT_L4CSUM_DIS(1U) + +#define S_TXPKT_VLAN_VLD 22 +#define V_TXPKT_VLAN_VLD(x) ((x) << S_TXPKT_VLAN_VLD) +#define F_TXPKT_VLAN_VLD V_TXPKT_VLAN_VLD(1U) + +#define S_TXPKT_LOOPBACK 23 +#define V_TXPKT_LOOPBACK(x) ((x) << S_TXPKT_LOOPBACK) +#define F_TXPKT_LOOPBACK V_TXPKT_LOOPBACK(1U) + +#define S_TXPKT_OPCODE 24 +#define M_TXPKT_OPCODE 0xFF +#define V_TXPKT_OPCODE(x) ((x) << S_TXPKT_OPCODE) +#define G_TXPKT_OPCODE(x) (((x) >> S_TXPKT_OPCODE) & M_TXPKT_OPCODE) + +/* cpl_tx_pkt_lso.lso_info fields */ +#define S_LSO_MSS 0 +#define M_LSO_MSS 0x3FFF +#define V_LSO_MSS(x) ((x) << S_LSO_MSS) +#define G_LSO_MSS(x) (((x) >> S_LSO_MSS) & M_LSO_MSS) + +#define S_LSO_ETH_TYPE 14 +#define M_LSO_ETH_TYPE 0x3 +#define V_LSO_ETH_TYPE(x) ((x) << S_LSO_ETH_TYPE) +#define G_LSO_ETH_TYPE(x) (((x) >> S_LSO_ETH_TYPE) & M_LSO_ETH_TYPE) + +#define S_LSO_TCPHDR_WORDS 16 +#define M_LSO_TCPHDR_WORDS 0xF +#define V_LSO_TCPHDR_WORDS(x) ((x) << S_LSO_TCPHDR_WORDS) +#define G_LSO_TCPHDR_WORDS(x) (((x) >> S_LSO_TCPHDR_WORDS) & M_LSO_TCPHDR_WORDS) + +#define S_LSO_IPHDR_WORDS 20 +#define M_LSO_IPHDR_WORDS 0xF +#define V_LSO_IPHDR_WORDS(x) ((x) << S_LSO_IPHDR_WORDS) +#define G_LSO_IPHDR_WORDS(x) (((x) >> S_LSO_IPHDR_WORDS) & M_LSO_IPHDR_WORDS) + +#define S_LSO_IPV6 24 +#define V_LSO_IPV6(x) ((x) << S_LSO_IPV6) +#define F_LSO_IPV6 V_LSO_IPV6(1U) + +struct cpl_trace_pkt { +#ifdef CHELSIO_FW + __u8 rss_opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 err:1; + __u8:7; +#else + __u8:7; + __u8 err:1; +#endif + __u8 rsvd0; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 qid:4; + __u8:4; +#else + __u8:4; + __u8 qid:4; +#endif + __be32 tstamp; +#endif /* CHELSIO_FW */ + + __u8 opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 iff:4; + __u8:4; +#else + __u8:4; + __u8 iff:4; +#endif + __u8 rsvd[4]; + __be16 len; +}; + +struct cpl_rx_pkt { + RSS_HDR __u8 opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 iff:4; + __u8 csum_valid:1; + __u8 ipmi_pkt:1; + __u8 vlan_valid:1; + __u8 fragment:1; +#else + __u8 fragment:1; + __u8 vlan_valid:1; + __u8 ipmi_pkt:1; + __u8 csum_valid:1; + __u8 iff:4; 
+#endif + __be16 csum; + __be16 vlan; + __be16 len; +}; + +struct cpl_l2t_write_req { + WR_HDR; + union opcode_tid ot; + __be32 params; + __u8 rsvd[2]; + __u8 dst_mac[6]; +}; + +/* cpl_l2t_write_req.params fields */ +#define S_L2T_W_IDX 0 +#define M_L2T_W_IDX 0x7FF +#define V_L2T_W_IDX(x) ((x) << S_L2T_W_IDX) +#define G_L2T_W_IDX(x) (((x) >> S_L2T_W_IDX) & M_L2T_W_IDX) + +#define S_L2T_W_VLAN 11 +#define M_L2T_W_VLAN 0xFFF +#define V_L2T_W_VLAN(x) ((x) << S_L2T_W_VLAN) +#define G_L2T_W_VLAN(x) (((x) >> S_L2T_W_VLAN) & M_L2T_W_VLAN) + +#define S_L2T_W_IFF 23 +#define M_L2T_W_IFF 0xF +#define V_L2T_W_IFF(x) ((x) << S_L2T_W_IFF) +#define G_L2T_W_IFF(x) (((x) >> S_L2T_W_IFF) & M_L2T_W_IFF) + +#define S_L2T_W_PRIO 27 +#define M_L2T_W_PRIO 0x7 +#define V_L2T_W_PRIO(x) ((x) << S_L2T_W_PRIO) +#define G_L2T_W_PRIO(x) (((x) >> S_L2T_W_PRIO) & M_L2T_W_PRIO) + +struct cpl_l2t_write_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd[3]; +}; + +struct cpl_l2t_read_req { + WR_HDR; + union opcode_tid ot; + __be16 rsvd; + __be16 l2t_idx; +}; + +struct cpl_l2t_read_rpl { + RSS_HDR union opcode_tid ot; + __be32 params; + __u8 rsvd[2]; + __u8 dst_mac[6]; +}; + +/* cpl_l2t_read_rpl.params fields */ +#define S_L2T_R_PRIO 0 +#define M_L2T_R_PRIO 0x7 +#define V_L2T_R_PRIO(x) ((x) << S_L2T_R_PRIO) +#define G_L2T_R_PRIO(x) (((x) >> S_L2T_R_PRIO) & M_L2T_R_PRIO) + +#define S_L2T_R_VLAN 8 +#define M_L2T_R_VLAN 0xFFF +#define V_L2T_R_VLAN(x) ((x) << S_L2T_R_VLAN) +#define G_L2T_R_VLAN(x) (((x) >> S_L2T_R_VLAN) & M_L2T_R_VLAN) + +#define S_L2T_R_IFF 20 +#define M_L2T_R_IFF 0xF +#define V_L2T_R_IFF(x) ((x) << S_L2T_R_IFF) +#define G_L2T_R_IFF(x) (((x) >> S_L2T_R_IFF) & M_L2T_R_IFF) + +#define S_L2T_STATUS 24 +#define M_L2T_STATUS 0xFF +#define V_L2T_STATUS(x) ((x) << S_L2T_STATUS) +#define G_L2T_STATUS(x) (((x) >> S_L2T_STATUS) & M_L2T_STATUS) + +struct cpl_smt_write_req { + WR_HDR; + union opcode_tid ot; + __u8 rsvd0; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 mtu_idx:4; + __u8 iff:4; +#else + __u8 iff:4; + __u8 mtu_idx:4; +#endif + __be16 rsvd2; + __be16 rsvd3; + __u8 src_mac1[6]; + __be16 rsvd4; + __u8 src_mac0[6]; +}; + +struct cpl_smt_write_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd[3]; +}; + +struct cpl_smt_read_req { + WR_HDR; + union opcode_tid ot; + __u8 rsvd0; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:4; + __u8 iff:4; +#else + __u8 iff:4; + __u8:4; +#endif + __be16 rsvd2; +}; + +struct cpl_smt_read_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 mtu_idx:4; + __u8:4; +#else + __u8:4; + __u8 mtu_idx:4; +#endif + __be16 rsvd2; + __be16 rsvd3; + __u8 src_mac1[6]; + __be16 rsvd4; + __u8 src_mac0[6]; +}; + +struct cpl_rte_delete_req { + WR_HDR; + union opcode_tid ot; + __be32 params; +}; + +/* { cpl_rte_delete_req, cpl_rte_read_req }.params fields */ +#define S_RTE_REQ_LUT_IX 8 +#define M_RTE_REQ_LUT_IX 0x7FF +#define V_RTE_REQ_LUT_IX(x) ((x) << S_RTE_REQ_LUT_IX) +#define G_RTE_REQ_LUT_IX(x) (((x) >> S_RTE_REQ_LUT_IX) & M_RTE_REQ_LUT_IX) + +#define S_RTE_REQ_LUT_BASE 19 +#define M_RTE_REQ_LUT_BASE 0x7FF +#define V_RTE_REQ_LUT_BASE(x) ((x) << S_RTE_REQ_LUT_BASE) +#define G_RTE_REQ_LUT_BASE(x) (((x) >> S_RTE_REQ_LUT_BASE) & M_RTE_REQ_LUT_BASE) + +#define S_RTE_READ_REQ_SELECT 31 +#define V_RTE_READ_REQ_SELECT(x) ((x) << S_RTE_READ_REQ_SELECT) +#define F_RTE_READ_REQ_SELECT V_RTE_READ_REQ_SELECT(1U) + +struct cpl_rte_delete_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd[3]; +}; + +struct cpl_rte_write_req { + 
WR_HDR; + union opcode_tid ot; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:6; + __u8 write_tcam:1; + __u8 write_l2t_lut:1; +#else + __u8 write_l2t_lut:1; + __u8 write_tcam:1; + __u8:6; +#endif + __u8 rsvd[3]; + __be32 lut_params; + __be16 rsvd2; + __be16 l2t_idx; + __be32 netmask; + __be32 faddr; +}; + +/* cpl_rte_write_req.lut_params fields */ +#define S_RTE_WRITE_REQ_LUT_IX 10 +#define M_RTE_WRITE_REQ_LUT_IX 0x7FF +#define V_RTE_WRITE_REQ_LUT_IX(x) ((x) << S_RTE_WRITE_REQ_LUT_IX) +#define G_RTE_WRITE_REQ_LUT_IX(x) (((x) >> S_RTE_WRITE_REQ_LUT_IX) & M_RTE_WRITE_REQ_LUT_IX) + +#define S_RTE_WRITE_REQ_LUT_BASE 21 +#define M_RTE_WRITE_REQ_LUT_BASE 0x7FF +#define V_RTE_WRITE_REQ_LUT_BASE(x) ((x) << S_RTE_WRITE_REQ_LUT_BASE) +#define G_RTE_WRITE_REQ_LUT_BASE(x) (((x) >> S_RTE_WRITE_REQ_LUT_BASE) & M_RTE_WRITE_REQ_LUT_BASE) + +struct cpl_rte_write_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd[3]; +}; + +struct cpl_rte_read_req { + WR_HDR; + union opcode_tid ot; + __be32 params; +}; + +struct cpl_rte_read_rpl { + RSS_HDR union opcode_tid ot; + __u8 status; + __u8 rsvd0; + __be16 l2t_idx; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:7; + __u8 select:1; +#else + __u8 select:1; + __u8:7; +#endif + __u8 rsvd2[3]; + __be32 addr; +}; + +struct cpl_tid_release { + WR_HDR; + union opcode_tid ot; + __be32 rsvd; +}; + +struct cpl_barrier { + WR_HDR; + __u8 opcode; + __u8 rsvd[7]; +}; + +struct cpl_rdma_read_req { + __u8 opcode; + __u8 rsvd[15]; +}; + +struct cpl_rdma_terminate { +#ifdef CHELSIO_FW + __u8 opcode; + __u8 rsvd[2]; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 rspq:3; + __u8:5; +#else + __u8:5; + __u8 rspq:3; +#endif + __be32 tid_len; +#endif + __be32 msn; + __be32 mo; + __u8 data[0]; +}; + +/* cpl_rdma_terminate.tid_len fields */ +#define S_FLIT_CNT 0 +#define M_FLIT_CNT 0xFF +#define V_FLIT_CNT(x) ((x) << S_FLIT_CNT) +#define G_FLIT_CNT(x) (((x) >> S_FLIT_CNT) & M_FLIT_CNT) + +#define S_TERM_TID 8 +#define M_TERM_TID 0xFFFFF +#define V_TERM_TID(x) ((x) << S_TERM_TID) +#define G_TERM_TID(x) (((x) >> S_TERM_TID) & M_TERM_TID) + +/* ULP_TX opcodes */ +enum { ULP_MEM_READ = 2, ULP_MEM_WRITE = 3, ULP_TXPKT = 4 }; + +#define S_ULPTX_CMD 28 +#define M_ULPTX_CMD 0xF +#define V_ULPTX_CMD(x) ((x) << S_ULPTX_CMD) + +#define S_ULPTX_NFLITS 0 +#define M_ULPTX_NFLITS 0xFF +#define V_ULPTX_NFLITS(x) ((x) << S_ULPTX_NFLITS) + +struct ulp_mem_io { + WR_HDR; + __be32 cmd_lock_addr; + __be32 len; +}; + +/* ulp_mem_io.cmd_lock_addr fields */ +#define S_ULP_MEMIO_ADDR 0 +#define M_ULP_MEMIO_ADDR 0x7FFFFFF +#define V_ULP_MEMIO_ADDR(x) ((x) << S_ULP_MEMIO_ADDR) +#define S_ULP_MEMIO_LOCK 27 +#define V_ULP_MEMIO_LOCK(x) ((x) << S_ULP_MEMIO_LOCK) +#define F_ULP_MEMIO_LOCK V_ULP_MEMIO_LOCK(1U) + +/* ulp_mem_io.len fields */ +#define S_ULP_MEMIO_DATA_LEN 28 +#define M_ULP_MEMIO_DATA_LEN 0xF +#define V_ULP_MEMIO_DATA_LEN(x) ((x) << S_ULP_MEMIO_DATA_LEN) + +#endif /* T3_CPL_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c new file mode 100644 index 0000000..080918a --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c @@ -0,0 +1,3811 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "common.h" +#include "regs.h" +#include "sge_defs.h" +#include "firmware_exports.h" + +static void t3_port_intr_clear(struct adapter *adapter, int idx); + +/** + * t3_wait_op_done_val - wait until an operation is completed + * @adapter: the adapter performing the operation + * @reg: the register to check for completion + * @mask: a single-bit field within @reg that indicates completion + * @polarity: the value of the field when the operation is completed + * @attempts: number of check iterations + * @delay: delay in usecs between iterations + * @valp: where to store the value of the register at completion time + * + * Wait until an operation is completed by checking a bit in a register + * up to @attempts times. If @valp is not NULL the value of the register + * at the time it indicated completion is stored there. Returns 0 if the + * operation completes and -EAGAIN otherwise. + */ + +int t3_wait_op_done_val(struct adapter *adapter, int reg, u32 mask, + int polarity, int attempts, int delay, u32 *valp) +{ + while (1) { + u32 val = t3_read_reg(adapter, reg); + + if (!!(val & mask) == polarity) { + if (valp) + *valp = val; + return 0; + } + if (--attempts == 0) + return -EAGAIN; + if (delay) + udelay(delay); + } +} + +/** + * t3_write_regs - write a bunch of registers + * @adapter: the adapter to program + * @p: an array of register address/register value pairs + * @n: the number of address/value pairs + * @offset: register address offset + * + * Takes an array of register address/register value pairs and writes each + * value to the corresponding register. Register addresses are adjusted + * by the supplied offset. + */ +void t3_write_regs(struct adapter *adapter, const struct addr_val_pair *p, + int n, unsigned int offset) +{ + while (n--) { + t3_write_reg(adapter, p->reg_addr + offset, p->val); + p++; + } +} + +/** + * t3_set_reg_field - set a register field to a value + * @adapter: the adapter to program + * @addr: the register address + * @mask: specifies the portion of the register to modify + * @val: the new value for the register field + * + * Sets a register field specified by the supplied mask to the + * given value. 
+ */ +void t3_set_reg_field(struct adapter *adapter, unsigned int addr, u32 mask, + u32 val) +{ + u32 v = t3_read_reg(adapter, addr) & ~mask; + + t3_write_reg(adapter, addr, v | val); + t3_read_reg(adapter, addr); /* flush */ +} + +/** + * t3_read_indirect - read indirectly addressed registers + * @adap: the adapter + * @addr_reg: register holding the indirect address + * @data_reg: register holding the value of the indirect register + * @vals: where the read register values are stored + * @start_idx: index of first indirect register to read + * @nregs: how many indirect registers to read + * + * Reads registers that are accessed indirectly through an address/data + * register pair. + */ +static void t3_read_indirect(struct adapter *adap, unsigned int addr_reg, + unsigned int data_reg, u32 *vals, + unsigned int nregs, unsigned int start_idx) +{ + while (nregs--) { + t3_write_reg(adap, addr_reg, start_idx); + *vals++ = t3_read_reg(adap, data_reg); + start_idx++; + } +} + +/** + * t3_mc7_bd_read - read from MC7 through backdoor accesses + * @mc7: identifies MC7 to read from + * @start: index of first 64-bit word to read + * @n: number of 64-bit words to read + * @buf: where to store the read result + * + * Read n 64-bit words from MC7 starting at word start, using backdoor + * accesses. + */ +int t3_mc7_bd_read(struct mc7 *mc7, unsigned int start, unsigned int n, + u64 *buf) +{ + static const int shift[] = { 0, 0, 16, 24 }; + static const int step[] = { 0, 32, 16, 8 }; + + unsigned int size64 = mc7->size / 8; /* # of 64-bit words */ + struct adapter *adap = mc7->adapter; + + if (start >= size64 || start + n > size64) + return -EINVAL; + + start *= (8 << mc7->width); + while (n--) { + int i; + u64 val64 = 0; + + for (i = (1 << mc7->width) - 1; i >= 0; --i) { + int attempts = 10; + u32 val; + + t3_write_reg(adap, mc7->offset + A_MC7_BD_ADDR, start); + t3_write_reg(adap, mc7->offset + A_MC7_BD_OP, 0); + val = t3_read_reg(adap, mc7->offset + A_MC7_BD_OP); + while ((val & F_BUSY) && attempts--) + val = t3_read_reg(adap, + mc7->offset + A_MC7_BD_OP); + if (val & F_BUSY) + return -EIO; + + val = t3_read_reg(adap, mc7->offset + A_MC7_BD_DATA1); + if (mc7->width == 0) { + val64 = t3_read_reg(adap, + mc7->offset + + A_MC7_BD_DATA0); + val64 |= (u64) val << 32; + } else { + if (mc7->width > 1) + val >>= shift[mc7->width]; + val64 |= (u64) val << (step[mc7->width] * i); + } + start += 8; + } + *buf++ = val64; + } + return 0; +} + +/* + * Initialize MI1. + */ +static void mi1_init(struct adapter *adap, const struct adapter_info *ai) +{ + u32 clkdiv = adap->params.vpd.cclk / (2 * adap->params.vpd.mdc) - 1; + u32 val = F_PREEN | V_CLKDIV(clkdiv); + + t3_write_reg(adap, A_MI1_CFG, val); +} + +#define MDIO_ATTEMPTS 20 + +/* + * MI1 read/write operations for clause 22 PHYs. 
+ */ +static int t3_mi1_read(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + u32 addr = V_REGADDR(reg_addr) | V_PHYADDR(phy_addr); + + mutex_lock(&adapter->mdio_lock); + t3_set_reg_field(adapter, A_MI1_CFG, V_ST(M_ST), V_ST(1)); + t3_write_reg(adapter, A_MI1_ADDR, addr); + t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(2)); + ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, MDIO_ATTEMPTS, 10); + if (!ret) + ret = t3_read_reg(adapter, A_MI1_DATA); + mutex_unlock(&adapter->mdio_lock); + return ret; +} + +static int t3_mi1_write(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr, u16 val) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + u32 addr = V_REGADDR(reg_addr) | V_PHYADDR(phy_addr); + + mutex_lock(&adapter->mdio_lock); + t3_set_reg_field(adapter, A_MI1_CFG, V_ST(M_ST), V_ST(1)); + t3_write_reg(adapter, A_MI1_ADDR, addr); + t3_write_reg(adapter, A_MI1_DATA, val); + t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(1)); + ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, MDIO_ATTEMPTS, 10); + mutex_unlock(&adapter->mdio_lock); + return ret; +} + +static const struct mdio_ops mi1_mdio_ops = { + .read = t3_mi1_read, + .write = t3_mi1_write, + .mode_support = MDIO_SUPPORTS_C22 +}; + +/* + * Performs the address cycle for clause 45 PHYs. + * Must be called with the MDIO_LOCK held. + */ +static int mi1_wr_addr(struct adapter *adapter, int phy_addr, int mmd_addr, + int reg_addr) +{ + u32 addr = V_REGADDR(mmd_addr) | V_PHYADDR(phy_addr); + + t3_set_reg_field(adapter, A_MI1_CFG, V_ST(M_ST), 0); + t3_write_reg(adapter, A_MI1_ADDR, addr); + t3_write_reg(adapter, A_MI1_DATA, reg_addr); + t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(0)); + return t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, + MDIO_ATTEMPTS, 10); +} + +/* + * MI1 read/write operations for indirect-addressed PHYs. 
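A minimal sketch of how a caller inside this file might use the indirect (clause-45) path defined just below. The function name example_read_pma_status is hypothetical; MDIO_MMD_PMAPMD and MDIO_STAT1 are the standard constants from <linux/mdio.h>.

static int example_read_pma_status(struct net_device *dev, int phy_addr)
{
	/*
	 * Clause-45 access is a two-step protocol: mi1_ext_read() first
	 * latches the register number with an address cycle (mi1_wr_addr)
	 * and then issues the read.  The return value is the register
	 * contents, or a negative error.
	 */
	return mi1_ext_read(dev, phy_addr, MDIO_MMD_PMAPMD, MDIO_STAT1);
}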
+ */ +static int mi1_ext_read(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + + mutex_lock(&adapter->mdio_lock); + ret = mi1_wr_addr(adapter, phy_addr, mmd_addr, reg_addr); + if (!ret) { + t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(3)); + ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, + MDIO_ATTEMPTS, 10); + if (!ret) + ret = t3_read_reg(adapter, A_MI1_DATA); + } + mutex_unlock(&adapter->mdio_lock); + return ret; +} + +static int mi1_ext_write(struct net_device *dev, int phy_addr, int mmd_addr, + u16 reg_addr, u16 val) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + + mutex_lock(&adapter->mdio_lock); + ret = mi1_wr_addr(adapter, phy_addr, mmd_addr, reg_addr); + if (!ret) { + t3_write_reg(adapter, A_MI1_DATA, val); + t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(1)); + ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, + MDIO_ATTEMPTS, 10); + } + mutex_unlock(&adapter->mdio_lock); + return ret; +} + +static const struct mdio_ops mi1_mdio_ext_ops = { + .read = mi1_ext_read, + .write = mi1_ext_write, + .mode_support = MDIO_SUPPORTS_C45 | MDIO_EMULATE_C22 +}; + +/** + * t3_mdio_change_bits - modify the value of a PHY register + * @phy: the PHY to operate on + * @mmd: the device address + * @reg: the register address + * @clear: what part of the register value to mask off + * @set: what part of the register value to set + * + * Changes the value of a PHY register by applying a mask to its current + * value and ORing the result with a new value. + */ +int t3_mdio_change_bits(struct cphy *phy, int mmd, int reg, unsigned int clear, + unsigned int set) +{ + int ret; + unsigned int val; + + ret = t3_mdio_read(phy, mmd, reg, &val); + if (!ret) { + val &= ~clear; + ret = t3_mdio_write(phy, mmd, reg, val | set); + } + return ret; +} + +/** + * t3_phy_reset - reset a PHY block + * @phy: the PHY to operate on + * @mmd: the device address of the PHY block to reset + * @wait: how long to wait for the reset to complete in 1ms increments + * + * Resets a PHY block and optionally waits for the reset to complete. + * @mmd should be 0 for 10/100/1000 PHYs and the device address to reset + * for 10G PHYs. + */ +int t3_phy_reset(struct cphy *phy, int mmd, int wait) +{ + int err; + unsigned int ctl; + + err = t3_mdio_change_bits(phy, mmd, MDIO_CTRL1, MDIO_CTRL1_LPOWER, + MDIO_CTRL1_RESET); + if (err || !wait) + return err; + + do { + err = t3_mdio_read(phy, mmd, MDIO_CTRL1, &ctl); + if (err) + return err; + ctl &= MDIO_CTRL1_RESET; + if (ctl) + msleep(1); + } while (ctl && --wait); + + return ctl ? -1 : 0; +} + +/** + * t3_phy_advertise - set the PHY advertisement registers for autoneg + * @phy: the PHY to operate on + * @advert: bitmap of capabilities the PHY should advertise + * + * Sets a 10/100/1000 PHY's advertisement registers to advertise the + * requested capabilities. 
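A hedged sketch of a hypothetical caller combining t3_phy_advertise() with the PHY's auto-negotiation hook, mirroring the pattern t3_link_start() uses further down. The function name and the chosen advertisement bits are illustrative only.

static int example_advertise_gige(struct cphy *phy)
{
	int err;

	/* Advertise gigabit full duplex plus symmetric pause. */
	err = t3_phy_advertise(phy, ADVERTISED_1000baseT_Full |
				    ADVERTISED_Pause);
	if (err)
		return err;

	/* Then (re)start auto-negotiation, as t3_link_start() does below. */
	phy->ops->autoneg_enable(phy);
	return 0;
}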
+ */ +int t3_phy_advertise(struct cphy *phy, unsigned int advert) +{ + int err; + unsigned int val = 0; + + err = t3_mdio_read(phy, MDIO_DEVAD_NONE, MII_CTRL1000, &val); + if (err) + return err; + + val &= ~(ADVERTISE_1000HALF | ADVERTISE_1000FULL); + if (advert & ADVERTISED_1000baseT_Half) + val |= ADVERTISE_1000HALF; + if (advert & ADVERTISED_1000baseT_Full) + val |= ADVERTISE_1000FULL; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_CTRL1000, val); + if (err) + return err; + + val = 1; + if (advert & ADVERTISED_10baseT_Half) + val |= ADVERTISE_10HALF; + if (advert & ADVERTISED_10baseT_Full) + val |= ADVERTISE_10FULL; + if (advert & ADVERTISED_100baseT_Half) + val |= ADVERTISE_100HALF; + if (advert & ADVERTISED_100baseT_Full) + val |= ADVERTISE_100FULL; + if (advert & ADVERTISED_Pause) + val |= ADVERTISE_PAUSE_CAP; + if (advert & ADVERTISED_Asym_Pause) + val |= ADVERTISE_PAUSE_ASYM; + return t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_ADVERTISE, val); +} + +/** + * t3_phy_advertise_fiber - set fiber PHY advertisement register + * @phy: the PHY to operate on + * @advert: bitmap of capabilities the PHY should advertise + * + * Sets a fiber PHY's advertisement register to advertise the + * requested capabilities. + */ +int t3_phy_advertise_fiber(struct cphy *phy, unsigned int advert) +{ + unsigned int val = 0; + + if (advert & ADVERTISED_1000baseT_Half) + val |= ADVERTISE_1000XHALF; + if (advert & ADVERTISED_1000baseT_Full) + val |= ADVERTISE_1000XFULL; + if (advert & ADVERTISED_Pause) + val |= ADVERTISE_1000XPAUSE; + if (advert & ADVERTISED_Asym_Pause) + val |= ADVERTISE_1000XPSE_ASYM; + return t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_ADVERTISE, val); +} + +/** + * t3_set_phy_speed_duplex - force PHY speed and duplex + * @phy: the PHY to operate on + * @speed: requested PHY speed + * @duplex: requested PHY duplex + * + * Force a 10/100/1000 PHY's speed and duplex. This also disables + * auto-negotiation except for GigE, where auto-negotiation is mandatory. + */ +int t3_set_phy_speed_duplex(struct cphy *phy, int speed, int duplex) +{ + int err; + unsigned int ctl; + + err = t3_mdio_read(phy, MDIO_DEVAD_NONE, MII_BMCR, &ctl); + if (err) + return err; + + if (speed >= 0) { + ctl &= ~(BMCR_SPEED100 | BMCR_SPEED1000 | BMCR_ANENABLE); + if (speed == SPEED_100) + ctl |= BMCR_SPEED100; + else if (speed == SPEED_1000) + ctl |= BMCR_SPEED1000; + } + if (duplex >= 0) { + ctl &= ~(BMCR_FULLDPLX | BMCR_ANENABLE); + if (duplex == DUPLEX_FULL) + ctl |= BMCR_FULLDPLX; + } + if (ctl & BMCR_SPEED1000) /* auto-negotiation required for GigE */ + ctl |= BMCR_ANENABLE; + return t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_BMCR, ctl); +} + +int t3_phy_lasi_intr_enable(struct cphy *phy) +{ + return t3_mdio_write(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL, + MDIO_PMA_LASI_LSALARM); +} + +int t3_phy_lasi_intr_disable(struct cphy *phy) +{ + return t3_mdio_write(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL, 0); +} + +int t3_phy_lasi_intr_clear(struct cphy *phy) +{ + u32 val; + + return t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_STAT, &val); +} + +int t3_phy_lasi_intr_handler(struct cphy *phy) +{ + unsigned int status; + int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_STAT, + &status); + + if (err) + return err; + return (status & MDIO_PMA_LASI_LSALARM) ? 
cphy_cause_link_change : 0; +} + +static const struct adapter_info t3_adap_info[] = { + {1, 1, 0, + F_GPIO2_OEN | F_GPIO4_OEN | + F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, { S_GPIO3, S_GPIO5 }, 0, + &mi1_mdio_ops, "Chelsio PE9000"}, + {1, 1, 0, + F_GPIO2_OEN | F_GPIO4_OEN | + F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, { S_GPIO3, S_GPIO5 }, 0, + &mi1_mdio_ops, "Chelsio T302"}, + {1, 0, 0, + F_GPIO1_OEN | F_GPIO6_OEN | F_GPIO7_OEN | F_GPIO10_OEN | + F_GPIO11_OEN | F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, + { 0 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + &mi1_mdio_ext_ops, "Chelsio T310"}, + {1, 1, 0, + F_GPIO1_OEN | F_GPIO2_OEN | F_GPIO4_OEN | F_GPIO5_OEN | F_GPIO6_OEN | + F_GPIO7_OEN | F_GPIO10_OEN | F_GPIO11_OEN | F_GPIO1_OUT_VAL | + F_GPIO5_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, + { S_GPIO9, S_GPIO3 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + &mi1_mdio_ext_ops, "Chelsio T320"}, + {}, + {}, + {1, 0, 0, + F_GPIO1_OEN | F_GPIO2_OEN | F_GPIO4_OEN | F_GPIO6_OEN | F_GPIO7_OEN | + F_GPIO10_OEN | F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, + { S_GPIO9 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + &mi1_mdio_ext_ops, "Chelsio T310" }, + {1, 0, 0, + F_GPIO1_OEN | F_GPIO6_OEN | F_GPIO7_OEN | + F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL, + { S_GPIO9 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + &mi1_mdio_ext_ops, "Chelsio N320E-G2" }, +}; + +/* + * Return the adapter_info structure with a given index. Out-of-range indices + * return NULL. + */ +const struct adapter_info *t3_get_adapter_info(unsigned int id) +{ + return id < ARRAY_SIZE(t3_adap_info) ? &t3_adap_info[id] : NULL; +} + +struct port_type_info { + int (*phy_prep)(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *ops); +}; + +static const struct port_type_info port_types[] = { + { NULL }, + { t3_ael1002_phy_prep }, + { t3_vsc8211_phy_prep }, + { NULL}, + { t3_xaui_direct_phy_prep }, + { t3_ael2005_phy_prep }, + { t3_qt2045_phy_prep }, + { t3_ael1006_phy_prep }, + { NULL }, + { t3_aq100x_phy_prep }, + { t3_ael2020_phy_prep }, +}; + +#define VPD_ENTRY(name, len) \ + u8 name##_kword[2]; u8 name##_len; u8 name##_data[len] + +/* + * Partial EEPROM Vital Product Data structure. Includes only the ID and + * VPD-R sections. + */ +struct t3_vpd { + u8 id_tag; + u8 id_len[2]; + u8 id_data[16]; + u8 vpdr_tag; + u8 vpdr_len[2]; + VPD_ENTRY(pn, 16); /* part number */ + VPD_ENTRY(ec, 16); /* EC level */ + VPD_ENTRY(sn, SERNUM_LEN); /* serial number */ + VPD_ENTRY(na, 12); /* MAC address base */ + VPD_ENTRY(cclk, 6); /* core clock */ + VPD_ENTRY(mclk, 6); /* mem clock */ + VPD_ENTRY(uclk, 6); /* uP clk */ + VPD_ENTRY(mdc, 6); /* MDIO clk */ + VPD_ENTRY(mt, 2); /* mem timing */ + VPD_ENTRY(xaui0cfg, 6); /* XAUI0 config */ + VPD_ENTRY(xaui1cfg, 6); /* XAUI1 config */ + VPD_ENTRY(port0, 2); /* PHY0 complex */ + VPD_ENTRY(port1, 2); /* PHY1 complex */ + VPD_ENTRY(port2, 2); /* PHY2 complex */ + VPD_ENTRY(port3, 2); /* PHY3 complex */ + VPD_ENTRY(rv, 1); /* csum */ + u32 pad; /* for multiple-of-4 sizing and alignment */ +}; + +#define EEPROM_MAX_POLL 40 +#define EEPROM_STAT_ADDR 0x4000 +#define VPD_BASE 0xc00 + +/** + * t3_seeprom_read - read a VPD EEPROM location + * @adapter: adapter to read + * @addr: EEPROM address + * @data: where to store the read data + * + * Read a 32-bit word from a location in VPD EEPROM using the card's PCI + * VPD ROM capability. A zero is written to the flag bit when the + * address is written to the control register. 
The hardware device will + * set the flag to 1 when 4 bytes have been read into the data register. + */ +int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data) +{ + u16 val; + int attempts = EEPROM_MAX_POLL; + u32 v; + unsigned int base = adapter->params.pci.vpd_cap_addr; + + if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3)) + return -EINVAL; + + pci_write_config_word(adapter->pdev, base + PCI_VPD_ADDR, addr); + do { + udelay(10); + pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val); + } while (!(val & PCI_VPD_ADDR_F) && --attempts); + + if (!(val & PCI_VPD_ADDR_F)) { + CH_ERR(adapter, "reading EEPROM address 0x%x failed\n", addr); + return -EIO; + } + pci_read_config_dword(adapter->pdev, base + PCI_VPD_DATA, &v); + *data = cpu_to_le32(v); + return 0; +} + +/** + * t3_seeprom_write - write a VPD EEPROM location + * @adapter: adapter to write + * @addr: EEPROM address + * @data: value to write + * + * Write a 32-bit word to a location in VPD EEPROM using the card's PCI + * VPD ROM capability. + */ +int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data) +{ + u16 val; + int attempts = EEPROM_MAX_POLL; + unsigned int base = adapter->params.pci.vpd_cap_addr; + + if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3)) + return -EINVAL; + + pci_write_config_dword(adapter->pdev, base + PCI_VPD_DATA, + le32_to_cpu(data)); + pci_write_config_word(adapter->pdev,base + PCI_VPD_ADDR, + addr | PCI_VPD_ADDR_F); + do { + msleep(1); + pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val); + } while ((val & PCI_VPD_ADDR_F) && --attempts); + + if (val & PCI_VPD_ADDR_F) { + CH_ERR(adapter, "write to EEPROM address 0x%x failed\n", addr); + return -EIO; + } + return 0; +} + +/** + * t3_seeprom_wp - enable/disable EEPROM write protection + * @adapter: the adapter + * @enable: 1 to enable write protection, 0 to disable it + * + * Enables or disables write protection on the serial EEPROM. + */ +int t3_seeprom_wp(struct adapter *adapter, int enable) +{ + return t3_seeprom_write(adapter, EEPROM_STAT_ADDR, enable ? 0xc : 0); +} + +static int vpdstrtouint(char *s, u8 len, unsigned int base, unsigned int *val) +{ + char tok[256]; + + memcpy(tok, s, len); + tok[len] = 0; + return kstrtouint(strim(tok), base, val); +} + +static int vpdstrtou16(char *s, u8 len, unsigned int base, u16 *val) +{ + char tok[256]; + + memcpy(tok, s, len); + tok[len] = 0; + return kstrtou16(strim(tok), base, val); +} + +/** + * get_vpd_params - read VPD parameters from VPD EEPROM + * @adapter: adapter to read + * @p: where to store the parameters + * + * Reads card parameters stored in VPD EEPROM. + */ +static int get_vpd_params(struct adapter *adapter, struct vpd_params *p) +{ + int i, addr, ret; + struct t3_vpd vpd; + + /* + * Card information is normally at VPD_BASE but some early cards had + * it at 0. + */ + ret = t3_seeprom_read(adapter, VPD_BASE, (__le32 *)&vpd); + if (ret) + return ret; + addr = vpd.id_tag == 0x82 ? 
VPD_BASE : 0; + + for (i = 0; i < sizeof(vpd); i += 4) { + ret = t3_seeprom_read(adapter, addr + i, + (__le32 *)((u8 *)&vpd + i)); + if (ret) + return ret; + } + + ret = vpdstrtouint(vpd.cclk_data, vpd.cclk_len, 10, &p->cclk); + if (ret) + return ret; + ret = vpdstrtouint(vpd.mclk_data, vpd.mclk_len, 10, &p->mclk); + if (ret) + return ret; + ret = vpdstrtouint(vpd.uclk_data, vpd.uclk_len, 10, &p->uclk); + if (ret) + return ret; + ret = vpdstrtouint(vpd.mdc_data, vpd.mdc_len, 10, &p->mdc); + if (ret) + return ret; + ret = vpdstrtouint(vpd.mt_data, vpd.mt_len, 10, &p->mem_timing); + if (ret) + return ret; + memcpy(p->sn, vpd.sn_data, SERNUM_LEN); + + /* Old eeproms didn't have port information */ + if (adapter->params.rev == 0 && !vpd.port0_data[0]) { + p->port_type[0] = uses_xaui(adapter) ? 1 : 2; + p->port_type[1] = uses_xaui(adapter) ? 6 : 2; + } else { + p->port_type[0] = hex_to_bin(vpd.port0_data[0]); + p->port_type[1] = hex_to_bin(vpd.port1_data[0]); + ret = vpdstrtou16(vpd.xaui0cfg_data, vpd.xaui0cfg_len, 16, + &p->xauicfg[0]); + if (ret) + return ret; + ret = vpdstrtou16(vpd.xaui1cfg_data, vpd.xaui1cfg_len, 16, + &p->xauicfg[1]); + if (ret) + return ret; + } + + ret = hex2bin(p->eth_base, vpd.na_data, 6); + if (ret < 0) + return -EINVAL; + return 0; +} + +/* serial flash and firmware constants */ +enum { + SF_ATTEMPTS = 5, /* max retries for SF1 operations */ + SF_SEC_SIZE = 64 * 1024, /* serial flash sector size */ + SF_SIZE = SF_SEC_SIZE * 8, /* serial flash size */ + + /* flash command opcodes */ + SF_PROG_PAGE = 2, /* program page */ + SF_WR_DISABLE = 4, /* disable writes */ + SF_RD_STATUS = 5, /* read status register */ + SF_WR_ENABLE = 6, /* enable writes */ + SF_RD_DATA_FAST = 0xb, /* read flash */ + SF_ERASE_SECTOR = 0xd8, /* erase sector */ + + FW_FLASH_BOOT_ADDR = 0x70000, /* start address of FW in flash */ + FW_VERS_ADDR = 0x7fffc, /* flash address holding FW version */ + FW_MIN_SIZE = 8 /* at least version and csum */ +}; + +/** + * sf1_read - read data from the serial flash + * @adapter: the adapter + * @byte_cnt: number of bytes to read + * @cont: whether another operation will be chained + * @valp: where to store the read data + * + * Reads up to 4 bytes of data from the serial flash. The location of + * the read needs to be specified prior to calling this by issuing the + * appropriate commands to the serial flash. + */ +static int sf1_read(struct adapter *adapter, unsigned int byte_cnt, int cont, + u32 *valp) +{ + int ret; + + if (!byte_cnt || byte_cnt > 4) + return -EINVAL; + if (t3_read_reg(adapter, A_SF_OP) & F_BUSY) + return -EBUSY; + t3_write_reg(adapter, A_SF_OP, V_CONT(cont) | V_BYTECNT(byte_cnt - 1)); + ret = t3_wait_op_done(adapter, A_SF_OP, F_BUSY, 0, SF_ATTEMPTS, 10); + if (!ret) + *valp = t3_read_reg(adapter, A_SF_DATA); + return ret; +} + +/** + * sf1_write - write data to the serial flash + * @adapter: the adapter + * @byte_cnt: number of bytes to write + * @cont: whether another operation will be chained + * @val: value to write + * + * Writes up to 4 bytes of data to the serial flash. The location of + * the write needs to be specified prior to calling this by issuing the + * appropriate commands to the serial flash. 
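To make the cont-chaining convention of sf1_write()/sf1_read() concrete, a minimal status-register read might look like the sketch below. It only makes sense inside this file, since both helpers are static, and the function name is invented.

static int example_read_flash_status(struct adapter *adapter, u32 *status)
{
	int ret;

	/*
	 * Issue the read-status opcode with cont=1 so the chip select
	 * stays asserted, then clock one status byte in with cont=0 to
	 * finish the transaction.  flash_wait_op() below polls this way.
	 */
	ret = sf1_write(adapter, 1, 1, SF_RD_STATUS);
	if (!ret)
		ret = sf1_read(adapter, 1, 0, status);
	return ret;
}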
+ */ +static int sf1_write(struct adapter *adapter, unsigned int byte_cnt, int cont, + u32 val) +{ + if (!byte_cnt || byte_cnt > 4) + return -EINVAL; + if (t3_read_reg(adapter, A_SF_OP) & F_BUSY) + return -EBUSY; + t3_write_reg(adapter, A_SF_DATA, val); + t3_write_reg(adapter, A_SF_OP, + V_CONT(cont) | V_BYTECNT(byte_cnt - 1) | V_OP(1)); + return t3_wait_op_done(adapter, A_SF_OP, F_BUSY, 0, SF_ATTEMPTS, 10); +} + +/** + * flash_wait_op - wait for a flash operation to complete + * @adapter: the adapter + * @attempts: max number of polls of the status register + * @delay: delay between polls in ms + * + * Wait for a flash operation to complete by polling the status register. + */ +static int flash_wait_op(struct adapter *adapter, int attempts, int delay) +{ + int ret; + u32 status; + + while (1) { + if ((ret = sf1_write(adapter, 1, 1, SF_RD_STATUS)) != 0 || + (ret = sf1_read(adapter, 1, 0, &status)) != 0) + return ret; + if (!(status & 1)) + return 0; + if (--attempts == 0) + return -EAGAIN; + if (delay) + msleep(delay); + } +} + +/** + * t3_read_flash - read words from serial flash + * @adapter: the adapter + * @addr: the start address for the read + * @nwords: how many 32-bit words to read + * @data: where to store the read data + * @byte_oriented: whether to store data as bytes or as words + * + * Read the specified number of 32-bit words from the serial flash. + * If @byte_oriented is set the read data is stored as a byte array + * (i.e., big-endian), otherwise as 32-bit words in the platform's + * natural endianness. + */ +static int t3_read_flash(struct adapter *adapter, unsigned int addr, + unsigned int nwords, u32 *data, int byte_oriented) +{ + int ret; + + if (addr + nwords * sizeof(u32) > SF_SIZE || (addr & 3)) + return -EINVAL; + + addr = swab32(addr) | SF_RD_DATA_FAST; + + if ((ret = sf1_write(adapter, 4, 1, addr)) != 0 || + (ret = sf1_read(adapter, 1, 1, data)) != 0) + return ret; + + for (; nwords; nwords--, data++) { + ret = sf1_read(adapter, 4, nwords > 1, data); + if (ret) + return ret; + if (byte_oriented) + *data = htonl(*data); + } + return 0; +} + +/** + * t3_write_flash - write up to a page of data to the serial flash + * @adapter: the adapter + * @addr: the start address to write + * @n: length of data to write + * @data: the data to write + * + * Writes up to a page of data (256 bytes) to the serial flash starting + * at the given address. + */ +static int t3_write_flash(struct adapter *adapter, unsigned int addr, + unsigned int n, const u8 *data) +{ + int ret; + u32 buf[64]; + unsigned int i, c, left, val, offset = addr & 0xff; + + if (addr + n > SF_SIZE || offset + n > 256) + return -EINVAL; + + val = swab32(addr) | SF_PROG_PAGE; + + if ((ret = sf1_write(adapter, 1, 0, SF_WR_ENABLE)) != 0 || + (ret = sf1_write(adapter, 4, 1, val)) != 0) + return ret; + + for (left = n; left; left -= c) { + c = min(left, 4U); + for (val = 0, i = 0; i < c; ++i) + val = (val << 8) + *data++; + + ret = sf1_write(adapter, c, c != left, val); + if (ret) + return ret; + } + if ((ret = flash_wait_op(adapter, 5, 1)) != 0) + return ret; + + /* Read the page to verify the write succeeded */ + ret = t3_read_flash(adapter, addr & ~0xff, ARRAY_SIZE(buf), buf, 1); + if (ret) + return ret; + + if (memcmp(data - n, (u8 *) buf + offset, n)) + return -EIO; + return 0; +} + +/** + * t3_get_tp_version - read the tp sram version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the protocol sram version from sram. 
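A small, illustrative caller of t3_get_tp_version() (defined just below); example_log_tp_version is a made-up name and the log text is arbitrary.

static void example_log_tp_version(struct adapter *adapter)
{
	u32 vers;

	if (t3_get_tp_version(adapter, &vers))
		return;

	/* G_TP_VERSION_MAJOR/MINOR unpack the version word. */
	CH_WARN(adapter, "TP microcode is %u.%u\n",
		G_TP_VERSION_MAJOR(vers), G_TP_VERSION_MINOR(vers));
}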
+ */ +int t3_get_tp_version(struct adapter *adapter, u32 *vers) +{ + int ret; + + /* Get version loaded in SRAM */ + t3_write_reg(adapter, A_TP_EMBED_OP_FIELD0, 0); + ret = t3_wait_op_done(adapter, A_TP_EMBED_OP_FIELD0, + 1, 1, 5, 1); + if (ret) + return ret; + + *vers = t3_read_reg(adapter, A_TP_EMBED_OP_FIELD1); + + return 0; +} + +/** + * t3_check_tpsram_version - read the tp sram version + * @adapter: the adapter + * + * Reads the protocol sram version from flash. + */ +int t3_check_tpsram_version(struct adapter *adapter) +{ + int ret; + u32 vers; + unsigned int major, minor; + + if (adapter->params.rev == T3_REV_A) + return 0; + + + ret = t3_get_tp_version(adapter, &vers); + if (ret) + return ret; + + major = G_TP_VERSION_MAJOR(vers); + minor = G_TP_VERSION_MINOR(vers); + + if (major == TP_VERSION_MAJOR && minor == TP_VERSION_MINOR) + return 0; + else { + CH_ERR(adapter, "found wrong TP version (%u.%u), " + "driver compiled for version %d.%d\n", major, minor, + TP_VERSION_MAJOR, TP_VERSION_MINOR); + } + return -EINVAL; +} + +/** + * t3_check_tpsram - check if provided protocol SRAM + * is compatible with this driver + * @adapter: the adapter + * @tp_sram: the firmware image to write + * @size: image size + * + * Checks if an adapter's tp sram is compatible with the driver. + * Returns 0 if the versions are compatible, a negative error otherwise. + */ +int t3_check_tpsram(struct adapter *adapter, const u8 *tp_sram, + unsigned int size) +{ + u32 csum; + unsigned int i; + const __be32 *p = (const __be32 *)tp_sram; + + /* Verify checksum */ + for (csum = 0, i = 0; i < size / sizeof(csum); i++) + csum += ntohl(p[i]); + if (csum != 0xffffffff) { + CH_ERR(adapter, "corrupted protocol SRAM image, checksum %u\n", + csum); + return -EINVAL; + } + + return 0; +} + +enum fw_version_type { + FW_VERSION_N3, + FW_VERSION_T3 +}; + +/** + * t3_get_fw_version - read the firmware version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the FW version from flash. + */ +int t3_get_fw_version(struct adapter *adapter, u32 *vers) +{ + return t3_read_flash(adapter, FW_VERS_ADDR, 1, vers, 0); +} + +/** + * t3_check_fw_version - check if the FW is compatible with this driver + * @adapter: the adapter + * + * Checks if an adapter's FW is compatible with the driver. Returns 0 + * if the versions are compatible, a negative error otherwise. + */ +int t3_check_fw_version(struct adapter *adapter) +{ + int ret; + u32 vers; + unsigned int type, major, minor; + + ret = t3_get_fw_version(adapter, &vers); + if (ret) + return ret; + + type = G_FW_VERSION_TYPE(vers); + major = G_FW_VERSION_MAJOR(vers); + minor = G_FW_VERSION_MINOR(vers); + + if (type == FW_VERSION_T3 && major == FW_VERSION_MAJOR && + minor == FW_VERSION_MINOR) + return 0; + else if (major != FW_VERSION_MAJOR || minor < FW_VERSION_MINOR) + CH_WARN(adapter, "found old FW minor version(%u.%u), " + "driver compiled for version %u.%u\n", major, minor, + FW_VERSION_MAJOR, FW_VERSION_MINOR); + else { + CH_WARN(adapter, "found newer FW version(%u.%u), " + "driver compiled for version %u.%u\n", major, minor, + FW_VERSION_MAJOR, FW_VERSION_MINOR); + return 0; + } + return -EINVAL; +} + +/** + * t3_flash_erase_sectors - erase a range of flash sectors + * @adapter: the adapter + * @start: the first sector to erase + * @end: the last sector to erase + * + * Erases the sectors in the given range. 
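The image checksum convention used by t3_check_tpsram() above and t3_load_fw() below can be expressed as a tiny helper; this is a sketch with an invented name, not code from the driver.

/*
 * Returns non-zero when the big-endian, word-wise sum of the image is
 * all-ones, which is the integrity rule t3_check_tpsram() and t3_load_fw()
 * enforce.  @size is assumed to be a multiple of 4.
 */
static int example_image_csum_ok(const u8 *image, unsigned int size)
{
	const __be32 *p = (const __be32 *)image;
	u32 csum = 0;
	unsigned int i;

	for (i = 0; i < size / sizeof(csum); i++)
		csum += ntohl(p[i]);
	return csum == 0xffffffff;
}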
+ */ +static int t3_flash_erase_sectors(struct adapter *adapter, int start, int end) +{ + while (start <= end) { + int ret; + + if ((ret = sf1_write(adapter, 1, 0, SF_WR_ENABLE)) != 0 || + (ret = sf1_write(adapter, 4, 0, + SF_ERASE_SECTOR | (start << 8))) != 0 || + (ret = flash_wait_op(adapter, 5, 500)) != 0) + return ret; + start++; + } + return 0; +} + +/** + * t3_load_fw - download firmware + * @adapter: the adapter + * @fw_data: the firmware image to write + * @size: image size + * + * Write the supplied firmware image to the card's serial flash. + * The FW image has the following sections: @size - 8 bytes of code and + * data, followed by 4 bytes of FW version, followed by the 32-bit + * 1's complement checksum of the whole image. + */ +int t3_load_fw(struct adapter *adapter, const u8 *fw_data, unsigned int size) +{ + u32 csum; + unsigned int i; + const __be32 *p = (const __be32 *)fw_data; + int ret, addr, fw_sector = FW_FLASH_BOOT_ADDR >> 16; + + if ((size & 3) || size < FW_MIN_SIZE) + return -EINVAL; + if (size > FW_VERS_ADDR + 8 - FW_FLASH_BOOT_ADDR) + return -EFBIG; + + for (csum = 0, i = 0; i < size / sizeof(csum); i++) + csum += ntohl(p[i]); + if (csum != 0xffffffff) { + CH_ERR(adapter, "corrupted firmware image, checksum %u\n", + csum); + return -EINVAL; + } + + ret = t3_flash_erase_sectors(adapter, fw_sector, fw_sector); + if (ret) + goto out; + + size -= 8; /* trim off version and checksum */ + for (addr = FW_FLASH_BOOT_ADDR; size;) { + unsigned int chunk_size = min(size, 256U); + + ret = t3_write_flash(adapter, addr, chunk_size, fw_data); + if (ret) + goto out; + + addr += chunk_size; + fw_data += chunk_size; + size -= chunk_size; + } + + ret = t3_write_flash(adapter, FW_VERS_ADDR, 4, fw_data); +out: + if (ret) + CH_ERR(adapter, "firmware download failed, error %d\n", ret); + return ret; +} + +#define CIM_CTL_BASE 0x2000 + +/** + * t3_cim_ctl_blk_read - read a block from CIM control region + * + * @adap: the adapter + * @addr: the start address within the CIM control region + * @n: number of words to read + * @valp: where to store the result + * + * Reads a block of 4-byte words from the CIM control region. 
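A hedged usage sketch for t3_cim_ctl_blk_read(): read a few words from the start of the CIM control region and log them. The helper name and the choice of offset/length are illustrative.

static void example_dump_cim_ctl(struct adapter *adap)
{
	unsigned int buf[4];

	/* Read the first four 32-bit words of the CIM control region. */
	if (t3_cim_ctl_blk_read(adap, 0, ARRAY_SIZE(buf), buf))
		return;

	CH_WARN(adap, "CIM CTL 0x0: %08x %08x %08x %08x\n",
		buf[0], buf[1], buf[2], buf[3]);
}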
+ */ +int t3_cim_ctl_blk_read(struct adapter *adap, unsigned int addr, + unsigned int n, unsigned int *valp) +{ + int ret = 0; + + if (t3_read_reg(adap, A_CIM_HOST_ACC_CTRL) & F_HOSTBUSY) + return -EBUSY; + + for ( ; !ret && n--; addr += 4) { + t3_write_reg(adap, A_CIM_HOST_ACC_CTRL, CIM_CTL_BASE + addr); + ret = t3_wait_op_done(adap, A_CIM_HOST_ACC_CTRL, F_HOSTBUSY, + 0, 5, 2); + if (!ret) + *valp++ = t3_read_reg(adap, A_CIM_HOST_ACC_DATA); + } + return ret; +} + +static void t3_gate_rx_traffic(struct cmac *mac, u32 *rx_cfg, + u32 *rx_hash_high, u32 *rx_hash_low) +{ + /* stop Rx unicast traffic */ + t3_mac_disable_exact_filters(mac); + + /* stop broadcast, multicast, promiscuous mode traffic */ + *rx_cfg = t3_read_reg(mac->adapter, A_XGM_RX_CFG); + t3_set_reg_field(mac->adapter, A_XGM_RX_CFG, + F_ENHASHMCAST | F_DISBCAST | F_COPYALLFRAMES, + F_DISBCAST); + + *rx_hash_high = t3_read_reg(mac->adapter, A_XGM_RX_HASH_HIGH); + t3_write_reg(mac->adapter, A_XGM_RX_HASH_HIGH, 0); + + *rx_hash_low = t3_read_reg(mac->adapter, A_XGM_RX_HASH_LOW); + t3_write_reg(mac->adapter, A_XGM_RX_HASH_LOW, 0); + + /* Leave time to drain max RX fifo */ + msleep(1); +} + +static void t3_open_rx_traffic(struct cmac *mac, u32 rx_cfg, + u32 rx_hash_high, u32 rx_hash_low) +{ + t3_mac_enable_exact_filters(mac); + t3_set_reg_field(mac->adapter, A_XGM_RX_CFG, + F_ENHASHMCAST | F_DISBCAST | F_COPYALLFRAMES, + rx_cfg); + t3_write_reg(mac->adapter, A_XGM_RX_HASH_HIGH, rx_hash_high); + t3_write_reg(mac->adapter, A_XGM_RX_HASH_LOW, rx_hash_low); +} + +/** + * t3_link_changed - handle interface link changes + * @adapter: the adapter + * @port_id: the port index that changed link state + * + * Called when a port's link settings change to propagate the new values + * to the associated PHY and MAC. After performing the common tasks it + * invokes an OS-specific handler. + */ +void t3_link_changed(struct adapter *adapter, int port_id) +{ + int link_ok, speed, duplex, fc; + struct port_info *pi = adap2pinfo(adapter, port_id); + struct cphy *phy = &pi->phy; + struct cmac *mac = &pi->mac; + struct link_config *lc = &pi->link_config; + + phy->ops->get_link_status(phy, &link_ok, &speed, &duplex, &fc); + + if (!lc->link_ok && link_ok) { + u32 rx_cfg, rx_hash_high, rx_hash_low; + u32 status; + + t3_xgm_intr_enable(adapter, port_id); + t3_gate_rx_traffic(mac, &rx_cfg, &rx_hash_high, &rx_hash_low); + t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, 0); + t3_mac_enable(mac, MAC_DIRECTION_RX); + + status = t3_read_reg(adapter, A_XGM_INT_STATUS + mac->offset); + if (status & F_LINKFAULTCHANGE) { + mac->stats.link_faults++; + pi->link_fault = 1; + } + t3_open_rx_traffic(mac, rx_cfg, rx_hash_high, rx_hash_low); + } + + if (lc->requested_fc & PAUSE_AUTONEG) + fc &= lc->requested_fc; + else + fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); + + if (link_ok == lc->link_ok && speed == lc->speed && + duplex == lc->duplex && fc == lc->fc) + return; /* nothing changed */ + + if (link_ok != lc->link_ok && adapter->params.rev > 0 && + uses_xaui(adapter)) { + if (link_ok) + t3b_pcs_reset(mac); + t3_write_reg(adapter, A_XGM_XAUI_ACT_CTRL + mac->offset, + link_ok ? F_TXACTENABLE | F_RXEN : 0); + } + lc->link_ok = link_ok; + lc->speed = speed < 0 ? SPEED_INVALID : speed; + lc->duplex = duplex < 0 ? DUPLEX_INVALID : duplex; + + if (link_ok && speed >= 0 && lc->autoneg == AUTONEG_ENABLE) { + /* Set MAC speed, duplex, and flow control to match PHY. 
*/ + t3_mac_set_speed_duplex_fc(mac, speed, duplex, fc); + lc->fc = fc; + } + + t3_os_link_changed(adapter, port_id, link_ok && !pi->link_fault, + speed, duplex, fc); +} + +void t3_link_fault(struct adapter *adapter, int port_id) +{ + struct port_info *pi = adap2pinfo(adapter, port_id); + struct cmac *mac = &pi->mac; + struct cphy *phy = &pi->phy; + struct link_config *lc = &pi->link_config; + int link_ok, speed, duplex, fc, link_fault; + u32 rx_cfg, rx_hash_high, rx_hash_low; + + t3_gate_rx_traffic(mac, &rx_cfg, &rx_hash_high, &rx_hash_low); + + if (adapter->params.rev > 0 && uses_xaui(adapter)) + t3_write_reg(adapter, A_XGM_XAUI_ACT_CTRL + mac->offset, 0); + + t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, 0); + t3_mac_enable(mac, MAC_DIRECTION_RX); + + t3_open_rx_traffic(mac, rx_cfg, rx_hash_high, rx_hash_low); + + link_fault = t3_read_reg(adapter, + A_XGM_INT_STATUS + mac->offset); + link_fault &= F_LINKFAULTCHANGE; + + link_ok = lc->link_ok; + speed = lc->speed; + duplex = lc->duplex; + fc = lc->fc; + + phy->ops->get_link_status(phy, &link_ok, &speed, &duplex, &fc); + + if (link_fault) { + lc->link_ok = 0; + lc->speed = SPEED_INVALID; + lc->duplex = DUPLEX_INVALID; + + t3_os_link_fault(adapter, port_id, 0); + + /* Account link faults only when the phy reports a link up */ + if (link_ok) + mac->stats.link_faults++; + } else { + if (link_ok) + t3_write_reg(adapter, A_XGM_XAUI_ACT_CTRL + mac->offset, + F_TXACTENABLE | F_RXEN); + + pi->link_fault = 0; + lc->link_ok = (unsigned char)link_ok; + lc->speed = speed < 0 ? SPEED_INVALID : speed; + lc->duplex = duplex < 0 ? DUPLEX_INVALID : duplex; + t3_os_link_fault(adapter, port_id, link_ok); + } +} + +/** + * t3_link_start - apply link configuration to MAC/PHY + * @phy: the PHY to setup + * @mac: the MAC to setup + * @lc: the requested link configuration + * + * Set up a port's MAC and PHY according to a desired link configuration. + * - If the PHY can auto-negotiate first decide what to advertise, then + * enable/disable auto-negotiation as desired, and reset. + * - If the PHY does not auto-negotiate just reset it. + * - If auto-negotiation is off set the MAC to the proper speed/duplex/FC, + * otherwise do it later based on the outcome of auto-negotiation. + */ +int t3_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc) +{ + unsigned int fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); + + lc->link_ok = 0; + if (lc->supported & SUPPORTED_Autoneg) { + lc->advertising &= ~(ADVERTISED_Asym_Pause | ADVERTISED_Pause); + if (fc) { + lc->advertising |= ADVERTISED_Asym_Pause; + if (fc & PAUSE_RX) + lc->advertising |= ADVERTISED_Pause; + } + phy->ops->advertise(phy, lc->advertising); + + if (lc->autoneg == AUTONEG_DISABLE) { + lc->speed = lc->requested_speed; + lc->duplex = lc->requested_duplex; + lc->fc = (unsigned char)fc; + t3_mac_set_speed_duplex_fc(mac, lc->speed, lc->duplex, + fc); + /* Also disables autoneg */ + phy->ops->set_speed_duplex(phy, lc->speed, lc->duplex); + } else + phy->ops->autoneg_enable(phy); + } else { + t3_mac_set_speed_duplex_fc(mac, -1, -1, fc); + lc->fc = (unsigned char)fc; + phy->ops->reset(phy, 0); + } + return 0; +} + +/** + * t3_set_vlan_accel - control HW VLAN extraction + * @adapter: the adapter + * @ports: bitmap of adapter ports to operate on + * @on: enable (1) or disable (0) HW VLAN extraction + * + * Enables or disables HW extraction of VLAN tags for the given port. 
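Since @ports is a bitmap, toggling VLAN extraction per port looks like the sketch below (assuming a two-port adapter; the function name is hypothetical).

static void example_toggle_vlan_extraction(struct adapter *adapter)
{
	/* Enable HW VLAN tag stripping on ports 0 and 1 ... */
	t3_set_vlan_accel(adapter, (1 << 0) | (1 << 1), 1);

	/* ... and later disable it again on port 1 only. */
	t3_set_vlan_accel(adapter, 1 << 1, 0);
}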
+ */ +void t3_set_vlan_accel(struct adapter *adapter, unsigned int ports, int on) +{ + t3_set_reg_field(adapter, A_TP_OUT_CONFIG, + ports << S_VLANEXTRACTIONENABLE, + on ? (ports << S_VLANEXTRACTIONENABLE) : 0); +} + +struct intr_info { + unsigned int mask; /* bits to check in interrupt status */ + const char *msg; /* message to print or NULL */ + short stat_idx; /* stat counter to increment or -1 */ + unsigned short fatal; /* whether the condition reported is fatal */ +}; + +/** + * t3_handle_intr_status - table driven interrupt handler + * @adapter: the adapter that generated the interrupt + * @reg: the interrupt status register to process + * @mask: a mask to apply to the interrupt status + * @acts: table of interrupt actions + * @stats: statistics counters tracking interrupt occurrences + * + * A table driven interrupt handler that applies a set of masks to an + * interrupt status word and performs the corresponding actions if the + * interrupts described by the mask have occurred. The actions include + * optionally printing a warning or alert message, and optionally + * incrementing a stat counter. The table is terminated by an entry + * specifying mask 0. Returns the number of fatal interrupt conditions. + */ +static int t3_handle_intr_status(struct adapter *adapter, unsigned int reg, + unsigned int mask, + const struct intr_info *acts, + unsigned long *stats) +{ + int fatal = 0; + unsigned int status = t3_read_reg(adapter, reg) & mask; + + for (; acts->mask; ++acts) { + if (!(status & acts->mask)) + continue; + if (acts->fatal) { + fatal++; + CH_ALERT(adapter, "%s (0x%x)\n", + acts->msg, status & acts->mask); + status &= ~acts->mask; + } else if (acts->msg) + CH_WARN(adapter, "%s (0x%x)\n", + acts->msg, status & acts->mask); + if (acts->stat_idx >= 0) + stats[acts->stat_idx]++; + } + if (status) /* clear processed interrupts */ + t3_write_reg(adapter, reg, status); + return fatal; +} + +#define SGE_INTR_MASK (F_RSPQDISABLED | \ + F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR | \ + F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \ + F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \ + V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \ + F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \ + F_HIRCQPARITYERROR | F_LOPRIORITYDBFULL | \ + F_HIPRIORITYDBFULL | F_LOPRIORITYDBEMPTY | \ + F_HIPRIORITYDBEMPTY | F_HIPIODRBDROPERR | \ + F_LOPIODRBDROPERR) +#define MC5_INTR_MASK (F_PARITYERR | F_ACTRGNFULL | F_UNKNOWNCMD | \ + F_REQQPARERR | F_DISPQPARERR | F_DELACTEMPTY | \ + F_NFASRCHFAIL) +#define MC7_INTR_MASK (F_AE | F_UE | F_CE | V_PE(M_PE)) +#define XGM_INTR_MASK (V_TXFIFO_PRTY_ERR(M_TXFIFO_PRTY_ERR) | \ + V_RXFIFO_PRTY_ERR(M_RXFIFO_PRTY_ERR) | \ + F_TXFIFO_UNDERRUN) +#define PCIX_INTR_MASK (F_MSTDETPARERR | F_SIGTARABT | F_RCVTARABT | \ + F_RCVMSTABT | F_SIGSYSERR | F_DETPARERR | \ + F_SPLCMPDIS | F_UNXSPLCMP | F_RCVSPLCMPERR | \ + F_DETCORECCERR | F_DETUNCECCERR | F_PIOPARERR | \ + V_WFPARERR(M_WFPARERR) | V_RFPARERR(M_RFPARERR) | \ + V_CFPARERR(M_CFPARERR) /* | V_MSIXPARERR(M_MSIXPARERR) */) +#define PCIE_INTR_MASK (F_UNXSPLCPLERRR | F_UNXSPLCPLERRC | F_PCIE_PIOPARERR |\ + F_PCIE_WFPARERR | F_PCIE_RFPARERR | F_PCIE_CFPARERR | \ + /* V_PCIE_MSIXPARERR(M_PCIE_MSIXPARERR) | */ \ + F_RETRYBUFPARERR | F_RETRYLUTPARERR | F_RXPARERR | \ + F_TXPARERR | V_BISTERR(M_BISTERR)) +#define ULPRX_INTR_MASK (F_PARERRDATA | F_PARERRPCMD | F_ARBPF1PERR | \ + F_ARBPF0PERR | F_ARBFPERR | F_PCMDMUXPERR | \ + F_DATASELFRAMEERR1 | F_DATASELFRAMEERR0) +#define ULPTX_INTR_MASK 0xfc +#define 
CPLSW_INTR_MASK (F_CIM_OP_MAP_PERR | F_TP_FRAMING_ERROR | \ + F_SGE_FRAMING_ERROR | F_CIM_FRAMING_ERROR | \ + F_ZERO_SWITCH_ERROR) +#define CIM_INTR_MASK (F_BLKWRPLINT | F_BLKRDPLINT | F_BLKWRCTLINT | \ + F_BLKRDCTLINT | F_BLKWRFLASHINT | F_BLKRDFLASHINT | \ + F_SGLWRFLASHINT | F_WRBLKFLASHINT | F_BLKWRBOOTINT | \ + F_FLASHRANGEINT | F_SDRAMRANGEINT | F_RSVDSPACEINT | \ + F_DRAMPARERR | F_ICACHEPARERR | F_DCACHEPARERR | \ + F_OBQSGEPARERR | F_OBQULPHIPARERR | F_OBQULPLOPARERR | \ + F_IBQSGELOPARERR | F_IBQSGEHIPARERR | F_IBQULPPARERR | \ + F_IBQTPPARERR | F_ITAGPARERR | F_DTAGPARERR) +#define PMTX_INTR_MASK (F_ZERO_C_CMD_ERROR | ICSPI_FRM_ERR | OESPI_FRM_ERR | \ + V_ICSPI_PAR_ERROR(M_ICSPI_PAR_ERROR) | \ + V_OESPI_PAR_ERROR(M_OESPI_PAR_ERROR)) +#define PMRX_INTR_MASK (F_ZERO_E_CMD_ERROR | IESPI_FRM_ERR | OCSPI_FRM_ERR | \ + V_IESPI_PAR_ERROR(M_IESPI_PAR_ERROR) | \ + V_OCSPI_PAR_ERROR(M_OCSPI_PAR_ERROR)) +#define MPS_INTR_MASK (V_TX0TPPARERRENB(M_TX0TPPARERRENB) | \ + V_TX1TPPARERRENB(M_TX1TPPARERRENB) | \ + V_RXTPPARERRENB(M_RXTPPARERRENB) | \ + V_MCAPARERRENB(M_MCAPARERRENB)) +#define XGM_EXTRA_INTR_MASK (F_LINKFAULTCHANGE) +#define PL_INTR_MASK (F_T3DBG | F_XGMAC0_0 | F_XGMAC0_1 | F_MC5A | F_PM1_TX | \ + F_PM1_RX | F_ULP2_TX | F_ULP2_RX | F_TP1 | F_CIM | \ + F_MC7_CM | F_MC7_PMTX | F_MC7_PMRX | F_SGE3 | F_PCIM0 | \ + F_MPS0 | F_CPL_SWITCH) +/* + * Interrupt handler for the PCIX1 module. + */ +static void pci_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pcix1_intr_info[] = { + {F_MSTDETPARERR, "PCI master detected parity error", -1, 1}, + {F_SIGTARABT, "PCI signaled target abort", -1, 1}, + {F_RCVTARABT, "PCI received target abort", -1, 1}, + {F_RCVMSTABT, "PCI received master abort", -1, 1}, + {F_SIGSYSERR, "PCI signaled system error", -1, 1}, + {F_DETPARERR, "PCI detected parity error", -1, 1}, + {F_SPLCMPDIS, "PCI split completion discarded", -1, 1}, + {F_UNXSPLCMP, "PCI unexpected split completion error", -1, 1}, + {F_RCVSPLCMPERR, "PCI received split completion error", -1, + 1}, + {F_DETCORECCERR, "PCI correctable ECC error", + STAT_PCI_CORR_ECC, 0}, + {F_DETUNCECCERR, "PCI uncorrectable ECC error", -1, 1}, + {F_PIOPARERR, "PCI PIO FIFO parity error", -1, 1}, + {V_WFPARERR(M_WFPARERR), "PCI write FIFO parity error", -1, + 1}, + {V_RFPARERR(M_RFPARERR), "PCI read FIFO parity error", -1, + 1}, + {V_CFPARERR(M_CFPARERR), "PCI command FIFO parity error", -1, + 1}, + {V_MSIXPARERR(M_MSIXPARERR), "PCI MSI-X table/PBA parity " + "error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_PCIX_INT_CAUSE, PCIX_INTR_MASK, + pcix1_intr_info, adapter->irq_stats)) + t3_fatal_err(adapter); +} + +/* + * Interrupt handler for the PCIE module. 
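The table-driven scheme of t3_handle_intr_status() generalizes to any block: declare a NULL-terminated intr_info table and pass the block's cause register. The sketch below uses placeholder cause bits and reuses A_PL_INT_CAUSE0 purely for illustration; a real handler points at its own block's INT_CAUSE register and bit definitions.

static void example_block_intr_handler(struct adapter *adapter)
{
	/* Entry format: { mask, message, stat_idx, fatal }. */
	static const struct intr_info example_intr_info[] = {
		{ 0x1, "example correctable event", -1, 0 },
		{ 0x2, "example fatal parity error", -1, 1 },
		{ 0 }
	};

	if (t3_handle_intr_status(adapter, A_PL_INT_CAUSE0, 0x3,
				  example_intr_info, NULL))
		t3_fatal_err(adapter);
}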
+ */ +static void pcie_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pcie_intr_info[] = { + {F_PEXERR, "PCI PEX error", -1, 1}, + {F_UNXSPLCPLERRR, + "PCI unexpected split completion DMA read error", -1, 1}, + {F_UNXSPLCPLERRC, + "PCI unexpected split completion DMA command error", -1, 1}, + {F_PCIE_PIOPARERR, "PCI PIO FIFO parity error", -1, 1}, + {F_PCIE_WFPARERR, "PCI write FIFO parity error", -1, 1}, + {F_PCIE_RFPARERR, "PCI read FIFO parity error", -1, 1}, + {F_PCIE_CFPARERR, "PCI command FIFO parity error", -1, 1}, + {V_PCIE_MSIXPARERR(M_PCIE_MSIXPARERR), + "PCI MSI-X table/PBA parity error", -1, 1}, + {F_RETRYBUFPARERR, "PCI retry buffer parity error", -1, 1}, + {F_RETRYLUTPARERR, "PCI retry LUT parity error", -1, 1}, + {F_RXPARERR, "PCI Rx parity error", -1, 1}, + {F_TXPARERR, "PCI Tx parity error", -1, 1}, + {V_BISTERR(M_BISTERR), "PCI BIST error", -1, 1}, + {0} + }; + + if (t3_read_reg(adapter, A_PCIE_INT_CAUSE) & F_PEXERR) + CH_ALERT(adapter, "PEX error code 0x%x\n", + t3_read_reg(adapter, A_PCIE_PEX_ERR)); + + if (t3_handle_intr_status(adapter, A_PCIE_INT_CAUSE, PCIE_INTR_MASK, + pcie_intr_info, adapter->irq_stats)) + t3_fatal_err(adapter); +} + +/* + * TP interrupt handler. + */ +static void tp_intr_handler(struct adapter *adapter) +{ + static const struct intr_info tp_intr_info[] = { + {0xffffff, "TP parity error", -1, 1}, + {0x1000000, "TP out of Rx pages", -1, 1}, + {0x2000000, "TP out of Tx pages", -1, 1}, + {0} + }; + + static const struct intr_info tp_intr_info_t3c[] = { + {0x1fffffff, "TP parity error", -1, 1}, + {F_FLMRXFLSTEMPTY, "TP out of Rx pages", -1, 1}, + {F_FLMTXFLSTEMPTY, "TP out of Tx pages", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_TP_INT_CAUSE, 0xffffffff, + adapter->params.rev < T3_REV_C ? + tp_intr_info : tp_intr_info_t3c, NULL)) + t3_fatal_err(adapter); +} + +/* + * CIM interrupt handler. 
+ */ +static void cim_intr_handler(struct adapter *adapter) +{ + static const struct intr_info cim_intr_info[] = { + {F_RSVDSPACEINT, "CIM reserved space write", -1, 1}, + {F_SDRAMRANGEINT, "CIM SDRAM address out of range", -1, 1}, + {F_FLASHRANGEINT, "CIM flash address out of range", -1, 1}, + {F_BLKWRBOOTINT, "CIM block write to boot space", -1, 1}, + {F_WRBLKFLASHINT, "CIM write to cached flash space", -1, 1}, + {F_SGLWRFLASHINT, "CIM single write to flash space", -1, 1}, + {F_BLKRDFLASHINT, "CIM block read from flash space", -1, 1}, + {F_BLKWRFLASHINT, "CIM block write to flash space", -1, 1}, + {F_BLKRDCTLINT, "CIM block read from CTL space", -1, 1}, + {F_BLKWRCTLINT, "CIM block write to CTL space", -1, 1}, + {F_BLKRDPLINT, "CIM block read from PL space", -1, 1}, + {F_BLKWRPLINT, "CIM block write to PL space", -1, 1}, + {F_DRAMPARERR, "CIM DRAM parity error", -1, 1}, + {F_ICACHEPARERR, "CIM icache parity error", -1, 1}, + {F_DCACHEPARERR, "CIM dcache parity error", -1, 1}, + {F_OBQSGEPARERR, "CIM OBQ SGE parity error", -1, 1}, + {F_OBQULPHIPARERR, "CIM OBQ ULPHI parity error", -1, 1}, + {F_OBQULPLOPARERR, "CIM OBQ ULPLO parity error", -1, 1}, + {F_IBQSGELOPARERR, "CIM IBQ SGELO parity error", -1, 1}, + {F_IBQSGEHIPARERR, "CIM IBQ SGEHI parity error", -1, 1}, + {F_IBQULPPARERR, "CIM IBQ ULP parity error", -1, 1}, + {F_IBQTPPARERR, "CIM IBQ TP parity error", -1, 1}, + {F_ITAGPARERR, "CIM itag parity error", -1, 1}, + {F_DTAGPARERR, "CIM dtag parity error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_CIM_HOST_INT_CAUSE, 0xffffffff, + cim_intr_info, NULL)) + t3_fatal_err(adapter); +} + +/* + * ULP RX interrupt handler. + */ +static void ulprx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info ulprx_intr_info[] = { + {F_PARERRDATA, "ULP RX data parity error", -1, 1}, + {F_PARERRPCMD, "ULP RX command parity error", -1, 1}, + {F_ARBPF1PERR, "ULP RX ArbPF1 parity error", -1, 1}, + {F_ARBPF0PERR, "ULP RX ArbPF0 parity error", -1, 1}, + {F_ARBFPERR, "ULP RX ArbF parity error", -1, 1}, + {F_PCMDMUXPERR, "ULP RX PCMDMUX parity error", -1, 1}, + {F_DATASELFRAMEERR1, "ULP RX frame error", -1, 1}, + {F_DATASELFRAMEERR0, "ULP RX frame error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_ULPRX_INT_CAUSE, 0xffffffff, + ulprx_intr_info, NULL)) + t3_fatal_err(adapter); +} + +/* + * ULP TX interrupt handler. + */ +static void ulptx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info ulptx_intr_info[] = { + {F_PBL_BOUND_ERR_CH0, "ULP TX channel 0 PBL out of bounds", + STAT_ULP_CH0_PBL_OOB, 0}, + {F_PBL_BOUND_ERR_CH1, "ULP TX channel 1 PBL out of bounds", + STAT_ULP_CH1_PBL_OOB, 0}, + {0xfc, "ULP TX parity error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_ULPTX_INT_CAUSE, 0xffffffff, + ulptx_intr_info, adapter->irq_stats)) + t3_fatal_err(adapter); +} + +#define ICSPI_FRM_ERR (F_ICSPI0_FIFO2X_RX_FRAMING_ERROR | \ + F_ICSPI1_FIFO2X_RX_FRAMING_ERROR | F_ICSPI0_RX_FRAMING_ERROR | \ + F_ICSPI1_RX_FRAMING_ERROR | F_ICSPI0_TX_FRAMING_ERROR | \ + F_ICSPI1_TX_FRAMING_ERROR) +#define OESPI_FRM_ERR (F_OESPI0_RX_FRAMING_ERROR | \ + F_OESPI1_RX_FRAMING_ERROR | F_OESPI0_TX_FRAMING_ERROR | \ + F_OESPI1_TX_FRAMING_ERROR | F_OESPI0_OFIFO2X_TX_FRAMING_ERROR | \ + F_OESPI1_OFIFO2X_TX_FRAMING_ERROR) + +/* + * PM TX interrupt handler. 
+ */ +static void pmtx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pmtx_intr_info[] = { + {F_ZERO_C_CMD_ERROR, "PMTX 0-length pcmd", -1, 1}, + {ICSPI_FRM_ERR, "PMTX ispi framing error", -1, 1}, + {OESPI_FRM_ERR, "PMTX ospi framing error", -1, 1}, + {V_ICSPI_PAR_ERROR(M_ICSPI_PAR_ERROR), + "PMTX ispi parity error", -1, 1}, + {V_OESPI_PAR_ERROR(M_OESPI_PAR_ERROR), + "PMTX ospi parity error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_PM1_TX_INT_CAUSE, 0xffffffff, + pmtx_intr_info, NULL)) + t3_fatal_err(adapter); +} + +#define IESPI_FRM_ERR (F_IESPI0_FIFO2X_RX_FRAMING_ERROR | \ + F_IESPI1_FIFO2X_RX_FRAMING_ERROR | F_IESPI0_RX_FRAMING_ERROR | \ + F_IESPI1_RX_FRAMING_ERROR | F_IESPI0_TX_FRAMING_ERROR | \ + F_IESPI1_TX_FRAMING_ERROR) +#define OCSPI_FRM_ERR (F_OCSPI0_RX_FRAMING_ERROR | \ + F_OCSPI1_RX_FRAMING_ERROR | F_OCSPI0_TX_FRAMING_ERROR | \ + F_OCSPI1_TX_FRAMING_ERROR | F_OCSPI0_OFIFO2X_TX_FRAMING_ERROR | \ + F_OCSPI1_OFIFO2X_TX_FRAMING_ERROR) + +/* + * PM RX interrupt handler. + */ +static void pmrx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pmrx_intr_info[] = { + {F_ZERO_E_CMD_ERROR, "PMRX 0-length pcmd", -1, 1}, + {IESPI_FRM_ERR, "PMRX ispi framing error", -1, 1}, + {OCSPI_FRM_ERR, "PMRX ospi framing error", -1, 1}, + {V_IESPI_PAR_ERROR(M_IESPI_PAR_ERROR), + "PMRX ispi parity error", -1, 1}, + {V_OCSPI_PAR_ERROR(M_OCSPI_PAR_ERROR), + "PMRX ospi parity error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_PM1_RX_INT_CAUSE, 0xffffffff, + pmrx_intr_info, NULL)) + t3_fatal_err(adapter); +} + +/* + * CPL switch interrupt handler. + */ +static void cplsw_intr_handler(struct adapter *adapter) +{ + static const struct intr_info cplsw_intr_info[] = { + {F_CIM_OP_MAP_PERR, "CPL switch CIM parity error", -1, 1}, + {F_CIM_OVFL_ERROR, "CPL switch CIM overflow", -1, 1}, + {F_TP_FRAMING_ERROR, "CPL switch TP framing error", -1, 1}, + {F_SGE_FRAMING_ERROR, "CPL switch SGE framing error", -1, 1}, + {F_CIM_FRAMING_ERROR, "CPL switch CIM framing error", -1, 1}, + {F_ZERO_SWITCH_ERROR, "CPL switch no-switch error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_CPL_INTR_CAUSE, 0xffffffff, + cplsw_intr_info, NULL)) + t3_fatal_err(adapter); +} + +/* + * MPS interrupt handler. + */ +static void mps_intr_handler(struct adapter *adapter) +{ + static const struct intr_info mps_intr_info[] = { + {0x1ff, "MPS parity error", -1, 1}, + {0} + }; + + if (t3_handle_intr_status(adapter, A_MPS_INT_CAUSE, 0xffffffff, + mps_intr_info, NULL)) + t3_fatal_err(adapter); +} + +#define MC7_INTR_FATAL (F_UE | V_PE(M_PE) | F_AE) + +/* + * MC7 interrupt handler. 
+ */ +static void mc7_intr_handler(struct mc7 *mc7) +{ + struct adapter *adapter = mc7->adapter; + u32 cause = t3_read_reg(adapter, mc7->offset + A_MC7_INT_CAUSE); + + if (cause & F_CE) { + mc7->stats.corr_err++; + CH_WARN(adapter, "%s MC7 correctable error at addr 0x%x, " + "data 0x%x 0x%x 0x%x\n", mc7->name, + t3_read_reg(adapter, mc7->offset + A_MC7_CE_ADDR), + t3_read_reg(adapter, mc7->offset + A_MC7_CE_DATA0), + t3_read_reg(adapter, mc7->offset + A_MC7_CE_DATA1), + t3_read_reg(adapter, mc7->offset + A_MC7_CE_DATA2)); + } + + if (cause & F_UE) { + mc7->stats.uncorr_err++; + CH_ALERT(adapter, "%s MC7 uncorrectable error at addr 0x%x, " + "data 0x%x 0x%x 0x%x\n", mc7->name, + t3_read_reg(adapter, mc7->offset + A_MC7_UE_ADDR), + t3_read_reg(adapter, mc7->offset + A_MC7_UE_DATA0), + t3_read_reg(adapter, mc7->offset + A_MC7_UE_DATA1), + t3_read_reg(adapter, mc7->offset + A_MC7_UE_DATA2)); + } + + if (G_PE(cause)) { + mc7->stats.parity_err++; + CH_ALERT(adapter, "%s MC7 parity error 0x%x\n", + mc7->name, G_PE(cause)); + } + + if (cause & F_AE) { + u32 addr = 0; + + if (adapter->params.rev > 0) + addr = t3_read_reg(adapter, + mc7->offset + A_MC7_ERR_ADDR); + mc7->stats.addr_err++; + CH_ALERT(adapter, "%s MC7 address error: 0x%x\n", + mc7->name, addr); + } + + if (cause & MC7_INTR_FATAL) + t3_fatal_err(adapter); + + t3_write_reg(adapter, mc7->offset + A_MC7_INT_CAUSE, cause); +} + +#define XGM_INTR_FATAL (V_TXFIFO_PRTY_ERR(M_TXFIFO_PRTY_ERR) | \ + V_RXFIFO_PRTY_ERR(M_RXFIFO_PRTY_ERR)) +/* + * XGMAC interrupt handler. + */ +static int mac_intr_handler(struct adapter *adap, unsigned int idx) +{ + struct cmac *mac = &adap2pinfo(adap, idx)->mac; + /* + * We mask out interrupt causes for which we're not taking interrupts. + * This allows us to use polling logic to monitor some of the other + * conditions when taking interrupts would impose too much load on the + * system. + */ + u32 cause = t3_read_reg(adap, A_XGM_INT_CAUSE + mac->offset) & + ~F_RXFIFO_OVERFLOW; + + if (cause & V_TXFIFO_PRTY_ERR(M_TXFIFO_PRTY_ERR)) { + mac->stats.tx_fifo_parity_err++; + CH_ALERT(adap, "port%d: MAC TX FIFO parity error\n", idx); + } + if (cause & V_RXFIFO_PRTY_ERR(M_RXFIFO_PRTY_ERR)) { + mac->stats.rx_fifo_parity_err++; + CH_ALERT(adap, "port%d: MAC RX FIFO parity error\n", idx); + } + if (cause & F_TXFIFO_UNDERRUN) + mac->stats.tx_fifo_urun++; + if (cause & F_RXFIFO_OVERFLOW) + mac->stats.rx_fifo_ovfl++; + if (cause & V_SERDES_LOS(M_SERDES_LOS)) + mac->stats.serdes_signal_loss++; + if (cause & F_XAUIPCSCTCERR) + mac->stats.xaui_pcs_ctc_err++; + if (cause & F_XAUIPCSALIGNCHANGE) + mac->stats.xaui_pcs_align_change++; + if (cause & F_XGM_INT) { + t3_set_reg_field(adap, + A_XGM_INT_ENABLE + mac->offset, + F_XGM_INT, 0); + mac->stats.link_faults++; + + t3_os_link_fault_handler(adap, idx); + } + + if (cause & XGM_INTR_FATAL) + t3_fatal_err(adap); + + t3_write_reg(adap, A_XGM_INT_CAUSE + mac->offset, cause); + return cause != 0; +} + +/* + * Interrupt handler for PHY events. 
+ */ +int t3_phy_intr_handler(struct adapter *adapter) +{ + u32 i, cause = t3_read_reg(adapter, A_T3DBG_INT_CAUSE); + + for_each_port(adapter, i) { + struct port_info *p = adap2pinfo(adapter, i); + + if (!(p->phy.caps & SUPPORTED_IRQ)) + continue; + + if (cause & (1 << adapter_info(adapter)->gpio_intr[i])) { + int phy_cause = p->phy.ops->intr_handler(&p->phy); + + if (phy_cause & cphy_cause_link_change) + t3_link_changed(adapter, i); + if (phy_cause & cphy_cause_fifo_error) + p->phy.fifo_errors++; + if (phy_cause & cphy_cause_module_change) + t3_os_phymod_changed(adapter, i); + } + } + + t3_write_reg(adapter, A_T3DBG_INT_CAUSE, cause); + return 0; +} + +/* + * T3 slow path (non-data) interrupt handler. + */ +int t3_slow_intr_handler(struct adapter *adapter) +{ + u32 cause = t3_read_reg(adapter, A_PL_INT_CAUSE0); + + cause &= adapter->slow_intr_mask; + if (!cause) + return 0; + if (cause & F_PCIM0) { + if (is_pcie(adapter)) + pcie_intr_handler(adapter); + else + pci_intr_handler(adapter); + } + if (cause & F_SGE3) + t3_sge_err_intr_handler(adapter); + if (cause & F_MC7_PMRX) + mc7_intr_handler(&adapter->pmrx); + if (cause & F_MC7_PMTX) + mc7_intr_handler(&adapter->pmtx); + if (cause & F_MC7_CM) + mc7_intr_handler(&adapter->cm); + if (cause & F_CIM) + cim_intr_handler(adapter); + if (cause & F_TP1) + tp_intr_handler(adapter); + if (cause & F_ULP2_RX) + ulprx_intr_handler(adapter); + if (cause & F_ULP2_TX) + ulptx_intr_handler(adapter); + if (cause & F_PM1_RX) + pmrx_intr_handler(adapter); + if (cause & F_PM1_TX) + pmtx_intr_handler(adapter); + if (cause & F_CPL_SWITCH) + cplsw_intr_handler(adapter); + if (cause & F_MPS0) + mps_intr_handler(adapter); + if (cause & F_MC5A) + t3_mc5_intr_handler(&adapter->mc5); + if (cause & F_XGMAC0_0) + mac_intr_handler(adapter, 0); + if (cause & F_XGMAC0_1) + mac_intr_handler(adapter, 1); + if (cause & F_T3DBG) + t3_os_ext_intr_handler(adapter); + + /* Clear the interrupts just processed. */ + t3_write_reg(adapter, A_PL_INT_CAUSE0, cause); + t3_read_reg(adapter, A_PL_INT_CAUSE0); /* flush */ + return 1; +} + +static unsigned int calc_gpio_intr(struct adapter *adap) +{ + unsigned int i, gpi_intr = 0; + + for_each_port(adap, i) + if ((adap2pinfo(adap, i)->phy.caps & SUPPORTED_IRQ) && + adapter_info(adap)->gpio_intr[i]) + gpi_intr |= 1 << adapter_info(adap)->gpio_intr[i]; + return gpi_intr; +} + +/** + * t3_intr_enable - enable interrupts + * @adapter: the adapter whose interrupts should be enabled + * + * Enable interrupts by setting the interrupt enable registers of the + * various HW modules and then enabling the top-level interrupt + * concentrator. + */ +void t3_intr_enable(struct adapter *adapter) +{ + static const struct addr_val_pair intr_en_avp[] = { + {A_SG_INT_ENABLE, SGE_INTR_MASK}, + {A_MC7_INT_ENABLE, MC7_INTR_MASK}, + {A_MC7_INT_ENABLE - MC7_PMRX_BASE_ADDR + MC7_PMTX_BASE_ADDR, + MC7_INTR_MASK}, + {A_MC7_INT_ENABLE - MC7_PMRX_BASE_ADDR + MC7_CM_BASE_ADDR, + MC7_INTR_MASK}, + {A_MC5_DB_INT_ENABLE, MC5_INTR_MASK}, + {A_ULPRX_INT_ENABLE, ULPRX_INTR_MASK}, + {A_PM1_TX_INT_ENABLE, PMTX_INTR_MASK}, + {A_PM1_RX_INT_ENABLE, PMRX_INTR_MASK}, + {A_CIM_HOST_INT_ENABLE, CIM_INTR_MASK}, + {A_MPS_INT_ENABLE, MPS_INTR_MASK}, + }; + + adapter->slow_intr_mask = PL_INTR_MASK; + + t3_write_regs(adapter, intr_en_avp, ARRAY_SIZE(intr_en_avp), 0); + t3_write_reg(adapter, A_TP_INT_ENABLE, + adapter->params.rev >= T3_REV_C ? 
0x2bfffff : 0x3bfffff); + + if (adapter->params.rev > 0) { + t3_write_reg(adapter, A_CPL_INTR_ENABLE, + CPLSW_INTR_MASK | F_CIM_OVFL_ERROR); + t3_write_reg(adapter, A_ULPTX_INT_ENABLE, + ULPTX_INTR_MASK | F_PBL_BOUND_ERR_CH0 | + F_PBL_BOUND_ERR_CH1); + } else { + t3_write_reg(adapter, A_CPL_INTR_ENABLE, CPLSW_INTR_MASK); + t3_write_reg(adapter, A_ULPTX_INT_ENABLE, ULPTX_INTR_MASK); + } + + t3_write_reg(adapter, A_T3DBG_INT_ENABLE, calc_gpio_intr(adapter)); + + if (is_pcie(adapter)) + t3_write_reg(adapter, A_PCIE_INT_ENABLE, PCIE_INTR_MASK); + else + t3_write_reg(adapter, A_PCIX_INT_ENABLE, PCIX_INTR_MASK); + t3_write_reg(adapter, A_PL_INT_ENABLE0, adapter->slow_intr_mask); + t3_read_reg(adapter, A_PL_INT_ENABLE0); /* flush */ +} + +/** + * t3_intr_disable - disable a card's interrupts + * @adapter: the adapter whose interrupts should be disabled + * + * Disable interrupts. We only disable the top-level interrupt + * concentrator and the SGE data interrupts. + */ +void t3_intr_disable(struct adapter *adapter) +{ + t3_write_reg(adapter, A_PL_INT_ENABLE0, 0); + t3_read_reg(adapter, A_PL_INT_ENABLE0); /* flush */ + adapter->slow_intr_mask = 0; +} + +/** + * t3_intr_clear - clear all interrupts + * @adapter: the adapter whose interrupts should be cleared + * + * Clears all interrupts. + */ +void t3_intr_clear(struct adapter *adapter) +{ + static const unsigned int cause_reg_addr[] = { + A_SG_INT_CAUSE, + A_SG_RSPQ_FL_STATUS, + A_PCIX_INT_CAUSE, + A_MC7_INT_CAUSE, + A_MC7_INT_CAUSE - MC7_PMRX_BASE_ADDR + MC7_PMTX_BASE_ADDR, + A_MC7_INT_CAUSE - MC7_PMRX_BASE_ADDR + MC7_CM_BASE_ADDR, + A_CIM_HOST_INT_CAUSE, + A_TP_INT_CAUSE, + A_MC5_DB_INT_CAUSE, + A_ULPRX_INT_CAUSE, + A_ULPTX_INT_CAUSE, + A_CPL_INTR_CAUSE, + A_PM1_TX_INT_CAUSE, + A_PM1_RX_INT_CAUSE, + A_MPS_INT_CAUSE, + A_T3DBG_INT_CAUSE, + }; + unsigned int i; + + /* Clear PHY and MAC interrupts for each port. */ + for_each_port(adapter, i) + t3_port_intr_clear(adapter, i); + + for (i = 0; i < ARRAY_SIZE(cause_reg_addr); ++i) + t3_write_reg(adapter, cause_reg_addr[i], 0xffffffff); + + if (is_pcie(adapter)) + t3_write_reg(adapter, A_PCIE_PEX_ERR, 0xffffffff); + t3_write_reg(adapter, A_PL_INT_CAUSE0, 0xffffffff); + t3_read_reg(adapter, A_PL_INT_CAUSE0); /* flush */ +} + +void t3_xgm_intr_enable(struct adapter *adapter, int idx) +{ + struct port_info *pi = adap2pinfo(adapter, idx); + + t3_write_reg(adapter, A_XGM_XGM_INT_ENABLE + pi->mac.offset, + XGM_EXTRA_INTR_MASK); +} + +void t3_xgm_intr_disable(struct adapter *adapter, int idx) +{ + struct port_info *pi = adap2pinfo(adapter, idx); + + t3_write_reg(adapter, A_XGM_XGM_INT_DISABLE + pi->mac.offset, + 0x7ff); +} + +/** + * t3_port_intr_enable - enable port-specific interrupts + * @adapter: associated adapter + * @idx: index of port whose interrupts should be enabled + * + * Enable port-specific (i.e., MAC and PHY) interrupts for the given + * adapter port. + */ +void t3_port_intr_enable(struct adapter *adapter, int idx) +{ + struct cphy *phy = &adap2pinfo(adapter, idx)->phy; + + t3_write_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx), XGM_INTR_MASK); + t3_read_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx)); /* flush */ + phy->ops->intr_enable(phy); +} + +/** + * t3_port_intr_disable - disable port-specific interrupts + * @adapter: associated adapter + * @idx: index of port whose interrupts should be disabled + * + * Disable port-specific (i.e., MAC and PHY) interrupts for the given + * adapter port. 
+ */ +void t3_port_intr_disable(struct adapter *adapter, int idx) +{ + struct cphy *phy = &adap2pinfo(adapter, idx)->phy; + + t3_write_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx), 0); + t3_read_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx)); /* flush */ + phy->ops->intr_disable(phy); +} + +/** + * t3_port_intr_clear - clear port-specific interrupts + * @adapter: associated adapter + * @idx: index of port whose interrupts to clear + * + * Clear port-specific (i.e., MAC and PHY) interrupts for the given + * adapter port. + */ +static void t3_port_intr_clear(struct adapter *adapter, int idx) +{ + struct cphy *phy = &adap2pinfo(adapter, idx)->phy; + + t3_write_reg(adapter, XGM_REG(A_XGM_INT_CAUSE, idx), 0xffffffff); + t3_read_reg(adapter, XGM_REG(A_XGM_INT_CAUSE, idx)); /* flush */ + phy->ops->intr_clear(phy); +} + +#define SG_CONTEXT_CMD_ATTEMPTS 100 + +/** + * t3_sge_write_context - write an SGE context + * @adapter: the adapter + * @id: the context id + * @type: the context type + * + * Program an SGE context with the values already loaded in the + * CONTEXT_DATA? registers. + */ +static int t3_sge_write_context(struct adapter *adapter, unsigned int id, + unsigned int type) +{ + if (type == F_RESPONSEQ) { + /* + * Can't write the Response Queue Context bits for + * Interrupt Armed or the Reserve bits after the chip + * has been initialized out of reset. Writing to these + * bits can confuse the hardware. + */ + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0xffffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0xffffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0x17ffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0xffffffff); + } else { + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0xffffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0xffffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0xffffffff); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0xffffffff); + } + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | type | V_CONTEXT(id)); + return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * clear_sge_ctxt - completely clear an SGE context + * @adapter: the adapter + * @id: the context id + * @type: the context type + * + * Completely clear an SGE context. Used predominantly at post-reset + * initialization. Note in particular that we don't skip writing to any + * "sensitive bits" in the contexts the way that t3_sge_write_context() + * does ... 
+ */ +static int clear_sge_ctxt(struct adapter *adap, unsigned int id, + unsigned int type) +{ + t3_write_reg(adap, A_SG_CONTEXT_DATA0, 0); + t3_write_reg(adap, A_SG_CONTEXT_DATA1, 0); + t3_write_reg(adap, A_SG_CONTEXT_DATA2, 0); + t3_write_reg(adap, A_SG_CONTEXT_DATA3, 0); + t3_write_reg(adap, A_SG_CONTEXT_MASK0, 0xffffffff); + t3_write_reg(adap, A_SG_CONTEXT_MASK1, 0xffffffff); + t3_write_reg(adap, A_SG_CONTEXT_MASK2, 0xffffffff); + t3_write_reg(adap, A_SG_CONTEXT_MASK3, 0xffffffff); + t3_write_reg(adap, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | type | V_CONTEXT(id)); + return t3_wait_op_done(adap, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * t3_sge_init_ecntxt - initialize an SGE egress context + * @adapter: the adapter to configure + * @id: the context id + * @gts_enable: whether to enable GTS for the context + * @type: the egress context type + * @respq: associated response queue + * @base_addr: base address of queue + * @size: number of queue entries + * @token: uP token + * @gen: initial generation value for the context + * @cidx: consumer pointer + * + * Initialize an SGE egress context and make it ready for use. If the + * platform allows concurrent context operations, the caller is + * responsible for appropriate locking. + */ +int t3_sge_init_ecntxt(struct adapter *adapter, unsigned int id, int gts_enable, + enum sge_context_type type, int respq, u64 base_addr, + unsigned int size, unsigned int token, int gen, + unsigned int cidx) +{ + unsigned int credits = type == SGE_CNTXT_OFLD ? 0 : FW_WR_NUM; + + if (base_addr & 0xfff) /* must be 4K aligned */ + return -EINVAL; + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + base_addr >>= 12; + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, V_EC_INDEX(cidx) | + V_EC_CREDITS(credits) | V_EC_GTS(gts_enable)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA1, V_EC_SIZE(size) | + V_EC_BASE_LO(base_addr & 0xffff)); + base_addr >>= 16; + t3_write_reg(adapter, A_SG_CONTEXT_DATA2, base_addr); + base_addr >>= 32; + t3_write_reg(adapter, A_SG_CONTEXT_DATA3, + V_EC_BASE_HI(base_addr & 0xf) | V_EC_RESPQ(respq) | + V_EC_TYPE(type) | V_EC_GEN(gen) | V_EC_UP_TOKEN(token) | + F_EC_VALID); + return t3_sge_write_context(adapter, id, F_EGRESS); +} + +/** + * t3_sge_init_flcntxt - initialize an SGE free-buffer list context + * @adapter: the adapter to configure + * @id: the context id + * @gts_enable: whether to enable GTS for the context + * @base_addr: base address of queue + * @size: number of queue entries + * @bsize: size of each buffer for this queue + * @cong_thres: threshold to signal congestion to upstream producers + * @gen: initial generation value for the context + * @cidx: consumer pointer + * + * Initialize an SGE free list context and make it ready for use. The + * caller is responsible for ensuring only one context operation occurs + * at a time. 
+ */ +int t3_sge_init_flcntxt(struct adapter *adapter, unsigned int id, + int gts_enable, u64 base_addr, unsigned int size, + unsigned int bsize, unsigned int cong_thres, int gen, + unsigned int cidx) +{ + if (base_addr & 0xfff) /* must be 4K aligned */ + return -EINVAL; + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + base_addr >>= 12; + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, base_addr); + base_addr >>= 32; + t3_write_reg(adapter, A_SG_CONTEXT_DATA1, + V_FL_BASE_HI((u32) base_addr) | + V_FL_INDEX_LO(cidx & M_FL_INDEX_LO)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA2, V_FL_SIZE(size) | + V_FL_GEN(gen) | V_FL_INDEX_HI(cidx >> 12) | + V_FL_ENTRY_SIZE_LO(bsize & M_FL_ENTRY_SIZE_LO)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA3, + V_FL_ENTRY_SIZE_HI(bsize >> (32 - S_FL_ENTRY_SIZE_LO)) | + V_FL_CONG_THRES(cong_thres) | V_FL_GTS(gts_enable)); + return t3_sge_write_context(adapter, id, F_FREELIST); +} + +/** + * t3_sge_init_rspcntxt - initialize an SGE response queue context + * @adapter: the adapter to configure + * @id: the context id + * @irq_vec_idx: MSI-X interrupt vector index, 0 if no MSI-X, -1 if no IRQ + * @base_addr: base address of queue + * @size: number of queue entries + * @fl_thres: threshold for selecting the normal or jumbo free list + * @gen: initial generation value for the context + * @cidx: consumer pointer + * + * Initialize an SGE response queue context and make it ready for use. + * The caller is responsible for ensuring only one context operation + * occurs at a time. + */ +int t3_sge_init_rspcntxt(struct adapter *adapter, unsigned int id, + int irq_vec_idx, u64 base_addr, unsigned int size, + unsigned int fl_thres, int gen, unsigned int cidx) +{ + unsigned int intr = 0; + + if (base_addr & 0xfff) /* must be 4K aligned */ + return -EINVAL; + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + base_addr >>= 12; + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, V_CQ_SIZE(size) | + V_CQ_INDEX(cidx)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA1, base_addr); + base_addr >>= 32; + if (irq_vec_idx >= 0) + intr = V_RQ_MSI_VEC(irq_vec_idx) | F_RQ_INTR_EN; + t3_write_reg(adapter, A_SG_CONTEXT_DATA2, + V_CQ_BASE_HI((u32) base_addr) | intr | V_RQ_GEN(gen)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA3, fl_thres); + return t3_sge_write_context(adapter, id, F_RESPONSEQ); +} + +/** + * t3_sge_init_cqcntxt - initialize an SGE completion queue context + * @adapter: the adapter to configure + * @id: the context id + * @base_addr: base address of queue + * @size: number of queue entries + * @rspq: response queue for async notifications + * @ovfl_mode: CQ overflow mode + * @credits: completion queue credits + * @credit_thres: the credit threshold + * + * Initialize an SGE completion queue context and make it ready for use. + * The caller is responsible for ensuring only one context operation + * occurs at a time. 
+ */ +int t3_sge_init_cqcntxt(struct adapter *adapter, unsigned int id, u64 base_addr, + unsigned int size, int rspq, int ovfl_mode, + unsigned int credits, unsigned int credit_thres) +{ + if (base_addr & 0xfff) /* must be 4K aligned */ + return -EINVAL; + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + base_addr >>= 12; + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, V_CQ_SIZE(size)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA1, base_addr); + base_addr >>= 32; + t3_write_reg(adapter, A_SG_CONTEXT_DATA2, + V_CQ_BASE_HI((u32) base_addr) | V_CQ_RSPQ(rspq) | + V_CQ_GEN(1) | V_CQ_OVERFLOW_MODE(ovfl_mode) | + V_CQ_ERR(ovfl_mode)); + t3_write_reg(adapter, A_SG_CONTEXT_DATA3, V_CQ_CREDITS(credits) | + V_CQ_CREDIT_THRES(credit_thres)); + return t3_sge_write_context(adapter, id, F_CQ); +} + +/** + * t3_sge_enable_ecntxt - enable/disable an SGE egress context + * @adapter: the adapter + * @id: the egress context id + * @enable: enable (1) or disable (0) the context + * + * Enable or disable an SGE egress context. The caller is responsible for + * ensuring only one context operation occurs at a time. + */ +int t3_sge_enable_ecntxt(struct adapter *adapter, unsigned int id, int enable) +{ + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, F_EC_VALID); + t3_write_reg(adapter, A_SG_CONTEXT_DATA3, V_EC_VALID(enable)); + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | F_EGRESS | V_CONTEXT(id)); + return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * t3_sge_disable_fl - disable an SGE free-buffer list + * @adapter: the adapter + * @id: the free list context id + * + * Disable an SGE free-buffer list. The caller is responsible for + * ensuring only one context operation occurs at a time. + */ +int t3_sge_disable_fl(struct adapter *adapter, unsigned int id) +{ + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, V_FL_SIZE(M_FL_SIZE)); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0); + t3_write_reg(adapter, A_SG_CONTEXT_DATA2, 0); + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | F_FREELIST | V_CONTEXT(id)); + return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * t3_sge_disable_rspcntxt - disable an SGE response queue + * @adapter: the adapter + * @id: the response queue context id + * + * Disable an SGE response queue. The caller is responsible for + * ensuring only one context operation occurs at a time. 
+ */ +int t3_sge_disable_rspcntxt(struct adapter *adapter, unsigned int id) +{ + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, V_CQ_SIZE(M_CQ_SIZE)); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0); + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, 0); + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | F_RESPONSEQ | V_CONTEXT(id)); + return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * t3_sge_disable_cqcntxt - disable an SGE completion queue + * @adapter: the adapter + * @id: the completion queue context id + * + * Disable an SGE completion queue. The caller is responsible for + * ensuring only one context operation occurs at a time. + */ +int t3_sge_disable_cqcntxt(struct adapter *adapter, unsigned int id) +{ + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + t3_write_reg(adapter, A_SG_CONTEXT_MASK0, V_CQ_SIZE(M_CQ_SIZE)); + t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0); + t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0); + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, 0); + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(1) | F_CQ | V_CONTEXT(id)); + return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1); +} + +/** + * t3_sge_cqcntxt_op - perform an operation on a completion queue context + * @adapter: the adapter + * @id: the context id + * @op: the operation to perform + * + * Perform the selected operation on an SGE completion queue context. + * The caller is responsible for ensuring only one context operation + * occurs at a time. + */ +int t3_sge_cqcntxt_op(struct adapter *adapter, unsigned int id, unsigned int op, + unsigned int credits) +{ + u32 val; + + if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + t3_write_reg(adapter, A_SG_CONTEXT_DATA0, credits << 16); + t3_write_reg(adapter, A_SG_CONTEXT_CMD, V_CONTEXT_CMD_OPCODE(op) | + V_CONTEXT(id) | F_CQ); + if (t3_wait_op_done_val(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY, + 0, SG_CONTEXT_CMD_ATTEMPTS, 1, &val)) + return -EIO; + + if (op >= 2 && op < 7) { + if (adapter->params.rev > 0) + return G_CQ_INDEX(val); + + t3_write_reg(adapter, A_SG_CONTEXT_CMD, + V_CONTEXT_CMD_OPCODE(0) | F_CQ | V_CONTEXT(id)); + if (t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, + F_CONTEXT_CMD_BUSY, 0, + SG_CONTEXT_CMD_ATTEMPTS, 1)) + return -EIO; + return G_CQ_INDEX(t3_read_reg(adapter, A_SG_CONTEXT_DATA0)); + } + return 0; +} + +/** + * t3_config_rss - configure Rx packet steering + * @adapter: the adapter + * @rss_config: RSS settings (written to TP_RSS_CONFIG) + * @cpus: values for the CPU lookup table (0xff terminated) + * @rspq: values for the response queue lookup table (0xffff terminated) + * + * Programs the receive packet steering logic. @cpus and @rspq provide + * the values for the CPU and response queue lookup tables. If they + * provide fewer values than the size of the tables the supplied values + * are used repeatedly until the tables are fully populated. 
+ */ +void t3_config_rss(struct adapter *adapter, unsigned int rss_config, + const u8 * cpus, const u16 *rspq) +{ + int i, j, cpu_idx = 0, q_idx = 0; + + if (cpus) + for (i = 0; i < RSS_TABLE_SIZE; ++i) { + u32 val = i << 16; + + for (j = 0; j < 2; ++j) { + val |= (cpus[cpu_idx++] & 0x3f) << (8 * j); + if (cpus[cpu_idx] == 0xff) + cpu_idx = 0; + } + t3_write_reg(adapter, A_TP_RSS_LKP_TABLE, val); + } + + if (rspq) + for (i = 0; i < RSS_TABLE_SIZE; ++i) { + t3_write_reg(adapter, A_TP_RSS_MAP_TABLE, + (i << 16) | rspq[q_idx++]); + if (rspq[q_idx] == 0xffff) + q_idx = 0; + } + + t3_write_reg(adapter, A_TP_RSS_CONFIG, rss_config); +} + +/** + * t3_tp_set_offload_mode - put TP in NIC/offload mode + * @adap: the adapter + * @enable: 1 to select offload mode, 0 for regular NIC + * + * Switches TP to NIC/offload mode. + */ +void t3_tp_set_offload_mode(struct adapter *adap, int enable) +{ + if (is_offload(adap) || !enable) + t3_set_reg_field(adap, A_TP_IN_CONFIG, F_NICMODE, + V_NICMODE(!enable)); +} + +/** + * pm_num_pages - calculate the number of pages of the payload memory + * @mem_size: the size of the payload memory + * @pg_size: the size of each payload memory page + * + * Calculate the number of pages, each of the given size, that fit in a + * memory of the specified size, respecting the HW requirement that the + * number of pages must be a multiple of 24. + */ +static inline unsigned int pm_num_pages(unsigned int mem_size, + unsigned int pg_size) +{ + unsigned int n = mem_size / pg_size; + + return n - n % 24; +} + +#define mem_region(adap, start, size, reg) \ + t3_write_reg((adap), A_ ## reg, (start)); \ + start += size + +/** + * partition_mem - partition memory and configure TP memory settings + * @adap: the adapter + * @p: the TP parameters + * + * Partitions context and payload memory and configures TP's memory + * registers. 
+ */ +static void partition_mem(struct adapter *adap, const struct tp_params *p) +{ + unsigned int m, pstructs, tids = t3_mc5_size(&adap->mc5); + unsigned int timers = 0, timers_shift = 22; + + if (adap->params.rev > 0) { + if (tids <= 16 * 1024) { + timers = 1; + timers_shift = 16; + } else if (tids <= 64 * 1024) { + timers = 2; + timers_shift = 18; + } else if (tids <= 256 * 1024) { + timers = 3; + timers_shift = 20; + } + } + + t3_write_reg(adap, A_TP_PMM_SIZE, + p->chan_rx_size | (p->chan_tx_size >> 16)); + + t3_write_reg(adap, A_TP_PMM_TX_BASE, 0); + t3_write_reg(adap, A_TP_PMM_TX_PAGE_SIZE, p->tx_pg_size); + t3_write_reg(adap, A_TP_PMM_TX_MAX_PAGE, p->tx_num_pgs); + t3_set_reg_field(adap, A_TP_PARA_REG3, V_TXDATAACKIDX(M_TXDATAACKIDX), + V_TXDATAACKIDX(fls(p->tx_pg_size) - 12)); + + t3_write_reg(adap, A_TP_PMM_RX_BASE, 0); + t3_write_reg(adap, A_TP_PMM_RX_PAGE_SIZE, p->rx_pg_size); + t3_write_reg(adap, A_TP_PMM_RX_MAX_PAGE, p->rx_num_pgs); + + pstructs = p->rx_num_pgs + p->tx_num_pgs; + /* Add a bit of headroom and make multiple of 24 */ + pstructs += 48; + pstructs -= pstructs % 24; + t3_write_reg(adap, A_TP_CMM_MM_MAX_PSTRUCT, pstructs); + + m = tids * TCB_SIZE; + mem_region(adap, m, (64 << 10) * 64, SG_EGR_CNTX_BADDR); + mem_region(adap, m, (64 << 10) * 64, SG_CQ_CONTEXT_BADDR); + t3_write_reg(adap, A_TP_CMM_TIMER_BASE, V_CMTIMERMAXNUM(timers) | m); + m += ((p->ntimer_qs - 1) << timers_shift) + (1 << 22); + mem_region(adap, m, pstructs * 64, TP_CMM_MM_BASE); + mem_region(adap, m, 64 * (pstructs / 24), TP_CMM_MM_PS_FLST_BASE); + mem_region(adap, m, 64 * (p->rx_num_pgs / 24), TP_CMM_MM_RX_FLST_BASE); + mem_region(adap, m, 64 * (p->tx_num_pgs / 24), TP_CMM_MM_TX_FLST_BASE); + + m = (m + 4095) & ~0xfff; + t3_write_reg(adap, A_CIM_SDRAM_BASE_ADDR, m); + t3_write_reg(adap, A_CIM_SDRAM_ADDR_SIZE, p->cm_size - m); + + tids = (p->cm_size - m - (3 << 20)) / 3072 - 32; + m = t3_mc5_size(&adap->mc5) - adap->params.mc5.nservers - + adap->params.mc5.nfilters - adap->params.mc5.nroutes; + if (tids < m) + adap->params.mc5.nservers += m - tids; +} + +static inline void tp_wr_indirect(struct adapter *adap, unsigned int addr, + u32 val) +{ + t3_write_reg(adap, A_TP_PIO_ADDR, addr); + t3_write_reg(adap, A_TP_PIO_DATA, val); +} + +static void tp_config(struct adapter *adap, const struct tp_params *p) +{ + t3_write_reg(adap, A_TP_GLOBAL_CONFIG, F_TXPACINGENABLE | F_PATHMTU | + F_IPCHECKSUMOFFLOAD | F_UDPCHECKSUMOFFLOAD | + F_TCPCHECKSUMOFFLOAD | V_IPTTL(64)); + t3_write_reg(adap, A_TP_TCP_OPTIONS, V_MTUDEFAULT(576) | + F_MTUENABLE | V_WINDOWSCALEMODE(1) | + V_TIMESTAMPSMODE(1) | V_SACKMODE(1) | V_SACKRX(1)); + t3_write_reg(adap, A_TP_DACK_CONFIG, V_AUTOSTATE3(1) | + V_AUTOSTATE2(1) | V_AUTOSTATE1(0) | + V_BYTETHRESHOLD(26880) | V_MSSTHRESHOLD(2) | + F_AUTOCAREFUL | F_AUTOENABLE | V_DACK_MODE(1)); + t3_set_reg_field(adap, A_TP_IN_CONFIG, F_RXFBARBPRIO | F_TXFBARBPRIO, + F_IPV6ENABLE | F_NICMODE); + t3_write_reg(adap, A_TP_TX_RESOURCE_LIMIT, 0x18141814); + t3_write_reg(adap, A_TP_PARA_REG4, 0x5050105); + t3_set_reg_field(adap, A_TP_PARA_REG6, 0, + adap->params.rev > 0 ? 
F_ENABLEESND : + F_T3A_ENABLEESND); + + t3_set_reg_field(adap, A_TP_PC_CONFIG, + F_ENABLEEPCMDAFULL, + F_ENABLEOCSPIFULL |F_TXDEFERENABLE | F_HEARBEATDACK | + F_TXCONGESTIONMODE | F_RXCONGESTIONMODE); + t3_set_reg_field(adap, A_TP_PC_CONFIG2, F_CHDRAFULL, + F_ENABLEIPV6RSS | F_ENABLENONOFDTNLSYN | + F_ENABLEARPMISS | F_DISBLEDAPARBIT0); + t3_write_reg(adap, A_TP_PROXY_FLOW_CNTL, 1080); + t3_write_reg(adap, A_TP_PROXY_FLOW_CNTL, 1000); + + if (adap->params.rev > 0) { + tp_wr_indirect(adap, A_TP_EGRESS_CONFIG, F_REWRITEFORCETOSIZE); + t3_set_reg_field(adap, A_TP_PARA_REG3, F_TXPACEAUTO, + F_TXPACEAUTO); + t3_set_reg_field(adap, A_TP_PC_CONFIG, F_LOCKTID, F_LOCKTID); + t3_set_reg_field(adap, A_TP_PARA_REG3, 0, F_TXPACEAUTOSTRICT); + } else + t3_set_reg_field(adap, A_TP_PARA_REG3, 0, F_TXPACEFIXED); + + if (adap->params.rev == T3_REV_C) + t3_set_reg_field(adap, A_TP_PC_CONFIG, + V_TABLELATENCYDELTA(M_TABLELATENCYDELTA), + V_TABLELATENCYDELTA(4)); + + t3_write_reg(adap, A_TP_TX_MOD_QUEUE_WEIGHT1, 0); + t3_write_reg(adap, A_TP_TX_MOD_QUEUE_WEIGHT0, 0); + t3_write_reg(adap, A_TP_MOD_CHANNEL_WEIGHT, 0); + t3_write_reg(adap, A_TP_MOD_RATE_LIMIT, 0xf2200000); +} + +/* Desired TP timer resolution in usec */ +#define TP_TMR_RES 50 + +/* TCP timer values in ms */ +#define TP_DACK_TIMER 50 +#define TP_RTO_MIN 250 + +/** + * tp_set_timers - set TP timing parameters + * @adap: the adapter to set + * @core_clk: the core clock frequency in Hz + * + * Set TP's timing parameters, such as the various timer resolutions and + * the TCP timer values. + */ +static void tp_set_timers(struct adapter *adap, unsigned int core_clk) +{ + unsigned int tre = fls(core_clk / (1000000 / TP_TMR_RES)) - 1; + unsigned int dack_re = fls(core_clk / 5000) - 1; /* 200us */ + unsigned int tstamp_re = fls(core_clk / 1000); /* 1ms, at least */ + unsigned int tps = core_clk >> tre; + + t3_write_reg(adap, A_TP_TIMER_RESOLUTION, V_TIMERRESOLUTION(tre) | + V_DELAYEDACKRESOLUTION(dack_re) | + V_TIMESTAMPRESOLUTION(tstamp_re)); + t3_write_reg(adap, A_TP_DACK_TIMER, + (core_clk >> dack_re) / (1000 / TP_DACK_TIMER)); + t3_write_reg(adap, A_TP_TCP_BACKOFF_REG0, 0x3020100); + t3_write_reg(adap, A_TP_TCP_BACKOFF_REG1, 0x7060504); + t3_write_reg(adap, A_TP_TCP_BACKOFF_REG2, 0xb0a0908); + t3_write_reg(adap, A_TP_TCP_BACKOFF_REG3, 0xf0e0d0c); + t3_write_reg(adap, A_TP_SHIFT_CNT, V_SYNSHIFTMAX(6) | + V_RXTSHIFTMAXR1(4) | V_RXTSHIFTMAXR2(15) | + V_PERSHIFTBACKOFFMAX(8) | V_PERSHIFTMAX(8) | + V_KEEPALIVEMAX(9)); + +#define SECONDS * tps + + t3_write_reg(adap, A_TP_MSL, adap->params.rev > 0 ? 0 : 2 SECONDS); + t3_write_reg(adap, A_TP_RXT_MIN, tps / (1000 / TP_RTO_MIN)); + t3_write_reg(adap, A_TP_RXT_MAX, 64 SECONDS); + t3_write_reg(adap, A_TP_PERS_MIN, 5 SECONDS); + t3_write_reg(adap, A_TP_PERS_MAX, 64 SECONDS); + t3_write_reg(adap, A_TP_KEEP_IDLE, 7200 SECONDS); + t3_write_reg(adap, A_TP_KEEP_INTVL, 75 SECONDS); + t3_write_reg(adap, A_TP_INIT_SRTT, 3 SECONDS); + t3_write_reg(adap, A_TP_FINWAIT2_TIMER, 600 SECONDS); + +#undef SECONDS +} + +/** + * t3_tp_set_coalescing_size - set receive coalescing size + * @adap: the adapter + * @size: the receive coalescing size + * @psh: whether a set PSH bit should deliver coalesced data + * + * Set the receive coalescing size and PSH bit handling. 
+ */ +static int t3_tp_set_coalescing_size(struct adapter *adap, + unsigned int size, int psh) +{ + u32 val; + + if (size > MAX_RX_COALESCING_LEN) + return -EINVAL; + + val = t3_read_reg(adap, A_TP_PARA_REG3); + val &= ~(F_RXCOALESCEENABLE | F_RXCOALESCEPSHEN); + + if (size) { + val |= F_RXCOALESCEENABLE; + if (psh) + val |= F_RXCOALESCEPSHEN; + size = min(MAX_RX_COALESCING_LEN, size); + t3_write_reg(adap, A_TP_PARA_REG2, V_RXCOALESCESIZE(size) | + V_MAXRXDATA(MAX_RX_COALESCING_LEN)); + } + t3_write_reg(adap, A_TP_PARA_REG3, val); + return 0; +} + +/** + * t3_tp_set_max_rxsize - set the max receive size + * @adap: the adapter + * @size: the max receive size + * + * Set TP's max receive size. This is the limit that applies when + * receive coalescing is disabled. + */ +static void t3_tp_set_max_rxsize(struct adapter *adap, unsigned int size) +{ + t3_write_reg(adap, A_TP_PARA_REG7, + V_PMMAXXFERLEN0(size) | V_PMMAXXFERLEN1(size)); +} + +static void init_mtus(unsigned short mtus[]) +{ + /* + * See draft-mathis-plpmtud-00.txt for the values. The min is 88 so + * it can accommodate max size TCP/IP headers when SACK and timestamps + * are enabled and still have at least 8 bytes of payload. + */ + mtus[0] = 88; + mtus[1] = 88; + mtus[2] = 256; + mtus[3] = 512; + mtus[4] = 576; + mtus[5] = 1024; + mtus[6] = 1280; + mtus[7] = 1492; + mtus[8] = 1500; + mtus[9] = 2002; + mtus[10] = 2048; + mtus[11] = 4096; + mtus[12] = 4352; + mtus[13] = 8192; + mtus[14] = 9000; + mtus[15] = 9600; +} + +/* + * Initial congestion control parameters. + */ +static void init_cong_ctrl(unsigned short *a, unsigned short *b) +{ + a[0] = a[1] = a[2] = a[3] = a[4] = a[5] = a[6] = a[7] = a[8] = 1; + a[9] = 2; + a[10] = 3; + a[11] = 4; + a[12] = 5; + a[13] = 6; + a[14] = 7; + a[15] = 8; + a[16] = 9; + a[17] = 10; + a[18] = 14; + a[19] = 17; + a[20] = 21; + a[21] = 25; + a[22] = 30; + a[23] = 35; + a[24] = 45; + a[25] = 60; + a[26] = 80; + a[27] = 100; + a[28] = 200; + a[29] = 300; + a[30] = 400; + a[31] = 500; + + b[0] = b[1] = b[2] = b[3] = b[4] = b[5] = b[6] = b[7] = b[8] = 0; + b[9] = b[10] = 1; + b[11] = b[12] = 2; + b[13] = b[14] = b[15] = b[16] = 3; + b[17] = b[18] = b[19] = b[20] = b[21] = 4; + b[22] = b[23] = b[24] = b[25] = b[26] = b[27] = 5; + b[28] = b[29] = 6; + b[30] = b[31] = 7; +} + +/* The minimum additive increment value for the congestion control table */ +#define CC_MIN_INCR 2U + +/** + * t3_load_mtus - write the MTU and congestion control HW tables + * @adap: the adapter + * @mtus: the unrestricted values for the MTU table + * @alphs: the values for the congestion control alpha parameter + * @beta: the values for the congestion control beta parameter + * @mtu_cap: the maximum permitted effective MTU + * + * Write the MTU table with the supplied MTUs capping each at &mtu_cap. + * Update the high-speed congestion control table with the supplied alpha, + * beta, and MTUs. 
+ */ +void t3_load_mtus(struct adapter *adap, unsigned short mtus[NMTUS], + unsigned short alpha[NCCTRL_WIN], + unsigned short beta[NCCTRL_WIN], unsigned short mtu_cap) +{ + static const unsigned int avg_pkts[NCCTRL_WIN] = { + 2, 6, 10, 14, 20, 28, 40, 56, 80, 112, 160, 224, 320, 448, 640, + 896, 1281, 1792, 2560, 3584, 5120, 7168, 10240, 14336, 20480, + 28672, 40960, 57344, 81920, 114688, 163840, 229376 + }; + + unsigned int i, w; + + for (i = 0; i < NMTUS; ++i) { + unsigned int mtu = min(mtus[i], mtu_cap); + unsigned int log2 = fls(mtu); + + if (!(mtu & ((1 << log2) >> 2))) /* round */ + log2--; + t3_write_reg(adap, A_TP_MTU_TABLE, + (i << 24) | (log2 << 16) | mtu); + + for (w = 0; w < NCCTRL_WIN; ++w) { + unsigned int inc; + + inc = max(((mtu - 40) * alpha[w]) / avg_pkts[w], + CC_MIN_INCR); + + t3_write_reg(adap, A_TP_CCTRL_TABLE, (i << 21) | + (w << 16) | (beta[w] << 13) | inc); + } + } +} + +/** + * t3_tp_get_mib_stats - read TP's MIB counters + * @adap: the adapter + * @tps: holds the returned counter values + * + * Returns the values of TP's MIB counters. + */ +void t3_tp_get_mib_stats(struct adapter *adap, struct tp_mib_stats *tps) +{ + t3_read_indirect(adap, A_TP_MIB_INDEX, A_TP_MIB_RDATA, (u32 *) tps, + sizeof(*tps) / sizeof(u32), 0); +} + +#define ulp_region(adap, name, start, len) \ + t3_write_reg((adap), A_ULPRX_ ## name ## _LLIMIT, (start)); \ + t3_write_reg((adap), A_ULPRX_ ## name ## _ULIMIT, \ + (start) + (len) - 1); \ + start += len + +#define ulptx_region(adap, name, start, len) \ + t3_write_reg((adap), A_ULPTX_ ## name ## _LLIMIT, (start)); \ + t3_write_reg((adap), A_ULPTX_ ## name ## _ULIMIT, \ + (start) + (len) - 1) + +static void ulp_config(struct adapter *adap, const struct tp_params *p) +{ + unsigned int m = p->chan_rx_size; + + ulp_region(adap, ISCSI, m, p->chan_rx_size / 8); + ulp_region(adap, TDDP, m, p->chan_rx_size / 8); + ulptx_region(adap, TPT, m, p->chan_rx_size / 4); + ulp_region(adap, STAG, m, p->chan_rx_size / 4); + ulp_region(adap, RQ, m, p->chan_rx_size / 4); + ulptx_region(adap, PBL, m, p->chan_rx_size / 4); + ulp_region(adap, PBL, m, p->chan_rx_size / 4); + t3_write_reg(adap, A_ULPRX_TDDP_TAGMASK, 0xffffffff); +} + +/** + * t3_set_proto_sram - set the contents of the protocol sram + * @adapter: the adapter + * @data: the protocol image + * + * Write the contents of the protocol SRAM. 
+ */ +int t3_set_proto_sram(struct adapter *adap, const u8 *data) +{ + int i; + const __be32 *buf = (const __be32 *)data; + + for (i = 0; i < PROTO_SRAM_LINES; i++) { + t3_write_reg(adap, A_TP_EMBED_OP_FIELD5, be32_to_cpu(*buf++)); + t3_write_reg(adap, A_TP_EMBED_OP_FIELD4, be32_to_cpu(*buf++)); + t3_write_reg(adap, A_TP_EMBED_OP_FIELD3, be32_to_cpu(*buf++)); + t3_write_reg(adap, A_TP_EMBED_OP_FIELD2, be32_to_cpu(*buf++)); + t3_write_reg(adap, A_TP_EMBED_OP_FIELD1, be32_to_cpu(*buf++)); + + t3_write_reg(adap, A_TP_EMBED_OP_FIELD0, i << 1 | 1 << 31); + if (t3_wait_op_done(adap, A_TP_EMBED_OP_FIELD0, 1, 1, 5, 1)) + return -EIO; + } + t3_write_reg(adap, A_TP_EMBED_OP_FIELD0, 0); + + return 0; +} + +void t3_config_trace_filter(struct adapter *adapter, + const struct trace_params *tp, int filter_index, + int invert, int enable) +{ + u32 addr, key[4], mask[4]; + + key[0] = tp->sport | (tp->sip << 16); + key[1] = (tp->sip >> 16) | (tp->dport << 16); + key[2] = tp->dip; + key[3] = tp->proto | (tp->vlan << 8) | (tp->intf << 20); + + mask[0] = tp->sport_mask | (tp->sip_mask << 16); + mask[1] = (tp->sip_mask >> 16) | (tp->dport_mask << 16); + mask[2] = tp->dip_mask; + mask[3] = tp->proto_mask | (tp->vlan_mask << 8) | (tp->intf_mask << 20); + + if (invert) + key[3] |= (1 << 29); + if (enable) + key[3] |= (1 << 28); + + addr = filter_index ? A_TP_RX_TRC_KEY0 : A_TP_TX_TRC_KEY0; + tp_wr_indirect(adapter, addr++, key[0]); + tp_wr_indirect(adapter, addr++, mask[0]); + tp_wr_indirect(adapter, addr++, key[1]); + tp_wr_indirect(adapter, addr++, mask[1]); + tp_wr_indirect(adapter, addr++, key[2]); + tp_wr_indirect(adapter, addr++, mask[2]); + tp_wr_indirect(adapter, addr++, key[3]); + tp_wr_indirect(adapter, addr, mask[3]); + t3_read_reg(adapter, A_TP_PIO_DATA); +} + +/** + * t3_config_sched - configure a HW traffic scheduler + * @adap: the adapter + * @kbps: target rate in Kbps + * @sched: the scheduler index + * + * Configure a HW scheduler for the target rate + */ +int t3_config_sched(struct adapter *adap, unsigned int kbps, int sched) +{ + unsigned int v, tps, cpt, bpt, delta, mindelta = ~0; + unsigned int clk = adap->params.vpd.cclk * 1000; + unsigned int selected_cpt = 0, selected_bpt = 0; + + if (kbps > 0) { + kbps *= 125; /* -> bytes */ + for (cpt = 1; cpt <= 255; cpt++) { + tps = clk / cpt; + bpt = (kbps + tps / 2) / tps; + if (bpt > 0 && bpt <= 255) { + v = bpt * tps; + delta = v >= kbps ? 
v - kbps : kbps - v; + if (delta <= mindelta) { + mindelta = delta; + selected_cpt = cpt; + selected_bpt = bpt; + } + } else if (selected_cpt) + break; + } + if (!selected_cpt) + return -EINVAL; + } + t3_write_reg(adap, A_TP_TM_PIO_ADDR, + A_TP_TX_MOD_Q1_Q0_RATE_LIMIT - sched / 2); + v = t3_read_reg(adap, A_TP_TM_PIO_DATA); + if (sched & 1) + v = (v & 0xffff) | (selected_cpt << 16) | (selected_bpt << 24); + else + v = (v & 0xffff0000) | selected_cpt | (selected_bpt << 8); + t3_write_reg(adap, A_TP_TM_PIO_DATA, v); + return 0; +} + +static int tp_init(struct adapter *adap, const struct tp_params *p) +{ + int busy = 0; + + tp_config(adap, p); + t3_set_vlan_accel(adap, 3, 0); + + if (is_offload(adap)) { + tp_set_timers(adap, adap->params.vpd.cclk * 1000); + t3_write_reg(adap, A_TP_RESET, F_FLSTINITENABLE); + busy = t3_wait_op_done(adap, A_TP_RESET, F_FLSTINITENABLE, + 0, 1000, 5); + if (busy) + CH_ERR(adap, "TP initialization timed out\n"); + } + + if (!busy) + t3_write_reg(adap, A_TP_RESET, F_TPRESET); + return busy; +} + +/* + * Perform the bits of HW initialization that are dependent on the Tx + * channels being used. + */ +static void chan_init_hw(struct adapter *adap, unsigned int chan_map) +{ + int i; + + if (chan_map != 3) { /* one channel */ + t3_set_reg_field(adap, A_ULPRX_CTL, F_ROUND_ROBIN, 0); + t3_set_reg_field(adap, A_ULPTX_CONFIG, F_CFG_RR_ARB, 0); + t3_write_reg(adap, A_MPS_CFG, F_TPRXPORTEN | F_ENFORCEPKT | + (chan_map == 1 ? F_TPTXPORT0EN | F_PORT0ACTIVE : + F_TPTXPORT1EN | F_PORT1ACTIVE)); + t3_write_reg(adap, A_PM1_TX_CFG, + chan_map == 1 ? 0xffffffff : 0); + } else { /* two channels */ + t3_set_reg_field(adap, A_ULPRX_CTL, 0, F_ROUND_ROBIN); + t3_set_reg_field(adap, A_ULPTX_CONFIG, 0, F_CFG_RR_ARB); + t3_write_reg(adap, A_ULPTX_DMA_WEIGHT, + V_D1_WEIGHT(16) | V_D0_WEIGHT(16)); + t3_write_reg(adap, A_MPS_CFG, F_TPTXPORT0EN | F_TPTXPORT1EN | + F_TPRXPORTEN | F_PORT0ACTIVE | F_PORT1ACTIVE | + F_ENFORCEPKT); + t3_write_reg(adap, A_PM1_TX_CFG, 0x80008000); + t3_set_reg_field(adap, A_TP_PC_CONFIG, 0, F_TXTOSQUEUEMAPMODE); + t3_write_reg(adap, A_TP_TX_MOD_QUEUE_REQ_MAP, + V_TX_MOD_QUEUE_REQ_MAP(0xaa)); + for (i = 0; i < 16; i++) + t3_write_reg(adap, A_TP_TX_MOD_QUE_TABLE, + (i << 16) | 0x1010); + } +} + +static int calibrate_xgm(struct adapter *adapter) +{ + if (uses_xaui(adapter)) { + unsigned int v, i; + + for (i = 0; i < 5; ++i) { + t3_write_reg(adapter, A_XGM_XAUI_IMP, 0); + t3_read_reg(adapter, A_XGM_XAUI_IMP); + msleep(1); + v = t3_read_reg(adapter, A_XGM_XAUI_IMP); + if (!(v & (F_XGM_CALFAULT | F_CALBUSY))) { + t3_write_reg(adapter, A_XGM_XAUI_IMP, + V_XAUIIMP(G_CALIMP(v) >> 2)); + return 0; + } + } + CH_ERR(adapter, "MAC calibration failed\n"); + return -1; + } else { + t3_write_reg(adapter, A_XGM_RGMII_IMP, + V_RGMIIIMPPD(2) | V_RGMIIIMPPU(3)); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_XGM_IMPSETUPDATE, + F_XGM_IMPSETUPDATE); + } + return 0; +} + +static void calibrate_xgm_t3b(struct adapter *adapter) +{ + if (!uses_xaui(adapter)) { + t3_write_reg(adapter, A_XGM_RGMII_IMP, F_CALRESET | + F_CALUPDATE | V_RGMIIIMPPD(2) | V_RGMIIIMPPU(3)); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_CALRESET, 0); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, 0, + F_XGM_IMPSETUPDATE); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_XGM_IMPSETUPDATE, + 0); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_CALUPDATE, 0); + t3_set_reg_field(adapter, A_XGM_RGMII_IMP, 0, F_CALUPDATE); + } +} + +struct mc7_timing_params { + unsigned char ActToPreDly; + unsigned char ActToRdWrDly; + unsigned 
char PreCyc; + unsigned char RefCyc[5]; + unsigned char BkCyc; + unsigned char WrToRdDly; + unsigned char RdToWrDly; +}; + +/* + * Write a value to a register and check that the write completed. These + * writes normally complete in a cycle or two, so one read should suffice. + * The very first read exists to flush the posted write to the device. + */ +static int wrreg_wait(struct adapter *adapter, unsigned int addr, u32 val) +{ + t3_write_reg(adapter, addr, val); + t3_read_reg(adapter, addr); /* flush */ + if (!(t3_read_reg(adapter, addr) & F_BUSY)) + return 0; + CH_ERR(adapter, "write to MC7 register 0x%x timed out\n", addr); + return -EIO; +} + +static int mc7_init(struct mc7 *mc7, unsigned int mc7_clock, int mem_type) +{ + static const unsigned int mc7_mode[] = { + 0x632, 0x642, 0x652, 0x432, 0x442 + }; + static const struct mc7_timing_params mc7_timings[] = { + {12, 3, 4, {20, 28, 34, 52, 0}, 15, 6, 4}, + {12, 4, 5, {20, 28, 34, 52, 0}, 16, 7, 4}, + {12, 5, 6, {20, 28, 34, 52, 0}, 17, 8, 4}, + {9, 3, 4, {15, 21, 26, 39, 0}, 12, 6, 4}, + {9, 4, 5, {15, 21, 26, 39, 0}, 13, 7, 4} + }; + + u32 val; + unsigned int width, density, slow, attempts; + struct adapter *adapter = mc7->adapter; + const struct mc7_timing_params *p = &mc7_timings[mem_type]; + + if (!mc7->size) + return 0; + + val = t3_read_reg(adapter, mc7->offset + A_MC7_CFG); + slow = val & F_SLOW; + width = G_WIDTH(val); + density = G_DEN(val); + + t3_write_reg(adapter, mc7->offset + A_MC7_CFG, val | F_IFEN); + val = t3_read_reg(adapter, mc7->offset + A_MC7_CFG); /* flush */ + msleep(1); + + if (!slow) { + t3_write_reg(adapter, mc7->offset + A_MC7_CAL, F_SGL_CAL_EN); + t3_read_reg(adapter, mc7->offset + A_MC7_CAL); + msleep(1); + if (t3_read_reg(adapter, mc7->offset + A_MC7_CAL) & + (F_BUSY | F_SGL_CAL_EN | F_CAL_FAULT)) { + CH_ERR(adapter, "%s MC7 calibration timed out\n", + mc7->name); + goto out_fail; + } + } + + t3_write_reg(adapter, mc7->offset + A_MC7_PARM, + V_ACTTOPREDLY(p->ActToPreDly) | + V_ACTTORDWRDLY(p->ActToRdWrDly) | V_PRECYC(p->PreCyc) | + V_REFCYC(p->RefCyc[density]) | V_BKCYC(p->BkCyc) | + V_WRTORDDLY(p->WrToRdDly) | V_RDTOWRDLY(p->RdToWrDly)); + + t3_write_reg(adapter, mc7->offset + A_MC7_CFG, + val | F_CLKEN | F_TERM150); + t3_read_reg(adapter, mc7->offset + A_MC7_CFG); /* flush */ + + if (!slow) + t3_set_reg_field(adapter, mc7->offset + A_MC7_DLL, F_DLLENB, + F_DLLENB); + udelay(1); + + val = slow ? 
3 : 6; + if (wrreg_wait(adapter, mc7->offset + A_MC7_PRE, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE2, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE3, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE1, val)) + goto out_fail; + + if (!slow) { + t3_write_reg(adapter, mc7->offset + A_MC7_MODE, 0x100); + t3_set_reg_field(adapter, mc7->offset + A_MC7_DLL, F_DLLRST, 0); + udelay(5); + } + + if (wrreg_wait(adapter, mc7->offset + A_MC7_PRE, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_REF, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_REF, 0) || + wrreg_wait(adapter, mc7->offset + A_MC7_MODE, + mc7_mode[mem_type]) || + wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE1, val | 0x380) || + wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE1, val)) + goto out_fail; + + /* clock value is in KHz */ + mc7_clock = mc7_clock * 7812 + mc7_clock / 2; /* ns */ + mc7_clock /= 1000000; /* KHz->MHz, ns->us */ + + t3_write_reg(adapter, mc7->offset + A_MC7_REF, + F_PERREFEN | V_PREREFDIV(mc7_clock)); + t3_read_reg(adapter, mc7->offset + A_MC7_REF); /* flush */ + + t3_write_reg(adapter, mc7->offset + A_MC7_ECC, F_ECCGENEN | F_ECCCHKEN); + t3_write_reg(adapter, mc7->offset + A_MC7_BIST_DATA, 0); + t3_write_reg(adapter, mc7->offset + A_MC7_BIST_ADDR_BEG, 0); + t3_write_reg(adapter, mc7->offset + A_MC7_BIST_ADDR_END, + (mc7->size << width) - 1); + t3_write_reg(adapter, mc7->offset + A_MC7_BIST_OP, V_OP(1)); + t3_read_reg(adapter, mc7->offset + A_MC7_BIST_OP); /* flush */ + + attempts = 50; + do { + msleep(250); + val = t3_read_reg(adapter, mc7->offset + A_MC7_BIST_OP); + } while ((val & F_BUSY) && --attempts); + if (val & F_BUSY) { + CH_ERR(adapter, "%s MC7 BIST timed out\n", mc7->name); + goto out_fail; + } + + /* Enable normal memory accesses. */ + t3_set_reg_field(adapter, mc7->offset + A_MC7_CFG, 0, F_RDY); + return 0; + +out_fail: + return -1; +} + +static void config_pcie(struct adapter *adap) +{ + static const u16 ack_lat[4][6] = { + {237, 416, 559, 1071, 2095, 4143}, + {128, 217, 289, 545, 1057, 2081}, + {73, 118, 154, 282, 538, 1050}, + {67, 107, 86, 150, 278, 534} + }; + static const u16 rpl_tmr[4][6] = { + {711, 1248, 1677, 3213, 6285, 12429}, + {384, 651, 867, 1635, 3171, 6243}, + {219, 354, 462, 846, 1614, 3150}, + {201, 321, 258, 450, 834, 1602} + }; + + u16 val, devid; + unsigned int log2_width, pldsize; + unsigned int fst_trn_rx, fst_trn_tx, acklat, rpllmt; + + pcie_capability_read_word(adap->pdev, PCI_EXP_DEVCTL, &val); + pldsize = (val & PCI_EXP_DEVCTL_PAYLOAD) >> 5; + + pci_read_config_word(adap->pdev, 0x2, &devid); + if (devid == 0x37) { + pcie_capability_write_word(adap->pdev, PCI_EXP_DEVCTL, + val & ~PCI_EXP_DEVCTL_READRQ & + ~PCI_EXP_DEVCTL_PAYLOAD); + pldsize = 0; + } + + pcie_capability_read_word(adap->pdev, PCI_EXP_LNKCTL, &val); + + fst_trn_tx = G_NUMFSTTRNSEQ(t3_read_reg(adap, A_PCIE_PEX_CTRL0)); + fst_trn_rx = adap->params.rev == 0 ? 
fst_trn_tx :
+ G_NUMFSTTRNSEQRX(t3_read_reg(adap, A_PCIE_MODE));
+ log2_width = fls(adap->params.pci.width) - 1;
+ acklat = ack_lat[log2_width][pldsize];
+ if (val & PCI_EXP_LNKCTL_ASPM_L0S) /* check L0s enable */
+ acklat += fst_trn_tx * 4;
+ rpllmt = rpl_tmr[log2_width][pldsize] + fst_trn_rx * 4;
+
+ if (adap->params.rev == 0)
+ t3_set_reg_field(adap, A_PCIE_PEX_CTRL1,
+ V_T3A_ACKLAT(M_T3A_ACKLAT),
+ V_T3A_ACKLAT(acklat));
+ else
+ t3_set_reg_field(adap, A_PCIE_PEX_CTRL1, V_ACKLAT(M_ACKLAT),
+ V_ACKLAT(acklat));
+
+ t3_set_reg_field(adap, A_PCIE_PEX_CTRL0, V_REPLAYLMT(M_REPLAYLMT),
+ V_REPLAYLMT(rpllmt));
+
+ t3_write_reg(adap, A_PCIE_PEX_ERR, 0xffffffff);
+ t3_set_reg_field(adap, A_PCIE_CFG, 0,
+ F_ENABLELINKDWNDRST | F_ENABLELINKDOWNRST |
+ F_PCIE_DMASTOPEN | F_PCIE_CLIDECEN);
+}
+
+/*
+ * Initialize and configure T3 HW modules. This performs the
+ * initialization steps that need to be done once after a card is reset.
+ * MAC and PHY initialization is handled separately whenever a port is enabled.
+ *
+ * fw_params are passed to FW and their value is platform dependent. Only the
+ * top 8 bits are available for use, the rest must be 0.
+ */
+int t3_init_hw(struct adapter *adapter, u32 fw_params)
+{
+ int err = -EIO, attempts, i;
+ const struct vpd_params *vpd = &adapter->params.vpd;
+
+ if (adapter->params.rev > 0)
+ calibrate_xgm_t3b(adapter);
+ else if (calibrate_xgm(adapter))
+ goto out_err;
+
+ if (vpd->mclk) {
+ partition_mem(adapter, &adapter->params.tp);
+
+ if (mc7_init(&adapter->pmrx, vpd->mclk, vpd->mem_timing) ||
+ mc7_init(&adapter->pmtx, vpd->mclk, vpd->mem_timing) ||
+ mc7_init(&adapter->cm, vpd->mclk, vpd->mem_timing) ||
+ t3_mc5_init(&adapter->mc5, adapter->params.mc5.nservers,
+ adapter->params.mc5.nfilters,
+ adapter->params.mc5.nroutes))
+ goto out_err;
+
+ for (i = 0; i < 32; i++)
+ if (clear_sge_ctxt(adapter, i, F_CQ))
+ goto out_err;
+ }
+
+ if (tp_init(adapter, &adapter->params.tp))
+ goto out_err;
+
+ t3_tp_set_coalescing_size(adapter,
+ min(adapter->params.sge.max_pkt_size,
+ MAX_RX_COALESCING_LEN), 1);
+ t3_tp_set_max_rxsize(adapter,
+ min(adapter->params.sge.max_pkt_size, 16384U));
+ ulp_config(adapter, &adapter->params.tp);
+
+ if (is_pcie(adapter))
+ config_pcie(adapter);
+ else
+ t3_set_reg_field(adapter, A_PCIX_CFG, 0,
+ F_DMASTOPEN | F_CLIDECEN);
+
+ if (adapter->params.rev == T3_REV_C)
+ t3_set_reg_field(adapter, A_ULPTX_CONFIG, 0,
+ F_CFG_CQE_SOP_MASK);
+
+ t3_write_reg(adapter, A_PM1_RX_CFG, 0xffffffff);
+ t3_write_reg(adapter, A_PM1_RX_MODE, 0);
+ t3_write_reg(adapter, A_PM1_TX_MODE, 0);
+ chan_init_hw(adapter, adapter->params.chan_map);
+ t3_sge_init(adapter, &adapter->params.sge);
+ t3_set_reg_field(adapter, A_PL_RST, 0, F_FATALPERREN);
+
+ t3_write_reg(adapter, A_T3DBG_GPIO_ACT_LOW, calc_gpio_intr(adapter));
+
+ t3_write_reg(adapter, A_CIM_HOST_ACC_DATA, vpd->uclk | fw_params);
+ t3_write_reg(adapter, A_CIM_BOOT_CFG,
+ V_BOOTADDR(FW_FLASH_BOOT_ADDR >> 2));
+ t3_read_reg(adapter, A_CIM_BOOT_CFG); /* flush */
+
+ attempts = 100;
+ do { /* wait for uP to initialize */
+ msleep(20);
+ } while (t3_read_reg(adapter, A_CIM_HOST_ACC_DATA) && --attempts);
+ if (!attempts) {
+ CH_ERR(adapter, "uP initialization timed out\n");
+ goto out_err;
+ }
+
+ err = 0;
+out_err:
+ return err;
+}
+
+/**
+ * get_pci_mode - determine a card's PCI mode
+ * @adapter: the adapter
+ * @p: where to store the PCI settings
+ *
+ * Determines a card's PCI mode and associated parameters, such as speed
+ * and width.
+ */ +static void get_pci_mode(struct adapter *adapter, struct pci_params *p) +{ + static unsigned short speed_map[] = { 33, 66, 100, 133 }; + u32 pci_mode; + + if (pci_is_pcie(adapter->pdev)) { + u16 val; + + p->variant = PCI_VARIANT_PCIE; + pcie_capability_read_word(adapter->pdev, PCI_EXP_LNKSTA, &val); + p->width = (val >> 4) & 0x3f; + return; + } + + pci_mode = t3_read_reg(adapter, A_PCIX_MODE); + p->speed = speed_map[G_PCLKRANGE(pci_mode)]; + p->width = (pci_mode & F_64BIT) ? 64 : 32; + pci_mode = G_PCIXINITPAT(pci_mode); + if (pci_mode == 0) + p->variant = PCI_VARIANT_PCI; + else if (pci_mode < 4) + p->variant = PCI_VARIANT_PCIX_MODE1_PARITY; + else if (pci_mode < 8) + p->variant = PCI_VARIANT_PCIX_MODE1_ECC; + else + p->variant = PCI_VARIANT_PCIX_266_MODE2; +} + +/** + * init_link_config - initialize a link's SW state + * @lc: structure holding the link state + * @ai: information about the current card + * + * Initializes the SW state maintained for each link, including the link's + * capabilities and default speed/duplex/flow-control/autonegotiation + * settings. + */ +static void init_link_config(struct link_config *lc, unsigned int caps) +{ + lc->supported = caps; + lc->requested_speed = lc->speed = SPEED_INVALID; + lc->requested_duplex = lc->duplex = DUPLEX_INVALID; + lc->requested_fc = lc->fc = PAUSE_RX | PAUSE_TX; + if (lc->supported & SUPPORTED_Autoneg) { + lc->advertising = lc->supported; + lc->autoneg = AUTONEG_ENABLE; + lc->requested_fc |= PAUSE_AUTONEG; + } else { + lc->advertising = 0; + lc->autoneg = AUTONEG_DISABLE; + } +} + +/** + * mc7_calc_size - calculate MC7 memory size + * @cfg: the MC7 configuration + * + * Calculates the size of an MC7 memory in bytes from the value of its + * configuration register. + */ +static unsigned int mc7_calc_size(u32 cfg) +{ + unsigned int width = G_WIDTH(cfg); + unsigned int banks = !!(cfg & F_BKS) + 1; + unsigned int org = !!(cfg & F_ORG) + 1; + unsigned int density = G_DEN(cfg); + unsigned int MBs = ((256 << density) * banks) / (org << width); + + return MBs << 20; +} + +static void mc7_prep(struct adapter *adapter, struct mc7 *mc7, + unsigned int base_addr, const char *name) +{ + u32 cfg; + + mc7->adapter = adapter; + mc7->name = name; + mc7->offset = base_addr - MC7_PMRX_BASE_ADDR; + cfg = t3_read_reg(adapter, mc7->offset + A_MC7_CFG); + mc7->size = G_DEN(cfg) == M_DEN ? 0 : mc7_calc_size(cfg); + mc7->width = G_WIDTH(cfg); +} + +static void mac_prep(struct cmac *mac, struct adapter *adapter, int index) +{ + u16 devid; + + mac->adapter = adapter; + pci_read_config_word(adapter->pdev, 0x2, &devid); + + if (devid == 0x37 && !adapter->params.vpd.xauicfg[1]) + index = 0; + mac->offset = (XGMAC0_1_BASE_ADDR - XGMAC0_0_BASE_ADDR) * index; + mac->nucast = 1; + + if (adapter->params.rev == 0 && uses_xaui(adapter)) { + t3_write_reg(adapter, A_XGM_SERDES_CTRL + mac->offset, + is_10G(adapter) ? 0x2901c04 : 0x2301c04); + t3_set_reg_field(adapter, A_XGM_PORT_CFG + mac->offset, + F_ENRGMII, 0); + } +} + +static void early_hw_init(struct adapter *adapter, + const struct adapter_info *ai) +{ + u32 val = V_PORTSPEED(is_10G(adapter) ? 
3 : 2); + + mi1_init(adapter, ai); + t3_write_reg(adapter, A_I2C_CFG, /* set for 80KHz */ + V_I2C_CLKDIV(adapter->params.vpd.cclk / 80 - 1)); + t3_write_reg(adapter, A_T3DBG_GPIO_EN, + ai->gpio_out | F_GPIO0_OEN | F_GPIO0_OUT_VAL); + t3_write_reg(adapter, A_MC5_DB_SERVER_INDEX, 0); + t3_write_reg(adapter, A_SG_OCO_BASE, V_BASE1(0xfff)); + + if (adapter->params.rev == 0 || !uses_xaui(adapter)) + val |= F_ENRGMII; + + /* Enable MAC clocks so we can access the registers */ + t3_write_reg(adapter, A_XGM_PORT_CFG, val); + t3_read_reg(adapter, A_XGM_PORT_CFG); + + val |= F_CLKDIVRESET_; + t3_write_reg(adapter, A_XGM_PORT_CFG, val); + t3_read_reg(adapter, A_XGM_PORT_CFG); + t3_write_reg(adapter, XGM_REG(A_XGM_PORT_CFG, 1), val); + t3_read_reg(adapter, A_XGM_PORT_CFG); +} + +/* + * Reset the adapter. + * Older PCIe cards lose their config space during reset, PCI-X + * ones don't. + */ +int t3_reset_adapter(struct adapter *adapter) +{ + int i, save_and_restore_pcie = + adapter->params.rev < T3_REV_B2 && is_pcie(adapter); + uint16_t devid = 0; + + if (save_and_restore_pcie) + pci_save_state(adapter->pdev); + t3_write_reg(adapter, A_PL_RST, F_CRSTWRM | F_CRSTWRMMODE); + + /* + * Delay. Give Some time to device to reset fully. + * XXX The delay time should be modified. + */ + for (i = 0; i < 10; i++) { + msleep(50); + pci_read_config_word(adapter->pdev, 0x00, &devid); + if (devid == 0x1425) + break; + } + + if (devid != 0x1425) + return -1; + + if (save_and_restore_pcie) + pci_restore_state(adapter->pdev); + return 0; +} + +static int init_parity(struct adapter *adap) +{ + int i, err, addr; + + if (t3_read_reg(adap, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY) + return -EBUSY; + + for (err = i = 0; !err && i < 16; i++) + err = clear_sge_ctxt(adap, i, F_EGRESS); + for (i = 0xfff0; !err && i <= 0xffff; i++) + err = clear_sge_ctxt(adap, i, F_EGRESS); + for (i = 0; !err && i < SGE_QSETS; i++) + err = clear_sge_ctxt(adap, i, F_RESPONSEQ); + if (err) + return err; + + t3_write_reg(adap, A_CIM_IBQ_DBG_DATA, 0); + for (i = 0; i < 4; i++) + for (addr = 0; addr <= M_IBQDBGADDR; addr++) { + t3_write_reg(adap, A_CIM_IBQ_DBG_CFG, F_IBQDBGEN | + F_IBQDBGWR | V_IBQDBGQID(i) | + V_IBQDBGADDR(addr)); + err = t3_wait_op_done(adap, A_CIM_IBQ_DBG_CFG, + F_IBQDBGBUSY, 0, 2, 1); + if (err) + return err; + } + return 0; +} + +/* + * Initialize adapter SW state for the various HW modules, set initial values + * for some adapter tunables, take PHYs out of reset, and initialize the MDIO + * interface. + */ +int t3_prep_adapter(struct adapter *adapter, const struct adapter_info *ai, + int reset) +{ + int ret; + unsigned int i, j = -1; + + get_pci_mode(adapter, &adapter->params.pci); + + adapter->params.info = ai; + adapter->params.nports = ai->nports0 + ai->nports1; + adapter->params.chan_map = (!!ai->nports0) | (!!ai->nports1 << 1); + adapter->params.rev = t3_read_reg(adapter, A_PL_REV); + /* + * We used to only run the "adapter check task" once a second if + * we had PHYs which didn't support interrupts (we would check + * their link status once a second). Now we check other conditions + * in that routine which could potentially impose a very high + * interrupt load on the system. As such, we now always scan the + * adapter state once a second ... + */ + adapter->params.linkpoll_period = 10; + adapter->params.stats_update_period = is_10G(adapter) ? 
+ MAC_STATS_ACCUM_SECS : (MAC_STATS_ACCUM_SECS * 10); + adapter->params.pci.vpd_cap_addr = + pci_find_capability(adapter->pdev, PCI_CAP_ID_VPD); + ret = get_vpd_params(adapter, &adapter->params.vpd); + if (ret < 0) + return ret; + + if (reset && t3_reset_adapter(adapter)) + return -1; + + t3_sge_prep(adapter, &adapter->params.sge); + + if (adapter->params.vpd.mclk) { + struct tp_params *p = &adapter->params.tp; + + mc7_prep(adapter, &adapter->pmrx, MC7_PMRX_BASE_ADDR, "PMRX"); + mc7_prep(adapter, &adapter->pmtx, MC7_PMTX_BASE_ADDR, "PMTX"); + mc7_prep(adapter, &adapter->cm, MC7_CM_BASE_ADDR, "CM"); + + p->nchan = adapter->params.chan_map == 3 ? 2 : 1; + p->pmrx_size = t3_mc7_size(&adapter->pmrx); + p->pmtx_size = t3_mc7_size(&adapter->pmtx); + p->cm_size = t3_mc7_size(&adapter->cm); + p->chan_rx_size = p->pmrx_size / 2; /* only 1 Rx channel */ + p->chan_tx_size = p->pmtx_size / p->nchan; + p->rx_pg_size = 64 * 1024; + p->tx_pg_size = is_10G(adapter) ? 64 * 1024 : 16 * 1024; + p->rx_num_pgs = pm_num_pages(p->chan_rx_size, p->rx_pg_size); + p->tx_num_pgs = pm_num_pages(p->chan_tx_size, p->tx_pg_size); + p->ntimer_qs = p->cm_size >= (128 << 20) || + adapter->params.rev > 0 ? 12 : 6; + } + + adapter->params.offload = t3_mc7_size(&adapter->pmrx) && + t3_mc7_size(&adapter->pmtx) && + t3_mc7_size(&adapter->cm); + + if (is_offload(adapter)) { + adapter->params.mc5.nservers = DEFAULT_NSERVERS; + adapter->params.mc5.nfilters = adapter->params.rev > 0 ? + DEFAULT_NFILTERS : 0; + adapter->params.mc5.nroutes = 0; + t3_mc5_prep(adapter, &adapter->mc5, MC5_MODE_144_BIT); + + init_mtus(adapter->params.mtus); + init_cong_ctrl(adapter->params.a_wnd, adapter->params.b_wnd); + } + + early_hw_init(adapter, ai); + ret = init_parity(adapter); + if (ret) + return ret; + + for_each_port(adapter, i) { + u8 hw_addr[6]; + const struct port_type_info *pti; + struct port_info *p = adap2pinfo(adapter, i); + + while (!adapter->params.vpd.port_type[++j]) + ; + + pti = &port_types[adapter->params.vpd.port_type[j]]; + if (!pti->phy_prep) { + CH_ALERT(adapter, "Invalid port type index %d\n", + adapter->params.vpd.port_type[j]); + return -EINVAL; + } + + p->phy.mdio.dev = adapter->port[i]; + ret = pti->phy_prep(&p->phy, adapter, ai->phy_base_addr + j, + ai->mdio_ops); + if (ret) + return ret; + mac_prep(&p->mac, adapter, j); + + /* + * The VPD EEPROM stores the base Ethernet address for the + * card. A port's address is derived from the base by adding + * the port's index to the base's low octet. + */ + memcpy(hw_addr, adapter->params.vpd.eth_base, 5); + hw_addr[5] = adapter->params.vpd.eth_base[5] + i; + + memcpy(adapter->port[i]->dev_addr, hw_addr, + ETH_ALEN); + init_link_config(&p->link_config, p->phy.caps); + p->phy.ops->power_down(&p->phy, 1); + + /* + * If the PHY doesn't support interrupts for link status + * changes, schedule a scan of the adapter links at least + * once a second. 
+ */ + if (!(p->phy.caps & SUPPORTED_IRQ) && + adapter->params.linkpoll_period > 10) + adapter->params.linkpoll_period = 10; + } + + return 0; +} + +void t3_led_ready(struct adapter *adapter) +{ + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL, + F_GPIO0_OUT_VAL); +} + +int t3_replay_prep_adapter(struct adapter *adapter) +{ + const struct adapter_info *ai = adapter->params.info; + unsigned int i, j = -1; + int ret; + + early_hw_init(adapter, ai); + ret = init_parity(adapter); + if (ret) + return ret; + + for_each_port(adapter, i) { + const struct port_type_info *pti; + struct port_info *p = adap2pinfo(adapter, i); + + while (!adapter->params.vpd.port_type[++j]) + ; + + pti = &port_types[adapter->params.vpd.port_type[j]]; + ret = pti->phy_prep(&p->phy, adapter, p->phy.mdio.prtad, NULL); + if (ret) + return ret; + p->phy.ops->power_down(&p->phy, 1); + } + +return 0; +} + diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3cdev.h b/drivers/net/ethernet/chelsio/cxgb3/t3cdev.h new file mode 100644 index 0000000..3c3e6cf --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/t3cdev.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2006-2008 Chelsio Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _T3CDEV_H_ +#define _T3CDEV_H_ + +#include +#include +#include +#include +#include +#include + +#define T3CNAMSIZ 16 + +struct cxgb3_client; + +enum t3ctype { + T3A = 0, + T3B, + T3C, +}; + +struct t3cdev { + char name[T3CNAMSIZ]; /* T3C device name */ + enum t3ctype type; + struct list_head ofld_dev_list; /* for list linking */ + struct net_device *lldev; /* LL dev associated with T3C messages */ + struct proc_dir_entry *proc_dir; /* root of proc dir for this T3C */ + int (*send)(struct t3cdev *dev, struct sk_buff *skb); + int (*recv)(struct t3cdev *dev, struct sk_buff **skb, int n); + int (*ctl)(struct t3cdev *dev, unsigned int req, void *data); + void (*neigh_update)(struct t3cdev *dev, struct neighbour *neigh); + void *priv; /* driver private data */ + void __rcu *l2opt; /* optional layer 2 data */ + void *l3opt; /* optional layer 3 data */ + void *l4opt; /* optional layer 4 data */ + void *ulp; /* ulp stuff */ + void *ulp_iscsi; /* ulp iscsi */ +}; + +#endif /* _T3CDEV_H_ */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/version.h b/drivers/net/ethernet/chelsio/cxgb3/version.h new file mode 100644 index 0000000..165bfb9 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/version.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* $Date: 2006/10/31 18:57:51 $ $RCSfile: version.h,v $ $Revision: 1.3 $ */ +#ifndef __CHELSIO_VERSION_H +#define __CHELSIO_VERSION_H +#define DRV_DESC "Chelsio T3 Network Driver" +#define DRV_NAME "cxgb3" +/* Driver version */ +#define DRV_VERSION "1.1.5-ko" + +/* Firmware version */ +#define FW_VERSION_MAJOR 7 +#define FW_VERSION_MINOR 12 +#define FW_VERSION_MICRO 0 +#endif /* __CHELSIO_VERSION_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/vsc8211.c b/drivers/net/ethernet/chelsio/cxgb3/vsc8211.c new file mode 100644 index 0000000..8638ad4 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/vsc8211.c @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "common.h" + +/* VSC8211 PHY specific registers. */ +enum { + VSC8211_SIGDET_CTRL = 19, + VSC8211_EXT_CTRL = 23, + VSC8211_INTR_ENABLE = 25, + VSC8211_INTR_STATUS = 26, + VSC8211_LED_CTRL = 27, + VSC8211_AUX_CTRL_STAT = 28, + VSC8211_EXT_PAGE_AXS = 31, +}; + +enum { + VSC_INTR_RX_ERR = 1 << 0, + VSC_INTR_MS_ERR = 1 << 1, /* master/slave resolution error */ + VSC_INTR_CABLE = 1 << 2, /* cable impairment */ + VSC_INTR_FALSE_CARR = 1 << 3, /* false carrier */ + VSC_INTR_MEDIA_CHG = 1 << 4, /* AMS media change */ + VSC_INTR_RX_FIFO = 1 << 5, /* Rx FIFO over/underflow */ + VSC_INTR_TX_FIFO = 1 << 6, /* Tx FIFO over/underflow */ + VSC_INTR_DESCRAMBL = 1 << 7, /* descrambler lock-lost */ + VSC_INTR_SYMBOL_ERR = 1 << 8, /* symbol error */ + VSC_INTR_NEG_DONE = 1 << 10, /* autoneg done */ + VSC_INTR_NEG_ERR = 1 << 11, /* autoneg error */ + VSC_INTR_DPLX_CHG = 1 << 12, /* duplex change */ + VSC_INTR_LINK_CHG = 1 << 13, /* link change */ + VSC_INTR_SPD_CHG = 1 << 14, /* speed change */ + VSC_INTR_ENABLE = 1 << 15, /* interrupt enable */ +}; + +enum { + VSC_CTRL_CLAUSE37_VIEW = 1 << 4, /* Switch to Clause 37 view */ + VSC_CTRL_MEDIA_MODE_HI = 0xf000 /* High part of media mode select */ +}; + +#define CFG_CHG_INTR_MASK (VSC_INTR_LINK_CHG | VSC_INTR_NEG_ERR | \ + VSC_INTR_DPLX_CHG | VSC_INTR_SPD_CHG | \ + VSC_INTR_NEG_DONE) +#define INTR_MASK (CFG_CHG_INTR_MASK | VSC_INTR_TX_FIFO | VSC_INTR_RX_FIFO | \ + VSC_INTR_ENABLE) + +/* PHY specific auxiliary control & status register fields */ +#define S_ACSR_ACTIPHY_TMR 0 +#define M_ACSR_ACTIPHY_TMR 0x3 +#define V_ACSR_ACTIPHY_TMR(x) ((x) << S_ACSR_ACTIPHY_TMR) + +#define S_ACSR_SPEED 3 +#define M_ACSR_SPEED 0x3 +#define G_ACSR_SPEED(x) (((x) >> S_ACSR_SPEED) & M_ACSR_SPEED) + +#define S_ACSR_DUPLEX 5 +#define F_ACSR_DUPLEX (1 << S_ACSR_DUPLEX) + +#define S_ACSR_ACTIPHY 6 +#define F_ACSR_ACTIPHY (1 << S_ACSR_ACTIPHY) + +/* + * Reset the PHY. This PHY completes reset immediately so we never wait. 
+ */ +static int vsc8211_reset(struct cphy *cphy, int wait) +{ + return t3_phy_reset(cphy, MDIO_DEVAD_NONE, 0); +} + +static int vsc8211_intr_enable(struct cphy *cphy) +{ + return t3_mdio_write(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_ENABLE, + INTR_MASK); +} + +static int vsc8211_intr_disable(struct cphy *cphy) +{ + return t3_mdio_write(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_ENABLE, 0); +} + +static int vsc8211_intr_clear(struct cphy *cphy) +{ + u32 val; + + /* Clear PHY interrupts by reading the register. */ + return t3_mdio_read(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_STATUS, &val); +} + +static int vsc8211_autoneg_enable(struct cphy *cphy) +{ + return t3_mdio_change_bits(cphy, MDIO_DEVAD_NONE, MII_BMCR, + BMCR_PDOWN | BMCR_ISOLATE, + BMCR_ANENABLE | BMCR_ANRESTART); +} + +static int vsc8211_autoneg_restart(struct cphy *cphy) +{ + return t3_mdio_change_bits(cphy, MDIO_DEVAD_NONE, MII_BMCR, + BMCR_PDOWN | BMCR_ISOLATE, + BMCR_ANRESTART); +} + +static int vsc8211_get_link_status(struct cphy *cphy, int *link_ok, + int *speed, int *duplex, int *fc) +{ + unsigned int bmcr, status, lpa, adv; + int err, sp = -1, dplx = -1, pause = 0; + + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMCR, &bmcr); + if (!err) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR, &status); + if (err) + return err; + + if (link_ok) { + /* + * BMSR_LSTATUS is latch-low, so if it is 0 we need to read it + * once more to get the current link state. + */ + if (!(status & BMSR_LSTATUS)) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR, + &status); + if (err) + return err; + *link_ok = (status & BMSR_LSTATUS) != 0; + } + if (!(bmcr & BMCR_ANENABLE)) { + dplx = (bmcr & BMCR_FULLDPLX) ? DUPLEX_FULL : DUPLEX_HALF; + if (bmcr & BMCR_SPEED1000) + sp = SPEED_1000; + else if (bmcr & BMCR_SPEED100) + sp = SPEED_100; + else + sp = SPEED_10; + } else if (status & BMSR_ANEGCOMPLETE) { + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, VSC8211_AUX_CTRL_STAT, + &status); + if (err) + return err; + + dplx = (status & F_ACSR_DUPLEX) ? DUPLEX_FULL : DUPLEX_HALF; + sp = G_ACSR_SPEED(status); + if (sp == 0) + sp = SPEED_10; + else if (sp == 1) + sp = SPEED_100; + else + sp = SPEED_1000; + + if (fc && dplx == DUPLEX_FULL) { + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_LPA, + &lpa); + if (!err) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, + MII_ADVERTISE, &adv); + if (err) + return err; + + if (lpa & adv & ADVERTISE_PAUSE_CAP) + pause = PAUSE_RX | PAUSE_TX; + else if ((lpa & ADVERTISE_PAUSE_CAP) && + (lpa & ADVERTISE_PAUSE_ASYM) && + (adv & ADVERTISE_PAUSE_ASYM)) + pause = PAUSE_TX; + else if ((lpa & ADVERTISE_PAUSE_ASYM) && + (adv & ADVERTISE_PAUSE_CAP)) + pause = PAUSE_RX; + } + } + if (speed) + *speed = sp; + if (duplex) + *duplex = dplx; + if (fc) + *fc = pause; + return 0; +} + +static int vsc8211_get_link_status_fiber(struct cphy *cphy, int *link_ok, + int *speed, int *duplex, int *fc) +{ + unsigned int bmcr, status, lpa, adv; + int err, sp = -1, dplx = -1, pause = 0; + + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMCR, &bmcr); + if (!err) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR, &status); + if (err) + return err; + + if (link_ok) { + /* + * BMSR_LSTATUS is latch-low, so if it is 0 we need to read it + * once more to get the current link state. + */ + if (!(status & BMSR_LSTATUS)) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR, + &status); + if (err) + return err; + *link_ok = (status & BMSR_LSTATUS) != 0; + } + if (!(bmcr & BMCR_ANENABLE)) { + dplx = (bmcr & BMCR_FULLDPLX) ? 
DUPLEX_FULL : DUPLEX_HALF; + if (bmcr & BMCR_SPEED1000) + sp = SPEED_1000; + else if (bmcr & BMCR_SPEED100) + sp = SPEED_100; + else + sp = SPEED_10; + } else if (status & BMSR_ANEGCOMPLETE) { + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_LPA, &lpa); + if (!err) + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_ADVERTISE, + &adv); + if (err) + return err; + + if (adv & lpa & ADVERTISE_1000XFULL) { + dplx = DUPLEX_FULL; + sp = SPEED_1000; + } else if (adv & lpa & ADVERTISE_1000XHALF) { + dplx = DUPLEX_HALF; + sp = SPEED_1000; + } + + if (fc && dplx == DUPLEX_FULL) { + if (lpa & adv & ADVERTISE_1000XPAUSE) + pause = PAUSE_RX | PAUSE_TX; + else if ((lpa & ADVERTISE_1000XPAUSE) && + (adv & lpa & ADVERTISE_1000XPSE_ASYM)) + pause = PAUSE_TX; + else if ((lpa & ADVERTISE_1000XPSE_ASYM) && + (adv & ADVERTISE_1000XPAUSE)) + pause = PAUSE_RX; + } + } + if (speed) + *speed = sp; + if (duplex) + *duplex = dplx; + if (fc) + *fc = pause; + return 0; +} + +#ifdef UNUSED +/* + * Enable/disable auto MDI/MDI-X in forced link speed mode. + */ +static int vsc8211_set_automdi(struct cphy *phy, int enable) +{ + int err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 0x52b5); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, 18, 0x12); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, 17, enable ? 0x2803 : 0x3003); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, 16, 0x87fa); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 0); + if (err) + return err; + + return 0; +} + +int vsc8211_set_speed_duplex(struct cphy *phy, int speed, int duplex) +{ + int err; + + err = t3_set_phy_speed_duplex(phy, speed, duplex); + if (!err) + err = vsc8211_set_automdi(phy, 1); + return err; +} +#endif /* UNUSED */ + +static int vsc8211_power_down(struct cphy *cphy, int enable) +{ + return t3_mdio_change_bits(cphy, 0, MII_BMCR, BMCR_PDOWN, + enable ? 
BMCR_PDOWN : 0); +} + +static int vsc8211_intr_handler(struct cphy *cphy) +{ + unsigned int cause; + int err, cphy_cause = 0; + + err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_STATUS, &cause); + if (err) + return err; + + cause &= INTR_MASK; + if (cause & CFG_CHG_INTR_MASK) + cphy_cause |= cphy_cause_link_change; + if (cause & (VSC_INTR_RX_FIFO | VSC_INTR_TX_FIFO)) + cphy_cause |= cphy_cause_fifo_error; + return cphy_cause; +} + +static const struct cphy_ops vsc8211_ops = { + .reset = vsc8211_reset, + .intr_enable = vsc8211_intr_enable, + .intr_disable = vsc8211_intr_disable, + .intr_clear = vsc8211_intr_clear, + .intr_handler = vsc8211_intr_handler, + .autoneg_enable = vsc8211_autoneg_enable, + .autoneg_restart = vsc8211_autoneg_restart, + .advertise = t3_phy_advertise, + .set_speed_duplex = t3_set_phy_speed_duplex, + .get_link_status = vsc8211_get_link_status, + .power_down = vsc8211_power_down, +}; + +static const struct cphy_ops vsc8211_fiber_ops = { + .reset = vsc8211_reset, + .intr_enable = vsc8211_intr_enable, + .intr_disable = vsc8211_intr_disable, + .intr_clear = vsc8211_intr_clear, + .intr_handler = vsc8211_intr_handler, + .autoneg_enable = vsc8211_autoneg_enable, + .autoneg_restart = vsc8211_autoneg_restart, + .advertise = t3_phy_advertise_fiber, + .set_speed_duplex = t3_set_phy_speed_duplex, + .get_link_status = vsc8211_get_link_status_fiber, + .power_down = vsc8211_power_down, +}; + +int t3_vsc8211_phy_prep(struct cphy *phy, struct adapter *adapter, + int phy_addr, const struct mdio_ops *mdio_ops) +{ + int err; + unsigned int val; + + cphy_init(phy, adapter, phy_addr, &vsc8211_ops, mdio_ops, + SUPPORTED_10baseT_Full | SUPPORTED_100baseT_Full | + SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg | SUPPORTED_MII | + SUPPORTED_TP | SUPPORTED_IRQ, "10/100/1000BASE-T"); + msleep(20); /* PHY needs ~10ms to start responding to MDIO */ + + err = t3_mdio_read(phy, MDIO_DEVAD_NONE, VSC8211_EXT_CTRL, &val); + if (err) + return err; + if (val & VSC_CTRL_MEDIA_MODE_HI) { + /* copper interface, just need to configure the LEDs */ + return t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_LED_CTRL, + 0x100); + } + + phy->caps = SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg | + SUPPORTED_MII | SUPPORTED_FIBRE | SUPPORTED_IRQ; + phy->desc = "1000BASE-X"; + phy->ops = &vsc8211_fiber_ops; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 1); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_SIGDET_CTRL, 1); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 0); + if (err) + return err; + + err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_CTRL, + val | VSC_CTRL_CLAUSE37_VIEW); + if (err) + return err; + + err = vsc8211_reset(phy, 0); + if (err) + return err; + + udelay(5); /* delay after reset before next SMI */ + return 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb3/xgmac.c b/drivers/net/ethernet/chelsio/cxgb3/xgmac.c new file mode 100644 index 0000000..3af19a5 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb3/xgmac.c @@ -0,0 +1,657 @@ +/* + * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
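The flow-control resolution in vsc8211_get_link_status() above is the usual IEEE 802.3 pause negotiation applied to the local advertisement (adv) and the link partner's advertisement (lpa). A self-contained sketch of the same decision; the two advertisement bit values match linux/mii.h, while PAUSE_RX and PAUSE_TX are redefined locally purely for illustration:

#include <stdio.h>

#define ADVERTISE_PAUSE_CAP	0x0400	/* symmetric pause supported */
#define ADVERTISE_PAUSE_ASYM	0x0800	/* asymmetric pause supported */
#define PAUSE_RX		0x1
#define PAUSE_TX		0x2

/* Resolve pause settings from the local (adv) and partner (lpa)
 * advertisements, mirroring the branch structure used in
 * vsc8211_get_link_status(). */
static int resolve_pause(unsigned int adv, unsigned int lpa)
{
	if (lpa & adv & ADVERTISE_PAUSE_CAP)
		return PAUSE_RX | PAUSE_TX;
	if ((lpa & ADVERTISE_PAUSE_CAP) && (lpa & ADVERTISE_PAUSE_ASYM) &&
	    (adv & ADVERTISE_PAUSE_ASYM))
		return PAUSE_TX;
	if ((lpa & ADVERTISE_PAUSE_ASYM) && (adv & ADVERTISE_PAUSE_CAP))
		return PAUSE_RX;
	return 0;
}

int main(void)
{
	/* Local end advertises asymmetric pause only, partner advertises
	 * both: the resolution is transmit-only pause (PAUSE_TX). */
	printf("pause = %d\n",
	       resolve_pause(ADVERTISE_PAUSE_ASYM,
			     ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM));
	return 0;
}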
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "common.h" +#include "regs.h" + +/* + * # of exact address filters. The first one is used for the station address, + * the rest are available for multicast addresses. + */ +#define EXACT_ADDR_FILTERS 8 + +static inline int macidx(const struct cmac *mac) +{ + return mac->offset / (XGMAC0_1_BASE_ADDR - XGMAC0_0_BASE_ADDR); +} + +static void xaui_serdes_reset(struct cmac *mac) +{ + static const unsigned int clear[] = { + F_PWRDN0 | F_PWRDN1, F_RESETPLL01, F_RESET0 | F_RESET1, + F_PWRDN2 | F_PWRDN3, F_RESETPLL23, F_RESET2 | F_RESET3 + }; + + int i; + struct adapter *adap = mac->adapter; + u32 ctrl = A_XGM_SERDES_CTRL0 + mac->offset; + + t3_write_reg(adap, ctrl, adap->params.vpd.xauicfg[macidx(mac)] | + F_RESET3 | F_RESET2 | F_RESET1 | F_RESET0 | + F_PWRDN3 | F_PWRDN2 | F_PWRDN1 | F_PWRDN0 | + F_RESETPLL23 | F_RESETPLL01); + t3_read_reg(adap, ctrl); + udelay(15); + + for (i = 0; i < ARRAY_SIZE(clear); i++) { + t3_set_reg_field(adap, ctrl, clear[i], 0); + udelay(15); + } +} + +void t3b_pcs_reset(struct cmac *mac) +{ + t3_set_reg_field(mac->adapter, A_XGM_RESET_CTRL + mac->offset, + F_PCS_RESET_, 0); + udelay(20); + t3_set_reg_field(mac->adapter, A_XGM_RESET_CTRL + mac->offset, 0, + F_PCS_RESET_); +} + +int t3_mac_reset(struct cmac *mac) +{ + static const struct addr_val_pair mac_reset_avp[] = { + {A_XGM_TX_CTRL, 0}, + {A_XGM_RX_CTRL, 0}, + {A_XGM_RX_CFG, F_DISPAUSEFRAMES | F_EN1536BFRAMES | + F_RMFCS | F_ENJUMBO | F_ENHASHMCAST}, + {A_XGM_RX_HASH_LOW, 0}, + {A_XGM_RX_HASH_HIGH, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_1, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_2, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_3, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_4, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_5, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_6, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_7, 0}, + {A_XGM_RX_EXACT_MATCH_LOW_8, 0}, + {A_XGM_STAT_CTRL, F_CLRSTATS} + }; + u32 val; + struct adapter *adap = mac->adapter; + unsigned int oft = mac->offset; + + t3_write_reg(adap, A_XGM_RESET_CTRL + oft, F_MAC_RESET_); + t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ + + t3_write_regs(adap, mac_reset_avp, ARRAY_SIZE(mac_reset_avp), oft); + t3_set_reg_field(adap, A_XGM_RXFIFO_CFG + oft, + F_RXSTRFRWRD | F_DISERRFRAMES, + uses_xaui(adap) ? 
0 : F_RXSTRFRWRD); + t3_set_reg_field(adap, A_XGM_TXFIFO_CFG + oft, 0, F_UNDERUNFIX); + + if (uses_xaui(adap)) { + if (adap->params.rev == 0) { + t3_set_reg_field(adap, A_XGM_SERDES_CTRL + oft, 0, + F_RXENABLE | F_TXENABLE); + if (t3_wait_op_done(adap, A_XGM_SERDES_STATUS1 + oft, + F_CMULOCK, 1, 5, 2)) { + CH_ERR(adap, + "MAC %d XAUI SERDES CMU lock failed\n", + macidx(mac)); + return -1; + } + t3_set_reg_field(adap, A_XGM_SERDES_CTRL + oft, 0, + F_SERDESRESET_); + } else + xaui_serdes_reset(mac); + } + + t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + oft, + V_RXMAXFRAMERSIZE(M_RXMAXFRAMERSIZE), + V_RXMAXFRAMERSIZE(MAX_FRAME_SIZE) | F_RXENFRAMER); + val = F_MAC_RESET_ | F_XGMAC_STOP_EN; + + if (is_10G(adap)) + val |= F_PCS_RESET_; + else if (uses_xaui(adap)) + val |= F_PCS_RESET_ | F_XG2G_RESET_; + else + val |= F_RGMII_RESET_ | F_XG2G_RESET_; + t3_write_reg(adap, A_XGM_RESET_CTRL + oft, val); + t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ + if ((val & F_PCS_RESET_) && adap->params.rev) { + msleep(1); + t3b_pcs_reset(mac); + } + + memset(&mac->stats, 0, sizeof(mac->stats)); + return 0; +} + +static int t3b2_mac_reset(struct cmac *mac) +{ + struct adapter *adap = mac->adapter; + unsigned int oft = mac->offset, store; + int idx = macidx(mac); + u32 val; + + if (!macidx(mac)) + t3_set_reg_field(adap, A_MPS_CFG, F_PORT0ACTIVE, 0); + else + t3_set_reg_field(adap, A_MPS_CFG, F_PORT1ACTIVE, 0); + + /* Stop NIC traffic to reduce the number of TXTOGGLES */ + t3_set_reg_field(adap, A_MPS_CFG, F_ENFORCEPKT, 0); + /* Ensure TX drains */ + t3_set_reg_field(adap, A_XGM_TX_CFG + oft, F_TXPAUSEEN, 0); + + t3_write_reg(adap, A_XGM_RESET_CTRL + oft, F_MAC_RESET_); + t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ + + /* Store A_TP_TX_DROP_CFG_CH0 */ + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx); + store = t3_read_reg(adap, A_TP_TX_DROP_CFG_CH0 + idx); + + msleep(10); + + /* Change DROP_CFG to 0xc0000011 */ + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx); + t3_write_reg(adap, A_TP_PIO_DATA, 0xc0000011); + + /* Check for xgm Rx fifo empty */ + /* Increased loop count to 1000 from 5 cover 1G and 100Mbps case */ + if (t3_wait_op_done(adap, A_XGM_RX_MAX_PKT_SIZE_ERR_CNT + oft, + 0x80000000, 1, 1000, 2)) { + CH_ERR(adap, "MAC %d Rx fifo drain failed\n", + macidx(mac)); + return -1; + } + + t3_write_reg(adap, A_XGM_RESET_CTRL + oft, 0); + t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ + + val = F_MAC_RESET_; + if (is_10G(adap)) + val |= F_PCS_RESET_; + else if (uses_xaui(adap)) + val |= F_PCS_RESET_ | F_XG2G_RESET_; + else + val |= F_RGMII_RESET_ | F_XG2G_RESET_; + t3_write_reg(adap, A_XGM_RESET_CTRL + oft, val); + t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ + if ((val & F_PCS_RESET_) && adap->params.rev) { + msleep(1); + t3b_pcs_reset(mac); + } + t3_write_reg(adap, A_XGM_RX_CFG + oft, + F_DISPAUSEFRAMES | F_EN1536BFRAMES | + F_RMFCS | F_ENJUMBO | F_ENHASHMCAST); + + /* Restore the DROP_CFG */ + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx); + t3_write_reg(adap, A_TP_PIO_DATA, store); + + if (!idx) + t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT0ACTIVE); + else + t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT1ACTIVE); + + /* re-enable nic traffic */ + t3_set_reg_field(adap, A_MPS_CFG, F_ENFORCEPKT, 1); + + /* Set: re-enable NIC traffic */ + t3_set_reg_field(adap, A_MPS_CFG, F_ENFORCEPKT, 1); + + return 0; +} + +/* + * Set the exact match register 'idx' to recognize the given Ethernet address. 
+ */ +static void set_addr_filter(struct cmac *mac, int idx, const u8 * addr) +{ + u32 addr_lo, addr_hi; + unsigned int oft = mac->offset + idx * 8; + + addr_lo = (addr[3] << 24) | (addr[2] << 16) | (addr[1] << 8) | addr[0]; + addr_hi = (addr[5] << 8) | addr[4]; + + t3_write_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_LOW_1 + oft, addr_lo); + t3_write_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_HIGH_1 + oft, addr_hi); +} + +/* Set one of the station's unicast MAC addresses. */ +int t3_mac_set_address(struct cmac *mac, unsigned int idx, u8 addr[6]) +{ + if (idx >= mac->nucast) + return -EINVAL; + set_addr_filter(mac, idx, addr); + return 0; +} + +/* + * Specify the number of exact address filters that should be reserved for + * unicast addresses. Caller should reload the unicast and multicast addresses + * after calling this. + */ +int t3_mac_set_num_ucast(struct cmac *mac, int n) +{ + if (n > EXACT_ADDR_FILTERS) + return -EINVAL; + mac->nucast = n; + return 0; +} + +void t3_mac_disable_exact_filters(struct cmac *mac) +{ + unsigned int i, reg = mac->offset + A_XGM_RX_EXACT_MATCH_LOW_1; + + for (i = 0; i < EXACT_ADDR_FILTERS; i++, reg += 8) { + u32 v = t3_read_reg(mac->adapter, reg); + t3_write_reg(mac->adapter, reg, v); + } + t3_read_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_LOW_1); /* flush */ +} + +void t3_mac_enable_exact_filters(struct cmac *mac) +{ + unsigned int i, reg = mac->offset + A_XGM_RX_EXACT_MATCH_HIGH_1; + + for (i = 0; i < EXACT_ADDR_FILTERS; i++, reg += 8) { + u32 v = t3_read_reg(mac->adapter, reg); + t3_write_reg(mac->adapter, reg, v); + } + t3_read_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_LOW_1); /* flush */ +} + +/* Calculate the RX hash filter index of an Ethernet address */ +static int hash_hw_addr(const u8 * addr) +{ + int hash = 0, octet, bit, i = 0, c; + + for (octet = 0; octet < 6; ++octet) + for (c = addr[octet], bit = 0; bit < 8; c >>= 1, ++bit) { + hash ^= (c & 1) << i; + if (++i == 6) + i = 0; + } + return hash; +} + +int t3_mac_set_rx_mode(struct cmac *mac, struct net_device *dev) +{ + u32 val, hash_lo, hash_hi; + struct adapter *adap = mac->adapter; + unsigned int oft = mac->offset; + + val = t3_read_reg(adap, A_XGM_RX_CFG + oft) & ~F_COPYALLFRAMES; + if (dev->flags & IFF_PROMISC) + val |= F_COPYALLFRAMES; + t3_write_reg(adap, A_XGM_RX_CFG + oft, val); + + if (dev->flags & IFF_ALLMULTI) + hash_lo = hash_hi = 0xffffffff; + else { + struct netdev_hw_addr *ha; + int exact_addr_idx = mac->nucast; + + hash_lo = hash_hi = 0; + netdev_for_each_mc_addr(ha, dev) + if (exact_addr_idx < EXACT_ADDR_FILTERS) + set_addr_filter(mac, exact_addr_idx++, + ha->addr); + else { + int hash = hash_hw_addr(ha->addr); + + if (hash < 32) + hash_lo |= (1 << hash); + else + hash_hi |= (1 << (hash - 32)); + } + } + + t3_write_reg(adap, A_XGM_RX_HASH_LOW + oft, hash_lo); + t3_write_reg(adap, A_XGM_RX_HASH_HIGH + oft, hash_hi); + return 0; +} + +static int rx_fifo_hwm(int mtu) +{ + int hwm; + + hwm = max(MAC_RXFIFO_SIZE - 3 * mtu, (MAC_RXFIFO_SIZE * 38) / 100); + return min(hwm, MAC_RXFIFO_SIZE - 8192); +} + +int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) +{ + int hwm, lwm, divisor; + int ipg; + unsigned int thres, v, reg; + struct adapter *adap = mac->adapter; + + /* + * MAX_FRAME_SIZE inludes header + FCS, mtu doesn't. The HW max + * packet size register includes header, but not FCS. 
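The multicast filtering in t3_mac_set_rx_mode() above falls back to a 64-bit hash filter once the exact-match slots are used up: hash_hw_addr() folds the 48 address bits into a 6-bit index by XOR-ing bit k of the address into accumulator position k mod 6 (LSB of each octet first), and that index selects one bit of the RX hash low/high register pair. A standalone sketch of the fold, with an illustrative multicast address:

#include <stdio.h>

/* Same fold as hash_hw_addr() in xgmac.c. */
static int hash_hw_addr(const unsigned char *addr)
{
	int hash = 0, octet, bit, i = 0, c;

	for (octet = 0; octet < 6; ++octet)
		for (c = addr[octet], bit = 0; bit < 8; c >>= 1, ++bit) {
			hash ^= (c & 1) << i;
			if (++i == 6)
				i = 0;
		}
	return hash;
}

int main(void)
{
	/* 01:00:5e:00:00:01, the IPv4 all-hosts multicast group. */
	const unsigned char mc[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };
	int hash = hash_hw_addr(mc);

	printf("hash index %d -> bit %d of the %s register\n", hash,
	       hash < 32 ? hash : hash - 32,
	       hash < 32 ? "hash-low" : "hash-high");
	return 0;
}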
+ */ + mtu += 14; + if (mtu > 1536) + mtu += 4; + + if (mtu > MAX_FRAME_SIZE - 4) + return -EINVAL; + t3_write_reg(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset, mtu); + + if (adap->params.rev >= T3_REV_B2 && + (t3_read_reg(adap, A_XGM_RX_CTRL + mac->offset) & F_RXEN)) { + t3_mac_disable_exact_filters(mac); + v = t3_read_reg(adap, A_XGM_RX_CFG + mac->offset); + t3_set_reg_field(adap, A_XGM_RX_CFG + mac->offset, + F_ENHASHMCAST | F_COPYALLFRAMES, F_DISBCAST); + + reg = adap->params.rev == T3_REV_B2 ? + A_XGM_RX_MAX_PKT_SIZE_ERR_CNT : A_XGM_RXFIFO_CFG; + + /* drain RX FIFO */ + if (t3_wait_op_done(adap, reg + mac->offset, + F_RXFIFO_EMPTY, 1, 20, 5)) { + t3_write_reg(adap, A_XGM_RX_CFG + mac->offset, v); + t3_mac_enable_exact_filters(mac); + return -EIO; + } + t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset, + V_RXMAXPKTSIZE(M_RXMAXPKTSIZE), + V_RXMAXPKTSIZE(mtu)); + t3_write_reg(adap, A_XGM_RX_CFG + mac->offset, v); + t3_mac_enable_exact_filters(mac); + } else + t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset, + V_RXMAXPKTSIZE(M_RXMAXPKTSIZE), + V_RXMAXPKTSIZE(mtu)); + + /* + * Adjust the PAUSE frame watermarks. We always set the LWM, and the + * HWM only if flow-control is enabled. + */ + hwm = rx_fifo_hwm(mtu); + lwm = min(3 * (int)mtu, MAC_RXFIFO_SIZE / 4); + v = t3_read_reg(adap, A_XGM_RXFIFO_CFG + mac->offset); + v &= ~V_RXFIFOPAUSELWM(M_RXFIFOPAUSELWM); + v |= V_RXFIFOPAUSELWM(lwm / 8); + if (G_RXFIFOPAUSEHWM(v)) + v = (v & ~V_RXFIFOPAUSEHWM(M_RXFIFOPAUSEHWM)) | + V_RXFIFOPAUSEHWM(hwm / 8); + + t3_write_reg(adap, A_XGM_RXFIFO_CFG + mac->offset, v); + + /* Adjust the TX FIFO threshold based on the MTU */ + thres = (adap->params.vpd.cclk * 1000) / 15625; + thres = (thres * mtu) / 1000; + if (is_10G(adap)) + thres /= 10; + thres = mtu > thres ? (mtu - thres + 7) / 8 : 0; + thres = max(thres, 8U); /* need at least 8 */ + ipg = (adap->params.rev == T3_REV_C) ? 0 : 1; + t3_set_reg_field(adap, A_XGM_TXFIFO_CFG + mac->offset, + V_TXFIFOTHRESH(M_TXFIFOTHRESH) | V_TXIPG(M_TXIPG), + V_TXFIFOTHRESH(thres) | V_TXIPG(ipg)); + + if (adap->params.rev > 0) { + divisor = (adap->params.rev == T3_REV_C) ? 64 : 8; + t3_write_reg(adap, A_XGM_PAUSE_TIMER + mac->offset, + (hwm - lwm) * 4 / divisor); + } + t3_write_reg(adap, A_XGM_TX_PAUSE_QUANTA + mac->offset, + MAC_RXFIFO_SIZE * 4 * 8 / 512); + return 0; +} + +int t3_mac_set_speed_duplex_fc(struct cmac *mac, int speed, int duplex, int fc) +{ + u32 val; + struct adapter *adap = mac->adapter; + unsigned int oft = mac->offset; + + if (duplex >= 0 && duplex != DUPLEX_FULL) + return -EINVAL; + if (speed >= 0) { + if (speed == SPEED_10) + val = V_PORTSPEED(0); + else if (speed == SPEED_100) + val = V_PORTSPEED(1); + else if (speed == SPEED_1000) + val = V_PORTSPEED(2); + else if (speed == SPEED_10000) + val = V_PORTSPEED(3); + else + return -EINVAL; + + t3_set_reg_field(adap, A_XGM_PORT_CFG + oft, + V_PORTSPEED(M_PORTSPEED), val); + } + + val = t3_read_reg(adap, A_XGM_RXFIFO_CFG + oft); + val &= ~V_RXFIFOPAUSEHWM(M_RXFIFOPAUSEHWM); + if (fc & PAUSE_TX) { + u32 rx_max_pkt_size = + G_RXMAXPKTSIZE(t3_read_reg(adap, + A_XGM_RX_MAX_PKT_SIZE + oft)); + val |= V_RXFIFOPAUSEHWM(rx_fifo_hwm(rx_max_pkt_size) / 8); + } + t3_write_reg(adap, A_XGM_RXFIFO_CFG + oft, val); + + t3_set_reg_field(adap, A_XGM_TX_CFG + oft, F_TXPAUSEEN, + (fc & PAUSE_RX) ? 
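t3_mac_set_mtu() above re-derives the PAUSE watermarks from the new frame size: the high watermark leaves room for roughly three maximum-size frames but is floored at 38% of the RX FIFO and capped at the FIFO size minus 8 KB, while the low watermark is three frames capped at a quarter of the FIFO. A quick numeric sketch of those formulas, assuming the driver's 32 KB MAC_RXFIFO_SIZE:

#include <stdio.h>

#define MAC_RXFIFO_SIZE 32768	/* assumed 32 KB MAC RX FIFO */

static int max_i(int a, int b) { return a > b ? a : b; }
static int min_i(int a, int b) { return a < b ? a : b; }

/* Same rule as rx_fifo_hwm(): leave room for ~3 max-size frames, never
 * below 38% of the FIFO, never above FIFO - 8 KB. */
static int rx_fifo_hwm(int frame)
{
	int hwm = max_i(MAC_RXFIFO_SIZE - 3 * frame,
			(MAC_RXFIFO_SIZE * 38) / 100);

	return min_i(hwm, MAC_RXFIFO_SIZE - 8192);
}

int main(void)
{
	const int frames[] = { 1518, 9000 };	/* standard and jumbo */
	unsigned int i;

	for (i = 0; i < sizeof(frames) / sizeof(frames[0]); i++) {
		int frame = frames[i];
		/* Low watermark as in t3_mac_set_mtu(): 3 frames, capped
		 * at a quarter of the FIFO. */
		int lwm = min_i(3 * frame, MAC_RXFIFO_SIZE / 4);

		printf("frame %4d: pause HWM %5d bytes, LWM %5d bytes\n",
		       frame, rx_fifo_hwm(frame), lwm);
	}
	return 0;
}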
F_TXPAUSEEN : 0); + return 0; +} + +int t3_mac_enable(struct cmac *mac, int which) +{ + int idx = macidx(mac); + struct adapter *adap = mac->adapter; + unsigned int oft = mac->offset; + struct mac_stats *s = &mac->stats; + + if (which & MAC_DIRECTION_TX) { + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx); + t3_write_reg(adap, A_TP_PIO_DATA, + adap->params.rev == T3_REV_C ? + 0xc4ffff01 : 0xc0ede401); + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_MODE); + t3_set_reg_field(adap, A_TP_PIO_DATA, 1 << idx, + adap->params.rev == T3_REV_C ? 0 : 1 << idx); + + t3_write_reg(adap, A_XGM_TX_CTRL + oft, F_TXEN); + + t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CNT_CH0 + idx); + mac->tx_mcnt = s->tx_frames; + mac->tx_tcnt = (G_TXDROPCNTCH0RCVD(t3_read_reg(adap, + A_TP_PIO_DATA))); + mac->tx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap, + A_XGM_TX_SPI4_SOP_EOP_CNT + + oft))); + mac->rx_mcnt = s->rx_frames; + mac->rx_pause = s->rx_pause; + mac->rx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap, + A_XGM_RX_SPI4_SOP_EOP_CNT + + oft))); + mac->rx_ocnt = s->rx_fifo_ovfl; + mac->txen = F_TXEN; + mac->toggle_cnt = 0; + } + if (which & MAC_DIRECTION_RX) + t3_write_reg(adap, A_XGM_RX_CTRL + oft, F_RXEN); + return 0; +} + +int t3_mac_disable(struct cmac *mac, int which) +{ + struct adapter *adap = mac->adapter; + + if (which & MAC_DIRECTION_TX) { + t3_write_reg(adap, A_XGM_TX_CTRL + mac->offset, 0); + mac->txen = 0; + } + if (which & MAC_DIRECTION_RX) { + int val = F_MAC_RESET_; + + t3_set_reg_field(mac->adapter, A_XGM_RESET_CTRL + mac->offset, + F_PCS_RESET_, 0); + msleep(100); + t3_write_reg(adap, A_XGM_RX_CTRL + mac->offset, 0); + if (is_10G(adap)) + val |= F_PCS_RESET_; + else if (uses_xaui(adap)) + val |= F_PCS_RESET_ | F_XG2G_RESET_; + else + val |= F_RGMII_RESET_ | F_XG2G_RESET_; + t3_write_reg(mac->adapter, A_XGM_RESET_CTRL + mac->offset, val); + } + return 0; +} + +int t3b2_mac_watchdog_task(struct cmac *mac) +{ + struct adapter *adap = mac->adapter; + struct mac_stats *s = &mac->stats; + unsigned int tx_tcnt, tx_xcnt; + u64 tx_mcnt = s->tx_frames; + int status; + + status = 0; + tx_xcnt = 1; /* By default tx_xcnt is making progress */ + tx_tcnt = mac->tx_tcnt; /* If tx_mcnt is progressing ignore tx_tcnt */ + if (tx_mcnt == mac->tx_mcnt && mac->rx_pause == s->rx_pause) { + tx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap, + A_XGM_TX_SPI4_SOP_EOP_CNT + + mac->offset))); + if (tx_xcnt == 0) { + t3_write_reg(adap, A_TP_PIO_ADDR, + A_TP_TX_DROP_CNT_CH0 + macidx(mac)); + tx_tcnt = (G_TXDROPCNTCH0RCVD(t3_read_reg(adap, + A_TP_PIO_DATA))); + } else { + goto out; + } + } else { + mac->toggle_cnt = 0; + goto out; + } + + if ((tx_tcnt != mac->tx_tcnt) && (mac->tx_xcnt == 0)) { + if (mac->toggle_cnt > 4) { + status = 2; + goto out; + } else { + status = 1; + goto out; + } + } else { + mac->toggle_cnt = 0; + goto out; + } + +out: + mac->tx_tcnt = tx_tcnt; + mac->tx_xcnt = tx_xcnt; + mac->tx_mcnt = s->tx_frames; + mac->rx_pause = s->rx_pause; + if (status == 1) { + t3_write_reg(adap, A_XGM_TX_CTRL + mac->offset, 0); + t3_read_reg(adap, A_XGM_TX_CTRL + mac->offset); /* flush */ + t3_write_reg(adap, A_XGM_TX_CTRL + mac->offset, mac->txen); + t3_read_reg(adap, A_XGM_TX_CTRL + mac->offset); /* flush */ + mac->toggle_cnt++; + } else if (status == 2) { + t3b2_mac_reset(mac); + mac->toggle_cnt = 0; + } + return status; +} + +/* + * This function is called periodically to accumulate the current values of the + * RMON counters into the port statistics. 
Since the packet counters are only + * 32 bits they can overflow in ~286 secs at 10G, so the function should be + * called more frequently than that. The byte counters are 45-bit wide, they + * would overflow in ~7.8 hours. + */ +const struct mac_stats *t3_mac_update_stats(struct cmac *mac) +{ +#define RMON_READ(mac, addr) t3_read_reg(mac->adapter, addr + mac->offset) +#define RMON_UPDATE(mac, name, reg) \ + (mac)->stats.name += (u64)RMON_READ(mac, A_XGM_STAT_##reg) +#define RMON_UPDATE64(mac, name, reg_lo, reg_hi) \ + (mac)->stats.name += RMON_READ(mac, A_XGM_STAT_##reg_lo) + \ + ((u64)RMON_READ(mac, A_XGM_STAT_##reg_hi) << 32) + + u32 v, lo; + + RMON_UPDATE64(mac, rx_octets, RX_BYTES_LOW, RX_BYTES_HIGH); + RMON_UPDATE64(mac, rx_frames, RX_FRAMES_LOW, RX_FRAMES_HIGH); + RMON_UPDATE(mac, rx_mcast_frames, RX_MCAST_FRAMES); + RMON_UPDATE(mac, rx_bcast_frames, RX_BCAST_FRAMES); + RMON_UPDATE(mac, rx_fcs_errs, RX_CRC_ERR_FRAMES); + RMON_UPDATE(mac, rx_pause, RX_PAUSE_FRAMES); + RMON_UPDATE(mac, rx_jabber, RX_JABBER_FRAMES); + RMON_UPDATE(mac, rx_short, RX_SHORT_FRAMES); + RMON_UPDATE(mac, rx_symbol_errs, RX_SYM_CODE_ERR_FRAMES); + + RMON_UPDATE(mac, rx_too_long, RX_OVERSIZE_FRAMES); + + v = RMON_READ(mac, A_XGM_RX_MAX_PKT_SIZE_ERR_CNT); + if (mac->adapter->params.rev == T3_REV_B2) + v &= 0x7fffffff; + mac->stats.rx_too_long += v; + + RMON_UPDATE(mac, rx_frames_64, RX_64B_FRAMES); + RMON_UPDATE(mac, rx_frames_65_127, RX_65_127B_FRAMES); + RMON_UPDATE(mac, rx_frames_128_255, RX_128_255B_FRAMES); + RMON_UPDATE(mac, rx_frames_256_511, RX_256_511B_FRAMES); + RMON_UPDATE(mac, rx_frames_512_1023, RX_512_1023B_FRAMES); + RMON_UPDATE(mac, rx_frames_1024_1518, RX_1024_1518B_FRAMES); + RMON_UPDATE(mac, rx_frames_1519_max, RX_1519_MAXB_FRAMES); + + RMON_UPDATE64(mac, tx_octets, TX_BYTE_LOW, TX_BYTE_HIGH); + RMON_UPDATE64(mac, tx_frames, TX_FRAME_LOW, TX_FRAME_HIGH); + RMON_UPDATE(mac, tx_mcast_frames, TX_MCAST); + RMON_UPDATE(mac, tx_bcast_frames, TX_BCAST); + RMON_UPDATE(mac, tx_pause, TX_PAUSE); + /* This counts error frames in general (bad FCS, underrun, etc). */ + RMON_UPDATE(mac, tx_underrun, TX_ERR_FRAMES); + + RMON_UPDATE(mac, tx_frames_64, TX_64B_FRAMES); + RMON_UPDATE(mac, tx_frames_65_127, TX_65_127B_FRAMES); + RMON_UPDATE(mac, tx_frames_128_255, TX_128_255B_FRAMES); + RMON_UPDATE(mac, tx_frames_256_511, TX_256_511B_FRAMES); + RMON_UPDATE(mac, tx_frames_512_1023, TX_512_1023B_FRAMES); + RMON_UPDATE(mac, tx_frames_1024_1518, TX_1024_1518B_FRAMES); + RMON_UPDATE(mac, tx_frames_1519_max, TX_1519_MAXB_FRAMES); + + /* The next stat isn't clear-on-read. */ + t3_write_reg(mac->adapter, A_TP_MIB_INDEX, mac->offset ? 
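The overflow bound quoted in the comment above t3_mac_update_stats() is easy to reproduce: at 10 Gb/s line rate, minimum-size 64-byte frames occupy 84 bytes on the wire once preamble and inter-frame gap are included, which is roughly 14.9 million frames per second, so a 32-bit frame counter wraps after about 290 seconds, consistent with the ~286 s figure; the 45-bit byte counters last about 7.8 hours at 1.25 GB/s. A small check of that arithmetic, assuming worst-case minimum-size frames:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Worst case: minimum 64-byte frames at 10 Gb/s; each frame takes
	 * 84 bytes on the wire including the 8-byte preamble and the
	 * 12-byte inter-frame gap. */
	const double line_bps = 10e9;
	const double pps = line_bps / (84 * 8);		/* ~14.9 Mpps */
	const double bytes_per_sec = line_bps / 8;	/* 1.25 GB/s */

	printf("32-bit frame counter wraps after ~%.0f s\n",
	       (double)UINT32_MAX / pps);
	printf("45-bit byte counter wraps after ~%.1f h\n",
	       (double)(1ULL << 45) / bytes_per_sec / 3600.0);
	return 0;
}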
51 : 50); + v = t3_read_reg(mac->adapter, A_TP_MIB_RDATA); + lo = (u32) mac->stats.rx_cong_drops; + mac->stats.rx_cong_drops += (u64) (v - lo); + + return &mac->stats; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/Makefile b/drivers/net/ethernet/chelsio/cxgb4/Makefile new file mode 100644 index 0000000..bea6a05 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Chelsio T4 driver +# + +obj-$(CONFIG_CHELSIO_T4) += cxgb4.o + +cxgb4-objs := cxgb4_main.o l2t.o smt.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o \ + cxgb4_uld.o srq.o sched.o cxgb4_filter.o cxgb4_tc_u32.o \ + cxgb4_ptp.o cxgb4_tc_flower.o cxgb4_cudbg.o \ + cudbg_common.o cudbg_lib.o cudbg_zlib.o +cxgb4-$(CONFIG_CHELSIO_T4_DCB) += cxgb4_dcb.o +cxgb4-$(CONFIG_CHELSIO_T4_FCOE) += cxgb4_fcoe.o +cxgb4-$(CONFIG_DEBUG_FS) += cxgb4_debugfs.o diff --git a/drivers/net/ethernet/chelsio/cxgb4/clip_tbl.c b/drivers/net/ethernet/chelsio/cxgb4/clip_tbl.c new file mode 100644 index 0000000..2900390 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/clip_tbl.c @@ -0,0 +1,332 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * Copyright (C) 2003-2014 Chelsio Communications. All rights reserved. + * + * Written by Deepak (deepak.s@chelsio.com) + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this + * release for licensing terms and conditions. + */ + +#include +#include +#include +#include +#include +#include "cxgb4.h" +#include "clip_tbl.h" + +static inline unsigned int ipv4_clip_hash(struct clip_tbl *c, const u32 *key) +{ + unsigned int clipt_size_half = c->clipt_size / 2; + + return jhash_1word(*key, 0) % clipt_size_half; +} + +static inline unsigned int ipv6_clip_hash(struct clip_tbl *d, const u32 *key) +{ + unsigned int clipt_size_half = d->clipt_size / 2; + u32 xor = key[0] ^ key[1] ^ key[2] ^ key[3]; + + return clipt_size_half + + (jhash_1word(xor, 0) % clipt_size_half); +} + +static unsigned int clip_addr_hash(struct clip_tbl *ctbl, const u32 *addr, + u8 v6) +{ + return v6 ? 
ipv6_clip_hash(ctbl, addr) : + ipv4_clip_hash(ctbl, addr); +} + +static int clip6_get_mbox(const struct net_device *dev, + const struct in6_addr *lip) +{ + struct adapter *adap = netdev2adap(dev); + struct fw_clip_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_write = htonl(FW_CMD_OP_V(FW_CLIP_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F); + c.alloc_to_len16 = htonl(FW_CLIP_CMD_ALLOC_F | FW_LEN16(c)); + *(__be64 *)&c.ip_hi = *(__be64 *)(lip->s6_addr); + *(__be64 *)&c.ip_lo = *(__be64 *)(lip->s6_addr + 8); + return t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, false); +} + +static int clip6_release_mbox(const struct net_device *dev, + const struct in6_addr *lip) +{ + struct adapter *adap = netdev2adap(dev); + struct fw_clip_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_write = htonl(FW_CMD_OP_V(FW_CLIP_CMD) | + FW_CMD_REQUEST_F | FW_CMD_READ_F); + c.alloc_to_len16 = htonl(FW_CLIP_CMD_FREE_F | FW_LEN16(c)); + *(__be64 *)&c.ip_hi = *(__be64 *)(lip->s6_addr); + *(__be64 *)&c.ip_lo = *(__be64 *)(lip->s6_addr + 8); + return t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, false); +} + +int cxgb4_clip_get(const struct net_device *dev, const u32 *lip, u8 v6) +{ + struct adapter *adap = netdev2adap(dev); + struct clip_tbl *ctbl = adap->clipt; + struct clip_entry *ce, *cte; + u32 *addr = (u32 *)lip; + int hash; + int ret = -1; + + if (!ctbl) + return 0; + + hash = clip_addr_hash(ctbl, addr, v6); + + read_lock_bh(&ctbl->lock); + list_for_each_entry(cte, &ctbl->hash_list[hash], list) { + if (cte->addr6.sin6_family == AF_INET6 && v6) + ret = memcmp(lip, cte->addr6.sin6_addr.s6_addr, + sizeof(struct in6_addr)); + else if (cte->addr.sin_family == AF_INET && !v6) + ret = memcmp(lip, (char *)(&cte->addr.sin_addr), + sizeof(struct in_addr)); + if (!ret) { + ce = cte; + read_unlock_bh(&ctbl->lock); + refcount_inc(&ce->refcnt); + return 0; + } + } + read_unlock_bh(&ctbl->lock); + + write_lock_bh(&ctbl->lock); + if (!list_empty(&ctbl->ce_free_head)) { + ce = list_first_entry(&ctbl->ce_free_head, + struct clip_entry, list); + list_del(&ce->list); + INIT_LIST_HEAD(&ce->list); + spin_lock_init(&ce->lock); + refcount_set(&ce->refcnt, 0); + atomic_dec(&ctbl->nfree); + list_add_tail(&ce->list, &ctbl->hash_list[hash]); + if (v6) { + ce->addr6.sin6_family = AF_INET6; + memcpy(ce->addr6.sin6_addr.s6_addr, + lip, sizeof(struct in6_addr)); + ret = clip6_get_mbox(dev, (const struct in6_addr *)lip); + if (ret) { + write_unlock_bh(&ctbl->lock); + dev_err(adap->pdev_dev, + "CLIP FW cmd failed with error %d, " + "Connections using %pI6c wont be " + "offloaded", + ret, ce->addr6.sin6_addr.s6_addr); + return ret; + } + } else { + ce->addr.sin_family = AF_INET; + memcpy((char *)(&ce->addr.sin_addr), lip, + sizeof(struct in_addr)); + } + } else { + write_unlock_bh(&ctbl->lock); + dev_info(adap->pdev_dev, "CLIP table overflow, " + "Connections using %pI6c wont be offloaded", + (void *)lip); + return -ENOMEM; + } + write_unlock_bh(&ctbl->lock); + refcount_set(&ce->refcnt, 1); + return 0; +} +EXPORT_SYMBOL(cxgb4_clip_get); + +void cxgb4_clip_release(const struct net_device *dev, const u32 *lip, u8 v6) +{ + struct adapter *adap = netdev2adap(dev); + struct clip_tbl *ctbl = adap->clipt; + struct clip_entry *ce, *cte; + u32 *addr = (u32 *)lip; + int hash; + int ret = -1; + + if (!ctbl) + return; + + hash = clip_addr_hash(ctbl, addr, v6); + + read_lock_bh(&ctbl->lock); + list_for_each_entry(cte, &ctbl->hash_list[hash], list) { + if (cte->addr6.sin6_family == AF_INET6 && v6) + ret = memcmp(lip, cte->addr6.sin6_addr.s6_addr, + 
sizeof(struct in6_addr)); + else if (cte->addr.sin_family == AF_INET && !v6) + ret = memcmp(lip, (char *)(&cte->addr.sin_addr), + sizeof(struct in_addr)); + if (!ret) { + ce = cte; + read_unlock_bh(&ctbl->lock); + goto found; + } + } + read_unlock_bh(&ctbl->lock); + + return; +found: + write_lock_bh(&ctbl->lock); + spin_lock_bh(&ce->lock); + if (refcount_dec_and_test(&ce->refcnt)) { + list_del(&ce->list); + INIT_LIST_HEAD(&ce->list); + list_add_tail(&ce->list, &ctbl->ce_free_head); + atomic_inc(&ctbl->nfree); + if (v6) + clip6_release_mbox(dev, (const struct in6_addr *)lip); + } + spin_unlock_bh(&ce->lock); + write_unlock_bh(&ctbl->lock); +} +EXPORT_SYMBOL(cxgb4_clip_release); + +/* Retrieves IPv6 addresses from a root device (bond, vlan) associated with + * a physical device. + * The physical device reference is needed to send the actul CLIP command. + */ +static int cxgb4_update_dev_clip(struct net_device *root_dev, + struct net_device *dev) +{ + struct inet6_dev *idev = NULL; + struct inet6_ifaddr *ifa; + int ret = 0; + + idev = __in6_dev_get(root_dev); + if (!idev) + return ret; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + ret = cxgb4_clip_get(dev, (const u32 *)ifa->addr.s6_addr, 1); + if (ret < 0) + break; + } + read_unlock_bh(&idev->lock); + + return ret; +} + +int cxgb4_update_root_dev_clip(struct net_device *dev) +{ + struct net_device *root_dev = NULL; + int i, ret = 0; + + /* First populate the real net device's IPv6 addresses */ + ret = cxgb4_update_dev_clip(dev, dev); + if (ret) + return ret; + + /* Parse all bond and vlan devices layered on top of the physical dev */ + root_dev = netdev_master_upper_dev_get_rcu(dev); + if (root_dev) { + ret = cxgb4_update_dev_clip(root_dev, dev); + if (ret) + return ret; + } + + for (i = 0; i < VLAN_N_VID; i++) { + root_dev = __vlan_find_dev_deep_rcu(dev, htons(ETH_P_8021Q), i); + if (!root_dev) + continue; + + ret = cxgb4_update_dev_clip(root_dev, dev); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL(cxgb4_update_root_dev_clip); + +int clip_tbl_show(struct seq_file *seq, void *v) +{ + struct adapter *adapter = seq->private; + struct clip_tbl *ctbl = adapter->clipt; + struct clip_entry *ce; + char ip[60]; + int i; + + read_lock_bh(&ctbl->lock); + + seq_puts(seq, "IP Address Users\n"); + for (i = 0 ; i < ctbl->clipt_size; ++i) { + list_for_each_entry(ce, &ctbl->hash_list[i], list) { + ip[0] = '\0'; + sprintf(ip, "%pISc", &ce->addr); + seq_printf(seq, "%-25s %u\n", ip, + refcount_read(&ce->refcnt)); + } + } + seq_printf(seq, "Free clip entries : %d\n", atomic_read(&ctbl->nfree)); + + read_unlock_bh(&ctbl->lock); + + return 0; +} + +struct clip_tbl *t4_init_clip_tbl(unsigned int clipt_start, + unsigned int clipt_end) +{ + struct clip_entry *cl_list; + struct clip_tbl *ctbl; + unsigned int clipt_size; + int i; + + if (clipt_start >= clipt_end) + return NULL; + clipt_size = clipt_end - clipt_start + 1; + if (clipt_size < CLIPT_MIN_HASH_BUCKETS) + return NULL; + + ctbl = kvzalloc(sizeof(*ctbl) + + clipt_size*sizeof(struct list_head), GFP_KERNEL); + if (!ctbl) + return NULL; + + ctbl->clipt_start = clipt_start; + ctbl->clipt_size = clipt_size; + INIT_LIST_HEAD(&ctbl->ce_free_head); + + atomic_set(&ctbl->nfree, clipt_size); + rwlock_init(&ctbl->lock); + + for (i = 0; i < ctbl->clipt_size; ++i) + INIT_LIST_HEAD(&ctbl->hash_list[i]); + + cl_list = kvzalloc(clipt_size*sizeof(struct clip_entry), GFP_KERNEL); + if (!cl_list) { + kvfree(ctbl); + return NULL; + } + ctbl->cl_list = (void *)cl_list; + + for 
(i = 0; i < clipt_size; i++) { + INIT_LIST_HEAD(&cl_list[i].list); + list_add_tail(&cl_list[i].list, &ctbl->ce_free_head); + } + + return ctbl; +} + +void t4_cleanup_clip_tbl(struct adapter *adap) +{ + struct clip_tbl *ctbl = adap->clipt; + + if (ctbl) { + if (ctbl->cl_list) + kvfree(ctbl->cl_list); + kvfree(ctbl); + } +} +EXPORT_SYMBOL(t4_cleanup_clip_tbl); diff --git a/drivers/net/ethernet/chelsio/cxgb4/clip_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/clip_tbl.h new file mode 100644 index 0000000..a0e0ae1 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/clip_tbl.h @@ -0,0 +1,45 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * Copyright (C) 2003-2014 Chelsio Communications. All rights reserved. + * + * Written by Deepak (deepak.s@chelsio.com) + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this + * release for licensing terms and conditions. + */ + +#include + +struct clip_entry { + spinlock_t lock; /* Hold while modifying clip reference */ + refcount_t refcnt; + struct list_head list; + union { + struct sockaddr_in addr; + struct sockaddr_in6 addr6; + }; +}; + +struct clip_tbl { + unsigned int clipt_start; + unsigned int clipt_size; + rwlock_t lock; + atomic_t nfree; + struct list_head ce_free_head; + void *cl_list; + struct list_head hash_list[0]; +}; + +enum { + CLIPT_MIN_HASH_BUCKETS = 2, +}; + +struct clip_tbl *t4_init_clip_tbl(unsigned int clipt_start, + unsigned int clipt_end); +int cxgb4_clip_get(const struct net_device *dev, const u32 *lip, u8 v6); +void cxgb4_clip_release(const struct net_device *dev, const u32 *lip, u8 v6); +int clip_tbl_show(struct seq_file *seq, void *v); +int cxgb4_update_root_dev_clip(struct net_device *dev); +void t4_cleanup_clip_tbl(struct adapter *adap); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_common.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_common.c new file mode 100644 index 0000000..8edc498 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_common.c @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + */ + +#include "cxgb4.h" +#include "cudbg_if.h" +#include "cudbg_lib_common.h" + +int cudbg_get_buff(struct cudbg_init *pdbg_init, + struct cudbg_buffer *pdbg_buff, u32 size, + struct cudbg_buffer *pin_buff) +{ + u32 offset; + + offset = pdbg_buff->offset; + if (offset + size > pdbg_buff->size) + return CUDBG_STATUS_NO_MEM; + + if (pdbg_init->compress_type != CUDBG_COMPRESSION_NONE) { + if (size > pdbg_init->compress_buff_size) + return CUDBG_STATUS_NO_MEM; + + pin_buff->data = (char *)pdbg_init->compress_buff; + pin_buff->offset = 0; + pin_buff->size = size; + return 0; + } + + pin_buff->data = (char *)pdbg_buff->data + offset; + pin_buff->offset = offset; + pin_buff->size = size; + return 0; +} + +void cudbg_put_buff(struct cudbg_init *pdbg_init, + struct cudbg_buffer *pin_buff) +{ + /* Clear compression buffer for re-use */ + if (pdbg_init->compress_type != CUDBG_COMPRESSION_NONE) + memset(pdbg_init->compress_buff, 0, + pdbg_init->compress_buff_size); + + pin_buff->data = NULL; + pin_buff->offset = 0; + pin_buff->size = 0; +} + +void cudbg_update_buff(struct cudbg_buffer *pin_buff, + struct cudbg_buffer *pout_buff) +{ + /* We already write to buffer provided by ethool, so just + * increment offset to next free space. + */ + pout_buff->offset += pin_buff->size; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h new file mode 100644 index 0000000..dc25066 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h @@ -0,0 +1,452 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
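cudbg_get_buff() above hands each collection routine either a window into the caller's output buffer or, when compression is enabled, the shared compression scratch buffer; the window is committed by advancing the destination offset, as cudbg_update_buff() does. A simplified sketch of the uncompressed path; the struct and function names here are illustrative stand-ins, not the driver's:

#include <stdio.h>
#include <string.h>

/* Simplified stand-in for struct cudbg_buffer: a destination buffer plus
 * a running offset into it. */
struct dbg_buffer {
	char *data;
	unsigned int size;
	unsigned int offset;
};

/* Carve a 'size'-byte window out of 'dst', like cudbg_get_buff() in the
 * uncompressed case; returns -1 where the driver would return
 * CUDBG_STATUS_NO_MEM. */
static int get_window(struct dbg_buffer *dst, unsigned int size,
		      struct dbg_buffer *win)
{
	if (dst->offset + size > dst->size)
		return -1;
	win->data = dst->data + dst->offset;
	win->offset = dst->offset;
	win->size = size;
	return 0;
}

int main(void)
{
	char storage[64];
	struct dbg_buffer dst = { storage, sizeof(storage), 0 }, win;

	if (!get_window(&dst, 16, &win)) {
		memset(win.data, 0xab, win.size);	/* collect one entity */
		dst.offset += win.size;			/* commit the window */
	}
	printf("next free offset: %u\n", dst.offset);
	return 0;
}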
+ * + */ + +#ifndef __CUDBG_ENTITY_H__ +#define __CUDBG_ENTITY_H__ + +#define EDC0_FLAG 0 +#define EDC1_FLAG 1 +#define MC_FLAG 2 +#define MC0_FLAG 3 +#define MC1_FLAG 4 +#define HMA_FLAG 5 + +#define CUDBG_ENTITY_SIGNATURE 0xCCEDB001 + +struct cudbg_mbox_log { + struct mbox_cmd entry; + u32 hi[MBOX_LEN / 8]; + u32 lo[MBOX_LEN / 8]; +}; + +struct cudbg_cim_qcfg { + u8 chip; + u16 base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; + u16 size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; + u16 thres[CIM_NUM_IBQ]; + u32 obq_wr[2 * CIM_NUM_OBQ_T5]; + u32 stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)]; +}; + +struct cudbg_rss_vf_conf { + u32 rss_vf_vfl; + u32 rss_vf_vfh; +}; + +struct cudbg_pm_stats { + u32 tx_cnt[T6_PM_NSTATS]; + u32 rx_cnt[T6_PM_NSTATS]; + u64 tx_cyc[T6_PM_NSTATS]; + u64 rx_cyc[T6_PM_NSTATS]; +}; + +struct cudbg_hw_sched { + u32 kbps[NTX_SCHED]; + u32 ipg[NTX_SCHED]; + u32 pace_tab[NTX_SCHED]; + u32 mode; + u32 map; +}; + +struct ireg_field { + u32 ireg_addr; + u32 ireg_data; + u32 ireg_local_offset; + u32 ireg_offset_range; +}; + +struct ireg_buf { + struct ireg_field tp_pio; + u32 outbuf[32]; +}; + +struct cudbg_ulprx_la { + u32 data[ULPRX_LA_SIZE * 8]; + u32 size; +}; + +struct cudbg_tp_la { + u32 size; + u32 mode; + u8 data[0]; +}; + +static const char * const cudbg_region[] = { + "DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:", + "Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:", + "Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:", + "TDDP region:", "TPT region:", "STAG region:", "RQ region:", + "RQUDP region:", "PBL region:", "TXPBL region:", + "DBVFIFO region:", "ULPRX state:", "ULPTX state:", + "On-chip queues:" +}; + +/* Memory region info relative to current memory (i.e. wrt 0). */ +struct cudbg_region_info { + bool exist; /* Does region exists in current memory? 
*/ + u32 start; /* Start wrt 0 */ + u32 end; /* End wrt 0 */ +}; + +struct cudbg_mem_desc { + u32 base; + u32 limit; + u32 idx; +}; + +struct cudbg_meminfo { + struct cudbg_mem_desc avail[4]; + struct cudbg_mem_desc mem[ARRAY_SIZE(cudbg_region) + 3]; + u32 avail_c; + u32 mem_c; + u32 up_ram_lo; + u32 up_ram_hi; + u32 up_extmem2_lo; + u32 up_extmem2_hi; + u32 rx_pages_data[3]; + u32 tx_pages_data[4]; + u32 p_structs; + u32 reserved[12]; + u32 port_used[4]; + u32 port_alloc[4]; + u32 loopback_used[NCHAN]; + u32 loopback_alloc[NCHAN]; +}; + +struct cudbg_cim_pif_la { + int size; + u8 data[0]; +}; + +struct cudbg_clk_info { + u64 retransmit_min; + u64 retransmit_max; + u64 persist_timer_min; + u64 persist_timer_max; + u64 keepalive_idle_timer; + u64 keepalive_interval; + u64 initial_srtt; + u64 finwait2_timer; + u32 dack_timer; + u32 res; + u32 cclk_ps; + u32 tre; + u32 dack_re; +}; + +struct cudbg_tid_info_region { + u32 ntids; + u32 nstids; + u32 stid_base; + u32 hash_base; + + u32 natids; + u32 nftids; + u32 ftid_base; + u32 aftid_base; + u32 aftid_end; + + u32 sftid_base; + u32 nsftids; + + u32 uotid_base; + u32 nuotids; + + u32 sb; + u32 flags; + u32 le_db_conf; + u32 ip_users; + u32 ipv6_users; + + u32 hpftid_base; + u32 nhpftids; +}; + +#define CUDBG_TID_INFO_REV 1 + +struct cudbg_tid_info_region_rev1 { + struct cudbg_ver_hdr ver_hdr; + struct cudbg_tid_info_region tid; + u32 tid_start; + u32 reserved[16]; +}; + +#define CUDBG_LOWMEM_MAX_CTXT_QIDS 256 +#define CUDBG_MAX_FL_QIDS 1024 + +struct cudbg_ch_cntxt { + u32 cntxt_type; + u32 cntxt_id; + u32 data[SGE_CTXT_SIZE / 4]; +}; + +#define CUDBG_MAX_RPLC_SIZE 128 + +struct cudbg_mps_tcam { + u64 mask; + u32 rplc[8]; + u32 idx; + u32 cls_lo; + u32 cls_hi; + u32 rplc_size; + u32 vniy; + u32 vnix; + u32 dip_hit; + u32 vlan_vld; + u32 repli; + u16 ivlan; + u8 addr[ETH_ALEN]; + u8 lookup_type; + u8 port_num; + u8 reserved[2]; +}; + +#define CUDBG_VPD_PF_SIZE 0x800 +#define CUDBG_SCFG_VER_ADDR 0x06 +#define CUDBG_SCFG_VER_LEN 4 +#define CUDBG_VPD_VER_ADDR 0x18c7 +#define CUDBG_VPD_VER_LEN 2 + +struct cudbg_vpd_data { + u8 sn[SERNUM_LEN + 1]; + u8 bn[PN_LEN + 1]; + u8 na[MACADDR_LEN + 1]; + u8 mn[ID_LEN + 1]; + u16 fw_major; + u16 fw_minor; + u16 fw_micro; + u16 fw_build; + u32 scfg_vers; + u32 vpd_vers; +}; + +#define CUDBG_MAX_TCAM_TID 0x800 + +enum cudbg_le_entry_types { + LE_ET_UNKNOWN = 0, + LE_ET_TCAM_CON = 1, + LE_ET_TCAM_SERVER = 2, + LE_ET_TCAM_FILTER = 3, + LE_ET_TCAM_CLIP = 4, + LE_ET_TCAM_ROUTING = 5, + LE_ET_HASH_CON = 6, + LE_ET_INVALID_TID = 8, +}; + +struct cudbg_tcam { + u32 filter_start; + u32 server_start; + u32 clip_start; + u32 routing_start; + u32 tid_hash_base; + u32 max_tid; +}; + +struct cudbg_tid_data { + u32 tid; + u32 dbig_cmd; + u32 dbig_conf; + u32 dbig_rsp_stat; + u32 data[NUM_LE_DB_DBGI_RSP_DATA_INSTANCES]; +}; + +#define CUDBG_NUM_ULPTX 11 +#define CUDBG_NUM_ULPTX_READ 512 + +struct cudbg_ulptx_la { + u32 rdptr[CUDBG_NUM_ULPTX]; + u32 wrptr[CUDBG_NUM_ULPTX]; + u32 rddata[CUDBG_NUM_ULPTX]; + u32 rd_data[CUDBG_NUM_ULPTX][CUDBG_NUM_ULPTX_READ]; +}; + +#define CUDBG_CHAC_PBT_ADDR 0x2800 +#define CUDBG_CHAC_PBT_LRF 0x3000 +#define CUDBG_CHAC_PBT_DATA 0x3800 +#define CUDBG_PBT_DYNAMIC_ENTRIES 8 +#define CUDBG_PBT_STATIC_ENTRIES 16 +#define CUDBG_LRF_ENTRIES 8 +#define CUDBG_PBT_DATA_ENTRIES 512 + +struct cudbg_pbt_tables { + u32 pbt_dynamic[CUDBG_PBT_DYNAMIC_ENTRIES]; + u32 pbt_static[CUDBG_PBT_STATIC_ENTRIES]; + u32 lrf_table[CUDBG_LRF_ENTRIES]; + u32 pbt_data[CUDBG_PBT_DATA_ENTRIES]; +}; + +#define IREG_NUM_ELEM 4 + 
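Each row of the indirect-register tables that follow (t6_tp_pio_array and friends) is an {address register, data register, first local offset, count} tuple matching struct ireg_field above. The dump code elsewhere in the driver is assumed to walk such a row by writing each offset to the address register and reading the data register back; a sketch of that walk over the first two T6 TP_PIO rows, with no real register I/O:

#include <stdio.h>

#define IREG_NUM_ELEM 4

/* {address reg, data reg, first local offset, count}; the values are
 * copied from the first two t6_tp_pio_array rows purely for illustration. */
static const unsigned int tp_pio[][IREG_NUM_ELEM] = {
	{0x7e40, 0x7e44, 0x020, 28},
	{0x7e40, 0x7e44, 0x040, 10},
};

int main(void)
{
	unsigned int i, j;

	for (i = 0; i < sizeof(tp_pio) / sizeof(tp_pio[0]); i++) {
		unsigned int addr = tp_pio[i][0], data = tp_pio[i][1];
		unsigned int off = tp_pio[i][2], n = tp_pio[i][3];

		/* A dump visits 'n' consecutive offsets: select the offset
		 * via the address register, then read the data register. */
		for (j = 0; j < n; j++)
			printf("select 0x%x at 0x%x, read 0x%x\n",
			       off + j, addr, data);
	}
	return 0;
}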
+static const u32 t6_tp_pio_array[][IREG_NUM_ELEM] = { + {0x7e40, 0x7e44, 0x020, 28}, /* t6_tp_pio_regs_20_to_3b */ + {0x7e40, 0x7e44, 0x040, 10}, /* t6_tp_pio_regs_40_to_49 */ + {0x7e40, 0x7e44, 0x050, 10}, /* t6_tp_pio_regs_50_to_59 */ + {0x7e40, 0x7e44, 0x060, 14}, /* t6_tp_pio_regs_60_to_6d */ + {0x7e40, 0x7e44, 0x06F, 1}, /* t6_tp_pio_regs_6f */ + {0x7e40, 0x7e44, 0x070, 6}, /* t6_tp_pio_regs_70_to_75 */ + {0x7e40, 0x7e44, 0x130, 18}, /* t6_tp_pio_regs_130_to_141 */ + {0x7e40, 0x7e44, 0x145, 19}, /* t6_tp_pio_regs_145_to_157 */ + {0x7e40, 0x7e44, 0x160, 1}, /* t6_tp_pio_regs_160 */ + {0x7e40, 0x7e44, 0x230, 25}, /* t6_tp_pio_regs_230_to_248 */ + {0x7e40, 0x7e44, 0x24a, 3}, /* t6_tp_pio_regs_24c */ + {0x7e40, 0x7e44, 0x8C0, 1} /* t6_tp_pio_regs_8c0 */ +}; + +static const u32 t5_tp_pio_array[][IREG_NUM_ELEM] = { + {0x7e40, 0x7e44, 0x020, 28}, /* t5_tp_pio_regs_20_to_3b */ + {0x7e40, 0x7e44, 0x040, 19}, /* t5_tp_pio_regs_40_to_52 */ + {0x7e40, 0x7e44, 0x054, 2}, /* t5_tp_pio_regs_54_to_55 */ + {0x7e40, 0x7e44, 0x060, 13}, /* t5_tp_pio_regs_60_to_6c */ + {0x7e40, 0x7e44, 0x06F, 1}, /* t5_tp_pio_regs_6f */ + {0x7e40, 0x7e44, 0x120, 4}, /* t5_tp_pio_regs_120_to_123 */ + {0x7e40, 0x7e44, 0x12b, 2}, /* t5_tp_pio_regs_12b_to_12c */ + {0x7e40, 0x7e44, 0x12f, 21}, /* t5_tp_pio_regs_12f_to_143 */ + {0x7e40, 0x7e44, 0x145, 19}, /* t5_tp_pio_regs_145_to_157 */ + {0x7e40, 0x7e44, 0x230, 25}, /* t5_tp_pio_regs_230_to_248 */ + {0x7e40, 0x7e44, 0x8C0, 1} /* t5_tp_pio_regs_8c0 */ +}; + +static const u32 t6_tp_tm_pio_array[][IREG_NUM_ELEM] = { + {0x7e18, 0x7e1c, 0x0, 12} +}; + +static const u32 t5_tp_tm_pio_array[][IREG_NUM_ELEM] = { + {0x7e18, 0x7e1c, 0x0, 12} +}; + +static const u32 t6_tp_mib_index_array[6][IREG_NUM_ELEM] = { + {0x7e50, 0x7e54, 0x0, 13}, + {0x7e50, 0x7e54, 0x10, 6}, + {0x7e50, 0x7e54, 0x18, 21}, + {0x7e50, 0x7e54, 0x30, 32}, + {0x7e50, 0x7e54, 0x50, 22}, + {0x7e50, 0x7e54, 0x68, 12} +}; + +static const u32 t5_tp_mib_index_array[9][IREG_NUM_ELEM] = { + {0x7e50, 0x7e54, 0x0, 13}, + {0x7e50, 0x7e54, 0x10, 6}, + {0x7e50, 0x7e54, 0x18, 8}, + {0x7e50, 0x7e54, 0x20, 13}, + {0x7e50, 0x7e54, 0x30, 16}, + {0x7e50, 0x7e54, 0x40, 16}, + {0x7e50, 0x7e54, 0x50, 16}, + {0x7e50, 0x7e54, 0x60, 6}, + {0x7e50, 0x7e54, 0x68, 4} +}; + +static const u32 t5_sge_dbg_index_array[2][IREG_NUM_ELEM] = { + {0x10cc, 0x10d0, 0x0, 16}, + {0x10cc, 0x10d4, 0x0, 16}, +}; + +static const u32 t5_pcie_pdbg_array[][IREG_NUM_ELEM] = { + {0x5a04, 0x5a0c, 0x00, 0x20}, /* t5_pcie_pdbg_regs_00_to_20 */ + {0x5a04, 0x5a0c, 0x21, 0x20}, /* t5_pcie_pdbg_regs_21_to_40 */ + {0x5a04, 0x5a0c, 0x41, 0x10}, /* t5_pcie_pdbg_regs_41_to_50 */ +}; + +static const u32 t5_pcie_cdbg_array[][IREG_NUM_ELEM] = { + {0x5a10, 0x5a18, 0x00, 0x20}, /* t5_pcie_cdbg_regs_00_to_20 */ + {0x5a10, 0x5a18, 0x21, 0x18}, /* t5_pcie_cdbg_regs_21_to_37 */ +}; + +static const u32 t5_pm_rx_array[][IREG_NUM_ELEM] = { + {0x8FD0, 0x8FD4, 0x10000, 0x20}, /* t5_pm_rx_regs_10000_to_10020 */ + {0x8FD0, 0x8FD4, 0x10021, 0x0D}, /* t5_pm_rx_regs_10021_to_1002c */ +}; + +static const u32 t5_pm_tx_array[][IREG_NUM_ELEM] = { + {0x8FF0, 0x8FF4, 0x10000, 0x20}, /* t5_pm_tx_regs_10000_to_10020 */ + {0x8FF0, 0x8FF4, 0x10021, 0x1D}, /* t5_pm_tx_regs_10021_to_1003c */ +}; + +#define CUDBG_NUM_PCIE_CONFIG_REGS 0x61 + +static const u32 t5_pcie_config_array[][2] = { + {0x0, 0x34}, + {0x3c, 0x40}, + {0x50, 0x64}, + {0x70, 0x80}, + {0x94, 0xa0}, + {0xb0, 0xb8}, + {0xd0, 0xd4}, + {0x100, 0x128}, + {0x140, 0x148}, + {0x150, 0x164}, + {0x170, 0x178}, + {0x180, 0x194}, + {0x1a0, 0x1b8}, + 
{0x1c0, 0x208}, +}; + +static const u32 t6_ma_ireg_array[][IREG_NUM_ELEM] = { + {0x78f8, 0x78fc, 0xa000, 23}, /* t6_ma_regs_a000_to_a016 */ + {0x78f8, 0x78fc, 0xa400, 30}, /* t6_ma_regs_a400_to_a41e */ + {0x78f8, 0x78fc, 0xa800, 20} /* t6_ma_regs_a800_to_a813 */ +}; + +static const u32 t6_ma_ireg_array2[][IREG_NUM_ELEM] = { + {0x78f8, 0x78fc, 0xe400, 17}, /* t6_ma_regs_e400_to_e600 */ + {0x78f8, 0x78fc, 0xe640, 13} /* t6_ma_regs_e640_to_e7c0 */ +}; + +static const u32 t6_up_cim_reg_array[][IREG_NUM_ELEM + 1] = { + {0x7b50, 0x7b54, 0x2000, 0x20, 0}, /* up_cim_2000_to_207c */ + {0x7b50, 0x7b54, 0x2080, 0x1d, 0}, /* up_cim_2080_to_20fc */ + {0x7b50, 0x7b54, 0x00, 0x20, 0}, /* up_cim_00_to_7c */ + {0x7b50, 0x7b54, 0x80, 0x20, 0}, /* up_cim_80_to_fc */ + {0x7b50, 0x7b54, 0x100, 0x11, 0}, /* up_cim_100_to_14c */ + {0x7b50, 0x7b54, 0x200, 0x10, 0}, /* up_cim_200_to_23c */ + {0x7b50, 0x7b54, 0x240, 0x2, 0}, /* up_cim_240_to_244 */ + {0x7b50, 0x7b54, 0x250, 0x2, 0}, /* up_cim_250_to_254 */ + {0x7b50, 0x7b54, 0x260, 0x2, 0}, /* up_cim_260_to_264 */ + {0x7b50, 0x7b54, 0x270, 0x2, 0}, /* up_cim_270_to_274 */ + {0x7b50, 0x7b54, 0x280, 0x20, 0}, /* up_cim_280_to_2fc */ + {0x7b50, 0x7b54, 0x300, 0x20, 0}, /* up_cim_300_to_37c */ + {0x7b50, 0x7b54, 0x380, 0x14, 0}, /* up_cim_380_to_3cc */ + {0x7b50, 0x7b54, 0x4900, 0x4, 0x4}, /* up_cim_4900_to_4c60 */ + {0x7b50, 0x7b54, 0x4904, 0x4, 0x4}, /* up_cim_4904_to_4c64 */ + {0x7b50, 0x7b54, 0x4908, 0x4, 0x4}, /* up_cim_4908_to_4c68 */ + {0x7b50, 0x7b54, 0x4910, 0x4, 0x4}, /* up_cim_4910_to_4c70 */ + {0x7b50, 0x7b54, 0x4914, 0x4, 0x4}, /* up_cim_4914_to_4c74 */ + {0x7b50, 0x7b54, 0x4920, 0x10, 0x10}, /* up_cim_4920_to_4a10 */ + {0x7b50, 0x7b54, 0x4924, 0x10, 0x10}, /* up_cim_4924_to_4a14 */ + {0x7b50, 0x7b54, 0x4928, 0x10, 0x10}, /* up_cim_4928_to_4a18 */ + {0x7b50, 0x7b54, 0x492c, 0x10, 0x10}, /* up_cim_492c_to_4a1c */ +}; + +static const u32 t5_up_cim_reg_array[][IREG_NUM_ELEM + 1] = { + {0x7b50, 0x7b54, 0x2000, 0x20, 0}, /* up_cim_2000_to_207c */ + {0x7b50, 0x7b54, 0x2080, 0x19, 0}, /* up_cim_2080_to_20ec */ + {0x7b50, 0x7b54, 0x00, 0x20, 0}, /* up_cim_00_to_7c */ + {0x7b50, 0x7b54, 0x80, 0x20, 0}, /* up_cim_80_to_fc */ + {0x7b50, 0x7b54, 0x100, 0x11, 0}, /* up_cim_100_to_14c */ + {0x7b50, 0x7b54, 0x200, 0x10, 0}, /* up_cim_200_to_23c */ + {0x7b50, 0x7b54, 0x240, 0x2, 0}, /* up_cim_240_to_244 */ + {0x7b50, 0x7b54, 0x250, 0x2, 0}, /* up_cim_250_to_254 */ + {0x7b50, 0x7b54, 0x260, 0x2, 0}, /* up_cim_260_to_264 */ + {0x7b50, 0x7b54, 0x270, 0x2, 0}, /* up_cim_270_to_274 */ + {0x7b50, 0x7b54, 0x280, 0x20, 0}, /* up_cim_280_to_2fc */ + {0x7b50, 0x7b54, 0x300, 0x20, 0}, /* up_cim_300_to_37c */ + {0x7b50, 0x7b54, 0x380, 0x14, 0}, /* up_cim_380_to_3cc */ +}; + +static const u32 t6_hma_ireg_array[][IREG_NUM_ELEM] = { + {0x51320, 0x51324, 0xa000, 32} /* t6_hma_regs_a000_to_a01f */ +}; +#endif /* __CUDBG_ENTITY_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h new file mode 100644 index 0000000..8568a51 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#ifndef __CUDBG_IF_H__ +#define __CUDBG_IF_H__ + +/* Error codes */ +#define CUDBG_STATUS_NO_MEM -19 +#define CUDBG_STATUS_ENTITY_NOT_FOUND -24 +#define CUDBG_STATUS_NOT_IMPLEMENTED -28 +#define CUDBG_SYSTEM_ERROR -29 +#define CUDBG_STATUS_CCLK_NOT_DEFINED -32 + +#define CUDBG_MAJOR_VERSION 1 +#define CUDBG_MINOR_VERSION 14 + +enum cudbg_dbg_entity_type { + CUDBG_REG_DUMP = 1, + CUDBG_DEV_LOG = 2, + CUDBG_CIM_LA = 3, + CUDBG_CIM_MA_LA = 4, + CUDBG_CIM_QCFG = 5, + CUDBG_CIM_IBQ_TP0 = 6, + CUDBG_CIM_IBQ_TP1 = 7, + CUDBG_CIM_IBQ_ULP = 8, + CUDBG_CIM_IBQ_SGE0 = 9, + CUDBG_CIM_IBQ_SGE1 = 10, + CUDBG_CIM_IBQ_NCSI = 11, + CUDBG_CIM_OBQ_ULP0 = 12, + CUDBG_CIM_OBQ_ULP1 = 13, + CUDBG_CIM_OBQ_ULP2 = 14, + CUDBG_CIM_OBQ_ULP3 = 15, + CUDBG_CIM_OBQ_SGE = 16, + CUDBG_CIM_OBQ_NCSI = 17, + CUDBG_EDC0 = 18, + CUDBG_EDC1 = 19, + CUDBG_MC0 = 20, + CUDBG_MC1 = 21, + CUDBG_RSS = 22, + CUDBG_RSS_VF_CONF = 25, + CUDBG_PATH_MTU = 27, + CUDBG_PM_STATS = 30, + CUDBG_HW_SCHED = 31, + CUDBG_TP_INDIRECT = 36, + CUDBG_SGE_INDIRECT = 37, + CUDBG_ULPRX_LA = 41, + CUDBG_TP_LA = 43, + CUDBG_MEMINFO = 44, + CUDBG_CIM_PIF_LA = 45, + CUDBG_CLK = 46, + CUDBG_CIM_OBQ_RXQ0 = 47, + CUDBG_CIM_OBQ_RXQ1 = 48, + CUDBG_PCIE_INDIRECT = 50, + CUDBG_PM_INDIRECT = 51, + CUDBG_TID_INFO = 54, + CUDBG_PCIE_CONFIG = 55, + CUDBG_DUMP_CONTEXT = 56, + CUDBG_MPS_TCAM = 57, + CUDBG_VPD_DATA = 58, + CUDBG_LE_TCAM = 59, + CUDBG_CCTRL = 60, + CUDBG_MA_INDIRECT = 61, + CUDBG_ULPTX_LA = 62, + CUDBG_UP_CIM_INDIRECT = 64, + CUDBG_PBT_TABLE = 65, + CUDBG_MBOX_LOG = 66, + CUDBG_HMA_INDIRECT = 67, + CUDBG_HMA = 68, + CUDBG_MAX_ENTITY = 70, +}; + +struct cudbg_init { + struct adapter *adap; /* Pointer to adapter structure */ + void *outbuf; /* Output buffer */ + u32 outbuf_size; /* Output buffer size */ + u8 compress_type; /* Type of compression to use */ + void *compress_buff; /* Compression buffer */ + u32 compress_buff_size; /* Compression buffer size */ + void *workspace; /* Workspace for zlib */ +}; + +static inline unsigned int cudbg_mbytes_to_bytes(unsigned int size) +{ + return size * 1024 * 1024; +} +#endif /* __CUDBG_IF_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c new file mode 100644 index 0000000..9da6f57 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c @@ -0,0 +1,2774 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + */ + +#include + +#include "t4_regs.h" +#include "cxgb4.h" +#include "cudbg_if.h" +#include "cudbg_lib_common.h" +#include "cudbg_entity.h" +#include "cudbg_lib.h" +#include "cudbg_zlib.h" + +static int cudbg_do_compression(struct cudbg_init *pdbg_init, + struct cudbg_buffer *pin_buff, + struct cudbg_buffer *dbg_buff) +{ + struct cudbg_buffer temp_in_buff = { 0 }; + int bytes_left, bytes_read, bytes; + u32 offset = dbg_buff->offset; + int rc; + + temp_in_buff.offset = pin_buff->offset; + temp_in_buff.data = pin_buff->data; + temp_in_buff.size = pin_buff->size; + + bytes_left = pin_buff->size; + bytes_read = 0; + while (bytes_left > 0) { + /* Do compression in smaller chunks */ + bytes = min_t(unsigned long, bytes_left, + (unsigned long)CUDBG_CHUNK_SIZE); + temp_in_buff.data = (char *)pin_buff->data + bytes_read; + temp_in_buff.size = bytes; + rc = cudbg_compress_buff(pdbg_init, &temp_in_buff, dbg_buff); + if (rc) + return rc; + bytes_left -= bytes; + bytes_read += bytes; + } + + pin_buff->size = dbg_buff->offset - offset; + return 0; +} + +static int cudbg_write_and_release_buff(struct cudbg_init *pdbg_init, + struct cudbg_buffer *pin_buff, + struct cudbg_buffer *dbg_buff) +{ + int rc = 0; + + if (pdbg_init->compress_type == CUDBG_COMPRESSION_NONE) { + cudbg_update_buff(pin_buff, dbg_buff); + } else { + rc = cudbg_do_compression(pdbg_init, pin_buff, dbg_buff); + if (rc) + goto out; + } + +out: + cudbg_put_buff(pdbg_init, pin_buff); + return rc; +} + +static int is_fw_attached(struct cudbg_init *pdbg_init) +{ + struct adapter *padap = pdbg_init->adap; + + if (!(padap->flags & FW_OK) || padap->use_bd) + return 0; + + return 1; +} + +/* This function will add additional padding bytes into debug_buffer to make it + * 4 byte aligned. + */ +void cudbg_align_debug_buffer(struct cudbg_buffer *dbg_buff, + struct cudbg_entity_hdr *entity_hdr) +{ + u8 zero_buf[4] = {0}; + u8 padding, remain; + + remain = (dbg_buff->offset - entity_hdr->start_offset) % 4; + padding = 4 - remain; + if (remain) { + memcpy(((u8 *)dbg_buff->data) + dbg_buff->offset, &zero_buf, + padding); + dbg_buff->offset += padding; + entity_hdr->num_pad = padding; + } + entity_hdr->size = dbg_buff->offset - entity_hdr->start_offset; +} + +struct cudbg_entity_hdr *cudbg_get_entity_hdr(void *outbuf, int i) +{ + struct cudbg_hdr *cudbg_hdr = (struct cudbg_hdr *)outbuf; + + return (struct cudbg_entity_hdr *) + ((char *)outbuf + cudbg_hdr->hdr_len + + (sizeof(struct cudbg_entity_hdr) * (i - 1))); +} + +static int cudbg_read_vpd_reg(struct adapter *padap, u32 addr, u32 len, + void *dest) +{ + int vaddr, rc; + + vaddr = t4_eeprom_ptov(addr, padap->pf, EEPROMPFSIZE); + if (vaddr < 0) + return vaddr; + + rc = pci_read_vpd(padap->pdev, vaddr, len, dest); + if (rc < 0) + return rc; + + return 0; +} + +static int cudbg_mem_desc_cmp(const void *a, const void *b) +{ + return ((const struct cudbg_mem_desc *)a)->base - + ((const struct cudbg_mem_desc *)b)->base; +} + +int cudbg_fill_meminfo(struct adapter *padap, + struct cudbg_meminfo *meminfo_buff) +{ + struct cudbg_mem_desc *md; + u32 lo, hi, used, alloc; + int n, i; + + memset(meminfo_buff->avail, 0, + ARRAY_SIZE(meminfo_buff->avail) * + sizeof(struct cudbg_mem_desc)); + memset(meminfo_buff->mem, 0, + (ARRAY_SIZE(cudbg_region) + 3) * sizeof(struct cudbg_mem_desc)); + md = meminfo_buff->mem; + + for (i = 0; i < ARRAY_SIZE(meminfo_buff->mem); i++) { + meminfo_buff->mem[i].limit = 0; + meminfo_buff->mem[i].idx = i; + } + + /* Find and sort the populated memory ranges */ + i = 0; + lo = 
t4_read_reg(padap, MA_TARGET_MEM_ENABLE_A); + if (lo & EDRAM0_ENABLE_F) { + hi = t4_read_reg(padap, MA_EDRAM0_BAR_A); + meminfo_buff->avail[i].base = + cudbg_mbytes_to_bytes(EDRAM0_BASE_G(hi)); + meminfo_buff->avail[i].limit = + meminfo_buff->avail[i].base + + cudbg_mbytes_to_bytes(EDRAM0_SIZE_G(hi)); + meminfo_buff->avail[i].idx = 0; + i++; + } + + if (lo & EDRAM1_ENABLE_F) { + hi = t4_read_reg(padap, MA_EDRAM1_BAR_A); + meminfo_buff->avail[i].base = + cudbg_mbytes_to_bytes(EDRAM1_BASE_G(hi)); + meminfo_buff->avail[i].limit = + meminfo_buff->avail[i].base + + cudbg_mbytes_to_bytes(EDRAM1_SIZE_G(hi)); + meminfo_buff->avail[i].idx = 1; + i++; + } + + if (is_t5(padap->params.chip)) { + if (lo & EXT_MEM0_ENABLE_F) { + hi = t4_read_reg(padap, MA_EXT_MEMORY0_BAR_A); + meminfo_buff->avail[i].base = + cudbg_mbytes_to_bytes(EXT_MEM_BASE_G(hi)); + meminfo_buff->avail[i].limit = + meminfo_buff->avail[i].base + + cudbg_mbytes_to_bytes(EXT_MEM_SIZE_G(hi)); + meminfo_buff->avail[i].idx = 3; + i++; + } + + if (lo & EXT_MEM1_ENABLE_F) { + hi = t4_read_reg(padap, MA_EXT_MEMORY1_BAR_A); + meminfo_buff->avail[i].base = + cudbg_mbytes_to_bytes(EXT_MEM1_BASE_G(hi)); + meminfo_buff->avail[i].limit = + meminfo_buff->avail[i].base + + cudbg_mbytes_to_bytes(EXT_MEM1_SIZE_G(hi)); + meminfo_buff->avail[i].idx = 4; + i++; + } + } else { + if (lo & EXT_MEM_ENABLE_F) { + hi = t4_read_reg(padap, MA_EXT_MEMORY_BAR_A); + meminfo_buff->avail[i].base = + cudbg_mbytes_to_bytes(EXT_MEM_BASE_G(hi)); + meminfo_buff->avail[i].limit = + meminfo_buff->avail[i].base + + cudbg_mbytes_to_bytes(EXT_MEM_SIZE_G(hi)); + meminfo_buff->avail[i].idx = 2; + i++; + } + + if (lo & HMA_MUX_F) { + hi = t4_read_reg(padap, MA_EXT_MEMORY1_BAR_A); + meminfo_buff->avail[i].base = + cudbg_mbytes_to_bytes(EXT_MEM1_BASE_G(hi)); + meminfo_buff->avail[i].limit = + meminfo_buff->avail[i].base + + cudbg_mbytes_to_bytes(EXT_MEM1_SIZE_G(hi)); + meminfo_buff->avail[i].idx = 5; + i++; + } + } + + if (!i) /* no memory available */ + return CUDBG_STATUS_ENTITY_NOT_FOUND; + + meminfo_buff->avail_c = i; + sort(meminfo_buff->avail, i, sizeof(struct cudbg_mem_desc), + cudbg_mem_desc_cmp, NULL); + (md++)->base = t4_read_reg(padap, SGE_DBQ_CTXT_BADDR_A); + (md++)->base = t4_read_reg(padap, SGE_IMSG_CTXT_BADDR_A); + (md++)->base = t4_read_reg(padap, SGE_FLM_CACHE_BADDR_A); + (md++)->base = t4_read_reg(padap, TP_CMM_TCB_BASE_A); + (md++)->base = t4_read_reg(padap, TP_CMM_MM_BASE_A); + (md++)->base = t4_read_reg(padap, TP_CMM_TIMER_BASE_A); + (md++)->base = t4_read_reg(padap, TP_CMM_MM_RX_FLST_BASE_A); + (md++)->base = t4_read_reg(padap, TP_CMM_MM_TX_FLST_BASE_A); + (md++)->base = t4_read_reg(padap, TP_CMM_MM_PS_FLST_BASE_A); + + /* the next few have explicit upper bounds */ + md->base = t4_read_reg(padap, TP_PMM_TX_BASE_A); + md->limit = md->base - 1 + + t4_read_reg(padap, TP_PMM_TX_PAGE_SIZE_A) * + PMTXMAXPAGE_G(t4_read_reg(padap, TP_PMM_TX_MAX_PAGE_A)); + md++; + + md->base = t4_read_reg(padap, TP_PMM_RX_BASE_A); + md->limit = md->base - 1 + + t4_read_reg(padap, TP_PMM_RX_PAGE_SIZE_A) * + PMRXMAXPAGE_G(t4_read_reg(padap, TP_PMM_RX_MAX_PAGE_A)); + md++; + + if (t4_read_reg(padap, LE_DB_CONFIG_A) & HASHEN_F) { + if (CHELSIO_CHIP_VERSION(padap->params.chip) <= CHELSIO_T5) { + hi = t4_read_reg(padap, LE_DB_TID_HASHBASE_A) / 4; + md->base = t4_read_reg(padap, LE_DB_HASH_TID_BASE_A); + } else { + hi = t4_read_reg(padap, LE_DB_HASH_TID_BASE_A); + md->base = t4_read_reg(padap, + LE_DB_HASH_TBL_BASE_ADDR_A); + } + md->limit = 0; + } else { + md->base = 0; + md->idx = 
ARRAY_SIZE(cudbg_region); /* hide it */ + } + md++; + +#define ulp_region(reg) do { \ + md->base = t4_read_reg(padap, ULP_ ## reg ## _LLIMIT_A);\ + (md++)->limit = t4_read_reg(padap, ULP_ ## reg ## _ULIMIT_A);\ +} while (0) + + ulp_region(RX_ISCSI); + ulp_region(RX_TDDP); + ulp_region(TX_TPT); + ulp_region(RX_STAG); + ulp_region(RX_RQ); + ulp_region(RX_RQUDP); + ulp_region(RX_PBL); + ulp_region(TX_PBL); +#undef ulp_region + md->base = 0; + md->idx = ARRAY_SIZE(cudbg_region); + if (!is_t4(padap->params.chip)) { + u32 fifo_size = t4_read_reg(padap, SGE_DBVFIFO_SIZE_A); + u32 sge_ctrl = t4_read_reg(padap, SGE_CONTROL2_A); + u32 size = 0; + + if (is_t5(padap->params.chip)) { + if (sge_ctrl & VFIFO_ENABLE_F) + size = DBVFIFO_SIZE_G(fifo_size); + } else { + size = T6_DBVFIFO_SIZE_G(fifo_size); + } + + if (size) { + md->base = BASEADDR_G(t4_read_reg(padap, + SGE_DBVFIFO_BADDR_A)); + md->limit = md->base + (size << 2) - 1; + } + } + + md++; + + md->base = t4_read_reg(padap, ULP_RX_CTX_BASE_A); + md->limit = 0; + md++; + md->base = t4_read_reg(padap, ULP_TX_ERR_TABLE_BASE_A); + md->limit = 0; + md++; + + md->base = padap->vres.ocq.start; + if (padap->vres.ocq.size) + md->limit = md->base + padap->vres.ocq.size - 1; + else + md->idx = ARRAY_SIZE(cudbg_region); /* hide it */ + md++; + + /* add any address-space holes, there can be up to 3 */ + for (n = 0; n < i - 1; n++) + if (meminfo_buff->avail[n].limit < + meminfo_buff->avail[n + 1].base) + (md++)->base = meminfo_buff->avail[n].limit; + + if (meminfo_buff->avail[n].limit) + (md++)->base = meminfo_buff->avail[n].limit; + + n = md - meminfo_buff->mem; + meminfo_buff->mem_c = n; + + sort(meminfo_buff->mem, n, sizeof(struct cudbg_mem_desc), + cudbg_mem_desc_cmp, NULL); + + lo = t4_read_reg(padap, CIM_SDRAM_BASE_ADDR_A); + hi = t4_read_reg(padap, CIM_SDRAM_ADDR_SIZE_A) + lo - 1; + meminfo_buff->up_ram_lo = lo; + meminfo_buff->up_ram_hi = hi; + + lo = t4_read_reg(padap, CIM_EXTMEM2_BASE_ADDR_A); + hi = t4_read_reg(padap, CIM_EXTMEM2_ADDR_SIZE_A) + lo - 1; + meminfo_buff->up_extmem2_lo = lo; + meminfo_buff->up_extmem2_hi = hi; + + lo = t4_read_reg(padap, TP_PMM_RX_MAX_PAGE_A); + meminfo_buff->rx_pages_data[0] = PMRXMAXPAGE_G(lo); + meminfo_buff->rx_pages_data[1] = + t4_read_reg(padap, TP_PMM_RX_PAGE_SIZE_A) >> 10; + meminfo_buff->rx_pages_data[2] = (lo & PMRXNUMCHN_F) ? 2 : 1; + + lo = t4_read_reg(padap, TP_PMM_TX_MAX_PAGE_A); + hi = t4_read_reg(padap, TP_PMM_TX_PAGE_SIZE_A); + meminfo_buff->tx_pages_data[0] = PMTXMAXPAGE_G(lo); + meminfo_buff->tx_pages_data[1] = + hi >= (1 << 20) ? (hi >> 20) : (hi >> 10); + meminfo_buff->tx_pages_data[2] = + hi >= (1 << 20) ? 
'M' : 'K'; + meminfo_buff->tx_pages_data[3] = 1 << PMTXNUMCHN_G(lo); + + meminfo_buff->p_structs = t4_read_reg(padap, TP_CMM_MM_MAX_PSTRUCT_A); + + for (i = 0; i < 4; i++) { + if (CHELSIO_CHIP_VERSION(padap->params.chip) > CHELSIO_T5) + lo = t4_read_reg(padap, + MPS_RX_MAC_BG_PG_CNT0_A + i * 4); + else + lo = t4_read_reg(padap, MPS_RX_PG_RSV0_A + i * 4); + if (is_t5(padap->params.chip)) { + used = T5_USED_G(lo); + alloc = T5_ALLOC_G(lo); + } else { + used = USED_G(lo); + alloc = ALLOC_G(lo); + } + meminfo_buff->port_used[i] = used; + meminfo_buff->port_alloc[i] = alloc; + } + + for (i = 0; i < padap->params.arch.nchan; i++) { + if (CHELSIO_CHIP_VERSION(padap->params.chip) > CHELSIO_T5) + lo = t4_read_reg(padap, + MPS_RX_LPBK_BG_PG_CNT0_A + i * 4); + else + lo = t4_read_reg(padap, MPS_RX_PG_RSV4_A + i * 4); + if (is_t5(padap->params.chip)) { + used = T5_USED_G(lo); + alloc = T5_ALLOC_G(lo); + } else { + used = USED_G(lo); + alloc = ALLOC_G(lo); + } + meminfo_buff->loopback_used[i] = used; + meminfo_buff->loopback_alloc[i] = alloc; + } + + return 0; +} + +int cudbg_collect_reg_dump(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + u32 buf_size = 0; + int rc = 0; + + if (is_t4(padap->params.chip)) + buf_size = T4_REGMAP_SIZE; + else if (is_t5(padap->params.chip) || is_t6(padap->params.chip)) + buf_size = T5_REGMAP_SIZE; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, buf_size, &temp_buff); + if (rc) + return rc; + t4_get_regs(padap, (void *)temp_buff.data, temp_buff.size); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_fw_devlog(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct devlog_params *dparams; + int rc = 0; + + rc = t4_init_devlog_params(padap); + if (rc < 0) { + cudbg_err->sys_err = rc; + return rc; + } + + dparams = &padap->params.devlog; + rc = cudbg_get_buff(pdbg_init, dbg_buff, dparams->size, &temp_buff); + if (rc) + return rc; + + /* Collect FW devlog */ + if (dparams->start != 0) { + spin_lock(&padap->win0_lock); + rc = t4_memory_rw(padap, padap->params.drv_memwin, + dparams->memtype, dparams->start, + dparams->size, + (__be32 *)(char *)temp_buff.data, + 1); + spin_unlock(&padap->win0_lock); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_cim_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + int size, rc; + u32 cfg = 0; + + if (is_t6(padap->params.chip)) { + size = padap->params.cim_la_size / 10 + 1; + size *= 10 * sizeof(u32); + } else { + size = padap->params.cim_la_size / 8; + size *= 8 * sizeof(u32); + } + + size += sizeof(cfg); + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + rc = t4_cim_read(padap, UP_UP_DBG_LA_CFG_A, 1, &cfg); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + + memcpy((char *)temp_buff.data, &cfg, sizeof(cfg)); + rc = t4_cim_read_la(padap, + (u32 *)((char *)temp_buff.data + sizeof(cfg)), + NULL); + if (rc < 0) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, 
&temp_buff); + return rc; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_cim_ma_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + int size, rc; + + size = 2 * CIM_MALA_SIZE * 5 * sizeof(u32); + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + t4_cim_read_ma_la(padap, + (u32 *)temp_buff.data, + (u32 *)((char *)temp_buff.data + + 5 * CIM_MALA_SIZE)); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_cim_qcfg(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_cim_qcfg *cim_qcfg_data; + int rc; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(struct cudbg_cim_qcfg), + &temp_buff); + if (rc) + return rc; + + cim_qcfg_data = (struct cudbg_cim_qcfg *)temp_buff.data; + cim_qcfg_data->chip = padap->params.chip; + rc = t4_cim_read(padap, UP_IBQ_0_RDADDR_A, + ARRAY_SIZE(cim_qcfg_data->stat), cim_qcfg_data->stat); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + + rc = t4_cim_read(padap, UP_OBQ_0_REALADDR_A, + ARRAY_SIZE(cim_qcfg_data->obq_wr), + cim_qcfg_data->obq_wr); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + + t4_read_cimq_cfg(padap, cim_qcfg_data->base, cim_qcfg_data->size, + cim_qcfg_data->thres); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +static int cudbg_read_cim_ibq(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err, int qid) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + int no_of_read_words, rc = 0; + u32 qsize; + + /* collect CIM IBQ */ + qsize = CIM_IBQ_SIZE * 4 * sizeof(u32); + rc = cudbg_get_buff(pdbg_init, dbg_buff, qsize, &temp_buff); + if (rc) + return rc; + + /* t4_read_cim_ibq will return no. 
of read words or error */ + no_of_read_words = t4_read_cim_ibq(padap, qid, + (u32 *)temp_buff.data, qsize); + /* no_of_read_words is less than or equal to 0 means error */ + if (no_of_read_words <= 0) { + if (!no_of_read_words) + rc = CUDBG_SYSTEM_ERROR; + else + rc = no_of_read_words; + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_cim_ibq_tp0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 0); +} + +int cudbg_collect_cim_ibq_tp1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 1); +} + +int cudbg_collect_cim_ibq_ulp(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 2); +} + +int cudbg_collect_cim_ibq_sge0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 3); +} + +int cudbg_collect_cim_ibq_sge1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 4); +} + +int cudbg_collect_cim_ibq_ncsi(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 5); +} + +u32 cudbg_cim_obq_size(struct adapter *padap, int qid) +{ + u32 value; + + t4_write_reg(padap, CIM_QUEUE_CONFIG_REF_A, OBQSELECT_F | + QUENUMSELECT_V(qid)); + value = t4_read_reg(padap, CIM_QUEUE_CONFIG_CTRL_A); + value = CIMQSIZE_G(value) * 64; /* size in number of words */ + return value * sizeof(u32); +} + +static int cudbg_read_cim_obq(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err, int qid) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + int no_of_read_words, rc = 0; + u32 qsize; + + /* collect CIM OBQ */ + qsize = cudbg_cim_obq_size(padap, qid); + rc = cudbg_get_buff(pdbg_init, dbg_buff, qsize, &temp_buff); + if (rc) + return rc; + + /* t4_read_cim_obq will return no. 
of read words or error */ + no_of_read_words = t4_read_cim_obq(padap, qid, + (u32 *)temp_buff.data, qsize); + /* no_of_read_words is less than or equal to 0 means error */ + if (no_of_read_words <= 0) { + if (!no_of_read_words) + rc = CUDBG_SYSTEM_ERROR; + else + rc = no_of_read_words; + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_cim_obq_ulp0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 0); +} + +int cudbg_collect_cim_obq_ulp1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 1); +} + +int cudbg_collect_cim_obq_ulp2(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 2); +} + +int cudbg_collect_cim_obq_ulp3(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 3); +} + +int cudbg_collect_cim_obq_sge(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 4); +} + +int cudbg_collect_cim_obq_ncsi(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 5); +} + +int cudbg_collect_obq_sge_rx_q0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 6); +} + +int cudbg_collect_obq_sge_rx_q1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 7); +} + +static int cudbg_meminfo_get_mem_index(struct adapter *padap, + struct cudbg_meminfo *mem_info, + u8 mem_type, u8 *idx) +{ + u8 i, flag; + + switch (mem_type) { + case MEM_EDC0: + flag = EDC0_FLAG; + break; + case MEM_EDC1: + flag = EDC1_FLAG; + break; + case MEM_MC0: + /* Some T5 cards have both MC0 and MC1. */ + flag = is_t5(padap->params.chip) ? MC0_FLAG : MC_FLAG; + break; + case MEM_MC1: + flag = MC1_FLAG; + break; + case MEM_HMA: + flag = HMA_FLAG; + break; + default: + return CUDBG_STATUS_ENTITY_NOT_FOUND; + } + + for (i = 0; i < mem_info->avail_c; i++) { + if (mem_info->avail[i].idx == flag) { + *idx = i; + return 0; + } + } + + return CUDBG_STATUS_ENTITY_NOT_FOUND; +} + +/* Fetch the @region_name's start and end from @meminfo. */ +static int cudbg_get_mem_region(struct adapter *padap, + struct cudbg_meminfo *meminfo, + u8 mem_type, const char *region_name, + struct cudbg_mem_desc *mem_desc) +{ + u8 mc, found = 0; + u32 i, idx = 0; + int rc; + + rc = cudbg_meminfo_get_mem_index(padap, meminfo, mem_type, &mc); + if (rc) + return rc; + + for (i = 0; i < ARRAY_SIZE(cudbg_region); i++) { + if (!strcmp(cudbg_region[i], region_name)) { + found = 1; + idx = i; + break; + } + } + if (!found) + return -EINVAL; + + found = 0; + for (i = 0; i < meminfo->mem_c; i++) { + if (meminfo->mem[i].idx >= ARRAY_SIZE(cudbg_region)) + continue; /* Skip holes */ + + if (!(meminfo->mem[i].limit)) + meminfo->mem[i].limit = + i < meminfo->mem_c - 1 ? 
+ meminfo->mem[i + 1].base - 1 : ~0; + + if (meminfo->mem[i].idx == idx) { + /* Check if the region exists in @mem_type memory */ + if (meminfo->mem[i].base < meminfo->avail[mc].base && + meminfo->mem[i].limit < meminfo->avail[mc].base) + return -EINVAL; + + if (meminfo->mem[i].base > meminfo->avail[mc].limit) + return -EINVAL; + + memcpy(mem_desc, &meminfo->mem[i], + sizeof(struct cudbg_mem_desc)); + found = 1; + break; + } + } + if (!found) + return -EINVAL; + + return 0; +} + +/* Fetch and update the start and end of the requested memory region w.r.t 0 + * in the corresponding EDC/MC/HMA. + */ +static int cudbg_get_mem_relative(struct adapter *padap, + struct cudbg_meminfo *meminfo, + u8 mem_type, u32 *out_base, u32 *out_end) +{ + u8 mc_idx; + int rc; + + rc = cudbg_meminfo_get_mem_index(padap, meminfo, mem_type, &mc_idx); + if (rc) + return rc; + + if (*out_base < meminfo->avail[mc_idx].base) + *out_base = 0; + else + *out_base -= meminfo->avail[mc_idx].base; + + if (*out_end > meminfo->avail[mc_idx].limit) + *out_end = meminfo->avail[mc_idx].limit; + else + *out_end -= meminfo->avail[mc_idx].base; + + return 0; +} + +/* Get TX and RX Payload region */ +static int cudbg_get_payload_range(struct adapter *padap, u8 mem_type, + const char *region_name, + struct cudbg_region_info *payload) +{ + struct cudbg_mem_desc mem_desc = { 0 }; + struct cudbg_meminfo meminfo; + int rc; + + rc = cudbg_fill_meminfo(padap, &meminfo); + if (rc) + return rc; + + rc = cudbg_get_mem_region(padap, &meminfo, mem_type, region_name, + &mem_desc); + if (rc) { + payload->exist = false; + return 0; + } + + payload->exist = true; + payload->start = mem_desc.base; + payload->end = mem_desc.limit; + + return cudbg_get_mem_relative(padap, &meminfo, mem_type, + &payload->start, &payload->end); +} + +static int cudbg_memory_read(struct cudbg_init *pdbg_init, int win, + int mtype, u32 addr, u32 len, void *hbuf) +{ + u32 win_pf, memoffset, mem_aperture, mem_base; + struct adapter *adap = pdbg_init->adap; + u32 pos, offset, resid; + u32 *res_buf; + u64 *buf; + int ret; + + /* Argument sanity checks ... + */ + if (addr & 0x3 || (uintptr_t)hbuf & 0x3) + return -EINVAL; + + buf = (u64 *)hbuf; + + /* Try to do 64-bit reads. Residual will be handled later. */ + resid = len & 0x7; + len -= resid; + + ret = t4_memory_rw_init(adap, win, mtype, &memoffset, &mem_base, + &mem_aperture); + if (ret) + return ret; + + addr = addr + memoffset; + win_pf = is_t4(adap->params.chip) ? 0 : PFNUM_V(adap->pf); + + pos = addr & ~(mem_aperture - 1); + offset = addr - pos; + + /* Set up initial PCI-E Memory Window to cover the start of our + * transfer. + */ + t4_memory_update_win(adap, win, pos | win_pf); + + /* Transfer data from the adapter */ + while (len > 0) { + *buf++ = le64_to_cpu((__force __le64) + t4_read_reg64(adap, mem_base + offset)); + offset += sizeof(u64); + len -= sizeof(u64); + + /* If we've reached the end of our current window aperture, + * move the PCI-E Memory Window on to the next. + */ + if (offset == mem_aperture) { + pos += mem_aperture; + offset = 0; + t4_memory_update_win(adap, win, pos | win_pf); + } + } + + res_buf = (u32 *)buf; + /* Read residual in 32-bit multiples */ + while (resid > sizeof(u32)) { + *res_buf++ = le32_to_cpu((__force __le32) + t4_read_reg(adap, mem_base + offset)); + offset += sizeof(u32); + resid -= sizeof(u32); + + /* If we've reached the end of our current window aperture, + * move the PCI-E Memory Window on to the next. 
+ */ + if (offset == mem_aperture) { + pos += mem_aperture; + offset = 0; + t4_memory_update_win(adap, win, pos | win_pf); + } + } + + /* Transfer residual < 32-bits */ + if (resid) + t4_memory_rw_residual(adap, resid, mem_base + offset, + (u8 *)res_buf, T4_MEMORY_READ); + + return 0; +} + +#define CUDBG_YIELD_ITERATION 256 + +static int cudbg_read_fw_mem(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, u8 mem_type, + unsigned long tot_len, + struct cudbg_error *cudbg_err) +{ + static const char * const region_name[] = { "Tx payload:", + "Rx payload:" }; + unsigned long bytes, bytes_left, bytes_read = 0; + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_region_info payload[2]; + u32 yield_count = 0; + int rc = 0; + u8 i; + + /* Get TX/RX Payload region range if they exist */ + memset(payload, 0, sizeof(payload)); + for (i = 0; i < ARRAY_SIZE(region_name); i++) { + rc = cudbg_get_payload_range(padap, mem_type, region_name[i], + &payload[i]); + if (rc) + return rc; + + if (payload[i].exist) { + /* Align start and end to avoid wrap around */ + payload[i].start = roundup(payload[i].start, + CUDBG_CHUNK_SIZE); + payload[i].end = rounddown(payload[i].end, + CUDBG_CHUNK_SIZE); + } + } + + bytes_left = tot_len; + while (bytes_left > 0) { + /* As MC size is huge and read through PIO access, this + * loop will hold cpu for a longer time. OS may think that + * the process is hanged and will generate CPU stall traces. + * So yield the cpu regularly. + */ + yield_count++; + if (!(yield_count % CUDBG_YIELD_ITERATION)) + schedule(); + + bytes = min_t(unsigned long, bytes_left, + (unsigned long)CUDBG_CHUNK_SIZE); + rc = cudbg_get_buff(pdbg_init, dbg_buff, bytes, &temp_buff); + if (rc) + return rc; + + for (i = 0; i < ARRAY_SIZE(payload); i++) + if (payload[i].exist && + bytes_read >= payload[i].start && + bytes_read + bytes <= payload[i].end) + /* TX and RX Payload regions can't overlap */ + goto skip_read; + + spin_lock(&padap->win0_lock); + rc = cudbg_memory_read(pdbg_init, MEMWIN_NIC, mem_type, + bytes_read, bytes, temp_buff.data); + spin_unlock(&padap->win0_lock); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + +skip_read: + bytes_left -= bytes; + bytes_read += bytes; + rc = cudbg_write_and_release_buff(pdbg_init, &temp_buff, + dbg_buff); + if (rc) { + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + } + return rc; +} + +static void cudbg_t4_fwcache(struct cudbg_init *pdbg_init, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + int rc; + + if (is_fw_attached(pdbg_init)) { + /* Flush uP dcache before reading edcX/mcX */ + rc = t4_fwcache(padap, FW_PARAM_DEV_FWCACHE_FLUSH); + if (rc) + cudbg_err->sys_warn = rc; + } +} + +static int cudbg_collect_mem_region(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err, + u8 mem_type) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_meminfo mem_info; + unsigned long size; + u8 mc_idx; + int rc; + + memset(&mem_info, 0, sizeof(struct cudbg_meminfo)); + rc = cudbg_fill_meminfo(padap, &mem_info); + if (rc) + return rc; + + cudbg_t4_fwcache(pdbg_init, cudbg_err); + rc = cudbg_meminfo_get_mem_index(padap, &mem_info, mem_type, &mc_idx); + if (rc) + return rc; + + size = mem_info.avail[mc_idx].limit - mem_info.avail[mc_idx].base; + return cudbg_read_fw_mem(pdbg_init, dbg_buff, mem_type, size, + cudbg_err); +} + +int cudbg_collect_edc0_meminfo(struct cudbg_init 
*pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_collect_mem_region(pdbg_init, dbg_buff, cudbg_err, + MEM_EDC0); +} + +int cudbg_collect_edc1_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_collect_mem_region(pdbg_init, dbg_buff, cudbg_err, + MEM_EDC1); +} + +int cudbg_collect_mc0_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_collect_mem_region(pdbg_init, dbg_buff, cudbg_err, + MEM_MC0); +} + +int cudbg_collect_mc1_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_collect_mem_region(pdbg_init, dbg_buff, cudbg_err, + MEM_MC1); +} + +int cudbg_collect_hma_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_collect_mem_region(pdbg_init, dbg_buff, cudbg_err, + MEM_HMA); +} + +int cudbg_collect_rss(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + int rc, nentries; + + nentries = t4_chip_rss_size(padap); + rc = cudbg_get_buff(pdbg_init, dbg_buff, nentries * sizeof(u16), + &temp_buff); + if (rc) + return rc; + + rc = t4_read_rss(padap, (u16 *)temp_buff.data); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_rss_vf_config(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_rss_vf_conf *vfconf; + int vf, rc, vf_count; + + vf_count = padap->params.arch.vfcount; + rc = cudbg_get_buff(pdbg_init, dbg_buff, + vf_count * sizeof(struct cudbg_rss_vf_conf), + &temp_buff); + if (rc) + return rc; + + vfconf = (struct cudbg_rss_vf_conf *)temp_buff.data; + for (vf = 0; vf < vf_count; vf++) + t4_read_rss_vf_config(padap, vf, &vfconf[vf].rss_vf_vfl, + &vfconf[vf].rss_vf_vfh, true); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_path_mtu(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + int rc; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, NMTUS * sizeof(u16), + &temp_buff); + if (rc) + return rc; + + t4_read_mtu_tbl(padap, (u16 *)temp_buff.data, NULL); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_pm_stats(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_pm_stats *pm_stats_buff; + int rc; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(struct cudbg_pm_stats), + &temp_buff); + if (rc) + return rc; + + pm_stats_buff = (struct cudbg_pm_stats *)temp_buff.data; + t4_pmtx_get_stats(padap, pm_stats_buff->tx_cnt, pm_stats_buff->tx_cyc); + t4_pmrx_get_stats(padap, pm_stats_buff->rx_cnt, pm_stats_buff->rx_cyc); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_hw_sched(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct 
cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_hw_sched *hw_sched_buff; + int i, rc = 0; + + if (!padap->params.vpd.cclk) + return CUDBG_STATUS_CCLK_NOT_DEFINED; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(struct cudbg_hw_sched), + &temp_buff); + hw_sched_buff = (struct cudbg_hw_sched *)temp_buff.data; + hw_sched_buff->map = t4_read_reg(padap, TP_TX_MOD_QUEUE_REQ_MAP_A); + hw_sched_buff->mode = TIMERMODE_G(t4_read_reg(padap, TP_MOD_CONFIG_A)); + t4_read_pace_tbl(padap, hw_sched_buff->pace_tab); + for (i = 0; i < NTX_SCHED; ++i) + t4_get_tx_sched(padap, i, &hw_sched_buff->kbps[i], + &hw_sched_buff->ipg[i], true); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_tp_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *ch_tp_pio; + int i, rc, n = 0; + u32 size; + + if (is_t5(padap->params.chip)) + n = sizeof(t5_tp_pio_array) + + sizeof(t5_tp_tm_pio_array) + + sizeof(t5_tp_mib_index_array); + else + n = sizeof(t6_tp_pio_array) + + sizeof(t6_tp_tm_pio_array) + + sizeof(t6_tp_mib_index_array); + + n = n / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n; + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + ch_tp_pio = (struct ireg_buf *)temp_buff.data; + + /* TP_PIO */ + if (is_t5(padap->params.chip)) + n = sizeof(t5_tp_pio_array) / (IREG_NUM_ELEM * sizeof(u32)); + else if (is_t6(padap->params.chip)) + n = sizeof(t6_tp_pio_array) / (IREG_NUM_ELEM * sizeof(u32)); + + for (i = 0; i < n; i++) { + struct ireg_field *tp_pio = &ch_tp_pio->tp_pio; + u32 *buff = ch_tp_pio->outbuf; + + if (is_t5(padap->params.chip)) { + tp_pio->ireg_addr = t5_tp_pio_array[i][0]; + tp_pio->ireg_data = t5_tp_pio_array[i][1]; + tp_pio->ireg_local_offset = t5_tp_pio_array[i][2]; + tp_pio->ireg_offset_range = t5_tp_pio_array[i][3]; + } else if (is_t6(padap->params.chip)) { + tp_pio->ireg_addr = t6_tp_pio_array[i][0]; + tp_pio->ireg_data = t6_tp_pio_array[i][1]; + tp_pio->ireg_local_offset = t6_tp_pio_array[i][2]; + tp_pio->ireg_offset_range = t6_tp_pio_array[i][3]; + } + t4_tp_pio_read(padap, buff, tp_pio->ireg_offset_range, + tp_pio->ireg_local_offset, true); + ch_tp_pio++; + } + + /* TP_TM_PIO */ + if (is_t5(padap->params.chip)) + n = sizeof(t5_tp_tm_pio_array) / (IREG_NUM_ELEM * sizeof(u32)); + else if (is_t6(padap->params.chip)) + n = sizeof(t6_tp_tm_pio_array) / (IREG_NUM_ELEM * sizeof(u32)); + + for (i = 0; i < n; i++) { + struct ireg_field *tp_pio = &ch_tp_pio->tp_pio; + u32 *buff = ch_tp_pio->outbuf; + + if (is_t5(padap->params.chip)) { + tp_pio->ireg_addr = t5_tp_tm_pio_array[i][0]; + tp_pio->ireg_data = t5_tp_tm_pio_array[i][1]; + tp_pio->ireg_local_offset = t5_tp_tm_pio_array[i][2]; + tp_pio->ireg_offset_range = t5_tp_tm_pio_array[i][3]; + } else if (is_t6(padap->params.chip)) { + tp_pio->ireg_addr = t6_tp_tm_pio_array[i][0]; + tp_pio->ireg_data = t6_tp_tm_pio_array[i][1]; + tp_pio->ireg_local_offset = t6_tp_tm_pio_array[i][2]; + tp_pio->ireg_offset_range = t6_tp_tm_pio_array[i][3]; + } + t4_tp_tm_pio_read(padap, buff, tp_pio->ireg_offset_range, + tp_pio->ireg_local_offset, true); + ch_tp_pio++; + } + + /* TP_MIB_INDEX */ + if (is_t5(padap->params.chip)) + n = sizeof(t5_tp_mib_index_array) / + (IREG_NUM_ELEM * sizeof(u32)); + else if (is_t6(padap->params.chip)) + n = 
sizeof(t6_tp_mib_index_array) / + (IREG_NUM_ELEM * sizeof(u32)); + + for (i = 0; i < n ; i++) { + struct ireg_field *tp_pio = &ch_tp_pio->tp_pio; + u32 *buff = ch_tp_pio->outbuf; + + if (is_t5(padap->params.chip)) { + tp_pio->ireg_addr = t5_tp_mib_index_array[i][0]; + tp_pio->ireg_data = t5_tp_mib_index_array[i][1]; + tp_pio->ireg_local_offset = + t5_tp_mib_index_array[i][2]; + tp_pio->ireg_offset_range = + t5_tp_mib_index_array[i][3]; + } else if (is_t6(padap->params.chip)) { + tp_pio->ireg_addr = t6_tp_mib_index_array[i][0]; + tp_pio->ireg_data = t6_tp_mib_index_array[i][1]; + tp_pio->ireg_local_offset = + t6_tp_mib_index_array[i][2]; + tp_pio->ireg_offset_range = + t6_tp_mib_index_array[i][3]; + } + t4_tp_mib_read(padap, buff, tp_pio->ireg_offset_range, + tp_pio->ireg_local_offset, true); + ch_tp_pio++; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_sge_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *ch_sge_dbg; + int i, rc; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(*ch_sge_dbg) * 2, + &temp_buff); + if (rc) + return rc; + + ch_sge_dbg = (struct ireg_buf *)temp_buff.data; + for (i = 0; i < 2; i++) { + struct ireg_field *sge_pio = &ch_sge_dbg->tp_pio; + u32 *buff = ch_sge_dbg->outbuf; + + sge_pio->ireg_addr = t5_sge_dbg_index_array[i][0]; + sge_pio->ireg_data = t5_sge_dbg_index_array[i][1]; + sge_pio->ireg_local_offset = t5_sge_dbg_index_array[i][2]; + sge_pio->ireg_offset_range = t5_sge_dbg_index_array[i][3]; + t4_read_indirect(padap, + sge_pio->ireg_addr, + sge_pio->ireg_data, + buff, + sge_pio->ireg_offset_range, + sge_pio->ireg_local_offset); + ch_sge_dbg++; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_ulprx_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_ulprx_la *ulprx_la_buff; + int rc; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(struct cudbg_ulprx_la), + &temp_buff); + if (rc) + return rc; + + ulprx_la_buff = (struct cudbg_ulprx_la *)temp_buff.data; + t4_ulprx_read_la(padap, (u32 *)ulprx_la_buff->data); + ulprx_la_buff->size = ULPRX_LA_SIZE; + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_tp_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_tp_la *tp_la_buff; + int size, rc; + + size = sizeof(struct cudbg_tp_la) + TPLA_SIZE * sizeof(u64); + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + tp_la_buff = (struct cudbg_tp_la *)temp_buff.data; + tp_la_buff->mode = DBGLAMODE_G(t4_read_reg(padap, TP_DBG_LA_CONFIG_A)); + t4_tp_read_la(padap, (u64 *)tp_la_buff->data, NULL); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_meminfo *meminfo_buff; + int rc; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(struct cudbg_meminfo), + &temp_buff); + if (rc) + return rc; + + 
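	/* View the chunk reserved above as a cudbg_meminfo snapshot; it is
	 * filled from the adapter's memory-map registers and then handed to
	 * cudbg_write_and_release_buff() for optional compression and
	 * copy-out.
	 */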
meminfo_buff = (struct cudbg_meminfo *)temp_buff.data; + rc = cudbg_fill_meminfo(padap, meminfo_buff); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_cim_pif_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct cudbg_cim_pif_la *cim_pif_la_buff; + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + int size, rc; + + size = sizeof(struct cudbg_cim_pif_la) + + 2 * CIM_PIFLA_SIZE * 6 * sizeof(u32); + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + cim_pif_la_buff = (struct cudbg_cim_pif_la *)temp_buff.data; + cim_pif_la_buff->size = CIM_PIFLA_SIZE; + t4_cim_read_pif_la(padap, (u32 *)cim_pif_la_buff->data, + (u32 *)cim_pif_la_buff->data + 6 * CIM_PIFLA_SIZE, + NULL, NULL); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_clk_info(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_clk_info *clk_info_buff; + u64 tp_tick_us; + int rc; + + if (!padap->params.vpd.cclk) + return CUDBG_STATUS_CCLK_NOT_DEFINED; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(struct cudbg_clk_info), + &temp_buff); + if (rc) + return rc; + + clk_info_buff = (struct cudbg_clk_info *)temp_buff.data; + clk_info_buff->cclk_ps = 1000000000 / padap->params.vpd.cclk; /* psec */ + clk_info_buff->res = t4_read_reg(padap, TP_TIMER_RESOLUTION_A); + clk_info_buff->tre = TIMERRESOLUTION_G(clk_info_buff->res); + clk_info_buff->dack_re = DELAYEDACKRESOLUTION_G(clk_info_buff->res); + tp_tick_us = (clk_info_buff->cclk_ps << clk_info_buff->tre) / 1000000; + + clk_info_buff->dack_timer = + (clk_info_buff->cclk_ps << clk_info_buff->dack_re) / 1000000 * + t4_read_reg(padap, TP_DACK_TIMER_A); + clk_info_buff->retransmit_min = + tp_tick_us * t4_read_reg(padap, TP_RXT_MIN_A); + clk_info_buff->retransmit_max = + tp_tick_us * t4_read_reg(padap, TP_RXT_MAX_A); + clk_info_buff->persist_timer_min = + tp_tick_us * t4_read_reg(padap, TP_PERS_MIN_A); + clk_info_buff->persist_timer_max = + tp_tick_us * t4_read_reg(padap, TP_PERS_MAX_A); + clk_info_buff->keepalive_idle_timer = + tp_tick_us * t4_read_reg(padap, TP_KEEP_IDLE_A); + clk_info_buff->keepalive_interval = + tp_tick_us * t4_read_reg(padap, TP_KEEP_INTVL_A); + clk_info_buff->initial_srtt = + tp_tick_us * INITSRTT_G(t4_read_reg(padap, TP_INIT_SRTT_A)); + clk_info_buff->finwait2_timer = + tp_tick_us * t4_read_reg(padap, TP_FINWAIT2_TIMER_A); + + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_pcie_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *ch_pcie; + int i, rc, n; + u32 size; + + n = sizeof(t5_pcie_pdbg_array) / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n * 2; + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + ch_pcie = (struct ireg_buf *)temp_buff.data; + /* PCIE_PDBG */ + for (i = 0; i < n; i++) { + struct ireg_field *pcie_pio = &ch_pcie->tp_pio; + u32 *buff = ch_pcie->outbuf; + + pcie_pio->ireg_addr = t5_pcie_pdbg_array[i][0]; + pcie_pio->ireg_data = 
t5_pcie_pdbg_array[i][1]; + pcie_pio->ireg_local_offset = t5_pcie_pdbg_array[i][2]; + pcie_pio->ireg_offset_range = t5_pcie_pdbg_array[i][3]; + t4_read_indirect(padap, + pcie_pio->ireg_addr, + pcie_pio->ireg_data, + buff, + pcie_pio->ireg_offset_range, + pcie_pio->ireg_local_offset); + ch_pcie++; + } + + /* PCIE_CDBG */ + n = sizeof(t5_pcie_cdbg_array) / (IREG_NUM_ELEM * sizeof(u32)); + for (i = 0; i < n; i++) { + struct ireg_field *pcie_pio = &ch_pcie->tp_pio; + u32 *buff = ch_pcie->outbuf; + + pcie_pio->ireg_addr = t5_pcie_cdbg_array[i][0]; + pcie_pio->ireg_data = t5_pcie_cdbg_array[i][1]; + pcie_pio->ireg_local_offset = t5_pcie_cdbg_array[i][2]; + pcie_pio->ireg_offset_range = t5_pcie_cdbg_array[i][3]; + t4_read_indirect(padap, + pcie_pio->ireg_addr, + pcie_pio->ireg_data, + buff, + pcie_pio->ireg_offset_range, + pcie_pio->ireg_local_offset); + ch_pcie++; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_pm_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *ch_pm; + int i, rc, n; + u32 size; + + n = sizeof(t5_pm_rx_array) / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n * 2; + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + ch_pm = (struct ireg_buf *)temp_buff.data; + /* PM_RX */ + for (i = 0; i < n; i++) { + struct ireg_field *pm_pio = &ch_pm->tp_pio; + u32 *buff = ch_pm->outbuf; + + pm_pio->ireg_addr = t5_pm_rx_array[i][0]; + pm_pio->ireg_data = t5_pm_rx_array[i][1]; + pm_pio->ireg_local_offset = t5_pm_rx_array[i][2]; + pm_pio->ireg_offset_range = t5_pm_rx_array[i][3]; + t4_read_indirect(padap, + pm_pio->ireg_addr, + pm_pio->ireg_data, + buff, + pm_pio->ireg_offset_range, + pm_pio->ireg_local_offset); + ch_pm++; + } + + /* PM_TX */ + n = sizeof(t5_pm_tx_array) / (IREG_NUM_ELEM * sizeof(u32)); + for (i = 0; i < n; i++) { + struct ireg_field *pm_pio = &ch_pm->tp_pio; + u32 *buff = ch_pm->outbuf; + + pm_pio->ireg_addr = t5_pm_tx_array[i][0]; + pm_pio->ireg_data = t5_pm_tx_array[i][1]; + pm_pio->ireg_local_offset = t5_pm_tx_array[i][2]; + pm_pio->ireg_offset_range = t5_pm_tx_array[i][3]; + t4_read_indirect(padap, + pm_pio->ireg_addr, + pm_pio->ireg_data, + buff, + pm_pio->ireg_offset_range, + pm_pio->ireg_local_offset); + ch_pm++; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_tid(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_tid_info_region_rev1 *tid1; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_tid_info_region *tid; + u32 para[2], val[2]; + int rc; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, + sizeof(struct cudbg_tid_info_region_rev1), + &temp_buff); + if (rc) + return rc; + + tid1 = (struct cudbg_tid_info_region_rev1 *)temp_buff.data; + tid = &tid1->tid; + tid1->ver_hdr.signature = CUDBG_ENTITY_SIGNATURE; + tid1->ver_hdr.revision = CUDBG_TID_INFO_REV; + tid1->ver_hdr.size = sizeof(struct cudbg_tid_info_region_rev1) - + sizeof(struct cudbg_ver_hdr); + + /* If firmware is not attached/alive, use backdoor register + * access to collect dump. 
+ */ + if (!is_fw_attached(pdbg_init)) + goto fill_tid; + +#define FW_PARAM_PFVF_A(param) \ + (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) | \ + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_##param) | \ + FW_PARAMS_PARAM_Y_V(0) | \ + FW_PARAMS_PARAM_Z_V(0)) + + para[0] = FW_PARAM_PFVF_A(ETHOFLD_START); + para[1] = FW_PARAM_PFVF_A(ETHOFLD_END); + rc = t4_query_params(padap, padap->mbox, padap->pf, 0, 2, para, val); + if (rc < 0) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + tid->uotid_base = val[0]; + tid->nuotids = val[1] - val[0] + 1; + + if (is_t5(padap->params.chip)) { + tid->sb = t4_read_reg(padap, LE_DB_SERVER_INDEX_A) / 4; + } else if (is_t6(padap->params.chip)) { + tid1->tid_start = + t4_read_reg(padap, LE_DB_ACTIVE_TABLE_START_INDEX_A); + tid->sb = t4_read_reg(padap, LE_DB_SRVR_START_INDEX_A); + + para[0] = FW_PARAM_PFVF_A(HPFILTER_START); + para[1] = FW_PARAM_PFVF_A(HPFILTER_END); + rc = t4_query_params(padap, padap->mbox, padap->pf, 0, 2, + para, val); + if (rc < 0) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + tid->hpftid_base = val[0]; + tid->nhpftids = val[1] - val[0] + 1; + } + +#undef FW_PARAM_PFVF_A + +fill_tid: + tid->ntids = padap->tids.ntids; + tid->nstids = padap->tids.nstids; + tid->stid_base = padap->tids.stid_base; + tid->hash_base = padap->tids.hash_base; + + tid->natids = padap->tids.natids; + tid->nftids = padap->tids.nftids; + tid->ftid_base = padap->tids.ftid_base; + tid->aftid_base = padap->tids.aftid_base; + tid->aftid_end = padap->tids.aftid_end; + + tid->sftid_base = padap->tids.sftid_base; + tid->nsftids = padap->tids.nsftids; + + tid->flags = padap->flags; + tid->le_db_conf = t4_read_reg(padap, LE_DB_CONFIG_A); + tid->ip_users = t4_read_reg(padap, LE_DB_ACT_CNT_IPV4_A); + tid->ipv6_users = t4_read_reg(padap, LE_DB_ACT_CNT_IPV6_A); + + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_pcie_config(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + u32 size, *value, j; + int i, rc, n; + + size = sizeof(u32) * CUDBG_NUM_PCIE_CONFIG_REGS; + n = sizeof(t5_pcie_config_array) / (2 * sizeof(u32)); + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + value = (u32 *)temp_buff.data; + for (i = 0; i < n; i++) { + for (j = t5_pcie_config_array[i][0]; + j <= t5_pcie_config_array[i][1]; j += 4) { + t4_hw_pci_read_cfg4(padap, j, value); + value++; + } + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +static int cudbg_sge_ctxt_check_valid(u32 *buf, int type) +{ + int index, bit, bit_pos = 0; + + switch (type) { + case CTXT_EGRESS: + bit_pos = 176; + break; + case CTXT_INGRESS: + bit_pos = 141; + break; + case CTXT_FLM: + bit_pos = 89; + break; + } + index = bit_pos / 32; + bit = bit_pos % 32; + return buf[index] & (1U << bit); +} + +static int cudbg_get_ctxt_region_info(struct adapter *padap, + struct cudbg_region_info *ctx_info, + u8 *mem_type) +{ + struct cudbg_mem_desc mem_desc; + struct cudbg_meminfo meminfo; + u32 i, j, value, found; + u8 flq; + int rc; + + rc = cudbg_fill_meminfo(padap, &meminfo); + if (rc) + return rc; + + /* Get EGRESS and INGRESS context region size */ + for (i = CTXT_EGRESS; i <= CTXT_INGRESS; i++) { + found = 0; + memset(&mem_desc, 0, sizeof(struct cudbg_mem_desc)); + for (j = 0; j < ARRAY_SIZE(meminfo.avail); j++) { + rc = 
cudbg_get_mem_region(padap, &meminfo, j, + cudbg_region[i], + &mem_desc); + if (!rc) { + found = 1; + rc = cudbg_get_mem_relative(padap, &meminfo, j, + &mem_desc.base, + &mem_desc.limit); + if (rc) { + ctx_info[i].exist = false; + break; + } + ctx_info[i].exist = true; + ctx_info[i].start = mem_desc.base; + ctx_info[i].end = mem_desc.limit; + mem_type[i] = j; + break; + } + } + if (!found) + ctx_info[i].exist = false; + } + + /* Get FLM and CNM max qid. */ + value = t4_read_reg(padap, SGE_FLM_CFG_A); + + /* Get number of data freelist queues */ + flq = HDRSTARTFLQ_G(value); + ctx_info[CTXT_FLM].exist = true; + ctx_info[CTXT_FLM].end = (CUDBG_MAX_FL_QIDS >> flq) * SGE_CTXT_SIZE; + + /* The number of CONM contexts are same as number of freelist + * queues. + */ + ctx_info[CTXT_CNM].exist = true; + ctx_info[CTXT_CNM].end = ctx_info[CTXT_FLM].end; + + return 0; +} + +int cudbg_dump_context_size(struct adapter *padap) +{ + struct cudbg_region_info region_info[CTXT_CNM + 1] = { {0} }; + u8 mem_type[CTXT_INGRESS + 1] = { 0 }; + u32 i, size = 0; + int rc; + + /* Get max valid qid for each type of queue */ + rc = cudbg_get_ctxt_region_info(padap, region_info, mem_type); + if (rc) + return rc; + + for (i = 0; i < CTXT_CNM; i++) { + if (!region_info[i].exist) { + if (i == CTXT_EGRESS || i == CTXT_INGRESS) + size += CUDBG_LOWMEM_MAX_CTXT_QIDS * + SGE_CTXT_SIZE; + continue; + } + + size += (region_info[i].end - region_info[i].start + 1) / + SGE_CTXT_SIZE; + } + return size * sizeof(struct cudbg_ch_cntxt); +} + +static void cudbg_read_sge_ctxt(struct cudbg_init *pdbg_init, u32 cid, + enum ctxt_type ctype, u32 *data) +{ + struct adapter *padap = pdbg_init->adap; + int rc = -1; + + /* Under heavy traffic, the SGE Queue contexts registers will be + * frequently accessed by firmware. + * + * To avoid conflicts with firmware, always ask firmware to fetch + * the SGE Queue contexts via mailbox. On failure, fallback to + * accessing hardware registers directly. 
+ */ + if (is_fw_attached(pdbg_init)) + rc = t4_sge_ctxt_rd(padap, padap->mbox, cid, ctype, data); + if (rc) + t4_sge_ctxt_rd_bd(padap, cid, ctype, data); +} + +static void cudbg_get_sge_ctxt_fw(struct cudbg_init *pdbg_init, u32 max_qid, + u8 ctxt_type, + struct cudbg_ch_cntxt **out_buff) +{ + struct cudbg_ch_cntxt *buff = *out_buff; + int rc; + u32 j; + + for (j = 0; j < max_qid; j++) { + cudbg_read_sge_ctxt(pdbg_init, j, ctxt_type, buff->data); + rc = cudbg_sge_ctxt_check_valid(buff->data, ctxt_type); + if (!rc) + continue; + + buff->cntxt_type = ctxt_type; + buff->cntxt_id = j; + buff++; + if (ctxt_type == CTXT_FLM) { + cudbg_read_sge_ctxt(pdbg_init, j, CTXT_CNM, buff->data); + buff->cntxt_type = CTXT_CNM; + buff->cntxt_id = j; + buff++; + } + } + + *out_buff = buff; +} + +int cudbg_collect_dump_context(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct cudbg_region_info region_info[CTXT_CNM + 1] = { {0} }; + struct adapter *padap = pdbg_init->adap; + u32 j, size, max_ctx_size, max_ctx_qid; + u8 mem_type[CTXT_INGRESS + 1] = { 0 }; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_ch_cntxt *buff; + u64 *dst_off, *src_off; + u8 *ctx_buf; + u8 i, k; + int rc; + + /* Get max valid qid for each type of queue */ + rc = cudbg_get_ctxt_region_info(padap, region_info, mem_type); + if (rc) + return rc; + + rc = cudbg_dump_context_size(padap); + if (rc <= 0) + return CUDBG_STATUS_ENTITY_NOT_FOUND; + + size = rc; + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + /* Get buffer with enough space to read the biggest context + * region in memory. + */ + max_ctx_size = max(region_info[CTXT_EGRESS].end - + region_info[CTXT_EGRESS].start + 1, + region_info[CTXT_INGRESS].end - + region_info[CTXT_INGRESS].start + 1); + + ctx_buf = kvzalloc(max_ctx_size, GFP_KERNEL); + if (!ctx_buf) { + cudbg_put_buff(pdbg_init, &temp_buff); + return -ENOMEM; + } + + buff = (struct cudbg_ch_cntxt *)temp_buff.data; + + /* Collect EGRESS and INGRESS context data. + * In case of failures, fallback to collecting via FW or + * backdoor access. + */ + for (i = CTXT_EGRESS; i <= CTXT_INGRESS; i++) { + if (!region_info[i].exist) { + max_ctx_qid = CUDBG_LOWMEM_MAX_CTXT_QIDS; + cudbg_get_sge_ctxt_fw(pdbg_init, max_ctx_qid, i, + &buff); + continue; + } + + max_ctx_size = region_info[i].end - region_info[i].start + 1; + max_ctx_qid = max_ctx_size / SGE_CTXT_SIZE; + + /* If firmware is not attached/alive, use backdoor register + * access to collect dump. + */ + if (is_fw_attached(pdbg_init)) { + t4_sge_ctxt_flush(padap, padap->mbox, i); + + rc = t4_memory_rw(padap, MEMWIN_NIC, mem_type[i], + region_info[i].start, max_ctx_size, + (__be32 *)ctx_buf, 1); + } + + if (rc || !is_fw_attached(pdbg_init)) { + max_ctx_qid = CUDBG_LOWMEM_MAX_CTXT_QIDS; + cudbg_get_sge_ctxt_fw(pdbg_init, max_ctx_qid, i, + &buff); + continue; + } + + for (j = 0; j < max_ctx_qid; j++) { + src_off = (u64 *)(ctx_buf + j * SGE_CTXT_SIZE); + dst_off = (u64 *)buff->data; + + /* The data is stored in 64-bit cpu order. Convert it + * to big endian before parsing. 
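+ * (Each context occupies SGE_CTXT_SIZE bytes and is converted one u64 at a time by the loop below.)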
+ */ + for (k = 0; k < SGE_CTXT_SIZE / sizeof(u64); k++) + dst_off[k] = cpu_to_be64(src_off[k]); + + rc = cudbg_sge_ctxt_check_valid(buff->data, i); + if (!rc) + continue; + + buff->cntxt_type = i; + buff->cntxt_id = j; + buff++; + } + } + + kvfree(ctx_buf); + + /* Collect FREELIST and CONGESTION MANAGER contexts */ + max_ctx_size = region_info[CTXT_FLM].end - + region_info[CTXT_FLM].start + 1; + max_ctx_qid = max_ctx_size / SGE_CTXT_SIZE; + /* Since FLM and CONM are 1-to-1 mapped, the below function + * will fetch both FLM and CONM contexts. + */ + cudbg_get_sge_ctxt_fw(pdbg_init, max_ctx_qid, CTXT_FLM, &buff); + + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +static inline void cudbg_tcamxy2valmask(u64 x, u64 y, u8 *addr, u64 *mask) +{ + *mask = x | y; + y = (__force u64)cpu_to_be64(y); + memcpy(addr, (char *)&y + 2, ETH_ALEN); +} + +static void cudbg_mps_rpl_backdoor(struct adapter *padap, + struct fw_ldst_mps_rplc *mps_rplc) +{ + if (is_t5(padap->params.chip)) { + mps_rplc->rplc255_224 = htonl(t4_read_reg(padap, + MPS_VF_RPLCT_MAP3_A)); + mps_rplc->rplc223_192 = htonl(t4_read_reg(padap, + MPS_VF_RPLCT_MAP2_A)); + mps_rplc->rplc191_160 = htonl(t4_read_reg(padap, + MPS_VF_RPLCT_MAP1_A)); + mps_rplc->rplc159_128 = htonl(t4_read_reg(padap, + MPS_VF_RPLCT_MAP0_A)); + } else { + mps_rplc->rplc255_224 = htonl(t4_read_reg(padap, + MPS_VF_RPLCT_MAP7_A)); + mps_rplc->rplc223_192 = htonl(t4_read_reg(padap, + MPS_VF_RPLCT_MAP6_A)); + mps_rplc->rplc191_160 = htonl(t4_read_reg(padap, + MPS_VF_RPLCT_MAP5_A)); + mps_rplc->rplc159_128 = htonl(t4_read_reg(padap, + MPS_VF_RPLCT_MAP4_A)); + } + mps_rplc->rplc127_96 = htonl(t4_read_reg(padap, MPS_VF_RPLCT_MAP3_A)); + mps_rplc->rplc95_64 = htonl(t4_read_reg(padap, MPS_VF_RPLCT_MAP2_A)); + mps_rplc->rplc63_32 = htonl(t4_read_reg(padap, MPS_VF_RPLCT_MAP1_A)); + mps_rplc->rplc31_0 = htonl(t4_read_reg(padap, MPS_VF_RPLCT_MAP0_A)); +} + +static int cudbg_collect_tcam_index(struct cudbg_init *pdbg_init, + struct cudbg_mps_tcam *tcam, u32 idx) +{ + struct adapter *padap = pdbg_init->adap; + u64 tcamy, tcamx, val; + u32 ctl, data2; + int rc = 0; + + if (CHELSIO_CHIP_VERSION(padap->params.chip) >= CHELSIO_T6) { + /* CtlReqID - 1: use Host Driver Requester ID + * CtlCmdType - 0: Read, 1: Write + * CtlTcamSel - 0: TCAM0, 1: TCAM1 + * CtlXYBitSel- 0: Y bit, 1: X bit + */ + + /* Read tcamy */ + ctl = CTLREQID_V(1) | CTLCMDTYPE_V(0) | CTLXYBITSEL_V(0); + if (idx < 256) + ctl |= CTLTCAMINDEX_V(idx) | CTLTCAMSEL_V(0); + else + ctl |= CTLTCAMINDEX_V(idx - 256) | CTLTCAMSEL_V(1); + + t4_write_reg(padap, MPS_CLS_TCAM_DATA2_CTL_A, ctl); + val = t4_read_reg(padap, MPS_CLS_TCAM_RDATA1_REQ_ID1_A); + tcamy = DMACH_G(val) << 32; + tcamy |= t4_read_reg(padap, MPS_CLS_TCAM_RDATA0_REQ_ID1_A); + data2 = t4_read_reg(padap, MPS_CLS_TCAM_RDATA2_REQ_ID1_A); + tcam->lookup_type = DATALKPTYPE_G(data2); + + /* 0 - Outer header, 1 - Inner header + * [71:48] bit locations are overloaded for + * outer vs. inner lookup types. + */ + if (tcam->lookup_type && tcam->lookup_type != DATALKPTYPE_M) { + /* Inner header VNI */ + tcam->vniy = (data2 & DATAVIDH2_F) | DATAVIDH1_G(data2); + tcam->vniy = (tcam->vniy << 16) | VIDL_G(val); + tcam->dip_hit = data2 & DATADIPHIT_F; + } else { + tcam->vlan_vld = data2 & DATAVIDH2_F; + tcam->ivlan = VIDL_G(val); + } + + tcam->port_num = DATAPORTNUM_G(data2); + + /* Read tcamx. 
Change the control param */ + ctl |= CTLXYBITSEL_V(1); + t4_write_reg(padap, MPS_CLS_TCAM_DATA2_CTL_A, ctl); + val = t4_read_reg(padap, MPS_CLS_TCAM_RDATA1_REQ_ID1_A); + tcamx = DMACH_G(val) << 32; + tcamx |= t4_read_reg(padap, MPS_CLS_TCAM_RDATA0_REQ_ID1_A); + data2 = t4_read_reg(padap, MPS_CLS_TCAM_RDATA2_REQ_ID1_A); + if (tcam->lookup_type && tcam->lookup_type != DATALKPTYPE_M) { + /* Inner header VNI mask */ + tcam->vnix = (data2 & DATAVIDH2_F) | DATAVIDH1_G(data2); + tcam->vnix = (tcam->vnix << 16) | VIDL_G(val); + } + } else { + tcamy = t4_read_reg64(padap, MPS_CLS_TCAM_Y_L(idx)); + tcamx = t4_read_reg64(padap, MPS_CLS_TCAM_X_L(idx)); + } + + /* If no entry, return */ + if (tcamx & tcamy) + return rc; + + tcam->cls_lo = t4_read_reg(padap, MPS_CLS_SRAM_L(idx)); + tcam->cls_hi = t4_read_reg(padap, MPS_CLS_SRAM_H(idx)); + + if (is_t5(padap->params.chip)) + tcam->repli = (tcam->cls_lo & REPLICATE_F); + else if (is_t6(padap->params.chip)) + tcam->repli = (tcam->cls_lo & T6_REPLICATE_F); + + if (tcam->repli) { + struct fw_ldst_cmd ldst_cmd; + struct fw_ldst_mps_rplc mps_rplc; + + memset(&ldst_cmd, 0, sizeof(ldst_cmd)); + ldst_cmd.op_to_addrspace = + htonl(FW_CMD_OP_V(FW_LDST_CMD) | + FW_CMD_REQUEST_F | FW_CMD_READ_F | + FW_LDST_CMD_ADDRSPACE_V(FW_LDST_ADDRSPC_MPS)); + ldst_cmd.cycles_to_len16 = htonl(FW_LEN16(ldst_cmd)); + ldst_cmd.u.mps.rplc.fid_idx = + htons(FW_LDST_CMD_FID_V(FW_LDST_MPS_RPLC) | + FW_LDST_CMD_IDX_V(idx)); + + /* If firmware is not attached/alive, use backdoor register + * access to collect dump. + */ + if (is_fw_attached(pdbg_init)) + rc = t4_wr_mbox(padap, padap->mbox, &ldst_cmd, + sizeof(ldst_cmd), &ldst_cmd); + + if (rc || !is_fw_attached(pdbg_init)) { + cudbg_mps_rpl_backdoor(padap, &mps_rplc); + /* Ignore error since we collected directly from + * reading registers. 
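+ * (cudbg_mps_rpl_backdoor() above read the replication map straight from the MPS_VF_RPLCT_MAP registers.)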
+ */ + rc = 0; + } else { + mps_rplc = ldst_cmd.u.mps.rplc; + } + + tcam->rplc[0] = ntohl(mps_rplc.rplc31_0); + tcam->rplc[1] = ntohl(mps_rplc.rplc63_32); + tcam->rplc[2] = ntohl(mps_rplc.rplc95_64); + tcam->rplc[3] = ntohl(mps_rplc.rplc127_96); + if (padap->params.arch.mps_rplc_size > CUDBG_MAX_RPLC_SIZE) { + tcam->rplc[4] = ntohl(mps_rplc.rplc159_128); + tcam->rplc[5] = ntohl(mps_rplc.rplc191_160); + tcam->rplc[6] = ntohl(mps_rplc.rplc223_192); + tcam->rplc[7] = ntohl(mps_rplc.rplc255_224); + } + } + cudbg_tcamxy2valmask(tcamx, tcamy, tcam->addr, &tcam->mask); + tcam->idx = idx; + tcam->rplc_size = padap->params.arch.mps_rplc_size; + return rc; +} + +int cudbg_collect_mps_tcam(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + u32 size = 0, i, n, total_size = 0; + struct cudbg_mps_tcam *tcam; + int rc; + + n = padap->params.arch.mps_tcam_size; + size = sizeof(struct cudbg_mps_tcam) * n; + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + tcam = (struct cudbg_mps_tcam *)temp_buff.data; + for (i = 0; i < n; i++) { + rc = cudbg_collect_tcam_index(pdbg_init, tcam, i); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + total_size += sizeof(struct cudbg_mps_tcam); + tcam++; + } + + if (!total_size) { + rc = CUDBG_SYSTEM_ERROR; + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_vpd_data(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + char vpd_str[CUDBG_VPD_VER_LEN + 1]; + u32 scfg_vers, vpd_vers, fw_vers; + struct cudbg_vpd_data *vpd_data; + struct vpd_params vpd = { 0 }; + int rc, ret; + + rc = t4_get_raw_vpd_params(padap, &vpd); + if (rc) + return rc; + + rc = t4_get_fw_version(padap, &fw_vers); + if (rc) + return rc; + + /* Serial Configuration Version is located beyond the PF's vpd size. + * Temporarily give access to entire EEPROM to get it. 
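+ * The window is restored to CUDBG_VPD_PF_SIZE by the second pci_set_vpd_size() call below once the read completes.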
+ */ + rc = pci_set_vpd_size(padap->pdev, EEPROMVSIZE); + if (rc < 0) + return rc; + + ret = cudbg_read_vpd_reg(padap, CUDBG_SCFG_VER_ADDR, CUDBG_SCFG_VER_LEN, + &scfg_vers); + + /* Restore back to original PF's vpd size */ + rc = pci_set_vpd_size(padap->pdev, CUDBG_VPD_PF_SIZE); + if (rc < 0) + return rc; + + if (ret) + return ret; + + rc = cudbg_read_vpd_reg(padap, CUDBG_VPD_VER_ADDR, CUDBG_VPD_VER_LEN, + vpd_str); + if (rc) + return rc; + + vpd_str[CUDBG_VPD_VER_LEN] = '\0'; + rc = kstrtouint(vpd_str, 0, &vpd_vers); + if (rc) + return rc; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(struct cudbg_vpd_data), + &temp_buff); + if (rc) + return rc; + + vpd_data = (struct cudbg_vpd_data *)temp_buff.data; + memcpy(vpd_data->sn, vpd.sn, SERNUM_LEN + 1); + memcpy(vpd_data->bn, vpd.pn, PN_LEN + 1); + memcpy(vpd_data->na, vpd.na, MACADDR_LEN + 1); + memcpy(vpd_data->mn, vpd.id, ID_LEN + 1); + vpd_data->scfg_vers = scfg_vers; + vpd_data->vpd_vers = vpd_vers; + vpd_data->fw_major = FW_HDR_FW_VER_MAJOR_G(fw_vers); + vpd_data->fw_minor = FW_HDR_FW_VER_MINOR_G(fw_vers); + vpd_data->fw_micro = FW_HDR_FW_VER_MICRO_G(fw_vers); + vpd_data->fw_build = FW_HDR_FW_VER_BUILD_G(fw_vers); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +static int cudbg_read_tid(struct cudbg_init *pdbg_init, u32 tid, + struct cudbg_tid_data *tid_data) +{ + struct adapter *padap = pdbg_init->adap; + int i, cmd_retry = 8; + u32 val; + + /* Fill REQ_DATA regs with 0's */ + for (i = 0; i < NUM_LE_DB_DBGI_REQ_DATA_INSTANCES; i++) + t4_write_reg(padap, LE_DB_DBGI_REQ_DATA_A + (i << 2), 0); + + /* Write DBIG command */ + val = DBGICMD_V(4) | DBGITID_V(tid); + t4_write_reg(padap, LE_DB_DBGI_REQ_TCAM_CMD_A, val); + tid_data->dbig_cmd = val; + + val = DBGICMDSTRT_F | DBGICMDMODE_V(1); /* LE mode */ + t4_write_reg(padap, LE_DB_DBGI_CONFIG_A, val); + tid_data->dbig_conf = val; + + /* Poll the DBGICMDBUSY bit */ + val = 1; + while (val) { + val = t4_read_reg(padap, LE_DB_DBGI_CONFIG_A); + val = val & DBGICMDBUSY_F; + cmd_retry--; + if (!cmd_retry) + return CUDBG_SYSTEM_ERROR; + } + + /* Check RESP status */ + val = t4_read_reg(padap, LE_DB_DBGI_RSP_STATUS_A); + tid_data->dbig_rsp_stat = val; + if (!(val & 1)) + return CUDBG_SYSTEM_ERROR; + + /* Read RESP data */ + for (i = 0; i < NUM_LE_DB_DBGI_RSP_DATA_INSTANCES; i++) + tid_data->data[i] = t4_read_reg(padap, + LE_DB_DBGI_RSP_DATA_A + + (i << 2)); + tid_data->tid = tid; + return 0; +} + +static int cudbg_get_le_type(u32 tid, struct cudbg_tcam tcam_region) +{ + int type = LE_ET_UNKNOWN; + + if (tid < tcam_region.server_start) + type = LE_ET_TCAM_CON; + else if (tid < tcam_region.filter_start) + type = LE_ET_TCAM_SERVER; + else if (tid < tcam_region.clip_start) + type = LE_ET_TCAM_FILTER; + else if (tid < tcam_region.routing_start) + type = LE_ET_TCAM_CLIP; + else if (tid < tcam_region.tid_hash_base) + type = LE_ET_TCAM_ROUTING; + else if (tid < tcam_region.max_tid) + type = LE_ET_HASH_CON; + else + type = LE_ET_INVALID_TID; + + return type; +} + +static int cudbg_is_ipv6_entry(struct cudbg_tid_data *tid_data, + struct cudbg_tcam tcam_region) +{ + int ipv6 = 0; + int le_type; + + le_type = cudbg_get_le_type(tid_data->tid, tcam_region); + if (tid_data->tid & 1) + return 0; + + if (le_type == LE_ET_HASH_CON) { + ipv6 = tid_data->data[16] & 0x8000; + } else if (le_type == LE_ET_TCAM_CON) { + ipv6 = tid_data->data[16] & 0x8000; + if (ipv6) + ipv6 = tid_data->data[9] == 0x00C00000; + } else { + ipv6 = 0; + } + return ipv6; +} + +void cudbg_fill_le_tcam_info(struct 
adapter *padap, + struct cudbg_tcam *tcam_region) +{ + u32 value; + + /* Get the LE regions */ + value = t4_read_reg(padap, LE_DB_TID_HASHBASE_A); /* hash base index */ + tcam_region->tid_hash_base = value; + + /* Get routing table index */ + value = t4_read_reg(padap, LE_DB_ROUTING_TABLE_INDEX_A); + tcam_region->routing_start = value; + + /*Get clip table index */ + value = t4_read_reg(padap, LE_DB_CLIP_TABLE_INDEX_A); + tcam_region->clip_start = value; + + /* Get filter table index */ + value = t4_read_reg(padap, LE_DB_FILTER_TABLE_INDEX_A); + tcam_region->filter_start = value; + + /* Get server table index */ + value = t4_read_reg(padap, LE_DB_SERVER_INDEX_A); + tcam_region->server_start = value; + + /* Check whether hash is enabled and calculate the max tids */ + value = t4_read_reg(padap, LE_DB_CONFIG_A); + if ((value >> HASHEN_S) & 1) { + value = t4_read_reg(padap, LE_DB_HASH_CONFIG_A); + if (CHELSIO_CHIP_VERSION(padap->params.chip) > CHELSIO_T5) { + tcam_region->max_tid = (value & 0xFFFFF) + + tcam_region->tid_hash_base; + } else { + value = HASHTIDSIZE_G(value); + value = 1 << value; + tcam_region->max_tid = value + + tcam_region->tid_hash_base; + } + } else { /* hash not enabled */ + tcam_region->max_tid = CUDBG_MAX_TCAM_TID; + } +} + +int cudbg_collect_le_tcam(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_tcam tcam_region = { 0 }; + struct cudbg_tid_data *tid_data; + u32 bytes = 0; + int rc, size; + u32 i; + + cudbg_fill_le_tcam_info(padap, &tcam_region); + + size = sizeof(struct cudbg_tid_data) * tcam_region.max_tid; + size += sizeof(struct cudbg_tcam); + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + memcpy(temp_buff.data, &tcam_region, sizeof(struct cudbg_tcam)); + bytes = sizeof(struct cudbg_tcam); + tid_data = (struct cudbg_tid_data *)(temp_buff.data + bytes); + /* read all tid */ + for (i = 0; i < tcam_region.max_tid; ) { + rc = cudbg_read_tid(pdbg_init, i, tid_data); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + + /* ipv6 takes two tids */ + cudbg_is_ipv6_entry(tid_data, tcam_region) ? 
i += 2 : i++; + + tid_data++; + bytes += sizeof(struct cudbg_tid_data); + } + + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_cctrl(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + u32 size; + int rc; + + size = sizeof(u16) * NMTUS * NCCTRL_WIN; + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + t4_read_cong_tbl(padap, (void *)temp_buff.data); + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_ma_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *ma_indr; + int i, rc, n; + u32 size, j; + + if (CHELSIO_CHIP_VERSION(padap->params.chip) < CHELSIO_T6) + return CUDBG_STATUS_ENTITY_NOT_FOUND; + + n = sizeof(t6_ma_ireg_array) / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n * 2; + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + ma_indr = (struct ireg_buf *)temp_buff.data; + for (i = 0; i < n; i++) { + struct ireg_field *ma_fli = &ma_indr->tp_pio; + u32 *buff = ma_indr->outbuf; + + ma_fli->ireg_addr = t6_ma_ireg_array[i][0]; + ma_fli->ireg_data = t6_ma_ireg_array[i][1]; + ma_fli->ireg_local_offset = t6_ma_ireg_array[i][2]; + ma_fli->ireg_offset_range = t6_ma_ireg_array[i][3]; + t4_read_indirect(padap, ma_fli->ireg_addr, ma_fli->ireg_data, + buff, ma_fli->ireg_offset_range, + ma_fli->ireg_local_offset); + ma_indr++; + } + + n = sizeof(t6_ma_ireg_array2) / (IREG_NUM_ELEM * sizeof(u32)); + for (i = 0; i < n; i++) { + struct ireg_field *ma_fli = &ma_indr->tp_pio; + u32 *buff = ma_indr->outbuf; + + ma_fli->ireg_addr = t6_ma_ireg_array2[i][0]; + ma_fli->ireg_data = t6_ma_ireg_array2[i][1]; + ma_fli->ireg_local_offset = t6_ma_ireg_array2[i][2]; + for (j = 0; j < t6_ma_ireg_array2[i][3]; j++) { + t4_read_indirect(padap, ma_fli->ireg_addr, + ma_fli->ireg_data, buff, 1, + ma_fli->ireg_local_offset); + buff++; + ma_fli->ireg_local_offset += 0x20; + } + ma_indr++; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_ulptx_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_ulptx_la *ulptx_la_buff; + u32 i, j; + int rc; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(struct cudbg_ulptx_la), + &temp_buff); + if (rc) + return rc; + + ulptx_la_buff = (struct cudbg_ulptx_la *)temp_buff.data; + for (i = 0; i < CUDBG_NUM_ULPTX; i++) { + ulptx_la_buff->rdptr[i] = t4_read_reg(padap, + ULP_TX_LA_RDPTR_0_A + + 0x10 * i); + ulptx_la_buff->wrptr[i] = t4_read_reg(padap, + ULP_TX_LA_WRPTR_0_A + + 0x10 * i); + ulptx_la_buff->rddata[i] = t4_read_reg(padap, + ULP_TX_LA_RDDATA_0_A + + 0x10 * i); + for (j = 0; j < CUDBG_NUM_ULPTX_READ; j++) + ulptx_la_buff->rd_data[i][j] = + t4_read_reg(padap, + ULP_TX_LA_RDDATA_0_A + 0x10 * i); + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_up_cim_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + u32 local_offset, local_range; 
+ struct ireg_buf *up_cim; + u32 size, j, iter; + u32 instance = 0; + int i, rc, n; + + if (is_t5(padap->params.chip)) + n = sizeof(t5_up_cim_reg_array) / + ((IREG_NUM_ELEM + 1) * sizeof(u32)); + else if (is_t6(padap->params.chip)) + n = sizeof(t6_up_cim_reg_array) / + ((IREG_NUM_ELEM + 1) * sizeof(u32)); + else + return CUDBG_STATUS_NOT_IMPLEMENTED; + + size = sizeof(struct ireg_buf) * n; + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + up_cim = (struct ireg_buf *)temp_buff.data; + for (i = 0; i < n; i++) { + struct ireg_field *up_cim_reg = &up_cim->tp_pio; + u32 *buff = up_cim->outbuf; + + if (is_t5(padap->params.chip)) { + up_cim_reg->ireg_addr = t5_up_cim_reg_array[i][0]; + up_cim_reg->ireg_data = t5_up_cim_reg_array[i][1]; + up_cim_reg->ireg_local_offset = + t5_up_cim_reg_array[i][2]; + up_cim_reg->ireg_offset_range = + t5_up_cim_reg_array[i][3]; + instance = t5_up_cim_reg_array[i][4]; + } else if (is_t6(padap->params.chip)) { + up_cim_reg->ireg_addr = t6_up_cim_reg_array[i][0]; + up_cim_reg->ireg_data = t6_up_cim_reg_array[i][1]; + up_cim_reg->ireg_local_offset = + t6_up_cim_reg_array[i][2]; + up_cim_reg->ireg_offset_range = + t6_up_cim_reg_array[i][3]; + instance = t6_up_cim_reg_array[i][4]; + } + + switch (instance) { + case NUM_CIM_CTL_TSCH_CHANNEL_INSTANCES: + iter = up_cim_reg->ireg_offset_range; + local_offset = 0x120; + local_range = 1; + break; + case NUM_CIM_CTL_TSCH_CHANNEL_TSCH_CLASS_INSTANCES: + iter = up_cim_reg->ireg_offset_range; + local_offset = 0x10; + local_range = 1; + break; + default: + iter = 1; + local_offset = 0; + local_range = up_cim_reg->ireg_offset_range; + break; + } + + for (j = 0; j < iter; j++, buff++) { + rc = t4_cim_read(padap, + up_cim_reg->ireg_local_offset + + (j * local_offset), local_range, buff); + if (rc) { + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + } + up_cim++; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_pbt_tables(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct cudbg_pbt_tables *pbt; + int i, rc; + u32 addr; + + rc = cudbg_get_buff(pdbg_init, dbg_buff, + sizeof(struct cudbg_pbt_tables), + &temp_buff); + if (rc) + return rc; + + pbt = (struct cudbg_pbt_tables *)temp_buff.data; + /* PBT dynamic entries */ + addr = CUDBG_CHAC_PBT_ADDR; + for (i = 0; i < CUDBG_PBT_DYNAMIC_ENTRIES; i++) { + rc = t4_cim_read(padap, addr + (i * 4), 1, + &pbt->pbt_dynamic[i]); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + } + + /* PBT static entries */ + /* static entries start when bit 6 is set */ + addr = CUDBG_CHAC_PBT_ADDR + (1 << 6); + for (i = 0; i < CUDBG_PBT_STATIC_ENTRIES; i++) { + rc = t4_cim_read(padap, addr + (i * 4), 1, + &pbt->pbt_static[i]); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + } + + /* LRF entries */ + addr = CUDBG_CHAC_PBT_LRF; + for (i = 0; i < CUDBG_LRF_ENTRIES; i++) { + rc = t4_cim_read(padap, addr + (i * 4), 1, + &pbt->lrf_table[i]); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, &temp_buff); + return rc; + } + } + + /* PBT data entries */ + addr = CUDBG_CHAC_PBT_DATA; + for (i = 0; i < CUDBG_PBT_DATA_ENTRIES; i++) { + rc = t4_cim_read(padap, addr + (i * 4), 1, + &pbt->pbt_data[i]); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(pdbg_init, 
&temp_buff); + return rc; + } + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_mbox_log(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_mbox_log *mboxlog = NULL; + struct cudbg_buffer temp_buff = { 0 }; + struct mbox_cmd_log *log = NULL; + struct mbox_cmd *entry; + unsigned int entry_idx; + u16 mbox_cmds; + int i, k, rc; + u64 flit; + u32 size; + + log = padap->mbox_log; + mbox_cmds = padap->mbox_log->size; + size = sizeof(struct cudbg_mbox_log) * mbox_cmds; + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + mboxlog = (struct cudbg_mbox_log *)temp_buff.data; + for (k = 0; k < mbox_cmds; k++) { + entry_idx = log->cursor + k; + if (entry_idx >= log->size) + entry_idx -= log->size; + + entry = mbox_cmd_log_entry(log, entry_idx); + /* skip over unused entries */ + if (entry->timestamp == 0) + continue; + + memcpy(&mboxlog->entry, entry, sizeof(struct mbox_cmd)); + for (i = 0; i < MBOX_LEN / 8; i++) { + flit = entry->cmd[i]; + mboxlog->hi[i] = (u32)(flit >> 32); + mboxlog->lo[i] = (u32)flit; + } + mboxlog++; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} + +int cudbg_collect_hma_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *hma_indr; + int i, rc, n; + u32 size; + + if (CHELSIO_CHIP_VERSION(padap->params.chip) < CHELSIO_T6) + return CUDBG_STATUS_ENTITY_NOT_FOUND; + + n = sizeof(t6_hma_ireg_array) / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n; + rc = cudbg_get_buff(pdbg_init, dbg_buff, size, &temp_buff); + if (rc) + return rc; + + hma_indr = (struct ireg_buf *)temp_buff.data; + for (i = 0; i < n; i++) { + struct ireg_field *hma_fli = &hma_indr->tp_pio; + u32 *buff = hma_indr->outbuf; + + hma_fli->ireg_addr = t6_hma_ireg_array[i][0]; + hma_fli->ireg_data = t6_hma_ireg_array[i][1]; + hma_fli->ireg_local_offset = t6_hma_ireg_array[i][2]; + hma_fli->ireg_offset_range = t6_hma_ireg_array[i][3]; + t4_read_indirect(padap, hma_fli->ireg_addr, hma_fli->ireg_data, + buff, hma_fli->ireg_offset_range, + hma_fli->ireg_local_offset); + hma_indr++; + } + return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.h new file mode 100644 index 0000000..eebefe7 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.h @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + */ + +#ifndef __CUDBG_LIB_H__ +#define __CUDBG_LIB_H__ + +int cudbg_collect_reg_dump(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_fw_devlog(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ma_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_qcfg(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_tp0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_tp1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_ulp(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_sge0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_sge1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_ncsi(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_ulp0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_ulp1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_ulp2(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_ulp3(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_sge(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_ncsi(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_edc0_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_edc1_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_mc0_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_mc1_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_rss(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_rss_vf_config(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_tp_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_path_mtu(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_pm_stats(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_hw_sched(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_sge_indirect(struct cudbg_init 
*pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_ulprx_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_tp_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_pif_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_clk_info(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_obq_sge_rx_q0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_obq_sge_rx_q1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_pcie_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_pm_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_tid(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_pcie_config(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_dump_context(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_mps_tcam(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_vpd_data(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_le_tcam(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cctrl(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_ma_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_ulptx_la(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_up_cim_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_pbt_tables(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_mbox_log(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_hma_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_hma_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); + +struct cudbg_entity_hdr *cudbg_get_entity_hdr(void *outbuf, int i); +void cudbg_align_debug_buffer(struct cudbg_buffer *dbg_buff, + struct cudbg_entity_hdr *entity_hdr); +u32 cudbg_cim_obq_size(struct adapter *padap, int qid); +int cudbg_dump_context_size(struct adapter *padap); + +int cudbg_fill_meminfo(struct adapter *padap, + struct cudbg_meminfo *meminfo_buff); +void cudbg_fill_le_tcam_info(struct adapter *padap, + struct cudbg_tcam *tcam_region); +#endif /* __CUDBG_LIB_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib_common.h 
b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib_common.h new file mode 100644 index 0000000..8150ea8 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib_common.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#ifndef __CUDBG_LIB_COMMON_H__ +#define __CUDBG_LIB_COMMON_H__ + +#define CUDBG_SIGNATURE 67856866 /* CUDB in ascii */ + +enum cudbg_dump_type { + CUDBG_DUMP_TYPE_MINI = 1, +}; + +enum cudbg_compression_type { + CUDBG_COMPRESSION_NONE = 1, + CUDBG_COMPRESSION_ZLIB, +}; + +struct cudbg_hdr { + u32 signature; + u32 hdr_len; + u16 major_ver; + u16 minor_ver; + u32 data_len; + u32 hdr_flags; + u16 max_entities; + u8 chip_ver; + u8 dump_type:3; + u8 reserved1:1; + u8 compress_type:4; + u32 reserved[8]; +}; + +struct cudbg_entity_hdr { + u32 entity_type; + u32 start_offset; + u32 size; + int hdr_flags; + u32 sys_warn; + u32 sys_err; + u8 num_pad; + u8 flag; /* bit 0 is used to indicate ext data */ + u8 reserved1[2]; + u32 next_ext_offset; /* pointer to next extended entity meta data */ + u32 reserved[5]; +}; + +struct cudbg_ver_hdr { + u32 signature; + u16 revision; + u16 size; +}; + +struct cudbg_buffer { + u32 size; + u32 offset; + char *data; +}; + +struct cudbg_error { + int sys_err; + int sys_warn; + int app_err; +}; + +#define CDUMP_MAX_COMP_BUF_SIZE ((64 * 1024) - 1) +#define CUDBG_CHUNK_SIZE ((CDUMP_MAX_COMP_BUF_SIZE / 1024) * 1024) + +int cudbg_get_buff(struct cudbg_init *pdbg_init, + struct cudbg_buffer *pdbg_buff, u32 size, + struct cudbg_buffer *pin_buff); +void cudbg_put_buff(struct cudbg_init *pdbg_init, + struct cudbg_buffer *pin_buff); +void cudbg_update_buff(struct cudbg_buffer *pin_buff, + struct cudbg_buffer *pout_buff); +#endif /* __CUDBG_LIB_COMMON_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_zlib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_zlib.c new file mode 100644 index 0000000..25cc06d --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_zlib.c @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2018 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + */ + +#include <linux/zlib.h> + +#include "cxgb4.h" +#include "cudbg_if.h" +#include "cudbg_lib_common.h" +#include "cudbg_zlib.h" + +static int cudbg_get_compress_hdr(struct cudbg_buffer *pdbg_buff, + struct cudbg_buffer *pin_buff) +{ + if (pdbg_buff->offset + sizeof(struct cudbg_compress_hdr) > + pdbg_buff->size) + return CUDBG_STATUS_NO_MEM; + + pin_buff->data = (char *)pdbg_buff->data + pdbg_buff->offset; + pin_buff->offset = 0; + pin_buff->size = sizeof(struct cudbg_compress_hdr); + pdbg_buff->offset += sizeof(struct cudbg_compress_hdr); + return 0; +} + +int cudbg_compress_buff(struct cudbg_init *pdbg_init, + struct cudbg_buffer *pin_buff, + struct cudbg_buffer *pout_buff) +{ + struct cudbg_buffer temp_buff = { 0 }; + struct z_stream_s compress_stream; + struct cudbg_compress_hdr *c_hdr; + int rc; + + /* Write compression header to output buffer before compression */ + rc = cudbg_get_compress_hdr(pout_buff, &temp_buff); + if (rc) + return rc; + + c_hdr = (struct cudbg_compress_hdr *)temp_buff.data; + c_hdr->compress_id = CUDBG_ZLIB_COMPRESS_ID; + + memset(&compress_stream, 0, sizeof(struct z_stream_s)); + compress_stream.workspace = pdbg_init->workspace; + rc = zlib_deflateInit2(&compress_stream, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, CUDBG_ZLIB_WIN_BITS, + CUDBG_ZLIB_MEM_LVL, Z_DEFAULT_STRATEGY); + if (rc != Z_OK) + return CUDBG_SYSTEM_ERROR; + + compress_stream.next_in = pin_buff->data; + compress_stream.avail_in = pin_buff->size; + compress_stream.next_out = pout_buff->data + pout_buff->offset; + compress_stream.avail_out = pout_buff->size - pout_buff->offset; + + rc = zlib_deflate(&compress_stream, Z_FINISH); + if (rc != Z_STREAM_END) + return CUDBG_SYSTEM_ERROR; + + rc = zlib_deflateEnd(&compress_stream); + if (rc != Z_OK) + return CUDBG_SYSTEM_ERROR; + + c_hdr->compress_size = compress_stream.total_out; + c_hdr->decompress_size = pin_buff->size; + pout_buff->offset += compress_stream.total_out; + + return 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_zlib.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_zlib.h new file mode 100644 index 0000000..60d2380 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_zlib.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2018 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING".
+ * + */ + +#ifndef __CUDBG_ZLIB_H__ +#define __CUDBG_ZLIB_H__ + +#include <linux/zlib.h> + +#define CUDBG_ZLIB_COMPRESS_ID 17 +#define CUDBG_ZLIB_WIN_BITS 12 +#define CUDBG_ZLIB_MEM_LVL 4 + +struct cudbg_compress_hdr { + u32 compress_id; + u64 decompress_size; + u64 compress_size; + u64 rsvd[32]; +}; + +static inline int cudbg_get_workspace_size(void) +{ + return zlib_deflate_workspacesize(CUDBG_ZLIB_WIN_BITS, + CUDBG_ZLIB_MEM_LVL); +} + +int cudbg_compress_buff(struct cudbg_init *pdbg_init, + struct cudbg_buffer *pin_buff, + struct cudbg_buffer *pout_buff); +#endif /* __CUDBG_ZLIB_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h new file mode 100644 index 0000000..688f954 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -0,0 +1,1790 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_H__ +#define __CXGB4_H__ + +#include "t4_hw.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "t4_chip_type.h" +#include "cxgb4_uld.h" + +#define CH_WARN(adap, fmt, ...) dev_warn(adap->pdev_dev, fmt, ## __VA_ARGS__) +extern struct list_head adapter_list; +extern struct mutex uld_mutex; + +/* Suspend an Ethernet Tx queue with fewer available descriptors than this. + * This is the same as calc_tx_descs() for a TSO packet with + * nr_frags == MAX_SKB_FRAGS.
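+ * Roughly: a Tx descriptor holds 8 flits, each pair of fragments needs 3 flits of gather list, and the leading 1 covers the WR/CPL headers.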
+ */ +#define ETHTXQ_STOP_THRES \ + (1 + DIV_ROUND_UP((3 * MAX_SKB_FRAGS) / 2 + (MAX_SKB_FRAGS & 1), 8)) + +enum { + MAX_NPORTS = 4, /* max # of ports */ + SERNUM_LEN = 24, /* Serial # length */ + EC_LEN = 16, /* E/C length */ + ID_LEN = 16, /* ID length */ + PN_LEN = 16, /* Part Number length */ + MACADDR_LEN = 12, /* MAC Address length */ +}; + +enum { + T4_REGMAP_SIZE = (160 * 1024), + T5_REGMAP_SIZE = (332 * 1024), +}; + +enum { + MEM_EDC0, + MEM_EDC1, + MEM_MC, + MEM_MC0 = MEM_MC, + MEM_MC1, + MEM_HMA, +}; + +enum { + MEMWIN0_APERTURE = 2048, + MEMWIN0_BASE = 0x1b800, + MEMWIN1_APERTURE = 32768, + MEMWIN1_BASE = 0x28000, + MEMWIN1_BASE_T5 = 0x52000, + MEMWIN2_APERTURE = 65536, + MEMWIN2_BASE = 0x30000, + MEMWIN2_APERTURE_T5 = 131072, + MEMWIN2_BASE_T5 = 0x60000, +}; + +enum dev_master { + MASTER_CANT, + MASTER_MAY, + MASTER_MUST +}; + +enum dev_state { + DEV_STATE_UNINIT, + DEV_STATE_INIT, + DEV_STATE_ERR +}; + +enum cc_pause { + PAUSE_RX = 1 << 0, + PAUSE_TX = 1 << 1, + PAUSE_AUTONEG = 1 << 2 +}; + +enum cc_fec { + FEC_AUTO = 1 << 0, /* IEEE 802.3 "automatic" */ + FEC_RS = 1 << 1, /* Reed-Solomon */ + FEC_BASER_RS = 1 << 2 /* BaseR/Reed-Solomon */ +}; + +struct port_stats { + u64 tx_octets; /* total # of octets in good frames */ + u64 tx_frames; /* all good frames */ + u64 tx_bcast_frames; /* all broadcast frames */ + u64 tx_mcast_frames; /* all multicast frames */ + u64 tx_ucast_frames; /* all unicast frames */ + u64 tx_error_frames; /* all error frames */ + + u64 tx_frames_64; /* # of Tx frames in a particular range */ + u64 tx_frames_65_127; + u64 tx_frames_128_255; + u64 tx_frames_256_511; + u64 tx_frames_512_1023; + u64 tx_frames_1024_1518; + u64 tx_frames_1519_max; + + u64 tx_drop; /* # of dropped Tx frames */ + u64 tx_pause; /* # of transmitted pause frames */ + u64 tx_ppp0; /* # of transmitted PPP prio 0 frames */ + u64 tx_ppp1; /* # of transmitted PPP prio 1 frames */ + u64 tx_ppp2; /* # of transmitted PPP prio 2 frames */ + u64 tx_ppp3; /* # of transmitted PPP prio 3 frames */ + u64 tx_ppp4; /* # of transmitted PPP prio 4 frames */ + u64 tx_ppp5; /* # of transmitted PPP prio 5 frames */ + u64 tx_ppp6; /* # of transmitted PPP prio 6 frames */ + u64 tx_ppp7; /* # of transmitted PPP prio 7 frames */ + + u64 rx_octets; /* total # of octets in good frames */ + u64 rx_frames; /* all good frames */ + u64 rx_bcast_frames; /* all broadcast frames */ + u64 rx_mcast_frames; /* all multicast frames */ + u64 rx_ucast_frames; /* all unicast frames */ + u64 rx_too_long; /* # of frames exceeding MTU */ + u64 rx_jabber; /* # of jabber frames */ + u64 rx_fcs_err; /* # of received frames with bad FCS */ + u64 rx_len_err; /* # of received frames with length error */ + u64 rx_symbol_err; /* symbol errors */ + u64 rx_runt; /* # of short frames */ + + u64 rx_frames_64; /* # of Rx frames in a particular range */ + u64 rx_frames_65_127; + u64 rx_frames_128_255; + u64 rx_frames_256_511; + u64 rx_frames_512_1023; + u64 rx_frames_1024_1518; + u64 rx_frames_1519_max; + + u64 rx_pause; /* # of received pause frames */ + u64 rx_ppp0; /* # of received PPP prio 0 frames */ + u64 rx_ppp1; /* # of received PPP prio 1 frames */ + u64 rx_ppp2; /* # of received PPP prio 2 frames */ + u64 rx_ppp3; /* # of received PPP prio 3 frames */ + u64 rx_ppp4; /* # of received PPP prio 4 frames */ + u64 rx_ppp5; /* # of received PPP prio 5 frames */ + u64 rx_ppp6; /* # of received PPP prio 6 frames */ + u64 rx_ppp7; /* # of received PPP prio 7 frames */ + + u64 rx_ovflow0; /* drops due to buffer-group 0 overflows */ + u64 
rx_ovflow1; /* drops due to buffer-group 1 overflows */ + u64 rx_ovflow2; /* drops due to buffer-group 2 overflows */ + u64 rx_ovflow3; /* drops due to buffer-group 3 overflows */ + u64 rx_trunc0; /* buffer-group 0 truncated packets */ + u64 rx_trunc1; /* buffer-group 1 truncated packets */ + u64 rx_trunc2; /* buffer-group 2 truncated packets */ + u64 rx_trunc3; /* buffer-group 3 truncated packets */ +}; + +struct lb_port_stats { + u64 octets; + u64 frames; + u64 bcast_frames; + u64 mcast_frames; + u64 ucast_frames; + u64 error_frames; + + u64 frames_64; + u64 frames_65_127; + u64 frames_128_255; + u64 frames_256_511; + u64 frames_512_1023; + u64 frames_1024_1518; + u64 frames_1519_max; + + u64 drop; + + u64 ovflow0; + u64 ovflow1; + u64 ovflow2; + u64 ovflow3; + u64 trunc0; + u64 trunc1; + u64 trunc2; + u64 trunc3; +}; + +struct tp_tcp_stats { + u32 tcp_out_rsts; + u64 tcp_in_segs; + u64 tcp_out_segs; + u64 tcp_retrans_segs; +}; + +struct tp_usm_stats { + u32 frames; + u32 drops; + u64 octets; +}; + +struct tp_fcoe_stats { + u32 frames_ddp; + u32 frames_drop; + u64 octets_ddp; +}; + +struct tp_err_stats { + u32 mac_in_errs[4]; + u32 hdr_in_errs[4]; + u32 tcp_in_errs[4]; + u32 tnl_cong_drops[4]; + u32 ofld_chan_drops[4]; + u32 tnl_tx_drops[4]; + u32 ofld_vlan_drops[4]; + u32 tcp6_in_errs[4]; + u32 ofld_no_neigh; + u32 ofld_cong_defer; +}; + +struct tp_cpl_stats { + u32 req[4]; + u32 rsp[4]; +}; + +struct tp_rdma_stats { + u32 rqe_dfr_pkt; + u32 rqe_dfr_mod; +}; + +struct sge_params { + u32 hps; /* host page size for our PF/VF */ + u32 eq_qpp; /* egress queues/page for our PF/VF */ + u32 iq_qpp; /* egress queues/page for our PF/VF */ +}; + +struct tp_params { + unsigned int tre; /* log2 of core clocks per TP tick */ + unsigned int la_mask; /* what events are recorded by TP LA */ + unsigned short tx_modq_map; /* TX modulation scheduler queue to */ + /* channel map */ + + uint32_t dack_re; /* DACK timer resolution */ + unsigned short tx_modq[NCHAN]; /* channel to modulation queue map */ + + u32 vlan_pri_map; /* cached TP_VLAN_PRI_MAP */ + u32 ingress_config; /* cached TP_INGRESS_CONFIG */ + + /* cached TP_OUT_CONFIG compressed error vector + * and passing outer header info for encapsulated packets. + */ + int rx_pkt_encap; + + /* TP_VLAN_PRI_MAP Compressed Filter Tuple field offsets. This is a + * subset of the set of fields which may be present in the Compressed + * Filter Tuple portion of filters and TCP TCB connections. The + * fields which are present are controlled by the TP_VLAN_PRI_MAP. + * Since a variable number of fields may or may not be present, their + * shifted field positions within the Compressed Filter Tuple may + * vary, or not even be present if the field isn't selected in + * TP_VLAN_PRI_MAP. Since some of these fields are needed in various + * places we store their offsets here, or a -1 if the field isn't + * present. 
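+ * For example, if only the Port and Protocol fields are enabled in TP_VLAN_PRI_MAP, then port_shift and protocol_shift hold valid bit offsets and all the other *_shift fields below are -1.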
+ */ + int fcoe_shift; + int port_shift; + int vnic_shift; + int vlan_shift; + int tos_shift; + int protocol_shift; + int ethertype_shift; + int macmatch_shift; + int matchtype_shift; + int frag_shift; + + u64 hash_filter_mask; +}; + +struct vpd_params { + unsigned int cclk; + u8 ec[EC_LEN + 1]; + u8 sn[SERNUM_LEN + 1]; + u8 id[ID_LEN + 1]; + u8 pn[PN_LEN + 1]; + u8 na[MACADDR_LEN + 1]; +}; + +struct pci_params { + unsigned int vpd_cap_addr; + unsigned char speed; + unsigned char width; +}; + +struct devlog_params { + u32 memtype; /* which memory (EDC0, EDC1, MC) */ + u32 start; /* start of log in firmware memory */ + u32 size; /* size of log */ +}; + +/* Stores chip specific parameters */ +struct arch_specific_params { + u8 nchan; + u8 pm_stats_cnt; + u8 cng_ch_bits_log; /* congestion channel map bits width */ + u16 mps_rplc_size; + u16 vfcount; + u32 sge_fl_db; + u16 mps_tcam_size; +}; + +struct adapter_params { + struct sge_params sge; + struct tp_params tp; + struct vpd_params vpd; + struct pci_params pci; + struct devlog_params devlog; + enum pcie_memwin drv_memwin; + + unsigned int cim_la_size; + + unsigned int sf_size; /* serial flash size in bytes */ + unsigned int sf_nsec; /* # of flash sectors */ + + unsigned int fw_vers; /* firmware version */ + unsigned int bs_vers; /* bootstrap version */ + unsigned int tp_vers; /* TP microcode version */ + unsigned int er_vers; /* expansion ROM version */ + unsigned int scfg_vers; /* Serial Configuration version */ + unsigned int vpd_vers; /* VPD Version */ + u8 api_vers[7]; + + unsigned short mtus[NMTUS]; + unsigned short a_wnd[NCCTRL_WIN]; + unsigned short b_wnd[NCCTRL_WIN]; + + unsigned char nports; /* # of ethernet ports */ + unsigned char portvec; + enum chip_type chip; /* chip code */ + struct arch_specific_params arch; /* chip specific params */ + unsigned char offload; + unsigned char crypto; /* HW capability for crypto */ + + unsigned char bypass; + unsigned char hash_filter; + + unsigned int ofldq_wr_cred; + bool ulptx_memwrite_dsgl; /* use of T5 DSGL allowed */ + + unsigned int nsched_cls; /* number of traffic classes */ + unsigned int max_ordird_qp; /* Max read depth per RDMA QP */ + unsigned int max_ird_adapter; /* Max read depth per adapter */ + bool fr_nsmr_tpte_wr_support; /* FW support for FR_NSMR_TPTE_WR */ + u8 fw_caps_support; /* 32-bit Port Capabilities */ + bool filter2_wr_support; /* FW support for FILTER2_WR */ + + /* MPS Buffer Group Map[per Port]. Bit i is set if buffer group i is + * used by the Port + */ + u8 mps_bg_map[MAX_NPORTS]; /* MPS Buffer Group Map */ + bool write_w_imm_support; /* FW supports WRITE_WITH_IMMEDIATE */ + bool write_cmpl_support; /* FW supports WRITE_CMPL */ +}; + +/* State needed to monitor the forward progress of SGE Ingress DMA activities + * and possible hangs. + */ +struct sge_idma_monitor_state { + unsigned int idma_1s_thresh; /* 1s threshold in Core Clock ticks */ + unsigned int idma_stalled[2]; /* synthesized stalled timers in HZ */ + unsigned int idma_state[2]; /* IDMA Hang detect state */ + unsigned int idma_qid[2]; /* IDMA Hung Ingress Queue ID */ + unsigned int idma_warn[2]; /* time to warning in HZ */ +}; + +/* Firmware Mailbox Command/Reply log. All values are in Host-Endian format. + * The access and execute times are signed in order to accommodate negative + * error returns. 
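+ * (i.e. a failed mailbox operation is recorded with a negative value in the corresponding field rather than a time in ms.)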
+ */ +struct mbox_cmd { + u64 cmd[MBOX_LEN / 8]; /* a Firmware Mailbox Command/Reply */ + u64 timestamp; /* OS-dependent timestamp */ + u32 seqno; /* sequence number */ + s16 access; /* time (ms) to access mailbox */ + s16 execute; /* time (ms) to execute */ +}; + +struct mbox_cmd_log { + unsigned int size; /* number of entries in the log */ + unsigned int cursor; /* next position in the log to write */ + u32 seqno; /* next sequence number */ + /* variable length mailbox command log starts here */ +}; + +/* Given a pointer to a Firmware Mailbox Command Log and a log entry index, + * return a pointer to the specified entry. + */ +static inline struct mbox_cmd *mbox_cmd_log_entry(struct mbox_cmd_log *log, + unsigned int entry_idx) +{ + return &((struct mbox_cmd *)&(log)[1])[entry_idx]; +} + +#include "t4fw_api.h" + +#define FW_VERSION(chip) ( \ + FW_HDR_FW_VER_MAJOR_G(chip##FW_VERSION_MAJOR) | \ + FW_HDR_FW_VER_MINOR_G(chip##FW_VERSION_MINOR) | \ + FW_HDR_FW_VER_MICRO_G(chip##FW_VERSION_MICRO) | \ + FW_HDR_FW_VER_BUILD_G(chip##FW_VERSION_BUILD)) +#define FW_INTFVER(chip, intf) (FW_HDR_INTFVER_##intf) + +struct fw_info { + u8 chip; + char *fs_name; + char *fw_mod_name; + struct fw_hdr fw_hdr; +}; + +struct trace_params { + u32 data[TRACE_LEN / 4]; + u32 mask[TRACE_LEN / 4]; + unsigned short snap_len; + unsigned short min_len; + unsigned char skip_ofst; + unsigned char skip_len; + unsigned char invert; + unsigned char port; +}; + +/* Firmware Port Capabilities types. */ + +typedef u16 fw_port_cap16_t; /* 16-bit Port Capabilities integral value */ +typedef u32 fw_port_cap32_t; /* 32-bit Port Capabilities integral value */ + +enum fw_caps { + FW_CAPS_UNKNOWN = 0, /* 0'ed out initial state */ + FW_CAPS16 = 1, /* old Firmware: 16-bit Port Capabilities */ + FW_CAPS32 = 2, /* new Firmware: 32-bit Port Capabilities */ +}; + +struct link_config { + fw_port_cap32_t pcaps; /* link capabilities */ + fw_port_cap32_t def_acaps; /* default advertised capabilities */ + fw_port_cap32_t acaps; /* advertised capabilities */ + fw_port_cap32_t lpacaps; /* peer advertised capabilities */ + + fw_port_cap32_t speed_caps; /* speed(s) user has requested */ + unsigned int speed; /* actual link speed (Mb/s) */ + + enum cc_pause requested_fc; /* flow control user has requested */ + enum cc_pause fc; /* actual link flow control */ + + enum cc_fec requested_fec; /* Forward Error Correction: */ + enum cc_fec fec; /* requested and actual in use */ + + unsigned char autoneg; /* autonegotiating? */ + + unsigned char link_ok; /* link up? 
*/ + unsigned char link_down_rc; /* link down reason */ +}; + +#define FW_LEN16(fw_struct) FW_CMD_LEN16_V(sizeof(fw_struct) / 16) + +enum { + MAX_ETH_QSETS = 32, /* # of Ethernet Tx/Rx queue sets */ + MAX_OFLD_QSETS = 16, /* # of offload Tx, iscsi Rx queue sets */ + MAX_CTRL_QUEUES = NCHAN, /* # of control Tx queues */ +}; + +enum { + MAX_TXQ_ENTRIES = 16384, + MAX_CTRL_TXQ_ENTRIES = 1024, + MAX_RSPQ_ENTRIES = 16384, + MAX_RX_BUFFERS = 16384, + MIN_TXQ_ENTRIES = 32, + MIN_CTRL_TXQ_ENTRIES = 32, + MIN_RSPQ_ENTRIES = 128, + MIN_FL_ENTRIES = 16 +}; + +enum { + INGQ_EXTRAS = 2, /* firmware event queue and */ + /* forwarded interrupts */ + MAX_INGQ = MAX_ETH_QSETS + INGQ_EXTRAS, +}; + +struct adapter; +struct sge_rspq; + +#include "cxgb4_dcb.h" + +#ifdef CONFIG_CHELSIO_T4_FCOE +#include "cxgb4_fcoe.h" +#endif /* CONFIG_CHELSIO_T4_FCOE */ + +struct port_info { + struct adapter *adapter; + u16 viid; + s16 xact_addr_filt; /* index of exact MAC address filter */ + u16 rss_size; /* size of VI's RSS table slice */ + s8 mdio_addr; + enum fw_port_type port_type; + u8 mod_type; + u8 port_id; + u8 tx_chan; + u8 lport; /* associated offload logical port */ + u8 nqsets; /* # of qsets */ + u8 first_qset; /* index of first qset */ + u8 rss_mode; + struct link_config link_cfg; + u16 *rss; + struct port_stats stats_base; +#ifdef CONFIG_CHELSIO_T4_DCB + struct port_dcb_info dcb; /* Data Center Bridging support */ +#endif +#ifdef CONFIG_CHELSIO_T4_FCOE + struct cxgb_fcoe fcoe; +#endif /* CONFIG_CHELSIO_T4_FCOE */ + bool rxtstamp; /* Enable TS */ + struct hwtstamp_config tstamp_config; + bool ptp_enable; + struct sched_table *sched_tbl; +}; + +struct dentry; +struct work_struct; + +enum { /* adapter flags */ + FULL_INIT_DONE = (1 << 0), + DEV_ENABLED = (1 << 1), + USING_MSI = (1 << 2), + USING_MSIX = (1 << 3), + FW_OK = (1 << 4), + RSS_TNLALLLOOKUP = (1 << 5), + USING_SOFT_PARAMS = (1 << 6), + MASTER_PF = (1 << 7), + FW_OFLD_CONN = (1 << 9), + ROOT_NO_RELAXED_ORDERING = (1 << 10), + SHUTTING_DOWN = (1 << 11), +}; + +enum { + ULP_CRYPTO_LOOKASIDE = 1 << 0, + ULP_CRYPTO_IPSEC_INLINE = 1 << 1, +}; + +struct rx_sw_desc; + +struct sge_fl { /* SGE free-buffer queue state */ + unsigned int avail; /* # of available Rx buffers */ + unsigned int pend_cred; /* new buffers since last FL DB ring */ + unsigned int cidx; /* consumer index */ + unsigned int pidx; /* producer index */ + unsigned long alloc_failed; /* # of times buffer allocation failed */ + unsigned long large_alloc_failed; + unsigned long mapping_err; /* # of RX Buffer DMA Mapping failures */ + unsigned long low; /* # of times momentarily starving */ + unsigned long starving; + /* RO fields */ + unsigned int cntxt_id; /* SGE context id for the free list */ + unsigned int size; /* capacity of free list */ + struct rx_sw_desc *sdesc; /* address of SW Rx descriptor ring */ + __be64 *desc; /* address of HW Rx descriptor ring */ + dma_addr_t addr; /* bus address of HW ring start */ + void __iomem *bar2_addr; /* address of BAR2 Queue registers */ + unsigned int bar2_qid; /* Queue ID for BAR2 Queue registers */ +}; + +/* A packet gather list */ +struct pkt_gl { + u64 sgetstamp; /* SGE Time Stamp for Ingress Packet */ + struct page_frag frags[MAX_SKB_FRAGS]; + void *va; /* virtual address of first byte */ + unsigned int nfrags; /* # of fragments */ + unsigned int tot_len; /* total length of fragments */ +}; + +typedef int (*rspq_handler_t)(struct sge_rspq *q, const __be64 *rsp, + const struct pkt_gl *gl); +typedef void (*rspq_flush_handler_t)(struct sge_rspq *q); +/* 
LRO related declarations for ULD */ +struct t4_lro_mgr { +#define MAX_LRO_SESSIONS 64 + u8 lro_session_cnt; /* # of sessions to aggregate */ + unsigned long lro_pkts; /* # of LRO super packets */ + unsigned long lro_merged; /* # of wire packets merged by LRO */ + struct sk_buff_head lroq; /* list of aggregated sessions */ +}; + +struct sge_rspq { /* state for an SGE response queue */ + struct napi_struct napi; + const __be64 *cur_desc; /* current descriptor in queue */ + unsigned int cidx; /* consumer index */ + u8 gen; /* current generation bit */ + u8 intr_params; /* interrupt holdoff parameters */ + u8 next_intr_params; /* holdoff params for next interrupt */ + u8 adaptive_rx; + u8 pktcnt_idx; /* interrupt packet threshold */ + u8 uld; /* ULD handling this queue */ + u8 idx; /* queue index within its group */ + int offset; /* offset into current Rx buffer */ + u16 cntxt_id; /* SGE context id for the response q */ + u16 abs_id; /* absolute SGE id for the response q */ + __be64 *desc; /* address of HW response ring */ + dma_addr_t phys_addr; /* physical address of the ring */ + void __iomem *bar2_addr; /* address of BAR2 Queue registers */ + unsigned int bar2_qid; /* Queue ID for BAR2 Queue registers */ + unsigned int iqe_len; /* entry size */ + unsigned int size; /* capacity of response queue */ + struct adapter *adap; + struct net_device *netdev; /* associated net device */ + rspq_handler_t handler; + rspq_flush_handler_t flush_handler; + struct t4_lro_mgr lro_mgr; +}; + +struct sge_eth_stats { /* Ethernet queue statistics */ + unsigned long pkts; /* # of ethernet packets */ + unsigned long lro_pkts; /* # of LRO super packets */ + unsigned long lro_merged; /* # of wire packets merged by LRO */ + unsigned long rx_cso; /* # of Rx checksum offloads */ + unsigned long vlan_ex; /* # of Rx VLAN extractions */ + unsigned long rx_drops; /* # of packets dropped due to no mem */ +}; + +struct sge_eth_rxq { /* SW Ethernet Rx queue */ + struct sge_rspq rspq; + struct sge_fl fl; + struct sge_eth_stats stats; +} ____cacheline_aligned_in_smp; + +struct sge_ofld_stats { /* offload queue statistics */ + unsigned long pkts; /* # of packets */ + unsigned long imm; /* # of immediate-data packets */ + unsigned long an; /* # of asynchronous notifications */ + unsigned long nomem; /* # of responses deferred due to no mem */ +}; + +struct sge_ofld_rxq { /* SW offload Rx queue */ + struct sge_rspq rspq; + struct sge_fl fl; + struct sge_ofld_stats stats; +} ____cacheline_aligned_in_smp; + +struct tx_desc { + __be64 flit[8]; +}; + +struct tx_sw_desc; + +struct sge_txq { + unsigned int in_use; /* # of in-use Tx descriptors */ + unsigned int q_type; /* Q type Eth/Ctrl/Ofld */ + unsigned int size; /* # of descriptors */ + unsigned int cidx; /* SW consumer index */ + unsigned int pidx; /* producer index */ + unsigned long stops; /* # of times q has been stopped */ + unsigned long restarts; /* # of queue restarts */ + unsigned int cntxt_id; /* SGE context id for the Tx q */ + struct tx_desc *desc; /* address of HW Tx descriptor ring */ + struct tx_sw_desc *sdesc; /* address of SW Tx descriptor ring */ + struct sge_qstat *stat; /* queue status entry */ + dma_addr_t phys_addr; /* physical address of the ring */ + spinlock_t db_lock; + int db_disabled; + unsigned short db_pidx; + unsigned short db_pidx_inc; + void __iomem *bar2_addr; /* address of BAR2 Queue registers */ + unsigned int bar2_qid; /* Queue ID for BAR2 Queue registers */ +}; + +struct sge_eth_txq { /* state for an SGE Ethernet Tx queue */ + struct sge_txq 
q; + struct netdev_queue *txq; /* associated netdev TX queue */ +#ifdef CONFIG_CHELSIO_T4_DCB + u8 dcb_prio; /* DCB Priority bound to queue */ +#endif + unsigned long tso; /* # of TSO requests */ + unsigned long tx_cso; /* # of Tx checksum offloads */ + unsigned long vlan_ins; /* # of Tx VLAN insertions */ + unsigned long mapping_err; /* # of I/O MMU packet mapping errors */ +} ____cacheline_aligned_in_smp; + +struct sge_uld_txq { /* state for an SGE offload Tx queue */ + struct sge_txq q; + struct adapter *adap; + struct sk_buff_head sendq; /* list of backpressured packets */ + struct tasklet_struct qresume_tsk; /* restarts the queue */ + bool service_ofldq_running; /* service_ofldq() is processing sendq */ + u8 full; /* the Tx ring is full */ + unsigned long mapping_err; /* # of I/O MMU packet mapping errors */ +} ____cacheline_aligned_in_smp; + +struct sge_ctrl_txq { /* state for an SGE control Tx queue */ + struct sge_txq q; + struct adapter *adap; + struct sk_buff_head sendq; /* list of backpressured packets */ + struct tasklet_struct qresume_tsk; /* restarts the queue */ + u8 full; /* the Tx ring is full */ +} ____cacheline_aligned_in_smp; + +struct sge_uld_rxq_info { + char name[IFNAMSIZ]; /* name of ULD driver */ + struct sge_ofld_rxq *uldrxq; /* Rxq's for ULD */ + u16 *msix_tbl; /* msix_tbl for uld */ + u16 *rspq_id; /* response queue id's of rxq */ + u16 nrxq; /* # of ingress uld queues */ + u16 nciq; /* # of completion queues */ + u8 uld; /* uld type */ +}; + +struct sge_uld_txq_info { + struct sge_uld_txq *uldtxq; /* Txq's for ULD */ + atomic_t users; /* num users */ + u16 ntxq; /* # of egress uld queues */ +}; + +struct sge { + struct sge_eth_txq ethtxq[MAX_ETH_QSETS]; + struct sge_eth_txq ptptxq; + struct sge_ctrl_txq ctrlq[MAX_CTRL_QUEUES]; + + struct sge_eth_rxq ethrxq[MAX_ETH_QSETS]; + struct sge_rspq fw_evtq ____cacheline_aligned_in_smp; + struct sge_uld_rxq_info **uld_rxq_info; + struct sge_uld_txq_info **uld_txq_info; + + struct sge_rspq intrq ____cacheline_aligned_in_smp; + spinlock_t intrq_lock; + + u16 max_ethqsets; /* # of available Ethernet queue sets */ + u16 ethqsets; /* # of active Ethernet queue sets */ + u16 ethtxq_rover; /* Tx queue to clean up next */ + u16 ofldqsets; /* # of active ofld queue sets */ + u16 nqs_per_uld; /* # of Rx queues per ULD */ + u16 timer_val[SGE_NTIMERS]; + u8 counter_val[SGE_NCOUNTERS]; + u32 fl_pg_order; /* large page allocation size */ + u32 stat_len; /* length of status page at ring end */ + u32 pktshift; /* padding between CPL & packet data */ + u32 fl_align; /* response queue message alignment */ + u32 fl_starve_thres; /* Free List starvation threshold */ + + struct sge_idma_monitor_state idma_monitor; + unsigned int egr_start; + unsigned int egr_sz; + unsigned int ingr_start; + unsigned int ingr_sz; + void **egr_map; /* qid->queue egress queue map */ + struct sge_rspq **ingr_map; /* qid->queue ingress queue map */ + unsigned long *starving_fl; + unsigned long *txq_maperr; + unsigned long *blocked_fl; + struct timer_list rx_timer; /* refills starving FLs */ + struct timer_list tx_timer; /* checks Tx queues */ +}; + +#define for_each_ethrxq(sge, i) for (i = 0; i < (sge)->ethqsets; i++) +#define for_each_ofldtxq(sge, i) for (i = 0; i < (sge)->ofldqsets; i++) + +struct l2t_data; + +#ifdef CONFIG_PCI_IOV + +/* T4 supports SRIOV on PF0-3 and T5 on PF0-7. However, the Serial + * Configuration initialization for T5 only has SR-IOV functionality enabled + * on PF0-3 in order to simplify everything. 
+ */ +#define NUM_OF_PF_WITH_SRIOV 4 + +#endif + +struct doorbell_stats { + u32 db_drop; + u32 db_empty; + u32 db_full; +}; + +struct hash_mac_addr { + struct list_head list; + u8 addr[ETH_ALEN]; +}; + +struct uld_msix_bmap { + unsigned long *msix_bmap; + unsigned int mapsize; + spinlock_t lock; /* lock for acquiring bitmap */ +}; + +struct uld_msix_info { + unsigned short vec; + char desc[IFNAMSIZ + 10]; + unsigned int idx; +}; + +struct vf_info { + unsigned char vf_mac_addr[ETH_ALEN]; + unsigned int tx_rate; + bool pf_set_mac; + u16 vlan; +}; + +enum { + HMA_DMA_MAPPED_FLAG = 1 +}; + +struct hma_data { + unsigned char flags; + struct sg_table *sgt; + dma_addr_t *phy_addr; /* physical address of the page */ +}; + +struct mbox_list { + struct list_head list; +}; + +struct mps_encap_entry { + atomic_t refcnt; +}; + +struct adapter { + void __iomem *regs; + void __iomem *bar2; + u32 t4_bar0; + struct pci_dev *pdev; + struct device *pdev_dev; + const char *name; + unsigned int mbox; + unsigned int pf; + unsigned int flags; + unsigned int adap_idx; + enum chip_type chip; + + int msg_enable; + __be16 vxlan_port; + u8 vxlan_port_cnt; + __be16 geneve_port; + u8 geneve_port_cnt; + + struct adapter_params params; + struct cxgb4_virt_res vres; + unsigned int swintr; + + struct { + unsigned short vec; + char desc[IFNAMSIZ + 10]; + } msix_info[MAX_INGQ + 1]; + struct uld_msix_info *msix_info_ulds; /* msix info for uld's */ + struct uld_msix_bmap msix_bmap_ulds; /* msix bitmap for all uld */ + int msi_idx; + + struct doorbell_stats db_stats; + struct sge sge; + + struct net_device *port[MAX_NPORTS]; + u8 chan_map[NCHAN]; /* channel -> port map */ + + struct vf_info *vfinfo; + u8 num_vfs; + + u32 filter_mode; + unsigned int l2t_start; + unsigned int l2t_end; + struct l2t_data *l2t; + unsigned int clipt_start; + unsigned int clipt_end; + struct clip_tbl *clipt; + unsigned int rawf_start; + unsigned int rawf_cnt; + struct smt_data *smt; + struct mps_encap_entry *mps_encap; + struct cxgb4_uld_info *uld; + void *uld_handle[CXGB4_ULD_MAX]; + unsigned int num_uld; + unsigned int num_ofld_uld; + struct list_head list_node; + struct list_head rcu_node; + struct list_head mac_hlist; /* list of MAC addresses in MPS Hash */ + + void *iscsi_ppm; + + struct tid_info tids; + void **tid_release_head; + spinlock_t tid_release_lock; + struct workqueue_struct *workq; + struct work_struct tid_release_task; + struct work_struct db_full_task; + struct work_struct db_drop_task; + struct work_struct fatal_err_notify_task; + bool tid_release_task_busy; + + /* lock for mailbox cmd list */ + spinlock_t mbox_lock; + struct mbox_list mlist; + + /* support for mailbox command/reply logging */ +#define T4_OS_LOG_MBOX_CMDS 256 + struct mbox_cmd_log *mbox_log; + + struct mutex uld_mutex; + + struct dentry *debugfs_root; + bool use_bd; /* Use SGE Back Door intfc for reading SGE Contexts */ + bool trace_rss; /* 1 implies that different RSS flit per filter is + * used per filter else if 0 default RSS flit is + * used for all 4 filters. 
+ */
+
+	struct ptp_clock *ptp_clock;
+	struct ptp_clock_info ptp_clock_info;
+	struct sk_buff *ptp_tx_skb;
+	/* ptp lock */
+	spinlock_t ptp_lock;
+	spinlock_t stats_lock;
+	spinlock_t win0_lock ____cacheline_aligned_in_smp;
+
+	/* TC u32 offload */
+	struct cxgb4_tc_u32_table *tc_u32;
+	struct chcr_stats_debug chcr_stats;
+
+	/* TC flower offload */
+	struct rhashtable flower_tbl;
+	struct rhashtable_params flower_ht_params;
+	struct timer_list flower_stats_timer;
+	struct work_struct flower_stats_work;
+
+	/* Ethtool Dump */
+	struct ethtool_dump eth_dump;
+
+	/* HMA */
+	struct hma_data hma;
+
+	struct srq_data *srq;
+};
+
+/* Support for "sched-class" command to allow a TX Scheduling Class to be
+ * programmed with various parameters.
+ */
+struct ch_sched_params {
+	s8 type;			/* packet or flow */
+	union {
+		struct {
+			s8 level;	/* scheduler hierarchy level */
+			s8 mode;	/* per-class or per-flow */
+			s8 rateunit;	/* bit or packet rate */
+			s8 ratemode;	/* %port relative or kbps absolute */
+			s8 channel;	/* scheduler channel [0..N] */
+			s8 class;	/* scheduler class [0..N] */
+			s32 minrate;	/* minimum rate */
+			s32 maxrate;	/* maximum rate */
+			s16 weight;	/* percent weight */
+			s16 pktsize;	/* average packet size */
+		} params;
+	} u;
+};
+
+enum {
+	SCHED_CLASS_TYPE_PACKET = 0,	/* class type */
+};
+
+enum {
+	SCHED_CLASS_LEVEL_CL_RL = 0,	/* class rate limiter */
+};
+
+enum {
+	SCHED_CLASS_MODE_CLASS = 0,	/* per-class scheduling */
+};
+
+enum {
+	SCHED_CLASS_RATEUNIT_BITS = 0,	/* bit rate scheduling */
+};
+
+enum {
+	SCHED_CLASS_RATEMODE_ABS = 1,	/* Kb/s */
+};
+
+struct tx_sw_desc {		/* SW state per Tx descriptor */
+	struct sk_buff *skb;
+	struct ulptx_sgl *sgl;
+};
+
+/* Support for "sched_queue" command to allow one or more NIC TX Queues
+ * to be bound to a TX Scheduling Class.
+ */
+struct ch_sched_queue {
+	s8 queue;	/* queue index */
+	s8 class;	/* class index */
+};
+
+/* Defined bit width of user definable filter tuples
+ */
+#define ETHTYPE_BITWIDTH 16
+#define FRAG_BITWIDTH 1
+#define MACIDX_BITWIDTH 9
+#define FCOE_BITWIDTH 1
+#define IPORT_BITWIDTH 3
+#define MATCHTYPE_BITWIDTH 3
+#define PROTO_BITWIDTH 8
+#define TOS_BITWIDTH 8
+#define PF_BITWIDTH 8
+#define VF_BITWIDTH 8
+#define IVLAN_BITWIDTH 16
+#define OVLAN_BITWIDTH 16
+
+/* Filter matching rules. These consist of a set of ingress packet field
+ * (value, mask) tuples. The associated ingress packet field matches the
+ * tuple when ((field & mask) == value). (Thus a wildcard "don't care" field
+ * rule can be constructed by specifying a tuple of (0, 0).) A filter rule
+ * matches an ingress packet when all of the individual field
+ * matching rules are true.
+ *
+ * Partial field masks are always valid; however, while it may be easy to
+ * understand their meanings for some fields (e.g. IP address to match a
+ * subnet), for others making sensible partial masks is less intuitive (e.g.
+ * MPS match type) ...
+ *
+ * Most of the following data structures are modeled on T4 capabilities.
+ * Drivers for earlier chips use the subsets which make sense for those chips.
+ * We really need to come up with a hardware-independent mechanism to
+ * represent hardware filter capabilities ...
+ */
+struct ch_filter_tuple {
+	/* Compressed header matching field rules. The TP_VLAN_PRI_MAP
+	 * register selects which of these fields will participate in the
+	 * filter match rules -- up to a maximum of 36 bits. Because
+	 * TP_VLAN_PRI_MAP is a global register, all filters must use the same
+	 * set of fields.
+ */ + uint32_t ethtype:ETHTYPE_BITWIDTH; /* Ethernet type */ + uint32_t frag:FRAG_BITWIDTH; /* IP fragmentation header */ + uint32_t ivlan_vld:1; /* inner VLAN valid */ + uint32_t ovlan_vld:1; /* outer VLAN valid */ + uint32_t pfvf_vld:1; /* PF/VF valid */ + uint32_t macidx:MACIDX_BITWIDTH; /* exact match MAC index */ + uint32_t fcoe:FCOE_BITWIDTH; /* FCoE packet */ + uint32_t iport:IPORT_BITWIDTH; /* ingress port */ + uint32_t matchtype:MATCHTYPE_BITWIDTH; /* MPS match type */ + uint32_t proto:PROTO_BITWIDTH; /* protocol type */ + uint32_t tos:TOS_BITWIDTH; /* TOS/Traffic Type */ + uint32_t pf:PF_BITWIDTH; /* PCI-E PF ID */ + uint32_t vf:VF_BITWIDTH; /* PCI-E VF ID */ + uint32_t ivlan:IVLAN_BITWIDTH; /* inner VLAN */ + uint32_t ovlan:OVLAN_BITWIDTH; /* outer VLAN */ + + /* Uncompressed header matching field rules. These are always + * available for field rules. + */ + uint8_t lip[16]; /* local IP address (IPv4 in [3:0]) */ + uint8_t fip[16]; /* foreign IP address (IPv4 in [3:0]) */ + uint16_t lport; /* local port */ + uint16_t fport; /* foreign port */ +}; + +/* A filter ioctl command. + */ +struct ch_filter_specification { + /* Administrative fields for filter. + */ + uint32_t hitcnts:1; /* count filter hits in TCB */ + uint32_t prio:1; /* filter has priority over active/server */ + + /* Fundamental filter typing. This is the one element of filter + * matching that doesn't exist as a (value, mask) tuple. + */ + uint32_t type:1; /* 0 => IPv4, 1 => IPv6 */ + u32 hash:1; /* 0 => wild-card, 1 => exact-match */ + + /* Packet dispatch information. Ingress packets which match the + * filter rules will be dropped, passed to the host or switched back + * out as egress packets. + */ + uint32_t action:2; /* drop, pass, switch */ + + uint32_t rpttid:1; /* report TID in RSS hash field */ + + uint32_t dirsteer:1; /* 0 => RSS, 1 => steer to iq */ + uint32_t iq:10; /* ingress queue */ + + uint32_t maskhash:1; /* dirsteer=0: store RSS hash in TCB */ + uint32_t dirsteerhash:1;/* dirsteer=1: 0 => TCB contains RSS hash */ + /* 1 => TCB contains IQ ID */ + + /* Switch proxy/rewrite fields. An ingress packet which matches a + * filter with "switch" set will be looped back out as an egress + * packet -- potentially with some Ethernet header rewriting. + */ + uint32_t eport:2; /* egress port to switch packet out */ + uint32_t newdmac:1; /* rewrite destination MAC address */ + uint32_t newsmac:1; /* rewrite source MAC address */ + uint32_t newvlan:2; /* rewrite VLAN Tag */ + uint32_t nat_mode:3; /* specify NAT operation mode */ + uint8_t dmac[ETH_ALEN]; /* new destination MAC address */ + uint8_t smac[ETH_ALEN]; /* new source MAC address */ + uint16_t vlan; /* VLAN Tag to insert */ + + u8 nat_lip[16]; /* local IP to use after NAT'ing */ + u8 nat_fip[16]; /* foreign IP to use after NAT'ing */ + u16 nat_lport; /* local port to use after NAT'ing */ + u16 nat_fport; /* foreign port to use after NAT'ing */ + + /* reservation for future additions */ + u8 rsvd[24]; + + /* Filter rule value/mask pairs. 
+ */ + struct ch_filter_tuple val; + struct ch_filter_tuple mask; +}; + +enum { + FILTER_PASS = 0, /* default */ + FILTER_DROP, + FILTER_SWITCH +}; + +enum { + VLAN_NOCHANGE = 0, /* default */ + VLAN_REMOVE, + VLAN_INSERT, + VLAN_REWRITE +}; + +enum { + NAT_MODE_NONE = 0, /* No NAT performed */ + NAT_MODE_DIP, /* NAT on Dst IP */ + NAT_MODE_DIP_DP, /* NAT on Dst IP, Dst Port */ + NAT_MODE_DIP_DP_SIP, /* NAT on Dst IP, Dst Port and Src IP */ + NAT_MODE_DIP_DP_SP, /* NAT on Dst IP, Dst Port and Src Port */ + NAT_MODE_SIP_SP, /* NAT on Src IP and Src Port */ + NAT_MODE_DIP_SIP_SP, /* NAT on Dst IP, Src IP and Src Port */ + NAT_MODE_ALL /* NAT on entire 4-tuple */ +}; + +/* Host shadow copy of ingress filter entry. This is in host native format + * and doesn't match the ordering or bit order, etc. of the hardware of the + * firmware command. The use of bit-field structure elements is purely to + * remind ourselves of the field size limitations and save memory in the case + * where the filter table is large. + */ +struct filter_entry { + /* Administrative fields for filter. */ + u32 valid:1; /* filter allocated and valid */ + u32 locked:1; /* filter is administratively locked */ + + u32 pending:1; /* filter action is pending firmware reply */ + struct filter_ctx *ctx; /* Caller's completion hook */ + struct l2t_entry *l2t; /* Layer Two Table entry for dmac */ + struct smt_entry *smt; /* Source Mac Table entry for smac */ + struct net_device *dev; /* Associated net device */ + u32 tid; /* This will store the actual tid */ + + /* The filter itself. Most of this is a straight copy of information + * provided by the extended ioctl(). Some fields are translated to + * internal forms -- for instance the Ingress Queue ID passed in from + * the ioctl() is translated into the Absolute Ingress Queue ID. + */ + struct ch_filter_specification fs; +}; + +static inline int is_offload(const struct adapter *adap) +{ + return adap->params.offload; +} + +static inline int is_hashfilter(const struct adapter *adap) +{ + return adap->params.hash_filter; +} + +static inline int is_pci_uld(const struct adapter *adap) +{ + return adap->params.crypto; +} + +static inline int is_uld(const struct adapter *adap) +{ + return (adap->params.offload || adap->params.crypto); +} + +static inline u32 t4_read_reg(struct adapter *adap, u32 reg_addr) +{ + return readl(adap->regs + reg_addr); +} + +static inline void t4_write_reg(struct adapter *adap, u32 reg_addr, u32 val) +{ + writel(val, adap->regs + reg_addr); +} + +#ifndef readq +static inline u64 readq(const volatile void __iomem *addr) +{ + return readl(addr) + ((u64)readl(addr + 4) << 32); +} + +static inline void writeq(u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr + 4); +} +#endif + +static inline u64 t4_read_reg64(struct adapter *adap, u32 reg_addr) +{ + return readq(adap->regs + reg_addr); +} + +static inline void t4_write_reg64(struct adapter *adap, u32 reg_addr, u64 val) +{ + writeq(val, adap->regs + reg_addr); +} + +/** + * t4_set_hw_addr - store a port's MAC address in SW + * @adapter: the adapter + * @port_idx: the port index + * @hw_addr: the Ethernet address + * + * Store the Ethernet address of the given port in SW. Called by the common + * code when it retrieves a port's Ethernet address from EEPROM. 
+ */ +static inline void t4_set_hw_addr(struct adapter *adapter, int port_idx, + u8 hw_addr[]) +{ + ether_addr_copy(adapter->port[port_idx]->dev_addr, hw_addr); + ether_addr_copy(adapter->port[port_idx]->perm_addr, hw_addr); +} + +/** + * netdev2pinfo - return the port_info structure associated with a net_device + * @dev: the netdev + * + * Return the struct port_info associated with a net_device + */ +static inline struct port_info *netdev2pinfo(const struct net_device *dev) +{ + return netdev_priv(dev); +} + +/** + * adap2pinfo - return the port_info of a port + * @adap: the adapter + * @idx: the port index + * + * Return the port_info structure for the port of the given index. + */ +static inline struct port_info *adap2pinfo(struct adapter *adap, int idx) +{ + return netdev_priv(adap->port[idx]); +} + +/** + * netdev2adap - return the adapter structure associated with a net_device + * @dev: the netdev + * + * Return the struct adapter associated with a net_device + */ +static inline struct adapter *netdev2adap(const struct net_device *dev) +{ + return netdev2pinfo(dev)->adapter; +} + +/* Return a version number to identify the type of adapter. The scheme is: + * - bits 0..9: chip version + * - bits 10..15: chip revision + * - bits 16..23: register dump version + */ +static inline unsigned int mk_adap_vers(struct adapter *ap) +{ + return CHELSIO_CHIP_VERSION(ap->params.chip) | + (CHELSIO_CHIP_RELEASE(ap->params.chip) << 10) | (1 << 16); +} + +/* Return a queue's interrupt hold-off time in us. 0 means no timer. */ +static inline unsigned int qtimer_val(const struct adapter *adap, + const struct sge_rspq *q) +{ + unsigned int idx = q->intr_params >> 1; + + return idx < SGE_NTIMERS ? adap->sge.timer_val[idx] : 0; +} + +/* driver version & name used for ethtool_drvinfo */ +extern char cxgb4_driver_name[]; +extern const char cxgb4_driver_version[]; + +void t4_os_portmod_changed(const struct adapter *adap, int port_id); +void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat); + +void t4_free_sge_resources(struct adapter *adap); +void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q); +irq_handler_t t4_intr_handler(struct adapter *adap); +netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev); +int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp, + const struct pkt_gl *gl); +int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb); +int t4_ofld_send(struct adapter *adap, struct sk_buff *skb); +int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, + struct net_device *dev, int intr_idx, + struct sge_fl *fl, rspq_handler_t hnd, + rspq_flush_handler_t flush_handler, int cong); +int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq, + struct net_device *dev, struct netdev_queue *netdevq, + unsigned int iqid); +int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq, + struct net_device *dev, unsigned int iqid, + unsigned int cmplqid); +int t4_sge_mod_ctrl_txq(struct adapter *adap, unsigned int eqid, + unsigned int cmplqid); +int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq, + struct net_device *dev, unsigned int iqid, + unsigned int uld_type); +irqreturn_t t4_sge_intr_msix(int irq, void *cookie); +int t4_sge_init(struct adapter *adap); +void t4_sge_start(struct adapter *adap); +void t4_sge_stop(struct adapter *adap); +void cxgb4_set_ethtool_ops(struct net_device *netdev); +int cxgb4_write_rss(const struct port_info *pi, const u16 *queues); +enum 
cpl_tx_tnl_lso_type cxgb_encap_offload_supported(struct sk_buff *skb);
+extern int dbfifo_int_thresh;
+
+#define for_each_port(adapter, iter) \
+	for (iter = 0; iter < (adapter)->params.nports; ++iter)
+
+static inline int is_bypass(struct adapter *adap)
+{
+	return adap->params.bypass;
+}
+
+static inline int is_bypass_device(int device)
+{
+	/* this should be set based upon device capabilities */
+	switch (device) {
+	case 0x440b:
+	case 0x440c:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static inline int is_10gbt_device(int device)
+{
+	/* this should be set based upon device capabilities */
+	switch (device) {
+	case 0x4409:
+	case 0x4486:
+		return 1;
+
+	default:
+		return 0;
+	}
+}
+
+static inline unsigned int core_ticks_per_usec(const struct adapter *adap)
+{
+	return adap->params.vpd.cclk / 1000;
+}
+
+static inline unsigned int us_to_core_ticks(const struct adapter *adap,
+					    unsigned int us)
+{
+	return (us * adap->params.vpd.cclk) / 1000;
+}
+
+static inline unsigned int core_ticks_to_us(const struct adapter *adapter,
+					    unsigned int ticks)
+{
+	/* add Core Clock / 2 to round ticks to nearest uS */
+	return ((ticks * 1000 + adapter->params.vpd.cclk/2) /
+		adapter->params.vpd.cclk);
+}
+
+static inline unsigned int dack_ticks_to_usec(const struct adapter *adap,
+					      unsigned int ticks)
+{
+	return (ticks << adap->params.tp.dack_re) / core_ticks_per_usec(adap);
+}
+
+void t4_set_reg_field(struct adapter *adap, unsigned int addr, u32 mask,
+		      u32 val);
+
+int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
+			    int size, void *rpl, bool sleep_ok, int timeout);
+int t4_wr_mbox_meat(struct adapter *adap, int mbox, const void *cmd, int size,
+		    void *rpl, bool sleep_ok);
+
+static inline int t4_wr_mbox_timeout(struct adapter *adap, int mbox,
+				     const void *cmd, int size, void *rpl,
+				     int timeout)
+{
+	return t4_wr_mbox_meat_timeout(adap, mbox, cmd, size, rpl, true,
+				       timeout);
+}
+
+static inline int t4_wr_mbox(struct adapter *adap, int mbox, const void *cmd,
+			     int size, void *rpl)
+{
+	return t4_wr_mbox_meat(adap, mbox, cmd, size, rpl, true);
+}
+
+static inline int t4_wr_mbox_ns(struct adapter *adap, int mbox, const void *cmd,
+				int size, void *rpl)
+{
+	return t4_wr_mbox_meat(adap, mbox, cmd, size, rpl, false);
+}
+
+/**
+ * hash_mac_addr - return the hash value of a MAC address
+ * @addr: the 48-bit Ethernet MAC address
+ *
+ * Hashes a MAC address according to the hash function used by HW inexact
+ * (hash) address matching.
+ */
+static inline int hash_mac_addr(const u8 *addr)
+{
+	u32 a = ((u32)addr[0] << 16) | ((u32)addr[1] << 8) | addr[2];
+	u32 b = ((u32)addr[3] << 16) | ((u32)addr[4] << 8) | addr[5];
+
+	a ^= b;
+	a ^= (a >> 12);
+	a ^= (a >> 6);
+	return a & 0x3f;
+}
+
+int cxgb4_set_rspq_intr_params(struct sge_rspq *q, unsigned int us,
+			       unsigned int cnt);
+static inline void init_rspq(struct adapter *adap, struct sge_rspq *q,
+			     unsigned int us, unsigned int cnt,
+			     unsigned int size, unsigned int iqe_size)
+{
+	q->adap = adap;
+	cxgb4_set_rspq_intr_params(q, us, cnt);
+	q->iqe_len = iqe_size;
+	q->size = size;
+}
+
+/**
+ * t4_is_inserted_mod_type - is a plugged in Firmware Module Type
+ * @fw_mod_type: the Firmware Module Type
+ *
+ * Return whether the Firmware Module Type represents a real Transceiver
+ * Module/Cable Module Type which has been inserted.
+ */ +static inline bool t4_is_inserted_mod_type(unsigned int fw_mod_type) +{ + return (fw_mod_type != FW_PORT_MOD_TYPE_NONE && + fw_mod_type != FW_PORT_MOD_TYPE_NOTSUPPORTED && + fw_mod_type != FW_PORT_MOD_TYPE_UNKNOWN && + fw_mod_type != FW_PORT_MOD_TYPE_ERROR); +} + +void t4_write_indirect(struct adapter *adap, unsigned int addr_reg, + unsigned int data_reg, const u32 *vals, + unsigned int nregs, unsigned int start_idx); +void t4_read_indirect(struct adapter *adap, unsigned int addr_reg, + unsigned int data_reg, u32 *vals, unsigned int nregs, + unsigned int start_idx); +void t4_hw_pci_read_cfg4(struct adapter *adapter, int reg, u32 *val); + +struct fw_filter_wr; + +void t4_intr_enable(struct adapter *adapter); +void t4_intr_disable(struct adapter *adapter); +int t4_slow_intr_handler(struct adapter *adapter); + +int t4_wait_dev_ready(void __iomem *regs); +int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port, + struct link_config *lc); +int t4_restart_aneg(struct adapter *adap, unsigned int mbox, unsigned int port); + +u32 t4_read_pcie_cfg4(struct adapter *adap, int reg); +u32 t4_get_util_window(struct adapter *adap); +void t4_setup_memwin(struct adapter *adap, u32 memwin_base, u32 window); + +int t4_memory_rw_init(struct adapter *adap, int win, int mtype, u32 *mem_off, + u32 *mem_base, u32 *mem_aperture); +void t4_memory_update_win(struct adapter *adap, int win, u32 addr); +void t4_memory_rw_residual(struct adapter *adap, u32 off, u32 addr, u8 *buf, + int dir); +#define T4_MEMORY_WRITE 0 +#define T4_MEMORY_READ 1 +int t4_memory_rw(struct adapter *adap, int win, int mtype, u32 addr, u32 len, + void *buf, int dir); +static inline int t4_memory_write(struct adapter *adap, int mtype, u32 addr, + u32 len, __be32 *buf) +{ + return t4_memory_rw(adap, 0, mtype, addr, len, buf, 0); +} + +unsigned int t4_get_regs_len(struct adapter *adapter); +void t4_get_regs(struct adapter *adap, void *buf, size_t buf_size); + +int t4_eeprom_ptov(unsigned int phys_addr, unsigned int fn, unsigned int sz); +int t4_seeprom_wp(struct adapter *adapter, bool enable); +int t4_get_raw_vpd_params(struct adapter *adapter, struct vpd_params *p); +int t4_get_vpd_params(struct adapter *adapter, struct vpd_params *p); +int t4_read_flash(struct adapter *adapter, unsigned int addr, + unsigned int nwords, u32 *data, int byte_oriented); +int t4_load_fw(struct adapter *adapter, const u8 *fw_data, unsigned int size); +int t4_load_phy_fw(struct adapter *adap, + int win, spinlock_t *lock, + int (*phy_fw_version)(const u8 *, size_t), + const u8 *phy_fw_data, size_t phy_fw_size); +int t4_phy_fw_ver(struct adapter *adap, int *phy_fw_ver); +int t4_fwcache(struct adapter *adap, enum fw_params_param_dev_fwcache op); +int t4_fw_upgrade(struct adapter *adap, unsigned int mbox, + const u8 *fw_data, unsigned int size, int force); +int t4_fl_pkt_align(struct adapter *adap); +unsigned int t4_flash_cfg_addr(struct adapter *adapter); +int t4_check_fw_version(struct adapter *adap); +int t4_load_cfg(struct adapter *adapter, const u8 *cfg_data, unsigned int size); +int t4_get_fw_version(struct adapter *adapter, u32 *vers); +int t4_get_bs_version(struct adapter *adapter, u32 *vers); +int t4_get_tp_version(struct adapter *adapter, u32 *vers); +int t4_get_exprom_version(struct adapter *adapter, u32 *vers); +int t4_get_scfg_version(struct adapter *adapter, u32 *vers); +int t4_get_vpd_version(struct adapter *adapter, u32 *vers); +int t4_get_version_info(struct adapter *adapter); +void t4_dump_version_info(struct adapter *adapter); 
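The inline helpers declared above are small enough to exercise outside the driver. The following is a minimal, stand-alone user-space sketch (an editor's illustration, not part of the patch): it reuses the 6-bit folding hash that hash_mac_addr() applies for MPS inexact (hash) address matching, and unpacks a version word according to the bit layout documented at mk_adap_vers() (bits 0..9 chip version, bits 10..15 chip revision, bits 16..23 register dump version). The sample MAC address and the chip/revision values are made-up placeholders, not values taken from the driver.

#include <stdio.h>
#include <stdint.h>

/* Same folding hash as hash_mac_addr(): 48-bit MAC -> 6-bit MPS hash bucket. */
static int demo_hash_mac_addr(const uint8_t *addr)
{
	uint32_t a = ((uint32_t)addr[0] << 16) | ((uint32_t)addr[1] << 8) | addr[2];
	uint32_t b = ((uint32_t)addr[3] << 16) | ((uint32_t)addr[4] << 8) | addr[5];

	a ^= b;
	a ^= (a >> 12);
	a ^= (a >> 6);
	return a & 0x3f;
}

int main(void)
{
	const uint8_t mac[6] = { 0x00, 0x07, 0x43, 0x12, 0x34, 0x56 }; /* placeholder MAC */
	/* Placeholder version word packed per the mk_adap_vers() comment:
	 * chip version 5, chip revision 1, register dump version 1.
	 */
	unsigned int vers = 5 | (1 << 10) | (1 << 16);

	printf("hash bucket for sample MAC: %d\n", demo_hash_mac_addr(mac));
	printf("vers 0x%x -> chip %u, rev %u, regdump v%u\n",
	       vers, vers & 0x3ff, (vers >> 10) & 0x3f, (vers >> 16) & 0xff);
	return 0;
}

Because the hash and the bit masks mirror the definitions and comments above, the program prints the same bucket and fields the driver would compute for those inputs; it is only a reading aid for the declarations in this header.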
+int t4_prep_fw(struct adapter *adap, struct fw_info *fw_info, + const u8 *fw_data, unsigned int fw_size, + struct fw_hdr *card_fw, enum dev_state state, int *reset); +int t4_prep_adapter(struct adapter *adapter); +int t4_shutdown_adapter(struct adapter *adapter); + +enum t4_bar2_qtype { T4_BAR2_QTYPE_EGRESS, T4_BAR2_QTYPE_INGRESS }; +int t4_bar2_sge_qregs(struct adapter *adapter, + unsigned int qid, + enum t4_bar2_qtype qtype, + int user, + u64 *pbar2_qoffset, + unsigned int *pbar2_qid); + +unsigned int qtimer_val(const struct adapter *adap, + const struct sge_rspq *q); + +int t4_init_devlog_params(struct adapter *adapter); +int t4_init_sge_params(struct adapter *adapter); +int t4_init_tp_params(struct adapter *adap, bool sleep_ok); +int t4_filter_field_shift(const struct adapter *adap, int filter_sel); +int t4_init_rss_mode(struct adapter *adap, int mbox); +int t4_init_portinfo(struct port_info *pi, int mbox, + int port, int pf, int vf, u8 mac[]); +int t4_port_init(struct adapter *adap, int mbox, int pf, int vf); +void t4_fatal_err(struct adapter *adapter); +unsigned int t4_chip_rss_size(struct adapter *adapter); +int t4_config_rss_range(struct adapter *adapter, int mbox, unsigned int viid, + int start, int n, const u16 *rspq, unsigned int nrspq); +int t4_config_glbl_rss(struct adapter *adapter, int mbox, unsigned int mode, + unsigned int flags); +int t4_config_vi_rss(struct adapter *adapter, int mbox, unsigned int viid, + unsigned int flags, unsigned int defq); +int t4_read_rss(struct adapter *adapter, u16 *entries); +void t4_read_rss_key(struct adapter *adapter, u32 *key, bool sleep_ok); +void t4_write_rss_key(struct adapter *adap, const u32 *key, int idx, + bool sleep_ok); +void t4_read_rss_pf_config(struct adapter *adapter, unsigned int index, + u32 *valp, bool sleep_ok); +void t4_read_rss_vf_config(struct adapter *adapter, unsigned int index, + u32 *vfl, u32 *vfh, bool sleep_ok); +u32 t4_read_rss_pf_map(struct adapter *adapter, bool sleep_ok); +u32 t4_read_rss_pf_mask(struct adapter *adapter, bool sleep_ok); + +unsigned int t4_get_mps_bg_map(struct adapter *adapter, int pidx); +unsigned int t4_get_tp_ch_map(struct adapter *adapter, int pidx); +void t4_pmtx_get_stats(struct adapter *adap, u32 cnt[], u64 cycles[]); +void t4_pmrx_get_stats(struct adapter *adap, u32 cnt[], u64 cycles[]); +int t4_read_cim_ibq(struct adapter *adap, unsigned int qid, u32 *data, + size_t n); +int t4_read_cim_obq(struct adapter *adap, unsigned int qid, u32 *data, + size_t n); +int t4_cim_read(struct adapter *adap, unsigned int addr, unsigned int n, + unsigned int *valp); +int t4_cim_write(struct adapter *adap, unsigned int addr, unsigned int n, + const unsigned int *valp); +int t4_cim_read_la(struct adapter *adap, u32 *la_buf, unsigned int *wrptr); +void t4_cim_read_pif_la(struct adapter *adap, u32 *pif_req, u32 *pif_rsp, + unsigned int *pif_req_wrptr, + unsigned int *pif_rsp_wrptr); +void t4_cim_read_ma_la(struct adapter *adap, u32 *ma_req, u32 *ma_rsp); +void t4_read_cimq_cfg(struct adapter *adap, u16 *base, u16 *size, u16 *thres); +const char *t4_get_port_type_description(enum fw_port_type port_type); +void t4_get_port_stats(struct adapter *adap, int idx, struct port_stats *p); +void t4_get_port_stats_offset(struct adapter *adap, int idx, + struct port_stats *stats, + struct port_stats *offset); +void t4_get_lb_stats(struct adapter *adap, int idx, struct lb_port_stats *p); +void t4_read_mtu_tbl(struct adapter *adap, u16 *mtus, u8 *mtu_log); +void t4_read_cong_tbl(struct adapter *adap, u16 
incr[NMTUS][NCCTRL_WIN]); +void t4_tp_wr_bits_indirect(struct adapter *adap, unsigned int addr, + unsigned int mask, unsigned int val); +void t4_tp_read_la(struct adapter *adap, u64 *la_buf, unsigned int *wrptr); +void t4_tp_get_err_stats(struct adapter *adap, struct tp_err_stats *st, + bool sleep_ok); +void t4_tp_get_cpl_stats(struct adapter *adap, struct tp_cpl_stats *st, + bool sleep_ok); +void t4_tp_get_rdma_stats(struct adapter *adap, struct tp_rdma_stats *st, + bool sleep_ok); +void t4_get_usm_stats(struct adapter *adap, struct tp_usm_stats *st, + bool sleep_ok); +void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4, + struct tp_tcp_stats *v6, bool sleep_ok); +void t4_get_fcoe_stats(struct adapter *adap, unsigned int idx, + struct tp_fcoe_stats *st, bool sleep_ok); +void t4_load_mtus(struct adapter *adap, const unsigned short *mtus, + const unsigned short *alpha, const unsigned short *beta); + +void t4_ulprx_read_la(struct adapter *adap, u32 *la_buf); + +void t4_get_chan_txrate(struct adapter *adap, u64 *nic_rate, u64 *ofld_rate); +void t4_mk_filtdelwr(unsigned int ftid, struct fw_filter_wr *wr, int qid); + +void t4_wol_magic_enable(struct adapter *adap, unsigned int port, + const u8 *addr); +int t4_wol_pat_enable(struct adapter *adap, unsigned int port, unsigned int map, + u64 mask0, u64 mask1, unsigned int crc, bool enable); + +int t4_fw_hello(struct adapter *adap, unsigned int mbox, unsigned int evt_mbox, + enum dev_master master, enum dev_state *state); +int t4_fw_bye(struct adapter *adap, unsigned int mbox); +int t4_early_init(struct adapter *adap, unsigned int mbox); +int t4_fw_reset(struct adapter *adap, unsigned int mbox, int reset); +int t4_fixup_host_params(struct adapter *adap, unsigned int page_size, + unsigned int cache_line_size); +int t4_fw_initialize(struct adapter *adap, unsigned int mbox); +int t4_query_params(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + u32 *val); +int t4_query_params_ns(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + u32 *val); +int t4_query_params_rw(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + u32 *val, int rw, bool sleep_ok); +int t4_set_params_timeout(struct adapter *adap, unsigned int mbox, + unsigned int pf, unsigned int vf, + unsigned int nparams, const u32 *params, + const u32 *val, int timeout); +int t4_set_params(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + const u32 *val); +int t4_cfg_pfvf(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int txq, unsigned int txq_eth_ctrl, + unsigned int rxqi, unsigned int rxq, unsigned int tc, + unsigned int vi, unsigned int cmask, unsigned int pmask, + unsigned int nexact, unsigned int rcaps, unsigned int wxcaps); +int t4_alloc_vi(struct adapter *adap, unsigned int mbox, unsigned int port, + unsigned int pf, unsigned int vf, unsigned int nmac, u8 *mac, + unsigned int *rss_size); +int t4_free_vi(struct adapter *adap, unsigned int mbox, + unsigned int pf, unsigned int vf, + unsigned int viid); +int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid, + int mtu, int promisc, int all_multi, int bcast, int vlanex, + bool sleep_ok); +int t4_free_raw_mac_filt(struct adapter *adap, unsigned int viid, + const u8 *addr, const u8 *mask, unsigned 
int idx, + u8 lookup_type, u8 port_id, bool sleep_ok); +int t4_alloc_raw_mac_filt(struct adapter *adap, unsigned int viid, + const u8 *addr, const u8 *mask, unsigned int idx, + u8 lookup_type, u8 port_id, bool sleep_ok); +int t4_alloc_mac_filt(struct adapter *adap, unsigned int mbox, + unsigned int viid, bool free, unsigned int naddr, + const u8 **addr, u16 *idx, u64 *hash, bool sleep_ok); +int t4_free_mac_filt(struct adapter *adap, unsigned int mbox, + unsigned int viid, unsigned int naddr, + const u8 **addr, bool sleep_ok); +int t4_change_mac(struct adapter *adap, unsigned int mbox, unsigned int viid, + int idx, const u8 *addr, bool persist, bool add_smt); +int t4_set_addr_hash(struct adapter *adap, unsigned int mbox, unsigned int viid, + bool ucast, u64 vec, bool sleep_ok); +int t4_enable_vi_params(struct adapter *adap, unsigned int mbox, + unsigned int viid, bool rx_en, bool tx_en, bool dcb_en); +int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid, + bool rx_en, bool tx_en); +int t4_identify_port(struct adapter *adap, unsigned int mbox, unsigned int viid, + unsigned int nblinks); +int t4_mdio_rd(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, + unsigned int mmd, unsigned int reg, u16 *valp); +int t4_mdio_wr(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, + unsigned int mmd, unsigned int reg, u16 val); +int t4_iq_stop(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int iqtype, unsigned int iqid, + unsigned int fl0id, unsigned int fl1id); +int t4_iq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int iqtype, unsigned int iqid, + unsigned int fl0id, unsigned int fl1id); +int t4_eth_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid); +int t4_ctrl_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid); +int t4_ofld_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid); +int t4_sge_ctxt_flush(struct adapter *adap, unsigned int mbox, int ctxt_type); +void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl); +int t4_update_port_info(struct port_info *pi); +int t4_get_link_params(struct port_info *pi, unsigned int *link_okp, + unsigned int *speedp, unsigned int *mtup); +int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl); +void t4_db_full(struct adapter *adapter); +void t4_db_dropped(struct adapter *adapter); +int t4_set_trace_filter(struct adapter *adapter, const struct trace_params *tp, + int filter_index, int enable); +void t4_get_trace_filter(struct adapter *adapter, struct trace_params *tp, + int filter_index, int *enabled); +int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox, + u32 addr, u32 val); +void t4_read_pace_tbl(struct adapter *adap, unsigned int pace_vals[NTX_SCHED]); +void t4_get_tx_sched(struct adapter *adap, unsigned int sched, + unsigned int *kbps, unsigned int *ipg, bool sleep_ok); +int t4_sge_ctxt_rd(struct adapter *adap, unsigned int mbox, unsigned int cid, + enum ctxt_type ctype, u32 *data); +int t4_sge_ctxt_rd_bd(struct adapter *adap, unsigned int cid, + enum ctxt_type ctype, u32 *data); +int t4_sched_params(struct adapter *adapter, int type, int level, int mode, + int rateunit, int ratemode, int channel, int class, + int minrate, int maxrate, int weight, int pktsize); +void t4_sge_decode_idma_state(struct adapter *adapter, int state); +void 
t4_idma_monitor_init(struct adapter *adapter, + struct sge_idma_monitor_state *idma); +void t4_idma_monitor(struct adapter *adapter, + struct sge_idma_monitor_state *idma, + int hz, int ticks); +int t4_set_vf_mac_acl(struct adapter *adapter, unsigned int vf, + unsigned int naddr, u8 *addr); +void t4_tp_pio_read(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok); +void t4_tp_tm_pio_read(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok); +void t4_tp_mib_read(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok); + +void t4_uld_mem_free(struct adapter *adap); +int t4_uld_mem_alloc(struct adapter *adap); +void t4_uld_clean_up(struct adapter *adap); +void t4_register_netevent_notifier(void); +int t4_i2c_rd(struct adapter *adap, unsigned int mbox, int port, + unsigned int devid, unsigned int offset, + unsigned int len, u8 *buf); +void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq, struct sge_fl *fl); +void free_tx_desc(struct adapter *adap, struct sge_txq *q, + unsigned int n, bool unmap); +void free_txq(struct adapter *adap, struct sge_txq *q); +void cxgb4_reclaim_completed_tx(struct adapter *adap, + struct sge_txq *q, bool unmap); +int cxgb4_map_skb(struct device *dev, const struct sk_buff *skb, + dma_addr_t *addr); +void cxgb4_inline_tx_skb(const struct sk_buff *skb, const struct sge_txq *q, + void *pos); +void cxgb4_write_sgl(const struct sk_buff *skb, struct sge_txq *q, + struct ulptx_sgl *sgl, u64 *end, unsigned int start, + const dma_addr_t *addr); +void cxgb4_ring_tx_db(struct adapter *adap, struct sge_txq *q, int n); +int t4_set_vlan_acl(struct adapter *adap, unsigned int mbox, unsigned int vf, + u16 vlan); +#endif /* __CXGB4_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c new file mode 100644 index 0000000..143686c --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c @@ -0,0 +1,490 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + */ + +#include "t4_regs.h" +#include "cxgb4.h" +#include "cxgb4_cudbg.h" +#include "cudbg_zlib.h" + +static const struct cxgb4_collect_entity cxgb4_collect_mem_dump[] = { + { CUDBG_EDC0, cudbg_collect_edc0_meminfo }, + { CUDBG_EDC1, cudbg_collect_edc1_meminfo }, + { CUDBG_MC0, cudbg_collect_mc0_meminfo }, + { CUDBG_MC1, cudbg_collect_mc1_meminfo }, + { CUDBG_HMA, cudbg_collect_hma_meminfo }, +}; + +static const struct cxgb4_collect_entity cxgb4_collect_hw_dump[] = { + { CUDBG_MBOX_LOG, cudbg_collect_mbox_log }, + { CUDBG_DEV_LOG, cudbg_collect_fw_devlog }, + { CUDBG_REG_DUMP, cudbg_collect_reg_dump }, + { CUDBG_CIM_LA, cudbg_collect_cim_la }, + { CUDBG_CIM_MA_LA, cudbg_collect_cim_ma_la }, + { CUDBG_CIM_QCFG, cudbg_collect_cim_qcfg }, + { CUDBG_CIM_IBQ_TP0, cudbg_collect_cim_ibq_tp0 }, + { CUDBG_CIM_IBQ_TP1, cudbg_collect_cim_ibq_tp1 }, + { CUDBG_CIM_IBQ_ULP, cudbg_collect_cim_ibq_ulp }, + { CUDBG_CIM_IBQ_SGE0, cudbg_collect_cim_ibq_sge0 }, + { CUDBG_CIM_IBQ_SGE1, cudbg_collect_cim_ibq_sge1 }, + { CUDBG_CIM_IBQ_NCSI, cudbg_collect_cim_ibq_ncsi }, + { CUDBG_CIM_OBQ_ULP0, cudbg_collect_cim_obq_ulp0 }, + { CUDBG_CIM_OBQ_ULP1, cudbg_collect_cim_obq_ulp1 }, + { CUDBG_CIM_OBQ_ULP2, cudbg_collect_cim_obq_ulp2 }, + { CUDBG_CIM_OBQ_ULP3, cudbg_collect_cim_obq_ulp3 }, + { CUDBG_CIM_OBQ_SGE, cudbg_collect_cim_obq_sge }, + { CUDBG_CIM_OBQ_NCSI, cudbg_collect_cim_obq_ncsi }, + { CUDBG_RSS, cudbg_collect_rss }, + { CUDBG_RSS_VF_CONF, cudbg_collect_rss_vf_config }, + { CUDBG_PATH_MTU, cudbg_collect_path_mtu }, + { CUDBG_PM_STATS, cudbg_collect_pm_stats }, + { CUDBG_HW_SCHED, cudbg_collect_hw_sched }, + { CUDBG_TP_INDIRECT, cudbg_collect_tp_indirect }, + { CUDBG_SGE_INDIRECT, cudbg_collect_sge_indirect }, + { CUDBG_ULPRX_LA, cudbg_collect_ulprx_la }, + { CUDBG_TP_LA, cudbg_collect_tp_la }, + { CUDBG_MEMINFO, cudbg_collect_meminfo }, + { CUDBG_CIM_PIF_LA, cudbg_collect_cim_pif_la }, + { CUDBG_CLK, cudbg_collect_clk_info }, + { CUDBG_CIM_OBQ_RXQ0, cudbg_collect_obq_sge_rx_q0 }, + { CUDBG_CIM_OBQ_RXQ1, cudbg_collect_obq_sge_rx_q1 }, + { CUDBG_PCIE_INDIRECT, cudbg_collect_pcie_indirect }, + { CUDBG_PM_INDIRECT, cudbg_collect_pm_indirect }, + { CUDBG_TID_INFO, cudbg_collect_tid }, + { CUDBG_PCIE_CONFIG, cudbg_collect_pcie_config }, + { CUDBG_DUMP_CONTEXT, cudbg_collect_dump_context }, + { CUDBG_MPS_TCAM, cudbg_collect_mps_tcam }, + { CUDBG_VPD_DATA, cudbg_collect_vpd_data }, + { CUDBG_LE_TCAM, cudbg_collect_le_tcam }, + { CUDBG_CCTRL, cudbg_collect_cctrl }, + { CUDBG_MA_INDIRECT, cudbg_collect_ma_indirect }, + { CUDBG_ULPTX_LA, cudbg_collect_ulptx_la }, + { CUDBG_UP_CIM_INDIRECT, cudbg_collect_up_cim_indirect }, + { CUDBG_PBT_TABLE, cudbg_collect_pbt_tables }, + { CUDBG_HMA_INDIRECT, cudbg_collect_hma_indirect }, +}; + +static u32 cxgb4_get_entity_length(struct adapter *adap, u32 entity) +{ + struct cudbg_tcam tcam_region = { 0 }; + u32 value, n = 0, len = 0; + + switch (entity) { + case CUDBG_REG_DUMP: + switch (CHELSIO_CHIP_VERSION(adap->params.chip)) { + case CHELSIO_T4: + len = T4_REGMAP_SIZE; + break; + case CHELSIO_T5: + case CHELSIO_T6: + len = T5_REGMAP_SIZE; + break; + default: + break; + } + break; + case CUDBG_DEV_LOG: + len = adap->params.devlog.size; + break; + case CUDBG_CIM_LA: + if (is_t6(adap->params.chip)) { + len = adap->params.cim_la_size / 10 + 1; + len *= 10 * sizeof(u32); + } else { + len = adap->params.cim_la_size / 8; + len *= 8 * sizeof(u32); + } + len += sizeof(u32); /* for reading CIM LA configuration */ + break; + case CUDBG_CIM_MA_LA: + len = 2 * CIM_MALA_SIZE * 5 * 
sizeof(u32); + break; + case CUDBG_CIM_QCFG: + len = sizeof(struct cudbg_cim_qcfg); + break; + case CUDBG_CIM_IBQ_TP0: + case CUDBG_CIM_IBQ_TP1: + case CUDBG_CIM_IBQ_ULP: + case CUDBG_CIM_IBQ_SGE0: + case CUDBG_CIM_IBQ_SGE1: + case CUDBG_CIM_IBQ_NCSI: + len = CIM_IBQ_SIZE * 4 * sizeof(u32); + break; + case CUDBG_CIM_OBQ_ULP0: + len = cudbg_cim_obq_size(adap, 0); + break; + case CUDBG_CIM_OBQ_ULP1: + len = cudbg_cim_obq_size(adap, 1); + break; + case CUDBG_CIM_OBQ_ULP2: + len = cudbg_cim_obq_size(adap, 2); + break; + case CUDBG_CIM_OBQ_ULP3: + len = cudbg_cim_obq_size(adap, 3); + break; + case CUDBG_CIM_OBQ_SGE: + len = cudbg_cim_obq_size(adap, 4); + break; + case CUDBG_CIM_OBQ_NCSI: + len = cudbg_cim_obq_size(adap, 5); + break; + case CUDBG_CIM_OBQ_RXQ0: + len = cudbg_cim_obq_size(adap, 6); + break; + case CUDBG_CIM_OBQ_RXQ1: + len = cudbg_cim_obq_size(adap, 7); + break; + case CUDBG_EDC0: + value = t4_read_reg(adap, MA_TARGET_MEM_ENABLE_A); + if (value & EDRAM0_ENABLE_F) { + value = t4_read_reg(adap, MA_EDRAM0_BAR_A); + len = EDRAM0_SIZE_G(value); + } + len = cudbg_mbytes_to_bytes(len); + break; + case CUDBG_EDC1: + value = t4_read_reg(adap, MA_TARGET_MEM_ENABLE_A); + if (value & EDRAM1_ENABLE_F) { + value = t4_read_reg(adap, MA_EDRAM1_BAR_A); + len = EDRAM1_SIZE_G(value); + } + len = cudbg_mbytes_to_bytes(len); + break; + case CUDBG_MC0: + value = t4_read_reg(adap, MA_TARGET_MEM_ENABLE_A); + if (value & EXT_MEM0_ENABLE_F) { + value = t4_read_reg(adap, MA_EXT_MEMORY0_BAR_A); + len = EXT_MEM0_SIZE_G(value); + } + len = cudbg_mbytes_to_bytes(len); + break; + case CUDBG_MC1: + value = t4_read_reg(adap, MA_TARGET_MEM_ENABLE_A); + if (value & EXT_MEM1_ENABLE_F) { + value = t4_read_reg(adap, MA_EXT_MEMORY1_BAR_A); + len = EXT_MEM1_SIZE_G(value); + } + len = cudbg_mbytes_to_bytes(len); + break; + case CUDBG_RSS: + len = t4_chip_rss_size(adap) * sizeof(u16); + break; + case CUDBG_RSS_VF_CONF: + len = adap->params.arch.vfcount * + sizeof(struct cudbg_rss_vf_conf); + break; + case CUDBG_PATH_MTU: + len = NMTUS * sizeof(u16); + break; + case CUDBG_PM_STATS: + len = sizeof(struct cudbg_pm_stats); + break; + case CUDBG_HW_SCHED: + len = sizeof(struct cudbg_hw_sched); + break; + case CUDBG_TP_INDIRECT: + switch (CHELSIO_CHIP_VERSION(adap->params.chip)) { + case CHELSIO_T5: + n = sizeof(t5_tp_pio_array) + + sizeof(t5_tp_tm_pio_array) + + sizeof(t5_tp_mib_index_array); + break; + case CHELSIO_T6: + n = sizeof(t6_tp_pio_array) + + sizeof(t6_tp_tm_pio_array) + + sizeof(t6_tp_mib_index_array); + break; + default: + break; + } + n = n / (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n; + break; + case CUDBG_SGE_INDIRECT: + len = sizeof(struct ireg_buf) * 2; + break; + case CUDBG_ULPRX_LA: + len = sizeof(struct cudbg_ulprx_la); + break; + case CUDBG_TP_LA: + len = sizeof(struct cudbg_tp_la) + TPLA_SIZE * sizeof(u64); + break; + case CUDBG_MEMINFO: + len = sizeof(struct cudbg_meminfo); + break; + case CUDBG_CIM_PIF_LA: + len = sizeof(struct cudbg_cim_pif_la); + len += 2 * CIM_PIFLA_SIZE * 6 * sizeof(u32); + break; + case CUDBG_CLK: + len = sizeof(struct cudbg_clk_info); + break; + case CUDBG_PCIE_INDIRECT: + n = sizeof(t5_pcie_pdbg_array) / (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n * 2; + break; + case CUDBG_PM_INDIRECT: + n = sizeof(t5_pm_rx_array) / (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n * 2; + break; + case CUDBG_TID_INFO: + len = sizeof(struct cudbg_tid_info_region_rev1); + break; + case CUDBG_PCIE_CONFIG: + len = sizeof(u32) * 
CUDBG_NUM_PCIE_CONFIG_REGS; + break; + case CUDBG_DUMP_CONTEXT: + len = cudbg_dump_context_size(adap); + break; + case CUDBG_MPS_TCAM: + len = sizeof(struct cudbg_mps_tcam) * + adap->params.arch.mps_tcam_size; + break; + case CUDBG_VPD_DATA: + len = sizeof(struct cudbg_vpd_data); + break; + case CUDBG_LE_TCAM: + cudbg_fill_le_tcam_info(adap, &tcam_region); + len = sizeof(struct cudbg_tcam) + + sizeof(struct cudbg_tid_data) * tcam_region.max_tid; + break; + case CUDBG_CCTRL: + len = sizeof(u16) * NMTUS * NCCTRL_WIN; + break; + case CUDBG_MA_INDIRECT: + if (CHELSIO_CHIP_VERSION(adap->params.chip) > CHELSIO_T5) { + n = sizeof(t6_ma_ireg_array) / + (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n * 2; + } + break; + case CUDBG_ULPTX_LA: + len = sizeof(struct cudbg_ulptx_la); + break; + case CUDBG_UP_CIM_INDIRECT: + n = 0; + if (is_t5(adap->params.chip)) + n = sizeof(t5_up_cim_reg_array) / + ((IREG_NUM_ELEM + 1) * sizeof(u32)); + else if (is_t6(adap->params.chip)) + n = sizeof(t6_up_cim_reg_array) / + ((IREG_NUM_ELEM + 1) * sizeof(u32)); + len = sizeof(struct ireg_buf) * n; + break; + case CUDBG_PBT_TABLE: + len = sizeof(struct cudbg_pbt_tables); + break; + case CUDBG_MBOX_LOG: + len = sizeof(struct cudbg_mbox_log) * adap->mbox_log->size; + break; + case CUDBG_HMA_INDIRECT: + if (CHELSIO_CHIP_VERSION(adap->params.chip) > CHELSIO_T5) { + n = sizeof(t6_hma_ireg_array) / + (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n; + } + break; + case CUDBG_HMA: + value = t4_read_reg(adap, MA_TARGET_MEM_ENABLE_A); + if (value & HMA_MUX_F) { + /* In T6, there's no MC1. So, HMA shares MC1 + * address space. + */ + value = t4_read_reg(adap, MA_EXT_MEMORY1_BAR_A); + len = EXT_MEM1_SIZE_G(value); + } + len = cudbg_mbytes_to_bytes(len); + break; + default: + break; + } + + return len; +} + +u32 cxgb4_get_dump_length(struct adapter *adap, u32 flag) +{ + u32 i, entity; + u32 len = 0; + u32 wsize; + + if (flag & CXGB4_ETH_DUMP_HW) { + for (i = 0; i < ARRAY_SIZE(cxgb4_collect_hw_dump); i++) { + entity = cxgb4_collect_hw_dump[i].entity; + len += cxgb4_get_entity_length(adap, entity); + } + } + + if (flag & CXGB4_ETH_DUMP_MEM) { + for (i = 0; i < ARRAY_SIZE(cxgb4_collect_mem_dump); i++) { + entity = cxgb4_collect_mem_dump[i].entity; + len += cxgb4_get_entity_length(adap, entity); + } + } + + /* If compression is enabled, a smaller destination buffer is enough */ + wsize = cudbg_get_workspace_size(); + if (wsize && len > CUDBG_DUMP_BUFF_SIZE) + len = CUDBG_DUMP_BUFF_SIZE; + + return len; +} + +static void cxgb4_cudbg_collect_entity(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + const struct cxgb4_collect_entity *e_arr, + u32 arr_size, void *buf, u32 *tot_size) +{ + struct cudbg_error cudbg_err = { 0 }; + struct cudbg_entity_hdr *entity_hdr; + u32 i, total_size = 0; + int ret; + + for (i = 0; i < arr_size; i++) { + const struct cxgb4_collect_entity *e = &e_arr[i]; + + entity_hdr = cudbg_get_entity_hdr(buf, e->entity); + entity_hdr->entity_type = e->entity; + entity_hdr->start_offset = dbg_buff->offset; + memset(&cudbg_err, 0, sizeof(struct cudbg_error)); + ret = e->collect_cb(pdbg_init, dbg_buff, &cudbg_err); + if (ret) { + entity_hdr->size = 0; + dbg_buff->offset = entity_hdr->start_offset; + } else { + cudbg_align_debug_buffer(dbg_buff, entity_hdr); + } + + /* Log error and continue with next entity */ + if (cudbg_err.sys_err) + ret = CUDBG_SYSTEM_ERROR; + + entity_hdr->hdr_flags = ret; + entity_hdr->sys_err = cudbg_err.sys_err; + entity_hdr->sys_warn = 
cudbg_err.sys_warn; + total_size += entity_hdr->size; + } + + *tot_size += total_size; +} + +static int cudbg_alloc_compress_buff(struct cudbg_init *pdbg_init) +{ + u32 workspace_size; + + workspace_size = cudbg_get_workspace_size(); + pdbg_init->compress_buff = vzalloc(CUDBG_COMPRESS_BUFF_SIZE + + workspace_size); + if (!pdbg_init->compress_buff) + return -ENOMEM; + + pdbg_init->compress_buff_size = CUDBG_COMPRESS_BUFF_SIZE; + pdbg_init->workspace = (u8 *)pdbg_init->compress_buff + + CUDBG_COMPRESS_BUFF_SIZE - workspace_size; + return 0; +} + +static void cudbg_free_compress_buff(struct cudbg_init *pdbg_init) +{ + if (pdbg_init->compress_buff) + vfree(pdbg_init->compress_buff); +} + +int cxgb4_cudbg_collect(struct adapter *adap, void *buf, u32 *buf_size, + u32 flag) +{ + struct cudbg_buffer dbg_buff = { 0 }; + u32 size, min_size, total_size = 0; + struct cudbg_init cudbg_init; + struct cudbg_hdr *cudbg_hdr; + int rc; + + size = *buf_size; + + memset(&cudbg_init, 0, sizeof(struct cudbg_init)); + cudbg_init.adap = adap; + cudbg_init.outbuf = buf; + cudbg_init.outbuf_size = size; + + dbg_buff.data = buf; + dbg_buff.size = size; + dbg_buff.offset = 0; + + cudbg_hdr = (struct cudbg_hdr *)buf; + cudbg_hdr->signature = CUDBG_SIGNATURE; + cudbg_hdr->hdr_len = sizeof(struct cudbg_hdr); + cudbg_hdr->major_ver = CUDBG_MAJOR_VERSION; + cudbg_hdr->minor_ver = CUDBG_MINOR_VERSION; + cudbg_hdr->max_entities = CUDBG_MAX_ENTITY; + cudbg_hdr->chip_ver = adap->params.chip; + cudbg_hdr->dump_type = CUDBG_DUMP_TYPE_MINI; + + min_size = sizeof(struct cudbg_hdr) + + sizeof(struct cudbg_entity_hdr) * + cudbg_hdr->max_entities; + if (size < min_size) + return -ENOMEM; + + rc = cudbg_get_workspace_size(); + if (rc) { + /* Zlib available. So, use zlib deflate */ + cudbg_init.compress_type = CUDBG_COMPRESSION_ZLIB; + rc = cudbg_alloc_compress_buff(&cudbg_init); + if (rc) { + /* Ignore error and continue without compression. */ + dev_warn(adap->pdev_dev, + "Fail allocating compression buffer ret: %d. Continuing without compression.\n", + rc); + cudbg_init.compress_type = CUDBG_COMPRESSION_NONE; + rc = 0; + } + } else { + cudbg_init.compress_type = CUDBG_COMPRESSION_NONE; + } + + cudbg_hdr->compress_type = cudbg_init.compress_type; + dbg_buff.offset += min_size; + total_size = dbg_buff.offset; + + if (flag & CXGB4_ETH_DUMP_HW) + cxgb4_cudbg_collect_entity(&cudbg_init, &dbg_buff, + cxgb4_collect_hw_dump, + ARRAY_SIZE(cxgb4_collect_hw_dump), + buf, + &total_size); + + if (flag & CXGB4_ETH_DUMP_MEM) + cxgb4_cudbg_collect_entity(&cudbg_init, &dbg_buff, + cxgb4_collect_mem_dump, + ARRAY_SIZE(cxgb4_collect_mem_dump), + buf, + &total_size); + + cudbg_free_compress_buff(&cudbg_init); + cudbg_hdr->data_len = total_size; + if (cudbg_init.compress_type != CUDBG_COMPRESSION_NONE) + *buf_size = size; + else + *buf_size = total_size; + return 0; +} + +void cxgb4_init_ethtool_dump(struct adapter *adapter) +{ + adapter->eth_dump.flag = CXGB4_ETH_DUMP_NONE; + adapter->eth_dump.version = adapter->params.fw_vers; + adapter->eth_dump.len = 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h new file mode 100644 index 0000000..ce1ac9a --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#ifndef __CXGB4_CUDBG_H__ +#define __CXGB4_CUDBG_H__ + +#include "cudbg_if.h" +#include "cudbg_lib_common.h" +#include "cudbg_entity.h" +#include "cudbg_lib.h" + +#define CUDBG_DUMP_BUFF_SIZE (32 * 1024 * 1024) /* 32 MB */ +#define CUDBG_COMPRESS_BUFF_SIZE (4 * 1024 * 1024) /* 4 MB */ + +typedef int (*cudbg_collect_callback_t)(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); + +struct cxgb4_collect_entity { + enum cudbg_dbg_entity_type entity; + cudbg_collect_callback_t collect_cb; +}; + +enum CXGB4_ETHTOOL_DUMP_FLAGS { + CXGB4_ETH_DUMP_NONE = ETH_FW_DUMP_DISABLE, + CXGB4_ETH_DUMP_MEM = (1 << 0), /* On-Chip Memory Dumps */ + CXGB4_ETH_DUMP_HW = (1 << 1), /* various FW and HW dumps */ +}; + +u32 cxgb4_get_dump_length(struct adapter *adap, u32 flag); +int cxgb4_cudbg_collect(struct adapter *adap, void *buf, u32 *buf_size, + u32 flag); +void cxgb4_init_ethtool_dump(struct adapter *adapter); +#endif /* __CXGB4_CUDBG_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c new file mode 100644 index 0000000..4e7f72b --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c @@ -0,0 +1,1266 @@ +/* + * Copyright (C) 2013-2014 Chelsio Communications. All rights reserved. + * + * Written by Anish Bhatt (anish@chelsio.com) + * Casey Leedom (leedom@chelsio.com) + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#include "cxgb4.h" + +/* DCBx version control + */ +static const char * const dcb_ver_array[] = { + "Unknown", + "DCBx-CIN", + "DCBx-CEE 1.01", + "DCBx-IEEE", + "", "", "", + "Auto Negotiated" +}; + +static inline bool cxgb4_dcb_state_synced(enum cxgb4_dcb_state state) +{ + if (state == CXGB4_DCB_STATE_FW_ALLSYNCED || + state == CXGB4_DCB_STATE_HOST) + return true; + else + return false; +} + +/* Initialize a port's Data Center Bridging state. 
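+ *
+ * Note that the negotiated DCBx version is preserved across the memset
+ * below, so a link bounce does not discard it.  Rough call flow on a
+ * link-down event (illustrative sketch, derived from cxgb4_dcb_reset()):
+ *
+ *	cxgb4_dcb_reset(dev)
+ *		cxgb4_dcb_cleanup_apps(dev)   - clear dcbnl app entries
+ *		cxgb4_dcb_state_init(dev)     - back to CXGB4_DCB_STATE_START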
+ */ +void cxgb4_dcb_state_init(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + int version_temp = dcb->dcb_version; + + memset(dcb, 0, sizeof(struct port_dcb_info)); + dcb->state = CXGB4_DCB_STATE_START; + if (version_temp) + dcb->dcb_version = version_temp; + + netdev_dbg(dev, "%s: Initializing DCB state for port[%d]\n", + __func__, pi->port_id); +} + +void cxgb4_dcb_version_init(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + + /* Any writes here are only done on kernels that exlicitly need + * a specific version, say < 2.6.38 which only support CEE + */ + dcb->dcb_version = FW_PORT_DCB_VER_AUTO; +} + +static void cxgb4_dcb_cleanup_apps(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + struct port_dcb_info *dcb = &pi->dcb; + struct dcb_app app; + int i, err; + + /* zero priority implies remove */ + app.priority = 0; + + for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) { + /* Check if app list is exhausted */ + if (!dcb->app_priority[i].protocolid) + break; + + app.protocol = dcb->app_priority[i].protocolid; + + if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE) { + app.priority = dcb->app_priority[i].user_prio_map; + app.selector = dcb->app_priority[i].sel_field + 1; + err = dcb_ieee_delapp(dev, &app); + } else { + app.selector = !!(dcb->app_priority[i].sel_field); + err = dcb_setapp(dev, &app); + } + + if (err) { + dev_err(adap->pdev_dev, + "Failed DCB Clear %s Application Priority: sel=%d, prot=%d, , err=%d\n", + dcb_ver_array[dcb->dcb_version], app.selector, + app.protocol, -err); + break; + } + } +} + +/* Reset a port's Data Center Bridging state. Typically used after a + * Link Down event. + */ +void cxgb4_dcb_reset(struct net_device *dev) +{ + cxgb4_dcb_cleanup_apps(dev); + cxgb4_dcb_state_init(dev); +} + +/* Finite State machine for Data Center Bridging. 
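+ *
+ * Transition summary, derived from the switch statement below
+ * (illustrative only):
+ *
+ *	START         --FW_DISABLED-->   HOST
+ *	START         --FW_ENABLED-->    FW_INCOMPLETE
+ *	START         --FW_ALLSYNCED-->  FW_ALLSYNCED
+ *	FW_INCOMPLETE --FW_ALLSYNCED-->  FW_ALLSYNCED   (DCB enabled)
+ *	FW_ALLSYNCED  --FW_INCOMPLETE--> FW_INCOMPLETE  (state is reset)
+ *
+ * Inputs that leave the state unchanged are accepted silently; anything
+ * else is logged as an illegal input or transition.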
+ */ +void cxgb4_dcb_state_fsm(struct net_device *dev, + enum cxgb4_dcb_state_input transition_to) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + struct adapter *adap = pi->adapter; + enum cxgb4_dcb_state current_state = dcb->state; + + netdev_dbg(dev, "%s: State change from %d to %d for %s\n", + __func__, dcb->state, transition_to, dev->name); + + switch (current_state) { + case CXGB4_DCB_STATE_START: { + switch (transition_to) { + case CXGB4_DCB_INPUT_FW_DISABLED: { + /* we're going to use Host DCB */ + dcb->state = CXGB4_DCB_STATE_HOST; + dcb->supported = CXGB4_DCBX_HOST_SUPPORT; + break; + } + + case CXGB4_DCB_INPUT_FW_ENABLED: { + /* we're going to use Firmware DCB */ + dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE; + dcb->supported = DCB_CAP_DCBX_LLD_MANAGED; + if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE) + dcb->supported |= DCB_CAP_DCBX_VER_IEEE; + else + dcb->supported |= DCB_CAP_DCBX_VER_CEE; + break; + } + + case CXGB4_DCB_INPUT_FW_INCOMPLETE: { + /* expected transition */ + break; + } + + case CXGB4_DCB_INPUT_FW_ALLSYNCED: { + dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED; + break; + } + + default: + goto bad_state_input; + } + break; + } + + case CXGB4_DCB_STATE_FW_INCOMPLETE: { + switch (transition_to) { + case CXGB4_DCB_INPUT_FW_ENABLED: { + /* we're alreaady in firmware DCB mode */ + break; + } + + case CXGB4_DCB_INPUT_FW_INCOMPLETE: { + /* we're already incomplete */ + break; + } + + case CXGB4_DCB_INPUT_FW_ALLSYNCED: { + dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED; + dcb->enabled = 1; + linkwatch_fire_event(dev); + break; + } + + default: + goto bad_state_input; + } + break; + } + + case CXGB4_DCB_STATE_FW_ALLSYNCED: { + switch (transition_to) { + case CXGB4_DCB_INPUT_FW_ENABLED: { + /* we're alreaady in firmware DCB mode */ + break; + } + + case CXGB4_DCB_INPUT_FW_INCOMPLETE: { + /* We were successfully running with firmware DCB but + * now it's telling us that it's in an "incomplete + * state. We need to reset back to a ground state + * of incomplete. + */ + cxgb4_dcb_reset(dev); + dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE; + dcb->supported = CXGB4_DCBX_FW_SUPPORT; + linkwatch_fire_event(dev); + break; + } + + case CXGB4_DCB_INPUT_FW_ALLSYNCED: { + /* we're already all sync'ed + * this is only applicable for IEEE or + * when another VI already completed negotiaton + */ + dcb->enabled = 1; + linkwatch_fire_event(dev); + break; + } + + default: + goto bad_state_input; + } + break; + } + + case CXGB4_DCB_STATE_HOST: { + switch (transition_to) { + case CXGB4_DCB_INPUT_FW_DISABLED: { + /* we're alreaady in Host DCB mode */ + break; + } + + default: + goto bad_state_input; + } + break; + } + + default: + goto bad_state_transition; + } + return; + +bad_state_input: + dev_err(adap->pdev_dev, "cxgb4_dcb_state_fsm: illegal input symbol %d\n", + transition_to); + return; + +bad_state_transition: + dev_err(adap->pdev_dev, "cxgb4_dcb_state_fsm: bad state transition, state = %d, input = %d\n", + current_state, transition_to); +} + +/* Handle a DCB/DCBX update message from the firmware. 
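+ *
+ * FW_PORT_DCB_TYPE_CONTROL messages drive the state machine above.  The
+ * remaining types (PGID, PGRATE, PRIORATE, PFC, APP_ID) are cached in
+ * struct port_dcb_info; application IDs are also pushed into the kernel
+ * dcbnl app table, and IEEE_FAUX_SYNC() feeds an "all synced" input to
+ * the state machine once the relevant IEEE parameters have arrived.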
+ */ +void cxgb4_dcb_handle_fw_update(struct adapter *adap, + const struct fw_port_cmd *pcmd) +{ + const union fw_port_dcb *fwdcb = &pcmd->u.dcb; + int port = FW_PORT_CMD_PORTID_G(be32_to_cpu(pcmd->op_to_portid)); + struct net_device *dev = adap->port[adap->chan_map[port]]; + struct port_info *pi = netdev_priv(dev); + struct port_dcb_info *dcb = &pi->dcb; + int dcb_type = pcmd->u.dcb.pgid.type; + int dcb_running_version; + + /* Handle Firmware DCB Control messages separately since they drive + * our state machine. + */ + if (dcb_type == FW_PORT_DCB_TYPE_CONTROL) { + enum cxgb4_dcb_state_input input = + ((pcmd->u.dcb.control.all_syncd_pkd & + FW_PORT_CMD_ALL_SYNCD_F) + ? CXGB4_DCB_STATE_FW_ALLSYNCED + : CXGB4_DCB_STATE_FW_INCOMPLETE); + + if (dcb->dcb_version != FW_PORT_DCB_VER_UNKNOWN) { + dcb_running_version = FW_PORT_CMD_DCB_VERSION_G( + be16_to_cpu( + pcmd->u.dcb.control.dcb_version_to_app_state)); + if (dcb_running_version == FW_PORT_DCB_VER_CEE1D01 || + dcb_running_version == FW_PORT_DCB_VER_IEEE) { + dcb->dcb_version = dcb_running_version; + dev_warn(adap->pdev_dev, "Interface %s is running %s\n", + dev->name, + dcb_ver_array[dcb->dcb_version]); + } else { + dev_warn(adap->pdev_dev, + "Something screwed up, requested firmware for %s, but firmware returned %s instead\n", + dcb_ver_array[dcb->dcb_version], + dcb_ver_array[dcb_running_version]); + dcb->dcb_version = FW_PORT_DCB_VER_UNKNOWN; + } + } + + cxgb4_dcb_state_fsm(dev, input); + return; + } + + /* It's weird, and almost certainly an error, to get Firmware DCB + * messages when we either haven't been told whether we're going to be + * doing Host or Firmware DCB; and even worse when we've been told + * that we're doing Host DCB! + */ + if (dcb->state == CXGB4_DCB_STATE_START || + dcb->state == CXGB4_DCB_STATE_HOST) { + dev_err(adap->pdev_dev, "Receiving Firmware DCB messages in State %d\n", + dcb->state); + return; + } + + /* Now handle the general Firmware DCB update messages ... 
+ */ + switch (dcb_type) { + case FW_PORT_DCB_TYPE_PGID: + dcb->pgid = be32_to_cpu(fwdcb->pgid.pgid); + dcb->msgs |= CXGB4_DCB_FW_PGID; + break; + + case FW_PORT_DCB_TYPE_PGRATE: + dcb->pg_num_tcs_supported = fwdcb->pgrate.num_tcs_supported; + memcpy(dcb->pgrate, &fwdcb->pgrate.pgrate, + sizeof(dcb->pgrate)); + memcpy(dcb->tsa, &fwdcb->pgrate.tsa, + sizeof(dcb->tsa)); + dcb->msgs |= CXGB4_DCB_FW_PGRATE; + if (dcb->msgs & CXGB4_DCB_FW_PGID) + IEEE_FAUX_SYNC(dev, dcb); + break; + + case FW_PORT_DCB_TYPE_PRIORATE: + memcpy(dcb->priorate, &fwdcb->priorate.strict_priorate, + sizeof(dcb->priorate)); + dcb->msgs |= CXGB4_DCB_FW_PRIORATE; + break; + + case FW_PORT_DCB_TYPE_PFC: + dcb->pfcen = fwdcb->pfc.pfcen; + dcb->pfc_num_tcs_supported = fwdcb->pfc.max_pfc_tcs; + dcb->msgs |= CXGB4_DCB_FW_PFC; + IEEE_FAUX_SYNC(dev, dcb); + break; + + case FW_PORT_DCB_TYPE_APP_ID: { + const struct fw_port_app_priority *fwap = &fwdcb->app_priority; + int idx = fwap->idx; + struct app_priority *ap = &dcb->app_priority[idx]; + + struct dcb_app app = { + .protocol = be16_to_cpu(fwap->protocolid), + }; + int err; + + /* Convert from firmware format to relevant format + * when using app selector + */ + if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE) { + app.selector = (fwap->sel_field + 1); + app.priority = ffs(fwap->user_prio_map) - 1; + err = dcb_ieee_setapp(dev, &app); + IEEE_FAUX_SYNC(dev, dcb); + } else { + /* Default is CEE */ + app.selector = !!(fwap->sel_field); + app.priority = fwap->user_prio_map; + err = dcb_setapp(dev, &app); + } + + if (err) + dev_err(adap->pdev_dev, + "Failed DCB Set Application Priority: sel=%d, prot=%d, prio=%d, err=%d\n", + app.selector, app.protocol, app.priority, -err); + + ap->user_prio_map = fwap->user_prio_map; + ap->sel_field = fwap->sel_field; + ap->protocolid = be16_to_cpu(fwap->protocolid); + dcb->msgs |= CXGB4_DCB_FW_APP_ID; + break; + } + + default: + dev_err(adap->pdev_dev, "Unknown DCB update type received %x\n", + dcb_type); + break; + } +} + +/* Data Center Bridging netlink operations. + */ + + +/* Get current DCB enabled/disabled state. + */ +static u8 cxgb4_getstate(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + + return pi->dcb.enabled; +} + +/* Set DCB enabled/disabled. + */ +static u8 cxgb4_setstate(struct net_device *dev, u8 enabled) +{ + struct port_info *pi = netdev2pinfo(dev); + + /* If DCBx is host-managed, dcb is enabled by outside lldp agents */ + if (pi->dcb.state == CXGB4_DCB_STATE_HOST) { + pi->dcb.enabled = enabled; + return 0; + } + + /* Firmware doesn't provide any mechanism to control the DCB state. 
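+	 * All this callback can therefore do is return success (0) when the
+	 * requested state already matches what the firmware has negotiated,
+	 * and report a failure (non-zero) otherwise.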
+ */ + if (enabled != (pi->dcb.state == CXGB4_DCB_STATE_FW_ALLSYNCED)) + return 1; + + return 0; +} + +static void cxgb4_getpgtccfg(struct net_device *dev, int tc, + u8 *prio_type, u8 *pgid, u8 *bw_per, + u8 *up_tc_map, int local) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int err; + + *prio_type = *pgid = *bw_per = *up_tc_map = 0; + + if (local) + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + else + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + + pcmd.u.dcb.pgid.type = FW_PORT_DCB_TYPE_PGID; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGID failed with %d\n", -err); + return; + } + *pgid = (be32_to_cpu(pcmd.u.dcb.pgid.pgid) >> (tc * 4)) & 0xf; + + if (local) + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + else + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return; + } + + *bw_per = pcmd.u.dcb.pgrate.pgrate[*pgid]; + *up_tc_map = (1 << tc); + + /* prio_type is link strict */ + if (*pgid != 0xF) + *prio_type = 0x2; +} + +static void cxgb4_getpgtccfg_tx(struct net_device *dev, int tc, + u8 *prio_type, u8 *pgid, u8 *bw_per, + u8 *up_tc_map) +{ + /* tc 0 is written at MSB position */ + return cxgb4_getpgtccfg(dev, (7 - tc), prio_type, pgid, bw_per, + up_tc_map, 1); +} + + +static void cxgb4_getpgtccfg_rx(struct net_device *dev, int tc, + u8 *prio_type, u8 *pgid, u8 *bw_per, + u8 *up_tc_map) +{ + /* tc 0 is written at MSB position */ + return cxgb4_getpgtccfg(dev, (7 - tc), prio_type, pgid, bw_per, + up_tc_map, 0); +} + +static void cxgb4_setpgtccfg_tx(struct net_device *dev, int tc, + u8 prio_type, u8 pgid, u8 bw_per, + u8 up_tc_map) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int fw_tc = 7 - tc; + u32 _pgid; + int err; + + if (pgid == DCB_ATTR_VALUE_UNDEFINED) + return; + if (bw_per == DCB_ATTR_VALUE_UNDEFINED) + return; + + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgid.type = FW_PORT_DCB_TYPE_PGID; + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGID failed with %d\n", -err); + return; + } + + _pgid = be32_to_cpu(pcmd.u.dcb.pgid.pgid); + _pgid &= ~(0xF << (fw_tc * 4)); + _pgid |= pgid << (fw_tc * 4); + pcmd.u.dcb.pgid.pgid = cpu_to_be32(_pgid); + + INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id); + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB write PGID failed with %d\n", + -err); + return; + } + + memset(&pcmd, 0, sizeof(struct fw_port_cmd)); + + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return; + } + + pcmd.u.dcb.pgrate.pgrate[pgid] = bw_per; + + INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id); + if (pi->dcb.state == CXGB4_DCB_STATE_HOST) + pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY_F); + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != 
FW_PORT_DCB_CFG_SUCCESS) + dev_err(adap->pdev_dev, "DCB write PGRATE failed with %d\n", + -err); +} + +static void cxgb4_getpgbwgcfg(struct net_device *dev, int pgid, u8 *bw_per, + int local) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int err; + + if (local) + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + else + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return; + } + + *bw_per = pcmd.u.dcb.pgrate.pgrate[pgid]; +} + +static void cxgb4_getpgbwgcfg_tx(struct net_device *dev, int pgid, u8 *bw_per) +{ + return cxgb4_getpgbwgcfg(dev, pgid, bw_per, 1); +} + +static void cxgb4_getpgbwgcfg_rx(struct net_device *dev, int pgid, u8 *bw_per) +{ + return cxgb4_getpgbwgcfg(dev, pgid, bw_per, 0); +} + +static void cxgb4_setpgbwgcfg_tx(struct net_device *dev, int pgid, + u8 bw_per) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int err; + + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return; + } + + pcmd.u.dcb.pgrate.pgrate[pgid] = bw_per; + + INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id); + if (pi->dcb.state == CXGB4_DCB_STATE_HOST) + pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY_F); + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + + if (err != FW_PORT_DCB_CFG_SUCCESS) + dev_err(adap->pdev_dev, "DCB write PGRATE failed with %d\n", + -err); +} + +/* Return whether the specified Traffic Class Priority has Priority Pause + * Frames enabled. + */ +static void cxgb4_getpfccfg(struct net_device *dev, int priority, u8 *pfccfg) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + + if (!cxgb4_dcb_state_synced(dcb->state) || + priority >= CXGB4_MAX_PRIORITY) + *pfccfg = 0; + else + *pfccfg = (pi->dcb.pfcen >> (7 - priority)) & 1; +} + +/* Enable/disable Priority Pause Frames for the specified Traffic Class + * Priority. + */ +static void cxgb4_setpfccfg(struct net_device *dev, int priority, u8 pfccfg) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int err; + + if (!cxgb4_dcb_state_synced(pi->dcb.state) || + priority >= CXGB4_MAX_PRIORITY) + return; + + INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id); + if (pi->dcb.state == CXGB4_DCB_STATE_HOST) + pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY_F); + + pcmd.u.dcb.pfc.type = FW_PORT_DCB_TYPE_PFC; + pcmd.u.dcb.pfc.pfcen = pi->dcb.pfcen; + + if (pfccfg) + pcmd.u.dcb.pfc.pfcen |= (1 << (7 - priority)); + else + pcmd.u.dcb.pfc.pfcen &= (~(1 << (7 - priority))); + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB PFC write failed with %d\n", -err); + return; + } + + pi->dcb.pfcen = pcmd.u.dcb.pfc.pfcen; +} + +static u8 cxgb4_setall(struct net_device *dev) +{ + return 0; +} + +/* Return DCB capabilities. 
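+ *
+ * Boolean capabilities are reported as true/false, the PG and PFC
+ * traffic class counts as a bitmap (0x80 means 8 traffic classes), and
+ * DCB_CAP_ATTR_DCBX as the DCB_CAP_DCBX_* mask cached in dcb.supported.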
+ */ +static u8 cxgb4_getcap(struct net_device *dev, int cap_id, u8 *caps) +{ + struct port_info *pi = netdev2pinfo(dev); + + switch (cap_id) { + case DCB_CAP_ATTR_PG: + case DCB_CAP_ATTR_PFC: + *caps = true; + break; + + case DCB_CAP_ATTR_PG_TCS: + /* 8 priorities for PG represented by bitmap */ + *caps = 0x80; + break; + + case DCB_CAP_ATTR_PFC_TCS: + /* 8 priorities for PFC represented by bitmap */ + *caps = 0x80; + break; + + case DCB_CAP_ATTR_GSP: + *caps = true; + break; + + case DCB_CAP_ATTR_UP2TC: + case DCB_CAP_ATTR_BCN: + *caps = false; + break; + + case DCB_CAP_ATTR_DCBX: + *caps = pi->dcb.supported; + break; + + default: + *caps = false; + } + + return 0; +} + +/* Return the number of Traffic Classes for the indicated Traffic Class ID. + */ +static int cxgb4_getnumtcs(struct net_device *dev, int tcs_id, u8 *num) +{ + struct port_info *pi = netdev2pinfo(dev); + + switch (tcs_id) { + case DCB_NUMTCS_ATTR_PG: + if (pi->dcb.msgs & CXGB4_DCB_FW_PGRATE) + *num = pi->dcb.pg_num_tcs_supported; + else + *num = 0x8; + break; + + case DCB_NUMTCS_ATTR_PFC: + *num = 0x8; + break; + + default: + return -EINVAL; + } + + return 0; +} + +/* Set the number of Traffic Classes supported for the indicated Traffic Class + * ID. + */ +static int cxgb4_setnumtcs(struct net_device *dev, int tcs_id, u8 num) +{ + /* Setting the number of Traffic Classes isn't supported. + */ + return -ENOSYS; +} + +/* Return whether Priority Flow Control is enabled. */ +static u8 cxgb4_getpfcstate(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + + if (!cxgb4_dcb_state_synced(pi->dcb.state)) + return false; + + return pi->dcb.pfcen != 0; +} + +/* Enable/disable Priority Flow Control. */ +static void cxgb4_setpfcstate(struct net_device *dev, u8 state) +{ + /* We can't enable/disable Priority Flow Control but we also can't + * return an error ... + */ +} + +/* Return the Application User Priority Map associated with the specified + * Application ID. + */ +static int __cxgb4_getapp(struct net_device *dev, u8 app_idtype, u16 app_id, + int peer) +{ + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int i; + + if (!cxgb4_dcb_state_synced(pi->dcb.state)) + return 0; + + for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) { + struct fw_port_cmd pcmd; + int err; + + if (peer) + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + else + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + + pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID; + pcmd.u.dcb.app_priority.idx = i; + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB APP read failed with %d\n", + -err); + return err; + } + if (be16_to_cpu(pcmd.u.dcb.app_priority.protocolid) == app_id) + if (pcmd.u.dcb.app_priority.sel_field == app_idtype) + return pcmd.u.dcb.app_priority.user_prio_map; + + /* exhausted app list */ + if (!pcmd.u.dcb.app_priority.protocolid) + break; + } + + return -EEXIST; +} + +/* Return the Application User Priority Map associated with the specified + * Application ID. + */ +static int cxgb4_getapp(struct net_device *dev, u8 app_idtype, u16 app_id) +{ + /* Convert app_idtype to firmware format before querying */ + return __cxgb4_getapp(dev, app_idtype == DCB_APP_IDTYPE_ETHTYPE ? 
+ app_idtype : 3, app_id, 0); +} + +/* Write a new Application User Priority Map for the specified Application ID + */ +static int __cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id, + u8 app_prio) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int i, err; + + + if (!cxgb4_dcb_state_synced(pi->dcb.state)) + return -EINVAL; + + /* DCB info gets thrown away on link up */ + if (!netif_carrier_ok(dev)) + return -ENOLINK; + + for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) { + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID; + pcmd.u.dcb.app_priority.idx = i; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB app table read failed with %d\n", + -err); + return err; + } + if (be16_to_cpu(pcmd.u.dcb.app_priority.protocolid) == app_id) { + /* overwrite existing app table */ + pcmd.u.dcb.app_priority.protocolid = 0; + break; + } + /* find first empty slot */ + if (!pcmd.u.dcb.app_priority.protocolid) + break; + } + + if (i == CXGB4_MAX_DCBX_APP_SUPPORTED) { + /* no empty slots available */ + dev_err(adap->pdev_dev, "DCB app table full\n"); + return -EBUSY; + } + + /* write out new app table entry */ + INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id); + if (pi->dcb.state == CXGB4_DCB_STATE_HOST) + pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY_F); + + pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID; + pcmd.u.dcb.app_priority.protocolid = cpu_to_be16(app_id); + pcmd.u.dcb.app_priority.sel_field = app_idtype; + pcmd.u.dcb.app_priority.user_prio_map = app_prio; + pcmd.u.dcb.app_priority.idx = i; + + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB app table write failed with %d\n", + -err); + return err; + } + + return 0; +} + +/* Priority for CEE inside dcb_app is bitmask, with 0 being an invalid value */ +static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id, + u8 app_prio) +{ + int ret; + struct dcb_app app = { + .selector = app_idtype, + .protocol = app_id, + .priority = app_prio, + }; + + if (app_idtype != DCB_APP_IDTYPE_ETHTYPE && + app_idtype != DCB_APP_IDTYPE_PORTNUM) + return -EINVAL; + + /* Convert app_idtype to a format that firmware understands */ + ret = __cxgb4_setapp(dev, app_idtype == DCB_APP_IDTYPE_ETHTYPE ? + app_idtype : 3, app_id, app_prio); + if (ret) + return ret; + + return dcb_setapp(dev, &app); +} + +/* Return whether IEEE Data Center Bridging has been negotiated. 
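+ *
+ * Returns non-zero only when the DCB state is synced (see
+ * cxgb4_dcb_state_synced()) and the negotiated capabilities include
+ * DCB_CAP_DCBX_VER_IEEE.  The optional dcb_subtype argument additionally
+ * requires that the corresponding firmware message has been seen.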
+ */ +static inline int +cxgb4_ieee_negotiation_complete(struct net_device *dev, + enum cxgb4_dcb_fw_msgs dcb_subtype) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + + if (dcb->state == CXGB4_DCB_STATE_FW_ALLSYNCED) + if (dcb_subtype && !(dcb->msgs & dcb_subtype)) + return 0; + + return (cxgb4_dcb_state_synced(dcb->state) && + (dcb->supported & DCB_CAP_DCBX_VER_IEEE)); +} + +static int cxgb4_ieee_read_ets(struct net_device *dev, struct ieee_ets *ets, + int local) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + struct adapter *adap = pi->adapter; + uint32_t tc_info; + struct fw_port_cmd pcmd; + int i, bwg, err; + + if (!(dcb->msgs & (CXGB4_DCB_FW_PGID | CXGB4_DCB_FW_PGRATE))) + return 0; + + ets->ets_cap = dcb->pg_num_tcs_supported; + + if (local) { + ets->willing = 1; + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + } else { + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + } + + pcmd.u.dcb.pgid.type = FW_PORT_DCB_TYPE_PGID; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGID failed with %d\n", -err); + return err; + } + + tc_info = be32_to_cpu(pcmd.u.dcb.pgid.pgid); + + if (local) + INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id); + else + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return err; + } + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + bwg = (tc_info >> ((7 - i) * 4)) & 0xF; + ets->prio_tc[i] = bwg; + ets->tc_tx_bw[i] = pcmd.u.dcb.pgrate.pgrate[i]; + ets->tc_rx_bw[i] = ets->tc_tx_bw[i]; + ets->tc_tsa[i] = pcmd.u.dcb.pgrate.tsa[i]; + } + + return 0; +} + +static int cxgb4_ieee_get_ets(struct net_device *dev, struct ieee_ets *ets) +{ + return cxgb4_ieee_read_ets(dev, ets, 1); +} + +/* We reuse this for peer PFC as well, as we can't have it enabled one way */ +static int cxgb4_ieee_get_pfc(struct net_device *dev, struct ieee_pfc *pfc) +{ + struct port_info *pi = netdev2pinfo(dev); + struct port_dcb_info *dcb = &pi->dcb; + + memset(pfc, 0, sizeof(struct ieee_pfc)); + + if (!(dcb->msgs & CXGB4_DCB_FW_PFC)) + return 0; + + pfc->pfc_cap = dcb->pfc_num_tcs_supported; + pfc->pfc_en = bitswap_1(dcb->pfcen); + + return 0; +} + +static int cxgb4_ieee_peer_ets(struct net_device *dev, struct ieee_ets *ets) +{ + return cxgb4_ieee_read_ets(dev, ets, 0); +} + +/* Fill in the Application User Priority Map associated with the + * specified Application. + * Priority for IEEE dcb_app is an integer, with 0 being a valid value + */ +static int cxgb4_ieee_getapp(struct net_device *dev, struct dcb_app *app) +{ + int prio; + + if (!cxgb4_ieee_negotiation_complete(dev, CXGB4_DCB_FW_APP_ID)) + return -EINVAL; + if (!(app->selector && app->protocol)) + return -EINVAL; + + /* Try querying firmware first, use firmware format */ + prio = __cxgb4_getapp(dev, app->selector - 1, app->protocol, 0); + + if (prio < 0) + prio = dcb_ieee_getapp_mask(dev, app); + + app->priority = ffs(prio) - 1; + return 0; +} + +/* Write a new Application User Priority Map for the specified Application ID. 
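+ * The selector and priority are translated to the firmware encoding
+ * (selector - 1, and a priority bitmap built as 1 << app->priority)
+ * before being written via __cxgb4_setapp(); on success the entry is
+ * also mirrored into the kernel IEEE app table with dcb_ieee_setapp().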
+ * Priority for IEEE dcb_app is an integer, with 0 being a valid value + */ +static int cxgb4_ieee_setapp(struct net_device *dev, struct dcb_app *app) +{ + int ret; + + if (!cxgb4_ieee_negotiation_complete(dev, CXGB4_DCB_FW_APP_ID)) + return -EINVAL; + if (!(app->selector && app->protocol)) + return -EINVAL; + + if (!(app->selector > IEEE_8021QAZ_APP_SEL_ETHERTYPE && + app->selector < IEEE_8021QAZ_APP_SEL_ANY)) + return -EINVAL; + + /* change selector to a format that firmware understands */ + ret = __cxgb4_setapp(dev, app->selector - 1, app->protocol, + (1 << app->priority)); + if (ret) + return ret; + + return dcb_ieee_setapp(dev, app); +} + +/* Return our DCBX parameters. + */ +static u8 cxgb4_getdcbx(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + + /* This is already set by cxgb4_set_dcb_caps, so just return it */ + return pi->dcb.supported; +} + +/* Set our DCBX parameters. + */ +static u8 cxgb4_setdcbx(struct net_device *dev, u8 dcb_request) +{ + struct port_info *pi = netdev2pinfo(dev); + + /* Filter out requests which exceed our capabilities. + */ + if ((dcb_request & (CXGB4_DCBX_FW_SUPPORT | CXGB4_DCBX_HOST_SUPPORT)) + != dcb_request) + return 1; + + /* Can't enable DCB if we haven't successfully negotiated it. + */ + if (!cxgb4_dcb_state_synced(pi->dcb.state)) + return 1; + + /* There's currently no mechanism to allow for the firmware DCBX + * negotiation to be changed from the Host Driver. If the caller + * requests exactly the same parameters that we already have then + * we'll allow them to be successfully "set" ... + */ + if (dcb_request != pi->dcb.supported) + return 1; + + pi->dcb.supported = dcb_request; + return 0; +} + +static int cxgb4_getpeer_app(struct net_device *dev, + struct dcb_peer_app_info *info, u16 *app_count) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int i, err = 0; + + if (!cxgb4_dcb_state_synced(pi->dcb.state)) + return 1; + + info->willing = 0; + info->error = 0; + + *app_count = 0; + for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) { + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID; + pcmd.u.dcb.app_priority.idx = *app_count; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB app table read failed with %d\n", + -err); + return err; + } + + /* find first empty slot */ + if (!pcmd.u.dcb.app_priority.protocolid) + break; + } + *app_count = i; + return err; +} + +static int cxgb4_getpeerapp_tbl(struct net_device *dev, struct dcb_app *table) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + int i, err = 0; + + if (!cxgb4_dcb_state_synced(pi->dcb.state)) + return 1; + + for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) { + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID; + pcmd.u.dcb.app_priority.idx = i; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB app table read failed with %d\n", + -err); + return err; + } + + /* find first empty slot */ + if (!pcmd.u.dcb.app_priority.protocolid) + break; + + table[i].selector = (pcmd.u.dcb.app_priority.sel_field + 1); + table[i].protocol = + be16_to_cpu(pcmd.u.dcb.app_priority.protocolid); + table[i].priority = + ffs(pcmd.u.dcb.app_priority.user_prio_map) - 1; + } + 
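+
+	/* Assumption: the dcbnl core sizes table[] from the count returned
+	 * by cxgb4_getpeer_app(), so any tail entries past the first empty
+	 * firmware slot are intentionally left untouched here.
+	 */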
return err; +} + +/* Return Priority Group information. + */ +static int cxgb4_cee_peer_getpg(struct net_device *dev, struct cee_pg *pg) +{ + struct fw_port_cmd pcmd; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = pi->adapter; + u32 pgid; + int i, err; + + /* We're always "willing" -- the Switch Fabric always dictates the + * DCBX parameters to us. + */ + pg->willing = true; + + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgid.type = FW_PORT_DCB_TYPE_PGID; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGID failed with %d\n", -err); + return err; + } + pgid = be32_to_cpu(pcmd.u.dcb.pgid.pgid); + + for (i = 0; i < CXGB4_MAX_PRIORITY; i++) + pg->prio_pg[7 - i] = (pgid >> (i * 4)) & 0xF; + + INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id); + pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE; + err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd); + if (err != FW_PORT_DCB_CFG_SUCCESS) { + dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n", + -err); + return err; + } + + for (i = 0; i < CXGB4_MAX_PRIORITY; i++) + pg->pg_bw[i] = pcmd.u.dcb.pgrate.pgrate[i]; + + pg->tcs_supported = pcmd.u.dcb.pgrate.num_tcs_supported; + + return 0; +} + +/* Return Priority Flow Control information. + */ +static int cxgb4_cee_peer_getpfc(struct net_device *dev, struct cee_pfc *pfc) +{ + struct port_info *pi = netdev2pinfo(dev); + + cxgb4_getnumtcs(dev, DCB_NUMTCS_ATTR_PFC, &(pfc->tcs_supported)); + + /* Firmware sends this to us in a formwat that is a bit flipped version + * of spec, correct it before we send it to host. This is taken care of + * by bit shifting in other uses of pfcen + */ + pfc->pfc_en = bitswap_1(pi->dcb.pfcen); + + pfc->tcs_supported = pi->dcb.pfc_num_tcs_supported; + + return 0; +} + +const struct dcbnl_rtnl_ops cxgb4_dcb_ops = { + .ieee_getets = cxgb4_ieee_get_ets, + .ieee_getpfc = cxgb4_ieee_get_pfc, + .ieee_getapp = cxgb4_ieee_getapp, + .ieee_setapp = cxgb4_ieee_setapp, + .ieee_peer_getets = cxgb4_ieee_peer_ets, + .ieee_peer_getpfc = cxgb4_ieee_get_pfc, + + /* CEE std */ + .getstate = cxgb4_getstate, + .setstate = cxgb4_setstate, + .getpgtccfgtx = cxgb4_getpgtccfg_tx, + .getpgbwgcfgtx = cxgb4_getpgbwgcfg_tx, + .getpgtccfgrx = cxgb4_getpgtccfg_rx, + .getpgbwgcfgrx = cxgb4_getpgbwgcfg_rx, + .setpgtccfgtx = cxgb4_setpgtccfg_tx, + .setpgbwgcfgtx = cxgb4_setpgbwgcfg_tx, + .setpfccfg = cxgb4_setpfccfg, + .getpfccfg = cxgb4_getpfccfg, + .setall = cxgb4_setall, + .getcap = cxgb4_getcap, + .getnumtcs = cxgb4_getnumtcs, + .setnumtcs = cxgb4_setnumtcs, + .getpfcstate = cxgb4_getpfcstate, + .setpfcstate = cxgb4_setpfcstate, + .getapp = cxgb4_getapp, + .setapp = cxgb4_setapp, + + /* DCBX configuration */ + .getdcbx = cxgb4_getdcbx, + .setdcbx = cxgb4_setdcbx, + + /* peer apps */ + .peer_getappinfo = cxgb4_getpeer_app, + .peer_getapptable = cxgb4_getpeerapp_tbl, + + /* CEE peer */ + .cee_peer_getpg = cxgb4_cee_peer_getpg, + .cee_peer_getpfc = cxgb4_cee_peer_getpfc, +}; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h new file mode 100644 index 0000000..02040b9 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2013-2014 Chelsio Communications. All rights reserved. 
+ * + * Written by Anish Bhatt (anish@chelsio.com) + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#ifndef __CXGB4_DCB_H +#define __CXGB4_DCB_H + +#include +#include +#include + +#ifdef CONFIG_CHELSIO_T4_DCB + +#define CXGB4_DCBX_FW_SUPPORT \ + (DCB_CAP_DCBX_VER_CEE | \ + DCB_CAP_DCBX_VER_IEEE | \ + DCB_CAP_DCBX_LLD_MANAGED) +#define CXGB4_DCBX_HOST_SUPPORT \ + (DCB_CAP_DCBX_VER_CEE | \ + DCB_CAP_DCBX_VER_IEEE | \ + DCB_CAP_DCBX_HOST) + +#define CXGB4_MAX_PRIORITY CXGB4_MAX_DCBX_APP_SUPPORTED +#define CXGB4_MAX_TCS CXGB4_MAX_DCBX_APP_SUPPORTED + +#define INIT_PORT_DCB_CMD(__pcmd, __port, __op, __action) \ + do { \ + memset(&(__pcmd), 0, sizeof(__pcmd)); \ + (__pcmd).op_to_portid = \ + cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) | \ + FW_CMD_REQUEST_F | \ + FW_CMD_##__op##_F | \ + FW_PORT_CMD_PORTID_V(__port)); \ + (__pcmd).action_to_len16 = \ + cpu_to_be32(FW_PORT_CMD_ACTION_V(__action) | \ + FW_LEN16(pcmd)); \ + } while (0) + +#define INIT_PORT_DCB_READ_PEER_CMD(__pcmd, __port) \ + INIT_PORT_DCB_CMD(__pcmd, __port, READ, FW_PORT_ACTION_DCB_READ_RECV) + +#define INIT_PORT_DCB_READ_LOCAL_CMD(__pcmd, __port) \ + INIT_PORT_DCB_CMD(__pcmd, __port, READ, FW_PORT_ACTION_DCB_READ_TRANS) + +#define INIT_PORT_DCB_READ_SYNC_CMD(__pcmd, __port) \ + INIT_PORT_DCB_CMD(__pcmd, __port, READ, FW_PORT_ACTION_DCB_READ_DET) + +#define INIT_PORT_DCB_WRITE_CMD(__pcmd, __port) \ + INIT_PORT_DCB_CMD(__pcmd, __port, EXEC, FW_PORT_ACTION_L2_DCB_CFG) + +#define IEEE_FAUX_SYNC(__dev, __dcb) \ + do { \ + if ((__dcb)->dcb_version == FW_PORT_DCB_VER_IEEE) \ + cxgb4_dcb_state_fsm((__dev), \ + CXGB4_DCB_STATE_FW_ALLSYNCED); \ + } while (0) + +/* States we can be in for a port's Data Center Bridging. + */ +enum cxgb4_dcb_state { + CXGB4_DCB_STATE_START, /* initial unknown state */ + CXGB4_DCB_STATE_HOST, /* we're using Host DCB (if at all) */ + CXGB4_DCB_STATE_FW_INCOMPLETE, /* using firmware DCB, incomplete */ + CXGB4_DCB_STATE_FW_ALLSYNCED, /* using firmware DCB, all sync'ed */ +}; + +/* Data Center Bridging state input for the Finite State Machine. + */ +enum cxgb4_dcb_state_input { + /* Input from the firmware. + */ + CXGB4_DCB_INPUT_FW_DISABLED, /* firmware DCB disabled */ + CXGB4_DCB_INPUT_FW_ENABLED, /* firmware DCB enabled */ + CXGB4_DCB_INPUT_FW_INCOMPLETE, /* firmware reports incomplete DCB */ + CXGB4_DCB_INPUT_FW_ALLSYNCED, /* firmware reports all sync'ed */ + +}; + +/* Firmware DCB messages that we've received so far ... 
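+ *
+ * These are bit flags OR-ed into port_dcb_info.msgs as each firmware
+ * update arrives; cxgb4_dcb_handle_fw_update() and
+ * cxgb4_ieee_negotiation_complete() test them to decide when enough
+ * DCB state has been received.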
+ */ +enum cxgb4_dcb_fw_msgs { + CXGB4_DCB_FW_PGID = 0x01, + CXGB4_DCB_FW_PGRATE = 0x02, + CXGB4_DCB_FW_PRIORATE = 0x04, + CXGB4_DCB_FW_PFC = 0x08, + CXGB4_DCB_FW_APP_ID = 0x10, +}; + +#define CXGB4_MAX_DCBX_APP_SUPPORTED 8 + +/* Data Center Bridging support; + */ +struct port_dcb_info { + enum cxgb4_dcb_state state; /* DCB State Machine */ + enum cxgb4_dcb_fw_msgs msgs; /* DCB Firmware messages received */ + unsigned int supported; /* OS DCB capabilities supported */ + bool enabled; /* OS Enabled state */ + + /* Cached copies of DCB information sent by the firmware (in Host + * Native Endian format). + */ + u32 pgid; /* Priority Group[0..7] */ + u8 dcb_version; /* Running DCBx version */ + u8 pfcen; /* Priority Flow Control[0..7] */ + u8 pg_num_tcs_supported; /* max PG Traffic Classes */ + u8 pfc_num_tcs_supported; /* max PFC Traffic Classes */ + u8 pgrate[8]; /* Priority Group Rate[0..7] */ + u8 priorate[8]; /* Priority Rate[0..7] */ + u8 tsa[8]; /* TSA Algorithm[0..7] */ + struct app_priority { /* Application Information */ + u8 user_prio_map; /* Priority Map bitfield */ + u8 sel_field; /* Protocol ID interpretation */ + u16 protocolid; /* Protocol ID */ + } app_priority[CXGB4_MAX_DCBX_APP_SUPPORTED]; +}; + +void cxgb4_dcb_state_init(struct net_device *); +void cxgb4_dcb_version_init(struct net_device *); +void cxgb4_dcb_reset(struct net_device *dev); +void cxgb4_dcb_state_fsm(struct net_device *, enum cxgb4_dcb_state_input); +void cxgb4_dcb_handle_fw_update(struct adapter *, const struct fw_port_cmd *); +void cxgb4_dcb_set_caps(struct adapter *, const struct fw_port_cmd *); +extern const struct dcbnl_rtnl_ops cxgb4_dcb_ops; + +static inline __u8 bitswap_1(unsigned char val) +{ + return ((val & 0x80) >> 7) | + ((val & 0x40) >> 5) | + ((val & 0x20) >> 3) | + ((val & 0x10) >> 1) | + ((val & 0x08) << 1) | + ((val & 0x04) << 3) | + ((val & 0x02) << 5) | + ((val & 0x01) << 7); +} +#define CXGB4_DCB_ENABLED true + +#else /* !CONFIG_CHELSIO_T4_DCB */ + +static inline void cxgb4_dcb_state_init(struct net_device *dev) +{ +} + +#define CXGB4_DCB_ENABLED false + +#endif /* !CONFIG_CHELSIO_T4_DCB */ + +#endif /* __CXGB4_DCB_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c new file mode 100644 index 0000000..251d5bd --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -0,0 +1,3061 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "cxgb4.h" +#include "t4_regs.h" +#include "t4_values.h" +#include "t4fw_api.h" +#include "cxgb4_debugfs.h" +#include "clip_tbl.h" +#include "l2t.h" +#include "cudbg_if.h" +#include "cudbg_lib_common.h" +#include "cudbg_entity.h" +#include "cudbg_lib.h" + +/* generic seq_file support for showing a table of size rows x width. */ +static void *seq_tab_get_idx(struct seq_tab *tb, loff_t pos) +{ + pos -= tb->skip_first; + return pos >= tb->rows ? NULL : &tb->data[pos * tb->width]; +} + +static void *seq_tab_start(struct seq_file *seq, loff_t *pos) +{ + struct seq_tab *tb = seq->private; + + if (tb->skip_first && *pos == 0) + return SEQ_START_TOKEN; + + return seq_tab_get_idx(tb, *pos); +} + +static void *seq_tab_next(struct seq_file *seq, void *v, loff_t *pos) +{ + v = seq_tab_get_idx(seq->private, *pos + 1); + if (v) + ++*pos; + return v; +} + +static void seq_tab_stop(struct seq_file *seq, void *v) +{ +} + +static int seq_tab_show(struct seq_file *seq, void *v) +{ + const struct seq_tab *tb = seq->private; + + return tb->show(seq, v, ((char *)v - tb->data) / tb->width); +} + +static const struct seq_operations seq_tab_ops = { + .start = seq_tab_start, + .next = seq_tab_next, + .stop = seq_tab_stop, + .show = seq_tab_show +}; + +struct seq_tab *seq_open_tab(struct file *f, unsigned int rows, + unsigned int width, unsigned int have_header, + int (*show)(struct seq_file *seq, void *v, int i)) +{ + struct seq_tab *p; + + p = __seq_open_private(f, &seq_tab_ops, sizeof(*p) + rows * width); + if (p) { + p->show = show; + p->rows = rows; + p->width = width; + p->skip_first = have_header != 0; + } + return p; +} + +/* Trim the size of a seq_tab to the supplied number of rows. The operation is + * irreversible. 
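+ *
+ * Intended usage (see cim_obq_open() below): open the table sized for
+ * the worst case with seq_open_tab(), fill p->data from hardware, then
+ * trim the table down to the number of rows actually returned.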
+ */ +static int seq_tab_trim(struct seq_tab *p, unsigned int new_rows) +{ + if (new_rows > p->rows) + return -EINVAL; + p->rows = new_rows; + return 0; +} + +static int cim_la_show(struct seq_file *seq, void *v, int idx) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Status Data PC LS0Stat LS0Addr " + " LS0Data\n"); + else { + const u32 *p = v; + + seq_printf(seq, + " %02x %x%07x %x%07x %08x %08x %08x%08x%08x%08x\n", + (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, + p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5], + p[6], p[7]); + } + return 0; +} + +static int cim_la_show_3in1(struct seq_file *seq, void *v, int idx) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Status Data PC\n"); + } else { + const u32 *p = v; + + seq_printf(seq, " %02x %08x %08x\n", p[5] & 0xff, p[6], + p[7]); + seq_printf(seq, " %02x %02x%06x %02x%06x\n", + (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8, + p[4] & 0xff, p[5] >> 8); + seq_printf(seq, " %02x %x%07x %x%07x\n", (p[0] >> 4) & 0xff, + p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4); + } + return 0; +} + +static int cim_la_show_t6(struct seq_file *seq, void *v, int idx) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Status Inst Data PC LS0Stat " + "LS0Addr LS0Data LS1Stat LS1Addr LS1Data\n"); + } else { + const u32 *p = v; + + seq_printf(seq, " %02x %04x%04x %04x%04x %04x%04x %08x %08x %08x %08x %08x %08x\n", + (p[9] >> 16) & 0xff, /* Status */ + p[9] & 0xffff, p[8] >> 16, /* Inst */ + p[8] & 0xffff, p[7] >> 16, /* Data */ + p[7] & 0xffff, p[6] >> 16, /* PC */ + p[2], p[1], p[0], /* LS0 Stat, Addr and Data */ + p[5], p[4], p[3]); /* LS1 Stat, Addr and Data */ + } + return 0; +} + +static int cim_la_show_pc_t6(struct seq_file *seq, void *v, int idx) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Status Inst Data PC\n"); + } else { + const u32 *p = v; + + seq_printf(seq, " %02x %08x %08x %08x\n", + p[3] & 0xff, p[2], p[1], p[0]); + seq_printf(seq, " %02x %02x%06x %02x%06x %02x%06x\n", + (p[6] >> 8) & 0xff, p[6] & 0xff, p[5] >> 8, + p[5] & 0xff, p[4] >> 8, p[4] & 0xff, p[3] >> 8); + seq_printf(seq, " %02x %04x%04x %04x%04x %04x%04x\n", + (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, + p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, + p[6] >> 16); + } + return 0; +} + +static int cim_la_open(struct inode *inode, struct file *file) +{ + int ret; + unsigned int cfg; + struct seq_tab *p; + struct adapter *adap = inode->i_private; + + ret = t4_cim_read(adap, UP_UP_DBG_LA_CFG_A, 1, &cfg); + if (ret) + return ret; + + if (is_t6(adap->params.chip)) { + /* +1 to account for integer division of CIMLA_SIZE/10 */ + p = seq_open_tab(file, (adap->params.cim_la_size / 10) + 1, + 10 * sizeof(u32), 1, + cfg & UPDBGLACAPTPCONLY_F ? + cim_la_show_pc_t6 : cim_la_show_t6); + } else { + p = seq_open_tab(file, adap->params.cim_la_size / 8, + 8 * sizeof(u32), 1, + cfg & UPDBGLACAPTPCONLY_F ? 
cim_la_show_3in1 : + cim_la_show); + } + if (!p) + return -ENOMEM; + + ret = t4_cim_read_la(adap, (u32 *)p->data, NULL); + if (ret) + seq_release_private(inode, file); + return ret; +} + +static const struct file_operations cim_la_fops = { + .owner = THIS_MODULE, + .open = cim_la_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +static int cim_pif_la_show(struct seq_file *seq, void *v, int idx) +{ + const u32 *p = v; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Cntl ID DataBE Addr Data\n"); + } else if (idx < CIM_PIFLA_SIZE) { + seq_printf(seq, " %02x %02x %04x %08x %08x%08x%08x%08x\n", + (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, + p[5] & 0xffff, p[4], p[3], p[2], p[1], p[0]); + } else { + if (idx == CIM_PIFLA_SIZE) + seq_puts(seq, "\nCntl ID Data\n"); + seq_printf(seq, " %02x %02x %08x%08x%08x%08x\n", + (p[4] >> 6) & 0xff, p[4] & 0x3f, + p[3], p[2], p[1], p[0]); + } + return 0; +} + +static int cim_pif_la_open(struct inode *inode, struct file *file) +{ + struct seq_tab *p; + struct adapter *adap = inode->i_private; + + p = seq_open_tab(file, 2 * CIM_PIFLA_SIZE, 6 * sizeof(u32), 1, + cim_pif_la_show); + if (!p) + return -ENOMEM; + + t4_cim_read_pif_la(adap, (u32 *)p->data, + (u32 *)p->data + 6 * CIM_PIFLA_SIZE, NULL, NULL); + return 0; +} + +static const struct file_operations cim_pif_la_fops = { + .owner = THIS_MODULE, + .open = cim_pif_la_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +static int cim_ma_la_show(struct seq_file *seq, void *v, int idx) +{ + const u32 *p = v; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "\n"); + } else if (idx < CIM_MALA_SIZE) { + seq_printf(seq, "%02x%08x%08x%08x%08x\n", + p[4], p[3], p[2], p[1], p[0]); + } else { + if (idx == CIM_MALA_SIZE) + seq_puts(seq, + "\nCnt ID Tag UE Data RDY VLD\n"); + seq_printf(seq, "%3u %2u %x %u %08x%08x %u %u\n", + (p[2] >> 10) & 0xff, (p[2] >> 7) & 7, + (p[2] >> 3) & 0xf, (p[2] >> 2) & 1, + (p[1] >> 2) | ((p[2] & 3) << 30), + (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1, + p[0] & 1); + } + return 0; +} + +static int cim_ma_la_open(struct inode *inode, struct file *file) +{ + struct seq_tab *p; + struct adapter *adap = inode->i_private; + + p = seq_open_tab(file, 2 * CIM_MALA_SIZE, 5 * sizeof(u32), 1, + cim_ma_la_show); + if (!p) + return -ENOMEM; + + t4_cim_read_ma_la(adap, (u32 *)p->data, + (u32 *)p->data + 5 * CIM_MALA_SIZE); + return 0; +} + +static const struct file_operations cim_ma_la_fops = { + .owner = THIS_MODULE, + .open = cim_ma_la_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +static int cim_qcfg_show(struct seq_file *seq, void *v) +{ + static const char * const qname[] = { + "TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI", + "ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI", + "SGE0-RX", "SGE1-RX" + }; + + int i; + struct adapter *adap = seq->private; + u16 base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; + u16 size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; + u32 stat[(4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5))]; + u16 thres[CIM_NUM_IBQ]; + u32 obq_wr_t4[2 * CIM_NUM_OBQ], *wr; + u32 obq_wr_t5[2 * CIM_NUM_OBQ_T5]; + u32 *p = stat; + int cim_num_obq = is_t4(adap->params.chip) ? + CIM_NUM_OBQ : CIM_NUM_OBQ_T5; + + i = t4_cim_read(adap, is_t4(adap->params.chip) ? 
UP_IBQ_0_RDADDR_A : + UP_IBQ_0_SHADOW_RDADDR_A, + ARRAY_SIZE(stat), stat); + if (!i) { + if (is_t4(adap->params.chip)) { + i = t4_cim_read(adap, UP_OBQ_0_REALADDR_A, + ARRAY_SIZE(obq_wr_t4), obq_wr_t4); + wr = obq_wr_t4; + } else { + i = t4_cim_read(adap, UP_OBQ_0_SHADOW_REALADDR_A, + ARRAY_SIZE(obq_wr_t5), obq_wr_t5); + wr = obq_wr_t5; + } + } + if (i) + return i; + + t4_read_cimq_cfg(adap, base, size, thres); + + seq_printf(seq, + " Queue Base Size Thres RdPtr WrPtr SOP EOP Avail\n"); + for (i = 0; i < CIM_NUM_IBQ; i++, p += 4) + seq_printf(seq, "%7s %5x %5u %5u %6x %4x %4u %4u %5u\n", + qname[i], base[i], size[i], thres[i], + IBQRDADDR_G(p[0]), IBQWRADDR_G(p[1]), + QUESOPCNT_G(p[3]), QUEEOPCNT_G(p[3]), + QUEREMFLITS_G(p[2]) * 16); + for ( ; i < CIM_NUM_IBQ + cim_num_obq; i++, p += 4, wr += 2) + seq_printf(seq, "%7s %5x %5u %12x %4x %4u %4u %5u\n", + qname[i], base[i], size[i], + QUERDADDR_G(p[0]) & 0x3fff, wr[0] - base[i], + QUESOPCNT_G(p[3]), QUEEOPCNT_G(p[3]), + QUEREMFLITS_G(p[2]) * 16); + return 0; +} + +static int cim_qcfg_open(struct inode *inode, struct file *file) +{ + return single_open(file, cim_qcfg_show, inode->i_private); +} + +static const struct file_operations cim_qcfg_fops = { + .owner = THIS_MODULE, + .open = cim_qcfg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int cimq_show(struct seq_file *seq, void *v, int idx) +{ + const u32 *p = v; + + seq_printf(seq, "%#06x: %08x %08x %08x %08x\n", idx * 16, p[0], p[1], + p[2], p[3]); + return 0; +} + +static int cim_ibq_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_tab *p; + unsigned int qid = (uintptr_t)inode->i_private & 7; + struct adapter *adap = inode->i_private - qid; + + p = seq_open_tab(file, CIM_IBQ_SIZE, 4 * sizeof(u32), 0, cimq_show); + if (!p) + return -ENOMEM; + + ret = t4_read_cim_ibq(adap, qid, (u32 *)p->data, CIM_IBQ_SIZE * 4); + if (ret < 0) + seq_release_private(inode, file); + else + ret = 0; + return ret; +} + +static const struct file_operations cim_ibq_fops = { + .owner = THIS_MODULE, + .open = cim_ibq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +static int cim_obq_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_tab *p; + unsigned int qid = (uintptr_t)inode->i_private & 7; + struct adapter *adap = inode->i_private - qid; + + p = seq_open_tab(file, 6 * CIM_OBQ_SIZE, 4 * sizeof(u32), 0, cimq_show); + if (!p) + return -ENOMEM; + + ret = t4_read_cim_obq(adap, qid, (u32 *)p->data, 6 * CIM_OBQ_SIZE * 4); + if (ret < 0) { + seq_release_private(inode, file); + } else { + seq_tab_trim(p, ret / 4); + ret = 0; + } + return ret; +} + +static const struct file_operations cim_obq_fops = { + .owner = THIS_MODULE, + .open = cim_obq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +struct field_desc { + const char *name; + unsigned int start; + unsigned int width; +}; + +static void field_desc_show(struct seq_file *seq, u64 v, + const struct field_desc *p) +{ + char buf[32]; + int line_size = 0; + + while (p->name) { + u64 mask = (1ULL << p->width) - 1; + int len = scnprintf(buf, sizeof(buf), "%s: %llu", p->name, + ((unsigned long long)v >> p->start) & mask); + + if (line_size + len >= 79) { + line_size = 8; + seq_puts(seq, "\n "); + } + seq_printf(seq, "%s ", buf); + line_size += len + 1; + p++; + } + seq_putc(seq, '\n'); +} + +static struct field_desc tp_la0[] = { + { "RcfOpCodeOut", 60, 4 }, + { "State", 56, 4 }, + { "WcfState", 52, 4 }, + { 
"RcfOpcSrcOut", 50, 2 }, + { "CRxError", 49, 1 }, + { "ERxError", 48, 1 }, + { "SanityFailed", 47, 1 }, + { "SpuriousMsg", 46, 1 }, + { "FlushInputMsg", 45, 1 }, + { "FlushInputCpl", 44, 1 }, + { "RssUpBit", 43, 1 }, + { "RssFilterHit", 42, 1 }, + { "Tid", 32, 10 }, + { "InitTcb", 31, 1 }, + { "LineNumber", 24, 7 }, + { "Emsg", 23, 1 }, + { "EdataOut", 22, 1 }, + { "Cmsg", 21, 1 }, + { "CdataOut", 20, 1 }, + { "EreadPdu", 19, 1 }, + { "CreadPdu", 18, 1 }, + { "TunnelPkt", 17, 1 }, + { "RcfPeerFin", 16, 1 }, + { "RcfReasonOut", 12, 4 }, + { "TxCchannel", 10, 2 }, + { "RcfTxChannel", 8, 2 }, + { "RxEchannel", 6, 2 }, + { "RcfRxChannel", 5, 1 }, + { "RcfDataOutSrdy", 4, 1 }, + { "RxDvld", 3, 1 }, + { "RxOoDvld", 2, 1 }, + { "RxCongestion", 1, 1 }, + { "TxCongestion", 0, 1 }, + { NULL } +}; + +static int tp_la_show(struct seq_file *seq, void *v, int idx) +{ + const u64 *p = v; + + field_desc_show(seq, *p, tp_la0); + return 0; +} + +static int tp_la_show2(struct seq_file *seq, void *v, int idx) +{ + const u64 *p = v; + + if (idx) + seq_putc(seq, '\n'); + field_desc_show(seq, p[0], tp_la0); + if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) + field_desc_show(seq, p[1], tp_la0); + return 0; +} + +static int tp_la_show3(struct seq_file *seq, void *v, int idx) +{ + static struct field_desc tp_la1[] = { + { "CplCmdIn", 56, 8 }, + { "CplCmdOut", 48, 8 }, + { "ESynOut", 47, 1 }, + { "EAckOut", 46, 1 }, + { "EFinOut", 45, 1 }, + { "ERstOut", 44, 1 }, + { "SynIn", 43, 1 }, + { "AckIn", 42, 1 }, + { "FinIn", 41, 1 }, + { "RstIn", 40, 1 }, + { "DataIn", 39, 1 }, + { "DataInVld", 38, 1 }, + { "PadIn", 37, 1 }, + { "RxBufEmpty", 36, 1 }, + { "RxDdp", 35, 1 }, + { "RxFbCongestion", 34, 1 }, + { "TxFbCongestion", 33, 1 }, + { "TxPktSumSrdy", 32, 1 }, + { "RcfUlpType", 28, 4 }, + { "Eread", 27, 1 }, + { "Ebypass", 26, 1 }, + { "Esave", 25, 1 }, + { "Static0", 24, 1 }, + { "Cread", 23, 1 }, + { "Cbypass", 22, 1 }, + { "Csave", 21, 1 }, + { "CPktOut", 20, 1 }, + { "RxPagePoolFull", 18, 2 }, + { "RxLpbkPkt", 17, 1 }, + { "TxLpbkPkt", 16, 1 }, + { "RxVfValid", 15, 1 }, + { "SynLearned", 14, 1 }, + { "SetDelEntry", 13, 1 }, + { "SetInvEntry", 12, 1 }, + { "CpcmdDvld", 11, 1 }, + { "CpcmdSave", 10, 1 }, + { "RxPstructsFull", 8, 2 }, + { "EpcmdDvld", 7, 1 }, + { "EpcmdFlush", 6, 1 }, + { "EpcmdTrimPrefix", 5, 1 }, + { "EpcmdTrimPostfix", 4, 1 }, + { "ERssIp4Pkt", 3, 1 }, + { "ERssIp6Pkt", 2, 1 }, + { "ERssTcpUdpPkt", 1, 1 }, + { "ERssFceFipPkt", 0, 1 }, + { NULL } + }; + static struct field_desc tp_la2[] = { + { "CplCmdIn", 56, 8 }, + { "MpsVfVld", 55, 1 }, + { "MpsPf", 52, 3 }, + { "MpsVf", 44, 8 }, + { "SynIn", 43, 1 }, + { "AckIn", 42, 1 }, + { "FinIn", 41, 1 }, + { "RstIn", 40, 1 }, + { "DataIn", 39, 1 }, + { "DataInVld", 38, 1 }, + { "PadIn", 37, 1 }, + { "RxBufEmpty", 36, 1 }, + { "RxDdp", 35, 1 }, + { "RxFbCongestion", 34, 1 }, + { "TxFbCongestion", 33, 1 }, + { "TxPktSumSrdy", 32, 1 }, + { "RcfUlpType", 28, 4 }, + { "Eread", 27, 1 }, + { "Ebypass", 26, 1 }, + { "Esave", 25, 1 }, + { "Static0", 24, 1 }, + { "Cread", 23, 1 }, + { "Cbypass", 22, 1 }, + { "Csave", 21, 1 }, + { "CPktOut", 20, 1 }, + { "RxPagePoolFull", 18, 2 }, + { "RxLpbkPkt", 17, 1 }, + { "TxLpbkPkt", 16, 1 }, + { "RxVfValid", 15, 1 }, + { "SynLearned", 14, 1 }, + { "SetDelEntry", 13, 1 }, + { "SetInvEntry", 12, 1 }, + { "CpcmdDvld", 11, 1 }, + { "CpcmdSave", 10, 1 }, + { "RxPstructsFull", 8, 2 }, + { "EpcmdDvld", 7, 1 }, + { "EpcmdFlush", 6, 1 }, + { "EpcmdTrimPrefix", 5, 1 }, + { "EpcmdTrimPostfix", 4, 1 }, + { "ERssIp4Pkt", 3, 1 }, + { 
"ERssIp6Pkt", 2, 1 }, + { "ERssTcpUdpPkt", 1, 1 }, + { "ERssFceFipPkt", 0, 1 }, + { NULL } + }; + const u64 *p = v; + + if (idx) + seq_putc(seq, '\n'); + field_desc_show(seq, p[0], tp_la0); + if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) + field_desc_show(seq, p[1], (p[0] & BIT(17)) ? tp_la2 : tp_la1); + return 0; +} + +static int tp_la_open(struct inode *inode, struct file *file) +{ + struct seq_tab *p; + struct adapter *adap = inode->i_private; + + switch (DBGLAMODE_G(t4_read_reg(adap, TP_DBG_LA_CONFIG_A))) { + case 2: + p = seq_open_tab(file, TPLA_SIZE / 2, 2 * sizeof(u64), 0, + tp_la_show2); + break; + case 3: + p = seq_open_tab(file, TPLA_SIZE / 2, 2 * sizeof(u64), 0, + tp_la_show3); + break; + default: + p = seq_open_tab(file, TPLA_SIZE, sizeof(u64), 0, tp_la_show); + } + if (!p) + return -ENOMEM; + + t4_tp_read_la(adap, (u64 *)p->data, NULL); + return 0; +} + +static ssize_t tp_la_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + int err; + char s[32]; + unsigned long val; + size_t size = min(sizeof(s) - 1, count); + struct adapter *adap = file_inode(file)->i_private; + + if (copy_from_user(s, buf, size)) + return -EFAULT; + s[size] = '\0'; + err = kstrtoul(s, 0, &val); + if (err) + return err; + if (val > 0xffff) + return -EINVAL; + adap->params.tp.la_mask = val << 16; + t4_set_reg_field(adap, TP_DBG_LA_CONFIG_A, 0xffff0000U, + adap->params.tp.la_mask); + return count; +} + +static const struct file_operations tp_la_fops = { + .owner = THIS_MODULE, + .open = tp_la_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, + .write = tp_la_write +}; + +static int ulprx_la_show(struct seq_file *seq, void *v, int idx) +{ + const u32 *p = v; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, " Pcmd Type Message" + " Data\n"); + else + seq_printf(seq, "%08x%08x %4x %08x %08x%08x%08x%08x\n", + p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]); + return 0; +} + +static int ulprx_la_open(struct inode *inode, struct file *file) +{ + struct seq_tab *p; + struct adapter *adap = inode->i_private; + + p = seq_open_tab(file, ULPRX_LA_SIZE, 8 * sizeof(u32), 1, + ulprx_la_show); + if (!p) + return -ENOMEM; + + t4_ulprx_read_la(adap, (u32 *)p->data); + return 0; +} + +static const struct file_operations ulprx_la_fops = { + .owner = THIS_MODULE, + .open = ulprx_la_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +/* Show the PM memory stats. 
These stats include: + * + * TX: + * Read: memory read operation + * Write Bypass: cut-through + * Bypass + mem: cut-through and save copy + * + * RX: + * Read: memory read + * Write Bypass: cut-through + * Flush: payload trim or drop + */ +static int pm_stats_show(struct seq_file *seq, void *v) +{ + static const char * const tx_pm_stats[] = { + "Read:", "Write bypass:", "Write mem:", "Bypass + mem:" + }; + static const char * const rx_pm_stats[] = { + "Read:", "Write bypass:", "Write mem:", "Flush:" + }; + + int i; + u32 tx_cnt[T6_PM_NSTATS], rx_cnt[T6_PM_NSTATS]; + u64 tx_cyc[T6_PM_NSTATS], rx_cyc[T6_PM_NSTATS]; + struct adapter *adap = seq->private; + + t4_pmtx_get_stats(adap, tx_cnt, tx_cyc); + t4_pmrx_get_stats(adap, rx_cnt, rx_cyc); + + seq_printf(seq, "%13s %10s %20s\n", " ", "Tx pcmds", "Tx bytes"); + for (i = 0; i < PM_NSTATS - 1; i++) + seq_printf(seq, "%-13s %10u %20llu\n", + tx_pm_stats[i], tx_cnt[i], tx_cyc[i]); + + seq_printf(seq, "%13s %10s %20s\n", " ", "Rx pcmds", "Rx bytes"); + for (i = 0; i < PM_NSTATS - 1; i++) + seq_printf(seq, "%-13s %10u %20llu\n", + rx_pm_stats[i], rx_cnt[i], rx_cyc[i]); + + if (CHELSIO_CHIP_VERSION(adap->params.chip) > CHELSIO_T5) { + /* In T5 the granularity of the total wait is too fine. + * It is not useful as it reaches the max value too fast. + * Hence display this Input FIFO wait for T6 onwards. + */ + seq_printf(seq, "%13s %10s %20s\n", + " ", "Total wait", "Total Occupancy"); + seq_printf(seq, "Tx FIFO wait %10u %20llu\n", + tx_cnt[i], tx_cyc[i]); + seq_printf(seq, "Rx FIFO wait %10u %20llu\n", + rx_cnt[i], rx_cyc[i]); + + /* Skip index 6 as there is nothing useful ihere */ + i += 2; + + /* At index 7, a new stat for read latency (count, total wait) + * is added. + */ + seq_printf(seq, "%13s %10s %20s\n", + " ", "Reads", "Total wait"); + seq_printf(seq, "Tx latency %10u %20llu\n", + tx_cnt[i], tx_cyc[i]); + seq_printf(seq, "Rx latency %10u %20llu\n", + rx_cnt[i], rx_cyc[i]); + } + return 0; +} + +static int pm_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, pm_stats_show, inode->i_private); +} + +static ssize_t pm_stats_clear(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + struct adapter *adap = file_inode(file)->i_private; + + t4_write_reg(adap, PM_RX_STAT_CONFIG_A, 0); + t4_write_reg(adap, PM_TX_STAT_CONFIG_A, 0); + return count; +} + +static const struct file_operations pm_stats_debugfs_fops = { + .owner = THIS_MODULE, + .open = pm_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = pm_stats_clear +}; + +static int tx_rate_show(struct seq_file *seq, void *v) +{ + u64 nrate[NCHAN], orate[NCHAN]; + struct adapter *adap = seq->private; + + t4_get_chan_txrate(adap, nrate, orate); + if (adap->params.arch.nchan == NCHAN) { + seq_puts(seq, " channel 0 channel 1 " + "channel 2 channel 3\n"); + seq_printf(seq, "NIC B/s: %10llu %10llu %10llu %10llu\n", + (unsigned long long)nrate[0], + (unsigned long long)nrate[1], + (unsigned long long)nrate[2], + (unsigned long long)nrate[3]); + seq_printf(seq, "Offload B/s: %10llu %10llu %10llu %10llu\n", + (unsigned long long)orate[0], + (unsigned long long)orate[1], + (unsigned long long)orate[2], + (unsigned long long)orate[3]); + } else { + seq_puts(seq, " channel 0 channel 1\n"); + seq_printf(seq, "NIC B/s: %10llu %10llu\n", + (unsigned long long)nrate[0], + (unsigned long long)nrate[1]); + seq_printf(seq, "Offload B/s: %10llu %10llu\n", + (unsigned long long)orate[0], + (unsigned long long)orate[1]); + 
} + return 0; +} + +DEFINE_SIMPLE_DEBUGFS_FILE(tx_rate); + +static int cctrl_tbl_show(struct seq_file *seq, void *v) +{ + static const char * const dec_fac[] = { + "0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875", + "0.9375" }; + + int i; + u16 (*incr)[NCCTRL_WIN]; + struct adapter *adap = seq->private; + + incr = kmalloc(sizeof(*incr) * NMTUS, GFP_KERNEL); + if (!incr) + return -ENOMEM; + + t4_read_cong_tbl(adap, incr); + + for (i = 0; i < NCCTRL_WIN; ++i) { + seq_printf(seq, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i, + incr[0][i], incr[1][i], incr[2][i], incr[3][i], + incr[4][i], incr[5][i], incr[6][i], incr[7][i]); + seq_printf(seq, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n", + incr[8][i], incr[9][i], incr[10][i], incr[11][i], + incr[12][i], incr[13][i], incr[14][i], incr[15][i], + adap->params.a_wnd[i], + dec_fac[adap->params.b_wnd[i]]); + } + + kfree(incr); + return 0; +} + +DEFINE_SIMPLE_DEBUGFS_FILE(cctrl_tbl); + +/* Format a value in a unit that differs from the value's native unit by the + * given factor. + */ +static char *unit_conv(char *buf, size_t len, unsigned int val, + unsigned int factor) +{ + unsigned int rem = val % factor; + + if (rem == 0) { + snprintf(buf, len, "%u", val / factor); + } else { + while (rem % 10 == 0) + rem /= 10; + snprintf(buf, len, "%u.%u", val / factor, rem); + } + return buf; +} + +static int clk_show(struct seq_file *seq, void *v) +{ + char buf[32]; + struct adapter *adap = seq->private; + unsigned int cclk_ps = 1000000000 / adap->params.vpd.cclk; /* in ps */ + u32 res = t4_read_reg(adap, TP_TIMER_RESOLUTION_A); + unsigned int tre = TIMERRESOLUTION_G(res); + unsigned int dack_re = DELAYEDACKRESOLUTION_G(res); + unsigned long long tp_tick_us = (cclk_ps << tre) / 1000000; /* in us */ + + seq_printf(seq, "Core clock period: %s ns\n", + unit_conv(buf, sizeof(buf), cclk_ps, 1000)); + seq_printf(seq, "TP timer tick: %s us\n", + unit_conv(buf, sizeof(buf), (cclk_ps << tre), 1000000)); + seq_printf(seq, "TCP timestamp tick: %s us\n", + unit_conv(buf, sizeof(buf), + (cclk_ps << TIMESTAMPRESOLUTION_G(res)), 1000000)); + seq_printf(seq, "DACK tick: %s us\n", + unit_conv(buf, sizeof(buf), (cclk_ps << dack_re), 1000000)); + seq_printf(seq, "DACK timer: %u us\n", + ((cclk_ps << dack_re) / 1000000) * + t4_read_reg(adap, TP_DACK_TIMER_A)); + seq_printf(seq, "Retransmit min: %llu us\n", + tp_tick_us * t4_read_reg(adap, TP_RXT_MIN_A)); + seq_printf(seq, "Retransmit max: %llu us\n", + tp_tick_us * t4_read_reg(adap, TP_RXT_MAX_A)); + seq_printf(seq, "Persist timer min: %llu us\n", + tp_tick_us * t4_read_reg(adap, TP_PERS_MIN_A)); + seq_printf(seq, "Persist timer max: %llu us\n", + tp_tick_us * t4_read_reg(adap, TP_PERS_MAX_A)); + seq_printf(seq, "Keepalive idle timer: %llu us\n", + tp_tick_us * t4_read_reg(adap, TP_KEEP_IDLE_A)); + seq_printf(seq, "Keepalive interval: %llu us\n", + tp_tick_us * t4_read_reg(adap, TP_KEEP_INTVL_A)); + seq_printf(seq, "Initial SRTT: %llu us\n", + tp_tick_us * INITSRTT_G(t4_read_reg(adap, TP_INIT_SRTT_A))); + seq_printf(seq, "FINWAIT2 timer: %llu us\n", + tp_tick_us * t4_read_reg(adap, TP_FINWAIT2_TIMER_A)); + + return 0; +} + +DEFINE_SIMPLE_DEBUGFS_FILE(clk); + +/* Firmware Device Log dump. 
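+ *
+ * The sequential-file iterator below encodes a read position "pos" as the
+ * opaque pointer value (pos + 1), with pos 0 reserved for the SEQ_START_TOKEN
+ * header row.  As a minimal sketch of the resulting index math (names taken
+ * from struct devlog_info below), a display value v maps to a log slot via:
+ *
+ *	fidx = (uintptr_t)v - 2;          (undo the +1 and skip the header row)
+ *	slot = dinfo->first + fidx;
+ *	if (slot >= dinfo->nentries)
+ *		slot -= dinfo->nentries;  (wrap around the circular log)
+ *
+ * which matches the computation performed in devlog_show().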
*/ +static const char * const devlog_level_strings[] = { + [FW_DEVLOG_LEVEL_EMERG] = "EMERG", + [FW_DEVLOG_LEVEL_CRIT] = "CRIT", + [FW_DEVLOG_LEVEL_ERR] = "ERR", + [FW_DEVLOG_LEVEL_NOTICE] = "NOTICE", + [FW_DEVLOG_LEVEL_INFO] = "INFO", + [FW_DEVLOG_LEVEL_DEBUG] = "DEBUG" +}; + +static const char * const devlog_facility_strings[] = { + [FW_DEVLOG_FACILITY_CORE] = "CORE", + [FW_DEVLOG_FACILITY_CF] = "CF", + [FW_DEVLOG_FACILITY_SCHED] = "SCHED", + [FW_DEVLOG_FACILITY_TIMER] = "TIMER", + [FW_DEVLOG_FACILITY_RES] = "RES", + [FW_DEVLOG_FACILITY_HW] = "HW", + [FW_DEVLOG_FACILITY_FLR] = "FLR", + [FW_DEVLOG_FACILITY_DMAQ] = "DMAQ", + [FW_DEVLOG_FACILITY_PHY] = "PHY", + [FW_DEVLOG_FACILITY_MAC] = "MAC", + [FW_DEVLOG_FACILITY_PORT] = "PORT", + [FW_DEVLOG_FACILITY_VI] = "VI", + [FW_DEVLOG_FACILITY_FILTER] = "FILTER", + [FW_DEVLOG_FACILITY_ACL] = "ACL", + [FW_DEVLOG_FACILITY_TM] = "TM", + [FW_DEVLOG_FACILITY_QFC] = "QFC", + [FW_DEVLOG_FACILITY_DCB] = "DCB", + [FW_DEVLOG_FACILITY_ETH] = "ETH", + [FW_DEVLOG_FACILITY_OFLD] = "OFLD", + [FW_DEVLOG_FACILITY_RI] = "RI", + [FW_DEVLOG_FACILITY_ISCSI] = "ISCSI", + [FW_DEVLOG_FACILITY_FCOE] = "FCOE", + [FW_DEVLOG_FACILITY_FOISCSI] = "FOISCSI", + [FW_DEVLOG_FACILITY_FOFCOE] = "FOFCOE" +}; + +/* Information gathered by Device Log Open routine for the display routine. + */ +struct devlog_info { + unsigned int nentries; /* number of entries in log[] */ + unsigned int first; /* first [temporal] entry in log[] */ + struct fw_devlog_e log[0]; /* Firmware Device Log */ +}; + +/* Dump a Firmaware Device Log entry. + */ +static int devlog_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%10s %15s %8s %8s %s\n", + "Seq#", "Tstamp", "Level", "Facility", "Message"); + else { + struct devlog_info *dinfo = seq->private; + int fidx = (uintptr_t)v - 2; + unsigned long index; + struct fw_devlog_e *e; + + /* Get a pointer to the log entry to display. Skip unused log + * entries. + */ + index = dinfo->first + fidx; + if (index >= dinfo->nentries) + index -= dinfo->nentries; + e = &dinfo->log[index]; + if (e->timestamp == 0) + return 0; + + /* Print the message. This depends on the firmware using + * exactly the same formating strings as the kernel so we may + * eventually have to put a format interpreter in here ... + */ + seq_printf(seq, "%10d %15llu %8s %8s ", + be32_to_cpu(e->seqno), + be64_to_cpu(e->timestamp), + (e->level < ARRAY_SIZE(devlog_level_strings) + ? devlog_level_strings[e->level] + : "UNKNOWN"), + (e->facility < ARRAY_SIZE(devlog_facility_strings) + ? devlog_facility_strings[e->facility] + : "UNKNOWN")); + seq_printf(seq, e->fmt, + be32_to_cpu(e->params[0]), + be32_to_cpu(e->params[1]), + be32_to_cpu(e->params[2]), + be32_to_cpu(e->params[3]), + be32_to_cpu(e->params[4]), + be32_to_cpu(e->params[5]), + be32_to_cpu(e->params[6]), + be32_to_cpu(e->params[7])); + } + return 0; +} + +/* Sequential File Operations for Device Log. + */ +static inline void *devlog_get_idx(struct devlog_info *dinfo, loff_t pos) +{ + if (pos > dinfo->nentries) + return NULL; + + return (void *)(uintptr_t)(pos + 1); +} + +static void *devlog_start(struct seq_file *seq, loff_t *pos) +{ + struct devlog_info *dinfo = seq->private; + + return (*pos + ? 
devlog_get_idx(dinfo, *pos) + : SEQ_START_TOKEN); +} + +static void *devlog_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct devlog_info *dinfo = seq->private; + + (*pos)++; + return devlog_get_idx(dinfo, *pos); +} + +static void devlog_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations devlog_seq_ops = { + .start = devlog_start, + .next = devlog_next, + .stop = devlog_stop, + .show = devlog_show +}; + +/* Set up for reading the firmware's device log. We read the entire log here + * and then display it incrementally in devlog_show(). + */ +static int devlog_open(struct inode *inode, struct file *file) +{ + struct adapter *adap = inode->i_private; + struct devlog_params *dparams = &adap->params.devlog; + struct devlog_info *dinfo; + unsigned int index; + u32 fseqno; + int ret; + + /* If we don't know where the log is we can't do anything. + */ + if (dparams->start == 0) + return -ENXIO; + + /* Allocate the space to read in the firmware's device log and set up + * for the iterated call to our display function. + */ + dinfo = __seq_open_private(file, &devlog_seq_ops, + sizeof(*dinfo) + dparams->size); + if (!dinfo) + return -ENOMEM; + + /* Record the basic log buffer information and read in the raw log. + */ + dinfo->nentries = (dparams->size / sizeof(struct fw_devlog_e)); + dinfo->first = 0; + spin_lock(&adap->win0_lock); + ret = t4_memory_rw(adap, adap->params.drv_memwin, dparams->memtype, + dparams->start, dparams->size, (__be32 *)dinfo->log, + T4_MEMORY_READ); + spin_unlock(&adap->win0_lock); + if (ret) { + seq_release_private(inode, file); + return ret; + } + + /* Find the earliest (lowest Sequence Number) log entry in the + * circular Device Log. + */ + for (fseqno = ~((u32)0), index = 0; index < dinfo->nentries; index++) { + struct fw_devlog_e *e = &dinfo->log[index]; + __u32 seqno; + + if (e->timestamp == 0) + continue; + + seqno = be32_to_cpu(e->seqno); + if (seqno < fseqno) { + fseqno = seqno; + dinfo->first = index; + } + } + return 0; +} + +static const struct file_operations devlog_fops = { + .owner = THIS_MODULE, + .open = devlog_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +/* Show Firmware Mailbox Command/Reply Log + * + * Note that we don't do any locking when dumping the Firmware Mailbox Log so + * it's possible that we can catch things during a log update and therefore + * see partially corrupted log entries. But it's probably Good Enough(tm). + * If we ever decide that we want to make sure that we're dumping a coherent + * log, we'd need to perform locking in the mailbox logging and in + * mboxlog_open() where we'd need to grab the entire mailbox log in one go + * like we do for the Firmware Device Log. 
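+ *
+ * A minimal sketch of that coherent-snapshot alternative, purely for
+ * illustration (it assumes a hypothetical spinlock in struct mbox_cmd_log,
+ * taken by the producers around each update, and assumes the log is allocated
+ * as a header immediately followed by "size" struct mbox_cmd entries):
+ *
+ *	size_t bytes = sizeof(*log) + log->size * sizeof(struct mbox_cmd);
+ *	struct mbox_cmd_log *snap = kvzalloc(bytes, GFP_KERNEL);
+ *
+ *	spin_lock(&log->lock);            (hypothetical lock, not present today)
+ *	memcpy(snap, log, bytes);
+ *	spin_unlock(&log->lock);
+ *
+ * after which mboxlog_show() would iterate over the private snapshot instead
+ * of the live log.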
+ */ +static int mboxlog_show(struct seq_file *seq, void *v) +{ + struct adapter *adapter = seq->private; + struct mbox_cmd_log *log = adapter->mbox_log; + struct mbox_cmd *entry; + int entry_idx, i; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "%10s %15s %5s %5s %s\n", + "Seq#", "Tstamp", "Atime", "Etime", + "Command/Reply"); + return 0; + } + + entry_idx = log->cursor + ((uintptr_t)v - 2); + if (entry_idx >= log->size) + entry_idx -= log->size; + entry = mbox_cmd_log_entry(log, entry_idx); + + /* skip over unused entries */ + if (entry->timestamp == 0) + return 0; + + seq_printf(seq, "%10u %15llu %5d %5d", + entry->seqno, entry->timestamp, + entry->access, entry->execute); + for (i = 0; i < MBOX_LEN / 8; i++) { + u64 flit = entry->cmd[i]; + u32 hi = (u32)(flit >> 32); + u32 lo = (u32)flit; + + seq_printf(seq, " %08x %08x", hi, lo); + } + seq_puts(seq, "\n"); + return 0; +} + +static inline void *mboxlog_get_idx(struct seq_file *seq, loff_t pos) +{ + struct adapter *adapter = seq->private; + struct mbox_cmd_log *log = adapter->mbox_log; + + return ((pos <= log->size) ? (void *)(uintptr_t)(pos + 1) : NULL); +} + +static void *mboxlog_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? mboxlog_get_idx(seq, *pos) : SEQ_START_TOKEN; +} + +static void *mboxlog_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return mboxlog_get_idx(seq, *pos); +} + +static void mboxlog_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations mboxlog_seq_ops = { + .start = mboxlog_start, + .next = mboxlog_next, + .stop = mboxlog_stop, + .show = mboxlog_show +}; + +static int mboxlog_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &mboxlog_seq_ops); + + if (!res) { + struct seq_file *seq = file->private_data; + + seq->private = inode->i_private; + } + return res; +} + +static const struct file_operations mboxlog_fops = { + .owner = THIS_MODULE, + .open = mboxlog_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int mbox_show(struct seq_file *seq, void *v) +{ + static const char * const owner[] = { "none", "FW", "driver", + "unknown", "" }; + + int i; + unsigned int mbox = (uintptr_t)seq->private & 7; + struct adapter *adap = seq->private - mbox; + void __iomem *addr = adap->regs + PF_REG(mbox, CIM_PF_MAILBOX_DATA_A); + + /* For T4 we don't have a shadow copy of the Mailbox Control register. + * And since reading that real register causes a side effect of + * granting ownership, we're best of simply not reading it at all. 
+ */ + if (is_t4(adap->params.chip)) { + i = 4; /* index of "" */ + } else { + unsigned int ctrl_reg = CIM_PF_MAILBOX_CTRL_SHADOW_COPY_A; + void __iomem *ctrl = adap->regs + PF_REG(mbox, ctrl_reg); + + i = MBOWNER_G(readl(ctrl)); + } + + seq_printf(seq, "mailbox owned by %s\n\n", owner[i]); + + for (i = 0; i < MBOX_LEN; i += 8) + seq_printf(seq, "%016llx\n", + (unsigned long long)readq(addr + i)); + return 0; +} + +static int mbox_open(struct inode *inode, struct file *file) +{ + return single_open(file, mbox_show, inode->i_private); +} + +static ssize_t mbox_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + int i; + char c = '\n', s[256]; + unsigned long long data[8]; + const struct inode *ino; + unsigned int mbox; + struct adapter *adap; + void __iomem *addr; + void __iomem *ctrl; + + if (count > sizeof(s) - 1 || !count) + return -EINVAL; + if (copy_from_user(s, buf, count)) + return -EFAULT; + s[count] = '\0'; + + if (sscanf(s, "%llx %llx %llx %llx %llx %llx %llx %llx%c", &data[0], + &data[1], &data[2], &data[3], &data[4], &data[5], &data[6], + &data[7], &c) < 8 || c != '\n') + return -EINVAL; + + ino = file_inode(file); + mbox = (uintptr_t)ino->i_private & 7; + adap = ino->i_private - mbox; + addr = adap->regs + PF_REG(mbox, CIM_PF_MAILBOX_DATA_A); + ctrl = addr + MBOX_LEN; + + if (MBOWNER_G(readl(ctrl)) != X_MBOWNER_PL) + return -EBUSY; + + for (i = 0; i < 8; i++) + writeq(data[i], addr + 8 * i); + + writel(MBMSGVALID_F | MBOWNER_V(X_MBOWNER_FW), ctrl); + return count; +} + +static const struct file_operations mbox_debugfs_fops = { + .owner = THIS_MODULE, + .open = mbox_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = mbox_write +}; + +static int mps_trc_show(struct seq_file *seq, void *v) +{ + int enabled, i; + struct trace_params tp; + unsigned int trcidx = (uintptr_t)seq->private & 3; + struct adapter *adap = seq->private - trcidx; + + t4_get_trace_filter(adap, &tp, trcidx, &enabled); + if (!enabled) { + seq_puts(seq, "tracer is disabled\n"); + return 0; + } + + if (tp.skip_ofst * 8 >= TRACE_LEN) { + dev_err(adap->pdev_dev, "illegal trace pattern skip offset\n"); + return -EINVAL; + } + if (tp.port < 8) { + i = adap->chan_map[tp.port & 3]; + if (i >= MAX_NPORTS) { + dev_err(adap->pdev_dev, "tracer %u is assigned " + "to non-existing port\n", trcidx); + return -EINVAL; + } + seq_printf(seq, "tracer is capturing %s %s, ", + adap->port[i]->name, tp.port < 4 ? "Rx" : "Tx"); + } else + seq_printf(seq, "tracer is capturing loopback %d, ", + tp.port - 8); + seq_printf(seq, "snap length: %u, min length: %u\n", tp.snap_len, + tp.min_len); + seq_printf(seq, "packets captured %smatch filter\n", + tp.invert ? 
"do not " : ""); + + if (tp.skip_ofst) { + seq_puts(seq, "filter pattern: "); + for (i = 0; i < tp.skip_ofst * 2; i += 2) + seq_printf(seq, "%08x%08x", tp.data[i], tp.data[i + 1]); + seq_putc(seq, '/'); + for (i = 0; i < tp.skip_ofst * 2; i += 2) + seq_printf(seq, "%08x%08x", tp.mask[i], tp.mask[i + 1]); + seq_puts(seq, "@0\n"); + } + + seq_puts(seq, "filter pattern: "); + for (i = tp.skip_ofst * 2; i < TRACE_LEN / 4; i += 2) + seq_printf(seq, "%08x%08x", tp.data[i], tp.data[i + 1]); + seq_putc(seq, '/'); + for (i = tp.skip_ofst * 2; i < TRACE_LEN / 4; i += 2) + seq_printf(seq, "%08x%08x", tp.mask[i], tp.mask[i + 1]); + seq_printf(seq, "@%u\n", (tp.skip_ofst + tp.skip_len) * 8); + return 0; +} + +static int mps_trc_open(struct inode *inode, struct file *file) +{ + return single_open(file, mps_trc_show, inode->i_private); +} + +static unsigned int xdigit2int(unsigned char c) +{ + return isdigit(c) ? c - '0' : tolower(c) - 'a' + 10; +} + +#define TRC_PORT_NONE 0xff +#define TRC_RSS_ENABLE 0x33 +#define TRC_RSS_DISABLE 0x13 + +/* Set an MPS trace filter. Syntax is: + * + * disable + * + * to disable tracing, or + * + * interface qid= [snaplen=] [minlen=] [not] []... + * + * where interface is one of rxN, txN, or loopbackN, N = 0..3, qid can be one + * of the NIC's response qid obtained from sge_qinfo and pattern has the form + * + * [/][@] + * + * Up to 2 filter patterns can be specified. If 2 are supplied the first one + * must be anchored at 0. An omitted mask is taken as a mask of 1s, an omitted + * anchor is taken as 0. + */ +static ssize_t mps_trc_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + int i, enable, ret; + u32 *data, *mask; + struct trace_params tp; + const struct inode *ino; + unsigned int trcidx; + char *s, *p, *word, *end; + struct adapter *adap; + u32 j; + + ino = file_inode(file); + trcidx = (uintptr_t)ino->i_private & 3; + adap = ino->i_private - trcidx; + + /* Don't accept input more than 1K, can't be anything valid except lots + * of whitespace. Well, use less. 
+ */ + if (count > 1024) + return -EFBIG; + p = s = kzalloc(count + 1, GFP_USER); + if (!s) + return -ENOMEM; + if (copy_from_user(s, buf, count)) { + count = -EFAULT; + goto out; + } + + if (s[count - 1] == '\n') + s[count - 1] = '\0'; + + enable = strcmp("disable", s) != 0; + if (!enable) + goto apply; + + /* enable or disable trace multi rss filter */ + if (adap->trace_rss) + t4_write_reg(adap, MPS_TRC_CFG_A, TRC_RSS_ENABLE); + else + t4_write_reg(adap, MPS_TRC_CFG_A, TRC_RSS_DISABLE); + + memset(&tp, 0, sizeof(tp)); + tp.port = TRC_PORT_NONE; + i = 0; /* counts pattern nibbles */ + + while (p) { + while (isspace(*p)) + p++; + word = strsep(&p, " "); + if (!*word) + break; + + if (!strncmp(word, "qid=", 4)) { + end = (char *)word + 4; + ret = kstrtouint(end, 10, &j); + if (ret) + goto out; + if (!adap->trace_rss) { + t4_write_reg(adap, MPS_T5_TRC_RSS_CONTROL_A, j); + continue; + } + + switch (trcidx) { + case 0: + t4_write_reg(adap, MPS_TRC_RSS_CONTROL_A, j); + break; + case 1: + t4_write_reg(adap, + MPS_TRC_FILTER1_RSS_CONTROL_A, j); + break; + case 2: + t4_write_reg(adap, + MPS_TRC_FILTER2_RSS_CONTROL_A, j); + break; + case 3: + t4_write_reg(adap, + MPS_TRC_FILTER3_RSS_CONTROL_A, j); + break; + } + continue; + } + if (!strncmp(word, "snaplen=", 8)) { + end = (char *)word + 8; + ret = kstrtouint(end, 10, &j); + if (ret || j > 9600) { +inval: count = -EINVAL; + goto out; + } + tp.snap_len = j; + continue; + } + if (!strncmp(word, "minlen=", 7)) { + end = (char *)word + 7; + ret = kstrtouint(end, 10, &j); + if (ret || j > TFMINPKTSIZE_M) + goto inval; + tp.min_len = j; + continue; + } + if (!strcmp(word, "not")) { + tp.invert = !tp.invert; + continue; + } + if (!strncmp(word, "loopback", 8) && tp.port == TRC_PORT_NONE) { + if (word[8] < '0' || word[8] > '3' || word[9]) + goto inval; + tp.port = word[8] - '0' + 8; + continue; + } + if (!strncmp(word, "tx", 2) && tp.port == TRC_PORT_NONE) { + if (word[2] < '0' || word[2] > '3' || word[3]) + goto inval; + tp.port = word[2] - '0' + 4; + if (adap->chan_map[tp.port & 3] >= MAX_NPORTS) + goto inval; + continue; + } + if (!strncmp(word, "rx", 2) && tp.port == TRC_PORT_NONE) { + if (word[2] < '0' || word[2] > '3' || word[3]) + goto inval; + tp.port = word[2] - '0'; + if (adap->chan_map[tp.port] >= MAX_NPORTS) + goto inval; + continue; + } + if (!isxdigit(*word)) + goto inval; + + /* we have found a trace pattern */ + if (i) { /* split pattern */ + if (tp.skip_len) /* too many splits */ + goto inval; + tp.skip_ofst = i / 16; + } + + data = &tp.data[i / 8]; + mask = &tp.mask[i / 8]; + j = i; + + while (isxdigit(*word)) { + if (i >= TRACE_LEN * 2) { + count = -EFBIG; + goto out; + } + *data = (*data << 4) + xdigit2int(*word++); + if (++i % 8 == 0) + data++; + } + if (*word == '/') { + word++; + while (isxdigit(*word)) { + if (j >= i) /* mask longer than data */ + goto inval; + *mask = (*mask << 4) + xdigit2int(*word++); + if (++j % 8 == 0) + mask++; + } + if (i != j) /* mask shorter than data */ + goto inval; + } else { /* no mask, use all 1s */ + for ( ; i - j >= 8; j += 8) + *mask++ = 0xffffffff; + if (i % 8) + *mask = (1 << (i % 8) * 4) - 1; + } + if (*word == '@') { + end = (char *)word + 1; + ret = kstrtouint(end, 10, &j); + if (*end && *end != '\n') + goto inval; + if (j & 7) /* doesn't start at multiple of 8 */ + goto inval; + j /= 8; + if (j < tp.skip_ofst) /* overlaps earlier pattern */ + goto inval; + if (j - tp.skip_ofst > 31) /* skip too big */ + goto inval; + tp.skip_len = j - tp.skip_ofst; + } + if (i % 8) { + *data <<= (8 - i % 8) * 
4; + *mask <<= (8 - i % 8) * 4; + i = (i + 15) & ~15; /* 8-byte align */ + } + } + + if (tp.port == TRC_PORT_NONE) + goto inval; + +apply: + i = t4_set_trace_filter(adap, &tp, trcidx, enable); + if (i) + count = i; +out: + kfree(s); + return count; +} + +static const struct file_operations mps_trc_debugfs_fops = { + .owner = THIS_MODULE, + .open = mps_trc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = mps_trc_write +}; + +static ssize_t flash_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + loff_t pos = *ppos; + loff_t avail = file_inode(file)->i_size; + struct adapter *adap = file->private_data; + + if (pos < 0) + return -EINVAL; + if (pos >= avail) + return 0; + if (count > avail - pos) + count = avail - pos; + + while (count) { + size_t len; + int ret, ofst; + u8 data[256]; + + ofst = pos & 3; + len = min(count + ofst, sizeof(data)); + ret = t4_read_flash(adap, pos - ofst, (len + 3) / 4, + (u32 *)data, 1); + if (ret) + return ret; + + len -= ofst; + if (copy_to_user(buf, data + ofst, len)) + return -EFAULT; + + buf += len; + pos += len; + count -= len; + } + count = pos - *ppos; + *ppos = pos; + return count; +} + +static const struct file_operations flash_debugfs_fops = { + .owner = THIS_MODULE, + .open = mem_open, + .read = flash_read, + .llseek = default_llseek, +}; + +static inline void tcamxy2valmask(u64 x, u64 y, u8 *addr, u64 *mask) +{ + *mask = x | y; + y = (__force u64)cpu_to_be64(y); + memcpy(addr, (char *)&y + 2, ETH_ALEN); +} + +static int mps_tcam_show(struct seq_file *seq, void *v) +{ + struct adapter *adap = seq->private; + unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip); + if (v == SEQ_START_TOKEN) { + if (chip_ver > CHELSIO_T5) { + seq_puts(seq, "Idx Ethernet address Mask " + " VNI Mask IVLAN Vld " + "DIP_Hit Lookup Port " + "Vld Ports PF VF " + "Replication " + " P0 P1 P2 P3 ML\n"); + } else { + if (adap->params.arch.mps_rplc_size > 128) + seq_puts(seq, "Idx Ethernet address Mask " + "Vld Ports PF VF " + "Replication " + " P0 P1 P2 P3 ML\n"); + else + seq_puts(seq, "Idx Ethernet address Mask " + "Vld Ports PF VF Replication" + " P0 P1 P2 P3 ML\n"); + } + } else { + u64 mask; + u8 addr[ETH_ALEN]; + bool replicate, dip_hit = false, vlan_vld = false; + unsigned int idx = (uintptr_t)v - 2; + u64 tcamy, tcamx, val; + u32 cls_lo, cls_hi, ctl, data2, vnix = 0, vniy = 0; + u32 rplc[8] = {0}; + u8 lookup_type = 0, port_num = 0; + u16 ivlan = 0; + + if (chip_ver > CHELSIO_T5) { + /* CtlCmdType - 0: Read, 1: Write + * CtlTcamSel - 0: TCAM0, 1: TCAM1 + * CtlXYBitSel- 0: Y bit, 1: X bit + */ + + /* Read tcamy */ + ctl = CTLCMDTYPE_V(0) | CTLXYBITSEL_V(0); + if (idx < 256) + ctl |= CTLTCAMINDEX_V(idx) | CTLTCAMSEL_V(0); + else + ctl |= CTLTCAMINDEX_V(idx - 256) | + CTLTCAMSEL_V(1); + t4_write_reg(adap, MPS_CLS_TCAM_DATA2_CTL_A, ctl); + val = t4_read_reg(adap, MPS_CLS_TCAM_DATA1_A); + tcamy = DMACH_G(val) << 32; + tcamy |= t4_read_reg(adap, MPS_CLS_TCAM_DATA0_A); + data2 = t4_read_reg(adap, MPS_CLS_TCAM_DATA2_CTL_A); + lookup_type = DATALKPTYPE_G(data2); + /* 0 - Outer header, 1 - Inner header + * [71:48] bit locations are overloaded for + * outer vs. inner lookup types. + */ + if (lookup_type && (lookup_type != DATALKPTYPE_M)) { + /* Inner header VNI */ + vniy = (data2 & DATAVIDH2_F) | + (DATAVIDH1_G(data2) << 16) | VIDL_G(val); + dip_hit = data2 & DATADIPHIT_F; + } else { + vlan_vld = data2 & DATAVIDH2_F; + ivlan = VIDL_G(val); + } + port_num = DATAPORTNUM_G(data2); + + /* Read tcamx. 
Change the control param */ + vnix = 0; + ctl |= CTLXYBITSEL_V(1); + t4_write_reg(adap, MPS_CLS_TCAM_DATA2_CTL_A, ctl); + val = t4_read_reg(adap, MPS_CLS_TCAM_DATA1_A); + tcamx = DMACH_G(val) << 32; + tcamx |= t4_read_reg(adap, MPS_CLS_TCAM_DATA0_A); + data2 = t4_read_reg(adap, MPS_CLS_TCAM_DATA2_CTL_A); + if (lookup_type && (lookup_type != DATALKPTYPE_M)) { + /* Inner header VNI mask */ + vnix = (data2 & DATAVIDH2_F) | + (DATAVIDH1_G(data2) << 16) | VIDL_G(val); + } + } else { + tcamy = t4_read_reg64(adap, MPS_CLS_TCAM_Y_L(idx)); + tcamx = t4_read_reg64(adap, MPS_CLS_TCAM_X_L(idx)); + } + + cls_lo = t4_read_reg(adap, MPS_CLS_SRAM_L(idx)); + cls_hi = t4_read_reg(adap, MPS_CLS_SRAM_H(idx)); + + if (tcamx & tcamy) { + seq_printf(seq, "%3u -\n", idx); + goto out; + } + + rplc[0] = rplc[1] = rplc[2] = rplc[3] = 0; + if (chip_ver > CHELSIO_T5) + replicate = (cls_lo & T6_REPLICATE_F); + else + replicate = (cls_lo & REPLICATE_F); + + if (replicate) { + struct fw_ldst_cmd ldst_cmd; + int ret; + struct fw_ldst_mps_rplc mps_rplc; + u32 ldst_addrspc; + + memset(&ldst_cmd, 0, sizeof(ldst_cmd)); + ldst_addrspc = + FW_LDST_CMD_ADDRSPACE_V(FW_LDST_ADDRSPC_MPS); + ldst_cmd.op_to_addrspace = + htonl(FW_CMD_OP_V(FW_LDST_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_READ_F | + ldst_addrspc); + ldst_cmd.cycles_to_len16 = htonl(FW_LEN16(ldst_cmd)); + ldst_cmd.u.mps.rplc.fid_idx = + htons(FW_LDST_CMD_FID_V(FW_LDST_MPS_RPLC) | + FW_LDST_CMD_IDX_V(idx)); + ret = t4_wr_mbox(adap, adap->mbox, &ldst_cmd, + sizeof(ldst_cmd), &ldst_cmd); + if (ret) + dev_warn(adap->pdev_dev, "Can't read MPS " + "replication map for idx %d: %d\n", + idx, -ret); + else { + mps_rplc = ldst_cmd.u.mps.rplc; + rplc[0] = ntohl(mps_rplc.rplc31_0); + rplc[1] = ntohl(mps_rplc.rplc63_32); + rplc[2] = ntohl(mps_rplc.rplc95_64); + rplc[3] = ntohl(mps_rplc.rplc127_96); + if (adap->params.arch.mps_rplc_size > 128) { + rplc[4] = ntohl(mps_rplc.rplc159_128); + rplc[5] = ntohl(mps_rplc.rplc191_160); + rplc[6] = ntohl(mps_rplc.rplc223_192); + rplc[7] = ntohl(mps_rplc.rplc255_224); + } + } + } + + tcamxy2valmask(tcamx, tcamy, addr, &mask); + if (chip_ver > CHELSIO_T5) { + /* Inner header lookup */ + if (lookup_type && (lookup_type != DATALKPTYPE_M)) { + seq_printf(seq, + "%3u %02x:%02x:%02x:%02x:%02x:%02x " + "%012llx %06x %06x - - %3c" + " 'I' %4x " + "%3c %#x%4u%4d", idx, addr[0], + addr[1], addr[2], addr[3], + addr[4], addr[5], + (unsigned long long)mask, + vniy, (vnix | vniy), + dip_hit ? 'Y' : 'N', + port_num, + (cls_lo & T6_SRAM_VLD_F) ? 'Y' : 'N', + PORTMAP_G(cls_hi), + T6_PF_G(cls_lo), + (cls_lo & T6_VF_VALID_F) ? + T6_VF_G(cls_lo) : -1); + } else { + seq_printf(seq, + "%3u %02x:%02x:%02x:%02x:%02x:%02x " + "%012llx - - ", + idx, addr[0], addr[1], addr[2], + addr[3], addr[4], addr[5], + (unsigned long long)mask); + + if (vlan_vld) + seq_printf(seq, "%4u Y ", ivlan); + else + seq_puts(seq, " - N "); + + seq_printf(seq, + "- %3c %4x %3c %#x%4u%4d", + lookup_type ? 'I' : 'O', port_num, + (cls_lo & T6_SRAM_VLD_F) ? 'Y' : 'N', + PORTMAP_G(cls_hi), + T6_PF_G(cls_lo), + (cls_lo & T6_VF_VALID_F) ? + T6_VF_G(cls_lo) : -1); + } + } else + seq_printf(seq, "%3u %02x:%02x:%02x:%02x:%02x:%02x " + "%012llx%3c %#x%4u%4d", + idx, addr[0], addr[1], addr[2], addr[3], + addr[4], addr[5], (unsigned long long)mask, + (cls_lo & SRAM_VLD_F) ? 'Y' : 'N', + PORTMAP_G(cls_hi), + PF_G(cls_lo), + (cls_lo & VF_VALID_F) ? 
VF_G(cls_lo) : -1); + + if (replicate) { + if (adap->params.arch.mps_rplc_size > 128) + seq_printf(seq, " %08x %08x %08x %08x " + "%08x %08x %08x %08x", + rplc[7], rplc[6], rplc[5], rplc[4], + rplc[3], rplc[2], rplc[1], rplc[0]); + else + seq_printf(seq, " %08x %08x %08x %08x", + rplc[3], rplc[2], rplc[1], rplc[0]); + } else { + if (adap->params.arch.mps_rplc_size > 128) + seq_printf(seq, "%72c", ' '); + else + seq_printf(seq, "%36c", ' '); + } + + if (chip_ver > CHELSIO_T5) + seq_printf(seq, "%4u%3u%3u%3u %#x\n", + T6_SRAM_PRIO0_G(cls_lo), + T6_SRAM_PRIO1_G(cls_lo), + T6_SRAM_PRIO2_G(cls_lo), + T6_SRAM_PRIO3_G(cls_lo), + (cls_lo >> T6_MULTILISTEN0_S) & 0xf); + else + seq_printf(seq, "%4u%3u%3u%3u %#x\n", + SRAM_PRIO0_G(cls_lo), SRAM_PRIO1_G(cls_lo), + SRAM_PRIO2_G(cls_lo), SRAM_PRIO3_G(cls_lo), + (cls_lo >> MULTILISTEN0_S) & 0xf); + } +out: return 0; +} + +static inline void *mps_tcam_get_idx(struct seq_file *seq, loff_t pos) +{ + struct adapter *adap = seq->private; + int max_mac_addr = is_t4(adap->params.chip) ? + NUM_MPS_CLS_SRAM_L_INSTANCES : + NUM_MPS_T5_CLS_SRAM_L_INSTANCES; + return ((pos <= max_mac_addr) ? (void *)(uintptr_t)(pos + 1) : NULL); +} + +static void *mps_tcam_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? mps_tcam_get_idx(seq, *pos) : SEQ_START_TOKEN; +} + +static void *mps_tcam_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return mps_tcam_get_idx(seq, *pos); +} + +static void mps_tcam_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations mps_tcam_seq_ops = { + .start = mps_tcam_start, + .next = mps_tcam_next, + .stop = mps_tcam_stop, + .show = mps_tcam_show +}; + +static int mps_tcam_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &mps_tcam_seq_ops); + + if (!res) { + struct seq_file *seq = file->private_data; + + seq->private = inode->i_private; + } + return res; +} + +static const struct file_operations mps_tcam_debugfs_fops = { + .owner = THIS_MODULE, + .open = mps_tcam_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Display various sensor information. + */ +static int sensors_show(struct seq_file *seq, void *v) +{ + struct adapter *adap = seq->private; + u32 param[7], val[7]; + int ret; + + /* Note that if the sensors haven't been initialized and turned on + * we'll get values of 0, so treat those as "" ... + */ + param[0] = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_DIAG) | + FW_PARAMS_PARAM_Y_V(FW_PARAM_DEV_DIAG_TMP)); + param[1] = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_DIAG) | + FW_PARAMS_PARAM_Y_V(FW_PARAM_DEV_DIAG_VDD)); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 2, + param, val); + + if (ret < 0 || val[0] == 0) + seq_puts(seq, "Temperature: \n"); + else + seq_printf(seq, "Temperature: %dC\n", val[0]); + + if (ret < 0 || val[1] == 0) + seq_puts(seq, "Core VDD: \n"); + else + seq_printf(seq, "Core VDD: %dmV\n", val[1]); + + return 0; +} + +DEFINE_SIMPLE_DEBUGFS_FILE(sensors); + +#if IS_ENABLED(CONFIG_IPV6) +static int clip_tbl_open(struct inode *inode, struct file *file) +{ + return single_open(file, clip_tbl_show, inode->i_private); +} + +static const struct file_operations clip_tbl_debugfs_fops = { + .owner = THIS_MODULE, + .open = clip_tbl_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; +#endif + +/*RSS Table. 
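+ *
+ * (General background rather than anything hardware-specific: this is the
+ * indirection table that maps a packet's computed RSS hash to a response
+ * queue.  t4_read_rss() below returns it as an array of 16-bit queue indices,
+ * printed eight per row with the first column giving the index of the row's
+ * first entry.)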
+ */ + +static int rss_show(struct seq_file *seq, void *v, int idx) +{ + u16 *entry = v; + + seq_printf(seq, "%4d: %4u %4u %4u %4u %4u %4u %4u %4u\n", + idx * 8, entry[0], entry[1], entry[2], entry[3], entry[4], + entry[5], entry[6], entry[7]); + return 0; +} + +static int rss_open(struct inode *inode, struct file *file) +{ + struct adapter *adap = inode->i_private; + int ret, nentries; + struct seq_tab *p; + + nentries = t4_chip_rss_size(adap); + p = seq_open_tab(file, nentries / 8, 8 * sizeof(u16), 0, rss_show); + if (!p) + return -ENOMEM; + + ret = t4_read_rss(adap, (u16 *)p->data); + if (ret) + seq_release_private(inode, file); + + return ret; +} + +static const struct file_operations rss_debugfs_fops = { + .owner = THIS_MODULE, + .open = rss_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +/* RSS Configuration. + */ + +/* Small utility function to return the strings "yes" or "no" if the supplied + * argument is non-zero. + */ +static const char *yesno(int x) +{ + static const char *yes = "yes"; + static const char *no = "no"; + + return x ? yes : no; +} + +static int rss_config_show(struct seq_file *seq, void *v) +{ + struct adapter *adapter = seq->private; + static const char * const keymode[] = { + "global", + "global and per-VF scramble", + "per-PF and per-VF scramble", + "per-VF and per-VF scramble", + }; + u32 rssconf; + + rssconf = t4_read_reg(adapter, TP_RSS_CONFIG_A); + seq_printf(seq, "TP_RSS_CONFIG: %#x\n", rssconf); + seq_printf(seq, " Tnl4TupEnIpv6: %3s\n", yesno(rssconf & + TNL4TUPENIPV6_F)); + seq_printf(seq, " Tnl2TupEnIpv6: %3s\n", yesno(rssconf & + TNL2TUPENIPV6_F)); + seq_printf(seq, " Tnl4TupEnIpv4: %3s\n", yesno(rssconf & + TNL4TUPENIPV4_F)); + seq_printf(seq, " Tnl2TupEnIpv4: %3s\n", yesno(rssconf & + TNL2TUPENIPV4_F)); + seq_printf(seq, " TnlTcpSel: %3s\n", yesno(rssconf & TNLTCPSEL_F)); + seq_printf(seq, " TnlIp6Sel: %3s\n", yesno(rssconf & TNLIP6SEL_F)); + seq_printf(seq, " TnlVrtSel: %3s\n", yesno(rssconf & TNLVRTSEL_F)); + seq_printf(seq, " TnlMapEn: %3s\n", yesno(rssconf & TNLMAPEN_F)); + seq_printf(seq, " OfdHashSave: %3s\n", yesno(rssconf & + OFDHASHSAVE_F)); + seq_printf(seq, " OfdVrtSel: %3s\n", yesno(rssconf & OFDVRTSEL_F)); + seq_printf(seq, " OfdMapEn: %3s\n", yesno(rssconf & OFDMAPEN_F)); + seq_printf(seq, " OfdLkpEn: %3s\n", yesno(rssconf & OFDLKPEN_F)); + seq_printf(seq, " Syn4TupEnIpv6: %3s\n", yesno(rssconf & + SYN4TUPENIPV6_F)); + seq_printf(seq, " Syn2TupEnIpv6: %3s\n", yesno(rssconf & + SYN2TUPENIPV6_F)); + seq_printf(seq, " Syn4TupEnIpv4: %3s\n", yesno(rssconf & + SYN4TUPENIPV4_F)); + seq_printf(seq, " Syn2TupEnIpv4: %3s\n", yesno(rssconf & + SYN2TUPENIPV4_F)); + seq_printf(seq, " Syn4TupEnIpv6: %3s\n", yesno(rssconf & + SYN4TUPENIPV6_F)); + seq_printf(seq, " SynIp6Sel: %3s\n", yesno(rssconf & SYNIP6SEL_F)); + seq_printf(seq, " SynVrt6Sel: %3s\n", yesno(rssconf & SYNVRTSEL_F)); + seq_printf(seq, " SynMapEn: %3s\n", yesno(rssconf & SYNMAPEN_F)); + seq_printf(seq, " SynLkpEn: %3s\n", yesno(rssconf & SYNLKPEN_F)); + seq_printf(seq, " ChnEn: %3s\n", yesno(rssconf & + CHANNELENABLE_F)); + seq_printf(seq, " PrtEn: %3s\n", yesno(rssconf & + PORTENABLE_F)); + seq_printf(seq, " TnlAllLkp: %3s\n", yesno(rssconf & + TNLALLLOOKUP_F)); + seq_printf(seq, " VrtEn: %3s\n", yesno(rssconf & + VIRTENABLE_F)); + seq_printf(seq, " CngEn: %3s\n", yesno(rssconf & + CONGESTIONENABLE_F)); + seq_printf(seq, " HashToeplitz: %3s\n", yesno(rssconf & + HASHTOEPLITZ_F)); + seq_printf(seq, " Udp4En: %3s\n", yesno(rssconf & 
UDPENABLE_F)); + seq_printf(seq, " Disable: %3s\n", yesno(rssconf & DISABLE_F)); + + seq_puts(seq, "\n"); + + rssconf = t4_read_reg(adapter, TP_RSS_CONFIG_TNL_A); + seq_printf(seq, "TP_RSS_CONFIG_TNL: %#x\n", rssconf); + seq_printf(seq, " MaskSize: %3d\n", MASKSIZE_G(rssconf)); + seq_printf(seq, " MaskFilter: %3d\n", MASKFILTER_G(rssconf)); + if (CHELSIO_CHIP_VERSION(adapter->params.chip) > CHELSIO_T5) { + seq_printf(seq, " HashAll: %3s\n", + yesno(rssconf & HASHALL_F)); + seq_printf(seq, " HashEth: %3s\n", + yesno(rssconf & HASHETH_F)); + } + seq_printf(seq, " UseWireCh: %3s\n", yesno(rssconf & USEWIRECH_F)); + + seq_puts(seq, "\n"); + + rssconf = t4_read_reg(adapter, TP_RSS_CONFIG_OFD_A); + seq_printf(seq, "TP_RSS_CONFIG_OFD: %#x\n", rssconf); + seq_printf(seq, " MaskSize: %3d\n", MASKSIZE_G(rssconf)); + seq_printf(seq, " RRCplMapEn: %3s\n", yesno(rssconf & + RRCPLMAPEN_F)); + seq_printf(seq, " RRCplQueWidth: %3d\n", RRCPLQUEWIDTH_G(rssconf)); + + seq_puts(seq, "\n"); + + rssconf = t4_read_reg(adapter, TP_RSS_CONFIG_SYN_A); + seq_printf(seq, "TP_RSS_CONFIG_SYN: %#x\n", rssconf); + seq_printf(seq, " MaskSize: %3d\n", MASKSIZE_G(rssconf)); + seq_printf(seq, " UseWireCh: %3s\n", yesno(rssconf & USEWIRECH_F)); + + seq_puts(seq, "\n"); + + rssconf = t4_read_reg(adapter, TP_RSS_CONFIG_VRT_A); + seq_printf(seq, "TP_RSS_CONFIG_VRT: %#x\n", rssconf); + if (CHELSIO_CHIP_VERSION(adapter->params.chip) > CHELSIO_T5) { + seq_printf(seq, " KeyWrAddrX: %3d\n", + KEYWRADDRX_G(rssconf)); + seq_printf(seq, " KeyExtend: %3s\n", + yesno(rssconf & KEYEXTEND_F)); + } + seq_printf(seq, " VfRdRg: %3s\n", yesno(rssconf & VFRDRG_F)); + seq_printf(seq, " VfRdEn: %3s\n", yesno(rssconf & VFRDEN_F)); + seq_printf(seq, " VfPerrEn: %3s\n", yesno(rssconf & VFPERREN_F)); + seq_printf(seq, " KeyPerrEn: %3s\n", yesno(rssconf & KEYPERREN_F)); + seq_printf(seq, " DisVfVlan: %3s\n", yesno(rssconf & + DISABLEVLAN_F)); + seq_printf(seq, " EnUpSwt: %3s\n", yesno(rssconf & ENABLEUP0_F)); + seq_printf(seq, " HashDelay: %3d\n", HASHDELAY_G(rssconf)); + if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) + seq_printf(seq, " VfWrAddr: %3d\n", VFWRADDR_G(rssconf)); + else + seq_printf(seq, " VfWrAddr: %3d\n", + T6_VFWRADDR_G(rssconf)); + seq_printf(seq, " KeyMode: %s\n", keymode[KEYMODE_G(rssconf)]); + seq_printf(seq, " VfWrEn: %3s\n", yesno(rssconf & VFWREN_F)); + seq_printf(seq, " KeyWrEn: %3s\n", yesno(rssconf & KEYWREN_F)); + seq_printf(seq, " KeyWrAddr: %3d\n", KEYWRADDR_G(rssconf)); + + seq_puts(seq, "\n"); + + rssconf = t4_read_reg(adapter, TP_RSS_CONFIG_CNG_A); + seq_printf(seq, "TP_RSS_CONFIG_CNG: %#x\n", rssconf); + seq_printf(seq, " ChnCount3: %3s\n", yesno(rssconf & CHNCOUNT3_F)); + seq_printf(seq, " ChnCount2: %3s\n", yesno(rssconf & CHNCOUNT2_F)); + seq_printf(seq, " ChnCount1: %3s\n", yesno(rssconf & CHNCOUNT1_F)); + seq_printf(seq, " ChnCount0: %3s\n", yesno(rssconf & CHNCOUNT0_F)); + seq_printf(seq, " ChnUndFlow3: %3s\n", yesno(rssconf & + CHNUNDFLOW3_F)); + seq_printf(seq, " ChnUndFlow2: %3s\n", yesno(rssconf & + CHNUNDFLOW2_F)); + seq_printf(seq, " ChnUndFlow1: %3s\n", yesno(rssconf & + CHNUNDFLOW1_F)); + seq_printf(seq, " ChnUndFlow0: %3s\n", yesno(rssconf & + CHNUNDFLOW0_F)); + seq_printf(seq, " RstChn3: %3s\n", yesno(rssconf & RSTCHN3_F)); + seq_printf(seq, " RstChn2: %3s\n", yesno(rssconf & RSTCHN2_F)); + seq_printf(seq, " RstChn1: %3s\n", yesno(rssconf & RSTCHN1_F)); + seq_printf(seq, " RstChn0: %3s\n", yesno(rssconf & RSTCHN0_F)); + seq_printf(seq, " UpdVld: %3s\n", yesno(rssconf & UPDVLD_F)); + 
seq_printf(seq, " Xoff: %3s\n", yesno(rssconf & XOFF_F)); + seq_printf(seq, " UpdChn3: %3s\n", yesno(rssconf & UPDCHN3_F)); + seq_printf(seq, " UpdChn2: %3s\n", yesno(rssconf & UPDCHN2_F)); + seq_printf(seq, " UpdChn1: %3s\n", yesno(rssconf & UPDCHN1_F)); + seq_printf(seq, " UpdChn0: %3s\n", yesno(rssconf & UPDCHN0_F)); + seq_printf(seq, " Queue: %3d\n", QUEUE_G(rssconf)); + + return 0; +} + +DEFINE_SIMPLE_DEBUGFS_FILE(rss_config); + +/* RSS Secret Key. + */ + +static int rss_key_show(struct seq_file *seq, void *v) +{ + u32 key[10]; + + t4_read_rss_key(seq->private, key, true); + seq_printf(seq, "%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x\n", + key[9], key[8], key[7], key[6], key[5], key[4], key[3], + key[2], key[1], key[0]); + return 0; +} + +static int rss_key_open(struct inode *inode, struct file *file) +{ + return single_open(file, rss_key_show, inode->i_private); +} + +static ssize_t rss_key_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + int i, j; + u32 key[10]; + char s[100], *p; + struct adapter *adap = file_inode(file)->i_private; + + if (count > sizeof(s) - 1) + return -EINVAL; + if (copy_from_user(s, buf, count)) + return -EFAULT; + for (i = count; i > 0 && isspace(s[i - 1]); i--) + ; + s[i] = '\0'; + + for (p = s, i = 9; i >= 0; i--) { + key[i] = 0; + for (j = 0; j < 8; j++, p++) { + if (!isxdigit(*p)) + return -EINVAL; + key[i] = (key[i] << 4) | hex2val(*p); + } + } + + t4_write_rss_key(adap, key, -1, true); + return count; +} + +static const struct file_operations rss_key_debugfs_fops = { + .owner = THIS_MODULE, + .open = rss_key_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = rss_key_write +}; + +/* PF RSS Configuration. + */ + +struct rss_pf_conf { + u32 rss_pf_map; + u32 rss_pf_mask; + u32 rss_pf_config; +}; + +static int rss_pf_config_show(struct seq_file *seq, void *v, int idx) +{ + struct rss_pf_conf *pfconf; + + if (v == SEQ_START_TOKEN) { + /* use the 0th entry to dump the PF Map Index Size */ + pfconf = seq->private + offsetof(struct seq_tab, data); + seq_printf(seq, "PF Map Index Size = %d\n\n", + LKPIDXSIZE_G(pfconf->rss_pf_map)); + + seq_puts(seq, " RSS PF VF Hash Tuple Enable Default\n"); + seq_puts(seq, " Enable IPF Mask Mask IPv6 IPv4 UDP Queue\n"); + seq_puts(seq, " PF Map Chn Prt Map Size Size Four Two Four Two Four Ch1 Ch0\n"); + } else { + #define G_PFnLKPIDX(map, n) \ + (((map) >> PF1LKPIDX_S*(n)) & PF0LKPIDX_M) + #define G_PFnMSKSIZE(mask, n) \ + (((mask) >> PF1MSKSIZE_S*(n)) & PF1MSKSIZE_M) + + pfconf = v; + seq_printf(seq, "%3d %3s %3s %3s %3d %3d %3d %3s %3s %3s %3s %3s %3d %3d\n", + idx, + yesno(pfconf->rss_pf_config & MAPENABLE_F), + yesno(pfconf->rss_pf_config & CHNENABLE_F), + yesno(pfconf->rss_pf_config & PRTENABLE_F), + G_PFnLKPIDX(pfconf->rss_pf_map, idx), + G_PFnMSKSIZE(pfconf->rss_pf_mask, idx), + IVFWIDTH_G(pfconf->rss_pf_config), + yesno(pfconf->rss_pf_config & IP6FOURTUPEN_F), + yesno(pfconf->rss_pf_config & IP6TWOTUPEN_F), + yesno(pfconf->rss_pf_config & IP4FOURTUPEN_F), + yesno(pfconf->rss_pf_config & IP4TWOTUPEN_F), + yesno(pfconf->rss_pf_config & UDPFOURTUPEN_F), + CH1DEFAULTQUEUE_G(pfconf->rss_pf_config), + CH0DEFAULTQUEUE_G(pfconf->rss_pf_config)); + + #undef G_PFnLKPIDX + #undef G_PFnMSKSIZE + } + return 0; +} + +static int rss_pf_config_open(struct inode *inode, struct file *file) +{ + struct adapter *adapter = inode->i_private; + struct seq_tab *p; + u32 rss_pf_map, rss_pf_mask; + struct rss_pf_conf *pfconf; + int pf; + + p = seq_open_tab(file, 8, 
sizeof(*pfconf), 1, rss_pf_config_show); + if (!p) + return -ENOMEM; + + pfconf = (struct rss_pf_conf *)p->data; + rss_pf_map = t4_read_rss_pf_map(adapter, true); + rss_pf_mask = t4_read_rss_pf_mask(adapter, true); + for (pf = 0; pf < 8; pf++) { + pfconf[pf].rss_pf_map = rss_pf_map; + pfconf[pf].rss_pf_mask = rss_pf_mask; + t4_read_rss_pf_config(adapter, pf, &pfconf[pf].rss_pf_config, + true); + } + return 0; +} + +static const struct file_operations rss_pf_config_debugfs_fops = { + .owner = THIS_MODULE, + .open = rss_pf_config_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +/* VF RSS Configuration. + */ + +struct rss_vf_conf { + u32 rss_vf_vfl; + u32 rss_vf_vfh; +}; + +static int rss_vf_config_show(struct seq_file *seq, void *v, int idx) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, " RSS Hash Tuple Enable\n"); + seq_puts(seq, " Enable IVF Dis Enb IPv6 IPv4 UDP Def Secret Key\n"); + seq_puts(seq, " VF Chn Prt Map VLAN uP Four Two Four Two Four Que Idx Hash\n"); + } else { + struct rss_vf_conf *vfconf = v; + + seq_printf(seq, "%3d %3s %3s %3d %3s %3s %3s %3s %3s %3s %3s %4d %3d %#10x\n", + idx, + yesno(vfconf->rss_vf_vfh & VFCHNEN_F), + yesno(vfconf->rss_vf_vfh & VFPRTEN_F), + VFLKPIDX_G(vfconf->rss_vf_vfh), + yesno(vfconf->rss_vf_vfh & VFVLNEX_F), + yesno(vfconf->rss_vf_vfh & VFUPEN_F), + yesno(vfconf->rss_vf_vfh & VFIP4FOURTUPEN_F), + yesno(vfconf->rss_vf_vfh & VFIP6TWOTUPEN_F), + yesno(vfconf->rss_vf_vfh & VFIP4FOURTUPEN_F), + yesno(vfconf->rss_vf_vfh & VFIP4TWOTUPEN_F), + yesno(vfconf->rss_vf_vfh & ENABLEUDPHASH_F), + DEFAULTQUEUE_G(vfconf->rss_vf_vfh), + KEYINDEX_G(vfconf->rss_vf_vfh), + vfconf->rss_vf_vfl); + } + return 0; +} + +static int rss_vf_config_open(struct inode *inode, struct file *file) +{ + struct adapter *adapter = inode->i_private; + struct seq_tab *p; + struct rss_vf_conf *vfconf; + int vf, vfcount = adapter->params.arch.vfcount; + + p = seq_open_tab(file, vfcount, sizeof(*vfconf), 1, rss_vf_config_show); + if (!p) + return -ENOMEM; + + vfconf = (struct rss_vf_conf *)p->data; + for (vf = 0; vf < vfcount; vf++) { + t4_read_rss_vf_config(adapter, vf, &vfconf[vf].rss_vf_vfl, + &vfconf[vf].rss_vf_vfh, true); + } + return 0; +} + +static const struct file_operations rss_vf_config_debugfs_fops = { + .owner = THIS_MODULE, + .open = rss_vf_config_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private +}; + +/** + * ethqset2pinfo - return port_info of an Ethernet Queue Set + * @adap: the adapter + * @qset: Ethernet Queue Set + */ +static inline struct port_info *ethqset2pinfo(struct adapter *adap, int qset) +{ + int pidx; + + for_each_port(adap, pidx) { + struct port_info *pi = adap2pinfo(adap, pidx); + + if (qset >= pi->first_qset && + qset < pi->first_qset + pi->nqsets) + return pi; + } + + /* should never happen! 
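+	 * Every Ethernet queue set is assigned to exactly one port's
+	 * [first_qset, first_qset + nqsets) range when the queues are set up,
+	 * so falling through the loop above means the qset bookkeeping is
+	 * inconsistent.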
*/ + BUG_ON(1); + return NULL; +} + +static int sge_qinfo_show(struct seq_file *seq, void *v) +{ + struct adapter *adap = seq->private; + int eth_entries = DIV_ROUND_UP(adap->sge.ethqsets, 4); + int ofld_entries = DIV_ROUND_UP(adap->sge.ofldqsets, 4); + int ctrl_entries = DIV_ROUND_UP(MAX_CTRL_QUEUES, 4); + int i, r = (uintptr_t)v - 1; + int ofld_idx = r - eth_entries; + int ctrl_idx = ofld_idx - ofld_entries; + int fq_idx = ctrl_idx - ctrl_entries; + + if (r) + seq_putc(seq, '\n'); + +#define S3(fmt_spec, s, v) \ +do { \ + seq_printf(seq, "%-12s", s); \ + for (i = 0; i < n; ++i) \ + seq_printf(seq, " %16" fmt_spec, v); \ + seq_putc(seq, '\n'); \ +} while (0) +#define S(s, v) S3("s", s, v) +#define T3(fmt_spec, s, v) S3(fmt_spec, s, tx[i].v) +#define T(s, v) S3("u", s, tx[i].v) +#define TL(s, v) T3("lu", s, v) +#define R3(fmt_spec, s, v) S3(fmt_spec, s, rx[i].v) +#define R(s, v) S3("u", s, rx[i].v) +#define RL(s, v) R3("lu", s, v) + + if (r < eth_entries) { + int base_qset = r * 4; + const struct sge_eth_rxq *rx = &adap->sge.ethrxq[base_qset]; + const struct sge_eth_txq *tx = &adap->sge.ethtxq[base_qset]; + int n = min(4, adap->sge.ethqsets - 4 * r); + + S("QType:", "Ethernet"); + S("Interface:", + rx[i].rspq.netdev ? rx[i].rspq.netdev->name : "N/A"); + T("TxQ ID:", q.cntxt_id); + T("TxQ size:", q.size); + T("TxQ inuse:", q.in_use); + T("TxQ CIDX:", q.cidx); + T("TxQ PIDX:", q.pidx); +#ifdef CONFIG_CHELSIO_T4_DCB + T("DCB Prio:", dcb_prio); + S3("u", "DCB PGID:", + (ethqset2pinfo(adap, base_qset + i)->dcb.pgid >> + 4*(7-tx[i].dcb_prio)) & 0xf); + S3("u", "DCB PFC:", + (ethqset2pinfo(adap, base_qset + i)->dcb.pfcen >> + 1*(7-tx[i].dcb_prio)) & 0x1); +#endif + R("RspQ ID:", rspq.abs_id); + R("RspQ size:", rspq.size); + R("RspQE size:", rspq.iqe_len); + R("RspQ CIDX:", rspq.cidx); + R("RspQ Gen:", rspq.gen); + S3("u", "Intr delay:", qtimer_val(adap, &rx[i].rspq)); + S3("u", "Intr pktcnt:", + adap->sge.counter_val[rx[i].rspq.pktcnt_idx]); + R("FL ID:", fl.cntxt_id); + R("FL size:", fl.size - 8); + R("FL pend:", fl.pend_cred); + R("FL avail:", fl.avail); + R("FL PIDX:", fl.pidx); + R("FL CIDX:", fl.cidx); + RL("RxPackets:", stats.pkts); + RL("RxCSO:", stats.rx_cso); + RL("VLANxtract:", stats.vlan_ex); + RL("LROmerged:", stats.lro_merged); + RL("LROpackets:", stats.lro_pkts); + RL("RxDrops:", stats.rx_drops); + TL("TSO:", tso); + TL("TxCSO:", tx_cso); + TL("VLANins:", vlan_ins); + TL("TxQFull:", q.stops); + TL("TxQRestarts:", q.restarts); + TL("TxMapErr:", mapping_err); + RL("FLAllocErr:", fl.alloc_failed); + RL("FLLrgAlcErr:", fl.large_alloc_failed); + RL("FLMapErr:", fl.mapping_err); + RL("FLLow:", fl.low); + RL("FLStarving:", fl.starving); + + } else if (ctrl_idx < ctrl_entries) { + const struct sge_ctrl_txq *tx = &adap->sge.ctrlq[ctrl_idx * 4]; + int n = min(4, adap->params.nports - 4 * ctrl_idx); + + S("QType:", "Control"); + T("TxQ ID:", q.cntxt_id); + T("TxQ size:", q.size); + T("TxQ inuse:", q.in_use); + T("TxQ CIDX:", q.cidx); + T("TxQ PIDX:", q.pidx); + TL("TxQFull:", q.stops); + TL("TxQRestarts:", q.restarts); + } else if (fq_idx == 0) { + const struct sge_rspq *evtq = &adap->sge.fw_evtq; + + seq_printf(seq, "%-12s %16s\n", "QType:", "FW event queue"); + seq_printf(seq, "%-12s %16u\n", "RspQ ID:", evtq->abs_id); + seq_printf(seq, "%-12s %16u\n", "RspQ size:", evtq->size); + seq_printf(seq, "%-12s %16u\n", "RspQE size:", evtq->iqe_len); + seq_printf(seq, "%-12s %16u\n", "RspQ CIDX:", evtq->cidx); + seq_printf(seq, "%-12s %16u\n", "RspQ Gen:", evtq->gen); + seq_printf(seq, "%-12s 
%16u\n", "Intr delay:", + qtimer_val(adap, evtq)); + seq_printf(seq, "%-12s %16u\n", "Intr pktcnt:", + adap->sge.counter_val[evtq->pktcnt_idx]); + } +#undef R +#undef RL +#undef T +#undef TL +#undef S +#undef R3 +#undef T3 +#undef S3 + return 0; +} + +static int sge_queue_entries(const struct adapter *adap) +{ + return DIV_ROUND_UP(adap->sge.ethqsets, 4) + + DIV_ROUND_UP(adap->sge.ofldqsets, 4) + + DIV_ROUND_UP(MAX_CTRL_QUEUES, 4) + 1; +} + +static void *sge_queue_start(struct seq_file *seq, loff_t *pos) +{ + int entries = sge_queue_entries(seq->private); + + return *pos < entries ? (void *)((uintptr_t)*pos + 1) : NULL; +} + +static void sge_queue_stop(struct seq_file *seq, void *v) +{ +} + +static void *sge_queue_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int entries = sge_queue_entries(seq->private); + + ++*pos; + return *pos < entries ? (void *)((uintptr_t)*pos + 1) : NULL; +} + +static const struct seq_operations sge_qinfo_seq_ops = { + .start = sge_queue_start, + .next = sge_queue_next, + .stop = sge_queue_stop, + .show = sge_qinfo_show +}; + +static int sge_qinfo_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &sge_qinfo_seq_ops); + + if (!res) { + struct seq_file *seq = file->private_data; + + seq->private = inode->i_private; + } + return res; +} + +static const struct file_operations sge_qinfo_debugfs_fops = { + .owner = THIS_MODULE, + .open = sge_qinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int mem_open(struct inode *inode, struct file *file) +{ + unsigned int mem; + struct adapter *adap; + + file->private_data = inode->i_private; + + mem = (uintptr_t)file->private_data & 0x7; + adap = file->private_data - mem; + + (void)t4_fwcache(adap, FW_PARAM_DEV_FWCACHE_FLUSH); + + return 0; +} + +static ssize_t mem_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + loff_t pos = *ppos; + loff_t avail = file_inode(file)->i_size; + unsigned int mem = (uintptr_t)file->private_data & 0x7; + struct adapter *adap = file->private_data - mem; + __be32 *data; + int ret; + + if (pos < 0) + return -EINVAL; + if (pos >= avail) + return 0; + if (count > avail - pos) + count = avail - pos; + + data = kvzalloc(count, GFP_KERNEL); + if (!data) + return -ENOMEM; + + spin_lock(&adap->win0_lock); + ret = t4_memory_rw(adap, 0, mem, pos, count, data, T4_MEMORY_READ); + spin_unlock(&adap->win0_lock); + if (ret) { + kvfree(data); + return ret; + } + ret = copy_to_user(buf, data, count); + + kvfree(data); + if (ret) + return -EFAULT; + + *ppos = pos + count; + return count; +} +static const struct file_operations mem_debugfs_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mem_read, + .llseek = default_llseek, +}; + +static int tid_info_show(struct seq_file *seq, void *v) +{ + unsigned int tid_start = 0; + struct adapter *adap = seq->private; + const struct tid_info *t = &adap->tids; + enum chip_type chip = CHELSIO_CHIP_VERSION(adap->params.chip); + + if (chip > CHELSIO_T5) + tid_start = t4_read_reg(adap, LE_DB_ACTIVE_TABLE_START_INDEX_A); + + if (t4_read_reg(adap, LE_DB_CONFIG_A) & HASHEN_F) { + unsigned int sb; + seq_printf(seq, "Connections in use: %u\n", + atomic_read(&t->conns_in_use)); + + if (chip <= CHELSIO_T5) + sb = t4_read_reg(adap, LE_DB_SERVER_INDEX_A) / 4; + else + sb = t4_read_reg(adap, LE_DB_SRVR_START_INDEX_A); + + if (sb) { + seq_printf(seq, "TID range: %u..%u/%u..%u", tid_start, + sb - 1, adap->tids.hash_base, + t->ntids - 1); + seq_printf(seq, ", in use: %u/%u\n", + 
atomic_read(&t->tids_in_use), + atomic_read(&t->hash_tids_in_use)); + } else if (adap->flags & FW_OFLD_CONN) { + seq_printf(seq, "TID range: %u..%u/%u..%u", + t->aftid_base, + t->aftid_end, + adap->tids.hash_base, + t->ntids - 1); + seq_printf(seq, ", in use: %u/%u\n", + atomic_read(&t->tids_in_use), + atomic_read(&t->hash_tids_in_use)); + } else { + seq_printf(seq, "TID range: %u..%u", + adap->tids.hash_base, + t->ntids - 1); + seq_printf(seq, ", in use: %u\n", + atomic_read(&t->hash_tids_in_use)); + } + } else if (t->ntids) { + seq_printf(seq, "Connections in use: %u\n", + atomic_read(&t->conns_in_use)); + + seq_printf(seq, "TID range: %u..%u", tid_start, + tid_start + t->ntids - 1); + seq_printf(seq, ", in use: %u\n", + atomic_read(&t->tids_in_use)); + } + + if (t->nstids) + seq_printf(seq, "STID range: %u..%u, in use-IPv4/IPv6: %u/%u\n", + (!t->stid_base && + (chip <= CHELSIO_T5)) ? + t->stid_base + 1 : t->stid_base, + t->stid_base + t->nstids - 1, + t->stids_in_use - t->v6_stids_in_use, + t->v6_stids_in_use); + + if (t->natids) + seq_printf(seq, "ATID range: 0..%u, in use: %u\n", + t->natids - 1, t->atids_in_use); + seq_printf(seq, "FTID range: %u..%u\n", t->ftid_base, + t->ftid_base + t->nftids - 1); + if (t->nsftids) + seq_printf(seq, "SFTID range: %u..%u in use: %u\n", + t->sftid_base, t->sftid_base + t->nsftids - 2, + t->sftids_in_use); + if (t->ntids) + seq_printf(seq, "HW TID usage: %u IP users, %u IPv6 users\n", + t4_read_reg(adap, LE_DB_ACT_CNT_IPV4_A), + t4_read_reg(adap, LE_DB_ACT_CNT_IPV6_A)); + return 0; +} + +DEFINE_SIMPLE_DEBUGFS_FILE(tid_info); + +static void add_debugfs_mem(struct adapter *adap, const char *name, + unsigned int idx, unsigned int size_mb) +{ + debugfs_create_file_size(name, 0400, adap->debugfs_root, + (void *)adap + idx, &mem_debugfs_fops, + size_mb << 20); +} + +static ssize_t blocked_fl_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + int len; + const struct adapter *adap = filp->private_data; + char *buf; + ssize_t size = (adap->sge.egr_sz + 3) / 4 + + adap->sge.egr_sz / 32 + 2; /* includes ,/\n/\0 */ + + buf = kzalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + len = snprintf(buf, size - 1, "%*pb\n", + adap->sge.egr_sz, adap->sge.blocked_fl); + len += sprintf(buf + len, "\n"); + size = simple_read_from_buffer(ubuf, count, ppos, buf, len); + kvfree(buf); + return size; +} + +static ssize_t blocked_fl_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int err; + unsigned long *t; + struct adapter *adap = filp->private_data; + + t = kcalloc(BITS_TO_LONGS(adap->sge.egr_sz), sizeof(long), GFP_KERNEL); + if (!t) + return -ENOMEM; + + err = bitmap_parse_user(ubuf, count, t, adap->sge.egr_sz); + if (err) + return err; + + bitmap_copy(adap->sge.blocked_fl, t, adap->sge.egr_sz); + kvfree(t); + return count; +} + +static const struct file_operations blocked_fl_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = blocked_fl_read, + .write = blocked_fl_write, + .llseek = generic_file_llseek, +}; + +static void mem_region_show(struct seq_file *seq, const char *name, + unsigned int from, unsigned int to) +{ + char buf[40]; + + string_get_size((u64)to - from + 1, 1, STRING_UNITS_2, buf, + sizeof(buf)); + seq_printf(seq, "%-15s %#x-%#x [%s]\n", name, from, to, buf); +} + +static int meminfo_show(struct seq_file *seq, void *v) +{ + static const char * const memory[] = { "EDC0:", "EDC1:", "MC:", + "MC0:", "MC1:", "HMA:"}; + struct adapter *adap = seq->private; + struct cudbg_meminfo 
meminfo; + int i, rc; + + memset(&meminfo, 0, sizeof(struct cudbg_meminfo)); + rc = cudbg_fill_meminfo(adap, &meminfo); + if (rc) + return -ENXIO; + + for (i = 0; i < meminfo.avail_c; i++) + mem_region_show(seq, memory[meminfo.avail[i].idx], + meminfo.avail[i].base, + meminfo.avail[i].limit - 1); + + seq_putc(seq, '\n'); + for (i = 0; i < meminfo.mem_c; i++) { + if (meminfo.mem[i].idx >= ARRAY_SIZE(cudbg_region)) + continue; /* skip holes */ + if (!meminfo.mem[i].limit) + meminfo.mem[i].limit = + i < meminfo.mem_c - 1 ? + meminfo.mem[i + 1].base - 1 : ~0; + mem_region_show(seq, cudbg_region[meminfo.mem[i].idx], + meminfo.mem[i].base, meminfo.mem[i].limit); + } + + seq_putc(seq, '\n'); + mem_region_show(seq, "uP RAM:", meminfo.up_ram_lo, meminfo.up_ram_hi); + mem_region_show(seq, "uP Extmem2:", meminfo.up_extmem2_lo, + meminfo.up_extmem2_hi); + + seq_printf(seq, "\n%u Rx pages of size %uKiB for %u channels\n", + meminfo.rx_pages_data[0], meminfo.rx_pages_data[1], + meminfo.rx_pages_data[2]); + + seq_printf(seq, "%u Tx pages of size %u%ciB for %u channels\n", + meminfo.tx_pages_data[0], meminfo.tx_pages_data[1], + meminfo.tx_pages_data[2], meminfo.tx_pages_data[3]); + + seq_printf(seq, "%u p-structs\n\n", meminfo.p_structs); + + for (i = 0; i < 4; i++) + /* For T6 these are MAC buffer groups */ + seq_printf(seq, "Port %d using %u pages out of %u allocated\n", + i, meminfo.port_used[i], meminfo.port_alloc[i]); + + for (i = 0; i < adap->params.arch.nchan; i++) + /* For T6 these are MAC buffer groups */ + seq_printf(seq, + "Loopback %d using %u pages out of %u allocated\n", + i, meminfo.loopback_used[i], + meminfo.loopback_alloc[i]); + + return 0; +} + +static int meminfo_open(struct inode *inode, struct file *file) +{ + return single_open(file, meminfo_show, inode->i_private); +} + +static const struct file_operations meminfo_fops = { + .owner = THIS_MODULE, + .open = meminfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int chcr_show(struct seq_file *seq, void *v) +{ + struct adapter *adap = seq->private; + + seq_puts(seq, "Chelsio Crypto Accelerator Stats \n"); + seq_printf(seq, "Cipher Ops: %10u \n", + atomic_read(&adap->chcr_stats.cipher_rqst)); + seq_printf(seq, "Digest Ops: %10u \n", + atomic_read(&adap->chcr_stats.digest_rqst)); + seq_printf(seq, "Aead Ops: %10u \n", + atomic_read(&adap->chcr_stats.aead_rqst)); + seq_printf(seq, "Completion: %10u \n", + atomic_read(&adap->chcr_stats.complete)); + seq_printf(seq, "Error: %10u \n", + atomic_read(&adap->chcr_stats.error)); + seq_printf(seq, "Fallback: %10u \n", + atomic_read(&adap->chcr_stats.fallback)); + seq_printf(seq, "IPSec PDU: %10u\n", + atomic_read(&adap->chcr_stats.ipsec_cnt)); + return 0; +} + + +static int chcr_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, chcr_show, inode->i_private); +} + +static const struct file_operations chcr_stats_debugfs_fops = { + .owner = THIS_MODULE, + .open = chcr_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +/* Add an array of Debug FS files. 
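+ * Each entry's 'data' value is encoded into the debugfs i_private
+ * pointer as (void *)adap + data, so the file operations can recover
+ * both the adapter and the per-file index from the pointer's low bits
+ * (mem_read() above uses the same decoding pattern).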
+ */ +void add_debugfs_files(struct adapter *adap, + struct t4_debugfs_entry *files, + unsigned int nfiles) +{ + int i; + + /* debugfs support is best effort */ + for (i = 0; i < nfiles; i++) + debugfs_create_file(files[i].name, files[i].mode, + adap->debugfs_root, + (void *)adap + files[i].data, + files[i].ops); +} + +int t4_setup_debugfs(struct adapter *adap) +{ + int i; + u32 size = 0; + struct dentry *de; + + static struct t4_debugfs_entry t4_debugfs_files[] = { + { "cim_la", &cim_la_fops, 0400, 0 }, + { "cim_pif_la", &cim_pif_la_fops, 0400, 0 }, + { "cim_ma_la", &cim_ma_la_fops, 0400, 0 }, + { "cim_qcfg", &cim_qcfg_fops, 0400, 0 }, + { "clk", &clk_debugfs_fops, 0400, 0 }, + { "devlog", &devlog_fops, 0400, 0 }, + { "mboxlog", &mboxlog_fops, 0400, 0 }, + { "mbox0", &mbox_debugfs_fops, 0600, 0 }, + { "mbox1", &mbox_debugfs_fops, 0600, 1 }, + { "mbox2", &mbox_debugfs_fops, 0600, 2 }, + { "mbox3", &mbox_debugfs_fops, 0600, 3 }, + { "mbox4", &mbox_debugfs_fops, 0600, 4 }, + { "mbox5", &mbox_debugfs_fops, 0600, 5 }, + { "mbox6", &mbox_debugfs_fops, 0600, 6 }, + { "mbox7", &mbox_debugfs_fops, 0600, 7 }, + { "trace0", &mps_trc_debugfs_fops, 0600, 0 }, + { "trace1", &mps_trc_debugfs_fops, 0600, 1 }, + { "trace2", &mps_trc_debugfs_fops, 0600, 2 }, + { "trace3", &mps_trc_debugfs_fops, 0600, 3 }, + { "l2t", &t4_l2t_fops, 0400, 0}, + { "mps_tcam", &mps_tcam_debugfs_fops, 0400, 0 }, + { "rss", &rss_debugfs_fops, 0400, 0 }, + { "rss_config", &rss_config_debugfs_fops, 0400, 0 }, + { "rss_key", &rss_key_debugfs_fops, 0400, 0 }, + { "rss_pf_config", &rss_pf_config_debugfs_fops, 0400, 0 }, + { "rss_vf_config", &rss_vf_config_debugfs_fops, 0400, 0 }, + { "sge_qinfo", &sge_qinfo_debugfs_fops, 0400, 0 }, + { "ibq_tp0", &cim_ibq_fops, 0400, 0 }, + { "ibq_tp1", &cim_ibq_fops, 0400, 1 }, + { "ibq_ulp", &cim_ibq_fops, 0400, 2 }, + { "ibq_sge0", &cim_ibq_fops, 0400, 3 }, + { "ibq_sge1", &cim_ibq_fops, 0400, 4 }, + { "ibq_ncsi", &cim_ibq_fops, 0400, 5 }, + { "obq_ulp0", &cim_obq_fops, 0400, 0 }, + { "obq_ulp1", &cim_obq_fops, 0400, 1 }, + { "obq_ulp2", &cim_obq_fops, 0400, 2 }, + { "obq_ulp3", &cim_obq_fops, 0400, 3 }, + { "obq_sge", &cim_obq_fops, 0400, 4 }, + { "obq_ncsi", &cim_obq_fops, 0400, 5 }, + { "tp_la", &tp_la_fops, 0400, 0 }, + { "ulprx_la", &ulprx_la_fops, 0400, 0 }, + { "sensors", &sensors_debugfs_fops, 0400, 0 }, + { "pm_stats", &pm_stats_debugfs_fops, 0400, 0 }, + { "tx_rate", &tx_rate_debugfs_fops, 0400, 0 }, + { "cctrl", &cctrl_tbl_debugfs_fops, 0400, 0 }, +#if IS_ENABLED(CONFIG_IPV6) + { "clip_tbl", &clip_tbl_debugfs_fops, 0400, 0 }, +#endif + { "tids", &tid_info_debugfs_fops, 0400, 0}, + { "blocked_fl", &blocked_fl_fops, 0600, 0 }, + { "meminfo", &meminfo_fops, 0400, 0 }, + { "crypto", &chcr_stats_debugfs_fops, 0400, 0 }, + }; + + /* Debug FS nodes common to all T5 and later adapters. 
+ */ + static struct t4_debugfs_entry t5_debugfs_files[] = { + { "obq_sge_rx_q0", &cim_obq_fops, 0400, 6 }, + { "obq_sge_rx_q1", &cim_obq_fops, 0400, 7 }, + }; + + add_debugfs_files(adap, + t4_debugfs_files, + ARRAY_SIZE(t4_debugfs_files)); + if (!is_t4(adap->params.chip)) + add_debugfs_files(adap, + t5_debugfs_files, + ARRAY_SIZE(t5_debugfs_files)); + + i = t4_read_reg(adap, MA_TARGET_MEM_ENABLE_A); + if (i & EDRAM0_ENABLE_F) { + size = t4_read_reg(adap, MA_EDRAM0_BAR_A); + add_debugfs_mem(adap, "edc0", MEM_EDC0, EDRAM0_SIZE_G(size)); + } + if (i & EDRAM1_ENABLE_F) { + size = t4_read_reg(adap, MA_EDRAM1_BAR_A); + add_debugfs_mem(adap, "edc1", MEM_EDC1, EDRAM1_SIZE_G(size)); + } + if (is_t5(adap->params.chip)) { + if (i & EXT_MEM0_ENABLE_F) { + size = t4_read_reg(adap, MA_EXT_MEMORY0_BAR_A); + add_debugfs_mem(adap, "mc0", MEM_MC0, + EXT_MEM0_SIZE_G(size)); + } + if (i & EXT_MEM1_ENABLE_F) { + size = t4_read_reg(adap, MA_EXT_MEMORY1_BAR_A); + add_debugfs_mem(adap, "mc1", MEM_MC1, + EXT_MEM1_SIZE_G(size)); + } + } else { + if (i & EXT_MEM_ENABLE_F) { + size = t4_read_reg(adap, MA_EXT_MEMORY_BAR_A); + add_debugfs_mem(adap, "mc", MEM_MC, + EXT_MEM_SIZE_G(size)); + } + + if (i & HMA_MUX_F) { + size = t4_read_reg(adap, MA_EXT_MEMORY1_BAR_A); + add_debugfs_mem(adap, "hma", MEM_HMA, + EXT_MEM1_SIZE_G(size)); + } + } + + de = debugfs_create_file_size("flash", 0400, adap->debugfs_root, adap, + &flash_debugfs_fops, adap->params.sf_size); + debugfs_create_bool("use_backdoor", 0600, + adap->debugfs_root, &adap->use_bd); + debugfs_create_bool("trace_rss", 0600, + adap->debugfs_root, &adap->trace_rss); + + return 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.h new file mode 100644 index 0000000..23f43a0 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.h @@ -0,0 +1,83 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __CXGB4_DEBUGFS_H +#define __CXGB4_DEBUGFS_H + +#include + +#define DEFINE_SIMPLE_DEBUGFS_FILE(name) \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, name##_show, inode->i_private); \ +} \ +static const struct file_operations name##_debugfs_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release \ +} + +struct t4_debugfs_entry { + const char *name; + const struct file_operations *ops; + umode_t mode; + unsigned char data; +}; + +struct seq_tab { + int (*show)(struct seq_file *seq, void *v, int idx); + unsigned int rows; /* # of entries */ + unsigned char width; /* size in bytes of each entry */ + unsigned char skip_first; /* whether the first line is a header */ + char data[0]; /* the table data */ +}; + +static inline unsigned int hex2val(char c) +{ + return isdigit(c) ? c - '0' : tolower(c) - 'a' + 10; +} + +struct seq_tab *seq_open_tab(struct file *f, unsigned int rows, + unsigned int width, unsigned int have_header, + int (*show)(struct seq_file *seq, void *v, int i)); + +int t4_setup_debugfs(struct adapter *adap); +void add_debugfs_files(struct adapter *adap, + struct t4_debugfs_entry *files, + unsigned int nfiles); +int mem_open(struct inode *inode, struct file *file); + +#endif diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c new file mode 100644 index 0000000..59d04d7 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -0,0 +1,1547 @@ +/* + * Copyright (C) 2013-2015 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + */ + +#include +#include + +#include "cxgb4.h" +#include "t4_regs.h" +#include "t4fw_api.h" +#include "cxgb4_cudbg.h" + +#define EEPROM_MAGIC 0x38E2F10C + +static u32 get_msglevel(struct net_device *dev) +{ + return netdev2adap(dev)->msg_enable; +} + +static void set_msglevel(struct net_device *dev, u32 val) +{ + netdev2adap(dev)->msg_enable = val; +} + +static const char stats_strings[][ETH_GSTRING_LEN] = { + "tx_octets_ok ", + "tx_frames_ok ", + "tx_broadcast_frames ", + "tx_multicast_frames ", + "tx_unicast_frames ", + "tx_error_frames ", + + "tx_frames_64 ", + "tx_frames_65_to_127 ", + "tx_frames_128_to_255 ", + "tx_frames_256_to_511 ", + "tx_frames_512_to_1023 ", + "tx_frames_1024_to_1518 ", + "tx_frames_1519_to_max ", + + "tx_frames_dropped ", + "tx_pause_frames ", + "tx_ppp0_frames ", + "tx_ppp1_frames ", + "tx_ppp2_frames ", + "tx_ppp3_frames ", + "tx_ppp4_frames ", + "tx_ppp5_frames ", + "tx_ppp6_frames ", + "tx_ppp7_frames ", + + "rx_octets_ok ", + "rx_frames_ok ", + "rx_broadcast_frames ", + "rx_multicast_frames ", + "rx_unicast_frames ", + + "rx_frames_too_long ", + "rx_jabber_errors ", + "rx_fcs_errors ", + "rx_length_errors ", + "rx_symbol_errors ", + "rx_runt_frames ", + + "rx_frames_64 ", + "rx_frames_65_to_127 ", + "rx_frames_128_to_255 ", + "rx_frames_256_to_511 ", + "rx_frames_512_to_1023 ", + "rx_frames_1024_to_1518 ", + "rx_frames_1519_to_max ", + + "rx_pause_frames ", + "rx_ppp0_frames ", + "rx_ppp1_frames ", + "rx_ppp2_frames ", + "rx_ppp3_frames ", + "rx_ppp4_frames ", + "rx_ppp5_frames ", + "rx_ppp6_frames ", + "rx_ppp7_frames ", + + "rx_bg0_frames_dropped ", + "rx_bg1_frames_dropped ", + "rx_bg2_frames_dropped ", + "rx_bg3_frames_dropped ", + "rx_bg0_frames_trunc ", + "rx_bg1_frames_trunc ", + "rx_bg2_frames_trunc ", + "rx_bg3_frames_trunc ", + + "tso ", + "tx_csum_offload ", + "rx_csum_good ", + "vlan_extractions ", + "vlan_insertions ", + "gro_packets ", + "gro_merged ", +}; + +static char adapter_stats_strings[][ETH_GSTRING_LEN] = { + "db_drop ", + "db_full ", + "db_empty ", + "tcp_ipv4_out_rsts ", + "tcp_ipv4_in_segs ", + "tcp_ipv4_out_segs ", + "tcp_ipv4_retrans_segs ", + "tcp_ipv6_out_rsts ", + "tcp_ipv6_in_segs ", + "tcp_ipv6_out_segs ", + "tcp_ipv6_retrans_segs ", + "usm_ddp_frames ", + "usm_ddp_octets ", + "usm_ddp_drops ", + "rdma_no_rqe_mod_defer ", + "rdma_no_rqe_pkt_defer ", + "tp_err_ofld_no_neigh ", + "tp_err_ofld_cong_defer ", + "write_coal_success ", + "write_coal_fail ", +}; + +static char channel_stats_strings[][ETH_GSTRING_LEN] = { + "--------Channel--------- ", + "tp_cpl_requests ", + "tp_cpl_responses ", + "tp_mac_in_errs ", + "tp_hdr_in_errs ", + "tp_tcp_in_errs ", + "tp_tcp6_in_errs ", + "tp_tnl_cong_drops ", + "tp_tnl_tx_drops ", + "tp_ofld_vlan_drops ", + "tp_ofld_chan_drops ", + "fcoe_octets_ddp ", + "fcoe_frames_ddp ", + "fcoe_frames_drop ", +}; + +static char loopback_stats_strings[][ETH_GSTRING_LEN] = { + "-------Loopback----------- ", + "octets_ok ", + "frames_ok ", + "bcast_frames ", + "mcast_frames ", + "ucast_frames ", + "error_frames ", + "frames_64 ", + "frames_65_to_127 ", + "frames_128_to_255 ", + "frames_256_to_511 ", + "frames_512_to_1023 ", + "frames_1024_to_1518 ", + "frames_1519_to_max ", + "frames_dropped ", + "bg0_frames_dropped ", + "bg1_frames_dropped ", + "bg2_frames_dropped ", + "bg3_frames_dropped ", + "bg0_frames_trunc ", + "bg1_frames_trunc ", + "bg2_frames_trunc ", + "bg3_frames_trunc ", +}; + +static int get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return 
ARRAY_SIZE(stats_strings) + + ARRAY_SIZE(adapter_stats_strings) + + ARRAY_SIZE(channel_stats_strings) + + ARRAY_SIZE(loopback_stats_strings); + default: + return -EOPNOTSUPP; + } +} + +static int get_regs_len(struct net_device *dev) +{ + struct adapter *adap = netdev2adap(dev); + + return t4_get_regs_len(adap); +} + +static int get_eeprom_len(struct net_device *dev) +{ + return EEPROMSIZE; +} + +static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + struct adapter *adapter = netdev2adap(dev); + u32 exprom_vers; + + strlcpy(info->driver, cxgb4_driver_name, sizeof(info->driver)); + strlcpy(info->version, cxgb4_driver_version, + sizeof(info->version)); + strlcpy(info->bus_info, pci_name(adapter->pdev), + sizeof(info->bus_info)); + info->regdump_len = get_regs_len(dev); + + if (!adapter->params.fw_vers) + strcpy(info->fw_version, "N/A"); + else + snprintf(info->fw_version, sizeof(info->fw_version), + "%u.%u.%u.%u, TP %u.%u.%u.%u", + FW_HDR_FW_VER_MAJOR_G(adapter->params.fw_vers), + FW_HDR_FW_VER_MINOR_G(adapter->params.fw_vers), + FW_HDR_FW_VER_MICRO_G(adapter->params.fw_vers), + FW_HDR_FW_VER_BUILD_G(adapter->params.fw_vers), + FW_HDR_FW_VER_MAJOR_G(adapter->params.tp_vers), + FW_HDR_FW_VER_MINOR_G(adapter->params.tp_vers), + FW_HDR_FW_VER_MICRO_G(adapter->params.tp_vers), + FW_HDR_FW_VER_BUILD_G(adapter->params.tp_vers)); + + if (!t4_get_exprom_version(adapter, &exprom_vers)) + snprintf(info->erom_version, sizeof(info->erom_version), + "%u.%u.%u.%u", + FW_HDR_FW_VER_MAJOR_G(exprom_vers), + FW_HDR_FW_VER_MINOR_G(exprom_vers), + FW_HDR_FW_VER_MICRO_G(exprom_vers), + FW_HDR_FW_VER_BUILD_G(exprom_vers)); +} + +static void get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + if (stringset == ETH_SS_STATS) { + memcpy(data, stats_strings, sizeof(stats_strings)); + data += sizeof(stats_strings); + memcpy(data, adapter_stats_strings, + sizeof(adapter_stats_strings)); + data += sizeof(adapter_stats_strings); + memcpy(data, channel_stats_strings, + sizeof(channel_stats_strings)); + data += sizeof(channel_stats_strings); + memcpy(data, loopback_stats_strings, + sizeof(loopback_stats_strings)); + } +} + +/* port stats maintained per queue of the port. They should be in the same + * order as in stats_strings above. 
+ */ +struct queue_port_stats { + u64 tso; + u64 tx_csum; + u64 rx_csum; + u64 vlan_ex; + u64 vlan_ins; + u64 gro_pkts; + u64 gro_merged; +}; + +struct adapter_stats { + u64 db_drop; + u64 db_full; + u64 db_empty; + u64 tcp_v4_out_rsts; + u64 tcp_v4_in_segs; + u64 tcp_v4_out_segs; + u64 tcp_v4_retrans_segs; + u64 tcp_v6_out_rsts; + u64 tcp_v6_in_segs; + u64 tcp_v6_out_segs; + u64 tcp_v6_retrans_segs; + u64 frames; + u64 octets; + u64 drops; + u64 rqe_dfr_mod; + u64 rqe_dfr_pkt; + u64 ofld_no_neigh; + u64 ofld_cong_defer; + u64 wc_success; + u64 wc_fail; +}; + +struct channel_stats { + u64 cpl_req; + u64 cpl_rsp; + u64 mac_in_errs; + u64 hdr_in_errs; + u64 tcp_in_errs; + u64 tcp6_in_errs; + u64 tnl_cong_drops; + u64 tnl_tx_drops; + u64 ofld_vlan_drops; + u64 ofld_chan_drops; + u64 octets_ddp; + u64 frames_ddp; + u64 frames_drop; +}; + +static void collect_sge_port_stats(const struct adapter *adap, + const struct port_info *p, + struct queue_port_stats *s) +{ + int i; + const struct sge_eth_txq *tx = &adap->sge.ethtxq[p->first_qset]; + const struct sge_eth_rxq *rx = &adap->sge.ethrxq[p->first_qset]; + + memset(s, 0, sizeof(*s)); + for (i = 0; i < p->nqsets; i++, rx++, tx++) { + s->tso += tx->tso; + s->tx_csum += tx->tx_cso; + s->rx_csum += rx->stats.rx_cso; + s->vlan_ex += rx->stats.vlan_ex; + s->vlan_ins += tx->vlan_ins; + s->gro_pkts += rx->stats.lro_pkts; + s->gro_merged += rx->stats.lro_merged; + } +} + +static void collect_adapter_stats(struct adapter *adap, struct adapter_stats *s) +{ + struct tp_tcp_stats v4, v6; + struct tp_rdma_stats rdma_stats; + struct tp_err_stats err_stats; + struct tp_usm_stats usm_stats; + u64 val1, val2; + + memset(s, 0, sizeof(*s)); + + spin_lock(&adap->stats_lock); + t4_tp_get_tcp_stats(adap, &v4, &v6, false); + t4_tp_get_rdma_stats(adap, &rdma_stats, false); + t4_get_usm_stats(adap, &usm_stats, false); + t4_tp_get_err_stats(adap, &err_stats, false); + spin_unlock(&adap->stats_lock); + + s->db_drop = adap->db_stats.db_drop; + s->db_full = adap->db_stats.db_full; + s->db_empty = adap->db_stats.db_empty; + + s->tcp_v4_out_rsts = v4.tcp_out_rsts; + s->tcp_v4_in_segs = v4.tcp_in_segs; + s->tcp_v4_out_segs = v4.tcp_out_segs; + s->tcp_v4_retrans_segs = v4.tcp_retrans_segs; + s->tcp_v6_out_rsts = v6.tcp_out_rsts; + s->tcp_v6_in_segs = v6.tcp_in_segs; + s->tcp_v6_out_segs = v6.tcp_out_segs; + s->tcp_v6_retrans_segs = v6.tcp_retrans_segs; + + if (is_offload(adap)) { + s->frames = usm_stats.frames; + s->octets = usm_stats.octets; + s->drops = usm_stats.drops; + s->rqe_dfr_mod = rdma_stats.rqe_dfr_mod; + s->rqe_dfr_pkt = rdma_stats.rqe_dfr_pkt; + } + + s->ofld_no_neigh = err_stats.ofld_no_neigh; + s->ofld_cong_defer = err_stats.ofld_cong_defer; + + if (!is_t4(adap->params.chip)) { + int v; + + v = t4_read_reg(adap, SGE_STAT_CFG_A); + if (STATSOURCE_T5_G(v) == 7) { + val2 = t4_read_reg(adap, SGE_STAT_MATCH_A); + val1 = t4_read_reg(adap, SGE_STAT_TOTAL_A); + s->wc_success = val1 - val2; + s->wc_fail = val2; + } + } +} + +static void collect_channel_stats(struct adapter *adap, struct channel_stats *s, + u8 i) +{ + struct tp_cpl_stats cpl_stats; + struct tp_err_stats err_stats; + struct tp_fcoe_stats fcoe_stats; + + memset(s, 0, sizeof(*s)); + + spin_lock(&adap->stats_lock); + t4_tp_get_cpl_stats(adap, &cpl_stats, false); + t4_tp_get_err_stats(adap, &err_stats, false); + t4_get_fcoe_stats(adap, i, &fcoe_stats, false); + spin_unlock(&adap->stats_lock); + + s->cpl_req = cpl_stats.req[i]; + s->cpl_rsp = cpl_stats.rsp[i]; + s->mac_in_errs = err_stats.mac_in_errs[i]; + 
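+ /* The remaining TP error counters are per-channel arrays indexed by
+ * the channel id, while the FCoE totals were already read for this
+ * channel by t4_get_fcoe_stats() above.
+ */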
s->hdr_in_errs = err_stats.hdr_in_errs[i]; + s->tcp_in_errs = err_stats.tcp_in_errs[i]; + s->tcp6_in_errs = err_stats.tcp6_in_errs[i]; + s->tnl_cong_drops = err_stats.tnl_cong_drops[i]; + s->tnl_tx_drops = err_stats.tnl_tx_drops[i]; + s->ofld_vlan_drops = err_stats.ofld_vlan_drops[i]; + s->ofld_chan_drops = err_stats.ofld_chan_drops[i]; + s->octets_ddp = fcoe_stats.octets_ddp; + s->frames_ddp = fcoe_stats.frames_ddp; + s->frames_drop = fcoe_stats.frames_drop; +} + +static void get_stats(struct net_device *dev, struct ethtool_stats *stats, + u64 *data) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct lb_port_stats s; + int i; + u64 *p0; + + t4_get_port_stats_offset(adapter, pi->tx_chan, + (struct port_stats *)data, + &pi->stats_base); + + data += sizeof(struct port_stats) / sizeof(u64); + collect_sge_port_stats(adapter, pi, (struct queue_port_stats *)data); + data += sizeof(struct queue_port_stats) / sizeof(u64); + collect_adapter_stats(adapter, (struct adapter_stats *)data); + data += sizeof(struct adapter_stats) / sizeof(u64); + + *data++ = (u64)pi->port_id; + collect_channel_stats(adapter, (struct channel_stats *)data, + pi->port_id); + data += sizeof(struct channel_stats) / sizeof(u64); + + *data++ = (u64)pi->port_id; + memset(&s, 0, sizeof(s)); + t4_get_lb_stats(adapter, pi->port_id, &s); + + p0 = &s.octets; + for (i = 0; i < ARRAY_SIZE(loopback_stats_strings) - 1; i++) + *data++ = (unsigned long long)*p0++; +} + +static void get_regs(struct net_device *dev, struct ethtool_regs *regs, + void *buf) +{ + struct adapter *adap = netdev2adap(dev); + size_t buf_size; + + buf_size = t4_get_regs_len(adap); + regs->version = mk_adap_vers(adap); + t4_get_regs(adap, buf, buf_size); +} + +static int restart_autoneg(struct net_device *dev) +{ + struct port_info *p = netdev_priv(dev); + + if (!netif_running(dev)) + return -EAGAIN; + if (p->link_cfg.autoneg != AUTONEG_ENABLE) + return -EINVAL; + t4_restart_aneg(p->adapter, p->adapter->pf, p->tx_chan); + return 0; +} + +static int identify_port(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + unsigned int val; + struct adapter *adap = netdev2adap(dev); + + if (state == ETHTOOL_ID_ACTIVE) + val = 0xffff; + else if (state == ETHTOOL_ID_INACTIVE) + val = 0; + else + return -EINVAL; + + return t4_identify_port(adap, adap->pf, netdev2pinfo(dev)->viid, val); +} + +/** + * from_fw_port_mod_type - translate Firmware Port/Module type to Ethtool + * @port_type: Firmware Port Type + * @mod_type: Firmware Module Type + * + * Translate Firmware Port/Module type to Ethtool Port Type. 
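+ * SFP/QSFP cages are refined further by the inserted module type
+ * (optical modules map to PORT_FIBRE, twinax DACs to PORT_DA),
+ * backplane KR types report PORT_NONE, and anything unrecognised
+ * falls back to PORT_OTHER.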
+ */ +static int from_fw_port_mod_type(enum fw_port_type port_type, + enum fw_port_module_type mod_type) +{ + if (port_type == FW_PORT_TYPE_BT_SGMII || + port_type == FW_PORT_TYPE_BT_XFI || + port_type == FW_PORT_TYPE_BT_XAUI) { + return PORT_TP; + } else if (port_type == FW_PORT_TYPE_FIBER_XFI || + port_type == FW_PORT_TYPE_FIBER_XAUI) { + return PORT_FIBRE; + } else if (port_type == FW_PORT_TYPE_SFP || + port_type == FW_PORT_TYPE_QSFP_10G || + port_type == FW_PORT_TYPE_QSA || + port_type == FW_PORT_TYPE_QSFP || + port_type == FW_PORT_TYPE_CR4_QSFP || + port_type == FW_PORT_TYPE_CR_QSFP || + port_type == FW_PORT_TYPE_CR2_QSFP || + port_type == FW_PORT_TYPE_SFP28) { + if (mod_type == FW_PORT_MOD_TYPE_LR || + mod_type == FW_PORT_MOD_TYPE_SR || + mod_type == FW_PORT_MOD_TYPE_ER || + mod_type == FW_PORT_MOD_TYPE_LRM) + return PORT_FIBRE; + else if (mod_type == FW_PORT_MOD_TYPE_TWINAX_PASSIVE || + mod_type == FW_PORT_MOD_TYPE_TWINAX_ACTIVE) + return PORT_DA; + else + return PORT_OTHER; + } else if (port_type == FW_PORT_TYPE_KR4_100G || + port_type == FW_PORT_TYPE_KR_SFP28 || + port_type == FW_PORT_TYPE_KR_XLAUI) { + return PORT_NONE; + } + + return PORT_OTHER; +} + +/** + * speed_to_fw_caps - translate Port Speed to Firmware Port Capabilities + * @speed: speed in Kb/s + * + * Translates a specific Port Speed into a Firmware Port Capabilities + * value. + */ +static unsigned int speed_to_fw_caps(int speed) +{ + if (speed == 100) + return FW_PORT_CAP32_SPEED_100M; + if (speed == 1000) + return FW_PORT_CAP32_SPEED_1G; + if (speed == 10000) + return FW_PORT_CAP32_SPEED_10G; + if (speed == 25000) + return FW_PORT_CAP32_SPEED_25G; + if (speed == 40000) + return FW_PORT_CAP32_SPEED_40G; + if (speed == 50000) + return FW_PORT_CAP32_SPEED_50G; + if (speed == 100000) + return FW_PORT_CAP32_SPEED_100G; + if (speed == 200000) + return FW_PORT_CAP32_SPEED_200G; + if (speed == 400000) + return FW_PORT_CAP32_SPEED_400G; + return 0; +} + +/** + * fw_caps_to_lmm - translate Firmware to ethtool Link Mode Mask + * @port_type: Firmware Port Type + * @fw_caps: Firmware Port Capabilities + * @link_mode_mask: ethtool Link Mode Mask + * + * Translate a Firmware Port Capabilities specification to an ethtool + * Link Mode Mask. 
+ */ +static void fw_caps_to_lmm(enum fw_port_type port_type, + unsigned int fw_caps, + unsigned long *link_mode_mask) +{ + #define SET_LMM(__lmm_name) \ + __set_bit(ETHTOOL_LINK_MODE_ ## __lmm_name ## _BIT, \ + link_mode_mask) + + #define FW_CAPS_TO_LMM(__fw_name, __lmm_name) \ + do { \ + if (fw_caps & FW_PORT_CAP32_ ## __fw_name) \ + SET_LMM(__lmm_name); \ + } while (0) + + switch (port_type) { + case FW_PORT_TYPE_BT_SGMII: + case FW_PORT_TYPE_BT_XFI: + case FW_PORT_TYPE_BT_XAUI: + SET_LMM(TP); + FW_CAPS_TO_LMM(SPEED_100M, 100baseT_Full); + FW_CAPS_TO_LMM(SPEED_1G, 1000baseT_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseT_Full); + break; + + case FW_PORT_TYPE_KX4: + case FW_PORT_TYPE_KX: + SET_LMM(Backplane); + FW_CAPS_TO_LMM(SPEED_1G, 1000baseKX_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseKX4_Full); + break; + + case FW_PORT_TYPE_KR: + SET_LMM(Backplane); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseKR_Full); + break; + + case FW_PORT_TYPE_BP_AP: + SET_LMM(Backplane); + FW_CAPS_TO_LMM(SPEED_1G, 1000baseKX_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseR_FEC); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseKR_Full); + break; + + case FW_PORT_TYPE_BP4_AP: + SET_LMM(Backplane); + FW_CAPS_TO_LMM(SPEED_1G, 1000baseKX_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseR_FEC); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseKR_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseKX4_Full); + break; + + case FW_PORT_TYPE_FIBER_XFI: + case FW_PORT_TYPE_FIBER_XAUI: + case FW_PORT_TYPE_SFP: + case FW_PORT_TYPE_QSFP_10G: + case FW_PORT_TYPE_QSA: + SET_LMM(FIBRE); + FW_CAPS_TO_LMM(SPEED_1G, 1000baseT_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseT_Full); + break; + + case FW_PORT_TYPE_BP40_BA: + case FW_PORT_TYPE_QSFP: + SET_LMM(FIBRE); + FW_CAPS_TO_LMM(SPEED_1G, 1000baseT_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseT_Full); + FW_CAPS_TO_LMM(SPEED_40G, 40000baseSR4_Full); + break; + + case FW_PORT_TYPE_CR_QSFP: + case FW_PORT_TYPE_SFP28: + SET_LMM(FIBRE); + FW_CAPS_TO_LMM(SPEED_1G, 1000baseT_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseT_Full); + FW_CAPS_TO_LMM(SPEED_25G, 25000baseCR_Full); + break; + + case FW_PORT_TYPE_KR_SFP28: + SET_LMM(Backplane); + FW_CAPS_TO_LMM(SPEED_1G, 1000baseT_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseKR_Full); + FW_CAPS_TO_LMM(SPEED_25G, 25000baseKR_Full); + break; + + case FW_PORT_TYPE_KR_XLAUI: + SET_LMM(Backplane); + FW_CAPS_TO_LMM(SPEED_1G, 1000baseKX_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseKR_Full); + FW_CAPS_TO_LMM(SPEED_40G, 40000baseKR4_Full); + break; + + case FW_PORT_TYPE_CR2_QSFP: + SET_LMM(FIBRE); + FW_CAPS_TO_LMM(SPEED_50G, 50000baseSR2_Full); + break; + + case FW_PORT_TYPE_KR4_100G: + case FW_PORT_TYPE_CR4_QSFP: + SET_LMM(FIBRE); + FW_CAPS_TO_LMM(SPEED_1G, 1000baseT_Full); + FW_CAPS_TO_LMM(SPEED_10G, 10000baseSR_Full); + FW_CAPS_TO_LMM(SPEED_40G, 40000baseSR4_Full); + FW_CAPS_TO_LMM(SPEED_25G, 25000baseCR_Full); + FW_CAPS_TO_LMM(SPEED_50G, 50000baseCR2_Full); + FW_CAPS_TO_LMM(SPEED_100G, 100000baseCR4_Full); + break; + + default: + break; + } + + FW_CAPS_TO_LMM(ANEG, Autoneg); + FW_CAPS_TO_LMM(802_3_PAUSE, Pause); + FW_CAPS_TO_LMM(802_3_ASM_DIR, Asym_Pause); + + #undef FW_CAPS_TO_LMM + #undef SET_LMM +} + +/** + * lmm_to_fw_caps - translate ethtool Link Mode Mask to Firmware + * capabilities + * @et_lmm: ethtool Link Mode Mask + * + * Translate ethtool Link Mode Mask into a Firmware Port capabilities + * value. 
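+ * Only the speed link modes are examined here; pause and FEC settings
+ * are configured through their own ethtool operations.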
+ */ +static unsigned int lmm_to_fw_caps(const unsigned long *link_mode_mask) +{ + unsigned int fw_caps = 0; + + #define LMM_TO_FW_CAPS(__lmm_name, __fw_name) \ + do { \ + if (test_bit(ETHTOOL_LINK_MODE_ ## __lmm_name ## _BIT, \ + link_mode_mask)) \ + fw_caps |= FW_PORT_CAP32_ ## __fw_name; \ + } while (0) + + LMM_TO_FW_CAPS(100baseT_Full, SPEED_100M); + LMM_TO_FW_CAPS(1000baseT_Full, SPEED_1G); + LMM_TO_FW_CAPS(10000baseT_Full, SPEED_10G); + LMM_TO_FW_CAPS(40000baseSR4_Full, SPEED_40G); + LMM_TO_FW_CAPS(25000baseCR_Full, SPEED_25G); + LMM_TO_FW_CAPS(50000baseCR2_Full, SPEED_50G); + LMM_TO_FW_CAPS(100000baseCR4_Full, SPEED_100G); + + #undef LMM_TO_FW_CAPS + + return fw_caps; +} + +static int get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *link_ksettings) +{ + struct port_info *pi = netdev_priv(dev); + struct ethtool_link_settings *base = &link_ksettings->base; + + /* For the nonce, the Firmware doesn't send up Port State changes + * when the Virtual Interface attached to the Port is down. So + * if it's down, let's grab any changes. + */ + if (!netif_running(dev)) + (void)t4_update_port_info(pi); + + ethtool_link_ksettings_zero_link_mode(link_ksettings, supported); + ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising); + ethtool_link_ksettings_zero_link_mode(link_ksettings, lp_advertising); + + base->port = from_fw_port_mod_type(pi->port_type, pi->mod_type); + + if (pi->mdio_addr >= 0) { + base->phy_address = pi->mdio_addr; + base->mdio_support = (pi->port_type == FW_PORT_TYPE_BT_SGMII + ? ETH_MDIO_SUPPORTS_C22 + : ETH_MDIO_SUPPORTS_C45); + } else { + base->phy_address = 255; + base->mdio_support = 0; + } + + fw_caps_to_lmm(pi->port_type, pi->link_cfg.pcaps, + link_ksettings->link_modes.supported); + fw_caps_to_lmm(pi->port_type, pi->link_cfg.acaps, + link_ksettings->link_modes.advertising); + fw_caps_to_lmm(pi->port_type, pi->link_cfg.lpacaps, + link_ksettings->link_modes.lp_advertising); + + if (netif_carrier_ok(dev)) { + base->speed = pi->link_cfg.speed; + base->duplex = DUPLEX_FULL; + } else { + base->speed = SPEED_UNKNOWN; + base->duplex = DUPLEX_UNKNOWN; + } + + if (pi->link_cfg.fc & PAUSE_RX) { + if (pi->link_cfg.fc & PAUSE_TX) { + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, + Pause); + } else { + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, + Asym_Pause); + } + } else if (pi->link_cfg.fc & PAUSE_TX) { + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, + Asym_Pause); + } + + base->autoneg = pi->link_cfg.autoneg; + if (pi->link_cfg.pcaps & FW_PORT_CAP32_ANEG) + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, Autoneg); + if (pi->link_cfg.autoneg) + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, Autoneg); + + return 0; +} + +static int set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *link_ksettings) +{ + struct port_info *pi = netdev_priv(dev); + struct link_config *lc = &pi->link_cfg; + const struct ethtool_link_settings *base = &link_ksettings->base; + struct link_config old_lc; + unsigned int fw_caps; + int ret = 0; + + /* only full-duplex supported */ + if (base->duplex != DUPLEX_FULL) + return -EINVAL; + + if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) { + /* PHY offers a single speed. See if that's what's + * being requested. 
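+ * (autoneg must be off and the requested speed must match the port's
+ * fixed capability; any other request is rejected with -EINVAL).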
+ */ + if (base->autoneg == AUTONEG_DISABLE && + (lc->pcaps & speed_to_fw_caps(base->speed))) + return 0; + return -EINVAL; + } + + old_lc = *lc; + if (base->autoneg == AUTONEG_DISABLE) { + fw_caps = speed_to_fw_caps(base->speed); + + if (!(lc->pcaps & fw_caps)) + return -EINVAL; + lc->speed_caps = fw_caps; + lc->acaps = 0; + } else { + fw_caps = + lmm_to_fw_caps(link_ksettings->link_modes.advertising); + if (!(lc->pcaps & fw_caps)) + return -EINVAL; + lc->speed_caps = 0; + lc->acaps = fw_caps | FW_PORT_CAP32_ANEG; + } + lc->autoneg = base->autoneg; + + /* If the firmware rejects the Link Configuration request, back out + * the changes and report the error. + */ + ret = t4_link_l1cfg(pi->adapter, pi->adapter->mbox, pi->tx_chan, lc); + if (ret) + *lc = old_lc; + + return ret; +} + +/* Translate the Firmware FEC value into the ethtool value. */ +static inline unsigned int fwcap_to_eth_fec(unsigned int fw_fec) +{ + unsigned int eth_fec = 0; + + if (fw_fec & FW_PORT_CAP32_FEC_RS) + eth_fec |= ETHTOOL_FEC_RS; + if (fw_fec & FW_PORT_CAP32_FEC_BASER_RS) + eth_fec |= ETHTOOL_FEC_BASER; + + /* if nothing is set, then FEC is off */ + if (!eth_fec) + eth_fec = ETHTOOL_FEC_OFF; + + return eth_fec; +} + +/* Translate Common Code FEC value into ethtool value. */ +static inline unsigned int cc_to_eth_fec(unsigned int cc_fec) +{ + unsigned int eth_fec = 0; + + if (cc_fec & FEC_AUTO) + eth_fec |= ETHTOOL_FEC_AUTO; + if (cc_fec & FEC_RS) + eth_fec |= ETHTOOL_FEC_RS; + if (cc_fec & FEC_BASER_RS) + eth_fec |= ETHTOOL_FEC_BASER; + + /* if nothing is set, then FEC is off */ + if (!eth_fec) + eth_fec = ETHTOOL_FEC_OFF; + + return eth_fec; +} + +/* Translate ethtool FEC value into Common Code value. */ +static inline unsigned int eth_to_cc_fec(unsigned int eth_fec) +{ + unsigned int cc_fec = 0; + + if (eth_fec & ETHTOOL_FEC_OFF) + return cc_fec; + + if (eth_fec & ETHTOOL_FEC_AUTO) + cc_fec |= FEC_AUTO; + if (eth_fec & ETHTOOL_FEC_RS) + cc_fec |= FEC_RS; + if (eth_fec & ETHTOOL_FEC_BASER) + cc_fec |= FEC_BASER_RS; + + return cc_fec; +} + +static int get_fecparam(struct net_device *dev, struct ethtool_fecparam *fec) +{ + const struct port_info *pi = netdev_priv(dev); + const struct link_config *lc = &pi->link_cfg; + + /* Translate the Firmware FEC Support into the ethtool value. We + * always support IEEE 802.3 "automatic" selection of Link FEC type if + * any FEC is supported. + */ + fec->fec = fwcap_to_eth_fec(lc->pcaps); + if (fec->fec != ETHTOOL_FEC_OFF) + fec->fec |= ETHTOOL_FEC_AUTO; + + /* Translate the current internal FEC parameters into the + * ethtool values. + */ + fec->active_fec = cc_to_eth_fec(lc->fec); + + return 0; +} + +static int set_fecparam(struct net_device *dev, struct ethtool_fecparam *fec) +{ + struct port_info *pi = netdev_priv(dev); + struct link_config *lc = &pi->link_cfg; + struct link_config old_lc; + int ret; + + /* Save old Link Configuration in case the L1 Configure below + * fails. + */ + old_lc = *lc; + + /* Try to perform the L1 Configure and return the result of that + * effort. If it fails, revert the attempted change. 
+ */ + lc->requested_fec = eth_to_cc_fec(fec->fec); + ret = t4_link_l1cfg(pi->adapter, pi->adapter->mbox, + pi->tx_chan, lc); + if (ret) + *lc = old_lc; + return ret; +} + +static void get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct port_info *p = netdev_priv(dev); + + epause->autoneg = (p->link_cfg.requested_fc & PAUSE_AUTONEG) != 0; + epause->rx_pause = (p->link_cfg.fc & PAUSE_RX) != 0; + epause->tx_pause = (p->link_cfg.fc & PAUSE_TX) != 0; +} + +static int set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct port_info *p = netdev_priv(dev); + struct link_config *lc = &p->link_cfg; + + if (epause->autoneg == AUTONEG_DISABLE) + lc->requested_fc = 0; + else if (lc->pcaps & FW_PORT_CAP32_ANEG) + lc->requested_fc = PAUSE_AUTONEG; + else + return -EINVAL; + + if (epause->rx_pause) + lc->requested_fc |= PAUSE_RX; + if (epause->tx_pause) + lc->requested_fc |= PAUSE_TX; + if (netif_running(dev)) + return t4_link_l1cfg(p->adapter, p->adapter->mbox, p->tx_chan, + lc); + return 0; +} + +static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +{ + const struct port_info *pi = netdev_priv(dev); + const struct sge *s = &pi->adapter->sge; + + e->rx_max_pending = MAX_RX_BUFFERS; + e->rx_mini_max_pending = MAX_RSPQ_ENTRIES; + e->rx_jumbo_max_pending = 0; + e->tx_max_pending = MAX_TXQ_ENTRIES; + + e->rx_pending = s->ethrxq[pi->first_qset].fl.size - 8; + e->rx_mini_pending = s->ethrxq[pi->first_qset].rspq.size; + e->rx_jumbo_pending = 0; + e->tx_pending = s->ethtxq[pi->first_qset].q.size; +} + +static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e) +{ + int i; + const struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct sge *s = &adapter->sge; + + if (e->rx_pending > MAX_RX_BUFFERS || e->rx_jumbo_pending || + e->tx_pending > MAX_TXQ_ENTRIES || + e->rx_mini_pending > MAX_RSPQ_ENTRIES || + e->rx_mini_pending < MIN_RSPQ_ENTRIES || + e->rx_pending < MIN_FL_ENTRIES || e->tx_pending < MIN_TXQ_ENTRIES) + return -EINVAL; + + if (adapter->flags & FULL_INIT_DONE) + return -EBUSY; + + for (i = 0; i < pi->nqsets; ++i) { + s->ethtxq[pi->first_qset + i].q.size = e->tx_pending; + s->ethrxq[pi->first_qset + i].fl.size = e->rx_pending + 8; + s->ethrxq[pi->first_qset + i].rspq.size = e->rx_mini_pending; + } + return 0; +} + +/** + * set_rx_intr_params - set a net devices's RX interrupt holdoff paramete! + * @dev: the network device + * @us: the hold-off time in us, or 0 to disable timer + * @cnt: the hold-off packet count, or 0 to disable counter + * + * Set the RX interrupt hold-off parameters for a network device. 
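+ * The values are applied to every RX queue of the port via
+ * cxgb4_set_rspq_intr_params().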
+ */ +static int set_rx_intr_params(struct net_device *dev, + unsigned int us, unsigned int cnt) +{ + int i, err; + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct sge_eth_rxq *q = &adap->sge.ethrxq[pi->first_qset]; + + for (i = 0; i < pi->nqsets; i++, q++) { + err = cxgb4_set_rspq_intr_params(&q->rspq, us, cnt); + if (err) + return err; + } + return 0; +} + +static int set_adaptive_rx_setting(struct net_device *dev, int adaptive_rx) +{ + int i; + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct sge_eth_rxq *q = &adap->sge.ethrxq[pi->first_qset]; + + for (i = 0; i < pi->nqsets; i++, q++) + q->rspq.adaptive_rx = adaptive_rx; + + return 0; +} + +static int get_adaptive_rx_setting(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct sge_eth_rxq *q = &adap->sge.ethrxq[pi->first_qset]; + + return q->rspq.adaptive_rx; +} + +static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c) +{ + set_adaptive_rx_setting(dev, c->use_adaptive_rx_coalesce); + return set_rx_intr_params(dev, c->rx_coalesce_usecs, + c->rx_max_coalesced_frames); +} + +static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c) +{ + const struct port_info *pi = netdev_priv(dev); + const struct adapter *adap = pi->adapter; + const struct sge_rspq *rq = &adap->sge.ethrxq[pi->first_qset].rspq; + + c->rx_coalesce_usecs = qtimer_val(adap, rq); + c->rx_max_coalesced_frames = (rq->intr_params & QINTR_CNT_EN_F) ? + adap->sge.counter_val[rq->pktcnt_idx] : 0; + c->use_adaptive_rx_coalesce = get_adaptive_rx_setting(dev); + return 0; +} + +/* The next two routines implement eeprom read/write from physical addresses. + */ +static int eeprom_rd_phys(struct adapter *adap, unsigned int phys_addr, u32 *v) +{ + int vaddr = t4_eeprom_ptov(phys_addr, adap->pf, EEPROMPFSIZE); + + if (vaddr >= 0) + vaddr = pci_read_vpd(adap->pdev, vaddr, sizeof(u32), v); + return vaddr < 0 ? vaddr : 0; +} + +static int eeprom_wr_phys(struct adapter *adap, unsigned int phys_addr, u32 v) +{ + int vaddr = t4_eeprom_ptov(phys_addr, adap->pf, EEPROMPFSIZE); + + if (vaddr >= 0) + vaddr = pci_write_vpd(adap->pdev, vaddr, sizeof(u32), &v); + return vaddr < 0 ? vaddr : 0; +} + +#define EEPROM_MAGIC 0x38E2F10C + +static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e, + u8 *data) +{ + int i, err = 0; + struct adapter *adapter = netdev2adap(dev); + u8 *buf = kvzalloc(EEPROMSIZE, GFP_KERNEL); + + if (!buf) + return -ENOMEM; + + e->magic = EEPROM_MAGIC; + for (i = e->offset & ~3; !err && i < e->offset + e->len; i += 4) + err = eeprom_rd_phys(adapter, i, (u32 *)&buf[i]); + + if (!err) + memcpy(data, buf + e->offset, e->len); + kvfree(buf); + return err; +} + +static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, + u8 *data) +{ + u8 *buf; + int err = 0; + u32 aligned_offset, aligned_len, *p; + struct adapter *adapter = netdev2adap(dev); + + if (eeprom->magic != EEPROM_MAGIC) + return -EINVAL; + + aligned_offset = eeprom->offset & ~3; + aligned_len = (eeprom->len + (eeprom->offset & 3) + 3) & ~3; + + if (adapter->pf > 0) { + u32 start = 1024 + adapter->pf * EEPROMPFSIZE; + + if (aligned_offset < start || + aligned_offset + aligned_len > start + EEPROMPFSIZE) + return -EPERM; + } + + if (aligned_offset != eeprom->offset || aligned_len != eeprom->len) { + /* RMW possibly needed for first or last words. 
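+ * (the enclosing aligned words are read first, the caller's bytes are
+ * merged in, and the full aligned range is then written back below).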
+ */ + buf = kvzalloc(aligned_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + err = eeprom_rd_phys(adapter, aligned_offset, (u32 *)buf); + if (!err && aligned_len > 4) + err = eeprom_rd_phys(adapter, + aligned_offset + aligned_len - 4, + (u32 *)&buf[aligned_len - 4]); + if (err) + goto out; + memcpy(buf + (eeprom->offset & 3), data, eeprom->len); + } else { + buf = data; + } + + err = t4_seeprom_wp(adapter, false); + if (err) + goto out; + + for (p = (u32 *)buf; !err && aligned_len; aligned_len -= 4, p++) { + err = eeprom_wr_phys(adapter, aligned_offset, *p); + aligned_offset += 4; + } + + if (!err) + err = t4_seeprom_wp(adapter, true); +out: + if (buf != data) + kvfree(buf); + return err; +} + +static int set_flash(struct net_device *netdev, struct ethtool_flash *ef) +{ + int ret; + const struct firmware *fw; + struct adapter *adap = netdev2adap(netdev); + unsigned int mbox = PCIE_FW_MASTER_M + 1; + u32 pcie_fw; + unsigned int master; + u8 master_vld = 0; + + pcie_fw = t4_read_reg(adap, PCIE_FW_A); + master = PCIE_FW_MASTER_G(pcie_fw); + if (pcie_fw & PCIE_FW_MASTER_VLD_F) + master_vld = 1; + /* if csiostor is the master return */ + if (master_vld && (master != adap->pf)) { + dev_warn(adap->pdev_dev, + "cxgb4 driver needs to be loaded as MASTER to support FW flash\n"); + return -EOPNOTSUPP; + } + + ef->data[sizeof(ef->data) - 1] = '\0'; + ret = request_firmware(&fw, ef->data, adap->pdev_dev); + if (ret < 0) + return ret; + + /* If the adapter has been fully initialized then we'll go ahead and + * try to get the firmware's cooperation in upgrading to the new + * firmware image otherwise we'll try to do the entire job from the + * host ... and we always "force" the operation in this path. + */ + if (adap->flags & FULL_INIT_DONE) + mbox = adap->mbox; + + ret = t4_fw_upgrade(adap, mbox, fw->data, fw->size, 1); + release_firmware(fw); + if (!ret) + dev_info(adap->pdev_dev, + "loaded firmware %s, reload cxgb4 driver\n", ef->data); + return ret; +} + +static int get_ts_info(struct net_device *dev, struct ethtool_ts_info *ts_info) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + ts_info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + + ts_info->so_timestamping |= SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + ts_info->tx_types = (1 << HWTSTAMP_TX_OFF) | + (1 << HWTSTAMP_TX_ON); + + ts_info->rx_filters = (1 << HWTSTAMP_FILTER_NONE) | + (1 << HWTSTAMP_FILTER_PTP_V2_L4_EVENT) | + (1 << HWTSTAMP_FILTER_PTP_V1_L4_SYNC) | + (1 << HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) | + (1 << HWTSTAMP_FILTER_PTP_V2_L4_SYNC) | + (1 << HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ); + + if (adapter->ptp_clock) + ts_info->phc_index = ptp_clock_index(adapter->ptp_clock); + else + ts_info->phc_index = -1; + + return 0; +} + +static u32 get_rss_table_size(struct net_device *dev) +{ + const struct port_info *pi = netdev_priv(dev); + + return pi->rss_size; +} + +static int get_rss_table(struct net_device *dev, u32 *p, u8 *key, u8 *hfunc) +{ + const struct port_info *pi = netdev_priv(dev); + unsigned int n = pi->rss_size; + + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + if (!p) + return 0; + while (n--) + p[n] = pi->rss[n]; + return 0; +} + +static int set_rss_table(struct net_device *dev, const u32 *p, const u8 *key, + const u8 hfunc) +{ + unsigned int i; + struct port_info *pi = netdev_priv(dev); + + /* We require at least one supported parameter to be changed and no + * change 
in any of the unsupported parameters + */ + if (key || + (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)) + return -EOPNOTSUPP; + if (!p) + return 0; + + /* Interface must be brought up atleast once */ + if (pi->adapter->flags & FULL_INIT_DONE) { + for (i = 0; i < pi->rss_size; i++) + pi->rss[i] = p[i]; + + return cxgb4_write_rss(pi, pi->rss); + } + + return -EPERM; +} + +static int get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, + u32 *rules) +{ + const struct port_info *pi = netdev_priv(dev); + + switch (info->cmd) { + case ETHTOOL_GRXFH: { + unsigned int v = pi->rss_mode; + + info->data = 0; + switch (info->flow_type) { + case TCP_V4_FLOW: + if (v & FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_F) + info->data = RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + else if (v & FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_F) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + case UDP_V4_FLOW: + if ((v & FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_F) && + (v & FW_RSS_VI_CONFIG_CMD_UDPEN_F)) + info->data = RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + else if (v & FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_F) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + case IPV4_FLOW: + if (v & FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_F) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + case TCP_V6_FLOW: + if (v & FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_F) + info->data = RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + else if (v & FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_F) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + case UDP_V6_FLOW: + if ((v & FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_F) && + (v & FW_RSS_VI_CONFIG_CMD_UDPEN_F)) + info->data = RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3; + else if (v & FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_F) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case IPV6_FLOW: + if (v & FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_F) + info->data = RXH_IP_SRC | RXH_IP_DST; + break; + } + return 0; + } + case ETHTOOL_GRXRINGS: + info->data = pi->nqsets; + return 0; + } + return -EOPNOTSUPP; +} + +static int set_dump(struct net_device *dev, struct ethtool_dump *eth_dump) +{ + struct adapter *adapter = netdev2adap(dev); + u32 len = 0; + + len = sizeof(struct cudbg_hdr) + + sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY; + len += cxgb4_get_dump_length(adapter, eth_dump->flag); + + adapter->eth_dump.flag = eth_dump->flag; + adapter->eth_dump.len = len; + return 0; +} + +static int get_dump_flag(struct net_device *dev, struct ethtool_dump *eth_dump) +{ + struct adapter *adapter = netdev2adap(dev); + + eth_dump->flag = adapter->eth_dump.flag; + eth_dump->len = adapter->eth_dump.len; + eth_dump->version = adapter->eth_dump.version; + return 0; +} + +static int get_dump_data(struct net_device *dev, struct ethtool_dump *eth_dump, + void *buf) +{ + struct adapter *adapter = netdev2adap(dev); + u32 len = 0; + int ret = 0; + + if (adapter->eth_dump.flag == CXGB4_ETH_DUMP_NONE) + return -ENOENT; + + len = sizeof(struct cudbg_hdr) + + sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY; + len += cxgb4_get_dump_length(adapter, adapter->eth_dump.flag); + if (eth_dump->len < len) + return -ENOMEM; + + ret = cxgb4_cudbg_collect(adapter, buf, &len, adapter->eth_dump.flag); + if (ret) + return ret; + + eth_dump->flag = adapter->eth_dump.flag; + eth_dump->len = len; + eth_dump->version = adapter->eth_dump.version; + return 0; +} + +static int cxgb4_get_module_info(struct net_device *dev, + struct 
ethtool_modinfo *modinfo) +{ + struct port_info *pi = netdev_priv(dev); + u8 sff8472_comp, sff_diag_type, sff_rev; + struct adapter *adapter = pi->adapter; + int ret; + + if (!t4_is_inserted_mod_type(pi->mod_type)) + return -EINVAL; + + switch (pi->port_type) { + case FW_PORT_TYPE_SFP: + case FW_PORT_TYPE_QSA: + case FW_PORT_TYPE_SFP28: + ret = t4_i2c_rd(adapter, adapter->mbox, pi->tx_chan, + I2C_DEV_ADDR_A0, SFF_8472_COMP_ADDR, + SFF_8472_COMP_LEN, &sff8472_comp); + if (ret) + return ret; + ret = t4_i2c_rd(adapter, adapter->mbox, pi->tx_chan, + I2C_DEV_ADDR_A0, SFP_DIAG_TYPE_ADDR, + SFP_DIAG_TYPE_LEN, &sff_diag_type); + if (ret) + return ret; + + if (!sff8472_comp || (sff_diag_type & 4)) { + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + } + break; + + case FW_PORT_TYPE_QSFP: + case FW_PORT_TYPE_QSFP_10G: + case FW_PORT_TYPE_CR_QSFP: + case FW_PORT_TYPE_CR2_QSFP: + case FW_PORT_TYPE_CR4_QSFP: + ret = t4_i2c_rd(adapter, adapter->mbox, pi->tx_chan, + I2C_DEV_ADDR_A0, SFF_REV_ADDR, + SFF_REV_LEN, &sff_rev); + /* For QSFP type ports, revision value >= 3 + * means the SFP is 8636 compliant. + */ + if (ret) + return ret; + if (sff_rev >= 0x3) { + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; + } + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int cxgb4_get_module_eeprom(struct net_device *dev, + struct ethtool_eeprom *eprom, u8 *data) +{ + int ret = 0, offset = eprom->offset, len = eprom->len; + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + memset(data, 0, eprom->len); + if (offset + len <= I2C_PAGE_SIZE) + return t4_i2c_rd(adapter, adapter->mbox, pi->tx_chan, + I2C_DEV_ADDR_A0, offset, len, data); + + /* offset + len spans 0xa0 and 0xa1 pages */ + if (offset <= I2C_PAGE_SIZE) { + /* read 0xa0 page */ + len = I2C_PAGE_SIZE - offset; + ret = t4_i2c_rd(adapter, adapter->mbox, pi->tx_chan, + I2C_DEV_ADDR_A0, offset, len, data); + if (ret) + return ret; + offset = I2C_PAGE_SIZE; + /* Remaining bytes to be read from second page = + * Total length - bytes read from first page + */ + len = eprom->len - len; + } + /* Read additional optical diagnostics from page 0xa2 if supported */ + return t4_i2c_rd(adapter, adapter->mbox, pi->tx_chan, I2C_DEV_ADDR_A2, + offset, len, &data[eprom->len - len]); +} + +static const struct ethtool_ops cxgb_ethtool_ops = { + .get_link_ksettings = get_link_ksettings, + .set_link_ksettings = set_link_ksettings, + .get_fecparam = get_fecparam, + .set_fecparam = set_fecparam, + .get_drvinfo = get_drvinfo, + .get_msglevel = get_msglevel, + .set_msglevel = set_msglevel, + .get_ringparam = get_sge_param, + .set_ringparam = set_sge_param, + .get_coalesce = get_coalesce, + .set_coalesce = set_coalesce, + .get_eeprom_len = get_eeprom_len, + .get_eeprom = get_eeprom, + .set_eeprom = set_eeprom, + .get_pauseparam = get_pauseparam, + .set_pauseparam = set_pauseparam, + .get_link = ethtool_op_get_link, + .get_strings = get_strings, + .set_phys_id = identify_port, + .nway_reset = restart_autoneg, + .get_sset_count = get_sset_count, + .get_ethtool_stats = get_stats, + .get_regs_len = get_regs_len, + .get_regs = get_regs, + .get_rxnfc = get_rxnfc, + .get_rxfh_indir_size = get_rss_table_size, + .get_rxfh = get_rss_table, + .set_rxfh = set_rss_table, + 
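+ /* The remaining callbacks cover firmware flashing, timestamping
+ * capability reporting, cudbg dump collection and SFP/QSFP module
+ * EEPROM access.
+ */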
.flash_device = set_flash, + .get_ts_info = get_ts_info, + .set_dump = set_dump, + .get_dump_flag = get_dump_flag, + .get_dump_data = get_dump_data, + .get_module_info = cxgb4_get_module_info, + .get_module_eeprom = cxgb4_get_module_eeprom, +}; + +void cxgb4_set_ethtool_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &cxgb_ethtool_ops; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_fcoe.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_fcoe.c new file mode 100644 index 0000000..6c8a62e --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_fcoe.c @@ -0,0 +1,122 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2015 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef CONFIG_CHELSIO_T4_FCOE + +#include +#include +#include "cxgb4.h" + +bool cxgb_fcoe_sof_eof_supported(struct adapter *adap, struct sk_buff *skb) +{ + struct fcoe_hdr *fcoeh = (struct fcoe_hdr *)skb_network_header(skb); + u8 sof = fcoeh->fcoe_sof; + u8 eof = 0; + + if ((sof != FC_SOF_I3) && (sof != FC_SOF_N3)) { + dev_err(adap->pdev_dev, "Unsupported SOF 0x%x\n", sof); + return false; + } + + skb_copy_bits(skb, skb->len - 4, &eof, 1); + + if ((eof != FC_EOF_N) && (eof != FC_EOF_T)) { + dev_err(adap->pdev_dev, "Unsupported EOF 0x%x\n", eof); + return false; + } + + return true; +} + +/** + * cxgb_fcoe_enable - enable FCoE offload features + * @netdev: net device + * + * Returns 0 on success or -EINVAL on failure. + */ +int cxgb_fcoe_enable(struct net_device *netdev) +{ + struct port_info *pi = netdev_priv(netdev); + struct adapter *adap = pi->adapter; + struct cxgb_fcoe *fcoe = &pi->fcoe; + + if (is_t4(adap->params.chip)) + return -EINVAL; + + if (!(adap->flags & FULL_INIT_DONE)) + return -EINVAL; + + dev_info(adap->pdev_dev, "Enabling FCoE offload features\n"); + + netdev->features |= NETIF_F_FCOE_CRC; + netdev->vlan_features |= NETIF_F_FCOE_CRC; + netdev->features |= NETIF_F_FCOE_MTU; + netdev->vlan_features |= NETIF_F_FCOE_MTU; + + netdev_features_change(netdev); + + fcoe->flags |= CXGB_FCOE_ENABLED; + + return 0; +} + +/** + * cxgb_fcoe_disable - disable FCoE offload + * @netdev: net device + * + * Returns 0 on success or -EINVAL on failure. 
+ */ +int cxgb_fcoe_disable(struct net_device *netdev) +{ + struct port_info *pi = netdev_priv(netdev); + struct adapter *adap = pi->adapter; + struct cxgb_fcoe *fcoe = &pi->fcoe; + + if (!(fcoe->flags & CXGB_FCOE_ENABLED)) + return -EINVAL; + + dev_info(adap->pdev_dev, "Disabling FCoE offload features\n"); + + fcoe->flags &= ~CXGB_FCOE_ENABLED; + + netdev->features &= ~NETIF_F_FCOE_CRC; + netdev->vlan_features &= ~NETIF_F_FCOE_CRC; + netdev->features &= ~NETIF_F_FCOE_MTU; + netdev->vlan_features &= ~NETIF_F_FCOE_MTU; + + netdev_features_change(netdev); + + return 0; +} +#endif /* CONFIG_CHELSIO_T4_FCOE */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_fcoe.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_fcoe.h new file mode 100644 index 0000000..bf9258a --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_fcoe.h @@ -0,0 +1,57 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2015 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_FCOE_H__ +#define __CXGB4_FCOE_H__ + +#ifdef CONFIG_CHELSIO_T4_FCOE + +#define CXGB_FCOE_TXPKT_CSUM_START 28 +#define CXGB_FCOE_TXPKT_CSUM_END 8 + +/* fcoe flags */ +enum { + CXGB_FCOE_ENABLED = (1 << 0), +}; + +struct cxgb_fcoe { + u8 flags; +}; + +int cxgb_fcoe_enable(struct net_device *); +int cxgb_fcoe_disable(struct net_device *); +bool cxgb_fcoe_sof_eof_supported(struct adapter *, struct sk_buff *); + +#endif /* CONFIG_CHELSIO_T4_FCOE */ +#endif /* __CXGB4_FCOE_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c new file mode 100644 index 0000000..b76447b --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -0,0 +1,1757 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include "cxgb4.h" +#include "t4_regs.h" +#include "t4_tcb.h" +#include "t4_values.h" +#include "clip_tbl.h" +#include "l2t.h" +#include "smt.h" +#include "t4fw_api.h" +#include "cxgb4_filter.h" + +static inline bool is_field_set(u32 val, u32 mask) +{ + return val || mask; +} + +static inline bool unsupported(u32 conf, u32 conf_mask, u32 val, u32 mask) +{ + return !(conf & conf_mask) && is_field_set(val, mask); +} + +static int set_tcb_field(struct adapter *adap, struct filter_entry *f, + unsigned int ftid, u16 word, u64 mask, u64 val, + int no_reply) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb; + + skb = alloc_skb(sizeof(struct cpl_set_tcb_field), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + req = (struct cpl_set_tcb_field *)__skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + INIT_TP_WR_CPL(req, CPL_SET_TCB_FIELD, ftid); + req->reply_ctrl = htons(REPLY_CHAN_V(0) | + QUEUENO_V(adap->sge.fw_evtq.abs_id) | + NO_REPLY_V(no_reply)); + req->word_cookie = htons(TCB_WORD_V(word) | TCB_COOKIE_V(ftid)); + req->mask = cpu_to_be64(mask); + req->val = cpu_to_be64(val); + set_wr_txq(skb, CPL_PRIORITY_CONTROL, f->fs.val.iport & 0x3); + t4_ofld_send(adap, skb); + return 0; +} + +/* Set one of the t_flags bits in the TCB. 
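+ *
+ * The helper below only shifts the single-bit mask and value into place
+ * before handing them to set_tcb_field(): e.g. bit_pos = TF_CCTRL_CWR_S
+ * with val = 1 becomes mask = 1ULL << TF_CCTRL_CWR_S and
+ * val = 1ULL << TF_CCTRL_CWR_S in TCB word TCB_T_FLAGS_W.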
+ */ +static int set_tcb_tflag(struct adapter *adap, struct filter_entry *f, + unsigned int ftid, unsigned int bit_pos, + unsigned int val, int no_reply) +{ + return set_tcb_field(adap, f, ftid, TCB_T_FLAGS_W, 1ULL << bit_pos, + (unsigned long long)val << bit_pos, no_reply); +} + +static void mk_abort_req_ulp(struct cpl_abort_req *abort_req, unsigned int tid) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)abort_req; + struct ulptx_idata *sc = (struct ulptx_idata *)(txpkt + 1); + + txpkt->cmd_dest = htonl(ULPTX_CMD_V(ULP_TX_PKT) | ULP_TXPKT_DEST_V(0)); + txpkt->len = htonl(DIV_ROUND_UP(sizeof(*abort_req), 16)); + sc->cmd_more = htonl(ULPTX_CMD_V(ULP_TX_SC_IMM)); + sc->len = htonl(sizeof(*abort_req) - sizeof(struct work_request_hdr)); + OPCODE_TID(abort_req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); + abort_req->rsvd0 = htonl(0); + abort_req->rsvd1 = 0; + abort_req->cmd = CPL_ABORT_NO_RST; +} + +static void mk_abort_rpl_ulp(struct cpl_abort_rpl *abort_rpl, unsigned int tid) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)abort_rpl; + struct ulptx_idata *sc = (struct ulptx_idata *)(txpkt + 1); + + txpkt->cmd_dest = htonl(ULPTX_CMD_V(ULP_TX_PKT) | ULP_TXPKT_DEST_V(0)); + txpkt->len = htonl(DIV_ROUND_UP(sizeof(*abort_rpl), 16)); + sc->cmd_more = htonl(ULPTX_CMD_V(ULP_TX_SC_IMM)); + sc->len = htonl(sizeof(*abort_rpl) - sizeof(struct work_request_hdr)); + OPCODE_TID(abort_rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); + abort_rpl->rsvd0 = htonl(0); + abort_rpl->rsvd1 = 0; + abort_rpl->cmd = CPL_ABORT_NO_RST; +} + +static void mk_set_tcb_ulp(struct filter_entry *f, + struct cpl_set_tcb_field *req, + unsigned int word, u64 mask, u64 val, + u8 cookie, int no_reply) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; + struct ulptx_idata *sc = (struct ulptx_idata *)(txpkt + 1); + + txpkt->cmd_dest = htonl(ULPTX_CMD_V(ULP_TX_PKT) | ULP_TXPKT_DEST_V(0)); + txpkt->len = htonl(DIV_ROUND_UP(sizeof(*req), 16)); + sc->cmd_more = htonl(ULPTX_CMD_V(ULP_TX_SC_IMM)); + sc->len = htonl(sizeof(*req) - sizeof(struct work_request_hdr)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, f->tid)); + req->reply_ctrl = htons(NO_REPLY_V(no_reply) | REPLY_CHAN_V(0) | + QUEUENO_V(0)); + req->word_cookie = htons(TCB_WORD_V(word) | TCB_COOKIE_V(cookie)); + req->mask = cpu_to_be64(mask); + req->val = cpu_to_be64(val); + sc = (struct ulptx_idata *)(req + 1); + sc->cmd_more = htonl(ULPTX_CMD_V(ULP_TX_SC_NOOP)); + sc->len = htonl(0); +} + +static int configure_filter_smac(struct adapter *adap, struct filter_entry *f) +{ + int err; + + /* do a set-tcb for smac-sel and CWR bit.. 
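+ *
+ * Two TCB updates, in order: first raise the TF_CCTRL_CWR_S t_flag, then
+ * rewrite TCB_SMAC_SEL_W so the TCB points at the SMT index allocated for
+ * this filter (f->smt->idx). Both are sent without a reply (no_reply = 1);
+ * any error is reported once via the smac_err path below.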
*/ + err = set_tcb_tflag(adap, f, f->tid, TF_CCTRL_CWR_S, 1, 1); + if (err) + goto smac_err; + + err = set_tcb_field(adap, f, f->tid, TCB_SMAC_SEL_W, + TCB_SMAC_SEL_V(TCB_SMAC_SEL_M), + TCB_SMAC_SEL_V(f->smt->idx), 1); + if (!err) + return 0; + +smac_err: + dev_err(adap->pdev_dev, "filter %u smac config failed with error %u\n", + f->tid, err); + return err; +} + +static void set_nat_params(struct adapter *adap, struct filter_entry *f, + unsigned int tid, bool dip, bool sip, bool dp, + bool sp) +{ + if (dip) { + if (f->fs.type) { + set_tcb_field(adap, f, tid, TCB_SND_UNA_RAW_W, + WORD_MASK, f->fs.nat_lip[15] | + f->fs.nat_lip[14] << 8 | + f->fs.nat_lip[13] << 16 | + f->fs.nat_lip[12] << 24, 1); + + set_tcb_field(adap, f, tid, TCB_SND_UNA_RAW_W + 1, + WORD_MASK, f->fs.nat_lip[11] | + f->fs.nat_lip[10] << 8 | + f->fs.nat_lip[9] << 16 | + f->fs.nat_lip[8] << 24, 1); + + set_tcb_field(adap, f, tid, TCB_SND_UNA_RAW_W + 2, + WORD_MASK, f->fs.nat_lip[7] | + f->fs.nat_lip[6] << 8 | + f->fs.nat_lip[5] << 16 | + f->fs.nat_lip[4] << 24, 1); + + set_tcb_field(adap, f, tid, TCB_SND_UNA_RAW_W + 3, + WORD_MASK, f->fs.nat_lip[3] | + f->fs.nat_lip[2] << 8 | + f->fs.nat_lip[1] << 16 | + f->fs.nat_lip[0] << 24, 1); + } else { + set_tcb_field(adap, f, tid, TCB_RX_FRAG3_LEN_RAW_W, + WORD_MASK, f->fs.nat_lip[3] | + f->fs.nat_lip[2] << 8 | + f->fs.nat_lip[1] << 16 | + f->fs.nat_lip[0] << 24, 1); + } + } + + if (sip) { + if (f->fs.type) { + set_tcb_field(adap, f, tid, TCB_RX_FRAG2_PTR_RAW_W, + WORD_MASK, f->fs.nat_fip[15] | + f->fs.nat_fip[14] << 8 | + f->fs.nat_fip[13] << 16 | + f->fs.nat_fip[12] << 24, 1); + + set_tcb_field(adap, f, tid, TCB_RX_FRAG2_PTR_RAW_W + 1, + WORD_MASK, f->fs.nat_fip[11] | + f->fs.nat_fip[10] << 8 | + f->fs.nat_fip[9] << 16 | + f->fs.nat_fip[8] << 24, 1); + + set_tcb_field(adap, f, tid, TCB_RX_FRAG2_PTR_RAW_W + 2, + WORD_MASK, f->fs.nat_fip[7] | + f->fs.nat_fip[6] << 8 | + f->fs.nat_fip[5] << 16 | + f->fs.nat_fip[4] << 24, 1); + + set_tcb_field(adap, f, tid, TCB_RX_FRAG2_PTR_RAW_W + 3, + WORD_MASK, f->fs.nat_fip[3] | + f->fs.nat_fip[2] << 8 | + f->fs.nat_fip[1] << 16 | + f->fs.nat_fip[0] << 24, 1); + + } else { + set_tcb_field(adap, f, tid, + TCB_RX_FRAG3_START_IDX_OFFSET_RAW_W, + WORD_MASK, f->fs.nat_fip[3] | + f->fs.nat_fip[2] << 8 | + f->fs.nat_fip[1] << 16 | + f->fs.nat_fip[0] << 24, 1); + } + } + + set_tcb_field(adap, f, tid, TCB_PDU_HDR_LEN_W, WORD_MASK, + (dp ? f->fs.nat_lport : 0) | + (sp ? f->fs.nat_fport << 16 : 0), 1); +} + +/* Validate filter spec against configuration done on the card. */ +static int validate_filter(struct net_device *dev, + struct ch_filter_specification *fs) +{ + struct adapter *adapter = netdev2adap(dev); + u32 fconf, iconf; + + /* Check for unconfigured fields being used. 
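+ *
+ * For example, if PORT_F is not set in the compressed filter configuration
+ * (fconf, taken from vlan_pri_map) but the request carries a non-zero iport
+ * value or mask, unsupported() is true for that pair and the whole request
+ * is rejected with -EOPNOTSUPP. The same pattern applies to every field
+ * checked below.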
*/ + fconf = adapter->params.tp.vlan_pri_map; + iconf = adapter->params.tp.ingress_config; + + if (unsupported(fconf, FCOE_F, fs->val.fcoe, fs->mask.fcoe) || + unsupported(fconf, PORT_F, fs->val.iport, fs->mask.iport) || + unsupported(fconf, TOS_F, fs->val.tos, fs->mask.tos) || + unsupported(fconf, ETHERTYPE_F, fs->val.ethtype, + fs->mask.ethtype) || + unsupported(fconf, MACMATCH_F, fs->val.macidx, fs->mask.macidx) || + unsupported(fconf, MPSHITTYPE_F, fs->val.matchtype, + fs->mask.matchtype) || + unsupported(fconf, FRAGMENTATION_F, fs->val.frag, fs->mask.frag) || + unsupported(fconf, PROTOCOL_F, fs->val.proto, fs->mask.proto) || + unsupported(fconf, VNIC_ID_F, fs->val.pfvf_vld, + fs->mask.pfvf_vld) || + unsupported(fconf, VNIC_ID_F, fs->val.ovlan_vld, + fs->mask.ovlan_vld) || + unsupported(fconf, VLAN_F, fs->val.ivlan_vld, fs->mask.ivlan_vld)) + return -EOPNOTSUPP; + + /* T4 inconveniently uses the same FT_VNIC_ID_W bits for both the Outer + * VLAN Tag and PF/VF/VFvld fields based on VNIC_F being set + * in TP_INGRESS_CONFIG. Hense the somewhat crazy checks + * below. Additionally, since the T4 firmware interface also + * carries that overlap, we need to translate any PF/VF + * specification into that internal format below. + */ + if (is_field_set(fs->val.pfvf_vld, fs->mask.pfvf_vld) && + is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld)) + return -EOPNOTSUPP; + if (unsupported(iconf, VNIC_F, fs->val.pfvf_vld, fs->mask.pfvf_vld) || + (is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld) && + (iconf & VNIC_F))) + return -EOPNOTSUPP; + if (fs->val.pf > 0x7 || fs->val.vf > 0x7f) + return -ERANGE; + fs->mask.pf &= 0x7; + fs->mask.vf &= 0x7f; + + /* If the user is requesting that the filter action loop + * matching packets back out one of our ports, make sure that + * the egress port is in range. + */ + if (fs->action == FILTER_SWITCH && + fs->eport >= adapter->params.nports) + return -ERANGE; + + /* Don't allow various trivially obvious bogus out-of-range values... */ + if (fs->val.iport >= adapter->params.nports) + return -ERANGE; + + /* T4 doesn't support removing VLAN Tags for loop back filters. */ + if (is_t4(adapter->params.chip) && + fs->action == FILTER_SWITCH && + (fs->newvlan == VLAN_REMOVE || + fs->newvlan == VLAN_REWRITE)) + return -EOPNOTSUPP; + + return 0; +} + +static int get_filter_steerq(struct net_device *dev, + struct ch_filter_specification *fs) +{ + struct adapter *adapter = netdev2adap(dev); + int iq; + + /* If the user has requested steering matching Ingress Packets + * to a specific Queue Set, we need to make sure it's in range + * for the port and map that into the Absolute Queue ID of the + * Queue Set's Response Queue. + */ + if (!fs->dirsteer) { + if (fs->iq) + return -EINVAL; + iq = 0; + } else { + struct port_info *pi = netdev_priv(dev); + + /* If the iq id is greater than the number of qsets, + * then assume it is an absolute qid. 
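+ *
+ * Example: with dirsteer set, fs->iq = 2 and nqsets = 8 we return the
+ * absolute QID of ethrxq[pi->first_qset + 2]; with fs->iq = 100 the value
+ * is passed through unchanged as an absolute QID.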
+ */ + if (fs->iq < pi->nqsets) + iq = adapter->sge.ethrxq[pi->first_qset + + fs->iq].rspq.abs_id; + else + iq = fs->iq; + } + + return iq; +} + +static int get_filter_count(struct adapter *adapter, unsigned int fidx, + u64 *pkts, u64 *bytes, bool hash) +{ + unsigned int tcb_base, tcbaddr; + unsigned int word_offset; + struct filter_entry *f; + __be64 be64_byte_count; + int ret; + + tcb_base = t4_read_reg(adapter, TP_CMM_TCB_BASE_A); + if (is_hashfilter(adapter) && hash) { + if (fidx < adapter->tids.ntids) { + f = adapter->tids.tid_tab[fidx]; + if (!f) + return -EINVAL; + } else { + return -E2BIG; + } + } else { + if ((fidx != (adapter->tids.nftids + + adapter->tids.nsftids - 1)) && + fidx >= adapter->tids.nftids) + return -E2BIG; + + f = &adapter->tids.ftid_tab[fidx]; + if (!f->valid) + return -EINVAL; + } + tcbaddr = tcb_base + f->tid * TCB_SIZE; + + spin_lock(&adapter->win0_lock); + if (is_t4(adapter->params.chip)) { + __be64 be64_count; + + /* T4 doesn't maintain byte counts in hw */ + *bytes = 0; + + /* Get pkts */ + word_offset = 4; + ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0, + tcbaddr + (word_offset * sizeof(__be32)), + sizeof(be64_count), + (__be32 *)&be64_count, + T4_MEMORY_READ); + if (ret < 0) + goto out; + *pkts = be64_to_cpu(be64_count); + } else { + __be32 be32_count; + + /* Get bytes */ + word_offset = 4; + ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0, + tcbaddr + (word_offset * sizeof(__be32)), + sizeof(be64_byte_count), + &be64_byte_count, + T4_MEMORY_READ); + if (ret < 0) + goto out; + *bytes = be64_to_cpu(be64_byte_count); + + /* Get pkts */ + word_offset = 6; + ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0, + tcbaddr + (word_offset * sizeof(__be32)), + sizeof(be32_count), + &be32_count, + T4_MEMORY_READ); + if (ret < 0) + goto out; + *pkts = (u64)be32_to_cpu(be32_count); + } + +out: + spin_unlock(&adapter->win0_lock); + return ret; +} + +int cxgb4_get_filter_counters(struct net_device *dev, unsigned int fidx, + u64 *hitcnt, u64 *bytecnt, bool hash) +{ + struct adapter *adapter = netdev2adap(dev); + + return get_filter_count(adapter, fidx, hitcnt, bytecnt, hash); +} + +int cxgb4_get_free_ftid(struct net_device *dev, int family) +{ + struct adapter *adap = netdev2adap(dev); + struct tid_info *t = &adap->tids; + int ftid; + + spin_lock_bh(&t->ftid_lock); + if (family == PF_INET) { + ftid = find_first_zero_bit(t->ftid_bmap, t->nftids); + if (ftid >= t->nftids) + ftid = -1; + } else { + if (is_t6(adap->params.chip)) { + ftid = bitmap_find_free_region(t->ftid_bmap, + t->nftids, 1); + if (ftid < 0) + goto out_unlock; + + /* this is only a lookup, keep the found region + * unallocated + */ + bitmap_release_region(t->ftid_bmap, ftid, 1); + } else { + ftid = bitmap_find_free_region(t->ftid_bmap, + t->nftids, 2); + if (ftid < 0) + goto out_unlock; + + bitmap_release_region(t->ftid_bmap, ftid, 2); + } + } +out_unlock: + spin_unlock_bh(&t->ftid_lock); + return ftid; +} + +static int cxgb4_set_ftid(struct tid_info *t, int fidx, int family, + unsigned int chip_ver) +{ + spin_lock_bh(&t->ftid_lock); + + if (test_bit(fidx, t->ftid_bmap)) { + spin_unlock_bh(&t->ftid_lock); + return -EBUSY; + } + + if (family == PF_INET) { + __set_bit(fidx, t->ftid_bmap); + } else { + if (chip_ver < CHELSIO_T6) + bitmap_allocate_region(t->ftid_bmap, fidx, 2); + else + bitmap_allocate_region(t->ftid_bmap, fidx, 1); + } + + spin_unlock_bh(&t->ftid_lock); + return 0; +} + +static void cxgb4_clear_ftid(struct tid_info *t, int fidx, int family, + unsigned int chip_ver) +{ + 
spin_lock_bh(&t->ftid_lock); + if (family == PF_INET) { + __clear_bit(fidx, t->ftid_bmap); + } else { + if (chip_ver < CHELSIO_T6) + bitmap_release_region(t->ftid_bmap, fidx, 2); + else + bitmap_release_region(t->ftid_bmap, fidx, 1); + } + spin_unlock_bh(&t->ftid_lock); +} + +/* Delete the filter at a specified index. */ +static int del_filter_wr(struct adapter *adapter, int fidx) +{ + struct filter_entry *f = &adapter->tids.ftid_tab[fidx]; + struct fw_filter_wr *fwr; + struct sk_buff *skb; + unsigned int len; + + len = sizeof(*fwr); + + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + fwr = __skb_put(skb, len); + t4_mk_filtdelwr(f->tid, fwr, (adapter->flags & SHUTTING_DOWN) ? -1 + : adapter->sge.fw_evtq.abs_id); + + /* Mark the filter as "pending" and ship off the Filter Work Request. + * When we get the Work Request Reply we'll clear the pending status. + */ + f->pending = 1; + t4_mgmt_tx(adapter, skb); + return 0; +} + +/* Send a Work Request to write the filter at a specified index. We construct + * a Firmware Filter Work Request to have the work done and put the indicated + * filter into "pending" mode which will prevent any further actions against + * it till we get a reply from the firmware on the completion status of the + * request. + */ +int set_filter_wr(struct adapter *adapter, int fidx) +{ + struct filter_entry *f = &adapter->tids.ftid_tab[fidx]; + struct fw_filter2_wr *fwr; + struct sk_buff *skb; + + skb = alloc_skb(sizeof(*fwr), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + /* If the new filter requires loopback Destination MAC and/or VLAN + * rewriting then we need to allocate a Layer 2 Table (L2T) entry for + * the filter. + */ + if (f->fs.newdmac || f->fs.newvlan) { + /* allocate L2T entry for new filter */ + f->l2t = t4_l2t_alloc_switching(adapter, f->fs.vlan, + f->fs.eport, f->fs.dmac); + if (!f->l2t) { + kfree_skb(skb); + return -ENOMEM; + } + } + + /* If the new filter requires loopback Source MAC rewriting then + * we need to allocate a SMT entry for the filter. + */ + if (f->fs.newsmac) { + f->smt = cxgb4_smt_alloc_switching(f->dev, f->fs.smac); + if (!f->smt) { + if (f->l2t) { + cxgb4_l2t_release(f->l2t); + f->l2t = NULL; + } + kfree_skb(skb); + return -ENOMEM; + } + } + + fwr = __skb_put_zero(skb, sizeof(*fwr)); + + /* It would be nice to put most of the following in t4_hw.c but most + * of the work is translating the cxgbtool ch_filter_specification + * into the Work Request and the definition of that structure is + * currently in cxgbtool.h which isn't appropriate to pull into the + * common code. We may eventually try to come up with a more neutral + * filter specification structure but for now it's easiest to simply + * put this fairly direct code in line ... 
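+ *
+ * The shape of the translation: each ch_filter_specification value/mask
+ * pair lands in the matching fw_filter{,2}_wr field (e.g. fs->val.ethtype
+ * and fs->mask.ethtype become fwr->ethtype and fwr->ethtypem), and the NAT
+ * rewrite fields are filled in only when the firmware advertises
+ * filter2_wr_support.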
+ */ + if (adapter->params.filter2_wr_support) + fwr->op_pkd = htonl(FW_WR_OP_V(FW_FILTER2_WR)); + else + fwr->op_pkd = htonl(FW_WR_OP_V(FW_FILTER_WR)); + fwr->len16_pkd = htonl(FW_WR_LEN16_V(sizeof(*fwr) / 16)); + fwr->tid_to_iq = + htonl(FW_FILTER_WR_TID_V(f->tid) | + FW_FILTER_WR_RQTYPE_V(f->fs.type) | + FW_FILTER_WR_NOREPLY_V(0) | + FW_FILTER_WR_IQ_V(f->fs.iq)); + fwr->del_filter_to_l2tix = + htonl(FW_FILTER_WR_RPTTID_V(f->fs.rpttid) | + FW_FILTER_WR_DROP_V(f->fs.action == FILTER_DROP) | + FW_FILTER_WR_DIRSTEER_V(f->fs.dirsteer) | + FW_FILTER_WR_MASKHASH_V(f->fs.maskhash) | + FW_FILTER_WR_DIRSTEERHASH_V(f->fs.dirsteerhash) | + FW_FILTER_WR_LPBK_V(f->fs.action == FILTER_SWITCH) | + FW_FILTER_WR_DMAC_V(f->fs.newdmac) | + FW_FILTER_WR_INSVLAN_V(f->fs.newvlan == VLAN_INSERT || + f->fs.newvlan == VLAN_REWRITE) | + FW_FILTER_WR_RMVLAN_V(f->fs.newvlan == VLAN_REMOVE || + f->fs.newvlan == VLAN_REWRITE) | + FW_FILTER_WR_HITCNTS_V(f->fs.hitcnts) | + FW_FILTER_WR_TXCHAN_V(f->fs.eport) | + FW_FILTER_WR_PRIO_V(f->fs.prio) | + FW_FILTER_WR_L2TIX_V(f->l2t ? f->l2t->idx : 0)); + fwr->ethtype = htons(f->fs.val.ethtype); + fwr->ethtypem = htons(f->fs.mask.ethtype); + fwr->frag_to_ovlan_vldm = + (FW_FILTER_WR_FRAG_V(f->fs.val.frag) | + FW_FILTER_WR_FRAGM_V(f->fs.mask.frag) | + FW_FILTER_WR_IVLAN_VLD_V(f->fs.val.ivlan_vld) | + FW_FILTER_WR_OVLAN_VLD_V(f->fs.val.ovlan_vld) | + FW_FILTER_WR_IVLAN_VLDM_V(f->fs.mask.ivlan_vld) | + FW_FILTER_WR_OVLAN_VLDM_V(f->fs.mask.ovlan_vld)); + fwr->smac_sel = 0; + fwr->rx_chan_rx_rpl_iq = + htons(FW_FILTER_WR_RX_CHAN_V(0) | + FW_FILTER_WR_RX_RPL_IQ_V(adapter->sge.fw_evtq.abs_id)); + fwr->maci_to_matchtypem = + htonl(FW_FILTER_WR_MACI_V(f->fs.val.macidx) | + FW_FILTER_WR_MACIM_V(f->fs.mask.macidx) | + FW_FILTER_WR_FCOE_V(f->fs.val.fcoe) | + FW_FILTER_WR_FCOEM_V(f->fs.mask.fcoe) | + FW_FILTER_WR_PORT_V(f->fs.val.iport) | + FW_FILTER_WR_PORTM_V(f->fs.mask.iport) | + FW_FILTER_WR_MATCHTYPE_V(f->fs.val.matchtype) | + FW_FILTER_WR_MATCHTYPEM_V(f->fs.mask.matchtype)); + fwr->ptcl = f->fs.val.proto; + fwr->ptclm = f->fs.mask.proto; + fwr->ttyp = f->fs.val.tos; + fwr->ttypm = f->fs.mask.tos; + fwr->ivlan = htons(f->fs.val.ivlan); + fwr->ivlanm = htons(f->fs.mask.ivlan); + fwr->ovlan = htons(f->fs.val.ovlan); + fwr->ovlanm = htons(f->fs.mask.ovlan); + memcpy(fwr->lip, f->fs.val.lip, sizeof(fwr->lip)); + memcpy(fwr->lipm, f->fs.mask.lip, sizeof(fwr->lipm)); + memcpy(fwr->fip, f->fs.val.fip, sizeof(fwr->fip)); + memcpy(fwr->fipm, f->fs.mask.fip, sizeof(fwr->fipm)); + fwr->lp = htons(f->fs.val.lport); + fwr->lpm = htons(f->fs.mask.lport); + fwr->fp = htons(f->fs.val.fport); + fwr->fpm = htons(f->fs.mask.fport); + + if (adapter->params.filter2_wr_support) { + fwr->natmode_to_ulp_type = + FW_FILTER2_WR_ULP_TYPE_V(f->fs.nat_mode ? + ULP_MODE_TCPDDP : + ULP_MODE_NONE) | + FW_FILTER2_WR_NATMODE_V(f->fs.nat_mode); + memcpy(fwr->newlip, f->fs.nat_lip, sizeof(fwr->newlip)); + memcpy(fwr->newfip, f->fs.nat_fip, sizeof(fwr->newfip)); + fwr->newlport = htons(f->fs.nat_lport); + fwr->newfport = htons(f->fs.nat_fport); + } + + /* Mark the filter as "pending" and ship off the Filter Work Request. + * When we get the Work Request Reply we'll clear the pending status. + */ + f->pending = 1; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, f->fs.val.iport & 0x3); + t4_ofld_send(adapter, skb); + return 0; +} + +/* Return an error number if the indicated filter isn't writable ... 
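+ *
+ * i.e. -EPERM if the filter is locked, -EBUSY if an operation on it is
+ * still pending, 0 if it can be modified now.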
+ */
+int writable_filter(struct filter_entry *f)
+{
+	if (f->locked)
+		return -EPERM;
+	if (f->pending)
+		return -EBUSY;
+
+	return 0;
+}
+
+/* Delete the filter at the specified index (if valid). This checks for all
+ * the common problems with doing this like the filter being locked, currently
+ * pending in another operation, etc.
+ */
+int delete_filter(struct adapter *adapter, unsigned int fidx)
+{
+	struct filter_entry *f;
+	int ret;
+
+	if (fidx >= adapter->tids.nftids + adapter->tids.nsftids)
+		return -EINVAL;
+
+	f = &adapter->tids.ftid_tab[fidx];
+	ret = writable_filter(f);
+	if (ret)
+		return ret;
+	if (f->valid)
+		return del_filter_wr(adapter, fidx);
+
+	return 0;
+}
+
+/* Clear a filter and release any of its resources that we own. This also
+ * clears the filter's "pending" status.
+ */
+void clear_filter(struct adapter *adap, struct filter_entry *f)
+{
+	/* If the new or old filter has loopback rewriting rules then we'll
+	 * need to free any existing L2T, SMT, or CLIP entries held for the
+	 * filter rule.
+	 */
+	if (f->l2t)
+		cxgb4_l2t_release(f->l2t);
+
+	if (f->smt)
+		cxgb4_smt_release(f->smt);
+
+	if ((f->fs.hash || is_t6(adap->params.chip)) && f->fs.type)
+		cxgb4_clip_release(f->dev, (const u32 *)&f->fs.val.lip, 1);
+
+	/* The zeroing of the filter rule below clears the filter valid,
+	 * pending, locked flags, l2t pointer, etc. so it's all we need for
+	 * this operation.
+	 */
+	memset(f, 0, sizeof(*f));
+}
+
+void clear_all_filters(struct adapter *adapter)
+{
+	unsigned int i;
+
+	if (adapter->tids.ftid_tab) {
+		struct filter_entry *f = &adapter->tids.ftid_tab[0];
+		unsigned int max_ftid = adapter->tids.nftids +
+					adapter->tids.nsftids;
+
+		for (i = 0; i < max_ftid; i++, f++)
+			if (f->valid || f->pending)
+				clear_filter(adapter, f);
+	}
+}
+
+/* Fill up default masks for set match fields.
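+ *
+ * A field that carries a value but no mask is treated as an exact match:
+ * e.g. val.ivlan = 100 with mask.ivlan = 0 gets mask.ivlan = ~0, and a
+ * non-zero lip/fip with an all-zero address mask gets an all-ones mask.
+ * Fields left entirely at zero keep their zero (wildcard) mask.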
*/ +static void fill_default_mask(struct ch_filter_specification *fs) +{ + unsigned int lip = 0, lip_mask = 0; + unsigned int fip = 0, fip_mask = 0; + unsigned int i; + + if (fs->val.iport && !fs->mask.iport) + fs->mask.iport |= ~0; + if (fs->val.fcoe && !fs->mask.fcoe) + fs->mask.fcoe |= ~0; + if (fs->val.matchtype && !fs->mask.matchtype) + fs->mask.matchtype |= ~0; + if (fs->val.macidx && !fs->mask.macidx) + fs->mask.macidx |= ~0; + if (fs->val.ethtype && !fs->mask.ethtype) + fs->mask.ethtype |= ~0; + if (fs->val.ivlan && !fs->mask.ivlan) + fs->mask.ivlan |= ~0; + if (fs->val.ovlan && !fs->mask.ovlan) + fs->mask.ovlan |= ~0; + if (fs->val.frag && !fs->mask.frag) + fs->mask.frag |= ~0; + if (fs->val.tos && !fs->mask.tos) + fs->mask.tos |= ~0; + if (fs->val.proto && !fs->mask.proto) + fs->mask.proto |= ~0; + + for (i = 0; i < ARRAY_SIZE(fs->val.lip); i++) { + lip |= fs->val.lip[i]; + lip_mask |= fs->mask.lip[i]; + fip |= fs->val.fip[i]; + fip_mask |= fs->mask.fip[i]; + } + + if (lip && !lip_mask) + memset(fs->mask.lip, ~0, sizeof(fs->mask.lip)); + + if (fip && !fip_mask) + memset(fs->mask.fip, ~0, sizeof(fs->mask.lip)); + + if (fs->val.lport && !fs->mask.lport) + fs->mask.lport = ~0; + if (fs->val.fport && !fs->mask.fport) + fs->mask.fport = ~0; +} + +static bool is_addr_all_mask(u8 *ipmask, int family) +{ + if (family == AF_INET) { + struct in_addr *addr; + + addr = (struct in_addr *)ipmask; + if (addr->s_addr == 0xffffffff) + return true; + } else if (family == AF_INET6) { + struct in6_addr *addr6; + + addr6 = (struct in6_addr *)ipmask; + if (addr6->s6_addr32[0] == 0xffffffff && + addr6->s6_addr32[1] == 0xffffffff && + addr6->s6_addr32[2] == 0xffffffff && + addr6->s6_addr32[3] == 0xffffffff) + return true; + } + return false; +} + +static bool is_inaddr_any(u8 *ip, int family) +{ + int addr_type; + + if (family == AF_INET) { + struct in_addr *addr; + + addr = (struct in_addr *)ip; + if (addr->s_addr == htonl(INADDR_ANY)) + return true; + } else if (family == AF_INET6) { + struct in6_addr *addr6; + + addr6 = (struct in6_addr *)ip; + addr_type = ipv6_addr_type((const struct in6_addr *) + &addr6); + if (addr_type == IPV6_ADDR_ANY) + return true; + } + return false; +} + +bool is_filter_exact_match(struct adapter *adap, + struct ch_filter_specification *fs) +{ + struct tp_params *tp = &adap->params.tp; + u64 hash_filter_mask = tp->hash_filter_mask; + u64 ntuple_mask = 0; + + if (!is_hashfilter(adap)) + return false; + + if (fs->type) { + if (is_inaddr_any(fs->val.fip, AF_INET6) || + !is_addr_all_mask(fs->mask.fip, AF_INET6)) + return false; + + if (is_inaddr_any(fs->val.lip, AF_INET6) || + !is_addr_all_mask(fs->mask.lip, AF_INET6)) + return false; + } else { + if (is_inaddr_any(fs->val.fip, AF_INET) || + !is_addr_all_mask(fs->mask.fip, AF_INET)) + return false; + + if (is_inaddr_any(fs->val.lip, AF_INET) || + !is_addr_all_mask(fs->mask.lip, AF_INET)) + return false; + } + + if (!fs->val.lport || fs->mask.lport != 0xffff) + return false; + + if (!fs->val.fport || fs->mask.fport != 0xffff) + return false; + + /* calculate tuple mask and compare with mask configured in hw */ + if (tp->fcoe_shift >= 0) + ntuple_mask |= (u64)fs->mask.fcoe << tp->fcoe_shift; + + if (tp->port_shift >= 0) + ntuple_mask |= (u64)fs->mask.iport << tp->port_shift; + + if (tp->vnic_shift >= 0) { + if ((adap->params.tp.ingress_config & VNIC_F)) + ntuple_mask |= (u64)fs->mask.pfvf_vld << tp->vnic_shift; + else + ntuple_mask |= (u64)fs->mask.ovlan_vld << + tp->vnic_shift; + } + + if (tp->vlan_shift >= 0) + ntuple_mask |= 
(u64)fs->mask.ivlan << tp->vlan_shift; + + if (tp->tos_shift >= 0) + ntuple_mask |= (u64)fs->mask.tos << tp->tos_shift; + + if (tp->protocol_shift >= 0) + ntuple_mask |= (u64)fs->mask.proto << tp->protocol_shift; + + if (tp->ethertype_shift >= 0) + ntuple_mask |= (u64)fs->mask.ethtype << tp->ethertype_shift; + + if (tp->macmatch_shift >= 0) + ntuple_mask |= (u64)fs->mask.macidx << tp->macmatch_shift; + + if (tp->matchtype_shift >= 0) + ntuple_mask |= (u64)fs->mask.matchtype << tp->matchtype_shift; + + if (tp->frag_shift >= 0) + ntuple_mask |= (u64)fs->mask.frag << tp->frag_shift; + + if (ntuple_mask != hash_filter_mask) + return false; + + return true; +} + +static u64 hash_filter_ntuple(struct ch_filter_specification *fs, + struct net_device *dev) +{ + struct adapter *adap = netdev2adap(dev); + struct tp_params *tp = &adap->params.tp; + u64 ntuple = 0; + + /* Initialize each of the fields which we care about which are present + * in the Compressed Filter Tuple. + */ + if (tp->vlan_shift >= 0 && fs->mask.ivlan) + ntuple |= (FT_VLAN_VLD_F | fs->val.ivlan) << tp->vlan_shift; + + if (tp->port_shift >= 0 && fs->mask.iport) + ntuple |= (u64)fs->val.iport << tp->port_shift; + + if (tp->protocol_shift >= 0) { + if (!fs->val.proto) + ntuple |= (u64)IPPROTO_TCP << tp->protocol_shift; + else + ntuple |= (u64)fs->val.proto << tp->protocol_shift; + } + + if (tp->tos_shift >= 0 && fs->mask.tos) + ntuple |= (u64)(fs->val.tos) << tp->tos_shift; + + if (tp->vnic_shift >= 0) { + if ((adap->params.tp.ingress_config & VNIC_F) && + fs->mask.pfvf_vld) + ntuple |= (u64)((fs->val.pfvf_vld << 16) | + (fs->val.pf << 13) | + (fs->val.vf)) << tp->vnic_shift; + else + ntuple |= (u64)((fs->val.ovlan_vld << 16) | + (fs->val.ovlan)) << tp->vnic_shift; + } + + if (tp->macmatch_shift >= 0 && fs->mask.macidx) + ntuple |= (u64)(fs->val.macidx) << tp->macmatch_shift; + + if (tp->ethertype_shift >= 0 && fs->mask.ethtype) + ntuple |= (u64)(fs->val.ethtype) << tp->ethertype_shift; + + if (tp->matchtype_shift >= 0 && fs->mask.matchtype) + ntuple |= (u64)(fs->val.matchtype) << tp->matchtype_shift; + + if (tp->frag_shift >= 0 && fs->mask.frag) + ntuple |= (u64)(fs->val.frag) << tp->frag_shift; + + if (tp->fcoe_shift >= 0 && fs->mask.fcoe) + ntuple |= (u64)(fs->val.fcoe) << tp->fcoe_shift; + return ntuple; +} + +static void mk_act_open_req6(struct filter_entry *f, struct sk_buff *skb, + unsigned int qid_filterid, struct adapter *adap) +{ + struct cpl_t6_act_open_req6 *t6req = NULL; + struct cpl_act_open_req6 *req = NULL; + + t6req = (struct cpl_t6_act_open_req6 *)__skb_put(skb, sizeof(*t6req)); + INIT_TP_WR(t6req, 0); + req = (struct cpl_act_open_req6 *)t6req; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_filterid)); + req->local_port = cpu_to_be16(f->fs.val.lport); + req->peer_port = cpu_to_be16(f->fs.val.fport); + req->local_ip_hi = *(__be64 *)(&f->fs.val.lip); + req->local_ip_lo = *(((__be64 *)&f->fs.val.lip) + 1); + req->peer_ip_hi = *(__be64 *)(&f->fs.val.fip); + req->peer_ip_lo = *(((__be64 *)&f->fs.val.fip) + 1); + req->opt0 = cpu_to_be64(NAGLE_V(f->fs.newvlan == VLAN_REMOVE || + f->fs.newvlan == VLAN_REWRITE) | + DELACK_V(f->fs.hitcnts) | + L2T_IDX_V(f->l2t ? f->l2t->idx : 0) | + SMAC_SEL_V((cxgb4_port_viid(f->dev) & + 0x7F) << 1) | + TX_CHAN_V(f->fs.eport) | + NO_CONG_V(f->fs.rpttid) | + ULP_MODE_V(f->fs.nat_mode ? 
+ ULP_MODE_TCPDDP : ULP_MODE_NONE) | + TCAM_BYPASS_F | NON_OFFLOAD_F); + t6req->params = cpu_to_be64(FILTER_TUPLE_V(hash_filter_ntuple(&f->fs, + f->dev))); + t6req->opt2 = htonl(RSS_QUEUE_VALID_F | + RSS_QUEUE_V(f->fs.iq) | + TX_QUEUE_V(f->fs.nat_mode) | + T5_OPT_2_VALID_F | + RX_CHANNEL_F | + CONG_CNTRL_V((f->fs.action == FILTER_DROP) | + (f->fs.dirsteer << 1)) | + PACE_V((f->fs.maskhash) | + ((f->fs.dirsteerhash) << 1)) | + CCTRL_ECN_V(f->fs.action == FILTER_SWITCH)); +} + +static void mk_act_open_req(struct filter_entry *f, struct sk_buff *skb, + unsigned int qid_filterid, struct adapter *adap) +{ + struct cpl_t6_act_open_req *t6req = NULL; + struct cpl_act_open_req *req = NULL; + + t6req = (struct cpl_t6_act_open_req *)__skb_put(skb, sizeof(*t6req)); + INIT_TP_WR(t6req, 0); + req = (struct cpl_act_open_req *)t6req; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_filterid)); + req->local_port = cpu_to_be16(f->fs.val.lport); + req->peer_port = cpu_to_be16(f->fs.val.fport); + req->local_ip = f->fs.val.lip[0] | f->fs.val.lip[1] << 8 | + f->fs.val.lip[2] << 16 | f->fs.val.lip[3] << 24; + req->peer_ip = f->fs.val.fip[0] | f->fs.val.fip[1] << 8 | + f->fs.val.fip[2] << 16 | f->fs.val.fip[3] << 24; + req->opt0 = cpu_to_be64(NAGLE_V(f->fs.newvlan == VLAN_REMOVE || + f->fs.newvlan == VLAN_REWRITE) | + DELACK_V(f->fs.hitcnts) | + L2T_IDX_V(f->l2t ? f->l2t->idx : 0) | + SMAC_SEL_V((cxgb4_port_viid(f->dev) & + 0x7F) << 1) | + TX_CHAN_V(f->fs.eport) | + NO_CONG_V(f->fs.rpttid) | + ULP_MODE_V(f->fs.nat_mode ? + ULP_MODE_TCPDDP : ULP_MODE_NONE) | + TCAM_BYPASS_F | NON_OFFLOAD_F); + + t6req->params = cpu_to_be64(FILTER_TUPLE_V(hash_filter_ntuple(&f->fs, + f->dev))); + t6req->opt2 = htonl(RSS_QUEUE_VALID_F | + RSS_QUEUE_V(f->fs.iq) | + TX_QUEUE_V(f->fs.nat_mode) | + T5_OPT_2_VALID_F | + RX_CHANNEL_F | + CONG_CNTRL_V((f->fs.action == FILTER_DROP) | + (f->fs.dirsteer << 1)) | + PACE_V((f->fs.maskhash) | + ((f->fs.dirsteerhash) << 1)) | + CCTRL_ECN_V(f->fs.action == FILTER_SWITCH)); +} + +static int cxgb4_set_hash_filter(struct net_device *dev, + struct ch_filter_specification *fs, + struct filter_ctx *ctx) +{ + struct adapter *adapter = netdev2adap(dev); + struct tid_info *t = &adapter->tids; + struct filter_entry *f; + struct sk_buff *skb; + int iq, atid, size; + int ret = 0; + u32 iconf; + + fill_default_mask(fs); + ret = validate_filter(dev, fs); + if (ret) + return ret; + + iq = get_filter_steerq(dev, fs); + if (iq < 0) + return iq; + + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->fs = *fs; + f->ctx = ctx; + f->dev = dev; + f->fs.iq = iq; + + /* If the new filter requires loopback Destination MAC and/or VLAN + * rewriting then we need to allocate a Layer 2 Table (L2T) entry for + * the filter. + */ + if (f->fs.newdmac || f->fs.newvlan) { + /* allocate L2T entry for new filter */ + f->l2t = t4_l2t_alloc_switching(adapter, f->fs.vlan, + f->fs.eport, f->fs.dmac); + if (!f->l2t) { + ret = -ENOMEM; + goto out_err; + } + } + + /* If the new filter requires loopback Source MAC rewriting then + * we need to allocate a SMT entry for the filter. 
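+ *
+ * If the SMT allocation fails we drop the L2T entry taken just above and
+ * unwind through the error labels (free_l2t/out_err) below, so a partially
+ * built hash filter does not leak switching resources.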
+ */ + if (f->fs.newsmac) { + f->smt = cxgb4_smt_alloc_switching(f->dev, f->fs.smac); + if (!f->smt) { + if (f->l2t) { + cxgb4_l2t_release(f->l2t); + f->l2t = NULL; + } + ret = -ENOMEM; + goto free_l2t; + } + } + + atid = cxgb4_alloc_atid(t, f); + if (atid < 0) { + ret = atid; + goto free_smt; + } + + iconf = adapter->params.tp.ingress_config; + if (iconf & VNIC_F) { + f->fs.val.ovlan = (fs->val.pf << 13) | fs->val.vf; + f->fs.mask.ovlan = (fs->mask.pf << 13) | fs->mask.vf; + f->fs.val.ovlan_vld = fs->val.pfvf_vld; + f->fs.mask.ovlan_vld = fs->mask.pfvf_vld; + } + + size = sizeof(struct cpl_t6_act_open_req); + if (f->fs.type) { + ret = cxgb4_clip_get(f->dev, (const u32 *)&f->fs.val.lip, 1); + if (ret) + goto free_atid; + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto free_clip; + } + + mk_act_open_req6(f, skb, + ((adapter->sge.fw_evtq.abs_id << 14) | atid), + adapter); + } else { + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) { + ret = -ENOMEM; + goto free_atid; + } + + mk_act_open_req(f, skb, + ((adapter->sge.fw_evtq.abs_id << 14) | atid), + adapter); + } + + f->pending = 1; + set_wr_txq(skb, CPL_PRIORITY_SETUP, f->fs.val.iport & 0x3); + t4_ofld_send(adapter, skb); + return 0; + +free_clip: + cxgb4_clip_release(f->dev, (const u32 *)&f->fs.val.lip, 1); + +free_atid: + cxgb4_free_atid(t, atid); + +free_smt: + if (f->smt) { + cxgb4_smt_release(f->smt); + f->smt = NULL; + } + +free_l2t: + if (f->l2t) { + cxgb4_l2t_release(f->l2t); + f->l2t = NULL; + } + +out_err: + kfree(f); + return ret; +} + +/* Check a Chelsio Filter Request for validity, convert it into our internal + * format and send it to the hardware. Return 0 on success, an error number + * otherwise. We attach any provided filter operation context to the internal + * filter specification in order to facilitate signaling completion of the + * operation. + */ +int __cxgb4_set_filter(struct net_device *dev, int filter_id, + struct ch_filter_specification *fs, + struct filter_ctx *ctx) +{ + struct adapter *adapter = netdev2adap(dev); + unsigned int chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip); + unsigned int max_fidx, fidx; + struct filter_entry *f; + u32 iconf; + int iq, ret; + + if (fs->hash) { + if (is_hashfilter(adapter)) + return cxgb4_set_hash_filter(dev, fs, ctx); + netdev_err(dev, "%s: Exact-match filters only supported with Hash Filter configuration\n", + __func__); + return -EINVAL; + } + + max_fidx = adapter->tids.nftids; + if (filter_id != (max_fidx + adapter->tids.nsftids - 1) && + filter_id >= max_fidx) + return -E2BIG; + + fill_default_mask(fs); + + ret = validate_filter(dev, fs); + if (ret) + return ret; + + iq = get_filter_steerq(dev, fs); + if (iq < 0) + return iq; + + /* IPv6 filters occupy four slots and must be aligned on + * four-slot boundaries. IPv4 filters only occupy a single + * slot and have no alignment requirements but writing a new + * IPv4 filter into the middle of an existing IPv6 filter + * requires clearing the old IPv6 filter and hence we prevent + * insertion. + */ + if (fs->type == 0) { /* IPv4 */ + /* For T6, If our IPv4 filter isn't being written to a + * multiple of two filter index and there's an IPv6 + * filter at the multiple of 2 base slot, then we need + * to delete that IPv6 filter ... + * For adapters below T6, IPv6 filter occupies 4 entries. + * Hence we need to delete the filter in multiple of 4 slot. 
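+ *
+ * Worked example: an IPv4 filter written at index 5 checks IPv6 base slot 4
+ * both on pre-T6 (5 & ~0x3) and on T6 (5 & ~0x1); the write is refused only
+ * when a valid IPv6 filter occupies that base slot.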
+ */ + if (chip_ver < CHELSIO_T6) + fidx = filter_id & ~0x3; + else + fidx = filter_id & ~0x1; + + if (fidx != filter_id && + adapter->tids.ftid_tab[fidx].fs.type) { + f = &adapter->tids.ftid_tab[fidx]; + if (f->valid) { + dev_err(adapter->pdev_dev, + "Invalid location. IPv6 requires 4 slots and is occupying slots %u to %u\n", + fidx, fidx + 3); + return -EINVAL; + } + } + } else { /* IPv6 */ + if (chip_ver < CHELSIO_T6) { + /* Ensure that the IPv6 filter is aligned on a + * multiple of 4 boundary. + */ + if (filter_id & 0x3) { + dev_err(adapter->pdev_dev, + "Invalid location. IPv6 must be aligned on a 4-slot boundary\n"); + return -EINVAL; + } + + /* Check all except the base overlapping IPv4 filter + * slots. + */ + for (fidx = filter_id + 1; fidx < filter_id + 4; + fidx++) { + f = &adapter->tids.ftid_tab[fidx]; + if (f->valid) { + dev_err(adapter->pdev_dev, + "Invalid location. IPv6 requires 4 slots and an IPv4 filter exists at %u\n", + fidx); + return -EBUSY; + } + } + } else { + /* For T6, CLIP being enabled, IPv6 filter would occupy + * 2 entries. + */ + if (filter_id & 0x1) + return -EINVAL; + /* Check overlapping IPv4 filter slot */ + fidx = filter_id + 1; + f = &adapter->tids.ftid_tab[fidx]; + if (f->valid) { + pr_err("%s: IPv6 filter requires 2 indices. IPv4 filter already present at %d. Please remove IPv4 filter first.\n", + __func__, fidx); + return -EBUSY; + } + } + } + + /* Check to make sure that provided filter index is not + * already in use by someone else + */ + f = &adapter->tids.ftid_tab[filter_id]; + if (f->valid) + return -EBUSY; + + fidx = filter_id + adapter->tids.ftid_base; + ret = cxgb4_set_ftid(&adapter->tids, filter_id, + fs->type ? PF_INET6 : PF_INET, + chip_ver); + if (ret) + return ret; + + /* Check t make sure the filter requested is writable ... */ + ret = writable_filter(f); + if (ret) { + /* Clear the bits we have set above */ + cxgb4_clear_ftid(&adapter->tids, filter_id, + fs->type ? PF_INET6 : PF_INET, + chip_ver); + return ret; + } + + if (is_t6(adapter->params.chip) && fs->type && + ipv6_addr_type((const struct in6_addr *)fs->val.lip) != + IPV6_ADDR_ANY) { + ret = cxgb4_clip_get(dev, (const u32 *)&fs->val.lip, 1); + if (ret) { + cxgb4_clear_ftid(&adapter->tids, filter_id, PF_INET6, + chip_ver); + return ret; + } + } + + /* Convert the filter specification into our internal format. + * We copy the PF/VF specification into the Outer VLAN field + * here so the rest of the code -- including the interface to + * the firmware -- doesn't have to constantly do these checks. + */ + f->fs = *fs; + f->fs.iq = iq; + f->dev = dev; + + iconf = adapter->params.tp.ingress_config; + if (iconf & VNIC_F) { + f->fs.val.ovlan = (fs->val.pf << 13) | fs->val.vf; + f->fs.mask.ovlan = (fs->mask.pf << 13) | fs->mask.vf; + f->fs.val.ovlan_vld = fs->val.pfvf_vld; + f->fs.mask.ovlan_vld = fs->mask.pfvf_vld; + } + + /* Attempt to set the filter. If we don't succeed, we clear + * it and return the failure. + */ + f->ctx = ctx; + f->tid = fidx; /* Save the actual tid */ + ret = set_filter_wr(adapter, filter_id); + if (ret) { + cxgb4_clear_ftid(&adapter->tids, filter_id, + fs->type ? 
PF_INET6 : PF_INET, + chip_ver); + clear_filter(adapter, f); + } + + return ret; +} + +static int cxgb4_del_hash_filter(struct net_device *dev, int filter_id, + struct filter_ctx *ctx) +{ + struct adapter *adapter = netdev2adap(dev); + struct tid_info *t = &adapter->tids; + struct cpl_abort_req *abort_req; + struct cpl_abort_rpl *abort_rpl; + struct cpl_set_tcb_field *req; + struct ulptx_idata *aligner; + struct work_request_hdr *wr; + struct filter_entry *f; + struct sk_buff *skb; + unsigned int wrlen; + int ret; + + netdev_dbg(dev, "%s: filter_id = %d ; nftids = %d\n", + __func__, filter_id, adapter->tids.nftids); + + if (filter_id > adapter->tids.ntids) + return -E2BIG; + + f = lookup_tid(t, filter_id); + if (!f) { + netdev_err(dev, "%s: no filter entry for filter_id = %d", + __func__, filter_id); + return -EINVAL; + } + + ret = writable_filter(f); + if (ret) + return ret; + + if (!f->valid) + return -EINVAL; + + f->ctx = ctx; + f->pending = 1; + wrlen = roundup(sizeof(*wr) + (sizeof(*req) + sizeof(*aligner)) + + sizeof(*abort_req) + sizeof(*abort_rpl), 16); + skb = alloc_skb(wrlen, GFP_KERNEL); + if (!skb) { + netdev_err(dev, "%s: could not allocate skb ..\n", __func__); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, f->fs.val.iport & 0x3); + req = (struct cpl_set_tcb_field *)__skb_put(skb, wrlen); + INIT_ULPTX_WR(req, wrlen, 0, 0); + wr = (struct work_request_hdr *)req; + wr++; + req = (struct cpl_set_tcb_field *)wr; + mk_set_tcb_ulp(f, req, TCB_RSS_INFO_W, TCB_RSS_INFO_V(TCB_RSS_INFO_M), + TCB_RSS_INFO_V(adapter->sge.fw_evtq.abs_id), 0, 1); + aligner = (struct ulptx_idata *)(req + 1); + abort_req = (struct cpl_abort_req *)(aligner + 1); + mk_abort_req_ulp(abort_req, f->tid); + abort_rpl = (struct cpl_abort_rpl *)(abort_req + 1); + mk_abort_rpl_ulp(abort_rpl, f->tid); + t4_ofld_send(adapter, skb); + return 0; +} + +/* Check a delete filter request for validity and send it to the hardware. + * Return 0 on success, an error number otherwise. We attach any provided + * filter operation context to the internal filter specification in order to + * facilitate signaling completion of the operation. + */ +int __cxgb4_del_filter(struct net_device *dev, int filter_id, + struct ch_filter_specification *fs, + struct filter_ctx *ctx) +{ + struct adapter *adapter = netdev2adap(dev); + unsigned int chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip); + struct filter_entry *f; + unsigned int max_fidx; + int ret; + + if (fs && fs->hash) { + if (is_hashfilter(adapter)) + return cxgb4_del_hash_filter(dev, filter_id, ctx); + netdev_err(dev, "%s: Exact-match filters only supported with Hash Filter configuration\n", + __func__); + return -EINVAL; + } + + max_fidx = adapter->tids.nftids; + if (filter_id != (max_fidx + adapter->tids.nsftids - 1) && + filter_id >= max_fidx) + return -E2BIG; + + f = &adapter->tids.ftid_tab[filter_id]; + ret = writable_filter(f); + if (ret) + return ret; + + if (f->valid) { + f->ctx = ctx; + cxgb4_clear_ftid(&adapter->tids, filter_id, + f->fs.type ? PF_INET6 : PF_INET, + chip_ver); + return del_filter_wr(adapter, filter_id); + } + + /* If the caller has passed in a Completion Context then we need to + * mark it as a successful completion so they don't stall waiting + * for it. 
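+ *
+ * Typical synchronous usage from elsewhere in the driver goes through the
+ * cxgb4_set_filter()/cxgb4_del_filter() wrappers below; a rough sketch,
+ * with made-up field values for illustration only:
+ *
+ *	struct ch_filter_specification fs = { 0 };
+ *
+ *	fs.val.lport = 80;
+ *	fs.mask.lport = 0xffff;
+ *	fs.action = FILTER_DROP;
+ *	err = cxgb4_set_filter(dev, fidx, &fs);
+ *	...
+ *	err = cxgb4_del_filter(dev, fidx, &fs);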
+ */ + if (ctx) { + ctx->result = 0; + complete(&ctx->completion); + } + return ret; +} + +int cxgb4_set_filter(struct net_device *dev, int filter_id, + struct ch_filter_specification *fs) +{ + struct filter_ctx ctx; + int ret; + + init_completion(&ctx.completion); + + ret = __cxgb4_set_filter(dev, filter_id, fs, &ctx); + if (ret) + goto out; + + /* Wait for reply */ + ret = wait_for_completion_timeout(&ctx.completion, 10 * HZ); + if (!ret) + return -ETIMEDOUT; + + ret = ctx.result; +out: + return ret; +} + +int cxgb4_del_filter(struct net_device *dev, int filter_id, + struct ch_filter_specification *fs) +{ + struct filter_ctx ctx; + int ret; + + /* If we are shutting down the adapter do not wait for completion */ + if (netdev2adap(dev)->flags & SHUTTING_DOWN) + return __cxgb4_del_filter(dev, filter_id, fs, NULL); + + init_completion(&ctx.completion); + + ret = __cxgb4_del_filter(dev, filter_id, fs, &ctx); + if (ret) + goto out; + + /* Wait for reply */ + ret = wait_for_completion_timeout(&ctx.completion, 10 * HZ); + if (!ret) + return -ETIMEDOUT; + + ret = ctx.result; +out: + return ret; +} + +static int configure_filter_tcb(struct adapter *adap, unsigned int tid, + struct filter_entry *f) +{ + if (f->fs.hitcnts) + set_tcb_field(adap, f, tid, TCB_TIMESTAMP_W, + TCB_TIMESTAMP_V(TCB_TIMESTAMP_M) | + TCB_RTT_TS_RECENT_AGE_V(TCB_RTT_TS_RECENT_AGE_M), + TCB_TIMESTAMP_V(0ULL) | + TCB_RTT_TS_RECENT_AGE_V(0ULL), + 1); + + if (f->fs.newdmac) + set_tcb_tflag(adap, f, tid, TF_CCTRL_ECE_S, 1, + 1); + + if (f->fs.newvlan == VLAN_INSERT || + f->fs.newvlan == VLAN_REWRITE) + set_tcb_tflag(adap, f, tid, TF_CCTRL_RFR_S, 1, + 1); + if (f->fs.newsmac) + configure_filter_smac(adap, f); + + if (f->fs.nat_mode) { + switch (f->fs.nat_mode) { + case NAT_MODE_DIP: + set_nat_params(adap, f, tid, true, false, false, false); + break; + + case NAT_MODE_DIP_DP: + set_nat_params(adap, f, tid, true, false, true, false); + break; + + case NAT_MODE_DIP_DP_SIP: + set_nat_params(adap, f, tid, true, true, true, false); + break; + case NAT_MODE_DIP_DP_SP: + set_nat_params(adap, f, tid, true, false, true, true); + break; + + case NAT_MODE_SIP_SP: + set_nat_params(adap, f, tid, false, true, false, true); + break; + + case NAT_MODE_DIP_SIP_SP: + set_nat_params(adap, f, tid, true, true, false, true); + break; + + case NAT_MODE_ALL: + set_nat_params(adap, f, tid, true, true, true, true); + break; + + default: + pr_err("%s: Invalid NAT mode: %d\n", + __func__, f->fs.nat_mode); + return -EINVAL; + } + } + return 0; +} + +void hash_del_filter_rpl(struct adapter *adap, + const struct cpl_abort_rpl_rss *rpl) +{ + unsigned int status = rpl->status; + struct tid_info *t = &adap->tids; + unsigned int tid = GET_TID(rpl); + struct filter_ctx *ctx = NULL; + struct filter_entry *f; + + dev_dbg(adap->pdev_dev, "%s: status = %u; tid = %u\n", + __func__, status, tid); + + f = lookup_tid(t, tid); + if (!f) { + dev_err(adap->pdev_dev, "%s:could not find filter entry", + __func__); + return; + } + ctx = f->ctx; + f->ctx = NULL; + clear_filter(adap, f); + cxgb4_remove_tid(t, 0, tid, 0); + kfree(f); + if (ctx) { + ctx->result = 0; + complete(&ctx->completion); + } +} + +void hash_filter_rpl(struct adapter *adap, const struct cpl_act_open_rpl *rpl) +{ + unsigned int ftid = TID_TID_G(AOPEN_ATID_G(ntohl(rpl->atid_status))); + unsigned int status = AOPEN_STATUS_G(ntohl(rpl->atid_status)); + struct tid_info *t = &adap->tids; + unsigned int tid = GET_TID(rpl); + struct filter_ctx *ctx = NULL; + struct filter_entry *f; + + dev_dbg(adap->pdev_dev, "%s: tid = 
%u; atid = %u; status = %u\n", + __func__, tid, ftid, status); + + f = lookup_atid(t, ftid); + if (!f) { + dev_err(adap->pdev_dev, "%s:could not find filter entry", + __func__); + return; + } + ctx = f->ctx; + f->ctx = NULL; + + switch (status) { + case CPL_ERR_NONE: + f->tid = tid; + f->pending = 0; + f->valid = 1; + cxgb4_insert_tid(t, f, f->tid, 0); + cxgb4_free_atid(t, ftid); + if (ctx) { + ctx->tid = f->tid; + ctx->result = 0; + } + if (configure_filter_tcb(adap, tid, f)) { + clear_filter(adap, f); + cxgb4_remove_tid(t, 0, tid, 0); + kfree(f); + if (ctx) { + ctx->result = -EINVAL; + complete(&ctx->completion); + } + return; + } + break; + + default: + dev_err(adap->pdev_dev, "%s: filter creation PROBLEM; status = %u\n", + __func__, status); + + if (ctx) { + if (status == CPL_ERR_TCAM_FULL) + ctx->result = -EAGAIN; + else + ctx->result = -EINVAL; + } + clear_filter(adap, f); + cxgb4_free_atid(t, ftid); + kfree(f); + } + if (ctx) + complete(&ctx->completion); +} + +/* Handle a filter write/deletion reply. */ +void filter_rpl(struct adapter *adap, const struct cpl_set_tcb_rpl *rpl) +{ + unsigned int tid = GET_TID(rpl); + struct filter_entry *f = NULL; + unsigned int max_fidx; + int idx; + + max_fidx = adap->tids.nftids + adap->tids.nsftids; + /* Get the corresponding filter entry for this tid */ + if (adap->tids.ftid_tab) { + /* Check this in normal filter region */ + idx = tid - adap->tids.ftid_base; + if (idx >= max_fidx) + return; + f = &adap->tids.ftid_tab[idx]; + if (f->tid != tid) + return; + } + + /* We found the filter entry for this tid */ + if (f) { + unsigned int ret = TCB_COOKIE_G(rpl->cookie); + struct filter_ctx *ctx; + + /* Pull off any filter operation context attached to the + * filter. + */ + ctx = f->ctx; + f->ctx = NULL; + + if (ret == FW_FILTER_WR_FLT_DELETED) { + /* Clear the filter when we get confirmation from the + * hardware that the filter has been deleted. + */ + clear_filter(adap, f); + if (ctx) + ctx->result = 0; + } else if (ret == FW_FILTER_WR_FLT_ADDED) { + int err = 0; + + if (f->fs.newsmac) + err = configure_filter_smac(adap, f); + + if (!err) { + f->pending = 0; /* async setup completed */ + f->valid = 1; + if (ctx) { + ctx->result = 0; + ctx->tid = idx; + } + } else { + clear_filter(adap, f); + if (ctx) + ctx->result = err; + } + } else { + /* Something went wrong. Issue a warning about the + * problem and clear everything out. + */ + dev_err(adap->pdev_dev, "filter %u setup failed with error %u\n", + idx, ret); + clear_filter(adap, f); + if (ctx) + ctx->result = -EINVAL; + } + if (ctx) + complete(&ctx->completion); + } +} + +int init_hash_filter(struct adapter *adap) +{ + /* On T6, verify the necessary register configs and warn the user in + * case of improper config + */ + if (is_t6(adap->params.chip)) { + if (TCAM_ACTV_HIT_G(t4_read_reg(adap, LE_DB_RSP_CODE_0_A)) != 4) + goto err; + + if (HASH_ACTV_HIT_G(t4_read_reg(adap, LE_DB_RSP_CODE_1_A)) != 4) + goto err; + } else { + dev_err(adap->pdev_dev, "Hash filter supported only on T6\n"); + return -EINVAL; + } + adap->params.hash_filter = 1; + return 0; +err: + dev_warn(adap->pdev_dev, "Invalid hash filter config!\n"); + return -EINVAL; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h new file mode 100644 index 0000000..8db5fca --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h @@ -0,0 +1,56 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. 
+ * + * Copyright (c) 2003-2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_FILTER_H +#define __CXGB4_FILTER_H + +#include "t4_msg.h" + +#define WORD_MASK 0xffffffff + +void filter_rpl(struct adapter *adap, const struct cpl_set_tcb_rpl *rpl); +void hash_filter_rpl(struct adapter *adap, const struct cpl_act_open_rpl *rpl); +void hash_del_filter_rpl(struct adapter *adap, + const struct cpl_abort_rpl_rss *rpl); +void clear_filter(struct adapter *adap, struct filter_entry *f); + +int set_filter_wr(struct adapter *adapter, int fidx); +int delete_filter(struct adapter *adapter, unsigned int fidx); + +int writable_filter(struct filter_entry *f); +void clear_all_filters(struct adapter *adapter); +int init_hash_filter(struct adapter *adap); +bool is_filter_exact_match(struct adapter *adap, + struct ch_filter_specification *fs); +#endif /* __CXGB4_FILTER_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c new file mode 100644 index 0000000..005283c --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -0,0 +1,6003 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxgb4.h" +#include "cxgb4_filter.h" +#include "t4_regs.h" +#include "t4_values.h" +#include "t4_msg.h" +#include "t4fw_api.h" +#include "t4fw_version.h" +#include "cxgb4_dcb.h" +#include "srq.h" +#include "cxgb4_debugfs.h" +#include "clip_tbl.h" +#include "l2t.h" +#include "smt.h" +#include "sched.h" +#include "cxgb4_tc_u32.h" +#include "cxgb4_tc_flower.h" +#include "cxgb4_ptp.h" +#include "cxgb4_cudbg.h" + +char cxgb4_driver_name[] = KBUILD_MODNAME; + +#ifdef DRV_VERSION +#undef DRV_VERSION +#endif +#define DRV_VERSION "2.0.0-ko" +const char cxgb4_driver_version[] = DRV_VERSION; +#define DRV_DESC "Chelsio T4/T5/T6 Network Driver" + +#define DFLT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | \ + NETIF_MSG_TIMER | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP |\ + NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR) + +/* Macros needed to support the PCI Device ID Table ... + */ +#define CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN \ + static const struct pci_device_id cxgb4_pci_tbl[] = { +#define CXGB4_UNIFIED_PF 0x4 + +#define CH_PCI_DEVICE_ID_FUNCTION CXGB4_UNIFIED_PF + +/* Include PCI Device IDs for both PF4 and PF0-3 so our PCI probe() routine is + * called for both. + */ +#define CH_PCI_DEVICE_ID_FUNCTION2 0x0 + +#define CH_PCI_ID_TABLE_ENTRY(devid) \ + {PCI_VDEVICE(CHELSIO, (devid)), CXGB4_UNIFIED_PF} + +#define CH_PCI_DEVICE_ID_TABLE_DEFINE_END \ + { 0, } \ + } + +#include "t4_pci_id_tbl.h" + +#define FW4_FNAME "cxgb4/t4fw.bin" +#define FW5_FNAME "cxgb4/t5fw.bin" +#define FW6_FNAME "cxgb4/t6fw.bin" +#define FW4_CFNAME "cxgb4/t4-config.txt" +#define FW5_CFNAME "cxgb4/t5-config.txt" +#define FW6_CFNAME "cxgb4/t6-config.txt" +#define PHY_AQ1202_FIRMWARE "cxgb4/aq1202_fw.cld" +#define PHY_BCM84834_FIRMWARE "cxgb4/bcm8483.bin" +#define PHY_AQ1202_DEVICEID 0x4409 +#define PHY_BCM84834_DEVICEID 0x4486 + +MODULE_DESCRIPTION(DRV_DESC); +MODULE_AUTHOR("Chelsio Communications"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); +MODULE_DEVICE_TABLE(pci, cxgb4_pci_tbl); +MODULE_FIRMWARE(FW4_FNAME); +MODULE_FIRMWARE(FW5_FNAME); +MODULE_FIRMWARE(FW6_FNAME); + +/* + * The driver uses the best interrupt scheme available on a platform in the + * order MSI-X, MSI, legacy INTx interrupts. 
This parameter determines which
+ * of these schemes the driver may consider as follows:
+ *
+ * msi = 2: choose from among all three options
+ * msi = 1: only consider MSI and INTx interrupts
+ * msi = 0: force INTx interrupts
+ */
+static int msi = 2;
+
+module_param(msi, int, 0644);
+MODULE_PARM_DESC(msi, "whether to use INTx (0), MSI (1) or MSI-X (2)");
+
+/*
+ * Normally we tell the chip to deliver Ingress Packets into our DMA buffers
+ * offset by 2 bytes in order to have the IP headers line up on 4-byte
+ * boundaries. This is a requirement for many architectures which will throw
+ * a machine check fault if an attempt is made to access one of the 4-byte IP
+ * header fields on a non-4-byte boundary. And it's a major performance issue
+ * even on some architectures which allow it like some implementations of the
+ * x86 ISA. However, some architectures don't mind this and for some very
+ * edge-case performance sensitive applications (like forwarding large volumes
+ * of small packets), setting this DMA offset to 0 will decrease the number of
+ * PCI-E Bus transfers enough to measurably affect performance.
+ */
+static int rx_dma_offset = 2;
+
+/* TX Queue select used to determine what algorithm to use for selecting TX
+ * queue. Select between the kernel provided function (select_queue=0) or user
+ * cxgb_select_queue function (select_queue=1)
+ *
+ * Default: select_queue=0
+ */
+static int select_queue;
+module_param(select_queue, int, 0644);
+MODULE_PARM_DESC(select_queue,
+ "Select between kernel provided method of selecting or driver method of selecting TX queue. Default is kernel method.");
+
+static struct dentry *cxgb4_debugfs_root;
+
+LIST_HEAD(adapter_list);
+DEFINE_MUTEX(uld_mutex);
+
+static void link_report(struct net_device *dev)
+{
+ if (!netif_carrier_ok(dev))
+ netdev_info(dev, "link down\n");
+ else {
+ static const char *fc[] = { "no", "Rx", "Tx", "Tx/Rx" };
+
+ const char *s;
+ const struct port_info *p = netdev_priv(dev);
+
+ switch (p->link_cfg.speed) {
+ case 100:
+ s = "100Mbps";
+ break;
+ case 1000:
+ s = "1Gbps";
+ break;
+ case 10000:
+ s = "10Gbps";
+ break;
+ case 25000:
+ s = "25Gbps";
+ break;
+ case 40000:
+ s = "40Gbps";
+ break;
+ case 50000:
+ s = "50Gbps";
+ break;
+ case 100000:
+ s = "100Gbps";
+ break;
+ default:
+ pr_info("%s: unsupported speed: %d\n",
+ dev->name, p->link_cfg.speed);
+ return;
+ }
+
+ netdev_info(dev, "link up, %s, full-duplex, %s PAUSE\n", s,
+ fc[p->link_cfg.fc]);
+ }
+}
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+/* Set up/tear down Data Center Bridging Priority mapping for a net device. */
+static void dcb_tx_queue_prio_enable(struct net_device *dev, int enable)
+{
+ struct port_info *pi = netdev_priv(dev);
+ struct adapter *adap = pi->adapter;
+ struct sge_eth_txq *txq = &adap->sge.ethtxq[pi->first_qset];
+ int i;
+
+ /* We use a simple mapping of Port TX Queue Index to DCB
+ * Priority when we're enabling DCB.
+ */
+ for (i = 0; i < pi->nqsets; i++, txq++) {
+ u32 name, value;
+ int err;
+
+ name = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) |
+ FW_PARAMS_PARAM_X_V(
+ FW_PARAMS_PARAM_DMAQ_EQ_DCBPRIO_ETH) |
+ FW_PARAMS_PARAM_YZ_V(txq->q.cntxt_id));
+ value = enable ? i : 0xffffffff;
+
+ /* Since we can be called while atomic (from "interrupt
+ * level") we need to issue the Set Parameters Command
+ * without sleeping (timeout < 0).
+ */ + err = t4_set_params_timeout(adap, adap->mbox, adap->pf, 0, 1, + &name, &value, + -FW_CMD_MAX_TIMEOUT); + + if (err) + dev_err(adap->pdev_dev, + "Can't %s DCB Priority on port %d, TX Queue %d: err=%d\n", + enable ? "set" : "unset", pi->port_id, i, -err); + else + txq->dcb_prio = value; + } +} + +static int cxgb4_dcb_enabled(const struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + + if (!pi->dcb.enabled) + return 0; + + return ((pi->dcb.state == CXGB4_DCB_STATE_FW_ALLSYNCED) || + (pi->dcb.state == CXGB4_DCB_STATE_HOST)); +} +#endif /* CONFIG_CHELSIO_T4_DCB */ + +void t4_os_link_changed(struct adapter *adapter, int port_id, int link_stat) +{ + struct net_device *dev = adapter->port[port_id]; + + /* Skip changes from disabled ports. */ + if (netif_running(dev) && link_stat != netif_carrier_ok(dev)) { + if (link_stat) + netif_carrier_on(dev); + else { +#ifdef CONFIG_CHELSIO_T4_DCB + if (cxgb4_dcb_enabled(dev)) { + cxgb4_dcb_reset(dev); + dcb_tx_queue_prio_enable(dev, false); + } +#endif /* CONFIG_CHELSIO_T4_DCB */ + netif_carrier_off(dev); + } + + link_report(dev); + } +} + +void t4_os_portmod_changed(const struct adapter *adap, int port_id) +{ + static const char *mod_str[] = { + NULL, "LR", "SR", "ER", "passive DA", "active DA", "LRM" + }; + + const struct net_device *dev = adap->port[port_id]; + const struct port_info *pi = netdev_priv(dev); + + if (pi->mod_type == FW_PORT_MOD_TYPE_NONE) + netdev_info(dev, "port module unplugged\n"); + else if (pi->mod_type < ARRAY_SIZE(mod_str)) + netdev_info(dev, "%s module inserted\n", mod_str[pi->mod_type]); + else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED) + netdev_info(dev, "%s: unsupported port module inserted\n", + dev->name); + else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN) + netdev_info(dev, "%s: unknown port module inserted\n", + dev->name); + else if (pi->mod_type == FW_PORT_MOD_TYPE_ERROR) + netdev_info(dev, "%s: transceiver module error\n", dev->name); + else + netdev_info(dev, "%s: unknown module type %d inserted\n", + dev->name, pi->mod_type); +} + +int dbfifo_int_thresh = 10; /* 10 == 640 entry threshold */ +module_param(dbfifo_int_thresh, int, 0644); +MODULE_PARM_DESC(dbfifo_int_thresh, "doorbell fifo interrupt threshold"); + +/* + * usecs to sleep while draining the dbfifo + */ +static int dbfifo_drain_delay = 1000; +module_param(dbfifo_drain_delay, int, 0644); +MODULE_PARM_DESC(dbfifo_drain_delay, + "usecs to sleep while draining the dbfifo"); + +static inline int cxgb4_set_addr_hash(struct port_info *pi) +{ + struct adapter *adap = pi->adapter; + u64 vec = 0; + bool ucast = false; + struct hash_mac_addr *entry; + + /* Calculate the hash vector for the updated list and program it */ + list_for_each_entry(entry, &adap->mac_hlist, list) { + ucast |= is_unicast_ether_addr(entry->addr); + vec |= (1ULL << hash_mac_addr(entry->addr)); + } + return t4_set_addr_hash(adap, adap->mbox, pi->viid, ucast, + vec, false); +} + +static int cxgb4_mac_sync(struct net_device *netdev, const u8 *mac_addr) +{ + struct port_info *pi = netdev_priv(netdev); + struct adapter *adap = pi->adapter; + int ret; + u64 mhash = 0; + u64 uhash = 0; + bool free = false; + bool ucast = is_unicast_ether_addr(mac_addr); + const u8 *maclist[1] = {mac_addr}; + struct hash_mac_addr *new_entry; + + ret = t4_alloc_mac_filt(adap, adap->mbox, pi->viid, free, 1, maclist, + NULL, ucast ? 
&uhash : &mhash, false);
+ if (ret < 0)
+ goto out;
+ /* if hash != 0, then add the addr to hash addr list
+ * so in the end we will calculate the hash for the
+ * list and program it
+ */
+ if (uhash || mhash) {
+ new_entry = kzalloc(sizeof(*new_entry), GFP_ATOMIC);
+ if (!new_entry)
+ return -ENOMEM;
+ ether_addr_copy(new_entry->addr, mac_addr);
+ list_add_tail(&new_entry->list, &adap->mac_hlist);
+ ret = cxgb4_set_addr_hash(pi);
+ }
+out:
+ return ret < 0 ? ret : 0;
+}
+
+static int cxgb4_mac_unsync(struct net_device *netdev, const u8 *mac_addr)
+{
+ struct port_info *pi = netdev_priv(netdev);
+ struct adapter *adap = pi->adapter;
+ int ret;
+ const u8 *maclist[1] = {mac_addr};
+ struct hash_mac_addr *entry, *tmp;
+
+ /* If the MAC address to be removed is in the hash addr
+ * list, delete it from the list and update hash vector
+ */
+ list_for_each_entry_safe(entry, tmp, &adap->mac_hlist, list) {
+ if (ether_addr_equal(entry->addr, mac_addr)) {
+ list_del(&entry->list);
+ kfree(entry);
+ return cxgb4_set_addr_hash(pi);
+ }
+ }
+
+ ret = t4_free_mac_filt(adap, adap->mbox, pi->viid, 1, maclist, false);
+ return ret < 0 ? -EINVAL : 0;
+}
+
+/*
+ * Set Rx properties of a port, such as promiscuity, address filters, and MTU.
+ * If @mtu is -1 it is left unchanged.
+ */
+static int set_rxmode(struct net_device *dev, int mtu, bool sleep_ok)
+{
+ struct port_info *pi = netdev_priv(dev);
+ struct adapter *adapter = pi->adapter;
+
+ __dev_uc_sync(dev, cxgb4_mac_sync, cxgb4_mac_unsync);
+ __dev_mc_sync(dev, cxgb4_mac_sync, cxgb4_mac_unsync);
+
+ return t4_set_rxmode(adapter, adapter->mbox, pi->viid, mtu,
+ (dev->flags & IFF_PROMISC) ? 1 : 0,
+ (dev->flags & IFF_ALLMULTI) ? 1 : 0, 1, -1,
+ sleep_ok);
+}
+
+/**
+ * link_start - enable a port
+ * @dev: the port to enable
+ *
+ * Performs the MAC and PHY actions needed to enable a port.
+ */
+static int link_start(struct net_device *dev)
+{
+ int ret;
+ struct port_info *pi = netdev_priv(dev);
+ unsigned int mb = pi->adapter->pf;
+
+ /*
+ * We do not set address filters and promiscuity here, the stack does
+ * that step explicitly.
+ */
+ ret = t4_set_rxmode(pi->adapter, mb, pi->viid, dev->mtu, -1, -1, -1,
+ !!(dev->features & NETIF_F_HW_VLAN_CTAG_RX), true);
+ if (ret == 0) {
+ ret = t4_change_mac(pi->adapter, mb, pi->viid,
+ pi->xact_addr_filt, dev->dev_addr, true,
+ true);
+ if (ret >= 0) {
+ pi->xact_addr_filt = ret;
+ ret = 0;
+ }
+ }
+ if (ret == 0)
+ ret = t4_link_l1cfg(pi->adapter, mb, pi->tx_chan,
+ &pi->link_cfg);
+ if (ret == 0) {
+ local_bh_disable();
+ ret = t4_enable_vi_params(pi->adapter, mb, pi->viid, true,
+ true, CXGB4_DCB_ENABLED);
+ local_bh_enable();
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+/* Handle a Data Center Bridging update message from the firmware. */
+static void dcb_rpl(struct adapter *adap, const struct fw_port_cmd *pcmd)
+{
+ int port = FW_PORT_CMD_PORTID_G(ntohl(pcmd->op_to_portid));
+ struct net_device *dev = adap->port[adap->chan_map[port]];
+ int old_dcb_enabled = cxgb4_dcb_enabled(dev);
+ int new_dcb_enabled;
+
+ cxgb4_dcb_handle_fw_update(adap, pcmd);
+ new_dcb_enabled = cxgb4_dcb_enabled(dev);
+
+ /* If the DCB has become enabled or disabled on the port then we're
+ * going to need to set up/tear down DCB Priority parameters for the
+ * TX Queues associated with the port.
+ */
+ if (new_dcb_enabled != old_dcb_enabled)
+ dcb_tx_queue_prio_enable(dev, new_dcb_enabled);
+}
+#endif /* CONFIG_CHELSIO_T4_DCB */
+
+/* Response queue handler for the FW event queue.
+ */ +static int fwevtq_handler(struct sge_rspq *q, const __be64 *rsp, + const struct pkt_gl *gl) +{ + u8 opcode = ((const struct rss_header *)rsp)->opcode; + + rsp++; /* skip RSS header */ + + /* FW can send EGR_UPDATEs encapsulated in a CPL_FW4_MSG. + */ + if (unlikely(opcode == CPL_FW4_MSG && + ((const struct cpl_fw4_msg *)rsp)->type == FW_TYPE_RSSCPL)) { + rsp++; + opcode = ((const struct rss_header *)rsp)->opcode; + rsp++; + if (opcode != CPL_SGE_EGR_UPDATE) { + dev_err(q->adap->pdev_dev, "unexpected FW4/CPL %#x on FW event queue\n" + , opcode); + goto out; + } + } + + if (likely(opcode == CPL_SGE_EGR_UPDATE)) { + const struct cpl_sge_egr_update *p = (void *)rsp; + unsigned int qid = EGR_QID_G(ntohl(p->opcode_qid)); + struct sge_txq *txq; + + txq = q->adap->sge.egr_map[qid - q->adap->sge.egr_start]; + txq->restarts++; + if (txq->q_type == CXGB4_TXQ_ETH) { + struct sge_eth_txq *eq; + + eq = container_of(txq, struct sge_eth_txq, q); + netif_tx_wake_queue(eq->txq); + } else { + struct sge_uld_txq *oq; + + oq = container_of(txq, struct sge_uld_txq, q); + tasklet_schedule(&oq->qresume_tsk); + } + } else if (opcode == CPL_FW6_MSG || opcode == CPL_FW4_MSG) { + const struct cpl_fw6_msg *p = (void *)rsp; + +#ifdef CONFIG_CHELSIO_T4_DCB + const struct fw_port_cmd *pcmd = (const void *)p->data; + unsigned int cmd = FW_CMD_OP_G(ntohl(pcmd->op_to_portid)); + unsigned int action = + FW_PORT_CMD_ACTION_G(ntohl(pcmd->action_to_len16)); + + if (cmd == FW_PORT_CMD && + (action == FW_PORT_ACTION_GET_PORT_INFO || + action == FW_PORT_ACTION_GET_PORT_INFO32)) { + int port = FW_PORT_CMD_PORTID_G( + be32_to_cpu(pcmd->op_to_portid)); + struct net_device *dev; + int dcbxdis, state_input; + + dev = q->adap->port[q->adap->chan_map[port]]; + dcbxdis = (action == FW_PORT_ACTION_GET_PORT_INFO + ? !!(pcmd->u.info.dcbxdis_pkd & + FW_PORT_CMD_DCBXDIS_F) + : !!(pcmd->u.info32.lstatus32_to_cbllen32 & + FW_PORT_CMD_DCBXDIS32_F)); + state_input = (dcbxdis + ? CXGB4_DCB_INPUT_FW_DISABLED + : CXGB4_DCB_INPUT_FW_ENABLED); + + cxgb4_dcb_state_fsm(dev, state_input); + } + + if (cmd == FW_PORT_CMD && + action == FW_PORT_ACTION_L2_DCB_CFG) + dcb_rpl(q->adap, pcmd); + else +#endif + if (p->type == 0) + t4_handle_fw_rpl(q->adap, p->data); + } else if (opcode == CPL_L2T_WRITE_RPL) { + const struct cpl_l2t_write_rpl *p = (void *)rsp; + + do_l2t_write_rpl(q->adap, p); + } else if (opcode == CPL_SMT_WRITE_RPL) { + const struct cpl_smt_write_rpl *p = (void *)rsp; + + do_smt_write_rpl(q->adap, p); + } else if (opcode == CPL_SET_TCB_RPL) { + const struct cpl_set_tcb_rpl *p = (void *)rsp; + + filter_rpl(q->adap, p); + } else if (opcode == CPL_ACT_OPEN_RPL) { + const struct cpl_act_open_rpl *p = (void *)rsp; + + hash_filter_rpl(q->adap, p); + } else if (opcode == CPL_ABORT_RPL_RSS) { + const struct cpl_abort_rpl_rss *p = (void *)rsp; + + hash_del_filter_rpl(q->adap, p); + } else if (opcode == CPL_SRQ_TABLE_RPL) { + const struct cpl_srq_table_rpl *p = (void *)rsp; + + do_srq_table_rpl(q->adap, p); + } else + dev_err(q->adap->pdev_dev, + "unexpected CPL %#x on FW event queue\n", opcode); +out: + return 0; +} + +static void disable_msi(struct adapter *adapter) +{ + if (adapter->flags & USING_MSIX) { + pci_disable_msix(adapter->pdev); + adapter->flags &= ~USING_MSIX; + } else if (adapter->flags & USING_MSI) { + pci_disable_msi(adapter->pdev); + adapter->flags &= ~USING_MSI; + } +} + +/* + * Interrupt handler for non-data events used with MSI-X. 
+ */ +static irqreturn_t t4_nondata_intr(int irq, void *cookie) +{ + struct adapter *adap = cookie; + u32 v = t4_read_reg(adap, MYPF_REG(PL_PF_INT_CAUSE_A)); + + if (v & PFSW_F) { + adap->swintr = 1; + t4_write_reg(adap, MYPF_REG(PL_PF_INT_CAUSE_A), v); + } + if (adap->flags & MASTER_PF) + t4_slow_intr_handler(adap); + return IRQ_HANDLED; +} + +/* + * Name the MSI-X interrupts. + */ +static void name_msix_vecs(struct adapter *adap) +{ + int i, j, msi_idx = 2, n = sizeof(adap->msix_info[0].desc); + + /* non-data interrupts */ + snprintf(adap->msix_info[0].desc, n, "%s", adap->port[0]->name); + + /* FW events */ + snprintf(adap->msix_info[1].desc, n, "%s-FWeventq", + adap->port[0]->name); + + /* Ethernet queues */ + for_each_port(adap, j) { + struct net_device *d = adap->port[j]; + const struct port_info *pi = netdev_priv(d); + + for (i = 0; i < pi->nqsets; i++, msi_idx++) + snprintf(adap->msix_info[msi_idx].desc, n, "%s-Rx%d", + d->name, i); + } +} + +static int request_msix_queue_irqs(struct adapter *adap) +{ + struct sge *s = &adap->sge; + int err, ethqidx; + int msi_index = 2; + + err = request_irq(adap->msix_info[1].vec, t4_sge_intr_msix, 0, + adap->msix_info[1].desc, &s->fw_evtq); + if (err) + return err; + + for_each_ethrxq(s, ethqidx) { + err = request_irq(adap->msix_info[msi_index].vec, + t4_sge_intr_msix, 0, + adap->msix_info[msi_index].desc, + &s->ethrxq[ethqidx].rspq); + if (err) + goto unwind; + msi_index++; + } + return 0; + +unwind: + while (--ethqidx >= 0) + free_irq(adap->msix_info[--msi_index].vec, + &s->ethrxq[ethqidx].rspq); + free_irq(adap->msix_info[1].vec, &s->fw_evtq); + return err; +} + +static void free_msix_queue_irqs(struct adapter *adap) +{ + int i, msi_index = 2; + struct sge *s = &adap->sge; + + free_irq(adap->msix_info[1].vec, &s->fw_evtq); + for_each_ethrxq(s, i) + free_irq(adap->msix_info[msi_index++].vec, &s->ethrxq[i].rspq); +} + +/** + * cxgb4_write_rss - write the RSS table for a given port + * @pi: the port + * @queues: array of queue indices for RSS + * + * Sets up the portion of the HW RSS table for the port's VI to distribute + * packets to the Rx queues in @queues. + * Should never be called before setting up sge eth rx queues + */ +int cxgb4_write_rss(const struct port_info *pi, const u16 *queues) +{ + u16 *rss; + int i, err; + struct adapter *adapter = pi->adapter; + const struct sge_eth_rxq *rxq; + + rxq = &adapter->sge.ethrxq[pi->first_qset]; + rss = kmalloc(pi->rss_size * sizeof(u16), GFP_KERNEL); + if (!rss) + return -ENOMEM; + + /* map the queue indices to queue ids */ + for (i = 0; i < pi->rss_size; i++, queues++) + rss[i] = rxq[*queues].rspq.abs_id; + + err = t4_config_rss_range(adapter, adapter->pf, pi->viid, 0, + pi->rss_size, rss, pi->rss_size); + /* If Tunnel All Lookup isn't specified in the global RSS + * Configuration, then we need to specify a default Ingress + * Queue for any ingress packets which aren't hashed. We'll + * use our first ingress queue ... + */ + if (!err) + err = t4_config_vi_rss(adapter, adapter->mbox, pi->viid, + FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_F | + FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_F | + FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_F | + FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_F | + FW_RSS_VI_CONFIG_CMD_UDPEN_F, + rss[0]); + kfree(rss); + return err; +} + +/** + * setup_rss - configure RSS + * @adap: the adapter + * + * Sets up RSS for each port. 
+ */ +static int setup_rss(struct adapter *adap) +{ + int i, j, err; + + for_each_port(adap, i) { + const struct port_info *pi = adap2pinfo(adap, i); + + /* Fill default values with equal distribution */ + for (j = 0; j < pi->rss_size; j++) + pi->rss[j] = j % pi->nqsets; + + err = cxgb4_write_rss(pi, pi->rss); + if (err) + return err; + } + return 0; +} + +/* + * Return the channel of the ingress queue with the given qid. + */ +static unsigned int rxq_to_chan(const struct sge *p, unsigned int qid) +{ + qid -= p->ingr_start; + return netdev2pinfo(p->ingr_map[qid]->netdev)->tx_chan; +} + +/* + * Wait until all NAPI handlers are descheduled. + */ +static void quiesce_rx(struct adapter *adap) +{ + int i; + + for (i = 0; i < adap->sge.ingr_sz; i++) { + struct sge_rspq *q = adap->sge.ingr_map[i]; + + if (q && q->handler) + napi_disable(&q->napi); + } +} + +/* Disable interrupt and napi handler */ +static void disable_interrupts(struct adapter *adap) +{ + if (adap->flags & FULL_INIT_DONE) { + t4_intr_disable(adap); + if (adap->flags & USING_MSIX) { + free_msix_queue_irqs(adap); + free_irq(adap->msix_info[0].vec, adap); + } else { + free_irq(adap->pdev->irq, adap); + } + quiesce_rx(adap); + } +} + +/* + * Enable NAPI scheduling and interrupt generation for all Rx queues. + */ +static void enable_rx(struct adapter *adap) +{ + int i; + + for (i = 0; i < adap->sge.ingr_sz; i++) { + struct sge_rspq *q = adap->sge.ingr_map[i]; + + if (!q) + continue; + if (q->handler) + napi_enable(&q->napi); + + /* 0-increment GTS to start the timer and enable interrupts */ + t4_write_reg(adap, MYPF_REG(SGE_PF_GTS_A), + SEINTARM_V(q->intr_params) | + INGRESSQID_V(q->cntxt_id)); + } +} + + +static int setup_fw_sge_queues(struct adapter *adap) +{ + struct sge *s = &adap->sge; + int err = 0; + + bitmap_zero(s->starving_fl, s->egr_sz); + bitmap_zero(s->txq_maperr, s->egr_sz); + + if (adap->flags & USING_MSIX) + adap->msi_idx = 1; /* vector 0 is for non-queue interrupts */ + else { + err = t4_sge_alloc_rxq(adap, &s->intrq, false, adap->port[0], 0, + NULL, NULL, NULL, -1); + if (err) + return err; + adap->msi_idx = -((int)s->intrq.abs_id + 1); + } + + err = t4_sge_alloc_rxq(adap, &s->fw_evtq, true, adap->port[0], + adap->msi_idx, NULL, fwevtq_handler, NULL, -1); + return err; +} + +/** + * setup_sge_queues - configure SGE Tx/Rx/response queues + * @adap: the adapter + * + * Determines how many sets of SGE queues to use and initializes them. + * We support multiple queue sets per port if we have MSI-X, otherwise + * just one queue set per port. 
+ */ +static int setup_sge_queues(struct adapter *adap) +{ + int err, i, j; + struct sge *s = &adap->sge; + struct sge_uld_rxq_info *rxq_info = NULL; + unsigned int cmplqid = 0; + + if (is_uld(adap)) + rxq_info = s->uld_rxq_info[CXGB4_ULD_RDMA]; + + for_each_port(adap, i) { + struct net_device *dev = adap->port[i]; + struct port_info *pi = netdev_priv(dev); + struct sge_eth_rxq *q = &s->ethrxq[pi->first_qset]; + struct sge_eth_txq *t = &s->ethtxq[pi->first_qset]; + + for (j = 0; j < pi->nqsets; j++, q++) { + if (adap->msi_idx > 0) + adap->msi_idx++; + err = t4_sge_alloc_rxq(adap, &q->rspq, false, dev, + adap->msi_idx, &q->fl, + t4_ethrx_handler, + NULL, + t4_get_tp_ch_map(adap, + pi->tx_chan)); + if (err) + goto freeout; + q->rspq.idx = j; + memset(&q->stats, 0, sizeof(q->stats)); + } + for (j = 0; j < pi->nqsets; j++, t++) { + err = t4_sge_alloc_eth_txq(adap, t, dev, + netdev_get_tx_queue(dev, j), + s->fw_evtq.cntxt_id); + if (err) + goto freeout; + } + } + + for_each_port(adap, i) { + /* Note that cmplqid below is 0 if we don't + * have RDMA queues, and that's the right value. + */ + if (rxq_info) + cmplqid = rxq_info->uldrxq[i].rspq.cntxt_id; + + err = t4_sge_alloc_ctrl_txq(adap, &s->ctrlq[i], adap->port[i], + s->fw_evtq.cntxt_id, cmplqid); + if (err) + goto freeout; + } + + if (!is_t4(adap->params.chip)) { + err = t4_sge_alloc_eth_txq(adap, &s->ptptxq, adap->port[0], + netdev_get_tx_queue(adap->port[0], 0) + , s->fw_evtq.cntxt_id); + if (err) + goto freeout; + } + + t4_write_reg(adap, is_t4(adap->params.chip) ? + MPS_TRC_RSS_CONTROL_A : + MPS_T5_TRC_RSS_CONTROL_A, + RSSCONTROL_V(netdev2pinfo(adap->port[0])->tx_chan) | + QUEUENUMBER_V(s->ethrxq[0].rspq.abs_id)); + return 0; +freeout: + t4_free_sge_resources(adap); + return err; +} + +static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + int txq; + +#ifdef CONFIG_CHELSIO_T4_DCB + /* If a Data Center Bridging has been successfully negotiated on this + * link then we'll use the skb's priority to map it to a TX Queue. + * The skb's priority is determined via the VLAN Tag Priority Code + * Point field. + */ + if (cxgb4_dcb_enabled(dev) && !is_kdump_kernel()) { + u16 vlan_tci; + int err; + + err = vlan_get_tag(skb, &vlan_tci); + if (unlikely(err)) { + if (net_ratelimit()) + netdev_warn(dev, + "TX Packet without VLAN Tag on DCB Link\n"); + txq = 0; + } else { + txq = (vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +#ifdef CONFIG_CHELSIO_T4_FCOE + if (skb->protocol == htons(ETH_P_FCOE)) + txq = skb->priority & 0x7; +#endif /* CONFIG_CHELSIO_T4_FCOE */ + } + return txq; + } +#endif /* CONFIG_CHELSIO_T4_DCB */ + + if (select_queue) { + txq = (skb_rx_queue_recorded(skb) + ? 
skb_get_rx_queue(skb) + : smp_processor_id()); + + while (unlikely(txq >= dev->real_num_tx_queues)) + txq -= dev->real_num_tx_queues; + + return txq; + } + + return fallback(dev, skb) % dev->real_num_tx_queues; +} + +static int closest_timer(const struct sge *s, int time) +{ + int i, delta, match = 0, min_delta = INT_MAX; + + for (i = 0; i < ARRAY_SIZE(s->timer_val); i++) { + delta = time - s->timer_val[i]; + if (delta < 0) + delta = -delta; + if (delta < min_delta) { + min_delta = delta; + match = i; + } + } + return match; +} + +static int closest_thres(const struct sge *s, int thres) +{ + int i, delta, match = 0, min_delta = INT_MAX; + + for (i = 0; i < ARRAY_SIZE(s->counter_val); i++) { + delta = thres - s->counter_val[i]; + if (delta < 0) + delta = -delta; + if (delta < min_delta) { + min_delta = delta; + match = i; + } + } + return match; +} + +/** + * cxgb4_set_rspq_intr_params - set a queue's interrupt holdoff parameters + * @q: the Rx queue + * @us: the hold-off time in us, or 0 to disable timer + * @cnt: the hold-off packet count, or 0 to disable counter + * + * Sets an Rx queue's interrupt hold-off time and packet count. At least + * one of the two needs to be enabled for the queue to generate interrupts. + */ +int cxgb4_set_rspq_intr_params(struct sge_rspq *q, + unsigned int us, unsigned int cnt) +{ + struct adapter *adap = q->adap; + + if ((us | cnt) == 0) + cnt = 1; + + if (cnt) { + int err; + u32 v, new_idx; + + new_idx = closest_thres(&adap->sge, cnt); + if (q->desc && q->pktcnt_idx != new_idx) { + /* the queue has already been created, update it */ + v = FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) | + FW_PARAMS_PARAM_X_V( + FW_PARAMS_PARAM_DMAQ_IQ_INTCNTTHRESH) | + FW_PARAMS_PARAM_YZ_V(q->cntxt_id); + err = t4_set_params(adap, adap->mbox, adap->pf, 0, 1, + &v, &new_idx); + if (err) + return err; + } + q->pktcnt_idx = new_idx; + } + + us = us == 0 ? 6 : closest_timer(&adap->sge, us); + q->intr_params = QINTR_TIMER_IDX_V(us) | QINTR_CNT_EN_V(cnt > 0); + return 0; +} + +static int cxgb_set_features(struct net_device *dev, netdev_features_t features) +{ + const struct port_info *pi = netdev_priv(dev); + netdev_features_t changed = dev->features ^ features; + int err; + + if (!(changed & NETIF_F_HW_VLAN_CTAG_RX)) + return 0; + + err = t4_set_rxmode(pi->adapter, pi->adapter->pf, pi->viid, -1, + -1, -1, -1, + !!(features & NETIF_F_HW_VLAN_CTAG_RX), true); + if (unlikely(err)) + dev->features = features ^ NETIF_F_HW_VLAN_CTAG_RX; + return err; +} + +static int setup_debugfs(struct adapter *adap) +{ + if (IS_ERR_OR_NULL(adap->debugfs_root)) + return -1; + +#ifdef CONFIG_DEBUG_FS + t4_setup_debugfs(adap); +#endif + return 0; +} + +/* + * upper-layer driver support + */ + +/* + * Allocate an active-open TID and set it to the supplied value. + */ +int cxgb4_alloc_atid(struct tid_info *t, void *data) +{ + int atid = -1; + + spin_lock_bh(&t->atid_lock); + if (t->afree) { + union aopen_entry *p = t->afree; + + atid = (p - t->atid_tab) + t->atid_base; + t->afree = p->next; + p->data = data; + t->atids_in_use++; + } + spin_unlock_bh(&t->atid_lock); + return atid; +} +EXPORT_SYMBOL(cxgb4_alloc_atid); + +/* + * Release an active-open TID. + */ +void cxgb4_free_atid(struct tid_info *t, unsigned int atid) +{ + union aopen_entry *p = &t->atid_tab[atid - t->atid_base]; + + spin_lock_bh(&t->atid_lock); + p->next = t->afree; + t->afree = p; + t->atids_in_use--; + spin_unlock_bh(&t->atid_lock); +} +EXPORT_SYMBOL(cxgb4_free_atid); + +/* + * Allocate a server TID and set it to the supplied value. 
+ */ +int cxgb4_alloc_stid(struct tid_info *t, int family, void *data) +{ + int stid; + + spin_lock_bh(&t->stid_lock); + if (family == PF_INET) { + stid = find_first_zero_bit(t->stid_bmap, t->nstids); + if (stid < t->nstids) + __set_bit(stid, t->stid_bmap); + else + stid = -1; + } else { + stid = bitmap_find_free_region(t->stid_bmap, t->nstids, 1); + if (stid < 0) + stid = -1; + } + if (stid >= 0) { + t->stid_tab[stid].data = data; + stid += t->stid_base; + /* IPv6 requires max of 520 bits or 16 cells in TCAM + * This is equivalent to 4 TIDs. With CLIP enabled it + * needs 2 TIDs. + */ + if (family == PF_INET6) { + t->stids_in_use += 2; + t->v6_stids_in_use += 2; + } else { + t->stids_in_use++; + } + } + spin_unlock_bh(&t->stid_lock); + return stid; +} +EXPORT_SYMBOL(cxgb4_alloc_stid); + +/* Allocate a server filter TID and set it to the supplied value. + */ +int cxgb4_alloc_sftid(struct tid_info *t, int family, void *data) +{ + int stid; + + spin_lock_bh(&t->stid_lock); + if (family == PF_INET) { + stid = find_next_zero_bit(t->stid_bmap, + t->nstids + t->nsftids, t->nstids); + if (stid < (t->nstids + t->nsftids)) + __set_bit(stid, t->stid_bmap); + else + stid = -1; + } else { + stid = -1; + } + if (stid >= 0) { + t->stid_tab[stid].data = data; + stid -= t->nstids; + stid += t->sftid_base; + t->sftids_in_use++; + } + spin_unlock_bh(&t->stid_lock); + return stid; +} +EXPORT_SYMBOL(cxgb4_alloc_sftid); + +/* Release a server TID. + */ +void cxgb4_free_stid(struct tid_info *t, unsigned int stid, int family) +{ + /* Is it a server filter TID? */ + if (t->nsftids && (stid >= t->sftid_base)) { + stid -= t->sftid_base; + stid += t->nstids; + } else { + stid -= t->stid_base; + } + + spin_lock_bh(&t->stid_lock); + if (family == PF_INET) + __clear_bit(stid, t->stid_bmap); + else + bitmap_release_region(t->stid_bmap, stid, 1); + t->stid_tab[stid].data = NULL; + if (stid < t->nstids) { + if (family == PF_INET6) { + t->stids_in_use -= 2; + t->v6_stids_in_use -= 2; + } else { + t->stids_in_use--; + } + } else { + t->sftids_in_use--; + } + + spin_unlock_bh(&t->stid_lock); +} +EXPORT_SYMBOL(cxgb4_free_stid); + +/* + * Populate a TID_RELEASE WR. Caller must properly size the skb. + */ +static void mk_tid_release(struct sk_buff *skb, unsigned int chan, + unsigned int tid) +{ + struct cpl_tid_release *req; + + set_wr_txq(skb, CPL_PRIORITY_SETUP, chan); + req = __skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, tid); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); +} + +/* + * Queue a TID release request and if necessary schedule a work queue to + * process it. + */ +static void cxgb4_queue_tid_release(struct tid_info *t, unsigned int chan, + unsigned int tid) +{ + void **p = &t->tid_tab[tid]; + struct adapter *adap = container_of(t, struct adapter, tids); + + spin_lock_bh(&adap->tid_release_lock); + *p = adap->tid_release_head; + /* Low 2 bits encode the Tx channel number */ + adap->tid_release_head = (void **)((uintptr_t)p | chan); + if (!adap->tid_release_task_busy) { + adap->tid_release_task_busy = true; + queue_work(adap->workq, &adap->tid_release_task); + } + spin_unlock_bh(&adap->tid_release_lock); +} + +/* + * Process the list of pending TID release requests. 
+ */ +static void process_tid_release_list(struct work_struct *work) +{ + struct sk_buff *skb; + struct adapter *adap; + + adap = container_of(work, struct adapter, tid_release_task); + + spin_lock_bh(&adap->tid_release_lock); + while (adap->tid_release_head) { + void **p = adap->tid_release_head; + unsigned int chan = (uintptr_t)p & 3; + p = (void *)p - chan; + + adap->tid_release_head = *p; + *p = NULL; + spin_unlock_bh(&adap->tid_release_lock); + + while (!(skb = alloc_skb(sizeof(struct cpl_tid_release), + GFP_KERNEL))) + schedule_timeout_uninterruptible(1); + + mk_tid_release(skb, chan, p - adap->tids.tid_tab); + t4_ofld_send(adap, skb); + spin_lock_bh(&adap->tid_release_lock); + } + adap->tid_release_task_busy = false; + spin_unlock_bh(&adap->tid_release_lock); +} + +/* + * Release a TID and inform HW. If we are unable to allocate the release + * message we defer to a work queue. + */ +void cxgb4_remove_tid(struct tid_info *t, unsigned int chan, unsigned int tid, + unsigned short family) +{ + struct sk_buff *skb; + struct adapter *adap = container_of(t, struct adapter, tids); + + WARN_ON(tid >= t->ntids); + + if (t->tid_tab[tid]) { + t->tid_tab[tid] = NULL; + atomic_dec(&t->conns_in_use); + if (t->hash_base && (tid >= t->hash_base)) { + if (family == AF_INET6) + atomic_sub(2, &t->hash_tids_in_use); + else + atomic_dec(&t->hash_tids_in_use); + } else { + if (family == AF_INET6) + atomic_sub(2, &t->tids_in_use); + else + atomic_dec(&t->tids_in_use); + } + } + + skb = alloc_skb(sizeof(struct cpl_tid_release), GFP_ATOMIC); + if (likely(skb)) { + mk_tid_release(skb, chan, tid); + t4_ofld_send(adap, skb); + } else + cxgb4_queue_tid_release(t, chan, tid); +} +EXPORT_SYMBOL(cxgb4_remove_tid); + +/* + * Allocate and initialize the TID tables. Returns 0 on success. + */ +static int tid_init(struct tid_info *t) +{ + struct adapter *adap = container_of(t, struct adapter, tids); + unsigned int max_ftids = t->nftids + t->nsftids; + unsigned int natids = t->natids; + unsigned int stid_bmap_size; + unsigned int ftid_bmap_size; + size_t size; + + stid_bmap_size = BITS_TO_LONGS(t->nstids + t->nsftids); + ftid_bmap_size = BITS_TO_LONGS(t->nftids); + size = t->ntids * sizeof(*t->tid_tab) + + natids * sizeof(*t->atid_tab) + + t->nstids * sizeof(*t->stid_tab) + + t->nsftids * sizeof(*t->stid_tab) + + stid_bmap_size * sizeof(long) + + max_ftids * sizeof(*t->ftid_tab) + + ftid_bmap_size * sizeof(long); + + t->tid_tab = kvzalloc(size, GFP_KERNEL); + if (!t->tid_tab) + return -ENOMEM; + + t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids]; + t->stid_tab = (struct serv_entry *)&t->atid_tab[natids]; + t->stid_bmap = (unsigned long *)&t->stid_tab[t->nstids + t->nsftids]; + t->ftid_tab = (struct filter_entry *)&t->stid_bmap[stid_bmap_size]; + t->ftid_bmap = (unsigned long *)&t->ftid_tab[max_ftids]; + spin_lock_init(&t->stid_lock); + spin_lock_init(&t->atid_lock); + spin_lock_init(&t->ftid_lock); + + t->stids_in_use = 0; + t->v6_stids_in_use = 0; + t->sftids_in_use = 0; + t->afree = NULL; + t->atids_in_use = 0; + atomic_set(&t->tids_in_use, 0); + atomic_set(&t->conns_in_use, 0); + atomic_set(&t->hash_tids_in_use, 0); + + /* Setup the free list for atid_tab and clear the stid bitmap. 
*/ + if (natids) { + while (--natids) + t->atid_tab[natids - 1].next = &t->atid_tab[natids]; + t->afree = t->atid_tab; + } + + if (is_offload(adap)) { + bitmap_zero(t->stid_bmap, t->nstids + t->nsftids); + /* Reserve stid 0 for T4/T5 adapters */ + if (!t->stid_base && + CHELSIO_CHIP_VERSION(adap->params.chip) <= CHELSIO_T5) + __set_bit(0, t->stid_bmap); + } + + bitmap_zero(t->ftid_bmap, t->nftids); + return 0; +} + +/** + * cxgb4_create_server - create an IP server + * @dev: the device + * @stid: the server TID + * @sip: local IP address to bind server to + * @sport: the server's TCP port + * @queue: queue to direct messages from this server to + * + * Create an IP server for the given port and address. + * Returns <0 on error and one of the %NET_XMIT_* values on success. + */ +int cxgb4_create_server(const struct net_device *dev, unsigned int stid, + __be32 sip, __be16 sport, __be16 vlan, + unsigned int queue) +{ + unsigned int chan; + struct sk_buff *skb; + struct adapter *adap; + struct cpl_pass_open_req *req; + int ret; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + adap = netdev2adap(dev); + req = __skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid)); + req->local_port = sport; + req->peer_port = htons(0); + req->local_ip = sip; + req->peer_ip = htonl(0); + chan = rxq_to_chan(&adap->sge, queue); + req->opt0 = cpu_to_be64(TX_CHAN_V(chan)); + req->opt1 = cpu_to_be64(CONN_POLICY_V(CPL_CONN_POLICY_ASK) | + SYN_RSS_ENABLE_F | SYN_RSS_QUEUE_V(queue)); + ret = t4_mgmt_tx(adap, skb); + return net_xmit_eval(ret); +} +EXPORT_SYMBOL(cxgb4_create_server); + +/* cxgb4_create_server6 - create an IPv6 server + * @dev: the device + * @stid: the server TID + * @sip: local IPv6 address to bind server to + * @sport: the server's TCP port + * @queue: queue to direct messages from this server to + * + * Create an IPv6 server for the given port and address. + * Returns <0 on error and one of the %NET_XMIT_* values on success. 
+ */ +int cxgb4_create_server6(const struct net_device *dev, unsigned int stid, + const struct in6_addr *sip, __be16 sport, + unsigned int queue) +{ + unsigned int chan; + struct sk_buff *skb; + struct adapter *adap; + struct cpl_pass_open_req6 *req; + int ret; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + adap = netdev2adap(dev); + req = __skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, stid)); + req->local_port = sport; + req->peer_port = htons(0); + req->local_ip_hi = *(__be64 *)(sip->s6_addr); + req->local_ip_lo = *(__be64 *)(sip->s6_addr + 8); + req->peer_ip_hi = cpu_to_be64(0); + req->peer_ip_lo = cpu_to_be64(0); + chan = rxq_to_chan(&adap->sge, queue); + req->opt0 = cpu_to_be64(TX_CHAN_V(chan)); + req->opt1 = cpu_to_be64(CONN_POLICY_V(CPL_CONN_POLICY_ASK) | + SYN_RSS_ENABLE_F | SYN_RSS_QUEUE_V(queue)); + ret = t4_mgmt_tx(adap, skb); + return net_xmit_eval(ret); +} +EXPORT_SYMBOL(cxgb4_create_server6); + +int cxgb4_remove_server(const struct net_device *dev, unsigned int stid, + unsigned int queue, bool ipv6) +{ + struct sk_buff *skb; + struct adapter *adap; + struct cpl_close_listsvr_req *req; + int ret; + + adap = netdev2adap(dev); + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + req = __skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid)); + req->reply_ctrl = htons(NO_REPLY_V(0) | (ipv6 ? LISTSVR_IPV6_V(1) : + LISTSVR_IPV6_V(0)) | QUEUENO_V(queue)); + ret = t4_mgmt_tx(adap, skb); + return net_xmit_eval(ret); +} +EXPORT_SYMBOL(cxgb4_remove_server); + +/** + * cxgb4_best_mtu - find the entry in the MTU table closest to an MTU + * @mtus: the HW MTU table + * @mtu: the target MTU + * @idx: index of selected entry in the MTU table + * + * Returns the index and the value in the HW MTU table that is closest to + * but does not exceed @mtu, unless @mtu is smaller than any value in the + * table, in which case that smallest available value is selected. + */ +unsigned int cxgb4_best_mtu(const unsigned short *mtus, unsigned short mtu, + unsigned int *idx) +{ + unsigned int i = 0; + + while (i < NMTUS - 1 && mtus[i + 1] <= mtu) + ++i; + if (idx) + *idx = i; + return mtus[i]; +} +EXPORT_SYMBOL(cxgb4_best_mtu); + +/** + * cxgb4_best_aligned_mtu - find best MTU, [hopefully] data size aligned + * @mtus: the HW MTU table + * @header_size: Header Size + * @data_size_max: maximum Data Segment Size + * @data_size_align: desired Data Segment Size Alignment (2^N) + * @mtu_idxp: HW MTU Table Index return value pointer (possibly NULL) + * + * Similar to cxgb4_best_mtu() but instead of searching the Hardware + * MTU Table based solely on a Maximum MTU parameter, we break that + * parameter up into a Header Size and Maximum Data Segment Size, and + * provide a desired Data Segment Size Alignment. If we find an MTU in + * the Hardware MTU Table which will result in a Data Segment Size with + * the requested alignment _and_ that MTU isn't "too far" from the + * closest MTU, then we'll return that rather than the closest MTU. 
+ */ +unsigned int cxgb4_best_aligned_mtu(const unsigned short *mtus, + unsigned short header_size, + unsigned short data_size_max, + unsigned short data_size_align, + unsigned int *mtu_idxp) +{ + unsigned short max_mtu = header_size + data_size_max; + unsigned short data_size_align_mask = data_size_align - 1; + int mtu_idx, aligned_mtu_idx; + + /* Scan the MTU Table till we find an MTU which is larger than our + * Maximum MTU or we reach the end of the table. Along the way, + * record the last MTU found, if any, which will result in a Data + * Segment Length matching the requested alignment. + */ + for (mtu_idx = 0, aligned_mtu_idx = -1; mtu_idx < NMTUS; mtu_idx++) { + unsigned short data_size = mtus[mtu_idx] - header_size; + + /* If this MTU minus the Header Size would result in a + * Data Segment Size of the desired alignment, remember it. + */ + if ((data_size & data_size_align_mask) == 0) + aligned_mtu_idx = mtu_idx; + + /* If we're not at the end of the Hardware MTU Table and the + * next element is larger than our Maximum MTU, drop out of + * the loop. + */ + if (mtu_idx+1 < NMTUS && mtus[mtu_idx+1] > max_mtu) + break; + } + + /* If we fell out of the loop because we ran to the end of the table, + * then we just have to use the last [largest] entry. + */ + if (mtu_idx == NMTUS) + mtu_idx--; + + /* If we found an MTU which resulted in the requested Data Segment + * Length alignment and that's "not far" from the largest MTU which is + * less than or equal to the maximum MTU, then use that. + */ + if (aligned_mtu_idx >= 0 && + mtu_idx - aligned_mtu_idx <= 1) + mtu_idx = aligned_mtu_idx; + + /* If the caller has passed in an MTU Index pointer, pass the + * MTU Index back. Return the MTU value. + */ + if (mtu_idxp) + *mtu_idxp = mtu_idx; + return mtus[mtu_idx]; +} +EXPORT_SYMBOL(cxgb4_best_aligned_mtu); + +/** + * cxgb4_tp_smt_idx - Get the Source Mac Table index for this VI + * @chip: chip type + * @viid: VI id of the given port + * + * Return the SMT index for this VI. + */ +unsigned int cxgb4_tp_smt_idx(enum chip_type chip, unsigned int viid) +{ + /* In T4/T5, SMT contains 256 SMAC entries organized in + * 128 rows of 2 entries each. + * In T6, SMT contains 256 SMAC entries in 256 rows. + * TODO: The below code needs to be updated when we add support + * for 256 VFs. + */ + if (CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5) + return ((viid & 0x7f) << 1); + else + return (viid & 0x7f); +} +EXPORT_SYMBOL(cxgb4_tp_smt_idx); + +/** + * cxgb4_port_chan - get the HW channel of a port + * @dev: the net device for the port + * + * Return the HW Tx channel of the given port. + */ +unsigned int cxgb4_port_chan(const struct net_device *dev) +{ + return netdev2pinfo(dev)->tx_chan; +} +EXPORT_SYMBOL(cxgb4_port_chan); + +unsigned int cxgb4_dbfifo_count(const struct net_device *dev, int lpfifo) +{ + struct adapter *adap = netdev2adap(dev); + u32 v1, v2, lp_count, hp_count; + + v1 = t4_read_reg(adap, SGE_DBFIFO_STATUS_A); + v2 = t4_read_reg(adap, SGE_DBFIFO_STATUS2_A); + if (is_t4(adap->params.chip)) { + lp_count = LP_COUNT_G(v1); + hp_count = HP_COUNT_G(v1); + } else { + lp_count = LP_COUNT_T5_G(v1); + hp_count = HP_COUNT_T5_G(v2); + } + return lpfifo ? lp_count : hp_count; +} +EXPORT_SYMBOL(cxgb4_dbfifo_count); + +/** + * cxgb4_port_viid - get the VI id of a port + * @dev: the net device for the port + * + * Return the VI id of the given port. 
+ */ +unsigned int cxgb4_port_viid(const struct net_device *dev) +{ + return netdev2pinfo(dev)->viid; +} +EXPORT_SYMBOL(cxgb4_port_viid); + +/** + * cxgb4_port_idx - get the index of a port + * @dev: the net device for the port + * + * Return the index of the given port. + */ +unsigned int cxgb4_port_idx(const struct net_device *dev) +{ + return netdev2pinfo(dev)->port_id; +} +EXPORT_SYMBOL(cxgb4_port_idx); + +void cxgb4_get_tcp_stats(struct pci_dev *pdev, struct tp_tcp_stats *v4, + struct tp_tcp_stats *v6) +{ + struct adapter *adap = pci_get_drvdata(pdev); + + spin_lock(&adap->stats_lock); + t4_tp_get_tcp_stats(adap, v4, v6, false); + spin_unlock(&adap->stats_lock); +} +EXPORT_SYMBOL(cxgb4_get_tcp_stats); + +void cxgb4_iscsi_init(struct net_device *dev, unsigned int tag_mask, + const unsigned int *pgsz_order) +{ + struct adapter *adap = netdev2adap(dev); + + t4_write_reg(adap, ULP_RX_ISCSI_TAGMASK_A, tag_mask); + t4_write_reg(adap, ULP_RX_ISCSI_PSZ_A, HPZ0_V(pgsz_order[0]) | + HPZ1_V(pgsz_order[1]) | HPZ2_V(pgsz_order[2]) | + HPZ3_V(pgsz_order[3])); +} +EXPORT_SYMBOL(cxgb4_iscsi_init); + +int cxgb4_flush_eq_cache(struct net_device *dev) +{ + struct adapter *adap = netdev2adap(dev); + + return t4_sge_ctxt_flush(adap, adap->mbox, CTXT_EGRESS); +} +EXPORT_SYMBOL(cxgb4_flush_eq_cache); + +static int read_eq_indices(struct adapter *adap, u16 qid, u16 *pidx, u16 *cidx) +{ + u32 addr = t4_read_reg(adap, SGE_DBQ_CTXT_BADDR_A) + 24 * qid + 8; + __be64 indices; + int ret; + + spin_lock(&adap->win0_lock); + ret = t4_memory_rw(adap, 0, MEM_EDC0, addr, + sizeof(indices), (__be32 *)&indices, + T4_MEMORY_READ); + spin_unlock(&adap->win0_lock); + if (!ret) { + *cidx = (be64_to_cpu(indices) >> 25) & 0xffff; + *pidx = (be64_to_cpu(indices) >> 9) & 0xffff; + } + return ret; +} + +int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx, + u16 size) +{ + struct adapter *adap = netdev2adap(dev); + u16 hw_pidx, hw_cidx; + int ret; + + ret = read_eq_indices(adap, qid, &hw_pidx, &hw_cidx); + if (ret) + goto out; + + if (pidx != hw_pidx) { + u16 delta; + u32 val; + + if (pidx >= hw_pidx) + delta = pidx - hw_pidx; + else + delta = size - hw_pidx + pidx; + + if (is_t4(adap->params.chip)) + val = PIDX_V(delta); + else + val = PIDX_T5_V(delta); + wmb(); + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A), + QID_V(qid) | val); + } +out: + return ret; +} +EXPORT_SYMBOL(cxgb4_sync_txq_pidx); + +int cxgb4_read_tpte(struct net_device *dev, u32 stag, __be32 *tpte) +{ + u32 edc0_size, edc1_size, mc0_size, mc1_size, size; + u32 edc0_end, edc1_end, mc0_end, mc1_end; + u32 offset, memtype, memaddr; + struct adapter *adap; + u32 hma_size = 0; + int ret; + + adap = netdev2adap(dev); + + offset = ((stag >> 8) * 32) + adap->vres.stag.start; + + /* Figure out where the offset lands in the Memory Type/Address scheme. + * This code assumes that the memory is laid out starting at offset 0 + * with no breaks as: EDC0, EDC1, MC0, MC1. All cards have both EDC0 + * and EDC1. Some cards will have neither MC0 nor MC1, most cards have + * MC0, and some have both MC0 and MC1. 
+ */ + size = t4_read_reg(adap, MA_EDRAM0_BAR_A); + edc0_size = EDRAM0_SIZE_G(size) << 20; + size = t4_read_reg(adap, MA_EDRAM1_BAR_A); + edc1_size = EDRAM1_SIZE_G(size) << 20; + size = t4_read_reg(adap, MA_EXT_MEMORY0_BAR_A); + mc0_size = EXT_MEM0_SIZE_G(size) << 20; + + if (t4_read_reg(adap, MA_TARGET_MEM_ENABLE_A) & HMA_MUX_F) { + size = t4_read_reg(adap, MA_EXT_MEMORY1_BAR_A); + hma_size = EXT_MEM1_SIZE_G(size) << 20; + } + edc0_end = edc0_size; + edc1_end = edc0_end + edc1_size; + mc0_end = edc1_end + mc0_size; + + if (offset < edc0_end) { + memtype = MEM_EDC0; + memaddr = offset; + } else if (offset < edc1_end) { + memtype = MEM_EDC1; + memaddr = offset - edc0_end; + } else { + if (hma_size && (offset < (edc1_end + hma_size))) { + memtype = MEM_HMA; + memaddr = offset - edc1_end; + } else if (offset < mc0_end) { + memtype = MEM_MC0; + memaddr = offset - edc1_end; + } else if (is_t5(adap->params.chip)) { + size = t4_read_reg(adap, MA_EXT_MEMORY1_BAR_A); + mc1_size = EXT_MEM1_SIZE_G(size) << 20; + mc1_end = mc0_end + mc1_size; + if (offset < mc1_end) { + memtype = MEM_MC1; + memaddr = offset - mc0_end; + } else { + /* offset beyond the end of any memory */ + goto err; + } + } else { + /* T4/T6 only has a single memory channel */ + goto err; + } + } + + spin_lock(&adap->win0_lock); + ret = t4_memory_rw(adap, 0, memtype, memaddr, 32, tpte, T4_MEMORY_READ); + spin_unlock(&adap->win0_lock); + return ret; + +err: + dev_err(adap->pdev_dev, "stag %#x, offset %#x out of range\n", + stag, offset); + return -EINVAL; +} +EXPORT_SYMBOL(cxgb4_read_tpte); + +u64 cxgb4_read_sge_timestamp(struct net_device *dev) +{ + u32 hi, lo; + struct adapter *adap; + + adap = netdev2adap(dev); + lo = t4_read_reg(adap, SGE_TIMESTAMP_LO_A); + hi = TSVAL_G(t4_read_reg(adap, SGE_TIMESTAMP_HI_A)); + + return ((u64)hi << 32) | (u64)lo; +} +EXPORT_SYMBOL(cxgb4_read_sge_timestamp); + +int cxgb4_bar2_sge_qregs(struct net_device *dev, + unsigned int qid, + enum cxgb4_bar2_qtype qtype, + int user, + u64 *pbar2_qoffset, + unsigned int *pbar2_qid) +{ + return t4_bar2_sge_qregs(netdev2adap(dev), + qid, + (qtype == CXGB4_BAR2_QTYPE_EGRESS + ? 
T4_BAR2_QTYPE_EGRESS + : T4_BAR2_QTYPE_INGRESS), + user, + pbar2_qoffset, + pbar2_qid); +} +EXPORT_SYMBOL(cxgb4_bar2_sge_qregs); + +static struct pci_driver cxgb4_driver; + +static void check_neigh_update(struct neighbour *neigh) +{ + const struct device *parent; + const struct net_device *netdev = neigh->dev; + + if (is_vlan_dev(netdev)) + netdev = vlan_dev_real_dev(netdev); + parent = netdev->dev.parent; + if (parent && parent->driver == &cxgb4_driver.driver) + t4_l2t_update(dev_get_drvdata(parent), neigh); +} + +static int netevent_cb(struct notifier_block *nb, unsigned long event, + void *data) +{ + switch (event) { + case NETEVENT_NEIGH_UPDATE: + check_neigh_update(data); + break; + case NETEVENT_REDIRECT: + default: + break; + } + return 0; +} + +static bool netevent_registered; +static struct notifier_block cxgb4_netevent_nb = { + .notifier_call = netevent_cb +}; + +static void drain_db_fifo(struct adapter *adap, int usecs) +{ + u32 v1, v2, lp_count, hp_count; + + do { + v1 = t4_read_reg(adap, SGE_DBFIFO_STATUS_A); + v2 = t4_read_reg(adap, SGE_DBFIFO_STATUS2_A); + if (is_t4(adap->params.chip)) { + lp_count = LP_COUNT_G(v1); + hp_count = HP_COUNT_G(v1); + } else { + lp_count = LP_COUNT_T5_G(v1); + hp_count = HP_COUNT_T5_G(v2); + } + + if (lp_count == 0 && hp_count == 0) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(usecs)); + } while (1); +} + +static void disable_txq_db(struct sge_txq *q) +{ + unsigned long flags; + + spin_lock_irqsave(&q->db_lock, flags); + q->db_disabled = 1; + spin_unlock_irqrestore(&q->db_lock, flags); +} + +static void enable_txq_db(struct adapter *adap, struct sge_txq *q) +{ + spin_lock_irq(&q->db_lock); + if (q->db_pidx_inc) { + /* Make sure that all writes to the TX descriptors + * are committed before we tell HW about them. 
+ */ + wmb(); + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A), + QID_V(q->cntxt_id) | PIDX_V(q->db_pidx_inc)); + q->db_pidx_inc = 0; + } + q->db_disabled = 0; + spin_unlock_irq(&q->db_lock); +} + +static void disable_dbs(struct adapter *adap) +{ + int i; + + for_each_ethrxq(&adap->sge, i) + disable_txq_db(&adap->sge.ethtxq[i].q); + if (is_offload(adap)) { + struct sge_uld_txq_info *txq_info = + adap->sge.uld_txq_info[CXGB4_TX_OFLD]; + + if (txq_info) { + for_each_ofldtxq(&adap->sge, i) { + struct sge_uld_txq *txq = &txq_info->uldtxq[i]; + + disable_txq_db(&txq->q); + } + } + } + for_each_port(adap, i) + disable_txq_db(&adap->sge.ctrlq[i].q); +} + +static void enable_dbs(struct adapter *adap) +{ + int i; + + for_each_ethrxq(&adap->sge, i) + enable_txq_db(adap, &adap->sge.ethtxq[i].q); + if (is_offload(adap)) { + struct sge_uld_txq_info *txq_info = + adap->sge.uld_txq_info[CXGB4_TX_OFLD]; + + if (txq_info) { + for_each_ofldtxq(&adap->sge, i) { + struct sge_uld_txq *txq = &txq_info->uldtxq[i]; + + enable_txq_db(adap, &txq->q); + } + } + } + for_each_port(adap, i) + enable_txq_db(adap, &adap->sge.ctrlq[i].q); +} + +static void notify_rdma_uld(struct adapter *adap, enum cxgb4_control cmd) +{ + enum cxgb4_uld type = CXGB4_ULD_RDMA; + + if (adap->uld && adap->uld[type].handle) + adap->uld[type].control(adap->uld[type].handle, cmd); +} + +static void process_db_full(struct work_struct *work) +{ + struct adapter *adap; + + adap = container_of(work, struct adapter, db_full_task); + + drain_db_fifo(adap, dbfifo_drain_delay); + enable_dbs(adap); + notify_rdma_uld(adap, CXGB4_CONTROL_DB_EMPTY); + if (CHELSIO_CHIP_VERSION(adap->params.chip) <= CHELSIO_T5) + t4_set_reg_field(adap, SGE_INT_ENABLE3_A, + DBFIFO_HP_INT_F | DBFIFO_LP_INT_F, + DBFIFO_HP_INT_F | DBFIFO_LP_INT_F); + else + t4_set_reg_field(adap, SGE_INT_ENABLE3_A, + DBFIFO_LP_INT_F, DBFIFO_LP_INT_F); +} + +static void sync_txq_pidx(struct adapter *adap, struct sge_txq *q) +{ + u16 hw_pidx, hw_cidx; + int ret; + + spin_lock_irq(&q->db_lock); + ret = read_eq_indices(adap, (u16)q->cntxt_id, &hw_pidx, &hw_cidx); + if (ret) + goto out; + if (q->db_pidx != hw_pidx) { + u16 delta; + u32 val; + + if (q->db_pidx >= hw_pidx) + delta = q->db_pidx - hw_pidx; + else + delta = q->size - hw_pidx + q->db_pidx; + + if (is_t4(adap->params.chip)) + val = PIDX_V(delta); + else + val = PIDX_T5_V(delta); + wmb(); + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A), + QID_V(q->cntxt_id) | val); + } +out: + q->db_disabled = 0; + q->db_pidx_inc = 0; + spin_unlock_irq(&q->db_lock); + if (ret) + CH_WARN(adap, "DB drop recovery failed.\n"); +} + +static void recover_all_queues(struct adapter *adap) +{ + int i; + + for_each_ethrxq(&adap->sge, i) + sync_txq_pidx(adap, &adap->sge.ethtxq[i].q); + if (is_offload(adap)) { + struct sge_uld_txq_info *txq_info = + adap->sge.uld_txq_info[CXGB4_TX_OFLD]; + if (txq_info) { + for_each_ofldtxq(&adap->sge, i) { + struct sge_uld_txq *txq = &txq_info->uldtxq[i]; + + sync_txq_pidx(adap, &txq->q); + } + } + } + for_each_port(adap, i) + sync_txq_pidx(adap, &adap->sge.ctrlq[i].q); +} + +static void process_db_drop(struct work_struct *work) +{ + struct adapter *adap; + + adap = container_of(work, struct adapter, db_drop_task); + + if (is_t4(adap->params.chip)) { + drain_db_fifo(adap, dbfifo_drain_delay); + notify_rdma_uld(adap, CXGB4_CONTROL_DB_DROP); + drain_db_fifo(adap, dbfifo_drain_delay); + recover_all_queues(adap); + drain_db_fifo(adap, dbfifo_drain_delay); + enable_dbs(adap); + notify_rdma_uld(adap, CXGB4_CONTROL_DB_EMPTY); + } else if 
(is_t5(adap->params.chip)) { + u32 dropped_db = t4_read_reg(adap, 0x010ac); + u16 qid = (dropped_db >> 15) & 0x1ffff; + u16 pidx_inc = dropped_db & 0x1fff; + u64 bar2_qoffset; + unsigned int bar2_qid; + int ret; + + ret = t4_bar2_sge_qregs(adap, qid, T4_BAR2_QTYPE_EGRESS, + 0, &bar2_qoffset, &bar2_qid); + if (ret) + dev_err(adap->pdev_dev, "doorbell drop recovery: " + "qid=%d, pidx_inc=%d\n", qid, pidx_inc); + else + writel(PIDX_T5_V(pidx_inc) | QID_V(bar2_qid), + adap->bar2 + bar2_qoffset + SGE_UDB_KDOORBELL); + + /* Re-enable BAR2 WC */ + t4_set_reg_field(adap, 0x10b0, 1<<15, 1<<15); + } + + if (CHELSIO_CHIP_VERSION(adap->params.chip) <= CHELSIO_T5) + t4_set_reg_field(adap, SGE_DOORBELL_CONTROL_A, DROPPED_DB_F, 0); +} + +void t4_db_full(struct adapter *adap) +{ + if (is_t4(adap->params.chip)) { + disable_dbs(adap); + notify_rdma_uld(adap, CXGB4_CONTROL_DB_FULL); + t4_set_reg_field(adap, SGE_INT_ENABLE3_A, + DBFIFO_HP_INT_F | DBFIFO_LP_INT_F, 0); + queue_work(adap->workq, &adap->db_full_task); + } +} + +void t4_db_dropped(struct adapter *adap) +{ + if (is_t4(adap->params.chip)) { + disable_dbs(adap); + notify_rdma_uld(adap, CXGB4_CONTROL_DB_FULL); + } + queue_work(adap->workq, &adap->db_drop_task); +} + +void t4_register_netevent_notifier(void) +{ + if (!netevent_registered) { + register_netevent_notifier(&cxgb4_netevent_nb); + netevent_registered = true; + } +} + +static void detach_ulds(struct adapter *adap) +{ + unsigned int i; + + mutex_lock(&uld_mutex); + list_del(&adap->list_node); + + for (i = 0; i < CXGB4_ULD_MAX; i++) + if (adap->uld && adap->uld[i].handle) + adap->uld[i].state_change(adap->uld[i].handle, + CXGB4_STATE_DETACH); + + if (netevent_registered && list_empty(&adapter_list)) { + unregister_netevent_notifier(&cxgb4_netevent_nb); + netevent_registered = false; + } + mutex_unlock(&uld_mutex); +} + +static void notify_ulds(struct adapter *adap, enum cxgb4_state new_state) +{ + unsigned int i; + + mutex_lock(&uld_mutex); + for (i = 0; i < CXGB4_ULD_MAX; i++) + if (adap->uld && adap->uld[i].handle) + adap->uld[i].state_change(adap->uld[i].handle, + new_state); + mutex_unlock(&uld_mutex); +} + +#if IS_ENABLED(CONFIG_IPV6) +static int cxgb4_inet6addr_handler(struct notifier_block *this, + unsigned long event, void *data) +{ + struct inet6_ifaddr *ifa = data; + struct net_device *event_dev = ifa->idev->dev; + const struct device *parent = NULL; +#if IS_ENABLED(CONFIG_BONDING) + struct adapter *adap; +#endif + if (is_vlan_dev(event_dev)) + event_dev = vlan_dev_real_dev(event_dev); +#if IS_ENABLED(CONFIG_BONDING) + if (event_dev->flags & IFF_MASTER) { + list_for_each_entry(adap, &adapter_list, list_node) { + switch (event) { + case NETDEV_UP: + cxgb4_clip_get(adap->port[0], + (const u32 *)ifa, 1); + break; + case NETDEV_DOWN: + cxgb4_clip_release(adap->port[0], + (const u32 *)ifa, 1); + break; + default: + break; + } + } + return NOTIFY_OK; + } +#endif + + if (event_dev) + parent = event_dev->dev.parent; + + if (parent && parent->driver == &cxgb4_driver.driver) { + switch (event) { + case NETDEV_UP: + cxgb4_clip_get(event_dev, (const u32 *)ifa, 1); + break; + case NETDEV_DOWN: + cxgb4_clip_release(event_dev, (const u32 *)ifa, 1); + break; + default: + break; + } + } + return NOTIFY_OK; +} + +static bool inet6addr_registered; +static struct notifier_block cxgb4_inet6addr_notifier = { + .notifier_call = cxgb4_inet6addr_handler +}; + +static void update_clip(const struct adapter *adap) +{ + int i; + struct net_device *dev; + int ret; + + rcu_read_lock(); + + for (i = 0; i < 
MAX_NPORTS; i++) { + dev = adap->port[i]; + ret = 0; + + if (dev) + ret = cxgb4_update_root_dev_clip(dev); + + if (ret < 0) + break; + } + rcu_read_unlock(); +} +#endif /* IS_ENABLED(CONFIG_IPV6) */ + +/** + * cxgb_up - enable the adapter + * @adap: adapter being enabled + * + * Called when the first port is enabled, this function performs the + * actions necessary to make an adapter operational, such as completing + * the initialization of HW modules, and enabling interrupts. + * + * Must be called with the rtnl lock held. + */ +static int cxgb_up(struct adapter *adap) +{ + int err; + + mutex_lock(&uld_mutex); + err = setup_sge_queues(adap); + if (err) + goto rel_lock; + err = setup_rss(adap); + if (err) + goto freeq; + + if (adap->flags & USING_MSIX) { + name_msix_vecs(adap); + err = request_irq(adap->msix_info[0].vec, t4_nondata_intr, 0, + adap->msix_info[0].desc, adap); + if (err) + goto irq_err; + err = request_msix_queue_irqs(adap); + if (err) { + free_irq(adap->msix_info[0].vec, adap); + goto irq_err; + } + } else { + err = request_irq(adap->pdev->irq, t4_intr_handler(adap), + (adap->flags & USING_MSI) ? 0 : IRQF_SHARED, + adap->port[0]->name, adap); + if (err) + goto irq_err; + } + + enable_rx(adap); + t4_sge_start(adap); + t4_intr_enable(adap); + adap->flags |= FULL_INIT_DONE; + mutex_unlock(&uld_mutex); + + notify_ulds(adap, CXGB4_STATE_UP); +#if IS_ENABLED(CONFIG_IPV6) + update_clip(adap); +#endif + /* Initialize hash mac addr list*/ + INIT_LIST_HEAD(&adap->mac_hlist); + return err; + + irq_err: + dev_err(adap->pdev_dev, "request_irq failed, err %d\n", err); + freeq: + t4_free_sge_resources(adap); + rel_lock: + mutex_unlock(&uld_mutex); + return err; +} + +static void cxgb_down(struct adapter *adapter) +{ + cancel_work_sync(&adapter->tid_release_task); + cancel_work_sync(&adapter->db_full_task); + cancel_work_sync(&adapter->db_drop_task); + adapter->tid_release_task_busy = false; + adapter->tid_release_head = NULL; + + t4_sge_stop(adapter); + t4_free_sge_resources(adapter); + adapter->flags &= ~FULL_INIT_DONE; +} + +/* + * net_device operations + */ +static int cxgb_open(struct net_device *dev) +{ + int err; + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + netif_carrier_off(dev); + + if (!(adapter->flags & FULL_INIT_DONE)) { + err = cxgb_up(adapter); + if (err < 0) + return err; + } + + /* It's possible that the basic port information could have + * changed since we first read it. + */ + err = t4_update_port_info(pi); + if (err < 0) + return err; + + err = link_start(dev); + if (!err) + netif_tx_start_all_queues(dev); + return err; +} + +static int cxgb_close(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + int ret; + + netif_tx_stop_all_queues(dev); + netif_carrier_off(dev); + ret = t4_enable_vi(adapter, adapter->pf, pi->viid, false, false); +#ifdef CONFIG_CHELSIO_T4_DCB + cxgb4_dcb_reset(dev); + dcb_tx_queue_prio_enable(dev, false); +#endif + return ret; +} + +int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid, + __be32 sip, __be16 sport, __be16 vlan, + unsigned int queue, unsigned char port, unsigned char mask) +{ + int ret; + struct filter_entry *f; + struct adapter *adap; + int i; + u8 *val; + + adap = netdev2adap(dev); + + /* Adjust stid to correct filter index */ + stid -= adap->tids.sftid_base; + stid += adap->tids.nftids; + + /* Check to make sure the filter requested is writable ... 
+ */ + f = &adap->tids.ftid_tab[stid]; + ret = writable_filter(f); + if (ret) + return ret; + + /* Clear out any old resources being used by the filter before + * we start constructing the new filter. + */ + if (f->valid) + clear_filter(adap, f); + + /* Clear out filter specifications */ + memset(&f->fs, 0, sizeof(struct ch_filter_specification)); + f->fs.val.lport = cpu_to_be16(sport); + f->fs.mask.lport = ~0; + val = (u8 *)&sip; + if ((val[0] | val[1] | val[2] | val[3]) != 0) { + for (i = 0; i < 4; i++) { + f->fs.val.lip[i] = val[i]; + f->fs.mask.lip[i] = ~0; + } + if (adap->params.tp.vlan_pri_map & PORT_F) { + f->fs.val.iport = port; + f->fs.mask.iport = mask; + } + } + + if (adap->params.tp.vlan_pri_map & PROTOCOL_F) { + f->fs.val.proto = IPPROTO_TCP; + f->fs.mask.proto = ~0; + } + + f->fs.dirsteer = 1; + f->fs.iq = queue; + /* Mark filter as locked */ + f->locked = 1; + f->fs.rpttid = 1; + + /* Save the actual tid. We need this to get the corresponding + * filter entry structure in filter_rpl. + */ + f->tid = stid + adap->tids.ftid_base; + ret = set_filter_wr(adap, stid); + if (ret) { + clear_filter(adap, f); + return ret; + } + + return 0; +} +EXPORT_SYMBOL(cxgb4_create_server_filter); + +int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid, + unsigned int queue, bool ipv6) +{ + struct filter_entry *f; + struct adapter *adap; + + adap = netdev2adap(dev); + + /* Adjust stid to correct filter index */ + stid -= adap->tids.sftid_base; + stid += adap->tids.nftids; + + f = &adap->tids.ftid_tab[stid]; + /* Unlock the filter */ + f->locked = 0; + + return delete_filter(adap, stid); +} +EXPORT_SYMBOL(cxgb4_remove_server_filter); + +static void cxgb_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *ns) +{ + struct port_stats stats; + struct port_info *p = netdev_priv(dev); + struct adapter *adapter = p->adapter; + + /* Block retrieving statistics during EEH error + * recovery. 
Otherwise, the recovery might fail + * and the PCI device will be removed permanently + */ + spin_lock(&adapter->stats_lock); + if (!netif_device_present(dev)) { + spin_unlock(&adapter->stats_lock); + return; + } + t4_get_port_stats_offset(adapter, p->tx_chan, &stats, + &p->stats_base); + spin_unlock(&adapter->stats_lock); + + ns->tx_bytes = stats.tx_octets; + ns->tx_packets = stats.tx_frames; + ns->rx_bytes = stats.rx_octets; + ns->rx_packets = stats.rx_frames; + ns->multicast = stats.rx_mcast_frames; + + /* detailed rx_errors */ + ns->rx_length_errors = stats.rx_jabber + stats.rx_too_long + + stats.rx_runt; + ns->rx_over_errors = 0; + ns->rx_crc_errors = stats.rx_fcs_err; + ns->rx_frame_errors = stats.rx_symbol_err; + ns->rx_dropped = stats.rx_ovflow0 + stats.rx_ovflow1 + + stats.rx_ovflow2 + stats.rx_ovflow3 + + stats.rx_trunc0 + stats.rx_trunc1 + + stats.rx_trunc2 + stats.rx_trunc3; + ns->rx_missed_errors = 0; + + /* detailed tx_errors */ + ns->tx_aborted_errors = 0; + ns->tx_carrier_errors = 0; + ns->tx_fifo_errors = 0; + ns->tx_heartbeat_errors = 0; + ns->tx_window_errors = 0; + + ns->tx_errors = stats.tx_error_frames; + ns->rx_errors = stats.rx_symbol_err + stats.rx_fcs_err + + ns->rx_length_errors + stats.rx_len_err + ns->rx_fifo_errors; +} + +static int cxgb_ioctl(struct net_device *dev, struct ifreq *req, int cmd) +{ + unsigned int mbox; + int ret = 0, prtad, devad; + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + struct mii_ioctl_data *data = (struct mii_ioctl_data *)&req->ifr_data; + + switch (cmd) { + case SIOCGMIIPHY: + if (pi->mdio_addr < 0) + return -EOPNOTSUPP; + data->phy_id = pi->mdio_addr; + break; + case SIOCGMIIREG: + case SIOCSMIIREG: + if (mdio_phy_id_is_c45(data->phy_id)) { + prtad = mdio_phy_id_prtad(data->phy_id); + devad = mdio_phy_id_devad(data->phy_id); + } else if (data->phy_id < 32) { + prtad = data->phy_id; + devad = 0; + data->reg_num &= 0x1f; + } else + return -EINVAL; + + mbox = pi->adapter->pf; + if (cmd == SIOCGMIIREG) + ret = t4_mdio_rd(pi->adapter, mbox, prtad, devad, + data->reg_num, &data->val_out); + else + ret = t4_mdio_wr(pi->adapter, mbox, prtad, devad, + data->reg_num, data->val_in); + break; + case SIOCGHWTSTAMP: + return copy_to_user(req->ifr_data, &pi->tstamp_config, + sizeof(pi->tstamp_config)) ? 
+ -EFAULT : 0; + case SIOCSHWTSTAMP: + if (copy_from_user(&pi->tstamp_config, req->ifr_data, + sizeof(pi->tstamp_config))) + return -EFAULT; + + if (!is_t4(adapter->params.chip)) { + switch (pi->tstamp_config.tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + break; + default: + return -ERANGE; + } + + switch (pi->tstamp_config.rx_filter) { + case HWTSTAMP_FILTER_NONE: + pi->rxtstamp = false; + break; + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + cxgb4_ptprx_timestamping(pi, pi->port_id, + PTP_TS_L4); + break; + case HWTSTAMP_FILTER_PTP_V2_EVENT: + cxgb4_ptprx_timestamping(pi, pi->port_id, + PTP_TS_L2_L4); + break; + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + pi->rxtstamp = true; + break; + default: + pi->tstamp_config.rx_filter = + HWTSTAMP_FILTER_NONE; + return -ERANGE; + } + + if ((pi->tstamp_config.tx_type == HWTSTAMP_TX_OFF) && + (pi->tstamp_config.rx_filter == + HWTSTAMP_FILTER_NONE)) { + if (cxgb4_ptp_txtype(adapter, pi->port_id) >= 0) + pi->ptp_enable = false; + } + + if (pi->tstamp_config.rx_filter != + HWTSTAMP_FILTER_NONE) { + if (cxgb4_ptp_redirect_rx_packet(adapter, + pi) >= 0) + pi->ptp_enable = true; + } + } else { + /* For T4 Adapters */ + switch (pi->tstamp_config.rx_filter) { + case HWTSTAMP_FILTER_NONE: + pi->rxtstamp = false; + break; + case HWTSTAMP_FILTER_ALL: + pi->rxtstamp = true; + break; + default: + pi->tstamp_config.rx_filter = + HWTSTAMP_FILTER_NONE; + return -ERANGE; + } + } + return copy_to_user(req->ifr_data, &pi->tstamp_config, + sizeof(pi->tstamp_config)) ? + -EFAULT : 0; + default: + return -EOPNOTSUPP; + } + return ret; +} + +static void cxgb_set_rxmode(struct net_device *dev) +{ + /* unfortunately we can't return errors to the stack */ + set_rxmode(dev, -1, false); +} + +static int cxgb_change_mtu(struct net_device *dev, int new_mtu) +{ + int ret; + struct port_info *pi = netdev_priv(dev); + + ret = t4_set_rxmode(pi->adapter, pi->adapter->pf, pi->viid, new_mtu, -1, + -1, -1, -1, true); + if (!ret) + dev->mtu = new_mtu; + return ret; +} + +#ifdef CONFIG_PCI_IOV +static int cxgb4_mgmt_open(struct net_device *dev) +{ + /* Turn carrier off since we don't have to transmit anything on this + * interface. 
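+ * This management netdev exists only so the PF can expose the + * ndo_set_vf_* callbacks further below; it never carries traffic itself.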
+ */ + netif_carrier_off(dev); + return 0; +} + +/* Fill MAC address that will be assigned by the FW */ +static void cxgb4_mgmt_fill_vf_station_mac_addr(struct adapter *adap) +{ + u8 hw_addr[ETH_ALEN], macaddr[ETH_ALEN]; + unsigned int i, vf, nvfs; + u16 a, b; + int err; + u8 *na; + + adap->params.pci.vpd_cap_addr = pci_find_capability(adap->pdev, + PCI_CAP_ID_VPD); + err = t4_get_raw_vpd_params(adap, &adap->params.vpd); + if (err) + return; + + na = adap->params.vpd.na; + for (i = 0; i < ETH_ALEN; i++) + hw_addr[i] = (hex2val(na[2 * i + 0]) * 16 + + hex2val(na[2 * i + 1])); + + a = (hw_addr[0] << 8) | hw_addr[1]; + b = (hw_addr[1] << 8) | hw_addr[2]; + a ^= b; + a |= 0x0200; /* locally assigned Ethernet MAC address */ + a &= ~0x0100; /* not a multicast Ethernet MAC address */ + macaddr[0] = a >> 8; + macaddr[1] = a & 0xff; + + for (i = 2; i < 5; i++) + macaddr[i] = hw_addr[i + 1]; + + for (vf = 0, nvfs = pci_sriov_get_totalvfs(adap->pdev); + vf < nvfs; vf++) { + macaddr[5] = adap->pf * 16 + vf; + ether_addr_copy(adap->vfinfo[vf].vf_mac_addr, macaddr); + } +} + +static int cxgb4_mgmt_set_vf_mac(struct net_device *dev, int vf, u8 *mac) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + int ret; + + /* verify MAC addr is valid */ + if (!is_valid_ether_addr(mac)) { + dev_err(pi->adapter->pdev_dev, + "Invalid Ethernet address %pM for VF %d\n", + mac, vf); + return -EINVAL; + } + + dev_info(pi->adapter->pdev_dev, + "Setting MAC %pM on VF %d\n", mac, vf); + ret = t4_set_vf_mac_acl(adap, vf + 1, 1, mac); + if (!ret) + ether_addr_copy(adap->vfinfo[vf].vf_mac_addr, mac); + return ret; +} + +static int cxgb4_mgmt_get_vf_config(struct net_device *dev, + int vf, struct ifla_vf_info *ivi) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct vf_info *vfinfo; + + if (vf >= adap->num_vfs) + return -EINVAL; + vfinfo = &adap->vfinfo[vf]; + + ivi->vf = vf; + ivi->max_tx_rate = vfinfo->tx_rate; + ivi->min_tx_rate = 0; + ether_addr_copy(ivi->mac, vfinfo->vf_mac_addr); + ivi->vlan = vfinfo->vlan; + return 0; +} + +static int cxgb4_mgmt_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) +{ + struct port_info *pi = netdev_priv(dev); + unsigned int phy_port_id; + + phy_port_id = pi->adapter->adap_idx * 10 + pi->port_id; + ppid->id_len = sizeof(phy_port_id); + memcpy(ppid->id, &phy_port_id, ppid->id_len); + return 0; +} + +static int cxgb4_mgmt_set_vf_rate(struct net_device *dev, int vf, + int min_tx_rate, int max_tx_rate) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + unsigned int link_ok, speed, mtu; + u32 fw_pfvf, fw_class; + int class_id = vf; + int ret; + u16 pktsize; + + if (vf >= adap->num_vfs) + return -EINVAL; + + if (min_tx_rate) { + dev_err(adap->pdev_dev, + "Min tx rate (%d) (> 0) for VF %d is Invalid.\n", + min_tx_rate, vf); + return -EINVAL; + } + + ret = t4_get_link_params(pi, &link_ok, &speed, &mtu); + if (ret != FW_SUCCESS) { + dev_err(adap->pdev_dev, + "Failed to get link information for VF %d\n", vf); + return -EINVAL; + } + + if (!link_ok) { + dev_err(adap->pdev_dev, "Link down for VF %d\n", vf); + return -EINVAL; + } + + if (max_tx_rate > speed) { + dev_err(adap->pdev_dev, + "Max tx rate %d for VF %d can't be > link-speed %u", + max_tx_rate, vf, speed); + return -EINVAL; + } + + pktsize = mtu; + /* subtract ethhdr size and 4 bytes crc since, f/w appends it */ + pktsize = pktsize - sizeof(struct ethhdr) - 4; + /* subtract ipv4 hdr size, tcp hdr size to get 
typical IPv4 MSS size */ + pktsize = pktsize - sizeof(struct iphdr) - sizeof(struct tcphdr); + /* configure Traffic Class for rate-limiting */ + ret = t4_sched_params(adap, SCHED_CLASS_TYPE_PACKET, + SCHED_CLASS_LEVEL_CL_RL, + SCHED_CLASS_MODE_CLASS, + SCHED_CLASS_RATEUNIT_BITS, + SCHED_CLASS_RATEMODE_ABS, + pi->tx_chan, class_id, 0, + max_tx_rate * 1000, 0, pktsize); + if (ret) { + dev_err(adap->pdev_dev, "Err %d for Traffic Class config\n", + ret); + return -EINVAL; + } + dev_info(adap->pdev_dev, + "Class %d with MSS %u configured with rate %u\n", + class_id, pktsize, max_tx_rate); + + /* bind VF to configured Traffic Class */ + fw_pfvf = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_SCHEDCLASS_ETH)); + fw_class = class_id; + ret = t4_set_params(adap, adap->mbox, adap->pf, vf + 1, 1, &fw_pfvf, + &fw_class); + if (ret) { + dev_err(adap->pdev_dev, + "Err %d in binding VF %d to Traffic Class %d\n", + ret, vf, class_id); + return -EINVAL; + } + dev_info(adap->pdev_dev, "PF %d VF %d is bound to Class %d\n", + adap->pf, vf, class_id); + adap->vfinfo[vf].tx_rate = max_tx_rate; + return 0; +} + +static int cxgb4_mgmt_set_vf_vlan(struct net_device *dev, int vf, + u16 vlan, u8 qos, __be16 vlan_proto) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + int ret; + + if (vf >= adap->num_vfs || vlan > 4095 || qos > 7) + return -EINVAL; + + if (vlan_proto != htons(ETH_P_8021Q) || qos != 0) + return -EPROTONOSUPPORT; + + ret = t4_set_vlan_acl(adap, adap->mbox, vf + 1, vlan); + if (!ret) { + adap->vfinfo[vf].vlan = vlan; + return 0; + } + + dev_err(adap->pdev_dev, "Err %d %s VLAN ACL for PF/VF %d/%d\n", + ret, (vlan ? "setting" : "clearing"), adap->pf, vf); + return ret; +} +#endif /* CONFIG_PCI_IOV */ + +static int cxgb_set_mac_addr(struct net_device *dev, void *p) +{ + int ret; + struct sockaddr *addr = p; + struct port_info *pi = netdev_priv(dev); + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + ret = t4_change_mac(pi->adapter, pi->adapter->pf, pi->viid, + pi->xact_addr_filt, addr->sa_data, true, true); + if (ret < 0) + return ret; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + pi->xact_addr_filt = ret; + return 0; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void cxgb_netpoll(struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + + if (adap->flags & USING_MSIX) { + int i; + struct sge_eth_rxq *rx = &adap->sge.ethrxq[pi->first_qset]; + + for (i = pi->nqsets; i; i--, rx++) + t4_sge_intr_msix(0, &rx->rspq); + } else + t4_intr_handler(adap)(0, adap); +} +#endif + +static int cxgb_set_tx_maxrate(struct net_device *dev, int index, u32 rate) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adap = pi->adapter; + struct sched_class *e; + struct ch_sched_params p; + struct ch_sched_queue qe; + u32 req_rate; + int err = 0; + + if (!can_sched(dev)) + return -ENOTSUPP; + + if (index < 0 || index > pi->nqsets - 1) + return -EINVAL; + + if (!(adap->flags & FULL_INIT_DONE)) { + dev_err(adap->pdev_dev, + "Failed to rate limit on queue %d. 
Link Down?\n", + index); + return -EINVAL; + } + + /* Convert from Mbps to Kbps */ + req_rate = rate << 10; + + /* Max rate is 100 Gbps */ + if (req_rate >= SCHED_MAX_RATE_KBPS) { + dev_err(adap->pdev_dev, + "Invalid rate %u Mbps, Max rate is %u Mbps\n", + rate, SCHED_MAX_RATE_KBPS >> 10); + return -ERANGE; + } + + /* First unbind the queue from any existing class */ + memset(&qe, 0, sizeof(qe)); + qe.queue = index; + qe.class = SCHED_CLS_NONE; + + err = cxgb4_sched_class_unbind(dev, (void *)(&qe), SCHED_QUEUE); + if (err) { + dev_err(adap->pdev_dev, + "Unbinding Queue %d on port %d fail. Err: %d\n", + index, pi->port_id, err); + return err; + } + + /* Queue already unbound */ + if (!req_rate) + return 0; + + /* Fetch any available unused or matching scheduling class */ + memset(&p, 0, sizeof(p)); + p.type = SCHED_CLASS_TYPE_PACKET; + p.u.params.level = SCHED_CLASS_LEVEL_CL_RL; + p.u.params.mode = SCHED_CLASS_MODE_CLASS; + p.u.params.rateunit = SCHED_CLASS_RATEUNIT_BITS; + p.u.params.ratemode = SCHED_CLASS_RATEMODE_ABS; + p.u.params.channel = pi->tx_chan; + p.u.params.class = SCHED_CLS_NONE; + p.u.params.minrate = 0; + p.u.params.maxrate = req_rate; + p.u.params.weight = 0; + p.u.params.pktsize = dev->mtu; + + e = cxgb4_sched_class_alloc(dev, &p); + if (!e) + return -ENOMEM; + + /* Bind the queue to a scheduling class */ + memset(&qe, 0, sizeof(qe)); + qe.queue = index; + qe.class = e->idx; + + err = cxgb4_sched_class_bind(dev, (void *)(&qe), SCHED_QUEUE); + if (err) + dev_err(adap->pdev_dev, + "Queue rate limiting failed. Err: %d\n", err); + return err; +} + +static int cxgb_setup_tc_flower(struct net_device *dev, + struct tc_cls_flower_offload *cls_flower) +{ + switch (cls_flower->command) { + case TC_CLSFLOWER_REPLACE: + return cxgb4_tc_flower_replace(dev, cls_flower); + case TC_CLSFLOWER_DESTROY: + return cxgb4_tc_flower_destroy(dev, cls_flower); + case TC_CLSFLOWER_STATS: + return cxgb4_tc_flower_stats(dev, cls_flower); + default: + return -EOPNOTSUPP; + } +} + +static int cxgb_setup_tc_cls_u32(struct net_device *dev, + struct tc_cls_u32_offload *cls_u32) +{ + switch (cls_u32->command) { + case TC_CLSU32_NEW_KNODE: + case TC_CLSU32_REPLACE_KNODE: + return cxgb4_config_knode(dev, cls_u32); + case TC_CLSU32_DELETE_KNODE: + return cxgb4_delete_knode(dev, cls_u32); + default: + return -EOPNOTSUPP; + } +} + +static int cxgb_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct net_device *dev = cb_priv; + struct port_info *pi = netdev2pinfo(dev); + struct adapter *adap = netdev2adap(dev); + + if (!(adap->flags & FULL_INIT_DONE)) { + dev_err(adap->pdev_dev, + "Failed to setup tc on port %d. 
Link Down?\n", + pi->port_id); + return -EINVAL; + } + + if (!tc_cls_can_offload_and_chain0(dev, type_data)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSU32: + return cxgb_setup_tc_cls_u32(dev, type_data); + case TC_SETUP_CLSFLOWER: + return cxgb_setup_tc_flower(dev, type_data); + default: + return -EOPNOTSUPP; + } +} + +static int cxgb_setup_tc_block(struct net_device *dev, + struct tc_block_offload *f) +{ + struct port_info *pi = netdev2pinfo(dev); + + if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + switch (f->command) { + case TC_BLOCK_BIND: + return tcf_block_cb_register(f->block, cxgb_setup_tc_block_cb, + pi, dev); + case TC_BLOCK_UNBIND: + tcf_block_cb_unregister(f->block, cxgb_setup_tc_block_cb, pi); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int cxgb_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + switch (type) { + case TC_SETUP_BLOCK: + return cxgb_setup_tc_block(dev, type_data); + default: + return -EOPNOTSUPP; + } +} + +static void cxgb_del_udp_tunnel(struct net_device *netdev, + struct udp_tunnel_info *ti) +{ + struct port_info *pi = netdev_priv(netdev); + struct adapter *adapter = pi->adapter; + unsigned int chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip); + u8 match_all_mac[] = { 0, 0, 0, 0, 0, 0 }; + int ret = 0, i; + + if (chip_ver < CHELSIO_T6) + return; + + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + if (!adapter->vxlan_port_cnt || + adapter->vxlan_port != ti->port) + return; /* Invalid VxLAN destination port */ + + adapter->vxlan_port_cnt--; + if (adapter->vxlan_port_cnt) + return; + + adapter->vxlan_port = 0; + t4_write_reg(adapter, MPS_RX_VXLAN_TYPE_A, 0); + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (!adapter->geneve_port_cnt || + adapter->geneve_port != ti->port) + return; /* Invalid GENEVE destination port */ + + adapter->geneve_port_cnt--; + if (adapter->geneve_port_cnt) + return; + + adapter->geneve_port = 0; + t4_write_reg(adapter, MPS_RX_GENEVE_TYPE_A, 0); + default: + return; + } + + /* Matchall mac entries can be deleted only after all tunnel ports + * are brought down or removed. + */ + if (!adapter->rawf_cnt) + return; + for_each_port(adapter, i) { + pi = adap2pinfo(adapter, i); + ret = t4_free_raw_mac_filt(adapter, pi->viid, + match_all_mac, match_all_mac, + adapter->rawf_start + + pi->port_id, + 1, pi->port_id, true); + if (ret < 0) { + netdev_info(netdev, "Failed to free mac filter entry, for port %d\n", + i); + return; + } + atomic_dec(&adapter->mps_encap[adapter->rawf_start + + pi->port_id].refcnt); + } +} + +static void cxgb_add_udp_tunnel(struct net_device *netdev, + struct udp_tunnel_info *ti) +{ + struct port_info *pi = netdev_priv(netdev); + struct adapter *adapter = pi->adapter; + unsigned int chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip); + u8 match_all_mac[] = { 0, 0, 0, 0, 0, 0 }; + int i, ret; + + if (chip_ver < CHELSIO_T6 || !adapter->rawf_cnt) + return; + + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + /* Callback for adding vxlan port can be called with the same + * port for both IPv4 and IPv6. We should not disable the + * offloading when the same port for both protocols is added + * and later one of them is removed. 
+ */ + if (adapter->vxlan_port_cnt && + adapter->vxlan_port == ti->port) { + adapter->vxlan_port_cnt++; + return; + } + + /* We will support only one VxLAN port */ + if (adapter->vxlan_port_cnt) { + netdev_info(netdev, "UDP port %d already offloaded, not adding port %d\n", + be16_to_cpu(adapter->vxlan_port), + be16_to_cpu(ti->port)); + return; + } + + adapter->vxlan_port = ti->port; + adapter->vxlan_port_cnt = 1; + + t4_write_reg(adapter, MPS_RX_VXLAN_TYPE_A, + VXLAN_V(be16_to_cpu(ti->port)) | VXLAN_EN_F); + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (adapter->geneve_port_cnt && + adapter->geneve_port == ti->port) { + adapter->geneve_port_cnt++; + return; + } + + /* We will support only one GENEVE port */ + if (adapter->geneve_port_cnt) { + netdev_info(netdev, "UDP port %d already offloaded, not adding port %d\n", + be16_to_cpu(adapter->geneve_port), + be16_to_cpu(ti->port)); + return; + } + + adapter->geneve_port = ti->port; + adapter->geneve_port_cnt = 1; + + t4_write_reg(adapter, MPS_RX_GENEVE_TYPE_A, + GENEVE_V(be16_to_cpu(ti->port)) | GENEVE_EN_F); + default: + return; + } + + /* Create a 'match all' mac filter entry for inner mac, + * if raw mac interface is supported. Once the linux kernel provides + * driver entry points for adding/deleting the inner mac addresses, + * we will remove this 'match all' entry and fallback to adding + * exact match filters. + */ + for_each_port(adapter, i) { + pi = adap2pinfo(adapter, i); + + ret = t4_alloc_raw_mac_filt(adapter, pi->viid, + match_all_mac, + match_all_mac, + adapter->rawf_start + + pi->port_id, + 1, pi->port_id, true); + if (ret < 0) { + netdev_info(netdev, "Failed to allocate a mac filter entry, not adding port %d\n", + be16_to_cpu(ti->port)); + cxgb_del_udp_tunnel(netdev, ti); + return; + } + atomic_inc(&adapter->mps_encap[ret].refcnt); + } +} + +static netdev_features_t cxgb_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + struct port_info *pi = netdev_priv(dev); + struct adapter *adapter = pi->adapter; + + if (CHELSIO_CHIP_VERSION(adapter->params.chip) < CHELSIO_T6) + return features; + + /* Check if hw supports offload for this packet */ + if (!skb->encapsulation || cxgb_encap_offload_supported(skb)) + return features; + + /* Offload is not supported for this encapsulated packet */ + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); +} + +static netdev_features_t cxgb_fix_features(struct net_device *dev, + netdev_features_t features) +{ + /* Disable GRO, if RX_CSUM is disabled */ + if (!(features & NETIF_F_RXCSUM)) + features &= ~NETIF_F_GRO; + + return features; +} + +static const struct net_device_ops cxgb4_netdev_ops = { + .ndo_open = cxgb_open, + .ndo_stop = cxgb_close, + .ndo_start_xmit = t4_eth_xmit, + .ndo_select_queue = cxgb_select_queue, + .ndo_get_stats64 = cxgb_get_stats, + .ndo_set_rx_mode = cxgb_set_rxmode, + .ndo_set_mac_address = cxgb_set_mac_addr, + .ndo_set_features = cxgb_set_features, + .ndo_validate_addr = eth_validate_addr, + .ndo_do_ioctl = cxgb_ioctl, + .ndo_change_mtu = cxgb_change_mtu, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = cxgb_netpoll, +#endif +#ifdef CONFIG_CHELSIO_T4_FCOE + .ndo_fcoe_enable = cxgb_fcoe_enable, + .ndo_fcoe_disable = cxgb_fcoe_disable, +#endif /* CONFIG_CHELSIO_T4_FCOE */ + .ndo_set_tx_maxrate = cxgb_set_tx_maxrate, + .ndo_setup_tc = cxgb_setup_tc, + .ndo_udp_tunnel_add = cxgb_add_udp_tunnel, + .ndo_udp_tunnel_del = cxgb_del_udp_tunnel, + .ndo_features_check = cxgb_features_check, + .ndo_fix_features = 
cxgb_fix_features, +}; + +#ifdef CONFIG_PCI_IOV +static const struct net_device_ops cxgb4_mgmt_netdev_ops = { + .ndo_open = cxgb4_mgmt_open, + .ndo_set_vf_mac = cxgb4_mgmt_set_vf_mac, + .ndo_get_vf_config = cxgb4_mgmt_get_vf_config, + .ndo_set_vf_rate = cxgb4_mgmt_set_vf_rate, + .ndo_get_phys_port_id = cxgb4_mgmt_get_phys_port_id, + .ndo_set_vf_vlan = cxgb4_mgmt_set_vf_vlan, +}; +#endif + +static void cxgb4_mgmt_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + struct adapter *adapter = netdev2adap(dev); + + strlcpy(info->driver, cxgb4_driver_name, sizeof(info->driver)); + strlcpy(info->version, cxgb4_driver_version, + sizeof(info->version)); + strlcpy(info->bus_info, pci_name(adapter->pdev), + sizeof(info->bus_info)); +} + +static const struct ethtool_ops cxgb4_mgmt_ethtool_ops = { + .get_drvinfo = cxgb4_mgmt_get_drvinfo, +}; + +static void notify_fatal_err(struct work_struct *work) +{ + struct adapter *adap; + + adap = container_of(work, struct adapter, fatal_err_notify_task); + notify_ulds(adap, CXGB4_STATE_FATAL_ERROR); +} + +void t4_fatal_err(struct adapter *adap) +{ + int port; + + if (pci_channel_offline(adap->pdev)) + return; + + /* Disable the SGE since ULDs are going to free resources that + * could be exposed to the adapter. RDMA MWs for example... + */ + t4_shutdown_adapter(adap); + for_each_port(adap, port) { + struct net_device *dev = adap->port[port]; + + /* If we get here in very early initialization the network + * devices may not have been set up yet. + */ + if (!dev) + continue; + + netif_tx_stop_all_queues(dev); + netif_carrier_off(dev); + } + dev_alert(adap->pdev_dev, "encountered fatal error, adapter stopped\n"); + queue_work(adap->workq, &adap->fatal_err_notify_task); +} + +static void setup_memwin(struct adapter *adap) +{ + u32 nic_win_base = t4_get_util_window(adap); + + t4_setup_memwin(adap, nic_win_base, MEMWIN_NIC); +} + +static void setup_memwin_rdma(struct adapter *adap) +{ + if (adap->vres.ocq.size) { + u32 start; + unsigned int sz_kb; + + start = t4_read_pcie_cfg4(adap, PCI_BASE_ADDRESS_2); + start &= PCI_BASE_ADDRESS_MEM_MASK; + start += OCQ_WIN_OFFSET(adap->pdev, &adap->vres); + sz_kb = roundup_pow_of_two(adap->vres.ocq.size) >> 10; + t4_write_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN_A, 3), + start | BIR_V(1) | WINDOW_V(ilog2(sz_kb))); + t4_write_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET_A, 3), + adap->vres.ocq.start); + t4_read_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET_A, 3)); + } +} + +/* HMA Definitions */ + +/* The maximum number of address that can be send in a single FW cmd */ +#define HMA_MAX_ADDR_IN_CMD 5 + +#define HMA_PAGE_SIZE PAGE_SIZE + +#define HMA_MAX_NO_FW_ADDRESS (16 << 10) /* FW supports 16K addresses */ + +#define HMA_PAGE_ORDER \ + ((HMA_PAGE_SIZE < HMA_MAX_NO_FW_ADDRESS) ? \ + ilog2(HMA_MAX_NO_FW_ADDRESS / HMA_PAGE_SIZE) : 0) + +/* The minimum and maximum possible HMA sizes that can be specified in the FW + * configuration(in units of MB). 
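+ * E.g. with a 4KB PAGE_SIZE the page order above works out to + * ilog2(16K / 4K) = 2 (16KB chunks), so the 16K firmware addresses can + * describe at most (16KB * 16K) >> 20 = 256MB of host memory.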
+ */ +#define HMA_MIN_TOTAL_SIZE 1 +#define HMA_MAX_TOTAL_SIZE \ + (((HMA_PAGE_SIZE << HMA_PAGE_ORDER) * \ + HMA_MAX_NO_FW_ADDRESS) >> 20) + +static void adap_free_hma_mem(struct adapter *adapter) +{ + struct scatterlist *iter; + struct page *page; + int i; + + if (!adapter->hma.sgt) + return; + + if (adapter->hma.flags & HMA_DMA_MAPPED_FLAG) { + dma_unmap_sg(adapter->pdev_dev, adapter->hma.sgt->sgl, + adapter->hma.sgt->nents, PCI_DMA_BIDIRECTIONAL); + adapter->hma.flags &= ~HMA_DMA_MAPPED_FLAG; + } + + for_each_sg(adapter->hma.sgt->sgl, iter, + adapter->hma.sgt->orig_nents, i) { + page = sg_page(iter); + if (page) + __free_pages(page, HMA_PAGE_ORDER); + } + + kfree(adapter->hma.phy_addr); + sg_free_table(adapter->hma.sgt); + kfree(adapter->hma.sgt); + adapter->hma.sgt = NULL; +} + +static int adap_config_hma(struct adapter *adapter) +{ + struct scatterlist *sgl, *iter; + struct sg_table *sgt; + struct page *newpage; + unsigned int i, j, k; + u32 param, hma_size; + unsigned int ncmds; + size_t page_size; + u32 page_order; + int node, ret; + + /* HMA is supported only for T6+ cards. + * Avoid initializing HMA in kdump kernels. + */ + if (is_kdump_kernel() || + CHELSIO_CHIP_VERSION(adapter->params.chip) < CHELSIO_T6) + return 0; + + /* Get the HMA region size required by fw */ + param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_HMA_SIZE)); + ret = t4_query_params(adapter, adapter->mbox, adapter->pf, 0, + 1, ¶m, &hma_size); + /* An error means card has its own memory or HMA is not supported by + * the firmware. Return without any errors. + */ + if (ret || !hma_size) + return 0; + + if (hma_size < HMA_MIN_TOTAL_SIZE || + hma_size > HMA_MAX_TOTAL_SIZE) { + dev_err(adapter->pdev_dev, + "HMA size %uMB beyond bounds(%u-%lu)MB\n", + hma_size, HMA_MIN_TOTAL_SIZE, HMA_MAX_TOTAL_SIZE); + return -EINVAL; + } + + page_size = HMA_PAGE_SIZE; + page_order = HMA_PAGE_ORDER; + adapter->hma.sgt = kzalloc(sizeof(*adapter->hma.sgt), GFP_KERNEL); + if (unlikely(!adapter->hma.sgt)) { + dev_err(adapter->pdev_dev, "HMA SG table allocation failed\n"); + return -ENOMEM; + } + sgt = adapter->hma.sgt; + /* FW returned value will be in MB's + */ + sgt->orig_nents = (hma_size << 20) / (page_size << page_order); + if (sg_alloc_table(sgt, sgt->orig_nents, GFP_KERNEL)) { + dev_err(adapter->pdev_dev, "HMA SGL allocation failed\n"); + kfree(adapter->hma.sgt); + adapter->hma.sgt = NULL; + return -ENOMEM; + } + + sgl = adapter->hma.sgt->sgl; + node = dev_to_node(adapter->pdev_dev); + for_each_sg(sgl, iter, sgt->orig_nents, i) { + newpage = alloc_pages_node(node, __GFP_NOWARN | GFP_KERNEL | + __GFP_ZERO, page_order); + if (!newpage) { + dev_err(adapter->pdev_dev, + "Not enough memory for HMA page allocation\n"); + ret = -ENOMEM; + goto free_hma; + } + sg_set_page(iter, newpage, page_size << page_order, 0); + } + + sgt->nents = dma_map_sg(adapter->pdev_dev, sgl, sgt->orig_nents, + DMA_BIDIRECTIONAL); + if (!sgt->nents) { + dev_err(adapter->pdev_dev, + "Not enough memory for HMA DMA mapping"); + ret = -ENOMEM; + goto free_hma; + } + adapter->hma.flags |= HMA_DMA_MAPPED_FLAG; + + adapter->hma.phy_addr = kcalloc(sgt->nents, sizeof(dma_addr_t), + GFP_KERNEL); + if (unlikely(!adapter->hma.phy_addr)) + goto free_hma; + + for_each_sg(sgl, iter, sgt->nents, i) { + newpage = sg_page(iter); + adapter->hma.phy_addr[i] = sg_dma_address(iter); + } + + ncmds = DIV_ROUND_UP(sgt->nents, HMA_MAX_ADDR_IN_CMD); + /* Pass on the addresses to firmware */ + for (i = 0, k = 0; i < ncmds; i++, k += 
HMA_MAX_ADDR_IN_CMD) { + struct fw_hma_cmd hma_cmd; + u8 naddr = HMA_MAX_ADDR_IN_CMD; + u8 soc = 0, eoc = 0; + u8 hma_mode = 1; /* Presently we support only Page table mode */ + + soc = (i == 0) ? 1 : 0; + eoc = (i == ncmds - 1) ? 1 : 0; + + /* For last cmd, set naddr corresponding to remaining + * addresses + */ + if (i == ncmds - 1) { + naddr = sgt->nents % HMA_MAX_ADDR_IN_CMD; + naddr = naddr ? naddr : HMA_MAX_ADDR_IN_CMD; + } + memset(&hma_cmd, 0, sizeof(hma_cmd)); + hma_cmd.op_pkd = htonl(FW_CMD_OP_V(FW_HMA_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F); + hma_cmd.retval_len16 = htonl(FW_LEN16(hma_cmd)); + + hma_cmd.mode_to_pcie_params = + htonl(FW_HMA_CMD_MODE_V(hma_mode) | + FW_HMA_CMD_SOC_V(soc) | FW_HMA_CMD_EOC_V(eoc)); + + /* HMA cmd size specified in MB's */ + hma_cmd.naddr_size = + htonl(FW_HMA_CMD_SIZE_V(hma_size) | + FW_HMA_CMD_NADDR_V(naddr)); + + /* Total Page size specified in units of 4K */ + hma_cmd.addr_size_pkd = + htonl(FW_HMA_CMD_ADDR_SIZE_V + ((page_size << page_order) >> 12)); + + /* Fill the 5 addresses */ + for (j = 0; j < naddr; j++) { + hma_cmd.phy_address[j] = + cpu_to_be64(adapter->hma.phy_addr[j + k]); + } + ret = t4_wr_mbox(adapter, adapter->mbox, &hma_cmd, + sizeof(hma_cmd), &hma_cmd); + if (ret) { + dev_err(adapter->pdev_dev, + "HMA FW command failed with err %d\n", ret); + goto free_hma; + } + } + + if (!ret) + dev_info(adapter->pdev_dev, + "Reserved %uMB host memory for HMA\n", hma_size); + return ret; + +free_hma: + adap_free_hma_mem(adapter); + return ret; +} + +static int adap_init1(struct adapter *adap, struct fw_caps_config_cmd *c) +{ + u32 v; + int ret; + + /* get device capabilities */ + memset(c, 0, sizeof(*c)); + c->op_to_write = htonl(FW_CMD_OP_V(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST_F | FW_CMD_READ_F); + c->cfvalid_to_len16 = htonl(FW_LEN16(*c)); + ret = t4_wr_mbox(adap, adap->mbox, c, sizeof(*c), c); + if (ret < 0) + return ret; + + c->op_to_write = htonl(FW_CMD_OP_V(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F); + ret = t4_wr_mbox(adap, adap->mbox, c, sizeof(*c), NULL); + if (ret < 0) + return ret; + + ret = t4_config_glbl_rss(adap, adap->pf, + FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL, + FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_F | + FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_F); + if (ret < 0) + return ret; + + ret = t4_cfg_pfvf(adap, adap->mbox, adap->pf, 0, adap->sge.egr_sz, 64, + MAX_INGQ, 0, 0, 4, 0xf, 0xf, 16, FW_CMD_CAP_PF, + FW_CMD_CAP_PF); + if (ret < 0) + return ret; + + t4_sge_init(adap); + + /* tweak some settings */ + t4_write_reg(adap, TP_SHIFT_CNT_A, 0x64f8849); + t4_write_reg(adap, ULP_RX_TDDP_PSZ_A, HPZ0_V(PAGE_SHIFT - 12)); + t4_write_reg(adap, TP_PIO_ADDR_A, TP_INGRESS_CONFIG_A); + v = t4_read_reg(adap, TP_PIO_DATA_A); + t4_write_reg(adap, TP_PIO_DATA_A, v & ~CSUM_HAS_PSEUDO_HDR_F); + + /* first 4 Tx modulation queues point to consecutive Tx channels */ + adap->params.tp.tx_modq_map = 0xE4; + t4_write_reg(adap, TP_TX_MOD_QUEUE_REQ_MAP_A, + TX_MOD_QUEUE_REQ_MAP_V(adap->params.tp.tx_modq_map)); + + /* associate each Tx modulation queue with consecutive Tx channels */ + v = 0x84218421; + t4_write_indirect(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, + &v, 1, TP_TX_SCHED_HDR_A); + t4_write_indirect(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, + &v, 1, TP_TX_SCHED_FIFO_A); + t4_write_indirect(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, + &v, 1, TP_TX_SCHED_PCMD_A); + +#define T4_TX_MODQ_10G_WEIGHT_DEFAULT 16 /* in KB units */ + if (is_offload(adap)) { + t4_write_reg(adap, TP_TX_MOD_QUEUE_WEIGHT0_A, + TX_MODQ_WEIGHT0_V(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + 
TX_MODQ_WEIGHT1_V(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + TX_MODQ_WEIGHT2_V(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + TX_MODQ_WEIGHT3_V(T4_TX_MODQ_10G_WEIGHT_DEFAULT)); + t4_write_reg(adap, TP_TX_MOD_CHANNEL_WEIGHT_A, + TX_MODQ_WEIGHT0_V(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + TX_MODQ_WEIGHT1_V(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + TX_MODQ_WEIGHT2_V(T4_TX_MODQ_10G_WEIGHT_DEFAULT) | + TX_MODQ_WEIGHT3_V(T4_TX_MODQ_10G_WEIGHT_DEFAULT)); + } + + /* get basic stuff going */ + return t4_early_init(adap, adap->pf); +} + +/* + * Max # of ATIDs. The absolute HW max is 16K but we keep it lower. + */ +#define MAX_ATIDS 8192U + +/* + * Phase 0 of initialization: contact FW, obtain config, perform basic init. + * + * If the firmware we're dealing with has Configuration File support, then + * we use that to perform all configuration + */ + +/* + * Tweak configuration based on module parameters, etc. Most of these have + * defaults assigned to them by Firmware Configuration Files (if we're using + * them) but need to be explicitly set if we're using hard-coded + * initialization. But even in the case of using Firmware Configuration + * Files, we'd like to expose the ability to change these via module + * parameters so these are essentially common tweaks/settings for + * Configuration Files and hard-coded initialization ... + */ +static int adap_init0_tweaks(struct adapter *adapter) +{ + /* + * Fix up various Host-Dependent Parameters like Page Size, Cache + * Line Size, etc. The firmware default is for a 4KB Page Size and + * 64B Cache Line Size ... + */ + t4_fixup_host_params(adapter, PAGE_SIZE, L1_CACHE_BYTES); + + /* + * Process module parameters which affect early initialization. + */ + if (rx_dma_offset != 2 && rx_dma_offset != 0) { + dev_err(&adapter->pdev->dev, + "Ignoring illegal rx_dma_offset=%d, using 2\n", + rx_dma_offset); + rx_dma_offset = 2; + } + t4_set_reg_field(adapter, SGE_CONTROL_A, + PKTSHIFT_V(PKTSHIFT_M), + PKTSHIFT_V(rx_dma_offset)); + + /* + * Don't include the "IP Pseudo Header" in CPL_RX_PKT checksums: Linux + * adds the pseudo header itself. + */ + t4_tp_wr_bits_indirect(adapter, TP_INGRESS_CONFIG_A, + CSUM_HAS_PSEUDO_HDR_F, 0); + + return 0; +} + +/* 10Gb/s-BT PHY Support. chip-external 10Gb/s-BT PHYs are complex chips + * unto themselves and they contain their own firmware to perform their + * tasks ... + */ +static int phy_aq1202_version(const u8 *phy_fw_data, + size_t phy_fw_size) +{ + int offset; + + /* At offset 0x8 you're looking for the primary image's + * starting offset which is 3 Bytes wide + * + * At offset 0xa of the primary image, you look for the offset + * of the DRAM segment which is 3 Bytes wide. 
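+ * (Both 3-byte offsets are read little-endian via the le24() helper + * below; the version word itself is read big-endian via be16().)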
+ * + * The FW version is at offset 0x27e of the DRAM and is 2 Bytes + * wide + */ + #define be16(__p) (((__p)[0] << 8) | (__p)[1]) + #define le16(__p) ((__p)[0] | ((__p)[1] << 8)) + #define le24(__p) (le16(__p) | ((__p)[2] << 16)) + + offset = le24(phy_fw_data + 0x8) << 12; + offset = le24(phy_fw_data + offset + 0xa); + return be16(phy_fw_data + offset + 0x27e); + + #undef be16 + #undef le16 + #undef le24 +} + +static struct info_10gbt_phy_fw { + unsigned int phy_fw_id; /* PCI Device ID */ + char *phy_fw_file; /* /lib/firmware/ PHY Firmware file */ + int (*phy_fw_version)(const u8 *phy_fw_data, size_t phy_fw_size); + int phy_flash; /* Has FLASH for PHY Firmware */ +} phy_info_array[] = { + { + PHY_AQ1202_DEVICEID, + PHY_AQ1202_FIRMWARE, + phy_aq1202_version, + 1, + }, + { + PHY_BCM84834_DEVICEID, + PHY_BCM84834_FIRMWARE, + NULL, + 0, + }, + { 0, NULL, NULL }, +}; + +static struct info_10gbt_phy_fw *find_phy_info(int devid) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(phy_info_array); i++) { + if (phy_info_array[i].phy_fw_id == devid) + return &phy_info_array[i]; + } + return NULL; +} + +/* Handle updating of chip-external 10Gb/s-BT PHY firmware. This needs to + * happen after the FW_RESET_CMD but before the FW_INITIALIZE_CMD. On error + * we return a negative error number. If we transfer new firmware we return 1 + * (from t4_load_phy_fw()). If we don't do anything we return 0. + */ +static int adap_init0_phy(struct adapter *adap) +{ + const struct firmware *phyf; + int ret; + struct info_10gbt_phy_fw *phy_info; + + /* Use the device ID to determine which PHY file to flash. + */ + phy_info = find_phy_info(adap->pdev->device); + if (!phy_info) { + dev_warn(adap->pdev_dev, + "No PHY Firmware file found for this PHY\n"); + return -EOPNOTSUPP; + } + + /* If we have a T4 PHY firmware file under /lib/firmware/cxgb4/, then + * use that. The adapter firmware provides us with a memory buffer + * where we can load a PHY firmware file from the host if we want to + * override the PHY firmware File in flash. + */ + ret = request_firmware_direct(&phyf, phy_info->phy_fw_file, + adap->pdev_dev); + if (ret < 0) { + /* For adapters without FLASH attached to PHY for their + * firmware, it's obviously a fatal error if we can't get the + * firmware to the adapter. For adapters with PHY firmware + * FLASH storage, it's worth a warning if we can't find the + * PHY Firmware but we'll neuter the error ... + */ + dev_err(adap->pdev_dev, "unable to find PHY Firmware image " + "/lib/firmware/%s, error %d\n", + phy_info->phy_fw_file, -ret); + if (phy_info->phy_flash) { + int cur_phy_fw_ver = 0; + + t4_phy_fw_ver(adap, &cur_phy_fw_ver); + dev_warn(adap->pdev_dev, "continuing with, on-adapter " + "FLASH copy, version %#x\n", cur_phy_fw_ver); + ret = 0; + } + + return ret; + } + + /* Load PHY Firmware onto adapter. + */ + ret = t4_load_phy_fw(adap, MEMWIN_NIC, &adap->win0_lock, + phy_info->phy_fw_version, + (u8 *)phyf->data, phyf->size); + if (ret < 0) + dev_err(adap->pdev_dev, "PHY Firmware transfer error %d\n", + -ret); + else if (ret > 0) { + int new_phy_fw_ver = 0; + + if (phy_info->phy_fw_version) + new_phy_fw_ver = phy_info->phy_fw_version(phyf->data, + phyf->size); + dev_info(adap->pdev_dev, "Successfully transferred PHY " + "Firmware /lib/firmware/%s, version %#x\n", + phy_info->phy_fw_file, new_phy_fw_ver); + } + + release_firmware(phyf); + + return ret; +} + +/* + * Attempt to initialize the adapter via a Firmware Configuration File. 
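+ * Roughly: optionally reset the chip, load any chip-external PHY + * firmware, pick a Configuration File (a host /lib/firmware copy or the + * copy in FLASH), hand it to the firmware via FW_CAPS_CONFIG_CMD and + * finally issue FW_INITIALIZE.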
+ */ +static int adap_init0_config(struct adapter *adapter, int reset) +{ + struct fw_caps_config_cmd caps_cmd; + const struct firmware *cf; + unsigned long mtype = 0, maddr = 0; + u32 finiver, finicsum, cfcsum; + int ret; + int config_issued = 0; + char *fw_config_file, fw_config_file_path[256]; + char *config_name = NULL; + + /* + * Reset device if necessary. + */ + if (reset) { + ret = t4_fw_reset(adapter, adapter->mbox, + PIORSTMODE_F | PIORST_F); + if (ret < 0) + goto bye; + } + + /* If this is a 10Gb/s-BT adapter make sure the chip-external + * 10Gb/s-BT PHYs have up-to-date firmware. Note that this step needs + * to be performed after any global adapter RESET above since some + * PHYs only have local RAM copies of the PHY firmware. + */ + if (is_10gbt_device(adapter->pdev->device)) { + ret = adap_init0_phy(adapter); + if (ret < 0) + goto bye; + } + /* + * If we have a T4 configuration file under /lib/firmware/cxgb4/, + * then use that. Otherwise, use the configuration file stored + * in the adapter flash ... + */ + switch (CHELSIO_CHIP_VERSION(adapter->params.chip)) { + case CHELSIO_T4: + fw_config_file = FW4_CFNAME; + break; + case CHELSIO_T5: + fw_config_file = FW5_CFNAME; + break; + case CHELSIO_T6: + fw_config_file = FW6_CFNAME; + break; + default: + dev_err(adapter->pdev_dev, "Device %d is not supported\n", + adapter->pdev->device); + ret = -EINVAL; + goto bye; + } + + ret = request_firmware(&cf, fw_config_file, adapter->pdev_dev); + if (ret < 0) { + config_name = "On FLASH"; + mtype = FW_MEMTYPE_CF_FLASH; + maddr = t4_flash_cfg_addr(adapter); + } else { + u32 params[7], val[7]; + + sprintf(fw_config_file_path, + "/lib/firmware/%s", fw_config_file); + config_name = fw_config_file_path; + + if (cf->size >= FLASH_CFG_MAX_SIZE) + ret = -ENOMEM; + else { + params[0] = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_CF)); + ret = t4_query_params(adapter, adapter->mbox, + adapter->pf, 0, 1, params, val); + if (ret == 0) { + /* + * For t4_memory_rw() below addresses and + * sizes have to be in terms of multiples of 4 + * bytes. So, if the Configuration File isn't + * a multiple of 4 bytes in length we'll have + * to write that out separately since we can't + * guarantee that the bytes following the + * residual byte in the buffer returned by + * request_firmware() are zeroed out ... + */ + size_t resid = cf->size & 0x3; + size_t size = cf->size & ~0x3; + __be32 *data = (__be32 *)cf->data; + + mtype = FW_PARAMS_PARAM_Y_G(val[0]); + maddr = FW_PARAMS_PARAM_Z_G(val[0]) << 16; + + spin_lock(&adapter->win0_lock); + ret = t4_memory_rw(adapter, 0, mtype, maddr, + size, data, T4_MEMORY_WRITE); + if (ret == 0 && resid != 0) { + union { + __be32 word; + char buf[4]; + } last; + int i; + + last.word = data[size >> 2]; + for (i = resid; i < 4; i++) + last.buf[i] = 0; + ret = t4_memory_rw(adapter, 0, mtype, + maddr + size, + 4, &last.word, + T4_MEMORY_WRITE); + } + spin_unlock(&adapter->win0_lock); + } + } + + release_firmware(cf); + if (ret) + goto bye; + } + + /* + * Issue a Capability Configuration command to the firmware to get it + * to parse the Configuration File. We don't use t4_fw_config_file() + * because we want the ability to modify various features after we've + * processed the configuration file ... 
+ */ + memset(&caps_cmd, 0, sizeof(caps_cmd)); + caps_cmd.op_to_write = + htonl(FW_CMD_OP_V(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_READ_F); + caps_cmd.cfvalid_to_len16 = + htonl(FW_CAPS_CONFIG_CMD_CFVALID_F | + FW_CAPS_CONFIG_CMD_MEMTYPE_CF_V(mtype) | + FW_CAPS_CONFIG_CMD_MEMADDR64K_CF_V(maddr >> 16) | + FW_LEN16(caps_cmd)); + ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd), + &caps_cmd); + + /* If the CAPS_CONFIG failed with an ENOENT (for a Firmware + * Configuration File in FLASH), our last gasp effort is to use the + * Firmware Configuration File which is embedded in the firmware. A + * very few early versions of the firmware didn't have one embedded + * but we can ignore those. + */ + if (ret == -ENOENT) { + memset(&caps_cmd, 0, sizeof(caps_cmd)); + caps_cmd.op_to_write = + htonl(FW_CMD_OP_V(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_READ_F); + caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd)); + ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, + sizeof(caps_cmd), &caps_cmd); + config_name = "Firmware Default"; + } + + config_issued = 1; + if (ret < 0) + goto bye; + + finiver = ntohl(caps_cmd.finiver); + finicsum = ntohl(caps_cmd.finicsum); + cfcsum = ntohl(caps_cmd.cfcsum); + if (finicsum != cfcsum) + dev_warn(adapter->pdev_dev, "Configuration File checksum "\ + "mismatch: [fini] csum=%#x, computed csum=%#x\n", + finicsum, cfcsum); + + /* + * And now tell the firmware to use the configuration we just loaded. + */ + caps_cmd.op_to_write = + htonl(FW_CMD_OP_V(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F); + caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd)); + ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd), + NULL); + if (ret < 0) + goto bye; + + /* + * Tweak configuration based on system architecture, module + * parameters, etc. + */ + ret = adap_init0_tweaks(adapter); + if (ret < 0) + goto bye; + + /* We will proceed even if HMA init fails. */ + ret = adap_config_hma(adapter); + if (ret) + dev_err(adapter->pdev_dev, + "HMA configuration failed with error %d\n", ret); + + /* + * And finally tell the firmware to initialize itself using the + * parameters from the Configuration File. + */ + ret = t4_fw_initialize(adapter, adapter->mbox); + if (ret < 0) + goto bye; + + /* Emit Firmware Configuration File information and return + * successfully. + */ + dev_info(adapter->pdev_dev, "Successfully configured using Firmware "\ + "Configuration File \"%s\", version %#x, computed checksum %#x\n", + config_name, finiver, cfcsum); + return 0; + + /* + * Something bad happened. Return the error ... (If the "error" + * is that there's no Configuration File on the adapter we don't + * want to issue a warning since this is fairly common.) 
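+ * That is why the dev_warn() below is skipped when ret == -ENOENT.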
+ */ +bye: + if (config_issued && ret != -ENOENT) + dev_warn(adapter->pdev_dev, "\"%s\" configuration file error %d\n", + config_name, -ret); + return ret; +} + +static struct fw_info fw_info_array[] = { + { + .chip = CHELSIO_T4, + .fs_name = FW4_CFNAME, + .fw_mod_name = FW4_FNAME, + .fw_hdr = { + .chip = FW_HDR_CHIP_T4, + .fw_ver = __cpu_to_be32(FW_VERSION(T4)), + .intfver_nic = FW_INTFVER(T4, NIC), + .intfver_vnic = FW_INTFVER(T4, VNIC), + .intfver_ri = FW_INTFVER(T4, RI), + .intfver_iscsi = FW_INTFVER(T4, ISCSI), + .intfver_fcoe = FW_INTFVER(T4, FCOE), + }, + }, { + .chip = CHELSIO_T5, + .fs_name = FW5_CFNAME, + .fw_mod_name = FW5_FNAME, + .fw_hdr = { + .chip = FW_HDR_CHIP_T5, + .fw_ver = __cpu_to_be32(FW_VERSION(T5)), + .intfver_nic = FW_INTFVER(T5, NIC), + .intfver_vnic = FW_INTFVER(T5, VNIC), + .intfver_ri = FW_INTFVER(T5, RI), + .intfver_iscsi = FW_INTFVER(T5, ISCSI), + .intfver_fcoe = FW_INTFVER(T5, FCOE), + }, + }, { + .chip = CHELSIO_T6, + .fs_name = FW6_CFNAME, + .fw_mod_name = FW6_FNAME, + .fw_hdr = { + .chip = FW_HDR_CHIP_T6, + .fw_ver = __cpu_to_be32(FW_VERSION(T6)), + .intfver_nic = FW_INTFVER(T6, NIC), + .intfver_vnic = FW_INTFVER(T6, VNIC), + .intfver_ofld = FW_INTFVER(T6, OFLD), + .intfver_ri = FW_INTFVER(T6, RI), + .intfver_iscsipdu = FW_INTFVER(T6, ISCSIPDU), + .intfver_iscsi = FW_INTFVER(T6, ISCSI), + .intfver_fcoepdu = FW_INTFVER(T6, FCOEPDU), + .intfver_fcoe = FW_INTFVER(T6, FCOE), + }, + } + +}; + +static struct fw_info *find_fw_info(int chip) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fw_info_array); i++) { + if (fw_info_array[i].chip == chip) + return &fw_info_array[i]; + } + return NULL; +} + +/* + * Phase 0 of initialization: contact FW, obtain config, perform basic init. + */ +static int adap_init0(struct adapter *adap) +{ + int ret; + u32 v, port_vec; + enum dev_state state; + u32 params[7], val[7]; + struct fw_caps_config_cmd caps_cmd; + int reset = 1; + + /* Grab Firmware Device Log parameters as early as possible so we have + * access to it for debugging, etc. + */ + ret = t4_init_devlog_params(adap); + if (ret < 0) + return ret; + + /* Contact FW, advertising Master capability */ + ret = t4_fw_hello(adap, adap->mbox, adap->mbox, + is_kdump_kernel() ? MASTER_MUST : MASTER_MAY, &state); + if (ret < 0) { + dev_err(adap->pdev_dev, "could not connect to FW, error %d\n", + ret); + return ret; + } + if (ret == adap->mbox) + adap->flags |= MASTER_PF; + + /* + * If we're the Master PF Driver and the device is uninitialized, + * then let's consider upgrading the firmware ... (We always want + * to check the firmware version number in order to A. get it for + * later reporting and B. to warn if the currently loaded firmware + * is excessively mismatched relative to the driver.) + */ + + t4_get_version_info(adap); + ret = t4_check_fw_version(adap); + /* If firmware is too old (not supported by driver) force an update. 
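+ * Treating the device as uninitialized makes the Master PF path below + * request a host firmware image and lets t4_prep_fw() upgrade the + * adapter copy if needed.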
*/ + if (ret) + state = DEV_STATE_UNINIT; + if ((adap->flags & MASTER_PF) && state != DEV_STATE_INIT) { + struct fw_info *fw_info; + struct fw_hdr *card_fw; + const struct firmware *fw; + const u8 *fw_data = NULL; + unsigned int fw_size = 0; + + /* This is the firmware whose headers the driver was compiled + * against + */ + fw_info = find_fw_info(CHELSIO_CHIP_VERSION(adap->params.chip)); + if (fw_info == NULL) { + dev_err(adap->pdev_dev, + "unable to get firmware info for chip %d.\n", + CHELSIO_CHIP_VERSION(adap->params.chip)); + return -EINVAL; + } + + /* allocate memory to read the header of the firmware on the + * card + */ + card_fw = kvzalloc(sizeof(*card_fw), GFP_KERNEL); + + /* Get FW from from /lib/firmware/ */ + ret = request_firmware(&fw, fw_info->fw_mod_name, + adap->pdev_dev); + if (ret < 0) { + dev_err(adap->pdev_dev, + "unable to load firmware image %s, error %d\n", + fw_info->fw_mod_name, ret); + } else { + fw_data = fw->data; + fw_size = fw->size; + } + + /* upgrade FW logic */ + ret = t4_prep_fw(adap, fw_info, fw_data, fw_size, card_fw, + state, &reset); + + /* Cleaning up */ + release_firmware(fw); + kvfree(card_fw); + + if (ret < 0) + goto bye; + } + + /* + * Grab VPD parameters. This should be done after we establish a + * connection to the firmware since some of the VPD parameters + * (notably the Core Clock frequency) are retrieved via requests to + * the firmware. On the other hand, we need these fairly early on + * so we do this right after getting ahold of the firmware. + */ + ret = t4_get_vpd_params(adap, &adap->params.vpd); + if (ret < 0) + goto bye; + + /* + * Find out what ports are available to us. Note that we need to do + * this before calling adap_init0_no_config() since it needs nports + * and portvec ... + */ + v = + FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_PORTVEC); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 1, &v, &port_vec); + if (ret < 0) + goto bye; + + adap->params.nports = hweight32(port_vec); + adap->params.portvec = port_vec; + + /* If the firmware is initialized already, emit a simply note to that + * effect. Otherwise, it's time to try initializing the adapter. + */ + if (state == DEV_STATE_INIT) { + ret = adap_config_hma(adap); + if (ret) + dev_err(adap->pdev_dev, + "HMA configuration failed with error %d\n", + ret); + dev_info(adap->pdev_dev, "Coming up as %s: "\ + "Adapter already initialized\n", + adap->flags & MASTER_PF ? "MASTER" : "SLAVE"); + } else { + dev_info(adap->pdev_dev, "Coming up as MASTER: "\ + "Initializing adapter\n"); + + /* Find out whether we're dealing with a version of the + * firmware which has configuration file support. + */ + params[0] = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_CF)); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 1, + params, val); + + /* If the firmware doesn't support Configuration Files, + * return an error. + */ + if (ret < 0) { + dev_err(adap->pdev_dev, "firmware doesn't support " + "Firmware Configuration Files\n"); + goto bye; + } + + /* The firmware provides us with a memory buffer where we can + * load a Configuration File from the host if we want to + * override the Configuration File in flash. 
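+ * adap_init0_config() above handles both cases: it prefers a host-side + * /lib/firmware copy and falls back to the copy in FLASH.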
+ */ + ret = adap_init0_config(adap, reset); + if (ret == -ENOENT) { + dev_err(adap->pdev_dev, "no Configuration File " + "present on adapter.\n"); + goto bye; + } + if (ret < 0) { + dev_err(adap->pdev_dev, "could not initialize " + "adapter, error %d\n", -ret); + goto bye; + } + } + + /* Give the SGE code a chance to pull in anything that it needs ... + * Note that this must be called after we retrieve our VPD parameters + * in order to know how to convert core ticks to seconds, etc. + */ + ret = t4_sge_init(adap); + if (ret < 0) + goto bye; + + if (is_bypass_device(adap->pdev->device)) + adap->params.bypass = 1; + + /* + * Grab some of our basic fundamental operating parameters. + */ +#define FW_PARAM_DEV(param) \ + (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | \ + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_##param)) + +#define FW_PARAM_PFVF(param) \ + FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) | \ + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_##param)| \ + FW_PARAMS_PARAM_Y_V(0) | \ + FW_PARAMS_PARAM_Z_V(0) + + params[0] = FW_PARAM_PFVF(EQ_START); + params[1] = FW_PARAM_PFVF(L2T_START); + params[2] = FW_PARAM_PFVF(L2T_END); + params[3] = FW_PARAM_PFVF(FILTER_START); + params[4] = FW_PARAM_PFVF(FILTER_END); + params[5] = FW_PARAM_PFVF(IQFLINT_START); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 6, params, val); + if (ret < 0) + goto bye; + adap->sge.egr_start = val[0]; + adap->l2t_start = val[1]; + adap->l2t_end = val[2]; + adap->tids.ftid_base = val[3]; + adap->tids.nftids = val[4] - val[3] + 1; + adap->sge.ingr_start = val[5]; + + /* qids (ingress/egress) returned from firmware can be anywhere + * in the range from EQ(IQFLINT)_START to EQ(IQFLINT)_END. + * Hence driver needs to allocate memory for this range to + * store the queue info. Get the highest IQFLINT/EQ index returned + * in FW_EQ_*_CMD.alloc command. + */ + params[0] = FW_PARAM_PFVF(EQ_END); + params[1] = FW_PARAM_PFVF(IQFLINT_END); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 2, params, val); + if (ret < 0) + goto bye; + adap->sge.egr_sz = val[0] - adap->sge.egr_start + 1; + adap->sge.ingr_sz = val[1] - adap->sge.ingr_start + 1; + + adap->sge.egr_map = kcalloc(adap->sge.egr_sz, + sizeof(*adap->sge.egr_map), GFP_KERNEL); + if (!adap->sge.egr_map) { + ret = -ENOMEM; + goto bye; + } + + adap->sge.ingr_map = kcalloc(adap->sge.ingr_sz, + sizeof(*adap->sge.ingr_map), GFP_KERNEL); + if (!adap->sge.ingr_map) { + ret = -ENOMEM; + goto bye; + } + + /* Allocate the memory for the vaious egress queue bitmaps + * ie starving_fl, txq_maperr and blocked_fl. + */ + adap->sge.starving_fl = kcalloc(BITS_TO_LONGS(adap->sge.egr_sz), + sizeof(long), GFP_KERNEL); + if (!adap->sge.starving_fl) { + ret = -ENOMEM; + goto bye; + } + + adap->sge.txq_maperr = kcalloc(BITS_TO_LONGS(adap->sge.egr_sz), + sizeof(long), GFP_KERNEL); + if (!adap->sge.txq_maperr) { + ret = -ENOMEM; + goto bye; + } + +#ifdef CONFIG_DEBUG_FS + adap->sge.blocked_fl = kcalloc(BITS_TO_LONGS(adap->sge.egr_sz), + sizeof(long), GFP_KERNEL); + if (!adap->sge.blocked_fl) { + ret = -ENOMEM; + goto bye; + } +#endif + + params[0] = FW_PARAM_PFVF(CLIP_START); + params[1] = FW_PARAM_PFVF(CLIP_END); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 2, params, val); + if (ret < 0) + goto bye; + adap->clipt_start = val[0]; + adap->clipt_end = val[1]; + + /* We don't yet have a PARAMs calls to retrieve the number of Traffic + * Classes supported by the hardware/firmware so we hard code it here + * for now. + */ + adap->params.nsched_cls = is_t4(adap->params.chip) ? 
15 : 16; + + /* query params related to active filter region */ + params[0] = FW_PARAM_PFVF(ACTIVE_FILTER_START); + params[1] = FW_PARAM_PFVF(ACTIVE_FILTER_END); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 2, params, val); + /* If Active filter size is set we enable establishing + * offload connection through firmware work request + */ + if ((val[0] != val[1]) && (ret >= 0)) { + adap->flags |= FW_OFLD_CONN; + adap->tids.aftid_base = val[0]; + adap->tids.aftid_end = val[1]; + } + + /* If we're running on newer firmware, let it know that we're + * prepared to deal with encapsulated CPL messages. Older + * firmware won't understand this and we'll just get + * unencapsulated messages ... + */ + params[0] = FW_PARAM_PFVF(CPLFW4MSG_ENCAP); + val[0] = 1; + (void)t4_set_params(adap, adap->mbox, adap->pf, 0, 1, params, val); + + /* + * Find out whether we're allowed to use the T5+ ULPTX MEMWRITE DSGL + * capability. Earlier versions of the firmware didn't have the + * ULPTX_MEMWRITE_DSGL so we'll interpret a query failure as no + * permission to use ULPTX MEMWRITE DSGL. + */ + if (is_t4(adap->params.chip)) { + adap->params.ulptx_memwrite_dsgl = false; + } else { + params[0] = FW_PARAM_DEV(ULPTX_MEMWRITE_DSGL); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, + 1, params, val); + adap->params.ulptx_memwrite_dsgl = (ret == 0 && val[0] != 0); + } + + /* See if FW supports FW_RI_FR_NSMR_TPTE_WR work request */ + params[0] = FW_PARAM_DEV(RI_FR_NSMR_TPTE_WR); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, + 1, params, val); + adap->params.fr_nsmr_tpte_wr_support = (ret == 0 && val[0] != 0); + + /* See if FW supports FW_FILTER2 work request */ + if (is_t4(adap->params.chip)) { + adap->params.filter2_wr_support = 0; + } else { + params[0] = FW_PARAM_DEV(FILTER2_WR); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, + 1, params, val); + adap->params.filter2_wr_support = (ret == 0 && val[0] != 0); + } + + /* + * Get device capabilities so we can determine what resources we need + * to manage. + */ + memset(&caps_cmd, 0, sizeof(caps_cmd)); + caps_cmd.op_to_write = htonl(FW_CMD_OP_V(FW_CAPS_CONFIG_CMD) | + FW_CMD_REQUEST_F | FW_CMD_READ_F); + caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd)); + ret = t4_wr_mbox(adap, adap->mbox, &caps_cmd, sizeof(caps_cmd), + &caps_cmd); + if (ret < 0) + goto bye; + + if (caps_cmd.ofldcaps || + (caps_cmd.niccaps & htons(FW_CAPS_CONFIG_NIC_HASHFILTER))) { + /* query offload-related parameters */ + params[0] = FW_PARAM_DEV(NTID); + params[1] = FW_PARAM_PFVF(SERVER_START); + params[2] = FW_PARAM_PFVF(SERVER_END); + params[3] = FW_PARAM_PFVF(TDDP_START); + params[4] = FW_PARAM_PFVF(TDDP_END); + params[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 6, + params, val); + if (ret < 0) + goto bye; + adap->tids.ntids = val[0]; + adap->tids.natids = min(adap->tids.ntids / 2, MAX_ATIDS); + adap->tids.stid_base = val[1]; + adap->tids.nstids = val[2] - val[1] + 1; + /* + * Setup server filter region. Divide the available filter + * region into two parts. Regular filters get 1/3rd and server + * filters get 2/3rd part. This is only enabled if workarond + * path is enabled. + * 1. For regular filters. + * 2. Server filter: This are special filters which are used + * to redirect SYN packets to offload queue. 
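+ * For example, with 496 filter IDs the split below leaves + * DIV_ROUND_UP(496, 3) = 166 regular filters and 496 - 166 = 330 + * server filters.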
+ */ + if (adap->flags & FW_OFLD_CONN && !is_bypass(adap)) { + adap->tids.sftid_base = adap->tids.ftid_base + + DIV_ROUND_UP(adap->tids.nftids, 3); + adap->tids.nsftids = adap->tids.nftids - + DIV_ROUND_UP(adap->tids.nftids, 3); + adap->tids.nftids = adap->tids.sftid_base - + adap->tids.ftid_base; + } + adap->vres.ddp.start = val[3]; + adap->vres.ddp.size = val[4] - val[3] + 1; + adap->params.ofldq_wr_cred = val[5]; + + if (caps_cmd.niccaps & htons(FW_CAPS_CONFIG_NIC_HASHFILTER)) { + ret = init_hash_filter(adap); + if (ret < 0) + goto bye; + } else { + adap->params.offload = 1; + adap->num_ofld_uld += 1; + } + } + if (caps_cmd.rdmacaps) { + params[0] = FW_PARAM_PFVF(STAG_START); + params[1] = FW_PARAM_PFVF(STAG_END); + params[2] = FW_PARAM_PFVF(RQ_START); + params[3] = FW_PARAM_PFVF(RQ_END); + params[4] = FW_PARAM_PFVF(PBL_START); + params[5] = FW_PARAM_PFVF(PBL_END); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 6, + params, val); + if (ret < 0) + goto bye; + adap->vres.stag.start = val[0]; + adap->vres.stag.size = val[1] - val[0] + 1; + adap->vres.rq.start = val[2]; + adap->vres.rq.size = val[3] - val[2] + 1; + adap->vres.pbl.start = val[4]; + adap->vres.pbl.size = val[5] - val[4] + 1; + + params[0] = FW_PARAM_PFVF(SRQ_START); + params[1] = FW_PARAM_PFVF(SRQ_END); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 2, + params, val); + if (!ret) { + adap->vres.srq.start = val[0]; + adap->vres.srq.size = val[1] - val[0] + 1; + } + if (adap->vres.srq.size) { + adap->srq = t4_init_srq(adap->vres.srq.size); + if (!adap->srq) + dev_warn(&adap->pdev->dev, "could not allocate SRQ, continuing\n"); + } + + params[0] = FW_PARAM_PFVF(SQRQ_START); + params[1] = FW_PARAM_PFVF(SQRQ_END); + params[2] = FW_PARAM_PFVF(CQ_START); + params[3] = FW_PARAM_PFVF(CQ_END); + params[4] = FW_PARAM_PFVF(OCQ_START); + params[5] = FW_PARAM_PFVF(OCQ_END); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 6, params, + val); + if (ret < 0) + goto bye; + adap->vres.qp.start = val[0]; + adap->vres.qp.size = val[1] - val[0] + 1; + adap->vres.cq.start = val[2]; + adap->vres.cq.size = val[3] - val[2] + 1; + adap->vres.ocq.start = val[4]; + adap->vres.ocq.size = val[5] - val[4] + 1; + + params[0] = FW_PARAM_DEV(MAXORDIRD_QP); + params[1] = FW_PARAM_DEV(MAXIRD_ADAPTER); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 2, params, + val); + if (ret < 0) { + adap->params.max_ordird_qp = 8; + adap->params.max_ird_adapter = 32 * adap->tids.ntids; + ret = 0; + } else { + adap->params.max_ordird_qp = val[0]; + adap->params.max_ird_adapter = val[1]; + } + dev_info(adap->pdev_dev, + "max_ordird_qp %d max_ird_adapter %d\n", + adap->params.max_ordird_qp, + adap->params.max_ird_adapter); + + /* Enable write_with_immediate if FW supports it */ + params[0] = FW_PARAM_DEV(RDMA_WRITE_WITH_IMM); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 1, params, + val); + adap->params.write_w_imm_support = (ret == 0 && val[0] != 0); + + /* Enable write_cmpl if FW supports it */ + params[0] = FW_PARAM_DEV(RI_WRITE_CMPL_WR); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 1, params, + val); + adap->params.write_cmpl_support = (ret == 0 && val[0] != 0); + adap->num_ofld_uld += 2; + } + if (caps_cmd.iscsicaps) { + params[0] = FW_PARAM_PFVF(ISCSI_START); + params[1] = FW_PARAM_PFVF(ISCSI_END); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 2, + params, val); + if (ret < 0) + goto bye; + adap->vres.iscsi.start = val[0]; + adap->vres.iscsi.size = val[1] - val[0] + 1; + /* LIO target and cxgb4i initiaitor */ + 
adap->num_ofld_uld += 2; + } + if (caps_cmd.cryptocaps) { + if (ntohs(caps_cmd.cryptocaps) & + FW_CAPS_CONFIG_CRYPTO_LOOKASIDE) { + params[0] = FW_PARAM_PFVF(NCRYPTO_LOOKASIDE); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, + 2, params, val); + if (ret < 0) { + if (ret != -EINVAL) + goto bye; + } else { + adap->vres.ncrypto_fc = val[0]; + } + adap->num_ofld_uld += 1; + } + if (ntohs(caps_cmd.cryptocaps) & + FW_CAPS_CONFIG_TLS_INLINE) { + params[0] = FW_PARAM_PFVF(TLS_START); + params[1] = FW_PARAM_PFVF(TLS_END); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, + 2, params, val); + if (ret < 0) + goto bye; + adap->vres.key.start = val[0]; + adap->vres.key.size = val[1] - val[0] + 1; + adap->num_uld += 1; + } + adap->params.crypto = ntohs(caps_cmd.cryptocaps); + } +#undef FW_PARAM_PFVF +#undef FW_PARAM_DEV + + /* The MTU/MSS Table is initialized by now, so load their values. If + * we're initializing the adapter, then we'll make any modifications + * we want to the MTU/MSS Table and also initialize the congestion + * parameters. + */ + t4_read_mtu_tbl(adap, adap->params.mtus, NULL); + if (state != DEV_STATE_INIT) { + int i; + + /* The default MTU Table contains values 1492 and 1500. + * However, for TCP, it's better to have two values which are + * a multiple of 8 +/- 4 bytes apart near this popular MTU. + * This allows us to have a TCP Data Payload which is a + * multiple of 8 regardless of what combination of TCP Options + * are in use (always a multiple of 4 bytes) which is + * important for performance reasons. For instance, if no + * options are in use, then we have a 20-byte IP header and a + * 20-byte TCP header. In this case, a 1500-byte MSS would + * result in a TCP Data Payload of 1500 - 40 == 1460 bytes + * which is not a multiple of 8. So using an MSS of 1488 in + * this case results in a TCP Data Payload of 1448 bytes which + * is a multiple of 8. On the other hand, if 12-byte TCP Time + * Stamps have been negotiated, then an MTU of 1500 bytes + * results in a TCP Data Payload of 1448 bytes which, as + * above, is a multiple of 8 bytes ... + */ + for (i = 0; i < NMTUS; i++) + if (adap->params.mtus[i] == 1492) { + adap->params.mtus[i] = 1488; + break; + } + + t4_load_mtus(adap, adap->params.mtus, adap->params.a_wnd, + adap->params.b_wnd); + } + t4_init_sge_params(adap); + adap->flags |= FW_OK; + t4_init_tp_params(adap, true); + return 0; + + /* + * Something bad happened. If a command timed out or failed with EIO + * FW does not operate within its spec or something catastrophic + * happened to HW/FW, stop issuing commands. 
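+ * Note that kfree(NULL) is a no-op, so the SGE maps below can be freed
+ * regardless of how far initialization got.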
+ */ +bye: + adap_free_hma_mem(adap); + kfree(adap->sge.egr_map); + kfree(adap->sge.ingr_map); + kfree(adap->sge.starving_fl); + kfree(adap->sge.txq_maperr); +#ifdef CONFIG_DEBUG_FS + kfree(adap->sge.blocked_fl); +#endif + if (ret != -ETIMEDOUT && ret != -EIO) + t4_fw_bye(adap, adap->mbox); + return ret; +} + +/* EEH callbacks */ + +static pci_ers_result_t eeh_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + int i; + struct adapter *adap = pci_get_drvdata(pdev); + + if (!adap) + goto out; + + rtnl_lock(); + adap->flags &= ~FW_OK; + notify_ulds(adap, CXGB4_STATE_START_RECOVERY); + spin_lock(&adap->stats_lock); + for_each_port(adap, i) { + struct net_device *dev = adap->port[i]; + if (dev) { + netif_device_detach(dev); + netif_carrier_off(dev); + } + } + spin_unlock(&adap->stats_lock); + disable_interrupts(adap); + if (adap->flags & FULL_INIT_DONE) + cxgb_down(adap); + rtnl_unlock(); + if ((adap->flags & DEV_ENABLED)) { + pci_disable_device(pdev); + adap->flags &= ~DEV_ENABLED; + } +out: return state == pci_channel_io_perm_failure ? + PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t eeh_slot_reset(struct pci_dev *pdev) +{ + int i, ret; + struct fw_caps_config_cmd c; + struct adapter *adap = pci_get_drvdata(pdev); + + if (!adap) { + pci_restore_state(pdev); + pci_save_state(pdev); + return PCI_ERS_RESULT_RECOVERED; + } + + if (!(adap->flags & DEV_ENABLED)) { + if (pci_enable_device(pdev)) { + dev_err(&pdev->dev, "Cannot reenable PCI " + "device after reset\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + adap->flags |= DEV_ENABLED; + } + + pci_set_master(pdev); + pci_restore_state(pdev); + pci_save_state(pdev); + pci_cleanup_aer_uncorrect_error_status(pdev); + + if (t4_wait_dev_ready(adap->regs) < 0) + return PCI_ERS_RESULT_DISCONNECT; + if (t4_fw_hello(adap, adap->mbox, adap->pf, MASTER_MUST, NULL) < 0) + return PCI_ERS_RESULT_DISCONNECT; + adap->flags |= FW_OK; + if (adap_init1(adap, &c)) + return PCI_ERS_RESULT_DISCONNECT; + + for_each_port(adap, i) { + struct port_info *p = adap2pinfo(adap, i); + + ret = t4_alloc_vi(adap, adap->mbox, p->tx_chan, adap->pf, 0, 1, + NULL, NULL); + if (ret < 0) + return PCI_ERS_RESULT_DISCONNECT; + p->viid = ret; + p->xact_addr_filt = -1; + } + + t4_load_mtus(adap, adap->params.mtus, adap->params.a_wnd, + adap->params.b_wnd); + setup_memwin(adap); + if (cxgb_up(adap)) + return PCI_ERS_RESULT_DISCONNECT; + return PCI_ERS_RESULT_RECOVERED; +} + +static void eeh_resume(struct pci_dev *pdev) +{ + int i; + struct adapter *adap = pci_get_drvdata(pdev); + + if (!adap) + return; + + rtnl_lock(); + for_each_port(adap, i) { + struct net_device *dev = adap->port[i]; + if (dev) { + if (netif_running(dev)) { + link_start(dev); + cxgb_set_rxmode(dev); + } + netif_device_attach(dev); + } + } + rtnl_unlock(); +} + +static const struct pci_error_handlers cxgb4_eeh = { + .error_detected = eeh_err_detected, + .slot_reset = eeh_slot_reset, + .resume = eeh_resume, +}; + +/* Return true if the Link Configuration supports "High Speeds" (those greater + * than 1Gb/s). + */ +static inline bool is_x_10g_port(const struct link_config *lc) +{ + unsigned int speeds, high_speeds; + + speeds = FW_PORT_CAP32_SPEED_V(FW_PORT_CAP32_SPEED_G(lc->pcaps)); + high_speeds = speeds & + ~(FW_PORT_CAP32_SPEED_100M | FW_PORT_CAP32_SPEED_1G); + + return high_speeds != 0; +} + +/* + * Perform default configuration of DMA queues depending on the number and type + * of ports we found and the number of available CPUs. 
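+ * Roughly: one queue set per 1G port, up to netif_get_num_default_rss_queues()
+ * per 10G+ port (eight per port when DCB is enabled), and a single queue set
+ * per port in a kdump kernel.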
Most settings can be + * modified by the admin prior to actual use. + */ +static void cfg_queues(struct adapter *adap) +{ + struct sge *s = &adap->sge; + int i = 0, n10g = 0, qidx = 0; +#ifndef CONFIG_CHELSIO_T4_DCB + int q10g = 0; +#endif + + /* Reduce memory usage in kdump environment, disable all offload. + */ + if (is_kdump_kernel() || (is_uld(adap) && t4_uld_mem_alloc(adap))) { + adap->params.offload = 0; + adap->params.crypto = 0; + } + + n10g += is_x_10g_port(&adap2pinfo(adap, i)->link_cfg); +#ifdef CONFIG_CHELSIO_T4_DCB + /* For Data Center Bridging support we need to be able to support up + * to 8 Traffic Priorities; each of which will be assigned to its + * own TX Queue in order to prevent Head-Of-Line Blocking. + */ + if (adap->params.nports * 8 > MAX_ETH_QSETS) { + dev_err(adap->pdev_dev, "MAX_ETH_QSETS=%d < %d!\n", + MAX_ETH_QSETS, adap->params.nports * 8); + BUG_ON(1); + } + + for_each_port(adap, i) { + struct port_info *pi = adap2pinfo(adap, i); + + pi->first_qset = qidx; + pi->nqsets = is_kdump_kernel() ? 1 : 8; + qidx += pi->nqsets; + } +#else /* !CONFIG_CHELSIO_T4_DCB */ + /* + * We default to 1 queue per non-10G port and up to # of cores queues + * per 10G port. + */ + if (n10g) + q10g = (MAX_ETH_QSETS - (adap->params.nports - n10g)) / n10g; + if (q10g > netif_get_num_default_rss_queues()) + q10g = netif_get_num_default_rss_queues(); + + if (is_kdump_kernel()) + q10g = 1; + + for_each_port(adap, i) { + struct port_info *pi = adap2pinfo(adap, i); + + pi->first_qset = qidx; + pi->nqsets = is_x_10g_port(&pi->link_cfg) ? q10g : 1; + qidx += pi->nqsets; + } +#endif /* !CONFIG_CHELSIO_T4_DCB */ + + s->ethqsets = qidx; + s->max_ethqsets = qidx; /* MSI-X may lower it later */ + + if (is_uld(adap)) { + /* + * For offload we use 1 queue/channel if all ports are up to 1G, + * otherwise we divide all available queues amongst the channels + * capped by the number of available cores. + */ + if (n10g) { + i = min_t(int, MAX_OFLD_QSETS, num_online_cpus()); + s->ofldqsets = roundup(i, adap->params.nports); + } else { + s->ofldqsets = adap->params.nports; + } + } + + for (i = 0; i < ARRAY_SIZE(s->ethrxq); i++) { + struct sge_eth_rxq *r = &s->ethrxq[i]; + + init_rspq(adap, &r->rspq, 5, 10, 1024, 64); + r->fl.size = 72; + } + + for (i = 0; i < ARRAY_SIZE(s->ethtxq); i++) + s->ethtxq[i].q.size = 1024; + + for (i = 0; i < ARRAY_SIZE(s->ctrlq); i++) + s->ctrlq[i].q.size = 512; + + if (!is_t4(adap->params.chip)) + s->ptptxq.q.size = 8; + + init_rspq(adap, &s->fw_evtq, 0, 1, 1024, 64); + init_rspq(adap, &s->intrq, 0, 1, 512, 64); +} + +/* + * Reduce the number of Ethernet queues across all ports to at most n. + * n provides at least one queue per port. 
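+ * The loop below strips one queue set at a time from ports that still have
+ * more than one, so no port is ever reduced below a single queue set.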
+ */ +static void reduce_ethqs(struct adapter *adap, int n) +{ + int i; + struct port_info *pi; + + while (n < adap->sge.ethqsets) + for_each_port(adap, i) { + pi = adap2pinfo(adap, i); + if (pi->nqsets > 1) { + pi->nqsets--; + adap->sge.ethqsets--; + if (adap->sge.ethqsets <= n) + break; + } + } + + n = 0; + for_each_port(adap, i) { + pi = adap2pinfo(adap, i); + pi->first_qset = n; + n += pi->nqsets; + } +} + +static int get_msix_info(struct adapter *adap) +{ + struct uld_msix_info *msix_info; + unsigned int max_ingq = 0; + + if (is_offload(adap)) + max_ingq += MAX_OFLD_QSETS * adap->num_ofld_uld; + if (is_pci_uld(adap)) + max_ingq += MAX_OFLD_QSETS * adap->num_uld; + + if (!max_ingq) + goto out; + + msix_info = kcalloc(max_ingq, sizeof(*msix_info), GFP_KERNEL); + if (!msix_info) + return -ENOMEM; + + adap->msix_bmap_ulds.msix_bmap = kcalloc(BITS_TO_LONGS(max_ingq), + sizeof(long), GFP_KERNEL); + if (!adap->msix_bmap_ulds.msix_bmap) { + kfree(msix_info); + return -ENOMEM; + } + spin_lock_init(&adap->msix_bmap_ulds.lock); + adap->msix_info_ulds = msix_info; +out: + return 0; +} + +static void free_msix_info(struct adapter *adap) +{ + if (!(adap->num_uld && adap->num_ofld_uld)) + return; + + kfree(adap->msix_info_ulds); + kfree(adap->msix_bmap_ulds.msix_bmap); +} + +/* 2 MSI-X vectors needed for the FW queue and non-data interrupts */ +#define EXTRA_VECS 2 + +static int enable_msix(struct adapter *adap) +{ + int ofld_need = 0, uld_need = 0; + int i, j, want, need, allocated; + struct sge *s = &adap->sge; + unsigned int nchan = adap->params.nports; + struct msix_entry *entries; + int max_ingq = MAX_INGQ; + + if (is_pci_uld(adap)) + max_ingq += (MAX_OFLD_QSETS * adap->num_uld); + if (is_offload(adap)) + max_ingq += (MAX_OFLD_QSETS * adap->num_ofld_uld); + entries = kmalloc(sizeof(*entries) * (max_ingq + 1), + GFP_KERNEL); + if (!entries) + return -ENOMEM; + + /* map for msix */ + if (get_msix_info(adap)) { + adap->params.offload = 0; + adap->params.crypto = 0; + } + + for (i = 0; i < max_ingq + 1; ++i) + entries[i].entry = i; + + want = s->max_ethqsets + EXTRA_VECS; + if (is_offload(adap)) { + want += adap->num_ofld_uld * s->ofldqsets; + ofld_need = adap->num_ofld_uld * nchan; + } + if (is_pci_uld(adap)) { + want += adap->num_uld * s->ofldqsets; + uld_need = adap->num_uld * nchan; + } +#ifdef CONFIG_CHELSIO_T4_DCB + /* For Data Center Bridging we need 8 Ethernet TX Priority Queues for + * each port. + */ + need = 8 * adap->params.nports + EXTRA_VECS + ofld_need + uld_need; +#else + need = adap->params.nports + EXTRA_VECS + ofld_need + uld_need; +#endif + allocated = pci_enable_msix_range(adap->pdev, entries, need, want); + if (allocated < 0) { + dev_info(adap->pdev_dev, "not enough MSI-X vectors left," + " not using MSI-X\n"); + kfree(entries); + return allocated; + } + + /* Distribute available vectors to the various queue groups. + * Every group gets its minimum requirement and NIC gets top + * priority for leftovers. 
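+ * Ethernet queue sets are trimmed (via reduce_ethqs()) only when fewer than
+ * max_ethqsets vectors remain after reserving EXTRA_VECS and the minimum
+ * ULD vectors.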
+ */ + i = allocated - EXTRA_VECS - ofld_need - uld_need; + if (i < s->max_ethqsets) { + s->max_ethqsets = i; + if (i < s->ethqsets) + reduce_ethqs(adap, i); + } + if (is_uld(adap)) { + if (allocated < want) + s->nqs_per_uld = nchan; + else + s->nqs_per_uld = s->ofldqsets; + } + + for (i = 0; i < (s->max_ethqsets + EXTRA_VECS); ++i) + adap->msix_info[i].vec = entries[i].vector; + if (is_uld(adap)) { + for (j = 0 ; i < allocated; ++i, j++) { + adap->msix_info_ulds[j].vec = entries[i].vector; + adap->msix_info_ulds[j].idx = i; + } + adap->msix_bmap_ulds.mapsize = j; + } + dev_info(adap->pdev_dev, "%d MSI-X vectors allocated, " + "nic %d per uld %d\n", + allocated, s->max_ethqsets, s->nqs_per_uld); + + kfree(entries); + return 0; +} + +#undef EXTRA_VECS + +static int init_rss(struct adapter *adap) +{ + unsigned int i; + int err; + + err = t4_init_rss_mode(adap, adap->mbox); + if (err) + return err; + + for_each_port(adap, i) { + struct port_info *pi = adap2pinfo(adap, i); + + pi->rss = kcalloc(pi->rss_size, sizeof(u16), GFP_KERNEL); + if (!pi->rss) + return -ENOMEM; + } + return 0; +} + +static int cxgb4_get_pcie_dev_link_caps(struct adapter *adap, + enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + u32 lnkcap1, lnkcap2; + int err1, err2; + +#define PCIE_MLW_CAP_SHIFT 4 /* start of MLW mask in link capabilities */ + + *speed = PCI_SPEED_UNKNOWN; + *width = PCIE_LNK_WIDTH_UNKNOWN; + + err1 = pcie_capability_read_dword(adap->pdev, PCI_EXP_LNKCAP, + &lnkcap1); + err2 = pcie_capability_read_dword(adap->pdev, PCI_EXP_LNKCAP2, + &lnkcap2); + if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */ + if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) + *speed = PCIE_SPEED_8_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB) + *speed = PCIE_SPEED_5_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB) + *speed = PCIE_SPEED_2_5GT; + } + if (!err1) { + *width = (lnkcap1 & PCI_EXP_LNKCAP_MLW) >> PCIE_MLW_CAP_SHIFT; + if (!lnkcap2) { /* pre-r3.0 */ + if (lnkcap1 & PCI_EXP_LNKCAP_SLS_5_0GB) + *speed = PCIE_SPEED_5_0GT; + else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_2_5GB) + *speed = PCIE_SPEED_2_5GT; + } + } + + if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN) + return err1 ? err1 : err2 ? err2 : -EINVAL; + return 0; +} + +static void cxgb4_check_pcie_caps(struct adapter *adap) +{ + enum pcie_link_width width, width_cap; + enum pci_bus_speed speed, speed_cap; + +#define PCIE_SPEED_STR(speed) \ + (speed == PCIE_SPEED_8_0GT ? "8.0GT/s" : \ + speed == PCIE_SPEED_5_0GT ? "5.0GT/s" : \ + speed == PCIE_SPEED_2_5GT ? "2.5GT/s" : \ + "Unknown") + + if (cxgb4_get_pcie_dev_link_caps(adap, &speed_cap, &width_cap)) { + dev_warn(adap->pdev_dev, + "Unable to determine PCIe device BW capabilities\n"); + return; + } + + if (pcie_get_minimum_link(adap->pdev, &speed, &width) || + speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN) { + dev_warn(adap->pdev_dev, + "Unable to determine PCI Express bandwidth.\n"); + return; + } + + dev_info(adap->pdev_dev, "PCIe link speed is %s, device supports %s\n", + PCIE_SPEED_STR(speed), PCIE_SPEED_STR(speed_cap)); + dev_info(adap->pdev_dev, "PCIe link width is x%d, device supports x%d\n", + width, width_cap); + if (speed < speed_cap || width < width_cap) + dev_info(adap->pdev_dev, + "A slot with more lanes and/or higher speed is " + "suggested for optimal performance.\n"); +} + +/* Dump basic information about the adapter */ +static void print_adapter_info(struct adapter *adapter) +{ + /* Hardware/Firmware/etc. 
Version/Revision IDs */ + t4_dump_version_info(adapter); + + /* Software/Hardware configuration */ + dev_info(adapter->pdev_dev, "Configuration: %sNIC %s, %s capable\n", + is_offload(adapter) ? "R" : "", + ((adapter->flags & USING_MSIX) ? "MSI-X" : + (adapter->flags & USING_MSI) ? "MSI" : ""), + is_offload(adapter) ? "Offload" : "non-Offload"); +} + +static void print_port_info(const struct net_device *dev) +{ + char buf[80]; + char *bufp = buf; + const char *spd = ""; + const struct port_info *pi = netdev_priv(dev); + const struct adapter *adap = pi->adapter; + + if (adap->params.pci.speed == PCI_EXP_LNKSTA_CLS_2_5GB) + spd = " 2.5 GT/s"; + else if (adap->params.pci.speed == PCI_EXP_LNKSTA_CLS_5_0GB) + spd = " 5 GT/s"; + else if (adap->params.pci.speed == PCI_EXP_LNKSTA_CLS_8_0GB) + spd = " 8 GT/s"; + + if (pi->link_cfg.pcaps & FW_PORT_CAP32_SPEED_100M) + bufp += sprintf(bufp, "100M/"); + if (pi->link_cfg.pcaps & FW_PORT_CAP32_SPEED_1G) + bufp += sprintf(bufp, "1G/"); + if (pi->link_cfg.pcaps & FW_PORT_CAP32_SPEED_10G) + bufp += sprintf(bufp, "10G/"); + if (pi->link_cfg.pcaps & FW_PORT_CAP32_SPEED_25G) + bufp += sprintf(bufp, "25G/"); + if (pi->link_cfg.pcaps & FW_PORT_CAP32_SPEED_40G) + bufp += sprintf(bufp, "40G/"); + if (pi->link_cfg.pcaps & FW_PORT_CAP32_SPEED_50G) + bufp += sprintf(bufp, "50G/"); + if (pi->link_cfg.pcaps & FW_PORT_CAP32_SPEED_100G) + bufp += sprintf(bufp, "100G/"); + if (pi->link_cfg.pcaps & FW_PORT_CAP32_SPEED_200G) + bufp += sprintf(bufp, "200G/"); + if (pi->link_cfg.pcaps & FW_PORT_CAP32_SPEED_400G) + bufp += sprintf(bufp, "400G/"); + if (bufp != buf) + --bufp; + sprintf(bufp, "BASE-%s", t4_get_port_type_description(pi->port_type)); + + netdev_info(dev, "%s: Chelsio %s (%s) %s\n", + dev->name, adap->params.vpd.id, adap->name, buf); +} + +/* + * Free the following resources: + * - memory used for tables + * - MSI/MSI-X + * - net devices + * - resources FW is holding for us + */ +static void free_some_resources(struct adapter *adapter) +{ + unsigned int i; + + kvfree(adapter->smt); + kvfree(adapter->l2t); + kvfree(adapter->srq); + t4_cleanup_sched(adapter); + kvfree(adapter->tids.tid_tab); + cxgb4_cleanup_tc_flower(adapter); + cxgb4_cleanup_tc_u32(adapter); + kfree(adapter->sge.egr_map); + kfree(adapter->sge.ingr_map); + kfree(adapter->sge.starving_fl); + kfree(adapter->sge.txq_maperr); +#ifdef CONFIG_DEBUG_FS + kfree(adapter->sge.blocked_fl); +#endif + disable_msi(adapter); + + for_each_port(adapter, i) + if (adapter->port[i]) { + struct port_info *pi = adap2pinfo(adapter, i); + + if (pi->viid != 0) + t4_free_vi(adapter, adapter->mbox, adapter->pf, + 0, pi->viid); + kfree(adap2pinfo(adapter, i)->rss); + free_netdev(adapter->port[i]); + } + if (adapter->flags & FW_OK) + t4_fw_bye(adapter, adapter->pf); +} + +#define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) +#define VLAN_FEAT (NETIF_F_SG | NETIF_F_IP_CSUM | TSO_FLAGS | \ + NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA) +#define SEGMENT_SIZE 128 + +static int get_chip_type(struct pci_dev *pdev, u32 pl_rev) +{ + u16 device_id; + + /* Retrieve adapter's device ID */ + pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id); + + switch (device_id >> 12) { + case CHELSIO_T4: + return CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev); + case CHELSIO_T5: + return CHELSIO_CHIP_CODE(CHELSIO_T5, pl_rev); + case CHELSIO_T6: + return CHELSIO_CHIP_CODE(CHELSIO_T6, pl_rev); + default: + dev_err(&pdev->dev, "Device %d is not supported\n", + device_id); + } + return -EINVAL; +} + +#ifdef CONFIG_PCI_IOV +static void cxgb4_mgmt_setup(struct 
net_device *dev) +{ + dev->type = ARPHRD_NONE; + dev->mtu = 0; + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->tx_queue_len = 0; + dev->flags |= IFF_NOARP; + dev->priv_flags |= IFF_NO_QUEUE; + + /* Initialize the device structure. */ + dev->netdev_ops = &cxgb4_mgmt_netdev_ops; + dev->ethtool_ops = &cxgb4_mgmt_ethtool_ops; +} + +static int cxgb4_iov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct adapter *adap = pci_get_drvdata(pdev); + int err = 0; + int current_vfs = pci_num_vf(pdev); + u32 pcie_fw; + + pcie_fw = readl(adap->regs + PCIE_FW_A); + /* Check if cxgb4 is the MASTER and fw is initialized */ + if (num_vfs && + (!(pcie_fw & PCIE_FW_INIT_F) || + !(pcie_fw & PCIE_FW_MASTER_VLD_F) || + PCIE_FW_MASTER_G(pcie_fw) != CXGB4_UNIFIED_PF)) { + dev_warn(&pdev->dev, + "cxgb4 driver needs to be MASTER to support SRIOV\n"); + return -EOPNOTSUPP; + } + + /* If any of the VF's is already assigned to Guest OS, then + * SRIOV for the same cannot be modified + */ + if (current_vfs && pci_vfs_assigned(pdev)) { + dev_err(&pdev->dev, + "Cannot modify SR-IOV while VFs are assigned\n"); + return current_vfs; + } + /* Note that the upper-level code ensures that we're never called with + * a non-zero "num_vfs" when we already have VFs instantiated. But + * it never hurts to code defensively. + */ + if (num_vfs != 0 && current_vfs != 0) + return -EBUSY; + + /* Nothing to do for no change. */ + if (num_vfs == current_vfs) + return num_vfs; + + /* Disable SRIOV when zero is passed. */ + if (!num_vfs) { + pci_disable_sriov(pdev); + /* free VF Management Interface */ + unregister_netdev(adap->port[0]); + free_netdev(adap->port[0]); + adap->port[0] = NULL; + + /* free VF resources */ + adap->num_vfs = 0; + kfree(adap->vfinfo); + adap->vfinfo = NULL; + return 0; + } + + if (!current_vfs) { + struct fw_pfvf_cmd port_cmd, port_rpl; + struct net_device *netdev; + unsigned int pmask, port; + struct pci_dev *pbridge; + struct port_info *pi; + char name[IFNAMSIZ]; + u32 devcap2; + u16 flags; + int pos; + + /* If we want to instantiate Virtual Functions, then our + * parent bridge's PCI-E needs to support Alternative Routing + * ID (ARI) because our VFs will show up at function offset 8 + * and above. + */ + pbridge = pdev->bus->self; + pos = pci_find_capability(pbridge, PCI_CAP_ID_EXP); + pci_read_config_word(pbridge, pos + PCI_EXP_FLAGS, &flags); + pci_read_config_dword(pbridge, pos + PCI_EXP_DEVCAP2, &devcap2); + + if ((flags & PCI_EXP_FLAGS_VERS) < 2 || + !(devcap2 & PCI_EXP_DEVCAP2_ARI)) { + /* Our parent bridge does not support ARI so issue a + * warning and skip instantiating the VFs. They + * won't be reachable. + */ + dev_warn(&pdev->dev, "Parent bridge %02x:%02x.%x doesn't support ARI; can't instantiate Virtual Functions\n", + pbridge->bus->number, PCI_SLOT(pbridge->devfn), + PCI_FUNC(pbridge->devfn)); + return -ENOTSUPP; + } + memset(&port_cmd, 0, sizeof(port_cmd)); + port_cmd.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_PFVF_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_READ_F | + FW_PFVF_CMD_PFN_V(adap->pf) | + FW_PFVF_CMD_VFN_V(0)); + port_cmd.retval_len16 = cpu_to_be32(FW_LEN16(port_cmd)); + err = t4_wr_mbox(adap, adap->mbox, &port_cmd, sizeof(port_cmd), + &port_rpl); + if (err) + return err; + pmask = FW_PFVF_CMD_PMASK_G(be32_to_cpu(port_rpl.type_to_neq)); + port = ffs(pmask) - 1; + /* Allocate VF Management Interface. 
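+ * The interface is named "mgmtpf<adap_idx>,<pf>"; cxgb4_mgmt_setup() above
+ * marks it ARPHRD_NONE/IFF_NO_QUEUE, so it serves purely as a control handle
+ * for the VFs.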
*/ + snprintf(name, IFNAMSIZ, "mgmtpf%d,%d", adap->adap_idx, + adap->pf); + netdev = alloc_netdev(sizeof(struct port_info), + name, NET_NAME_UNKNOWN, cxgb4_mgmt_setup); + if (!netdev) + return -ENOMEM; + + pi = netdev_priv(netdev); + pi->adapter = adap; + pi->lport = port; + pi->tx_chan = port; + SET_NETDEV_DEV(netdev, &pdev->dev); + + adap->port[0] = netdev; + pi->port_id = 0; + + err = register_netdev(adap->port[0]); + if (err) { + pr_info("Unable to register VF mgmt netdev %s\n", name); + free_netdev(adap->port[0]); + adap->port[0] = NULL; + return err; + } + /* Allocate and set up VF Information. */ + adap->vfinfo = kcalloc(pci_sriov_get_totalvfs(pdev), + sizeof(struct vf_info), GFP_KERNEL); + if (!adap->vfinfo) { + unregister_netdev(adap->port[0]); + free_netdev(adap->port[0]); + adap->port[0] = NULL; + return -ENOMEM; + } + cxgb4_mgmt_fill_vf_station_mac_addr(adap); + } + /* Instantiate the requested number of VFs. */ + err = pci_enable_sriov(pdev, num_vfs); + if (err) { + pr_info("Unable to instantiate %d VFs\n", num_vfs); + if (!current_vfs) { + unregister_netdev(adap->port[0]); + free_netdev(adap->port[0]); + adap->port[0] = NULL; + kfree(adap->vfinfo); + adap->vfinfo = NULL; + } + return err; + } + + adap->num_vfs = num_vfs; + return num_vfs; +} +#endif /* CONFIG_PCI_IOV */ + +static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int func, i, err, s_qpp, qpp, num_seg; + struct port_info *pi; + bool highdma = false; + struct adapter *adapter = NULL; + struct net_device *netdev; + void __iomem *regs; + u32 whoami, pl_rev; + enum chip_type chip; + static int adap_idx = 1; + + printk_once(KERN_INFO "%s - version %s\n", DRV_DESC, DRV_VERSION); + + err = pci_request_regions(pdev, KBUILD_MODNAME); + if (err) { + /* Just info, some other driver may have claimed the device. */ + dev_info(&pdev->dev, "cannot obtain PCI resources\n"); + return err; + } + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "cannot enable PCI device\n"); + goto out_release_regions; + } + + regs = pci_ioremap_bar(pdev, 0); + if (!regs) { + dev_err(&pdev->dev, "cannot map device registers\n"); + err = -ENOMEM; + goto out_disable_device; + } + + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); + if (!adapter) { + err = -ENOMEM; + goto out_unmap_bar0; + } + + adapter->regs = regs; + err = t4_wait_dev_ready(regs); + if (err < 0) + goto out_free_adapter; + + /* We control everything through one PF */ + whoami = readl(regs + PL_WHOAMI_A); + pl_rev = REV_G(readl(regs + PL_REV_A)); + chip = get_chip_type(pdev, pl_rev); + func = CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5 ? 
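+ /* T6 decodes the source PF from PL_WHOAMI with a different field accessor */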
+ SOURCEPF_G(whoami) : T6_SOURCEPF_G(whoami); + + adapter->pdev = pdev; + adapter->pdev_dev = &pdev->dev; + adapter->name = pci_name(pdev); + adapter->mbox = func; + adapter->pf = func; + adapter->params.chip = chip; + adapter->adap_idx = adap_idx; + adapter->msg_enable = DFLT_MSG_ENABLE; + adapter->mbox_log = kzalloc(sizeof(*adapter->mbox_log) + + (sizeof(struct mbox_cmd) * + T4_OS_LOG_MBOX_CMDS), + GFP_KERNEL); + if (!adapter->mbox_log) { + err = -ENOMEM; + goto out_free_adapter; + } + spin_lock_init(&adapter->mbox_lock); + INIT_LIST_HEAD(&adapter->mlist.list); + adapter->mbox_log->size = T4_OS_LOG_MBOX_CMDS; + pci_set_drvdata(pdev, adapter); + + if (func != ent->driver_data) { + pci_disable_device(pdev); + pci_save_state(pdev); /* to restore SR-IOV later */ + return 0; + } + + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + highdma = true; + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_err(&pdev->dev, "unable to obtain 64-bit DMA for " + "coherent allocations\n"); + goto out_free_adapter; + } + } else { + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "no usable DMA configuration\n"); + goto out_free_adapter; + } + } + + pci_enable_pcie_error_reporting(pdev); + pci_set_master(pdev); + pci_save_state(pdev); + adap_idx++; + adapter->workq = create_singlethread_workqueue("cxgb4"); + if (!adapter->workq) { + err = -ENOMEM; + goto out_free_adapter; + } + + /* PCI device has been enabled */ + adapter->flags |= DEV_ENABLED; + memset(adapter->chan_map, 0xff, sizeof(adapter->chan_map)); + + /* If possible, we use PCIe Relaxed Ordering Attribute to deliver + * Ingress Packet Data to Free List Buffers in order to allow for + * chipset performance optimizations between the Root Complex and + * Memory Controllers. (Messages to the associated Ingress Queue + * notifying new Packet Placement in the Free Lists Buffers will be + * send without the Relaxed Ordering Attribute thus guaranteeing that + * all preceding PCIe Transaction Layer Packets will be processed + * first.) But some Root Complexes have various issues with Upstream + * Transaction Layer Packets with the Relaxed Ordering Attribute set. + * The PCIe devices which under the Root Complexes will be cleared the + * Relaxed Ordering bit in the configuration space, So we check our + * PCIe configuration space to see if it's flagged with advice against + * using Relaxed Ordering. + */ + if (!pcie_relaxed_ordering_enabled(pdev)) + adapter->flags |= ROOT_NO_RELAXED_ORDERING; + + spin_lock_init(&adapter->stats_lock); + spin_lock_init(&adapter->tid_release_lock); + spin_lock_init(&adapter->win0_lock); + + INIT_WORK(&adapter->tid_release_task, process_tid_release_list); + INIT_WORK(&adapter->db_full_task, process_db_full); + INIT_WORK(&adapter->db_drop_task, process_db_drop); + INIT_WORK(&adapter->fatal_err_notify_task, notify_fatal_err); + + err = t4_prep_adapter(adapter); + if (err) + goto out_free_adapter; + + + if (!is_t4(adapter->params.chip)) { + s_qpp = (QUEUESPERPAGEPF0_S + + (QUEUESPERPAGEPF1_S - QUEUESPERPAGEPF0_S) * + adapter->pf); + qpp = 1 << QUEUESPERPAGEPF0_G(t4_read_reg(adapter, + SGE_EGRESS_QUEUES_PER_PAGE_PF_A) >> s_qpp); + num_seg = PAGE_SIZE / SEGMENT_SIZE; + + /* Each segment size is 128B. Write coalescing is enabled only + * when SGE_EGRESS_QUEUES_PER_PAGE_PF reg value for the + * queue is less no of segments that can be accommodated in + * a page size. 
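+ * i.e. write combining is usable only when qpp <= PAGE_SIZE / SEGMENT_SIZE;
+ * otherwise we fail the probe below.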
+ */ + if (qpp > num_seg) { + dev_err(&pdev->dev, + "Incorrect number of egress queues per page\n"); + err = -EINVAL; + goto out_free_adapter; + } + adapter->bar2 = ioremap_wc(pci_resource_start(pdev, 2), + pci_resource_len(pdev, 2)); + if (!adapter->bar2) { + dev_err(&pdev->dev, "cannot map device bar2 region\n"); + err = -ENOMEM; + goto out_free_adapter; + } + } + + setup_memwin(adapter); + err = adap_init0(adapter); +#ifdef CONFIG_DEBUG_FS + bitmap_zero(adapter->sge.blocked_fl, adapter->sge.egr_sz); +#endif + setup_memwin_rdma(adapter); + if (err) + goto out_unmap_bar; + + /* configure SGE_STAT_CFG_A to read WC stats */ + if (!is_t4(adapter->params.chip)) + t4_write_reg(adapter, SGE_STAT_CFG_A, STATSOURCE_T5_V(7) | + (is_t5(adapter->params.chip) ? STATMODE_V(0) : + T6_STATMODE_V(0))); + + for_each_port(adapter, i) { + netdev = alloc_etherdev_mq(sizeof(struct port_info), + MAX_ETH_QSETS); + if (!netdev) { + err = -ENOMEM; + goto out_free_dev; + } + + SET_NETDEV_DEV(netdev, &pdev->dev); + + adapter->port[i] = netdev; + pi = netdev_priv(netdev); + pi->adapter = adapter; + pi->xact_addr_filt = -1; + pi->port_id = i; + netdev->irq = pdev->irq; + + netdev->hw_features = NETIF_F_SG | TSO_FLAGS | + NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | + NETIF_F_RXCSUM | NETIF_F_RXHASH | + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_TC; + + if (CHELSIO_CHIP_VERSION(chip) > CHELSIO_T5) + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL; + + if (highdma) + netdev->hw_features |= NETIF_F_HIGHDMA; + netdev->features |= netdev->hw_features; + netdev->vlan_features = netdev->features & VLAN_FEAT; + + netdev->priv_flags |= IFF_UNICAST_FLT; + + /* MTU range: 81 - 9600 */ + netdev->min_mtu = 81; /* accommodate SACK */ + netdev->max_mtu = MAX_MTU; + + netdev->netdev_ops = &cxgb4_netdev_ops; +#ifdef CONFIG_CHELSIO_T4_DCB + netdev->dcbnl_ops = &cxgb4_dcb_ops; + cxgb4_dcb_state_init(netdev); +#endif + cxgb4_set_ethtool_ops(netdev); + } + + cxgb4_init_ethtool_dump(adapter); + + pci_set_drvdata(pdev, adapter); + + if (adapter->flags & FW_OK) { + err = t4_port_init(adapter, func, func, 0); + if (err) + goto out_free_dev; + } else if (adapter->params.nports == 1) { + /* If we don't have a connection to the firmware -- possibly + * because of an error -- grab the raw VPD parameters so we + * can set the proper MAC Address on the debug network + * interface that we've created. + */ + u8 hw_addr[ETH_ALEN]; + u8 *na = adapter->params.vpd.na; + + err = t4_get_raw_vpd_params(adapter, &adapter->params.vpd); + if (!err) { + for (i = 0; i < ETH_ALEN; i++) + hw_addr[i] = (hex2val(na[2 * i + 0]) * 16 + + hex2val(na[2 * i + 1])); + t4_set_hw_addr(adapter, 0, hw_addr); + } + } + + /* Configure queues and allocate tables now, they can be needed as + * soon as the first register_netdev completes. 
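+ * Allocation failures below (SMT, L2T, CLIP, TID table) are tolerated by
+ * disabling the affected offload functionality rather than failing the probe.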
+ */ + cfg_queues(adapter); + + adapter->smt = t4_init_smt(); + if (!adapter->smt) { + /* We tolerate a lack of SMT, giving up some functionality */ + dev_warn(&pdev->dev, "could not allocate SMT, continuing\n"); + } + + adapter->l2t = t4_init_l2t(adapter->l2t_start, adapter->l2t_end); + if (!adapter->l2t) { + /* We tolerate a lack of L2T, giving up some functionality */ + dev_warn(&pdev->dev, "could not allocate L2T, continuing\n"); + adapter->params.offload = 0; + } + +#if IS_ENABLED(CONFIG_IPV6) + if ((CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) && + (!(t4_read_reg(adapter, LE_DB_CONFIG_A) & ASLIPCOMPEN_F))) { + /* CLIP functionality is not present in hardware, + * hence disable all offload features + */ + dev_warn(&pdev->dev, + "CLIP not enabled in hardware, continuing\n"); + adapter->params.offload = 0; + } else { + adapter->clipt = t4_init_clip_tbl(adapter->clipt_start, + adapter->clipt_end); + if (!adapter->clipt) { + /* We tolerate a lack of clip_table, giving up + * some functionality + */ + dev_warn(&pdev->dev, + "could not allocate Clip table, continuing\n"); + adapter->params.offload = 0; + } + } +#endif + + for_each_port(adapter, i) { + pi = adap2pinfo(adapter, i); + pi->sched_tbl = t4_init_sched(adapter->params.nsched_cls); + if (!pi->sched_tbl) + dev_warn(&pdev->dev, + "could not activate scheduling on port %d\n", + i); + } + + if (tid_init(&adapter->tids) < 0) { + dev_warn(&pdev->dev, "could not allocate TID table, " + "continuing\n"); + adapter->params.offload = 0; + } else { + adapter->tc_u32 = cxgb4_init_tc_u32(adapter); + if (!adapter->tc_u32) + dev_warn(&pdev->dev, + "could not offload tc u32, continuing\n"); + + if (cxgb4_init_tc_flower(adapter)) + dev_warn(&pdev->dev, + "could not offload tc flower, continuing\n"); + } + + if (is_offload(adapter) || is_hashfilter(adapter)) { + if (t4_read_reg(adapter, LE_DB_CONFIG_A) & HASHEN_F) { + u32 hash_base, hash_reg; + + if (chip <= CHELSIO_T5) { + hash_reg = LE_DB_TID_HASHBASE_A; + hash_base = t4_read_reg(adapter, hash_reg); + adapter->tids.hash_base = hash_base / 4; + } else { + hash_reg = T6_LE_DB_HASH_TID_BASE_A; + hash_base = t4_read_reg(adapter, hash_reg); + adapter->tids.hash_base = hash_base; + } + } + } + + /* See what interrupts we'll be using */ + if (msi > 1 && enable_msix(adapter) == 0) + adapter->flags |= USING_MSIX; + else if (msi > 0 && pci_enable_msi(pdev) == 0) { + adapter->flags |= USING_MSI; + if (msi > 1) + free_msix_info(adapter); + } + + /* check for PCI Express bandwidth capabiltites */ + cxgb4_check_pcie_caps(adapter); + + err = init_rss(adapter); + if (err) + goto out_free_dev; + + err = setup_fw_sge_queues(adapter); + if (err) { + dev_err(adapter->pdev_dev, + "FW sge queue allocation failed, err %d", err); + goto out_free_dev; + } + + /* + * The card is now ready to go. If any errors occur during device + * registration we do not fail the whole card but rather proceed only + * with the ports we manage to register successfully. However we must + * register at least one net device. 
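+ * If register_netdev() fails for a port we stop registering further ports
+ * but keep the ones that already succeeded.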
+ */ + for_each_port(adapter, i) { + pi = adap2pinfo(adapter, i); + adapter->port[i]->dev_port = pi->lport; + netif_set_real_num_tx_queues(adapter->port[i], pi->nqsets); + netif_set_real_num_rx_queues(adapter->port[i], pi->nqsets); + + netif_carrier_off(adapter->port[i]); + + err = register_netdev(adapter->port[i]); + if (err) + break; + adapter->chan_map[pi->tx_chan] = i; + print_port_info(adapter->port[i]); + } + if (i == 0) { + dev_err(&pdev->dev, "could not register any net devices\n"); + goto out_free_dev; + } + if (err) { + dev_warn(&pdev->dev, "only %d net devices registered\n", i); + err = 0; + } + + if (cxgb4_debugfs_root) { + adapter->debugfs_root = debugfs_create_dir(pci_name(pdev), + cxgb4_debugfs_root); + setup_debugfs(adapter); + } + + /* PCIe EEH recovery on powerpc platforms needs fundamental reset */ + pdev->needs_freset = 1; + + if (is_uld(adapter)) { + mutex_lock(&uld_mutex); + list_add_tail(&adapter->list_node, &adapter_list); + mutex_unlock(&uld_mutex); + } + + if (!is_t4(adapter->params.chip)) + cxgb4_ptp_init(adapter); + + print_adapter_info(adapter); + return 0; + + out_free_dev: + t4_free_sge_resources(adapter); + free_some_resources(adapter); + if (adapter->flags & USING_MSIX) + free_msix_info(adapter); + if (adapter->num_uld || adapter->num_ofld_uld) + t4_uld_mem_free(adapter); + out_unmap_bar: + if (!is_t4(adapter->params.chip)) + iounmap(adapter->bar2); + out_free_adapter: + if (adapter->workq) + destroy_workqueue(adapter->workq); + + kfree(adapter->mbox_log); + kfree(adapter); + out_unmap_bar0: + iounmap(regs); + out_disable_device: + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); + out_release_regions: + pci_release_regions(pdev); + return err; +} + +static void remove_one(struct pci_dev *pdev) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + + if (!adapter) { + pci_release_regions(pdev); + return; + } + + adapter->flags |= SHUTTING_DOWN; + + if (adapter->pf == 4) { + int i; + + /* Tear down per-adapter Work Queue first since it can contain + * references to our adapter data structure. + */ + destroy_workqueue(adapter->workq); + + if (is_uld(adapter)) { + detach_ulds(adapter); + t4_uld_clean_up(adapter); + } + + adap_free_hma_mem(adapter); + + disable_interrupts(adapter); + + for_each_port(adapter, i) + if (adapter->port[i]->reg_state == NETREG_REGISTERED) + unregister_netdev(adapter->port[i]); + + debugfs_remove_recursive(adapter->debugfs_root); + + if (!is_t4(adapter->params.chip)) + cxgb4_ptp_stop(adapter); + + /* If we allocated filters, free up state associated with any + * valid filters ... + */ + clear_all_filters(adapter); + + if (adapter->flags & FULL_INIT_DONE) + cxgb_down(adapter); + + if (adapter->flags & USING_MSIX) + free_msix_info(adapter); + if (adapter->num_uld || adapter->num_ofld_uld) + t4_uld_mem_free(adapter); + free_some_resources(adapter); +#if IS_ENABLED(CONFIG_IPV6) + t4_cleanup_clip_tbl(adapter); +#endif + if (!is_t4(adapter->params.chip)) + iounmap(adapter->bar2); + } +#ifdef CONFIG_PCI_IOV + else { + cxgb4_iov_configure(adapter->pdev, 0); + } +#endif + iounmap(adapter->regs); + pci_disable_pcie_error_reporting(pdev); + if ((adapter->flags & DEV_ENABLED)) { + pci_disable_device(pdev); + adapter->flags &= ~DEV_ENABLED; + } + pci_release_regions(pdev); + kfree(adapter->mbox_log); + synchronize_rcu(); + kfree(adapter); +} + +/* "Shutdown" quiesces the device, stopping Ingress Packet and Interrupt + * delivery. 
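+ * It stops the SGE (t4_sge_stop()) and, if the firmware is attached,
+ * releases it via t4_fw_bye().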
This is essentially a stripped down version of the PCI remove() + * function where we do the minimal amount of work necessary to shutdown any + * further activity. + */ +static void shutdown_one(struct pci_dev *pdev) +{ + struct adapter *adapter = pci_get_drvdata(pdev); + + /* As with remove_one() above (see extended comment), we only want do + * do cleanup on PCI Devices which went all the way through init_one() + * ... + */ + if (!adapter) { + pci_release_regions(pdev); + return; + } + + adapter->flags |= SHUTTING_DOWN; + + if (adapter->pf == 4) { + int i; + + for_each_port(adapter, i) + if (adapter->port[i]->reg_state == NETREG_REGISTERED) + cxgb_close(adapter->port[i]); + + if (is_uld(adapter)) { + detach_ulds(adapter); + t4_uld_clean_up(adapter); + } + + disable_interrupts(adapter); + disable_msi(adapter); + + t4_sge_stop(adapter); + if (adapter->flags & FW_OK) + t4_fw_bye(adapter, adapter->mbox); + } +} + +static struct pci_driver cxgb4_driver = { + .name = KBUILD_MODNAME, + .id_table = cxgb4_pci_tbl, + .probe = init_one, + .remove = remove_one, + .shutdown = shutdown_one, +#ifdef CONFIG_PCI_IOV + .sriov_configure = cxgb4_iov_configure, +#endif + .err_handler = &cxgb4_eeh, +}; + +static int __init cxgb4_init_module(void) +{ + int ret; + + /* Debugfs support is optional, just warn if this fails */ + cxgb4_debugfs_root = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (!cxgb4_debugfs_root) + pr_warn("could not create debugfs entry, continuing\n"); + + ret = pci_register_driver(&cxgb4_driver); + if (ret < 0) + debugfs_remove(cxgb4_debugfs_root); + +#if IS_ENABLED(CONFIG_IPV6) + if (!inet6addr_registered) { + register_inet6addr_notifier(&cxgb4_inet6addr_notifier); + inet6addr_registered = true; + } +#endif + + return ret; +} + +static void __exit cxgb4_cleanup_module(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (inet6addr_registered) { + unregister_inet6addr_notifier(&cxgb4_inet6addr_notifier); + inet6addr_registered = false; + } +#endif + pci_unregister_driver(&cxgb4_driver); + debugfs_remove(cxgb4_debugfs_root); /* NULL ok */ +} + +module_init(cxgb4_init_module); +module_exit(cxgb4_cleanup_module); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c new file mode 100644 index 0000000..9f9d6ca --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c @@ -0,0 +1,476 @@ +/* + * cxgb4_ptp.c:Chelsio PTP support for T5/T6 + * + * Copyright (c) 2003-2017 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Written by: Atul Gupta (atul.gupta@chelsio.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxgb4.h" +#include "t4_hw.h" +#include "t4_regs.h" +#include "t4_msg.h" +#include "t4fw_api.h" +#include "cxgb4_ptp.h" + +/** + * cxgb4_ptp_is_ptp_tx - determine whether TX packet is PTP or not + * @skb: skb of outgoing ptp request + * + */ +bool cxgb4_ptp_is_ptp_tx(struct sk_buff *skb) +{ + struct udphdr *uh; + + uh = udp_hdr(skb); + return skb->len >= PTP_MIN_LENGTH && + skb->len <= PTP_IN_TRANSMIT_PACKET_MAXNUM && + likely(skb->protocol == htons(ETH_P_IP)) && + ip_hdr(skb)->protocol == IPPROTO_UDP && + uh->dest == htons(PTP_EVENT_PORT); +} + +bool is_ptp_enabled(struct sk_buff *skb, struct net_device *dev) +{ + struct port_info *pi; + + pi = netdev_priv(dev); + return (pi->ptp_enable && cxgb4_xmit_with_hwtstamp(skb) && + cxgb4_ptp_is_ptp_tx(skb)); +} + +/** + * cxgb4_ptp_is_ptp_rx - determine whether RX packet is PTP or not + * @skb: skb of incoming ptp request + * + */ +bool cxgb4_ptp_is_ptp_rx(struct sk_buff *skb) +{ + struct udphdr *uh = (struct udphdr *)(skb->data + ETH_HLEN + + IPV4_HLEN(skb->data)); + + return uh->dest == htons(PTP_EVENT_PORT) && + uh->source == htons(PTP_EVENT_PORT); +} + +/** + * cxgb4_ptp_read_hwstamp - read timestamp for TX event PTP message + * @adapter: board private structure + * @pi: port private structure + * + */ +void cxgb4_ptp_read_hwstamp(struct adapter *adapter, struct port_info *pi) +{ + struct skb_shared_hwtstamps *skb_ts = NULL; + u64 tx_ts; + + skb_ts = skb_hwtstamps(adapter->ptp_tx_skb); + + tx_ts = t4_read_reg(adapter, + T5_PORT_REG(pi->port_id, MAC_PORT_TX_TS_VAL_LO)); + + tx_ts |= (u64)t4_read_reg(adapter, + T5_PORT_REG(pi->port_id, + MAC_PORT_TX_TS_VAL_HI)) << 32; + skb_ts->hwtstamp = ns_to_ktime(tx_ts); + skb_tstamp_tx(adapter->ptp_tx_skb, skb_ts); + dev_kfree_skb_any(adapter->ptp_tx_skb); + spin_lock(&adapter->ptp_lock); + adapter->ptp_tx_skb = NULL; + spin_unlock(&adapter->ptp_lock); +} + +/** + * cxgb4_ptprx_timestamping - Enable Timestamp for RX PTP event message + * @pi: port private structure + * @port: pot number + * @mode: RX mode + * + */ +int cxgb4_ptprx_timestamping(struct port_info *pi, u8 port, u16 mode) +{ + struct adapter *adapter = pi->adapter; + struct fw_ptp_cmd c; + int err; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_PTP_CMD_PORTID_V(port)); + c.retval_len16 = cpu_to_be32(FW_CMD_LEN16_V(sizeof(c) / 16)); + c.u.init.sc = FW_PTP_SC_RXTIME_STAMP; + c.u.init.mode = cpu_to_be16(mode); + + err = t4_wr_mbox(adapter, adapter->mbox, &c, sizeof(c), NULL); + if (err < 0) + dev_err(adapter->pdev_dev, + "PTP: %s error %d\n", __func__, -err); + return err; +} + +int cxgb4_ptp_txtype(struct adapter *adapter, u8 port) +{ + struct fw_ptp_cmd c; + int err; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_PTP_CMD_PORTID_V(port)); + 
c.retval_len16 = cpu_to_be32(FW_CMD_LEN16_V(sizeof(c) / 16)); + c.u.init.sc = FW_PTP_SC_TX_TYPE; + c.u.init.mode = cpu_to_be16(PTP_TS_NONE); + + err = t4_wr_mbox(adapter, adapter->mbox, &c, sizeof(c), NULL); + if (err < 0) + dev_err(adapter->pdev_dev, + "PTP: %s error %d\n", __func__, -err); + + return err; +} + +int cxgb4_ptp_redirect_rx_packet(struct adapter *adapter, struct port_info *pi) +{ + struct sge *s = &adapter->sge; + struct sge_eth_rxq *receive_q = &s->ethrxq[pi->first_qset]; + struct fw_ptp_cmd c; + int err; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_PTP_CMD_PORTID_V(pi->port_id)); + + c.retval_len16 = cpu_to_be32(FW_CMD_LEN16_V(sizeof(c) / 16)); + c.u.init.sc = FW_PTP_SC_RDRX_TYPE; + c.u.init.txchan = pi->tx_chan; + c.u.init.absid = cpu_to_be16(receive_q->rspq.abs_id); + + err = t4_wr_mbox(adapter, adapter->mbox, &c, sizeof(c), NULL); + if (err < 0) + dev_err(adapter->pdev_dev, + "PTP: %s error %d\n", __func__, -err); + return err; +} + +/** + * @ptp: ptp clock structure + * @ppb: Desired frequency change in parts per billion + * + * Adjust the frequency of the PHC cycle counter by the indicated ppb from + * the base frequency. + */ +static int cxgb4_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb) +{ + struct adapter *adapter = (struct adapter *)container_of(ptp, + struct adapter, ptp_clock_info); + struct fw_ptp_cmd c; + int err; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_PTP_CMD_PORTID_V(0)); + c.retval_len16 = cpu_to_be32(FW_CMD_LEN16_V(sizeof(c) / 16)); + c.u.ts.sc = FW_PTP_SC_ADJ_FREQ; + c.u.ts.sign = (ppb < 0) ? 1 : 0; + if (ppb < 0) + ppb = -ppb; + c.u.ts.ppb = cpu_to_be32(ppb); + + err = t4_wr_mbox(adapter, adapter->mbox, &c, sizeof(c), NULL); + if (err < 0) + dev_err(adapter->pdev_dev, + "PTP: %s error %d\n", __func__, -err); + + return err; +} + +/** + * cxgb4_ptp_fineadjtime - Shift the time of the hardware clock + * @ptp: ptp clock structure + * @delta: Desired change in nanoseconds + * + * Adjust the timer by resetting the timecounter structure. + */ +static int cxgb4_ptp_fineadjtime(struct adapter *adapter, s64 delta) +{ + struct fw_ptp_cmd c; + int err; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_PTP_CMD_PORTID_V(0)); + c.retval_len16 = cpu_to_be32(FW_CMD_LEN16_V(sizeof(c) / 16)); + c.u.ts.sc = FW_PTP_SC_ADJ_FTIME; + c.u.ts.tm = cpu_to_be64(delta); + + err = t4_wr_mbox(adapter, adapter->mbox, &c, sizeof(c), NULL); + if (err < 0) + dev_err(adapter->pdev_dev, + "PTP: %s error %d\n", __func__, -err); + return err; +} + +/** + * cxgb4_ptp_adjtime - Shift the time of the hardware clock + * @ptp: ptp clock structure + * @delta: Desired change in nanoseconds + * + * Adjust the timer by resetting the timecounter structure. + */ +static int cxgb4_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + struct adapter *adapter = + (struct adapter *)container_of(ptp, struct adapter, + ptp_clock_info); + struct fw_ptp_cmd c; + s64 sign = 1; + int err; + + if (delta < 0) + sign = -1; + + if (delta * sign > PTP_CLOCK_MAX_ADJTIME) { + memset(&c, 0, sizeof(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_PTP_CMD_PORTID_V(0)); + c.retval_len16 = cpu_to_be32(FW_CMD_LEN16_V(sizeof(c) / 16)); + c.u.ts.sc = FW_PTP_SC_ADJ_TIME; + c.u.ts.sign = (delta < 0) ? 
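+ /* the firmware takes an unsigned magnitude plus a separate sign flag */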
1 : 0; + if (delta < 0) + delta = -delta; + c.u.ts.tm = cpu_to_be64(delta); + + err = t4_wr_mbox(adapter, adapter->mbox, &c, sizeof(c), NULL); + if (err < 0) + dev_err(adapter->pdev_dev, + "PTP: %s error %d\n", __func__, -err); + } else { + err = cxgb4_ptp_fineadjtime(adapter, delta); + } + + return err; +} + +/** + * cxgb4_ptp_gettime - Reads the current time from the hardware clock + * @ptp: ptp clock structure + * @ts: timespec structure to hold the current time value + * + * Read the timecounter and return the correct value in ns after converting + * it into a struct timespec. + */ +static int cxgb4_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) +{ + struct adapter *adapter = (struct adapter *)container_of(ptp, + struct adapter, ptp_clock_info); + struct fw_ptp_cmd c; + u64 ns; + int err; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_READ_F | + FW_PTP_CMD_PORTID_V(0)); + c.retval_len16 = cpu_to_be32(FW_CMD_LEN16_V(sizeof(c) / 16)); + c.u.ts.sc = FW_PTP_SC_GET_TIME; + + err = t4_wr_mbox(adapter, adapter->mbox, &c, sizeof(c), &c); + if (err < 0) { + dev_err(adapter->pdev_dev, + "PTP: %s error %d\n", __func__, -err); + return err; + } + + /* convert to timespec*/ + ns = be64_to_cpu(c.u.ts.tm); + *ts = ns_to_timespec64(ns); + + return err; +} + +/** + * cxgb4_ptp_settime - Set the current time on the hardware clock + * @ptp: ptp clock structure + * @ts: timespec containing the new time for the cycle counter + * + * Reset value to new base value instead of the kernel + * wall timer value. + */ +static int cxgb4_ptp_settime(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + struct adapter *adapter = (struct adapter *)container_of(ptp, + struct adapter, ptp_clock_info); + struct fw_ptp_cmd c; + u64 ns; + int err; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_PTP_CMD_PORTID_V(0)); + c.retval_len16 = cpu_to_be32(FW_CMD_LEN16_V(sizeof(c) / 16)); + c.u.ts.sc = FW_PTP_SC_SET_TIME; + + ns = timespec64_to_ns(ts); + c.u.ts.tm = cpu_to_be64(ns); + + err = t4_wr_mbox(adapter, adapter->mbox, &c, sizeof(c), NULL); + if (err < 0) + dev_err(adapter->pdev_dev, + "PTP: %s error %d\n", __func__, -err); + + return err; +} + +static void cxgb4_init_ptp_timer(struct adapter *adapter) +{ + struct fw_ptp_cmd c; + int err; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PTP_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_PTP_CMD_PORTID_V(0)); + c.retval_len16 = cpu_to_be32(FW_CMD_LEN16_V(sizeof(c) / 16)); + c.u.scmd.sc = FW_PTP_SC_INIT_TIMER; + + err = t4_wr_mbox(adapter, adapter->mbox, &c, sizeof(c), NULL); + if (err < 0) + dev_err(adapter->pdev_dev, + "PTP: %s error %d\n", __func__, -err); +} + +/** + * cxgb4_ptp_enable - enable or disable an ancillary feature + * @ptp: ptp clock structure + * @request: Desired resource to enable or disable + * @on: Caller passes one to enable or zero to disable + * + * Enable (or disable) ancillary features of the PHC subsystem. + * Currently, no ancillary features are supported. 
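+ * Every request is therefore rejected with -ENOTSUPP below.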
+ */ +static int cxgb4_ptp_enable(struct ptp_clock_info __always_unused *ptp, + struct ptp_clock_request __always_unused *request, + int __always_unused on) +{ + return -ENOTSUPP; +} + +static const struct ptp_clock_info cxgb4_ptp_clock_info = { + .owner = THIS_MODULE, + .name = "cxgb4_clock", + .max_adj = MAX_PTP_FREQ_ADJ, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .pps = 0, + .adjfreq = cxgb4_ptp_adjfreq, + .adjtime = cxgb4_ptp_adjtime, + .gettime64 = cxgb4_ptp_gettime, + .settime64 = cxgb4_ptp_settime, + .enable = cxgb4_ptp_enable, +}; + +/** + * cxgb4_ptp_init - initialize PTP for devices which support it + * @adapter: board private structure + * + * This function performs the required steps for enabling PTP support. + */ +void cxgb4_ptp_init(struct adapter *adapter) +{ + struct timespec64 now; + /* no need to create a clock device if we already have one */ + if (!IS_ERR_OR_NULL(adapter->ptp_clock)) + return; + + adapter->ptp_tx_skb = NULL; + adapter->ptp_clock_info = cxgb4_ptp_clock_info; + spin_lock_init(&adapter->ptp_lock); + + adapter->ptp_clock = ptp_clock_register(&adapter->ptp_clock_info, + &adapter->pdev->dev); + if (IS_ERR_OR_NULL(adapter->ptp_clock)) { + adapter->ptp_clock = NULL; + dev_err(adapter->pdev_dev, + "PTP %s Clock registration has failed\n", __func__); + return; + } + + now = ktime_to_timespec64(ktime_get_real()); + cxgb4_init_ptp_timer(adapter); + if (cxgb4_ptp_settime(&adapter->ptp_clock_info, &now) < 0) { + ptp_clock_unregister(adapter->ptp_clock); + adapter->ptp_clock = NULL; + } +} + +/** + * cxgb4_ptp_remove - disable PTP device and stop the overflow check + * @adapter: board private structure + * + * Stop the PTP support. + */ +void cxgb4_ptp_stop(struct adapter *adapter) +{ + if (adapter->ptp_tx_skb) { + dev_kfree_skb_any(adapter->ptp_tx_skb); + adapter->ptp_tx_skb = NULL; + } + + if (adapter->ptp_clock) { + ptp_clock_unregister(adapter->ptp_clock); + adapter->ptp_clock = NULL; + } +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.h new file mode 100644 index 0000000..cccfae8 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.h @@ -0,0 +1,74 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2017 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_PTP_H__ +#define __CXGB4_PTP_H__ + +/* Maximum parts-per-billion adjustment that is acceptable */ +#define MAX_PTP_FREQ_ADJ 1000000 +#define PTP_CLOCK_MAX_ADJTIME 10000000 /* 10 ms */ + +#define PTP_MIN_LENGTH 63 +#define PTP_IN_TRANSMIT_PACKET_MAXNUM 240 +#define PTP_EVENT_PORT 319 + +enum ptp_rx_filter_mode { + PTP_TS_NONE = 0, + PTP_TS_L2, + PTP_TS_L4, + PTP_TS_L2_L4 +}; + +struct port_info; + +static inline bool cxgb4_xmit_with_hwtstamp(struct sk_buff *skb) +{ + return skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP; +} + +static inline void cxgb4_xmit_hwtstamp_pending(struct sk_buff *skb) +{ + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; +} + +void cxgb4_ptp_init(struct adapter *adap); +void cxgb4_ptp_stop(struct adapter *adap); +bool cxgb4_ptp_is_ptp_tx(struct sk_buff *skb); +bool cxgb4_ptp_is_ptp_rx(struct sk_buff *skb); +int cxgb4_ptprx_timestamping(struct port_info *pi, u8 port, u16 mode); +int cxgb4_ptp_redirect_rx_packet(struct adapter *adap, struct port_info *pi); +int cxgb4_ptp_txtype(struct adapter *adap, u8 port_id); +void cxgb4_ptp_read_hwstamp(struct adapter *adap, struct port_info *pi); +bool is_ptp_enabled(struct sk_buff *skb, struct net_device *dev); +#endif /* __CXGB4_PTP_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c new file mode 100644 index 0000000..3656336 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -0,0 +1,876 @@ +/* + * This file is part of the Chelsio T4/T5/T6 Ethernet driver for Linux. + * + * Copyright (c) 2017 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include + +#include "cxgb4.h" +#include "cxgb4_filter.h" +#include "cxgb4_tc_flower.h" + +#define STATS_CHECK_PERIOD (HZ / 2) + +static struct ch_tc_pedit_fields pedits[] = { + PEDIT_FIELDS(ETH_, DMAC_31_0, 4, dmac, 0), + PEDIT_FIELDS(ETH_, DMAC_47_32, 2, dmac, 4), + PEDIT_FIELDS(ETH_, SMAC_15_0, 2, smac, 0), + PEDIT_FIELDS(ETH_, SMAC_47_16, 4, smac, 2), + PEDIT_FIELDS(IP4_, SRC, 4, nat_fip, 0), + PEDIT_FIELDS(IP4_, DST, 4, nat_lip, 0), + PEDIT_FIELDS(IP6_, SRC_31_0, 4, nat_fip, 0), + PEDIT_FIELDS(IP6_, SRC_63_32, 4, nat_fip, 4), + PEDIT_FIELDS(IP6_, SRC_95_64, 4, nat_fip, 8), + PEDIT_FIELDS(IP6_, SRC_127_96, 4, nat_fip, 12), + PEDIT_FIELDS(IP6_, DST_31_0, 4, nat_lip, 0), + PEDIT_FIELDS(IP6_, DST_63_32, 4, nat_lip, 4), + PEDIT_FIELDS(IP6_, DST_95_64, 4, nat_lip, 8), + PEDIT_FIELDS(IP6_, DST_127_96, 4, nat_lip, 12), + PEDIT_FIELDS(TCP_, SPORT, 2, nat_fport, 0), + PEDIT_FIELDS(TCP_, DPORT, 2, nat_lport, 0), + PEDIT_FIELDS(UDP_, SPORT, 2, nat_fport, 0), + PEDIT_FIELDS(UDP_, DPORT, 2, nat_lport, 0), +}; + +static struct ch_tc_flower_entry *allocate_flower_entry(void) +{ + struct ch_tc_flower_entry *new = kzalloc(sizeof(*new), GFP_KERNEL); + spin_lock_init(&new->lock); + return new; +} + +/* Must be called with either RTNL or rcu_read_lock */ +static struct ch_tc_flower_entry *ch_flower_lookup(struct adapter *adap, + unsigned long flower_cookie) +{ + return rhashtable_lookup_fast(&adap->flower_tbl, &flower_cookie, + adap->flower_ht_params); +} + +static void cxgb4_process_flow_match(struct net_device *dev, + struct tc_cls_flower_offload *cls, + struct ch_filter_specification *fs) +{ + u16 addr_type = 0; + + if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_CONTROL)) { + struct flow_dissector_key_control *key = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_CONTROL, + cls->key); + + addr_type = key->addr_type; + } + + if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_dissector_key_basic *key = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_BASIC, + cls->key); + struct flow_dissector_key_basic *mask = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_BASIC, + cls->mask); + u16 ethtype_key = ntohs(key->n_proto); + u16 ethtype_mask = ntohs(mask->n_proto); + + if (ethtype_key == ETH_P_ALL) { + ethtype_key = 0; + ethtype_mask = 0; + } + + if (ethtype_key == ETH_P_IPV6) + fs->type = 1; + + fs->val.ethtype = ethtype_key; + fs->mask.ethtype = ethtype_mask; + fs->val.proto = key->ip_proto; + fs->mask.proto = mask->ip_proto; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_dissector_key_ipv4_addrs *key = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + cls->key); + struct flow_dissector_key_ipv4_addrs *mask = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + cls->mask); + fs->type = 0; + memcpy(&fs->val.lip[0], &key->dst, sizeof(key->dst)); + memcpy(&fs->val.fip[0], &key->src, sizeof(key->src)); + memcpy(&fs->mask.lip[0], &mask->dst, sizeof(mask->dst)); + memcpy(&fs->mask.fip[0], &mask->src, sizeof(mask->src)); + + /* also initialize nat_lip/fip to same values */ + memcpy(&fs->nat_lip[0], &key->dst, sizeof(key->dst)); + memcpy(&fs->nat_fip[0], &key->src, sizeof(key->src)); + + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_dissector_key_ipv6_addrs *key = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + cls->key); + struct 
flow_dissector_key_ipv6_addrs *mask = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + cls->mask); + + fs->type = 1; + memcpy(&fs->val.lip[0], key->dst.s6_addr, sizeof(key->dst)); + memcpy(&fs->val.fip[0], key->src.s6_addr, sizeof(key->src)); + memcpy(&fs->mask.lip[0], mask->dst.s6_addr, sizeof(mask->dst)); + memcpy(&fs->mask.fip[0], mask->src.s6_addr, sizeof(mask->src)); + + /* also initialize nat_lip/fip to same values */ + memcpy(&fs->nat_lip[0], key->dst.s6_addr, sizeof(key->dst)); + memcpy(&fs->nat_fip[0], key->src.s6_addr, sizeof(key->src)); + } + + if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_PORTS)) { + struct flow_dissector_key_ports *key, *mask; + + key = skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_PORTS, + cls->key); + mask = skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_PORTS, + cls->mask); + fs->val.lport = cpu_to_be16(key->dst); + fs->mask.lport = cpu_to_be16(mask->dst); + fs->val.fport = cpu_to_be16(key->src); + fs->mask.fport = cpu_to_be16(mask->src); + + /* also initialize nat_lport/fport to same values */ + fs->nat_lport = cpu_to_be16(key->dst); + fs->nat_fport = cpu_to_be16(key->src); + } + + if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_IP)) { + struct flow_dissector_key_ip *key, *mask; + + key = skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IP, + cls->key); + mask = skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IP, + cls->mask); + fs->val.tos = key->tos; + fs->mask.tos = mask->tos; + } + + if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_VLAN)) { + struct flow_dissector_key_vlan *key, *mask; + u16 vlan_tci, vlan_tci_mask; + + key = skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_VLAN, + cls->key); + mask = skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_VLAN, + cls->mask); + vlan_tci = key->vlan_id | (key->vlan_priority << + VLAN_PRIO_SHIFT); + vlan_tci_mask = mask->vlan_id | (mask->vlan_priority << + VLAN_PRIO_SHIFT); + fs->val.ivlan = vlan_tci; + fs->mask.ivlan = vlan_tci_mask; + + /* Chelsio adapters use ivlan_vld bit to match vlan packets + * as 802.1Q. Also, when vlan tag is present in packets, + * ethtype match is used then to match on ethtype of inner + * header ie. the header following the vlan header. + * So, set the ivlan_vld based on ethtype info supplied by + * TC for vlan packets if its 802.1Q. And then reset the + * ethtype value else, hw will try to match the supplied + * ethtype value with ethtype of inner header. + */ + if (fs->val.ethtype == ETH_P_8021Q) { + fs->val.ivlan_vld = 1; + fs->mask.ivlan_vld = 1; + fs->val.ethtype = 0; + fs->mask.ethtype = 0; + } + } + + /* Match only packets coming from the ingress port where this + * filter will be created. 
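+	 *
+	 * Illustration only (interface name is hypothetical): a rule such
+	 * as
+	 *
+	 *	tc filter add dev eth0 ingress protocol ip flower \
+	 *		dst_ip 192.168.1.1 action drop
+	 *
+	 * therefore matches in hardware only traffic arriving on eth0's
+	 * port, even though the flower keys above carry no port at all.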
+ */ + fs->val.iport = netdev2pinfo(dev)->port_id; + fs->mask.iport = ~0; +} + +static int cxgb4_validate_flow_match(struct net_device *dev, + struct tc_cls_flower_offload *cls) +{ + u16 ethtype_mask = 0; + u16 ethtype_key = 0; + + if (cls->dissector->used_keys & + ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_PORTS) | + BIT(FLOW_DISSECTOR_KEY_VLAN) | + BIT(FLOW_DISSECTOR_KEY_IP))) { + netdev_warn(dev, "Unsupported key used: 0x%x\n", + cls->dissector->used_keys); + return -EOPNOTSUPP; + } + + if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_dissector_key_basic *key = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_BASIC, + cls->key); + struct flow_dissector_key_basic *mask = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_BASIC, + cls->mask); + ethtype_key = ntohs(key->n_proto); + ethtype_mask = ntohs(mask->n_proto); + } + + if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_IP)) { + u16 eth_ip_type = ethtype_key & ethtype_mask; + struct flow_dissector_key_ip *mask; + + if (eth_ip_type != ETH_P_IP && eth_ip_type != ETH_P_IPV6) { + netdev_err(dev, "IP Key supported only with IPv4/v6"); + return -EINVAL; + } + + mask = skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IP, + cls->mask); + if (mask->ttl) { + netdev_warn(dev, "ttl match unsupported for offload"); + return -EOPNOTSUPP; + } + } + + return 0; +} + +static void offload_pedit(struct ch_filter_specification *fs, u32 val, u32 mask, + u8 field) +{ + u32 set_val = val & ~mask; + u32 offset = 0; + u8 size = 1; + int i; + + for (i = 0; i < ARRAY_SIZE(pedits); i++) { + if (pedits[i].field == field) { + offset = pedits[i].offset; + size = pedits[i].size; + break; + } + } + memcpy((u8 *)fs + offset, &set_val, size); +} + +static void process_pedit_field(struct ch_filter_specification *fs, u32 val, + u32 mask, u32 offset, u8 htype) +{ + switch (htype) { + case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: + switch (offset) { + case PEDIT_ETH_DMAC_31_0: + fs->newdmac = 1; + offload_pedit(fs, val, mask, ETH_DMAC_31_0); + break; + case PEDIT_ETH_DMAC_47_32_SMAC_15_0: + if (~mask & PEDIT_ETH_DMAC_MASK) + offload_pedit(fs, val, mask, ETH_DMAC_47_32); + else + offload_pedit(fs, val >> 16, mask >> 16, + ETH_SMAC_15_0); + break; + case PEDIT_ETH_SMAC_47_16: + fs->newsmac = 1; + offload_pedit(fs, val, mask, ETH_SMAC_47_16); + } + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: + switch (offset) { + case PEDIT_IP4_SRC: + offload_pedit(fs, val, mask, IP4_SRC); + break; + case PEDIT_IP4_DST: + offload_pedit(fs, val, mask, IP4_DST); + } + fs->nat_mode = NAT_MODE_ALL; + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: + switch (offset) { + case PEDIT_IP6_SRC_31_0: + offload_pedit(fs, val, mask, IP6_SRC_31_0); + break; + case PEDIT_IP6_SRC_63_32: + offload_pedit(fs, val, mask, IP6_SRC_63_32); + break; + case PEDIT_IP6_SRC_95_64: + offload_pedit(fs, val, mask, IP6_SRC_95_64); + break; + case PEDIT_IP6_SRC_127_96: + offload_pedit(fs, val, mask, IP6_SRC_127_96); + break; + case PEDIT_IP6_DST_31_0: + offload_pedit(fs, val, mask, IP6_DST_31_0); + break; + case PEDIT_IP6_DST_63_32: + offload_pedit(fs, val, mask, IP6_DST_63_32); + break; + case PEDIT_IP6_DST_95_64: + offload_pedit(fs, val, mask, IP6_DST_95_64); + break; + case PEDIT_IP6_DST_127_96: + offload_pedit(fs, val, mask, IP6_DST_127_96); + } + fs->nat_mode = NAT_MODE_ALL; + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: 
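+		/* A TCP port rewrite arrives as one 32-bit pedit key that
+		 * covers both ports (PEDIT_TCP_SPORT_DPORT), with the half
+		 * that must not change protected by the key's mask; e.g. an
+		 * (illustrative) "pedit ex munge tcp dport set 8080" action
+		 * rewrites only the dport half.  Keys that touch both halves
+		 * were already rejected by valid_pedit_action().
+		 */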
+ switch (offset) { + case PEDIT_TCP_SPORT_DPORT: + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + offload_pedit(fs, cpu_to_be32(val) >> 16, + cpu_to_be32(mask) >> 16, + TCP_SPORT); + else + offload_pedit(fs, cpu_to_be32(val), + cpu_to_be32(mask), TCP_DPORT); + } + fs->nat_mode = NAT_MODE_ALL; + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: + switch (offset) { + case PEDIT_UDP_SPORT_DPORT: + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + offload_pedit(fs, cpu_to_be32(val) >> 16, + cpu_to_be32(mask) >> 16, + UDP_SPORT); + else + offload_pedit(fs, cpu_to_be32(val), + cpu_to_be32(mask), UDP_DPORT); + } + fs->nat_mode = NAT_MODE_ALL; + } +} + +static void cxgb4_process_flow_actions(struct net_device *in, + struct tc_cls_flower_offload *cls, + struct ch_filter_specification *fs) +{ + const struct tc_action *a; + LIST_HEAD(actions); + + tcf_exts_to_list(cls->exts, &actions); + list_for_each_entry(a, &actions, list) { + if (is_tcf_gact_ok(a)) { + fs->action = FILTER_PASS; + } else if (is_tcf_gact_shot(a)) { + fs->action = FILTER_DROP; + } else if (is_tcf_mirred_egress_redirect(a)) { + struct net_device *out = tcf_mirred_dev(a); + struct port_info *pi = netdev_priv(out); + + fs->action = FILTER_SWITCH; + fs->eport = pi->port_id; + } else if (is_tcf_vlan(a)) { + u32 vlan_action = tcf_vlan_action(a); + u8 prio = tcf_vlan_push_prio(a); + u16 vid = tcf_vlan_push_vid(a); + u16 vlan_tci = (prio << VLAN_PRIO_SHIFT) | vid; + + switch (vlan_action) { + case TCA_VLAN_ACT_POP: + fs->newvlan |= VLAN_REMOVE; + break; + case TCA_VLAN_ACT_PUSH: + fs->newvlan |= VLAN_INSERT; + fs->vlan = vlan_tci; + break; + case TCA_VLAN_ACT_MODIFY: + fs->newvlan |= VLAN_REWRITE; + fs->vlan = vlan_tci; + break; + default: + break; + } + } else if (is_tcf_pedit(a)) { + u32 mask, val, offset; + int nkeys, i; + u8 htype; + + nkeys = tcf_pedit_nkeys(a); + for (i = 0; i < nkeys; i++) { + htype = tcf_pedit_htype(a, i); + mask = tcf_pedit_mask(a, i); + val = tcf_pedit_val(a, i); + offset = tcf_pedit_offset(a, i); + + process_pedit_field(fs, val, mask, offset, + htype); + } + } + } +} + +static bool valid_l4_mask(u32 mask) +{ + u16 hi, lo; + + /* Either the upper 16-bits (SPORT) OR the lower + * 16-bits (DPORT) can be set, but NOT BOTH. + */ + hi = (mask >> 16) & 0xFFFF; + lo = mask & 0xFFFF; + + return hi && lo ? 
false : true; +} + +static bool valid_pedit_action(struct net_device *dev, + const struct tc_action *a) +{ + u32 mask, offset; + u8 cmd, htype; + int nkeys, i; + + nkeys = tcf_pedit_nkeys(a); + for (i = 0; i < nkeys; i++) { + htype = tcf_pedit_htype(a, i); + cmd = tcf_pedit_cmd(a, i); + mask = tcf_pedit_mask(a, i); + offset = tcf_pedit_offset(a, i); + + if (cmd != TCA_PEDIT_KEY_EX_CMD_SET) { + netdev_err(dev, "%s: Unsupported pedit cmd\n", + __func__); + return false; + } + + switch (htype) { + case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: + switch (offset) { + case PEDIT_ETH_DMAC_31_0: + case PEDIT_ETH_DMAC_47_32_SMAC_15_0: + case PEDIT_ETH_SMAC_47_16: + break; + default: + netdev_err(dev, "%s: Unsupported pedit field\n", + __func__); + return false; + } + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: + switch (offset) { + case PEDIT_IP4_SRC: + case PEDIT_IP4_DST: + break; + default: + netdev_err(dev, "%s: Unsupported pedit field\n", + __func__); + return false; + } + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: + switch (offset) { + case PEDIT_IP6_SRC_31_0: + case PEDIT_IP6_SRC_63_32: + case PEDIT_IP6_SRC_95_64: + case PEDIT_IP6_SRC_127_96: + case PEDIT_IP6_DST_31_0: + case PEDIT_IP6_DST_63_32: + case PEDIT_IP6_DST_95_64: + case PEDIT_IP6_DST_127_96: + break; + default: + netdev_err(dev, "%s: Unsupported pedit field\n", + __func__); + return false; + } + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: + switch (offset) { + case PEDIT_TCP_SPORT_DPORT: + if (!valid_l4_mask(~mask)) { + netdev_err(dev, "%s: Unsupported mask for TCP L4 ports\n", + __func__); + return false; + } + break; + default: + netdev_err(dev, "%s: Unsupported pedit field\n", + __func__); + return false; + } + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: + switch (offset) { + case PEDIT_UDP_SPORT_DPORT: + if (!valid_l4_mask(~mask)) { + netdev_err(dev, "%s: Unsupported mask for UDP L4 ports\n", + __func__); + return false; + } + break; + default: + netdev_err(dev, "%s: Unsupported pedit field\n", + __func__); + return false; + } + break; + default: + netdev_err(dev, "%s: Unsupported pedit type\n", + __func__); + return false; + } + } + return true; +} + +static int cxgb4_validate_flow_actions(struct net_device *dev, + struct tc_cls_flower_offload *cls) +{ + const struct tc_action *a; + bool act_redir = false; + bool act_pedit = false; + bool act_vlan = false; + LIST_HEAD(actions); + + tcf_exts_to_list(cls->exts, &actions); + list_for_each_entry(a, &actions, list) { + if (is_tcf_gact_ok(a)) { + /* Do nothing */ + } else if (is_tcf_gact_shot(a)) { + /* Do nothing */ + } else if (is_tcf_mirred_egress_redirect(a)) { + struct adapter *adap = netdev2adap(dev); + struct net_device *n_dev, *target_dev; + unsigned int i; + bool found = false; + + target_dev = tcf_mirred_dev(a); + for_each_port(adap, i) { + n_dev = adap->port[i]; + if (target_dev == n_dev) { + found = true; + break; + } + } + + /* If interface doesn't belong to our hw, then + * the provided output port is not valid + */ + if (!found) { + netdev_err(dev, "%s: Out port invalid\n", + __func__); + return -EINVAL; + } + act_redir = true; + } else if (is_tcf_vlan(a)) { + u16 proto = be16_to_cpu(tcf_vlan_push_proto(a)); + u32 vlan_action = tcf_vlan_action(a); + + switch (vlan_action) { + case TCA_VLAN_ACT_POP: + break; + case TCA_VLAN_ACT_PUSH: + case TCA_VLAN_ACT_MODIFY: + if (proto != ETH_P_8021Q) { + netdev_err(dev, "%s: Unsupported vlan proto\n", + __func__); + return -EOPNOTSUPP; + } + break; + default: + netdev_err(dev, "%s: Unsupported vlan action\n", + __func__); + return 
-EOPNOTSUPP; + } + act_vlan = true; + } else if (is_tcf_pedit(a)) { + bool pedit_valid = valid_pedit_action(dev, a); + + if (!pedit_valid) + return -EOPNOTSUPP; + act_pedit = true; + } else { + netdev_err(dev, "%s: Unsupported action\n", __func__); + return -EOPNOTSUPP; + } + } + + if ((act_pedit || act_vlan) && !act_redir) { + netdev_err(dev, "%s: pedit/vlan rewrite invalid without egress redirect\n", + __func__); + return -EINVAL; + } + + return 0; +} + +int cxgb4_tc_flower_replace(struct net_device *dev, + struct tc_cls_flower_offload *cls) +{ + struct adapter *adap = netdev2adap(dev); + struct ch_tc_flower_entry *ch_flower; + struct ch_filter_specification *fs; + struct filter_ctx ctx; + int fidx; + int ret; + + if (cxgb4_validate_flow_actions(dev, cls)) + return -EOPNOTSUPP; + + if (cxgb4_validate_flow_match(dev, cls)) + return -EOPNOTSUPP; + + ch_flower = allocate_flower_entry(); + if (!ch_flower) { + netdev_err(dev, "%s: ch_flower alloc failed.\n", __func__); + return -ENOMEM; + } + + fs = &ch_flower->fs; + fs->hitcnts = 1; + cxgb4_process_flow_match(dev, cls, fs); + cxgb4_process_flow_actions(dev, cls, fs); + + fs->hash = is_filter_exact_match(adap, fs); + if (fs->hash) { + fidx = 0; + } else { + fidx = cxgb4_get_free_ftid(dev, fs->type ? PF_INET6 : PF_INET); + if (fidx < 0) { + netdev_err(dev, "%s: No fidx for offload.\n", __func__); + ret = -ENOMEM; + goto free_entry; + } + } + + init_completion(&ctx.completion); + ret = __cxgb4_set_filter(dev, fidx, fs, &ctx); + if (ret) { + netdev_err(dev, "%s: filter creation err %d\n", + __func__, ret); + goto free_entry; + } + + /* Wait for reply */ + ret = wait_for_completion_timeout(&ctx.completion, 10 * HZ); + if (!ret) { + ret = -ETIMEDOUT; + goto free_entry; + } + + ret = ctx.result; + /* Check if hw returned error for filter creation */ + if (ret) { + netdev_err(dev, "%s: filter creation err %d\n", + __func__, ret); + goto free_entry; + } + + ch_flower->tc_flower_cookie = cls->cookie; + ch_flower->filter_id = ctx.tid; + ret = rhashtable_insert_fast(&adap->flower_tbl, &ch_flower->node, + adap->flower_ht_params); + if (ret) + goto del_filter; + + return 0; + +del_filter: + cxgb4_del_filter(dev, ch_flower->filter_id, &ch_flower->fs); + +free_entry: + kfree(ch_flower); + return ret; +} + +int cxgb4_tc_flower_destroy(struct net_device *dev, + struct tc_cls_flower_offload *cls) +{ + struct adapter *adap = netdev2adap(dev); + struct ch_tc_flower_entry *ch_flower; + int ret; + + ch_flower = ch_flower_lookup(adap, cls->cookie); + if (!ch_flower) + return -ENOENT; + + ret = cxgb4_del_filter(dev, ch_flower->filter_id, &ch_flower->fs); + if (ret) + goto err; + + ret = rhashtable_remove_fast(&adap->flower_tbl, &ch_flower->node, + adap->flower_ht_params); + if (ret) { + netdev_err(dev, "Flow remove from rhashtable failed"); + goto err; + } + kfree_rcu(ch_flower, rcu); + +err: + return ret; +} + +static void ch_flower_stats_handler(struct work_struct *work) +{ + struct adapter *adap = container_of(work, struct adapter, + flower_stats_work); + struct ch_tc_flower_entry *flower_entry; + struct ch_tc_flower_stats *ofld_stats; + struct rhashtable_iter iter; + u64 packets; + u64 bytes; + int ret; + + rhashtable_walk_enter(&adap->flower_tbl, &iter); + do { + rhashtable_walk_start(&iter); + + while ((flower_entry = rhashtable_walk_next(&iter)) && + !IS_ERR(flower_entry)) { + ret = cxgb4_get_filter_counters(adap->port[0], + flower_entry->filter_id, + &packets, &bytes, + flower_entry->fs.hash); + if (!ret) { + spin_lock(&flower_entry->lock); + ofld_stats = 
&flower_entry->stats; + + if (ofld_stats->prev_packet_count != packets) { + ofld_stats->prev_packet_count = packets; + ofld_stats->last_used = jiffies; + } + spin_unlock(&flower_entry->lock); + } + } + + rhashtable_walk_stop(&iter); + + } while (flower_entry == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); + mod_timer(&adap->flower_stats_timer, jiffies + STATS_CHECK_PERIOD); +} + +static void ch_flower_stats_cb(struct timer_list *t) +{ + struct adapter *adap = from_timer(adap, t, flower_stats_timer); + + schedule_work(&adap->flower_stats_work); +} + +int cxgb4_tc_flower_stats(struct net_device *dev, + struct tc_cls_flower_offload *cls) +{ + struct adapter *adap = netdev2adap(dev); + struct ch_tc_flower_stats *ofld_stats; + struct ch_tc_flower_entry *ch_flower; + u64 packets; + u64 bytes; + int ret; + + ch_flower = ch_flower_lookup(adap, cls->cookie); + if (!ch_flower) { + ret = -ENOENT; + goto err; + } + + ret = cxgb4_get_filter_counters(dev, ch_flower->filter_id, + &packets, &bytes, + ch_flower->fs.hash); + if (ret < 0) + goto err; + + spin_lock_bh(&ch_flower->lock); + ofld_stats = &ch_flower->stats; + if (ofld_stats->packet_count != packets) { + if (ofld_stats->prev_packet_count != packets) + ofld_stats->last_used = jiffies; + tcf_exts_stats_update(cls->exts, bytes - ofld_stats->byte_count, + packets - ofld_stats->packet_count, + ofld_stats->last_used); + + ofld_stats->packet_count = packets; + ofld_stats->byte_count = bytes; + ofld_stats->prev_packet_count = packets; + } + spin_unlock_bh(&ch_flower->lock); + return 0; + +err: + return ret; +} + +static const struct rhashtable_params cxgb4_tc_flower_ht_params = { + .nelem_hint = 384, + .head_offset = offsetof(struct ch_tc_flower_entry, node), + .key_offset = offsetof(struct ch_tc_flower_entry, tc_flower_cookie), + .key_len = sizeof(((struct ch_tc_flower_entry *)0)->tc_flower_cookie), + .max_size = 524288, + .min_size = 512, + .automatic_shrinking = true +}; + +int cxgb4_init_tc_flower(struct adapter *adap) +{ + int ret; + + adap->flower_ht_params = cxgb4_tc_flower_ht_params; + ret = rhashtable_init(&adap->flower_tbl, &adap->flower_ht_params); + if (ret) + return ret; + + INIT_WORK(&adap->flower_stats_work, ch_flower_stats_handler); + timer_setup(&adap->flower_stats_timer, ch_flower_stats_cb, 0); + mod_timer(&adap->flower_stats_timer, jiffies + STATS_CHECK_PERIOD); + return 0; +} + +void cxgb4_cleanup_tc_flower(struct adapter *adap) +{ + if (adap->flower_stats_timer.function) + del_timer_sync(&adap->flower_stats_timer); + cancel_work_sync(&adap->flower_stats_work); + rhashtable_destroy(&adap->flower_tbl); +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h new file mode 100644 index 0000000..050c8a5 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h @@ -0,0 +1,120 @@ +/* + * This file is part of the Chelsio T4/T5/T6 Ethernet driver for Linux. + * + * Copyright (c) 2017 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_TC_FLOWER_H +#define __CXGB4_TC_FLOWER_H + +#include + +struct ch_tc_flower_stats { + u64 prev_packet_count; + u64 packet_count; + u64 byte_count; + u64 last_used; +}; + +struct ch_tc_flower_entry { + struct ch_filter_specification fs; + struct ch_tc_flower_stats stats; + unsigned long tc_flower_cookie; + struct rhash_head node; + struct rcu_head rcu; + spinlock_t lock; /* lock for stats */ + u32 filter_id; +}; + +enum { + ETH_DMAC_31_0, /* dmac bits 0.. 31 */ + ETH_DMAC_47_32, /* dmac bits 32..47 */ + ETH_SMAC_15_0, /* smac bits 0.. 15 */ + ETH_SMAC_47_16, /* smac bits 16..47 */ + + IP4_SRC, /* 32-bit IPv4 src */ + IP4_DST, /* 32-bit IPv4 dst */ + + IP6_SRC_31_0, /* src bits 0.. 31 */ + IP6_SRC_63_32, /* src bits 63.. 32 */ + IP6_SRC_95_64, /* src bits 95.. 64 */ + IP6_SRC_127_96, /* src bits 127..96 */ + + IP6_DST_31_0, /* dst bits 0.. 31 */ + IP6_DST_63_32, /* dst bits 63.. 32 */ + IP6_DST_95_64, /* dst bits 95.. 
64 */ + IP6_DST_127_96, /* dst bits 127..96 */ + + TCP_SPORT, /* 16-bit TCP sport */ + TCP_DPORT, /* 16-bit TCP dport */ + + UDP_SPORT, /* 16-bit UDP sport */ + UDP_DPORT, /* 16-bit UDP dport */ +}; + +struct ch_tc_pedit_fields { + u8 field; + u8 size; + u32 offset; +}; + +#define PEDIT_FIELDS(type, field, size, fs_field, offset) \ + { type## field, size, \ + offsetof(struct ch_filter_specification, fs_field) + (offset) } + +#define PEDIT_ETH_DMAC_MASK 0xffff +#define PEDIT_TCP_UDP_SPORT_MASK 0xffff +#define PEDIT_ETH_DMAC_31_0 0x0 +#define PEDIT_ETH_DMAC_47_32_SMAC_15_0 0x4 +#define PEDIT_ETH_SMAC_47_16 0x8 +#define PEDIT_IP4_SRC 0xC +#define PEDIT_IP4_DST 0x10 +#define PEDIT_IP6_SRC_31_0 0x8 +#define PEDIT_IP6_SRC_63_32 0xC +#define PEDIT_IP6_SRC_95_64 0x10 +#define PEDIT_IP6_SRC_127_96 0x14 +#define PEDIT_IP6_DST_31_0 0x18 +#define PEDIT_IP6_DST_63_32 0x1C +#define PEDIT_IP6_DST_95_64 0x20 +#define PEDIT_IP6_DST_127_96 0x24 +#define PEDIT_TCP_SPORT_DPORT 0x0 +#define PEDIT_UDP_SPORT_DPORT 0x0 + +int cxgb4_tc_flower_replace(struct net_device *dev, + struct tc_cls_flower_offload *cls); +int cxgb4_tc_flower_destroy(struct net_device *dev, + struct tc_cls_flower_offload *cls); +int cxgb4_tc_flower_stats(struct net_device *dev, + struct tc_cls_flower_offload *cls); + +int cxgb4_init_tc_flower(struct adapter *adap); +void cxgb4_cleanup_tc_flower(struct adapter *adap); +#endif /* __CXGB4_TC_FLOWER_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c new file mode 100644 index 0000000..ab174bc --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c @@ -0,0 +1,480 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "cxgb4.h" +#include "cxgb4_tc_u32_parse.h" +#include "cxgb4_tc_u32.h" + +/* Fill ch_filter_specification with parsed match value/mask pair. 
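+ *
+ * Each u32 key is a 32-bit value/mask pair taken at a byte offset into
+ * the IPv4/IPv6 header (or into the next header, for linked buckets),
+ * and is dispatched to the handler registered for that offset in the
+ * cxgb4_match_field table.  Purely as an illustration, a
+ * "match ip dst 10.0.0.1/32" u32 rule reaches this function as a key
+ * equivalent to
+ *
+ *	struct tc_u32_key key = {
+ *		.off  = 16,
+ *		.val  = htonl(0x0a000001),
+ *		.mask = htonl(0xffffffff),
+ *	};
+ *
+ * where offset 16 is the IPv4 destination address, so the entry with
+ * .off == 16 is selected and cxgb4_fill_ipv4_dst_ip() fills in
+ * fs->val.lip / fs->mask.lip.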
*/ +static int fill_match_fields(struct adapter *adap, + struct ch_filter_specification *fs, + struct tc_cls_u32_offload *cls, + const struct cxgb4_match_field *entry, + bool next_header) +{ + unsigned int i, j; + u32 val, mask; + int off, err; + bool found; + + for (i = 0; i < cls->knode.sel->nkeys; i++) { + off = cls->knode.sel->keys[i].off; + val = cls->knode.sel->keys[i].val; + mask = cls->knode.sel->keys[i].mask; + + if (next_header) { + /* For next headers, parse only keys with offmask */ + if (!cls->knode.sel->keys[i].offmask) + continue; + } else { + /* For the remaining, parse only keys without offmask */ + if (cls->knode.sel->keys[i].offmask) + continue; + } + + found = false; + + for (j = 0; entry[j].val; j++) { + if (off == entry[j].off) { + found = true; + err = entry[j].val(fs, val, mask); + if (err) + return err; + break; + } + } + + if (!found) + return -EINVAL; + } + + return 0; +} + +/* Fill ch_filter_specification with parsed action. */ +static int fill_action_fields(struct adapter *adap, + struct ch_filter_specification *fs, + struct tc_cls_u32_offload *cls) +{ + unsigned int num_actions = 0; + const struct tc_action *a; + struct tcf_exts *exts; + LIST_HEAD(actions); + + exts = cls->knode.exts; + if (!tcf_exts_has_actions(exts)) + return -EINVAL; + + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + /* Don't allow more than one action per rule. */ + if (num_actions) + return -EINVAL; + + /* Drop in hardware. */ + if (is_tcf_gact_shot(a)) { + fs->action = FILTER_DROP; + num_actions++; + continue; + } + + /* Re-direct to specified port in hardware. */ + if (is_tcf_mirred_egress_redirect(a)) { + struct net_device *n_dev, *target_dev; + bool found = false; + unsigned int i; + + target_dev = tcf_mirred_dev(a); + for_each_port(adap, i) { + n_dev = adap->port[i]; + if (target_dev == n_dev) { + fs->action = FILTER_SWITCH; + fs->eport = i; + found = true; + break; + } + } + + /* Interface doesn't belong to any port of + * the underlying hardware. + */ + if (!found) + return -EINVAL; + + num_actions++; + continue; + } + + /* Un-supported action. */ + return -EINVAL; + } + + return 0; +} + +int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) +{ + const struct cxgb4_match_field *start, *link_start = NULL; + struct adapter *adapter = netdev2adap(dev); + __be16 protocol = cls->common.protocol; + struct ch_filter_specification fs; + struct cxgb4_tc_u32_table *t; + struct cxgb4_link *link; + unsigned int filter_id; + u32 uhtid, link_uhtid; + bool is_ipv6 = false; + int ret; + + if (!can_tc_u32_offload(dev)) + return -EOPNOTSUPP; + + if (protocol != htons(ETH_P_IP) && protocol != htons(ETH_P_IPV6)) + return -EOPNOTSUPP; + + /* Fetch the location to insert the filter. */ + filter_id = cls->knode.handle & 0xFFFFF; + + if (filter_id > adapter->tids.nftids) { + dev_err(adapter->pdev_dev, + "Location %d out of range for insertion. Max: %d\n", + filter_id, adapter->tids.nftids); + return -ERANGE; + } + + t = adapter->tc_u32; + uhtid = TC_U32_USERHTID(cls->knode.handle); + link_uhtid = TC_U32_USERHTID(cls->knode.link_handle); + + /* Ensure that uhtid is either root u32 (i.e. 0x800) + * or a a valid linked bucket. + */ + if (uhtid != 0x800 && uhtid >= t->size) + return -EINVAL; + + /* Ensure link handle uhtid is sane, if specified. 
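+	 * Worked example (handles purely illustrative): a knode handle of
+	 * 0x80000010 gives TC_U32_USERHTID() == 0x800 (root hash table)
+	 * and filter_id == 0x10, while a handle of 0x00100005 gives
+	 * uhtid == 1 and must therefore refer to a link previously
+	 * recorded in t->table[0].  The same encoding applies to the
+	 * link_uhtid checked below.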
*/ + if (link_uhtid >= t->size) + return -EINVAL; + + memset(&fs, 0, sizeof(fs)); + + if (protocol == htons(ETH_P_IPV6)) { + start = cxgb4_ipv6_fields; + is_ipv6 = true; + } else { + start = cxgb4_ipv4_fields; + is_ipv6 = false; + } + + if (uhtid != 0x800) { + /* Link must exist from root node before insertion. */ + if (!t->table[uhtid - 1].link_handle) + return -EINVAL; + + /* Link must have a valid supported next header. */ + link_start = t->table[uhtid - 1].match_field; + if (!link_start) + return -EINVAL; + } + + /* Parse links and record them for subsequent jumps to valid + * next headers. + */ + if (link_uhtid) { + const struct cxgb4_next_header *next; + bool found = false; + unsigned int i, j; + u32 val, mask; + int off; + + if (t->table[link_uhtid - 1].link_handle) { + dev_err(adapter->pdev_dev, + "Link handle exists for: 0x%x\n", + link_uhtid); + return -EINVAL; + } + + next = is_ipv6 ? cxgb4_ipv6_jumps : cxgb4_ipv4_jumps; + + /* Try to find matches that allow jumps to next header. */ + for (i = 0; next[i].jump; i++) { + if (next[i].offoff != cls->knode.sel->offoff || + next[i].shift != cls->knode.sel->offshift || + next[i].mask != cls->knode.sel->offmask || + next[i].offset != cls->knode.sel->off) + continue; + + /* Found a possible candidate. Find a key that + * matches the corresponding offset, value, and + * mask to jump to next header. + */ + for (j = 0; j < cls->knode.sel->nkeys; j++) { + off = cls->knode.sel->keys[j].off; + val = cls->knode.sel->keys[j].val; + mask = cls->knode.sel->keys[j].mask; + + if (next[i].match_off == off && + next[i].match_val == val && + next[i].match_mask == mask) { + found = true; + break; + } + } + + if (!found) + continue; /* Try next candidate. */ + + /* Candidate to jump to next header found. + * Translate all keys to internal specification + * and store them in jump table. This spec is copied + * later to set the actual filters. + */ + ret = fill_match_fields(adapter, &fs, cls, + start, false); + if (ret) + goto out; + + link = &t->table[link_uhtid - 1]; + link->match_field = next[i].jump; + link->link_handle = cls->knode.handle; + memcpy(&link->fs, &fs, sizeof(fs)); + break; + } + + /* No candidate found to jump to next header. */ + if (!found) + return -EINVAL; + + return 0; + } + + /* Fill ch_filter_specification match fields to be shipped to hardware. + * Copy the linked spec (if any) first. And then update the spec as + * needed. + */ + if (uhtid != 0x800 && t->table[uhtid - 1].link_handle) { + /* Copy linked ch_filter_specification */ + memcpy(&fs, &t->table[uhtid - 1].fs, sizeof(fs)); + ret = fill_match_fields(adapter, &fs, cls, + link_start, true); + if (ret) + goto out; + } + + ret = fill_match_fields(adapter, &fs, cls, start, false); + if (ret) + goto out; + + /* Fill ch_filter_specification action fields to be shipped to + * hardware. + */ + ret = fill_action_fields(adapter, &fs, cls); + if (ret) + goto out; + + /* The filter spec has been completely built from the info + * provided from u32. We now set some default fields in the + * spec for sanity. + */ + + /* Match only packets coming from the ingress port where this + * filter will be created. + */ + fs.val.iport = netdev2pinfo(dev)->port_id; + fs.mask.iport = ~0; + + /* Enable filter hit counts. */ + fs.hitcnts = 1; + + /* Set type of filter - IPv6 or IPv4 */ + fs.type = is_ipv6 ? 
1 : 0; + + /* Set the filter */ + ret = cxgb4_set_filter(dev, filter_id, &fs); + if (ret) + goto out; + + /* If this is a linked bucket, then set the corresponding + * entry in the bitmap to mark it as belonging to this linked + * bucket. + */ + if (uhtid != 0x800 && t->table[uhtid - 1].link_handle) + set_bit(filter_id, t->table[uhtid - 1].tid_map); + +out: + return ret; +} + +int cxgb4_delete_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) +{ + struct adapter *adapter = netdev2adap(dev); + unsigned int filter_id, max_tids, i, j; + struct cxgb4_link *link = NULL; + struct cxgb4_tc_u32_table *t; + u32 handle, uhtid; + int ret; + + if (!can_tc_u32_offload(dev)) + return -EOPNOTSUPP; + + /* Fetch the location to delete the filter. */ + filter_id = cls->knode.handle & 0xFFFFF; + + if (filter_id > adapter->tids.nftids) { + dev_err(adapter->pdev_dev, + "Location %d out of range for deletion. Max: %d\n", + filter_id, adapter->tids.nftids); + return -ERANGE; + } + + t = adapter->tc_u32; + handle = cls->knode.handle; + uhtid = TC_U32_USERHTID(cls->knode.handle); + + /* Ensure that uhtid is either root u32 (i.e. 0x800) + * or a a valid linked bucket. + */ + if (uhtid != 0x800 && uhtid >= t->size) + return -EINVAL; + + /* Delete the specified filter */ + if (uhtid != 0x800) { + link = &t->table[uhtid - 1]; + if (!link->link_handle) + return -EINVAL; + + if (!test_bit(filter_id, link->tid_map)) + return -EINVAL; + } + + ret = cxgb4_del_filter(dev, filter_id, NULL); + if (ret) + goto out; + + if (link) + clear_bit(filter_id, link->tid_map); + + /* If a link is being deleted, then delete all filters + * associated with the link. + */ + max_tids = adapter->tids.nftids; + for (i = 0; i < t->size; i++) { + link = &t->table[i]; + + if (link->link_handle == handle) { + for (j = 0; j < max_tids; j++) { + if (!test_bit(j, link->tid_map)) + continue; + + ret = __cxgb4_del_filter(dev, j, NULL, NULL); + if (ret) + goto out; + + clear_bit(j, link->tid_map); + } + + /* Clear the link state */ + link->match_field = NULL; + link->link_handle = 0; + memset(&link->fs, 0, sizeof(link->fs)); + break; + } + } + +out: + return ret; +} + +void cxgb4_cleanup_tc_u32(struct adapter *adap) +{ + struct cxgb4_tc_u32_table *t; + unsigned int i; + + if (!adap->tc_u32) + return; + + /* Free up all allocated memory. 
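+	 * The layout torn down here mirrors what cxgb4_init_tc_u32()
+	 * allocated: one cxgb4_tc_u32_table header followed by t->size
+	 * cxgb4_link slots, each owning a separately allocated tid_map
+	 * bitmap of adap->tids.nftids bits.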
*/ + t = adap->tc_u32; + for (i = 0; i < t->size; i++) { + struct cxgb4_link *link = &t->table[i]; + + kvfree(link->tid_map); + } + kvfree(adap->tc_u32); +} + +struct cxgb4_tc_u32_table *cxgb4_init_tc_u32(struct adapter *adap) +{ + unsigned int max_tids = adap->tids.nftids; + struct cxgb4_tc_u32_table *t; + unsigned int i; + + if (!max_tids) + return NULL; + + t = kvzalloc(sizeof(*t) + + (max_tids * sizeof(struct cxgb4_link)), GFP_KERNEL); + if (!t) + return NULL; + + t->size = max_tids; + + for (i = 0; i < t->size; i++) { + struct cxgb4_link *link = &t->table[i]; + unsigned int bmap_size; + + bmap_size = BITS_TO_LONGS(max_tids); + link->tid_map = kvzalloc(sizeof(unsigned long) * bmap_size, GFP_KERNEL); + if (!link->tid_map) + goto out_no_mem; + bitmap_zero(link->tid_map, max_tids); + } + + return t; + +out_no_mem: + for (i = 0; i < t->size; i++) { + struct cxgb4_link *link = &t->table[i]; + + if (link->tid_map) + kvfree(link->tid_map); + } + + if (t) + kvfree(t); + + return NULL; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h new file mode 100644 index 0000000..70a07b7 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h @@ -0,0 +1,52 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_TC_U32_H +#define __CXGB4_TC_U32_H + +#include + +static inline bool can_tc_u32_offload(struct net_device *dev) +{ + struct adapter *adap = netdev2adap(dev); + + return (dev->features & NETIF_F_HW_TC) && adap->tc_u32 ? 
true : false; +} + +int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls); +int cxgb4_delete_knode(struct net_device *dev, struct tc_cls_u32_offload *cls); + +void cxgb4_cleanup_tc_u32(struct adapter *adapter); +struct cxgb4_tc_u32_table *cxgb4_init_tc_u32(struct adapter *adap); +#endif /* __CXGB4_TC_U32_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h new file mode 100644 index 0000000..a4b99ed --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h @@ -0,0 +1,294 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __CXGB4_TC_U32_PARSE_H +#define __CXGB4_TC_U32_PARSE_H + +struct cxgb4_match_field { + int off; /* Offset from the beginning of the header to match */ + /* Fill the value/mask pair in the spec if matched */ + int (*val)(struct ch_filter_specification *f, u32 val, u32 mask); +}; + +/* IPv4 match fields */ +static inline int cxgb4_fill_ipv4_tos(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + f->val.tos = (ntohl(val) >> 16) & 0x000000FF; + f->mask.tos = (ntohl(mask) >> 16) & 0x000000FF; + + return 0; +} + +static inline int cxgb4_fill_ipv4_frag(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + u32 mask_val; + u8 frag_val; + + frag_val = (ntohl(val) >> 13) & 0x00000007; + mask_val = ntohl(mask) & 0x0000FFFF; + + if (frag_val == 0x1 && mask_val != 0x3FFF) { /* MF set */ + f->val.frag = 1; + f->mask.frag = 1; + } else if (frag_val == 0x2 && mask_val != 0x3FFF) { /* DF set */ + f->val.frag = 0; + f->mask.frag = 1; + } else { + return -EINVAL; + } + + return 0; +} + +static inline int cxgb4_fill_ipv4_proto(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + f->val.proto = (ntohl(val) >> 16) & 0x000000FF; + f->mask.proto = (ntohl(mask) >> 16) & 0x000000FF; + + return 0; +} + +static inline int cxgb4_fill_ipv4_src_ip(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + memcpy(&f->val.fip[0], &val, sizeof(u32)); + memcpy(&f->mask.fip[0], &mask, sizeof(u32)); + + return 0; +} + +static inline int cxgb4_fill_ipv4_dst_ip(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + memcpy(&f->val.lip[0], &val, sizeof(u32)); + memcpy(&f->mask.lip[0], &mask, sizeof(u32)); + + return 0; +} + +static const struct cxgb4_match_field cxgb4_ipv4_fields[] = { + { .off = 0, .val = cxgb4_fill_ipv4_tos }, + { .off = 4, .val = cxgb4_fill_ipv4_frag }, + { .off = 8, .val = cxgb4_fill_ipv4_proto }, + { .off = 12, .val = cxgb4_fill_ipv4_src_ip }, + { .off = 16, .val = cxgb4_fill_ipv4_dst_ip }, + { .val = NULL } +}; + +/* IPv6 match fields */ +static inline int cxgb4_fill_ipv6_tos(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + f->val.tos = (ntohl(val) >> 20) & 0x000000FF; + f->mask.tos = (ntohl(mask) >> 20) & 0x000000FF; + + return 0; +} + +static inline int cxgb4_fill_ipv6_proto(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + f->val.proto = (ntohl(val) >> 8) & 0x000000FF; + f->mask.proto = (ntohl(mask) >> 8) & 0x000000FF; + + return 0; +} + +static inline int cxgb4_fill_ipv6_src_ip0(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + memcpy(&f->val.fip[0], &val, sizeof(u32)); + memcpy(&f->mask.fip[0], &mask, sizeof(u32)); + + return 0; +} + +static inline int cxgb4_fill_ipv6_src_ip1(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + memcpy(&f->val.fip[4], &val, sizeof(u32)); + memcpy(&f->mask.fip[4], &mask, sizeof(u32)); + + return 0; +} + +static inline int cxgb4_fill_ipv6_src_ip2(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + memcpy(&f->val.fip[8], &val, sizeof(u32)); + memcpy(&f->mask.fip[8], &mask, sizeof(u32)); + + return 0; +} + +static inline int cxgb4_fill_ipv6_src_ip3(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + memcpy(&f->val.fip[12], &val, sizeof(u32)); + memcpy(&f->mask.fip[12], &mask, sizeof(u32)); + + return 0; +} + +static inline int cxgb4_fill_ipv6_dst_ip0(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + memcpy(&f->val.lip[0], &val, sizeof(u32)); + memcpy(&f->mask.lip[0], &mask, sizeof(u32)); + + return 0; +} + +static inline int cxgb4_fill_ipv6_dst_ip1(struct 
ch_filter_specification *f, + u32 val, u32 mask) +{ + memcpy(&f->val.lip[4], &val, sizeof(u32)); + memcpy(&f->mask.lip[4], &mask, sizeof(u32)); + + return 0; +} + +static inline int cxgb4_fill_ipv6_dst_ip2(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + memcpy(&f->val.lip[8], &val, sizeof(u32)); + memcpy(&f->mask.lip[8], &mask, sizeof(u32)); + + return 0; +} + +static inline int cxgb4_fill_ipv6_dst_ip3(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + memcpy(&f->val.lip[12], &val, sizeof(u32)); + memcpy(&f->mask.lip[12], &mask, sizeof(u32)); + + return 0; +} + +static const struct cxgb4_match_field cxgb4_ipv6_fields[] = { + { .off = 0, .val = cxgb4_fill_ipv6_tos }, + { .off = 4, .val = cxgb4_fill_ipv6_proto }, + { .off = 8, .val = cxgb4_fill_ipv6_src_ip0 }, + { .off = 12, .val = cxgb4_fill_ipv6_src_ip1 }, + { .off = 16, .val = cxgb4_fill_ipv6_src_ip2 }, + { .off = 20, .val = cxgb4_fill_ipv6_src_ip3 }, + { .off = 24, .val = cxgb4_fill_ipv6_dst_ip0 }, + { .off = 28, .val = cxgb4_fill_ipv6_dst_ip1 }, + { .off = 32, .val = cxgb4_fill_ipv6_dst_ip2 }, + { .off = 36, .val = cxgb4_fill_ipv6_dst_ip3 }, + { .val = NULL } +}; + +/* TCP/UDP match */ +static inline int cxgb4_fill_l4_ports(struct ch_filter_specification *f, + u32 val, u32 mask) +{ + f->val.fport = ntohl(val) >> 16; + f->mask.fport = ntohl(mask) >> 16; + f->val.lport = ntohl(val) & 0x0000FFFF; + f->mask.lport = ntohl(mask) & 0x0000FFFF; + + return 0; +}; + +static const struct cxgb4_match_field cxgb4_tcp_fields[] = { + { .off = 0, .val = cxgb4_fill_l4_ports }, + { .val = NULL } +}; + +static const struct cxgb4_match_field cxgb4_udp_fields[] = { + { .off = 0, .val = cxgb4_fill_l4_ports }, + { .val = NULL } +}; + +struct cxgb4_next_header { + unsigned int offset; /* Offset to next header */ + /* offset, shift, and mask added to offset above + * to get to next header. Useful when using a header + * field's value to jump to next header such as IHL field + * in IPv4 header. + */ + unsigned int offoff; + u32 shift; + u32 mask; + /* match criteria to make this jump */ + unsigned int match_off; + u32 match_val; + u32 match_mask; + /* location of jump to make */ + const struct cxgb4_match_field *jump; +}; + +/* Accept a rule with a jump to transport layer header based on IHL field in + * IPv4 header. + */ +static const struct cxgb4_next_header cxgb4_ipv4_jumps[] = { + { .offset = 0, .offoff = 0, .shift = 6, .mask = 0xF, + .match_off = 8, .match_val = 0x600, .match_mask = 0xFF00, + .jump = cxgb4_tcp_fields }, + { .offset = 0, .offoff = 0, .shift = 6, .mask = 0xF, + .match_off = 8, .match_val = 0x1100, .match_mask = 0xFF00, + .jump = cxgb4_udp_fields }, + { .jump = NULL } +}; + +/* Accept a rule with a jump directly past the 40 Bytes of IPv6 fixed header + * to get to transport layer header. 
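+ *
+ * Sketch only (exact tc syntax may differ): this is the jump produced
+ * by a classic two-level u32 setup along the lines of
+ *
+ *	tc filter add dev eth0 ingress protocol ipv6 u32 \
+ *		link 1: offset plus 40 match u8 6 0xff at 6
+ *
+ * where byte 6 of the IPv6 header is the Next Header field (6 = TCP,
+ * 0x11 = UDP) and the fixed "plus 40" offset skips straight to the
+ * transport header.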
+ */ +static const struct cxgb4_next_header cxgb4_ipv6_jumps[] = { + { .offset = 0x28, .offoff = 0, .shift = 0, .mask = 0, + .match_off = 4, .match_val = 0x60000, .match_mask = 0xFF0000, + .jump = cxgb4_tcp_fields }, + { .offset = 0x28, .offoff = 0, .shift = 0, .mask = 0, + .match_off = 4, .match_val = 0x110000, .match_mask = 0xFF0000, + .jump = cxgb4_udp_fields }, + { .jump = NULL } +}; + +struct cxgb4_link { + const struct cxgb4_match_field *match_field; /* Next header */ + struct ch_filter_specification fs; /* Match spec associated with link */ + u32 link_handle; /* Knode handle associated with the link */ + unsigned long *tid_map; /* Bitmap for filter tids */ +}; + +struct cxgb4_tc_u32_table { + unsigned int size; /* number of entries in table */ + struct cxgb4_link table[0]; /* Jump table */ +}; +#endif /* __CXGB4_TC_U32_PARSE_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c new file mode 100644 index 0000000..a95cde0 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c @@ -0,0 +1,812 @@ +/* + * cxgb4_uld.c:Chelsio Upper Layer Driver Interface for T4/T5/T6 SGE management + * + * Copyright (c) 2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Written by: Atul Gupta (atul.gupta@chelsio.com) + * Written by: Hariprasad Shenai (hariprasad@chelsio.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cxgb4.h" +#include "cxgb4_uld.h" +#include "t4_regs.h" +#include "t4fw_api.h" +#include "t4_msg.h" + +#define for_each_uldrxq(m, i) for (i = 0; i < ((m)->nrxq + (m)->nciq); i++) + +static int get_msix_idx_from_bmap(struct adapter *adap) +{ + struct uld_msix_bmap *bmap = &adap->msix_bmap_ulds; + unsigned long flags; + unsigned int msix_idx; + + spin_lock_irqsave(&bmap->lock, flags); + msix_idx = find_first_zero_bit(bmap->msix_bmap, bmap->mapsize); + if (msix_idx < bmap->mapsize) { + __set_bit(msix_idx, bmap->msix_bmap); + } else { + spin_unlock_irqrestore(&bmap->lock, flags); + return -ENOSPC; + } + + spin_unlock_irqrestore(&bmap->lock, flags); + return msix_idx; +} + +static void free_msix_idx_in_bmap(struct adapter *adap, unsigned int msix_idx) +{ + struct uld_msix_bmap *bmap = &adap->msix_bmap_ulds; + unsigned long flags; + + spin_lock_irqsave(&bmap->lock, flags); + __clear_bit(msix_idx, bmap->msix_bmap); + spin_unlock_irqrestore(&bmap->lock, flags); +} + +/* Flush the aggregated lro sessions */ +static void uldrx_flush_handler(struct sge_rspq *q) +{ + struct adapter *adap = q->adap; + + if (adap->uld[q->uld].lro_flush) + adap->uld[q->uld].lro_flush(&q->lro_mgr); +} + +/** + * uldrx_handler - response queue handler for ULD queues + * @q: the response queue that received the packet + * @rsp: the response queue descriptor holding the offload message + * @gl: the gather list of packet fragments + * + * Deliver an ingress offload packet to a ULD. All processing is done by + * the ULD, we just maintain statistics. + */ +static int uldrx_handler(struct sge_rspq *q, const __be64 *rsp, + const struct pkt_gl *gl) +{ + struct adapter *adap = q->adap; + struct sge_ofld_rxq *rxq = container_of(q, struct sge_ofld_rxq, rspq); + int ret; + + /* FW can send CPLs encapsulated in a CPL_FW4_MSG */ + if (((const struct rss_header *)rsp)->opcode == CPL_FW4_MSG && + ((const struct cpl_fw4_msg *)(rsp + 1))->type == FW_TYPE_RSSCPL) + rsp += 2; + + if (q->flush_handler) + ret = adap->uld[q->uld].lro_rx_handler(adap->uld[q->uld].handle, + rsp, gl, &q->lro_mgr, + &q->napi); + else + ret = adap->uld[q->uld].rx_handler(adap->uld[q->uld].handle, + rsp, gl); + + if (ret) { + rxq->stats.nomem++; + return -1; + } + + if (!gl) + rxq->stats.imm++; + else if (gl == CXGB4_MSG_AN) + rxq->stats.an++; + else + rxq->stats.pkts++; + return 0; +} + +static int alloc_uld_rxqs(struct adapter *adap, + struct sge_uld_rxq_info *rxq_info, bool lro) +{ + struct sge *s = &adap->sge; + unsigned int nq = rxq_info->nrxq + rxq_info->nciq; + struct sge_ofld_rxq *q = rxq_info->uldrxq; + unsigned short *ids = rxq_info->rspq_id; + unsigned int bmap_idx = 0; + unsigned int per_chan; + int i, err, msi_idx, que_idx = 0; + + per_chan = rxq_info->nrxq / adap->params.nports; + + if (adap->flags & USING_MSIX) + msi_idx = 1; + else + msi_idx = -((int)s->intrq.abs_id + 1); + + for (i = 0; i < nq; i++, q++) { + if (i == rxq_info->nrxq) { + /* start allocation of concentrator queues */ + per_chan = rxq_info->nciq / adap->params.nports; + que_idx = 0; + } + + if (msi_idx >= 0) { + bmap_idx = get_msix_idx_from_bmap(adap); + msi_idx = adap->msix_info_ulds[bmap_idx].idx; + } + err = t4_sge_alloc_rxq(adap, &q->rspq, false, + adap->port[que_idx++ / per_chan], + msi_idx, + q->fl.size ? &q->fl : NULL, + uldrx_handler, + lro ? 
uldrx_flush_handler : NULL, + 0); + if (err) + goto freeout; + if (msi_idx >= 0) + rxq_info->msix_tbl[i] = bmap_idx; + memset(&q->stats, 0, sizeof(q->stats)); + if (ids) + ids[i] = q->rspq.abs_id; + } + return 0; +freeout: + q = rxq_info->uldrxq; + for ( ; i; i--, q++) { + if (q->rspq.desc) + free_rspq_fl(adap, &q->rspq, + q->fl.size ? &q->fl : NULL); + } + return err; +} + +static int +setup_sge_queues_uld(struct adapter *adap, unsigned int uld_type, bool lro) +{ + struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type]; + int i, ret = 0; + + if (adap->flags & USING_MSIX) { + rxq_info->msix_tbl = kcalloc((rxq_info->nrxq + rxq_info->nciq), + sizeof(unsigned short), + GFP_KERNEL); + if (!rxq_info->msix_tbl) + return -ENOMEM; + } + + ret = !(!alloc_uld_rxqs(adap, rxq_info, lro)); + + /* Tell uP to route control queue completions to rdma rspq */ + if (adap->flags & FULL_INIT_DONE && + !ret && uld_type == CXGB4_ULD_RDMA) { + struct sge *s = &adap->sge; + unsigned int cmplqid; + u32 param, cmdop; + + cmdop = FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL; + for_each_port(adap, i) { + cmplqid = rxq_info->uldrxq[i].rspq.cntxt_id; + param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) | + FW_PARAMS_PARAM_X_V(cmdop) | + FW_PARAMS_PARAM_YZ_V(s->ctrlq[i].q.cntxt_id)); + ret = t4_set_params(adap, adap->mbox, adap->pf, + 0, 1, ¶m, &cmplqid); + } + } + return ret; +} + +static void t4_free_uld_rxqs(struct adapter *adap, int n, + struct sge_ofld_rxq *q) +{ + for ( ; n; n--, q++) { + if (q->rspq.desc) + free_rspq_fl(adap, &q->rspq, + q->fl.size ? &q->fl : NULL); + } +} + +static void free_sge_queues_uld(struct adapter *adap, unsigned int uld_type) +{ + struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type]; + + if (adap->flags & FULL_INIT_DONE && uld_type == CXGB4_ULD_RDMA) { + struct sge *s = &adap->sge; + u32 param, cmdop, cmplqid = 0; + int i; + + cmdop = FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL; + for_each_port(adap, i) { + param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) | + FW_PARAMS_PARAM_X_V(cmdop) | + FW_PARAMS_PARAM_YZ_V(s->ctrlq[i].q.cntxt_id)); + t4_set_params(adap, adap->mbox, adap->pf, + 0, 1, ¶m, &cmplqid); + } + } + + if (rxq_info->nciq) + t4_free_uld_rxqs(adap, rxq_info->nciq, + rxq_info->uldrxq + rxq_info->nrxq); + t4_free_uld_rxqs(adap, rxq_info->nrxq, rxq_info->uldrxq); + if (adap->flags & USING_MSIX) + kfree(rxq_info->msix_tbl); +} + +static int cfg_queues_uld(struct adapter *adap, unsigned int uld_type, + const struct cxgb4_uld_info *uld_info) +{ + struct sge *s = &adap->sge; + struct sge_uld_rxq_info *rxq_info; + int i, nrxq, ciq_size; + + rxq_info = kzalloc(sizeof(*rxq_info), GFP_KERNEL); + if (!rxq_info) + return -ENOMEM; + + if (adap->flags & USING_MSIX && uld_info->nrxq > s->nqs_per_uld) { + i = s->nqs_per_uld; + rxq_info->nrxq = roundup(i, adap->params.nports); + } else { + i = min_t(int, uld_info->nrxq, + num_online_cpus()); + rxq_info->nrxq = roundup(i, adap->params.nports); + } + if (!uld_info->ciq) { + rxq_info->nciq = 0; + } else { + if (adap->flags & USING_MSIX) + rxq_info->nciq = min_t(int, s->nqs_per_uld, + num_online_cpus()); + else + rxq_info->nciq = min_t(int, MAX_OFLD_QSETS, + num_online_cpus()); + rxq_info->nciq = ((rxq_info->nciq / adap->params.nports) * + adap->params.nports); + rxq_info->nciq = max_t(int, rxq_info->nciq, + adap->params.nports); + } + + nrxq = rxq_info->nrxq + rxq_info->nciq; /* total rxq's */ + rxq_info->uldrxq = kcalloc(nrxq, sizeof(struct sge_ofld_rxq), + GFP_KERNEL); + if (!rxq_info->uldrxq) { + kfree(rxq_info); + return -ENOMEM; + } + 
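The queue-count arithmetic buried in cfg_queues_uld() above is the part most readers trip over; as a minimal standalone sketch of the same rule for the non-MSI-X case (the helper name is invented for illustration):

#include <linux/kernel.h>	/* min_t(), roundup() */
#include <linux/cpumask.h>	/* num_online_cpus() */

/*
 * Hypothetical restatement of the rxq sizing rule: cap the ULD's request at
 * the number of online CPUs, then round up to a multiple of the port count
 * so every port ends up with an equal share of queues.
 */
static unsigned int uld_nrxq_sketch(unsigned int requested, unsigned int nports)
{
	unsigned int n = min_t(unsigned int, requested, num_online_cpus());

	return roundup(n, nports);	/* e.g. 10 queues, 4 ports -> 12 */
}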
+ rxq_info->rspq_id = kcalloc(nrxq, sizeof(unsigned short), GFP_KERNEL); + if (!rxq_info->rspq_id) { + kfree(rxq_info->uldrxq); + kfree(rxq_info); + return -ENOMEM; + } + + for (i = 0; i < rxq_info->nrxq; i++) { + struct sge_ofld_rxq *r = &rxq_info->uldrxq[i]; + + init_rspq(adap, &r->rspq, 5, 1, uld_info->rxq_size, 64); + r->rspq.uld = uld_type; + r->fl.size = 72; + } + + ciq_size = 64 + adap->vres.cq.size + adap->tids.nftids; + if (ciq_size > SGE_MAX_IQ_SIZE) { + dev_warn(adap->pdev_dev, "CIQ size too small for available IQs\n"); + ciq_size = SGE_MAX_IQ_SIZE; + } + + for (i = rxq_info->nrxq; i < nrxq; i++) { + struct sge_ofld_rxq *r = &rxq_info->uldrxq[i]; + + init_rspq(adap, &r->rspq, 5, 1, ciq_size, 64); + r->rspq.uld = uld_type; + } + + memcpy(rxq_info->name, uld_info->name, IFNAMSIZ); + adap->sge.uld_rxq_info[uld_type] = rxq_info; + + return 0; +} + +static void free_queues_uld(struct adapter *adap, unsigned int uld_type) +{ + struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type]; + + adap->sge.uld_rxq_info[uld_type] = NULL; + kfree(rxq_info->rspq_id); + kfree(rxq_info->uldrxq); + kfree(rxq_info); +} + +static int +request_msix_queue_irqs_uld(struct adapter *adap, unsigned int uld_type) +{ + struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type]; + int err = 0; + unsigned int idx, bmap_idx; + + for_each_uldrxq(rxq_info, idx) { + bmap_idx = rxq_info->msix_tbl[idx]; + err = request_irq(adap->msix_info_ulds[bmap_idx].vec, + t4_sge_intr_msix, 0, + adap->msix_info_ulds[bmap_idx].desc, + &rxq_info->uldrxq[idx].rspq); + if (err) + goto unwind; + } + return 0; +unwind: + while (idx-- > 0) { + bmap_idx = rxq_info->msix_tbl[idx]; + free_msix_idx_in_bmap(adap, bmap_idx); + free_irq(adap->msix_info_ulds[bmap_idx].vec, + &rxq_info->uldrxq[idx].rspq); + } + return err; +} + +static void +free_msix_queue_irqs_uld(struct adapter *adap, unsigned int uld_type) +{ + struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type]; + unsigned int idx, bmap_idx; + + for_each_uldrxq(rxq_info, idx) { + bmap_idx = rxq_info->msix_tbl[idx]; + + free_msix_idx_in_bmap(adap, bmap_idx); + free_irq(adap->msix_info_ulds[bmap_idx].vec, + &rxq_info->uldrxq[idx].rspq); + } +} + +static void name_msix_vecs_uld(struct adapter *adap, unsigned int uld_type) +{ + struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type]; + int n = sizeof(adap->msix_info_ulds[0].desc); + unsigned int idx, bmap_idx; + + for_each_uldrxq(rxq_info, idx) { + bmap_idx = rxq_info->msix_tbl[idx]; + + snprintf(adap->msix_info_ulds[bmap_idx].desc, n, "%s-%s%d", + adap->port[0]->name, rxq_info->name, idx); + } +} + +static void enable_rx(struct adapter *adap, struct sge_rspq *q) +{ + if (!q) + return; + + if (q->handler) + napi_enable(&q->napi); + + /* 0-increment GTS to start the timer and enable interrupts */ + t4_write_reg(adap, MYPF_REG(SGE_PF_GTS_A), + SEINTARM_V(q->intr_params) | + INGRESSQID_V(q->cntxt_id)); +} + +static void quiesce_rx(struct adapter *adap, struct sge_rspq *q) +{ + if (q && q->handler) + napi_disable(&q->napi); +} + +static void enable_rx_uld(struct adapter *adap, unsigned int uld_type) +{ + struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type]; + int idx; + + for_each_uldrxq(rxq_info, idx) + enable_rx(adap, &rxq_info->uldrxq[idx].rspq); +} + +static void quiesce_rx_uld(struct adapter *adap, unsigned int uld_type) +{ + struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type]; + int idx; + + for_each_uldrxq(rxq_info, idx) + quiesce_rx(adap, 
&rxq_info->uldrxq[idx].rspq); +} + +static void +free_sge_txq_uld(struct adapter *adap, struct sge_uld_txq_info *txq_info) +{ + int nq = txq_info->ntxq; + int i; + + for (i = 0; i < nq; i++) { + struct sge_uld_txq *txq = &txq_info->uldtxq[i]; + + if (txq && txq->q.desc) { + tasklet_kill(&txq->qresume_tsk); + t4_ofld_eq_free(adap, adap->mbox, adap->pf, 0, + txq->q.cntxt_id); + free_tx_desc(adap, &txq->q, txq->q.in_use, false); + kfree(txq->q.sdesc); + __skb_queue_purge(&txq->sendq); + free_txq(adap, &txq->q); + } + } +} + +static int +alloc_sge_txq_uld(struct adapter *adap, struct sge_uld_txq_info *txq_info, + unsigned int uld_type) +{ + struct sge *s = &adap->sge; + int nq = txq_info->ntxq; + int i, j, err; + + j = nq / adap->params.nports; + for (i = 0; i < nq; i++) { + struct sge_uld_txq *txq = &txq_info->uldtxq[i]; + + txq->q.size = 1024; + err = t4_sge_alloc_uld_txq(adap, txq, adap->port[i / j], + s->fw_evtq.cntxt_id, uld_type); + if (err) + goto freeout; + } + return 0; +freeout: + free_sge_txq_uld(adap, txq_info); + return err; +} + +static void +release_sge_txq_uld(struct adapter *adap, unsigned int uld_type) +{ + struct sge_uld_txq_info *txq_info = NULL; + int tx_uld_type = TX_ULD(uld_type); + + txq_info = adap->sge.uld_txq_info[tx_uld_type]; + + if (txq_info && atomic_dec_and_test(&txq_info->users)) { + free_sge_txq_uld(adap, txq_info); + kfree(txq_info->uldtxq); + kfree(txq_info); + adap->sge.uld_txq_info[tx_uld_type] = NULL; + } +} + +static int +setup_sge_txq_uld(struct adapter *adap, unsigned int uld_type, + const struct cxgb4_uld_info *uld_info) +{ + struct sge_uld_txq_info *txq_info = NULL; + int tx_uld_type, i; + + tx_uld_type = TX_ULD(uld_type); + txq_info = adap->sge.uld_txq_info[tx_uld_type]; + + if ((tx_uld_type == CXGB4_TX_OFLD) && txq_info && + (atomic_inc_return(&txq_info->users) > 1)) + return 0; + + txq_info = kzalloc(sizeof(*txq_info), GFP_KERNEL); + if (!txq_info) + return -ENOMEM; + + i = min_t(int, uld_info->ntxq, num_online_cpus()); + txq_info->ntxq = roundup(i, adap->params.nports); + + txq_info->uldtxq = kcalloc(txq_info->ntxq, sizeof(struct sge_uld_txq), + GFP_KERNEL); + if (!txq_info->uldtxq) { + kfree(txq_info); + return -ENOMEM; + } + + if (alloc_sge_txq_uld(adap, txq_info, tx_uld_type)) { + kfree(txq_info->uldtxq); + kfree(txq_info); + return -ENOMEM; + } + + atomic_inc(&txq_info->users); + adap->sge.uld_txq_info[tx_uld_type] = txq_info; + return 0; +} + +static void uld_queue_init(struct adapter *adap, unsigned int uld_type, + struct cxgb4_lld_info *lli) +{ + struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type]; + + lli->rxq_ids = rxq_info->rspq_id; + lli->nrxq = rxq_info->nrxq; + lli->ciq_ids = rxq_info->rspq_id + rxq_info->nrxq; + lli->nciq = rxq_info->nciq; +} + +int t4_uld_mem_alloc(struct adapter *adap) +{ + struct sge *s = &adap->sge; + + adap->uld = kcalloc(CXGB4_ULD_MAX, sizeof(*adap->uld), GFP_KERNEL); + if (!adap->uld) + return -ENOMEM; + + s->uld_rxq_info = kzalloc(CXGB4_ULD_MAX * + sizeof(struct sge_uld_rxq_info *), + GFP_KERNEL); + if (!s->uld_rxq_info) + goto err_uld; + + s->uld_txq_info = kzalloc(CXGB4_TX_MAX * + sizeof(struct sge_uld_txq_info *), + GFP_KERNEL); + if (!s->uld_txq_info) + goto err_uld_rx; + return 0; + +err_uld_rx: + kfree(s->uld_rxq_info); +err_uld: + kfree(adap->uld); + return -ENOMEM; +} + +void t4_uld_mem_free(struct adapter *adap) +{ + struct sge *s = &adap->sge; + + kfree(s->uld_txq_info); + kfree(s->uld_rxq_info); + kfree(adap->uld); +} + +/* This function should be called with uld_mutex taken. 
*/ +static void cxgb4_shutdown_uld_adapter(struct adapter *adap, enum cxgb4_uld type) +{ + if (adap->uld[type].handle) { + adap->uld[type].handle = NULL; + adap->uld[type].add = NULL; + release_sge_txq_uld(adap, type); + + if (adap->flags & FULL_INIT_DONE) + quiesce_rx_uld(adap, type); + + if (adap->flags & USING_MSIX) + free_msix_queue_irqs_uld(adap, type); + + free_sge_queues_uld(adap, type); + free_queues_uld(adap, type); + } +} + +void t4_uld_clean_up(struct adapter *adap) +{ + unsigned int i; + + mutex_lock(&uld_mutex); + for (i = 0; i < CXGB4_ULD_MAX; i++) { + if (!adap->uld[i].handle) + continue; + + cxgb4_shutdown_uld_adapter(adap, i); + } + mutex_unlock(&uld_mutex); +} + +static void uld_init(struct adapter *adap, struct cxgb4_lld_info *lld) +{ + int i; + + lld->pdev = adap->pdev; + lld->pf = adap->pf; + lld->l2t = adap->l2t; + lld->tids = &adap->tids; + lld->ports = adap->port; + lld->vr = &adap->vres; + lld->mtus = adap->params.mtus; + lld->ntxq = adap->sge.ofldqsets; + lld->nchan = adap->params.nports; + lld->nports = adap->params.nports; + lld->wr_cred = adap->params.ofldq_wr_cred; + lld->crypto = adap->params.crypto; + lld->iscsi_iolen = MAXRXDATA_G(t4_read_reg(adap, TP_PARA_REG2_A)); + lld->iscsi_tagmask = t4_read_reg(adap, ULP_RX_ISCSI_TAGMASK_A); + lld->iscsi_pgsz_order = t4_read_reg(adap, ULP_RX_ISCSI_PSZ_A); + lld->iscsi_llimit = t4_read_reg(adap, ULP_RX_ISCSI_LLIMIT_A); + lld->iscsi_ppm = &adap->iscsi_ppm; + lld->adapter_type = adap->params.chip; + lld->cclk_ps = 1000000000 / adap->params.vpd.cclk; + lld->udb_density = 1 << adap->params.sge.eq_qpp; + lld->ucq_density = 1 << adap->params.sge.iq_qpp; + lld->filt_mode = adap->params.tp.vlan_pri_map; + /* MODQ_REQ_MAP sets queues 0-3 to chan 0-3 */ + for (i = 0; i < NCHAN; i++) + lld->tx_modq[i] = i; + lld->gts_reg = adap->regs + MYPF_REG(SGE_PF_GTS_A); + lld->db_reg = adap->regs + MYPF_REG(SGE_PF_KDOORBELL_A); + lld->fw_vers = adap->params.fw_vers; + lld->dbfifo_int_thresh = dbfifo_int_thresh; + lld->sge_ingpadboundary = adap->sge.fl_align; + lld->sge_egrstatuspagesize = adap->sge.stat_len; + lld->sge_pktshift = adap->sge.pktshift; + lld->ulp_crypto = adap->params.crypto; + lld->enable_fw_ofld_conn = adap->flags & FW_OFLD_CONN; + lld->max_ordird_qp = adap->params.max_ordird_qp; + lld->max_ird_adapter = adap->params.max_ird_adapter; + lld->ulptx_memwrite_dsgl = adap->params.ulptx_memwrite_dsgl; + lld->nodeid = dev_to_node(adap->pdev_dev); + lld->fr_nsmr_tpte_wr_support = adap->params.fr_nsmr_tpte_wr_support; + lld->write_w_imm_support = adap->params.write_w_imm_support; + lld->write_cmpl_support = adap->params.write_cmpl_support; +} + +static void uld_attach(struct adapter *adap, unsigned int uld) +{ + void *handle; + struct cxgb4_lld_info lli; + + uld_init(adap, &lli); + uld_queue_init(adap, uld, &lli); + + handle = adap->uld[uld].add(&lli); + if (IS_ERR(handle)) { + dev_warn(adap->pdev_dev, + "could not attach to the %s driver, error %ld\n", + adap->uld[uld].name, PTR_ERR(handle)); + return; + } + + adap->uld[uld].handle = handle; + t4_register_netevent_notifier(); + + if (adap->flags & FULL_INIT_DONE) + adap->uld[uld].state_change(handle, CXGB4_STATE_UP); +} + +/** + * cxgb4_register_uld - register an upper-layer driver + * @type: the ULD type + * @p: the ULD methods + * + * Registers an upper-layer driver with this driver and notifies the ULD + * about any presently available devices that support its type. Returns + * %-EBUSY if a ULD of the same type is already registered. 
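For context, this is roughly what the other side of the interface looks like: a minimal, hypothetical ULD module (all my_uld_* names and the choice of the CXGB4_ULD_ISCSI slot are illustrative only) that fills in a cxgb4_uld_info and registers it.

#include <linux/module.h>
#include "cxgb4_uld.h"

static void *my_uld_add(const struct cxgb4_lld_info *lld)
{
	/* per-adapter setup; the pointer returned here becomes the ULD handle */
	pr_info("my_uld: attached, %u rx queues, %u tx queues\n",
		lld->nrxq, lld->ntxq);
	return (void *)lld->pdev;	/* any non-NULL, non-ERR_PTR cookie */
}

static int my_uld_rx_handler(void *handle, const __be64 *rsp,
			     const struct pkt_gl *gl)
{
	return 0;	/* 0 = consumed; non-zero is counted as nomem above */
}

static int my_uld_state_change(void *handle, enum cxgb4_state new_state)
{
	return 0;
}

static struct cxgb4_uld_info my_uld_info = {
	.name		= "my_uld",
	.nrxq		= MAX_ULD_QSETS,
	.ntxq		= MAX_ULD_QSETS,
	.rxq_size	= 1024,
	.lro		= false,
	.add		= my_uld_add,
	.rx_handler	= my_uld_rx_handler,
	.state_change	= my_uld_state_change,
};

static int __init my_uld_init(void)
{
	/* -EBUSY if another ULD already owns this slot */
	return cxgb4_register_uld(CXGB4_ULD_ISCSI, &my_uld_info);
}

static void __exit my_uld_exit(void)
{
	cxgb4_unregister_uld(CXGB4_ULD_ISCSI);
}

module_init(my_uld_init);
module_exit(my_uld_exit);
MODULE_LICENSE("Dual BSD/GPL");

uld_attach() above then calls .add() once per matching adapter and, once the adapter is FULL_INIT_DONE, reports CXGB4_STATE_UP through .state_change().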
+ */ +int cxgb4_register_uld(enum cxgb4_uld type, + const struct cxgb4_uld_info *p) +{ + int ret = 0; + unsigned int adap_idx = 0; + struct adapter *adap; + + if (type >= CXGB4_ULD_MAX) + return -EINVAL; + + mutex_lock(&uld_mutex); + list_for_each_entry(adap, &adapter_list, list_node) { + if ((type == CXGB4_ULD_CRYPTO && !is_pci_uld(adap)) || + (type != CXGB4_ULD_CRYPTO && !is_offload(adap))) + continue; + if (type == CXGB4_ULD_ISCSIT && is_t4(adap->params.chip)) + continue; + ret = cfg_queues_uld(adap, type, p); + if (ret) + goto out; + ret = setup_sge_queues_uld(adap, type, p->lro); + if (ret) + goto free_queues; + if (adap->flags & USING_MSIX) { + name_msix_vecs_uld(adap, type); + ret = request_msix_queue_irqs_uld(adap, type); + if (ret) + goto free_rxq; + } + if (adap->flags & FULL_INIT_DONE) + enable_rx_uld(adap, type); + if (adap->uld[type].add) { + ret = -EBUSY; + goto free_irq; + } + ret = setup_sge_txq_uld(adap, type, p); + if (ret) + goto free_irq; + adap->uld[type] = *p; + uld_attach(adap, type); + adap_idx++; + } + mutex_unlock(&uld_mutex); + return 0; + +free_irq: + if (adap->flags & FULL_INIT_DONE) + quiesce_rx_uld(adap, type); + if (adap->flags & USING_MSIX) + free_msix_queue_irqs_uld(adap, type); +free_rxq: + free_sge_queues_uld(adap, type); +free_queues: + free_queues_uld(adap, type); +out: + + list_for_each_entry(adap, &adapter_list, list_node) { + if ((type == CXGB4_ULD_CRYPTO && !is_pci_uld(adap)) || + (type != CXGB4_ULD_CRYPTO && !is_offload(adap))) + continue; + if (type == CXGB4_ULD_ISCSIT && is_t4(adap->params.chip)) + continue; + if (!adap_idx) + break; + adap->uld[type].handle = NULL; + adap->uld[type].add = NULL; + release_sge_txq_uld(adap, type); + if (adap->flags & FULL_INIT_DONE) + quiesce_rx_uld(adap, type); + if (adap->flags & USING_MSIX) + free_msix_queue_irqs_uld(adap, type); + free_sge_queues_uld(adap, type); + free_queues_uld(adap, type); + adap_idx--; + } + mutex_unlock(&uld_mutex); + return ret; +} +EXPORT_SYMBOL(cxgb4_register_uld); + +/** + * cxgb4_unregister_uld - unregister an upper-layer driver + * @type: the ULD type + * + * Unregisters an existing upper-layer driver. + */ +int cxgb4_unregister_uld(enum cxgb4_uld type) +{ + struct adapter *adap; + + if (type >= CXGB4_ULD_MAX) + return -EINVAL; + + mutex_lock(&uld_mutex); + list_for_each_entry(adap, &adapter_list, list_node) { + if ((type == CXGB4_ULD_CRYPTO && !is_pci_uld(adap)) || + (type != CXGB4_ULD_CRYPTO && !is_offload(adap))) + continue; + if (type == CXGB4_ULD_ISCSIT && is_t4(adap->params.chip)) + continue; + + cxgb4_shutdown_uld_adapter(adap, type); + } + mutex_unlock(&uld_mutex); + + return 0; +} +EXPORT_SYMBOL(cxgb4_unregister_uld); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h new file mode 100644 index 0000000..de9ad31 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -0,0 +1,424 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_ULD_H +#define __CXGB4_ULD_H + +#include +#include +#include +#include +#include +#include "cxgb4.h" + +#define MAX_ULD_QSETS 16 + +/* CPL message priority levels */ +enum { + CPL_PRIORITY_DATA = 0, /* data messages */ + CPL_PRIORITY_SETUP = 1, /* connection setup messages */ + CPL_PRIORITY_TEARDOWN = 0, /* connection teardown messages */ + CPL_PRIORITY_LISTEN = 1, /* listen start/stop messages */ + CPL_PRIORITY_ACK = 1, /* RX ACK messages */ + CPL_PRIORITY_CONTROL = 1 /* control messages */ +}; + +#define INIT_TP_WR(w, tid) do { \ + (w)->wr.wr_hi = htonl(FW_WR_OP_V(FW_TP_WR) | \ + FW_WR_IMMDLEN_V(sizeof(*w) - sizeof(w->wr))); \ + (w)->wr.wr_mid = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*w), 16)) | \ + FW_WR_FLOWID_V(tid)); \ + (w)->wr.wr_lo = cpu_to_be64(0); \ +} while (0) + +#define INIT_TP_WR_CPL(w, cpl, tid) do { \ + INIT_TP_WR(w, tid); \ + OPCODE_TID(w) = htonl(MK_OPCODE_TID(cpl, tid)); \ +} while (0) + +#define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \ + (w)->wr.wr_hi = htonl(FW_WR_OP_V(FW_ULPTX_WR) | \ + FW_WR_ATOMIC_V(atomic)); \ + (w)->wr.wr_mid = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(wrlen, 16)) | \ + FW_WR_FLOWID_V(tid)); \ + (w)->wr.wr_lo = cpu_to_be64(0); \ +} while (0) + +/* Special asynchronous notification message */ +#define CXGB4_MSG_AN ((void *)1) +#define TX_ULD(uld)(((uld) != CXGB4_ULD_CRYPTO) ? CXGB4_TX_OFLD :\ + CXGB4_TX_CRYPTO) + +struct serv_entry { + void *data; +}; + +union aopen_entry { + void *data; + union aopen_entry *next; +}; + +/* + * Holds the size, base address, free list start, etc of the TID, server TID, + * and active-open TID tables. The tables themselves are allocated dynamically. 
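As a sketch of how the WR/CPL helpers above are typically used (the function name is hypothetical, and the choice of CPL_TID_RELEASE from t4_msg.h is only an example), a ULD builds the CPL in an skb, tags the skb with a priority/queue via set_wr_txq(), and hands it to cxgb4_ofld_send():

#include <linux/skbuff.h>
#include "cxgb4_uld.h"
#include "t4_msg.h"	/* cpl_tid_release, CPL_TID_RELEASE */
#include "t4fw_api.h"	/* FW_* field macros used by INIT_TP_WR() */

static int my_release_tid_sketch(struct net_device *dev, unsigned int tid)
{
	struct cpl_tid_release *req;
	struct sk_buff *skb;

	skb = alloc_skb(sizeof(*req), GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	req = __skb_put(skb, sizeof(*req));
	/* TP work request header plus CPL opcode/TID in one step */
	INIT_TP_WR_CPL(req, CPL_TID_RELEASE, tid);
	set_wr_txq(skb, CPL_PRIORITY_SETUP, 0);	/* offload queue 0 */
	return cxgb4_ofld_send(dev, skb);
}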
+ */ +struct tid_info { + void **tid_tab; + unsigned int ntids; + + struct serv_entry *stid_tab; + unsigned long *stid_bmap; + unsigned int nstids; + unsigned int stid_base; + unsigned int hash_base; + + union aopen_entry *atid_tab; + unsigned int natids; + unsigned int atid_base; + + struct filter_entry *ftid_tab; + unsigned long *ftid_bmap; + unsigned int nftids; + unsigned int ftid_base; + unsigned int aftid_base; + unsigned int aftid_end; + /* Server filter region */ + unsigned int sftid_base; + unsigned int nsftids; + + spinlock_t atid_lock ____cacheline_aligned_in_smp; + union aopen_entry *afree; + unsigned int atids_in_use; + + spinlock_t stid_lock; + unsigned int stids_in_use; + unsigned int v6_stids_in_use; + unsigned int sftids_in_use; + + /* TIDs in the TCAM */ + atomic_t tids_in_use; + /* TIDs in the HASH */ + atomic_t hash_tids_in_use; + atomic_t conns_in_use; + /* lock for setting/clearing filter bitmap */ + spinlock_t ftid_lock; +}; + +static inline void *lookup_tid(const struct tid_info *t, unsigned int tid) +{ + return tid < t->ntids ? t->tid_tab[tid] : NULL; +} + +static inline void *lookup_atid(const struct tid_info *t, unsigned int atid) +{ + return atid < t->natids ? t->atid_tab[atid].data : NULL; +} + +static inline void *lookup_stid(const struct tid_info *t, unsigned int stid) +{ + /* Is it a server filter TID? */ + if (t->nsftids && (stid >= t->sftid_base)) { + stid -= t->sftid_base; + stid += t->nstids; + } else { + stid -= t->stid_base; + } + + return stid < (t->nstids + t->nsftids) ? t->stid_tab[stid].data : NULL; +} + +static inline void cxgb4_insert_tid(struct tid_info *t, void *data, + unsigned int tid, unsigned short family) +{ + t->tid_tab[tid] = data; + if (t->hash_base && (tid >= t->hash_base)) { + if (family == AF_INET6) + atomic_add(2, &t->hash_tids_in_use); + else + atomic_inc(&t->hash_tids_in_use); + } else { + if (family == AF_INET6) + atomic_add(2, &t->tids_in_use); + else + atomic_inc(&t->tids_in_use); + } + atomic_inc(&t->conns_in_use); +} + +int cxgb4_alloc_atid(struct tid_info *t, void *data); +int cxgb4_alloc_stid(struct tid_info *t, int family, void *data); +int cxgb4_alloc_sftid(struct tid_info *t, int family, void *data); +void cxgb4_free_atid(struct tid_info *t, unsigned int atid); +void cxgb4_free_stid(struct tid_info *t, unsigned int stid, int family); +void cxgb4_remove_tid(struct tid_info *t, unsigned int qid, unsigned int tid, + unsigned short family); +struct in6_addr; + +int cxgb4_create_server(const struct net_device *dev, unsigned int stid, + __be32 sip, __be16 sport, __be16 vlan, + unsigned int queue); +int cxgb4_create_server6(const struct net_device *dev, unsigned int stid, + const struct in6_addr *sip, __be16 sport, + unsigned int queue); +int cxgb4_remove_server(const struct net_device *dev, unsigned int stid, + unsigned int queue, bool ipv6); +int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid, + __be32 sip, __be16 sport, __be16 vlan, + unsigned int queue, + unsigned char port, unsigned char mask); +int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid, + unsigned int queue, bool ipv6); + +/* Filter operation context to allow callers of cxgb4_set_filter() and + * cxgb4_del_filter() to wait for an asynchronous completion. 
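A rough sketch of the calling pattern this enables (only the cxgb4 symbols are real; the wrapper name and the 10-second timeout are illustrative): submit the filter work request with __cxgb4_set_filter() and sleep on the embedded completion until the firmware reply fills in ctx.result.

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include "cxgb4_uld.h"

static int my_set_filter_and_wait(struct net_device *dev, int fidx,
				  struct ch_filter_specification *fs)
{
	struct filter_ctx ctx;
	int ret;

	init_completion(&ctx.completion);
	ret = __cxgb4_set_filter(dev, fidx, fs, &ctx);
	if (ret)
		return ret;	/* submission itself failed */

	/* wait for the asynchronous firmware reply */
	if (!wait_for_completion_timeout(&ctx.completion, 10 * HZ))
		return -ETIMEDOUT;

	return ctx.result;	/* 0 on success, negative errno otherwise */
}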
+ */ +struct filter_ctx { + struct completion completion; /* completion rendezvous */ + void *closure; /* caller's opaque information */ + int result; /* result of operation */ + u32 tid; /* to store tid */ +}; + +struct ch_filter_specification; + +int cxgb4_get_free_ftid(struct net_device *dev, int family); +int __cxgb4_set_filter(struct net_device *dev, int filter_id, + struct ch_filter_specification *fs, + struct filter_ctx *ctx); +int __cxgb4_del_filter(struct net_device *dev, int filter_id, + struct ch_filter_specification *fs, + struct filter_ctx *ctx); +int cxgb4_set_filter(struct net_device *dev, int filter_id, + struct ch_filter_specification *fs); +int cxgb4_del_filter(struct net_device *dev, int filter_id, + struct ch_filter_specification *fs); +int cxgb4_get_filter_counters(struct net_device *dev, unsigned int fidx, + u64 *hitcnt, u64 *bytecnt, bool hash); + +static inline void set_wr_txq(struct sk_buff *skb, int prio, int queue) +{ + skb_set_queue_mapping(skb, (queue << 1) | prio); +} + +enum cxgb4_uld { + CXGB4_ULD_INIT, + CXGB4_ULD_RDMA, + CXGB4_ULD_ISCSI, + CXGB4_ULD_ISCSIT, + CXGB4_ULD_CRYPTO, + CXGB4_ULD_TLS, + CXGB4_ULD_MAX +}; + +enum cxgb4_tx_uld { + CXGB4_TX_OFLD, + CXGB4_TX_CRYPTO, + CXGB4_TX_MAX +}; + +enum cxgb4_txq_type { + CXGB4_TXQ_ETH, + CXGB4_TXQ_ULD, + CXGB4_TXQ_CTRL, + CXGB4_TXQ_MAX +}; + +enum cxgb4_state { + CXGB4_STATE_UP, + CXGB4_STATE_START_RECOVERY, + CXGB4_STATE_DOWN, + CXGB4_STATE_DETACH, + CXGB4_STATE_FATAL_ERROR +}; + +enum cxgb4_control { + CXGB4_CONTROL_DB_FULL, + CXGB4_CONTROL_DB_EMPTY, + CXGB4_CONTROL_DB_DROP, +}; + +struct pci_dev; +struct l2t_data; +struct net_device; +struct pkt_gl; +struct tp_tcp_stats; +struct t4_lro_mgr; + +struct cxgb4_range { + unsigned int start; + unsigned int size; +}; + +struct cxgb4_virt_res { /* virtualized HW resources */ + struct cxgb4_range ddp; + struct cxgb4_range iscsi; + struct cxgb4_range stag; + struct cxgb4_range rq; + struct cxgb4_range srq; + struct cxgb4_range pbl; + struct cxgb4_range qp; + struct cxgb4_range cq; + struct cxgb4_range ocq; + struct cxgb4_range key; + unsigned int ncrypto_fc; +}; + +struct chcr_stats_debug { + atomic_t cipher_rqst; + atomic_t digest_rqst; + atomic_t aead_rqst; + atomic_t complete; + atomic_t error; + atomic_t fallback; + atomic_t ipsec_cnt; + atomic_t tls_pdu_tx; + atomic_t tls_pdu_rx; + atomic_t tls_key; +}; + +#define OCQ_WIN_OFFSET(pdev, vres) \ + (pci_resource_len((pdev), 2) - roundup_pow_of_two((vres)->ocq.size)) + +/* + * Block of information the LLD provides to ULDs attaching to a device. 
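Since uld_attach() builds this structure on the stack, a ULD's add() callback has to copy what it needs; a minimal hypothetical sketch (the my_drv_* names are invented) that keeps a private copy and picks out its virtualized QP range:

#include <linux/err.h>
#include <linux/slab.h>
#include "cxgb4_uld.h"

struct my_drv_dev {
	struct cxgb4_lld_info lldi;	/* private copy of the attach info */
	unsigned int qp_start, qp_cnt;	/* our slice of the HW QP ID space */
};

static void *my_drv_add(const struct cxgb4_lld_info *lld)
{
	struct my_drv_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

	if (!dev)
		return ERR_PTR(-ENOMEM);	/* uld_attach() checks IS_ERR() */

	dev->lldi = *lld;		/* *lld does not outlive the callback */
	dev->qp_start = lld->vr->qp.start;
	dev->qp_cnt = lld->vr->qp.size;
	return dev;			/* stored as adap->uld[type].handle */
}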
+ */ +struct cxgb4_lld_info { + struct pci_dev *pdev; /* associated PCI device */ + struct l2t_data *l2t; /* L2 table */ + struct tid_info *tids; /* TID table */ + struct net_device **ports; /* device ports */ + const struct cxgb4_virt_res *vr; /* assorted HW resources */ + const unsigned short *mtus; /* MTU table */ + const unsigned short *rxq_ids; /* the ULD's Rx queue ids */ + const unsigned short *ciq_ids; /* the ULD's concentrator IQ ids */ + unsigned short nrxq; /* # of Rx queues */ + unsigned short ntxq; /* # of Tx queues */ + unsigned short nciq; /* # of concentrator IQ */ + unsigned char nchan:4; /* # of channels */ + unsigned char nports:4; /* # of ports */ + unsigned char wr_cred; /* WR 16-byte credits */ + unsigned char adapter_type; /* type of adapter */ + unsigned char fw_api_ver; /* FW API version */ + unsigned char crypto; /* crypto support */ + unsigned int fw_vers; /* FW version */ + unsigned int iscsi_iolen; /* iSCSI max I/O length */ + unsigned int cclk_ps; /* Core clock period in psec */ + unsigned short udb_density; /* # of user DB/page */ + unsigned short ucq_density; /* # of user CQs/page */ + unsigned short filt_mode; /* filter optional components */ + unsigned short tx_modq[NCHAN]; /* maps each tx channel to a */ + /* scheduler queue */ + void __iomem *gts_reg; /* address of GTS register */ + void __iomem *db_reg; /* address of kernel doorbell */ + int dbfifo_int_thresh; /* doorbell fifo int threshold */ + unsigned int sge_ingpadboundary; /* SGE ingress padding boundary */ + unsigned int sge_egrstatuspagesize; /* SGE egress status page size */ + unsigned int sge_pktshift; /* Padding between CPL and */ + /* packet data */ + unsigned int pf; /* Physical Function we're using */ + bool enable_fw_ofld_conn; /* Enable connection through fw */ + /* WR */ + unsigned int max_ordird_qp; /* Max ORD/IRD depth per RDMA QP */ + unsigned int max_ird_adapter; /* Max IRD memory per adapter */ + bool ulptx_memwrite_dsgl; /* use of T5 DSGL allowed */ + unsigned int iscsi_tagmask; /* iscsi ddp tag mask */ + unsigned int iscsi_pgsz_order; /* iscsi ddp page size orders */ + unsigned int iscsi_llimit; /* chip's iscsi region llimit */ + unsigned int ulp_crypto; /* crypto lookaside support */ + void **iscsi_ppm; /* iscsi page pod manager */ + int nodeid; /* device numa node id */ + bool fr_nsmr_tpte_wr_support; /* FW supports FR_NSMR_TPTE_WR */ + bool write_w_imm_support; /* FW supports WRITE_WITH_IMMEDIATE */ + bool write_cmpl_support; /* FW supports WRITE_CMPL WR */ +}; + +struct cxgb4_uld_info { + char name[IFNAMSIZ]; + void *handle; + unsigned int nrxq; + unsigned int rxq_size; + unsigned int ntxq; + bool ciq; + bool lro; + void *(*add)(const struct cxgb4_lld_info *p); + int (*rx_handler)(void *handle, const __be64 *rsp, + const struct pkt_gl *gl); + int (*state_change)(void *handle, enum cxgb4_state new_state); + int (*control)(void *handle, enum cxgb4_control control, ...); + int (*lro_rx_handler)(void *handle, const __be64 *rsp, + const struct pkt_gl *gl, + struct t4_lro_mgr *lro_mgr, + struct napi_struct *napi); + void (*lro_flush)(struct t4_lro_mgr *); + int (*tx_handler)(struct sk_buff *skb, struct net_device *dev); +}; + +int cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p); +int cxgb4_unregister_uld(enum cxgb4_uld type); +int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb); +int cxgb4_immdata_send(struct net_device *dev, unsigned int idx, + const void *src, unsigned int len); +int cxgb4_crypto_send(struct net_device *dev, struct sk_buff 
*skb); +unsigned int cxgb4_dbfifo_count(const struct net_device *dev, int lpfifo); +unsigned int cxgb4_port_chan(const struct net_device *dev); +unsigned int cxgb4_port_viid(const struct net_device *dev); +unsigned int cxgb4_tp_smt_idx(enum chip_type chip, unsigned int viid); +unsigned int cxgb4_port_idx(const struct net_device *dev); +unsigned int cxgb4_best_mtu(const unsigned short *mtus, unsigned short mtu, + unsigned int *idx); +unsigned int cxgb4_best_aligned_mtu(const unsigned short *mtus, + unsigned short header_size, + unsigned short data_size_max, + unsigned short data_size_align, + unsigned int *mtu_idxp); +void cxgb4_get_tcp_stats(struct pci_dev *pdev, struct tp_tcp_stats *v4, + struct tp_tcp_stats *v6); +void cxgb4_iscsi_init(struct net_device *dev, unsigned int tag_mask, + const unsigned int *pgsz_order); +struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl, + unsigned int skb_len, unsigned int pull_len); +int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx, u16 size); +int cxgb4_flush_eq_cache(struct net_device *dev); +int cxgb4_read_tpte(struct net_device *dev, u32 stag, __be32 *tpte); +u64 cxgb4_read_sge_timestamp(struct net_device *dev); + +enum cxgb4_bar2_qtype { CXGB4_BAR2_QTYPE_EGRESS, CXGB4_BAR2_QTYPE_INGRESS }; +int cxgb4_bar2_sge_qregs(struct net_device *dev, + unsigned int qid, + enum cxgb4_bar2_qtype qtype, + int user, + u64 *pbar2_qoffset, + unsigned int *pbar2_qid); + +#endif /* !__CXGB4_ULD_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c new file mode 100644 index 0000000..1817a03 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c @@ -0,0 +1,759 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cxgb4.h" +#include "l2t.h" +#include "t4_msg.h" +#include "t4fw_api.h" +#include "t4_regs.h" +#include "t4_values.h" + +/* identifies sync vs async L2T_WRITE_REQs */ +#define SYNC_WR_S 12 +#define SYNC_WR_V(x) ((x) << SYNC_WR_S) +#define SYNC_WR_F SYNC_WR_V(1) + +struct l2t_data { + unsigned int l2t_start; /* start index of our piece of the L2T */ + unsigned int l2t_size; /* number of entries in l2tab */ + rwlock_t lock; + atomic_t nfree; /* number of free entries */ + struct l2t_entry *rover; /* starting point for next allocation */ + struct l2t_entry l2tab[0]; /* MUST BE LAST */ +}; + +static inline unsigned int vlan_prio(const struct l2t_entry *e) +{ + return e->vlan >> VLAN_PRIO_SHIFT; +} + +static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e) +{ + if (atomic_add_return(1, &e->refcnt) == 1) /* 0 -> 1 transition */ + atomic_dec(&d->nfree); +} + +/* + * To avoid having to check address families we do not allow v4 and v6 + * neighbors to be on the same hash chain. We keep v4 entries in the first + * half of available hash buckets and v6 in the second. We need at least two + * entries in our L2T for this scheme to work. + */ +enum { + L2T_MIN_HASH_BUCKETS = 2, +}; + +static inline unsigned int arp_hash(struct l2t_data *d, const u32 *key, + int ifindex) +{ + unsigned int l2t_size_half = d->l2t_size / 2; + + return jhash_2words(*key, ifindex, 0) % l2t_size_half; +} + +static inline unsigned int ipv6_hash(struct l2t_data *d, const u32 *key, + int ifindex) +{ + unsigned int l2t_size_half = d->l2t_size / 2; + u32 xor = key[0] ^ key[1] ^ key[2] ^ key[3]; + + return (l2t_size_half + + (jhash_2words(xor, ifindex, 0) % l2t_size_half)); +} + +static unsigned int addr_hash(struct l2t_data *d, const u32 *addr, + int addr_len, int ifindex) +{ + return addr_len == 4 ? arp_hash(d, addr, ifindex) : + ipv6_hash(d, addr, ifindex); +} + +/* + * Checks if an L2T entry is for the given IP/IPv6 address. It does not check + * whether the L2T entry and the address are of the same address family. + * Callers ensure an address is only checked against L2T entries of the same + * family, something made trivial by the separation of IP and IPv6 hash chains + * mentioned above. Returns 0 if there's a match, + */ +static int addreq(const struct l2t_entry *e, const u32 *addr) +{ + if (e->v6) + return (e->addr[0] ^ addr[0]) | (e->addr[1] ^ addr[1]) | + (e->addr[2] ^ addr[2]) | (e->addr[3] ^ addr[3]); + return e->addr[0] ^ addr[0]; +} + +static void neigh_replace(struct l2t_entry *e, struct neighbour *n) +{ + neigh_hold(n); + if (e->neigh) + neigh_release(e->neigh); + e->neigh = n; +} + +/* + * Write an L2T entry. Must be called with the entry locked. + * The write may be synchronous or asynchronous. + */ +static int write_l2e(struct adapter *adap, struct l2t_entry *e, int sync) +{ + struct l2t_data *d = adap->l2t; + unsigned int l2t_idx = e->idx + d->l2t_start; + struct sk_buff *skb; + struct cpl_l2t_write_req *req; + + skb = alloc_skb(sizeof(*req), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + req = __skb_put(skb, sizeof(*req)); + INIT_TP_WR(req, 0); + + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, + l2t_idx | (sync ? 
SYNC_WR_F : 0) | + TID_QID_V(adap->sge.fw_evtq.abs_id))); + req->params = htons(L2T_W_PORT_V(e->lport) | L2T_W_NOREPLY_V(!sync)); + req->l2t_idx = htons(l2t_idx); + req->vlan = htons(e->vlan); + if (e->neigh && !(e->neigh->dev->flags & IFF_LOOPBACK)) + memcpy(e->dmac, e->neigh->ha, sizeof(e->dmac)); + memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); + + t4_mgmt_tx(adap, skb); + + if (sync && e->state != L2T_STATE_SWITCHING) + e->state = L2T_STATE_SYNC_WRITE; + return 0; +} + +/* + * Send packets waiting in an L2T entry's ARP queue. Must be called with the + * entry locked. + */ +static void send_pending(struct adapter *adap, struct l2t_entry *e) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&e->arpq)) != NULL) + t4_ofld_send(adap, skb); +} + +/* + * Process a CPL_L2T_WRITE_RPL. Wake up the ARP queue if it completes a + * synchronous L2T_WRITE. Note that the TID in the reply is really the L2T + * index it refers to. + */ +void do_l2t_write_rpl(struct adapter *adap, const struct cpl_l2t_write_rpl *rpl) +{ + struct l2t_data *d = adap->l2t; + unsigned int tid = GET_TID(rpl); + unsigned int l2t_idx = tid % L2T_SIZE; + + if (unlikely(rpl->status != CPL_ERR_NONE)) { + dev_err(adap->pdev_dev, + "Unexpected L2T_WRITE_RPL status %u for entry %u\n", + rpl->status, l2t_idx); + return; + } + + if (tid & SYNC_WR_F) { + struct l2t_entry *e = &d->l2tab[l2t_idx - d->l2t_start]; + + spin_lock(&e->lock); + if (e->state != L2T_STATE_SWITCHING) { + send_pending(adap, e); + e->state = (e->neigh->nud_state & NUD_STALE) ? + L2T_STATE_STALE : L2T_STATE_VALID; + } + spin_unlock(&e->lock); + } +} + +/* + * Add a packet to an L2T entry's queue of packets awaiting resolution. + * Must be called with the entry's lock held. + */ +static inline void arpq_enqueue(struct l2t_entry *e, struct sk_buff *skb) +{ + __skb_queue_tail(&e->arpq, skb); +} + +int cxgb4_l2t_send(struct net_device *dev, struct sk_buff *skb, + struct l2t_entry *e) +{ + struct adapter *adap = netdev2adap(dev); + +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + neigh_event_send(e->neigh, NULL); + spin_lock_bh(&e->lock); + if (e->state == L2T_STATE_STALE) + e->state = L2T_STATE_VALID; + spin_unlock_bh(&e->lock); + case L2T_STATE_VALID: /* fast-path, send the packet on */ + return t4_ofld_send(adap, skb); + case L2T_STATE_RESOLVING: + case L2T_STATE_SYNC_WRITE: + spin_lock_bh(&e->lock); + if (e->state != L2T_STATE_SYNC_WRITE && + e->state != L2T_STATE_RESOLVING) { + spin_unlock_bh(&e->lock); + goto again; + } + arpq_enqueue(e, skb); + spin_unlock_bh(&e->lock); + + if (e->state == L2T_STATE_RESOLVING && + !neigh_event_send(e->neigh, NULL)) { + spin_lock_bh(&e->lock); + if (e->state == L2T_STATE_RESOLVING && + !skb_queue_empty(&e->arpq)) + write_l2e(adap, e, 1); + spin_unlock_bh(&e->lock); + } + } + return 0; +} +EXPORT_SYMBOL(cxgb4_l2t_send); + +/* + * Allocate a free L2T entry. Must be called with l2t_data.lock held. + */ +static struct l2t_entry *alloc_l2e(struct l2t_data *d) +{ + struct l2t_entry *end, *e, **p; + + if (!atomic_read(&d->nfree)) + return NULL; + + /* there's definitely a free entry */ + for (e = d->rover, end = &d->l2tab[d->l2t_size]; e != end; ++e) + if (atomic_read(&e->refcnt) == 0) + goto found; + + for (e = d->l2tab; atomic_read(&e->refcnt); ++e) + ; +found: + d->rover = e + 1; + atomic_dec(&d->nfree); + + /* + * The entry we found may be an inactive entry that is + * presently in the hash table. We need to remove it. 
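The unlink that follows uses the pointer-to-pointer idiom: p walks the chain as the address of each next pointer, starting from the bucket's first pointer, so splicing the entry out needs no special case for the chain head. A standalone sketch of the same idiom, with a hypothetical helper name:

#include "l2t.h"

/* caller would pass &d->l2tab[e->hash].first as "head" */
static void l2t_unlink_sketch(struct l2t_entry **head, struct l2t_entry *e)
{
	struct l2t_entry **p;

	for (p = head; *p; p = &(*p)->next)
		if (*p == e) {
			*p = e->next;	/* splice e out of the chain */
			e->next = NULL;
			break;
		}
}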
+ */ + if (e->state < L2T_STATE_SWITCHING) + for (p = &d->l2tab[e->hash].first; *p; p = &(*p)->next) + if (*p == e) { + *p = e->next; + e->next = NULL; + break; + } + + e->state = L2T_STATE_UNUSED; + return e; +} + +static struct l2t_entry *find_or_alloc_l2e(struct l2t_data *d, u16 vlan, + u8 port, u8 *dmac) +{ + struct l2t_entry *end, *e, **p; + struct l2t_entry *first_free = NULL; + + for (e = &d->l2tab[0], end = &d->l2tab[d->l2t_size]; e != end; ++e) { + if (atomic_read(&e->refcnt) == 0) { + if (!first_free) + first_free = e; + } else { + if (e->state == L2T_STATE_SWITCHING) { + if (ether_addr_equal(e->dmac, dmac) && + (e->vlan == vlan) && (e->lport == port)) + goto exists; + } + } + } + + if (first_free) { + e = first_free; + goto found; + } + + return NULL; + +found: + /* The entry we found may be an inactive entry that is + * presently in the hash table. We need to remove it. + */ + if (e->state < L2T_STATE_SWITCHING) + for (p = &d->l2tab[e->hash].first; *p; p = &(*p)->next) + if (*p == e) { + *p = e->next; + e->next = NULL; + break; + } + e->state = L2T_STATE_UNUSED; + +exists: + return e; +} + +/* Called when an L2T entry has no more users. The entry is left in the hash + * table since it is likely to be reused but we also bump nfree to indicate + * that the entry can be reallocated for a different neighbor. We also drop + * the existing neighbor reference in case the neighbor is going away and is + * waiting on our reference. + * + * Because entries can be reallocated to other neighbors once their ref count + * drops to 0 we need to take the entry's lock to avoid races with a new + * incarnation. + */ +static void _t4_l2e_free(struct l2t_entry *e) +{ + struct l2t_data *d; + struct sk_buff *skb; + + if (atomic_read(&e->refcnt) == 0) { /* hasn't been recycled */ + if (e->neigh) { + neigh_release(e->neigh); + e->neigh = NULL; + } + while ((skb = __skb_dequeue(&e->arpq)) != NULL) + kfree_skb(skb); + } + + d = container_of(e, struct l2t_data, l2tab[e->idx]); + atomic_inc(&d->nfree); +} + +/* Locked version of _t4_l2e_free */ +static void t4_l2e_free(struct l2t_entry *e) +{ + struct l2t_data *d; + struct sk_buff *skb; + + spin_lock_bh(&e->lock); + if (atomic_read(&e->refcnt) == 0) { /* hasn't been recycled */ + if (e->neigh) { + neigh_release(e->neigh); + e->neigh = NULL; + } + while ((skb = __skb_dequeue(&e->arpq)) != NULL) + kfree_skb(skb); + } + spin_unlock_bh(&e->lock); + + d = container_of(e, struct l2t_data, l2tab[e->idx]); + atomic_inc(&d->nfree); +} + +void cxgb4_l2t_release(struct l2t_entry *e) +{ + if (atomic_dec_and_test(&e->refcnt)) + t4_l2e_free(e); +} +EXPORT_SYMBOL(cxgb4_l2t_release); + +/* + * Update an L2T entry that was previously used for the same next hop as neigh. + * Must be called with softirqs disabled. 
+ */ +static void reuse_entry(struct l2t_entry *e, struct neighbour *neigh) +{ + unsigned int nud_state; + + spin_lock(&e->lock); /* avoid race with t4_l2t_free */ + if (neigh != e->neigh) + neigh_replace(e, neigh); + nud_state = neigh->nud_state; + if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)) || + !(nud_state & NUD_VALID)) + e->state = L2T_STATE_RESOLVING; + else if (nud_state & NUD_CONNECTED) + e->state = L2T_STATE_VALID; + else + e->state = L2T_STATE_STALE; + spin_unlock(&e->lock); +} + +struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh, + const struct net_device *physdev, + unsigned int priority) +{ + u8 lport; + u16 vlan; + struct l2t_entry *e; + unsigned int addr_len = neigh->tbl->key_len; + u32 *addr = (u32 *)neigh->primary_key; + int ifidx = neigh->dev->ifindex; + int hash = addr_hash(d, addr, addr_len, ifidx); + + if (neigh->dev->flags & IFF_LOOPBACK) + lport = netdev2pinfo(physdev)->tx_chan + 4; + else + lport = netdev2pinfo(physdev)->lport; + + if (is_vlan_dev(neigh->dev)) + vlan = vlan_dev_vlan_id(neigh->dev); + else + vlan = VLAN_NONE; + + write_lock_bh(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (!addreq(e, addr) && e->ifindex == ifidx && + e->vlan == vlan && e->lport == lport) { + l2t_hold(d, e); + if (atomic_read(&e->refcnt) == 1) + reuse_entry(e, neigh); + goto done; + } + + /* Need to allocate a new entry */ + e = alloc_l2e(d); + if (e) { + spin_lock(&e->lock); /* avoid race with t4_l2t_free */ + e->state = L2T_STATE_RESOLVING; + if (neigh->dev->flags & IFF_LOOPBACK) + memcpy(e->dmac, physdev->dev_addr, sizeof(e->dmac)); + memcpy(e->addr, addr, addr_len); + e->ifindex = ifidx; + e->hash = hash; + e->lport = lport; + e->v6 = addr_len == 16; + atomic_set(&e->refcnt, 1); + neigh_replace(e, neigh); + e->vlan = vlan; + e->next = d->l2tab[hash].first; + d->l2tab[hash].first = e; + spin_unlock(&e->lock); + } +done: + write_unlock_bh(&d->lock); + return e; +} +EXPORT_SYMBOL(cxgb4_l2t_get); + +u64 cxgb4_select_ntuple(struct net_device *dev, + const struct l2t_entry *l2t) +{ + struct adapter *adap = netdev2adap(dev); + struct tp_params *tp = &adap->params.tp; + u64 ntuple = 0; + + /* Initialize each of the fields which we care about which are present + * in the Compressed Filter Tuple. + */ + if (tp->vlan_shift >= 0 && l2t->vlan != VLAN_NONE) + ntuple |= (u64)(FT_VLAN_VLD_F | l2t->vlan) << tp->vlan_shift; + + if (tp->port_shift >= 0) + ntuple |= (u64)l2t->lport << tp->port_shift; + + if (tp->protocol_shift >= 0) + ntuple |= (u64)IPPROTO_TCP << tp->protocol_shift; + + if (tp->vnic_shift >= 0) { + u32 viid = cxgb4_port_viid(dev); + u32 vf = FW_VIID_VIN_G(viid); + u32 pf = FW_VIID_PFN_G(viid); + u32 vld = FW_VIID_VIVLD_G(viid); + + ntuple |= (u64)(FT_VNID_ID_VF_V(vf) | + FT_VNID_ID_PF_V(pf) | + FT_VNID_ID_VLD_V(vld)) << tp->vnic_shift; + } + + return ntuple; +} +EXPORT_SYMBOL(cxgb4_select_ntuple); + +/* + * Called when address resolution fails for an L2T entry to handle packets + * on the arpq head. If a packet specifies a failure handler it is invoked, + * otherwise the packet is sent to the device. 
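The failure handler mentioned here is armed by the sender before the skb is queued; a short sketch of that side (the my_* names are hypothetical) using t4_set_arp_err_handler() from l2t.h:

#include <linux/skbuff.h>
#include "l2t.h"

static void my_arp_failure(void *handle, struct sk_buff *skb)
{
	/* resolution failed: drop the WR and tear down "handle" as needed */
	kfree_skb(skb);
}

static int my_send_via_l2t(struct net_device *dev, struct sk_buff *skb,
			   struct l2t_entry *e, void *my_conn)
{
	t4_set_arp_err_handler(skb, my_conn, my_arp_failure);
	return cxgb4_l2t_send(dev, skb, e);	/* queues on arpq if unresolved */
}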
+ */ +static void handle_failed_resolution(struct adapter *adap, struct l2t_entry *e) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&e->arpq)) != NULL) { + const struct l2t_skb_cb *cb = L2T_SKB_CB(skb); + + spin_unlock(&e->lock); + if (cb->arp_err_handler) + cb->arp_err_handler(cb->handle, skb); + else + t4_ofld_send(adap, skb); + spin_lock(&e->lock); + } +} + +/* + * Called when the host's neighbor layer makes a change to some entry that is + * loaded into the HW L2 table. + */ +void t4_l2t_update(struct adapter *adap, struct neighbour *neigh) +{ + struct l2t_entry *e; + struct sk_buff_head *arpq = NULL; + struct l2t_data *d = adap->l2t; + unsigned int addr_len = neigh->tbl->key_len; + u32 *addr = (u32 *) neigh->primary_key; + int ifidx = neigh->dev->ifindex; + int hash = addr_hash(d, addr, addr_len, ifidx); + + read_lock_bh(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (!addreq(e, addr) && e->ifindex == ifidx) { + spin_lock(&e->lock); + if (atomic_read(&e->refcnt)) + goto found; + spin_unlock(&e->lock); + break; + } + read_unlock_bh(&d->lock); + return; + + found: + read_unlock(&d->lock); + + if (neigh != e->neigh) + neigh_replace(e, neigh); + + if (e->state == L2T_STATE_RESOLVING) { + if (neigh->nud_state & NUD_FAILED) { + arpq = &e->arpq; + } else if ((neigh->nud_state & (NUD_CONNECTED | NUD_STALE)) && + !skb_queue_empty(&e->arpq)) { + write_l2e(adap, e, 1); + } + } else { + e->state = neigh->nud_state & NUD_CONNECTED ? + L2T_STATE_VALID : L2T_STATE_STALE; + if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac))) + write_l2e(adap, e, 0); + } + + if (arpq) + handle_failed_resolution(adap, e); + spin_unlock_bh(&e->lock); +} + +/* Allocate an L2T entry for use by a switching rule. Such need to be + * explicitly freed and while busy they are not on any hash chain, so normal + * address resolution updates do not see them. 
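In practice a filter user allocates such an entry when it installs a switching (DMAC/VLAN rewrite) rule and drops its reference when the rule goes away; a hedged sketch with hypothetical wrapper names:

#include "l2t.h"

static struct l2t_entry *my_add_switch_rule(struct net_device *dev,
					    u16 vlan, u8 port, u8 *dmac)
{
	/* returns NULL if the table is full or the HW write failed;
	 * on success the entry is held with an initial reference
	 */
	return cxgb4_l2t_alloc_switching(dev, vlan, port, dmac);
}

static void my_del_switch_rule(struct l2t_entry *e)
{
	cxgb4_l2t_release(e);	/* recycled once the refcount drops to 0 */
}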
+ */ +struct l2t_entry *t4_l2t_alloc_switching(struct adapter *adap, u16 vlan, + u8 port, u8 *eth_addr) +{ + struct l2t_data *d = adap->l2t; + struct l2t_entry *e; + int ret; + + write_lock_bh(&d->lock); + e = find_or_alloc_l2e(d, vlan, port, eth_addr); + if (e) { + spin_lock(&e->lock); /* avoid race with t4_l2t_free */ + if (!atomic_read(&e->refcnt)) { + e->state = L2T_STATE_SWITCHING; + e->vlan = vlan; + e->lport = port; + ether_addr_copy(e->dmac, eth_addr); + atomic_set(&e->refcnt, 1); + ret = write_l2e(adap, e, 0); + if (ret < 0) { + _t4_l2e_free(e); + spin_unlock(&e->lock); + write_unlock_bh(&d->lock); + return NULL; + } + } else { + atomic_inc(&e->refcnt); + } + + spin_unlock(&e->lock); + } + write_unlock_bh(&d->lock); + return e; +} + +/** + * @dev: net_device pointer + * @vlan: VLAN Id + * @port: Associated port + * @dmac: Destination MAC address to add to L2T + * Returns pointer to the allocated l2t entry + * + * Allocates an L2T entry for use by switching rule of a filter + */ +struct l2t_entry *cxgb4_l2t_alloc_switching(struct net_device *dev, u16 vlan, + u8 port, u8 *dmac) +{ + struct adapter *adap = netdev2adap(dev); + + return t4_l2t_alloc_switching(adap, vlan, port, dmac); +} +EXPORT_SYMBOL(cxgb4_l2t_alloc_switching); + +struct l2t_data *t4_init_l2t(unsigned int l2t_start, unsigned int l2t_end) +{ + unsigned int l2t_size; + int i; + struct l2t_data *d; + + if (l2t_start >= l2t_end || l2t_end >= L2T_SIZE) + return NULL; + l2t_size = l2t_end - l2t_start + 1; + if (l2t_size < L2T_MIN_HASH_BUCKETS) + return NULL; + + d = kvzalloc(sizeof(*d) + l2t_size * sizeof(struct l2t_entry), GFP_KERNEL); + if (!d) + return NULL; + + d->l2t_start = l2t_start; + d->l2t_size = l2t_size; + + d->rover = d->l2tab; + atomic_set(&d->nfree, l2t_size); + rwlock_init(&d->lock); + + for (i = 0; i < d->l2t_size; ++i) { + d->l2tab[i].idx = i; + d->l2tab[i].state = L2T_STATE_UNUSED; + spin_lock_init(&d->l2tab[i].lock); + atomic_set(&d->l2tab[i].refcnt, 0); + skb_queue_head_init(&d->l2tab[i].arpq); + } + return d; +} + +static inline void *l2t_get_idx(struct seq_file *seq, loff_t pos) +{ + struct l2t_data *d = seq->private; + + return pos >= d->l2t_size ? NULL : &d->l2tab[pos]; +} + +static void *l2t_seq_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? l2t_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *l2t_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + v = l2t_get_idx(seq, *pos); + if (v) + ++*pos; + return v; +} + +static void l2t_seq_stop(struct seq_file *seq, void *v) +{ +} + +static char l2e_state(const struct l2t_entry *e) +{ + switch (e->state) { + case L2T_STATE_VALID: return 'V'; + case L2T_STATE_STALE: return 'S'; + case L2T_STATE_SYNC_WRITE: return 'W'; + case L2T_STATE_RESOLVING: + return skb_queue_empty(&e->arpq) ? 'R' : 'A'; + case L2T_STATE_SWITCHING: return 'X'; + default: + return 'U'; + } +} + +static int l2t_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, " Idx IP address " + "Ethernet address VLAN/P LP State Users Port\n"); + else { + char ip[60]; + struct l2t_data *d = seq->private; + struct l2t_entry *e = v; + + spin_lock_bh(&e->lock); + if (e->state == L2T_STATE_SWITCHING) + ip[0] = '\0'; + else + sprintf(ip, e->v6 ? "%pI6c" : "%pI4", e->addr); + seq_printf(seq, "%4u %-25s %17pM %4d %u %2u %c %5u %s\n", + e->idx + d->l2t_start, ip, e->dmac, + e->vlan & VLAN_VID_MASK, vlan_prio(e), e->lport, + l2e_state(e), atomic_read(&e->refcnt), + e->neigh ? 
e->neigh->dev->name : ""); + spin_unlock_bh(&e->lock); + } + return 0; +} + +static const struct seq_operations l2t_seq_ops = { + .start = l2t_seq_start, + .next = l2t_seq_next, + .stop = l2t_seq_stop, + .show = l2t_seq_show +}; + +static int l2t_seq_open(struct inode *inode, struct file *file) +{ + int rc = seq_open(file, &l2t_seq_ops); + + if (!rc) { + struct adapter *adap = inode->i_private; + struct seq_file *seq = file->private_data; + + seq->private = adap->l2t; + } + return rc; +} + +const struct file_operations t4_l2t_fops = { + .owner = THIS_MODULE, + .open = l2t_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.h b/drivers/net/ethernet/chelsio/cxgb4/l2t.h new file mode 100644 index 0000000..79665bd --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.h @@ -0,0 +1,127 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_L2T_H +#define __CXGB4_L2T_H + +#include +#include +#include + +#define VLAN_NONE 0xfff + +enum { L2T_SIZE = 4096 }; /* # of L2T entries */ + +enum { + L2T_STATE_VALID, /* entry is up to date */ + L2T_STATE_STALE, /* entry may be used but needs revalidation */ + L2T_STATE_RESOLVING, /* entry needs address resolution */ + L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */ + L2T_STATE_NOARP, /* Netdev down or removed*/ + + /* when state is one of the below the entry is not hashed */ + L2T_STATE_SWITCHING, /* entry is being used by a switching filter */ + L2T_STATE_UNUSED /* entry not in use */ +}; + +struct adapter; +struct l2t_data; +struct neighbour; +struct net_device; +struct file_operations; +struct cpl_l2t_write_rpl; + +/* + * Each L2T entry plays multiple roles. First of all, it keeps state for the + * corresponding entry of the HW L2 table and maintains a queue of offload + * packets awaiting address resolution. Second, it is a node of a hash table + * chain, where the nodes of the chain are linked together through their next + * pointer. 
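A quick sketch of the intended call sequence from an offload driver's point of view (hypothetical my_* wrappers; "lldi" is the cxgb4_lld_info handed over at attach time): resolve the next hop once, transmit through the cached entry, and release it at teardown.

#include <linux/netdevice.h>
#include "cxgb4_uld.h"
#include "l2t.h"

/* resolve and cache the L2T entry when the offload connection is set up */
static struct l2t_entry *my_l2t_setup(struct cxgb4_lld_info *lldi,
				      struct neighbour *n,
				      struct net_device *egress_dev)
{
	return cxgb4_l2t_get(lldi->l2t, n, egress_dev, 0);
}

/* transmit an offload WR; the L2T code queues it while still resolving */
static int my_l2t_tx(struct net_device *egress_dev, struct sk_buff *skb,
		     struct l2t_entry *e)
{
	return cxgb4_l2t_send(egress_dev, skb, e);
}

/* drop the connection's reference when it is torn down */
static void my_l2t_teardown(struct l2t_entry *e)
{
	cxgb4_l2t_release(e);
}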
Finally, each node is a bucket of a hash table, pointing to the + * first element in its chain through its first pointer. + */ +struct l2t_entry { + u16 state; /* entry state */ + u16 idx; /* entry index within in-memory table */ + u32 addr[4]; /* next hop IP or IPv6 address */ + int ifindex; /* neighbor's net_device's ifindex */ + struct neighbour *neigh; /* associated neighbour */ + struct l2t_entry *first; /* start of hash chain */ + struct l2t_entry *next; /* next l2t_entry on chain */ + struct sk_buff_head arpq; /* packet queue awaiting resolution */ + spinlock_t lock; + atomic_t refcnt; /* entry reference count */ + u16 hash; /* hash bucket the entry is on */ + u16 vlan; /* VLAN TCI (id: bits 0-11, prio: 13-15 */ + u8 v6; /* whether entry is for IPv6 */ + u8 lport; /* associated offload logical interface */ + u8 dmac[ETH_ALEN]; /* neighbour's MAC address */ +}; + +typedef void (*arp_err_handler_t)(void *handle, struct sk_buff *skb); + +/* + * Callback stored in an skb to handle address resolution failure. + */ +struct l2t_skb_cb { + void *handle; + arp_err_handler_t arp_err_handler; +}; + +#define L2T_SKB_CB(skb) ((struct l2t_skb_cb *)(skb)->cb) + +static inline void t4_set_arp_err_handler(struct sk_buff *skb, void *handle, + arp_err_handler_t handler) +{ + L2T_SKB_CB(skb)->handle = handle; + L2T_SKB_CB(skb)->arp_err_handler = handler; +} + +void cxgb4_l2t_release(struct l2t_entry *e); +int cxgb4_l2t_send(struct net_device *dev, struct sk_buff *skb, + struct l2t_entry *e); +struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh, + const struct net_device *physdev, + unsigned int priority); +u64 cxgb4_select_ntuple(struct net_device *dev, + const struct l2t_entry *l2t); +struct l2t_entry *cxgb4_l2t_alloc_switching(struct net_device *dev, u16 vlan, + u8 port, u8 *dmac); +void t4_l2t_update(struct adapter *adap, struct neighbour *neigh); +struct l2t_entry *t4_l2t_alloc_switching(struct adapter *adap, u16 vlan, + u8 port, u8 *dmac); +struct l2t_data *t4_init_l2t(unsigned int l2t_start, unsigned int l2t_end); +void do_l2t_write_rpl(struct adapter *p, const struct cpl_l2t_write_rpl *rpl); + +extern const struct file_operations t4_l2t_fops; +#endif /* __CXGB4_L2T_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/sched.c b/drivers/net/ethernet/chelsio/cxgb4/sched.c new file mode 100644 index 0000000..9148abb --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/sched.c @@ -0,0 +1,553 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "cxgb4.h" +#include "sched.h" + +/* Spinlock must be held by caller */ +static int t4_sched_class_fw_cmd(struct port_info *pi, + struct ch_sched_params *p, + enum sched_fw_ops op) +{ + struct adapter *adap = pi->adapter; + struct sched_table *s = pi->sched_tbl; + struct sched_class *e; + int err = 0; + + e = &s->tab[p->u.params.class]; + switch (op) { + case SCHED_FW_OP_ADD: + err = t4_sched_params(adap, p->type, + p->u.params.level, p->u.params.mode, + p->u.params.rateunit, + p->u.params.ratemode, + p->u.params.channel, e->idx, + p->u.params.minrate, p->u.params.maxrate, + p->u.params.weight, p->u.params.pktsize); + break; + default: + err = -ENOTSUPP; + break; + } + + return err; +} + +/* Spinlock must be held by caller */ +static int t4_sched_bind_unbind_op(struct port_info *pi, void *arg, + enum sched_bind_type type, bool bind) +{ + struct adapter *adap = pi->adapter; + u32 fw_mnem, fw_class, fw_param; + unsigned int pf = adap->pf; + unsigned int vf = 0; + int err = 0; + + switch (type) { + case SCHED_QUEUE: { + struct sched_queue_entry *qe; + + qe = (struct sched_queue_entry *)arg; + + /* Create a template for the FW_PARAMS_CMD mnemonic and + * value (TX Scheduling Class in this case). + */ + fw_mnem = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) | + FW_PARAMS_PARAM_X_V( + FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH)); + fw_class = bind ? 
qe->param.class : FW_SCHED_CLS_NONE; + fw_param = (fw_mnem | FW_PARAMS_PARAM_YZ_V(qe->cntxt_id)); + + pf = adap->pf; + vf = 0; + break; + } + default: + err = -ENOTSUPP; + goto out; + } + + err = t4_set_params(adap, adap->mbox, pf, vf, 1, &fw_param, &fw_class); + +out: + return err; +} + +static struct sched_class *t4_sched_queue_lookup(struct port_info *pi, + const unsigned int qid, + int *index) +{ + struct sched_table *s = pi->sched_tbl; + struct sched_class *e, *end; + struct sched_class *found = NULL; + int i; + + /* Look for a class with matching bound queue parameters */ + end = &s->tab[s->sched_size]; + for (e = &s->tab[0]; e != end; ++e) { + struct sched_queue_entry *qe; + + i = 0; + if (e->state == SCHED_STATE_UNUSED) + continue; + + list_for_each_entry(qe, &e->queue_list, list) { + if (qe->cntxt_id == qid) { + found = e; + if (index) + *index = i; + break; + } + i++; + } + + if (found) + break; + } + + return found; +} + +static int t4_sched_queue_unbind(struct port_info *pi, struct ch_sched_queue *p) +{ + struct adapter *adap = pi->adapter; + struct sched_class *e; + struct sched_queue_entry *qe = NULL; + struct sge_eth_txq *txq; + unsigned int qid; + int index = -1; + int err = 0; + + if (p->queue < 0 || p->queue >= pi->nqsets) + return -ERANGE; + + txq = &adap->sge.ethtxq[pi->first_qset + p->queue]; + qid = txq->q.cntxt_id; + + /* Find the existing class that the queue is bound to */ + e = t4_sched_queue_lookup(pi, qid, &index); + if (e && index >= 0) { + int i = 0; + + spin_lock(&e->lock); + list_for_each_entry(qe, &e->queue_list, list) { + if (i == index) + break; + i++; + } + err = t4_sched_bind_unbind_op(pi, (void *)qe, SCHED_QUEUE, + false); + if (err) { + spin_unlock(&e->lock); + goto out; + } + + list_del(&qe->list); + kvfree(qe); + if (atomic_dec_and_test(&e->refcnt)) { + e->state = SCHED_STATE_UNUSED; + memset(&e->info, 0, sizeof(e->info)); + } + spin_unlock(&e->lock); + } +out: + return err; +} + +static int t4_sched_queue_bind(struct port_info *pi, struct ch_sched_queue *p) +{ + struct adapter *adap = pi->adapter; + struct sched_table *s = pi->sched_tbl; + struct sched_class *e; + struct sched_queue_entry *qe = NULL; + struct sge_eth_txq *txq; + unsigned int qid; + int err = 0; + + if (p->queue < 0 || p->queue >= pi->nqsets) + return -ERANGE; + + qe = kvzalloc(sizeof(struct sched_queue_entry), GFP_KERNEL); + if (!qe) + return -ENOMEM; + + txq = &adap->sge.ethtxq[pi->first_qset + p->queue]; + qid = txq->q.cntxt_id; + + /* Unbind queue from any existing class */ + err = t4_sched_queue_unbind(pi, p); + if (err) { + kvfree(qe); + goto out; + } + + /* Bind queue to specified class */ + memset(qe, 0, sizeof(*qe)); + qe->cntxt_id = qid; + memcpy(&qe->param, p, sizeof(qe->param)); + + e = &s->tab[qe->param.class]; + spin_lock(&e->lock); + err = t4_sched_bind_unbind_op(pi, (void *)qe, SCHED_QUEUE, true); + if (err) { + kvfree(qe); + spin_unlock(&e->lock); + goto out; + } + + list_add_tail(&qe->list, &e->queue_list); + atomic_inc(&e->refcnt); + spin_unlock(&e->lock); +out: + return err; +} + +static void t4_sched_class_unbind_all(struct port_info *pi, + struct sched_class *e, + enum sched_bind_type type) +{ + if (!e) + return; + + switch (type) { + case SCHED_QUEUE: { + struct sched_queue_entry *qe; + + list_for_each_entry(qe, &e->queue_list, list) + t4_sched_queue_unbind(pi, &qe->param); + break; + } + default: + break; + } +} + +static int t4_sched_class_bind_unbind_op(struct port_info *pi, void *arg, + enum sched_bind_type type, bool bind) +{ + int err = 0; + + if (!arg) 
+ return -EINVAL; + + switch (type) { + case SCHED_QUEUE: { + struct ch_sched_queue *qe = (struct ch_sched_queue *)arg; + + if (bind) + err = t4_sched_queue_bind(pi, qe); + else + err = t4_sched_queue_unbind(pi, qe); + break; + } + default: + err = -ENOTSUPP; + break; + } + + return err; +} + +/** + * cxgb4_sched_class_bind - Bind an entity to a scheduling class + * @dev: net_device pointer + * @arg: Entity opaque data + * @type: Entity type (Queue) + * + * Binds an entity (queue) to a scheduling class. If the entity + * is bound to another class, it will be unbound from the other class + * and bound to the class specified in @arg. + */ +int cxgb4_sched_class_bind(struct net_device *dev, void *arg, + enum sched_bind_type type) +{ + struct port_info *pi = netdev2pinfo(dev); + struct sched_table *s; + int err = 0; + u8 class_id; + + if (!can_sched(dev)) + return -ENOTSUPP; + + if (!arg) + return -EINVAL; + + switch (type) { + case SCHED_QUEUE: { + struct ch_sched_queue *qe = (struct ch_sched_queue *)arg; + + class_id = qe->class; + break; + } + default: + return -ENOTSUPP; + } + + if (!valid_class_id(dev, class_id)) + return -EINVAL; + + if (class_id == SCHED_CLS_NONE) + return -ENOTSUPP; + + s = pi->sched_tbl; + write_lock(&s->rw_lock); + err = t4_sched_class_bind_unbind_op(pi, arg, type, true); + write_unlock(&s->rw_lock); + + return err; +} + +/** + * cxgb4_sched_class_unbind - Unbind an entity from a scheduling class + * @dev: net_device pointer + * @arg: Entity opaque data + * @type: Entity type (Queue) + * + * Unbinds an entity (queue) from a scheduling class. + */ +int cxgb4_sched_class_unbind(struct net_device *dev, void *arg, + enum sched_bind_type type) +{ + struct port_info *pi = netdev2pinfo(dev); + struct sched_table *s; + int err = 0; + u8 class_id; + + if (!can_sched(dev)) + return -ENOTSUPP; + + if (!arg) + return -EINVAL; + + switch (type) { + case SCHED_QUEUE: { + struct ch_sched_queue *qe = (struct ch_sched_queue *)arg; + + class_id = qe->class; + break; + } + default: + return -ENOTSUPP; + } + + if (!valid_class_id(dev, class_id)) + return -EINVAL; + + s = pi->sched_tbl; + write_lock(&s->rw_lock); + err = t4_sched_class_bind_unbind_op(pi, arg, type, false); + write_unlock(&s->rw_lock); + + return err; +} + +/* If @p is NULL, fetch any available unused class */ +static struct sched_class *t4_sched_class_lookup(struct port_info *pi, + const struct ch_sched_params *p) +{ + struct sched_table *s = pi->sched_tbl; + struct sched_class *e, *end; + struct sched_class *found = NULL; + + if (!p) { + /* Get any available unused class */ + end = &s->tab[s->sched_size]; + for (e = &s->tab[0]; e != end; ++e) { + if (e->state == SCHED_STATE_UNUSED) { + found = e; + break; + } + } + } else { + /* Look for a class with matching scheduling parameters */ + struct ch_sched_params info; + struct ch_sched_params tp; + + memcpy(&tp, p, sizeof(tp)); + /* Don't try to match class parameter */ + tp.u.params.class = SCHED_CLS_NONE; + + end = &s->tab[s->sched_size]; + for (e = &s->tab[0]; e != end; ++e) { + if (e->state == SCHED_STATE_UNUSED) + continue; + + memcpy(&info, &e->info, sizeof(info)); + /* Don't try to match class parameter */ + info.u.params.class = SCHED_CLS_NONE; + + if ((info.type == tp.type) && + (!memcmp(&info.u.params, &tp.u.params, + sizeof(info.u.params)))) { + found = e; + break; + } + } + } + + return found; +} + +static struct sched_class *t4_sched_class_alloc(struct port_info *pi, + struct ch_sched_params *p) +{ + struct sched_table *s = pi->sched_tbl; + struct sched_class 
*e; + u8 class_id; + int err; + + if (!p) + return NULL; + + class_id = p->u.params.class; + + /* Only accept search for existing class with matching params + * or allocation of new class with specified params + */ + if (class_id != SCHED_CLS_NONE) + return NULL; + + write_lock(&s->rw_lock); + /* See if there's an exisiting class with same + * requested sched params + */ + e = t4_sched_class_lookup(pi, p); + if (!e) { + struct ch_sched_params np; + + /* Fetch any available unused class */ + e = t4_sched_class_lookup(pi, NULL); + if (!e) + goto out; + + memcpy(&np, p, sizeof(np)); + np.u.params.class = e->idx; + + spin_lock(&e->lock); + /* New class */ + err = t4_sched_class_fw_cmd(pi, &np, SCHED_FW_OP_ADD); + if (err) { + spin_unlock(&e->lock); + e = NULL; + goto out; + } + memcpy(&e->info, &np, sizeof(e->info)); + atomic_set(&e->refcnt, 0); + e->state = SCHED_STATE_ACTIVE; + spin_unlock(&e->lock); + } + +out: + write_unlock(&s->rw_lock); + return e; +} + +/** + * cxgb4_sched_class_alloc - allocate a scheduling class + * @dev: net_device pointer + * @p: new scheduling class to create. + * + * Returns pointer to the scheduling class created. If @p is NULL, then + * it allocates and returns any available unused scheduling class. If a + * scheduling class with matching @p is found, then the matching class is + * returned. + */ +struct sched_class *cxgb4_sched_class_alloc(struct net_device *dev, + struct ch_sched_params *p) +{ + struct port_info *pi = netdev2pinfo(dev); + u8 class_id; + + if (!can_sched(dev)) + return NULL; + + class_id = p->u.params.class; + if (!valid_class_id(dev, class_id)) + return NULL; + + return t4_sched_class_alloc(pi, p); +} + +static void t4_sched_class_free(struct port_info *pi, struct sched_class *e) +{ + t4_sched_class_unbind_all(pi, e, SCHED_QUEUE); +} + +struct sched_table *t4_init_sched(unsigned int sched_size) +{ + struct sched_table *s; + unsigned int i; + + s = kvzalloc(sizeof(*s) + sched_size * sizeof(struct sched_class), GFP_KERNEL); + if (!s) + return NULL; + + s->sched_size = sched_size; + rwlock_init(&s->rw_lock); + + for (i = 0; i < s->sched_size; i++) { + memset(&s->tab[i], 0, sizeof(struct sched_class)); + s->tab[i].idx = i; + s->tab[i].state = SCHED_STATE_UNUSED; + INIT_LIST_HEAD(&s->tab[i].queue_list); + spin_lock_init(&s->tab[i].lock); + atomic_set(&s->tab[i].refcnt, 0); + } + return s; +} + +void t4_cleanup_sched(struct adapter *adap) +{ + struct sched_table *s; + unsigned int j, i; + + for_each_port(adap, j) { + struct port_info *pi = netdev2pinfo(adap->port[j]); + + s = pi->sched_tbl; + for (i = 0; i < s->sched_size; i++) { + struct sched_class *e; + + write_lock(&s->rw_lock); + e = &s->tab[i]; + if (e->state == SCHED_STATE_ACTIVE) + t4_sched_class_free(pi, e); + write_unlock(&s->rw_lock); + } + kvfree(s); + } +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/sched.h b/drivers/net/ethernet/chelsio/cxgb4/sched.h new file mode 100644 index 0000000..3a49e00 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/sched.h @@ -0,0 +1,110 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_SCHED_H +#define __CXGB4_SCHED_H + +#include +#include + +#define SCHED_CLS_NONE 0xff + +#define FW_SCHED_CLS_NONE 0xffffffff + +/* Max rate that can be set to a scheduling class is 100 Gbps */ +#define SCHED_MAX_RATE_KBPS 100000000U + +enum { + SCHED_STATE_ACTIVE, + SCHED_STATE_UNUSED, +}; + +enum sched_fw_ops { + SCHED_FW_OP_ADD, +}; + +enum sched_bind_type { + SCHED_QUEUE, +}; + +struct sched_queue_entry { + struct list_head list; + unsigned int cntxt_id; + struct ch_sched_queue param; +}; + +struct sched_class { + u8 state; + u8 idx; + struct ch_sched_params info; + struct list_head queue_list; + spinlock_t lock; /* Per class lock */ + atomic_t refcnt; +}; + +struct sched_table { /* per port scheduling table */ + u8 sched_size; + rwlock_t rw_lock; /* Table lock */ + struct sched_class tab[0]; +}; + +static inline bool can_sched(struct net_device *dev) +{ + struct port_info *pi = netdev2pinfo(dev); + + return !pi->sched_tbl ? false : true; +} + +static inline bool valid_class_id(struct net_device *dev, u8 class_id) +{ + struct port_info *pi = netdev2pinfo(dev); + + if ((class_id > pi->sched_tbl->sched_size - 1) && + (class_id != SCHED_CLS_NONE)) + return false; + + return true; +} + +int cxgb4_sched_class_bind(struct net_device *dev, void *arg, + enum sched_bind_type type); +int cxgb4_sched_class_unbind(struct net_device *dev, void *arg, + enum sched_bind_type type); + +struct sched_class *cxgb4_sched_class_alloc(struct net_device *dev, + struct ch_sched_params *p); + +struct sched_table *t4_init_sched(unsigned int size); +void t4_cleanup_sched(struct adapter *adap); +#endif /* __CXGB4_SCHED_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c new file mode 100644 index 0000000..1a28df1 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -0,0 +1,3715 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_CHELSIO_T4_FCOE +#include +#endif /* CONFIG_CHELSIO_T4_FCOE */ +#include "cxgb4.h" +#include "t4_regs.h" +#include "t4_values.h" +#include "t4_msg.h" +#include "t4fw_api.h" +#include "cxgb4_ptp.h" +#include "cxgb4_uld.h" + +/* + * Rx buffer size. We use largish buffers if possible but settle for single + * pages under memory shortage. + */ +#if PAGE_SHIFT >= 16 +# define FL_PG_ORDER 0 +#else +# define FL_PG_ORDER (16 - PAGE_SHIFT) +#endif + +/* RX_PULL_LEN should be <= RX_COPY_THRES */ +#define RX_COPY_THRES 256 +#define RX_PULL_LEN 128 + +/* + * Main body length for sk_buffs used for Rx Ethernet packets with fragments. + * Should be >= RX_PULL_LEN but possibly bigger to give pskb_may_pull some room. + */ +#define RX_PKT_SKB_LEN 512 + +/* + * Max number of Tx descriptors we clean up at a time. Should be modest as + * freeing skbs isn't cheap and it happens while holding locks. We just need + * to free packets faster than they arrive, we eventually catch up and keep + * the amortized cost reasonable. Must be >= 2 * TXQ_STOP_THRES. + */ +#define MAX_TX_RECLAIM 16 + +/* + * Max number of Rx buffers we replenish at a time. Again keep this modest, + * allocating buffers isn't cheap either. + */ +#define MAX_RX_REFILL 16U + +/* + * Period of the Rx queue check timer. This timer is infrequent as it has + * something to do only when the system experiences severe memory shortage. + */ +#define RX_QCHECK_PERIOD (HZ / 2) + +/* + * Period of the Tx queue check timer. + */ +#define TX_QCHECK_PERIOD (HZ / 2) + +/* + * Max number of Tx descriptors to be reclaimed by the Tx timer. + */ +#define MAX_TIMER_TX_RECLAIM 100 + +/* + * Timer index used when backing off due to memory shortage. + */ +#define NOMEM_TMR_IDX (SGE_NTIMERS - 1) + +/* + * Suspension threshold for non-Ethernet Tx queues. We require enough room + * for a full sized WR. + */ +#define TXQ_STOP_THRES (SGE_MAX_WR_LEN / sizeof(struct tx_desc)) + +/* + * Max Tx descriptor space we allow for an Ethernet packet to be inlined + * into a WR. + */ +#define MAX_IMM_TX_PKT_LEN 256 + +/* + * Max size of a WR sent through a control Tx queue. 
+ */ +#define MAX_CTRL_WR_LEN SGE_MAX_WR_LEN + +struct rx_sw_desc { /* SW state per Rx descriptor */ + struct page *page; + dma_addr_t dma_addr; +}; + +/* + * Rx buffer sizes for "useskbs" Free List buffers (one ingress packet pe skb + * buffer). We currently only support two sizes for 1500- and 9000-byte MTUs. + * We could easily support more but there doesn't seem to be much need for + * that ... + */ +#define FL_MTU_SMALL 1500 +#define FL_MTU_LARGE 9000 + +static inline unsigned int fl_mtu_bufsize(struct adapter *adapter, + unsigned int mtu) +{ + struct sge *s = &adapter->sge; + + return ALIGN(s->pktshift + ETH_HLEN + VLAN_HLEN + mtu, s->fl_align); +} + +#define FL_MTU_SMALL_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_SMALL) +#define FL_MTU_LARGE_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_LARGE) + +/* + * Bits 0..3 of rx_sw_desc.dma_addr have special meaning. The hardware uses + * these to specify the buffer size as an index into the SGE Free List Buffer + * Size register array. We also use bit 4, when the buffer has been unmapped + * for DMA, but this is of course never sent to the hardware and is only used + * to prevent double unmappings. All of the above requires that the Free List + * Buffers which we allocate have the bottom 5 bits free (0) -- i.e. are + * 32-byte or or a power of 2 greater in alignment. Since the SGE's minimal + * Free List Buffer alignment is 32 bytes, this works out for us ... + */ +enum { + RX_BUF_FLAGS = 0x1f, /* bottom five bits are special */ + RX_BUF_SIZE = 0x0f, /* bottom three bits are for buf sizes */ + RX_UNMAPPED_BUF = 0x10, /* buffer is not mapped */ + + /* + * XXX We shouldn't depend on being able to use these indices. + * XXX Especially when some other Master PF has initialized the + * XXX adapter or we use the Firmware Configuration File. We + * XXX should really search through the Host Buffer Size register + * XXX array for the appropriately sized buffer indices. + */ + RX_SMALL_PG_BUF = 0x0, /* small (PAGE_SIZE) page buffer */ + RX_LARGE_PG_BUF = 0x1, /* buffer large (FL_PG_ORDER) page buffer */ + + RX_SMALL_MTU_BUF = 0x2, /* small MTU buffer */ + RX_LARGE_MTU_BUF = 0x3, /* large MTU buffer */ +}; + +static int timer_pkt_quota[] = {1, 1, 2, 3, 4, 5}; +#define MIN_NAPI_WORK 1 + +static inline dma_addr_t get_buf_addr(const struct rx_sw_desc *d) +{ + return d->dma_addr & ~(dma_addr_t)RX_BUF_FLAGS; +} + +static inline bool is_buf_mapped(const struct rx_sw_desc *d) +{ + return !(d->dma_addr & RX_UNMAPPED_BUF); +} + +/** + * txq_avail - return the number of available slots in a Tx queue + * @q: the Tx queue + * + * Returns the number of descriptors in a Tx queue available to write new + * packets. + */ +static inline unsigned int txq_avail(const struct sge_txq *q) +{ + return q->size - 1 - q->in_use; +} + +/** + * fl_cap - return the capacity of a free-buffer list + * @fl: the FL + * + * Returns the capacity of a free-buffer list. The capacity is less than + * the size because one descriptor needs to be left unpopulated, otherwise + * HW will think the FL is empty. + */ +static inline unsigned int fl_cap(const struct sge_fl *fl) +{ + return fl->size - 8; /* 1 descriptor = 8 buffers */ +} + +/** + * fl_starving - return whether a Free List is starving. + * @adapter: pointer to the adapter + * @fl: the Free List + * + * Tests specified Free List to see whether the number of buffers + * available to the hardware has falled below our "starvation" + * threshold. 
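+ *
+ * Only buffers actually handed to the hardware count, so the test is on
+ * avail - pend_cred. For instance, with an illustrative fl_starve_thres
+ * of 64, a list with avail = 80 and pend_cred = 32 leaves the hardware
+ * just 48 buffers and is therefore treated as starving.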
+ */ +static inline bool fl_starving(const struct adapter *adapter, + const struct sge_fl *fl) +{ + const struct sge *s = &adapter->sge; + + return fl->avail - fl->pend_cred <= s->fl_starve_thres; +} + +int cxgb4_map_skb(struct device *dev, const struct sk_buff *skb, + dma_addr_t *addr) +{ + const skb_frag_t *fp, *end; + const struct skb_shared_info *si; + + *addr = dma_map_single(dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE); + if (dma_mapping_error(dev, *addr)) + goto out_err; + + si = skb_shinfo(skb); + end = &si->frags[si->nr_frags]; + + for (fp = si->frags; fp < end; fp++) { + *++addr = skb_frag_dma_map(dev, fp, 0, skb_frag_size(fp), + DMA_TO_DEVICE); + if (dma_mapping_error(dev, *addr)) + goto unwind; + } + return 0; + +unwind: + while (fp-- > si->frags) + dma_unmap_page(dev, *--addr, skb_frag_size(fp), DMA_TO_DEVICE); + + dma_unmap_single(dev, addr[-1], skb_headlen(skb), DMA_TO_DEVICE); +out_err: + return -ENOMEM; +} +EXPORT_SYMBOL(cxgb4_map_skb); + +#ifdef CONFIG_NEED_DMA_MAP_STATE +static void unmap_skb(struct device *dev, const struct sk_buff *skb, + const dma_addr_t *addr) +{ + const skb_frag_t *fp, *end; + const struct skb_shared_info *si; + + dma_unmap_single(dev, *addr++, skb_headlen(skb), DMA_TO_DEVICE); + + si = skb_shinfo(skb); + end = &si->frags[si->nr_frags]; + for (fp = si->frags; fp < end; fp++) + dma_unmap_page(dev, *addr++, skb_frag_size(fp), DMA_TO_DEVICE); +} + +/** + * deferred_unmap_destructor - unmap a packet when it is freed + * @skb: the packet + * + * This is the packet destructor used for Tx packets that need to remain + * mapped until they are freed rather than until their Tx descriptors are + * freed. + */ +static void deferred_unmap_destructor(struct sk_buff *skb) +{ + unmap_skb(skb->dev->dev.parent, skb, (dma_addr_t *)skb->head); +} +#endif + +static void unmap_sgl(struct device *dev, const struct sk_buff *skb, + const struct ulptx_sgl *sgl, const struct sge_txq *q) +{ + const struct ulptx_sge_pair *p; + unsigned int nfrags = skb_shinfo(skb)->nr_frags; + + if (likely(skb_headlen(skb))) + dma_unmap_single(dev, be64_to_cpu(sgl->addr0), ntohl(sgl->len0), + DMA_TO_DEVICE); + else { + dma_unmap_page(dev, be64_to_cpu(sgl->addr0), ntohl(sgl->len0), + DMA_TO_DEVICE); + nfrags--; + } + + /* + * the complexity below is because of the possibility of a wrap-around + * in the middle of an SGL + */ + for (p = sgl->sge; nfrags >= 2; nfrags -= 2) { + if (likely((u8 *)(p + 1) <= (u8 *)q->stat)) { +unmap: dma_unmap_page(dev, be64_to_cpu(p->addr[0]), + ntohl(p->len[0]), DMA_TO_DEVICE); + dma_unmap_page(dev, be64_to_cpu(p->addr[1]), + ntohl(p->len[1]), DMA_TO_DEVICE); + p++; + } else if ((u8 *)p == (u8 *)q->stat) { + p = (const struct ulptx_sge_pair *)q->desc; + goto unmap; + } else if ((u8 *)p + 8 == (u8 *)q->stat) { + const __be64 *addr = (const __be64 *)q->desc; + + dma_unmap_page(dev, be64_to_cpu(addr[0]), + ntohl(p->len[0]), DMA_TO_DEVICE); + dma_unmap_page(dev, be64_to_cpu(addr[1]), + ntohl(p->len[1]), DMA_TO_DEVICE); + p = (const struct ulptx_sge_pair *)&addr[2]; + } else { + const __be64 *addr = (const __be64 *)q->desc; + + dma_unmap_page(dev, be64_to_cpu(p->addr[0]), + ntohl(p->len[0]), DMA_TO_DEVICE); + dma_unmap_page(dev, be64_to_cpu(addr[0]), + ntohl(p->len[1]), DMA_TO_DEVICE); + p = (const struct ulptx_sge_pair *)&addr[1]; + } + } + if (nfrags) { + __be64 addr; + + if ((u8 *)p == (u8 *)q->stat) + p = (const struct ulptx_sge_pair *)q->desc; + addr = (u8 *)p + 16 <= (u8 *)q->stat ? 
p->addr[0] : + *(const __be64 *)q->desc; + dma_unmap_page(dev, be64_to_cpu(addr), ntohl(p->len[0]), + DMA_TO_DEVICE); + } +} + +/** + * free_tx_desc - reclaims Tx descriptors and their buffers + * @adapter: the adapter + * @q: the Tx queue to reclaim descriptors from + * @n: the number of descriptors to reclaim + * @unmap: whether the buffers should be unmapped for DMA + * + * Reclaims Tx descriptors from an SGE Tx queue and frees the associated + * Tx buffers. Called with the Tx queue lock held. + */ +void free_tx_desc(struct adapter *adap, struct sge_txq *q, + unsigned int n, bool unmap) +{ + struct tx_sw_desc *d; + unsigned int cidx = q->cidx; + struct device *dev = adap->pdev_dev; + + d = &q->sdesc[cidx]; + while (n--) { + if (d->skb) { /* an SGL is present */ + if (unmap) + unmap_sgl(dev, d->skb, d->sgl, q); + dev_consume_skb_any(d->skb); + d->skb = NULL; + } + ++d; + if (++cidx == q->size) { + cidx = 0; + d = q->sdesc; + } + } + q->cidx = cidx; +} + +/* + * Return the number of reclaimable descriptors in a Tx queue. + */ +static inline int reclaimable(const struct sge_txq *q) +{ + int hw_cidx = ntohs(READ_ONCE(q->stat->cidx)); + hw_cidx -= q->cidx; + return hw_cidx < 0 ? hw_cidx + q->size : hw_cidx; +} + +/** + * cxgb4_reclaim_completed_tx - reclaims completed Tx descriptors + * @adap: the adapter + * @q: the Tx queue to reclaim completed descriptors from + * @unmap: whether the buffers should be unmapped for DMA + * + * Reclaims Tx descriptors that the SGE has indicated it has processed, + * and frees the associated buffers if possible. Called with the Tx + * queue locked. + */ +inline void cxgb4_reclaim_completed_tx(struct adapter *adap, struct sge_txq *q, + bool unmap) +{ + int avail = reclaimable(q); + + if (avail) { + /* + * Limit the amount of clean up work we do at a time to keep + * the Tx lock hold time O(1). + */ + if (avail > MAX_TX_RECLAIM) + avail = MAX_TX_RECLAIM; + + free_tx_desc(adap, q, avail, unmap); + q->in_use -= avail; + } +} +EXPORT_SYMBOL(cxgb4_reclaim_completed_tx); + +static inline int get_buf_size(struct adapter *adapter, + const struct rx_sw_desc *d) +{ + struct sge *s = &adapter->sge; + unsigned int rx_buf_size_idx = d->dma_addr & RX_BUF_SIZE; + int buf_size; + + switch (rx_buf_size_idx) { + case RX_SMALL_PG_BUF: + buf_size = PAGE_SIZE; + break; + + case RX_LARGE_PG_BUF: + buf_size = PAGE_SIZE << s->fl_pg_order; + break; + + case RX_SMALL_MTU_BUF: + buf_size = FL_MTU_SMALL_BUFSIZE(adapter); + break; + + case RX_LARGE_MTU_BUF: + buf_size = FL_MTU_LARGE_BUFSIZE(adapter); + break; + + default: + BUG_ON(1); + } + + return buf_size; +} + +/** + * free_rx_bufs - free the Rx buffers on an SGE free list + * @adap: the adapter + * @q: the SGE free list to free buffers from + * @n: how many buffers to free + * + * Release the next @n buffers on an SGE free-buffer Rx queue. The + * buffers must be made inaccessible to HW before calling this function. + */ +static void free_rx_bufs(struct adapter *adap, struct sge_fl *q, int n) +{ + while (n--) { + struct rx_sw_desc *d = &q->sdesc[q->cidx]; + + if (is_buf_mapped(d)) + dma_unmap_page(adap->pdev_dev, get_buf_addr(d), + get_buf_size(adap, d), + PCI_DMA_FROMDEVICE); + put_page(d->page); + d->page = NULL; + if (++q->cidx == q->size) + q->cidx = 0; + q->avail--; + } +} + +/** + * unmap_rx_buf - unmap the current Rx buffer on an SGE free list + * @adap: the adapter + * @q: the SGE free list + * + * Unmap the current buffer on an SGE free-buffer Rx queue. 
The + * buffer must be made inaccessible to HW before calling this function. + * + * This is similar to @free_rx_bufs above but does not free the buffer. + * Do note that the FL still loses any further access to the buffer. + */ +static void unmap_rx_buf(struct adapter *adap, struct sge_fl *q) +{ + struct rx_sw_desc *d = &q->sdesc[q->cidx]; + + if (is_buf_mapped(d)) + dma_unmap_page(adap->pdev_dev, get_buf_addr(d), + get_buf_size(adap, d), PCI_DMA_FROMDEVICE); + d->page = NULL; + if (++q->cidx == q->size) + q->cidx = 0; + q->avail--; +} + +static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q) +{ + if (q->pend_cred >= 8) { + u32 val = adap->params.arch.sge_fl_db; + + if (is_t4(adap->params.chip)) + val |= PIDX_V(q->pend_cred / 8); + else + val |= PIDX_T5_V(q->pend_cred / 8); + + /* Make sure all memory writes to the Free List queue are + * committed before we tell the hardware about them. + */ + wmb(); + + /* If we don't have access to the new User Doorbell (T5+), use + * the old doorbell mechanism; otherwise use the new BAR2 + * mechanism. + */ + if (unlikely(q->bar2_addr == NULL)) { + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A), + val | QID_V(q->cntxt_id)); + } else { + writel(val | QID_V(q->bar2_qid), + q->bar2_addr + SGE_UDB_KDOORBELL); + + /* This Write memory Barrier will force the write to + * the User Doorbell area to be flushed. + */ + wmb(); + } + q->pend_cred &= 7; + } +} + +static inline void set_rx_sw_desc(struct rx_sw_desc *sd, struct page *pg, + dma_addr_t mapping) +{ + sd->page = pg; + sd->dma_addr = mapping; /* includes size low bits */ +} + +/** + * refill_fl - refill an SGE Rx buffer ring + * @adap: the adapter + * @q: the ring to refill + * @n: the number of new buffers to allocate + * @gfp: the gfp flags for the allocations + * + * (Re)populate an SGE free-buffer queue with up to @n new packet buffers, + * allocated with the supplied gfp flags. The caller must assure that + * @n does not exceed the queue's capacity. If afterwards the queue is + * found critically low mark it as starving in the bitmap of starving FLs. + * + * Returns the number of buffers allocated. 
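+ *
+ * In the normal Rx path the ring is topped up via __refill_fl(), which
+ * asks for at most min(MAX_RX_REFILL, fl_cap(fl) - fl->avail) buffers
+ * with GFP_ATOMIC, so a single call never replenishes more than 16
+ * buffers at a time.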
+ */ +static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, + gfp_t gfp) +{ + struct sge *s = &adap->sge; + struct page *pg; + dma_addr_t mapping; + unsigned int cred = q->avail; + __be64 *d = &q->desc[q->pidx]; + struct rx_sw_desc *sd = &q->sdesc[q->pidx]; + int node; + +#ifdef CONFIG_DEBUG_FS + if (test_bit(q->cntxt_id - adap->sge.egr_start, adap->sge.blocked_fl)) + goto out; +#endif + + gfp |= __GFP_NOWARN; + node = dev_to_node(adap->pdev_dev); + + if (s->fl_pg_order == 0) + goto alloc_small_pages; + + /* + * Prefer large buffers + */ + while (n) { + pg = alloc_pages_node(node, gfp | __GFP_COMP, s->fl_pg_order); + if (unlikely(!pg)) { + q->large_alloc_failed++; + break; /* fall back to single pages */ + } + + mapping = dma_map_page(adap->pdev_dev, pg, 0, + PAGE_SIZE << s->fl_pg_order, + PCI_DMA_FROMDEVICE); + if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) { + __free_pages(pg, s->fl_pg_order); + q->mapping_err++; + goto out; /* do not try small pages for this error */ + } + mapping |= RX_LARGE_PG_BUF; + *d++ = cpu_to_be64(mapping); + + set_rx_sw_desc(sd, pg, mapping); + sd++; + + q->avail++; + if (++q->pidx == q->size) { + q->pidx = 0; + sd = q->sdesc; + d = q->desc; + } + n--; + } + +alloc_small_pages: + while (n--) { + pg = alloc_pages_node(node, gfp, 0); + if (unlikely(!pg)) { + q->alloc_failed++; + break; + } + + mapping = dma_map_page(adap->pdev_dev, pg, 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) { + put_page(pg); + q->mapping_err++; + goto out; + } + *d++ = cpu_to_be64(mapping); + + set_rx_sw_desc(sd, pg, mapping); + sd++; + + q->avail++; + if (++q->pidx == q->size) { + q->pidx = 0; + sd = q->sdesc; + d = q->desc; + } + } + +out: cred = q->avail - cred; + q->pend_cred += cred; + ring_fl_db(adap, q); + + if (unlikely(fl_starving(adap, q))) { + smp_wmb(); + q->low++; + set_bit(q->cntxt_id - adap->sge.egr_start, + adap->sge.starving_fl); + } + + return cred; +} + +static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl) +{ + refill_fl(adap, fl, min(MAX_RX_REFILL, fl_cap(fl) - fl->avail), + GFP_ATOMIC); +} + +/** + * alloc_ring - allocate resources for an SGE descriptor ring + * @dev: the PCI device's core device + * @nelem: the number of descriptors + * @elem_size: the size of each descriptor + * @sw_size: the size of the SW state associated with each ring element + * @phys: the physical address of the allocated ring + * @metadata: address of the array holding the SW state for the ring + * @stat_size: extra space in HW ring for status information + * @node: preferred node for memory allocations + * + * Allocates resources for an SGE descriptor ring, such as Tx queues, + * free buffer lists, or response queues. Each SGE ring requires + * space for its HW descriptors plus, optionally, space for the SW state + * associated with each HW entry (the metadata). The function returns + * three values: the virtual address for the HW ring (the return value + * of the function), the bus address of the HW ring, and the address + * of the SW ring. 
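+ *
+ * A caller typically hands the three returned addresses straight to its
+ * queue structure; a rough sketch for a free list (field names and sizes
+ * illustrative only) would be:
+ *
+ *	fl->desc = alloc_ring(adap->pdev_dev, fl->size, sizeof(__be64),
+ *			      sizeof(struct rx_sw_desc), &fl->addr,
+ *			      &fl->sdesc, stat_len, node);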
+ */ +static void *alloc_ring(struct device *dev, size_t nelem, size_t elem_size, + size_t sw_size, dma_addr_t *phys, void *metadata, + size_t stat_size, int node) +{ + size_t len = nelem * elem_size + stat_size; + void *s = NULL; + void *p = dma_alloc_coherent(dev, len, phys, GFP_KERNEL); + + if (!p) + return NULL; + if (sw_size) { + s = kzalloc_node(nelem * sw_size, GFP_KERNEL, node); + + if (!s) { + dma_free_coherent(dev, len, p, *phys); + return NULL; + } + } + if (metadata) + *(void **)metadata = s; + memset(p, 0, len); + return p; +} + +/** + * sgl_len - calculates the size of an SGL of the given capacity + * @n: the number of SGL entries + * + * Calculates the number of flits needed for a scatter/gather list that + * can hold the given number of entries. + */ +static inline unsigned int sgl_len(unsigned int n) +{ + /* A Direct Scatter Gather List uses 32-bit lengths and 64-bit PCI DMA + * addresses. The DSGL Work Request starts off with a 32-bit DSGL + * ULPTX header, then Length0, then Address0, then, for 1 <= i <= N, + * repeated sequences of { Length[i], Length[i+1], Address[i], + * Address[i+1] } (this ensures that all addresses are on 64-bit + * boundaries). If N is even, then Length[N+1] should be set to 0 and + * Address[N+1] is omitted. + * + * The following calculation incorporates all of the above. It's + * somewhat hard to follow but, briefly: the "+2" accounts for the + * first two flits which include the DSGL header, Length0 and + * Address0; the "(3*(n-1))/2" covers the main body of list entries (3 + * flits for every pair of the remaining N) +1 if (n-1) is odd; and + * finally the "+((n-1)&1)" adds the one remaining flit needed if + * (n-1) is odd ... + */ + n--; + return (3 * n) / 2 + (n & 1) + 2; +} + +/** + * flits_to_desc - returns the num of Tx descriptors for the given flits + * @n: the number of flits + * + * Returns the number of Tx descriptors needed for the supplied number + * of flits. + */ +static inline unsigned int flits_to_desc(unsigned int n) +{ + BUG_ON(n > SGE_MAX_WR_LEN / 8); + return DIV_ROUND_UP(n, 8); +} + +/** + * is_eth_imm - can an Ethernet packet be sent as immediate data? + * @skb: the packet + * + * Returns whether an Ethernet packet is small enough to fit as + * immediate data. Return value corresponds to headroom required. + */ +static inline int is_eth_imm(const struct sk_buff *skb, unsigned int chip_ver) +{ + int hdrlen = 0; + + if (skb->encapsulation && skb_shinfo(skb)->gso_size && + chip_ver > CHELSIO_T5) { + hdrlen = sizeof(struct cpl_tx_tnl_lso); + hdrlen += sizeof(struct cpl_tx_pkt_core); + } else { + hdrlen = skb_shinfo(skb)->gso_size ? + sizeof(struct cpl_tx_pkt_lso_core) : 0; + hdrlen += sizeof(struct cpl_tx_pkt); + } + if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen) + return hdrlen; + return 0; +} + +/** + * calc_tx_flits - calculate the number of flits for a packet Tx WR + * @skb: the packet + * + * Returns the number of flits needed for a Tx WR for the given Ethernet + * packet, including the needed WR and CPL headers. + */ +static inline unsigned int calc_tx_flits(const struct sk_buff *skb, + unsigned int chip_ver) +{ + unsigned int flits; + int hdrlen = is_eth_imm(skb, chip_ver); + + /* If the skb is small enough, we can pump it out as a work request + * with only immediate data. In that case we just have to have the + * TX Packet header plus the skb data in the Work Request. 
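+ *
+ * Each flit is 8 bytes, so the immediate-data case works out to
+ * DIV_ROUND_UP(skb->len + hdrlen, 8); e.g. a 100-byte packet with an
+ * (illustrative) 16-byte CPL header needs ceil(116 / 8) = 15 flits.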
+ */ + + if (hdrlen) + return DIV_ROUND_UP(skb->len + hdrlen, sizeof(__be64)); + + /* Otherwise, we're going to have to construct a Scatter gather list + * of the skb body and fragments. We also include the flits necessary + * for the TX Packet Work Request and CPL. We always have a firmware + * Write Header (incorporated as part of the cpl_tx_pkt_lso and + * cpl_tx_pkt structures), followed by either a TX Packet Write CPL + * message or, if we're doing a Large Send Offload, an LSO CPL message + * with an embedded TX Packet Write CPL message. + */ + flits = sgl_len(skb_shinfo(skb)->nr_frags + 1); + if (skb_shinfo(skb)->gso_size) { + if (skb->encapsulation && chip_ver > CHELSIO_T5) + hdrlen = sizeof(struct fw_eth_tx_pkt_wr) + + sizeof(struct cpl_tx_tnl_lso); + else + hdrlen = sizeof(struct fw_eth_tx_pkt_wr) + + sizeof(struct cpl_tx_pkt_lso_core); + + hdrlen += sizeof(struct cpl_tx_pkt_core); + flits += (hdrlen / sizeof(__be64)); + } else { + flits += (sizeof(struct fw_eth_tx_pkt_wr) + + sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64); + } + return flits; +} + +/** + * calc_tx_descs - calculate the number of Tx descriptors for a packet + * @skb: the packet + * + * Returns the number of Tx descriptors needed for the given Ethernet + * packet, including the needed WR and CPL headers. + */ +static inline unsigned int calc_tx_descs(const struct sk_buff *skb, + unsigned int chip_ver) +{ + return flits_to_desc(calc_tx_flits(skb, chip_ver)); +} + +/** + * cxgb4_write_sgl - populate a scatter/gather list for a packet + * @skb: the packet + * @q: the Tx queue we are writing into + * @sgl: starting location for writing the SGL + * @end: points right after the end of the SGL + * @start: start offset into skb main-body data to include in the SGL + * @addr: the list of bus addresses for the SGL elements + * + * Generates a gather list for the buffers that make up a packet. + * The caller must provide adequate space for the SGL that will be written. + * The SGL includes all of the packet's page fragments and the data in its + * main body except for the first @start bytes. @sgl must be 16-byte + * aligned and within a Tx descriptor with available space. @end points + * right after the end of the SGL but does not account for any potential + * wrap around, i.e., @end > @sgl. + */ +void cxgb4_write_sgl(const struct sk_buff *skb, struct sge_txq *q, + struct ulptx_sgl *sgl, u64 *end, unsigned int start, + const dma_addr_t *addr) +{ + unsigned int i, len; + struct ulptx_sge_pair *to; + const struct skb_shared_info *si = skb_shinfo(skb); + unsigned int nfrags = si->nr_frags; + struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1]; + + len = skb_headlen(skb) - start; + if (likely(len)) { + sgl->len0 = htonl(len); + sgl->addr0 = cpu_to_be64(addr[0] + start); + nfrags++; + } else { + sgl->len0 = htonl(skb_frag_size(&si->frags[0])); + sgl->addr0 = cpu_to_be64(addr[1]); + } + + sgl->cmd_nsge = htonl(ULPTX_CMD_V(ULP_TX_SC_DSGL) | + ULPTX_NSGE_V(nfrags)); + if (likely(--nfrags == 0)) + return; + /* + * Most of the complexity below deals with the possibility we hit the + * end of the queue in the middle of writing the SGL. For this case + * only we create the SGL in a temporary buffer and then copy it. + */ + to = (u8 *)end > (u8 *)q->stat ? 
buf : sgl->sge; + + for (i = (nfrags != si->nr_frags); nfrags >= 2; nfrags -= 2, to++) { + to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i])); + to->len[1] = cpu_to_be32(skb_frag_size(&si->frags[++i])); + to->addr[0] = cpu_to_be64(addr[i]); + to->addr[1] = cpu_to_be64(addr[++i]); + } + if (nfrags) { + to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i])); + to->len[1] = cpu_to_be32(0); + to->addr[0] = cpu_to_be64(addr[i + 1]); + } + if (unlikely((u8 *)end > (u8 *)q->stat)) { + unsigned int part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1; + + if (likely(part0)) + memcpy(sgl->sge, buf, part0); + part1 = (u8 *)end - (u8 *)q->stat; + memcpy(q->desc, (u8 *)buf + part0, part1); + end = (void *)q->desc + part1; + } + if ((uintptr_t)end & 8) /* 0-pad to multiple of 16 */ + *end = 0; +} +EXPORT_SYMBOL(cxgb4_write_sgl); + +/* This function copies 64 byte coalesced work request to + * memory mapped BAR2 space. For coalesced WR SGE fetches + * data from the FIFO instead of from Host. + */ +static void cxgb_pio_copy(u64 __iomem *dst, u64 *src) +{ + int count = 8; + + while (count) { + writeq(*src, dst); + src++; + dst++; + count--; + } +} + +/** + * cxgb4_ring_tx_db - check and potentially ring a Tx queue's doorbell + * @adap: the adapter + * @q: the Tx queue + * @n: number of new descriptors to give to HW + * + * Ring the doorbel for a Tx queue. + */ +inline void cxgb4_ring_tx_db(struct adapter *adap, struct sge_txq *q, int n) +{ + /* Make sure that all writes to the TX Descriptors are committed + * before we tell the hardware about them. + */ + wmb(); + + /* If we don't have access to the new User Doorbell (T5+), use the old + * doorbell mechanism; otherwise use the new BAR2 mechanism. + */ + if (unlikely(q->bar2_addr == NULL)) { + u32 val = PIDX_V(n); + unsigned long flags; + + /* For T4 we need to participate in the Doorbell Recovery + * mechanism. + */ + spin_lock_irqsave(&q->db_lock, flags); + if (!q->db_disabled) + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL_A), + QID_V(q->cntxt_id) | val); + else + q->db_pidx_inc += n; + q->db_pidx = q->pidx; + spin_unlock_irqrestore(&q->db_lock, flags); + } else { + u32 val = PIDX_T5_V(n); + + /* T4 and later chips share the same PIDX field offset within + * the doorbell, but T5 and later shrank the field in order to + * gain a bit for Doorbell Priority. The field was absurdly + * large in the first place (14 bits) so we just use the T5 + * and later limits and warn if a Queue ID is too large. + */ + WARN_ON(val & DBPRIO_F); + + /* If we're only writing a single TX Descriptor and we can use + * Inferred QID registers, we can use the Write Combining + * Gather Buffer; otherwise we use the simple doorbell. + */ + if (n == 1 && q->bar2_qid == 0) { + int index = (q->pidx + ? (q->pidx - 1) + : (q->size - 1)); + u64 *wr = (u64 *)&q->desc[index]; + + cxgb_pio_copy((u64 __iomem *) + (q->bar2_addr + SGE_UDB_WCDOORBELL), + wr); + } else { + writel(val | QID_V(q->bar2_qid), + q->bar2_addr + SGE_UDB_KDOORBELL); + } + + /* This Write Memory Barrier will force the write to the User + * Doorbell area to be flushed. This is needed to prevent + * writes on different CPUs for the same queue from hitting + * the adapter out of order. This is required when some Work + * Requests take the Write Combine Gather Buffer path (user + * doorbell area offset [SGE_UDB_WCDOORBELL..+63]) and some + * take the traditional path where we simply increment the + * PIDX (User Doorbell area SGE_UDB_KDOORBELL) and have the + * hardware DMA read the actual Work Request. 
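+ *
+ * Note that the Write Combining path above is only taken for a single
+ * descriptor with bar2_qid == 0; every other BAR2 write goes through
+ * the plain SGE_UDB_KDOORBELL offset, and both paths end with this
+ * barrier.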
+ */ + wmb(); + } +} +EXPORT_SYMBOL(cxgb4_ring_tx_db); + +/** + * cxgb4_inline_tx_skb - inline a packet's data into Tx descriptors + * @skb: the packet + * @q: the Tx queue where the packet will be inlined + * @pos: starting position in the Tx queue where to inline the packet + * + * Inline a packet's contents directly into Tx descriptors, starting at + * the given position within the Tx DMA ring. + * Most of the complexity of this operation is dealing with wrap arounds + * in the middle of the packet we want to inline. + */ +void cxgb4_inline_tx_skb(const struct sk_buff *skb, + const struct sge_txq *q, void *pos) +{ + int left = (void *)q->stat - pos; + u64 *p; + + if (likely(skb->len <= left)) { + if (likely(!skb->data_len)) + skb_copy_from_linear_data(skb, pos, skb->len); + else + skb_copy_bits(skb, 0, pos, skb->len); + pos += skb->len; + } else { + skb_copy_bits(skb, 0, pos, left); + skb_copy_bits(skb, left, q->desc, skb->len - left); + pos = (void *)q->desc + (skb->len - left); + } + + /* 0-pad to multiple of 16 */ + p = PTR_ALIGN(pos, 8); + if ((uintptr_t)p & 8) + *p = 0; +} +EXPORT_SYMBOL(cxgb4_inline_tx_skb); + +static void *inline_tx_skb_header(const struct sk_buff *skb, + const struct sge_txq *q, void *pos, + int length) +{ + u64 *p; + int left = (void *)q->stat - pos; + + if (likely(length <= left)) { + memcpy(pos, skb->data, length); + pos += length; + } else { + memcpy(pos, skb->data, left); + memcpy(q->desc, skb->data + left, length - left); + pos = (void *)q->desc + (length - left); + } + /* 0-pad to multiple of 16 */ + p = PTR_ALIGN(pos, 8); + if ((uintptr_t)p & 8) { + *p = 0; + return p + 1; + } + return p; +} + +/* + * Figure out what HW csum a packet wants and return the appropriate control + * bits. + */ +static u64 hwcsum(enum chip_type chip, const struct sk_buff *skb) +{ + int csum_type; + const struct iphdr *iph = ip_hdr(skb); + + if (iph->version == 4) { + if (iph->protocol == IPPROTO_TCP) + csum_type = TX_CSUM_TCPIP; + else if (iph->protocol == IPPROTO_UDP) + csum_type = TX_CSUM_UDPIP; + else { +nocsum: /* + * unknown protocol, disable HW csum + * and hope a bad packet is detected + */ + return TXPKT_L4CSUM_DIS_F; + } + } else { + /* + * this doesn't work with extension headers + */ + const struct ipv6hdr *ip6h = (const struct ipv6hdr *)iph; + + if (ip6h->nexthdr == IPPROTO_TCP) + csum_type = TX_CSUM_TCPIP6; + else if (ip6h->nexthdr == IPPROTO_UDP) + csum_type = TX_CSUM_UDPIP6; + else + goto nocsum; + } + + if (likely(csum_type >= TX_CSUM_TCPIP)) { + u64 hdr_len = TXPKT_IPHDR_LEN_V(skb_network_header_len(skb)); + int eth_hdr_len = skb_network_offset(skb) - ETH_HLEN; + + if (CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5) + hdr_len |= TXPKT_ETHHDR_LEN_V(eth_hdr_len); + else + hdr_len |= T6_TXPKT_ETHHDR_LEN_V(eth_hdr_len); + return TXPKT_CSUM_TYPE_V(csum_type) | hdr_len; + } else { + int start = skb_transport_offset(skb); + + return TXPKT_CSUM_TYPE_V(csum_type) | + TXPKT_CSUM_START_V(start) | + TXPKT_CSUM_LOC_V(start + skb->csum_offset); + } +} + +static void eth_txq_stop(struct sge_eth_txq *q) +{ + netif_tx_stop_queue(q->txq); + q->q.stops++; +} + +static inline void txq_advance(struct sge_txq *q, unsigned int n) +{ + q->in_use += n; + q->pidx += n; + if (q->pidx >= q->size) + q->pidx -= q->size; +} + +#ifdef CONFIG_CHELSIO_T4_FCOE +static inline int +cxgb_fcoe_offload(struct sk_buff *skb, struct adapter *adap, + const struct port_info *pi, u64 *cntrl) +{ + const struct cxgb_fcoe *fcoe = &pi->fcoe; + + if (!(fcoe->flags & CXGB_FCOE_ENABLED)) + return 0; + + if 
(skb->protocol != htons(ETH_P_FCOE)) + return 0; + + skb_reset_mac_header(skb); + skb->mac_len = sizeof(struct ethhdr); + + skb_set_network_header(skb, skb->mac_len); + skb_set_transport_header(skb, skb->mac_len + sizeof(struct fcoe_hdr)); + + if (!cxgb_fcoe_sof_eof_supported(adap, skb)) + return -ENOTSUPP; + + /* FC CRC offload */ + *cntrl = TXPKT_CSUM_TYPE_V(TX_CSUM_FCOE) | + TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F | + TXPKT_CSUM_START_V(CXGB_FCOE_TXPKT_CSUM_START) | + TXPKT_CSUM_END_V(CXGB_FCOE_TXPKT_CSUM_END) | + TXPKT_CSUM_LOC_V(CXGB_FCOE_TXPKT_CSUM_END); + return 0; +} +#endif /* CONFIG_CHELSIO_T4_FCOE */ + +/* Returns tunnel type if hardware supports offloading of the same. + * It is called only for T5 and onwards. + */ +enum cpl_tx_tnl_lso_type cxgb_encap_offload_supported(struct sk_buff *skb) +{ + u8 l4_hdr = 0; + enum cpl_tx_tnl_lso_type tnl_type = TX_TNL_TYPE_OPAQUE; + struct port_info *pi = netdev_priv(skb->dev); + struct adapter *adapter = pi->adapter; + + if (skb->inner_protocol_type != ENCAP_TYPE_ETHER || + skb->inner_protocol != htons(ETH_P_TEB)) + return tnl_type; + + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IP): + l4_hdr = ip_hdr(skb)->protocol; + break; + case htons(ETH_P_IPV6): + l4_hdr = ipv6_hdr(skb)->nexthdr; + break; + default: + return tnl_type; + } + + switch (l4_hdr) { + case IPPROTO_UDP: + if (adapter->vxlan_port == udp_hdr(skb)->dest) + tnl_type = TX_TNL_TYPE_VXLAN; + else if (adapter->geneve_port == udp_hdr(skb)->dest) + tnl_type = TX_TNL_TYPE_GENEVE; + break; + default: + return tnl_type; + } + + return tnl_type; +} + +static inline void t6_fill_tnl_lso(struct sk_buff *skb, + struct cpl_tx_tnl_lso *tnl_lso, + enum cpl_tx_tnl_lso_type tnl_type) +{ + u32 val; + int in_eth_xtra_len; + int l3hdr_len = skb_network_header_len(skb); + int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN; + const struct skb_shared_info *ssi = skb_shinfo(skb); + bool v6 = (ip_hdr(skb)->version == 6); + + val = CPL_TX_TNL_LSO_OPCODE_V(CPL_TX_TNL_LSO) | + CPL_TX_TNL_LSO_FIRST_F | + CPL_TX_TNL_LSO_LAST_F | + (v6 ? CPL_TX_TNL_LSO_IPV6OUT_F : 0) | + CPL_TX_TNL_LSO_ETHHDRLENOUT_V(eth_xtra_len / 4) | + CPL_TX_TNL_LSO_IPHDRLENOUT_V(l3hdr_len / 4) | + (v6 ? 0 : CPL_TX_TNL_LSO_IPHDRCHKOUT_F) | + CPL_TX_TNL_LSO_IPLENSETOUT_F | + (v6 ? 
0 : CPL_TX_TNL_LSO_IPIDINCOUT_F); + tnl_lso->op_to_IpIdSplitOut = htonl(val); + + tnl_lso->IpIdOffsetOut = 0; + + /* Get the tunnel header length */ + val = skb_inner_mac_header(skb) - skb_mac_header(skb); + in_eth_xtra_len = skb_inner_network_header(skb) - + skb_inner_mac_header(skb) - ETH_HLEN; + + switch (tnl_type) { + case TX_TNL_TYPE_VXLAN: + case TX_TNL_TYPE_GENEVE: + tnl_lso->UdpLenSetOut_to_TnlHdrLen = + htons(CPL_TX_TNL_LSO_UDPCHKCLROUT_F | + CPL_TX_TNL_LSO_UDPLENSETOUT_F); + break; + default: + tnl_lso->UdpLenSetOut_to_TnlHdrLen = 0; + break; + } + + tnl_lso->UdpLenSetOut_to_TnlHdrLen |= + htons(CPL_TX_TNL_LSO_TNLHDRLEN_V(val) | + CPL_TX_TNL_LSO_TNLTYPE_V(tnl_type)); + + tnl_lso->r1 = 0; + + val = CPL_TX_TNL_LSO_ETHHDRLEN_V(in_eth_xtra_len / 4) | + CPL_TX_TNL_LSO_IPV6_V(inner_ip_hdr(skb)->version == 6) | + CPL_TX_TNL_LSO_IPHDRLEN_V(skb_inner_network_header_len(skb) / 4) | + CPL_TX_TNL_LSO_TCPHDRLEN_V(inner_tcp_hdrlen(skb) / 4); + tnl_lso->Flow_to_TcpHdrLen = htonl(val); + + tnl_lso->IpIdOffset = htons(0); + + tnl_lso->IpIdSplit_to_Mss = htons(CPL_TX_TNL_LSO_MSS_V(ssi->gso_size)); + tnl_lso->TCPSeqOffset = htonl(0); + tnl_lso->EthLenOffset_Size = htonl(CPL_TX_TNL_LSO_SIZE_V(skb->len)); +} + +/** + * t4_eth_xmit - add a packet to an Ethernet Tx queue + * @skb: the packet + * @dev: the egress net device + * + * Add a packet to an SGE Ethernet Tx queue. Runs with softirqs disabled. + */ +netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + u32 wr_mid, ctrl0, op; + u64 cntrl, *end; + int qidx, credits; + unsigned int flits, ndesc; + struct adapter *adap; + struct sge_eth_txq *q; + const struct port_info *pi; + struct fw_eth_tx_pkt_wr *wr; + struct cpl_tx_pkt_core *cpl; + const struct skb_shared_info *ssi; + dma_addr_t addr[MAX_SKB_FRAGS + 1]; + bool immediate = false; + int len, max_pkt_len; + bool ptp_enabled = is_ptp_enabled(skb, dev); + unsigned int chip_ver; + enum cpl_tx_tnl_lso_type tnl_type = TX_TNL_TYPE_OPAQUE; + +#ifdef CONFIG_CHELSIO_T4_FCOE + int err; +#endif /* CONFIG_CHELSIO_T4_FCOE */ + + /* + * The chip min packet length is 10 octets but play safe and reject + * anything shorter than an Ethernet header. 
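+ *
+ * Undersized packets are simply freed and reported as sent
+ * (NETDEV_TX_OK) rather than returned to the stack.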
+ */ + if (unlikely(skb->len < ETH_HLEN)) { +out_free: dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + + /* Discard the packet if the length is greater than mtu */ + max_pkt_len = ETH_HLEN + dev->mtu; + if (skb_vlan_tagged(skb)) + max_pkt_len += VLAN_HLEN; + if (!skb_shinfo(skb)->gso_size && (unlikely(skb->len > max_pkt_len))) + goto out_free; + + pi = netdev_priv(dev); + adap = pi->adapter; + ssi = skb_shinfo(skb); +#ifdef CONFIG_CHELSIO_IPSEC_INLINE + if (xfrm_offload(skb) && !ssi->gso_size) + return adap->uld[CXGB4_ULD_CRYPTO].tx_handler(skb, dev); +#endif /* CHELSIO_IPSEC_INLINE */ + + qidx = skb_get_queue_mapping(skb); + if (ptp_enabled) { + spin_lock(&adap->ptp_lock); + if (!(adap->ptp_tx_skb)) { + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + adap->ptp_tx_skb = skb_get(skb); + } else { + spin_unlock(&adap->ptp_lock); + goto out_free; + } + q = &adap->sge.ptptxq; + } else { + q = &adap->sge.ethtxq[qidx + pi->first_qset]; + } + skb_tx_timestamp(skb); + + cxgb4_reclaim_completed_tx(adap, &q->q, true); + cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F; + +#ifdef CONFIG_CHELSIO_T4_FCOE + err = cxgb_fcoe_offload(skb, adap, pi, &cntrl); + if (unlikely(err == -ENOTSUPP)) { + if (ptp_enabled) + spin_unlock(&adap->ptp_lock); + goto out_free; + } +#endif /* CONFIG_CHELSIO_T4_FCOE */ + + chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip); + flits = calc_tx_flits(skb, chip_ver); + ndesc = flits_to_desc(flits); + credits = txq_avail(&q->q) - ndesc; + + if (unlikely(credits < 0)) { + eth_txq_stop(q); + dev_err(adap->pdev_dev, + "%s: Tx ring %u full while queue awake!\n", + dev->name, qidx); + if (ptp_enabled) + spin_unlock(&adap->ptp_lock); + return NETDEV_TX_BUSY; + } + + if (is_eth_imm(skb, chip_ver)) + immediate = true; + + if (skb->encapsulation && chip_ver > CHELSIO_T5) + tnl_type = cxgb_encap_offload_supported(skb); + + if (!immediate && + unlikely(cxgb4_map_skb(adap->pdev_dev, skb, addr) < 0)) { + q->mapping_err++; + if (ptp_enabled) + spin_unlock(&adap->ptp_lock); + goto out_free; + } + + wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2)); + if (unlikely(credits < ETHTXQ_STOP_THRES)) { + eth_txq_stop(q); + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + } + + wr = (void *)&q->q.desc[q->q.pidx]; + wr->equiq_to_len16 = htonl(wr_mid); + wr->r3 = cpu_to_be64(0); + end = (u64 *)wr + flits; + + len = immediate ? skb->len : 0; + if (ssi->gso_size) { + struct cpl_tx_pkt_lso *lso = (void *)wr; + bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0; + int l3hdr_len = skb_network_header_len(skb); + int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN; + struct cpl_tx_tnl_lso *tnl_lso = (void *)(wr + 1); + + if (tnl_type) + len += sizeof(*tnl_lso); + else + len += sizeof(*lso); + + wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) | + FW_WR_IMMDLEN_V(len)); + if (tnl_type) { + struct iphdr *iph = ip_hdr(skb); + + t6_fill_tnl_lso(skb, tnl_lso, tnl_type); + cpl = (void *)(tnl_lso + 1); + /* Driver is expected to compute partial checksum that + * does not include the IP Total Length. 
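+ *
+ * For IPv4 this is done below by clearing both tot_len and check and
+ * then recomputing the header checksum with ip_fast_csum(), leaving the
+ * hardware to fold in the per-segment Total Length.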
+ */ + if (iph->version == 4) { + iph->check = 0; + iph->tot_len = 0; + iph->check = (u16)(~ip_fast_csum((u8 *)iph, + iph->ihl)); + } + if (skb->ip_summed == CHECKSUM_PARTIAL) + cntrl = hwcsum(adap->params.chip, skb); + } else { + lso->c.lso_ctrl = htonl(LSO_OPCODE_V(CPL_TX_PKT_LSO) | + LSO_FIRST_SLICE_F | LSO_LAST_SLICE_F | + LSO_IPV6_V(v6) | + LSO_ETHHDR_LEN_V(eth_xtra_len / 4) | + LSO_IPHDR_LEN_V(l3hdr_len / 4) | + LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff)); + lso->c.ipid_ofst = htons(0); + lso->c.mss = htons(ssi->gso_size); + lso->c.seqno_offset = htonl(0); + if (is_t4(adap->params.chip)) + lso->c.len = htonl(skb->len); + else + lso->c.len = + htonl(LSO_T5_XFER_SIZE_V(skb->len)); + cpl = (void *)(lso + 1); + + if (CHELSIO_CHIP_VERSION(adap->params.chip) + <= CHELSIO_T5) + cntrl = TXPKT_ETHHDR_LEN_V(eth_xtra_len); + else + cntrl = T6_TXPKT_ETHHDR_LEN_V(eth_xtra_len); + + cntrl |= TXPKT_CSUM_TYPE_V(v6 ? + TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) | + TXPKT_IPHDR_LEN_V(l3hdr_len); + } + q->tso++; + q->tx_cso += ssi->gso_segs; + } else { + len += sizeof(*cpl); + if (ptp_enabled) + op = FW_PTP_TX_PKT_WR; + else + op = FW_ETH_TX_PKT_WR; + wr->op_immdlen = htonl(FW_WR_OP_V(op) | + FW_WR_IMMDLEN_V(len)); + cpl = (void *)(wr + 1); + if (skb->ip_summed == CHECKSUM_PARTIAL) { + cntrl = hwcsum(adap->params.chip, skb) | + TXPKT_IPCSUM_DIS_F; + q->tx_cso++; + } + } + + if (skb_vlan_tag_present(skb)) { + q->vlan_ins++; + cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb)); +#ifdef CONFIG_CHELSIO_T4_FCOE + if (skb->protocol == htons(ETH_P_FCOE)) + cntrl |= TXPKT_VLAN_V( + ((skb->priority & 0x7) << VLAN_PRIO_SHIFT)); +#endif /* CONFIG_CHELSIO_T4_FCOE */ + } + + ctrl0 = TXPKT_OPCODE_V(CPL_TX_PKT_XT) | TXPKT_INTF_V(pi->tx_chan) | + TXPKT_PF_V(adap->pf); + if (ptp_enabled) + ctrl0 |= TXPKT_TSTAMP_F; +#ifdef CONFIG_CHELSIO_T4_DCB + if (is_t4(adap->params.chip)) + ctrl0 |= TXPKT_OVLAN_IDX_V(q->dcb_prio); + else + ctrl0 |= TXPKT_T5_OVLAN_IDX_V(q->dcb_prio); +#endif + cpl->ctrl0 = htonl(ctrl0); + cpl->pack = htons(0); + cpl->len = htons(skb->len); + cpl->ctrl1 = cpu_to_be64(cntrl); + + if (immediate) { + cxgb4_inline_tx_skb(skb, &q->q, cpl + 1); + dev_consume_skb_any(skb); + } else { + int last_desc; + + cxgb4_write_sgl(skb, &q->q, (struct ulptx_sgl *)(cpl + 1), + end, 0, addr); + skb_orphan(skb); + + last_desc = q->q.pidx + ndesc - 1; + if (last_desc >= q->q.size) + last_desc -= q->q.size; + q->q.sdesc[last_desc].skb = skb; + q->q.sdesc[last_desc].sgl = (struct ulptx_sgl *)(cpl + 1); + } + + txq_advance(&q->q, ndesc); + + cxgb4_ring_tx_db(adap, &q->q, ndesc); + if (ptp_enabled) + spin_unlock(&adap->ptp_lock); + return NETDEV_TX_OK; +} + +/** + * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs + * @q: the SGE control Tx queue + * + * This is a variant of cxgb4_reclaim_completed_tx() that is used + * for Tx queues that send only immediate data (presently just + * the control queues) and thus do not have any sk_buffs to release. + */ +static inline void reclaim_completed_tx_imm(struct sge_txq *q) +{ + int hw_cidx = ntohs(READ_ONCE(q->stat->cidx)); + int reclaim = hw_cidx - q->cidx; + + if (reclaim < 0) + reclaim += q->size; + + q->in_use -= reclaim; + q->cidx = hw_cidx; +} + +/** + * is_imm - check whether a packet can be sent as immediate data + * @skb: the packet + * + * Returns true if a packet can be sent as a WR with immediate data. 
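+ *
+ * For the control queues this simply means the whole packet, headers
+ * included, fits in MAX_CTRL_WR_LEN (i.e. SGE_MAX_WR_LEN) bytes.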
+ */ +static inline int is_imm(const struct sk_buff *skb) +{ + return skb->len <= MAX_CTRL_WR_LEN; +} + +/** + * ctrlq_check_stop - check if a control queue is full and should stop + * @q: the queue + * @wr: most recent WR written to the queue + * + * Check if a control queue has become full and should be stopped. + * We clean up control queue descriptors very lazily, only when we are out. + * If the queue is still full after reclaiming any completed descriptors + * we suspend it and have the last WR wake it up. + */ +static void ctrlq_check_stop(struct sge_ctrl_txq *q, struct fw_wr_hdr *wr) +{ + reclaim_completed_tx_imm(&q->q); + if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) { + wr->lo |= htonl(FW_WR_EQUEQ_F | FW_WR_EQUIQ_F); + q->q.stops++; + q->full = 1; + } +} + +/** + * ctrl_xmit - send a packet through an SGE control Tx queue + * @q: the control queue + * @skb: the packet + * + * Send a packet through an SGE control Tx queue. Packets sent through + * a control queue must fit entirely as immediate data. + */ +static int ctrl_xmit(struct sge_ctrl_txq *q, struct sk_buff *skb) +{ + unsigned int ndesc; + struct fw_wr_hdr *wr; + + if (unlikely(!is_imm(skb))) { + WARN_ON(1); + dev_kfree_skb(skb); + return NET_XMIT_DROP; + } + + ndesc = DIV_ROUND_UP(skb->len, sizeof(struct tx_desc)); + spin_lock(&q->sendq.lock); + + if (unlikely(q->full)) { + skb->priority = ndesc; /* save for restart */ + __skb_queue_tail(&q->sendq, skb); + spin_unlock(&q->sendq.lock); + return NET_XMIT_CN; + } + + wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx]; + cxgb4_inline_tx_skb(skb, &q->q, wr); + + txq_advance(&q->q, ndesc); + if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) + ctrlq_check_stop(q, wr); + + cxgb4_ring_tx_db(q->adap, &q->q, ndesc); + spin_unlock(&q->sendq.lock); + + kfree_skb(skb); + return NET_XMIT_SUCCESS; +} + +/** + * restart_ctrlq - restart a suspended control queue + * @data: the control queue to restart + * + * Resumes transmission on a suspended Tx control queue. + */ +static void restart_ctrlq(unsigned long data) +{ + struct sk_buff *skb; + unsigned int written = 0; + struct sge_ctrl_txq *q = (struct sge_ctrl_txq *)data; + + spin_lock(&q->sendq.lock); + reclaim_completed_tx_imm(&q->q); + BUG_ON(txq_avail(&q->q) < TXQ_STOP_THRES); /* q should be empty */ + + while ((skb = __skb_dequeue(&q->sendq)) != NULL) { + struct fw_wr_hdr *wr; + unsigned int ndesc = skb->priority; /* previously saved */ + + written += ndesc; + /* Write descriptors and free skbs outside the lock to limit + * wait times. q->full is still set so new skbs will be queued. + */ + wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx]; + txq_advance(&q->q, ndesc); + spin_unlock(&q->sendq.lock); + + cxgb4_inline_tx_skb(skb, &q->q, wr); + kfree_skb(skb); + + if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) { + unsigned long old = q->q.stops; + + ctrlq_check_stop(q, wr); + if (q->q.stops != old) { /* suspended anew */ + spin_lock(&q->sendq.lock); + goto ringdb; + } + } + if (written > 16) { + cxgb4_ring_tx_db(q->adap, &q->q, written); + written = 0; + } + spin_lock(&q->sendq.lock); + } + q->full = 0; +ringdb: + if (written) + cxgb4_ring_tx_db(q->adap, &q->q, written); + spin_unlock(&q->sendq.lock); +} + +/** + * t4_mgmt_tx - send a management message + * @adap: the adapter + * @skb: the packet containing the management message + * + * Send a management message through control queue 0. 
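+ * Bottom halves are disabled around ctrl_xmit() because the control
+ * queue's sendq lock is also taken from the queue-restart tasklet.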
+ */ +int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb) +{ + int ret; + + local_bh_disable(); + ret = ctrl_xmit(&adap->sge.ctrlq[0], skb); + local_bh_enable(); + return ret; +} + +/** + * is_ofld_imm - check whether a packet can be sent as immediate data + * @skb: the packet + * + * Returns true if a packet can be sent as an offload WR with immediate + * data. We currently use the same limit as for Ethernet packets. + */ +static inline int is_ofld_imm(const struct sk_buff *skb) +{ + struct work_request_hdr *req = (struct work_request_hdr *)skb->data; + unsigned long opcode = FW_WR_OP_G(ntohl(req->wr_hi)); + + if (opcode == FW_CRYPTO_LOOKASIDE_WR) + return skb->len <= SGE_MAX_WR_LEN; + else + return skb->len <= MAX_IMM_TX_PKT_LEN; +} + +/** + * calc_tx_flits_ofld - calculate # of flits for an offload packet + * @skb: the packet + * + * Returns the number of flits needed for the given offload packet. + * These packets are already fully constructed and no additional headers + * will be added. + */ +static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb) +{ + unsigned int flits, cnt; + + if (is_ofld_imm(skb)) + return DIV_ROUND_UP(skb->len, 8); + + flits = skb_transport_offset(skb) / 8U; /* headers */ + cnt = skb_shinfo(skb)->nr_frags; + if (skb_tail_pointer(skb) != skb_transport_header(skb)) + cnt++; + return flits + sgl_len(cnt); +} + +/** + * txq_stop_maperr - stop a Tx queue due to I/O MMU exhaustion + * @adap: the adapter + * @q: the queue to stop + * + * Mark a Tx queue stopped due to I/O MMU exhaustion and resulting + * inability to map packets. A periodic timer attempts to restart + * queues so marked. + */ +static void txq_stop_maperr(struct sge_uld_txq *q) +{ + q->mapping_err++; + q->q.stops++; + set_bit(q->q.cntxt_id - q->adap->sge.egr_start, + q->adap->sge.txq_maperr); +} + +/** + * ofldtxq_stop - stop an offload Tx queue that has become full + * @q: the queue to stop + * @wr: the Work Request causing the queue to become full + * + * Stops an offload Tx queue that has become full and modifies the packet + * being written to request a wakeup. + */ +static void ofldtxq_stop(struct sge_uld_txq *q, struct fw_wr_hdr *wr) +{ + wr->lo |= htonl(FW_WR_EQUEQ_F | FW_WR_EQUIQ_F); + q->q.stops++; + q->full = 1; +} + +/** + * service_ofldq - service/restart a suspended offload queue + * @q: the offload queue + * + * Services an offload Tx queue by moving packets from its Pending Send + * Queue to the Hardware TX ring. The function starts and ends with the + * Send Queue locked, but drops the lock while putting the skb at the + * head of the Send Queue onto the Hardware TX Ring. Dropping the lock + * allows more skbs to be added to the Send Queue by other threads. + * The packet being processed at the head of the Pending Send Queue is + * left on the queue in case we experience DMA Mapping errors, etc. + * and need to give up and restart later. + * + * service_ofldq() can be thought of as a task which opportunistically + * uses other threads execution contexts. We use the Offload Queue + * boolean "service_ofldq_running" to make sure that only one instance + * is ever running at a time ... + */ +static void service_ofldq(struct sge_uld_txq *q) +{ + u64 *pos, *before, *end; + int credits; + struct sk_buff *skb; + struct sge_txq *txq; + unsigned int left; + unsigned int written = 0; + unsigned int flits, ndesc; + + /* If another thread is currently in service_ofldq() processing the + * Pending Send Queue then there's nothing to do. 
Otherwise, flag + * that we're doing the work and continue. Examining/modifying + * the Offload Queue boolean "service_ofldq_running" must be done + * while holding the Pending Send Queue Lock. + */ + if (q->service_ofldq_running) + return; + q->service_ofldq_running = true; + + while ((skb = skb_peek(&q->sendq)) != NULL && !q->full) { + /* We drop the lock while we're working with the skb at the + * head of the Pending Send Queue. This allows more skbs to + * be added to the Pending Send Queue while we're working on + * this one. We don't need to lock to guard the TX Ring + * updates because only one thread of execution is ever + * allowed into service_ofldq() at a time. + */ + spin_unlock(&q->sendq.lock); + + cxgb4_reclaim_completed_tx(q->adap, &q->q, false); + + flits = skb->priority; /* previously saved */ + ndesc = flits_to_desc(flits); + credits = txq_avail(&q->q) - ndesc; + BUG_ON(credits < 0); + if (unlikely(credits < TXQ_STOP_THRES)) + ofldtxq_stop(q, (struct fw_wr_hdr *)skb->data); + + pos = (u64 *)&q->q.desc[q->q.pidx]; + if (is_ofld_imm(skb)) + cxgb4_inline_tx_skb(skb, &q->q, pos); + else if (cxgb4_map_skb(q->adap->pdev_dev, skb, + (dma_addr_t *)skb->head)) { + txq_stop_maperr(q); + spin_lock(&q->sendq.lock); + break; + } else { + int last_desc, hdr_len = skb_transport_offset(skb); + + /* The WR headers may not fit within one descriptor. + * So we need to deal with wrap-around here. + */ + before = (u64 *)pos; + end = (u64 *)pos + flits; + txq = &q->q; + pos = (void *)inline_tx_skb_header(skb, &q->q, + (void *)pos, + hdr_len); + if (before > (u64 *)pos) { + left = (u8 *)end - (u8 *)txq->stat; + end = (void *)txq->desc + left; + } + + /* If current position is already at the end of the + * ofld queue, reset the current to point to + * start of the queue and update the end ptr as well. + */ + if (pos == (u64 *)txq->stat) { + left = (u8 *)end - (u8 *)txq->stat; + end = (void *)txq->desc + left; + pos = (void *)txq->desc; + } + + cxgb4_write_sgl(skb, &q->q, (void *)pos, + end, hdr_len, + (dma_addr_t *)skb->head); +#ifdef CONFIG_NEED_DMA_MAP_STATE + skb->dev = q->adap->port[0]; + skb->destructor = deferred_unmap_destructor; +#endif + last_desc = q->q.pidx + ndesc - 1; + if (last_desc >= q->q.size) + last_desc -= q->q.size; + q->q.sdesc[last_desc].skb = skb; + } + + txq_advance(&q->q, ndesc); + written += ndesc; + if (unlikely(written > 32)) { + cxgb4_ring_tx_db(q->adap, &q->q, written); + written = 0; + } + + /* Reacquire the Pending Send Queue Lock so we can unlink the + * skb we've just successfully transferred to the TX Ring and + * loop for the next skb which may be at the head of the + * Pending Send Queue. + */ + spin_lock(&q->sendq.lock); + __skb_unlink(skb, &q->sendq); + if (is_ofld_imm(skb)) + kfree_skb(skb); + } + if (likely(written)) + cxgb4_ring_tx_db(q->adap, &q->q, written); + + /*Indicate that no thread is processing the Pending Send Queue + * currently. + */ + q->service_ofldq_running = false; +} + +/** + * ofld_xmit - send a packet through an offload queue + * @q: the Tx offload queue + * @skb: the packet + * + * Send an offload packet through an SGE offload queue. + */ +static int ofld_xmit(struct sge_uld_txq *q, struct sk_buff *skb) +{ + skb->priority = calc_tx_flits_ofld(skb); /* save for restart */ + spin_lock(&q->sendq.lock); + + /* Queue the new skb onto the Offload Queue's Pending Send Queue. If + * that results in this new skb being the only one on the queue, start + * servicing it. 
If there are other skbs already on the list, then + * either the queue is currently being processed or it's been stopped + * for some reason and it'll be restarted at a later time. Restart + * paths are triggered by events like experiencing a DMA Mapping Error + * or filling the Hardware TX Ring. + */ + __skb_queue_tail(&q->sendq, skb); + if (q->sendq.qlen == 1) + service_ofldq(q); + + spin_unlock(&q->sendq.lock); + return NET_XMIT_SUCCESS; +} + +/** + * restart_ofldq - restart a suspended offload queue + * @data: the offload queue to restart + * + * Resumes transmission on a suspended Tx offload queue. + */ +static void restart_ofldq(unsigned long data) +{ + struct sge_uld_txq *q = (struct sge_uld_txq *)data; + + spin_lock(&q->sendq.lock); + q->full = 0; /* the queue actually is completely empty now */ + service_ofldq(q); + spin_unlock(&q->sendq.lock); +} + +/** + * skb_txq - return the Tx queue an offload packet should use + * @skb: the packet + * + * Returns the Tx queue an offload packet should use as indicated by bits + * 1-15 in the packet's queue_mapping. + */ +static inline unsigned int skb_txq(const struct sk_buff *skb) +{ + return skb->queue_mapping >> 1; +} + +/** + * is_ctrl_pkt - return whether an offload packet is a control packet + * @skb: the packet + * + * Returns whether an offload packet should use an OFLD or a CTRL + * Tx queue as indicated by bit 0 in the packet's queue_mapping. + */ +static inline unsigned int is_ctrl_pkt(const struct sk_buff *skb) +{ + return skb->queue_mapping & 1; +} + +static inline int uld_send(struct adapter *adap, struct sk_buff *skb, + unsigned int tx_uld_type) +{ + struct sge_uld_txq_info *txq_info; + struct sge_uld_txq *txq; + unsigned int idx = skb_txq(skb); + + if (unlikely(is_ctrl_pkt(skb))) { + /* Single ctrl queue is a requirement for LE workaround path */ + if (adap->tids.nsftids) + idx = 0; + return ctrl_xmit(&adap->sge.ctrlq[idx], skb); + } + + txq_info = adap->sge.uld_txq_info[tx_uld_type]; + if (unlikely(!txq_info)) { + WARN_ON(true); + return NET_XMIT_DROP; + } + + txq = &txq_info->uldtxq[idx]; + return ofld_xmit(txq, skb); +} + +/** + * t4_ofld_send - send an offload packet + * @adap: the adapter + * @skb: the packet + * + * Sends an offload packet. We use the packet queue_mapping to select the + * appropriate Tx queue as follows: bit 0 indicates whether the packet + * should be sent as regular or control, bits 1-15 select the queue. + */ +int t4_ofld_send(struct adapter *adap, struct sk_buff *skb) +{ + int ret; + + local_bh_disable(); + ret = uld_send(adap, skb, CXGB4_TX_OFLD); + local_bh_enable(); + return ret; +} + +/** + * cxgb4_ofld_send - send an offload packet + * @dev: the net device + * @skb: the packet + * + * Sends an offload packet. This is an exported version of @t4_ofld_send, + * intended for ULDs. 
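+ * The adapter is looked up from @dev via netdev2adap(), so ULDs only
+ * need a reference to the net device.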
+ */ +int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb) +{ + return t4_ofld_send(netdev2adap(dev), skb); +} +EXPORT_SYMBOL(cxgb4_ofld_send); + +static void *inline_tx_header(const void *src, + const struct sge_txq *q, + void *pos, int length) +{ + int left = (void *)q->stat - pos; + u64 *p; + + if (likely(length <= left)) { + memcpy(pos, src, length); + pos += length; + } else { + memcpy(pos, src, left); + memcpy(q->desc, src + left, length - left); + pos = (void *)q->desc + (length - left); + } + /* 0-pad to multiple of 16 */ + p = PTR_ALIGN(pos, 8); + if ((uintptr_t)p & 8) { + *p = 0; + return p + 1; + } + return p; +} + +/** + * ofld_xmit_direct - copy a WR into offload queue + * @q: the Tx offload queue + * @src: location of WR + * @len: WR length + * + * Copy an immediate WR into an uncontended SGE offload queue. + */ +static int ofld_xmit_direct(struct sge_uld_txq *q, const void *src, + unsigned int len) +{ + unsigned int ndesc; + int credits; + u64 *pos; + + /* Use the lower limit as the cut-off */ + if (len > MAX_IMM_OFLD_TX_DATA_WR_LEN) { + WARN_ON(1); + return NET_XMIT_DROP; + } + + /* Don't return NET_XMIT_CN here as the current + * implementation doesn't queue the request + * using an skb when the following conditions not met + */ + if (!spin_trylock(&q->sendq.lock)) + return NET_XMIT_DROP; + + if (q->full || !skb_queue_empty(&q->sendq) || + q->service_ofldq_running) { + spin_unlock(&q->sendq.lock); + return NET_XMIT_DROP; + } + ndesc = flits_to_desc(DIV_ROUND_UP(len, 8)); + credits = txq_avail(&q->q) - ndesc; + pos = (u64 *)&q->q.desc[q->q.pidx]; + + /* ofldtxq_stop modifies WR header in-situ */ + inline_tx_header(src, &q->q, pos, len); + if (unlikely(credits < TXQ_STOP_THRES)) + ofldtxq_stop(q, (struct fw_wr_hdr *)pos); + txq_advance(&q->q, ndesc); + cxgb4_ring_tx_db(q->adap, &q->q, ndesc); + + spin_unlock(&q->sendq.lock); + return NET_XMIT_SUCCESS; +} + +int cxgb4_immdata_send(struct net_device *dev, unsigned int idx, + const void *src, unsigned int len) +{ + struct sge_uld_txq_info *txq_info; + struct sge_uld_txq *txq; + struct adapter *adap; + int ret; + + adap = netdev2adap(dev); + + local_bh_disable(); + txq_info = adap->sge.uld_txq_info[CXGB4_TX_OFLD]; + if (unlikely(!txq_info)) { + WARN_ON(true); + local_bh_enable(); + return NET_XMIT_DROP; + } + txq = &txq_info->uldtxq[idx]; + + ret = ofld_xmit_direct(txq, src, len); + local_bh_enable(); + return net_xmit_eval(ret); +} +EXPORT_SYMBOL(cxgb4_immdata_send); + +/** + * t4_crypto_send - send crypto packet + * @adap: the adapter + * @skb: the packet + * + * Sends crypto packet. We use the packet queue_mapping to select the + * appropriate Tx queue as follows: bit 0 indicates whether the packet + * should be sent as regular or control, bits 1-15 select the queue. + */ +static int t4_crypto_send(struct adapter *adap, struct sk_buff *skb) +{ + int ret; + + local_bh_disable(); + ret = uld_send(adap, skb, CXGB4_TX_CRYPTO); + local_bh_enable(); + return ret; +} + +/** + * cxgb4_crypto_send - send crypto packet + * @dev: the net device + * @skb: the packet + * + * Sends crypto packet. This is an exported version of @t4_crypto_send, + * intended for ULDs. 
+ */ +int cxgb4_crypto_send(struct net_device *dev, struct sk_buff *skb) +{ + return t4_crypto_send(netdev2adap(dev), skb); +} +EXPORT_SYMBOL(cxgb4_crypto_send); + +static inline void copy_frags(struct sk_buff *skb, + const struct pkt_gl *gl, unsigned int offset) +{ + int i; + + /* usually there's just one frag */ + __skb_fill_page_desc(skb, 0, gl->frags[0].page, + gl->frags[0].offset + offset, + gl->frags[0].size - offset); + skb_shinfo(skb)->nr_frags = gl->nfrags; + for (i = 1; i < gl->nfrags; i++) + __skb_fill_page_desc(skb, i, gl->frags[i].page, + gl->frags[i].offset, + gl->frags[i].size); + + /* get a reference to the last page, we don't own it */ + get_page(gl->frags[gl->nfrags - 1].page); +} + +/** + * cxgb4_pktgl_to_skb - build an sk_buff from a packet gather list + * @gl: the gather list + * @skb_len: size of sk_buff main body if it carries fragments + * @pull_len: amount of data to move to the sk_buff's main body + * + * Builds an sk_buff from the given packet gather list. Returns the + * sk_buff or %NULL if sk_buff allocation failed. + */ +struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl, + unsigned int skb_len, unsigned int pull_len) +{ + struct sk_buff *skb; + + /* + * Below we rely on RX_COPY_THRES being less than the smallest Rx buffer + * size, which is expected since buffers are at least PAGE_SIZEd. + * In this case packets up to RX_COPY_THRES have only one fragment. + */ + if (gl->tot_len <= RX_COPY_THRES) { + skb = dev_alloc_skb(gl->tot_len); + if (unlikely(!skb)) + goto out; + __skb_put(skb, gl->tot_len); + skb_copy_to_linear_data(skb, gl->va, gl->tot_len); + } else { + skb = dev_alloc_skb(skb_len); + if (unlikely(!skb)) + goto out; + __skb_put(skb, pull_len); + skb_copy_to_linear_data(skb, gl->va, pull_len); + + copy_frags(skb, gl, pull_len); + skb->len = gl->tot_len; + skb->data_len = skb->len - pull_len; + skb->truesize += skb->data_len; + } +out: return skb; +} +EXPORT_SYMBOL(cxgb4_pktgl_to_skb); + +/** + * t4_pktgl_free - free a packet gather list + * @gl: the gather list + * + * Releases the pages of a packet gather list. We do not own the last + * page on the list and do not free it. + */ +static void t4_pktgl_free(const struct pkt_gl *gl) +{ + int n; + const struct page_frag *p; + + for (p = gl->frags, n = gl->nfrags - 1; n--; p++) + put_page(p->page); +} + +/* + * Process an MPS trace packet. Give it an unused protocol number so it won't + * be delivered to anyone and send it to the stack for capture. 
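+ * The bogus protocol value (0xffff) keeps the stack's protocol
+ * handlers from consuming the frame, while packet capture on port 0
+ * can still observe it.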
+ */ +static noinline int handle_trace_pkt(struct adapter *adap, + const struct pkt_gl *gl) +{ + struct sk_buff *skb; + + skb = cxgb4_pktgl_to_skb(gl, RX_PULL_LEN, RX_PULL_LEN); + if (unlikely(!skb)) { + t4_pktgl_free(gl); + return 0; + } + + if (is_t4(adap->params.chip)) + __skb_pull(skb, sizeof(struct cpl_trace_pkt)); + else + __skb_pull(skb, sizeof(struct cpl_t5_trace_pkt)); + + skb_reset_mac_header(skb); + skb->protocol = htons(0xffff); + skb->dev = adap->port[0]; + netif_receive_skb(skb); + return 0; +} + +/** + * cxgb4_sgetim_to_hwtstamp - convert sge time stamp to hw time stamp + * @adap: the adapter + * @hwtstamps: time stamp structure to update + * @sgetstamp: 60bit iqe timestamp + * + * Every ingress queue entry has the 60-bit timestamp, convert that timestamp + * which is in Core Clock ticks into ktime_t and assign it + **/ +static void cxgb4_sgetim_to_hwtstamp(struct adapter *adap, + struct skb_shared_hwtstamps *hwtstamps, + u64 sgetstamp) +{ + u64 ns; + u64 tmp = (sgetstamp * 1000 * 1000 + adap->params.vpd.cclk / 2); + + ns = div_u64(tmp, adap->params.vpd.cclk); + + memset(hwtstamps, 0, sizeof(*hwtstamps)); + hwtstamps->hwtstamp = ns_to_ktime(ns); +} + +static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl, + const struct cpl_rx_pkt *pkt) +{ + struct adapter *adapter = rxq->rspq.adap; + struct sge *s = &adapter->sge; + struct port_info *pi; + int ret; + struct sk_buff *skb; + + skb = napi_get_frags(&rxq->rspq.napi); + if (unlikely(!skb)) { + t4_pktgl_free(gl); + rxq->stats.rx_drops++; + return; + } + + copy_frags(skb, gl, s->pktshift); + skb->len = gl->tot_len - s->pktshift; + skb->data_len = skb->len; + skb->truesize += skb->data_len; + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_record_rx_queue(skb, rxq->rspq.idx); + pi = netdev_priv(skb->dev); + if (pi->rxtstamp) + cxgb4_sgetim_to_hwtstamp(adapter, skb_hwtstamps(skb), + gl->sgetstamp); + if (rxq->rspq.netdev->features & NETIF_F_RXHASH) + skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val, + PKT_HASH_TYPE_L3); + + if (unlikely(pkt->vlan_ex)) { + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan)); + rxq->stats.vlan_ex++; + } + ret = napi_gro_frags(&rxq->rspq.napi); + if (ret == GRO_HELD) + rxq->stats.lro_pkts++; + else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE) + rxq->stats.lro_merged++; + rxq->stats.pkts++; + rxq->stats.rx_cso++; +} + +enum { + RX_NON_PTP_PKT = 0, + RX_PTP_PKT_SUC = 1, + RX_PTP_PKT_ERR = 2 +}; + +/** + * t4_systim_to_hwstamp - read hardware time stamp + * @adap: the adapter + * @skb: the packet + * + * Read Time Stamp from MPS packet and insert in skb which + * is forwarded to PTP application + */ +static noinline int t4_systim_to_hwstamp(struct adapter *adapter, + struct sk_buff *skb) +{ + struct skb_shared_hwtstamps *hwtstamps; + struct cpl_rx_mps_pkt *cpl = NULL; + unsigned char *data; + int offset; + + cpl = (struct cpl_rx_mps_pkt *)skb->data; + if (!(CPL_RX_MPS_PKT_TYPE_G(ntohl(cpl->op_to_r1_hi)) & + X_CPL_RX_MPS_PKT_TYPE_PTP)) + return RX_PTP_PKT_ERR; + + data = skb->data + sizeof(*cpl); + skb_pull(skb, 2 * sizeof(u64) + sizeof(struct cpl_rx_mps_pkt)); + offset = ETH_HLEN + IPV4_HLEN(skb->data) + UDP_HLEN; + if (skb->len < offset + OFF_PTP_SEQUENCE_ID + sizeof(short)) + return RX_PTP_PKT_ERR; + + hwtstamps = skb_hwtstamps(skb); + memset(hwtstamps, 0, sizeof(*hwtstamps)); + hwtstamps->hwtstamp = ns_to_ktime(be64_to_cpu(*((u64 *)data))); + + return RX_PTP_PKT_SUC; +} + +/** + * t4_rx_hststamp - Recv PTP Event Message + * @adap: the adapter + * @rsp: the response queue 
descriptor holding the RX_PKT message + * @skb: the packet + * + * PTP enabled and MPS packet, read HW timestamp + */ +static int t4_rx_hststamp(struct adapter *adapter, const __be64 *rsp, + struct sge_eth_rxq *rxq, struct sk_buff *skb) +{ + int ret; + + if (unlikely((*(u8 *)rsp == CPL_RX_MPS_PKT) && + !is_t4(adapter->params.chip))) { + ret = t4_systim_to_hwstamp(adapter, skb); + if (ret == RX_PTP_PKT_ERR) { + kfree_skb(skb); + rxq->stats.rx_drops++; + } + return ret; + } + return RX_NON_PTP_PKT; +} + +/** + * t4_tx_hststamp - Loopback PTP Transmit Event Message + * @adap: the adapter + * @skb: the packet + * @dev: the ingress net device + * + * Read hardware timestamp for the loopback PTP Tx event message + */ +static int t4_tx_hststamp(struct adapter *adapter, struct sk_buff *skb, + struct net_device *dev) +{ + struct port_info *pi = netdev_priv(dev); + + if (!is_t4(adapter->params.chip) && adapter->ptp_tx_skb) { + cxgb4_ptp_read_hwstamp(adapter, pi); + kfree_skb(skb); + return 0; + } + return 1; +} + +/** + * t4_ethrx_handler - process an ingress ethernet packet + * @q: the response queue that received the packet + * @rsp: the response queue descriptor holding the RX_PKT message + * @si: the gather list of packet fragments + * + * Process an ingress ethernet packet and deliver it to the stack. + */ +int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp, + const struct pkt_gl *si) +{ + bool csum_ok; + struct sk_buff *skb; + const struct cpl_rx_pkt *pkt; + struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq); + struct adapter *adapter = q->adap; + struct sge *s = &q->adap->sge; + int cpl_trace_pkt = is_t4(q->adap->params.chip) ? + CPL_TRACE_PKT : CPL_TRACE_PKT_T5; + u16 err_vec; + struct port_info *pi; + int ret = 0; + + if (unlikely(*(u8 *)rsp == cpl_trace_pkt)) + return handle_trace_pkt(q->adap, si); + + pkt = (const struct cpl_rx_pkt *)rsp; + /* Compressed error vector is enabled for T6 only */ + if (q->adap->params.tp.rx_pkt_encap) + err_vec = T6_COMPR_RXERR_VEC_G(be16_to_cpu(pkt->err_vec)); + else + err_vec = be16_to_cpu(pkt->err_vec); + + csum_ok = pkt->csum_calc && !err_vec && + (q->netdev->features & NETIF_F_RXCSUM); + if ((pkt->l2info & htonl(RXF_TCP_F)) && + (q->netdev->features & NETIF_F_GRO) && csum_ok && !pkt->ip_frag) { + do_gro(rxq, si, pkt); + return 0; + } + + skb = cxgb4_pktgl_to_skb(si, RX_PKT_SKB_LEN, RX_PULL_LEN); + if (unlikely(!skb)) { + t4_pktgl_free(si); + rxq->stats.rx_drops++; + return 0; + } + pi = netdev_priv(q->netdev); + + /* Handle PTP Event Rx packet */ + if (unlikely(pi->ptp_enable)) { + ret = t4_rx_hststamp(adapter, rsp, rxq, skb); + if (ret == RX_PTP_PKT_ERR) + return 0; + } + if (likely(!ret)) + __skb_pull(skb, s->pktshift); /* remove ethernet header pad */ + + /* Handle the PTP Event Tx Loopback packet */ + if (unlikely(pi->ptp_enable && !ret && + (pkt->l2info & htonl(RXF_UDP_F)) && + cxgb4_ptp_is_ptp_rx(skb))) { + if (!t4_tx_hststamp(adapter, skb, q->netdev)) + return 0; + } + + skb->protocol = eth_type_trans(skb, q->netdev); + skb_record_rx_queue(skb, q->idx); + if (skb->dev->features & NETIF_F_RXHASH) + skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val, + PKT_HASH_TYPE_L3); + + rxq->stats.pkts++; + + if (pi->rxtstamp) + cxgb4_sgetim_to_hwtstamp(q->adap, skb_hwtstamps(skb), + si->sgetstamp); + if (csum_ok && (pkt->l2info & htonl(RXF_UDP_F | RXF_TCP_F))) { + if (!pkt->ip_frag) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + rxq->stats.rx_cso++; + } else if (pkt->l2info & htonl(RXF_IP_F)) { + __sum16 c = (__force __sum16)pkt->csum; 
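+ /* For an IP fragment the hardware cannot validate the full L4
+ * checksum, so pass the unfolded hardware checksum up as
+ * CHECKSUM_COMPLETE and let the stack finish verification.
+ */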
+ skb->csum = csum_unfold(c); + skb->ip_summed = CHECKSUM_COMPLETE; + rxq->stats.rx_cso++; + } + } else { + skb_checksum_none_assert(skb); +#ifdef CONFIG_CHELSIO_T4_FCOE +#define CPL_RX_PKT_FLAGS (RXF_PSH_F | RXF_SYN_F | RXF_UDP_F | \ + RXF_TCP_F | RXF_IP_F | RXF_IP6_F | RXF_LRO_F) + + if (!(pkt->l2info & cpu_to_be32(CPL_RX_PKT_FLAGS))) { + if ((pkt->l2info & cpu_to_be32(RXF_FCOE_F)) && + (pi->fcoe.flags & CXGB_FCOE_ENABLED)) { + if (q->adap->params.tp.rx_pkt_encap) + csum_ok = err_vec & + T6_COMPR_RXERR_SUM_F; + else + csum_ok = err_vec & RXERR_CSUM_F; + if (!csum_ok) + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + } + +#undef CPL_RX_PKT_FLAGS +#endif /* CONFIG_CHELSIO_T4_FCOE */ + } + + if (unlikely(pkt->vlan_ex)) { + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan)); + rxq->stats.vlan_ex++; + } + skb_mark_napi_id(skb, &q->napi); + netif_receive_skb(skb); + return 0; +} + +/** + * restore_rx_bufs - put back a packet's Rx buffers + * @si: the packet gather list + * @q: the SGE free list + * @frags: number of FL buffers to restore + * + * Puts back on an FL the Rx buffers associated with @si. The buffers + * have already been unmapped and are left unmapped, we mark them so to + * prevent further unmapping attempts. + * + * This function undoes a series of @unmap_rx_buf calls when we find out + * that the current packet can't be processed right away afterall and we + * need to come back to it later. This is a very rare event and there's + * no effort to make this particularly efficient. + */ +static void restore_rx_bufs(const struct pkt_gl *si, struct sge_fl *q, + int frags) +{ + struct rx_sw_desc *d; + + while (frags--) { + if (q->cidx == 0) + q->cidx = q->size - 1; + else + q->cidx--; + d = &q->sdesc[q->cidx]; + d->page = si->frags[frags].page; + d->dma_addr |= RX_UNMAPPED_BUF; + q->avail++; + } +} + +/** + * is_new_response - check if a response is newly written + * @r: the response descriptor + * @q: the response queue + * + * Returns true if a response descriptor contains a yet unprocessed + * response. + */ +static inline bool is_new_response(const struct rsp_ctrl *r, + const struct sge_rspq *q) +{ + return (r->type_gen >> RSPD_GEN_S) == q->gen; +} + +/** + * rspq_next - advance to the next entry in a response queue + * @q: the queue + * + * Updates the state of a response queue to advance it to the next entry. + */ +static inline void rspq_next(struct sge_rspq *q) +{ + q->cur_desc = (void *)q->cur_desc + q->iqe_len; + if (unlikely(++q->cidx == q->size)) { + q->cidx = 0; + q->gen ^= 1; + q->cur_desc = q->desc; + } +} + +/** + * process_responses - process responses from an SGE response queue + * @q: the ingress queue to process + * @budget: how many responses can be processed in this round + * + * Process responses from an SGE response queue up to the supplied budget. + * Responses include received packets as well as control messages from FW + * or HW. + * + * Additionally choose the interrupt holdoff time for the next interrupt + * on this queue. If the system is under memory shortage use a fairly + * long delay to help recovery. 
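+ * A non-zero return from the response handler leaves the current
+ * descriptor unprocessed and selects the NOMEM holdoff timer so the
+ * queue is revisited once memory pressure has eased.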
+ */ +static int process_responses(struct sge_rspq *q, int budget) +{ + int ret, rsp_type; + int budget_left = budget; + const struct rsp_ctrl *rc; + struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq); + struct adapter *adapter = q->adap; + struct sge *s = &adapter->sge; + + while (likely(budget_left)) { + rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc)); + if (!is_new_response(rc, q)) { + if (q->flush_handler) + q->flush_handler(q); + break; + } + + dma_rmb(); + rsp_type = RSPD_TYPE_G(rc->type_gen); + if (likely(rsp_type == RSPD_TYPE_FLBUF_X)) { + struct page_frag *fp; + struct pkt_gl si; + const struct rx_sw_desc *rsd; + u32 len = ntohl(rc->pldbuflen_qid), bufsz, frags; + + if (len & RSPD_NEWBUF_F) { + if (likely(q->offset > 0)) { + free_rx_bufs(q->adap, &rxq->fl, 1); + q->offset = 0; + } + len = RSPD_LEN_G(len); + } + si.tot_len = len; + + /* gather packet fragments */ + for (frags = 0, fp = si.frags; ; frags++, fp++) { + rsd = &rxq->fl.sdesc[rxq->fl.cidx]; + bufsz = get_buf_size(adapter, rsd); + fp->page = rsd->page; + fp->offset = q->offset; + fp->size = min(bufsz, len); + len -= fp->size; + if (!len) + break; + unmap_rx_buf(q->adap, &rxq->fl); + } + + si.sgetstamp = SGE_TIMESTAMP_G( + be64_to_cpu(rc->last_flit)); + /* + * Last buffer remains mapped so explicitly make it + * coherent for CPU access. + */ + dma_sync_single_for_cpu(q->adap->pdev_dev, + get_buf_addr(rsd), + fp->size, DMA_FROM_DEVICE); + + si.va = page_address(si.frags[0].page) + + si.frags[0].offset; + prefetch(si.va); + + si.nfrags = frags + 1; + ret = q->handler(q, q->cur_desc, &si); + if (likely(ret == 0)) + q->offset += ALIGN(fp->size, s->fl_align); + else + restore_rx_bufs(&si, &rxq->fl, frags); + } else if (likely(rsp_type == RSPD_TYPE_CPL_X)) { + ret = q->handler(q, q->cur_desc, NULL); + } else { + ret = q->handler(q, (const __be64 *)rc, CXGB4_MSG_AN); + } + + if (unlikely(ret)) { + /* couldn't process descriptor, back off for recovery */ + q->next_intr_params = QINTR_TIMER_IDX_V(NOMEM_TMR_IDX); + break; + } + + rspq_next(q); + budget_left--; + } + + if (q->offset >= 0 && fl_cap(&rxq->fl) - rxq->fl.avail >= 16) + __refill_fl(q->adap, &rxq->fl); + return budget - budget_left; +} + +/** + * napi_rx_handler - the NAPI handler for Rx processing + * @napi: the napi instance + * @budget: how many packets we can process in this round + * + * Handler for new data events when using NAPI. This does not need any + * locking or protection from interrupts as data interrupts are off at + * this point and other adapter interrupts do not interfere (the latter + * in not a concern at all with MSI-X as non-data interrupts then have + * a separate handler). 
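+ * When adaptive RX is enabled the holdoff timer index is nudged up
+ * or down based on how much work the previous poll completed,
+ * trading interrupt rate against latency.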
+ */ +static int napi_rx_handler(struct napi_struct *napi, int budget) +{ + unsigned int params; + struct sge_rspq *q = container_of(napi, struct sge_rspq, napi); + int work_done; + u32 val; + + work_done = process_responses(q, budget); + if (likely(work_done < budget)) { + int timer_index; + + napi_complete_done(napi, work_done); + timer_index = QINTR_TIMER_IDX_G(q->next_intr_params); + + if (q->adaptive_rx) { + if (work_done > max(timer_pkt_quota[timer_index], + MIN_NAPI_WORK)) + timer_index = (timer_index + 1); + else + timer_index = timer_index - 1; + + timer_index = clamp(timer_index, 0, SGE_TIMERREGS - 1); + q->next_intr_params = + QINTR_TIMER_IDX_V(timer_index) | + QINTR_CNT_EN_V(0); + params = q->next_intr_params; + } else { + params = q->next_intr_params; + q->next_intr_params = q->intr_params; + } + } else + params = QINTR_TIMER_IDX_V(7); + + val = CIDXINC_V(work_done) | SEINTARM_V(params); + + /* If we don't have access to the new User GTS (T5+), use the old + * doorbell mechanism; otherwise use the new BAR2 mechanism. + */ + if (unlikely(q->bar2_addr == NULL)) { + t4_write_reg(q->adap, MYPF_REG(SGE_PF_GTS_A), + val | INGRESSQID_V((u32)q->cntxt_id)); + } else { + writel(val | INGRESSQID_V(q->bar2_qid), + q->bar2_addr + SGE_UDB_GTS); + wmb(); + } + return work_done; +} + +/* + * The MSI-X interrupt handler for an SGE response queue. + */ +irqreturn_t t4_sge_intr_msix(int irq, void *cookie) +{ + struct sge_rspq *q = cookie; + + napi_schedule(&q->napi); + return IRQ_HANDLED; +} + +/* + * Process the indirect interrupt entries in the interrupt queue and kick off + * NAPI for each queue that has generated an entry. + */ +static unsigned int process_intrq(struct adapter *adap) +{ + unsigned int credits; + const struct rsp_ctrl *rc; + struct sge_rspq *q = &adap->sge.intrq; + u32 val; + + spin_lock(&adap->sge.intrq_lock); + for (credits = 0; ; credits++) { + rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc)); + if (!is_new_response(rc, q)) + break; + + dma_rmb(); + if (RSPD_TYPE_G(rc->type_gen) == RSPD_TYPE_INTR_X) { + unsigned int qid = ntohl(rc->pldbuflen_qid); + + qid -= adap->sge.ingr_start; + napi_schedule(&adap->sge.ingr_map[qid]->napi); + } + + rspq_next(q); + } + + val = CIDXINC_V(credits) | SEINTARM_V(q->intr_params); + + /* If we don't have access to the new User GTS (T5+), use the old + * doorbell mechanism; otherwise use the new BAR2 mechanism. + */ + if (unlikely(q->bar2_addr == NULL)) { + t4_write_reg(adap, MYPF_REG(SGE_PF_GTS_A), + val | INGRESSQID_V(q->cntxt_id)); + } else { + writel(val | INGRESSQID_V(q->bar2_qid), + q->bar2_addr + SGE_UDB_GTS); + wmb(); + } + spin_unlock(&adap->sge.intrq_lock); + return credits; +} + +/* + * The MSI interrupt handler, which handles data events from SGE response queues + * as well as error and other async events as they all use the same MSI vector. + */ +static irqreturn_t t4_intr_msi(int irq, void *cookie) +{ + struct adapter *adap = cookie; + + if (adap->flags & MASTER_PF) + t4_slow_intr_handler(adap); + process_intrq(adap); + return IRQ_HANDLED; +} + +/* + * Interrupt handler for legacy INTx interrupts. + * Handles data events from SGE response queues as well as error and other + * async events as they all use the same interrupt line. 
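+ * Returns IRQ_NONE when neither the slow-path handler nor the
+ * interrupt queue had any work, since the INTx line is likely shared
+ * with another device.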
+ */ +static irqreturn_t t4_intr_intx(int irq, void *cookie) +{ + struct adapter *adap = cookie; + + t4_write_reg(adap, MYPF_REG(PCIE_PF_CLI_A), 0); + if (((adap->flags & MASTER_PF) && t4_slow_intr_handler(adap)) | + process_intrq(adap)) + return IRQ_HANDLED; + return IRQ_NONE; /* probably shared interrupt */ +} + +/** + * t4_intr_handler - select the top-level interrupt handler + * @adap: the adapter + * + * Selects the top-level interrupt handler based on the type of interrupts + * (MSI-X, MSI, or INTx). + */ +irq_handler_t t4_intr_handler(struct adapter *adap) +{ + if (adap->flags & USING_MSIX) + return t4_sge_intr_msix; + if (adap->flags & USING_MSI) + return t4_intr_msi; + return t4_intr_intx; +} + +static void sge_rx_timer_cb(struct timer_list *t) +{ + unsigned long m; + unsigned int i; + struct adapter *adap = from_timer(adap, t, sge.rx_timer); + struct sge *s = &adap->sge; + + for (i = 0; i < BITS_TO_LONGS(s->egr_sz); i++) + for (m = s->starving_fl[i]; m; m &= m - 1) { + struct sge_eth_rxq *rxq; + unsigned int id = __ffs(m) + i * BITS_PER_LONG; + struct sge_fl *fl = s->egr_map[id]; + + clear_bit(id, s->starving_fl); + smp_mb__after_atomic(); + + if (fl_starving(adap, fl)) { + rxq = container_of(fl, struct sge_eth_rxq, fl); + if (napi_reschedule(&rxq->rspq.napi)) + fl->starving++; + else + set_bit(id, s->starving_fl); + } + } + /* The remainder of the SGE RX Timer Callback routine is dedicated to + * global Master PF activities like checking for chip ingress stalls, + * etc. + */ + if (!(adap->flags & MASTER_PF)) + goto done; + + t4_idma_monitor(adap, &s->idma_monitor, HZ, RX_QCHECK_PERIOD); + +done: + mod_timer(&s->rx_timer, jiffies + RX_QCHECK_PERIOD); +} + +static void sge_tx_timer_cb(struct timer_list *t) +{ + unsigned long m; + unsigned int i, budget; + struct adapter *adap = from_timer(adap, t, sge.tx_timer); + struct sge *s = &adap->sge; + + for (i = 0; i < BITS_TO_LONGS(s->egr_sz); i++) + for (m = s->txq_maperr[i]; m; m &= m - 1) { + unsigned long id = __ffs(m) + i * BITS_PER_LONG; + struct sge_uld_txq *txq = s->egr_map[id]; + + clear_bit(id, s->txq_maperr); + tasklet_schedule(&txq->qresume_tsk); + } + + if (!is_t4(adap->params.chip)) { + struct sge_eth_txq *q = &s->ptptxq; + int avail; + + spin_lock(&adap->ptp_lock); + avail = reclaimable(&q->q); + + if (avail) { + free_tx_desc(adap, &q->q, avail, false); + q->q.in_use -= avail; + } + spin_unlock(&adap->ptp_lock); + } + + budget = MAX_TIMER_TX_RECLAIM; + i = s->ethtxq_rover; + do { + struct sge_eth_txq *q = &s->ethtxq[i]; + + if (q->q.in_use && + time_after_eq(jiffies, q->txq->trans_start + HZ / 100) && + __netif_tx_trylock(q->txq)) { + int avail = reclaimable(&q->q); + + if (avail) { + if (avail > budget) + avail = budget; + + free_tx_desc(adap, &q->q, avail, true); + q->q.in_use -= avail; + budget -= avail; + } + __netif_tx_unlock(q->txq); + } + + if (++i >= s->ethqsets) + i = 0; + } while (budget && i != s->ethtxq_rover); + s->ethtxq_rover = i; + mod_timer(&s->tx_timer, jiffies + (budget ? TX_QCHECK_PERIOD : 2)); +} + +/** + * bar2_address - return the BAR2 address for an SGE Queue's Registers + * @adapter: the adapter + * @qid: the SGE Queue ID + * @qtype: the SGE Queue Type (Egress or Ingress) + * @pbar2_qid: BAR2 Queue ID or 0 for Queue ID inferred SGE Queues + * + * Returns the BAR2 address for the SGE Queue Registers associated with + * @qid. If BAR2 SGE Registers aren't available, returns NULL. Also + * returns the BAR2 Queue ID to be used with writes to the BAR2 SGE + * Queue Registers. 
If the BAR2 Queue ID is 0, then "Inferred Queue ID" + * Registers are supported (e.g. the Write Combining Doorbell Buffer). + */ +static void __iomem *bar2_address(struct adapter *adapter, + unsigned int qid, + enum t4_bar2_qtype qtype, + unsigned int *pbar2_qid) +{ + u64 bar2_qoffset; + int ret; + + ret = t4_bar2_sge_qregs(adapter, qid, qtype, 0, + &bar2_qoffset, pbar2_qid); + if (ret) + return NULL; + + return adapter->bar2 + bar2_qoffset; +} + +/* @intr_idx: MSI/MSI-X vector if >=0, -(absolute qid + 1) if < 0 + * @cong: < 0 -> no congestion feedback, >= 0 -> congestion channel map + */ +int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, + struct net_device *dev, int intr_idx, + struct sge_fl *fl, rspq_handler_t hnd, + rspq_flush_handler_t flush_hnd, int cong) +{ + int ret, flsz = 0; + struct fw_iq_cmd c; + struct sge *s = &adap->sge; + struct port_info *pi = netdev_priv(dev); + int relaxed = !(adap->flags & ROOT_NO_RELAXED_ORDERING); + + /* Size needs to be multiple of 16, including status entry. */ + iq->size = roundup(iq->size, 16); + + iq->desc = alloc_ring(adap->pdev_dev, iq->size, iq->iqe_len, 0, + &iq->phys_addr, NULL, 0, + dev_to_node(adap->pdev_dev)); + if (!iq->desc) + return -ENOMEM; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP_V(FW_IQ_CMD) | FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | FW_CMD_EXEC_F | + FW_IQ_CMD_PFN_V(adap->pf) | FW_IQ_CMD_VFN_V(0)); + c.alloc_to_len16 = htonl(FW_IQ_CMD_ALLOC_F | FW_IQ_CMD_IQSTART_F | + FW_LEN16(c)); + c.type_to_iqandstindex = htonl(FW_IQ_CMD_TYPE_V(FW_IQ_TYPE_FL_INT_CAP) | + FW_IQ_CMD_IQASYNCH_V(fwevtq) | FW_IQ_CMD_VIID_V(pi->viid) | + FW_IQ_CMD_IQANDST_V(intr_idx < 0) | + FW_IQ_CMD_IQANUD_V(UPDATEDELIVERY_INTERRUPT_X) | + FW_IQ_CMD_IQANDSTINDEX_V(intr_idx >= 0 ? intr_idx : + -intr_idx - 1)); + c.iqdroprss_to_iqesize = htons(FW_IQ_CMD_IQPCIECH_V(pi->tx_chan) | + FW_IQ_CMD_IQGTSMODE_F | + FW_IQ_CMD_IQINTCNTTHRESH_V(iq->pktcnt_idx) | + FW_IQ_CMD_IQESIZE_V(ilog2(iq->iqe_len) - 4)); + c.iqsize = htons(iq->size); + c.iqaddr = cpu_to_be64(iq->phys_addr); + if (cong >= 0) + c.iqns_to_fl0congen = htonl(FW_IQ_CMD_IQFLINTCONGEN_F); + + if (fl) { + enum chip_type chip = CHELSIO_CHIP_VERSION(adap->params.chip); + + /* Allocate the ring for the hardware free list (with space + * for its status page) along with the associated software + * descriptor ring. The free list size needs to be a multiple + * of the Egress Queue Unit and at least 2 Egress Units larger + * than the SGE's Egress Congrestion Threshold + * (fl_starve_thres - 1). + */ + if (fl->size < s->fl_starve_thres - 1 + 2 * 8) + fl->size = s->fl_starve_thres - 1 + 2 * 8; + fl->size = roundup(fl->size, 8); + fl->desc = alloc_ring(adap->pdev_dev, fl->size, sizeof(__be64), + sizeof(struct rx_sw_desc), &fl->addr, + &fl->sdesc, s->stat_len, + dev_to_node(adap->pdev_dev)); + if (!fl->desc) + goto fl_nomem; + + flsz = fl->size / 8 + s->stat_len / sizeof(struct tx_desc); + c.iqns_to_fl0congen |= htonl(FW_IQ_CMD_FL0PACKEN_F | + FW_IQ_CMD_FL0FETCHRO_V(relaxed) | + FW_IQ_CMD_FL0DATARO_V(relaxed) | + FW_IQ_CMD_FL0PADEN_F); + if (cong >= 0) + c.iqns_to_fl0congen |= + htonl(FW_IQ_CMD_FL0CNGCHMAP_V(cong) | + FW_IQ_CMD_FL0CONGCIF_F | + FW_IQ_CMD_FL0CONGEN_F); + /* In T6, for egress queue type FL there is internal overhead + * of 16B for header going into FLM module. Hence the maximum + * allowed burst size is 448 bytes. 
For T4/T5, the hardware + * doesn't coalesce fetch requests if more than 64 bytes of + * Free List pointers are provided, so we use a 128-byte Fetch + * Burst Minimum there (T6 implements coalescing so we can use + * the smaller 64-byte value there). + */ + c.fl0dcaen_to_fl0cidxfthresh = + htons(FW_IQ_CMD_FL0FBMIN_V(chip <= CHELSIO_T5 ? + FETCHBURSTMIN_128B_X : + FETCHBURSTMIN_64B_X) | + FW_IQ_CMD_FL0FBMAX_V((chip <= CHELSIO_T5) ? + FETCHBURSTMAX_512B_X : + FETCHBURSTMAX_256B_X)); + c.fl0size = htons(flsz); + c.fl0addr = cpu_to_be64(fl->addr); + } + + ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c); + if (ret) + goto err; + + netif_napi_add(dev, &iq->napi, napi_rx_handler, 64); + iq->cur_desc = iq->desc; + iq->cidx = 0; + iq->gen = 1; + iq->next_intr_params = iq->intr_params; + iq->cntxt_id = ntohs(c.iqid); + iq->abs_id = ntohs(c.physiqid); + iq->bar2_addr = bar2_address(adap, + iq->cntxt_id, + T4_BAR2_QTYPE_INGRESS, + &iq->bar2_qid); + iq->size--; /* subtract status entry */ + iq->netdev = dev; + iq->handler = hnd; + iq->flush_handler = flush_hnd; + + memset(&iq->lro_mgr, 0, sizeof(struct t4_lro_mgr)); + skb_queue_head_init(&iq->lro_mgr.lroq); + + /* set offset to -1 to distinguish ingress queues without FL */ + iq->offset = fl ? 0 : -1; + + adap->sge.ingr_map[iq->cntxt_id - adap->sge.ingr_start] = iq; + + if (fl) { + fl->cntxt_id = ntohs(c.fl0id); + fl->avail = fl->pend_cred = 0; + fl->pidx = fl->cidx = 0; + fl->alloc_failed = fl->large_alloc_failed = fl->starving = 0; + adap->sge.egr_map[fl->cntxt_id - adap->sge.egr_start] = fl; + + /* Note, we must initialize the BAR2 Free List User Doorbell + * information before refilling the Free List! + */ + fl->bar2_addr = bar2_address(adap, + fl->cntxt_id, + T4_BAR2_QTYPE_EGRESS, + &fl->bar2_qid); + refill_fl(adap, fl, fl_cap(fl), GFP_KERNEL); + } + + /* For T5 and later we attempt to set up the Congestion Manager values + * of the new RX Ethernet Queue. This should really be handled by + * firmware because it's more complex than any host driver wants to + * get involved with and it's different per chip and this is almost + * certainly wrong. Firmware would be wrong as well, but it would be + * a lot easier to fix in one place ... For now we do something very + * simple (and hopefully less wrong). 
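+ * Concretely: cong == 0 selects per-queue congestion tracking, while
+ * a positive value is treated as a channel bitmap and expanded into
+ * the CONM context's channel map below.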
+ */ + if (!is_t4(adap->params.chip) && cong >= 0) { + u32 param, val, ch_map = 0; + int i; + u16 cng_ch_bits_log = adap->params.arch.cng_ch_bits_log; + + param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | + FW_PARAMS_PARAM_YZ_V(iq->cntxt_id)); + if (cong == 0) { + val = CONMCTXT_CNGTPMODE_V(CONMCTXT_CNGTPMODE_QUEUE_X); + } else { + val = + CONMCTXT_CNGTPMODE_V(CONMCTXT_CNGTPMODE_CHANNEL_X); + for (i = 0; i < 4; i++) { + if (cong & (1 << i)) + ch_map |= 1 << (i << cng_ch_bits_log); + } + val |= CONMCTXT_CNGCHMAP_V(ch_map); + } + ret = t4_set_params(adap, adap->mbox, adap->pf, 0, 1, + ¶m, &val); + if (ret) + dev_warn(adap->pdev_dev, "Failed to set Congestion" + " Manager Context for Ingress Queue %d: %d\n", + iq->cntxt_id, -ret); + } + + return 0; + +fl_nomem: + ret = -ENOMEM; +err: + if (iq->desc) { + dma_free_coherent(adap->pdev_dev, iq->size * iq->iqe_len, + iq->desc, iq->phys_addr); + iq->desc = NULL; + } + if (fl && fl->desc) { + kfree(fl->sdesc); + fl->sdesc = NULL; + dma_free_coherent(adap->pdev_dev, flsz * sizeof(struct tx_desc), + fl->desc, fl->addr); + fl->desc = NULL; + } + return ret; +} + +static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id) +{ + q->cntxt_id = id; + q->bar2_addr = bar2_address(adap, + q->cntxt_id, + T4_BAR2_QTYPE_EGRESS, + &q->bar2_qid); + q->in_use = 0; + q->cidx = q->pidx = 0; + q->stops = q->restarts = 0; + q->stat = (void *)&q->desc[q->size]; + spin_lock_init(&q->db_lock); + adap->sge.egr_map[id - adap->sge.egr_start] = q; +} + +int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq, + struct net_device *dev, struct netdev_queue *netdevq, + unsigned int iqid) +{ + int ret, nentries; + struct fw_eq_eth_cmd c; + struct sge *s = &adap->sge; + struct port_info *pi = netdev_priv(dev); + + /* Add status entries */ + nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc); + + txq->q.desc = alloc_ring(adap->pdev_dev, txq->q.size, + sizeof(struct tx_desc), sizeof(struct tx_sw_desc), + &txq->q.phys_addr, &txq->q.sdesc, s->stat_len, + netdev_queue_numa_node_read(netdevq)); + if (!txq->q.desc) + return -ENOMEM; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = htonl(FW_CMD_OP_V(FW_EQ_ETH_CMD) | FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | FW_CMD_EXEC_F | + FW_EQ_ETH_CMD_PFN_V(adap->pf) | + FW_EQ_ETH_CMD_VFN_V(0)); + c.alloc_to_len16 = htonl(FW_EQ_ETH_CMD_ALLOC_F | + FW_EQ_ETH_CMD_EQSTART_F | FW_LEN16(c)); + c.viid_pkd = htonl(FW_EQ_ETH_CMD_AUTOEQUEQE_F | + FW_EQ_ETH_CMD_VIID_V(pi->viid)); + c.fetchszm_to_iqid = + htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) | + FW_EQ_ETH_CMD_PCIECHN_V(pi->tx_chan) | + FW_EQ_ETH_CMD_FETCHRO_F | FW_EQ_ETH_CMD_IQID_V(iqid)); + c.dcaen_to_eqsize = + htonl(FW_EQ_ETH_CMD_FBMIN_V(FETCHBURSTMIN_64B_X) | + FW_EQ_ETH_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) | + FW_EQ_ETH_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) | + FW_EQ_ETH_CMD_EQSIZE_V(nentries)); + c.eqaddr = cpu_to_be64(txq->q.phys_addr); + + ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c); + if (ret) { + kfree(txq->q.sdesc); + txq->q.sdesc = NULL; + dma_free_coherent(adap->pdev_dev, + nentries * sizeof(struct tx_desc), + txq->q.desc, txq->q.phys_addr); + txq->q.desc = NULL; + return ret; + } + + txq->q.q_type = CXGB4_TXQ_ETH; + init_txq(adap, &txq->q, FW_EQ_ETH_CMD_EQID_G(ntohl(c.eqid_pkd))); + txq->txq = netdevq; + txq->tso = txq->tx_cso = txq->vlan_ins = 0; + txq->mapping_err = 0; + return 0; +} + +int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq, + struct 
net_device *dev, unsigned int iqid, + unsigned int cmplqid) +{ + int ret, nentries; + struct fw_eq_ctrl_cmd c; + struct sge *s = &adap->sge; + struct port_info *pi = netdev_priv(dev); + + /* Add status entries */ + nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc); + + txq->q.desc = alloc_ring(adap->pdev_dev, nentries, + sizeof(struct tx_desc), 0, &txq->q.phys_addr, + NULL, 0, dev_to_node(adap->pdev_dev)); + if (!txq->q.desc) + return -ENOMEM; + + c.op_to_vfn = htonl(FW_CMD_OP_V(FW_EQ_CTRL_CMD) | FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | FW_CMD_EXEC_F | + FW_EQ_CTRL_CMD_PFN_V(adap->pf) | + FW_EQ_CTRL_CMD_VFN_V(0)); + c.alloc_to_len16 = htonl(FW_EQ_CTRL_CMD_ALLOC_F | + FW_EQ_CTRL_CMD_EQSTART_F | FW_LEN16(c)); + c.cmpliqid_eqid = htonl(FW_EQ_CTRL_CMD_CMPLIQID_V(cmplqid)); + c.physeqid_pkd = htonl(0); + c.fetchszm_to_iqid = + htonl(FW_EQ_CTRL_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) | + FW_EQ_CTRL_CMD_PCIECHN_V(pi->tx_chan) | + FW_EQ_CTRL_CMD_FETCHRO_F | FW_EQ_CTRL_CMD_IQID_V(iqid)); + c.dcaen_to_eqsize = + htonl(FW_EQ_CTRL_CMD_FBMIN_V(FETCHBURSTMIN_64B_X) | + FW_EQ_CTRL_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) | + FW_EQ_CTRL_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) | + FW_EQ_CTRL_CMD_EQSIZE_V(nentries)); + c.eqaddr = cpu_to_be64(txq->q.phys_addr); + + ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c); + if (ret) { + dma_free_coherent(adap->pdev_dev, + nentries * sizeof(struct tx_desc), + txq->q.desc, txq->q.phys_addr); + txq->q.desc = NULL; + return ret; + } + + txq->q.q_type = CXGB4_TXQ_CTRL; + init_txq(adap, &txq->q, FW_EQ_CTRL_CMD_EQID_G(ntohl(c.cmpliqid_eqid))); + txq->adap = adap; + skb_queue_head_init(&txq->sendq); + tasklet_init(&txq->qresume_tsk, restart_ctrlq, (unsigned long)txq); + txq->full = 0; + return 0; +} + +int t4_sge_mod_ctrl_txq(struct adapter *adap, unsigned int eqid, + unsigned int cmplqid) +{ + u32 param, val; + + param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL) | + FW_PARAMS_PARAM_YZ_V(eqid)); + val = cmplqid; + return t4_set_params(adap, adap->mbox, adap->pf, 0, 1, ¶m, &val); +} + +int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq, + struct net_device *dev, unsigned int iqid, + unsigned int uld_type) +{ + int ret, nentries; + struct fw_eq_ofld_cmd c; + struct sge *s = &adap->sge; + struct port_info *pi = netdev_priv(dev); + int cmd = FW_EQ_OFLD_CMD; + + /* Add status entries */ + nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc); + + txq->q.desc = alloc_ring(adap->pdev_dev, txq->q.size, + sizeof(struct tx_desc), sizeof(struct tx_sw_desc), + &txq->q.phys_addr, &txq->q.sdesc, s->stat_len, + NUMA_NO_NODE); + if (!txq->q.desc) + return -ENOMEM; + + memset(&c, 0, sizeof(c)); + if (unlikely(uld_type == CXGB4_TX_CRYPTO)) + cmd = FW_EQ_CTRL_CMD; + c.op_to_vfn = htonl(FW_CMD_OP_V(cmd) | FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | FW_CMD_EXEC_F | + FW_EQ_OFLD_CMD_PFN_V(adap->pf) | + FW_EQ_OFLD_CMD_VFN_V(0)); + c.alloc_to_len16 = htonl(FW_EQ_OFLD_CMD_ALLOC_F | + FW_EQ_OFLD_CMD_EQSTART_F | FW_LEN16(c)); + c.fetchszm_to_iqid = + htonl(FW_EQ_OFLD_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) | + FW_EQ_OFLD_CMD_PCIECHN_V(pi->tx_chan) | + FW_EQ_OFLD_CMD_FETCHRO_F | FW_EQ_OFLD_CMD_IQID_V(iqid)); + c.dcaen_to_eqsize = + htonl(FW_EQ_OFLD_CMD_FBMIN_V(FETCHBURSTMIN_64B_X) | + FW_EQ_OFLD_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) | + FW_EQ_OFLD_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) | + FW_EQ_OFLD_CMD_EQSIZE_V(nentries)); + c.eqaddr = cpu_to_be64(txq->q.phys_addr); + + ret = t4_wr_mbox(adap, adap->mbox, &c, 
sizeof(c), &c); + if (ret) { + kfree(txq->q.sdesc); + txq->q.sdesc = NULL; + dma_free_coherent(adap->pdev_dev, + nentries * sizeof(struct tx_desc), + txq->q.desc, txq->q.phys_addr); + txq->q.desc = NULL; + return ret; + } + + txq->q.q_type = CXGB4_TXQ_ULD; + init_txq(adap, &txq->q, FW_EQ_OFLD_CMD_EQID_G(ntohl(c.eqid_pkd))); + txq->adap = adap; + skb_queue_head_init(&txq->sendq); + tasklet_init(&txq->qresume_tsk, restart_ofldq, (unsigned long)txq); + txq->full = 0; + txq->mapping_err = 0; + return 0; +} + +void free_txq(struct adapter *adap, struct sge_txq *q) +{ + struct sge *s = &adap->sge; + + dma_free_coherent(adap->pdev_dev, + q->size * sizeof(struct tx_desc) + s->stat_len, + q->desc, q->phys_addr); + q->cntxt_id = 0; + q->sdesc = NULL; + q->desc = NULL; +} + +void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq, + struct sge_fl *fl) +{ + struct sge *s = &adap->sge; + unsigned int fl_id = fl ? fl->cntxt_id : 0xffff; + + adap->sge.ingr_map[rq->cntxt_id - adap->sge.ingr_start] = NULL; + t4_iq_free(adap, adap->mbox, adap->pf, 0, FW_IQ_TYPE_FL_INT_CAP, + rq->cntxt_id, fl_id, 0xffff); + dma_free_coherent(adap->pdev_dev, (rq->size + 1) * rq->iqe_len, + rq->desc, rq->phys_addr); + netif_napi_del(&rq->napi); + rq->netdev = NULL; + rq->cntxt_id = rq->abs_id = 0; + rq->desc = NULL; + + if (fl) { + free_rx_bufs(adap, fl, fl->avail); + dma_free_coherent(adap->pdev_dev, fl->size * 8 + s->stat_len, + fl->desc, fl->addr); + kfree(fl->sdesc); + fl->sdesc = NULL; + fl->cntxt_id = 0; + fl->desc = NULL; + } +} + +/** + * t4_free_ofld_rxqs - free a block of consecutive Rx queues + * @adap: the adapter + * @n: number of queues + * @q: pointer to first queue + * + * Release the resources of a consecutive block of offload Rx queues. + */ +void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q) +{ + for ( ; n; n--, q++) + if (q->rspq.desc) + free_rspq_fl(adap, &q->rspq, + q->fl.size ? &q->fl : NULL); +} + +/** + * t4_free_sge_resources - free SGE resources + * @adap: the adapter + * + * Frees resources used by the SGE queue sets. + */ +void t4_free_sge_resources(struct adapter *adap) +{ + int i; + struct sge_eth_rxq *eq; + struct sge_eth_txq *etq; + + /* stop all Rx queues in order to start them draining */ + for (i = 0; i < adap->sge.ethqsets; i++) { + eq = &adap->sge.ethrxq[i]; + if (eq->rspq.desc) + t4_iq_stop(adap, adap->mbox, adap->pf, 0, + FW_IQ_TYPE_FL_INT_CAP, + eq->rspq.cntxt_id, + eq->fl.size ? eq->fl.cntxt_id : 0xffff, + 0xffff); + } + + /* clean up Ethernet Tx/Rx queues */ + for (i = 0; i < adap->sge.ethqsets; i++) { + eq = &adap->sge.ethrxq[i]; + if (eq->rspq.desc) + free_rspq_fl(adap, &eq->rspq, + eq->fl.size ? 
&eq->fl : NULL); + + etq = &adap->sge.ethtxq[i]; + if (etq->q.desc) { + t4_eth_eq_free(adap, adap->mbox, adap->pf, 0, + etq->q.cntxt_id); + __netif_tx_lock_bh(etq->txq); + free_tx_desc(adap, &etq->q, etq->q.in_use, true); + __netif_tx_unlock_bh(etq->txq); + kfree(etq->q.sdesc); + free_txq(adap, &etq->q); + } + } + + /* clean up control Tx queues */ + for (i = 0; i < ARRAY_SIZE(adap->sge.ctrlq); i++) { + struct sge_ctrl_txq *cq = &adap->sge.ctrlq[i]; + + if (cq->q.desc) { + tasklet_kill(&cq->qresume_tsk); + t4_ctrl_eq_free(adap, adap->mbox, adap->pf, 0, + cq->q.cntxt_id); + __skb_queue_purge(&cq->sendq); + free_txq(adap, &cq->q); + } + } + + if (adap->sge.fw_evtq.desc) + free_rspq_fl(adap, &adap->sge.fw_evtq, NULL); + + if (adap->sge.intrq.desc) + free_rspq_fl(adap, &adap->sge.intrq, NULL); + + if (!is_t4(adap->params.chip)) { + etq = &adap->sge.ptptxq; + if (etq->q.desc) { + t4_eth_eq_free(adap, adap->mbox, adap->pf, 0, + etq->q.cntxt_id); + spin_lock_bh(&adap->ptp_lock); + free_tx_desc(adap, &etq->q, etq->q.in_use, true); + spin_unlock_bh(&adap->ptp_lock); + kfree(etq->q.sdesc); + free_txq(adap, &etq->q); + } + } + + /* clear the reverse egress queue map */ + memset(adap->sge.egr_map, 0, + adap->sge.egr_sz * sizeof(*adap->sge.egr_map)); +} + +void t4_sge_start(struct adapter *adap) +{ + adap->sge.ethtxq_rover = 0; + mod_timer(&adap->sge.rx_timer, jiffies + RX_QCHECK_PERIOD); + mod_timer(&adap->sge.tx_timer, jiffies + TX_QCHECK_PERIOD); +} + +/** + * t4_sge_stop - disable SGE operation + * @adap: the adapter + * + * Stop tasklets and timers associated with the DMA engine. Note that + * this is effective only if measures have been taken to disable any HW + * events that may restart them. + */ +void t4_sge_stop(struct adapter *adap) +{ + int i; + struct sge *s = &adap->sge; + + if (in_interrupt()) /* actions below require waiting */ + return; + + if (s->rx_timer.function) + del_timer_sync(&s->rx_timer); + if (s->tx_timer.function) + del_timer_sync(&s->tx_timer); + + if (is_offload(adap)) { + struct sge_uld_txq_info *txq_info; + + txq_info = adap->sge.uld_txq_info[CXGB4_TX_OFLD]; + if (txq_info) { + struct sge_uld_txq *txq = txq_info->uldtxq; + + for_each_ofldtxq(&adap->sge, i) { + if (txq->q.desc) + tasklet_kill(&txq->qresume_tsk); + } + } + } + + if (is_pci_uld(adap)) { + struct sge_uld_txq_info *txq_info; + + txq_info = adap->sge.uld_txq_info[CXGB4_TX_CRYPTO]; + if (txq_info) { + struct sge_uld_txq *txq = txq_info->uldtxq; + + for_each_ofldtxq(&adap->sge, i) { + if (txq->q.desc) + tasklet_kill(&txq->qresume_tsk); + } + } + } + + for (i = 0; i < ARRAY_SIZE(s->ctrlq); i++) { + struct sge_ctrl_txq *cq = &s->ctrlq[i]; + + if (cq->q.desc) + tasklet_kill(&cq->qresume_tsk); + } +} + +/** + * t4_sge_init_soft - grab core SGE values needed by SGE code + * @adap: the adapter + * + * We need to grab the SGE operating parameters that we need to have + * in order to do our job and make sure we can live with them. + */ + +static int t4_sge_init_soft(struct adapter *adap) +{ + struct sge *s = &adap->sge; + u32 fl_small_pg, fl_large_pg, fl_small_mtu, fl_large_mtu; + u32 timer_value_0_and_1, timer_value_2_and_3, timer_value_4_and_5; + u32 ingress_rx_threshold; + + /* + * Verify that CPL messages are going to the Ingress Queue for + * process_responses() and that only packet data is going to the + * Free Lists. 
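+ * In other words, RXPKTCPLMODE must be in "split" mode; any other
+ * setting means the firmware SGE configuration is incompatible with
+ * this driver and initialization fails with -EINVAL.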
+ */ + if ((t4_read_reg(adap, SGE_CONTROL_A) & RXPKTCPLMODE_F) != + RXPKTCPLMODE_V(RXPKTCPLMODE_SPLIT_X)) { + dev_err(adap->pdev_dev, "bad SGE CPL MODE\n"); + return -EINVAL; + } + + /* + * Validate the Host Buffer Register Array indices that we want to + * use ... + * + * XXX Note that we should really read through the Host Buffer Size + * XXX register array and find the indices of the Buffer Sizes which + * XXX meet our needs! + */ + #define READ_FL_BUF(x) \ + t4_read_reg(adap, SGE_FL_BUFFER_SIZE0_A+(x)*sizeof(u32)) + + fl_small_pg = READ_FL_BUF(RX_SMALL_PG_BUF); + fl_large_pg = READ_FL_BUF(RX_LARGE_PG_BUF); + fl_small_mtu = READ_FL_BUF(RX_SMALL_MTU_BUF); + fl_large_mtu = READ_FL_BUF(RX_LARGE_MTU_BUF); + + /* We only bother using the Large Page logic if the Large Page Buffer + * is larger than our Page Size Buffer. + */ + if (fl_large_pg <= fl_small_pg) + fl_large_pg = 0; + + #undef READ_FL_BUF + + /* The Page Size Buffer must be exactly equal to our Page Size and the + * Large Page Size Buffer should be 0 (per above) or a power of 2. + */ + if (fl_small_pg != PAGE_SIZE || + (fl_large_pg & (fl_large_pg-1)) != 0) { + dev_err(adap->pdev_dev, "bad SGE FL page buffer sizes [%d, %d]\n", + fl_small_pg, fl_large_pg); + return -EINVAL; + } + if (fl_large_pg) + s->fl_pg_order = ilog2(fl_large_pg) - PAGE_SHIFT; + + if (fl_small_mtu < FL_MTU_SMALL_BUFSIZE(adap) || + fl_large_mtu < FL_MTU_LARGE_BUFSIZE(adap)) { + dev_err(adap->pdev_dev, "bad SGE FL MTU sizes [%d, %d]\n", + fl_small_mtu, fl_large_mtu); + return -EINVAL; + } + + /* + * Retrieve our RX interrupt holdoff timer values and counter + * threshold values from the SGE parameters. + */ + timer_value_0_and_1 = t4_read_reg(adap, SGE_TIMER_VALUE_0_AND_1_A); + timer_value_2_and_3 = t4_read_reg(adap, SGE_TIMER_VALUE_2_AND_3_A); + timer_value_4_and_5 = t4_read_reg(adap, SGE_TIMER_VALUE_4_AND_5_A); + s->timer_val[0] = core_ticks_to_us(adap, + TIMERVALUE0_G(timer_value_0_and_1)); + s->timer_val[1] = core_ticks_to_us(adap, + TIMERVALUE1_G(timer_value_0_and_1)); + s->timer_val[2] = core_ticks_to_us(adap, + TIMERVALUE2_G(timer_value_2_and_3)); + s->timer_val[3] = core_ticks_to_us(adap, + TIMERVALUE3_G(timer_value_2_and_3)); + s->timer_val[4] = core_ticks_to_us(adap, + TIMERVALUE4_G(timer_value_4_and_5)); + s->timer_val[5] = core_ticks_to_us(adap, + TIMERVALUE5_G(timer_value_4_and_5)); + + ingress_rx_threshold = t4_read_reg(adap, SGE_INGRESS_RX_THRESHOLD_A); + s->counter_val[0] = THRESHOLD_0_G(ingress_rx_threshold); + s->counter_val[1] = THRESHOLD_1_G(ingress_rx_threshold); + s->counter_val[2] = THRESHOLD_2_G(ingress_rx_threshold); + s->counter_val[3] = THRESHOLD_3_G(ingress_rx_threshold); + + return 0; +} + +/** + * t4_sge_init - initialize SGE + * @adap: the adapter + * + * Perform low-level SGE code initialization needed every time after a + * chip reset. + */ +int t4_sge_init(struct adapter *adap) +{ + struct sge *s = &adap->sge; + u32 sge_control, sge_conm_ctrl; + int ret, egress_threshold; + + /* + * Ingress Padding Boundary and Egress Status Page Size are set up by + * t4_fixup_host_params(). + */ + sge_control = t4_read_reg(adap, SGE_CONTROL_A); + s->pktshift = PKTSHIFT_G(sge_control); + s->stat_len = (sge_control & EGRSTATUSPAGESIZE_F) ? 128 : 64; + + s->fl_align = t4_fl_pkt_align(adap); + ret = t4_sge_init_soft(adap); + if (ret < 0) + return ret; + + /* + * A FL with <= fl_starve_thres buffers is starving and a periodic + * timer will attempt to refill it. This needs to be larger than the + * SGE's Egress Congestion Threshold. 
If it isn't, then we can get + * stuck waiting for new packets while the SGE is waiting for us to + * give it more Free List entries. (Note that the SGE's Egress + * Congestion Threshold is in units of 2 Free List pointers.) For T4, + * there was only a single field to control this. For T5 there's the + * original field which now only applies to Unpacked Mode Free List + * buffers and a new field which only applies to Packed Mode Free List + * buffers. + */ + sge_conm_ctrl = t4_read_reg(adap, SGE_CONM_CTRL_A); + switch (CHELSIO_CHIP_VERSION(adap->params.chip)) { + case CHELSIO_T4: + egress_threshold = EGRTHRESHOLD_G(sge_conm_ctrl); + break; + case CHELSIO_T5: + egress_threshold = EGRTHRESHOLDPACKING_G(sge_conm_ctrl); + break; + case CHELSIO_T6: + egress_threshold = T6_EGRTHRESHOLDPACKING_G(sge_conm_ctrl); + break; + default: + dev_err(adap->pdev_dev, "Unsupported Chip version %d\n", + CHELSIO_CHIP_VERSION(adap->params.chip)); + return -EINVAL; + } + s->fl_starve_thres = 2*egress_threshold + 1; + + t4_idma_monitor_init(adap, &s->idma_monitor); + + /* Set up timers used for recuring callbacks to process RX and TX + * administrative tasks. + */ + timer_setup(&s->rx_timer, sge_rx_timer_cb, 0); + timer_setup(&s->tx_timer, sge_tx_timer_cb, 0); + + spin_lock_init(&s->intrq_lock); + + return 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/smt.c b/drivers/net/ethernet/chelsio/cxgb4/smt.c new file mode 100644 index 0000000..7b2207a --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/smt.c @@ -0,0 +1,247 @@ +/* + * This file is part of the Chelsio T4/T5/T6 Ethernet driver for Linux. + * + * Copyright (c) 2017 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "cxgb4.h" +#include "smt.h" +#include "t4_msg.h" +#include "t4fw_api.h" +#include "t4_regs.h" +#include "t4_values.h" + +struct smt_data *t4_init_smt(void) +{ + unsigned int smt_size; + struct smt_data *s; + int i; + + smt_size = SMT_SIZE; + + s = kvzalloc(sizeof(*s) + smt_size * sizeof(struct smt_entry), + GFP_KERNEL); + if (!s) + return NULL; + s->smt_size = smt_size; + rwlock_init(&s->lock); + for (i = 0; i < s->smt_size; ++i) { + s->smtab[i].idx = i; + s->smtab[i].state = SMT_STATE_UNUSED; + memset(&s->smtab[i].src_mac, 0, ETH_ALEN); + spin_lock_init(&s->smtab[i].lock); + atomic_set(&s->smtab[i].refcnt, 0); + } + return s; +} + +static struct smt_entry *find_or_alloc_smte(struct smt_data *s, u8 *smac) +{ + struct smt_entry *first_free = NULL; + struct smt_entry *e, *end; + + for (e = &s->smtab[0], end = &s->smtab[s->smt_size]; e != end; ++e) { + if (atomic_read(&e->refcnt) == 0) { + if (!first_free) + first_free = e; + } else { + if (e->state == SMT_STATE_SWITCHING) { + /* This entry is actually in use. See if we can + * re-use it ? + */ + if (memcmp(e->src_mac, smac, ETH_ALEN) == 0) + goto found_reuse; + } + } + } + + if (first_free) { + e = first_free; + goto found; + } + return NULL; + +found: + e->state = SMT_STATE_UNUSED; + +found_reuse: + return e; +} + +static void t4_smte_free(struct smt_entry *e) +{ + spin_lock_bh(&e->lock); + if (atomic_read(&e->refcnt) == 0) { /* hasn't been recycled */ + e->state = SMT_STATE_UNUSED; + } + spin_unlock_bh(&e->lock); +} + +/** + * @e: smt entry to release + * + * Releases ref count and frees up an smt entry from SMT table + */ +void cxgb4_smt_release(struct smt_entry *e) +{ + if (atomic_dec_and_test(&e->refcnt)) + t4_smte_free(e); +} +EXPORT_SYMBOL(cxgb4_smt_release); + +void do_smt_write_rpl(struct adapter *adap, const struct cpl_smt_write_rpl *rpl) +{ + unsigned int smtidx = TID_TID_G(GET_TID(rpl)); + struct smt_data *s = adap->smt; + + if (unlikely(rpl->status != CPL_ERR_NONE)) { + struct smt_entry *e = &s->smtab[smtidx]; + + dev_err(adap->pdev_dev, + "Unexpected SMT_WRITE_RPL status %u for entry %u\n", + rpl->status, smtidx); + spin_lock(&e->lock); + e->state = SMT_STATE_ERROR; + spin_unlock(&e->lock); + return; + } +} + +static int write_smt_entry(struct adapter *adapter, struct smt_entry *e) +{ + struct cpl_t6_smt_write_req *t6req; + struct smt_data *s = adapter->smt; + struct cpl_smt_write_req *req; + struct sk_buff *skb; + int size; + u8 row; + + if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) { + size = sizeof(*req); + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + /* Source MAC Table (SMT) contains 256 SMAC entries + * organized in 128 rows of 2 entries each. + */ + req = (struct cpl_smt_write_req *)__skb_put(skb, size); + INIT_TP_WR(req, 0); + + /* Each row contains an SMAC pair. + * LSB selects the SMAC entry within a row + */ + row = (e->idx >> 1); + if (e->idx & 1) { + req->pfvf1 = 0x0; + memcpy(req->src_mac1, e->src_mac, ETH_ALEN); + + /* fill pfvf0/src_mac0 with entry + * at prev index from smt-tab. 
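+			 * (e->idx is odd in this branch, so e->idx - 1 is the
+			 * even-numbered entry that shares the same two-entry
+			 * SMT row; row == e->idx >> 1)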
+ */ + req->pfvf0 = 0x0; + memcpy(req->src_mac0, s->smtab[e->idx - 1].src_mac, + ETH_ALEN); + } else { + req->pfvf0 = 0x0; + memcpy(req->src_mac0, e->src_mac, ETH_ALEN); + + /* fill pfvf1/src_mac1 with entry + * at next index from smt-tab + */ + req->pfvf1 = 0x0; + memcpy(req->src_mac1, s->smtab[e->idx + 1].src_mac, + ETH_ALEN); + } + } else { + size = sizeof(*t6req); + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + /* Source MAC Table (SMT) contains 256 SMAC entries */ + t6req = (struct cpl_t6_smt_write_req *)__skb_put(skb, size); + INIT_TP_WR(t6req, 0); + req = (struct cpl_smt_write_req *)t6req; + + /* fill pfvf0/src_mac0 from smt-tab */ + req->pfvf0 = 0x0; + memcpy(req->src_mac0, s->smtab[e->idx].src_mac, ETH_ALEN); + row = e->idx; + } + + OPCODE_TID(req) = + htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, e->idx | + TID_QID_V(adapter->sge.fw_evtq.abs_id))); + req->params = htonl(SMTW_NORPL_V(0) | + SMTW_IDX_V(row) | + SMTW_OVLAN_IDX_V(0)); + t4_mgmt_tx(adapter, skb); + return 0; +} + +static struct smt_entry *t4_smt_alloc_switching(struct adapter *adap, u16 pfvf, + u8 *smac) +{ + struct smt_data *s = adap->smt; + struct smt_entry *e; + + write_lock_bh(&s->lock); + e = find_or_alloc_smte(s, smac); + if (e) { + spin_lock(&e->lock); + if (!atomic_read(&e->refcnt)) { + atomic_set(&e->refcnt, 1); + e->state = SMT_STATE_SWITCHING; + e->pfvf = pfvf; + memcpy(e->src_mac, smac, ETH_ALEN); + write_smt_entry(adap, e); + } else { + atomic_inc(&e->refcnt); + } + spin_unlock(&e->lock); + } + write_unlock_bh(&s->lock); + return e; +} + +/** + * @dev: net_device pointer + * @smac: MAC address to add to SMT + * Returns pointer to the SMT entry created + * + * Allocates an SMT entry to be used by switching rule of a filter. + */ +struct smt_entry *cxgb4_smt_alloc_switching(struct net_device *dev, u8 *smac) +{ + struct adapter *adap = netdev2adap(dev); + + return t4_smt_alloc_switching(adap, 0x0, smac); +} +EXPORT_SYMBOL(cxgb4_smt_alloc_switching); diff --git a/drivers/net/ethernet/chelsio/cxgb4/smt.h b/drivers/net/ethernet/chelsio/cxgb4/smt.h new file mode 100644 index 0000000..d6c2cc2 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/smt.h @@ -0,0 +1,76 @@ +/* + * This file is part of the Chelsio T4/T5/T6 Ethernet driver for Linux. + * + * Copyright (c) 2017 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_SMT_H +#define __CXGB4_SMT_H + +#include +#include +#include + +struct adapter; +struct cpl_smt_write_rpl; + +/* SMT related handling. Heavily adapted based on l2t ops in l2t.h/l2t.c + */ +enum { + SMT_STATE_SWITCHING, + SMT_STATE_UNUSED, + SMT_STATE_ERROR +}; + +enum { + SMT_SIZE = 256 +}; + +struct smt_entry { + u16 state; + u16 idx; + u16 pfvf; + u8 src_mac[ETH_ALEN]; + atomic_t refcnt; + spinlock_t lock; /* protect smt entry add,removal */ +}; + +struct smt_data { + unsigned int smt_size; + rwlock_t lock; + struct smt_entry smtab[0]; +}; + +struct smt_data *t4_init_smt(void); +struct smt_entry *cxgb4_smt_alloc_switching(struct net_device *dev, u8 *smac); +void cxgb4_smt_release(struct smt_entry *e); +void do_smt_write_rpl(struct adapter *p, const struct cpl_smt_write_rpl *rpl); +#endif /* __CXGB4_SMT_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/srq.c b/drivers/net/ethernet/chelsio/cxgb4/srq.c new file mode 100644 index 0000000..6228a57 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/srq.c @@ -0,0 +1,138 @@ +/* + * This file is part of the Chelsio T6 Ethernet driver for Linux. + * + * Copyright (c) 2017-2018 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "cxgb4.h" +#include "t4_msg.h" +#include "srq.h" + +struct srq_data *t4_init_srq(int srq_size) +{ + struct srq_data *s; + + s = kvzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return NULL; + + s->srq_size = srq_size; + init_completion(&s->comp); + mutex_init(&s->lock); + + return s; +} + +/* cxgb4_get_srq_entry: read the SRQ table entry + * @dev: Pointer to the net_device + * @idx: Index to the srq + * @entryp: pointer to the srq entry + * + * Sends CPL_SRQ_TABLE_REQ message for the given index. + * Contents will be returned in CPL_SRQ_TABLE_RPL message. + * + * Returns zero if the read is successful, else a error + * number will be returned. 
Caller should not use the srq + * entry if the return value is non-zero. + * + * + */ +int cxgb4_get_srq_entry(struct net_device *dev, + int srq_idx, struct srq_entry *entryp) +{ + struct cpl_srq_table_req *req; + struct adapter *adap; + struct sk_buff *skb; + struct srq_data *s; + int rc = -ENODEV; + + adap = netdev2adap(dev); + s = adap->srq; + + if (!(adap->flags & FULL_INIT_DONE) || !s) + goto out; + + skb = alloc_skb(sizeof(*req), GFP_KERNEL); + if (!skb) + return -ENOMEM; + req = (struct cpl_srq_table_req *) + __skb_put(skb, sizeof(*req)); + memset(req, 0, sizeof(*req)); + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SRQ_TABLE_REQ, + TID_TID_V(srq_idx) | + TID_QID_V(adap->sge.fw_evtq.abs_id))); + req->idx = srq_idx; + + mutex_lock(&s->lock); + + s->entryp = entryp; + t4_mgmt_tx(adap, skb); + + rc = wait_for_completion_timeout(&s->comp, SRQ_WAIT_TO); + if (rc) + rc = 0; + else /* !rc means we timed out */ + rc = -ETIMEDOUT; + + WARN_ON_ONCE(entryp->idx != srq_idx); + mutex_unlock(&s->lock); +out: + return rc; +} +EXPORT_SYMBOL(cxgb4_get_srq_entry); + +void do_srq_table_rpl(struct adapter *adap, + const struct cpl_srq_table_rpl *rpl) +{ + unsigned int idx = TID_TID_G(GET_TID(rpl)); + struct srq_data *s = adap->srq; + struct srq_entry *e; + + if (unlikely(rpl->status != CPL_CONTAINS_READ_RPL)) { + dev_err(adap->pdev_dev, + "Unexpected SRQ_TABLE_RPL status %u for entry %u\n", + rpl->status, idx); + goto out; + } + + /* Store the read entry */ + e = s->entryp; + e->valid = 1; + e->idx = idx; + e->pdid = SRQT_PDID_G(be64_to_cpu(rpl->rsvd_pdid)); + e->qlen = SRQT_QLEN_G(be32_to_cpu(rpl->qlen_qbase)); + e->qbase = SRQT_QBASE_G(be32_to_cpu(rpl->qlen_qbase)); + e->cur_msn = be16_to_cpu(rpl->cur_msn); + e->max_msn = be16_to_cpu(rpl->max_msn); +out: + complete(&s->comp); +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/srq.h b/drivers/net/ethernet/chelsio/cxgb4/srq.h new file mode 100644 index 0000000..ec85cf9 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/srq.h @@ -0,0 +1,65 @@ +/* + * This file is part of the Chelsio T6 Ethernet driver for Linux. + * + * Copyright (c) 2017-2018 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __CXGB4_SRQ_H +#define __CXGB4_SRQ_H + +struct adapter; +struct cpl_srq_table_rpl; + +#define SRQ_WAIT_TO (HZ * 5) + +struct srq_entry { + u8 valid; + u8 idx; + u8 qlen; + u16 pdid; + u16 cur_msn; + u16 max_msn; + u32 qbase; +}; + +struct srq_data { + unsigned int srq_size; + struct srq_entry *entryp; + struct completion comp; + struct mutex lock; /* generic mutex for srq data */ +}; + +struct srq_data *t4_init_srq(int srq_size); +int cxgb4_get_srq_entry(struct net_device *dev, + int srq_idx, struct srq_entry *entryp); +void do_srq_table_rpl(struct adapter *adap, + const struct cpl_srq_table_rpl *rpl); +#endif /* __CXGB4_SRQ_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h b/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h new file mode 100644 index 0000000..54b7181 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h @@ -0,0 +1,85 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2015 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __T4_CHIP_TYPE_H__ +#define __T4_CHIP_TYPE_H__ + +#define CHELSIO_T4 0x4 +#define CHELSIO_T5 0x5 +#define CHELSIO_T6 0x6 + +/* We code the Chelsio T4 Family "Chip Code" as a tuple: + * + * (Chip Version, Chip Revision) + * + * where: + * + * Chip Version: is T4, T5, etc. + * Chip Revision: is the FAB "spin" of the Chip Version. 
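+ *
+ * For example, a T5 rev A1 part (T5_A1 below) is
+ * CHELSIO_CHIP_CODE(CHELSIO_T5, 1) == (0x5 << 4) | 0x1 == 0x51, so
+ * CHELSIO_CHIP_VERSION(0x51) == 0x5 and CHELSIO_CHIP_RELEASE(0x51) == 0x1.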
+ */ +#define CHELSIO_CHIP_CODE(version, revision) (((version) << 4) | (revision)) +#define CHELSIO_CHIP_VERSION(code) (((code) >> 4) & 0xf) +#define CHELSIO_CHIP_RELEASE(code) ((code) & 0xf) + +enum chip_type { + T4_A1 = CHELSIO_CHIP_CODE(CHELSIO_T4, 1), + T4_A2 = CHELSIO_CHIP_CODE(CHELSIO_T4, 2), + T4_FIRST_REV = T4_A1, + T4_LAST_REV = T4_A2, + + T5_A0 = CHELSIO_CHIP_CODE(CHELSIO_T5, 0), + T5_A1 = CHELSIO_CHIP_CODE(CHELSIO_T5, 1), + T5_FIRST_REV = T5_A0, + T5_LAST_REV = T5_A1, + + T6_A0 = CHELSIO_CHIP_CODE(CHELSIO_T6, 0), + T6_FIRST_REV = T6_A0, + T6_LAST_REV = T6_A0, +}; + +static inline int is_t4(enum chip_type chip) +{ + return (CHELSIO_CHIP_VERSION(chip) == CHELSIO_T4); +} + +static inline int is_t5(enum chip_type chip) +{ + return (CHELSIO_CHIP_VERSION(chip) == CHELSIO_T5); +} + +static inline int is_t6(enum chip_type chip) +{ + return (CHELSIO_CHIP_VERSION(chip) == CHELSIO_T6); +} + +#endif /* __T4_CHIP_TYPE_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c new file mode 100644 index 0000000..7cb3ef4 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -0,0 +1,9998 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "cxgb4.h" +#include "t4_regs.h" +#include "t4_values.h" +#include "t4fw_api.h" +#include "t4fw_version.h" + +/** + * t4_wait_op_done_val - wait until an operation is completed + * @adapter: the adapter performing the operation + * @reg: the register to check for completion + * @mask: a single-bit field within @reg that indicates completion + * @polarity: the value of the field when the operation is completed + * @attempts: number of check iterations + * @delay: delay in usecs between iterations + * @valp: where to store the value of the register at completion time + * + * Wait until an operation is completed by checking a bit in a register + * up to @attempts times. If @valp is not NULL the value of the register + * at the time it indicated completion is stored there. Returns 0 if the + * operation completes and -EAGAIN otherwise. 
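+ *
+ * Illustrative sketch only (SOME_DONE_REG and DONE_F stand in for a real
+ * register/bit pair):
+ *
+ *	u32 val;
+ *
+ *	if (t4_wait_op_done_val(adap, SOME_DONE_REG, DONE_F, 1, 10, 5, &val))
+ *		return -EAGAIN;
+ *
+ * polls up to 10 times with a 5 us delay between checks and, on success,
+ * leaves the final register value in val.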
+ */ +static int t4_wait_op_done_val(struct adapter *adapter, int reg, u32 mask, + int polarity, int attempts, int delay, u32 *valp) +{ + while (1) { + u32 val = t4_read_reg(adapter, reg); + + if (!!(val & mask) == polarity) { + if (valp) + *valp = val; + return 0; + } + if (--attempts == 0) + return -EAGAIN; + if (delay) + udelay(delay); + } +} + +static inline int t4_wait_op_done(struct adapter *adapter, int reg, u32 mask, + int polarity, int attempts, int delay) +{ + return t4_wait_op_done_val(adapter, reg, mask, polarity, attempts, + delay, NULL); +} + +/** + * t4_set_reg_field - set a register field to a value + * @adapter: the adapter to program + * @addr: the register address + * @mask: specifies the portion of the register to modify + * @val: the new value for the register field + * + * Sets a register field specified by the supplied mask to the + * given value. + */ +void t4_set_reg_field(struct adapter *adapter, unsigned int addr, u32 mask, + u32 val) +{ + u32 v = t4_read_reg(adapter, addr) & ~mask; + + t4_write_reg(adapter, addr, v | val); + (void) t4_read_reg(adapter, addr); /* flush */ +} + +/** + * t4_read_indirect - read indirectly addressed registers + * @adap: the adapter + * @addr_reg: register holding the indirect address + * @data_reg: register holding the value of the indirect register + * @vals: where the read register values are stored + * @nregs: how many indirect registers to read + * @start_idx: index of first indirect register to read + * + * Reads registers that are accessed indirectly through an address/data + * register pair. + */ +void t4_read_indirect(struct adapter *adap, unsigned int addr_reg, + unsigned int data_reg, u32 *vals, + unsigned int nregs, unsigned int start_idx) +{ + while (nregs--) { + t4_write_reg(adap, addr_reg, start_idx); + *vals++ = t4_read_reg(adap, data_reg); + start_idx++; + } +} + +/** + * t4_write_indirect - write indirectly addressed registers + * @adap: the adapter + * @addr_reg: register holding the indirect addresses + * @data_reg: register holding the value for the indirect registers + * @vals: values to write + * @nregs: how many indirect registers to write + * @start_idx: address of first indirect register to write + * + * Writes a sequential block of registers that are accessed indirectly + * through an address/data register pair. + */ +void t4_write_indirect(struct adapter *adap, unsigned int addr_reg, + unsigned int data_reg, const u32 *vals, + unsigned int nregs, unsigned int start_idx) +{ + while (nregs--) { + t4_write_reg(adap, addr_reg, start_idx++); + t4_write_reg(adap, data_reg, *vals++); + } +} + +/* + * Read a 32-bit PCI Configuration Space register via the PCI-E backdoor + * mechanism. This guarantees that we get the real value even if we're + * operating within a Virtual Machine and the Hypervisor is trapping our + * Configuration Space accesses. + */ +void t4_hw_pci_read_cfg4(struct adapter *adap, int reg, u32 *val) +{ + u32 req = FUNCTION_V(adap->pf) | REGISTER_V(reg); + + if (CHELSIO_CHIP_VERSION(adap->params.chip) <= CHELSIO_T5) + req |= ENABLE_F; + else + req |= T6_ENABLE_F; + + if (is_t4(adap->params.chip)) + req |= LOCALCFG_F; + + t4_write_reg(adap, PCIE_CFG_SPACE_REQ_A, req); + *val = t4_read_reg(adap, PCIE_CFG_SPACE_DATA_A); + + /* Reset ENABLE to 0 so reads of PCIE_CFG_SPACE_DATA won't cause a + * Configuration Space read. (None of the other fields matter when + * ENABLE is 0 so a simple register write is easier than a + * read-modify-write via t4_set_reg_field().) 
+ */ + t4_write_reg(adap, PCIE_CFG_SPACE_REQ_A, 0); +} + +/* + * t4_report_fw_error - report firmware error + * @adap: the adapter + * + * The adapter firmware can indicate error conditions to the host. + * If the firmware has indicated an error, print out the reason for + * the firmware error. + */ +static void t4_report_fw_error(struct adapter *adap) +{ + static const char *const reason[] = { + "Crash", /* PCIE_FW_EVAL_CRASH */ + "During Device Preparation", /* PCIE_FW_EVAL_PREP */ + "During Device Configuration", /* PCIE_FW_EVAL_CONF */ + "During Device Initialization", /* PCIE_FW_EVAL_INIT */ + "Unexpected Event", /* PCIE_FW_EVAL_UNEXPECTEDEVENT */ + "Insufficient Airflow", /* PCIE_FW_EVAL_OVERHEAT */ + "Device Shutdown", /* PCIE_FW_EVAL_DEVICESHUTDOWN */ + "Reserved", /* reserved */ + }; + u32 pcie_fw; + + pcie_fw = t4_read_reg(adap, PCIE_FW_A); + if (pcie_fw & PCIE_FW_ERR_F) { + dev_err(adap->pdev_dev, "Firmware reports adapter error: %s\n", + reason[PCIE_FW_EVAL_G(pcie_fw)]); + adap->flags &= ~FW_OK; + } +} + +/* + * Get the reply to a mailbox command and store it in @rpl in big-endian order. + */ +static void get_mbox_rpl(struct adapter *adap, __be64 *rpl, int nflit, + u32 mbox_addr) +{ + for ( ; nflit; nflit--, mbox_addr += 8) + *rpl++ = cpu_to_be64(t4_read_reg64(adap, mbox_addr)); +} + +/* + * Handle a FW assertion reported in a mailbox. + */ +static void fw_asrt(struct adapter *adap, u32 mbox_addr) +{ + struct fw_debug_cmd asrt; + + get_mbox_rpl(adap, (__be64 *)&asrt, sizeof(asrt) / 8, mbox_addr); + dev_alert(adap->pdev_dev, + "FW assertion at %.16s:%u, val0 %#x, val1 %#x\n", + asrt.u.assert.filename_0_7, be32_to_cpu(asrt.u.assert.line), + be32_to_cpu(asrt.u.assert.x), be32_to_cpu(asrt.u.assert.y)); +} + +/** + * t4_record_mbox - record a Firmware Mailbox Command/Reply in the log + * @adapter: the adapter + * @cmd: the Firmware Mailbox Command or Reply + * @size: command length in bytes + * @access: the time (ms) needed to access the Firmware Mailbox + * @execute: the time (ms) the command spent being executed + */ +static void t4_record_mbox(struct adapter *adapter, + const __be64 *cmd, unsigned int size, + int access, int execute) +{ + struct mbox_cmd_log *log = adapter->mbox_log; + struct mbox_cmd *entry; + int i; + + entry = mbox_cmd_log_entry(log, log->cursor++); + if (log->cursor == log->size) + log->cursor = 0; + + for (i = 0; i < size / 8; i++) + entry->cmd[i] = be64_to_cpu(cmd[i]); + while (i < MBOX_LEN / 8) + entry->cmd[i++] = 0; + entry->timestamp = jiffies; + entry->seqno = log->seqno++; + entry->access = access; + entry->execute = execute; +} + +/** + * t4_wr_mbox_meat_timeout - send a command to FW through the given mailbox + * @adap: the adapter + * @mbox: index of the mailbox to use + * @cmd: the command to write + * @size: command length in bytes + * @rpl: where to optionally store the reply + * @sleep_ok: if true we may sleep while awaiting command completion + * @timeout: time to wait for command to finish before timing out + * + * Sends the given command to FW through the selected mailbox and waits + * for the FW to execute the command. If @rpl is not %NULL it is used to + * store the FW's reply to the command. The command and its optional + * reply are of the same length. FW can take up to %FW_CMD_MAX_TIMEOUT ms + * to respond. @sleep_ok determines whether we may sleep while awaiting + * the response. If sleeping is allowed we use progressive backoff + * otherwise we spin. + * + * The return value is 0 on success or a negative errno on failure. 
A + * failure can happen either because we are not able to execute the + * command or FW executes it but signals an error. In the latter case + * the return value is the error code indicated by FW (negated). + */ +int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, + int size, void *rpl, bool sleep_ok, int timeout) +{ + static const int delay[] = { + 1, 1, 3, 5, 10, 10, 20, 50, 100, 200 + }; + + struct mbox_list entry; + u16 access = 0; + u16 execute = 0; + u32 v; + u64 res; + int i, ms, delay_idx, ret; + const __be64 *p = cmd; + u32 data_reg = PF_REG(mbox, CIM_PF_MAILBOX_DATA_A); + u32 ctl_reg = PF_REG(mbox, CIM_PF_MAILBOX_CTRL_A); + __be64 cmd_rpl[MBOX_LEN / 8]; + u32 pcie_fw; + + if ((size & 15) || size > MBOX_LEN) + return -EINVAL; + + /* + * If the device is off-line, as in EEH, commands will time out. + * Fail them early so we don't waste time waiting. + */ + if (adap->pdev->error_state != pci_channel_io_normal) + return -EIO; + + /* If we have a negative timeout, that implies that we can't sleep. */ + if (timeout < 0) { + sleep_ok = false; + timeout = -timeout; + } + + /* Queue ourselves onto the mailbox access list. When our entry is at + * the front of the list, we have rights to access the mailbox. So we + * wait [for a while] till we're at the front [or bail out with an + * EBUSY] ... + */ + spin_lock_bh(&adap->mbox_lock); + list_add_tail(&entry.list, &adap->mlist.list); + spin_unlock_bh(&adap->mbox_lock); + + delay_idx = 0; + ms = delay[0]; + + for (i = 0; ; i += ms) { + /* If we've waited too long, return a busy indication. This + * really ought to be based on our initial position in the + * mailbox access list but this is a start. We very rearely + * contend on access to the mailbox ... + */ + pcie_fw = t4_read_reg(adap, PCIE_FW_A); + if (i > FW_CMD_MAX_TIMEOUT || (pcie_fw & PCIE_FW_ERR_F)) { + spin_lock_bh(&adap->mbox_lock); + list_del(&entry.list); + spin_unlock_bh(&adap->mbox_lock); + ret = (pcie_fw & PCIE_FW_ERR_F) ? -ENXIO : -EBUSY; + t4_record_mbox(adap, cmd, size, access, ret); + return ret; + } + + /* If we're at the head, break out and start the mailbox + * protocol. + */ + if (list_first_entry(&adap->mlist.list, struct mbox_list, + list) == &entry) + break; + + /* Delay for a bit before checking again ... */ + if (sleep_ok) { + ms = delay[delay_idx]; /* last element may repeat */ + if (delay_idx < ARRAY_SIZE(delay) - 1) + delay_idx++; + msleep(ms); + } else { + mdelay(ms); + } + } + + /* Loop trying to get ownership of the mailbox. Return an error + * if we can't gain ownership. + */ + v = MBOWNER_G(t4_read_reg(adap, ctl_reg)); + for (i = 0; v == MBOX_OWNER_NONE && i < 3; i++) + v = MBOWNER_G(t4_read_reg(adap, ctl_reg)); + if (v != MBOX_OWNER_DRV) { + spin_lock_bh(&adap->mbox_lock); + list_del(&entry.list); + spin_unlock_bh(&adap->mbox_lock); + ret = (v == MBOX_OWNER_FW) ? -EBUSY : -ETIMEDOUT; + t4_record_mbox(adap, cmd, size, access, ret); + return ret; + } + + /* Copy in the new mailbox command and send it on its way ... 
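+	 * (the command is copied 64 bits at a time into the mailbox data
+	 * registers, then MBMSGVALID is set and ownership is handed to the
+	 * firmware through the control register)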
*/ + t4_record_mbox(adap, cmd, size, access, 0); + for (i = 0; i < size; i += 8) + t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++)); + + t4_write_reg(adap, ctl_reg, MBMSGVALID_F | MBOWNER_V(MBOX_OWNER_FW)); + t4_read_reg(adap, ctl_reg); /* flush write */ + + delay_idx = 0; + ms = delay[0]; + + for (i = 0; + !((pcie_fw = t4_read_reg(adap, PCIE_FW_A)) & PCIE_FW_ERR_F) && + i < timeout; + i += ms) { + if (sleep_ok) { + ms = delay[delay_idx]; /* last element may repeat */ + if (delay_idx < ARRAY_SIZE(delay) - 1) + delay_idx++; + msleep(ms); + } else + mdelay(ms); + + v = t4_read_reg(adap, ctl_reg); + if (MBOWNER_G(v) == MBOX_OWNER_DRV) { + if (!(v & MBMSGVALID_F)) { + t4_write_reg(adap, ctl_reg, 0); + continue; + } + + get_mbox_rpl(adap, cmd_rpl, MBOX_LEN / 8, data_reg); + res = be64_to_cpu(cmd_rpl[0]); + + if (FW_CMD_OP_G(res >> 32) == FW_DEBUG_CMD) { + fw_asrt(adap, data_reg); + res = FW_CMD_RETVAL_V(EIO); + } else if (rpl) { + memcpy(rpl, cmd_rpl, size); + } + + t4_write_reg(adap, ctl_reg, 0); + + execute = i + ms; + t4_record_mbox(adap, cmd_rpl, + MBOX_LEN, access, execute); + spin_lock_bh(&adap->mbox_lock); + list_del(&entry.list); + spin_unlock_bh(&adap->mbox_lock); + return -FW_CMD_RETVAL_G((int)res); + } + } + + ret = (pcie_fw & PCIE_FW_ERR_F) ? -ENXIO : -ETIMEDOUT; + t4_record_mbox(adap, cmd, size, access, ret); + dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n", + *(const u8 *)cmd, mbox); + t4_report_fw_error(adap); + spin_lock_bh(&adap->mbox_lock); + list_del(&entry.list); + spin_unlock_bh(&adap->mbox_lock); + t4_fatal_err(adap); + return ret; +} + +int t4_wr_mbox_meat(struct adapter *adap, int mbox, const void *cmd, int size, + void *rpl, bool sleep_ok) +{ + return t4_wr_mbox_meat_timeout(adap, mbox, cmd, size, rpl, sleep_ok, + FW_CMD_MAX_TIMEOUT); +} + +static int t4_edc_err_read(struct adapter *adap, int idx) +{ + u32 edc_ecc_err_addr_reg; + u32 rdata_reg; + + if (is_t4(adap->params.chip)) { + CH_WARN(adap, "%s: T4 NOT supported.\n", __func__); + return 0; + } + if (idx != 0 && idx != 1) { + CH_WARN(adap, "%s: idx %d NOT supported.\n", __func__, idx); + return 0; + } + + edc_ecc_err_addr_reg = EDC_T5_REG(EDC_H_ECC_ERR_ADDR_A, idx); + rdata_reg = EDC_T5_REG(EDC_H_BIST_STATUS_RDATA_A, idx); + + CH_WARN(adap, + "edc%d err addr 0x%x: 0x%x.\n", + idx, edc_ecc_err_addr_reg, + t4_read_reg(adap, edc_ecc_err_addr_reg)); + CH_WARN(adap, + "bist: 0x%x, status %llx %llx %llx %llx %llx %llx %llx %llx %llx.\n", + rdata_reg, + (unsigned long long)t4_read_reg64(adap, rdata_reg), + (unsigned long long)t4_read_reg64(adap, rdata_reg + 8), + (unsigned long long)t4_read_reg64(adap, rdata_reg + 16), + (unsigned long long)t4_read_reg64(adap, rdata_reg + 24), + (unsigned long long)t4_read_reg64(adap, rdata_reg + 32), + (unsigned long long)t4_read_reg64(adap, rdata_reg + 40), + (unsigned long long)t4_read_reg64(adap, rdata_reg + 48), + (unsigned long long)t4_read_reg64(adap, rdata_reg + 56), + (unsigned long long)t4_read_reg64(adap, rdata_reg + 64)); + + return 0; +} + +/** + * t4_memory_rw_init - Get memory window relative offset, base, and size. + * @adap: the adapter + * @win: PCI-E Memory Window to use + * @mtype: memory type: MEM_EDC0, MEM_EDC1, MEM_HMA or MEM_MC + * @mem_off: memory relative offset with respect to @mtype. + * @mem_base: configured memory base address. + * @mem_aperture: configured memory window aperture. + * + * Get the configured memory window's relative offset, base, and size. 
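+ *
+ * Callers typically follow this with t4_memory_update_win() to point the
+ * window at the target address and then access adapter memory through the
+ * window; t4_memory_rw() below shows the full sequence.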
+ */ +int t4_memory_rw_init(struct adapter *adap, int win, int mtype, u32 *mem_off, + u32 *mem_base, u32 *mem_aperture) +{ + u32 edc_size, mc_size, mem_reg; + + /* Offset into the region of memory which is being accessed + * MEM_EDC0 = 0 + * MEM_EDC1 = 1 + * MEM_MC = 2 -- MEM_MC for chips with only 1 memory controller + * MEM_MC1 = 3 -- for chips with 2 memory controllers (e.g. T5) + * MEM_HMA = 4 + */ + edc_size = EDRAM0_SIZE_G(t4_read_reg(adap, MA_EDRAM0_BAR_A)); + if (mtype == MEM_HMA) { + *mem_off = 2 * (edc_size * 1024 * 1024); + } else if (mtype != MEM_MC1) { + *mem_off = (mtype * (edc_size * 1024 * 1024)); + } else { + mc_size = EXT_MEM0_SIZE_G(t4_read_reg(adap, + MA_EXT_MEMORY0_BAR_A)); + *mem_off = (MEM_MC0 * edc_size + mc_size) * 1024 * 1024; + } + + /* Each PCI-E Memory Window is programmed with a window size -- or + * "aperture" -- which controls the granularity of its mapping onto + * adapter memory. We need to grab that aperture in order to know + * how to use the specified window. The window is also programmed + * with the base address of the Memory Window in BAR0's address + * space. For T4 this is an absolute PCI-E Bus Address. For T5 + * the address is relative to BAR0. + */ + mem_reg = t4_read_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN_A, + win)); + /* a dead adapter will return 0xffffffff for PIO reads */ + if (mem_reg == 0xffffffff) + return -ENXIO; + + *mem_aperture = 1 << (WINDOW_G(mem_reg) + WINDOW_SHIFT_X); + *mem_base = PCIEOFST_G(mem_reg) << PCIEOFST_SHIFT_X; + if (is_t4(adap->params.chip)) + *mem_base -= adap->t4_bar0; + + return 0; +} + +/** + * t4_memory_update_win - Move memory window to specified address. + * @adap: the adapter + * @win: PCI-E Memory Window to use + * @addr: location to move. + * + * Move memory window to specified address. + */ +void t4_memory_update_win(struct adapter *adap, int win, u32 addr) +{ + t4_write_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET_A, win), + addr); + /* Read it back to ensure that changes propagate before we + * attempt to use the new value. + */ + t4_read_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET_A, win)); +} + +/** + * t4_memory_rw_residual - Read/Write residual data. + * @adap: the adapter + * @off: relative offset within residual to start read/write. + * @addr: address within indicated memory type. + * @buf: host memory buffer + * @dir: direction of transfer T4_MEMORY_READ (1) or T4_MEMORY_WRITE (0) + * + * Read/Write residual data less than 32-bits. 
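+ *
+ * (In the write direction, bytes at and above index @off in the final
+ * 32-bit word are zeroed before the word is written back to the adapter.)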
+ */ +void t4_memory_rw_residual(struct adapter *adap, u32 off, u32 addr, u8 *buf, + int dir) +{ + union { + u32 word; + char byte[4]; + } last; + unsigned char *bp; + int i; + + if (dir == T4_MEMORY_READ) { + last.word = le32_to_cpu((__force __le32) + t4_read_reg(adap, addr)); + for (bp = (unsigned char *)buf, i = off; i < 4; i++) + bp[i] = last.byte[i]; + } else { + last.word = *buf; + for (i = off; i < 4; i++) + last.byte[i] = 0; + t4_write_reg(adap, addr, + (__force u32)cpu_to_le32(last.word)); + } +} + +/** + * t4_memory_rw - read/write EDC 0, EDC 1 or MC via PCIE memory window + * @adap: the adapter + * @win: PCI-E Memory Window to use + * @mtype: memory type: MEM_EDC0, MEM_EDC1 or MEM_MC + * @addr: address within indicated memory type + * @len: amount of memory to transfer + * @hbuf: host memory buffer + * @dir: direction of transfer T4_MEMORY_READ (1) or T4_MEMORY_WRITE (0) + * + * Reads/writes an [almost] arbitrary memory region in the firmware: the + * firmware memory address and host buffer must be aligned on 32-bit + * boudaries; the length may be arbitrary. The memory is transferred as + * a raw byte sequence from/to the firmware's memory. If this memory + * contains data structures which contain multi-byte integers, it's the + * caller's responsibility to perform appropriate byte order conversions. + */ +int t4_memory_rw(struct adapter *adap, int win, int mtype, u32 addr, + u32 len, void *hbuf, int dir) +{ + u32 pos, offset, resid, memoffset; + u32 win_pf, mem_aperture, mem_base; + u32 *buf; + int ret; + + /* Argument sanity checks ... + */ + if (addr & 0x3 || (uintptr_t)hbuf & 0x3) + return -EINVAL; + buf = (u32 *)hbuf; + + /* It's convenient to be able to handle lengths which aren't a + * multiple of 32-bits because we often end up transferring files to + * the firmware. So we'll handle that by normalizing the length here + * and then handling any residual transfer at the end. + */ + resid = len & 0x3; + len -= resid; + + ret = t4_memory_rw_init(adap, win, mtype, &memoffset, &mem_base, + &mem_aperture); + if (ret) + return ret; + + /* Determine the PCIE_MEM_ACCESS_OFFSET */ + addr = addr + memoffset; + + win_pf = is_t4(adap->params.chip) ? 0 : PFNUM_V(adap->pf); + + /* Calculate our initial PCI-E Memory Window Position and Offset into + * that Window. + */ + pos = addr & ~(mem_aperture - 1); + offset = addr - pos; + + /* Set up initial PCI-E Memory Window to cover the start of our + * transfer. + */ + t4_memory_update_win(adap, win, pos | win_pf); + + /* Transfer data to/from the adapter as long as there's an integral + * number of 32-bit transfers to complete. + * + * A note on Endianness issues: + * + * The "register" reads and writes below from/to the PCI-E Memory + * Window invoke the standard adapter Big-Endian to PCI-E Link + * Little-Endian "swizzel." As a result, if we have the following + * data in adapter memory: + * + * Memory: ... | b0 | b1 | b2 | b3 | ... + * Address: i+0 i+1 i+2 i+3 + * + * Then a read of the adapter memory via the PCI-E Memory Window + * will yield: + * + * x = readl(i) + * 31 0 + * [ b3 | b2 | b1 | b0 ] + * + * If this value is stored into local memory on a Little-Endian system + * it will show up correctly in local memory as: + * + * ( ..., b0, b1, b2, b3, ... ) + * + * But on a Big-Endian system, the store will show up in memory + * incorrectly swizzled as: + * + * ( ..., b3, b2, b1, b0, ... ) + * + * So we need to account for this in the reads and writes to the + * PCI-E Memory Window below by undoing the register read/write + * swizzels. 
+ */ + while (len > 0) { + if (dir == T4_MEMORY_READ) + *buf++ = le32_to_cpu((__force __le32)t4_read_reg(adap, + mem_base + offset)); + else + t4_write_reg(adap, mem_base + offset, + (__force u32)cpu_to_le32(*buf++)); + offset += sizeof(__be32); + len -= sizeof(__be32); + + /* If we've reached the end of our current window aperture, + * move the PCI-E Memory Window on to the next. Note that + * doing this here after "len" may be 0 allows us to set up + * the PCI-E Memory Window for a possible final residual + * transfer below ... + */ + if (offset == mem_aperture) { + pos += mem_aperture; + offset = 0; + t4_memory_update_win(adap, win, pos | win_pf); + } + } + + /* If the original transfer had a length which wasn't a multiple of + * 32-bits, now's where we need to finish off the transfer of the + * residual amount. The PCI-E Memory Window has already been moved + * above (if necessary) to cover this final transfer. + */ + if (resid) + t4_memory_rw_residual(adap, resid, mem_base + offset, + (u8 *)buf, dir); + + return 0; +} + +/* Return the specified PCI-E Configuration Space register from our Physical + * Function. We try first via a Firmware LDST Command since we prefer to let + * the firmware own all of these registers, but if that fails we go for it + * directly ourselves. + */ +u32 t4_read_pcie_cfg4(struct adapter *adap, int reg) +{ + u32 val, ldst_addrspace; + + /* If fw_attach != 0, construct and send the Firmware LDST Command to + * retrieve the specified PCI-E Configuration Space register. + */ + struct fw_ldst_cmd ldst_cmd; + int ret; + + memset(&ldst_cmd, 0, sizeof(ldst_cmd)); + ldst_addrspace = FW_LDST_CMD_ADDRSPACE_V(FW_LDST_ADDRSPC_FUNC_PCIE); + ldst_cmd.op_to_addrspace = cpu_to_be32(FW_CMD_OP_V(FW_LDST_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_READ_F | + ldst_addrspace); + ldst_cmd.cycles_to_len16 = cpu_to_be32(FW_LEN16(ldst_cmd)); + ldst_cmd.u.pcie.select_naccess = FW_LDST_CMD_NACCESS_V(1); + ldst_cmd.u.pcie.ctrl_to_fn = + (FW_LDST_CMD_LC_F | FW_LDST_CMD_FN_V(adap->pf)); + ldst_cmd.u.pcie.r = reg; + + /* If the LDST Command succeeds, return the result, otherwise + * fall through to reading it directly ourselves ... + */ + ret = t4_wr_mbox(adap, adap->mbox, &ldst_cmd, sizeof(ldst_cmd), + &ldst_cmd); + if (ret == 0) + val = be32_to_cpu(ldst_cmd.u.pcie.data[0]); + else + /* Read the desired Configuration Space register via the PCI-E + * Backdoor mechanism. + */ + t4_hw_pci_read_cfg4(adap, reg, &val); + return val; +} + +/* Get the window based on base passed to it. + * Window aperture is currently unhandled, but there is no use case for it + * right now + */ +static u32 t4_get_window(struct adapter *adap, u32 pci_base, u64 pci_mask, + u32 memwin_base) +{ + u32 ret; + + if (is_t4(adap->params.chip)) { + u32 bar0; + + /* Truncation intentional: we only read the bottom 32-bits of + * the 64-bit BAR0/BAR1 ... We use the hardware backdoor + * mechanism to read BAR0 instead of using + * pci_resource_start() because we could be operating from + * within a Virtual Machine which is trapping our accesses to + * our Configuration Space and we need to set up the PCI-E + * Memory Window decoders with the actual addresses which will + * be coming across the PCI-E link. 
+ */ + bar0 = t4_read_pcie_cfg4(adap, pci_base); + bar0 &= pci_mask; + adap->t4_bar0 = bar0; + + ret = bar0 + memwin_base; + } else { + /* For T5, only relative offset inside the PCIe BAR is passed */ + ret = memwin_base; + } + return ret; +} + +/* Get the default utility window (win0) used by everyone */ +u32 t4_get_util_window(struct adapter *adap) +{ + return t4_get_window(adap, PCI_BASE_ADDRESS_0, + PCI_BASE_ADDRESS_MEM_MASK, MEMWIN0_BASE); +} + +/* Set up memory window for accessing adapter memory ranges. (Read + * back MA register to ensure that changes propagate before we attempt + * to use the new values.) + */ +void t4_setup_memwin(struct adapter *adap, u32 memwin_base, u32 window) +{ + t4_write_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN_A, window), + memwin_base | BIR_V(0) | + WINDOW_V(ilog2(MEMWIN0_APERTURE) - WINDOW_SHIFT_X)); + t4_read_reg(adap, + PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN_A, window)); +} + +/** + * t4_get_regs_len - return the size of the chips register set + * @adapter: the adapter + * + * Returns the size of the chip's BAR0 register space. + */ +unsigned int t4_get_regs_len(struct adapter *adapter) +{ + unsigned int chip_version = CHELSIO_CHIP_VERSION(adapter->params.chip); + + switch (chip_version) { + case CHELSIO_T4: + return T4_REGMAP_SIZE; + + case CHELSIO_T5: + case CHELSIO_T6: + return T5_REGMAP_SIZE; + } + + dev_err(adapter->pdev_dev, + "Unsupported chip version %d\n", chip_version); + return 0; +} + +/** + * t4_get_regs - read chip registers into provided buffer + * @adap: the adapter + * @buf: register buffer + * @buf_size: size (in bytes) of register buffer + * + * If the provided register buffer isn't large enough for the chip's + * full register range, the register dump will be truncated to the + * register buffer's size. 
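+ *
+ * A caller typically sizes the buffer with t4_get_regs_len(), e.g.
+ * (illustrative sketch only; buf and buf_size are caller-local):
+ *
+ *	buf_size = t4_get_regs_len(adap);
+ *	buf = kvzalloc(buf_size, GFP_KERNEL);
+ *	if (buf)
+ *		t4_get_regs(adap, buf, buf_size);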
+ */ +void t4_get_regs(struct adapter *adap, void *buf, size_t buf_size) +{ + static const unsigned int t4_reg_ranges[] = { + 0x1008, 0x1108, + 0x1180, 0x1184, + 0x1190, 0x1194, + 0x11a0, 0x11a4, + 0x11b0, 0x11b4, + 0x11fc, 0x123c, + 0x1300, 0x173c, + 0x1800, 0x18fc, + 0x3000, 0x30d8, + 0x30e0, 0x30e4, + 0x30ec, 0x5910, + 0x5920, 0x5924, + 0x5960, 0x5960, + 0x5968, 0x5968, + 0x5970, 0x5970, + 0x5978, 0x5978, + 0x5980, 0x5980, + 0x5988, 0x5988, + 0x5990, 0x5990, + 0x5998, 0x5998, + 0x59a0, 0x59d4, + 0x5a00, 0x5ae0, + 0x5ae8, 0x5ae8, + 0x5af0, 0x5af0, + 0x5af8, 0x5af8, + 0x6000, 0x6098, + 0x6100, 0x6150, + 0x6200, 0x6208, + 0x6240, 0x6248, + 0x6280, 0x62b0, + 0x62c0, 0x6338, + 0x6370, 0x638c, + 0x6400, 0x643c, + 0x6500, 0x6524, + 0x6a00, 0x6a04, + 0x6a14, 0x6a38, + 0x6a60, 0x6a70, + 0x6a78, 0x6a78, + 0x6b00, 0x6b0c, + 0x6b1c, 0x6b84, + 0x6bf0, 0x6bf8, + 0x6c00, 0x6c0c, + 0x6c1c, 0x6c84, + 0x6cf0, 0x6cf8, + 0x6d00, 0x6d0c, + 0x6d1c, 0x6d84, + 0x6df0, 0x6df8, + 0x6e00, 0x6e0c, + 0x6e1c, 0x6e84, + 0x6ef0, 0x6ef8, + 0x6f00, 0x6f0c, + 0x6f1c, 0x6f84, + 0x6ff0, 0x6ff8, + 0x7000, 0x700c, + 0x701c, 0x7084, + 0x70f0, 0x70f8, + 0x7100, 0x710c, + 0x711c, 0x7184, + 0x71f0, 0x71f8, + 0x7200, 0x720c, + 0x721c, 0x7284, + 0x72f0, 0x72f8, + 0x7300, 0x730c, + 0x731c, 0x7384, + 0x73f0, 0x73f8, + 0x7400, 0x7450, + 0x7500, 0x7530, + 0x7600, 0x760c, + 0x7614, 0x761c, + 0x7680, 0x76cc, + 0x7700, 0x7798, + 0x77c0, 0x77fc, + 0x7900, 0x79fc, + 0x7b00, 0x7b58, + 0x7b60, 0x7b84, + 0x7b8c, 0x7c38, + 0x7d00, 0x7d38, + 0x7d40, 0x7d80, + 0x7d8c, 0x7ddc, + 0x7de4, 0x7e04, + 0x7e10, 0x7e1c, + 0x7e24, 0x7e38, + 0x7e40, 0x7e44, + 0x7e4c, 0x7e78, + 0x7e80, 0x7ea4, + 0x7eac, 0x7edc, + 0x7ee8, 0x7efc, + 0x8dc0, 0x8e04, + 0x8e10, 0x8e1c, + 0x8e30, 0x8e78, + 0x8ea0, 0x8eb8, + 0x8ec0, 0x8f6c, + 0x8fc0, 0x9008, + 0x9010, 0x9058, + 0x9060, 0x9060, + 0x9068, 0x9074, + 0x90fc, 0x90fc, + 0x9400, 0x9408, + 0x9410, 0x9458, + 0x9600, 0x9600, + 0x9608, 0x9638, + 0x9640, 0x96bc, + 0x9800, 0x9808, + 0x9820, 0x983c, + 0x9850, 0x9864, + 0x9c00, 0x9c6c, + 0x9c80, 0x9cec, + 0x9d00, 0x9d6c, + 0x9d80, 0x9dec, + 0x9e00, 0x9e6c, + 0x9e80, 0x9eec, + 0x9f00, 0x9f6c, + 0x9f80, 0x9fec, + 0xd004, 0xd004, + 0xd010, 0xd03c, + 0xdfc0, 0xdfe0, + 0xe000, 0xea7c, + 0xf000, 0x11110, + 0x11118, 0x11190, + 0x19040, 0x1906c, + 0x19078, 0x19080, + 0x1908c, 0x190e4, + 0x190f0, 0x190f8, + 0x19100, 0x19110, + 0x19120, 0x19124, + 0x19150, 0x19194, + 0x1919c, 0x191b0, + 0x191d0, 0x191e8, + 0x19238, 0x1924c, + 0x193f8, 0x1943c, + 0x1944c, 0x19474, + 0x19490, 0x194e0, + 0x194f0, 0x194f8, + 0x19800, 0x19c08, + 0x19c10, 0x19c90, + 0x19ca0, 0x19ce4, + 0x19cf0, 0x19d40, + 0x19d50, 0x19d94, + 0x19da0, 0x19de8, + 0x19df0, 0x19e40, + 0x19e50, 0x19e90, + 0x19ea0, 0x19f4c, + 0x1a000, 0x1a004, + 0x1a010, 0x1a06c, + 0x1a0b0, 0x1a0e4, + 0x1a0ec, 0x1a0f4, + 0x1a100, 0x1a108, + 0x1a114, 0x1a120, + 0x1a128, 0x1a130, + 0x1a138, 0x1a138, + 0x1a190, 0x1a1c4, + 0x1a1fc, 0x1a1fc, + 0x1e040, 0x1e04c, + 0x1e284, 0x1e28c, + 0x1e2c0, 0x1e2c0, + 0x1e2e0, 0x1e2e0, + 0x1e300, 0x1e384, + 0x1e3c0, 0x1e3c8, + 0x1e440, 0x1e44c, + 0x1e684, 0x1e68c, + 0x1e6c0, 0x1e6c0, + 0x1e6e0, 0x1e6e0, + 0x1e700, 0x1e784, + 0x1e7c0, 0x1e7c8, + 0x1e840, 0x1e84c, + 0x1ea84, 0x1ea8c, + 0x1eac0, 0x1eac0, + 0x1eae0, 0x1eae0, + 0x1eb00, 0x1eb84, + 0x1ebc0, 0x1ebc8, + 0x1ec40, 0x1ec4c, + 0x1ee84, 0x1ee8c, + 0x1eec0, 0x1eec0, + 0x1eee0, 0x1eee0, + 0x1ef00, 0x1ef84, + 0x1efc0, 0x1efc8, + 0x1f040, 0x1f04c, + 0x1f284, 0x1f28c, + 0x1f2c0, 0x1f2c0, + 0x1f2e0, 0x1f2e0, + 0x1f300, 0x1f384, + 0x1f3c0, 0x1f3c8, + 0x1f440, 0x1f44c, + 
0x1f684, 0x1f68c, + 0x1f6c0, 0x1f6c0, + 0x1f6e0, 0x1f6e0, + 0x1f700, 0x1f784, + 0x1f7c0, 0x1f7c8, + 0x1f840, 0x1f84c, + 0x1fa84, 0x1fa8c, + 0x1fac0, 0x1fac0, + 0x1fae0, 0x1fae0, + 0x1fb00, 0x1fb84, + 0x1fbc0, 0x1fbc8, + 0x1fc40, 0x1fc4c, + 0x1fe84, 0x1fe8c, + 0x1fec0, 0x1fec0, + 0x1fee0, 0x1fee0, + 0x1ff00, 0x1ff84, + 0x1ffc0, 0x1ffc8, + 0x20000, 0x2002c, + 0x20100, 0x2013c, + 0x20190, 0x201a0, + 0x201a8, 0x201b8, + 0x201c4, 0x201c8, + 0x20200, 0x20318, + 0x20400, 0x204b4, + 0x204c0, 0x20528, + 0x20540, 0x20614, + 0x21000, 0x21040, + 0x2104c, 0x21060, + 0x210c0, 0x210ec, + 0x21200, 0x21268, + 0x21270, 0x21284, + 0x212fc, 0x21388, + 0x21400, 0x21404, + 0x21500, 0x21500, + 0x21510, 0x21518, + 0x2152c, 0x21530, + 0x2153c, 0x2153c, + 0x21550, 0x21554, + 0x21600, 0x21600, + 0x21608, 0x2161c, + 0x21624, 0x21628, + 0x21630, 0x21634, + 0x2163c, 0x2163c, + 0x21700, 0x2171c, + 0x21780, 0x2178c, + 0x21800, 0x21818, + 0x21820, 0x21828, + 0x21830, 0x21848, + 0x21850, 0x21854, + 0x21860, 0x21868, + 0x21870, 0x21870, + 0x21878, 0x21898, + 0x218a0, 0x218a8, + 0x218b0, 0x218c8, + 0x218d0, 0x218d4, + 0x218e0, 0x218e8, + 0x218f0, 0x218f0, + 0x218f8, 0x21a18, + 0x21a20, 0x21a28, + 0x21a30, 0x21a48, + 0x21a50, 0x21a54, + 0x21a60, 0x21a68, + 0x21a70, 0x21a70, + 0x21a78, 0x21a98, + 0x21aa0, 0x21aa8, + 0x21ab0, 0x21ac8, + 0x21ad0, 0x21ad4, + 0x21ae0, 0x21ae8, + 0x21af0, 0x21af0, + 0x21af8, 0x21c18, + 0x21c20, 0x21c20, + 0x21c28, 0x21c30, + 0x21c38, 0x21c38, + 0x21c80, 0x21c98, + 0x21ca0, 0x21ca8, + 0x21cb0, 0x21cc8, + 0x21cd0, 0x21cd4, + 0x21ce0, 0x21ce8, + 0x21cf0, 0x21cf0, + 0x21cf8, 0x21d7c, + 0x21e00, 0x21e04, + 0x22000, 0x2202c, + 0x22100, 0x2213c, + 0x22190, 0x221a0, + 0x221a8, 0x221b8, + 0x221c4, 0x221c8, + 0x22200, 0x22318, + 0x22400, 0x224b4, + 0x224c0, 0x22528, + 0x22540, 0x22614, + 0x23000, 0x23040, + 0x2304c, 0x23060, + 0x230c0, 0x230ec, + 0x23200, 0x23268, + 0x23270, 0x23284, + 0x232fc, 0x23388, + 0x23400, 0x23404, + 0x23500, 0x23500, + 0x23510, 0x23518, + 0x2352c, 0x23530, + 0x2353c, 0x2353c, + 0x23550, 0x23554, + 0x23600, 0x23600, + 0x23608, 0x2361c, + 0x23624, 0x23628, + 0x23630, 0x23634, + 0x2363c, 0x2363c, + 0x23700, 0x2371c, + 0x23780, 0x2378c, + 0x23800, 0x23818, + 0x23820, 0x23828, + 0x23830, 0x23848, + 0x23850, 0x23854, + 0x23860, 0x23868, + 0x23870, 0x23870, + 0x23878, 0x23898, + 0x238a0, 0x238a8, + 0x238b0, 0x238c8, + 0x238d0, 0x238d4, + 0x238e0, 0x238e8, + 0x238f0, 0x238f0, + 0x238f8, 0x23a18, + 0x23a20, 0x23a28, + 0x23a30, 0x23a48, + 0x23a50, 0x23a54, + 0x23a60, 0x23a68, + 0x23a70, 0x23a70, + 0x23a78, 0x23a98, + 0x23aa0, 0x23aa8, + 0x23ab0, 0x23ac8, + 0x23ad0, 0x23ad4, + 0x23ae0, 0x23ae8, + 0x23af0, 0x23af0, + 0x23af8, 0x23c18, + 0x23c20, 0x23c20, + 0x23c28, 0x23c30, + 0x23c38, 0x23c38, + 0x23c80, 0x23c98, + 0x23ca0, 0x23ca8, + 0x23cb0, 0x23cc8, + 0x23cd0, 0x23cd4, + 0x23ce0, 0x23ce8, + 0x23cf0, 0x23cf0, + 0x23cf8, 0x23d7c, + 0x23e00, 0x23e04, + 0x24000, 0x2402c, + 0x24100, 0x2413c, + 0x24190, 0x241a0, + 0x241a8, 0x241b8, + 0x241c4, 0x241c8, + 0x24200, 0x24318, + 0x24400, 0x244b4, + 0x244c0, 0x24528, + 0x24540, 0x24614, + 0x25000, 0x25040, + 0x2504c, 0x25060, + 0x250c0, 0x250ec, + 0x25200, 0x25268, + 0x25270, 0x25284, + 0x252fc, 0x25388, + 0x25400, 0x25404, + 0x25500, 0x25500, + 0x25510, 0x25518, + 0x2552c, 0x25530, + 0x2553c, 0x2553c, + 0x25550, 0x25554, + 0x25600, 0x25600, + 0x25608, 0x2561c, + 0x25624, 0x25628, + 0x25630, 0x25634, + 0x2563c, 0x2563c, + 0x25700, 0x2571c, + 0x25780, 0x2578c, + 0x25800, 0x25818, + 0x25820, 0x25828, + 0x25830, 0x25848, + 0x25850, 0x25854, + 0x25860, 
0x25868, + 0x25870, 0x25870, + 0x25878, 0x25898, + 0x258a0, 0x258a8, + 0x258b0, 0x258c8, + 0x258d0, 0x258d4, + 0x258e0, 0x258e8, + 0x258f0, 0x258f0, + 0x258f8, 0x25a18, + 0x25a20, 0x25a28, + 0x25a30, 0x25a48, + 0x25a50, 0x25a54, + 0x25a60, 0x25a68, + 0x25a70, 0x25a70, + 0x25a78, 0x25a98, + 0x25aa0, 0x25aa8, + 0x25ab0, 0x25ac8, + 0x25ad0, 0x25ad4, + 0x25ae0, 0x25ae8, + 0x25af0, 0x25af0, + 0x25af8, 0x25c18, + 0x25c20, 0x25c20, + 0x25c28, 0x25c30, + 0x25c38, 0x25c38, + 0x25c80, 0x25c98, + 0x25ca0, 0x25ca8, + 0x25cb0, 0x25cc8, + 0x25cd0, 0x25cd4, + 0x25ce0, 0x25ce8, + 0x25cf0, 0x25cf0, + 0x25cf8, 0x25d7c, + 0x25e00, 0x25e04, + 0x26000, 0x2602c, + 0x26100, 0x2613c, + 0x26190, 0x261a0, + 0x261a8, 0x261b8, + 0x261c4, 0x261c8, + 0x26200, 0x26318, + 0x26400, 0x264b4, + 0x264c0, 0x26528, + 0x26540, 0x26614, + 0x27000, 0x27040, + 0x2704c, 0x27060, + 0x270c0, 0x270ec, + 0x27200, 0x27268, + 0x27270, 0x27284, + 0x272fc, 0x27388, + 0x27400, 0x27404, + 0x27500, 0x27500, + 0x27510, 0x27518, + 0x2752c, 0x27530, + 0x2753c, 0x2753c, + 0x27550, 0x27554, + 0x27600, 0x27600, + 0x27608, 0x2761c, + 0x27624, 0x27628, + 0x27630, 0x27634, + 0x2763c, 0x2763c, + 0x27700, 0x2771c, + 0x27780, 0x2778c, + 0x27800, 0x27818, + 0x27820, 0x27828, + 0x27830, 0x27848, + 0x27850, 0x27854, + 0x27860, 0x27868, + 0x27870, 0x27870, + 0x27878, 0x27898, + 0x278a0, 0x278a8, + 0x278b0, 0x278c8, + 0x278d0, 0x278d4, + 0x278e0, 0x278e8, + 0x278f0, 0x278f0, + 0x278f8, 0x27a18, + 0x27a20, 0x27a28, + 0x27a30, 0x27a48, + 0x27a50, 0x27a54, + 0x27a60, 0x27a68, + 0x27a70, 0x27a70, + 0x27a78, 0x27a98, + 0x27aa0, 0x27aa8, + 0x27ab0, 0x27ac8, + 0x27ad0, 0x27ad4, + 0x27ae0, 0x27ae8, + 0x27af0, 0x27af0, + 0x27af8, 0x27c18, + 0x27c20, 0x27c20, + 0x27c28, 0x27c30, + 0x27c38, 0x27c38, + 0x27c80, 0x27c98, + 0x27ca0, 0x27ca8, + 0x27cb0, 0x27cc8, + 0x27cd0, 0x27cd4, + 0x27ce0, 0x27ce8, + 0x27cf0, 0x27cf0, + 0x27cf8, 0x27d7c, + 0x27e00, 0x27e04, + }; + + static const unsigned int t5_reg_ranges[] = { + 0x1008, 0x10c0, + 0x10cc, 0x10f8, + 0x1100, 0x1100, + 0x110c, 0x1148, + 0x1180, 0x1184, + 0x1190, 0x1194, + 0x11a0, 0x11a4, + 0x11b0, 0x11b4, + 0x11fc, 0x123c, + 0x1280, 0x173c, + 0x1800, 0x18fc, + 0x3000, 0x3028, + 0x3060, 0x30b0, + 0x30b8, 0x30d8, + 0x30e0, 0x30fc, + 0x3140, 0x357c, + 0x35a8, 0x35cc, + 0x35ec, 0x35ec, + 0x3600, 0x5624, + 0x56cc, 0x56ec, + 0x56f4, 0x5720, + 0x5728, 0x575c, + 0x580c, 0x5814, + 0x5890, 0x589c, + 0x58a4, 0x58ac, + 0x58b8, 0x58bc, + 0x5940, 0x59c8, + 0x59d0, 0x59dc, + 0x59fc, 0x5a18, + 0x5a60, 0x5a70, + 0x5a80, 0x5a9c, + 0x5b94, 0x5bfc, + 0x6000, 0x6020, + 0x6028, 0x6040, + 0x6058, 0x609c, + 0x60a8, 0x614c, + 0x7700, 0x7798, + 0x77c0, 0x78fc, + 0x7b00, 0x7b58, + 0x7b60, 0x7b84, + 0x7b8c, 0x7c54, + 0x7d00, 0x7d38, + 0x7d40, 0x7d80, + 0x7d8c, 0x7ddc, + 0x7de4, 0x7e04, + 0x7e10, 0x7e1c, + 0x7e24, 0x7e38, + 0x7e40, 0x7e44, + 0x7e4c, 0x7e78, + 0x7e80, 0x7edc, + 0x7ee8, 0x7efc, + 0x8dc0, 0x8de0, + 0x8df8, 0x8e04, + 0x8e10, 0x8e84, + 0x8ea0, 0x8f84, + 0x8fc0, 0x9058, + 0x9060, 0x9060, + 0x9068, 0x90f8, + 0x9400, 0x9408, + 0x9410, 0x9470, + 0x9600, 0x9600, + 0x9608, 0x9638, + 0x9640, 0x96f4, + 0x9800, 0x9808, + 0x9820, 0x983c, + 0x9850, 0x9864, + 0x9c00, 0x9c6c, + 0x9c80, 0x9cec, + 0x9d00, 0x9d6c, + 0x9d80, 0x9dec, + 0x9e00, 0x9e6c, + 0x9e80, 0x9eec, + 0x9f00, 0x9f6c, + 0x9f80, 0xa020, + 0xd004, 0xd004, + 0xd010, 0xd03c, + 0xdfc0, 0xdfe0, + 0xe000, 0x1106c, + 0x11074, 0x11088, + 0x1109c, 0x1117c, + 0x11190, 0x11204, + 0x19040, 0x1906c, + 0x19078, 0x19080, + 0x1908c, 0x190e8, + 0x190f0, 0x190f8, + 0x19100, 0x19110, + 0x19120, 0x19124, + 
0x19150, 0x19194, + 0x1919c, 0x191b0, + 0x191d0, 0x191e8, + 0x19238, 0x19290, + 0x193f8, 0x19428, + 0x19430, 0x19444, + 0x1944c, 0x1946c, + 0x19474, 0x19474, + 0x19490, 0x194cc, + 0x194f0, 0x194f8, + 0x19c00, 0x19c08, + 0x19c10, 0x19c60, + 0x19c94, 0x19ce4, + 0x19cf0, 0x19d40, + 0x19d50, 0x19d94, + 0x19da0, 0x19de8, + 0x19df0, 0x19e10, + 0x19e50, 0x19e90, + 0x19ea0, 0x19f24, + 0x19f34, 0x19f34, + 0x19f40, 0x19f50, + 0x19f90, 0x19fb4, + 0x19fc4, 0x19fe4, + 0x1a000, 0x1a004, + 0x1a010, 0x1a06c, + 0x1a0b0, 0x1a0e4, + 0x1a0ec, 0x1a0f8, + 0x1a100, 0x1a108, + 0x1a114, 0x1a120, + 0x1a128, 0x1a130, + 0x1a138, 0x1a138, + 0x1a190, 0x1a1c4, + 0x1a1fc, 0x1a1fc, + 0x1e008, 0x1e00c, + 0x1e040, 0x1e044, + 0x1e04c, 0x1e04c, + 0x1e284, 0x1e290, + 0x1e2c0, 0x1e2c0, + 0x1e2e0, 0x1e2e0, + 0x1e300, 0x1e384, + 0x1e3c0, 0x1e3c8, + 0x1e408, 0x1e40c, + 0x1e440, 0x1e444, + 0x1e44c, 0x1e44c, + 0x1e684, 0x1e690, + 0x1e6c0, 0x1e6c0, + 0x1e6e0, 0x1e6e0, + 0x1e700, 0x1e784, + 0x1e7c0, 0x1e7c8, + 0x1e808, 0x1e80c, + 0x1e840, 0x1e844, + 0x1e84c, 0x1e84c, + 0x1ea84, 0x1ea90, + 0x1eac0, 0x1eac0, + 0x1eae0, 0x1eae0, + 0x1eb00, 0x1eb84, + 0x1ebc0, 0x1ebc8, + 0x1ec08, 0x1ec0c, + 0x1ec40, 0x1ec44, + 0x1ec4c, 0x1ec4c, + 0x1ee84, 0x1ee90, + 0x1eec0, 0x1eec0, + 0x1eee0, 0x1eee0, + 0x1ef00, 0x1ef84, + 0x1efc0, 0x1efc8, + 0x1f008, 0x1f00c, + 0x1f040, 0x1f044, + 0x1f04c, 0x1f04c, + 0x1f284, 0x1f290, + 0x1f2c0, 0x1f2c0, + 0x1f2e0, 0x1f2e0, + 0x1f300, 0x1f384, + 0x1f3c0, 0x1f3c8, + 0x1f408, 0x1f40c, + 0x1f440, 0x1f444, + 0x1f44c, 0x1f44c, + 0x1f684, 0x1f690, + 0x1f6c0, 0x1f6c0, + 0x1f6e0, 0x1f6e0, + 0x1f700, 0x1f784, + 0x1f7c0, 0x1f7c8, + 0x1f808, 0x1f80c, + 0x1f840, 0x1f844, + 0x1f84c, 0x1f84c, + 0x1fa84, 0x1fa90, + 0x1fac0, 0x1fac0, + 0x1fae0, 0x1fae0, + 0x1fb00, 0x1fb84, + 0x1fbc0, 0x1fbc8, + 0x1fc08, 0x1fc0c, + 0x1fc40, 0x1fc44, + 0x1fc4c, 0x1fc4c, + 0x1fe84, 0x1fe90, + 0x1fec0, 0x1fec0, + 0x1fee0, 0x1fee0, + 0x1ff00, 0x1ff84, + 0x1ffc0, 0x1ffc8, + 0x30000, 0x30030, + 0x30100, 0x30144, + 0x30190, 0x301a0, + 0x301a8, 0x301b8, + 0x301c4, 0x301c8, + 0x301d0, 0x301d0, + 0x30200, 0x30318, + 0x30400, 0x304b4, + 0x304c0, 0x3052c, + 0x30540, 0x3061c, + 0x30800, 0x30828, + 0x30834, 0x30834, + 0x308c0, 0x30908, + 0x30910, 0x309ac, + 0x30a00, 0x30a14, + 0x30a1c, 0x30a2c, + 0x30a44, 0x30a50, + 0x30a74, 0x30a74, + 0x30a7c, 0x30afc, + 0x30b08, 0x30c24, + 0x30d00, 0x30d00, + 0x30d08, 0x30d14, + 0x30d1c, 0x30d20, + 0x30d3c, 0x30d3c, + 0x30d48, 0x30d50, + 0x31200, 0x3120c, + 0x31220, 0x31220, + 0x31240, 0x31240, + 0x31600, 0x3160c, + 0x31a00, 0x31a1c, + 0x31e00, 0x31e20, + 0x31e38, 0x31e3c, + 0x31e80, 0x31e80, + 0x31e88, 0x31ea8, + 0x31eb0, 0x31eb4, + 0x31ec8, 0x31ed4, + 0x31fb8, 0x32004, + 0x32200, 0x32200, + 0x32208, 0x32240, + 0x32248, 0x32280, + 0x32288, 0x322c0, + 0x322c8, 0x322fc, + 0x32600, 0x32630, + 0x32a00, 0x32abc, + 0x32b00, 0x32b10, + 0x32b20, 0x32b30, + 0x32b40, 0x32b50, + 0x32b60, 0x32b70, + 0x33000, 0x33028, + 0x33030, 0x33048, + 0x33060, 0x33068, + 0x33070, 0x3309c, + 0x330f0, 0x33128, + 0x33130, 0x33148, + 0x33160, 0x33168, + 0x33170, 0x3319c, + 0x331f0, 0x33238, + 0x33240, 0x33240, + 0x33248, 0x33250, + 0x3325c, 0x33264, + 0x33270, 0x332b8, + 0x332c0, 0x332e4, + 0x332f8, 0x33338, + 0x33340, 0x33340, + 0x33348, 0x33350, + 0x3335c, 0x33364, + 0x33370, 0x333b8, + 0x333c0, 0x333e4, + 0x333f8, 0x33428, + 0x33430, 0x33448, + 0x33460, 0x33468, + 0x33470, 0x3349c, + 0x334f0, 0x33528, + 0x33530, 0x33548, + 0x33560, 0x33568, + 0x33570, 0x3359c, + 0x335f0, 0x33638, + 0x33640, 0x33640, + 0x33648, 0x33650, + 0x3365c, 0x33664, + 0x33670, 
0x336b8, + 0x336c0, 0x336e4, + 0x336f8, 0x33738, + 0x33740, 0x33740, + 0x33748, 0x33750, + 0x3375c, 0x33764, + 0x33770, 0x337b8, + 0x337c0, 0x337e4, + 0x337f8, 0x337fc, + 0x33814, 0x33814, + 0x3382c, 0x3382c, + 0x33880, 0x3388c, + 0x338e8, 0x338ec, + 0x33900, 0x33928, + 0x33930, 0x33948, + 0x33960, 0x33968, + 0x33970, 0x3399c, + 0x339f0, 0x33a38, + 0x33a40, 0x33a40, + 0x33a48, 0x33a50, + 0x33a5c, 0x33a64, + 0x33a70, 0x33ab8, + 0x33ac0, 0x33ae4, + 0x33af8, 0x33b10, + 0x33b28, 0x33b28, + 0x33b3c, 0x33b50, + 0x33bf0, 0x33c10, + 0x33c28, 0x33c28, + 0x33c3c, 0x33c50, + 0x33cf0, 0x33cfc, + 0x34000, 0x34030, + 0x34100, 0x34144, + 0x34190, 0x341a0, + 0x341a8, 0x341b8, + 0x341c4, 0x341c8, + 0x341d0, 0x341d0, + 0x34200, 0x34318, + 0x34400, 0x344b4, + 0x344c0, 0x3452c, + 0x34540, 0x3461c, + 0x34800, 0x34828, + 0x34834, 0x34834, + 0x348c0, 0x34908, + 0x34910, 0x349ac, + 0x34a00, 0x34a14, + 0x34a1c, 0x34a2c, + 0x34a44, 0x34a50, + 0x34a74, 0x34a74, + 0x34a7c, 0x34afc, + 0x34b08, 0x34c24, + 0x34d00, 0x34d00, + 0x34d08, 0x34d14, + 0x34d1c, 0x34d20, + 0x34d3c, 0x34d3c, + 0x34d48, 0x34d50, + 0x35200, 0x3520c, + 0x35220, 0x35220, + 0x35240, 0x35240, + 0x35600, 0x3560c, + 0x35a00, 0x35a1c, + 0x35e00, 0x35e20, + 0x35e38, 0x35e3c, + 0x35e80, 0x35e80, + 0x35e88, 0x35ea8, + 0x35eb0, 0x35eb4, + 0x35ec8, 0x35ed4, + 0x35fb8, 0x36004, + 0x36200, 0x36200, + 0x36208, 0x36240, + 0x36248, 0x36280, + 0x36288, 0x362c0, + 0x362c8, 0x362fc, + 0x36600, 0x36630, + 0x36a00, 0x36abc, + 0x36b00, 0x36b10, + 0x36b20, 0x36b30, + 0x36b40, 0x36b50, + 0x36b60, 0x36b70, + 0x37000, 0x37028, + 0x37030, 0x37048, + 0x37060, 0x37068, + 0x37070, 0x3709c, + 0x370f0, 0x37128, + 0x37130, 0x37148, + 0x37160, 0x37168, + 0x37170, 0x3719c, + 0x371f0, 0x37238, + 0x37240, 0x37240, + 0x37248, 0x37250, + 0x3725c, 0x37264, + 0x37270, 0x372b8, + 0x372c0, 0x372e4, + 0x372f8, 0x37338, + 0x37340, 0x37340, + 0x37348, 0x37350, + 0x3735c, 0x37364, + 0x37370, 0x373b8, + 0x373c0, 0x373e4, + 0x373f8, 0x37428, + 0x37430, 0x37448, + 0x37460, 0x37468, + 0x37470, 0x3749c, + 0x374f0, 0x37528, + 0x37530, 0x37548, + 0x37560, 0x37568, + 0x37570, 0x3759c, + 0x375f0, 0x37638, + 0x37640, 0x37640, + 0x37648, 0x37650, + 0x3765c, 0x37664, + 0x37670, 0x376b8, + 0x376c0, 0x376e4, + 0x376f8, 0x37738, + 0x37740, 0x37740, + 0x37748, 0x37750, + 0x3775c, 0x37764, + 0x37770, 0x377b8, + 0x377c0, 0x377e4, + 0x377f8, 0x377fc, + 0x37814, 0x37814, + 0x3782c, 0x3782c, + 0x37880, 0x3788c, + 0x378e8, 0x378ec, + 0x37900, 0x37928, + 0x37930, 0x37948, + 0x37960, 0x37968, + 0x37970, 0x3799c, + 0x379f0, 0x37a38, + 0x37a40, 0x37a40, + 0x37a48, 0x37a50, + 0x37a5c, 0x37a64, + 0x37a70, 0x37ab8, + 0x37ac0, 0x37ae4, + 0x37af8, 0x37b10, + 0x37b28, 0x37b28, + 0x37b3c, 0x37b50, + 0x37bf0, 0x37c10, + 0x37c28, 0x37c28, + 0x37c3c, 0x37c50, + 0x37cf0, 0x37cfc, + 0x38000, 0x38030, + 0x38100, 0x38144, + 0x38190, 0x381a0, + 0x381a8, 0x381b8, + 0x381c4, 0x381c8, + 0x381d0, 0x381d0, + 0x38200, 0x38318, + 0x38400, 0x384b4, + 0x384c0, 0x3852c, + 0x38540, 0x3861c, + 0x38800, 0x38828, + 0x38834, 0x38834, + 0x388c0, 0x38908, + 0x38910, 0x389ac, + 0x38a00, 0x38a14, + 0x38a1c, 0x38a2c, + 0x38a44, 0x38a50, + 0x38a74, 0x38a74, + 0x38a7c, 0x38afc, + 0x38b08, 0x38c24, + 0x38d00, 0x38d00, + 0x38d08, 0x38d14, + 0x38d1c, 0x38d20, + 0x38d3c, 0x38d3c, + 0x38d48, 0x38d50, + 0x39200, 0x3920c, + 0x39220, 0x39220, + 0x39240, 0x39240, + 0x39600, 0x3960c, + 0x39a00, 0x39a1c, + 0x39e00, 0x39e20, + 0x39e38, 0x39e3c, + 0x39e80, 0x39e80, + 0x39e88, 0x39ea8, + 0x39eb0, 0x39eb4, + 0x39ec8, 0x39ed4, + 0x39fb8, 0x3a004, + 0x3a200, 0x3a200, + 
0x3a208, 0x3a240, + 0x3a248, 0x3a280, + 0x3a288, 0x3a2c0, + 0x3a2c8, 0x3a2fc, + 0x3a600, 0x3a630, + 0x3aa00, 0x3aabc, + 0x3ab00, 0x3ab10, + 0x3ab20, 0x3ab30, + 0x3ab40, 0x3ab50, + 0x3ab60, 0x3ab70, + 0x3b000, 0x3b028, + 0x3b030, 0x3b048, + 0x3b060, 0x3b068, + 0x3b070, 0x3b09c, + 0x3b0f0, 0x3b128, + 0x3b130, 0x3b148, + 0x3b160, 0x3b168, + 0x3b170, 0x3b19c, + 0x3b1f0, 0x3b238, + 0x3b240, 0x3b240, + 0x3b248, 0x3b250, + 0x3b25c, 0x3b264, + 0x3b270, 0x3b2b8, + 0x3b2c0, 0x3b2e4, + 0x3b2f8, 0x3b338, + 0x3b340, 0x3b340, + 0x3b348, 0x3b350, + 0x3b35c, 0x3b364, + 0x3b370, 0x3b3b8, + 0x3b3c0, 0x3b3e4, + 0x3b3f8, 0x3b428, + 0x3b430, 0x3b448, + 0x3b460, 0x3b468, + 0x3b470, 0x3b49c, + 0x3b4f0, 0x3b528, + 0x3b530, 0x3b548, + 0x3b560, 0x3b568, + 0x3b570, 0x3b59c, + 0x3b5f0, 0x3b638, + 0x3b640, 0x3b640, + 0x3b648, 0x3b650, + 0x3b65c, 0x3b664, + 0x3b670, 0x3b6b8, + 0x3b6c0, 0x3b6e4, + 0x3b6f8, 0x3b738, + 0x3b740, 0x3b740, + 0x3b748, 0x3b750, + 0x3b75c, 0x3b764, + 0x3b770, 0x3b7b8, + 0x3b7c0, 0x3b7e4, + 0x3b7f8, 0x3b7fc, + 0x3b814, 0x3b814, + 0x3b82c, 0x3b82c, + 0x3b880, 0x3b88c, + 0x3b8e8, 0x3b8ec, + 0x3b900, 0x3b928, + 0x3b930, 0x3b948, + 0x3b960, 0x3b968, + 0x3b970, 0x3b99c, + 0x3b9f0, 0x3ba38, + 0x3ba40, 0x3ba40, + 0x3ba48, 0x3ba50, + 0x3ba5c, 0x3ba64, + 0x3ba70, 0x3bab8, + 0x3bac0, 0x3bae4, + 0x3baf8, 0x3bb10, + 0x3bb28, 0x3bb28, + 0x3bb3c, 0x3bb50, + 0x3bbf0, 0x3bc10, + 0x3bc28, 0x3bc28, + 0x3bc3c, 0x3bc50, + 0x3bcf0, 0x3bcfc, + 0x3c000, 0x3c030, + 0x3c100, 0x3c144, + 0x3c190, 0x3c1a0, + 0x3c1a8, 0x3c1b8, + 0x3c1c4, 0x3c1c8, + 0x3c1d0, 0x3c1d0, + 0x3c200, 0x3c318, + 0x3c400, 0x3c4b4, + 0x3c4c0, 0x3c52c, + 0x3c540, 0x3c61c, + 0x3c800, 0x3c828, + 0x3c834, 0x3c834, + 0x3c8c0, 0x3c908, + 0x3c910, 0x3c9ac, + 0x3ca00, 0x3ca14, + 0x3ca1c, 0x3ca2c, + 0x3ca44, 0x3ca50, + 0x3ca74, 0x3ca74, + 0x3ca7c, 0x3cafc, + 0x3cb08, 0x3cc24, + 0x3cd00, 0x3cd00, + 0x3cd08, 0x3cd14, + 0x3cd1c, 0x3cd20, + 0x3cd3c, 0x3cd3c, + 0x3cd48, 0x3cd50, + 0x3d200, 0x3d20c, + 0x3d220, 0x3d220, + 0x3d240, 0x3d240, + 0x3d600, 0x3d60c, + 0x3da00, 0x3da1c, + 0x3de00, 0x3de20, + 0x3de38, 0x3de3c, + 0x3de80, 0x3de80, + 0x3de88, 0x3dea8, + 0x3deb0, 0x3deb4, + 0x3dec8, 0x3ded4, + 0x3dfb8, 0x3e004, + 0x3e200, 0x3e200, + 0x3e208, 0x3e240, + 0x3e248, 0x3e280, + 0x3e288, 0x3e2c0, + 0x3e2c8, 0x3e2fc, + 0x3e600, 0x3e630, + 0x3ea00, 0x3eabc, + 0x3eb00, 0x3eb10, + 0x3eb20, 0x3eb30, + 0x3eb40, 0x3eb50, + 0x3eb60, 0x3eb70, + 0x3f000, 0x3f028, + 0x3f030, 0x3f048, + 0x3f060, 0x3f068, + 0x3f070, 0x3f09c, + 0x3f0f0, 0x3f128, + 0x3f130, 0x3f148, + 0x3f160, 0x3f168, + 0x3f170, 0x3f19c, + 0x3f1f0, 0x3f238, + 0x3f240, 0x3f240, + 0x3f248, 0x3f250, + 0x3f25c, 0x3f264, + 0x3f270, 0x3f2b8, + 0x3f2c0, 0x3f2e4, + 0x3f2f8, 0x3f338, + 0x3f340, 0x3f340, + 0x3f348, 0x3f350, + 0x3f35c, 0x3f364, + 0x3f370, 0x3f3b8, + 0x3f3c0, 0x3f3e4, + 0x3f3f8, 0x3f428, + 0x3f430, 0x3f448, + 0x3f460, 0x3f468, + 0x3f470, 0x3f49c, + 0x3f4f0, 0x3f528, + 0x3f530, 0x3f548, + 0x3f560, 0x3f568, + 0x3f570, 0x3f59c, + 0x3f5f0, 0x3f638, + 0x3f640, 0x3f640, + 0x3f648, 0x3f650, + 0x3f65c, 0x3f664, + 0x3f670, 0x3f6b8, + 0x3f6c0, 0x3f6e4, + 0x3f6f8, 0x3f738, + 0x3f740, 0x3f740, + 0x3f748, 0x3f750, + 0x3f75c, 0x3f764, + 0x3f770, 0x3f7b8, + 0x3f7c0, 0x3f7e4, + 0x3f7f8, 0x3f7fc, + 0x3f814, 0x3f814, + 0x3f82c, 0x3f82c, + 0x3f880, 0x3f88c, + 0x3f8e8, 0x3f8ec, + 0x3f900, 0x3f928, + 0x3f930, 0x3f948, + 0x3f960, 0x3f968, + 0x3f970, 0x3f99c, + 0x3f9f0, 0x3fa38, + 0x3fa40, 0x3fa40, + 0x3fa48, 0x3fa50, + 0x3fa5c, 0x3fa64, + 0x3fa70, 0x3fab8, + 0x3fac0, 0x3fae4, + 0x3faf8, 0x3fb10, + 0x3fb28, 0x3fb28, + 0x3fb3c, 
0x3fb50, + 0x3fbf0, 0x3fc10, + 0x3fc28, 0x3fc28, + 0x3fc3c, 0x3fc50, + 0x3fcf0, 0x3fcfc, + 0x40000, 0x4000c, + 0x40040, 0x40050, + 0x40060, 0x40068, + 0x4007c, 0x4008c, + 0x40094, 0x400b0, + 0x400c0, 0x40144, + 0x40180, 0x4018c, + 0x40200, 0x40254, + 0x40260, 0x40264, + 0x40270, 0x40288, + 0x40290, 0x40298, + 0x402ac, 0x402c8, + 0x402d0, 0x402e0, + 0x402f0, 0x402f0, + 0x40300, 0x4033c, + 0x403f8, 0x403fc, + 0x41304, 0x413c4, + 0x41400, 0x4140c, + 0x41414, 0x4141c, + 0x41480, 0x414d0, + 0x44000, 0x44054, + 0x4405c, 0x44078, + 0x440c0, 0x44174, + 0x44180, 0x441ac, + 0x441b4, 0x441b8, + 0x441c0, 0x44254, + 0x4425c, 0x44278, + 0x442c0, 0x44374, + 0x44380, 0x443ac, + 0x443b4, 0x443b8, + 0x443c0, 0x44454, + 0x4445c, 0x44478, + 0x444c0, 0x44574, + 0x44580, 0x445ac, + 0x445b4, 0x445b8, + 0x445c0, 0x44654, + 0x4465c, 0x44678, + 0x446c0, 0x44774, + 0x44780, 0x447ac, + 0x447b4, 0x447b8, + 0x447c0, 0x44854, + 0x4485c, 0x44878, + 0x448c0, 0x44974, + 0x44980, 0x449ac, + 0x449b4, 0x449b8, + 0x449c0, 0x449fc, + 0x45000, 0x45004, + 0x45010, 0x45030, + 0x45040, 0x45060, + 0x45068, 0x45068, + 0x45080, 0x45084, + 0x450a0, 0x450b0, + 0x45200, 0x45204, + 0x45210, 0x45230, + 0x45240, 0x45260, + 0x45268, 0x45268, + 0x45280, 0x45284, + 0x452a0, 0x452b0, + 0x460c0, 0x460e4, + 0x47000, 0x4703c, + 0x47044, 0x4708c, + 0x47200, 0x47250, + 0x47400, 0x47408, + 0x47414, 0x47420, + 0x47600, 0x47618, + 0x47800, 0x47814, + 0x48000, 0x4800c, + 0x48040, 0x48050, + 0x48060, 0x48068, + 0x4807c, 0x4808c, + 0x48094, 0x480b0, + 0x480c0, 0x48144, + 0x48180, 0x4818c, + 0x48200, 0x48254, + 0x48260, 0x48264, + 0x48270, 0x48288, + 0x48290, 0x48298, + 0x482ac, 0x482c8, + 0x482d0, 0x482e0, + 0x482f0, 0x482f0, + 0x48300, 0x4833c, + 0x483f8, 0x483fc, + 0x49304, 0x493c4, + 0x49400, 0x4940c, + 0x49414, 0x4941c, + 0x49480, 0x494d0, + 0x4c000, 0x4c054, + 0x4c05c, 0x4c078, + 0x4c0c0, 0x4c174, + 0x4c180, 0x4c1ac, + 0x4c1b4, 0x4c1b8, + 0x4c1c0, 0x4c254, + 0x4c25c, 0x4c278, + 0x4c2c0, 0x4c374, + 0x4c380, 0x4c3ac, + 0x4c3b4, 0x4c3b8, + 0x4c3c0, 0x4c454, + 0x4c45c, 0x4c478, + 0x4c4c0, 0x4c574, + 0x4c580, 0x4c5ac, + 0x4c5b4, 0x4c5b8, + 0x4c5c0, 0x4c654, + 0x4c65c, 0x4c678, + 0x4c6c0, 0x4c774, + 0x4c780, 0x4c7ac, + 0x4c7b4, 0x4c7b8, + 0x4c7c0, 0x4c854, + 0x4c85c, 0x4c878, + 0x4c8c0, 0x4c974, + 0x4c980, 0x4c9ac, + 0x4c9b4, 0x4c9b8, + 0x4c9c0, 0x4c9fc, + 0x4d000, 0x4d004, + 0x4d010, 0x4d030, + 0x4d040, 0x4d060, + 0x4d068, 0x4d068, + 0x4d080, 0x4d084, + 0x4d0a0, 0x4d0b0, + 0x4d200, 0x4d204, + 0x4d210, 0x4d230, + 0x4d240, 0x4d260, + 0x4d268, 0x4d268, + 0x4d280, 0x4d284, + 0x4d2a0, 0x4d2b0, + 0x4e0c0, 0x4e0e4, + 0x4f000, 0x4f03c, + 0x4f044, 0x4f08c, + 0x4f200, 0x4f250, + 0x4f400, 0x4f408, + 0x4f414, 0x4f420, + 0x4f600, 0x4f618, + 0x4f800, 0x4f814, + 0x50000, 0x50084, + 0x50090, 0x500cc, + 0x50400, 0x50400, + 0x50800, 0x50884, + 0x50890, 0x508cc, + 0x50c00, 0x50c00, + 0x51000, 0x5101c, + 0x51300, 0x51308, + }; + + static const unsigned int t6_reg_ranges[] = { + 0x1008, 0x101c, + 0x1024, 0x10a8, + 0x10b4, 0x10f8, + 0x1100, 0x1114, + 0x111c, 0x112c, + 0x1138, 0x113c, + 0x1144, 0x114c, + 0x1180, 0x1184, + 0x1190, 0x1194, + 0x11a0, 0x11a4, + 0x11b0, 0x11b4, + 0x11fc, 0x1274, + 0x1280, 0x133c, + 0x1800, 0x18fc, + 0x3000, 0x302c, + 0x3060, 0x30b0, + 0x30b8, 0x30d8, + 0x30e0, 0x30fc, + 0x3140, 0x357c, + 0x35a8, 0x35cc, + 0x35ec, 0x35ec, + 0x3600, 0x5624, + 0x56cc, 0x56ec, + 0x56f4, 0x5720, + 0x5728, 0x575c, + 0x580c, 0x5814, + 0x5890, 0x589c, + 0x58a4, 0x58ac, + 0x58b8, 0x58bc, + 0x5940, 0x595c, + 0x5980, 0x598c, + 0x59b0, 0x59c8, + 0x59d0, 0x59dc, + 0x59fc, 
0x5a18, + 0x5a60, 0x5a6c, + 0x5a80, 0x5a8c, + 0x5a94, 0x5a9c, + 0x5b94, 0x5bfc, + 0x5c10, 0x5e48, + 0x5e50, 0x5e94, + 0x5ea0, 0x5eb0, + 0x5ec0, 0x5ec0, + 0x5ec8, 0x5ed0, + 0x5ee0, 0x5ee0, + 0x5ef0, 0x5ef0, + 0x5f00, 0x5f00, + 0x6000, 0x6020, + 0x6028, 0x6040, + 0x6058, 0x609c, + 0x60a8, 0x619c, + 0x7700, 0x7798, + 0x77c0, 0x7880, + 0x78cc, 0x78fc, + 0x7b00, 0x7b58, + 0x7b60, 0x7b84, + 0x7b8c, 0x7c54, + 0x7d00, 0x7d38, + 0x7d40, 0x7d84, + 0x7d8c, 0x7ddc, + 0x7de4, 0x7e04, + 0x7e10, 0x7e1c, + 0x7e24, 0x7e38, + 0x7e40, 0x7e44, + 0x7e4c, 0x7e78, + 0x7e80, 0x7edc, + 0x7ee8, 0x7efc, + 0x8dc0, 0x8de4, + 0x8df8, 0x8e04, + 0x8e10, 0x8e84, + 0x8ea0, 0x8f88, + 0x8fb8, 0x9058, + 0x9060, 0x9060, + 0x9068, 0x90f8, + 0x9100, 0x9124, + 0x9400, 0x9470, + 0x9600, 0x9600, + 0x9608, 0x9638, + 0x9640, 0x9704, + 0x9710, 0x971c, + 0x9800, 0x9808, + 0x9820, 0x983c, + 0x9850, 0x9864, + 0x9c00, 0x9c6c, + 0x9c80, 0x9cec, + 0x9d00, 0x9d6c, + 0x9d80, 0x9dec, + 0x9e00, 0x9e6c, + 0x9e80, 0x9eec, + 0x9f00, 0x9f6c, + 0x9f80, 0xa020, + 0xd004, 0xd03c, + 0xd100, 0xd118, + 0xd200, 0xd214, + 0xd220, 0xd234, + 0xd240, 0xd254, + 0xd260, 0xd274, + 0xd280, 0xd294, + 0xd2a0, 0xd2b4, + 0xd2c0, 0xd2d4, + 0xd2e0, 0xd2f4, + 0xd300, 0xd31c, + 0xdfc0, 0xdfe0, + 0xe000, 0xf008, + 0xf010, 0xf018, + 0xf020, 0xf028, + 0x11000, 0x11014, + 0x11048, 0x1106c, + 0x11074, 0x11088, + 0x11098, 0x11120, + 0x1112c, 0x1117c, + 0x11190, 0x112e0, + 0x11300, 0x1130c, + 0x12000, 0x1206c, + 0x19040, 0x1906c, + 0x19078, 0x19080, + 0x1908c, 0x190e8, + 0x190f0, 0x190f8, + 0x19100, 0x19110, + 0x19120, 0x19124, + 0x19150, 0x19194, + 0x1919c, 0x191b0, + 0x191d0, 0x191e8, + 0x19238, 0x19290, + 0x192a4, 0x192b0, + 0x192bc, 0x192bc, + 0x19348, 0x1934c, + 0x193f8, 0x19418, + 0x19420, 0x19428, + 0x19430, 0x19444, + 0x1944c, 0x1946c, + 0x19474, 0x19474, + 0x19490, 0x194cc, + 0x194f0, 0x194f8, + 0x19c00, 0x19c48, + 0x19c50, 0x19c80, + 0x19c94, 0x19c98, + 0x19ca0, 0x19cbc, + 0x19ce4, 0x19ce4, + 0x19cf0, 0x19cf8, + 0x19d00, 0x19d28, + 0x19d50, 0x19d78, + 0x19d94, 0x19d98, + 0x19da0, 0x19dc8, + 0x19df0, 0x19e10, + 0x19e50, 0x19e6c, + 0x19ea0, 0x19ebc, + 0x19ec4, 0x19ef4, + 0x19f04, 0x19f2c, + 0x19f34, 0x19f34, + 0x19f40, 0x19f50, + 0x19f90, 0x19fac, + 0x19fc4, 0x19fc8, + 0x19fd0, 0x19fe4, + 0x1a000, 0x1a004, + 0x1a010, 0x1a06c, + 0x1a0b0, 0x1a0e4, + 0x1a0ec, 0x1a0f8, + 0x1a100, 0x1a108, + 0x1a114, 0x1a120, + 0x1a128, 0x1a130, + 0x1a138, 0x1a138, + 0x1a190, 0x1a1c4, + 0x1a1fc, 0x1a1fc, + 0x1e008, 0x1e00c, + 0x1e040, 0x1e044, + 0x1e04c, 0x1e04c, + 0x1e284, 0x1e290, + 0x1e2c0, 0x1e2c0, + 0x1e2e0, 0x1e2e0, + 0x1e300, 0x1e384, + 0x1e3c0, 0x1e3c8, + 0x1e408, 0x1e40c, + 0x1e440, 0x1e444, + 0x1e44c, 0x1e44c, + 0x1e684, 0x1e690, + 0x1e6c0, 0x1e6c0, + 0x1e6e0, 0x1e6e0, + 0x1e700, 0x1e784, + 0x1e7c0, 0x1e7c8, + 0x1e808, 0x1e80c, + 0x1e840, 0x1e844, + 0x1e84c, 0x1e84c, + 0x1ea84, 0x1ea90, + 0x1eac0, 0x1eac0, + 0x1eae0, 0x1eae0, + 0x1eb00, 0x1eb84, + 0x1ebc0, 0x1ebc8, + 0x1ec08, 0x1ec0c, + 0x1ec40, 0x1ec44, + 0x1ec4c, 0x1ec4c, + 0x1ee84, 0x1ee90, + 0x1eec0, 0x1eec0, + 0x1eee0, 0x1eee0, + 0x1ef00, 0x1ef84, + 0x1efc0, 0x1efc8, + 0x1f008, 0x1f00c, + 0x1f040, 0x1f044, + 0x1f04c, 0x1f04c, + 0x1f284, 0x1f290, + 0x1f2c0, 0x1f2c0, + 0x1f2e0, 0x1f2e0, + 0x1f300, 0x1f384, + 0x1f3c0, 0x1f3c8, + 0x1f408, 0x1f40c, + 0x1f440, 0x1f444, + 0x1f44c, 0x1f44c, + 0x1f684, 0x1f690, + 0x1f6c0, 0x1f6c0, + 0x1f6e0, 0x1f6e0, + 0x1f700, 0x1f784, + 0x1f7c0, 0x1f7c8, + 0x1f808, 0x1f80c, + 0x1f840, 0x1f844, + 0x1f84c, 0x1f84c, + 0x1fa84, 0x1fa90, + 0x1fac0, 0x1fac0, + 0x1fae0, 0x1fae0, + 0x1fb00, 0x1fb84, + 
0x1fbc0, 0x1fbc8, + 0x1fc08, 0x1fc0c, + 0x1fc40, 0x1fc44, + 0x1fc4c, 0x1fc4c, + 0x1fe84, 0x1fe90, + 0x1fec0, 0x1fec0, + 0x1fee0, 0x1fee0, + 0x1ff00, 0x1ff84, + 0x1ffc0, 0x1ffc8, + 0x30000, 0x30030, + 0x30100, 0x30168, + 0x30190, 0x301a0, + 0x301a8, 0x301b8, + 0x301c4, 0x301c8, + 0x301d0, 0x301d0, + 0x30200, 0x30320, + 0x30400, 0x304b4, + 0x304c0, 0x3052c, + 0x30540, 0x3061c, + 0x30800, 0x308a0, + 0x308c0, 0x30908, + 0x30910, 0x309b8, + 0x30a00, 0x30a04, + 0x30a0c, 0x30a14, + 0x30a1c, 0x30a2c, + 0x30a44, 0x30a50, + 0x30a74, 0x30a74, + 0x30a7c, 0x30afc, + 0x30b08, 0x30c24, + 0x30d00, 0x30d14, + 0x30d1c, 0x30d3c, + 0x30d44, 0x30d4c, + 0x30d54, 0x30d74, + 0x30d7c, 0x30d7c, + 0x30de0, 0x30de0, + 0x30e00, 0x30ed4, + 0x30f00, 0x30fa4, + 0x30fc0, 0x30fc4, + 0x31000, 0x31004, + 0x31080, 0x310fc, + 0x31208, 0x31220, + 0x3123c, 0x31254, + 0x31300, 0x31300, + 0x31308, 0x3131c, + 0x31338, 0x3133c, + 0x31380, 0x31380, + 0x31388, 0x313a8, + 0x313b4, 0x313b4, + 0x31400, 0x31420, + 0x31438, 0x3143c, + 0x31480, 0x31480, + 0x314a8, 0x314a8, + 0x314b0, 0x314b4, + 0x314c8, 0x314d4, + 0x31a40, 0x31a4c, + 0x31af0, 0x31b20, + 0x31b38, 0x31b3c, + 0x31b80, 0x31b80, + 0x31ba8, 0x31ba8, + 0x31bb0, 0x31bb4, + 0x31bc8, 0x31bd4, + 0x32140, 0x3218c, + 0x321f0, 0x321f4, + 0x32200, 0x32200, + 0x32218, 0x32218, + 0x32400, 0x32400, + 0x32408, 0x3241c, + 0x32618, 0x32620, + 0x32664, 0x32664, + 0x326a8, 0x326a8, + 0x326ec, 0x326ec, + 0x32a00, 0x32abc, + 0x32b00, 0x32b18, + 0x32b20, 0x32b38, + 0x32b40, 0x32b58, + 0x32b60, 0x32b78, + 0x32c00, 0x32c00, + 0x32c08, 0x32c3c, + 0x33000, 0x3302c, + 0x33034, 0x33050, + 0x33058, 0x33058, + 0x33060, 0x3308c, + 0x3309c, 0x330ac, + 0x330c0, 0x330c0, + 0x330c8, 0x330d0, + 0x330d8, 0x330e0, + 0x330ec, 0x3312c, + 0x33134, 0x33150, + 0x33158, 0x33158, + 0x33160, 0x3318c, + 0x3319c, 0x331ac, + 0x331c0, 0x331c0, + 0x331c8, 0x331d0, + 0x331d8, 0x331e0, + 0x331ec, 0x33290, + 0x33298, 0x332c4, + 0x332e4, 0x33390, + 0x33398, 0x333c4, + 0x333e4, 0x3342c, + 0x33434, 0x33450, + 0x33458, 0x33458, + 0x33460, 0x3348c, + 0x3349c, 0x334ac, + 0x334c0, 0x334c0, + 0x334c8, 0x334d0, + 0x334d8, 0x334e0, + 0x334ec, 0x3352c, + 0x33534, 0x33550, + 0x33558, 0x33558, + 0x33560, 0x3358c, + 0x3359c, 0x335ac, + 0x335c0, 0x335c0, + 0x335c8, 0x335d0, + 0x335d8, 0x335e0, + 0x335ec, 0x33690, + 0x33698, 0x336c4, + 0x336e4, 0x33790, + 0x33798, 0x337c4, + 0x337e4, 0x337fc, + 0x33814, 0x33814, + 0x33854, 0x33868, + 0x33880, 0x3388c, + 0x338c0, 0x338d0, + 0x338e8, 0x338ec, + 0x33900, 0x3392c, + 0x33934, 0x33950, + 0x33958, 0x33958, + 0x33960, 0x3398c, + 0x3399c, 0x339ac, + 0x339c0, 0x339c0, + 0x339c8, 0x339d0, + 0x339d8, 0x339e0, + 0x339ec, 0x33a90, + 0x33a98, 0x33ac4, + 0x33ae4, 0x33b10, + 0x33b24, 0x33b28, + 0x33b38, 0x33b50, + 0x33bf0, 0x33c10, + 0x33c24, 0x33c28, + 0x33c38, 0x33c50, + 0x33cf0, 0x33cfc, + 0x34000, 0x34030, + 0x34100, 0x34168, + 0x34190, 0x341a0, + 0x341a8, 0x341b8, + 0x341c4, 0x341c8, + 0x341d0, 0x341d0, + 0x34200, 0x34320, + 0x34400, 0x344b4, + 0x344c0, 0x3452c, + 0x34540, 0x3461c, + 0x34800, 0x348a0, + 0x348c0, 0x34908, + 0x34910, 0x349b8, + 0x34a00, 0x34a04, + 0x34a0c, 0x34a14, + 0x34a1c, 0x34a2c, + 0x34a44, 0x34a50, + 0x34a74, 0x34a74, + 0x34a7c, 0x34afc, + 0x34b08, 0x34c24, + 0x34d00, 0x34d14, + 0x34d1c, 0x34d3c, + 0x34d44, 0x34d4c, + 0x34d54, 0x34d74, + 0x34d7c, 0x34d7c, + 0x34de0, 0x34de0, + 0x34e00, 0x34ed4, + 0x34f00, 0x34fa4, + 0x34fc0, 0x34fc4, + 0x35000, 0x35004, + 0x35080, 0x350fc, + 0x35208, 0x35220, + 0x3523c, 0x35254, + 0x35300, 0x35300, + 0x35308, 0x3531c, + 0x35338, 0x3533c, + 0x35380, 
0x35380, + 0x35388, 0x353a8, + 0x353b4, 0x353b4, + 0x35400, 0x35420, + 0x35438, 0x3543c, + 0x35480, 0x35480, + 0x354a8, 0x354a8, + 0x354b0, 0x354b4, + 0x354c8, 0x354d4, + 0x35a40, 0x35a4c, + 0x35af0, 0x35b20, + 0x35b38, 0x35b3c, + 0x35b80, 0x35b80, + 0x35ba8, 0x35ba8, + 0x35bb0, 0x35bb4, + 0x35bc8, 0x35bd4, + 0x36140, 0x3618c, + 0x361f0, 0x361f4, + 0x36200, 0x36200, + 0x36218, 0x36218, + 0x36400, 0x36400, + 0x36408, 0x3641c, + 0x36618, 0x36620, + 0x36664, 0x36664, + 0x366a8, 0x366a8, + 0x366ec, 0x366ec, + 0x36a00, 0x36abc, + 0x36b00, 0x36b18, + 0x36b20, 0x36b38, + 0x36b40, 0x36b58, + 0x36b60, 0x36b78, + 0x36c00, 0x36c00, + 0x36c08, 0x36c3c, + 0x37000, 0x3702c, + 0x37034, 0x37050, + 0x37058, 0x37058, + 0x37060, 0x3708c, + 0x3709c, 0x370ac, + 0x370c0, 0x370c0, + 0x370c8, 0x370d0, + 0x370d8, 0x370e0, + 0x370ec, 0x3712c, + 0x37134, 0x37150, + 0x37158, 0x37158, + 0x37160, 0x3718c, + 0x3719c, 0x371ac, + 0x371c0, 0x371c0, + 0x371c8, 0x371d0, + 0x371d8, 0x371e0, + 0x371ec, 0x37290, + 0x37298, 0x372c4, + 0x372e4, 0x37390, + 0x37398, 0x373c4, + 0x373e4, 0x3742c, + 0x37434, 0x37450, + 0x37458, 0x37458, + 0x37460, 0x3748c, + 0x3749c, 0x374ac, + 0x374c0, 0x374c0, + 0x374c8, 0x374d0, + 0x374d8, 0x374e0, + 0x374ec, 0x3752c, + 0x37534, 0x37550, + 0x37558, 0x37558, + 0x37560, 0x3758c, + 0x3759c, 0x375ac, + 0x375c0, 0x375c0, + 0x375c8, 0x375d0, + 0x375d8, 0x375e0, + 0x375ec, 0x37690, + 0x37698, 0x376c4, + 0x376e4, 0x37790, + 0x37798, 0x377c4, + 0x377e4, 0x377fc, + 0x37814, 0x37814, + 0x37854, 0x37868, + 0x37880, 0x3788c, + 0x378c0, 0x378d0, + 0x378e8, 0x378ec, + 0x37900, 0x3792c, + 0x37934, 0x37950, + 0x37958, 0x37958, + 0x37960, 0x3798c, + 0x3799c, 0x379ac, + 0x379c0, 0x379c0, + 0x379c8, 0x379d0, + 0x379d8, 0x379e0, + 0x379ec, 0x37a90, + 0x37a98, 0x37ac4, + 0x37ae4, 0x37b10, + 0x37b24, 0x37b28, + 0x37b38, 0x37b50, + 0x37bf0, 0x37c10, + 0x37c24, 0x37c28, + 0x37c38, 0x37c50, + 0x37cf0, 0x37cfc, + 0x40040, 0x40040, + 0x40080, 0x40084, + 0x40100, 0x40100, + 0x40140, 0x401bc, + 0x40200, 0x40214, + 0x40228, 0x40228, + 0x40240, 0x40258, + 0x40280, 0x40280, + 0x40304, 0x40304, + 0x40330, 0x4033c, + 0x41304, 0x413c8, + 0x413d0, 0x413dc, + 0x413f0, 0x413f0, + 0x41400, 0x4140c, + 0x41414, 0x4141c, + 0x41480, 0x414d0, + 0x44000, 0x4407c, + 0x440c0, 0x441ac, + 0x441b4, 0x4427c, + 0x442c0, 0x443ac, + 0x443b4, 0x4447c, + 0x444c0, 0x445ac, + 0x445b4, 0x4467c, + 0x446c0, 0x447ac, + 0x447b4, 0x4487c, + 0x448c0, 0x449ac, + 0x449b4, 0x44a7c, + 0x44ac0, 0x44bac, + 0x44bb4, 0x44c7c, + 0x44cc0, 0x44dac, + 0x44db4, 0x44e7c, + 0x44ec0, 0x44fac, + 0x44fb4, 0x4507c, + 0x450c0, 0x451ac, + 0x451b4, 0x451fc, + 0x45800, 0x45804, + 0x45810, 0x45830, + 0x45840, 0x45860, + 0x45868, 0x45868, + 0x45880, 0x45884, + 0x458a0, 0x458b0, + 0x45a00, 0x45a04, + 0x45a10, 0x45a30, + 0x45a40, 0x45a60, + 0x45a68, 0x45a68, + 0x45a80, 0x45a84, + 0x45aa0, 0x45ab0, + 0x460c0, 0x460e4, + 0x47000, 0x4703c, + 0x47044, 0x4708c, + 0x47200, 0x47250, + 0x47400, 0x47408, + 0x47414, 0x47420, + 0x47600, 0x47618, + 0x47800, 0x47814, + 0x47820, 0x4782c, + 0x50000, 0x50084, + 0x50090, 0x500cc, + 0x50300, 0x50384, + 0x50400, 0x50400, + 0x50800, 0x50884, + 0x50890, 0x508cc, + 0x50b00, 0x50b84, + 0x50c00, 0x50c00, + 0x51000, 0x51020, + 0x51028, 0x510b0, + 0x51300, 0x51324, + }; + + u32 *buf_end = (u32 *)((char *)buf + buf_size); + const unsigned int *reg_ranges; + int reg_ranges_size, range; + unsigned int chip_version = CHELSIO_CHIP_VERSION(adap->params.chip); + + /* Select the right set of register ranges to dump depending on the + * adapter chip type. 
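The register tables above are flat arrays of {first, last} address pairs, and the dump loop that follows copies each 32-bit register into the caller's buffer at a byte offset equal to the register address, never writing past the end of the buffer. As a rough, self-contained illustration of that indexing convention (read_reg_stub() below is a stand-in for t4_read_reg(); none of this is driver code):

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        /* Stand-in for t4_read_reg(): just echoes the address back. */
        static uint32_t read_reg_stub(unsigned int addr)
        {
                return addr;
        }

        int main(void)
        {
                /* {first, last} pairs, same layout as the t4/t5/t6 tables above. */
                static const unsigned int ranges[] = { 0x1008, 0x1010, 0x1180, 0x1184 };
                uint32_t buf[0x1200 / 4];
                uint32_t *buf_end = (uint32_t *)((char *)buf + sizeof(buf));
                unsigned int r;

                memset(buf, 0, sizeof(buf));
                for (r = 0; r < sizeof(ranges) / sizeof(ranges[0]); r += 2) {
                        unsigned int reg = ranges[r];
                        unsigned int last = ranges[r + 1];
                        uint32_t *p = (uint32_t *)((char *)buf + reg);

                        /* One 32-bit word per register; never write past the buffer. */
                        while (reg <= last && p < buf_end) {
                                *p++ = read_reg_stub(reg);
                                reg += sizeof(uint32_t);
                        }
                }
                printf("value dumped at offset 0x1180: %#x\n",
                       *(uint32_t *)((char *)buf + 0x1180));
                return 0;
        }

The payoff of this layout is that a register's value can be looked up in the dump by its documented address alone, at the cost of a sparse buffer.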
+ */ + switch (chip_version) { + case CHELSIO_T4: + reg_ranges = t4_reg_ranges; + reg_ranges_size = ARRAY_SIZE(t4_reg_ranges); + break; + + case CHELSIO_T5: + reg_ranges = t5_reg_ranges; + reg_ranges_size = ARRAY_SIZE(t5_reg_ranges); + break; + + case CHELSIO_T6: + reg_ranges = t6_reg_ranges; + reg_ranges_size = ARRAY_SIZE(t6_reg_ranges); + break; + + default: + dev_err(adap->pdev_dev, + "Unsupported chip version %d\n", chip_version); + return; + } + + /* Clear the register buffer and insert the appropriate register + * values selected by the above register ranges. + */ + memset(buf, 0, buf_size); + for (range = 0; range < reg_ranges_size; range += 2) { + unsigned int reg = reg_ranges[range]; + unsigned int last_reg = reg_ranges[range + 1]; + u32 *bufp = (u32 *)((char *)buf + reg); + + /* Iterate across the register range filling in the register + * buffer but don't write past the end of the register buffer. + */ + while (reg <= last_reg && bufp < buf_end) { + *bufp++ = t4_read_reg(adap, reg); + reg += sizeof(u32); + } + } +} + +#define EEPROM_STAT_ADDR 0x7bfc +#define VPD_BASE 0x400 +#define VPD_BASE_OLD 0 +#define VPD_LEN 1024 +#define CHELSIO_VPD_UNIQUE_ID 0x82 + +/** + * t4_eeprom_ptov - translate a physical EEPROM address to virtual + * @phys_addr: the physical EEPROM address + * @fn: the PCI function number + * @sz: size of function-specific area + * + * Translate a physical EEPROM address to virtual. The first 1K is + * accessed through virtual addresses starting at 31K, the rest is + * accessed through virtual addresses starting at 0. + * + * The mapping is as follows: + * [0..1K) -> [31K..32K) + * [1K..1K+A) -> [31K-A..31K) + * [1K+A..ES) -> [0..ES-A-1K) + * + * where A = @fn * @sz, and ES = EEPROM size. + */ +int t4_eeprom_ptov(unsigned int phys_addr, unsigned int fn, unsigned int sz) +{ + fn *= sz; + if (phys_addr < 1024) + return phys_addr + (31 << 10); + if (phys_addr < 1024 + fn) + return 31744 - fn + phys_addr - 1024; + if (phys_addr < EEPROMSIZE) + return phys_addr - 1024 - fn; + return -EINVAL; +} + +/** + * t4_seeprom_wp - enable/disable EEPROM write protection + * @adapter: the adapter + * @enable: whether to enable or disable write protection + * + * Enables or disables write protection on the serial EEPROM. + */ +int t4_seeprom_wp(struct adapter *adapter, bool enable) +{ + unsigned int v = enable ? 0xc : 0; + int ret = pci_write_vpd(adapter->pdev, EEPROM_STAT_ADDR, 4, &v); + return ret < 0 ? ret : 0; +} + +/** + * t4_get_raw_vpd_params - read VPD parameters from VPD EEPROM + * @adapter: adapter to read + * @p: where to store the parameters + * + * Reads card parameters stored in VPD EEPROM. + */ +int t4_get_raw_vpd_params(struct adapter *adapter, struct vpd_params *p) +{ + int i, ret = 0, addr; + int ec, sn, pn, na; + u8 *vpd, csum; + unsigned int vpdr_len, kw_offset, id_len; + + vpd = vmalloc(VPD_LEN); + if (!vpd) + return -ENOMEM; + + /* Card information normally starts at VPD_BASE but early cards had + * it at 0. + */ + ret = pci_read_vpd(adapter->pdev, VPD_BASE, sizeof(u32), vpd); + if (ret < 0) + goto out; + + /* The VPD shall have a unique identifier specified by the PCI SIG. + * For chelsio adapters, the identifier is 0x82. The first byte of a VPD + * shall be CHELSIO_VPD_UNIQUE_ID (0x82). The VPD programming software + * is expected to automatically put this entry at the + * beginning of the VPD. + */ + addr = *vpd == CHELSIO_VPD_UNIQUE_ID ? 
VPD_BASE : VPD_BASE_OLD; + + ret = pci_read_vpd(adapter->pdev, addr, VPD_LEN, vpd); + if (ret < 0) + goto out; + + if (vpd[0] != PCI_VPD_LRDT_ID_STRING) { + dev_err(adapter->pdev_dev, "missing VPD ID string\n"); + ret = -EINVAL; + goto out; + } + + id_len = pci_vpd_lrdt_size(vpd); + if (id_len > ID_LEN) + id_len = ID_LEN; + + i = pci_vpd_find_tag(vpd, 0, VPD_LEN, PCI_VPD_LRDT_RO_DATA); + if (i < 0) { + dev_err(adapter->pdev_dev, "missing VPD-R section\n"); + ret = -EINVAL; + goto out; + } + + vpdr_len = pci_vpd_lrdt_size(&vpd[i]); + kw_offset = i + PCI_VPD_LRDT_TAG_SIZE; + if (vpdr_len + kw_offset > VPD_LEN) { + dev_err(adapter->pdev_dev, "bad VPD-R length %u\n", vpdr_len); + ret = -EINVAL; + goto out; + } + +#define FIND_VPD_KW(var, name) do { \ + var = pci_vpd_find_info_keyword(vpd, kw_offset, vpdr_len, name); \ + if (var < 0) { \ + dev_err(adapter->pdev_dev, "missing VPD keyword " name "\n"); \ + ret = -EINVAL; \ + goto out; \ + } \ + var += PCI_VPD_INFO_FLD_HDR_SIZE; \ +} while (0) + + FIND_VPD_KW(i, "RV"); + for (csum = 0; i >= 0; i--) + csum += vpd[i]; + + if (csum) { + dev_err(adapter->pdev_dev, + "corrupted VPD EEPROM, actual csum %u\n", csum); + ret = -EINVAL; + goto out; + } + + FIND_VPD_KW(ec, "EC"); + FIND_VPD_KW(sn, "SN"); + FIND_VPD_KW(pn, "PN"); + FIND_VPD_KW(na, "NA"); +#undef FIND_VPD_KW + + memcpy(p->id, vpd + PCI_VPD_LRDT_TAG_SIZE, id_len); + strim(p->id); + memcpy(p->ec, vpd + ec, EC_LEN); + strim(p->ec); + i = pci_vpd_info_field_size(vpd + sn - PCI_VPD_INFO_FLD_HDR_SIZE); + memcpy(p->sn, vpd + sn, min(i, SERNUM_LEN)); + strim(p->sn); + i = pci_vpd_info_field_size(vpd + pn - PCI_VPD_INFO_FLD_HDR_SIZE); + memcpy(p->pn, vpd + pn, min(i, PN_LEN)); + strim(p->pn); + memcpy(p->na, vpd + na, min(i, MACADDR_LEN)); + strim((char *)p->na); + +out: + vfree(vpd); + return ret < 0 ? ret : 0; +} + +/** + * t4_get_vpd_params - read VPD parameters & retrieve Core Clock + * @adapter: adapter to read + * @p: where to store the parameters + * + * Reads card parameters stored in VPD EEPROM and retrieves the Core + * Clock. This can only be called after a connection to the firmware + * is established. + */ +int t4_get_vpd_params(struct adapter *adapter, struct vpd_params *p) +{ + u32 cclk_param, cclk_val; + int ret; + + /* Grab the raw VPD parameters. + */ + ret = t4_get_raw_vpd_params(adapter, p); + if (ret) + return ret; + + /* Ask firmware for the Core Clock since it knows how to translate the + * Reference Clock ('V2') VPD field into a Core Clock value ... + */ + cclk_param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_CCLK)); + ret = t4_query_params(adapter, adapter->mbox, adapter->pf, 0, + 1, &cclk_param, &cclk_val); + + if (ret) + return ret; + p->cclk = cclk_val; + + return 0; +} + +/* serial flash and firmware constants */ +enum { + SF_ATTEMPTS = 10, /* max retries for SF operations */ + + /* flash command opcodes */ + SF_PROG_PAGE = 2, /* program page */ + SF_WR_DISABLE = 4, /* disable writes */ + SF_RD_STATUS = 5, /* read status register */ + SF_WR_ENABLE = 6, /* enable writes */ + SF_RD_DATA_FAST = 0xb, /* read flash */ + SF_RD_ID = 0x9f, /* read ID */ + SF_ERASE_SECTOR = 0xd8, /* erase sector */ +}; + +/** + * sf1_read - read data from the serial flash + * @adapter: the adapter + * @byte_cnt: number of bytes to read + * @cont: whether another operation will be chained + * @lock: whether to lock SF for PL access only + * @valp: where to store the read data + * + * Reads up to 4 bytes of data from the serial flash. 
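The "RV" keyword handled above is the standard PCI VPD checksum: its data byte is chosen so that the byte-wise sum of the VPD image from offset 0 through the RV byte is zero mod 256, which is why the code can simply accumulate vpd[0..i] and insist on a zero result. A small self-contained sketch of the same arithmetic over a toy buffer (not a real VPD image):

        #include <stdint.h>
        #include <stdio.h>
        #include <stddef.h>

        /* Compute the value the RV data byte must hold for the sum to be zero. */
        static uint8_t vpd_rv_byte(const uint8_t *vpd, size_t rv_data_off)
        {
                uint8_t sum = 0;
                size_t k;

                for (k = 0; k < rv_data_off; k++)
                        sum += vpd[k];
                return (uint8_t)(0x100 - sum);
        }

        int main(void)
        {
                /* Toy buffer standing in for a VPD image; offsets are made up. */
                uint8_t toy[8] = { 0x82, 0x02, 0x00, 'I', 'D', 0x11, 0x22, 0x00 };
                size_t rv_off = 7;      /* pretend the RV data byte lives here */
                uint8_t sum = 0;
                size_t k;

                toy[rv_off] = vpd_rv_byte(toy, rv_off);
                for (k = 0; k <= rv_off; k++)   /* same check as the driver loop */
                        sum += toy[k];
                printf("checksum over VPD = %u (0 means intact)\n", sum);
                return 0;
        }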
The location of + * the read needs to be specified prior to calling this by issuing the + * appropriate commands to the serial flash. + */ +static int sf1_read(struct adapter *adapter, unsigned int byte_cnt, int cont, + int lock, u32 *valp) +{ + int ret; + + if (!byte_cnt || byte_cnt > 4) + return -EINVAL; + if (t4_read_reg(adapter, SF_OP_A) & SF_BUSY_F) + return -EBUSY; + t4_write_reg(adapter, SF_OP_A, SF_LOCK_V(lock) | + SF_CONT_V(cont) | BYTECNT_V(byte_cnt - 1)); + ret = t4_wait_op_done(adapter, SF_OP_A, SF_BUSY_F, 0, SF_ATTEMPTS, 5); + if (!ret) + *valp = t4_read_reg(adapter, SF_DATA_A); + return ret; +} + +/** + * sf1_write - write data to the serial flash + * @adapter: the adapter + * @byte_cnt: number of bytes to write + * @cont: whether another operation will be chained + * @lock: whether to lock SF for PL access only + * @val: value to write + * + * Writes up to 4 bytes of data to the serial flash. The location of + * the write needs to be specified prior to calling this by issuing the + * appropriate commands to the serial flash. + */ +static int sf1_write(struct adapter *adapter, unsigned int byte_cnt, int cont, + int lock, u32 val) +{ + if (!byte_cnt || byte_cnt > 4) + return -EINVAL; + if (t4_read_reg(adapter, SF_OP_A) & SF_BUSY_F) + return -EBUSY; + t4_write_reg(adapter, SF_DATA_A, val); + t4_write_reg(adapter, SF_OP_A, SF_LOCK_V(lock) | + SF_CONT_V(cont) | BYTECNT_V(byte_cnt - 1) | OP_V(1)); + return t4_wait_op_done(adapter, SF_OP_A, SF_BUSY_F, 0, SF_ATTEMPTS, 5); +} + +/** + * flash_wait_op - wait for a flash operation to complete + * @adapter: the adapter + * @attempts: max number of polls of the status register + * @delay: delay between polls in ms + * + * Wait for a flash operation to complete by polling the status register. + */ +static int flash_wait_op(struct adapter *adapter, int attempts, int delay) +{ + int ret; + u32 status; + + while (1) { + if ((ret = sf1_write(adapter, 1, 1, 1, SF_RD_STATUS)) != 0 || + (ret = sf1_read(adapter, 1, 0, 1, &status)) != 0) + return ret; + if (!(status & 1)) + return 0; + if (--attempts == 0) + return -EAGAIN; + if (delay) + msleep(delay); + } +} + +/** + * t4_read_flash - read words from serial flash + * @adapter: the adapter + * @addr: the start address for the read + * @nwords: how many 32-bit words to read + * @data: where to store the read data + * @byte_oriented: whether to store data as bytes or as words + * + * Read the specified number of 32-bit words from the serial flash. + * If @byte_oriented is set the read data is stored as a byte array + * (i.e., big-endian), otherwise as 32-bit words in the platform's + * natural endianness. 
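One detail of the @byte_oriented flag described here is worth spelling out: when it is set, each word read is passed through cpu_to_be32(), so the destination buffer ends up holding the flash contents as a byte image, which is what lets the page-write verification further down memcmp() the result against the original bytes, and lets big-endian headers such as struct fw_hdr be overlaid on it. A tiny host-side illustration of the two views, with htonl() standing in for cpu_to_be32() and the sample word purely hypothetical:

        #include <stdint.h>
        #include <stdio.h>
        #include <arpa/inet.h>  /* htonl() stands in for cpu_to_be32() here */

        int main(void)
        {
                /* Suppose sf1_read() hands back 0x01020304 in host order. */
                uint32_t word = 0x01020304;
                uint32_t byte_view = htonl(word);       /* byte_oriented == 1 */
                const uint8_t *p = (const uint8_t *)&byte_view;

                printf("byte_oriented=0: 32-bit value %#010x\n", word);
                printf("byte_oriented=1: bytes %02x %02x %02x %02x\n",
                       p[0], p[1], p[2], p[3]);
                return 0;
        }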
+ */ +int t4_read_flash(struct adapter *adapter, unsigned int addr, + unsigned int nwords, u32 *data, int byte_oriented) +{ + int ret; + + if (addr + nwords * sizeof(u32) > adapter->params.sf_size || (addr & 3)) + return -EINVAL; + + addr = swab32(addr) | SF_RD_DATA_FAST; + + if ((ret = sf1_write(adapter, 4, 1, 0, addr)) != 0 || + (ret = sf1_read(adapter, 1, 1, 0, data)) != 0) + return ret; + + for ( ; nwords; nwords--, data++) { + ret = sf1_read(adapter, 4, nwords > 1, nwords == 1, data); + if (nwords == 1) + t4_write_reg(adapter, SF_OP_A, 0); /* unlock SF */ + if (ret) + return ret; + if (byte_oriented) + *data = (__force __u32)(cpu_to_be32(*data)); + } + return 0; +} + +/** + * t4_write_flash - write up to a page of data to the serial flash + * @adapter: the adapter + * @addr: the start address to write + * @n: length of data to write in bytes + * @data: the data to write + * + * Writes up to a page of data (256 bytes) to the serial flash starting + * at the given address. All the data must be written to the same page. + */ +static int t4_write_flash(struct adapter *adapter, unsigned int addr, + unsigned int n, const u8 *data) +{ + int ret; + u32 buf[64]; + unsigned int i, c, left, val, offset = addr & 0xff; + + if (addr >= adapter->params.sf_size || offset + n > SF_PAGE_SIZE) + return -EINVAL; + + val = swab32(addr) | SF_PROG_PAGE; + + if ((ret = sf1_write(adapter, 1, 0, 1, SF_WR_ENABLE)) != 0 || + (ret = sf1_write(adapter, 4, 1, 1, val)) != 0) + goto unlock; + + for (left = n; left; left -= c) { + c = min(left, 4U); + for (val = 0, i = 0; i < c; ++i) + val = (val << 8) + *data++; + + ret = sf1_write(adapter, c, c != left, 1, val); + if (ret) + goto unlock; + } + ret = flash_wait_op(adapter, 8, 1); + if (ret) + goto unlock; + + t4_write_reg(adapter, SF_OP_A, 0); /* unlock SF */ + + /* Read the page to verify the write succeeded */ + ret = t4_read_flash(adapter, addr & ~0xff, ARRAY_SIZE(buf), buf, 1); + if (ret) + return ret; + + if (memcmp(data - n, (u8 *)buf + offset, n)) { + dev_err(adapter->pdev_dev, + "failed to correctly write the flash page at %#x\n", + addr); + return -EIO; + } + return 0; + +unlock: + t4_write_reg(adapter, SF_OP_A, 0); /* unlock SF */ + return ret; +} + +/** + * t4_get_fw_version - read the firmware version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the FW version from flash. + */ +int t4_get_fw_version(struct adapter *adapter, u32 *vers) +{ + return t4_read_flash(adapter, FLASH_FW_START + + offsetof(struct fw_hdr, fw_ver), 1, + vers, 0); +} + +/** + * t4_get_bs_version - read the firmware bootstrap version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the FW Bootstrap version from flash. + */ +int t4_get_bs_version(struct adapter *adapter, u32 *vers) +{ + return t4_read_flash(adapter, FLASH_FWBOOTSTRAP_START + + offsetof(struct fw_hdr, fw_ver), 1, + vers, 0); +} + +/** + * t4_get_tp_version - read the TP microcode version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the TP microcode version from flash. + */ +int t4_get_tp_version(struct adapter *adapter, u32 *vers) +{ + return t4_read_flash(adapter, FLASH_FW_START + + offsetof(struct fw_hdr, tp_microcode_ver), + 1, vers, 0); +} + +/** + * t4_get_exprom_version - return the Expansion ROM version (if any) + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the Expansion ROM header from FLASH and returns the version + * number (if present) through the @vers return value pointer. 
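t4_write_flash() above only accepts data that fits inside a single 256-byte flash page, so anything larger has to be fed to it one page-bounded chunk at a time (t4_load_fw() further down does this with whole SF_PAGE_SIZE writes). A hedged, stand-alone sketch of the general chunking arithmetic, with write_page_stub() standing in for the real page-write routine:

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        #define SF_PAGE_SIZE 256        /* serial flash page size, as in the driver */

        /* Stand-in for t4_write_flash(): here it only reports what it was asked. */
        static int write_page_stub(unsigned int addr, unsigned int n,
                                   const uint8_t *data)
        {
                (void)data;
                printf("write %3u bytes at %#07x (page %#07x)\n",
                       n, addr, addr & ~(SF_PAGE_SIZE - 1));
                return 0;
        }

        /* Split an arbitrary (addr, len) write into single-page pieces. */
        static int write_span(unsigned int addr, unsigned int len,
                              const uint8_t *data)
        {
                while (len) {
                        unsigned int room = SF_PAGE_SIZE - (addr & (SF_PAGE_SIZE - 1));
                        unsigned int n = len < room ? len : room;
                        int ret = write_page_stub(addr, n, data);

                        if (ret)
                                return ret;
                        addr += n;
                        data += n;
                        len -= n;
                }
                return 0;
        }

        int main(void)
        {
                static uint8_t img[700];

                memset(img, 0xa5, sizeof(img));
                /* A 700-byte write starting 100 bytes into a page spans four pages. */
                return write_span(0x10064, sizeof(img), img);
        }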
We return + * this in the Firmware Version Format since it's convenient. Return + * 0 on success, -ENOENT if no Expansion ROM is present. + */ +int t4_get_exprom_version(struct adapter *adap, u32 *vers) +{ + struct exprom_header { + unsigned char hdr_arr[16]; /* must start with 0x55aa */ + unsigned char hdr_ver[4]; /* Expansion ROM version */ + } *hdr; + u32 exprom_header_buf[DIV_ROUND_UP(sizeof(struct exprom_header), + sizeof(u32))]; + int ret; + + ret = t4_read_flash(adap, FLASH_EXP_ROM_START, + ARRAY_SIZE(exprom_header_buf), exprom_header_buf, + 0); + if (ret) + return ret; + + hdr = (struct exprom_header *)exprom_header_buf; + if (hdr->hdr_arr[0] != 0x55 || hdr->hdr_arr[1] != 0xaa) + return -ENOENT; + + *vers = (FW_HDR_FW_VER_MAJOR_V(hdr->hdr_ver[0]) | + FW_HDR_FW_VER_MINOR_V(hdr->hdr_ver[1]) | + FW_HDR_FW_VER_MICRO_V(hdr->hdr_ver[2]) | + FW_HDR_FW_VER_BUILD_V(hdr->hdr_ver[3])); + return 0; +} + +/** + * t4_get_vpd_version - return the VPD version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the VPD via the Firmware interface (thus this can only be called + * once we're ready to issue Firmware commands). The format of the + * VPD version is adapter specific. Returns 0 on success, an error on + * failure. + * + * Note that early versions of the Firmware didn't include the ability + * to retrieve the VPD version, so we zero-out the return-value parameter + * in that case to avoid leaving it with garbage in it. + * + * Also note that the Firmware will return its cached copy of the VPD + * Revision ID, not the actual Revision ID as written in the Serial + * EEPROM. This is only an issue if a new VPD has been written and the + * Firmware/Chip haven't yet gone through a RESET sequence. So it's best + * to defer calling this routine till after a FW_RESET_CMD has been issued + * if the Host Driver will be performing a full adapter initialization. + */ +int t4_get_vpd_version(struct adapter *adapter, u32 *vers) +{ + u32 vpdrev_param; + int ret; + + vpdrev_param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_VPDREV)); + ret = t4_query_params(adapter, adapter->mbox, adapter->pf, 0, + 1, &vpdrev_param, vers); + if (ret) + *vers = 0; + return ret; +} + +/** + * t4_get_scfg_version - return the Serial Configuration version + * @adapter: the adapter + * @vers: where to place the version + * + * Reads the Serial Configuration Version via the Firmware interface + * (thus this can only be called once we're ready to issue Firmware + * commands). The format of the Serial Configuration version is + * adapter specific. Returns 0 on success, an error on failure. + * + * Note that early versions of the Firmware didn't include the ability + * to retrieve the Serial Configuration version, so we zero-out the + * return-value parameter in that case to avoid leaving it with + * garbage in it. + * + * Also note that the Firmware will return its cached copy of the Serial + * Initialization Revision ID, not the actual Revision ID as written in + * the Serial EEPROM. This is only an issue if a new VPD has been written + * and the Firmware/Chip haven't yet gone through a RESET sequence. So + * it's best to defer calling this routine till after a FW_RESET_CMD has + * been issued if the Host Driver will be performing a full adapter + * initialization. 
+ */ +int t4_get_scfg_version(struct adapter *adapter, u32 *vers) +{ + u32 scfgrev_param; + int ret; + + scfgrev_param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_SCFGREV)); + ret = t4_query_params(adapter, adapter->mbox, adapter->pf, 0, + 1, &scfgrev_param, vers); + if (ret) + *vers = 0; + return ret; +} + +/** + * t4_get_version_info - extract various chip/firmware version information + * @adapter: the adapter + * + * Reads various chip/firmware version numbers and stores them into the + * adapter Adapter Parameters structure. If any of the efforts fails + * the first failure will be returned, but all of the version numbers + * will be read. + */ +int t4_get_version_info(struct adapter *adapter) +{ + int ret = 0; + + #define FIRST_RET(__getvinfo) \ + do { \ + int __ret = __getvinfo; \ + if (__ret && !ret) \ + ret = __ret; \ + } while (0) + + FIRST_RET(t4_get_fw_version(adapter, &adapter->params.fw_vers)); + FIRST_RET(t4_get_bs_version(adapter, &adapter->params.bs_vers)); + FIRST_RET(t4_get_tp_version(adapter, &adapter->params.tp_vers)); + FIRST_RET(t4_get_exprom_version(adapter, &adapter->params.er_vers)); + FIRST_RET(t4_get_scfg_version(adapter, &adapter->params.scfg_vers)); + FIRST_RET(t4_get_vpd_version(adapter, &adapter->params.vpd_vers)); + + #undef FIRST_RET + return ret; +} + +/** + * t4_dump_version_info - dump all of the adapter configuration IDs + * @adapter: the adapter + * + * Dumps all of the various bits of adapter configuration version/revision + * IDs information. This is typically called at some point after + * t4_get_version_info() has been called. + */ +void t4_dump_version_info(struct adapter *adapter) +{ + /* Device information */ + dev_info(adapter->pdev_dev, "Chelsio %s rev %d\n", + adapter->params.vpd.id, + CHELSIO_CHIP_RELEASE(adapter->params.chip)); + dev_info(adapter->pdev_dev, "S/N: %s, P/N: %s\n", + adapter->params.vpd.sn, adapter->params.vpd.pn); + + /* Firmware Version */ + if (!adapter->params.fw_vers) + dev_warn(adapter->pdev_dev, "No firmware loaded\n"); + else + dev_info(adapter->pdev_dev, "Firmware version: %u.%u.%u.%u\n", + FW_HDR_FW_VER_MAJOR_G(adapter->params.fw_vers), + FW_HDR_FW_VER_MINOR_G(adapter->params.fw_vers), + FW_HDR_FW_VER_MICRO_G(adapter->params.fw_vers), + FW_HDR_FW_VER_BUILD_G(adapter->params.fw_vers)); + + /* Bootstrap Firmware Version. (Some adapters don't have Bootstrap + * Firmware, so dev_info() is more appropriate here.) 
+ */ + if (!adapter->params.bs_vers) + dev_info(adapter->pdev_dev, "No bootstrap loaded\n"); + else + dev_info(adapter->pdev_dev, "Bootstrap version: %u.%u.%u.%u\n", + FW_HDR_FW_VER_MAJOR_G(adapter->params.bs_vers), + FW_HDR_FW_VER_MINOR_G(adapter->params.bs_vers), + FW_HDR_FW_VER_MICRO_G(adapter->params.bs_vers), + FW_HDR_FW_VER_BUILD_G(adapter->params.bs_vers)); + + /* TP Microcode Version */ + if (!adapter->params.tp_vers) + dev_warn(adapter->pdev_dev, "No TP Microcode loaded\n"); + else + dev_info(adapter->pdev_dev, + "TP Microcode version: %u.%u.%u.%u\n", + FW_HDR_FW_VER_MAJOR_G(adapter->params.tp_vers), + FW_HDR_FW_VER_MINOR_G(adapter->params.tp_vers), + FW_HDR_FW_VER_MICRO_G(adapter->params.tp_vers), + FW_HDR_FW_VER_BUILD_G(adapter->params.tp_vers)); + + /* Expansion ROM version */ + if (!adapter->params.er_vers) + dev_info(adapter->pdev_dev, "No Expansion ROM loaded\n"); + else + dev_info(adapter->pdev_dev, + "Expansion ROM version: %u.%u.%u.%u\n", + FW_HDR_FW_VER_MAJOR_G(adapter->params.er_vers), + FW_HDR_FW_VER_MINOR_G(adapter->params.er_vers), + FW_HDR_FW_VER_MICRO_G(adapter->params.er_vers), + FW_HDR_FW_VER_BUILD_G(adapter->params.er_vers)); + + /* Serial Configuration version */ + dev_info(adapter->pdev_dev, "Serial Configuration version: %#x\n", + adapter->params.scfg_vers); + + /* VPD Version */ + dev_info(adapter->pdev_dev, "VPD version: %#x\n", + adapter->params.vpd_vers); +} + +/** + * t4_check_fw_version - check if the FW is supported with this driver + * @adap: the adapter + * + * Checks if an adapter's FW is compatible with the driver. Returns 0 + * if there's exact match, a negative error if the version could not be + * read or there's a major version mismatch + */ +int t4_check_fw_version(struct adapter *adap) +{ + int i, ret, major, minor, micro; + int exp_major, exp_minor, exp_micro; + unsigned int chip_version = CHELSIO_CHIP_VERSION(adap->params.chip); + + ret = t4_get_fw_version(adap, &adap->params.fw_vers); + /* Try multiple times before returning error */ + for (i = 0; (ret == -EBUSY || ret == -EAGAIN) && i < 3; i++) + ret = t4_get_fw_version(adap, &adap->params.fw_vers); + + if (ret) + return ret; + + major = FW_HDR_FW_VER_MAJOR_G(adap->params.fw_vers); + minor = FW_HDR_FW_VER_MINOR_G(adap->params.fw_vers); + micro = FW_HDR_FW_VER_MICRO_G(adap->params.fw_vers); + + switch (chip_version) { + case CHELSIO_T4: + exp_major = T4FW_MIN_VERSION_MAJOR; + exp_minor = T4FW_MIN_VERSION_MINOR; + exp_micro = T4FW_MIN_VERSION_MICRO; + break; + case CHELSIO_T5: + exp_major = T5FW_MIN_VERSION_MAJOR; + exp_minor = T5FW_MIN_VERSION_MINOR; + exp_micro = T5FW_MIN_VERSION_MICRO; + break; + case CHELSIO_T6: + exp_major = T6FW_MIN_VERSION_MAJOR; + exp_minor = T6FW_MIN_VERSION_MINOR; + exp_micro = T6FW_MIN_VERSION_MICRO; + break; + default: + dev_err(adap->pdev_dev, "Unsupported chip type, %x\n", + adap->chip); + return -EINVAL; + } + + if (major < exp_major || (major == exp_major && minor < exp_minor) || + (major == exp_major && minor == exp_minor && micro < exp_micro)) { + dev_err(adap->pdev_dev, + "Card has firmware version %u.%u.%u, minimum " + "supported firmware is %u.%u.%u.\n", major, minor, + micro, exp_major, exp_minor, exp_micro); + return -EFAULT; + } + return 0; +} + +/* Is the given firmware API compatible with the one the driver was compiled + * with? 
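The minimum-version test in t4_check_fw_version() a little earlier is a plain lexicographic comparison of (major, minor, micro) tuples against the per-chip minimum. The same logic pulled out into a self-contained helper, purely for clarity (not driver code, and the versions used are examples):

        #include <stdio.h>

        /* Returns non-zero when a.b.c is older than x.y.z (lexicographic order). */
        static int fw_ver_older(int a, int b, int c, int x, int y, int z)
        {
                return a < x ||
                       (a == x && b < y) ||
                       (a == x && b == y && c < z);
        }

        int main(void)
        {
                /* Example: card runs 1.14.4, driver requires at least 1.15.0. */
                printf("1.14.4 older than 1.15.0? %d\n",
                       fw_ver_older(1, 14, 4, 1, 15, 0));
                printf("1.16.2 older than 1.15.0? %d\n",
                       fw_ver_older(1, 16, 2, 1, 15, 0));
                return 0;
        }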
+ */ +static int fw_compatible(const struct fw_hdr *hdr1, const struct fw_hdr *hdr2) +{ + + /* short circuit if it's the exact same firmware version */ + if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver) + return 1; + +#define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x) + if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) && + SAME_INTF(ri) && SAME_INTF(iscsi) && SAME_INTF(fcoe)) + return 1; +#undef SAME_INTF + + return 0; +} + +/* The firmware in the filesystem is usable, but should it be installed? + * This routine explains itself in detail if it indicates the filesystem + * firmware should be installed. + */ +static int should_install_fs_fw(struct adapter *adap, int card_fw_usable, + int k, int c) +{ + const char *reason; + + if (!card_fw_usable) { + reason = "incompatible or unusable"; + goto install; + } + + if (k > c) { + reason = "older than the version supported with this driver"; + goto install; + } + + return 0; + +install: + dev_err(adap->pdev_dev, "firmware on card (%u.%u.%u.%u) is %s, " + "installing firmware %u.%u.%u.%u on card.\n", + FW_HDR_FW_VER_MAJOR_G(c), FW_HDR_FW_VER_MINOR_G(c), + FW_HDR_FW_VER_MICRO_G(c), FW_HDR_FW_VER_BUILD_G(c), reason, + FW_HDR_FW_VER_MAJOR_G(k), FW_HDR_FW_VER_MINOR_G(k), + FW_HDR_FW_VER_MICRO_G(k), FW_HDR_FW_VER_BUILD_G(k)); + + return 1; +} + +int t4_prep_fw(struct adapter *adap, struct fw_info *fw_info, + const u8 *fw_data, unsigned int fw_size, + struct fw_hdr *card_fw, enum dev_state state, + int *reset) +{ + int ret, card_fw_usable, fs_fw_usable; + const struct fw_hdr *fs_fw; + const struct fw_hdr *drv_fw; + + drv_fw = &fw_info->fw_hdr; + + /* Read the header of the firmware on the card */ + ret = -t4_read_flash(adap, FLASH_FW_START, + sizeof(*card_fw) / sizeof(uint32_t), + (uint32_t *)card_fw, 1); + if (ret == 0) { + card_fw_usable = fw_compatible(drv_fw, (const void *)card_fw); + } else { + dev_err(adap->pdev_dev, + "Unable to read card's firmware header: %d\n", ret); + card_fw_usable = 0; + } + + if (fw_data != NULL) { + fs_fw = (const void *)fw_data; + fs_fw_usable = fw_compatible(drv_fw, fs_fw); + } else { + fs_fw = NULL; + fs_fw_usable = 0; + } + + if (card_fw_usable && card_fw->fw_ver == drv_fw->fw_ver && + (!fs_fw_usable || fs_fw->fw_ver == drv_fw->fw_ver)) { + /* Common case: the firmware on the card is an exact match and + * the filesystem one is an exact match too, or the filesystem + * one is absent/incompatible. + */ + } else if (fs_fw_usable && state == DEV_STATE_UNINIT && + should_install_fs_fw(adap, card_fw_usable, + be32_to_cpu(fs_fw->fw_ver), + be32_to_cpu(card_fw->fw_ver))) { + ret = -t4_fw_upgrade(adap, adap->mbox, fw_data, + fw_size, 0); + if (ret != 0) { + dev_err(adap->pdev_dev, + "failed to install firmware: %d\n", ret); + goto bye; + } + + /* Installed successfully, update the cached header too. */ + *card_fw = *fs_fw; + card_fw_usable = 1; + *reset = 0; /* already reset as part of load_fw */ + } + + if (!card_fw_usable) { + uint32_t d, c, k; + + d = be32_to_cpu(drv_fw->fw_ver); + c = be32_to_cpu(card_fw->fw_ver); + k = fs_fw ? 
be32_to_cpu(fs_fw->fw_ver) : 0; + + dev_err(adap->pdev_dev, "Cannot find a usable firmware: " + "chip state %d, " + "driver compiled with %d.%d.%d.%d, " + "card has %d.%d.%d.%d, filesystem has %d.%d.%d.%d\n", + state, + FW_HDR_FW_VER_MAJOR_G(d), FW_HDR_FW_VER_MINOR_G(d), + FW_HDR_FW_VER_MICRO_G(d), FW_HDR_FW_VER_BUILD_G(d), + FW_HDR_FW_VER_MAJOR_G(c), FW_HDR_FW_VER_MINOR_G(c), + FW_HDR_FW_VER_MICRO_G(c), FW_HDR_FW_VER_BUILD_G(c), + FW_HDR_FW_VER_MAJOR_G(k), FW_HDR_FW_VER_MINOR_G(k), + FW_HDR_FW_VER_MICRO_G(k), FW_HDR_FW_VER_BUILD_G(k)); + ret = EINVAL; + goto bye; + } + + /* We're using whatever's on the card and it's known to be good. */ + adap->params.fw_vers = be32_to_cpu(card_fw->fw_ver); + adap->params.tp_vers = be32_to_cpu(card_fw->tp_microcode_ver); + +bye: + return ret; +} + +/** + * t4_flash_erase_sectors - erase a range of flash sectors + * @adapter: the adapter + * @start: the first sector to erase + * @end: the last sector to erase + * + * Erases the sectors in the given inclusive range. + */ +static int t4_flash_erase_sectors(struct adapter *adapter, int start, int end) +{ + int ret = 0; + + if (end >= adapter->params.sf_nsec) + return -EINVAL; + + while (start <= end) { + if ((ret = sf1_write(adapter, 1, 0, 1, SF_WR_ENABLE)) != 0 || + (ret = sf1_write(adapter, 4, 0, 1, + SF_ERASE_SECTOR | (start << 8))) != 0 || + (ret = flash_wait_op(adapter, 14, 500)) != 0) { + dev_err(adapter->pdev_dev, + "erase of flash sector %d failed, error %d\n", + start, ret); + break; + } + start++; + } + t4_write_reg(adapter, SF_OP_A, 0); /* unlock SF */ + return ret; +} + +/** + * t4_flash_cfg_addr - return the address of the flash configuration file + * @adapter: the adapter + * + * Return the address within the flash where the Firmware Configuration + * File is stored. + */ +unsigned int t4_flash_cfg_addr(struct adapter *adapter) +{ + if (adapter->params.sf_size == 0x100000) + return FLASH_FPGA_CFG_START; + else + return FLASH_CFG_START; +} + +/* Return TRUE if the specified firmware matches the adapter. I.e. T4 + * firmware for T4 adapters, T5 firmware for T5 adapters, etc. We go ahead + * and emit an error message for mismatched firmware to save our caller the + * effort ... + */ +static bool t4_fw_matches_chip(const struct adapter *adap, + const struct fw_hdr *hdr) +{ + /* The expression below will return FALSE for any unsupported adapter + * which will keep us "honest" in the future ... + */ + if ((is_t4(adap->params.chip) && hdr->chip == FW_HDR_CHIP_T4) || + (is_t5(adap->params.chip) && hdr->chip == FW_HDR_CHIP_T5) || + (is_t6(adap->params.chip) && hdr->chip == FW_HDR_CHIP_T6)) + return true; + + dev_err(adap->pdev_dev, + "FW image (%d) is not suitable for this adapter (%d)\n", + hdr->chip, CHELSIO_CHIP_VERSION(adap->params.chip)); + return false; +} + +/** + * t4_load_fw - download firmware + * @adap: the adapter + * @fw_data: the firmware image to write + * @size: image size + * + * Write the supplied firmware image to the card's serial flash. 
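t4_load_fw() below validates the image by summing every big-endian 32-bit word of the file and requiring the total to come out to exactly 0xffffffff before anything is erased or written. The same check in a self-contained form, with ntohl() standing in for be32_to_cpu() and a two-word toy image in place of a real firmware file:

        #include <stdint.h>
        #include <stdio.h>
        #include <stddef.h>
        #include <arpa/inet.h>  /* ntohl()/htonl() stand in for be32_to_cpu()/cpu_to_be32() */

        /* Sum all big-endian words of an image, as t4_load_fw() does below. */
        static uint32_t image_sum(const uint8_t *img, size_t size)
        {
                const uint32_t *p = (const uint32_t *)img;
                uint32_t csum = 0;
                size_t i;

                for (i = 0; i < size / sizeof(uint32_t); i++)
                        csum += ntohl(p[i]);
                return csum;
        }

        int main(void)
        {
                /* Toy 8-byte "image": second word chosen so the sum is 0xffffffff. */
                uint32_t img[2] = { htonl(0x12345678),
                                    htonl(0xffffffff - 0x12345678) };
                uint32_t csum = image_sum((const uint8_t *)img, sizeof(img));

                printf("checksum = %#010x (%s)\n", csum,
                       csum == 0xffffffff ? "looks intact" : "corrupted");
                return 0;
        }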
+ */ +int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) +{ + u32 csum; + int ret, addr; + unsigned int i; + u8 first_page[SF_PAGE_SIZE]; + const __be32 *p = (const __be32 *)fw_data; + const struct fw_hdr *hdr = (const struct fw_hdr *)fw_data; + unsigned int sf_sec_size = adap->params.sf_size / adap->params.sf_nsec; + unsigned int fw_start_sec = FLASH_FW_START_SEC; + unsigned int fw_size = FLASH_FW_MAX_SIZE; + unsigned int fw_start = FLASH_FW_START; + + if (!size) { + dev_err(adap->pdev_dev, "FW image has no data\n"); + return -EINVAL; + } + if (size & 511) { + dev_err(adap->pdev_dev, + "FW image size not multiple of 512 bytes\n"); + return -EINVAL; + } + if ((unsigned int)be16_to_cpu(hdr->len512) * 512 != size) { + dev_err(adap->pdev_dev, + "FW image size differs from size in FW header\n"); + return -EINVAL; + } + if (size > fw_size) { + dev_err(adap->pdev_dev, "FW image too large, max is %u bytes\n", + fw_size); + return -EFBIG; + } + if (!t4_fw_matches_chip(adap, hdr)) + return -EINVAL; + + for (csum = 0, i = 0; i < size / sizeof(csum); i++) + csum += be32_to_cpu(p[i]); + + if (csum != 0xffffffff) { + dev_err(adap->pdev_dev, + "corrupted firmware image, checksum %#x\n", csum); + return -EINVAL; + } + + i = DIV_ROUND_UP(size, sf_sec_size); /* # of sectors spanned */ + ret = t4_flash_erase_sectors(adap, fw_start_sec, fw_start_sec + i - 1); + if (ret) + goto out; + + /* + * We write the correct version at the end so the driver can see a bad + * version if the FW write fails. Start by writing a copy of the + * first page with a bad version. + */ + memcpy(first_page, fw_data, SF_PAGE_SIZE); + ((struct fw_hdr *)first_page)->fw_ver = cpu_to_be32(0xffffffff); + ret = t4_write_flash(adap, fw_start, SF_PAGE_SIZE, first_page); + if (ret) + goto out; + + addr = fw_start; + for (size -= SF_PAGE_SIZE; size; size -= SF_PAGE_SIZE) { + addr += SF_PAGE_SIZE; + fw_data += SF_PAGE_SIZE; + ret = t4_write_flash(adap, addr, SF_PAGE_SIZE, fw_data); + if (ret) + goto out; + } + + ret = t4_write_flash(adap, + fw_start + offsetof(struct fw_hdr, fw_ver), + sizeof(hdr->fw_ver), (const u8 *)&hdr->fw_ver); +out: + if (ret) + dev_err(adap->pdev_dev, "firmware download failed, error %d\n", + ret); + else + ret = t4_get_fw_version(adap, &adap->params.fw_vers); + return ret; +} + +/** + * t4_phy_fw_ver - return current PHY firmware version + * @adap: the adapter + * @phy_fw_ver: return value buffer for PHY firmware version + * + * Returns the current version of external PHY firmware on the + * adapter. + */ +int t4_phy_fw_ver(struct adapter *adap, int *phy_fw_ver) +{ + u32 param, val; + int ret; + + param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_PHYFW) | + FW_PARAMS_PARAM_Y_V(adap->params.portvec) | + FW_PARAMS_PARAM_Z_V(FW_PARAMS_PARAM_DEV_PHYFW_VERSION)); + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 1, + ¶m, &val); + if (ret < 0) + return ret; + *phy_fw_ver = val; + return 0; +} + +/** + * t4_load_phy_fw - download port PHY firmware + * @adap: the adapter + * @win: the PCI-E Memory Window index to use for t4_memory_rw() + * @win_lock: the lock to use to guard the memory copy + * @phy_fw_version: function to check PHY firmware versions + * @phy_fw_data: the PHY firmware image to write + * @phy_fw_size: image size + * + * Transfer the specified PHY firmware to the adapter. 
If a non-NULL + * @phy_fw_version is supplied, then it will be used to determine if + * it's necessary to perform the transfer by comparing the version + * of any existing adapter PHY firmware with that of the passed in + * PHY firmware image. If @win_lock is non-NULL then it will be used + * around the call to t4_memory_rw() which transfers the PHY firmware + * to the adapter. + * + * A negative error number will be returned if an error occurs. If + * version number support is available and there's no need to upgrade + * the firmware, 0 will be returned. If firmware is successfully + * transferred to the adapter, 1 will be returned. + * + * NOTE: some adapters only have local RAM to store the PHY firmware. As + * a result, a RESET of the adapter would cause that RAM to lose its + * contents. Thus, loading PHY firmware on such adapters must happen + * after any FW_RESET_CMDs ... + */ +int t4_load_phy_fw(struct adapter *adap, + int win, spinlock_t *win_lock, + int (*phy_fw_version)(const u8 *, size_t), + const u8 *phy_fw_data, size_t phy_fw_size) +{ + unsigned long mtype = 0, maddr = 0; + u32 param, val; + int cur_phy_fw_ver = 0, new_phy_fw_vers = 0; + int ret; + + /* If we have version number support, then check to see if the adapter + * already has up-to-date PHY firmware loaded. + */ + if (phy_fw_version) { + new_phy_fw_vers = phy_fw_version(phy_fw_data, phy_fw_size); + ret = t4_phy_fw_ver(adap, &cur_phy_fw_ver); + if (ret < 0) + return ret; + + if (cur_phy_fw_ver >= new_phy_fw_vers) { + CH_WARN(adap, "PHY Firmware already up-to-date, " + "version %#x\n", cur_phy_fw_ver); + return 0; + } + } + + /* Ask the firmware where it wants us to copy the PHY firmware image. + * The size of the file requires a special version of the READ command + * which will pass the file size via the values field in PARAMS_CMD and + * retrieve the return value from the firmware and place it in the same + * buffer. + */ + param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_PHYFW) | + FW_PARAMS_PARAM_Y_V(adap->params.portvec) | + FW_PARAMS_PARAM_Z_V(FW_PARAMS_PARAM_DEV_PHYFW_DOWNLOAD)); + val = phy_fw_size; + ret = t4_query_params_rw(adap, adap->mbox, adap->pf, 0, 1, + &param, &val, 1, true); + if (ret < 0) + return ret; + mtype = val >> 8; + maddr = (val & 0xff) << 16; + + /* Copy the supplied PHY Firmware image to the adapter memory location + * allocated by the adapter firmware. + */ + if (win_lock) + spin_lock_bh(win_lock); + ret = t4_memory_rw(adap, win, mtype, maddr, + phy_fw_size, (__be32 *)phy_fw_data, + T4_MEMORY_WRITE); + if (win_lock) + spin_unlock_bh(win_lock); + if (ret) + return ret; + + /* Tell the firmware that the PHY firmware image has been written to + * RAM and it can now start copying it over to the PHYs. The chip + * firmware will RESET the affected PHYs as part of this operation + * leaving them running the new PHY firmware image. + */ + param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_PHYFW) | + FW_PARAMS_PARAM_Y_V(adap->params.portvec) | + FW_PARAMS_PARAM_Z_V(FW_PARAMS_PARAM_DEV_PHYFW_DOWNLOAD)); + ret = t4_set_params_timeout(adap, adap->mbox, adap->pf, 0, 1, + &param, &val, 30000); + + /* If we have version number support, then check to see that the new + * firmware got loaded properly.
+ */ + if (phy_fw_version) { + ret = t4_phy_fw_ver(adap, &cur_phy_fw_ver); + if (ret < 0) + return ret; + + if (cur_phy_fw_ver != new_phy_fw_vers) { + CH_WARN(adap, "PHY Firmware did not update: " + "version on adapter %#x, " + "version flashed %#x\n", + cur_phy_fw_ver, new_phy_fw_vers); + return -ENXIO; + } + } + + return 1; +} + +/** + * t4_fwcache - firmware cache operation + * @adap: the adapter + * @op : the operation (flush or flush and invalidate) + */ +int t4_fwcache(struct adapter *adap, enum fw_params_param_dev_fwcache op) +{ + struct fw_params_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = + cpu_to_be32(FW_CMD_OP_V(FW_PARAMS_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F | + FW_PARAMS_CMD_PFN_V(adap->pf) | + FW_PARAMS_CMD_VFN_V(0)); + c.retval_len16 = cpu_to_be32(FW_LEN16(c)); + c.param[0].mnem = + cpu_to_be32(FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_FWCACHE)); + c.param[0].val = (__force __be32)op; + + return t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), NULL); +} + +void t4_cim_read_pif_la(struct adapter *adap, u32 *pif_req, u32 *pif_rsp, + unsigned int *pif_req_wrptr, + unsigned int *pif_rsp_wrptr) +{ + int i, j; + u32 cfg, val, req, rsp; + + cfg = t4_read_reg(adap, CIM_DEBUGCFG_A); + if (cfg & LADBGEN_F) + t4_write_reg(adap, CIM_DEBUGCFG_A, cfg ^ LADBGEN_F); + + val = t4_read_reg(adap, CIM_DEBUGSTS_A); + req = POLADBGWRPTR_G(val); + rsp = PILADBGWRPTR_G(val); + if (pif_req_wrptr) + *pif_req_wrptr = req; + if (pif_rsp_wrptr) + *pif_rsp_wrptr = rsp; + + for (i = 0; i < CIM_PIFLA_SIZE; i++) { + for (j = 0; j < 6; j++) { + t4_write_reg(adap, CIM_DEBUGCFG_A, POLADBGRDPTR_V(req) | + PILADBGRDPTR_V(rsp)); + *pif_req++ = t4_read_reg(adap, CIM_PO_LA_DEBUGDATA_A); + *pif_rsp++ = t4_read_reg(adap, CIM_PI_LA_DEBUGDATA_A); + req++; + rsp++; + } + req = (req + 2) & POLADBGRDPTR_M; + rsp = (rsp + 2) & PILADBGRDPTR_M; + } + t4_write_reg(adap, CIM_DEBUGCFG_A, cfg); +} + +void t4_cim_read_ma_la(struct adapter *adap, u32 *ma_req, u32 *ma_rsp) +{ + u32 cfg; + int i, j, idx; + + cfg = t4_read_reg(adap, CIM_DEBUGCFG_A); + if (cfg & LADBGEN_F) + t4_write_reg(adap, CIM_DEBUGCFG_A, cfg ^ LADBGEN_F); + + for (i = 0; i < CIM_MALA_SIZE; i++) { + for (j = 0; j < 5; j++) { + idx = 8 * i + j; + t4_write_reg(adap, CIM_DEBUGCFG_A, POLADBGRDPTR_V(idx) | + PILADBGRDPTR_V(idx)); + *ma_req++ = t4_read_reg(adap, CIM_PO_LA_MADEBUGDATA_A); + *ma_rsp++ = t4_read_reg(adap, CIM_PI_LA_MADEBUGDATA_A); + } + } + t4_write_reg(adap, CIM_DEBUGCFG_A, cfg); +} + +void t4_ulprx_read_la(struct adapter *adap, u32 *la_buf) +{ + unsigned int i, j; + + for (i = 0; i < 8; i++) { + u32 *p = la_buf + i; + + t4_write_reg(adap, ULP_RX_LA_CTL_A, i); + j = t4_read_reg(adap, ULP_RX_LA_WRPTR_A); + t4_write_reg(adap, ULP_RX_LA_RDPTR_A, j); + for (j = 0; j < ULPRX_LA_SIZE; j++, p += 8) + *p = t4_read_reg(adap, ULP_RX_LA_RDDATA_A); + } +} + +#define ADVERT_MASK (FW_PORT_CAP32_SPEED_V(FW_PORT_CAP32_SPEED_M) | \ + FW_PORT_CAP32_ANEG) + +/** + * fwcaps16_to_caps32 - convert 16-bit Port Capabilities to 32-bits + * @caps16: a 16-bit Port Capabilities value + * + * Returns the equivalent 32-bit Port Capabilities value. 
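+ * + * For example, a @caps16 of (FW_PORT_CAP_SPEED_10G | FW_PORT_CAP_ANEG) + * translates to (FW_PORT_CAP32_SPEED_10G | FW_PORT_CAP32_ANEG); any + * 16-bit capability without a CAP16_TO_CAP32() entry below is simply + * dropped.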
+ */ +static fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16) +{ + fw_port_cap32_t caps32 = 0; + + #define CAP16_TO_CAP32(__cap) \ + do { \ + if (caps16 & FW_PORT_CAP_##__cap) \ + caps32 |= FW_PORT_CAP32_##__cap; \ + } while (0) + + CAP16_TO_CAP32(SPEED_100M); + CAP16_TO_CAP32(SPEED_1G); + CAP16_TO_CAP32(SPEED_25G); + CAP16_TO_CAP32(SPEED_10G); + CAP16_TO_CAP32(SPEED_40G); + CAP16_TO_CAP32(SPEED_100G); + CAP16_TO_CAP32(FC_RX); + CAP16_TO_CAP32(FC_TX); + CAP16_TO_CAP32(ANEG); + CAP16_TO_CAP32(MDIX); + CAP16_TO_CAP32(MDIAUTO); + CAP16_TO_CAP32(FEC_RS); + CAP16_TO_CAP32(FEC_BASER_RS); + CAP16_TO_CAP32(802_3_PAUSE); + CAP16_TO_CAP32(802_3_ASM_DIR); + + #undef CAP16_TO_CAP32 + + return caps32; +} + +/** + * fwcaps32_to_caps16 - convert 32-bit Port Capabilities to 16-bits + * @caps32: a 32-bit Port Capabilities value + * + * Returns the equivalent 16-bit Port Capabilities value. Note that + * not all 32-bit Port Capabilities can be represented in the 16-bit + * Port Capabilities and some fields/values may not make it. + */ +static fw_port_cap16_t fwcaps32_to_caps16(fw_port_cap32_t caps32) +{ + fw_port_cap16_t caps16 = 0; + + #define CAP32_TO_CAP16(__cap) \ + do { \ + if (caps32 & FW_PORT_CAP32_##__cap) \ + caps16 |= FW_PORT_CAP_##__cap; \ + } while (0) + + CAP32_TO_CAP16(SPEED_100M); + CAP32_TO_CAP16(SPEED_1G); + CAP32_TO_CAP16(SPEED_10G); + CAP32_TO_CAP16(SPEED_25G); + CAP32_TO_CAP16(SPEED_40G); + CAP32_TO_CAP16(SPEED_100G); + CAP32_TO_CAP16(FC_RX); + CAP32_TO_CAP16(FC_TX); + CAP32_TO_CAP16(802_3_PAUSE); + CAP32_TO_CAP16(802_3_ASM_DIR); + CAP32_TO_CAP16(ANEG); + CAP32_TO_CAP16(MDIX); + CAP32_TO_CAP16(MDIAUTO); + CAP32_TO_CAP16(FEC_RS); + CAP32_TO_CAP16(FEC_BASER_RS); + + #undef CAP32_TO_CAP16 + + return caps16; +} + +/* Translate Firmware Port Capabilities Pause specification to Common Code */ +static inline enum cc_pause fwcap_to_cc_pause(fw_port_cap32_t fw_pause) +{ + enum cc_pause cc_pause = 0; + + if (fw_pause & FW_PORT_CAP32_FC_RX) + cc_pause |= PAUSE_RX; + if (fw_pause & FW_PORT_CAP32_FC_TX) + cc_pause |= PAUSE_TX; + + return cc_pause; +} + +/* Translate Common Code Pause specification into Firmware Port Capabilities */ +static inline fw_port_cap32_t cc_to_fwcap_pause(enum cc_pause cc_pause) +{ + fw_port_cap32_t fw_pause = 0; + + if (cc_pause & PAUSE_RX) + fw_pause |= FW_PORT_CAP32_FC_RX; + if (cc_pause & PAUSE_TX) + fw_pause |= FW_PORT_CAP32_FC_TX; + + return fw_pause; +} + +/* Translate Firmware Forward Error Correction specification to Common Code */ +static inline enum cc_fec fwcap_to_cc_fec(fw_port_cap32_t fw_fec) +{ + enum cc_fec cc_fec = 0; + + if (fw_fec & FW_PORT_CAP32_FEC_RS) + cc_fec |= FEC_RS; + if (fw_fec & FW_PORT_CAP32_FEC_BASER_RS) + cc_fec |= FEC_BASER_RS; + + return cc_fec; +} + +/* Translate Common Code Forward Error Correction specification to Firmware */ +static inline fw_port_cap32_t cc_to_fwcap_fec(enum cc_fec cc_fec) +{ + fw_port_cap32_t fw_fec = 0; + + if (cc_fec & FEC_RS) + fw_fec |= FW_PORT_CAP32_FEC_RS; + if (cc_fec & FEC_BASER_RS) + fw_fec |= FW_PORT_CAP32_FEC_BASER_RS; + + return fw_fec; +} + +/** + * t4_link_l1cfg - apply link configuration to MAC/PHY + * @adapter: the adapter + * @mbox: the Firmware Mailbox to use + * @port: the Port ID + * @lc: the Port's Link Configuration + * + * Set up a port's MAC and PHY according to a desired link configuration. + * - If the PHY can auto-negotiate first decide what to advertise, then + * enable/disable auto-negotiation as desired, and reset. + * - If the PHY does not auto-negotiate just reset it. 
+ * - If auto-negotiation is off set the MAC to the proper speed/duplex/FC, + * otherwise do it later based on the outcome of auto-negotiation. + */ +int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox, + unsigned int port, struct link_config *lc) +{ + unsigned int fw_caps = adapter->params.fw_caps_support; + struct fw_port_cmd cmd; + unsigned int fw_mdi = FW_PORT_CAP32_MDI_V(FW_PORT_CAP32_MDI_AUTO); + fw_port_cap32_t fw_fc, cc_fec, fw_fec, rcap; + + /* Convert driver coding of Pause Frame Flow Control settings into the + * Firmware's API. + */ + fw_fc = cc_to_fwcap_pause(lc->requested_fc); + + /* Convert Common Code Forward Error Correction settings into the + * Firmware's API. If the current Requested FEC has "Automatic" + * (IEEE 802.3) specified, then we use whatever the Firmware + * sent us as part of its IEEE 802.3-based interpretation of + * the Transceiver Module EPROM FEC parameters. Otherwise we + * use whatever is in the current Requested FEC settings. + */ + if (lc->requested_fec & FEC_AUTO) + cc_fec = fwcap_to_cc_fec(lc->def_acaps); + else + cc_fec = lc->requested_fec; + fw_fec = cc_to_fwcap_fec(cc_fec); + + /* Figure out what our Requested Port Capabilities are going to be. + */ + if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) { + rcap = (lc->pcaps & ADVERT_MASK) | fw_fc | fw_fec; + lc->fc = lc->requested_fc & ~PAUSE_AUTONEG; + lc->fec = cc_fec; + } else if (lc->autoneg == AUTONEG_DISABLE) { + rcap = lc->speed_caps | fw_fc | fw_fec | fw_mdi; + lc->fc = lc->requested_fc & ~PAUSE_AUTONEG; + lc->fec = cc_fec; + } else { + rcap = lc->acaps | fw_fc | fw_fec | fw_mdi; + } + + /* And send that on to the Firmware ... + */ + memset(&cmd, 0, sizeof(cmd)); + cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) | + FW_CMD_REQUEST_F | FW_CMD_EXEC_F | + FW_PORT_CMD_PORTID_V(port)); + cmd.action_to_len16 = + cpu_to_be32(FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16 + ? FW_PORT_ACTION_L1_CFG + : FW_PORT_ACTION_L1_CFG32) | + FW_LEN16(cmd)); + if (fw_caps == FW_CAPS16) + cmd.u.l1cfg.rcap = cpu_to_be32(fwcaps32_to_caps16(rcap)); + else + cmd.u.l1cfg32.rcap32 = cpu_to_be32(rcap); + return t4_wr_mbox(adapter, mbox, &cmd, sizeof(cmd), NULL); +} + +/** + * t4_restart_aneg - restart autonegotiation + * @adap: the adapter + * @mbox: mbox to use for the FW command + * @port: the port id + * + * Restarts autonegotiation for the selected port.
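+ * + * A minimal call (sketch, using the parameter names above) is + * t4_restart_aneg(adap, adap->mbox, port), where adap->mbox is assumed + * to be the mailbox the PF normally uses; the command re-issues an L1 + * configure requesting only FW_PORT_CAP32_ANEG.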
+ */ +int t4_restart_aneg(struct adapter *adap, unsigned int mbox, unsigned int port) +{ + struct fw_port_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) | + FW_CMD_REQUEST_F | FW_CMD_EXEC_F | + FW_PORT_CMD_PORTID_V(port)); + c.action_to_len16 = + cpu_to_be32(FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_L1_CFG) | + FW_LEN16(c)); + c.u.l1cfg.rcap = cpu_to_be32(FW_PORT_CAP32_ANEG); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +typedef void (*int_handler_t)(struct adapter *adap); + +struct intr_info { + unsigned int mask; /* bits to check in interrupt status */ + const char *msg; /* message to print or NULL */ + short stat_idx; /* stat counter to increment or -1 */ + unsigned short fatal; /* whether the condition reported is fatal */ + int_handler_t int_handler; /* platform-specific int handler */ +}; + +/** + * t4_handle_intr_status - table driven interrupt handler + * @adapter: the adapter that generated the interrupt + * @reg: the interrupt status register to process + * @acts: table of interrupt actions + * + * A table driven interrupt handler that applies a set of masks to an + * interrupt status word and performs the corresponding actions if the + * interrupts described by the mask have occurred. The actions include + * optionally emitting a warning or alert message. The table is terminated + * by an entry specifying mask 0. Returns the number of fatal interrupt + * conditions. + */ +static int t4_handle_intr_status(struct adapter *adapter, unsigned int reg, + const struct intr_info *acts) +{ + int fatal = 0; + unsigned int mask = 0; + unsigned int status = t4_read_reg(adapter, reg); + + for ( ; acts->mask; ++acts) { + if (!(status & acts->mask)) + continue; + if (acts->fatal) { + fatal++; + dev_alert(adapter->pdev_dev, "%s (0x%x)\n", acts->msg, + status & acts->mask); + } else if (acts->msg && printk_ratelimit()) + dev_warn(adapter->pdev_dev, "%s (0x%x)\n", acts->msg, + status & acts->mask); + if (acts->int_handler) + acts->int_handler(adapter); + mask |= acts->mask; + } + status &= mask; + if (status) /* clear processed interrupts */ + t4_write_reg(adapter, reg, status); + return fatal; +} + +/* + * Interrupt handler for the PCIE module. 
+ */ +static void pcie_intr_handler(struct adapter *adapter) +{ + static const struct intr_info sysbus_intr_info[] = { + { RNPP_F, "RXNP array parity error", -1, 1 }, + { RPCP_F, "RXPC array parity error", -1, 1 }, + { RCIP_F, "RXCIF array parity error", -1, 1 }, + { RCCP_F, "Rx completions control array parity error", -1, 1 }, + { RFTP_F, "RXFT array parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info pcie_port_intr_info[] = { + { TPCP_F, "TXPC array parity error", -1, 1 }, + { TNPP_F, "TXNP array parity error", -1, 1 }, + { TFTP_F, "TXFT array parity error", -1, 1 }, + { TCAP_F, "TXCA array parity error", -1, 1 }, + { TCIP_F, "TXCIF array parity error", -1, 1 }, + { RCAP_F, "RXCA array parity error", -1, 1 }, + { OTDD_F, "outbound request TLP discarded", -1, 1 }, + { RDPE_F, "Rx data parity error", -1, 1 }, + { TDUE_F, "Tx uncorrectable data error", -1, 1 }, + { 0 } + }; + static const struct intr_info pcie_intr_info[] = { + { MSIADDRLPERR_F, "MSI AddrL parity error", -1, 1 }, + { MSIADDRHPERR_F, "MSI AddrH parity error", -1, 1 }, + { MSIDATAPERR_F, "MSI data parity error", -1, 1 }, + { MSIXADDRLPERR_F, "MSI-X AddrL parity error", -1, 1 }, + { MSIXADDRHPERR_F, "MSI-X AddrH parity error", -1, 1 }, + { MSIXDATAPERR_F, "MSI-X data parity error", -1, 1 }, + { MSIXDIPERR_F, "MSI-X DI parity error", -1, 1 }, + { PIOCPLPERR_F, "PCI PIO completion FIFO parity error", -1, 1 }, + { PIOREQPERR_F, "PCI PIO request FIFO parity error", -1, 1 }, + { TARTAGPERR_F, "PCI PCI target tag FIFO parity error", -1, 1 }, + { CCNTPERR_F, "PCI CMD channel count parity error", -1, 1 }, + { CREQPERR_F, "PCI CMD channel request parity error", -1, 1 }, + { CRSPPERR_F, "PCI CMD channel response parity error", -1, 1 }, + { DCNTPERR_F, "PCI DMA channel count parity error", -1, 1 }, + { DREQPERR_F, "PCI DMA channel request parity error", -1, 1 }, + { DRSPPERR_F, "PCI DMA channel response parity error", -1, 1 }, + { HCNTPERR_F, "PCI HMA channel count parity error", -1, 1 }, + { HREQPERR_F, "PCI HMA channel request parity error", -1, 1 }, + { HRSPPERR_F, "PCI HMA channel response parity error", -1, 1 }, + { CFGSNPPERR_F, "PCI config snoop FIFO parity error", -1, 1 }, + { FIDPERR_F, "PCI FID parity error", -1, 1 }, + { INTXCLRPERR_F, "PCI INTx clear parity error", -1, 1 }, + { MATAGPERR_F, "PCI MA tag parity error", -1, 1 }, + { PIOTAGPERR_F, "PCI PIO tag parity error", -1, 1 }, + { RXCPLPERR_F, "PCI Rx completion parity error", -1, 1 }, + { RXWRPERR_F, "PCI Rx write parity error", -1, 1 }, + { RPLPERR_F, "PCI replay buffer parity error", -1, 1 }, + { PCIESINT_F, "PCI core secondary fault", -1, 1 }, + { PCIEPINT_F, "PCI core primary fault", -1, 1 }, + { UNXSPLCPLERR_F, "PCI unexpected split completion error", + -1, 0 }, + { 0 } + }; + + static struct intr_info t5_pcie_intr_info[] = { + { MSTGRPPERR_F, "Master Response Read Queue parity error", + -1, 1 }, + { MSTTIMEOUTPERR_F, "Master Timeout FIFO parity error", -1, 1 }, + { MSIXSTIPERR_F, "MSI-X STI SRAM parity error", -1, 1 }, + { MSIXADDRLPERR_F, "MSI-X AddrL parity error", -1, 1 }, + { MSIXADDRHPERR_F, "MSI-X AddrH parity error", -1, 1 }, + { MSIXDATAPERR_F, "MSI-X data parity error", -1, 1 }, + { MSIXDIPERR_F, "MSI-X DI parity error", -1, 1 }, + { PIOCPLGRPPERR_F, "PCI PIO completion Group FIFO parity error", + -1, 1 }, + { PIOREQGRPPERR_F, "PCI PIO request Group FIFO parity error", + -1, 1 }, + { TARTAGPERR_F, "PCI PCI target tag FIFO parity error", -1, 1 }, + { MSTTAGQPERR_F, "PCI master tag queue parity error", -1, 1 }, + { CREQPERR_F, "PCI CMD 
channel request parity error", -1, 1 }, + { CRSPPERR_F, "PCI CMD channel response parity error", -1, 1 }, + { DREQWRPERR_F, "PCI DMA channel write request parity error", + -1, 1 }, + { DREQPERR_F, "PCI DMA channel request parity error", -1, 1 }, + { DRSPPERR_F, "PCI DMA channel response parity error", -1, 1 }, + { HREQWRPERR_F, "PCI HMA channel count parity error", -1, 1 }, + { HREQPERR_F, "PCI HMA channel request parity error", -1, 1 }, + { HRSPPERR_F, "PCI HMA channel response parity error", -1, 1 }, + { CFGSNPPERR_F, "PCI config snoop FIFO parity error", -1, 1 }, + { FIDPERR_F, "PCI FID parity error", -1, 1 }, + { VFIDPERR_F, "PCI INTx clear parity error", -1, 1 }, + { MAGRPPERR_F, "PCI MA group FIFO parity error", -1, 1 }, + { PIOTAGPERR_F, "PCI PIO tag parity error", -1, 1 }, + { IPRXHDRGRPPERR_F, "PCI IP Rx header group parity error", + -1, 1 }, + { IPRXDATAGRPPERR_F, "PCI IP Rx data group parity error", + -1, 1 }, + { RPLPERR_F, "PCI IP replay buffer parity error", -1, 1 }, + { IPSOTPERR_F, "PCI IP SOT buffer parity error", -1, 1 }, + { TRGT1GRPPERR_F, "PCI TRGT1 group FIFOs parity error", -1, 1 }, + { READRSPERR_F, "Outbound read error", -1, 0 }, + { 0 } + }; + + int fat; + + if (is_t4(adapter->params.chip)) + fat = t4_handle_intr_status(adapter, + PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS_A, + sysbus_intr_info) + + t4_handle_intr_status(adapter, + PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS_A, + pcie_port_intr_info) + + t4_handle_intr_status(adapter, PCIE_INT_CAUSE_A, + pcie_intr_info); + else + fat = t4_handle_intr_status(adapter, PCIE_INT_CAUSE_A, + t5_pcie_intr_info); + + if (fat) + t4_fatal_err(adapter); +} + +/* + * TP interrupt handler. + */ +static void tp_intr_handler(struct adapter *adapter) +{ + static const struct intr_info tp_intr_info[] = { + { 0x3fffffff, "TP parity error", -1, 1 }, + { FLMTXFLSTEMPTY_F, "TP out of Tx pages", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adapter, TP_INT_CAUSE_A, tp_intr_info)) + t4_fatal_err(adapter); +} + +/* + * SGE interrupt handler. 
+ */ +static void sge_intr_handler(struct adapter *adapter) +{ + u64 v; + u32 err; + + static const struct intr_info sge_intr_info[] = { + { ERR_CPL_EXCEED_IQE_SIZE_F, + "SGE received CPL exceeding IQE size", -1, 1 }, + { ERR_INVALID_CIDX_INC_F, + "SGE GTS CIDX increment too large", -1, 0 }, + { ERR_CPL_OPCODE_0_F, "SGE received 0-length CPL", -1, 0 }, + { DBFIFO_LP_INT_F, NULL, -1, 0, t4_db_full }, + { ERR_DATA_CPL_ON_HIGH_QID1_F | ERR_DATA_CPL_ON_HIGH_QID0_F, + "SGE IQID > 1023 received CPL for FL", -1, 0 }, + { ERR_BAD_DB_PIDX3_F, "SGE DBP 3 pidx increment too large", -1, + 0 }, + { ERR_BAD_DB_PIDX2_F, "SGE DBP 2 pidx increment too large", -1, + 0 }, + { ERR_BAD_DB_PIDX1_F, "SGE DBP 1 pidx increment too large", -1, + 0 }, + { ERR_BAD_DB_PIDX0_F, "SGE DBP 0 pidx increment too large", -1, + 0 }, + { ERR_ING_CTXT_PRIO_F, + "SGE too many priority ingress contexts", -1, 0 }, + { INGRESS_SIZE_ERR_F, "SGE illegal ingress QID", -1, 0 }, + { EGRESS_SIZE_ERR_F, "SGE illegal egress QID", -1, 0 }, + { 0 } + }; + + static struct intr_info t4t5_sge_intr_info[] = { + { ERR_DROPPED_DB_F, NULL, -1, 0, t4_db_dropped }, + { DBFIFO_HP_INT_F, NULL, -1, 0, t4_db_full }, + { ERR_EGR_CTXT_PRIO_F, + "SGE too many priority egress contexts", -1, 0 }, + { 0 } + }; + + v = (u64)t4_read_reg(adapter, SGE_INT_CAUSE1_A) | + ((u64)t4_read_reg(adapter, SGE_INT_CAUSE2_A) << 32); + if (v) { + dev_alert(adapter->pdev_dev, "SGE parity error (%#llx)\n", + (unsigned long long)v); + t4_write_reg(adapter, SGE_INT_CAUSE1_A, v); + t4_write_reg(adapter, SGE_INT_CAUSE2_A, v >> 32); + } + + v |= t4_handle_intr_status(adapter, SGE_INT_CAUSE3_A, sge_intr_info); + if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) + v |= t4_handle_intr_status(adapter, SGE_INT_CAUSE3_A, + t4t5_sge_intr_info); + + err = t4_read_reg(adapter, SGE_ERROR_STATS_A); + if (err & ERROR_QID_VALID_F) { + dev_err(adapter->pdev_dev, "SGE error for queue %u\n", + ERROR_QID_G(err)); + if (err & UNCAPTURED_ERROR_F) + dev_err(adapter->pdev_dev, + "SGE UNCAPTURED_ERROR set (clearing)\n"); + t4_write_reg(adapter, SGE_ERROR_STATS_A, ERROR_QID_VALID_F | + UNCAPTURED_ERROR_F); + } + + if (v != 0) + t4_fatal_err(adapter); +} + +#define CIM_OBQ_INTR (OBQULP0PARERR_F | OBQULP1PARERR_F | OBQULP2PARERR_F |\ + OBQULP3PARERR_F | OBQSGEPARERR_F | OBQNCSIPARERR_F) +#define CIM_IBQ_INTR (IBQTP0PARERR_F | IBQTP1PARERR_F | IBQULPPARERR_F |\ + IBQSGEHIPARERR_F | IBQSGELOPARERR_F | IBQNCSIPARERR_F) + +/* + * CIM interrupt handler. 
+ */ +static void cim_intr_handler(struct adapter *adapter) +{ + static const struct intr_info cim_intr_info[] = { + { PREFDROPINT_F, "CIM control register prefetch drop", -1, 1 }, + { CIM_OBQ_INTR, "CIM OBQ parity error", -1, 1 }, + { CIM_IBQ_INTR, "CIM IBQ parity error", -1, 1 }, + { MBUPPARERR_F, "CIM mailbox uP parity error", -1, 1 }, + { MBHOSTPARERR_F, "CIM mailbox host parity error", -1, 1 }, + { TIEQINPARERRINT_F, "CIM TIEQ outgoing parity error", -1, 1 }, + { TIEQOUTPARERRINT_F, "CIM TIEQ incoming parity error", -1, 1 }, + { TIMER0INT_F, "CIM TIMER0 interrupt", -1, 1 }, + { 0 } + }; + static const struct intr_info cim_upintr_info[] = { + { RSVDSPACEINT_F, "CIM reserved space access", -1, 1 }, + { ILLTRANSINT_F, "CIM illegal transaction", -1, 1 }, + { ILLWRINT_F, "CIM illegal write", -1, 1 }, + { ILLRDINT_F, "CIM illegal read", -1, 1 }, + { ILLRDBEINT_F, "CIM illegal read BE", -1, 1 }, + { ILLWRBEINT_F, "CIM illegal write BE", -1, 1 }, + { SGLRDBOOTINT_F, "CIM single read from boot space", -1, 1 }, + { SGLWRBOOTINT_F, "CIM single write to boot space", -1, 1 }, + { BLKWRBOOTINT_F, "CIM block write to boot space", -1, 1 }, + { SGLRDFLASHINT_F, "CIM single read from flash space", -1, 1 }, + { SGLWRFLASHINT_F, "CIM single write to flash space", -1, 1 }, + { BLKWRFLASHINT_F, "CIM block write to flash space", -1, 1 }, + { SGLRDEEPROMINT_F, "CIM single EEPROM read", -1, 1 }, + { SGLWREEPROMINT_F, "CIM single EEPROM write", -1, 1 }, + { BLKRDEEPROMINT_F, "CIM block EEPROM read", -1, 1 }, + { BLKWREEPROMINT_F, "CIM block EEPROM write", -1, 1 }, + { SGLRDCTLINT_F, "CIM single read from CTL space", -1, 1 }, + { SGLWRCTLINT_F, "CIM single write to CTL space", -1, 1 }, + { BLKRDCTLINT_F, "CIM block read from CTL space", -1, 1 }, + { BLKWRCTLINT_F, "CIM block write to CTL space", -1, 1 }, + { SGLRDPLINT_F, "CIM single read from PL space", -1, 1 }, + { SGLWRPLINT_F, "CIM single write to PL space", -1, 1 }, + { BLKRDPLINT_F, "CIM block read from PL space", -1, 1 }, + { BLKWRPLINT_F, "CIM block write to PL space", -1, 1 }, + { REQOVRLOOKUPINT_F, "CIM request FIFO overwrite", -1, 1 }, + { RSPOVRLOOKUPINT_F, "CIM response FIFO overwrite", -1, 1 }, + { TIMEOUTINT_F, "CIM PIF timeout", -1, 1 }, + { TIMEOUTMAINT_F, "CIM PIF MA timeout", -1, 1 }, + { 0 } + }; + + u32 val, fw_err; + int fat; + + fw_err = t4_read_reg(adapter, PCIE_FW_A); + if (fw_err & PCIE_FW_ERR_F) + t4_report_fw_error(adapter); + + /* When the Firmware detects an internal error which normally + * wouldn't raise a Host Interrupt, it forces a CIM Timer0 interrupt + * in order to make sure the Host sees the Firmware Crash. So + * if we have a Timer0 interrupt and don't see a Firmware Crash, + * ignore the Timer0 interrupt. + */ + + val = t4_read_reg(adapter, CIM_HOST_INT_CAUSE_A); + if (val & TIMER0INT_F) + if (!(fw_err & PCIE_FW_ERR_F) || + (PCIE_FW_EVAL_G(fw_err) != PCIE_FW_EVAL_CRASH)) + t4_write_reg(adapter, CIM_HOST_INT_CAUSE_A, + TIMER0INT_F); + + fat = t4_handle_intr_status(adapter, CIM_HOST_INT_CAUSE_A, + cim_intr_info) + + t4_handle_intr_status(adapter, CIM_HOST_UPACC_INT_CAUSE_A, + cim_upintr_info); + if (fat) + t4_fatal_err(adapter); +} + +/* + * ULP RX interrupt handler. 
+ */ +static void ulprx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info ulprx_intr_info[] = { + { 0x1800000, "ULPRX context error", -1, 1 }, + { 0x7fffff, "ULPRX parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adapter, ULP_RX_INT_CAUSE_A, ulprx_intr_info)) + t4_fatal_err(adapter); +} + +/* + * ULP TX interrupt handler. + */ +static void ulptx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info ulptx_intr_info[] = { + { PBL_BOUND_ERR_CH3_F, "ULPTX channel 3 PBL out of bounds", -1, + 0 }, + { PBL_BOUND_ERR_CH2_F, "ULPTX channel 2 PBL out of bounds", -1, + 0 }, + { PBL_BOUND_ERR_CH1_F, "ULPTX channel 1 PBL out of bounds", -1, + 0 }, + { PBL_BOUND_ERR_CH0_F, "ULPTX channel 0 PBL out of bounds", -1, + 0 }, + { 0xfffffff, "ULPTX parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adapter, ULP_TX_INT_CAUSE_A, ulptx_intr_info)) + t4_fatal_err(adapter); +} + +/* + * PM TX interrupt handler. + */ +static void pmtx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pmtx_intr_info[] = { + { PCMD_LEN_OVFL0_F, "PMTX channel 0 pcmd too large", -1, 1 }, + { PCMD_LEN_OVFL1_F, "PMTX channel 1 pcmd too large", -1, 1 }, + { PCMD_LEN_OVFL2_F, "PMTX channel 2 pcmd too large", -1, 1 }, + { ZERO_C_CMD_ERROR_F, "PMTX 0-length pcmd", -1, 1 }, + { PMTX_FRAMING_ERROR_F, "PMTX framing error", -1, 1 }, + { OESPI_PAR_ERROR_F, "PMTX oespi parity error", -1, 1 }, + { DB_OPTIONS_PAR_ERROR_F, "PMTX db_options parity error", + -1, 1 }, + { ICSPI_PAR_ERROR_F, "PMTX icspi parity error", -1, 1 }, + { PMTX_C_PCMD_PAR_ERROR_F, "PMTX c_pcmd parity error", -1, 1}, + { 0 } + }; + + if (t4_handle_intr_status(adapter, PM_TX_INT_CAUSE_A, pmtx_intr_info)) + t4_fatal_err(adapter); +} + +/* + * PM RX interrupt handler. + */ +static void pmrx_intr_handler(struct adapter *adapter) +{ + static const struct intr_info pmrx_intr_info[] = { + { ZERO_E_CMD_ERROR_F, "PMRX 0-length pcmd", -1, 1 }, + { PMRX_FRAMING_ERROR_F, "PMRX framing error", -1, 1 }, + { OCSPI_PAR_ERROR_F, "PMRX ocspi parity error", -1, 1 }, + { DB_OPTIONS_PAR_ERROR_F, "PMRX db_options parity error", + -1, 1 }, + { IESPI_PAR_ERROR_F, "PMRX iespi parity error", -1, 1 }, + { PMRX_E_PCMD_PAR_ERROR_F, "PMRX e_pcmd parity error", -1, 1}, + { 0 } + }; + + if (t4_handle_intr_status(adapter, PM_RX_INT_CAUSE_A, pmrx_intr_info)) + t4_fatal_err(adapter); +} + +/* + * CPL switch interrupt handler. + */ +static void cplsw_intr_handler(struct adapter *adapter) +{ + static const struct intr_info cplsw_intr_info[] = { + { CIM_OP_MAP_PERR_F, "CPLSW CIM op_map parity error", -1, 1 }, + { CIM_OVFL_ERROR_F, "CPLSW CIM overflow", -1, 1 }, + { TP_FRAMING_ERROR_F, "CPLSW TP framing error", -1, 1 }, + { SGE_FRAMING_ERROR_F, "CPLSW SGE framing error", -1, 1 }, + { CIM_FRAMING_ERROR_F, "CPLSW CIM framing error", -1, 1 }, + { ZERO_SWITCH_ERROR_F, "CPLSW no-switch error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adapter, CPL_INTR_CAUSE_A, cplsw_intr_info)) + t4_fatal_err(adapter); +} + +/* + * LE interrupt handler. 
+ */ +static void le_intr_handler(struct adapter *adap) +{ + enum chip_type chip = CHELSIO_CHIP_VERSION(adap->params.chip); + static const struct intr_info le_intr_info[] = { + { LIPMISS_F, "LE LIP miss", -1, 0 }, + { LIP0_F, "LE 0 LIP error", -1, 0 }, + { PARITYERR_F, "LE parity error", -1, 1 }, + { UNKNOWNCMD_F, "LE unknown command", -1, 1 }, + { REQQPARERR_F, "LE request queue parity error", -1, 1 }, + { 0 } + }; + + static struct intr_info t6_le_intr_info[] = { + { T6_LIPMISS_F, "LE LIP miss", -1, 0 }, + { T6_LIP0_F, "LE 0 LIP error", -1, 0 }, + { TCAMINTPERR_F, "LE parity error", -1, 1 }, + { T6_UNKNOWNCMD_F, "LE unknown command", -1, 1 }, + { SSRAMINTPERR_F, "LE request queue parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adap, LE_DB_INT_CAUSE_A, + (chip <= CHELSIO_T5) ? + le_intr_info : t6_le_intr_info)) + t4_fatal_err(adap); +} + +/* + * MPS interrupt handler. + */ +static void mps_intr_handler(struct adapter *adapter) +{ + static const struct intr_info mps_rx_intr_info[] = { + { 0xffffff, "MPS Rx parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_tx_intr_info[] = { + { TPFIFO_V(TPFIFO_M), "MPS Tx TP FIFO parity error", -1, 1 }, + { NCSIFIFO_F, "MPS Tx NC-SI FIFO parity error", -1, 1 }, + { TXDATAFIFO_V(TXDATAFIFO_M), "MPS Tx data FIFO parity error", + -1, 1 }, + { TXDESCFIFO_V(TXDESCFIFO_M), "MPS Tx desc FIFO parity error", + -1, 1 }, + { BUBBLE_F, "MPS Tx underflow", -1, 1 }, + { SECNTERR_F, "MPS Tx SOP/EOP error", -1, 1 }, + { FRMERR_F, "MPS Tx framing error", -1, 1 }, + { 0 } + }; + static const struct intr_info t6_mps_tx_intr_info[] = { + { TPFIFO_V(TPFIFO_M), "MPS Tx TP FIFO parity error", -1, 1 }, + { NCSIFIFO_F, "MPS Tx NC-SI FIFO parity error", -1, 1 }, + { TXDATAFIFO_V(TXDATAFIFO_M), "MPS Tx data FIFO parity error", + -1, 1 }, + { TXDESCFIFO_V(TXDESCFIFO_M), "MPS Tx desc FIFO parity error", + -1, 1 }, + /* MPS Tx Bubble is normal for T6 */ + { SECNTERR_F, "MPS Tx SOP/EOP error", -1, 1 }, + { FRMERR_F, "MPS Tx framing error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_trc_intr_info[] = { + { FILTMEM_V(FILTMEM_M), "MPS TRC filter parity error", -1, 1 }, + { PKTFIFO_V(PKTFIFO_M), "MPS TRC packet FIFO parity error", + -1, 1 }, + { MISCPERR_F, "MPS TRC misc parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_stat_sram_intr_info[] = { + { 0x1fffff, "MPS statistics SRAM parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_stat_tx_intr_info[] = { + { 0xfffff, "MPS statistics Tx FIFO parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_stat_rx_intr_info[] = { + { 0xffffff, "MPS statistics Rx FIFO parity error", -1, 1 }, + { 0 } + }; + static const struct intr_info mps_cls_intr_info[] = { + { MATCHSRAM_F, "MPS match SRAM parity error", -1, 1 }, + { MATCHTCAM_F, "MPS match TCAM parity error", -1, 1 }, + { HASHSRAM_F, "MPS hash SRAM parity error", -1, 1 }, + { 0 } + }; + + int fat; + + fat = t4_handle_intr_status(adapter, MPS_RX_PERR_INT_CAUSE_A, + mps_rx_intr_info) + + t4_handle_intr_status(adapter, MPS_TX_INT_CAUSE_A, + is_t6(adapter->params.chip) + ? 
t6_mps_tx_intr_info + : mps_tx_intr_info) + + t4_handle_intr_status(adapter, MPS_TRC_INT_CAUSE_A, + mps_trc_intr_info) + + t4_handle_intr_status(adapter, MPS_STAT_PERR_INT_CAUSE_SRAM_A, + mps_stat_sram_intr_info) + + t4_handle_intr_status(adapter, MPS_STAT_PERR_INT_CAUSE_TX_FIFO_A, + mps_stat_tx_intr_info) + + t4_handle_intr_status(adapter, MPS_STAT_PERR_INT_CAUSE_RX_FIFO_A, + mps_stat_rx_intr_info) + + t4_handle_intr_status(adapter, MPS_CLS_INT_CAUSE_A, + mps_cls_intr_info); + + t4_write_reg(adapter, MPS_INT_CAUSE_A, 0); + t4_read_reg(adapter, MPS_INT_CAUSE_A); /* flush */ + if (fat) + t4_fatal_err(adapter); +} + +#define MEM_INT_MASK (PERR_INT_CAUSE_F | ECC_CE_INT_CAUSE_F | \ + ECC_UE_INT_CAUSE_F) + +/* + * EDC/MC interrupt handler. + */ +static void mem_intr_handler(struct adapter *adapter, int idx) +{ + static const char name[4][7] = { "EDC0", "EDC1", "MC/MC0", "MC1" }; + + unsigned int addr, cnt_addr, v; + + if (idx <= MEM_EDC1) { + addr = EDC_REG(EDC_INT_CAUSE_A, idx); + cnt_addr = EDC_REG(EDC_ECC_STATUS_A, idx); + } else if (idx == MEM_MC) { + if (is_t4(adapter->params.chip)) { + addr = MC_INT_CAUSE_A; + cnt_addr = MC_ECC_STATUS_A; + } else { + addr = MC_P_INT_CAUSE_A; + cnt_addr = MC_P_ECC_STATUS_A; + } + } else { + addr = MC_REG(MC_P_INT_CAUSE_A, 1); + cnt_addr = MC_REG(MC_P_ECC_STATUS_A, 1); + } + + v = t4_read_reg(adapter, addr) & MEM_INT_MASK; + if (v & PERR_INT_CAUSE_F) + dev_alert(adapter->pdev_dev, "%s FIFO parity error\n", + name[idx]); + if (v & ECC_CE_INT_CAUSE_F) { + u32 cnt = ECC_CECNT_G(t4_read_reg(adapter, cnt_addr)); + + t4_edc_err_read(adapter, idx); + + t4_write_reg(adapter, cnt_addr, ECC_CECNT_V(ECC_CECNT_M)); + if (printk_ratelimit()) + dev_warn(adapter->pdev_dev, + "%u %s correctable ECC data error%s\n", + cnt, name[idx], cnt > 1 ? "s" : ""); + } + if (v & ECC_UE_INT_CAUSE_F) + dev_alert(adapter->pdev_dev, + "%s uncorrectable ECC data error\n", name[idx]); + + t4_write_reg(adapter, addr, v); + if (v & (PERR_INT_CAUSE_F | ECC_UE_INT_CAUSE_F)) + t4_fatal_err(adapter); +} + +/* + * MA interrupt handler. + */ +static void ma_intr_handler(struct adapter *adap) +{ + u32 v, status = t4_read_reg(adap, MA_INT_CAUSE_A); + + if (status & MEM_PERR_INT_CAUSE_F) { + dev_alert(adap->pdev_dev, + "MA parity error, parity status %#x\n", + t4_read_reg(adap, MA_PARITY_ERROR_STATUS1_A)); + if (is_t5(adap->params.chip)) + dev_alert(adap->pdev_dev, + "MA parity error, parity status %#x\n", + t4_read_reg(adap, + MA_PARITY_ERROR_STATUS2_A)); + } + if (status & MEM_WRAP_INT_CAUSE_F) { + v = t4_read_reg(adap, MA_INT_WRAP_STATUS_A); + dev_alert(adap->pdev_dev, "MA address wrap-around error by " + "client %u to address %#x\n", + MEM_WRAP_CLIENT_NUM_G(v), + MEM_WRAP_ADDRESS_G(v) << 4); + } + t4_write_reg(adap, MA_INT_CAUSE_A, status); + t4_fatal_err(adap); +} + +/* + * SMB interrupt handler. + */ +static void smb_intr_handler(struct adapter *adap) +{ + static const struct intr_info smb_intr_info[] = { + { MSTTXFIFOPARINT_F, "SMB master Tx FIFO parity error", -1, 1 }, + { MSTRXFIFOPARINT_F, "SMB master Rx FIFO parity error", -1, 1 }, + { SLVFIFOPARINT_F, "SMB slave FIFO parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adap, SMB_INT_CAUSE_A, smb_intr_info)) + t4_fatal_err(adap); +} + +/* + * NC-SI interrupt handler. 
+ */ +static void ncsi_intr_handler(struct adapter *adap) +{ + static const struct intr_info ncsi_intr_info[] = { + { CIM_DM_PRTY_ERR_F, "NC-SI CIM parity error", -1, 1 }, + { MPS_DM_PRTY_ERR_F, "NC-SI MPS parity error", -1, 1 }, + { TXFIFO_PRTY_ERR_F, "NC-SI Tx FIFO parity error", -1, 1 }, + { RXFIFO_PRTY_ERR_F, "NC-SI Rx FIFO parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adap, NCSI_INT_CAUSE_A, ncsi_intr_info)) + t4_fatal_err(adap); +} + +/* + * XGMAC interrupt handler. + */ +static void xgmac_intr_handler(struct adapter *adap, int port) +{ + u32 v, int_cause_reg; + + if (is_t4(adap->params.chip)) + int_cause_reg = PORT_REG(port, XGMAC_PORT_INT_CAUSE_A); + else + int_cause_reg = T5_PORT_REG(port, MAC_PORT_INT_CAUSE_A); + + v = t4_read_reg(adap, int_cause_reg); + + v &= TXFIFO_PRTY_ERR_F | RXFIFO_PRTY_ERR_F; + if (!v) + return; + + if (v & TXFIFO_PRTY_ERR_F) + dev_alert(adap->pdev_dev, "XGMAC %d Tx FIFO parity error\n", + port); + if (v & RXFIFO_PRTY_ERR_F) + dev_alert(adap->pdev_dev, "XGMAC %d Rx FIFO parity error\n", + port); + t4_write_reg(adap, PORT_REG(port, XGMAC_PORT_INT_CAUSE_A), v); + t4_fatal_err(adap); +} + +/* + * PL interrupt handler. + */ +static void pl_intr_handler(struct adapter *adap) +{ + static const struct intr_info pl_intr_info[] = { + { FATALPERR_F, "T4 fatal parity error", -1, 1 }, + { PERRVFID_F, "PL VFID_MAP parity error", -1, 1 }, + { 0 } + }; + + if (t4_handle_intr_status(adap, PL_PL_INT_CAUSE_A, pl_intr_info)) + t4_fatal_err(adap); +} + +#define PF_INTR_MASK (PFSW_F) +#define GLBL_INTR_MASK (CIM_F | MPS_F | PL_F | PCIE_F | MC_F | EDC0_F | \ + EDC1_F | LE_F | TP_F | MA_F | PM_TX_F | PM_RX_F | ULP_RX_F | \ + CPL_SWITCH_F | SGE_F | ULP_TX_F | SF_F) + +/** + * t4_slow_intr_handler - control path interrupt handler + * @adapter: the adapter + * + * T4 interrupt handler for non-data global interrupt events, e.g., errors. + * The designation 'slow' is because it involves register reads, while + * data interrupts typically don't involve any MMIOs. + */ +int t4_slow_intr_handler(struct adapter *adapter) +{ + u32 cause = t4_read_reg(adapter, PL_INT_CAUSE_A); + + if (!(cause & GLBL_INTR_MASK)) + return 0; + if (cause & CIM_F) + cim_intr_handler(adapter); + if (cause & MPS_F) + mps_intr_handler(adapter); + if (cause & NCSI_F) + ncsi_intr_handler(adapter); + if (cause & PL_F) + pl_intr_handler(adapter); + if (cause & SMB_F) + smb_intr_handler(adapter); + if (cause & XGMAC0_F) + xgmac_intr_handler(adapter, 0); + if (cause & XGMAC1_F) + xgmac_intr_handler(adapter, 1); + if (cause & XGMAC_KR0_F) + xgmac_intr_handler(adapter, 2); + if (cause & XGMAC_KR1_F) + xgmac_intr_handler(adapter, 3); + if (cause & PCIE_F) + pcie_intr_handler(adapter); + if (cause & MC_F) + mem_intr_handler(adapter, MEM_MC); + if (is_t5(adapter->params.chip) && (cause & MC1_F)) + mem_intr_handler(adapter, MEM_MC1); + if (cause & EDC0_F) + mem_intr_handler(adapter, MEM_EDC0); + if (cause & EDC1_F) + mem_intr_handler(adapter, MEM_EDC1); + if (cause & LE_F) + le_intr_handler(adapter); + if (cause & TP_F) + tp_intr_handler(adapter); + if (cause & MA_F) + ma_intr_handler(adapter); + if (cause & PM_TX_F) + pmtx_intr_handler(adapter); + if (cause & PM_RX_F) + pmrx_intr_handler(adapter); + if (cause & ULP_RX_F) + ulprx_intr_handler(adapter); + if (cause & CPL_SWITCH_F) + cplsw_intr_handler(adapter); + if (cause & SGE_F) + sge_intr_handler(adapter); + if (cause & ULP_TX_F) + ulptx_intr_handler(adapter); + + /* Clear the interrupts just processed for which we are the master. 
*/ + t4_write_reg(adapter, PL_INT_CAUSE_A, cause & GLBL_INTR_MASK); + (void)t4_read_reg(adapter, PL_INT_CAUSE_A); /* flush */ + return 1; +} + +/** + * t4_intr_enable - enable interrupts + * @adapter: the adapter whose interrupts should be enabled + * + * Enable PF-specific interrupts for the calling function and the top-level + * interrupt concentrator for global interrupts. Interrupts are already + * enabled at each module, here we just enable the roots of the interrupt + * hierarchies. + * + * Note: this function should be called only when the driver manages + * non PF-specific interrupts from the various HW modules. Only one PCI + * function at a time should be doing this. + */ +void t4_intr_enable(struct adapter *adapter) +{ + u32 val = 0; + u32 whoami = t4_read_reg(adapter, PL_WHOAMI_A); + u32 pf = CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5 ? + SOURCEPF_G(whoami) : T6_SOURCEPF_G(whoami); + + if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) + val = ERR_DROPPED_DB_F | ERR_EGR_CTXT_PRIO_F | DBFIFO_HP_INT_F; + t4_write_reg(adapter, SGE_INT_ENABLE3_A, ERR_CPL_EXCEED_IQE_SIZE_F | + ERR_INVALID_CIDX_INC_F | ERR_CPL_OPCODE_0_F | + ERR_DATA_CPL_ON_HIGH_QID1_F | INGRESS_SIZE_ERR_F | + ERR_DATA_CPL_ON_HIGH_QID0_F | ERR_BAD_DB_PIDX3_F | + ERR_BAD_DB_PIDX2_F | ERR_BAD_DB_PIDX1_F | + ERR_BAD_DB_PIDX0_F | ERR_ING_CTXT_PRIO_F | + DBFIFO_LP_INT_F | EGRESS_SIZE_ERR_F | val); + t4_write_reg(adapter, MYPF_REG(PL_PF_INT_ENABLE_A), PF_INTR_MASK); + t4_set_reg_field(adapter, PL_INT_MAP0_A, 0, 1 << pf); +} + +/** + * t4_intr_disable - disable interrupts + * @adapter: the adapter whose interrupts should be disabled + * + * Disable interrupts. We only disable the top-level interrupt + * concentrators. The caller must be a PCI function managing global + * interrupts. + */ +void t4_intr_disable(struct adapter *adapter) +{ + u32 whoami, pf; + + if (pci_channel_offline(adapter->pdev)) + return; + + whoami = t4_read_reg(adapter, PL_WHOAMI_A); + pf = CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5 ? + SOURCEPF_G(whoami) : T6_SOURCEPF_G(whoami); + + t4_write_reg(adapter, MYPF_REG(PL_PF_INT_ENABLE_A), 0); + t4_set_reg_field(adapter, PL_INT_MAP0_A, 1 << pf, 0); +} + +unsigned int t4_chip_rss_size(struct adapter *adap) +{ + if (CHELSIO_CHIP_VERSION(adap->params.chip) <= CHELSIO_T5) + return RSS_NENTRIES; + else + return T6_RSS_NENTRIES; +} + +/** + * t4_config_rss_range - configure a portion of the RSS mapping table + * @adapter: the adapter + * @mbox: mbox to use for the FW command + * @viid: virtual interface whose RSS subtable is to be written + * @start: start entry in the table to write + * @n: how many table entries to write + * @rspq: values for the response queue lookup table + * @nrspq: number of values in @rspq + * + * Programs the selected part of the VI's RSS mapping table with the + * provided values. If @nrspq < @n the supplied values are used repeatedly + * until the full table range is populated. + * + * The caller must ensure the values in @rspq are in the range allowed for + * @viid. 
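+ * + * For example (sketch): with @start = 0, @n = 64, @nrspq = 4 and + * @rspq = {q0, q1, q2, q3}, entries 0..63 of the VI's table are + * programmed with the repeating pattern q0, q1, q2, q3, q0, q1, ...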
+ */ +int t4_config_rss_range(struct adapter *adapter, int mbox, unsigned int viid, + int start, int n, const u16 *rspq, unsigned int nrspq) +{ + int ret; + const u16 *rsp = rspq; + const u16 *rsp_end = rspq + nrspq; + struct fw_rss_ind_tbl_cmd cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_RSS_IND_TBL_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F | + FW_RSS_IND_TBL_CMD_VIID_V(viid)); + cmd.retval_len16 = cpu_to_be32(FW_LEN16(cmd)); + + /* each fw_rss_ind_tbl_cmd takes up to 32 entries */ + while (n > 0) { + int nq = min(n, 32); + __be32 *qp = &cmd.iq0_to_iq2; + + cmd.niqid = cpu_to_be16(nq); + cmd.startidx = cpu_to_be16(start); + + start += nq; + n -= nq; + + while (nq > 0) { + unsigned int v; + + v = FW_RSS_IND_TBL_CMD_IQ0_V(*rsp); + if (++rsp >= rsp_end) + rsp = rspq; + v |= FW_RSS_IND_TBL_CMD_IQ1_V(*rsp); + if (++rsp >= rsp_end) + rsp = rspq; + v |= FW_RSS_IND_TBL_CMD_IQ2_V(*rsp); + if (++rsp >= rsp_end) + rsp = rspq; + + *qp++ = cpu_to_be32(v); + nq -= 3; + } + + ret = t4_wr_mbox(adapter, mbox, &cmd, sizeof(cmd), NULL); + if (ret) + return ret; + } + return 0; +} + +/** + * t4_config_glbl_rss - configure the global RSS mode + * @adapter: the adapter + * @mbox: mbox to use for the FW command + * @mode: global RSS mode + * @flags: mode-specific flags + * + * Sets the global RSS mode. + */ +int t4_config_glbl_rss(struct adapter *adapter, int mbox, unsigned int mode, + unsigned int flags) +{ + struct fw_rss_glb_config_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_write = cpu_to_be32(FW_CMD_OP_V(FW_RSS_GLB_CONFIG_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F); + c.retval_len16 = cpu_to_be32(FW_LEN16(c)); + if (mode == FW_RSS_GLB_CONFIG_CMD_MODE_MANUAL) { + c.u.manual.mode_pkd = + cpu_to_be32(FW_RSS_GLB_CONFIG_CMD_MODE_V(mode)); + } else if (mode == FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL) { + c.u.basicvirtual.mode_pkd = + cpu_to_be32(FW_RSS_GLB_CONFIG_CMD_MODE_V(mode)); + c.u.basicvirtual.synmapen_to_hashtoeplitz = cpu_to_be32(flags); + } else + return -EINVAL; + return t4_wr_mbox(adapter, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_config_vi_rss - configure per VI RSS settings + * @adapter: the adapter + * @mbox: mbox to use for the FW command + * @viid: the VI id + * @flags: RSS flags + * @defq: id of the default RSS queue for the VI. + * + * Configures VI-specific RSS properties. + */ +int t4_config_vi_rss(struct adapter *adapter, int mbox, unsigned int viid, + unsigned int flags, unsigned int defq) +{ + struct fw_rss_vi_config_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_RSS_VI_CONFIG_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F | + FW_RSS_VI_CONFIG_CMD_VIID_V(viid)); + c.retval_len16 = cpu_to_be32(FW_LEN16(c)); + c.u.basicvirtual.defaultq_to_udpen = cpu_to_be32(flags | + FW_RSS_VI_CONFIG_CMD_DEFAULTQ_V(defq)); + return t4_wr_mbox(adapter, mbox, &c, sizeof(c), NULL); +} + +/* Read an RSS table row */ +static int rd_rss_row(struct adapter *adap, int row, u32 *val) +{ + t4_write_reg(adap, TP_RSS_LKP_TABLE_A, 0xfff00000 | row); + return t4_wait_op_done_val(adap, TP_RSS_LKP_TABLE_A, LKPTBLROWVLD_F, 1, + 5, 0, val); +} + +/** + * t4_read_rss - read the contents of the RSS mapping table + * @adapter: the adapter + * @map: holds the contents of the RSS mapping table + * + * Reads the contents of the RSS hash->queue mapping table. 
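+ * + * @map must provide room for one u16 per table entry, i.e. + * t4_chip_rss_size(adapter) elements; each 32-bit row read from the + * hardware supplies two consecutive queue entries.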
+ */ +int t4_read_rss(struct adapter *adapter, u16 *map) +{ + int i, ret, nentries; + u32 val; + + nentries = t4_chip_rss_size(adapter); + for (i = 0; i < nentries / 2; ++i) { + ret = rd_rss_row(adapter, i, &val); + if (ret) + return ret; + *map++ = LKPTBLQUEUE0_G(val); + *map++ = LKPTBLQUEUE1_G(val); + } + return 0; +} + +static unsigned int t4_use_ldst(struct adapter *adap) +{ + return (adap->flags & FW_OK) && !adap->use_bd; +} + +/** + * t4_tp_fw_ldst_rw - Access TP indirect register through LDST + * @adap: the adapter + * @cmd: TP fw ldst address space type + * @vals: where the indirect register values are stored/written + * @nregs: how many indirect registers to read/write + * @start_idx: index of first indirect register to read/write + * @rw: Read (1) or Write (0) + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Access TP indirect registers through LDST + */ +static int t4_tp_fw_ldst_rw(struct adapter *adap, int cmd, u32 *vals, + unsigned int nregs, unsigned int start_index, + unsigned int rw, bool sleep_ok) +{ + int ret = 0; + unsigned int i; + struct fw_ldst_cmd c; + + for (i = 0; i < nregs; i++) { + memset(&c, 0, sizeof(c)); + c.op_to_addrspace = cpu_to_be32(FW_CMD_OP_V(FW_LDST_CMD) | + FW_CMD_REQUEST_F | + (rw ? FW_CMD_READ_F : + FW_CMD_WRITE_F) | + FW_LDST_CMD_ADDRSPACE_V(cmd)); + c.cycles_to_len16 = cpu_to_be32(FW_LEN16(c)); + + c.u.addrval.addr = cpu_to_be32(start_index + i); + c.u.addrval.val = rw ? 0 : cpu_to_be32(vals[i]); + ret = t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, + sleep_ok); + if (ret) + return ret; + + if (rw) + vals[i] = be32_to_cpu(c.u.addrval.val); + } + return 0; +} + +/** + * t4_tp_indirect_rw - Read/Write TP indirect register through LDST or backdoor + * @adap: the adapter + * @reg_addr: Address Register + * @reg_data: Data register + * @buff: where the indirect register values are stored/written + * @nregs: how many indirect registers to read/write + * @start_index: index of first indirect register to read/write + * @rw: READ(1) or WRITE(0) + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Read/Write TP indirect registers through LDST if possible. 
+ * Else, use backdoor access + **/ +static void t4_tp_indirect_rw(struct adapter *adap, u32 reg_addr, u32 reg_data, + u32 *buff, u32 nregs, u32 start_index, int rw, + bool sleep_ok) +{ + int rc = -EINVAL; + int cmd; + + switch (reg_addr) { + case TP_PIO_ADDR_A: + cmd = FW_LDST_ADDRSPC_TP_PIO; + break; + case TP_TM_PIO_ADDR_A: + cmd = FW_LDST_ADDRSPC_TP_TM_PIO; + break; + case TP_MIB_INDEX_A: + cmd = FW_LDST_ADDRSPC_TP_MIB; + break; + default: + goto indirect_access; + } + + if (t4_use_ldst(adap)) + rc = t4_tp_fw_ldst_rw(adap, cmd, buff, nregs, start_index, rw, + sleep_ok); + +indirect_access: + + if (rc) { + if (rw) + t4_read_indirect(adap, reg_addr, reg_data, buff, nregs, + start_index); + else + t4_write_indirect(adap, reg_addr, reg_data, buff, nregs, + start_index); + } +} + +/** + * t4_tp_pio_read - Read TP PIO registers + * @adap: the adapter + * @buff: where the indirect register values are written + * @nregs: how many indirect registers to read + * @start_index: index of first indirect register to read + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Read TP PIO Registers + **/ +void t4_tp_pio_read(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok) +{ + t4_tp_indirect_rw(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, buff, nregs, + start_index, 1, sleep_ok); +} + +/** + * t4_tp_pio_write - Write TP PIO registers + * @adap: the adapter + * @buff: where the indirect register values are stored + * @nregs: how many indirect registers to write + * @start_index: index of first indirect register to write + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Write TP PIO Registers + **/ +static void t4_tp_pio_write(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok) +{ + t4_tp_indirect_rw(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, buff, nregs, + start_index, 0, sleep_ok); +} + +/** + * t4_tp_tm_pio_read - Read TP TM PIO registers + * @adap: the adapter + * @buff: where the indirect register values are written + * @nregs: how many indirect registers to read + * @start_index: index of first indirect register to read + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Read TP TM PIO Registers + **/ +void t4_tp_tm_pio_read(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok) +{ + t4_tp_indirect_rw(adap, TP_TM_PIO_ADDR_A, TP_TM_PIO_DATA_A, buff, + nregs, start_index, 1, sleep_ok); +} + +/** + * t4_tp_mib_read - Read TP MIB registers + * @adap: the adapter + * @buff: where the indirect register values are written + * @nregs: how many indirect registers to read + * @start_index: index of first indirect register to read + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Read TP MIB Registers + **/ +void t4_tp_mib_read(struct adapter *adap, u32 *buff, u32 nregs, u32 start_index, + bool sleep_ok) +{ + t4_tp_indirect_rw(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, buff, nregs, + start_index, 1, sleep_ok); +} + +/** + * t4_read_rss_key - read the global RSS key + * @adap: the adapter + * @key: 10-entry array holding the 320-bit RSS key + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Reads the global 320-bit RSS key. 
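+ * + * Typical usage (sketch): declare u32 key[10]; and call + * t4_read_rss_key(adap, key, true); key[0..9] then hold the key as + * read from TP_RSS_SECRET_KEY0_A onwards.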
+ */ +void t4_read_rss_key(struct adapter *adap, u32 *key, bool sleep_ok) +{ + t4_tp_pio_read(adap, key, 10, TP_RSS_SECRET_KEY0_A, sleep_ok); +} + +/** + * t4_write_rss_key - program one of the RSS keys + * @adap: the adapter + * @key: 10-entry array holding the 320-bit RSS key + * @idx: which RSS key to write + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Writes one of the RSS keys with the given 320-bit value. If @idx is + * 0..15 the corresponding entry in the RSS key table is written, + * otherwise the global RSS key is written. + */ +void t4_write_rss_key(struct adapter *adap, const u32 *key, int idx, + bool sleep_ok) +{ + u8 rss_key_addr_cnt = 16; + u32 vrt = t4_read_reg(adap, TP_RSS_CONFIG_VRT_A); + + /* T6 and later: for KeyMode 3 (per-vf and per-vf scramble), + * allows access to key addresses 16-63 by using KeyWrAddrX + * as index[5:4](upper 2) into key table + */ + if ((CHELSIO_CHIP_VERSION(adap->params.chip) > CHELSIO_T5) && + (vrt & KEYEXTEND_F) && (KEYMODE_G(vrt) == 3)) + rss_key_addr_cnt = 32; + + t4_tp_pio_write(adap, (void *)key, 10, TP_RSS_SECRET_KEY0_A, sleep_ok); + + if (idx >= 0 && idx < rss_key_addr_cnt) { + if (rss_key_addr_cnt > 16) + t4_write_reg(adap, TP_RSS_CONFIG_VRT_A, + KEYWRADDRX_V(idx >> 4) | + T6_VFWRADDR_V(idx) | KEYWREN_F); + else + t4_write_reg(adap, TP_RSS_CONFIG_VRT_A, + KEYWRADDR_V(idx) | KEYWREN_F); + } +} + +/** + * t4_read_rss_pf_config - read PF RSS Configuration Table + * @adapter: the adapter + * @index: the entry in the PF RSS table to read + * @valp: where to store the returned value + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Reads the PF RSS Configuration Table at the specified index and returns + * the value found there. + */ +void t4_read_rss_pf_config(struct adapter *adapter, unsigned int index, + u32 *valp, bool sleep_ok) +{ + t4_tp_pio_read(adapter, valp, 1, TP_RSS_PF0_CONFIG_A + index, sleep_ok); +} + +/** + * t4_read_rss_vf_config - read VF RSS Configuration Table + * @adapter: the adapter + * @index: the entry in the VF RSS table to read + * @vfl: where to store the returned VFL + * @vfh: where to store the returned VFH + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Reads the VF RSS Configuration Table at the specified index and returns + * the (VFL, VFH) values found there. + */ +void t4_read_rss_vf_config(struct adapter *adapter, unsigned int index, + u32 *vfl, u32 *vfh, bool sleep_ok) +{ + u32 vrt, mask, data; + + if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) { + mask = VFWRADDR_V(VFWRADDR_M); + data = VFWRADDR_V(index); + } else { + mask = T6_VFWRADDR_V(T6_VFWRADDR_M); + data = T6_VFWRADDR_V(index); + } + + /* Request that the index'th VF Table values be read into VFL/VFH. + */ + vrt = t4_read_reg(adapter, TP_RSS_CONFIG_VRT_A); + vrt &= ~(VFRDRG_F | VFWREN_F | KEYWREN_F | mask); + vrt |= data | VFRDEN_F; + t4_write_reg(adapter, TP_RSS_CONFIG_VRT_A, vrt); + + /* Grab the VFL/VFH values ... + */ + t4_tp_pio_read(adapter, vfl, 1, TP_RSS_VFL_CONFIG_A, sleep_ok); + t4_tp_pio_read(adapter, vfh, 1, TP_RSS_VFH_CONFIG_A, sleep_ok); +} + +/** + * t4_read_rss_pf_map - read PF RSS Map + * @adapter: the adapter + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Reads the PF RSS Map register and returns its value. 
+ */ +u32 t4_read_rss_pf_map(struct adapter *adapter, bool sleep_ok) +{ + u32 pfmap; + + t4_tp_pio_read(adapter, &pfmap, 1, TP_RSS_PF_MAP_A, sleep_ok); + return pfmap; +} + +/** + * t4_read_rss_pf_mask - read PF RSS Mask + * @adapter: the adapter + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Reads the PF RSS Mask register and returns its value. + */ +u32 t4_read_rss_pf_mask(struct adapter *adapter, bool sleep_ok) +{ + u32 pfmask; + + t4_tp_pio_read(adapter, &pfmask, 1, TP_RSS_PF_MSK_A, sleep_ok); + return pfmask; +} + +/** + * t4_tp_get_tcp_stats - read TP's TCP MIB counters + * @adap: the adapter + * @v4: holds the TCP/IP counter values + * @v6: holds the TCP/IPv6 counter values + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Returns the values of TP's TCP/IP and TCP/IPv6 MIB counters. + * Either @v4 or @v6 may be %NULL to skip the corresponding stats. + */ +void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4, + struct tp_tcp_stats *v6, bool sleep_ok) +{ + u32 val[TP_MIB_TCP_RXT_SEG_LO_A - TP_MIB_TCP_OUT_RST_A + 1]; + +#define STAT_IDX(x) ((TP_MIB_TCP_##x##_A) - TP_MIB_TCP_OUT_RST_A) +#define STAT(x) val[STAT_IDX(x)] +#define STAT64(x) (((u64)STAT(x##_HI) << 32) | STAT(x##_LO)) + + if (v4) { + t4_tp_mib_read(adap, val, ARRAY_SIZE(val), + TP_MIB_TCP_OUT_RST_A, sleep_ok); + v4->tcp_out_rsts = STAT(OUT_RST); + v4->tcp_in_segs = STAT64(IN_SEG); + v4->tcp_out_segs = STAT64(OUT_SEG); + v4->tcp_retrans_segs = STAT64(RXT_SEG); + } + if (v6) { + t4_tp_mib_read(adap, val, ARRAY_SIZE(val), + TP_MIB_TCP_V6OUT_RST_A, sleep_ok); + v6->tcp_out_rsts = STAT(OUT_RST); + v6->tcp_in_segs = STAT64(IN_SEG); + v6->tcp_out_segs = STAT64(OUT_SEG); + v6->tcp_retrans_segs = STAT64(RXT_SEG); + } +#undef STAT64 +#undef STAT +#undef STAT_IDX +} + +/** + * t4_tp_get_err_stats - read TP's error MIB counters + * @adap: the adapter + * @st: holds the counter values + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Returns the values of TP's error counters. + */ +void t4_tp_get_err_stats(struct adapter *adap, struct tp_err_stats *st, + bool sleep_ok) +{ + int nchan = adap->params.arch.nchan; + + t4_tp_mib_read(adap, st->mac_in_errs, nchan, TP_MIB_MAC_IN_ERR_0_A, + sleep_ok); + t4_tp_mib_read(adap, st->hdr_in_errs, nchan, TP_MIB_HDR_IN_ERR_0_A, + sleep_ok); + t4_tp_mib_read(adap, st->tcp_in_errs, nchan, TP_MIB_TCP_IN_ERR_0_A, + sleep_ok); + t4_tp_mib_read(adap, st->tnl_cong_drops, nchan, + TP_MIB_TNL_CNG_DROP_0_A, sleep_ok); + t4_tp_mib_read(adap, st->ofld_chan_drops, nchan, + TP_MIB_OFD_CHN_DROP_0_A, sleep_ok); + t4_tp_mib_read(adap, st->tnl_tx_drops, nchan, TP_MIB_TNL_DROP_0_A, + sleep_ok); + t4_tp_mib_read(adap, st->ofld_vlan_drops, nchan, + TP_MIB_OFD_VLN_DROP_0_A, sleep_ok); + t4_tp_mib_read(adap, st->tcp6_in_errs, nchan, + TP_MIB_TCP_V6IN_ERR_0_A, sleep_ok); + t4_tp_mib_read(adap, &st->ofld_no_neigh, 2, TP_MIB_OFD_ARP_DROP_A, + sleep_ok); +} + +/** + * t4_tp_get_cpl_stats - read TP's CPL MIB counters + * @adap: the adapter + * @st: holds the counter values + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Returns the values of TP's CPL counters. 
+ */ +void t4_tp_get_cpl_stats(struct adapter *adap, struct tp_cpl_stats *st, + bool sleep_ok) +{ + int nchan = adap->params.arch.nchan; + + t4_tp_mib_read(adap, st->req, nchan, TP_MIB_CPL_IN_REQ_0_A, sleep_ok); + + t4_tp_mib_read(adap, st->rsp, nchan, TP_MIB_CPL_OUT_RSP_0_A, sleep_ok); +} + +/** + * t4_tp_get_rdma_stats - read TP's RDMA MIB counters + * @adap: the adapter + * @st: holds the counter values + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Returns the values of TP's RDMA counters. + */ +void t4_tp_get_rdma_stats(struct adapter *adap, struct tp_rdma_stats *st, + bool sleep_ok) +{ + t4_tp_mib_read(adap, &st->rqe_dfr_pkt, 2, TP_MIB_RQE_DFR_PKT_A, + sleep_ok); +} + +/** + * t4_get_fcoe_stats - read TP's FCoE MIB counters for a port + * @adap: the adapter + * @idx: the port index + * @st: holds the counter values + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Returns the values of TP's FCoE counters for the selected port. + */ +void t4_get_fcoe_stats(struct adapter *adap, unsigned int idx, + struct tp_fcoe_stats *st, bool sleep_ok) +{ + u32 val[2]; + + t4_tp_mib_read(adap, &st->frames_ddp, 1, TP_MIB_FCOE_DDP_0_A + idx, + sleep_ok); + + t4_tp_mib_read(adap, &st->frames_drop, 1, + TP_MIB_FCOE_DROP_0_A + idx, sleep_ok); + + t4_tp_mib_read(adap, val, 2, TP_MIB_FCOE_BYTE_0_HI_A + 2 * idx, + sleep_ok); + + st->octets_ddp = ((u64)val[0] << 32) | val[1]; +} + +/** + * t4_get_usm_stats - read TP's non-TCP DDP MIB counters + * @adap: the adapter + * @st: holds the counter values + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Returns the values of TP's counters for non-TCP directly-placed packets. + */ +void t4_get_usm_stats(struct adapter *adap, struct tp_usm_stats *st, + bool sleep_ok) +{ + u32 val[4]; + + t4_tp_mib_read(adap, val, 4, TP_MIB_USM_PKTS_A, sleep_ok); + st->frames = val[0]; + st->drops = val[1]; + st->octets = ((u64)val[2] << 32) | val[3]; +} + +/** + * t4_read_mtu_tbl - returns the values in the HW path MTU table + * @adap: the adapter + * @mtus: where to store the MTU values + * @mtu_log: where to store the MTU base-2 log (may be %NULL) + * + * Reads the HW path MTU table. + */ +void t4_read_mtu_tbl(struct adapter *adap, u16 *mtus, u8 *mtu_log) +{ + u32 v; + int i; + + for (i = 0; i < NMTUS; ++i) { + t4_write_reg(adap, TP_MTU_TABLE_A, + MTUINDEX_V(0xff) | MTUVALUE_V(i)); + v = t4_read_reg(adap, TP_MTU_TABLE_A); + mtus[i] = MTUVALUE_G(v); + if (mtu_log) + mtu_log[i] = MTUWIDTH_G(v); + } +} + +/** + * t4_read_cong_tbl - reads the congestion control table + * @adap: the adapter + * @incr: where to store the alpha values + * + * Reads the additive increments programmed into the HW congestion + * control table. + */ +void t4_read_cong_tbl(struct adapter *adap, u16 incr[NMTUS][NCCTRL_WIN]) +{ + unsigned int mtu, w; + + for (mtu = 0; mtu < NMTUS; ++mtu) + for (w = 0; w < NCCTRL_WIN; ++w) { + t4_write_reg(adap, TP_CCTRL_TABLE_A, + ROWINDEX_V(0xffff) | (mtu << 5) | w); + incr[mtu][w] = (u16)t4_read_reg(adap, + TP_CCTRL_TABLE_A) & 0x1fff; + } +} + +/** + * t4_tp_wr_bits_indirect - set/clear bits in an indirect TP register + * @adap: the adapter + * @addr: the indirect TP register address + * @mask: specifies the field within the register to modify + * @val: new value for the field + * + * Sets a field of an indirect TP register to the given value. 
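+ *
+ * Editorial note, not part of the original patch: this is a plain
+ * read-modify-write through the TP PIO window.  With made-up values, if
+ * the register currently holds 0x12345678 then
+ *
+ *      t4_tp_wr_bits_indirect(adap, addr, 0x0000ff00, 0x00003400);
+ *
+ * leaves 0x12343478 behind: only the bits selected by @mask are replaced
+ * and everything else is preserved.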
+ */ +void t4_tp_wr_bits_indirect(struct adapter *adap, unsigned int addr, + unsigned int mask, unsigned int val) +{ + t4_write_reg(adap, TP_PIO_ADDR_A, addr); + val |= t4_read_reg(adap, TP_PIO_DATA_A) & ~mask; + t4_write_reg(adap, TP_PIO_DATA_A, val); +} + +/** + * init_cong_ctrl - initialize congestion control parameters + * @a: the alpha values for congestion control + * @b: the beta values for congestion control + * + * Initialize the congestion control parameters. + */ +static void init_cong_ctrl(unsigned short *a, unsigned short *b) +{ + a[0] = a[1] = a[2] = a[3] = a[4] = a[5] = a[6] = a[7] = a[8] = 1; + a[9] = 2; + a[10] = 3; + a[11] = 4; + a[12] = 5; + a[13] = 6; + a[14] = 7; + a[15] = 8; + a[16] = 9; + a[17] = 10; + a[18] = 14; + a[19] = 17; + a[20] = 21; + a[21] = 25; + a[22] = 30; + a[23] = 35; + a[24] = 45; + a[25] = 60; + a[26] = 80; + a[27] = 100; + a[28] = 200; + a[29] = 300; + a[30] = 400; + a[31] = 500; + + b[0] = b[1] = b[2] = b[3] = b[4] = b[5] = b[6] = b[7] = b[8] = 0; + b[9] = b[10] = 1; + b[11] = b[12] = 2; + b[13] = b[14] = b[15] = b[16] = 3; + b[17] = b[18] = b[19] = b[20] = b[21] = 4; + b[22] = b[23] = b[24] = b[25] = b[26] = b[27] = 5; + b[28] = b[29] = 6; + b[30] = b[31] = 7; +} + +/* The minimum additive increment value for the congestion control table */ +#define CC_MIN_INCR 2U + +/** + * t4_load_mtus - write the MTU and congestion control HW tables + * @adap: the adapter + * @mtus: the values for the MTU table + * @alpha: the values for the congestion control alpha parameter + * @beta: the values for the congestion control beta parameter + * + * Write the HW MTU table with the supplied MTUs and the high-speed + * congestion control table with the supplied alpha, beta, and MTUs. + * We write the two tables together because the additive increments + * depend on the MTUs. + */ +void t4_load_mtus(struct adapter *adap, const unsigned short *mtus, + const unsigned short *alpha, const unsigned short *beta) +{ + static const unsigned int avg_pkts[NCCTRL_WIN] = { + 2, 6, 10, 14, 20, 28, 40, 56, 80, 112, 160, 224, 320, 448, 640, + 896, 1281, 1792, 2560, 3584, 5120, 7168, 10240, 14336, 20480, + 28672, 40960, 57344, 81920, 114688, 163840, 229376 + }; + + unsigned int i, w; + + for (i = 0; i < NMTUS; ++i) { + unsigned int mtu = mtus[i]; + unsigned int log2 = fls(mtu); + + if (!(mtu & ((1 << log2) >> 2))) /* round */ + log2--; + t4_write_reg(adap, TP_MTU_TABLE_A, MTUINDEX_V(i) | + MTUWIDTH_V(log2) | MTUVALUE_V(mtu)); + + for (w = 0; w < NCCTRL_WIN; ++w) { + unsigned int inc; + + inc = max(((mtu - 40) * alpha[w]) / avg_pkts[w], + CC_MIN_INCR); + + t4_write_reg(adap, TP_CCTRL_TABLE_A, (i << 21) | + (w << 16) | (beta[w] << 13) | inc); + } + } +} + +/* Calculates a rate in bytes/s given the number of 256-byte units per 4K core + * clocks. The formula is + * + * bytes/s = bytes256 * 256 * ClkFreq / 4096 + * + * which is equivalent to + * + * bytes/s = 62.5 * bytes256 * ClkFreq_ms + */ +static u64 chan_rate(struct adapter *adap, unsigned int bytes256) +{ + u64 v = bytes256 * adap->params.vpd.cclk; + + return v * 62 + v / 2; +} + +/** + * t4_get_chan_txrate - get the current per channel Tx rates + * @adap: the adapter + * @nic_rate: rates for NIC traffic + * @ofld_rate: rates for offloaded traffic + * + * Return the current Tx rates in bytes/s for NIC and offloaded traffic + * for each channel. 
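+ *
+ * Editorial note, not part of the original patch: the conversion done by
+ * chan_rate() above can be checked by hand.  Hardware reports the rate as
+ * 256-byte units per 4096 core clocks, i.e.
+ *
+ *      bytes/s = bytes256 * 256 * ClkFreq / 4096 = 62.5 * bytes256 * ClkFreq_ms
+ *
+ * With an illustrative core clock of 250,000 ticks/ms (250 MHz) and
+ * bytes256 = 4 that is 62.5 * 4 * 250,000 = 62,500,000 bytes/s; the integer
+ * form used in the code, v * 62 + v / 2 with v = bytes256 * cclk, yields
+ * the same 62,500,000 without floating point.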
+ */ +void t4_get_chan_txrate(struct adapter *adap, u64 *nic_rate, u64 *ofld_rate) +{ + u32 v; + + v = t4_read_reg(adap, TP_TX_TRATE_A); + nic_rate[0] = chan_rate(adap, TNLRATE0_G(v)); + nic_rate[1] = chan_rate(adap, TNLRATE1_G(v)); + if (adap->params.arch.nchan == NCHAN) { + nic_rate[2] = chan_rate(adap, TNLRATE2_G(v)); + nic_rate[3] = chan_rate(adap, TNLRATE3_G(v)); + } + + v = t4_read_reg(adap, TP_TX_ORATE_A); + ofld_rate[0] = chan_rate(adap, OFDRATE0_G(v)); + ofld_rate[1] = chan_rate(adap, OFDRATE1_G(v)); + if (adap->params.arch.nchan == NCHAN) { + ofld_rate[2] = chan_rate(adap, OFDRATE2_G(v)); + ofld_rate[3] = chan_rate(adap, OFDRATE3_G(v)); + } +} + +/** + * t4_set_trace_filter - configure one of the tracing filters + * @adap: the adapter + * @tp: the desired trace filter parameters + * @idx: which filter to configure + * @enable: whether to enable or disable the filter + * + * Configures one of the tracing filters available in HW. If @enable is + * %0 @tp is not examined and may be %NULL. The user is responsible to + * set the single/multiple trace mode by writing to MPS_TRC_CFG_A register + */ +int t4_set_trace_filter(struct adapter *adap, const struct trace_params *tp, + int idx, int enable) +{ + int i, ofst = idx * 4; + u32 data_reg, mask_reg, cfg; + u32 multitrc = TRCMULTIFILTER_F; + + if (!enable) { + t4_write_reg(adap, MPS_TRC_FILTER_MATCH_CTL_A_A + ofst, 0); + return 0; + } + + cfg = t4_read_reg(adap, MPS_TRC_CFG_A); + if (cfg & TRCMULTIFILTER_F) { + /* If multiple tracers are enabled, then maximum + * capture size is 2.5KB (FIFO size of a single channel) + * minus 2 flits for CPL_TRACE_PKT header. + */ + if (tp->snap_len > ((10 * 1024 / 4) - (2 * 8))) + return -EINVAL; + } else { + /* If multiple tracers are disabled, to avoid deadlocks + * maximum packet capture size of 9600 bytes is recommended. + * Also in this mode, only trace0 can be enabled and running. + */ + multitrc = 0; + if (tp->snap_len > 9600 || idx) + return -EINVAL; + } + + if (tp->port > (is_t4(adap->params.chip) ? 11 : 19) || tp->invert > 1 || + tp->skip_len > TFLENGTH_M || tp->skip_ofst > TFOFFSET_M || + tp->min_len > TFMINPKTSIZE_M) + return -EINVAL; + + /* stop the tracer we'll be changing */ + t4_write_reg(adap, MPS_TRC_FILTER_MATCH_CTL_A_A + ofst, 0); + + idx *= (MPS_TRC_FILTER1_MATCH_A - MPS_TRC_FILTER0_MATCH_A); + data_reg = MPS_TRC_FILTER0_MATCH_A + idx; + mask_reg = MPS_TRC_FILTER0_DONT_CARE_A + idx; + + for (i = 0; i < TRACE_LEN / 4; i++, data_reg += 4, mask_reg += 4) { + t4_write_reg(adap, data_reg, tp->data[i]); + t4_write_reg(adap, mask_reg, ~tp->mask[i]); + } + t4_write_reg(adap, MPS_TRC_FILTER_MATCH_CTL_B_A + ofst, + TFCAPTUREMAX_V(tp->snap_len) | + TFMINPKTSIZE_V(tp->min_len)); + t4_write_reg(adap, MPS_TRC_FILTER_MATCH_CTL_A_A + ofst, + TFOFFSET_V(tp->skip_ofst) | TFLENGTH_V(tp->skip_len) | + (is_t4(adap->params.chip) ? + TFPORT_V(tp->port) | TFEN_F | TFINVERTMATCH_V(tp->invert) : + T5_TFPORT_V(tp->port) | T5_TFEN_F | + T5_TFINVERTMATCH_V(tp->invert))); + + return 0; +} + +/** + * t4_get_trace_filter - query one of the tracing filters + * @adap: the adapter + * @tp: the current trace filter parameters + * @idx: which trace filter to query + * @enabled: non-zero if the filter is enabled + * + * Returns the current settings of one of the HW tracing filters. 
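+ *
+ * Editorial illustration, not part of the original patch: a sketch of a
+ * hypothetical caller (an initialized "adap" is assumed) checking whether
+ * tracer 0 is active.  Note that the DONT_CARE registers hold the
+ * complement of the match mask, which is why both the setter above and
+ * this getter invert the value on the way through:
+ *
+ *      struct trace_params tp;
+ *      int enabled;
+ *
+ *      t4_get_trace_filter(adap, &tp, 0, &enabled);
+ *      if (enabled)
+ *              dev_info(adap->pdev_dev, "tracer 0: port %u, snap_len %u\n",
+ *                       tp.port, tp.snap_len);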
+ */ +void t4_get_trace_filter(struct adapter *adap, struct trace_params *tp, int idx, + int *enabled) +{ + u32 ctla, ctlb; + int i, ofst = idx * 4; + u32 data_reg, mask_reg; + + ctla = t4_read_reg(adap, MPS_TRC_FILTER_MATCH_CTL_A_A + ofst); + ctlb = t4_read_reg(adap, MPS_TRC_FILTER_MATCH_CTL_B_A + ofst); + + if (is_t4(adap->params.chip)) { + *enabled = !!(ctla & TFEN_F); + tp->port = TFPORT_G(ctla); + tp->invert = !!(ctla & TFINVERTMATCH_F); + } else { + *enabled = !!(ctla & T5_TFEN_F); + tp->port = T5_TFPORT_G(ctla); + tp->invert = !!(ctla & T5_TFINVERTMATCH_F); + } + tp->snap_len = TFCAPTUREMAX_G(ctlb); + tp->min_len = TFMINPKTSIZE_G(ctlb); + tp->skip_ofst = TFOFFSET_G(ctla); + tp->skip_len = TFLENGTH_G(ctla); + + ofst = (MPS_TRC_FILTER1_MATCH_A - MPS_TRC_FILTER0_MATCH_A) * idx; + data_reg = MPS_TRC_FILTER0_MATCH_A + ofst; + mask_reg = MPS_TRC_FILTER0_DONT_CARE_A + ofst; + + for (i = 0; i < TRACE_LEN / 4; i++, data_reg += 4, mask_reg += 4) { + tp->mask[i] = ~t4_read_reg(adap, mask_reg); + tp->data[i] = t4_read_reg(adap, data_reg) & tp->mask[i]; + } +} + +/** + * t4_pmtx_get_stats - returns the HW stats from PMTX + * @adap: the adapter + * @cnt: where to store the count statistics + * @cycles: where to store the cycle statistics + * + * Returns performance statistics from PMTX. + */ +void t4_pmtx_get_stats(struct adapter *adap, u32 cnt[], u64 cycles[]) +{ + int i; + u32 data[2]; + + for (i = 0; i < adap->params.arch.pm_stats_cnt; i++) { + t4_write_reg(adap, PM_TX_STAT_CONFIG_A, i + 1); + cnt[i] = t4_read_reg(adap, PM_TX_STAT_COUNT_A); + if (is_t4(adap->params.chip)) { + cycles[i] = t4_read_reg64(adap, PM_TX_STAT_LSB_A); + } else { + t4_read_indirect(adap, PM_TX_DBG_CTRL_A, + PM_TX_DBG_DATA_A, data, 2, + PM_TX_DBG_STAT_MSB_A); + cycles[i] = (((u64)data[0] << 32) | data[1]); + } + } +} + +/** + * t4_pmrx_get_stats - returns the HW stats from PMRX + * @adap: the adapter + * @cnt: where to store the count statistics + * @cycles: where to store the cycle statistics + * + * Returns performance statistics from PMRX. + */ +void t4_pmrx_get_stats(struct adapter *adap, u32 cnt[], u64 cycles[]) +{ + int i; + u32 data[2]; + + for (i = 0; i < adap->params.arch.pm_stats_cnt; i++) { + t4_write_reg(adap, PM_RX_STAT_CONFIG_A, i + 1); + cnt[i] = t4_read_reg(adap, PM_RX_STAT_COUNT_A); + if (is_t4(adap->params.chip)) { + cycles[i] = t4_read_reg64(adap, PM_RX_STAT_LSB_A); + } else { + t4_read_indirect(adap, PM_RX_DBG_CTRL_A, + PM_RX_DBG_DATA_A, data, 2, + PM_RX_DBG_STAT_MSB_A); + cycles[i] = (((u64)data[0] << 32) | data[1]); + } + } +} + +/** + * compute_mps_bg_map - compute the MPS Buffer Group Map for a Port + * @adap: the adapter + * @pidx: the port index + * + * Computes and returns a bitmap indicating which MPS buffer groups are + * associated with the given Port. Bit i is set if buffer group i is + * used by the Port. 
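+ *
+ * Editorial note, not part of the original patch: concretely, the values
+ * computed below work out to (listed per port index):
+ *
+ *      T4/T5, 1 port:  0xf                (all four buffer groups)
+ *      T4/T5, 2 ports: 0x3, 0xc           (two buffer groups per port)
+ *      T4/T5, 4 ports: 0x1, 0x2, 0x4, 0x8
+ *      T6,    2 ports: 0x1, 0x4           (one buffer group per port)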
+ */ +static inline unsigned int compute_mps_bg_map(struct adapter *adapter, + int pidx) +{ + unsigned int chip_version, nports; + + chip_version = CHELSIO_CHIP_VERSION(adapter->params.chip); + nports = 1 << NUMPORTS_G(t4_read_reg(adapter, MPS_CMN_CTL_A)); + + switch (chip_version) { + case CHELSIO_T4: + case CHELSIO_T5: + switch (nports) { + case 1: return 0xf; + case 2: return 3 << (2 * pidx); + case 4: return 1 << pidx; + } + break; + + case CHELSIO_T6: + switch (nports) { + case 2: return 1 << (2 * pidx); + } + break; + } + + dev_err(adapter->pdev_dev, "Need MPS Buffer Group Map for Chip %0x, Nports %d\n", + chip_version, nports); + + return 0; +} + +/** + * t4_get_mps_bg_map - return the buffer groups associated with a port + * @adapter: the adapter + * @pidx: the port index + * + * Returns a bitmap indicating which MPS buffer groups are associated + * with the given Port. Bit i is set if buffer group i is used by the + * Port. + */ +unsigned int t4_get_mps_bg_map(struct adapter *adapter, int pidx) +{ + u8 *mps_bg_map; + unsigned int nports; + + nports = 1 << NUMPORTS_G(t4_read_reg(adapter, MPS_CMN_CTL_A)); + if (pidx >= nports) { + CH_WARN(adapter, "MPS Port Index %d >= Nports %d\n", + pidx, nports); + return 0; + } + + /* If we've already retrieved/computed this, just return the result. + */ + mps_bg_map = adapter->params.mps_bg_map; + if (mps_bg_map[pidx]) + return mps_bg_map[pidx]; + + /* Newer Firmware can tell us what the MPS Buffer Group Map is. + * If we're talking to such Firmware, let it tell us. If the new + * API isn't supported, fall back to the old hardcoded way. The value + * obtained from Firmware is encoded in the format below: + * + * val = (( MPSBGMAP[Port 3] << 24 ) | + * ( MPSBGMAP[Port 2] << 16 ) | + * ( MPSBGMAP[Port 1] << 8 ) | + * ( MPSBGMAP[Port 0] << 0 )) + */ + if (adapter->flags & FW_OK) { + u32 param, val; + int ret; + + param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | + FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_MPSBGMAP)); + ret = t4_query_params_ns(adapter, adapter->mbox, adapter->pf, + 0, 1, &param, &val); + if (!ret) { + int p; + + /* Store the BG Map for all of the Ports in order to + * avoid more calls to the Firmware in the future. + */ + for (p = 0; p < MAX_NPORTS; p++, val >>= 8) + mps_bg_map[p] = val & 0xff; + + return mps_bg_map[pidx]; + } + } + + /* Either we're not talking to the Firmware or we're dealing with + * older Firmware which doesn't support the new API to get the MPS + * Buffer Group Map. Fall back to computing it ourselves. + */ + mps_bg_map[pidx] = compute_mps_bg_map(adapter, pidx); + return mps_bg_map[pidx]; +} + +/** + * t4_get_tp_ch_map - return TP ingress channels associated with a port + * @adap: the adapter + * @pidx: the port index + * + * Returns a bitmap indicating which TP Ingress Channels are associated + * with a given Port. Bit i is set if TP Ingress Channel i is used by + * the Port. + */ +unsigned int t4_get_tp_ch_map(struct adapter *adap, int pidx) +{ + unsigned int chip_version = CHELSIO_CHIP_VERSION(adap->params.chip); + unsigned int nports = 1 << NUMPORTS_G(t4_read_reg(adap, MPS_CMN_CTL_A)); + + if (pidx >= nports) { + dev_warn(adap->pdev_dev, "TP Port Index %d >= Nports %d\n", + pidx, nports); + return 0; + } + + switch (chip_version) { + case CHELSIO_T4: + case CHELSIO_T5: + /* Note that this happens to be the same values as the MPS + * Buffer Group Map for these Chips. But we replicate the code + * here because they're really separate concepts.
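+ *
+ * Editorial note, not part of the original patch: for T6 the two maps do
+ * diverge; with two ports the TP Ingress Channel map below is 0x1/0x2
+ * per port, while the MPS Buffer Group map above is 0x1/0x4.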
+ */ + switch (nports) { + case 1: return 0xf; + case 2: return 3 << (2 * pidx); + case 4: return 1 << pidx; + } + break; + + case CHELSIO_T6: + switch (nports) { + case 1: + case 2: return 1 << pidx; + } + break; + } + + dev_err(adap->pdev_dev, "Need TP Channel Map for Chip %0x, Nports %d\n", + chip_version, nports); + return 0; +} + +/** + * t4_get_port_type_description - return Port Type string description + * @port_type: firmware Port Type enumeration + */ +const char *t4_get_port_type_description(enum fw_port_type port_type) +{ + static const char *const port_type_description[] = { + "Fiber_XFI", + "Fiber_XAUI", + "BT_SGMII", + "BT_XFI", + "BT_XAUI", + "KX4", + "CX4", + "KX", + "KR", + "SFP", + "BP_AP", + "BP4_AP", + "QSFP_10G", + "QSA", + "QSFP", + "BP40_BA", + "KR4_100G", + "CR4_QSFP", + "CR_QSFP", + "CR2_QSFP", + "SFP28", + "KR_SFP28", + "KR_XLAUI" + }; + + if (port_type < ARRAY_SIZE(port_type_description)) + return port_type_description[port_type]; + return "UNKNOWN"; +} + +/** + * t4_get_port_stats_offset - collect port stats relative to a previous + * snapshot + * @adap: The adapter + * @idx: The port + * @stats: Current stats to fill + * @offset: Previous stats snapshot + */ +void t4_get_port_stats_offset(struct adapter *adap, int idx, + struct port_stats *stats, + struct port_stats *offset) +{ + u64 *s, *o; + int i; + + t4_get_port_stats(adap, idx, stats); + for (i = 0, s = (u64 *)stats, o = (u64 *)offset; + i < (sizeof(struct port_stats) / sizeof(u64)); + i++, s++, o++) + *s -= *o; +} + +/** + * t4_get_port_stats - collect port statistics + * @adap: the adapter + * @idx: the port index + * @p: the stats structure to fill + * + * Collect statistics related to the given port from HW. + */ +void t4_get_port_stats(struct adapter *adap, int idx, struct port_stats *p) +{ + u32 bgmap = t4_get_mps_bg_map(adap, idx); + u32 stat_ctl = t4_read_reg(adap, MPS_STAT_CTL_A); + +#define GET_STAT(name) \ + t4_read_reg64(adap, \ + (is_t4(adap->params.chip) ? 
PORT_REG(idx, MPS_PORT_STAT_##name##_L) : \ + T5_PORT_REG(idx, MPS_PORT_STAT_##name##_L))) +#define GET_STAT_COM(name) t4_read_reg64(adap, MPS_STAT_##name##_L) + + p->tx_octets = GET_STAT(TX_PORT_BYTES); + p->tx_frames = GET_STAT(TX_PORT_FRAMES); + p->tx_bcast_frames = GET_STAT(TX_PORT_BCAST); + p->tx_mcast_frames = GET_STAT(TX_PORT_MCAST); + p->tx_ucast_frames = GET_STAT(TX_PORT_UCAST); + p->tx_error_frames = GET_STAT(TX_PORT_ERROR); + p->tx_frames_64 = GET_STAT(TX_PORT_64B); + p->tx_frames_65_127 = GET_STAT(TX_PORT_65B_127B); + p->tx_frames_128_255 = GET_STAT(TX_PORT_128B_255B); + p->tx_frames_256_511 = GET_STAT(TX_PORT_256B_511B); + p->tx_frames_512_1023 = GET_STAT(TX_PORT_512B_1023B); + p->tx_frames_1024_1518 = GET_STAT(TX_PORT_1024B_1518B); + p->tx_frames_1519_max = GET_STAT(TX_PORT_1519B_MAX); + p->tx_drop = GET_STAT(TX_PORT_DROP); + p->tx_pause = GET_STAT(TX_PORT_PAUSE); + p->tx_ppp0 = GET_STAT(TX_PORT_PPP0); + p->tx_ppp1 = GET_STAT(TX_PORT_PPP1); + p->tx_ppp2 = GET_STAT(TX_PORT_PPP2); + p->tx_ppp3 = GET_STAT(TX_PORT_PPP3); + p->tx_ppp4 = GET_STAT(TX_PORT_PPP4); + p->tx_ppp5 = GET_STAT(TX_PORT_PPP5); + p->tx_ppp6 = GET_STAT(TX_PORT_PPP6); + p->tx_ppp7 = GET_STAT(TX_PORT_PPP7); + + if (CHELSIO_CHIP_VERSION(adap->params.chip) >= CHELSIO_T5) { + if (stat_ctl & COUNTPAUSESTATTX_F) + p->tx_frames_64 -= p->tx_pause; + if (stat_ctl & COUNTPAUSEMCTX_F) + p->tx_mcast_frames -= p->tx_pause; + } + p->rx_octets = GET_STAT(RX_PORT_BYTES); + p->rx_frames = GET_STAT(RX_PORT_FRAMES); + p->rx_bcast_frames = GET_STAT(RX_PORT_BCAST); + p->rx_mcast_frames = GET_STAT(RX_PORT_MCAST); + p->rx_ucast_frames = GET_STAT(RX_PORT_UCAST); + p->rx_too_long = GET_STAT(RX_PORT_MTU_ERROR); + p->rx_jabber = GET_STAT(RX_PORT_MTU_CRC_ERROR); + p->rx_fcs_err = GET_STAT(RX_PORT_CRC_ERROR); + p->rx_len_err = GET_STAT(RX_PORT_LEN_ERROR); + p->rx_symbol_err = GET_STAT(RX_PORT_SYM_ERROR); + p->rx_runt = GET_STAT(RX_PORT_LESS_64B); + p->rx_frames_64 = GET_STAT(RX_PORT_64B); + p->rx_frames_65_127 = GET_STAT(RX_PORT_65B_127B); + p->rx_frames_128_255 = GET_STAT(RX_PORT_128B_255B); + p->rx_frames_256_511 = GET_STAT(RX_PORT_256B_511B); + p->rx_frames_512_1023 = GET_STAT(RX_PORT_512B_1023B); + p->rx_frames_1024_1518 = GET_STAT(RX_PORT_1024B_1518B); + p->rx_frames_1519_max = GET_STAT(RX_PORT_1519B_MAX); + p->rx_pause = GET_STAT(RX_PORT_PAUSE); + p->rx_ppp0 = GET_STAT(RX_PORT_PPP0); + p->rx_ppp1 = GET_STAT(RX_PORT_PPP1); + p->rx_ppp2 = GET_STAT(RX_PORT_PPP2); + p->rx_ppp3 = GET_STAT(RX_PORT_PPP3); + p->rx_ppp4 = GET_STAT(RX_PORT_PPP4); + p->rx_ppp5 = GET_STAT(RX_PORT_PPP5); + p->rx_ppp6 = GET_STAT(RX_PORT_PPP6); + p->rx_ppp7 = GET_STAT(RX_PORT_PPP7); + + if (CHELSIO_CHIP_VERSION(adap->params.chip) >= CHELSIO_T5) { + if (stat_ctl & COUNTPAUSESTATRX_F) + p->rx_frames_64 -= p->rx_pause; + if (stat_ctl & COUNTPAUSEMCRX_F) + p->rx_mcast_frames -= p->rx_pause; + } + + p->rx_ovflow0 = (bgmap & 1) ? GET_STAT_COM(RX_BG_0_MAC_DROP_FRAME) : 0; + p->rx_ovflow1 = (bgmap & 2) ? GET_STAT_COM(RX_BG_1_MAC_DROP_FRAME) : 0; + p->rx_ovflow2 = (bgmap & 4) ? GET_STAT_COM(RX_BG_2_MAC_DROP_FRAME) : 0; + p->rx_ovflow3 = (bgmap & 8) ? GET_STAT_COM(RX_BG_3_MAC_DROP_FRAME) : 0; + p->rx_trunc0 = (bgmap & 1) ? GET_STAT_COM(RX_BG_0_MAC_TRUNC_FRAME) : 0; + p->rx_trunc1 = (bgmap & 2) ? GET_STAT_COM(RX_BG_1_MAC_TRUNC_FRAME) : 0; + p->rx_trunc2 = (bgmap & 4) ? GET_STAT_COM(RX_BG_2_MAC_TRUNC_FRAME) : 0; + p->rx_trunc3 = (bgmap & 8) ? 
GET_STAT_COM(RX_BG_3_MAC_TRUNC_FRAME) : 0; + +#undef GET_STAT +#undef GET_STAT_COM +} + +/** + * t4_get_lb_stats - collect loopback port statistics + * @adap: the adapter + * @idx: the loopback port index + * @p: the stats structure to fill + * + * Return HW statistics for the given loopback port. + */ +void t4_get_lb_stats(struct adapter *adap, int idx, struct lb_port_stats *p) +{ + u32 bgmap = t4_get_mps_bg_map(adap, idx); + +#define GET_STAT(name) \ + t4_read_reg64(adap, \ + (is_t4(adap->params.chip) ? \ + PORT_REG(idx, MPS_PORT_STAT_LB_PORT_##name##_L) : \ + T5_PORT_REG(idx, MPS_PORT_STAT_LB_PORT_##name##_L))) +#define GET_STAT_COM(name) t4_read_reg64(adap, MPS_STAT_##name##_L) + + p->octets = GET_STAT(BYTES); + p->frames = GET_STAT(FRAMES); + p->bcast_frames = GET_STAT(BCAST); + p->mcast_frames = GET_STAT(MCAST); + p->ucast_frames = GET_STAT(UCAST); + p->error_frames = GET_STAT(ERROR); + + p->frames_64 = GET_STAT(64B); + p->frames_65_127 = GET_STAT(65B_127B); + p->frames_128_255 = GET_STAT(128B_255B); + p->frames_256_511 = GET_STAT(256B_511B); + p->frames_512_1023 = GET_STAT(512B_1023B); + p->frames_1024_1518 = GET_STAT(1024B_1518B); + p->frames_1519_max = GET_STAT(1519B_MAX); + p->drop = GET_STAT(DROP_FRAMES); + + p->ovflow0 = (bgmap & 1) ? GET_STAT_COM(RX_BG_0_LB_DROP_FRAME) : 0; + p->ovflow1 = (bgmap & 2) ? GET_STAT_COM(RX_BG_1_LB_DROP_FRAME) : 0; + p->ovflow2 = (bgmap & 4) ? GET_STAT_COM(RX_BG_2_LB_DROP_FRAME) : 0; + p->ovflow3 = (bgmap & 8) ? GET_STAT_COM(RX_BG_3_LB_DROP_FRAME) : 0; + p->trunc0 = (bgmap & 1) ? GET_STAT_COM(RX_BG_0_LB_TRUNC_FRAME) : 0; + p->trunc1 = (bgmap & 2) ? GET_STAT_COM(RX_BG_1_LB_TRUNC_FRAME) : 0; + p->trunc2 = (bgmap & 4) ? GET_STAT_COM(RX_BG_2_LB_TRUNC_FRAME) : 0; + p->trunc3 = (bgmap & 8) ? GET_STAT_COM(RX_BG_3_LB_TRUNC_FRAME) : 0; + +#undef GET_STAT +#undef GET_STAT_COM +} + +/* t4_mk_filtdelwr - create a delete filter WR + * @ftid: the filter ID + * @wr: the filter work request to populate + * @qid: ingress queue to receive the delete notification + * + * Creates a filter work request to delete the supplied filter. If @qid is + * negative the delete notification is suppressed. 
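+ *
+ * Editorial illustration, not part of the original patch: a sketch of a
+ * hypothetical caller deleting filter "ftid" and steering the completion
+ * to ingress queue "rspq_id" (both names are assumptions):
+ *
+ *      struct fw_filter_wr wr;
+ *
+ *      t4_mk_filtdelwr(ftid, &wr, rspq_id);
+ *
+ * after which wr is ready to be handed to the firmware; passing a negative
+ * qid instead suppresses the delete notification.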
+ */ +void t4_mk_filtdelwr(unsigned int ftid, struct fw_filter_wr *wr, int qid) +{ + memset(wr, 0, sizeof(*wr)); + wr->op_pkd = cpu_to_be32(FW_WR_OP_V(FW_FILTER_WR)); + wr->len16_pkd = cpu_to_be32(FW_WR_LEN16_V(sizeof(*wr) / 16)); + wr->tid_to_iq = cpu_to_be32(FW_FILTER_WR_TID_V(ftid) | + FW_FILTER_WR_NOREPLY_V(qid < 0)); + wr->del_filter_to_l2tix = cpu_to_be32(FW_FILTER_WR_DEL_FILTER_F); + if (qid >= 0) + wr->rx_chan_rx_rpl_iq = + cpu_to_be16(FW_FILTER_WR_RX_RPL_IQ_V(qid)); +} + +#define INIT_CMD(var, cmd, rd_wr) do { \ + (var).op_to_write = cpu_to_be32(FW_CMD_OP_V(FW_##cmd##_CMD) | \ + FW_CMD_REQUEST_F | \ + FW_CMD_##rd_wr##_F); \ + (var).retval_len16 = cpu_to_be32(FW_LEN16(var)); \ +} while (0) + +int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox, + u32 addr, u32 val) +{ + u32 ldst_addrspace; + struct fw_ldst_cmd c; + + memset(&c, 0, sizeof(c)); + ldst_addrspace = FW_LDST_CMD_ADDRSPACE_V(FW_LDST_ADDRSPC_FIRMWARE); + c.op_to_addrspace = cpu_to_be32(FW_CMD_OP_V(FW_LDST_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + ldst_addrspace); + c.cycles_to_len16 = cpu_to_be32(FW_LEN16(c)); + c.u.addrval.addr = cpu_to_be32(addr); + c.u.addrval.val = cpu_to_be32(val); + + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_mdio_rd - read a PHY register through MDIO + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @phy_addr: the PHY address + * @mmd: the PHY MMD to access (0 for clause 22 PHYs) + * @reg: the register to read + * @valp: where to store the value + * + * Issues a FW command through the given mailbox to read a PHY register. + */ +int t4_mdio_rd(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, + unsigned int mmd, unsigned int reg, u16 *valp) +{ + int ret; + u32 ldst_addrspace; + struct fw_ldst_cmd c; + + memset(&c, 0, sizeof(c)); + ldst_addrspace = FW_LDST_CMD_ADDRSPACE_V(FW_LDST_ADDRSPC_MDIO); + c.op_to_addrspace = cpu_to_be32(FW_CMD_OP_V(FW_LDST_CMD) | + FW_CMD_REQUEST_F | FW_CMD_READ_F | + ldst_addrspace); + c.cycles_to_len16 = cpu_to_be32(FW_LEN16(c)); + c.u.mdio.paddr_mmd = cpu_to_be16(FW_LDST_CMD_PADDR_V(phy_addr) | + FW_LDST_CMD_MMD_V(mmd)); + c.u.mdio.raddr = cpu_to_be16(reg); + + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + if (ret == 0) + *valp = be16_to_cpu(c.u.mdio.rval); + return ret; +} + +/** + * t4_mdio_wr - write a PHY register through MDIO + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @phy_addr: the PHY address + * @mmd: the PHY MMD to access (0 for clause 22 PHYs) + * @reg: the register to write + * @valp: value to write + * + * Issues a FW command through the given mailbox to write a PHY register. 
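+ *
+ * Editorial illustration, not part of the original patch: a sketch of a
+ * hypothetical read-modify-write of a PHY register, using the generic
+ * constants from <linux/mdio.h> ("adap" and "phy_addr" are assumed):
+ *
+ *      u16 v;
+ *      int ret;
+ *
+ *      ret = t4_mdio_rd(adap, adap->mbox, phy_addr, MDIO_MMD_PMAPMD,
+ *                       MDIO_CTRL1, &v);
+ *      if (!ret)
+ *              ret = t4_mdio_wr(adap, adap->mbox, phy_addr, MDIO_MMD_PMAPMD,
+ *                               MDIO_CTRL1, v | MDIO_CTRL1_RESET);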
+ */ +int t4_mdio_wr(struct adapter *adap, unsigned int mbox, unsigned int phy_addr, + unsigned int mmd, unsigned int reg, u16 val) +{ + u32 ldst_addrspace; + struct fw_ldst_cmd c; + + memset(&c, 0, sizeof(c)); + ldst_addrspace = FW_LDST_CMD_ADDRSPACE_V(FW_LDST_ADDRSPC_MDIO); + c.op_to_addrspace = cpu_to_be32(FW_CMD_OP_V(FW_LDST_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F | + ldst_addrspace); + c.cycles_to_len16 = cpu_to_be32(FW_LEN16(c)); + c.u.mdio.paddr_mmd = cpu_to_be16(FW_LDST_CMD_PADDR_V(phy_addr) | + FW_LDST_CMD_MMD_V(mmd)); + c.u.mdio.raddr = cpu_to_be16(reg); + c.u.mdio.rval = cpu_to_be16(val); + + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_sge_decode_idma_state - decode the idma state + * @adap: the adapter + * @state: the state idma is stuck in + */ +void t4_sge_decode_idma_state(struct adapter *adapter, int state) +{ + static const char * const t4_decode[] = { + "IDMA_IDLE", + "IDMA_PUSH_MORE_CPL_FIFO", + "IDMA_PUSH_CPL_MSG_HEADER_TO_FIFO", + "Not used", + "IDMA_PHYSADDR_SEND_PCIEHDR", + "IDMA_PHYSADDR_SEND_PAYLOAD_FIRST", + "IDMA_PHYSADDR_SEND_PAYLOAD", + "IDMA_SEND_FIFO_TO_IMSG", + "IDMA_FL_REQ_DATA_FL_PREP", + "IDMA_FL_REQ_DATA_FL", + "IDMA_FL_DROP", + "IDMA_FL_H_REQ_HEADER_FL", + "IDMA_FL_H_SEND_PCIEHDR", + "IDMA_FL_H_PUSH_CPL_FIFO", + "IDMA_FL_H_SEND_CPL", + "IDMA_FL_H_SEND_IP_HDR_FIRST", + "IDMA_FL_H_SEND_IP_HDR", + "IDMA_FL_H_REQ_NEXT_HEADER_FL", + "IDMA_FL_H_SEND_NEXT_PCIEHDR", + "IDMA_FL_H_SEND_IP_HDR_PADDING", + "IDMA_FL_D_SEND_PCIEHDR", + "IDMA_FL_D_SEND_CPL_AND_IP_HDR", + "IDMA_FL_D_REQ_NEXT_DATA_FL", + "IDMA_FL_SEND_PCIEHDR", + "IDMA_FL_PUSH_CPL_FIFO", + "IDMA_FL_SEND_CPL", + "IDMA_FL_SEND_PAYLOAD_FIRST", + "IDMA_FL_SEND_PAYLOAD", + "IDMA_FL_REQ_NEXT_DATA_FL", + "IDMA_FL_SEND_NEXT_PCIEHDR", + "IDMA_FL_SEND_PADDING", + "IDMA_FL_SEND_COMPLETION_TO_IMSG", + "IDMA_FL_SEND_FIFO_TO_IMSG", + "IDMA_FL_REQ_DATAFL_DONE", + "IDMA_FL_REQ_HEADERFL_DONE", + }; + static const char * const t5_decode[] = { + "IDMA_IDLE", + "IDMA_ALMOST_IDLE", + "IDMA_PUSH_MORE_CPL_FIFO", + "IDMA_PUSH_CPL_MSG_HEADER_TO_FIFO", + "IDMA_SGEFLRFLUSH_SEND_PCIEHDR", + "IDMA_PHYSADDR_SEND_PCIEHDR", + "IDMA_PHYSADDR_SEND_PAYLOAD_FIRST", + "IDMA_PHYSADDR_SEND_PAYLOAD", + "IDMA_SEND_FIFO_TO_IMSG", + "IDMA_FL_REQ_DATA_FL", + "IDMA_FL_DROP", + "IDMA_FL_DROP_SEND_INC", + "IDMA_FL_H_REQ_HEADER_FL", + "IDMA_FL_H_SEND_PCIEHDR", + "IDMA_FL_H_PUSH_CPL_FIFO", + "IDMA_FL_H_SEND_CPL", + "IDMA_FL_H_SEND_IP_HDR_FIRST", + "IDMA_FL_H_SEND_IP_HDR", + "IDMA_FL_H_REQ_NEXT_HEADER_FL", + "IDMA_FL_H_SEND_NEXT_PCIEHDR", + "IDMA_FL_H_SEND_IP_HDR_PADDING", + "IDMA_FL_D_SEND_PCIEHDR", + "IDMA_FL_D_SEND_CPL_AND_IP_HDR", + "IDMA_FL_D_REQ_NEXT_DATA_FL", + "IDMA_FL_SEND_PCIEHDR", + "IDMA_FL_PUSH_CPL_FIFO", + "IDMA_FL_SEND_CPL", + "IDMA_FL_SEND_PAYLOAD_FIRST", + "IDMA_FL_SEND_PAYLOAD", + "IDMA_FL_REQ_NEXT_DATA_FL", + "IDMA_FL_SEND_NEXT_PCIEHDR", + "IDMA_FL_SEND_PADDING", + "IDMA_FL_SEND_COMPLETION_TO_IMSG", + }; + static const char * const t6_decode[] = { + "IDMA_IDLE", + "IDMA_PUSH_MORE_CPL_FIFO", + "IDMA_PUSH_CPL_MSG_HEADER_TO_FIFO", + "IDMA_SGEFLRFLUSH_SEND_PCIEHDR", + "IDMA_PHYSADDR_SEND_PCIEHDR", + "IDMA_PHYSADDR_SEND_PAYLOAD_FIRST", + "IDMA_PHYSADDR_SEND_PAYLOAD", + "IDMA_FL_REQ_DATA_FL", + "IDMA_FL_DROP", + "IDMA_FL_DROP_SEND_INC", + "IDMA_FL_H_REQ_HEADER_FL", + "IDMA_FL_H_SEND_PCIEHDR", + "IDMA_FL_H_PUSH_CPL_FIFO", + "IDMA_FL_H_SEND_CPL", + "IDMA_FL_H_SEND_IP_HDR_FIRST", + "IDMA_FL_H_SEND_IP_HDR", + "IDMA_FL_H_REQ_NEXT_HEADER_FL", + "IDMA_FL_H_SEND_NEXT_PCIEHDR", + "IDMA_FL_H_SEND_IP_HDR_PADDING", + 
"IDMA_FL_D_SEND_PCIEHDR", + "IDMA_FL_D_SEND_CPL_AND_IP_HDR", + "IDMA_FL_D_REQ_NEXT_DATA_FL", + "IDMA_FL_SEND_PCIEHDR", + "IDMA_FL_PUSH_CPL_FIFO", + "IDMA_FL_SEND_CPL", + "IDMA_FL_SEND_PAYLOAD_FIRST", + "IDMA_FL_SEND_PAYLOAD", + "IDMA_FL_REQ_NEXT_DATA_FL", + "IDMA_FL_SEND_NEXT_PCIEHDR", + "IDMA_FL_SEND_PADDING", + "IDMA_FL_SEND_COMPLETION_TO_IMSG", + }; + static const u32 sge_regs[] = { + SGE_DEBUG_DATA_LOW_INDEX_2_A, + SGE_DEBUG_DATA_LOW_INDEX_3_A, + SGE_DEBUG_DATA_HIGH_INDEX_10_A, + }; + const char **sge_idma_decode; + int sge_idma_decode_nstates; + int i; + unsigned int chip_version = CHELSIO_CHIP_VERSION(adapter->params.chip); + + /* Select the right set of decode strings to dump depending on the + * adapter chip type. + */ + switch (chip_version) { + case CHELSIO_T4: + sge_idma_decode = (const char **)t4_decode; + sge_idma_decode_nstates = ARRAY_SIZE(t4_decode); + break; + + case CHELSIO_T5: + sge_idma_decode = (const char **)t5_decode; + sge_idma_decode_nstates = ARRAY_SIZE(t5_decode); + break; + + case CHELSIO_T6: + sge_idma_decode = (const char **)t6_decode; + sge_idma_decode_nstates = ARRAY_SIZE(t6_decode); + break; + + default: + dev_err(adapter->pdev_dev, + "Unsupported chip version %d\n", chip_version); + return; + } + + if (is_t4(adapter->params.chip)) { + sge_idma_decode = (const char **)t4_decode; + sge_idma_decode_nstates = ARRAY_SIZE(t4_decode); + } else { + sge_idma_decode = (const char **)t5_decode; + sge_idma_decode_nstates = ARRAY_SIZE(t5_decode); + } + + if (state < sge_idma_decode_nstates) + CH_WARN(adapter, "idma state %s\n", sge_idma_decode[state]); + else + CH_WARN(adapter, "idma state %d unknown\n", state); + + for (i = 0; i < ARRAY_SIZE(sge_regs); i++) + CH_WARN(adapter, "SGE register %#x value %#x\n", + sge_regs[i], t4_read_reg(adapter, sge_regs[i])); +} + +/** + * t4_sge_ctxt_flush - flush the SGE context cache + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @ctx_type: Egress or Ingress + * + * Issues a FW command through the given mailbox to flush the + * SGE context cache. + */ +int t4_sge_ctxt_flush(struct adapter *adap, unsigned int mbox, int ctxt_type) +{ + int ret; + u32 ldst_addrspace; + struct fw_ldst_cmd c; + + memset(&c, 0, sizeof(c)); + ldst_addrspace = FW_LDST_CMD_ADDRSPACE_V(ctxt_type == CTXT_EGRESS ? + FW_LDST_ADDRSPC_SGE_EGRC : + FW_LDST_ADDRSPC_SGE_INGC); + c.op_to_addrspace = cpu_to_be32(FW_CMD_OP_V(FW_LDST_CMD) | + FW_CMD_REQUEST_F | FW_CMD_READ_F | + ldst_addrspace); + c.cycles_to_len16 = cpu_to_be32(FW_LEN16(c)); + c.u.idctxt.msg_ctxtflush = cpu_to_be32(FW_LDST_CMD_CTXTFLUSH_F); + + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + return ret; +} + +/** + * t4_fw_hello - establish communication with FW + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @evt_mbox: mailbox to receive async FW events + * @master: specifies the caller's willingness to be the device master + * @state: returns the current device state (if non-NULL) + * + * Issues a command to establish communication with FW. Returns either + * an error (negative integer) or the mailbox of the Master PF. 
+ */ +int t4_fw_hello(struct adapter *adap, unsigned int mbox, unsigned int evt_mbox, + enum dev_master master, enum dev_state *state) +{ + int ret; + struct fw_hello_cmd c; + u32 v; + unsigned int master_mbox; + int retries = FW_CMD_HELLO_RETRIES; + +retry: + memset(&c, 0, sizeof(c)); + INIT_CMD(c, HELLO, WRITE); + c.err_to_clearinit = cpu_to_be32( + FW_HELLO_CMD_MASTERDIS_V(master == MASTER_CANT) | + FW_HELLO_CMD_MASTERFORCE_V(master == MASTER_MUST) | + FW_HELLO_CMD_MBMASTER_V(master == MASTER_MUST ? + mbox : FW_HELLO_CMD_MBMASTER_M) | + FW_HELLO_CMD_MBASYNCNOT_V(evt_mbox) | + FW_HELLO_CMD_STAGE_V(fw_hello_cmd_stage_os) | + FW_HELLO_CMD_CLEARINIT_F); + + /* + * Issue the HELLO command to the firmware. If it's not successful + * but indicates that we got a "busy" or "timeout" condition, retry + * the HELLO until we exhaust our retry limit. If we do exceed our + * retry limit, check to see if the firmware left us any error + * information and report that if so. + */ + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + if (ret < 0) { + if ((ret == -EBUSY || ret == -ETIMEDOUT) && retries-- > 0) + goto retry; + if (t4_read_reg(adap, PCIE_FW_A) & PCIE_FW_ERR_F) + t4_report_fw_error(adap); + return ret; + } + + v = be32_to_cpu(c.err_to_clearinit); + master_mbox = FW_HELLO_CMD_MBMASTER_G(v); + if (state) { + if (v & FW_HELLO_CMD_ERR_F) + *state = DEV_STATE_ERR; + else if (v & FW_HELLO_CMD_INIT_F) + *state = DEV_STATE_INIT; + else + *state = DEV_STATE_UNINIT; + } + + /* + * If we're not the Master PF then we need to wait around for the + * Master PF Driver to finish setting up the adapter. + * + * Note that we also do this wait if we're a non-Master-capable PF and + * there is no current Master PF; a Master PF may show up momentarily + * and we wouldn't want to fail pointlessly. (This can happen when an + * OS loads lots of different drivers rapidly at the same time). In + * this case, the Master PF returned by the firmware will be + * PCIE_FW_MASTER_M so the test below will work ... + */ + if ((v & (FW_HELLO_CMD_ERR_F|FW_HELLO_CMD_INIT_F)) == 0 && + master_mbox != mbox) { + int waiting = FW_CMD_HELLO_TIMEOUT; + + /* + * Wait for the firmware to either indicate an error or + * initialized state. If we see either of these we bail out + * and report the issue to the caller. If we exhaust the + * "hello timeout" and we haven't exhausted our retries, try + * again. Otherwise bail with a timeout error. + */ + for (;;) { + u32 pcie_fw; + + msleep(50); + waiting -= 50; + + /* + * If neither Error nor Initialized are indicated + * by the firmware, keep waiting till we exhaust our + * timeout ... and then retry if we haven't exhausted + * our retries ... + */ + pcie_fw = t4_read_reg(adap, PCIE_FW_A); + if (!(pcie_fw & (PCIE_FW_ERR_F|PCIE_FW_INIT_F))) { + if (waiting <= 0) { + if (retries-- > 0) + goto retry; + + return -ETIMEDOUT; + } + continue; + } + + /* + * We either have an Error or Initialized condition; + * report errors preferentially. + */ + if (state) { + if (pcie_fw & PCIE_FW_ERR_F) + *state = DEV_STATE_ERR; + else if (pcie_fw & PCIE_FW_INIT_F) + *state = DEV_STATE_INIT; + } + + /* + * If we arrived before a Master PF was selected and + * there's now a valid Master PF, grab its identity + * for our caller.
+ */ + if (master_mbox == PCIE_FW_MASTER_M && + (pcie_fw & PCIE_FW_MASTER_VLD_F)) + master_mbox = PCIE_FW_MASTER_G(pcie_fw); + break; + } + } + + return master_mbox; +} + +/** + * t4_fw_bye - end communication with FW + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * + * Issues a command to terminate communication with FW. + */ +int t4_fw_bye(struct adapter *adap, unsigned int mbox) +{ + struct fw_bye_cmd c; + + memset(&c, 0, sizeof(c)); + INIT_CMD(c, BYE, WRITE); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_init_cmd - ask FW to initialize the device + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * + * Issues a command to FW to partially initialize the device. This + * performs initialization that generally doesn't depend on user input. + */ +int t4_early_init(struct adapter *adap, unsigned int mbox) +{ + struct fw_initialize_cmd c; + + memset(&c, 0, sizeof(c)); + INIT_CMD(c, INITIALIZE, WRITE); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_fw_reset - issue a reset to FW + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @reset: specifies the type of reset to perform + * + * Issues a reset command of the specified type to FW. + */ +int t4_fw_reset(struct adapter *adap, unsigned int mbox, int reset) +{ + struct fw_reset_cmd c; + + memset(&c, 0, sizeof(c)); + INIT_CMD(c, RESET, WRITE); + c.val = cpu_to_be32(reset); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_fw_halt - issue a reset/halt to FW and put uP into RESET + * @adap: the adapter + * @mbox: mailbox to use for the FW RESET command (if desired) + * @force: force uP into RESET even if FW RESET command fails + * + * Issues a RESET command to firmware (if desired) with a HALT indication + * and then puts the microprocessor into RESET state. The RESET command + * will only be issued if a legitimate mailbox is provided (mbox <= + * PCIE_FW_MASTER_M). + * + * This is generally used in order for the host to safely manipulate the + * adapter without fear of conflicting with whatever the firmware might + * be doing. The only way out of this state is to RESTART the firmware + * ... + */ +static int t4_fw_halt(struct adapter *adap, unsigned int mbox, int force) +{ + int ret = 0; + + /* + * If a legitimate mailbox is provided, issue a RESET command + * with a HALT indication. + */ + if (mbox <= PCIE_FW_MASTER_M) { + struct fw_reset_cmd c; + + memset(&c, 0, sizeof(c)); + INIT_CMD(c, RESET, WRITE); + c.val = cpu_to_be32(PIORST_F | PIORSTMODE_F); + c.halt_pkd = cpu_to_be32(FW_RESET_CMD_HALT_F); + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); + } + + /* + * Normally we won't complete the operation if the firmware RESET + * command fails but if our caller insists we'll go ahead and put the + * uP into RESET. This can be useful if the firmware is hung or even + * missing ... We'll have to take the risk of putting the uP into + * RESET without the cooperation of firmware in that case. + * + * We also force the firmware's HALT flag to be on in case we bypassed + * the firmware RESET command above or we're dealing with old firmware + * which doesn't have the HALT capability. This will serve as a flag + * for the incoming firmware to know that it's coming out of a HALT + * rather than a RESET ... if it's new enough to understand that ... 
+ */ + if (ret == 0 || force) { + t4_set_reg_field(adap, CIM_BOOT_CFG_A, UPCRST_F, UPCRST_F); + t4_set_reg_field(adap, PCIE_FW_A, PCIE_FW_HALT_F, + PCIE_FW_HALT_F); + } + + /* + * And we always return the result of the firmware RESET command + * even when we force the uP into RESET ... + */ + return ret; +} + +/** + * t4_fw_restart - restart the firmware by taking the uP out of RESET + * @adap: the adapter + * @reset: if we want to do a RESET to restart things + * + * Restart firmware previously halted by t4_fw_halt(). On successful + * return the previous PF Master remains as the new PF Master and there + * is no need to issue a new HELLO command, etc. + * + * We do this in two ways: + * + * 1. If we're dealing with newer firmware we'll simply want to take + * the chip's microprocessor out of RESET. This will cause the + * firmware to start up from its start vector. And then we'll loop + * until the firmware indicates it's started again (PCIE_FW.HALT + * reset to 0) or we timeout. + * + * 2. If we're dealing with older firmware then we'll need to RESET + * the chip since older firmware won't recognize the PCIE_FW.HALT + * flag and automatically RESET itself on startup. + */ +static int t4_fw_restart(struct adapter *adap, unsigned int mbox, int reset) +{ + if (reset) { + /* + * Since we're directing the RESET instead of the firmware + * doing it automatically, we need to clear the PCIE_FW.HALT + * bit. + */ + t4_set_reg_field(adap, PCIE_FW_A, PCIE_FW_HALT_F, 0); + + /* + * If we've been given a valid mailbox, first try to get the + * firmware to do the RESET. If that works, great and we can + * return success. Otherwise, if we haven't been given a + * valid mailbox or the RESET command failed, fall back to + * hitting the chip with a hammer. + */ + if (mbox <= PCIE_FW_MASTER_M) { + t4_set_reg_field(adap, CIM_BOOT_CFG_A, UPCRST_F, 0); + msleep(100); + if (t4_fw_reset(adap, mbox, + PIORST_F | PIORSTMODE_F) == 0) + return 0; + } + + t4_write_reg(adap, PL_RST_A, PIORST_F | PIORSTMODE_F); + msleep(2000); + } else { + int ms; + + t4_set_reg_field(adap, CIM_BOOT_CFG_A, UPCRST_F, 0); + for (ms = 0; ms < FW_CMD_MAX_TIMEOUT; ) { + if (!(t4_read_reg(adap, PCIE_FW_A) & PCIE_FW_HALT_F)) + return 0; + msleep(100); + ms += 100; + } + return -ETIMEDOUT; + } + return 0; +} + +/** + * t4_fw_upgrade - perform all of the steps necessary to upgrade FW + * @adap: the adapter + * @mbox: mailbox to use for the FW RESET command (if desired) + * @fw_data: the firmware image to write + * @size: image size + * @force: force upgrade even if firmware doesn't cooperate + * + * Perform all of the steps necessary for upgrading an adapter's + * firmware image. Normally this requires the cooperation of the + * existing firmware in order to halt all existing activities + * but if an invalid mailbox token is passed in we skip that step + * (though we'll still put the adapter microprocessor into RESET in + * that case). + * + * On successful return the new firmware will have been loaded and + * the adapter will have been fully RESET losing all previous setup + * state. On unsuccessful return the adapter may be completely hosed ... + * positive errno indicates that the adapter is ~probably~ intact, a + * negative errno indicates that things are looking bad ... 
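+ *
+ * Editorial illustration, not part of the original patch: a sketch of a
+ * hypothetical caller flashing an image obtained via request_firmware()
+ * (the "fw" pointer is assumed):
+ *
+ *      ret = t4_fw_upgrade(adap, adap->mbox, fw->data, fw->size, 0);
+ *
+ * with the final "force" argument left at 0 so the upgrade is abandoned,
+ * rather than forced, if the running firmware refuses to halt.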
+ */ +int t4_fw_upgrade(struct adapter *adap, unsigned int mbox, + const u8 *fw_data, unsigned int size, int force) +{ + const struct fw_hdr *fw_hdr = (const struct fw_hdr *)fw_data; + int reset, ret; + + if (!t4_fw_matches_chip(adap, fw_hdr)) + return -EINVAL; + + /* Disable FW_OK flag so that mbox commands with FW_OK flag set + * won't be sent when we are flashing FW. + */ + adap->flags &= ~FW_OK; + + ret = t4_fw_halt(adap, mbox, force); + if (ret < 0 && !force) + goto out; + + ret = t4_load_fw(adap, fw_data, size); + if (ret < 0) + goto out; + + /* + * If there was a Firmware Configuration File stored in FLASH, + * there's a good chance that it won't be compatible with the new + * Firmware. In order to prevent difficult-to-diagnose adapter + * initialization issues, we clear out the Firmware Configuration File + * portion of the FLASH. The user will need to re-FLASH a new + * Firmware Configuration File which is compatible with the new + * Firmware if that's desired. + */ + (void)t4_load_cfg(adap, NULL, 0); + + /* + * Older versions of the firmware don't understand the new + * PCIE_FW.HALT flag and so won't know to perform a RESET when they + * restart. So for newly loaded older firmware we'll have to do the + * RESET for it so it starts up on a clean slate. We can tell if + * the newly loaded firmware will handle this right by checking + * its header flags to see if it advertises the capability. + */ + reset = ((be32_to_cpu(fw_hdr->flags) & FW_HDR_FLAGS_RESET_HALT) == 0); + ret = t4_fw_restart(adap, mbox, reset); + + /* Grab potentially new Firmware Device Log parameters so we can see + * how healthy the new Firmware is. It's okay to contact the new + * Firmware for these parameters even though, as far as it's + * concerned, we've never said "HELLO" to it ... + */ + (void)t4_init_devlog_params(adap); +out: + adap->flags |= FW_OK; + return ret; +} + +/** + * t4_fl_pkt_align - return the fl packet alignment + * @adap: the adapter + * + * T4 has a single field to specify the packing and padding boundary. + * T5 onwards has separate fields for this and hence the alignment for + * the next packet offset is the maximum of these two. + * + */ +int t4_fl_pkt_align(struct adapter *adap) +{ + u32 sge_control, sge_control2; + unsigned int ingpadboundary, ingpackboundary, fl_align, ingpad_shift; + + sge_control = t4_read_reg(adap, SGE_CONTROL_A); + + /* T4 uses a single control field to specify both the PCIe Padding and + * Packing Boundary. T5 introduced the ability to specify these + * separately. The actual Ingress Packet Data alignment boundary + * within Packed Buffer Mode is the maximum of these two + * specifications. (Note that it makes no real practical sense to + * have the Padding Boundary be larger than the Packing Boundary but you + * could set the chip up that way and, in fact, legacy T4 code would + * end up doing this because it would initialize the Padding Boundary and + * leave the Packing Boundary initialized to 0 (16 bytes).) + * Padding Boundary values in T6 start at 8B, + * whereas it is 32B for T4 and T5. + */ + if (CHELSIO_CHIP_VERSION(adap->params.chip) <= CHELSIO_T5) + ingpad_shift = INGPADBOUNDARY_SHIFT_X; + else + ingpad_shift = T6_INGPADBOUNDARY_SHIFT_X; + + ingpadboundary = 1 << (INGPADBOUNDARY_G(sge_control) + ingpad_shift); + + fl_align = ingpadboundary; + if (!is_t4(adap->params.chip)) { + /* T5 has a weird interpretation of one of the PCIe Packing + * Boundary values. No idea why ...
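+ *
+ * Editorial note, not part of the original patch: as an illustrative
+ * example, a T5 programmed with a 32-byte Padding Boundary and the
+ * Packing Boundary field left at 0 gives ingpadboundary = 32 and
+ * ingpackboundary = 16, so this function returns max(32, 16) = 32.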
+ */ + sge_control2 = t4_read_reg(adap, SGE_CONTROL2_A); + ingpackboundary = INGPACKBOUNDARY_G(sge_control2); + if (ingpackboundary == INGPACKBOUNDARY_16B_X) + ingpackboundary = 16; + else + ingpackboundary = 1 << (ingpackboundary + + INGPACKBOUNDARY_SHIFT_X); + + fl_align = max(ingpadboundary, ingpackboundary); + } + return fl_align; +} + +/** + * t4_fixup_host_params - fix up host-dependent parameters + * @adap: the adapter + * @page_size: the host's Base Page Size + * @cache_line_size: the host's Cache Line Size + * + * Various registers in T4 contain values which are dependent on the + * host's Base Page and Cache Line Sizes. This function will fix all of + * those registers with the appropriate values as passed in ... + */ +int t4_fixup_host_params(struct adapter *adap, unsigned int page_size, + unsigned int cache_line_size) +{ + unsigned int page_shift = fls(page_size) - 1; + unsigned int sge_hps = page_shift - 10; + unsigned int stat_len = cache_line_size > 64 ? 128 : 64; + unsigned int fl_align = cache_line_size < 32 ? 32 : cache_line_size; + unsigned int fl_align_log = fls(fl_align) - 1; + + t4_write_reg(adap, SGE_HOST_PAGE_SIZE_A, + HOSTPAGESIZEPF0_V(sge_hps) | + HOSTPAGESIZEPF1_V(sge_hps) | + HOSTPAGESIZEPF2_V(sge_hps) | + HOSTPAGESIZEPF3_V(sge_hps) | + HOSTPAGESIZEPF4_V(sge_hps) | + HOSTPAGESIZEPF5_V(sge_hps) | + HOSTPAGESIZEPF6_V(sge_hps) | + HOSTPAGESIZEPF7_V(sge_hps)); + + if (is_t4(adap->params.chip)) { + t4_set_reg_field(adap, SGE_CONTROL_A, + INGPADBOUNDARY_V(INGPADBOUNDARY_M) | + EGRSTATUSPAGESIZE_F, + INGPADBOUNDARY_V(fl_align_log - + INGPADBOUNDARY_SHIFT_X) | + EGRSTATUSPAGESIZE_V(stat_len != 64)); + } else { + unsigned int pack_align; + unsigned int ingpad, ingpack; + unsigned int pcie_cap; + + /* T5 introduced the separation of the Free List Padding and + * Packing Boundaries. Thus, we can select a smaller Padding + * Boundary to avoid uselessly chewing up PCIe Link and Memory + * Bandwidth, and use a Packing Boundary which is large enough + * to avoid false sharing between CPUs, etc. + * + * For the PCI Link, the smaller the Padding Boundary the + * better. For the Memory Controller, a smaller Padding + * Boundary is better until we cross under the Memory Line + * Size (the minimum unit of transfer to/from Memory). If we + * have a Padding Boundary which is smaller than the Memory + * Line Size, that'll involve a Read-Modify-Write cycle on the + * Memory Controller which is never good. + */ + + /* We want the Packing Boundary to be based on the Cache Line + * Size in order to help avoid False Sharing performance + * issues between CPUs, etc. We also want the Packing + * Boundary to incorporate the PCI-E Maximum Payload Size. We + * get best performance when the Packing Boundary is a + * multiple of the Maximum Payload Size. + */ + pack_align = fl_align; + pcie_cap = pci_find_capability(adap->pdev, PCI_CAP_ID_EXP); + if (pcie_cap) { + unsigned int mps, mps_log; + u16 devctl; + + /* The PCIe Device Control Maximum Payload Size field + * [bits 7:5] encodes sizes as powers of 2 starting at + * 128 bytes. + */ + pci_read_config_word(adap->pdev, + pcie_cap + PCI_EXP_DEVCTL, + &devctl); + mps_log = ((devctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5) + 7; + mps = 1 << mps_log; + if (mps > pack_align) + pack_align = mps; + } + + /* N.B. T5/T6 have a crazy special interpretation of the "0" + * value for the Packing Boundary. This corresponds to 16 + * bytes instead of the expected 32 bytes. So if we want 32 + * bytes, the best we can really do is 64 bytes ... 
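+ *
+ * Editorial note, not part of the original patch: the mapping implemented
+ * just below therefore is (pack_align -> register encoding, resulting
+ * fl_align):
+ *
+ *      <= 16 -> INGPACKBOUNDARY_16B_X, fl_align = 16
+ *         32 -> INGPACKBOUNDARY_64B_X, fl_align = 64
+ *       > 32 -> log2(pack_align) - INGPACKBOUNDARY_SHIFT_X,
+ *               fl_align = pack_align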
+ */ + if (pack_align <= 16) { + ingpack = INGPACKBOUNDARY_16B_X; + fl_align = 16; + } else if (pack_align == 32) { + ingpack = INGPACKBOUNDARY_64B_X; + fl_align = 64; + } else { + unsigned int pack_align_log = fls(pack_align) - 1; + + ingpack = pack_align_log - INGPACKBOUNDARY_SHIFT_X; + fl_align = pack_align; + } + + /* Use the smallest Ingress Padding which isn't smaller than + * the Memory Controller Read/Write Size. We'll take that as + * being 8 bytes since we don't know of any system with a + * wider Memory Controller Bus Width. + */ + if (is_t5(adap->params.chip)) + ingpad = INGPADBOUNDARY_32B_X; + else + ingpad = T6_INGPADBOUNDARY_8B_X; + + t4_set_reg_field(adap, SGE_CONTROL_A, + INGPADBOUNDARY_V(INGPADBOUNDARY_M) | + EGRSTATUSPAGESIZE_F, + INGPADBOUNDARY_V(ingpad) | + EGRSTATUSPAGESIZE_V(stat_len != 64)); + t4_set_reg_field(adap, SGE_CONTROL2_A, + INGPACKBOUNDARY_V(INGPACKBOUNDARY_M), + INGPACKBOUNDARY_V(ingpack)); + } + /* + * Adjust various SGE Free List Host Buffer Sizes. + * + * This is something of a crock since we're using fixed indices into + * the array which are also known by the sge.c code and the T4 + * Firmware Configuration File. We need to come up with a much better + * approach to managing this array. For now, the first four entries + * are: + * + * 0: Host Page Size + * 1: 64KB + * 2: Buffer size corresponding to 1500 byte MTU (unpacked mode) + * 3: Buffer size corresponding to 9000 byte MTU (unpacked mode) + * + * For the single-MTU buffers in unpacked mode we need to include + * space for the SGE Control Packet Shift, 14 byte Ethernet header, + * possible 4 byte VLAN tag, all rounded up to the next Ingress Packet + * Padding boundary. All of these are accommodated in the Factory + * Default Firmware Configuration File but we need to adjust it for + * this host's cache line size. + */ + t4_write_reg(adap, SGE_FL_BUFFER_SIZE0_A, page_size); + t4_write_reg(adap, SGE_FL_BUFFER_SIZE2_A, + (t4_read_reg(adap, SGE_FL_BUFFER_SIZE2_A) + fl_align-1) + & ~(fl_align-1)); + t4_write_reg(adap, SGE_FL_BUFFER_SIZE3_A, + (t4_read_reg(adap, SGE_FL_BUFFER_SIZE3_A) + fl_align-1) + & ~(fl_align-1)); + + t4_write_reg(adap, ULP_RX_TDDP_PSZ_A, HPZ0_V(page_shift - 12)); + + return 0; +} + +/** + * t4_fw_initialize - ask FW to initialize the device + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * + * Issues a command to FW to partially initialize the device. This + * performs initialization that generally doesn't depend on user input. + */ +int t4_fw_initialize(struct adapter *adap, unsigned int mbox) +{ + struct fw_initialize_cmd c; + + memset(&c, 0, sizeof(c)); + INIT_CMD(c, INITIALIZE, WRITE); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_query_params_rw - query FW or device parameters + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF + * @vf: the VF + * @nparams: the number of parameters + * @params: the parameter names + * @val: the parameter values + * @rw: Write and read flag + * @sleep_ok: if true, we may sleep awaiting mbox cmd completion + * + * Reads the value of FW or device parameters. Up to 7 parameters can be + * queried at once. 
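+ *
+ * Editorial illustration, not part of the original patch: a sketch of a
+ * single-parameter query, mirroring the firmware-assisted MPS Buffer Group
+ * Map lookup earlier in this file ("adap" is assumed):
+ *
+ *      u32 param, val;
+ *      int ret;
+ *
+ *      param = FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
+ *              FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_MPSBGMAP);
+ *      ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 1, &param, &val);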
+ */ +int t4_query_params_rw(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + u32 *val, int rw, bool sleep_ok) +{ + int i, ret; + struct fw_params_cmd c; + __be32 *p = &c.param[0].mnem; + + if (nparams > 7) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_PARAMS_CMD) | + FW_CMD_REQUEST_F | FW_CMD_READ_F | + FW_PARAMS_CMD_PFN_V(pf) | + FW_PARAMS_CMD_VFN_V(vf)); + c.retval_len16 = cpu_to_be32(FW_LEN16(c)); + + for (i = 0; i < nparams; i++) { + *p++ = cpu_to_be32(*params++); + if (rw) + *p = cpu_to_be32(*(val + i)); + p++; + } + + ret = t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), &c, sleep_ok); + if (ret == 0) + for (i = 0, p = &c.param[0].val; i < nparams; i++, p += 2) + *val++ = be32_to_cpu(*p); + return ret; +} + +int t4_query_params(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + u32 *val) +{ + return t4_query_params_rw(adap, mbox, pf, vf, nparams, params, val, 0, + true); +} + +int t4_query_params_ns(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + u32 *val) +{ + return t4_query_params_rw(adap, mbox, pf, vf, nparams, params, val, 0, + false); +} + +/** + * t4_set_params_timeout - sets FW or device parameters + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF + * @vf: the VF + * @nparams: the number of parameters + * @params: the parameter names + * @val: the parameter values + * @timeout: the timeout time + * + * Sets the value of FW or device parameters. Up to 7 parameters can be + * specified at once. + */ +int t4_set_params_timeout(struct adapter *adap, unsigned int mbox, + unsigned int pf, unsigned int vf, + unsigned int nparams, const u32 *params, + const u32 *val, int timeout) +{ + struct fw_params_cmd c; + __be32 *p = &c.param[0].mnem; + + if (nparams > 7) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_PARAMS_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F | + FW_PARAMS_CMD_PFN_V(pf) | + FW_PARAMS_CMD_VFN_V(vf)); + c.retval_len16 = cpu_to_be32(FW_LEN16(c)); + + while (nparams--) { + *p++ = cpu_to_be32(*params++); + *p++ = cpu_to_be32(*val++); + } + + return t4_wr_mbox_timeout(adap, mbox, &c, sizeof(c), NULL, timeout); +} + +/** + * t4_set_params - sets FW or device parameters + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF + * @vf: the VF + * @nparams: the number of parameters + * @params: the parameter names + * @val: the parameter values + * + * Sets the value of FW or device parameters. Up to 7 parameters can be + * specified at once. 
+ */ +int t4_set_params(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int nparams, const u32 *params, + const u32 *val) +{ + return t4_set_params_timeout(adap, mbox, pf, vf, nparams, params, val, + FW_CMD_MAX_TIMEOUT); +} + +/** + * t4_cfg_pfvf - configure PF/VF resource limits + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF being configured + * @vf: the VF being configured + * @txq: the max number of egress queues + * @txq_eth_ctrl: the max number of egress Ethernet or control queues + * @rxqi: the max number of interrupt-capable ingress queues + * @rxq: the max number of interruptless ingress queues + * @tc: the PCI traffic class + * @vi: the max number of virtual interfaces + * @cmask: the channel access rights mask for the PF/VF + * @pmask: the port access rights mask for the PF/VF + * @nexact: the maximum number of exact MPS filters + * @rcaps: read capabilities + * @wxcaps: write/execute capabilities + * + * Configures resource limits and capabilities for a physical or virtual + * function. + */ +int t4_cfg_pfvf(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int txq, unsigned int txq_eth_ctrl, + unsigned int rxqi, unsigned int rxq, unsigned int tc, + unsigned int vi, unsigned int cmask, unsigned int pmask, + unsigned int nexact, unsigned int rcaps, unsigned int wxcaps) +{ + struct fw_pfvf_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_PFVF_CMD) | FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | FW_PFVF_CMD_PFN_V(pf) | + FW_PFVF_CMD_VFN_V(vf)); + c.retval_len16 = cpu_to_be32(FW_LEN16(c)); + c.niqflint_niq = cpu_to_be32(FW_PFVF_CMD_NIQFLINT_V(rxqi) | + FW_PFVF_CMD_NIQ_V(rxq)); + c.type_to_neq = cpu_to_be32(FW_PFVF_CMD_CMASK_V(cmask) | + FW_PFVF_CMD_PMASK_V(pmask) | + FW_PFVF_CMD_NEQ_V(txq)); + c.tc_to_nexactf = cpu_to_be32(FW_PFVF_CMD_TC_V(tc) | + FW_PFVF_CMD_NVI_V(vi) | + FW_PFVF_CMD_NEXACTF_V(nexact)); + c.r_caps_to_nethctrl = cpu_to_be32(FW_PFVF_CMD_R_CAPS_V(rcaps) | + FW_PFVF_CMD_WX_CAPS_V(wxcaps) | + FW_PFVF_CMD_NETHCTRL_V(txq_eth_ctrl)); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_alloc_vi - allocate a virtual interface + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @port: physical port associated with the VI + * @pf: the PF owning the VI + * @vf: the VF owning the VI + * @nmac: number of MAC addresses needed (1 to 5) + * @mac: the MAC addresses of the VI + * @rss_size: size of RSS table slice associated with this VI + * + * Allocates a virtual interface for the given physical port. If @mac is + * not %NULL it contains the MAC addresses of the VI as assigned by FW. + * @mac should be large enough to hold @nmac Ethernet addresses, they are + * stored consecutively so the space needed is @nmac * 6 bytes. + * Returns a negative error number or the non-negative VI id. 
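+ *
+ * Editorial illustration, not part of the original patch: a sketch of a
+ * hypothetical caller creating one VI with a single MAC address on
+ * physical port "port" ("adap" and "port" are assumptions):
+ *
+ *      u8 mac[ETH_ALEN];
+ *      unsigned int rss_size;
+ *      int viid;
+ *
+ *      viid = t4_alloc_vi(adap, adap->mbox, port, adap->pf, 0, 1,
+ *                         mac, &rss_size);
+ *      if (viid < 0)
+ *              return viid;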
+ */
+int t4_alloc_vi(struct adapter *adap, unsigned int mbox, unsigned int port,
+		unsigned int pf, unsigned int vf, unsigned int nmac, u8 *mac,
+		unsigned int *rss_size)
+{
+	int ret;
+	struct fw_vi_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_VI_CMD) | FW_CMD_REQUEST_F |
+				  FW_CMD_WRITE_F | FW_CMD_EXEC_F |
+				  FW_VI_CMD_PFN_V(pf) | FW_VI_CMD_VFN_V(vf));
+	c.alloc_to_len16 = cpu_to_be32(FW_VI_CMD_ALLOC_F | FW_LEN16(c));
+	c.portid_pkd = FW_VI_CMD_PORTID_V(port);
+	c.nmac = nmac - 1;
+
+	ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c);
+	if (ret)
+		return ret;
+
+	if (mac) {
+		memcpy(mac, c.mac, sizeof(c.mac));
+		switch (nmac) {
+		case 5:
+			memcpy(mac + 24, c.nmac3, sizeof(c.nmac3));
+		case 4:
+			memcpy(mac + 18, c.nmac2, sizeof(c.nmac2));
+		case 3:
+			memcpy(mac + 12, c.nmac1, sizeof(c.nmac1));
+		case 2:
+			memcpy(mac + 6, c.nmac0, sizeof(c.nmac0));
+		}
+	}
+	if (rss_size)
+		*rss_size = FW_VI_CMD_RSSSIZE_G(be16_to_cpu(c.rsssize_pkd));
+	return FW_VI_CMD_VIID_G(be16_to_cpu(c.type_viid));
+}
+
+/**
+ * t4_free_vi - free a virtual interface
+ * @adap: the adapter
+ * @mbox: mailbox to use for the FW command
+ * @pf: the PF owning the VI
+ * @vf: the VF owning the VI
+ * @viid: virtual interface identifier
+ *
+ * Free a previously allocated virtual interface.
+ */
+int t4_free_vi(struct adapter *adap, unsigned int mbox, unsigned int pf,
+	       unsigned int vf, unsigned int viid)
+{
+	struct fw_vi_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_VI_CMD) |
+				  FW_CMD_REQUEST_F |
+				  FW_CMD_EXEC_F |
+				  FW_VI_CMD_PFN_V(pf) |
+				  FW_VI_CMD_VFN_V(vf));
+	c.alloc_to_len16 = cpu_to_be32(FW_VI_CMD_FREE_F | FW_LEN16(c));
+	c.type_viid = cpu_to_be16(FW_VI_CMD_VIID_V(viid));
+
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), &c);
+}
+
+/**
+ * t4_set_rxmode - set Rx properties of a virtual interface
+ * @adap: the adapter
+ * @mbox: mailbox to use for the FW command
+ * @viid: the VI id
+ * @mtu: the new MTU or -1
+ * @promisc: 1 to enable promiscuous mode, 0 to disable it, -1 no change
+ * @all_multi: 1 to enable all-multi mode, 0 to disable it, -1 no change
+ * @bcast: 1 to enable broadcast Rx, 0 to disable it, -1 no change
+ * @vlanex: 1 to enable HW VLAN extraction, 0 to disable it, -1 no change
+ * @sleep_ok: if true we may sleep while awaiting command completion
+ *
+ * Sets Rx properties of a virtual interface.
+ */ +int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid, + int mtu, int promisc, int all_multi, int bcast, int vlanex, + bool sleep_ok) +{ + struct fw_vi_rxmode_cmd c; + + /* convert to FW values */ + if (mtu < 0) + mtu = FW_RXMODE_MTU_NO_CHG; + if (promisc < 0) + promisc = FW_VI_RXMODE_CMD_PROMISCEN_M; + if (all_multi < 0) + all_multi = FW_VI_RXMODE_CMD_ALLMULTIEN_M; + if (bcast < 0) + bcast = FW_VI_RXMODE_CMD_BROADCASTEN_M; + if (vlanex < 0) + vlanex = FW_VI_RXMODE_CMD_VLANEXEN_M; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_RXMODE_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F | + FW_VI_RXMODE_CMD_VIID_V(viid)); + c.retval_len16 = cpu_to_be32(FW_LEN16(c)); + c.mtu_to_vlanexen = + cpu_to_be32(FW_VI_RXMODE_CMD_MTU_V(mtu) | + FW_VI_RXMODE_CMD_PROMISCEN_V(promisc) | + FW_VI_RXMODE_CMD_ALLMULTIEN_V(all_multi) | + FW_VI_RXMODE_CMD_BROADCASTEN_V(bcast) | + FW_VI_RXMODE_CMD_VLANEXEN_V(vlanex)); + return t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), NULL, sleep_ok); +} + +/** + * t4_free_raw_mac_filt - Frees a raw mac entry in mps tcam + * @adap: the adapter + * @viid: the VI id + * @addr: the MAC address + * @mask: the mask + * @idx: index of the entry in mps tcam + * @lookup_type: MAC address for inner (1) or outer (0) header + * @port_id: the port index + * @sleep_ok: call is allowed to sleep + * + * Removes the mac entry at the specified index using raw mac interface. + * + * Returns a negative error number on failure. + */ +int t4_free_raw_mac_filt(struct adapter *adap, unsigned int viid, + const u8 *addr, const u8 *mask, unsigned int idx, + u8 lookup_type, u8 port_id, bool sleep_ok) +{ + struct fw_vi_mac_cmd c; + struct fw_vi_mac_raw *p = &c.u.raw; + u32 val; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_MAC_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F | + FW_CMD_EXEC_V(0) | + FW_VI_MAC_CMD_VIID_V(viid)); + val = FW_CMD_LEN16_V(1) | + FW_VI_MAC_CMD_ENTRY_TYPE_V(FW_VI_MAC_TYPE_RAW); + c.freemacs_to_len16 = cpu_to_be32(FW_VI_MAC_CMD_FREEMACS_V(0) | + FW_CMD_LEN16_V(val)); + + p->raw_idx_pkd = cpu_to_be32(FW_VI_MAC_CMD_RAW_IDX_V(idx) | + FW_VI_MAC_ID_BASED_FREE); + + /* Lookup Type. Outer header: 0, Inner header: 1 */ + p->data0_pkd = cpu_to_be32(DATALKPTYPE_V(lookup_type) | + DATAPORTNUM_V(port_id)); + /* Lookup mask and port mask */ + p->data0m_pkd = cpu_to_be64(DATALKPTYPE_V(DATALKPTYPE_M) | + DATAPORTNUM_V(DATAPORTNUM_M)); + + /* Copy the address and the mask */ + memcpy((u8 *)&p->data1[0] + 2, addr, ETH_ALEN); + memcpy((u8 *)&p->data1m[0] + 2, mask, ETH_ALEN); + + return t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, sleep_ok); +} + +/** + * t4_alloc_raw_mac_filt - Adds a mac entry in mps tcam + * @adap: the adapter + * @viid: the VI id + * @mac: the MAC address + * @mask: the mask + * @idx: index at which to add this entry + * @port_id: the port index + * @lookup_type: MAC address for inner (1) or outer (0) header + * @sleep_ok: call is allowed to sleep + * + * Adds the mac entry at the specified index using raw mac interface. + * + * Returns a negative error number or the allocated index for this mac. 
+ */ +int t4_alloc_raw_mac_filt(struct adapter *adap, unsigned int viid, + const u8 *addr, const u8 *mask, unsigned int idx, + u8 lookup_type, u8 port_id, bool sleep_ok) +{ + int ret = 0; + struct fw_vi_mac_cmd c; + struct fw_vi_mac_raw *p = &c.u.raw; + u32 val; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_MAC_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F | + FW_VI_MAC_CMD_VIID_V(viid)); + val = FW_CMD_LEN16_V(1) | + FW_VI_MAC_CMD_ENTRY_TYPE_V(FW_VI_MAC_TYPE_RAW); + c.freemacs_to_len16 = cpu_to_be32(val); + + /* Specify that this is an inner mac address */ + p->raw_idx_pkd = cpu_to_be32(FW_VI_MAC_CMD_RAW_IDX_V(idx)); + + /* Lookup Type. Outer header: 0, Inner header: 1 */ + p->data0_pkd = cpu_to_be32(DATALKPTYPE_V(lookup_type) | + DATAPORTNUM_V(port_id)); + /* Lookup mask and port mask */ + p->data0m_pkd = cpu_to_be64(DATALKPTYPE_V(DATALKPTYPE_M) | + DATAPORTNUM_V(DATAPORTNUM_M)); + + /* Copy the address and the mask */ + memcpy((u8 *)&p->data1[0] + 2, addr, ETH_ALEN); + memcpy((u8 *)&p->data1m[0] + 2, mask, ETH_ALEN); + + ret = t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, sleep_ok); + if (ret == 0) { + ret = FW_VI_MAC_CMD_RAW_IDX_G(be32_to_cpu(p->raw_idx_pkd)); + if (ret != idx) + ret = -ENOMEM; + } + + return ret; +} + +/** + * t4_alloc_mac_filt - allocates exact-match filters for MAC addresses + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @free: if true any existing filters for this VI id are first removed + * @naddr: the number of MAC addresses to allocate filters for (up to 7) + * @addr: the MAC address(es) + * @idx: where to store the index of each allocated filter + * @hash: pointer to hash address filter bitmap + * @sleep_ok: call is allowed to sleep + * + * Allocates an exact-match filter for each of the supplied addresses and + * sets it to the corresponding address. If @idx is not %NULL it should + * have at least @naddr entries, each of which will be set to the index of + * the filter allocated for the corresponding MAC address. If a filter + * could not be allocated for an address its index is set to 0xffff. + * If @hash is not %NULL addresses that fail to allocate an exact filter + * are hashed and update the hash filter bitmap pointed at by @hash. + * + * Returns a negative error number or the number of filters allocated. + */ +int t4_alloc_mac_filt(struct adapter *adap, unsigned int mbox, + unsigned int viid, bool free, unsigned int naddr, + const u8 **addr, u16 *idx, u64 *hash, bool sleep_ok) +{ + int offset, ret = 0; + struct fw_vi_mac_cmd c; + unsigned int nfilters = 0; + unsigned int max_naddr = adap->params.arch.mps_tcam_size; + unsigned int rem = naddr; + + if (naddr > max_naddr) + return -EINVAL; + + for (offset = 0; offset < naddr ; /**/) { + unsigned int fw_naddr = (rem < ARRAY_SIZE(c.u.exact) ? 
+ rem : ARRAY_SIZE(c.u.exact)); + size_t len16 = DIV_ROUND_UP(offsetof(struct fw_vi_mac_cmd, + u.exact[fw_naddr]), 16); + struct fw_vi_mac_exact *p; + int i; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_MAC_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_CMD_EXEC_V(free) | + FW_VI_MAC_CMD_VIID_V(viid)); + c.freemacs_to_len16 = + cpu_to_be32(FW_VI_MAC_CMD_FREEMACS_V(free) | + FW_CMD_LEN16_V(len16)); + + for (i = 0, p = c.u.exact; i < fw_naddr; i++, p++) { + p->valid_to_idx = + cpu_to_be16(FW_VI_MAC_CMD_VALID_F | + FW_VI_MAC_CMD_IDX_V( + FW_VI_MAC_ADD_MAC)); + memcpy(p->macaddr, addr[offset + i], + sizeof(p->macaddr)); + } + + /* It's okay if we run out of space in our MAC address arena. + * Some of the addresses we submit may get stored so we need + * to run through the reply to see what the results were ... + */ + ret = t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), &c, sleep_ok); + if (ret && ret != -FW_ENOMEM) + break; + + for (i = 0, p = c.u.exact; i < fw_naddr; i++, p++) { + u16 index = FW_VI_MAC_CMD_IDX_G( + be16_to_cpu(p->valid_to_idx)); + + if (idx) + idx[offset + i] = (index >= max_naddr ? + 0xffff : index); + if (index < max_naddr) + nfilters++; + else if (hash) + *hash |= (1ULL << + hash_mac_addr(addr[offset + i])); + } + + free = false; + offset += fw_naddr; + rem -= fw_naddr; + } + + if (ret == 0 || ret == -FW_ENOMEM) + ret = nfilters; + return ret; +} + +/** + * t4_free_mac_filt - frees exact-match filters of given MAC addresses + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @naddr: the number of MAC addresses to allocate filters for (up to 7) + * @addr: the MAC address(es) + * @sleep_ok: call is allowed to sleep + * + * Frees the exact-match filter for each of the supplied addresses + * + * Returns a negative error number or the number of filters freed. + */ +int t4_free_mac_filt(struct adapter *adap, unsigned int mbox, + unsigned int viid, unsigned int naddr, + const u8 **addr, bool sleep_ok) +{ + int offset, ret = 0; + struct fw_vi_mac_cmd c; + unsigned int nfilters = 0; + unsigned int max_naddr = is_t4(adap->params.chip) ? + NUM_MPS_CLS_SRAM_L_INSTANCES : + NUM_MPS_T5_CLS_SRAM_L_INSTANCES; + unsigned int rem = naddr; + + if (naddr > max_naddr) + return -EINVAL; + + for (offset = 0; offset < (int)naddr ; /**/) { + unsigned int fw_naddr = (rem < ARRAY_SIZE(c.u.exact) + ? 
rem + : ARRAY_SIZE(c.u.exact)); + size_t len16 = DIV_ROUND_UP(offsetof(struct fw_vi_mac_cmd, + u.exact[fw_naddr]), 16); + struct fw_vi_mac_exact *p; + int i; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_MAC_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_CMD_EXEC_V(0) | + FW_VI_MAC_CMD_VIID_V(viid)); + c.freemacs_to_len16 = + cpu_to_be32(FW_VI_MAC_CMD_FREEMACS_V(0) | + FW_CMD_LEN16_V(len16)); + + for (i = 0, p = c.u.exact; i < (int)fw_naddr; i++, p++) { + p->valid_to_idx = cpu_to_be16( + FW_VI_MAC_CMD_VALID_F | + FW_VI_MAC_CMD_IDX_V(FW_VI_MAC_MAC_BASED_FREE)); + memcpy(p->macaddr, addr[offset+i], sizeof(p->macaddr)); + } + + ret = t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), &c, sleep_ok); + if (ret) + break; + + for (i = 0, p = c.u.exact; i < fw_naddr; i++, p++) { + u16 index = FW_VI_MAC_CMD_IDX_G( + be16_to_cpu(p->valid_to_idx)); + + if (index < max_naddr) + nfilters++; + } + + offset += fw_naddr; + rem -= fw_naddr; + } + + if (ret == 0) + ret = nfilters; + return ret; +} + +/** + * t4_change_mac - modifies the exact-match filter for a MAC address + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @idx: index of existing filter for old value of MAC address, or -1 + * @addr: the new MAC address value + * @persist: whether a new MAC allocation should be persistent + * @add_smt: if true also add the address to the HW SMT + * + * Modifies an exact-match filter and sets it to the new MAC address. + * Note that in general it is not possible to modify the value of a given + * filter so the generic way to modify an address filter is to free the one + * being used by the old address value and allocate a new filter for the + * new address value. @idx can be -1 if the address is a new addition. + * + * Returns a negative error number or the index of the filter with the new + * MAC value. + */ +int t4_change_mac(struct adapter *adap, unsigned int mbox, unsigned int viid, + int idx, const u8 *addr, bool persist, bool add_smt) +{ + int ret, mode; + struct fw_vi_mac_cmd c; + struct fw_vi_mac_exact *p = c.u.exact; + unsigned int max_mac_addr = adap->params.arch.mps_tcam_size; + + if (idx < 0) /* new allocation */ + idx = persist ? FW_VI_MAC_ADD_PERSIST_MAC : FW_VI_MAC_ADD_MAC; + mode = add_smt ? FW_VI_MAC_SMT_AND_MPSTCAM : FW_VI_MAC_MPS_TCAM_ENTRY; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_MAC_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F | + FW_VI_MAC_CMD_VIID_V(viid)); + c.freemacs_to_len16 = cpu_to_be32(FW_CMD_LEN16_V(1)); + p->valid_to_idx = cpu_to_be16(FW_VI_MAC_CMD_VALID_F | + FW_VI_MAC_CMD_SMAC_RESULT_V(mode) | + FW_VI_MAC_CMD_IDX_V(idx)); + memcpy(p->macaddr, addr, sizeof(p->macaddr)); + + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + if (ret == 0) { + ret = FW_VI_MAC_CMD_IDX_G(be16_to_cpu(p->valid_to_idx)); + if (ret >= max_mac_addr) + ret = -ENOMEM; + } + return ret; +} + +/** + * t4_set_addr_hash - program the MAC inexact-match hash filter + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @ucast: whether the hash filter should also match unicast addresses + * @vec: the value to be written to the hash filter + * @sleep_ok: call is allowed to sleep + * + * Sets the 64-bit inexact-match hash filter for a virtual interface. 
+ */ +int t4_set_addr_hash(struct adapter *adap, unsigned int mbox, unsigned int viid, + bool ucast, u64 vec, bool sleep_ok) +{ + struct fw_vi_mac_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_MAC_CMD) | + FW_CMD_REQUEST_F | FW_CMD_WRITE_F | + FW_VI_ENABLE_CMD_VIID_V(viid)); + c.freemacs_to_len16 = cpu_to_be32(FW_VI_MAC_CMD_HASHVECEN_F | + FW_VI_MAC_CMD_HASHUNIEN_V(ucast) | + FW_CMD_LEN16_V(1)); + c.u.hash.hashvec = cpu_to_be64(vec); + return t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), NULL, sleep_ok); +} + +/** + * t4_enable_vi_params - enable/disable a virtual interface + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @rx_en: 1=enable Rx, 0=disable Rx + * @tx_en: 1=enable Tx, 0=disable Tx + * @dcb_en: 1=enable delivery of Data Center Bridging messages. + * + * Enables/disables a virtual interface. Note that setting DCB Enable + * only makes sense when enabling a Virtual Interface ... + */ +int t4_enable_vi_params(struct adapter *adap, unsigned int mbox, + unsigned int viid, bool rx_en, bool tx_en, bool dcb_en) +{ + struct fw_vi_enable_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_ENABLE_CMD) | + FW_CMD_REQUEST_F | FW_CMD_EXEC_F | + FW_VI_ENABLE_CMD_VIID_V(viid)); + c.ien_to_len16 = cpu_to_be32(FW_VI_ENABLE_CMD_IEN_V(rx_en) | + FW_VI_ENABLE_CMD_EEN_V(tx_en) | + FW_VI_ENABLE_CMD_DCB_INFO_V(dcb_en) | + FW_LEN16(c)); + return t4_wr_mbox_ns(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_enable_vi - enable/disable a virtual interface + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @rx_en: 1=enable Rx, 0=disable Rx + * @tx_en: 1=enable Tx, 0=disable Tx + * + * Enables/disables a virtual interface. + */ +int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid, + bool rx_en, bool tx_en) +{ + return t4_enable_vi_params(adap, mbox, viid, rx_en, tx_en, 0); +} + +/** + * t4_identify_port - identify a VI's port by blinking its LED + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @viid: the VI id + * @nblinks: how many times to blink LED at 2.5 Hz + * + * Identifies a VI's port by blinking its LED. + */ +int t4_identify_port(struct adapter *adap, unsigned int mbox, unsigned int viid, + unsigned int nblinks) +{ + struct fw_vi_enable_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_ENABLE_CMD) | + FW_CMD_REQUEST_F | FW_CMD_EXEC_F | + FW_VI_ENABLE_CMD_VIID_V(viid)); + c.ien_to_len16 = cpu_to_be32(FW_VI_ENABLE_CMD_LED_F | FW_LEN16(c)); + c.blinkdur = cpu_to_be16(nblinks); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_iq_stop - stop an ingress queue and its FLs + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF owning the queues + * @vf: the VF owning the queues + * @iqtype: the ingress queue type (FW_IQ_TYPE_FL_INT_CAP, etc.) + * @iqid: ingress queue id + * @fl0id: FL0 queue id or 0xffff if no attached FL0 + * @fl1id: FL1 queue id or 0xffff if no attached FL1 + * + * Stops an ingress queue and its associated FLs, if any. This causes + * any current or future data/messages destined for these queues to be + * tossed. 
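+ *
+ * Editor's note (illustrative, not part of the original patch): the @vec
+ * bitmap is typically built from the @hash value filled in by
+ * t4_alloc_mac_filt() above, where each address that could not get an
+ * exact-match filter contributes a (1ULL << hash_mac_addr(addr)) bit.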
+ */ +int t4_iq_stop(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int iqtype, unsigned int iqid, + unsigned int fl0id, unsigned int fl1id) +{ + struct fw_iq_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_IQ_CMD) | FW_CMD_REQUEST_F | + FW_CMD_EXEC_F | FW_IQ_CMD_PFN_V(pf) | + FW_IQ_CMD_VFN_V(vf)); + c.alloc_to_len16 = cpu_to_be32(FW_IQ_CMD_IQSTOP_F | FW_LEN16(c)); + c.type_to_iqandstindex = cpu_to_be32(FW_IQ_CMD_TYPE_V(iqtype)); + c.iqid = cpu_to_be16(iqid); + c.fl0id = cpu_to_be16(fl0id); + c.fl1id = cpu_to_be16(fl1id); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_iq_free - free an ingress queue and its FLs + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF owning the queues + * @vf: the VF owning the queues + * @iqtype: the ingress queue type + * @iqid: ingress queue id + * @fl0id: FL0 queue id or 0xffff if no attached FL0 + * @fl1id: FL1 queue id or 0xffff if no attached FL1 + * + * Frees an ingress queue and its associated FLs, if any. + */ +int t4_iq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int iqtype, unsigned int iqid, + unsigned int fl0id, unsigned int fl1id) +{ + struct fw_iq_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_IQ_CMD) | FW_CMD_REQUEST_F | + FW_CMD_EXEC_F | FW_IQ_CMD_PFN_V(pf) | + FW_IQ_CMD_VFN_V(vf)); + c.alloc_to_len16 = cpu_to_be32(FW_IQ_CMD_FREE_F | FW_LEN16(c)); + c.type_to_iqandstindex = cpu_to_be32(FW_IQ_CMD_TYPE_V(iqtype)); + c.iqid = cpu_to_be16(iqid); + c.fl0id = cpu_to_be16(fl0id); + c.fl1id = cpu_to_be16(fl1id); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_eth_eq_free - free an Ethernet egress queue + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF owning the queue + * @vf: the VF owning the queue + * @eqid: egress queue id + * + * Frees an Ethernet egress queue. + */ +int t4_eth_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid) +{ + struct fw_eq_eth_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_EQ_ETH_CMD) | + FW_CMD_REQUEST_F | FW_CMD_EXEC_F | + FW_EQ_ETH_CMD_PFN_V(pf) | + FW_EQ_ETH_CMD_VFN_V(vf)); + c.alloc_to_len16 = cpu_to_be32(FW_EQ_ETH_CMD_FREE_F | FW_LEN16(c)); + c.eqid_pkd = cpu_to_be32(FW_EQ_ETH_CMD_EQID_V(eqid)); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_ctrl_eq_free - free a control egress queue + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF owning the queue + * @vf: the VF owning the queue + * @eqid: egress queue id + * + * Frees a control egress queue. + */ +int t4_ctrl_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid) +{ + struct fw_eq_ctrl_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_EQ_CTRL_CMD) | + FW_CMD_REQUEST_F | FW_CMD_EXEC_F | + FW_EQ_CTRL_CMD_PFN_V(pf) | + FW_EQ_CTRL_CMD_VFN_V(vf)); + c.alloc_to_len16 = cpu_to_be32(FW_EQ_CTRL_CMD_FREE_F | FW_LEN16(c)); + c.cmpliqid_eqid = cpu_to_be32(FW_EQ_CTRL_CMD_EQID_V(eqid)); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_ofld_eq_free - free an offload egress queue + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @pf: the PF owning the queue + * @vf: the VF owning the queue + * @eqid: egress queue id + * + * Frees a control egress queue. 
+ */ +int t4_ofld_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf, + unsigned int vf, unsigned int eqid) +{ + struct fw_eq_ofld_cmd c; + + memset(&c, 0, sizeof(c)); + c.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_EQ_OFLD_CMD) | + FW_CMD_REQUEST_F | FW_CMD_EXEC_F | + FW_EQ_OFLD_CMD_PFN_V(pf) | + FW_EQ_OFLD_CMD_VFN_V(vf)); + c.alloc_to_len16 = cpu_to_be32(FW_EQ_OFLD_CMD_FREE_F | FW_LEN16(c)); + c.eqid_pkd = cpu_to_be32(FW_EQ_OFLD_CMD_EQID_V(eqid)); + return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); +} + +/** + * t4_link_down_rc_str - return a string for a Link Down Reason Code + * @adap: the adapter + * @link_down_rc: Link Down Reason Code + * + * Returns a string representation of the Link Down Reason Code. + */ +static const char *t4_link_down_rc_str(unsigned char link_down_rc) +{ + static const char * const reason[] = { + "Link Down", + "Remote Fault", + "Auto-negotiation Failure", + "Reserved", + "Insufficient Airflow", + "Unable To Determine Reason", + "No RX Signal Detected", + "Reserved", + }; + + if (link_down_rc >= ARRAY_SIZE(reason)) + return "Bad Reason Code"; + + return reason[link_down_rc]; +} + +/** + * Return the highest speed set in the port capabilities, in Mb/s. + */ +static unsigned int fwcap_to_speed(fw_port_cap32_t caps) +{ + #define TEST_SPEED_RETURN(__caps_speed, __speed) \ + do { \ + if (caps & FW_PORT_CAP32_SPEED_##__caps_speed) \ + return __speed; \ + } while (0) + + TEST_SPEED_RETURN(400G, 400000); + TEST_SPEED_RETURN(200G, 200000); + TEST_SPEED_RETURN(100G, 100000); + TEST_SPEED_RETURN(50G, 50000); + TEST_SPEED_RETURN(40G, 40000); + TEST_SPEED_RETURN(25G, 25000); + TEST_SPEED_RETURN(10G, 10000); + TEST_SPEED_RETURN(1G, 1000); + TEST_SPEED_RETURN(100M, 100); + + #undef TEST_SPEED_RETURN + + return 0; +} + +/** + * fwcap_to_fwspeed - return highest speed in Port Capabilities + * @acaps: advertised Port Capabilities + * + * Get the highest speed for the port from the advertised Port + * Capabilities. It will be either the highest speed from the list of + * speeds or whatever user has set using ethtool. + */ +static fw_port_cap32_t fwcap_to_fwspeed(fw_port_cap32_t acaps) +{ + #define TEST_SPEED_RETURN(__caps_speed) \ + do { \ + if (acaps & FW_PORT_CAP32_SPEED_##__caps_speed) \ + return FW_PORT_CAP32_SPEED_##__caps_speed; \ + } while (0) + + TEST_SPEED_RETURN(400G); + TEST_SPEED_RETURN(200G); + TEST_SPEED_RETURN(100G); + TEST_SPEED_RETURN(50G); + TEST_SPEED_RETURN(40G); + TEST_SPEED_RETURN(25G); + TEST_SPEED_RETURN(10G); + TEST_SPEED_RETURN(1G); + TEST_SPEED_RETURN(100M); + + #undef TEST_SPEED_RETURN + + return 0; +} + +/** + * lstatus_to_fwcap - translate old lstatus to 32-bit Port Capabilities + * @lstatus: old FW_PORT_ACTION_GET_PORT_INFO lstatus value + * + * Translates old FW_PORT_ACTION_GET_PORT_INFO lstatus field into new + * 32-bit Port Capabilities value. + */ +static fw_port_cap32_t lstatus_to_fwcap(u32 lstatus) +{ + fw_port_cap32_t linkattr = 0; + + /* Unfortunately the format of the Link Status in the old + * 16-bit Port Information message isn't the same as the + * 16-bit Port Capabilities bitfield used everywhere else ... 
+ */ + if (lstatus & FW_PORT_CMD_RXPAUSE_F) + linkattr |= FW_PORT_CAP32_FC_RX; + if (lstatus & FW_PORT_CMD_TXPAUSE_F) + linkattr |= FW_PORT_CAP32_FC_TX; + if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100M)) + linkattr |= FW_PORT_CAP32_SPEED_100M; + if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_1G)) + linkattr |= FW_PORT_CAP32_SPEED_1G; + if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_10G)) + linkattr |= FW_PORT_CAP32_SPEED_10G; + if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_25G)) + linkattr |= FW_PORT_CAP32_SPEED_25G; + if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_40G)) + linkattr |= FW_PORT_CAP32_SPEED_40G; + if (lstatus & FW_PORT_CMD_LSPEED_V(FW_PORT_CAP_SPEED_100G)) + linkattr |= FW_PORT_CAP32_SPEED_100G; + + return linkattr; +} + +/** + * t4_handle_get_port_info - process a FW reply message + * @pi: the port info + * @rpl: start of the FW message + * + * Processes a GET_PORT_INFO FW reply message. + */ +void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl) +{ + const struct fw_port_cmd *cmd = (const void *)rpl; + int action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16)); + struct adapter *adapter = pi->adapter; + struct link_config *lc = &pi->link_cfg; + int link_ok, linkdnrc; + enum fw_port_type port_type; + enum fw_port_module_type mod_type; + unsigned int speed, fc, fec; + fw_port_cap32_t pcaps, acaps, lpacaps, linkattr; + + /* Extract the various fields from the Port Information message. + */ + switch (action) { + case FW_PORT_ACTION_GET_PORT_INFO: { + u32 lstatus = be32_to_cpu(cmd->u.info.lstatus_to_modtype); + + link_ok = (lstatus & FW_PORT_CMD_LSTATUS_F) != 0; + linkdnrc = FW_PORT_CMD_LINKDNRC_G(lstatus); + port_type = FW_PORT_CMD_PTYPE_G(lstatus); + mod_type = FW_PORT_CMD_MODTYPE_G(lstatus); + pcaps = fwcaps16_to_caps32(be16_to_cpu(cmd->u.info.pcap)); + acaps = fwcaps16_to_caps32(be16_to_cpu(cmd->u.info.acap)); + lpacaps = fwcaps16_to_caps32(be16_to_cpu(cmd->u.info.lpacap)); + linkattr = lstatus_to_fwcap(lstatus); + break; + } + + case FW_PORT_ACTION_GET_PORT_INFO32: { + u32 lstatus32; + + lstatus32 = be32_to_cpu(cmd->u.info32.lstatus32_to_cbllen32); + link_ok = (lstatus32 & FW_PORT_CMD_LSTATUS32_F) != 0; + linkdnrc = FW_PORT_CMD_LINKDNRC32_G(lstatus32); + port_type = FW_PORT_CMD_PORTTYPE32_G(lstatus32); + mod_type = FW_PORT_CMD_MODTYPE32_G(lstatus32); + pcaps = be32_to_cpu(cmd->u.info32.pcaps32); + acaps = be32_to_cpu(cmd->u.info32.acaps32); + lpacaps = be32_to_cpu(cmd->u.info32.lpacaps32); + linkattr = be32_to_cpu(cmd->u.info32.linkattr32); + break; + } + + default: + dev_err(adapter->pdev_dev, "Handle Port Information: Bad Command/Action %#x\n", + be32_to_cpu(cmd->action_to_len16)); + return; + } + + fec = fwcap_to_cc_fec(acaps); + fc = fwcap_to_cc_pause(linkattr); + speed = fwcap_to_speed(linkattr); + + if (mod_type != pi->mod_type) { + /* With the newer SFP28 and QSFP28 Transceiver Module Types, + * various fundamental Port Capabilities which used to be + * immutable can now change radically. We can now have + * Speeds, Auto-Negotiation, Forward Error Correction, etc. + * all change based on what Transceiver Module is inserted. + * So we need to record the Physical "Port" Capabilities on + * every Transceiver Module change. + */ + lc->pcaps = pcaps; + + /* When a new Transceiver Module is inserted, the Firmware + * will examine its i2c EPROM to determine its type and + * general operating parameters including things like Forward + * Error Control, etc. 
+		 * Various IEEE 802.3 standards dictate how to interpret
+		 * these i2c values to determine default "automatic"
+		 * settings.  We record these for future use when the user
+		 * explicitly requests these standards-based values.
+		 */
+		lc->def_acaps = acaps;
+
+		/* Some versions of the early T6 Firmware "cheated" when
+		 * handling different Transceiver Modules by changing the
+		 * underlying Port Type reported to the Host Drivers. As
+		 * such we need to capture whatever Port Type the Firmware
+		 * sends us and record it in case it's different from what we
+		 * were told earlier. Unfortunately, since Firmware is
+		 * forever, we'll need to keep this code here forever, but in
+		 * later T6 Firmware it should just be an assignment of the
+		 * same value already recorded.
+		 */
+		pi->port_type = port_type;
+
+		pi->mod_type = mod_type;
+		t4_os_portmod_changed(adapter, pi->port_id);
+	}
+
+	if (link_ok != lc->link_ok || speed != lc->speed ||
+	    fc != lc->fc || fec != lc->fec) {	/* something changed */
+		if (!link_ok && lc->link_ok) {
+			lc->link_down_rc = linkdnrc;
+			dev_warn(adapter->pdev_dev, "Port %d link down, reason: %s\n",
+				 pi->tx_chan, t4_link_down_rc_str(linkdnrc));
+		}
+		lc->link_ok = link_ok;
+		lc->speed = speed;
+		lc->fc = fc;
+		lc->fec = fec;
+
+		lc->lpacaps = lpacaps;
+		lc->acaps = acaps & ADVERT_MASK;
+
+		if (lc->acaps & FW_PORT_CAP32_ANEG) {
+			lc->autoneg = AUTONEG_ENABLE;
+		} else {
+			/* When Autoneg is disabled, user needs to set
+			 * single speed.
+			 * Similar to cxgb4_ethtool.c: set_link_ksettings
+			 */
+			lc->acaps = 0;
+			lc->speed_caps = fwcap_to_fwspeed(acaps);
+			lc->autoneg = AUTONEG_DISABLE;
+		}
+
+		t4_os_link_changed(adapter, pi->port_id, link_ok);
+	}
+}
+
+/**
+ * t4_update_port_info - retrieve and update port information if changed
+ * @pi: the port_info
+ *
+ * We issue a Get Port Information Command to the Firmware and, if
+ * successful, we check to see if anything is different from what we
+ * last recorded and update things accordingly.
+ */
+int t4_update_port_info(struct port_info *pi)
+{
+	unsigned int fw_caps = pi->adapter->params.fw_caps_support;
+	struct fw_port_cmd port_cmd;
+	int ret;
+
+	memset(&port_cmd, 0, sizeof(port_cmd));
+	port_cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
+					    FW_CMD_REQUEST_F | FW_CMD_READ_F |
+					    FW_PORT_CMD_PORTID_V(pi->tx_chan));
+	port_cmd.action_to_len16 = cpu_to_be32(
+		FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16
+				     ? FW_PORT_ACTION_GET_PORT_INFO
+				     : FW_PORT_ACTION_GET_PORT_INFO32) |
+		FW_LEN16(port_cmd));
+	ret = t4_wr_mbox(pi->adapter, pi->adapter->mbox,
+			 &port_cmd, sizeof(port_cmd), &port_cmd);
+	if (ret)
+		return ret;
+
+	t4_handle_get_port_info(pi, (__be64 *)&port_cmd);
+	return 0;
+}
+
+/**
+ * t4_get_link_params - retrieve basic link parameters for given port
+ * @pi: the port
+ * @link_okp: value return pointer for link up/down
+ * @speedp: value return pointer for speed (Mb/s)
+ * @mtup: value return pointer for mtu
+ *
+ * Retrieves basic link parameters for a port: link up/down, speed (Mb/s),
+ * and MTU for a specified port. A negative error is returned on
+ * failure; 0 on success.
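+ *
+ * Editor's illustrative sketch (not part of the original patch; assumes a
+ * valid port_info *pi):
+ *
+ *	unsigned int link_ok, speed, mtu;
+ *
+ *	if (!t4_get_link_params(pi, &link_ok, &speed, &mtu) && link_ok)
+ *		pr_info("port %u up at %u Mb/s, MTU %u\n",
+ *			pi->port_id, speed, mtu);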
+ */ +int t4_get_link_params(struct port_info *pi, unsigned int *link_okp, + unsigned int *speedp, unsigned int *mtup) +{ + unsigned int fw_caps = pi->adapter->params.fw_caps_support; + struct fw_port_cmd port_cmd; + unsigned int action, link_ok, speed, mtu; + fw_port_cap32_t linkattr; + int ret; + + memset(&port_cmd, 0, sizeof(port_cmd)); + port_cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) | + FW_CMD_REQUEST_F | FW_CMD_READ_F | + FW_PORT_CMD_PORTID_V(pi->tx_chan)); + action = (fw_caps == FW_CAPS16 + ? FW_PORT_ACTION_GET_PORT_INFO + : FW_PORT_ACTION_GET_PORT_INFO32); + port_cmd.action_to_len16 = cpu_to_be32( + FW_PORT_CMD_ACTION_V(action) | + FW_LEN16(port_cmd)); + ret = t4_wr_mbox(pi->adapter, pi->adapter->mbox, + &port_cmd, sizeof(port_cmd), &port_cmd); + if (ret) + return ret; + + if (action == FW_PORT_ACTION_GET_PORT_INFO) { + u32 lstatus = be32_to_cpu(port_cmd.u.info.lstatus_to_modtype); + + link_ok = !!(lstatus & FW_PORT_CMD_LSTATUS_F); + linkattr = lstatus_to_fwcap(lstatus); + mtu = be16_to_cpu(port_cmd.u.info.mtu); + } else { + u32 lstatus32 = + be32_to_cpu(port_cmd.u.info32.lstatus32_to_cbllen32); + + link_ok = !!(lstatus32 & FW_PORT_CMD_LSTATUS32_F); + linkattr = be32_to_cpu(port_cmd.u.info32.linkattr32); + mtu = FW_PORT_CMD_MTU32_G( + be32_to_cpu(port_cmd.u.info32.auxlinfo32_mtu32)); + } + speed = fwcap_to_speed(linkattr); + + *link_okp = link_ok; + *speedp = fwcap_to_speed(linkattr); + *mtup = mtu; + + return 0; +} + +/** + * t4_handle_fw_rpl - process a FW reply message + * @adap: the adapter + * @rpl: start of the FW message + * + * Processes a FW message, such as link state change messages. + */ +int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl) +{ + u8 opcode = *(const u8 *)rpl; + + /* This might be a port command ... this simplifies the following + * conditionals ... We can get away with pre-dereferencing + * action_to_len16 because it's in the first 16 bytes and all messages + * will be at least that long. + */ + const struct fw_port_cmd *p = (const void *)rpl; + unsigned int action = + FW_PORT_CMD_ACTION_G(be32_to_cpu(p->action_to_len16)); + + if (opcode == FW_PORT_CMD && + (action == FW_PORT_ACTION_GET_PORT_INFO || + action == FW_PORT_ACTION_GET_PORT_INFO32)) { + int i; + int chan = FW_PORT_CMD_PORTID_G(be32_to_cpu(p->op_to_portid)); + struct port_info *pi = NULL; + + for_each_port(adap, i) { + pi = adap2pinfo(adap, i); + if (pi->tx_chan == chan) + break; + } + + t4_handle_get_port_info(pi, rpl); + } else { + dev_warn(adap->pdev_dev, "Unknown firmware reply %d\n", + opcode); + return -EINVAL; + } + return 0; +} + +static void get_pci_mode(struct adapter *adapter, struct pci_params *p) +{ + u16 val; + + if (pci_is_pcie(adapter->pdev)) { + pcie_capability_read_word(adapter->pdev, PCI_EXP_LNKSTA, &val); + p->speed = val & PCI_EXP_LNKSTA_CLS; + p->width = (val & PCI_EXP_LNKSTA_NLW) >> 4; + } +} + +/** + * init_link_config - initialize a link's SW state + * @lc: pointer to structure holding the link state + * @pcaps: link Port Capabilities + * @acaps: link current Advertised Port Capabilities + * + * Initializes the SW state maintained for each link, including the link's + * capabilities and default speed/flow-control/autonegotiation settings. 
+ */
+static void init_link_config(struct link_config *lc, fw_port_cap32_t pcaps,
+			     fw_port_cap32_t acaps)
+{
+	lc->pcaps = pcaps;
+	lc->def_acaps = acaps;
+	lc->lpacaps = 0;
+	lc->speed_caps = 0;
+	lc->speed = 0;
+	lc->requested_fc = lc->fc = PAUSE_RX | PAUSE_TX;
+
+	/* For Forward Error Control, we default to whatever the Firmware
+	 * tells us the Link is currently advertising.
+	 */
+	lc->requested_fec = FEC_AUTO;
+	lc->fec = fwcap_to_cc_fec(lc->def_acaps);
+
+	if (lc->pcaps & FW_PORT_CAP32_ANEG) {
+		lc->acaps = lc->pcaps & ADVERT_MASK;
+		lc->autoneg = AUTONEG_ENABLE;
+		lc->requested_fc |= PAUSE_AUTONEG;
+	} else {
+		lc->acaps = 0;
+		lc->autoneg = AUTONEG_DISABLE;
+	}
+}
+
+#define CIM_PF_NOACCESS 0xeeeeeeee
+
+int t4_wait_dev_ready(void __iomem *regs)
+{
+	u32 whoami;
+
+	whoami = readl(regs + PL_WHOAMI_A);
+	if (whoami != 0xffffffff && whoami != CIM_PF_NOACCESS)
+		return 0;
+
+	msleep(500);
+	whoami = readl(regs + PL_WHOAMI_A);
+	return (whoami != 0xffffffff && whoami != CIM_PF_NOACCESS ? 0 : -EIO);
+}
+
+struct flash_desc {
+	u32 vendor_and_model_id;
+	u32 size_mb;
+};
+
+static int t4_get_flash_params(struct adapter *adap)
+{
+	/* Table for non-Numonix supported flash parts. Numonix parts are left
+	 * to the preexisting code. All flash parts have 64KB sectors.
+	 */
+	static struct flash_desc supported_flash[] = {
+		{ 0x150201, 4 << 20 },	/* Spansion 4MB S25FL032P */
+	};
+
+	unsigned int part, manufacturer;
+	unsigned int density, size;
+	u32 flashid = 0;
+	int ret;
+
+	/* Issue a Read ID Command to the Flash part. We decode supported
+	 * Flash parts and their sizes from this. There's a newer Query
+	 * Command which can retrieve detailed geometry information but many
+	 * Flash parts don't support it.
+	 */
+
+	ret = sf1_write(adap, 1, 1, 0, SF_RD_ID);
+	if (!ret)
+		ret = sf1_read(adap, 3, 0, 1, &flashid);
+	t4_write_reg(adap, SF_OP_A, 0);	/* unlock SF */
+	if (ret)
+		return ret;
+
+	/* Check to see if it's one of our non-standard supported Flash parts.
+	 */
+	for (part = 0; part < ARRAY_SIZE(supported_flash); part++)
+		if (supported_flash[part].vendor_and_model_id == flashid) {
+			adap->params.sf_size = supported_flash[part].size_mb;
+			adap->params.sf_nsec =
+				adap->params.sf_size / SF_SEC_SIZE;
+			goto found;
+		}
+
+	/* Decode Flash part size. The code below looks repetitive with
+	 * common encodings, but that's not guaranteed in the JEDEC
+	 * specification for the Read JEDEC ID command. The only thing that
+	 * we're guaranteed by the JEDEC specification is where the
+	 * Manufacturer ID is in the returned result. After that each
+	 * Manufacturer ~could~ encode things completely differently.
+	 * Note, all Flash parts must have 64KB sectors.
+	 */
+	manufacturer = flashid & 0xff;
+	switch (manufacturer) {
+	case 0x20: { /* Micron/Numonix */
+		/* This Density -> Size decoding table is taken from Micron
+		 * Data Sheets.
+ */ + density = (flashid >> 16) & 0xff; + switch (density) { + case 0x14: /* 1MB */ + size = 1 << 20; + break; + case 0x15: /* 2MB */ + size = 1 << 21; + break; + case 0x16: /* 4MB */ + size = 1 << 22; + break; + case 0x17: /* 8MB */ + size = 1 << 23; + break; + case 0x18: /* 16MB */ + size = 1 << 24; + break; + case 0x19: /* 32MB */ + size = 1 << 25; + break; + case 0x20: /* 64MB */ + size = 1 << 26; + break; + case 0x21: /* 128MB */ + size = 1 << 27; + break; + case 0x22: /* 256MB */ + size = 1 << 28; + break; + + default: + dev_err(adap->pdev_dev, "Micron Flash Part has bad size, ID = %#x, Density code = %#x\n", + flashid, density); + return -EINVAL; + } + break; + } + case 0x9d: { /* ISSI -- Integrated Silicon Solution, Inc. */ + /* This Density -> Size decoding table is taken from ISSI + * Data Sheets. + */ + density = (flashid >> 16) & 0xff; + switch (density) { + case 0x16: /* 32 MB */ + size = 1 << 25; + break; + case 0x17: /* 64MB */ + size = 1 << 26; + break; + default: + dev_err(adap->pdev_dev, "ISSI Flash Part has bad size, ID = %#x, Density code = %#x\n", + flashid, density); + return -EINVAL; + } + break; + } + case 0xc2: { /* Macronix */ + /* This Density -> Size decoding table is taken from Macronix + * Data Sheets. + */ + density = (flashid >> 16) & 0xff; + switch (density) { + case 0x17: /* 8MB */ + size = 1 << 23; + break; + case 0x18: /* 16MB */ + size = 1 << 24; + break; + default: + dev_err(adap->pdev_dev, "Macronix Flash Part has bad size, ID = %#x, Density code = %#x\n", + flashid, density); + return -EINVAL; + } + break; + } + case 0xef: { /* Winbond */ + /* This Density -> Size decoding table is taken from Winbond + * Data Sheets. + */ + density = (flashid >> 16) & 0xff; + switch (density) { + case 0x17: /* 8MB */ + size = 1 << 23; + break; + case 0x18: /* 16MB */ + size = 1 << 24; + break; + default: + dev_err(adap->pdev_dev, "Winbond Flash Part has bad size, ID = %#x, Density code = %#x\n", + flashid, density); + return -EINVAL; + } + break; + } + default: + dev_err(adap->pdev_dev, "Unsupported Flash Part, ID = %#x\n", + flashid); + return -EINVAL; + } + + /* Store decoded Flash size and fall through into vetting code. */ + adap->params.sf_size = size; + adap->params.sf_nsec = size / SF_SEC_SIZE; + +found: + if (adap->params.sf_size < FLASH_MIN_SIZE) + dev_warn(adap->pdev_dev, "WARNING: Flash Part ID %#x, size %#x < %#x\n", + flashid, adap->params.sf_size, FLASH_MIN_SIZE); + return 0; +} + +/** + * t4_prep_adapter - prepare SW and HW for operation + * @adapter: the adapter + * @reset: if true perform a HW reset + * + * Initialize adapter SW state for the various HW modules, set initial + * values for some adapter tunables, take PHYs out of reset, and + * initialize the MDIO interface. 
+ */ +int t4_prep_adapter(struct adapter *adapter) +{ + int ret, ver; + uint16_t device_id; + u32 pl_rev; + + get_pci_mode(adapter, &adapter->params.pci); + pl_rev = REV_G(t4_read_reg(adapter, PL_REV_A)); + + ret = t4_get_flash_params(adapter); + if (ret < 0) { + dev_err(adapter->pdev_dev, "error %d identifying flash\n", ret); + return ret; + } + + /* Retrieve adapter's device ID + */ + pci_read_config_word(adapter->pdev, PCI_DEVICE_ID, &device_id); + ver = device_id >> 12; + adapter->params.chip = 0; + switch (ver) { + case CHELSIO_T4: + adapter->params.chip |= CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev); + adapter->params.arch.sge_fl_db = DBPRIO_F; + adapter->params.arch.mps_tcam_size = + NUM_MPS_CLS_SRAM_L_INSTANCES; + adapter->params.arch.mps_rplc_size = 128; + adapter->params.arch.nchan = NCHAN; + adapter->params.arch.pm_stats_cnt = PM_NSTATS; + adapter->params.arch.vfcount = 128; + /* Congestion map is for 4 channels so that + * MPS can have 4 priority per port. + */ + adapter->params.arch.cng_ch_bits_log = 2; + break; + case CHELSIO_T5: + adapter->params.chip |= CHELSIO_CHIP_CODE(CHELSIO_T5, pl_rev); + adapter->params.arch.sge_fl_db = DBPRIO_F | DBTYPE_F; + adapter->params.arch.mps_tcam_size = + NUM_MPS_T5_CLS_SRAM_L_INSTANCES; + adapter->params.arch.mps_rplc_size = 128; + adapter->params.arch.nchan = NCHAN; + adapter->params.arch.pm_stats_cnt = PM_NSTATS; + adapter->params.arch.vfcount = 128; + adapter->params.arch.cng_ch_bits_log = 2; + break; + case CHELSIO_T6: + adapter->params.chip |= CHELSIO_CHIP_CODE(CHELSIO_T6, pl_rev); + adapter->params.arch.sge_fl_db = 0; + adapter->params.arch.mps_tcam_size = + NUM_MPS_T5_CLS_SRAM_L_INSTANCES; + adapter->params.arch.mps_rplc_size = 256; + adapter->params.arch.nchan = 2; + adapter->params.arch.pm_stats_cnt = T6_PM_NSTATS; + adapter->params.arch.vfcount = 256; + /* Congestion map will be for 2 channels so that + * MPS can have 8 priority per port. + */ + adapter->params.arch.cng_ch_bits_log = 3; + break; + default: + dev_err(adapter->pdev_dev, "Device %d is not supported\n", + device_id); + return -EINVAL; + } + + adapter->params.cim_la_size = CIMLA_SIZE; + init_cong_ctrl(adapter->params.a_wnd, adapter->params.b_wnd); + + /* + * Default port for debugging in case we can't reach FW. + */ + adapter->params.nports = 1; + adapter->params.portvec = 1; + adapter->params.vpd.cclk = 50000; + + /* Set PCIe completion timeout to 4 seconds. */ + pcie_capability_clear_and_set_word(adapter->pdev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_COMP_TIMEOUT, 0xd); + return 0; +} + +/** + * t4_shutdown_adapter - shut down adapter, host & wire + * @adapter: the adapter + * + * Perform an emergency shutdown of the adapter and stop it from + * continuing any further communication on the ports or DMA to the + * host. This is typically used when the adapter and/or firmware + * have crashed and we want to prevent any further accidental + * communication with the rest of the world. This will also force + * the port Link Status to go down -- if register writes work -- + * which should help our peers figure out that we're down. + */ +int t4_shutdown_adapter(struct adapter *adapter) +{ + int port; + + t4_intr_disable(adapter); + t4_write_reg(adapter, DBG_GPIO_EN_A, 0); + for_each_port(adapter, port) { + u32 a_port_cfg = is_t4(adapter->params.chip) ? 
+			PORT_REG(port, XGMAC_PORT_CFG_A) :
+			T5_PORT_REG(port, MAC_PORT_CFG_A);
+
+		t4_write_reg(adapter, a_port_cfg,
+			     t4_read_reg(adapter, a_port_cfg)
+			     & ~SIGNAL_DET_V(1));
+	}
+	t4_set_reg_field(adapter, SGE_CONTROL_A, GLOBALENABLE_F, 0);
+
+	return 0;
+}
+
+/**
+ * t4_bar2_sge_qregs - return BAR2 SGE Queue register information
+ * @adapter: the adapter
+ * @qid: the Queue ID
+ * @qtype: the Ingress or Egress type for @qid
+ * @user: true if this request is for a user mode queue
+ * @pbar2_qoffset: BAR2 Queue Offset
+ * @pbar2_qid: BAR2 Queue ID or 0 for Queue ID inferred SGE Queues
+ *
+ * Returns the BAR2 SGE Queue Registers information associated with the
+ * indicated Absolute Queue ID. These are passed back in return value
+ * pointers. @qtype should be T4_BAR2_QTYPE_EGRESS for Egress Queue
+ * and T4_BAR2_QTYPE_INGRESS for Ingress Queues.
+ *
+ * This may return an error which indicates that BAR2 SGE Queue
+ * registers aren't available. If an error is not returned, then the
+ * following values are returned:
+ *
+ *   *@pbar2_qoffset: the BAR2 Offset of the @qid Registers
+ *   *@pbar2_qid: the BAR2 SGE Queue ID or 0 of @qid
+ *
+ * If the returned BAR2 Queue ID is 0, then BAR2 SGE registers which
+ * require the "Inferred Queue ID" ability may be used. E.g. the
+ * Write Combining Doorbell Buffer. If the BAR2 Queue ID is not 0,
+ * then these "Inferred Queue ID" register may not be used.
+ */
+int t4_bar2_sge_qregs(struct adapter *adapter,
+		      unsigned int qid,
+		      enum t4_bar2_qtype qtype,
+		      int user,
+		      u64 *pbar2_qoffset,
+		      unsigned int *pbar2_qid)
+{
+	unsigned int page_shift, page_size, qpp_shift, qpp_mask;
+	u64 bar2_page_offset, bar2_qoffset;
+	unsigned int bar2_qid, bar2_qid_offset, bar2_qinferred;
+
+	/* T4 doesn't support BAR2 SGE Queue registers for kernel mode queues */
+	if (!user && is_t4(adapter->params.chip))
+		return -EINVAL;
+
+	/* Get our SGE Page Size parameters.
+	 */
+	page_shift = adapter->params.sge.hps + 10;
+	page_size = 1 << page_shift;
+
+	/* Get the right Queues per Page parameters for our Queue.
+	 */
+	qpp_shift = (qtype == T4_BAR2_QTYPE_EGRESS
+		     ? adapter->params.sge.eq_qpp
+		     : adapter->params.sge.iq_qpp);
+	qpp_mask = (1 << qpp_shift) - 1;
+
+	/* Calculate the basics of the BAR2 SGE Queue register area:
+	 *  o The BAR2 page the Queue registers will be in.
+	 *  o The BAR2 Queue ID.
+	 *  o The BAR2 Queue ID Offset into the BAR2 page.
+	 */
+	bar2_page_offset = ((u64)(qid >> qpp_shift) << page_shift);
+	bar2_qid = qid & qpp_mask;
+	bar2_qid_offset = bar2_qid * SGE_UDB_SIZE;
+
+	/* If the BAR2 Queue ID Offset is less than the Page Size, then the
+	 * hardware will infer the Absolute Queue ID simply from the writes to
+	 * the BAR2 Queue ID Offset within the BAR2 Page (and we need to use a
+	 * BAR2 Queue ID of 0 for those writes). Otherwise, we'll simply
+	 * write to the first BAR2 SGE Queue Area within the BAR2 Page with
+	 * the BAR2 Queue ID and the hardware will infer the Absolute Queue ID
+	 * from the BAR2 Page and BAR2 Queue ID.
+	 *
+	 * One important consequence of this is that some BAR2 SGE registers
+	 * have a "Queue ID" field and we can write the BAR2 SGE Queue ID
+	 * there. But other registers synthesize the SGE Queue ID purely
+	 * from the writes to the registers -- the Write Combined Doorbell
+	 * Buffer is a good example. These BAR2 SGE Registers are only
+	 * available for those BAR2 SGE Register areas where the SGE Absolute
+	 * Queue ID can be inferred from simple writes.
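+	 *
+	 * Editor's worked example (illustrative numbers, not from the
+	 * original patch): with a 4KB SGE page size (page_shift = 12) and
+	 * 8 egress queues per page (qpp_shift = 3), Absolute Queue ID 21
+	 * maps to bar2_page_offset = (21 >> 3) << 12 = 0x2000,
+	 * bar2_qid = 21 & 7 = 5 and bar2_qid_offset = 5 * SGE_UDB_SIZE.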
+	 */
+	bar2_qoffset = bar2_page_offset;
+	bar2_qinferred = (bar2_qid_offset < page_size);
+	if (bar2_qinferred) {
+		bar2_qoffset += bar2_qid_offset;
+		bar2_qid = 0;
+	}
+
+	*pbar2_qoffset = bar2_qoffset;
+	*pbar2_qid = bar2_qid;
+	return 0;
+}
+
+/**
+ * t4_init_devlog_params - initialize adapter->params.devlog
+ * @adap: the adapter
+ *
+ * Initialize various fields of the adapter's Firmware Device Log
+ * Parameters structure.
+ */
+int t4_init_devlog_params(struct adapter *adap)
+{
+	struct devlog_params *dparams = &adap->params.devlog;
+	u32 pf_dparams;
+	unsigned int devlog_meminfo;
+	struct fw_devlog_cmd devlog_cmd;
+	int ret;
+
+	/* If we're dealing with newer firmware, the Device Log Parameters
+	 * are stored in a designated register which allows us to access the
+	 * Device Log even if we can't talk to the firmware.
+	 */
+	pf_dparams =
+		t4_read_reg(adap, PCIE_FW_REG(PCIE_FW_PF_A, PCIE_FW_PF_DEVLOG));
+	if (pf_dparams) {
+		unsigned int nentries, nentries128;
+
+		dparams->memtype = PCIE_FW_PF_DEVLOG_MEMTYPE_G(pf_dparams);
+		dparams->start = PCIE_FW_PF_DEVLOG_ADDR16_G(pf_dparams) << 4;
+
+		nentries128 = PCIE_FW_PF_DEVLOG_NENTRIES128_G(pf_dparams);
+		nentries = (nentries128 + 1) * 128;
+		dparams->size = nentries * sizeof(struct fw_devlog_e);
+
+		return 0;
+	}
+
+	/* Otherwise, ask the firmware for its Device Log Parameters.
+	 */
+	memset(&devlog_cmd, 0, sizeof(devlog_cmd));
+	devlog_cmd.op_to_write = cpu_to_be32(FW_CMD_OP_V(FW_DEVLOG_CMD) |
+					     FW_CMD_REQUEST_F | FW_CMD_READ_F);
+	devlog_cmd.retval_len16 = cpu_to_be32(FW_LEN16(devlog_cmd));
+	ret = t4_wr_mbox(adap, adap->mbox, &devlog_cmd, sizeof(devlog_cmd),
+			 &devlog_cmd);
+	if (ret)
+		return ret;
+
+	devlog_meminfo =
+		be32_to_cpu(devlog_cmd.memtype_devlog_memaddr16_devlog);
+	dparams->memtype = FW_DEVLOG_CMD_MEMTYPE_DEVLOG_G(devlog_meminfo);
+	dparams->start = FW_DEVLOG_CMD_MEMADDR16_DEVLOG_G(devlog_meminfo) << 4;
+	dparams->size = be32_to_cpu(devlog_cmd.memsize_devlog);
+
+	return 0;
+}
+
+/**
+ * t4_init_sge_params - initialize adap->params.sge
+ * @adapter: the adapter
+ *
+ * Initialize various fields of the adapter's SGE Parameters structure.
+ */
+int t4_init_sge_params(struct adapter *adapter)
+{
+	struct sge_params *sge_params = &adapter->params.sge;
+	u32 hps, qpp;
+	unsigned int s_hps, s_qpp;
+
+	/* Extract the SGE Page Size for our PF.
+	 */
+	hps = t4_read_reg(adapter, SGE_HOST_PAGE_SIZE_A);
+	s_hps = (HOSTPAGESIZEPF0_S +
+		 (HOSTPAGESIZEPF1_S - HOSTPAGESIZEPF0_S) * adapter->pf);
+	sge_params->hps = ((hps >> s_hps) & HOSTPAGESIZEPF0_M);
+
+	/* Extract the SGE Egress and Ingress Queues Per Page for our PF.
+	 */
+	s_qpp = (QUEUESPERPAGEPF0_S +
+		 (QUEUESPERPAGEPF1_S - QUEUESPERPAGEPF0_S) * adapter->pf);
+	qpp = t4_read_reg(adapter, SGE_EGRESS_QUEUES_PER_PAGE_PF_A);
+	sge_params->eq_qpp = ((qpp >> s_qpp) & QUEUESPERPAGEPF0_M);
+	qpp = t4_read_reg(adapter, SGE_INGRESS_QUEUES_PER_PAGE_PF_A);
+	sge_params->iq_qpp = ((qpp >> s_qpp) & QUEUESPERPAGEPF0_M);
+
+	return 0;
+}
+
+/**
+ * t4_init_tp_params - initialize adap->params.tp
+ * @adap: the adapter
+ * @sleep_ok: if true we may sleep while awaiting command completion
+ *
+ * Initialize various fields of the adapter's TP Parameters structure.
+ */
+int t4_init_tp_params(struct adapter *adap, bool sleep_ok)
+{
+	int chan;
+	u32 v;
+
+	v = t4_read_reg(adap, TP_TIMER_RESOLUTION_A);
+	adap->params.tp.tre = TIMERRESOLUTION_G(v);
+	adap->params.tp.dack_re = DELAYEDACKRESOLUTION_G(v);
+
+	/* MODQ_REQ_MAP defaults to setting queues 0-3 to chan 0-3 */
+	for (chan = 0; chan < NCHAN; chan++)
+		adap->params.tp.tx_modq[chan] = chan;
+
+	/* Cache the adapter's Compressed Filter Mode and global Ingress
+	 * Configuration.
+	 */
+	t4_tp_pio_read(adap, &adap->params.tp.vlan_pri_map, 1,
+		       TP_VLAN_PRI_MAP_A, sleep_ok);
+	t4_tp_pio_read(adap, &adap->params.tp.ingress_config, 1,
+		       TP_INGRESS_CONFIG_A, sleep_ok);
+
+	/* For T6, cache the adapter's compressed error vector
+	 * and passing outer header info for encapsulated packets.
+	 */
+	if (CHELSIO_CHIP_VERSION(adap->params.chip) > CHELSIO_T5) {
+		v = t4_read_reg(adap, TP_OUT_CONFIG_A);
+		adap->params.tp.rx_pkt_encap = (v & CRXPKTENC_F) ? 1 : 0;
+	}
+
+	/* Now that we have TP_VLAN_PRI_MAP cached, we can calculate the field
+	 * shift positions of several elements of the Compressed Filter Tuple
+	 * for this adapter which we need frequently ...
+	 */
+	adap->params.tp.fcoe_shift = t4_filter_field_shift(adap, FCOE_F);
+	adap->params.tp.port_shift = t4_filter_field_shift(adap, PORT_F);
+	adap->params.tp.vnic_shift = t4_filter_field_shift(adap, VNIC_ID_F);
+	adap->params.tp.vlan_shift = t4_filter_field_shift(adap, VLAN_F);
+	adap->params.tp.tos_shift = t4_filter_field_shift(adap, TOS_F);
+	adap->params.tp.protocol_shift = t4_filter_field_shift(adap,
+								PROTOCOL_F);
+	adap->params.tp.ethertype_shift = t4_filter_field_shift(adap,
+								ETHERTYPE_F);
+	adap->params.tp.macmatch_shift = t4_filter_field_shift(adap,
+							       MACMATCH_F);
+	adap->params.tp.matchtype_shift = t4_filter_field_shift(adap,
+								MPSHITTYPE_F);
+	adap->params.tp.frag_shift = t4_filter_field_shift(adap,
+							   FRAGMENTATION_F);
+
+	/* If TP_INGRESS_CONFIG.VNID == 0, then TP_VLAN_PRI_MAP.VNIC_ID
+	 * represents the presence of an Outer VLAN instead of a VNIC ID.
+	 */
+	if ((adap->params.tp.ingress_config & VNIC_F) == 0)
+		adap->params.tp.vnic_shift = -1;
+
+	v = t4_read_reg(adap, LE_3_DB_HASH_MASK_GEN_IPV4_T6_A);
+	adap->params.tp.hash_filter_mask = v;
+	v = t4_read_reg(adap, LE_4_DB_HASH_MASK_GEN_IPV4_T6_A);
+	adap->params.tp.hash_filter_mask |= ((u64)v << 32);
+	return 0;
+}
+
+/**
+ * t4_filter_field_shift - calculate filter field shift
+ * @adap: the adapter
+ * @filter_sel: the desired field (from TP_VLAN_PRI_MAP bits)
+ *
+ * Return the shift position of a filter field within the Compressed
+ * Filter Tuple. The filter field is specified via its selection bit
+ * within TP_VLAN_PRI_MAP (filter mode). E.g. F_VLAN.
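+ *
+ * Editor's note (illustrative, not part of the original patch): callers
+ * cache these shifts once, as t4_init_tp_params() above does, e.g.
+ *
+ *	adap->params.tp.port_shift = t4_filter_field_shift(adap, PORT_F);
+ *
+ * and a return value of -1 means the field is not part of the current
+ * filter mode.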
+ */
+int t4_filter_field_shift(const struct adapter *adap, int filter_sel)
+{
+	unsigned int filter_mode = adap->params.tp.vlan_pri_map;
+	unsigned int sel;
+	int field_shift;
+
+	if ((filter_mode & filter_sel) == 0)
+		return -1;
+
+	for (sel = 1, field_shift = 0; sel < filter_sel; sel <<= 1) {
+		switch (filter_mode & sel) {
+		case FCOE_F:
+			field_shift += FT_FCOE_W;
+			break;
+		case PORT_F:
+			field_shift += FT_PORT_W;
+			break;
+		case VNIC_ID_F:
+			field_shift += FT_VNIC_ID_W;
+			break;
+		case VLAN_F:
+			field_shift += FT_VLAN_W;
+			break;
+		case TOS_F:
+			field_shift += FT_TOS_W;
+			break;
+		case PROTOCOL_F:
+			field_shift += FT_PROTOCOL_W;
+			break;
+		case ETHERTYPE_F:
+			field_shift += FT_ETHERTYPE_W;
+			break;
+		case MACMATCH_F:
+			field_shift += FT_MACMATCH_W;
+			break;
+		case MPSHITTYPE_F:
+			field_shift += FT_MPSHITTYPE_W;
+			break;
+		case FRAGMENTATION_F:
+			field_shift += FT_FRAGMENTATION_W;
+			break;
+		}
+	}
+	return field_shift;
+}
+
+int t4_init_rss_mode(struct adapter *adap, int mbox)
+{
+	int i, ret;
+	struct fw_rss_vi_config_cmd rvc;
+
+	memset(&rvc, 0, sizeof(rvc));
+
+	for_each_port(adap, i) {
+		struct port_info *p = adap2pinfo(adap, i);
+
+		rvc.op_to_viid =
+			cpu_to_be32(FW_CMD_OP_V(FW_RSS_VI_CONFIG_CMD) |
+				    FW_CMD_REQUEST_F | FW_CMD_READ_F |
+				    FW_RSS_VI_CONFIG_CMD_VIID_V(p->viid));
+		rvc.retval_len16 = cpu_to_be32(FW_LEN16(rvc));
+		ret = t4_wr_mbox(adap, mbox, &rvc, sizeof(rvc), &rvc);
+		if (ret)
+			return ret;
+		p->rss_mode = be32_to_cpu(rvc.u.basicvirtual.defaultq_to_udpen);
+	}
+	return 0;
+}
+
+/**
+ * t4_init_portinfo - allocate a virtual interface and initialize port_info
+ * @pi: the port_info
+ * @mbox: mailbox to use for the FW command
+ * @port: physical port associated with the VI
+ * @pf: the PF owning the VI
+ * @vf: the VF owning the VI
+ * @mac: the MAC address of the VI
+ *
+ * Allocates a virtual interface for the given physical port. If @mac is
+ * not %NULL it contains the MAC address of the VI as assigned by FW.
+ * @mac should be large enough to hold an Ethernet address.
+ * Returns < 0 on error.
+ */
+int t4_init_portinfo(struct port_info *pi, int mbox,
+		     int port, int pf, int vf, u8 mac[])
+{
+	struct adapter *adapter = pi->adapter;
+	unsigned int fw_caps = adapter->params.fw_caps_support;
+	struct fw_port_cmd cmd;
+	unsigned int rss_size;
+	enum fw_port_type port_type;
+	int mdio_addr;
+	fw_port_cap32_t pcaps, acaps;
+	int ret;
+
+	/* If we haven't yet determined whether we're talking to Firmware
+	 * which knows the new 32-bit Port Capabilities, it's time to find
+	 * out now. This will also tell new Firmware to send us Port Status
+	 * Updates using the new 32-bit Port Capabilities version of the
+	 * Port Information message.
+	 */
+	if (fw_caps == FW_CAPS_UNKNOWN) {
+		u32 param, val;
+
+		param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) |
+			 FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_PORT_CAPS32));
+		val = 1;
+		ret = t4_set_params(adapter, mbox, pf, vf, 1, &param, &val);
+		fw_caps = (ret == 0 ? FW_CAPS32 : FW_CAPS16);
+		adapter->params.fw_caps_support = fw_caps;
+	}
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) |
+				       FW_CMD_REQUEST_F | FW_CMD_READ_F |
+				       FW_PORT_CMD_PORTID_V(port));
+	cmd.action_to_len16 = cpu_to_be32(
+		FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16
+				     ? FW_PORT_ACTION_GET_PORT_INFO
+				     : FW_PORT_ACTION_GET_PORT_INFO32) |
+		FW_LEN16(cmd));
+	ret = t4_wr_mbox(pi->adapter, mbox, &cmd, sizeof(cmd), &cmd);
+	if (ret)
+		return ret;
+
+	/* Extract the various fields from the Port Information message.
+ */ + if (fw_caps == FW_CAPS16) { + u32 lstatus = be32_to_cpu(cmd.u.info.lstatus_to_modtype); + + port_type = FW_PORT_CMD_PTYPE_G(lstatus); + mdio_addr = ((lstatus & FW_PORT_CMD_MDIOCAP_F) + ? FW_PORT_CMD_MDIOADDR_G(lstatus) + : -1); + pcaps = fwcaps16_to_caps32(be16_to_cpu(cmd.u.info.pcap)); + acaps = fwcaps16_to_caps32(be16_to_cpu(cmd.u.info.acap)); + } else { + u32 lstatus32 = be32_to_cpu(cmd.u.info32.lstatus32_to_cbllen32); + + port_type = FW_PORT_CMD_PORTTYPE32_G(lstatus32); + mdio_addr = ((lstatus32 & FW_PORT_CMD_MDIOCAP32_F) + ? FW_PORT_CMD_MDIOADDR32_G(lstatus32) + : -1); + pcaps = be32_to_cpu(cmd.u.info32.pcaps32); + acaps = be32_to_cpu(cmd.u.info32.acaps32); + } + + ret = t4_alloc_vi(pi->adapter, mbox, port, pf, vf, 1, mac, &rss_size); + if (ret < 0) + return ret; + + pi->viid = ret; + pi->tx_chan = port; + pi->lport = port; + pi->rss_size = rss_size; + + pi->port_type = port_type; + pi->mdio_addr = mdio_addr; + pi->mod_type = FW_PORT_MOD_TYPE_NA; + + init_link_config(&pi->link_cfg, pcaps, acaps); + return 0; +} + +int t4_port_init(struct adapter *adap, int mbox, int pf, int vf) +{ + u8 addr[6]; + int ret, i, j = 0; + + for_each_port(adap, i) { + struct port_info *pi = adap2pinfo(adap, i); + + while ((adap->params.portvec & (1 << j)) == 0) + j++; + + ret = t4_init_portinfo(pi, mbox, j, pf, vf, addr); + if (ret) + return ret; + + memcpy(adap->port[i]->dev_addr, addr, ETH_ALEN); + j++; + } + return 0; +} + +/** + * t4_read_cimq_cfg - read CIM queue configuration + * @adap: the adapter + * @base: holds the queue base addresses in bytes + * @size: holds the queue sizes in bytes + * @thres: holds the queue full thresholds in bytes + * + * Returns the current configuration of the CIM queues, starting with + * the IBQs, then the OBQs. + */ +void t4_read_cimq_cfg(struct adapter *adap, u16 *base, u16 *size, u16 *thres) +{ + unsigned int i, v; + int cim_num_obq = is_t4(adap->params.chip) ? + CIM_NUM_OBQ : CIM_NUM_OBQ_T5; + + for (i = 0; i < CIM_NUM_IBQ; i++) { + t4_write_reg(adap, CIM_QUEUE_CONFIG_REF_A, IBQSELECT_F | + QUENUMSELECT_V(i)); + v = t4_read_reg(adap, CIM_QUEUE_CONFIG_CTRL_A); + /* value is in 256-byte units */ + *base++ = CIMQBASE_G(v) * 256; + *size++ = CIMQSIZE_G(v) * 256; + *thres++ = QUEFULLTHRSH_G(v) * 8; /* 8-byte unit */ + } + for (i = 0; i < cim_num_obq; i++) { + t4_write_reg(adap, CIM_QUEUE_CONFIG_REF_A, OBQSELECT_F | + QUENUMSELECT_V(i)); + v = t4_read_reg(adap, CIM_QUEUE_CONFIG_CTRL_A); + /* value is in 256-byte units */ + *base++ = CIMQBASE_G(v) * 256; + *size++ = CIMQSIZE_G(v) * 256; + } +} + +/** + * t4_read_cim_ibq - read the contents of a CIM inbound queue + * @adap: the adapter + * @qid: the queue index + * @data: where to store the queue contents + * @n: capacity of @data in 32-bit words + * + * Reads the contents of the selected CIM queue starting at address 0 up + * to the capacity of @data. @n must be a multiple of 4. Returns < 0 on + * error and the number of 32-bit words actually read on success. + */ +int t4_read_cim_ibq(struct adapter *adap, unsigned int qid, u32 *data, size_t n) +{ + int i, err, attempts; + unsigned int addr; + const unsigned int nwords = CIM_IBQ_SIZE * 4; + + if (qid > 5 || (n & 3)) + return -EINVAL; + + addr = qid * nwords; + if (n > nwords) + n = nwords; + + /* It might take 3-10ms before the IBQ debug read access is allowed. + * Wait for 1 Sec with a delay of 1 usec. 
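+	 * (Editor's note: that is the 1,000,000 polls of 1 usec each set up
+	 * via the "attempts" value just below.)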
+ */
+	attempts = 1000000;
+
+	for (i = 0; i < n; i++, addr++) {
+		t4_write_reg(adap, CIM_IBQ_DBG_CFG_A, IBQDBGADDR_V(addr) |
+			     IBQDBGEN_F);
+		err = t4_wait_op_done(adap, CIM_IBQ_DBG_CFG_A, IBQDBGBUSY_F, 0,
+				      attempts, 1);
+		if (err)
+			return err;
+		*data++ = t4_read_reg(adap, CIM_IBQ_DBG_DATA_A);
+	}
+	t4_write_reg(adap, CIM_IBQ_DBG_CFG_A, 0);
+	return i;
+}
+
+/**
+ * t4_read_cim_obq - read the contents of a CIM outbound queue
+ * @adap: the adapter
+ * @qid: the queue index
+ * @data: where to store the queue contents
+ * @n: capacity of @data in 32-bit words
+ *
+ * Reads the contents of the selected CIM queue starting at address 0 up
+ * to the capacity of @data. @n must be a multiple of 4. Returns < 0 on
+ * error and the number of 32-bit words actually read on success.
+ */
+int t4_read_cim_obq(struct adapter *adap, unsigned int qid, u32 *data, size_t n)
+{
+	int i, err;
+	unsigned int addr, v, nwords;
+	int cim_num_obq = is_t4(adap->params.chip) ?
+				CIM_NUM_OBQ : CIM_NUM_OBQ_T5;
+
+	if ((qid > (cim_num_obq - 1)) || (n & 3))
+		return -EINVAL;
+
+	t4_write_reg(adap, CIM_QUEUE_CONFIG_REF_A, OBQSELECT_F |
+		     QUENUMSELECT_V(qid));
+	v = t4_read_reg(adap, CIM_QUEUE_CONFIG_CTRL_A);
+
+	addr = CIMQBASE_G(v) * 64;    /* multiple of 256 -> multiple of 4 */
+	nwords = CIMQSIZE_G(v) * 64;  /* same */
+	if (n > nwords)
+		n = nwords;
+
+	for (i = 0; i < n; i++, addr++) {
+		t4_write_reg(adap, CIM_OBQ_DBG_CFG_A, OBQDBGADDR_V(addr) |
+			     OBQDBGEN_F);
+		err = t4_wait_op_done(adap, CIM_OBQ_DBG_CFG_A, OBQDBGBUSY_F, 0,
+				      2, 1);
+		if (err)
+			return err;
+		*data++ = t4_read_reg(adap, CIM_OBQ_DBG_DATA_A);
+	}
+	t4_write_reg(adap, CIM_OBQ_DBG_CFG_A, 0);
+	return i;
+}
+
+/**
+ * t4_cim_read - read a block from CIM internal address space
+ * @adap: the adapter
+ * @addr: the start address within the CIM address space
+ * @n: number of words to read
+ * @valp: where to store the result
+ *
+ * Reads a block of 4-byte words from the CIM internal address space.
+ */
+int t4_cim_read(struct adapter *adap, unsigned int addr, unsigned int n,
+		unsigned int *valp)
+{
+	int ret = 0;
+
+	if (t4_read_reg(adap, CIM_HOST_ACC_CTRL_A) & HOSTBUSY_F)
+		return -EBUSY;
+
+	for ( ; !ret && n--; addr += 4) {
+		t4_write_reg(adap, CIM_HOST_ACC_CTRL_A, addr);
+		ret = t4_wait_op_done(adap, CIM_HOST_ACC_CTRL_A, HOSTBUSY_F,
+				      0, 5, 2);
+		if (!ret)
+			*valp++ = t4_read_reg(adap, CIM_HOST_ACC_DATA_A);
+	}
+	return ret;
+}
+
+/**
+ * t4_cim_write - write a block into CIM internal address space
+ * @adap: the adapter
+ * @addr: the start address within the CIM address space
+ * @n: number of words to write
+ * @valp: set of values to write
+ *
+ * Writes a block of 4-byte words into the CIM internal address space.
+ */ +int t4_cim_write(struct adapter *adap, unsigned int addr, unsigned int n, + const unsigned int *valp) +{ + int ret = 0; + + if (t4_read_reg(adap, CIM_HOST_ACC_CTRL_A) & HOSTBUSY_F) + return -EBUSY; + + for ( ; !ret && n--; addr += 4) { + t4_write_reg(adap, CIM_HOST_ACC_DATA_A, *valp++); + t4_write_reg(adap, CIM_HOST_ACC_CTRL_A, addr | HOSTWRITE_F); + ret = t4_wait_op_done(adap, CIM_HOST_ACC_CTRL_A, HOSTBUSY_F, + 0, 5, 2); + } + return ret; +} + +static int t4_cim_write1(struct adapter *adap, unsigned int addr, + unsigned int val) +{ + return t4_cim_write(adap, addr, 1, &val); +} + +/** + * t4_cim_read_la - read CIM LA capture buffer + * @adap: the adapter + * @la_buf: where to store the LA data + * @wrptr: the HW write pointer within the capture buffer + * + * Reads the contents of the CIM LA buffer with the most recent entry at + * the end of the returned data and with the entry at @wrptr first. + * We try to leave the LA in the running state we find it in. + */ +int t4_cim_read_la(struct adapter *adap, u32 *la_buf, unsigned int *wrptr) +{ + int i, ret; + unsigned int cfg, val, idx; + + ret = t4_cim_read(adap, UP_UP_DBG_LA_CFG_A, 1, &cfg); + if (ret) + return ret; + + if (cfg & UPDBGLAEN_F) { /* LA is running, freeze it */ + ret = t4_cim_write1(adap, UP_UP_DBG_LA_CFG_A, 0); + if (ret) + return ret; + } + + ret = t4_cim_read(adap, UP_UP_DBG_LA_CFG_A, 1, &val); + if (ret) + goto restart; + + idx = UPDBGLAWRPTR_G(val); + if (wrptr) + *wrptr = idx; + + for (i = 0; i < adap->params.cim_la_size; i++) { + ret = t4_cim_write1(adap, UP_UP_DBG_LA_CFG_A, + UPDBGLARDPTR_V(idx) | UPDBGLARDEN_F); + if (ret) + break; + ret = t4_cim_read(adap, UP_UP_DBG_LA_CFG_A, 1, &val); + if (ret) + break; + if (val & UPDBGLARDEN_F) { + ret = -ETIMEDOUT; + break; + } + ret = t4_cim_read(adap, UP_UP_DBG_LA_DATA_A, 1, &la_buf[i]); + if (ret) + break; + + /* Bits 0-3 of UpDbgLaRdPtr can be between 0000 to 1001 to + * identify the 32-bit portion of the full 312-bit data + */ + if (is_t6(adap->params.chip) && (idx & 0xf) >= 9) + idx = (idx & 0xff0) + 0x10; + else + idx++; + /* address can't exceed 0xfff */ + idx &= UPDBGLARDPTR_M; + } +restart: + if (cfg & UPDBGLAEN_F) { + int r = t4_cim_write1(adap, UP_UP_DBG_LA_CFG_A, + cfg & ~UPDBGLARDEN_F); + if (!ret) + ret = r; + } + return ret; +} + +/** + * t4_tp_read_la - read TP LA capture buffer + * @adap: the adapter + * @la_buf: where to store the LA data + * @wrptr: the HW write pointer within the capture buffer + * + * Reads the contents of the TP LA buffer with the most recent entry at + * the end of the returned data and with the entry at @wrptr first. + * We leave the LA in the running state we find it in. 
+ */ +void t4_tp_read_la(struct adapter *adap, u64 *la_buf, unsigned int *wrptr) +{ + bool last_incomplete; + unsigned int i, cfg, val, idx; + + cfg = t4_read_reg(adap, TP_DBG_LA_CONFIG_A) & 0xffff; + if (cfg & DBGLAENABLE_F) /* freeze LA */ + t4_write_reg(adap, TP_DBG_LA_CONFIG_A, + adap->params.tp.la_mask | (cfg ^ DBGLAENABLE_F)); + + val = t4_read_reg(adap, TP_DBG_LA_CONFIG_A); + idx = DBGLAWPTR_G(val); + last_incomplete = DBGLAMODE_G(val) >= 2 && (val & DBGLAWHLF_F) == 0; + if (last_incomplete) + idx = (idx + 1) & DBGLARPTR_M; + if (wrptr) + *wrptr = idx; + + val &= 0xffff; + val &= ~DBGLARPTR_V(DBGLARPTR_M); + val |= adap->params.tp.la_mask; + + for (i = 0; i < TPLA_SIZE; i++) { + t4_write_reg(adap, TP_DBG_LA_CONFIG_A, DBGLARPTR_V(idx) | val); + la_buf[i] = t4_read_reg64(adap, TP_DBG_LA_DATAL_A); + idx = (idx + 1) & DBGLARPTR_M; + } + + /* Wipe out last entry if it isn't valid */ + if (last_incomplete) + la_buf[TPLA_SIZE - 1] = ~0ULL; + + if (cfg & DBGLAENABLE_F) /* restore running state */ + t4_write_reg(adap, TP_DBG_LA_CONFIG_A, + cfg | adap->params.tp.la_mask); +} + +/* SGE Hung Ingress DMA Warning Threshold time and Warning Repeat Rate (in + * seconds). If we find one of the SGE Ingress DMA State Machines in the same + * state for more than the Warning Threshold then we'll issue a warning about + * a potential hang. We'll repeat the warning as the SGE Ingress DMA Channel + * appears to be hung every Warning Repeat second till the situation clears. + * If the situation clears, we'll note that as well. + */ +#define SGE_IDMA_WARN_THRESH 1 +#define SGE_IDMA_WARN_REPEAT 300 + +/** + * t4_idma_monitor_init - initialize SGE Ingress DMA Monitor + * @adapter: the adapter + * @idma: the adapter IDMA Monitor state + * + * Initialize the state of an SGE Ingress DMA Monitor. + */ +void t4_idma_monitor_init(struct adapter *adapter, + struct sge_idma_monitor_state *idma) +{ + /* Initialize the state variables for detecting an SGE Ingress DMA + * hang. The SGE has internal counters which count up on each clock + * tick whenever the SGE finds its Ingress DMA State Engines in the + * same state they were on the previous clock tick. The clock used is + * the Core Clock so we have a limit on the maximum "time" they can + * record; typically a very small number of seconds. For instance, + * with a 600MHz Core Clock, we can only count up to a bit more than + * 7s. So we'll synthesize a larger counter in order to not run the + * risk of having the "timers" overflow and give us the flexibility to + * maintain a Hung SGE State Machine of our own which operates across + * a longer time frame. + */ + idma->idma_1s_thresh = core_ticks_per_usec(adapter) * 1000000; /* 1s */ + idma->idma_stalled[0] = 0; + idma->idma_stalled[1] = 0; +} + +/** + * t4_idma_monitor - monitor SGE Ingress DMA state + * @adapter: the adapter + * @idma: the adapter IDMA Monitor state + * @hz: number of ticks/second + * @ticks: number of ticks since the last IDMA Monitor call + */ +void t4_idma_monitor(struct adapter *adapter, + struct sge_idma_monitor_state *idma, + int hz, int ticks) +{ + int i, idma_same_state_cnt[2]; + + /* Read the SGE Debug Ingress DMA Same State Count registers. These + * are counters inside the SGE which count up on each clock when the + * SGE finds its Ingress DMA State Engines in the same states they + * were in the previous clock. The counters will peg out at + * 0xffffffff without wrapping around so once they pass the 1s + * threshold they'll stay above that till the IDMA state changes. 
+ */ + t4_write_reg(adapter, SGE_DEBUG_INDEX_A, 13); + idma_same_state_cnt[0] = t4_read_reg(adapter, SGE_DEBUG_DATA_HIGH_A); + idma_same_state_cnt[1] = t4_read_reg(adapter, SGE_DEBUG_DATA_LOW_A); + + for (i = 0; i < 2; i++) { + u32 debug0, debug11; + + /* If the Ingress DMA Same State Counter ("timer") is less + * than 1s, then we can reset our synthesized Stall Timer and + * continue. If we have previously emitted warnings about a + * potential stalled Ingress Queue, issue a note indicating + * that the Ingress Queue has resumed forward progress. + */ + if (idma_same_state_cnt[i] < idma->idma_1s_thresh) { + if (idma->idma_stalled[i] >= SGE_IDMA_WARN_THRESH * hz) + dev_warn(adapter->pdev_dev, "SGE idma%d, queue %u, " + "resumed after %d seconds\n", + i, idma->idma_qid[i], + idma->idma_stalled[i] / hz); + idma->idma_stalled[i] = 0; + continue; + } + + /* Synthesize an SGE Ingress DMA Same State Timer in the Hz + * domain. The first time we get here it'll be because we + * passed the 1s Threshold; each additional time it'll be + * because the RX Timer Callback is being fired on its regular + * schedule. + * + * If the stall is below our Potential Hung Ingress Queue + * Warning Threshold, continue. + */ + if (idma->idma_stalled[i] == 0) { + idma->idma_stalled[i] = hz; + idma->idma_warn[i] = 0; + } else { + idma->idma_stalled[i] += ticks; + idma->idma_warn[i] -= ticks; + } + + if (idma->idma_stalled[i] < SGE_IDMA_WARN_THRESH * hz) + continue; + + /* We'll issue a warning every SGE_IDMA_WARN_REPEAT seconds. + */ + if (idma->idma_warn[i] > 0) + continue; + idma->idma_warn[i] = SGE_IDMA_WARN_REPEAT * hz; + + /* Read and save the SGE IDMA State and Queue ID information. + * We do this every time in case it changes across time ... + * can't be too careful ... + */ + t4_write_reg(adapter, SGE_DEBUG_INDEX_A, 0); + debug0 = t4_read_reg(adapter, SGE_DEBUG_DATA_LOW_A); + idma->idma_state[i] = (debug0 >> (i * 9)) & 0x3f; + + t4_write_reg(adapter, SGE_DEBUG_INDEX_A, 11); + debug11 = t4_read_reg(adapter, SGE_DEBUG_DATA_LOW_A); + idma->idma_qid[i] = (debug11 >> (i * 16)) & 0xffff; + + dev_warn(adapter->pdev_dev, "SGE idma%u, queue %u, potentially stuck in " + "state %u for %d seconds (debug0=%#x, debug11=%#x)\n", + i, idma->idma_qid[i], idma->idma_state[i], + idma->idma_stalled[i] / hz, + debug0, debug11); + t4_sge_decode_idma_state(adapter, idma->idma_state[i]); + } +} + +/** + * t4_load_cfg - download config file + * @adap: the adapter + * @cfg_data: the cfg text file to write + * @size: text file size + * + * Write the supplied config text file to the card's serial flash. + */ +int t4_load_cfg(struct adapter *adap, const u8 *cfg_data, unsigned int size) +{ + int ret, i, n, cfg_addr; + unsigned int addr; + unsigned int flash_cfg_start_sec; + unsigned int sf_sec_size = adap->params.sf_size / adap->params.sf_nsec; + + cfg_addr = t4_flash_cfg_addr(adap); + if (cfg_addr < 0) + return cfg_addr; + + addr = cfg_addr; + flash_cfg_start_sec = addr / SF_SEC_SIZE; + + if (size > FLASH_CFG_MAX_SIZE) { + dev_err(adap->pdev_dev, "cfg file too large, max is %u bytes\n", + FLASH_CFG_MAX_SIZE); + return -EFBIG; + } + + i = DIV_ROUND_UP(FLASH_CFG_MAX_SIZE, /* # of sectors spanned */ + sf_sec_size); + ret = t4_flash_erase_sectors(adap, flash_cfg_start_sec, + flash_cfg_start_sec + i - 1); + /* If size == 0 then we're simply erasing the FLASH sectors associated + * with the on-adapter Firmware Configuration File. 
+ */ + if (ret || size == 0) + goto out; + + /* this will write to the flash up to SF_PAGE_SIZE at a time */ + for (i = 0; i < size; i += SF_PAGE_SIZE) { + if ((size - i) < SF_PAGE_SIZE) + n = size - i; + else + n = SF_PAGE_SIZE; + ret = t4_write_flash(adap, addr, n, cfg_data); + if (ret) + goto out; + + addr += SF_PAGE_SIZE; + cfg_data += SF_PAGE_SIZE; + } + +out: + if (ret) + dev_err(adap->pdev_dev, "config file %s failed %d\n", + (size == 0 ? "clear" : "download"), ret); + return ret; +} + +/** + * t4_set_vf_mac - Set MAC address for the specified VF + * @adapter: The adapter + * @vf: one of the VFs instantiated by the specified PF + * @naddr: the number of MAC addresses + * @addr: the MAC address(es) to be set to the specified VF + */ +int t4_set_vf_mac_acl(struct adapter *adapter, unsigned int vf, + unsigned int naddr, u8 *addr) +{ + struct fw_acl_mac_cmd cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_ACL_MAC_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F | + FW_ACL_MAC_CMD_PFN_V(adapter->pf) | + FW_ACL_MAC_CMD_VFN_V(vf)); + + /* Note: Do not enable the ACL */ + cmd.en_to_len16 = cpu_to_be32((unsigned int)FW_LEN16(cmd)); + cmd.nmac = naddr; + + switch (adapter->pf) { + case 3: + memcpy(cmd.macaddr3, addr, sizeof(cmd.macaddr3)); + break; + case 2: + memcpy(cmd.macaddr2, addr, sizeof(cmd.macaddr2)); + break; + case 1: + memcpy(cmd.macaddr1, addr, sizeof(cmd.macaddr1)); + break; + case 0: + memcpy(cmd.macaddr0, addr, sizeof(cmd.macaddr0)); + break; + } + + return t4_wr_mbox(adapter, adapter->mbox, &cmd, sizeof(cmd), &cmd); +} + +/** + * t4_read_pace_tbl - read the pace table + * @adap: the adapter + * @pace_vals: holds the returned values + * + * Returns the values of TP's pace table in microseconds. + */ +void t4_read_pace_tbl(struct adapter *adap, unsigned int pace_vals[NTX_SCHED]) +{ + unsigned int i, v; + + for (i = 0; i < NTX_SCHED; i++) { + t4_write_reg(adap, TP_PACE_TABLE_A, 0xffff0000 + i); + v = t4_read_reg(adap, TP_PACE_TABLE_A); + pace_vals[i] = dack_ticks_to_usec(adap, v); + } +} + +/** + * t4_get_tx_sched - get the configuration of a Tx HW traffic scheduler + * @adap: the adapter + * @sched: the scheduler index + * @kbps: the byte rate in Kbps + * @ipg: the interpacket delay in tenths of nanoseconds + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Return the current configuration of a HW Tx scheduler. + */ +void t4_get_tx_sched(struct adapter *adap, unsigned int sched, + unsigned int *kbps, unsigned int *ipg, bool sleep_ok) +{ + unsigned int v, addr, bpt, cpt; + + if (kbps) { + addr = TP_TX_MOD_Q1_Q0_RATE_LIMIT_A - sched / 2; + t4_tp_tm_pio_read(adap, &v, 1, addr, sleep_ok); + if (sched & 1) + v >>= 16; + bpt = (v >> 8) & 0xff; + cpt = v & 0xff; + if (!cpt) { + *kbps = 0; /* scheduler disabled */ + } else { + v = (adap->params.vpd.cclk * 1000) / cpt; /* ticks/s */ + *kbps = (v * bpt) / 125; + } + } + if (ipg) { + addr = TP_TX_MOD_Q1_Q0_TIMER_SEPARATOR_A - sched / 2; + t4_tp_tm_pio_read(adap, &v, 1, addr, sleep_ok); + if (sched & 1) + v >>= 16; + v &= 0xffff; + *ipg = (10000 * v) / core_ticks_per_usec(adap); + } +} + +/* t4_sge_ctxt_rd - read an SGE context through FW + * @adap: the adapter + * @mbox: mailbox to use for the FW command + * @cid: the context id + * @ctype: the context type + * @data: where to store the context data + * + * Issues a FW command through the given mailbox to read an SGE context. 
+ */ +int t4_sge_ctxt_rd(struct adapter *adap, unsigned int mbox, unsigned int cid, + enum ctxt_type ctype, u32 *data) +{ + struct fw_ldst_cmd c; + int ret; + + if (ctype == CTXT_FLM) + ret = FW_LDST_ADDRSPC_SGE_FLMC; + else + ret = FW_LDST_ADDRSPC_SGE_CONMC; + + memset(&c, 0, sizeof(c)); + c.op_to_addrspace = cpu_to_be32(FW_CMD_OP_V(FW_LDST_CMD) | + FW_CMD_REQUEST_F | FW_CMD_READ_F | + FW_LDST_CMD_ADDRSPACE_V(ret)); + c.cycles_to_len16 = cpu_to_be32(FW_LEN16(c)); + c.u.idctxt.physid = cpu_to_be32(cid); + + ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c); + if (ret == 0) { + data[0] = be32_to_cpu(c.u.idctxt.ctxt_data0); + data[1] = be32_to_cpu(c.u.idctxt.ctxt_data1); + data[2] = be32_to_cpu(c.u.idctxt.ctxt_data2); + data[3] = be32_to_cpu(c.u.idctxt.ctxt_data3); + data[4] = be32_to_cpu(c.u.idctxt.ctxt_data4); + data[5] = be32_to_cpu(c.u.idctxt.ctxt_data5); + } + return ret; +} + +/** + * t4_sge_ctxt_rd_bd - read an SGE context bypassing FW + * @adap: the adapter + * @cid: the context id + * @ctype: the context type + * @data: where to store the context data + * + * Reads an SGE context directly, bypassing FW. This is only for + * debugging when FW is unavailable. + */ +int t4_sge_ctxt_rd_bd(struct adapter *adap, unsigned int cid, + enum ctxt_type ctype, u32 *data) +{ + int i, ret; + + t4_write_reg(adap, SGE_CTXT_CMD_A, CTXTQID_V(cid) | CTXTTYPE_V(ctype)); + ret = t4_wait_op_done(adap, SGE_CTXT_CMD_A, BUSY_F, 0, 3, 1); + if (!ret) + for (i = SGE_CTXT_DATA0_A; i <= SGE_CTXT_DATA5_A; i += 4) + *data++ = t4_read_reg(adap, i); + return ret; +} + +int t4_sched_params(struct adapter *adapter, int type, int level, int mode, + int rateunit, int ratemode, int channel, int class, + int minrate, int maxrate, int weight, int pktsize) +{ + struct fw_sched_cmd cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.op_to_write = cpu_to_be32(FW_CMD_OP_V(FW_SCHED_CMD) | + FW_CMD_REQUEST_F | + FW_CMD_WRITE_F); + cmd.retval_len16 = cpu_to_be32(FW_LEN16(cmd)); + + cmd.u.params.sc = FW_SCHED_SC_PARAMS; + cmd.u.params.type = type; + cmd.u.params.level = level; + cmd.u.params.mode = mode; + cmd.u.params.ch = channel; + cmd.u.params.cl = class; + cmd.u.params.unit = rateunit; + cmd.u.params.rate = ratemode; + cmd.u.params.min = cpu_to_be32(minrate); + cmd.u.params.max = cpu_to_be32(maxrate); + cmd.u.params.weight = cpu_to_be16(weight); + cmd.u.params.pktsize = cpu_to_be16(pktsize); + + return t4_wr_mbox_meat(adapter, adapter->mbox, &cmd, sizeof(cmd), + NULL, 1); +} + +/** + * t4_i2c_rd - read I2C data from adapter + * @adap: the adapter + * @port: Port number if per-port device; <0 if not + * @devid: per-port device ID or absolute device ID + * @offset: byte offset into device I2C space + * @len: byte length of I2C space data + * @buf: buffer in which to return I2C data + * + * Reads the I2C data from the indicated device and location. 
+ */
+int t4_i2c_rd(struct adapter *adap, unsigned int mbox, int port,
+	      unsigned int devid, unsigned int offset,
+	      unsigned int len, u8 *buf)
+{
+	struct fw_ldst_cmd ldst_cmd, ldst_rpl;
+	unsigned int i2c_max = sizeof(ldst_cmd.u.i2c.data);
+	int ret = 0;
+
+	if (len > I2C_PAGE_SIZE)
+		return -EINVAL;
+
+	/* Don't allow reads that span multiple pages */
+	if (offset < I2C_PAGE_SIZE && offset + len > I2C_PAGE_SIZE)
+		return -EINVAL;
+
+	memset(&ldst_cmd, 0, sizeof(ldst_cmd));
+	ldst_cmd.op_to_addrspace =
+		cpu_to_be32(FW_CMD_OP_V(FW_LDST_CMD) |
+			    FW_CMD_REQUEST_F |
+			    FW_CMD_READ_F |
+			    FW_LDST_CMD_ADDRSPACE_V(FW_LDST_ADDRSPC_I2C));
+	ldst_cmd.cycles_to_len16 = cpu_to_be32(FW_LEN16(ldst_cmd));
+	ldst_cmd.u.i2c.pid = (port < 0 ? 0xff : port);
+	ldst_cmd.u.i2c.did = devid;
+
+	while (len > 0) {
+		unsigned int i2c_len = (len < i2c_max) ? len : i2c_max;
+
+		ldst_cmd.u.i2c.boffset = offset;
+		ldst_cmd.u.i2c.blen = i2c_len;
+
+		ret = t4_wr_mbox(adap, mbox, &ldst_cmd, sizeof(ldst_cmd),
+				 &ldst_rpl);
+		if (ret)
+			break;
+
+		memcpy(buf, ldst_rpl.u.i2c.data, i2c_len);
+		offset += i2c_len;
+		buf += i2c_len;
+		len -= i2c_len;
+	}
+
+	return ret;
+}
+
+/**
+ * t4_set_vlan_acl - Set a VLAN id for the specified VF
+ * @adap: the adapter
+ * @mbox: mailbox to use for the FW command
+ * @vf: one of the VFs instantiated by the specified PF
+ * @vlan: The VLAN id to be set
+ */
+int t4_set_vlan_acl(struct adapter *adap, unsigned int mbox, unsigned int vf,
+		    u16 vlan)
+{
+	struct fw_acl_vlan_cmd vlan_cmd;
+	unsigned int enable;
+
+	enable = (vlan ? FW_ACL_VLAN_CMD_EN_F : 0);
+	memset(&vlan_cmd, 0, sizeof(vlan_cmd));
+	vlan_cmd.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_ACL_VLAN_CMD) |
+					 FW_CMD_REQUEST_F |
+					 FW_CMD_WRITE_F |
+					 FW_CMD_EXEC_F |
+					 FW_ACL_VLAN_CMD_PFN_V(adap->pf) |
+					 FW_ACL_VLAN_CMD_VFN_V(vf));
+	vlan_cmd.en_to_len16 = cpu_to_be32(enable | FW_LEN16(vlan_cmd));
+	/* Drop all packets that do not match the VLAN id */
+	vlan_cmd.dropnovlan_fm = FW_ACL_VLAN_CMD_FM_F;
+	if (enable != 0) {
+		vlan_cmd.nvlan = 1;
+		vlan_cmd.vlanid[0] = cpu_to_be16(vlan);
+	}
+
+	return t4_wr_mbox(adap, adap->mbox, &vlan_cmd, sizeof(vlan_cmd), NULL);
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h
new file mode 100644
index 0000000..361d503
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h
@@ -0,0 +1,300 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __T4_HW_H +#define __T4_HW_H + +#include + +enum { + NCHAN = 4, /* # of HW channels */ + MAX_MTU = 9600, /* max MAC MTU, excluding header + FCS */ + EEPROMSIZE = 17408,/* Serial EEPROM physical size */ + EEPROMVSIZE = 32768,/* Serial EEPROM virtual address space size */ + EEPROMPFSIZE = 1024, /* EEPROM writable area size for PFn, n>0 */ + RSS_NENTRIES = 2048, /* # of entries in RSS mapping table */ + T6_RSS_NENTRIES = 4096, /* # of entries in RSS mapping table */ + TCB_SIZE = 128, /* TCB size */ + NMTUS = 16, /* size of MTU table */ + NCCTRL_WIN = 32, /* # of congestion control windows */ + NTX_SCHED = 8, /* # of HW Tx scheduling queues */ + PM_NSTATS = 5, /* # of PM stats */ + T6_PM_NSTATS = 7, /* # of PM stats in T6 */ + MBOX_LEN = 64, /* mailbox size in bytes */ + TRACE_LEN = 112, /* length of trace data and mask */ + FILTER_OPT_LEN = 36, /* filter tuple width for optional components */ +}; + +enum { + CIM_NUM_IBQ = 6, /* # of CIM IBQs */ + CIM_NUM_OBQ = 6, /* # of CIM OBQs */ + CIM_NUM_OBQ_T5 = 8, /* # of CIM OBQs for T5 adapter */ + CIMLA_SIZE = 2048, /* # of 32-bit words in CIM LA */ + CIM_PIFLA_SIZE = 64, /* # of 192-bit words in CIM PIF LA */ + CIM_MALA_SIZE = 64, /* # of 160-bit words in CIM MA LA */ + CIM_IBQ_SIZE = 128, /* # of 128-bit words in a CIM IBQ */ + CIM_OBQ_SIZE = 128, /* # of 128-bit words in a CIM OBQ */ + TPLA_SIZE = 128, /* # of 64-bit words in TP LA */ + ULPRX_LA_SIZE = 512, /* # of 256-bit words in ULP_RX LA */ +}; + +/* SGE context types */ +enum ctxt_type { + CTXT_EGRESS, + CTXT_INGRESS, + CTXT_FLM, + CTXT_CNM, +}; + +enum { + SF_PAGE_SIZE = 256, /* serial flash page size */ + SF_SEC_SIZE = 64 * 1024, /* serial flash sector size */ +}; + +enum { RSP_TYPE_FLBUF, RSP_TYPE_CPL, RSP_TYPE_INTR }; /* response entry types */ + +enum { MBOX_OWNER_NONE, MBOX_OWNER_FW, MBOX_OWNER_DRV }; /* mailbox owners */ + +enum { + SGE_MAX_WR_LEN = 512, /* max WR size in bytes */ + SGE_CTXT_SIZE = 24, /* size of SGE context */ + SGE_NTIMERS = 6, /* # of interrupt holdoff timer values */ + SGE_NCOUNTERS = 4, /* # of interrupt packet counter values */ + SGE_MAX_IQ_SIZE = 65520, + + SGE_TIMER_RSTRT_CNTR = 6, /* restart RX packet threshold counter */ + SGE_TIMER_UPD_CIDX = 7, /* update cidx only */ + + SGE_EQ_IDXSIZE = 64, /* egress queue pidx/cidx unit size */ + + SGE_INTRDST_PCI = 0, /* interrupt destination is PCI-E */ + SGE_INTRDST_IQ = 1, /* destination is an ingress queue */ + + SGE_UPDATEDEL_NONE = 0, /* ingress queue pidx update delivery */ + SGE_UPDATEDEL_INTR = 1, /* interrupt */ + SGE_UPDATEDEL_STPG = 2, /* status page */ + SGE_UPDATEDEL_BOTH = 3, /* interrupt and status page */ + + SGE_HOSTFCMODE_NONE = 0, /* egress queue cidx updates */ + SGE_HOSTFCMODE_IQ = 1, /* sent to ingress queue */ + SGE_HOSTFCMODE_STPG = 2, /* sent to status page */ + SGE_HOSTFCMODE_BOTH = 3, /* ingress queue and status page */ + + SGE_FETCHBURSTMIN_16B = 0,/* egress queue descriptor fetch minimum */ + SGE_FETCHBURSTMIN_32B = 1, + SGE_FETCHBURSTMIN_64B = 2, + SGE_FETCHBURSTMIN_128B = 3, + + 
SGE_FETCHBURSTMAX_64B = 0,/* egress queue descriptor fetch maximum */ + SGE_FETCHBURSTMAX_128B = 1, + SGE_FETCHBURSTMAX_256B = 2, + SGE_FETCHBURSTMAX_512B = 3, + + SGE_CIDXFLUSHTHRESH_1 = 0,/* egress queue cidx flush threshold */ + SGE_CIDXFLUSHTHRESH_2 = 1, + SGE_CIDXFLUSHTHRESH_4 = 2, + SGE_CIDXFLUSHTHRESH_8 = 3, + SGE_CIDXFLUSHTHRESH_16 = 4, + SGE_CIDXFLUSHTHRESH_32 = 5, + SGE_CIDXFLUSHTHRESH_64 = 6, + SGE_CIDXFLUSHTHRESH_128 = 7, + + SGE_INGPADBOUNDARY_SHIFT = 5,/* ingress queue pad boundary */ +}; + +/* PCI-e memory window access */ +enum pcie_memwin { + MEMWIN_NIC = 0, + MEMWIN_RSVD1 = 1, + MEMWIN_RSVD2 = 2, + MEMWIN_RDMA = 3, + MEMWIN_RSVD4 = 4, + MEMWIN_FOISCSI = 5, + MEMWIN_CSIOSTOR = 6, + MEMWIN_RSVD7 = 7, +}; + +struct sge_qstat { /* data written to SGE queue status entries */ + __be32 qid; + __be16 cidx; + __be16 pidx; +}; + +/* + * Structure for last 128 bits of response descriptors + */ +struct rsp_ctrl { + __be32 hdrbuflen_pidx; + __be32 pldbuflen_qid; + union { + u8 type_gen; + __be64 last_flit; + }; +}; + +#define RSPD_NEWBUF_S 31 +#define RSPD_NEWBUF_V(x) ((x) << RSPD_NEWBUF_S) +#define RSPD_NEWBUF_F RSPD_NEWBUF_V(1U) + +#define RSPD_LEN_S 0 +#define RSPD_LEN_M 0x7fffffff +#define RSPD_LEN_G(x) (((x) >> RSPD_LEN_S) & RSPD_LEN_M) + +#define RSPD_QID_S RSPD_LEN_S +#define RSPD_QID_M RSPD_LEN_M +#define RSPD_QID_G(x) RSPD_LEN_G(x) + +#define RSPD_GEN_S 7 + +#define RSPD_TYPE_S 4 +#define RSPD_TYPE_M 0x3 +#define RSPD_TYPE_G(x) (((x) >> RSPD_TYPE_S) & RSPD_TYPE_M) + +/* Rx queue interrupt deferral fields: counter enable and timer index */ +#define QINTR_CNT_EN_S 0 +#define QINTR_CNT_EN_V(x) ((x) << QINTR_CNT_EN_S) +#define QINTR_CNT_EN_F QINTR_CNT_EN_V(1U) + +#define QINTR_TIMER_IDX_S 1 +#define QINTR_TIMER_IDX_M 0x7 +#define QINTR_TIMER_IDX_V(x) ((x) << QINTR_TIMER_IDX_S) +#define QINTR_TIMER_IDX_G(x) (((x) >> QINTR_TIMER_IDX_S) & QINTR_TIMER_IDX_M) + +/* + * Flash layout. + */ +#define FLASH_START(start) ((start) * SF_SEC_SIZE) +#define FLASH_MAX_SIZE(nsecs) ((nsecs) * SF_SEC_SIZE) + +enum { + /* + * Various Expansion-ROM boot images, etc. + */ + FLASH_EXP_ROM_START_SEC = 0, + FLASH_EXP_ROM_NSECS = 6, + FLASH_EXP_ROM_START = FLASH_START(FLASH_EXP_ROM_START_SEC), + FLASH_EXP_ROM_MAX_SIZE = FLASH_MAX_SIZE(FLASH_EXP_ROM_NSECS), + + /* + * iSCSI Boot Firmware Table (iBFT) and other driver-related + * parameters ... + */ + FLASH_IBFT_START_SEC = 6, + FLASH_IBFT_NSECS = 1, + FLASH_IBFT_START = FLASH_START(FLASH_IBFT_START_SEC), + FLASH_IBFT_MAX_SIZE = FLASH_MAX_SIZE(FLASH_IBFT_NSECS), + + /* + * Boot configuration data. + */ + FLASH_BOOTCFG_START_SEC = 7, + FLASH_BOOTCFG_NSECS = 1, + FLASH_BOOTCFG_START = FLASH_START(FLASH_BOOTCFG_START_SEC), + FLASH_BOOTCFG_MAX_SIZE = FLASH_MAX_SIZE(FLASH_BOOTCFG_NSECS), + + /* + * Location of firmware image in FLASH. + */ + FLASH_FW_START_SEC = 8, + FLASH_FW_NSECS = 16, + FLASH_FW_START = FLASH_START(FLASH_FW_START_SEC), + FLASH_FW_MAX_SIZE = FLASH_MAX_SIZE(FLASH_FW_NSECS), + + /* Location of bootstrap firmware image in FLASH. + */ + FLASH_FWBOOTSTRAP_START_SEC = 27, + FLASH_FWBOOTSTRAP_NSECS = 1, + FLASH_FWBOOTSTRAP_START = FLASH_START(FLASH_FWBOOTSTRAP_START_SEC), + FLASH_FWBOOTSTRAP_MAX_SIZE = FLASH_MAX_SIZE(FLASH_FWBOOTSTRAP_NSECS), + + /* + * iSCSI persistent/crash information. + */ + FLASH_ISCSI_CRASH_START_SEC = 29, + FLASH_ISCSI_CRASH_NSECS = 1, + FLASH_ISCSI_CRASH_START = FLASH_START(FLASH_ISCSI_CRASH_START_SEC), + FLASH_ISCSI_CRASH_MAX_SIZE = FLASH_MAX_SIZE(FLASH_ISCSI_CRASH_NSECS), + + /* + * FCoE persistent/crash information. 
+ */ + FLASH_FCOE_CRASH_START_SEC = 30, + FLASH_FCOE_CRASH_NSECS = 1, + FLASH_FCOE_CRASH_START = FLASH_START(FLASH_FCOE_CRASH_START_SEC), + FLASH_FCOE_CRASH_MAX_SIZE = FLASH_MAX_SIZE(FLASH_FCOE_CRASH_NSECS), + + /* + * Location of Firmware Configuration File in FLASH. Since the FPGA + * "FLASH" is smaller we need to store the Configuration File in a + * different location -- which will overlap the end of the firmware + * image if firmware ever gets that large ... + */ + FLASH_CFG_START_SEC = 31, + FLASH_CFG_NSECS = 1, + FLASH_CFG_START = FLASH_START(FLASH_CFG_START_SEC), + FLASH_CFG_MAX_SIZE = FLASH_MAX_SIZE(FLASH_CFG_NSECS), + + /* We don't support FLASH devices which can't support the full + * standard set of sections which we need for normal + * operations. + */ + FLASH_MIN_SIZE = FLASH_CFG_START + FLASH_CFG_MAX_SIZE, + + FLASH_FPGA_CFG_START_SEC = 15, + FLASH_FPGA_CFG_START = FLASH_START(FLASH_FPGA_CFG_START_SEC), + + /* + * Sectors 32-63 are reserved for FLASH failover. + */ +}; + +#undef FLASH_START +#undef FLASH_MAX_SIZE + +#define SGE_TIMESTAMP_S 0 +#define SGE_TIMESTAMP_M 0xfffffffffffffffULL +#define SGE_TIMESTAMP_V(x) ((__u64)(x) << SGE_TIMESTAMP_S) +#define SGE_TIMESTAMP_G(x) (((__u64)(x) >> SGE_TIMESTAMP_S) & SGE_TIMESTAMP_M) + +#define I2C_DEV_ADDR_A0 0xa0 +#define I2C_DEV_ADDR_A2 0xa2 +#define I2C_PAGE_SIZE 0x100 +#define SFP_DIAG_TYPE_ADDR 0x5c +#define SFP_DIAG_TYPE_LEN 0x1 +#define SFF_8472_COMP_ADDR 0x5e +#define SFF_8472_COMP_LEN 0x1 +#define SFF_REV_ADDR 0x1 +#define SFF_REV_LEN 0x1 + +#endif /* __T4_HW_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h new file mode 100644 index 0000000..fe2029e --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h @@ -0,0 +1,2304 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __T4_MSG_H +#define __T4_MSG_H + +#include + +enum { + CPL_PASS_OPEN_REQ = 0x1, + CPL_PASS_ACCEPT_RPL = 0x2, + CPL_ACT_OPEN_REQ = 0x3, + CPL_SET_TCB_FIELD = 0x5, + CPL_GET_TCB = 0x6, + CPL_CLOSE_CON_REQ = 0x8, + CPL_CLOSE_LISTSRV_REQ = 0x9, + CPL_ABORT_REQ = 0xA, + CPL_ABORT_RPL = 0xB, + CPL_RX_DATA_ACK = 0xD, + CPL_TX_PKT = 0xE, + CPL_L2T_WRITE_REQ = 0x12, + CPL_SMT_WRITE_REQ = 0x14, + CPL_TID_RELEASE = 0x1A, + CPL_SRQ_TABLE_REQ = 0x1C, + CPL_TX_DATA_ISO = 0x1F, + + CPL_CLOSE_LISTSRV_RPL = 0x20, + CPL_L2T_WRITE_RPL = 0x23, + CPL_PASS_OPEN_RPL = 0x24, + CPL_ACT_OPEN_RPL = 0x25, + CPL_PEER_CLOSE = 0x26, + CPL_ABORT_REQ_RSS = 0x2B, + CPL_ABORT_RPL_RSS = 0x2D, + CPL_SMT_WRITE_RPL = 0x2E, + + CPL_RX_PHYS_ADDR = 0x30, + CPL_CLOSE_CON_RPL = 0x32, + CPL_ISCSI_HDR = 0x33, + CPL_RDMA_CQE = 0x35, + CPL_RDMA_CQE_READ_RSP = 0x36, + CPL_RDMA_CQE_ERR = 0x37, + CPL_RX_DATA = 0x39, + CPL_SET_TCB_RPL = 0x3A, + CPL_RX_PKT = 0x3B, + CPL_RX_DDP_COMPLETE = 0x3F, + + CPL_ACT_ESTABLISH = 0x40, + CPL_PASS_ESTABLISH = 0x41, + CPL_RX_DATA_DDP = 0x42, + CPL_PASS_ACCEPT_REQ = 0x44, + CPL_RX_ISCSI_CMP = 0x45, + CPL_TRACE_PKT_T5 = 0x48, + CPL_RX_ISCSI_DDP = 0x49, + CPL_RX_TLS_CMP = 0x4E, + + CPL_RDMA_READ_REQ = 0x60, + + CPL_PASS_OPEN_REQ6 = 0x81, + CPL_ACT_OPEN_REQ6 = 0x83, + + CPL_TX_TLS_PDU = 0x88, + CPL_TX_TLS_SFO = 0x89, + CPL_TX_SEC_PDU = 0x8A, + CPL_TX_TLS_ACK = 0x8B, + + CPL_RDMA_TERMINATE = 0xA2, + CPL_RDMA_WRITE = 0xA4, + CPL_SGE_EGR_UPDATE = 0xA5, + CPL_RX_MPS_PKT = 0xAF, + + CPL_TRACE_PKT = 0xB0, + CPL_TLS_DATA = 0xB1, + CPL_ISCSI_DATA = 0xB2, + + CPL_FW4_MSG = 0xC0, + CPL_FW4_PLD = 0xC1, + CPL_FW4_ACK = 0xC3, + CPL_SRQ_TABLE_RPL = 0xCC, + + CPL_RX_PHYS_DSGL = 0xD0, + + CPL_FW6_MSG = 0xE0, + CPL_FW6_PLD = 0xE1, + CPL_TX_TNL_LSO = 0xEC, + CPL_TX_PKT_LSO = 0xED, + CPL_TX_PKT_XT = 0xEE, + + NUM_CPL_CMDS +}; + +enum CPL_error { + CPL_ERR_NONE = 0, + CPL_ERR_TCAM_PARITY = 1, + CPL_ERR_TCAM_MISS = 2, + CPL_ERR_TCAM_FULL = 3, + CPL_ERR_BAD_LENGTH = 15, + CPL_ERR_BAD_ROUTE = 18, + CPL_ERR_CONN_RESET = 20, + CPL_ERR_CONN_EXIST_SYNRECV = 21, + CPL_ERR_CONN_EXIST = 22, + CPL_ERR_ARP_MISS = 23, + CPL_ERR_BAD_SYN = 24, + CPL_ERR_CONN_TIMEDOUT = 30, + CPL_ERR_XMIT_TIMEDOUT = 31, + CPL_ERR_PERSIST_TIMEDOUT = 32, + CPL_ERR_FINWAIT2_TIMEDOUT = 33, + CPL_ERR_KEEPALIVE_TIMEDOUT = 34, + CPL_ERR_RTX_NEG_ADVICE = 35, + CPL_ERR_PERSIST_NEG_ADVICE = 36, + CPL_ERR_KEEPALV_NEG_ADVICE = 37, + CPL_ERR_ABORT_FAILED = 42, + CPL_ERR_IWARP_FLM = 50, + CPL_CONTAINS_READ_RPL = 60, + CPL_CONTAINS_WRITE_RPL = 61, +}; + +enum { + CPL_CONN_POLICY_AUTO = 0, + CPL_CONN_POLICY_ASK = 1, + CPL_CONN_POLICY_FILTER = 2, + CPL_CONN_POLICY_DENY = 3 +}; + +enum { + ULP_MODE_NONE = 0, + ULP_MODE_ISCSI = 2, + ULP_MODE_RDMA = 4, + ULP_MODE_TCPDDP = 5, + ULP_MODE_FCOE = 6, + ULP_MODE_TLS = 8, +}; + +enum { + ULP_CRC_HEADER = 1 << 0, + ULP_CRC_DATA = 1 << 1 +}; + +enum { + CPL_ABORT_SEND_RST = 0, + CPL_ABORT_NO_RST, +}; + +enum { /* TX_PKT_XT checksum types */ + TX_CSUM_TCP = 0, + TX_CSUM_UDP = 1, + TX_CSUM_CRC16 = 4, + TX_CSUM_CRC32 = 5, + TX_CSUM_CRC32C = 6, + TX_CSUM_FCOE = 7, + TX_CSUM_TCPIP = 8, + TX_CSUM_UDPIP = 9, + TX_CSUM_TCPIP6 = 10, + TX_CSUM_UDPIP6 = 11, + TX_CSUM_IP = 12, +}; + +union opcode_tid { + __be32 opcode_tid; + u8 opcode; +}; + +#define CPL_OPCODE_S 24 +#define CPL_OPCODE_V(x) ((x) << CPL_OPCODE_S) +#define CPL_OPCODE_G(x) (((x) >> CPL_OPCODE_S) & 0xFF) +#define TID_G(x) ((x) & 0xFFFFFF) + +/* tid is assumed to be 24-bits */ +#define MK_OPCODE_TID(opcode, tid) (CPL_OPCODE_V(opcode) | (tid)) + +#define OPCODE_TID(cmd) 
((cmd)->ot.opcode_tid) + +/* extract the TID from a CPL command */ +#define GET_TID(cmd) (TID_G(be32_to_cpu(OPCODE_TID(cmd)))) + +/* partitioning of TID fields that also carry a queue id */ +#define TID_TID_S 0 +#define TID_TID_M 0x3fff +#define TID_TID_V(x) ((x) << TID_TID_S) +#define TID_TID_G(x) (((x) >> TID_TID_S) & TID_TID_M) + +#define TID_QID_S 14 +#define TID_QID_M 0x3ff +#define TID_QID_V(x) ((x) << TID_QID_S) +#define TID_QID_G(x) (((x) >> TID_QID_S) & TID_QID_M) + +struct rss_header { + u8 opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 channel:2; + u8 filter_hit:1; + u8 filter_tid:1; + u8 hash_type:2; + u8 ipv6:1; + u8 send2fw:1; +#else + u8 send2fw:1; + u8 ipv6:1; + u8 hash_type:2; + u8 filter_tid:1; + u8 filter_hit:1; + u8 channel:2; +#endif + __be16 qid; + __be32 hash_val; +}; + +struct work_request_hdr { + __be32 wr_hi; + __be32 wr_mid; + __be64 wr_lo; +}; + +/* wr_hi fields */ +#define WR_OP_S 24 +#define WR_OP_V(x) ((__u64)(x) << WR_OP_S) + +#define WR_HDR struct work_request_hdr wr + +/* option 0 fields */ +#define TX_CHAN_S 2 +#define TX_CHAN_V(x) ((x) << TX_CHAN_S) + +#define ULP_MODE_S 8 +#define ULP_MODE_V(x) ((x) << ULP_MODE_S) + +#define RCV_BUFSIZ_S 12 +#define RCV_BUFSIZ_M 0x3FFU +#define RCV_BUFSIZ_V(x) ((x) << RCV_BUFSIZ_S) + +#define SMAC_SEL_S 28 +#define SMAC_SEL_V(x) ((__u64)(x) << SMAC_SEL_S) + +#define L2T_IDX_S 36 +#define L2T_IDX_V(x) ((__u64)(x) << L2T_IDX_S) + +#define WND_SCALE_S 50 +#define WND_SCALE_V(x) ((__u64)(x) << WND_SCALE_S) + +#define KEEP_ALIVE_S 54 +#define KEEP_ALIVE_V(x) ((__u64)(x) << KEEP_ALIVE_S) +#define KEEP_ALIVE_F KEEP_ALIVE_V(1ULL) + +#define MSS_IDX_S 60 +#define MSS_IDX_M 0xF +#define MSS_IDX_V(x) ((__u64)(x) << MSS_IDX_S) +#define MSS_IDX_G(x) (((x) >> MSS_IDX_S) & MSS_IDX_M) + +/* option 2 fields */ +#define RSS_QUEUE_S 0 +#define RSS_QUEUE_M 0x3FF +#define RSS_QUEUE_V(x) ((x) << RSS_QUEUE_S) +#define RSS_QUEUE_G(x) (((x) >> RSS_QUEUE_S) & RSS_QUEUE_M) + +#define RSS_QUEUE_VALID_S 10 +#define RSS_QUEUE_VALID_V(x) ((x) << RSS_QUEUE_VALID_S) +#define RSS_QUEUE_VALID_F RSS_QUEUE_VALID_V(1U) + +#define RX_FC_DISABLE_S 20 +#define RX_FC_DISABLE_V(x) ((x) << RX_FC_DISABLE_S) +#define RX_FC_DISABLE_F RX_FC_DISABLE_V(1U) + +#define RX_FC_VALID_S 22 +#define RX_FC_VALID_V(x) ((x) << RX_FC_VALID_S) +#define RX_FC_VALID_F RX_FC_VALID_V(1U) + +#define RX_CHANNEL_S 26 +#define RX_CHANNEL_V(x) ((x) << RX_CHANNEL_S) +#define RX_CHANNEL_F RX_CHANNEL_V(1U) + +#define WND_SCALE_EN_S 28 +#define WND_SCALE_EN_V(x) ((x) << WND_SCALE_EN_S) +#define WND_SCALE_EN_F WND_SCALE_EN_V(1U) + +#define T5_OPT_2_VALID_S 31 +#define T5_OPT_2_VALID_V(x) ((x) << T5_OPT_2_VALID_S) +#define T5_OPT_2_VALID_F T5_OPT_2_VALID_V(1U) + +struct cpl_pass_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be64 opt0; + __be64 opt1; +}; + +/* option 0 fields */ +#define NO_CONG_S 4 +#define NO_CONG_V(x) ((x) << NO_CONG_S) +#define NO_CONG_F NO_CONG_V(1U) + +#define DELACK_S 5 +#define DELACK_V(x) ((x) << DELACK_S) +#define DELACK_F DELACK_V(1U) + +#define NON_OFFLOAD_S 7 +#define NON_OFFLOAD_V(x) ((x) << NON_OFFLOAD_S) +#define NON_OFFLOAD_F NON_OFFLOAD_V(1U) + +#define DSCP_S 22 +#define DSCP_M 0x3F +#define DSCP_V(x) ((x) << DSCP_S) +#define DSCP_G(x) (((x) >> DSCP_S) & DSCP_M) + +#define TCAM_BYPASS_S 48 +#define TCAM_BYPASS_V(x) ((__u64)(x) << TCAM_BYPASS_S) +#define TCAM_BYPASS_F TCAM_BYPASS_V(1ULL) + +#define NAGLE_S 49 +#define NAGLE_V(x) ((__u64)(x) << NAGLE_S) +#define NAGLE_F 
NAGLE_V(1ULL) + +/* option 1 fields */ +#define SYN_RSS_ENABLE_S 0 +#define SYN_RSS_ENABLE_V(x) ((x) << SYN_RSS_ENABLE_S) +#define SYN_RSS_ENABLE_F SYN_RSS_ENABLE_V(1U) + +#define SYN_RSS_QUEUE_S 2 +#define SYN_RSS_QUEUE_V(x) ((x) << SYN_RSS_QUEUE_S) + +#define CONN_POLICY_S 22 +#define CONN_POLICY_V(x) ((x) << CONN_POLICY_S) + +struct cpl_pass_open_req6 { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be64 local_ip_hi; + __be64 local_ip_lo; + __be64 peer_ip_hi; + __be64 peer_ip_lo; + __be64 opt0; + __be64 opt1; +}; + +struct cpl_pass_open_rpl { + union opcode_tid ot; + u8 rsvd[3]; + u8 status; +}; + +struct tcp_options { + __be16 mss; + __u8 wsf; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8:4; + __u8 unknown:1; + __u8:1; + __u8 sack:1; + __u8 tstamp:1; +#else + __u8 tstamp:1; + __u8 sack:1; + __u8:1; + __u8 unknown:1; + __u8:4; +#endif +}; + +struct cpl_pass_accept_req { + union opcode_tid ot; + __be16 rsvd; + __be16 len; + __be32 hdr_len; + __be16 vlan; + __be16 l2info; + __be32 tos_stid; + struct tcp_options tcpopt; +}; + +/* cpl_pass_accept_req.hdr_len fields */ +#define SYN_RX_CHAN_S 0 +#define SYN_RX_CHAN_M 0xF +#define SYN_RX_CHAN_V(x) ((x) << SYN_RX_CHAN_S) +#define SYN_RX_CHAN_G(x) (((x) >> SYN_RX_CHAN_S) & SYN_RX_CHAN_M) + +#define TCP_HDR_LEN_S 10 +#define TCP_HDR_LEN_M 0x3F +#define TCP_HDR_LEN_V(x) ((x) << TCP_HDR_LEN_S) +#define TCP_HDR_LEN_G(x) (((x) >> TCP_HDR_LEN_S) & TCP_HDR_LEN_M) + +#define IP_HDR_LEN_S 16 +#define IP_HDR_LEN_M 0x3FF +#define IP_HDR_LEN_V(x) ((x) << IP_HDR_LEN_S) +#define IP_HDR_LEN_G(x) (((x) >> IP_HDR_LEN_S) & IP_HDR_LEN_M) + +#define ETH_HDR_LEN_S 26 +#define ETH_HDR_LEN_M 0x1F +#define ETH_HDR_LEN_V(x) ((x) << ETH_HDR_LEN_S) +#define ETH_HDR_LEN_G(x) (((x) >> ETH_HDR_LEN_S) & ETH_HDR_LEN_M) + +/* cpl_pass_accept_req.l2info fields */ +#define SYN_MAC_IDX_S 0 +#define SYN_MAC_IDX_M 0x1FF +#define SYN_MAC_IDX_V(x) ((x) << SYN_MAC_IDX_S) +#define SYN_MAC_IDX_G(x) (((x) >> SYN_MAC_IDX_S) & SYN_MAC_IDX_M) + +#define SYN_XACT_MATCH_S 9 +#define SYN_XACT_MATCH_V(x) ((x) << SYN_XACT_MATCH_S) +#define SYN_XACT_MATCH_F SYN_XACT_MATCH_V(1U) + +#define SYN_INTF_S 12 +#define SYN_INTF_M 0xF +#define SYN_INTF_V(x) ((x) << SYN_INTF_S) +#define SYN_INTF_G(x) (((x) >> SYN_INTF_S) & SYN_INTF_M) + +enum { /* TCP congestion control algorithms */ + CONG_ALG_RENO, + CONG_ALG_TAHOE, + CONG_ALG_NEWRENO, + CONG_ALG_HIGHSPEED +}; + +#define CONG_CNTRL_S 14 +#define CONG_CNTRL_M 0x3 +#define CONG_CNTRL_V(x) ((x) << CONG_CNTRL_S) +#define CONG_CNTRL_G(x) (((x) >> CONG_CNTRL_S) & CONG_CNTRL_M) + +#define T5_ISS_S 18 +#define T5_ISS_V(x) ((x) << T5_ISS_S) +#define T5_ISS_F T5_ISS_V(1U) + +struct cpl_pass_accept_rpl { + WR_HDR; + union opcode_tid ot; + __be32 opt2; + __be64 opt0; +}; + +/* option 2 fields */ +#define RX_COALESCE_VALID_S 11 +#define RX_COALESCE_VALID_V(x) ((x) << RX_COALESCE_VALID_S) +#define RX_COALESCE_VALID_F RX_COALESCE_VALID_V(1U) + +#define RX_COALESCE_S 12 +#define RX_COALESCE_V(x) ((x) << RX_COALESCE_S) + +#define PACE_S 16 +#define PACE_V(x) ((x) << PACE_S) + +#define TX_QUEUE_S 23 +#define TX_QUEUE_M 0x7 +#define TX_QUEUE_V(x) ((x) << TX_QUEUE_S) +#define TX_QUEUE_G(x) (((x) >> TX_QUEUE_S) & TX_QUEUE_M) + +#define CCTRL_ECN_S 27 +#define CCTRL_ECN_V(x) ((x) << CCTRL_ECN_S) +#define CCTRL_ECN_F CCTRL_ECN_V(1U) + +#define TSTAMPS_EN_S 29 +#define TSTAMPS_EN_V(x) ((x) << TSTAMPS_EN_S) +#define TSTAMPS_EN_F TSTAMPS_EN_V(1U) + +#define SACK_EN_S 30 +#define SACK_EN_V(x) ((x) << SACK_EN_S) +#define SACK_EN_F SACK_EN_V(1U) + 
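The _S/_M/_V/_F/_G macro families in this header follow one convention throughout: _S is a field's bit offset, _M its mask, _V(x) shifts a value into position, _F is the single-bit form of _V, and _G(x) extracts the field again. As a minimal illustrative sketch (not part of the patch itself; the helper names are hypothetical, and the flag combination is only an example), this is how a caller might decode the packed hdr_len word of a CPL_PASS_ACCEPT_REQ and assemble the opt2 word of the matching CPL_PASS_ACCEPT_RPL:

/* Hypothetical helpers, shown only to illustrate the accessor convention. */
static inline unsigned int example_pass_accept_hdr_len(const struct cpl_pass_accept_req *req)
{
	u32 hdr_len = be32_to_cpu(req->hdr_len);

	/* each _G() undoes the corresponding _S shift and _M mask */
	return ETH_HDR_LEN_G(hdr_len) + IP_HDR_LEN_G(hdr_len) +
	       TCP_HDR_LEN_G(hdr_len);
}

static inline __be32 example_pass_accept_opt2(unsigned int rss_qid)
{
	/* multi-bit fields use _V(), single-bit flags use the _F form */
	return cpu_to_be32(RSS_QUEUE_VALID_F | RSS_QUEUE_V(rss_qid) |
			   TSTAMPS_EN_F | SACK_EN_F | WND_SCALE_EN_F);
}

The same shift-and-mask pattern applies to every *_S/*_V/*_F/*_G group defined in the rest of this header.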
+struct cpl_t5_pass_accept_rpl { + WR_HDR; + union opcode_tid ot; + __be32 opt2; + __be64 opt0; + __be32 iss; + __be32 rsvd; +}; + +struct cpl_act_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be64 opt0; + __be32 params; + __be32 opt2; +}; + +#define FILTER_TUPLE_S 24 +#define FILTER_TUPLE_M 0xFFFFFFFFFF +#define FILTER_TUPLE_V(x) ((x) << FILTER_TUPLE_S) +#define FILTER_TUPLE_G(x) (((x) >> FILTER_TUPLE_S) & FILTER_TUPLE_M) +struct cpl_t5_act_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be64 opt0; + __be32 rsvd; + __be32 opt2; + __be64 params; +}; + +struct cpl_t6_act_open_req { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be32 local_ip; + __be32 peer_ip; + __be64 opt0; + __be32 rsvd; + __be32 opt2; + __be64 params; + __be32 rsvd2; + __be32 opt3; +}; + +struct cpl_act_open_req6 { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be64 local_ip_hi; + __be64 local_ip_lo; + __be64 peer_ip_hi; + __be64 peer_ip_lo; + __be64 opt0; + __be32 params; + __be32 opt2; +}; + +struct cpl_t5_act_open_req6 { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be64 local_ip_hi; + __be64 local_ip_lo; + __be64 peer_ip_hi; + __be64 peer_ip_lo; + __be64 opt0; + __be32 rsvd; + __be32 opt2; + __be64 params; +}; + +struct cpl_t6_act_open_req6 { + WR_HDR; + union opcode_tid ot; + __be16 local_port; + __be16 peer_port; + __be64 local_ip_hi; + __be64 local_ip_lo; + __be64 peer_ip_hi; + __be64 peer_ip_lo; + __be64 opt0; + __be32 rsvd; + __be32 opt2; + __be64 params; + __be32 rsvd2; + __be32 opt3; +}; + +struct cpl_act_open_rpl { + union opcode_tid ot; + __be32 atid_status; +}; + +/* cpl_act_open_rpl.atid_status fields */ +#define AOPEN_STATUS_S 0 +#define AOPEN_STATUS_M 0xFF +#define AOPEN_STATUS_G(x) (((x) >> AOPEN_STATUS_S) & AOPEN_STATUS_M) + +#define AOPEN_ATID_S 8 +#define AOPEN_ATID_M 0xFFFFFF +#define AOPEN_ATID_G(x) (((x) >> AOPEN_ATID_S) & AOPEN_ATID_M) + +struct cpl_pass_establish { + union opcode_tid ot; + __be32 rsvd; + __be32 tos_stid; + __be16 mac_idx; + __be16 tcp_opt; + __be32 snd_isn; + __be32 rcv_isn; +}; + +/* cpl_pass_establish.tos_stid fields */ +#define PASS_OPEN_TID_S 0 +#define PASS_OPEN_TID_M 0xFFFFFF +#define PASS_OPEN_TID_V(x) ((x) << PASS_OPEN_TID_S) +#define PASS_OPEN_TID_G(x) (((x) >> PASS_OPEN_TID_S) & PASS_OPEN_TID_M) + +#define PASS_OPEN_TOS_S 24 +#define PASS_OPEN_TOS_M 0xFF +#define PASS_OPEN_TOS_V(x) ((x) << PASS_OPEN_TOS_S) +#define PASS_OPEN_TOS_G(x) (((x) >> PASS_OPEN_TOS_S) & PASS_OPEN_TOS_M) + +/* cpl_pass_establish.tcp_opt fields (also applies to act_open_establish) */ +#define TCPOPT_WSCALE_OK_S 5 +#define TCPOPT_WSCALE_OK_M 0x1 +#define TCPOPT_WSCALE_OK_G(x) \ + (((x) >> TCPOPT_WSCALE_OK_S) & TCPOPT_WSCALE_OK_M) + +#define TCPOPT_SACK_S 6 +#define TCPOPT_SACK_M 0x1 +#define TCPOPT_SACK_G(x) (((x) >> TCPOPT_SACK_S) & TCPOPT_SACK_M) + +#define TCPOPT_TSTAMP_S 7 +#define TCPOPT_TSTAMP_M 0x1 +#define TCPOPT_TSTAMP_G(x) (((x) >> TCPOPT_TSTAMP_S) & TCPOPT_TSTAMP_M) + +#define TCPOPT_SND_WSCALE_S 8 +#define TCPOPT_SND_WSCALE_M 0xF +#define TCPOPT_SND_WSCALE_G(x) \ + (((x) >> TCPOPT_SND_WSCALE_S) & TCPOPT_SND_WSCALE_M) + +#define TCPOPT_MSS_S 12 +#define TCPOPT_MSS_M 0xF +#define TCPOPT_MSS_G(x) (((x) >> TCPOPT_MSS_S) & TCPOPT_MSS_M) + +#define T6_TCP_HDR_LEN_S 8 +#define T6_TCP_HDR_LEN_V(x) ((x) << T6_TCP_HDR_LEN_S) +#define 
T6_TCP_HDR_LEN_G(x) (((x) >> T6_TCP_HDR_LEN_S) & TCP_HDR_LEN_M) + +#define T6_IP_HDR_LEN_S 14 +#define T6_IP_HDR_LEN_V(x) ((x) << T6_IP_HDR_LEN_S) +#define T6_IP_HDR_LEN_G(x) (((x) >> T6_IP_HDR_LEN_S) & IP_HDR_LEN_M) + +#define T6_ETH_HDR_LEN_S 24 +#define T6_ETH_HDR_LEN_M 0xFF +#define T6_ETH_HDR_LEN_V(x) ((x) << T6_ETH_HDR_LEN_S) +#define T6_ETH_HDR_LEN_G(x) (((x) >> T6_ETH_HDR_LEN_S) & T6_ETH_HDR_LEN_M) + +struct cpl_act_establish { + union opcode_tid ot; + __be32 rsvd; + __be32 tos_atid; + __be16 mac_idx; + __be16 tcp_opt; + __be32 snd_isn; + __be32 rcv_isn; +}; + +struct cpl_get_tcb { + WR_HDR; + union opcode_tid ot; + __be16 reply_ctrl; + __be16 cookie; +}; + +/* cpl_get_tcb.reply_ctrl fields */ +#define QUEUENO_S 0 +#define QUEUENO_V(x) ((x) << QUEUENO_S) + +#define REPLY_CHAN_S 14 +#define REPLY_CHAN_V(x) ((x) << REPLY_CHAN_S) +#define REPLY_CHAN_F REPLY_CHAN_V(1U) + +#define NO_REPLY_S 15 +#define NO_REPLY_V(x) ((x) << NO_REPLY_S) +#define NO_REPLY_F NO_REPLY_V(1U) + +struct cpl_set_tcb_field { + WR_HDR; + union opcode_tid ot; + __be16 reply_ctrl; + __be16 word_cookie; + __be64 mask; + __be64 val; +}; + +/* cpl_set_tcb_field.word_cookie fields */ +#define TCB_WORD_S 0 +#define TCB_WORD_V(x) ((x) << TCB_WORD_S) + +#define TCB_COOKIE_S 5 +#define TCB_COOKIE_M 0x7 +#define TCB_COOKIE_V(x) ((x) << TCB_COOKIE_S) +#define TCB_COOKIE_G(x) (((x) >> TCB_COOKIE_S) & TCB_COOKIE_M) + +struct cpl_set_tcb_rpl { + union opcode_tid ot; + __be16 rsvd; + u8 cookie; + u8 status; + __be64 oldval; +}; + +struct cpl_close_con_req { + WR_HDR; + union opcode_tid ot; + __be32 rsvd; +}; + +struct cpl_close_con_rpl { + union opcode_tid ot; + u8 rsvd[3]; + u8 status; + __be32 snd_nxt; + __be32 rcv_nxt; +}; + +struct cpl_close_listsvr_req { + WR_HDR; + union opcode_tid ot; + __be16 reply_ctrl; + __be16 rsvd; +}; + +/* additional cpl_close_listsvr_req.reply_ctrl field */ +#define LISTSVR_IPV6_S 14 +#define LISTSVR_IPV6_V(x) ((x) << LISTSVR_IPV6_S) +#define LISTSVR_IPV6_F LISTSVR_IPV6_V(1U) + +struct cpl_close_listsvr_rpl { + union opcode_tid ot; + u8 rsvd[3]; + u8 status; +}; + +struct cpl_abort_req_rss { + union opcode_tid ot; + u8 rsvd[3]; + u8 status; +}; + +struct cpl_abort_req_rss6 { + WR_HDR; + union opcode_tid ot; + __u32 srqidx_status; +}; + +#define ABORT_RSS_STATUS_S 0 +#define ABORT_RSS_STATUS_M 0xff +#define ABORT_RSS_STATUS_V(x) ((x) << ABORT_RSS_STATUS_S) +#define ABORT_RSS_STATUS_G(x) (((x) >> ABORT_RSS_STATUS_S) & ABORT_RSS_STATUS_M) + +#define ABORT_RSS_SRQIDX_S 8 +#define ABORT_RSS_SRQIDX_M 0xffffff +#define ABORT_RSS_SRQIDX_V(x) ((x) << ABORT_RSS_SRQIDX_S) +#define ABORT_RSS_SRQIDX_G(x) (((x) >> ABORT_RSS_SRQIDX_S) & ABORT_RSS_SRQIDX_M) + +struct cpl_abort_req { + WR_HDR; + union opcode_tid ot; + __be32 rsvd0; + u8 rsvd1; + u8 cmd; + u8 rsvd2[6]; +}; + +struct cpl_abort_rpl_rss { + union opcode_tid ot; + u8 rsvd[3]; + u8 status; +}; + +struct cpl_abort_rpl_rss6 { + union opcode_tid ot; + __u32 srqidx_status; +}; + +struct cpl_abort_rpl { + WR_HDR; + union opcode_tid ot; + __be32 rsvd0; + u8 rsvd1; + u8 cmd; + u8 rsvd2[6]; +}; + +struct cpl_peer_close { + union opcode_tid ot; + __be32 rcv_nxt; +}; + +struct cpl_tid_release { + WR_HDR; + union opcode_tid ot; + __be32 rsvd; +}; + +struct cpl_tx_pkt_core { + __be32 ctrl0; + __be16 pack; + __be16 len; + __be64 ctrl1; +}; + +struct cpl_tx_pkt { + WR_HDR; + struct cpl_tx_pkt_core c; +}; + +#define cpl_tx_pkt_xt cpl_tx_pkt + +/* cpl_tx_pkt_core.ctrl0 fields */ +#define TXPKT_VF_S 0 +#define TXPKT_VF_V(x) ((x) << TXPKT_VF_S) + +#define TXPKT_PF_S 8 
+#define TXPKT_PF_V(x) ((x) << TXPKT_PF_S) + +#define TXPKT_VF_VLD_S 11 +#define TXPKT_VF_VLD_V(x) ((x) << TXPKT_VF_VLD_S) +#define TXPKT_VF_VLD_F TXPKT_VF_VLD_V(1U) + +#define TXPKT_OVLAN_IDX_S 12 +#define TXPKT_OVLAN_IDX_V(x) ((x) << TXPKT_OVLAN_IDX_S) + +#define TXPKT_T5_OVLAN_IDX_S 12 +#define TXPKT_T5_OVLAN_IDX_V(x) ((x) << TXPKT_T5_OVLAN_IDX_S) + +#define TXPKT_INTF_S 16 +#define TXPKT_INTF_V(x) ((x) << TXPKT_INTF_S) + +#define TXPKT_INS_OVLAN_S 21 +#define TXPKT_INS_OVLAN_V(x) ((x) << TXPKT_INS_OVLAN_S) +#define TXPKT_INS_OVLAN_F TXPKT_INS_OVLAN_V(1U) + +#define TXPKT_TSTAMP_S 23 +#define TXPKT_TSTAMP_V(x) ((x) << TXPKT_TSTAMP_S) +#define TXPKT_TSTAMP_F TXPKT_TSTAMP_V(1ULL) + +#define TXPKT_OPCODE_S 24 +#define TXPKT_OPCODE_V(x) ((x) << TXPKT_OPCODE_S) + +/* cpl_tx_pkt_core.ctrl1 fields */ +#define TXPKT_CSUM_END_S 12 +#define TXPKT_CSUM_END_V(x) ((x) << TXPKT_CSUM_END_S) + +#define TXPKT_CSUM_START_S 20 +#define TXPKT_CSUM_START_V(x) ((x) << TXPKT_CSUM_START_S) + +#define TXPKT_IPHDR_LEN_S 20 +#define TXPKT_IPHDR_LEN_V(x) ((__u64)(x) << TXPKT_IPHDR_LEN_S) + +#define TXPKT_CSUM_LOC_S 30 +#define TXPKT_CSUM_LOC_V(x) ((__u64)(x) << TXPKT_CSUM_LOC_S) + +#define TXPKT_ETHHDR_LEN_S 34 +#define TXPKT_ETHHDR_LEN_V(x) ((__u64)(x) << TXPKT_ETHHDR_LEN_S) + +#define T6_TXPKT_ETHHDR_LEN_S 32 +#define T6_TXPKT_ETHHDR_LEN_V(x) ((__u64)(x) << T6_TXPKT_ETHHDR_LEN_S) + +#define TXPKT_CSUM_TYPE_S 40 +#define TXPKT_CSUM_TYPE_V(x) ((__u64)(x) << TXPKT_CSUM_TYPE_S) + +#define TXPKT_VLAN_S 44 +#define TXPKT_VLAN_V(x) ((__u64)(x) << TXPKT_VLAN_S) + +#define TXPKT_VLAN_VLD_S 60 +#define TXPKT_VLAN_VLD_V(x) ((__u64)(x) << TXPKT_VLAN_VLD_S) +#define TXPKT_VLAN_VLD_F TXPKT_VLAN_VLD_V(1ULL) + +#define TXPKT_IPCSUM_DIS_S 62 +#define TXPKT_IPCSUM_DIS_V(x) ((__u64)(x) << TXPKT_IPCSUM_DIS_S) +#define TXPKT_IPCSUM_DIS_F TXPKT_IPCSUM_DIS_V(1ULL) + +#define TXPKT_L4CSUM_DIS_S 63 +#define TXPKT_L4CSUM_DIS_V(x) ((__u64)(x) << TXPKT_L4CSUM_DIS_S) +#define TXPKT_L4CSUM_DIS_F TXPKT_L4CSUM_DIS_V(1ULL) + +struct cpl_tx_pkt_lso_core { + __be32 lso_ctrl; + __be16 ipid_ofst; + __be16 mss; + __be32 seqno_offset; + __be32 len; + /* encapsulated CPL (TX_PKT, TX_PKT_XT or TX_DATA) follows here */ +}; + +/* cpl_tx_pkt_lso_core.lso_ctrl fields */ +#define LSO_TCPHDR_LEN_S 0 +#define LSO_TCPHDR_LEN_V(x) ((x) << LSO_TCPHDR_LEN_S) + +#define LSO_IPHDR_LEN_S 4 +#define LSO_IPHDR_LEN_V(x) ((x) << LSO_IPHDR_LEN_S) + +#define LSO_ETHHDR_LEN_S 16 +#define LSO_ETHHDR_LEN_V(x) ((x) << LSO_ETHHDR_LEN_S) + +#define LSO_IPV6_S 20 +#define LSO_IPV6_V(x) ((x) << LSO_IPV6_S) +#define LSO_IPV6_F LSO_IPV6_V(1U) + +#define LSO_LAST_SLICE_S 22 +#define LSO_LAST_SLICE_V(x) ((x) << LSO_LAST_SLICE_S) +#define LSO_LAST_SLICE_F LSO_LAST_SLICE_V(1U) + +#define LSO_FIRST_SLICE_S 23 +#define LSO_FIRST_SLICE_V(x) ((x) << LSO_FIRST_SLICE_S) +#define LSO_FIRST_SLICE_F LSO_FIRST_SLICE_V(1U) + +#define LSO_OPCODE_S 24 +#define LSO_OPCODE_V(x) ((x) << LSO_OPCODE_S) + +#define LSO_T5_XFER_SIZE_S 0 +#define LSO_T5_XFER_SIZE_V(x) ((x) << LSO_T5_XFER_SIZE_S) + +struct cpl_tx_pkt_lso { + WR_HDR; + struct cpl_tx_pkt_lso_core c; + /* encapsulated CPL (TX_PKT, TX_PKT_XT or TX_DATA) follows here */ +}; + +struct cpl_iscsi_hdr { + union opcode_tid ot; + __be16 pdu_len_ddp; + __be16 len; + __be32 seq; + __be16 urg; + u8 rsvd; + u8 status; +}; + +/* cpl_iscsi_hdr.pdu_len_ddp fields */ +#define ISCSI_PDU_LEN_S 0 +#define ISCSI_PDU_LEN_M 0x7FFF +#define ISCSI_PDU_LEN_V(x) ((x) << ISCSI_PDU_LEN_S) +#define ISCSI_PDU_LEN_G(x) (((x) >> ISCSI_PDU_LEN_S) & ISCSI_PDU_LEN_M) + +#define 
ISCSI_DDP_S 15 +#define ISCSI_DDP_V(x) ((x) << ISCSI_DDP_S) +#define ISCSI_DDP_F ISCSI_DDP_V(1U) + +struct cpl_rx_data_ddp { + union opcode_tid ot; + __be16 urg; + __be16 len; + __be32 seq; + union { + __be32 nxt_seq; + __be32 ddp_report; + }; + __be32 ulp_crc; + __be32 ddpvld; +}; + +#define cpl_rx_iscsi_ddp cpl_rx_data_ddp + +struct cpl_iscsi_data { + union opcode_tid ot; + __u8 rsvd0[2]; + __be16 len; + __be32 seq; + __be16 urg; + __u8 rsvd1; + __u8 status; +}; + +struct cpl_rx_iscsi_cmp { + union opcode_tid ot; + __be16 pdu_len_ddp; + __be16 len; + __be32 seq; + __be16 urg; + __u8 rsvd; + __u8 status; + __be32 ulp_crc; + __be32 ddpvld; +}; + +struct cpl_tx_data_iso { + __be32 op_to_scsi; + __u8 reserved1; + __u8 ahs_len; + __be16 mpdu; + __be32 burst_size; + __be32 len; + __be32 reserved2_seglen_offset; + __be32 datasn_offset; + __be32 buffer_offset; + __be32 reserved3; + + /* encapsulated CPL_TX_DATA follows here */ +}; + +/* cpl_tx_data_iso.op_to_scsi fields */ +#define CPL_TX_DATA_ISO_OP_S 24 +#define CPL_TX_DATA_ISO_OP_M 0xff +#define CPL_TX_DATA_ISO_OP_V(x) ((x) << CPL_TX_DATA_ISO_OP_S) +#define CPL_TX_DATA_ISO_OP_G(x) \ + (((x) >> CPL_TX_DATA_ISO_OP_S) & CPL_TX_DATA_ISO_OP_M) + +#define CPL_TX_DATA_ISO_FIRST_S 23 +#define CPL_TX_DATA_ISO_FIRST_M 0x1 +#define CPL_TX_DATA_ISO_FIRST_V(x) ((x) << CPL_TX_DATA_ISO_FIRST_S) +#define CPL_TX_DATA_ISO_FIRST_G(x) \ + (((x) >> CPL_TX_DATA_ISO_FIRST_S) & CPL_TX_DATA_ISO_FIRST_M) +#define CPL_TX_DATA_ISO_FIRST_F CPL_TX_DATA_ISO_FIRST_V(1U) + +#define CPL_TX_DATA_ISO_LAST_S 22 +#define CPL_TX_DATA_ISO_LAST_M 0x1 +#define CPL_TX_DATA_ISO_LAST_V(x) ((x) << CPL_TX_DATA_ISO_LAST_S) +#define CPL_TX_DATA_ISO_LAST_G(x) \ + (((x) >> CPL_TX_DATA_ISO_LAST_S) & CPL_TX_DATA_ISO_LAST_M) +#define CPL_TX_DATA_ISO_LAST_F CPL_TX_DATA_ISO_LAST_V(1U) + +#define CPL_TX_DATA_ISO_CPLHDRLEN_S 21 +#define CPL_TX_DATA_ISO_CPLHDRLEN_M 0x1 +#define CPL_TX_DATA_ISO_CPLHDRLEN_V(x) ((x) << CPL_TX_DATA_ISO_CPLHDRLEN_S) +#define CPL_TX_DATA_ISO_CPLHDRLEN_G(x) \ + (((x) >> CPL_TX_DATA_ISO_CPLHDRLEN_S) & CPL_TX_DATA_ISO_CPLHDRLEN_M) +#define CPL_TX_DATA_ISO_CPLHDRLEN_F CPL_TX_DATA_ISO_CPLHDRLEN_V(1U) + +#define CPL_TX_DATA_ISO_HDRCRC_S 20 +#define CPL_TX_DATA_ISO_HDRCRC_M 0x1 +#define CPL_TX_DATA_ISO_HDRCRC_V(x) ((x) << CPL_TX_DATA_ISO_HDRCRC_S) +#define CPL_TX_DATA_ISO_HDRCRC_G(x) \ + (((x) >> CPL_TX_DATA_ISO_HDRCRC_S) & CPL_TX_DATA_ISO_HDRCRC_M) +#define CPL_TX_DATA_ISO_HDRCRC_F CPL_TX_DATA_ISO_HDRCRC_V(1U) + +#define CPL_TX_DATA_ISO_PLDCRC_S 19 +#define CPL_TX_DATA_ISO_PLDCRC_M 0x1 +#define CPL_TX_DATA_ISO_PLDCRC_V(x) ((x) << CPL_TX_DATA_ISO_PLDCRC_S) +#define CPL_TX_DATA_ISO_PLDCRC_G(x) \ + (((x) >> CPL_TX_DATA_ISO_PLDCRC_S) & CPL_TX_DATA_ISO_PLDCRC_M) +#define CPL_TX_DATA_ISO_PLDCRC_F CPL_TX_DATA_ISO_PLDCRC_V(1U) + +#define CPL_TX_DATA_ISO_IMMEDIATE_S 18 +#define CPL_TX_DATA_ISO_IMMEDIATE_M 0x1 +#define CPL_TX_DATA_ISO_IMMEDIATE_V(x) ((x) << CPL_TX_DATA_ISO_IMMEDIATE_S) +#define CPL_TX_DATA_ISO_IMMEDIATE_G(x) \ + (((x) >> CPL_TX_DATA_ISO_IMMEDIATE_S) & CPL_TX_DATA_ISO_IMMEDIATE_M) +#define CPL_TX_DATA_ISO_IMMEDIATE_F CPL_TX_DATA_ISO_IMMEDIATE_V(1U) + +#define CPL_TX_DATA_ISO_SCSI_S 16 +#define CPL_TX_DATA_ISO_SCSI_M 0x3 +#define CPL_TX_DATA_ISO_SCSI_V(x) ((x) << CPL_TX_DATA_ISO_SCSI_S) +#define CPL_TX_DATA_ISO_SCSI_G(x) \ + (((x) >> CPL_TX_DATA_ISO_SCSI_S) & CPL_TX_DATA_ISO_SCSI_M) + +/* cpl_tx_data_iso.reserved2_seglen_offset fields */ +#define CPL_TX_DATA_ISO_SEGLEN_OFFSET_S 0 +#define CPL_TX_DATA_ISO_SEGLEN_OFFSET_M 0xffffff +#define CPL_TX_DATA_ISO_SEGLEN_OFFSET_V(x) \ 
+ ((x) << CPL_TX_DATA_ISO_SEGLEN_OFFSET_S) +#define CPL_TX_DATA_ISO_SEGLEN_OFFSET_G(x) \ + (((x) >> CPL_TX_DATA_ISO_SEGLEN_OFFSET_S) & \ + CPL_TX_DATA_ISO_SEGLEN_OFFSET_M) + +struct cpl_rx_data { + union opcode_tid ot; + __be16 rsvd; + __be16 len; + __be32 seq; + __be16 urg; +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 dack_mode:2; + u8 psh:1; + u8 heartbeat:1; + u8 ddp_off:1; + u8 :3; +#else + u8 :3; + u8 ddp_off:1; + u8 heartbeat:1; + u8 psh:1; + u8 dack_mode:2; +#endif + u8 status; +}; + +struct cpl_rx_data_ack { + WR_HDR; + union opcode_tid ot; + __be32 credit_dack; +}; + +/* cpl_rx_data_ack.ack_seq fields */ +#define RX_CREDITS_S 0 +#define RX_CREDITS_V(x) ((x) << RX_CREDITS_S) + +#define RX_FORCE_ACK_S 28 +#define RX_FORCE_ACK_V(x) ((x) << RX_FORCE_ACK_S) +#define RX_FORCE_ACK_F RX_FORCE_ACK_V(1U) + +#define RX_DACK_MODE_S 29 +#define RX_DACK_MODE_M 0x3 +#define RX_DACK_MODE_V(x) ((x) << RX_DACK_MODE_S) +#define RX_DACK_MODE_G(x) (((x) >> RX_DACK_MODE_S) & RX_DACK_MODE_M) + +#define RX_DACK_CHANGE_S 31 +#define RX_DACK_CHANGE_V(x) ((x) << RX_DACK_CHANGE_S) +#define RX_DACK_CHANGE_F RX_DACK_CHANGE_V(1U) + +struct cpl_rx_pkt { + struct rss_header rsshdr; + u8 opcode; +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 iff:4; + u8 csum_calc:1; + u8 ipmi_pkt:1; + u8 vlan_ex:1; + u8 ip_frag:1; +#else + u8 ip_frag:1; + u8 vlan_ex:1; + u8 ipmi_pkt:1; + u8 csum_calc:1; + u8 iff:4; +#endif + __be16 csum; + __be16 vlan; + __be16 len; + __be32 l2info; + __be16 hdr_len; + __be16 err_vec; +}; + +#define RX_T6_ETHHDR_LEN_M 0xFF +#define RX_T6_ETHHDR_LEN_G(x) (((x) >> RX_ETHHDR_LEN_S) & RX_T6_ETHHDR_LEN_M) + +#define RXF_PSH_S 20 +#define RXF_PSH_V(x) ((x) << RXF_PSH_S) +#define RXF_PSH_F RXF_PSH_V(1U) + +#define RXF_SYN_S 21 +#define RXF_SYN_V(x) ((x) << RXF_SYN_S) +#define RXF_SYN_F RXF_SYN_V(1U) + +#define RXF_UDP_S 22 +#define RXF_UDP_V(x) ((x) << RXF_UDP_S) +#define RXF_UDP_F RXF_UDP_V(1U) + +#define RXF_TCP_S 23 +#define RXF_TCP_V(x) ((x) << RXF_TCP_S) +#define RXF_TCP_F RXF_TCP_V(1U) + +#define RXF_IP_S 24 +#define RXF_IP_V(x) ((x) << RXF_IP_S) +#define RXF_IP_F RXF_IP_V(1U) + +#define RXF_IP6_S 25 +#define RXF_IP6_V(x) ((x) << RXF_IP6_S) +#define RXF_IP6_F RXF_IP6_V(1U) + +#define RXF_SYN_COOKIE_S 26 +#define RXF_SYN_COOKIE_V(x) ((x) << RXF_SYN_COOKIE_S) +#define RXF_SYN_COOKIE_F RXF_SYN_COOKIE_V(1U) + +#define RXF_FCOE_S 26 +#define RXF_FCOE_V(x) ((x) << RXF_FCOE_S) +#define RXF_FCOE_F RXF_FCOE_V(1U) + +#define RXF_LRO_S 27 +#define RXF_LRO_V(x) ((x) << RXF_LRO_S) +#define RXF_LRO_F RXF_LRO_V(1U) + +/* rx_pkt.l2info fields */ +#define RX_ETHHDR_LEN_S 0 +#define RX_ETHHDR_LEN_M 0x1F +#define RX_ETHHDR_LEN_V(x) ((x) << RX_ETHHDR_LEN_S) +#define RX_ETHHDR_LEN_G(x) (((x) >> RX_ETHHDR_LEN_S) & RX_ETHHDR_LEN_M) + +#define RX_T5_ETHHDR_LEN_S 0 +#define RX_T5_ETHHDR_LEN_M 0x3F +#define RX_T5_ETHHDR_LEN_V(x) ((x) << RX_T5_ETHHDR_LEN_S) +#define RX_T5_ETHHDR_LEN_G(x) (((x) >> RX_T5_ETHHDR_LEN_S) & RX_T5_ETHHDR_LEN_M) + +#define RX_MACIDX_S 8 +#define RX_MACIDX_M 0x1FF +#define RX_MACIDX_V(x) ((x) << RX_MACIDX_S) +#define RX_MACIDX_G(x) (((x) >> RX_MACIDX_S) & RX_MACIDX_M) + +#define RXF_SYN_S 21 +#define RXF_SYN_V(x) ((x) << RXF_SYN_S) +#define RXF_SYN_F RXF_SYN_V(1U) + +#define RX_CHAN_S 28 +#define RX_CHAN_M 0xF +#define RX_CHAN_V(x) ((x) << RX_CHAN_S) +#define RX_CHAN_G(x) (((x) >> RX_CHAN_S) & RX_CHAN_M) + +/* rx_pkt.hdr_len fields */ +#define RX_TCPHDR_LEN_S 0 +#define RX_TCPHDR_LEN_M 0x3F +#define RX_TCPHDR_LEN_V(x) ((x) << RX_TCPHDR_LEN_S) +#define RX_TCPHDR_LEN_G(x) (((x) >> RX_TCPHDR_LEN_S) & 
RX_TCPHDR_LEN_M) + +#define RX_IPHDR_LEN_S 6 +#define RX_IPHDR_LEN_M 0x3FF +#define RX_IPHDR_LEN_V(x) ((x) << RX_IPHDR_LEN_S) +#define RX_IPHDR_LEN_G(x) (((x) >> RX_IPHDR_LEN_S) & RX_IPHDR_LEN_M) + +/* rx_pkt.err_vec fields */ +#define RXERR_CSUM_S 13 +#define RXERR_CSUM_V(x) ((x) << RXERR_CSUM_S) +#define RXERR_CSUM_F RXERR_CSUM_V(1U) + +#define T6_COMPR_RXERR_LEN_S 1 +#define T6_COMPR_RXERR_LEN_V(x) ((x) << T6_COMPR_RXERR_LEN_S) +#define T6_COMPR_RXERR_LEN_F T6_COMPR_RXERR_LEN_V(1U) + +#define T6_COMPR_RXERR_VEC_S 0 +#define T6_COMPR_RXERR_VEC_M 0x3F +#define T6_COMPR_RXERR_VEC_V(x) ((x) << T6_COMPR_RXERR_LEN_S) +#define T6_COMPR_RXERR_VEC_G(x) \ + (((x) >> T6_COMPR_RXERR_VEC_S) & T6_COMPR_RXERR_VEC_M) + +/* Logical OR of RX_ERROR_CSUM, RX_ERROR_CSIP */ +#define T6_COMPR_RXERR_SUM_S 4 +#define T6_COMPR_RXERR_SUM_V(x) ((x) << T6_COMPR_RXERR_SUM_S) +#define T6_COMPR_RXERR_SUM_F T6_COMPR_RXERR_SUM_V(1U) + +struct cpl_trace_pkt { + u8 opcode; + u8 intf; +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 runt:4; + u8 filter_hit:4; + u8 :6; + u8 err:1; + u8 trunc:1; +#else + u8 filter_hit:4; + u8 runt:4; + u8 trunc:1; + u8 err:1; + u8 :6; +#endif + __be16 rsvd; + __be16 len; + __be64 tstamp; +}; + +struct cpl_t5_trace_pkt { + __u8 opcode; + __u8 intf; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 runt:4; + __u8 filter_hit:4; + __u8:6; + __u8 err:1; + __u8 trunc:1; +#else + __u8 filter_hit:4; + __u8 runt:4; + __u8 trunc:1; + __u8 err:1; + __u8:6; +#endif + __be16 rsvd; + __be16 len; + __be64 tstamp; + __be64 rsvd1; +}; + +struct cpl_l2t_write_req { + WR_HDR; + union opcode_tid ot; + __be16 params; + __be16 l2t_idx; + __be16 vlan; + u8 dst_mac[6]; +}; + +/* cpl_l2t_write_req.params fields */ +#define L2T_W_INFO_S 2 +#define L2T_W_INFO_V(x) ((x) << L2T_W_INFO_S) + +#define L2T_W_PORT_S 8 +#define L2T_W_PORT_V(x) ((x) << L2T_W_PORT_S) + +#define L2T_W_NOREPLY_S 15 +#define L2T_W_NOREPLY_V(x) ((x) << L2T_W_NOREPLY_S) +#define L2T_W_NOREPLY_F L2T_W_NOREPLY_V(1U) + +#define CPL_L2T_VLAN_NONE 0xfff + +struct cpl_l2t_write_rpl { + union opcode_tid ot; + u8 status; + u8 rsvd[3]; +}; + +struct cpl_smt_write_req { + WR_HDR; + union opcode_tid ot; + __be32 params; + __be16 pfvf1; + u8 src_mac1[6]; + __be16 pfvf0; + u8 src_mac0[6]; +}; + +struct cpl_t6_smt_write_req { + WR_HDR; + union opcode_tid ot; + __be32 params; + __be64 tag; + __be16 pfvf0; + u8 src_mac0[6]; + __be32 local_ip; + __be32 rsvd; +}; + +struct cpl_smt_write_rpl { + union opcode_tid ot; + u8 status; + u8 rsvd[3]; +}; + +/* cpl_smt_{read,write}_req.params fields */ +#define SMTW_OVLAN_IDX_S 16 +#define SMTW_OVLAN_IDX_V(x) ((x) << SMTW_OVLAN_IDX_S) + +#define SMTW_IDX_S 20 +#define SMTW_IDX_V(x) ((x) << SMTW_IDX_S) + +#define SMTW_NORPL_S 31 +#define SMTW_NORPL_V(x) ((x) << SMTW_NORPL_S) +#define SMTW_NORPL_F SMTW_NORPL_V(1U) + +struct cpl_rdma_terminate { + union opcode_tid ot; + __be16 rsvd; + __be16 len; +}; + +struct cpl_sge_egr_update { + __be32 opcode_qid; + __be16 cidx; + __be16 pidx; +}; + +/* cpl_sge_egr_update.ot fields */ +#define EGR_QID_S 0 +#define EGR_QID_M 0x1FFFF +#define EGR_QID_G(x) (((x) >> EGR_QID_S) & EGR_QID_M) + +/* cpl_fw*.type values */ +enum { + FW_TYPE_CMD_RPL = 0, + FW_TYPE_WR_RPL = 1, + FW_TYPE_CQE = 2, + FW_TYPE_OFLD_CONNECTION_WR_RPL = 3, + FW_TYPE_RSSCPL = 4, +}; + +struct cpl_fw4_pld { + u8 opcode; + u8 rsvd0[3]; + u8 type; + u8 rsvd1; + __be16 len; + __be64 data; + __be64 rsvd2; +}; + +struct cpl_fw6_pld { + u8 opcode; + u8 rsvd[5]; + __be16 len; + __be64 data[4]; +}; + +struct cpl_fw4_msg { + u8 opcode; + u8 type; 
+ __be16 rsvd0; + __be32 rsvd1; + __be64 data[2]; +}; + +struct cpl_fw4_ack { + union opcode_tid ot; + u8 credits; + u8 rsvd0[2]; + u8 seq_vld; + __be32 snd_nxt; + __be32 snd_una; + __be64 rsvd1; +}; + +enum { + CPL_FW4_ACK_FLAGS_SEQVAL = 0x1, /* seqn valid */ + CPL_FW4_ACK_FLAGS_CH = 0x2, /* channel change complete */ + CPL_FW4_ACK_FLAGS_FLOWC = 0x4, /* fw_flowc_wr complete */ +}; + +struct cpl_fw6_msg { + u8 opcode; + u8 type; + __be16 rsvd0; + __be32 rsvd1; + __be64 data[4]; +}; + +/* cpl_fw6_msg.type values */ +enum { + FW6_TYPE_CMD_RPL = 0, + FW6_TYPE_WR_RPL = 1, + FW6_TYPE_CQE = 2, + FW6_TYPE_OFLD_CONNECTION_WR_RPL = 3, + FW6_TYPE_RSSCPL = FW_TYPE_RSSCPL, +}; + +struct cpl_fw6_msg_ofld_connection_wr_rpl { + __u64 cookie; + __be32 tid; /* or atid in case of active failure */ + __u8 t_state; + __u8 retval; + __u8 rsvd[2]; +}; + +struct cpl_tx_data { + union opcode_tid ot; + __be32 len; + __be32 rsvd; + __be32 flags; +}; + +/* cpl_tx_data.flags field */ +#define TX_FORCE_S 13 +#define TX_FORCE_V(x) ((x) << TX_FORCE_S) + +#define T6_TX_FORCE_S 20 +#define T6_TX_FORCE_V(x) ((x) << T6_TX_FORCE_S) +#define T6_TX_FORCE_F T6_TX_FORCE_V(1U) + +#define TX_SHOVE_S 14 +#define TX_SHOVE_V(x) ((x) << TX_SHOVE_S) + +#define TX_ULP_MODE_S 10 +#define TX_ULP_MODE_M 0x7 +#define TX_ULP_MODE_V(x) ((x) << TX_ULP_MODE_S) +#define TX_ULP_MODE_G(x) (((x) >> TX_ULP_MODE_S) & TX_ULP_MODE_M) + +enum { + ULP_TX_MEM_READ = 2, + ULP_TX_MEM_WRITE = 3, + ULP_TX_PKT = 4 +}; + +enum { + ULP_TX_SC_NOOP = 0x80, + ULP_TX_SC_IMM = 0x81, + ULP_TX_SC_DSGL = 0x82, + ULP_TX_SC_ISGL = 0x83, + ULP_TX_SC_MEMRD = 0x86 +}; + +#define ULPTX_CMD_S 24 +#define ULPTX_CMD_V(x) ((x) << ULPTX_CMD_S) + +#define ULPTX_LEN16_S 0 +#define ULPTX_LEN16_M 0xFF +#define ULPTX_LEN16_V(x) ((x) << ULPTX_LEN16_S) + +#define ULP_TX_SC_MORE_S 23 +#define ULP_TX_SC_MORE_V(x) ((x) << ULP_TX_SC_MORE_S) +#define ULP_TX_SC_MORE_F ULP_TX_SC_MORE_V(1U) + +struct ulptx_sge_pair { + __be32 len[2]; + __be64 addr[2]; +}; + +struct ulptx_sgl { + __be32 cmd_nsge; + __be32 len0; + __be64 addr0; + struct ulptx_sge_pair sge[0]; +}; + +struct ulptx_idata { + __be32 cmd_more; + __be32 len; +}; + +struct ulp_txpkt { + __be32 cmd_dest; + __be32 len; +}; + +#define ULPTX_CMD_S 24 +#define ULPTX_CMD_M 0xFF +#define ULPTX_CMD_V(x) ((x) << ULPTX_CMD_S) + +#define ULPTX_NSGE_S 0 +#define ULPTX_NSGE_V(x) ((x) << ULPTX_NSGE_S) + +#define ULPTX_MORE_S 23 +#define ULPTX_MORE_V(x) ((x) << ULPTX_MORE_S) +#define ULPTX_MORE_F ULPTX_MORE_V(1U) + +#define ULP_TXPKT_DEST_S 16 +#define ULP_TXPKT_DEST_M 0x3 +#define ULP_TXPKT_DEST_V(x) ((x) << ULP_TXPKT_DEST_S) + +#define ULP_TXPKT_FID_S 4 +#define ULP_TXPKT_FID_M 0x7ff +#define ULP_TXPKT_FID_V(x) ((x) << ULP_TXPKT_FID_S) + +#define ULP_TXPKT_RO_S 3 +#define ULP_TXPKT_RO_V(x) ((x) << ULP_TXPKT_RO_S) +#define ULP_TXPKT_RO_F ULP_TXPKT_RO_V(1U) + +enum cpl_tx_tnl_lso_type { + TX_TNL_TYPE_OPAQUE, + TX_TNL_TYPE_NVGRE, + TX_TNL_TYPE_VXLAN, + TX_TNL_TYPE_GENEVE, +}; + +struct cpl_tx_tnl_lso { + __be32 op_to_IpIdSplitOut; + __be16 IpIdOffsetOut; + __be16 UdpLenSetOut_to_TnlHdrLen; + __be64 r1; + __be32 Flow_to_TcpHdrLen; + __be16 IpIdOffset; + __be16 IpIdSplit_to_Mss; + __be32 TCPSeqOffset; + __be32 EthLenOffset_Size; + /* encapsulated CPL (TX_PKT_XT) follows here */ +}; + +#define CPL_TX_TNL_LSO_OPCODE_S 24 +#define CPL_TX_TNL_LSO_OPCODE_M 0xff +#define CPL_TX_TNL_LSO_OPCODE_V(x) ((x) << CPL_TX_TNL_LSO_OPCODE_S) +#define CPL_TX_TNL_LSO_OPCODE_G(x) \ + (((x) >> CPL_TX_TNL_LSO_OPCODE_S) & CPL_TX_TNL_LSO_OPCODE_M) + +#define 
CPL_TX_TNL_LSO_FIRST_S 23 +#define CPL_TX_TNL_LSO_FIRST_M 0x1 +#define CPL_TX_TNL_LSO_FIRST_V(x) ((x) << CPL_TX_TNL_LSO_FIRST_S) +#define CPL_TX_TNL_LSO_FIRST_G(x) \ + (((x) >> CPL_TX_TNL_LSO_FIRST_S) & CPL_TX_TNL_LSO_FIRST_M) +#define CPL_TX_TNL_LSO_FIRST_F CPL_TX_TNL_LSO_FIRST_V(1U) + +#define CPL_TX_TNL_LSO_LAST_S 22 +#define CPL_TX_TNL_LSO_LAST_M 0x1 +#define CPL_TX_TNL_LSO_LAST_V(x) ((x) << CPL_TX_TNL_LSO_LAST_S) +#define CPL_TX_TNL_LSO_LAST_G(x) \ + (((x) >> CPL_TX_TNL_LSO_LAST_S) & CPL_TX_TNL_LSO_LAST_M) +#define CPL_TX_TNL_LSO_LAST_F CPL_TX_TNL_LSO_LAST_V(1U) + +#define CPL_TX_TNL_LSO_ETHHDRLENXOUT_S 21 +#define CPL_TX_TNL_LSO_ETHHDRLENXOUT_M 0x1 +#define CPL_TX_TNL_LSO_ETHHDRLENXOUT_V(x) \ + ((x) << CPL_TX_TNL_LSO_ETHHDRLENXOUT_S) +#define CPL_TX_TNL_LSO_ETHHDRLENXOUT_G(x) \ + (((x) >> CPL_TX_TNL_LSO_ETHHDRLENXOUT_S) & \ + CPL_TX_TNL_LSO_ETHHDRLENXOUT_M) +#define CPL_TX_TNL_LSO_ETHHDRLENXOUT_F CPL_TX_TNL_LSO_ETHHDRLENXOUT_V(1U) + +#define CPL_TX_TNL_LSO_IPV6OUT_S 20 +#define CPL_TX_TNL_LSO_IPV6OUT_M 0x1 +#define CPL_TX_TNL_LSO_IPV6OUT_V(x) ((x) << CPL_TX_TNL_LSO_IPV6OUT_S) +#define CPL_TX_TNL_LSO_IPV6OUT_G(x) \ + (((x) >> CPL_TX_TNL_LSO_IPV6OUT_S) & CPL_TX_TNL_LSO_IPV6OUT_M) +#define CPL_TX_TNL_LSO_IPV6OUT_F CPL_TX_TNL_LSO_IPV6OUT_V(1U) + +#define CPL_TX_TNL_LSO_ETHHDRLEN_S 16 +#define CPL_TX_TNL_LSO_ETHHDRLEN_M 0xf +#define CPL_TX_TNL_LSO_ETHHDRLEN_V(x) ((x) << CPL_TX_TNL_LSO_ETHHDRLEN_S) +#define CPL_TX_TNL_LSO_ETHHDRLEN_G(x) \ + (((x) >> CPL_TX_TNL_LSO_ETHHDRLEN_S) & CPL_TX_TNL_LSO_ETHHDRLEN_M) + +#define CPL_TX_TNL_LSO_IPHDRLEN_S 4 +#define CPL_TX_TNL_LSO_IPHDRLEN_M 0xfff +#define CPL_TX_TNL_LSO_IPHDRLEN_V(x) ((x) << CPL_TX_TNL_LSO_IPHDRLEN_S) +#define CPL_TX_TNL_LSO_IPHDRLEN_G(x) \ + (((x) >> CPL_TX_TNL_LSO_IPHDRLEN_S) & CPL_TX_TNL_LSO_IPHDRLEN_M) + +#define CPL_TX_TNL_LSO_TCPHDRLEN_S 0 +#define CPL_TX_TNL_LSO_TCPHDRLEN_M 0xf +#define CPL_TX_TNL_LSO_TCPHDRLEN_V(x) ((x) << CPL_TX_TNL_LSO_TCPHDRLEN_S) +#define CPL_TX_TNL_LSO_TCPHDRLEN_G(x) \ + (((x) >> CPL_TX_TNL_LSO_TCPHDRLEN_S) & CPL_TX_TNL_LSO_TCPHDRLEN_M) + +#define CPL_TX_TNL_LSO_MSS_S 0 +#define CPL_TX_TNL_LSO_MSS_M 0x3fff +#define CPL_TX_TNL_LSO_MSS_V(x) ((x) << CPL_TX_TNL_LSO_MSS_S) +#define CPL_TX_TNL_LSO_MSS_G(x) \ + (((x) >> CPL_TX_TNL_LSO_MSS_S) & CPL_TX_TNL_LSO_MSS_M) + +#define CPL_TX_TNL_LSO_SIZE_S 0 +#define CPL_TX_TNL_LSO_SIZE_M 0xfffffff +#define CPL_TX_TNL_LSO_SIZE_V(x) ((x) << CPL_TX_TNL_LSO_SIZE_S) +#define CPL_TX_TNL_LSO_SIZE_G(x) \ + (((x) >> CPL_TX_TNL_LSO_SIZE_S) & CPL_TX_TNL_LSO_SIZE_M) + +#define CPL_TX_TNL_LSO_ETHHDRLENOUT_S 16 +#define CPL_TX_TNL_LSO_ETHHDRLENOUT_M 0xf +#define CPL_TX_TNL_LSO_ETHHDRLENOUT_V(x) \ + ((x) << CPL_TX_TNL_LSO_ETHHDRLENOUT_S) +#define CPL_TX_TNL_LSO_ETHHDRLENOUT_G(x) \ + (((x) >> CPL_TX_TNL_LSO_ETHHDRLENOUT_S) & CPL_TX_TNL_LSO_ETHHDRLENOUT_M) + +#define CPL_TX_TNL_LSO_IPHDRLENOUT_S 4 +#define CPL_TX_TNL_LSO_IPHDRLENOUT_M 0xfff +#define CPL_TX_TNL_LSO_IPHDRLENOUT_V(x) ((x) << CPL_TX_TNL_LSO_IPHDRLENOUT_S) +#define CPL_TX_TNL_LSO_IPHDRLENOUT_G(x) \ + (((x) >> CPL_TX_TNL_LSO_IPHDRLENOUT_S) & CPL_TX_TNL_LSO_IPHDRLENOUT_M) + +#define CPL_TX_TNL_LSO_IPHDRCHKOUT_S 3 +#define CPL_TX_TNL_LSO_IPHDRCHKOUT_M 0x1 +#define CPL_TX_TNL_LSO_IPHDRCHKOUT_V(x) ((x) << CPL_TX_TNL_LSO_IPHDRCHKOUT_S) +#define CPL_TX_TNL_LSO_IPHDRCHKOUT_G(x) \ + (((x) >> CPL_TX_TNL_LSO_IPHDRCHKOUT_S) & CPL_TX_TNL_LSO_IPHDRCHKOUT_M) +#define CPL_TX_TNL_LSO_IPHDRCHKOUT_F CPL_TX_TNL_LSO_IPHDRCHKOUT_V(1U) + +#define CPL_TX_TNL_LSO_IPLENSETOUT_S 2 +#define CPL_TX_TNL_LSO_IPLENSETOUT_M 0x1 +#define 
CPL_TX_TNL_LSO_IPLENSETOUT_V(x) ((x) << CPL_TX_TNL_LSO_IPLENSETOUT_S) +#define CPL_TX_TNL_LSO_IPLENSETOUT_G(x) \ + (((x) >> CPL_TX_TNL_LSO_IPLENSETOUT_S) & CPL_TX_TNL_LSO_IPLENSETOUT_M) +#define CPL_TX_TNL_LSO_IPLENSETOUT_F CPL_TX_TNL_LSO_IPLENSETOUT_V(1U) + +#define CPL_TX_TNL_LSO_IPIDINCOUT_S 1 +#define CPL_TX_TNL_LSO_IPIDINCOUT_M 0x1 +#define CPL_TX_TNL_LSO_IPIDINCOUT_V(x) ((x) << CPL_TX_TNL_LSO_IPIDINCOUT_S) +#define CPL_TX_TNL_LSO_IPIDINCOUT_G(x) \ + (((x) >> CPL_TX_TNL_LSO_IPIDINCOUT_S) & CPL_TX_TNL_LSO_IPIDINCOUT_M) +#define CPL_TX_TNL_LSO_IPIDINCOUT_F CPL_TX_TNL_LSO_IPIDINCOUT_V(1U) + +#define CPL_TX_TNL_LSO_UDPCHKCLROUT_S 14 +#define CPL_TX_TNL_LSO_UDPCHKCLROUT_M 0x1 +#define CPL_TX_TNL_LSO_UDPCHKCLROUT_V(x) \ + ((x) << CPL_TX_TNL_LSO_UDPCHKCLROUT_S) +#define CPL_TX_TNL_LSO_UDPCHKCLROUT_G(x) \ + (((x) >> CPL_TX_TNL_LSO_UDPCHKCLROUT_S) & \ + CPL_TX_TNL_LSO_UDPCHKCLROUT_M) +#define CPL_TX_TNL_LSO_UDPCHKCLROUT_F CPL_TX_TNL_LSO_UDPCHKCLROUT_V(1U) + +#define CPL_TX_TNL_LSO_UDPLENSETOUT_S 15 +#define CPL_TX_TNL_LSO_UDPLENSETOUT_M 0x1 +#define CPL_TX_TNL_LSO_UDPLENSETOUT_V(x) \ + ((x) << CPL_TX_TNL_LSO_UDPLENSETOUT_S) +#define CPL_TX_TNL_LSO_UDPLENSETOUT_G(x) \ + (((x) >> CPL_TX_TNL_LSO_UDPLENSETOUT_S) & \ + CPL_TX_TNL_LSO_UDPLENSETOUT_M) +#define CPL_TX_TNL_LSO_UDPLENSETOUT_F CPL_TX_TNL_LSO_UDPLENSETOUT_V(1U) + +#define CPL_TX_TNL_LSO_TNLTYPE_S 12 +#define CPL_TX_TNL_LSO_TNLTYPE_M 0x3 +#define CPL_TX_TNL_LSO_TNLTYPE_V(x) ((x) << CPL_TX_TNL_LSO_TNLTYPE_S) +#define CPL_TX_TNL_LSO_TNLTYPE_G(x) \ + (((x) >> CPL_TX_TNL_LSO_TNLTYPE_S) & CPL_TX_TNL_LSO_TNLTYPE_M) + +#define S_CPL_TX_TNL_LSO_ETHHDRLEN 16 +#define M_CPL_TX_TNL_LSO_ETHHDRLEN 0xf +#define V_CPL_TX_TNL_LSO_ETHHDRLEN(x) ((x) << S_CPL_TX_TNL_LSO_ETHHDRLEN) +#define G_CPL_TX_TNL_LSO_ETHHDRLEN(x) \ + (((x) >> S_CPL_TX_TNL_LSO_ETHHDRLEN) & M_CPL_TX_TNL_LSO_ETHHDRLEN) + +#define CPL_TX_TNL_LSO_TNLHDRLEN_S 0 +#define CPL_TX_TNL_LSO_TNLHDRLEN_M 0xfff +#define CPL_TX_TNL_LSO_TNLHDRLEN_V(x) ((x) << CPL_TX_TNL_LSO_TNLHDRLEN_S) +#define CPL_TX_TNL_LSO_TNLHDRLEN_G(x) \ + (((x) >> CPL_TX_TNL_LSO_TNLHDRLEN_S) & CPL_TX_TNL_LSO_TNLHDRLEN_M) + +#define CPL_TX_TNL_LSO_IPV6_S 20 +#define CPL_TX_TNL_LSO_IPV6_M 0x1 +#define CPL_TX_TNL_LSO_IPV6_V(x) ((x) << CPL_TX_TNL_LSO_IPV6_S) +#define CPL_TX_TNL_LSO_IPV6_G(x) \ + (((x) >> CPL_TX_TNL_LSO_IPV6_S) & CPL_TX_TNL_LSO_IPV6_M) +#define CPL_TX_TNL_LSO_IPV6_F CPL_TX_TNL_LSO_IPV6_V(1U) + +#define ULP_TX_SC_MORE_S 23 +#define ULP_TX_SC_MORE_V(x) ((x) << ULP_TX_SC_MORE_S) +#define ULP_TX_SC_MORE_F ULP_TX_SC_MORE_V(1U) + +struct ulp_mem_io { + WR_HDR; + __be32 cmd; + __be32 len16; /* command length */ + __be32 dlen; /* data length in 32-byte units */ + __be32 lock_addr; +}; + +#define ULP_MEMIO_LOCK_S 31 +#define ULP_MEMIO_LOCK_V(x) ((x) << ULP_MEMIO_LOCK_S) +#define ULP_MEMIO_LOCK_F ULP_MEMIO_LOCK_V(1U) + +/* additional ulp_mem_io.cmd fields */ +#define ULP_MEMIO_ORDER_S 23 +#define ULP_MEMIO_ORDER_V(x) ((x) << ULP_MEMIO_ORDER_S) +#define ULP_MEMIO_ORDER_F ULP_MEMIO_ORDER_V(1U) + +#define T5_ULP_MEMIO_IMM_S 23 +#define T5_ULP_MEMIO_IMM_V(x) ((x) << T5_ULP_MEMIO_IMM_S) +#define T5_ULP_MEMIO_IMM_F T5_ULP_MEMIO_IMM_V(1U) + +#define T5_ULP_MEMIO_ORDER_S 22 +#define T5_ULP_MEMIO_ORDER_V(x) ((x) << T5_ULP_MEMIO_ORDER_S) +#define T5_ULP_MEMIO_ORDER_F T5_ULP_MEMIO_ORDER_V(1U) + +#define T5_ULP_MEMIO_FID_S 4 +#define T5_ULP_MEMIO_FID_M 0x7ff +#define T5_ULP_MEMIO_FID_V(x) ((x) << T5_ULP_MEMIO_FID_S) + +/* ulp_mem_io.lock_addr fields */ +#define ULP_MEMIO_ADDR_S 0 +#define ULP_MEMIO_ADDR_V(x) ((x) << ULP_MEMIO_ADDR_S) + 
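The ULP_MEMIO_*/ULPTX_* definitions above follow this header's shift-and-mask convention: _S is the bit offset of a field, _M its mask, _V(x) places a value into the field, _G(x) extracts it, and _F is the single-bit shorthand for _V(1U). A minimal sketch, assuming a hypothetical helper (its name, arguments and the is_t5 switch are illustrative and not part of this patch), of how a ULP memory-write header is typically composed from these accessors:

static void fill_ulp_mem_io_hdr(struct ulp_mem_io *req,
				unsigned int addr_32b, bool is_t5)
{
	/* Select a ULP_TX memory write; T5 relocated the ORDER/IMM bits,
	 * hence the separate T5_ULP_MEMIO_* definitions above.
	 */
	u32 cmd = ULPTX_CMD_V(ULP_TX_MEM_WRITE);

	cmd |= is_t5 ? T5_ULP_MEMIO_ORDER_F : ULP_MEMIO_ORDER_F;

	req->cmd = cpu_to_be32(cmd);
	/* lock_addr carries the 32-byte-unit target address in its low bits */
	req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr_32b));
}

The same _V()/cpu_to_be32() pattern applies to the other field groups in this header; _G() is the inverse, applied after be32_to_cpu() on a received message word.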
+/* ulp_mem_io.dlen fields */ +#define ULP_MEMIO_DATA_LEN_S 0 +#define ULP_MEMIO_DATA_LEN_V(x) ((x) << ULP_MEMIO_DATA_LEN_S) + +#define ULPTX_NSGE_S 0 +#define ULPTX_NSGE_M 0xFFFF +#define ULPTX_NSGE_V(x) ((x) << ULPTX_NSGE_S) +#define ULPTX_NSGE_G(x) (((x) >> ULPTX_NSGE_S) & ULPTX_NSGE_M) + +struct ulptx_sc_memrd { + __be32 cmd_to_len; + __be32 addr; +}; + +#define ULP_TXPKT_DATAMODIFY_S 23 +#define ULP_TXPKT_DATAMODIFY_M 0x1 +#define ULP_TXPKT_DATAMODIFY_V(x) ((x) << ULP_TXPKT_DATAMODIFY_S) +#define ULP_TXPKT_DATAMODIFY_G(x) \ + (((x) >> ULP_TXPKT_DATAMODIFY_S) & ULP_TXPKT_DATAMODIFY__M) +#define ULP_TXPKT_DATAMODIFY_F ULP_TXPKT_DATAMODIFY_V(1U) + +#define ULP_TXPKT_CHANNELID_S 22 +#define ULP_TXPKT_CHANNELID_M 0x1 +#define ULP_TXPKT_CHANNELID_V(x) ((x) << ULP_TXPKT_CHANNELID_S) +#define ULP_TXPKT_CHANNELID_G(x) \ + (((x) >> ULP_TXPKT_CHANNELID_S) & ULP_TXPKT_CHANNELID_M) +#define ULP_TXPKT_CHANNELID_F ULP_TXPKT_CHANNELID_V(1U) + +#define SCMD_SEQ_NO_CTRL_S 29 +#define SCMD_SEQ_NO_CTRL_M 0x3 +#define SCMD_SEQ_NO_CTRL_V(x) ((x) << SCMD_SEQ_NO_CTRL_S) +#define SCMD_SEQ_NO_CTRL_G(x) \ + (((x) >> SCMD_SEQ_NO_CTRL_S) & SCMD_SEQ_NO_CTRL_M) + +/* StsFieldPrsnt- Status field at the end of the TLS PDU */ +#define SCMD_STATUS_PRESENT_S 28 +#define SCMD_STATUS_PRESENT_M 0x1 +#define SCMD_STATUS_PRESENT_V(x) ((x) << SCMD_STATUS_PRESENT_S) +#define SCMD_STATUS_PRESENT_G(x) \ + (((x) >> SCMD_STATUS_PRESENT_S) & SCMD_STATUS_PRESENT_M) +#define SCMD_STATUS_PRESENT_F SCMD_STATUS_PRESENT_V(1U) + +/* ProtoVersion - Protocol Version 0: 1.2, 1:1.1, 2:DTLS, 3:Generic, + * 3-15: Reserved. + */ +#define SCMD_PROTO_VERSION_S 24 +#define SCMD_PROTO_VERSION_M 0xf +#define SCMD_PROTO_VERSION_V(x) ((x) << SCMD_PROTO_VERSION_S) +#define SCMD_PROTO_VERSION_G(x) \ + (((x) >> SCMD_PROTO_VERSION_S) & SCMD_PROTO_VERSION_M) + +/* EncDecCtrl - Encryption/Decryption Control. 0: Encrypt, 1: Decrypt */ +#define SCMD_ENC_DEC_CTRL_S 23 +#define SCMD_ENC_DEC_CTRL_M 0x1 +#define SCMD_ENC_DEC_CTRL_V(x) ((x) << SCMD_ENC_DEC_CTRL_S) +#define SCMD_ENC_DEC_CTRL_G(x) \ + (((x) >> SCMD_ENC_DEC_CTRL_S) & SCMD_ENC_DEC_CTRL_M) +#define SCMD_ENC_DEC_CTRL_F SCMD_ENC_DEC_CTRL_V(1U) + +/* CipherAuthSeqCtrl - Cipher Authentication Sequence Control. */ +#define SCMD_CIPH_AUTH_SEQ_CTRL_S 22 +#define SCMD_CIPH_AUTH_SEQ_CTRL_M 0x1 +#define SCMD_CIPH_AUTH_SEQ_CTRL_V(x) \ + ((x) << SCMD_CIPH_AUTH_SEQ_CTRL_S) +#define SCMD_CIPH_AUTH_SEQ_CTRL_G(x) \ + (((x) >> SCMD_CIPH_AUTH_SEQ_CTRL_S) & SCMD_CIPH_AUTH_SEQ_CTRL_M) +#define SCMD_CIPH_AUTH_SEQ_CTRL_F SCMD_CIPH_AUTH_SEQ_CTRL_V(1U) + +/* CiphMode - Cipher Mode. 0: NOP, 1:AES-CBC, 2:AES-GCM, 3:AES-CTR, + * 4:Generic-AES, 5-15: Reserved. + */ +#define SCMD_CIPH_MODE_S 18 +#define SCMD_CIPH_MODE_M 0xf +#define SCMD_CIPH_MODE_V(x) ((x) << SCMD_CIPH_MODE_S) +#define SCMD_CIPH_MODE_G(x) \ + (((x) >> SCMD_CIPH_MODE_S) & SCMD_CIPH_MODE_M) + +/* AuthMode - Auth Mode. 0: NOP, 1:SHA1, 2:SHA2-224, 3:SHA2-256 + * 4-15: Reserved + */ +#define SCMD_AUTH_MODE_S 14 +#define SCMD_AUTH_MODE_M 0xf +#define SCMD_AUTH_MODE_V(x) ((x) << SCMD_AUTH_MODE_S) +#define SCMD_AUTH_MODE_G(x) \ + (((x) >> SCMD_AUTH_MODE_S) & SCMD_AUTH_MODE_M) + +/* HmacCtrl - HMAC Control. 
0:NOP, 1:No truncation, 2:Support HMAC Truncation + * per RFC 4366, 3:IPSec 96 bits, 4-7:Reserved + */ +#define SCMD_HMAC_CTRL_S 11 +#define SCMD_HMAC_CTRL_M 0x7 +#define SCMD_HMAC_CTRL_V(x) ((x) << SCMD_HMAC_CTRL_S) +#define SCMD_HMAC_CTRL_G(x) \ + (((x) >> SCMD_HMAC_CTRL_S) & SCMD_HMAC_CTRL_M) + +/* IvSize - IV size in units of 2 bytes */ +#define SCMD_IV_SIZE_S 7 +#define SCMD_IV_SIZE_M 0xf +#define SCMD_IV_SIZE_V(x) ((x) << SCMD_IV_SIZE_S) +#define SCMD_IV_SIZE_G(x) \ + (((x) >> SCMD_IV_SIZE_S) & SCMD_IV_SIZE_M) + +/* NumIVs - Number of IVs */ +#define SCMD_NUM_IVS_S 0 +#define SCMD_NUM_IVS_M 0x7f +#define SCMD_NUM_IVS_V(x) ((x) << SCMD_NUM_IVS_S) +#define SCMD_NUM_IVS_G(x) \ + (((x) >> SCMD_NUM_IVS_S) & SCMD_NUM_IVS_M) + +/* EnbDbgId - If this is enabled upper 20 (63:44) bits if SeqNumber + * (below) are used as Cid (connection id for debug status), these + * bits are padded to zero for forming the 64 bit + * sequence number for TLS + */ +#define SCMD_ENB_DBGID_S 31 +#define SCMD_ENB_DBGID_M 0x1 +#define SCMD_ENB_DBGID_V(x) ((x) << SCMD_ENB_DBGID_S) +#define SCMD_ENB_DBGID_G(x) \ + (((x) >> SCMD_ENB_DBGID_S) & SCMD_ENB_DBGID_M) + +/* IV generation in SW. */ +#define SCMD_IV_GEN_CTRL_S 30 +#define SCMD_IV_GEN_CTRL_M 0x1 +#define SCMD_IV_GEN_CTRL_V(x) ((x) << SCMD_IV_GEN_CTRL_S) +#define SCMD_IV_GEN_CTRL_G(x) \ + (((x) >> SCMD_IV_GEN_CTRL_S) & SCMD_IV_GEN_CTRL_M) +#define SCMD_IV_GEN_CTRL_F SCMD_IV_GEN_CTRL_V(1U) + +/* More frags */ +#define SCMD_MORE_FRAGS_S 20 +#define SCMD_MORE_FRAGS_M 0x1 +#define SCMD_MORE_FRAGS_V(x) ((x) << SCMD_MORE_FRAGS_S) +#define SCMD_MORE_FRAGS_G(x) (((x) >> SCMD_MORE_FRAGS_S) & SCMD_MORE_FRAGS_M) + +/*last frag */ +#define SCMD_LAST_FRAG_S 19 +#define SCMD_LAST_FRAG_M 0x1 +#define SCMD_LAST_FRAG_V(x) ((x) << SCMD_LAST_FRAG_S) +#define SCMD_LAST_FRAG_G(x) (((x) >> SCMD_LAST_FRAG_S) & SCMD_LAST_FRAG_M) + +/* TlsCompPdu */ +#define SCMD_TLS_COMPPDU_S 18 +#define SCMD_TLS_COMPPDU_M 0x1 +#define SCMD_TLS_COMPPDU_V(x) ((x) << SCMD_TLS_COMPPDU_S) +#define SCMD_TLS_COMPPDU_G(x) (((x) >> SCMD_TLS_COMPPDU_S) & SCMD_TLS_COMPPDU_M) + +/* KeyCntxtInline - Key context inline after the scmd OR PayloadOnly*/ +#define SCMD_KEY_CTX_INLINE_S 17 +#define SCMD_KEY_CTX_INLINE_M 0x1 +#define SCMD_KEY_CTX_INLINE_V(x) ((x) << SCMD_KEY_CTX_INLINE_S) +#define SCMD_KEY_CTX_INLINE_G(x) \ + (((x) >> SCMD_KEY_CTX_INLINE_S) & SCMD_KEY_CTX_INLINE_M) +#define SCMD_KEY_CTX_INLINE_F SCMD_KEY_CTX_INLINE_V(1U) + +/* TLSFragEnable - 0: Host created TLS PDUs, 1: TLS Framgmentation in ASIC */ +#define SCMD_TLS_FRAG_ENABLE_S 16 +#define SCMD_TLS_FRAG_ENABLE_M 0x1 +#define SCMD_TLS_FRAG_ENABLE_V(x) ((x) << SCMD_TLS_FRAG_ENABLE_S) +#define SCMD_TLS_FRAG_ENABLE_G(x) \ + (((x) >> SCMD_TLS_FRAG_ENABLE_S) & SCMD_TLS_FRAG_ENABLE_M) +#define SCMD_TLS_FRAG_ENABLE_F SCMD_TLS_FRAG_ENABLE_V(1U) + +/* MacOnly - Only send the MAC and discard PDU. This is valid for hash only + * modes, in this case TLS_TX will drop the PDU and only + * send back the MAC bytes. + */ +#define SCMD_MAC_ONLY_S 15 +#define SCMD_MAC_ONLY_M 0x1 +#define SCMD_MAC_ONLY_V(x) ((x) << SCMD_MAC_ONLY_S) +#define SCMD_MAC_ONLY_G(x) \ + (((x) >> SCMD_MAC_ONLY_S) & SCMD_MAC_ONLY_M) +#define SCMD_MAC_ONLY_F SCMD_MAC_ONLY_V(1U) + +/* AadIVDrop - Drop the AAD and IV fields. 
Useful in protocols + * which have complex AAD and IV formations Eg:AES-CCM + */ +#define SCMD_AADIVDROP_S 14 +#define SCMD_AADIVDROP_M 0x1 +#define SCMD_AADIVDROP_V(x) ((x) << SCMD_AADIVDROP_S) +#define SCMD_AADIVDROP_G(x) \ + (((x) >> SCMD_AADIVDROP_S) & SCMD_AADIVDROP_M) +#define SCMD_AADIVDROP_F SCMD_AADIVDROP_V(1U) + +/* HdrLength - Length of all headers excluding TLS header + * present before start of crypto PDU/payload. + */ +#define SCMD_HDR_LEN_S 0 +#define SCMD_HDR_LEN_M 0x3fff +#define SCMD_HDR_LEN_V(x) ((x) << SCMD_HDR_LEN_S) +#define SCMD_HDR_LEN_G(x) \ + (((x) >> SCMD_HDR_LEN_S) & SCMD_HDR_LEN_M) + +struct cpl_tx_sec_pdu { + __be32 op_ivinsrtofst; + __be32 pldlen; + __be32 aadstart_cipherstop_hi; + __be32 cipherstop_lo_authinsert; + __be32 seqno_numivs; + __be32 ivgen_hdrlen; + __be64 scmd1; +}; + +#define CPL_TX_SEC_PDU_OPCODE_S 24 +#define CPL_TX_SEC_PDU_OPCODE_M 0xff +#define CPL_TX_SEC_PDU_OPCODE_V(x) ((x) << CPL_TX_SEC_PDU_OPCODE_S) +#define CPL_TX_SEC_PDU_OPCODE_G(x) \ + (((x) >> CPL_TX_SEC_PDU_OPCODE_S) & CPL_TX_SEC_PDU_OPCODE_M) + +/* RX Channel Id */ +#define CPL_TX_SEC_PDU_RXCHID_S 22 +#define CPL_TX_SEC_PDU_RXCHID_M 0x1 +#define CPL_TX_SEC_PDU_RXCHID_V(x) ((x) << CPL_TX_SEC_PDU_RXCHID_S) +#define CPL_TX_SEC_PDU_RXCHID_G(x) \ + (((x) >> CPL_TX_SEC_PDU_RXCHID_S) & CPL_TX_SEC_PDU_RXCHID_M) +#define CPL_TX_SEC_PDU_RXCHID_F CPL_TX_SEC_PDU_RXCHID_V(1U) + +/* Ack Follows */ +#define CPL_TX_SEC_PDU_ACKFOLLOWS_S 21 +#define CPL_TX_SEC_PDU_ACKFOLLOWS_M 0x1 +#define CPL_TX_SEC_PDU_ACKFOLLOWS_V(x) ((x) << CPL_TX_SEC_PDU_ACKFOLLOWS_S) +#define CPL_TX_SEC_PDU_ACKFOLLOWS_G(x) \ + (((x) >> CPL_TX_SEC_PDU_ACKFOLLOWS_S) & CPL_TX_SEC_PDU_ACKFOLLOWS_M) +#define CPL_TX_SEC_PDU_ACKFOLLOWS_F CPL_TX_SEC_PDU_ACKFOLLOWS_V(1U) + +/* Loopback bit in cpl_tx_sec_pdu */ +#define CPL_TX_SEC_PDU_ULPTXLPBK_S 20 +#define CPL_TX_SEC_PDU_ULPTXLPBK_M 0x1 +#define CPL_TX_SEC_PDU_ULPTXLPBK_V(x) ((x) << CPL_TX_SEC_PDU_ULPTXLPBK_S) +#define CPL_TX_SEC_PDU_ULPTXLPBK_G(x) \ + (((x) >> CPL_TX_SEC_PDU_ULPTXLPBK_S) & CPL_TX_SEC_PDU_ULPTXLPBK_M) +#define CPL_TX_SEC_PDU_ULPTXLPBK_F CPL_TX_SEC_PDU_ULPTXLPBK_V(1U) + +/* Length of cpl header encapsulated */ +#define CPL_TX_SEC_PDU_CPLLEN_S 16 +#define CPL_TX_SEC_PDU_CPLLEN_M 0xf +#define CPL_TX_SEC_PDU_CPLLEN_V(x) ((x) << CPL_TX_SEC_PDU_CPLLEN_S) +#define CPL_TX_SEC_PDU_CPLLEN_G(x) \ + (((x) >> CPL_TX_SEC_PDU_CPLLEN_S) & CPL_TX_SEC_PDU_CPLLEN_M) + +/* PlaceHolder */ +#define CPL_TX_SEC_PDU_PLACEHOLDER_S 10 +#define CPL_TX_SEC_PDU_PLACEHOLDER_M 0x1 +#define CPL_TX_SEC_PDU_PLACEHOLDER_V(x) ((x) << CPL_TX_SEC_PDU_PLACEHOLDER_S) +#define CPL_TX_SEC_PDU_PLACEHOLDER_G(x) \ + (((x) >> CPL_TX_SEC_PDU_PLACEHOLDER_S) & \ + CPL_TX_SEC_PDU_PLACEHOLDER_M) + +/* IvInsrtOffset: Insertion location for IV */ +#define CPL_TX_SEC_PDU_IVINSRTOFST_S 0 +#define CPL_TX_SEC_PDU_IVINSRTOFST_M 0x3ff +#define CPL_TX_SEC_PDU_IVINSRTOFST_V(x) ((x) << CPL_TX_SEC_PDU_IVINSRTOFST_S) +#define CPL_TX_SEC_PDU_IVINSRTOFST_G(x) \ + (((x) >> CPL_TX_SEC_PDU_IVINSRTOFST_S) & \ + CPL_TX_SEC_PDU_IVINSRTOFST_M) + +/* AadStartOffset: Offset in bytes for AAD start from + * the first byte following the pkt headers (0-255 bytes) + */ +#define CPL_TX_SEC_PDU_AADSTART_S 24 +#define CPL_TX_SEC_PDU_AADSTART_M 0xff +#define CPL_TX_SEC_PDU_AADSTART_V(x) ((x) << CPL_TX_SEC_PDU_AADSTART_S) +#define CPL_TX_SEC_PDU_AADSTART_G(x) \ + (((x) >> CPL_TX_SEC_PDU_AADSTART_S) & \ + CPL_TX_SEC_PDU_AADSTART_M) + +/* AadStopOffset: offset in bytes for AAD stop/end from the first byte following + * the pkt headers (0-511 bytes) + */ 
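The SCMD_* fields above are packed into the seqno_numivs and ivgen_hdrlen words of the cpl_tx_sec_pdu command, while the AAD/cipher/auth offsets documented here and immediately below fill its op_ivinsrtofst, aadstart_cipherstop_hi and cipherstop_lo_authinsert words. A minimal sketch of assembling seqno_numivs for an AES-GCM-style request; the helper name and the chosen field values are purely illustrative (the encodings follow the comments above, not a mandated configuration):

static void fill_sec_pdu_scmd0(struct cpl_tx_sec_pdu *sec_pdu)
{
	/* Each value is shifted into place by its _V() helper and OR'd
	 * together; field meanings follow the comments above.
	 */
	u32 seqno_numivs = SCMD_SEQ_NO_CTRL_V(0) |	 /* no sequence-number control */
			   SCMD_PROTO_VERSION_V(3) |	 /* "Generic", per the ProtoVersion note */
			   SCMD_ENC_DEC_CTRL_V(0) |	 /* 0 = encrypt */
			   SCMD_CIPH_AUTH_SEQ_CTRL_V(1) |
			   SCMD_CIPH_MODE_V(2) |	 /* 2 = AES-GCM */
			   SCMD_AUTH_MODE_V(0) |	 /* GCM supplies its own tag */
			   SCMD_HMAC_CTRL_V(0) |
			   SCMD_IV_SIZE_V(6) |		 /* 6 * 2 = 12-byte IV */
			   SCMD_NUM_IVS_V(0);

	sec_pdu->seqno_numivs = cpu_to_be32(seqno_numivs);
}

The CPL_TX_SEC_PDU_AADSTOP_* macros and the remaining cipher/auth offset accessors that the comment above describes continue directly below.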
+#define CPL_TX_SEC_PDU_AADSTOP_S 15 +#define CPL_TX_SEC_PDU_AADSTOP_M 0x1ff +#define CPL_TX_SEC_PDU_AADSTOP_V(x) ((x) << CPL_TX_SEC_PDU_AADSTOP_S) +#define CPL_TX_SEC_PDU_AADSTOP_G(x) \ + (((x) >> CPL_TX_SEC_PDU_AADSTOP_S) & CPL_TX_SEC_PDU_AADSTOP_M) + +/* CipherStartOffset: offset in bytes for encryption/decryption start from the + * first byte following the pkt headers (0-1023 bytes) + */ +#define CPL_TX_SEC_PDU_CIPHERSTART_S 5 +#define CPL_TX_SEC_PDU_CIPHERSTART_M 0x3ff +#define CPL_TX_SEC_PDU_CIPHERSTART_V(x) ((x) << CPL_TX_SEC_PDU_CIPHERSTART_S) +#define CPL_TX_SEC_PDU_CIPHERSTART_G(x) \ + (((x) >> CPL_TX_SEC_PDU_CIPHERSTART_S) & \ + CPL_TX_SEC_PDU_CIPHERSTART_M) + +/* CipherStopOffset: offset in bytes for encryption/decryption end + * from end of the payload of this command (0-511 bytes) + */ +#define CPL_TX_SEC_PDU_CIPHERSTOP_HI_S 0 +#define CPL_TX_SEC_PDU_CIPHERSTOP_HI_M 0x1f +#define CPL_TX_SEC_PDU_CIPHERSTOP_HI_V(x) \ + ((x) << CPL_TX_SEC_PDU_CIPHERSTOP_HI_S) +#define CPL_TX_SEC_PDU_CIPHERSTOP_HI_G(x) \ + (((x) >> CPL_TX_SEC_PDU_CIPHERSTOP_HI_S) & \ + CPL_TX_SEC_PDU_CIPHERSTOP_HI_M) + +#define CPL_TX_SEC_PDU_CIPHERSTOP_LO_S 28 +#define CPL_TX_SEC_PDU_CIPHERSTOP_LO_M 0xf +#define CPL_TX_SEC_PDU_CIPHERSTOP_LO_V(x) \ + ((x) << CPL_TX_SEC_PDU_CIPHERSTOP_LO_S) +#define CPL_TX_SEC_PDU_CIPHERSTOP_LO_G(x) \ + (((x) >> CPL_TX_SEC_PDU_CIPHERSTOP_LO_S) & \ + CPL_TX_SEC_PDU_CIPHERSTOP_LO_M) + +/* AuthStartOffset: offset in bytes for authentication start from + * the first byte following the pkt headers (0-1023) + */ +#define CPL_TX_SEC_PDU_AUTHSTART_S 18 +#define CPL_TX_SEC_PDU_AUTHSTART_M 0x3ff +#define CPL_TX_SEC_PDU_AUTHSTART_V(x) ((x) << CPL_TX_SEC_PDU_AUTHSTART_S) +#define CPL_TX_SEC_PDU_AUTHSTART_G(x) \ + (((x) >> CPL_TX_SEC_PDU_AUTHSTART_S) & \ + CPL_TX_SEC_PDU_AUTHSTART_M) + +/* AuthStopOffset: offset in bytes for authentication + * end from end of the payload of this command (0-511 Bytes) + */ +#define CPL_TX_SEC_PDU_AUTHSTOP_S 9 +#define CPL_TX_SEC_PDU_AUTHSTOP_M 0x1ff +#define CPL_TX_SEC_PDU_AUTHSTOP_V(x) ((x) << CPL_TX_SEC_PDU_AUTHSTOP_S) +#define CPL_TX_SEC_PDU_AUTHSTOP_G(x) \ + (((x) >> CPL_TX_SEC_PDU_AUTHSTOP_S) & \ + CPL_TX_SEC_PDU_AUTHSTOP_M) + +/* AuthInsrtOffset: offset in bytes for authentication insertion + * from end of the payload of this command (0-511 bytes) + */ +#define CPL_TX_SEC_PDU_AUTHINSERT_S 0 +#define CPL_TX_SEC_PDU_AUTHINSERT_M 0x1ff +#define CPL_TX_SEC_PDU_AUTHINSERT_V(x) ((x) << CPL_TX_SEC_PDU_AUTHINSERT_S) +#define CPL_TX_SEC_PDU_AUTHINSERT_G(x) \ + (((x) >> CPL_TX_SEC_PDU_AUTHINSERT_S) & \ + CPL_TX_SEC_PDU_AUTHINSERT_M) + +struct cpl_rx_phys_dsgl { + __be32 op_to_tid; + __be32 pcirlxorder_to_noofsgentr; + struct rss_header rss_hdr_int; +}; + +#define CPL_RX_PHYS_DSGL_OPCODE_S 24 +#define CPL_RX_PHYS_DSGL_OPCODE_M 0xff +#define CPL_RX_PHYS_DSGL_OPCODE_V(x) ((x) << CPL_RX_PHYS_DSGL_OPCODE_S) +#define CPL_RX_PHYS_DSGL_OPCODE_G(x) \ + (((x) >> CPL_RX_PHYS_DSGL_OPCODE_S) & CPL_RX_PHYS_DSGL_OPCODE_M) + +#define CPL_RX_PHYS_DSGL_ISRDMA_S 23 +#define CPL_RX_PHYS_DSGL_ISRDMA_M 0x1 +#define CPL_RX_PHYS_DSGL_ISRDMA_V(x) ((x) << CPL_RX_PHYS_DSGL_ISRDMA_S) +#define CPL_RX_PHYS_DSGL_ISRDMA_G(x) \ + (((x) >> CPL_RX_PHYS_DSGL_ISRDMA_S) & CPL_RX_PHYS_DSGL_ISRDMA_M) +#define CPL_RX_PHYS_DSGL_ISRDMA_F CPL_RX_PHYS_DSGL_ISRDMA_V(1U) + +#define CPL_RX_PHYS_DSGL_RSVD1_S 20 +#define CPL_RX_PHYS_DSGL_RSVD1_M 0x7 +#define CPL_RX_PHYS_DSGL_RSVD1_V(x) ((x) << CPL_RX_PHYS_DSGL_RSVD1_S) +#define CPL_RX_PHYS_DSGL_RSVD1_G(x) \ + (((x) >> CPL_RX_PHYS_DSGL_RSVD1_S) & \ + 
CPL_RX_PHYS_DSGL_RSVD1_M) + +#define CPL_RX_PHYS_DSGL_PCIRLXORDER_S 31 +#define CPL_RX_PHYS_DSGL_PCIRLXORDER_M 0x1 +#define CPL_RX_PHYS_DSGL_PCIRLXORDER_V(x) \ + ((x) << CPL_RX_PHYS_DSGL_PCIRLXORDER_S) +#define CPL_RX_PHYS_DSGL_PCIRLXORDER_G(x) \ + (((x) >> CPL_RX_PHYS_DSGL_PCIRLXORDER_S) & \ + CPL_RX_PHYS_DSGL_PCIRLXORDER_M) +#define CPL_RX_PHYS_DSGL_PCIRLXORDER_F CPL_RX_PHYS_DSGL_PCIRLXORDER_V(1U) + +#define CPL_RX_PHYS_DSGL_PCINOSNOOP_S 30 +#define CPL_RX_PHYS_DSGL_PCINOSNOOP_M 0x1 +#define CPL_RX_PHYS_DSGL_PCINOSNOOP_V(x) \ + ((x) << CPL_RX_PHYS_DSGL_PCINOSNOOP_S) +#define CPL_RX_PHYS_DSGL_PCINOSNOOP_G(x) \ + (((x) >> CPL_RX_PHYS_DSGL_PCINOSNOOP_S) & \ + CPL_RX_PHYS_DSGL_PCINOSNOOP_M) + +#define CPL_RX_PHYS_DSGL_PCINOSNOOP_F CPL_RX_PHYS_DSGL_PCINOSNOOP_V(1U) + +#define CPL_RX_PHYS_DSGL_PCITPHNTENB_S 29 +#define CPL_RX_PHYS_DSGL_PCITPHNTENB_M 0x1 +#define CPL_RX_PHYS_DSGL_PCITPHNTENB_V(x) \ + ((x) << CPL_RX_PHYS_DSGL_PCITPHNTENB_S) +#define CPL_RX_PHYS_DSGL_PCITPHNTENB_G(x) \ + (((x) >> CPL_RX_PHYS_DSGL_PCITPHNTENB_S) & \ + CPL_RX_PHYS_DSGL_PCITPHNTENB_M) +#define CPL_RX_PHYS_DSGL_PCITPHNTENB_F CPL_RX_PHYS_DSGL_PCITPHNTENB_V(1U) + +#define CPL_RX_PHYS_DSGL_PCITPHNT_S 27 +#define CPL_RX_PHYS_DSGL_PCITPHNT_M 0x3 +#define CPL_RX_PHYS_DSGL_PCITPHNT_V(x) ((x) << CPL_RX_PHYS_DSGL_PCITPHNT_S) +#define CPL_RX_PHYS_DSGL_PCITPHNT_G(x) \ + (((x) >> CPL_RX_PHYS_DSGL_PCITPHNT_S) & \ + CPL_RX_PHYS_DSGL_PCITPHNT_M) + +#define CPL_RX_PHYS_DSGL_DCAID_S 16 +#define CPL_RX_PHYS_DSGL_DCAID_M 0x7ff +#define CPL_RX_PHYS_DSGL_DCAID_V(x) ((x) << CPL_RX_PHYS_DSGL_DCAID_S) +#define CPL_RX_PHYS_DSGL_DCAID_G(x) \ + (((x) >> CPL_RX_PHYS_DSGL_DCAID_S) & \ + CPL_RX_PHYS_DSGL_DCAID_M) + +#define CPL_RX_PHYS_DSGL_NOOFSGENTR_S 0 +#define CPL_RX_PHYS_DSGL_NOOFSGENTR_M 0xffff +#define CPL_RX_PHYS_DSGL_NOOFSGENTR_V(x) \ + ((x) << CPL_RX_PHYS_DSGL_NOOFSGENTR_S) +#define CPL_RX_PHYS_DSGL_NOOFSGENTR_G(x) \ + (((x) >> CPL_RX_PHYS_DSGL_NOOFSGENTR_S) & \ + CPL_RX_PHYS_DSGL_NOOFSGENTR_M) + +struct cpl_rx_mps_pkt { + __be32 op_to_r1_hi; + __be32 r1_lo_length; +}; + +#define CPL_RX_MPS_PKT_OP_S 24 +#define CPL_RX_MPS_PKT_OP_M 0xff +#define CPL_RX_MPS_PKT_OP_V(x) ((x) << CPL_RX_MPS_PKT_OP_S) +#define CPL_RX_MPS_PKT_OP_G(x) \ + (((x) >> CPL_RX_MPS_PKT_OP_S) & CPL_RX_MPS_PKT_OP_M) + +#define CPL_RX_MPS_PKT_TYPE_S 20 +#define CPL_RX_MPS_PKT_TYPE_M 0xf +#define CPL_RX_MPS_PKT_TYPE_V(x) ((x) << CPL_RX_MPS_PKT_TYPE_S) +#define CPL_RX_MPS_PKT_TYPE_G(x) \ + (((x) >> CPL_RX_MPS_PKT_TYPE_S) & CPL_RX_MPS_PKT_TYPE_M) + +enum { + X_CPL_RX_MPS_PKT_TYPE_PAUSE = 1 << 0, + X_CPL_RX_MPS_PKT_TYPE_PPP = 1 << 1, + X_CPL_RX_MPS_PKT_TYPE_QFC = 1 << 2, + X_CPL_RX_MPS_PKT_TYPE_PTP = 1 << 3 +}; + +struct cpl_srq_table_req { + WR_HDR; + union opcode_tid ot; + __u8 status; + __u8 rsvd[2]; + __u8 idx; + __be64 rsvd_pdid; + __be32 qlen_qbase; + __be16 cur_msn; + __be16 max_msn; +}; + +struct cpl_srq_table_rpl { + union opcode_tid ot; + __u8 status; + __u8 rsvd[2]; + __u8 idx; + __be64 rsvd_pdid; + __be32 qlen_qbase; + __be16 cur_msn; + __be16 max_msn; +}; + +/* cpl_srq_table_{req,rpl}.params fields */ +#define SRQT_QLEN_S 28 +#define SRQT_QLEN_M 0xF +#define SRQT_QLEN_V(x) ((x) << SRQT_QLEN_S) +#define SRQT_QLEN_G(x) (((x) >> SRQT_QLEN_S) & SRQT_QLEN_M) + +#define SRQT_QBASE_S 0 +#define SRQT_QBASE_M 0x3FFFFFF +#define SRQT_QBASE_V(x) ((x) << SRQT_QBASE_S) +#define SRQT_QBASE_G(x) (((x) >> SRQT_QBASE_S) & SRQT_QBASE_M) + +#define SRQT_PDID_S 0 +#define SRQT_PDID_M 0xFF +#define SRQT_PDID_V(x) ((x) << SRQT_PDID_S) +#define SRQT_PDID_G(x) (((x) >> SRQT_PDID_S) & 
SRQT_PDID_M) + +#define SRQT_IDX_S 0 +#define SRQT_IDX_M 0xF +#define SRQT_IDX_V(x) ((x) << SRQT_IDX_S) +#define SRQT_IDX_G(x) (((x) >> SRQT_IDX_S) & SRQT_IDX_M) + +struct cpl_tx_tls_sfo { + __be32 op_to_seg_len; + __be32 pld_len; + __be32 type_protover; + __be32 r1_lo; + __be32 seqno_numivs; + __be32 ivgen_hdrlen; + __be64 scmd1; +}; + +/* cpl_tx_tls_sfo macros */ +#define CPL_TX_TLS_SFO_OPCODE_S 24 +#define CPL_TX_TLS_SFO_OPCODE_V(x) ((x) << CPL_TX_TLS_SFO_OPCODE_S) + +#define CPL_TX_TLS_SFO_DATA_TYPE_S 20 +#define CPL_TX_TLS_SFO_DATA_TYPE_V(x) ((x) << CPL_TX_TLS_SFO_DATA_TYPE_S) + +#define CPL_TX_TLS_SFO_CPL_LEN_S 16 +#define CPL_TX_TLS_SFO_CPL_LEN_V(x) ((x) << CPL_TX_TLS_SFO_CPL_LEN_S) + +#define CPL_TX_TLS_SFO_SEG_LEN_S 0 +#define CPL_TX_TLS_SFO_SEG_LEN_M 0xffff +#define CPL_TX_TLS_SFO_SEG_LEN_V(x) ((x) << CPL_TX_TLS_SFO_SEG_LEN_S) +#define CPL_TX_TLS_SFO_SEG_LEN_G(x) \ + (((x) >> CPL_TX_TLS_SFO_SEG_LEN_S) & CPL_TX_TLS_SFO_SEG_LEN_M) + +#define CPL_TX_TLS_SFO_TYPE_S 24 +#define CPL_TX_TLS_SFO_TYPE_M 0xff +#define CPL_TX_TLS_SFO_TYPE_V(x) ((x) << CPL_TX_TLS_SFO_TYPE_S) +#define CPL_TX_TLS_SFO_TYPE_G(x) \ + (((x) >> CPL_TX_TLS_SFO_TYPE_S) & CPL_TX_TLS_SFO_TYPE_M) + +#define CPL_TX_TLS_SFO_PROTOVER_S 8 +#define CPL_TX_TLS_SFO_PROTOVER_M 0xffff +#define CPL_TX_TLS_SFO_PROTOVER_V(x) ((x) << CPL_TX_TLS_SFO_PROTOVER_S) +#define CPL_TX_TLS_SFO_PROTOVER_G(x) \ + (((x) >> CPL_TX_TLS_SFO_PROTOVER_S) & CPL_TX_TLS_SFO_PROTOVER_M) + +struct cpl_tls_data { + struct rss_header rsshdr; + union opcode_tid ot; + __be32 length_pkd; + __be32 seq; + __be32 r1; +}; + +#define CPL_TLS_DATA_OPCODE_S 24 +#define CPL_TLS_DATA_OPCODE_M 0xff +#define CPL_TLS_DATA_OPCODE_V(x) ((x) << CPL_TLS_DATA_OPCODE_S) +#define CPL_TLS_DATA_OPCODE_G(x) \ + (((x) >> CPL_TLS_DATA_OPCODE_S) & CPL_TLS_DATA_OPCODE_M) + +#define CPL_TLS_DATA_TID_S 0 +#define CPL_TLS_DATA_TID_M 0xffffff +#define CPL_TLS_DATA_TID_V(x) ((x) << CPL_TLS_DATA_TID_S) +#define CPL_TLS_DATA_TID_G(x) \ + (((x) >> CPL_TLS_DATA_TID_S) & CPL_TLS_DATA_TID_M) + +#define CPL_TLS_DATA_LENGTH_S 0 +#define CPL_TLS_DATA_LENGTH_M 0xffff +#define CPL_TLS_DATA_LENGTH_V(x) ((x) << CPL_TLS_DATA_LENGTH_S) +#define CPL_TLS_DATA_LENGTH_G(x) \ + (((x) >> CPL_TLS_DATA_LENGTH_S) & CPL_TLS_DATA_LENGTH_M) + +struct cpl_rx_tls_cmp { + struct rss_header rsshdr; + union opcode_tid ot; + __be32 pdulength_length; + __be32 seq; + __be32 ddp_report; + __be32 r; + __be32 ddp_valid; +}; + +#define CPL_RX_TLS_CMP_OPCODE_S 24 +#define CPL_RX_TLS_CMP_OPCODE_M 0xff +#define CPL_RX_TLS_CMP_OPCODE_V(x) ((x) << CPL_RX_TLS_CMP_OPCODE_S) +#define CPL_RX_TLS_CMP_OPCODE_G(x) \ + (((x) >> CPL_RX_TLS_CMP_OPCODE_S) & CPL_RX_TLS_CMP_OPCODE_M) + +#define CPL_RX_TLS_CMP_TID_S 0 +#define CPL_RX_TLS_CMP_TID_M 0xffffff +#define CPL_RX_TLS_CMP_TID_V(x) ((x) << CPL_RX_TLS_CMP_TID_S) +#define CPL_RX_TLS_CMP_TID_G(x) \ + (((x) >> CPL_RX_TLS_CMP_TID_S) & CPL_RX_TLS_CMP_TID_M) + +#define CPL_RX_TLS_CMP_PDULENGTH_S 16 +#define CPL_RX_TLS_CMP_PDULENGTH_M 0xffff +#define CPL_RX_TLS_CMP_PDULENGTH_V(x) ((x) << CPL_RX_TLS_CMP_PDULENGTH_S) +#define CPL_RX_TLS_CMP_PDULENGTH_G(x) \ + (((x) >> CPL_RX_TLS_CMP_PDULENGTH_S) & CPL_RX_TLS_CMP_PDULENGTH_M) + +#define CPL_RX_TLS_CMP_LENGTH_S 0 +#define CPL_RX_TLS_CMP_LENGTH_M 0xffff +#define CPL_RX_TLS_CMP_LENGTH_V(x) ((x) << CPL_RX_TLS_CMP_LENGTH_S) +#define CPL_RX_TLS_CMP_LENGTH_G(x) \ + (((x) >> CPL_RX_TLS_CMP_LENGTH_S) & CPL_RX_TLS_CMP_LENGTH_M) +#endif /* __T4_MSG_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h 
b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h new file mode 100644 index 0000000..51b1803 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h @@ -0,0 +1,213 @@ +/* + * This file is part of the Chelsio T4/T5 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __T4_PCI_ID_TBL_H__ +#define __T4_PCI_ID_TBL_H__ + +/* The code can defined cpp macros for creating a PCI Device ID Table. This is + * useful because it allows the PCI ID Table to be maintained in a single place. + * + * The macros are: + * + * CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN + * -- Used to start the definition of the PCI ID Table. + * + * CH_PCI_DEVICE_ID_FUNCTION + * -- The PCI Function Number to use in the PCI Device ID Table. "0" + * -- for drivers attaching to PF0-3, "4" for drivers attaching to PF4, + * -- "8" for drivers attaching to SR-IOV Virtual Functions, etc. + * + * CH_PCI_DEVICE_ID_FUNCTION2 [optional] + * -- If defined, create a PCI Device ID Table with both + * -- CH_PCI_DEVICE_ID_FUNCTION and CH_PCI_DEVICE_ID_FUNCTION2 populated. + * + * CH_PCI_ID_TABLE_ENTRY(DeviceID) + * -- Used for the individual PCI Device ID entries. Note that we will + * -- be adding a trailing comma (",") after all of the entries (and + * -- between the pairs of entries if CH_PCI_DEVICE_ID_FUNCTION2 is defined). + * + * CH_PCI_DEVICE_ID_TABLE_DEFINE_END + * -- Used to finish the definition of the PCI ID Table. Note that we + * -- will be adding a trailing semi-colon (";") here. + */ +#ifndef CH_PCI_DEVICE_ID_FUNCTION +#error CH_PCI_DEVICE_ID_FUNCTION not defined! +#endif +#ifndef CH_PCI_ID_TABLE_ENTRY +#error CH_PCI_ID_TABLE_ENTRY not defined! +#endif +#ifndef CH_PCI_DEVICE_ID_TABLE_DEFINE_END +#error CH_PCI_DEVICE_ID_TABLE_DEFINE_END not defined! +#endif + +/* T4 and later ASICs use a PCI Device ID scheme of 0xVFPP where: + * + * V = "4" for T4; "5" for T5, etc. + * F = "0" for PF 0..3; "4".."7" for PF4..7; and "8" for VFs + * PP = adapter product designation + * + * We use this consistency in order to create the proper PCI Device IDs + * for the specified CH_PCI_DEVICE_ID_FUNCTION. 
+ */ +#ifndef CH_PCI_DEVICE_ID_FUNCTION2 +#define CH_PCI_ID_TABLE_FENTRY(devid) \ + CH_PCI_ID_TABLE_ENTRY((devid) | \ + ((CH_PCI_DEVICE_ID_FUNCTION) << 8)) +#else +#define CH_PCI_ID_TABLE_FENTRY(devid) \ + CH_PCI_ID_TABLE_ENTRY((devid) | \ + ((CH_PCI_DEVICE_ID_FUNCTION) << 8)), \ + CH_PCI_ID_TABLE_ENTRY((devid) | \ + ((CH_PCI_DEVICE_ID_FUNCTION2) << 8)) +#endif + +CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN + /* T4 adapters: + */ + CH_PCI_ID_TABLE_FENTRY(0x4000), /* T440-dbg */ + CH_PCI_ID_TABLE_FENTRY(0x4001), /* T420-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4002), /* T422-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4003), /* T440-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4004), /* T420-bch */ + CH_PCI_ID_TABLE_FENTRY(0x4005), /* T440-bch */ + CH_PCI_ID_TABLE_FENTRY(0x4006), /* T440-ch */ + CH_PCI_ID_TABLE_FENTRY(0x4007), /* T420-so */ + CH_PCI_ID_TABLE_FENTRY(0x4008), /* T420-cx */ + CH_PCI_ID_TABLE_FENTRY(0x4009), /* T420-bt */ + CH_PCI_ID_TABLE_FENTRY(0x400a), /* T404-bt */ + CH_PCI_ID_TABLE_FENTRY(0x400b), /* B420-sr */ + CH_PCI_ID_TABLE_FENTRY(0x400c), /* B404-bt */ + CH_PCI_ID_TABLE_FENTRY(0x400d), /* T480-cr */ + CH_PCI_ID_TABLE_FENTRY(0x400e), /* T440-LP-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4080), /* Custom T480-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4081), /* Custom T440-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4082), /* Custom T420-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4083), /* Custom T420-xaui */ + CH_PCI_ID_TABLE_FENTRY(0x4084), /* Custom T440-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4085), /* Custom T420-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4086), /* Custom T440-bt */ + CH_PCI_ID_TABLE_FENTRY(0x4087), /* Custom T440-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4088), /* Custom T440 2-xaui, 2-xfi */ + + /* T5 adapters: + */ + CH_PCI_ID_TABLE_FENTRY(0x5000), /* T580-dbg */ + CH_PCI_ID_TABLE_FENTRY(0x5001), /* T520-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5002), /* T522-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5003), /* T540-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5004), /* T520-bch */ + CH_PCI_ID_TABLE_FENTRY(0x5005), /* T540-bch */ + CH_PCI_ID_TABLE_FENTRY(0x5006), /* T540-ch */ + CH_PCI_ID_TABLE_FENTRY(0x5007), /* T520-so */ + CH_PCI_ID_TABLE_FENTRY(0x5008), /* T520-cx */ + CH_PCI_ID_TABLE_FENTRY(0x5009), /* T520-bt */ + CH_PCI_ID_TABLE_FENTRY(0x500a), /* T504-bt */ + CH_PCI_ID_TABLE_FENTRY(0x500b), /* B520-sr */ + CH_PCI_ID_TABLE_FENTRY(0x500c), /* B504-bt */ + CH_PCI_ID_TABLE_FENTRY(0x500d), /* T580-cr */ + CH_PCI_ID_TABLE_FENTRY(0x500e), /* T540-LP-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5010), /* T580-LP-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5011), /* T520-LL-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5012), /* T560-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5013), /* T580-chr */ + CH_PCI_ID_TABLE_FENTRY(0x5014), /* T580-so */ + CH_PCI_ID_TABLE_FENTRY(0x5015), /* T502-bt */ + CH_PCI_ID_TABLE_FENTRY(0x5016), /* T580-OCP-SO */ + CH_PCI_ID_TABLE_FENTRY(0x5017), /* T520-OCP-SO */ + CH_PCI_ID_TABLE_FENTRY(0x5018), /* T540-BT */ + CH_PCI_ID_TABLE_FENTRY(0x5080), /* Custom T540-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5081), /* Custom T540-LL-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5082), /* Custom T504-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5083), /* Custom T540-LP-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5084), /* Custom T580-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5085), /* Custom 3x T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5086), /* Custom 2x T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5087), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5088), /* Custom T570-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5089), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5090), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5091), /* Custom T522-CR */ + 
CH_PCI_ID_TABLE_FENTRY(0x5092), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5093), /* Custom T580-LP-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5094), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5095), /* Custom T540-CR-SO */ + CH_PCI_ID_TABLE_FENTRY(0x5096), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5097), /* Custom T520-KR */ + CH_PCI_ID_TABLE_FENTRY(0x5098), /* Custom 2x40G QSFP */ + CH_PCI_ID_TABLE_FENTRY(0x5099), /* Custom 2x40G QSFP */ + CH_PCI_ID_TABLE_FENTRY(0x509a), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x509b), /* Custom T540-CR LOM */ + CH_PCI_ID_TABLE_FENTRY(0x509c), /* Custom T520-CR*/ + CH_PCI_ID_TABLE_FENTRY(0x509d), /* Custom T540-CR*/ + CH_PCI_ID_TABLE_FENTRY(0x509e), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x509f), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a0), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a1), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a2), /* Custom T540-KR4 */ + CH_PCI_ID_TABLE_FENTRY(0x50a3), /* Custom T580-KR4 */ + CH_PCI_ID_TABLE_FENTRY(0x50a4), /* Custom 2x T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a5), /* Custom T522-BT */ + CH_PCI_ID_TABLE_FENTRY(0x50a6), /* Custom T522-BT-SO */ + CH_PCI_ID_TABLE_FENTRY(0x50a7), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a8), /* Custom T580-KR */ + CH_PCI_ID_TABLE_FENTRY(0x50a9), /* Custom T580-KR */ + CH_PCI_ID_TABLE_FENTRY(0x50aa), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50ab), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50ac), /* Custom T540-BT */ + + /* T6 adapters: + */ + CH_PCI_ID_TABLE_FENTRY(0x6001), + CH_PCI_ID_TABLE_FENTRY(0x6002), + CH_PCI_ID_TABLE_FENTRY(0x6003), + CH_PCI_ID_TABLE_FENTRY(0x6004), + CH_PCI_ID_TABLE_FENTRY(0x6005), + CH_PCI_ID_TABLE_FENTRY(0x6006), + CH_PCI_ID_TABLE_FENTRY(0x6007), + CH_PCI_ID_TABLE_FENTRY(0x6008), + CH_PCI_ID_TABLE_FENTRY(0x6009), + CH_PCI_ID_TABLE_FENTRY(0x600d), + CH_PCI_ID_TABLE_FENTRY(0x6011), + CH_PCI_ID_TABLE_FENTRY(0x6014), + CH_PCI_ID_TABLE_FENTRY(0x6015), + CH_PCI_ID_TABLE_FENTRY(0x6080), + CH_PCI_ID_TABLE_FENTRY(0x6081), + CH_PCI_ID_TABLE_FENTRY(0x6082), /* Custom T6225-CR SFP28 */ + CH_PCI_ID_TABLE_FENTRY(0x6083), /* Custom T62100-CR QSFP28 */ + CH_PCI_ID_TABLE_FENTRY(0x6084), /* Custom T64100-CR QSFP28 */ + CH_PCI_ID_TABLE_FENTRY(0x6085), /* Custom T6240-SO */ + CH_PCI_ID_TABLE_FENTRY(0x6086), /* Custom T6225-SO-CR */ + CH_PCI_ID_TABLE_FENTRY(0x6087), /* Custom T6225-CR */ +CH_PCI_DEVICE_ID_TABLE_DEFINE_END; + +#endif /* __T4_PCI_ID_TBL_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h new file mode 100644 index 0000000..276fdf2 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h @@ -0,0 +1,3315 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __T4_REGS_H +#define __T4_REGS_H + +#define MYPF_BASE 0x1b000 +#define MYPF_REG(reg_addr) (MYPF_BASE + (reg_addr)) + +#define PF0_BASE 0x1e000 +#define PF0_REG(reg_addr) (PF0_BASE + (reg_addr)) + +#define PF_STRIDE 0x400 +#define PF_BASE(idx) (PF0_BASE + (idx) * PF_STRIDE) +#define PF_REG(idx, reg) (PF_BASE(idx) + (reg)) + +#define NUM_CIM_CTL_TSCH_CHANNEL_INSTANCES 4 +#define NUM_CIM_CTL_TSCH_CHANNEL_TSCH_CLASS_INSTANCES 16 + +#define MYPORT_BASE 0x1c000 +#define MYPORT_REG(reg_addr) (MYPORT_BASE + (reg_addr)) + +#define PORT0_BASE 0x20000 +#define PORT0_REG(reg_addr) (PORT0_BASE + (reg_addr)) + +#define PORT_STRIDE 0x2000 +#define PORT_BASE(idx) (PORT0_BASE + (idx) * PORT_STRIDE) +#define PORT_REG(idx, reg) (PORT_BASE(idx) + (reg)) + +#define EDC_STRIDE (EDC_1_BASE_ADDR - EDC_0_BASE_ADDR) +#define EDC_REG(reg, idx) (reg + EDC_STRIDE * idx) + +#define PCIE_MEM_ACCESS_REG(reg_addr, idx) ((reg_addr) + (idx) * 8) +#define PCIE_MAILBOX_REG(reg_addr, idx) ((reg_addr) + (idx) * 8) +#define MC_BIST_STATUS_REG(reg_addr, idx) ((reg_addr) + (idx) * 4) +#define EDC_BIST_STATUS_REG(reg_addr, idx) ((reg_addr) + (idx) * 4) + +#define PCIE_FW_REG(reg_addr, idx) ((reg_addr) + (idx) * 4) + +#define NUM_LE_DB_DBGI_REQ_DATA_INSTANCES 17 +#define NUM_LE_DB_DBGI_RSP_DATA_INSTANCES 17 + +#define SGE_PF_KDOORBELL_A 0x0 + +#define QID_S 15 +#define QID_V(x) ((x) << QID_S) + +#define DBPRIO_S 14 +#define DBPRIO_V(x) ((x) << DBPRIO_S) +#define DBPRIO_F DBPRIO_V(1U) + +#define PIDX_S 0 +#define PIDX_V(x) ((x) << PIDX_S) + +#define SGE_VF_KDOORBELL_A 0x0 + +#define DBTYPE_S 13 +#define DBTYPE_V(x) ((x) << DBTYPE_S) +#define DBTYPE_F DBTYPE_V(1U) + +#define PIDX_T5_S 0 +#define PIDX_T5_M 0x1fffU +#define PIDX_T5_V(x) ((x) << PIDX_T5_S) +#define PIDX_T5_G(x) (((x) >> PIDX_T5_S) & PIDX_T5_M) + +#define SGE_PF_GTS_A 0x4 + +#define INGRESSQID_S 16 +#define INGRESSQID_V(x) ((x) << INGRESSQID_S) + +#define TIMERREG_S 13 +#define TIMERREG_V(x) ((x) << TIMERREG_S) + +#define SEINTARM_S 12 +#define SEINTARM_V(x) ((x) << SEINTARM_S) + +#define CIDXINC_S 0 +#define CIDXINC_M 0xfffU +#define CIDXINC_V(x) ((x) << CIDXINC_S) + +#define SGE_CONTROL_A 0x1008 +#define SGE_CONTROL2_A 0x1124 + +#define RXPKTCPLMODE_S 18 +#define RXPKTCPLMODE_V(x) ((x) << RXPKTCPLMODE_S) +#define RXPKTCPLMODE_F RXPKTCPLMODE_V(1U) + +#define EGRSTATUSPAGESIZE_S 17 +#define EGRSTATUSPAGESIZE_V(x) ((x) << EGRSTATUSPAGESIZE_S) +#define EGRSTATUSPAGESIZE_F EGRSTATUSPAGESIZE_V(1U) + +#define PKTSHIFT_S 10 +#define PKTSHIFT_M 0x7U +#define PKTSHIFT_V(x) ((x) << PKTSHIFT_S) +#define PKTSHIFT_G(x) (((x) >> PKTSHIFT_S) & PKTSHIFT_M) + +#define INGPCIEBOUNDARY_S 7 +#define INGPCIEBOUNDARY_V(x) ((x) << INGPCIEBOUNDARY_S) + +#define INGPADBOUNDARY_S 4 +#define INGPADBOUNDARY_M 0x7U +#define INGPADBOUNDARY_V(x) ((x) << INGPADBOUNDARY_S) 
+#define INGPADBOUNDARY_G(x) (((x) >> INGPADBOUNDARY_S) & INGPADBOUNDARY_M) + +#define EGRPCIEBOUNDARY_S 1 +#define EGRPCIEBOUNDARY_V(x) ((x) << EGRPCIEBOUNDARY_S) + +#define INGPACKBOUNDARY_S 16 +#define INGPACKBOUNDARY_M 0x7U +#define INGPACKBOUNDARY_V(x) ((x) << INGPACKBOUNDARY_S) +#define INGPACKBOUNDARY_G(x) (((x) >> INGPACKBOUNDARY_S) \ + & INGPACKBOUNDARY_M) + +#define VFIFO_ENABLE_S 10 +#define VFIFO_ENABLE_V(x) ((x) << VFIFO_ENABLE_S) +#define VFIFO_ENABLE_F VFIFO_ENABLE_V(1U) + +#define SGE_DBVFIFO_BADDR_A 0x1138 + +#define DBVFIFO_SIZE_S 6 +#define DBVFIFO_SIZE_M 0xfffU +#define DBVFIFO_SIZE_G(x) (((x) >> DBVFIFO_SIZE_S) & DBVFIFO_SIZE_M) + +#define T6_DBVFIFO_SIZE_S 0 +#define T6_DBVFIFO_SIZE_M 0x1fffU +#define T6_DBVFIFO_SIZE_G(x) (((x) >> T6_DBVFIFO_SIZE_S) & T6_DBVFIFO_SIZE_M) + +#define SGE_CTXT_CMD_A 0x11fc + +#define BUSY_S 31 +#define BUSY_V(x) ((x) << BUSY_S) +#define BUSY_F BUSY_V(1U) + +#define CTXTTYPE_S 24 +#define CTXTTYPE_M 0x3U +#define CTXTTYPE_V(x) ((x) << CTXTTYPE_S) + +#define CTXTQID_S 0 +#define CTXTQID_M 0x1ffffU +#define CTXTQID_V(x) ((x) << CTXTQID_S) + +#define SGE_CTXT_DATA0_A 0x1200 +#define SGE_CTXT_DATA5_A 0x1214 + +#define GLOBALENABLE_S 0 +#define GLOBALENABLE_V(x) ((x) << GLOBALENABLE_S) +#define GLOBALENABLE_F GLOBALENABLE_V(1U) + +#define SGE_HOST_PAGE_SIZE_A 0x100c + +#define HOSTPAGESIZEPF7_S 28 +#define HOSTPAGESIZEPF7_M 0xfU +#define HOSTPAGESIZEPF7_V(x) ((x) << HOSTPAGESIZEPF7_S) +#define HOSTPAGESIZEPF7_G(x) (((x) >> HOSTPAGESIZEPF7_S) & HOSTPAGESIZEPF7_M) + +#define HOSTPAGESIZEPF6_S 24 +#define HOSTPAGESIZEPF6_M 0xfU +#define HOSTPAGESIZEPF6_V(x) ((x) << HOSTPAGESIZEPF6_S) +#define HOSTPAGESIZEPF6_G(x) (((x) >> HOSTPAGESIZEPF6_S) & HOSTPAGESIZEPF6_M) + +#define HOSTPAGESIZEPF5_S 20 +#define HOSTPAGESIZEPF5_M 0xfU +#define HOSTPAGESIZEPF5_V(x) ((x) << HOSTPAGESIZEPF5_S) +#define HOSTPAGESIZEPF5_G(x) (((x) >> HOSTPAGESIZEPF5_S) & HOSTPAGESIZEPF5_M) + +#define HOSTPAGESIZEPF4_S 16 +#define HOSTPAGESIZEPF4_M 0xfU +#define HOSTPAGESIZEPF4_V(x) ((x) << HOSTPAGESIZEPF4_S) +#define HOSTPAGESIZEPF4_G(x) (((x) >> HOSTPAGESIZEPF4_S) & HOSTPAGESIZEPF4_M) + +#define HOSTPAGESIZEPF3_S 12 +#define HOSTPAGESIZEPF3_M 0xfU +#define HOSTPAGESIZEPF3_V(x) ((x) << HOSTPAGESIZEPF3_S) +#define HOSTPAGESIZEPF3_G(x) (((x) >> HOSTPAGESIZEPF3_S) & HOSTPAGESIZEPF3_M) + +#define HOSTPAGESIZEPF2_S 8 +#define HOSTPAGESIZEPF2_M 0xfU +#define HOSTPAGESIZEPF2_V(x) ((x) << HOSTPAGESIZEPF2_S) +#define HOSTPAGESIZEPF2_G(x) (((x) >> HOSTPAGESIZEPF2_S) & HOSTPAGESIZEPF2_M) + +#define HOSTPAGESIZEPF1_S 4 +#define HOSTPAGESIZEPF1_M 0xfU +#define HOSTPAGESIZEPF1_V(x) ((x) << HOSTPAGESIZEPF1_S) +#define HOSTPAGESIZEPF1_G(x) (((x) >> HOSTPAGESIZEPF1_S) & HOSTPAGESIZEPF1_M) + +#define HOSTPAGESIZEPF0_S 0 +#define HOSTPAGESIZEPF0_M 0xfU +#define HOSTPAGESIZEPF0_V(x) ((x) << HOSTPAGESIZEPF0_S) +#define HOSTPAGESIZEPF0_G(x) (((x) >> HOSTPAGESIZEPF0_S) & HOSTPAGESIZEPF0_M) + +#define SGE_EGRESS_QUEUES_PER_PAGE_PF_A 0x1010 +#define SGE_EGRESS_QUEUES_PER_PAGE_VF_A 0x1014 + +#define QUEUESPERPAGEPF1_S 4 + +#define QUEUESPERPAGEPF0_S 0 +#define QUEUESPERPAGEPF0_M 0xfU +#define QUEUESPERPAGEPF0_V(x) ((x) << QUEUESPERPAGEPF0_S) +#define QUEUESPERPAGEPF0_G(x) (((x) >> QUEUESPERPAGEPF0_S) & QUEUESPERPAGEPF0_M) + +#define SGE_INT_CAUSE1_A 0x1024 +#define SGE_INT_CAUSE2_A 0x1030 +#define SGE_INT_CAUSE3_A 0x103c + +#define ERR_FLM_DBP_S 31 +#define ERR_FLM_DBP_V(x) ((x) << ERR_FLM_DBP_S) +#define ERR_FLM_DBP_F ERR_FLM_DBP_V(1U) + +#define ERR_FLM_IDMA1_S 30 +#define ERR_FLM_IDMA1_V(x) ((x) << 
ERR_FLM_IDMA1_S) +#define ERR_FLM_IDMA1_F ERR_FLM_IDMA1_V(1U) + +#define ERR_FLM_IDMA0_S 29 +#define ERR_FLM_IDMA0_V(x) ((x) << ERR_FLM_IDMA0_S) +#define ERR_FLM_IDMA0_F ERR_FLM_IDMA0_V(1U) + +#define ERR_FLM_HINT_S 28 +#define ERR_FLM_HINT_V(x) ((x) << ERR_FLM_HINT_S) +#define ERR_FLM_HINT_F ERR_FLM_HINT_V(1U) + +#define ERR_PCIE_ERROR3_S 27 +#define ERR_PCIE_ERROR3_V(x) ((x) << ERR_PCIE_ERROR3_S) +#define ERR_PCIE_ERROR3_F ERR_PCIE_ERROR3_V(1U) + +#define ERR_PCIE_ERROR2_S 26 +#define ERR_PCIE_ERROR2_V(x) ((x) << ERR_PCIE_ERROR2_S) +#define ERR_PCIE_ERROR2_F ERR_PCIE_ERROR2_V(1U) + +#define ERR_PCIE_ERROR1_S 25 +#define ERR_PCIE_ERROR1_V(x) ((x) << ERR_PCIE_ERROR1_S) +#define ERR_PCIE_ERROR1_F ERR_PCIE_ERROR1_V(1U) + +#define ERR_PCIE_ERROR0_S 24 +#define ERR_PCIE_ERROR0_V(x) ((x) << ERR_PCIE_ERROR0_S) +#define ERR_PCIE_ERROR0_F ERR_PCIE_ERROR0_V(1U) + +#define ERR_CPL_EXCEED_IQE_SIZE_S 22 +#define ERR_CPL_EXCEED_IQE_SIZE_V(x) ((x) << ERR_CPL_EXCEED_IQE_SIZE_S) +#define ERR_CPL_EXCEED_IQE_SIZE_F ERR_CPL_EXCEED_IQE_SIZE_V(1U) + +#define ERR_INVALID_CIDX_INC_S 21 +#define ERR_INVALID_CIDX_INC_V(x) ((x) << ERR_INVALID_CIDX_INC_S) +#define ERR_INVALID_CIDX_INC_F ERR_INVALID_CIDX_INC_V(1U) + +#define ERR_CPL_OPCODE_0_S 19 +#define ERR_CPL_OPCODE_0_V(x) ((x) << ERR_CPL_OPCODE_0_S) +#define ERR_CPL_OPCODE_0_F ERR_CPL_OPCODE_0_V(1U) + +#define ERR_DROPPED_DB_S 18 +#define ERR_DROPPED_DB_V(x) ((x) << ERR_DROPPED_DB_S) +#define ERR_DROPPED_DB_F ERR_DROPPED_DB_V(1U) + +#define ERR_DATA_CPL_ON_HIGH_QID1_S 17 +#define ERR_DATA_CPL_ON_HIGH_QID1_V(x) ((x) << ERR_DATA_CPL_ON_HIGH_QID1_S) +#define ERR_DATA_CPL_ON_HIGH_QID1_F ERR_DATA_CPL_ON_HIGH_QID1_V(1U) + +#define ERR_DATA_CPL_ON_HIGH_QID0_S 16 +#define ERR_DATA_CPL_ON_HIGH_QID0_V(x) ((x) << ERR_DATA_CPL_ON_HIGH_QID0_S) +#define ERR_DATA_CPL_ON_HIGH_QID0_F ERR_DATA_CPL_ON_HIGH_QID0_V(1U) + +#define ERR_BAD_DB_PIDX3_S 15 +#define ERR_BAD_DB_PIDX3_V(x) ((x) << ERR_BAD_DB_PIDX3_S) +#define ERR_BAD_DB_PIDX3_F ERR_BAD_DB_PIDX3_V(1U) + +#define ERR_BAD_DB_PIDX2_S 14 +#define ERR_BAD_DB_PIDX2_V(x) ((x) << ERR_BAD_DB_PIDX2_S) +#define ERR_BAD_DB_PIDX2_F ERR_BAD_DB_PIDX2_V(1U) + +#define ERR_BAD_DB_PIDX1_S 13 +#define ERR_BAD_DB_PIDX1_V(x) ((x) << ERR_BAD_DB_PIDX1_S) +#define ERR_BAD_DB_PIDX1_F ERR_BAD_DB_PIDX1_V(1U) + +#define ERR_BAD_DB_PIDX0_S 12 +#define ERR_BAD_DB_PIDX0_V(x) ((x) << ERR_BAD_DB_PIDX0_S) +#define ERR_BAD_DB_PIDX0_F ERR_BAD_DB_PIDX0_V(1U) + +#define ERR_ING_CTXT_PRIO_S 10 +#define ERR_ING_CTXT_PRIO_V(x) ((x) << ERR_ING_CTXT_PRIO_S) +#define ERR_ING_CTXT_PRIO_F ERR_ING_CTXT_PRIO_V(1U) + +#define ERR_EGR_CTXT_PRIO_S 9 +#define ERR_EGR_CTXT_PRIO_V(x) ((x) << ERR_EGR_CTXT_PRIO_S) +#define ERR_EGR_CTXT_PRIO_F ERR_EGR_CTXT_PRIO_V(1U) + +#define DBFIFO_HP_INT_S 8 +#define DBFIFO_HP_INT_V(x) ((x) << DBFIFO_HP_INT_S) +#define DBFIFO_HP_INT_F DBFIFO_HP_INT_V(1U) + +#define DBFIFO_LP_INT_S 7 +#define DBFIFO_LP_INT_V(x) ((x) << DBFIFO_LP_INT_S) +#define DBFIFO_LP_INT_F DBFIFO_LP_INT_V(1U) + +#define INGRESS_SIZE_ERR_S 5 +#define INGRESS_SIZE_ERR_V(x) ((x) << INGRESS_SIZE_ERR_S) +#define INGRESS_SIZE_ERR_F INGRESS_SIZE_ERR_V(1U) + +#define EGRESS_SIZE_ERR_S 4 +#define EGRESS_SIZE_ERR_V(x) ((x) << EGRESS_SIZE_ERR_S) +#define EGRESS_SIZE_ERR_F EGRESS_SIZE_ERR_V(1U) + +#define SGE_INT_ENABLE3_A 0x1040 +#define SGE_FL_BUFFER_SIZE0_A 0x1044 +#define SGE_FL_BUFFER_SIZE1_A 0x1048 +#define SGE_FL_BUFFER_SIZE2_A 0x104c +#define SGE_FL_BUFFER_SIZE3_A 0x1050 +#define SGE_FL_BUFFER_SIZE4_A 0x1054 +#define SGE_FL_BUFFER_SIZE5_A 0x1058 +#define SGE_FL_BUFFER_SIZE6_A 
0x105c +#define SGE_FL_BUFFER_SIZE7_A 0x1060 +#define SGE_FL_BUFFER_SIZE8_A 0x1064 + +#define SGE_IMSG_CTXT_BADDR_A 0x1088 +#define SGE_FLM_CACHE_BADDR_A 0x108c +#define SGE_FLM_CFG_A 0x1090 + +#define NOHDR_S 18 +#define NOHDR_V(x) ((x) << NOHDR_S) +#define NOHDR_F NOHDR_V(1U) + +#define HDRSTARTFLQ_S 11 +#define HDRSTARTFLQ_M 0x7U +#define HDRSTARTFLQ_G(x) (((x) >> HDRSTARTFLQ_S) & HDRSTARTFLQ_M) + +#define SGE_INGRESS_RX_THRESHOLD_A 0x10a0 + +#define THRESHOLD_0_S 24 +#define THRESHOLD_0_M 0x3fU +#define THRESHOLD_0_V(x) ((x) << THRESHOLD_0_S) +#define THRESHOLD_0_G(x) (((x) >> THRESHOLD_0_S) & THRESHOLD_0_M) + +#define THRESHOLD_1_S 16 +#define THRESHOLD_1_M 0x3fU +#define THRESHOLD_1_V(x) ((x) << THRESHOLD_1_S) +#define THRESHOLD_1_G(x) (((x) >> THRESHOLD_1_S) & THRESHOLD_1_M) + +#define THRESHOLD_2_S 8 +#define THRESHOLD_2_M 0x3fU +#define THRESHOLD_2_V(x) ((x) << THRESHOLD_2_S) +#define THRESHOLD_2_G(x) (((x) >> THRESHOLD_2_S) & THRESHOLD_2_M) + +#define THRESHOLD_3_S 0 +#define THRESHOLD_3_M 0x3fU +#define THRESHOLD_3_V(x) ((x) << THRESHOLD_3_S) +#define THRESHOLD_3_G(x) (((x) >> THRESHOLD_3_S) & THRESHOLD_3_M) + +#define SGE_CONM_CTRL_A 0x1094 + +#define EGRTHRESHOLD_S 8 +#define EGRTHRESHOLD_M 0x3fU +#define EGRTHRESHOLD_V(x) ((x) << EGRTHRESHOLD_S) +#define EGRTHRESHOLD_G(x) (((x) >> EGRTHRESHOLD_S) & EGRTHRESHOLD_M) + +#define EGRTHRESHOLDPACKING_S 14 +#define EGRTHRESHOLDPACKING_M 0x3fU +#define EGRTHRESHOLDPACKING_V(x) ((x) << EGRTHRESHOLDPACKING_S) +#define EGRTHRESHOLDPACKING_G(x) \ + (((x) >> EGRTHRESHOLDPACKING_S) & EGRTHRESHOLDPACKING_M) + +#define T6_EGRTHRESHOLDPACKING_S 16 +#define T6_EGRTHRESHOLDPACKING_M 0xffU +#define T6_EGRTHRESHOLDPACKING_G(x) \ + (((x) >> T6_EGRTHRESHOLDPACKING_S) & T6_EGRTHRESHOLDPACKING_M) + +#define SGE_TIMESTAMP_LO_A 0x1098 +#define SGE_TIMESTAMP_HI_A 0x109c + +#define TSOP_S 28 +#define TSOP_M 0x3U +#define TSOP_V(x) ((x) << TSOP_S) +#define TSOP_G(x) (((x) >> TSOP_S) & TSOP_M) + +#define TSVAL_S 0 +#define TSVAL_M 0xfffffffU +#define TSVAL_V(x) ((x) << TSVAL_S) +#define TSVAL_G(x) (((x) >> TSVAL_S) & TSVAL_M) + +#define SGE_DBFIFO_STATUS_A 0x10a4 +#define SGE_DBVFIFO_SIZE_A 0x113c + +#define HP_INT_THRESH_S 28 +#define HP_INT_THRESH_M 0xfU +#define HP_INT_THRESH_V(x) ((x) << HP_INT_THRESH_S) + +#define LP_INT_THRESH_S 12 +#define LP_INT_THRESH_M 0xfU +#define LP_INT_THRESH_V(x) ((x) << LP_INT_THRESH_S) + +#define SGE_DOORBELL_CONTROL_A 0x10a8 + +#define NOCOALESCE_S 26 +#define NOCOALESCE_V(x) ((x) << NOCOALESCE_S) +#define NOCOALESCE_F NOCOALESCE_V(1U) + +#define ENABLE_DROP_S 13 +#define ENABLE_DROP_V(x) ((x) << ENABLE_DROP_S) +#define ENABLE_DROP_F ENABLE_DROP_V(1U) + +#define SGE_TIMER_VALUE_0_AND_1_A 0x10b8 + +#define TIMERVALUE0_S 16 +#define TIMERVALUE0_M 0xffffU +#define TIMERVALUE0_V(x) ((x) << TIMERVALUE0_S) +#define TIMERVALUE0_G(x) (((x) >> TIMERVALUE0_S) & TIMERVALUE0_M) + +#define TIMERVALUE1_S 0 +#define TIMERVALUE1_M 0xffffU +#define TIMERVALUE1_V(x) ((x) << TIMERVALUE1_S) +#define TIMERVALUE1_G(x) (((x) >> TIMERVALUE1_S) & TIMERVALUE1_M) + +#define SGE_TIMER_VALUE_2_AND_3_A 0x10bc + +#define TIMERVALUE2_S 16 +#define TIMERVALUE2_M 0xffffU +#define TIMERVALUE2_V(x) ((x) << TIMERVALUE2_S) +#define TIMERVALUE2_G(x) (((x) >> TIMERVALUE2_S) & TIMERVALUE2_M) + +#define TIMERVALUE3_S 0 +#define TIMERVALUE3_M 0xffffU +#define TIMERVALUE3_V(x) ((x) << TIMERVALUE3_S) +#define TIMERVALUE3_G(x) (((x) >> TIMERVALUE3_S) & TIMERVALUE3_M) + +#define SGE_TIMER_VALUE_4_AND_5_A 0x10c0 + +#define TIMERVALUE4_S 16 +#define TIMERVALUE4_M 
0xffffU +#define TIMERVALUE4_V(x) ((x) << TIMERVALUE4_S) +#define TIMERVALUE4_G(x) (((x) >> TIMERVALUE4_S) & TIMERVALUE4_M) + +#define TIMERVALUE5_S 0 +#define TIMERVALUE5_M 0xffffU +#define TIMERVALUE5_V(x) ((x) << TIMERVALUE5_S) +#define TIMERVALUE5_G(x) (((x) >> TIMERVALUE5_S) & TIMERVALUE5_M) + +#define SGE_DEBUG_INDEX_A 0x10cc +#define SGE_DEBUG_DATA_HIGH_A 0x10d0 +#define SGE_DEBUG_DATA_LOW_A 0x10d4 + +#define SGE_DEBUG_DATA_LOW_INDEX_2_A 0x12c8 +#define SGE_DEBUG_DATA_LOW_INDEX_3_A 0x12cc +#define SGE_DEBUG_DATA_HIGH_INDEX_10_A 0x12a8 + +#define SGE_INGRESS_QUEUES_PER_PAGE_PF_A 0x10f4 +#define SGE_INGRESS_QUEUES_PER_PAGE_VF_A 0x10f8 + +#define SGE_ERROR_STATS_A 0x1100 + +#define UNCAPTURED_ERROR_S 18 +#define UNCAPTURED_ERROR_V(x) ((x) << UNCAPTURED_ERROR_S) +#define UNCAPTURED_ERROR_F UNCAPTURED_ERROR_V(1U) + +#define ERROR_QID_VALID_S 17 +#define ERROR_QID_VALID_V(x) ((x) << ERROR_QID_VALID_S) +#define ERROR_QID_VALID_F ERROR_QID_VALID_V(1U) + +#define ERROR_QID_S 0 +#define ERROR_QID_M 0x1ffffU +#define ERROR_QID_G(x) (((x) >> ERROR_QID_S) & ERROR_QID_M) + +#define HP_INT_THRESH_S 28 +#define HP_INT_THRESH_M 0xfU +#define HP_INT_THRESH_V(x) ((x) << HP_INT_THRESH_S) + +#define HP_COUNT_S 16 +#define HP_COUNT_M 0x7ffU +#define HP_COUNT_G(x) (((x) >> HP_COUNT_S) & HP_COUNT_M) + +#define LP_INT_THRESH_S 12 +#define LP_INT_THRESH_M 0xfU +#define LP_INT_THRESH_V(x) ((x) << LP_INT_THRESH_S) + +#define LP_COUNT_S 0 +#define LP_COUNT_M 0x7ffU +#define LP_COUNT_G(x) (((x) >> LP_COUNT_S) & LP_COUNT_M) + +#define LP_INT_THRESH_T5_S 18 +#define LP_INT_THRESH_T5_M 0xfffU +#define LP_INT_THRESH_T5_V(x) ((x) << LP_INT_THRESH_T5_S) + +#define LP_COUNT_T5_S 0 +#define LP_COUNT_T5_M 0x3ffffU +#define LP_COUNT_T5_G(x) (((x) >> LP_COUNT_T5_S) & LP_COUNT_T5_M) + +#define SGE_DOORBELL_CONTROL_A 0x10a8 + +#define SGE_STAT_TOTAL_A 0x10e4 +#define SGE_STAT_MATCH_A 0x10e8 +#define SGE_STAT_CFG_A 0x10ec + +#define STATMODE_S 2 +#define STATMODE_V(x) ((x) << STATMODE_S) + +#define STATSOURCE_T5_S 9 +#define STATSOURCE_T5_M 0xfU +#define STATSOURCE_T5_V(x) ((x) << STATSOURCE_T5_S) +#define STATSOURCE_T5_G(x) (((x) >> STATSOURCE_T5_S) & STATSOURCE_T5_M) + +#define T6_STATMODE_S 0 +#define T6_STATMODE_V(x) ((x) << T6_STATMODE_S) + +#define SGE_DBFIFO_STATUS2_A 0x1118 + +#define HP_INT_THRESH_T5_S 10 +#define HP_INT_THRESH_T5_M 0xfU +#define HP_INT_THRESH_T5_V(x) ((x) << HP_INT_THRESH_T5_S) + +#define HP_COUNT_T5_S 0 +#define HP_COUNT_T5_M 0x3ffU +#define HP_COUNT_T5_G(x) (((x) >> HP_COUNT_T5_S) & HP_COUNT_T5_M) + +#define ENABLE_DROP_S 13 +#define ENABLE_DROP_V(x) ((x) << ENABLE_DROP_S) +#define ENABLE_DROP_F ENABLE_DROP_V(1U) + +#define DROPPED_DB_S 0 +#define DROPPED_DB_V(x) ((x) << DROPPED_DB_S) +#define DROPPED_DB_F DROPPED_DB_V(1U) + +#define SGE_CTXT_CMD_A 0x11fc +#define SGE_DBQ_CTXT_BADDR_A 0x1084 + +/* registers for module PCIE */ +#define PCIE_PF_CFG_A 0x40 + +#define AIVEC_S 4 +#define AIVEC_M 0x3ffU +#define AIVEC_V(x) ((x) << AIVEC_S) + +#define PCIE_PF_CLI_A 0x44 +#define PCIE_INT_CAUSE_A 0x3004 + +#define UNXSPLCPLERR_S 29 +#define UNXSPLCPLERR_V(x) ((x) << UNXSPLCPLERR_S) +#define UNXSPLCPLERR_F UNXSPLCPLERR_V(1U) + +#define PCIEPINT_S 28 +#define PCIEPINT_V(x) ((x) << PCIEPINT_S) +#define PCIEPINT_F PCIEPINT_V(1U) + +#define PCIESINT_S 27 +#define PCIESINT_V(x) ((x) << PCIESINT_S) +#define PCIESINT_F PCIESINT_V(1U) + +#define RPLPERR_S 26 +#define RPLPERR_V(x) ((x) << RPLPERR_S) +#define RPLPERR_F RPLPERR_V(1U) + +#define RXWRPERR_S 25 +#define RXWRPERR_V(x) ((x) << RXWRPERR_S) +#define 
RXWRPERR_F RXWRPERR_V(1U) + +#define RXCPLPERR_S 24 +#define RXCPLPERR_V(x) ((x) << RXCPLPERR_S) +#define RXCPLPERR_F RXCPLPERR_V(1U) + +#define PIOTAGPERR_S 23 +#define PIOTAGPERR_V(x) ((x) << PIOTAGPERR_S) +#define PIOTAGPERR_F PIOTAGPERR_V(1U) + +#define MATAGPERR_S 22 +#define MATAGPERR_V(x) ((x) << MATAGPERR_S) +#define MATAGPERR_F MATAGPERR_V(1U) + +#define INTXCLRPERR_S 21 +#define INTXCLRPERR_V(x) ((x) << INTXCLRPERR_S) +#define INTXCLRPERR_F INTXCLRPERR_V(1U) + +#define FIDPERR_S 20 +#define FIDPERR_V(x) ((x) << FIDPERR_S) +#define FIDPERR_F FIDPERR_V(1U) + +#define CFGSNPPERR_S 19 +#define CFGSNPPERR_V(x) ((x) << CFGSNPPERR_S) +#define CFGSNPPERR_F CFGSNPPERR_V(1U) + +#define HRSPPERR_S 18 +#define HRSPPERR_V(x) ((x) << HRSPPERR_S) +#define HRSPPERR_F HRSPPERR_V(1U) + +#define HREQPERR_S 17 +#define HREQPERR_V(x) ((x) << HREQPERR_S) +#define HREQPERR_F HREQPERR_V(1U) + +#define HCNTPERR_S 16 +#define HCNTPERR_V(x) ((x) << HCNTPERR_S) +#define HCNTPERR_F HCNTPERR_V(1U) + +#define DRSPPERR_S 15 +#define DRSPPERR_V(x) ((x) << DRSPPERR_S) +#define DRSPPERR_F DRSPPERR_V(1U) + +#define DREQPERR_S 14 +#define DREQPERR_V(x) ((x) << DREQPERR_S) +#define DREQPERR_F DREQPERR_V(1U) + +#define DCNTPERR_S 13 +#define DCNTPERR_V(x) ((x) << DCNTPERR_S) +#define DCNTPERR_F DCNTPERR_V(1U) + +#define CRSPPERR_S 12 +#define CRSPPERR_V(x) ((x) << CRSPPERR_S) +#define CRSPPERR_F CRSPPERR_V(1U) + +#define CREQPERR_S 11 +#define CREQPERR_V(x) ((x) << CREQPERR_S) +#define CREQPERR_F CREQPERR_V(1U) + +#define CCNTPERR_S 10 +#define CCNTPERR_V(x) ((x) << CCNTPERR_S) +#define CCNTPERR_F CCNTPERR_V(1U) + +#define TARTAGPERR_S 9 +#define TARTAGPERR_V(x) ((x) << TARTAGPERR_S) +#define TARTAGPERR_F TARTAGPERR_V(1U) + +#define PIOREQPERR_S 8 +#define PIOREQPERR_V(x) ((x) << PIOREQPERR_S) +#define PIOREQPERR_F PIOREQPERR_V(1U) + +#define PIOCPLPERR_S 7 +#define PIOCPLPERR_V(x) ((x) << PIOCPLPERR_S) +#define PIOCPLPERR_F PIOCPLPERR_V(1U) + +#define MSIXDIPERR_S 6 +#define MSIXDIPERR_V(x) ((x) << MSIXDIPERR_S) +#define MSIXDIPERR_F MSIXDIPERR_V(1U) + +#define MSIXDATAPERR_S 5 +#define MSIXDATAPERR_V(x) ((x) << MSIXDATAPERR_S) +#define MSIXDATAPERR_F MSIXDATAPERR_V(1U) + +#define MSIXADDRHPERR_S 4 +#define MSIXADDRHPERR_V(x) ((x) << MSIXADDRHPERR_S) +#define MSIXADDRHPERR_F MSIXADDRHPERR_V(1U) + +#define MSIXADDRLPERR_S 3 +#define MSIXADDRLPERR_V(x) ((x) << MSIXADDRLPERR_S) +#define MSIXADDRLPERR_F MSIXADDRLPERR_V(1U) + +#define MSIDATAPERR_S 2 +#define MSIDATAPERR_V(x) ((x) << MSIDATAPERR_S) +#define MSIDATAPERR_F MSIDATAPERR_V(1U) + +#define MSIADDRHPERR_S 1 +#define MSIADDRHPERR_V(x) ((x) << MSIADDRHPERR_S) +#define MSIADDRHPERR_F MSIADDRHPERR_V(1U) + +#define MSIADDRLPERR_S 0 +#define MSIADDRLPERR_V(x) ((x) << MSIADDRLPERR_S) +#define MSIADDRLPERR_F MSIADDRLPERR_V(1U) + +#define READRSPERR_S 29 +#define READRSPERR_V(x) ((x) << READRSPERR_S) +#define READRSPERR_F READRSPERR_V(1U) + +#define TRGT1GRPPERR_S 28 +#define TRGT1GRPPERR_V(x) ((x) << TRGT1GRPPERR_S) +#define TRGT1GRPPERR_F TRGT1GRPPERR_V(1U) + +#define IPSOTPERR_S 27 +#define IPSOTPERR_V(x) ((x) << IPSOTPERR_S) +#define IPSOTPERR_F IPSOTPERR_V(1U) + +#define IPRETRYPERR_S 26 +#define IPRETRYPERR_V(x) ((x) << IPRETRYPERR_S) +#define IPRETRYPERR_F IPRETRYPERR_V(1U) + +#define IPRXDATAGRPPERR_S 25 +#define IPRXDATAGRPPERR_V(x) ((x) << IPRXDATAGRPPERR_S) +#define IPRXDATAGRPPERR_F IPRXDATAGRPPERR_V(1U) + +#define IPRXHDRGRPPERR_S 24 +#define IPRXHDRGRPPERR_V(x) ((x) << IPRXHDRGRPPERR_S) +#define IPRXHDRGRPPERR_F IPRXHDRGRPPERR_V(1U) + +#define MAGRPPERR_S 22 
+#define MAGRPPERR_V(x) ((x) << MAGRPPERR_S) +#define MAGRPPERR_F MAGRPPERR_V(1U) + +#define VFIDPERR_S 21 +#define VFIDPERR_V(x) ((x) << VFIDPERR_S) +#define VFIDPERR_F VFIDPERR_V(1U) + +#define HREQWRPERR_S 16 +#define HREQWRPERR_V(x) ((x) << HREQWRPERR_S) +#define HREQWRPERR_F HREQWRPERR_V(1U) + +#define DREQWRPERR_S 13 +#define DREQWRPERR_V(x) ((x) << DREQWRPERR_S) +#define DREQWRPERR_F DREQWRPERR_V(1U) + +#define CREQRDPERR_S 11 +#define CREQRDPERR_V(x) ((x) << CREQRDPERR_S) +#define CREQRDPERR_F CREQRDPERR_V(1U) + +#define MSTTAGQPERR_S 10 +#define MSTTAGQPERR_V(x) ((x) << MSTTAGQPERR_S) +#define MSTTAGQPERR_F MSTTAGQPERR_V(1U) + +#define PIOREQGRPPERR_S 8 +#define PIOREQGRPPERR_V(x) ((x) << PIOREQGRPPERR_S) +#define PIOREQGRPPERR_F PIOREQGRPPERR_V(1U) + +#define PIOCPLGRPPERR_S 7 +#define PIOCPLGRPPERR_V(x) ((x) << PIOCPLGRPPERR_S) +#define PIOCPLGRPPERR_F PIOCPLGRPPERR_V(1U) + +#define MSIXSTIPERR_S 2 +#define MSIXSTIPERR_V(x) ((x) << MSIXSTIPERR_S) +#define MSIXSTIPERR_F MSIXSTIPERR_V(1U) + +#define MSTTIMEOUTPERR_S 1 +#define MSTTIMEOUTPERR_V(x) ((x) << MSTTIMEOUTPERR_S) +#define MSTTIMEOUTPERR_F MSTTIMEOUTPERR_V(1U) + +#define MSTGRPPERR_S 0 +#define MSTGRPPERR_V(x) ((x) << MSTGRPPERR_S) +#define MSTGRPPERR_F MSTGRPPERR_V(1U) + +#define PCIE_NONFAT_ERR_A 0x3010 +#define PCIE_CFG_SPACE_REQ_A 0x3060 +#define PCIE_CFG_SPACE_DATA_A 0x3064 +#define PCIE_MEM_ACCESS_BASE_WIN_A 0x3068 + +#define PCIEOFST_S 10 +#define PCIEOFST_M 0x3fffffU +#define PCIEOFST_G(x) (((x) >> PCIEOFST_S) & PCIEOFST_M) + +#define BIR_S 8 +#define BIR_M 0x3U +#define BIR_V(x) ((x) << BIR_S) +#define BIR_G(x) (((x) >> BIR_S) & BIR_M) + +#define WINDOW_S 0 +#define WINDOW_M 0xffU +#define WINDOW_V(x) ((x) << WINDOW_S) +#define WINDOW_G(x) (((x) >> WINDOW_S) & WINDOW_M) + +#define PCIE_MEM_ACCESS_OFFSET_A 0x306c + +#define ENABLE_S 30 +#define ENABLE_V(x) ((x) << ENABLE_S) +#define ENABLE_F ENABLE_V(1U) + +#define LOCALCFG_S 28 +#define LOCALCFG_V(x) ((x) << LOCALCFG_S) +#define LOCALCFG_F LOCALCFG_V(1U) + +#define FUNCTION_S 12 +#define FUNCTION_V(x) ((x) << FUNCTION_S) + +#define REGISTER_S 0 +#define REGISTER_V(x) ((x) << REGISTER_S) + +#define T6_ENABLE_S 31 +#define T6_ENABLE_V(x) ((x) << T6_ENABLE_S) +#define T6_ENABLE_F T6_ENABLE_V(1U) + +#define PFNUM_S 0 +#define PFNUM_V(x) ((x) << PFNUM_S) + +#define PCIE_FW_A 0x30b8 +#define PCIE_FW_PF_A 0x30bc + +#define PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS_A 0x5908 + +#define RNPP_S 31 +#define RNPP_V(x) ((x) << RNPP_S) +#define RNPP_F RNPP_V(1U) + +#define RPCP_S 29 +#define RPCP_V(x) ((x) << RPCP_S) +#define RPCP_F RPCP_V(1U) + +#define RCIP_S 27 +#define RCIP_V(x) ((x) << RCIP_S) +#define RCIP_F RCIP_V(1U) + +#define RCCP_S 26 +#define RCCP_V(x) ((x) << RCCP_S) +#define RCCP_F RCCP_V(1U) + +#define RFTP_S 23 +#define RFTP_V(x) ((x) << RFTP_S) +#define RFTP_F RFTP_V(1U) + +#define PTRP_S 20 +#define PTRP_V(x) ((x) << PTRP_S) +#define PTRP_F PTRP_V(1U) + +#define PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS_A 0x59a4 + +#define TPCP_S 30 +#define TPCP_V(x) ((x) << TPCP_S) +#define TPCP_F TPCP_V(1U) + +#define TNPP_S 29 +#define TNPP_V(x) ((x) << TNPP_S) +#define TNPP_F TNPP_V(1U) + +#define TFTP_S 28 +#define TFTP_V(x) ((x) << TFTP_S) +#define TFTP_F TFTP_V(1U) + +#define TCAP_S 27 +#define TCAP_V(x) ((x) << TCAP_S) +#define TCAP_F TCAP_V(1U) + +#define TCIP_S 26 +#define TCIP_V(x) ((x) << TCIP_S) +#define TCIP_F TCIP_V(1U) + +#define RCAP_S 25 +#define RCAP_V(x) ((x) << RCAP_S) +#define RCAP_F RCAP_V(1U) + +#define PLUP_S 23 +#define PLUP_V(x) ((x) << PLUP_S) +#define 
PLUP_F PLUP_V(1U) + +#define PLDN_S 22 +#define PLDN_V(x) ((x) << PLDN_S) +#define PLDN_F PLDN_V(1U) + +#define OTDD_S 21 +#define OTDD_V(x) ((x) << OTDD_S) +#define OTDD_F OTDD_V(1U) + +#define GTRP_S 20 +#define GTRP_V(x) ((x) << GTRP_S) +#define GTRP_F GTRP_V(1U) + +#define RDPE_S 18 +#define RDPE_V(x) ((x) << RDPE_S) +#define RDPE_F RDPE_V(1U) + +#define TDCE_S 17 +#define TDCE_V(x) ((x) << TDCE_S) +#define TDCE_F TDCE_V(1U) + +#define TDUE_S 16 +#define TDUE_V(x) ((x) << TDUE_S) +#define TDUE_F TDUE_V(1U) + +/* registers for module MC */ +#define MC_INT_CAUSE_A 0x7518 +#define MC_P_INT_CAUSE_A 0x41318 + +#define ECC_UE_INT_CAUSE_S 2 +#define ECC_UE_INT_CAUSE_V(x) ((x) << ECC_UE_INT_CAUSE_S) +#define ECC_UE_INT_CAUSE_F ECC_UE_INT_CAUSE_V(1U) + +#define ECC_CE_INT_CAUSE_S 1 +#define ECC_CE_INT_CAUSE_V(x) ((x) << ECC_CE_INT_CAUSE_S) +#define ECC_CE_INT_CAUSE_F ECC_CE_INT_CAUSE_V(1U) + +#define PERR_INT_CAUSE_S 0 +#define PERR_INT_CAUSE_V(x) ((x) << PERR_INT_CAUSE_S) +#define PERR_INT_CAUSE_F PERR_INT_CAUSE_V(1U) + +#define DBG_GPIO_EN_A 0x6010 +#define XGMAC_PORT_CFG_A 0x1000 +#define MAC_PORT_CFG_A 0x800 + +#define SIGNAL_DET_S 14 +#define SIGNAL_DET_V(x) ((x) << SIGNAL_DET_S) +#define SIGNAL_DET_F SIGNAL_DET_V(1U) + +#define MC_ECC_STATUS_A 0x751c +#define MC_P_ECC_STATUS_A 0x4131c + +#define ECC_CECNT_S 16 +#define ECC_CECNT_M 0xffffU +#define ECC_CECNT_V(x) ((x) << ECC_CECNT_S) +#define ECC_CECNT_G(x) (((x) >> ECC_CECNT_S) & ECC_CECNT_M) + +#define ECC_UECNT_S 0 +#define ECC_UECNT_M 0xffffU +#define ECC_UECNT_V(x) ((x) << ECC_UECNT_S) +#define ECC_UECNT_G(x) (((x) >> ECC_UECNT_S) & ECC_UECNT_M) + +#define MC_BIST_CMD_A 0x7600 + +#define START_BIST_S 31 +#define START_BIST_V(x) ((x) << START_BIST_S) +#define START_BIST_F START_BIST_V(1U) + +#define BIST_CMD_GAP_S 8 +#define BIST_CMD_GAP_V(x) ((x) << BIST_CMD_GAP_S) + +#define BIST_OPCODE_S 0 +#define BIST_OPCODE_V(x) ((x) << BIST_OPCODE_S) + +#define MC_BIST_CMD_ADDR_A 0x7604 +#define MC_BIST_CMD_LEN_A 0x7608 +#define MC_BIST_DATA_PATTERN_A 0x760c + +#define MC_BIST_STATUS_RDATA_A 0x7688 + +/* registers for module MA */ +#define MA_EDRAM0_BAR_A 0x77c0 + +#define EDRAM0_BASE_S 16 +#define EDRAM0_BASE_M 0xfffU +#define EDRAM0_BASE_G(x) (((x) >> EDRAM0_BASE_S) & EDRAM0_BASE_M) + +#define EDRAM0_SIZE_S 0 +#define EDRAM0_SIZE_M 0xfffU +#define EDRAM0_SIZE_V(x) ((x) << EDRAM0_SIZE_S) +#define EDRAM0_SIZE_G(x) (((x) >> EDRAM0_SIZE_S) & EDRAM0_SIZE_M) + +#define MA_EDRAM1_BAR_A 0x77c4 + +#define EDRAM1_BASE_S 16 +#define EDRAM1_BASE_M 0xfffU +#define EDRAM1_BASE_G(x) (((x) >> EDRAM1_BASE_S) & EDRAM1_BASE_M) + +#define EDRAM1_SIZE_S 0 +#define EDRAM1_SIZE_M 0xfffU +#define EDRAM1_SIZE_V(x) ((x) << EDRAM1_SIZE_S) +#define EDRAM1_SIZE_G(x) (((x) >> EDRAM1_SIZE_S) & EDRAM1_SIZE_M) + +#define MA_EXT_MEMORY_BAR_A 0x77c8 + +#define EXT_MEM_BASE_S 16 +#define EXT_MEM_BASE_M 0xfffU +#define EXT_MEM_BASE_V(x) ((x) << EXT_MEM_BASE_S) +#define EXT_MEM_BASE_G(x) (((x) >> EXT_MEM_BASE_S) & EXT_MEM_BASE_M) + +#define EXT_MEM_SIZE_S 0 +#define EXT_MEM_SIZE_M 0xfffU +#define EXT_MEM_SIZE_V(x) ((x) << EXT_MEM_SIZE_S) +#define EXT_MEM_SIZE_G(x) (((x) >> EXT_MEM_SIZE_S) & EXT_MEM_SIZE_M) + +#define MA_EXT_MEMORY1_BAR_A 0x7808 + +#define HMA_MUX_S 5 +#define HMA_MUX_V(x) ((x) << HMA_MUX_S) +#define HMA_MUX_F HMA_MUX_V(1U) + +#define EXT_MEM1_BASE_S 16 +#define EXT_MEM1_BASE_M 0xfffU +#define EXT_MEM1_BASE_G(x) (((x) >> EXT_MEM1_BASE_S) & EXT_MEM1_BASE_M) + +#define EXT_MEM1_SIZE_S 0 +#define EXT_MEM1_SIZE_M 0xfffU +#define EXT_MEM1_SIZE_V(x) ((x) << 
EXT_MEM1_SIZE_S) +#define EXT_MEM1_SIZE_G(x) (((x) >> EXT_MEM1_SIZE_S) & EXT_MEM1_SIZE_M) + +#define MA_EXT_MEMORY0_BAR_A 0x77c8 + +#define EXT_MEM0_BASE_S 16 +#define EXT_MEM0_BASE_M 0xfffU +#define EXT_MEM0_BASE_G(x) (((x) >> EXT_MEM0_BASE_S) & EXT_MEM0_BASE_M) + +#define EXT_MEM0_SIZE_S 0 +#define EXT_MEM0_SIZE_M 0xfffU +#define EXT_MEM0_SIZE_V(x) ((x) << EXT_MEM0_SIZE_S) +#define EXT_MEM0_SIZE_G(x) (((x) >> EXT_MEM0_SIZE_S) & EXT_MEM0_SIZE_M) + +#define MA_TARGET_MEM_ENABLE_A 0x77d8 + +#define EXT_MEM_ENABLE_S 2 +#define EXT_MEM_ENABLE_V(x) ((x) << EXT_MEM_ENABLE_S) +#define EXT_MEM_ENABLE_F EXT_MEM_ENABLE_V(1U) + +#define EDRAM1_ENABLE_S 1 +#define EDRAM1_ENABLE_V(x) ((x) << EDRAM1_ENABLE_S) +#define EDRAM1_ENABLE_F EDRAM1_ENABLE_V(1U) + +#define EDRAM0_ENABLE_S 0 +#define EDRAM0_ENABLE_V(x) ((x) << EDRAM0_ENABLE_S) +#define EDRAM0_ENABLE_F EDRAM0_ENABLE_V(1U) + +#define EXT_MEM1_ENABLE_S 4 +#define EXT_MEM1_ENABLE_V(x) ((x) << EXT_MEM1_ENABLE_S) +#define EXT_MEM1_ENABLE_F EXT_MEM1_ENABLE_V(1U) + +#define EXT_MEM0_ENABLE_S 2 +#define EXT_MEM0_ENABLE_V(x) ((x) << EXT_MEM0_ENABLE_S) +#define EXT_MEM0_ENABLE_F EXT_MEM0_ENABLE_V(1U) + +#define MA_INT_CAUSE_A 0x77e0 + +#define MEM_PERR_INT_CAUSE_S 1 +#define MEM_PERR_INT_CAUSE_V(x) ((x) << MEM_PERR_INT_CAUSE_S) +#define MEM_PERR_INT_CAUSE_F MEM_PERR_INT_CAUSE_V(1U) + +#define MEM_WRAP_INT_CAUSE_S 0 +#define MEM_WRAP_INT_CAUSE_V(x) ((x) << MEM_WRAP_INT_CAUSE_S) +#define MEM_WRAP_INT_CAUSE_F MEM_WRAP_INT_CAUSE_V(1U) + +#define MA_INT_WRAP_STATUS_A 0x77e4 + +#define MEM_WRAP_ADDRESS_S 4 +#define MEM_WRAP_ADDRESS_M 0xfffffffU +#define MEM_WRAP_ADDRESS_G(x) (((x) >> MEM_WRAP_ADDRESS_S) & MEM_WRAP_ADDRESS_M) + +#define MEM_WRAP_CLIENT_NUM_S 0 +#define MEM_WRAP_CLIENT_NUM_M 0xfU +#define MEM_WRAP_CLIENT_NUM_G(x) \ + (((x) >> MEM_WRAP_CLIENT_NUM_S) & MEM_WRAP_CLIENT_NUM_M) + +#define MA_PARITY_ERROR_STATUS_A 0x77f4 +#define MA_PARITY_ERROR_STATUS1_A 0x77f4 +#define MA_PARITY_ERROR_STATUS2_A 0x7804 + +/* registers for module EDC_0 */ +#define EDC_0_BASE_ADDR 0x7900 + +#define EDC_BIST_CMD_A 0x7904 +#define EDC_BIST_CMD_ADDR_A 0x7908 +#define EDC_BIST_CMD_LEN_A 0x790c +#define EDC_BIST_DATA_PATTERN_A 0x7910 +#define EDC_BIST_STATUS_RDATA_A 0x7928 +#define EDC_INT_CAUSE_A 0x7978 + +#define ECC_UE_PAR_S 5 +#define ECC_UE_PAR_V(x) ((x) << ECC_UE_PAR_S) +#define ECC_UE_PAR_F ECC_UE_PAR_V(1U) + +#define ECC_CE_PAR_S 4 +#define ECC_CE_PAR_V(x) ((x) << ECC_CE_PAR_S) +#define ECC_CE_PAR_F ECC_CE_PAR_V(1U) + +#define PERR_PAR_CAUSE_S 3 +#define PERR_PAR_CAUSE_V(x) ((x) << PERR_PAR_CAUSE_S) +#define PERR_PAR_CAUSE_F PERR_PAR_CAUSE_V(1U) + +#define EDC_ECC_STATUS_A 0x797c + +/* registers for module EDC_1 */ +#define EDC_1_BASE_ADDR 0x7980 + +/* registers for module CIM */ +#define CIM_BOOT_CFG_A 0x7b00 +#define CIM_SDRAM_BASE_ADDR_A 0x7b14 +#define CIM_SDRAM_ADDR_SIZE_A 0x7b18 +#define CIM_EXTMEM2_BASE_ADDR_A 0x7b1c +#define CIM_EXTMEM2_ADDR_SIZE_A 0x7b20 +#define CIM_PF_MAILBOX_CTRL_SHADOW_COPY_A 0x290 + +#define BOOTADDR_M 0xffffff00U + +#define UPCRST_S 0 +#define UPCRST_V(x) ((x) << UPCRST_S) +#define UPCRST_F UPCRST_V(1U) + +#define CIM_PF_MAILBOX_DATA_A 0x240 +#define CIM_PF_MAILBOX_CTRL_A 0x280 + +#define MBMSGVALID_S 3 +#define MBMSGVALID_V(x) ((x) << MBMSGVALID_S) +#define MBMSGVALID_F MBMSGVALID_V(1U) + +#define MBINTREQ_S 2 +#define MBINTREQ_V(x) ((x) << MBINTREQ_S) +#define MBINTREQ_F MBINTREQ_V(1U) + +#define MBOWNER_S 0 +#define MBOWNER_M 0x3U +#define MBOWNER_V(x) ((x) << MBOWNER_S) +#define MBOWNER_G(x) (((x) >> MBOWNER_S) & MBOWNER_M) + 
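+/*
+ * Minimal usage sketch for the field macros in this file, illustrative
+ * only, assuming an adapter handle "adap" and a register read helper
+ * such as t4_read_reg(): for a field FOO, FOO_S is its bit offset,
+ * FOO_M the unshifted mask, FOO_V(x) shifts a value into place,
+ * FOO_G(x) extracts the field from a register word, and FOO_F is the
+ * single-bit form FOO_V(1U).
+ *
+ *	u32 ctl = t4_read_reg(adap, CIM_PF_MAILBOX_CTRL_A);
+ *	unsigned int owner = MBOWNER_G(ctl);		/* 2-bit owner field */
+ *	bool msg_valid = (ctl & MBMSGVALID_F) != 0;	/* single-bit flag */
+ */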
+#define CIM_PF_HOST_INT_ENABLE_A 0x288 + +#define MBMSGRDYINTEN_S 19 +#define MBMSGRDYINTEN_V(x) ((x) << MBMSGRDYINTEN_S) +#define MBMSGRDYINTEN_F MBMSGRDYINTEN_V(1U) + +#define CIM_PF_HOST_INT_CAUSE_A 0x28c + +#define MBMSGRDYINT_S 19 +#define MBMSGRDYINT_V(x) ((x) << MBMSGRDYINT_S) +#define MBMSGRDYINT_F MBMSGRDYINT_V(1U) + +#define CIM_HOST_INT_CAUSE_A 0x7b2c + +#define TIEQOUTPARERRINT_S 20 +#define TIEQOUTPARERRINT_V(x) ((x) << TIEQOUTPARERRINT_S) +#define TIEQOUTPARERRINT_F TIEQOUTPARERRINT_V(1U) + +#define TIEQINPARERRINT_S 19 +#define TIEQINPARERRINT_V(x) ((x) << TIEQINPARERRINT_S) +#define TIEQINPARERRINT_F TIEQINPARERRINT_V(1U) + +#define TIMER0INT_S 2 +#define TIMER0INT_V(x) ((x) << TIMER0INT_S) +#define TIMER0INT_F TIMER0INT_V(1U) + +#define PREFDROPINT_S 1 +#define PREFDROPINT_V(x) ((x) << PREFDROPINT_S) +#define PREFDROPINT_F PREFDROPINT_V(1U) + +#define UPACCNONZERO_S 0 +#define UPACCNONZERO_V(x) ((x) << UPACCNONZERO_S) +#define UPACCNONZERO_F UPACCNONZERO_V(1U) + +#define MBHOSTPARERR_S 18 +#define MBHOSTPARERR_V(x) ((x) << MBHOSTPARERR_S) +#define MBHOSTPARERR_F MBHOSTPARERR_V(1U) + +#define MBUPPARERR_S 17 +#define MBUPPARERR_V(x) ((x) << MBUPPARERR_S) +#define MBUPPARERR_F MBUPPARERR_V(1U) + +#define IBQTP0PARERR_S 16 +#define IBQTP0PARERR_V(x) ((x) << IBQTP0PARERR_S) +#define IBQTP0PARERR_F IBQTP0PARERR_V(1U) + +#define IBQTP1PARERR_S 15 +#define IBQTP1PARERR_V(x) ((x) << IBQTP1PARERR_S) +#define IBQTP1PARERR_F IBQTP1PARERR_V(1U) + +#define IBQULPPARERR_S 14 +#define IBQULPPARERR_V(x) ((x) << IBQULPPARERR_S) +#define IBQULPPARERR_F IBQULPPARERR_V(1U) + +#define IBQSGELOPARERR_S 13 +#define IBQSGELOPARERR_V(x) ((x) << IBQSGELOPARERR_S) +#define IBQSGELOPARERR_F IBQSGELOPARERR_V(1U) + +#define IBQSGEHIPARERR_S 12 +#define IBQSGEHIPARERR_V(x) ((x) << IBQSGEHIPARERR_S) +#define IBQSGEHIPARERR_F IBQSGEHIPARERR_V(1U) + +#define IBQNCSIPARERR_S 11 +#define IBQNCSIPARERR_V(x) ((x) << IBQNCSIPARERR_S) +#define IBQNCSIPARERR_F IBQNCSIPARERR_V(1U) + +#define OBQULP0PARERR_S 10 +#define OBQULP0PARERR_V(x) ((x) << OBQULP0PARERR_S) +#define OBQULP0PARERR_F OBQULP0PARERR_V(1U) + +#define OBQULP1PARERR_S 9 +#define OBQULP1PARERR_V(x) ((x) << OBQULP1PARERR_S) +#define OBQULP1PARERR_F OBQULP1PARERR_V(1U) + +#define OBQULP2PARERR_S 8 +#define OBQULP2PARERR_V(x) ((x) << OBQULP2PARERR_S) +#define OBQULP2PARERR_F OBQULP2PARERR_V(1U) + +#define OBQULP3PARERR_S 7 +#define OBQULP3PARERR_V(x) ((x) << OBQULP3PARERR_S) +#define OBQULP3PARERR_F OBQULP3PARERR_V(1U) + +#define OBQSGEPARERR_S 6 +#define OBQSGEPARERR_V(x) ((x) << OBQSGEPARERR_S) +#define OBQSGEPARERR_F OBQSGEPARERR_V(1U) + +#define OBQNCSIPARERR_S 5 +#define OBQNCSIPARERR_V(x) ((x) << OBQNCSIPARERR_S) +#define OBQNCSIPARERR_F OBQNCSIPARERR_V(1U) + +#define CIM_HOST_UPACC_INT_CAUSE_A 0x7b34 + +#define EEPROMWRINT_S 30 +#define EEPROMWRINT_V(x) ((x) << EEPROMWRINT_S) +#define EEPROMWRINT_F EEPROMWRINT_V(1U) + +#define TIMEOUTMAINT_S 29 +#define TIMEOUTMAINT_V(x) ((x) << TIMEOUTMAINT_S) +#define TIMEOUTMAINT_F TIMEOUTMAINT_V(1U) + +#define TIMEOUTINT_S 28 +#define TIMEOUTINT_V(x) ((x) << TIMEOUTINT_S) +#define TIMEOUTINT_F TIMEOUTINT_V(1U) + +#define RSPOVRLOOKUPINT_S 27 +#define RSPOVRLOOKUPINT_V(x) ((x) << RSPOVRLOOKUPINT_S) +#define RSPOVRLOOKUPINT_F RSPOVRLOOKUPINT_V(1U) + +#define REQOVRLOOKUPINT_S 26 +#define REQOVRLOOKUPINT_V(x) ((x) << REQOVRLOOKUPINT_S) +#define REQOVRLOOKUPINT_F REQOVRLOOKUPINT_V(1U) + +#define BLKWRPLINT_S 25 +#define BLKWRPLINT_V(x) ((x) << BLKWRPLINT_S) +#define BLKWRPLINT_F BLKWRPLINT_V(1U) + +#define 
BLKRDPLINT_S 24 +#define BLKRDPLINT_V(x) ((x) << BLKRDPLINT_S) +#define BLKRDPLINT_F BLKRDPLINT_V(1U) + +#define SGLWRPLINT_S 23 +#define SGLWRPLINT_V(x) ((x) << SGLWRPLINT_S) +#define SGLWRPLINT_F SGLWRPLINT_V(1U) + +#define SGLRDPLINT_S 22 +#define SGLRDPLINT_V(x) ((x) << SGLRDPLINT_S) +#define SGLRDPLINT_F SGLRDPLINT_V(1U) + +#define BLKWRCTLINT_S 21 +#define BLKWRCTLINT_V(x) ((x) << BLKWRCTLINT_S) +#define BLKWRCTLINT_F BLKWRCTLINT_V(1U) + +#define BLKRDCTLINT_S 20 +#define BLKRDCTLINT_V(x) ((x) << BLKRDCTLINT_S) +#define BLKRDCTLINT_F BLKRDCTLINT_V(1U) + +#define SGLWRCTLINT_S 19 +#define SGLWRCTLINT_V(x) ((x) << SGLWRCTLINT_S) +#define SGLWRCTLINT_F SGLWRCTLINT_V(1U) + +#define SGLRDCTLINT_S 18 +#define SGLRDCTLINT_V(x) ((x) << SGLRDCTLINT_S) +#define SGLRDCTLINT_F SGLRDCTLINT_V(1U) + +#define BLKWREEPROMINT_S 17 +#define BLKWREEPROMINT_V(x) ((x) << BLKWREEPROMINT_S) +#define BLKWREEPROMINT_F BLKWREEPROMINT_V(1U) + +#define BLKRDEEPROMINT_S 16 +#define BLKRDEEPROMINT_V(x) ((x) << BLKRDEEPROMINT_S) +#define BLKRDEEPROMINT_F BLKRDEEPROMINT_V(1U) + +#define SGLWREEPROMINT_S 15 +#define SGLWREEPROMINT_V(x) ((x) << SGLWREEPROMINT_S) +#define SGLWREEPROMINT_F SGLWREEPROMINT_V(1U) + +#define SGLRDEEPROMINT_S 14 +#define SGLRDEEPROMINT_V(x) ((x) << SGLRDEEPROMINT_S) +#define SGLRDEEPROMINT_F SGLRDEEPROMINT_V(1U) + +#define BLKWRFLASHINT_S 13 +#define BLKWRFLASHINT_V(x) ((x) << BLKWRFLASHINT_S) +#define BLKWRFLASHINT_F BLKWRFLASHINT_V(1U) + +#define BLKRDFLASHINT_S 12 +#define BLKRDFLASHINT_V(x) ((x) << BLKRDFLASHINT_S) +#define BLKRDFLASHINT_F BLKRDFLASHINT_V(1U) + +#define SGLWRFLASHINT_S 11 +#define SGLWRFLASHINT_V(x) ((x) << SGLWRFLASHINT_S) +#define SGLWRFLASHINT_F SGLWRFLASHINT_V(1U) + +#define SGLRDFLASHINT_S 10 +#define SGLRDFLASHINT_V(x) ((x) << SGLRDFLASHINT_S) +#define SGLRDFLASHINT_F SGLRDFLASHINT_V(1U) + +#define BLKWRBOOTINT_S 9 +#define BLKWRBOOTINT_V(x) ((x) << BLKWRBOOTINT_S) +#define BLKWRBOOTINT_F BLKWRBOOTINT_V(1U) + +#define BLKRDBOOTINT_S 8 +#define BLKRDBOOTINT_V(x) ((x) << BLKRDBOOTINT_S) +#define BLKRDBOOTINT_F BLKRDBOOTINT_V(1U) + +#define SGLWRBOOTINT_S 7 +#define SGLWRBOOTINT_V(x) ((x) << SGLWRBOOTINT_S) +#define SGLWRBOOTINT_F SGLWRBOOTINT_V(1U) + +#define SGLRDBOOTINT_S 6 +#define SGLRDBOOTINT_V(x) ((x) << SGLRDBOOTINT_S) +#define SGLRDBOOTINT_F SGLRDBOOTINT_V(1U) + +#define ILLWRBEINT_S 5 +#define ILLWRBEINT_V(x) ((x) << ILLWRBEINT_S) +#define ILLWRBEINT_F ILLWRBEINT_V(1U) + +#define ILLRDBEINT_S 4 +#define ILLRDBEINT_V(x) ((x) << ILLRDBEINT_S) +#define ILLRDBEINT_F ILLRDBEINT_V(1U) + +#define ILLRDINT_S 3 +#define ILLRDINT_V(x) ((x) << ILLRDINT_S) +#define ILLRDINT_F ILLRDINT_V(1U) + +#define ILLWRINT_S 2 +#define ILLWRINT_V(x) ((x) << ILLWRINT_S) +#define ILLWRINT_F ILLWRINT_V(1U) + +#define ILLTRANSINT_S 1 +#define ILLTRANSINT_V(x) ((x) << ILLTRANSINT_S) +#define ILLTRANSINT_F ILLTRANSINT_V(1U) + +#define RSVDSPACEINT_S 0 +#define RSVDSPACEINT_V(x) ((x) << RSVDSPACEINT_S) +#define RSVDSPACEINT_F RSVDSPACEINT_V(1U) + +/* registers for module TP */ +#define DBGLAWHLF_S 23 +#define DBGLAWHLF_V(x) ((x) << DBGLAWHLF_S) +#define DBGLAWHLF_F DBGLAWHLF_V(1U) + +#define DBGLAWPTR_S 16 +#define DBGLAWPTR_M 0x7fU +#define DBGLAWPTR_G(x) (((x) >> DBGLAWPTR_S) & DBGLAWPTR_M) + +#define DBGLAENABLE_S 12 +#define DBGLAENABLE_V(x) ((x) << DBGLAENABLE_S) +#define DBGLAENABLE_F DBGLAENABLE_V(1U) + +#define DBGLARPTR_S 0 +#define DBGLARPTR_M 0x7fU +#define DBGLARPTR_V(x) ((x) << DBGLARPTR_S) + +#define CRXPKTENC_S 3 +#define CRXPKTENC_V(x) ((x) << CRXPKTENC_S) +#define 
CRXPKTENC_F CRXPKTENC_V(1U) + +#define TP_DBG_LA_DATAL_A 0x7ed8 +#define TP_DBG_LA_CONFIG_A 0x7ed4 +#define TP_OUT_CONFIG_A 0x7d04 +#define TP_GLOBAL_CONFIG_A 0x7d08 + +#define TP_CMM_TCB_BASE_A 0x7d10 +#define TP_CMM_MM_BASE_A 0x7d14 +#define TP_CMM_TIMER_BASE_A 0x7d18 +#define TP_PMM_TX_BASE_A 0x7d20 +#define TP_PMM_RX_BASE_A 0x7d28 +#define TP_PMM_RX_PAGE_SIZE_A 0x7d2c +#define TP_PMM_RX_MAX_PAGE_A 0x7d30 +#define TP_PMM_TX_PAGE_SIZE_A 0x7d34 +#define TP_PMM_TX_MAX_PAGE_A 0x7d38 +#define TP_CMM_MM_MAX_PSTRUCT_A 0x7e6c + +#define PMRXNUMCHN_S 31 +#define PMRXNUMCHN_V(x) ((x) << PMRXNUMCHN_S) +#define PMRXNUMCHN_F PMRXNUMCHN_V(1U) + +#define PMTXNUMCHN_S 30 +#define PMTXNUMCHN_M 0x3U +#define PMTXNUMCHN_G(x) (((x) >> PMTXNUMCHN_S) & PMTXNUMCHN_M) + +#define PMTXMAXPAGE_S 0 +#define PMTXMAXPAGE_M 0x1fffffU +#define PMTXMAXPAGE_G(x) (((x) >> PMTXMAXPAGE_S) & PMTXMAXPAGE_M) + +#define PMRXMAXPAGE_S 0 +#define PMRXMAXPAGE_M 0x1fffffU +#define PMRXMAXPAGE_G(x) (((x) >> PMRXMAXPAGE_S) & PMRXMAXPAGE_M) + +#define DBGLAMODE_S 14 +#define DBGLAMODE_M 0x3U +#define DBGLAMODE_G(x) (((x) >> DBGLAMODE_S) & DBGLAMODE_M) + +#define FIVETUPLELOOKUP_S 17 +#define FIVETUPLELOOKUP_M 0x3U +#define FIVETUPLELOOKUP_V(x) ((x) << FIVETUPLELOOKUP_S) +#define FIVETUPLELOOKUP_G(x) (((x) >> FIVETUPLELOOKUP_S) & FIVETUPLELOOKUP_M) + +#define TP_PARA_REG2_A 0x7d68 + +#define MAXRXDATA_S 16 +#define MAXRXDATA_M 0xffffU +#define MAXRXDATA_G(x) (((x) >> MAXRXDATA_S) & MAXRXDATA_M) + +#define TP_TIMER_RESOLUTION_A 0x7d90 + +#define TIMERRESOLUTION_S 16 +#define TIMERRESOLUTION_M 0xffU +#define TIMERRESOLUTION_G(x) (((x) >> TIMERRESOLUTION_S) & TIMERRESOLUTION_M) + +#define TIMESTAMPRESOLUTION_S 8 +#define TIMESTAMPRESOLUTION_M 0xffU +#define TIMESTAMPRESOLUTION_G(x) \ + (((x) >> TIMESTAMPRESOLUTION_S) & TIMESTAMPRESOLUTION_M) + +#define DELAYEDACKRESOLUTION_S 0 +#define DELAYEDACKRESOLUTION_M 0xffU +#define DELAYEDACKRESOLUTION_G(x) \ + (((x) >> DELAYEDACKRESOLUTION_S) & DELAYEDACKRESOLUTION_M) + +#define TP_SHIFT_CNT_A 0x7dc0 +#define TP_RXT_MIN_A 0x7d98 +#define TP_RXT_MAX_A 0x7d9c +#define TP_PERS_MIN_A 0x7da0 +#define TP_PERS_MAX_A 0x7da4 +#define TP_KEEP_IDLE_A 0x7da8 +#define TP_KEEP_INTVL_A 0x7dac +#define TP_INIT_SRTT_A 0x7db0 +#define TP_DACK_TIMER_A 0x7db4 +#define TP_FINWAIT2_TIMER_A 0x7db8 + +#define INITSRTT_S 0 +#define INITSRTT_M 0xffffU +#define INITSRTT_G(x) (((x) >> INITSRTT_S) & INITSRTT_M) + +#define PERSMAX_S 0 +#define PERSMAX_M 0x3fffffffU +#define PERSMAX_V(x) ((x) << PERSMAX_S) +#define PERSMAX_G(x) (((x) >> PERSMAX_S) & PERSMAX_M) + +#define SYNSHIFTMAX_S 24 +#define SYNSHIFTMAX_M 0xffU +#define SYNSHIFTMAX_V(x) ((x) << SYNSHIFTMAX_S) +#define SYNSHIFTMAX_G(x) (((x) >> SYNSHIFTMAX_S) & SYNSHIFTMAX_M) + +#define RXTSHIFTMAXR1_S 20 +#define RXTSHIFTMAXR1_M 0xfU +#define RXTSHIFTMAXR1_V(x) ((x) << RXTSHIFTMAXR1_S) +#define RXTSHIFTMAXR1_G(x) (((x) >> RXTSHIFTMAXR1_S) & RXTSHIFTMAXR1_M) + +#define RXTSHIFTMAXR2_S 16 +#define RXTSHIFTMAXR2_M 0xfU +#define RXTSHIFTMAXR2_V(x) ((x) << RXTSHIFTMAXR2_S) +#define RXTSHIFTMAXR2_G(x) (((x) >> RXTSHIFTMAXR2_S) & RXTSHIFTMAXR2_M) + +#define PERSHIFTBACKOFFMAX_S 12 +#define PERSHIFTBACKOFFMAX_M 0xfU +#define PERSHIFTBACKOFFMAX_V(x) ((x) << PERSHIFTBACKOFFMAX_S) +#define PERSHIFTBACKOFFMAX_G(x) \ + (((x) >> PERSHIFTBACKOFFMAX_S) & PERSHIFTBACKOFFMAX_M) + +#define PERSHIFTMAX_S 8 +#define PERSHIFTMAX_M 0xfU +#define PERSHIFTMAX_V(x) ((x) << PERSHIFTMAX_S) +#define PERSHIFTMAX_G(x) (((x) >> PERSHIFTMAX_S) & PERSHIFTMAX_M) + +#define KEEPALIVEMAXR1_S 4 +#define 
KEEPALIVEMAXR1_M 0xfU +#define KEEPALIVEMAXR1_V(x) ((x) << KEEPALIVEMAXR1_S) +#define KEEPALIVEMAXR1_G(x) (((x) >> KEEPALIVEMAXR1_S) & KEEPALIVEMAXR1_M) + +#define KEEPALIVEMAXR2_S 0 +#define KEEPALIVEMAXR2_M 0xfU +#define KEEPALIVEMAXR2_V(x) ((x) << KEEPALIVEMAXR2_S) +#define KEEPALIVEMAXR2_G(x) (((x) >> KEEPALIVEMAXR2_S) & KEEPALIVEMAXR2_M) + +#define ROWINDEX_S 16 +#define ROWINDEX_V(x) ((x) << ROWINDEX_S) + +#define TP_CCTRL_TABLE_A 0x7ddc +#define TP_PACE_TABLE_A 0x7dd8 +#define TP_MTU_TABLE_A 0x7de4 + +#define MTUINDEX_S 24 +#define MTUINDEX_V(x) ((x) << MTUINDEX_S) + +#define MTUWIDTH_S 16 +#define MTUWIDTH_M 0xfU +#define MTUWIDTH_V(x) ((x) << MTUWIDTH_S) +#define MTUWIDTH_G(x) (((x) >> MTUWIDTH_S) & MTUWIDTH_M) + +#define MTUVALUE_S 0 +#define MTUVALUE_M 0x3fffU +#define MTUVALUE_V(x) ((x) << MTUVALUE_S) +#define MTUVALUE_G(x) (((x) >> MTUVALUE_S) & MTUVALUE_M) + +#define TP_RSS_LKP_TABLE_A 0x7dec +#define TP_CMM_MM_RX_FLST_BASE_A 0x7e60 +#define TP_CMM_MM_TX_FLST_BASE_A 0x7e64 +#define TP_CMM_MM_PS_FLST_BASE_A 0x7e68 + +#define LKPTBLROWVLD_S 31 +#define LKPTBLROWVLD_V(x) ((x) << LKPTBLROWVLD_S) +#define LKPTBLROWVLD_F LKPTBLROWVLD_V(1U) + +#define LKPTBLQUEUE1_S 10 +#define LKPTBLQUEUE1_M 0x3ffU +#define LKPTBLQUEUE1_G(x) (((x) >> LKPTBLQUEUE1_S) & LKPTBLQUEUE1_M) + +#define LKPTBLQUEUE0_S 0 +#define LKPTBLQUEUE0_M 0x3ffU +#define LKPTBLQUEUE0_G(x) (((x) >> LKPTBLQUEUE0_S) & LKPTBLQUEUE0_M) + +#define TP_TM_PIO_ADDR_A 0x7e18 +#define TP_TM_PIO_DATA_A 0x7e1c +#define TP_MOD_CONFIG_A 0x7e24 + +#define TIMERMODE_S 8 +#define TIMERMODE_M 0xffU +#define TIMERMODE_G(x) (((x) >> TIMERMODE_S) & TIMERMODE_M) + +#define TP_TX_MOD_Q1_Q0_TIMER_SEPARATOR_A 0x3 +#define TP_TX_MOD_Q1_Q0_RATE_LIMIT_A 0x8 + +#define TP_PIO_ADDR_A 0x7e40 +#define TP_PIO_DATA_A 0x7e44 +#define TP_MIB_INDEX_A 0x7e50 +#define TP_MIB_DATA_A 0x7e54 +#define TP_INT_CAUSE_A 0x7e74 + +#define FLMTXFLSTEMPTY_S 30 +#define FLMTXFLSTEMPTY_V(x) ((x) << FLMTXFLSTEMPTY_S) +#define FLMTXFLSTEMPTY_F FLMTXFLSTEMPTY_V(1U) + +#define TP_TX_ORATE_A 0x7ebc + +#define OFDRATE3_S 24 +#define OFDRATE3_M 0xffU +#define OFDRATE3_G(x) (((x) >> OFDRATE3_S) & OFDRATE3_M) + +#define OFDRATE2_S 16 +#define OFDRATE2_M 0xffU +#define OFDRATE2_G(x) (((x) >> OFDRATE2_S) & OFDRATE2_M) + +#define OFDRATE1_S 8 +#define OFDRATE1_M 0xffU +#define OFDRATE1_G(x) (((x) >> OFDRATE1_S) & OFDRATE1_M) + +#define OFDRATE0_S 0 +#define OFDRATE0_M 0xffU +#define OFDRATE0_G(x) (((x) >> OFDRATE0_S) & OFDRATE0_M) + +#define TP_TX_TRATE_A 0x7ed0 + +#define TNLRATE3_S 24 +#define TNLRATE3_M 0xffU +#define TNLRATE3_G(x) (((x) >> TNLRATE3_S) & TNLRATE3_M) + +#define TNLRATE2_S 16 +#define TNLRATE2_M 0xffU +#define TNLRATE2_G(x) (((x) >> TNLRATE2_S) & TNLRATE2_M) + +#define TNLRATE1_S 8 +#define TNLRATE1_M 0xffU +#define TNLRATE1_G(x) (((x) >> TNLRATE1_S) & TNLRATE1_M) + +#define TNLRATE0_S 0 +#define TNLRATE0_M 0xffU +#define TNLRATE0_G(x) (((x) >> TNLRATE0_S) & TNLRATE0_M) + +#define TP_VLAN_PRI_MAP_A 0x140 + +#define FRAGMENTATION_S 9 +#define FRAGMENTATION_V(x) ((x) << FRAGMENTATION_S) +#define FRAGMENTATION_F FRAGMENTATION_V(1U) + +#define MPSHITTYPE_S 8 +#define MPSHITTYPE_V(x) ((x) << MPSHITTYPE_S) +#define MPSHITTYPE_F MPSHITTYPE_V(1U) + +#define MACMATCH_S 7 +#define MACMATCH_V(x) ((x) << MACMATCH_S) +#define MACMATCH_F MACMATCH_V(1U) + +#define ETHERTYPE_S 6 +#define ETHERTYPE_V(x) ((x) << ETHERTYPE_S) +#define ETHERTYPE_F ETHERTYPE_V(1U) + +#define PROTOCOL_S 5 +#define PROTOCOL_V(x) ((x) << PROTOCOL_S) +#define PROTOCOL_F PROTOCOL_V(1U) + +#define TOS_S 4 
+#define TOS_V(x) ((x) << TOS_S) +#define TOS_F TOS_V(1U) + +#define VLAN_S 3 +#define VLAN_V(x) ((x) << VLAN_S) +#define VLAN_F VLAN_V(1U) + +#define VNIC_ID_S 2 +#define VNIC_ID_V(x) ((x) << VNIC_ID_S) +#define VNIC_ID_F VNIC_ID_V(1U) + +#define PORT_S 1 +#define PORT_V(x) ((x) << PORT_S) +#define PORT_F PORT_V(1U) + +#define FCOE_S 0 +#define FCOE_V(x) ((x) << FCOE_S) +#define FCOE_F FCOE_V(1U) + +#define FILTERMODE_S 15 +#define FILTERMODE_V(x) ((x) << FILTERMODE_S) +#define FILTERMODE_F FILTERMODE_V(1U) + +#define FCOEMASK_S 14 +#define FCOEMASK_V(x) ((x) << FCOEMASK_S) +#define FCOEMASK_F FCOEMASK_V(1U) + +#define TP_INGRESS_CONFIG_A 0x141 + +#define VNIC_S 11 +#define VNIC_V(x) ((x) << VNIC_S) +#define VNIC_F VNIC_V(1U) + +#define CSUM_HAS_PSEUDO_HDR_S 10 +#define CSUM_HAS_PSEUDO_HDR_V(x) ((x) << CSUM_HAS_PSEUDO_HDR_S) +#define CSUM_HAS_PSEUDO_HDR_F CSUM_HAS_PSEUDO_HDR_V(1U) + +#define TP_MIB_MAC_IN_ERR_0_A 0x0 +#define TP_MIB_HDR_IN_ERR_0_A 0x4 +#define TP_MIB_TCP_IN_ERR_0_A 0x8 +#define TP_MIB_TCP_OUT_RST_A 0xc +#define TP_MIB_TCP_IN_SEG_HI_A 0x10 +#define TP_MIB_TCP_IN_SEG_LO_A 0x11 +#define TP_MIB_TCP_OUT_SEG_HI_A 0x12 +#define TP_MIB_TCP_OUT_SEG_LO_A 0x13 +#define TP_MIB_TCP_RXT_SEG_HI_A 0x14 +#define TP_MIB_TCP_RXT_SEG_LO_A 0x15 +#define TP_MIB_TNL_CNG_DROP_0_A 0x18 +#define TP_MIB_OFD_CHN_DROP_0_A 0x1c +#define TP_MIB_TCP_V6IN_ERR_0_A 0x28 +#define TP_MIB_TCP_V6OUT_RST_A 0x2c +#define TP_MIB_OFD_ARP_DROP_A 0x36 +#define TP_MIB_CPL_IN_REQ_0_A 0x38 +#define TP_MIB_CPL_OUT_RSP_0_A 0x3c +#define TP_MIB_TNL_DROP_0_A 0x44 +#define TP_MIB_FCOE_DDP_0_A 0x48 +#define TP_MIB_FCOE_DROP_0_A 0x4c +#define TP_MIB_FCOE_BYTE_0_HI_A 0x50 +#define TP_MIB_OFD_VLN_DROP_0_A 0x58 +#define TP_MIB_USM_PKTS_A 0x5c +#define TP_MIB_RQE_DFR_PKT_A 0x64 + +#define ULP_TX_INT_CAUSE_A 0x8dcc +#define ULP_TX_TPT_LLIMIT_A 0x8dd4 +#define ULP_TX_TPT_ULIMIT_A 0x8dd8 +#define ULP_TX_PBL_LLIMIT_A 0x8ddc +#define ULP_TX_PBL_ULIMIT_A 0x8de0 +#define ULP_TX_ERR_TABLE_BASE_A 0x8e04 + +#define PBL_BOUND_ERR_CH3_S 31 +#define PBL_BOUND_ERR_CH3_V(x) ((x) << PBL_BOUND_ERR_CH3_S) +#define PBL_BOUND_ERR_CH3_F PBL_BOUND_ERR_CH3_V(1U) + +#define PBL_BOUND_ERR_CH2_S 30 +#define PBL_BOUND_ERR_CH2_V(x) ((x) << PBL_BOUND_ERR_CH2_S) +#define PBL_BOUND_ERR_CH2_F PBL_BOUND_ERR_CH2_V(1U) + +#define PBL_BOUND_ERR_CH1_S 29 +#define PBL_BOUND_ERR_CH1_V(x) ((x) << PBL_BOUND_ERR_CH1_S) +#define PBL_BOUND_ERR_CH1_F PBL_BOUND_ERR_CH1_V(1U) + +#define PBL_BOUND_ERR_CH0_S 28 +#define PBL_BOUND_ERR_CH0_V(x) ((x) << PBL_BOUND_ERR_CH0_S) +#define PBL_BOUND_ERR_CH0_F PBL_BOUND_ERR_CH0_V(1U) + +#define PM_RX_INT_CAUSE_A 0x8fdc +#define PM_RX_STAT_CONFIG_A 0x8fc8 +#define PM_RX_STAT_COUNT_A 0x8fcc +#define PM_RX_STAT_LSB_A 0x8fd0 +#define PM_RX_DBG_CTRL_A 0x8fd0 +#define PM_RX_DBG_DATA_A 0x8fd4 +#define PM_RX_DBG_STAT_MSB_A 0x10013 + +#define PMRX_FRAMING_ERROR_F 0x003ffff0U + +#define ZERO_E_CMD_ERROR_S 22 +#define ZERO_E_CMD_ERROR_V(x) ((x) << ZERO_E_CMD_ERROR_S) +#define ZERO_E_CMD_ERROR_F ZERO_E_CMD_ERROR_V(1U) + +#define OCSPI_PAR_ERROR_S 3 +#define OCSPI_PAR_ERROR_V(x) ((x) << OCSPI_PAR_ERROR_S) +#define OCSPI_PAR_ERROR_F OCSPI_PAR_ERROR_V(1U) + +#define DB_OPTIONS_PAR_ERROR_S 2 +#define DB_OPTIONS_PAR_ERROR_V(x) ((x) << DB_OPTIONS_PAR_ERROR_S) +#define DB_OPTIONS_PAR_ERROR_F DB_OPTIONS_PAR_ERROR_V(1U) + +#define IESPI_PAR_ERROR_S 1 +#define IESPI_PAR_ERROR_V(x) ((x) << IESPI_PAR_ERROR_S) +#define IESPI_PAR_ERROR_F IESPI_PAR_ERROR_V(1U) + +#define ULP_TX_LA_RDPTR_0_A 0x8ec0 +#define ULP_TX_LA_RDDATA_0_A 0x8ec4 +#define ULP_TX_LA_WRPTR_0_A 
0x8ec8 + +#define PMRX_E_PCMD_PAR_ERROR_S 0 +#define PMRX_E_PCMD_PAR_ERROR_V(x) ((x) << PMRX_E_PCMD_PAR_ERROR_S) +#define PMRX_E_PCMD_PAR_ERROR_F PMRX_E_PCMD_PAR_ERROR_V(1U) + +#define PM_TX_INT_CAUSE_A 0x8ffc +#define PM_TX_STAT_CONFIG_A 0x8fe8 +#define PM_TX_STAT_COUNT_A 0x8fec +#define PM_TX_STAT_LSB_A 0x8ff0 +#define PM_TX_DBG_CTRL_A 0x8ff0 +#define PM_TX_DBG_DATA_A 0x8ff4 +#define PM_TX_DBG_STAT_MSB_A 0x1001a + +#define PCMD_LEN_OVFL0_S 31 +#define PCMD_LEN_OVFL0_V(x) ((x) << PCMD_LEN_OVFL0_S) +#define PCMD_LEN_OVFL0_F PCMD_LEN_OVFL0_V(1U) + +#define PCMD_LEN_OVFL1_S 30 +#define PCMD_LEN_OVFL1_V(x) ((x) << PCMD_LEN_OVFL1_S) +#define PCMD_LEN_OVFL1_F PCMD_LEN_OVFL1_V(1U) + +#define PCMD_LEN_OVFL2_S 29 +#define PCMD_LEN_OVFL2_V(x) ((x) << PCMD_LEN_OVFL2_S) +#define PCMD_LEN_OVFL2_F PCMD_LEN_OVFL2_V(1U) + +#define ZERO_C_CMD_ERROR_S 28 +#define ZERO_C_CMD_ERROR_V(x) ((x) << ZERO_C_CMD_ERROR_S) +#define ZERO_C_CMD_ERROR_F ZERO_C_CMD_ERROR_V(1U) + +#define PMTX_FRAMING_ERROR_F 0x0ffffff0U + +#define OESPI_PAR_ERROR_S 3 +#define OESPI_PAR_ERROR_V(x) ((x) << OESPI_PAR_ERROR_S) +#define OESPI_PAR_ERROR_F OESPI_PAR_ERROR_V(1U) + +#define ICSPI_PAR_ERROR_S 1 +#define ICSPI_PAR_ERROR_V(x) ((x) << ICSPI_PAR_ERROR_S) +#define ICSPI_PAR_ERROR_F ICSPI_PAR_ERROR_V(1U) + +#define PMTX_C_PCMD_PAR_ERROR_S 0 +#define PMTX_C_PCMD_PAR_ERROR_V(x) ((x) << PMTX_C_PCMD_PAR_ERROR_S) +#define PMTX_C_PCMD_PAR_ERROR_F PMTX_C_PCMD_PAR_ERROR_V(1U) + +#define MPS_PORT_STAT_TX_PORT_BYTES_L 0x400 +#define MPS_PORT_STAT_TX_PORT_BYTES_H 0x404 +#define MPS_PORT_STAT_TX_PORT_FRAMES_L 0x408 +#define MPS_PORT_STAT_TX_PORT_FRAMES_H 0x40c +#define MPS_PORT_STAT_TX_PORT_BCAST_L 0x410 +#define MPS_PORT_STAT_TX_PORT_BCAST_H 0x414 +#define MPS_PORT_STAT_TX_PORT_MCAST_L 0x418 +#define MPS_PORT_STAT_TX_PORT_MCAST_H 0x41c +#define MPS_PORT_STAT_TX_PORT_UCAST_L 0x420 +#define MPS_PORT_STAT_TX_PORT_UCAST_H 0x424 +#define MPS_PORT_STAT_TX_PORT_ERROR_L 0x428 +#define MPS_PORT_STAT_TX_PORT_ERROR_H 0x42c +#define MPS_PORT_STAT_TX_PORT_64B_L 0x430 +#define MPS_PORT_STAT_TX_PORT_64B_H 0x434 +#define MPS_PORT_STAT_TX_PORT_65B_127B_L 0x438 +#define MPS_PORT_STAT_TX_PORT_65B_127B_H 0x43c +#define MPS_PORT_STAT_TX_PORT_128B_255B_L 0x440 +#define MPS_PORT_STAT_TX_PORT_128B_255B_H 0x444 +#define MPS_PORT_STAT_TX_PORT_256B_511B_L 0x448 +#define MPS_PORT_STAT_TX_PORT_256B_511B_H 0x44c +#define MPS_PORT_STAT_TX_PORT_512B_1023B_L 0x450 +#define MPS_PORT_STAT_TX_PORT_512B_1023B_H 0x454 +#define MPS_PORT_STAT_TX_PORT_1024B_1518B_L 0x458 +#define MPS_PORT_STAT_TX_PORT_1024B_1518B_H 0x45c +#define MPS_PORT_STAT_TX_PORT_1519B_MAX_L 0x460 +#define MPS_PORT_STAT_TX_PORT_1519B_MAX_H 0x464 +#define MPS_PORT_STAT_TX_PORT_DROP_L 0x468 +#define MPS_PORT_STAT_TX_PORT_DROP_H 0x46c +#define MPS_PORT_STAT_TX_PORT_PAUSE_L 0x470 +#define MPS_PORT_STAT_TX_PORT_PAUSE_H 0x474 +#define MPS_PORT_STAT_TX_PORT_PPP0_L 0x478 +#define MPS_PORT_STAT_TX_PORT_PPP0_H 0x47c +#define MPS_PORT_STAT_TX_PORT_PPP1_L 0x480 +#define MPS_PORT_STAT_TX_PORT_PPP1_H 0x484 +#define MPS_PORT_STAT_TX_PORT_PPP2_L 0x488 +#define MPS_PORT_STAT_TX_PORT_PPP2_H 0x48c +#define MPS_PORT_STAT_TX_PORT_PPP3_L 0x490 +#define MPS_PORT_STAT_TX_PORT_PPP3_H 0x494 +#define MPS_PORT_STAT_TX_PORT_PPP4_L 0x498 +#define MPS_PORT_STAT_TX_PORT_PPP4_H 0x49c +#define MPS_PORT_STAT_TX_PORT_PPP5_L 0x4a0 +#define MPS_PORT_STAT_TX_PORT_PPP5_H 0x4a4 +#define MPS_PORT_STAT_TX_PORT_PPP6_L 0x4a8 +#define MPS_PORT_STAT_TX_PORT_PPP6_H 0x4ac +#define MPS_PORT_STAT_TX_PORT_PPP7_L 0x4b0 +#define MPS_PORT_STAT_TX_PORT_PPP7_H 0x4b4 
+#define MPS_PORT_STAT_LB_PORT_BYTES_L 0x4c0 +#define MPS_PORT_STAT_LB_PORT_BYTES_H 0x4c4 +#define MPS_PORT_STAT_LB_PORT_FRAMES_L 0x4c8 +#define MPS_PORT_STAT_LB_PORT_FRAMES_H 0x4cc +#define MPS_PORT_STAT_LB_PORT_BCAST_L 0x4d0 +#define MPS_PORT_STAT_LB_PORT_BCAST_H 0x4d4 +#define MPS_PORT_STAT_LB_PORT_MCAST_L 0x4d8 +#define MPS_PORT_STAT_LB_PORT_MCAST_H 0x4dc +#define MPS_PORT_STAT_LB_PORT_UCAST_L 0x4e0 +#define MPS_PORT_STAT_LB_PORT_UCAST_H 0x4e4 +#define MPS_PORT_STAT_LB_PORT_ERROR_L 0x4e8 +#define MPS_PORT_STAT_LB_PORT_ERROR_H 0x4ec +#define MPS_PORT_STAT_LB_PORT_64B_L 0x4f0 +#define MPS_PORT_STAT_LB_PORT_64B_H 0x4f4 +#define MPS_PORT_STAT_LB_PORT_65B_127B_L 0x4f8 +#define MPS_PORT_STAT_LB_PORT_65B_127B_H 0x4fc +#define MPS_PORT_STAT_LB_PORT_128B_255B_L 0x500 +#define MPS_PORT_STAT_LB_PORT_128B_255B_H 0x504 +#define MPS_PORT_STAT_LB_PORT_256B_511B_L 0x508 +#define MPS_PORT_STAT_LB_PORT_256B_511B_H 0x50c +#define MPS_PORT_STAT_LB_PORT_512B_1023B_L 0x510 +#define MPS_PORT_STAT_LB_PORT_512B_1023B_H 0x514 +#define MPS_PORT_STAT_LB_PORT_1024B_1518B_L 0x518 +#define MPS_PORT_STAT_LB_PORT_1024B_1518B_H 0x51c +#define MPS_PORT_STAT_LB_PORT_1519B_MAX_L 0x520 +#define MPS_PORT_STAT_LB_PORT_1519B_MAX_H 0x524 +#define MPS_PORT_STAT_LB_PORT_DROP_FRAMES 0x528 +#define MPS_PORT_STAT_LB_PORT_DROP_FRAMES_L 0x528 +#define MPS_PORT_STAT_RX_PORT_BYTES_L 0x540 +#define MPS_PORT_STAT_RX_PORT_BYTES_H 0x544 +#define MPS_PORT_STAT_RX_PORT_FRAMES_L 0x548 +#define MPS_PORT_STAT_RX_PORT_FRAMES_H 0x54c +#define MPS_PORT_STAT_RX_PORT_BCAST_L 0x550 +#define MPS_PORT_STAT_RX_PORT_BCAST_H 0x554 +#define MPS_PORT_STAT_RX_PORT_MCAST_L 0x558 +#define MPS_PORT_STAT_RX_PORT_MCAST_H 0x55c +#define MPS_PORT_STAT_RX_PORT_UCAST_L 0x560 +#define MPS_PORT_STAT_RX_PORT_UCAST_H 0x564 +#define MPS_PORT_STAT_RX_PORT_MTU_ERROR_L 0x568 +#define MPS_PORT_STAT_RX_PORT_MTU_ERROR_H 0x56c +#define MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L 0x570 +#define MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_H 0x574 +#define MPS_PORT_STAT_RX_PORT_CRC_ERROR_L 0x578 +#define MPS_PORT_STAT_RX_PORT_CRC_ERROR_H 0x57c +#define MPS_PORT_STAT_RX_PORT_LEN_ERROR_L 0x580 +#define MPS_PORT_STAT_RX_PORT_LEN_ERROR_H 0x584 +#define MPS_PORT_STAT_RX_PORT_SYM_ERROR_L 0x588 +#define MPS_PORT_STAT_RX_PORT_SYM_ERROR_H 0x58c +#define MPS_PORT_STAT_RX_PORT_64B_L 0x590 +#define MPS_PORT_STAT_RX_PORT_64B_H 0x594 +#define MPS_PORT_STAT_RX_PORT_65B_127B_L 0x598 +#define MPS_PORT_STAT_RX_PORT_65B_127B_H 0x59c +#define MPS_PORT_STAT_RX_PORT_128B_255B_L 0x5a0 +#define MPS_PORT_STAT_RX_PORT_128B_255B_H 0x5a4 +#define MPS_PORT_STAT_RX_PORT_256B_511B_L 0x5a8 +#define MPS_PORT_STAT_RX_PORT_256B_511B_H 0x5ac +#define MPS_PORT_STAT_RX_PORT_512B_1023B_L 0x5b0 +#define MPS_PORT_STAT_RX_PORT_512B_1023B_H 0x5b4 +#define MPS_PORT_STAT_RX_PORT_1024B_1518B_L 0x5b8 +#define MPS_PORT_STAT_RX_PORT_1024B_1518B_H 0x5bc +#define MPS_PORT_STAT_RX_PORT_1519B_MAX_L 0x5c0 +#define MPS_PORT_STAT_RX_PORT_1519B_MAX_H 0x5c4 +#define MPS_PORT_STAT_RX_PORT_PAUSE_L 0x5c8 +#define MPS_PORT_STAT_RX_PORT_PAUSE_H 0x5cc +#define MPS_PORT_STAT_RX_PORT_PPP0_L 0x5d0 +#define MPS_PORT_STAT_RX_PORT_PPP0_H 0x5d4 +#define MPS_PORT_STAT_RX_PORT_PPP1_L 0x5d8 +#define MPS_PORT_STAT_RX_PORT_PPP1_H 0x5dc +#define MPS_PORT_STAT_RX_PORT_PPP2_L 0x5e0 +#define MPS_PORT_STAT_RX_PORT_PPP2_H 0x5e4 +#define MPS_PORT_STAT_RX_PORT_PPP3_L 0x5e8 +#define MPS_PORT_STAT_RX_PORT_PPP3_H 0x5ec +#define MPS_PORT_STAT_RX_PORT_PPP4_L 0x5f0 +#define MPS_PORT_STAT_RX_PORT_PPP4_H 0x5f4 +#define MPS_PORT_STAT_RX_PORT_PPP5_L 0x5f8 +#define 
MPS_PORT_STAT_RX_PORT_PPP5_H 0x5fc +#define MPS_PORT_STAT_RX_PORT_PPP6_L 0x600 +#define MPS_PORT_STAT_RX_PORT_PPP6_H 0x604 +#define MPS_PORT_STAT_RX_PORT_PPP7_L 0x608 +#define MPS_PORT_STAT_RX_PORT_PPP7_H 0x60c +#define MPS_PORT_STAT_RX_PORT_LESS_64B_L 0x610 +#define MPS_PORT_STAT_RX_PORT_LESS_64B_H 0x614 +#define MAC_PORT_MAGIC_MACID_LO 0x824 +#define MAC_PORT_MAGIC_MACID_HI 0x828 +#define MAC_PORT_TX_TS_VAL_LO 0x928 +#define MAC_PORT_TX_TS_VAL_HI 0x92c + +#define MAC_PORT_EPIO_DATA0_A 0x8c0 +#define MAC_PORT_EPIO_DATA1_A 0x8c4 +#define MAC_PORT_EPIO_DATA2_A 0x8c8 +#define MAC_PORT_EPIO_DATA3_A 0x8cc +#define MAC_PORT_EPIO_OP_A 0x8d0 + +#define MAC_PORT_CFG2_A 0x818 + +#define MPS_CMN_CTL_A 0x9000 + +#define COUNTPAUSEMCRX_S 5 +#define COUNTPAUSEMCRX_V(x) ((x) << COUNTPAUSEMCRX_S) +#define COUNTPAUSEMCRX_F COUNTPAUSEMCRX_V(1U) + +#define COUNTPAUSESTATRX_S 4 +#define COUNTPAUSESTATRX_V(x) ((x) << COUNTPAUSESTATRX_S) +#define COUNTPAUSESTATRX_F COUNTPAUSESTATRX_V(1U) + +#define COUNTPAUSEMCTX_S 3 +#define COUNTPAUSEMCTX_V(x) ((x) << COUNTPAUSEMCTX_S) +#define COUNTPAUSEMCTX_F COUNTPAUSEMCTX_V(1U) + +#define COUNTPAUSESTATTX_S 2 +#define COUNTPAUSESTATTX_V(x) ((x) << COUNTPAUSESTATTX_S) +#define COUNTPAUSESTATTX_F COUNTPAUSESTATTX_V(1U) + +#define NUMPORTS_S 0 +#define NUMPORTS_M 0x3U +#define NUMPORTS_G(x) (((x) >> NUMPORTS_S) & NUMPORTS_M) + +#define MPS_INT_CAUSE_A 0x9008 +#define MPS_TX_INT_CAUSE_A 0x9408 +#define MPS_STAT_CTL_A 0x9600 + +#define FRMERR_S 15 +#define FRMERR_V(x) ((x) << FRMERR_S) +#define FRMERR_F FRMERR_V(1U) + +#define SECNTERR_S 14 +#define SECNTERR_V(x) ((x) << SECNTERR_S) +#define SECNTERR_F SECNTERR_V(1U) + +#define BUBBLE_S 13 +#define BUBBLE_V(x) ((x) << BUBBLE_S) +#define BUBBLE_F BUBBLE_V(1U) + +#define TXDESCFIFO_S 9 +#define TXDESCFIFO_M 0xfU +#define TXDESCFIFO_V(x) ((x) << TXDESCFIFO_S) + +#define TXDATAFIFO_S 5 +#define TXDATAFIFO_M 0xfU +#define TXDATAFIFO_V(x) ((x) << TXDATAFIFO_S) + +#define NCSIFIFO_S 4 +#define NCSIFIFO_V(x) ((x) << NCSIFIFO_S) +#define NCSIFIFO_F NCSIFIFO_V(1U) + +#define TPFIFO_S 0 +#define TPFIFO_M 0xfU +#define TPFIFO_V(x) ((x) << TPFIFO_S) + +#define MPS_STAT_PERR_INT_CAUSE_SRAM_A 0x9614 +#define MPS_STAT_PERR_INT_CAUSE_TX_FIFO_A 0x9620 +#define MPS_STAT_PERR_INT_CAUSE_RX_FIFO_A 0x962c + +#define MPS_STAT_RX_BG_0_MAC_DROP_FRAME_L 0x9640 +#define MPS_STAT_RX_BG_0_MAC_DROP_FRAME_H 0x9644 +#define MPS_STAT_RX_BG_1_MAC_DROP_FRAME_L 0x9648 +#define MPS_STAT_RX_BG_1_MAC_DROP_FRAME_H 0x964c +#define MPS_STAT_RX_BG_2_MAC_DROP_FRAME_L 0x9650 +#define MPS_STAT_RX_BG_2_MAC_DROP_FRAME_H 0x9654 +#define MPS_STAT_RX_BG_3_MAC_DROP_FRAME_L 0x9658 +#define MPS_STAT_RX_BG_3_MAC_DROP_FRAME_H 0x965c +#define MPS_STAT_RX_BG_0_LB_DROP_FRAME_L 0x9660 +#define MPS_STAT_RX_BG_0_LB_DROP_FRAME_H 0x9664 +#define MPS_STAT_RX_BG_1_LB_DROP_FRAME_L 0x9668 +#define MPS_STAT_RX_BG_1_LB_DROP_FRAME_H 0x966c +#define MPS_STAT_RX_BG_2_LB_DROP_FRAME_L 0x9670 +#define MPS_STAT_RX_BG_2_LB_DROP_FRAME_H 0x9674 +#define MPS_STAT_RX_BG_3_LB_DROP_FRAME_L 0x9678 +#define MPS_STAT_RX_BG_3_LB_DROP_FRAME_H 0x967c +#define MPS_STAT_RX_BG_0_MAC_TRUNC_FRAME_L 0x9680 +#define MPS_STAT_RX_BG_0_MAC_TRUNC_FRAME_H 0x9684 +#define MPS_STAT_RX_BG_1_MAC_TRUNC_FRAME_L 0x9688 +#define MPS_STAT_RX_BG_1_MAC_TRUNC_FRAME_H 0x968c +#define MPS_STAT_RX_BG_2_MAC_TRUNC_FRAME_L 0x9690 +#define MPS_STAT_RX_BG_2_MAC_TRUNC_FRAME_H 0x9694 +#define MPS_STAT_RX_BG_3_MAC_TRUNC_FRAME_L 0x9698 +#define MPS_STAT_RX_BG_3_MAC_TRUNC_FRAME_H 0x969c +#define MPS_STAT_RX_BG_0_LB_TRUNC_FRAME_L 0x96a0 +#define 
MPS_STAT_RX_BG_0_LB_TRUNC_FRAME_H 0x96a4 +#define MPS_STAT_RX_BG_1_LB_TRUNC_FRAME_L 0x96a8 +#define MPS_STAT_RX_BG_1_LB_TRUNC_FRAME_H 0x96ac +#define MPS_STAT_RX_BG_2_LB_TRUNC_FRAME_L 0x96b0 +#define MPS_STAT_RX_BG_2_LB_TRUNC_FRAME_H 0x96b4 +#define MPS_STAT_RX_BG_3_LB_TRUNC_FRAME_L 0x96b8 +#define MPS_STAT_RX_BG_3_LB_TRUNC_FRAME_H 0x96bc + +#define MPS_TRC_CFG_A 0x9800 + +#define TRCFIFOEMPTY_S 4 +#define TRCFIFOEMPTY_V(x) ((x) << TRCFIFOEMPTY_S) +#define TRCFIFOEMPTY_F TRCFIFOEMPTY_V(1U) + +#define TRCIGNOREDROPINPUT_S 3 +#define TRCIGNOREDROPINPUT_V(x) ((x) << TRCIGNOREDROPINPUT_S) +#define TRCIGNOREDROPINPUT_F TRCIGNOREDROPINPUT_V(1U) + +#define TRCKEEPDUPLICATES_S 2 +#define TRCKEEPDUPLICATES_V(x) ((x) << TRCKEEPDUPLICATES_S) +#define TRCKEEPDUPLICATES_F TRCKEEPDUPLICATES_V(1U) + +#define TRCEN_S 1 +#define TRCEN_V(x) ((x) << TRCEN_S) +#define TRCEN_F TRCEN_V(1U) + +#define TRCMULTIFILTER_S 0 +#define TRCMULTIFILTER_V(x) ((x) << TRCMULTIFILTER_S) +#define TRCMULTIFILTER_F TRCMULTIFILTER_V(1U) + +#define MPS_TRC_RSS_CONTROL_A 0x9808 +#define MPS_TRC_FILTER1_RSS_CONTROL_A 0x9ff4 +#define MPS_TRC_FILTER2_RSS_CONTROL_A 0x9ffc +#define MPS_TRC_FILTER3_RSS_CONTROL_A 0xa004 +#define MPS_T5_TRC_RSS_CONTROL_A 0xa00c + +#define RSSCONTROL_S 16 +#define RSSCONTROL_V(x) ((x) << RSSCONTROL_S) + +#define QUEUENUMBER_S 0 +#define QUEUENUMBER_V(x) ((x) << QUEUENUMBER_S) + +#define TFINVERTMATCH_S 24 +#define TFINVERTMATCH_V(x) ((x) << TFINVERTMATCH_S) +#define TFINVERTMATCH_F TFINVERTMATCH_V(1U) + +#define TFEN_S 22 +#define TFEN_V(x) ((x) << TFEN_S) +#define TFEN_F TFEN_V(1U) + +#define TFPORT_S 18 +#define TFPORT_M 0xfU +#define TFPORT_V(x) ((x) << TFPORT_S) +#define TFPORT_G(x) (((x) >> TFPORT_S) & TFPORT_M) + +#define TFLENGTH_S 8 +#define TFLENGTH_M 0x1fU +#define TFLENGTH_V(x) ((x) << TFLENGTH_S) +#define TFLENGTH_G(x) (((x) >> TFLENGTH_S) & TFLENGTH_M) + +#define TFOFFSET_S 0 +#define TFOFFSET_M 0x1fU +#define TFOFFSET_V(x) ((x) << TFOFFSET_S) +#define TFOFFSET_G(x) (((x) >> TFOFFSET_S) & TFOFFSET_M) + +#define T5_TFINVERTMATCH_S 25 +#define T5_TFINVERTMATCH_V(x) ((x) << T5_TFINVERTMATCH_S) +#define T5_TFINVERTMATCH_F T5_TFINVERTMATCH_V(1U) + +#define T5_TFEN_S 23 +#define T5_TFEN_V(x) ((x) << T5_TFEN_S) +#define T5_TFEN_F T5_TFEN_V(1U) + +#define T5_TFPORT_S 18 +#define T5_TFPORT_M 0x1fU +#define T5_TFPORT_V(x) ((x) << T5_TFPORT_S) +#define T5_TFPORT_G(x) (((x) >> T5_TFPORT_S) & T5_TFPORT_M) + +#define MPS_TRC_FILTER_MATCH_CTL_A_A 0x9810 +#define MPS_TRC_FILTER_MATCH_CTL_B_A 0x9820 + +#define TFMINPKTSIZE_S 16 +#define TFMINPKTSIZE_M 0x1ffU +#define TFMINPKTSIZE_V(x) ((x) << TFMINPKTSIZE_S) +#define TFMINPKTSIZE_G(x) (((x) >> TFMINPKTSIZE_S) & TFMINPKTSIZE_M) + +#define TFCAPTUREMAX_S 0 +#define TFCAPTUREMAX_M 0x3fffU +#define TFCAPTUREMAX_V(x) ((x) << TFCAPTUREMAX_S) +#define TFCAPTUREMAX_G(x) (((x) >> TFCAPTUREMAX_S) & TFCAPTUREMAX_M) + +#define MPS_TRC_FILTER0_MATCH_A 0x9c00 +#define MPS_TRC_FILTER0_DONT_CARE_A 0x9c80 +#define MPS_TRC_FILTER1_MATCH_A 0x9d00 + +#define TP_RSS_CONFIG_A 0x7df0 + +#define TNL4TUPENIPV6_S 31 +#define TNL4TUPENIPV6_V(x) ((x) << TNL4TUPENIPV6_S) +#define TNL4TUPENIPV6_F TNL4TUPENIPV6_V(1U) + +#define TNL2TUPENIPV6_S 30 +#define TNL2TUPENIPV6_V(x) ((x) << TNL2TUPENIPV6_S) +#define TNL2TUPENIPV6_F TNL2TUPENIPV6_V(1U) + +#define TNL4TUPENIPV4_S 29 +#define TNL4TUPENIPV4_V(x) ((x) << TNL4TUPENIPV4_S) +#define TNL4TUPENIPV4_F TNL4TUPENIPV4_V(1U) + +#define TNL2TUPENIPV4_S 28 +#define TNL2TUPENIPV4_V(x) ((x) << TNL2TUPENIPV4_S) +#define TNL2TUPENIPV4_F 
TNL2TUPENIPV4_V(1U) + +#define TNLTCPSEL_S 27 +#define TNLTCPSEL_V(x) ((x) << TNLTCPSEL_S) +#define TNLTCPSEL_F TNLTCPSEL_V(1U) + +#define TNLIP6SEL_S 26 +#define TNLIP6SEL_V(x) ((x) << TNLIP6SEL_S) +#define TNLIP6SEL_F TNLIP6SEL_V(1U) + +#define TNLVRTSEL_S 25 +#define TNLVRTSEL_V(x) ((x) << TNLVRTSEL_S) +#define TNLVRTSEL_F TNLVRTSEL_V(1U) + +#define TNLMAPEN_S 24 +#define TNLMAPEN_V(x) ((x) << TNLMAPEN_S) +#define TNLMAPEN_F TNLMAPEN_V(1U) + +#define OFDHASHSAVE_S 19 +#define OFDHASHSAVE_V(x) ((x) << OFDHASHSAVE_S) +#define OFDHASHSAVE_F OFDHASHSAVE_V(1U) + +#define OFDVRTSEL_S 18 +#define OFDVRTSEL_V(x) ((x) << OFDVRTSEL_S) +#define OFDVRTSEL_F OFDVRTSEL_V(1U) + +#define OFDMAPEN_S 17 +#define OFDMAPEN_V(x) ((x) << OFDMAPEN_S) +#define OFDMAPEN_F OFDMAPEN_V(1U) + +#define OFDLKPEN_S 16 +#define OFDLKPEN_V(x) ((x) << OFDLKPEN_S) +#define OFDLKPEN_F OFDLKPEN_V(1U) + +#define SYN4TUPENIPV6_S 15 +#define SYN4TUPENIPV6_V(x) ((x) << SYN4TUPENIPV6_S) +#define SYN4TUPENIPV6_F SYN4TUPENIPV6_V(1U) + +#define SYN2TUPENIPV6_S 14 +#define SYN2TUPENIPV6_V(x) ((x) << SYN2TUPENIPV6_S) +#define SYN2TUPENIPV6_F SYN2TUPENIPV6_V(1U) + +#define SYN4TUPENIPV4_S 13 +#define SYN4TUPENIPV4_V(x) ((x) << SYN4TUPENIPV4_S) +#define SYN4TUPENIPV4_F SYN4TUPENIPV4_V(1U) + +#define SYN2TUPENIPV4_S 12 +#define SYN2TUPENIPV4_V(x) ((x) << SYN2TUPENIPV4_S) +#define SYN2TUPENIPV4_F SYN2TUPENIPV4_V(1U) + +#define SYNIP6SEL_S 11 +#define SYNIP6SEL_V(x) ((x) << SYNIP6SEL_S) +#define SYNIP6SEL_F SYNIP6SEL_V(1U) + +#define SYNVRTSEL_S 10 +#define SYNVRTSEL_V(x) ((x) << SYNVRTSEL_S) +#define SYNVRTSEL_F SYNVRTSEL_V(1U) + +#define SYNMAPEN_S 9 +#define SYNMAPEN_V(x) ((x) << SYNMAPEN_S) +#define SYNMAPEN_F SYNMAPEN_V(1U) + +#define SYNLKPEN_S 8 +#define SYNLKPEN_V(x) ((x) << SYNLKPEN_S) +#define SYNLKPEN_F SYNLKPEN_V(1U) + +#define CHANNELENABLE_S 7 +#define CHANNELENABLE_V(x) ((x) << CHANNELENABLE_S) +#define CHANNELENABLE_F CHANNELENABLE_V(1U) + +#define PORTENABLE_S 6 +#define PORTENABLE_V(x) ((x) << PORTENABLE_S) +#define PORTENABLE_F PORTENABLE_V(1U) + +#define TNLALLLOOKUP_S 5 +#define TNLALLLOOKUP_V(x) ((x) << TNLALLLOOKUP_S) +#define TNLALLLOOKUP_F TNLALLLOOKUP_V(1U) + +#define VIRTENABLE_S 4 +#define VIRTENABLE_V(x) ((x) << VIRTENABLE_S) +#define VIRTENABLE_F VIRTENABLE_V(1U) + +#define CONGESTIONENABLE_S 3 +#define CONGESTIONENABLE_V(x) ((x) << CONGESTIONENABLE_S) +#define CONGESTIONENABLE_F CONGESTIONENABLE_V(1U) + +#define HASHTOEPLITZ_S 2 +#define HASHTOEPLITZ_V(x) ((x) << HASHTOEPLITZ_S) +#define HASHTOEPLITZ_F HASHTOEPLITZ_V(1U) + +#define UDPENABLE_S 1 +#define UDPENABLE_V(x) ((x) << UDPENABLE_S) +#define UDPENABLE_F UDPENABLE_V(1U) + +#define DISABLE_S 0 +#define DISABLE_V(x) ((x) << DISABLE_S) +#define DISABLE_F DISABLE_V(1U) + +#define TP_RSS_CONFIG_TNL_A 0x7df4 + +#define MASKSIZE_S 28 +#define MASKSIZE_M 0xfU +#define MASKSIZE_V(x) ((x) << MASKSIZE_S) +#define MASKSIZE_G(x) (((x) >> MASKSIZE_S) & MASKSIZE_M) + +#define MASKFILTER_S 16 +#define MASKFILTER_M 0x7ffU +#define MASKFILTER_V(x) ((x) << MASKFILTER_S) +#define MASKFILTER_G(x) (((x) >> MASKFILTER_S) & MASKFILTER_M) + +#define USEWIRECH_S 0 +#define USEWIRECH_V(x) ((x) << USEWIRECH_S) +#define USEWIRECH_F USEWIRECH_V(1U) + +#define HASHALL_S 2 +#define HASHALL_V(x) ((x) << HASHALL_S) +#define HASHALL_F HASHALL_V(1U) + +#define HASHETH_S 1 +#define HASHETH_V(x) ((x) << HASHETH_S) +#define HASHETH_F HASHETH_V(1U) + +#define TP_RSS_CONFIG_OFD_A 0x7df8 + +#define RRCPLMAPEN_S 20 +#define RRCPLMAPEN_V(x) ((x) << RRCPLMAPEN_S) +#define RRCPLMAPEN_F 
RRCPLMAPEN_V(1U) + +#define RRCPLQUEWIDTH_S 16 +#define RRCPLQUEWIDTH_M 0xfU +#define RRCPLQUEWIDTH_V(x) ((x) << RRCPLQUEWIDTH_S) +#define RRCPLQUEWIDTH_G(x) (((x) >> RRCPLQUEWIDTH_S) & RRCPLQUEWIDTH_M) + +#define TP_RSS_CONFIG_SYN_A 0x7dfc +#define TP_RSS_CONFIG_VRT_A 0x7e00 + +#define VFRDRG_S 25 +#define VFRDRG_V(x) ((x) << VFRDRG_S) +#define VFRDRG_F VFRDRG_V(1U) + +#define VFRDEN_S 24 +#define VFRDEN_V(x) ((x) << VFRDEN_S) +#define VFRDEN_F VFRDEN_V(1U) + +#define VFPERREN_S 23 +#define VFPERREN_V(x) ((x) << VFPERREN_S) +#define VFPERREN_F VFPERREN_V(1U) + +#define KEYPERREN_S 22 +#define KEYPERREN_V(x) ((x) << KEYPERREN_S) +#define KEYPERREN_F KEYPERREN_V(1U) + +#define DISABLEVLAN_S 21 +#define DISABLEVLAN_V(x) ((x) << DISABLEVLAN_S) +#define DISABLEVLAN_F DISABLEVLAN_V(1U) + +#define ENABLEUP0_S 20 +#define ENABLEUP0_V(x) ((x) << ENABLEUP0_S) +#define ENABLEUP0_F ENABLEUP0_V(1U) + +#define HASHDELAY_S 16 +#define HASHDELAY_M 0xfU +#define HASHDELAY_V(x) ((x) << HASHDELAY_S) +#define HASHDELAY_G(x) (((x) >> HASHDELAY_S) & HASHDELAY_M) + +#define VFWRADDR_S 8 +#define VFWRADDR_M 0x7fU +#define VFWRADDR_V(x) ((x) << VFWRADDR_S) +#define VFWRADDR_G(x) (((x) >> VFWRADDR_S) & VFWRADDR_M) + +#define KEYMODE_S 6 +#define KEYMODE_M 0x3U +#define KEYMODE_V(x) ((x) << KEYMODE_S) +#define KEYMODE_G(x) (((x) >> KEYMODE_S) & KEYMODE_M) + +#define VFWREN_S 5 +#define VFWREN_V(x) ((x) << VFWREN_S) +#define VFWREN_F VFWREN_V(1U) + +#define KEYWREN_S 4 +#define KEYWREN_V(x) ((x) << KEYWREN_S) +#define KEYWREN_F KEYWREN_V(1U) + +#define KEYWRADDR_S 0 +#define KEYWRADDR_M 0xfU +#define KEYWRADDR_V(x) ((x) << KEYWRADDR_S) +#define KEYWRADDR_G(x) (((x) >> KEYWRADDR_S) & KEYWRADDR_M) + +#define KEYWRADDRX_S 30 +#define KEYWRADDRX_M 0x3U +#define KEYWRADDRX_V(x) ((x) << KEYWRADDRX_S) +#define KEYWRADDRX_G(x) (((x) >> KEYWRADDRX_S) & KEYWRADDRX_M) + +#define KEYEXTEND_S 26 +#define KEYEXTEND_V(x) ((x) << KEYEXTEND_S) +#define KEYEXTEND_F KEYEXTEND_V(1U) + +#define LKPIDXSIZE_S 24 +#define LKPIDXSIZE_M 0x3U +#define LKPIDXSIZE_V(x) ((x) << LKPIDXSIZE_S) +#define LKPIDXSIZE_G(x) (((x) >> LKPIDXSIZE_S) & LKPIDXSIZE_M) + +#define TP_RSS_VFL_CONFIG_A 0x3a +#define TP_RSS_VFH_CONFIG_A 0x3b + +#define ENABLEUDPHASH_S 31 +#define ENABLEUDPHASH_V(x) ((x) << ENABLEUDPHASH_S) +#define ENABLEUDPHASH_F ENABLEUDPHASH_V(1U) + +#define VFUPEN_S 30 +#define VFUPEN_V(x) ((x) << VFUPEN_S) +#define VFUPEN_F VFUPEN_V(1U) + +#define VFVLNEX_S 28 +#define VFVLNEX_V(x) ((x) << VFVLNEX_S) +#define VFVLNEX_F VFVLNEX_V(1U) + +#define VFPRTEN_S 27 +#define VFPRTEN_V(x) ((x) << VFPRTEN_S) +#define VFPRTEN_F VFPRTEN_V(1U) + +#define VFCHNEN_S 26 +#define VFCHNEN_V(x) ((x) << VFCHNEN_S) +#define VFCHNEN_F VFCHNEN_V(1U) + +#define DEFAULTQUEUE_S 16 +#define DEFAULTQUEUE_M 0x3ffU +#define DEFAULTQUEUE_G(x) (((x) >> DEFAULTQUEUE_S) & DEFAULTQUEUE_M) + +#define VFIP6TWOTUPEN_S 6 +#define VFIP6TWOTUPEN_V(x) ((x) << VFIP6TWOTUPEN_S) +#define VFIP6TWOTUPEN_F VFIP6TWOTUPEN_V(1U) + +#define VFIP4FOURTUPEN_S 5 +#define VFIP4FOURTUPEN_V(x) ((x) << VFIP4FOURTUPEN_S) +#define VFIP4FOURTUPEN_F VFIP4FOURTUPEN_V(1U) + +#define VFIP4TWOTUPEN_S 4 +#define VFIP4TWOTUPEN_V(x) ((x) << VFIP4TWOTUPEN_S) +#define VFIP4TWOTUPEN_F VFIP4TWOTUPEN_V(1U) + +#define KEYINDEX_S 0 +#define KEYINDEX_M 0xfU +#define KEYINDEX_G(x) (((x) >> KEYINDEX_S) & KEYINDEX_M) + +#define MAPENABLE_S 31 +#define MAPENABLE_V(x) ((x) << MAPENABLE_S) +#define MAPENABLE_F MAPENABLE_V(1U) + +#define CHNENABLE_S 30 +#define CHNENABLE_V(x) ((x) << CHNENABLE_S) +#define CHNENABLE_F 
CHNENABLE_V(1U) + +#define LE_DB_DBGI_CONFIG_A 0x19cf0 + +#define DBGICMDBUSY_S 3 +#define DBGICMDBUSY_V(x) ((x) << DBGICMDBUSY_S) +#define DBGICMDBUSY_F DBGICMDBUSY_V(1U) + +#define DBGICMDSTRT_S 2 +#define DBGICMDSTRT_V(x) ((x) << DBGICMDSTRT_S) +#define DBGICMDSTRT_F DBGICMDSTRT_V(1U) + +#define DBGICMDMODE_S 0 +#define DBGICMDMODE_M 0x3U +#define DBGICMDMODE_V(x) ((x) << DBGICMDMODE_S) + +#define LE_DB_DBGI_REQ_TCAM_CMD_A 0x19cf4 + +#define DBGICMD_S 20 +#define DBGICMD_M 0xfU +#define DBGICMD_V(x) ((x) << DBGICMD_S) + +#define DBGITID_S 0 +#define DBGITID_M 0xfffffU +#define DBGITID_V(x) ((x) << DBGITID_S) + +#define LE_DB_DBGI_REQ_DATA_A 0x19d00 +#define LE_DB_DBGI_RSP_STATUS_A 0x19d94 + +#define LE_DB_DBGI_RSP_DATA_A 0x19da0 + +#define PRTENABLE_S 29 +#define PRTENABLE_V(x) ((x) << PRTENABLE_S) +#define PRTENABLE_F PRTENABLE_V(1U) + +#define UDPFOURTUPEN_S 28 +#define UDPFOURTUPEN_V(x) ((x) << UDPFOURTUPEN_S) +#define UDPFOURTUPEN_F UDPFOURTUPEN_V(1U) + +#define IP6FOURTUPEN_S 27 +#define IP6FOURTUPEN_V(x) ((x) << IP6FOURTUPEN_S) +#define IP6FOURTUPEN_F IP6FOURTUPEN_V(1U) + +#define IP6TWOTUPEN_S 26 +#define IP6TWOTUPEN_V(x) ((x) << IP6TWOTUPEN_S) +#define IP6TWOTUPEN_F IP6TWOTUPEN_V(1U) + +#define IP4FOURTUPEN_S 25 +#define IP4FOURTUPEN_V(x) ((x) << IP4FOURTUPEN_S) +#define IP4FOURTUPEN_F IP4FOURTUPEN_V(1U) + +#define IP4TWOTUPEN_S 24 +#define IP4TWOTUPEN_V(x) ((x) << IP4TWOTUPEN_S) +#define IP4TWOTUPEN_F IP4TWOTUPEN_V(1U) + +#define IVFWIDTH_S 20 +#define IVFWIDTH_M 0xfU +#define IVFWIDTH_V(x) ((x) << IVFWIDTH_S) +#define IVFWIDTH_G(x) (((x) >> IVFWIDTH_S) & IVFWIDTH_M) + +#define CH1DEFAULTQUEUE_S 10 +#define CH1DEFAULTQUEUE_M 0x3ffU +#define CH1DEFAULTQUEUE_V(x) ((x) << CH1DEFAULTQUEUE_S) +#define CH1DEFAULTQUEUE_G(x) (((x) >> CH1DEFAULTQUEUE_S) & CH1DEFAULTQUEUE_M) + +#define CH0DEFAULTQUEUE_S 0 +#define CH0DEFAULTQUEUE_M 0x3ffU +#define CH0DEFAULTQUEUE_V(x) ((x) << CH0DEFAULTQUEUE_S) +#define CH0DEFAULTQUEUE_G(x) (((x) >> CH0DEFAULTQUEUE_S) & CH0DEFAULTQUEUE_M) + +#define VFLKPIDX_S 8 +#define VFLKPIDX_M 0xffU +#define VFLKPIDX_G(x) (((x) >> VFLKPIDX_S) & VFLKPIDX_M) + +#define T6_VFWRADDR_S 8 +#define T6_VFWRADDR_M 0xffU +#define T6_VFWRADDR_V(x) ((x) << T6_VFWRADDR_S) +#define T6_VFWRADDR_G(x) (((x) >> T6_VFWRADDR_S) & T6_VFWRADDR_M) + +#define TP_RSS_CONFIG_CNG_A 0x7e04 +#define TP_RSS_SECRET_KEY0_A 0x40 +#define TP_RSS_PF0_CONFIG_A 0x30 +#define TP_RSS_PF_MAP_A 0x38 +#define TP_RSS_PF_MSK_A 0x39 + +#define PF1LKPIDX_S 3 + +#define PF0LKPIDX_M 0x7U + +#define PF1MSKSIZE_S 4 +#define PF1MSKSIZE_M 0xfU + +#define CHNCOUNT3_S 31 +#define CHNCOUNT3_V(x) ((x) << CHNCOUNT3_S) +#define CHNCOUNT3_F CHNCOUNT3_V(1U) + +#define CHNCOUNT2_S 30 +#define CHNCOUNT2_V(x) ((x) << CHNCOUNT2_S) +#define CHNCOUNT2_F CHNCOUNT2_V(1U) + +#define CHNCOUNT1_S 29 +#define CHNCOUNT1_V(x) ((x) << CHNCOUNT1_S) +#define CHNCOUNT1_F CHNCOUNT1_V(1U) + +#define CHNCOUNT0_S 28 +#define CHNCOUNT0_V(x) ((x) << CHNCOUNT0_S) +#define CHNCOUNT0_F CHNCOUNT0_V(1U) + +#define CHNUNDFLOW3_S 27 +#define CHNUNDFLOW3_V(x) ((x) << CHNUNDFLOW3_S) +#define CHNUNDFLOW3_F CHNUNDFLOW3_V(1U) + +#define CHNUNDFLOW2_S 26 +#define CHNUNDFLOW2_V(x) ((x) << CHNUNDFLOW2_S) +#define CHNUNDFLOW2_F CHNUNDFLOW2_V(1U) + +#define CHNUNDFLOW1_S 25 +#define CHNUNDFLOW1_V(x) ((x) << CHNUNDFLOW1_S) +#define CHNUNDFLOW1_F CHNUNDFLOW1_V(1U) + +#define CHNUNDFLOW0_S 24 +#define CHNUNDFLOW0_V(x) ((x) << CHNUNDFLOW0_S) +#define CHNUNDFLOW0_F CHNUNDFLOW0_V(1U) + +#define RSTCHN3_S 19 +#define RSTCHN3_V(x) ((x) << RSTCHN3_S) +#define RSTCHN3_F 
RSTCHN3_V(1U) + +#define RSTCHN2_S 18 +#define RSTCHN2_V(x) ((x) << RSTCHN2_S) +#define RSTCHN2_F RSTCHN2_V(1U) + +#define RSTCHN1_S 17 +#define RSTCHN1_V(x) ((x) << RSTCHN1_S) +#define RSTCHN1_F RSTCHN1_V(1U) + +#define RSTCHN0_S 16 +#define RSTCHN0_V(x) ((x) << RSTCHN0_S) +#define RSTCHN0_F RSTCHN0_V(1U) + +#define UPDVLD_S 15 +#define UPDVLD_V(x) ((x) << UPDVLD_S) +#define UPDVLD_F UPDVLD_V(1U) + +#define XOFF_S 14 +#define XOFF_V(x) ((x) << XOFF_S) +#define XOFF_F XOFF_V(1U) + +#define UPDCHN3_S 13 +#define UPDCHN3_V(x) ((x) << UPDCHN3_S) +#define UPDCHN3_F UPDCHN3_V(1U) + +#define UPDCHN2_S 12 +#define UPDCHN2_V(x) ((x) << UPDCHN2_S) +#define UPDCHN2_F UPDCHN2_V(1U) + +#define UPDCHN1_S 11 +#define UPDCHN1_V(x) ((x) << UPDCHN1_S) +#define UPDCHN1_F UPDCHN1_V(1U) + +#define UPDCHN0_S 10 +#define UPDCHN0_V(x) ((x) << UPDCHN0_S) +#define UPDCHN0_F UPDCHN0_V(1U) + +#define QUEUE_S 0 +#define QUEUE_M 0x3ffU +#define QUEUE_V(x) ((x) << QUEUE_S) +#define QUEUE_G(x) (((x) >> QUEUE_S) & QUEUE_M) + +#define MPS_TRC_INT_CAUSE_A 0x985c + +#define MISCPERR_S 8 +#define MISCPERR_V(x) ((x) << MISCPERR_S) +#define MISCPERR_F MISCPERR_V(1U) + +#define PKTFIFO_S 4 +#define PKTFIFO_M 0xfU +#define PKTFIFO_V(x) ((x) << PKTFIFO_S) + +#define FILTMEM_S 0 +#define FILTMEM_M 0xfU +#define FILTMEM_V(x) ((x) << FILTMEM_S) + +#define MPS_CLS_INT_CAUSE_A 0xd028 + +#define HASHSRAM_S 2 +#define HASHSRAM_V(x) ((x) << HASHSRAM_S) +#define HASHSRAM_F HASHSRAM_V(1U) + +#define MATCHTCAM_S 1 +#define MATCHTCAM_V(x) ((x) << MATCHTCAM_S) +#define MATCHTCAM_F MATCHTCAM_V(1U) + +#define MATCHSRAM_S 0 +#define MATCHSRAM_V(x) ((x) << MATCHSRAM_S) +#define MATCHSRAM_F MATCHSRAM_V(1U) + +#define MPS_RX_PG_RSV0_A 0x11010 +#define MPS_RX_PG_RSV4_A 0x11020 +#define MPS_RX_PERR_INT_CAUSE_A 0x11074 +#define MPS_RX_MAC_BG_PG_CNT0_A 0x11208 +#define MPS_RX_LPBK_BG_PG_CNT0_A 0x11218 + +#define MPS_RX_VXLAN_TYPE_A 0x11234 + +#define VXLAN_EN_S 16 +#define VXLAN_EN_V(x) ((x) << VXLAN_EN_S) +#define VXLAN_EN_F VXLAN_EN_V(1U) + +#define VXLAN_S 0 +#define VXLAN_M 0xffffU +#define VXLAN_V(x) ((x) << VXLAN_S) +#define VXLAN_G(x) (((x) >> VXLAN_S) & VXLAN_M) + +#define MPS_RX_GENEVE_TYPE_A 0x11238 + +#define GENEVE_EN_S 16 +#define GENEVE_EN_V(x) ((x) << GENEVE_EN_S) +#define GENEVE_EN_F GENEVE_EN_V(1U) + +#define GENEVE_S 0 +#define GENEVE_M 0xffffU +#define GENEVE_V(x) ((x) << GENEVE_S) +#define GENEVE_G(x) (((x) >> GENEVE_S) & GENEVE_M) + +#define MPS_CLS_TCAM_Y_L_A 0xf000 +#define MPS_CLS_TCAM_DATA0_A 0xf000 +#define MPS_CLS_TCAM_DATA1_A 0xf004 + +#define CTLREQID_S 30 +#define CTLREQID_V(x) ((x) << CTLREQID_S) + +#define MPS_VF_RPLCT_MAP0_A 0x1111c +#define MPS_VF_RPLCT_MAP1_A 0x11120 +#define MPS_VF_RPLCT_MAP2_A 0x11124 +#define MPS_VF_RPLCT_MAP3_A 0x11128 +#define MPS_VF_RPLCT_MAP4_A 0x11300 +#define MPS_VF_RPLCT_MAP5_A 0x11304 +#define MPS_VF_RPLCT_MAP6_A 0x11308 +#define MPS_VF_RPLCT_MAP7_A 0x1130c + +#define VIDL_S 16 +#define VIDL_M 0xffffU +#define VIDL_G(x) (((x) >> VIDL_S) & VIDL_M) + +#define DATALKPTYPE_S 10 +#define DATALKPTYPE_M 0x3U +#define DATALKPTYPE_G(x) (((x) >> DATALKPTYPE_S) & DATALKPTYPE_M) + +#define DATAPORTNUM_S 12 +#define DATAPORTNUM_M 0xfU +#define DATAPORTNUM_V(x) ((x) << DATAPORTNUM_S) +#define DATAPORTNUM_G(x) (((x) >> DATAPORTNUM_S) & DATAPORTNUM_M) + +#define DATALKPTYPE_S 10 +#define DATALKPTYPE_M 0x3U +#define DATALKPTYPE_V(x) ((x) << DATALKPTYPE_S) +#define DATALKPTYPE_G(x) (((x) >> DATALKPTYPE_S) & DATALKPTYPE_M) + +#define DATADIPHIT_S 8 +#define DATADIPHIT_V(x) ((x) << DATADIPHIT_S) +#define 
DATADIPHIT_F DATADIPHIT_V(1U) + +#define DATAVIDH2_S 7 +#define DATAVIDH2_V(x) ((x) << DATAVIDH2_S) +#define DATAVIDH2_F DATAVIDH2_V(1U) + +#define DATAVIDH1_S 0 +#define DATAVIDH1_M 0x7fU +#define DATAVIDH1_G(x) (((x) >> DATAVIDH1_S) & DATAVIDH1_M) + +#define MPS_CLS_TCAM_RDATA0_REQ_ID1_A 0xf020 +#define MPS_CLS_TCAM_RDATA1_REQ_ID1_A 0xf024 +#define MPS_CLS_TCAM_RDATA2_REQ_ID1_A 0xf028 + +#define USED_S 16 +#define USED_M 0x7ffU +#define USED_G(x) (((x) >> USED_S) & USED_M) + +#define ALLOC_S 0 +#define ALLOC_M 0x7ffU +#define ALLOC_G(x) (((x) >> ALLOC_S) & ALLOC_M) + +#define T5_USED_S 16 +#define T5_USED_M 0xfffU +#define T5_USED_G(x) (((x) >> T5_USED_S) & T5_USED_M) + +#define T5_ALLOC_S 0 +#define T5_ALLOC_M 0xfffU +#define T5_ALLOC_G(x) (((x) >> T5_ALLOC_S) & T5_ALLOC_M) + +#define DMACH_S 0 +#define DMACH_M 0xffffU +#define DMACH_G(x) (((x) >> DMACH_S) & DMACH_M) + +#define MPS_CLS_TCAM_X_L_A 0xf008 +#define MPS_CLS_TCAM_DATA2_CTL_A 0xf008 + +#define CTLCMDTYPE_S 31 +#define CTLCMDTYPE_V(x) ((x) << CTLCMDTYPE_S) +#define CTLCMDTYPE_F CTLCMDTYPE_V(1U) + +#define CTLTCAMSEL_S 25 +#define CTLTCAMSEL_V(x) ((x) << CTLTCAMSEL_S) + +#define CTLTCAMINDEX_S 17 +#define CTLTCAMINDEX_V(x) ((x) << CTLTCAMINDEX_S) + +#define CTLXYBITSEL_S 16 +#define CTLXYBITSEL_V(x) ((x) << CTLXYBITSEL_S) + +#define MPS_CLS_TCAM_Y_L(idx) (MPS_CLS_TCAM_Y_L_A + (idx) * 16) +#define NUM_MPS_CLS_TCAM_Y_L_INSTANCES 512 + +#define MPS_CLS_TCAM_X_L(idx) (MPS_CLS_TCAM_X_L_A + (idx) * 16) +#define NUM_MPS_CLS_TCAM_X_L_INSTANCES 512 + +#define MPS_CLS_SRAM_L_A 0xe000 + +#define T6_MULTILISTEN0_S 26 + +#define T6_SRAM_PRIO3_S 23 +#define T6_SRAM_PRIO3_M 0x7U +#define T6_SRAM_PRIO3_G(x) (((x) >> T6_SRAM_PRIO3_S) & T6_SRAM_PRIO3_M) + +#define T6_SRAM_PRIO2_S 20 +#define T6_SRAM_PRIO2_M 0x7U +#define T6_SRAM_PRIO2_G(x) (((x) >> T6_SRAM_PRIO2_S) & T6_SRAM_PRIO2_M) + +#define T6_SRAM_PRIO1_S 17 +#define T6_SRAM_PRIO1_M 0x7U +#define T6_SRAM_PRIO1_G(x) (((x) >> T6_SRAM_PRIO1_S) & T6_SRAM_PRIO1_M) + +#define T6_SRAM_PRIO0_S 14 +#define T6_SRAM_PRIO0_M 0x7U +#define T6_SRAM_PRIO0_G(x) (((x) >> T6_SRAM_PRIO0_S) & T6_SRAM_PRIO0_M) + +#define T6_SRAM_VLD_S 13 +#define T6_SRAM_VLD_V(x) ((x) << T6_SRAM_VLD_S) +#define T6_SRAM_VLD_F T6_SRAM_VLD_V(1U) + +#define T6_REPLICATE_S 12 +#define T6_REPLICATE_V(x) ((x) << T6_REPLICATE_S) +#define T6_REPLICATE_F T6_REPLICATE_V(1U) + +#define T6_PF_S 9 +#define T6_PF_M 0x7U +#define T6_PF_G(x) (((x) >> T6_PF_S) & T6_PF_M) + +#define T6_VF_VALID_S 8 +#define T6_VF_VALID_V(x) ((x) << T6_VF_VALID_S) +#define T6_VF_VALID_F T6_VF_VALID_V(1U) + +#define T6_VF_S 0 +#define T6_VF_M 0xffU +#define T6_VF_G(x) (((x) >> T6_VF_S) & T6_VF_M) + +#define MPS_CLS_SRAM_H_A 0xe004 + +#define MPS_CLS_SRAM_L(idx) (MPS_CLS_SRAM_L_A + (idx) * 8) +#define NUM_MPS_CLS_SRAM_L_INSTANCES 336 + +#define MPS_CLS_SRAM_H(idx) (MPS_CLS_SRAM_H_A + (idx) * 8) +#define NUM_MPS_CLS_SRAM_H_INSTANCES 336 + +#define MULTILISTEN0_S 25 + +#define REPLICATE_S 11 +#define REPLICATE_V(x) ((x) << REPLICATE_S) +#define REPLICATE_F REPLICATE_V(1U) + +#define PF_S 8 +#define PF_M 0x7U +#define PF_G(x) (((x) >> PF_S) & PF_M) + +#define VF_VALID_S 7 +#define VF_VALID_V(x) ((x) << VF_VALID_S) +#define VF_VALID_F VF_VALID_V(1U) + +#define VF_S 0 +#define VF_M 0x7fU +#define VF_G(x) (((x) >> VF_S) & VF_M) + +#define SRAM_PRIO3_S 22 +#define SRAM_PRIO3_M 0x7U +#define SRAM_PRIO3_G(x) (((x) >> SRAM_PRIO3_S) & SRAM_PRIO3_M) + +#define SRAM_PRIO2_S 19 +#define SRAM_PRIO2_M 0x7U +#define SRAM_PRIO2_G(x) (((x) >> SRAM_PRIO2_S) & SRAM_PRIO2_M) + +#define 
SRAM_PRIO1_S 16 +#define SRAM_PRIO1_M 0x7U +#define SRAM_PRIO1_G(x) (((x) >> SRAM_PRIO1_S) & SRAM_PRIO1_M) + +#define SRAM_PRIO0_S 13 +#define SRAM_PRIO0_M 0x7U +#define SRAM_PRIO0_G(x) (((x) >> SRAM_PRIO0_S) & SRAM_PRIO0_M) + +#define SRAM_VLD_S 12 +#define SRAM_VLD_V(x) ((x) << SRAM_VLD_S) +#define SRAM_VLD_F SRAM_VLD_V(1U) + +#define PORTMAP_S 0 +#define PORTMAP_M 0xfU +#define PORTMAP_G(x) (((x) >> PORTMAP_S) & PORTMAP_M) + +#define CPL_INTR_CAUSE_A 0x19054 + +#define CIM_OP_MAP_PERR_S 5 +#define CIM_OP_MAP_PERR_V(x) ((x) << CIM_OP_MAP_PERR_S) +#define CIM_OP_MAP_PERR_F CIM_OP_MAP_PERR_V(1U) + +#define CIM_OVFL_ERROR_S 4 +#define CIM_OVFL_ERROR_V(x) ((x) << CIM_OVFL_ERROR_S) +#define CIM_OVFL_ERROR_F CIM_OVFL_ERROR_V(1U) + +#define TP_FRAMING_ERROR_S 3 +#define TP_FRAMING_ERROR_V(x) ((x) << TP_FRAMING_ERROR_S) +#define TP_FRAMING_ERROR_F TP_FRAMING_ERROR_V(1U) + +#define SGE_FRAMING_ERROR_S 2 +#define SGE_FRAMING_ERROR_V(x) ((x) << SGE_FRAMING_ERROR_S) +#define SGE_FRAMING_ERROR_F SGE_FRAMING_ERROR_V(1U) + +#define CIM_FRAMING_ERROR_S 1 +#define CIM_FRAMING_ERROR_V(x) ((x) << CIM_FRAMING_ERROR_S) +#define CIM_FRAMING_ERROR_F CIM_FRAMING_ERROR_V(1U) + +#define ZERO_SWITCH_ERROR_S 0 +#define ZERO_SWITCH_ERROR_V(x) ((x) << ZERO_SWITCH_ERROR_S) +#define ZERO_SWITCH_ERROR_F ZERO_SWITCH_ERROR_V(1U) + +#define SMB_INT_CAUSE_A 0x19090 + +#define MSTTXFIFOPARINT_S 21 +#define MSTTXFIFOPARINT_V(x) ((x) << MSTTXFIFOPARINT_S) +#define MSTTXFIFOPARINT_F MSTTXFIFOPARINT_V(1U) + +#define MSTRXFIFOPARINT_S 20 +#define MSTRXFIFOPARINT_V(x) ((x) << MSTRXFIFOPARINT_S) +#define MSTRXFIFOPARINT_F MSTRXFIFOPARINT_V(1U) + +#define SLVFIFOPARINT_S 19 +#define SLVFIFOPARINT_V(x) ((x) << SLVFIFOPARINT_S) +#define SLVFIFOPARINT_F SLVFIFOPARINT_V(1U) + +#define ULP_RX_INT_CAUSE_A 0x19158 +#define ULP_RX_ISCSI_LLIMIT_A 0x1915c +#define ULP_RX_ISCSI_ULIMIT_A 0x19160 +#define ULP_RX_ISCSI_TAGMASK_A 0x19164 +#define ULP_RX_ISCSI_PSZ_A 0x19168 +#define ULP_RX_TDDP_LLIMIT_A 0x1916c +#define ULP_RX_TDDP_ULIMIT_A 0x19170 +#define ULP_RX_STAG_LLIMIT_A 0x1917c +#define ULP_RX_STAG_ULIMIT_A 0x19180 +#define ULP_RX_RQ_LLIMIT_A 0x19184 +#define ULP_RX_RQ_ULIMIT_A 0x19188 +#define ULP_RX_PBL_LLIMIT_A 0x1918c +#define ULP_RX_PBL_ULIMIT_A 0x19190 +#define ULP_RX_CTX_BASE_A 0x19194 +#define ULP_RX_RQUDP_LLIMIT_A 0x191a4 +#define ULP_RX_RQUDP_ULIMIT_A 0x191a8 +#define ULP_RX_LA_CTL_A 0x1923c +#define ULP_RX_LA_RDPTR_A 0x19240 +#define ULP_RX_LA_RDDATA_A 0x19244 +#define ULP_RX_LA_WRPTR_A 0x19248 +#define ULP_RX_TLS_KEY_LLIMIT_A 0x192ac +#define ULP_RX_TLS_KEY_ULIMIT_A 0x192b0 + +#define HPZ3_S 24 +#define HPZ3_V(x) ((x) << HPZ3_S) + +#define HPZ2_S 16 +#define HPZ2_V(x) ((x) << HPZ2_S) + +#define HPZ1_S 8 +#define HPZ1_V(x) ((x) << HPZ1_S) + +#define HPZ0_S 0 +#define HPZ0_V(x) ((x) << HPZ0_S) + +#define ULP_RX_TDDP_PSZ_A 0x19178 + +/* registers for module SF */ +#define SF_DATA_A 0x193f8 +#define SF_OP_A 0x193fc + +#define SF_BUSY_S 31 +#define SF_BUSY_V(x) ((x) << SF_BUSY_S) +#define SF_BUSY_F SF_BUSY_V(1U) + +#define SF_LOCK_S 4 +#define SF_LOCK_V(x) ((x) << SF_LOCK_S) +#define SF_LOCK_F SF_LOCK_V(1U) + +#define SF_CONT_S 3 +#define SF_CONT_V(x) ((x) << SF_CONT_S) +#define SF_CONT_F SF_CONT_V(1U) + +#define BYTECNT_S 1 +#define BYTECNT_V(x) ((x) << BYTECNT_S) + +#define OP_S 0 +#define OP_V(x) ((x) << OP_S) +#define OP_F OP_V(1U) + +#define PL_PF_INT_CAUSE_A 0x3c0 + +#define PFSW_S 3 +#define PFSW_V(x) ((x) << PFSW_S) +#define PFSW_F PFSW_V(1U) + +#define PFCIM_S 1 +#define PFCIM_V(x) ((x) << PFCIM_S) +#define PFCIM_F 
PFCIM_V(1U) + +#define PL_PF_INT_ENABLE_A 0x3c4 +#define PL_PF_CTL_A 0x3c8 + +#define PL_WHOAMI_A 0x19400 + +#define SOURCEPF_S 8 +#define SOURCEPF_M 0x7U +#define SOURCEPF_G(x) (((x) >> SOURCEPF_S) & SOURCEPF_M) + +#define T6_SOURCEPF_S 9 +#define T6_SOURCEPF_M 0x7U +#define T6_SOURCEPF_G(x) (((x) >> T6_SOURCEPF_S) & T6_SOURCEPF_M) + +#define PL_INT_CAUSE_A 0x1940c + +#define ULP_TX_S 27 +#define ULP_TX_V(x) ((x) << ULP_TX_S) +#define ULP_TX_F ULP_TX_V(1U) + +#define SGE_S 26 +#define SGE_V(x) ((x) << SGE_S) +#define SGE_F SGE_V(1U) + +#define CPL_SWITCH_S 24 +#define CPL_SWITCH_V(x) ((x) << CPL_SWITCH_S) +#define CPL_SWITCH_F CPL_SWITCH_V(1U) + +#define ULP_RX_S 23 +#define ULP_RX_V(x) ((x) << ULP_RX_S) +#define ULP_RX_F ULP_RX_V(1U) + +#define PM_RX_S 22 +#define PM_RX_V(x) ((x) << PM_RX_S) +#define PM_RX_F PM_RX_V(1U) + +#define PM_TX_S 21 +#define PM_TX_V(x) ((x) << PM_TX_S) +#define PM_TX_F PM_TX_V(1U) + +#define MA_S 20 +#define MA_V(x) ((x) << MA_S) +#define MA_F MA_V(1U) + +#define TP_S 19 +#define TP_V(x) ((x) << TP_S) +#define TP_F TP_V(1U) + +#define LE_S 18 +#define LE_V(x) ((x) << LE_S) +#define LE_F LE_V(1U) + +#define EDC1_S 17 +#define EDC1_V(x) ((x) << EDC1_S) +#define EDC1_F EDC1_V(1U) + +#define EDC0_S 16 +#define EDC0_V(x) ((x) << EDC0_S) +#define EDC0_F EDC0_V(1U) + +#define MC_S 15 +#define MC_V(x) ((x) << MC_S) +#define MC_F MC_V(1U) + +#define PCIE_S 14 +#define PCIE_V(x) ((x) << PCIE_S) +#define PCIE_F PCIE_V(1U) + +#define XGMAC_KR1_S 12 +#define XGMAC_KR1_V(x) ((x) << XGMAC_KR1_S) +#define XGMAC_KR1_F XGMAC_KR1_V(1U) + +#define XGMAC_KR0_S 11 +#define XGMAC_KR0_V(x) ((x) << XGMAC_KR0_S) +#define XGMAC_KR0_F XGMAC_KR0_V(1U) + +#define XGMAC1_S 10 +#define XGMAC1_V(x) ((x) << XGMAC1_S) +#define XGMAC1_F XGMAC1_V(1U) + +#define XGMAC0_S 9 +#define XGMAC0_V(x) ((x) << XGMAC0_S) +#define XGMAC0_F XGMAC0_V(1U) + +#define SMB_S 8 +#define SMB_V(x) ((x) << SMB_S) +#define SMB_F SMB_V(1U) + +#define SF_S 7 +#define SF_V(x) ((x) << SF_S) +#define SF_F SF_V(1U) + +#define PL_S 6 +#define PL_V(x) ((x) << PL_S) +#define PL_F PL_V(1U) + +#define NCSI_S 5 +#define NCSI_V(x) ((x) << NCSI_S) +#define NCSI_F NCSI_V(1U) + +#define MPS_S 4 +#define MPS_V(x) ((x) << MPS_S) +#define MPS_F MPS_V(1U) + +#define CIM_S 0 +#define CIM_V(x) ((x) << CIM_S) +#define CIM_F CIM_V(1U) + +#define MC1_S 31 +#define MC1_V(x) ((x) << MC1_S) +#define MC1_F MC1_V(1U) + +#define PL_INT_ENABLE_A 0x19410 +#define PL_INT_MAP0_A 0x19414 +#define PL_RST_A 0x19428 + +#define PIORST_S 1 +#define PIORST_V(x) ((x) << PIORST_S) +#define PIORST_F PIORST_V(1U) + +#define PIORSTMODE_S 0 +#define PIORSTMODE_V(x) ((x) << PIORSTMODE_S) +#define PIORSTMODE_F PIORSTMODE_V(1U) + +#define PL_PL_INT_CAUSE_A 0x19430 + +#define FATALPERR_S 4 +#define FATALPERR_V(x) ((x) << FATALPERR_S) +#define FATALPERR_F FATALPERR_V(1U) + +#define PERRVFID_S 0 +#define PERRVFID_V(x) ((x) << PERRVFID_S) +#define PERRVFID_F PERRVFID_V(1U) + +#define PL_REV_A 0x1943c + +#define REV_S 0 +#define REV_M 0xfU +#define REV_V(x) ((x) << REV_S) +#define REV_G(x) (((x) >> REV_S) & REV_M) + +#define T6_UNKNOWNCMD_S 3 +#define T6_UNKNOWNCMD_V(x) ((x) << T6_UNKNOWNCMD_S) +#define T6_UNKNOWNCMD_F T6_UNKNOWNCMD_V(1U) + +#define T6_LIP0_S 2 +#define T6_LIP0_V(x) ((x) << T6_LIP0_S) +#define T6_LIP0_F T6_LIP0_V(1U) + +#define T6_LIPMISS_S 1 +#define T6_LIPMISS_V(x) ((x) << T6_LIPMISS_S) +#define T6_LIPMISS_F T6_LIPMISS_V(1U) + +#define LE_DB_CONFIG_A 0x19c04 +#define LE_DB_ROUTING_TABLE_INDEX_A 0x19c10 +#define LE_DB_ACTIVE_TABLE_START_INDEX_A 0x19c10 
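As a quick illustration of the _S/_M/_V/_F/_G register-field convention used throughout this header, the sketch below shows reading the sourcing PF out of a PL_WHOAMI_A value and composing a PL_RST_A write. This is purely illustrative and not part of the diff: the register value is invented, and the few macros it needs are re-stated so it compiles on its own.

/*
 * Minimal usage sketch for the _S/_M/_V/_F/_G helpers above. The
 * "whoami" value is an invented stand-in for a real PL_WHOAMI_A read.
 */
#include <stdio.h>

#define SOURCEPF_S	8
#define SOURCEPF_M	0x7U
#define SOURCEPF_G(x)	(((x) >> SOURCEPF_S) & SOURCEPF_M)

#define PIORST_S	1
#define PIORST_V(x)	((x) << PIORST_S)
#define PIORST_F	PIORST_V(1U)

#define PIORSTMODE_S	0
#define PIORSTMODE_V(x)	((x) << PIORSTMODE_S)
#define PIORSTMODE_F	PIORSTMODE_V(1U)

int main(void)
{
	unsigned int whoami = 0x00000400;	/* pretend PL_WHOAMI_A read */
	unsigned int rst;

	/* _G(): shift a multi-bit field down and mask it to its width */
	printf("sourcing PF = %u\n", SOURCEPF_G(whoami));	/* prints 4 */

	/* _F: one-bit fields come pre-shifted and OR together into a
	 * value that would be written to PL_RST_A */
	rst = PIORSTMODE_F | PIORST_F;
	printf("PL_RST value = %#x\n", rst);			/* prints 0x3 */
	return 0;
}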
+#define LE_DB_FILTER_TABLE_INDEX_A 0x19c14 +#define LE_DB_SERVER_INDEX_A 0x19c18 +#define LE_DB_SRVR_START_INDEX_A 0x19c18 +#define LE_DB_CLIP_TABLE_INDEX_A 0x19c1c +#define LE_DB_ACT_CNT_IPV4_A 0x19c20 +#define LE_DB_ACT_CNT_IPV6_A 0x19c24 +#define LE_DB_HASH_CONFIG_A 0x19c28 + +#define HASHTIDSIZE_S 16 +#define HASHTIDSIZE_M 0x3fU +#define HASHTIDSIZE_G(x) (((x) >> HASHTIDSIZE_S) & HASHTIDSIZE_M) + +#define LE_DB_HASH_TID_BASE_A 0x19c30 +#define LE_DB_HASH_TBL_BASE_ADDR_A 0x19c30 +#define LE_DB_INT_CAUSE_A 0x19c3c +#define LE_DB_TID_HASHBASE_A 0x19df8 +#define T6_LE_DB_HASH_TID_BASE_A 0x19df8 + +#define HASHEN_S 20 +#define HASHEN_V(x) ((x) << HASHEN_S) +#define HASHEN_F HASHEN_V(1U) + +#define ASLIPCOMPEN_S 17 +#define ASLIPCOMPEN_V(x) ((x) << ASLIPCOMPEN_S) +#define ASLIPCOMPEN_F ASLIPCOMPEN_V(1U) + +#define REQQPARERR_S 16 +#define REQQPARERR_V(x) ((x) << REQQPARERR_S) +#define REQQPARERR_F REQQPARERR_V(1U) + +#define UNKNOWNCMD_S 15 +#define UNKNOWNCMD_V(x) ((x) << UNKNOWNCMD_S) +#define UNKNOWNCMD_F UNKNOWNCMD_V(1U) + +#define PARITYERR_S 6 +#define PARITYERR_V(x) ((x) << PARITYERR_S) +#define PARITYERR_F PARITYERR_V(1U) + +#define LIPMISS_S 5 +#define LIPMISS_V(x) ((x) << LIPMISS_S) +#define LIPMISS_F LIPMISS_V(1U) + +#define LIP0_S 4 +#define LIP0_V(x) ((x) << LIP0_S) +#define LIP0_F LIP0_V(1U) + +#define BASEADDR_S 3 +#define BASEADDR_M 0x1fffffffU +#define BASEADDR_G(x) (((x) >> BASEADDR_S) & BASEADDR_M) + +#define TCAMINTPERR_S 13 +#define TCAMINTPERR_V(x) ((x) << TCAMINTPERR_S) +#define TCAMINTPERR_F TCAMINTPERR_V(1U) + +#define SSRAMINTPERR_S 10 +#define SSRAMINTPERR_V(x) ((x) << SSRAMINTPERR_S) +#define SSRAMINTPERR_F SSRAMINTPERR_V(1U) + +#define LE_DB_RSP_CODE_0_A 0x19c74 + +#define TCAM_ACTV_HIT_S 0 +#define TCAM_ACTV_HIT_M 0x1fU +#define TCAM_ACTV_HIT_V(x) ((x) << TCAM_ACTV_HIT_S) +#define TCAM_ACTV_HIT_G(x) (((x) >> TCAM_ACTV_HIT_S) & TCAM_ACTV_HIT_M) + +#define LE_DB_RSP_CODE_1_A 0x19c78 + +#define HASH_ACTV_HIT_S 25 +#define HASH_ACTV_HIT_M 0x1fU +#define HASH_ACTV_HIT_V(x) ((x) << HASH_ACTV_HIT_S) +#define HASH_ACTV_HIT_G(x) (((x) >> HASH_ACTV_HIT_S) & HASH_ACTV_HIT_M) + +#define LE_3_DB_HASH_MASK_GEN_IPV4_T6_A 0x19eac +#define LE_4_DB_HASH_MASK_GEN_IPV4_T6_A 0x19eb0 + +#define NCSI_INT_CAUSE_A 0x1a0d8 + +#define CIM_DM_PRTY_ERR_S 8 +#define CIM_DM_PRTY_ERR_V(x) ((x) << CIM_DM_PRTY_ERR_S) +#define CIM_DM_PRTY_ERR_F CIM_DM_PRTY_ERR_V(1U) + +#define MPS_DM_PRTY_ERR_S 7 +#define MPS_DM_PRTY_ERR_V(x) ((x) << MPS_DM_PRTY_ERR_S) +#define MPS_DM_PRTY_ERR_F MPS_DM_PRTY_ERR_V(1U) + +#define TXFIFO_PRTY_ERR_S 1 +#define TXFIFO_PRTY_ERR_V(x) ((x) << TXFIFO_PRTY_ERR_S) +#define TXFIFO_PRTY_ERR_F TXFIFO_PRTY_ERR_V(1U) + +#define RXFIFO_PRTY_ERR_S 0 +#define RXFIFO_PRTY_ERR_V(x) ((x) << RXFIFO_PRTY_ERR_S) +#define RXFIFO_PRTY_ERR_F RXFIFO_PRTY_ERR_V(1U) + +#define XGMAC_PORT_CFG2_A 0x1018 + +#define PATEN_S 18 +#define PATEN_V(x) ((x) << PATEN_S) +#define PATEN_F PATEN_V(1U) + +#define MAGICEN_S 17 +#define MAGICEN_V(x) ((x) << MAGICEN_S) +#define MAGICEN_F MAGICEN_V(1U) + +#define XGMAC_PORT_MAGIC_MACID_LO 0x1024 +#define XGMAC_PORT_MAGIC_MACID_HI 0x1028 + +#define XGMAC_PORT_EPIO_DATA0_A 0x10c0 +#define XGMAC_PORT_EPIO_DATA1_A 0x10c4 +#define XGMAC_PORT_EPIO_DATA2_A 0x10c8 +#define XGMAC_PORT_EPIO_DATA3_A 0x10cc +#define XGMAC_PORT_EPIO_OP_A 0x10d0 + +#define EPIOWR_S 8 +#define EPIOWR_V(x) ((x) << EPIOWR_S) +#define EPIOWR_F EPIOWR_V(1U) + +#define ADDRESS_S 0 +#define ADDRESS_V(x) ((x) << ADDRESS_S) + +#define MAC_PORT_INT_CAUSE_A 0x8dc +#define XGMAC_PORT_INT_CAUSE_A 0x10dc 
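Multi-bit fields follow the same pattern; the usual idiom is to build the field mask as _V(_M) and do a read-modify-write. A minimal sketch, again not part of the diff, using HASH_ACTV_HIT only because it defines both _V and _M nearby; the register image is invented and the field choice is arbitrary, the point is the mask idiom.

#include <stdio.h>

#define HASH_ACTV_HIT_S		25
#define HASH_ACTV_HIT_M		0x1fU
#define HASH_ACTV_HIT_V(x)	((x) << HASH_ACTV_HIT_S)
#define HASH_ACTV_HIT_G(x)	(((x) >> HASH_ACTV_HIT_S) & HASH_ACTV_HIT_M)

/* Replace one field inside a 32-bit register image: clear the bits the
 * field occupies (mask = _V(_M)), then OR in the new shifted value. */
static unsigned int set_field(unsigned int reg, unsigned int mask,
			      unsigned int val)
{
	return (reg & ~mask) | val;
}

int main(void)
{
	unsigned int reg = 0xdeadbeef;	/* invented register image */

	reg = set_field(reg, HASH_ACTV_HIT_V(HASH_ACTV_HIT_M),
			HASH_ACTV_HIT_V(0x11));
	printf("HASH_ACTV_HIT = %#x\n", HASH_ACTV_HIT_G(reg));	/* 0x11 */
	return 0;
}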
+ +#define TP_TX_MOD_QUEUE_REQ_MAP_A 0x7e28 + +#define TP_TX_MOD_QUEUE_WEIGHT0_A 0x7e30 +#define TP_TX_MOD_CHANNEL_WEIGHT_A 0x7e34 + +#define TX_MOD_QUEUE_REQ_MAP_S 0 +#define TX_MOD_QUEUE_REQ_MAP_V(x) ((x) << TX_MOD_QUEUE_REQ_MAP_S) + +#define TX_MODQ_WEIGHT3_S 24 +#define TX_MODQ_WEIGHT3_V(x) ((x) << TX_MODQ_WEIGHT3_S) + +#define TX_MODQ_WEIGHT2_S 16 +#define TX_MODQ_WEIGHT2_V(x) ((x) << TX_MODQ_WEIGHT2_S) + +#define TX_MODQ_WEIGHT1_S 8 +#define TX_MODQ_WEIGHT1_V(x) ((x) << TX_MODQ_WEIGHT1_S) + +#define TX_MODQ_WEIGHT0_S 0 +#define TX_MODQ_WEIGHT0_V(x) ((x) << TX_MODQ_WEIGHT0_S) + +#define TP_TX_SCHED_HDR_A 0x23 +#define TP_TX_SCHED_FIFO_A 0x24 +#define TP_TX_SCHED_PCMD_A 0x25 + +#define NUM_MPS_CLS_SRAM_L_INSTANCES 336 +#define NUM_MPS_T5_CLS_SRAM_L_INSTANCES 512 + +#define T5_PORT0_BASE 0x30000 +#define T5_PORT_STRIDE 0x4000 +#define T5_PORT_BASE(idx) (T5_PORT0_BASE + (idx) * T5_PORT_STRIDE) +#define T5_PORT_REG(idx, reg) (T5_PORT_BASE(idx) + (reg)) + +#define MC_0_BASE_ADDR 0x40000 +#define MC_1_BASE_ADDR 0x48000 +#define MC_STRIDE (MC_1_BASE_ADDR - MC_0_BASE_ADDR) +#define MC_REG(reg, idx) (reg + MC_STRIDE * idx) + +#define MC_P_BIST_CMD_A 0x41400 +#define MC_P_BIST_CMD_ADDR_A 0x41404 +#define MC_P_BIST_CMD_LEN_A 0x41408 +#define MC_P_BIST_DATA_PATTERN_A 0x4140c +#define MC_P_BIST_STATUS_RDATA_A 0x41488 + +#define EDC_T50_BASE_ADDR 0x50000 + +#define EDC_H_BIST_CMD_A 0x50004 +#define EDC_H_BIST_CMD_ADDR_A 0x50008 +#define EDC_H_BIST_CMD_LEN_A 0x5000c +#define EDC_H_BIST_DATA_PATTERN_A 0x50010 +#define EDC_H_BIST_STATUS_RDATA_A 0x50028 + +#define EDC_H_ECC_ERR_ADDR_A 0x50084 +#define EDC_T51_BASE_ADDR 0x50800 + +#define EDC_T5_STRIDE (EDC_T51_BASE_ADDR - EDC_T50_BASE_ADDR) +#define EDC_T5_REG(reg, idx) (reg + EDC_T5_STRIDE * idx) + +#define PL_VF_REV_A 0x4 +#define PL_VF_WHOAMI_A 0x0 +#define PL_VF_REVISION_A 0x8 + +/* registers for module CIM */ +#define CIM_HOST_ACC_CTRL_A 0x7b50 +#define CIM_HOST_ACC_DATA_A 0x7b54 +#define UP_UP_DBG_LA_CFG_A 0x140 +#define UP_UP_DBG_LA_DATA_A 0x144 + +#define HOSTBUSY_S 17 +#define HOSTBUSY_V(x) ((x) << HOSTBUSY_S) +#define HOSTBUSY_F HOSTBUSY_V(1U) + +#define HOSTWRITE_S 16 +#define HOSTWRITE_V(x) ((x) << HOSTWRITE_S) +#define HOSTWRITE_F HOSTWRITE_V(1U) + +#define CIM_IBQ_DBG_CFG_A 0x7b60 + +#define IBQDBGADDR_S 16 +#define IBQDBGADDR_M 0xfffU +#define IBQDBGADDR_V(x) ((x) << IBQDBGADDR_S) +#define IBQDBGADDR_G(x) (((x) >> IBQDBGADDR_S) & IBQDBGADDR_M) + +#define IBQDBGBUSY_S 1 +#define IBQDBGBUSY_V(x) ((x) << IBQDBGBUSY_S) +#define IBQDBGBUSY_F IBQDBGBUSY_V(1U) + +#define IBQDBGEN_S 0 +#define IBQDBGEN_V(x) ((x) << IBQDBGEN_S) +#define IBQDBGEN_F IBQDBGEN_V(1U) + +#define CIM_OBQ_DBG_CFG_A 0x7b64 + +#define OBQDBGADDR_S 16 +#define OBQDBGADDR_M 0xfffU +#define OBQDBGADDR_V(x) ((x) << OBQDBGADDR_S) +#define OBQDBGADDR_G(x) (((x) >> OBQDBGADDR_S) & OBQDBGADDR_M) + +#define OBQDBGBUSY_S 1 +#define OBQDBGBUSY_V(x) ((x) << OBQDBGBUSY_S) +#define OBQDBGBUSY_F OBQDBGBUSY_V(1U) + +#define OBQDBGEN_S 0 +#define OBQDBGEN_V(x) ((x) << OBQDBGEN_S) +#define OBQDBGEN_F OBQDBGEN_V(1U) + +#define CIM_IBQ_DBG_DATA_A 0x7b68 +#define CIM_OBQ_DBG_DATA_A 0x7b6c +#define CIM_DEBUGCFG_A 0x7b70 +#define CIM_DEBUGSTS_A 0x7b74 + +#define POLADBGRDPTR_S 23 +#define POLADBGRDPTR_M 0x1ffU +#define POLADBGRDPTR_V(x) ((x) << POLADBGRDPTR_S) + +#define POLADBGWRPTR_S 16 +#define POLADBGWRPTR_M 0x1ffU +#define POLADBGWRPTR_G(x) (((x) >> POLADBGWRPTR_S) & POLADBGWRPTR_M) + +#define PILADBGRDPTR_S 14 +#define PILADBGRDPTR_M 0x1ffU +#define PILADBGRDPTR_V(x) ((x) << 
PILADBGRDPTR_S) + +#define PILADBGWRPTR_S 0 +#define PILADBGWRPTR_M 0x1ffU +#define PILADBGWRPTR_G(x) (((x) >> PILADBGWRPTR_S) & PILADBGWRPTR_M) + +#define LADBGEN_S 12 +#define LADBGEN_V(x) ((x) << LADBGEN_S) +#define LADBGEN_F LADBGEN_V(1U) + +#define CIM_PO_LA_DEBUGDATA_A 0x7b78 +#define CIM_PI_LA_DEBUGDATA_A 0x7b7c +#define CIM_PO_LA_MADEBUGDATA_A 0x7b80 +#define CIM_PI_LA_MADEBUGDATA_A 0x7b84 + +#define UPDBGLARDEN_S 1 +#define UPDBGLARDEN_V(x) ((x) << UPDBGLARDEN_S) +#define UPDBGLARDEN_F UPDBGLARDEN_V(1U) + +#define UPDBGLAEN_S 0 +#define UPDBGLAEN_V(x) ((x) << UPDBGLAEN_S) +#define UPDBGLAEN_F UPDBGLAEN_V(1U) + +#define UPDBGLARDPTR_S 2 +#define UPDBGLARDPTR_M 0xfffU +#define UPDBGLARDPTR_V(x) ((x) << UPDBGLARDPTR_S) + +#define UPDBGLAWRPTR_S 16 +#define UPDBGLAWRPTR_M 0xfffU +#define UPDBGLAWRPTR_G(x) (((x) >> UPDBGLAWRPTR_S) & UPDBGLAWRPTR_M) + +#define UPDBGLACAPTPCONLY_S 30 +#define UPDBGLACAPTPCONLY_V(x) ((x) << UPDBGLACAPTPCONLY_S) +#define UPDBGLACAPTPCONLY_F UPDBGLACAPTPCONLY_V(1U) + +#define CIM_QUEUE_CONFIG_REF_A 0x7b48 +#define CIM_QUEUE_CONFIG_CTRL_A 0x7b4c + +#define CIMQSIZE_S 24 +#define CIMQSIZE_M 0x3fU +#define CIMQSIZE_G(x) (((x) >> CIMQSIZE_S) & CIMQSIZE_M) + +#define CIMQBASE_S 16 +#define CIMQBASE_M 0x3fU +#define CIMQBASE_G(x) (((x) >> CIMQBASE_S) & CIMQBASE_M) + +#define QUEFULLTHRSH_S 0 +#define QUEFULLTHRSH_M 0x1ffU +#define QUEFULLTHRSH_G(x) (((x) >> QUEFULLTHRSH_S) & QUEFULLTHRSH_M) + +#define UP_IBQ_0_RDADDR_A 0x10 +#define UP_IBQ_0_SHADOW_RDADDR_A 0x280 +#define UP_OBQ_0_REALADDR_A 0x104 +#define UP_OBQ_0_SHADOW_REALADDR_A 0x394 + +#define IBQRDADDR_S 0 +#define IBQRDADDR_M 0x1fffU +#define IBQRDADDR_G(x) (((x) >> IBQRDADDR_S) & IBQRDADDR_M) + +#define IBQWRADDR_S 0 +#define IBQWRADDR_M 0x1fffU +#define IBQWRADDR_G(x) (((x) >> IBQWRADDR_S) & IBQWRADDR_M) + +#define QUERDADDR_S 0 +#define QUERDADDR_M 0x7fffU +#define QUERDADDR_G(x) (((x) >> QUERDADDR_S) & QUERDADDR_M) + +#define QUEREMFLITS_S 0 +#define QUEREMFLITS_M 0x7ffU +#define QUEREMFLITS_G(x) (((x) >> QUEREMFLITS_S) & QUEREMFLITS_M) + +#define QUEEOPCNT_S 16 +#define QUEEOPCNT_M 0xfffU +#define QUEEOPCNT_G(x) (((x) >> QUEEOPCNT_S) & QUEEOPCNT_M) + +#define QUESOPCNT_S 0 +#define QUESOPCNT_M 0xfffU +#define QUESOPCNT_G(x) (((x) >> QUESOPCNT_S) & QUESOPCNT_M) + +#define OBQSELECT_S 4 +#define OBQSELECT_V(x) ((x) << OBQSELECT_S) +#define OBQSELECT_F OBQSELECT_V(1U) + +#define IBQSELECT_S 3 +#define IBQSELECT_V(x) ((x) << IBQSELECT_S) +#define IBQSELECT_F IBQSELECT_V(1U) + +#define QUENUMSELECT_S 0 +#define QUENUMSELECT_V(x) ((x) << QUENUMSELECT_S) + +#endif /* __T4_REGS_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h b/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h new file mode 100644 index 0000000..3297ce0 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h @@ -0,0 +1,69 @@ +/* + * This file is part of the Chelsio T4/T5/T6 Ethernet driver for Linux. + * + * Copyright (c) 2017 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __T4_TCB_H +#define __T4_TCB_H + +#define TCB_SMAC_SEL_W 0 +#define TCB_SMAC_SEL_S 24 +#define TCB_SMAC_SEL_M 0xffULL +#define TCB_SMAC_SEL_V(x) ((x) << TCB_SMAC_SEL_S) + +#define TCB_T_FLAGS_W 1 + +#define TF_CCTRL_ECE_S 60 +#define TF_CCTRL_CWR_S 61 +#define TF_CCTRL_RFR_S 62 + +#define TCB_RSS_INFO_W 3 +#define TCB_RSS_INFO_S 0 +#define TCB_RSS_INFO_M 0x3ffULL +#define TCB_RSS_INFO_V(x) ((x) << TCB_RSS_INFO_S) + +#define TCB_TIMESTAMP_W 5 +#define TCB_TIMESTAMP_S 0 +#define TCB_TIMESTAMP_M 0xffffffffULL +#define TCB_TIMESTAMP_V(x) ((x) << TCB_TIMESTAMP_S) + +#define TCB_RTT_TS_RECENT_AGE_W 6 +#define TCB_RTT_TS_RECENT_AGE_S 0 +#define TCB_RTT_TS_RECENT_AGE_M 0xffffffffULL +#define TCB_RTT_TS_RECENT_AGE_V(x) ((x) << TCB_RTT_TS_RECENT_AGE_S) + +#define TCB_SND_UNA_RAW_W 10 +#define TCB_RX_FRAG2_PTR_RAW_W 27 +#define TCB_RX_FRAG3_LEN_RAW_W 29 +#define TCB_RX_FRAG3_START_IDX_OFFSET_RAW_W 30 +#define TCB_PDU_HDR_LEN_W 31 +#endif /* __T4_TCB_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_values.h b/drivers/net/ethernet/chelsio/cxgb4/t4_values.h new file mode 100644 index 0000000..f6558cb --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_values.h @@ -0,0 +1,156 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __T4_VALUES_H__ +#define __T4_VALUES_H__ + +/* This file contains definitions for various T4 register value hardware + * constants. The types of values encoded here are predominantly those for + * register fields which control "modal" behavior. For the most part, we do + * not include definitions for register fields which are simple numeric + * metrics, etc. + */ + +/* SGE register field values. + */ + +/* CONTROL1 register */ +#define RXPKTCPLMODE_SPLIT_X 1 + +#define INGPCIEBOUNDARY_SHIFT_X 5 +#define INGPCIEBOUNDARY_32B_X 0 + +#define INGPADBOUNDARY_SHIFT_X 5 + +#define T6_INGPADBOUNDARY_SHIFT_X 3 +#define T6_INGPADBOUNDARY_8B_X 0 +#define T6_INGPADBOUNDARY_32B_X 2 + +#define INGPADBOUNDARY_32B_X 0 + +/* CONTROL2 register */ +#define INGPACKBOUNDARY_SHIFT_X 5 +#define INGPACKBOUNDARY_16B_X 0 +#define INGPACKBOUNDARY_64B_X 1 + +/* GTS register */ +#define SGE_TIMERREGS 6 +#define TIMERREG_COUNTER0_X 0 + +#define FETCHBURSTMIN_64B_X 2 +#define FETCHBURSTMIN_128B_X 3 + +#define FETCHBURSTMAX_256B_X 2 +#define FETCHBURSTMAX_512B_X 3 + +#define HOSTFCMODE_STATUS_PAGE_X 2 + +#define CIDXFLUSHTHRESH_32_X 5 + +#define UPDATEDELIVERY_INTERRUPT_X 1 + +#define RSPD_TYPE_FLBUF_X 0 +#define RSPD_TYPE_CPL_X 1 +#define RSPD_TYPE_INTR_X 2 + +/* Congestion Manager Definitions. + */ +#define CONMCTXT_CNGTPMODE_S 19 +#define CONMCTXT_CNGTPMODE_V(x) ((x) << CONMCTXT_CNGTPMODE_S) +#define CONMCTXT_CNGCHMAP_S 0 +#define CONMCTXT_CNGCHMAP_V(x) ((x) << CONMCTXT_CNGCHMAP_S) +#define CONMCTXT_CNGTPMODE_CHANNEL_X 2 +#define CONMCTXT_CNGTPMODE_QUEUE_X 1 + +/* T5 and later support a new BAR2-based doorbell mechanism for Egress Queues. + * The User Doorbells are each 128 bytes in length with a Simple Doorbell at + * offsets 8x and a Write Combining single 64-byte Egress Queue Unit + * (IDXSIZE_UNIT_X) Gather Buffer interface at offset 64. For Ingress Queues, + * we have a Going To Sleep register at offsets 8x+4. + * + * As noted above, we have many instances of the Simple Doorbell and Going To + * Sleep registers at offsets 8x and 8x+4, respectively. We want to use a + * non-64-byte aligned offset for the Simple Doorbell in order to attempt to + * avoid buffering of the writes to the Simple Doorbell and we want to use a + * non-contiguous offset for the Going To Sleep writes in order to avoid + * possible combining between them. + */ +#define SGE_UDB_SIZE 128 +#define SGE_UDB_KDOORBELL 8 +#define SGE_UDB_GTS 20 +#define SGE_UDB_WCDOORBELL 64 + +/* CIM register field values. + */ +#define X_MBOWNER_FW 1 +#define X_MBOWNER_PL 2 + +/* PCI-E definitions */ +#define WINDOW_SHIFT_X 10 +#define PCIEOFST_SHIFT_X 10 + +/* TP_VLAN_PRI_MAP controls which subset of fields will be present in the + * Compressed Filter Tuple for LE filters. Each bit set in TP_VLAN_PRI_MAP + * selects for a particular field being present. These fields, when present + * in the Compressed Filter Tuple, have the following widths in bits. 
+ */ +#define FT_FCOE_W 1 +#define FT_PORT_W 3 +#define FT_VNIC_ID_W 17 +#define FT_VLAN_W 17 +#define FT_TOS_W 8 +#define FT_PROTOCOL_W 8 +#define FT_ETHERTYPE_W 16 +#define FT_MACMATCH_W 9 +#define FT_MPSHITTYPE_W 3 +#define FT_FRAGMENTATION_W 1 + +/* Some of the Compressed Filter Tuple fields have internal structure. These + * bit shifts/masks describe those structures. All shifts are relative to the + * base position of the fields within the Compressed Filter Tuple + */ +#define FT_VLAN_VLD_S 16 +#define FT_VLAN_VLD_V(x) ((x) << FT_VLAN_VLD_S) +#define FT_VLAN_VLD_F FT_VLAN_VLD_V(1U) + +#define FT_VNID_ID_VF_S 0 +#define FT_VNID_ID_VF_V(x) ((x) << FT_VNID_ID_VF_S) + +#define FT_VNID_ID_PF_S 7 +#define FT_VNID_ID_PF_V(x) ((x) << FT_VNID_ID_PF_S) + +#define FT_VNID_ID_VLD_S 16 +#define FT_VNID_ID_VLD_V(x) ((x) << FT_VNID_ID_VLD_S) + +#endif /* __T4_VALUES_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h new file mode 100644 index 0000000..e3d4751 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h @@ -0,0 +1,3999 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2009-2016 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _T4FW_INTERFACE_H_ +#define _T4FW_INTERFACE_H_ + +enum fw_retval { + FW_SUCCESS = 0, /* completed successfully */ + FW_EPERM = 1, /* operation not permitted */ + FW_ENOENT = 2, /* no such file or directory */ + FW_EIO = 5, /* input/output error; hw bad */ + FW_ENOEXEC = 8, /* exec format error; inv microcode */ + FW_EAGAIN = 11, /* try again */ + FW_ENOMEM = 12, /* out of memory */ + FW_EFAULT = 14, /* bad address; fw bad */ + FW_EBUSY = 16, /* resource busy */ + FW_EEXIST = 17, /* file exists */ + FW_ENODEV = 19, /* no such device */ + FW_EINVAL = 22, /* invalid argument */ + FW_ENOSPC = 28, /* no space left on device */ + FW_ENOSYS = 38, /* functionality not implemented */ + FW_ENODATA = 61, /* no data available */ + FW_EPROTO = 71, /* protocol error */ + FW_EADDRINUSE = 98, /* address already in use */ + FW_EADDRNOTAVAIL = 99, /* cannot assigned requested address */ + FW_ENETDOWN = 100, /* network is down */ + FW_ENETUNREACH = 101, /* network is unreachable */ + FW_ENOBUFS = 105, /* no buffer space available */ + FW_ETIMEDOUT = 110, /* timeout */ + FW_EINPROGRESS = 115, /* fw internal */ + FW_SCSI_ABORT_REQUESTED = 128, /* */ + FW_SCSI_ABORT_TIMEDOUT = 129, /* */ + FW_SCSI_ABORTED = 130, /* */ + FW_SCSI_CLOSE_REQUESTED = 131, /* */ + FW_ERR_LINK_DOWN = 132, /* */ + FW_RDEV_NOT_READY = 133, /* */ + FW_ERR_RDEV_LOST = 134, /* */ + FW_ERR_RDEV_LOGO = 135, /* */ + FW_FCOE_NO_XCHG = 136, /* */ + FW_SCSI_RSP_ERR = 137, /* */ + FW_ERR_RDEV_IMPL_LOGO = 138, /* */ + FW_SCSI_UNDER_FLOW_ERR = 139, /* */ + FW_SCSI_OVER_FLOW_ERR = 140, /* */ + FW_SCSI_DDP_ERR = 141, /* DDP error*/ + FW_SCSI_TASK_ERR = 142, /* No SCSI tasks available */ +}; + +#define FW_T4VF_SGE_BASE_ADDR 0x0000 +#define FW_T4VF_MPS_BASE_ADDR 0x0100 +#define FW_T4VF_PL_BASE_ADDR 0x0200 +#define FW_T4VF_MBDATA_BASE_ADDR 0x0240 +#define FW_T4VF_CIM_BASE_ADDR 0x0300 + +enum fw_wr_opcodes { + FW_FILTER_WR = 0x02, + FW_ULPTX_WR = 0x04, + FW_TP_WR = 0x05, + FW_ETH_TX_PKT_WR = 0x08, + FW_OFLD_CONNECTION_WR = 0x2f, + FW_FLOWC_WR = 0x0a, + FW_OFLD_TX_DATA_WR = 0x0b, + FW_CMD_WR = 0x10, + FW_ETH_TX_PKT_VM_WR = 0x11, + FW_RI_RES_WR = 0x0c, + FW_RI_INIT_WR = 0x0d, + FW_RI_RDMA_WRITE_WR = 0x14, + FW_RI_SEND_WR = 0x15, + FW_RI_RDMA_READ_WR = 0x16, + FW_RI_RECV_WR = 0x17, + FW_RI_BIND_MW_WR = 0x18, + FW_RI_FR_NSMR_WR = 0x19, + FW_RI_FR_NSMR_TPTE_WR = 0x20, + FW_RI_RDMA_WRITE_CMPL_WR = 0x21, + FW_RI_INV_LSTAG_WR = 0x1a, + FW_ISCSI_TX_DATA_WR = 0x45, + FW_PTP_TX_PKT_WR = 0x46, + FW_TLSTX_DATA_WR = 0x68, + FW_CRYPTO_LOOKASIDE_WR = 0X6d, + FW_LASTC2E_WR = 0x70, + FW_FILTER2_WR = 0x77 +}; + +struct fw_wr_hdr { + __be32 hi; + __be32 lo; +}; + +/* work request opcode (hi) */ +#define FW_WR_OP_S 24 +#define FW_WR_OP_M 0xff +#define FW_WR_OP_V(x) ((x) << FW_WR_OP_S) +#define FW_WR_OP_G(x) (((x) >> FW_WR_OP_S) & FW_WR_OP_M) + +/* atomic flag (hi) - firmware encapsulates CPLs in CPL_BARRIER */ +#define FW_WR_ATOMIC_S 23 +#define FW_WR_ATOMIC_V(x) ((x) << FW_WR_ATOMIC_S) + +/* flush flag (hi) - firmware flushes flushable work request buffered + * in the flow context. 
+ */ +#define FW_WR_FLUSH_S 22 +#define FW_WR_FLUSH_V(x) ((x) << FW_WR_FLUSH_S) + +/* completion flag (hi) - firmware generates a cpl_fw6_ack */ +#define FW_WR_COMPL_S 21 +#define FW_WR_COMPL_V(x) ((x) << FW_WR_COMPL_S) +#define FW_WR_COMPL_F FW_WR_COMPL_V(1U) + +/* work request immediate data length (hi) */ +#define FW_WR_IMMDLEN_S 0 +#define FW_WR_IMMDLEN_M 0xff +#define FW_WR_IMMDLEN_V(x) ((x) << FW_WR_IMMDLEN_S) + +/* egress queue status update to associated ingress queue entry (lo) */ +#define FW_WR_EQUIQ_S 31 +#define FW_WR_EQUIQ_V(x) ((x) << FW_WR_EQUIQ_S) +#define FW_WR_EQUIQ_F FW_WR_EQUIQ_V(1U) + +/* egress queue status update to egress queue status entry (lo) */ +#define FW_WR_EQUEQ_S 30 +#define FW_WR_EQUEQ_V(x) ((x) << FW_WR_EQUEQ_S) +#define FW_WR_EQUEQ_F FW_WR_EQUEQ_V(1U) + +/* flow context identifier (lo) */ +#define FW_WR_FLOWID_S 8 +#define FW_WR_FLOWID_V(x) ((x) << FW_WR_FLOWID_S) + +/* length in units of 16-bytes (lo) */ +#define FW_WR_LEN16_S 0 +#define FW_WR_LEN16_V(x) ((x) << FW_WR_LEN16_S) + +#define HW_TPL_FR_MT_PR_IV_P_FC 0X32B +#define HW_TPL_FR_MT_PR_OV_P_FC 0X327 + +/* filter wr reply code in cookie in CPL_SET_TCB_RPL */ +enum fw_filter_wr_cookie { + FW_FILTER_WR_SUCCESS, + FW_FILTER_WR_FLT_ADDED, + FW_FILTER_WR_FLT_DELETED, + FW_FILTER_WR_SMT_TBL_FULL, + FW_FILTER_WR_EINVAL, +}; + +struct fw_filter_wr { + __be32 op_pkd; + __be32 len16_pkd; + __be64 r3; + __be32 tid_to_iq; + __be32 del_filter_to_l2tix; + __be16 ethtype; + __be16 ethtypem; + __u8 frag_to_ovlan_vldm; + __u8 smac_sel; + __be16 rx_chan_rx_rpl_iq; + __be32 maci_to_matchtypem; + __u8 ptcl; + __u8 ptclm; + __u8 ttyp; + __u8 ttypm; + __be16 ivlan; + __be16 ivlanm; + __be16 ovlan; + __be16 ovlanm; + __u8 lip[16]; + __u8 lipm[16]; + __u8 fip[16]; + __u8 fipm[16]; + __be16 lp; + __be16 lpm; + __be16 fp; + __be16 fpm; + __be16 r7; + __u8 sma[6]; +}; + +struct fw_filter2_wr { + __be32 op_pkd; + __be32 len16_pkd; + __be64 r3; + __be32 tid_to_iq; + __be32 del_filter_to_l2tix; + __be16 ethtype; + __be16 ethtypem; + __u8 frag_to_ovlan_vldm; + __u8 smac_sel; + __be16 rx_chan_rx_rpl_iq; + __be32 maci_to_matchtypem; + __u8 ptcl; + __u8 ptclm; + __u8 ttyp; + __u8 ttypm; + __be16 ivlan; + __be16 ivlanm; + __be16 ovlan; + __be16 ovlanm; + __u8 lip[16]; + __u8 lipm[16]; + __u8 fip[16]; + __u8 fipm[16]; + __be16 lp; + __be16 lpm; + __be16 fp; + __be16 fpm; + __be16 r7; + __u8 sma[6]; + __be16 r8; + __u8 filter_type_swapmac; + __u8 natmode_to_ulp_type; + __be16 newlport; + __be16 newfport; + __u8 newlip[16]; + __u8 newfip[16]; + __be32 natseqcheck; + __be32 r9; + __be64 r10; + __be64 r11; + __be64 r12; + __be64 r13; +}; + +#define FW_FILTER_WR_TID_S 12 +#define FW_FILTER_WR_TID_M 0xfffff +#define FW_FILTER_WR_TID_V(x) ((x) << FW_FILTER_WR_TID_S) +#define FW_FILTER_WR_TID_G(x) \ + (((x) >> FW_FILTER_WR_TID_S) & FW_FILTER_WR_TID_M) + +#define FW_FILTER_WR_RQTYPE_S 11 +#define FW_FILTER_WR_RQTYPE_M 0x1 +#define FW_FILTER_WR_RQTYPE_V(x) ((x) << FW_FILTER_WR_RQTYPE_S) +#define FW_FILTER_WR_RQTYPE_G(x) \ + (((x) >> FW_FILTER_WR_RQTYPE_S) & FW_FILTER_WR_RQTYPE_M) +#define FW_FILTER_WR_RQTYPE_F FW_FILTER_WR_RQTYPE_V(1U) + +#define FW_FILTER_WR_NOREPLY_S 10 +#define FW_FILTER_WR_NOREPLY_M 0x1 +#define FW_FILTER_WR_NOREPLY_V(x) ((x) << FW_FILTER_WR_NOREPLY_S) +#define FW_FILTER_WR_NOREPLY_G(x) \ + (((x) >> FW_FILTER_WR_NOREPLY_S) & FW_FILTER_WR_NOREPLY_M) +#define FW_FILTER_WR_NOREPLY_F FW_FILTER_WR_NOREPLY_V(1U) + +#define FW_FILTER_WR_IQ_S 0 +#define FW_FILTER_WR_IQ_M 0x3ff +#define FW_FILTER_WR_IQ_V(x) ((x) << 
FW_FILTER_WR_IQ_S) +#define FW_FILTER_WR_IQ_G(x) \ + (((x) >> FW_FILTER_WR_IQ_S) & FW_FILTER_WR_IQ_M) + +#define FW_FILTER_WR_DEL_FILTER_S 31 +#define FW_FILTER_WR_DEL_FILTER_M 0x1 +#define FW_FILTER_WR_DEL_FILTER_V(x) ((x) << FW_FILTER_WR_DEL_FILTER_S) +#define FW_FILTER_WR_DEL_FILTER_G(x) \ + (((x) >> FW_FILTER_WR_DEL_FILTER_S) & FW_FILTER_WR_DEL_FILTER_M) +#define FW_FILTER_WR_DEL_FILTER_F FW_FILTER_WR_DEL_FILTER_V(1U) + +#define FW_FILTER_WR_RPTTID_S 25 +#define FW_FILTER_WR_RPTTID_M 0x1 +#define FW_FILTER_WR_RPTTID_V(x) ((x) << FW_FILTER_WR_RPTTID_S) +#define FW_FILTER_WR_RPTTID_G(x) \ + (((x) >> FW_FILTER_WR_RPTTID_S) & FW_FILTER_WR_RPTTID_M) +#define FW_FILTER_WR_RPTTID_F FW_FILTER_WR_RPTTID_V(1U) + +#define FW_FILTER_WR_DROP_S 24 +#define FW_FILTER_WR_DROP_M 0x1 +#define FW_FILTER_WR_DROP_V(x) ((x) << FW_FILTER_WR_DROP_S) +#define FW_FILTER_WR_DROP_G(x) \ + (((x) >> FW_FILTER_WR_DROP_S) & FW_FILTER_WR_DROP_M) +#define FW_FILTER_WR_DROP_F FW_FILTER_WR_DROP_V(1U) + +#define FW_FILTER_WR_DIRSTEER_S 23 +#define FW_FILTER_WR_DIRSTEER_M 0x1 +#define FW_FILTER_WR_DIRSTEER_V(x) ((x) << FW_FILTER_WR_DIRSTEER_S) +#define FW_FILTER_WR_DIRSTEER_G(x) \ + (((x) >> FW_FILTER_WR_DIRSTEER_S) & FW_FILTER_WR_DIRSTEER_M) +#define FW_FILTER_WR_DIRSTEER_F FW_FILTER_WR_DIRSTEER_V(1U) + +#define FW_FILTER_WR_MASKHASH_S 22 +#define FW_FILTER_WR_MASKHASH_M 0x1 +#define FW_FILTER_WR_MASKHASH_V(x) ((x) << FW_FILTER_WR_MASKHASH_S) +#define FW_FILTER_WR_MASKHASH_G(x) \ + (((x) >> FW_FILTER_WR_MASKHASH_S) & FW_FILTER_WR_MASKHASH_M) +#define FW_FILTER_WR_MASKHASH_F FW_FILTER_WR_MASKHASH_V(1U) + +#define FW_FILTER_WR_DIRSTEERHASH_S 21 +#define FW_FILTER_WR_DIRSTEERHASH_M 0x1 +#define FW_FILTER_WR_DIRSTEERHASH_V(x) ((x) << FW_FILTER_WR_DIRSTEERHASH_S) +#define FW_FILTER_WR_DIRSTEERHASH_G(x) \ + (((x) >> FW_FILTER_WR_DIRSTEERHASH_S) & FW_FILTER_WR_DIRSTEERHASH_M) +#define FW_FILTER_WR_DIRSTEERHASH_F FW_FILTER_WR_DIRSTEERHASH_V(1U) + +#define FW_FILTER_WR_LPBK_S 20 +#define FW_FILTER_WR_LPBK_M 0x1 +#define FW_FILTER_WR_LPBK_V(x) ((x) << FW_FILTER_WR_LPBK_S) +#define FW_FILTER_WR_LPBK_G(x) \ + (((x) >> FW_FILTER_WR_LPBK_S) & FW_FILTER_WR_LPBK_M) +#define FW_FILTER_WR_LPBK_F FW_FILTER_WR_LPBK_V(1U) + +#define FW_FILTER_WR_DMAC_S 19 +#define FW_FILTER_WR_DMAC_M 0x1 +#define FW_FILTER_WR_DMAC_V(x) ((x) << FW_FILTER_WR_DMAC_S) +#define FW_FILTER_WR_DMAC_G(x) \ + (((x) >> FW_FILTER_WR_DMAC_S) & FW_FILTER_WR_DMAC_M) +#define FW_FILTER_WR_DMAC_F FW_FILTER_WR_DMAC_V(1U) + +#define FW_FILTER_WR_SMAC_S 18 +#define FW_FILTER_WR_SMAC_M 0x1 +#define FW_FILTER_WR_SMAC_V(x) ((x) << FW_FILTER_WR_SMAC_S) +#define FW_FILTER_WR_SMAC_G(x) \ + (((x) >> FW_FILTER_WR_SMAC_S) & FW_FILTER_WR_SMAC_M) +#define FW_FILTER_WR_SMAC_F FW_FILTER_WR_SMAC_V(1U) + +#define FW_FILTER_WR_INSVLAN_S 17 +#define FW_FILTER_WR_INSVLAN_M 0x1 +#define FW_FILTER_WR_INSVLAN_V(x) ((x) << FW_FILTER_WR_INSVLAN_S) +#define FW_FILTER_WR_INSVLAN_G(x) \ + (((x) >> FW_FILTER_WR_INSVLAN_S) & FW_FILTER_WR_INSVLAN_M) +#define FW_FILTER_WR_INSVLAN_F FW_FILTER_WR_INSVLAN_V(1U) + +#define FW_FILTER_WR_RMVLAN_S 16 +#define FW_FILTER_WR_RMVLAN_M 0x1 +#define FW_FILTER_WR_RMVLAN_V(x) ((x) << FW_FILTER_WR_RMVLAN_S) +#define FW_FILTER_WR_RMVLAN_G(x) \ + (((x) >> FW_FILTER_WR_RMVLAN_S) & FW_FILTER_WR_RMVLAN_M) +#define FW_FILTER_WR_RMVLAN_F FW_FILTER_WR_RMVLAN_V(1U) + +#define FW_FILTER_WR_HITCNTS_S 15 +#define FW_FILTER_WR_HITCNTS_M 0x1 +#define FW_FILTER_WR_HITCNTS_V(x) ((x) << FW_FILTER_WR_HITCNTS_S) +#define FW_FILTER_WR_HITCNTS_G(x) \ + (((x) >> FW_FILTER_WR_HITCNTS_S) & 
FW_FILTER_WR_HITCNTS_M) +#define FW_FILTER_WR_HITCNTS_F FW_FILTER_WR_HITCNTS_V(1U) + +#define FW_FILTER_WR_TXCHAN_S 13 +#define FW_FILTER_WR_TXCHAN_M 0x3 +#define FW_FILTER_WR_TXCHAN_V(x) ((x) << FW_FILTER_WR_TXCHAN_S) +#define FW_FILTER_WR_TXCHAN_G(x) \ + (((x) >> FW_FILTER_WR_TXCHAN_S) & FW_FILTER_WR_TXCHAN_M) + +#define FW_FILTER_WR_PRIO_S 12 +#define FW_FILTER_WR_PRIO_M 0x1 +#define FW_FILTER_WR_PRIO_V(x) ((x) << FW_FILTER_WR_PRIO_S) +#define FW_FILTER_WR_PRIO_G(x) \ + (((x) >> FW_FILTER_WR_PRIO_S) & FW_FILTER_WR_PRIO_M) +#define FW_FILTER_WR_PRIO_F FW_FILTER_WR_PRIO_V(1U) + +#define FW_FILTER_WR_L2TIX_S 0 +#define FW_FILTER_WR_L2TIX_M 0xfff +#define FW_FILTER_WR_L2TIX_V(x) ((x) << FW_FILTER_WR_L2TIX_S) +#define FW_FILTER_WR_L2TIX_G(x) \ + (((x) >> FW_FILTER_WR_L2TIX_S) & FW_FILTER_WR_L2TIX_M) + +#define FW_FILTER_WR_FRAG_S 7 +#define FW_FILTER_WR_FRAG_M 0x1 +#define FW_FILTER_WR_FRAG_V(x) ((x) << FW_FILTER_WR_FRAG_S) +#define FW_FILTER_WR_FRAG_G(x) \ + (((x) >> FW_FILTER_WR_FRAG_S) & FW_FILTER_WR_FRAG_M) +#define FW_FILTER_WR_FRAG_F FW_FILTER_WR_FRAG_V(1U) + +#define FW_FILTER_WR_FRAGM_S 6 +#define FW_FILTER_WR_FRAGM_M 0x1 +#define FW_FILTER_WR_FRAGM_V(x) ((x) << FW_FILTER_WR_FRAGM_S) +#define FW_FILTER_WR_FRAGM_G(x) \ + (((x) >> FW_FILTER_WR_FRAGM_S) & FW_FILTER_WR_FRAGM_M) +#define FW_FILTER_WR_FRAGM_F FW_FILTER_WR_FRAGM_V(1U) + +#define FW_FILTER_WR_IVLAN_VLD_S 5 +#define FW_FILTER_WR_IVLAN_VLD_M 0x1 +#define FW_FILTER_WR_IVLAN_VLD_V(x) ((x) << FW_FILTER_WR_IVLAN_VLD_S) +#define FW_FILTER_WR_IVLAN_VLD_G(x) \ + (((x) >> FW_FILTER_WR_IVLAN_VLD_S) & FW_FILTER_WR_IVLAN_VLD_M) +#define FW_FILTER_WR_IVLAN_VLD_F FW_FILTER_WR_IVLAN_VLD_V(1U) + +#define FW_FILTER_WR_OVLAN_VLD_S 4 +#define FW_FILTER_WR_OVLAN_VLD_M 0x1 +#define FW_FILTER_WR_OVLAN_VLD_V(x) ((x) << FW_FILTER_WR_OVLAN_VLD_S) +#define FW_FILTER_WR_OVLAN_VLD_G(x) \ + (((x) >> FW_FILTER_WR_OVLAN_VLD_S) & FW_FILTER_WR_OVLAN_VLD_M) +#define FW_FILTER_WR_OVLAN_VLD_F FW_FILTER_WR_OVLAN_VLD_V(1U) + +#define FW_FILTER_WR_IVLAN_VLDM_S 3 +#define FW_FILTER_WR_IVLAN_VLDM_M 0x1 +#define FW_FILTER_WR_IVLAN_VLDM_V(x) ((x) << FW_FILTER_WR_IVLAN_VLDM_S) +#define FW_FILTER_WR_IVLAN_VLDM_G(x) \ + (((x) >> FW_FILTER_WR_IVLAN_VLDM_S) & FW_FILTER_WR_IVLAN_VLDM_M) +#define FW_FILTER_WR_IVLAN_VLDM_F FW_FILTER_WR_IVLAN_VLDM_V(1U) + +#define FW_FILTER_WR_OVLAN_VLDM_S 2 +#define FW_FILTER_WR_OVLAN_VLDM_M 0x1 +#define FW_FILTER_WR_OVLAN_VLDM_V(x) ((x) << FW_FILTER_WR_OVLAN_VLDM_S) +#define FW_FILTER_WR_OVLAN_VLDM_G(x) \ + (((x) >> FW_FILTER_WR_OVLAN_VLDM_S) & FW_FILTER_WR_OVLAN_VLDM_M) +#define FW_FILTER_WR_OVLAN_VLDM_F FW_FILTER_WR_OVLAN_VLDM_V(1U) + +#define FW_FILTER_WR_RX_CHAN_S 15 +#define FW_FILTER_WR_RX_CHAN_M 0x1 +#define FW_FILTER_WR_RX_CHAN_V(x) ((x) << FW_FILTER_WR_RX_CHAN_S) +#define FW_FILTER_WR_RX_CHAN_G(x) \ + (((x) >> FW_FILTER_WR_RX_CHAN_S) & FW_FILTER_WR_RX_CHAN_M) +#define FW_FILTER_WR_RX_CHAN_F FW_FILTER_WR_RX_CHAN_V(1U) + +#define FW_FILTER_WR_RX_RPL_IQ_S 0 +#define FW_FILTER_WR_RX_RPL_IQ_M 0x3ff +#define FW_FILTER_WR_RX_RPL_IQ_V(x) ((x) << FW_FILTER_WR_RX_RPL_IQ_S) +#define FW_FILTER_WR_RX_RPL_IQ_G(x) \ + (((x) >> FW_FILTER_WR_RX_RPL_IQ_S) & FW_FILTER_WR_RX_RPL_IQ_M) + +#define FW_FILTER2_WR_FILTER_TYPE_S 1 +#define FW_FILTER2_WR_FILTER_TYPE_M 0x1 +#define FW_FILTER2_WR_FILTER_TYPE_V(x) ((x) << FW_FILTER2_WR_FILTER_TYPE_S) +#define FW_FILTER2_WR_FILTER_TYPE_G(x) \ + (((x) >> FW_FILTER2_WR_FILTER_TYPE_S) & FW_FILTER2_WR_FILTER_TYPE_M) +#define FW_FILTER2_WR_FILTER_TYPE_F FW_FILTER2_WR_FILTER_TYPE_V(1U) + +#define 
FW_FILTER2_WR_NATMODE_S 5 +#define FW_FILTER2_WR_NATMODE_M 0x7 +#define FW_FILTER2_WR_NATMODE_V(x) ((x) << FW_FILTER2_WR_NATMODE_S) +#define FW_FILTER2_WR_NATMODE_G(x) \ + (((x) >> FW_FILTER2_WR_NATMODE_S) & FW_FILTER2_WR_NATMODE_M) + +#define FW_FILTER2_WR_NATFLAGCHECK_S 4 +#define FW_FILTER2_WR_NATFLAGCHECK_M 0x1 +#define FW_FILTER2_WR_NATFLAGCHECK_V(x) ((x) << FW_FILTER2_WR_NATFLAGCHECK_S) +#define FW_FILTER2_WR_NATFLAGCHECK_G(x) \ + (((x) >> FW_FILTER2_WR_NATFLAGCHECK_S) & FW_FILTER2_WR_NATFLAGCHECK_M) +#define FW_FILTER2_WR_NATFLAGCHECK_F FW_FILTER2_WR_NATFLAGCHECK_V(1U) + +#define FW_FILTER2_WR_ULP_TYPE_S 0 +#define FW_FILTER2_WR_ULP_TYPE_M 0xf +#define FW_FILTER2_WR_ULP_TYPE_V(x) ((x) << FW_FILTER2_WR_ULP_TYPE_S) +#define FW_FILTER2_WR_ULP_TYPE_G(x) \ + (((x) >> FW_FILTER2_WR_ULP_TYPE_S) & FW_FILTER2_WR_ULP_TYPE_M) + +#define FW_FILTER_WR_MACI_S 23 +#define FW_FILTER_WR_MACI_M 0x1ff +#define FW_FILTER_WR_MACI_V(x) ((x) << FW_FILTER_WR_MACI_S) +#define FW_FILTER_WR_MACI_G(x) \ + (((x) >> FW_FILTER_WR_MACI_S) & FW_FILTER_WR_MACI_M) + +#define FW_FILTER_WR_MACIM_S 14 +#define FW_FILTER_WR_MACIM_M 0x1ff +#define FW_FILTER_WR_MACIM_V(x) ((x) << FW_FILTER_WR_MACIM_S) +#define FW_FILTER_WR_MACIM_G(x) \ + (((x) >> FW_FILTER_WR_MACIM_S) & FW_FILTER_WR_MACIM_M) + +#define FW_FILTER_WR_FCOE_S 13 +#define FW_FILTER_WR_FCOE_M 0x1 +#define FW_FILTER_WR_FCOE_V(x) ((x) << FW_FILTER_WR_FCOE_S) +#define FW_FILTER_WR_FCOE_G(x) \ + (((x) >> FW_FILTER_WR_FCOE_S) & FW_FILTER_WR_FCOE_M) +#define FW_FILTER_WR_FCOE_F FW_FILTER_WR_FCOE_V(1U) + +#define FW_FILTER_WR_FCOEM_S 12 +#define FW_FILTER_WR_FCOEM_M 0x1 +#define FW_FILTER_WR_FCOEM_V(x) ((x) << FW_FILTER_WR_FCOEM_S) +#define FW_FILTER_WR_FCOEM_G(x) \ + (((x) >> FW_FILTER_WR_FCOEM_S) & FW_FILTER_WR_FCOEM_M) +#define FW_FILTER_WR_FCOEM_F FW_FILTER_WR_FCOEM_V(1U) + +#define FW_FILTER_WR_PORT_S 9 +#define FW_FILTER_WR_PORT_M 0x7 +#define FW_FILTER_WR_PORT_V(x) ((x) << FW_FILTER_WR_PORT_S) +#define FW_FILTER_WR_PORT_G(x) \ + (((x) >> FW_FILTER_WR_PORT_S) & FW_FILTER_WR_PORT_M) + +#define FW_FILTER_WR_PORTM_S 6 +#define FW_FILTER_WR_PORTM_M 0x7 +#define FW_FILTER_WR_PORTM_V(x) ((x) << FW_FILTER_WR_PORTM_S) +#define FW_FILTER_WR_PORTM_G(x) \ + (((x) >> FW_FILTER_WR_PORTM_S) & FW_FILTER_WR_PORTM_M) + +#define FW_FILTER_WR_MATCHTYPE_S 3 +#define FW_FILTER_WR_MATCHTYPE_M 0x7 +#define FW_FILTER_WR_MATCHTYPE_V(x) ((x) << FW_FILTER_WR_MATCHTYPE_S) +#define FW_FILTER_WR_MATCHTYPE_G(x) \ + (((x) >> FW_FILTER_WR_MATCHTYPE_S) & FW_FILTER_WR_MATCHTYPE_M) + +#define FW_FILTER_WR_MATCHTYPEM_S 0 +#define FW_FILTER_WR_MATCHTYPEM_M 0x7 +#define FW_FILTER_WR_MATCHTYPEM_V(x) ((x) << FW_FILTER_WR_MATCHTYPEM_S) +#define FW_FILTER_WR_MATCHTYPEM_G(x) \ + (((x) >> FW_FILTER_WR_MATCHTYPEM_S) & FW_FILTER_WR_MATCHTYPEM_M) + +struct fw_ulptx_wr { + __be32 op_to_compl; + __be32 flowid_len16; + u64 cookie; +}; + +#define FW_ULPTX_WR_DATA_S 28 +#define FW_ULPTX_WR_DATA_M 0x1 +#define FW_ULPTX_WR_DATA_V(x) ((x) << FW_ULPTX_WR_DATA_S) +#define FW_ULPTX_WR_DATA_G(x) \ + (((x) >> FW_ULPTX_WR_DATA_S) & FW_ULPTX_WR_DATA_M) +#define FW_ULPTX_WR_DATA_F FW_ULPTX_WR_DATA_V(1U) + +struct fw_tp_wr { + __be32 op_to_immdlen; + __be32 flowid_len16; + u64 cookie; +}; + +struct fw_eth_tx_pkt_wr { + __be32 op_immdlen; + __be32 equiq_to_len16; + __be64 r3; +}; + +struct fw_ofld_connection_wr { + __be32 op_compl; + __be32 len16_pkd; + __u64 cookie; + __be64 r2; + __be64 r3; + struct fw_ofld_connection_le { + __be32 version_cpl; + __be32 filter; + __be32 r1; + __be16 lport; + __be16 pport; + union 
fw_ofld_connection_leip { + struct fw_ofld_connection_le_ipv4 { + __be32 pip; + __be32 lip; + __be64 r0; + __be64 r1; + __be64 r2; + } ipv4; + struct fw_ofld_connection_le_ipv6 { + __be64 pip_hi; + __be64 pip_lo; + __be64 lip_hi; + __be64 lip_lo; + } ipv6; + } u; + } le; + struct fw_ofld_connection_tcb { + __be32 t_state_to_astid; + __be16 cplrxdataack_cplpassacceptrpl; + __be16 rcv_adv; + __be32 rcv_nxt; + __be32 tx_max; + __be64 opt0; + __be32 opt2; + __be32 r1; + __be64 r2; + __be64 r3; + } tcb; +}; + +#define FW_OFLD_CONNECTION_WR_VERSION_S 31 +#define FW_OFLD_CONNECTION_WR_VERSION_M 0x1 +#define FW_OFLD_CONNECTION_WR_VERSION_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_VERSION_S) +#define FW_OFLD_CONNECTION_WR_VERSION_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_VERSION_S) & \ + FW_OFLD_CONNECTION_WR_VERSION_M) +#define FW_OFLD_CONNECTION_WR_VERSION_F \ + FW_OFLD_CONNECTION_WR_VERSION_V(1U) + +#define FW_OFLD_CONNECTION_WR_CPL_S 30 +#define FW_OFLD_CONNECTION_WR_CPL_M 0x1 +#define FW_OFLD_CONNECTION_WR_CPL_V(x) ((x) << FW_OFLD_CONNECTION_WR_CPL_S) +#define FW_OFLD_CONNECTION_WR_CPL_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_CPL_S) & FW_OFLD_CONNECTION_WR_CPL_M) +#define FW_OFLD_CONNECTION_WR_CPL_F FW_OFLD_CONNECTION_WR_CPL_V(1U) + +#define FW_OFLD_CONNECTION_WR_T_STATE_S 28 +#define FW_OFLD_CONNECTION_WR_T_STATE_M 0xf +#define FW_OFLD_CONNECTION_WR_T_STATE_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_T_STATE_S) +#define FW_OFLD_CONNECTION_WR_T_STATE_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_T_STATE_S) & \ + FW_OFLD_CONNECTION_WR_T_STATE_M) + +#define FW_OFLD_CONNECTION_WR_RCV_SCALE_S 24 +#define FW_OFLD_CONNECTION_WR_RCV_SCALE_M 0xf +#define FW_OFLD_CONNECTION_WR_RCV_SCALE_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_RCV_SCALE_S) +#define FW_OFLD_CONNECTION_WR_RCV_SCALE_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_RCV_SCALE_S) & \ + FW_OFLD_CONNECTION_WR_RCV_SCALE_M) + +#define FW_OFLD_CONNECTION_WR_ASTID_S 0 +#define FW_OFLD_CONNECTION_WR_ASTID_M 0xffffff +#define FW_OFLD_CONNECTION_WR_ASTID_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_ASTID_S) +#define FW_OFLD_CONNECTION_WR_ASTID_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_ASTID_S) & FW_OFLD_CONNECTION_WR_ASTID_M) + +#define FW_OFLD_CONNECTION_WR_CPLRXDATAACK_S 15 +#define FW_OFLD_CONNECTION_WR_CPLRXDATAACK_M 0x1 +#define FW_OFLD_CONNECTION_WR_CPLRXDATAACK_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_CPLRXDATAACK_S) +#define FW_OFLD_CONNECTION_WR_CPLRXDATAACK_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_CPLRXDATAACK_S) & \ + FW_OFLD_CONNECTION_WR_CPLRXDATAACK_M) +#define FW_OFLD_CONNECTION_WR_CPLRXDATAACK_F \ + FW_OFLD_CONNECTION_WR_CPLRXDATAACK_V(1U) + +#define FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_S 14 +#define FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_M 0x1 +#define FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_S) +#define FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_S) & \ + FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_M) +#define FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_F \ + FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_V(1U) + +enum fw_flowc_mnem_tcpstate { + FW_FLOWC_MNEM_TCPSTATE_CLOSED = 0, /* illegal */ + FW_FLOWC_MNEM_TCPSTATE_LISTEN = 1, /* illegal */ + FW_FLOWC_MNEM_TCPSTATE_SYNSENT = 2, /* illegal */ + FW_FLOWC_MNEM_TCPSTATE_SYNRECEIVED = 3, /* illegal */ + FW_FLOWC_MNEM_TCPSTATE_ESTABLISHED = 4, /* default */ + FW_FLOWC_MNEM_TCPSTATE_CLOSEWAIT = 5, /* got peer close already */ + FW_FLOWC_MNEM_TCPSTATE_FINWAIT1 = 6, /* haven't gotten ACK for FIN and + * will resend FIN - equiv ESTAB + */ + 
FW_FLOWC_MNEM_TCPSTATE_CLOSING = 7, /* haven't gotten ACK for FIN and + * will resend FIN but have + * received FIN + */ + FW_FLOWC_MNEM_TCPSTATE_LASTACK = 8, /* haven't gotten ACK for FIN and + * will resend FIN but have + * received FIN + */ + FW_FLOWC_MNEM_TCPSTATE_FINWAIT2 = 9, /* sent FIN and got FIN + ACK, + * waiting for FIN + */ + FW_FLOWC_MNEM_TCPSTATE_TIMEWAIT = 10, /* not expected */ +}; + +enum fw_flowc_mnem { + FW_FLOWC_MNEM_PFNVFN, /* PFN [15:8] VFN [7:0] */ + FW_FLOWC_MNEM_CH, + FW_FLOWC_MNEM_PORT, + FW_FLOWC_MNEM_IQID, + FW_FLOWC_MNEM_SNDNXT, + FW_FLOWC_MNEM_RCVNXT, + FW_FLOWC_MNEM_SNDBUF, + FW_FLOWC_MNEM_MSS, + FW_FLOWC_MNEM_TXDATAPLEN_MAX, + FW_FLOWC_MNEM_TCPSTATE, + FW_FLOWC_MNEM_EOSTATE, + FW_FLOWC_MNEM_SCHEDCLASS, + FW_FLOWC_MNEM_DCBPRIO, + FW_FLOWC_MNEM_SND_SCALE, + FW_FLOWC_MNEM_RCV_SCALE, + FW_FLOWC_MNEM_ULD_MODE, + FW_FLOWC_MNEM_MAX, +}; + +struct fw_flowc_mnemval { + u8 mnemonic; + u8 r4[3]; + __be32 val; +}; + +struct fw_flowc_wr { + __be32 op_to_nparams; + __be32 flowid_len16; + struct fw_flowc_mnemval mnemval[0]; +}; + +#define FW_FLOWC_WR_NPARAMS_S 0 +#define FW_FLOWC_WR_NPARAMS_V(x) ((x) << FW_FLOWC_WR_NPARAMS_S) + +struct fw_ofld_tx_data_wr { + __be32 op_to_immdlen; + __be32 flowid_len16; + __be32 plen; + __be32 tunnel_to_proxy; +}; + +#define FW_OFLD_TX_DATA_WR_ALIGNPLD_S 30 +#define FW_OFLD_TX_DATA_WR_ALIGNPLD_V(x) ((x) << FW_OFLD_TX_DATA_WR_ALIGNPLD_S) +#define FW_OFLD_TX_DATA_WR_ALIGNPLD_F FW_OFLD_TX_DATA_WR_ALIGNPLD_V(1U) + +#define FW_OFLD_TX_DATA_WR_SHOVE_S 29 +#define FW_OFLD_TX_DATA_WR_SHOVE_V(x) ((x) << FW_OFLD_TX_DATA_WR_SHOVE_S) +#define FW_OFLD_TX_DATA_WR_SHOVE_F FW_OFLD_TX_DATA_WR_SHOVE_V(1U) + +#define FW_OFLD_TX_DATA_WR_TUNNEL_S 19 +#define FW_OFLD_TX_DATA_WR_TUNNEL_V(x) ((x) << FW_OFLD_TX_DATA_WR_TUNNEL_S) + +#define FW_OFLD_TX_DATA_WR_SAVE_S 18 +#define FW_OFLD_TX_DATA_WR_SAVE_V(x) ((x) << FW_OFLD_TX_DATA_WR_SAVE_S) + +#define FW_OFLD_TX_DATA_WR_FLUSH_S 17 +#define FW_OFLD_TX_DATA_WR_FLUSH_V(x) ((x) << FW_OFLD_TX_DATA_WR_FLUSH_S) +#define FW_OFLD_TX_DATA_WR_FLUSH_F FW_OFLD_TX_DATA_WR_FLUSH_V(1U) + +#define FW_OFLD_TX_DATA_WR_URGENT_S 16 +#define FW_OFLD_TX_DATA_WR_URGENT_V(x) ((x) << FW_OFLD_TX_DATA_WR_URGENT_S) + +#define FW_OFLD_TX_DATA_WR_MORE_S 15 +#define FW_OFLD_TX_DATA_WR_MORE_V(x) ((x) << FW_OFLD_TX_DATA_WR_MORE_S) + +#define FW_OFLD_TX_DATA_WR_ULPMODE_S 10 +#define FW_OFLD_TX_DATA_WR_ULPMODE_V(x) ((x) << FW_OFLD_TX_DATA_WR_ULPMODE_S) + +#define FW_OFLD_TX_DATA_WR_ULPSUBMODE_S 6 +#define FW_OFLD_TX_DATA_WR_ULPSUBMODE_V(x) \ + ((x) << FW_OFLD_TX_DATA_WR_ULPSUBMODE_S) + +struct fw_cmd_wr { + __be32 op_dma; + __be32 len16_pkd; + __be64 cookie_daddr; +}; + +#define FW_CMD_WR_DMA_S 17 +#define FW_CMD_WR_DMA_V(x) ((x) << FW_CMD_WR_DMA_S) + +struct fw_eth_tx_pkt_vm_wr { + __be32 op_immdlen; + __be32 equiq_to_len16; + __be32 r3[2]; + u8 ethmacdst[6]; + u8 ethmacsrc[6]; + __be16 ethtype; + __be16 vlantci; +}; + +#define FW_CMD_MAX_TIMEOUT 10000 + +/* + * If a host driver does a HELLO and discovers that there's already a MASTER + * selected, we may have to wait for that MASTER to finish issuing RESET, + * configuration and INITIALIZE commands. Also, there's a possibility that + * our own HELLO may get lost if it happens right as the MASTER is issuing a + * RESET command, so we need to be willing to make a few retries of our HELLO.
+ */ +#define FW_CMD_HELLO_TIMEOUT (3 * FW_CMD_MAX_TIMEOUT) +#define FW_CMD_HELLO_RETRIES 3 + + +enum fw_cmd_opcodes { + FW_LDST_CMD = 0x01, + FW_RESET_CMD = 0x03, + FW_HELLO_CMD = 0x04, + FW_BYE_CMD = 0x05, + FW_INITIALIZE_CMD = 0x06, + FW_CAPS_CONFIG_CMD = 0x07, + FW_PARAMS_CMD = 0x08, + FW_PFVF_CMD = 0x09, + FW_IQ_CMD = 0x10, + FW_EQ_MNGT_CMD = 0x11, + FW_EQ_ETH_CMD = 0x12, + FW_EQ_CTRL_CMD = 0x13, + FW_EQ_OFLD_CMD = 0x21, + FW_VI_CMD = 0x14, + FW_VI_MAC_CMD = 0x15, + FW_VI_RXMODE_CMD = 0x16, + FW_VI_ENABLE_CMD = 0x17, + FW_ACL_MAC_CMD = 0x18, + FW_ACL_VLAN_CMD = 0x19, + FW_VI_STATS_CMD = 0x1a, + FW_PORT_CMD = 0x1b, + FW_PORT_STATS_CMD = 0x1c, + FW_PORT_LB_STATS_CMD = 0x1d, + FW_PORT_TRACE_CMD = 0x1e, + FW_PORT_TRACE_MMAP_CMD = 0x1f, + FW_RSS_IND_TBL_CMD = 0x20, + FW_RSS_GLB_CONFIG_CMD = 0x22, + FW_RSS_VI_CONFIG_CMD = 0x23, + FW_SCHED_CMD = 0x24, + FW_DEVLOG_CMD = 0x25, + FW_CLIP_CMD = 0x28, + FW_PTP_CMD = 0x3e, + FW_HMA_CMD = 0x3f, + FW_LASTC2E_CMD = 0x40, + FW_ERROR_CMD = 0x80, + FW_DEBUG_CMD = 0x81, +}; + +enum fw_cmd_cap { + FW_CMD_CAP_PF = 0x01, + FW_CMD_CAP_DMAQ = 0x02, + FW_CMD_CAP_PORT = 0x04, + FW_CMD_CAP_PORTPROMISC = 0x08, + FW_CMD_CAP_PORTSTATS = 0x10, + FW_CMD_CAP_VF = 0x80, +}; + +/* + * Generic command header flit0 + */ +struct fw_cmd_hdr { + __be32 hi; + __be32 lo; +}; + +#define FW_CMD_OP_S 24 +#define FW_CMD_OP_M 0xff +#define FW_CMD_OP_V(x) ((x) << FW_CMD_OP_S) +#define FW_CMD_OP_G(x) (((x) >> FW_CMD_OP_S) & FW_CMD_OP_M) + +#define FW_CMD_REQUEST_S 23 +#define FW_CMD_REQUEST_V(x) ((x) << FW_CMD_REQUEST_S) +#define FW_CMD_REQUEST_F FW_CMD_REQUEST_V(1U) + +#define FW_CMD_READ_S 22 +#define FW_CMD_READ_V(x) ((x) << FW_CMD_READ_S) +#define FW_CMD_READ_F FW_CMD_READ_V(1U) + +#define FW_CMD_WRITE_S 21 +#define FW_CMD_WRITE_V(x) ((x) << FW_CMD_WRITE_S) +#define FW_CMD_WRITE_F FW_CMD_WRITE_V(1U) + +#define FW_CMD_EXEC_S 20 +#define FW_CMD_EXEC_V(x) ((x) << FW_CMD_EXEC_S) +#define FW_CMD_EXEC_F FW_CMD_EXEC_V(1U) + +#define FW_CMD_RAMASK_S 20 +#define FW_CMD_RAMASK_V(x) ((x) << FW_CMD_RAMASK_S) + +#define FW_CMD_RETVAL_S 8 +#define FW_CMD_RETVAL_M 0xff +#define FW_CMD_RETVAL_V(x) ((x) << FW_CMD_RETVAL_S) +#define FW_CMD_RETVAL_G(x) (((x) >> FW_CMD_RETVAL_S) & FW_CMD_RETVAL_M) + +#define FW_CMD_LEN16_S 0 +#define FW_CMD_LEN16_V(x) ((x) << FW_CMD_LEN16_S) + +#define FW_LEN16(fw_struct) FW_CMD_LEN16_V(sizeof(fw_struct) / 16) + +enum fw_ldst_addrspc { + FW_LDST_ADDRSPC_FIRMWARE = 0x0001, + FW_LDST_ADDRSPC_SGE_EGRC = 0x0008, + FW_LDST_ADDRSPC_SGE_INGC = 0x0009, + FW_LDST_ADDRSPC_SGE_FLMC = 0x000a, + FW_LDST_ADDRSPC_SGE_CONMC = 0x000b, + FW_LDST_ADDRSPC_TP_PIO = 0x0010, + FW_LDST_ADDRSPC_TP_TM_PIO = 0x0011, + FW_LDST_ADDRSPC_TP_MIB = 0x0012, + FW_LDST_ADDRSPC_MDIO = 0x0018, + FW_LDST_ADDRSPC_MPS = 0x0020, + FW_LDST_ADDRSPC_FUNC = 0x0028, + FW_LDST_ADDRSPC_FUNC_PCIE = 0x0029, + FW_LDST_ADDRSPC_I2C = 0x0038, +}; + +enum fw_ldst_mps_fid { + FW_LDST_MPS_ATRB, + FW_LDST_MPS_RPLC +}; + +enum fw_ldst_func_access_ctl { + FW_LDST_FUNC_ACC_CTL_VIID, + FW_LDST_FUNC_ACC_CTL_FID +}; + +enum fw_ldst_func_mod_index { + FW_LDST_FUNC_MPS +}; + +struct fw_ldst_cmd { + __be32 op_to_addrspace; + __be32 cycles_to_len16; + union fw_ldst { + struct fw_ldst_addrval { + __be32 addr; + __be32 val; + } addrval; + struct fw_ldst_idctxt { + __be32 physid; + __be32 msg_ctxtflush; + __be32 ctxt_data7; + __be32 ctxt_data6; + __be32 ctxt_data5; + __be32 ctxt_data4; + __be32 ctxt_data3; + __be32 ctxt_data2; + __be32 ctxt_data1; + __be32 ctxt_data0; + } idctxt; + struct fw_ldst_mdio { + __be16 paddr_mmd; + __be16 
raddr; + __be16 vctl; + __be16 rval; + } mdio; + struct fw_ldst_cim_rq { + u8 req_first64[8]; + u8 req_second64[8]; + u8 resp_first64[8]; + u8 resp_second64[8]; + __be32 r3[2]; + } cim_rq; + union fw_ldst_mps { + struct fw_ldst_mps_rplc { + __be16 fid_idx; + __be16 rplcpf_pkd; + __be32 rplc255_224; + __be32 rplc223_192; + __be32 rplc191_160; + __be32 rplc159_128; + __be32 rplc127_96; + __be32 rplc95_64; + __be32 rplc63_32; + __be32 rplc31_0; + } rplc; + struct fw_ldst_mps_atrb { + __be16 fid_mpsid; + __be16 r2[3]; + __be32 r3[2]; + __be32 r4; + __be32 atrb; + __be16 vlan[16]; + } atrb; + } mps; + struct fw_ldst_func { + u8 access_ctl; + u8 mod_index; + __be16 ctl_id; + __be32 offset; + __be64 data0; + __be64 data1; + } func; + struct fw_ldst_pcie { + u8 ctrl_to_fn; + u8 bnum; + u8 r; + u8 ext_r; + u8 select_naccess; + u8 pcie_fn; + __be16 nset_pkd; + __be32 data[12]; + } pcie; + struct fw_ldst_i2c_deprecated { + u8 pid_pkd; + u8 base; + u8 boffset; + u8 data; + __be32 r9; + } i2c_deprecated; + struct fw_ldst_i2c { + u8 pid; + u8 did; + u8 boffset; + u8 blen; + __be32 r9; + __u8 data[48]; + } i2c; + struct fw_ldst_le { + __be32 index; + __be32 r9; + u8 val[33]; + u8 r11[7]; + } le; + } u; +}; + +#define FW_LDST_CMD_ADDRSPACE_S 0 +#define FW_LDST_CMD_ADDRSPACE_V(x) ((x) << FW_LDST_CMD_ADDRSPACE_S) + +#define FW_LDST_CMD_MSG_S 31 +#define FW_LDST_CMD_MSG_V(x) ((x) << FW_LDST_CMD_MSG_S) + +#define FW_LDST_CMD_CTXTFLUSH_S 30 +#define FW_LDST_CMD_CTXTFLUSH_V(x) ((x) << FW_LDST_CMD_CTXTFLUSH_S) +#define FW_LDST_CMD_CTXTFLUSH_F FW_LDST_CMD_CTXTFLUSH_V(1U) + +#define FW_LDST_CMD_PADDR_S 8 +#define FW_LDST_CMD_PADDR_V(x) ((x) << FW_LDST_CMD_PADDR_S) + +#define FW_LDST_CMD_MMD_S 0 +#define FW_LDST_CMD_MMD_V(x) ((x) << FW_LDST_CMD_MMD_S) + +#define FW_LDST_CMD_FID_S 15 +#define FW_LDST_CMD_FID_V(x) ((x) << FW_LDST_CMD_FID_S) + +#define FW_LDST_CMD_IDX_S 0 +#define FW_LDST_CMD_IDX_V(x) ((x) << FW_LDST_CMD_IDX_S) + +#define FW_LDST_CMD_RPLCPF_S 0 +#define FW_LDST_CMD_RPLCPF_V(x) ((x) << FW_LDST_CMD_RPLCPF_S) + +#define FW_LDST_CMD_LC_S 4 +#define FW_LDST_CMD_LC_V(x) ((x) << FW_LDST_CMD_LC_S) +#define FW_LDST_CMD_LC_F FW_LDST_CMD_LC_V(1U) + +#define FW_LDST_CMD_FN_S 0 +#define FW_LDST_CMD_FN_V(x) ((x) << FW_LDST_CMD_FN_S) + +#define FW_LDST_CMD_NACCESS_S 0 +#define FW_LDST_CMD_NACCESS_V(x) ((x) << FW_LDST_CMD_NACCESS_S) + +struct fw_reset_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be32 val; + __be32 halt_pkd; +}; + +#define FW_RESET_CMD_HALT_S 31 +#define FW_RESET_CMD_HALT_M 0x1 +#define FW_RESET_CMD_HALT_V(x) ((x) << FW_RESET_CMD_HALT_S) +#define FW_RESET_CMD_HALT_G(x) \ + (((x) >> FW_RESET_CMD_HALT_S) & FW_RESET_CMD_HALT_M) +#define FW_RESET_CMD_HALT_F FW_RESET_CMD_HALT_V(1U) + +enum fw_hellow_cmd { + fw_hello_cmd_stage_os = 0x0 +}; + +struct fw_hello_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be32 err_to_clearinit; + __be32 fwrev; +}; + +#define FW_HELLO_CMD_ERR_S 31 +#define FW_HELLO_CMD_ERR_V(x) ((x) << FW_HELLO_CMD_ERR_S) +#define FW_HELLO_CMD_ERR_F FW_HELLO_CMD_ERR_V(1U) + +#define FW_HELLO_CMD_INIT_S 30 +#define FW_HELLO_CMD_INIT_V(x) ((x) << FW_HELLO_CMD_INIT_S) +#define FW_HELLO_CMD_INIT_F FW_HELLO_CMD_INIT_V(1U) + +#define FW_HELLO_CMD_MASTERDIS_S 29 +#define FW_HELLO_CMD_MASTERDIS_V(x) ((x) << FW_HELLO_CMD_MASTERDIS_S) + +#define FW_HELLO_CMD_MASTERFORCE_S 28 +#define FW_HELLO_CMD_MASTERFORCE_V(x) ((x) << FW_HELLO_CMD_MASTERFORCE_S) + +#define FW_HELLO_CMD_MBMASTER_S 24 +#define FW_HELLO_CMD_MBMASTER_M 0xfU +#define FW_HELLO_CMD_MBMASTER_V(x) ((x) << 
FW_HELLO_CMD_MBMASTER_S) +#define FW_HELLO_CMD_MBMASTER_G(x) \ + (((x) >> FW_HELLO_CMD_MBMASTER_S) & FW_HELLO_CMD_MBMASTER_M) + +#define FW_HELLO_CMD_MBASYNCNOTINT_S 23 +#define FW_HELLO_CMD_MBASYNCNOTINT_V(x) ((x) << FW_HELLO_CMD_MBASYNCNOTINT_S) + +#define FW_HELLO_CMD_MBASYNCNOT_S 20 +#define FW_HELLO_CMD_MBASYNCNOT_V(x) ((x) << FW_HELLO_CMD_MBASYNCNOT_S) + +#define FW_HELLO_CMD_STAGE_S 17 +#define FW_HELLO_CMD_STAGE_V(x) ((x) << FW_HELLO_CMD_STAGE_S) + +#define FW_HELLO_CMD_CLEARINIT_S 16 +#define FW_HELLO_CMD_CLEARINIT_V(x) ((x) << FW_HELLO_CMD_CLEARINIT_S) +#define FW_HELLO_CMD_CLEARINIT_F FW_HELLO_CMD_CLEARINIT_V(1U) + +struct fw_bye_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be64 r3; +}; + +struct fw_initialize_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be64 r3; +}; + +enum fw_caps_config_hm { + FW_CAPS_CONFIG_HM_PCIE = 0x00000001, + FW_CAPS_CONFIG_HM_PL = 0x00000002, + FW_CAPS_CONFIG_HM_SGE = 0x00000004, + FW_CAPS_CONFIG_HM_CIM = 0x00000008, + FW_CAPS_CONFIG_HM_ULPTX = 0x00000010, + FW_CAPS_CONFIG_HM_TP = 0x00000020, + FW_CAPS_CONFIG_HM_ULPRX = 0x00000040, + FW_CAPS_CONFIG_HM_PMRX = 0x00000080, + FW_CAPS_CONFIG_HM_PMTX = 0x00000100, + FW_CAPS_CONFIG_HM_MC = 0x00000200, + FW_CAPS_CONFIG_HM_LE = 0x00000400, + FW_CAPS_CONFIG_HM_MPS = 0x00000800, + FW_CAPS_CONFIG_HM_XGMAC = 0x00001000, + FW_CAPS_CONFIG_HM_CPLSWITCH = 0x00002000, + FW_CAPS_CONFIG_HM_T4DBG = 0x00004000, + FW_CAPS_CONFIG_HM_MI = 0x00008000, + FW_CAPS_CONFIG_HM_I2CM = 0x00010000, + FW_CAPS_CONFIG_HM_NCSI = 0x00020000, + FW_CAPS_CONFIG_HM_SMB = 0x00040000, + FW_CAPS_CONFIG_HM_MA = 0x00080000, + FW_CAPS_CONFIG_HM_EDRAM = 0x00100000, + FW_CAPS_CONFIG_HM_PMU = 0x00200000, + FW_CAPS_CONFIG_HM_UART = 0x00400000, + FW_CAPS_CONFIG_HM_SF = 0x00800000, +}; + +enum fw_caps_config_nbm { + FW_CAPS_CONFIG_NBM_IPMI = 0x00000001, + FW_CAPS_CONFIG_NBM_NCSI = 0x00000002, +}; + +enum fw_caps_config_link { + FW_CAPS_CONFIG_LINK_PPP = 0x00000001, + FW_CAPS_CONFIG_LINK_QFC = 0x00000002, + FW_CAPS_CONFIG_LINK_DCBX = 0x00000004, +}; + +enum fw_caps_config_switch { + FW_CAPS_CONFIG_SWITCH_INGRESS = 0x00000001, + FW_CAPS_CONFIG_SWITCH_EGRESS = 0x00000002, +}; + +enum fw_caps_config_nic { + FW_CAPS_CONFIG_NIC = 0x00000001, + FW_CAPS_CONFIG_NIC_VM = 0x00000002, + FW_CAPS_CONFIG_NIC_HASHFILTER = 0x00000020, +}; + +enum fw_caps_config_ofld { + FW_CAPS_CONFIG_OFLD = 0x00000001, +}; + +enum fw_caps_config_rdma { + FW_CAPS_CONFIG_RDMA_RDDP = 0x00000001, + FW_CAPS_CONFIG_RDMA_RDMAC = 0x00000002, +}; + +enum fw_caps_config_iscsi { + FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU = 0x00000001, + FW_CAPS_CONFIG_ISCSI_TARGET_PDU = 0x00000002, + FW_CAPS_CONFIG_ISCSI_INITIATOR_CNXOFLD = 0x00000004, + FW_CAPS_CONFIG_ISCSI_TARGET_CNXOFLD = 0x00000008, +}; + +enum fw_caps_config_crypto { + FW_CAPS_CONFIG_CRYPTO_LOOKASIDE = 0x00000001, + FW_CAPS_CONFIG_TLS_INLINE = 0x00000002, + FW_CAPS_CONFIG_IPSEC_INLINE = 0x00000004, +}; + +enum fw_caps_config_fcoe { + FW_CAPS_CONFIG_FCOE_INITIATOR = 0x00000001, + FW_CAPS_CONFIG_FCOE_TARGET = 0x00000002, + FW_CAPS_CONFIG_FCOE_CTRL_OFLD = 0x00000004, +}; + +enum fw_memtype_cf { + FW_MEMTYPE_CF_EDC0 = 0x0, + FW_MEMTYPE_CF_EDC1 = 0x1, + FW_MEMTYPE_CF_EXTMEM = 0x2, + FW_MEMTYPE_CF_FLASH = 0x4, + FW_MEMTYPE_CF_INTERNAL = 0x5, + FW_MEMTYPE_CF_EXTMEM1 = 0x6, + FW_MEMTYPE_CF_HMA = 0x7, +}; + +struct fw_caps_config_cmd { + __be32 op_to_write; + __be32 cfvalid_to_len16; + __be32 r2; + __be32 hwmbitmap; + __be16 nbmcaps; + __be16 linkcaps; + __be16 switchcaps; + __be16 r3; + __be16 niccaps; + __be16 ofldcaps; + __be16 rdmacaps; 
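/*
 * Editorial aside, not part of the imported header: every fw_*_cmd in this
 * file begins with the generic header flit described by struct fw_cmd_hdr,
 * so FW_CMD_OP/REQUEST/READ/WRITE/RETVAL/LEN16 compose and decode the first
 * two 32-bit words of any command.  Below is a minimal sketch using struct
 * fw_reset_cmd; mailbox delivery and error handling are omitted, and the
 * helper name is illustrative only.
 */
static inline void fw_reset_cmd_init(struct fw_reset_cmd *c, u32 reset_val)
{
	memset(c, 0, sizeof(*c));
	/* opcode plus "this is a request" and "this is a write" flags */
	c->op_to_write = cpu_to_be32(FW_CMD_OP_V(FW_RESET_CMD) |
				     FW_CMD_REQUEST_F | FW_CMD_WRITE_F);
	/* command length in 16-byte units goes in the low header word */
	c->retval_len16 = cpu_to_be32(FW_LEN16(*c));
	c->val = cpu_to_be32(reset_val);
}
/*
 * On completion the firmware returns its 8-bit status in the same word:
 *	ret = FW_CMD_RETVAL_G(be32_to_cpu(rpl->retval_len16));
 */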
+ __be16 cryptocaps; + __be16 iscsicaps; + __be16 fcoecaps; + __be32 cfcsum; + __be32 finiver; + __be32 finicsum; +}; + +#define FW_CAPS_CONFIG_CMD_CFVALID_S 27 +#define FW_CAPS_CONFIG_CMD_CFVALID_V(x) ((x) << FW_CAPS_CONFIG_CMD_CFVALID_S) +#define FW_CAPS_CONFIG_CMD_CFVALID_F FW_CAPS_CONFIG_CMD_CFVALID_V(1U) + +#define FW_CAPS_CONFIG_CMD_MEMTYPE_CF_S 24 +#define FW_CAPS_CONFIG_CMD_MEMTYPE_CF_V(x) \ + ((x) << FW_CAPS_CONFIG_CMD_MEMTYPE_CF_S) + +#define FW_CAPS_CONFIG_CMD_MEMADDR64K_CF_S 16 +#define FW_CAPS_CONFIG_CMD_MEMADDR64K_CF_V(x) \ + ((x) << FW_CAPS_CONFIG_CMD_MEMADDR64K_CF_S) + +/* + * params command mnemonics + */ +enum fw_params_mnem { + FW_PARAMS_MNEM_DEV = 1, /* device params */ + FW_PARAMS_MNEM_PFVF = 2, /* function params */ + FW_PARAMS_MNEM_REG = 3, /* limited register access */ + FW_PARAMS_MNEM_DMAQ = 4, /* dma queue params */ + FW_PARAMS_MNEM_CHNET = 5, /* chnet params */ + FW_PARAMS_MNEM_LAST +}; + +/* + * device parameters + */ +enum fw_params_param_dev { + FW_PARAMS_PARAM_DEV_CCLK = 0x00, /* chip core clock in khz */ + FW_PARAMS_PARAM_DEV_PORTVEC = 0x01, /* the port vector */ + FW_PARAMS_PARAM_DEV_NTID = 0x02, /* reads the number of TIDs + * allocated by the device's + * Lookup Engine + */ + FW_PARAMS_PARAM_DEV_FLOWC_BUFFIFO_SZ = 0x03, + FW_PARAMS_PARAM_DEV_INTVER_NIC = 0x04, + FW_PARAMS_PARAM_DEV_INTVER_VNIC = 0x05, + FW_PARAMS_PARAM_DEV_INTVER_OFLD = 0x06, + FW_PARAMS_PARAM_DEV_INTVER_RI = 0x07, + FW_PARAMS_PARAM_DEV_INTVER_ISCSIPDU = 0x08, + FW_PARAMS_PARAM_DEV_INTVER_ISCSI = 0x09, + FW_PARAMS_PARAM_DEV_INTVER_FCOE = 0x0A, + FW_PARAMS_PARAM_DEV_FWREV = 0x0B, + FW_PARAMS_PARAM_DEV_TPREV = 0x0C, + FW_PARAMS_PARAM_DEV_CF = 0x0D, + FW_PARAMS_PARAM_DEV_PHYFW = 0x0F, + FW_PARAMS_PARAM_DEV_DIAG = 0x11, + FW_PARAMS_PARAM_DEV_MAXORDIRD_QP = 0x13, /* max supported QP IRD/ORD */ + FW_PARAMS_PARAM_DEV_MAXIRD_ADAPTER = 0x14, /* max supported adap IRD */ + FW_PARAMS_PARAM_DEV_ULPTX_MEMWRITE_DSGL = 0x17, + FW_PARAMS_PARAM_DEV_FWCACHE = 0x18, + FW_PARAMS_PARAM_DEV_SCFGREV = 0x1A, + FW_PARAMS_PARAM_DEV_VPDREV = 0x1B, + FW_PARAMS_PARAM_DEV_RI_FR_NSMR_TPTE_WR = 0x1C, + FW_PARAMS_PARAM_DEV_FILTER2_WR = 0x1D, + FW_PARAMS_PARAM_DEV_MPSBGMAP = 0x1E, + FW_PARAMS_PARAM_DEV_HMA_SIZE = 0x20, + FW_PARAMS_PARAM_DEV_RDMA_WRITE_WITH_IMM = 0x21, + FW_PARAMS_PARAM_DEV_RI_WRITE_CMPL_WR = 0x24, +}; + +/* + * physical and virtual function parameters + */ +enum fw_params_param_pfvf { + FW_PARAMS_PARAM_PFVF_RWXCAPS = 0x00, + FW_PARAMS_PARAM_PFVF_ROUTE_START = 0x01, + FW_PARAMS_PARAM_PFVF_ROUTE_END = 0x02, + FW_PARAMS_PARAM_PFVF_CLIP_START = 0x03, + FW_PARAMS_PARAM_PFVF_CLIP_END = 0x04, + FW_PARAMS_PARAM_PFVF_FILTER_START = 0x05, + FW_PARAMS_PARAM_PFVF_FILTER_END = 0x06, + FW_PARAMS_PARAM_PFVF_SERVER_START = 0x07, + FW_PARAMS_PARAM_PFVF_SERVER_END = 0x08, + FW_PARAMS_PARAM_PFVF_TDDP_START = 0x09, + FW_PARAMS_PARAM_PFVF_TDDP_END = 0x0A, + FW_PARAMS_PARAM_PFVF_ISCSI_START = 0x0B, + FW_PARAMS_PARAM_PFVF_ISCSI_END = 0x0C, + FW_PARAMS_PARAM_PFVF_STAG_START = 0x0D, + FW_PARAMS_PARAM_PFVF_STAG_END = 0x0E, + FW_PARAMS_PARAM_PFVF_RQ_START = 0x1F, + FW_PARAMS_PARAM_PFVF_RQ_END = 0x10, + FW_PARAMS_PARAM_PFVF_PBL_START = 0x11, + FW_PARAMS_PARAM_PFVF_PBL_END = 0x12, + FW_PARAMS_PARAM_PFVF_L2T_START = 0x13, + FW_PARAMS_PARAM_PFVF_L2T_END = 0x14, + FW_PARAMS_PARAM_PFVF_SQRQ_START = 0x15, + FW_PARAMS_PARAM_PFVF_SQRQ_END = 0x16, + FW_PARAMS_PARAM_PFVF_CQ_START = 0x17, + FW_PARAMS_PARAM_PFVF_CQ_END = 0x18, + FW_PARAMS_PARAM_PFVF_SRQ_START = 0x19, + FW_PARAMS_PARAM_PFVF_SRQ_END = 0x1A, + FW_PARAMS_PARAM_PFVF_SCHEDCLASS_ETH = 
0x20, + FW_PARAMS_PARAM_PFVF_VIID = 0x24, + FW_PARAMS_PARAM_PFVF_CPMASK = 0x25, + FW_PARAMS_PARAM_PFVF_OCQ_START = 0x26, + FW_PARAMS_PARAM_PFVF_OCQ_END = 0x27, + FW_PARAMS_PARAM_PFVF_CONM_MAP = 0x28, + FW_PARAMS_PARAM_PFVF_IQFLINT_START = 0x29, + FW_PARAMS_PARAM_PFVF_IQFLINT_END = 0x2A, + FW_PARAMS_PARAM_PFVF_EQ_START = 0x2B, + FW_PARAMS_PARAM_PFVF_EQ_END = 0x2C, + FW_PARAMS_PARAM_PFVF_ACTIVE_FILTER_START = 0x2D, + FW_PARAMS_PARAM_PFVF_ACTIVE_FILTER_END = 0x2E, + FW_PARAMS_PARAM_PFVF_ETHOFLD_START = 0x2F, + FW_PARAMS_PARAM_PFVF_ETHOFLD_END = 0x30, + FW_PARAMS_PARAM_PFVF_CPLFW4MSG_ENCAP = 0x31, + FW_PARAMS_PARAM_PFVF_HPFILTER_START = 0x32, + FW_PARAMS_PARAM_PFVF_HPFILTER_END = 0x33, + FW_PARAMS_PARAM_PFVF_TLS_START = 0x34, + FW_PARAMS_PARAM_PFVF_TLS_END = 0x35, + FW_PARAMS_PARAM_PFVF_NCRYPTO_LOOKASIDE = 0x39, + FW_PARAMS_PARAM_PFVF_PORT_CAPS32 = 0x3A, +}; + +/* + * dma queue parameters + */ +enum fw_params_param_dmaq { + FW_PARAMS_PARAM_DMAQ_IQ_DCAEN_DCACPU = 0x00, + FW_PARAMS_PARAM_DMAQ_IQ_INTCNTTHRESH = 0x01, + FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_MNGT = 0x10, + FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL = 0x11, + FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH = 0x12, + FW_PARAMS_PARAM_DMAQ_EQ_DCBPRIO_ETH = 0x13, + FW_PARAMS_PARAM_DMAQ_CONM_CTXT = 0x20, +}; + +enum fw_params_param_dev_phyfw { + FW_PARAMS_PARAM_DEV_PHYFW_DOWNLOAD = 0x00, + FW_PARAMS_PARAM_DEV_PHYFW_VERSION = 0x01, +}; + +enum fw_params_param_dev_diag { + FW_PARAM_DEV_DIAG_TMP = 0x00, + FW_PARAM_DEV_DIAG_VDD = 0x01, +}; + +enum fw_params_param_dev_fwcache { + FW_PARAM_DEV_FWCACHE_FLUSH = 0x00, + FW_PARAM_DEV_FWCACHE_FLUSHINV = 0x01, +}; + +#define FW_PARAMS_MNEM_S 24 +#define FW_PARAMS_MNEM_V(x) ((x) << FW_PARAMS_MNEM_S) + +#define FW_PARAMS_PARAM_X_S 16 +#define FW_PARAMS_PARAM_X_V(x) ((x) << FW_PARAMS_PARAM_X_S) + +#define FW_PARAMS_PARAM_Y_S 8 +#define FW_PARAMS_PARAM_Y_M 0xffU +#define FW_PARAMS_PARAM_Y_V(x) ((x) << FW_PARAMS_PARAM_Y_S) +#define FW_PARAMS_PARAM_Y_G(x) (((x) >> FW_PARAMS_PARAM_Y_S) &\ + FW_PARAMS_PARAM_Y_M) + +#define FW_PARAMS_PARAM_Z_S 0 +#define FW_PARAMS_PARAM_Z_M 0xffu +#define FW_PARAMS_PARAM_Z_V(x) ((x) << FW_PARAMS_PARAM_Z_S) +#define FW_PARAMS_PARAM_Z_G(x) (((x) >> FW_PARAMS_PARAM_Z_S) &\ + FW_PARAMS_PARAM_Z_M) + +#define FW_PARAMS_PARAM_XYZ_S 0 +#define FW_PARAMS_PARAM_XYZ_V(x) ((x) << FW_PARAMS_PARAM_XYZ_S) + +#define FW_PARAMS_PARAM_YZ_S 0 +#define FW_PARAMS_PARAM_YZ_V(x) ((x) << FW_PARAMS_PARAM_YZ_S) + +struct fw_params_cmd { + __be32 op_to_vfn; + __be32 retval_len16; + struct fw_params_param { + __be32 mnem; + __be32 val; + } param[7]; +}; + +#define FW_PARAMS_CMD_PFN_S 8 +#define FW_PARAMS_CMD_PFN_V(x) ((x) << FW_PARAMS_CMD_PFN_S) + +#define FW_PARAMS_CMD_VFN_S 0 +#define FW_PARAMS_CMD_VFN_V(x) ((x) << FW_PARAMS_CMD_VFN_S) + +struct fw_pfvf_cmd { + __be32 op_to_vfn; + __be32 retval_len16; + __be32 niqflint_niq; + __be32 type_to_neq; + __be32 tc_to_nexactf; + __be32 r_caps_to_nethctrl; + __be16 nricq; + __be16 nriqp; + __be32 r4; +}; + +#define FW_PFVF_CMD_PFN_S 8 +#define FW_PFVF_CMD_PFN_V(x) ((x) << FW_PFVF_CMD_PFN_S) + +#define FW_PFVF_CMD_VFN_S 0 +#define FW_PFVF_CMD_VFN_V(x) ((x) << FW_PFVF_CMD_VFN_S) + +#define FW_PFVF_CMD_NIQFLINT_S 20 +#define FW_PFVF_CMD_NIQFLINT_M 0xfff +#define FW_PFVF_CMD_NIQFLINT_V(x) ((x) << FW_PFVF_CMD_NIQFLINT_S) +#define FW_PFVF_CMD_NIQFLINT_G(x) \ + (((x) >> FW_PFVF_CMD_NIQFLINT_S) & FW_PFVF_CMD_NIQFLINT_M) + +#define FW_PFVF_CMD_NIQ_S 0 +#define FW_PFVF_CMD_NIQ_M 0xfffff +#define FW_PFVF_CMD_NIQ_V(x) ((x) << FW_PFVF_CMD_NIQ_S) +#define FW_PFVF_CMD_NIQ_G(x) \ + (((x) >> 
FW_PFVF_CMD_NIQ_S) & FW_PFVF_CMD_NIQ_M) + +#define FW_PFVF_CMD_TYPE_S 31 +#define FW_PFVF_CMD_TYPE_M 0x1 +#define FW_PFVF_CMD_TYPE_V(x) ((x) << FW_PFVF_CMD_TYPE_S) +#define FW_PFVF_CMD_TYPE_G(x) \ + (((x) >> FW_PFVF_CMD_TYPE_S) & FW_PFVF_CMD_TYPE_M) +#define FW_PFVF_CMD_TYPE_F FW_PFVF_CMD_TYPE_V(1U) + +#define FW_PFVF_CMD_CMASK_S 24 +#define FW_PFVF_CMD_CMASK_M 0xf +#define FW_PFVF_CMD_CMASK_V(x) ((x) << FW_PFVF_CMD_CMASK_S) +#define FW_PFVF_CMD_CMASK_G(x) \ + (((x) >> FW_PFVF_CMD_CMASK_S) & FW_PFVF_CMD_CMASK_M) + +#define FW_PFVF_CMD_PMASK_S 20 +#define FW_PFVF_CMD_PMASK_M 0xf +#define FW_PFVF_CMD_PMASK_V(x) ((x) << FW_PFVF_CMD_PMASK_S) +#define FW_PFVF_CMD_PMASK_G(x) \ + (((x) >> FW_PFVF_CMD_PMASK_S) & FW_PFVF_CMD_PMASK_M) + +#define FW_PFVF_CMD_NEQ_S 0 +#define FW_PFVF_CMD_NEQ_M 0xfffff +#define FW_PFVF_CMD_NEQ_V(x) ((x) << FW_PFVF_CMD_NEQ_S) +#define FW_PFVF_CMD_NEQ_G(x) \ + (((x) >> FW_PFVF_CMD_NEQ_S) & FW_PFVF_CMD_NEQ_M) + +#define FW_PFVF_CMD_TC_S 24 +#define FW_PFVF_CMD_TC_M 0xff +#define FW_PFVF_CMD_TC_V(x) ((x) << FW_PFVF_CMD_TC_S) +#define FW_PFVF_CMD_TC_G(x) (((x) >> FW_PFVF_CMD_TC_S) & FW_PFVF_CMD_TC_M) + +#define FW_PFVF_CMD_NVI_S 16 +#define FW_PFVF_CMD_NVI_M 0xff +#define FW_PFVF_CMD_NVI_V(x) ((x) << FW_PFVF_CMD_NVI_S) +#define FW_PFVF_CMD_NVI_G(x) (((x) >> FW_PFVF_CMD_NVI_S) & FW_PFVF_CMD_NVI_M) + +#define FW_PFVF_CMD_NEXACTF_S 0 +#define FW_PFVF_CMD_NEXACTF_M 0xffff +#define FW_PFVF_CMD_NEXACTF_V(x) ((x) << FW_PFVF_CMD_NEXACTF_S) +#define FW_PFVF_CMD_NEXACTF_G(x) \ + (((x) >> FW_PFVF_CMD_NEXACTF_S) & FW_PFVF_CMD_NEXACTF_M) + +#define FW_PFVF_CMD_R_CAPS_S 24 +#define FW_PFVF_CMD_R_CAPS_M 0xff +#define FW_PFVF_CMD_R_CAPS_V(x) ((x) << FW_PFVF_CMD_R_CAPS_S) +#define FW_PFVF_CMD_R_CAPS_G(x) \ + (((x) >> FW_PFVF_CMD_R_CAPS_S) & FW_PFVF_CMD_R_CAPS_M) + +#define FW_PFVF_CMD_WX_CAPS_S 16 +#define FW_PFVF_CMD_WX_CAPS_M 0xff +#define FW_PFVF_CMD_WX_CAPS_V(x) ((x) << FW_PFVF_CMD_WX_CAPS_S) +#define FW_PFVF_CMD_WX_CAPS_G(x) \ + (((x) >> FW_PFVF_CMD_WX_CAPS_S) & FW_PFVF_CMD_WX_CAPS_M) + +#define FW_PFVF_CMD_NETHCTRL_S 0 +#define FW_PFVF_CMD_NETHCTRL_M 0xffff +#define FW_PFVF_CMD_NETHCTRL_V(x) ((x) << FW_PFVF_CMD_NETHCTRL_S) +#define FW_PFVF_CMD_NETHCTRL_G(x) \ + (((x) >> FW_PFVF_CMD_NETHCTRL_S) & FW_PFVF_CMD_NETHCTRL_M) + +enum fw_iq_type { + FW_IQ_TYPE_FL_INT_CAP, + FW_IQ_TYPE_NO_FL_INT_CAP +}; + +struct fw_iq_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be16 physiqid; + __be16 iqid; + __be16 fl0id; + __be16 fl1id; + __be32 type_to_iqandstindex; + __be16 iqdroprss_to_iqesize; + __be16 iqsize; + __be64 iqaddr; + __be32 iqns_to_fl0congen; + __be16 fl0dcaen_to_fl0cidxfthresh; + __be16 fl0size; + __be64 fl0addr; + __be32 fl1cngchmap_to_fl1congen; + __be16 fl1dcaen_to_fl1cidxfthresh; + __be16 fl1size; + __be64 fl1addr; +}; + +#define FW_IQ_CMD_PFN_S 8 +#define FW_IQ_CMD_PFN_V(x) ((x) << FW_IQ_CMD_PFN_S) + +#define FW_IQ_CMD_VFN_S 0 +#define FW_IQ_CMD_VFN_V(x) ((x) << FW_IQ_CMD_VFN_S) + +#define FW_IQ_CMD_ALLOC_S 31 +#define FW_IQ_CMD_ALLOC_V(x) ((x) << FW_IQ_CMD_ALLOC_S) +#define FW_IQ_CMD_ALLOC_F FW_IQ_CMD_ALLOC_V(1U) + +#define FW_IQ_CMD_FREE_S 30 +#define FW_IQ_CMD_FREE_V(x) ((x) << FW_IQ_CMD_FREE_S) +#define FW_IQ_CMD_FREE_F FW_IQ_CMD_FREE_V(1U) + +#define FW_IQ_CMD_MODIFY_S 29 +#define FW_IQ_CMD_MODIFY_V(x) ((x) << FW_IQ_CMD_MODIFY_S) +#define FW_IQ_CMD_MODIFY_F FW_IQ_CMD_MODIFY_V(1U) + +#define FW_IQ_CMD_IQSTART_S 28 +#define FW_IQ_CMD_IQSTART_V(x) ((x) << FW_IQ_CMD_IQSTART_S) +#define FW_IQ_CMD_IQSTART_F FW_IQ_CMD_IQSTART_V(1U) + +#define FW_IQ_CMD_IQSTOP_S 27 +#define 
FW_IQ_CMD_IQSTOP_V(x) ((x) << FW_IQ_CMD_IQSTOP_S) +#define FW_IQ_CMD_IQSTOP_F FW_IQ_CMD_IQSTOP_V(1U) + +#define FW_IQ_CMD_TYPE_S 29 +#define FW_IQ_CMD_TYPE_V(x) ((x) << FW_IQ_CMD_TYPE_S) + +#define FW_IQ_CMD_IQASYNCH_S 28 +#define FW_IQ_CMD_IQASYNCH_V(x) ((x) << FW_IQ_CMD_IQASYNCH_S) + +#define FW_IQ_CMD_VIID_S 16 +#define FW_IQ_CMD_VIID_V(x) ((x) << FW_IQ_CMD_VIID_S) + +#define FW_IQ_CMD_IQANDST_S 15 +#define FW_IQ_CMD_IQANDST_V(x) ((x) << FW_IQ_CMD_IQANDST_S) + +#define FW_IQ_CMD_IQANUS_S 14 +#define FW_IQ_CMD_IQANUS_V(x) ((x) << FW_IQ_CMD_IQANUS_S) + +#define FW_IQ_CMD_IQANUD_S 12 +#define FW_IQ_CMD_IQANUD_V(x) ((x) << FW_IQ_CMD_IQANUD_S) + +#define FW_IQ_CMD_IQANDSTINDEX_S 0 +#define FW_IQ_CMD_IQANDSTINDEX_V(x) ((x) << FW_IQ_CMD_IQANDSTINDEX_S) + +#define FW_IQ_CMD_IQDROPRSS_S 15 +#define FW_IQ_CMD_IQDROPRSS_V(x) ((x) << FW_IQ_CMD_IQDROPRSS_S) +#define FW_IQ_CMD_IQDROPRSS_F FW_IQ_CMD_IQDROPRSS_V(1U) + +#define FW_IQ_CMD_IQGTSMODE_S 14 +#define FW_IQ_CMD_IQGTSMODE_V(x) ((x) << FW_IQ_CMD_IQGTSMODE_S) +#define FW_IQ_CMD_IQGTSMODE_F FW_IQ_CMD_IQGTSMODE_V(1U) + +#define FW_IQ_CMD_IQPCIECH_S 12 +#define FW_IQ_CMD_IQPCIECH_V(x) ((x) << FW_IQ_CMD_IQPCIECH_S) + +#define FW_IQ_CMD_IQDCAEN_S 11 +#define FW_IQ_CMD_IQDCAEN_V(x) ((x) << FW_IQ_CMD_IQDCAEN_S) + +#define FW_IQ_CMD_IQDCACPU_S 6 +#define FW_IQ_CMD_IQDCACPU_V(x) ((x) << FW_IQ_CMD_IQDCACPU_S) + +#define FW_IQ_CMD_IQINTCNTTHRESH_S 4 +#define FW_IQ_CMD_IQINTCNTTHRESH_V(x) ((x) << FW_IQ_CMD_IQINTCNTTHRESH_S) + +#define FW_IQ_CMD_IQO_S 3 +#define FW_IQ_CMD_IQO_V(x) ((x) << FW_IQ_CMD_IQO_S) +#define FW_IQ_CMD_IQO_F FW_IQ_CMD_IQO_V(1U) + +#define FW_IQ_CMD_IQCPRIO_S 2 +#define FW_IQ_CMD_IQCPRIO_V(x) ((x) << FW_IQ_CMD_IQCPRIO_S) + +#define FW_IQ_CMD_IQESIZE_S 0 +#define FW_IQ_CMD_IQESIZE_V(x) ((x) << FW_IQ_CMD_IQESIZE_S) + +#define FW_IQ_CMD_IQNS_S 31 +#define FW_IQ_CMD_IQNS_V(x) ((x) << FW_IQ_CMD_IQNS_S) + +#define FW_IQ_CMD_IQRO_S 30 +#define FW_IQ_CMD_IQRO_V(x) ((x) << FW_IQ_CMD_IQRO_S) + +#define FW_IQ_CMD_IQFLINTIQHSEN_S 28 +#define FW_IQ_CMD_IQFLINTIQHSEN_V(x) ((x) << FW_IQ_CMD_IQFLINTIQHSEN_S) + +#define FW_IQ_CMD_IQFLINTCONGEN_S 27 +#define FW_IQ_CMD_IQFLINTCONGEN_V(x) ((x) << FW_IQ_CMD_IQFLINTCONGEN_S) +#define FW_IQ_CMD_IQFLINTCONGEN_F FW_IQ_CMD_IQFLINTCONGEN_V(1U) + +#define FW_IQ_CMD_IQFLINTISCSIC_S 26 +#define FW_IQ_CMD_IQFLINTISCSIC_V(x) ((x) << FW_IQ_CMD_IQFLINTISCSIC_S) + +#define FW_IQ_CMD_FL0CNGCHMAP_S 20 +#define FW_IQ_CMD_FL0CNGCHMAP_V(x) ((x) << FW_IQ_CMD_FL0CNGCHMAP_S) + +#define FW_IQ_CMD_FL0CACHELOCK_S 15 +#define FW_IQ_CMD_FL0CACHELOCK_V(x) ((x) << FW_IQ_CMD_FL0CACHELOCK_S) + +#define FW_IQ_CMD_FL0DBP_S 14 +#define FW_IQ_CMD_FL0DBP_V(x) ((x) << FW_IQ_CMD_FL0DBP_S) + +#define FW_IQ_CMD_FL0DATANS_S 13 +#define FW_IQ_CMD_FL0DATANS_V(x) ((x) << FW_IQ_CMD_FL0DATANS_S) + +#define FW_IQ_CMD_FL0DATARO_S 12 +#define FW_IQ_CMD_FL0DATARO_V(x) ((x) << FW_IQ_CMD_FL0DATARO_S) +#define FW_IQ_CMD_FL0DATARO_F FW_IQ_CMD_FL0DATARO_V(1U) + +#define FW_IQ_CMD_FL0CONGCIF_S 11 +#define FW_IQ_CMD_FL0CONGCIF_V(x) ((x) << FW_IQ_CMD_FL0CONGCIF_S) +#define FW_IQ_CMD_FL0CONGCIF_F FW_IQ_CMD_FL0CONGCIF_V(1U) + +#define FW_IQ_CMD_FL0ONCHIP_S 10 +#define FW_IQ_CMD_FL0ONCHIP_V(x) ((x) << FW_IQ_CMD_FL0ONCHIP_S) + +#define FW_IQ_CMD_FL0STATUSPGNS_S 9 +#define FW_IQ_CMD_FL0STATUSPGNS_V(x) ((x) << FW_IQ_CMD_FL0STATUSPGNS_S) + +#define FW_IQ_CMD_FL0STATUSPGRO_S 8 +#define FW_IQ_CMD_FL0STATUSPGRO_V(x) ((x) << FW_IQ_CMD_FL0STATUSPGRO_S) + +#define FW_IQ_CMD_FL0FETCHNS_S 7 +#define FW_IQ_CMD_FL0FETCHNS_V(x) ((x) << FW_IQ_CMD_FL0FETCHNS_S) + +#define 
FW_IQ_CMD_FL0FETCHRO_S 6 +#define FW_IQ_CMD_FL0FETCHRO_V(x) ((x) << FW_IQ_CMD_FL0FETCHRO_S) +#define FW_IQ_CMD_FL0FETCHRO_F FW_IQ_CMD_FL0FETCHRO_V(1U) + +#define FW_IQ_CMD_FL0HOSTFCMODE_S 4 +#define FW_IQ_CMD_FL0HOSTFCMODE_V(x) ((x) << FW_IQ_CMD_FL0HOSTFCMODE_S) + +#define FW_IQ_CMD_FL0CPRIO_S 3 +#define FW_IQ_CMD_FL0CPRIO_V(x) ((x) << FW_IQ_CMD_FL0CPRIO_S) + +#define FW_IQ_CMD_FL0PADEN_S 2 +#define FW_IQ_CMD_FL0PADEN_V(x) ((x) << FW_IQ_CMD_FL0PADEN_S) +#define FW_IQ_CMD_FL0PADEN_F FW_IQ_CMD_FL0PADEN_V(1U) + +#define FW_IQ_CMD_FL0PACKEN_S 1 +#define FW_IQ_CMD_FL0PACKEN_V(x) ((x) << FW_IQ_CMD_FL0PACKEN_S) +#define FW_IQ_CMD_FL0PACKEN_F FW_IQ_CMD_FL0PACKEN_V(1U) + +#define FW_IQ_CMD_FL0CONGEN_S 0 +#define FW_IQ_CMD_FL0CONGEN_V(x) ((x) << FW_IQ_CMD_FL0CONGEN_S) +#define FW_IQ_CMD_FL0CONGEN_F FW_IQ_CMD_FL0CONGEN_V(1U) + +#define FW_IQ_CMD_FL0DCAEN_S 15 +#define FW_IQ_CMD_FL0DCAEN_V(x) ((x) << FW_IQ_CMD_FL0DCAEN_S) + +#define FW_IQ_CMD_FL0DCACPU_S 10 +#define FW_IQ_CMD_FL0DCACPU_V(x) ((x) << FW_IQ_CMD_FL0DCACPU_S) + +#define FW_IQ_CMD_FL0FBMIN_S 7 +#define FW_IQ_CMD_FL0FBMIN_V(x) ((x) << FW_IQ_CMD_FL0FBMIN_S) + +#define FW_IQ_CMD_FL0FBMAX_S 4 +#define FW_IQ_CMD_FL0FBMAX_V(x) ((x) << FW_IQ_CMD_FL0FBMAX_S) + +#define FW_IQ_CMD_FL0CIDXFTHRESHO_S 3 +#define FW_IQ_CMD_FL0CIDXFTHRESHO_V(x) ((x) << FW_IQ_CMD_FL0CIDXFTHRESHO_S) +#define FW_IQ_CMD_FL0CIDXFTHRESHO_F FW_IQ_CMD_FL0CIDXFTHRESHO_V(1U) + +#define FW_IQ_CMD_FL0CIDXFTHRESH_S 0 +#define FW_IQ_CMD_FL0CIDXFTHRESH_V(x) ((x) << FW_IQ_CMD_FL0CIDXFTHRESH_S) + +#define FW_IQ_CMD_FL1CNGCHMAP_S 20 +#define FW_IQ_CMD_FL1CNGCHMAP_V(x) ((x) << FW_IQ_CMD_FL1CNGCHMAP_S) + +#define FW_IQ_CMD_FL1CACHELOCK_S 15 +#define FW_IQ_CMD_FL1CACHELOCK_V(x) ((x) << FW_IQ_CMD_FL1CACHELOCK_S) + +#define FW_IQ_CMD_FL1DBP_S 14 +#define FW_IQ_CMD_FL1DBP_V(x) ((x) << FW_IQ_CMD_FL1DBP_S) + +#define FW_IQ_CMD_FL1DATANS_S 13 +#define FW_IQ_CMD_FL1DATANS_V(x) ((x) << FW_IQ_CMD_FL1DATANS_S) + +#define FW_IQ_CMD_FL1DATARO_S 12 +#define FW_IQ_CMD_FL1DATARO_V(x) ((x) << FW_IQ_CMD_FL1DATARO_S) + +#define FW_IQ_CMD_FL1CONGCIF_S 11 +#define FW_IQ_CMD_FL1CONGCIF_V(x) ((x) << FW_IQ_CMD_FL1CONGCIF_S) + +#define FW_IQ_CMD_FL1ONCHIP_S 10 +#define FW_IQ_CMD_FL1ONCHIP_V(x) ((x) << FW_IQ_CMD_FL1ONCHIP_S) + +#define FW_IQ_CMD_FL1STATUSPGNS_S 9 +#define FW_IQ_CMD_FL1STATUSPGNS_V(x) ((x) << FW_IQ_CMD_FL1STATUSPGNS_S) + +#define FW_IQ_CMD_FL1STATUSPGRO_S 8 +#define FW_IQ_CMD_FL1STATUSPGRO_V(x) ((x) << FW_IQ_CMD_FL1STATUSPGRO_S) + +#define FW_IQ_CMD_FL1FETCHNS_S 7 +#define FW_IQ_CMD_FL1FETCHNS_V(x) ((x) << FW_IQ_CMD_FL1FETCHNS_S) + +#define FW_IQ_CMD_FL1FETCHRO_S 6 +#define FW_IQ_CMD_FL1FETCHRO_V(x) ((x) << FW_IQ_CMD_FL1FETCHRO_S) + +#define FW_IQ_CMD_FL1HOSTFCMODE_S 4 +#define FW_IQ_CMD_FL1HOSTFCMODE_V(x) ((x) << FW_IQ_CMD_FL1HOSTFCMODE_S) + +#define FW_IQ_CMD_FL1CPRIO_S 3 +#define FW_IQ_CMD_FL1CPRIO_V(x) ((x) << FW_IQ_CMD_FL1CPRIO_S) + +#define FW_IQ_CMD_FL1PADEN_S 2 +#define FW_IQ_CMD_FL1PADEN_V(x) ((x) << FW_IQ_CMD_FL1PADEN_S) +#define FW_IQ_CMD_FL1PADEN_F FW_IQ_CMD_FL1PADEN_V(1U) + +#define FW_IQ_CMD_FL1PACKEN_S 1 +#define FW_IQ_CMD_FL1PACKEN_V(x) ((x) << FW_IQ_CMD_FL1PACKEN_S) +#define FW_IQ_CMD_FL1PACKEN_F FW_IQ_CMD_FL1PACKEN_V(1U) + +#define FW_IQ_CMD_FL1CONGEN_S 0 +#define FW_IQ_CMD_FL1CONGEN_V(x) ((x) << FW_IQ_CMD_FL1CONGEN_S) +#define FW_IQ_CMD_FL1CONGEN_F FW_IQ_CMD_FL1CONGEN_V(1U) + +#define FW_IQ_CMD_FL1DCAEN_S 15 +#define FW_IQ_CMD_FL1DCAEN_V(x) ((x) << FW_IQ_CMD_FL1DCAEN_S) + +#define FW_IQ_CMD_FL1DCACPU_S 10 +#define FW_IQ_CMD_FL1DCACPU_V(x) ((x) << FW_IQ_CMD_FL1DCACPU_S) + +#define 
FW_IQ_CMD_FL1FBMIN_S 7 +#define FW_IQ_CMD_FL1FBMIN_V(x) ((x) << FW_IQ_CMD_FL1FBMIN_S) + +#define FW_IQ_CMD_FL1FBMAX_S 4 +#define FW_IQ_CMD_FL1FBMAX_V(x) ((x) << FW_IQ_CMD_FL1FBMAX_S) + +#define FW_IQ_CMD_FL1CIDXFTHRESHO_S 3 +#define FW_IQ_CMD_FL1CIDXFTHRESHO_V(x) ((x) << FW_IQ_CMD_FL1CIDXFTHRESHO_S) +#define FW_IQ_CMD_FL1CIDXFTHRESHO_F FW_IQ_CMD_FL1CIDXFTHRESHO_V(1U) + +#define FW_IQ_CMD_FL1CIDXFTHRESH_S 0 +#define FW_IQ_CMD_FL1CIDXFTHRESH_V(x) ((x) << FW_IQ_CMD_FL1CIDXFTHRESH_S) + +struct fw_eq_eth_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be32 eqid_pkd; + __be32 physeqid_pkd; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; + __be32 viid_pkd; + __be32 r8_lo; + __be64 r9; +}; + +#define FW_EQ_ETH_CMD_PFN_S 8 +#define FW_EQ_ETH_CMD_PFN_V(x) ((x) << FW_EQ_ETH_CMD_PFN_S) + +#define FW_EQ_ETH_CMD_VFN_S 0 +#define FW_EQ_ETH_CMD_VFN_V(x) ((x) << FW_EQ_ETH_CMD_VFN_S) + +#define FW_EQ_ETH_CMD_ALLOC_S 31 +#define FW_EQ_ETH_CMD_ALLOC_V(x) ((x) << FW_EQ_ETH_CMD_ALLOC_S) +#define FW_EQ_ETH_CMD_ALLOC_F FW_EQ_ETH_CMD_ALLOC_V(1U) + +#define FW_EQ_ETH_CMD_FREE_S 30 +#define FW_EQ_ETH_CMD_FREE_V(x) ((x) << FW_EQ_ETH_CMD_FREE_S) +#define FW_EQ_ETH_CMD_FREE_F FW_EQ_ETH_CMD_FREE_V(1U) + +#define FW_EQ_ETH_CMD_MODIFY_S 29 +#define FW_EQ_ETH_CMD_MODIFY_V(x) ((x) << FW_EQ_ETH_CMD_MODIFY_S) +#define FW_EQ_ETH_CMD_MODIFY_F FW_EQ_ETH_CMD_MODIFY_V(1U) + +#define FW_EQ_ETH_CMD_EQSTART_S 28 +#define FW_EQ_ETH_CMD_EQSTART_V(x) ((x) << FW_EQ_ETH_CMD_EQSTART_S) +#define FW_EQ_ETH_CMD_EQSTART_F FW_EQ_ETH_CMD_EQSTART_V(1U) + +#define FW_EQ_ETH_CMD_EQSTOP_S 27 +#define FW_EQ_ETH_CMD_EQSTOP_V(x) ((x) << FW_EQ_ETH_CMD_EQSTOP_S) +#define FW_EQ_ETH_CMD_EQSTOP_F FW_EQ_ETH_CMD_EQSTOP_V(1U) + +#define FW_EQ_ETH_CMD_EQID_S 0 +#define FW_EQ_ETH_CMD_EQID_M 0xfffff +#define FW_EQ_ETH_CMD_EQID_V(x) ((x) << FW_EQ_ETH_CMD_EQID_S) +#define FW_EQ_ETH_CMD_EQID_G(x) \ + (((x) >> FW_EQ_ETH_CMD_EQID_S) & FW_EQ_ETH_CMD_EQID_M) + +#define FW_EQ_ETH_CMD_PHYSEQID_S 0 +#define FW_EQ_ETH_CMD_PHYSEQID_M 0xfffff +#define FW_EQ_ETH_CMD_PHYSEQID_V(x) ((x) << FW_EQ_ETH_CMD_PHYSEQID_S) +#define FW_EQ_ETH_CMD_PHYSEQID_G(x) \ + (((x) >> FW_EQ_ETH_CMD_PHYSEQID_S) & FW_EQ_ETH_CMD_PHYSEQID_M) + +#define FW_EQ_ETH_CMD_FETCHSZM_S 26 +#define FW_EQ_ETH_CMD_FETCHSZM_V(x) ((x) << FW_EQ_ETH_CMD_FETCHSZM_S) +#define FW_EQ_ETH_CMD_FETCHSZM_F FW_EQ_ETH_CMD_FETCHSZM_V(1U) + +#define FW_EQ_ETH_CMD_STATUSPGNS_S 25 +#define FW_EQ_ETH_CMD_STATUSPGNS_V(x) ((x) << FW_EQ_ETH_CMD_STATUSPGNS_S) + +#define FW_EQ_ETH_CMD_STATUSPGRO_S 24 +#define FW_EQ_ETH_CMD_STATUSPGRO_V(x) ((x) << FW_EQ_ETH_CMD_STATUSPGRO_S) + +#define FW_EQ_ETH_CMD_FETCHNS_S 23 +#define FW_EQ_ETH_CMD_FETCHNS_V(x) ((x) << FW_EQ_ETH_CMD_FETCHNS_S) + +#define FW_EQ_ETH_CMD_FETCHRO_S 22 +#define FW_EQ_ETH_CMD_FETCHRO_V(x) ((x) << FW_EQ_ETH_CMD_FETCHRO_S) +#define FW_EQ_ETH_CMD_FETCHRO_F FW_EQ_ETH_CMD_FETCHRO_V(1U) + +#define FW_EQ_ETH_CMD_HOSTFCMODE_S 20 +#define FW_EQ_ETH_CMD_HOSTFCMODE_V(x) ((x) << FW_EQ_ETH_CMD_HOSTFCMODE_S) + +#define FW_EQ_ETH_CMD_CPRIO_S 19 +#define FW_EQ_ETH_CMD_CPRIO_V(x) ((x) << FW_EQ_ETH_CMD_CPRIO_S) + +#define FW_EQ_ETH_CMD_ONCHIP_S 18 +#define FW_EQ_ETH_CMD_ONCHIP_V(x) ((x) << FW_EQ_ETH_CMD_ONCHIP_S) + +#define FW_EQ_ETH_CMD_PCIECHN_S 16 +#define FW_EQ_ETH_CMD_PCIECHN_V(x) ((x) << FW_EQ_ETH_CMD_PCIECHN_S) + +#define FW_EQ_ETH_CMD_IQID_S 0 +#define FW_EQ_ETH_CMD_IQID_V(x) ((x) << FW_EQ_ETH_CMD_IQID_S) + +#define FW_EQ_ETH_CMD_DCAEN_S 31 +#define FW_EQ_ETH_CMD_DCAEN_V(x) ((x) << FW_EQ_ETH_CMD_DCAEN_S) + +#define FW_EQ_ETH_CMD_DCACPU_S 26 +#define 
FW_EQ_ETH_CMD_DCACPU_V(x) ((x) << FW_EQ_ETH_CMD_DCACPU_S) + +#define FW_EQ_ETH_CMD_FBMIN_S 23 +#define FW_EQ_ETH_CMD_FBMIN_V(x) ((x) << FW_EQ_ETH_CMD_FBMIN_S) + +#define FW_EQ_ETH_CMD_FBMAX_S 20 +#define FW_EQ_ETH_CMD_FBMAX_V(x) ((x) << FW_EQ_ETH_CMD_FBMAX_S) + +#define FW_EQ_ETH_CMD_CIDXFTHRESHO_S 19 +#define FW_EQ_ETH_CMD_CIDXFTHRESHO_V(x) ((x) << FW_EQ_ETH_CMD_CIDXFTHRESHO_S) + +#define FW_EQ_ETH_CMD_CIDXFTHRESH_S 16 +#define FW_EQ_ETH_CMD_CIDXFTHRESH_V(x) ((x) << FW_EQ_ETH_CMD_CIDXFTHRESH_S) + +#define FW_EQ_ETH_CMD_EQSIZE_S 0 +#define FW_EQ_ETH_CMD_EQSIZE_V(x) ((x) << FW_EQ_ETH_CMD_EQSIZE_S) + +#define FW_EQ_ETH_CMD_AUTOEQUEQE_S 30 +#define FW_EQ_ETH_CMD_AUTOEQUEQE_V(x) ((x) << FW_EQ_ETH_CMD_AUTOEQUEQE_S) +#define FW_EQ_ETH_CMD_AUTOEQUEQE_F FW_EQ_ETH_CMD_AUTOEQUEQE_V(1U) + +#define FW_EQ_ETH_CMD_VIID_S 16 +#define FW_EQ_ETH_CMD_VIID_V(x) ((x) << FW_EQ_ETH_CMD_VIID_S) + +struct fw_eq_ctrl_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be32 cmpliqid_eqid; + __be32 physeqid_pkd; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; +}; + +#define FW_EQ_CTRL_CMD_PFN_S 8 +#define FW_EQ_CTRL_CMD_PFN_V(x) ((x) << FW_EQ_CTRL_CMD_PFN_S) + +#define FW_EQ_CTRL_CMD_VFN_S 0 +#define FW_EQ_CTRL_CMD_VFN_V(x) ((x) << FW_EQ_CTRL_CMD_VFN_S) + +#define FW_EQ_CTRL_CMD_ALLOC_S 31 +#define FW_EQ_CTRL_CMD_ALLOC_V(x) ((x) << FW_EQ_CTRL_CMD_ALLOC_S) +#define FW_EQ_CTRL_CMD_ALLOC_F FW_EQ_CTRL_CMD_ALLOC_V(1U) + +#define FW_EQ_CTRL_CMD_FREE_S 30 +#define FW_EQ_CTRL_CMD_FREE_V(x) ((x) << FW_EQ_CTRL_CMD_FREE_S) +#define FW_EQ_CTRL_CMD_FREE_F FW_EQ_CTRL_CMD_FREE_V(1U) + +#define FW_EQ_CTRL_CMD_MODIFY_S 29 +#define FW_EQ_CTRL_CMD_MODIFY_V(x) ((x) << FW_EQ_CTRL_CMD_MODIFY_S) +#define FW_EQ_CTRL_CMD_MODIFY_F FW_EQ_CTRL_CMD_MODIFY_V(1U) + +#define FW_EQ_CTRL_CMD_EQSTART_S 28 +#define FW_EQ_CTRL_CMD_EQSTART_V(x) ((x) << FW_EQ_CTRL_CMD_EQSTART_S) +#define FW_EQ_CTRL_CMD_EQSTART_F FW_EQ_CTRL_CMD_EQSTART_V(1U) + +#define FW_EQ_CTRL_CMD_EQSTOP_S 27 +#define FW_EQ_CTRL_CMD_EQSTOP_V(x) ((x) << FW_EQ_CTRL_CMD_EQSTOP_S) +#define FW_EQ_CTRL_CMD_EQSTOP_F FW_EQ_CTRL_CMD_EQSTOP_V(1U) + +#define FW_EQ_CTRL_CMD_CMPLIQID_S 20 +#define FW_EQ_CTRL_CMD_CMPLIQID_V(x) ((x) << FW_EQ_CTRL_CMD_CMPLIQID_S) + +#define FW_EQ_CTRL_CMD_EQID_S 0 +#define FW_EQ_CTRL_CMD_EQID_M 0xfffff +#define FW_EQ_CTRL_CMD_EQID_V(x) ((x) << FW_EQ_CTRL_CMD_EQID_S) +#define FW_EQ_CTRL_CMD_EQID_G(x) \ + (((x) >> FW_EQ_CTRL_CMD_EQID_S) & FW_EQ_CTRL_CMD_EQID_M) + +#define FW_EQ_CTRL_CMD_PHYSEQID_S 0 +#define FW_EQ_CTRL_CMD_PHYSEQID_M 0xfffff +#define FW_EQ_CTRL_CMD_PHYSEQID_G(x) \ + (((x) >> FW_EQ_CTRL_CMD_PHYSEQID_S) & FW_EQ_CTRL_CMD_PHYSEQID_M) + +#define FW_EQ_CTRL_CMD_FETCHSZM_S 26 +#define FW_EQ_CTRL_CMD_FETCHSZM_V(x) ((x) << FW_EQ_CTRL_CMD_FETCHSZM_S) +#define FW_EQ_CTRL_CMD_FETCHSZM_F FW_EQ_CTRL_CMD_FETCHSZM_V(1U) + +#define FW_EQ_CTRL_CMD_STATUSPGNS_S 25 +#define FW_EQ_CTRL_CMD_STATUSPGNS_V(x) ((x) << FW_EQ_CTRL_CMD_STATUSPGNS_S) +#define FW_EQ_CTRL_CMD_STATUSPGNS_F FW_EQ_CTRL_CMD_STATUSPGNS_V(1U) + +#define FW_EQ_CTRL_CMD_STATUSPGRO_S 24 +#define FW_EQ_CTRL_CMD_STATUSPGRO_V(x) ((x) << FW_EQ_CTRL_CMD_STATUSPGRO_S) +#define FW_EQ_CTRL_CMD_STATUSPGRO_F FW_EQ_CTRL_CMD_STATUSPGRO_V(1U) + +#define FW_EQ_CTRL_CMD_FETCHNS_S 23 +#define FW_EQ_CTRL_CMD_FETCHNS_V(x) ((x) << FW_EQ_CTRL_CMD_FETCHNS_S) +#define FW_EQ_CTRL_CMD_FETCHNS_F FW_EQ_CTRL_CMD_FETCHNS_V(1U) + +#define FW_EQ_CTRL_CMD_FETCHRO_S 22 +#define FW_EQ_CTRL_CMD_FETCHRO_V(x) ((x) << FW_EQ_CTRL_CMD_FETCHRO_S) +#define FW_EQ_CTRL_CMD_FETCHRO_F FW_EQ_CTRL_CMD_FETCHRO_V(1U) + +#define 
FW_EQ_CTRL_CMD_HOSTFCMODE_S 20 +#define FW_EQ_CTRL_CMD_HOSTFCMODE_V(x) ((x) << FW_EQ_CTRL_CMD_HOSTFCMODE_S) + +#define FW_EQ_CTRL_CMD_CPRIO_S 19 +#define FW_EQ_CTRL_CMD_CPRIO_V(x) ((x) << FW_EQ_CTRL_CMD_CPRIO_S) + +#define FW_EQ_CTRL_CMD_ONCHIP_S 18 +#define FW_EQ_CTRL_CMD_ONCHIP_V(x) ((x) << FW_EQ_CTRL_CMD_ONCHIP_S) + +#define FW_EQ_CTRL_CMD_PCIECHN_S 16 +#define FW_EQ_CTRL_CMD_PCIECHN_V(x) ((x) << FW_EQ_CTRL_CMD_PCIECHN_S) + +#define FW_EQ_CTRL_CMD_IQID_S 0 +#define FW_EQ_CTRL_CMD_IQID_V(x) ((x) << FW_EQ_CTRL_CMD_IQID_S) + +#define FW_EQ_CTRL_CMD_DCAEN_S 31 +#define FW_EQ_CTRL_CMD_DCAEN_V(x) ((x) << FW_EQ_CTRL_CMD_DCAEN_S) + +#define FW_EQ_CTRL_CMD_DCACPU_S 26 +#define FW_EQ_CTRL_CMD_DCACPU_V(x) ((x) << FW_EQ_CTRL_CMD_DCACPU_S) + +#define FW_EQ_CTRL_CMD_FBMIN_S 23 +#define FW_EQ_CTRL_CMD_FBMIN_V(x) ((x) << FW_EQ_CTRL_CMD_FBMIN_S) + +#define FW_EQ_CTRL_CMD_FBMAX_S 20 +#define FW_EQ_CTRL_CMD_FBMAX_V(x) ((x) << FW_EQ_CTRL_CMD_FBMAX_S) + +#define FW_EQ_CTRL_CMD_CIDXFTHRESHO_S 19 +#define FW_EQ_CTRL_CMD_CIDXFTHRESHO_V(x) \ + ((x) << FW_EQ_CTRL_CMD_CIDXFTHRESHO_S) + +#define FW_EQ_CTRL_CMD_CIDXFTHRESH_S 16 +#define FW_EQ_CTRL_CMD_CIDXFTHRESH_V(x) ((x) << FW_EQ_CTRL_CMD_CIDXFTHRESH_S) + +#define FW_EQ_CTRL_CMD_EQSIZE_S 0 +#define FW_EQ_CTRL_CMD_EQSIZE_V(x) ((x) << FW_EQ_CTRL_CMD_EQSIZE_S) + +struct fw_eq_ofld_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be32 eqid_pkd; + __be32 physeqid_pkd; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; +}; + +#define FW_EQ_OFLD_CMD_PFN_S 8 +#define FW_EQ_OFLD_CMD_PFN_V(x) ((x) << FW_EQ_OFLD_CMD_PFN_S) + +#define FW_EQ_OFLD_CMD_VFN_S 0 +#define FW_EQ_OFLD_CMD_VFN_V(x) ((x) << FW_EQ_OFLD_CMD_VFN_S) + +#define FW_EQ_OFLD_CMD_ALLOC_S 31 +#define FW_EQ_OFLD_CMD_ALLOC_V(x) ((x) << FW_EQ_OFLD_CMD_ALLOC_S) +#define FW_EQ_OFLD_CMD_ALLOC_F FW_EQ_OFLD_CMD_ALLOC_V(1U) + +#define FW_EQ_OFLD_CMD_FREE_S 30 +#define FW_EQ_OFLD_CMD_FREE_V(x) ((x) << FW_EQ_OFLD_CMD_FREE_S) +#define FW_EQ_OFLD_CMD_FREE_F FW_EQ_OFLD_CMD_FREE_V(1U) + +#define FW_EQ_OFLD_CMD_MODIFY_S 29 +#define FW_EQ_OFLD_CMD_MODIFY_V(x) ((x) << FW_EQ_OFLD_CMD_MODIFY_S) +#define FW_EQ_OFLD_CMD_MODIFY_F FW_EQ_OFLD_CMD_MODIFY_V(1U) + +#define FW_EQ_OFLD_CMD_EQSTART_S 28 +#define FW_EQ_OFLD_CMD_EQSTART_V(x) ((x) << FW_EQ_OFLD_CMD_EQSTART_S) +#define FW_EQ_OFLD_CMD_EQSTART_F FW_EQ_OFLD_CMD_EQSTART_V(1U) + +#define FW_EQ_OFLD_CMD_EQSTOP_S 27 +#define FW_EQ_OFLD_CMD_EQSTOP_V(x) ((x) << FW_EQ_OFLD_CMD_EQSTOP_S) +#define FW_EQ_OFLD_CMD_EQSTOP_F FW_EQ_OFLD_CMD_EQSTOP_V(1U) + +#define FW_EQ_OFLD_CMD_EQID_S 0 +#define FW_EQ_OFLD_CMD_EQID_M 0xfffff +#define FW_EQ_OFLD_CMD_EQID_V(x) ((x) << FW_EQ_OFLD_CMD_EQID_S) +#define FW_EQ_OFLD_CMD_EQID_G(x) \ + (((x) >> FW_EQ_OFLD_CMD_EQID_S) & FW_EQ_OFLD_CMD_EQID_M) + +#define FW_EQ_OFLD_CMD_PHYSEQID_S 0 +#define FW_EQ_OFLD_CMD_PHYSEQID_M 0xfffff +#define FW_EQ_OFLD_CMD_PHYSEQID_G(x) \ + (((x) >> FW_EQ_OFLD_CMD_PHYSEQID_S) & FW_EQ_OFLD_CMD_PHYSEQID_M) + +#define FW_EQ_OFLD_CMD_FETCHSZM_S 26 +#define FW_EQ_OFLD_CMD_FETCHSZM_V(x) ((x) << FW_EQ_OFLD_CMD_FETCHSZM_S) + +#define FW_EQ_OFLD_CMD_STATUSPGNS_S 25 +#define FW_EQ_OFLD_CMD_STATUSPGNS_V(x) ((x) << FW_EQ_OFLD_CMD_STATUSPGNS_S) + +#define FW_EQ_OFLD_CMD_STATUSPGRO_S 24 +#define FW_EQ_OFLD_CMD_STATUSPGRO_V(x) ((x) << FW_EQ_OFLD_CMD_STATUSPGRO_S) + +#define FW_EQ_OFLD_CMD_FETCHNS_S 23 +#define FW_EQ_OFLD_CMD_FETCHNS_V(x) ((x) << FW_EQ_OFLD_CMD_FETCHNS_S) + +#define FW_EQ_OFLD_CMD_FETCHRO_S 22 +#define FW_EQ_OFLD_CMD_FETCHRO_V(x) ((x) << FW_EQ_OFLD_CMD_FETCHRO_S) +#define FW_EQ_OFLD_CMD_FETCHRO_F 
FW_EQ_OFLD_CMD_FETCHRO_V(1U) + +#define FW_EQ_OFLD_CMD_HOSTFCMODE_S 20 +#define FW_EQ_OFLD_CMD_HOSTFCMODE_V(x) ((x) << FW_EQ_OFLD_CMD_HOSTFCMODE_S) + +#define FW_EQ_OFLD_CMD_CPRIO_S 19 +#define FW_EQ_OFLD_CMD_CPRIO_V(x) ((x) << FW_EQ_OFLD_CMD_CPRIO_S) + +#define FW_EQ_OFLD_CMD_ONCHIP_S 18 +#define FW_EQ_OFLD_CMD_ONCHIP_V(x) ((x) << FW_EQ_OFLD_CMD_ONCHIP_S) + +#define FW_EQ_OFLD_CMD_PCIECHN_S 16 +#define FW_EQ_OFLD_CMD_PCIECHN_V(x) ((x) << FW_EQ_OFLD_CMD_PCIECHN_S) + +#define FW_EQ_OFLD_CMD_IQID_S 0 +#define FW_EQ_OFLD_CMD_IQID_V(x) ((x) << FW_EQ_OFLD_CMD_IQID_S) + +#define FW_EQ_OFLD_CMD_DCAEN_S 31 +#define FW_EQ_OFLD_CMD_DCAEN_V(x) ((x) << FW_EQ_OFLD_CMD_DCAEN_S) + +#define FW_EQ_OFLD_CMD_DCACPU_S 26 +#define FW_EQ_OFLD_CMD_DCACPU_V(x) ((x) << FW_EQ_OFLD_CMD_DCACPU_S) + +#define FW_EQ_OFLD_CMD_FBMIN_S 23 +#define FW_EQ_OFLD_CMD_FBMIN_V(x) ((x) << FW_EQ_OFLD_CMD_FBMIN_S) + +#define FW_EQ_OFLD_CMD_FBMAX_S 20 +#define FW_EQ_OFLD_CMD_FBMAX_V(x) ((x) << FW_EQ_OFLD_CMD_FBMAX_S) + +#define FW_EQ_OFLD_CMD_CIDXFTHRESHO_S 19 +#define FW_EQ_OFLD_CMD_CIDXFTHRESHO_V(x) \ + ((x) << FW_EQ_OFLD_CMD_CIDXFTHRESHO_S) + +#define FW_EQ_OFLD_CMD_CIDXFTHRESH_S 16 +#define FW_EQ_OFLD_CMD_CIDXFTHRESH_V(x) ((x) << FW_EQ_OFLD_CMD_CIDXFTHRESH_S) + +#define FW_EQ_OFLD_CMD_EQSIZE_S 0 +#define FW_EQ_OFLD_CMD_EQSIZE_V(x) ((x) << FW_EQ_OFLD_CMD_EQSIZE_S) + +/* + * Macros for VIID parsing: + * VIID - [10:8] PFN, [7] VI Valid, [6:0] VI number + */ + +#define FW_VIID_PFN_S 8 +#define FW_VIID_PFN_M 0x7 +#define FW_VIID_PFN_G(x) (((x) >> FW_VIID_PFN_S) & FW_VIID_PFN_M) + +#define FW_VIID_VIVLD_S 7 +#define FW_VIID_VIVLD_M 0x1 +#define FW_VIID_VIVLD_G(x) (((x) >> FW_VIID_VIVLD_S) & FW_VIID_VIVLD_M) + +#define FW_VIID_VIN_S 0 +#define FW_VIID_VIN_M 0x7F +#define FW_VIID_VIN_G(x) (((x) >> FW_VIID_VIN_S) & FW_VIID_VIN_M) + +struct fw_vi_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be16 type_viid; + u8 mac[6]; + u8 portid_pkd; + u8 nmac; + u8 nmac0[6]; + __be16 rsssize_pkd; + u8 nmac1[6]; + __be16 idsiiq_pkd; + u8 nmac2[6]; + __be16 idseiq_pkd; + u8 nmac3[6]; + __be64 r9; + __be64 r10; +}; + +#define FW_VI_CMD_PFN_S 8 +#define FW_VI_CMD_PFN_V(x) ((x) << FW_VI_CMD_PFN_S) + +#define FW_VI_CMD_VFN_S 0 +#define FW_VI_CMD_VFN_V(x) ((x) << FW_VI_CMD_VFN_S) + +#define FW_VI_CMD_ALLOC_S 31 +#define FW_VI_CMD_ALLOC_V(x) ((x) << FW_VI_CMD_ALLOC_S) +#define FW_VI_CMD_ALLOC_F FW_VI_CMD_ALLOC_V(1U) + +#define FW_VI_CMD_FREE_S 30 +#define FW_VI_CMD_FREE_V(x) ((x) << FW_VI_CMD_FREE_S) +#define FW_VI_CMD_FREE_F FW_VI_CMD_FREE_V(1U) + +#define FW_VI_CMD_VIID_S 0 +#define FW_VI_CMD_VIID_M 0xfff +#define FW_VI_CMD_VIID_V(x) ((x) << FW_VI_CMD_VIID_S) +#define FW_VI_CMD_VIID_G(x) (((x) >> FW_VI_CMD_VIID_S) & FW_VI_CMD_VIID_M) + +#define FW_VI_CMD_PORTID_S 4 +#define FW_VI_CMD_PORTID_M 0xf +#define FW_VI_CMD_PORTID_V(x) ((x) << FW_VI_CMD_PORTID_S) +#define FW_VI_CMD_PORTID_G(x) \ + (((x) >> FW_VI_CMD_PORTID_S) & FW_VI_CMD_PORTID_M) + +#define FW_VI_CMD_RSSSIZE_S 0 +#define FW_VI_CMD_RSSSIZE_M 0x7ff +#define FW_VI_CMD_RSSSIZE_G(x) \ + (((x) >> FW_VI_CMD_RSSSIZE_S) & FW_VI_CMD_RSSSIZE_M) + +/* Special VI_MAC command index ids */ +#define FW_VI_MAC_ADD_MAC 0x3FF +#define FW_VI_MAC_ADD_PERSIST_MAC 0x3FE +#define FW_VI_MAC_MAC_BASED_FREE 0x3FD +#define FW_VI_MAC_ID_BASED_FREE 0x3FC +#define FW_CLS_TCAM_NUM_ENTRIES 336 + +enum fw_vi_mac_smac { + FW_VI_MAC_MPS_TCAM_ENTRY, + FW_VI_MAC_MPS_TCAM_ONLY, + FW_VI_MAC_SMT_ONLY, + FW_VI_MAC_SMT_AND_MPSTCAM +}; + +enum fw_vi_mac_result { + FW_VI_MAC_R_SUCCESS, + FW_VI_MAC_R_F_NONEXISTENT_NOMEM, + 
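/*
 * Editorial aside, not part of the imported header: the FW_VIID_* getters
 * above simply decode the packed Virtual Interface identifier described in
 * the comment ([10:8] PFN, [7] VI valid, [6:0] VI number).  A minimal
 * sketch follows; "viid" would typically come from the type_viid field of a
 * fw_vi_cmd reply, and the helper name is illustrative only.
 */
static inline void fw_viid_decode(unsigned int viid, unsigned int *pfn,
				  unsigned int *vivld, unsigned int *vin)
{
	*pfn = FW_VIID_PFN_G(viid);	/* owning physical function */
	*vivld = FW_VIID_VIVLD_G(viid);	/* VI allocated/valid bit */
	*vin = FW_VIID_VIN_G(viid);	/* VI number within that PF */
}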
FW_VI_MAC_R_SMAC_FAIL, + FW_VI_MAC_R_F_ACL_CHECK +}; + +enum fw_vi_mac_entry_types { + FW_VI_MAC_TYPE_EXACTMAC, + FW_VI_MAC_TYPE_HASHVEC, + FW_VI_MAC_TYPE_RAW, + FW_VI_MAC_TYPE_EXACTMAC_VNI, +}; + +struct fw_vi_mac_cmd { + __be32 op_to_viid; + __be32 freemacs_to_len16; + union fw_vi_mac { + struct fw_vi_mac_exact { + __be16 valid_to_idx; + u8 macaddr[6]; + } exact[7]; + struct fw_vi_mac_hash { + __be64 hashvec; + } hash; + struct fw_vi_mac_raw { + __be32 raw_idx_pkd; + __be32 data0_pkd; + __be32 data1[2]; + __be64 data0m_pkd; + __be32 data1m[2]; + } raw; + } u; +}; + +#define FW_VI_MAC_CMD_VIID_S 0 +#define FW_VI_MAC_CMD_VIID_V(x) ((x) << FW_VI_MAC_CMD_VIID_S) + +#define FW_VI_MAC_CMD_FREEMACS_S 31 +#define FW_VI_MAC_CMD_FREEMACS_V(x) ((x) << FW_VI_MAC_CMD_FREEMACS_S) + +#define FW_VI_MAC_CMD_ENTRY_TYPE_S 23 +#define FW_VI_MAC_CMD_ENTRY_TYPE_M 0x7 +#define FW_VI_MAC_CMD_ENTRY_TYPE_V(x) ((x) << FW_VI_MAC_CMD_ENTRY_TYPE_S) +#define FW_VI_MAC_CMD_ENTRY_TYPE_G(x) \ + (((x) >> FW_VI_MAC_CMD_ENTRY_TYPE_S) & FW_VI_MAC_CMD_ENTRY_TYPE_M) + +#define FW_VI_MAC_CMD_HASHVECEN_S 23 +#define FW_VI_MAC_CMD_HASHVECEN_V(x) ((x) << FW_VI_MAC_CMD_HASHVECEN_S) +#define FW_VI_MAC_CMD_HASHVECEN_F FW_VI_MAC_CMD_HASHVECEN_V(1U) + +#define FW_VI_MAC_CMD_HASHUNIEN_S 22 +#define FW_VI_MAC_CMD_HASHUNIEN_V(x) ((x) << FW_VI_MAC_CMD_HASHUNIEN_S) + +#define FW_VI_MAC_CMD_VALID_S 15 +#define FW_VI_MAC_CMD_VALID_V(x) ((x) << FW_VI_MAC_CMD_VALID_S) +#define FW_VI_MAC_CMD_VALID_F FW_VI_MAC_CMD_VALID_V(1U) + +#define FW_VI_MAC_CMD_PRIO_S 12 +#define FW_VI_MAC_CMD_PRIO_V(x) ((x) << FW_VI_MAC_CMD_PRIO_S) + +#define FW_VI_MAC_CMD_SMAC_RESULT_S 10 +#define FW_VI_MAC_CMD_SMAC_RESULT_M 0x3 +#define FW_VI_MAC_CMD_SMAC_RESULT_V(x) ((x) << FW_VI_MAC_CMD_SMAC_RESULT_S) +#define FW_VI_MAC_CMD_SMAC_RESULT_G(x) \ + (((x) >> FW_VI_MAC_CMD_SMAC_RESULT_S) & FW_VI_MAC_CMD_SMAC_RESULT_M) + +#define FW_VI_MAC_CMD_IDX_S 0 +#define FW_VI_MAC_CMD_IDX_M 0x3ff +#define FW_VI_MAC_CMD_IDX_V(x) ((x) << FW_VI_MAC_CMD_IDX_S) +#define FW_VI_MAC_CMD_IDX_G(x) \ + (((x) >> FW_VI_MAC_CMD_IDX_S) & FW_VI_MAC_CMD_IDX_M) + +#define FW_VI_MAC_CMD_RAW_IDX_S 16 +#define FW_VI_MAC_CMD_RAW_IDX_M 0xffff +#define FW_VI_MAC_CMD_RAW_IDX_V(x) ((x) << FW_VI_MAC_CMD_RAW_IDX_S) +#define FW_VI_MAC_CMD_RAW_IDX_G(x) \ + (((x) >> FW_VI_MAC_CMD_RAW_IDX_S) & FW_VI_MAC_CMD_RAW_IDX_M) + +#define FW_RXMODE_MTU_NO_CHG 65535 + +struct fw_vi_rxmode_cmd { + __be32 op_to_viid; + __be32 retval_len16; + __be32 mtu_to_vlanexen; + __be32 r4_lo; +}; + +#define FW_VI_RXMODE_CMD_VIID_S 0 +#define FW_VI_RXMODE_CMD_VIID_V(x) ((x) << FW_VI_RXMODE_CMD_VIID_S) + +#define FW_VI_RXMODE_CMD_MTU_S 16 +#define FW_VI_RXMODE_CMD_MTU_M 0xffff +#define FW_VI_RXMODE_CMD_MTU_V(x) ((x) << FW_VI_RXMODE_CMD_MTU_S) + +#define FW_VI_RXMODE_CMD_PROMISCEN_S 14 +#define FW_VI_RXMODE_CMD_PROMISCEN_M 0x3 +#define FW_VI_RXMODE_CMD_PROMISCEN_V(x) ((x) << FW_VI_RXMODE_CMD_PROMISCEN_S) + +#define FW_VI_RXMODE_CMD_ALLMULTIEN_S 12 +#define FW_VI_RXMODE_CMD_ALLMULTIEN_M 0x3 +#define FW_VI_RXMODE_CMD_ALLMULTIEN_V(x) \ + ((x) << FW_VI_RXMODE_CMD_ALLMULTIEN_S) + +#define FW_VI_RXMODE_CMD_BROADCASTEN_S 10 +#define FW_VI_RXMODE_CMD_BROADCASTEN_M 0x3 +#define FW_VI_RXMODE_CMD_BROADCASTEN_V(x) \ + ((x) << FW_VI_RXMODE_CMD_BROADCASTEN_S) + +#define FW_VI_RXMODE_CMD_VLANEXEN_S 8 +#define FW_VI_RXMODE_CMD_VLANEXEN_M 0x3 +#define FW_VI_RXMODE_CMD_VLANEXEN_V(x) ((x) << FW_VI_RXMODE_CMD_VLANEXEN_S) + +struct fw_vi_enable_cmd { + __be32 op_to_viid; + __be32 ien_to_len16; + __be16 blinkdur; + __be16 r3; + __be32 r4; +}; + +#define 
FW_VI_ENABLE_CMD_VIID_S 0 +#define FW_VI_ENABLE_CMD_VIID_V(x) ((x) << FW_VI_ENABLE_CMD_VIID_S) + +#define FW_VI_ENABLE_CMD_IEN_S 31 +#define FW_VI_ENABLE_CMD_IEN_V(x) ((x) << FW_VI_ENABLE_CMD_IEN_S) + +#define FW_VI_ENABLE_CMD_EEN_S 30 +#define FW_VI_ENABLE_CMD_EEN_V(x) ((x) << FW_VI_ENABLE_CMD_EEN_S) + +#define FW_VI_ENABLE_CMD_LED_S 29 +#define FW_VI_ENABLE_CMD_LED_V(x) ((x) << FW_VI_ENABLE_CMD_LED_S) +#define FW_VI_ENABLE_CMD_LED_F FW_VI_ENABLE_CMD_LED_V(1U) + +#define FW_VI_ENABLE_CMD_DCB_INFO_S 28 +#define FW_VI_ENABLE_CMD_DCB_INFO_V(x) ((x) << FW_VI_ENABLE_CMD_DCB_INFO_S) + +/* VI VF stats offset definitions */ +#define VI_VF_NUM_STATS 16 +enum fw_vi_stats_vf_index { + FW_VI_VF_STAT_TX_BCAST_BYTES_IX, + FW_VI_VF_STAT_TX_BCAST_FRAMES_IX, + FW_VI_VF_STAT_TX_MCAST_BYTES_IX, + FW_VI_VF_STAT_TX_MCAST_FRAMES_IX, + FW_VI_VF_STAT_TX_UCAST_BYTES_IX, + FW_VI_VF_STAT_TX_UCAST_FRAMES_IX, + FW_VI_VF_STAT_TX_DROP_FRAMES_IX, + FW_VI_VF_STAT_TX_OFLD_BYTES_IX, + FW_VI_VF_STAT_TX_OFLD_FRAMES_IX, + FW_VI_VF_STAT_RX_BCAST_BYTES_IX, + FW_VI_VF_STAT_RX_BCAST_FRAMES_IX, + FW_VI_VF_STAT_RX_MCAST_BYTES_IX, + FW_VI_VF_STAT_RX_MCAST_FRAMES_IX, + FW_VI_VF_STAT_RX_UCAST_BYTES_IX, + FW_VI_VF_STAT_RX_UCAST_FRAMES_IX, + FW_VI_VF_STAT_RX_ERR_FRAMES_IX +}; + +/* VI PF stats offset definitions */ +#define VI_PF_NUM_STATS 17 +enum fw_vi_stats_pf_index { + FW_VI_PF_STAT_TX_BCAST_BYTES_IX, + FW_VI_PF_STAT_TX_BCAST_FRAMES_IX, + FW_VI_PF_STAT_TX_MCAST_BYTES_IX, + FW_VI_PF_STAT_TX_MCAST_FRAMES_IX, + FW_VI_PF_STAT_TX_UCAST_BYTES_IX, + FW_VI_PF_STAT_TX_UCAST_FRAMES_IX, + FW_VI_PF_STAT_TX_OFLD_BYTES_IX, + FW_VI_PF_STAT_TX_OFLD_FRAMES_IX, + FW_VI_PF_STAT_RX_BYTES_IX, + FW_VI_PF_STAT_RX_FRAMES_IX, + FW_VI_PF_STAT_RX_BCAST_BYTES_IX, + FW_VI_PF_STAT_RX_BCAST_FRAMES_IX, + FW_VI_PF_STAT_RX_MCAST_BYTES_IX, + FW_VI_PF_STAT_RX_MCAST_FRAMES_IX, + FW_VI_PF_STAT_RX_UCAST_BYTES_IX, + FW_VI_PF_STAT_RX_UCAST_FRAMES_IX, + FW_VI_PF_STAT_RX_ERR_FRAMES_IX +}; + +struct fw_vi_stats_cmd { + __be32 op_to_viid; + __be32 retval_len16; + union fw_vi_stats { + struct fw_vi_stats_ctl { + __be16 nstats_ix; + __be16 r6; + __be32 r7; + __be64 stat0; + __be64 stat1; + __be64 stat2; + __be64 stat3; + __be64 stat4; + __be64 stat5; + } ctl; + struct fw_vi_stats_pf { + __be64 tx_bcast_bytes; + __be64 tx_bcast_frames; + __be64 tx_mcast_bytes; + __be64 tx_mcast_frames; + __be64 tx_ucast_bytes; + __be64 tx_ucast_frames; + __be64 tx_offload_bytes; + __be64 tx_offload_frames; + __be64 rx_pf_bytes; + __be64 rx_pf_frames; + __be64 rx_bcast_bytes; + __be64 rx_bcast_frames; + __be64 rx_mcast_bytes; + __be64 rx_mcast_frames; + __be64 rx_ucast_bytes; + __be64 rx_ucast_frames; + __be64 rx_err_frames; + } pf; + struct fw_vi_stats_vf { + __be64 tx_bcast_bytes; + __be64 tx_bcast_frames; + __be64 tx_mcast_bytes; + __be64 tx_mcast_frames; + __be64 tx_ucast_bytes; + __be64 tx_ucast_frames; + __be64 tx_drop_frames; + __be64 tx_offload_bytes; + __be64 tx_offload_frames; + __be64 rx_bcast_bytes; + __be64 rx_bcast_frames; + __be64 rx_mcast_bytes; + __be64 rx_mcast_frames; + __be64 rx_ucast_bytes; + __be64 rx_ucast_frames; + __be64 rx_err_frames; + } vf; + } u; +}; + +#define FW_VI_STATS_CMD_VIID_S 0 +#define FW_VI_STATS_CMD_VIID_V(x) ((x) << FW_VI_STATS_CMD_VIID_S) + +#define FW_VI_STATS_CMD_NSTATS_S 12 +#define FW_VI_STATS_CMD_NSTATS_V(x) ((x) << FW_VI_STATS_CMD_NSTATS_S) + +#define FW_VI_STATS_CMD_IX_S 0 +#define FW_VI_STATS_CMD_IX_V(x) ((x) << FW_VI_STATS_CMD_IX_S) + +struct fw_acl_mac_cmd { + __be32 op_to_vfn; + __be32 en_to_len16; + u8 nmac; + u8 r3[7]; + __be16 r4; + u8 
macaddr0[6]; + __be16 r5; + u8 macaddr1[6]; + __be16 r6; + u8 macaddr2[6]; + __be16 r7; + u8 macaddr3[6]; +}; + +#define FW_ACL_MAC_CMD_PFN_S 8 +#define FW_ACL_MAC_CMD_PFN_V(x) ((x) << FW_ACL_MAC_CMD_PFN_S) + +#define FW_ACL_MAC_CMD_VFN_S 0 +#define FW_ACL_MAC_CMD_VFN_V(x) ((x) << FW_ACL_MAC_CMD_VFN_S) + +#define FW_ACL_MAC_CMD_EN_S 31 +#define FW_ACL_MAC_CMD_EN_V(x) ((x) << FW_ACL_MAC_CMD_EN_S) + +struct fw_acl_vlan_cmd { + __be32 op_to_vfn; + __be32 en_to_len16; + u8 nvlan; + u8 dropnovlan_fm; + u8 r3_lo[6]; + __be16 vlanid[16]; +}; + +#define FW_ACL_VLAN_CMD_PFN_S 8 +#define FW_ACL_VLAN_CMD_PFN_V(x) ((x) << FW_ACL_VLAN_CMD_PFN_S) + +#define FW_ACL_VLAN_CMD_VFN_S 0 +#define FW_ACL_VLAN_CMD_VFN_V(x) ((x) << FW_ACL_VLAN_CMD_VFN_S) + +#define FW_ACL_VLAN_CMD_EN_S 31 +#define FW_ACL_VLAN_CMD_EN_M 0x1 +#define FW_ACL_VLAN_CMD_EN_V(x) ((x) << FW_ACL_VLAN_CMD_EN_S) +#define FW_ACL_VLAN_CMD_EN_G(x) \ + (((x) >> S_FW_ACL_VLAN_CMD_EN_S) & FW_ACL_VLAN_CMD_EN_M) +#define FW_ACL_VLAN_CMD_EN_F FW_ACL_VLAN_CMD_EN_V(1U) + +#define FW_ACL_VLAN_CMD_DROPNOVLAN_S 7 +#define FW_ACL_VLAN_CMD_DROPNOVLAN_V(x) ((x) << FW_ACL_VLAN_CMD_DROPNOVLAN_S) + +#define FW_ACL_VLAN_CMD_FM_S 6 +#define FW_ACL_VLAN_CMD_FM_M 0x1 +#define FW_ACL_VLAN_CMD_FM_V(x) ((x) << FW_ACL_VLAN_CMD_FM_S) +#define FW_ACL_VLAN_CMD_FM_G(x) \ + (((x) >> FW_ACL_VLAN_CMD_FM_S) & FW_ACL_VLAN_CMD_FM_M) +#define FW_ACL_VLAN_CMD_FM_F FW_ACL_VLAN_CMD_FM_V(1U) + +/* old 16-bit port capabilities bitmap (fw_port_cap16_t) */ +enum fw_port_cap { + FW_PORT_CAP_SPEED_100M = 0x0001, + FW_PORT_CAP_SPEED_1G = 0x0002, + FW_PORT_CAP_SPEED_25G = 0x0004, + FW_PORT_CAP_SPEED_10G = 0x0008, + FW_PORT_CAP_SPEED_40G = 0x0010, + FW_PORT_CAP_SPEED_100G = 0x0020, + FW_PORT_CAP_FC_RX = 0x0040, + FW_PORT_CAP_FC_TX = 0x0080, + FW_PORT_CAP_ANEG = 0x0100, + FW_PORT_CAP_MDIX = 0x0200, + FW_PORT_CAP_MDIAUTO = 0x0400, + FW_PORT_CAP_FEC_RS = 0x0800, + FW_PORT_CAP_FEC_BASER_RS = 0x1000, + FW_PORT_CAP_FEC_RESERVED = 0x2000, + FW_PORT_CAP_802_3_PAUSE = 0x4000, + FW_PORT_CAP_802_3_ASM_DIR = 0x8000, +}; + +#define FW_PORT_CAP_SPEED_S 0 +#define FW_PORT_CAP_SPEED_M 0x3f +#define FW_PORT_CAP_SPEED_V(x) ((x) << FW_PORT_CAP_SPEED_S) +#define FW_PORT_CAP_SPEED_G(x) \ + (((x) >> FW_PORT_CAP_SPEED_S) & FW_PORT_CAP_SPEED_M) + +enum fw_port_mdi { + FW_PORT_CAP_MDI_UNCHANGED, + FW_PORT_CAP_MDI_AUTO, + FW_PORT_CAP_MDI_F_STRAIGHT, + FW_PORT_CAP_MDI_F_CROSSOVER +}; + +#define FW_PORT_CAP_MDI_S 9 +#define FW_PORT_CAP_MDI_V(x) ((x) << FW_PORT_CAP_MDI_S) + +/* new 32-bit port capabilities bitmap (fw_port_cap32_t) */ +#define FW_PORT_CAP32_SPEED_100M 0x00000001UL +#define FW_PORT_CAP32_SPEED_1G 0x00000002UL +#define FW_PORT_CAP32_SPEED_10G 0x00000004UL +#define FW_PORT_CAP32_SPEED_25G 0x00000008UL +#define FW_PORT_CAP32_SPEED_40G 0x00000010UL +#define FW_PORT_CAP32_SPEED_50G 0x00000020UL +#define FW_PORT_CAP32_SPEED_100G 0x00000040UL +#define FW_PORT_CAP32_SPEED_200G 0x00000080UL +#define FW_PORT_CAP32_SPEED_400G 0x00000100UL +#define FW_PORT_CAP32_SPEED_RESERVED1 0x00000200UL +#define FW_PORT_CAP32_SPEED_RESERVED2 0x00000400UL +#define FW_PORT_CAP32_SPEED_RESERVED3 0x00000800UL +#define FW_PORT_CAP32_RESERVED1 0x0000f000UL +#define FW_PORT_CAP32_FC_RX 0x00010000UL +#define FW_PORT_CAP32_FC_TX 0x00020000UL +#define FW_PORT_CAP32_802_3_PAUSE 0x00040000UL +#define FW_PORT_CAP32_802_3_ASM_DIR 0x00080000UL +#define FW_PORT_CAP32_ANEG 0x00100000UL +#define FW_PORT_CAP32_MDIX 0x00200000UL +#define FW_PORT_CAP32_MDIAUTO 0x00400000UL +#define FW_PORT_CAP32_FEC_RS 0x00800000UL +#define FW_PORT_CAP32_FEC_BASER_RS 
0x01000000UL +#define FW_PORT_CAP32_FEC_RESERVED1 0x02000000UL +#define FW_PORT_CAP32_FEC_RESERVED2 0x04000000UL +#define FW_PORT_CAP32_FEC_RESERVED3 0x08000000UL +#define FW_PORT_CAP32_RESERVED2 0xf0000000UL + +#define FW_PORT_CAP32_SPEED_S 0 +#define FW_PORT_CAP32_SPEED_M 0xfff +#define FW_PORT_CAP32_SPEED_V(x) ((x) << FW_PORT_CAP32_SPEED_S) +#define FW_PORT_CAP32_SPEED_G(x) \ + (((x) >> FW_PORT_CAP32_SPEED_S) & FW_PORT_CAP32_SPEED_M) + +#define FW_PORT_CAP32_FC_S 16 +#define FW_PORT_CAP32_FC_M 0x3 +#define FW_PORT_CAP32_FC_V(x) ((x) << FW_PORT_CAP32_FC_S) +#define FW_PORT_CAP32_FC_G(x) \ + (((x) >> FW_PORT_CAP32_FC_S) & FW_PORT_CAP32_FC_M) + +#define FW_PORT_CAP32_802_3_S 18 +#define FW_PORT_CAP32_802_3_M 0x3 +#define FW_PORT_CAP32_802_3_V(x) ((x) << FW_PORT_CAP32_802_3_S) +#define FW_PORT_CAP32_802_3_G(x) \ + (((x) >> FW_PORT_CAP32_802_3_S) & FW_PORT_CAP32_802_3_M) + +#define FW_PORT_CAP32_ANEG_S 20 +#define FW_PORT_CAP32_ANEG_M 0x1 +#define FW_PORT_CAP32_ANEG_V(x) ((x) << FW_PORT_CAP32_ANEG_S) +#define FW_PORT_CAP32_ANEG_G(x) \ + (((x) >> FW_PORT_CAP32_ANEG_S) & FW_PORT_CAP32_ANEG_M) + +enum fw_port_mdi32 { + FW_PORT_CAP32_MDI_UNCHANGED, + FW_PORT_CAP32_MDI_AUTO, + FW_PORT_CAP32_MDI_F_STRAIGHT, + FW_PORT_CAP32_MDI_F_CROSSOVER +}; + +#define FW_PORT_CAP32_MDI_S 21 +#define FW_PORT_CAP32_MDI_M 3 +#define FW_PORT_CAP32_MDI_V(x) ((x) << FW_PORT_CAP32_MDI_S) +#define FW_PORT_CAP32_MDI_G(x) \ + (((x) >> FW_PORT_CAP32_MDI_S) & FW_PORT_CAP32_MDI_M) + +#define FW_PORT_CAP32_FEC_S 23 +#define FW_PORT_CAP32_FEC_M 0x1f +#define FW_PORT_CAP32_FEC_V(x) ((x) << FW_PORT_CAP32_FEC_S) +#define FW_PORT_CAP32_FEC_G(x) \ + (((x) >> FW_PORT_CAP32_FEC_S) & FW_PORT_CAP32_FEC_M) + +/* macros to isolate various 32-bit Port Capabilities sub-fields */ +#define CAP32_SPEED(__cap32) \ + (FW_PORT_CAP32_SPEED_V(FW_PORT_CAP32_SPEED_M) & __cap32) + +#define CAP32_FEC(__cap32) \ + (FW_PORT_CAP32_FEC_V(FW_PORT_CAP32_FEC_M) & __cap32) + +enum fw_port_action { + FW_PORT_ACTION_L1_CFG = 0x0001, + FW_PORT_ACTION_L2_CFG = 0x0002, + FW_PORT_ACTION_GET_PORT_INFO = 0x0003, + FW_PORT_ACTION_L2_PPP_CFG = 0x0004, + FW_PORT_ACTION_L2_DCB_CFG = 0x0005, + FW_PORT_ACTION_DCB_READ_TRANS = 0x0006, + FW_PORT_ACTION_DCB_READ_RECV = 0x0007, + FW_PORT_ACTION_DCB_READ_DET = 0x0008, + FW_PORT_ACTION_L1_CFG32 = 0x0009, + FW_PORT_ACTION_GET_PORT_INFO32 = 0x000a, + FW_PORT_ACTION_LOW_PWR_TO_NORMAL = 0x0010, + FW_PORT_ACTION_L1_LOW_PWR_EN = 0x0011, + FW_PORT_ACTION_L2_WOL_MODE_EN = 0x0012, + FW_PORT_ACTION_LPBK_TO_NORMAL = 0x0020, + FW_PORT_ACTION_L1_LPBK = 0x0021, + FW_PORT_ACTION_L1_PMA_LPBK = 0x0022, + FW_PORT_ACTION_L1_PCS_LPBK = 0x0023, + FW_PORT_ACTION_L1_PHYXS_CSIDE_LPBK = 0x0024, + FW_PORT_ACTION_L1_PHYXS_ESIDE_LPBK = 0x0025, + FW_PORT_ACTION_PHY_RESET = 0x0040, + FW_PORT_ACTION_PMA_RESET = 0x0041, + FW_PORT_ACTION_PCS_RESET = 0x0042, + FW_PORT_ACTION_PHYXS_RESET = 0x0043, + FW_PORT_ACTION_DTEXS_REEST = 0x0044, + FW_PORT_ACTION_AN_RESET = 0x0045 +}; + +enum fw_port_l2cfg_ctlbf { + FW_PORT_L2_CTLBF_OVLAN0 = 0x01, + FW_PORT_L2_CTLBF_OVLAN1 = 0x02, + FW_PORT_L2_CTLBF_OVLAN2 = 0x04, + FW_PORT_L2_CTLBF_OVLAN3 = 0x08, + FW_PORT_L2_CTLBF_IVLAN = 0x10, + FW_PORT_L2_CTLBF_TXIPG = 0x20 +}; + +enum fw_port_dcb_versions { + FW_PORT_DCB_VER_UNKNOWN, + FW_PORT_DCB_VER_CEE1D0, + FW_PORT_DCB_VER_CEE1D01, + FW_PORT_DCB_VER_IEEE, + FW_PORT_DCB_VER_AUTO = 7 +}; + +enum fw_port_dcb_cfg { + FW_PORT_DCB_CFG_PG = 0x01, + FW_PORT_DCB_CFG_PFC = 0x02, + FW_PORT_DCB_CFG_APPL = 0x04 +}; + +enum fw_port_dcb_cfg_rc { + FW_PORT_DCB_CFG_SUCCESS = 0x0, + 
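/*
 * Editorial aside, not part of the imported header: CAP32_SPEED()/CAP32_FEC()
 * above isolate a sub-field of the 32-bit capability word in place, while the
 * FW_PORT_CAP32_*_G() macros return it right-justified.  Below is a minimal
 * sketch of testing an advertised-capabilities word; "acaps32" is
 * illustrative and would typically come from fw_port_info32.acaps32.
 */
static inline int fw_port_caps32_supports_100g(u32 acaps32)
{
	/* keep only the speed bits, then test the 100G capability */
	return (CAP32_SPEED(acaps32) & FW_PORT_CAP32_SPEED_100G) != 0;
}

static inline int fw_port_caps32_autoneg(u32 acaps32)
{
	/* the _G form gives the same answer with the field shifted to bit 0 */
	return FW_PORT_CAP32_ANEG_G(acaps32) != 0;
}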
FW_PORT_DCB_CFG_ERROR = 0x1 +}; + +enum fw_port_dcb_type { + FW_PORT_DCB_TYPE_PGID = 0x00, + FW_PORT_DCB_TYPE_PGRATE = 0x01, + FW_PORT_DCB_TYPE_PRIORATE = 0x02, + FW_PORT_DCB_TYPE_PFC = 0x03, + FW_PORT_DCB_TYPE_APP_ID = 0x04, + FW_PORT_DCB_TYPE_CONTROL = 0x05, +}; + +enum fw_port_dcb_feature_state { + FW_PORT_DCB_FEATURE_STATE_PENDING = 0x0, + FW_PORT_DCB_FEATURE_STATE_SUCCESS = 0x1, + FW_PORT_DCB_FEATURE_STATE_ERROR = 0x2, + FW_PORT_DCB_FEATURE_STATE_TIMEOUT = 0x3, +}; + +struct fw_port_cmd { + __be32 op_to_portid; + __be32 action_to_len16; + union fw_port { + struct fw_port_l1cfg { + __be32 rcap; + __be32 r; + } l1cfg; + struct fw_port_l2cfg { + __u8 ctlbf; + __u8 ovlan3_to_ivlan0; + __be16 ivlantype; + __be16 txipg_force_pinfo; + __be16 mtu; + __be16 ovlan0mask; + __be16 ovlan0type; + __be16 ovlan1mask; + __be16 ovlan1type; + __be16 ovlan2mask; + __be16 ovlan2type; + __be16 ovlan3mask; + __be16 ovlan3type; + } l2cfg; + struct fw_port_info { + __be32 lstatus_to_modtype; + __be16 pcap; + __be16 acap; + __be16 mtu; + __u8 cbllen; + __u8 auxlinfo; + __u8 dcbxdis_pkd; + __u8 r8_lo; + __be16 lpacap; + __be64 r9; + } info; + struct fw_port_diags { + __u8 diagop; + __u8 r[3]; + __be32 diagval; + } diags; + union fw_port_dcb { + struct fw_port_dcb_pgid { + __u8 type; + __u8 apply_pkd; + __u8 r10_lo[2]; + __be32 pgid; + __be64 r11; + } pgid; + struct fw_port_dcb_pgrate { + __u8 type; + __u8 apply_pkd; + __u8 r10_lo[5]; + __u8 num_tcs_supported; + __u8 pgrate[8]; + __u8 tsa[8]; + } pgrate; + struct fw_port_dcb_priorate { + __u8 type; + __u8 apply_pkd; + __u8 r10_lo[6]; + __u8 strict_priorate[8]; + } priorate; + struct fw_port_dcb_pfc { + __u8 type; + __u8 pfcen; + __u8 r10[5]; + __u8 max_pfc_tcs; + __be64 r11; + } pfc; + struct fw_port_app_priority { + __u8 type; + __u8 r10[2]; + __u8 idx; + __u8 user_prio_map; + __u8 sel_field; + __be16 protocolid; + __be64 r12; + } app_priority; + struct fw_port_dcb_control { + __u8 type; + __u8 all_syncd_pkd; + __be16 dcb_version_to_app_state; + __be32 r11; + __be64 r12; + } control; + } dcb; + struct fw_port_l1cfg32 { + __be32 rcap32; + __be32 r; + } l1cfg32; + struct fw_port_info32 { + __be32 lstatus32_to_cbllen32; + __be32 auxlinfo32_mtu32; + __be32 linkattr32; + __be32 pcaps32; + __be32 acaps32; + __be32 lpacaps32; + } info32; + } u; +}; + +#define FW_PORT_CMD_READ_S 22 +#define FW_PORT_CMD_READ_V(x) ((x) << FW_PORT_CMD_READ_S) +#define FW_PORT_CMD_READ_F FW_PORT_CMD_READ_V(1U) + +#define FW_PORT_CMD_PORTID_S 0 +#define FW_PORT_CMD_PORTID_M 0xf +#define FW_PORT_CMD_PORTID_V(x) ((x) << FW_PORT_CMD_PORTID_S) +#define FW_PORT_CMD_PORTID_G(x) \ + (((x) >> FW_PORT_CMD_PORTID_S) & FW_PORT_CMD_PORTID_M) + +#define FW_PORT_CMD_ACTION_S 16 +#define FW_PORT_CMD_ACTION_M 0xffff +#define FW_PORT_CMD_ACTION_V(x) ((x) << FW_PORT_CMD_ACTION_S) +#define FW_PORT_CMD_ACTION_G(x) \ + (((x) >> FW_PORT_CMD_ACTION_S) & FW_PORT_CMD_ACTION_M) + +#define FW_PORT_CMD_OVLAN3_S 7 +#define FW_PORT_CMD_OVLAN3_V(x) ((x) << FW_PORT_CMD_OVLAN3_S) + +#define FW_PORT_CMD_OVLAN2_S 6 +#define FW_PORT_CMD_OVLAN2_V(x) ((x) << FW_PORT_CMD_OVLAN2_S) + +#define FW_PORT_CMD_OVLAN1_S 5 +#define FW_PORT_CMD_OVLAN1_V(x) ((x) << FW_PORT_CMD_OVLAN1_S) + +#define FW_PORT_CMD_OVLAN0_S 4 +#define FW_PORT_CMD_OVLAN0_V(x) ((x) << FW_PORT_CMD_OVLAN0_S) + +#define FW_PORT_CMD_IVLAN0_S 3 +#define FW_PORT_CMD_IVLAN0_V(x) ((x) << FW_PORT_CMD_IVLAN0_S) + +#define FW_PORT_CMD_TXIPG_S 3 +#define FW_PORT_CMD_TXIPG_V(x) ((x) << FW_PORT_CMD_TXIPG_S) + +#define FW_PORT_CMD_LSTATUS_S 31 +#define FW_PORT_CMD_LSTATUS_M 0x1 
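/*
 * Illustrative sketch, not part of the original header: the _S/_M/_V/_G/_F
 * macros here follow the usual cxgb4 convention, where _S is a field's bit
 * offset, _M its mask, _V(x) shifts a value into place, _G(x) extracts it
 * and _F is the single-bit flag form.  A hypothetical helper that tests
 * link state in a FW_PORT_ACTION_GET_PORT_INFO reply could use the
 * shift/mask pair defined just above directly:
 */
static inline bool example_port_link_up(const struct fw_port_cmd *rpl)
{
	/* lstatus_to_modtype carries the link-up bit at FW_PORT_CMD_LSTATUS_S */
	u32 lstatus = be32_to_cpu(rpl->u.info.lstatus_to_modtype);

	return ((lstatus >> FW_PORT_CMD_LSTATUS_S) & FW_PORT_CMD_LSTATUS_M) != 0;
}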
+#define FW_PORT_CMD_LSTATUS_V(x) ((x) << FW_PORT_CMD_LSTATUS_S) +#define FW_PORT_CMD_LSTATUS_G(x) \ + (((x) >> FW_PORT_CMD_LSTATUS_S) & FW_PORT_CMD_LSTATUS_M) +#define FW_PORT_CMD_LSTATUS_F FW_PORT_CMD_LSTATUS_V(1U) + +#define FW_PORT_CMD_LSPEED_S 24 +#define FW_PORT_CMD_LSPEED_M 0x3f +#define FW_PORT_CMD_LSPEED_V(x) ((x) << FW_PORT_CMD_LSPEED_S) +#define FW_PORT_CMD_LSPEED_G(x) \ + (((x) >> FW_PORT_CMD_LSPEED_S) & FW_PORT_CMD_LSPEED_M) + +#define FW_PORT_CMD_TXPAUSE_S 23 +#define FW_PORT_CMD_TXPAUSE_V(x) ((x) << FW_PORT_CMD_TXPAUSE_S) +#define FW_PORT_CMD_TXPAUSE_F FW_PORT_CMD_TXPAUSE_V(1U) + +#define FW_PORT_CMD_RXPAUSE_S 22 +#define FW_PORT_CMD_RXPAUSE_V(x) ((x) << FW_PORT_CMD_RXPAUSE_S) +#define FW_PORT_CMD_RXPAUSE_F FW_PORT_CMD_RXPAUSE_V(1U) + +#define FW_PORT_CMD_MDIOCAP_S 21 +#define FW_PORT_CMD_MDIOCAP_V(x) ((x) << FW_PORT_CMD_MDIOCAP_S) +#define FW_PORT_CMD_MDIOCAP_F FW_PORT_CMD_MDIOCAP_V(1U) + +#define FW_PORT_CMD_MDIOADDR_S 16 +#define FW_PORT_CMD_MDIOADDR_M 0x1f +#define FW_PORT_CMD_MDIOADDR_G(x) \ + (((x) >> FW_PORT_CMD_MDIOADDR_S) & FW_PORT_CMD_MDIOADDR_M) + +#define FW_PORT_CMD_LPTXPAUSE_S 15 +#define FW_PORT_CMD_LPTXPAUSE_V(x) ((x) << FW_PORT_CMD_LPTXPAUSE_S) +#define FW_PORT_CMD_LPTXPAUSE_F FW_PORT_CMD_LPTXPAUSE_V(1U) + +#define FW_PORT_CMD_LPRXPAUSE_S 14 +#define FW_PORT_CMD_LPRXPAUSE_V(x) ((x) << FW_PORT_CMD_LPRXPAUSE_S) +#define FW_PORT_CMD_LPRXPAUSE_F FW_PORT_CMD_LPRXPAUSE_V(1U) + +#define FW_PORT_CMD_PTYPE_S 8 +#define FW_PORT_CMD_PTYPE_M 0x1f +#define FW_PORT_CMD_PTYPE_G(x) \ + (((x) >> FW_PORT_CMD_PTYPE_S) & FW_PORT_CMD_PTYPE_M) + +#define FW_PORT_CMD_LINKDNRC_S 5 +#define FW_PORT_CMD_LINKDNRC_M 0x7 +#define FW_PORT_CMD_LINKDNRC_G(x) \ + (((x) >> FW_PORT_CMD_LINKDNRC_S) & FW_PORT_CMD_LINKDNRC_M) + +#define FW_PORT_CMD_MODTYPE_S 0 +#define FW_PORT_CMD_MODTYPE_M 0x1f +#define FW_PORT_CMD_MODTYPE_V(x) ((x) << FW_PORT_CMD_MODTYPE_S) +#define FW_PORT_CMD_MODTYPE_G(x) \ + (((x) >> FW_PORT_CMD_MODTYPE_S) & FW_PORT_CMD_MODTYPE_M) + +#define FW_PORT_CMD_DCBXDIS_S 7 +#define FW_PORT_CMD_DCBXDIS_V(x) ((x) << FW_PORT_CMD_DCBXDIS_S) +#define FW_PORT_CMD_DCBXDIS_F FW_PORT_CMD_DCBXDIS_V(1U) + +#define FW_PORT_CMD_APPLY_S 7 +#define FW_PORT_CMD_APPLY_V(x) ((x) << FW_PORT_CMD_APPLY_S) +#define FW_PORT_CMD_APPLY_F FW_PORT_CMD_APPLY_V(1U) + +#define FW_PORT_CMD_ALL_SYNCD_S 7 +#define FW_PORT_CMD_ALL_SYNCD_V(x) ((x) << FW_PORT_CMD_ALL_SYNCD_S) +#define FW_PORT_CMD_ALL_SYNCD_F FW_PORT_CMD_ALL_SYNCD_V(1U) + +#define FW_PORT_CMD_DCB_VERSION_S 12 +#define FW_PORT_CMD_DCB_VERSION_M 0x7 +#define FW_PORT_CMD_DCB_VERSION_G(x) \ + (((x) >> FW_PORT_CMD_DCB_VERSION_S) & FW_PORT_CMD_DCB_VERSION_M) + +#define FW_PORT_CMD_LSTATUS32_S 31 +#define FW_PORT_CMD_LSTATUS32_M 0x1 +#define FW_PORT_CMD_LSTATUS32_V(x) ((x) << FW_PORT_CMD_LSTATUS32_S) +#define FW_PORT_CMD_LSTATUS32_G(x) \ + (((x) >> FW_PORT_CMD_LSTATUS32_S) & FW_PORT_CMD_LSTATUS32_M) +#define FW_PORT_CMD_LSTATUS32_F FW_PORT_CMD_LSTATUS32_V(1U) + +#define FW_PORT_CMD_LINKDNRC32_S 28 +#define FW_PORT_CMD_LINKDNRC32_M 0x7 +#define FW_PORT_CMD_LINKDNRC32_V(x) ((x) << FW_PORT_CMD_LINKDNRC32_S) +#define FW_PORT_CMD_LINKDNRC32_G(x) \ + (((x) >> FW_PORT_CMD_LINKDNRC32_S) & FW_PORT_CMD_LINKDNRC32_M) + +#define FW_PORT_CMD_DCBXDIS32_S 27 +#define FW_PORT_CMD_DCBXDIS32_M 0x1 +#define FW_PORT_CMD_DCBXDIS32_V(x) ((x) << FW_PORT_CMD_DCBXDIS32_S) +#define FW_PORT_CMD_DCBXDIS32_G(x) \ + (((x) >> FW_PORT_CMD_DCBXDIS32_S) & FW_PORT_CMD_DCBXDIS32_M) +#define FW_PORT_CMD_DCBXDIS32_F FW_PORT_CMD_DCBXDIS32_V(1U) + +#define FW_PORT_CMD_MDIOCAP32_S 26 +#define 
FW_PORT_CMD_MDIOCAP32_M 0x1 +#define FW_PORT_CMD_MDIOCAP32_V(x) ((x) << FW_PORT_CMD_MDIOCAP32_S) +#define FW_PORT_CMD_MDIOCAP32_G(x) \ + (((x) >> FW_PORT_CMD_MDIOCAP32_S) & FW_PORT_CMD_MDIOCAP32_M) +#define FW_PORT_CMD_MDIOCAP32_F FW_PORT_CMD_MDIOCAP32_V(1U) + +#define FW_PORT_CMD_MDIOADDR32_S 21 +#define FW_PORT_CMD_MDIOADDR32_M 0x1f +#define FW_PORT_CMD_MDIOADDR32_V(x) ((x) << FW_PORT_CMD_MDIOADDR32_S) +#define FW_PORT_CMD_MDIOADDR32_G(x) \ + (((x) >> FW_PORT_CMD_MDIOADDR32_S) & FW_PORT_CMD_MDIOADDR32_M) + +#define FW_PORT_CMD_PORTTYPE32_S 13 +#define FW_PORT_CMD_PORTTYPE32_M 0xff +#define FW_PORT_CMD_PORTTYPE32_V(x) ((x) << FW_PORT_CMD_PORTTYPE32_S) +#define FW_PORT_CMD_PORTTYPE32_G(x) \ + (((x) >> FW_PORT_CMD_PORTTYPE32_S) & FW_PORT_CMD_PORTTYPE32_M) + +#define FW_PORT_CMD_MODTYPE32_S 8 +#define FW_PORT_CMD_MODTYPE32_M 0x1f +#define FW_PORT_CMD_MODTYPE32_V(x) ((x) << FW_PORT_CMD_MODTYPE32_S) +#define FW_PORT_CMD_MODTYPE32_G(x) \ + (((x) >> FW_PORT_CMD_MODTYPE32_S) & FW_PORT_CMD_MODTYPE32_M) + +#define FW_PORT_CMD_CBLLEN32_S 0 +#define FW_PORT_CMD_CBLLEN32_M 0xff +#define FW_PORT_CMD_CBLLEN32_V(x) ((x) << FW_PORT_CMD_CBLLEN32_S) +#define FW_PORT_CMD_CBLLEN32_G(x) \ + (((x) >> FW_PORT_CMD_CBLLEN32_S) & FW_PORT_CMD_CBLLEN32_M) + +#define FW_PORT_CMD_AUXLINFO32_S 24 +#define FW_PORT_CMD_AUXLINFO32_M 0xff +#define FW_PORT_CMD_AUXLINFO32_V(x) ((x) << FW_PORT_CMD_AUXLINFO32_S) +#define FW_PORT_CMD_AUXLINFO32_G(x) \ + (((x) >> FW_PORT_CMD_AUXLINFO32_S) & FW_PORT_CMD_AUXLINFO32_M) + +#define FW_PORT_AUXLINFO32_KX4_S 2 +#define FW_PORT_AUXLINFO32_KX4_M 0x1 +#define FW_PORT_AUXLINFO32_KX4_V(x) \ + ((x) << FW_PORT_AUXLINFO32_KX4_S) +#define FW_PORT_AUXLINFO32_KX4_G(x) \ + (((x) >> FW_PORT_AUXLINFO32_KX4_S) & FW_PORT_AUXLINFO32_KX4_M) +#define FW_PORT_AUXLINFO32_KX4_F FW_PORT_AUXLINFO32_KX4_V(1U) + +#define FW_PORT_AUXLINFO32_KR_S 1 +#define FW_PORT_AUXLINFO32_KR_M 0x1 +#define FW_PORT_AUXLINFO32_KR_V(x) \ + ((x) << FW_PORT_AUXLINFO32_KR_S) +#define FW_PORT_AUXLINFO32_KR_G(x) \ + (((x) >> FW_PORT_AUXLINFO32_KR_S) & FW_PORT_AUXLINFO32_KR_M) +#define FW_PORT_AUXLINFO32_KR_F FW_PORT_AUXLINFO32_KR_V(1U) + +#define FW_PORT_CMD_MTU32_S 0 +#define FW_PORT_CMD_MTU32_M 0xffff +#define FW_PORT_CMD_MTU32_V(x) ((x) << FW_PORT_CMD_MTU32_S) +#define FW_PORT_CMD_MTU32_G(x) \ + (((x) >> FW_PORT_CMD_MTU32_S) & FW_PORT_CMD_MTU32_M) + +enum fw_port_type { + FW_PORT_TYPE_FIBER_XFI, + FW_PORT_TYPE_FIBER_XAUI, + FW_PORT_TYPE_BT_SGMII, + FW_PORT_TYPE_BT_XFI, + FW_PORT_TYPE_BT_XAUI, + FW_PORT_TYPE_KX4, + FW_PORT_TYPE_CX4, + FW_PORT_TYPE_KX, + FW_PORT_TYPE_KR, + FW_PORT_TYPE_SFP, + FW_PORT_TYPE_BP_AP, + FW_PORT_TYPE_BP4_AP, + FW_PORT_TYPE_QSFP_10G, + FW_PORT_TYPE_QSA, + FW_PORT_TYPE_QSFP, + FW_PORT_TYPE_BP40_BA, + FW_PORT_TYPE_KR4_100G, + FW_PORT_TYPE_CR4_QSFP, + FW_PORT_TYPE_CR_QSFP, + FW_PORT_TYPE_CR2_QSFP, + FW_PORT_TYPE_SFP28, + FW_PORT_TYPE_KR_SFP28, + FW_PORT_TYPE_KR_XLAUI, + + FW_PORT_TYPE_NONE = FW_PORT_CMD_PTYPE_M +}; + +enum fw_port_module_type { + FW_PORT_MOD_TYPE_NA, + FW_PORT_MOD_TYPE_LR, + FW_PORT_MOD_TYPE_SR, + FW_PORT_MOD_TYPE_ER, + FW_PORT_MOD_TYPE_TWINAX_PASSIVE, + FW_PORT_MOD_TYPE_TWINAX_ACTIVE, + FW_PORT_MOD_TYPE_LRM, + FW_PORT_MOD_TYPE_ERROR = FW_PORT_CMD_MODTYPE_M - 3, + FW_PORT_MOD_TYPE_UNKNOWN = FW_PORT_CMD_MODTYPE_M - 2, + FW_PORT_MOD_TYPE_NOTSUPPORTED = FW_PORT_CMD_MODTYPE_M - 1, + + FW_PORT_MOD_TYPE_NONE = FW_PORT_CMD_MODTYPE_M +}; + +enum fw_port_mod_sub_type { + FW_PORT_MOD_SUB_TYPE_NA, + FW_PORT_MOD_SUB_TYPE_MV88E114X = 0x1, + FW_PORT_MOD_SUB_TYPE_TN8022 = 0x2, + FW_PORT_MOD_SUB_TYPE_AQ1202 = 
0x3, + FW_PORT_MOD_SUB_TYPE_88x3120 = 0x4, + FW_PORT_MOD_SUB_TYPE_BCM84834 = 0x5, + FW_PORT_MOD_SUB_TYPE_BT_VSC8634 = 0x8, + + /* The following will never been in the VPD. They are TWINAX cable + * lengths decoded from SFP+ module i2c PROMs. These should + * almost certainly go somewhere else ... + */ + FW_PORT_MOD_SUB_TYPE_TWINAX_1 = 0x9, + FW_PORT_MOD_SUB_TYPE_TWINAX_3 = 0xA, + FW_PORT_MOD_SUB_TYPE_TWINAX_5 = 0xB, + FW_PORT_MOD_SUB_TYPE_TWINAX_7 = 0xC, +}; + +enum fw_port_stats_tx_index { + FW_STAT_TX_PORT_BYTES_IX = 0, + FW_STAT_TX_PORT_FRAMES_IX, + FW_STAT_TX_PORT_BCAST_IX, + FW_STAT_TX_PORT_MCAST_IX, + FW_STAT_TX_PORT_UCAST_IX, + FW_STAT_TX_PORT_ERROR_IX, + FW_STAT_TX_PORT_64B_IX, + FW_STAT_TX_PORT_65B_127B_IX, + FW_STAT_TX_PORT_128B_255B_IX, + FW_STAT_TX_PORT_256B_511B_IX, + FW_STAT_TX_PORT_512B_1023B_IX, + FW_STAT_TX_PORT_1024B_1518B_IX, + FW_STAT_TX_PORT_1519B_MAX_IX, + FW_STAT_TX_PORT_DROP_IX, + FW_STAT_TX_PORT_PAUSE_IX, + FW_STAT_TX_PORT_PPP0_IX, + FW_STAT_TX_PORT_PPP1_IX, + FW_STAT_TX_PORT_PPP2_IX, + FW_STAT_TX_PORT_PPP3_IX, + FW_STAT_TX_PORT_PPP4_IX, + FW_STAT_TX_PORT_PPP5_IX, + FW_STAT_TX_PORT_PPP6_IX, + FW_STAT_TX_PORT_PPP7_IX, + FW_NUM_PORT_TX_STATS +}; + +enum fw_port_stat_rx_index { + FW_STAT_RX_PORT_BYTES_IX = 0, + FW_STAT_RX_PORT_FRAMES_IX, + FW_STAT_RX_PORT_BCAST_IX, + FW_STAT_RX_PORT_MCAST_IX, + FW_STAT_RX_PORT_UCAST_IX, + FW_STAT_RX_PORT_MTU_ERROR_IX, + FW_STAT_RX_PORT_MTU_CRC_ERROR_IX, + FW_STAT_RX_PORT_CRC_ERROR_IX, + FW_STAT_RX_PORT_LEN_ERROR_IX, + FW_STAT_RX_PORT_SYM_ERROR_IX, + FW_STAT_RX_PORT_64B_IX, + FW_STAT_RX_PORT_65B_127B_IX, + FW_STAT_RX_PORT_128B_255B_IX, + FW_STAT_RX_PORT_256B_511B_IX, + FW_STAT_RX_PORT_512B_1023B_IX, + FW_STAT_RX_PORT_1024B_1518B_IX, + FW_STAT_RX_PORT_1519B_MAX_IX, + FW_STAT_RX_PORT_PAUSE_IX, + FW_STAT_RX_PORT_PPP0_IX, + FW_STAT_RX_PORT_PPP1_IX, + FW_STAT_RX_PORT_PPP2_IX, + FW_STAT_RX_PORT_PPP3_IX, + FW_STAT_RX_PORT_PPP4_IX, + FW_STAT_RX_PORT_PPP5_IX, + FW_STAT_RX_PORT_PPP6_IX, + FW_STAT_RX_PORT_PPP7_IX, + FW_STAT_RX_PORT_LESS_64B_IX, + FW_STAT_RX_PORT_MAC_ERROR_IX, + FW_NUM_PORT_RX_STATS +}; + +/* port stats */ +#define FW_NUM_PORT_STATS (FW_NUM_PORT_TX_STATS + FW_NUM_PORT_RX_STATS) + +struct fw_port_stats_cmd { + __be32 op_to_portid; + __be32 retval_len16; + union fw_port_stats { + struct fw_port_stats_ctl { + u8 nstats_bg_bm; + u8 tx_ix; + __be16 r6; + __be32 r7; + __be64 stat0; + __be64 stat1; + __be64 stat2; + __be64 stat3; + __be64 stat4; + __be64 stat5; + } ctl; + struct fw_port_stats_all { + __be64 tx_bytes; + __be64 tx_frames; + __be64 tx_bcast; + __be64 tx_mcast; + __be64 tx_ucast; + __be64 tx_error; + __be64 tx_64b; + __be64 tx_65b_127b; + __be64 tx_128b_255b; + __be64 tx_256b_511b; + __be64 tx_512b_1023b; + __be64 tx_1024b_1518b; + __be64 tx_1519b_max; + __be64 tx_drop; + __be64 tx_pause; + __be64 tx_ppp0; + __be64 tx_ppp1; + __be64 tx_ppp2; + __be64 tx_ppp3; + __be64 tx_ppp4; + __be64 tx_ppp5; + __be64 tx_ppp6; + __be64 tx_ppp7; + __be64 rx_bytes; + __be64 rx_frames; + __be64 rx_bcast; + __be64 rx_mcast; + __be64 rx_ucast; + __be64 rx_mtu_error; + __be64 rx_mtu_crc_error; + __be64 rx_crc_error; + __be64 rx_len_error; + __be64 rx_sym_error; + __be64 rx_64b; + __be64 rx_65b_127b; + __be64 rx_128b_255b; + __be64 rx_256b_511b; + __be64 rx_512b_1023b; + __be64 rx_1024b_1518b; + __be64 rx_1519b_max; + __be64 rx_pause; + __be64 rx_ppp0; + __be64 rx_ppp1; + __be64 rx_ppp2; + __be64 rx_ppp3; + __be64 rx_ppp4; + __be64 rx_ppp5; + __be64 rx_ppp6; + __be64 rx_ppp7; + __be64 rx_less_64b; + __be64 rx_bg_drop; + __be64 rx_bg_trunc; + } 
all; + } u; +}; + +/* port loopback stats */ +#define FW_NUM_LB_STATS 16 +enum fw_port_lb_stats_index { + FW_STAT_LB_PORT_BYTES_IX, + FW_STAT_LB_PORT_FRAMES_IX, + FW_STAT_LB_PORT_BCAST_IX, + FW_STAT_LB_PORT_MCAST_IX, + FW_STAT_LB_PORT_UCAST_IX, + FW_STAT_LB_PORT_ERROR_IX, + FW_STAT_LB_PORT_64B_IX, + FW_STAT_LB_PORT_65B_127B_IX, + FW_STAT_LB_PORT_128B_255B_IX, + FW_STAT_LB_PORT_256B_511B_IX, + FW_STAT_LB_PORT_512B_1023B_IX, + FW_STAT_LB_PORT_1024B_1518B_IX, + FW_STAT_LB_PORT_1519B_MAX_IX, + FW_STAT_LB_PORT_DROP_FRAMES_IX +}; + +struct fw_port_lb_stats_cmd { + __be32 op_to_lbport; + __be32 retval_len16; + union fw_port_lb_stats { + struct fw_port_lb_stats_ctl { + u8 nstats_bg_bm; + u8 ix_pkd; + __be16 r6; + __be32 r7; + __be64 stat0; + __be64 stat1; + __be64 stat2; + __be64 stat3; + __be64 stat4; + __be64 stat5; + } ctl; + struct fw_port_lb_stats_all { + __be64 tx_bytes; + __be64 tx_frames; + __be64 tx_bcast; + __be64 tx_mcast; + __be64 tx_ucast; + __be64 tx_error; + __be64 tx_64b; + __be64 tx_65b_127b; + __be64 tx_128b_255b; + __be64 tx_256b_511b; + __be64 tx_512b_1023b; + __be64 tx_1024b_1518b; + __be64 tx_1519b_max; + __be64 rx_lb_drop; + __be64 rx_lb_trunc; + } all; + } u; +}; + +enum fw_ptp_subop { + /* none */ + FW_PTP_SC_INIT_TIMER = 0x00, + FW_PTP_SC_TX_TYPE = 0x01, + /* init */ + FW_PTP_SC_RXTIME_STAMP = 0x08, + FW_PTP_SC_RDRX_TYPE = 0x09, + /* ts */ + FW_PTP_SC_ADJ_FREQ = 0x10, + FW_PTP_SC_ADJ_TIME = 0x11, + FW_PTP_SC_ADJ_FTIME = 0x12, + FW_PTP_SC_WALL_CLOCK = 0x13, + FW_PTP_SC_GET_TIME = 0x14, + FW_PTP_SC_SET_TIME = 0x15, +}; + +struct fw_ptp_cmd { + __be32 op_to_portid; + __be32 retval_len16; + union fw_ptp { + struct fw_ptp_sc { + __u8 sc; + __u8 r3[7]; + } scmd; + struct fw_ptp_init { + __u8 sc; + __u8 txchan; + __be16 absid; + __be16 mode; + __be16 r3; + } init; + struct fw_ptp_ts { + __u8 sc; + __u8 sign; + __be16 r3; + __be32 ppb; + __be64 tm; + } ts; + } u; + __be64 r3; +}; + +#define FW_PTP_CMD_PORTID_S 0 +#define FW_PTP_CMD_PORTID_M 0xf +#define FW_PTP_CMD_PORTID_V(x) ((x) << FW_PTP_CMD_PORTID_S) +#define FW_PTP_CMD_PORTID_G(x) \ + (((x) >> FW_PTP_CMD_PORTID_S) & FW_PTP_CMD_PORTID_M) + +struct fw_rss_ind_tbl_cmd { + __be32 op_to_viid; + __be32 retval_len16; + __be16 niqid; + __be16 startidx; + __be32 r3; + __be32 iq0_to_iq2; + __be32 iq3_to_iq5; + __be32 iq6_to_iq8; + __be32 iq9_to_iq11; + __be32 iq12_to_iq14; + __be32 iq15_to_iq17; + __be32 iq18_to_iq20; + __be32 iq21_to_iq23; + __be32 iq24_to_iq26; + __be32 iq27_to_iq29; + __be32 iq30_iq31; + __be32 r15_lo; +}; + +#define FW_RSS_IND_TBL_CMD_VIID_S 0 +#define FW_RSS_IND_TBL_CMD_VIID_V(x) ((x) << FW_RSS_IND_TBL_CMD_VIID_S) + +#define FW_RSS_IND_TBL_CMD_IQ0_S 20 +#define FW_RSS_IND_TBL_CMD_IQ0_V(x) ((x) << FW_RSS_IND_TBL_CMD_IQ0_S) + +#define FW_RSS_IND_TBL_CMD_IQ1_S 10 +#define FW_RSS_IND_TBL_CMD_IQ1_V(x) ((x) << FW_RSS_IND_TBL_CMD_IQ1_S) + +#define FW_RSS_IND_TBL_CMD_IQ2_S 0 +#define FW_RSS_IND_TBL_CMD_IQ2_V(x) ((x) << FW_RSS_IND_TBL_CMD_IQ2_S) + +struct fw_rss_glb_config_cmd { + __be32 op_to_write; + __be32 retval_len16; + union fw_rss_glb_config { + struct fw_rss_glb_config_manual { + __be32 mode_pkd; + __be32 r3; + __be64 r4; + __be64 r5; + } manual; + struct fw_rss_glb_config_basicvirtual { + __be32 mode_pkd; + __be32 synmapen_to_hashtoeplitz; + __be64 r8; + __be64 r9; + } basicvirtual; + } u; +}; + +#define FW_RSS_GLB_CONFIG_CMD_MODE_S 28 +#define FW_RSS_GLB_CONFIG_CMD_MODE_M 0xf +#define FW_RSS_GLB_CONFIG_CMD_MODE_V(x) ((x) << FW_RSS_GLB_CONFIG_CMD_MODE_S) +#define FW_RSS_GLB_CONFIG_CMD_MODE_G(x) \ + (((x) >> 
FW_RSS_GLB_CONFIG_CMD_MODE_S) & FW_RSS_GLB_CONFIG_CMD_MODE_M) + +#define FW_RSS_GLB_CONFIG_CMD_MODE_MANUAL 0 +#define FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL 1 + +#define FW_RSS_GLB_CONFIG_CMD_SYNMAPEN_S 8 +#define FW_RSS_GLB_CONFIG_CMD_SYNMAPEN_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_SYNMAPEN_S) +#define FW_RSS_GLB_CONFIG_CMD_SYNMAPEN_F \ + FW_RSS_GLB_CONFIG_CMD_SYNMAPEN_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6_S 7 +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6_S) +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6_F \ + FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6_S 6 +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6_S) +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6_F \ + FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4_S 5 +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4_S) +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4_F \ + FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4_S 4 +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4_S) +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4_F \ + FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_OFDMAPEN_S 3 +#define FW_RSS_GLB_CONFIG_CMD_OFDMAPEN_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_OFDMAPEN_S) +#define FW_RSS_GLB_CONFIG_CMD_OFDMAPEN_F \ + FW_RSS_GLB_CONFIG_CMD_OFDMAPEN_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_S 2 +#define FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_S) +#define FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_F \ + FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_S 1 +#define FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_S) +#define FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_F \ + FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ_S 0 +#define FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ_S) +#define FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ_F \ + FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ_V(1U) + +struct fw_rss_vi_config_cmd { + __be32 op_to_viid; +#define FW_RSS_VI_CONFIG_CMD_VIID(x) ((x) << 0) + __be32 retval_len16; + union fw_rss_vi_config { + struct fw_rss_vi_config_manual { + __be64 r3; + __be64 r4; + __be64 r5; + } manual; + struct fw_rss_vi_config_basicvirtual { + __be32 r6; + __be32 defaultq_to_udpen; + __be64 r9; + __be64 r10; + } basicvirtual; + } u; +}; + +#define FW_RSS_VI_CONFIG_CMD_VIID_S 0 +#define FW_RSS_VI_CONFIG_CMD_VIID_V(x) ((x) << FW_RSS_VI_CONFIG_CMD_VIID_S) + +#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ_S 16 +#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ_M 0x3ff +#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ_V(x) \ + ((x) << FW_RSS_VI_CONFIG_CMD_DEFAULTQ_S) +#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ_G(x) \ + (((x) >> FW_RSS_VI_CONFIG_CMD_DEFAULTQ_S) & \ + FW_RSS_VI_CONFIG_CMD_DEFAULTQ_M) + +#define FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_S 4 +#define FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_V(x) \ + ((x) << FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_S) +#define FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_F \ + FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_V(1U) + +#define FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_S 3 +#define FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_V(x) \ + ((x) << FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_S) +#define FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_F \ + FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_V(1U) + +#define 
FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_S 2 +#define FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_V(x) \ + ((x) << FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_S) +#define FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_F \ + FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_V(1U) + +#define FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_S 1 +#define FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_V(x) \ + ((x) << FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_S) +#define FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_F \ + FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_V(1U) + +#define FW_RSS_VI_CONFIG_CMD_UDPEN_S 0 +#define FW_RSS_VI_CONFIG_CMD_UDPEN_V(x) ((x) << FW_RSS_VI_CONFIG_CMD_UDPEN_S) +#define FW_RSS_VI_CONFIG_CMD_UDPEN_F FW_RSS_VI_CONFIG_CMD_UDPEN_V(1U) + +enum fw_sched_sc { + FW_SCHED_SC_PARAMS = 1, +}; + +struct fw_sched_cmd { + __be32 op_to_write; + __be32 retval_len16; + union fw_sched { + struct fw_sched_config { + __u8 sc; + __u8 type; + __u8 minmaxen; + __u8 r3[5]; + __u8 nclasses[4]; + __be32 r4; + } config; + struct fw_sched_params { + __u8 sc; + __u8 type; + __u8 level; + __u8 mode; + __u8 unit; + __u8 rate; + __u8 ch; + __u8 cl; + __be32 min; + __be32 max; + __be16 weight; + __be16 pktsize; + __be16 burstsize; + __be16 r4; + } params; + } u; +}; + +struct fw_clip_cmd { + __be32 op_to_write; + __be32 alloc_to_len16; + __be64 ip_hi; + __be64 ip_lo; + __be32 r4[2]; +}; + +#define FW_CLIP_CMD_ALLOC_S 31 +#define FW_CLIP_CMD_ALLOC_V(x) ((x) << FW_CLIP_CMD_ALLOC_S) +#define FW_CLIP_CMD_ALLOC_F FW_CLIP_CMD_ALLOC_V(1U) + +#define FW_CLIP_CMD_FREE_S 30 +#define FW_CLIP_CMD_FREE_V(x) ((x) << FW_CLIP_CMD_FREE_S) +#define FW_CLIP_CMD_FREE_F FW_CLIP_CMD_FREE_V(1U) + +enum fw_error_type { + FW_ERROR_TYPE_EXCEPTION = 0x0, + FW_ERROR_TYPE_HWMODULE = 0x1, + FW_ERROR_TYPE_WR = 0x2, + FW_ERROR_TYPE_ACL = 0x3, +}; + +struct fw_error_cmd { + __be32 op_to_type; + __be32 len16_pkd; + union fw_error { + struct fw_error_exception { + __be32 info[6]; + } exception; + struct fw_error_hwmodule { + __be32 regaddr; + __be32 regval; + } hwmodule; + struct fw_error_wr { + __be16 cidx; + __be16 pfn_vfn; + __be32 eqid; + u8 wrhdr[16]; + } wr; + struct fw_error_acl { + __be16 cidx; + __be16 pfn_vfn; + __be32 eqid; + __be16 mv_pkd; + u8 val[6]; + __be64 r4; + } acl; + } u; +}; + +struct fw_debug_cmd { + __be32 op_type; + __be32 len16_pkd; + union fw_debug { + struct fw_debug_assert { + __be32 fcid; + __be32 line; + __be32 x; + __be32 y; + u8 filename_0_7[8]; + u8 filename_8_15[8]; + __be64 r3; + } assert; + struct fw_debug_prt { + __be16 dprtstridx; + __be16 r3[3]; + __be32 dprtstrparam0; + __be32 dprtstrparam1; + __be32 dprtstrparam2; + __be32 dprtstrparam3; + } prt; + } u; +}; + +#define FW_DEBUG_CMD_TYPE_S 0 +#define FW_DEBUG_CMD_TYPE_M 0xff +#define FW_DEBUG_CMD_TYPE_G(x) \ + (((x) >> FW_DEBUG_CMD_TYPE_S) & FW_DEBUG_CMD_TYPE_M) + +struct fw_hma_cmd { + __be32 op_pkd; + __be32 retval_len16; + __be32 mode_to_pcie_params; + __be32 naddr_size; + __be32 addr_size_pkd; + __be32 r6; + __be64 phy_address[5]; +}; + +#define FW_HMA_CMD_MODE_S 31 +#define FW_HMA_CMD_MODE_M 0x1 +#define FW_HMA_CMD_MODE_V(x) ((x) << FW_HMA_CMD_MODE_S) +#define FW_HMA_CMD_MODE_G(x) \ + (((x) >> FW_HMA_CMD_MODE_S) & FW_HMA_CMD_MODE_M) +#define FW_HMA_CMD_MODE_F FW_HMA_CMD_MODE_V(1U) + +#define FW_HMA_CMD_SOC_S 30 +#define FW_HMA_CMD_SOC_M 0x1 +#define FW_HMA_CMD_SOC_V(x) ((x) << FW_HMA_CMD_SOC_S) +#define FW_HMA_CMD_SOC_G(x) (((x) >> FW_HMA_CMD_SOC_S) & FW_HMA_CMD_SOC_M) +#define FW_HMA_CMD_SOC_F FW_HMA_CMD_SOC_V(1U) + +#define FW_HMA_CMD_EOC_S 29 +#define FW_HMA_CMD_EOC_M 0x1 +#define FW_HMA_CMD_EOC_V(x) ((x) << FW_HMA_CMD_EOC_S) +#define 
FW_HMA_CMD_EOC_G(x) (((x) >> FW_HMA_CMD_EOC_S) & FW_HMA_CMD_EOC_M) +#define FW_HMA_CMD_EOC_F FW_HMA_CMD_EOC_V(1U) + +#define FW_HMA_CMD_PCIE_PARAMS_S 0 +#define FW_HMA_CMD_PCIE_PARAMS_M 0x7ffffff +#define FW_HMA_CMD_PCIE_PARAMS_V(x) ((x) << FW_HMA_CMD_PCIE_PARAMS_S) +#define FW_HMA_CMD_PCIE_PARAMS_G(x) \ + (((x) >> FW_HMA_CMD_PCIE_PARAMS_S) & FW_HMA_CMD_PCIE_PARAMS_M) + +#define FW_HMA_CMD_NADDR_S 12 +#define FW_HMA_CMD_NADDR_M 0x3f +#define FW_HMA_CMD_NADDR_V(x) ((x) << FW_HMA_CMD_NADDR_S) +#define FW_HMA_CMD_NADDR_G(x) \ + (((x) >> FW_HMA_CMD_NADDR_S) & FW_HMA_CMD_NADDR_M) + +#define FW_HMA_CMD_SIZE_S 0 +#define FW_HMA_CMD_SIZE_M 0xfff +#define FW_HMA_CMD_SIZE_V(x) ((x) << FW_HMA_CMD_SIZE_S) +#define FW_HMA_CMD_SIZE_G(x) \ + (((x) >> FW_HMA_CMD_SIZE_S) & FW_HMA_CMD_SIZE_M) + +#define FW_HMA_CMD_ADDR_SIZE_S 11 +#define FW_HMA_CMD_ADDR_SIZE_M 0x1fffff +#define FW_HMA_CMD_ADDR_SIZE_V(x) ((x) << FW_HMA_CMD_ADDR_SIZE_S) +#define FW_HMA_CMD_ADDR_SIZE_G(x) \ + (((x) >> FW_HMA_CMD_ADDR_SIZE_S) & FW_HMA_CMD_ADDR_SIZE_M) + +enum pcie_fw_eval { + PCIE_FW_EVAL_CRASH = 0, +}; + +#define PCIE_FW_ERR_S 31 +#define PCIE_FW_ERR_V(x) ((x) << PCIE_FW_ERR_S) +#define PCIE_FW_ERR_F PCIE_FW_ERR_V(1U) + +#define PCIE_FW_INIT_S 30 +#define PCIE_FW_INIT_V(x) ((x) << PCIE_FW_INIT_S) +#define PCIE_FW_INIT_F PCIE_FW_INIT_V(1U) + +#define PCIE_FW_HALT_S 29 +#define PCIE_FW_HALT_V(x) ((x) << PCIE_FW_HALT_S) +#define PCIE_FW_HALT_F PCIE_FW_HALT_V(1U) + +#define PCIE_FW_EVAL_S 24 +#define PCIE_FW_EVAL_M 0x7 +#define PCIE_FW_EVAL_G(x) (((x) >> PCIE_FW_EVAL_S) & PCIE_FW_EVAL_M) + +#define PCIE_FW_MASTER_VLD_S 15 +#define PCIE_FW_MASTER_VLD_V(x) ((x) << PCIE_FW_MASTER_VLD_S) +#define PCIE_FW_MASTER_VLD_F PCIE_FW_MASTER_VLD_V(1U) + +#define PCIE_FW_MASTER_S 12 +#define PCIE_FW_MASTER_M 0x7 +#define PCIE_FW_MASTER_V(x) ((x) << PCIE_FW_MASTER_S) +#define PCIE_FW_MASTER_G(x) (((x) >> PCIE_FW_MASTER_S) & PCIE_FW_MASTER_M) + +struct fw_hdr { + u8 ver; + u8 chip; /* terminator chip type */ + __be16 len512; /* bin length in units of 512-bytes */ + __be32 fw_ver; /* firmware version */ + __be32 tp_microcode_ver; + u8 intfver_nic; + u8 intfver_vnic; + u8 intfver_ofld; + u8 intfver_ri; + u8 intfver_iscsipdu; + u8 intfver_iscsi; + u8 intfver_fcoepdu; + u8 intfver_fcoe; + __u32 reserved2; + __u32 reserved3; + __u32 reserved4; + __be32 flags; + __be32 reserved6[23]; +}; + +enum fw_hdr_chip { + FW_HDR_CHIP_T4, + FW_HDR_CHIP_T5, + FW_HDR_CHIP_T6 +}; + +#define FW_HDR_FW_VER_MAJOR_S 24 +#define FW_HDR_FW_VER_MAJOR_M 0xff +#define FW_HDR_FW_VER_MAJOR_V(x) \ + ((x) << FW_HDR_FW_VER_MAJOR_S) +#define FW_HDR_FW_VER_MAJOR_G(x) \ + (((x) >> FW_HDR_FW_VER_MAJOR_S) & FW_HDR_FW_VER_MAJOR_M) + +#define FW_HDR_FW_VER_MINOR_S 16 +#define FW_HDR_FW_VER_MINOR_M 0xff +#define FW_HDR_FW_VER_MINOR_V(x) \ + ((x) << FW_HDR_FW_VER_MINOR_S) +#define FW_HDR_FW_VER_MINOR_G(x) \ + (((x) >> FW_HDR_FW_VER_MINOR_S) & FW_HDR_FW_VER_MINOR_M) + +#define FW_HDR_FW_VER_MICRO_S 8 +#define FW_HDR_FW_VER_MICRO_M 0xff +#define FW_HDR_FW_VER_MICRO_V(x) \ + ((x) << FW_HDR_FW_VER_MICRO_S) +#define FW_HDR_FW_VER_MICRO_G(x) \ + (((x) >> FW_HDR_FW_VER_MICRO_S) & FW_HDR_FW_VER_MICRO_M) + +#define FW_HDR_FW_VER_BUILD_S 0 +#define FW_HDR_FW_VER_BUILD_M 0xff +#define FW_HDR_FW_VER_BUILD_V(x) \ + ((x) << FW_HDR_FW_VER_BUILD_S) +#define FW_HDR_FW_VER_BUILD_G(x) \ + (((x) >> FW_HDR_FW_VER_BUILD_S) & FW_HDR_FW_VER_BUILD_M) + +enum fw_hdr_intfver { + FW_HDR_INTFVER_NIC = 0x00, + FW_HDR_INTFVER_VNIC = 0x00, + FW_HDR_INTFVER_OFLD = 0x00, + FW_HDR_INTFVER_RI = 0x00, + FW_HDR_INTFVER_ISCSIPDU 
= 0x00, + FW_HDR_INTFVER_ISCSI = 0x00, + FW_HDR_INTFVER_FCOEPDU = 0x00, + FW_HDR_INTFVER_FCOE = 0x00, +}; + +enum fw_hdr_flags { + FW_HDR_FLAGS_RESET_HALT = 0x00000001, +}; + +/* length of the formatting string */ +#define FW_DEVLOG_FMT_LEN 192 + +/* maximum number of the formatting string parameters */ +#define FW_DEVLOG_FMT_PARAMS_NUM 8 + +/* priority levels */ +enum fw_devlog_level { + FW_DEVLOG_LEVEL_EMERG = 0x0, + FW_DEVLOG_LEVEL_CRIT = 0x1, + FW_DEVLOG_LEVEL_ERR = 0x2, + FW_DEVLOG_LEVEL_NOTICE = 0x3, + FW_DEVLOG_LEVEL_INFO = 0x4, + FW_DEVLOG_LEVEL_DEBUG = 0x5, + FW_DEVLOG_LEVEL_MAX = 0x5, +}; + +/* facilities that may send a log message */ +enum fw_devlog_facility { + FW_DEVLOG_FACILITY_CORE = 0x00, + FW_DEVLOG_FACILITY_CF = 0x01, + FW_DEVLOG_FACILITY_SCHED = 0x02, + FW_DEVLOG_FACILITY_TIMER = 0x04, + FW_DEVLOG_FACILITY_RES = 0x06, + FW_DEVLOG_FACILITY_HW = 0x08, + FW_DEVLOG_FACILITY_FLR = 0x10, + FW_DEVLOG_FACILITY_DMAQ = 0x12, + FW_DEVLOG_FACILITY_PHY = 0x14, + FW_DEVLOG_FACILITY_MAC = 0x16, + FW_DEVLOG_FACILITY_PORT = 0x18, + FW_DEVLOG_FACILITY_VI = 0x1A, + FW_DEVLOG_FACILITY_FILTER = 0x1C, + FW_DEVLOG_FACILITY_ACL = 0x1E, + FW_DEVLOG_FACILITY_TM = 0x20, + FW_DEVLOG_FACILITY_QFC = 0x22, + FW_DEVLOG_FACILITY_DCB = 0x24, + FW_DEVLOG_FACILITY_ETH = 0x26, + FW_DEVLOG_FACILITY_OFLD = 0x28, + FW_DEVLOG_FACILITY_RI = 0x2A, + FW_DEVLOG_FACILITY_ISCSI = 0x2C, + FW_DEVLOG_FACILITY_FCOE = 0x2E, + FW_DEVLOG_FACILITY_FOISCSI = 0x30, + FW_DEVLOG_FACILITY_FOFCOE = 0x32, + FW_DEVLOG_FACILITY_CHNET = 0x34, + FW_DEVLOG_FACILITY_MAX = 0x34, +}; + +/* log message format */ +struct fw_devlog_e { + __be64 timestamp; + __be32 seqno; + __be16 reserved1; + __u8 level; + __u8 facility; + __u8 fmt[FW_DEVLOG_FMT_LEN]; + __be32 params[FW_DEVLOG_FMT_PARAMS_NUM]; + __be32 reserved3[4]; +}; + +struct fw_devlog_cmd { + __be32 op_to_write; + __be32 retval_len16; + __u8 level; + __u8 r2[7]; + __be32 memtype_devlog_memaddr16_devlog; + __be32 memsize_devlog; + __be32 r3[2]; +}; + +#define FW_DEVLOG_CMD_MEMTYPE_DEVLOG_S 28 +#define FW_DEVLOG_CMD_MEMTYPE_DEVLOG_M 0xf +#define FW_DEVLOG_CMD_MEMTYPE_DEVLOG_G(x) \ + (((x) >> FW_DEVLOG_CMD_MEMTYPE_DEVLOG_S) & \ + FW_DEVLOG_CMD_MEMTYPE_DEVLOG_M) + +#define FW_DEVLOG_CMD_MEMADDR16_DEVLOG_S 0 +#define FW_DEVLOG_CMD_MEMADDR16_DEVLOG_M 0xfffffff +#define FW_DEVLOG_CMD_MEMADDR16_DEVLOG_G(x) \ + (((x) >> FW_DEVLOG_CMD_MEMADDR16_DEVLOG_S) & \ + FW_DEVLOG_CMD_MEMADDR16_DEVLOG_M) + +/* P C I E F W P F 7 R E G I S T E R */ + +/* PF7 stores the Firmware Device Log parameters which allows Host Drivers to + * access the "devlog" without needing to contact firmware. The encoding is + * mostly the same as that returned by the DEVLOG command except for the size + * which is encoded as the number of entries in multiples-1 of 128 here rather + * than the memory size as is done in the DEVLOG command. Thus, 0 means 128 + * and 15 means 2048. This of course in turn constrains the allowed values + * for the devlog size ...
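 * (Illustrative decode, assuming a hypothetical register value reg read from
 * PF7: a stored count N corresponds to (N + 1) * 128 entries, i.e.
 * nentries = (PCIE_FW_PF_DEVLOG_NENTRIES128_G(reg) + 1) * 128, so N == 3
 * would mean 512 entries.)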
+ */ +#define PCIE_FW_PF_DEVLOG 7 + +#define PCIE_FW_PF_DEVLOG_NENTRIES128_S 28 +#define PCIE_FW_PF_DEVLOG_NENTRIES128_M 0xf +#define PCIE_FW_PF_DEVLOG_NENTRIES128_V(x) \ + ((x) << PCIE_FW_PF_DEVLOG_NENTRIES128_S) +#define PCIE_FW_PF_DEVLOG_NENTRIES128_G(x) \ + (((x) >> PCIE_FW_PF_DEVLOG_NENTRIES128_S) & \ + PCIE_FW_PF_DEVLOG_NENTRIES128_M) + +#define PCIE_FW_PF_DEVLOG_ADDR16_S 4 +#define PCIE_FW_PF_DEVLOG_ADDR16_M 0xffffff +#define PCIE_FW_PF_DEVLOG_ADDR16_V(x) ((x) << PCIE_FW_PF_DEVLOG_ADDR16_S) +#define PCIE_FW_PF_DEVLOG_ADDR16_G(x) \ + (((x) >> PCIE_FW_PF_DEVLOG_ADDR16_S) & PCIE_FW_PF_DEVLOG_ADDR16_M) + +#define PCIE_FW_PF_DEVLOG_MEMTYPE_S 0 +#define PCIE_FW_PF_DEVLOG_MEMTYPE_M 0xf +#define PCIE_FW_PF_DEVLOG_MEMTYPE_V(x) ((x) << PCIE_FW_PF_DEVLOG_MEMTYPE_S) +#define PCIE_FW_PF_DEVLOG_MEMTYPE_G(x) \ + (((x) >> PCIE_FW_PF_DEVLOG_MEMTYPE_S) & PCIE_FW_PF_DEVLOG_MEMTYPE_M) + +#define MAX_IMM_OFLD_TX_DATA_WR_LEN (0xff + sizeof(struct fw_ofld_tx_data_wr)) + +struct fw_crypto_lookaside_wr { + __be32 op_to_cctx_size; + __be32 len16_pkd; + __be32 session_id; + __be32 rx_chid_to_rx_q_id; + __be32 key_addr; + __be32 pld_size_hash_size; + __be64 cookie; +}; + +#define FW_CRYPTO_LOOKASIDE_WR_OPCODE_S 24 +#define FW_CRYPTO_LOOKASIDE_WR_OPCODE_M 0xff +#define FW_CRYPTO_LOOKASIDE_WR_OPCODE_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_OPCODE_S) +#define FW_CRYPTO_LOOKASIDE_WR_OPCODE_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_OPCODE_S) & \ + FW_CRYPTO_LOOKASIDE_WR_OPCODE_M) + +#define FW_CRYPTO_LOOKASIDE_WR_COMPL_S 23 +#define FW_CRYPTO_LOOKASIDE_WR_COMPL_M 0x1 +#define FW_CRYPTO_LOOKASIDE_WR_COMPL_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_COMPL_S) +#define FW_CRYPTO_LOOKASIDE_WR_COMPL_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_COMPL_S) & \ + FW_CRYPTO_LOOKASIDE_WR_COMPL_M) +#define FW_CRYPTO_LOOKASIDE_WR_COMPL_F FW_CRYPTO_LOOKASIDE_WR_COMPL_V(1U) + +#define FW_CRYPTO_LOOKASIDE_WR_IMM_LEN_S 15 +#define FW_CRYPTO_LOOKASIDE_WR_IMM_LEN_M 0xff +#define FW_CRYPTO_LOOKASIDE_WR_IMM_LEN_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_IMM_LEN_S) +#define FW_CRYPTO_LOOKASIDE_WR_IMM_LEN_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_IMM_LEN_S) & \ + FW_CRYPTO_LOOKASIDE_WR_IMM_LEN_M) + +#define FW_CRYPTO_LOOKASIDE_WR_CCTX_LOC_S 5 +#define FW_CRYPTO_LOOKASIDE_WR_CCTX_LOC_M 0x3 +#define FW_CRYPTO_LOOKASIDE_WR_CCTX_LOC_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_CCTX_LOC_S) +#define FW_CRYPTO_LOOKASIDE_WR_CCTX_LOC_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_CCTX_LOC_S) & \ + FW_CRYPTO_LOOKASIDE_WR_CCTX_LOC_M) + +#define FW_CRYPTO_LOOKASIDE_WR_CCTX_SIZE_S 0 +#define FW_CRYPTO_LOOKASIDE_WR_CCTX_SIZE_M 0x1f +#define FW_CRYPTO_LOOKASIDE_WR_CCTX_SIZE_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_CCTX_SIZE_S) +#define FW_CRYPTO_LOOKASIDE_WR_CCTX_SIZE_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_CCTX_SIZE_S) & \ + FW_CRYPTO_LOOKASIDE_WR_CCTX_SIZE_M) + +#define FW_CRYPTO_LOOKASIDE_WR_LEN16_S 0 +#define FW_CRYPTO_LOOKASIDE_WR_LEN16_M 0xff +#define FW_CRYPTO_LOOKASIDE_WR_LEN16_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_LEN16_S) +#define FW_CRYPTO_LOOKASIDE_WR_LEN16_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_LEN16_S) & \ + FW_CRYPTO_LOOKASIDE_WR_LEN16_M) + +#define FW_CRYPTO_LOOKASIDE_WR_RX_CHID_S 29 +#define FW_CRYPTO_LOOKASIDE_WR_RX_CHID_M 0x3 +#define FW_CRYPTO_LOOKASIDE_WR_RX_CHID_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_RX_CHID_S) +#define FW_CRYPTO_LOOKASIDE_WR_RX_CHID_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_RX_CHID_S) & \ + FW_CRYPTO_LOOKASIDE_WR_RX_CHID_M) + +#define FW_CRYPTO_LOOKASIDE_WR_LCB_S 27 +#define FW_CRYPTO_LOOKASIDE_WR_LCB_M 0x3 +#define 
FW_CRYPTO_LOOKASIDE_WR_LCB_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_LCB_S) +#define FW_CRYPTO_LOOKASIDE_WR_LCB_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_LCB_S) & FW_CRYPTO_LOOKASIDE_WR_LCB_M) + +#define FW_CRYPTO_LOOKASIDE_WR_PHASH_S 25 +#define FW_CRYPTO_LOOKASIDE_WR_PHASH_M 0x3 +#define FW_CRYPTO_LOOKASIDE_WR_PHASH_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_PHASH_S) +#define FW_CRYPTO_LOOKASIDE_WR_PHASH_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_PHASH_S) & \ + FW_CRYPTO_LOOKASIDE_WR_PHASH_M) + +#define FW_CRYPTO_LOOKASIDE_WR_IV_S 23 +#define FW_CRYPTO_LOOKASIDE_WR_IV_M 0x3 +#define FW_CRYPTO_LOOKASIDE_WR_IV_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_IV_S) +#define FW_CRYPTO_LOOKASIDE_WR_IV_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_IV_S) & FW_CRYPTO_LOOKASIDE_WR_IV_M) + +#define FW_CRYPTO_LOOKASIDE_WR_FQIDX_S 15 +#define FW_CRYPTO_LOOKASIDE_WR_FQIDX_M 0xff +#define FW_CRYPTO_LOOKASIDE_WR_FQIDX_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_FQIDX_S) +#define FW_CRYPTO_LOOKASIDE_WR_FQIDX_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_FQIDX_S) & \ + FW_CRYPTO_LOOKASIDE_WR_FQIDX_M) + +#define FW_CRYPTO_LOOKASIDE_WR_TX_CH_S 10 +#define FW_CRYPTO_LOOKASIDE_WR_TX_CH_M 0x3 +#define FW_CRYPTO_LOOKASIDE_WR_TX_CH_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_TX_CH_S) +#define FW_CRYPTO_LOOKASIDE_WR_TX_CH_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_TX_CH_S) & \ + FW_CRYPTO_LOOKASIDE_WR_TX_CH_M) + +#define FW_CRYPTO_LOOKASIDE_WR_RX_Q_ID_S 0 +#define FW_CRYPTO_LOOKASIDE_WR_RX_Q_ID_M 0x3ff +#define FW_CRYPTO_LOOKASIDE_WR_RX_Q_ID_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_RX_Q_ID_S) +#define FW_CRYPTO_LOOKASIDE_WR_RX_Q_ID_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_RX_Q_ID_S) & \ + FW_CRYPTO_LOOKASIDE_WR_RX_Q_ID_M) + +#define FW_CRYPTO_LOOKASIDE_WR_PLD_SIZE_S 24 +#define FW_CRYPTO_LOOKASIDE_WR_PLD_SIZE_M 0xff +#define FW_CRYPTO_LOOKASIDE_WR_PLD_SIZE_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_PLD_SIZE_S) +#define FW_CRYPTO_LOOKASIDE_WR_PLD_SIZE_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_PLD_SIZE_S) & \ + FW_CRYPTO_LOOKASIDE_WR_PLD_SIZE_M) + +#define FW_CRYPTO_LOOKASIDE_WR_HASH_SIZE_S 17 +#define FW_CRYPTO_LOOKASIDE_WR_HASH_SIZE_M 0x7f +#define FW_CRYPTO_LOOKASIDE_WR_HASH_SIZE_V(x) \ + ((x) << FW_CRYPTO_LOOKASIDE_WR_HASH_SIZE_S) +#define FW_CRYPTO_LOOKASIDE_WR_HASH_SIZE_G(x) \ + (((x) >> FW_CRYPTO_LOOKASIDE_WR_HASH_SIZE_S) & \ + FW_CRYPTO_LOOKASIDE_WR_HASH_SIZE_M) + +struct fw_tlstx_data_wr { + __be32 op_to_immdlen; + __be32 flowid_len16; + __be32 plen; + __be32 lsodisable_to_flags; + __be32 r5; + __be32 ctxloc_to_exp; + __be16 mfs; + __be16 adjustedplen_pkd; + __be16 expinplenmax_pkd; + u8 pdusinplenmax_pkd; + u8 r10; +}; + +#define FW_TLSTX_DATA_WR_OPCODE_S 24 +#define FW_TLSTX_DATA_WR_OPCODE_M 0xff +#define FW_TLSTX_DATA_WR_OPCODE_V(x) ((x) << FW_TLSTX_DATA_WR_OPCODE_S) +#define FW_TLSTX_DATA_WR_OPCODE_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_OPCODE_S) & FW_TLSTX_DATA_WR_OPCODE_M) + +#define FW_TLSTX_DATA_WR_COMPL_S 21 +#define FW_TLSTX_DATA_WR_COMPL_M 0x1 +#define FW_TLSTX_DATA_WR_COMPL_V(x) ((x) << FW_TLSTX_DATA_WR_COMPL_S) +#define FW_TLSTX_DATA_WR_COMPL_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_COMPL_S) & FW_TLSTX_DATA_WR_COMPL_M) +#define FW_TLSTX_DATA_WR_COMPL_F FW_TLSTX_DATA_WR_COMPL_V(1U) + +#define FW_TLSTX_DATA_WR_IMMDLEN_S 0 +#define FW_TLSTX_DATA_WR_IMMDLEN_M 0xff +#define FW_TLSTX_DATA_WR_IMMDLEN_V(x) ((x) << FW_TLSTX_DATA_WR_IMMDLEN_S) +#define FW_TLSTX_DATA_WR_IMMDLEN_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_IMMDLEN_S) & FW_TLSTX_DATA_WR_IMMDLEN_M) + +#define FW_TLSTX_DATA_WR_FLOWID_S 8 +#define FW_TLSTX_DATA_WR_FLOWID_M 0xfffff +#define 
FW_TLSTX_DATA_WR_FLOWID_V(x) ((x) << FW_TLSTX_DATA_WR_FLOWID_S) +#define FW_TLSTX_DATA_WR_FLOWID_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_FLOWID_S) & FW_TLSTX_DATA_WR_FLOWID_M) + +#define FW_TLSTX_DATA_WR_LEN16_S 0 +#define FW_TLSTX_DATA_WR_LEN16_M 0xff +#define FW_TLSTX_DATA_WR_LEN16_V(x) ((x) << FW_TLSTX_DATA_WR_LEN16_S) +#define FW_TLSTX_DATA_WR_LEN16_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_LEN16_S) & FW_TLSTX_DATA_WR_LEN16_M) + +#define FW_TLSTX_DATA_WR_LSODISABLE_S 31 +#define FW_TLSTX_DATA_WR_LSODISABLE_M 0x1 +#define FW_TLSTX_DATA_WR_LSODISABLE_V(x) \ + ((x) << FW_TLSTX_DATA_WR_LSODISABLE_S) +#define FW_TLSTX_DATA_WR_LSODISABLE_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_LSODISABLE_S) & FW_TLSTX_DATA_WR_LSODISABLE_M) +#define FW_TLSTX_DATA_WR_LSODISABLE_F FW_TLSTX_DATA_WR_LSODISABLE_V(1U) + +#define FW_TLSTX_DATA_WR_ALIGNPLD_S 30 +#define FW_TLSTX_DATA_WR_ALIGNPLD_M 0x1 +#define FW_TLSTX_DATA_WR_ALIGNPLD_V(x) ((x) << FW_TLSTX_DATA_WR_ALIGNPLD_S) +#define FW_TLSTX_DATA_WR_ALIGNPLD_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_ALIGNPLD_S) & FW_TLSTX_DATA_WR_ALIGNPLD_M) +#define FW_TLSTX_DATA_WR_ALIGNPLD_F FW_TLSTX_DATA_WR_ALIGNPLD_V(1U) + +#define FW_TLSTX_DATA_WR_ALIGNPLDSHOVE_S 29 +#define FW_TLSTX_DATA_WR_ALIGNPLDSHOVE_M 0x1 +#define FW_TLSTX_DATA_WR_ALIGNPLDSHOVE_V(x) \ + ((x) << FW_TLSTX_DATA_WR_ALIGNPLDSHOVE_S) +#define FW_TLSTX_DATA_WR_ALIGNPLDSHOVE_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_ALIGNPLDSHOVE_S) & \ + FW_TLSTX_DATA_WR_ALIGNPLDSHOVE_M) +#define FW_TLSTX_DATA_WR_ALIGNPLDSHOVE_F FW_TLSTX_DATA_WR_ALIGNPLDSHOVE_V(1U) + +#define FW_TLSTX_DATA_WR_FLAGS_S 0 +#define FW_TLSTX_DATA_WR_FLAGS_M 0xfffffff +#define FW_TLSTX_DATA_WR_FLAGS_V(x) ((x) << FW_TLSTX_DATA_WR_FLAGS_S) +#define FW_TLSTX_DATA_WR_FLAGS_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_FLAGS_S) & FW_TLSTX_DATA_WR_FLAGS_M) + +#define FW_TLSTX_DATA_WR_CTXLOC_S 30 +#define FW_TLSTX_DATA_WR_CTXLOC_M 0x3 +#define FW_TLSTX_DATA_WR_CTXLOC_V(x) ((x) << FW_TLSTX_DATA_WR_CTXLOC_S) +#define FW_TLSTX_DATA_WR_CTXLOC_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_CTXLOC_S) & FW_TLSTX_DATA_WR_CTXLOC_M) + +#define FW_TLSTX_DATA_WR_IVDSGL_S 29 +#define FW_TLSTX_DATA_WR_IVDSGL_M 0x1 +#define FW_TLSTX_DATA_WR_IVDSGL_V(x) ((x) << FW_TLSTX_DATA_WR_IVDSGL_S) +#define FW_TLSTX_DATA_WR_IVDSGL_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_IVDSGL_S) & FW_TLSTX_DATA_WR_IVDSGL_M) +#define FW_TLSTX_DATA_WR_IVDSGL_F FW_TLSTX_DATA_WR_IVDSGL_V(1U) + +#define FW_TLSTX_DATA_WR_KEYSIZE_S 24 +#define FW_TLSTX_DATA_WR_KEYSIZE_M 0x1f +#define FW_TLSTX_DATA_WR_KEYSIZE_V(x) ((x) << FW_TLSTX_DATA_WR_KEYSIZE_S) +#define FW_TLSTX_DATA_WR_KEYSIZE_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_KEYSIZE_S) & FW_TLSTX_DATA_WR_KEYSIZE_M) + +#define FW_TLSTX_DATA_WR_NUMIVS_S 14 +#define FW_TLSTX_DATA_WR_NUMIVS_M 0xff +#define FW_TLSTX_DATA_WR_NUMIVS_V(x) ((x) << FW_TLSTX_DATA_WR_NUMIVS_S) +#define FW_TLSTX_DATA_WR_NUMIVS_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_NUMIVS_S) & FW_TLSTX_DATA_WR_NUMIVS_M) + +#define FW_TLSTX_DATA_WR_EXP_S 0 +#define FW_TLSTX_DATA_WR_EXP_M 0x3fff +#define FW_TLSTX_DATA_WR_EXP_V(x) ((x) << FW_TLSTX_DATA_WR_EXP_S) +#define FW_TLSTX_DATA_WR_EXP_G(x) \ + (((x) >> FW_TLSTX_DATA_WR_EXP_S) & FW_TLSTX_DATA_WR_EXP_M) + +#define FW_TLSTX_DATA_WR_ADJUSTEDPLEN_S 1 +#define FW_TLSTX_DATA_WR_ADJUSTEDPLEN_V(x) \ + ((x) << FW_TLSTX_DATA_WR_ADJUSTEDPLEN_S) + +#define FW_TLSTX_DATA_WR_EXPINPLENMAX_S 4 +#define FW_TLSTX_DATA_WR_EXPINPLENMAX_V(x) \ + ((x) << FW_TLSTX_DATA_WR_EXPINPLENMAX_S) + +#define FW_TLSTX_DATA_WR_PDUSINPLENMAX_S 2 +#define FW_TLSTX_DATA_WR_PDUSINPLENMAX_V(x) \ + ((x) << FW_TLSTX_DATA_WR_PDUSINPLENMAX_S) + +#endif /* 
_T4FW_INTERFACE_H_ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h new file mode 100644 index 0000000..123e2c1 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h @@ -0,0 +1,64 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __T4FW_VERSION_H__ +#define __T4FW_VERSION_H__ + +#define T4FW_VERSION_MAJOR 0x01 +#define T4FW_VERSION_MINOR 0x10 +#define T4FW_VERSION_MICRO 0x3F +#define T4FW_VERSION_BUILD 0x00 + +#define T4FW_MIN_VERSION_MAJOR 0x01 +#define T4FW_MIN_VERSION_MINOR 0x04 +#define T4FW_MIN_VERSION_MICRO 0x00 + +#define T5FW_VERSION_MAJOR 0x01 +#define T5FW_VERSION_MINOR 0x10 +#define T5FW_VERSION_MICRO 0x3F +#define T5FW_VERSION_BUILD 0x00 + +#define T5FW_MIN_VERSION_MAJOR 0x00 +#define T5FW_MIN_VERSION_MINOR 0x00 +#define T5FW_MIN_VERSION_MICRO 0x00 + +#define T6FW_VERSION_MAJOR 0x01 +#define T6FW_VERSION_MINOR 0x10 +#define T6FW_VERSION_MICRO 0x3F +#define T6FW_VERSION_BUILD 0x00 + +#define T6FW_MIN_VERSION_MAJOR 0x00 +#define T6FW_MIN_VERSION_MINOR 0x00 +#define T6FW_MIN_VERSION_MICRO 0x00 +#endif diff --git a/drivers/net/ethernet/emulex/Kconfig b/drivers/net/ethernet/emulex/Kconfig new file mode 100644 index 0000000..fdbb27c --- /dev/null +++ b/drivers/net/ethernet/emulex/Kconfig @@ -0,0 +1,21 @@ +# +# Emulex driver configuration +# + +config NET_VENDOR_EMULEX + bool "Emulex devices" + default y + depends on PCI + ---help--- + If you have a network (Ethernet) card belonging to this class, say Y. + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about Emulex cards. If you say Y, you will be asked for + your specific card in the following questions. 
+ +if NET_VENDOR_EMULEX + +source "drivers/net/ethernet/emulex/benet/Kconfig" + +endif # NET_VENDOR_EMULEX diff --git a/drivers/net/ethernet/emulex/Makefile b/drivers/net/ethernet/emulex/Makefile new file mode 100644 index 0000000..ea8ec57 --- /dev/null +++ b/drivers/net/ethernet/emulex/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the Emulex device drivers. +# + +obj-$(CONFIG_BE2NET) += benet/ diff --git a/drivers/net/ethernet/emulex/benet/Kconfig b/drivers/net/ethernet/emulex/benet/Kconfig new file mode 100644 index 0000000..b4853ec --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/Kconfig @@ -0,0 +1,15 @@ +config BE2NET + tristate "ServerEngines' 10Gbps NIC - BladeEngine" + depends on PCI + ---help--- + This driver implements the NIC functionality for ServerEngines' + 10Gbps network adapter - BladeEngine. + +config BE2NET_HWMON + bool "HWMON support for be2net driver" + depends on BE2NET && HWMON + depends on !(BE2NET=y && HWMON=m) + default y + ---help--- + Say Y here if you want to expose thermal sensor data on + be2net network adapter. diff --git a/drivers/net/ethernet/emulex/benet/Makefile b/drivers/net/ethernet/emulex/benet/Makefile new file mode 100644 index 0000000..1a91b27 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/Makefile @@ -0,0 +1,7 @@ +# +# Makefile to build the network driver for ServerEngine's BladeEngine. +# + +obj-$(CONFIG_BE2NET) += be2net.o + +be2net-y := be_main.o be_cmds.o be_ethtool.o be_roce.o diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h new file mode 100644 index 0000000..382891f --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -0,0 +1,1002 @@ +/* + * Copyright (C) 2005 - 2016 Broadcom + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. 
+ * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef BE_H +#define BE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "be_hw.h" +#include "be_roce.h" + +#define DRV_VER "11.4.0.0" +#define DRV_NAME "be2net" +#define BE_NAME "Emulex BladeEngine2" +#define BE3_NAME "Emulex BladeEngine3" +#define OC_NAME "Emulex OneConnect" +#define OC_NAME_BE OC_NAME "(be3)" +#define OC_NAME_LANCER OC_NAME "(Lancer)" +#define OC_NAME_SH OC_NAME "(Skyhawk)" +#define DRV_DESC "Emulex OneConnect NIC Driver" + +#define BE_VENDOR_ID 0x19a2 +#define EMULEX_VENDOR_ID 0x10df +#define BE_DEVICE_ID1 0x211 +#define BE_DEVICE_ID2 0x221 +#define OC_DEVICE_ID1 0x700 /* Device Id for BE2 cards */ +#define OC_DEVICE_ID2 0x710 /* Device Id for BE3 cards */ +#define OC_DEVICE_ID3 0xe220 /* Device id for Lancer cards */ +#define OC_DEVICE_ID4 0xe228 /* Device id for VF in Lancer */ +#define OC_DEVICE_ID5 0x720 /* Device Id for Skyhawk cards */ +#define OC_DEVICE_ID6 0x728 /* Device id for VF in SkyHawk */ +#define OC_SUBSYS_DEVICE_ID1 0xE602 +#define OC_SUBSYS_DEVICE_ID2 0xE642 +#define OC_SUBSYS_DEVICE_ID3 0xE612 +#define OC_SUBSYS_DEVICE_ID4 0xE652 + +/* Number of bytes of an RX frame that are copied to skb->data */ +#define BE_HDR_LEN ((u16) 64) +/* allocate extra space to allow tunneling decapsulation without head reallocation */ +#define BE_RX_SKB_ALLOC_SIZE 256 + +#define BE_MAX_JUMBO_FRAME_SIZE 9018 +#define BE_MIN_MTU 256 +#define BE_MAX_MTU (BE_MAX_JUMBO_FRAME_SIZE - \ + (ETH_HLEN + ETH_FCS_LEN)) + +/* Accommodate for QnQ configurations where VLAN insertion is enabled in HW */ +#define BE_MAX_GSO_SIZE (65535 - 2 * VLAN_HLEN) + +#define BE_NUM_VLANS_SUPPORTED 64 +#define BE_MAX_EQD 128u +#define BE_MAX_TX_FRAG_COUNT 30 + +#define EVNT_Q_LEN 1024 +#define TX_Q_LEN 2048 +#define TX_CQ_LEN 1024 +#define RX_Q_LEN 1024 /* Does not support any other value */ +#define RX_CQ_LEN 1024 +#define MCC_Q_LEN 128 /* total size not to exceed 8 pages */ +#define MCC_CQ_LEN 256 + +#define BE2_MAX_RSS_QS 4 +#define BE3_MAX_RSS_QS 16 +#define BE3_MAX_TX_QS 16 +#define BE3_MAX_EVT_QS 16 +#define BE3_SRIOV_MAX_EVT_QS 8 +#define SH_VF_MAX_NIC_EQS 3 /* Skyhawk VFs can have a max of 4 EQs + * and at least 1 is granted to either + * SURF/DPDK + */ + +#define MAX_PORT_RSS_TABLES 15 +#define MAX_NIC_FUNCS 16 +#define MAX_RX_QS 32 +#define MAX_EVT_QS 32 +#define MAX_TX_QS 32 + +#define MAX_ROCE_EQS 5 +#define MAX_MSIX_VECTORS 32 +#define MIN_MSIX_VECTORS 1 +#define BE_NAPI_WEIGHT 64 +#define MAX_RX_POST BE_NAPI_WEIGHT /* Frags posted at a time */ +#define RX_FRAGS_REFILL_WM (RX_Q_LEN - MAX_RX_POST) +#define MAX_NUM_POST_ERX_DB 255u + +#define MAX_VFS 30 /* Max VFs supported by BE3 FW */ +#define FW_VER_LEN 32 +#define CNTL_SERIAL_NUM_WORDS 8 /* Controller serial number words */ +#define CNTL_SERIAL_NUM_WORD_SZ (sizeof(u16)) /* Byte-sz of serial num word */ + +#define RSS_INDIR_TABLE_LEN 128 +#define RSS_HASH_KEY_LEN 40 + +#define BE_UNKNOWN_PHY_STATE 0xFF + +struct be_dma_mem { + void *va; + dma_addr_t dma; + u32 size; +}; + +struct be_queue_info { + u32 len; + u32 entry_size; /* Size of an element in the queue */ + u32 tail, head; + atomic_t used; /* Number of valid elements in the queue */ + u32 id; + struct be_dma_mem dma_mem; + bool created; +}; + +static inline u32 MODULO(u32 val, u32 limit) +{ + BUG_ON(limit & (limit - 1)); + return val & 
(limit - 1); +} + +static inline void index_adv(u32 *index, u32 val, u32 limit) +{ + *index = MODULO((*index + val), limit); +} + +static inline void index_inc(u32 *index, u32 limit) +{ + *index = MODULO((*index + 1), limit); +} + +static inline void *queue_head_node(struct be_queue_info *q) +{ + return q->dma_mem.va + q->head * q->entry_size; +} + +static inline void *queue_tail_node(struct be_queue_info *q) +{ + return q->dma_mem.va + q->tail * q->entry_size; +} + +static inline void *queue_index_node(struct be_queue_info *q, u16 index) +{ + return q->dma_mem.va + index * q->entry_size; +} + +static inline void queue_head_inc(struct be_queue_info *q) +{ + index_inc(&q->head, q->len); +} + +static inline void index_dec(u32 *index, u32 limit) +{ + *index = MODULO((*index - 1), limit); +} + +static inline void queue_tail_inc(struct be_queue_info *q) +{ + index_inc(&q->tail, q->len); +} + +struct be_eq_obj { + struct be_queue_info q; + char desc[32]; + + /* Adaptive interrupt coalescing (AIC) info */ + bool enable_aic; + u32 min_eqd; /* in usecs */ + u32 max_eqd; /* in usecs */ + u32 eqd; /* configured val when aic is off */ + u32 cur_eqd; /* in usecs */ + + u8 idx; /* array index */ + u8 msix_idx; + u16 spurious_intr; + struct napi_struct napi; + struct be_adapter *adapter; + cpumask_var_t affinity_mask; + +#ifdef CONFIG_NET_RX_BUSY_POLL +#define BE_EQ_IDLE 0 +#define BE_EQ_NAPI 1 /* napi owns this EQ */ +#define BE_EQ_POLL 2 /* poll owns this EQ */ +#define BE_EQ_LOCKED (BE_EQ_NAPI | BE_EQ_POLL) +#define BE_EQ_NAPI_YIELD 4 /* napi yielded this EQ */ +#define BE_EQ_POLL_YIELD 8 /* poll yielded this EQ */ +#define BE_EQ_YIELD (BE_EQ_NAPI_YIELD | BE_EQ_POLL_YIELD) +#define BE_EQ_USER_PEND (BE_EQ_POLL | BE_EQ_POLL_YIELD) + unsigned int state; + spinlock_t lock; /* lock to serialize napi and busy-poll */ +#endif /* CONFIG_NET_RX_BUSY_POLL */ +} ____cacheline_aligned_in_smp; + +struct be_aic_obj { /* Adaptive interrupt coalescing (AIC) info */ + bool enable; + u32 min_eqd; /* in usecs */ + u32 max_eqd; /* in usecs */ + u32 prev_eqd; /* in usecs */ + u32 et_eqd; /* configured val when aic is off */ + ulong jiffies; + u64 rx_pkts_prev; /* Used to calculate RX pps */ + u64 tx_reqs_prev; /* Used to calculate TX pps */ +}; + +struct be_mcc_obj { + struct be_queue_info q; + struct be_queue_info cq; + bool rearm_cq; +}; + +struct be_tx_stats { + u64 tx_bytes; + u64 tx_pkts; + u64 tx_vxlan_offload_pkts; + u64 tx_reqs; + u64 tx_compl; + ulong tx_jiffies; + u32 tx_stops; + u32 tx_drv_drops; /* pkts dropped by driver */ + /* the error counters are described in be_ethtool.c */ + u32 tx_hdr_parse_err; + u32 tx_dma_err; + u32 tx_tso_err; + u32 tx_spoof_check_err; + u32 tx_qinq_err; + u32 tx_internal_parity_err; + u32 tx_sge_err; + struct u64_stats_sync sync; + struct u64_stats_sync sync_compl; +}; + +/* Structure to hold some data of interest obtained from a TX CQE */ +struct be_tx_compl_info { + u8 status; /* Completion status */ + u16 end_index; /* Completed TXQ Index */ +}; + +struct be_tx_obj { + u32 db_offset; + struct be_queue_info q; + struct be_queue_info cq; + struct be_tx_compl_info txcp; + /* Remember the skbs that were transmitted */ + struct sk_buff *sent_skb_list[TX_Q_LEN]; + struct be_tx_stats stats; + u16 pend_wrb_cnt; /* Number of WRBs yet to be given to HW */ + u16 last_req_wrb_cnt; /* wrb cnt of the last req in the Q */ + u16 last_req_hdr; /* index of the last req's hdr-wrb */ +} ____cacheline_aligned_in_smp; + +/* Struct to remember the pages posted for rx frags */ +struct be_rx_page_info 
{ + struct page *page; + /* set to page-addr for last frag of the page & frag-addr otherwise */ + DEFINE_DMA_UNMAP_ADDR(bus); + u16 page_offset; + bool last_frag; /* last frag of the page */ +}; + +struct be_rx_stats { + u64 rx_bytes; + u64 rx_pkts; + u64 rx_vxlan_offload_pkts; + u32 rx_drops_no_skbs; /* skb allocation errors */ + u32 rx_drops_no_frags; /* HW has no fetched frags */ + u32 rx_post_fail; /* page post alloc failures */ + u32 rx_compl; + u32 rx_mcast_pkts; + u32 rx_compl_err; /* completions with err set */ + struct u64_stats_sync sync; +}; + +struct be_rx_compl_info { + u32 rss_hash; + u16 vlan_tag; + u16 pkt_size; + u16 port; + u8 vlanf; + u8 num_rcvd; + u8 err; + u8 ipf; + u8 tcpf; + u8 udpf; + u8 ip_csum; + u8 l4_csum; + u8 ipv6; + u8 qnq; + u8 pkt_type; + u8 ip_frag; + u8 tunneled; +}; + +struct be_rx_obj { + struct be_adapter *adapter; + struct be_queue_info q; + struct be_queue_info cq; + struct be_rx_compl_info rxcp; + struct be_rx_page_info page_info_tbl[RX_Q_LEN]; + struct be_rx_stats stats; + u8 rss_id; + bool rx_post_starved; /* Zero rx frags have been posted to BE */ +} ____cacheline_aligned_in_smp; + +struct be_drv_stats { + u32 eth_red_drops; + u32 dma_map_errors; + u32 rx_drops_no_pbuf; + u32 rx_drops_no_txpb; + u32 rx_drops_no_erx_descr; + u32 rx_drops_no_tpre_descr; + u32 rx_drops_too_many_frags; + u32 forwarded_packets; + u32 rx_drops_mtu; + u32 rx_crc_errors; + u32 rx_alignment_symbol_errors; + u32 rx_pause_frames; + u32 rx_priority_pause_frames; + u32 rx_control_frames; + u32 rx_in_range_errors; + u32 rx_out_range_errors; + u32 rx_frame_too_long; + u32 rx_address_filtered; + u32 rx_dropped_too_small; + u32 rx_dropped_too_short; + u32 rx_dropped_header_too_small; + u32 rx_dropped_tcp_length; + u32 rx_dropped_runt; + u32 rx_ip_checksum_errs; + u32 rx_tcp_checksum_errs; + u32 rx_udp_checksum_errs; + u32 tx_pauseframes; + u32 tx_priority_pauseframes; + u32 tx_controlframes; + u32 rxpp_fifo_overflow_drop; + u32 rx_input_fifo_overflow_drop; + u32 pmem_fifo_overflow_drop; + u32 jabber_events; + u32 rx_roce_bytes_lsd; + u32 rx_roce_bytes_msd; + u32 rx_roce_frames; + u32 roce_drops_payload_len; + u32 roce_drops_crc; +}; + +/* A vlan-id of 0xFFFF must be used to clear transparent vlan-tagging */ +#define BE_RESET_VLAN_TAG_ID 0xFFFF + +struct be_vf_cfg { + unsigned char mac_addr[ETH_ALEN]; + int if_handle; + int pmac_id; + u16 vlan_tag; + u32 tx_rate; + u32 plink_tracking; + u32 privileges; + bool spoofchk; +}; + +enum vf_state { + ENABLED = 0, + ASSIGNED = 1 +}; + +#define BE_FLAGS_LINK_STATUS_INIT BIT(1) +#define BE_FLAGS_SRIOV_ENABLED BIT(2) +#define BE_FLAGS_WORKER_SCHEDULED BIT(3) +#define BE_FLAGS_NAPI_ENABLED BIT(6) +#define BE_FLAGS_QNQ_ASYNC_EVT_RCVD BIT(7) +#define BE_FLAGS_VXLAN_OFFLOADS BIT(8) +#define BE_FLAGS_SETUP_DONE BIT(9) +#define BE_FLAGS_PHY_MISCONFIGURED BIT(10) +#define BE_FLAGS_ERR_DETECTION_SCHEDULED BIT(11) +#define BE_FLAGS_OS2BMC BIT(12) +#define BE_FLAGS_TRY_RECOVERY BIT(13) + +#define BE_UC_PMAC_COUNT 30 +#define BE_VF_UC_PMAC_COUNT 2 + +#define MAX_ERR_RECOVERY_RETRY_COUNT 3 +#define ERR_DETECTION_DELAY 1000 + +/* Ethtool set_dump flags */ +#define LANCER_INITIATE_FW_DUMP 0x1 +#define LANCER_DELETE_FW_DUMP 0x2 + +struct phy_info { +/* From SFF-8472 spec */ +#define SFP_VENDOR_NAME_LEN 17 + u8 transceiver; + u8 autoneg; + u8 fc_autoneg; + u8 port_type; + u16 phy_type; + u16 interface_type; + u32 misc_params; + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + int link_speed; + u32 advertising; + u32 supported; + u8 cable_type; + 
u8 vendor_name[SFP_VENDOR_NAME_LEN]; + u8 vendor_pn[SFP_VENDOR_NAME_LEN]; +}; + +struct be_resources { + u16 max_vfs; /* Total VFs "really" supported by FW/HW */ + u16 max_mcast_mac; + u16 max_tx_qs; + u16 max_rss_qs; + u16 max_rx_qs; + u16 max_cq_count; + u16 max_uc_mac; /* Max UC MACs programmable */ + u16 max_vlans; /* Number of vlans supported */ + u16 max_iface_count; + u16 max_mcc_count; + u16 max_evt_qs; + u16 max_nic_evt_qs; /* NIC's share of evt qs */ + u32 if_cap_flags; + u32 vf_if_cap_flags; /* VF if capability flags */ + u32 flags; + /* Calculated PF Pool's share of RSS Tables. This is not enforced by + * the FW, but is a self-imposed driver limitation. + */ + u16 max_rss_tables; +}; + +/* These are port-wide values */ +struct be_port_resources { + u16 max_vfs; + u16 nic_pfs; +}; + +#define be_is_os2bmc_enabled(adapter) (adapter->flags & BE_FLAGS_OS2BMC) + +struct rss_info { + u64 rss_flags; + u8 rsstable[RSS_INDIR_TABLE_LEN]; + u8 rss_queue[RSS_INDIR_TABLE_LEN]; + u8 rss_hkey[RSS_HASH_KEY_LEN]; +}; + +#define BE_INVALID_DIE_TEMP 0xFF +struct be_hwmon { + struct device *hwmon_dev; + u8 be_on_die_temp; /* Unit: millidegree Celsius */ +}; + +/* Macros to read/write the 'features' word of be_wrb_params structure. + */ +#define BE_WRB_F_BIT(name) BE_WRB_F_##name##_BIT +#define BE_WRB_F_MASK(name) BIT_MASK(BE_WRB_F_##name##_BIT) + +#define BE_WRB_F_GET(word, name) \ + (((word) & (BE_WRB_F_MASK(name))) >> BE_WRB_F_BIT(name)) + +#define BE_WRB_F_SET(word, name, val) \ + ((word) |= (((val) << BE_WRB_F_BIT(name)) & BE_WRB_F_MASK(name))) + +/* Feature/offload bits */ +enum { + BE_WRB_F_CRC_BIT, /* Ethernet CRC */ + BE_WRB_F_IPCS_BIT, /* IP csum */ + BE_WRB_F_TCPCS_BIT, /* TCP csum */ + BE_WRB_F_UDPCS_BIT, /* UDP csum */ + BE_WRB_F_LSO_BIT, /* LSO */ + BE_WRB_F_LSO6_BIT, /* LSO6 */ + BE_WRB_F_VLAN_BIT, /* VLAN */ + BE_WRB_F_VLAN_SKIP_HW_BIT, /* Skip VLAN tag (workaround) */ + BE_WRB_F_OS2BMC_BIT /* Send packet to the management ring */ +}; + +/* The structure below provides a HW-agnostic abstraction of WRB params + * retrieved from a TX skb. This is in turn passed to chip specific routines + * during transmit, to set the corresponding params in the WRB. + */ +struct be_wrb_params { + u32 features; /* Feature bits */ + u16 vlan_tag; /* VLAN tag */ + u16 lso_mss; /* MSS for LSO */ +}; + +struct be_eth_addr { + unsigned char mac[ETH_ALEN]; +}; + +#define BE_SEC 1000 /* in msec */ +#define BE_MIN (60 * BE_SEC) /* in msec */ +#define BE_HOUR (60 * BE_MIN) /* in msec */ + +#define ERR_RECOVERY_MAX_RETRY_COUNT 3 +#define ERR_RECOVERY_DETECTION_DELAY BE_SEC +#define ERR_RECOVERY_RETRY_DELAY (30 * BE_SEC) + +/* UE-detection-duration in BEx/Skyhawk: + * All PFs must wait for this duration after they detect UE before reading + * SLIPORT_SEMAPHORE register. At the end of this duration, the Firmware + * guarantees that the SLIPORT_SEMAPHORE register is updated to indicate + * if the UE is recoverable. + */ +#define ERR_RECOVERY_UE_DETECT_DURATION BE_SEC + +/* Initial idle time (in msec) to elapse after driver load, + * before UE recovery is allowed. 
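+ * With ERR_IDLE_HR = 24 (defined below) and BE_HOUR defined above as 60 * 60 * 1000 msec, this works out to 24 hours after the driver is loaded.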
+ */ +#define ERR_IDLE_HR 24 +#define ERR_RECOVERY_IDLE_TIME (ERR_IDLE_HR * BE_HOUR) + +/* Time interval (in msec) after which UE recovery can be repeated */ +#define ERR_INTERVAL_HR 72 +#define ERR_RECOVERY_INTERVAL (ERR_INTERVAL_HR * BE_HOUR) + +/* BEx/SH UE recovery state machine */ +enum { + ERR_RECOVERY_ST_NONE = 0, /* No Recovery */ + ERR_RECOVERY_ST_DETECT = 1, /* UE detection duration */ + ERR_RECOVERY_ST_RESET = 2, /* Reset Phase (PF0 only) */ + ERR_RECOVERY_ST_PRE_POLL = 3, /* Pre-Poll Phase (all PFs) */ + ERR_RECOVERY_ST_REINIT = 4 /* Re-initialize Phase */ +}; + +struct be_error_recovery { + /* Lancer error recovery variables */ + u8 recovery_retries; + + /* BEx/Skyhawk error recovery variables */ + u8 recovery_state; + u16 ue_to_reset_time; /* Time after UE, to soft reset + * the chip - PF0 only + */ + u16 ue_to_poll_time; /* Time after UE, to Restart Polling + * of SLIPORT_SEMAPHORE reg + */ + u16 last_err_code; + bool recovery_supported; + unsigned long probe_time; + unsigned long last_recovery_time; + + /* Common to both Lancer & BEx/SH error recovery */ + u32 resched_delay; + struct delayed_work err_detection_work; +}; + +/* Ethtool priv_flags */ +#define BE_DISABLE_TPE_RECOVERY 0x1 + +struct be_vxlan_port { + struct list_head list; + __be16 port; /* VxLAN UDP dst port */ + int port_aliases; /* alias count */ +}; + +struct be_adapter { + struct pci_dev *pdev; + struct net_device *netdev; + + u8 __iomem *csr; /* CSR BAR used only for BE2/3 */ + u8 __iomem *db; /* Door Bell */ + u8 __iomem *pcicfg; /* On SH,BEx only. Shadow of PCI config space */ + + struct mutex mbox_lock; /* For serializing mbox cmds to BE card */ + struct be_dma_mem mbox_mem; + /* Mbox mem is adjusted to align to 16 bytes. The allocated addr + * is stored for freeing purpose */ + struct be_dma_mem mbox_mem_alloced; + + struct be_mcc_obj mcc_obj; + struct mutex mcc_lock; /* For serializing mcc cmds to BE card */ + spinlock_t mcc_cq_lock; + + u16 cfg_num_rx_irqs; /* configured via set-channels */ + u16 cfg_num_tx_irqs; /* configured via set-channels */ + u16 num_evt_qs; + u16 num_msix_vec; + struct be_eq_obj eq_obj[MAX_EVT_QS]; + struct msix_entry msix_entries[MAX_MSIX_VECTORS]; + bool isr_registered; + + /* TX Rings */ + u16 num_tx_qs; + struct be_tx_obj tx_obj[MAX_TX_QS]; + + /* Rx rings */ + u16 num_rx_qs; + u16 num_rss_qs; + u16 need_def_rxq; + struct be_rx_obj rx_obj[MAX_RX_QS]; + u32 big_page_size; /* Compounded page size shared by rx wrbs */ + + struct be_drv_stats drv_stats; + struct be_aic_obj aic_obj[MAX_EVT_QS]; + u8 vlan_prio_bmap; /* Available Priority BitMap */ + u16 recommended_prio_bits;/* Recommended Priority bits in vlan tag */ + struct be_dma_mem rx_filter; /* Cmd DMA mem for rx-filter */ + + struct be_dma_mem stats_cmd; + /* Work queue used to perform periodic tasks like getting statistics */ + struct delayed_work work; + u16 work_counter; + + u8 recovery_retries; + u8 err_flags; + bool pcicfg_mapped; /* pcicfg obtained via pci_iomap() */ + u32 flags; + u32 cmd_privileges; + /* Ethtool knobs and info */ + char fw_ver[FW_VER_LEN]; + char fw_on_flash[FW_VER_LEN]; + + /* IFACE filtering fields */ + int if_handle; /* Used to configure filtering */ + u32 if_flags; /* Interface filtering flags */ + u32 *pmac_id; /* MAC addr handle used by BE card */ + struct be_eth_addr *uc_list;/* list of uc-addrs programmed (not perm) */ + u32 uc_macs; /* Count of secondary UC MAC programmed */ + struct be_eth_addr *mc_list;/* list of mcast addrs programmed */ + u32 mc_count; + unsigned long 
vids[BITS_TO_LONGS(VLAN_N_VID)]; + u16 vlans_added; + bool update_uc_list; + bool update_mc_list; + struct mutex rx_filter_lock;/* For protecting vids[] & mc/uc_list[] */ + + u32 beacon_state; /* for set_phys_id */ + + u32 port_num; + char port_name; + u8 mc_type; + u32 function_mode; + u32 function_caps; + u32 rx_fc; /* Rx flow control */ + u32 tx_fc; /* Tx flow control */ + bool stats_cmd_sent; + struct { + u32 size; + u32 total_size; + u64 io_addr; + } roce_db; + u32 num_msix_roce_vec; + struct ocrdma_dev *ocrdma_dev; + struct list_head entry; + + u32 flash_status; + struct completion et_cmd_compl; + + struct be_resources pool_res; /* resources available for the port */ + struct be_resources res; /* resources available for the func */ + u16 num_vfs; /* Number of VFs provisioned by PF */ + u8 pf_num; /* Numbering used by FW, starts at 0 */ + u8 vf_num; /* Numbering used by FW, starts at 1 */ + u8 virtfn; + struct be_vf_cfg *vf_cfg; + bool be3_native; + u32 sli_family; + u8 hba_port_num; + u16 pvid; + __be16 vxlan_port; /* offloaded vxlan port num */ + int vxlan_port_count; /* active vxlan port count */ + struct list_head vxlan_port_list; /* vxlan port list */ + struct phy_info phy; + u8 wol_cap; + bool wol_en; + u16 asic_rev; + u16 qnq_vid; + u32 msg_enable; + int be_get_temp_freq; + struct be_hwmon hwmon_info; + struct rss_info rss_info; + /* Filters for packets that need to be sent to BMC */ + u32 bmc_filt_mask; + u32 fat_dump_len; + u16 serial_num[CNTL_SERIAL_NUM_WORDS]; + u8 phy_state; /* state of sfp optics (functional, faulted, etc.) */ + u8 dev_mac[ETH_ALEN]; + u32 priv_flags; /* ethtool get/set_priv_flags() */ + struct be_error_recovery error_recovery; +}; + +/* Used for deferred FW config cmds. Add fields to this struct as required */ +struct be_cmd_work { + struct work_struct work; + struct be_adapter *adapter; + union { + __be16 vxlan_port; + } info; +}; + +#define be_physfn(adapter) (!adapter->virtfn) +#define be_virtfn(adapter) (adapter->virtfn) +#define sriov_enabled(adapter) (adapter->flags & \ + BE_FLAGS_SRIOV_ENABLED) + +#define for_all_vfs(adapter, vf_cfg, i) \ + for (i = 0, vf_cfg = &adapter->vf_cfg[i]; i < adapter->num_vfs; \ + i++, vf_cfg++) + +#define ON 1 +#define OFF 0 + +#define be_max_vlans(adapter) (adapter->res.max_vlans) +#define be_max_uc(adapter) (adapter->res.max_uc_mac) +#define be_max_mc(adapter) (adapter->res.max_mcast_mac) +#define be_max_vfs(adapter) (adapter->pool_res.max_vfs) +#define be_max_rss(adapter) (adapter->res.max_rss_qs) +#define be_max_txqs(adapter) (adapter->res.max_tx_qs) +#define be_max_prio_txqs(adapter) (adapter->res.max_prio_tx_qs) +#define be_max_rxqs(adapter) (adapter->res.max_rx_qs) +/* Max number of EQs available for the function (NIC + RoCE (if enabled)) */ +#define be_max_func_eqs(adapter) (adapter->res.max_evt_qs) +/* Max number of EQs available only for NIC */ +#define be_max_nic_eqs(adapter) (adapter->res.max_nic_evt_qs) +#define be_if_cap_flags(adapter) (adapter->res.if_cap_flags) +#define be_max_pf_pool_rss_tables(adapter) \ + (adapter->pool_res.max_rss_tables) +/* Max irqs available for NIC */ +#define be_max_irqs(adapter) \ + (min_t(u16, be_max_nic_eqs(adapter), num_online_cpus())) + +/* Max irqs *needed* for RX queues */ +static inline u16 be_max_rx_irqs(struct be_adapter *adapter) +{ + /* If no RSS, need at least one irq for def-RXQ */ + u16 num = max_t(u16, be_max_rss(adapter), 1); + + return min_t(u16, num, be_max_irqs(adapter)); +} + +/* Max irqs *needed* for TX queues */ +static inline u16
be_max_tx_irqs(struct be_adapter *adapter) +{ + return min_t(u16, be_max_txqs(adapter), be_max_irqs(adapter)); +} + +/* Max irqs *needed* for combined queues */ +static inline u16 be_max_qp_irqs(struct be_adapter *adapter) +{ + return min(be_max_tx_irqs(adapter), be_max_rx_irqs(adapter)); +} + +/* Max irqs *needed* for RX and TX queues together */ +static inline u16 be_max_any_irqs(struct be_adapter *adapter) +{ + return max(be_max_tx_irqs(adapter), be_max_rx_irqs(adapter)); +} + +/* Is BE in pvid_tagging mode */ +#define be_pvid_tagging_enabled(adapter) (adapter->pvid) + +/* Is BE in QNQ multi-channel mode */ +#define be_is_qnq_mode(adapter) (adapter->function_mode & QNQ_MODE) + +#define lancer_chip(adapter) (adapter->pdev->device == OC_DEVICE_ID3 || \ + adapter->pdev->device == OC_DEVICE_ID4) + +#define skyhawk_chip(adapter) (adapter->pdev->device == OC_DEVICE_ID5 || \ + adapter->pdev->device == OC_DEVICE_ID6) + +#define BE3_chip(adapter) (adapter->pdev->device == BE_DEVICE_ID2 || \ + adapter->pdev->device == OC_DEVICE_ID2) + +#define BE2_chip(adapter) (adapter->pdev->device == BE_DEVICE_ID1 || \ + adapter->pdev->device == OC_DEVICE_ID1) + +#define BEx_chip(adapter) (BE3_chip(adapter) || BE2_chip(adapter)) + +#define be_roce_supported(adapter) (skyhawk_chip(adapter) && \ + (adapter->function_mode & RDMA_ENABLED)) + +extern const struct ethtool_ops be_ethtool_ops; + +#define msix_enabled(adapter) (adapter->num_msix_vec > 0) +#define num_irqs(adapter) (msix_enabled(adapter) ? \ + adapter->num_msix_vec : 1) +#define tx_stats(txo) (&(txo)->stats) +#define rx_stats(rxo) (&(rxo)->stats) + +/* The default RXQ is the last RXQ */ +#define default_rxo(adpt) (&adpt->rx_obj[adpt->num_rx_qs - 1]) + +#define for_all_rx_queues(adapter, rxo, i) \ + for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rx_qs; \ + i++, rxo++) + +#define for_all_rss_queues(adapter, rxo, i) \ + for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rss_qs; \ + i++, rxo++) + +#define for_all_tx_queues(adapter, txo, i) \ + for (i = 0, txo = &adapter->tx_obj[i]; i < adapter->num_tx_qs; \ + i++, txo++) + +#define for_all_evt_queues(adapter, eqo, i) \ + for (i = 0, eqo = &adapter->eq_obj[i]; i < adapter->num_evt_qs; \ + i++, eqo++) + +#define for_all_rx_queues_on_eq(adapter, eqo, rxo, i) \ + for (i = eqo->idx, rxo = &adapter->rx_obj[i]; i < adapter->num_rx_qs;\ + i += adapter->num_evt_qs, rxo += adapter->num_evt_qs) + +#define for_all_tx_queues_on_eq(adapter, eqo, txo, i) \ + for (i = eqo->idx, txo = &adapter->tx_obj[i]; i < adapter->num_tx_qs;\ + i += adapter->num_evt_qs, txo += adapter->num_evt_qs) + +#define is_mcc_eqo(eqo) (eqo->idx == 0) +#define mcc_eqo(adapter) (&adapter->eq_obj[0]) + +#define PAGE_SHIFT_4K 12 +#define PAGE_SIZE_4K (1 << PAGE_SHIFT_4K) + +/* Returns number of pages spanned by the data starting at the given addr */ +#define PAGES_4K_SPANNED(_address, size) \ + ((u32)((((size_t)(_address) & (PAGE_SIZE_4K - 1)) + \ + (size) + (PAGE_SIZE_4K - 1)) >> PAGE_SHIFT_4K)) + +/* Returns bit offset within a DWORD of a bitfield */ +#define AMAP_BIT_OFFSET(_struct, field) \ + (((size_t)&(((_struct *)0)->field))%32) + +/* Returns the bit mask of the field that is NOT shifted into location. */ +static inline u32 amap_mask(u32 bitsize) +{ + return (bitsize == 32 ? 
0xFFFFFFFF : (1 << bitsize) - 1); +} + +static inline void +amap_set(void *ptr, u32 dw_offset, u32 mask, u32 offset, u32 value) +{ + u32 *dw = (u32 *) ptr + dw_offset; + *dw &= ~(mask << offset); + *dw |= (mask & value) << offset; +} + +#define AMAP_SET_BITS(_struct, field, ptr, val) \ + amap_set(ptr, \ + offsetof(_struct, field)/32, \ + amap_mask(sizeof(((_struct *)0)->field)), \ + AMAP_BIT_OFFSET(_struct, field), \ + val) + +static inline u32 amap_get(void *ptr, u32 dw_offset, u32 mask, u32 offset) +{ + u32 *dw = (u32 *) ptr; + return mask & (*(dw + dw_offset) >> offset); +} + +#define AMAP_GET_BITS(_struct, field, ptr) \ + amap_get(ptr, \ + offsetof(_struct, field)/32, \ + amap_mask(sizeof(((_struct *)0)->field)), \ + AMAP_BIT_OFFSET(_struct, field)) + +#define GET_RX_COMPL_V0_BITS(field, ptr) \ + AMAP_GET_BITS(struct amap_eth_rx_compl_v0, field, ptr) + +#define GET_RX_COMPL_V1_BITS(field, ptr) \ + AMAP_GET_BITS(struct amap_eth_rx_compl_v1, field, ptr) + +#define GET_TX_COMPL_BITS(field, ptr) \ + AMAP_GET_BITS(struct amap_eth_tx_compl, field, ptr) + +#define SET_TX_WRB_HDR_BITS(field, ptr, val) \ + AMAP_SET_BITS(struct amap_eth_hdr_wrb, field, ptr, val) + +#define be_dws_cpu_to_le(wrb, len) swap_dws(wrb, len) +#define be_dws_le_to_cpu(wrb, len) swap_dws(wrb, len) +static inline void swap_dws(void *wrb, int len) +{ +#ifdef __BIG_ENDIAN + u32 *dw = wrb; + BUG_ON(len % 4); + do { + *dw = cpu_to_le32(*dw); + dw++; + len -= 4; + } while (len); +#endif /* __BIG_ENDIAN */ +} + +#define be_cmd_status(status) (status > 0 ? -EIO : status) + +static inline u8 is_tcp_pkt(struct sk_buff *skb) +{ + u8 val = 0; + + if (ip_hdr(skb)->version == 4) + val = (ip_hdr(skb)->protocol == IPPROTO_TCP); + else if (ip_hdr(skb)->version == 6) + val = (ipv6_hdr(skb)->nexthdr == NEXTHDR_TCP); + + return val; +} + +static inline u8 is_udp_pkt(struct sk_buff *skb) +{ + u8 val = 0; + + if (ip_hdr(skb)->version == 4) + val = (ip_hdr(skb)->protocol == IPPROTO_UDP); + else if (ip_hdr(skb)->version == 6) + val = (ipv6_hdr(skb)->nexthdr == NEXTHDR_UDP); + + return val; +} + +static inline bool is_ipv4_pkt(struct sk_buff *skb) +{ + return skb->protocol == htons(ETH_P_IP) && ip_hdr(skb)->version == 4; +} + +static inline bool is_ipv6_ext_hdr(struct sk_buff *skb) +{ + if (ip_hdr(skb)->version == 6) + return ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr); + else + return false; +} + +#define be_error_recovering(adapter) \ + (adapter->flags & BE_FLAGS_TRY_RECOVERY) + +#define BE_ERROR_EEH 1 +#define BE_ERROR_UE BIT(1) +#define BE_ERROR_FW BIT(2) +#define BE_ERROR_TX BIT(3) +#define BE_ERROR_HW (BE_ERROR_EEH | BE_ERROR_UE | BE_ERROR_TX) +#define BE_ERROR_ANY (BE_ERROR_EEH | BE_ERROR_UE | BE_ERROR_FW | \ + BE_ERROR_TX) +#define BE_CLEAR_ALL 0xFF + +static inline u8 be_check_error(struct be_adapter *adapter, u32 err_type) +{ + return (adapter->err_flags & err_type); +} + +static inline void be_set_error(struct be_adapter *adapter, int err_type) +{ + struct net_device *netdev = adapter->netdev; + + adapter->err_flags |= err_type; + netif_carrier_off(netdev); + + dev_info(&adapter->pdev->dev, "%s: Link down\n", netdev->name); +} + +static inline void be_clear_error(struct be_adapter *adapter, int err_type) +{ + adapter->err_flags &= ~err_type; +} + +static inline bool be_multi_rxq(const struct be_adapter *adapter) +{ + return adapter->num_rx_qs > 1; +} + +void be_cq_notify(struct be_adapter *adapter, u16 qid, bool arm, + u16 num_popped); +void be_link_status_update(struct be_adapter *adapter, u8 link_status); +void be_parse_stats(struct 
be_adapter *adapter); +int be_load_fw(struct be_adapter *adapter, u8 *func); +bool be_is_wol_supported(struct be_adapter *adapter); +bool be_pause_supported(struct be_adapter *adapter); +u32 be_get_fw_log_level(struct be_adapter *adapter); +int be_update_queues(struct be_adapter *adapter); +int be_poll(struct napi_struct *napi, int budget); +void be_eqd_update(struct be_adapter *adapter, bool force_update); + +/* + * Internal functions to initialize/clean up the RoCE device. + */ +void be_roce_dev_add(struct be_adapter *); +void be_roce_dev_remove(struct be_adapter *); + +/* + * Internal function to open/close the RoCE device during ifup/ifdown. + */ +void be_roce_dev_shutdown(struct be_adapter *); + +#endif /* BE_H */ diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c new file mode 100644 index 0000000..ff92ab1 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -0,0 +1,5085 @@ +/* + * Copyright (C) 2005 - 2016 Broadcom + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include <linux/module.h> +#include "be.h" +#include "be_cmds.h" + +const char * const be_misconfig_evt_port_state[] = { + "Physical Link is functional", + "Optics faulted/incorrectly installed/not installed - Reseat optics. If issue not resolved, replace.", + "Optics of two types installed – Remove one optic or install matching pair of optics.", + "Incompatible optics – Replace with compatible optics for card to function.", + "Unqualified optics – Replace with Avago optics for Warranty and Technical Support.", + "Uncertified optics – Replace with Avago-certified optics to enable link operation."
+}; + +static char *be_port_misconfig_evt_severity[] = { + "KERN_WARN", + "KERN_INFO", + "KERN_ERR", + "KERN_WARN" +}; + +static char *phy_state_oper_desc[] = { + "Link is non-operational", + "Link is operational", + "" +}; + +static struct be_cmd_priv_map cmd_priv_map[] = { + { + OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG, + CMD_SUBSYSTEM_ETH, + BE_PRIV_LNKMGMT | BE_PRIV_VHADM | + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_COMMON_GET_FLOW_CONTROL, + CMD_SUBSYSTEM_COMMON, + BE_PRIV_LNKQUERY | BE_PRIV_VHADM | + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_COMMON_SET_FLOW_CONTROL, + CMD_SUBSYSTEM_COMMON, + BE_PRIV_LNKMGMT | BE_PRIV_VHADM | + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_ETH_GET_PPORT_STATS, + CMD_SUBSYSTEM_ETH, + BE_PRIV_LNKMGMT | BE_PRIV_VHADM | + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_COMMON_GET_PHY_DETAILS, + CMD_SUBSYSTEM_COMMON, + BE_PRIV_LNKMGMT | BE_PRIV_VHADM | + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_LOWLEVEL_HOST_DDR_DMA, + CMD_SUBSYSTEM_LOWLEVEL, + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_LOWLEVEL_LOOPBACK_TEST, + CMD_SUBSYSTEM_LOWLEVEL, + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_LOWLEVEL_SET_LOOPBACK_MODE, + CMD_SUBSYSTEM_LOWLEVEL, + BE_PRIV_DEVCFG | BE_PRIV_DEVSEC + }, + { + OPCODE_COMMON_SET_HSW_CONFIG, + CMD_SUBSYSTEM_COMMON, + BE_PRIV_DEVCFG | BE_PRIV_VHADM | + BE_PRIV_DEVSEC + }, + { + OPCODE_COMMON_GET_EXT_FAT_CAPABILITIES, + CMD_SUBSYSTEM_COMMON, + BE_PRIV_DEVCFG + } +}; + +static bool be_cmd_allowed(struct be_adapter *adapter, u8 opcode, u8 subsystem) +{ + int i; + int num_entries = ARRAY_SIZE(cmd_priv_map); + u32 cmd_privileges = adapter->cmd_privileges; + + for (i = 0; i < num_entries; i++) + if (opcode == cmd_priv_map[i].opcode && + subsystem == cmd_priv_map[i].subsystem) + if (!(cmd_privileges & cmd_priv_map[i].priv_mask)) + return false; + + return true; +} + +static inline void *embedded_payload(struct be_mcc_wrb *wrb) +{ + return wrb->payload.embedded_payload; +} + +static int be_mcc_notify(struct be_adapter *adapter) +{ + struct be_queue_info *mccq = &adapter->mcc_obj.q; + u32 val = 0; + + if (be_check_error(adapter, BE_ERROR_ANY)) + return -EIO; + + val |= mccq->id & DB_MCCQ_RING_ID_MASK; + val |= 1 << DB_MCCQ_NUM_POSTED_SHIFT; + + wmb(); + iowrite32(val, adapter->db + DB_MCCQ_OFFSET); + + return 0; +} + +/* To check if valid bit is set, check the entire word as we don't know + * the endianness of the data (old entry is host endian while a new entry is + * little endian) */ +static inline bool be_mcc_compl_is_new(struct be_mcc_compl *compl) +{ + u32 flags; + + if (compl->flags != 0) { + flags = le32_to_cpu(compl->flags); + if (flags & CQE_FLAGS_VALID_MASK) { + compl->flags = flags; + return true; + } + } + return false; +} + +/* Need to reset the entire word that houses the valid bit */ +static inline void be_mcc_compl_use(struct be_mcc_compl *compl) +{ + compl->flags = 0; +} + +static struct be_cmd_resp_hdr *be_decode_resp_hdr(u32 tag0, u32 tag1) +{ + unsigned long addr; + + addr = tag1; + addr = ((addr << 16) << 16) | tag0; + return (void *)addr; +} + +static bool be_skip_err_log(u8 opcode, u16 base_status, u16 addl_status) +{ + if (base_status == MCC_STATUS_NOT_SUPPORTED || + base_status == MCC_STATUS_ILLEGAL_REQUEST || + addl_status == MCC_ADDL_STATUS_TOO_MANY_INTERFACES || + addl_status == MCC_ADDL_STATUS_INSUFFICIENT_VLANS || + (opcode == OPCODE_COMMON_WRITE_FLASHROM && + (base_status == MCC_STATUS_ILLEGAL_FIELD || + addl_status == MCC_ADDL_STATUS_FLASH_IMAGE_CRC_MISMATCH))) + return true; + 
else + return false; +} + +/* Place holder for all the async MCC cmds wherein the caller is not in a busy + * loop (has not issued be_mcc_notify_wait()) + */ +static void be_async_cmd_process(struct be_adapter *adapter, + struct be_mcc_compl *compl, + struct be_cmd_resp_hdr *resp_hdr) +{ + enum mcc_base_status base_status = base_status(compl->status); + u8 opcode = 0, subsystem = 0; + + if (resp_hdr) { + opcode = resp_hdr->opcode; + subsystem = resp_hdr->subsystem; + } + + if (opcode == OPCODE_LOWLEVEL_LOOPBACK_TEST && + subsystem == CMD_SUBSYSTEM_LOWLEVEL) { + complete(&adapter->et_cmd_compl); + return; + } + + if (opcode == OPCODE_LOWLEVEL_SET_LOOPBACK_MODE && + subsystem == CMD_SUBSYSTEM_LOWLEVEL) { + complete(&adapter->et_cmd_compl); + return; + } + + if ((opcode == OPCODE_COMMON_WRITE_FLASHROM || + opcode == OPCODE_COMMON_WRITE_OBJECT) && + subsystem == CMD_SUBSYSTEM_COMMON) { + adapter->flash_status = compl->status; + complete(&adapter->et_cmd_compl); + return; + } + + if ((opcode == OPCODE_ETH_GET_STATISTICS || + opcode == OPCODE_ETH_GET_PPORT_STATS) && + subsystem == CMD_SUBSYSTEM_ETH && + base_status == MCC_STATUS_SUCCESS) { + be_parse_stats(adapter); + adapter->stats_cmd_sent = false; + return; + } + + if (opcode == OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES && + subsystem == CMD_SUBSYSTEM_COMMON) { + if (base_status == MCC_STATUS_SUCCESS) { + struct be_cmd_resp_get_cntl_addnl_attribs *resp = + (void *)resp_hdr; + adapter->hwmon_info.be_on_die_temp = + resp->on_die_temperature; + } else { + adapter->be_get_temp_freq = 0; + adapter->hwmon_info.be_on_die_temp = + BE_INVALID_DIE_TEMP; + } + return; + } +} + +static int be_mcc_compl_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + enum mcc_base_status base_status; + enum mcc_addl_status addl_status; + struct be_cmd_resp_hdr *resp_hdr; + u8 opcode = 0, subsystem = 0; + + /* Just swap the status to host endian; mcc tag is opaquely copied + * from mcc_wrb */ + be_dws_le_to_cpu(compl, 4); + + base_status = base_status(compl->status); + addl_status = addl_status(compl->status); + + resp_hdr = be_decode_resp_hdr(compl->tag0, compl->tag1); + if (resp_hdr) { + opcode = resp_hdr->opcode; + subsystem = resp_hdr->subsystem; + } + + be_async_cmd_process(adapter, compl, resp_hdr); + + if (base_status != MCC_STATUS_SUCCESS && + !be_skip_err_log(opcode, base_status, addl_status)) { + if (base_status == MCC_STATUS_UNAUTHORIZED_REQUEST || + addl_status == MCC_ADDL_STATUS_INSUFFICIENT_PRIVILEGES) { + dev_warn(&adapter->pdev->dev, + "VF is not privileged to issue opcode %d-%d\n", + opcode, subsystem); + } else { + dev_err(&adapter->pdev->dev, + "opcode %d-%d failed:status %d-%d\n", + opcode, subsystem, base_status, addl_status); + } + } + return compl->status; +} + +/* Link state evt is a string of bytes; no need for endian swapping */ +static void be_async_link_state_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + struct be_async_event_link_state *evt = + (struct be_async_event_link_state *)compl; + + /* When link status changes, link speed must be re-queried from FW */ + adapter->phy.link_speed = -1; + + /* On BEx the FW does not send a separate link status + * notification for physical and logical link. + * On other chips just process the logical link + * status notification + */ + if (!BEx_chip(adapter) && + !(evt->port_link_status & LOGICAL_LINK_STATUS_MASK)) + return; + + /* For the initial link status do not rely on the ASYNC event as + * it may not be received in some cases. 
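+ * Async updates are therefore applied below only once BE_FLAGS_LINK_STATUS_INIT has been set, i.e. after an initial link status has been established elsewhere in the driver.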
+ */ + if (adapter->flags & BE_FLAGS_LINK_STATUS_INIT) + be_link_status_update(adapter, + evt->port_link_status & LINK_STATUS_MASK); +} + +static void be_async_port_misconfig_event_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + struct be_async_event_misconfig_port *evt = + (struct be_async_event_misconfig_port *)compl; + u32 sfp_misconfig_evt_word1 = le32_to_cpu(evt->event_data_word1); + u32 sfp_misconfig_evt_word2 = le32_to_cpu(evt->event_data_word2); + u8 phy_oper_state = PHY_STATE_OPER_MSG_NONE; + struct device *dev = &adapter->pdev->dev; + u8 msg_severity = DEFAULT_MSG_SEVERITY; + u8 phy_state_info; + u8 new_phy_state; + + new_phy_state = + (sfp_misconfig_evt_word1 >> (adapter->hba_port_num * 8)) & 0xff; + + if (new_phy_state == adapter->phy_state) + return; + + adapter->phy_state = new_phy_state; + + /* for older fw that doesn't populate link effect data */ + if (!sfp_misconfig_evt_word2) + goto log_message; + + phy_state_info = + (sfp_misconfig_evt_word2 >> (adapter->hba_port_num * 8)) & 0xff; + + if (phy_state_info & PHY_STATE_INFO_VALID) { + msg_severity = (phy_state_info & PHY_STATE_MSG_SEVERITY) >> 1; + + if (be_phy_unqualified(new_phy_state)) + phy_oper_state = (phy_state_info & PHY_STATE_OPER); + } + +log_message: + /* Log an error message that would allow a user to determine + * whether the SFPs have an issue + */ + if (be_phy_state_unknown(new_phy_state)) + dev_printk(be_port_misconfig_evt_severity[msg_severity], dev, + "Port %c: Unrecognized Optics state: 0x%x. %s", + adapter->port_name, + new_phy_state, + phy_state_oper_desc[phy_oper_state]); + else + dev_printk(be_port_misconfig_evt_severity[msg_severity], dev, + "Port %c: %s %s", + adapter->port_name, + be_misconfig_evt_port_state[new_phy_state], + phy_state_oper_desc[phy_oper_state]); + + /* Log Vendor name and part no. 
if a misconfigured SFP is detected */ + if (be_phy_misconfigured(new_phy_state)) + adapter->flags |= BE_FLAGS_PHY_MISCONFIGURED; +} + +/* Grp5 CoS Priority evt */ +static void be_async_grp5_cos_priority_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + struct be_async_event_grp5_cos_priority *evt = + (struct be_async_event_grp5_cos_priority *)compl; + + if (evt->valid) { + adapter->vlan_prio_bmap = evt->available_priority_bmap; + adapter->recommended_prio_bits = + evt->reco_default_priority << VLAN_PRIO_SHIFT; + } +} + +/* Grp5 QOS Speed evt: qos_link_speed is in units of 10 Mbps */ +static void be_async_grp5_qos_speed_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + struct be_async_event_grp5_qos_link_speed *evt = + (struct be_async_event_grp5_qos_link_speed *)compl; + + if (adapter->phy.link_speed >= 0 && + evt->physical_port == adapter->port_num) + adapter->phy.link_speed = le16_to_cpu(evt->qos_link_speed) * 10; +} + +/*Grp5 PVID evt*/ +static void be_async_grp5_pvid_state_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + struct be_async_event_grp5_pvid_state *evt = + (struct be_async_event_grp5_pvid_state *)compl; + + if (evt->enabled) { + adapter->pvid = le16_to_cpu(evt->tag) & VLAN_VID_MASK; + dev_info(&adapter->pdev->dev, "LPVID: %d\n", adapter->pvid); + } else { + adapter->pvid = 0; + } +} + +#define MGMT_ENABLE_MASK 0x4 +static void be_async_grp5_fw_control_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + struct be_async_fw_control *evt = (struct be_async_fw_control *)compl; + u32 evt_dw1 = le32_to_cpu(evt->event_data_word1); + + if (evt_dw1 & MGMT_ENABLE_MASK) { + adapter->flags |= BE_FLAGS_OS2BMC; + adapter->bmc_filt_mask = le32_to_cpu(evt->event_data_word2); + } else { + adapter->flags &= ~BE_FLAGS_OS2BMC; + } +} + +static void be_async_grp5_evt_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + u8 event_type = (compl->flags >> ASYNC_EVENT_TYPE_SHIFT) & + ASYNC_EVENT_TYPE_MASK; + + switch (event_type) { + case ASYNC_EVENT_COS_PRIORITY: + be_async_grp5_cos_priority_process(adapter, compl); + break; + case ASYNC_EVENT_QOS_SPEED: + be_async_grp5_qos_speed_process(adapter, compl); + break; + case ASYNC_EVENT_PVID_STATE: + be_async_grp5_pvid_state_process(adapter, compl); + break; + /* Async event to disable/enable os2bmc and/or mac-learning */ + case ASYNC_EVENT_FW_CONTROL: + be_async_grp5_fw_control_process(adapter, compl); + break; + default: + break; + } +} + +static void be_async_dbg_evt_process(struct be_adapter *adapter, + struct be_mcc_compl *cmp) +{ + u8 event_type = 0; + struct be_async_event_qnq *evt = (struct be_async_event_qnq *)cmp; + + event_type = (cmp->flags >> ASYNC_EVENT_TYPE_SHIFT) & + ASYNC_EVENT_TYPE_MASK; + + switch (event_type) { + case ASYNC_DEBUG_EVENT_TYPE_QNQ: + if (evt->valid) + adapter->qnq_vid = le16_to_cpu(evt->vlan_tag); + adapter->flags |= BE_FLAGS_QNQ_ASYNC_EVT_RCVD; + break; + default: + dev_warn(&adapter->pdev->dev, "Unknown debug event 0x%x!\n", + event_type); + break; + } +} + +static void be_async_sliport_evt_process(struct be_adapter *adapter, + struct be_mcc_compl *cmp) +{ + u8 event_type = (cmp->flags >> ASYNC_EVENT_TYPE_SHIFT) & + ASYNC_EVENT_TYPE_MASK; + + if (event_type == ASYNC_EVENT_PORT_MISCONFIG) + be_async_port_misconfig_event_process(adapter, cmp); +} + +static inline bool is_link_state_evt(u32 flags) +{ + return ((flags >> ASYNC_EVENT_CODE_SHIFT) & ASYNC_EVENT_CODE_MASK) == + ASYNC_EVENT_CODE_LINK_STATE; +} + +static inline 
bool is_grp5_evt(u32 flags) +{ + return ((flags >> ASYNC_EVENT_CODE_SHIFT) & ASYNC_EVENT_CODE_MASK) == + ASYNC_EVENT_CODE_GRP_5; +} + +static inline bool is_dbg_evt(u32 flags) +{ + return ((flags >> ASYNC_EVENT_CODE_SHIFT) & ASYNC_EVENT_CODE_MASK) == + ASYNC_EVENT_CODE_QNQ; +} + +static inline bool is_sliport_evt(u32 flags) +{ + return ((flags >> ASYNC_EVENT_CODE_SHIFT) & ASYNC_EVENT_CODE_MASK) == + ASYNC_EVENT_CODE_SLIPORT; +} + +static void be_mcc_event_process(struct be_adapter *adapter, + struct be_mcc_compl *compl) +{ + if (is_link_state_evt(compl->flags)) + be_async_link_state_process(adapter, compl); + else if (is_grp5_evt(compl->flags)) + be_async_grp5_evt_process(adapter, compl); + else if (is_dbg_evt(compl->flags)) + be_async_dbg_evt_process(adapter, compl); + else if (is_sliport_evt(compl->flags)) + be_async_sliport_evt_process(adapter, compl); +} + +static struct be_mcc_compl *be_mcc_compl_get(struct be_adapter *adapter) +{ + struct be_queue_info *mcc_cq = &adapter->mcc_obj.cq; + struct be_mcc_compl *compl = queue_tail_node(mcc_cq); + + if (be_mcc_compl_is_new(compl)) { + queue_tail_inc(mcc_cq); + return compl; + } + return NULL; +} + +void be_async_mcc_enable(struct be_adapter *adapter) +{ + spin_lock_bh(&adapter->mcc_cq_lock); + + be_cq_notify(adapter, adapter->mcc_obj.cq.id, true, 0); + adapter->mcc_obj.rearm_cq = true; + + spin_unlock_bh(&adapter->mcc_cq_lock); +} + +void be_async_mcc_disable(struct be_adapter *adapter) +{ + spin_lock_bh(&adapter->mcc_cq_lock); + + adapter->mcc_obj.rearm_cq = false; + be_cq_notify(adapter, adapter->mcc_obj.cq.id, false, 0); + + spin_unlock_bh(&adapter->mcc_cq_lock); +} + +int be_process_mcc(struct be_adapter *adapter) +{ + struct be_mcc_compl *compl; + int num = 0, status = 0; + struct be_mcc_obj *mcc_obj = &adapter->mcc_obj; + + spin_lock(&adapter->mcc_cq_lock); + + while ((compl = be_mcc_compl_get(adapter))) { + if (compl->flags & CQE_FLAGS_ASYNC_MASK) { + be_mcc_event_process(adapter, compl); + } else if (compl->flags & CQE_FLAGS_COMPLETED_MASK) { + status = be_mcc_compl_process(adapter, compl); + atomic_dec(&mcc_obj->q.used); + } + be_mcc_compl_use(compl); + num++; + } + + if (num) + be_cq_notify(adapter, mcc_obj->cq.id, mcc_obj->rearm_cq, num); + + spin_unlock(&adapter->mcc_cq_lock); + return status; +} + +/* Wait till no more pending mcc requests are present */ +static int be_mcc_wait_compl(struct be_adapter *adapter) +{ +#define mcc_timeout 12000 /* 12s timeout */ + int i, status = 0; + struct be_mcc_obj *mcc_obj = &adapter->mcc_obj; + + for (i = 0; i < mcc_timeout; i++) { + if (be_check_error(adapter, BE_ERROR_ANY)) + return -EIO; + + local_bh_disable(); + status = be_process_mcc(adapter); + local_bh_enable(); + + if (atomic_read(&mcc_obj->q.used) == 0) + break; + usleep_range(500, 1000); + } + if (i == mcc_timeout) { + dev_err(&adapter->pdev->dev, "FW not responding\n"); + be_set_error(adapter, BE_ERROR_FW); + return -EIO; + } + return status; +} + +/* Notify MCC requests and wait for completion */ +static int be_mcc_notify_wait(struct be_adapter *adapter) +{ + int status; + struct be_mcc_wrb *wrb; + struct be_mcc_obj *mcc_obj = &adapter->mcc_obj; + u32 index = mcc_obj->q.head; + struct be_cmd_resp_hdr *resp; + + index_dec(&index, mcc_obj->q.len); + wrb = queue_index_node(&mcc_obj->q, index); + + resp = be_decode_resp_hdr(wrb->tag0, wrb->tag1); + + status = be_mcc_notify(adapter); + if (status) + goto out; + + status = be_mcc_wait_compl(adapter); + if (status == -EIO) + goto out; + + status = (resp->base_status | + 
((resp->addl_status & CQE_ADDL_STATUS_MASK) << + CQE_ADDL_STATUS_SHIFT)); +out: + return status; +} + +static int be_mbox_db_ready_wait(struct be_adapter *adapter, void __iomem *db) +{ + int msecs = 0; + u32 ready; + + do { + if (be_check_error(adapter, BE_ERROR_ANY)) + return -EIO; + + ready = ioread32(db); + if (ready == 0xffffffff) + return -1; + + ready &= MPU_MAILBOX_DB_RDY_MASK; + if (ready) + break; + + if (msecs > 4000) { + dev_err(&adapter->pdev->dev, "FW not responding\n"); + be_set_error(adapter, BE_ERROR_FW); + be_detect_error(adapter); + return -1; + } + + msleep(1); + msecs++; + } while (true); + + return 0; +} + +/* + * Insert the mailbox address into the doorbell in two steps + * Polls on the mbox doorbell till a command completion (or a timeout) occurs + */ +static int be_mbox_notify_wait(struct be_adapter *adapter) +{ + int status; + u32 val = 0; + void __iomem *db = adapter->db + MPU_MAILBOX_DB_OFFSET; + struct be_dma_mem *mbox_mem = &adapter->mbox_mem; + struct be_mcc_mailbox *mbox = mbox_mem->va; + struct be_mcc_compl *compl = &mbox->compl; + + /* wait for ready to be set */ + status = be_mbox_db_ready_wait(adapter, db); + if (status != 0) + return status; + + val |= MPU_MAILBOX_DB_HI_MASK; + /* at bits 2 - 31 place mbox dma addr msb bits 34 - 63 */ + val |= (upper_32_bits(mbox_mem->dma) >> 2) << 2; + iowrite32(val, db); + + /* wait for ready to be set */ + status = be_mbox_db_ready_wait(adapter, db); + if (status != 0) + return status; + + val = 0; + /* at bits 2 - 31 place mbox dma addr lsb bits 4 - 33 */ + val |= (u32)(mbox_mem->dma >> 4) << 2; + iowrite32(val, db); + + status = be_mbox_db_ready_wait(adapter, db); + if (status != 0) + return status; + + /* A cq entry has been made now */ + if (be_mcc_compl_is_new(compl)) { + status = be_mcc_compl_process(adapter, &mbox->compl); + be_mcc_compl_use(compl); + if (status) + return status; + } else { + dev_err(&adapter->pdev->dev, "invalid mailbox completion\n"); + return -1; + } + return 0; +} + +u16 be_POST_stage_get(struct be_adapter *adapter) +{ + u32 sem; + + if (BEx_chip(adapter)) + sem = ioread32(adapter->csr + SLIPORT_SEMAPHORE_OFFSET_BEx); + else + pci_read_config_dword(adapter->pdev, + SLIPORT_SEMAPHORE_OFFSET_SH, &sem); + + return sem & POST_STAGE_MASK; +} + +static int lancer_wait_ready(struct be_adapter *adapter) +{ +#define SLIPORT_READY_TIMEOUT 30 + u32 sliport_status; + int i; + + for (i = 0; i < SLIPORT_READY_TIMEOUT; i++) { + sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET); + if (sliport_status & SLIPORT_STATUS_RDY_MASK) + return 0; + + if (sliport_status & SLIPORT_STATUS_ERR_MASK && + !(sliport_status & SLIPORT_STATUS_RN_MASK)) + return -EIO; + + msleep(1000); + } + + return sliport_status ? 
: -1; +} + +int be_fw_wait_ready(struct be_adapter *adapter) +{ + u16 stage; + int status, timeout = 0; + struct device *dev = &adapter->pdev->dev; + + if (lancer_chip(adapter)) { + status = lancer_wait_ready(adapter); + if (status) { + stage = status; + goto err; + } + return 0; + } + + do { + /* There's no means to poll POST state on BE2/3 VFs */ + if (BEx_chip(adapter) && be_virtfn(adapter)) + return 0; + + stage = be_POST_stage_get(adapter); + if (stage == POST_STAGE_ARMFW_RDY) + return 0; + + dev_info(dev, "Waiting for POST, %ds elapsed\n", timeout); + if (msleep_interruptible(2000)) { + dev_err(dev, "Waiting for POST aborted\n"); + return -EINTR; + } + timeout += 2; + } while (timeout < 60); + +err: + dev_err(dev, "POST timeout; stage=%#x\n", stage); + return -ETIMEDOUT; +} + +static inline struct be_sge *nonembedded_sgl(struct be_mcc_wrb *wrb) +{ + return &wrb->payload.sgl[0]; +} + +static inline void fill_wrb_tags(struct be_mcc_wrb *wrb, unsigned long addr) +{ + wrb->tag0 = addr & 0xFFFFFFFF; + wrb->tag1 = upper_32_bits(addr); +} + +/* Don't touch the hdr after it's prepared */ +/* mem will be NULL for embedded commands */ +static void be_wrb_cmd_hdr_prepare(struct be_cmd_req_hdr *req_hdr, + u8 subsystem, u8 opcode, int cmd_len, + struct be_mcc_wrb *wrb, + struct be_dma_mem *mem) +{ + struct be_sge *sge; + + req_hdr->opcode = opcode; + req_hdr->subsystem = subsystem; + req_hdr->request_length = cpu_to_le32(cmd_len - sizeof(*req_hdr)); + req_hdr->version = 0; + fill_wrb_tags(wrb, (ulong) req_hdr); + wrb->payload_length = cmd_len; + if (mem) { + wrb->embedded |= (1 & MCC_WRB_SGE_CNT_MASK) << + MCC_WRB_SGE_CNT_SHIFT; + sge = nonembedded_sgl(wrb); + sge->pa_hi = cpu_to_le32(upper_32_bits(mem->dma)); + sge->pa_lo = cpu_to_le32(mem->dma & 0xFFFFFFFF); + sge->len = cpu_to_le32(mem->size); + } else + wrb->embedded |= MCC_WRB_EMBEDDED_MASK; + be_dws_cpu_to_le(wrb, 8); +} + +static void be_cmd_page_addrs_prepare(struct phys_addr *pages, u32 max_pages, + struct be_dma_mem *mem) +{ + int i, buf_pages = min(PAGES_4K_SPANNED(mem->va, mem->size), max_pages); + u64 dma = (u64)mem->dma; + + for (i = 0; i < buf_pages; i++) { + pages[i].lo = cpu_to_le32(dma & 0xFFFFFFFF); + pages[i].hi = cpu_to_le32(upper_32_bits(dma)); + dma += PAGE_SIZE_4K; + } +} + +static inline struct be_mcc_wrb *wrb_from_mbox(struct be_adapter *adapter) +{ + struct be_dma_mem *mbox_mem = &adapter->mbox_mem; + struct be_mcc_wrb *wrb + = &((struct be_mcc_mailbox *)(mbox_mem->va))->wrb; + memset(wrb, 0, sizeof(*wrb)); + return wrb; +} + +static struct be_mcc_wrb *wrb_from_mccq(struct be_adapter *adapter) +{ + struct be_queue_info *mccq = &adapter->mcc_obj.q; + struct be_mcc_wrb *wrb; + + if (!mccq->created) + return NULL; + + if (atomic_read(&mccq->used) >= mccq->len) + return NULL; + + wrb = queue_head_node(mccq); + queue_head_inc(mccq); + atomic_inc(&mccq->used); + memset(wrb, 0, sizeof(*wrb)); + return wrb; +} + +static bool use_mcc(struct be_adapter *adapter) +{ + return adapter->mcc_obj.q.created; +} + +/* Must be used only in process context */ +static int be_cmd_lock(struct be_adapter *adapter) +{ + if (use_mcc(adapter)) { + mutex_lock(&adapter->mcc_lock); + return 0; + } else { + return mutex_lock_interruptible(&adapter->mbox_lock); + } +} + +/* Must be used only in process context */ +static void be_cmd_unlock(struct be_adapter *adapter) +{ + if (use_mcc(adapter)) + return mutex_unlock(&adapter->mcc_lock); + else + return mutex_unlock(&adapter->mbox_lock); +} + +static struct be_mcc_wrb *be_cmd_copy(struct be_adapter 
*adapter, + struct be_mcc_wrb *wrb) +{ + struct be_mcc_wrb *dest_wrb; + + if (use_mcc(adapter)) { + dest_wrb = wrb_from_mccq(adapter); + if (!dest_wrb) + return NULL; + } else { + dest_wrb = wrb_from_mbox(adapter); + } + + memcpy(dest_wrb, wrb, sizeof(*wrb)); + if (wrb->embedded & cpu_to_le32(MCC_WRB_EMBEDDED_MASK)) + fill_wrb_tags(dest_wrb, (ulong) embedded_payload(wrb)); + + return dest_wrb; +} + +/* Must be used only in process context */ +static int be_cmd_notify_wait(struct be_adapter *adapter, + struct be_mcc_wrb *wrb) +{ + struct be_mcc_wrb *dest_wrb; + int status; + + status = be_cmd_lock(adapter); + if (status) + return status; + + dest_wrb = be_cmd_copy(adapter, wrb); + if (!dest_wrb) { + status = -EBUSY; + goto unlock; + } + + if (use_mcc(adapter)) + status = be_mcc_notify_wait(adapter); + else + status = be_mbox_notify_wait(adapter); + + if (!status) + memcpy(wrb, dest_wrb, sizeof(*wrb)); + +unlock: + be_cmd_unlock(adapter); + return status; +} + +/* Tell fw we're about to start firing cmds by writing a + * special pattern across the wrb hdr; uses mbox + */ +int be_cmd_fw_init(struct be_adapter *adapter) +{ + u8 *wrb; + int status; + + if (lancer_chip(adapter)) + return 0; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = (u8 *)wrb_from_mbox(adapter); + *wrb++ = 0xFF; + *wrb++ = 0x12; + *wrb++ = 0x34; + *wrb++ = 0xFF; + *wrb++ = 0xFF; + *wrb++ = 0x56; + *wrb++ = 0x78; + *wrb = 0xFF; + + status = be_mbox_notify_wait(adapter); + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Tell fw we're done with firing cmds by writing a + * special pattern across the wrb hdr; uses mbox + */ +int be_cmd_fw_clean(struct be_adapter *adapter) +{ + u8 *wrb; + int status; + + if (lancer_chip(adapter)) + return 0; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = (u8 *)wrb_from_mbox(adapter); + *wrb++ = 0xFF; + *wrb++ = 0xAA; + *wrb++ = 0xBB; + *wrb++ = 0xFF; + *wrb++ = 0xFF; + *wrb++ = 0xCC; + *wrb++ = 0xDD; + *wrb = 0xFF; + + status = be_mbox_notify_wait(adapter); + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +int be_cmd_eq_create(struct be_adapter *adapter, struct be_eq_obj *eqo) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_eq_create *req; + struct be_dma_mem *q_mem = &eqo->q.dma_mem; + int status, ver = 0; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_EQ_CREATE, sizeof(*req), wrb, + NULL); + + /* Support for EQ_CREATEv2 available only SH-R onwards */ + if (!(BEx_chip(adapter) || lancer_chip(adapter))) + ver = 2; + + req->hdr.version = ver; + req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size)); + + AMAP_SET_BITS(struct amap_eq_context, valid, req->context, 1); + /* 4byte eqe*/ + AMAP_SET_BITS(struct amap_eq_context, size, req->context, 0); + AMAP_SET_BITS(struct amap_eq_context, count, req->context, + __ilog2_u32(eqo->q.len / 256)); + be_dws_cpu_to_le(req->context, sizeof(req->context)); + + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_eq_create *resp = embedded_payload(wrb); + + eqo->q.id = le16_to_cpu(resp->eq_id); + eqo->msix_idx = + (ver == 2) ? 
le16_to_cpu(resp->msix_idx) : eqo->idx; + eqo->q.created = true; + } + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Use MCC */ +int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr, + bool permanent, u32 if_handle, u32 pmac_id) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_mac_query *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_MAC_QUERY, sizeof(*req), wrb, + NULL); + req->type = MAC_ADDRESS_TYPE_NETWORK; + if (permanent) { + req->permanent = 1; + } else { + req->if_id = cpu_to_le16((u16)if_handle); + req->pmac_id = cpu_to_le32(pmac_id); + req->permanent = 0; + } + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_mac_query *resp = embedded_payload(wrb); + + memcpy(mac_addr, resp->mac.addr, ETH_ALEN); + } + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Uses synchronous MCCQ */ +int be_cmd_pmac_add(struct be_adapter *adapter, u8 *mac_addr, + u32 if_id, u32 *pmac_id, u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_pmac_add *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_PMAC_ADD, sizeof(*req), wrb, + NULL); + + req->hdr.domain = domain; + req->if_id = cpu_to_le32(if_id); + memcpy(req->mac_address, mac_addr, ETH_ALEN); + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_pmac_add *resp = embedded_payload(wrb); + + *pmac_id = le32_to_cpu(resp->pmac_id); + } + +err: + mutex_unlock(&adapter->mcc_lock); + + if (base_status(status) == MCC_STATUS_UNAUTHORIZED_REQUEST) + status = -EPERM; + + return status; +} + +/* Uses synchronous MCCQ */ +int be_cmd_pmac_del(struct be_adapter *adapter, u32 if_id, int pmac_id, u32 dom) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_pmac_del *req; + int status; + + if (pmac_id == -1) + return 0; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_PMAC_DEL, sizeof(*req), + wrb, NULL); + + req->hdr.domain = dom; + req->if_id = cpu_to_le32(if_id); + req->pmac_id = cpu_to_le32(pmac_id); + + status = be_mcc_notify_wait(adapter); + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Uses Mbox */ +int be_cmd_cq_create(struct be_adapter *adapter, struct be_queue_info *cq, + struct be_queue_info *eq, bool no_delay, int coalesce_wm) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_cq_create *req; + struct be_dma_mem *q_mem = &cq->dma_mem; + void *ctxt; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + ctxt = &req->context; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_CQ_CREATE, sizeof(*req), wrb, + NULL); + + req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size)); + + if (BEx_chip(adapter)) { + AMAP_SET_BITS(struct amap_cq_context_be, coalescwm, ctxt, + coalesce_wm); + AMAP_SET_BITS(struct amap_cq_context_be, nodelay, + ctxt, no_delay); + AMAP_SET_BITS(struct amap_cq_context_be, count, ctxt, + __ilog2_u32(cq->len / 
256)); + AMAP_SET_BITS(struct amap_cq_context_be, valid, ctxt, 1); + AMAP_SET_BITS(struct amap_cq_context_be, eventable, ctxt, 1); + AMAP_SET_BITS(struct amap_cq_context_be, eqid, ctxt, eq->id); + } else { + req->hdr.version = 2; + req->page_size = 1; /* 1 for 4K */ + + /* coalesce-wm field in this cmd is not relevant to Lancer. + * Lancer uses COMMON_MODIFY_CQ to set this field + */ + if (!lancer_chip(adapter)) + AMAP_SET_BITS(struct amap_cq_context_v2, coalescwm, + ctxt, coalesce_wm); + AMAP_SET_BITS(struct amap_cq_context_v2, nodelay, ctxt, + no_delay); + AMAP_SET_BITS(struct amap_cq_context_v2, count, ctxt, + __ilog2_u32(cq->len / 256)); + AMAP_SET_BITS(struct amap_cq_context_v2, valid, ctxt, 1); + AMAP_SET_BITS(struct amap_cq_context_v2, eventable, ctxt, 1); + AMAP_SET_BITS(struct amap_cq_context_v2, eqid, ctxt, eq->id); + } + + be_dws_cpu_to_le(ctxt, sizeof(req->context)); + + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_cq_create *resp = embedded_payload(wrb); + + cq->id = le16_to_cpu(resp->cq_id); + cq->created = true; + } + + mutex_unlock(&adapter->mbox_lock); + + return status; +} + +static u32 be_encoded_q_len(int q_len) +{ + u32 len_encoded = fls(q_len); /* log2(len) + 1 */ + + if (len_encoded == 16) + len_encoded = 0; + return len_encoded; +} + +static int be_cmd_mccq_ext_create(struct be_adapter *adapter, + struct be_queue_info *mccq, + struct be_queue_info *cq) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_mcc_ext_create *req; + struct be_dma_mem *q_mem = &mccq->dma_mem; + void *ctxt; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + ctxt = &req->context; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MCC_CREATE_EXT, sizeof(*req), wrb, + NULL); + + req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size)); + if (BEx_chip(adapter)) { + AMAP_SET_BITS(struct amap_mcc_context_be, valid, ctxt, 1); + AMAP_SET_BITS(struct amap_mcc_context_be, ring_size, ctxt, + be_encoded_q_len(mccq->len)); + AMAP_SET_BITS(struct amap_mcc_context_be, cq_id, ctxt, cq->id); + } else { + req->hdr.version = 1; + req->cq_id = cpu_to_le16(cq->id); + + AMAP_SET_BITS(struct amap_mcc_context_v1, ring_size, ctxt, + be_encoded_q_len(mccq->len)); + AMAP_SET_BITS(struct amap_mcc_context_v1, valid, ctxt, 1); + AMAP_SET_BITS(struct amap_mcc_context_v1, async_cq_id, + ctxt, cq->id); + AMAP_SET_BITS(struct amap_mcc_context_v1, async_cq_valid, + ctxt, 1); + } + + /* Subscribe to Link State, Sliport Event and Group 5 Events + * (bits 1, 5 and 17 set) + */ + req->async_event_bitmap[0] = + cpu_to_le32(BIT(ASYNC_EVENT_CODE_LINK_STATE) | + BIT(ASYNC_EVENT_CODE_GRP_5) | + BIT(ASYNC_EVENT_CODE_QNQ) | + BIT(ASYNC_EVENT_CODE_SLIPORT)); + + be_dws_cpu_to_le(ctxt, sizeof(req->context)); + + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_mcc_create *resp = embedded_payload(wrb); + + mccq->id = le16_to_cpu(resp->id); + mccq->created = true; + } + mutex_unlock(&adapter->mbox_lock); + + return status; +} + +static int be_cmd_mccq_org_create(struct be_adapter *adapter, + struct be_queue_info *mccq, + struct be_queue_info *cq) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_mcc_create *req; + struct be_dma_mem *q_mem = &mccq->dma_mem; + void *ctxt; + int status; + + if 
(mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + ctxt = &req->context; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MCC_CREATE, sizeof(*req), wrb, + NULL); + + req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size)); + + AMAP_SET_BITS(struct amap_mcc_context_be, valid, ctxt, 1); + AMAP_SET_BITS(struct amap_mcc_context_be, ring_size, ctxt, + be_encoded_q_len(mccq->len)); + AMAP_SET_BITS(struct amap_mcc_context_be, cq_id, ctxt, cq->id); + + be_dws_cpu_to_le(ctxt, sizeof(req->context)); + + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_mcc_create *resp = embedded_payload(wrb); + + mccq->id = le16_to_cpu(resp->id); + mccq->created = true; + } + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +int be_cmd_mccq_create(struct be_adapter *adapter, + struct be_queue_info *mccq, struct be_queue_info *cq) +{ + int status; + + status = be_cmd_mccq_ext_create(adapter, mccq, cq); + if (status && BEx_chip(adapter)) { + dev_warn(&adapter->pdev->dev, "Upgrade to F/W ver 2.102.235.0 " + "or newer to avoid conflicting priorities between NIC " + "and FCoE traffic"); + status = be_cmd_mccq_org_create(adapter, mccq, cq); + } + return status; +} + +int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo) +{ + struct be_mcc_wrb wrb = {0}; + struct be_cmd_req_eth_tx_create *req; + struct be_queue_info *txq = &txo->q; + struct be_queue_info *cq = &txo->cq; + struct be_dma_mem *q_mem = &txq->dma_mem; + int status, ver = 0; + + req = embedded_payload(&wrb); + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_TX_CREATE, sizeof(*req), &wrb, NULL); + + if (lancer_chip(adapter)) { + req->hdr.version = 1; + } else if (BEx_chip(adapter)) { + if (adapter->function_caps & BE_FUNCTION_CAPS_SUPER_NIC) + req->hdr.version = 2; + } else { /* For SH */ + req->hdr.version = 2; + } + + if (req->hdr.version > 0) + req->if_id = cpu_to_le16(adapter->if_handle); + req->num_pages = PAGES_4K_SPANNED(q_mem->va, q_mem->size); + req->ulp_num = BE_ULP1_NUM; + req->type = BE_ETH_TX_RING_TYPE_STANDARD; + req->cq_id = cpu_to_le16(cq->id); + req->queue_size = be_encoded_q_len(txq->len); + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + ver = req->hdr.version; + + status = be_cmd_notify_wait(adapter, &wrb); + if (!status) { + struct be_cmd_resp_eth_tx_create *resp = embedded_payload(&wrb); + + txq->id = le16_to_cpu(resp->cid); + if (ver == 2) + txo->db_offset = le32_to_cpu(resp->db_offset); + else + txo->db_offset = DB_TXULP1_OFFSET; + txq->created = true; + } + + return status; +} + +/* Uses MCC */ +int be_cmd_rxq_create(struct be_adapter *adapter, + struct be_queue_info *rxq, u16 cq_id, u16 frag_size, + u32 if_id, u32 rss, u8 *rss_id) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_eth_rx_create *req; + struct be_dma_mem *q_mem = &rxq->dma_mem; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_RX_CREATE, sizeof(*req), wrb, NULL); + + req->cq_id = cpu_to_le16(cq_id); + req->frag_size = fls(frag_size) - 1; + req->num_pages = 2; + be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem); + req->interface_id = cpu_to_le32(if_id); + req->max_frame_size = 
cpu_to_le16(BE_MAX_JUMBO_FRAME_SIZE); + req->rss_queue = cpu_to_le32(rss); + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_eth_rx_create *resp = embedded_payload(wrb); + + rxq->id = le16_to_cpu(resp->id); + rxq->created = true; + *rss_id = resp->rss_id; + } + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Generic destroyer function for all types of queues + * Uses Mbox + */ +int be_cmd_q_destroy(struct be_adapter *adapter, struct be_queue_info *q, + int queue_type) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_q_destroy *req; + u8 subsys = 0, opcode = 0; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + + switch (queue_type) { + case QTYPE_EQ: + subsys = CMD_SUBSYSTEM_COMMON; + opcode = OPCODE_COMMON_EQ_DESTROY; + break; + case QTYPE_CQ: + subsys = CMD_SUBSYSTEM_COMMON; + opcode = OPCODE_COMMON_CQ_DESTROY; + break; + case QTYPE_TXQ: + subsys = CMD_SUBSYSTEM_ETH; + opcode = OPCODE_ETH_TX_DESTROY; + break; + case QTYPE_RXQ: + subsys = CMD_SUBSYSTEM_ETH; + opcode = OPCODE_ETH_RX_DESTROY; + break; + case QTYPE_MCCQ: + subsys = CMD_SUBSYSTEM_COMMON; + opcode = OPCODE_COMMON_MCC_DESTROY; + break; + default: + BUG(); + } + + be_wrb_cmd_hdr_prepare(&req->hdr, subsys, opcode, sizeof(*req), wrb, + NULL); + req->id = cpu_to_le16(q->id); + + status = be_mbox_notify_wait(adapter); + q->created = false; + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Uses MCC */ +int be_cmd_rxq_destroy(struct be_adapter *adapter, struct be_queue_info *q) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_q_destroy *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_RX_DESTROY, sizeof(*req), wrb, NULL); + req->id = cpu_to_le16(q->id); + + status = be_mcc_notify_wait(adapter); + q->created = false; + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Create an rx filtering policy configuration on an i/f + * Will use MBOX only if MCCQ has not been created. 
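+ * (be_cmd_notify_wait() below handles this: the WRB is copied onto the MCC queue when it exists, and posted via the mailbox otherwise.)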
+ */ +int be_cmd_if_create(struct be_adapter *adapter, u32 cap_flags, u32 en_flags, + u32 *if_handle, u32 domain) +{ + struct be_mcc_wrb wrb = {0}; + struct be_cmd_req_if_create *req; + int status; + + req = embedded_payload(&wrb); + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_INTERFACE_CREATE, + sizeof(*req), &wrb, NULL); + req->hdr.domain = domain; + req->capability_flags = cpu_to_le32(cap_flags); + req->enable_flags = cpu_to_le32(en_flags); + req->pmac_invalid = true; + + status = be_cmd_notify_wait(adapter, &wrb); + if (!status) { + struct be_cmd_resp_if_create *resp = embedded_payload(&wrb); + + *if_handle = le32_to_cpu(resp->interface_id); + + /* Hack to retrieve VF's pmac-id on BE3 */ + if (BE3_chip(adapter) && be_virtfn(adapter)) + adapter->pmac_id[0] = le32_to_cpu(resp->pmac_id); + } + return status; +} + +/* Uses MCCQ if available else MBOX */ +int be_cmd_if_destroy(struct be_adapter *adapter, int interface_id, u32 domain) +{ + struct be_mcc_wrb wrb = {0}; + struct be_cmd_req_if_destroy *req; + int status; + + if (interface_id == -1) + return 0; + + req = embedded_payload(&wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_INTERFACE_DESTROY, + sizeof(*req), &wrb, NULL); + req->hdr.domain = domain; + req->interface_id = cpu_to_le32(interface_id); + + status = be_cmd_notify_wait(adapter, &wrb); + return status; +} + +/* Get stats is a non embedded command: the request is not embedded inside + * WRB but is a separate dma memory block + * Uses asynchronous MCC + */ +int be_cmd_get_stats(struct be_adapter *adapter, struct be_dma_mem *nonemb_cmd) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_hdr *hdr; + int status = 0; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + hdr = nonemb_cmd->va; + + be_wrb_cmd_hdr_prepare(hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_GET_STATISTICS, nonemb_cmd->size, wrb, + nonemb_cmd); + + /* version 1 of the cmd is not supported only by BE2 */ + if (BE2_chip(adapter)) + hdr->version = 0; + if (BE3_chip(adapter) || lancer_chip(adapter)) + hdr->version = 1; + else + hdr->version = 2; + + status = be_mcc_notify(adapter); + if (status) + goto err; + + adapter->stats_cmd_sent = true; + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Lancer Stats */ +int lancer_cmd_get_pport_stats(struct be_adapter *adapter, + struct be_dma_mem *nonemb_cmd) +{ + struct be_mcc_wrb *wrb; + struct lancer_cmd_req_pport_stats *req; + int status = 0; + + if (!be_cmd_allowed(adapter, OPCODE_ETH_GET_PPORT_STATS, + CMD_SUBSYSTEM_ETH)) + return -EPERM; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = nonemb_cmd->va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_GET_PPORT_STATS, nonemb_cmd->size, + wrb, nonemb_cmd); + + req->cmd_params.params.pport_num = cpu_to_le16(adapter->hba_port_num); + req->cmd_params.params.reset_stats = 0; + + status = be_mcc_notify(adapter); + if (status) + goto err; + + adapter->stats_cmd_sent = true; + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +static int be_mac_to_link_speed(int mac_speed) +{ + switch (mac_speed) { + case PHY_LINK_SPEED_ZERO: + return 0; + case PHY_LINK_SPEED_10MBPS: + return 10; + case PHY_LINK_SPEED_100MBPS: + return 100; + case PHY_LINK_SPEED_1GBPS: + return 1000; + case PHY_LINK_SPEED_10GBPS: + return 10000; + case PHY_LINK_SPEED_20GBPS: + return 20000; + case 
PHY_LINK_SPEED_25GBPS: + return 25000; + case PHY_LINK_SPEED_40GBPS: + return 40000; + } + return 0; +} + +/* Uses synchronous mcc + * Returns link_speed in Mbps + */ +int be_cmd_link_status_query(struct be_adapter *adapter, u16 *link_speed, + u8 *link_status, u32 dom) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_link_status *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + if (link_status) + *link_status = LINK_DOWN; + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_NTWK_LINK_STATUS_QUERY, + sizeof(*req), wrb, NULL); + + /* version 1 of the cmd is not supported only by BE2 */ + if (!BE2_chip(adapter)) + req->hdr.version = 1; + + req->hdr.domain = dom; + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_link_status *resp = embedded_payload(wrb); + + if (link_speed) { + *link_speed = resp->link_speed ? + le16_to_cpu(resp->link_speed) * 10 : + be_mac_to_link_speed(resp->mac_speed); + + if (!resp->logical_link_status) + *link_speed = 0; + } + if (link_status) + *link_status = resp->logical_link_status; + } + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Uses synchronous mcc */ +int be_cmd_get_die_temperature(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_cntl_addnl_attribs *req; + int status = 0; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES, + sizeof(*req), wrb, NULL); + + status = be_mcc_notify(adapter); +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Uses synchronous mcc */ +int be_cmd_get_fat_dump_len(struct be_adapter *adapter, u32 *dump_size) +{ + struct be_mcc_wrb wrb = {0}; + struct be_cmd_req_get_fat *req; + int status; + + req = embedded_payload(&wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MANAGE_FAT, sizeof(*req), + &wrb, NULL); + req->fat_operation = cpu_to_le32(QUERY_FAT); + status = be_cmd_notify_wait(adapter, &wrb); + if (!status) { + struct be_cmd_resp_get_fat *resp = embedded_payload(&wrb); + + if (dump_size && resp->log_size) + *dump_size = le32_to_cpu(resp->log_size) - + sizeof(u32); + } + return status; +} + +int be_cmd_get_fat_dump(struct be_adapter *adapter, u32 buf_len, void *buf) +{ + struct be_dma_mem get_fat_cmd; + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_fat *req; + u32 offset = 0, total_size, buf_size, + log_offset = sizeof(u32), payload_len; + int status; + + if (buf_len == 0) + return 0; + + total_size = buf_len; + + get_fat_cmd.size = sizeof(struct be_cmd_req_get_fat) + 60*1024; + get_fat_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, + get_fat_cmd.size, + &get_fat_cmd.dma, GFP_ATOMIC); + if (!get_fat_cmd.va) + return -ENOMEM; + + mutex_lock(&adapter->mcc_lock); + + while (total_size) { + buf_size = min(total_size, (u32)60*1024); + total_size -= buf_size; + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = get_fat_cmd.va; + + payload_len = sizeof(struct be_cmd_req_get_fat) + buf_size; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MANAGE_FAT, payload_len, + wrb, &get_fat_cmd); + + req->fat_operation = cpu_to_le32(RETRIEVE_FAT); + req->read_log_offset = cpu_to_le32(log_offset); + 
req->read_log_length = cpu_to_le32(buf_size);
+		req->data_buffer_size = cpu_to_le32(buf_size);
+
+		status = be_mcc_notify_wait(adapter);
+		if (!status) {
+			struct be_cmd_resp_get_fat *resp = get_fat_cmd.va;
+
+			memcpy(buf + offset,
+			       resp->data_buffer,
+			       le32_to_cpu(resp->read_log_length));
+		} else {
+			dev_err(&adapter->pdev->dev, "FAT Table Retrieve error\n");
+			goto err;
+		}
+		offset += buf_size;
+		log_offset += buf_size;
+	}
+err:
+	dma_free_coherent(&adapter->pdev->dev, get_fat_cmd.size,
+			  get_fat_cmd.va, get_fat_cmd.dma);
+	mutex_unlock(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses synchronous mcc */
+int be_cmd_get_fw_ver(struct be_adapter *adapter)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_fw_version *req;
+	int status;
+
+	mutex_lock(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_FW_VERSION, sizeof(*req), wrb,
+			       NULL);
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_fw_version *resp = embedded_payload(wrb);
+
+		strlcpy(adapter->fw_ver, resp->firmware_version_string,
+			sizeof(adapter->fw_ver));
+		strlcpy(adapter->fw_on_flash, resp->fw_on_flash_version_string,
+			sizeof(adapter->fw_on_flash));
+	}
+err:
+	mutex_unlock(&adapter->mcc_lock);
+	return status;
+}
+
+/* Set the EQ delay interval of an EQ to the specified value
+ * Uses async mcc
+ */
+static int __be_cmd_modify_eqd(struct be_adapter *adapter,
+			       struct be_set_eqd *set_eqd, int num)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_modify_eq_delay *req;
+	int status = 0, i;
+
+	mutex_lock(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_MODIFY_EQ_DELAY, sizeof(*req), wrb,
+			       NULL);
+
+	req->num_eq = cpu_to_le32(num);
+	for (i = 0; i < num; i++) {
+		req->set_eqd[i].eq_id = cpu_to_le32(set_eqd[i].eq_id);
+		req->set_eqd[i].phase = 0;
+		req->set_eqd[i].delay_multiplier =
+				cpu_to_le32(set_eqd[i].delay_multiplier);
+	}
+
+	status = be_mcc_notify(adapter);
+err:
+	mutex_unlock(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *set_eqd,
+		      int num)
+{
+	int num_eqs, i = 0;
+
+	while (num) {
+		num_eqs = min(num, 8);
+		__be_cmd_modify_eqd(adapter, &set_eqd[i], num_eqs);
+		i += num_eqs;
+		num -= num_eqs;
+	}
+
+	return 0;
+}
+
+/* Uses synchronous mcc */
+int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array,
+		       u32 num, u32 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_vlan_config *req;
+	int status;
+
+	mutex_lock(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_NTWK_VLAN_CONFIG, sizeof(*req),
+			       wrb, NULL);
+	req->hdr.domain = domain;
+
+	req->interface_id = if_id;
+	req->untagged = BE_IF_FLAGS_UNTAGGED & be_if_cap_flags(adapter) ?
+			1 : 0;
+	req->num_vlan = num;
+	memcpy(req->normal_vlan, vtag_array,
+	       req->num_vlan * sizeof(vtag_array[0]));
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	mutex_unlock(&adapter->mcc_lock);
+	return status;
+}
+
+static int __be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 value)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_dma_mem *mem = &adapter->rx_filter;
+	struct be_cmd_req_rx_filter *req = mem->va;
+	int status;
+
+	mutex_lock(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	memset(req, 0, sizeof(*req));
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_NTWK_RX_FILTER, sizeof(*req),
+			       wrb, mem);
+
+	req->if_id = cpu_to_le32(adapter->if_handle);
+	req->if_flags_mask = cpu_to_le32(flags);
+	req->if_flags = (value == ON) ? req->if_flags_mask : 0;
+
+	if (flags & BE_IF_FLAGS_MULTICAST) {
+		int i;
+
+		/* Reset mcast promisc mode if already set by setting mask
+		 * and not setting flags field
+		 */
+		req->if_flags_mask |=
+			cpu_to_le32(BE_IF_FLAGS_MCAST_PROMISCUOUS &
+				    be_if_cap_flags(adapter));
+		req->mcast_num = cpu_to_le32(adapter->mc_count);
+		for (i = 0; i < adapter->mc_count; i++)
+			ether_addr_copy(req->mcast_mac[i].byte,
+					adapter->mc_list[i].mac);
+	}
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	mutex_unlock(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 value)
+{
+	struct device *dev = &adapter->pdev->dev;
+
+	if ((flags & be_if_cap_flags(adapter)) != flags) {
+		dev_warn(dev, "Cannot set rx filter flags 0x%x\n", flags);
+		dev_warn(dev, "Interface is capable of 0x%x flags only\n",
+			 be_if_cap_flags(adapter));
+	}
+	flags &= be_if_cap_flags(adapter);
+	if (!flags)
+		return -ENOTSUPP;
+
+	return __be_cmd_rx_filter(adapter, flags, value);
+}
+
+/* Uses synchronous mcc */
+int be_cmd_set_flow_control(struct be_adapter *adapter, u32 tx_fc, u32 rx_fc)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_flow_control *req;
+	int status;
+
+	if (!be_cmd_allowed(adapter, OPCODE_COMMON_SET_FLOW_CONTROL,
+			    CMD_SUBSYSTEM_COMMON))
+		return -EPERM;
+
+	mutex_lock(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_FLOW_CONTROL, sizeof(*req),
+			       wrb, NULL);
+
+	req->hdr.version = 1;
+	req->tx_flow_control = cpu_to_le16((u16)tx_fc);
+	req->rx_flow_control = cpu_to_le16((u16)rx_fc);
+
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	mutex_unlock(&adapter->mcc_lock);
+
+	if (base_status(status) == MCC_STATUS_FEATURE_NOT_SUPPORTED)
+		return -EOPNOTSUPP;
+
+	return status;
+}
+
+/* Uses sync mcc */
+int be_cmd_get_flow_control(struct be_adapter *adapter, u32 *tx_fc, u32 *rx_fc)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_flow_control *req;
+	int status;
+
+	if (!be_cmd_allowed(adapter, OPCODE_COMMON_GET_FLOW_CONTROL,
+			    CMD_SUBSYSTEM_COMMON))
+		return -EPERM;
+
+	mutex_lock(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_FLOW_CONTROL, sizeof(*req),
+			       wrb, NULL);
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_flow_control *resp =
+						embedded_payload(wrb);
+
+		*tx_fc = le16_to_cpu(resp->tx_flow_control);
+		*rx_fc = le16_to_cpu(resp->rx_flow_control);
+	}
+
+err:
+
mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Uses mbox */ +int be_cmd_query_fw_cfg(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_query_fw_cfg *req; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_QUERY_FIRMWARE_CONFIG, + sizeof(*req), wrb, NULL); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_query_fw_cfg *resp = embedded_payload(wrb); + + adapter->port_num = le32_to_cpu(resp->phys_port); + adapter->function_mode = le32_to_cpu(resp->function_mode); + adapter->function_caps = le32_to_cpu(resp->function_caps); + adapter->asic_rev = le32_to_cpu(resp->asic_revision) & 0xFF; + dev_info(&adapter->pdev->dev, + "FW config: function_mode=0x%x, function_caps=0x%x\n", + adapter->function_mode, adapter->function_caps); + } + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Uses mbox */ +int be_cmd_reset_function(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_hdr *req; + int status; + + if (lancer_chip(adapter)) { + iowrite32(SLI_PORT_CONTROL_IP_MASK, + adapter->db + SLIPORT_CONTROL_OFFSET); + status = lancer_wait_ready(adapter); + if (status) + dev_err(&adapter->pdev->dev, + "Adapter in non recoverable error\n"); + return status; + } + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(req, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_FUNCTION_RESET, sizeof(*req), wrb, + NULL); + + status = be_mbox_notify_wait(adapter); + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, + u32 rss_hash_opts, u16 table_size, const u8 *rss_hkey) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_rss_config *req; + int status; + + if (!(be_if_cap_flags(adapter) & BE_IF_FLAGS_RSS)) + return 0; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_RSS_CONFIG, sizeof(*req), wrb, NULL); + + req->if_id = cpu_to_le32(adapter->if_handle); + req->enable_rss = cpu_to_le16(rss_hash_opts); + req->cpu_table_size_log2 = cpu_to_le16(fls(table_size) - 1); + + if (!BEx_chip(adapter)) + req->hdr.version = 1; + + memcpy(req->cpu_table, rsstable, table_size); + memcpy(req->hash, rss_hkey, RSS_HASH_KEY_LEN); + be_dws_cpu_to_le(req->hash, sizeof(req->hash)); + + status = be_mcc_notify_wait(adapter); +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Uses sync mcc */ +int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num, + u8 bcn, u8 sts, u8 state) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_enable_disable_beacon *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_ENABLE_DISABLE_BEACON, + sizeof(*req), wrb, NULL); + + req->port_num = port_num; + req->beacon_state = state; + req->beacon_duration = bcn; + req->status_duration = sts; + + status = be_mcc_notify_wait(adapter); + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Uses sync mcc */ +int 
be_cmd_get_beacon_state(struct be_adapter *adapter, u8 port_num, u32 *state) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_beacon_state *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_BEACON_STATE, sizeof(*req), + wrb, NULL); + + req->port_num = port_num; + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_beacon_state *resp = + embedded_payload(wrb); + + *state = resp->beacon_state; + } + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Uses sync mcc */ +int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, + u8 page_num, u8 *data) +{ + struct be_dma_mem cmd; + struct be_mcc_wrb *wrb; + struct be_cmd_req_port_type *req; + int status; + + if (page_num > TR_PAGE_A2) + return -EINVAL; + + cmd.size = sizeof(struct be_cmd_resp_port_type); + cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma, + GFP_ATOMIC); + if (!cmd.va) { + dev_err(&adapter->pdev->dev, "Memory allocation failed\n"); + return -ENOMEM; + } + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_READ_TRANSRECV_DATA, + cmd.size, wrb, &cmd); + + req->port = cpu_to_le32(adapter->hba_port_num); + req->page_num = cpu_to_le32(page_num); + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_port_type *resp = cmd.va; + + memcpy(data, resp->page_data, PAGE_DATA_LEN); + } +err: + mutex_unlock(&adapter->mcc_lock); + dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma); + return status; +} + +static int lancer_cmd_write_object(struct be_adapter *adapter, + struct be_dma_mem *cmd, u32 data_size, + u32 data_offset, const char *obj_name, + u32 *data_written, u8 *change_status, + u8 *addn_status) +{ + struct be_mcc_wrb *wrb; + struct lancer_cmd_req_write_object *req; + struct lancer_cmd_resp_write_object *resp; + void *ctxt = NULL; + int status; + + mutex_lock(&adapter->mcc_lock); + adapter->flash_status = 0; + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err_unlock; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_WRITE_OBJECT, + sizeof(struct lancer_cmd_req_write_object), wrb, + NULL); + + ctxt = &req->context; + AMAP_SET_BITS(struct amap_lancer_write_obj_context, + write_length, ctxt, data_size); + + if (data_size == 0) + AMAP_SET_BITS(struct amap_lancer_write_obj_context, + eof, ctxt, 1); + else + AMAP_SET_BITS(struct amap_lancer_write_obj_context, + eof, ctxt, 0); + + be_dws_cpu_to_le(ctxt, sizeof(req->context)); + req->write_offset = cpu_to_le32(data_offset); + strlcpy(req->object_name, obj_name, sizeof(req->object_name)); + req->descriptor_count = cpu_to_le32(1); + req->buf_len = cpu_to_le32(data_size); + req->addr_low = cpu_to_le32((cmd->dma + + sizeof(struct lancer_cmd_req_write_object)) + & 0xFFFFFFFF); + req->addr_high = cpu_to_le32(upper_32_bits(cmd->dma + + sizeof(struct lancer_cmd_req_write_object))); + + status = be_mcc_notify(adapter); + if (status) + goto err_unlock; + + mutex_unlock(&adapter->mcc_lock); + + if (!wait_for_completion_timeout(&adapter->et_cmd_compl, + msecs_to_jiffies(60000))) + status = -ETIMEDOUT; + else + status = adapter->flash_status; + + resp = 
embedded_payload(wrb); + if (!status) { + *data_written = le32_to_cpu(resp->actual_write_len); + *change_status = resp->change_status; + } else { + *addn_status = resp->additional_status; + } + + return status; + +err_unlock: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_cmd_query_cable_type(struct be_adapter *adapter) +{ + u8 page_data[PAGE_DATA_LEN]; + int status; + + status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, + page_data); + if (!status) { + switch (adapter->phy.interface_type) { + case PHY_TYPE_QSFP: + adapter->phy.cable_type = + page_data[QSFP_PLUS_CABLE_TYPE_OFFSET]; + break; + case PHY_TYPE_SFP_PLUS_10GB: + adapter->phy.cable_type = + page_data[SFP_PLUS_CABLE_TYPE_OFFSET]; + break; + default: + adapter->phy.cable_type = 0; + break; + } + } + return status; +} + +int be_cmd_query_sfp_info(struct be_adapter *adapter) +{ + u8 page_data[PAGE_DATA_LEN]; + int status; + + status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, + page_data); + if (!status) { + strlcpy(adapter->phy.vendor_name, page_data + + SFP_VENDOR_NAME_OFFSET, SFP_VENDOR_NAME_LEN - 1); + strlcpy(adapter->phy.vendor_pn, + page_data + SFP_VENDOR_PN_OFFSET, + SFP_VENDOR_NAME_LEN - 1); + } + + return status; +} + +static int lancer_cmd_delete_object(struct be_adapter *adapter, + const char *obj_name) +{ + struct lancer_cmd_req_delete_object *req; + struct be_mcc_wrb *wrb; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_DELETE_OBJECT, + sizeof(*req), wrb, NULL); + + strlcpy(req->object_name, obj_name, sizeof(req->object_name)); + + status = be_mcc_notify_wait(adapter); +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int lancer_cmd_read_object(struct be_adapter *adapter, struct be_dma_mem *cmd, + u32 data_size, u32 data_offset, const char *obj_name, + u32 *data_read, u32 *eof, u8 *addn_status) +{ + struct be_mcc_wrb *wrb; + struct lancer_cmd_req_read_object *req; + struct lancer_cmd_resp_read_object *resp; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err_unlock; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_READ_OBJECT, + sizeof(struct lancer_cmd_req_read_object), wrb, + NULL); + + req->desired_read_len = cpu_to_le32(data_size); + req->read_offset = cpu_to_le32(data_offset); + strcpy(req->object_name, obj_name); + req->descriptor_count = cpu_to_le32(1); + req->buf_len = cpu_to_le32(data_size); + req->addr_low = cpu_to_le32((cmd->dma & 0xFFFFFFFF)); + req->addr_high = cpu_to_le32(upper_32_bits(cmd->dma)); + + status = be_mcc_notify_wait(adapter); + + resp = embedded_payload(wrb); + if (!status) { + *data_read = le32_to_cpu(resp->actual_read_len); + *eof = le32_to_cpu(resp->eof); + } else { + *addn_status = resp->additional_status; + } + +err_unlock: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +static int be_cmd_write_flashrom(struct be_adapter *adapter, + struct be_dma_mem *cmd, u32 flash_type, + u32 flash_opcode, u32 img_offset, u32 buf_size) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_write_flashrom *req; + int status; + + mutex_lock(&adapter->mcc_lock); + adapter->flash_status = 0; + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err_unlock; + } + req = cmd->va; + + 
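+	/* Non-embedded command: the WRITE_FLASHROM header and the data being
+	 * flashed share the DMA buffer passed in 'cmd' (req points at cmd->va;
+	 * the caller stages the image chunk in req->data_buf before this call).
+	 */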
be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_WRITE_FLASHROM, cmd->size, wrb, + cmd); + + req->params.op_type = cpu_to_le32(flash_type); + if (flash_type == OPTYPE_OFFSET_SPECIFIED) + req->params.offset = cpu_to_le32(img_offset); + + req->params.op_code = cpu_to_le32(flash_opcode); + req->params.data_buf_size = cpu_to_le32(buf_size); + + status = be_mcc_notify(adapter); + if (status) + goto err_unlock; + + mutex_unlock(&adapter->mcc_lock); + + if (!wait_for_completion_timeout(&adapter->et_cmd_compl, + msecs_to_jiffies(40000))) + status = -ETIMEDOUT; + else + status = adapter->flash_status; + + return status; + +err_unlock: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +static int be_cmd_get_flash_crc(struct be_adapter *adapter, u8 *flashed_crc, + u16 img_optype, u32 img_offset, u32 crc_offset) +{ + struct be_cmd_read_flash_crc *req; + struct be_mcc_wrb *wrb; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_READ_FLASHROM, sizeof(*req), + wrb, NULL); + + req->params.op_type = cpu_to_le32(img_optype); + if (img_optype == OPTYPE_OFFSET_SPECIFIED) + req->params.offset = cpu_to_le32(img_offset + crc_offset); + else + req->params.offset = cpu_to_le32(crc_offset); + + req->params.op_code = cpu_to_le32(FLASHROM_OPER_REPORT); + req->params.data_buf_size = cpu_to_le32(0x4); + + status = be_mcc_notify_wait(adapter); + if (!status) + memcpy(flashed_crc, req->crc, 4); + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +static char flash_cookie[2][16] = {"*** SE FLAS", "H DIRECTORY *** "}; + +static bool phy_flashing_required(struct be_adapter *adapter) +{ + return (adapter->phy.phy_type == PHY_TYPE_TN_8022 && + adapter->phy.interface_type == PHY_TYPE_BASET_10GB); +} + +static bool is_comp_in_ufi(struct be_adapter *adapter, + struct flash_section_info *fsec, int type) +{ + int i = 0, img_type = 0; + struct flash_section_info_g2 *fsec_g2 = NULL; + + if (BE2_chip(adapter)) + fsec_g2 = (struct flash_section_info_g2 *)fsec; + + for (i = 0; i < MAX_FLASH_COMP; i++) { + if (fsec_g2) + img_type = le32_to_cpu(fsec_g2->fsec_entry[i].type); + else + img_type = le32_to_cpu(fsec->fsec_entry[i].type); + + if (img_type == type) + return true; + } + return false; +} + +static struct flash_section_info *get_fsec_info(struct be_adapter *adapter, + int header_size, + const struct firmware *fw) +{ + struct flash_section_info *fsec = NULL; + const u8 *p = fw->data; + + p += header_size; + while (p < (fw->data + fw->size)) { + fsec = (struct flash_section_info *)p; + if (!memcmp(flash_cookie, fsec->cookie, sizeof(flash_cookie))) + return fsec; + p += 32; + } + return NULL; +} + +static int be_check_flash_crc(struct be_adapter *adapter, const u8 *p, + u32 img_offset, u32 img_size, int hdr_size, + u16 img_optype, bool *crc_match) +{ + u32 crc_offset; + int status; + u8 crc[4]; + + status = be_cmd_get_flash_crc(adapter, crc, img_optype, img_offset, + img_size - 4); + if (status) + return status; + + crc_offset = hdr_size + img_offset + img_size - 4; + + /* Skip flashing, if crc of flashed region matches */ + if (!memcmp(crc, p + crc_offset, 4)) + *crc_match = true; + else + *crc_match = false; + + return status; +} + +static int be_flash(struct be_adapter *adapter, const u8 *img, + struct be_dma_mem *flash_cmd, int optype, int img_size, + u32 img_offset) +{ + u32 flash_op, num_bytes, 
total_bytes = img_size, bytes_sent = 0; + struct be_cmd_write_flashrom *req = flash_cmd->va; + int status; + + while (total_bytes) { + num_bytes = min_t(u32, 32 * 1024, total_bytes); + + total_bytes -= num_bytes; + + if (!total_bytes) { + if (optype == OPTYPE_PHY_FW) + flash_op = FLASHROM_OPER_PHY_FLASH; + else + flash_op = FLASHROM_OPER_FLASH; + } else { + if (optype == OPTYPE_PHY_FW) + flash_op = FLASHROM_OPER_PHY_SAVE; + else + flash_op = FLASHROM_OPER_SAVE; + } + + memcpy(req->data_buf, img, num_bytes); + img += num_bytes; + status = be_cmd_write_flashrom(adapter, flash_cmd, optype, + flash_op, img_offset + + bytes_sent, num_bytes); + if (base_status(status) == MCC_STATUS_ILLEGAL_REQUEST && + optype == OPTYPE_PHY_FW) + break; + else if (status) + return status; + + bytes_sent += num_bytes; + } + return 0; +} + +#define NCSI_UPDATE_LOG "NCSI section update is not supported in FW ver %s\n" +static bool be_fw_ncsi_supported(char *ver) +{ + int v1[4] = {3, 102, 148, 0}; /* Min ver that supports NCSI FW */ + int v2[4]; + int i; + + if (sscanf(ver, "%d.%d.%d.%d", &v2[0], &v2[1], &v2[2], &v2[3]) != 4) + return false; + + for (i = 0; i < 4; i++) { + if (v1[i] < v2[i]) + return true; + else if (v1[i] > v2[i]) + return false; + } + + return true; +} + +/* For BE2, BE3 and BE3-R */ +static int be_flash_BEx(struct be_adapter *adapter, + const struct firmware *fw, + struct be_dma_mem *flash_cmd, int num_of_images) +{ + int img_hdrs_size = (num_of_images * sizeof(struct image_hdr)); + struct device *dev = &adapter->pdev->dev; + struct flash_section_info *fsec = NULL; + int status, i, filehdr_size, num_comp; + const struct flash_comp *pflashcomp; + bool crc_match; + const u8 *p; + + struct flash_comp gen3_flash_types[] = { + { BE3_ISCSI_PRIMARY_IMAGE_START, OPTYPE_ISCSI_ACTIVE, + BE3_COMP_MAX_SIZE, IMAGE_FIRMWARE_ISCSI}, + { BE3_REDBOOT_START, OPTYPE_REDBOOT, + BE3_REDBOOT_COMP_MAX_SIZE, IMAGE_BOOT_CODE}, + { BE3_ISCSI_BIOS_START, OPTYPE_BIOS, + BE3_BIOS_COMP_MAX_SIZE, IMAGE_OPTION_ROM_ISCSI}, + { BE3_PXE_BIOS_START, OPTYPE_PXE_BIOS, + BE3_BIOS_COMP_MAX_SIZE, IMAGE_OPTION_ROM_PXE}, + { BE3_FCOE_BIOS_START, OPTYPE_FCOE_BIOS, + BE3_BIOS_COMP_MAX_SIZE, IMAGE_OPTION_ROM_FCOE}, + { BE3_ISCSI_BACKUP_IMAGE_START, OPTYPE_ISCSI_BACKUP, + BE3_COMP_MAX_SIZE, IMAGE_FIRMWARE_BACKUP_ISCSI}, + { BE3_FCOE_PRIMARY_IMAGE_START, OPTYPE_FCOE_FW_ACTIVE, + BE3_COMP_MAX_SIZE, IMAGE_FIRMWARE_FCOE}, + { BE3_FCOE_BACKUP_IMAGE_START, OPTYPE_FCOE_FW_BACKUP, + BE3_COMP_MAX_SIZE, IMAGE_FIRMWARE_BACKUP_FCOE}, + { BE3_NCSI_START, OPTYPE_NCSI_FW, + BE3_NCSI_COMP_MAX_SIZE, IMAGE_NCSI}, + { BE3_PHY_FW_START, OPTYPE_PHY_FW, + BE3_PHY_FW_COMP_MAX_SIZE, IMAGE_FIRMWARE_PHY} + }; + + struct flash_comp gen2_flash_types[] = { + { BE2_ISCSI_PRIMARY_IMAGE_START, OPTYPE_ISCSI_ACTIVE, + BE2_COMP_MAX_SIZE, IMAGE_FIRMWARE_ISCSI}, + { BE2_REDBOOT_START, OPTYPE_REDBOOT, + BE2_REDBOOT_COMP_MAX_SIZE, IMAGE_BOOT_CODE}, + { BE2_ISCSI_BIOS_START, OPTYPE_BIOS, + BE2_BIOS_COMP_MAX_SIZE, IMAGE_OPTION_ROM_ISCSI}, + { BE2_PXE_BIOS_START, OPTYPE_PXE_BIOS, + BE2_BIOS_COMP_MAX_SIZE, IMAGE_OPTION_ROM_PXE}, + { BE2_FCOE_BIOS_START, OPTYPE_FCOE_BIOS, + BE2_BIOS_COMP_MAX_SIZE, IMAGE_OPTION_ROM_FCOE}, + { BE2_ISCSI_BACKUP_IMAGE_START, OPTYPE_ISCSI_BACKUP, + BE2_COMP_MAX_SIZE, IMAGE_FIRMWARE_BACKUP_ISCSI}, + { BE2_FCOE_PRIMARY_IMAGE_START, OPTYPE_FCOE_FW_ACTIVE, + BE2_COMP_MAX_SIZE, IMAGE_FIRMWARE_FCOE}, + { BE2_FCOE_BACKUP_IMAGE_START, OPTYPE_FCOE_FW_BACKUP, + BE2_COMP_MAX_SIZE, IMAGE_FIRMWARE_BACKUP_FCOE} + }; + + if (BE3_chip(adapter)) { + pflashcomp = 
gen3_flash_types; + filehdr_size = sizeof(struct flash_file_hdr_g3); + num_comp = ARRAY_SIZE(gen3_flash_types); + } else { + pflashcomp = gen2_flash_types; + filehdr_size = sizeof(struct flash_file_hdr_g2); + num_comp = ARRAY_SIZE(gen2_flash_types); + img_hdrs_size = 0; + } + + /* Get flash section info*/ + fsec = get_fsec_info(adapter, filehdr_size + img_hdrs_size, fw); + if (!fsec) { + dev_err(dev, "Invalid Cookie. FW image may be corrupted\n"); + return -1; + } + for (i = 0; i < num_comp; i++) { + if (!is_comp_in_ufi(adapter, fsec, pflashcomp[i].img_type)) + continue; + + if ((pflashcomp[i].optype == OPTYPE_NCSI_FW) && + !be_fw_ncsi_supported(adapter->fw_ver)) { + dev_info(dev, NCSI_UPDATE_LOG, adapter->fw_ver); + continue; + } + + if (pflashcomp[i].optype == OPTYPE_PHY_FW && + !phy_flashing_required(adapter)) + continue; + + if (pflashcomp[i].optype == OPTYPE_REDBOOT) { + status = be_check_flash_crc(adapter, fw->data, + pflashcomp[i].offset, + pflashcomp[i].size, + filehdr_size + + img_hdrs_size, + OPTYPE_REDBOOT, &crc_match); + if (status) { + dev_err(dev, + "Could not get CRC for 0x%x region\n", + pflashcomp[i].optype); + continue; + } + + if (crc_match) + continue; + } + + p = fw->data + filehdr_size + pflashcomp[i].offset + + img_hdrs_size; + if (p + pflashcomp[i].size > fw->data + fw->size) + return -1; + + status = be_flash(adapter, p, flash_cmd, pflashcomp[i].optype, + pflashcomp[i].size, 0); + if (status) { + dev_err(dev, "Flashing section type 0x%x failed\n", + pflashcomp[i].img_type); + return status; + } + } + return 0; +} + +static u16 be_get_img_optype(struct flash_section_entry fsec_entry) +{ + u32 img_type = le32_to_cpu(fsec_entry.type); + u16 img_optype = le16_to_cpu(fsec_entry.optype); + + if (img_optype != 0xFFFF) + return img_optype; + + switch (img_type) { + case IMAGE_FIRMWARE_ISCSI: + img_optype = OPTYPE_ISCSI_ACTIVE; + break; + case IMAGE_BOOT_CODE: + img_optype = OPTYPE_REDBOOT; + break; + case IMAGE_OPTION_ROM_ISCSI: + img_optype = OPTYPE_BIOS; + break; + case IMAGE_OPTION_ROM_PXE: + img_optype = OPTYPE_PXE_BIOS; + break; + case IMAGE_OPTION_ROM_FCOE: + img_optype = OPTYPE_FCOE_BIOS; + break; + case IMAGE_FIRMWARE_BACKUP_ISCSI: + img_optype = OPTYPE_ISCSI_BACKUP; + break; + case IMAGE_NCSI: + img_optype = OPTYPE_NCSI_FW; + break; + case IMAGE_FLASHISM_JUMPVECTOR: + img_optype = OPTYPE_FLASHISM_JUMPVECTOR; + break; + case IMAGE_FIRMWARE_PHY: + img_optype = OPTYPE_SH_PHY_FW; + break; + case IMAGE_REDBOOT_DIR: + img_optype = OPTYPE_REDBOOT_DIR; + break; + case IMAGE_REDBOOT_CONFIG: + img_optype = OPTYPE_REDBOOT_CONFIG; + break; + case IMAGE_UFI_DIR: + img_optype = OPTYPE_UFI_DIR; + break; + default: + break; + } + + return img_optype; +} + +static int be_flash_skyhawk(struct be_adapter *adapter, + const struct firmware *fw, + struct be_dma_mem *flash_cmd, int num_of_images) +{ + int img_hdrs_size = num_of_images * sizeof(struct image_hdr); + bool crc_match, old_fw_img, flash_offset_support = true; + struct device *dev = &adapter->pdev->dev; + struct flash_section_info *fsec = NULL; + u32 img_offset, img_size, img_type; + u16 img_optype, flash_optype; + int status, i, filehdr_size; + const u8 *p; + + filehdr_size = sizeof(struct flash_file_hdr_g3); + fsec = get_fsec_info(adapter, filehdr_size + img_hdrs_size, fw); + if (!fsec) { + dev_err(dev, "Invalid Cookie. 
FW image may be corrupted\n"); + return -EINVAL; + } + +retry_flash: + for (i = 0; i < le32_to_cpu(fsec->fsec_hdr.num_images); i++) { + img_offset = le32_to_cpu(fsec->fsec_entry[i].offset); + img_size = le32_to_cpu(fsec->fsec_entry[i].pad_size); + img_type = le32_to_cpu(fsec->fsec_entry[i].type); + img_optype = be_get_img_optype(fsec->fsec_entry[i]); + old_fw_img = fsec->fsec_entry[i].optype == 0xFFFF; + + if (img_optype == 0xFFFF) + continue; + + if (flash_offset_support) + flash_optype = OPTYPE_OFFSET_SPECIFIED; + else + flash_optype = img_optype; + + /* Don't bother verifying CRC if an old FW image is being + * flashed + */ + if (old_fw_img) + goto flash; + + status = be_check_flash_crc(adapter, fw->data, img_offset, + img_size, filehdr_size + + img_hdrs_size, flash_optype, + &crc_match); + if (base_status(status) == MCC_STATUS_ILLEGAL_REQUEST || + base_status(status) == MCC_STATUS_ILLEGAL_FIELD) { + /* The current FW image on the card does not support + * OFFSET based flashing. Retry using older mechanism + * of OPTYPE based flashing + */ + if (flash_optype == OPTYPE_OFFSET_SPECIFIED) { + flash_offset_support = false; + goto retry_flash; + } + + /* The current FW image on the card does not recognize + * the new FLASH op_type. The FW download is partially + * complete. Reboot the server now to enable FW image + * to recognize the new FLASH op_type. To complete the + * remaining process, download the same FW again after + * the reboot. + */ + dev_err(dev, "Flash incomplete. Reset the server\n"); + dev_err(dev, "Download FW image again after reset\n"); + return -EAGAIN; + } else if (status) { + dev_err(dev, "Could not get CRC for 0x%x region\n", + img_optype); + return -EFAULT; + } + + if (crc_match) + continue; + +flash: + p = fw->data + filehdr_size + img_offset + img_hdrs_size; + if (p + img_size > fw->data + fw->size) + return -1; + + status = be_flash(adapter, p, flash_cmd, flash_optype, img_size, + img_offset); + + /* The current FW image on the card does not support OFFSET + * based flashing. 
Retry using older mechanism of OPTYPE based + * flashing + */ + if (base_status(status) == MCC_STATUS_ILLEGAL_FIELD && + flash_optype == OPTYPE_OFFSET_SPECIFIED) { + flash_offset_support = false; + goto retry_flash; + } + + /* For old FW images ignore ILLEGAL_FIELD error or errors on + * UFI_DIR region + */ + if (old_fw_img && + (base_status(status) == MCC_STATUS_ILLEGAL_FIELD || + (img_optype == OPTYPE_UFI_DIR && + base_status(status) == MCC_STATUS_FAILED))) { + continue; + } else if (status) { + dev_err(dev, "Flashing section type 0x%x failed\n", + img_type); + + switch (addl_status(status)) { + case MCC_ADDL_STATUS_MISSING_SIGNATURE: + dev_err(dev, + "Digital signature missing in FW\n"); + return -EINVAL; + case MCC_ADDL_STATUS_INVALID_SIGNATURE: + dev_err(dev, + "Invalid digital signature in FW\n"); + return -EINVAL; + default: + return -EFAULT; + } + } + } + return 0; +} + +int lancer_fw_download(struct be_adapter *adapter, + const struct firmware *fw) +{ + struct device *dev = &adapter->pdev->dev; + struct be_dma_mem flash_cmd; + const u8 *data_ptr = NULL; + u8 *dest_image_ptr = NULL; + size_t image_size = 0; + u32 chunk_size = 0; + u32 data_written = 0; + u32 offset = 0; + int status = 0; + u8 add_status = 0; + u8 change_status; + + if (!IS_ALIGNED(fw->size, sizeof(u32))) { + dev_err(dev, "FW image size should be multiple of 4\n"); + return -EINVAL; + } + + flash_cmd.size = sizeof(struct lancer_cmd_req_write_object) + + LANCER_FW_DOWNLOAD_CHUNK; + flash_cmd.va = dma_zalloc_coherent(dev, flash_cmd.size, + &flash_cmd.dma, GFP_KERNEL); + if (!flash_cmd.va) + return -ENOMEM; + + dest_image_ptr = flash_cmd.va + + sizeof(struct lancer_cmd_req_write_object); + image_size = fw->size; + data_ptr = fw->data; + + while (image_size) { + chunk_size = min_t(u32, image_size, LANCER_FW_DOWNLOAD_CHUNK); + + /* Copy the image chunk content. */ + memcpy(dest_image_ptr, data_ptr, chunk_size); + + status = lancer_cmd_write_object(adapter, &flash_cmd, + chunk_size, offset, + LANCER_FW_DOWNLOAD_LOCATION, + &data_written, &change_status, + &add_status); + if (status) + break; + + offset += data_written; + data_ptr += data_written; + image_size -= data_written; + } + + if (!status) { + /* Commit the FW written */ + status = lancer_cmd_write_object(adapter, &flash_cmd, + 0, offset, + LANCER_FW_DOWNLOAD_LOCATION, + &data_written, &change_status, + &add_status); + } + + dma_free_coherent(dev, flash_cmd.size, flash_cmd.va, flash_cmd.dma); + if (status) { + dev_err(dev, "Firmware load error\n"); + return be_cmd_status(status); + } + + dev_info(dev, "Firmware flashed successfully\n"); + + if (change_status == LANCER_FW_RESET_NEEDED) { + dev_info(dev, "Resetting adapter to activate new FW\n"); + status = lancer_physdev_ctrl(adapter, + PHYSDEV_CONTROL_FW_RESET_MASK); + if (status) { + dev_err(dev, "Adapter busy, could not reset FW\n"); + dev_err(dev, "Reboot server to activate new FW\n"); + } + } else if (change_status != LANCER_NO_RESET_NEEDED) { + dev_info(dev, "Reboot server to activate new FW\n"); + } + + return 0; +} + +/* Check if the flash image file is compatible with the adapter that + * is being flashed. + */ +static bool be_check_ufi_compatibility(struct be_adapter *adapter, + struct flash_file_hdr_g3 *fhdr) +{ + if (!fhdr) { + dev_err(&adapter->pdev->dev, "Invalid FW UFI file"); + return false; + } + + /* First letter of the build version is used to identify + * which chip this image file is meant for. 
+ */ + switch (fhdr->build[0]) { + case BLD_STR_UFI_TYPE_SH: + if (!skyhawk_chip(adapter)) + return false; + break; + case BLD_STR_UFI_TYPE_BE3: + if (!BE3_chip(adapter)) + return false; + break; + case BLD_STR_UFI_TYPE_BE2: + if (!BE2_chip(adapter)) + return false; + break; + default: + return false; + } + + /* In BE3 FW images the "asic_type_rev" field doesn't track the + * asic_rev of the chips it is compatible with. + * When asic_type_rev is 0 the image is compatible only with + * pre-BE3-R chips (asic_rev < 0x10) + */ + if (BEx_chip(adapter) && fhdr->asic_type_rev == 0) + return adapter->asic_rev < 0x10; + else + return (fhdr->asic_type_rev >= adapter->asic_rev); +} + +int be_fw_download(struct be_adapter *adapter, const struct firmware *fw) +{ + struct device *dev = &adapter->pdev->dev; + struct flash_file_hdr_g3 *fhdr3; + struct image_hdr *img_hdr_ptr; + int status = 0, i, num_imgs; + struct be_dma_mem flash_cmd; + + fhdr3 = (struct flash_file_hdr_g3 *)fw->data; + if (!be_check_ufi_compatibility(adapter, fhdr3)) { + dev_err(dev, "Flash image is not compatible with adapter\n"); + return -EINVAL; + } + + flash_cmd.size = sizeof(struct be_cmd_write_flashrom); + flash_cmd.va = dma_zalloc_coherent(dev, flash_cmd.size, &flash_cmd.dma, + GFP_KERNEL); + if (!flash_cmd.va) + return -ENOMEM; + + num_imgs = le32_to_cpu(fhdr3->num_imgs); + for (i = 0; i < num_imgs; i++) { + img_hdr_ptr = (struct image_hdr *)(fw->data + + (sizeof(struct flash_file_hdr_g3) + + i * sizeof(struct image_hdr))); + if (!BE2_chip(adapter) && + le32_to_cpu(img_hdr_ptr->imageid) != 1) + continue; + + if (skyhawk_chip(adapter)) + status = be_flash_skyhawk(adapter, fw, &flash_cmd, + num_imgs); + else + status = be_flash_BEx(adapter, fw, &flash_cmd, + num_imgs); + } + + dma_free_coherent(dev, flash_cmd.size, flash_cmd.va, flash_cmd.dma); + if (!status) + dev_info(dev, "Firmware flashed successfully\n"); + + return status; +} + +int be_cmd_enable_magic_wol(struct be_adapter *adapter, u8 *mac, + struct be_dma_mem *nonemb_cmd) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_acpi_wol_magic_config *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = nonemb_cmd->va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG, sizeof(*req), + wrb, nonemb_cmd); + memcpy(req->magic_mac, mac, ETH_ALEN); + + status = be_mcc_notify_wait(adapter); + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_cmd_set_loopback(struct be_adapter *adapter, u8 port_num, + u8 loopback_type, u8 enable) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_lmode *req; + int status; + + if (!be_cmd_allowed(adapter, OPCODE_LOWLEVEL_SET_LOOPBACK_MODE, + CMD_SUBSYSTEM_LOWLEVEL)) + return -EPERM; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err_unlock; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_LOWLEVEL, + OPCODE_LOWLEVEL_SET_LOOPBACK_MODE, sizeof(*req), + wrb, NULL); + + req->src_port = port_num; + req->dest_port = port_num; + req->loopback_type = loopback_type; + req->loopback_state = enable; + + status = be_mcc_notify(adapter); + if (status) + goto err_unlock; + + mutex_unlock(&adapter->mcc_lock); + + if (!wait_for_completion_timeout(&adapter->et_cmd_compl, + msecs_to_jiffies(SET_LB_MODE_TIMEOUT))) + status = -ETIMEDOUT; + + return status; + +err_unlock: + mutex_unlock(&adapter->mcc_lock); + return 
status; +} + +int be_cmd_loopback_test(struct be_adapter *adapter, u32 port_num, + u32 loopback_type, u32 pkt_size, u32 num_pkts, + u64 pattern) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_loopback_test *req; + struct be_cmd_resp_loopback_test *resp; + int status; + + if (!be_cmd_allowed(adapter, OPCODE_LOWLEVEL_LOOPBACK_TEST, + CMD_SUBSYSTEM_LOWLEVEL)) + return -EPERM; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_LOWLEVEL, + OPCODE_LOWLEVEL_LOOPBACK_TEST, sizeof(*req), wrb, + NULL); + + req->hdr.timeout = cpu_to_le32(15); + req->pattern = cpu_to_le64(pattern); + req->src_port = cpu_to_le32(port_num); + req->dest_port = cpu_to_le32(port_num); + req->pkt_size = cpu_to_le32(pkt_size); + req->num_pkts = cpu_to_le32(num_pkts); + req->loopback_type = cpu_to_le32(loopback_type); + + status = be_mcc_notify(adapter); + if (status) + goto err; + + mutex_unlock(&adapter->mcc_lock); + + wait_for_completion(&adapter->et_cmd_compl); + resp = embedded_payload(wrb); + status = le32_to_cpu(resp->status); + + return status; +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_cmd_ddr_dma_test(struct be_adapter *adapter, u64 pattern, + u32 byte_cnt, struct be_dma_mem *cmd) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_ddrdma_test *req; + int status; + int i, j = 0; + + if (!be_cmd_allowed(adapter, OPCODE_LOWLEVEL_HOST_DDR_DMA, + CMD_SUBSYSTEM_LOWLEVEL)) + return -EPERM; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = cmd->va; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_LOWLEVEL, + OPCODE_LOWLEVEL_HOST_DDR_DMA, cmd->size, wrb, + cmd); + + req->pattern = cpu_to_le64(pattern); + req->byte_count = cpu_to_le32(byte_cnt); + for (i = 0; i < byte_cnt; i++) { + req->snd_buff[i] = (u8)(pattern >> (j*8)); + j++; + if (j > 7) + j = 0; + } + + status = be_mcc_notify_wait(adapter); + + if (!status) { + struct be_cmd_resp_ddrdma_test *resp; + + resp = cmd->va; + if ((memcmp(resp->rcv_buff, req->snd_buff, byte_cnt) != 0) || + resp->snd_err) { + status = -1; + } + } + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_cmd_get_seeprom_data(struct be_adapter *adapter, + struct be_dma_mem *nonemb_cmd) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_seeprom_read *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = nonemb_cmd->va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SEEPROM_READ, sizeof(*req), wrb, + nonemb_cmd); + + status = be_mcc_notify_wait(adapter); + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_cmd_get_phy_info(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_phy_info *req; + struct be_dma_mem cmd; + int status; + + if (!be_cmd_allowed(adapter, OPCODE_COMMON_GET_PHY_DETAILS, + CMD_SUBSYSTEM_COMMON)) + return -EPERM; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + cmd.size = sizeof(struct be_cmd_req_get_phy_info); + cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma, + GFP_ATOMIC); + if (!cmd.va) { + dev_err(&adapter->pdev->dev, "Memory alloc failure\n"); + status = -ENOMEM; + goto err; + } + + req = cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, 
CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_PHY_DETAILS, sizeof(*req), + wrb, &cmd); + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_phy_info *resp_phy_info = + cmd.va + sizeof(struct be_cmd_req_hdr); + + adapter->phy.phy_type = le16_to_cpu(resp_phy_info->phy_type); + adapter->phy.interface_type = + le16_to_cpu(resp_phy_info->interface_type); + adapter->phy.auto_speeds_supported = + le16_to_cpu(resp_phy_info->auto_speeds_supported); + adapter->phy.fixed_speeds_supported = + le16_to_cpu(resp_phy_info->fixed_speeds_supported); + adapter->phy.misc_params = + le32_to_cpu(resp_phy_info->misc_params); + + if (BE2_chip(adapter)) { + adapter->phy.fixed_speeds_supported = + BE_SUPPORTED_SPEED_10GBPS | + BE_SUPPORTED_SPEED_1GBPS; + } + } + dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma); +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +static int be_cmd_set_qos(struct be_adapter *adapter, u32 bps, u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_qos *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_QOS, sizeof(*req), wrb, NULL); + + req->hdr.domain = domain; + req->valid_bits = cpu_to_le32(BE_QOS_BITS_NIC); + req->max_bps_nic = cpu_to_le32(bps); + + status = be_mcc_notify_wait(adapter); + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_cmd_get_cntl_attributes(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_cntl_attribs *req; + struct be_cmd_resp_cntl_attribs *resp; + int status, i; + int payload_len = max(sizeof(*req), sizeof(*resp)); + struct mgmt_controller_attrib *attribs; + struct be_dma_mem attribs_cmd; + u32 *serial_num; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + memset(&attribs_cmd, 0, sizeof(struct be_dma_mem)); + attribs_cmd.size = sizeof(struct be_cmd_resp_cntl_attribs); + attribs_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, + attribs_cmd.size, + &attribs_cmd.dma, GFP_ATOMIC); + if (!attribs_cmd.va) { + dev_err(&adapter->pdev->dev, "Memory allocation failure\n"); + status = -ENOMEM; + goto err; + } + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = attribs_cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_CNTL_ATTRIBUTES, payload_len, + wrb, &attribs_cmd); + + status = be_mbox_notify_wait(adapter); + if (!status) { + attribs = attribs_cmd.va + sizeof(struct be_cmd_resp_hdr); + adapter->hba_port_num = attribs->hba_attribs.phy_port; + serial_num = attribs->hba_attribs.controller_serial_number; + for (i = 0; i < CNTL_SERIAL_NUM_WORDS; i++) + adapter->serial_num[i] = le32_to_cpu(serial_num[i]) & + (BIT_MASK(16) - 1); + /* For BEx, since GET_FUNC_CONFIG command is not + * supported, we read funcnum here as a workaround. 
+ */ + if (BEx_chip(adapter)) + adapter->pf_num = attribs->hba_attribs.pci_funcnum; + } + +err: + mutex_unlock(&adapter->mbox_lock); + if (attribs_cmd.va) + dma_free_coherent(&adapter->pdev->dev, attribs_cmd.size, + attribs_cmd.va, attribs_cmd.dma); + return status; +} + +/* Uses mbox */ +int be_cmd_req_native_mode(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_func_cap *req; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_DRIVER_FUNCTION_CAP, + sizeof(*req), wrb, NULL); + + req->valid_cap_flags = cpu_to_le32(CAPABILITY_SW_TIMESTAMPS | + CAPABILITY_BE3_NATIVE_ERX_API); + req->cap_flags = cpu_to_le32(CAPABILITY_BE3_NATIVE_ERX_API); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_set_func_cap *resp = embedded_payload(wrb); + + adapter->be3_native = le32_to_cpu(resp->cap_flags) & + CAPABILITY_BE3_NATIVE_ERX_API; + if (!adapter->be3_native) + dev_warn(&adapter->pdev->dev, + "adapter not in advanced mode\n"); + } +err: + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Get privilege(s) for a function */ +int be_cmd_get_fn_privileges(struct be_adapter *adapter, u32 *privilege, + u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_fn_privileges *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_FN_PRIVILEGES, sizeof(*req), + wrb, NULL); + + req->hdr.domain = domain; + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_fn_privileges *resp = + embedded_payload(wrb); + + *privilege = le32_to_cpu(resp->privilege_mask); + + /* In UMC mode FW does not return right privileges. + * Override with correct privilege equivalent to PF. + */ + if (BEx_chip(adapter) && be_is_mc(adapter) && + be_physfn(adapter)) + *privilege = MAX_PRIVILEGES; + } + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Set privilege(s) for a function */ +int be_cmd_set_fn_privileges(struct be_adapter *adapter, u32 privileges, + u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_fn_privileges *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_FN_PRIVILEGES, sizeof(*req), + wrb, NULL); + req->hdr.domain = domain; + if (lancer_chip(adapter)) + req->privileges_lancer = cpu_to_le32(privileges); + else + req->privileges = cpu_to_le32(privileges); + + status = be_mcc_notify_wait(adapter); +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* pmac_id_valid: true => pmac_id is supplied and MAC address is requested. + * pmac_id_valid: false => pmac_id or MAC address is requested. 
+ * If pmac_id is returned, pmac_id_valid is returned as true + */ +int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac, + bool *pmac_id_valid, u32 *pmac_id, u32 if_handle, + u8 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_mac_list *req; + int status; + int mac_count; + struct be_dma_mem get_mac_list_cmd; + int i; + + memset(&get_mac_list_cmd, 0, sizeof(struct be_dma_mem)); + get_mac_list_cmd.size = sizeof(struct be_cmd_resp_get_mac_list); + get_mac_list_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, + get_mac_list_cmd.size, + &get_mac_list_cmd.dma, + GFP_ATOMIC); + + if (!get_mac_list_cmd.va) { + dev_err(&adapter->pdev->dev, + "Memory allocation failure during GET_MAC_LIST\n"); + return -ENOMEM; + } + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto out; + } + + req = get_mac_list_cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_MAC_LIST, + get_mac_list_cmd.size, wrb, &get_mac_list_cmd); + req->hdr.domain = domain; + req->mac_type = MAC_ADDRESS_TYPE_NETWORK; + if (*pmac_id_valid) { + req->mac_id = cpu_to_le32(*pmac_id); + req->iface_id = cpu_to_le16(if_handle); + req->perm_override = 0; + } else { + req->perm_override = 1; + } + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_mac_list *resp = + get_mac_list_cmd.va; + + if (*pmac_id_valid) { + memcpy(mac, resp->macid_macaddr.mac_addr_id.macaddr, + ETH_ALEN); + goto out; + } + + mac_count = resp->true_mac_count + resp->pseudo_mac_count; + /* Mac list returned could contain one or more active mac_ids + * or one or more true or pseudo permanent mac addresses. + * If an active mac_id is present, return first active mac_id + * found. + */ + for (i = 0; i < mac_count; i++) { + struct get_list_macaddr *mac_entry; + u16 mac_addr_size; + u32 mac_id; + + mac_entry = &resp->macaddr_list[i]; + mac_addr_size = le16_to_cpu(mac_entry->mac_addr_size); + /* mac_id is a 32 bit value and mac_addr size + * is 6 bytes + */ + if (mac_addr_size == sizeof(u32)) { + *pmac_id_valid = true; + mac_id = mac_entry->mac_addr_id.s_mac_id.mac_id; + *pmac_id = le32_to_cpu(mac_id); + goto out; + } + } + /* If no active mac_id found, return first mac addr */ + *pmac_id_valid = false; + memcpy(mac, resp->macaddr_list[0].mac_addr_id.macaddr, + ETH_ALEN); + } + +out: + mutex_unlock(&adapter->mcc_lock); + dma_free_coherent(&adapter->pdev->dev, get_mac_list_cmd.size, + get_mac_list_cmd.va, get_mac_list_cmd.dma); + return status; +} + +int be_cmd_get_active_mac(struct be_adapter *adapter, u32 curr_pmac_id, + u8 *mac, u32 if_handle, bool active, u32 domain) +{ + if (!active) + be_cmd_get_mac_from_list(adapter, mac, &active, &curr_pmac_id, + if_handle, domain); + if (BEx_chip(adapter)) + return be_cmd_mac_addr_query(adapter, mac, false, + if_handle, curr_pmac_id); + else + /* Fetch the MAC address using pmac_id */ + return be_cmd_get_mac_from_list(adapter, mac, &active, + &curr_pmac_id, + if_handle, domain); +} + +int be_cmd_get_perm_mac(struct be_adapter *adapter, u8 *mac) +{ + int status; + bool pmac_valid = false; + + eth_zero_addr(mac); + + if (BEx_chip(adapter)) { + if (be_physfn(adapter)) + status = be_cmd_mac_addr_query(adapter, mac, true, 0, + 0); + else + status = be_cmd_mac_addr_query(adapter, mac, false, + adapter->if_handle, 0); + } else { + status = be_cmd_get_mac_from_list(adapter, mac, &pmac_valid, + NULL, adapter->if_handle, 0); + } + + return status; +} + +/* Uses synchronous MCCQ */ +int 
be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array, + u8 mac_count, u32 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_mac_list *req; + int status; + struct be_dma_mem cmd; + + memset(&cmd, 0, sizeof(struct be_dma_mem)); + cmd.size = sizeof(struct be_cmd_req_set_mac_list); + cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma, + GFP_KERNEL); + if (!cmd.va) + return -ENOMEM; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = cmd.va; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_MAC_LIST, sizeof(*req), + wrb, &cmd); + + req->hdr.domain = domain; + req->mac_count = mac_count; + if (mac_count) + memcpy(req->mac, mac_array, ETH_ALEN*mac_count); + + status = be_mcc_notify_wait(adapter); + +err: + dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma); + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Wrapper to delete any active MACs and provision the new mac. + * Changes to MAC_LIST are allowed iff none of the MAC addresses in the + * current list are active. + */ +int be_cmd_set_mac(struct be_adapter *adapter, u8 *mac, int if_id, u32 dom) +{ + bool active_mac = false; + u8 old_mac[ETH_ALEN]; + u32 pmac_id; + int status; + + status = be_cmd_get_mac_from_list(adapter, old_mac, &active_mac, + &pmac_id, if_id, dom); + + if (!status && active_mac) + be_cmd_pmac_del(adapter, if_id, pmac_id, dom); + + return be_cmd_set_mac_list(adapter, mac, mac ? 1 : 0, dom); +} + +int be_cmd_set_hsw_config(struct be_adapter *adapter, u16 pvid, + u32 domain, u16 intf_id, u16 hsw_mode, u8 spoofchk) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_hsw_config *req; + void *ctxt; + int status; + + if (!be_cmd_allowed(adapter, OPCODE_COMMON_SET_HSW_CONFIG, + CMD_SUBSYSTEM_COMMON)) + return -EPERM; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + ctxt = &req->context; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_HSW_CONFIG, sizeof(*req), wrb, + NULL); + + req->hdr.domain = domain; + AMAP_SET_BITS(struct amap_set_hsw_context, interface_id, ctxt, intf_id); + if (pvid) { + AMAP_SET_BITS(struct amap_set_hsw_context, pvid_valid, ctxt, 1); + AMAP_SET_BITS(struct amap_set_hsw_context, pvid, ctxt, pvid); + } + if (hsw_mode) { + AMAP_SET_BITS(struct amap_set_hsw_context, interface_id, + ctxt, adapter->hba_port_num); + AMAP_SET_BITS(struct amap_set_hsw_context, pport, ctxt, 1); + AMAP_SET_BITS(struct amap_set_hsw_context, port_fwd_type, + ctxt, hsw_mode); + } + + /* Enable/disable both mac and vlan spoof checking */ + if (!BEx_chip(adapter) && spoofchk) { + AMAP_SET_BITS(struct amap_set_hsw_context, mac_spoofchk, + ctxt, spoofchk); + AMAP_SET_BITS(struct amap_set_hsw_context, vlan_spoofchk, + ctxt, spoofchk); + } + + be_dws_cpu_to_le(req->context, sizeof(req->context)); + status = be_mcc_notify_wait(adapter); + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +/* Get Hyper switch config */ +int be_cmd_get_hsw_config(struct be_adapter *adapter, u16 *pvid, + u32 domain, u16 intf_id, u8 *mode, bool *spoofchk) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_hsw_config *req; + void *ctxt; + int status; + u16 vid; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + ctxt = &req->context; + + 
be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_HSW_CONFIG, sizeof(*req), wrb, + NULL); + + req->hdr.domain = domain; + AMAP_SET_BITS(struct amap_get_hsw_req_context, interface_id, + ctxt, intf_id); + AMAP_SET_BITS(struct amap_get_hsw_req_context, pvid_valid, ctxt, 1); + + if (!BEx_chip(adapter) && mode) { + AMAP_SET_BITS(struct amap_get_hsw_req_context, interface_id, + ctxt, adapter->hba_port_num); + AMAP_SET_BITS(struct amap_get_hsw_req_context, pport, ctxt, 1); + } + be_dws_cpu_to_le(req->context, sizeof(req->context)); + + status = be_mcc_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_hsw_config *resp = + embedded_payload(wrb); + + be_dws_le_to_cpu(&resp->context, sizeof(resp->context)); + vid = AMAP_GET_BITS(struct amap_get_hsw_resp_context, + pvid, &resp->context); + if (pvid) + *pvid = le16_to_cpu(vid); + if (mode) + *mode = AMAP_GET_BITS(struct amap_get_hsw_resp_context, + port_fwd_type, &resp->context); + if (spoofchk) + *spoofchk = + AMAP_GET_BITS(struct amap_get_hsw_resp_context, + spoofchk, &resp->context); + } + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +static bool be_is_wol_excluded(struct be_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + + if (be_virtfn(adapter)) + return true; + + switch (pdev->subsystem_device) { + case OC_SUBSYS_DEVICE_ID1: + case OC_SUBSYS_DEVICE_ID2: + case OC_SUBSYS_DEVICE_ID3: + case OC_SUBSYS_DEVICE_ID4: + return true; + default: + return false; + } +} + +int be_cmd_get_acpi_wol_cap(struct be_adapter *adapter) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_acpi_wol_magic_config_v1 *req; + int status = 0; + struct be_dma_mem cmd; + + if (!be_cmd_allowed(adapter, OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG, + CMD_SUBSYSTEM_ETH)) + return -EPERM; + + if (be_is_wol_excluded(adapter)) + return status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + memset(&cmd, 0, sizeof(struct be_dma_mem)); + cmd.size = sizeof(struct be_cmd_resp_acpi_wol_magic_config_v1); + cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma, + GFP_ATOMIC); + if (!cmd.va) { + dev_err(&adapter->pdev->dev, "Memory allocation failure\n"); + status = -ENOMEM; + goto err; + } + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH, + OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG, + sizeof(*req), wrb, &cmd); + + req->hdr.version = 1; + req->query_options = BE_GET_WOL_CAP; + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_acpi_wol_magic_config_v1 *resp; + + resp = (struct be_cmd_resp_acpi_wol_magic_config_v1 *)cmd.va; + + adapter->wol_cap = resp->wol_settings; + + /* Non-zero macaddr indicates WOL is enabled */ + if (adapter->wol_cap & BE_WOL_CAP && + !is_zero_ether_addr(resp->magic_mac)) + adapter->wol_en = true; + } +err: + mutex_unlock(&adapter->mbox_lock); + if (cmd.va) + dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, + cmd.dma); + return status; + +} + +int be_cmd_set_fw_log_level(struct be_adapter *adapter, u32 level) +{ + struct be_dma_mem extfat_cmd; + struct be_fat_conf_params *cfgs; + int status; + int i, j; + + memset(&extfat_cmd, 0, sizeof(struct be_dma_mem)); + extfat_cmd.size = sizeof(struct be_cmd_resp_get_ext_fat_caps); + extfat_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, + extfat_cmd.size, &extfat_cmd.dma, + GFP_ATOMIC); + if (!extfat_cmd.va) + return -ENOMEM; + + status = be_cmd_get_ext_fat_capabilites(adapter, &extfat_cmd); + if 
(status) + goto err; + + cfgs = (struct be_fat_conf_params *) + (extfat_cmd.va + sizeof(struct be_cmd_resp_hdr)); + for (i = 0; i < le32_to_cpu(cfgs->num_modules); i++) { + u32 num_modes = le32_to_cpu(cfgs->module[i].num_modes); + + for (j = 0; j < num_modes; j++) { + if (cfgs->module[i].trace_lvl[j].mode == MODE_UART) + cfgs->module[i].trace_lvl[j].dbg_lvl = + cpu_to_le32(level); + } + } + + status = be_cmd_set_ext_fat_capabilites(adapter, &extfat_cmd, cfgs); +err: + dma_free_coherent(&adapter->pdev->dev, extfat_cmd.size, extfat_cmd.va, + extfat_cmd.dma); + return status; +} + +int be_cmd_get_fw_log_level(struct be_adapter *adapter) +{ + struct be_dma_mem extfat_cmd; + struct be_fat_conf_params *cfgs; + int status, j; + int level = 0; + + memset(&extfat_cmd, 0, sizeof(struct be_dma_mem)); + extfat_cmd.size = sizeof(struct be_cmd_resp_get_ext_fat_caps); + extfat_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, + extfat_cmd.size, &extfat_cmd.dma, + GFP_ATOMIC); + + if (!extfat_cmd.va) { + dev_err(&adapter->pdev->dev, "%s: Memory allocation failure\n", + __func__); + goto err; + } + + status = be_cmd_get_ext_fat_capabilites(adapter, &extfat_cmd); + if (!status) { + cfgs = (struct be_fat_conf_params *)(extfat_cmd.va + + sizeof(struct be_cmd_resp_hdr)); + + for (j = 0; j < le32_to_cpu(cfgs->module[0].num_modes); j++) { + if (cfgs->module[0].trace_lvl[j].mode == MODE_UART) + level = cfgs->module[0].trace_lvl[j].dbg_lvl; + } + } + dma_free_coherent(&adapter->pdev->dev, extfat_cmd.size, extfat_cmd.va, + extfat_cmd.dma); +err: + return level; +} + +int be_cmd_get_ext_fat_capabilites(struct be_adapter *adapter, + struct be_dma_mem *cmd) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_ext_fat_caps *req; + int status; + + if (!be_cmd_allowed(adapter, OPCODE_COMMON_GET_EXT_FAT_CAPABILITIES, + CMD_SUBSYSTEM_COMMON)) + return -EPERM; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = cmd->va; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_EXT_FAT_CAPABILITIES, + cmd->size, wrb, cmd); + req->parameter_type = cpu_to_le32(1); + + status = be_mbox_notify_wait(adapter); +err: + mutex_unlock(&adapter->mbox_lock); + return status; +} + +int be_cmd_set_ext_fat_capabilites(struct be_adapter *adapter, + struct be_dma_mem *cmd, + struct be_fat_conf_params *configs) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_set_ext_fat_caps *req; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = cmd->va; + memcpy(&req->set_params, configs, sizeof(struct be_fat_conf_params)); + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_EXT_FAT_CAPABILITIES, + cmd->size, wrb, cmd); + + status = be_mcc_notify_wait(adapter); +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_cmd_query_port_name(struct be_adapter *adapter) +{ + struct be_cmd_req_get_port_name *req; + struct be_mcc_wrb *wrb; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_PORT_NAME, sizeof(*req), wrb, + NULL); + if (!BEx_chip(adapter)) + req->hdr.version = 1; + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_port_name *resp = embedded_payload(wrb); + + adapter->port_name = 
resp->port_name[adapter->hba_port_num]; + } else { + adapter->port_name = adapter->hba_port_num + '0'; + } + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* When more than 1 NIC descriptor is present in the descriptor list, + * the caller must specify the pf_num to obtain the NIC descriptor + * corresponding to its pci function. + * get_vft must be true when the caller wants the VF-template desc of the + * PF-pool. + * The pf_num should be set to PF_NUM_IGNORE when the caller knows + * that only it's NIC descriptor is present in the descriptor list. + */ +static struct be_nic_res_desc *be_get_nic_desc(u8 *buf, u32 desc_count, + bool get_vft, u8 pf_num) +{ + struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf; + struct be_nic_res_desc *nic; + int i; + + for (i = 0; i < desc_count; i++) { + if (hdr->desc_type == NIC_RESOURCE_DESC_TYPE_V0 || + hdr->desc_type == NIC_RESOURCE_DESC_TYPE_V1) { + nic = (struct be_nic_res_desc *)hdr; + + if ((pf_num == PF_NUM_IGNORE || + nic->pf_num == pf_num) && + (!get_vft || nic->flags & BIT(VFT_SHIFT))) + return nic; + } + hdr->desc_len = hdr->desc_len ? : RESOURCE_DESC_SIZE_V0; + hdr = (void *)hdr + hdr->desc_len; + } + return NULL; +} + +static struct be_nic_res_desc *be_get_vft_desc(u8 *buf, u32 desc_count, + u8 pf_num) +{ + return be_get_nic_desc(buf, desc_count, true, pf_num); +} + +static struct be_nic_res_desc *be_get_func_nic_desc(u8 *buf, u32 desc_count, + u8 pf_num) +{ + return be_get_nic_desc(buf, desc_count, false, pf_num); +} + +static struct be_pcie_res_desc *be_get_pcie_desc(u8 *buf, u32 desc_count, + u8 pf_num) +{ + struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf; + struct be_pcie_res_desc *pcie; + int i; + + for (i = 0; i < desc_count; i++) { + if (hdr->desc_type == PCIE_RESOURCE_DESC_TYPE_V0 || + hdr->desc_type == PCIE_RESOURCE_DESC_TYPE_V1) { + pcie = (struct be_pcie_res_desc *)hdr; + if (pcie->pf_num == pf_num) + return pcie; + } + + hdr->desc_len = hdr->desc_len ? : RESOURCE_DESC_SIZE_V0; + hdr = (void *)hdr + hdr->desc_len; + } + return NULL; +} + +static struct be_port_res_desc *be_get_port_desc(u8 *buf, u32 desc_count) +{ + struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf; + int i; + + for (i = 0; i < desc_count; i++) { + if (hdr->desc_type == PORT_RESOURCE_DESC_TYPE_V1) + return (struct be_port_res_desc *)hdr; + + hdr->desc_len = hdr->desc_len ? 
: RESOURCE_DESC_SIZE_V0; + hdr = (void *)hdr + hdr->desc_len; + } + return NULL; +} + +static void be_copy_nic_desc(struct be_resources *res, + struct be_nic_res_desc *desc) +{ + res->max_uc_mac = le16_to_cpu(desc->unicast_mac_count); + res->max_vlans = le16_to_cpu(desc->vlan_count); + res->max_mcast_mac = le16_to_cpu(desc->mcast_mac_count); + res->max_tx_qs = le16_to_cpu(desc->txq_count); + res->max_rss_qs = le16_to_cpu(desc->rssq_count); + res->max_rx_qs = le16_to_cpu(desc->rq_count); + res->max_evt_qs = le16_to_cpu(desc->eq_count); + res->max_cq_count = le16_to_cpu(desc->cq_count); + res->max_iface_count = le16_to_cpu(desc->iface_count); + res->max_mcc_count = le16_to_cpu(desc->mcc_count); + /* Clear flags that driver is not interested in */ + res->if_cap_flags = le32_to_cpu(desc->cap_flags) & + BE_IF_CAP_FLAGS_WANT; +} + +/* Uses Mbox */ +int be_cmd_get_func_config(struct be_adapter *adapter, struct be_resources *res) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_func_config *req; + int status; + struct be_dma_mem cmd; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + memset(&cmd, 0, sizeof(struct be_dma_mem)); + cmd.size = sizeof(struct be_cmd_resp_get_func_config); + cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma, + GFP_ATOMIC); + if (!cmd.va) { + dev_err(&adapter->pdev->dev, "Memory alloc failure\n"); + status = -ENOMEM; + goto err; + } + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = cmd.va; + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_FUNC_CONFIG, + cmd.size, wrb, &cmd); + + if (skyhawk_chip(adapter)) + req->hdr.version = 1; + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_func_config *resp = cmd.va; + u32 desc_count = le32_to_cpu(resp->desc_count); + struct be_nic_res_desc *desc; + + /* GET_FUNC_CONFIG returns resource descriptors of the + * current function only. So, pf_num should be set to + * PF_NUM_IGNORE. + */ + desc = be_get_func_nic_desc(resp->func_param, desc_count, + PF_NUM_IGNORE); + if (!desc) { + status = -EINVAL; + goto err; + } + + /* Store pf_num & vf_num for later use in GET_PROFILE_CONFIG */ + adapter->pf_num = desc->pf_num; + adapter->vf_num = desc->vf_num; + + if (res) + be_copy_nic_desc(res, desc); + } +err: + mutex_unlock(&adapter->mbox_lock); + if (cmd.va) + dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, + cmd.dma); + return status; +} + +/* This routine returns a list of all the NIC PF_nums in the adapter */ +static u16 be_get_nic_pf_num_list(u8 *buf, u32 desc_count, u16 *nic_pf_nums) +{ + struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf; + struct be_pcie_res_desc *pcie = NULL; + int i; + u16 nic_pf_count = 0; + + for (i = 0; i < desc_count; i++) { + if (hdr->desc_type == PCIE_RESOURCE_DESC_TYPE_V0 || + hdr->desc_type == PCIE_RESOURCE_DESC_TYPE_V1) { + pcie = (struct be_pcie_res_desc *)hdr; + if (pcie->pf_state && (pcie->pf_type == MISSION_NIC || + pcie->pf_type == MISSION_RDMA)) { + nic_pf_nums[nic_pf_count++] = pcie->pf_num; + } + } + + hdr->desc_len = hdr->desc_len ? 
: RESOURCE_DESC_SIZE_V0; + hdr = (void *)hdr + hdr->desc_len; + } + return nic_pf_count; +} + +/* Will use MBOX only if MCCQ has not been created */ +int be_cmd_get_profile_config(struct be_adapter *adapter, + struct be_resources *res, + struct be_port_resources *port_res, + u8 profile_type, u8 query, u8 domain) +{ + struct be_cmd_resp_get_profile_config *resp; + struct be_cmd_req_get_profile_config *req; + struct be_nic_res_desc *vf_res; + struct be_pcie_res_desc *pcie; + struct be_port_res_desc *port; + struct be_nic_res_desc *nic; + struct be_mcc_wrb wrb = {0}; + struct be_dma_mem cmd; + u16 desc_count; + int status; + + memset(&cmd, 0, sizeof(struct be_dma_mem)); + cmd.size = sizeof(struct be_cmd_resp_get_profile_config); + cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma, + GFP_ATOMIC); + if (!cmd.va) + return -ENOMEM; + + req = cmd.va; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_PROFILE_CONFIG, + cmd.size, &wrb, &cmd); + + if (!lancer_chip(adapter)) + req->hdr.version = 1; + req->type = profile_type; + req->hdr.domain = domain; + + /* When QUERY_MODIFIABLE_FIELDS_TYPE bit is set, cmd returns the + * descriptors with all bits set to "1" for the fields which can be + * modified using SET_PROFILE_CONFIG cmd. + */ + if (query == RESOURCE_MODIFIABLE) + req->type |= QUERY_MODIFIABLE_FIELDS_TYPE; + + status = be_cmd_notify_wait(adapter, &wrb); + if (status) + goto err; + + resp = cmd.va; + desc_count = le16_to_cpu(resp->desc_count); + + if (port_res) { + u16 nic_pf_cnt = 0, i; + u16 nic_pf_num_list[MAX_NIC_FUNCS]; + + nic_pf_cnt = be_get_nic_pf_num_list(resp->func_param, + desc_count, + nic_pf_num_list); + + for (i = 0; i < nic_pf_cnt; i++) { + nic = be_get_func_nic_desc(resp->func_param, desc_count, + nic_pf_num_list[i]); + if (nic->link_param == adapter->port_num) { + port_res->nic_pfs++; + pcie = be_get_pcie_desc(resp->func_param, + desc_count, + nic_pf_num_list[i]); + port_res->max_vfs += le16_to_cpu(pcie->num_vfs); + } + } + return status; + } + + pcie = be_get_pcie_desc(resp->func_param, desc_count, + adapter->pf_num); + if (pcie) + res->max_vfs = le16_to_cpu(pcie->num_vfs); + + port = be_get_port_desc(resp->func_param, desc_count); + if (port) + adapter->mc_type = port->mc_type; + + nic = be_get_func_nic_desc(resp->func_param, desc_count, + adapter->pf_num); + if (nic) + be_copy_nic_desc(res, nic); + + vf_res = be_get_vft_desc(resp->func_param, desc_count, + adapter->pf_num); + if (vf_res) + res->vf_if_cap_flags = vf_res->cap_flags; +err: + if (cmd.va) + dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, + cmd.dma); + return status; +} + +/* Will use MBOX only if MCCQ has not been created */ +static int be_cmd_set_profile_config(struct be_adapter *adapter, void *desc, + int size, int count, u8 version, u8 domain) +{ + struct be_cmd_req_set_profile_config *req; + struct be_mcc_wrb wrb = {0}; + struct be_dma_mem cmd; + int status; + + memset(&cmd, 0, sizeof(struct be_dma_mem)); + cmd.size = sizeof(struct be_cmd_req_set_profile_config); + cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma, + GFP_ATOMIC); + if (!cmd.va) + return -ENOMEM; + + req = cmd.va; + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_PROFILE_CONFIG, cmd.size, + &wrb, &cmd); + req->hdr.version = version; + req->hdr.domain = domain; + req->desc_count = cpu_to_le32(count); + memcpy(req->desc, desc, size); + + status = be_cmd_notify_wait(adapter, &wrb); + + if (cmd.va) + dma_free_coherent(&adapter->pdev->dev, 
cmd.size, cmd.va, + cmd.dma); + return status; +} + +/* Mark all fields invalid */ +static void be_reset_nic_desc(struct be_nic_res_desc *nic) +{ + memset(nic, 0, sizeof(*nic)); + nic->unicast_mac_count = 0xFFFF; + nic->mcc_count = 0xFFFF; + nic->vlan_count = 0xFFFF; + nic->mcast_mac_count = 0xFFFF; + nic->txq_count = 0xFFFF; + nic->rq_count = 0xFFFF; + nic->rssq_count = 0xFFFF; + nic->lro_count = 0xFFFF; + nic->cq_count = 0xFFFF; + nic->toe_conn_count = 0xFFFF; + nic->eq_count = 0xFFFF; + nic->iface_count = 0xFFFF; + nic->link_param = 0xFF; + nic->channel_id_param = cpu_to_le16(0xF000); + nic->acpi_params = 0xFF; + nic->wol_param = 0x0F; + nic->tunnel_iface_count = 0xFFFF; + nic->direct_tenant_iface_count = 0xFFFF; + nic->bw_min = 0xFFFFFFFF; + nic->bw_max = 0xFFFFFFFF; +} + +/* Mark all fields invalid */ +static void be_reset_pcie_desc(struct be_pcie_res_desc *pcie) +{ + memset(pcie, 0, sizeof(*pcie)); + pcie->sriov_state = 0xFF; + pcie->pf_state = 0xFF; + pcie->pf_type = 0xFF; + pcie->num_vfs = 0xFFFF; +} + +int be_cmd_config_qos(struct be_adapter *adapter, u32 max_rate, u16 link_speed, + u8 domain) +{ + struct be_nic_res_desc nic_desc; + u32 bw_percent; + u16 version = 0; + + if (BE3_chip(adapter)) + return be_cmd_set_qos(adapter, max_rate / 10, domain); + + be_reset_nic_desc(&nic_desc); + nic_desc.pf_num = adapter->pf_num; + nic_desc.vf_num = domain; + nic_desc.bw_min = 0; + if (lancer_chip(adapter)) { + nic_desc.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V0; + nic_desc.hdr.desc_len = RESOURCE_DESC_SIZE_V0; + nic_desc.flags = (1 << QUN_SHIFT) | (1 << IMM_SHIFT) | + (1 << NOSV_SHIFT); + nic_desc.bw_max = cpu_to_le32(max_rate / 10); + } else { + version = 1; + nic_desc.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V1; + nic_desc.hdr.desc_len = RESOURCE_DESC_SIZE_V1; + nic_desc.flags = (1 << IMM_SHIFT) | (1 << NOSV_SHIFT); + bw_percent = max_rate ? (max_rate * 100) / link_speed : 100; + nic_desc.bw_max = cpu_to_le32(bw_percent); + } + + return be_cmd_set_profile_config(adapter, &nic_desc, + nic_desc.hdr.desc_len, + 1, version, domain); +} + +int be_cmd_set_sriov_config(struct be_adapter *adapter, + struct be_resources pool_res, u16 num_vfs, + struct be_resources *vft_res) +{ + struct { + struct be_pcie_res_desc pcie; + struct be_nic_res_desc nic_vft; + } __packed desc; + + /* PF PCIE descriptor */ + be_reset_pcie_desc(&desc.pcie); + desc.pcie.hdr.desc_type = PCIE_RESOURCE_DESC_TYPE_V1; + desc.pcie.hdr.desc_len = RESOURCE_DESC_SIZE_V1; + desc.pcie.flags = BIT(IMM_SHIFT) | BIT(NOSV_SHIFT); + desc.pcie.pf_num = adapter->pdev->devfn; + desc.pcie.sriov_state = num_vfs ? 
1 : 0; + desc.pcie.num_vfs = cpu_to_le16(num_vfs); + + /* VF NIC Template descriptor */ + be_reset_nic_desc(&desc.nic_vft); + desc.nic_vft.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V1; + desc.nic_vft.hdr.desc_len = RESOURCE_DESC_SIZE_V1; + desc.nic_vft.flags = vft_res->flags | BIT(VFT_SHIFT) | + BIT(IMM_SHIFT) | BIT(NOSV_SHIFT); + desc.nic_vft.pf_num = adapter->pdev->devfn; + desc.nic_vft.vf_num = 0; + desc.nic_vft.cap_flags = cpu_to_le32(vft_res->vf_if_cap_flags); + desc.nic_vft.rq_count = cpu_to_le16(vft_res->max_rx_qs); + desc.nic_vft.txq_count = cpu_to_le16(vft_res->max_tx_qs); + desc.nic_vft.rssq_count = cpu_to_le16(vft_res->max_rss_qs); + desc.nic_vft.cq_count = cpu_to_le16(vft_res->max_cq_count); + + if (vft_res->max_uc_mac) + desc.nic_vft.unicast_mac_count = + cpu_to_le16(vft_res->max_uc_mac); + if (vft_res->max_vlans) + desc.nic_vft.vlan_count = cpu_to_le16(vft_res->max_vlans); + if (vft_res->max_iface_count) + desc.nic_vft.iface_count = + cpu_to_le16(vft_res->max_iface_count); + if (vft_res->max_mcc_count) + desc.nic_vft.mcc_count = cpu_to_le16(vft_res->max_mcc_count); + + return be_cmd_set_profile_config(adapter, &desc, + 2 * RESOURCE_DESC_SIZE_V1, 2, 1, 0); +} + +int be_cmd_manage_iface(struct be_adapter *adapter, u32 iface, u8 op) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_manage_iface_filters *req; + int status; + + if (iface == 0xFFFFFFFF) + return -1; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_MANAGE_IFACE_FILTERS, sizeof(*req), + wrb, NULL); + req->op = op; + req->target_iface_id = cpu_to_le32(iface); + + status = be_mcc_notify_wait(adapter); +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_cmd_set_vxlan_port(struct be_adapter *adapter, __be16 port) +{ + struct be_port_res_desc port_desc; + + memset(&port_desc, 0, sizeof(port_desc)); + port_desc.hdr.desc_type = PORT_RESOURCE_DESC_TYPE_V1; + port_desc.hdr.desc_len = RESOURCE_DESC_SIZE_V1; + port_desc.flags = (1 << IMM_SHIFT) | (1 << NOSV_SHIFT); + port_desc.link_num = adapter->hba_port_num; + if (port) { + port_desc.nv_flags = NV_TYPE_VXLAN | (1 << SOCVID_SHIFT) | + (1 << RCVID_SHIFT); + port_desc.nv_port = swab16(port); + } else { + port_desc.nv_flags = NV_TYPE_DISABLED; + port_desc.nv_port = 0; + } + + return be_cmd_set_profile_config(adapter, &port_desc, + RESOURCE_DESC_SIZE_V1, 1, 1, 0); +} + +int be_cmd_get_if_id(struct be_adapter *adapter, struct be_vf_cfg *vf_cfg, + int vf_num) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_get_iface_list *req; + struct be_cmd_resp_get_iface_list *resp; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_IFACE_LIST, sizeof(*resp), + wrb, NULL); + req->hdr.domain = vf_num + 1; + + status = be_mcc_notify_wait(adapter); + if (!status) { + resp = (struct be_cmd_resp_get_iface_list *)req; + vf_cfg->if_handle = le32_to_cpu(resp->if_desc.if_id); + } + +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +static int lancer_wait_idle(struct be_adapter *adapter) +{ +#define SLIPORT_IDLE_TIMEOUT 30 + u32 reg_val; + int status = 0, i; + + for (i = 0; i < SLIPORT_IDLE_TIMEOUT; i++) { + reg_val = ioread32(adapter->db + PHYSDEV_CONTROL_OFFSET); + if ((reg_val & PHYSDEV_CONTROL_INP_MASK) == 0) 
+ break; + + ssleep(1); + } + + if (i == SLIPORT_IDLE_TIMEOUT) + status = -1; + + return status; +} + +int lancer_physdev_ctrl(struct be_adapter *adapter, u32 mask) +{ + int status = 0; + + status = lancer_wait_idle(adapter); + if (status) + return status; + + iowrite32(mask, adapter->db + PHYSDEV_CONTROL_OFFSET); + + return status; +} + +/* Routine to check whether dump image is present or not */ +bool dump_present(struct be_adapter *adapter) +{ + u32 sliport_status = 0; + + sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET); + return !!(sliport_status & SLIPORT_STATUS_DIP_MASK); +} + +int lancer_initiate_dump(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + int status; + + if (dump_present(adapter)) { + dev_info(dev, "Previous dump not cleared, not forcing dump\n"); + return -EEXIST; + } + + /* give firmware reset and diagnostic dump */ + status = lancer_physdev_ctrl(adapter, PHYSDEV_CONTROL_FW_RESET_MASK | + PHYSDEV_CONTROL_DD_MASK); + if (status < 0) { + dev_err(dev, "FW reset failed\n"); + return status; + } + + status = lancer_wait_idle(adapter); + if (status) + return status; + + if (!dump_present(adapter)) { + dev_err(dev, "FW dump not generated\n"); + return -EIO; + } + + return 0; +} + +int lancer_delete_dump(struct be_adapter *adapter) +{ + int status; + + status = lancer_cmd_delete_object(adapter, LANCER_FW_DUMP_FILE); + return be_cmd_status(status); +} + +/* Uses sync mcc */ +int be_cmd_enable_vf(struct be_adapter *adapter, u8 domain) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_enable_disable_vf *req; + int status; + + if (BEx_chip(adapter)) + return 0; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_ENABLE_DISABLE_VF, sizeof(*req), + wrb, NULL); + + req->hdr.domain = domain; + req->enable = 1; + status = be_mcc_notify_wait(adapter); +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_cmd_intr_set(struct be_adapter *adapter, bool intr_enable) +{ + struct be_mcc_wrb *wrb; + struct be_cmd_req_intr_set *req; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_INTERRUPT_ENABLE, sizeof(*req), + wrb, NULL); + + req->intr_enabled = intr_enable; + + status = be_mbox_notify_wait(adapter); + + mutex_unlock(&adapter->mbox_lock); + return status; +} + +/* Uses MBOX */ +int be_cmd_get_active_profile(struct be_adapter *adapter, u16 *profile_id) +{ + struct be_cmd_req_get_active_profile *req; + struct be_mcc_wrb *wrb; + int status; + + if (mutex_lock_interruptible(&adapter->mbox_lock)) + return -1; + + wrb = wrb_from_mbox(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_GET_ACTIVE_PROFILE, sizeof(*req), + wrb, NULL); + + status = be_mbox_notify_wait(adapter); + if (!status) { + struct be_cmd_resp_get_active_profile *resp = + embedded_payload(wrb); + + *profile_id = le16_to_cpu(resp->active_profile_id); + } + +err: + mutex_unlock(&adapter->mbox_lock); + return status; +} + +static int +__be_cmd_set_logical_link_config(struct be_adapter *adapter, + int link_state, int version, u8 domain) +{ + struct be_cmd_req_set_ll_link *req; + struct be_mcc_wrb *wrb; + u32 
link_config = 0; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_LOGICAL_LINK_CONFIG, + sizeof(*req), wrb, NULL); + + req->hdr.version = version; + req->hdr.domain = domain; + + if (link_state == IFLA_VF_LINK_STATE_ENABLE || + link_state == IFLA_VF_LINK_STATE_AUTO) + link_config |= PLINK_ENABLE; + + if (link_state == IFLA_VF_LINK_STATE_AUTO) + link_config |= PLINK_TRACK; + + req->link_config = cpu_to_le32(link_config); + + status = be_mcc_notify_wait(adapter); +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_cmd_set_logical_link_config(struct be_adapter *adapter, + int link_state, u8 domain) +{ + int status; + + if (BE2_chip(adapter)) + return -EOPNOTSUPP; + + status = __be_cmd_set_logical_link_config(adapter, link_state, + 2, domain); + + /* Version 2 of the command will not be recognized by older FW. + * On such a failure issue version 1 of the command. + */ + if (base_status(status) == MCC_STATUS_ILLEGAL_REQUEST) + status = __be_cmd_set_logical_link_config(adapter, link_state, + 1, domain); + return status; +} + +int be_cmd_set_features(struct be_adapter *adapter) +{ + struct be_cmd_resp_set_features *resp; + struct be_cmd_req_set_features *req; + struct be_mcc_wrb *wrb; + int status; + + if (mutex_lock_interruptible(&adapter->mcc_lock)) + return -1; + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + + req = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON, + OPCODE_COMMON_SET_FEATURES, + sizeof(*req), wrb, NULL); + + req->features = cpu_to_le32(BE_FEATURE_UE_RECOVERY); + req->parameter_len = cpu_to_le32(sizeof(struct be_req_ue_recovery)); + req->parameter.req.uer = cpu_to_le32(BE_UE_RECOVERY_UER_MASK); + + status = be_mcc_notify_wait(adapter); + if (status) + goto err; + + resp = embedded_payload(wrb); + + adapter->error_recovery.ue_to_poll_time = + le16_to_cpu(resp->parameter.resp.ue2rp); + adapter->error_recovery.ue_to_reset_time = + le16_to_cpu(resp->parameter.resp.ue2sr); + adapter->error_recovery.recovery_supported = true; +err: + /* Checking "MCC_STATUS_INVALID_LENGTH" for SKH as FW + * returns this error in older firmware versions + */ + if (base_status(status) == MCC_STATUS_ILLEGAL_REQUEST || + base_status(status) == MCC_STATUS_INVALID_LENGTH) + dev_info(&adapter->pdev->dev, + "Adapter does not support HW error recovery\n"); + + mutex_unlock(&adapter->mcc_lock); + return status; +} + +int be_roce_mcc_cmd(void *netdev_handle, void *wrb_payload, + int wrb_payload_size, u16 *cmd_status, u16 *ext_status) +{ + struct be_adapter *adapter = netdev_priv(netdev_handle); + struct be_mcc_wrb *wrb; + struct be_cmd_req_hdr *hdr = (struct be_cmd_req_hdr *)wrb_payload; + struct be_cmd_req_hdr *req; + struct be_cmd_resp_hdr *resp; + int status; + + mutex_lock(&adapter->mcc_lock); + + wrb = wrb_from_mccq(adapter); + if (!wrb) { + status = -EBUSY; + goto err; + } + req = embedded_payload(wrb); + resp = embedded_payload(wrb); + + be_wrb_cmd_hdr_prepare(req, hdr->subsystem, + hdr->opcode, wrb_payload_size, wrb, NULL); + memcpy(req, wrb_payload, wrb_payload_size); + be_dws_cpu_to_le(req, wrb_payload_size); + + status = be_mcc_notify_wait(adapter); + if (cmd_status) + *cmd_status = (status & 0xffff); + if (ext_status) + *ext_status = 0; + memcpy(wrb_payload, resp, sizeof(*resp) + resp->response_length); + 
be_dws_le_to_cpu(wrb_payload, sizeof(*resp) + resp->response_length); +err: + mutex_unlock(&adapter->mcc_lock); + return status; +} +EXPORT_SYMBOL(be_roce_mcc_cmd); diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h new file mode 100644 index 0000000..e8b43cf --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -0,0 +1,2515 @@ +/* + * Copyright (C) 2005 - 2016 Broadcom + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +/* + * The driver sends configuration and managements command requests to the + * firmware in the BE. These requests are communicated to the processor + * using Work Request Blocks (WRBs) submitted to the MCC-WRB ring or via one + * WRB inside a MAILBOX. + * The commands are serviced by the ARM processor in the BladeEngine's MPU. + */ + +struct be_sge { + u32 pa_lo; + u32 pa_hi; + u32 len; +}; + +#define MCC_WRB_EMBEDDED_MASK 1 /* bit 0 of dword 0*/ +#define MCC_WRB_SGE_CNT_SHIFT 3 /* bits 3 - 7 of dword 0 */ +#define MCC_WRB_SGE_CNT_MASK 0x1F /* bits 3 - 7 of dword 0 */ +struct be_mcc_wrb { + u32 embedded; /* dword 0 */ + u32 payload_length; /* dword 1 */ + u32 tag0; /* dword 2 */ + u32 tag1; /* dword 3 */ + u32 rsvd; /* dword 4 */ + union { + u8 embedded_payload[236]; /* used by embedded cmds */ + struct be_sge sgl[19]; /* used by non-embedded cmds */ + } payload; +}; + +#define CQE_FLAGS_VALID_MASK BIT(31) +#define CQE_FLAGS_ASYNC_MASK BIT(30) +#define CQE_FLAGS_COMPLETED_MASK BIT(28) +#define CQE_FLAGS_CONSUMED_MASK BIT(27) + +/* Completion Status */ +enum mcc_base_status { + MCC_STATUS_SUCCESS = 0, + MCC_STATUS_FAILED = 1, + MCC_STATUS_ILLEGAL_REQUEST = 2, + MCC_STATUS_ILLEGAL_FIELD = 3, + MCC_STATUS_INSUFFICIENT_BUFFER = 4, + MCC_STATUS_UNAUTHORIZED_REQUEST = 5, + MCC_STATUS_NOT_SUPPORTED = 66, + MCC_STATUS_FEATURE_NOT_SUPPORTED = 68, + MCC_STATUS_INVALID_LENGTH = 116 +}; + +/* Additional status */ +enum mcc_addl_status { + MCC_ADDL_STATUS_INSUFFICIENT_RESOURCES = 0x16, + MCC_ADDL_STATUS_FLASH_IMAGE_CRC_MISMATCH = 0x4d, + MCC_ADDL_STATUS_TOO_MANY_INTERFACES = 0x4a, + MCC_ADDL_STATUS_INSUFFICIENT_VLANS = 0xab, + MCC_ADDL_STATUS_INVALID_SIGNATURE = 0x56, + MCC_ADDL_STATUS_MISSING_SIGNATURE = 0x57, + MCC_ADDL_STATUS_INSUFFICIENT_PRIVILEGES = 0x60 +}; + +#define CQE_BASE_STATUS_MASK 0xFFFF +#define CQE_BASE_STATUS_SHIFT 0 /* bits 0 - 15 */ +#define CQE_ADDL_STATUS_MASK 0xFF +#define CQE_ADDL_STATUS_SHIFT 16 /* bits 16 - 31 */ + +#define base_status(status) \ + ((enum mcc_base_status) \ + (status > 0 ? (status & CQE_BASE_STATUS_MASK) : 0)) +#define addl_status(status) \ + ((enum mcc_addl_status) \ + (status > 0 ? 
(status >> CQE_ADDL_STATUS_SHIFT) & \ + CQE_ADDL_STATUS_MASK : 0)) + +struct be_mcc_compl { + u32 status; /* dword 0 */ + u32 tag0; /* dword 1 */ + u32 tag1; /* dword 2 */ + u32 flags; /* dword 3 */ +}; + +/* When the async bit of mcc_compl flags is set, flags + * is interpreted as follows: + */ +#define ASYNC_EVENT_CODE_SHIFT 8 /* bits 8 - 15 */ +#define ASYNC_EVENT_CODE_MASK 0xFF +#define ASYNC_EVENT_TYPE_SHIFT 16 +#define ASYNC_EVENT_TYPE_MASK 0xFF +#define ASYNC_EVENT_CODE_LINK_STATE 0x1 +#define ASYNC_EVENT_CODE_GRP_5 0x5 +#define ASYNC_EVENT_QOS_SPEED 0x1 +#define ASYNC_EVENT_COS_PRIORITY 0x2 +#define ASYNC_EVENT_PVID_STATE 0x3 +#define ASYNC_EVENT_CODE_QNQ 0x6 +#define ASYNC_DEBUG_EVENT_TYPE_QNQ 1 +#define ASYNC_EVENT_CODE_SLIPORT 0x11 +#define ASYNC_EVENT_PORT_MISCONFIG 0x9 +#define ASYNC_EVENT_FW_CONTROL 0x5 + +enum { + LINK_DOWN = 0x0, + LINK_UP = 0x1 +}; +#define LINK_STATUS_MASK 0x1 +#define LOGICAL_LINK_STATUS_MASK 0x2 + +/* When the event code of compl->flags is link-state, the mcc_compl + * must be interpreted as follows + */ +struct be_async_event_link_state { + u8 physical_port; + u8 port_link_status; + u8 port_duplex; + u8 port_speed; + u8 port_fault; + u8 rsvd0[7]; + u32 flags; +} __packed; + +/* When the event code of compl->flags is GRP-5 and event_type is QOS_SPEED + * the mcc_compl must be interpreted as follows + */ +struct be_async_event_grp5_qos_link_speed { + u8 physical_port; + u8 rsvd[5]; + u16 qos_link_speed; + u32 event_tag; + u32 flags; +} __packed; + +/* When the event code of compl->flags is GRP5 and event type is + * CoS-Priority, the mcc_compl must be interpreted as follows + */ +struct be_async_event_grp5_cos_priority { + u8 physical_port; + u8 available_priority_bmap; + u8 reco_default_priority; + u8 valid; + u8 rsvd0; + u8 event_tag; + u32 flags; +} __packed; + +/* When the event code of compl->flags is GRP5 and event type is + * PVID state, the mcc_compl must be interpreted as follows + */ +struct be_async_event_grp5_pvid_state { + u8 enabled; + u8 rsvd0; + u16 tag; + u32 event_tag; + u32 rsvd1; + u32 flags; +} __packed; + +/* async event indicating outer VLAN tag in QnQ */ +struct be_async_event_qnq { + u8 valid; /* Indicates if outer VLAN is valid */ + u8 rsvd0; + u16 vlan_tag; + u32 event_tag; + u8 rsvd1[4]; + u32 flags; +} __packed; + +enum { + BE_PHY_FUNCTIONAL = 0, + BE_PHY_NOT_PRESENT = 1, + BE_PHY_DIFF_MEDIA = 2, + BE_PHY_INCOMPATIBLE = 3, + BE_PHY_UNQUALIFIED = 4, + BE_PHY_UNCERTIFIED = 5 +}; + +#define PHY_STATE_MSG_SEVERITY 0x6 +#define PHY_STATE_OPER 0x1 +#define PHY_STATE_INFO_VALID 0x80 +#define PHY_STATE_OPER_MSG_NONE 0x2 +#define DEFAULT_MSG_SEVERITY 0x1 + +#define be_phy_state_unknown(phy_state) (phy_state > BE_PHY_UNCERTIFIED) +#define be_phy_unqualified(phy_state) \ + (phy_state == BE_PHY_UNQUALIFIED || \ + phy_state == BE_PHY_UNCERTIFIED) +#define be_phy_misconfigured(phy_state) \ + (phy_state == BE_PHY_INCOMPATIBLE || \ + phy_state == BE_PHY_UNQUALIFIED || \ + phy_state == BE_PHY_UNCERTIFIED) + +extern const char * const be_misconfig_evt_port_state[]; + +/* async event indicating misconfigured port */ +struct be_async_event_misconfig_port { + /* DATA_WORD1: + * phy state of port 0: bits 7 - 0 + * phy state of port 1: bits 15 - 8 + * phy state of port 2: bits 23 - 16 + * phy state of port 3: bits 31 - 24 + */ + u32 event_data_word1; + /* DATA_WORD2: + * phy state info of port 0: bits 7 - 0 + * phy state info of port 1: bits 15 - 8 + * phy state info of port 2: bits 23 - 16 + * phy state info of port 3: bits 31 - 24 + * + * PHY 
STATE INFO: + * Link operability :bit 0 + * Message severity :bit 2 - 1 + * Rsvd :bits 6 - 3 + * phy state info valid :bit 7 + */ + u32 event_data_word2; + u32 rsvd0; + u32 flags; +} __packed; + +#define BMC_FILT_BROADCAST_ARP BIT(0) +#define BMC_FILT_BROADCAST_DHCP_CLIENT BIT(1) +#define BMC_FILT_BROADCAST_DHCP_SERVER BIT(2) +#define BMC_FILT_BROADCAST_NET_BIOS BIT(3) +#define BMC_FILT_BROADCAST BIT(7) +#define BMC_FILT_MULTICAST_IPV6_NEIGH_ADVER BIT(8) +#define BMC_FILT_MULTICAST_IPV6_RA BIT(9) +#define BMC_FILT_MULTICAST_IPV6_RAS BIT(10) +#define BMC_FILT_MULTICAST BIT(15) +struct be_async_fw_control { + u32 event_data_word1; + u32 event_data_word2; + u32 evt_tag; + u32 event_data_word4; +} __packed; + +struct be_mcc_mailbox { + struct be_mcc_wrb wrb; + struct be_mcc_compl compl; +}; + +#define CMD_SUBSYSTEM_COMMON 0x1 +#define CMD_SUBSYSTEM_ETH 0x3 +#define CMD_SUBSYSTEM_LOWLEVEL 0xb + +#define OPCODE_COMMON_NTWK_MAC_QUERY 1 +#define OPCODE_COMMON_NTWK_MAC_SET 2 +#define OPCODE_COMMON_NTWK_MULTICAST_SET 3 +#define OPCODE_COMMON_NTWK_VLAN_CONFIG 4 +#define OPCODE_COMMON_NTWK_LINK_STATUS_QUERY 5 +#define OPCODE_COMMON_READ_FLASHROM 6 +#define OPCODE_COMMON_WRITE_FLASHROM 7 +#define OPCODE_COMMON_CQ_CREATE 12 +#define OPCODE_COMMON_EQ_CREATE 13 +#define OPCODE_COMMON_MCC_CREATE 21 +#define OPCODE_COMMON_SET_QOS 28 +#define OPCODE_COMMON_MCC_CREATE_EXT 90 +#define OPCODE_COMMON_SEEPROM_READ 30 +#define OPCODE_COMMON_GET_CNTL_ATTRIBUTES 32 +#define OPCODE_COMMON_NTWK_RX_FILTER 34 +#define OPCODE_COMMON_GET_FW_VERSION 35 +#define OPCODE_COMMON_SET_FLOW_CONTROL 36 +#define OPCODE_COMMON_GET_FLOW_CONTROL 37 +#define OPCODE_COMMON_SET_FRAME_SIZE 39 +#define OPCODE_COMMON_MODIFY_EQ_DELAY 41 +#define OPCODE_COMMON_FIRMWARE_CONFIG 42 +#define OPCODE_COMMON_NTWK_INTERFACE_CREATE 50 +#define OPCODE_COMMON_NTWK_INTERFACE_DESTROY 51 +#define OPCODE_COMMON_MCC_DESTROY 53 +#define OPCODE_COMMON_CQ_DESTROY 54 +#define OPCODE_COMMON_EQ_DESTROY 55 +#define OPCODE_COMMON_QUERY_FIRMWARE_CONFIG 58 +#define OPCODE_COMMON_NTWK_PMAC_ADD 59 +#define OPCODE_COMMON_NTWK_PMAC_DEL 60 +#define OPCODE_COMMON_FUNCTION_RESET 61 +#define OPCODE_COMMON_MANAGE_FAT 68 +#define OPCODE_COMMON_ENABLE_DISABLE_BEACON 69 +#define OPCODE_COMMON_GET_BEACON_STATE 70 +#define OPCODE_COMMON_READ_TRANSRECV_DATA 73 +#define OPCODE_COMMON_GET_PORT_NAME 77 +#define OPCODE_COMMON_SET_LOGICAL_LINK_CONFIG 80 +#define OPCODE_COMMON_SET_INTERRUPT_ENABLE 89 +#define OPCODE_COMMON_SET_FN_PRIVILEGES 100 +#define OPCODE_COMMON_GET_PHY_DETAILS 102 +#define OPCODE_COMMON_SET_DRIVER_FUNCTION_CAP 103 +#define OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES 121 +#define OPCODE_COMMON_GET_EXT_FAT_CAPABILITIES 125 +#define OPCODE_COMMON_SET_EXT_FAT_CAPABILITIES 126 +#define OPCODE_COMMON_GET_MAC_LIST 147 +#define OPCODE_COMMON_SET_MAC_LIST 148 +#define OPCODE_COMMON_GET_HSW_CONFIG 152 +#define OPCODE_COMMON_GET_FUNC_CONFIG 160 +#define OPCODE_COMMON_GET_PROFILE_CONFIG 164 +#define OPCODE_COMMON_SET_PROFILE_CONFIG 165 +#define OPCODE_COMMON_GET_ACTIVE_PROFILE 167 +#define OPCODE_COMMON_SET_HSW_CONFIG 153 +#define OPCODE_COMMON_GET_FN_PRIVILEGES 170 +#define OPCODE_COMMON_READ_OBJECT 171 +#define OPCODE_COMMON_WRITE_OBJECT 172 +#define OPCODE_COMMON_DELETE_OBJECT 174 +#define OPCODE_COMMON_SET_FEATURES 191 +#define OPCODE_COMMON_MANAGE_IFACE_FILTERS 193 +#define OPCODE_COMMON_GET_IFACE_LIST 194 +#define OPCODE_COMMON_ENABLE_DISABLE_VF 196 + +#define OPCODE_ETH_RSS_CONFIG 1 +#define OPCODE_ETH_ACPI_CONFIG 2 +#define OPCODE_ETH_PROMISCUOUS 3 +#define 
OPCODE_ETH_GET_STATISTICS 4 +#define OPCODE_ETH_TX_CREATE 7 +#define OPCODE_ETH_RX_CREATE 8 +#define OPCODE_ETH_TX_DESTROY 9 +#define OPCODE_ETH_RX_DESTROY 10 +#define OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG 12 +#define OPCODE_ETH_GET_PPORT_STATS 18 + +#define OPCODE_LOWLEVEL_HOST_DDR_DMA 17 +#define OPCODE_LOWLEVEL_LOOPBACK_TEST 18 +#define OPCODE_LOWLEVEL_SET_LOOPBACK_MODE 19 + +struct be_cmd_req_hdr { + u8 opcode; /* dword 0 */ + u8 subsystem; /* dword 0 */ + u8 port_number; /* dword 0 */ + u8 domain; /* dword 0 */ + u32 timeout; /* dword 1 */ + u32 request_length; /* dword 2 */ + u8 version; /* dword 3 */ + u8 rsvd[3]; /* dword 3 */ +}; + +#define RESP_HDR_INFO_OPCODE_SHIFT 0 /* bits 0 - 7 */ +#define RESP_HDR_INFO_SUBSYS_SHIFT 8 /* bits 8 - 15 */ +struct be_cmd_resp_hdr { + u8 opcode; /* dword 0 */ + u8 subsystem; /* dword 0 */ + u8 rsvd[2]; /* dword 0 */ + u8 base_status; /* dword 1 */ + u8 addl_status; /* dword 1 */ + u8 rsvd1[2]; /* dword 1 */ + u32 response_length; /* dword 2 */ + u32 actual_resp_len; /* dword 3 */ +}; + +struct phys_addr { + u32 lo; + u32 hi; +}; + +/************************** + * BE Command definitions * + **************************/ + +/* Pseudo amap definition in which each bit of the actual structure is defined + * as a byte: used to calculate offset/shift/mask of each field */ +struct amap_eq_context { + u8 cidx[13]; /* dword 0*/ + u8 rsvd0[3]; /* dword 0*/ + u8 epidx[13]; /* dword 0*/ + u8 valid; /* dword 0*/ + u8 rsvd1; /* dword 0*/ + u8 size; /* dword 0*/ + u8 pidx[13]; /* dword 1*/ + u8 rsvd2[3]; /* dword 1*/ + u8 pd[10]; /* dword 1*/ + u8 count[3]; /* dword 1*/ + u8 solevent; /* dword 1*/ + u8 stalled; /* dword 1*/ + u8 armed; /* dword 1*/ + u8 rsvd3[4]; /* dword 2*/ + u8 func[8]; /* dword 2*/ + u8 rsvd4; /* dword 2*/ + u8 delaymult[10]; /* dword 2*/ + u8 rsvd5[2]; /* dword 2*/ + u8 phase[2]; /* dword 2*/ + u8 nodelay; /* dword 2*/ + u8 rsvd6[4]; /* dword 2*/ + u8 rsvd7[32]; /* dword 3*/ +} __packed; + +struct be_cmd_req_eq_create { + struct be_cmd_req_hdr hdr; + u16 num_pages; /* sword */ + u16 rsvd0; /* sword */ + u8 context[sizeof(struct amap_eq_context) / 8]; + struct phys_addr pages[8]; +} __packed; + +struct be_cmd_resp_eq_create { + struct be_cmd_resp_hdr resp_hdr; + u16 eq_id; /* sword */ + u16 msix_idx; /* available only in v2 */ +} __packed; + +/******************** Mac query ***************************/ +enum { + MAC_ADDRESS_TYPE_STORAGE = 0x0, + MAC_ADDRESS_TYPE_NETWORK = 0x1, + MAC_ADDRESS_TYPE_PD = 0x2, + MAC_ADDRESS_TYPE_MANAGEMENT = 0x3 +}; + +struct mac_addr { + u16 size_of_struct; + u8 addr[ETH_ALEN]; +} __packed; + +struct be_cmd_req_mac_query { + struct be_cmd_req_hdr hdr; + u8 type; + u8 permanent; + u16 if_id; + u32 pmac_id; +} __packed; + +struct be_cmd_resp_mac_query { + struct be_cmd_resp_hdr hdr; + struct mac_addr mac; +}; + +/******************** PMac Add ***************************/ +struct be_cmd_req_pmac_add { + struct be_cmd_req_hdr hdr; + u32 if_id; + u8 mac_address[ETH_ALEN]; + u8 rsvd0[2]; +} __packed; + +struct be_cmd_resp_pmac_add { + struct be_cmd_resp_hdr hdr; + u32 pmac_id; +}; + +/******************** PMac Del ***************************/ +struct be_cmd_req_pmac_del { + struct be_cmd_req_hdr hdr; + u32 if_id; + u32 pmac_id; +}; + +/******************** Create CQ ***************************/ +/* Pseudo amap definition in which each bit of the actual structure is defined + * as a byte: used to calculate offset/shift/mask of each field */ +struct amap_cq_context_be { + u8 cidx[11]; /* dword 0*/ + u8 rsvd0; /* dword 0*/ + u8 
coalescwm[2]; /* dword 0*/ + u8 nodelay; /* dword 0*/ + u8 epidx[11]; /* dword 0*/ + u8 rsvd1; /* dword 0*/ + u8 count[2]; /* dword 0*/ + u8 valid; /* dword 0*/ + u8 solevent; /* dword 0*/ + u8 eventable; /* dword 0*/ + u8 pidx[11]; /* dword 1*/ + u8 rsvd2; /* dword 1*/ + u8 pd[10]; /* dword 1*/ + u8 eqid[8]; /* dword 1*/ + u8 stalled; /* dword 1*/ + u8 armed; /* dword 1*/ + u8 rsvd3[4]; /* dword 2*/ + u8 func[8]; /* dword 2*/ + u8 rsvd4[20]; /* dword 2*/ + u8 rsvd5[32]; /* dword 3*/ +} __packed; + +struct amap_cq_context_v2 { + u8 rsvd0[12]; /* dword 0*/ + u8 coalescwm[2]; /* dword 0*/ + u8 nodelay; /* dword 0*/ + u8 rsvd1[12]; /* dword 0*/ + u8 count[2]; /* dword 0*/ + u8 valid; /* dword 0*/ + u8 rsvd2; /* dword 0*/ + u8 eventable; /* dword 0*/ + u8 eqid[16]; /* dword 1*/ + u8 rsvd3[15]; /* dword 1*/ + u8 armed; /* dword 1*/ + u8 rsvd4[32]; /* dword 2*/ + u8 rsvd5[32]; /* dword 3*/ +} __packed; + +struct be_cmd_req_cq_create { + struct be_cmd_req_hdr hdr; + u16 num_pages; + u8 page_size; + u8 rsvd0; + u8 context[sizeof(struct amap_cq_context_be) / 8]; + struct phys_addr pages[8]; +} __packed; + + +struct be_cmd_resp_cq_create { + struct be_cmd_resp_hdr hdr; + u16 cq_id; + u16 rsvd0; +} __packed; + +struct be_cmd_req_get_fat { + struct be_cmd_req_hdr hdr; + u32 fat_operation; + u32 read_log_offset; + u32 read_log_length; + u32 data_buffer_size; + u32 data_buffer[1]; +} __packed; + +struct be_cmd_resp_get_fat { + struct be_cmd_resp_hdr hdr; + u32 log_size; + u32 read_log_length; + u32 rsvd[2]; + u32 data_buffer[1]; +} __packed; + + +/******************** Create MCCQ ***************************/ +/* Pseudo amap definition in which each bit of the actual structure is defined + * as a byte: used to calculate offset/shift/mask of each field */ +struct amap_mcc_context_be { + u8 con_index[14]; + u8 rsvd0[2]; + u8 ring_size[4]; + u8 fetch_wrb; + u8 fetch_r2t; + u8 cq_id[10]; + u8 prod_index[14]; + u8 fid[8]; + u8 pdid[9]; + u8 valid; + u8 rsvd1[32]; + u8 rsvd2[32]; +} __packed; + +struct amap_mcc_context_v1 { + u8 async_cq_id[16]; + u8 ring_size[4]; + u8 rsvd0[12]; + u8 rsvd1[31]; + u8 valid; + u8 async_cq_valid[1]; + u8 rsvd2[31]; + u8 rsvd3[32]; +} __packed; + +struct be_cmd_req_mcc_create { + struct be_cmd_req_hdr hdr; + u16 num_pages; + u16 cq_id; + u8 context[sizeof(struct amap_mcc_context_be) / 8]; + struct phys_addr pages[8]; +} __packed; + +struct be_cmd_req_mcc_ext_create { + struct be_cmd_req_hdr hdr; + u16 num_pages; + u16 cq_id; + u32 async_event_bitmap[1]; + u8 context[sizeof(struct amap_mcc_context_v1) / 8]; + struct phys_addr pages[8]; +} __packed; + +struct be_cmd_resp_mcc_create { + struct be_cmd_resp_hdr hdr; + u16 id; + u16 rsvd0; +} __packed; + +/******************** Create TxQ ***************************/ +#define BE_ETH_TX_RING_TYPE_STANDARD 2 +#define BE_ULP1_NUM 1 + +struct be_cmd_req_eth_tx_create { + struct be_cmd_req_hdr hdr; + u8 num_pages; + u8 ulp_num; + u16 type; + u16 if_id; + u8 queue_size; + u8 rsvd0; + u32 rsvd1; + u16 cq_id; + u16 rsvd2; + u32 rsvd3[13]; + struct phys_addr pages[8]; +} __packed; + +struct be_cmd_resp_eth_tx_create { + struct be_cmd_resp_hdr hdr; + u16 cid; + u16 rid; + u32 db_offset; + u32 rsvd0[4]; +} __packed; + +/******************** Create RxQ ***************************/ +struct be_cmd_req_eth_rx_create { + struct be_cmd_req_hdr hdr; + u16 cq_id; + u8 frag_size; + u8 num_pages; + struct phys_addr pages[2]; + u32 interface_id; + u16 max_frame_size; + u16 rsvd0; + u32 rss_queue; +} __packed; + +struct be_cmd_resp_eth_rx_create { + struct 
be_cmd_resp_hdr hdr; + u16 id; + u8 rss_id; + u8 rsvd0; +} __packed; + +/******************** Q Destroy ***************************/ +/* Type of Queue to be destroyed */ +enum { + QTYPE_EQ = 1, + QTYPE_CQ, + QTYPE_TXQ, + QTYPE_RXQ, + QTYPE_MCCQ +}; + +struct be_cmd_req_q_destroy { + struct be_cmd_req_hdr hdr; + u16 id; + u16 bypass_flush; /* valid only for rx q destroy */ +} __packed; + +/************ I/f Create (it's actually I/f Config Create)**********/ + +/* Capability flags for the i/f */ +enum be_if_flags { + BE_IF_FLAGS_RSS = 0x4, + BE_IF_FLAGS_PROMISCUOUS = 0x8, + BE_IF_FLAGS_BROADCAST = 0x10, + BE_IF_FLAGS_UNTAGGED = 0x20, + BE_IF_FLAGS_ULP = 0x40, + BE_IF_FLAGS_VLAN_PROMISCUOUS = 0x80, + BE_IF_FLAGS_VLAN = 0x100, + BE_IF_FLAGS_MCAST_PROMISCUOUS = 0x200, + BE_IF_FLAGS_PASS_L2_ERRORS = 0x400, + BE_IF_FLAGS_PASS_L3L4_ERRORS = 0x800, + BE_IF_FLAGS_MULTICAST = 0x1000, + BE_IF_FLAGS_DEFQ_RSS = 0x1000000 +}; + +#define BE_IF_CAP_FLAGS_WANT (BE_IF_FLAGS_RSS | BE_IF_FLAGS_PROMISCUOUS |\ + BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_VLAN_PROMISCUOUS |\ + BE_IF_FLAGS_VLAN | BE_IF_FLAGS_MCAST_PROMISCUOUS |\ + BE_IF_FLAGS_PASS_L3L4_ERRORS | BE_IF_FLAGS_MULTICAST |\ + BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_DEFQ_RSS) + +#define BE_IF_FLAGS_ALL_PROMISCUOUS (BE_IF_FLAGS_PROMISCUOUS | \ + BE_IF_FLAGS_VLAN_PROMISCUOUS |\ + BE_IF_FLAGS_MCAST_PROMISCUOUS) + +#define BE_IF_FILT_FLAGS_BASIC (BE_IF_FLAGS_BROADCAST | \ + BE_IF_FLAGS_PASS_L3L4_ERRORS | \ + BE_IF_FLAGS_UNTAGGED) + +#define BE_IF_ALL_FILT_FLAGS (BE_IF_FILT_FLAGS_BASIC | \ + BE_IF_FLAGS_MULTICAST | \ + BE_IF_FLAGS_ALL_PROMISCUOUS) + +/* An RX interface is an object with one or more MAC addresses and + * filtering capabilities. */ +struct be_cmd_req_if_create { + struct be_cmd_req_hdr hdr; + u32 version; /* ignore currently */ + u32 capability_flags; + u32 enable_flags; + u8 mac_addr[ETH_ALEN]; + u8 rsvd0; + u8 pmac_invalid; /* if set, don't attach the mac addr to the i/f */ + u32 vlan_tag; /* not used currently */ +} __packed; + +struct be_cmd_resp_if_create { + struct be_cmd_resp_hdr hdr; + u32 interface_id; + u32 pmac_id; +}; + +/****** I/f Destroy(it's actually I/f Config Destroy )**********/ +struct be_cmd_req_if_destroy { + struct be_cmd_req_hdr hdr; + u32 interface_id; +}; + +/*************** HW Stats Get **********************************/ +struct be_port_rxf_stats_v0 { + u32 rx_bytes_lsd; /* dword 0*/ + u32 rx_bytes_msd; /* dword 1*/ + u32 rx_total_frames; /* dword 2*/ + u32 rx_unicast_frames; /* dword 3*/ + u32 rx_multicast_frames; /* dword 4*/ + u32 rx_broadcast_frames; /* dword 5*/ + u32 rx_crc_errors; /* dword 6*/ + u32 rx_alignment_symbol_errors; /* dword 7*/ + u32 rx_pause_frames; /* dword 8*/ + u32 rx_control_frames; /* dword 9*/ + u32 rx_in_range_errors; /* dword 10*/ + u32 rx_out_range_errors; /* dword 11*/ + u32 rx_frame_too_long; /* dword 12*/ + u32 rx_address_filtered; /* dword 13*/ + u32 rx_vlan_filtered; /* dword 14*/ + u32 rx_dropped_too_small; /* dword 15*/ + u32 rx_dropped_too_short; /* dword 16*/ + u32 rx_dropped_header_too_small; /* dword 17*/ + u32 rx_dropped_tcp_length; /* dword 18*/ + u32 rx_dropped_runt; /* dword 19*/ + u32 rx_64_byte_packets; /* dword 20*/ + u32 rx_65_127_byte_packets; /* dword 21*/ + u32 rx_128_256_byte_packets; /* dword 22*/ + u32 rx_256_511_byte_packets; /* dword 23*/ + u32 rx_512_1023_byte_packets; /* dword 24*/ + u32 rx_1024_1518_byte_packets; /* dword 25*/ + u32 rx_1519_2047_byte_packets; /* dword 26*/ + u32 rx_2048_4095_byte_packets; /* dword 27*/ + u32 rx_4096_8191_byte_packets; /* dword 28*/ + 
u32 rx_8192_9216_byte_packets; /* dword 29*/ + u32 rx_ip_checksum_errs; /* dword 30*/ + u32 rx_tcp_checksum_errs; /* dword 31*/ + u32 rx_udp_checksum_errs; /* dword 32*/ + u32 rx_non_rss_packets; /* dword 33*/ + u32 rx_ipv4_packets; /* dword 34*/ + u32 rx_ipv6_packets; /* dword 35*/ + u32 rx_ipv4_bytes_lsd; /* dword 36*/ + u32 rx_ipv4_bytes_msd; /* dword 37*/ + u32 rx_ipv6_bytes_lsd; /* dword 38*/ + u32 rx_ipv6_bytes_msd; /* dword 39*/ + u32 rx_chute1_packets; /* dword 40*/ + u32 rx_chute2_packets; /* dword 41*/ + u32 rx_chute3_packets; /* dword 42*/ + u32 rx_management_packets; /* dword 43*/ + u32 rx_switched_unicast_packets; /* dword 44*/ + u32 rx_switched_multicast_packets; /* dword 45*/ + u32 rx_switched_broadcast_packets; /* dword 46*/ + u32 tx_bytes_lsd; /* dword 47*/ + u32 tx_bytes_msd; /* dword 48*/ + u32 tx_unicastframes; /* dword 49*/ + u32 tx_multicastframes; /* dword 50*/ + u32 tx_broadcastframes; /* dword 51*/ + u32 tx_pauseframes; /* dword 52*/ + u32 tx_controlframes; /* dword 53*/ + u32 tx_64_byte_packets; /* dword 54*/ + u32 tx_65_127_byte_packets; /* dword 55*/ + u32 tx_128_256_byte_packets; /* dword 56*/ + u32 tx_256_511_byte_packets; /* dword 57*/ + u32 tx_512_1023_byte_packets; /* dword 58*/ + u32 tx_1024_1518_byte_packets; /* dword 59*/ + u32 tx_1519_2047_byte_packets; /* dword 60*/ + u32 tx_2048_4095_byte_packets; /* dword 61*/ + u32 tx_4096_8191_byte_packets; /* dword 62*/ + u32 tx_8192_9216_byte_packets; /* dword 63*/ + u32 rx_fifo_overflow; /* dword 64*/ + u32 rx_input_fifo_overflow; /* dword 65*/ +}; + +struct be_rxf_stats_v0 { + struct be_port_rxf_stats_v0 port[2]; + u32 rx_drops_no_pbuf; /* dword 132*/ + u32 rx_drops_no_txpb; /* dword 133*/ + u32 rx_drops_no_erx_descr; /* dword 134*/ + u32 rx_drops_no_tpre_descr; /* dword 135*/ + u32 management_rx_port_packets; /* dword 136*/ + u32 management_rx_port_bytes; /* dword 137*/ + u32 management_rx_port_pause_frames; /* dword 138*/ + u32 management_rx_port_errors; /* dword 139*/ + u32 management_tx_port_packets; /* dword 140*/ + u32 management_tx_port_bytes; /* dword 141*/ + u32 management_tx_port_pause; /* dword 142*/ + u32 management_rx_port_rxfifo_overflow; /* dword 143*/ + u32 rx_drops_too_many_frags; /* dword 144*/ + u32 rx_drops_invalid_ring; /* dword 145*/ + u32 forwarded_packets; /* dword 146*/ + u32 rx_drops_mtu; /* dword 147*/ + u32 rsvd0[7]; + u32 port0_jabber_events; + u32 port1_jabber_events; + u32 rsvd1[6]; +}; + +struct be_erx_stats_v0 { + u32 rx_drops_no_fragments[44]; /* dwordS 0 to 43*/ + u32 rsvd[4]; +}; + +struct be_pmem_stats { + u32 eth_red_drops; + u32 rsvd[5]; +}; + +struct be_hw_stats_v0 { + struct be_rxf_stats_v0 rxf; + u32 rsvd[48]; + struct be_erx_stats_v0 erx; + struct be_pmem_stats pmem; +}; + +struct be_cmd_req_get_stats_v0 { + struct be_cmd_req_hdr hdr; + u8 rsvd[sizeof(struct be_hw_stats_v0)]; +}; + +struct be_cmd_resp_get_stats_v0 { + struct be_cmd_resp_hdr hdr; + struct be_hw_stats_v0 hw_stats; +}; + +struct lancer_pport_stats { + u32 tx_packets_lo; + u32 tx_packets_hi; + u32 tx_unicast_packets_lo; + u32 tx_unicast_packets_hi; + u32 tx_multicast_packets_lo; + u32 tx_multicast_packets_hi; + u32 tx_broadcast_packets_lo; + u32 tx_broadcast_packets_hi; + u32 tx_bytes_lo; + u32 tx_bytes_hi; + u32 tx_unicast_bytes_lo; + u32 tx_unicast_bytes_hi; + u32 tx_multicast_bytes_lo; + u32 tx_multicast_bytes_hi; + u32 tx_broadcast_bytes_lo; + u32 tx_broadcast_bytes_hi; + u32 tx_discards_lo; + u32 tx_discards_hi; + u32 tx_errors_lo; + u32 tx_errors_hi; + u32 tx_pause_frames_lo; + u32 
tx_pause_frames_hi; + u32 tx_pause_on_frames_lo; + u32 tx_pause_on_frames_hi; + u32 tx_pause_off_frames_lo; + u32 tx_pause_off_frames_hi; + u32 tx_internal_mac_errors_lo; + u32 tx_internal_mac_errors_hi; + u32 tx_control_frames_lo; + u32 tx_control_frames_hi; + u32 tx_packets_64_bytes_lo; + u32 tx_packets_64_bytes_hi; + u32 tx_packets_65_to_127_bytes_lo; + u32 tx_packets_65_to_127_bytes_hi; + u32 tx_packets_128_to_255_bytes_lo; + u32 tx_packets_128_to_255_bytes_hi; + u32 tx_packets_256_to_511_bytes_lo; + u32 tx_packets_256_to_511_bytes_hi; + u32 tx_packets_512_to_1023_bytes_lo; + u32 tx_packets_512_to_1023_bytes_hi; + u32 tx_packets_1024_to_1518_bytes_lo; + u32 tx_packets_1024_to_1518_bytes_hi; + u32 tx_packets_1519_to_2047_bytes_lo; + u32 tx_packets_1519_to_2047_bytes_hi; + u32 tx_packets_2048_to_4095_bytes_lo; + u32 tx_packets_2048_to_4095_bytes_hi; + u32 tx_packets_4096_to_8191_bytes_lo; + u32 tx_packets_4096_to_8191_bytes_hi; + u32 tx_packets_8192_to_9216_bytes_lo; + u32 tx_packets_8192_to_9216_bytes_hi; + u32 tx_lso_packets_lo; + u32 tx_lso_packets_hi; + u32 rx_packets_lo; + u32 rx_packets_hi; + u32 rx_unicast_packets_lo; + u32 rx_unicast_packets_hi; + u32 rx_multicast_packets_lo; + u32 rx_multicast_packets_hi; + u32 rx_broadcast_packets_lo; + u32 rx_broadcast_packets_hi; + u32 rx_bytes_lo; + u32 rx_bytes_hi; + u32 rx_unicast_bytes_lo; + u32 rx_unicast_bytes_hi; + u32 rx_multicast_bytes_lo; + u32 rx_multicast_bytes_hi; + u32 rx_broadcast_bytes_lo; + u32 rx_broadcast_bytes_hi; + u32 rx_unknown_protos; + u32 rsvd_69; /* Word 69 is reserved */ + u32 rx_discards_lo; + u32 rx_discards_hi; + u32 rx_errors_lo; + u32 rx_errors_hi; + u32 rx_crc_errors_lo; + u32 rx_crc_errors_hi; + u32 rx_alignment_errors_lo; + u32 rx_alignment_errors_hi; + u32 rx_symbol_errors_lo; + u32 rx_symbol_errors_hi; + u32 rx_pause_frames_lo; + u32 rx_pause_frames_hi; + u32 rx_pause_on_frames_lo; + u32 rx_pause_on_frames_hi; + u32 rx_pause_off_frames_lo; + u32 rx_pause_off_frames_hi; + u32 rx_frames_too_long_lo; + u32 rx_frames_too_long_hi; + u32 rx_internal_mac_errors_lo; + u32 rx_internal_mac_errors_hi; + u32 rx_undersize_packets; + u32 rx_oversize_packets; + u32 rx_fragment_packets; + u32 rx_jabbers; + u32 rx_control_frames_lo; + u32 rx_control_frames_hi; + u32 rx_control_frames_unknown_opcode_lo; + u32 rx_control_frames_unknown_opcode_hi; + u32 rx_in_range_errors; + u32 rx_out_of_range_errors; + u32 rx_address_filtered; + u32 rx_vlan_filtered; + u32 rx_dropped_too_small; + u32 rx_dropped_too_short; + u32 rx_dropped_header_too_small; + u32 rx_dropped_invalid_tcp_length; + u32 rx_dropped_runt; + u32 rx_ip_checksum_errors; + u32 rx_tcp_checksum_errors; + u32 rx_udp_checksum_errors; + u32 rx_non_rss_packets; + u32 rsvd_111; + u32 rx_ipv4_packets_lo; + u32 rx_ipv4_packets_hi; + u32 rx_ipv6_packets_lo; + u32 rx_ipv6_packets_hi; + u32 rx_ipv4_bytes_lo; + u32 rx_ipv4_bytes_hi; + u32 rx_ipv6_bytes_lo; + u32 rx_ipv6_bytes_hi; + u32 rx_nic_packets_lo; + u32 rx_nic_packets_hi; + u32 rx_tcp_packets_lo; + u32 rx_tcp_packets_hi; + u32 rx_iscsi_packets_lo; + u32 rx_iscsi_packets_hi; + u32 rx_management_packets_lo; + u32 rx_management_packets_hi; + u32 rx_switched_unicast_packets_lo; + u32 rx_switched_unicast_packets_hi; + u32 rx_switched_multicast_packets_lo; + u32 rx_switched_multicast_packets_hi; + u32 rx_switched_broadcast_packets_lo; + u32 rx_switched_broadcast_packets_hi; + u32 num_forwards_lo; + u32 num_forwards_hi; + u32 rx_fifo_overflow; + u32 rx_input_fifo_overflow; + u32 rx_drops_too_many_frags_lo; + u32 
rx_drops_too_many_frags_hi; + u32 rx_drops_invalid_queue; + u32 rsvd_141; + u32 rx_drops_mtu_lo; + u32 rx_drops_mtu_hi; + u32 rx_packets_64_bytes_lo; + u32 rx_packets_64_bytes_hi; + u32 rx_packets_65_to_127_bytes_lo; + u32 rx_packets_65_to_127_bytes_hi; + u32 rx_packets_128_to_255_bytes_lo; + u32 rx_packets_128_to_255_bytes_hi; + u32 rx_packets_256_to_511_bytes_lo; + u32 rx_packets_256_to_511_bytes_hi; + u32 rx_packets_512_to_1023_bytes_lo; + u32 rx_packets_512_to_1023_bytes_hi; + u32 rx_packets_1024_to_1518_bytes_lo; + u32 rx_packets_1024_to_1518_bytes_hi; + u32 rx_packets_1519_to_2047_bytes_lo; + u32 rx_packets_1519_to_2047_bytes_hi; + u32 rx_packets_2048_to_4095_bytes_lo; + u32 rx_packets_2048_to_4095_bytes_hi; + u32 rx_packets_4096_to_8191_bytes_lo; + u32 rx_packets_4096_to_8191_bytes_hi; + u32 rx_packets_8192_to_9216_bytes_lo; + u32 rx_packets_8192_to_9216_bytes_hi; +}; + +struct pport_stats_params { + u16 pport_num; + u8 rsvd; + u8 reset_stats; +}; + +struct lancer_cmd_req_pport_stats { + struct be_cmd_req_hdr hdr; + union { + struct pport_stats_params params; + u8 rsvd[sizeof(struct lancer_pport_stats)]; + } cmd_params; +}; + +struct lancer_cmd_resp_pport_stats { + struct be_cmd_resp_hdr hdr; + struct lancer_pport_stats pport_stats; +}; + +static inline struct lancer_pport_stats* + pport_stats_from_cmd(struct be_adapter *adapter) +{ + struct lancer_cmd_resp_pport_stats *cmd = adapter->stats_cmd.va; + return &cmd->pport_stats; +} + +struct be_cmd_req_get_cntl_addnl_attribs { + struct be_cmd_req_hdr hdr; + u8 rsvd[8]; +}; + +struct be_cmd_resp_get_cntl_addnl_attribs { + struct be_cmd_resp_hdr hdr; + u16 ipl_file_number; + u8 ipl_file_version; + u8 rsvd0; + u8 on_die_temperature; /* in degrees centigrade*/ + u8 rsvd1[3]; +}; + +struct be_cmd_req_vlan_config { + struct be_cmd_req_hdr hdr; + u8 interface_id; + u8 promiscuous; + u8 untagged; + u8 num_vlan; + u16 normal_vlan[64]; +} __packed; + +/******************* RX FILTER ******************************/ +#define BE_MAX_MC 64 /* set mcast promisc if > 64 */ +struct macaddr { + u8 byte[ETH_ALEN]; +}; + +struct be_cmd_req_rx_filter { + struct be_cmd_req_hdr hdr; + u32 global_flags_mask; + u32 global_flags; + u32 if_flags_mask; + u32 if_flags; + u32 if_id; + u32 mcast_num; + struct macaddr mcast_mac[BE_MAX_MC]; +}; + +/******************** Link Status Query *******************/ +struct be_cmd_req_link_status { + struct be_cmd_req_hdr hdr; + u32 rsvd; +}; + +enum { + PHY_LINK_DUPLEX_NONE = 0x0, + PHY_LINK_DUPLEX_HALF = 0x1, + PHY_LINK_DUPLEX_FULL = 0x2 +}; + +enum { + PHY_LINK_SPEED_ZERO = 0x0, /* => No link */ + PHY_LINK_SPEED_10MBPS = 0x1, + PHY_LINK_SPEED_100MBPS = 0x2, + PHY_LINK_SPEED_1GBPS = 0x3, + PHY_LINK_SPEED_10GBPS = 0x4, + PHY_LINK_SPEED_20GBPS = 0x5, + PHY_LINK_SPEED_25GBPS = 0x6, + PHY_LINK_SPEED_40GBPS = 0x7 +}; + +struct be_cmd_resp_link_status { + struct be_cmd_resp_hdr hdr; + u8 physical_port; + u8 mac_duplex; + u8 mac_speed; + u8 mac_fault; + u8 mgmt_mac_duplex; + u8 mgmt_mac_speed; + u16 link_speed; + u8 logical_link_status; + u8 rsvd1[3]; +} __packed; + +/******************** Port Identification ***************************/ +/* Identifies the type of port attached to NIC */ +struct be_cmd_req_port_type { + struct be_cmd_req_hdr hdr; + __le32 page_num; + __le32 port; +}; + +enum { + TR_PAGE_A0 = 0xa0, + TR_PAGE_A2 = 0xa2 +}; + +/* From SFF-8436 QSFP+ spec */ +#define QSFP_PLUS_CABLE_TYPE_OFFSET 0x83 +#define QSFP_PLUS_CR4_CABLE 0x8 +#define QSFP_PLUS_SR4_CABLE 0x4 +#define QSFP_PLUS_LR4_CABLE 0x2 + +/* From SFF-8472 spec 
*/ +#define SFP_PLUS_SFF_8472_COMP 0x5E +#define SFP_PLUS_CABLE_TYPE_OFFSET 0x8 +#define SFP_PLUS_COPPER_CABLE 0x4 +#define SFP_VENDOR_NAME_OFFSET 0x14 +#define SFP_VENDOR_PN_OFFSET 0x28 + +#define PAGE_DATA_LEN 256 +struct be_cmd_resp_port_type { + struct be_cmd_resp_hdr hdr; + u32 page_num; + u32 port; + u8 page_data[PAGE_DATA_LEN]; +}; + +/******************** Get FW Version *******************/ +struct be_cmd_req_get_fw_version { + struct be_cmd_req_hdr hdr; + u8 rsvd0[FW_VER_LEN]; + u8 rsvd1[FW_VER_LEN]; +} __packed; + +struct be_cmd_resp_get_fw_version { + struct be_cmd_resp_hdr hdr; + u8 firmware_version_string[FW_VER_LEN]; + u8 fw_on_flash_version_string[FW_VER_LEN]; +} __packed; + +/******************** Set Flow Contrl *******************/ +struct be_cmd_req_set_flow_control { + struct be_cmd_req_hdr hdr; + u16 tx_flow_control; + u16 rx_flow_control; +} __packed; + +/******************** Get Flow Contrl *******************/ +struct be_cmd_req_get_flow_control { + struct be_cmd_req_hdr hdr; + u32 rsvd; +}; + +struct be_cmd_resp_get_flow_control { + struct be_cmd_resp_hdr hdr; + u16 tx_flow_control; + u16 rx_flow_control; +} __packed; + +/******************** Modify EQ Delay *******************/ +struct be_set_eqd { + u32 eq_id; + u32 phase; + u32 delay_multiplier; +}; + +struct be_cmd_req_modify_eq_delay { + struct be_cmd_req_hdr hdr; + u32 num_eq; + struct be_set_eqd set_eqd[MAX_EVT_QS]; +} __packed; + +/******************** Get FW Config *******************/ +/* The HW can come up in either of the following multi-channel modes + * based on the skew/IPL. + */ +#define RDMA_ENABLED 0x4 +#define QNQ_MODE 0x400 +#define VNIC_MODE 0x20000 +#define UMC_ENABLED 0x1000000 +struct be_cmd_req_query_fw_cfg { + struct be_cmd_req_hdr hdr; + u32 rsvd[31]; +}; + +struct be_cmd_resp_query_fw_cfg { + struct be_cmd_resp_hdr hdr; + u32 be_config_number; + u32 asic_revision; + u32 phys_port; + u32 function_mode; + u32 rsvd[26]; + u32 function_caps; +}; + +/******************** RSS Config ****************************************/ +/* RSS type Input parameters used to compute RX hash + * RSS_ENABLE_IPV4 SRC IPv4, DST IPv4 + * RSS_ENABLE_TCP_IPV4 SRC IPv4, DST IPv4, TCP SRC PORT, TCP DST PORT + * RSS_ENABLE_IPV6 SRC IPv6, DST IPv6 + * RSS_ENABLE_TCP_IPV6 SRC IPv6, DST IPv6, TCP SRC PORT, TCP DST PORT + * RSS_ENABLE_UDP_IPV4 SRC IPv4, DST IPv4, UDP SRC PORT, UDP DST PORT + * RSS_ENABLE_UDP_IPV6 SRC IPv6, DST IPv6, UDP SRC PORT, UDP DST PORT + * + * When multiple RSS types are enabled, HW picks the best hash policy + * based on the type of the received packet. 
+ */ +#define RSS_ENABLE_NONE 0x0 +#define RSS_ENABLE_IPV4 0x1 +#define RSS_ENABLE_TCP_IPV4 0x2 +#define RSS_ENABLE_IPV6 0x4 +#define RSS_ENABLE_TCP_IPV6 0x8 +#define RSS_ENABLE_UDP_IPV4 0x10 +#define RSS_ENABLE_UDP_IPV6 0x20 + +#define L3_RSS_FLAGS (RXH_IP_DST | RXH_IP_SRC) +#define L4_RSS_FLAGS (RXH_L4_B_0_1 | RXH_L4_B_2_3) + +struct be_cmd_req_rss_config { + struct be_cmd_req_hdr hdr; + u32 if_id; + u16 enable_rss; + u16 cpu_table_size_log2; + u32 hash[10]; + u8 cpu_table[128]; + u8 flush; + u8 rsvd0[3]; +}; + +/******************** Port Beacon ***************************/ + +#define BEACON_STATE_ENABLED 0x1 +#define BEACON_STATE_DISABLED 0x0 + +struct be_cmd_req_enable_disable_beacon { + struct be_cmd_req_hdr hdr; + u8 port_num; + u8 beacon_state; + u8 beacon_duration; + u8 status_duration; +} __packed; + +struct be_cmd_req_get_beacon_state { + struct be_cmd_req_hdr hdr; + u8 port_num; + u8 rsvd0; + u16 rsvd1; +} __packed; + +struct be_cmd_resp_get_beacon_state { + struct be_cmd_resp_hdr resp_hdr; + u8 beacon_state; + u8 rsvd0[3]; +} __packed; + +/* Flashrom related descriptors */ +#define MAX_FLASH_COMP 32 + +/* Optypes of each component in the UFI */ +enum { + OPTYPE_ISCSI_ACTIVE = 0, + OPTYPE_REDBOOT = 1, + OPTYPE_BIOS = 2, + OPTYPE_PXE_BIOS = 3, + OPTYPE_OFFSET_SPECIFIED = 7, + OPTYPE_FCOE_BIOS = 8, + OPTYPE_ISCSI_BACKUP = 9, + OPTYPE_FCOE_FW_ACTIVE = 10, + OPTYPE_FCOE_FW_BACKUP = 11, + OPTYPE_NCSI_FW = 13, + OPTYPE_REDBOOT_DIR = 18, + OPTYPE_REDBOOT_CONFIG = 19, + OPTYPE_SH_PHY_FW = 21, + OPTYPE_FLASHISM_JUMPVECTOR = 22, + OPTYPE_UFI_DIR = 23, + OPTYPE_PHY_FW = 99 +}; + +/* Maximum sizes of components in BE2 FW UFI */ +enum { + BE2_BIOS_COMP_MAX_SIZE = 0x40000, + BE2_REDBOOT_COMP_MAX_SIZE = 0x40000, + BE2_COMP_MAX_SIZE = 0x140000 +}; + +/* Maximum sizes of components in BE3 FW UFI */ +enum { + BE3_NCSI_COMP_MAX_SIZE = 0x40000, + BE3_PHY_FW_COMP_MAX_SIZE = 0x40000, + BE3_BIOS_COMP_MAX_SIZE = 0x80000, + BE3_REDBOOT_COMP_MAX_SIZE = 0x100000, + BE3_COMP_MAX_SIZE = 0x200000 +}; + +/* Offsets for components in BE2 FW UFI */ +enum { + BE2_REDBOOT_START = 0x8000, + BE2_FCOE_BIOS_START = 0x80000, + BE2_ISCSI_PRIMARY_IMAGE_START = 0x100000, + BE2_ISCSI_BACKUP_IMAGE_START = 0x240000, + BE2_FCOE_PRIMARY_IMAGE_START = 0x380000, + BE2_FCOE_BACKUP_IMAGE_START = 0x4c0000, + BE2_ISCSI_BIOS_START = 0x700000, + BE2_PXE_BIOS_START = 0x780000 +}; + +/* Offsets for components in BE3 FW UFI */ +enum { + BE3_REDBOOT_START = 0x40000, + BE3_PHY_FW_START = 0x140000, + BE3_ISCSI_PRIMARY_IMAGE_START = 0x200000, + BE3_ISCSI_BACKUP_IMAGE_START = 0x400000, + BE3_FCOE_PRIMARY_IMAGE_START = 0x600000, + BE3_FCOE_BACKUP_IMAGE_START = 0x800000, + BE3_ISCSI_BIOS_START = 0xc00000, + BE3_PXE_BIOS_START = 0xc80000, + BE3_FCOE_BIOS_START = 0xd00000, + BE3_NCSI_START = 0xf40000 +}; + +/* Component entry types */ +enum { + IMAGE_NCSI = 0x10, + IMAGE_OPTION_ROM_PXE = 0x20, + IMAGE_OPTION_ROM_FCOE = 0x21, + IMAGE_OPTION_ROM_ISCSI = 0x22, + IMAGE_FLASHISM_JUMPVECTOR = 0x30, + IMAGE_FIRMWARE_ISCSI = 0xa0, + IMAGE_FIRMWARE_FCOE = 0xa2, + IMAGE_FIRMWARE_BACKUP_ISCSI = 0xb0, + IMAGE_FIRMWARE_BACKUP_FCOE = 0xb2, + IMAGE_FIRMWARE_PHY = 0xc0, + IMAGE_REDBOOT_DIR = 0xd0, + IMAGE_REDBOOT_CONFIG = 0xd1, + IMAGE_UFI_DIR = 0xd2, + IMAGE_BOOT_CODE = 0xe2 +}; + +struct controller_id { + u32 vendor; + u32 device; + u32 subvendor; + u32 subdevice; +}; + +struct flash_comp { + unsigned long offset; + int optype; + int size; + int img_type; +}; + +struct image_hdr { + u32 imageid; + u32 imageoffset; + u32 imagelength; + u32 image_checksum; + u8 
image_version[32]; +}; + +struct flash_file_hdr_g2 { + u8 sign[32]; + u32 cksum; + u32 antidote; + struct controller_id cont_id; + u32 file_len; + u32 chunk_num; + u32 total_chunks; + u32 num_imgs; + u8 build[24]; +}; + +/* First letter of the build version of the image */ +#define BLD_STR_UFI_TYPE_BE2 '2' +#define BLD_STR_UFI_TYPE_BE3 '3' +#define BLD_STR_UFI_TYPE_SH '4' + +struct flash_file_hdr_g3 { + u8 sign[52]; + u8 ufi_version[4]; + u32 file_len; + u32 cksum; + u32 antidote; + u32 num_imgs; + u8 build[24]; + u8 asic_type_rev; + u8 rsvd[31]; +}; + +struct flash_section_hdr { + u32 format_rev; + u32 cksum; + u32 antidote; + u32 num_images; + u8 id_string[128]; + u32 rsvd[4]; +} __packed; + +struct flash_section_hdr_g2 { + u32 format_rev; + u32 cksum; + u32 antidote; + u32 build_num; + u8 id_string[128]; + u32 rsvd[8]; +} __packed; + +struct flash_section_entry { + u32 type; + u32 offset; + u32 pad_size; + u32 image_size; + u32 cksum; + u32 entry_point; + u16 optype; + u16 rsvd0; + u32 rsvd1; + u8 ver_data[32]; +} __packed; + +struct flash_section_info { + u8 cookie[32]; + struct flash_section_hdr fsec_hdr; + struct flash_section_entry fsec_entry[32]; +} __packed; + +struct flash_section_info_g2 { + u8 cookie[32]; + struct flash_section_hdr_g2 fsec_hdr; + struct flash_section_entry fsec_entry[32]; +} __packed; + +/****************** Firmware Flash ******************/ +#define FLASHROM_OPER_FLASH 1 +#define FLASHROM_OPER_SAVE 2 +#define FLASHROM_OPER_REPORT 4 +#define FLASHROM_OPER_PHY_FLASH 9 +#define FLASHROM_OPER_PHY_SAVE 10 + +struct flashrom_params { + u32 op_code; + u32 op_type; + u32 data_buf_size; + u32 offset; +}; + +struct be_cmd_write_flashrom { + struct be_cmd_req_hdr hdr; + struct flashrom_params params; + u8 data_buf[32768]; + u8 rsvd[4]; +} __packed; + +/* cmd to read flash crc */ +struct be_cmd_read_flash_crc { + struct be_cmd_req_hdr hdr; + struct flashrom_params params; + u8 crc[4]; + u8 rsvd[4]; +} __packed; + +/**************** Lancer Firmware Flash ************/ +#define LANCER_FW_DOWNLOAD_CHUNK (32 * 1024) +#define LANCER_FW_DOWNLOAD_LOCATION "/prg" + +struct amap_lancer_write_obj_context { + u8 write_length[24]; + u8 reserved1[7]; + u8 eof; +} __packed; + +struct lancer_cmd_req_write_object { + struct be_cmd_req_hdr hdr; + u8 context[sizeof(struct amap_lancer_write_obj_context) / 8]; + u32 write_offset; + u8 object_name[104]; + u32 descriptor_count; + u32 buf_len; + u32 addr_low; + u32 addr_high; +}; + +#define LANCER_NO_RESET_NEEDED 0x00 +#define LANCER_FW_RESET_NEEDED 0x02 +struct lancer_cmd_resp_write_object { + u8 opcode; + u8 subsystem; + u8 rsvd1[2]; + u8 status; + u8 additional_status; + u8 rsvd2[2]; + u32 resp_len; + u32 actual_resp_len; + u32 actual_write_len; + u8 change_status; + u8 rsvd3[3]; +}; + +/************************ Lancer Read FW info **************/ +#define LANCER_READ_FILE_CHUNK (32*1024) +#define LANCER_READ_FILE_EOF_MASK 0x80000000 + +#define LANCER_FW_DUMP_FILE "/dbg/dump.bin" +#define LANCER_VPD_PF_FILE "/vpd/ntr_pf.vpd" +#define LANCER_VPD_VF_FILE "/vpd/ntr_vf.vpd" + +struct lancer_cmd_req_read_object { + struct be_cmd_req_hdr hdr; + u32 desired_read_len; + u32 read_offset; + u8 object_name[104]; + u32 descriptor_count; + u32 buf_len; + u32 addr_low; + u32 addr_high; +}; + +struct lancer_cmd_resp_read_object { + u8 opcode; + u8 subsystem; + u8 rsvd1[2]; + u8 status; + u8 additional_status; + u8 rsvd2[2]; + u32 resp_len; + u32 actual_resp_len; + u32 actual_read_len; + u32 eof; +}; + +struct lancer_cmd_req_delete_object { + struct 
be_cmd_req_hdr hdr; + u32 rsvd1; + u32 rsvd2; + u8 object_name[104]; +}; + +/************************ WOL *******************************/ +struct be_cmd_req_acpi_wol_magic_config{ + struct be_cmd_req_hdr hdr; + u32 rsvd0[145]; + u8 magic_mac[6]; + u8 rsvd2[2]; +} __packed; + +struct be_cmd_req_acpi_wol_magic_config_v1 { + struct be_cmd_req_hdr hdr; + u8 rsvd0[2]; + u8 query_options; + u8 rsvd1[5]; + u32 rsvd2[288]; + u8 magic_mac[6]; + u8 rsvd3[22]; +} __packed; + +struct be_cmd_resp_acpi_wol_magic_config_v1 { + struct be_cmd_resp_hdr hdr; + u8 rsvd0[2]; + u8 wol_settings; + u8 rsvd1[5]; + u32 rsvd2[288]; + u8 magic_mac[6]; + u8 rsvd3[22]; +} __packed; + +#define BE_GET_WOL_CAP 2 + +#define BE_WOL_CAP 0x1 +#define BE_PME_D0_CAP 0x8 +#define BE_PME_D1_CAP 0x10 +#define BE_PME_D2_CAP 0x20 +#define BE_PME_D3HOT_CAP 0x40 +#define BE_PME_D3COLD_CAP 0x80 + +/********************** LoopBack test *********************/ +#define SET_LB_MODE_TIMEOUT 12000 + +struct be_cmd_req_loopback_test { + struct be_cmd_req_hdr hdr; + u32 loopback_type; + u32 num_pkts; + u64 pattern; + u32 src_port; + u32 dest_port; + u32 pkt_size; +}; + +struct be_cmd_resp_loopback_test { + struct be_cmd_resp_hdr resp_hdr; + u32 status; + u32 num_txfer; + u32 num_rx; + u32 miscomp_off; + u32 ticks_compl; +}; + +struct be_cmd_req_set_lmode { + struct be_cmd_req_hdr hdr; + u8 src_port; + u8 dest_port; + u8 loopback_type; + u8 loopback_state; +}; + +/********************** DDR DMA test *********************/ +struct be_cmd_req_ddrdma_test { + struct be_cmd_req_hdr hdr; + u64 pattern; + u32 byte_count; + u32 rsvd0; + u8 snd_buff[4096]; + u8 rsvd1[4096]; +}; + +struct be_cmd_resp_ddrdma_test { + struct be_cmd_resp_hdr hdr; + u64 pattern; + u32 byte_cnt; + u32 snd_err; + u8 rsvd0[4096]; + u8 rcv_buff[4096]; +}; + +/*********************** SEEPROM Read ***********************/ + +#define BE_READ_SEEPROM_LEN 1024 +struct be_cmd_req_seeprom_read { + struct be_cmd_req_hdr hdr; + u8 rsvd0[BE_READ_SEEPROM_LEN]; +}; + +struct be_cmd_resp_seeprom_read { + struct be_cmd_req_hdr hdr; + u8 seeprom_data[BE_READ_SEEPROM_LEN]; +}; + +enum { + PHY_TYPE_CX4_10GB = 0, + PHY_TYPE_XFP_10GB, + PHY_TYPE_SFP_1GB, + PHY_TYPE_SFP_PLUS_10GB, + PHY_TYPE_KR_10GB, + PHY_TYPE_KX4_10GB, + PHY_TYPE_BASET_10GB, + PHY_TYPE_BASET_1GB, + PHY_TYPE_BASEX_1GB, + PHY_TYPE_SGMII, + PHY_TYPE_QSFP, + PHY_TYPE_KR4_40GB, + PHY_TYPE_KR2_20GB, + PHY_TYPE_TN_8022, + PHY_TYPE_DISABLED = 255 +}; + +#define BE_SUPPORTED_SPEED_NONE 0 +#define BE_SUPPORTED_SPEED_10MBPS 1 +#define BE_SUPPORTED_SPEED_100MBPS 2 +#define BE_SUPPORTED_SPEED_1GBPS 4 +#define BE_SUPPORTED_SPEED_10GBPS 8 +#define BE_SUPPORTED_SPEED_20GBPS 0x10 +#define BE_SUPPORTED_SPEED_40GBPS 0x20 + +#define BE_AN_EN 0x2 +#define BE_PAUSE_SYM_EN 0x80 + +/* MAC speed valid values */ +#define SPEED_DEFAULT 0x0 +#define SPEED_FORCED_10GB 0x1 +#define SPEED_FORCED_1GB 0x2 +#define SPEED_AUTONEG_10GB 0x3 +#define SPEED_AUTONEG_1GB 0x4 +#define SPEED_AUTONEG_100MB 0x5 +#define SPEED_AUTONEG_10GB_1GB 0x6 +#define SPEED_AUTONEG_10GB_1GB_100MB 0x7 +#define SPEED_AUTONEG_1GB_100MB 0x8 +#define SPEED_AUTONEG_10MB 0x9 +#define SPEED_AUTONEG_1GB_100MB_10MB 0xa +#define SPEED_AUTONEG_100MB_10MB 0xb +#define SPEED_FORCED_100MB 0xc +#define SPEED_FORCED_10MB 0xd + +struct be_cmd_req_get_phy_info { + struct be_cmd_req_hdr hdr; + u8 rsvd0[24]; +}; + +struct be_phy_info { + u16 phy_type; + u16 interface_type; + u32 misc_params; + u16 ext_phy_details; + u16 rsvd; + u16 auto_speeds_supported; + u16 fixed_speeds_supported; + u32 future_use[2]; 
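A minimal sketch of folding the BE_SUPPORTED_SPEED_* masks reported in be_phy_info (auto_speeds_supported and fixed_speeds_supported) down to a single top speed in Mb/s; the helper name is illustrative and not part of the driver:

static u32 be_top_speed_mbps(u16 speed_mask)
{
	/* highest advertised speed wins; bit values come from the defines above */
	if (speed_mask & BE_SUPPORTED_SPEED_40GBPS)
		return 40000;
	if (speed_mask & BE_SUPPORTED_SPEED_20GBPS)
		return 20000;
	if (speed_mask & BE_SUPPORTED_SPEED_10GBPS)
		return 10000;
	if (speed_mask & BE_SUPPORTED_SPEED_1GBPS)
		return 1000;
	if (speed_mask & BE_SUPPORTED_SPEED_100MBPS)
		return 100;
	if (speed_mask & BE_SUPPORTED_SPEED_10MBPS)
		return 10;
	return 0;	/* BE_SUPPORTED_SPEED_NONE: no link */
}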
+}; + +struct be_cmd_resp_get_phy_info { + struct be_cmd_req_hdr hdr; + struct be_phy_info phy_info; +}; + +/*********************** Set QOS ***********************/ + +#define BE_QOS_BITS_NIC 1 + +struct be_cmd_req_set_qos { + struct be_cmd_req_hdr hdr; + u32 valid_bits; + u32 max_bps_nic; + u32 rsvd[7]; +}; + +/*********************** Controller Attributes ***********************/ +struct mgmt_hba_attribs { + u32 rsvd0[24]; + u8 controller_model_number[32]; + u32 rsvd1[16]; + u32 controller_serial_number[8]; + u32 rsvd2[55]; + u8 rsvd3[3]; + u8 phy_port; + u32 rsvd4[15]; + u8 rsvd5[2]; + u8 pci_funcnum; + u8 rsvd6; + u32 rsvd7[6]; +} __packed; + +struct mgmt_controller_attrib { + struct mgmt_hba_attribs hba_attribs; + u32 rsvd0[10]; +} __packed; + +struct be_cmd_req_cntl_attribs { + struct be_cmd_req_hdr hdr; +}; + +struct be_cmd_resp_cntl_attribs { + struct be_cmd_resp_hdr hdr; + struct mgmt_controller_attrib attribs; +}; + +/*********************** Set driver function ***********************/ +#define CAPABILITY_SW_TIMESTAMPS 2 +#define CAPABILITY_BE3_NATIVE_ERX_API 4 + +struct be_cmd_req_set_func_cap { + struct be_cmd_req_hdr hdr; + u32 valid_cap_flags; + u32 cap_flags; + u8 rsvd[212]; +}; + +struct be_cmd_resp_set_func_cap { + struct be_cmd_resp_hdr hdr; + u32 valid_cap_flags; + u32 cap_flags; + u8 rsvd[212]; +}; + +/*********************** Function Privileges ***********************/ +enum { + BE_PRIV_DEFAULT = 0x1, + BE_PRIV_LNKQUERY = 0x2, + BE_PRIV_LNKSTATS = 0x4, + BE_PRIV_LNKMGMT = 0x8, + BE_PRIV_LNKDIAG = 0x10, + BE_PRIV_UTILQUERY = 0x20, + BE_PRIV_FILTMGMT = 0x40, + BE_PRIV_IFACEMGMT = 0x80, + BE_PRIV_VHADM = 0x100, + BE_PRIV_DEVCFG = 0x200, + BE_PRIV_DEVSEC = 0x400 +}; +#define MAX_PRIVILEGES (BE_PRIV_VHADM | BE_PRIV_DEVCFG | \ + BE_PRIV_DEVSEC) +#define MIN_PRIVILEGES BE_PRIV_DEFAULT + +struct be_cmd_priv_map { + u8 opcode; + u8 subsystem; + u32 priv_mask; +}; + +struct be_cmd_req_get_fn_privileges { + struct be_cmd_req_hdr hdr; + u32 rsvd; +}; + +struct be_cmd_resp_get_fn_privileges { + struct be_cmd_resp_hdr hdr; + u32 privilege_mask; +}; + +struct be_cmd_req_set_fn_privileges { + struct be_cmd_req_hdr hdr; + u32 privileges; /* Used by BE3, SH-R */ + u32 privileges_lancer; /* Used by Lancer */ +}; + +/******************** GET/SET_MACLIST **************************/ +#define BE_MAX_MAC 64 +struct be_cmd_req_get_mac_list { + struct be_cmd_req_hdr hdr; + u8 mac_type; + u8 perm_override; + u16 iface_id; + u32 mac_id; + u32 rsvd[3]; +} __packed; + +struct get_list_macaddr { + u16 mac_addr_size; + union { + u8 macaddr[6]; + struct { + u8 rsvd[2]; + u32 mac_id; + } __packed s_mac_id; + } __packed mac_addr_id; +} __packed; + +struct be_cmd_resp_get_mac_list { + struct be_cmd_resp_hdr hdr; + struct get_list_macaddr fd_macaddr; /* Factory default mac */ + struct get_list_macaddr macid_macaddr; /* soft mac */ + u8 true_mac_count; + u8 pseudo_mac_count; + u8 mac_list_size; + u8 rsvd; + /* perm override mac */ + struct get_list_macaddr macaddr_list[BE_MAX_MAC]; +} __packed; + +struct be_cmd_req_set_mac_list { + struct be_cmd_req_hdr hdr; + u8 mac_count; + u8 rsvd1; + u16 rsvd2; + struct macaddr mac[BE_MAX_MAC]; +} __packed; + +/*********************** HSW Config ***********************/ +#define PORT_FWD_TYPE_VEPA 0x3 +#define PORT_FWD_TYPE_VEB 0x2 +#define PORT_FWD_TYPE_PASSTHRU 0x1 + +#define ENABLE_MAC_SPOOFCHK 0x2 +#define DISABLE_MAC_SPOOFCHK 0x3 + +struct amap_set_hsw_context { + u8 interface_id[16]; + u8 rsvd0[8]; + u8 mac_spoofchk[2]; + u8 rsvd1[4]; + u8 pvid_valid; + u8 
pport; + u8 rsvd2[6]; + u8 port_fwd_type[3]; + u8 rsvd3[5]; + u8 vlan_spoofchk[2]; + u8 pvid[16]; + u8 rsvd4[32]; + u8 rsvd5[32]; + u8 rsvd6[32]; +} __packed; + +struct be_cmd_req_set_hsw_config { + struct be_cmd_req_hdr hdr; + u8 context[sizeof(struct amap_set_hsw_context) / 8]; +} __packed; + +struct amap_get_hsw_req_context { + u8 interface_id[16]; + u8 rsvd0[14]; + u8 pvid_valid; + u8 pport; +} __packed; + +struct amap_get_hsw_resp_context { + u8 rsvd0[6]; + u8 port_fwd_type[3]; + u8 rsvd1[5]; + u8 spoofchk; + u8 rsvd2; + u8 pvid[16]; + u8 rsvd3[32]; + u8 rsvd4[32]; + u8 rsvd5[32]; +} __packed; + +struct be_cmd_req_get_hsw_config { + struct be_cmd_req_hdr hdr; + u8 context[sizeof(struct amap_get_hsw_req_context) / 8]; +} __packed; + +struct be_cmd_resp_get_hsw_config { + struct be_cmd_resp_hdr hdr; + u8 context[sizeof(struct amap_get_hsw_resp_context) / 8]; + u32 rsvd; +}; + +/******************* get port names ***************/ +struct be_cmd_req_get_port_name { + struct be_cmd_req_hdr hdr; + u32 rsvd0; +}; + +struct be_cmd_resp_get_port_name { + struct be_cmd_req_hdr hdr; + u8 port_name[4]; +}; + +/*************** HW Stats Get v1 **********************************/ +#define BE_TXP_SW_SZ 48 +struct be_port_rxf_stats_v1 { + u32 rsvd0[12]; + u32 rx_crc_errors; + u32 rx_alignment_symbol_errors; + u32 rx_pause_frames; + u32 rx_priority_pause_frames; + u32 rx_control_frames; + u32 rx_in_range_errors; + u32 rx_out_range_errors; + u32 rx_frame_too_long; + u32 rx_address_filtered; + u32 rx_dropped_too_small; + u32 rx_dropped_too_short; + u32 rx_dropped_header_too_small; + u32 rx_dropped_tcp_length; + u32 rx_dropped_runt; + u32 rsvd1[10]; + u32 rx_ip_checksum_errs; + u32 rx_tcp_checksum_errs; + u32 rx_udp_checksum_errs; + u32 rsvd2[7]; + u32 rx_switched_unicast_packets; + u32 rx_switched_multicast_packets; + u32 rx_switched_broadcast_packets; + u32 rsvd3[3]; + u32 tx_pauseframes; + u32 tx_priority_pauseframes; + u32 tx_controlframes; + u32 rsvd4[10]; + u32 rxpp_fifo_overflow_drop; + u32 rx_input_fifo_overflow_drop; + u32 pmem_fifo_overflow_drop; + u32 jabber_events; + u32 rsvd5[3]; +}; + + +struct be_rxf_stats_v1 { + struct be_port_rxf_stats_v1 port[4]; + u32 rsvd0[2]; + u32 rx_drops_no_pbuf; + u32 rx_drops_no_txpb; + u32 rx_drops_no_erx_descr; + u32 rx_drops_no_tpre_descr; + u32 rsvd1[6]; + u32 rx_drops_too_many_frags; + u32 rx_drops_invalid_ring; + u32 forwarded_packets; + u32 rx_drops_mtu; + u32 rsvd2[14]; +}; + +struct be_erx_stats_v1 { + u32 rx_drops_no_fragments[68]; /* dwordS 0 to 67*/ + u32 rsvd[4]; +}; + +struct be_port_rxf_stats_v2 { + u32 rsvd0[10]; + u32 roce_bytes_received_lsd; + u32 roce_bytes_received_msd; + u32 rsvd1[5]; + u32 roce_frames_received; + u32 rx_crc_errors; + u32 rx_alignment_symbol_errors; + u32 rx_pause_frames; + u32 rx_priority_pause_frames; + u32 rx_control_frames; + u32 rx_in_range_errors; + u32 rx_out_range_errors; + u32 rx_frame_too_long; + u32 rx_address_filtered; + u32 rx_dropped_too_small; + u32 rx_dropped_too_short; + u32 rx_dropped_header_too_small; + u32 rx_dropped_tcp_length; + u32 rx_dropped_runt; + u32 rsvd2[10]; + u32 rx_ip_checksum_errs; + u32 rx_tcp_checksum_errs; + u32 rx_udp_checksum_errs; + u32 rsvd3[7]; + u32 rx_switched_unicast_packets; + u32 rx_switched_multicast_packets; + u32 rx_switched_broadcast_packets; + u32 rsvd4[3]; + u32 tx_pauseframes; + u32 tx_priority_pauseframes; + u32 tx_controlframes; + u32 rsvd5[10]; + u32 rxpp_fifo_overflow_drop; + u32 rx_input_fifo_overflow_drop; + u32 pmem_fifo_overflow_drop; + u32 jabber_events; + u32 
rsvd6[3]; + u32 rx_drops_payload_size; + u32 rx_drops_clipped_header; + u32 rx_drops_crc; + u32 roce_drops_payload_len; + u32 roce_drops_crc; + u32 rsvd7[19]; +}; + +struct be_rxf_stats_v2 { + struct be_port_rxf_stats_v2 port[4]; + u32 rsvd0[2]; + u32 rx_drops_no_pbuf; + u32 rx_drops_no_txpb; + u32 rx_drops_no_erx_descr; + u32 rx_drops_no_tpre_descr; + u32 rsvd1[6]; + u32 rx_drops_too_many_frags; + u32 rx_drops_invalid_ring; + u32 forwarded_packets; + u32 rx_drops_mtu; + u32 rsvd2[35]; +}; + +struct be_hw_stats_v1 { + struct be_rxf_stats_v1 rxf; + u32 rsvd0[BE_TXP_SW_SZ]; + struct be_erx_stats_v1 erx; + struct be_pmem_stats pmem; + u32 rsvd1[18]; +}; + +struct be_cmd_req_get_stats_v1 { + struct be_cmd_req_hdr hdr; + u8 rsvd[sizeof(struct be_hw_stats_v1)]; +}; + +struct be_cmd_resp_get_stats_v1 { + struct be_cmd_resp_hdr hdr; + struct be_hw_stats_v1 hw_stats; +}; + +struct be_erx_stats_v2 { + u32 rx_drops_no_fragments[136]; /* dwordS 0 to 135*/ + u32 rsvd[3]; +}; + +struct be_hw_stats_v2 { + struct be_rxf_stats_v2 rxf; + u32 rsvd0[BE_TXP_SW_SZ]; + struct be_erx_stats_v2 erx; + struct be_pmem_stats pmem; + u32 rsvd1[18]; +}; + +struct be_cmd_req_get_stats_v2 { + struct be_cmd_req_hdr hdr; + u8 rsvd[sizeof(struct be_hw_stats_v2)]; +}; + +struct be_cmd_resp_get_stats_v2 { + struct be_cmd_resp_hdr hdr; + struct be_hw_stats_v2 hw_stats; +}; + +/************** get fat capabilites *******************/ +#define MAX_MODULES 27 +#define MAX_MODES 4 +#define MODE_UART 0 +#define FW_LOG_LEVEL_DEFAULT 48 +#define FW_LOG_LEVEL_FATAL 64 + +struct ext_fat_mode { + u8 mode; + u8 rsvd0; + u16 port_mask; + u32 dbg_lvl; + u64 fun_mask; +} __packed; + +struct ext_fat_modules { + u8 modules_str[32]; + u32 modules_id; + u32 num_modes; + struct ext_fat_mode trace_lvl[MAX_MODES]; +} __packed; + +struct be_fat_conf_params { + u32 max_log_entries; + u32 log_entry_size; + u8 log_type; + u8 max_log_funs; + u8 max_log_ports; + u8 rsvd0; + u32 supp_modes; + u32 num_modules; + struct ext_fat_modules module[MAX_MODULES]; +} __packed; + +struct be_cmd_req_get_ext_fat_caps { + struct be_cmd_req_hdr hdr; + u32 parameter_type; +}; + +struct be_cmd_resp_get_ext_fat_caps { + struct be_cmd_resp_hdr hdr; + struct be_fat_conf_params get_params; +}; + +struct be_cmd_req_set_ext_fat_caps { + struct be_cmd_req_hdr hdr; + struct be_fat_conf_params set_params; +}; + +#define RESOURCE_DESC_SIZE_V0 72 +#define RESOURCE_DESC_SIZE_V1 88 +#define PCIE_RESOURCE_DESC_TYPE_V0 0x40 +#define NIC_RESOURCE_DESC_TYPE_V0 0x41 +#define PCIE_RESOURCE_DESC_TYPE_V1 0x50 +#define NIC_RESOURCE_DESC_TYPE_V1 0x51 +#define PORT_RESOURCE_DESC_TYPE_V1 0x55 +#define MAX_RESOURCE_DESC 264 + +#define IF_CAPS_FLAGS_VALID_SHIFT 0 /* IF caps valid */ +#define VFT_SHIFT 3 /* VF template */ +#define IMM_SHIFT 6 /* Immediate */ +#define NOSV_SHIFT 7 /* No save */ + +#define MISSION_NIC 1 +#define MISSION_RDMA 8 + +struct be_res_desc_hdr { + u8 desc_type; + u8 desc_len; +} __packed; + +struct be_port_res_desc { + struct be_res_desc_hdr hdr; + u8 rsvd0; + u8 flags; + u8 link_num; + u8 mc_type; + u16 rsvd1; + +#define NV_TYPE_MASK 0x3 /* bits 0-1 */ +#define NV_TYPE_DISABLED 1 +#define NV_TYPE_VXLAN 3 +#define SOCVID_SHIFT 2 /* Strip outer vlan */ +#define RCVID_SHIFT 4 /* Report vlan */ +#define PF_NUM_IGNORE 255 + u8 nv_flags; + u8 rsvd2; + __le16 nv_port; /* vxlan/gre port */ + u32 rsvd3[19]; +} __packed; + +struct be_pcie_res_desc { + struct be_res_desc_hdr hdr; + u8 rsvd0; + u8 flags; + u16 rsvd1; + u8 pf_num; + u8 rsvd2; + u32 rsvd3; + u8 sriov_state; + u8 pf_state; 
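The func_param buffers in the GET_FUNC_CONFIG / GET_PROFILE_CONFIG responses further below pack several of these descriptors back to back, each starting with a be_res_desc_hdr. A rough sketch of scanning that sequence, assuming desc_len gives the descriptor length in bytes (the helper name is illustrative; the driver's own parsing may differ):

static struct be_res_desc_hdr *be_find_desc(u8 *buf, u32 desc_count, u8 type)
{
	u32 i;

	for (i = 0; i < desc_count; i++) {
		struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf;

		if (hdr->desc_type == type)
			return hdr;
		if (!hdr->desc_len)	/* malformed entry: stop scanning */
			break;
		buf += hdr->desc_len;	/* assumption: length is in bytes */
	}
	return NULL;
}

/* e.g. be_find_desc(resp->func_param, desc_count, NIC_RESOURCE_DESC_TYPE_V1); */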
+ u8 pf_type; + u8 rsvd4; + u16 num_vfs; + u16 rsvd5; + u32 rsvd6[17]; +} __packed; + +struct be_nic_res_desc { + struct be_res_desc_hdr hdr; + u8 rsvd1; + +#define QUN_SHIFT 4 /* QoS is in absolute units */ + u8 flags; + u8 vf_num; + u8 rsvd2; + u8 pf_num; + u8 rsvd3; + u16 unicast_mac_count; + u8 rsvd4[6]; + u16 mcc_count; + u16 vlan_count; + u16 mcast_mac_count; + u16 txq_count; + u16 rq_count; + u16 rssq_count; + u16 lro_count; + u16 cq_count; + u16 toe_conn_count; + u16 eq_count; + u16 vlan_id; + u16 iface_count; + u32 cap_flags; + u8 link_param; + u8 rsvd6; + u16 channel_id_param; + u32 bw_min; + u32 bw_max; + u8 acpi_params; + u8 wol_param; + u16 rsvd7; + u16 tunnel_iface_count; + u16 direct_tenant_iface_count; + u32 rsvd8[6]; +} __packed; + +/************ Multi-Channel type ***********/ +enum mc_type { + MC_NONE = 0x01, + UMC = 0x02, + FLEX10 = 0x03, + vNIC1 = 0x04, + nPAR = 0x05, + UFP = 0x06, + vNIC2 = 0x07 +}; + +/* Is BE in a multi-channel mode */ +static inline bool be_is_mc(struct be_adapter *adapter) +{ + return adapter->mc_type > MC_NONE; +} + +struct be_cmd_req_get_func_config { + struct be_cmd_req_hdr hdr; +}; + +struct be_cmd_resp_get_func_config { + struct be_cmd_resp_hdr hdr; + u32 desc_count; + u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1]; +}; + +enum { + RESOURCE_LIMITS, + RESOURCE_MODIFIABLE +}; + +struct be_cmd_req_get_profile_config { + struct be_cmd_req_hdr hdr; + u8 rsvd; +#define ACTIVE_PROFILE_TYPE 0x2 +#define SAVED_PROFILE_TYPE 0x0 +#define QUERY_MODIFIABLE_FIELDS_TYPE BIT(3) + u8 type; + u16 rsvd1; +}; + +struct be_cmd_resp_get_profile_config { + struct be_cmd_resp_hdr hdr; + __le16 desc_count; + u16 rsvd; + u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1]; +}; + +#define FIELD_MODIFIABLE 0xFFFF +struct be_cmd_req_set_profile_config { + struct be_cmd_req_hdr hdr; + u32 rsvd; + u32 desc_count; + u8 desc[2 * RESOURCE_DESC_SIZE_V1]; +} __packed; + +struct be_cmd_req_get_active_profile { + struct be_cmd_req_hdr hdr; + u32 rsvd; +} __packed; + +struct be_cmd_resp_get_active_profile { + struct be_cmd_resp_hdr hdr; + u16 active_profile_id; + u16 next_profile_id; +} __packed; + +struct be_cmd_enable_disable_vf { + struct be_cmd_req_hdr hdr; + u8 enable; + u8 rsvd[3]; +}; + +struct be_cmd_req_intr_set { + struct be_cmd_req_hdr hdr; + u8 intr_enabled; + u8 rsvd[3]; +}; + +static inline bool check_privilege(struct be_adapter *adapter, u32 flags) +{ + return flags & adapter->cmd_privileges ? 
true : false; +} + +/************** Get IFACE LIST *******************/ +struct be_if_desc { + u32 if_id; + u32 cap_flags; + u32 en_flags; +}; + +struct be_cmd_req_get_iface_list { + struct be_cmd_req_hdr hdr; +}; + +struct be_cmd_resp_get_iface_list { + struct be_cmd_req_hdr hdr; + u32 if_cnt; + struct be_if_desc if_desc; +}; + +/************** Set Features *******************/ +#define BE_FEATURE_UE_RECOVERY 0x10 +#define BE_UE_RECOVERY_UER_MASK 0x1 + +struct be_req_ue_recovery { + u32 uer; + u32 rsvd; +}; + +struct be_cmd_req_set_features { + struct be_cmd_req_hdr hdr; + u32 features; + u32 parameter_len; + union { + struct be_req_ue_recovery req; + u32 rsvd[2]; + } parameter; +}; + +struct be_resp_ue_recovery { + u32 uer; + u16 ue2rp; + u16 ue2sr; +}; + +struct be_cmd_resp_set_features { + struct be_cmd_resp_hdr hdr; + u32 features; + u32 parameter_len; + union { + struct be_resp_ue_recovery resp; + u32 rsvd[2]; + } parameter; +}; + +/*************** Set logical link ********************/ +#define PLINK_ENABLE BIT(0) +#define PLINK_TRACK BIT(8) +struct be_cmd_req_set_ll_link { + struct be_cmd_req_hdr hdr; + u32 link_config; /* Bit 0: UP_DOWN, Bit 9: PLINK */ +}; + +/************** Manage IFACE Filters *******************/ +#define OP_CONVERT_NORMAL_TO_TUNNEL 0 +#define OP_CONVERT_TUNNEL_TO_NORMAL 1 + +struct be_cmd_req_manage_iface_filters { + struct be_cmd_req_hdr hdr; + u8 op; + u8 rsvd0; + u8 flags; + u8 rsvd1; + u32 tunnel_iface_id; + u32 target_iface_id; + u8 mac[6]; + u16 vlan_tag; + u32 tenant_id; + u32 filter_id; + u32 cap_flags; + u32 cap_control_flags; +} __packed; + +u16 be_POST_stage_get(struct be_adapter *adapter); +int be_pci_fnum_get(struct be_adapter *adapter); +int be_fw_wait_ready(struct be_adapter *adapter); +int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr, + bool permanent, u32 if_handle, u32 pmac_id); +int be_cmd_pmac_add(struct be_adapter *adapter, u8 *mac_addr, u32 if_id, + u32 *pmac_id, u32 domain); +int be_cmd_pmac_del(struct be_adapter *adapter, u32 if_id, int pmac_id, + u32 domain); +int be_cmd_if_create(struct be_adapter *adapter, u32 cap_flags, u32 en_flags, + u32 *if_handle, u32 domain); +int be_cmd_if_destroy(struct be_adapter *adapter, int if_handle, u32 domain); +int be_cmd_eq_create(struct be_adapter *adapter, struct be_eq_obj *eqo); +int be_cmd_cq_create(struct be_adapter *adapter, struct be_queue_info *cq, + struct be_queue_info *eq, bool no_delay, + int num_cqe_dma_coalesce); +int be_cmd_mccq_create(struct be_adapter *adapter, struct be_queue_info *mccq, + struct be_queue_info *cq); +int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo); +int be_cmd_rxq_create(struct be_adapter *adapter, struct be_queue_info *rxq, + u16 cq_id, u16 frag_size, u32 if_id, u32 rss, u8 *rss_id); +int be_cmd_q_destroy(struct be_adapter *adapter, struct be_queue_info *q, + int type); +int be_cmd_rxq_destroy(struct be_adapter *adapter, struct be_queue_info *q); +int be_cmd_link_status_query(struct be_adapter *adapter, u16 *link_speed, + u8 *link_status, u32 dom); +int be_cmd_reset(struct be_adapter *adapter); +int be_cmd_get_stats(struct be_adapter *adapter, struct be_dma_mem *nonemb_cmd); +int lancer_cmd_get_pport_stats(struct be_adapter *adapter, + struct be_dma_mem *nonemb_cmd); +int be_cmd_get_fw_ver(struct be_adapter *adapter); +int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *, int num); +int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array, + u32 num, u32 domain); +int 
be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 status); +int be_cmd_set_flow_control(struct be_adapter *adapter, u32 tx_fc, u32 rx_fc); +int be_cmd_get_flow_control(struct be_adapter *adapter, u32 *tx_fc, u32 *rx_fc); +int be_cmd_query_fw_cfg(struct be_adapter *adapter); +int be_cmd_reset_function(struct be_adapter *adapter); +int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, + u32 rss_hash_opts, u16 table_size, const u8 *rss_hkey); +int be_process_mcc(struct be_adapter *adapter); +int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num, u8 beacon, + u8 status, u8 state); +int be_cmd_get_beacon_state(struct be_adapter *adapter, u8 port_num, + u32 *state); +int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, + u8 page_num, u8 *data); +int be_cmd_query_cable_type(struct be_adapter *adapter); +int be_cmd_query_sfp_info(struct be_adapter *adapter); +int lancer_cmd_read_object(struct be_adapter *adapter, struct be_dma_mem *cmd, + u32 data_size, u32 data_offset, const char *obj_name, + u32 *data_read, u32 *eof, u8 *addn_status); +int lancer_fw_download(struct be_adapter *adapter, const struct firmware *fw); +int be_fw_download(struct be_adapter *adapter, const struct firmware *fw); +int be_cmd_enable_magic_wol(struct be_adapter *adapter, u8 *mac, + struct be_dma_mem *nonemb_cmd); +int be_cmd_fw_init(struct be_adapter *adapter); +int be_cmd_fw_clean(struct be_adapter *adapter); +void be_async_mcc_enable(struct be_adapter *adapter); +void be_async_mcc_disable(struct be_adapter *adapter); +int be_cmd_loopback_test(struct be_adapter *adapter, u32 port_num, + u32 loopback_type, u32 pkt_size, u32 num_pkts, + u64 pattern); +int be_cmd_ddr_dma_test(struct be_adapter *adapter, u64 pattern, u32 byte_cnt, + struct be_dma_mem *cmd); +int be_cmd_get_seeprom_data(struct be_adapter *adapter, + struct be_dma_mem *nonemb_cmd); +int be_cmd_set_loopback(struct be_adapter *adapter, u8 port_num, + u8 loopback_type, u8 enable); +int be_cmd_get_phy_info(struct be_adapter *adapter); +int be_cmd_config_qos(struct be_adapter *adapter, u32 max_rate, + u16 link_speed, u8 domain); +void be_detect_error(struct be_adapter *adapter); +int be_cmd_get_die_temperature(struct be_adapter *adapter); +int be_cmd_get_cntl_attributes(struct be_adapter *adapter); +int be_cmd_get_fat_dump_len(struct be_adapter *adapter, u32 *dump_size); +int be_cmd_get_fat_dump(struct be_adapter *adapter, u32 buf_len, void *buf); +int be_cmd_req_native_mode(struct be_adapter *adapter); +int be_cmd_get_fn_privileges(struct be_adapter *adapter, u32 *privilege, + u32 domain); +int be_cmd_set_fn_privileges(struct be_adapter *adapter, u32 privileges, + u32 vf_num); +int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac, + bool *pmac_id_active, u32 *pmac_id, + u32 if_handle, u8 domain); +int be_cmd_get_active_mac(struct be_adapter *adapter, u32 pmac_id, u8 *mac, + u32 if_handle, bool active, u32 domain); +int be_cmd_get_perm_mac(struct be_adapter *adapter, u8 *mac); +int be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array, u8 mac_count, + u32 domain); +int be_cmd_set_mac(struct be_adapter *adapter, u8 *mac, int if_id, u32 dom); +int be_cmd_set_hsw_config(struct be_adapter *adapter, u16 pvid, u32 domain, + u16 intf_id, u16 hsw_mode, u8 spoofchk); +int be_cmd_get_hsw_config(struct be_adapter *adapter, u16 *pvid, u32 domain, + u16 intf_id, u8 *mode, bool *spoofchk); +int be_cmd_get_acpi_wol_cap(struct be_adapter *adapter); +int be_cmd_set_fw_log_level(struct be_adapter *adapter, u32 level); 
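Most of these wrappers share one calling pattern: build and post a mailbox/MCC command, then return 0 on success or a status code. A minimal sketch of a caller using the flow-control pair declared above (the function name and the hard-coded enable values are illustrative only):

static int be_example_toggle_pause(struct be_adapter *adapter)
{
	u32 tx_fc, rx_fc;
	int status;

	/* enable pause in both directions, then read the setting back */
	status = be_cmd_set_flow_control(adapter, 1, 1);
	if (status)
		return status;

	return be_cmd_get_flow_control(adapter, &tx_fc, &rx_fc);
}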
+int be_cmd_get_fw_log_level(struct be_adapter *adapter); +int be_cmd_get_ext_fat_capabilites(struct be_adapter *adapter, + struct be_dma_mem *cmd); +int be_cmd_set_ext_fat_capabilites(struct be_adapter *adapter, + struct be_dma_mem *cmd, + struct be_fat_conf_params *cfgs); +int lancer_physdev_ctrl(struct be_adapter *adapter, u32 mask); +int lancer_initiate_dump(struct be_adapter *adapter); +int lancer_delete_dump(struct be_adapter *adapter); +bool dump_present(struct be_adapter *adapter); +int lancer_test_and_set_rdy_state(struct be_adapter *adapter); +int be_cmd_query_port_name(struct be_adapter *adapter); +int be_cmd_get_func_config(struct be_adapter *adapter, + struct be_resources *res); +int be_cmd_get_profile_config(struct be_adapter *adapter, + struct be_resources *res, + struct be_port_resources *port_res, + u8 profile_type, u8 query, u8 domain); +int be_cmd_get_active_profile(struct be_adapter *adapter, u16 *profile); +int be_cmd_get_if_id(struct be_adapter *adapter, struct be_vf_cfg *vf_cfg, + int vf_num); +int be_cmd_enable_vf(struct be_adapter *adapter, u8 domain); +int be_cmd_intr_set(struct be_adapter *adapter, bool intr_enable); +int be_cmd_set_logical_link_config(struct be_adapter *adapter, + int link_state, u8 domain); +int be_cmd_set_vxlan_port(struct be_adapter *adapter, __be16 port); +int be_cmd_manage_iface(struct be_adapter *adapter, u32 iface, u8 op); +int be_cmd_set_sriov_config(struct be_adapter *adapter, + struct be_resources res, u16 num_vfs, + struct be_resources *vft_res); +int be_cmd_set_features(struct be_adapter *adapter); diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c new file mode 100644 index 0000000..7f7e206 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -0,0 +1,1433 @@ +/* + * Copyright (C) 2005 - 2016 Broadcom + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include "be.h" +#include "be_cmds.h" +#include + +struct be_ethtool_stat { + char desc[ETH_GSTRING_LEN]; + int type; + int size; + int offset; +}; + +enum {DRVSTAT_TX, DRVSTAT_RX, DRVSTAT}; +#define FIELDINFO(_struct, field) FIELD_SIZEOF(_struct, field), \ + offsetof(_struct, field) +#define DRVSTAT_TX_INFO(field) #field, DRVSTAT_TX,\ + FIELDINFO(struct be_tx_stats, field) +#define DRVSTAT_RX_INFO(field) #field, DRVSTAT_RX,\ + FIELDINFO(struct be_rx_stats, field) +#define DRVSTAT_INFO(field) #field, DRVSTAT,\ + FIELDINFO(struct be_drv_stats, field) + +static const struct be_ethtool_stat et_stats[] = { + {DRVSTAT_INFO(rx_crc_errors)}, + {DRVSTAT_INFO(rx_alignment_symbol_errors)}, + {DRVSTAT_INFO(rx_pause_frames)}, + {DRVSTAT_INFO(rx_control_frames)}, + /* Received packets dropped when the Ethernet length field + * is not equal to the actual Ethernet data length. + */ + {DRVSTAT_INFO(rx_in_range_errors)}, + /* Received packets dropped when their length field is >= 1501 bytes + * and <= 1535 bytes. 
+ */ + {DRVSTAT_INFO(rx_out_range_errors)}, + /* Received packets dropped when they are longer than 9216 bytes */ + {DRVSTAT_INFO(rx_frame_too_long)}, + /* Received packets dropped when they don't pass the unicast or + * multicast address filtering. + */ + {DRVSTAT_INFO(rx_address_filtered)}, + /* Received packets dropped when IP packet length field is less than + * the IP header length field. + */ + {DRVSTAT_INFO(rx_dropped_too_small)}, + /* Received packets dropped when IP length field is greater than + * the actual packet length. + */ + {DRVSTAT_INFO(rx_dropped_too_short)}, + /* Received packets dropped when the IP header length field is less + * than 5. + */ + {DRVSTAT_INFO(rx_dropped_header_too_small)}, + /* Received packets dropped when the TCP header length field is less + * than 5 or the TCP header length + IP header length is more + * than IP packet length. + */ + {DRVSTAT_INFO(rx_dropped_tcp_length)}, + {DRVSTAT_INFO(rx_dropped_runt)}, + /* Number of received packets dropped when a fifo for descriptors going + * into the packet demux block overflows. In normal operation, this + * fifo must never overflow. + */ + {DRVSTAT_INFO(rxpp_fifo_overflow_drop)}, + /* Received packets dropped when the RX block runs out of space in + * one of its input FIFOs. This could happen due a long burst of + * minimum-sized (64b) frames in the receive path. + * This counter may also be erroneously incremented rarely. + */ + {DRVSTAT_INFO(rx_input_fifo_overflow_drop)}, + {DRVSTAT_INFO(rx_ip_checksum_errs)}, + {DRVSTAT_INFO(rx_tcp_checksum_errs)}, + {DRVSTAT_INFO(rx_udp_checksum_errs)}, + {DRVSTAT_INFO(tx_pauseframes)}, + {DRVSTAT_INFO(tx_controlframes)}, + {DRVSTAT_INFO(rx_priority_pause_frames)}, + {DRVSTAT_INFO(tx_priority_pauseframes)}, + /* Received packets dropped when an internal fifo going into + * main packet buffer tank (PMEM) overflows. + */ + {DRVSTAT_INFO(pmem_fifo_overflow_drop)}, + {DRVSTAT_INFO(jabber_events)}, + /* Received packets dropped due to lack of available HW packet buffers + * used to temporarily hold the received packets. + */ + {DRVSTAT_INFO(rx_drops_no_pbuf)}, + /* Received packets dropped due to input receive buffer + * descriptor fifo overflowing. + */ + {DRVSTAT_INFO(rx_drops_no_erx_descr)}, + /* Packets dropped because the internal FIFO to the offloaded TCP + * receive processing block is full. This could happen only for + * offloaded iSCSI or FCoE trarffic. + */ + {DRVSTAT_INFO(rx_drops_no_tpre_descr)}, + /* Received packets dropped when they need more than 8 + * receive buffers. This cannot happen as the driver configures + * 2048 byte receive buffers. + */ + {DRVSTAT_INFO(rx_drops_too_many_frags)}, + {DRVSTAT_INFO(forwarded_packets)}, + /* Received packets dropped when the frame length + * is more than 9018 bytes + */ + {DRVSTAT_INFO(rx_drops_mtu)}, + /* Number of dma mapping errors */ + {DRVSTAT_INFO(dma_map_errors)}, + /* Number of packets dropped due to random early drop function */ + {DRVSTAT_INFO(eth_red_drops)}, + {DRVSTAT_INFO(rx_roce_bytes_lsd)}, + {DRVSTAT_INFO(rx_roce_bytes_msd)}, + {DRVSTAT_INFO(rx_roce_frames)}, + {DRVSTAT_INFO(roce_drops_payload_len)}, + {DRVSTAT_INFO(roce_drops_crc)} +}; + +#define ETHTOOL_STATS_NUM ARRAY_SIZE(et_stats) + +/* Stats related to multi RX queues: get_stats routine assumes bytes, pkts + * are first and second members respectively. 
+ */ +static const struct be_ethtool_stat et_rx_stats[] = { + {DRVSTAT_RX_INFO(rx_bytes)},/* If moving this member see above note */ + {DRVSTAT_RX_INFO(rx_pkts)}, /* If moving this member see above note */ + {DRVSTAT_RX_INFO(rx_vxlan_offload_pkts)}, + {DRVSTAT_RX_INFO(rx_compl)}, + {DRVSTAT_RX_INFO(rx_compl_err)}, + {DRVSTAT_RX_INFO(rx_mcast_pkts)}, + /* Number of page allocation failures while posting receive buffers + * to HW. + */ + {DRVSTAT_RX_INFO(rx_post_fail)}, + /* Recevied packets dropped due to skb allocation failure */ + {DRVSTAT_RX_INFO(rx_drops_no_skbs)}, + /* Received packets dropped due to lack of available fetched buffers + * posted by the driver. + */ + {DRVSTAT_RX_INFO(rx_drops_no_frags)} +}; + +#define ETHTOOL_RXSTATS_NUM (ARRAY_SIZE(et_rx_stats)) + +/* Stats related to multi TX queues: get_stats routine assumes compl is the + * first member + */ +static const struct be_ethtool_stat et_tx_stats[] = { + {DRVSTAT_TX_INFO(tx_compl)}, /* If moving this member see above note */ + /* This counter is incremented when the HW encounters an error while + * parsing the packet header of an outgoing TX request. This counter is + * applicable only for BE2, BE3 and Skyhawk based adapters. + */ + {DRVSTAT_TX_INFO(tx_hdr_parse_err)}, + /* This counter is incremented when an error occurs in the DMA + * operation associated with the TX request from the host to the device. + */ + {DRVSTAT_TX_INFO(tx_dma_err)}, + /* This counter is incremented when MAC or VLAN spoof checking is + * enabled on the interface and the TX request fails the spoof check + * in HW. + */ + {DRVSTAT_TX_INFO(tx_spoof_check_err)}, + /* This counter is incremented when the HW encounters an error while + * performing TSO offload. This counter is applicable only for Lancer + * adapters. + */ + {DRVSTAT_TX_INFO(tx_tso_err)}, + /* This counter is incremented when the HW detects Q-in-Q style VLAN + * tagging in a packet and such tagging is not expected on the outgoing + * interface. This counter is applicable only for Lancer adapters. + */ + {DRVSTAT_TX_INFO(tx_qinq_err)}, + /* This counter is incremented when the HW detects parity errors in the + * packet data. This counter is applicable only for Lancer adapters. + */ + {DRVSTAT_TX_INFO(tx_internal_parity_err)}, + {DRVSTAT_TX_INFO(tx_sge_err)}, + {DRVSTAT_TX_INFO(tx_bytes)}, + {DRVSTAT_TX_INFO(tx_pkts)}, + {DRVSTAT_TX_INFO(tx_vxlan_offload_pkts)}, + /* Number of skbs queued for trasmission by the driver */ + {DRVSTAT_TX_INFO(tx_reqs)}, + /* Number of times the TX queue was stopped due to lack + * of spaces in the TXQ. 
+ */ + {DRVSTAT_TX_INFO(tx_stops)}, + /* Pkts dropped in the driver's transmit path */ + {DRVSTAT_TX_INFO(tx_drv_drops)} +}; + +#define ETHTOOL_TXSTATS_NUM (ARRAY_SIZE(et_tx_stats)) + +static const char et_self_tests[][ETH_GSTRING_LEN] = { + "MAC Loopback test", + "PHY Loopback test", + "External Loopback test", + "DDR DMA test", + "Link test" +}; + +#define ETHTOOL_TESTS_NUM ARRAY_SIZE(et_self_tests) +#define BE_MAC_LOOPBACK 0x0 +#define BE_PHY_LOOPBACK 0x1 +#define BE_ONE_PORT_EXT_LOOPBACK 0x2 +#define BE_NO_LOOPBACK 0xff + +static void be_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, DRV_VER, sizeof(drvinfo->version)); + if (!memcmp(adapter->fw_ver, adapter->fw_on_flash, FW_VER_LEN)) + strlcpy(drvinfo->fw_version, adapter->fw_ver, + sizeof(drvinfo->fw_version)); + else + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%s [%s]", adapter->fw_ver, adapter->fw_on_flash); + + strlcpy(drvinfo->bus_info, pci_name(adapter->pdev), + sizeof(drvinfo->bus_info)); +} + +static u32 lancer_cmd_get_file_len(struct be_adapter *adapter, u8 *file_name) +{ + u32 data_read = 0, eof; + u8 addn_status; + struct be_dma_mem data_len_cmd; + + memset(&data_len_cmd, 0, sizeof(data_len_cmd)); + /* data_offset and data_size should be 0 to get reg len */ + lancer_cmd_read_object(adapter, &data_len_cmd, 0, 0, file_name, + &data_read, &eof, &addn_status); + + return data_read; +} + +static int be_get_dump_len(struct be_adapter *adapter) +{ + u32 dump_size = 0; + + if (lancer_chip(adapter)) + dump_size = lancer_cmd_get_file_len(adapter, + LANCER_FW_DUMP_FILE); + else + dump_size = adapter->fat_dump_len; + + return dump_size; +} + +static int lancer_cmd_read_file(struct be_adapter *adapter, u8 *file_name, + u32 buf_len, void *buf) +{ + struct be_dma_mem read_cmd; + u32 read_len = 0, total_read_len = 0, chunk_size; + u32 eof = 0; + u8 addn_status; + int status = 0; + + read_cmd.size = LANCER_READ_FILE_CHUNK; + read_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, read_cmd.size, + &read_cmd.dma, GFP_ATOMIC); + + if (!read_cmd.va) { + dev_err(&adapter->pdev->dev, + "Memory allocation failure while reading dump\n"); + return -ENOMEM; + } + + while ((total_read_len < buf_len) && !eof) { + chunk_size = min_t(u32, (buf_len - total_read_len), + LANCER_READ_FILE_CHUNK); + chunk_size = ALIGN(chunk_size, 4); + status = lancer_cmd_read_object(adapter, &read_cmd, chunk_size, + total_read_len, file_name, + &read_len, &eof, &addn_status); + if (!status) { + memcpy(buf + total_read_len, read_cmd.va, read_len); + total_read_len += read_len; + eof &= LANCER_READ_FILE_EOF_MASK; + } else { + status = -EIO; + break; + } + } + dma_free_coherent(&adapter->pdev->dev, read_cmd.size, read_cmd.va, + read_cmd.dma); + + return status; +} + +static int be_read_dump_data(struct be_adapter *adapter, u32 dump_len, + void *buf) +{ + int status = 0; + + if (lancer_chip(adapter)) + status = lancer_cmd_read_file(adapter, LANCER_FW_DUMP_FILE, + dump_len, buf); + else + status = be_cmd_get_fat_dump(adapter, dump_len, buf); + + return status; +} + +static int be_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_aic_obj *aic = &adapter->aic_obj[0]; + + et->rx_coalesce_usecs = aic->prev_eqd; + et->rx_coalesce_usecs_high = aic->max_eqd; + et->rx_coalesce_usecs_low = aic->min_eqd; + + 
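For reference on the stat-table macros near the top of this file: each {DRVSTAT_INFO(field)} style entry records the field's name string, which stats structure it lives in, and the size and byte offset later used to pull the raw counter out of that structure. Expanded by hand, and with an illustrative variable name, the first et_stats[] entry is roughly equivalent to:

static const struct be_ethtool_stat et_example_stat = {
	.desc	= "rx_crc_errors",
	.type	= DRVSTAT,
	.size	= FIELD_SIZEOF(struct be_drv_stats, rx_crc_errors),
	.offset	= offsetof(struct be_drv_stats, rx_crc_errors),
};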
et->tx_coalesce_usecs = aic->prev_eqd; + et->tx_coalesce_usecs_high = aic->max_eqd; + et->tx_coalesce_usecs_low = aic->min_eqd; + + et->use_adaptive_rx_coalesce = aic->enable; + et->use_adaptive_tx_coalesce = aic->enable; + + return 0; +} + +/* TX attributes are ignored. Only RX attributes are considered + * eqd cmd is issued in the worker thread. + */ +static int be_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *et) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_aic_obj *aic = &adapter->aic_obj[0]; + struct be_eq_obj *eqo; + int i; + + for_all_evt_queues(adapter, eqo, i) { + aic->enable = et->use_adaptive_rx_coalesce; + aic->max_eqd = min(et->rx_coalesce_usecs_high, BE_MAX_EQD); + aic->min_eqd = min(et->rx_coalesce_usecs_low, aic->max_eqd); + aic->et_eqd = min(et->rx_coalesce_usecs, aic->max_eqd); + aic->et_eqd = max(aic->et_eqd, aic->min_eqd); + aic++; + } + + /* For Skyhawk, the EQD setting happens via EQ_DB when AIC is enabled. + * When AIC is disabled, persistently force set EQD value via the + * FW cmd, so that we don't have to calculate the delay multiplier + * encode value each time EQ_DB is rung + */ + if (!et->use_adaptive_rx_coalesce && skyhawk_chip(adapter)) + be_eqd_update(adapter, true); + + return 0; +} + +static void be_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, uint64_t *data) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_rx_obj *rxo; + struct be_tx_obj *txo; + void *p; + unsigned int i, j, base = 0, start; + + for (i = 0; i < ETHTOOL_STATS_NUM; i++) { + p = (u8 *)&adapter->drv_stats + et_stats[i].offset; + data[i] = *(u32 *)p; + } + base += ETHTOOL_STATS_NUM; + + for_all_rx_queues(adapter, rxo, j) { + struct be_rx_stats *stats = rx_stats(rxo); + + do { + start = u64_stats_fetch_begin_irq(&stats->sync); + data[base] = stats->rx_bytes; + data[base + 1] = stats->rx_pkts; + } while (u64_stats_fetch_retry_irq(&stats->sync, start)); + + for (i = 2; i < ETHTOOL_RXSTATS_NUM; i++) { + p = (u8 *)stats + et_rx_stats[i].offset; + data[base + i] = *(u32 *)p; + } + base += ETHTOOL_RXSTATS_NUM; + } + + for_all_tx_queues(adapter, txo, j) { + struct be_tx_stats *stats = tx_stats(txo); + + do { + start = u64_stats_fetch_begin_irq(&stats->sync_compl); + data[base] = stats->tx_compl; + } while (u64_stats_fetch_retry_irq(&stats->sync_compl, start)); + + do { + start = u64_stats_fetch_begin_irq(&stats->sync); + for (i = 1; i < ETHTOOL_TXSTATS_NUM; i++) { + p = (u8 *)stats + et_tx_stats[i].offset; + data[base + i] = + (et_tx_stats[i].size == sizeof(u64)) ? 
+ *(u64 *)p : *(u32 *)p; + } + } while (u64_stats_fetch_retry_irq(&stats->sync, start)); + base += ETHTOOL_TXSTATS_NUM; + } +} + +static const char be_priv_flags[][ETH_GSTRING_LEN] = { + "disable-tpe-recovery" +}; + +static void be_get_stat_strings(struct net_device *netdev, uint32_t stringset, + uint8_t *data) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int i, j; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < ETHTOOL_STATS_NUM; i++) { + memcpy(data, et_stats[i].desc, ETH_GSTRING_LEN); + data += ETH_GSTRING_LEN; + } + for (i = 0; i < adapter->num_rx_qs; i++) { + for (j = 0; j < ETHTOOL_RXSTATS_NUM; j++) { + sprintf(data, "rxq%d: %s", i, + et_rx_stats[j].desc); + data += ETH_GSTRING_LEN; + } + } + for (i = 0; i < adapter->num_tx_qs; i++) { + for (j = 0; j < ETHTOOL_TXSTATS_NUM; j++) { + sprintf(data, "txq%d: %s", i, + et_tx_stats[j].desc); + data += ETH_GSTRING_LEN; + } + } + break; + case ETH_SS_TEST: + for (i = 0; i < ETHTOOL_TESTS_NUM; i++) { + memcpy(data, et_self_tests[i], ETH_GSTRING_LEN); + data += ETH_GSTRING_LEN; + } + break; + case ETH_SS_PRIV_FLAGS: + for (i = 0; i < ARRAY_SIZE(be_priv_flags); i++) + strcpy(data + i * ETH_GSTRING_LEN, be_priv_flags[i]); + break; + } +} + +static int be_get_sset_count(struct net_device *netdev, int stringset) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + switch (stringset) { + case ETH_SS_TEST: + return ETHTOOL_TESTS_NUM; + case ETH_SS_STATS: + return ETHTOOL_STATS_NUM + + adapter->num_rx_qs * ETHTOOL_RXSTATS_NUM + + adapter->num_tx_qs * ETHTOOL_TXSTATS_NUM; + case ETH_SS_PRIV_FLAGS: + return ARRAY_SIZE(be_priv_flags); + default: + return -EINVAL; + } +} + +static u32 be_get_port_type(struct be_adapter *adapter) +{ + u32 port; + + switch (adapter->phy.interface_type) { + case PHY_TYPE_BASET_1GB: + case PHY_TYPE_BASEX_1GB: + case PHY_TYPE_SGMII: + port = PORT_TP; + break; + case PHY_TYPE_SFP_PLUS_10GB: + if (adapter->phy.cable_type & SFP_PLUS_COPPER_CABLE) + port = PORT_DA; + else + port = PORT_FIBRE; + break; + case PHY_TYPE_QSFP: + if (adapter->phy.cable_type & QSFP_PLUS_CR4_CABLE) + port = PORT_DA; + else + port = PORT_FIBRE; + break; + case PHY_TYPE_XFP_10GB: + case PHY_TYPE_SFP_1GB: + port = PORT_FIBRE; + break; + case PHY_TYPE_BASET_10GB: + port = PORT_TP; + break; + default: + port = PORT_OTHER; + } + + return port; +} + +static u32 convert_to_et_setting(struct be_adapter *adapter, u32 if_speeds) +{ + u32 val = 0; + + switch (adapter->phy.interface_type) { + case PHY_TYPE_BASET_1GB: + case PHY_TYPE_BASEX_1GB: + case PHY_TYPE_SGMII: + val |= SUPPORTED_TP; + if (if_speeds & BE_SUPPORTED_SPEED_1GBPS) + val |= SUPPORTED_1000baseT_Full; + if (if_speeds & BE_SUPPORTED_SPEED_100MBPS) + val |= SUPPORTED_100baseT_Full; + if (if_speeds & BE_SUPPORTED_SPEED_10MBPS) + val |= SUPPORTED_10baseT_Full; + break; + case PHY_TYPE_KX4_10GB: + val |= SUPPORTED_Backplane; + if (if_speeds & BE_SUPPORTED_SPEED_1GBPS) + val |= SUPPORTED_1000baseKX_Full; + if (if_speeds & BE_SUPPORTED_SPEED_10GBPS) + val |= SUPPORTED_10000baseKX4_Full; + break; + case PHY_TYPE_KR2_20GB: + val |= SUPPORTED_Backplane; + if (if_speeds & BE_SUPPORTED_SPEED_10GBPS) + val |= SUPPORTED_10000baseKR_Full; + if (if_speeds & BE_SUPPORTED_SPEED_20GBPS) + val |= SUPPORTED_20000baseKR2_Full; + break; + case PHY_TYPE_KR_10GB: + val |= SUPPORTED_Backplane | + SUPPORTED_10000baseKR_Full; + break; + case PHY_TYPE_KR4_40GB: + val |= SUPPORTED_Backplane; + if (if_speeds & BE_SUPPORTED_SPEED_10GBPS) + val |= SUPPORTED_10000baseKR_Full; + if (if_speeds & 
BE_SUPPORTED_SPEED_40GBPS) + val |= SUPPORTED_40000baseKR4_Full; + break; + case PHY_TYPE_QSFP: + if (if_speeds & BE_SUPPORTED_SPEED_40GBPS) { + switch (adapter->phy.cable_type) { + case QSFP_PLUS_CR4_CABLE: + val |= SUPPORTED_40000baseCR4_Full; + break; + case QSFP_PLUS_LR4_CABLE: + val |= SUPPORTED_40000baseLR4_Full; + break; + default: + val |= SUPPORTED_40000baseSR4_Full; + break; + } + } + case PHY_TYPE_SFP_PLUS_10GB: + case PHY_TYPE_XFP_10GB: + case PHY_TYPE_SFP_1GB: + val |= SUPPORTED_FIBRE; + if (if_speeds & BE_SUPPORTED_SPEED_10GBPS) + val |= SUPPORTED_10000baseT_Full; + if (if_speeds & BE_SUPPORTED_SPEED_1GBPS) + val |= SUPPORTED_1000baseT_Full; + break; + case PHY_TYPE_BASET_10GB: + val |= SUPPORTED_TP; + if (if_speeds & BE_SUPPORTED_SPEED_10GBPS) + val |= SUPPORTED_10000baseT_Full; + if (if_speeds & BE_SUPPORTED_SPEED_1GBPS) + val |= SUPPORTED_1000baseT_Full; + if (if_speeds & BE_SUPPORTED_SPEED_100MBPS) + val |= SUPPORTED_100baseT_Full; + break; + default: + val |= SUPPORTED_TP; + } + + return val; +} + +bool be_pause_supported(struct be_adapter *adapter) +{ + return (adapter->phy.interface_type == PHY_TYPE_SFP_PLUS_10GB || + adapter->phy.interface_type == PHY_TYPE_XFP_10GB) ? + false : true; +} + +static int be_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) +{ + struct be_adapter *adapter = netdev_priv(netdev); + u8 link_status; + u16 link_speed = 0; + int status; + u32 auto_speeds; + u32 fixed_speeds; + u32 supported = 0, advertising = 0; + + if (adapter->phy.link_speed < 0) { + status = be_cmd_link_status_query(adapter, &link_speed, + &link_status, 0); + if (!status) + be_link_status_update(adapter, link_status); + cmd->base.speed = link_speed; + + status = be_cmd_get_phy_info(adapter); + if (!status) { + auto_speeds = adapter->phy.auto_speeds_supported; + fixed_speeds = adapter->phy.fixed_speeds_supported; + + be_cmd_query_cable_type(adapter); + + supported = + convert_to_et_setting(adapter, + auto_speeds | + fixed_speeds); + advertising = + convert_to_et_setting(adapter, auto_speeds); + + cmd->base.port = be_get_port_type(adapter); + + if (adapter->phy.auto_speeds_supported) { + supported |= SUPPORTED_Autoneg; + cmd->base.autoneg = AUTONEG_ENABLE; + advertising |= ADVERTISED_Autoneg; + } + + supported |= SUPPORTED_Pause; + if (be_pause_supported(adapter)) + advertising |= ADVERTISED_Pause; + } else { + cmd->base.port = PORT_OTHER; + cmd->base.autoneg = AUTONEG_DISABLE; + } + + /* Save for future use */ + adapter->phy.link_speed = cmd->base.speed; + adapter->phy.port_type = cmd->base.port; + adapter->phy.autoneg = cmd->base.autoneg; + adapter->phy.advertising = advertising; + adapter->phy.supported = supported; + } else { + cmd->base.speed = adapter->phy.link_speed; + cmd->base.port = adapter->phy.port_type; + cmd->base.autoneg = adapter->phy.autoneg; + advertising = adapter->phy.advertising; + supported = adapter->phy.supported; + } + + cmd->base.duplex = netif_carrier_ok(netdev) ? 
+ DUPLEX_FULL : DUPLEX_UNKNOWN; + cmd->base.phy_address = adapter->port_num; + + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, + supported); + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, + advertising); + + return 0; +} + +static void be_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + ring->rx_max_pending = adapter->rx_obj[0].q.len; + ring->rx_pending = adapter->rx_obj[0].q.len; + ring->tx_max_pending = adapter->tx_obj[0].q.len; + ring->tx_pending = adapter->tx_obj[0].q.len; +} + +static void +be_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *ecmd) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + be_cmd_get_flow_control(adapter, &ecmd->tx_pause, &ecmd->rx_pause); + ecmd->autoneg = adapter->phy.fc_autoneg; +} + +static int +be_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *ecmd) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + + if (ecmd->autoneg != adapter->phy.fc_autoneg) + return -EINVAL; + + status = be_cmd_set_flow_control(adapter, ecmd->tx_pause, + ecmd->rx_pause); + if (status) { + dev_warn(&adapter->pdev->dev, "Pause param set failed\n"); + return be_cmd_status(status); + } + + adapter->tx_fc = ecmd->tx_pause; + adapter->rx_fc = ecmd->rx_pause; + return 0; +} + +static int be_set_phys_id(struct net_device *netdev, + enum ethtool_phys_id_state state) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status = 0; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + status = be_cmd_get_beacon_state(adapter, adapter->hba_port_num, + &adapter->beacon_state); + if (status) + return be_cmd_status(status); + return 1; /* cycle on/off once per second */ + + case ETHTOOL_ID_ON: + status = be_cmd_set_beacon_state(adapter, adapter->hba_port_num, + 0, 0, BEACON_STATE_ENABLED); + break; + + case ETHTOOL_ID_OFF: + status = be_cmd_set_beacon_state(adapter, adapter->hba_port_num, + 0, 0, BEACON_STATE_DISABLED); + break; + + case ETHTOOL_ID_INACTIVE: + status = be_cmd_set_beacon_state(adapter, adapter->hba_port_num, + 0, 0, adapter->beacon_state); + } + + return be_cmd_status(status); +} + +static int be_set_dump(struct net_device *netdev, struct ethtool_dump *dump) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct device *dev = &adapter->pdev->dev; + int status; + + if (!lancer_chip(adapter) || + !check_privilege(adapter, MAX_PRIVILEGES)) + return -EOPNOTSUPP; + + switch (dump->flag) { + case LANCER_INITIATE_FW_DUMP: + status = lancer_initiate_dump(adapter); + if (!status) + dev_info(dev, "FW dump initiated successfully\n"); + break; + case LANCER_DELETE_FW_DUMP: + status = lancer_delete_dump(adapter); + if (!status) + dev_info(dev, "FW dump deleted successfully\n"); + break; + default: + dev_err(dev, "Invalid dump level: 0x%x\n", dump->flag); + return -EINVAL; + } + return status; +} + +static void be_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (adapter->wol_cap & BE_WOL_CAP) { + wol->supported |= WAKE_MAGIC; + if (adapter->wol_en) + wol->wolopts |= WAKE_MAGIC; + } else { + wol->wolopts = 0; + } + memset(&wol->sopass, 0, sizeof(wol->sopass)); +} + +static int be_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct device *dev = &adapter->pdev->dev; + struct be_dma_mem cmd; + u8 mac[ETH_ALEN]; + bool enable; + int status; 
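+ /* Only magic-packet wake is handled here: the FW magic-WOL filter is
+ * programmed with the current MAC (or a zeroed MAC when disabling) and
+ * PCI PME is armed for D3hot/D3cold to match the requested state.
+ */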
+ + if (wol->wolopts & ~WAKE_MAGIC) + return -EOPNOTSUPP; + + if (!(adapter->wol_cap & BE_WOL_CAP)) { + dev_warn(&adapter->pdev->dev, "WOL not supported\n"); + return -EOPNOTSUPP; + } + + cmd.size = sizeof(struct be_cmd_req_acpi_wol_magic_config); + cmd.va = dma_zalloc_coherent(dev, cmd.size, &cmd.dma, GFP_KERNEL); + if (!cmd.va) + return -ENOMEM; + + eth_zero_addr(mac); + + enable = wol->wolopts & WAKE_MAGIC; + if (enable) + ether_addr_copy(mac, adapter->netdev->dev_addr); + + status = be_cmd_enable_magic_wol(adapter, mac, &cmd); + if (status) { + dev_err(dev, "Could not set Wake-on-lan mac address\n"); + status = be_cmd_status(status); + goto err; + } + + pci_enable_wake(adapter->pdev, PCI_D3hot, enable); + pci_enable_wake(adapter->pdev, PCI_D3cold, enable); + + adapter->wol_en = enable ? true : false; + +err: + dma_free_coherent(dev, cmd.size, cmd.va, cmd.dma); + return status; +} + +static int be_test_ddr_dma(struct be_adapter *adapter) +{ + int ret, i; + struct be_dma_mem ddrdma_cmd; + static const u64 pattern[2] = { + 0x5a5a5a5a5a5a5a5aULL, 0xa5a5a5a5a5a5a5a5ULL + }; + + ddrdma_cmd.size = sizeof(struct be_cmd_req_ddrdma_test); + ddrdma_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, + ddrdma_cmd.size, &ddrdma_cmd.dma, + GFP_KERNEL); + if (!ddrdma_cmd.va) + return -ENOMEM; + + for (i = 0; i < 2; i++) { + ret = be_cmd_ddr_dma_test(adapter, pattern[i], + 4096, &ddrdma_cmd); + if (ret != 0) + goto err; + } + +err: + dma_free_coherent(&adapter->pdev->dev, ddrdma_cmd.size, ddrdma_cmd.va, + ddrdma_cmd.dma); + return be_cmd_status(ret); +} + +static u64 be_loopback_test(struct be_adapter *adapter, u8 loopback_type, + u64 *status) +{ + int ret; + + ret = be_cmd_set_loopback(adapter, adapter->hba_port_num, + loopback_type, 1); + if (ret) + return ret; + + *status = be_cmd_loopback_test(adapter, adapter->hba_port_num, + loopback_type, 1500, 2, 0xabc); + + ret = be_cmd_set_loopback(adapter, adapter->hba_port_num, + BE_NO_LOOPBACK, 1); + if (ret) + return ret; + + return *status; +} + +static void be_self_test(struct net_device *netdev, struct ethtool_test *test, + u64 *data) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + u8 link_status = 0; + + if (adapter->function_caps & BE_FUNCTION_CAPS_SUPER_NIC) { + dev_err(&adapter->pdev->dev, "Self test not supported\n"); + test->flags |= ETH_TEST_FL_FAILED; + return; + } + + memset(data, 0, sizeof(u64) * ETHTOOL_TESTS_NUM); + + if (test->flags & ETH_TEST_FL_OFFLINE) { + if (be_loopback_test(adapter, BE_MAC_LOOPBACK, &data[0]) != 0) + test->flags |= ETH_TEST_FL_FAILED; + + if (be_loopback_test(adapter, BE_PHY_LOOPBACK, &data[1]) != 0) + test->flags |= ETH_TEST_FL_FAILED; + + if (test->flags & ETH_TEST_FL_EXTERNAL_LB) { + if (be_loopback_test(adapter, BE_ONE_PORT_EXT_LOOPBACK, + &data[2]) != 0) + test->flags |= ETH_TEST_FL_FAILED; + test->flags |= ETH_TEST_FL_EXTERNAL_LB_DONE; + } + } + + if (!lancer_chip(adapter) && be_test_ddr_dma(adapter) != 0) { + data[3] = 1; + test->flags |= ETH_TEST_FL_FAILED; + } + + status = be_cmd_link_status_query(adapter, NULL, &link_status, 0); + if (status) { + test->flags |= ETH_TEST_FL_FAILED; + data[4] = -1; + } else if (!link_status) { + test->flags |= ETH_TEST_FL_FAILED; + data[4] = 1; + } +} + +static int be_do_flash(struct net_device *netdev, struct ethtool_flash *efl) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + return be_load_fw(adapter, efl->data); +} + +static int +be_get_dump_flag(struct net_device *netdev, struct ethtool_dump *dump) +{ + struct be_adapter *adapter = 
netdev_priv(netdev); + + if (!check_privilege(adapter, MAX_PRIVILEGES)) + return -EOPNOTSUPP; + + dump->len = be_get_dump_len(adapter); + dump->version = 1; + dump->flag = 0x1; /* FW dump is enabled */ + return 0; +} + +static int +be_get_dump_data(struct net_device *netdev, struct ethtool_dump *dump, + void *buf) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + + if (!check_privilege(adapter, MAX_PRIVILEGES)) + return -EOPNOTSUPP; + + status = be_read_dump_data(adapter, dump->len, buf); + return be_cmd_status(status); +} + +static int be_get_eeprom_len(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (!check_privilege(adapter, MAX_PRIVILEGES)) + return 0; + + if (lancer_chip(adapter)) { + if (be_physfn(adapter)) + return lancer_cmd_get_file_len(adapter, + LANCER_VPD_PF_FILE); + else + return lancer_cmd_get_file_len(adapter, + LANCER_VPD_VF_FILE); + } else { + return BE_READ_SEEPROM_LEN; + } +} + +static int be_read_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, uint8_t *data) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_dma_mem eeprom_cmd; + struct be_cmd_resp_seeprom_read *resp; + int status; + + if (!eeprom->len) + return -EINVAL; + + if (lancer_chip(adapter)) { + if (be_physfn(adapter)) + return lancer_cmd_read_file(adapter, LANCER_VPD_PF_FILE, + eeprom->len, data); + else + return lancer_cmd_read_file(adapter, LANCER_VPD_VF_FILE, + eeprom->len, data); + } + + eeprom->magic = BE_VENDOR_ID | (adapter->pdev->device<<16); + + memset(&eeprom_cmd, 0, sizeof(struct be_dma_mem)); + eeprom_cmd.size = sizeof(struct be_cmd_req_seeprom_read); + eeprom_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, + eeprom_cmd.size, &eeprom_cmd.dma, + GFP_KERNEL); + + if (!eeprom_cmd.va) + return -ENOMEM; + + status = be_cmd_get_seeprom_data(adapter, &eeprom_cmd); + + if (!status) { + resp = eeprom_cmd.va; + memcpy(data, resp->seeprom_data + eeprom->offset, eeprom->len); + } + dma_free_coherent(&adapter->pdev->dev, eeprom_cmd.size, eeprom_cmd.va, + eeprom_cmd.dma); + + return be_cmd_status(status); +} + +static u32 be_get_msg_level(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + return adapter->msg_enable; +} + +static void be_set_msg_level(struct net_device *netdev, u32 level) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (adapter->msg_enable == level) + return; + + if ((level & NETIF_MSG_HW) != (adapter->msg_enable & NETIF_MSG_HW)) + if (BEx_chip(adapter)) + be_cmd_set_fw_log_level(adapter, level & NETIF_MSG_HW ? 
+ FW_LOG_LEVEL_DEFAULT : + FW_LOG_LEVEL_FATAL); + adapter->msg_enable = level; +} + +static u64 be_get_rss_hash_opts(struct be_adapter *adapter, u64 flow_type) +{ + u64 data = 0; + + switch (flow_type) { + case TCP_V4_FLOW: + if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV4) + data |= RXH_IP_DST | RXH_IP_SRC; + if (adapter->rss_info.rss_flags & RSS_ENABLE_TCP_IPV4) + data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + case UDP_V4_FLOW: + if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV4) + data |= RXH_IP_DST | RXH_IP_SRC; + if (adapter->rss_info.rss_flags & RSS_ENABLE_UDP_IPV4) + data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + case TCP_V6_FLOW: + if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV6) + data |= RXH_IP_DST | RXH_IP_SRC; + if (adapter->rss_info.rss_flags & RSS_ENABLE_TCP_IPV6) + data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + case UDP_V6_FLOW: + if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV6) + data |= RXH_IP_DST | RXH_IP_SRC; + if (adapter->rss_info.rss_flags & RSS_ENABLE_UDP_IPV6) + data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + } + + return data; +} + +static int be_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + if (!be_multi_rxq(adapter)) { + dev_info(&adapter->pdev->dev, + "ethtool::get_rxnfc: RX flow hashing is disabled\n"); + return -EINVAL; + } + + switch (cmd->cmd) { + case ETHTOOL_GRXFH: + cmd->data = be_get_rss_hash_opts(adapter, cmd->flow_type); + break; + case ETHTOOL_GRXRINGS: + cmd->data = adapter->num_rx_qs - 1; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int be_set_rss_hash_opts(struct be_adapter *adapter, + struct ethtool_rxnfc *cmd) +{ + int status; + u32 rss_flags = adapter->rss_info.rss_flags; + + if (cmd->data != L3_RSS_FLAGS && + cmd->data != (L3_RSS_FLAGS | L4_RSS_FLAGS)) + return -EINVAL; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + if (cmd->data == L3_RSS_FLAGS) + rss_flags &= ~RSS_ENABLE_TCP_IPV4; + else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) + rss_flags |= RSS_ENABLE_IPV4 | + RSS_ENABLE_TCP_IPV4; + break; + case TCP_V6_FLOW: + if (cmd->data == L3_RSS_FLAGS) + rss_flags &= ~RSS_ENABLE_TCP_IPV6; + else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) + rss_flags |= RSS_ENABLE_IPV6 | + RSS_ENABLE_TCP_IPV6; + break; + case UDP_V4_FLOW: + if ((cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) && + BEx_chip(adapter)) + return -EINVAL; + + if (cmd->data == L3_RSS_FLAGS) + rss_flags &= ~RSS_ENABLE_UDP_IPV4; + else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) + rss_flags |= RSS_ENABLE_IPV4 | + RSS_ENABLE_UDP_IPV4; + break; + case UDP_V6_FLOW: + if ((cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) && + BEx_chip(adapter)) + return -EINVAL; + + if (cmd->data == L3_RSS_FLAGS) + rss_flags &= ~RSS_ENABLE_UDP_IPV6; + else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) + rss_flags |= RSS_ENABLE_IPV6 | + RSS_ENABLE_UDP_IPV6; + break; + default: + return -EINVAL; + } + + if (rss_flags == adapter->rss_info.rss_flags) + return 0; + + status = be_cmd_rss_config(adapter, adapter->rss_info.rsstable, + rss_flags, RSS_INDIR_TABLE_LEN, + adapter->rss_info.rss_hkey); + if (!status) + adapter->rss_info.rss_flags = rss_flags; + + return be_cmd_status(status); +} + +static int be_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status = 0; + + if (!be_multi_rxq(adapter)) { + dev_err(&adapter->pdev->dev, + "ethtool::set_rxnfc: RX flow hashing is disabled\n"); + return -EINVAL; + } + 
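+ /* Only ETHTOOL_SRXFH is supported. be_set_rss_hash_opts() accepts
+ * either L3-only or L3+L4 hashing per flow type (4-tuple UDP hashing
+ * is rejected on BEx chips) and pushes any change to the FW via
+ * be_cmd_rss_config().
+ */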
+ switch (cmd->cmd) { + case ETHTOOL_SRXFH: + status = be_set_rss_hash_opts(adapter, cmd); + break; + default: + return -EINVAL; + } + + return status; +} + +static void be_get_channels(struct net_device *netdev, + struct ethtool_channels *ch) +{ + struct be_adapter *adapter = netdev_priv(netdev); + u16 num_rx_irqs = max_t(u16, adapter->num_rss_qs, 1); + + /* num_tx_qs is always same as the number of irqs used for TX */ + ch->combined_count = min(adapter->num_tx_qs, num_rx_irqs); + ch->rx_count = num_rx_irqs - ch->combined_count; + ch->tx_count = adapter->num_tx_qs - ch->combined_count; + + ch->max_combined = be_max_qp_irqs(adapter); + /* The user must create atleast one combined channel */ + ch->max_rx = be_max_rx_irqs(adapter) - 1; + ch->max_tx = be_max_tx_irqs(adapter) - 1; +} + +static int be_set_channels(struct net_device *netdev, + struct ethtool_channels *ch) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + + /* we support either only combined channels or a combination of + * combined and either RX-only or TX-only channels. + */ + if (ch->other_count || !ch->combined_count || + (ch->rx_count && ch->tx_count)) + return -EINVAL; + + if (ch->combined_count > be_max_qp_irqs(adapter) || + (ch->rx_count && + (ch->rx_count + ch->combined_count) > be_max_rx_irqs(adapter)) || + (ch->tx_count && + (ch->tx_count + ch->combined_count) > be_max_tx_irqs(adapter))) + return -EINVAL; + + adapter->cfg_num_rx_irqs = ch->combined_count + ch->rx_count; + adapter->cfg_num_tx_irqs = ch->combined_count + ch->tx_count; + + status = be_update_queues(adapter); + return be_cmd_status(status); +} + +static u32 be_get_rxfh_indir_size(struct net_device *netdev) +{ + return RSS_INDIR_TABLE_LEN; +} + +static u32 be_get_rxfh_key_size(struct net_device *netdev) +{ + return RSS_HASH_KEY_LEN; +} + +static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey, + u8 *hfunc) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int i; + struct rss_info *rss = &adapter->rss_info; + + if (indir) { + for (i = 0; i < RSS_INDIR_TABLE_LEN; i++) + indir[i] = rss->rss_queue[i]; + } + + if (hkey) + memcpy(hkey, rss->rss_hkey, RSS_HASH_KEY_LEN); + + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + + return 0; +} + +static int be_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *hkey, const u8 hfunc) +{ + int rc = 0, i, j; + struct be_adapter *adapter = netdev_priv(netdev); + u8 rsstable[RSS_INDIR_TABLE_LEN]; + + /* We do not allow change in unsupported parameters */ + if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + return -EOPNOTSUPP; + + if (indir) { + struct be_rx_obj *rxo; + + for (i = 0; i < RSS_INDIR_TABLE_LEN; i++) { + j = indir[i]; + rxo = &adapter->rx_obj[j]; + rsstable[i] = rxo->rss_id; + adapter->rss_info.rss_queue[i] = j; + } + } else { + memcpy(rsstable, adapter->rss_info.rsstable, + RSS_INDIR_TABLE_LEN); + } + + if (!hkey) + hkey = adapter->rss_info.rss_hkey; + + rc = be_cmd_rss_config(adapter, rsstable, + adapter->rss_info.rss_flags, + RSS_INDIR_TABLE_LEN, hkey); + if (rc) { + adapter->rss_info.rss_flags = RSS_ENABLE_NONE; + return -EIO; + } + memcpy(adapter->rss_info.rss_hkey, hkey, RSS_HASH_KEY_LEN); + memcpy(adapter->rss_info.rsstable, rsstable, + RSS_INDIR_TABLE_LEN); + return 0; +} + +static int be_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ + struct be_adapter *adapter = netdev_priv(netdev); + u8 page_data[PAGE_DATA_LEN]; + int status; + + if (!check_privilege(adapter, MAX_PRIVILEGES)) + return -EOPNOTSUPP; + + 
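/* A zero SFF-8472 compliance byte in page A0 means only the A0 page is
+ * available (reported as ETH_MODULE_SFF_8079); otherwise page A2 is
+ * exposed too and twice the page length is reported.
+ */ +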
status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, + page_data); + if (!status) { + if (!page_data[SFP_PLUS_SFF_8472_COMP]) { + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = PAGE_DATA_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = 2 * PAGE_DATA_LEN; + } + } + return be_cmd_status(status); +} + +static int be_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, u8 *data) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + + if (!check_privilege(adapter, MAX_PRIVILEGES)) + return -EOPNOTSUPP; + + status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0, + data); + if (status) + goto err; + + if (eeprom->offset + eeprom->len > PAGE_DATA_LEN) { + status = be_cmd_read_port_transceiver_data(adapter, + TR_PAGE_A2, + data + + PAGE_DATA_LEN); + if (status) + goto err; + } + if (eeprom->offset) + memcpy(data, data + eeprom->offset, eeprom->len); +err: + return be_cmd_status(status); +} + +static u32 be_get_priv_flags(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + return adapter->priv_flags; +} + +static int be_set_priv_flags(struct net_device *netdev, u32 flags) +{ + struct be_adapter *adapter = netdev_priv(netdev); + bool tpe_old = !!(adapter->priv_flags & BE_DISABLE_TPE_RECOVERY); + bool tpe_new = !!(flags & BE_DISABLE_TPE_RECOVERY); + + if (tpe_old != tpe_new) { + if (tpe_new) { + adapter->priv_flags |= BE_DISABLE_TPE_RECOVERY; + dev_info(&adapter->pdev->dev, + "HW error recovery is disabled\n"); + } else { + adapter->priv_flags &= ~BE_DISABLE_TPE_RECOVERY; + dev_info(&adapter->pdev->dev, + "HW error recovery is enabled\n"); + } + } + + return 0; +} + +const struct ethtool_ops be_ethtool_ops = { + .get_drvinfo = be_get_drvinfo, + .get_wol = be_get_wol, + .set_wol = be_set_wol, + .get_link = ethtool_op_get_link, + .get_eeprom_len = be_get_eeprom_len, + .get_eeprom = be_read_eeprom, + .get_coalesce = be_get_coalesce, + .set_coalesce = be_set_coalesce, + .get_ringparam = be_get_ringparam, + .get_pauseparam = be_get_pauseparam, + .set_pauseparam = be_set_pauseparam, + .set_priv_flags = be_set_priv_flags, + .get_priv_flags = be_get_priv_flags, + .get_strings = be_get_stat_strings, + .set_phys_id = be_set_phys_id, + .set_dump = be_set_dump, + .get_msglevel = be_get_msg_level, + .set_msglevel = be_set_msg_level, + .get_sset_count = be_get_sset_count, + .get_ethtool_stats = be_get_ethtool_stats, + .flash_device = be_do_flash, + .self_test = be_self_test, + .get_rxnfc = be_get_rxnfc, + .set_rxnfc = be_set_rxnfc, + .get_rxfh_indir_size = be_get_rxfh_indir_size, + .get_rxfh_key_size = be_get_rxfh_key_size, + .get_rxfh = be_get_rxfh, + .set_rxfh = be_set_rxfh, + .get_dump_flag = be_get_dump_flag, + .get_dump_data = be_get_dump_data, + .get_channels = be_get_channels, + .set_channels = be_set_channels, + .get_module_info = be_get_module_info, + .get_module_eeprom = be_get_module_eeprom, + .get_link_ksettings = be_get_link_ksettings, +}; diff --git a/drivers/net/ethernet/emulex/benet/be_hw.h b/drivers/net/ethernet/emulex/benet/be_hw.h new file mode 100644 index 0000000..db5f92f --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_hw.h @@ -0,0 +1,375 @@ +/* + * Copyright (C) 2005-2016 Broadcom. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +/********* Mailbox door bell *************/ +/* Used for driver communication with the FW. + * The software must write this register twice to post any command. First, + * it writes the register with hi=1 and the upper bits of the physical address + * for the MAILBOX structure. Software must poll the ready bit until this + * is acknowledged. Then, sotware writes the register with hi=0 with the lower + * bits in the address. It must poll the ready bit until the command is + * complete. Upon completion, the MAILBOX will contain a valid completion + * queue entry. + */ +#define MPU_MAILBOX_DB_OFFSET 0x160 +#define MPU_MAILBOX_DB_RDY_MASK 0x1 /* bit 0 */ +#define MPU_MAILBOX_DB_HI_MASK 0x2 /* bit 1 */ + +#define MPU_EP_CONTROL 0 + +/********** MPU semphore: used for SH & BE *************/ +#define SLIPORT_SOFTRESET_OFFSET 0x5c /* CSR BAR offset */ +#define SLIPORT_SEMAPHORE_OFFSET_BEx 0xac /* CSR BAR offset */ +#define SLIPORT_SEMAPHORE_OFFSET_SH 0x94 /* PCI-CFG offset */ +#define POST_STAGE_MASK 0x0000FFFF +#define POST_ERR_MASK 0x1 +#define POST_ERR_SHIFT 31 +#define POST_ERR_RECOVERY_CODE_MASK 0xFFF + +/* Soft Reset register masks */ +#define SLIPORT_SOFTRESET_SR_MASK 0x00000080 /* SR bit */ + +/* MPU semphore POST stage values */ +#define POST_STAGE_AWAITING_HOST_RDY 0x1 /* FW awaiting goahead from host */ +#define POST_STAGE_HOST_RDY 0x2 /* Host has given go-ahed to FW */ +#define POST_STAGE_BE_RESET 0x3 /* Host wants to reset chip */ +#define POST_STAGE_ARMFW_RDY 0xc000 /* FW is done with POST */ +#define POST_STAGE_RECOVERABLE_ERR 0xE000 /* Recoverable err detected */ +/* FW has detected a UE and is dumping FAT log data */ +#define POST_STAGE_FAT_LOG_START 0x0D00 +#define POST_STAGE_ARMFW_UE 0xF000 /*FW has asserted an UE*/ + +/* Lancer SLIPORT registers */ +#define SLIPORT_STATUS_OFFSET 0x404 +#define SLIPORT_CONTROL_OFFSET 0x408 +#define SLIPORT_ERROR1_OFFSET 0x40C +#define SLIPORT_ERROR2_OFFSET 0x410 +#define PHYSDEV_CONTROL_OFFSET 0x414 + +#define SLIPORT_STATUS_ERR_MASK 0x80000000 +#define SLIPORT_STATUS_DIP_MASK 0x02000000 +#define SLIPORT_STATUS_RN_MASK 0x01000000 +#define SLIPORT_STATUS_RDY_MASK 0x00800000 +#define SLI_PORT_CONTROL_IP_MASK 0x08000000 +#define PHYSDEV_CONTROL_FW_RESET_MASK 0x00000002 +#define PHYSDEV_CONTROL_DD_MASK 0x00000004 +#define PHYSDEV_CONTROL_INP_MASK 0x40000000 + +#define SLIPORT_ERROR_NO_RESOURCE1 0x2 +#define SLIPORT_ERROR_NO_RESOURCE2 0x9 + +#define SLIPORT_ERROR_FW_RESET1 0x2 +#define SLIPORT_ERROR_FW_RESET2 0x0 + +/********* Memory BAR register ************/ +#define PCICFG_MEMBAR_CTRL_INT_CTRL_OFFSET 0xfc +/* Host Interrupt Enable, if set interrupts are enabled although "PCI Interrupt + * Disable" may still globally block interrupts in addition to individual + * interrupt masks; a mechanism for the device driver to block all interrupts + * atomically without having to arbitrate for the PCI Interrupt Disable bit + * with the OS. 
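+ * be_intr_set() in be_main.c prefers the INTR_SET FW command and falls
+ * back to a PCI-config read-modify-write of this bit (be_reg_intr_set())
+ * when that command fails.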
+ */ +#define MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK BIT(29) /* bit 29 */ + +/********* PCI Function Capability *********/ +#define BE_FUNCTION_CAPS_RSS 0x2 +#define BE_FUNCTION_CAPS_SUPER_NIC 0x40 + +/********* Power management (WOL) **********/ +#define PCICFG_PM_CONTROL_OFFSET 0x44 +#define PCICFG_PM_CONTROL_MASK 0x108 /* bits 3 & 8 */ + +/********* Online Control Registers *******/ +#define PCICFG_ONLINE0 0xB0 +#define PCICFG_ONLINE1 0xB4 + +/********* UE Status and Mask Registers ***/ +#define PCICFG_UE_STATUS_LOW 0xA0 +#define PCICFG_UE_STATUS_HIGH 0xA4 +#define PCICFG_UE_STATUS_LOW_MASK 0xA8 +#define PCICFG_UE_STATUS_HI_MASK 0xAC + +/******** SLI_INTF ***********************/ +#define SLI_INTF_REG_OFFSET 0x58 +#define SLI_INTF_VALID_MASK 0xE0000000 +#define SLI_INTF_VALID 0xC0000000 +#define SLI_INTF_HINT2_MASK 0x1F000000 +#define SLI_INTF_HINT2_SHIFT 24 +#define SLI_INTF_HINT1_MASK 0x00FF0000 +#define SLI_INTF_HINT1_SHIFT 16 +#define SLI_INTF_FAMILY_MASK 0x00000F00 +#define SLI_INTF_FAMILY_SHIFT 8 +#define SLI_INTF_IF_TYPE_MASK 0x0000F000 +#define SLI_INTF_IF_TYPE_SHIFT 12 +#define SLI_INTF_REV_MASK 0x000000F0 +#define SLI_INTF_REV_SHIFT 4 +#define SLI_INTF_FT_MASK 0x00000001 + +#define SLI_INTF_TYPE_2 2 +#define SLI_INTF_TYPE_3 3 + +/********* ISR0 Register offset **********/ +#define CEV_ISR0_OFFSET 0xC18 +#define CEV_ISR_SIZE 4 + +/********* Event Q door bell *************/ +#define DB_EQ_OFFSET DB_CQ_OFFSET +#define DB_EQ_RING_ID_MASK 0x1FF /* bits 0 - 8 */ +#define DB_EQ_RING_ID_EXT_MASK 0x3e00 /* bits 9-13 */ +#define DB_EQ_RING_ID_EXT_MASK_SHIFT (2) /* qid bits 9-13 placing at 11-15 */ + +/* Clear the interrupt for this eq */ +#define DB_EQ_CLR_SHIFT (9) /* bit 9 */ +/* Must be 1 */ +#define DB_EQ_EVNT_SHIFT (10) /* bit 10 */ +/* Number of event entries processed */ +#define DB_EQ_NUM_POPPED_SHIFT (16) /* bits 16 - 28 */ +/* Rearm bit */ +#define DB_EQ_REARM_SHIFT (29) /* bit 29 */ +/* Rearm to interrupt delay encoding */ +#define DB_EQ_R2I_DLY_SHIFT (30) /* bits 30 - 31 */ + +/* Rearm to interrupt (R2I) delay multiplier encoding represents 3 different + * values configured in CEV_REARM2IRPT_DLY_MULT_CSR register. This value is + * programmed by host driver while ringing an EQ doorbell(EQ_DB) if a delay + * between rearming the EQ and next interrupt on this EQ is desired. 
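+ * be_eq_notify() places this 2-bit encoding in EQ_DB bits 30-31
+ * (DB_EQ_R2I_DLY_SHIFT); e.g. R2I_DLY_ENC_1 selects the 160us delay.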
+ */ +#define R2I_DLY_ENC_0 0 /* No delay */ +#define R2I_DLY_ENC_1 1 /* maps to 160us EQ delay */ +#define R2I_DLY_ENC_2 2 /* maps to 96us EQ delay */ +#define R2I_DLY_ENC_3 3 /* maps to 48us EQ delay */ + +/********* Compl Q door bell *************/ +#define DB_CQ_OFFSET 0x120 +#define DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ +#define DB_CQ_RING_ID_EXT_MASK 0x7C00 /* bits 10-14 */ +#define DB_CQ_RING_ID_EXT_MASK_SHIFT (1) /* qid bits 10-14 + placing at 11-15 */ + +/* Number of event entries processed */ +#define DB_CQ_NUM_POPPED_SHIFT (16) /* bits 16 - 28 */ +/* Rearm bit */ +#define DB_CQ_REARM_SHIFT (29) /* bit 29 */ + +/********** TX ULP door bell *************/ +#define DB_TXULP1_OFFSET 0x60 +#define DB_TXULP_RING_ID_MASK 0x7FF /* bits 0 - 10 */ +/* Number of tx entries posted */ +#define DB_TXULP_NUM_POSTED_SHIFT (16) /* bits 16 - 29 */ +#define DB_TXULP_NUM_POSTED_MASK 0x3FFF /* bits 16 - 29 */ + +/********** RQ(erx) door bell ************/ +#define DB_RQ_OFFSET 0x100 +#define DB_RQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ +/* Number of rx frags posted */ +#define DB_RQ_NUM_POSTED_SHIFT (24) /* bits 24 - 31 */ + +/********** MCC door bell ************/ +#define DB_MCCQ_OFFSET 0x140 +#define DB_MCCQ_RING_ID_MASK 0x7FF /* bits 0 - 10 */ +/* Number of entries posted */ +#define DB_MCCQ_NUM_POSTED_SHIFT (16) /* bits 16 - 29 */ + +/********** SRIOV VF PCICFG OFFSET ********/ +#define SRIOV_VF_PCICFG_OFFSET (4096) + +/********** FAT TABLE ********/ +#define RETRIEVE_FAT 0 +#define QUERY_FAT 1 + +/************* Rx Packet Type Encoding **************/ +#define BE_UNICAST_PACKET 0 +#define BE_MULTICAST_PACKET 1 +#define BE_BROADCAST_PACKET 2 +#define BE_RSVD_PACKET 3 + +/* + * BE descriptors: host memory data structures whose formats + * are hardwired in BE silicon. 
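+ * The amap_* pseudo structs below describe these layouts one bit per u8
+ * member (arrays for multi-bit fields), from which each field's
+ * offset/shift/mask is calculated.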
+ */ +/* Event Queue Descriptor */ +#define EQ_ENTRY_VALID_MASK 0x1 /* bit 0 */ +#define EQ_ENTRY_RES_ID_MASK 0xFFFF /* bits 16 - 31 */ +#define EQ_ENTRY_RES_ID_SHIFT 16 + +struct be_eq_entry { + u32 evt; +}; + +/* TX Queue Descriptor */ +#define ETH_WRB_FRAG_LEN_MASK 0xFFFF +struct be_eth_wrb { + __le32 frag_pa_hi; /* dword 0 */ + __le32 frag_pa_lo; /* dword 1 */ + u32 rsvd0; /* dword 2 */ + __le32 frag_len; /* dword 3: bits 0 - 15 */ +} __packed; + +/* Pseudo amap definition for eth_hdr_wrb in which each bit of the + * actual structure is defined as a byte : used to calculate + * offset/shift/mask of each field */ +struct amap_eth_hdr_wrb { + u8 rsvd0[32]; /* dword 0 */ + u8 rsvd1[32]; /* dword 1 */ + u8 complete; /* dword 2 */ + u8 event; + u8 crc; + u8 forward; + u8 lso6; + u8 mgmt; + u8 ipcs; + u8 udpcs; + u8 tcpcs; + u8 lso; + u8 vlan; + u8 gso[2]; + u8 num_wrb[5]; + u8 lso_mss[14]; + u8 len[16]; /* dword 3 */ + u8 vlan_tag[16]; +} __packed; + +#define TX_HDR_WRB_COMPL 1 /* word 2 */ +#define TX_HDR_WRB_EVT BIT(1) /* word 2 */ +#define TX_HDR_WRB_NUM_SHIFT 13 /* word 2: bits 13:17 */ +#define TX_HDR_WRB_NUM_MASK 0x1F /* word 2: bits 13:17 */ + +struct be_eth_hdr_wrb { + __le32 dw[4]; +}; + +/********* Tx Compl Status Encoding *********/ +#define BE_TX_COMP_HDR_PARSE_ERR 0x2 +#define BE_TX_COMP_NDMA_ERR 0x3 +#define BE_TX_COMP_ACL_ERR 0x5 + +#define LANCER_TX_COMP_LSO_ERR 0x1 +#define LANCER_TX_COMP_HSW_DROP_MAC_ERR 0x3 +#define LANCER_TX_COMP_HSW_DROP_VLAN_ERR 0x5 +#define LANCER_TX_COMP_QINQ_ERR 0x7 +#define LANCER_TX_COMP_SGE_ERR 0x9 +#define LANCER_TX_COMP_PARITY_ERR 0xb +#define LANCER_TX_COMP_DMA_ERR 0xd + +/* TX Compl Queue Descriptor */ + +/* Pseudo amap definition for eth_tx_compl in which each bit of the + * actual structure is defined as a byte: used to calculate + * offset/shift/mask of each field */ +struct amap_eth_tx_compl { + u8 wrb_index[16]; /* dword 0 */ + u8 ct[2]; /* dword 0 */ + u8 port[2]; /* dword 0 */ + u8 rsvd0[8]; /* dword 0 */ + u8 status[4]; /* dword 0 */ + u8 user_bytes[16]; /* dword 1 */ + u8 nwh_bytes[8]; /* dword 1 */ + u8 lso; /* dword 1 */ + u8 cast_enc[2]; /* dword 1 */ + u8 rsvd1[5]; /* dword 1 */ + u8 rsvd2[32]; /* dword 2 */ + u8 pkts[16]; /* dword 3 */ + u8 ringid[11]; /* dword 3 */ + u8 hash_val[4]; /* dword 3 */ + u8 valid; /* dword 3 */ +} __packed; + +struct be_eth_tx_compl { + u32 dw[4]; +}; + +/* RX Queue Descriptor */ +struct be_eth_rx_d { + u32 fragpa_hi; + u32 fragpa_lo; +}; + +/* RX Compl Queue Descriptor */ + +/* Pseudo amap definition for BE2 and BE3 legacy mode eth_rx_compl in which + * each bit of the actual structure is defined as a byte: used to calculate + * offset/shift/mask of each field */ +struct amap_eth_rx_compl_v0 { + u8 vlan_tag[16]; /* dword 0 */ + u8 pktsize[14]; /* dword 0 */ + u8 port; /* dword 0 */ + u8 ip_opt; /* dword 0 */ + u8 err; /* dword 1 */ + u8 rsshp; /* dword 1 */ + u8 ipf; /* dword 1 */ + u8 tcpf; /* dword 1 */ + u8 udpf; /* dword 1 */ + u8 ipcksm; /* dword 1 */ + u8 l4_cksm; /* dword 1 */ + u8 ip_version; /* dword 1 */ + u8 macdst[6]; /* dword 1 */ + u8 vtp; /* dword 1 */ + u8 ip_frag; /* dword 1 */ + u8 fragndx[10]; /* dword 1 */ + u8 ct[2]; /* dword 1 */ + u8 sw; /* dword 1 */ + u8 numfrags[3]; /* dword 1 */ + u8 rss_flush; /* dword 2 */ + u8 cast_enc[2]; /* dword 2 */ + u8 qnq; /* dword 2 */ + u8 rss_bank; /* dword 2 */ + u8 rsvd1[23]; /* dword 2 */ + u8 lro_pkt; /* dword 2 */ + u8 rsvd2[2]; /* dword 2 */ + u8 valid; /* dword 2 */ + u8 rsshash[32]; /* dword 3 */ +} __packed; + +/* Pseudo amap 
definition for BE3 native mode eth_rx_compl in which + * each bit of the actual structure is defined as a byte: used to calculate + * offset/shift/mask of each field */ +struct amap_eth_rx_compl_v1 { + u8 vlan_tag[16]; /* dword 0 */ + u8 pktsize[14]; /* dword 0 */ + u8 vtp; /* dword 0 */ + u8 ip_opt; /* dword 0 */ + u8 err; /* dword 1 */ + u8 rsshp; /* dword 1 */ + u8 ipf; /* dword 1 */ + u8 tcpf; /* dword 1 */ + u8 udpf; /* dword 1 */ + u8 ipcksm; /* dword 1 */ + u8 l4_cksm; /* dword 1 */ + u8 ip_version; /* dword 1 */ + u8 macdst[7]; /* dword 1 */ + u8 rsvd0; /* dword 1 */ + u8 fragndx[10]; /* dword 1 */ + u8 ct[2]; /* dword 1 */ + u8 sw; /* dword 1 */ + u8 numfrags[3]; /* dword 1 */ + u8 rss_flush; /* dword 2 */ + u8 cast_enc[2]; /* dword 2 */ + u8 qnq; /* dword 2 */ + u8 rss_bank; /* dword 2 */ + u8 port[2]; /* dword 2 */ + u8 vntagp; /* dword 2 */ + u8 header_len[8]; /* dword 2 */ + u8 header_split[2]; /* dword 2 */ + u8 rsvd1[12]; /* dword 2 */ + u8 tunneled; + u8 valid; /* dword 2 */ + u8 rsshash[32]; /* dword 3 */ +} __packed; + +struct be_eth_rx_compl { + u32 dw[4]; +}; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c new file mode 100644 index 0000000..8f75500 --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -0,0 +1,6211 @@ +/* + * Copyright (C) 2005 - 2016 Broadcom + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include +#include +#include "be.h" +#include "be_cmds.h" +#include +#include +#include +#include +#include + +MODULE_VERSION(DRV_VER); +MODULE_DESCRIPTION(DRV_DESC " " DRV_VER); +MODULE_AUTHOR("Emulex Corporation"); +MODULE_LICENSE("GPL"); + +/* num_vfs module param is obsolete. + * Use sysfs method to enable/disable VFs. + */ +static unsigned int num_vfs; +module_param(num_vfs, uint, 0444); +MODULE_PARM_DESC(num_vfs, "Number of PCI VFs to initialize"); + +static ushort rx_frag_size = 2048; +module_param(rx_frag_size, ushort, 0444); +MODULE_PARM_DESC(rx_frag_size, "Size of a fragment that holds rcvd data."); + +/* Per-module error detection/recovery workq shared across all functions. + * Each function schedules its own work request on this shared workq. 
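+ * (This is be_err_recovery_workq below; FW cmd calls are deferred on a
+ * separate workq, be_wq.)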
+ */ +static struct workqueue_struct *be_err_recovery_workq; + +static const struct pci_device_id be_dev_ids[] = { + { PCI_DEVICE(BE_VENDOR_ID, BE_DEVICE_ID1) }, + { PCI_DEVICE(BE_VENDOR_ID, BE_DEVICE_ID2) }, + { PCI_DEVICE(BE_VENDOR_ID, OC_DEVICE_ID1) }, + { PCI_DEVICE(BE_VENDOR_ID, OC_DEVICE_ID2) }, + { PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID3)}, + { PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID4)}, + { PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID5)}, + { PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID6)}, + { 0 } +}; +MODULE_DEVICE_TABLE(pci, be_dev_ids); + +/* Workqueue used by all functions for defering cmd calls to the adapter */ +static struct workqueue_struct *be_wq; + +/* UE Status Low CSR */ +static const char * const ue_status_low_desc[] = { + "CEV", + "CTX", + "DBUF", + "ERX", + "Host", + "MPU", + "NDMA", + "PTC ", + "RDMA ", + "RXF ", + "RXIPS ", + "RXULP0 ", + "RXULP1 ", + "RXULP2 ", + "TIM ", + "TPOST ", + "TPRE ", + "TXIPS ", + "TXULP0 ", + "TXULP1 ", + "UC ", + "WDMA ", + "TXULP2 ", + "HOST1 ", + "P0_OB_LINK ", + "P1_OB_LINK ", + "HOST_GPIO ", + "MBOX ", + "ERX2 ", + "SPARE ", + "JTAG ", + "MPU_INTPEND " +}; + +/* UE Status High CSR */ +static const char * const ue_status_hi_desc[] = { + "LPCMEMHOST", + "MGMT_MAC", + "PCS0ONLINE", + "MPU_IRAM", + "PCS1ONLINE", + "PCTL0", + "PCTL1", + "PMEM", + "RR", + "TXPB", + "RXPP", + "XAUI", + "TXP", + "ARM", + "IPC", + "HOST2", + "HOST3", + "HOST4", + "HOST5", + "HOST6", + "HOST7", + "ECRC", + "Poison TLP", + "NETC", + "PERIPH", + "LLTXULP", + "D2P", + "RCON", + "LDMA", + "LLTXP", + "LLTXPB", + "Unknown" +}; + +#define BE_VF_IF_EN_FLAGS (BE_IF_FLAGS_UNTAGGED | \ + BE_IF_FLAGS_BROADCAST | \ + BE_IF_FLAGS_MULTICAST | \ + BE_IF_FLAGS_PASS_L3L4_ERRORS) + +static void be_queue_free(struct be_adapter *adapter, struct be_queue_info *q) +{ + struct be_dma_mem *mem = &q->dma_mem; + + if (mem->va) { + dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va, + mem->dma); + mem->va = NULL; + } +} + +static int be_queue_alloc(struct be_adapter *adapter, struct be_queue_info *q, + u16 len, u16 entry_size) +{ + struct be_dma_mem *mem = &q->dma_mem; + + memset(q, 0, sizeof(*q)); + q->len = len; + q->entry_size = entry_size; + mem->size = len * entry_size; + mem->va = dma_zalloc_coherent(&adapter->pdev->dev, mem->size, &mem->dma, + GFP_KERNEL); + if (!mem->va) + return -ENOMEM; + return 0; +} + +static void be_reg_intr_set(struct be_adapter *adapter, bool enable) +{ + u32 reg, enabled; + + pci_read_config_dword(adapter->pdev, PCICFG_MEMBAR_CTRL_INT_CTRL_OFFSET, + ®); + enabled = reg & MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK; + + if (!enabled && enable) + reg |= MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK; + else if (enabled && !enable) + reg &= ~MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK; + else + return; + + pci_write_config_dword(adapter->pdev, + PCICFG_MEMBAR_CTRL_INT_CTRL_OFFSET, reg); +} + +static void be_intr_set(struct be_adapter *adapter, bool enable) +{ + int status = 0; + + /* On lancer interrupts can't be controlled via this register */ + if (lancer_chip(adapter)) + return; + + if (be_check_error(adapter, BE_ERROR_EEH)) + return; + + status = be_cmd_intr_set(adapter, enable); + if (status) + be_reg_intr_set(adapter, enable); +} + +static void be_rxq_notify(struct be_adapter *adapter, u16 qid, u16 posted) +{ + u32 val = 0; + + if (be_check_error(adapter, BE_ERROR_HW)) + return; + + val |= qid & DB_RQ_RING_ID_MASK; + val |= posted << DB_RQ_NUM_POSTED_SHIFT; + + wmb(); + iowrite32(val, adapter->db + DB_RQ_OFFSET); +} + +static void be_txq_notify(struct be_adapter *adapter, 
struct be_tx_obj *txo, + u16 posted) +{ + u32 val = 0; + + if (be_check_error(adapter, BE_ERROR_HW)) + return; + + val |= txo->q.id & DB_TXULP_RING_ID_MASK; + val |= (posted & DB_TXULP_NUM_POSTED_MASK) << DB_TXULP_NUM_POSTED_SHIFT; + + wmb(); + iowrite32(val, adapter->db + txo->db_offset); +} + +static void be_eq_notify(struct be_adapter *adapter, u16 qid, + bool arm, bool clear_int, u16 num_popped, + u32 eq_delay_mult_enc) +{ + u32 val = 0; + + val |= qid & DB_EQ_RING_ID_MASK; + val |= ((qid & DB_EQ_RING_ID_EXT_MASK) << DB_EQ_RING_ID_EXT_MASK_SHIFT); + + if (be_check_error(adapter, BE_ERROR_HW)) + return; + + if (arm) + val |= 1 << DB_EQ_REARM_SHIFT; + if (clear_int) + val |= 1 << DB_EQ_CLR_SHIFT; + val |= 1 << DB_EQ_EVNT_SHIFT; + val |= num_popped << DB_EQ_NUM_POPPED_SHIFT; + val |= eq_delay_mult_enc << DB_EQ_R2I_DLY_SHIFT; + iowrite32(val, adapter->db + DB_EQ_OFFSET); +} + +void be_cq_notify(struct be_adapter *adapter, u16 qid, bool arm, u16 num_popped) +{ + u32 val = 0; + + val |= qid & DB_CQ_RING_ID_MASK; + val |= ((qid & DB_CQ_RING_ID_EXT_MASK) << + DB_CQ_RING_ID_EXT_MASK_SHIFT); + + if (be_check_error(adapter, BE_ERROR_HW)) + return; + + if (arm) + val |= 1 << DB_CQ_REARM_SHIFT; + val |= num_popped << DB_CQ_NUM_POPPED_SHIFT; + iowrite32(val, adapter->db + DB_CQ_OFFSET); +} + +static int be_dev_mac_add(struct be_adapter *adapter, u8 *mac) +{ + int i; + + /* Check if mac has already been added as part of uc-list */ + for (i = 0; i < adapter->uc_macs; i++) { + if (ether_addr_equal(adapter->uc_list[i].mac, mac)) { + /* mac already added, skip addition */ + adapter->pmac_id[0] = adapter->pmac_id[i + 1]; + return 0; + } + } + + return be_cmd_pmac_add(adapter, mac, adapter->if_handle, + &adapter->pmac_id[0], 0); +} + +static void be_dev_mac_del(struct be_adapter *adapter, int pmac_id) +{ + int i; + + /* Skip deletion if the programmed mac is + * being used in uc-list + */ + for (i = 0; i < adapter->uc_macs; i++) { + if (adapter->pmac_id[i + 1] == pmac_id) + return; + } + be_cmd_pmac_del(adapter, adapter->if_handle, pmac_id, 0); +} + +static int be_mac_addr_set(struct net_device *netdev, void *p) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct device *dev = &adapter->pdev->dev; + struct sockaddr *addr = p; + int status; + u8 mac[ETH_ALEN]; + u32 old_pmac_id = adapter->pmac_id[0]; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + /* Proceed further only if, User provided MAC is different + * from active MAC + */ + if (ether_addr_equal(addr->sa_data, adapter->dev_mac)) + return 0; + + /* BE3 VFs without FILTMGMT privilege are not allowed to set its MAC + * address + */ + if (BEx_chip(adapter) && be_virtfn(adapter) && + !check_privilege(adapter, BE_PRIV_FILTMGMT)) + return -EPERM; + + /* if device is not running, copy MAC to netdev->dev_addr */ + if (!netif_running(netdev)) + goto done; + + /* The PMAC_ADD cmd may fail if the VF doesn't have FILTMGMT + * privilege or if PF did not provision the new MAC address. + * On BE3, this cmd will always fail if the VF doesn't have the + * FILTMGMT privilege. This failure is OK, only if the PF programmed + * the MAC for the VF. + */ + mutex_lock(&adapter->rx_filter_lock); + status = be_dev_mac_add(adapter, (u8 *)addr->sa_data); + if (!status) { + + /* Delete the old programmed MAC. This call may fail if the + * old MAC was already deleted by the PF driver. 
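+ * be_dev_mac_del() additionally skips the delete when that pmac_id is
+ * still referenced by the uc-list.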
+ */ + if (adapter->pmac_id[0] != old_pmac_id) + be_dev_mac_del(adapter, old_pmac_id); + } + + mutex_unlock(&adapter->rx_filter_lock); + /* Decide if the new MAC is successfully activated only after + * querying the FW + */ + status = be_cmd_get_active_mac(adapter, adapter->pmac_id[0], mac, + adapter->if_handle, true, 0); + if (status) + goto err; + + /* The MAC change did not happen, either due to lack of privilege + * or PF didn't pre-provision. + */ + if (!ether_addr_equal(addr->sa_data, mac)) { + status = -EPERM; + goto err; + } + + /* Remember currently programmed MAC */ + ether_addr_copy(adapter->dev_mac, addr->sa_data); +done: + ether_addr_copy(netdev->dev_addr, addr->sa_data); + dev_info(dev, "MAC address changed to %pM\n", addr->sa_data); + return 0; +err: + dev_warn(dev, "MAC address change to %pM failed\n", addr->sa_data); + return status; +} + +/* BE2 supports only v0 cmd */ +static void *hw_stats_from_cmd(struct be_adapter *adapter) +{ + if (BE2_chip(adapter)) { + struct be_cmd_resp_get_stats_v0 *cmd = adapter->stats_cmd.va; + + return &cmd->hw_stats; + } else if (BE3_chip(adapter)) { + struct be_cmd_resp_get_stats_v1 *cmd = adapter->stats_cmd.va; + + return &cmd->hw_stats; + } else { + struct be_cmd_resp_get_stats_v2 *cmd = adapter->stats_cmd.va; + + return &cmd->hw_stats; + } +} + +/* BE2 supports only v0 cmd */ +static void *be_erx_stats_from_cmd(struct be_adapter *adapter) +{ + if (BE2_chip(adapter)) { + struct be_hw_stats_v0 *hw_stats = hw_stats_from_cmd(adapter); + + return &hw_stats->erx; + } else if (BE3_chip(adapter)) { + struct be_hw_stats_v1 *hw_stats = hw_stats_from_cmd(adapter); + + return &hw_stats->erx; + } else { + struct be_hw_stats_v2 *hw_stats = hw_stats_from_cmd(adapter); + + return &hw_stats->erx; + } +} + +static void populate_be_v0_stats(struct be_adapter *adapter) +{ + struct be_hw_stats_v0 *hw_stats = hw_stats_from_cmd(adapter); + struct be_pmem_stats *pmem_sts = &hw_stats->pmem; + struct be_rxf_stats_v0 *rxf_stats = &hw_stats->rxf; + struct be_port_rxf_stats_v0 *port_stats = + &rxf_stats->port[adapter->port_num]; + struct be_drv_stats *drvs = &adapter->drv_stats; + + be_dws_le_to_cpu(hw_stats, sizeof(*hw_stats)); + drvs->rx_pause_frames = port_stats->rx_pause_frames; + drvs->rx_crc_errors = port_stats->rx_crc_errors; + drvs->rx_control_frames = port_stats->rx_control_frames; + drvs->rx_in_range_errors = port_stats->rx_in_range_errors; + drvs->rx_frame_too_long = port_stats->rx_frame_too_long; + drvs->rx_dropped_runt = port_stats->rx_dropped_runt; + drvs->rx_ip_checksum_errs = port_stats->rx_ip_checksum_errs; + drvs->rx_tcp_checksum_errs = port_stats->rx_tcp_checksum_errs; + drvs->rx_udp_checksum_errs = port_stats->rx_udp_checksum_errs; + drvs->rxpp_fifo_overflow_drop = port_stats->rx_fifo_overflow; + drvs->rx_dropped_tcp_length = port_stats->rx_dropped_tcp_length; + drvs->rx_dropped_too_small = port_stats->rx_dropped_too_small; + drvs->rx_dropped_too_short = port_stats->rx_dropped_too_short; + drvs->rx_out_range_errors = port_stats->rx_out_range_errors; + drvs->rx_input_fifo_overflow_drop = port_stats->rx_input_fifo_overflow; + drvs->rx_dropped_header_too_small = + port_stats->rx_dropped_header_too_small; + drvs->rx_address_filtered = + port_stats->rx_address_filtered + + port_stats->rx_vlan_filtered; + drvs->rx_alignment_symbol_errors = + port_stats->rx_alignment_symbol_errors; + + drvs->tx_pauseframes = port_stats->tx_pauseframes; + drvs->tx_controlframes = port_stats->tx_controlframes; + + if (adapter->port_num) + drvs->jabber_events = 
rxf_stats->port1_jabber_events; + else + drvs->jabber_events = rxf_stats->port0_jabber_events; + drvs->rx_drops_no_pbuf = rxf_stats->rx_drops_no_pbuf; + drvs->rx_drops_no_erx_descr = rxf_stats->rx_drops_no_erx_descr; + drvs->forwarded_packets = rxf_stats->forwarded_packets; + drvs->rx_drops_mtu = rxf_stats->rx_drops_mtu; + drvs->rx_drops_no_tpre_descr = rxf_stats->rx_drops_no_tpre_descr; + drvs->rx_drops_too_many_frags = rxf_stats->rx_drops_too_many_frags; + adapter->drv_stats.eth_red_drops = pmem_sts->eth_red_drops; +} + +static void populate_be_v1_stats(struct be_adapter *adapter) +{ + struct be_hw_stats_v1 *hw_stats = hw_stats_from_cmd(adapter); + struct be_pmem_stats *pmem_sts = &hw_stats->pmem; + struct be_rxf_stats_v1 *rxf_stats = &hw_stats->rxf; + struct be_port_rxf_stats_v1 *port_stats = + &rxf_stats->port[adapter->port_num]; + struct be_drv_stats *drvs = &adapter->drv_stats; + + be_dws_le_to_cpu(hw_stats, sizeof(*hw_stats)); + drvs->pmem_fifo_overflow_drop = port_stats->pmem_fifo_overflow_drop; + drvs->rx_priority_pause_frames = port_stats->rx_priority_pause_frames; + drvs->rx_pause_frames = port_stats->rx_pause_frames; + drvs->rx_crc_errors = port_stats->rx_crc_errors; + drvs->rx_control_frames = port_stats->rx_control_frames; + drvs->rx_in_range_errors = port_stats->rx_in_range_errors; + drvs->rx_frame_too_long = port_stats->rx_frame_too_long; + drvs->rx_dropped_runt = port_stats->rx_dropped_runt; + drvs->rx_ip_checksum_errs = port_stats->rx_ip_checksum_errs; + drvs->rx_tcp_checksum_errs = port_stats->rx_tcp_checksum_errs; + drvs->rx_udp_checksum_errs = port_stats->rx_udp_checksum_errs; + drvs->rx_dropped_tcp_length = port_stats->rx_dropped_tcp_length; + drvs->rx_dropped_too_small = port_stats->rx_dropped_too_small; + drvs->rx_dropped_too_short = port_stats->rx_dropped_too_short; + drvs->rx_out_range_errors = port_stats->rx_out_range_errors; + drvs->rx_dropped_header_too_small = + port_stats->rx_dropped_header_too_small; + drvs->rx_input_fifo_overflow_drop = + port_stats->rx_input_fifo_overflow_drop; + drvs->rx_address_filtered = port_stats->rx_address_filtered; + drvs->rx_alignment_symbol_errors = + port_stats->rx_alignment_symbol_errors; + drvs->rxpp_fifo_overflow_drop = port_stats->rxpp_fifo_overflow_drop; + drvs->tx_pauseframes = port_stats->tx_pauseframes; + drvs->tx_controlframes = port_stats->tx_controlframes; + drvs->tx_priority_pauseframes = port_stats->tx_priority_pauseframes; + drvs->jabber_events = port_stats->jabber_events; + drvs->rx_drops_no_pbuf = rxf_stats->rx_drops_no_pbuf; + drvs->rx_drops_no_erx_descr = rxf_stats->rx_drops_no_erx_descr; + drvs->forwarded_packets = rxf_stats->forwarded_packets; + drvs->rx_drops_mtu = rxf_stats->rx_drops_mtu; + drvs->rx_drops_no_tpre_descr = rxf_stats->rx_drops_no_tpre_descr; + drvs->rx_drops_too_many_frags = rxf_stats->rx_drops_too_many_frags; + adapter->drv_stats.eth_red_drops = pmem_sts->eth_red_drops; +} + +static void populate_be_v2_stats(struct be_adapter *adapter) +{ + struct be_hw_stats_v2 *hw_stats = hw_stats_from_cmd(adapter); + struct be_pmem_stats *pmem_sts = &hw_stats->pmem; + struct be_rxf_stats_v2 *rxf_stats = &hw_stats->rxf; + struct be_port_rxf_stats_v2 *port_stats = + &rxf_stats->port[adapter->port_num]; + struct be_drv_stats *drvs = &adapter->drv_stats; + + be_dws_le_to_cpu(hw_stats, sizeof(*hw_stats)); + drvs->pmem_fifo_overflow_drop = port_stats->pmem_fifo_overflow_drop; + drvs->rx_priority_pause_frames = port_stats->rx_priority_pause_frames; + drvs->rx_pause_frames = port_stats->rx_pause_frames; + 
drvs->rx_crc_errors = port_stats->rx_crc_errors; + drvs->rx_control_frames = port_stats->rx_control_frames; + drvs->rx_in_range_errors = port_stats->rx_in_range_errors; + drvs->rx_frame_too_long = port_stats->rx_frame_too_long; + drvs->rx_dropped_runt = port_stats->rx_dropped_runt; + drvs->rx_ip_checksum_errs = port_stats->rx_ip_checksum_errs; + drvs->rx_tcp_checksum_errs = port_stats->rx_tcp_checksum_errs; + drvs->rx_udp_checksum_errs = port_stats->rx_udp_checksum_errs; + drvs->rx_dropped_tcp_length = port_stats->rx_dropped_tcp_length; + drvs->rx_dropped_too_small = port_stats->rx_dropped_too_small; + drvs->rx_dropped_too_short = port_stats->rx_dropped_too_short; + drvs->rx_out_range_errors = port_stats->rx_out_range_errors; + drvs->rx_dropped_header_too_small = + port_stats->rx_dropped_header_too_small; + drvs->rx_input_fifo_overflow_drop = + port_stats->rx_input_fifo_overflow_drop; + drvs->rx_address_filtered = port_stats->rx_address_filtered; + drvs->rx_alignment_symbol_errors = + port_stats->rx_alignment_symbol_errors; + drvs->rxpp_fifo_overflow_drop = port_stats->rxpp_fifo_overflow_drop; + drvs->tx_pauseframes = port_stats->tx_pauseframes; + drvs->tx_controlframes = port_stats->tx_controlframes; + drvs->tx_priority_pauseframes = port_stats->tx_priority_pauseframes; + drvs->jabber_events = port_stats->jabber_events; + drvs->rx_drops_no_pbuf = rxf_stats->rx_drops_no_pbuf; + drvs->rx_drops_no_erx_descr = rxf_stats->rx_drops_no_erx_descr; + drvs->forwarded_packets = rxf_stats->forwarded_packets; + drvs->rx_drops_mtu = rxf_stats->rx_drops_mtu; + drvs->rx_drops_no_tpre_descr = rxf_stats->rx_drops_no_tpre_descr; + drvs->rx_drops_too_many_frags = rxf_stats->rx_drops_too_many_frags; + adapter->drv_stats.eth_red_drops = pmem_sts->eth_red_drops; + if (be_roce_supported(adapter)) { + drvs->rx_roce_bytes_lsd = port_stats->roce_bytes_received_lsd; + drvs->rx_roce_bytes_msd = port_stats->roce_bytes_received_msd; + drvs->rx_roce_frames = port_stats->roce_frames_received; + drvs->roce_drops_crc = port_stats->roce_drops_crc; + drvs->roce_drops_payload_len = + port_stats->roce_drops_payload_len; + } +} + +static void populate_lancer_stats(struct be_adapter *adapter) +{ + struct be_drv_stats *drvs = &adapter->drv_stats; + struct lancer_pport_stats *pport_stats = pport_stats_from_cmd(adapter); + + be_dws_le_to_cpu(pport_stats, sizeof(*pport_stats)); + drvs->rx_pause_frames = pport_stats->rx_pause_frames_lo; + drvs->rx_crc_errors = pport_stats->rx_crc_errors_lo; + drvs->rx_control_frames = pport_stats->rx_control_frames_lo; + drvs->rx_in_range_errors = pport_stats->rx_in_range_errors; + drvs->rx_frame_too_long = pport_stats->rx_frames_too_long_lo; + drvs->rx_dropped_runt = pport_stats->rx_dropped_runt; + drvs->rx_ip_checksum_errs = pport_stats->rx_ip_checksum_errors; + drvs->rx_tcp_checksum_errs = pport_stats->rx_tcp_checksum_errors; + drvs->rx_udp_checksum_errs = pport_stats->rx_udp_checksum_errors; + drvs->rx_dropped_tcp_length = + pport_stats->rx_dropped_invalid_tcp_length; + drvs->rx_dropped_too_small = pport_stats->rx_dropped_too_small; + drvs->rx_dropped_too_short = pport_stats->rx_dropped_too_short; + drvs->rx_out_range_errors = pport_stats->rx_out_of_range_errors; + drvs->rx_dropped_header_too_small = + pport_stats->rx_dropped_header_too_small; + drvs->rx_input_fifo_overflow_drop = pport_stats->rx_fifo_overflow; + drvs->rx_address_filtered = + pport_stats->rx_address_filtered + + pport_stats->rx_vlan_filtered; + drvs->rx_alignment_symbol_errors = pport_stats->rx_symbol_errors_lo; + 
drvs->rxpp_fifo_overflow_drop = pport_stats->rx_fifo_overflow; + drvs->tx_pauseframes = pport_stats->tx_pause_frames_lo; + drvs->tx_controlframes = pport_stats->tx_control_frames_lo; + drvs->jabber_events = pport_stats->rx_jabbers; + drvs->forwarded_packets = pport_stats->num_forwards_lo; + drvs->rx_drops_mtu = pport_stats->rx_drops_mtu_lo; + drvs->rx_drops_too_many_frags = + pport_stats->rx_drops_too_many_frags_lo; +} + +static void accumulate_16bit_val(u32 *acc, u16 val) +{ +#define lo(x) (x & 0xFFFF) +#define hi(x) (x & 0xFFFF0000) + bool wrapped = val < lo(*acc); + u32 newacc = hi(*acc) + val; + + if (wrapped) + newacc += 65536; + WRITE_ONCE(*acc, newacc); +} + +static void populate_erx_stats(struct be_adapter *adapter, + struct be_rx_obj *rxo, u32 erx_stat) +{ + if (!BEx_chip(adapter)) + rx_stats(rxo)->rx_drops_no_frags = erx_stat; + else + /* below erx HW counter can actually wrap around after + * 65535. Driver accumulates a 32-bit value + */ + accumulate_16bit_val(&rx_stats(rxo)->rx_drops_no_frags, + (u16)erx_stat); +} + +void be_parse_stats(struct be_adapter *adapter) +{ + struct be_erx_stats_v2 *erx = be_erx_stats_from_cmd(adapter); + struct be_rx_obj *rxo; + int i; + u32 erx_stat; + + if (lancer_chip(adapter)) { + populate_lancer_stats(adapter); + } else { + if (BE2_chip(adapter)) + populate_be_v0_stats(adapter); + else if (BE3_chip(adapter)) + /* for BE3 */ + populate_be_v1_stats(adapter); + else + populate_be_v2_stats(adapter); + + /* erx_v2 is longer than v0, v1. use v2 for v0, v1 access */ + for_all_rx_queues(adapter, rxo, i) { + erx_stat = erx->rx_drops_no_fragments[rxo->q.id]; + populate_erx_stats(adapter, rxo, erx_stat); + } + } +} + +static void be_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_drv_stats *drvs = &adapter->drv_stats; + struct be_rx_obj *rxo; + struct be_tx_obj *txo; + u64 pkts, bytes; + unsigned int start; + int i; + + for_all_rx_queues(adapter, rxo, i) { + const struct be_rx_stats *rx_stats = rx_stats(rxo); + + do { + start = u64_stats_fetch_begin_irq(&rx_stats->sync); + pkts = rx_stats(rxo)->rx_pkts; + bytes = rx_stats(rxo)->rx_bytes; + } while (u64_stats_fetch_retry_irq(&rx_stats->sync, start)); + stats->rx_packets += pkts; + stats->rx_bytes += bytes; + stats->multicast += rx_stats(rxo)->rx_mcast_pkts; + stats->rx_dropped += rx_stats(rxo)->rx_drops_no_skbs + + rx_stats(rxo)->rx_drops_no_frags; + } + + for_all_tx_queues(adapter, txo, i) { + const struct be_tx_stats *tx_stats = tx_stats(txo); + + do { + start = u64_stats_fetch_begin_irq(&tx_stats->sync); + pkts = tx_stats(txo)->tx_pkts; + bytes = tx_stats(txo)->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tx_stats->sync, start)); + stats->tx_packets += pkts; + stats->tx_bytes += bytes; + } + + /* bad pkts received */ + stats->rx_errors = drvs->rx_crc_errors + + drvs->rx_alignment_symbol_errors + + drvs->rx_in_range_errors + + drvs->rx_out_range_errors + + drvs->rx_frame_too_long + + drvs->rx_dropped_too_small + + drvs->rx_dropped_too_short + + drvs->rx_dropped_header_too_small + + drvs->rx_dropped_tcp_length + + drvs->rx_dropped_runt; + + /* detailed rx errors */ + stats->rx_length_errors = drvs->rx_in_range_errors + + drvs->rx_out_range_errors + + drvs->rx_frame_too_long; + + stats->rx_crc_errors = drvs->rx_crc_errors; + + /* frame alignment errors */ + stats->rx_frame_errors = drvs->rx_alignment_symbol_errors; + + /* receiver fifo overrun */ + /* drops_no_pbuf is no per i/f, it's per BE card */ + 
stats->rx_fifo_errors = drvs->rxpp_fifo_overflow_drop + + drvs->rx_input_fifo_overflow_drop + + drvs->rx_drops_no_pbuf; +} + +void be_link_status_update(struct be_adapter *adapter, u8 link_status) +{ + struct net_device *netdev = adapter->netdev; + + if (!(adapter->flags & BE_FLAGS_LINK_STATUS_INIT)) { + netif_carrier_off(netdev); + adapter->flags |= BE_FLAGS_LINK_STATUS_INIT; + } + + if (link_status) + netif_carrier_on(netdev); + else + netif_carrier_off(netdev); + + netdev_info(netdev, "Link is %s\n", link_status ? "Up" : "Down"); +} + +static int be_gso_hdr_len(struct sk_buff *skb) +{ + if (skb->encapsulation) + return skb_inner_transport_offset(skb) + + inner_tcp_hdrlen(skb); + return skb_transport_offset(skb) + tcp_hdrlen(skb); +} + +static void be_tx_stats_update(struct be_tx_obj *txo, struct sk_buff *skb) +{ + struct be_tx_stats *stats = tx_stats(txo); + u32 tx_pkts = skb_shinfo(skb)->gso_segs ? : 1; + /* Account for headers which get duplicated in TSO pkt */ + u32 dup_hdr_len = tx_pkts > 1 ? be_gso_hdr_len(skb) * (tx_pkts - 1) : 0; + + u64_stats_update_begin(&stats->sync); + stats->tx_reqs++; + stats->tx_bytes += skb->len + dup_hdr_len; + stats->tx_pkts += tx_pkts; + if (skb->encapsulation && skb->ip_summed == CHECKSUM_PARTIAL) + stats->tx_vxlan_offload_pkts += tx_pkts; + u64_stats_update_end(&stats->sync); +} + +/* Returns number of WRBs needed for the skb */ +static u32 skb_wrb_cnt(struct sk_buff *skb) +{ + /* +1 for the header wrb */ + return 1 + (skb_headlen(skb) ? 1 : 0) + skb_shinfo(skb)->nr_frags; +} + +static inline void wrb_fill(struct be_eth_wrb *wrb, u64 addr, int len) +{ + wrb->frag_pa_hi = cpu_to_le32(upper_32_bits(addr)); + wrb->frag_pa_lo = cpu_to_le32(lower_32_bits(addr)); + wrb->frag_len = cpu_to_le32(len & ETH_WRB_FRAG_LEN_MASK); + wrb->rsvd0 = 0; +} + +/* A dummy wrb is just all zeros. Using a separate routine for dummy-wrb + * to avoid the swap and shift/mask operations in wrb_fill(). + */ +static inline void wrb_fill_dummy(struct be_eth_wrb *wrb) +{ + wrb->frag_pa_hi = 0; + wrb->frag_pa_lo = 0; + wrb->frag_len = 0; + wrb->rsvd0 = 0; +} + +static inline u16 be_get_tx_vlan_tag(struct be_adapter *adapter, + struct sk_buff *skb) +{ + u8 vlan_prio; + u16 vlan_tag; + + vlan_tag = skb_vlan_tag_get(skb); + vlan_prio = (vlan_tag & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + /* If vlan priority provided by OS is NOT in available bmap */ + if (!(adapter->vlan_prio_bmap & (1 << vlan_prio))) + vlan_tag = (vlan_tag & ~VLAN_PRIO_MASK) | + adapter->recommended_prio_bits; + + return vlan_tag; +} + +/* Used only for IP tunnel packets */ +static u16 skb_inner_ip_proto(struct sk_buff *skb) +{ + return (inner_ip_hdr(skb)->version == 4) ? + inner_ip_hdr(skb)->protocol : inner_ipv6_hdr(skb)->nexthdr; +} + +static u16 skb_ip_proto(struct sk_buff *skb) +{ + return (ip_hdr(skb)->version == 4) ? 
+ ip_hdr(skb)->protocol : ipv6_hdr(skb)->nexthdr; +} + +static inline bool be_is_txq_full(struct be_tx_obj *txo) +{ + return atomic_read(&txo->q.used) + BE_MAX_TX_FRAG_COUNT >= txo->q.len; +} + +static inline bool be_can_txq_wake(struct be_tx_obj *txo) +{ + return atomic_read(&txo->q.used) < txo->q.len / 2; +} + +static inline bool be_is_tx_compl_pending(struct be_tx_obj *txo) +{ + return atomic_read(&txo->q.used) > txo->pend_wrb_cnt; +} + +static void be_get_wrb_params_from_skb(struct be_adapter *adapter, + struct sk_buff *skb, + struct be_wrb_params *wrb_params) +{ + u16 proto; + + if (skb_is_gso(skb)) { + BE_WRB_F_SET(wrb_params->features, LSO, 1); + wrb_params->lso_mss = skb_shinfo(skb)->gso_size; + if (skb_is_gso_v6(skb) && !lancer_chip(adapter)) + BE_WRB_F_SET(wrb_params->features, LSO6, 1); + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) { + BE_WRB_F_SET(wrb_params->features, IPCS, 1); + proto = skb_inner_ip_proto(skb); + } else { + proto = skb_ip_proto(skb); + } + if (proto == IPPROTO_TCP) + BE_WRB_F_SET(wrb_params->features, TCPCS, 1); + else if (proto == IPPROTO_UDP) + BE_WRB_F_SET(wrb_params->features, UDPCS, 1); + } + + if (skb_vlan_tag_present(skb)) { + BE_WRB_F_SET(wrb_params->features, VLAN, 1); + wrb_params->vlan_tag = be_get_tx_vlan_tag(adapter, skb); + } + + BE_WRB_F_SET(wrb_params->features, CRC, 1); +} + +static void wrb_fill_hdr(struct be_adapter *adapter, + struct be_eth_hdr_wrb *hdr, + struct be_wrb_params *wrb_params, + struct sk_buff *skb) +{ + memset(hdr, 0, sizeof(*hdr)); + + SET_TX_WRB_HDR_BITS(crc, hdr, + BE_WRB_F_GET(wrb_params->features, CRC)); + SET_TX_WRB_HDR_BITS(ipcs, hdr, + BE_WRB_F_GET(wrb_params->features, IPCS)); + SET_TX_WRB_HDR_BITS(tcpcs, hdr, + BE_WRB_F_GET(wrb_params->features, TCPCS)); + SET_TX_WRB_HDR_BITS(udpcs, hdr, + BE_WRB_F_GET(wrb_params->features, UDPCS)); + + SET_TX_WRB_HDR_BITS(lso, hdr, + BE_WRB_F_GET(wrb_params->features, LSO)); + SET_TX_WRB_HDR_BITS(lso6, hdr, + BE_WRB_F_GET(wrb_params->features, LSO6)); + SET_TX_WRB_HDR_BITS(lso_mss, hdr, wrb_params->lso_mss); + + /* Hack to skip HW VLAN tagging needs evt = 1, compl = 0. When this + * hack is not needed, the evt bit is set while ringing DB. 
+ */ + SET_TX_WRB_HDR_BITS(event, hdr, + BE_WRB_F_GET(wrb_params->features, VLAN_SKIP_HW)); + SET_TX_WRB_HDR_BITS(vlan, hdr, + BE_WRB_F_GET(wrb_params->features, VLAN)); + SET_TX_WRB_HDR_BITS(vlan_tag, hdr, wrb_params->vlan_tag); + + SET_TX_WRB_HDR_BITS(num_wrb, hdr, skb_wrb_cnt(skb)); + SET_TX_WRB_HDR_BITS(len, hdr, skb->len); + SET_TX_WRB_HDR_BITS(mgmt, hdr, + BE_WRB_F_GET(wrb_params->features, OS2BMC)); +} + +static void unmap_tx_frag(struct device *dev, struct be_eth_wrb *wrb, + bool unmap_single) +{ + dma_addr_t dma; + u32 frag_len = le32_to_cpu(wrb->frag_len); + + + dma = (u64)le32_to_cpu(wrb->frag_pa_hi) << 32 | + (u64)le32_to_cpu(wrb->frag_pa_lo); + if (frag_len) { + if (unmap_single) + dma_unmap_single(dev, dma, frag_len, DMA_TO_DEVICE); + else + dma_unmap_page(dev, dma, frag_len, DMA_TO_DEVICE); + } +} + +/* Grab a WRB header for xmit */ +static u32 be_tx_get_wrb_hdr(struct be_tx_obj *txo) +{ + u32 head = txo->q.head; + + queue_head_inc(&txo->q); + return head; +} + +/* Set up the WRB header for xmit */ +static void be_tx_setup_wrb_hdr(struct be_adapter *adapter, + struct be_tx_obj *txo, + struct be_wrb_params *wrb_params, + struct sk_buff *skb, u16 head) +{ + u32 num_frags = skb_wrb_cnt(skb); + struct be_queue_info *txq = &txo->q; + struct be_eth_hdr_wrb *hdr = queue_index_node(txq, head); + + wrb_fill_hdr(adapter, hdr, wrb_params, skb); + be_dws_cpu_to_le(hdr, sizeof(*hdr)); + + BUG_ON(txo->sent_skb_list[head]); + txo->sent_skb_list[head] = skb; + txo->last_req_hdr = head; + atomic_add(num_frags, &txq->used); + txo->last_req_wrb_cnt = num_frags; + txo->pend_wrb_cnt += num_frags; +} + +/* Setup a WRB fragment (buffer descriptor) for xmit */ +static void be_tx_setup_wrb_frag(struct be_tx_obj *txo, dma_addr_t busaddr, + int len) +{ + struct be_eth_wrb *wrb; + struct be_queue_info *txq = &txo->q; + + wrb = queue_head_node(txq); + wrb_fill(wrb, busaddr, len); + queue_head_inc(txq); +} + +/* Bring the queue back to the state it was in before be_xmit_enqueue() routine + * was invoked. The producer index is restored to the previous packet and the + * WRBs of the current packet are unmapped. Invoked to handle tx setup errors. + */ +static void be_xmit_restore(struct be_adapter *adapter, + struct be_tx_obj *txo, u32 head, bool map_single, + u32 copied) +{ + struct device *dev; + struct be_eth_wrb *wrb; + struct be_queue_info *txq = &txo->q; + + dev = &adapter->pdev->dev; + txq->head = head; + + /* skip the first wrb (hdr); it's not mapped */ + queue_head_inc(txq); + while (copied) { + wrb = queue_head_node(txq); + unmap_tx_frag(dev, wrb, map_single); + map_single = false; + copied -= le32_to_cpu(wrb->frag_len); + queue_head_inc(txq); + } + + txq->head = head; +} + +/* Enqueue the given packet for transmit. This routine allocates WRBs for the + * packet, dma maps the packet buffers and sets up the WRBs. Returns the number + * of WRBs used up by the packet. 
+ */ +static u32 be_xmit_enqueue(struct be_adapter *adapter, struct be_tx_obj *txo, + struct sk_buff *skb, + struct be_wrb_params *wrb_params) +{ + u32 i, copied = 0, wrb_cnt = skb_wrb_cnt(skb); + struct device *dev = &adapter->pdev->dev; + bool map_single = false; + u32 head; + dma_addr_t busaddr; + int len; + + head = be_tx_get_wrb_hdr(txo); + + if (skb->len > skb->data_len) { + len = skb_headlen(skb); + + busaddr = dma_map_single(dev, skb->data, len, DMA_TO_DEVICE); + if (dma_mapping_error(dev, busaddr)) + goto dma_err; + map_single = true; + be_tx_setup_wrb_frag(txo, busaddr, len); + copied += len; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i]; + len = skb_frag_size(frag); + + busaddr = skb_frag_dma_map(dev, frag, 0, len, DMA_TO_DEVICE); + if (dma_mapping_error(dev, busaddr)) + goto dma_err; + be_tx_setup_wrb_frag(txo, busaddr, len); + copied += len; + } + + be_tx_setup_wrb_hdr(adapter, txo, wrb_params, skb, head); + + be_tx_stats_update(txo, skb); + return wrb_cnt; + +dma_err: + adapter->drv_stats.dma_map_errors++; + be_xmit_restore(adapter, txo, head, map_single, copied); + return 0; +} + +static inline int qnq_async_evt_rcvd(struct be_adapter *adapter) +{ + return adapter->flags & BE_FLAGS_QNQ_ASYNC_EVT_RCVD; +} + +static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter, + struct sk_buff *skb, + struct be_wrb_params + *wrb_params) +{ + u16 vlan_tag = 0; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return skb; + + if (skb_vlan_tag_present(skb)) + vlan_tag = be_get_tx_vlan_tag(adapter, skb); + + if (qnq_async_evt_rcvd(adapter) && adapter->pvid) { + if (!vlan_tag) + vlan_tag = adapter->pvid; + /* f/w workaround to set skip_hw_vlan = 1, informs the F/W to + * skip VLAN insertion + */ + BE_WRB_F_SET(wrb_params->features, VLAN_SKIP_HW, 1); + } + + if (vlan_tag) { + skb = vlan_insert_tag_set_proto(skb, htons(ETH_P_8021Q), + vlan_tag); + if (unlikely(!skb)) + return skb; + skb->vlan_tci = 0; + } + + /* Insert the outer VLAN, if any */ + if (adapter->qnq_vid) { + vlan_tag = adapter->qnq_vid; + skb = vlan_insert_tag_set_proto(skb, htons(ETH_P_8021Q), + vlan_tag); + if (unlikely(!skb)) + return skb; + BE_WRB_F_SET(wrb_params->features, VLAN_SKIP_HW, 1); + } + + return skb; +} + +static bool be_ipv6_exthdr_check(struct sk_buff *skb) +{ + struct ethhdr *eh = (struct ethhdr *)skb->data; + u16 offset = ETH_HLEN; + + if (eh->h_proto == htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + offset); + + offset += sizeof(struct ipv6hdr); + if (ip6h->nexthdr != NEXTHDR_TCP && + ip6h->nexthdr != NEXTHDR_UDP) { + struct ipv6_opt_hdr *ehdr = + (struct ipv6_opt_hdr *)(skb->data + offset); + + /* offending pkt: 2nd byte following IPv6 hdr is 0xff */ + if (ehdr->hdrlen == 0xff) + return true; + } + } + return false; +} + +static int be_vlan_tag_tx_chk(struct be_adapter *adapter, struct sk_buff *skb) +{ + return skb_vlan_tag_present(skb) || adapter->pvid || adapter->qnq_vid; +} + +static int be_ipv6_tx_stall_chk(struct be_adapter *adapter, struct sk_buff *skb) +{ + return BE3_chip(adapter) && be_ipv6_exthdr_check(skb); +} + +static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter, + struct sk_buff *skb, + struct be_wrb_params + *wrb_params) +{ + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + unsigned int eth_hdr_len; + struct iphdr *ip; + + /* For padded packets, BE HW modifies tot_len field in IP header + * incorrecly when VLAN tag is 
inserted by HW. + * For padded packets, Lancer computes incorrect checksum. + */ + eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ? + VLAN_ETH_HLEN : ETH_HLEN; + if (skb->len <= 60 && + (lancer_chip(adapter) || skb_vlan_tag_present(skb)) && + is_ipv4_pkt(skb)) { + ip = (struct iphdr *)ip_hdr(skb); + pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len)); + } + + /* If vlan tag is already inlined in the packet, skip HW VLAN + * tagging in pvid-tagging mode + */ + if (be_pvid_tagging_enabled(adapter) && + veh->h_vlan_proto == htons(ETH_P_8021Q)) + BE_WRB_F_SET(wrb_params->features, VLAN_SKIP_HW, 1); + + /* HW has a bug wherein it will calculate CSUM for VLAN + * pkts even though it is disabled. + * Manually insert VLAN in pkt. + */ + if (skb->ip_summed != CHECKSUM_PARTIAL && + skb_vlan_tag_present(skb)) { + skb = be_insert_vlan_in_pkt(adapter, skb, wrb_params); + if (unlikely(!skb)) + goto err; + } + + /* HW may lockup when VLAN HW tagging is requested on + * certain ipv6 packets. Drop such pkts if the HW workaround to + * skip HW tagging is not enabled by FW. + */ + if (unlikely(be_ipv6_tx_stall_chk(adapter, skb) && + (adapter->pvid || adapter->qnq_vid) && + !qnq_async_evt_rcvd(adapter))) + goto tx_drop; + + /* Manual VLAN tag insertion to prevent: + * ASIC lockup when the ASIC inserts VLAN tag into + * certain ipv6 packets. Insert VLAN tags in driver, + * and set event, completion, vlan bits accordingly + * in the Tx WRB. + */ + if (be_ipv6_tx_stall_chk(adapter, skb) && + be_vlan_tag_tx_chk(adapter, skb)) { + skb = be_insert_vlan_in_pkt(adapter, skb, wrb_params); + if (unlikely(!skb)) + goto err; + } + + return skb; +tx_drop: + dev_kfree_skb_any(skb); +err: + return NULL; +} + +static struct sk_buff *be_xmit_workarounds(struct be_adapter *adapter, + struct sk_buff *skb, + struct be_wrb_params *wrb_params) +{ + int err; + + /* Lancer, SH and BE3 in SRIOV mode have a bug wherein + * packets that are 32b or less may cause a transmit stall + * on that port. The workaround is to pad such packets + * (len <= 32 bytes) to a minimum length of 36b. + */ + if (skb->len <= 32) { + if (skb_put_padto(skb, 36)) + return NULL; + } + + if (BEx_chip(adapter) || lancer_chip(adapter)) { + skb = be_lancer_xmit_workarounds(adapter, skb, wrb_params); + if (!skb) + return NULL; + } + + /* The stack can send us skbs with length greater than + * what the HW can handle. Trim the extra bytes. 
+ */ + WARN_ON_ONCE(skb->len > BE_MAX_GSO_SIZE); + err = pskb_trim(skb, BE_MAX_GSO_SIZE); + WARN_ON(err); + + return skb; +} + +static void be_xmit_flush(struct be_adapter *adapter, struct be_tx_obj *txo) +{ + struct be_queue_info *txq = &txo->q; + struct be_eth_hdr_wrb *hdr = queue_index_node(txq, txo->last_req_hdr); + + /* Mark the last request eventable if it hasn't been marked already */ + if (!(hdr->dw[2] & cpu_to_le32(TX_HDR_WRB_EVT))) + hdr->dw[2] |= cpu_to_le32(TX_HDR_WRB_EVT | TX_HDR_WRB_COMPL); + + /* compose a dummy wrb if there are odd set of wrbs to notify */ + if (!lancer_chip(adapter) && (txo->pend_wrb_cnt & 1)) { + wrb_fill_dummy(queue_head_node(txq)); + queue_head_inc(txq); + atomic_inc(&txq->used); + txo->pend_wrb_cnt++; + hdr->dw[2] &= ~cpu_to_le32(TX_HDR_WRB_NUM_MASK << + TX_HDR_WRB_NUM_SHIFT); + hdr->dw[2] |= cpu_to_le32((txo->last_req_wrb_cnt + 1) << + TX_HDR_WRB_NUM_SHIFT); + } + be_txq_notify(adapter, txo, txo->pend_wrb_cnt); + txo->pend_wrb_cnt = 0; +} + +/* OS2BMC related */ + +#define DHCP_CLIENT_PORT 68 +#define DHCP_SERVER_PORT 67 +#define NET_BIOS_PORT1 137 +#define NET_BIOS_PORT2 138 +#define DHCPV6_RAS_PORT 547 + +#define is_mc_allowed_on_bmc(adapter, eh) \ + (!is_multicast_filt_enabled(adapter) && \ + is_multicast_ether_addr(eh->h_dest) && \ + !is_broadcast_ether_addr(eh->h_dest)) + +#define is_bc_allowed_on_bmc(adapter, eh) \ + (!is_broadcast_filt_enabled(adapter) && \ + is_broadcast_ether_addr(eh->h_dest)) + +#define is_arp_allowed_on_bmc(adapter, skb) \ + (is_arp(skb) && is_arp_filt_enabled(adapter)) + +#define is_broadcast_packet(eh, adapter) \ + (is_multicast_ether_addr(eh->h_dest) && \ + !compare_ether_addr(eh->h_dest, adapter->netdev->broadcast)) + +#define is_arp(skb) (skb->protocol == htons(ETH_P_ARP)) + +#define is_arp_filt_enabled(adapter) \ + (adapter->bmc_filt_mask & (BMC_FILT_BROADCAST_ARP)) + +#define is_dhcp_client_filt_enabled(adapter) \ + (adapter->bmc_filt_mask & BMC_FILT_BROADCAST_DHCP_CLIENT) + +#define is_dhcp_srvr_filt_enabled(adapter) \ + (adapter->bmc_filt_mask & BMC_FILT_BROADCAST_DHCP_SERVER) + +#define is_nbios_filt_enabled(adapter) \ + (adapter->bmc_filt_mask & BMC_FILT_BROADCAST_NET_BIOS) + +#define is_ipv6_na_filt_enabled(adapter) \ + (adapter->bmc_filt_mask & \ + BMC_FILT_MULTICAST_IPV6_NEIGH_ADVER) + +#define is_ipv6_ra_filt_enabled(adapter) \ + (adapter->bmc_filt_mask & BMC_FILT_MULTICAST_IPV6_RA) + +#define is_ipv6_ras_filt_enabled(adapter) \ + (adapter->bmc_filt_mask & BMC_FILT_MULTICAST_IPV6_RAS) + +#define is_broadcast_filt_enabled(adapter) \ + (adapter->bmc_filt_mask & BMC_FILT_BROADCAST) + +#define is_multicast_filt_enabled(adapter) \ + (adapter->bmc_filt_mask & BMC_FILT_MULTICAST) + +static bool be_send_pkt_to_bmc(struct be_adapter *adapter, + struct sk_buff **skb) +{ + struct ethhdr *eh = (struct ethhdr *)(*skb)->data; + bool os2bmc = false; + + if (!be_is_os2bmc_enabled(adapter)) + goto done; + + if (!is_multicast_ether_addr(eh->h_dest)) + goto done; + + if (is_mc_allowed_on_bmc(adapter, eh) || + is_bc_allowed_on_bmc(adapter, eh) || + is_arp_allowed_on_bmc(adapter, (*skb))) { + os2bmc = true; + goto done; + } + + if ((*skb)->protocol == htons(ETH_P_IPV6)) { + struct ipv6hdr *hdr = ipv6_hdr((*skb)); + u8 nexthdr = hdr->nexthdr; + + if (nexthdr == IPPROTO_ICMPV6) { + struct icmp6hdr *icmp6 = icmp6_hdr((*skb)); + + switch (icmp6->icmp6_type) { + case NDISC_ROUTER_ADVERTISEMENT: + os2bmc = is_ipv6_ra_filt_enabled(adapter); + goto done; + case NDISC_NEIGHBOUR_ADVERTISEMENT: + os2bmc = is_ipv6_na_filt_enabled(adapter); + 
goto done; + default: + break; + } + } + } + + if (is_udp_pkt((*skb))) { + struct udphdr *udp = udp_hdr((*skb)); + + switch (ntohs(udp->dest)) { + case DHCP_CLIENT_PORT: + os2bmc = is_dhcp_client_filt_enabled(adapter); + goto done; + case DHCP_SERVER_PORT: + os2bmc = is_dhcp_srvr_filt_enabled(adapter); + goto done; + case NET_BIOS_PORT1: + case NET_BIOS_PORT2: + os2bmc = is_nbios_filt_enabled(adapter); + goto done; + case DHCPV6_RAS_PORT: + os2bmc = is_ipv6_ras_filt_enabled(adapter); + goto done; + default: + break; + } + } +done: + /* For packets over a vlan, which are destined + * to BMC, asic expects the vlan to be inline in the packet. + */ + if (os2bmc) + *skb = be_insert_vlan_in_pkt(adapter, *skb, NULL); + + return os2bmc; +} + +static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + u16 q_idx = skb_get_queue_mapping(skb); + struct be_tx_obj *txo = &adapter->tx_obj[q_idx]; + struct be_wrb_params wrb_params = { 0 }; + bool flush = !skb->xmit_more; + u16 wrb_cnt; + + skb = be_xmit_workarounds(adapter, skb, &wrb_params); + if (unlikely(!skb)) + goto drop; + + be_get_wrb_params_from_skb(adapter, skb, &wrb_params); + + wrb_cnt = be_xmit_enqueue(adapter, txo, skb, &wrb_params); + if (unlikely(!wrb_cnt)) { + dev_kfree_skb_any(skb); + goto drop; + } + + /* if os2bmc is enabled and if the pkt is destined to bmc, + * enqueue the pkt a 2nd time with mgmt bit set. + */ + if (be_send_pkt_to_bmc(adapter, &skb)) { + BE_WRB_F_SET(wrb_params.features, OS2BMC, 1); + wrb_cnt = be_xmit_enqueue(adapter, txo, skb, &wrb_params); + if (unlikely(!wrb_cnt)) + goto drop; + else + skb_get(skb); + } + + if (be_is_txq_full(txo)) { + netif_stop_subqueue(netdev, q_idx); + tx_stats(txo)->tx_stops++; + } + + if (flush || __netif_subqueue_stopped(netdev, q_idx)) + be_xmit_flush(adapter, txo); + + return NETDEV_TX_OK; +drop: + tx_stats(txo)->tx_drv_drops++; + /* Flush the already enqueued tx requests */ + if (flush && txo->pend_wrb_cnt) + be_xmit_flush(adapter, txo); + + return NETDEV_TX_OK; +} + +static inline bool be_in_all_promisc(struct be_adapter *adapter) +{ + return (adapter->if_flags & BE_IF_FLAGS_ALL_PROMISCUOUS) == + BE_IF_FLAGS_ALL_PROMISCUOUS; +} + +static int be_set_vlan_promisc(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + int status; + + if (adapter->if_flags & BE_IF_FLAGS_VLAN_PROMISCUOUS) + return 0; + + status = be_cmd_rx_filter(adapter, BE_IF_FLAGS_VLAN_PROMISCUOUS, ON); + if (!status) { + dev_info(dev, "Enabled VLAN promiscuous mode\n"); + adapter->if_flags |= BE_IF_FLAGS_VLAN_PROMISCUOUS; + } else { + dev_err(dev, "Failed to enable VLAN promiscuous mode\n"); + } + return status; +} + +static int be_clear_vlan_promisc(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + int status; + + status = be_cmd_rx_filter(adapter, BE_IF_FLAGS_VLAN_PROMISCUOUS, OFF); + if (!status) { + dev_info(dev, "Disabling VLAN promiscuous mode\n"); + adapter->if_flags &= ~BE_IF_FLAGS_VLAN_PROMISCUOUS; + } + return status; +} + +/* + * A max of 64 (BE_NUM_VLANS_SUPPORTED) vlans can be configured in BE. + * If the user configures more, place BE in vlan promiscuous mode. 
+ */ +static int be_vid_config(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + u16 vids[BE_NUM_VLANS_SUPPORTED]; + u16 num = 0, i = 0; + int status = 0; + + /* No need to change the VLAN state if the I/F is in promiscuous */ + if (adapter->netdev->flags & IFF_PROMISC) + return 0; + + if (adapter->vlans_added > be_max_vlans(adapter)) + return be_set_vlan_promisc(adapter); + + if (adapter->if_flags & BE_IF_FLAGS_VLAN_PROMISCUOUS) { + status = be_clear_vlan_promisc(adapter); + if (status) + return status; + } + /* Construct VLAN Table to give to HW */ + for_each_set_bit(i, adapter->vids, VLAN_N_VID) + vids[num++] = cpu_to_le16(i); + + status = be_cmd_vlan_config(adapter, adapter->if_handle, vids, num, 0); + if (status) { + dev_err(dev, "Setting HW VLAN filtering failed\n"); + /* Set to VLAN promisc mode as setting VLAN filter failed */ + if (addl_status(status) == MCC_ADDL_STATUS_INSUFFICIENT_VLANS || + addl_status(status) == + MCC_ADDL_STATUS_INSUFFICIENT_RESOURCES) + return be_set_vlan_promisc(adapter); + } + return status; +} + +static int be_vlan_add_vid(struct net_device *netdev, __be16 proto, u16 vid) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status = 0; + + mutex_lock(&adapter->rx_filter_lock); + + /* Packets with VID 0 are always received by Lancer by default */ + if (lancer_chip(adapter) && vid == 0) + goto done; + + if (test_bit(vid, adapter->vids)) + goto done; + + set_bit(vid, adapter->vids); + adapter->vlans_added++; + + status = be_vid_config(adapter); +done: + mutex_unlock(&adapter->rx_filter_lock); + return status; +} + +static int be_vlan_rem_vid(struct net_device *netdev, __be16 proto, u16 vid) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status = 0; + + mutex_lock(&adapter->rx_filter_lock); + + /* Packets with VID 0 are always received by Lancer by default */ + if (lancer_chip(adapter) && vid == 0) + goto done; + + if (!test_bit(vid, adapter->vids)) + goto done; + + clear_bit(vid, adapter->vids); + adapter->vlans_added--; + + status = be_vid_config(adapter); +done: + mutex_unlock(&adapter->rx_filter_lock); + return status; +} + +static void be_set_all_promisc(struct be_adapter *adapter) +{ + be_cmd_rx_filter(adapter, BE_IF_FLAGS_ALL_PROMISCUOUS, ON); + adapter->if_flags |= BE_IF_FLAGS_ALL_PROMISCUOUS; +} + +static void be_set_mc_promisc(struct be_adapter *adapter) +{ + int status; + + if (adapter->if_flags & BE_IF_FLAGS_MCAST_PROMISCUOUS) + return; + + status = be_cmd_rx_filter(adapter, BE_IF_FLAGS_MCAST_PROMISCUOUS, ON); + if (!status) + adapter->if_flags |= BE_IF_FLAGS_MCAST_PROMISCUOUS; +} + +static void be_set_uc_promisc(struct be_adapter *adapter) +{ + int status; + + if (adapter->if_flags & BE_IF_FLAGS_PROMISCUOUS) + return; + + status = be_cmd_rx_filter(adapter, BE_IF_FLAGS_PROMISCUOUS, ON); + if (!status) + adapter->if_flags |= BE_IF_FLAGS_PROMISCUOUS; +} + +static void be_clear_uc_promisc(struct be_adapter *adapter) +{ + int status; + + if (!(adapter->if_flags & BE_IF_FLAGS_PROMISCUOUS)) + return; + + status = be_cmd_rx_filter(adapter, BE_IF_FLAGS_PROMISCUOUS, OFF); + if (!status) + adapter->if_flags &= ~BE_IF_FLAGS_PROMISCUOUS; +} + +/* The below 2 functions are the callback args for __dev_mc_sync/dev_uc_sync(). + * We use a single callback function for both sync and unsync. We really don't + * add/remove addresses through this callback. But, we use it to detect changes + * to the uc/mc lists. The entire uc/mc list is programmed in be_set_rx_mode(). 
+ */ +static int be_uc_list_update(struct net_device *netdev, + const unsigned char *addr) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + adapter->update_uc_list = true; + return 0; +} + +static int be_mc_list_update(struct net_device *netdev, + const unsigned char *addr) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + adapter->update_mc_list = true; + return 0; +} + +static void be_set_mc_list(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct netdev_hw_addr *ha; + bool mc_promisc = false; + int status; + + netif_addr_lock_bh(netdev); + __dev_mc_sync(netdev, be_mc_list_update, be_mc_list_update); + + if (netdev->flags & IFF_PROMISC) { + adapter->update_mc_list = false; + } else if (netdev->flags & IFF_ALLMULTI || + netdev_mc_count(netdev) > be_max_mc(adapter)) { + /* Enable multicast promisc if num configured exceeds + * what we support + */ + mc_promisc = true; + adapter->update_mc_list = false; + } else if (adapter->if_flags & BE_IF_FLAGS_MCAST_PROMISCUOUS) { + /* Update mc-list unconditionally if the iface was previously + * in mc-promisc mode and now is out of that mode. + */ + adapter->update_mc_list = true; + } + + if (adapter->update_mc_list) { + int i = 0; + + /* cache the mc-list in adapter */ + netdev_for_each_mc_addr(ha, netdev) { + ether_addr_copy(adapter->mc_list[i].mac, ha->addr); + i++; + } + adapter->mc_count = netdev_mc_count(netdev); + } + netif_addr_unlock_bh(netdev); + + if (mc_promisc) { + be_set_mc_promisc(adapter); + } else if (adapter->update_mc_list) { + status = be_cmd_rx_filter(adapter, BE_IF_FLAGS_MULTICAST, ON); + if (!status) + adapter->if_flags &= ~BE_IF_FLAGS_MCAST_PROMISCUOUS; + else + be_set_mc_promisc(adapter); + + adapter->update_mc_list = false; + } +} + +static void be_clear_mc_list(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + + __dev_mc_unsync(netdev, NULL); + be_cmd_rx_filter(adapter, BE_IF_FLAGS_MULTICAST, OFF); + adapter->mc_count = 0; +} + +static int be_uc_mac_add(struct be_adapter *adapter, int uc_idx) +{ + if (ether_addr_equal(adapter->uc_list[uc_idx].mac, adapter->dev_mac)) { + adapter->pmac_id[uc_idx + 1] = adapter->pmac_id[0]; + return 0; + } + + return be_cmd_pmac_add(adapter, adapter->uc_list[uc_idx].mac, + adapter->if_handle, + &adapter->pmac_id[uc_idx + 1], 0); +} + +static void be_uc_mac_del(struct be_adapter *adapter, int pmac_id) +{ + if (pmac_id == adapter->pmac_id[0]) + return; + + be_cmd_pmac_del(adapter, adapter->if_handle, pmac_id, 0); +} + +static void be_set_uc_list(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct netdev_hw_addr *ha; + bool uc_promisc = false; + int curr_uc_macs = 0, i; + + netif_addr_lock_bh(netdev); + __dev_uc_sync(netdev, be_uc_list_update, be_uc_list_update); + + if (netdev->flags & IFF_PROMISC) { + adapter->update_uc_list = false; + } else if (netdev_uc_count(netdev) > (be_max_uc(adapter) - 1)) { + uc_promisc = true; + adapter->update_uc_list = false; + } else if (adapter->if_flags & BE_IF_FLAGS_PROMISCUOUS) { + /* Update uc-list unconditionally if the iface was previously + * in uc-promisc mode and now is out of that mode. 
+ */ + adapter->update_uc_list = true; + } + + if (adapter->update_uc_list) { + /* cache the uc-list in adapter array */ + i = 0; + netdev_for_each_uc_addr(ha, netdev) { + ether_addr_copy(adapter->uc_list[i].mac, ha->addr); + i++; + } + curr_uc_macs = netdev_uc_count(netdev); + } + netif_addr_unlock_bh(netdev); + + if (uc_promisc) { + be_set_uc_promisc(adapter); + } else if (adapter->update_uc_list) { + be_clear_uc_promisc(adapter); + + for (i = 0; i < adapter->uc_macs; i++) + be_uc_mac_del(adapter, adapter->pmac_id[i + 1]); + + for (i = 0; i < curr_uc_macs; i++) + be_uc_mac_add(adapter, i); + adapter->uc_macs = curr_uc_macs; + adapter->update_uc_list = false; + } +} + +static void be_clear_uc_list(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int i; + + __dev_uc_unsync(netdev, NULL); + for (i = 0; i < adapter->uc_macs; i++) + be_uc_mac_del(adapter, adapter->pmac_id[i + 1]); + + adapter->uc_macs = 0; +} + +static void __be_set_rx_mode(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + + mutex_lock(&adapter->rx_filter_lock); + + if (netdev->flags & IFF_PROMISC) { + if (!be_in_all_promisc(adapter)) + be_set_all_promisc(adapter); + } else if (be_in_all_promisc(adapter)) { + /* We need to re-program the vlan-list or clear + * vlan-promisc mode (if needed) when the interface + * comes out of promisc mode. + */ + be_vid_config(adapter); + } + + be_set_uc_list(adapter); + be_set_mc_list(adapter); + + mutex_unlock(&adapter->rx_filter_lock); +} + +static void be_work_set_rx_mode(struct work_struct *work) +{ + struct be_cmd_work *cmd_work = + container_of(work, struct be_cmd_work, work); + + __be_set_rx_mode(cmd_work->adapter); + kfree(cmd_work); +} + +static int be_set_vf_mac(struct net_device *netdev, int vf, u8 *mac) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + int status; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (!is_valid_ether_addr(mac) || vf >= adapter->num_vfs) + return -EINVAL; + + /* Proceed further only if user provided MAC is different + * from active MAC + */ + if (ether_addr_equal(mac, vf_cfg->mac_addr)) + return 0; + + if (BEx_chip(adapter)) { + be_cmd_pmac_del(adapter, vf_cfg->if_handle, vf_cfg->pmac_id, + vf + 1); + + status = be_cmd_pmac_add(adapter, mac, vf_cfg->if_handle, + &vf_cfg->pmac_id, vf + 1); + } else { + status = be_cmd_set_mac(adapter, mac, vf_cfg->if_handle, + vf + 1); + } + + if (status) { + dev_err(&adapter->pdev->dev, "MAC %pM set on VF %d Failed: %#x", + mac, vf, status); + return be_cmd_status(status); + } + + ether_addr_copy(vf_cfg->mac_addr, mac); + + return 0; +} + +static int be_get_vf_config(struct net_device *netdev, int vf, + struct ifla_vf_info *vi) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (vf >= adapter->num_vfs) + return -EINVAL; + + vi->vf = vf; + vi->max_tx_rate = vf_cfg->tx_rate; + vi->min_tx_rate = 0; + vi->vlan = vf_cfg->vlan_tag & VLAN_VID_MASK; + vi->qos = vf_cfg->vlan_tag >> VLAN_PRIO_SHIFT; + memcpy(&vi->mac, vf_cfg->mac_addr, ETH_ALEN); + vi->linkstate = adapter->vf_cfg[vf].plink_tracking; + vi->spoofchk = adapter->vf_cfg[vf].spoofchk; + + return 0; +} + +static int be_set_vf_tvt(struct be_adapter *adapter, int vf, u16 vlan) +{ + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + u16 vids[BE_NUM_VLANS_SUPPORTED]; + int vf_if_id = vf_cfg->if_handle; + int status; + + /* Enable 
Transparent VLAN Tagging */ + status = be_cmd_set_hsw_config(adapter, vlan, vf + 1, vf_if_id, 0, 0); + if (status) + return status; + + /* Clear pre-programmed VLAN filters on VF if any, if TVT is enabled */ + vids[0] = 0; + status = be_cmd_vlan_config(adapter, vf_if_id, vids, 1, vf + 1); + if (!status) + dev_info(&adapter->pdev->dev, + "Cleared guest VLANs on VF%d", vf); + + /* After TVT is enabled, disallow VFs to program VLAN filters */ + if (vf_cfg->privileges & BE_PRIV_FILTMGMT) { + status = be_cmd_set_fn_privileges(adapter, vf_cfg->privileges & + ~BE_PRIV_FILTMGMT, vf + 1); + if (!status) + vf_cfg->privileges &= ~BE_PRIV_FILTMGMT; + } + return 0; +} + +static int be_clear_vf_tvt(struct be_adapter *adapter, int vf) +{ + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + struct device *dev = &adapter->pdev->dev; + int status; + + /* Reset Transparent VLAN Tagging. */ + status = be_cmd_set_hsw_config(adapter, BE_RESET_VLAN_TAG_ID, vf + 1, + vf_cfg->if_handle, 0, 0); + if (status) + return status; + + /* Allow VFs to program VLAN filtering */ + if (!(vf_cfg->privileges & BE_PRIV_FILTMGMT)) { + status = be_cmd_set_fn_privileges(adapter, vf_cfg->privileges | + BE_PRIV_FILTMGMT, vf + 1); + if (!status) { + vf_cfg->privileges |= BE_PRIV_FILTMGMT; + dev_info(dev, "VF%d: FILTMGMT priv enabled", vf); + } + } + + dev_info(dev, + "Disable/re-enable i/f in VM to clear Transparent VLAN tag"); + return 0; +} + +static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + int status; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (vf >= adapter->num_vfs || vlan > 4095 || qos > 7) + return -EINVAL; + + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + + if (vlan || qos) { + vlan |= qos << VLAN_PRIO_SHIFT; + status = be_set_vf_tvt(adapter, vf, vlan); + } else { + status = be_clear_vf_tvt(adapter, vf); + } + + if (status) { + dev_err(&adapter->pdev->dev, + "VLAN %d config on VF %d failed : %#x\n", vlan, vf, + status); + return be_cmd_status(status); + } + + vf_cfg->vlan_tag = vlan; + return 0; +} + +static int be_set_vf_tx_rate(struct net_device *netdev, int vf, + int min_tx_rate, int max_tx_rate) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct device *dev = &adapter->pdev->dev; + int percent_rate, status = 0; + u16 link_speed = 0; + u8 link_status; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (vf >= adapter->num_vfs) + return -EINVAL; + + if (min_tx_rate) + return -EINVAL; + + if (!max_tx_rate) + goto config_qos; + + status = be_cmd_link_status_query(adapter, &link_speed, + &link_status, 0); + if (status) + goto err; + + if (!link_status) { + dev_err(dev, "TX-rate setting not allowed when link is down\n"); + status = -ENETDOWN; + goto err; + } + + if (max_tx_rate < 100 || max_tx_rate > link_speed) { + dev_err(dev, "TX-rate must be between 100 and %d Mbps\n", + link_speed); + status = -EINVAL; + goto err; + } + + /* On Skyhawk the QOS setting must be done only as a % value */ + percent_rate = link_speed / 100; + if (skyhawk_chip(adapter) && (max_tx_rate % percent_rate)) { + dev_err(dev, "TX-rate must be a multiple of %d Mbps\n", + percent_rate); + status = -EINVAL; + goto err; + } + +config_qos: + status = be_cmd_config_qos(adapter, max_tx_rate, link_speed, vf + 1); + if (status) + goto err; + + adapter->vf_cfg[vf].tx_rate = max_tx_rate; + return 0; + +err: + dev_err(dev, "TX-rate setting of %dMbps on 
VF%d failed\n", + max_tx_rate, vf); + return be_cmd_status(status); +} + +static int be_set_vf_link_state(struct net_device *netdev, int vf, + int link_state) +{ + struct be_adapter *adapter = netdev_priv(netdev); + int status; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (vf >= adapter->num_vfs) + return -EINVAL; + + status = be_cmd_set_logical_link_config(adapter, link_state, vf+1); + if (status) { + dev_err(&adapter->pdev->dev, + "Link state change on VF %d failed: %#x\n", vf, status); + return be_cmd_status(status); + } + + adapter->vf_cfg[vf].plink_tracking = link_state; + + return 0; +} + +static int be_set_vf_spoofchk(struct net_device *netdev, int vf, bool enable) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf]; + u8 spoofchk; + int status; + + if (!sriov_enabled(adapter)) + return -EPERM; + + if (vf >= adapter->num_vfs) + return -EINVAL; + + if (BEx_chip(adapter)) + return -EOPNOTSUPP; + + if (enable == vf_cfg->spoofchk) + return 0; + + spoofchk = enable ? ENABLE_MAC_SPOOFCHK : DISABLE_MAC_SPOOFCHK; + + status = be_cmd_set_hsw_config(adapter, 0, vf + 1, vf_cfg->if_handle, + 0, spoofchk); + if (status) { + dev_err(&adapter->pdev->dev, + "Spoofchk change on VF %d failed: %#x\n", vf, status); + return be_cmd_status(status); + } + + vf_cfg->spoofchk = enable; + return 0; +} + +static void be_aic_update(struct be_aic_obj *aic, u64 rx_pkts, u64 tx_pkts, + ulong now) +{ + aic->rx_pkts_prev = rx_pkts; + aic->tx_reqs_prev = tx_pkts; + aic->jiffies = now; +} + +static int be_get_new_eqd(struct be_eq_obj *eqo) +{ + struct be_adapter *adapter = eqo->adapter; + int eqd, start; + struct be_aic_obj *aic; + struct be_rx_obj *rxo; + struct be_tx_obj *txo; + u64 rx_pkts = 0, tx_pkts = 0; + ulong now; + u32 pps, delta; + int i; + + aic = &adapter->aic_obj[eqo->idx]; + if (!aic->enable) { + if (aic->jiffies) + aic->jiffies = 0; + eqd = aic->et_eqd; + return eqd; + } + + for_all_rx_queues_on_eq(adapter, eqo, rxo, i) { + do { + start = u64_stats_fetch_begin_irq(&rxo->stats.sync); + rx_pkts += rxo->stats.rx_pkts; + } while (u64_stats_fetch_retry_irq(&rxo->stats.sync, start)); + } + + for_all_tx_queues_on_eq(adapter, eqo, txo, i) { + do { + start = u64_stats_fetch_begin_irq(&txo->stats.sync); + tx_pkts += txo->stats.tx_reqs; + } while (u64_stats_fetch_retry_irq(&txo->stats.sync, start)); + } + + /* Skip, if wrapped around or first calculation */ + now = jiffies; + if (!aic->jiffies || time_before(now, aic->jiffies) || + rx_pkts < aic->rx_pkts_prev || + tx_pkts < aic->tx_reqs_prev) { + be_aic_update(aic, rx_pkts, tx_pkts, now); + return aic->prev_eqd; + } + + delta = jiffies_to_msecs(now - aic->jiffies); + if (delta == 0) + return aic->prev_eqd; + + pps = (((u32)(rx_pkts - aic->rx_pkts_prev) * 1000) / delta) + + (((u32)(tx_pkts - aic->tx_reqs_prev) * 1000) / delta); + eqd = (pps / 15000) << 2; + + if (eqd < 8) + eqd = 0; + eqd = min_t(u32, eqd, aic->max_eqd); + eqd = max_t(u32, eqd, aic->min_eqd); + + be_aic_update(aic, rx_pkts, tx_pkts, now); + + return eqd; +} + +/* For Skyhawk-R only */ +static u32 be_get_eq_delay_mult_enc(struct be_eq_obj *eqo) +{ + struct be_adapter *adapter = eqo->adapter; + struct be_aic_obj *aic = &adapter->aic_obj[eqo->idx]; + ulong now = jiffies; + int eqd; + u32 mult_enc; + + if (!aic->enable) + return 0; + + if (jiffies_to_msecs(now - aic->jiffies) < 1) + eqd = aic->prev_eqd; + else + eqd = be_get_new_eqd(eqo); + + if (eqd > 100) + mult_enc = R2I_DLY_ENC_1; + else if (eqd > 60) + mult_enc = R2I_DLY_ENC_2; + 
else if (eqd > 20) + mult_enc = R2I_DLY_ENC_3; + else + mult_enc = R2I_DLY_ENC_0; + + aic->prev_eqd = eqd; + + return mult_enc; +} + +void be_eqd_update(struct be_adapter *adapter, bool force_update) +{ + struct be_set_eqd set_eqd[MAX_EVT_QS]; + struct be_aic_obj *aic; + struct be_eq_obj *eqo; + int i, num = 0, eqd; + + for_all_evt_queues(adapter, eqo, i) { + aic = &adapter->aic_obj[eqo->idx]; + eqd = be_get_new_eqd(eqo); + if (force_update || eqd != aic->prev_eqd) { + set_eqd[num].delay_multiplier = (eqd * 65)/100; + set_eqd[num].eq_id = eqo->q.id; + aic->prev_eqd = eqd; + num++; + } + } + + if (num) + be_cmd_modify_eqd(adapter, set_eqd, num); +} + +static void be_rx_stats_update(struct be_rx_obj *rxo, + struct be_rx_compl_info *rxcp) +{ + struct be_rx_stats *stats = rx_stats(rxo); + + u64_stats_update_begin(&stats->sync); + stats->rx_compl++; + stats->rx_bytes += rxcp->pkt_size; + stats->rx_pkts++; + if (rxcp->tunneled) + stats->rx_vxlan_offload_pkts++; + if (rxcp->pkt_type == BE_MULTICAST_PACKET) + stats->rx_mcast_pkts++; + if (rxcp->err) + stats->rx_compl_err++; + u64_stats_update_end(&stats->sync); +} + +static inline bool csum_passed(struct be_rx_compl_info *rxcp) +{ + /* L4 checksum is not reliable for non-TCP/UDP packets. + * Also ignore ipcksm for ipv6 pkts + */ + return (rxcp->tcpf || rxcp->udpf) && rxcp->l4_csum && + (rxcp->ip_csum || rxcp->ipv6) && !rxcp->err; +} + +static struct be_rx_page_info *get_rx_page_info(struct be_rx_obj *rxo) +{ + struct be_adapter *adapter = rxo->adapter; + struct be_rx_page_info *rx_page_info; + struct be_queue_info *rxq = &rxo->q; + u32 frag_idx = rxq->tail; + + rx_page_info = &rxo->page_info_tbl[frag_idx]; + BUG_ON(!rx_page_info->page); + + if (rx_page_info->last_frag) { + dma_unmap_page(&adapter->pdev->dev, + dma_unmap_addr(rx_page_info, bus), + adapter->big_page_size, DMA_FROM_DEVICE); + rx_page_info->last_frag = false; + } else { + dma_sync_single_for_cpu(&adapter->pdev->dev, + dma_unmap_addr(rx_page_info, bus), + rx_frag_size, DMA_FROM_DEVICE); + } + + queue_tail_inc(rxq); + atomic_dec(&rxq->used); + return rx_page_info; +} + +/* Throw away the data in the Rx completion */ +static void be_rx_compl_discard(struct be_rx_obj *rxo, + struct be_rx_compl_info *rxcp) +{ + struct be_rx_page_info *page_info; + u16 i, num_rcvd = rxcp->num_rcvd; + + for (i = 0; i < num_rcvd; i++) { + page_info = get_rx_page_info(rxo); + put_page(page_info->page); + memset(page_info, 0, sizeof(*page_info)); + } +} + +/* + * skb_fill_rx_data forms a complete skb for an ether frame + * indicated by rxcp.
+ */ +static void skb_fill_rx_data(struct be_rx_obj *rxo, struct sk_buff *skb, + struct be_rx_compl_info *rxcp) +{ + struct be_rx_page_info *page_info; + u16 i, j; + u16 hdr_len, curr_frag_len, remaining; + u8 *start; + + page_info = get_rx_page_info(rxo); + start = page_address(page_info->page) + page_info->page_offset; + prefetch(start); + + /* Copy data in the first descriptor of this completion */ + curr_frag_len = min(rxcp->pkt_size, rx_frag_size); + + skb->len = curr_frag_len; + if (curr_frag_len <= BE_HDR_LEN) { /* tiny packet */ + memcpy(skb->data, start, curr_frag_len); + /* Complete packet has now been moved to data */ + put_page(page_info->page); + skb->data_len = 0; + skb->tail += curr_frag_len; + } else { + hdr_len = ETH_HLEN; + memcpy(skb->data, start, hdr_len); + skb_shinfo(skb)->nr_frags = 1; + skb_frag_set_page(skb, 0, page_info->page); + skb_shinfo(skb)->frags[0].page_offset = + page_info->page_offset + hdr_len; + skb_frag_size_set(&skb_shinfo(skb)->frags[0], + curr_frag_len - hdr_len); + skb->data_len = curr_frag_len - hdr_len; + skb->truesize += rx_frag_size; + skb->tail += hdr_len; + } + page_info->page = NULL; + + if (rxcp->pkt_size <= rx_frag_size) { + BUG_ON(rxcp->num_rcvd != 1); + return; + } + + /* More frags present for this completion */ + remaining = rxcp->pkt_size - curr_frag_len; + for (i = 1, j = 0; i < rxcp->num_rcvd; i++) { + page_info = get_rx_page_info(rxo); + curr_frag_len = min(remaining, rx_frag_size); + + /* Coalesce all frags from the same physical page in one slot */ + if (page_info->page_offset == 0) { + /* Fresh page */ + j++; + skb_frag_set_page(skb, j, page_info->page); + skb_shinfo(skb)->frags[j].page_offset = + page_info->page_offset; + skb_frag_size_set(&skb_shinfo(skb)->frags[j], 0); + skb_shinfo(skb)->nr_frags++; + } else { + put_page(page_info->page); + } + + skb_frag_size_add(&skb_shinfo(skb)->frags[j], curr_frag_len); + skb->len += curr_frag_len; + skb->data_len += curr_frag_len; + skb->truesize += rx_frag_size; + remaining -= curr_frag_len; + page_info->page = NULL; + } + BUG_ON(j > MAX_SKB_FRAGS); +} + +/* Process the RX completion indicated by rxcp when GRO is disabled */ +static void be_rx_compl_process(struct be_rx_obj *rxo, struct napi_struct *napi, + struct be_rx_compl_info *rxcp) +{ + struct be_adapter *adapter = rxo->adapter; + struct net_device *netdev = adapter->netdev; + struct sk_buff *skb; + + skb = netdev_alloc_skb_ip_align(netdev, BE_RX_SKB_ALLOC_SIZE); + if (unlikely(!skb)) { + rx_stats(rxo)->rx_drops_no_skbs++; + be_rx_compl_discard(rxo, rxcp); + return; + } + + skb_fill_rx_data(rxo, skb, rxcp); + + if (likely((netdev->features & NETIF_F_RXCSUM) && csum_passed(rxcp))) + skb->ip_summed = CHECKSUM_UNNECESSARY; + else + skb_checksum_none_assert(skb); + + skb->protocol = eth_type_trans(skb, netdev); + skb_record_rx_queue(skb, rxo - &adapter->rx_obj[0]); + if (netdev->features & NETIF_F_RXHASH) + skb_set_hash(skb, rxcp->rss_hash, PKT_HASH_TYPE_L3); + + skb->csum_level = rxcp->tunneled; + skb_mark_napi_id(skb, napi); + + if (rxcp->vlanf) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), rxcp->vlan_tag); + + netif_receive_skb(skb); +} + +/* Process the RX completion indicated by rxcp when GRO is enabled */ +static void be_rx_compl_process_gro(struct be_rx_obj *rxo, + struct napi_struct *napi, + struct be_rx_compl_info *rxcp) +{ + struct be_adapter *adapter = rxo->adapter; + struct be_rx_page_info *page_info; + struct sk_buff *skb = NULL; + u16 remaining, curr_frag_len; + u16 i, j; + + skb = napi_get_frags(napi); + if (!skb) { 
+ be_rx_compl_discard(rxo, rxcp); + return; + } + + remaining = rxcp->pkt_size; + for (i = 0, j = -1; i < rxcp->num_rcvd; i++) { + page_info = get_rx_page_info(rxo); + + curr_frag_len = min(remaining, rx_frag_size); + + /* Coalesce all frags from the same physical page in one slot */ + if (i == 0 || page_info->page_offset == 0) { + /* First frag or Fresh page */ + j++; + skb_frag_set_page(skb, j, page_info->page); + skb_shinfo(skb)->frags[j].page_offset = + page_info->page_offset; + skb_frag_size_set(&skb_shinfo(skb)->frags[j], 0); + } else { + put_page(page_info->page); + } + skb_frag_size_add(&skb_shinfo(skb)->frags[j], curr_frag_len); + skb->truesize += rx_frag_size; + remaining -= curr_frag_len; + memset(page_info, 0, sizeof(*page_info)); + } + BUG_ON(j > MAX_SKB_FRAGS); + + skb_shinfo(skb)->nr_frags = j + 1; + skb->len = rxcp->pkt_size; + skb->data_len = rxcp->pkt_size; + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_record_rx_queue(skb, rxo - &adapter->rx_obj[0]); + if (adapter->netdev->features & NETIF_F_RXHASH) + skb_set_hash(skb, rxcp->rss_hash, PKT_HASH_TYPE_L3); + + skb->csum_level = rxcp->tunneled; + + if (rxcp->vlanf) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), rxcp->vlan_tag); + + napi_gro_frags(napi); +} + +static void be_parse_rx_compl_v1(struct be_eth_rx_compl *compl, + struct be_rx_compl_info *rxcp) +{ + rxcp->pkt_size = GET_RX_COMPL_V1_BITS(pktsize, compl); + rxcp->vlanf = GET_RX_COMPL_V1_BITS(vtp, compl); + rxcp->err = GET_RX_COMPL_V1_BITS(err, compl); + rxcp->tcpf = GET_RX_COMPL_V1_BITS(tcpf, compl); + rxcp->udpf = GET_RX_COMPL_V1_BITS(udpf, compl); + rxcp->ip_csum = GET_RX_COMPL_V1_BITS(ipcksm, compl); + rxcp->l4_csum = GET_RX_COMPL_V1_BITS(l4_cksm, compl); + rxcp->ipv6 = GET_RX_COMPL_V1_BITS(ip_version, compl); + rxcp->num_rcvd = GET_RX_COMPL_V1_BITS(numfrags, compl); + rxcp->pkt_type = GET_RX_COMPL_V1_BITS(cast_enc, compl); + rxcp->rss_hash = GET_RX_COMPL_V1_BITS(rsshash, compl); + if (rxcp->vlanf) { + rxcp->qnq = GET_RX_COMPL_V1_BITS(qnq, compl); + rxcp->vlan_tag = GET_RX_COMPL_V1_BITS(vlan_tag, compl); + } + rxcp->port = GET_RX_COMPL_V1_BITS(port, compl); + rxcp->tunneled = + GET_RX_COMPL_V1_BITS(tunneled, compl); +} + +static void be_parse_rx_compl_v0(struct be_eth_rx_compl *compl, + struct be_rx_compl_info *rxcp) +{ + rxcp->pkt_size = GET_RX_COMPL_V0_BITS(pktsize, compl); + rxcp->vlanf = GET_RX_COMPL_V0_BITS(vtp, compl); + rxcp->err = GET_RX_COMPL_V0_BITS(err, compl); + rxcp->tcpf = GET_RX_COMPL_V0_BITS(tcpf, compl); + rxcp->udpf = GET_RX_COMPL_V0_BITS(udpf, compl); + rxcp->ip_csum = GET_RX_COMPL_V0_BITS(ipcksm, compl); + rxcp->l4_csum = GET_RX_COMPL_V0_BITS(l4_cksm, compl); + rxcp->ipv6 = GET_RX_COMPL_V0_BITS(ip_version, compl); + rxcp->num_rcvd = GET_RX_COMPL_V0_BITS(numfrags, compl); + rxcp->pkt_type = GET_RX_COMPL_V0_BITS(cast_enc, compl); + rxcp->rss_hash = GET_RX_COMPL_V0_BITS(rsshash, compl); + if (rxcp->vlanf) { + rxcp->qnq = GET_RX_COMPL_V0_BITS(qnq, compl); + rxcp->vlan_tag = GET_RX_COMPL_V0_BITS(vlan_tag, compl); + } + rxcp->port = GET_RX_COMPL_V0_BITS(port, compl); + rxcp->ip_frag = GET_RX_COMPL_V0_BITS(ip_frag, compl); +} + +static struct be_rx_compl_info *be_rx_compl_get(struct be_rx_obj *rxo) +{ + struct be_eth_rx_compl *compl = queue_tail_node(&rxo->cq); + struct be_rx_compl_info *rxcp = &rxo->rxcp; + struct be_adapter *adapter = rxo->adapter; + + /* For checking the valid bit it is Ok to use either definition as the + * valid bit is at the same position in both v0 and v1 Rx compl */ + if (compl->dw[offsetof(struct amap_eth_rx_compl_v1, 
valid) / 32] == 0) + return NULL; + + rmb(); + be_dws_le_to_cpu(compl, sizeof(*compl)); + + if (adapter->be3_native) + be_parse_rx_compl_v1(compl, rxcp); + else + be_parse_rx_compl_v0(compl, rxcp); + + if (rxcp->ip_frag) + rxcp->l4_csum = 0; + + if (rxcp->vlanf) { + /* In QNQ modes, if qnq bit is not set, then the packet was + * tagged only with the transparent outer vlan-tag and must + * not be treated as a vlan packet by host + */ + if (be_is_qnq_mode(adapter) && !rxcp->qnq) + rxcp->vlanf = 0; + + if (!lancer_chip(adapter)) + rxcp->vlan_tag = swab16(rxcp->vlan_tag); + + if (adapter->pvid == (rxcp->vlan_tag & VLAN_VID_MASK) && + !test_bit(rxcp->vlan_tag, adapter->vids)) + rxcp->vlanf = 0; + } + + /* As the compl has been parsed, reset it; we wont touch it again */ + compl->dw[offsetof(struct amap_eth_rx_compl_v1, valid) / 32] = 0; + + queue_tail_inc(&rxo->cq); + return rxcp; +} + +static inline struct page *be_alloc_pages(u32 size, gfp_t gfp) +{ + u32 order = get_order(size); + + if (order > 0) + gfp |= __GFP_COMP; + return alloc_pages(gfp, order); +} + +/* + * Allocate a page, split it to fragments of size rx_frag_size and post as + * receive buffers to BE + */ +static void be_post_rx_frags(struct be_rx_obj *rxo, gfp_t gfp, u32 frags_needed) +{ + struct be_adapter *adapter = rxo->adapter; + struct be_rx_page_info *page_info = NULL, *prev_page_info = NULL; + struct be_queue_info *rxq = &rxo->q; + struct page *pagep = NULL; + struct device *dev = &adapter->pdev->dev; + struct be_eth_rx_d *rxd; + u64 page_dmaaddr = 0, frag_dmaaddr; + u32 posted, page_offset = 0, notify = 0; + + page_info = &rxo->page_info_tbl[rxq->head]; + for (posted = 0; posted < frags_needed && !page_info->page; posted++) { + if (!pagep) { + pagep = be_alloc_pages(adapter->big_page_size, gfp); + if (unlikely(!pagep)) { + rx_stats(rxo)->rx_post_fail++; + break; + } + page_dmaaddr = dma_map_page(dev, pagep, 0, + adapter->big_page_size, + DMA_FROM_DEVICE); + if (dma_mapping_error(dev, page_dmaaddr)) { + put_page(pagep); + pagep = NULL; + adapter->drv_stats.dma_map_errors++; + break; + } + page_offset = 0; + } else { + get_page(pagep); + page_offset += rx_frag_size; + } + page_info->page_offset = page_offset; + page_info->page = pagep; + + rxd = queue_head_node(rxq); + frag_dmaaddr = page_dmaaddr + page_info->page_offset; + rxd->fragpa_lo = cpu_to_le32(frag_dmaaddr & 0xFFFFFFFF); + rxd->fragpa_hi = cpu_to_le32(upper_32_bits(frag_dmaaddr)); + + /* Any space left in the current big page for another frag? 
*/ + if ((page_offset + rx_frag_size + rx_frag_size) > + adapter->big_page_size) { + pagep = NULL; + page_info->last_frag = true; + dma_unmap_addr_set(page_info, bus, page_dmaaddr); + } else { + dma_unmap_addr_set(page_info, bus, frag_dmaaddr); + } + + prev_page_info = page_info; + queue_head_inc(rxq); + page_info = &rxo->page_info_tbl[rxq->head]; + } + + /* Mark the last frag of a page when we break out of the above loop + * with no more slots available in the RXQ + */ + if (pagep) { + prev_page_info->last_frag = true; + dma_unmap_addr_set(prev_page_info, bus, page_dmaaddr); + } + + if (posted) { + atomic_add(posted, &rxq->used); + if (rxo->rx_post_starved) + rxo->rx_post_starved = false; + do { + notify = min(MAX_NUM_POST_ERX_DB, posted); + be_rxq_notify(adapter, rxq->id, notify); + posted -= notify; + } while (posted); + } else if (atomic_read(&rxq->used) == 0) { + /* Let be_worker replenish when memory is available */ + rxo->rx_post_starved = true; + } +} + +static inline void be_update_tx_err(struct be_tx_obj *txo, u8 status) +{ + switch (status) { + case BE_TX_COMP_HDR_PARSE_ERR: + tx_stats(txo)->tx_hdr_parse_err++; + break; + case BE_TX_COMP_NDMA_ERR: + tx_stats(txo)->tx_dma_err++; + break; + case BE_TX_COMP_ACL_ERR: + tx_stats(txo)->tx_spoof_check_err++; + break; + } +} + +static inline void lancer_update_tx_err(struct be_tx_obj *txo, u8 status) +{ + switch (status) { + case LANCER_TX_COMP_LSO_ERR: + tx_stats(txo)->tx_tso_err++; + break; + case LANCER_TX_COMP_HSW_DROP_MAC_ERR: + case LANCER_TX_COMP_HSW_DROP_VLAN_ERR: + tx_stats(txo)->tx_spoof_check_err++; + break; + case LANCER_TX_COMP_QINQ_ERR: + tx_stats(txo)->tx_qinq_err++; + break; + case LANCER_TX_COMP_PARITY_ERR: + tx_stats(txo)->tx_internal_parity_err++; + break; + case LANCER_TX_COMP_DMA_ERR: + tx_stats(txo)->tx_dma_err++; + break; + case LANCER_TX_COMP_SGE_ERR: + tx_stats(txo)->tx_sge_err++; + break; + } +} + +static struct be_tx_compl_info *be_tx_compl_get(struct be_adapter *adapter, + struct be_tx_obj *txo) +{ + struct be_queue_info *tx_cq = &txo->cq; + struct be_tx_compl_info *txcp = &txo->txcp; + struct be_eth_tx_compl *compl = queue_tail_node(tx_cq); + + if (compl->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] == 0) + return NULL; + + /* Ensure load ordering of valid bit dword and other dwords below */ + rmb(); + be_dws_le_to_cpu(compl, sizeof(*compl)); + + txcp->status = GET_TX_COMPL_BITS(status, compl); + txcp->end_index = GET_TX_COMPL_BITS(wrb_index, compl); + + if (txcp->status) { + if (lancer_chip(adapter)) { + lancer_update_tx_err(txo, txcp->status); + /* Reset the adapter incase of TSO, + * SGE or Parity error + */ + if (txcp->status == LANCER_TX_COMP_LSO_ERR || + txcp->status == LANCER_TX_COMP_PARITY_ERR || + txcp->status == LANCER_TX_COMP_SGE_ERR) + be_set_error(adapter, BE_ERROR_TX); + } else { + be_update_tx_err(txo, txcp->status); + } + } + + if (be_check_error(adapter, BE_ERROR_TX)) + return NULL; + + compl->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] = 0; + queue_tail_inc(tx_cq); + return txcp; +} + +static u16 be_tx_compl_process(struct be_adapter *adapter, + struct be_tx_obj *txo, u16 last_index) +{ + struct sk_buff **sent_skbs = txo->sent_skb_list; + struct be_queue_info *txq = &txo->q; + struct sk_buff *skb = NULL; + bool unmap_skb_hdr = false; + struct be_eth_wrb *wrb; + u16 num_wrbs = 0; + u32 frag_index; + + do { + if (sent_skbs[txq->tail]) { + /* Free skb from prev req */ + if (skb) + dev_consume_skb_any(skb); + skb = sent_skbs[txq->tail]; + sent_skbs[txq->tail] = NULL; + 
queue_tail_inc(txq); /* skip hdr wrb */ + num_wrbs++; + unmap_skb_hdr = true; + } + wrb = queue_tail_node(txq); + frag_index = txq->tail; + unmap_tx_frag(&adapter->pdev->dev, wrb, + (unmap_skb_hdr && skb_headlen(skb))); + unmap_skb_hdr = false; + queue_tail_inc(txq); + num_wrbs++; + } while (frag_index != last_index); + dev_consume_skb_any(skb); + + return num_wrbs; +} + +/* Return the number of events in the event queue */ +static inline int events_get(struct be_eq_obj *eqo) +{ + struct be_eq_entry *eqe; + int num = 0; + + do { + eqe = queue_tail_node(&eqo->q); + if (eqe->evt == 0) + break; + + rmb(); + eqe->evt = 0; + num++; + queue_tail_inc(&eqo->q); + } while (true); + + return num; +} + +/* Leaves the EQ is disarmed state */ +static void be_eq_clean(struct be_eq_obj *eqo) +{ + int num = events_get(eqo); + + be_eq_notify(eqo->adapter, eqo->q.id, false, true, num, 0); +} + +/* Free posted rx buffers that were not used */ +static void be_rxq_clean(struct be_rx_obj *rxo) +{ + struct be_queue_info *rxq = &rxo->q; + struct be_rx_page_info *page_info; + + while (atomic_read(&rxq->used) > 0) { + page_info = get_rx_page_info(rxo); + put_page(page_info->page); + memset(page_info, 0, sizeof(*page_info)); + } + BUG_ON(atomic_read(&rxq->used)); + rxq->tail = 0; + rxq->head = 0; +} + +static void be_rx_cq_clean(struct be_rx_obj *rxo) +{ + struct be_queue_info *rx_cq = &rxo->cq; + struct be_rx_compl_info *rxcp; + struct be_adapter *adapter = rxo->adapter; + int flush_wait = 0; + + /* Consume pending rx completions. + * Wait for the flush completion (identified by zero num_rcvd) + * to arrive. Notify CQ even when there are no more CQ entries + * for HW to flush partially coalesced CQ entries. + * In Lancer, there is no need to wait for flush compl. + */ + for (;;) { + rxcp = be_rx_compl_get(rxo); + if (!rxcp) { + if (lancer_chip(adapter)) + break; + + if (flush_wait++ > 50 || + be_check_error(adapter, + BE_ERROR_HW)) { + dev_warn(&adapter->pdev->dev, + "did not receive flush compl\n"); + break; + } + be_cq_notify(adapter, rx_cq->id, true, 0); + mdelay(1); + } else { + be_rx_compl_discard(rxo, rxcp); + be_cq_notify(adapter, rx_cq->id, false, 1); + if (rxcp->num_rcvd == 0) + break; + } + } + + /* After cleanup, leave the CQ in unarmed state */ + be_cq_notify(adapter, rx_cq->id, false, 0); +} + +static void be_tx_compl_clean(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + u16 cmpl = 0, timeo = 0, num_wrbs = 0; + struct be_tx_compl_info *txcp; + struct be_queue_info *txq; + u32 end_idx, notified_idx; + struct be_tx_obj *txo; + int i, pending_txqs; + + /* Stop polling for compls when HW has been silent for 10ms */ + do { + pending_txqs = adapter->num_tx_qs; + + for_all_tx_queues(adapter, txo, i) { + cmpl = 0; + num_wrbs = 0; + txq = &txo->q; + while ((txcp = be_tx_compl_get(adapter, txo))) { + num_wrbs += + be_tx_compl_process(adapter, txo, + txcp->end_index); + cmpl++; + } + if (cmpl) { + be_cq_notify(adapter, txo->cq.id, false, cmpl); + atomic_sub(num_wrbs, &txq->used); + timeo = 0; + } + if (!be_is_tx_compl_pending(txo)) + pending_txqs--; + } + + if (pending_txqs == 0 || ++timeo > 10 || + be_check_error(adapter, BE_ERROR_HW)) + break; + + mdelay(1); + } while (true); + + /* Free enqueued TX that was never notified to HW */ + for_all_tx_queues(adapter, txo, i) { + txq = &txo->q; + + if (atomic_read(&txq->used)) { + dev_info(dev, "txq%d: cleaning %d pending tx-wrbs\n", + i, atomic_read(&txq->used)); + notified_idx = txq->tail; + end_idx = txq->tail; + index_adv(&end_idx, 
atomic_read(&txq->used) - 1, + txq->len); + /* Use the tx-compl process logic to handle requests + * that were not sent to the HW. + */ + num_wrbs = be_tx_compl_process(adapter, txo, end_idx); + atomic_sub(num_wrbs, &txq->used); + BUG_ON(atomic_read(&txq->used)); + txo->pend_wrb_cnt = 0; + /* Since hw was never notified of these requests, + * reset TXQ indices + */ + txq->head = notified_idx; + txq->tail = notified_idx; + } + } +} + +static void be_evt_queues_destroy(struct be_adapter *adapter) +{ + struct be_eq_obj *eqo; + int i; + + for_all_evt_queues(adapter, eqo, i) { + if (eqo->q.created) { + be_eq_clean(eqo); + be_cmd_q_destroy(adapter, &eqo->q, QTYPE_EQ); + netif_napi_del(&eqo->napi); + free_cpumask_var(eqo->affinity_mask); + } + be_queue_free(adapter, &eqo->q); + } +} + +static int be_evt_queues_create(struct be_adapter *adapter) +{ + struct be_queue_info *eq; + struct be_eq_obj *eqo; + struct be_aic_obj *aic; + int i, rc; + + /* need enough EQs to service both RX and TX queues */ + adapter->num_evt_qs = min_t(u16, num_irqs(adapter), + max(adapter->cfg_num_rx_irqs, + adapter->cfg_num_tx_irqs)); + + for_all_evt_queues(adapter, eqo, i) { + int numa_node = dev_to_node(&adapter->pdev->dev); + + aic = &adapter->aic_obj[i]; + eqo->adapter = adapter; + eqo->idx = i; + aic->max_eqd = BE_MAX_EQD; + aic->enable = true; + + eq = &eqo->q; + rc = be_queue_alloc(adapter, eq, EVNT_Q_LEN, + sizeof(struct be_eq_entry)); + if (rc) + return rc; + + rc = be_cmd_eq_create(adapter, eqo); + if (rc) + return rc; + + if (!zalloc_cpumask_var(&eqo->affinity_mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_set_cpu(cpumask_local_spread(i, numa_node), + eqo->affinity_mask); + netif_napi_add(adapter->netdev, &eqo->napi, be_poll, + BE_NAPI_WEIGHT); + } + return 0; +} + +static void be_mcc_queues_destroy(struct be_adapter *adapter) +{ + struct be_queue_info *q; + + q = &adapter->mcc_obj.q; + if (q->created) + be_cmd_q_destroy(adapter, q, QTYPE_MCCQ); + be_queue_free(adapter, q); + + q = &adapter->mcc_obj.cq; + if (q->created) + be_cmd_q_destroy(adapter, q, QTYPE_CQ); + be_queue_free(adapter, q); +} + +/* Must be called only after TX qs are created as MCC shares TX EQ */ +static int be_mcc_queues_create(struct be_adapter *adapter) +{ + struct be_queue_info *q, *cq; + + cq = &adapter->mcc_obj.cq; + if (be_queue_alloc(adapter, cq, MCC_CQ_LEN, + sizeof(struct be_mcc_compl))) + goto err; + + /* Use the default EQ for MCC completions */ + if (be_cmd_cq_create(adapter, cq, &mcc_eqo(adapter)->q, true, 0)) + goto mcc_cq_free; + + q = &adapter->mcc_obj.q; + if (be_queue_alloc(adapter, q, MCC_Q_LEN, sizeof(struct be_mcc_wrb))) + goto mcc_cq_destroy; + + if (be_cmd_mccq_create(adapter, q, cq)) + goto mcc_q_free; + + return 0; + +mcc_q_free: + be_queue_free(adapter, q); +mcc_cq_destroy: + be_cmd_q_destroy(adapter, cq, QTYPE_CQ); +mcc_cq_free: + be_queue_free(adapter, cq); +err: + return -1; +} + +static void be_tx_queues_destroy(struct be_adapter *adapter) +{ + struct be_queue_info *q; + struct be_tx_obj *txo; + u8 i; + + for_all_tx_queues(adapter, txo, i) { + q = &txo->q; + if (q->created) + be_cmd_q_destroy(adapter, q, QTYPE_TXQ); + be_queue_free(adapter, q); + + q = &txo->cq; + if (q->created) + be_cmd_q_destroy(adapter, q, QTYPE_CQ); + be_queue_free(adapter, q); + } +} + +static int be_tx_qs_create(struct be_adapter *adapter) +{ + struct be_queue_info *cq; + struct be_tx_obj *txo; + struct be_eq_obj *eqo; + int status, i; + + adapter->num_tx_qs = min(adapter->num_evt_qs, adapter->cfg_num_tx_irqs); + + 
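+ /* For each TXQ: allocate its completion queue, attach that CQ to an EQ (EQs are shared round-robin when TXQs outnumber them), then create the TXQ itself in FW */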
for_all_tx_queues(adapter, txo, i) { + cq = &txo->cq; + status = be_queue_alloc(adapter, cq, TX_CQ_LEN, + sizeof(struct be_eth_tx_compl)); + if (status) + return status; + + u64_stats_init(&txo->stats.sync); + u64_stats_init(&txo->stats.sync_compl); + + /* If num_evt_qs is less than num_tx_qs, then more than + * one txq share an eq + */ + eqo = &adapter->eq_obj[i % adapter->num_evt_qs]; + status = be_cmd_cq_create(adapter, cq, &eqo->q, false, 3); + if (status) + return status; + + status = be_queue_alloc(adapter, &txo->q, TX_Q_LEN, + sizeof(struct be_eth_wrb)); + if (status) + return status; + + status = be_cmd_txq_create(adapter, txo); + if (status) + return status; + + netif_set_xps_queue(adapter->netdev, eqo->affinity_mask, + eqo->idx); + } + + dev_info(&adapter->pdev->dev, "created %d TX queue(s)\n", + adapter->num_tx_qs); + return 0; +} + +static void be_rx_cqs_destroy(struct be_adapter *adapter) +{ + struct be_queue_info *q; + struct be_rx_obj *rxo; + int i; + + for_all_rx_queues(adapter, rxo, i) { + q = &rxo->cq; + if (q->created) + be_cmd_q_destroy(adapter, q, QTYPE_CQ); + be_queue_free(adapter, q); + } +} + +static int be_rx_cqs_create(struct be_adapter *adapter) +{ + struct be_queue_info *eq, *cq; + struct be_rx_obj *rxo; + int rc, i; + + adapter->num_rss_qs = + min(adapter->num_evt_qs, adapter->cfg_num_rx_irqs); + + /* We'll use RSS only if atleast 2 RSS rings are supported. */ + if (adapter->num_rss_qs < 2) + adapter->num_rss_qs = 0; + + adapter->num_rx_qs = adapter->num_rss_qs + adapter->need_def_rxq; + + /* When the interface is not capable of RSS rings (and there is no + * need to create a default RXQ) we'll still need one RXQ + */ + if (adapter->num_rx_qs == 0) + adapter->num_rx_qs = 1; + + adapter->big_page_size = (1 << get_order(rx_frag_size)) * PAGE_SIZE; + for_all_rx_queues(adapter, rxo, i) { + rxo->adapter = adapter; + cq = &rxo->cq; + rc = be_queue_alloc(adapter, cq, RX_CQ_LEN, + sizeof(struct be_eth_rx_compl)); + if (rc) + return rc; + + u64_stats_init(&rxo->stats.sync); + eq = &adapter->eq_obj[i % adapter->num_evt_qs].q; + rc = be_cmd_cq_create(adapter, cq, eq, false, 3); + if (rc) + return rc; + } + + dev_info(&adapter->pdev->dev, + "created %d RX queue(s)\n", adapter->num_rx_qs); + return 0; +} + +static irqreturn_t be_intx(int irq, void *dev) +{ + struct be_eq_obj *eqo = dev; + struct be_adapter *adapter = eqo->adapter; + int num_evts = 0; + + /* IRQ is not expected when NAPI is scheduled as the EQ + * will not be armed. + * But, this can happen on Lancer INTx where it takes + * a while to de-assert INTx or in BE2 where occasionaly + * an interrupt may be raised even when EQ is unarmed. + * If NAPI is already scheduled, then counting & notifying + * events will orphan them. + */ + if (napi_schedule_prep(&eqo->napi)) { + num_evts = events_get(eqo); + __napi_schedule(&eqo->napi); + if (num_evts) + eqo->spurious_intr = 0; + } + be_eq_notify(adapter, eqo->q.id, false, true, num_evts, 0); + + /* Return IRQ_HANDLED only for the the first spurious intr + * after a valid intr to stop the kernel from branding + * this irq as a bad one! + */ + if (num_evts || eqo->spurious_intr++ == 0) + return IRQ_HANDLED; + else + return IRQ_NONE; +} + +static irqreturn_t be_msix(int irq, void *dev) +{ + struct be_eq_obj *eqo = dev; + + be_eq_notify(eqo->adapter, eqo->q.id, false, true, 0, 0); + napi_schedule(&eqo->napi); + return IRQ_HANDLED; +} + +static inline bool do_gro(struct be_rx_compl_info *rxcp) +{ + return (rxcp->tcpf && !rxcp->err && rxcp->l4_csum) ? 
true : false; +} + +static int be_process_rx(struct be_rx_obj *rxo, struct napi_struct *napi, + int budget) +{ + struct be_adapter *adapter = rxo->adapter; + struct be_queue_info *rx_cq = &rxo->cq; + struct be_rx_compl_info *rxcp; + u32 work_done; + u32 frags_consumed = 0; + + for (work_done = 0; work_done < budget; work_done++) { + rxcp = be_rx_compl_get(rxo); + if (!rxcp) + break; + + /* Is it a flush compl that has no data */ + if (unlikely(rxcp->num_rcvd == 0)) + goto loop_continue; + + /* Discard compl with partial DMA Lancer B0 */ + if (unlikely(!rxcp->pkt_size)) { + be_rx_compl_discard(rxo, rxcp); + goto loop_continue; + } + + /* On BE drop pkts that arrive due to imperfect filtering in + * promiscuous mode on some skews + */ + if (unlikely(rxcp->port != adapter->port_num && + !lancer_chip(adapter))) { + be_rx_compl_discard(rxo, rxcp); + goto loop_continue; + } + + if (do_gro(rxcp)) + be_rx_compl_process_gro(rxo, napi, rxcp); + else + be_rx_compl_process(rxo, napi, rxcp); + +loop_continue: + frags_consumed += rxcp->num_rcvd; + be_rx_stats_update(rxo, rxcp); + } + + if (work_done) { + be_cq_notify(adapter, rx_cq->id, true, work_done); + + /* When an rx-obj gets into post_starved state, just + * let be_worker do the posting. + */ + if (atomic_read(&rxo->q.used) < RX_FRAGS_REFILL_WM && + !rxo->rx_post_starved) + be_post_rx_frags(rxo, GFP_ATOMIC, + max_t(u32, MAX_RX_POST, + frags_consumed)); + } + + return work_done; +} + + +static void be_process_tx(struct be_adapter *adapter, struct be_tx_obj *txo, + int idx) +{ + int num_wrbs = 0, work_done = 0; + struct be_tx_compl_info *txcp; + + while ((txcp = be_tx_compl_get(adapter, txo))) { + num_wrbs += be_tx_compl_process(adapter, txo, txcp->end_index); + work_done++; + } + + if (work_done) { + be_cq_notify(adapter, txo->cq.id, true, work_done); + atomic_sub(num_wrbs, &txo->q.used); + + /* As Tx wrbs have been freed up, wake up netdev queue + * if it was stopped due to lack of tx wrbs. 
*/ + if (__netif_subqueue_stopped(adapter->netdev, idx) && + be_can_txq_wake(txo)) { + netif_wake_subqueue(adapter->netdev, idx); + } + + u64_stats_update_begin(&tx_stats(txo)->sync_compl); + tx_stats(txo)->tx_compl += work_done; + u64_stats_update_end(&tx_stats(txo)->sync_compl); + } +} + +int be_poll(struct napi_struct *napi, int budget) +{ + struct be_eq_obj *eqo = container_of(napi, struct be_eq_obj, napi); + struct be_adapter *adapter = eqo->adapter; + int max_work = 0, work, i, num_evts; + struct be_rx_obj *rxo; + struct be_tx_obj *txo; + u32 mult_enc = 0; + + num_evts = events_get(eqo); + + for_all_tx_queues_on_eq(adapter, eqo, txo, i) + be_process_tx(adapter, txo, i); + + /* This loop will iterate twice for EQ0 in which + * completions of the last RXQ (default one) are also processed + * For other EQs the loop iterates only once + */ + for_all_rx_queues_on_eq(adapter, eqo, rxo, i) { + work = be_process_rx(rxo, napi, budget); + max_work = max(work, max_work); + } + + if (is_mcc_eqo(eqo)) + be_process_mcc(adapter); + + if (max_work < budget) { + napi_complete_done(napi, max_work); + + /* Skyhawk EQ_DB has a provision to set the rearm to interrupt + * delay via a delay multiplier encoding value + */ + if (skyhawk_chip(adapter)) + mult_enc = be_get_eq_delay_mult_enc(eqo); + + be_eq_notify(adapter, eqo->q.id, true, false, num_evts, + mult_enc); + } else { + /* As we'll continue in polling mode, count and clear events */ + be_eq_notify(adapter, eqo->q.id, false, false, num_evts, 0); + } + return max_work; +} + +void be_detect_error(struct be_adapter *adapter) +{ + u32 ue_lo = 0, ue_hi = 0, ue_lo_mask = 0, ue_hi_mask = 0; + u32 sliport_status = 0, sliport_err1 = 0, sliport_err2 = 0; + struct device *dev = &adapter->pdev->dev; + u16 val; + u32 i; + + if (be_check_error(adapter, BE_ERROR_HW)) + return; + + if (lancer_chip(adapter)) { + sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET); + if (sliport_status & SLIPORT_STATUS_ERR_MASK) { + be_set_error(adapter, BE_ERROR_UE); + sliport_err1 = ioread32(adapter->db + + SLIPORT_ERROR1_OFFSET); + sliport_err2 = ioread32(adapter->db + + SLIPORT_ERROR2_OFFSET); + /* Do not log error messages if its a FW reset */ + if (sliport_err1 == SLIPORT_ERROR_FW_RESET1 && + sliport_err2 == SLIPORT_ERROR_FW_RESET2) { + dev_info(dev, "Firmware update in progress\n"); + } else { + dev_err(dev, "Error detected in the card\n"); + dev_err(dev, "ERR: sliport status 0x%x\n", + sliport_status); + dev_err(dev, "ERR: sliport error1 0x%x\n", + sliport_err1); + dev_err(dev, "ERR: sliport error2 0x%x\n", + sliport_err2); + } + } + } else { + ue_lo = ioread32(adapter->pcicfg + PCICFG_UE_STATUS_LOW); + ue_hi = ioread32(adapter->pcicfg + PCICFG_UE_STATUS_HIGH); + ue_lo_mask = ioread32(adapter->pcicfg + + PCICFG_UE_STATUS_LOW_MASK); + ue_hi_mask = ioread32(adapter->pcicfg + + PCICFG_UE_STATUS_HI_MASK); + + ue_lo = (ue_lo & ~ue_lo_mask); + ue_hi = (ue_hi & ~ue_hi_mask); + + if (ue_lo || ue_hi) { + /* On certain platforms BE3 hardware can indicate + * spurious UEs. In case of a UE in the chip, + * the POST register correctly reports either a + * FAT_LOG_START state (FW is currently dumping + * FAT log data) or a ARMFW_UE state. Check for the + * above states to ascertain if the UE is valid or not. 
+ */ + if (BE3_chip(adapter)) { + val = be_POST_stage_get(adapter); + if ((val & POST_STAGE_FAT_LOG_START) + != POST_STAGE_FAT_LOG_START && + (val & POST_STAGE_ARMFW_UE) + != POST_STAGE_ARMFW_UE && + (val & POST_STAGE_RECOVERABLE_ERR) + != POST_STAGE_RECOVERABLE_ERR) + return; + } + + dev_err(dev, "Error detected in the adapter"); + be_set_error(adapter, BE_ERROR_UE); + + for (i = 0; ue_lo; ue_lo >>= 1, i++) { + if (ue_lo & 1) + dev_err(dev, "UE: %s bit set\n", + ue_status_low_desc[i]); + } + for (i = 0; ue_hi; ue_hi >>= 1, i++) { + if (ue_hi & 1) + dev_err(dev, "UE: %s bit set\n", + ue_status_hi_desc[i]); + } + } + } +} + +static void be_msix_disable(struct be_adapter *adapter) +{ + if (msix_enabled(adapter)) { + pci_disable_msix(adapter->pdev); + adapter->num_msix_vec = 0; + adapter->num_msix_roce_vec = 0; + } +} + +static int be_msix_enable(struct be_adapter *adapter) +{ + unsigned int i, max_roce_eqs; + struct device *dev = &adapter->pdev->dev; + int num_vec; + + /* If RoCE is supported, program the max number of vectors that + * could be used for NIC and RoCE, else, just program the number + * we'll use initially. + */ + if (be_roce_supported(adapter)) { + max_roce_eqs = + be_max_func_eqs(adapter) - be_max_nic_eqs(adapter); + max_roce_eqs = min(max_roce_eqs, num_online_cpus()); + num_vec = be_max_any_irqs(adapter) + max_roce_eqs; + } else { + num_vec = max(adapter->cfg_num_rx_irqs, + adapter->cfg_num_tx_irqs); + } + + for (i = 0; i < num_vec; i++) + adapter->msix_entries[i].entry = i; + + num_vec = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, + MIN_MSIX_VECTORS, num_vec); + if (num_vec < 0) + goto fail; + + if (be_roce_supported(adapter) && num_vec > MIN_MSIX_VECTORS) { + adapter->num_msix_roce_vec = num_vec / 2; + dev_info(dev, "enabled %d MSI-x vector(s) for RoCE\n", + adapter->num_msix_roce_vec); + } + + adapter->num_msix_vec = num_vec - adapter->num_msix_roce_vec; + + dev_info(dev, "enabled %d MSI-x vector(s) for NIC\n", + adapter->num_msix_vec); + return 0; + +fail: + dev_warn(dev, "MSIx enable failed\n"); + + /* INTx is not supported in VFs, so fail probe if enable_msix fails */ + if (be_virtfn(adapter)) + return num_vec; + return 0; +} + +static inline int be_msix_vec_get(struct be_adapter *adapter, + struct be_eq_obj *eqo) +{ + return adapter->msix_entries[eqo->msix_idx].vector; +} + +static int be_msix_register(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct be_eq_obj *eqo; + int status, i, vec; + + for_all_evt_queues(adapter, eqo, i) { + sprintf(eqo->desc, "%s-q%d", netdev->name, i); + vec = be_msix_vec_get(adapter, eqo); + status = request_irq(vec, be_msix, 0, eqo->desc, eqo); + if (status) + goto err_msix; + + irq_set_affinity_hint(vec, eqo->affinity_mask); + } + + return 0; +err_msix: + for (i--; i >= 0; i--) { + eqo = &adapter->eq_obj[i]; + free_irq(be_msix_vec_get(adapter, eqo), eqo); + } + dev_warn(&adapter->pdev->dev, "MSIX Request IRQ failed - err %d\n", + status); + be_msix_disable(adapter); + return status; +} + +static int be_irq_register(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int status; + + if (msix_enabled(adapter)) { + status = be_msix_register(adapter); + if (status == 0) + goto done; + /* INTx is not supported for VF */ + if (be_virtfn(adapter)) + return status; + } + + /* INTx: only the first EQ is used */ + netdev->irq = adapter->pdev->irq; + status = request_irq(netdev->irq, be_intx, IRQF_SHARED, netdev->name, + &adapter->eq_obj[0]); + if (status) { + 
dev_err(&adapter->pdev->dev, + "INTx request IRQ failed - err %d\n", status); + return status; + } +done: + adapter->isr_registered = true; + return 0; +} + +static void be_irq_unregister(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct be_eq_obj *eqo; + int i, vec; + + if (!adapter->isr_registered) + return; + + /* INTx */ + if (!msix_enabled(adapter)) { + free_irq(netdev->irq, &adapter->eq_obj[0]); + goto done; + } + + /* MSIx */ + for_all_evt_queues(adapter, eqo, i) { + vec = be_msix_vec_get(adapter, eqo); + irq_set_affinity_hint(vec, NULL); + free_irq(vec, eqo); + } + +done: + adapter->isr_registered = false; +} + +static void be_rx_qs_destroy(struct be_adapter *adapter) +{ + struct rss_info *rss = &adapter->rss_info; + struct be_queue_info *q; + struct be_rx_obj *rxo; + int i; + + for_all_rx_queues(adapter, rxo, i) { + q = &rxo->q; + if (q->created) { + /* If RXQs are destroyed while in an "out of buffer" + * state, there is a possibility of an HW stall on + * Lancer. So, post 64 buffers to each queue to relieve + * the "out of buffer" condition. + * Make sure there's space in the RXQ before posting. + */ + if (lancer_chip(adapter)) { + be_rx_cq_clean(rxo); + if (atomic_read(&q->used) == 0) + be_post_rx_frags(rxo, GFP_KERNEL, + MAX_RX_POST); + } + + be_cmd_rxq_destroy(adapter, q); + be_rx_cq_clean(rxo); + be_rxq_clean(rxo); + } + be_queue_free(adapter, q); + } + + if (rss->rss_flags) { + rss->rss_flags = RSS_ENABLE_NONE; + be_cmd_rss_config(adapter, rss->rsstable, rss->rss_flags, + 128, rss->rss_hkey); + } +} + +static void be_disable_if_filters(struct be_adapter *adapter) +{ + /* Don't delete MAC on BE3 VFs without FILTMGMT privilege */ + if (!BEx_chip(adapter) || !be_virtfn(adapter) || + check_privilege(adapter, BE_PRIV_FILTMGMT)) { + be_dev_mac_del(adapter, adapter->pmac_id[0]); + eth_zero_addr(adapter->dev_mac); + } + + be_clear_uc_list(adapter); + be_clear_mc_list(adapter); + + /* The IFACE flags are enabled in the open path and cleared + * in the close path. When a VF gets detached from the host and + * assigned to a VM the following happens: + * - VF's IFACE flags get cleared in the detach path + * - IFACE create is issued by the VF in the attach path + * Due to a bug in the BE3/Skyhawk-R FW + * (Lancer FW doesn't have the bug), the IFACE capability flags + * specified along with the IFACE create cmd issued by a VF are not + * honoured by FW. As a consequence, if a *new* driver + * (that enables/disables IFACE flags in open/close) + * is loaded in the host and an *old* driver is * used by a VM/VF, + * the IFACE gets created *without* the needed flags. + * To avoid this, disable RX-filter flags only for Lancer. 
+ */ + if (lancer_chip(adapter)) { + be_cmd_rx_filter(adapter, BE_IF_ALL_FILT_FLAGS, OFF); + adapter->if_flags &= ~BE_IF_ALL_FILT_FLAGS; + } +} + +static int be_close(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_eq_obj *eqo; + int i; + + /* This protection is needed as be_close() may be called even when the + * adapter is in cleared state (after eeh perm failure) + */ + if (!(adapter->flags & BE_FLAGS_SETUP_DONE)) + return 0; + + /* Before attempting cleanup ensure all the pending cmds in the + * config_wq have finished execution + */ + flush_workqueue(be_wq); + + be_disable_if_filters(adapter); + + if (adapter->flags & BE_FLAGS_NAPI_ENABLED) { + for_all_evt_queues(adapter, eqo, i) { + napi_disable(&eqo->napi); + } + adapter->flags &= ~BE_FLAGS_NAPI_ENABLED; + } + + be_async_mcc_disable(adapter); + + /* Wait for all pending tx completions to arrive so that + * all tx skbs are freed. + */ + netif_tx_disable(netdev); + be_tx_compl_clean(adapter); + + be_rx_qs_destroy(adapter); + + for_all_evt_queues(adapter, eqo, i) { + if (msix_enabled(adapter)) + synchronize_irq(be_msix_vec_get(adapter, eqo)); + else + synchronize_irq(netdev->irq); + be_eq_clean(eqo); + } + + be_irq_unregister(adapter); + + return 0; +} + +static int be_rx_qs_create(struct be_adapter *adapter) +{ + struct rss_info *rss = &adapter->rss_info; + u8 rss_key[RSS_HASH_KEY_LEN]; + struct be_rx_obj *rxo; + int rc, i, j; + + for_all_rx_queues(adapter, rxo, i) { + rc = be_queue_alloc(adapter, &rxo->q, RX_Q_LEN, + sizeof(struct be_eth_rx_d)); + if (rc) + return rc; + } + + if (adapter->need_def_rxq || !adapter->num_rss_qs) { + rxo = default_rxo(adapter); + rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, + rx_frag_size, adapter->if_handle, + false, &rxo->rss_id); + if (rc) + return rc; + } + + for_all_rss_queues(adapter, rxo, i) { + rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, + rx_frag_size, adapter->if_handle, + true, &rxo->rss_id); + if (rc) + return rc; + } + + if (be_multi_rxq(adapter)) { + for (j = 0; j < RSS_INDIR_TABLE_LEN; j += adapter->num_rss_qs) { + for_all_rss_queues(adapter, rxo, i) { + if ((j + i) >= RSS_INDIR_TABLE_LEN) + break; + rss->rsstable[j + i] = rxo->rss_id; + rss->rss_queue[j + i] = i; + } + } + rss->rss_flags = RSS_ENABLE_TCP_IPV4 | RSS_ENABLE_IPV4 | + RSS_ENABLE_TCP_IPV6 | RSS_ENABLE_IPV6; + + if (!BEx_chip(adapter)) + rss->rss_flags |= RSS_ENABLE_UDP_IPV4 | + RSS_ENABLE_UDP_IPV6; + + netdev_rss_key_fill(rss_key, RSS_HASH_KEY_LEN); + rc = be_cmd_rss_config(adapter, rss->rsstable, rss->rss_flags, + RSS_INDIR_TABLE_LEN, rss_key); + if (rc) { + rss->rss_flags = RSS_ENABLE_NONE; + return rc; + } + + memcpy(rss->rss_hkey, rss_key, RSS_HASH_KEY_LEN); + } else { + /* Disable RSS, if only default RX Q is created */ + rss->rss_flags = RSS_ENABLE_NONE; + } + + + /* Post 1 less than RXQ-len to avoid head being equal to tail, + * which is a queue empty condition + */ + for_all_rx_queues(adapter, rxo, i) + be_post_rx_frags(rxo, GFP_KERNEL, RX_Q_LEN - 1); + + return 0; +} + +static int be_enable_if_filters(struct be_adapter *adapter) +{ + int status; + + status = be_cmd_rx_filter(adapter, BE_IF_FILT_FLAGS_BASIC, ON); + if (status) + return status; + + /* Normally this condition usually true as the ->dev_mac is zeroed. 
+ * But on BE3 VFs the initial MAC is pre-programmed by PF and + * subsequent be_dev_mac_add() can fail (after fresh boot) + */ + if (!ether_addr_equal(adapter->dev_mac, adapter->netdev->dev_addr)) { + int old_pmac_id = -1; + + /* Remember old programmed MAC if any - can happen on BE3 VF */ + if (!is_zero_ether_addr(adapter->dev_mac)) + old_pmac_id = adapter->pmac_id[0]; + + status = be_dev_mac_add(adapter, adapter->netdev->dev_addr); + if (status) + return status; + + /* Delete the old programmed MAC as we successfully programmed + * a new MAC + */ + if (old_pmac_id >= 0 && old_pmac_id != adapter->pmac_id[0]) + be_dev_mac_del(adapter, old_pmac_id); + + ether_addr_copy(adapter->dev_mac, adapter->netdev->dev_addr); + } + + if (adapter->vlans_added) + be_vid_config(adapter); + + __be_set_rx_mode(adapter); + + return 0; +} + +static int be_open(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_eq_obj *eqo; + struct be_rx_obj *rxo; + struct be_tx_obj *txo; + u8 link_status; + int status, i; + + status = be_rx_qs_create(adapter); + if (status) + goto err; + + status = be_enable_if_filters(adapter); + if (status) + goto err; + + status = be_irq_register(adapter); + if (status) + goto err; + + for_all_rx_queues(adapter, rxo, i) + be_cq_notify(adapter, rxo->cq.id, true, 0); + + for_all_tx_queues(adapter, txo, i) + be_cq_notify(adapter, txo->cq.id, true, 0); + + be_async_mcc_enable(adapter); + + for_all_evt_queues(adapter, eqo, i) { + napi_enable(&eqo->napi); + be_eq_notify(adapter, eqo->q.id, true, true, 0, 0); + } + adapter->flags |= BE_FLAGS_NAPI_ENABLED; + + status = be_cmd_link_status_query(adapter, NULL, &link_status, 0); + if (!status) + be_link_status_update(adapter, link_status); + + netif_tx_start_all_queues(netdev); + if (skyhawk_chip(adapter)) + udp_tunnel_get_rx_info(netdev); + + return 0; +err: + be_close(adapter->netdev); + return -EIO; +} + +static void be_vf_eth_addr_generate(struct be_adapter *adapter, u8 *mac) +{ + u32 addr; + + addr = jhash(adapter->netdev->dev_addr, ETH_ALEN, 0); + + mac[5] = (u8)(addr & 0xFF); + mac[4] = (u8)((addr >> 8) & 0xFF); + mac[3] = (u8)((addr >> 16) & 0xFF); + /* Use the OUI from the current MAC address */ + memcpy(mac, adapter->netdev->dev_addr, 3); +} + +/* + * Generate a seed MAC address from the PF MAC Address using jhash. + * MAC Address for VFs are assigned incrementally starting from the seed. + * These addresses are programmed in the ASIC by the PF and the VF driver + * queries for the MAC address during its probe. 
+ */ +static int be_vf_eth_addr_config(struct be_adapter *adapter) +{ + u32 vf; + int status = 0; + u8 mac[ETH_ALEN]; + struct be_vf_cfg *vf_cfg; + + be_vf_eth_addr_generate(adapter, mac); + + for_all_vfs(adapter, vf_cfg, vf) { + if (BEx_chip(adapter)) + status = be_cmd_pmac_add(adapter, mac, + vf_cfg->if_handle, + &vf_cfg->pmac_id, vf + 1); + else + status = be_cmd_set_mac(adapter, mac, vf_cfg->if_handle, + vf + 1); + + if (status) + dev_err(&adapter->pdev->dev, + "Mac address assignment failed for VF %d\n", + vf); + else + memcpy(vf_cfg->mac_addr, mac, ETH_ALEN); + + mac[5] += 1; + } + return status; +} + +static int be_vfs_mac_query(struct be_adapter *adapter) +{ + int status, vf; + u8 mac[ETH_ALEN]; + struct be_vf_cfg *vf_cfg; + + for_all_vfs(adapter, vf_cfg, vf) { + status = be_cmd_get_active_mac(adapter, vf_cfg->pmac_id, + mac, vf_cfg->if_handle, + false, vf+1); + if (status) + return status; + memcpy(vf_cfg->mac_addr, mac, ETH_ALEN); + } + return 0; +} + +static void be_vf_clear(struct be_adapter *adapter) +{ + struct be_vf_cfg *vf_cfg; + u32 vf; + + if (pci_vfs_assigned(adapter->pdev)) { + dev_warn(&adapter->pdev->dev, + "VFs are assigned to VMs: not disabling VFs\n"); + goto done; + } + + pci_disable_sriov(adapter->pdev); + + for_all_vfs(adapter, vf_cfg, vf) { + if (BEx_chip(adapter)) + be_cmd_pmac_del(adapter, vf_cfg->if_handle, + vf_cfg->pmac_id, vf + 1); + else + be_cmd_set_mac(adapter, NULL, vf_cfg->if_handle, + vf + 1); + + be_cmd_if_destroy(adapter, vf_cfg->if_handle, vf + 1); + } + + if (BE3_chip(adapter)) + be_cmd_set_hsw_config(adapter, 0, 0, + adapter->if_handle, + PORT_FWD_TYPE_PASSTHRU, 0); +done: + kfree(adapter->vf_cfg); + adapter->num_vfs = 0; + adapter->flags &= ~BE_FLAGS_SRIOV_ENABLED; +} + +static void be_clear_queues(struct be_adapter *adapter) +{ + be_mcc_queues_destroy(adapter); + be_rx_cqs_destroy(adapter); + be_tx_queues_destroy(adapter); + be_evt_queues_destroy(adapter); +} + +static void be_cancel_worker(struct be_adapter *adapter) +{ + if (adapter->flags & BE_FLAGS_WORKER_SCHEDULED) { + cancel_delayed_work_sync(&adapter->work); + adapter->flags &= ~BE_FLAGS_WORKER_SCHEDULED; + } +} + +static void be_cancel_err_detection(struct be_adapter *adapter) +{ + struct be_error_recovery *err_rec = &adapter->error_recovery; + + if (!be_err_recovery_workq) + return; + + if (adapter->flags & BE_FLAGS_ERR_DETECTION_SCHEDULED) { + cancel_delayed_work_sync(&err_rec->err_detection_work); + adapter->flags &= ~BE_FLAGS_ERR_DETECTION_SCHEDULED; + } +} + +static int be_enable_vxlan_offloads(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct device *dev = &adapter->pdev->dev; + struct be_vxlan_port *vxlan_port; + __be16 port; + int status; + + vxlan_port = list_first_entry(&adapter->vxlan_port_list, + struct be_vxlan_port, list); + port = vxlan_port->port; + + status = be_cmd_manage_iface(adapter, adapter->if_handle, + OP_CONVERT_NORMAL_TO_TUNNEL); + if (status) { + dev_warn(dev, "Failed to convert normal interface to tunnel\n"); + return status; + } + adapter->flags |= BE_FLAGS_VXLAN_OFFLOADS; + + status = be_cmd_set_vxlan_port(adapter, port); + if (status) { + dev_warn(dev, "Failed to add VxLAN port\n"); + return status; + } + adapter->vxlan_port = port; + + netdev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | + NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_TUNNEL; + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL; + netdev->features |= NETIF_F_GSO_UDP_TUNNEL; + + dev_info(dev, "Enabled VxLAN offloads for UDP port %d\n", + 
be16_to_cpu(port)); + return 0; +} + +static void be_disable_vxlan_offloads(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + + if (adapter->flags & BE_FLAGS_VXLAN_OFFLOADS) + be_cmd_manage_iface(adapter, adapter->if_handle, + OP_CONVERT_TUNNEL_TO_NORMAL); + + if (adapter->vxlan_port) + be_cmd_set_vxlan_port(adapter, 0); + + adapter->flags &= ~BE_FLAGS_VXLAN_OFFLOADS; + adapter->vxlan_port = 0; + + netdev->hw_enc_features = 0; + netdev->hw_features &= ~(NETIF_F_GSO_UDP_TUNNEL); + netdev->features &= ~(NETIF_F_GSO_UDP_TUNNEL); +} + +static void be_calculate_vf_res(struct be_adapter *adapter, u16 num_vfs, + struct be_resources *vft_res) +{ + struct be_resources res = adapter->pool_res; + u32 vf_if_cap_flags = res.vf_if_cap_flags; + struct be_resources res_mod = {0}; + u16 num_vf_qs = 1; + + /* Distribute the queue resources among the PF and it's VFs */ + if (num_vfs) { + /* Divide the rx queues evenly among the VFs and the PF, capped + * at VF-EQ-count. Any remainder queues belong to the PF. + */ + num_vf_qs = min(SH_VF_MAX_NIC_EQS, + res.max_rss_qs / (num_vfs + 1)); + + /* Skyhawk-R chip supports only MAX_PORT_RSS_TABLES + * RSS Tables per port. Provide RSS on VFs, only if number of + * VFs requested is less than it's PF Pool's RSS Tables limit. + */ + if (num_vfs >= be_max_pf_pool_rss_tables(adapter)) + num_vf_qs = 1; + } + + /* Resource with fields set to all '1's by GET_PROFILE_CONFIG cmd, + * which are modifiable using SET_PROFILE_CONFIG cmd. + */ + be_cmd_get_profile_config(adapter, &res_mod, NULL, ACTIVE_PROFILE_TYPE, + RESOURCE_MODIFIABLE, 0); + + /* If RSS IFACE capability flags are modifiable for a VF, set the + * capability flag as valid and set RSS and DEFQ_RSS IFACE flags if + * more than 1 RSSQ is available for a VF. + * Otherwise, provision only 1 queue pair for VF. 
+ */ + if (res_mod.vf_if_cap_flags & BE_IF_FLAGS_RSS) { + vft_res->flags |= BIT(IF_CAPS_FLAGS_VALID_SHIFT); + if (num_vf_qs > 1) { + vf_if_cap_flags |= BE_IF_FLAGS_RSS; + if (res.if_cap_flags & BE_IF_FLAGS_DEFQ_RSS) + vf_if_cap_flags |= BE_IF_FLAGS_DEFQ_RSS; + } else { + vf_if_cap_flags &= ~(BE_IF_FLAGS_RSS | + BE_IF_FLAGS_DEFQ_RSS); + } + } else { + num_vf_qs = 1; + } + + if (res_mod.vf_if_cap_flags & BE_IF_FLAGS_VLAN_PROMISCUOUS) { + vft_res->flags |= BIT(IF_CAPS_FLAGS_VALID_SHIFT); + vf_if_cap_flags &= ~BE_IF_FLAGS_VLAN_PROMISCUOUS; + } + + vft_res->vf_if_cap_flags = vf_if_cap_flags; + vft_res->max_rx_qs = num_vf_qs; + vft_res->max_rss_qs = num_vf_qs; + vft_res->max_tx_qs = res.max_tx_qs / (num_vfs + 1); + vft_res->max_cq_count = res.max_cq_count / (num_vfs + 1); + + /* Distribute unicast MACs, VLANs, IFACE count and MCCQ count equally + * among the PF and it's VFs, if the fields are changeable + */ + if (res_mod.max_uc_mac == FIELD_MODIFIABLE) + vft_res->max_uc_mac = res.max_uc_mac / (num_vfs + 1); + + if (res_mod.max_vlans == FIELD_MODIFIABLE) + vft_res->max_vlans = res.max_vlans / (num_vfs + 1); + + if (res_mod.max_iface_count == FIELD_MODIFIABLE) + vft_res->max_iface_count = res.max_iface_count / (num_vfs + 1); + + if (res_mod.max_mcc_count == FIELD_MODIFIABLE) + vft_res->max_mcc_count = res.max_mcc_count / (num_vfs + 1); +} + +static void be_if_destroy(struct be_adapter *adapter) +{ + be_cmd_if_destroy(adapter, adapter->if_handle, 0); + + kfree(adapter->pmac_id); + adapter->pmac_id = NULL; + + kfree(adapter->mc_list); + adapter->mc_list = NULL; + + kfree(adapter->uc_list); + adapter->uc_list = NULL; +} + +static int be_clear(struct be_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + struct be_resources vft_res = {0}; + + be_cancel_worker(adapter); + + flush_workqueue(be_wq); + + if (sriov_enabled(adapter)) + be_vf_clear(adapter); + + /* Re-configure FW to distribute resources evenly across max-supported + * number of VFs, only when VFs are not already enabled. 
+ */ + if (skyhawk_chip(adapter) && be_physfn(adapter) && + !pci_vfs_assigned(pdev)) { + be_calculate_vf_res(adapter, + pci_sriov_get_totalvfs(pdev), + &vft_res); + be_cmd_set_sriov_config(adapter, adapter->pool_res, + pci_sriov_get_totalvfs(pdev), + &vft_res); + } + + be_disable_vxlan_offloads(adapter); + + be_if_destroy(adapter); + + be_clear_queues(adapter); + + be_msix_disable(adapter); + adapter->flags &= ~BE_FLAGS_SETUP_DONE; + return 0; +} + +static int be_vfs_if_create(struct be_adapter *adapter) +{ + struct be_resources res = {0}; + u32 cap_flags, en_flags, vf; + struct be_vf_cfg *vf_cfg; + int status; + + /* If a FW profile exists, then cap_flags are updated */ + cap_flags = BE_VF_IF_EN_FLAGS; + + for_all_vfs(adapter, vf_cfg, vf) { + if (!BE3_chip(adapter)) { + status = be_cmd_get_profile_config(adapter, &res, NULL, + ACTIVE_PROFILE_TYPE, + RESOURCE_LIMITS, + vf + 1); + if (!status) { + cap_flags = res.if_cap_flags; + /* Prevent VFs from enabling VLAN promiscuous + * mode + */ + cap_flags &= ~BE_IF_FLAGS_VLAN_PROMISCUOUS; + } + } + + /* PF should enable IF flags during proxy if_create call */ + en_flags = cap_flags & BE_VF_IF_EN_FLAGS; + status = be_cmd_if_create(adapter, cap_flags, en_flags, + &vf_cfg->if_handle, vf + 1); + if (status) + return status; + } + + return 0; +} + +static int be_vf_setup_init(struct be_adapter *adapter) +{ + struct be_vf_cfg *vf_cfg; + int vf; + + adapter->vf_cfg = kcalloc(adapter->num_vfs, sizeof(*vf_cfg), + GFP_KERNEL); + if (!adapter->vf_cfg) + return -ENOMEM; + + for_all_vfs(adapter, vf_cfg, vf) { + vf_cfg->if_handle = -1; + vf_cfg->pmac_id = -1; + } + return 0; +} + +static int be_vf_setup(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + struct be_vf_cfg *vf_cfg; + int status, old_vfs, vf; + bool spoofchk; + + old_vfs = pci_num_vf(adapter->pdev); + + status = be_vf_setup_init(adapter); + if (status) + goto err; + + if (old_vfs) { + for_all_vfs(adapter, vf_cfg, vf) { + status = be_cmd_get_if_id(adapter, vf_cfg, vf); + if (status) + goto err; + } + + status = be_vfs_mac_query(adapter); + if (status) + goto err; + } else { + status = be_vfs_if_create(adapter); + if (status) + goto err; + + status = be_vf_eth_addr_config(adapter); + if (status) + goto err; + } + + for_all_vfs(adapter, vf_cfg, vf) { + /* Allow VFs to programs MAC/VLAN filters */ + status = be_cmd_get_fn_privileges(adapter, &vf_cfg->privileges, + vf + 1); + if (!status && !(vf_cfg->privileges & BE_PRIV_FILTMGMT)) { + status = be_cmd_set_fn_privileges(adapter, + vf_cfg->privileges | + BE_PRIV_FILTMGMT, + vf + 1); + if (!status) { + vf_cfg->privileges |= BE_PRIV_FILTMGMT; + dev_info(dev, "VF%d has FILTMGMT privilege\n", + vf); + } + } + + /* Allow full available bandwidth */ + if (!old_vfs) + be_cmd_config_qos(adapter, 0, 0, vf + 1); + + status = be_cmd_get_hsw_config(adapter, NULL, vf + 1, + vf_cfg->if_handle, NULL, + &spoofchk); + if (!status) + vf_cfg->spoofchk = spoofchk; + + if (!old_vfs) { + be_cmd_enable_vf(adapter, vf + 1); + be_cmd_set_logical_link_config(adapter, + IFLA_VF_LINK_STATE_AUTO, + vf+1); + } + } + + if (!old_vfs) { + status = pci_enable_sriov(adapter->pdev, adapter->num_vfs); + if (status) { + dev_err(dev, "SRIOV enable failed\n"); + adapter->num_vfs = 0; + goto err; + } + } + + if (BE3_chip(adapter)) { + /* On BE3, enable VEB only when SRIOV is enabled */ + status = be_cmd_set_hsw_config(adapter, 0, 0, + adapter->if_handle, + PORT_FWD_TYPE_VEB, 0); + if (status) + goto err; + } + + adapter->flags |= BE_FLAGS_SRIOV_ENABLED; + return 0; 
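+ /* On any failure above, unwind whatever VF configuration was partially set up */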
+err: + dev_err(dev, "VF setup failed\n"); + be_vf_clear(adapter); + return status; +} + +/* Converting function_mode bits on BE3 to SH mc_type enums */ + +static u8 be_convert_mc_type(u32 function_mode) +{ + if (function_mode & VNIC_MODE && function_mode & QNQ_MODE) + return vNIC1; + else if (function_mode & QNQ_MODE) + return FLEX10; + else if (function_mode & VNIC_MODE) + return vNIC2; + else if (function_mode & UMC_ENABLED) + return UMC; + else + return MC_NONE; +} + +/* On BE2/BE3 FW does not suggest the supported limits */ +static void BEx_get_resources(struct be_adapter *adapter, + struct be_resources *res) +{ + bool use_sriov = adapter->num_vfs ? 1 : 0; + + if (be_physfn(adapter)) + res->max_uc_mac = BE_UC_PMAC_COUNT; + else + res->max_uc_mac = BE_VF_UC_PMAC_COUNT; + + adapter->mc_type = be_convert_mc_type(adapter->function_mode); + + if (be_is_mc(adapter)) { + /* Assuming that there are 4 channels per port, + * when multi-channel is enabled + */ + if (be_is_qnq_mode(adapter)) + res->max_vlans = BE_NUM_VLANS_SUPPORTED/8; + else + /* In a non-qnq multichannel mode, the pvid + * takes up one vlan entry + */ + res->max_vlans = (BE_NUM_VLANS_SUPPORTED / 4) - 1; + } else { + res->max_vlans = BE_NUM_VLANS_SUPPORTED; + } + + res->max_mcast_mac = BE_MAX_MC; + + /* 1) For BE3 1Gb ports, FW does not support multiple TXQs + * 2) Create multiple TX rings on a BE3-R multi-channel interface + * *only* if it is RSS-capable. + */ + if (BE2_chip(adapter) || use_sriov || (adapter->port_num > 1) || + be_virtfn(adapter) || + (be_is_mc(adapter) && + !(adapter->function_caps & BE_FUNCTION_CAPS_RSS))) { + res->max_tx_qs = 1; + } else if (adapter->function_caps & BE_FUNCTION_CAPS_SUPER_NIC) { + struct be_resources super_nic_res = {0}; + + /* On a SuperNIC profile, the driver needs to use the + * GET_PROFILE_CONFIG cmd to query the per-function TXQ limits + */ + be_cmd_get_profile_config(adapter, &super_nic_res, NULL, + ACTIVE_PROFILE_TYPE, RESOURCE_LIMITS, + 0); + /* Some old versions of BE3 FW don't report max_tx_qs value */ + res->max_tx_qs = super_nic_res.max_tx_qs ? : BE3_MAX_TX_QS; + } else { + res->max_tx_qs = BE3_MAX_TX_QS; + } + + if ((adapter->function_caps & BE_FUNCTION_CAPS_RSS) && + !use_sriov && be_physfn(adapter)) + res->max_rss_qs = (adapter->be3_native) ? + BE3_MAX_RSS_QS : BE2_MAX_RSS_QS; + res->max_rx_qs = res->max_rss_qs + 1; + + if (be_physfn(adapter)) + res->max_evt_qs = (be_max_vfs(adapter) > 0) ? + BE3_SRIOV_MAX_EVT_QS : BE3_MAX_EVT_QS; + else + res->max_evt_qs = 1; + + res->if_cap_flags = BE_IF_CAP_FLAGS_WANT; + res->if_cap_flags &= ~BE_IF_FLAGS_DEFQ_RSS; + if (!(adapter->function_caps & BE_FUNCTION_CAPS_RSS)) + res->if_cap_flags &= ~BE_IF_FLAGS_RSS; +} + +static void be_setup_init(struct be_adapter *adapter) +{ + adapter->vlan_prio_bmap = 0xff; + adapter->phy.link_speed = -1; + adapter->if_handle = -1; + adapter->be3_native = false; + adapter->if_flags = 0; + adapter->phy_state = BE_UNKNOWN_PHY_STATE; + if (be_physfn(adapter)) + adapter->cmd_privileges = MAX_PRIVILEGES; + else + adapter->cmd_privileges = MIN_PRIVILEGES; +} + +/* HW supports only MAX_PORT_RSS_TABLES RSS Policy Tables per port. + * However, this HW limitation is not exposed to the host via any SLI cmd. + * As a result, in the case of SRIOV and in particular multi-partition configs + * the driver needs to calcuate a proportional share of RSS Tables per PF-pool + * for distribution between the VFs. This self-imposed limit will determine the + * no: of VFs for which RSS can be enabled. 
+ */ +static void be_calculate_pf_pool_rss_tables(struct be_adapter *adapter) +{ + struct be_port_resources port_res = {0}; + u8 rss_tables_on_port; + u16 max_vfs = be_max_vfs(adapter); + + be_cmd_get_profile_config(adapter, NULL, &port_res, SAVED_PROFILE_TYPE, + RESOURCE_LIMITS, 0); + + rss_tables_on_port = MAX_PORT_RSS_TABLES - port_res.nic_pfs; + + /* Each PF Pool's RSS Tables limit = + * PF's Max VFs / Total_Max_VFs on Port * RSS Tables on Port + */ + adapter->pool_res.max_rss_tables = + max_vfs * rss_tables_on_port / port_res.max_vfs; +} + +static int be_get_sriov_config(struct be_adapter *adapter) +{ + struct be_resources res = {0}; + int max_vfs, old_vfs; + + be_cmd_get_profile_config(adapter, &res, NULL, ACTIVE_PROFILE_TYPE, + RESOURCE_LIMITS, 0); + + /* Some old versions of BE3 FW don't report max_vfs value */ + if (BE3_chip(adapter) && !res.max_vfs) { + max_vfs = pci_sriov_get_totalvfs(adapter->pdev); + res.max_vfs = max_vfs > 0 ? min(MAX_VFS, max_vfs) : 0; + } + + adapter->pool_res = res; + + /* If during previous unload of the driver, the VFs were not disabled, + * then we cannot rely on the PF POOL limits for the TotalVFs value. + * Instead use the TotalVFs value stored in the pci-dev struct. + */ + old_vfs = pci_num_vf(adapter->pdev); + if (old_vfs) { + dev_info(&adapter->pdev->dev, "%d VFs are already enabled\n", + old_vfs); + + adapter->pool_res.max_vfs = + pci_sriov_get_totalvfs(adapter->pdev); + adapter->num_vfs = old_vfs; + } + + if (skyhawk_chip(adapter) && be_max_vfs(adapter) && !old_vfs) { + be_calculate_pf_pool_rss_tables(adapter); + dev_info(&adapter->pdev->dev, + "RSS can be enabled for all VFs if num_vfs <= %d\n", + be_max_pf_pool_rss_tables(adapter)); + } + return 0; +} + +static void be_alloc_sriov_res(struct be_adapter *adapter) +{ + int old_vfs = pci_num_vf(adapter->pdev); + struct be_resources vft_res = {0}; + int status; + + be_get_sriov_config(adapter); + + if (!old_vfs) + pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter)); + + /* When the HW is in SRIOV capable configuration, the PF-pool + * resources are given to PF during driver load, if there are no + * old VFs. This facility is not available in BE3 FW. + * Also, this is done by FW in Lancer chip. + */ + if (skyhawk_chip(adapter) && be_max_vfs(adapter) && !old_vfs) { + be_calculate_vf_res(adapter, 0, &vft_res); + status = be_cmd_set_sriov_config(adapter, adapter->pool_res, 0, + &vft_res); + if (status) + dev_err(&adapter->pdev->dev, + "Failed to optimize SRIOV resources\n"); + } +} + +static int be_get_resources(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + struct be_resources res = {0}; + int status; + + /* For Lancer, SH etc read per-function resource limits from FW. + * GET_FUNC_CONFIG returns per function guaranteed limits. + * GET_PROFILE_CONFIG returns PCI-E related limits PF-pool limits + */ + if (BEx_chip(adapter)) { + BEx_get_resources(adapter, &res); + } else { + status = be_cmd_get_func_config(adapter, &res); + if (status) + return status; + + /* If a deafault RXQ must be created, we'll use up one RSSQ*/ + if (res.max_rss_qs && res.max_rss_qs == res.max_rx_qs && + !(res.if_cap_flags & BE_IF_FLAGS_DEFQ_RSS)) + res.max_rss_qs -= 1; + } + + /* If RoCE is supported stash away half the EQs for RoCE */ + res.max_nic_evt_qs = be_roce_supported(adapter) ? + res.max_evt_qs / 2 : res.max_evt_qs; + adapter->res = res; + + /* If FW supports RSS default queue, then skip creating non-RSS + * queue for non-IP traffic. 
+ */ + adapter->need_def_rxq = (be_if_cap_flags(adapter) & + BE_IF_FLAGS_DEFQ_RSS) ? 0 : 1; + + dev_info(dev, "Max: txqs %d, rxqs %d, rss %d, eqs %d, vfs %d\n", + be_max_txqs(adapter), be_max_rxqs(adapter), + be_max_rss(adapter), be_max_nic_eqs(adapter), + be_max_vfs(adapter)); + dev_info(dev, "Max: uc-macs %d, mc-macs %d, vlans %d\n", + be_max_uc(adapter), be_max_mc(adapter), + be_max_vlans(adapter)); + + /* Ensure RX and TX queues are created in pairs at init time */ + adapter->cfg_num_rx_irqs = + min_t(u16, netif_get_num_default_rss_queues(), + be_max_qp_irqs(adapter)); + adapter->cfg_num_tx_irqs = adapter->cfg_num_rx_irqs; + return 0; +} + +static int be_get_config(struct be_adapter *adapter) +{ + int status, level; + u16 profile_id; + + status = be_cmd_get_cntl_attributes(adapter); + if (status) + return status; + + status = be_cmd_query_fw_cfg(adapter); + if (status) + return status; + + if (!lancer_chip(adapter) && be_physfn(adapter)) + be_cmd_get_fat_dump_len(adapter, &adapter->fat_dump_len); + + if (BEx_chip(adapter)) { + level = be_cmd_get_fw_log_level(adapter); + adapter->msg_enable = + level <= FW_LOG_LEVEL_DEFAULT ? NETIF_MSG_HW : 0; + } + + be_cmd_get_acpi_wol_cap(adapter); + pci_enable_wake(adapter->pdev, PCI_D3hot, adapter->wol_en); + pci_enable_wake(adapter->pdev, PCI_D3cold, adapter->wol_en); + + be_cmd_query_port_name(adapter); + + if (be_physfn(adapter)) { + status = be_cmd_get_active_profile(adapter, &profile_id); + if (!status) + dev_info(&adapter->pdev->dev, + "Using profile 0x%x\n", profile_id); + } + + return 0; +} + +static int be_mac_setup(struct be_adapter *adapter) +{ + u8 mac[ETH_ALEN]; + int status; + + if (is_zero_ether_addr(adapter->netdev->dev_addr)) { + status = be_cmd_get_perm_mac(adapter, mac); + if (status) + return status; + + memcpy(adapter->netdev->dev_addr, mac, ETH_ALEN); + memcpy(adapter->netdev->perm_addr, mac, ETH_ALEN); + + /* Initial MAC for BE3 VFs is already programmed by PF */ + if (BEx_chip(adapter) && be_virtfn(adapter)) + memcpy(adapter->dev_mac, mac, ETH_ALEN); + } + + return 0; +} + +static void be_schedule_worker(struct be_adapter *adapter) +{ + queue_delayed_work(be_wq, &adapter->work, msecs_to_jiffies(1000)); + adapter->flags |= BE_FLAGS_WORKER_SCHEDULED; +} + +static void be_destroy_err_recovery_workq(void) +{ + if (!be_err_recovery_workq) + return; + + flush_workqueue(be_err_recovery_workq); + destroy_workqueue(be_err_recovery_workq); + be_err_recovery_workq = NULL; +} + +static void be_schedule_err_detection(struct be_adapter *adapter, u32 delay) +{ + struct be_error_recovery *err_rec = &adapter->error_recovery; + + if (!be_err_recovery_workq) + return; + + queue_delayed_work(be_err_recovery_workq, &err_rec->err_detection_work, + msecs_to_jiffies(delay)); + adapter->flags |= BE_FLAGS_ERR_DETECTION_SCHEDULED; +} + +static int be_setup_queues(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int status; + + status = be_evt_queues_create(adapter); + if (status) + goto err; + + status = be_tx_qs_create(adapter); + if (status) + goto err; + + status = be_rx_cqs_create(adapter); + if (status) + goto err; + + status = be_mcc_queues_create(adapter); + if (status) + goto err; + + status = netif_set_real_num_rx_queues(netdev, adapter->num_rx_qs); + if (status) + goto err; + + status = netif_set_real_num_tx_queues(netdev, adapter->num_tx_qs); + if (status) + goto err; + + return 0; +err: + dev_err(&adapter->pdev->dev, "queue_setup failed\n"); + return status; +} + +static int be_if_create(struct be_adapter 
*adapter) +{ + u32 en_flags = BE_IF_FLAGS_RSS | BE_IF_FLAGS_DEFQ_RSS; + u32 cap_flags = be_if_cap_flags(adapter); + int status; + + /* alloc required memory for other filtering fields */ + adapter->pmac_id = kcalloc(be_max_uc(adapter), + sizeof(*adapter->pmac_id), GFP_KERNEL); + if (!adapter->pmac_id) + return -ENOMEM; + + adapter->mc_list = kcalloc(be_max_mc(adapter), + sizeof(*adapter->mc_list), GFP_KERNEL); + if (!adapter->mc_list) + return -ENOMEM; + + adapter->uc_list = kcalloc(be_max_uc(adapter), + sizeof(*adapter->uc_list), GFP_KERNEL); + if (!adapter->uc_list) + return -ENOMEM; + + if (adapter->cfg_num_rx_irqs == 1) + cap_flags &= ~(BE_IF_FLAGS_DEFQ_RSS | BE_IF_FLAGS_RSS); + + en_flags &= cap_flags; + /* will enable all the needed filter flags in be_open() */ + status = be_cmd_if_create(adapter, be_if_cap_flags(adapter), en_flags, + &adapter->if_handle, 0); + + if (status) + return status; + + return 0; +} + +int be_update_queues(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int status; + + if (netif_running(netdev)) + be_close(netdev); + + be_cancel_worker(adapter); + + /* If any vectors have been shared with RoCE we cannot re-program + * the MSIx table. + */ + if (!adapter->num_msix_roce_vec) + be_msix_disable(adapter); + + be_clear_queues(adapter); + status = be_cmd_if_destroy(adapter, adapter->if_handle, 0); + if (status) + return status; + + if (!msix_enabled(adapter)) { + status = be_msix_enable(adapter); + if (status) + return status; + } + + status = be_if_create(adapter); + if (status) + return status; + + status = be_setup_queues(adapter); + if (status) + return status; + + be_schedule_worker(adapter); + + /* The IF was destroyed and re-created. We need to clear + * all promiscuous flags valid for the destroyed IF. + * Without this promisc mode is not restored during + * be_open() because the driver thinks that it is + * already enabled in HW. 
+ */ + adapter->if_flags &= ~BE_IF_FLAGS_ALL_PROMISCUOUS; + + if (netif_running(netdev)) + status = be_open(netdev); + + return status; +} + +static inline int fw_major_num(const char *fw_ver) +{ + int fw_major = 0, i; + + i = sscanf(fw_ver, "%d.", &fw_major); + if (i != 1) + return 0; + + return fw_major; +} + +/* If it is error recovery, FLR the PF + * Else if any VFs are already enabled don't FLR the PF + */ +static bool be_reset_required(struct be_adapter *adapter) +{ + if (be_error_recovering(adapter)) + return true; + else + return pci_num_vf(adapter->pdev) == 0; +} + +/* Wait for the FW to be ready and perform the required initialization */ +static int be_func_init(struct be_adapter *adapter) +{ + int status; + + status = be_fw_wait_ready(adapter); + if (status) + return status; + + /* FW is now ready; clear errors to allow cmds/doorbell */ + be_clear_error(adapter, BE_CLEAR_ALL); + + if (be_reset_required(adapter)) { + status = be_cmd_reset_function(adapter); + if (status) + return status; + + /* Wait for interrupts to quiesce after an FLR */ + msleep(100); + } + + /* Tell FW we're ready to fire cmds */ + status = be_cmd_fw_init(adapter); + if (status) + return status; + + /* Allow interrupts for other ULPs running on NIC function */ + be_intr_set(adapter, true); + + return 0; +} + +static int be_setup(struct be_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + int status; + + status = be_func_init(adapter); + if (status) + return status; + + be_setup_init(adapter); + + if (!lancer_chip(adapter)) + be_cmd_req_native_mode(adapter); + + /* invoke this cmd first to get pf_num and vf_num which are needed + * for issuing profile related cmds + */ + if (!BEx_chip(adapter)) { + status = be_cmd_get_func_config(adapter, NULL); + if (status) + return status; + } + + status = be_get_config(adapter); + if (status) + goto err; + + if (!BE2_chip(adapter) && be_physfn(adapter)) + be_alloc_sriov_res(adapter); + + status = be_get_resources(adapter); + if (status) + goto err; + + status = be_msix_enable(adapter); + if (status) + goto err; + + /* will enable all the needed filter flags in be_open() */ + status = be_if_create(adapter); + if (status) + goto err; + + /* Updating real_num_tx/rx_queues() requires rtnl_lock() */ + rtnl_lock(); + status = be_setup_queues(adapter); + rtnl_unlock(); + if (status) + goto err; + + be_cmd_get_fn_privileges(adapter, &adapter->cmd_privileges, 0); + + status = be_mac_setup(adapter); + if (status) + goto err; + + be_cmd_get_fw_ver(adapter); + dev_info(dev, "FW version is %s\n", adapter->fw_ver); + + if (BE2_chip(adapter) && fw_major_num(adapter->fw_ver) < 4) { + dev_err(dev, "Firmware on card is old(%s), IRQs may not work", + adapter->fw_ver); + dev_err(dev, "Please upgrade firmware to version >= 4.0\n"); + } + + status = be_cmd_set_flow_control(adapter, adapter->tx_fc, + adapter->rx_fc); + if (status) + be_cmd_get_flow_control(adapter, &adapter->tx_fc, + &adapter->rx_fc); + + dev_info(&adapter->pdev->dev, "HW Flow control - TX:%d RX:%d\n", + adapter->tx_fc, adapter->rx_fc); + + if (be_physfn(adapter)) + be_cmd_set_logical_link_config(adapter, + IFLA_VF_LINK_STATE_AUTO, 0); + + /* BE3 EVB echoes broadcast/multicast packets back to PF's vport + * confusing a linux bridge or OVS that it might be connected to. + * Set the EVB to PASSTHRU mode which effectively disables the EVB + * when SRIOV is not enabled. 
+ */ + if (BE3_chip(adapter)) + be_cmd_set_hsw_config(adapter, 0, 0, adapter->if_handle, + PORT_FWD_TYPE_PASSTHRU, 0); + + if (adapter->num_vfs) + be_vf_setup(adapter); + + status = be_cmd_get_phy_info(adapter); + if (!status && be_pause_supported(adapter)) + adapter->phy.fc_autoneg = 1; + + if (be_physfn(adapter) && !lancer_chip(adapter)) + be_cmd_set_features(adapter); + + be_schedule_worker(adapter); + adapter->flags |= BE_FLAGS_SETUP_DONE; + return 0; +err: + be_clear(adapter); + return status; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void be_netpoll(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_eq_obj *eqo; + int i; + + for_all_evt_queues(adapter, eqo, i) { + be_eq_notify(eqo->adapter, eqo->q.id, false, true, 0, 0); + napi_schedule(&eqo->napi); + } +} +#endif + +int be_load_fw(struct be_adapter *adapter, u8 *fw_file) +{ + const struct firmware *fw; + int status; + + if (!netif_running(adapter->netdev)) { + dev_err(&adapter->pdev->dev, + "Firmware load not allowed (interface is down)\n"); + return -ENETDOWN; + } + + status = request_firmware(&fw, fw_file, &adapter->pdev->dev); + if (status) + goto fw_exit; + + dev_info(&adapter->pdev->dev, "Flashing firmware file %s\n", fw_file); + + if (lancer_chip(adapter)) + status = lancer_fw_download(adapter, fw); + else + status = be_fw_download(adapter, fw); + + if (!status) + be_cmd_get_fw_ver(adapter); + +fw_exit: + release_firmware(fw); + return status; +} + +static int be_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 flags) +{ + struct be_adapter *adapter = netdev_priv(dev); + struct nlattr *attr, *br_spec; + int rem; + int status = 0; + u16 mode = 0; + + if (!sriov_enabled(adapter)) + return -EOPNOTSUPP; + + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (!br_spec) + return -EINVAL; + + nla_for_each_nested(attr, br_spec, rem) { + if (nla_type(attr) != IFLA_BRIDGE_MODE) + continue; + + if (nla_len(attr) < sizeof(mode)) + return -EINVAL; + + mode = nla_get_u16(attr); + if (BE3_chip(adapter) && mode == BRIDGE_MODE_VEPA) + return -EOPNOTSUPP; + + if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB) + return -EINVAL; + + status = be_cmd_set_hsw_config(adapter, 0, 0, + adapter->if_handle, + mode == BRIDGE_MODE_VEPA ? + PORT_FWD_TYPE_VEPA : + PORT_FWD_TYPE_VEB, 0); + if (status) + goto err; + + dev_info(&adapter->pdev->dev, "enabled switch mode: %s\n", + mode == BRIDGE_MODE_VEPA ? "VEPA" : "VEB"); + + return status; + } +err: + dev_err(&adapter->pdev->dev, "Failed to set switch mode %s\n", + mode == BRIDGE_MODE_VEPA ? "VEPA" : "VEB"); + + return status; +} + +static int be_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask, + int nlflags) +{ + struct be_adapter *adapter = netdev_priv(dev); + int status = 0; + u8 hsw_mode; + + /* BE and Lancer chips support VEB mode only */ + if (BEx_chip(adapter) || lancer_chip(adapter)) { + /* VEB is disabled in non-SR-IOV profiles on BE3/Lancer */ + if (!pci_sriov_get_totalvfs(adapter->pdev)) + return 0; + hsw_mode = PORT_FWD_TYPE_VEB; + } else { + status = be_cmd_get_hsw_config(adapter, NULL, 0, + adapter->if_handle, &hsw_mode, + NULL); + if (status) + return 0; + + if (hsw_mode == PORT_FWD_TYPE_PASSTHRU) + return 0; + } + + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, + hsw_mode == PORT_FWD_TYPE_VEPA ? 
+ BRIDGE_MODE_VEPA : BRIDGE_MODE_VEB, + 0, 0, nlflags, filter_mask, NULL); +} + +static struct be_cmd_work *be_alloc_work(struct be_adapter *adapter, + void (*func)(struct work_struct *)) +{ + struct be_cmd_work *work; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + dev_err(&adapter->pdev->dev, + "be_work memory allocation failed\n"); + return NULL; + } + + INIT_WORK(&work->work, func); + work->adapter = adapter; + return work; +} + +/* VxLAN offload Notes: + * + * The stack defines tunnel offload flags (hw_enc_features) for IP and doesn't + * distinguish various types of transports (VxLAN, GRE, NVGRE ..). So, offload + * is expected to work across all types of IP tunnels once exported. Skyhawk + * supports offloads for either VxLAN or NVGRE, exclusively. So we export VxLAN + * offloads in hw_enc_features only when a VxLAN port is added. If other (non + * VxLAN) tunnels are configured while VxLAN offloads are enabled, offloads for + * those other tunnels are unexported on the fly through ndo_features_check(). + * + * Skyhawk supports VxLAN offloads only for one UDP dport. So, if the stack + * adds more than one port, disable offloads and re-enable them again when + * there's only one port left. We maintain a list of ports for this purpose. + */ +static void be_work_add_vxlan_port(struct work_struct *work) +{ + struct be_cmd_work *cmd_work = + container_of(work, struct be_cmd_work, work); + struct be_adapter *adapter = cmd_work->adapter; + struct device *dev = &adapter->pdev->dev; + __be16 port = cmd_work->info.vxlan_port; + struct be_vxlan_port *vxlan_port; + int status; + + /* Bump up the alias count if it is an existing port */ + list_for_each_entry(vxlan_port, &adapter->vxlan_port_list, list) { + if (vxlan_port->port == port) { + vxlan_port->port_aliases++; + goto done; + } + } + + /* Add a new port to our list. We don't need a lock here since port + * add/delete are done only in the context of a single-threaded work + * queue (be_wq). 
+ */ + vxlan_port = kzalloc(sizeof(*vxlan_port), GFP_KERNEL); + if (!vxlan_port) + goto done; + + vxlan_port->port = port; + INIT_LIST_HEAD(&vxlan_port->list); + list_add_tail(&vxlan_port->list, &adapter->vxlan_port_list); + adapter->vxlan_port_count++; + + if (adapter->flags & BE_FLAGS_VXLAN_OFFLOADS) { + dev_info(dev, + "Only one UDP port supported for VxLAN offloads\n"); + dev_info(dev, "Disabling VxLAN offloads\n"); + goto err; + } + + if (adapter->vxlan_port_count > 1) + goto done; + + status = be_enable_vxlan_offloads(adapter); + if (!status) + goto done; + +err: + be_disable_vxlan_offloads(adapter); +done: + kfree(cmd_work); + return; +} + +static void be_work_del_vxlan_port(struct work_struct *work) +{ + struct be_cmd_work *cmd_work = + container_of(work, struct be_cmd_work, work); + struct be_adapter *adapter = cmd_work->adapter; + __be16 port = cmd_work->info.vxlan_port; + struct be_vxlan_port *vxlan_port; + + /* Nothing to be done if a port alias is being deleted */ + list_for_each_entry(vxlan_port, &adapter->vxlan_port_list, list) { + if (vxlan_port->port == port) { + if (vxlan_port->port_aliases) { + vxlan_port->port_aliases--; + goto done; + } + break; + } + } + + /* No port aliases left; delete the port from the list */ + list_del(&vxlan_port->list); + adapter->vxlan_port_count--; + + /* Disable VxLAN offload if this is the offloaded port */ + if (adapter->vxlan_port == vxlan_port->port) { + WARN_ON(adapter->vxlan_port_count); + be_disable_vxlan_offloads(adapter); + dev_info(&adapter->pdev->dev, + "Disabled VxLAN offloads for UDP port %d\n", + be16_to_cpu(port)); + goto out; + } + + /* If only 1 port is left, re-enable VxLAN offload */ + if (adapter->vxlan_port_count == 1) + be_enable_vxlan_offloads(adapter); + +out: + kfree(vxlan_port); +done: + kfree(cmd_work); +} + +static void be_cfg_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti, + void (*func)(struct work_struct *)) +{ + struct be_adapter *adapter = netdev_priv(netdev); + struct be_cmd_work *cmd_work; + + if (ti->type != UDP_TUNNEL_TYPE_VXLAN) + return; + + if (lancer_chip(adapter) || BEx_chip(adapter) || be_is_mc(adapter)) + return; + + cmd_work = be_alloc_work(adapter, func); + if (cmd_work) { + cmd_work->info.vxlan_port = ti->port; + queue_work(be_wq, &cmd_work->work); + } +} + +static void be_del_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti) +{ + be_cfg_vxlan_port(netdev, ti, be_work_del_vxlan_port); +} + +static void be_add_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti) +{ + be_cfg_vxlan_port(netdev, ti, be_work_add_vxlan_port); +} + +static netdev_features_t be_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + struct be_adapter *adapter = netdev_priv(dev); + u8 l4_hdr = 0; + + if (skb_is_gso(skb)) { + /* IPv6 TSO requests with extension hdrs are a problem + * to Lancer and BE3 HW. Disable TSO6 feature. + */ + if (!skyhawk_chip(adapter) && is_ipv6_ext_hdr(skb)) + features &= ~NETIF_F_TSO6; + + /* Lancer cannot handle the packet with MSS less than 256. + * Also it can't handle a TSO packet with a single segment + * Disable the GSO support in such cases + */ + if (lancer_chip(adapter) && + (skb_shinfo(skb)->gso_size < 256 || + skb_shinfo(skb)->gso_segs == 1)) + features &= ~NETIF_F_GSO_MASK; + } + + /* The code below restricts offload features for some tunneled and + * Q-in-Q packets. + * Offload features for normal (non tunnel) packets are unchanged. 
+ */ + features = vlan_features_check(skb, features); + if (!skb->encapsulation || + !(adapter->flags & BE_FLAGS_VXLAN_OFFLOADS)) + return features; + + /* It's an encapsulated packet and VxLAN offloads are enabled. We + * should disable tunnel offload features if it's not a VxLAN packet, + * as tunnel offloads have been enabled only for VxLAN. This is done to + * allow other tunneled traffic like GRE work fine while VxLAN + * offloads are configured in Skyhawk-R. + */ + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IP): + l4_hdr = ip_hdr(skb)->protocol; + break; + case htons(ETH_P_IPV6): + l4_hdr = ipv6_hdr(skb)->nexthdr; + break; + default: + return features; + } + + if (l4_hdr != IPPROTO_UDP || + skb->inner_protocol_type != ENCAP_TYPE_ETHER || + skb->inner_protocol != htons(ETH_P_TEB) || + skb_inner_mac_header(skb) - skb_transport_header(skb) != + sizeof(struct udphdr) + sizeof(struct vxlanhdr) || + !adapter->vxlan_port || + udp_hdr(skb)->dest != adapter->vxlan_port) + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); + + return features; +} + +static int be_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) +{ + int i, id_len = CNTL_SERIAL_NUM_WORDS * CNTL_SERIAL_NUM_WORD_SZ + 1; + struct be_adapter *adapter = netdev_priv(dev); + u8 *id; + + if (MAX_PHYS_ITEM_ID_LEN < id_len) + return -ENOSPC; + + ppid->id[0] = adapter->hba_port_num + 1; + id = &ppid->id[1]; + for (i = CNTL_SERIAL_NUM_WORDS - 1; i >= 0; + i--, id += CNTL_SERIAL_NUM_WORD_SZ) + memcpy(id, &adapter->serial_num[i], CNTL_SERIAL_NUM_WORD_SZ); + + ppid->id_len = id_len; + + return 0; +} + +static void be_set_rx_mode(struct net_device *dev) +{ + struct be_adapter *adapter = netdev_priv(dev); + struct be_cmd_work *work; + + work = be_alloc_work(adapter, be_work_set_rx_mode); + if (work) + queue_work(be_wq, &work->work); +} + +static const struct net_device_ops be_netdev_ops = { + .ndo_open = be_open, + .ndo_stop = be_close, + .ndo_start_xmit = be_xmit, + .ndo_set_rx_mode = be_set_rx_mode, + .ndo_set_mac_address = be_mac_addr_set, + .ndo_get_stats64 = be_get_stats64, + .ndo_validate_addr = eth_validate_addr, + .ndo_vlan_rx_add_vid = be_vlan_add_vid, + .ndo_vlan_rx_kill_vid = be_vlan_rem_vid, + .ndo_set_vf_mac = be_set_vf_mac, + .ndo_set_vf_vlan = be_set_vf_vlan, + .ndo_set_vf_rate = be_set_vf_tx_rate, + .ndo_get_vf_config = be_get_vf_config, + .ndo_set_vf_link_state = be_set_vf_link_state, + .ndo_set_vf_spoofchk = be_set_vf_spoofchk, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = be_netpoll, +#endif + .ndo_bridge_setlink = be_ndo_bridge_setlink, + .ndo_bridge_getlink = be_ndo_bridge_getlink, + .ndo_udp_tunnel_add = be_add_vxlan_port, + .ndo_udp_tunnel_del = be_del_vxlan_port, + .ndo_features_check = be_features_check, + .ndo_get_phys_port_id = be_get_phys_port_id, +}; + +static void be_netdev_init(struct net_device *netdev) +{ + struct be_adapter *adapter = netdev_priv(netdev); + + netdev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM | + NETIF_F_HW_VLAN_CTAG_TX; + if ((be_if_cap_flags(adapter) & BE_IF_FLAGS_RSS)) + netdev->hw_features |= NETIF_F_RXHASH; + + netdev->features |= netdev->hw_features | + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER; + + netdev->vlan_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + + netdev->priv_flags |= IFF_UNICAST_FLT; + + netdev->flags |= IFF_MULTICAST; + + netif_set_gso_max_size(netdev, BE_MAX_GSO_SIZE - ETH_HLEN); + + 
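+	/* Note (descriptive comment, not functional): flags placed in
+	 * hw_features above remain user-togglable at runtime via "ethtool -K";
+	 * VLAN CTAG RX/FILTER are added only to netdev->features, so they
+	 * stay permanently enabled.
+	 */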
netdev->netdev_ops = &be_netdev_ops; + + netdev->ethtool_ops = &be_ethtool_ops; + + /* MTU range: 256 - 9000 */ + netdev->min_mtu = BE_MIN_MTU; + netdev->max_mtu = BE_MAX_MTU; +} + +static void be_cleanup(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + + rtnl_lock(); + netif_device_detach(netdev); + if (netif_running(netdev)) + be_close(netdev); + rtnl_unlock(); + + be_clear(adapter); +} + +static int be_resume(struct be_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int status; + + status = be_setup(adapter); + if (status) + return status; + + rtnl_lock(); + if (netif_running(netdev)) + status = be_open(netdev); + rtnl_unlock(); + + if (status) + return status; + + netif_device_attach(netdev); + + return 0; +} + +static void be_soft_reset(struct be_adapter *adapter) +{ + u32 val; + + dev_info(&adapter->pdev->dev, "Initiating chip soft reset\n"); + val = ioread32(adapter->pcicfg + SLIPORT_SOFTRESET_OFFSET); + val |= SLIPORT_SOFTRESET_SR_MASK; + iowrite32(val, adapter->pcicfg + SLIPORT_SOFTRESET_OFFSET); +} + +static bool be_err_is_recoverable(struct be_adapter *adapter) +{ + struct be_error_recovery *err_rec = &adapter->error_recovery; + unsigned long initial_idle_time = + msecs_to_jiffies(ERR_RECOVERY_IDLE_TIME); + unsigned long recovery_interval = + msecs_to_jiffies(ERR_RECOVERY_INTERVAL); + u16 ue_err_code; + u32 val; + + val = be_POST_stage_get(adapter); + if ((val & POST_STAGE_RECOVERABLE_ERR) != POST_STAGE_RECOVERABLE_ERR) + return false; + ue_err_code = val & POST_ERR_RECOVERY_CODE_MASK; + if (ue_err_code == 0) + return false; + + dev_err(&adapter->pdev->dev, "Recoverable HW error code: 0x%x\n", + ue_err_code); + + if (time_before_eq(jiffies - err_rec->probe_time, initial_idle_time)) { + dev_err(&adapter->pdev->dev, + "Cannot recover within %lu sec from driver load\n", + jiffies_to_msecs(initial_idle_time) / MSEC_PER_SEC); + return false; + } + + if (err_rec->last_recovery_time && time_before_eq( + jiffies - err_rec->last_recovery_time, recovery_interval)) { + dev_err(&adapter->pdev->dev, + "Cannot recover within %lu sec from last recovery\n", + jiffies_to_msecs(recovery_interval) / MSEC_PER_SEC); + return false; + } + + if (ue_err_code == err_rec->last_err_code) { + dev_err(&adapter->pdev->dev, + "Cannot recover from a consecutive TPE error\n"); + return false; + } + + err_rec->last_recovery_time = jiffies; + err_rec->last_err_code = ue_err_code; + return true; +} + +static int be_tpe_recover(struct be_adapter *adapter) +{ + struct be_error_recovery *err_rec = &adapter->error_recovery; + int status = -EAGAIN; + u32 val; + + switch (err_rec->recovery_state) { + case ERR_RECOVERY_ST_NONE: + err_rec->recovery_state = ERR_RECOVERY_ST_DETECT; + err_rec->resched_delay = ERR_RECOVERY_UE_DETECT_DURATION; + break; + + case ERR_RECOVERY_ST_DETECT: + val = be_POST_stage_get(adapter); + if ((val & POST_STAGE_RECOVERABLE_ERR) != + POST_STAGE_RECOVERABLE_ERR) { + dev_err(&adapter->pdev->dev, + "Unrecoverable HW error detected: 0x%x\n", val); + status = -EINVAL; + err_rec->resched_delay = 0; + break; + } + + dev_err(&adapter->pdev->dev, "Recoverable HW error detected\n"); + + /* Only PF0 initiates Chip Soft Reset. But PF0 must wait UE2SR + * milliseconds before it checks for final error status in + * SLIPORT_SEMAPHORE to determine if recovery criteria is met. + * If it does, then PF0 initiates a Soft Reset. 
+ */ + if (adapter->pf_num == 0) { + err_rec->recovery_state = ERR_RECOVERY_ST_RESET; + err_rec->resched_delay = err_rec->ue_to_reset_time - + ERR_RECOVERY_UE_DETECT_DURATION; + break; + } + + err_rec->recovery_state = ERR_RECOVERY_ST_PRE_POLL; + err_rec->resched_delay = err_rec->ue_to_poll_time - + ERR_RECOVERY_UE_DETECT_DURATION; + break; + + case ERR_RECOVERY_ST_RESET: + if (!be_err_is_recoverable(adapter)) { + dev_err(&adapter->pdev->dev, + "Failed to meet recovery criteria\n"); + status = -EIO; + err_rec->resched_delay = 0; + break; + } + be_soft_reset(adapter); + err_rec->recovery_state = ERR_RECOVERY_ST_PRE_POLL; + err_rec->resched_delay = err_rec->ue_to_poll_time - + err_rec->ue_to_reset_time; + break; + + case ERR_RECOVERY_ST_PRE_POLL: + err_rec->recovery_state = ERR_RECOVERY_ST_REINIT; + err_rec->resched_delay = 0; + status = 0; /* done */ + break; + + default: + status = -EINVAL; + err_rec->resched_delay = 0; + break; + } + + return status; +} + +static int be_err_recover(struct be_adapter *adapter) +{ + int status; + + if (!lancer_chip(adapter)) { + if (!adapter->error_recovery.recovery_supported || + adapter->priv_flags & BE_DISABLE_TPE_RECOVERY) + return -EIO; + status = be_tpe_recover(adapter); + if (status) + goto err; + } + + /* Wait for adapter to reach quiescent state before + * destroying queues + */ + status = be_fw_wait_ready(adapter); + if (status) + goto err; + + adapter->flags |= BE_FLAGS_TRY_RECOVERY; + + be_cleanup(adapter); + + status = be_resume(adapter); + if (status) + goto err; + + adapter->flags &= ~BE_FLAGS_TRY_RECOVERY; + +err: + return status; +} + +static void be_err_detection_task(struct work_struct *work) +{ + struct be_error_recovery *err_rec = + container_of(work, struct be_error_recovery, + err_detection_work.work); + struct be_adapter *adapter = + container_of(err_rec, struct be_adapter, + error_recovery); + u32 resched_delay = ERR_RECOVERY_DETECTION_DELAY; + struct device *dev = &adapter->pdev->dev; + int recovery_status; + + be_detect_error(adapter); + if (!be_check_error(adapter, BE_ERROR_HW)) + goto reschedule_task; + + recovery_status = be_err_recover(adapter); + if (!recovery_status) { + err_rec->recovery_retries = 0; + err_rec->recovery_state = ERR_RECOVERY_ST_NONE; + dev_info(dev, "Adapter recovery successful\n"); + goto reschedule_task; + } else if (!lancer_chip(adapter) && err_rec->resched_delay) { + /* BEx/SH recovery state machine */ + if (adapter->pf_num == 0 && + err_rec->recovery_state > ERR_RECOVERY_ST_DETECT) + dev_err(&adapter->pdev->dev, + "Adapter recovery in progress\n"); + resched_delay = err_rec->resched_delay; + goto reschedule_task; + } else if (lancer_chip(adapter) && be_virtfn(adapter)) { + /* For VFs, check if PF have allocated resources + * every second. + */ + dev_err(dev, "Re-trying adapter recovery\n"); + goto reschedule_task; + } else if (lancer_chip(adapter) && err_rec->recovery_retries++ < + ERR_RECOVERY_MAX_RETRY_COUNT) { + /* In case of another error during recovery, it takes 30 sec + * for adapter to come out of error. Retry error recovery after + * this time interval. 
+ */ + dev_err(&adapter->pdev->dev, "Re-trying adapter recovery\n"); + resched_delay = ERR_RECOVERY_RETRY_DELAY; + goto reschedule_task; + } else { + dev_err(dev, "Adapter recovery failed\n"); + dev_err(dev, "Please reboot server to recover\n"); + } + + return; + +reschedule_task: + be_schedule_err_detection(adapter, resched_delay); +} + +static void be_log_sfp_info(struct be_adapter *adapter) +{ + int status; + + status = be_cmd_query_sfp_info(adapter); + if (!status) { + dev_err(&adapter->pdev->dev, + "Port %c: %s Vendor: %s part no: %s", + adapter->port_name, + be_misconfig_evt_port_state[adapter->phy_state], + adapter->phy.vendor_name, + adapter->phy.vendor_pn); + } + adapter->flags &= ~BE_FLAGS_PHY_MISCONFIGURED; +} + +static void be_worker(struct work_struct *work) +{ + struct be_adapter *adapter = + container_of(work, struct be_adapter, work.work); + struct be_rx_obj *rxo; + int i; + + if (be_physfn(adapter) && + MODULO(adapter->work_counter, adapter->be_get_temp_freq) == 0) + be_cmd_get_die_temperature(adapter); + + /* when interrupts are not yet enabled, just reap any pending + * mcc completions + */ + if (!netif_running(adapter->netdev)) { + local_bh_disable(); + be_process_mcc(adapter); + local_bh_enable(); + goto reschedule; + } + + if (!adapter->stats_cmd_sent) { + if (lancer_chip(adapter)) + lancer_cmd_get_pport_stats(adapter, + &adapter->stats_cmd); + else + be_cmd_get_stats(adapter, &adapter->stats_cmd); + } + + for_all_rx_queues(adapter, rxo, i) { + /* Replenish RX-queues starved due to memory + * allocation failures. + */ + if (rxo->rx_post_starved) + be_post_rx_frags(rxo, GFP_KERNEL, MAX_RX_POST); + } + + /* EQ-delay update for Skyhawk is done while notifying EQ */ + if (!skyhawk_chip(adapter)) + be_eqd_update(adapter, false); + + if (adapter->flags & BE_FLAGS_PHY_MISCONFIGURED) + be_log_sfp_info(adapter); + +reschedule: + adapter->work_counter++; + queue_delayed_work(be_wq, &adapter->work, msecs_to_jiffies(1000)); +} + +static void be_unmap_pci_bars(struct be_adapter *adapter) +{ + if (adapter->csr) + pci_iounmap(adapter->pdev, adapter->csr); + if (adapter->db) + pci_iounmap(adapter->pdev, adapter->db); + if (adapter->pcicfg && adapter->pcicfg_mapped) + pci_iounmap(adapter->pdev, adapter->pcicfg); +} + +static int db_bar(struct be_adapter *adapter) +{ + if (lancer_chip(adapter) || be_virtfn(adapter)) + return 0; + else + return 4; +} + +static int be_roce_map_pci_bars(struct be_adapter *adapter) +{ + if (skyhawk_chip(adapter)) { + adapter->roce_db.size = 4096; + adapter->roce_db.io_addr = pci_resource_start(adapter->pdev, + db_bar(adapter)); + adapter->roce_db.total_size = pci_resource_len(adapter->pdev, + db_bar(adapter)); + } + return 0; +} + +static int be_map_pci_bars(struct be_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + u8 __iomem *addr; + u32 sli_intf; + + pci_read_config_dword(adapter->pdev, SLI_INTF_REG_OFFSET, &sli_intf); + adapter->sli_family = (sli_intf & SLI_INTF_FAMILY_MASK) >> + SLI_INTF_FAMILY_SHIFT; + adapter->virtfn = (sli_intf & SLI_INTF_FT_MASK) ? 1 : 0; + + if (BEx_chip(adapter) && be_physfn(adapter)) { + adapter->csr = pci_iomap(pdev, 2, 0); + if (!adapter->csr) + return -ENOMEM; + } + + addr = pci_iomap(pdev, db_bar(adapter), 0); + if (!addr) + goto pci_map_err; + adapter->db = addr; + + if (skyhawk_chip(adapter) || BEx_chip(adapter)) { + if (be_physfn(adapter)) { + /* PCICFG is the 2nd BAR in BE2 */ + addr = pci_iomap(pdev, BE2_chip(adapter) ? 
1 : 0, 0); + if (!addr) + goto pci_map_err; + adapter->pcicfg = addr; + adapter->pcicfg_mapped = true; + } else { + adapter->pcicfg = adapter->db + SRIOV_VF_PCICFG_OFFSET; + adapter->pcicfg_mapped = false; + } + } + + be_roce_map_pci_bars(adapter); + return 0; + +pci_map_err: + dev_err(&pdev->dev, "Error in mapping PCI BARs\n"); + be_unmap_pci_bars(adapter); + return -ENOMEM; +} + +static void be_drv_cleanup(struct be_adapter *adapter) +{ + struct be_dma_mem *mem = &adapter->mbox_mem_alloced; + struct device *dev = &adapter->pdev->dev; + + if (mem->va) + dma_free_coherent(dev, mem->size, mem->va, mem->dma); + + mem = &adapter->rx_filter; + if (mem->va) + dma_free_coherent(dev, mem->size, mem->va, mem->dma); + + mem = &adapter->stats_cmd; + if (mem->va) + dma_free_coherent(dev, mem->size, mem->va, mem->dma); +} + +/* Allocate and initialize various fields in be_adapter struct */ +static int be_drv_init(struct be_adapter *adapter) +{ + struct be_dma_mem *mbox_mem_alloc = &adapter->mbox_mem_alloced; + struct be_dma_mem *mbox_mem_align = &adapter->mbox_mem; + struct be_dma_mem *rx_filter = &adapter->rx_filter; + struct be_dma_mem *stats_cmd = &adapter->stats_cmd; + struct device *dev = &adapter->pdev->dev; + int status = 0; + + mbox_mem_alloc->size = sizeof(struct be_mcc_mailbox) + 16; + mbox_mem_alloc->va = dma_zalloc_coherent(dev, mbox_mem_alloc->size, + &mbox_mem_alloc->dma, + GFP_KERNEL); + if (!mbox_mem_alloc->va) + return -ENOMEM; + + mbox_mem_align->size = sizeof(struct be_mcc_mailbox); + mbox_mem_align->va = PTR_ALIGN(mbox_mem_alloc->va, 16); + mbox_mem_align->dma = PTR_ALIGN(mbox_mem_alloc->dma, 16); + + rx_filter->size = sizeof(struct be_cmd_req_rx_filter); + rx_filter->va = dma_zalloc_coherent(dev, rx_filter->size, + &rx_filter->dma, GFP_KERNEL); + if (!rx_filter->va) { + status = -ENOMEM; + goto free_mbox; + } + + if (lancer_chip(adapter)) + stats_cmd->size = sizeof(struct lancer_cmd_req_pport_stats); + else if (BE2_chip(adapter)) + stats_cmd->size = sizeof(struct be_cmd_req_get_stats_v0); + else if (BE3_chip(adapter)) + stats_cmd->size = sizeof(struct be_cmd_req_get_stats_v1); + else + stats_cmd->size = sizeof(struct be_cmd_req_get_stats_v2); + stats_cmd->va = dma_zalloc_coherent(dev, stats_cmd->size, + &stats_cmd->dma, GFP_KERNEL); + if (!stats_cmd->va) { + status = -ENOMEM; + goto free_rx_filter; + } + + mutex_init(&adapter->mbox_lock); + mutex_init(&adapter->mcc_lock); + mutex_init(&adapter->rx_filter_lock); + spin_lock_init(&adapter->mcc_cq_lock); + init_completion(&adapter->et_cmd_compl); + + pci_save_state(adapter->pdev); + + INIT_DELAYED_WORK(&adapter->work, be_worker); + + adapter->error_recovery.recovery_state = ERR_RECOVERY_ST_NONE; + adapter->error_recovery.resched_delay = 0; + INIT_DELAYED_WORK(&adapter->error_recovery.err_detection_work, + be_err_detection_task); + + adapter->rx_fc = true; + adapter->tx_fc = true; + + /* Must be a power of 2 or else MODULO will BUG_ON */ + adapter->be_get_temp_freq = 64; + + INIT_LIST_HEAD(&adapter->vxlan_port_list); + return 0; + +free_rx_filter: + dma_free_coherent(dev, rx_filter->size, rx_filter->va, rx_filter->dma); +free_mbox: + dma_free_coherent(dev, mbox_mem_alloc->size, mbox_mem_alloc->va, + mbox_mem_alloc->dma); + return status; +} + +static void be_remove(struct pci_dev *pdev) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + + if (!adapter) + return; + + be_roce_dev_remove(adapter); + be_intr_set(adapter, false); + + be_cancel_err_detection(adapter); + + unregister_netdev(adapter->netdev); + + be_clear(adapter); + 
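+	/* The function reset below is skipped while VFs are still assigned to
+	 * guests, since resetting the PF here would pull resources out from
+	 * under the assigned VFs.
+	 */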
+ if (!pci_vfs_assigned(adapter->pdev)) + be_cmd_reset_function(adapter); + + /* tell fw we're done with firing cmds */ + be_cmd_fw_clean(adapter); + + be_unmap_pci_bars(adapter); + be_drv_cleanup(adapter); + + pci_disable_pcie_error_reporting(pdev); + + pci_release_regions(pdev); + pci_disable_device(pdev); + + free_netdev(adapter->netdev); +} + +static ssize_t be_hwmon_show_temp(struct device *dev, + struct device_attribute *dev_attr, + char *buf) +{ + struct be_adapter *adapter = dev_get_drvdata(dev); + + /* Unit: millidegree Celsius */ + if (adapter->hwmon_info.be_on_die_temp == BE_INVALID_DIE_TEMP) + return -EIO; + else + return sprintf(buf, "%u\n", + adapter->hwmon_info.be_on_die_temp * 1000); +} + +static SENSOR_DEVICE_ATTR(temp1_input, 0444, + be_hwmon_show_temp, NULL, 1); + +static struct attribute *be_hwmon_attrs[] = { + &sensor_dev_attr_temp1_input.dev_attr.attr, + NULL +}; + +ATTRIBUTE_GROUPS(be_hwmon); + +static char *mc_name(struct be_adapter *adapter) +{ + char *str = ""; /* default */ + + switch (adapter->mc_type) { + case UMC: + str = "UMC"; + break; + case FLEX10: + str = "FLEX10"; + break; + case vNIC1: + str = "vNIC-1"; + break; + case nPAR: + str = "nPAR"; + break; + case UFP: + str = "UFP"; + break; + case vNIC2: + str = "vNIC-2"; + break; + default: + str = ""; + } + + return str; +} + +static inline char *func_name(struct be_adapter *adapter) +{ + return be_physfn(adapter) ? "PF" : "VF"; +} + +static inline char *nic_name(struct pci_dev *pdev) +{ + switch (pdev->device) { + case OC_DEVICE_ID1: + return OC_NAME; + case OC_DEVICE_ID2: + return OC_NAME_BE; + case OC_DEVICE_ID3: + case OC_DEVICE_ID4: + return OC_NAME_LANCER; + case BE_DEVICE_ID2: + return BE3_NAME; + case OC_DEVICE_ID5: + case OC_DEVICE_ID6: + return OC_NAME_SH; + default: + return BE_NAME; + } +} + +static int be_probe(struct pci_dev *pdev, const struct pci_device_id *pdev_id) +{ + struct be_adapter *adapter; + struct net_device *netdev; + int status = 0; + + dev_info(&pdev->dev, "%s version is %s\n", DRV_NAME, DRV_VER); + + status = pci_enable_device(pdev); + if (status) + goto do_none; + + status = pci_request_regions(pdev, DRV_NAME); + if (status) + goto disable_dev; + pci_set_master(pdev); + + netdev = alloc_etherdev_mqs(sizeof(*adapter), MAX_TX_QS, MAX_RX_QS); + if (!netdev) { + status = -ENOMEM; + goto rel_reg; + } + adapter = netdev_priv(netdev); + adapter->pdev = pdev; + pci_set_drvdata(pdev, adapter); + adapter->netdev = netdev; + SET_NETDEV_DEV(netdev, &pdev->dev); + + status = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (!status) { + netdev->features |= NETIF_F_HIGHDMA; + } else { + status = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (status) { + dev_err(&pdev->dev, "Could not set PCI DMA Mask\n"); + goto free_netdev; + } + } + + status = pci_enable_pcie_error_reporting(pdev); + if (!status) + dev_info(&pdev->dev, "PCIe error reporting enabled\n"); + + status = be_map_pci_bars(adapter); + if (status) + goto free_netdev; + + status = be_drv_init(adapter); + if (status) + goto unmap_bars; + + status = be_setup(adapter); + if (status) + goto drv_cleanup; + + be_netdev_init(netdev); + status = register_netdev(netdev); + if (status != 0) + goto unsetup; + + be_roce_dev_add(adapter); + + be_schedule_err_detection(adapter, ERR_DETECTION_DELAY); + adapter->error_recovery.probe_time = jiffies; + + /* On Die temperature not supported for VF. 
*/ + if (be_physfn(adapter) && IS_ENABLED(CONFIG_BE2NET_HWMON)) { + adapter->hwmon_info.hwmon_dev = + devm_hwmon_device_register_with_groups(&pdev->dev, + DRV_NAME, + adapter, + be_hwmon_groups); + adapter->hwmon_info.be_on_die_temp = BE_INVALID_DIE_TEMP; + } + + dev_info(&pdev->dev, "%s: %s %s port %c\n", nic_name(pdev), + func_name(adapter), mc_name(adapter), adapter->port_name); + + return 0; + +unsetup: + be_clear(adapter); +drv_cleanup: + be_drv_cleanup(adapter); +unmap_bars: + be_unmap_pci_bars(adapter); +free_netdev: + free_netdev(netdev); +rel_reg: + pci_release_regions(pdev); +disable_dev: + pci_disable_device(pdev); +do_none: + dev_err(&pdev->dev, "%s initialization failed\n", nic_name(pdev)); + return status; +} + +static int be_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + + be_intr_set(adapter, false); + be_cancel_err_detection(adapter); + + be_cleanup(adapter); + + pci_save_state(pdev); + pci_disable_device(pdev); + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + return 0; +} + +static int be_pci_resume(struct pci_dev *pdev) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + int status = 0; + + status = pci_enable_device(pdev); + if (status) + return status; + + pci_restore_state(pdev); + + status = be_resume(adapter); + if (status) + return status; + + be_schedule_err_detection(adapter, ERR_DETECTION_DELAY); + + return 0; +} + +/* + * An FLR will stop BE from DMAing any data. + */ +static void be_shutdown(struct pci_dev *pdev) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + + if (!adapter) + return; + + be_roce_dev_shutdown(adapter); + cancel_delayed_work_sync(&adapter->work); + be_cancel_err_detection(adapter); + + netif_device_detach(adapter->netdev); + + be_cmd_reset_function(adapter); + + pci_disable_device(pdev); +} + +static pci_ers_result_t be_eeh_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + + dev_err(&adapter->pdev->dev, "EEH error detected\n"); + + be_roce_dev_remove(adapter); + + if (!be_check_error(adapter, BE_ERROR_EEH)) { + be_set_error(adapter, BE_ERROR_EEH); + + be_cancel_err_detection(adapter); + + be_cleanup(adapter); + } + + if (state == pci_channel_io_perm_failure) + return PCI_ERS_RESULT_DISCONNECT; + + pci_disable_device(pdev); + + /* The error could cause the FW to trigger a flash debug dump. + * Resetting the card while flash dump is in progress + * can cause it not to recover; wait for it to finish. + * Wait only for first function as it is needed only once per + * adapter. 
+ */ + if (pdev->devfn == 0) + ssleep(30); + + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t be_eeh_reset(struct pci_dev *pdev) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + int status; + + dev_info(&adapter->pdev->dev, "EEH reset\n"); + + status = pci_enable_device(pdev); + if (status) + return PCI_ERS_RESULT_DISCONNECT; + + pci_set_master(pdev); + pci_restore_state(pdev); + + /* Check if card is ok and fw is ready */ + dev_info(&adapter->pdev->dev, + "Waiting for FW to be ready after EEH reset\n"); + status = be_fw_wait_ready(adapter); + if (status) + return PCI_ERS_RESULT_DISCONNECT; + + pci_cleanup_aer_uncorrect_error_status(pdev); + be_clear_error(adapter, BE_CLEAR_ALL); + return PCI_ERS_RESULT_RECOVERED; +} + +static void be_eeh_resume(struct pci_dev *pdev) +{ + int status = 0; + struct be_adapter *adapter = pci_get_drvdata(pdev); + + dev_info(&adapter->pdev->dev, "EEH resume\n"); + + pci_save_state(pdev); + + status = be_resume(adapter); + if (status) + goto err; + + be_roce_dev_add(adapter); + + be_schedule_err_detection(adapter, ERR_DETECTION_DELAY); + return; +err: + dev_err(&adapter->pdev->dev, "EEH resume failed\n"); +} + +static int be_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct be_adapter *adapter = pci_get_drvdata(pdev); + struct be_resources vft_res = {0}; + int status; + + if (!num_vfs) + be_vf_clear(adapter); + + adapter->num_vfs = num_vfs; + + if (adapter->num_vfs == 0 && pci_vfs_assigned(pdev)) { + dev_warn(&pdev->dev, + "Cannot disable VFs while they are assigned\n"); + return -EBUSY; + } + + /* When the HW is in SRIOV capable configuration, the PF-pool resources + * are equally distributed across the max-number of VFs. The user may + * request only a subset of the max-vfs to be enabled. + * Based on num_vfs, redistribute the resources across num_vfs so that + * each VF will have access to more number of resources. + * This facility is not available in BE3 FW. + * Also, this is done by FW in Lancer chip. + */ + if (skyhawk_chip(adapter) && !pci_num_vf(pdev)) { + be_calculate_vf_res(adapter, adapter->num_vfs, + &vft_res); + status = be_cmd_set_sriov_config(adapter, adapter->pool_res, + adapter->num_vfs, &vft_res); + if (status) + dev_err(&pdev->dev, + "Failed to optimize SR-IOV resources\n"); + } + + status = be_get_resources(adapter); + if (status) + return be_cmd_status(status); + + /* Updating real_num_tx/rx_queues() requires rtnl_lock() */ + rtnl_lock(); + status = be_update_queues(adapter); + rtnl_unlock(); + if (status) + return be_cmd_status(status); + + if (adapter->num_vfs) + status = be_vf_setup(adapter); + + if (!status) + return adapter->num_vfs; + + return 0; +} + +static const struct pci_error_handlers be_eeh_handlers = { + .error_detected = be_eeh_err_detected, + .slot_reset = be_eeh_reset, + .resume = be_eeh_resume, +}; + +static struct pci_driver be_driver = { + .name = DRV_NAME, + .id_table = be_dev_ids, + .probe = be_probe, + .remove = be_remove, + .suspend = be_suspend, + .resume = be_pci_resume, + .shutdown = be_shutdown, + .sriov_configure = be_pci_sriov_configure, + .err_handler = &be_eeh_handlers +}; + +static int __init be_init_module(void) +{ + int status; + + if (rx_frag_size != 8192 && rx_frag_size != 4096 && + rx_frag_size != 2048) { + printk(KERN_WARNING DRV_NAME + " : Module param rx_frag_size must be 2048/4096/8192." 
+ " Using 2048\n"); + rx_frag_size = 2048; + } + + if (num_vfs > 0) { + pr_info(DRV_NAME " : Module param num_vfs is obsolete."); + pr_info(DRV_NAME " : Use sysfs method to enable VFs\n"); + } + + be_wq = create_singlethread_workqueue("be_wq"); + if (!be_wq) { + pr_warn(DRV_NAME "workqueue creation failed\n"); + return -1; + } + + be_err_recovery_workq = + create_singlethread_workqueue("be_err_recover"); + if (!be_err_recovery_workq) + pr_warn(DRV_NAME "Could not create error recovery workqueue\n"); + + status = pci_register_driver(&be_driver); + if (status) { + destroy_workqueue(be_wq); + be_destroy_err_recovery_workq(); + } + return status; +} +module_init(be_init_module); + +static void __exit be_exit_module(void) +{ + pci_unregister_driver(&be_driver); + + be_destroy_err_recovery_workq(); + + if (be_wq) + destroy_workqueue(be_wq); +} +module_exit(be_exit_module); diff --git a/drivers/net/ethernet/emulex/benet/be_roce.c b/drivers/net/ethernet/emulex/benet/be_roce.c new file mode 100644 index 0000000..05989aa --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_roce.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2005 - 2016 Broadcom + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#include +#include +#include +#include + +#include "be.h" +#include "be_cmds.h" + +static struct ocrdma_driver *ocrdma_drv; +static LIST_HEAD(be_adapter_list); +static DEFINE_MUTEX(be_adapter_list_lock); + +static void _be_roce_dev_add(struct be_adapter *adapter) +{ + struct be_dev_info dev_info; + int i, num_vec; + struct pci_dev *pdev = adapter->pdev; + + if (!ocrdma_drv) + return; + + if (ocrdma_drv->be_abi_version != BE_ROCE_ABI_VERSION) { + dev_warn(&pdev->dev, "Cannot initialize RoCE due to ocrdma ABI mismatch\n"); + return; + } + + if (pdev->device == OC_DEVICE_ID5) { + /* only msix is supported on these devices */ + if (!msix_enabled(adapter)) + return; + /* DPP region address and length */ + dev_info.dpp_unmapped_addr = pci_resource_start(pdev, 2); + dev_info.dpp_unmapped_len = pci_resource_len(pdev, 2); + } else { + dev_info.dpp_unmapped_addr = 0; + dev_info.dpp_unmapped_len = 0; + } + dev_info.pdev = adapter->pdev; + dev_info.db = adapter->db; + dev_info.unmapped_db = adapter->roce_db.io_addr; + dev_info.db_page_size = adapter->roce_db.size; + dev_info.db_total_size = adapter->roce_db.total_size; + dev_info.netdev = adapter->netdev; + memcpy(dev_info.mac_addr, adapter->netdev->dev_addr, ETH_ALEN); + dev_info.dev_family = adapter->sli_family; + if (msix_enabled(adapter)) { + /* provide all the vectors, so that EQ creation response + * can decide which one to use. + */ + num_vec = adapter->num_msix_vec + adapter->num_msix_roce_vec; + dev_info.intr_mode = BE_INTERRUPT_MODE_MSIX; + dev_info.msix.num_vectors = min(num_vec, MAX_MSIX_VECTORS); + /* provide start index of the vector, + * so in case of linear usage, + * it can use the base as starting point. 
+ */ + dev_info.msix.start_vector = adapter->num_evt_qs; + for (i = 0; i < dev_info.msix.num_vectors; i++) { + dev_info.msix.vector_list[i] = + adapter->msix_entries[i].vector; + } + } else { + dev_info.msix.num_vectors = 0; + dev_info.intr_mode = BE_INTERRUPT_MODE_INTX; + } + adapter->ocrdma_dev = ocrdma_drv->add(&dev_info); +} + +void be_roce_dev_add(struct be_adapter *adapter) +{ + if (be_roce_supported(adapter)) { + INIT_LIST_HEAD(&adapter->entry); + mutex_lock(&be_adapter_list_lock); + list_add_tail(&adapter->entry, &be_adapter_list); + + /* invoke add() routine of roce driver only if + * valid driver registered with add method and add() is not yet + * invoked on a given adapter. + */ + _be_roce_dev_add(adapter); + mutex_unlock(&be_adapter_list_lock); + } +} + +static void _be_roce_dev_remove(struct be_adapter *adapter) +{ + if (ocrdma_drv && ocrdma_drv->remove && adapter->ocrdma_dev) + ocrdma_drv->remove(adapter->ocrdma_dev); + adapter->ocrdma_dev = NULL; +} + +void be_roce_dev_remove(struct be_adapter *adapter) +{ + if (be_roce_supported(adapter)) { + mutex_lock(&be_adapter_list_lock); + _be_roce_dev_remove(adapter); + list_del(&adapter->entry); + mutex_unlock(&be_adapter_list_lock); + } +} + +void be_roce_dev_shutdown(struct be_adapter *adapter) +{ + if (be_roce_supported(adapter)) { + mutex_lock(&be_adapter_list_lock); + if (ocrdma_drv && adapter->ocrdma_dev && + ocrdma_drv->state_change_handler) + ocrdma_drv->state_change_handler(adapter->ocrdma_dev, + BE_DEV_SHUTDOWN); + mutex_unlock(&be_adapter_list_lock); + } +} + +int be_roce_register_driver(struct ocrdma_driver *drv) +{ + struct be_adapter *dev; + + mutex_lock(&be_adapter_list_lock); + if (ocrdma_drv) { + mutex_unlock(&be_adapter_list_lock); + return -EINVAL; + } + ocrdma_drv = drv; + list_for_each_entry(dev, &be_adapter_list, entry) { + _be_roce_dev_add(dev); + } + mutex_unlock(&be_adapter_list_lock); + return 0; +} +EXPORT_SYMBOL(be_roce_register_driver); + +void be_roce_unregister_driver(struct ocrdma_driver *drv) +{ + struct be_adapter *dev; + + mutex_lock(&be_adapter_list_lock); + list_for_each_entry(dev, &be_adapter_list, entry) { + if (dev->ocrdma_dev) + _be_roce_dev_remove(dev); + } + ocrdma_drv = NULL; + mutex_unlock(&be_adapter_list_lock); +} +EXPORT_SYMBOL(be_roce_unregister_driver); diff --git a/drivers/net/ethernet/emulex/benet/be_roce.h b/drivers/net/ethernet/emulex/benet/be_roce.h new file mode 100644 index 0000000..e51719a --- /dev/null +++ b/drivers/net/ethernet/emulex/benet/be_roce.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2005 - 2016 Broadcom + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. The full GNU General + * Public License is included in this distribution in the file called COPYING. 
+ * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef BE_ROCE_H +#define BE_ROCE_H + +#include +#include + +#define BE_ROCE_ABI_VERSION 1 + +struct ocrdma_dev; + +enum be_interrupt_mode { + BE_INTERRUPT_MODE_MSIX = 0, + BE_INTERRUPT_MODE_INTX = 1, + BE_INTERRUPT_MODE_MSI = 2, +}; + +#define MAX_MSIX_VECTORS 32 +struct be_dev_info { + u8 __iomem *db; + u64 unmapped_db; + u32 db_page_size; + u32 db_total_size; + u64 dpp_unmapped_addr; + u32 dpp_unmapped_len; + struct pci_dev *pdev; + struct net_device *netdev; + u8 mac_addr[ETH_ALEN]; + u32 dev_family; + enum be_interrupt_mode intr_mode; + struct { + int num_vectors; + int start_vector; + u32 vector_list[MAX_MSIX_VECTORS]; + } msix; +}; + +/* ocrdma driver register's the callback functions with nic driver. */ +struct ocrdma_driver { + unsigned char name[32]; + u32 be_abi_version; + struct ocrdma_dev *(*add) (struct be_dev_info *dev_info); + void (*remove) (struct ocrdma_dev *); + void (*state_change_handler) (struct ocrdma_dev *, u32 new_state); +}; + +enum be_roce_event { + BE_DEV_SHUTDOWN = 2 +}; + +/* APIs for RoCE driver to register callback handlers, + * which will be invoked when device is added, removed, ifup, ifdown + */ +int be_roce_register_driver(struct ocrdma_driver *drv); +void be_roce_unregister_driver(struct ocrdma_driver *drv); + +/* API for RoCE driver to issue mailbox commands */ +int be_roce_mcc_cmd(void *netdev_handle, void *wrb_payload, + int wrb_payload_size, u16 *cmd_status, u16 *ext_status); + +#endif /* BE_ROCE_H */ diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig new file mode 100644 index 0000000..14d287b --- /dev/null +++ b/drivers/net/ethernet/intel/Kconfig @@ -0,0 +1,283 @@ +# +# Intel network device configuration +# + +config NET_VENDOR_INTEL + bool "Intel devices" + default y + ---help--- + If you have a network (Ethernet) card belonging to this class, say Y. + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about Intel cards. If you say Y, you will be asked for + your specific card in the following questions. + +if NET_VENDOR_INTEL + +config E100 + tristate "Intel(R) PRO/100+ support" + depends on PCI + select MII + ---help--- + This driver supports Intel(R) PRO/100 family of adapters. + To verify that your adapter is supported, find the board ID number + on the adapter. Look for a label that has a barcode and a number + in the format 123456-001 (six digits hyphen three digits). + + Use the above information and the Adapter & Driver ID Guide that + can be located at: + + + + to identify the adapter. + + More specific information on configuring the driver is in + . + + To compile this driver as a module, choose M here. The module + will be called e100. + +config E1000 + tristate "Intel(R) PRO/1000 Gigabit Ethernet support" + depends on PCI + ---help--- + This driver supports Intel(R) PRO/1000 gigabit ethernet family of + adapters. For more information on how to identify your adapter, go + to the Adapter & Driver ID Guide that can be located at: + + + + More specific information on configuring the driver is in + . + + To compile this driver as a module, choose M here. The module + will be called e1000. 
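+# Illustrative only (not taken from this patch): on a typical distribution
+# config these drivers are built as modules, e.g.
+#   CONFIG_E100=m
+#   CONFIG_E1000=m
+#   CONFIG_E1000E=m
+# and the resulting module is loaded with "modprobe e1000e" as described in
+# the help text above.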
+ +config E1000E + tristate "Intel(R) PRO/1000 PCI-Express Gigabit Ethernet support" + depends on PCI && (!SPARC32 || BROKEN) + select CRC32 + imply PTP_1588_CLOCK + ---help--- + This driver supports the PCI-Express Intel(R) PRO/1000 gigabit + ethernet family of adapters. For PCI or PCI-X e1000 adapters, + use the regular e1000 driver For more information on how to + identify your adapter, go to the Adapter & Driver ID Guide that + can be located at: + + + + To compile this driver as a module, choose M here. The module + will be called e1000e. + +config E1000E_HWTS + bool "Support HW cross-timestamp on PCH devices" + default y + depends on E1000E && X86 + ---help--- + Say Y to enable hardware supported cross-timestamping on PCH + devices. The cross-timestamp is available through the PTP clock + driver precise cross-timestamp ioctl (PTP_SYS_OFFSET_PRECISE). + +config IGB + tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support" + depends on PCI + imply PTP_1588_CLOCK + select I2C + select I2C_ALGOBIT + ---help--- + This driver supports Intel(R) 82575/82576 gigabit ethernet family of + adapters. For more information on how to identify your adapter, go + to the Adapter & Driver ID Guide that can be located at: + + + + More specific information on configuring the driver is in + . + + To compile this driver as a module, choose M here. The module + will be called igb. + +config IGB_HWMON + bool "Intel(R) PCI-Express Gigabit adapters HWMON support" + default y + depends on IGB && HWMON && !(IGB=y && HWMON=m) + ---help--- + Say Y if you want to expose thermal sensor data on Intel devices. + + Some of our devices contain thermal sensors, both external and internal. + This data is available via the hwmon sysfs interface and exposes + the onboard sensors. + +config IGB_DCA + bool "Direct Cache Access (DCA) Support" + default y + depends on IGB && DCA && !(IGB=y && DCA=m) + ---help--- + Say Y here if you want to use Direct Cache Access (DCA) in the + driver. DCA is a method for warming the CPU cache before data + is used, with the intent of lessening the impact of cache misses. + +config IGBVF + tristate "Intel(R) 82576 Virtual Function Ethernet support" + depends on PCI + ---help--- + This driver supports Intel(R) 82576 virtual functions. For more + information on how to identify your adapter, go to the Adapter & + Driver ID Guide that can be located at: + + + + More specific information on configuring the driver is in + . + + To compile this driver as a module, choose M here. The module + will be called igbvf. + +config IXGB + tristate "Intel(R) PRO/10GbE support" + depends on PCI + ---help--- + This driver supports Intel(R) PRO/10GbE family of adapters for + PCI-X type cards. For PCI-E type cards, use the "ixgbe" driver + instead. For more information on how to identify your adapter, go + to the Adapter & Driver ID Guide that can be located at: + + + + More specific information on configuring the driver is in + . + + To compile this driver as a module, choose M here. The module + will be called ixgb. + +config IXGBE + tristate "Intel(R) 10GbE PCI Express adapters support" + depends on PCI + select MDIO + imply PTP_1588_CLOCK + ---help--- + This driver supports Intel(R) 10GbE PCI Express family of + adapters. For more information on how to identify your adapter, go + to the Adapter & Driver ID Guide that can be located at: + + + + To compile this driver as a module, choose M here. The module + will be called ixgbe. 
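+# The "depends on IXGBE && HWMON && !(IXGBE=y && HWMON=m)" pattern used by
+# the bool options below keeps a built-in driver from referencing symbols
+# that would only exist in a modular HWMON/DCA core.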
+ +config IXGBE_HWMON + bool "Intel(R) 10GbE PCI Express adapters HWMON support" + default y + depends on IXGBE && HWMON && !(IXGBE=y && HWMON=m) + ---help--- + Say Y if you want to expose the thermal sensor data on some of + our cards, via a hwmon sysfs interface. + +config IXGBE_DCA + bool "Direct Cache Access (DCA) Support" + default y + depends on IXGBE && DCA && !(IXGBE=y && DCA=m) + ---help--- + Say Y here if you want to use Direct Cache Access (DCA) in the + driver. DCA is a method for warming the CPU cache before data + is used, with the intent of lessening the impact of cache misses. + +config IXGBE_DCB + bool "Data Center Bridging (DCB) Support" + default n + depends on IXGBE && DCB + ---help--- + Say Y here if you want to use Data Center Bridging (DCB) in the + driver. + + If unsure, say N. + +config IXGBEVF + tristate "Intel(R) 10GbE PCI Express Virtual Function Ethernet support" + depends on PCI_MSI + ---help--- + This driver supports Intel(R) PCI Express virtual functions for the + Intel(R) ixgbe driver. For more information on how to identify your + adapter, go to the Adapter & Driver ID Guide that can be located at: + + + + More specific information on configuring the driver is in + . + + To compile this driver as a module, choose M here. The module + will be called ixgbevf. MSI-X interrupt support is required + for this driver to work correctly. + +config I40E + tristate "Intel(R) Ethernet Controller XL710 Family support" + imply PTP_1588_CLOCK + depends on PCI + ---help--- + This driver supports Intel(R) Ethernet Controller XL710 Family of + devices. For more information on how to identify your adapter, go + to the Adapter & Driver ID Guide that can be located at: + + + + To compile this driver as a module, choose M here. The module + will be called i40e. + +config I40E_DCB + bool "Data Center Bridging (DCB) Support" + default n + depends on I40E && DCB + ---help--- + Say Y here if you want to use Data Center Bridging (DCB) in the + driver. + + If unsure, say N. + +config I40EVF + tristate "Intel(R) Ethernet Adaptive Virtual Function support" + depends on PCI_MSI + ---help--- + This driver supports virtual functions for Intel XL710, + X710, X722, and all devices advertising support for Intel + Ethernet Adaptive Virtual Function devices. For more + information on how to identify your adapter, go to the Adapter + & Driver ID Guide that can be located at: + + + + To compile this driver as a module, choose M here. The module + will be called i40evf. MSI-X interrupt support is required + for this driver to work correctly. + +config ICE + tristate "Intel(R) Ethernet Connection E800 Series Support" + default n + depends on PCI_MSI + ---help--- + This driver supports Intel(R) Ethernet Connection E800 Series of + devices. For more information on how to identify your adapter, go + to the Adapter & Driver ID Guide that can be located at: + + + + To compile this driver as a module, choose M here. The module + will be called ice. + +config FM10K + tristate "Intel(R) FM10000 Ethernet Switch Host Interface Support" + default n + depends on PCI_MSI + imply PTP_1588_CLOCK + ---help--- + This driver supports Intel(R) FM10000 Ethernet Switch Host + Interface. For more information on how to identify your adapter, + go to the Adapter & Driver ID Guide that can be located at: + + + + To compile this driver as a module, choose M here. The module + will be called fm10k. 
MSI-X interrupt support is required + +endif # NET_VENDOR_INTEL diff --git a/drivers/net/ethernet/intel/i40e/Makefile b/drivers/net/ethernet/intel/i40e/Makefile new file mode 100644 index 0000000..7543776 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/Makefile @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: GPL-2.0 +################################################################################ +# +# Intel Ethernet Controller XL710 Family Linux Driver +# Copyright(c) 2013 - 2015 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms and conditions of the GNU General Public License, +# version 2, as published by the Free Software Foundation. +# +# This program is distributed in the hope it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# The full GNU General Public License is included in this distribution in +# the file called "COPYING". +# +# Contact Information: +# e1000-devel Mailing List +# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 +# +################################################################################ + +# +# Makefile for the Intel(R) Ethernet Connection XL710 (i40e.ko) driver +# + +ccflags-y += -I$(src) +subdir-ccflags-y += -I$(src) + +obj-$(CONFIG_I40E) += i40e.o + +i40e-objs := i40e_main.o \ + i40e_ethtool.o \ + i40e_adminq.o \ + i40e_common.o \ + i40e_hmc.o \ + i40e_lan_hmc.o \ + i40e_nvm.o \ + i40e_debugfs.o \ + i40e_diag.o \ + i40e_txrx.o \ + i40e_ptp.o \ + i40e_client.o \ + i40e_virtchnl_pf.o + +i40e-$(CONFIG_I40E_DCB) += i40e_dcb.o i40e_dcb_nl.o diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h new file mode 100644 index 0000000..a44139c --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -0,0 +1,1120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_H_ +#define _I40E_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "i40e_type.h" +#include "i40e_prototype.h" +#include "i40e_client.h" +#include +#include "i40e_virtchnl_pf.h" +#include "i40e_txrx.h" +#include "i40e_dcb.h" + +/* Useful i40e defaults */ +#define I40E_MAX_VEB 16 + +#define I40E_MAX_NUM_DESCRIPTORS 4096 +#define I40E_MAX_CSR_SPACE (4 * 1024 * 1024 - 64 * 1024) +#define I40E_DEFAULT_NUM_DESCRIPTORS 512 +#define I40E_REQ_DESCRIPTOR_MULTIPLE 32 +#define I40E_MIN_NUM_DESCRIPTORS 64 +#define I40E_MIN_MSIX 2 +#define I40E_DEFAULT_NUM_VMDQ_VSI 8 /* max 256 VSIs */ +#define I40E_MIN_VSI_ALLOC 83 /* LAN, ATR, FCOE, 64 VF */ +/* max 16 qps */ +#define i40e_default_queues_per_vmdq(pf) \ + (((pf)->hw_features & I40E_HW_RSS_AQ_CAPABLE) ? 4 : 1) +#define I40E_DEFAULT_QUEUES_PER_VF 4 +#define I40E_MAX_VF_QUEUES 16 +#define I40E_DEFAULT_QUEUES_PER_TC 1 /* should be a power of 2 */ +#define i40e_pf_get_max_q_per_tc(pf) \ + (((pf)->hw_features & I40E_HW_128_QP_RSS_CAPABLE) ? 128 : 64) +#define I40E_FDIR_RING 0 +#define I40E_FDIR_RING_COUNT 32 +#define I40E_MAX_AQ_BUF_SIZE 4096 +#define I40E_AQ_LEN 256 +#define I40E_AQ_WORK_LIMIT 66 /* max number of VFs + a little */ +#define I40E_MAX_USER_PRIORITY 8 +#define I40E_DEFAULT_TRAFFIC_CLASS BIT(0) +#define I40E_DEFAULT_MSG_ENABLE 4 +#define I40E_QUEUE_WAIT_RETRY_LIMIT 10 +#define I40E_INT_NAME_STR_LEN (IFNAMSIZ + 16) + +#define I40E_NVM_VERSION_LO_SHIFT 0 +#define I40E_NVM_VERSION_LO_MASK (0xff << I40E_NVM_VERSION_LO_SHIFT) +#define I40E_NVM_VERSION_HI_SHIFT 12 +#define I40E_NVM_VERSION_HI_MASK (0xf << I40E_NVM_VERSION_HI_SHIFT) +#define I40E_OEM_VER_BUILD_MASK 0xffff +#define I40E_OEM_VER_PATCH_MASK 0xff +#define I40E_OEM_VER_BUILD_SHIFT 8 +#define I40E_OEM_VER_SHIFT 24 +#define I40E_PHY_DEBUG_ALL \ + (I40E_AQ_PHY_DEBUG_DISABLE_LINK_FW | \ + I40E_AQ_PHY_DEBUG_DISABLE_ALL_LINK_FW) + +#define I40E_OEM_EETRACK_ID 0xffffffff +#define I40E_OEM_GEN_SHIFT 24 +#define I40E_OEM_SNAP_MASK 0x00ff0000 +#define I40E_OEM_SNAP_SHIFT 16 +#define I40E_OEM_RELEASE_MASK 0x0000ffff + +/* The values in here are decimal coded as hex as is the case in the NVM map*/ +#define I40E_CURRENT_NVM_VERSION_HI 0x2 +#define I40E_CURRENT_NVM_VERSION_LO 0x40 + +#define I40E_RX_DESC(R, i) \ + (&(((union i40e_32byte_rx_desc *)((R)->desc))[i])) +#define I40E_TX_DESC(R, i) \ + (&(((struct i40e_tx_desc *)((R)->desc))[i])) +#define I40E_TX_CTXTDESC(R, i) \ + (&(((struct i40e_tx_context_desc *)((R)->desc))[i])) +#define I40E_TX_FDIRDESC(R, i) \ + (&(((struct i40e_filter_program_desc *)((R)->desc))[i])) + +/* default to trying for four seconds */ +#define I40E_TRY_LINK_TIMEOUT (4 * HZ) + +/* BW rate limiting */ +#define I40E_BW_CREDIT_DIVISOR 50 /* 50Mbps per BW credit */ +#define I40E_BW_MBPS_DIVISOR 125000 /* rate / (1000000 / 8) Mbps */ +#define I40E_MAX_BW_INACTIVE_ACCUM 4 /* accumulate 4 credits max */ + +/* driver state flags */ +enum i40e_state_t { + __I40E_TESTING, + __I40E_CONFIG_BUSY, + __I40E_CONFIG_DONE, + __I40E_DOWN, + __I40E_SERVICE_SCHED, + __I40E_ADMINQ_EVENT_PENDING, + __I40E_MDD_EVENT_PENDING, + __I40E_VFLR_EVENT_PENDING, + __I40E_RESET_RECOVERY_PENDING, + 
__I40E_MISC_IRQ_REQUESTED, + __I40E_RESET_INTR_RECEIVED, + __I40E_REINIT_REQUESTED, + __I40E_PF_RESET_REQUESTED, + __I40E_CORE_RESET_REQUESTED, + __I40E_GLOBAL_RESET_REQUESTED, + __I40E_EMP_RESET_REQUESTED, + __I40E_EMP_RESET_INTR_RECEIVED, + __I40E_SUSPENDED, + __I40E_PTP_TX_IN_PROGRESS, + __I40E_BAD_EEPROM, + __I40E_DOWN_REQUESTED, + __I40E_FD_FLUSH_REQUESTED, + __I40E_FD_ATR_AUTO_DISABLED, + __I40E_FD_SB_AUTO_DISABLED, + __I40E_RESET_FAILED, + __I40E_PORT_SUSPENDED, + __I40E_VF_DISABLE, + __I40E_MACVLAN_SYNC_PENDING, + __I40E_UDP_FILTER_SYNC_PENDING, + __I40E_TEMP_LINK_POLLING, + __I40E_CLIENT_SERVICE_REQUESTED, + __I40E_CLIENT_L2_CHANGE, + __I40E_CLIENT_RESET, + /* This must be last as it determines the size of the BITMAP */ + __I40E_STATE_SIZE__, +}; + +#define I40E_PF_RESET_FLAG BIT_ULL(__I40E_PF_RESET_REQUESTED) + +/* VSI state flags */ +enum i40e_vsi_state_t { + __I40E_VSI_DOWN, + __I40E_VSI_NEEDS_RESTART, + __I40E_VSI_SYNCING_FILTERS, + __I40E_VSI_OVERFLOW_PROMISC, + __I40E_VSI_REINIT_REQUESTED, + __I40E_VSI_DOWN_REQUESTED, + /* This must be last as it determines the size of the BITMAP */ + __I40E_VSI_STATE_SIZE__, +}; + +enum i40e_interrupt_policy { + I40E_INTERRUPT_BEST_CASE, + I40E_INTERRUPT_MEDIUM, + I40E_INTERRUPT_LOWEST +}; + +struct i40e_lump_tracking { + u16 num_entries; + u16 search_hint; + u16 list[0]; +#define I40E_PILE_VALID_BIT 0x8000 +#define I40E_IWARP_IRQ_PILE_ID (I40E_PILE_VALID_BIT - 2) +}; + +#define I40E_DEFAULT_ATR_SAMPLE_RATE 20 +#define I40E_FDIR_MAX_RAW_PACKET_SIZE 512 +#define I40E_FDIR_BUFFER_FULL_MARGIN 10 +#define I40E_FDIR_BUFFER_HEAD_ROOM 32 +#define I40E_FDIR_BUFFER_HEAD_ROOM_FOR_ATR (I40E_FDIR_BUFFER_HEAD_ROOM * 4) + +#define I40E_HKEY_ARRAY_SIZE ((I40E_PFQF_HKEY_MAX_INDEX + 1) * 4) +#define I40E_HLUT_ARRAY_SIZE ((I40E_PFQF_HLUT_MAX_INDEX + 1) * 4) +#define I40E_VF_HLUT_ARRAY_SIZE ((I40E_VFQF_HLUT1_MAX_INDEX + 1) * 4) + +enum i40e_fd_stat_idx { + I40E_FD_STAT_ATR, + I40E_FD_STAT_SB, + I40E_FD_STAT_ATR_TUNNEL, + I40E_FD_STAT_PF_COUNT +}; +#define I40E_FD_STAT_PF_IDX(pf_id) ((pf_id) * I40E_FD_STAT_PF_COUNT) +#define I40E_FD_ATR_STAT_IDX(pf_id) \ + (I40E_FD_STAT_PF_IDX(pf_id) + I40E_FD_STAT_ATR) +#define I40E_FD_SB_STAT_IDX(pf_id) \ + (I40E_FD_STAT_PF_IDX(pf_id) + I40E_FD_STAT_SB) +#define I40E_FD_ATR_TUNNEL_STAT_IDX(pf_id) \ + (I40E_FD_STAT_PF_IDX(pf_id) + I40E_FD_STAT_ATR_TUNNEL) + +/* The following structure contains the data parsed from the user-defined + * field of the ethtool_rx_flow_spec structure. 
+ */ +struct i40e_rx_flow_userdef { + bool flex_filter; + u16 flex_word; + u16 flex_offset; +}; + +struct i40e_fdir_filter { + struct hlist_node fdir_node; + /* filter ipnut set */ + u8 flow_type; + u8 ip4_proto; + /* TX packet view of src and dst */ + __be32 dst_ip; + __be32 src_ip; + __be16 src_port; + __be16 dst_port; + __be32 sctp_v_tag; + + /* Flexible data to match within the packet payload */ + __be16 flex_word; + u16 flex_offset; + bool flex_filter; + + /* filter control */ + u16 q_index; + u8 flex_off; + u8 pctype; + u16 dest_vsi; + u8 dest_ctl; + u8 fd_status; + u16 cnt_index; + u32 fd_id; +}; + +#define I40E_CLOUD_FIELD_OMAC 0x01 +#define I40E_CLOUD_FIELD_IMAC 0x02 +#define I40E_CLOUD_FIELD_IVLAN 0x04 +#define I40E_CLOUD_FIELD_TEN_ID 0x08 +#define I40E_CLOUD_FIELD_IIP 0x10 + +#define I40E_CLOUD_FILTER_FLAGS_OMAC I40E_CLOUD_FIELD_OMAC +#define I40E_CLOUD_FILTER_FLAGS_IMAC I40E_CLOUD_FIELD_IMAC +#define I40E_CLOUD_FILTER_FLAGS_IMAC_IVLAN (I40E_CLOUD_FIELD_IMAC | \ + I40E_CLOUD_FIELD_IVLAN) +#define I40E_CLOUD_FILTER_FLAGS_IMAC_TEN_ID (I40E_CLOUD_FIELD_IMAC | \ + I40E_CLOUD_FIELD_TEN_ID) +#define I40E_CLOUD_FILTER_FLAGS_OMAC_TEN_ID_IMAC (I40E_CLOUD_FIELD_OMAC | \ + I40E_CLOUD_FIELD_IMAC | \ + I40E_CLOUD_FIELD_TEN_ID) +#define I40E_CLOUD_FILTER_FLAGS_IMAC_IVLAN_TEN_ID (I40E_CLOUD_FIELD_IMAC | \ + I40E_CLOUD_FIELD_IVLAN | \ + I40E_CLOUD_FIELD_TEN_ID) +#define I40E_CLOUD_FILTER_FLAGS_IIP I40E_CLOUD_FIELD_IIP + +struct i40e_cloud_filter { + struct hlist_node cloud_node; + unsigned long cookie; + /* cloud filter input set follows */ + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + __be16 vlan_id; + u16 seid; /* filter control */ + __be16 dst_port; + __be16 src_port; + u32 tenant_id; + union { + struct { + struct in_addr dst_ip; + struct in_addr src_ip; + } v4; + struct { + struct in6_addr dst_ip6; + struct in6_addr src_ip6; + } v6; + } ip; +#define dst_ipv6 ip.v6.dst_ip6.s6_addr32 +#define src_ipv6 ip.v6.src_ip6.s6_addr32 +#define dst_ipv4 ip.v4.dst_ip.s_addr +#define src_ipv4 ip.v4.src_ip.s_addr + u16 n_proto; /* Ethernet Protocol */ + u8 ip_proto; /* IPPROTO value */ + u8 flags; +#define I40E_CLOUD_TNL_TYPE_NONE 0xff + u8 tunnel_type; +}; + +#define I40E_ETH_P_LLDP 0x88cc + +#define I40E_DCB_PRIO_TYPE_STRICT 0 +#define I40E_DCB_PRIO_TYPE_ETS 1 +#define I40E_DCB_STRICT_PRIO_CREDITS 127 +/* DCB per TC information data structure */ +struct i40e_tc_info { + u16 qoffset; /* Queue offset from base queue */ + u16 qcount; /* Total Queues */ + u8 netdev_tc; /* Netdev TC index if netdev associated */ +}; + +/* TC configuration data structure */ +struct i40e_tc_configuration { + u8 numtc; /* Total number of enabled TCs */ + u8 enabled_tc; /* TC map */ + struct i40e_tc_info tc_info[I40E_MAX_TRAFFIC_CLASS]; +}; + +struct i40e_udp_port_config { + /* AdminQ command interface expects port number in Host byte order */ + u16 port; + u8 type; +}; + +/* macros related to FLX_PIT */ +#define I40E_FLEX_SET_FSIZE(fsize) (((fsize) << \ + I40E_PRTQF_FLX_PIT_FSIZE_SHIFT) & \ + I40E_PRTQF_FLX_PIT_FSIZE_MASK) +#define I40E_FLEX_SET_DST_WORD(dst) (((dst) << \ + I40E_PRTQF_FLX_PIT_DEST_OFF_SHIFT) & \ + I40E_PRTQF_FLX_PIT_DEST_OFF_MASK) +#define I40E_FLEX_SET_SRC_WORD(src) (((src) << \ + I40E_PRTQF_FLX_PIT_SOURCE_OFF_SHIFT) & \ + I40E_PRTQF_FLX_PIT_SOURCE_OFF_MASK) +#define I40E_FLEX_PREP_VAL(dst, fsize, src) (I40E_FLEX_SET_DST_WORD(dst) | \ + I40E_FLEX_SET_FSIZE(fsize) | \ + I40E_FLEX_SET_SRC_WORD(src)) + +#define I40E_FLEX_PIT_GET_SRC(flex) (((flex) & \ + I40E_PRTQF_FLX_PIT_SOURCE_OFF_MASK) >> \ + 
I40E_PRTQF_FLX_PIT_SOURCE_OFF_SHIFT) +#define I40E_FLEX_PIT_GET_DST(flex) (((flex) & \ + I40E_PRTQF_FLX_PIT_DEST_OFF_MASK) >> \ + I40E_PRTQF_FLX_PIT_DEST_OFF_SHIFT) +#define I40E_FLEX_PIT_GET_FSIZE(flex) (((flex) & \ + I40E_PRTQF_FLX_PIT_FSIZE_MASK) >> \ + I40E_PRTQF_FLX_PIT_FSIZE_SHIFT) + +#define I40E_MAX_FLEX_SRC_OFFSET 0x1F + +/* macros related to GLQF_ORT */ +#define I40E_ORT_SET_IDX(idx) (((idx) << \ + I40E_GLQF_ORT_PIT_INDX_SHIFT) & \ + I40E_GLQF_ORT_PIT_INDX_MASK) + +#define I40E_ORT_SET_COUNT(count) (((count) << \ + I40E_GLQF_ORT_FIELD_CNT_SHIFT) & \ + I40E_GLQF_ORT_FIELD_CNT_MASK) + +#define I40E_ORT_SET_PAYLOAD(payload) (((payload) << \ + I40E_GLQF_ORT_FLX_PAYLOAD_SHIFT) & \ + I40E_GLQF_ORT_FLX_PAYLOAD_MASK) + +#define I40E_ORT_PREP_VAL(idx, count, payload) (I40E_ORT_SET_IDX(idx) | \ + I40E_ORT_SET_COUNT(count) | \ + I40E_ORT_SET_PAYLOAD(payload)) + +#define I40E_L3_GLQF_ORT_IDX 34 +#define I40E_L4_GLQF_ORT_IDX 35 + +/* Flex PIT register index */ +#define I40E_FLEX_PIT_IDX_START_L2 0 +#define I40E_FLEX_PIT_IDX_START_L3 3 +#define I40E_FLEX_PIT_IDX_START_L4 6 + +#define I40E_FLEX_PIT_TABLE_SIZE 3 + +#define I40E_FLEX_DEST_UNUSED 63 + +#define I40E_FLEX_INDEX_ENTRIES 8 + +/* Flex MASK to disable all flexible entries */ +#define I40E_FLEX_INPUT_MASK (I40E_FLEX_50_MASK | I40E_FLEX_51_MASK | \ + I40E_FLEX_52_MASK | I40E_FLEX_53_MASK | \ + I40E_FLEX_54_MASK | I40E_FLEX_55_MASK | \ + I40E_FLEX_56_MASK | I40E_FLEX_57_MASK) + +struct i40e_flex_pit { + struct list_head list; + u16 src_offset; + u8 pit_index; +}; + +struct i40e_channel { + struct list_head list; + bool initialized; + u8 type; + u16 vsi_number; /* Assigned VSI number from AQ 'Add VSI' response */ + u16 stat_counter_idx; + u16 base_queue; + u16 num_queue_pairs; /* Requested by user */ + u16 seid; + + u8 enabled_tc; + struct i40e_aqc_vsi_properties_data info; + + u64 max_tx_rate; + + /* track this channel belongs to which VSI */ + struct i40e_vsi *parent_vsi; +}; + +/* struct that defines the Ethernet device */ +struct i40e_pf { + struct pci_dev *pdev; + struct i40e_hw hw; + DECLARE_BITMAP(state, __I40E_STATE_SIZE__); + struct msix_entry *msix_entries; + bool fc_autoneg_status; + + u16 eeprom_version; + u16 num_vmdq_vsis; /* num vmdq vsis this PF has set up */ + u16 num_vmdq_qps; /* num queue pairs per vmdq pool */ + u16 num_vmdq_msix; /* num queue vectors per vmdq pool */ + u16 num_req_vfs; /* num VFs requested for this PF */ + u16 num_vf_qps; /* num queue pairs per VF */ + u16 num_lan_qps; /* num lan queues this PF has set up */ + u16 num_lan_msix; /* num queue vectors for the base PF vsi */ + u16 num_fdsb_msix; /* num queue vectors for sideband Fdir */ + u16 num_iwarp_msix; /* num of iwarp vectors for this PF */ + int iwarp_base_vector; + int queues_left; /* queues left unclaimed */ + u16 alloc_rss_size; /* allocated RSS queues */ + u16 rss_size_max; /* HW defined max RSS queues */ + u16 fdir_pf_filter_count; /* num of guaranteed filters for this PF */ + u16 num_alloc_vsi; /* num VSIs this driver supports */ + u8 atr_sample_rate; + bool wol_en; + + struct hlist_head fdir_filter_list; + u16 fdir_pf_active_filters; + unsigned long fd_flush_timestamp; + u32 fd_flush_cnt; + u32 fd_add_err; + u32 fd_atr_cnt; + + /* Book-keeping of side-band filter count per flow-type. + * This is used to detect and handle input set changes for + * respective flow-type. 
+ */ + u16 fd_tcp4_filter_cnt; + u16 fd_udp4_filter_cnt; + u16 fd_sctp4_filter_cnt; + u16 fd_ip4_filter_cnt; + + /* Flexible filter table values that need to be programmed into + * hardware, which expects L3 and L4 to be programmed separately. We + * need to ensure that the values are in ascended order and don't have + * duplicates, so we track each L3 and L4 values in separate lists. + */ + struct list_head l3_flex_pit_list; + struct list_head l4_flex_pit_list; + + struct i40e_udp_port_config udp_ports[I40E_MAX_PF_UDP_OFFLOAD_PORTS]; + u16 pending_udp_bitmap; + + struct hlist_head cloud_filter_list; + u16 num_cloud_filters; + + enum i40e_interrupt_policy int_policy; + u16 rx_itr_default; + u16 tx_itr_default; + u32 msg_enable; + char int_name[I40E_INT_NAME_STR_LEN]; + u16 adminq_work_limit; /* num of admin receive queue desc to process */ + unsigned long service_timer_period; + unsigned long service_timer_previous; + struct timer_list service_timer; + struct work_struct service_task; + + u32 hw_features; +#define I40E_HW_RSS_AQ_CAPABLE BIT(0) +#define I40E_HW_128_QP_RSS_CAPABLE BIT(1) +#define I40E_HW_ATR_EVICT_CAPABLE BIT(2) +#define I40E_HW_WB_ON_ITR_CAPABLE BIT(3) +#define I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE BIT(4) +#define I40E_HW_NO_PCI_LINK_CHECK BIT(5) +#define I40E_HW_100M_SGMII_CAPABLE BIT(6) +#define I40E_HW_NO_DCB_SUPPORT BIT(7) +#define I40E_HW_USE_SET_LLDP_MIB BIT(8) +#define I40E_HW_GENEVE_OFFLOAD_CAPABLE BIT(9) +#define I40E_HW_PTP_L4_CAPABLE BIT(10) +#define I40E_HW_WOL_MC_MAGIC_PKT_WAKE BIT(11) +#define I40E_HW_MPLS_HDR_OFFLOAD_CAPABLE BIT(12) +#define I40E_HW_HAVE_CRT_RETIMER BIT(13) +#define I40E_HW_OUTER_UDP_CSUM_CAPABLE BIT(14) +#define I40E_HW_PHY_CONTROLS_LEDS BIT(15) +#define I40E_HW_STOP_FW_LLDP BIT(16) +#define I40E_HW_PORT_ID_VALID BIT(17) +#define I40E_HW_RESTART_AUTONEG BIT(18) +#define I40E_HW_STOPPABLE_FW_LLDP BIT(19) + + u32 flags; +#define I40E_FLAG_RX_CSUM_ENABLED BIT(0) +#define I40E_FLAG_MSI_ENABLED BIT(1) +#define I40E_FLAG_MSIX_ENABLED BIT(2) +#define I40E_FLAG_RSS_ENABLED BIT(3) +#define I40E_FLAG_VMDQ_ENABLED BIT(4) +#define I40E_FLAG_SRIOV_ENABLED BIT(5) +#define I40E_FLAG_DCB_CAPABLE BIT(6) +#define I40E_FLAG_DCB_ENABLED BIT(7) +#define I40E_FLAG_FD_SB_ENABLED BIT(8) +#define I40E_FLAG_FD_ATR_ENABLED BIT(9) +#define I40E_FLAG_MFP_ENABLED BIT(10) +#define I40E_FLAG_HW_ATR_EVICT_ENABLED BIT(11) +#define I40E_FLAG_VEB_MODE_ENABLED BIT(12) +#define I40E_FLAG_VEB_STATS_ENABLED BIT(13) +#define I40E_FLAG_LINK_POLLING_ENABLED BIT(14) +#define I40E_FLAG_TRUE_PROMISC_SUPPORT BIT(15) +#define I40E_FLAG_LEGACY_RX BIT(16) +#define I40E_FLAG_PTP BIT(17) +#define I40E_FLAG_IWARP_ENABLED BIT(18) +#define I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED BIT(19) +#define I40E_FLAG_SOURCE_PRUNING_DISABLED BIT(20) +#define I40E_FLAG_TC_MQPRIO BIT(21) +#define I40E_FLAG_FD_SB_INACTIVE BIT(22) +#define I40E_FLAG_FD_SB_TO_CLOUD_FILTER BIT(23) +#define I40E_FLAG_DISABLE_FW_LLDP BIT(24) + + struct i40e_client_instance *cinst; + bool stat_offsets_loaded; + struct i40e_hw_port_stats stats; + struct i40e_hw_port_stats stats_offsets; + u32 tx_timeout_count; + u32 tx_timeout_recovery_level; + unsigned long tx_timeout_last_recovery; + u32 tx_sluggish_count; + u32 hw_csum_rx_error; + u32 led_status; + u16 corer_count; /* Core reset count */ + u16 globr_count; /* Global reset count */ + u16 empr_count; /* EMP reset count */ + u16 pfr_count; /* PF reset count */ + u16 sw_int_count; /* SW interrupt count */ + + struct mutex switch_mutex; + u16 lan_vsi; /* our default LAN VSI */ + u16 lan_veb; 
/* initial relay, if exists */
+#define I40E_NO_VEB	0xffff
+#define I40E_NO_VSI	0xffff
+	u16 next_vsi;	/* Next unallocated VSI - 0-based! */
+	struct i40e_vsi **vsi;
+	struct i40e_veb *veb[I40E_MAX_VEB];
+
+	struct i40e_lump_tracking *qp_pile;
+	struct i40e_lump_tracking *irq_pile;
+
+	/* switch config info */
+	u16 pf_seid;
+	u16 main_vsi_seid;
+	u16 mac_seid;
+	struct kobject *switch_kobj;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *i40e_dbg_pf;
+#endif /* CONFIG_DEBUG_FS */
+	bool cur_promisc;
+
+	u16 instance; /* A unique number per i40e_pf instance in the system */
+
+	/* sr-iov config info */
+	struct i40e_vf *vf;
+	int num_alloc_vfs;	/* actual number of VFs allocated */
+	u32 vf_aq_requests;
+	u32 arq_overflows;	/* Not fatal, possibly indicative of problems */
+
+	/* DCBx/DCBNL capability for PF that indicates
+	 * whether DCBx is managed by firmware or host
+	 * based agent (LLDPAD). Also, indicates what
+	 * flavor of DCBx protocol (IEEE/CEE) is supported
+	 * by the device. For now we're supporting IEEE
+	 * mode only.
+	 */
+	u16 dcbx_cap;
+
+	struct i40e_filter_control_settings filter_settings;
+
+	struct ptp_clock *ptp_clock;
+	struct ptp_clock_info ptp_caps;
+	struct sk_buff *ptp_tx_skb;
+	unsigned long ptp_tx_start;
+	struct hwtstamp_config tstamp_config;
+	struct mutex tmreg_lock; /* Used to protect the SYSTIME registers. */
+	u64 ptp_base_adj;
+	u32 tx_hwtstamp_timeouts;
+	u32 tx_hwtstamp_skipped;
+	u32 rx_hwtstamp_cleared;
+	u32 latch_event_flags;
+	spinlock_t ptp_rx_lock; /* Used to protect Rx timestamp registers. */
+	unsigned long latch_events[4];
+	bool ptp_tx;
+	bool ptp_rx;
+	u16 rss_table_size; /* HW RSS table size */
+	u32 max_bw;
+	u32 min_bw;
+
+	u32 ioremap_len;
+	u32 fd_inv;
+	u16 phy_led_val;
+
+	u16 override_q_count;
+	u16 last_sw_conf_flags;
+	u16 last_sw_conf_valid_flags;
+};
+
+/**
+ * i40e_addr_to_hkey - Convert a 6-byte MAC Address to a u64 hash key
+ * @macaddr: the MAC Address as the base key
+ *
+ * Simply copies the address and returns it as a u64 for hashing
+ **/
+static inline u64 i40e_addr_to_hkey(const u8 *macaddr)
+{
+	u64 key = 0;
+
+	ether_addr_copy((u8 *)&key, macaddr);
+	return key;
+}
+
+enum i40e_filter_state {
+	I40E_FILTER_INVALID = 0,	/* Invalid state */
+	I40E_FILTER_NEW,		/* New, not sent to FW yet */
+	I40E_FILTER_ACTIVE,		/* Added to switch by FW */
+	I40E_FILTER_FAILED,		/* Rejected by FW */
+	I40E_FILTER_REMOVE,		/* To be removed */
+/* There is no 'removed' state; the filter struct is freed */
+};
+struct i40e_mac_filter {
+	struct hlist_node hlist;
+	u8 macaddr[ETH_ALEN];
+#define I40E_VLAN_ANY -1
+	s16 vlan;
+	enum i40e_filter_state state;
+};
+
+/* Wrapper structure to keep track of filters while we are preparing to send
+ * firmware commands. We cannot send firmware commands while holding a
+ * spinlock, since it might sleep. To avoid this, we wrap the added filters in
+ * a separate structure, which will track the state change and update the real
+ * filter while under lock. We can't simply hold the filters in a separate
+ * list, as this opens a window for a race condition when adding new MAC
+ * addresses to all VLANs, or when adding new VLANs to all MAC addresses.
+ */ +struct i40e_new_mac_filter { + struct hlist_node hlist; + struct i40e_mac_filter *f; + + /* Track future changes to state separately */ + enum i40e_filter_state state; +}; + +struct i40e_veb { + struct i40e_pf *pf; + u16 idx; + u16 veb_idx; /* index of VEB parent */ + u16 seid; + u16 uplink_seid; + u16 stats_idx; /* index of VEB parent */ + u8 enabled_tc; + u16 bridge_mode; /* Bridge Mode (VEB/VEPA) */ + u16 flags; + u16 bw_limit; + u8 bw_max_quanta; + bool is_abs_credits; + u8 bw_tc_share_credits[I40E_MAX_TRAFFIC_CLASS]; + u16 bw_tc_limit_credits[I40E_MAX_TRAFFIC_CLASS]; + u8 bw_tc_max_quanta[I40E_MAX_TRAFFIC_CLASS]; + struct kobject *kobj; + bool stat_offsets_loaded; + struct i40e_eth_stats stats; + struct i40e_eth_stats stats_offsets; + struct i40e_veb_tc_stats tc_stats; + struct i40e_veb_tc_stats tc_stats_offsets; +}; + +/* struct that defines a VSI, associated with a dev */ +struct i40e_vsi { + struct net_device *netdev; + unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; + bool netdev_registered; + bool stat_offsets_loaded; + + u32 current_netdev_flags; + DECLARE_BITMAP(state, __I40E_VSI_STATE_SIZE__); +#define I40E_VSI_FLAG_FILTER_CHANGED BIT(0) +#define I40E_VSI_FLAG_VEB_OWNER BIT(1) + unsigned long flags; + + /* Per VSI lock to protect elements/hash (MAC filter) */ + spinlock_t mac_filter_hash_lock; + /* Fixed size hash table with 2^8 buckets for MAC filters */ + DECLARE_HASHTABLE(mac_filter_hash, 8); + bool has_vlan_filter; + + /* VSI stats */ + struct rtnl_link_stats64 net_stats; + struct rtnl_link_stats64 net_stats_offsets; + struct i40e_eth_stats eth_stats; + struct i40e_eth_stats eth_stats_offsets; + u32 tx_restart; + u32 tx_busy; + u64 tx_linearize; + u64 tx_force_wb; + u32 rx_buf_failed; + u32 rx_page_failed; + + /* These are containers of ring pointers, allocated at run-time */ + struct i40e_ring **rx_rings; + struct i40e_ring **tx_rings; + struct i40e_ring **xdp_rings; /* XDP Tx rings */ + + u32 active_filters; + u32 promisc_threshold; + + u16 work_limit; + u16 int_rate_limit; /* value in usecs */ + + u16 rss_table_size; /* HW RSS table size */ + u16 rss_size; /* Allocated RSS queues */ + u8 *rss_hkey_user; /* User configured hash keys */ + u8 *rss_lut_user; /* User configured lookup table entries */ + + + u16 max_frame; + u16 rx_buf_len; + + struct bpf_prog *xdp_prog; + + /* List of q_vectors allocated to this VSI */ + struct i40e_q_vector **q_vectors; + int num_q_vectors; + int base_vector; + bool irqs_ready; + + u16 seid; /* HW index of this VSI (absolute index) */ + u16 id; /* VSI number */ + u16 uplink_seid; + + u16 base_queue; /* vsi's first queue in hw array */ + u16 alloc_queue_pairs; /* Allocated Tx/Rx queues */ + u16 req_queue_pairs; /* User requested queue pairs */ + u16 num_queue_pairs; /* Used tx and rx pairs */ + u16 num_desc; + enum i40e_vsi_type type; /* VSI type, e.g., LAN, FCoE, etc */ + s16 vf_id; /* Virtual function ID for SRIOV VSIs */ + + struct tc_mqprio_qopt_offload mqprio_qopt; /* queue parameters */ + struct i40e_tc_configuration tc_config; + struct i40e_aqc_vsi_properties_data info; + + /* VSI BW limit (absolute across all TCs) */ + u16 bw_limit; /* VSI BW Limit (0 = disabled) */ + u8 bw_max_quanta; /* Max Quanta when BW limit is enabled */ + + /* Relative TC credits across VSIs */ + u8 bw_ets_share_credits[I40E_MAX_TRAFFIC_CLASS]; + /* TC BW limit credits within VSI */ + u16 bw_ets_limit_credits[I40E_MAX_TRAFFIC_CLASS]; + /* TC BW limit max quanta within VSI */ + u8 bw_ets_max_quanta[I40E_MAX_TRAFFIC_CLASS]; + + struct i40e_pf *back; 
/* Backreference to associated PF */ + u16 idx; /* index in pf->vsi[] */ + u16 veb_idx; /* index of VEB parent */ + struct kobject *kobj; /* sysfs object */ + bool current_isup; /* Sync 'link up' logging */ + enum i40e_aq_link_speed current_speed; /* Sync link speed logging */ + + /* channel specific fields */ + u16 cnt_q_avail; /* num of queues available for channel usage */ + u16 orig_rss_size; + u16 current_rss_size; + bool reconfig_rss; + + u16 next_base_queue; /* next queue to be used for channel setup */ + + struct list_head ch_list; + u16 tc_seid_map[I40E_MAX_TRAFFIC_CLASS]; + + void *priv; /* client driver data reference. */ + + /* VSI specific handlers */ + irqreturn_t (*irq_handler)(int irq, void *data); +} ____cacheline_internodealigned_in_smp; + +struct i40e_netdev_priv { + struct i40e_vsi *vsi; +}; + +/* struct that defines an interrupt vector */ +struct i40e_q_vector { + struct i40e_vsi *vsi; + + u16 v_idx; /* index in the vsi->q_vector array. */ + u16 reg_idx; /* register index of the interrupt */ + + struct napi_struct napi; + + struct i40e_ring_container rx; + struct i40e_ring_container tx; + + u8 itr_countdown; /* when 0 should adjust adaptive ITR */ + u8 num_ringpairs; /* total number of ring pairs in vector */ + + cpumask_t affinity_mask; + struct irq_affinity_notify affinity_notify; + + struct rcu_head rcu; /* to avoid race with update stats on free */ + char name[I40E_INT_NAME_STR_LEN]; + bool arm_wb_state; +} ____cacheline_internodealigned_in_smp; + +/* lan device */ +struct i40e_device { + struct list_head list; + struct i40e_pf *pf; +}; + +/** + * i40e_nvm_version_str - format the NVM version strings + * @hw: ptr to the hardware info + **/ +static inline char *i40e_nvm_version_str(struct i40e_hw *hw) +{ + static char buf[32]; + u32 full_ver; + + full_ver = hw->nvm.oem_ver; + + if (hw->nvm.eetrack == I40E_OEM_EETRACK_ID) { + u8 gen, snap; + u16 release; + + gen = (u8)(full_ver >> I40E_OEM_GEN_SHIFT); + snap = (u8)((full_ver & I40E_OEM_SNAP_MASK) >> + I40E_OEM_SNAP_SHIFT); + release = (u16)(full_ver & I40E_OEM_RELEASE_MASK); + + snprintf(buf, sizeof(buf), "%x.%x.%x", gen, snap, release); + } else { + u8 ver, patch; + u16 build; + + ver = (u8)(full_ver >> I40E_OEM_VER_SHIFT); + build = (u16)((full_ver >> I40E_OEM_VER_BUILD_SHIFT) & + I40E_OEM_VER_BUILD_MASK); + patch = (u8)(full_ver & I40E_OEM_VER_PATCH_MASK); + + snprintf(buf, sizeof(buf), + "%x.%02x 0x%x %d.%d.%d", + (hw->nvm.version & I40E_NVM_VERSION_HI_MASK) >> + I40E_NVM_VERSION_HI_SHIFT, + (hw->nvm.version & I40E_NVM_VERSION_LO_MASK) >> + I40E_NVM_VERSION_LO_SHIFT, + hw->nvm.eetrack, ver, build, patch); + } + + return buf; +} + +/** + * i40e_netdev_to_pf: Retrieve the PF struct for given netdev + * @netdev: the corresponding netdev + * + * Return the PF struct for the given netdev + **/ +static inline struct i40e_pf *i40e_netdev_to_pf(struct net_device *netdev) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + + return vsi->back; +} + +static inline void i40e_vsi_setup_irqhandler(struct i40e_vsi *vsi, + irqreturn_t (*irq_handler)(int, void *)) +{ + vsi->irq_handler = irq_handler; +} + +/** + * i40e_get_fd_cnt_all - get the total FD filter space available + * @pf: pointer to the PF struct + **/ +static inline int i40e_get_fd_cnt_all(struct i40e_pf *pf) +{ + return pf->hw.fdir_shared_filter_count + pf->fdir_pf_filter_count; +} + +/** + * i40e_read_fd_input_set - reads value of flow director input set register + * @pf: pointer to the PF struct + * @addr: register addr + * 
+ * This function reads value of flow director input set register + * specified by 'addr' (which is specific to flow-type) + **/ +static inline u64 i40e_read_fd_input_set(struct i40e_pf *pf, u16 addr) +{ + u64 val; + + val = i40e_read_rx_ctl(&pf->hw, I40E_PRTQF_FD_INSET(addr, 1)); + val <<= 32; + val += i40e_read_rx_ctl(&pf->hw, I40E_PRTQF_FD_INSET(addr, 0)); + + return val; +} + +/** + * i40e_write_fd_input_set - writes value into flow director input set register + * @pf: pointer to the PF struct + * @addr: register addr + * @val: value to be written + * + * This function writes specified value to the register specified by 'addr'. + * This register is input set register based on flow-type. + **/ +static inline void i40e_write_fd_input_set(struct i40e_pf *pf, + u16 addr, u64 val) +{ + i40e_write_rx_ctl(&pf->hw, I40E_PRTQF_FD_INSET(addr, 1), + (u32)(val >> 32)); + i40e_write_rx_ctl(&pf->hw, I40E_PRTQF_FD_INSET(addr, 0), + (u32)(val & 0xFFFFFFFFULL)); +} + +/* needed by i40e_ethtool.c */ +int i40e_up(struct i40e_vsi *vsi); +void i40e_down(struct i40e_vsi *vsi); +extern const char i40e_driver_name[]; +extern const char i40e_driver_version_str[]; +void i40e_do_reset_safe(struct i40e_pf *pf, u32 reset_flags); +void i40e_do_reset(struct i40e_pf *pf, u32 reset_flags, bool lock_acquired); +int i40e_config_rss(struct i40e_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size); +int i40e_get_rss(struct i40e_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size); +void i40e_fill_rss_lut(struct i40e_pf *pf, u8 *lut, + u16 rss_table_size, u16 rss_size); +struct i40e_vsi *i40e_find_vsi_from_id(struct i40e_pf *pf, u16 id); +/** + * i40e_find_vsi_by_type - Find and return Flow Director VSI + * @pf: PF to search for VSI + * @type: Value indicating type of VSI we are looking for + **/ +static inline struct i40e_vsi * +i40e_find_vsi_by_type(struct i40e_pf *pf, u16 type) +{ + int i; + + for (i = 0; i < pf->num_alloc_vsi; i++) { + struct i40e_vsi *vsi = pf->vsi[i]; + + if (vsi && vsi->type == type) + return vsi; + } + + return NULL; +} +void i40e_update_stats(struct i40e_vsi *vsi); +void i40e_update_eth_stats(struct i40e_vsi *vsi); +struct rtnl_link_stats64 *i40e_get_vsi_stats_struct(struct i40e_vsi *vsi); +int i40e_fetch_switch_configuration(struct i40e_pf *pf, + bool printconfig); + +int i40e_add_del_fdir(struct i40e_vsi *vsi, + struct i40e_fdir_filter *input, bool add); +void i40e_fdir_check_and_reenable(struct i40e_pf *pf); +u32 i40e_get_current_fd_count(struct i40e_pf *pf); +u32 i40e_get_cur_guaranteed_fd_count(struct i40e_pf *pf); +u32 i40e_get_current_atr_cnt(struct i40e_pf *pf); +u32 i40e_get_global_fd_count(struct i40e_pf *pf); +bool i40e_set_ntuple(struct i40e_pf *pf, netdev_features_t features); +void i40e_set_ethtool_ops(struct net_device *netdev); +struct i40e_mac_filter *i40e_add_filter(struct i40e_vsi *vsi, + const u8 *macaddr, s16 vlan); +void __i40e_del_filter(struct i40e_vsi *vsi, struct i40e_mac_filter *f); +void i40e_del_filter(struct i40e_vsi *vsi, const u8 *macaddr, s16 vlan); +int i40e_sync_vsi_filters(struct i40e_vsi *vsi); +struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type, + u16 uplink, u32 param1); +int i40e_vsi_release(struct i40e_vsi *vsi); +void i40e_service_event_schedule(struct i40e_pf *pf); +void i40e_notify_client_of_vf_msg(struct i40e_vsi *vsi, u32 vf_id, + u8 *msg, u16 len); + +int i40e_vsi_start_rings(struct i40e_vsi *vsi); +void i40e_vsi_stop_rings(struct i40e_vsi *vsi); +void i40e_vsi_stop_rings_no_wait(struct i40e_vsi *vsi); +int i40e_vsi_wait_queues_disabled(struct i40e_vsi *vsi); 
+int i40e_reconfig_rss_queues(struct i40e_pf *pf, int queue_count); +struct i40e_veb *i40e_veb_setup(struct i40e_pf *pf, u16 flags, u16 uplink_seid, + u16 downlink_seid, u8 enabled_tc); +void i40e_veb_release(struct i40e_veb *veb); + +int i40e_veb_config_tc(struct i40e_veb *veb, u8 enabled_tc); +int i40e_vsi_add_pvid(struct i40e_vsi *vsi, u16 vid); +void i40e_vsi_remove_pvid(struct i40e_vsi *vsi); +void i40e_vsi_reset_stats(struct i40e_vsi *vsi); +void i40e_pf_reset_stats(struct i40e_pf *pf); +#ifdef CONFIG_DEBUG_FS +void i40e_dbg_pf_init(struct i40e_pf *pf); +void i40e_dbg_pf_exit(struct i40e_pf *pf); +void i40e_dbg_init(void); +void i40e_dbg_exit(void); +#else +static inline void i40e_dbg_pf_init(struct i40e_pf *pf) {} +static inline void i40e_dbg_pf_exit(struct i40e_pf *pf) {} +static inline void i40e_dbg_init(void) {} +static inline void i40e_dbg_exit(void) {} +#endif /* CONFIG_DEBUG_FS*/ +/* needed by client drivers */ +int i40e_lan_add_device(struct i40e_pf *pf); +int i40e_lan_del_device(struct i40e_pf *pf); +void i40e_client_subtask(struct i40e_pf *pf); +void i40e_notify_client_of_l2_param_changes(struct i40e_vsi *vsi); +void i40e_notify_client_of_netdev_close(struct i40e_vsi *vsi, bool reset); +void i40e_notify_client_of_vf_enable(struct i40e_pf *pf, u32 num_vfs); +void i40e_notify_client_of_vf_reset(struct i40e_pf *pf, u32 vf_id); +void i40e_client_update_msix_info(struct i40e_pf *pf); +int i40e_vf_client_capable(struct i40e_pf *pf, u32 vf_id); +/** + * i40e_irq_dynamic_enable - Enable default interrupt generation settings + * @vsi: pointer to a vsi + * @vector: enable a particular Hw Interrupt vector, without base_vector + **/ +static inline void i40e_irq_dynamic_enable(struct i40e_vsi *vsi, int vector) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + u32 val; + + val = I40E_PFINT_DYN_CTLN_INTENA_MASK | + I40E_PFINT_DYN_CTLN_CLEARPBA_MASK | + (I40E_ITR_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT); + wr32(hw, I40E_PFINT_DYN_CTLN(vector + vsi->base_vector - 1), val); + /* skip the flush */ +} + +void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf); +void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf); +int i40e_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd); +int i40e_open(struct net_device *netdev); +int i40e_close(struct net_device *netdev); +int i40e_vsi_open(struct i40e_vsi *vsi); +void i40e_vlan_stripping_disable(struct i40e_vsi *vsi); +int i40e_add_vlan_all_mac(struct i40e_vsi *vsi, s16 vid); +int i40e_vsi_add_vlan(struct i40e_vsi *vsi, u16 vid); +void i40e_rm_vlan_all_mac(struct i40e_vsi *vsi, s16 vid); +void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, u16 vid); +struct i40e_mac_filter *i40e_add_mac_filter(struct i40e_vsi *vsi, + const u8 *macaddr); +int i40e_del_mac_filter(struct i40e_vsi *vsi, const u8 *macaddr); +bool i40e_is_vsi_in_vlan(struct i40e_vsi *vsi); +struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, const u8 *macaddr); +void i40e_vlan_stripping_enable(struct i40e_vsi *vsi); +#ifdef CONFIG_I40E_DCB +void i40e_dcbnl_flush_apps(struct i40e_pf *pf, + struct i40e_dcbx_config *old_cfg, + struct i40e_dcbx_config *new_cfg); +void i40e_dcbnl_set_all(struct i40e_vsi *vsi); +void i40e_dcbnl_setup(struct i40e_vsi *vsi); +bool i40e_dcb_need_reconfig(struct i40e_pf *pf, + struct i40e_dcbx_config *old_cfg, + struct i40e_dcbx_config *new_cfg); +#endif /* CONFIG_I40E_DCB */ +void i40e_ptp_rx_hang(struct i40e_pf *pf); +void i40e_ptp_tx_hang(struct i40e_pf *pf); +void i40e_ptp_tx_hwtstamp(struct i40e_pf *pf); +void 
i40e_ptp_rx_hwtstamp(struct i40e_pf *pf, struct sk_buff *skb, u8 index); +void i40e_ptp_set_increment(struct i40e_pf *pf); +int i40e_ptp_set_ts_config(struct i40e_pf *pf, struct ifreq *ifr); +int i40e_ptp_get_ts_config(struct i40e_pf *pf, struct ifreq *ifr); +void i40e_ptp_init(struct i40e_pf *pf); +void i40e_ptp_stop(struct i40e_pf *pf); +int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi); +i40e_status i40e_get_partition_bw_setting(struct i40e_pf *pf); +i40e_status i40e_set_partition_bw_setting(struct i40e_pf *pf); +i40e_status i40e_commit_partition_bw_setting(struct i40e_pf *pf); +void i40e_print_link_message(struct i40e_vsi *vsi, bool isup); + +static inline bool i40e_enabled_xdp_vsi(struct i40e_vsi *vsi) +{ + return !!vsi->xdp_prog; +} + +int i40e_create_queue_channel(struct i40e_vsi *vsi, struct i40e_channel *ch); +int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate); +int i40e_add_del_cloud_filter(struct i40e_vsi *vsi, + struct i40e_cloud_filter *filter, + bool add); +int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, + struct i40e_cloud_filter *filter, + bool add); +#endif /* _I40E_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c new file mode 100644 index 0000000..843fc77 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c @@ -0,0 +1,1059 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include "i40e_status.h" +#include "i40e_type.h" +#include "i40e_register.h" +#include "i40e_adminq.h" +#include "i40e_prototype.h" + +static void i40e_resume_aq(struct i40e_hw *hw); + +/** + * i40e_adminq_init_regs - Initialize AdminQ registers + * @hw: pointer to the hardware structure + * + * This assumes the alloc_asq and alloc_arq functions have already been called + **/ +static void i40e_adminq_init_regs(struct i40e_hw *hw) +{ + /* set head and tail registers in our local struct */ + if (i40e_is_vf(hw)) { + hw->aq.asq.tail = I40E_VF_ATQT1; + hw->aq.asq.head = I40E_VF_ATQH1; + hw->aq.asq.len = I40E_VF_ATQLEN1; + hw->aq.asq.bal = I40E_VF_ATQBAL1; + hw->aq.asq.bah = I40E_VF_ATQBAH1; + hw->aq.arq.tail = I40E_VF_ARQT1; + hw->aq.arq.head = I40E_VF_ARQH1; + hw->aq.arq.len = I40E_VF_ARQLEN1; + hw->aq.arq.bal = I40E_VF_ARQBAL1; + hw->aq.arq.bah = I40E_VF_ARQBAH1; + } else { + hw->aq.asq.tail = I40E_PF_ATQT; + hw->aq.asq.head = I40E_PF_ATQH; + hw->aq.asq.len = I40E_PF_ATQLEN; + hw->aq.asq.bal = I40E_PF_ATQBAL; + hw->aq.asq.bah = I40E_PF_ATQBAH; + hw->aq.arq.tail = I40E_PF_ARQT; + hw->aq.arq.head = I40E_PF_ARQH; + hw->aq.arq.len = I40E_PF_ARQLEN; + hw->aq.arq.bal = I40E_PF_ARQBAL; + hw->aq.arq.bah = I40E_PF_ARQBAH; + } +} + +/** + * i40e_alloc_adminq_asq_ring - Allocate Admin Queue send rings + * @hw: pointer to the hardware structure + **/ +static i40e_status i40e_alloc_adminq_asq_ring(struct i40e_hw *hw) +{ + i40e_status ret_code; + + ret_code = i40e_allocate_dma_mem(hw, &hw->aq.asq.desc_buf, + i40e_mem_atq_ring, + (hw->aq.num_asq_entries * + sizeof(struct i40e_aq_desc)), + I40E_ADMINQ_DESC_ALIGNMENT); + if (ret_code) + return ret_code; + + ret_code = i40e_allocate_virt_mem(hw, &hw->aq.asq.cmd_buf, + (hw->aq.num_asq_entries * + sizeof(struct i40e_asq_cmd_details))); + if (ret_code) { + i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf); + return ret_code; + } + + return ret_code; +} + +/** + * i40e_alloc_adminq_arq_ring - Allocate Admin Queue receive rings + * @hw: pointer to the hardware structure + **/ +static i40e_status i40e_alloc_adminq_arq_ring(struct i40e_hw *hw) +{ + i40e_status ret_code; + + ret_code = i40e_allocate_dma_mem(hw, &hw->aq.arq.desc_buf, + i40e_mem_arq_ring, + (hw->aq.num_arq_entries * + sizeof(struct i40e_aq_desc)), + I40E_ADMINQ_DESC_ALIGNMENT); + + return ret_code; +} + +/** + * i40e_free_adminq_asq - Free Admin Queue send rings + * @hw: pointer to the hardware structure + * + * This assumes the posted send buffers have already been cleaned + * and de-allocated + **/ +static void i40e_free_adminq_asq(struct i40e_hw *hw) +{ + i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf); +} + +/** + * i40e_free_adminq_arq - Free Admin Queue receive rings + * @hw: pointer to the hardware structure + * + * This assumes the posted receive buffers have already been cleaned + * and de-allocated + **/ +static void i40e_free_adminq_arq(struct i40e_hw *hw) +{ + i40e_free_dma_mem(hw, &hw->aq.arq.desc_buf); +} + +/** + * i40e_alloc_arq_bufs - Allocate pre-posted buffers for the receive queue + * @hw: pointer to the hardware structure + **/ +static i40e_status i40e_alloc_arq_bufs(struct i40e_hw *hw) +{ + i40e_status ret_code; + struct i40e_aq_desc *desc; + struct i40e_dma_mem *bi; + int i; + + /* We'll be allocating the buffer info memory first, then we can + * allocate the mapped buffers for the event processing + */ + + /* buffer_info structures do not need alignment */ + 
ret_code = i40e_allocate_virt_mem(hw, &hw->aq.arq.dma_head, + (hw->aq.num_arq_entries * sizeof(struct i40e_dma_mem))); + if (ret_code) + goto alloc_arq_bufs; + hw->aq.arq.r.arq_bi = (struct i40e_dma_mem *)hw->aq.arq.dma_head.va; + + /* allocate the mapped buffers */ + for (i = 0; i < hw->aq.num_arq_entries; i++) { + bi = &hw->aq.arq.r.arq_bi[i]; + ret_code = i40e_allocate_dma_mem(hw, bi, + i40e_mem_arq_buf, + hw->aq.arq_buf_size, + I40E_ADMINQ_DESC_ALIGNMENT); + if (ret_code) + goto unwind_alloc_arq_bufs; + + /* now configure the descriptors for use */ + desc = I40E_ADMINQ_DESC(hw->aq.arq, i); + + desc->flags = cpu_to_le16(I40E_AQ_FLAG_BUF); + if (hw->aq.arq_buf_size > I40E_AQ_LARGE_BUF) + desc->flags |= cpu_to_le16(I40E_AQ_FLAG_LB); + desc->opcode = 0; + /* This is in accordance with Admin queue design, there is no + * register for buffer size configuration + */ + desc->datalen = cpu_to_le16((u16)bi->size); + desc->retval = 0; + desc->cookie_high = 0; + desc->cookie_low = 0; + desc->params.external.addr_high = + cpu_to_le32(upper_32_bits(bi->pa)); + desc->params.external.addr_low = + cpu_to_le32(lower_32_bits(bi->pa)); + desc->params.external.param0 = 0; + desc->params.external.param1 = 0; + } + +alloc_arq_bufs: + return ret_code; + +unwind_alloc_arq_bufs: + /* don't try to free the one that failed... */ + i--; + for (; i >= 0; i--) + i40e_free_dma_mem(hw, &hw->aq.arq.r.arq_bi[i]); + i40e_free_virt_mem(hw, &hw->aq.arq.dma_head); + + return ret_code; +} + +/** + * i40e_alloc_asq_bufs - Allocate empty buffer structs for the send queue + * @hw: pointer to the hardware structure + **/ +static i40e_status i40e_alloc_asq_bufs(struct i40e_hw *hw) +{ + i40e_status ret_code; + struct i40e_dma_mem *bi; + int i; + + /* No mapped memory needed yet, just the buffer info structures */ + ret_code = i40e_allocate_virt_mem(hw, &hw->aq.asq.dma_head, + (hw->aq.num_asq_entries * sizeof(struct i40e_dma_mem))); + if (ret_code) + goto alloc_asq_bufs; + hw->aq.asq.r.asq_bi = (struct i40e_dma_mem *)hw->aq.asq.dma_head.va; + + /* allocate the mapped buffers */ + for (i = 0; i < hw->aq.num_asq_entries; i++) { + bi = &hw->aq.asq.r.asq_bi[i]; + ret_code = i40e_allocate_dma_mem(hw, bi, + i40e_mem_asq_buf, + hw->aq.asq_buf_size, + I40E_ADMINQ_DESC_ALIGNMENT); + if (ret_code) + goto unwind_alloc_asq_bufs; + } +alloc_asq_bufs: + return ret_code; + +unwind_alloc_asq_bufs: + /* don't try to free the one that failed... 
 */
+	i--;
+	for (; i >= 0; i--)
+		i40e_free_dma_mem(hw, &hw->aq.asq.r.asq_bi[i]);
+	i40e_free_virt_mem(hw, &hw->aq.asq.dma_head);
+
+	return ret_code;
+}
+
+/**
+ * i40e_free_arq_bufs - Free receive queue buffer info elements
+ * @hw: pointer to the hardware structure
+ **/
+static void i40e_free_arq_bufs(struct i40e_hw *hw)
+{
+	int i;
+
+	/* free descriptors */
+	for (i = 0; i < hw->aq.num_arq_entries; i++)
+		i40e_free_dma_mem(hw, &hw->aq.arq.r.arq_bi[i]);
+
+	/* free the descriptor memory */
+	i40e_free_dma_mem(hw, &hw->aq.arq.desc_buf);
+
+	/* free the dma header */
+	i40e_free_virt_mem(hw, &hw->aq.arq.dma_head);
+}
+
+/**
+ * i40e_free_asq_bufs - Free send queue buffer info elements
+ * @hw: pointer to the hardware structure
+ **/
+static void i40e_free_asq_bufs(struct i40e_hw *hw)
+{
+	int i;
+
+	/* only unmap if the address is non-NULL */
+	for (i = 0; i < hw->aq.num_asq_entries; i++)
+		if (hw->aq.asq.r.asq_bi[i].pa)
+			i40e_free_dma_mem(hw, &hw->aq.asq.r.asq_bi[i]);
+
+	/* free the buffer info list */
+	i40e_free_virt_mem(hw, &hw->aq.asq.cmd_buf);
+
+	/* free the descriptor memory */
+	i40e_free_dma_mem(hw, &hw->aq.asq.desc_buf);
+
+	/* free the dma header */
+	i40e_free_virt_mem(hw, &hw->aq.asq.dma_head);
+}
+
+/**
+ * i40e_config_asq_regs - configure ASQ registers
+ * @hw: pointer to the hardware structure
+ *
+ * Configure base address and length registers for the transmit queue
+ **/
+static i40e_status i40e_config_asq_regs(struct i40e_hw *hw)
+{
+	i40e_status ret_code = 0;
+	u32 reg = 0;
+
+	/* Clear Head and Tail */
+	wr32(hw, hw->aq.asq.head, 0);
+	wr32(hw, hw->aq.asq.tail, 0);
+
+	/* set starting point */
+	wr32(hw, hw->aq.asq.len, (hw->aq.num_asq_entries |
+				  I40E_PF_ATQLEN_ATQENABLE_MASK));
+	wr32(hw, hw->aq.asq.bal, lower_32_bits(hw->aq.asq.desc_buf.pa));
+	wr32(hw, hw->aq.asq.bah, upper_32_bits(hw->aq.asq.desc_buf.pa));
+
+	/* Check one register to verify that config was applied */
+	reg = rd32(hw, hw->aq.asq.bal);
+	if (reg != lower_32_bits(hw->aq.asq.desc_buf.pa))
+		ret_code = I40E_ERR_ADMIN_QUEUE_ERROR;
+
+	return ret_code;
+}
+
+/**
+ * i40e_config_arq_regs - ARQ register configuration
+ * @hw: pointer to the hardware structure
+ *
+ * Configure base address and length registers for the receive (event) queue
+ **/
+static i40e_status i40e_config_arq_regs(struct i40e_hw *hw)
+{
+	i40e_status ret_code = 0;
+	u32 reg = 0;
+
+	/* Clear Head and Tail */
+	wr32(hw, hw->aq.arq.head, 0);
+	wr32(hw, hw->aq.arq.tail, 0);
+
+	/* set starting point */
+	wr32(hw, hw->aq.arq.len, (hw->aq.num_arq_entries |
+				  I40E_PF_ARQLEN_ARQENABLE_MASK));
+	wr32(hw, hw->aq.arq.bal, lower_32_bits(hw->aq.arq.desc_buf.pa));
+	wr32(hw, hw->aq.arq.bah, upper_32_bits(hw->aq.arq.desc_buf.pa));
+
+	/* Update tail in the HW to post pre-allocated buffers */
+	wr32(hw, hw->aq.arq.tail, hw->aq.num_arq_entries - 1);
+
+	/* Check one register to verify that config was applied */
+	reg = rd32(hw, hw->aq.arq.bal);
+	if (reg != lower_32_bits(hw->aq.arq.desc_buf.pa))
+		ret_code = I40E_ERR_ADMIN_QUEUE_ERROR;
+
+	return ret_code;
+}
+
+/**
+ * i40e_init_asq - main initialization routine for ASQ
+ * @hw: pointer to the hardware structure
+ *
+ * This is the main initialization routine for the Admin Send Queue
+ * Prior to calling this function, drivers *MUST* set the following fields
+ * in the hw->aq structure:
+ *    - hw->aq.num_asq_entries
+ *    - hw->aq.asq_buf_size
+ *
+ * Do *NOT* hold the lock when calling this as the memory allocation routines
+ * called are not going to be atomic context safe
+ **/
+static i40e_status i40e_init_asq(struct i40e_hw *hw)
+{
+	i40e_status ret_code = 0;
+
+	if (hw->aq.asq.count > 0) {
+		/* queue already initialized */
+		ret_code = I40E_ERR_NOT_READY;
+		goto init_adminq_exit;
+	}
+
+	/* verify input for valid configuration */
+	if ((hw->aq.num_asq_entries == 0) ||
+	    (hw->aq.asq_buf_size == 0)) {
+		ret_code = I40E_ERR_CONFIG;
+		goto init_adminq_exit;
+	}
+
+	hw->aq.asq.next_to_use = 0;
+	hw->aq.asq.next_to_clean = 0;
+
+	/* allocate the ring memory */
+	ret_code = i40e_alloc_adminq_asq_ring(hw);
+	if (ret_code)
+		goto init_adminq_exit;
+
+	/* allocate buffers in the rings */
+	ret_code = i40e_alloc_asq_bufs(hw);
+	if (ret_code)
+		goto init_adminq_free_rings;
+
+	/* initialize base registers */
+	ret_code = i40e_config_asq_regs(hw);
+	if (ret_code)
+		goto init_adminq_free_rings;
+
+	/* success! */
+	hw->aq.asq.count = hw->aq.num_asq_entries;
+	goto init_adminq_exit;
+
+init_adminq_free_rings:
+	i40e_free_adminq_asq(hw);
+
+init_adminq_exit:
+	return ret_code;
+}
+
+/**
+ * i40e_init_arq - initialize ARQ
+ * @hw: pointer to the hardware structure
+ *
+ * The main initialization routine for the Admin Receive (Event) Queue.
+ * Prior to calling this function, drivers *MUST* set the following fields
+ * in the hw->aq structure:
+ *    - hw->aq.num_arq_entries
+ *    - hw->aq.arq_buf_size
+ *
+ * Do *NOT* hold the lock when calling this as the memory allocation routines
+ * called are not going to be atomic context safe
+ **/
+static i40e_status i40e_init_arq(struct i40e_hw *hw)
+{
+	i40e_status ret_code = 0;
+
+	if (hw->aq.arq.count > 0) {
+		/* queue already initialized */
+		ret_code = I40E_ERR_NOT_READY;
+		goto init_adminq_exit;
+	}
+
+	/* verify input for valid configuration */
+	if ((hw->aq.num_arq_entries == 0) ||
+	    (hw->aq.arq_buf_size == 0)) {
+		ret_code = I40E_ERR_CONFIG;
+		goto init_adminq_exit;
+	}
+
+	hw->aq.arq.next_to_use = 0;
+	hw->aq.arq.next_to_clean = 0;
+
+	/* allocate the ring memory */
+	ret_code = i40e_alloc_adminq_arq_ring(hw);
+	if (ret_code)
+		goto init_adminq_exit;
+
+	/* allocate buffers in the rings */
+	ret_code = i40e_alloc_arq_bufs(hw);
+	if (ret_code)
+		goto init_adminq_free_rings;
+
+	/* initialize base registers */
+	ret_code = i40e_config_arq_regs(hw);
+	if (ret_code)
+		goto init_adminq_free_rings;
+
+	/* success!
*/ + hw->aq.arq.count = hw->aq.num_arq_entries; + goto init_adminq_exit; + +init_adminq_free_rings: + i40e_free_adminq_arq(hw); + +init_adminq_exit: + return ret_code; +} + +/** + * i40e_shutdown_asq - shutdown the ASQ + * @hw: pointer to the hardware structure + * + * The main shutdown routine for the Admin Send Queue + **/ +static i40e_status i40e_shutdown_asq(struct i40e_hw *hw) +{ + i40e_status ret_code = 0; + + mutex_lock(&hw->aq.asq_mutex); + + if (hw->aq.asq.count == 0) { + ret_code = I40E_ERR_NOT_READY; + goto shutdown_asq_out; + } + + /* Stop firmware AdminQ processing */ + wr32(hw, hw->aq.asq.head, 0); + wr32(hw, hw->aq.asq.tail, 0); + wr32(hw, hw->aq.asq.len, 0); + wr32(hw, hw->aq.asq.bal, 0); + wr32(hw, hw->aq.asq.bah, 0); + + hw->aq.asq.count = 0; /* to indicate uninitialized queue */ + + /* free ring buffers */ + i40e_free_asq_bufs(hw); + +shutdown_asq_out: + mutex_unlock(&hw->aq.asq_mutex); + return ret_code; +} + +/** + * i40e_shutdown_arq - shutdown ARQ + * @hw: pointer to the hardware structure + * + * The main shutdown routine for the Admin Receive Queue + **/ +static i40e_status i40e_shutdown_arq(struct i40e_hw *hw) +{ + i40e_status ret_code = 0; + + mutex_lock(&hw->aq.arq_mutex); + + if (hw->aq.arq.count == 0) { + ret_code = I40E_ERR_NOT_READY; + goto shutdown_arq_out; + } + + /* Stop firmware AdminQ processing */ + wr32(hw, hw->aq.arq.head, 0); + wr32(hw, hw->aq.arq.tail, 0); + wr32(hw, hw->aq.arq.len, 0); + wr32(hw, hw->aq.arq.bal, 0); + wr32(hw, hw->aq.arq.bah, 0); + + hw->aq.arq.count = 0; /* to indicate uninitialized queue */ + + /* free ring buffers */ + i40e_free_arq_bufs(hw); + +shutdown_arq_out: + mutex_unlock(&hw->aq.arq_mutex); + return ret_code; +} + +/** + * i40e_init_adminq - main initialization routine for Admin Queue + * @hw: pointer to the hardware structure + * + * Prior to calling this function, drivers *MUST* set the following fields + * in the hw->aq structure: + * - hw->aq.num_asq_entries + * - hw->aq.num_arq_entries + * - hw->aq.arq_buf_size + * - hw->aq.asq_buf_size + **/ +i40e_status i40e_init_adminq(struct i40e_hw *hw) +{ + u16 cfg_ptr, oem_hi, oem_lo; + u16 eetrack_lo, eetrack_hi; + i40e_status ret_code; + int retry = 0; + + /* verify input for valid configuration */ + if ((hw->aq.num_arq_entries == 0) || + (hw->aq.num_asq_entries == 0) || + (hw->aq.arq_buf_size == 0) || + (hw->aq.asq_buf_size == 0)) { + ret_code = I40E_ERR_CONFIG; + goto init_adminq_exit; + } + + /* Set up register offsets */ + i40e_adminq_init_regs(hw); + + /* setup ASQ command write back timeout */ + hw->aq.asq_cmd_timeout = I40E_ASQ_CMD_TIMEOUT; + + /* allocate the ASQ */ + ret_code = i40e_init_asq(hw); + if (ret_code) + goto init_adminq_destroy_locks; + + /* allocate the ARQ */ + ret_code = i40e_init_arq(hw); + if (ret_code) + goto init_adminq_free_asq; + + /* There are some cases where the firmware may not be quite ready + * for AdminQ operations, so we retry the AdminQ setup a few times + * if we see timeouts in this first AQ call. 
+ */ + do { + ret_code = i40e_aq_get_firmware_version(hw, + &hw->aq.fw_maj_ver, + &hw->aq.fw_min_ver, + &hw->aq.fw_build, + &hw->aq.api_maj_ver, + &hw->aq.api_min_ver, + NULL); + if (ret_code != I40E_ERR_ADMIN_QUEUE_TIMEOUT) + break; + retry++; + msleep(100); + i40e_resume_aq(hw); + } while (retry < 10); + if (ret_code != I40E_SUCCESS) + goto init_adminq_free_arq; + + /* get the NVM version info */ + i40e_read_nvm_word(hw, I40E_SR_NVM_DEV_STARTER_VERSION, + &hw->nvm.version); + i40e_read_nvm_word(hw, I40E_SR_NVM_EETRACK_LO, &eetrack_lo); + i40e_read_nvm_word(hw, I40E_SR_NVM_EETRACK_HI, &eetrack_hi); + hw->nvm.eetrack = (eetrack_hi << 16) | eetrack_lo; + i40e_read_nvm_word(hw, I40E_SR_BOOT_CONFIG_PTR, &cfg_ptr); + i40e_read_nvm_word(hw, (cfg_ptr + I40E_NVM_OEM_VER_OFF), + &oem_hi); + i40e_read_nvm_word(hw, (cfg_ptr + (I40E_NVM_OEM_VER_OFF + 1)), + &oem_lo); + hw->nvm.oem_ver = ((u32)oem_hi << 16) | oem_lo; + + if (hw->mac.type == I40E_MAC_XL710 && + hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR && + hw->aq.api_min_ver >= I40E_MINOR_VER_GET_LINK_INFO_XL710) { + hw->flags |= I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE; + } + + /* Newer versions of firmware require lock when reading the NVM */ + if (hw->aq.api_maj_ver > 1 || + (hw->aq.api_maj_ver == 1 && + hw->aq.api_min_ver >= 5)) + hw->flags |= I40E_HW_FLAG_NVM_READ_REQUIRES_LOCK; + + /* The ability to RX (not drop) 802.1ad frames was added in API 1.7 */ + if (hw->aq.api_maj_ver > 1 || + (hw->aq.api_maj_ver == 1 && + hw->aq.api_min_ver >= 7)) + hw->flags |= I40E_HW_FLAG_802_1AD_CAPABLE; + + if (hw->aq.api_maj_ver > I40E_FW_API_VERSION_MAJOR) { + ret_code = I40E_ERR_FIRMWARE_API_VERSION; + goto init_adminq_free_arq; + } + + /* pre-emptive resource lock release */ + i40e_aq_release_resource(hw, I40E_NVM_RESOURCE_ID, 0, NULL); + hw->nvm_release_on_done = false; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + + ret_code = 0; + + /* success! 
*/ + goto init_adminq_exit; + +init_adminq_free_arq: + i40e_shutdown_arq(hw); +init_adminq_free_asq: + i40e_shutdown_asq(hw); +init_adminq_destroy_locks: + +init_adminq_exit: + return ret_code; +} + +/** + * i40e_shutdown_adminq - shutdown routine for the Admin Queue + * @hw: pointer to the hardware structure + **/ +i40e_status i40e_shutdown_adminq(struct i40e_hw *hw) +{ + i40e_status ret_code = 0; + + if (i40e_check_asq_alive(hw)) + i40e_aq_queue_shutdown(hw, true); + + i40e_shutdown_asq(hw); + i40e_shutdown_arq(hw); + + if (hw->nvm_buff.va) + i40e_free_virt_mem(hw, &hw->nvm_buff); + + return ret_code; +} + +/** + * i40e_clean_asq - cleans Admin send queue + * @hw: pointer to the hardware structure + * + * returns the number of free desc + **/ +static u16 i40e_clean_asq(struct i40e_hw *hw) +{ + struct i40e_adminq_ring *asq = &(hw->aq.asq); + struct i40e_asq_cmd_details *details; + u16 ntc = asq->next_to_clean; + struct i40e_aq_desc desc_cb; + struct i40e_aq_desc *desc; + + desc = I40E_ADMINQ_DESC(*asq, ntc); + details = I40E_ADMINQ_DETAILS(*asq, ntc); + while (rd32(hw, hw->aq.asq.head) != ntc) { + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "ntc %d head %d.\n", ntc, rd32(hw, hw->aq.asq.head)); + + if (details->callback) { + I40E_ADMINQ_CALLBACK cb_func = + (I40E_ADMINQ_CALLBACK)details->callback; + desc_cb = *desc; + cb_func(hw, &desc_cb); + } + memset(desc, 0, sizeof(*desc)); + memset(details, 0, sizeof(*details)); + ntc++; + if (ntc == asq->count) + ntc = 0; + desc = I40E_ADMINQ_DESC(*asq, ntc); + details = I40E_ADMINQ_DETAILS(*asq, ntc); + } + + asq->next_to_clean = ntc; + + return I40E_DESC_UNUSED(asq); +} + +/** + * i40e_asq_done - check if FW has processed the Admin Send Queue + * @hw: pointer to the hw struct + * + * Returns true if the firmware has processed all descriptors on the + * admin send queue. Returns false if there are still requests pending. + **/ +static bool i40e_asq_done(struct i40e_hw *hw) +{ + /* AQ designers suggest use of head for better + * timing reliability than DD bit + */ + return rd32(hw, hw->aq.asq.head) == hw->aq.asq.next_to_use; + +} + +/** + * i40e_asq_send_command - send command to Admin Queue + * @hw: pointer to the hw struct + * @desc: prefilled descriptor describing the command (non DMA mem) + * @buff: buffer to use for indirect commands + * @buff_size: size of buffer for indirect commands + * @cmd_details: pointer to command details structure + * + * This is the main send command driver routine for the Admin Queue send + * queue. 
It runs the queue, cleans the queue, etc + **/ +i40e_status i40e_asq_send_command(struct i40e_hw *hw, + struct i40e_aq_desc *desc, + void *buff, /* can be NULL */ + u16 buff_size, + struct i40e_asq_cmd_details *cmd_details) +{ + i40e_status status = 0; + struct i40e_dma_mem *dma_buff = NULL; + struct i40e_asq_cmd_details *details; + struct i40e_aq_desc *desc_on_ring; + bool cmd_completed = false; + u16 retval = 0; + u32 val = 0; + + mutex_lock(&hw->aq.asq_mutex); + + if (hw->aq.asq.count == 0) { + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "AQTX: Admin queue not initialized.\n"); + status = I40E_ERR_QUEUE_EMPTY; + goto asq_send_command_error; + } + + hw->aq.asq_last_status = I40E_AQ_RC_OK; + + val = rd32(hw, hw->aq.asq.head); + if (val >= hw->aq.num_asq_entries) { + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "AQTX: head overrun at %d\n", val); + status = I40E_ERR_QUEUE_EMPTY; + goto asq_send_command_error; + } + + details = I40E_ADMINQ_DETAILS(hw->aq.asq, hw->aq.asq.next_to_use); + if (cmd_details) { + *details = *cmd_details; + + /* If the cmd_details are defined copy the cookie. The + * cpu_to_le32 is not needed here because the data is ignored + * by the FW, only used by the driver + */ + if (details->cookie) { + desc->cookie_high = + cpu_to_le32(upper_32_bits(details->cookie)); + desc->cookie_low = + cpu_to_le32(lower_32_bits(details->cookie)); + } + } else { + memset(details, 0, sizeof(struct i40e_asq_cmd_details)); + } + + /* clear requested flags and then set additional flags if defined */ + desc->flags &= ~cpu_to_le16(details->flags_dis); + desc->flags |= cpu_to_le16(details->flags_ena); + + if (buff_size > hw->aq.asq_buf_size) { + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQTX: Invalid buffer size: %d.\n", + buff_size); + status = I40E_ERR_INVALID_SIZE; + goto asq_send_command_error; + } + + if (details->postpone && !details->async) { + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQTX: Async flag not set along with postpone flag"); + status = I40E_ERR_PARAM; + goto asq_send_command_error; + } + + /* call clean and check queue available function to reclaim the + * descriptors that were processed by FW, the function returns the + * number of desc available + */ + /* the clean function called here could be called in a separate thread + * in case of asynchronous completions + */ + if (i40e_clean_asq(hw) == 0) { + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQTX: Error queue is full.\n"); + status = I40E_ERR_ADMIN_QUEUE_FULL; + goto asq_send_command_error; + } + + /* initialize the temp desc pointer with the right desc */ + desc_on_ring = I40E_ADMINQ_DESC(hw->aq.asq, hw->aq.asq.next_to_use); + + /* if the desc is available copy the temp desc to the right place */ + *desc_on_ring = *desc; + + /* if buff is not NULL assume indirect command */ + if (buff != NULL) { + dma_buff = &(hw->aq.asq.r.asq_bi[hw->aq.asq.next_to_use]); + /* copy the user buff into the respective DMA buff */ + memcpy(dma_buff->va, buff, buff_size); + desc_on_ring->datalen = cpu_to_le16(buff_size); + + /* Update the address values in the desc with the pa value + * for respective buffer + */ + desc_on_ring->params.external.addr_high = + cpu_to_le32(upper_32_bits(dma_buff->pa)); + desc_on_ring->params.external.addr_low = + cpu_to_le32(lower_32_bits(dma_buff->pa)); + } + + /* bump the tail */ + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, "AQTX: desc and buffer:\n"); + i40e_debug_aq(hw, I40E_DEBUG_AQ_COMMAND, (void *)desc_on_ring, + buff, buff_size); + (hw->aq.asq.next_to_use)++; + if (hw->aq.asq.next_to_use == hw->aq.asq.count) + 
hw->aq.asq.next_to_use = 0; + if (!details->postpone) + wr32(hw, hw->aq.asq.tail, hw->aq.asq.next_to_use); + + /* if cmd_details are not defined or async flag is not set, + * we need to wait for desc write back + */ + if (!details->async && !details->postpone) { + u32 total_delay = 0; + + do { + /* AQ designers suggest use of head for better + * timing reliability than DD bit + */ + if (i40e_asq_done(hw)) + break; + udelay(50); + total_delay += 50; + } while (total_delay < hw->aq.asq_cmd_timeout); + } + + /* if ready, copy the desc back to temp */ + if (i40e_asq_done(hw)) { + *desc = *desc_on_ring; + if (buff != NULL) + memcpy(buff, dma_buff->va, buff_size); + retval = le16_to_cpu(desc->retval); + if (retval != 0) { + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQTX: Command completed with error 0x%X.\n", + retval); + + /* strip off FW internal code */ + retval &= 0xff; + } + cmd_completed = true; + if ((enum i40e_admin_queue_err)retval == I40E_AQ_RC_OK) + status = 0; + else + status = I40E_ERR_ADMIN_QUEUE_ERROR; + hw->aq.asq_last_status = (enum i40e_admin_queue_err)retval; + } + + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "AQTX: desc and buffer writeback:\n"); + i40e_debug_aq(hw, I40E_DEBUG_AQ_COMMAND, (void *)desc, buff, buff_size); + + /* save writeback aq if requested */ + if (details->wb_desc) + *details->wb_desc = *desc_on_ring; + + /* update the error if time out occurred */ + if ((!cmd_completed) && + (!details->async && !details->postpone)) { + if (rd32(hw, hw->aq.asq.len) & I40E_GL_ATQLEN_ATQCRIT_MASK) { + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "AQTX: AQ Critical error.\n"); + status = I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR; + } else { + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "AQTX: Writeback timeout.\n"); + status = I40E_ERR_ADMIN_QUEUE_TIMEOUT; + } + } + +asq_send_command_error: + mutex_unlock(&hw->aq.asq_mutex); + return status; +} + +/** + * i40e_fill_default_direct_cmd_desc - AQ descriptor helper function + * @desc: pointer to the temp descriptor (non DMA mem) + * @opcode: the opcode can be used to decide which flags to turn off or on + * + * Fill the desc with default values + **/ +void i40e_fill_default_direct_cmd_desc(struct i40e_aq_desc *desc, + u16 opcode) +{ + /* zero out the desc */ + memset((void *)desc, 0, sizeof(struct i40e_aq_desc)); + desc->opcode = cpu_to_le16(opcode); + desc->flags = cpu_to_le16(I40E_AQ_FLAG_SI); +} + +/** + * i40e_clean_arq_element + * @hw: pointer to the hw struct + * @e: event info from the receive descriptor, includes any buffers + * @pending: number of events that could be left to process + * + * This function cleans one Admin Receive Queue element and returns + * the contents through e. 
It can also return how many events are + * left to process through 'pending' + **/ +i40e_status i40e_clean_arq_element(struct i40e_hw *hw, + struct i40e_arq_event_info *e, + u16 *pending) +{ + i40e_status ret_code = 0; + u16 ntc = hw->aq.arq.next_to_clean; + struct i40e_aq_desc *desc; + struct i40e_dma_mem *bi; + u16 desc_idx; + u16 datalen; + u16 flags; + u16 ntu; + + /* pre-clean the event info */ + memset(&e->desc, 0, sizeof(e->desc)); + + /* take the lock before we start messing with the ring */ + mutex_lock(&hw->aq.arq_mutex); + + if (hw->aq.arq.count == 0) { + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, + "AQRX: Admin queue not initialized.\n"); + ret_code = I40E_ERR_QUEUE_EMPTY; + goto clean_arq_element_err; + } + + /* set next_to_use to head */ + ntu = rd32(hw, hw->aq.arq.head) & I40E_PF_ARQH_ARQH_MASK; + if (ntu == ntc) { + /* nothing to do - shouldn't need to update ring's values */ + ret_code = I40E_ERR_ADMIN_QUEUE_NO_WORK; + goto clean_arq_element_out; + } + + /* now clean the next descriptor */ + desc = I40E_ADMINQ_DESC(hw->aq.arq, ntc); + desc_idx = ntc; + + hw->aq.arq_last_status = + (enum i40e_admin_queue_err)le16_to_cpu(desc->retval); + flags = le16_to_cpu(desc->flags); + if (flags & I40E_AQ_FLAG_ERR) { + ret_code = I40E_ERR_ADMIN_QUEUE_ERROR; + i40e_debug(hw, + I40E_DEBUG_AQ_MESSAGE, + "AQRX: Event received with error 0x%X.\n", + hw->aq.arq_last_status); + } + + e->desc = *desc; + datalen = le16_to_cpu(desc->datalen); + e->msg_len = min(datalen, e->buf_len); + if (e->msg_buf != NULL && (e->msg_len != 0)) + memcpy(e->msg_buf, hw->aq.arq.r.arq_bi[desc_idx].va, + e->msg_len); + + i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE, "AQRX: desc and buffer:\n"); + i40e_debug_aq(hw, I40E_DEBUG_AQ_COMMAND, (void *)desc, e->msg_buf, + hw->aq.arq_buf_size); + + /* Restore the original datalen and buffer address in the desc, + * FW updates datalen to indicate the event message + * size + */ + bi = &hw->aq.arq.r.arq_bi[ntc]; + memset((void *)desc, 0, sizeof(struct i40e_aq_desc)); + + desc->flags = cpu_to_le16(I40E_AQ_FLAG_BUF); + if (hw->aq.arq_buf_size > I40E_AQ_LARGE_BUF) + desc->flags |= cpu_to_le16(I40E_AQ_FLAG_LB); + desc->datalen = cpu_to_le16((u16)bi->size); + desc->params.external.addr_high = cpu_to_le32(upper_32_bits(bi->pa)); + desc->params.external.addr_low = cpu_to_le32(lower_32_bits(bi->pa)); + + /* set tail = the last cleaned desc index. */ + wr32(hw, hw->aq.arq.tail, ntc); + /* ntc is updated to tail + 1 */ + ntc++; + if (ntc == hw->aq.num_arq_entries) + ntc = 0; + hw->aq.arq.next_to_clean = ntc; + hw->aq.arq.next_to_use = ntu; + + i40e_nvmupd_check_wait_event(hw, le16_to_cpu(e->desc.opcode), &e->desc); +clean_arq_element_out: + /* Set pending if needed, unlock and return */ + if (pending) + *pending = (ntc > ntu ? 
hw->aq.arq.count : 0) + (ntu - ntc); +clean_arq_element_err: + mutex_unlock(&hw->aq.arq_mutex); + + return ret_code; +} + +static void i40e_resume_aq(struct i40e_hw *hw) +{ + /* Registers are reset after PF reset */ + hw->aq.asq.next_to_use = 0; + hw->aq.asq.next_to_clean = 0; + + i40e_config_asq_regs(hw); + + hw->aq.arq.next_to_use = 0; + hw->aq.arq.next_to_clean = 0; + + i40e_config_arq_regs(hw); +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.h b/drivers/net/ethernet/intel/i40e/i40e_adminq.h new file mode 100644 index 0000000..0a8749e --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_ADMINQ_H_ +#define _I40E_ADMINQ_H_ + +#include "i40e_osdep.h" +#include "i40e_status.h" +#include "i40e_adminq_cmd.h" + +#define I40E_ADMINQ_DESC(R, i) \ + (&(((struct i40e_aq_desc *)((R).desc_buf.va))[i])) + +#define I40E_ADMINQ_DESC_ALIGNMENT 4096 + +struct i40e_adminq_ring { + struct i40e_virt_mem dma_head; /* space for dma structures */ + struct i40e_dma_mem desc_buf; /* descriptor ring memory */ + struct i40e_virt_mem cmd_buf; /* command buffer memory */ + + union { + struct i40e_dma_mem *asq_bi; + struct i40e_dma_mem *arq_bi; + } r; + + u16 count; /* Number of descriptors */ + u16 rx_buf_len; /* Admin Receive Queue buffer length */ + + /* used for interrupt processing */ + u16 next_to_use; + u16 next_to_clean; + + /* used for queue tracking */ + u32 head; + u32 tail; + u32 len; + u32 bah; + u32 bal; +}; + +/* ASQ transaction details */ +struct i40e_asq_cmd_details { + void *callback; /* cast from type I40E_ADMINQ_CALLBACK */ + u64 cookie; + u16 flags_ena; + u16 flags_dis; + bool async; + bool postpone; + struct i40e_aq_desc *wb_desc; +}; + +#define I40E_ADMINQ_DETAILS(R, i) \ + (&(((struct i40e_asq_cmd_details *)((R).cmd_buf.va))[i])) + +/* ARQ event information */ +struct i40e_arq_event_info { + struct i40e_aq_desc desc; + u16 msg_len; + u16 buf_len; + u8 *msg_buf; +}; + +/* Admin Queue information */ +struct i40e_adminq_info { + struct i40e_adminq_ring arq; /* receive queue */ + struct i40e_adminq_ring asq; /* send queue */ + u32 asq_cmd_timeout; /* send queue cmd write back timeout*/ + u16 num_arq_entries; /* receive queue depth */ + u16 num_asq_entries; /* send queue depth */ + u16 arq_buf_size; /* receive queue buffer size */ + u16 asq_buf_size; /* send queue buffer size */ + u16 fw_maj_ver; /* firmware major version 
*/ + u16 fw_min_ver; /* firmware minor version */ + u32 fw_build; /* firmware build number */ + u16 api_maj_ver; /* api major version */ + u16 api_min_ver; /* api minor version */ + + struct mutex asq_mutex; /* Send queue lock */ + struct mutex arq_mutex; /* Receive queue lock */ + + /* last status values on send and receive queues */ + enum i40e_admin_queue_err asq_last_status; + enum i40e_admin_queue_err arq_last_status; +}; + +/** + * i40e_aq_rc_to_posix - convert errors to user-land codes + * aq_ret: AdminQ handler error code can override aq_rc + * aq_rc: AdminQ firmware error code to convert + **/ +static inline int i40e_aq_rc_to_posix(int aq_ret, int aq_rc) +{ + int aq_to_posix[] = { + 0, /* I40E_AQ_RC_OK */ + -EPERM, /* I40E_AQ_RC_EPERM */ + -ENOENT, /* I40E_AQ_RC_ENOENT */ + -ESRCH, /* I40E_AQ_RC_ESRCH */ + -EINTR, /* I40E_AQ_RC_EINTR */ + -EIO, /* I40E_AQ_RC_EIO */ + -ENXIO, /* I40E_AQ_RC_ENXIO */ + -E2BIG, /* I40E_AQ_RC_E2BIG */ + -EAGAIN, /* I40E_AQ_RC_EAGAIN */ + -ENOMEM, /* I40E_AQ_RC_ENOMEM */ + -EACCES, /* I40E_AQ_RC_EACCES */ + -EFAULT, /* I40E_AQ_RC_EFAULT */ + -EBUSY, /* I40E_AQ_RC_EBUSY */ + -EEXIST, /* I40E_AQ_RC_EEXIST */ + -EINVAL, /* I40E_AQ_RC_EINVAL */ + -ENOTTY, /* I40E_AQ_RC_ENOTTY */ + -ENOSPC, /* I40E_AQ_RC_ENOSPC */ + -ENOSYS, /* I40E_AQ_RC_ENOSYS */ + -ERANGE, /* I40E_AQ_RC_ERANGE */ + -EPIPE, /* I40E_AQ_RC_EFLUSHED */ + -ESPIPE, /* I40E_AQ_RC_BAD_ADDR */ + -EROFS, /* I40E_AQ_RC_EMODE */ + -EFBIG, /* I40E_AQ_RC_EFBIG */ + }; + + /* aq_rc is invalid if AQ timed out */ + if (aq_ret == I40E_ERR_ADMIN_QUEUE_TIMEOUT) + return -EAGAIN; + + if (!((u32)aq_rc < (sizeof(aq_to_posix) / sizeof((aq_to_posix)[0])))) + return -ERANGE; + + return aq_to_posix[aq_rc]; +} + +/* general information */ +#define I40E_AQ_LARGE_BUF 512 +#define I40E_ASQ_CMD_TIMEOUT 250000 /* usecs */ + +void i40e_fill_default_direct_cmd_desc(struct i40e_aq_desc *desc, + u16 opcode); + +#endif /* _I40E_ADMINQ_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h new file mode 100644 index 0000000..0244923 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -0,0 +1,2897 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_ADMINQ_CMD_H_ +#define _I40E_ADMINQ_CMD_H_ + +/* This header file defines the i40e Admin Queue commands and is shared between + * i40e Firmware and Software. 
+ * + * This file needs to comply with the Linux Kernel coding style. + */ + +#define I40E_FW_API_VERSION_MAJOR 0x0001 +#define I40E_FW_API_VERSION_MINOR_X722 0x0005 +#define I40E_FW_API_VERSION_MINOR_X710 0x0007 + +#define I40E_FW_MINOR_VERSION(_h) ((_h)->mac.type == I40E_MAC_XL710 ? \ + I40E_FW_API_VERSION_MINOR_X710 : \ + I40E_FW_API_VERSION_MINOR_X722) + +/* API version 1.7 implements additional link and PHY-specific APIs */ +#define I40E_MINOR_VER_GET_LINK_INFO_XL710 0x0007 + +struct i40e_aq_desc { + __le16 flags; + __le16 opcode; + __le16 datalen; + __le16 retval; + __le32 cookie_high; + __le32 cookie_low; + union { + struct { + __le32 param0; + __le32 param1; + __le32 param2; + __le32 param3; + } internal; + struct { + __le32 param0; + __le32 param1; + __le32 addr_high; + __le32 addr_low; + } external; + u8 raw[16]; + } params; +}; + +/* Flags sub-structure + * |0 |1 |2 |3 |4 |5 |6 |7 |8 |9 |10 |11 |12 |13 |14 |15 | + * |DD |CMP|ERR|VFE| * * RESERVED * * |LB |RD |VFC|BUF|SI |EI |FE | + */ + +/* command flags and offsets*/ +#define I40E_AQ_FLAG_DD_SHIFT 0 +#define I40E_AQ_FLAG_CMP_SHIFT 1 +#define I40E_AQ_FLAG_ERR_SHIFT 2 +#define I40E_AQ_FLAG_VFE_SHIFT 3 +#define I40E_AQ_FLAG_LB_SHIFT 9 +#define I40E_AQ_FLAG_RD_SHIFT 10 +#define I40E_AQ_FLAG_VFC_SHIFT 11 +#define I40E_AQ_FLAG_BUF_SHIFT 12 +#define I40E_AQ_FLAG_SI_SHIFT 13 +#define I40E_AQ_FLAG_EI_SHIFT 14 +#define I40E_AQ_FLAG_FE_SHIFT 15 + +#define I40E_AQ_FLAG_DD BIT(I40E_AQ_FLAG_DD_SHIFT) /* 0x1 */ +#define I40E_AQ_FLAG_CMP BIT(I40E_AQ_FLAG_CMP_SHIFT) /* 0x2 */ +#define I40E_AQ_FLAG_ERR BIT(I40E_AQ_FLAG_ERR_SHIFT) /* 0x4 */ +#define I40E_AQ_FLAG_VFE BIT(I40E_AQ_FLAG_VFE_SHIFT) /* 0x8 */ +#define I40E_AQ_FLAG_LB BIT(I40E_AQ_FLAG_LB_SHIFT) /* 0x200 */ +#define I40E_AQ_FLAG_RD BIT(I40E_AQ_FLAG_RD_SHIFT) /* 0x400 */ +#define I40E_AQ_FLAG_VFC BIT(I40E_AQ_FLAG_VFC_SHIFT) /* 0x800 */ +#define I40E_AQ_FLAG_BUF BIT(I40E_AQ_FLAG_BUF_SHIFT) /* 0x1000 */ +#define I40E_AQ_FLAG_SI BIT(I40E_AQ_FLAG_SI_SHIFT) /* 0x2000 */ +#define I40E_AQ_FLAG_EI BIT(I40E_AQ_FLAG_EI_SHIFT) /* 0x4000 */ +#define I40E_AQ_FLAG_FE BIT(I40E_AQ_FLAG_FE_SHIFT) /* 0x8000 */ + +/* error codes */ +enum i40e_admin_queue_err { + I40E_AQ_RC_OK = 0, /* success */ + I40E_AQ_RC_EPERM = 1, /* Operation not permitted */ + I40E_AQ_RC_ENOENT = 2, /* No such element */ + I40E_AQ_RC_ESRCH = 3, /* Bad opcode */ + I40E_AQ_RC_EINTR = 4, /* operation interrupted */ + I40E_AQ_RC_EIO = 5, /* I/O error */ + I40E_AQ_RC_ENXIO = 6, /* No such resource */ + I40E_AQ_RC_E2BIG = 7, /* Arg too long */ + I40E_AQ_RC_EAGAIN = 8, /* Try again */ + I40E_AQ_RC_ENOMEM = 9, /* Out of memory */ + I40E_AQ_RC_EACCES = 10, /* Permission denied */ + I40E_AQ_RC_EFAULT = 11, /* Bad address */ + I40E_AQ_RC_EBUSY = 12, /* Device or resource busy */ + I40E_AQ_RC_EEXIST = 13, /* object already exists */ + I40E_AQ_RC_EINVAL = 14, /* Invalid argument */ + I40E_AQ_RC_ENOTTY = 15, /* Not a typewriter */ + I40E_AQ_RC_ENOSPC = 16, /* No space left or alloc failure */ + I40E_AQ_RC_ENOSYS = 17, /* Function not implemented */ + I40E_AQ_RC_ERANGE = 18, /* Parameter out of range */ + I40E_AQ_RC_EFLUSHED = 19, /* Cmd flushed due to prev cmd error */ + I40E_AQ_RC_BAD_ADDR = 20, /* Descriptor contains a bad pointer */ + I40E_AQ_RC_EMODE = 21, /* Op not allowed in current dev mode */ + I40E_AQ_RC_EFBIG = 22, /* File too large */ +}; + +/* Admin Queue command opcodes */ +enum i40e_admin_queue_opc { + /* aq commands */ + i40e_aqc_opc_get_version = 0x0001, + i40e_aqc_opc_driver_version = 0x0002, + i40e_aqc_opc_queue_shutdown = 0x0003, 
+ i40e_aqc_opc_set_pf_context = 0x0004, + + /* resource ownership */ + i40e_aqc_opc_request_resource = 0x0008, + i40e_aqc_opc_release_resource = 0x0009, + + i40e_aqc_opc_list_func_capabilities = 0x000A, + i40e_aqc_opc_list_dev_capabilities = 0x000B, + + /* Proxy commands */ + i40e_aqc_opc_set_proxy_config = 0x0104, + i40e_aqc_opc_set_ns_proxy_table_entry = 0x0105, + + /* LAA */ + i40e_aqc_opc_mac_address_read = 0x0107, + i40e_aqc_opc_mac_address_write = 0x0108, + + /* PXE */ + i40e_aqc_opc_clear_pxe_mode = 0x0110, + + /* WoL commands */ + i40e_aqc_opc_set_wol_filter = 0x0120, + i40e_aqc_opc_get_wake_reason = 0x0121, + + /* internal switch commands */ + i40e_aqc_opc_get_switch_config = 0x0200, + i40e_aqc_opc_add_statistics = 0x0201, + i40e_aqc_opc_remove_statistics = 0x0202, + i40e_aqc_opc_set_port_parameters = 0x0203, + i40e_aqc_opc_get_switch_resource_alloc = 0x0204, + i40e_aqc_opc_set_switch_config = 0x0205, + i40e_aqc_opc_rx_ctl_reg_read = 0x0206, + i40e_aqc_opc_rx_ctl_reg_write = 0x0207, + + i40e_aqc_opc_add_vsi = 0x0210, + i40e_aqc_opc_update_vsi_parameters = 0x0211, + i40e_aqc_opc_get_vsi_parameters = 0x0212, + + i40e_aqc_opc_add_pv = 0x0220, + i40e_aqc_opc_update_pv_parameters = 0x0221, + i40e_aqc_opc_get_pv_parameters = 0x0222, + + i40e_aqc_opc_add_veb = 0x0230, + i40e_aqc_opc_update_veb_parameters = 0x0231, + i40e_aqc_opc_get_veb_parameters = 0x0232, + + i40e_aqc_opc_delete_element = 0x0243, + + i40e_aqc_opc_add_macvlan = 0x0250, + i40e_aqc_opc_remove_macvlan = 0x0251, + i40e_aqc_opc_add_vlan = 0x0252, + i40e_aqc_opc_remove_vlan = 0x0253, + i40e_aqc_opc_set_vsi_promiscuous_modes = 0x0254, + i40e_aqc_opc_add_tag = 0x0255, + i40e_aqc_opc_remove_tag = 0x0256, + i40e_aqc_opc_add_multicast_etag = 0x0257, + i40e_aqc_opc_remove_multicast_etag = 0x0258, + i40e_aqc_opc_update_tag = 0x0259, + i40e_aqc_opc_add_control_packet_filter = 0x025A, + i40e_aqc_opc_remove_control_packet_filter = 0x025B, + i40e_aqc_opc_add_cloud_filters = 0x025C, + i40e_aqc_opc_remove_cloud_filters = 0x025D, + i40e_aqc_opc_clear_wol_switch_filters = 0x025E, + + i40e_aqc_opc_add_mirror_rule = 0x0260, + i40e_aqc_opc_delete_mirror_rule = 0x0261, + + /* Dynamic Device Personalization */ + i40e_aqc_opc_write_personalization_profile = 0x0270, + i40e_aqc_opc_get_personalization_profile_list = 0x0271, + + /* DCB commands */ + i40e_aqc_opc_dcb_ignore_pfc = 0x0301, + i40e_aqc_opc_dcb_updated = 0x0302, + i40e_aqc_opc_set_dcb_parameters = 0x0303, + + /* TX scheduler */ + i40e_aqc_opc_configure_vsi_bw_limit = 0x0400, + i40e_aqc_opc_configure_vsi_ets_sla_bw_limit = 0x0406, + i40e_aqc_opc_configure_vsi_tc_bw = 0x0407, + i40e_aqc_opc_query_vsi_bw_config = 0x0408, + i40e_aqc_opc_query_vsi_ets_sla_config = 0x040A, + i40e_aqc_opc_configure_switching_comp_bw_limit = 0x0410, + + i40e_aqc_opc_enable_switching_comp_ets = 0x0413, + i40e_aqc_opc_modify_switching_comp_ets = 0x0414, + i40e_aqc_opc_disable_switching_comp_ets = 0x0415, + i40e_aqc_opc_configure_switching_comp_ets_bw_limit = 0x0416, + i40e_aqc_opc_configure_switching_comp_bw_config = 0x0417, + i40e_aqc_opc_query_switching_comp_ets_config = 0x0418, + i40e_aqc_opc_query_port_ets_config = 0x0419, + i40e_aqc_opc_query_switching_comp_bw_config = 0x041A, + i40e_aqc_opc_suspend_port_tx = 0x041B, + i40e_aqc_opc_resume_port_tx = 0x041C, + i40e_aqc_opc_configure_partition_bw = 0x041D, + /* hmc */ + i40e_aqc_opc_query_hmc_resource_profile = 0x0500, + i40e_aqc_opc_set_hmc_resource_profile = 0x0501, + + /* phy commands*/ + i40e_aqc_opc_get_phy_abilities = 0x0600, + i40e_aqc_opc_set_phy_config = 
0x0601, + i40e_aqc_opc_set_mac_config = 0x0603, + i40e_aqc_opc_set_link_restart_an = 0x0605, + i40e_aqc_opc_get_link_status = 0x0607, + i40e_aqc_opc_set_phy_int_mask = 0x0613, + i40e_aqc_opc_get_local_advt_reg = 0x0614, + i40e_aqc_opc_set_local_advt_reg = 0x0615, + i40e_aqc_opc_get_partner_advt = 0x0616, + i40e_aqc_opc_set_lb_modes = 0x0618, + i40e_aqc_opc_get_phy_wol_caps = 0x0621, + i40e_aqc_opc_set_phy_debug = 0x0622, + i40e_aqc_opc_upload_ext_phy_fm = 0x0625, + i40e_aqc_opc_run_phy_activity = 0x0626, + i40e_aqc_opc_set_phy_register = 0x0628, + i40e_aqc_opc_get_phy_register = 0x0629, + + /* NVM commands */ + i40e_aqc_opc_nvm_read = 0x0701, + i40e_aqc_opc_nvm_erase = 0x0702, + i40e_aqc_opc_nvm_update = 0x0703, + i40e_aqc_opc_nvm_config_read = 0x0704, + i40e_aqc_opc_nvm_config_write = 0x0705, + i40e_aqc_opc_oem_post_update = 0x0720, + i40e_aqc_opc_thermal_sensor = 0x0721, + + /* virtualization commands */ + i40e_aqc_opc_send_msg_to_pf = 0x0801, + i40e_aqc_opc_send_msg_to_vf = 0x0802, + i40e_aqc_opc_send_msg_to_peer = 0x0803, + + /* alternate structure */ + i40e_aqc_opc_alternate_write = 0x0900, + i40e_aqc_opc_alternate_write_indirect = 0x0901, + i40e_aqc_opc_alternate_read = 0x0902, + i40e_aqc_opc_alternate_read_indirect = 0x0903, + i40e_aqc_opc_alternate_write_done = 0x0904, + i40e_aqc_opc_alternate_set_mode = 0x0905, + i40e_aqc_opc_alternate_clear_port = 0x0906, + + /* LLDP commands */ + i40e_aqc_opc_lldp_get_mib = 0x0A00, + i40e_aqc_opc_lldp_update_mib = 0x0A01, + i40e_aqc_opc_lldp_add_tlv = 0x0A02, + i40e_aqc_opc_lldp_update_tlv = 0x0A03, + i40e_aqc_opc_lldp_delete_tlv = 0x0A04, + i40e_aqc_opc_lldp_stop = 0x0A05, + i40e_aqc_opc_lldp_start = 0x0A06, + i40e_aqc_opc_get_cee_dcb_cfg = 0x0A07, + i40e_aqc_opc_lldp_set_local_mib = 0x0A08, + i40e_aqc_opc_lldp_stop_start_spec_agent = 0x0A09, + + /* Tunnel commands */ + i40e_aqc_opc_add_udp_tunnel = 0x0B00, + i40e_aqc_opc_del_udp_tunnel = 0x0B01, + i40e_aqc_opc_set_rss_key = 0x0B02, + i40e_aqc_opc_set_rss_lut = 0x0B03, + i40e_aqc_opc_get_rss_key = 0x0B04, + i40e_aqc_opc_get_rss_lut = 0x0B05, + + /* Async Events */ + i40e_aqc_opc_event_lan_overflow = 0x1001, + + /* OEM commands */ + i40e_aqc_opc_oem_parameter_change = 0xFE00, + i40e_aqc_opc_oem_device_status_change = 0xFE01, + i40e_aqc_opc_oem_ocsd_initialize = 0xFE02, + i40e_aqc_opc_oem_ocbb_initialize = 0xFE03, + + /* debug commands */ + i40e_aqc_opc_debug_read_reg = 0xFF03, + i40e_aqc_opc_debug_write_reg = 0xFF04, + i40e_aqc_opc_debug_modify_reg = 0xFF07, + i40e_aqc_opc_debug_dump_internals = 0xFF08, +}; + +/* command structures and indirect data structures */ + +/* Structure naming conventions: + * - no suffix for direct command descriptor structures + * - _data for indirect sent data + * - _resp for indirect return data (data which is both will use _data) + * - _completion for direct return data + * - _element_ for repeated elements (may also be _data or _resp) + * + * Command structures are expected to overlay the params.raw member of the basic + * descriptor, and as such cannot exceed 16 bytes in length. + */ + +/* This macro is used to generate a compilation error if a structure + * is not exactly the correct length. It gives a divide by zero error if the + * structure is not of the correct size, otherwise it creates an enum that is + * never used. + */ +#define I40E_CHECK_STRUCT_LEN(n, X) enum i40e_static_assert_enum_##X \ + { i40e_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 
1 : 0) } + +/* This macro is used extensively to ensure that command structures are 16 + * bytes in length as they have to map to the raw array of that size. + */ +#define I40E_CHECK_CMD_LENGTH(X) I40E_CHECK_STRUCT_LEN(16, X) + +/* internal (0x00XX) commands */ + +/* Get version (direct 0x0001) */ +struct i40e_aqc_get_version { + __le32 rom_ver; + __le32 fw_build; + __le16 fw_major; + __le16 fw_minor; + __le16 api_major; + __le16 api_minor; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_version); + +/* Send driver version (indirect 0x0002) */ +struct i40e_aqc_driver_version { + u8 driver_major_ver; + u8 driver_minor_ver; + u8 driver_build_ver; + u8 driver_subbuild_ver; + u8 reserved[4]; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_driver_version); + +/* Queue Shutdown (direct 0x0003) */ +struct i40e_aqc_queue_shutdown { + __le32 driver_unloading; +#define I40E_AQ_DRIVER_UNLOADING 0x1 + u8 reserved[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_queue_shutdown); + +/* Set PF context (0x0004, direct) */ +struct i40e_aqc_set_pf_context { + u8 pf_id; + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_pf_context); + +/* Request resource ownership (direct 0x0008) + * Release resource ownership (direct 0x0009) + */ +#define I40E_AQ_RESOURCE_NVM 1 +#define I40E_AQ_RESOURCE_SDP 2 +#define I40E_AQ_RESOURCE_ACCESS_READ 1 +#define I40E_AQ_RESOURCE_ACCESS_WRITE 2 +#define I40E_AQ_RESOURCE_NVM_READ_TIMEOUT 3000 +#define I40E_AQ_RESOURCE_NVM_WRITE_TIMEOUT 180000 + +struct i40e_aqc_request_resource { + __le16 resource_id; + __le16 access_type; + __le32 timeout; + __le32 resource_number; + u8 reserved[4]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_request_resource); + +/* Get function capabilities (indirect 0x000A) + * Get device capabilities (indirect 0x000B) + */ +struct i40e_aqc_list_capabilites { + u8 command_flags; +#define I40E_AQ_LIST_CAP_PF_INDEX_EN 1 + u8 pf_index; + u8 reserved[2]; + __le32 count; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_list_capabilites); + +struct i40e_aqc_list_capabilities_element_resp { + __le16 id; + u8 major_rev; + u8 minor_rev; + __le32 number; + __le32 logical_id; + __le32 phys_id; + u8 reserved[16]; +}; + +/* list of caps */ + +#define I40E_AQ_CAP_ID_SWITCH_MODE 0x0001 +#define I40E_AQ_CAP_ID_MNG_MODE 0x0002 +#define I40E_AQ_CAP_ID_NPAR_ACTIVE 0x0003 +#define I40E_AQ_CAP_ID_OS2BMC_CAP 0x0004 +#define I40E_AQ_CAP_ID_FUNCTIONS_VALID 0x0005 +#define I40E_AQ_CAP_ID_ALTERNATE_RAM 0x0006 +#define I40E_AQ_CAP_ID_WOL_AND_PROXY 0x0008 +#define I40E_AQ_CAP_ID_SRIOV 0x0012 +#define I40E_AQ_CAP_ID_VF 0x0013 +#define I40E_AQ_CAP_ID_VMDQ 0x0014 +#define I40E_AQ_CAP_ID_8021QBG 0x0015 +#define I40E_AQ_CAP_ID_8021QBR 0x0016 +#define I40E_AQ_CAP_ID_VSI 0x0017 +#define I40E_AQ_CAP_ID_DCB 0x0018 +#define I40E_AQ_CAP_ID_FCOE 0x0021 +#define I40E_AQ_CAP_ID_ISCSI 0x0022 +#define I40E_AQ_CAP_ID_RSS 0x0040 +#define I40E_AQ_CAP_ID_RXQ 0x0041 +#define I40E_AQ_CAP_ID_TXQ 0x0042 +#define I40E_AQ_CAP_ID_MSIX 0x0043 +#define I40E_AQ_CAP_ID_VF_MSIX 0x0044 +#define I40E_AQ_CAP_ID_FLOW_DIRECTOR 0x0045 +#define I40E_AQ_CAP_ID_1588 0x0046 +#define I40E_AQ_CAP_ID_IWARP 0x0051 +#define I40E_AQ_CAP_ID_LED 0x0061 +#define I40E_AQ_CAP_ID_SDP 0x0062 +#define I40E_AQ_CAP_ID_MDIO 0x0063 +#define I40E_AQ_CAP_ID_WSR_PROT 0x0064 +#define I40E_AQ_CAP_ID_NVM_MGMT 0x0080 +#define I40E_AQ_CAP_ID_FLEX10 0x00F1 +#define I40E_AQ_CAP_ID_CEM 0x00F2 + +/* Set CPPM Configuration (direct 0x0103) */ +struct i40e_aqc_cppm_configuration { + __le16 command_flags; 
+#define I40E_AQ_CPPM_EN_LTRC 0x0800 +#define I40E_AQ_CPPM_EN_DMCTH 0x1000 +#define I40E_AQ_CPPM_EN_DMCTLX 0x2000 +#define I40E_AQ_CPPM_EN_HPTC 0x4000 +#define I40E_AQ_CPPM_EN_DMARC 0x8000 + __le16 ttlx; + __le32 dmacr; + __le16 dmcth; + u8 hptc; + u8 reserved; + __le32 pfltrc; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_cppm_configuration); + +/* Set ARP Proxy command / response (indirect 0x0104) */ +struct i40e_aqc_arp_proxy_data { + __le16 command_flags; +#define I40E_AQ_ARP_INIT_IPV4 0x0800 +#define I40E_AQ_ARP_UNSUP_CTL 0x1000 +#define I40E_AQ_ARP_ENA 0x2000 +#define I40E_AQ_ARP_ADD_IPV4 0x4000 +#define I40E_AQ_ARP_DEL_IPV4 0x8000 + __le16 table_id; + __le32 enabled_offloads; +#define I40E_AQ_ARP_DIRECTED_OFFLOAD_ENABLE 0x00000020 +#define I40E_AQ_ARP_OFFLOAD_ENABLE 0x00000800 + __le32 ip_addr; + u8 mac_addr[6]; + u8 reserved[2]; +}; + +I40E_CHECK_STRUCT_LEN(0x14, i40e_aqc_arp_proxy_data); + +/* Set NS Proxy Table Entry Command (indirect 0x0105) */ +struct i40e_aqc_ns_proxy_data { + __le16 table_idx_mac_addr_0; + __le16 table_idx_mac_addr_1; + __le16 table_idx_ipv6_0; + __le16 table_idx_ipv6_1; + __le16 control; +#define I40E_AQ_NS_PROXY_ADD_0 0x0001 +#define I40E_AQ_NS_PROXY_DEL_0 0x0002 +#define I40E_AQ_NS_PROXY_ADD_1 0x0004 +#define I40E_AQ_NS_PROXY_DEL_1 0x0008 +#define I40E_AQ_NS_PROXY_ADD_IPV6_0 0x0010 +#define I40E_AQ_NS_PROXY_DEL_IPV6_0 0x0020 +#define I40E_AQ_NS_PROXY_ADD_IPV6_1 0x0040 +#define I40E_AQ_NS_PROXY_DEL_IPV6_1 0x0080 +#define I40E_AQ_NS_PROXY_COMMAND_SEQ 0x0100 +#define I40E_AQ_NS_PROXY_INIT_IPV6_TBL 0x0200 +#define I40E_AQ_NS_PROXY_INIT_MAC_TBL 0x0400 +#define I40E_AQ_NS_PROXY_OFFLOAD_ENABLE 0x0800 +#define I40E_AQ_NS_PROXY_DIRECTED_OFFLOAD_ENABLE 0x1000 + u8 mac_addr_0[6]; + u8 mac_addr_1[6]; + u8 local_mac_addr[6]; + u8 ipv6_addr_0[16]; /* Warning! 
spec specifies BE byte order */ + u8 ipv6_addr_1[16]; +}; + +I40E_CHECK_STRUCT_LEN(0x3c, i40e_aqc_ns_proxy_data); + +/* Manage LAA Command (0x0106) - obsolete */ +struct i40e_aqc_mng_laa { + __le16 command_flags; +#define I40E_AQ_LAA_FLAG_WR 0x8000 + u8 reserved[2]; + __le32 sal; + __le16 sah; + u8 reserved2[6]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_mng_laa); + +/* Manage MAC Address Read Command (indirect 0x0107) */ +struct i40e_aqc_mac_address_read { + __le16 command_flags; +#define I40E_AQC_LAN_ADDR_VALID 0x10 +#define I40E_AQC_SAN_ADDR_VALID 0x20 +#define I40E_AQC_PORT_ADDR_VALID 0x40 +#define I40E_AQC_WOL_ADDR_VALID 0x80 +#define I40E_AQC_MC_MAG_EN_VALID 0x100 +#define I40E_AQC_ADDR_VALID_MASK 0x3F0 + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_mac_address_read); + +struct i40e_aqc_mac_address_read_data { + u8 pf_lan_mac[6]; + u8 pf_san_mac[6]; + u8 port_mac[6]; + u8 pf_wol_mac[6]; +}; + +I40E_CHECK_STRUCT_LEN(24, i40e_aqc_mac_address_read_data); + +/* Manage MAC Address Write Command (0x0108) */ +struct i40e_aqc_mac_address_write { + __le16 command_flags; +#define I40E_AQC_MC_MAG_EN 0x0100 +#define I40E_AQC_WOL_PRESERVE_ON_PFR 0x0200 +#define I40E_AQC_WRITE_TYPE_LAA_ONLY 0x0000 +#define I40E_AQC_WRITE_TYPE_LAA_WOL 0x4000 +#define I40E_AQC_WRITE_TYPE_PORT 0x8000 +#define I40E_AQC_WRITE_TYPE_UPDATE_MC_MAG 0xC000 +#define I40E_AQC_WRITE_TYPE_MASK 0xC000 + + __le16 mac_sah; + __le32 mac_sal; + u8 reserved[8]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_mac_address_write); + +/* PXE commands (0x011x) */ + +/* Clear PXE Command and response (direct 0x0110) */ +struct i40e_aqc_clear_pxe { + u8 rx_cnt; + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_clear_pxe); + +/* Set WoL Filter (0x0120) */ + +struct i40e_aqc_set_wol_filter { + __le16 filter_index; +#define I40E_AQC_MAX_NUM_WOL_FILTERS 8 +#define I40E_AQC_SET_WOL_FILTER_TYPE_MAGIC_SHIFT 15 +#define I40E_AQC_SET_WOL_FILTER_TYPE_MAGIC_MASK (0x1 << \ + I40E_AQC_SET_WOL_FILTER_TYPE_MAGIC_SHIFT) + +#define I40E_AQC_SET_WOL_FILTER_INDEX_SHIFT 0 +#define I40E_AQC_SET_WOL_FILTER_INDEX_MASK (0x7 << \ + I40E_AQC_SET_WOL_FILTER_INDEX_SHIFT) + __le16 cmd_flags; +#define I40E_AQC_SET_WOL_FILTER 0x8000 +#define I40E_AQC_SET_WOL_FILTER_NO_TCO_WOL 0x4000 +#define I40E_AQC_SET_WOL_FILTER_ACTION_CLEAR 0 +#define I40E_AQC_SET_WOL_FILTER_ACTION_SET 1 + __le16 valid_flags; +#define I40E_AQC_SET_WOL_FILTER_ACTION_VALID 0x8000 +#define I40E_AQC_SET_WOL_FILTER_NO_TCO_ACTION_VALID 0x4000 + u8 reserved[2]; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_wol_filter); + +struct i40e_aqc_set_wol_filter_data { + u8 filter[128]; + u8 mask[16]; +}; + +I40E_CHECK_STRUCT_LEN(0x90, i40e_aqc_set_wol_filter_data); + +/* Get Wake Reason (0x0121) */ + +struct i40e_aqc_get_wake_reason_completion { + u8 reserved_1[2]; + __le16 wake_reason; +#define I40E_AQC_GET_WAKE_UP_REASON_WOL_REASON_MATCHED_INDEX_SHIFT 0 +#define I40E_AQC_GET_WAKE_UP_REASON_WOL_REASON_MATCHED_INDEX_MASK (0xFF << \ + I40E_AQC_GET_WAKE_UP_REASON_WOL_REASON_MATCHED_INDEX_SHIFT) +#define I40E_AQC_GET_WAKE_UP_REASON_WOL_REASON_RESERVED_SHIFT 8 +#define I40E_AQC_GET_WAKE_UP_REASON_WOL_REASON_RESERVED_MASK (0xFF << \ + I40E_AQC_GET_WAKE_UP_REASON_WOL_REASON_RESERVED_SHIFT) + u8 reserved_2[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_wake_reason_completion); + +/* Switch configuration commands (0x02xx) */ + +/* Used by many indirect commands that only pass an seid and a buffer in the + * command + */ +struct 
i40e_aqc_switch_seid { + __le16 seid; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_switch_seid); + +/* Get Switch Configuration command (indirect 0x0200) + * uses i40e_aqc_switch_seid for the descriptor + */ +struct i40e_aqc_get_switch_config_header_resp { + __le16 num_reported; + __le16 num_total; + u8 reserved[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_switch_config_header_resp); + +struct i40e_aqc_switch_config_element_resp { + u8 element_type; +#define I40E_AQ_SW_ELEM_TYPE_MAC 1 +#define I40E_AQ_SW_ELEM_TYPE_PF 2 +#define I40E_AQ_SW_ELEM_TYPE_VF 3 +#define I40E_AQ_SW_ELEM_TYPE_EMP 4 +#define I40E_AQ_SW_ELEM_TYPE_BMC 5 +#define I40E_AQ_SW_ELEM_TYPE_PV 16 +#define I40E_AQ_SW_ELEM_TYPE_VEB 17 +#define I40E_AQ_SW_ELEM_TYPE_PA 18 +#define I40E_AQ_SW_ELEM_TYPE_VSI 19 + u8 revision; +#define I40E_AQ_SW_ELEM_REV_1 1 + __le16 seid; + __le16 uplink_seid; + __le16 downlink_seid; + u8 reserved[3]; + u8 connection_type; +#define I40E_AQ_CONN_TYPE_REGULAR 0x1 +#define I40E_AQ_CONN_TYPE_DEFAULT 0x2 +#define I40E_AQ_CONN_TYPE_CASCADED 0x3 + __le16 scheduler_id; + __le16 element_info; +}; + +I40E_CHECK_STRUCT_LEN(0x10, i40e_aqc_switch_config_element_resp); + +/* Get Switch Configuration (indirect 0x0200) + * an array of elements are returned in the response buffer + * the first in the array is the header, remainder are elements + */ +struct i40e_aqc_get_switch_config_resp { + struct i40e_aqc_get_switch_config_header_resp header; + struct i40e_aqc_switch_config_element_resp element[1]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_get_switch_config_resp); + +/* Add Statistics (direct 0x0201) + * Remove Statistics (direct 0x0202) + */ +struct i40e_aqc_add_remove_statistics { + __le16 seid; + __le16 vlan; + __le16 stat_index; + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_statistics); + +/* Set Port Parameters command (direct 0x0203) */ +struct i40e_aqc_set_port_parameters { + __le16 command_flags; +#define I40E_AQ_SET_P_PARAMS_SAVE_BAD_PACKETS 1 +#define I40E_AQ_SET_P_PARAMS_PAD_SHORT_PACKETS 2 /* must set! 
*/ +#define I40E_AQ_SET_P_PARAMS_DOUBLE_VLAN_ENA 4 + __le16 bad_frame_vsi; +#define I40E_AQ_SET_P_PARAMS_BFRAME_SEID_SHIFT 0x0 +#define I40E_AQ_SET_P_PARAMS_BFRAME_SEID_MASK 0x3FF + __le16 default_seid; /* reserved for command */ + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_port_parameters); + +/* Get Switch Resource Allocation (indirect 0x0204) */ +struct i40e_aqc_get_switch_resource_alloc { + u8 num_entries; /* reserved for command */ + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_switch_resource_alloc); + +/* expect an array of these structs in the response buffer */ +struct i40e_aqc_switch_resource_alloc_element_resp { + u8 resource_type; +#define I40E_AQ_RESOURCE_TYPE_VEB 0x0 +#define I40E_AQ_RESOURCE_TYPE_VSI 0x1 +#define I40E_AQ_RESOURCE_TYPE_MACADDR 0x2 +#define I40E_AQ_RESOURCE_TYPE_STAG 0x3 +#define I40E_AQ_RESOURCE_TYPE_ETAG 0x4 +#define I40E_AQ_RESOURCE_TYPE_MULTICAST_HASH 0x5 +#define I40E_AQ_RESOURCE_TYPE_UNICAST_HASH 0x6 +#define I40E_AQ_RESOURCE_TYPE_VLAN 0x7 +#define I40E_AQ_RESOURCE_TYPE_VSI_LIST_ENTRY 0x8 +#define I40E_AQ_RESOURCE_TYPE_ETAG_LIST_ENTRY 0x9 +#define I40E_AQ_RESOURCE_TYPE_VLAN_STAT_POOL 0xA +#define I40E_AQ_RESOURCE_TYPE_MIRROR_RULE 0xB +#define I40E_AQ_RESOURCE_TYPE_QUEUE_SETS 0xC +#define I40E_AQ_RESOURCE_TYPE_VLAN_FILTERS 0xD +#define I40E_AQ_RESOURCE_TYPE_INNER_MAC_FILTERS 0xF +#define I40E_AQ_RESOURCE_TYPE_IP_FILTERS 0x10 +#define I40E_AQ_RESOURCE_TYPE_GRE_VN_KEYS 0x11 +#define I40E_AQ_RESOURCE_TYPE_VN2_KEYS 0x12 +#define I40E_AQ_RESOURCE_TYPE_TUNNEL_PORTS 0x13 + u8 reserved1; + __le16 guaranteed; + __le16 total; + __le16 used; + __le16 total_unalloced; + u8 reserved2[6]; +}; + +I40E_CHECK_STRUCT_LEN(0x10, i40e_aqc_switch_resource_alloc_element_resp); + +/* Set Switch Configuration (direct 0x0205) */ +struct i40e_aqc_set_switch_config { + __le16 flags; +/* flags used for both fields below */ +#define I40E_AQ_SET_SWITCH_CFG_PROMISC 0x0001 +#define I40E_AQ_SET_SWITCH_CFG_L2_FILTER 0x0002 + __le16 valid_flags; + /* The ethertype in switch_tag is dropped on ingress and used + * internally by the switch. Set this to zero for the default + * of 0x88a8 (802.1ad). Should be zero for firmware API + * versions lower than 1.7. + */ + __le16 switch_tag; + /* The ethertypes in first_tag and second_tag are used to + * match the outer and inner VLAN tags (respectively) when HW + * double VLAN tagging is enabled via the set port parameters + * AQ command. Otherwise these are both ignored. Set them to + * zero for their defaults of 0x8100 (802.1Q). Should be zero + * for firmware API versions lower than 1.7. 
+ */ + __le16 first_tag; + __le16 second_tag; + /* Next byte is split into following: + * Bit 7 : 0 : No action, 1: Switch to mode defined by bits 6:0 + * Bit 6 : 0 : Destination Port, 1: source port + * Bit 5..4 : L4 type + * 0: rsvd + * 1: TCP + * 2: UDP + * 3: Both TCP and UDP + * Bits 3:0 Mode + * 0: default mode + * 1: L4 port only mode + * 2: non-tunneled mode + * 3: tunneled mode + */ +#define I40E_AQ_SET_SWITCH_BIT7_VALID 0x80 + +#define I40E_AQ_SET_SWITCH_L4_SRC_PORT 0x40 + +#define I40E_AQ_SET_SWITCH_L4_TYPE_RSVD 0x00 +#define I40E_AQ_SET_SWITCH_L4_TYPE_TCP 0x10 +#define I40E_AQ_SET_SWITCH_L4_TYPE_UDP 0x20 +#define I40E_AQ_SET_SWITCH_L4_TYPE_BOTH 0x30 + +#define I40E_AQ_SET_SWITCH_MODE_DEFAULT 0x00 +#define I40E_AQ_SET_SWITCH_MODE_L4_PORT 0x01 +#define I40E_AQ_SET_SWITCH_MODE_NON_TUNNEL 0x02 +#define I40E_AQ_SET_SWITCH_MODE_TUNNEL 0x03 + u8 mode; + u8 rsvd5[5]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config); + +/* Read Receive control registers (direct 0x0206) + * Write Receive control registers (direct 0x0207) + * used for accessing Rx control registers that can be + * slow and need special handling when under high Rx load + */ +struct i40e_aqc_rx_ctl_reg_read_write { + __le32 reserved1; + __le32 address; + __le32 reserved2; + __le32 value; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_rx_ctl_reg_read_write); + +/* Add VSI (indirect 0x0210) + * this indirect command uses struct i40e_aqc_vsi_properties_data + * as the indirect buffer (128 bytes) + * + * Update VSI (indirect 0x211) + * uses the same data structure as Add VSI + * + * Get VSI (indirect 0x0212) + * uses the same completion and data structure as Add VSI + */ +struct i40e_aqc_add_get_update_vsi { + __le16 uplink_seid; + u8 connection_type; +#define I40E_AQ_VSI_CONN_TYPE_NORMAL 0x1 +#define I40E_AQ_VSI_CONN_TYPE_DEFAULT 0x2 +#define I40E_AQ_VSI_CONN_TYPE_CASCADED 0x3 + u8 reserved1; + u8 vf_id; + u8 reserved2; + __le16 vsi_flags; +#define I40E_AQ_VSI_TYPE_SHIFT 0x0 +#define I40E_AQ_VSI_TYPE_MASK (0x3 << I40E_AQ_VSI_TYPE_SHIFT) +#define I40E_AQ_VSI_TYPE_VF 0x0 +#define I40E_AQ_VSI_TYPE_VMDQ2 0x1 +#define I40E_AQ_VSI_TYPE_PF 0x2 +#define I40E_AQ_VSI_TYPE_EMP_MNG 0x3 +#define I40E_AQ_VSI_FLAG_CASCADED_PV 0x4 + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_get_update_vsi); + +struct i40e_aqc_add_get_update_vsi_completion { + __le16 seid; + __le16 vsi_number; + __le16 vsi_used; + __le16 vsi_free; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_get_update_vsi_completion); + +struct i40e_aqc_vsi_properties_data { + /* first 96 byte are written by SW */ + __le16 valid_sections; +#define I40E_AQ_VSI_PROP_SWITCH_VALID 0x0001 +#define I40E_AQ_VSI_PROP_SECURITY_VALID 0x0002 +#define I40E_AQ_VSI_PROP_VLAN_VALID 0x0004 +#define I40E_AQ_VSI_PROP_CAS_PV_VALID 0x0008 +#define I40E_AQ_VSI_PROP_INGRESS_UP_VALID 0x0010 +#define I40E_AQ_VSI_PROP_EGRESS_UP_VALID 0x0020 +#define I40E_AQ_VSI_PROP_QUEUE_MAP_VALID 0x0040 +#define I40E_AQ_VSI_PROP_QUEUE_OPT_VALID 0x0080 +#define I40E_AQ_VSI_PROP_OUTER_UP_VALID 0x0100 +#define I40E_AQ_VSI_PROP_SCHED_VALID 0x0200 + /* switch section */ + __le16 switch_id; /* 12bit id combined with flags below */ +#define I40E_AQ_VSI_SW_ID_SHIFT 0x0000 +#define I40E_AQ_VSI_SW_ID_MASK (0xFFF << I40E_AQ_VSI_SW_ID_SHIFT) +#define I40E_AQ_VSI_SW_ID_FLAG_NOT_STAG 0x1000 +#define I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB 0x2000 +#define I40E_AQ_VSI_SW_ID_FLAG_LOCAL_LB 0x4000 + u8 sw_reserved[2]; + /* security section */ + u8 sec_flags; +#define 
I40E_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD 0x01 +#define I40E_AQ_VSI_SEC_FLAG_ENABLE_VLAN_CHK 0x02 +#define I40E_AQ_VSI_SEC_FLAG_ENABLE_MAC_CHK 0x04 + u8 sec_reserved; + /* VLAN section */ + __le16 pvid; /* VLANS include priority bits */ + __le16 fcoe_pvid; + u8 port_vlan_flags; +#define I40E_AQ_VSI_PVLAN_MODE_SHIFT 0x00 +#define I40E_AQ_VSI_PVLAN_MODE_MASK (0x03 << \ + I40E_AQ_VSI_PVLAN_MODE_SHIFT) +#define I40E_AQ_VSI_PVLAN_MODE_TAGGED 0x01 +#define I40E_AQ_VSI_PVLAN_MODE_UNTAGGED 0x02 +#define I40E_AQ_VSI_PVLAN_MODE_ALL 0x03 +#define I40E_AQ_VSI_PVLAN_INSERT_PVID 0x04 +#define I40E_AQ_VSI_PVLAN_EMOD_SHIFT 0x03 +#define I40E_AQ_VSI_PVLAN_EMOD_MASK (0x3 << \ + I40E_AQ_VSI_PVLAN_EMOD_SHIFT) +#define I40E_AQ_VSI_PVLAN_EMOD_STR_BOTH 0x0 +#define I40E_AQ_VSI_PVLAN_EMOD_STR_UP 0x08 +#define I40E_AQ_VSI_PVLAN_EMOD_STR 0x10 +#define I40E_AQ_VSI_PVLAN_EMOD_NOTHING 0x18 + u8 pvlan_reserved[3]; + /* ingress egress up sections */ + __le32 ingress_table; /* bitmap, 3 bits per up */ +#define I40E_AQ_VSI_UP_TABLE_UP0_SHIFT 0 +#define I40E_AQ_VSI_UP_TABLE_UP0_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP0_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP1_SHIFT 3 +#define I40E_AQ_VSI_UP_TABLE_UP1_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP1_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP2_SHIFT 6 +#define I40E_AQ_VSI_UP_TABLE_UP2_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP2_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP3_SHIFT 9 +#define I40E_AQ_VSI_UP_TABLE_UP3_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP3_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP4_SHIFT 12 +#define I40E_AQ_VSI_UP_TABLE_UP4_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP4_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP5_SHIFT 15 +#define I40E_AQ_VSI_UP_TABLE_UP5_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP5_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP6_SHIFT 18 +#define I40E_AQ_VSI_UP_TABLE_UP6_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP6_SHIFT) +#define I40E_AQ_VSI_UP_TABLE_UP7_SHIFT 21 +#define I40E_AQ_VSI_UP_TABLE_UP7_MASK (0x7 << \ + I40E_AQ_VSI_UP_TABLE_UP7_SHIFT) + __le32 egress_table; /* same defines as for ingress table */ + /* cascaded PV section */ + __le16 cas_pv_tag; + u8 cas_pv_flags; +#define I40E_AQ_VSI_CAS_PV_TAGX_SHIFT 0x00 +#define I40E_AQ_VSI_CAS_PV_TAGX_MASK (0x03 << \ + I40E_AQ_VSI_CAS_PV_TAGX_SHIFT) +#define I40E_AQ_VSI_CAS_PV_TAGX_LEAVE 0x00 +#define I40E_AQ_VSI_CAS_PV_TAGX_REMOVE 0x01 +#define I40E_AQ_VSI_CAS_PV_TAGX_COPY 0x02 +#define I40E_AQ_VSI_CAS_PV_INSERT_TAG 0x10 +#define I40E_AQ_VSI_CAS_PV_ETAG_PRUNE 0x20 +#define I40E_AQ_VSI_CAS_PV_ACCEPT_HOST_TAG 0x40 + u8 cas_pv_reserved; + /* queue mapping section */ + __le16 mapping_flags; +#define I40E_AQ_VSI_QUE_MAP_CONTIG 0x0 +#define I40E_AQ_VSI_QUE_MAP_NONCONTIG 0x1 + __le16 queue_mapping[16]; +#define I40E_AQ_VSI_QUEUE_SHIFT 0x0 +#define I40E_AQ_VSI_QUEUE_MASK (0x7FF << I40E_AQ_VSI_QUEUE_SHIFT) + __le16 tc_mapping[8]; +#define I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT 0 +#define I40E_AQ_VSI_TC_QUE_OFFSET_MASK (0x1FF << \ + I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) +#define I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT 9 +#define I40E_AQ_VSI_TC_QUE_NUMBER_MASK (0x7 << \ + I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) + /* queueing option section */ + u8 queueing_opt_flags; +#define I40E_AQ_VSI_QUE_OPT_MULTICAST_UDP_ENA 0x04 +#define I40E_AQ_VSI_QUE_OPT_UNICAST_UDP_ENA 0x08 +#define I40E_AQ_VSI_QUE_OPT_TCP_ENA 0x10 +#define I40E_AQ_VSI_QUE_OPT_FCOE_ENA 0x20 +#define I40E_AQ_VSI_QUE_OPT_RSS_LUT_PF 0x00 +#define I40E_AQ_VSI_QUE_OPT_RSS_LUT_VSI 0x40 + u8 queueing_opt_reserved[3]; + /* scheduler section */ + u8 up_enable_bits; + u8 sched_reserved; + /* outer up section */ + __le32 
outer_up_table; /* same structure and defines as ingress tbl */ + u8 cmd_reserved[8]; + /* last 32 bytes are written by FW */ + __le16 qs_handle[8]; +#define I40E_AQ_VSI_QS_HANDLE_INVALID 0xFFFF + __le16 stat_counter_idx; + __le16 sched_id; + u8 resp_reserved[12]; +}; + +I40E_CHECK_STRUCT_LEN(128, i40e_aqc_vsi_properties_data); + +/* Add Port Virtualizer (direct 0x0220) + * also used for update PV (direct 0x0221) but only flags are used + * (IS_CTRL_PORT only works on add PV) + */ +struct i40e_aqc_add_update_pv { + __le16 command_flags; +#define I40E_AQC_PV_FLAG_PV_TYPE 0x1 +#define I40E_AQC_PV_FLAG_FWD_UNKNOWN_STAG_EN 0x2 +#define I40E_AQC_PV_FLAG_FWD_UNKNOWN_ETAG_EN 0x4 +#define I40E_AQC_PV_FLAG_IS_CTRL_PORT 0x8 + __le16 uplink_seid; + __le16 connected_seid; + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_update_pv); + +struct i40e_aqc_add_update_pv_completion { + /* reserved for update; for add also encodes error if rc == ENOSPC */ + __le16 pv_seid; +#define I40E_AQC_PV_ERR_FLAG_NO_PV 0x1 +#define I40E_AQC_PV_ERR_FLAG_NO_SCHED 0x2 +#define I40E_AQC_PV_ERR_FLAG_NO_COUNTER 0x4 +#define I40E_AQC_PV_ERR_FLAG_NO_ENTRY 0x8 + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_update_pv_completion); + +/* Get PV Params (direct 0x0222) + * uses i40e_aqc_switch_seid for the descriptor + */ + +struct i40e_aqc_get_pv_params_completion { + __le16 seid; + __le16 default_stag; + __le16 pv_flags; /* same flags as add_pv */ +#define I40E_AQC_GET_PV_PV_TYPE 0x1 +#define I40E_AQC_GET_PV_FRWD_UNKNOWN_STAG 0x2 +#define I40E_AQC_GET_PV_FRWD_UNKNOWN_ETAG 0x4 + u8 reserved[8]; + __le16 default_port_seid; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_pv_params_completion); + +/* Add VEB (direct 0x0230) */ +struct i40e_aqc_add_veb { + __le16 uplink_seid; + __le16 downlink_seid; + __le16 veb_flags; +#define I40E_AQC_ADD_VEB_FLOATING 0x1 +#define I40E_AQC_ADD_VEB_PORT_TYPE_SHIFT 1 +#define I40E_AQC_ADD_VEB_PORT_TYPE_MASK (0x3 << \ + I40E_AQC_ADD_VEB_PORT_TYPE_SHIFT) +#define I40E_AQC_ADD_VEB_PORT_TYPE_DEFAULT 0x2 +#define I40E_AQC_ADD_VEB_PORT_TYPE_DATA 0x4 +#define I40E_AQC_ADD_VEB_ENABLE_L2_FILTER 0x8 /* deprecated */ +#define I40E_AQC_ADD_VEB_ENABLE_DISABLE_STATS 0x10 + u8 enable_tcs; + u8 reserved[9]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_veb); + +struct i40e_aqc_add_veb_completion { + u8 reserved[6]; + __le16 switch_seid; + /* also encodes error if rc == ENOSPC; codes are the same as add_pv */ + __le16 veb_seid; +#define I40E_AQC_VEB_ERR_FLAG_NO_VEB 0x1 +#define I40E_AQC_VEB_ERR_FLAG_NO_SCHED 0x2 +#define I40E_AQC_VEB_ERR_FLAG_NO_COUNTER 0x4 +#define I40E_AQC_VEB_ERR_FLAG_NO_ENTRY 0x8 + __le16 statistic_index; + __le16 vebs_used; + __le16 vebs_free; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_veb_completion); + +/* Get VEB Parameters (direct 0x0232) + * uses i40e_aqc_switch_seid for the descriptor + */ +struct i40e_aqc_get_veb_parameters_completion { + __le16 seid; + __le16 switch_id; + __le16 veb_flags; /* only the first/last flags from 0x0230 is valid */ + __le16 statistic_index; + __le16 vebs_used; + __le16 vebs_free; + u8 reserved[4]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_veb_parameters_completion); + +/* Delete Element (direct 0x0243) + * uses the generic i40e_aqc_switch_seid + */ + +/* Add MAC-VLAN (indirect 0x0250) */ + +/* used for the command for most vlan commands */ +struct i40e_aqc_macvlan { + __le16 num_addresses; + __le16 seid[3]; +#define I40E_AQC_MACVLAN_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_MACVLAN_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_MACVLAN_CMD_SEID_NUM_SHIFT) 
+#define I40E_AQC_MACVLAN_CMD_SEID_VALID 0x8000 + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_macvlan); + +/* indirect data for command and response */ +struct i40e_aqc_add_macvlan_element_data { + u8 mac_addr[6]; + __le16 vlan_tag; + __le16 flags; +#define I40E_AQC_MACVLAN_ADD_PERFECT_MATCH 0x0001 +#define I40E_AQC_MACVLAN_ADD_HASH_MATCH 0x0002 +#define I40E_AQC_MACVLAN_ADD_IGNORE_VLAN 0x0004 +#define I40E_AQC_MACVLAN_ADD_TO_QUEUE 0x0008 +#define I40E_AQC_MACVLAN_ADD_USE_SHARED_MAC 0x0010 + __le16 queue_number; +#define I40E_AQC_MACVLAN_CMD_QUEUE_SHIFT 0 +#define I40E_AQC_MACVLAN_CMD_QUEUE_MASK (0x7FF << \ + I40E_AQC_MACVLAN_CMD_SEID_NUM_SHIFT) + /* response section */ + u8 match_method; +#define I40E_AQC_MM_PERFECT_MATCH 0x01 +#define I40E_AQC_MM_HASH_MATCH 0x02 +#define I40E_AQC_MM_ERR_NO_RES 0xFF + u8 reserved1[3]; +}; + +struct i40e_aqc_add_remove_macvlan_completion { + __le16 perfect_mac_used; + __le16 perfect_mac_free; + __le16 unicast_hash_free; + __le16 multicast_hash_free; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_macvlan_completion); + +/* Remove MAC-VLAN (indirect 0x0251) + * uses i40e_aqc_macvlan for the descriptor + * data points to an array of num_addresses of elements + */ + +struct i40e_aqc_remove_macvlan_element_data { + u8 mac_addr[6]; + __le16 vlan_tag; + u8 flags; +#define I40E_AQC_MACVLAN_DEL_PERFECT_MATCH 0x01 +#define I40E_AQC_MACVLAN_DEL_HASH_MATCH 0x02 +#define I40E_AQC_MACVLAN_DEL_IGNORE_VLAN 0x08 +#define I40E_AQC_MACVLAN_DEL_ALL_VSIS 0x10 + u8 reserved[3]; + /* reply section */ + u8 error_code; +#define I40E_AQC_REMOVE_MACVLAN_SUCCESS 0x0 +#define I40E_AQC_REMOVE_MACVLAN_FAIL 0xFF + u8 reply_reserved[3]; +}; + +/* Add VLAN (indirect 0x0252) + * Remove VLAN (indirect 0x0253) + * use the generic i40e_aqc_macvlan for the command + */ +struct i40e_aqc_add_remove_vlan_element_data { + __le16 vlan_tag; + u8 vlan_flags; +/* flags for add VLAN */ +#define I40E_AQC_ADD_VLAN_LOCAL 0x1 +#define I40E_AQC_ADD_PVLAN_TYPE_SHIFT 1 +#define I40E_AQC_ADD_PVLAN_TYPE_MASK (0x3 << I40E_AQC_ADD_PVLAN_TYPE_SHIFT) +#define I40E_AQC_ADD_PVLAN_TYPE_REGULAR 0x0 +#define I40E_AQC_ADD_PVLAN_TYPE_PRIMARY 0x2 +#define I40E_AQC_ADD_PVLAN_TYPE_SECONDARY 0x4 +#define I40E_AQC_VLAN_PTYPE_SHIFT 3 +#define I40E_AQC_VLAN_PTYPE_MASK (0x3 << I40E_AQC_VLAN_PTYPE_SHIFT) +#define I40E_AQC_VLAN_PTYPE_REGULAR_VSI 0x0 +#define I40E_AQC_VLAN_PTYPE_PROMISC_VSI 0x8 +#define I40E_AQC_VLAN_PTYPE_COMMUNITY_VSI 0x10 +#define I40E_AQC_VLAN_PTYPE_ISOLATED_VSI 0x18 +/* flags for remove VLAN */ +#define I40E_AQC_REMOVE_VLAN_ALL 0x1 + u8 reserved; + u8 result; +/* flags for add VLAN */ +#define I40E_AQC_ADD_VLAN_SUCCESS 0x0 +#define I40E_AQC_ADD_VLAN_FAIL_REQUEST 0xFE +#define I40E_AQC_ADD_VLAN_FAIL_RESOURCE 0xFF +/* flags for remove VLAN */ +#define I40E_AQC_REMOVE_VLAN_SUCCESS 0x0 +#define I40E_AQC_REMOVE_VLAN_FAIL 0xFF + u8 reserved1[3]; +}; + +struct i40e_aqc_add_remove_vlan_completion { + u8 reserved[4]; + __le16 vlans_used; + __le16 vlans_free; + __le32 addr_high; + __le32 addr_low; +}; + +/* Set VSI Promiscuous Modes (direct 0x0254) */ +struct i40e_aqc_set_vsi_promiscuous_modes { + __le16 promiscuous_flags; + __le16 valid_flags; +/* flags used for both fields above */ +#define I40E_AQC_SET_VSI_PROMISC_UNICAST 0x01 +#define I40E_AQC_SET_VSI_PROMISC_MULTICAST 0x02 +#define I40E_AQC_SET_VSI_PROMISC_BROADCAST 0x04 +#define I40E_AQC_SET_VSI_DEFAULT 0x08 +#define I40E_AQC_SET_VSI_PROMISC_VLAN 0x10 +#define I40E_AQC_SET_VSI_PROMISC_TX 0x8000 
+ __le16 seid; +#define I40E_AQC_VSI_PROM_CMD_SEID_MASK 0x3FF + __le16 vlan_tag; +#define I40E_AQC_SET_VSI_VLAN_MASK 0x0FFF +#define I40E_AQC_SET_VSI_VLAN_VALID 0x8000 + u8 reserved[8]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_vsi_promiscuous_modes); + +/* Add S/E-tag command (direct 0x0255) + * Uses generic i40e_aqc_add_remove_tag_completion for completion + */ +struct i40e_aqc_add_tag { + __le16 flags; +#define I40E_AQC_ADD_TAG_FLAG_TO_QUEUE 0x0001 + __le16 seid; +#define I40E_AQC_ADD_TAG_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_ADD_TAG_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_ADD_TAG_CMD_SEID_NUM_SHIFT) + __le16 tag; + __le16 queue_number; + u8 reserved[8]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_tag); + +struct i40e_aqc_add_remove_tag_completion { + u8 reserved[12]; + __le16 tags_used; + __le16 tags_free; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_tag_completion); + +/* Remove S/E-tag command (direct 0x0256) + * Uses generic i40e_aqc_add_remove_tag_completion for completion + */ +struct i40e_aqc_remove_tag { + __le16 seid; +#define I40E_AQC_REMOVE_TAG_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_REMOVE_TAG_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_REMOVE_TAG_CMD_SEID_NUM_SHIFT) + __le16 tag; + u8 reserved[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_tag); + +/* Add multicast E-Tag (direct 0x0257) + * del multicast E-Tag (direct 0x0258) only uses pv_seid and etag fields + * and no external data + */ +struct i40e_aqc_add_remove_mcast_etag { + __le16 pv_seid; + __le16 etag; + u8 num_unicast_etags; + u8 reserved[3]; + __le32 addr_high; /* address of array of 2-byte s-tags */ + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_mcast_etag); + +struct i40e_aqc_add_remove_mcast_etag_completion { + u8 reserved[4]; + __le16 mcast_etags_used; + __le16 mcast_etags_free; + __le32 addr_high; + __le32 addr_low; + +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_mcast_etag_completion); + +/* Update S/E-Tag (direct 0x0259) */ +struct i40e_aqc_update_tag { + __le16 seid; +#define I40E_AQC_UPDATE_TAG_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_UPDATE_TAG_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_UPDATE_TAG_CMD_SEID_NUM_SHIFT) + __le16 old_tag; + __le16 new_tag; + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_update_tag); + +struct i40e_aqc_update_tag_completion { + u8 reserved[12]; + __le16 tags_used; + __le16 tags_free; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_update_tag_completion); + +/* Add Control Packet filter (direct 0x025A) + * Remove Control Packet filter (direct 0x025B) + * uses the i40e_aqc_add_oveb_cloud, + * and the generic direct completion structure + */ +struct i40e_aqc_add_remove_control_packet_filter { + u8 mac[6]; + __le16 etype; + __le16 flags; +#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC 0x0001 +#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_DROP 0x0002 +#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TO_QUEUE 0x0004 +#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TX 0x0008 +#define I40E_AQC_ADD_CONTROL_PACKET_FLAGS_RX 0x0000 + __le16 seid; +#define I40E_AQC_ADD_CONTROL_PACKET_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_ADD_CONTROL_PACKET_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_ADD_CONTROL_PACKET_CMD_SEID_NUM_SHIFT) + __le16 queue; + u8 reserved[2]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_control_packet_filter); + +struct i40e_aqc_add_remove_control_packet_filter_completion { + __le16 mac_etype_used; + __le16 etype_used; + __le16 mac_etype_free; + __le16 etype_free; + u8 reserved[8]; +}; + 
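[Editor's note: illustrative sketch, not part of the patch.] The direct (16-byte) command structures in this header are used by overlaying them on the params.raw member of struct i40e_aq_desc, exactly as the naming-convention comment and the I40E_CHECK_CMD_LENGTH() assertions require. A minimal sketch of that pattern for Set VSI Promiscuous Modes (0x0254) follows. It assumes the driver's send helper i40e_asq_send_command() and the struct i40e_hw plumbing declared elsewhere in this series (i40e_prototype.h, i40e_type.h); the function name example_set_vsi_promisc() is hypothetical.

/* Illustrative only: build a direct AQ command inside the descriptor,
 * send it, and map the firmware return code to a POSIX errno with
 * i40e_aq_rc_to_posix() from i40e_adminq.h above.
 */
static int example_set_vsi_promisc(struct i40e_hw *hw, u16 seid, bool unicast)
{
	struct i40e_aq_desc desc;
	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
	i40e_status status;

	/* Zero the descriptor and set opcode 0x0254 plus default flags */
	i40e_fill_default_direct_cmd_desc(&desc,
					  i40e_aqc_opc_set_vsi_promiscuous_modes);

	/* All multi-byte fields are little-endian on the wire */
	cmd->promiscuous_flags =
		cpu_to_le16(unicast ? I40E_AQC_SET_VSI_PROMISC_UNICAST : 0);
	cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_UNICAST);
	cmd->seid = cpu_to_le16(seid & I40E_AQC_VSI_PROM_CMD_SEID_MASK);

	/* i40e_asq_send_command() is assumed from i40e_prototype.h */
	status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL);

	return i40e_aq_rc_to_posix(status, hw->aq.asq_last_status);
}

Because I40E_CHECK_CMD_LENGTH() guarantees the structure is exactly 16 bytes, the cast onto desc.params.raw cannot run past the descriptor; indirect commands additionally set the I40E_AQ_FLAG_BUF (and, when sending data, I40E_AQ_FLAG_RD) flag and pass their buffer through the addr_high/addr_low words instead.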
+I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_control_packet_filter_completion); + +/* Add Cloud filters (indirect 0x025C) + * Remove Cloud filters (indirect 0x025D) + * uses the i40e_aqc_add_remove_cloud_filters, + * and the generic indirect completion structure + */ +struct i40e_aqc_add_remove_cloud_filters { + u8 num_filters; + u8 reserved; + __le16 seid; +#define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT 0 +#define I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_MASK (0x3FF << \ + I40E_AQC_ADD_CLOUD_CMD_SEID_NUM_SHIFT) + u8 big_buffer_flag; +#define I40E_AQC_ADD_CLOUD_CMD_BB 1 + u8 reserved2[3]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_remove_cloud_filters); + +struct i40e_aqc_cloud_filters_element_data { + u8 outer_mac[6]; + u8 inner_mac[6]; + __le16 inner_vlan; + union { + struct { + u8 reserved[12]; + u8 data[4]; + } v4; + struct { + u8 data[16]; + } v6; + struct { + __le16 data[8]; + } raw_v6; + } ipaddr; + __le16 flags; +#define I40E_AQC_ADD_CLOUD_FILTER_SHIFT 0 +#define I40E_AQC_ADD_CLOUD_FILTER_MASK (0x3F << \ + I40E_AQC_ADD_CLOUD_FILTER_SHIFT) +/* 0x0000 reserved */ +#define I40E_AQC_ADD_CLOUD_FILTER_OIP 0x0001 +/* 0x0002 reserved */ +#define I40E_AQC_ADD_CLOUD_FILTER_IMAC_IVLAN 0x0003 +#define I40E_AQC_ADD_CLOUD_FILTER_IMAC_IVLAN_TEN_ID 0x0004 +/* 0x0005 reserved */ +#define I40E_AQC_ADD_CLOUD_FILTER_IMAC_TEN_ID 0x0006 +/* 0x0007 reserved */ +/* 0x0008 reserved */ +#define I40E_AQC_ADD_CLOUD_FILTER_OMAC 0x0009 +#define I40E_AQC_ADD_CLOUD_FILTER_IMAC 0x000A +#define I40E_AQC_ADD_CLOUD_FILTER_OMAC_TEN_ID_IMAC 0x000B +#define I40E_AQC_ADD_CLOUD_FILTER_IIP 0x000C +/* 0x0010 to 0x0017 is for custom filters */ +#define I40E_AQC_ADD_CLOUD_FILTER_IP_PORT 0x0010 /* Dest IP + L4 Port */ +#define I40E_AQC_ADD_CLOUD_FILTER_MAC_PORT 0x0011 /* Dest MAC + L4 Port */ +#define I40E_AQC_ADD_CLOUD_FILTER_MAC_VLAN_PORT 0x0012 /* Dest MAC + VLAN + L4 Port */ + +#define I40E_AQC_ADD_CLOUD_FLAGS_TO_QUEUE 0x0080 +#define I40E_AQC_ADD_CLOUD_VNK_SHIFT 6 +#define I40E_AQC_ADD_CLOUD_VNK_MASK 0x00C0 +#define I40E_AQC_ADD_CLOUD_FLAGS_IPV4 0 +#define I40E_AQC_ADD_CLOUD_FLAGS_IPV6 0x0100 + +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_SHIFT 9 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_MASK 0x1E00 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_VXLAN 0 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_NVGRE_OMAC 1 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_GENEVE 2 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_IP 3 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_RESERVED 4 +#define I40E_AQC_ADD_CLOUD_TNL_TYPE_VXLAN_GPE 5 + +#define I40E_AQC_ADD_CLOUD_FLAGS_SHARED_OUTER_MAC 0x2000 +#define I40E_AQC_ADD_CLOUD_FLAGS_SHARED_INNER_MAC 0x4000 +#define I40E_AQC_ADD_CLOUD_FLAGS_SHARED_OUTER_IP 0x8000 + + __le32 tenant_id; + u8 reserved[4]; + __le16 queue_number; +#define I40E_AQC_ADD_CLOUD_QUEUE_SHIFT 0 +#define I40E_AQC_ADD_CLOUD_QUEUE_MASK (0x7FF << \ + I40E_AQC_ADD_CLOUD_QUEUE_SHIFT) + u8 reserved2[14]; + /* response section */ + u8 allocation_result; +#define I40E_AQC_ADD_CLOUD_FILTER_SUCCESS 0x0 +#define I40E_AQC_ADD_CLOUD_FILTER_FAIL 0xFF + u8 response_reserved[7]; +}; + +I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_cloud_filters_element_data); + +/* i40e_aqc_cloud_filters_element_bb is used when + * I40E_AQC_CLOUD_CMD_BB flag is set. 
+ */ +struct i40e_aqc_cloud_filters_element_bb { + struct i40e_aqc_cloud_filters_element_data element; + u16 general_fields[32]; +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD0 0 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD1 1 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X10_WORD2 2 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD0 3 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD1 4 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X11_WORD2 5 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD0 6 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD1 7 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X12_WORD2 8 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD0 9 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD1 10 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X13_WORD2 11 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD0 12 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD1 13 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X14_WORD2 14 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD0 15 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD1 16 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD2 17 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD3 18 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD4 19 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD5 20 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD6 21 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD7 22 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD0 23 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD1 24 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD2 25 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD3 26 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD4 27 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD5 28 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD6 29 +#define I40E_AQC_ADD_CLOUD_FV_FLU_0X17_WORD7 30 +}; + +I40E_CHECK_STRUCT_LEN(0x80, i40e_aqc_cloud_filters_element_bb); + +struct i40e_aqc_remove_cloud_filters_completion { + __le16 perfect_ovlan_used; + __le16 perfect_ovlan_free; + __le16 vlan_used; + __le16 vlan_free; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_cloud_filters_completion); + +/* Replace filter Command 0x025F + * uses the i40e_aqc_replace_cloud_filters, + * and the generic indirect completion structure + */ +struct i40e_filter_data { + u8 filter_type; + u8 input[3]; +}; + +I40E_CHECK_STRUCT_LEN(4, i40e_filter_data); + +struct i40e_aqc_replace_cloud_filters_cmd { + u8 valid_flags; +#define I40E_AQC_REPLACE_L1_FILTER 0x0 +#define I40E_AQC_REPLACE_CLOUD_FILTER 0x1 +#define I40E_AQC_GET_CLOUD_FILTERS 0x2 +#define I40E_AQC_MIRROR_CLOUD_FILTER 0x4 +#define I40E_AQC_HIGH_PRIORITY_CLOUD_FILTER 0x8 + u8 old_filter_type; + u8 new_filter_type; + u8 tr_bit; + u8 reserved[4]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_replace_cloud_filters_cmd); + +struct i40e_aqc_replace_cloud_filters_cmd_buf { + u8 data[32]; +/* Filter type INPUT codes*/ +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_ENTRIES_MAX 3 +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_VALIDATED BIT(7) + +/* Field Vector offsets */ +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_MAC_DA 0 +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_STAG_ETH 6 +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_STAG 7 +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_VLAN 8 +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_STAG_OVLAN 9 +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_STAG_IVLAN 10 +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_TUNNLE_KEY 11 +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_IMAC 12 +/* big FLU */ +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_IP_DA 14 +/* big FLU */ +#define I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_OIP_DA 15 + +#define 
I40E_AQC_REPLACE_CLOUD_CMD_INPUT_FV_INNER_VLAN 37 + struct i40e_filter_data filters[8]; +}; + +I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_replace_cloud_filters_cmd_buf); + +/* Add Mirror Rule (indirect or direct 0x0260) + * Delete Mirror Rule (indirect or direct 0x0261) + * note: some rule types (4,5) do not use an external buffer. + * take care to set the flags correctly. + */ +struct i40e_aqc_add_delete_mirror_rule { + __le16 seid; + __le16 rule_type; +#define I40E_AQC_MIRROR_RULE_TYPE_SHIFT 0 +#define I40E_AQC_MIRROR_RULE_TYPE_MASK (0x7 << \ + I40E_AQC_MIRROR_RULE_TYPE_SHIFT) +#define I40E_AQC_MIRROR_RULE_TYPE_VPORT_INGRESS 1 +#define I40E_AQC_MIRROR_RULE_TYPE_VPORT_EGRESS 2 +#define I40E_AQC_MIRROR_RULE_TYPE_VLAN 3 +#define I40E_AQC_MIRROR_RULE_TYPE_ALL_INGRESS 4 +#define I40E_AQC_MIRROR_RULE_TYPE_ALL_EGRESS 5 + __le16 num_entries; + __le16 destination; /* VSI for add, rule id for delete */ + __le32 addr_high; /* address of array of 2-byte VSI or VLAN ids */ + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_delete_mirror_rule); + +struct i40e_aqc_add_delete_mirror_rule_completion { + u8 reserved[2]; + __le16 rule_id; /* only used on add */ + __le16 mirror_rules_used; + __le16 mirror_rules_free; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_delete_mirror_rule_completion); + +/* Dynamic Device Personalization */ +struct i40e_aqc_write_personalization_profile { + u8 flags; + u8 reserved[3]; + __le32 profile_track_id; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_write_personalization_profile); + +struct i40e_aqc_write_ddp_resp { + __le32 error_offset; + __le32 error_info; + __le32 addr_high; + __le32 addr_low; +}; + +struct i40e_aqc_get_applied_profiles { + u8 flags; +#define I40E_AQC_GET_DDP_GET_CONF 0x1 +#define I40E_AQC_GET_DDP_GET_RDPU_CONF 0x2 + u8 rsv[3]; + __le32 reserved; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_applied_profiles); + +/* DCB 0x03xx*/ + +/* PFC Ignore (direct 0x0301) + * the command and response use the same descriptor structure + */ +struct i40e_aqc_pfc_ignore { + u8 tc_bitmap; + u8 command_flags; /* unused on response */ +#define I40E_AQC_PFC_IGNORE_SET 0x80 +#define I40E_AQC_PFC_IGNORE_CLEAR 0x0 + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_pfc_ignore); + +/* DCB Update (direct 0x0302) uses the i40e_aq_desc structure + * with no parameters + */ + +/* TX scheduler 0x04xx */ + +/* Almost all the indirect commands use + * this generic struct to pass the SEID in param0 + */ +struct i40e_aqc_tx_sched_ind { + __le16 vsi_seid; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_tx_sched_ind); + +/* Several commands respond with a set of queue set handles */ +struct i40e_aqc_qs_handles_resp { + __le16 qs_handles[8]; +}; + +/* Configure VSI BW limits (direct 0x0400) */ +struct i40e_aqc_configure_vsi_bw_limit { + __le16 vsi_seid; + u8 reserved[2]; + __le16 credit; + u8 reserved1[2]; + u8 max_credit; /* 0-3, limit = 2^max */ + u8 reserved2[7]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_configure_vsi_bw_limit); + +/* Configure VSI Bandwidth Limit per Traffic Type (indirect 0x0406) + * responds with i40e_aqc_qs_handles_resp + */ +struct i40e_aqc_configure_vsi_ets_sla_bw_data { + u8 tc_valid_bits; + u8 reserved[15]; + __le16 tc_bw_credits[8]; /* FW writesback QS handles here */ + + /* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */ + __le16 tc_bw_max[2]; + u8 reserved1[28]; +}; + +I40E_CHECK_STRUCT_LEN(0x40, 
i40e_aqc_configure_vsi_ets_sla_bw_data); + +/* Configure VSI Bandwidth Allocation per Traffic Type (indirect 0x0407) + * responds with i40e_aqc_qs_handles_resp + */ +struct i40e_aqc_configure_vsi_tc_bw_data { + u8 tc_valid_bits; + u8 reserved[3]; + u8 tc_bw_credits[8]; + u8 reserved1[4]; + __le16 qs_handles[8]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_configure_vsi_tc_bw_data); + +/* Query vsi bw configuration (indirect 0x0408) */ +struct i40e_aqc_query_vsi_bw_config_resp { + u8 tc_valid_bits; + u8 tc_suspended_bits; + u8 reserved[14]; + __le16 qs_handles[8]; + u8 reserved1[4]; + __le16 port_bw_limit; + u8 reserved2[2]; + u8 max_bw; /* 0-3, limit = 2^max */ + u8 reserved3[23]; +}; + +I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_query_vsi_bw_config_resp); + +/* Query VSI Bandwidth Allocation per Traffic Type (indirect 0x040A) */ +struct i40e_aqc_query_vsi_ets_sla_config_resp { + u8 tc_valid_bits; + u8 reserved[3]; + u8 share_credits[8]; + __le16 credits[8]; + + /* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */ + __le16 tc_bw_max[2]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_query_vsi_ets_sla_config_resp); + +/* Configure Switching Component Bandwidth Limit (direct 0x0410) */ +struct i40e_aqc_configure_switching_comp_bw_limit { + __le16 seid; + u8 reserved[2]; + __le16 credit; + u8 reserved1[2]; + u8 max_bw; /* 0-3, limit = 2^max */ + u8 reserved2[7]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_configure_switching_comp_bw_limit); + +/* Enable Physical Port ETS (indirect 0x0413) + * Modify Physical Port ETS (indirect 0x0414) + * Disable Physical Port ETS (indirect 0x0415) + */ +struct i40e_aqc_configure_switching_comp_ets_data { + u8 reserved[4]; + u8 tc_valid_bits; + u8 seepage; +#define I40E_AQ_ETS_SEEPAGE_EN_MASK 0x1 + u8 tc_strict_priority_flags; + u8 reserved1[17]; + u8 tc_bw_share_credits[8]; + u8 reserved2[96]; +}; + +I40E_CHECK_STRUCT_LEN(0x80, i40e_aqc_configure_switching_comp_ets_data); + +/* Configure Switching Component Bandwidth Limits per Tc (indirect 0x0416) */ +struct i40e_aqc_configure_switching_comp_ets_bw_limit_data { + u8 tc_valid_bits; + u8 reserved[15]; + __le16 tc_bw_credit[8]; + + /* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */ + __le16 tc_bw_max[2]; + u8 reserved1[28]; +}; + +I40E_CHECK_STRUCT_LEN(0x40, + i40e_aqc_configure_switching_comp_ets_bw_limit_data); + +/* Configure Switching Component Bandwidth Allocation per Tc + * (indirect 0x0417) + */ +struct i40e_aqc_configure_switching_comp_bw_config_data { + u8 tc_valid_bits; + u8 reserved[2]; + u8 absolute_credits; /* bool */ + u8 tc_bw_share_credits[8]; + u8 reserved1[20]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_configure_switching_comp_bw_config_data); + +/* Query Switching Component Configuration (indirect 0x0418) */ +struct i40e_aqc_query_switching_comp_ets_config_resp { + u8 tc_valid_bits; + u8 reserved[35]; + __le16 port_bw_limit; + u8 reserved1[2]; + u8 tc_bw_max; /* 0-3, limit = 2^max */ + u8 reserved2[23]; +}; + +I40E_CHECK_STRUCT_LEN(0x40, i40e_aqc_query_switching_comp_ets_config_resp); + +/* Query PhysicalPort ETS Configuration (indirect 0x0419) */ +struct i40e_aqc_query_port_ets_config_resp { + u8 reserved[4]; + u8 tc_valid_bits; + u8 reserved1; + u8 tc_strict_priority_bits; + u8 reserved2; + u8 tc_bw_share_credits[8]; + __le16 tc_bw_limits[8]; + + /* 4 bits per tc 0-7, 4th bit reserved, limit = 2^max */ + __le16 tc_bw_max[2]; + u8 reserved3[32]; +}; + +I40E_CHECK_STRUCT_LEN(0x44, i40e_aqc_query_port_ets_config_resp); + +/* Query Switching Component Bandwidth Allocation per Traffic Type + 
* (indirect 0x041A) + */ +struct i40e_aqc_query_switching_comp_bw_config_resp { + u8 tc_valid_bits; + u8 reserved[2]; + u8 absolute_credits_enable; /* bool */ + u8 tc_bw_share_credits[8]; + __le16 tc_bw_limits[8]; + + /* 4 bits per tc 0-7, 4th bit is reserved, limit = 2^max */ + __le16 tc_bw_max[2]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_query_switching_comp_bw_config_resp); + +/* Suspend/resume port TX traffic + * (direct 0x041B and 0x041C) uses the generic SEID struct + */ + +/* Configure partition BW + * (indirect 0x041D) + */ +struct i40e_aqc_configure_partition_bw_data { + __le16 pf_valid_bits; + u8 min_bw[16]; /* guaranteed bandwidth */ + u8 max_bw[16]; /* bandwidth limit */ +}; + +I40E_CHECK_STRUCT_LEN(0x22, i40e_aqc_configure_partition_bw_data); + +/* Get and set the active HMC resource profile and status. + * (direct 0x0500) and (direct 0x0501) + */ +struct i40e_aq_get_set_hmc_resource_profile { + u8 pm_profile; + u8 pe_vf_enabled; + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aq_get_set_hmc_resource_profile); + +enum i40e_aq_hmc_profile { + /* I40E_HMC_PROFILE_NO_CHANGE = 0, reserved */ + I40E_HMC_PROFILE_DEFAULT = 1, + I40E_HMC_PROFILE_FAVOR_VF = 2, + I40E_HMC_PROFILE_EQUAL = 3, +}; + +/* Get PHY Abilities (indirect 0x0600) uses the generic indirect struct */ + +/* set in param0 for get phy abilities to report qualified modules */ +#define I40E_AQ_PHY_REPORT_QUALIFIED_MODULES 0x0001 +#define I40E_AQ_PHY_REPORT_INITIAL_VALUES 0x0002 + +enum i40e_aq_phy_type { + I40E_PHY_TYPE_SGMII = 0x0, + I40E_PHY_TYPE_1000BASE_KX = 0x1, + I40E_PHY_TYPE_10GBASE_KX4 = 0x2, + I40E_PHY_TYPE_10GBASE_KR = 0x3, + I40E_PHY_TYPE_40GBASE_KR4 = 0x4, + I40E_PHY_TYPE_XAUI = 0x5, + I40E_PHY_TYPE_XFI = 0x6, + I40E_PHY_TYPE_SFI = 0x7, + I40E_PHY_TYPE_XLAUI = 0x8, + I40E_PHY_TYPE_XLPPI = 0x9, + I40E_PHY_TYPE_40GBASE_CR4_CU = 0xA, + I40E_PHY_TYPE_10GBASE_CR1_CU = 0xB, + I40E_PHY_TYPE_10GBASE_AOC = 0xC, + I40E_PHY_TYPE_40GBASE_AOC = 0xD, + I40E_PHY_TYPE_UNRECOGNIZED = 0xE, + I40E_PHY_TYPE_UNSUPPORTED = 0xF, + I40E_PHY_TYPE_100BASE_TX = 0x11, + I40E_PHY_TYPE_1000BASE_T = 0x12, + I40E_PHY_TYPE_10GBASE_T = 0x13, + I40E_PHY_TYPE_10GBASE_SR = 0x14, + I40E_PHY_TYPE_10GBASE_LR = 0x15, + I40E_PHY_TYPE_10GBASE_SFPP_CU = 0x16, + I40E_PHY_TYPE_10GBASE_CR1 = 0x17, + I40E_PHY_TYPE_40GBASE_CR4 = 0x18, + I40E_PHY_TYPE_40GBASE_SR4 = 0x19, + I40E_PHY_TYPE_40GBASE_LR4 = 0x1A, + I40E_PHY_TYPE_1000BASE_SX = 0x1B, + I40E_PHY_TYPE_1000BASE_LX = 0x1C, + I40E_PHY_TYPE_1000BASE_T_OPTICAL = 0x1D, + I40E_PHY_TYPE_20GBASE_KR2 = 0x1E, + I40E_PHY_TYPE_25GBASE_KR = 0x1F, + I40E_PHY_TYPE_25GBASE_CR = 0x20, + I40E_PHY_TYPE_25GBASE_SR = 0x21, + I40E_PHY_TYPE_25GBASE_LR = 0x22, + I40E_PHY_TYPE_25GBASE_AOC = 0x23, + I40E_PHY_TYPE_25GBASE_ACC = 0x24, + I40E_PHY_TYPE_MAX, + I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP = 0xFD, + I40E_PHY_TYPE_EMPTY = 0xFE, + I40E_PHY_TYPE_DEFAULT = 0xFF, +}; + +#define I40E_PHY_TYPES_BITMASK (BIT_ULL(I40E_PHY_TYPE_SGMII) | \ + BIT_ULL(I40E_PHY_TYPE_1000BASE_KX) | \ + BIT_ULL(I40E_PHY_TYPE_10GBASE_KX4) | \ + BIT_ULL(I40E_PHY_TYPE_10GBASE_KR) | \ + BIT_ULL(I40E_PHY_TYPE_40GBASE_KR4) | \ + BIT_ULL(I40E_PHY_TYPE_XAUI) | \ + BIT_ULL(I40E_PHY_TYPE_XFI) | \ + BIT_ULL(I40E_PHY_TYPE_SFI) | \ + BIT_ULL(I40E_PHY_TYPE_XLAUI) | \ + BIT_ULL(I40E_PHY_TYPE_XLPPI) | \ + BIT_ULL(I40E_PHY_TYPE_40GBASE_CR4_CU) | \ + BIT_ULL(I40E_PHY_TYPE_10GBASE_CR1_CU) | \ + BIT_ULL(I40E_PHY_TYPE_10GBASE_AOC) | \ + BIT_ULL(I40E_PHY_TYPE_40GBASE_AOC) | \ + BIT_ULL(I40E_PHY_TYPE_UNRECOGNIZED) | \ + BIT_ULL(I40E_PHY_TYPE_UNSUPPORTED) | \ + 
BIT_ULL(I40E_PHY_TYPE_100BASE_TX) | \ + BIT_ULL(I40E_PHY_TYPE_1000BASE_T) | \ + BIT_ULL(I40E_PHY_TYPE_10GBASE_T) | \ + BIT_ULL(I40E_PHY_TYPE_10GBASE_SR) | \ + BIT_ULL(I40E_PHY_TYPE_10GBASE_LR) | \ + BIT_ULL(I40E_PHY_TYPE_10GBASE_SFPP_CU) | \ + BIT_ULL(I40E_PHY_TYPE_10GBASE_CR1) | \ + BIT_ULL(I40E_PHY_TYPE_40GBASE_CR4) | \ + BIT_ULL(I40E_PHY_TYPE_40GBASE_SR4) | \ + BIT_ULL(I40E_PHY_TYPE_40GBASE_LR4) | \ + BIT_ULL(I40E_PHY_TYPE_1000BASE_SX) | \ + BIT_ULL(I40E_PHY_TYPE_1000BASE_LX) | \ + BIT_ULL(I40E_PHY_TYPE_1000BASE_T_OPTICAL) | \ + BIT_ULL(I40E_PHY_TYPE_20GBASE_KR2) | \ + BIT_ULL(I40E_PHY_TYPE_25GBASE_KR) | \ + BIT_ULL(I40E_PHY_TYPE_25GBASE_CR) | \ + BIT_ULL(I40E_PHY_TYPE_25GBASE_SR) | \ + BIT_ULL(I40E_PHY_TYPE_25GBASE_LR) | \ + BIT_ULL(I40E_PHY_TYPE_25GBASE_AOC) | \ + BIT_ULL(I40E_PHY_TYPE_25GBASE_ACC)) + +#define I40E_LINK_SPEED_100MB_SHIFT 0x1 +#define I40E_LINK_SPEED_1000MB_SHIFT 0x2 +#define I40E_LINK_SPEED_10GB_SHIFT 0x3 +#define I40E_LINK_SPEED_40GB_SHIFT 0x4 +#define I40E_LINK_SPEED_20GB_SHIFT 0x5 +#define I40E_LINK_SPEED_25GB_SHIFT 0x6 + +enum i40e_aq_link_speed { + I40E_LINK_SPEED_UNKNOWN = 0, + I40E_LINK_SPEED_100MB = BIT(I40E_LINK_SPEED_100MB_SHIFT), + I40E_LINK_SPEED_1GB = BIT(I40E_LINK_SPEED_1000MB_SHIFT), + I40E_LINK_SPEED_10GB = BIT(I40E_LINK_SPEED_10GB_SHIFT), + I40E_LINK_SPEED_40GB = BIT(I40E_LINK_SPEED_40GB_SHIFT), + I40E_LINK_SPEED_20GB = BIT(I40E_LINK_SPEED_20GB_SHIFT), + I40E_LINK_SPEED_25GB = BIT(I40E_LINK_SPEED_25GB_SHIFT), +}; + +struct i40e_aqc_module_desc { + u8 oui[3]; + u8 reserved1; + u8 part_number[16]; + u8 revision[4]; + u8 reserved2[8]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_module_desc); + +struct i40e_aq_get_phy_abilities_resp { + __le32 phy_type; /* bitmap using the above enum for offsets */ + u8 link_speed; /* bitmap using the above enum bit patterns */ + u8 abilities; +#define I40E_AQ_PHY_FLAG_PAUSE_TX 0x01 +#define I40E_AQ_PHY_FLAG_PAUSE_RX 0x02 +#define I40E_AQ_PHY_FLAG_LOW_POWER 0x04 +#define I40E_AQ_PHY_LINK_ENABLED 0x08 +#define I40E_AQ_PHY_AN_ENABLED 0x10 +#define I40E_AQ_PHY_FLAG_MODULE_QUAL 0x20 +#define I40E_AQ_PHY_FEC_ABILITY_KR 0x40 +#define I40E_AQ_PHY_FEC_ABILITY_RS 0x80 + __le16 eee_capability; +#define I40E_AQ_EEE_100BASE_TX 0x0002 +#define I40E_AQ_EEE_1000BASE_T 0x0004 +#define I40E_AQ_EEE_10GBASE_T 0x0008 +#define I40E_AQ_EEE_1000BASE_KX 0x0010 +#define I40E_AQ_EEE_10GBASE_KX4 0x0020 +#define I40E_AQ_EEE_10GBASE_KR 0x0040 + __le32 eeer_val; + u8 d3_lpan; +#define I40E_AQ_SET_PHY_D3_LPAN_ENA 0x01 + u8 phy_type_ext; +#define I40E_AQ_PHY_TYPE_EXT_25G_KR 0X01 +#define I40E_AQ_PHY_TYPE_EXT_25G_CR 0X02 +#define I40E_AQ_PHY_TYPE_EXT_25G_SR 0x04 +#define I40E_AQ_PHY_TYPE_EXT_25G_LR 0x08 +#define I40E_AQ_PHY_TYPE_EXT_25G_AOC 0x10 +#define I40E_AQ_PHY_TYPE_EXT_25G_ACC 0x20 + u8 fec_cfg_curr_mod_ext_info; +#define I40E_AQ_ENABLE_FEC_KR 0x01 +#define I40E_AQ_ENABLE_FEC_RS 0x02 +#define I40E_AQ_REQUEST_FEC_KR 0x04 +#define I40E_AQ_REQUEST_FEC_RS 0x08 +#define I40E_AQ_ENABLE_FEC_AUTO 0x10 +#define I40E_AQ_FEC +#define I40E_AQ_MODULE_TYPE_EXT_MASK 0xE0 +#define I40E_AQ_MODULE_TYPE_EXT_SHIFT 5 + + u8 ext_comp_code; + u8 phy_id[4]; + u8 module_type[3]; + u8 qualified_module_count; +#define I40E_AQ_PHY_MAX_QMS 16 + struct i40e_aqc_module_desc qualified_module[I40E_AQ_PHY_MAX_QMS]; +}; + +I40E_CHECK_STRUCT_LEN(0x218, i40e_aq_get_phy_abilities_resp); + +/* Set PHY Config (direct 0x0601) */ +struct i40e_aq_set_phy_config { /* same bits as above in all */ + __le32 phy_type; + u8 link_speed; + u8 abilities; +/* bits 0-2 use the values from 
get_phy_abilities_resp */ +#define I40E_AQ_PHY_ENABLE_LINK 0x08 +#define I40E_AQ_PHY_ENABLE_AN 0x10 +#define I40E_AQ_PHY_ENABLE_ATOMIC_LINK 0x20 + __le16 eee_capability; + __le32 eeer; + u8 low_power_ctrl; + u8 phy_type_ext; +#define I40E_AQ_PHY_TYPE_EXT_25G_KR 0X01 +#define I40E_AQ_PHY_TYPE_EXT_25G_CR 0X02 +#define I40E_AQ_PHY_TYPE_EXT_25G_SR 0x04 +#define I40E_AQ_PHY_TYPE_EXT_25G_LR 0x08 + u8 fec_config; +#define I40E_AQ_SET_FEC_ABILITY_KR BIT(0) +#define I40E_AQ_SET_FEC_ABILITY_RS BIT(1) +#define I40E_AQ_SET_FEC_REQUEST_KR BIT(2) +#define I40E_AQ_SET_FEC_REQUEST_RS BIT(3) +#define I40E_AQ_SET_FEC_AUTO BIT(4) +#define I40E_AQ_PHY_FEC_CONFIG_SHIFT 0x0 +#define I40E_AQ_PHY_FEC_CONFIG_MASK (0x1F << I40E_AQ_PHY_FEC_CONFIG_SHIFT) + u8 reserved; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aq_set_phy_config); + +/* Set MAC Config command data structure (direct 0x0603) */ +struct i40e_aq_set_mac_config { + __le16 max_frame_size; + u8 params; +#define I40E_AQ_SET_MAC_CONFIG_CRC_EN 0x04 +#define I40E_AQ_SET_MAC_CONFIG_PACING_MASK 0x78 +#define I40E_AQ_SET_MAC_CONFIG_PACING_SHIFT 3 +#define I40E_AQ_SET_MAC_CONFIG_PACING_NONE 0x0 +#define I40E_AQ_SET_MAC_CONFIG_PACING_1B_13TX 0xF +#define I40E_AQ_SET_MAC_CONFIG_PACING_1DW_9TX 0x9 +#define I40E_AQ_SET_MAC_CONFIG_PACING_1DW_4TX 0x8 +#define I40E_AQ_SET_MAC_CONFIG_PACING_3DW_7TX 0x7 +#define I40E_AQ_SET_MAC_CONFIG_PACING_2DW_3TX 0x6 +#define I40E_AQ_SET_MAC_CONFIG_PACING_1DW_1TX 0x5 +#define I40E_AQ_SET_MAC_CONFIG_PACING_3DW_2TX 0x4 +#define I40E_AQ_SET_MAC_CONFIG_PACING_7DW_3TX 0x3 +#define I40E_AQ_SET_MAC_CONFIG_PACING_4DW_1TX 0x2 +#define I40E_AQ_SET_MAC_CONFIG_PACING_9DW_1TX 0x1 + u8 tx_timer_priority; /* bitmap */ + __le16 tx_timer_value; + __le16 fc_refresh_threshold; + u8 reserved[8]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aq_set_mac_config); + +/* Restart Auto-Negotiation (direct 0x605) */ +struct i40e_aqc_set_link_restart_an { + u8 command; +#define I40E_AQ_PHY_RESTART_AN 0x02 +#define I40E_AQ_PHY_LINK_ENABLE 0x04 + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_link_restart_an); + +/* Get Link Status cmd & response data structure (direct 0x0607) */ +struct i40e_aqc_get_link_status { + __le16 command_flags; /* only field set on command */ +#define I40E_AQ_LSE_MASK 0x3 +#define I40E_AQ_LSE_NOP 0x0 +#define I40E_AQ_LSE_DISABLE 0x2 +#define I40E_AQ_LSE_ENABLE 0x3 +/* only response uses this flag */ +#define I40E_AQ_LSE_IS_ENABLED 0x1 + u8 phy_type; /* i40e_aq_phy_type */ + u8 link_speed; /* i40e_aq_link_speed */ + u8 link_info; +#define I40E_AQ_LINK_UP 0x01 /* obsolete */ +#define I40E_AQ_LINK_UP_FUNCTION 0x01 +#define I40E_AQ_LINK_FAULT 0x02 +#define I40E_AQ_LINK_FAULT_TX 0x04 +#define I40E_AQ_LINK_FAULT_RX 0x08 +#define I40E_AQ_LINK_FAULT_REMOTE 0x10 +#define I40E_AQ_LINK_UP_PORT 0x20 +#define I40E_AQ_MEDIA_AVAILABLE 0x40 +#define I40E_AQ_SIGNAL_DETECT 0x80 + u8 an_info; +#define I40E_AQ_AN_COMPLETED 0x01 +#define I40E_AQ_LP_AN_ABILITY 0x02 +#define I40E_AQ_PD_FAULT 0x04 +#define I40E_AQ_FEC_EN 0x08 +#define I40E_AQ_PHY_LOW_POWER 0x10 +#define I40E_AQ_LINK_PAUSE_TX 0x20 +#define I40E_AQ_LINK_PAUSE_RX 0x40 +#define I40E_AQ_QUALIFIED_MODULE 0x80 + u8 ext_info; +#define I40E_AQ_LINK_PHY_TEMP_ALARM 0x01 +#define I40E_AQ_LINK_XCESSIVE_ERRORS 0x02 +#define I40E_AQ_LINK_TX_SHIFT 0x02 +#define I40E_AQ_LINK_TX_MASK (0x03 << I40E_AQ_LINK_TX_SHIFT) +#define I40E_AQ_LINK_TX_ACTIVE 0x00 +#define I40E_AQ_LINK_TX_DRAINED 0x01 +#define I40E_AQ_LINK_TX_FLUSHED 0x03 +#define I40E_AQ_LINK_FORCED_40G 0x10 +/* 25G Error Codes */ +#define I40E_AQ_25G_NO_ERR 0X00 
+#define I40E_AQ_25G_NOT_PRESENT 0X01 +#define I40E_AQ_25G_NVM_CRC_ERR 0X02 +#define I40E_AQ_25G_SBUS_UCODE_ERR 0X03 +#define I40E_AQ_25G_SERDES_UCODE_ERR 0X04 +#define I40E_AQ_25G_NIMB_UCODE_ERR 0X05 + u8 loopback; /* use defines from i40e_aqc_set_lb_mode */ +/* Since firmware API 1.7 loopback field keeps power class info as well */ +#define I40E_AQ_LOOPBACK_MASK 0x07 +#define I40E_AQ_PWR_CLASS_SHIFT_LB 6 +#define I40E_AQ_PWR_CLASS_MASK_LB (0x03 << I40E_AQ_PWR_CLASS_SHIFT_LB) + __le16 max_frame_size; + u8 config; +#define I40E_AQ_CONFIG_FEC_KR_ENA 0x01 +#define I40E_AQ_CONFIG_FEC_RS_ENA 0x02 +#define I40E_AQ_CONFIG_CRC_ENA 0x04 +#define I40E_AQ_CONFIG_PACING_MASK 0x78 + union { + struct { + u8 power_desc; +#define I40E_AQ_LINK_POWER_CLASS_1 0x00 +#define I40E_AQ_LINK_POWER_CLASS_2 0x01 +#define I40E_AQ_LINK_POWER_CLASS_3 0x02 +#define I40E_AQ_LINK_POWER_CLASS_4 0x03 +#define I40E_AQ_PWR_CLASS_MASK 0x03 + u8 reserved[4]; + }; + struct { + u8 link_type[4]; + u8 link_type_ext; + }; + }; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_link_status); + +/* Set event mask command (direct 0x613) */ +struct i40e_aqc_set_phy_int_mask { + u8 reserved[8]; + __le16 event_mask; +#define I40E_AQ_EVENT_LINK_UPDOWN 0x0002 +#define I40E_AQ_EVENT_MEDIA_NA 0x0004 +#define I40E_AQ_EVENT_LINK_FAULT 0x0008 +#define I40E_AQ_EVENT_PHY_TEMP_ALARM 0x0010 +#define I40E_AQ_EVENT_EXCESSIVE_ERRORS 0x0020 +#define I40E_AQ_EVENT_SIGNAL_DETECT 0x0040 +#define I40E_AQ_EVENT_AN_COMPLETED 0x0080 +#define I40E_AQ_EVENT_MODULE_QUAL_FAIL 0x0100 +#define I40E_AQ_EVENT_PORT_TX_SUSPENDED 0x0200 + u8 reserved1[6]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_phy_int_mask); + +/* Get Local AN advt register (direct 0x0614) + * Set Local AN advt register (direct 0x0615) + * Get Link Partner AN advt register (direct 0x0616) + */ +struct i40e_aqc_an_advt_reg { + __le32 local_an_reg0; + __le16 local_an_reg1; + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_an_advt_reg); + +/* Set Loopback mode (0x0618) */ +struct i40e_aqc_set_lb_mode { + __le16 lb_mode; +#define I40E_AQ_LB_PHY_LOCAL 0x01 +#define I40E_AQ_LB_PHY_REMOTE 0x02 +#define I40E_AQ_LB_MAC_LOCAL 0x04 + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_lb_mode); + +/* Set PHY Debug command (0x0622) */ +struct i40e_aqc_set_phy_debug { + u8 command_flags; +#define I40E_AQ_PHY_DEBUG_RESET_INTERNAL 0x02 +#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_SHIFT 2 +#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_MASK (0x03 << \ + I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_SHIFT) +#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_NONE 0x00 +#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_HARD 0x01 +#define I40E_AQ_PHY_DEBUG_RESET_EXTERNAL_SOFT 0x02 +/* Disable link manageability on a single port */ +#define I40E_AQ_PHY_DEBUG_DISABLE_LINK_FW 0x10 +/* Disable link manageability on all ports */ +#define I40E_AQ_PHY_DEBUG_DISABLE_ALL_LINK_FW 0x20 + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_phy_debug); + +enum i40e_aq_phy_reg_type { + I40E_AQC_PHY_REG_INTERNAL = 0x1, + I40E_AQC_PHY_REG_EXERNAL_BASET = 0x2, + I40E_AQC_PHY_REG_EXERNAL_MODULE = 0x3 +}; + +/* Run PHY Activity (0x0626) */ +struct i40e_aqc_run_phy_activity { + __le16 activity_id; + u8 flags; + u8 reserved1; + __le32 control; + __le32 data; + u8 reserved2[4]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_run_phy_activity); + +/* Set PHY Register command (0x0628) */ +/* Get PHY Register command (0x0629) */ +struct i40e_aqc_phy_register_access { + u8 phy_interface; +#define I40E_AQ_PHY_REG_ACCESS_INTERNAL 0 +#define I40E_AQ_PHY_REG_ACCESS_EXTERNAL 1 +#define 
I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE 2 + u8 dev_address; + u8 reserved1[2]; + __le32 reg_address; + __le32 reg_value; + u8 reserved2[4]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_phy_register_access); + +/* NVM Read command (indirect 0x0701) + * NVM Erase commands (direct 0x0702) + * NVM Update commands (indirect 0x0703) + */ +struct i40e_aqc_nvm_update { + u8 command_flags; +#define I40E_AQ_NVM_LAST_CMD 0x01 +#define I40E_AQ_NVM_FLASH_ONLY 0x80 +#define I40E_AQ_NVM_PRESERVATION_FLAGS_SHIFT 1 +#define I40E_AQ_NVM_PRESERVATION_FLAGS_MASK 0x03 +#define I40E_AQ_NVM_PRESERVATION_FLAGS_SELECTED 0x03 +#define I40E_AQ_NVM_PRESERVATION_FLAGS_ALL 0x01 + u8 module_pointer; + __le16 length; + __le32 offset; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_nvm_update); + +/* NVM Config Read (indirect 0x0704) */ +struct i40e_aqc_nvm_config_read { + __le16 cmd_flags; +#define I40E_AQ_ANVM_SINGLE_OR_MULTIPLE_FEATURES_MASK 1 +#define I40E_AQ_ANVM_READ_SINGLE_FEATURE 0 +#define I40E_AQ_ANVM_READ_MULTIPLE_FEATURES 1 + __le16 element_count; + __le16 element_id; /* Feature/field ID */ + __le16 element_id_msw; /* MSWord of field ID */ + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_nvm_config_read); + +/* NVM Config Write (indirect 0x0705) */ +struct i40e_aqc_nvm_config_write { + __le16 cmd_flags; + __le16 element_count; + u8 reserved[4]; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_nvm_config_write); + +/* Used for 0x0704 as well as for 0x0705 commands */ +#define I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_SHIFT 1 +#define I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_MASK \ + BIT(I40E_AQ_ANVM_FEATURE_OR_IMMEDIATE_SHIFT) +#define I40E_AQ_ANVM_FEATURE 0 +#define I40E_AQ_ANVM_IMMEDIATE_FIELD BIT(FEATURE_OR_IMMEDIATE_SHIFT) +struct i40e_aqc_nvm_config_data_feature { + __le16 feature_id; +#define I40E_AQ_ANVM_FEATURE_OPTION_OEM_ONLY 0x01 +#define I40E_AQ_ANVM_FEATURE_OPTION_DWORD_MAP 0x08 +#define I40E_AQ_ANVM_FEATURE_OPTION_POR_CSR 0x10 + __le16 feature_options; + __le16 feature_selection; +}; + +I40E_CHECK_STRUCT_LEN(0x6, i40e_aqc_nvm_config_data_feature); + +struct i40e_aqc_nvm_config_data_immediate_field { + __le32 field_id; + __le32 field_value; + __le16 field_options; + __le16 reserved; +}; + +I40E_CHECK_STRUCT_LEN(0xc, i40e_aqc_nvm_config_data_immediate_field); + +/* OEM Post Update (indirect 0x0720) + * no command data struct used + */ +struct i40e_aqc_nvm_oem_post_update { +#define I40E_AQ_NVM_OEM_POST_UPDATE_EXTERNAL_DATA 0x01 + u8 sel_data; + u8 reserved[7]; +}; + +I40E_CHECK_STRUCT_LEN(0x8, i40e_aqc_nvm_oem_post_update); + +struct i40e_aqc_nvm_oem_post_update_buffer { + u8 str_len; + u8 dev_addr; + __le16 eeprom_addr; + u8 data[36]; +}; + +I40E_CHECK_STRUCT_LEN(0x28, i40e_aqc_nvm_oem_post_update_buffer); + +/* Thermal Sensor (indirect 0x0721) + * read or set thermal sensor configs and values + * takes a sensor and command specific data buffer, not detailed here + */ +struct i40e_aqc_thermal_sensor { + u8 sensor_action; +#define I40E_AQ_THERMAL_SENSOR_READ_CONFIG 0 +#define I40E_AQ_THERMAL_SENSOR_SET_CONFIG 1 +#define I40E_AQ_THERMAL_SENSOR_READ_TEMP 2 + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_thermal_sensor); + +/* Send to PF command (indirect 0x0801) id is only used by PF + * Send to VF command (indirect 0x0802) id is only used by PF + * Send to Peer PF command (indirect 0x0803) + */ +struct i40e_aqc_pf_vf_message { + __le32 id; + u8 reserved[4]; + __le32 addr_high; + 
__le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_pf_vf_message); + +/* Alternate structure */ + +/* Direct write (direct 0x0900) + * Direct read (direct 0x0902) + */ +struct i40e_aqc_alternate_write { + __le32 address0; + __le32 data0; + __le32 address1; + __le32 data1; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_write); + +/* Indirect write (indirect 0x0901) + * Indirect read (indirect 0x0903) + */ + +struct i40e_aqc_alternate_ind_write { + __le32 address; + __le32 length; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_ind_write); + +/* Done alternate write (direct 0x0904) + * uses i40e_aq_desc + */ +struct i40e_aqc_alternate_write_done { + __le16 cmd_flags; +#define I40E_AQ_ALTERNATE_MODE_BIOS_MASK 1 +#define I40E_AQ_ALTERNATE_MODE_BIOS_LEGACY 0 +#define I40E_AQ_ALTERNATE_MODE_BIOS_UEFI 1 +#define I40E_AQ_ALTERNATE_RESET_NEEDED 2 + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_write_done); + +/* Set OEM mode (direct 0x0905) */ +struct i40e_aqc_alternate_set_mode { + __le32 mode; +#define I40E_AQ_ALTERNATE_MODE_NONE 0 +#define I40E_AQ_ALTERNATE_MODE_OEM 1 + u8 reserved[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_alternate_set_mode); + +/* Clear port Alternate RAM (direct 0x0906) uses i40e_aq_desc */ + +/* async events 0x10xx */ + +/* Lan Queue Overflow Event (direct, 0x1001) */ +struct i40e_aqc_lan_overflow { + __le32 prtdcb_rupto; + __le32 otx_ctl; + u8 reserved[8]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lan_overflow); + +/* Get LLDP MIB (indirect 0x0A00) */ +struct i40e_aqc_lldp_get_mib { + u8 type; + u8 reserved1; +#define I40E_AQ_LLDP_MIB_TYPE_MASK 0x3 +#define I40E_AQ_LLDP_MIB_LOCAL 0x0 +#define I40E_AQ_LLDP_MIB_REMOTE 0x1 +#define I40E_AQ_LLDP_MIB_LOCAL_AND_REMOTE 0x2 +#define I40E_AQ_LLDP_BRIDGE_TYPE_MASK 0xC +#define I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT 0x2 +#define I40E_AQ_LLDP_BRIDGE_TYPE_NEAREST_BRIDGE 0x0 +#define I40E_AQ_LLDP_BRIDGE_TYPE_NON_TPMR 0x1 +#define I40E_AQ_LLDP_TX_SHIFT 0x4 +#define I40E_AQ_LLDP_TX_MASK (0x03 << I40E_AQ_LLDP_TX_SHIFT) +/* TX pause flags use I40E_AQ_LINK_TX_* above */ + __le16 local_len; + __le16 remote_len; + u8 reserved2[2]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_get_mib); + +/* Configure LLDP MIB Change Event (direct 0x0A01) + * also used for the event (with type in the command field) + */ +struct i40e_aqc_lldp_update_mib { + u8 command; +#define I40E_AQ_LLDP_MIB_UPDATE_ENABLE 0x0 +#define I40E_AQ_LLDP_MIB_UPDATE_DISABLE 0x1 + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_update_mib); + +/* Add LLDP TLV (indirect 0x0A02) + * Delete LLDP TLV (indirect 0x0A04) + */ +struct i40e_aqc_lldp_add_tlv { + u8 type; /* only nearest bridge and non-TPMR from 0x0A00 */ + u8 reserved1[1]; + __le16 len; + u8 reserved2[4]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_add_tlv); + +/* Update LLDP TLV (indirect 0x0A03) */ +struct i40e_aqc_lldp_update_tlv { + u8 type; /* only nearest bridge and non-TPMR from 0x0A00 */ + u8 reserved; + __le16 old_len; + __le16 new_offset; + __le16 new_len; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_update_tlv); + +/* Stop LLDP (direct 0x0A05) */ +struct i40e_aqc_lldp_stop { + u8 command; +#define I40E_AQ_LLDP_AGENT_STOP 0x0 +#define I40E_AQ_LLDP_AGENT_SHUTDOWN 0x1 + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_stop); + +/* Start LLDP (direct 0x0A06) */ + +struct i40e_aqc_lldp_start { + u8 
command; +#define I40E_AQ_LLDP_AGENT_START 0x1 + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_start); + +/* Set DCB (direct 0x0303) */ +struct i40e_aqc_set_dcb_parameters { + u8 command; +#define I40E_AQ_DCB_SET_AGENT 0x1 +#define I40E_DCB_VALID 0x1 + u8 valid_flags; + u8 reserved[14]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_set_dcb_parameters); + +/* Get CEE DCBX Oper Config (0x0A07) + * uses the generic descriptor struct + * returns below as indirect response + */ + +#define I40E_AQC_CEE_APP_FCOE_SHIFT 0x0 +#define I40E_AQC_CEE_APP_FCOE_MASK (0x7 << I40E_AQC_CEE_APP_FCOE_SHIFT) +#define I40E_AQC_CEE_APP_ISCSI_SHIFT 0x3 +#define I40E_AQC_CEE_APP_ISCSI_MASK (0x7 << I40E_AQC_CEE_APP_ISCSI_SHIFT) +#define I40E_AQC_CEE_APP_FIP_SHIFT 0x8 +#define I40E_AQC_CEE_APP_FIP_MASK (0x7 << I40E_AQC_CEE_APP_FIP_SHIFT) + +#define I40E_AQC_CEE_PG_STATUS_SHIFT 0x0 +#define I40E_AQC_CEE_PG_STATUS_MASK (0x7 << I40E_AQC_CEE_PG_STATUS_SHIFT) +#define I40E_AQC_CEE_PFC_STATUS_SHIFT 0x3 +#define I40E_AQC_CEE_PFC_STATUS_MASK (0x7 << I40E_AQC_CEE_PFC_STATUS_SHIFT) +#define I40E_AQC_CEE_APP_STATUS_SHIFT 0x8 +#define I40E_AQC_CEE_APP_STATUS_MASK (0x7 << I40E_AQC_CEE_APP_STATUS_SHIFT) +#define I40E_AQC_CEE_FCOE_STATUS_SHIFT 0x8 +#define I40E_AQC_CEE_FCOE_STATUS_MASK (0x7 << I40E_AQC_CEE_FCOE_STATUS_SHIFT) +#define I40E_AQC_CEE_ISCSI_STATUS_SHIFT 0xB +#define I40E_AQC_CEE_ISCSI_STATUS_MASK (0x7 << I40E_AQC_CEE_ISCSI_STATUS_SHIFT) +#define I40E_AQC_CEE_FIP_STATUS_SHIFT 0x10 +#define I40E_AQC_CEE_FIP_STATUS_MASK (0x7 << I40E_AQC_CEE_FIP_STATUS_SHIFT) + +/* struct i40e_aqc_get_cee_dcb_cfg_v1_resp was originally defined with + * word boundary layout issues, which the Linux compilers silently deal + * with by adding padding, making the actual struct larger than designed. + * However, the FW compiler for the NIC is less lenient and complains + * about the struct. Hence, the struct defined here has an extra byte in + * fields reserved3 and reserved4 to directly acknowledge that padding, + * and the new length is used in the length check macro. + */ +struct i40e_aqc_get_cee_dcb_cfg_v1_resp { + u8 reserved1; + u8 oper_num_tc; + u8 oper_prio_tc[4]; + u8 reserved2; + u8 oper_tc_bw[8]; + u8 oper_pfc_en; + u8 reserved3[2]; + __le16 oper_app_prio; + u8 reserved4[2]; + __le16 tlv_status; +}; + +I40E_CHECK_STRUCT_LEN(0x18, i40e_aqc_get_cee_dcb_cfg_v1_resp); + +struct i40e_aqc_get_cee_dcb_cfg_resp { + u8 oper_num_tc; + u8 oper_prio_tc[4]; + u8 oper_tc_bw[8]; + u8 oper_pfc_en; + __le16 oper_app_prio; +#define I40E_AQC_CEE_APP_FCOE_SHIFT 0x0 +#define I40E_AQC_CEE_APP_FCOE_MASK (0x7 << I40E_AQC_CEE_APP_FCOE_SHIFT) +#define I40E_AQC_CEE_APP_ISCSI_SHIFT 0x3 +#define I40E_AQC_CEE_APP_ISCSI_MASK (0x7 << I40E_AQC_CEE_APP_ISCSI_SHIFT) +#define I40E_AQC_CEE_APP_FIP_SHIFT 0x8 +#define I40E_AQC_CEE_APP_FIP_MASK (0x7 << I40E_AQC_CEE_APP_FIP_SHIFT) +#define I40E_AQC_CEE_APP_FIP_MASK (0x7 << I40E_AQC_CEE_APP_FIP_SHIFT) + __le32 tlv_status; +#define I40E_AQC_CEE_PG_STATUS_SHIFT 0x0 +#define I40E_AQC_CEE_PG_STATUS_MASK (0x7 << I40E_AQC_CEE_PG_STATUS_SHIFT) +#define I40E_AQC_CEE_PFC_STATUS_SHIFT 0x3 +#define I40E_AQC_CEE_PFC_STATUS_MASK (0x7 << I40E_AQC_CEE_PFC_STATUS_SHIFT) +#define I40E_AQC_CEE_APP_STATUS_SHIFT 0x8 +#define I40E_AQC_CEE_APP_STATUS_MASK (0x7 << I40E_AQC_CEE_APP_STATUS_SHIFT) + u8 reserved[12]; +}; + +I40E_CHECK_STRUCT_LEN(0x20, i40e_aqc_get_cee_dcb_cfg_resp); + +/* Set Local LLDP MIB (indirect 0x0A08) + * Used to replace the local MIB of a given LLDP agent. e.g. 
DCBx + */ +struct i40e_aqc_lldp_set_local_mib { +#define SET_LOCAL_MIB_AC_TYPE_DCBX_SHIFT 0 +#define SET_LOCAL_MIB_AC_TYPE_DCBX_MASK BIT(SET_LOCAL_MIB_AC_TYPE_DCBX_SHIFT) +#define SET_LOCAL_MIB_AC_TYPE_LOCAL_MIB 0x0 +#define SET_LOCAL_MIB_AC_TYPE_NON_WILLING_APPS_SHIFT (1) +#define SET_LOCAL_MIB_AC_TYPE_NON_WILLING_APPS_MASK \ + BIT(SET_LOCAL_MIB_AC_TYPE_NON_WILLING_APPS_SHIFT) +#define SET_LOCAL_MIB_AC_TYPE_NON_WILLING_APPS 0x1 + u8 type; + u8 reserved0; + __le16 length; + u8 reserved1[4]; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_set_local_mib); + +/* Stop/Start LLDP Agent (direct 0x0A09) + * Used for stopping/starting specific LLDP agent. e.g. DCBx + */ +struct i40e_aqc_lldp_stop_start_specific_agent { +#define I40E_AQC_START_SPECIFIC_AGENT_SHIFT 0 +#define I40E_AQC_START_SPECIFIC_AGENT_MASK \ + BIT(I40E_AQC_START_SPECIFIC_AGENT_SHIFT) + u8 command; + u8 reserved[15]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_stop_start_specific_agent); + +/* Add Udp Tunnel command and completion (direct 0x0B00) */ +struct i40e_aqc_add_udp_tunnel { + __le16 udp_port; + u8 reserved0[3]; + u8 protocol_type; +#define I40E_AQC_TUNNEL_TYPE_VXLAN 0x00 +#define I40E_AQC_TUNNEL_TYPE_NGE 0x01 +#define I40E_AQC_TUNNEL_TYPE_TEREDO 0x10 +#define I40E_AQC_TUNNEL_TYPE_VXLAN_GPE 0x11 + u8 reserved1[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_udp_tunnel); + +struct i40e_aqc_add_udp_tunnel_completion { + __le16 udp_port; + u8 filter_entry_index; + u8 multiple_pfs; +#define I40E_AQC_SINGLE_PF 0x0 +#define I40E_AQC_MULTIPLE_PFS 0x1 + u8 total_filters; + u8 reserved[11]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_add_udp_tunnel_completion); + +/* remove UDP Tunnel command (0x0B01) */ +struct i40e_aqc_remove_udp_tunnel { + u8 reserved[2]; + u8 index; /* 0 to 15 */ + u8 reserved2[13]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_remove_udp_tunnel); + +struct i40e_aqc_del_udp_tunnel_completion { + __le16 udp_port; + u8 index; /* 0 to 15 */ + u8 multiple_pfs; + u8 total_filters_used; + u8 reserved1[11]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_del_udp_tunnel_completion); + +struct i40e_aqc_get_set_rss_key { +#define I40E_AQC_SET_RSS_KEY_VSI_VALID BIT(15) +#define I40E_AQC_SET_RSS_KEY_VSI_ID_SHIFT 0 +#define I40E_AQC_SET_RSS_KEY_VSI_ID_MASK (0x3FF << \ + I40E_AQC_SET_RSS_KEY_VSI_ID_SHIFT) + __le16 vsi_id; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_set_rss_key); + +struct i40e_aqc_get_set_rss_key_data { + u8 standard_rss_key[0x28]; + u8 extended_hash_key[0xc]; +}; + +I40E_CHECK_STRUCT_LEN(0x34, i40e_aqc_get_set_rss_key_data); + +struct i40e_aqc_get_set_rss_lut { +#define I40E_AQC_SET_RSS_LUT_VSI_VALID BIT(15) +#define I40E_AQC_SET_RSS_LUT_VSI_ID_SHIFT 0 +#define I40E_AQC_SET_RSS_LUT_VSI_ID_MASK (0x3FF << \ + I40E_AQC_SET_RSS_LUT_VSI_ID_SHIFT) + __le16 vsi_id; +#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT 0 +#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_MASK BIT(I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT) + +#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_VSI 0 +#define I40E_AQC_SET_RSS_LUT_TABLE_TYPE_PF 1 + __le16 flags; + u8 reserved[4]; + __le32 addr_high; + __le32 addr_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_get_set_rss_lut); + +/* tunnel key structure 0x0B10 */ + +struct i40e_aqc_tunnel_key_structure { + u8 key1_off; + u8 key2_off; + u8 key1_len; /* 0 to 15 */ + u8 key2_len; /* 0 to 15 */ + u8 flags; +#define I40E_AQC_TUNNEL_KEY_STRUCT_OVERRIDE 0x01 +/* response flags */ +#define I40E_AQC_TUNNEL_KEY_STRUCT_SUCCESS 0x01 +#define 
I40E_AQC_TUNNEL_KEY_STRUCT_MODIFIED 0x02 +#define I40E_AQC_TUNNEL_KEY_STRUCT_OVERRIDDEN 0x03 + u8 network_key_index; +#define I40E_AQC_NETWORK_KEY_INDEX_VXLAN 0x0 +#define I40E_AQC_NETWORK_KEY_INDEX_NGE 0x1 +#define I40E_AQC_NETWORK_KEY_INDEX_FLEX_MAC_IN_UDP 0x2 +#define I40E_AQC_NETWORK_KEY_INDEX_GRE 0x3 + u8 reserved[10]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_tunnel_key_structure); + +/* OEM mode commands (direct 0xFE0x) */ +struct i40e_aqc_oem_param_change { + __le32 param_type; +#define I40E_AQ_OEM_PARAM_TYPE_PF_CTL 0 +#define I40E_AQ_OEM_PARAM_TYPE_BW_CTL 1 +#define I40E_AQ_OEM_PARAM_MAC 2 + __le32 param_value1; + __le16 param_value2; + u8 reserved[6]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_oem_param_change); + +struct i40e_aqc_oem_state_change { + __le32 state; +#define I40E_AQ_OEM_STATE_LINK_DOWN 0x0 +#define I40E_AQ_OEM_STATE_LINK_UP 0x1 + u8 reserved[12]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_oem_state_change); + +/* Initialize OCSD (0xFE02, direct) */ +struct i40e_aqc_opc_oem_ocsd_initialize { + u8 type_status; + u8 reserved1[3]; + __le32 ocsd_memory_block_addr_high; + __le32 ocsd_memory_block_addr_low; + __le32 requested_update_interval; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_opc_oem_ocsd_initialize); + +/* Initialize OCBB (0xFE03, direct) */ +struct i40e_aqc_opc_oem_ocbb_initialize { + u8 type_status; + u8 reserved1[3]; + __le32 ocbb_memory_block_addr_high; + __le32 ocbb_memory_block_addr_low; + u8 reserved2[4]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_opc_oem_ocbb_initialize); + +/* debug commands */ + +/* get device id (0xFF00) uses the generic structure */ + +/* set test more (0xFF01, internal) */ + +struct i40e_acq_set_test_mode { + u8 mode; +#define I40E_AQ_TEST_PARTIAL 0 +#define I40E_AQ_TEST_FULL 1 +#define I40E_AQ_TEST_NVM 2 + u8 reserved[3]; + u8 command; +#define I40E_AQ_TEST_OPEN 0 +#define I40E_AQ_TEST_CLOSE 1 +#define I40E_AQ_TEST_INC 2 + u8 reserved2[3]; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_acq_set_test_mode); + +/* Debug Read Register command (0xFF03) + * Debug Write Register command (0xFF04) + */ +struct i40e_aqc_debug_reg_read_write { + __le32 reserved; + __le32 address; + __le32 value_high; + __le32 value_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_reg_read_write); + +/* Scatter/gather Reg Read (indirect 0xFF05) + * Scatter/gather Reg Write (indirect 0xFF06) + */ + +/* i40e_aq_desc is used for the command */ +struct i40e_aqc_debug_reg_sg_element_data { + __le32 address; + __le32 value; +}; + +/* Debug Modify register (direct 0xFF07) */ +struct i40e_aqc_debug_modify_reg { + __le32 address; + __le32 value; + __le32 clear_mask; + __le32 set_mask; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_modify_reg); + +/* dump internal data (0xFF08, indirect) */ + +#define I40E_AQ_CLUSTER_ID_AUX 0 +#define I40E_AQ_CLUSTER_ID_SWITCH_FLU 1 +#define I40E_AQ_CLUSTER_ID_TXSCHED 2 +#define I40E_AQ_CLUSTER_ID_HMC 3 +#define I40E_AQ_CLUSTER_ID_MAC0 4 +#define I40E_AQ_CLUSTER_ID_MAC1 5 +#define I40E_AQ_CLUSTER_ID_MAC2 6 +#define I40E_AQ_CLUSTER_ID_MAC3 7 +#define I40E_AQ_CLUSTER_ID_DCB 8 +#define I40E_AQ_CLUSTER_ID_EMP_MEM 9 +#define I40E_AQ_CLUSTER_ID_PKT_BUF 10 +#define I40E_AQ_CLUSTER_ID_ALTRAM 11 + +struct i40e_aqc_debug_dump_internals { + u8 cluster_id; + u8 table_id; + __le16 data_size; + __le32 idx; + __le32 address_high; + __le32 address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_dump_internals); + +struct i40e_aqc_debug_modify_internals { + u8 cluster_id; + u8 cluster_specific_params[7]; + __le32 address_high; + __le32 
address_low; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_debug_modify_internals); + +#endif /* _I40E_ADMINQ_CMD_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_alloc.h b/drivers/net/ethernet/intel/i40e/i40e_alloc.h new file mode 100644 index 0000000..abed0c5 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_alloc.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_ALLOC_H_ +#define _I40E_ALLOC_H_ + +struct i40e_hw; + +/* Memory allocation types */ +enum i40e_memory_type { + i40e_mem_arq_buf = 0, /* ARQ indirect command buffer */ + i40e_mem_asq_buf = 1, + i40e_mem_atq_buf = 2, /* ATQ indirect command buffer */ + i40e_mem_arq_ring = 3, /* ARQ descriptor ring */ + i40e_mem_atq_ring = 4, /* ATQ descriptor ring */ + i40e_mem_pd = 5, /* Page Descriptor */ + i40e_mem_bp = 6, /* Backing Page - 4KB */ + i40e_mem_bp_jumbo = 7, /* Backing Page - > 4KB */ + i40e_mem_reserved +}; + +/* prototype for functions used for dynamic memory allocation */ +i40e_status i40e_allocate_dma_mem(struct i40e_hw *hw, + struct i40e_dma_mem *mem, + enum i40e_memory_type type, + u64 size, u32 alignment); +i40e_status i40e_free_dma_mem(struct i40e_hw *hw, + struct i40e_dma_mem *mem); +i40e_status i40e_allocate_virt_mem(struct i40e_hw *hw, + struct i40e_virt_mem *mem, + u32 size); +i40e_status i40e_free_virt_mem(struct i40e_hw *hw, + struct i40e_virt_mem *mem); + +#endif /* _I40E_ALLOC_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c b/drivers/net/ethernet/intel/i40e/i40e_client.c new file mode 100644 index 0000000..d8ce499 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_client.c @@ -0,0 +1,839 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . 
+ * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include +#include + +#include "i40e.h" +#include "i40e_prototype.h" +#include "i40e_client.h" + +static const char i40e_client_interface_version_str[] = I40E_CLIENT_VERSION_STR; +static struct i40e_client *registered_client; +static LIST_HEAD(i40e_devices); +static DEFINE_MUTEX(i40e_device_mutex); + +static int i40e_client_virtchnl_send(struct i40e_info *ldev, + struct i40e_client *client, + u32 vf_id, u8 *msg, u16 len); + +static int i40e_client_setup_qvlist(struct i40e_info *ldev, + struct i40e_client *client, + struct i40e_qvlist_info *qvlist_info); + +static void i40e_client_request_reset(struct i40e_info *ldev, + struct i40e_client *client, + u32 reset_level); + +static int i40e_client_update_vsi_ctxt(struct i40e_info *ldev, + struct i40e_client *client, + bool is_vf, u32 vf_id, + u32 flag, u32 valid_flag); + +static struct i40e_ops i40e_lan_ops = { + .virtchnl_send = i40e_client_virtchnl_send, + .setup_qvlist = i40e_client_setup_qvlist, + .request_reset = i40e_client_request_reset, + .update_vsi_ctxt = i40e_client_update_vsi_ctxt, +}; + +/** + * i40e_client_get_params - Get the params that can change at runtime + * @vsi: the VSI with the message + * @param: clinet param struct + * + **/ +static +int i40e_client_get_params(struct i40e_vsi *vsi, struct i40e_params *params) +{ + struct i40e_dcbx_config *dcb_cfg = &vsi->back->hw.local_dcbx_config; + int i = 0; + + for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) { + u8 tc = dcb_cfg->etscfg.prioritytable[i]; + u16 qs_handle; + + /* If TC is not enabled for VSI use TC0 for UP */ + if (!(vsi->tc_config.enabled_tc & BIT(tc))) + tc = 0; + + qs_handle = le16_to_cpu(vsi->info.qs_handle[tc]); + params->qos.prio_qos[i].tc = tc; + params->qos.prio_qos[i].qs_handle = qs_handle; + if (qs_handle == I40E_AQ_VSI_QS_HANDLE_INVALID) { + dev_err(&vsi->back->pdev->dev, "Invalid queue set handle for TC = %d, vsi id = %d\n", + tc, vsi->id); + return -EINVAL; + } + } + + params->mtu = vsi->netdev->mtu; + return 0; +} + +/** + * i40e_notify_client_of_vf_msg - call the client vf message callback + * @vsi: the VSI with the message + * @vf_id: the absolute VF id that sent the message + * @msg: message buffer + * @len: length of the message + * + * If there is a client to this VSI, call the client + **/ +void +i40e_notify_client_of_vf_msg(struct i40e_vsi *vsi, u32 vf_id, u8 *msg, u16 len) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_client_instance *cdev = pf->cinst; + + if (!cdev || !cdev->client) + return; + if (!cdev->client->ops || !cdev->client->ops->virtchnl_receive) { + dev_dbg(&pf->pdev->dev, + "Cannot locate client instance virtual channel receive routine\n"); + return; + } + if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) { + dev_dbg(&pf->pdev->dev, "Client is not open, abort virtchnl_receive\n"); + return; + } + cdev->client->ops->virtchnl_receive(&cdev->lan_info, cdev->client, + vf_id, msg, len); +} + +/** + * i40e_notify_client_of_l2_param_changes - call the client notify callback + * @vsi: the VSI with l2 param changes + * + * If there is a client to this VSI, call the client + **/ +void i40e_notify_client_of_l2_param_changes(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + struct 
i40e_client_instance *cdev = pf->cinst; + struct i40e_params params; + + if (!cdev || !cdev->client) + return; + if (!cdev->client->ops || !cdev->client->ops->l2_param_change) { + dev_dbg(&vsi->back->pdev->dev, + "Cannot locate client instance l2_param_change routine\n"); + return; + } + if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) { + dev_dbg(&vsi->back->pdev->dev, "Client is not open, abort l2 param change\n"); + return; + } + memset(¶ms, 0, sizeof(params)); + i40e_client_get_params(vsi, ¶ms); + memcpy(&cdev->lan_info.params, ¶ms, sizeof(struct i40e_params)); + cdev->client->ops->l2_param_change(&cdev->lan_info, cdev->client, + ¶ms); +} + +/** + * i40e_client_release_qvlist - release MSI-X vector mapping for client + * @ldev: pointer to L2 context. + * + **/ +static void i40e_client_release_qvlist(struct i40e_info *ldev) +{ + struct i40e_qvlist_info *qvlist_info = ldev->qvlist_info; + u32 i; + + if (!ldev->qvlist_info) + return; + + for (i = 0; i < qvlist_info->num_vectors; i++) { + struct i40e_pf *pf = ldev->pf; + struct i40e_qv_info *qv_info; + u32 reg_idx; + + qv_info = &qvlist_info->qv_info[i]; + if (!qv_info) + continue; + reg_idx = I40E_PFINT_LNKLSTN(qv_info->v_idx - 1); + wr32(&pf->hw, reg_idx, I40E_PFINT_LNKLSTN_FIRSTQ_INDX_MASK); + } + kfree(ldev->qvlist_info); + ldev->qvlist_info = NULL; +} + +/** + * i40e_notify_client_of_netdev_close - call the client close callback + * @vsi: the VSI with netdev closed + * @reset: true when close called due to a reset pending + * + * If there is a client to this netdev, call the client with close + **/ +void i40e_notify_client_of_netdev_close(struct i40e_vsi *vsi, bool reset) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_client_instance *cdev = pf->cinst; + + if (!cdev || !cdev->client) + return; + if (!cdev->client->ops || !cdev->client->ops->close) { + dev_dbg(&vsi->back->pdev->dev, + "Cannot locate client instance close routine\n"); + return; + } + cdev->client->ops->close(&cdev->lan_info, cdev->client, reset); + clear_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state); + i40e_client_release_qvlist(&cdev->lan_info); +} + +/** + * i40e_notify_client_of_vf_reset - call the client vf reset callback + * @pf: PF device pointer + * @vf_id: asolute id of VF being reset + * + * If there is a client attached to this PF, notify when a VF is reset + **/ +void i40e_notify_client_of_vf_reset(struct i40e_pf *pf, u32 vf_id) +{ + struct i40e_client_instance *cdev = pf->cinst; + + if (!cdev || !cdev->client) + return; + if (!cdev->client->ops || !cdev->client->ops->vf_reset) { + dev_dbg(&pf->pdev->dev, + "Cannot locate client instance VF reset routine\n"); + return; + } + if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) { + dev_dbg(&pf->pdev->dev, "Client is not open, abort vf-reset\n"); + return; + } + cdev->client->ops->vf_reset(&cdev->lan_info, cdev->client, vf_id); +} + +/** + * i40e_notify_client_of_vf_enable - call the client vf notification callback + * @pf: PF device pointer + * @num_vfs: the number of VFs currently enabled, 0 for disable + * + * If there is a client attached to this PF, call its VF notification routine + **/ +void i40e_notify_client_of_vf_enable(struct i40e_pf *pf, u32 num_vfs) +{ + struct i40e_client_instance *cdev = pf->cinst; + + if (!cdev || !cdev->client) + return; + if (!cdev->client->ops || !cdev->client->ops->vf_enable) { + dev_dbg(&pf->pdev->dev, + "Cannot locate client instance VF enable routine\n"); + return; + } + if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED, + &cdev->state)) { + 
dev_dbg(&pf->pdev->dev, "Client is not open, abort vf-enable\n"); + return; + } + cdev->client->ops->vf_enable(&cdev->lan_info, cdev->client, num_vfs); +} + +/** + * i40e_vf_client_capable - ask the client if it likes the specified VF + * @pf: PF device pointer + * @vf_id: the VF in question + * + * If there is a client of the specified type attached to this PF, call + * its vf_capable routine + **/ +int i40e_vf_client_capable(struct i40e_pf *pf, u32 vf_id) +{ + struct i40e_client_instance *cdev = pf->cinst; + int capable = false; + + if (!cdev || !cdev->client) + goto out; + if (!cdev->client->ops || !cdev->client->ops->vf_capable) { + dev_dbg(&pf->pdev->dev, + "Cannot locate client instance VF capability routine\n"); + goto out; + } + if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) + goto out; + + capable = cdev->client->ops->vf_capable(&cdev->lan_info, + cdev->client, + vf_id); +out: + return capable; +} + +void i40e_client_update_msix_info(struct i40e_pf *pf) +{ + struct i40e_client_instance *cdev = pf->cinst; + + if (!cdev || !cdev->client) + return; + + cdev->lan_info.msix_count = pf->num_iwarp_msix; + cdev->lan_info.msix_entries = &pf->msix_entries[pf->iwarp_base_vector]; +} + +/** + * i40e_client_add_instance - add a client instance struct to the instance list + * @pf: pointer to the board struct + * @client: pointer to a client struct in the client list. + * @existing: if there was already an existing instance + * + **/ +static void i40e_client_add_instance(struct i40e_pf *pf) +{ + struct i40e_client_instance *cdev = NULL; + struct netdev_hw_addr *mac = NULL; + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + + if (!registered_client || pf->cinst) + return; + + cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); + if (!cdev) + return; + + cdev->lan_info.pf = (void *)pf; + cdev->lan_info.netdev = vsi->netdev; + cdev->lan_info.pcidev = pf->pdev; + cdev->lan_info.fid = pf->hw.pf_id; + cdev->lan_info.ftype = I40E_CLIENT_FTYPE_PF; + cdev->lan_info.hw_addr = pf->hw.hw_addr; + cdev->lan_info.ops = &i40e_lan_ops; + cdev->lan_info.version.major = I40E_CLIENT_VERSION_MAJOR; + cdev->lan_info.version.minor = I40E_CLIENT_VERSION_MINOR; + cdev->lan_info.version.build = I40E_CLIENT_VERSION_BUILD; + cdev->lan_info.fw_maj_ver = pf->hw.aq.fw_maj_ver; + cdev->lan_info.fw_min_ver = pf->hw.aq.fw_min_ver; + cdev->lan_info.fw_build = pf->hw.aq.fw_build; + set_bit(__I40E_CLIENT_INSTANCE_NONE, &cdev->state); + + if (i40e_client_get_params(vsi, &cdev->lan_info.params)) { + kfree(cdev); + cdev = NULL; + return; + } + + mac = list_first_entry(&cdev->lan_info.netdev->dev_addrs.list, + struct netdev_hw_addr, list); + if (mac) + ether_addr_copy(cdev->lan_info.lanmac, mac->addr); + else + dev_err(&pf->pdev->dev, "MAC address list is empty!\n"); + + cdev->client = registered_client; + pf->cinst = cdev; + + i40e_client_update_msix_info(pf); +} + +/** + * i40e_client_del_instance - removes a client instance from the list + * @pf: pointer to the board struct + * + **/ +static +void i40e_client_del_instance(struct i40e_pf *pf) +{ + kfree(pf->cinst); + pf->cinst = NULL; +} + +/** + * i40e_client_subtask - client maintenance work + * @pf: board private structure + **/ +void i40e_client_subtask(struct i40e_pf *pf) +{ + struct i40e_client *client = registered_client; + struct i40e_client_instance *cdev; + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + int ret = 0; + + if (!test_and_clear_bit(__I40E_CLIENT_SERVICE_REQUESTED, pf->state)) + return; + cdev = pf->cinst; + + /* If we're down or resetting, just bail */ + if 
(test_bit(__I40E_DOWN, pf->state) || + test_bit(__I40E_CONFIG_BUSY, pf->state)) + return; + + if (!client || !cdev) + return; + + /* Here we handle client opens. If the client is down, and + * the netdev is registered, then open the client. + */ + if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) { + if (vsi->netdev_registered && + client->ops && client->ops->open) { + set_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state); + ret = client->ops->open(&cdev->lan_info, client); + if (ret) { + /* Remove failed client instance */ + clear_bit(__I40E_CLIENT_INSTANCE_OPENED, + &cdev->state); + i40e_client_del_instance(pf); + } + } + } + + /* enable/disable PE TCP_ENA flag based on netdev down/up + */ + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + i40e_client_update_vsi_ctxt(&cdev->lan_info, client, + 0, 0, 0, + I40E_CLIENT_VSI_FLAG_TCP_ENABLE); + else + i40e_client_update_vsi_ctxt(&cdev->lan_info, client, + 0, 0, + I40E_CLIENT_VSI_FLAG_TCP_ENABLE, + I40E_CLIENT_VSI_FLAG_TCP_ENABLE); +} + +/** + * i40e_lan_add_device - add a lan device struct to the list of lan devices + * @pf: pointer to the board struct + * + * Returns 0 on success or none 0 on error + **/ +int i40e_lan_add_device(struct i40e_pf *pf) +{ + struct i40e_device *ldev; + int ret = 0; + + mutex_lock(&i40e_device_mutex); + list_for_each_entry(ldev, &i40e_devices, list) { + if (ldev->pf == pf) { + ret = -EEXIST; + goto out; + } + } + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); + if (!ldev) { + ret = -ENOMEM; + goto out; + } + ldev->pf = pf; + INIT_LIST_HEAD(&ldev->list); + list_add(&ldev->list, &i40e_devices); + dev_info(&pf->pdev->dev, "Added LAN device PF%d bus=0x%02x dev=0x%02x func=0x%02x\n", + pf->hw.pf_id, pf->hw.bus.bus_id, + pf->hw.bus.device, pf->hw.bus.func); + + /* If a client has already been registered, we need to add an instance + * of it to our new LAN device. + */ + if (registered_client) + i40e_client_add_instance(pf); + + /* Since in some cases register may have happened before a device gets + * added, we can schedule a subtask to go initiate the clients if + * they can be launched at probe time. + */ + set_bit(__I40E_CLIENT_SERVICE_REQUESTED, pf->state); + i40e_service_event_schedule(pf); + +out: + mutex_unlock(&i40e_device_mutex); + return ret; +} + +/** + * i40e_lan_del_device - removes a lan device from the device list + * @pf: pointer to the board struct + * + * Returns 0 on success or non-0 on error + **/ +int i40e_lan_del_device(struct i40e_pf *pf) +{ + struct i40e_device *ldev, *tmp; + int ret = -ENODEV; + + /* First, remove any client instance. 
*/ + i40e_client_del_instance(pf); + + mutex_lock(&i40e_device_mutex); + list_for_each_entry_safe(ldev, tmp, &i40e_devices, list) { + if (ldev->pf == pf) { + dev_info(&pf->pdev->dev, "Deleted LAN device PF%d bus=0x%02x dev=0x%02x func=0x%02x\n", + pf->hw.pf_id, pf->hw.bus.bus_id, + pf->hw.bus.device, pf->hw.bus.func); + list_del(&ldev->list); + kfree(ldev); + ret = 0; + break; + } + } + mutex_unlock(&i40e_device_mutex); + return ret; +} + +/** + * i40e_client_release - release client specific resources + * @client: pointer to the registered client + * + **/ +static void i40e_client_release(struct i40e_client *client) +{ + struct i40e_client_instance *cdev; + struct i40e_device *ldev; + struct i40e_pf *pf; + + mutex_lock(&i40e_device_mutex); + list_for_each_entry(ldev, &i40e_devices, list) { + pf = ldev->pf; + cdev = pf->cinst; + if (!cdev) + continue; + + while (test_and_set_bit(__I40E_SERVICE_SCHED, + pf->state)) + usleep_range(500, 1000); + + if (test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) { + if (client->ops && client->ops->close) + client->ops->close(&cdev->lan_info, client, + false); + i40e_client_release_qvlist(&cdev->lan_info); + clear_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state); + + dev_warn(&pf->pdev->dev, + "Client %s instance for PF id %d closed\n", + client->name, pf->hw.pf_id); + } + /* delete the client instance */ + i40e_client_del_instance(pf); + dev_info(&pf->pdev->dev, "Deleted client instance of Client %s\n", + client->name); + clear_bit(__I40E_SERVICE_SCHED, pf->state); + } + mutex_unlock(&i40e_device_mutex); +} + +/** + * i40e_client_prepare - prepare client specific resources + * @client: pointer to the registered client + * + **/ +static void i40e_client_prepare(struct i40e_client *client) +{ + struct i40e_device *ldev; + struct i40e_pf *pf; + + mutex_lock(&i40e_device_mutex); + list_for_each_entry(ldev, &i40e_devices, list) { + pf = ldev->pf; + i40e_client_add_instance(pf); + /* Start the client subtask */ + set_bit(__I40E_CLIENT_SERVICE_REQUESTED, pf->state); + i40e_service_event_schedule(pf); + } + mutex_unlock(&i40e_device_mutex); +} + +/** + * i40e_client_virtchnl_send - TBD + * @ldev: pointer to L2 context + * @client: Client pointer + * @vf_id: absolute VF identifier + * @msg: message buffer + * @len: length of message buffer + * + * Return 0 on success or < 0 on error + **/ +static int i40e_client_virtchnl_send(struct i40e_info *ldev, + struct i40e_client *client, + u32 vf_id, u8 *msg, u16 len) +{ + struct i40e_pf *pf = ldev->pf; + struct i40e_hw *hw = &pf->hw; + i40e_status err; + + err = i40e_aq_send_msg_to_vf(hw, vf_id, VIRTCHNL_OP_IWARP, + 0, msg, len, NULL); + if (err) + dev_err(&pf->pdev->dev, "Unable to send iWarp message to VF, error %d, aq status %d\n", + err, hw->aq.asq_last_status); + + return err; +} + +/** + * i40e_client_setup_qvlist + * @ldev: pointer to L2 context. + * @client: Client pointer. 
+ * @qv_info: queue and vector list + * + * Return 0 on success or < 0 on error + **/ +static int i40e_client_setup_qvlist(struct i40e_info *ldev, + struct i40e_client *client, + struct i40e_qvlist_info *qvlist_info) +{ + struct i40e_pf *pf = ldev->pf; + struct i40e_hw *hw = &pf->hw; + struct i40e_qv_info *qv_info; + u32 v_idx, i, reg_idx, reg; + u32 size; + + size = sizeof(struct i40e_qvlist_info) + + (sizeof(struct i40e_qv_info) * (qvlist_info->num_vectors - 1)); + ldev->qvlist_info = kzalloc(size, GFP_KERNEL); + if (!ldev->qvlist_info) + return -ENOMEM; + ldev->qvlist_info->num_vectors = qvlist_info->num_vectors; + + for (i = 0; i < qvlist_info->num_vectors; i++) { + qv_info = &qvlist_info->qv_info[i]; + if (!qv_info) + continue; + v_idx = qv_info->v_idx; + + /* Validate vector id belongs to this client */ + if ((v_idx >= (pf->iwarp_base_vector + pf->num_iwarp_msix)) || + (v_idx < pf->iwarp_base_vector)) + goto err; + + ldev->qvlist_info->qv_info[i] = *qv_info; + reg_idx = I40E_PFINT_LNKLSTN(v_idx - 1); + + if (qv_info->ceq_idx == I40E_QUEUE_INVALID_IDX) { + /* Special case - No CEQ mapped on this vector */ + wr32(hw, reg_idx, I40E_PFINT_LNKLSTN_FIRSTQ_INDX_MASK); + } else { + reg = (qv_info->ceq_idx & + I40E_PFINT_LNKLSTN_FIRSTQ_INDX_MASK) | + (I40E_QUEUE_TYPE_PE_CEQ << + I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + wr32(hw, reg_idx, reg); + + reg = (I40E_PFINT_CEQCTL_CAUSE_ENA_MASK | + (v_idx << I40E_PFINT_CEQCTL_MSIX_INDX_SHIFT) | + (qv_info->itr_idx << + I40E_PFINT_CEQCTL_ITR_INDX_SHIFT) | + (I40E_QUEUE_END_OF_LIST << + I40E_PFINT_CEQCTL_NEXTQ_INDX_SHIFT)); + wr32(hw, I40E_PFINT_CEQCTL(qv_info->ceq_idx), reg); + } + if (qv_info->aeq_idx != I40E_QUEUE_INVALID_IDX) { + reg = (I40E_PFINT_AEQCTL_CAUSE_ENA_MASK | + (v_idx << I40E_PFINT_AEQCTL_MSIX_INDX_SHIFT) | + (qv_info->itr_idx << + I40E_PFINT_AEQCTL_ITR_INDX_SHIFT)); + + wr32(hw, I40E_PFINT_AEQCTL, reg); + } + } + /* Mitigate sync problems with iwarp VF driver */ + i40e_flush(hw); + return 0; +err: + kfree(ldev->qvlist_info); + ldev->qvlist_info = NULL; + return -EINVAL; +} + +/** + * i40e_client_request_reset + * @ldev: pointer to L2 context. + * @client: Client pointer. + * @level: reset level + **/ +static void i40e_client_request_reset(struct i40e_info *ldev, + struct i40e_client *client, + u32 reset_level) +{ + struct i40e_pf *pf = ldev->pf; + + switch (reset_level) { + case I40E_CLIENT_RESET_LEVEL_PF: + set_bit(__I40E_PF_RESET_REQUESTED, pf->state); + break; + case I40E_CLIENT_RESET_LEVEL_CORE: + set_bit(__I40E_PF_RESET_REQUESTED, pf->state); + break; + default: + dev_warn(&pf->pdev->dev, + "Client for PF id %d requested an unsupported reset: %d.\n", + pf->hw.pf_id, reset_level); + break; + } + + i40e_service_event_schedule(pf); +} + +/** + * i40e_client_update_vsi_ctxt + * @ldev: pointer to L2 context. + * @client: Client pointer. 
+ * @is_vf: if this for the VF + * @vf_id: if is_vf true this carries the vf_id + * @flag: Any device level setting that needs to be done for PE + * @valid_flag: Bits in this match up and enable changing of flag bits + * + * Return 0 on success or < 0 on error + **/ +static int i40e_client_update_vsi_ctxt(struct i40e_info *ldev, + struct i40e_client *client, + bool is_vf, u32 vf_id, + u32 flag, u32 valid_flag) +{ + struct i40e_pf *pf = ldev->pf; + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + struct i40e_vsi_context ctxt; + bool update = true; + i40e_status err; + + /* TODO: for now do not allow setting VF's VSI setting */ + if (is_vf) + return -EINVAL; + + ctxt.seid = pf->main_vsi_seid; + ctxt.pf_num = pf->hw.pf_id; + err = i40e_aq_get_vsi_params(&pf->hw, &ctxt, NULL); + ctxt.flags = I40E_AQ_VSI_TYPE_PF; + if (err) { + dev_info(&pf->pdev->dev, + "couldn't get PF vsi config, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, err), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + return -ENOENT; + } + + if ((valid_flag & I40E_CLIENT_VSI_FLAG_TCP_ENABLE) && + (flag & I40E_CLIENT_VSI_FLAG_TCP_ENABLE)) { + ctxt.info.valid_sections = + cpu_to_le16(I40E_AQ_VSI_PROP_QUEUE_OPT_VALID); + ctxt.info.queueing_opt_flags |= I40E_AQ_VSI_QUE_OPT_TCP_ENA; + } else if ((valid_flag & I40E_CLIENT_VSI_FLAG_TCP_ENABLE) && + !(flag & I40E_CLIENT_VSI_FLAG_TCP_ENABLE)) { + ctxt.info.valid_sections = + cpu_to_le16(I40E_AQ_VSI_PROP_QUEUE_OPT_VALID); + ctxt.info.queueing_opt_flags &= ~I40E_AQ_VSI_QUE_OPT_TCP_ENA; + } else { + update = false; + dev_warn(&pf->pdev->dev, + "Client for PF id %d request an unsupported Config: %x.\n", + pf->hw.pf_id, flag); + } + + if (update) { + err = i40e_aq_update_vsi_params(&vsi->back->hw, &ctxt, NULL); + if (err) { + dev_info(&pf->pdev->dev, + "update VSI ctxt for PE failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, err), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + } + } + return err; +} + +/** + * i40e_register_client - Register a i40e client driver with the L2 driver + * @client: pointer to the i40e_client struct + * + * Returns 0 on success or non-0 on error + **/ +int i40e_register_client(struct i40e_client *client) +{ + int ret = 0; + + if (!client) { + ret = -EIO; + goto out; + } + + if (strlen(client->name) == 0) { + pr_info("i40e: Failed to register client with no name\n"); + ret = -EIO; + goto out; + } + + if (registered_client) { + pr_info("i40e: Client %s has already been registered!\n", + client->name); + ret = -EEXIST; + goto out; + } + + if ((client->version.major != I40E_CLIENT_VERSION_MAJOR) || + (client->version.minor != I40E_CLIENT_VERSION_MINOR)) { + pr_info("i40e: Failed to register client %s due to mismatched client interface version\n", + client->name); + pr_info("Client is using version: %02d.%02d.%02d while LAN driver supports %s\n", + client->version.major, client->version.minor, + client->version.build, + i40e_client_interface_version_str); + ret = -EIO; + goto out; + } + + registered_client = client; + + i40e_client_prepare(client); + + pr_info("i40e: Registered client %s\n", client->name); +out: + return ret; +} +EXPORT_SYMBOL(i40e_register_client); + +/** + * i40e_unregister_client - Unregister a i40e client driver with the L2 driver + * @client: pointer to the i40e_client struct + * + * Returns 0 on success or non-0 on error + **/ +int i40e_unregister_client(struct i40e_client *client) +{ + int ret = 0; + + if (registered_client != client) { + pr_info("i40e: Client %s has not been registered\n", + client->name); + ret = -ENODEV; + goto 
out; + } + registered_client = NULL; + /* When a unregister request comes through we would have to send + * a close for each of the client instances that were opened. + * client_release function is called to handle this. + */ + i40e_client_release(client); + + pr_info("i40e: Unregistered client %s\n", client->name); +out: + return ret; +} +EXPORT_SYMBOL(i40e_unregister_client); diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.h b/drivers/net/ethernet/intel/i40e/i40e_client.h new file mode 100644 index 0000000..9d464d4 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_client.h @@ -0,0 +1,227 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_CLIENT_H_ +#define _I40E_CLIENT_H_ + +#define I40E_CLIENT_STR_LENGTH 10 + +/* Client interface version should be updated anytime there is a change in the + * existing APIs or data structures. + */ +#define I40E_CLIENT_VERSION_MAJOR 0 +#define I40E_CLIENT_VERSION_MINOR 01 +#define I40E_CLIENT_VERSION_BUILD 00 +#define I40E_CLIENT_VERSION_STR \ + __stringify(I40E_CLIENT_VERSION_MAJOR) "." \ + __stringify(I40E_CLIENT_VERSION_MINOR) "." \ + __stringify(I40E_CLIENT_VERSION_BUILD) + +struct i40e_client_version { + u8 major; + u8 minor; + u8 build; + u8 rsvd; +}; + +enum i40e_client_state { + __I40E_CLIENT_NULL, + __I40E_CLIENT_REGISTERED +}; + +enum i40e_client_instance_state { + __I40E_CLIENT_INSTANCE_NONE, + __I40E_CLIENT_INSTANCE_OPENED, +}; + +struct i40e_ops; +struct i40e_client; + +/* HW does not define a type value for AEQ; only for RX/TX and CEQ. + * In order for us to keep the interface simple, SW will define a + * unique type value for AEQ. 
+ */ +#define I40E_QUEUE_TYPE_PE_AEQ 0x80 +#define I40E_QUEUE_INVALID_IDX 0xFFFF + +struct i40e_qv_info { + u32 v_idx; /* msix_vector */ + u16 ceq_idx; + u16 aeq_idx; + u8 itr_idx; +}; + +struct i40e_qvlist_info { + u32 num_vectors; + struct i40e_qv_info qv_info[1]; +}; + +#define I40E_CLIENT_MSIX_ALL 0xFFFFFFFF + +/* set of LAN parameters useful for clients managed by LAN */ + +/* Struct to hold per priority info */ +struct i40e_prio_qos_params { + u16 qs_handle; /* qs handle for prio */ + u8 tc; /* TC mapped to prio */ + u8 reserved; +}; + +#define I40E_CLIENT_MAX_USER_PRIORITY 8 +/* Struct to hold Client QoS */ +struct i40e_qos_params { + struct i40e_prio_qos_params prio_qos[I40E_CLIENT_MAX_USER_PRIORITY]; +}; + +struct i40e_params { + struct i40e_qos_params qos; + u16 mtu; +}; + +/* Structure to hold Lan device info for a client device */ +struct i40e_info { + struct i40e_client_version version; + u8 lanmac[6]; + struct net_device *netdev; + struct pci_dev *pcidev; + u8 __iomem *hw_addr; + u8 fid; /* function id, PF id or VF id */ +#define I40E_CLIENT_FTYPE_PF 0 +#define I40E_CLIENT_FTYPE_VF 1 + u8 ftype; /* function type, PF or VF */ + void *pf; + + /* All L2 params that could change during the life span of the PF + * and needs to be communicated to the client when they change + */ + struct i40e_qvlist_info *qvlist_info; + struct i40e_params params; + struct i40e_ops *ops; + + u16 msix_count; /* number of msix vectors*/ + /* Array down below will be dynamically allocated based on msix_count */ + struct msix_entry *msix_entries; + u16 itr_index; /* Which ITR index the PE driver is suppose to use */ + u16 fw_maj_ver; /* firmware major version */ + u16 fw_min_ver; /* firmware minor version */ + u32 fw_build; /* firmware build number */ +}; + +#define I40E_CLIENT_RESET_LEVEL_PF 1 +#define I40E_CLIENT_RESET_LEVEL_CORE 2 +#define I40E_CLIENT_VSI_FLAG_TCP_ENABLE BIT(1) + +struct i40e_ops { + /* setup_q_vector_list enables queues with a particular vector */ + int (*setup_qvlist)(struct i40e_info *ldev, struct i40e_client *client, + struct i40e_qvlist_info *qv_info); + + int (*virtchnl_send)(struct i40e_info *ldev, struct i40e_client *client, + u32 vf_id, u8 *msg, u16 len); + + /* If the PE Engine is unresponsive, RDMA driver can request a reset. + * The level helps determine the level of reset being requested. + */ + void (*request_reset)(struct i40e_info *ldev, + struct i40e_client *client, u32 level); + + /* API for the RDMA driver to set certain VSI flags that control + * PE Engine. + */ + int (*update_vsi_ctxt)(struct i40e_info *ldev, + struct i40e_client *client, + bool is_vf, u32 vf_id, + u32 flag, u32 valid_flag); +}; + +struct i40e_client_ops { + /* Should be called from register_client() or whenever PF is ready + * to create a specific client instance. + */ + int (*open)(struct i40e_info *ldev, struct i40e_client *client); + + /* Should be called when netdev is unavailable or when unregister + * call comes in. If the close is happenening due to a reset being + * triggered set the reset bit to true. 
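+ *
+ * A client-side implementation might look roughly like this (hypothetical
+ * helper names; quiesce I/O in both cases, free resources only when the
+ * close is not reset-driven):
+ *
+ *	static void my_client_close(struct i40e_info *ldev,
+ *				    struct i40e_client *client, bool reset)
+ *	{
+ *		my_client_stop_io(ldev);
+ *		if (!reset)
+ *			my_client_free_resources(ldev);
+ *	}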
+ */ + void (*close)(struct i40e_info *ldev, struct i40e_client *client, + bool reset); + + /* called when some l2 managed parameters changes - mtu */ + void (*l2_param_change)(struct i40e_info *ldev, + struct i40e_client *client, + struct i40e_params *params); + + int (*virtchnl_receive)(struct i40e_info *ldev, + struct i40e_client *client, u32 vf_id, + u8 *msg, u16 len); + + /* called when a VF is reset by the PF */ + void (*vf_reset)(struct i40e_info *ldev, + struct i40e_client *client, u32 vf_id); + + /* called when the number of VFs changes */ + void (*vf_enable)(struct i40e_info *ldev, + struct i40e_client *client, u32 num_vfs); + + /* returns true if VF is capable of specified offload */ + int (*vf_capable)(struct i40e_info *ldev, + struct i40e_client *client, u32 vf_id); +}; + +/* Client device */ +struct i40e_client_instance { + struct list_head list; + struct i40e_info lan_info; + struct i40e_client *client; + unsigned long state; +}; + +struct i40e_client { + struct list_head list; /* list of registered clients */ + char name[I40E_CLIENT_STR_LENGTH]; + struct i40e_client_version version; + unsigned long state; /* client state */ + atomic_t ref_cnt; /* Count of all the client devices of this kind */ + u32 flags; +#define I40E_CLIENT_FLAGS_LAUNCH_ON_PROBE BIT(0) +#define I40E_TX_FLAGS_NOTIFY_OTHER_EVENTS BIT(2) + u8 type; +#define I40E_CLIENT_IWARP 0 + const struct i40e_client_ops *ops; /* client ops provided by the client */ +}; + +static inline bool i40e_client_is_registered(struct i40e_client *client) +{ + return test_bit(__I40E_CLIENT_REGISTERED, &client->state); +} + +/* used by clients */ +int i40e_register_client(struct i40e_client *client); +int i40e_unregister_client(struct i40e_client *client); + +#endif /* _I40E_CLIENT_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c new file mode 100644 index 0000000..c0a3dae --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -0,0 +1,5717 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include "i40e_type.h" +#include "i40e_adminq.h" +#include "i40e_prototype.h" +#include + +/** + * i40e_set_mac_type - Sets MAC type + * @hw: pointer to the HW structure + * + * This function sets the mac type of the adapter based on the + * vendor ID and device ID stored in the hw structure. 
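+ *
+ * Callers normally branch on the resulting hw->mac.type afterwards; as a
+ * sketch (hypothetical helper name):
+ *
+ *	if (hw->mac.type == I40E_MAC_X722)
+ *		do_x722_specific_setup(hw);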
+ **/ +static i40e_status i40e_set_mac_type(struct i40e_hw *hw) +{ + i40e_status status = 0; + + if (hw->vendor_id == PCI_VENDOR_ID_INTEL) { + switch (hw->device_id) { + case I40E_DEV_ID_SFP_XL710: + case I40E_DEV_ID_QEMU: + case I40E_DEV_ID_KX_B: + case I40E_DEV_ID_KX_C: + case I40E_DEV_ID_QSFP_A: + case I40E_DEV_ID_QSFP_B: + case I40E_DEV_ID_QSFP_C: + case I40E_DEV_ID_10G_BASE_T: + case I40E_DEV_ID_10G_BASE_T4: + case I40E_DEV_ID_20G_KR2: + case I40E_DEV_ID_20G_KR2_A: + case I40E_DEV_ID_25G_B: + case I40E_DEV_ID_25G_SFP28: + hw->mac.type = I40E_MAC_XL710; + break; + case I40E_DEV_ID_KX_X722: + case I40E_DEV_ID_QSFP_X722: + case I40E_DEV_ID_SFP_X722: + case I40E_DEV_ID_1G_BASE_T_X722: + case I40E_DEV_ID_10G_BASE_T_X722: + case I40E_DEV_ID_SFP_I_X722: + hw->mac.type = I40E_MAC_X722; + break; + default: + hw->mac.type = I40E_MAC_GENERIC; + break; + } + } else { + status = I40E_ERR_DEVICE_NOT_SUPPORTED; + } + + hw_dbg(hw, "i40e_set_mac_type found mac: %d, returns: %d\n", + hw->mac.type, status); + return status; +} + +/** + * i40e_aq_str - convert AQ err code to a string + * @hw: pointer to the HW structure + * @aq_err: the AQ error code to convert + **/ +const char *i40e_aq_str(struct i40e_hw *hw, enum i40e_admin_queue_err aq_err) +{ + switch (aq_err) { + case I40E_AQ_RC_OK: + return "OK"; + case I40E_AQ_RC_EPERM: + return "I40E_AQ_RC_EPERM"; + case I40E_AQ_RC_ENOENT: + return "I40E_AQ_RC_ENOENT"; + case I40E_AQ_RC_ESRCH: + return "I40E_AQ_RC_ESRCH"; + case I40E_AQ_RC_EINTR: + return "I40E_AQ_RC_EINTR"; + case I40E_AQ_RC_EIO: + return "I40E_AQ_RC_EIO"; + case I40E_AQ_RC_ENXIO: + return "I40E_AQ_RC_ENXIO"; + case I40E_AQ_RC_E2BIG: + return "I40E_AQ_RC_E2BIG"; + case I40E_AQ_RC_EAGAIN: + return "I40E_AQ_RC_EAGAIN"; + case I40E_AQ_RC_ENOMEM: + return "I40E_AQ_RC_ENOMEM"; + case I40E_AQ_RC_EACCES: + return "I40E_AQ_RC_EACCES"; + case I40E_AQ_RC_EFAULT: + return "I40E_AQ_RC_EFAULT"; + case I40E_AQ_RC_EBUSY: + return "I40E_AQ_RC_EBUSY"; + case I40E_AQ_RC_EEXIST: + return "I40E_AQ_RC_EEXIST"; + case I40E_AQ_RC_EINVAL: + return "I40E_AQ_RC_EINVAL"; + case I40E_AQ_RC_ENOTTY: + return "I40E_AQ_RC_ENOTTY"; + case I40E_AQ_RC_ENOSPC: + return "I40E_AQ_RC_ENOSPC"; + case I40E_AQ_RC_ENOSYS: + return "I40E_AQ_RC_ENOSYS"; + case I40E_AQ_RC_ERANGE: + return "I40E_AQ_RC_ERANGE"; + case I40E_AQ_RC_EFLUSHED: + return "I40E_AQ_RC_EFLUSHED"; + case I40E_AQ_RC_BAD_ADDR: + return "I40E_AQ_RC_BAD_ADDR"; + case I40E_AQ_RC_EMODE: + return "I40E_AQ_RC_EMODE"; + case I40E_AQ_RC_EFBIG: + return "I40E_AQ_RC_EFBIG"; + } + + snprintf(hw->err_str, sizeof(hw->err_str), "%d", aq_err); + return hw->err_str; +} + +/** + * i40e_stat_str - convert status err code to a string + * @hw: pointer to the HW structure + * @stat_err: the status error code to convert + **/ +const char *i40e_stat_str(struct i40e_hw *hw, i40e_status stat_err) +{ + switch (stat_err) { + case 0: + return "OK"; + case I40E_ERR_NVM: + return "I40E_ERR_NVM"; + case I40E_ERR_NVM_CHECKSUM: + return "I40E_ERR_NVM_CHECKSUM"; + case I40E_ERR_PHY: + return "I40E_ERR_PHY"; + case I40E_ERR_CONFIG: + return "I40E_ERR_CONFIG"; + case I40E_ERR_PARAM: + return "I40E_ERR_PARAM"; + case I40E_ERR_MAC_TYPE: + return "I40E_ERR_MAC_TYPE"; + case I40E_ERR_UNKNOWN_PHY: + return "I40E_ERR_UNKNOWN_PHY"; + case I40E_ERR_LINK_SETUP: + return "I40E_ERR_LINK_SETUP"; + case I40E_ERR_ADAPTER_STOPPED: + return "I40E_ERR_ADAPTER_STOPPED"; + case I40E_ERR_INVALID_MAC_ADDR: + return "I40E_ERR_INVALID_MAC_ADDR"; + case I40E_ERR_DEVICE_NOT_SUPPORTED: + return "I40E_ERR_DEVICE_NOT_SUPPORTED"; + 
case I40E_ERR_MASTER_REQUESTS_PENDING: + return "I40E_ERR_MASTER_REQUESTS_PENDING"; + case I40E_ERR_INVALID_LINK_SETTINGS: + return "I40E_ERR_INVALID_LINK_SETTINGS"; + case I40E_ERR_AUTONEG_NOT_COMPLETE: + return "I40E_ERR_AUTONEG_NOT_COMPLETE"; + case I40E_ERR_RESET_FAILED: + return "I40E_ERR_RESET_FAILED"; + case I40E_ERR_SWFW_SYNC: + return "I40E_ERR_SWFW_SYNC"; + case I40E_ERR_NO_AVAILABLE_VSI: + return "I40E_ERR_NO_AVAILABLE_VSI"; + case I40E_ERR_NO_MEMORY: + return "I40E_ERR_NO_MEMORY"; + case I40E_ERR_BAD_PTR: + return "I40E_ERR_BAD_PTR"; + case I40E_ERR_RING_FULL: + return "I40E_ERR_RING_FULL"; + case I40E_ERR_INVALID_PD_ID: + return "I40E_ERR_INVALID_PD_ID"; + case I40E_ERR_INVALID_QP_ID: + return "I40E_ERR_INVALID_QP_ID"; + case I40E_ERR_INVALID_CQ_ID: + return "I40E_ERR_INVALID_CQ_ID"; + case I40E_ERR_INVALID_CEQ_ID: + return "I40E_ERR_INVALID_CEQ_ID"; + case I40E_ERR_INVALID_AEQ_ID: + return "I40E_ERR_INVALID_AEQ_ID"; + case I40E_ERR_INVALID_SIZE: + return "I40E_ERR_INVALID_SIZE"; + case I40E_ERR_INVALID_ARP_INDEX: + return "I40E_ERR_INVALID_ARP_INDEX"; + case I40E_ERR_INVALID_FPM_FUNC_ID: + return "I40E_ERR_INVALID_FPM_FUNC_ID"; + case I40E_ERR_QP_INVALID_MSG_SIZE: + return "I40E_ERR_QP_INVALID_MSG_SIZE"; + case I40E_ERR_QP_TOOMANY_WRS_POSTED: + return "I40E_ERR_QP_TOOMANY_WRS_POSTED"; + case I40E_ERR_INVALID_FRAG_COUNT: + return "I40E_ERR_INVALID_FRAG_COUNT"; + case I40E_ERR_QUEUE_EMPTY: + return "I40E_ERR_QUEUE_EMPTY"; + case I40E_ERR_INVALID_ALIGNMENT: + return "I40E_ERR_INVALID_ALIGNMENT"; + case I40E_ERR_FLUSHED_QUEUE: + return "I40E_ERR_FLUSHED_QUEUE"; + case I40E_ERR_INVALID_PUSH_PAGE_INDEX: + return "I40E_ERR_INVALID_PUSH_PAGE_INDEX"; + case I40E_ERR_INVALID_IMM_DATA_SIZE: + return "I40E_ERR_INVALID_IMM_DATA_SIZE"; + case I40E_ERR_TIMEOUT: + return "I40E_ERR_TIMEOUT"; + case I40E_ERR_OPCODE_MISMATCH: + return "I40E_ERR_OPCODE_MISMATCH"; + case I40E_ERR_CQP_COMPL_ERROR: + return "I40E_ERR_CQP_COMPL_ERROR"; + case I40E_ERR_INVALID_VF_ID: + return "I40E_ERR_INVALID_VF_ID"; + case I40E_ERR_INVALID_HMCFN_ID: + return "I40E_ERR_INVALID_HMCFN_ID"; + case I40E_ERR_BACKING_PAGE_ERROR: + return "I40E_ERR_BACKING_PAGE_ERROR"; + case I40E_ERR_NO_PBLCHUNKS_AVAILABLE: + return "I40E_ERR_NO_PBLCHUNKS_AVAILABLE"; + case I40E_ERR_INVALID_PBLE_INDEX: + return "I40E_ERR_INVALID_PBLE_INDEX"; + case I40E_ERR_INVALID_SD_INDEX: + return "I40E_ERR_INVALID_SD_INDEX"; + case I40E_ERR_INVALID_PAGE_DESC_INDEX: + return "I40E_ERR_INVALID_PAGE_DESC_INDEX"; + case I40E_ERR_INVALID_SD_TYPE: + return "I40E_ERR_INVALID_SD_TYPE"; + case I40E_ERR_MEMCPY_FAILED: + return "I40E_ERR_MEMCPY_FAILED"; + case I40E_ERR_INVALID_HMC_OBJ_INDEX: + return "I40E_ERR_INVALID_HMC_OBJ_INDEX"; + case I40E_ERR_INVALID_HMC_OBJ_COUNT: + return "I40E_ERR_INVALID_HMC_OBJ_COUNT"; + case I40E_ERR_INVALID_SRQ_ARM_LIMIT: + return "I40E_ERR_INVALID_SRQ_ARM_LIMIT"; + case I40E_ERR_SRQ_ENABLED: + return "I40E_ERR_SRQ_ENABLED"; + case I40E_ERR_ADMIN_QUEUE_ERROR: + return "I40E_ERR_ADMIN_QUEUE_ERROR"; + case I40E_ERR_ADMIN_QUEUE_TIMEOUT: + return "I40E_ERR_ADMIN_QUEUE_TIMEOUT"; + case I40E_ERR_BUF_TOO_SHORT: + return "I40E_ERR_BUF_TOO_SHORT"; + case I40E_ERR_ADMIN_QUEUE_FULL: + return "I40E_ERR_ADMIN_QUEUE_FULL"; + case I40E_ERR_ADMIN_QUEUE_NO_WORK: + return "I40E_ERR_ADMIN_QUEUE_NO_WORK"; + case I40E_ERR_BAD_IWARP_CQE: + return "I40E_ERR_BAD_IWARP_CQE"; + case I40E_ERR_NVM_BLANK_MODE: + return "I40E_ERR_NVM_BLANK_MODE"; + case I40E_ERR_NOT_IMPLEMENTED: + return "I40E_ERR_NOT_IMPLEMENTED"; + case I40E_ERR_PE_DOORBELL_NOT_ENABLED: + 
return "I40E_ERR_PE_DOORBELL_NOT_ENABLED"; + case I40E_ERR_DIAG_TEST_FAILED: + return "I40E_ERR_DIAG_TEST_FAILED"; + case I40E_ERR_NOT_READY: + return "I40E_ERR_NOT_READY"; + case I40E_NOT_SUPPORTED: + return "I40E_NOT_SUPPORTED"; + case I40E_ERR_FIRMWARE_API_VERSION: + return "I40E_ERR_FIRMWARE_API_VERSION"; + case I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR: + return "I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR"; + } + + snprintf(hw->err_str, sizeof(hw->err_str), "%d", stat_err); + return hw->err_str; +} + +/** + * i40e_debug_aq + * @hw: debug mask related to admin queue + * @mask: debug mask + * @desc: pointer to admin queue descriptor + * @buffer: pointer to command buffer + * @buf_len: max length of buffer + * + * Dumps debug log about adminq command with descriptor contents. + **/ +void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask, void *desc, + void *buffer, u16 buf_len) +{ + struct i40e_aq_desc *aq_desc = (struct i40e_aq_desc *)desc; + u16 len; + u8 *buf = (u8 *)buffer; + + if ((!(mask & hw->debug_mask)) || (desc == NULL)) + return; + + len = le16_to_cpu(aq_desc->datalen); + + i40e_debug(hw, mask, + "AQ CMD: opcode 0x%04X, flags 0x%04X, datalen 0x%04X, retval 0x%04X\n", + le16_to_cpu(aq_desc->opcode), + le16_to_cpu(aq_desc->flags), + le16_to_cpu(aq_desc->datalen), + le16_to_cpu(aq_desc->retval)); + i40e_debug(hw, mask, "\tcookie (h,l) 0x%08X 0x%08X\n", + le32_to_cpu(aq_desc->cookie_high), + le32_to_cpu(aq_desc->cookie_low)); + i40e_debug(hw, mask, "\tparam (0,1) 0x%08X 0x%08X\n", + le32_to_cpu(aq_desc->params.internal.param0), + le32_to_cpu(aq_desc->params.internal.param1)); + i40e_debug(hw, mask, "\taddr (h,l) 0x%08X 0x%08X\n", + le32_to_cpu(aq_desc->params.external.addr_high), + le32_to_cpu(aq_desc->params.external.addr_low)); + + if ((buffer != NULL) && (aq_desc->datalen != 0)) { + i40e_debug(hw, mask, "AQ CMD Buffer:\n"); + if (buf_len < len) + len = buf_len; + /* write the full 16-byte chunks */ + if (hw->debug_mask & mask) { + char prefix[27]; + + snprintf(prefix, sizeof(prefix), + "i40e %02x:%02x.%x: \t0x", + hw->bus.bus_id, + hw->bus.device, + hw->bus.func); + + print_hex_dump(KERN_INFO, prefix, DUMP_PREFIX_OFFSET, + 16, 1, buf, len, false); + } + } +} + +/** + * i40e_check_asq_alive + * @hw: pointer to the hw struct + * + * Returns true if Queue is enabled else false. + **/ +bool i40e_check_asq_alive(struct i40e_hw *hw) +{ + if (hw->aq.asq.len) + return !!(rd32(hw, hw->aq.asq.len) & + I40E_PF_ATQLEN_ATQENABLE_MASK); + else + return false; +} + +/** + * i40e_aq_queue_shutdown + * @hw: pointer to the hw struct + * @unloading: is the driver unloading itself + * + * Tell the Firmware that we're shutting down the AdminQ and whether + * or not the driver is unloading as well. 
+ **/ +i40e_status i40e_aq_queue_shutdown(struct i40e_hw *hw, + bool unloading) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_queue_shutdown *cmd = + (struct i40e_aqc_queue_shutdown *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_queue_shutdown); + + if (unloading) + cmd->driver_unloading = cpu_to_le32(I40E_AQ_DRIVER_UNLOADING); + status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL); + + return status; +} + +/** + * i40e_aq_get_set_rss_lut + * @hw: pointer to the hardware structure + * @vsi_id: vsi fw index + * @pf_lut: for PF table set true, for VSI table set false + * @lut: pointer to the lut buffer provided by the caller + * @lut_size: size of the lut buffer + * @set: set true to set the table, false to get the table + * + * Internal function to get or set RSS look up table + **/ +static i40e_status i40e_aq_get_set_rss_lut(struct i40e_hw *hw, + u16 vsi_id, bool pf_lut, + u8 *lut, u16 lut_size, + bool set) +{ + i40e_status status; + struct i40e_aq_desc desc; + struct i40e_aqc_get_set_rss_lut *cmd_resp = + (struct i40e_aqc_get_set_rss_lut *)&desc.params.raw; + + if (set) + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_rss_lut); + else + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_rss_lut); + + /* Indirect command */ + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_RD); + + cmd_resp->vsi_id = + cpu_to_le16((u16)((vsi_id << + I40E_AQC_SET_RSS_LUT_VSI_ID_SHIFT) & + I40E_AQC_SET_RSS_LUT_VSI_ID_MASK)); + cmd_resp->vsi_id |= cpu_to_le16((u16)I40E_AQC_SET_RSS_LUT_VSI_VALID); + + if (pf_lut) + cmd_resp->flags |= cpu_to_le16((u16) + ((I40E_AQC_SET_RSS_LUT_TABLE_TYPE_PF << + I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT) & + I40E_AQC_SET_RSS_LUT_TABLE_TYPE_MASK)); + else + cmd_resp->flags |= cpu_to_le16((u16) + ((I40E_AQC_SET_RSS_LUT_TABLE_TYPE_VSI << + I40E_AQC_SET_RSS_LUT_TABLE_TYPE_SHIFT) & + I40E_AQC_SET_RSS_LUT_TABLE_TYPE_MASK)); + + status = i40e_asq_send_command(hw, &desc, lut, lut_size, NULL); + + return status; +} + +/** + * i40e_aq_get_rss_lut + * @hw: pointer to the hardware structure + * @vsi_id: vsi fw index + * @pf_lut: for PF table set true, for VSI table set false + * @lut: pointer to the lut buffer provided by the caller + * @lut_size: size of the lut buffer + * + * get the RSS lookup table, PF or VSI type + **/ +i40e_status i40e_aq_get_rss_lut(struct i40e_hw *hw, u16 vsi_id, + bool pf_lut, u8 *lut, u16 lut_size) +{ + return i40e_aq_get_set_rss_lut(hw, vsi_id, pf_lut, lut, lut_size, + false); +} + +/** + * i40e_aq_set_rss_lut + * @hw: pointer to the hardware structure + * @vsi_id: vsi fw index + * @pf_lut: for PF table set true, for VSI table set false + * @lut: pointer to the lut buffer provided by the caller + * @lut_size: size of the lut buffer + * + * set the RSS lookup table, PF or VSI type + **/ +i40e_status i40e_aq_set_rss_lut(struct i40e_hw *hw, u16 vsi_id, + bool pf_lut, u8 *lut, u16 lut_size) +{ + return i40e_aq_get_set_rss_lut(hw, vsi_id, pf_lut, lut, lut_size, true); +} + +/** + * i40e_aq_get_set_rss_key + * @hw: pointer to the hw struct + * @vsi_id: vsi fw index + * @key: pointer to key info struct + * @set: set true to set the key, false to get the key + * + * get the RSS key per VSI + **/ +static i40e_status i40e_aq_get_set_rss_key(struct i40e_hw *hw, + u16 vsi_id, + struct i40e_aqc_get_set_rss_key_data *key, + bool set) +{ + i40e_status status; + struct i40e_aq_desc desc; + struct i40e_aqc_get_set_rss_key *cmd_resp = + (struct 
i40e_aqc_get_set_rss_key *)&desc.params.raw; + u16 key_size = sizeof(struct i40e_aqc_get_set_rss_key_data); + + if (set) + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_rss_key); + else + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_rss_key); + + /* Indirect command */ + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_RD); + + cmd_resp->vsi_id = + cpu_to_le16((u16)((vsi_id << + I40E_AQC_SET_RSS_KEY_VSI_ID_SHIFT) & + I40E_AQC_SET_RSS_KEY_VSI_ID_MASK)); + cmd_resp->vsi_id |= cpu_to_le16((u16)I40E_AQC_SET_RSS_KEY_VSI_VALID); + + status = i40e_asq_send_command(hw, &desc, key, key_size, NULL); + + return status; +} + +/** + * i40e_aq_get_rss_key + * @hw: pointer to the hw struct + * @vsi_id: vsi fw index + * @key: pointer to key info struct + * + **/ +i40e_status i40e_aq_get_rss_key(struct i40e_hw *hw, + u16 vsi_id, + struct i40e_aqc_get_set_rss_key_data *key) +{ + return i40e_aq_get_set_rss_key(hw, vsi_id, key, false); +} + +/** + * i40e_aq_set_rss_key + * @hw: pointer to the hw struct + * @vsi_id: vsi fw index + * @key: pointer to key info struct + * + * set the RSS key per VSI + **/ +i40e_status i40e_aq_set_rss_key(struct i40e_hw *hw, + u16 vsi_id, + struct i40e_aqc_get_set_rss_key_data *key) +{ + return i40e_aq_get_set_rss_key(hw, vsi_id, key, true); +} + +/* The i40e_ptype_lookup table is used to convert from the 8-bit ptype in the + * hardware to a bit-field that can be used by SW to more easily determine the + * packet type. + * + * Macros are used to shorten the table lines and make this table human + * readable. + * + * We store the PTYPE in the top byte of the bit field - this is just so that + * we can check that the table doesn't have a row missing, as the index into + * the table should be the PTYPE. 
+ * + * Typical work flow: + * + * IF NOT i40e_ptype_lookup[ptype].known + * THEN + * Packet is unknown + * ELSE IF i40e_ptype_lookup[ptype].outer_ip == I40E_RX_PTYPE_OUTER_IP + * Use the rest of the fields to look at the tunnels, inner protocols, etc + * ELSE + * Use the enum i40e_rx_l2_ptype to decode the packet type + * ENDIF + */ + +/* macro to make the table lines short */ +#define I40E_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ + { PTYPE, \ + 1, \ + I40E_RX_PTYPE_OUTER_##OUTER_IP, \ + I40E_RX_PTYPE_OUTER_##OUTER_IP_VER, \ + I40E_RX_PTYPE_##OUTER_FRAG, \ + I40E_RX_PTYPE_TUNNEL_##T, \ + I40E_RX_PTYPE_TUNNEL_END_##TE, \ + I40E_RX_PTYPE_##TEF, \ + I40E_RX_PTYPE_INNER_PROT_##I, \ + I40E_RX_PTYPE_PAYLOAD_LAYER_##PL } + +#define I40E_PTT_UNUSED_ENTRY(PTYPE) \ + { PTYPE, 0, 0, 0, 0, 0, 0, 0, 0, 0 } + +/* shorter macros makes the table fit but are terse */ +#define I40E_RX_PTYPE_NOF I40E_RX_PTYPE_NOT_FRAG +#define I40E_RX_PTYPE_FRG I40E_RX_PTYPE_FRAG +#define I40E_RX_PTYPE_INNER_PROT_TS I40E_RX_PTYPE_INNER_PROT_TIMESYNC + +/* Lookup table mapping the HW PTYPE to the bit field for decoding */ +struct i40e_rx_ptype_decoded i40e_ptype_lookup[] = { + /* L2 Packet types */ + I40E_PTT_UNUSED_ENTRY(0), + I40E_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(2, L2, NONE, NOF, NONE, NONE, NOF, TS, PAY2), + I40E_PTT(3, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT_UNUSED_ENTRY(4), + I40E_PTT_UNUSED_ENTRY(5), + I40E_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT_UNUSED_ENTRY(8), + I40E_PTT_UNUSED_ENTRY(9), + I40E_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), + I40E_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), + I40E_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), + + /* Non Tunneled IPv4 */ + I40E_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(25), + I40E_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), + I40E_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), + I40E_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv4 --> IPv4 */ + I40E_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + I40E_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), + I40E_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(32), + I40E_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + I40E_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + I40E_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> IPv6 */ + I40E_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + I40E_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + I40E_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(39), + I40E_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + I40E_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + 
I40E_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT */ + I40E_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> IPv4 */ + I40E_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + I40E_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + I40E_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(47), + I40E_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), + I40E_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + I40E_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> IPv6 */ + I40E_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + I40E_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + I40E_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(54), + I40E_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + I40E_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + I40E_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC */ + I40E_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ + I40E_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + I40E_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + I40E_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(62), + I40E_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + I40E_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + I40E_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ + I40E_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + I40E_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + I40E_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(69), + I40E_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + I40E_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + I40E_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv4 --> GRE/NAT --> MAC/VLAN */ + I40E_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ + I40E_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + I40E_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), + I40E_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(77), + I40E_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), + I40E_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + I40E_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ + I40E_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + I40E_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + I40E_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(84), + I40E_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + I40E_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + I40E_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + /* Non Tunneled IPv6 */ + I40E_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), + I40E_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(91), + I40E_PTT(92, IP, IPV6, NOF, NONE, 
NONE, NOF, TCP, PAY4), + I40E_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), + I40E_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), + + /* IPv6 --> IPv4 */ + I40E_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), + I40E_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3), + I40E_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(98), + I40E_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), + I40E_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), + I40E_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> IPv6 */ + I40E_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), + I40E_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), + I40E_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(105), + I40E_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), + I40E_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), + I40E_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT */ + I40E_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> IPv4 */ + I40E_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), + I40E_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), + I40E_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(113), + I40E_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), + I40E_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), + I40E_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> IPv6 */ + I40E_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), + I40E_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), + I40E_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(120), + I40E_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), + I40E_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), + I40E_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC */ + I40E_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ + I40E_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), + I40E_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), + I40E_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(128), + I40E_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), + I40E_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), + I40E_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ + I40E_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), + I40E_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), + I40E_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(135), + I40E_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), + I40E_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), + I40E_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN */ + I40E_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ + I40E_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), + I40E_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), + I40E_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(143), + I40E_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, 
NOF, TCP, PAY4), + I40E_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), + I40E_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), + + /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ + I40E_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), + I40E_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), + I40E_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), + I40E_PTT_UNUSED_ENTRY(150), + I40E_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), + I40E_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), + I40E_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), + + /* unused entries */ + I40E_PTT_UNUSED_ENTRY(154), + I40E_PTT_UNUSED_ENTRY(155), + I40E_PTT_UNUSED_ENTRY(156), + I40E_PTT_UNUSED_ENTRY(157), + I40E_PTT_UNUSED_ENTRY(158), + I40E_PTT_UNUSED_ENTRY(159), + + I40E_PTT_UNUSED_ENTRY(160), + I40E_PTT_UNUSED_ENTRY(161), + I40E_PTT_UNUSED_ENTRY(162), + I40E_PTT_UNUSED_ENTRY(163), + I40E_PTT_UNUSED_ENTRY(164), + I40E_PTT_UNUSED_ENTRY(165), + I40E_PTT_UNUSED_ENTRY(166), + I40E_PTT_UNUSED_ENTRY(167), + I40E_PTT_UNUSED_ENTRY(168), + I40E_PTT_UNUSED_ENTRY(169), + + I40E_PTT_UNUSED_ENTRY(170), + I40E_PTT_UNUSED_ENTRY(171), + I40E_PTT_UNUSED_ENTRY(172), + I40E_PTT_UNUSED_ENTRY(173), + I40E_PTT_UNUSED_ENTRY(174), + I40E_PTT_UNUSED_ENTRY(175), + I40E_PTT_UNUSED_ENTRY(176), + I40E_PTT_UNUSED_ENTRY(177), + I40E_PTT_UNUSED_ENTRY(178), + I40E_PTT_UNUSED_ENTRY(179), + + I40E_PTT_UNUSED_ENTRY(180), + I40E_PTT_UNUSED_ENTRY(181), + I40E_PTT_UNUSED_ENTRY(182), + I40E_PTT_UNUSED_ENTRY(183), + I40E_PTT_UNUSED_ENTRY(184), + I40E_PTT_UNUSED_ENTRY(185), + I40E_PTT_UNUSED_ENTRY(186), + I40E_PTT_UNUSED_ENTRY(187), + I40E_PTT_UNUSED_ENTRY(188), + I40E_PTT_UNUSED_ENTRY(189), + + I40E_PTT_UNUSED_ENTRY(190), + I40E_PTT_UNUSED_ENTRY(191), + I40E_PTT_UNUSED_ENTRY(192), + I40E_PTT_UNUSED_ENTRY(193), + I40E_PTT_UNUSED_ENTRY(194), + I40E_PTT_UNUSED_ENTRY(195), + I40E_PTT_UNUSED_ENTRY(196), + I40E_PTT_UNUSED_ENTRY(197), + I40E_PTT_UNUSED_ENTRY(198), + I40E_PTT_UNUSED_ENTRY(199), + + I40E_PTT_UNUSED_ENTRY(200), + I40E_PTT_UNUSED_ENTRY(201), + I40E_PTT_UNUSED_ENTRY(202), + I40E_PTT_UNUSED_ENTRY(203), + I40E_PTT_UNUSED_ENTRY(204), + I40E_PTT_UNUSED_ENTRY(205), + I40E_PTT_UNUSED_ENTRY(206), + I40E_PTT_UNUSED_ENTRY(207), + I40E_PTT_UNUSED_ENTRY(208), + I40E_PTT_UNUSED_ENTRY(209), + + I40E_PTT_UNUSED_ENTRY(210), + I40E_PTT_UNUSED_ENTRY(211), + I40E_PTT_UNUSED_ENTRY(212), + I40E_PTT_UNUSED_ENTRY(213), + I40E_PTT_UNUSED_ENTRY(214), + I40E_PTT_UNUSED_ENTRY(215), + I40E_PTT_UNUSED_ENTRY(216), + I40E_PTT_UNUSED_ENTRY(217), + I40E_PTT_UNUSED_ENTRY(218), + I40E_PTT_UNUSED_ENTRY(219), + + I40E_PTT_UNUSED_ENTRY(220), + I40E_PTT_UNUSED_ENTRY(221), + I40E_PTT_UNUSED_ENTRY(222), + I40E_PTT_UNUSED_ENTRY(223), + I40E_PTT_UNUSED_ENTRY(224), + I40E_PTT_UNUSED_ENTRY(225), + I40E_PTT_UNUSED_ENTRY(226), + I40E_PTT_UNUSED_ENTRY(227), + I40E_PTT_UNUSED_ENTRY(228), + I40E_PTT_UNUSED_ENTRY(229), + + I40E_PTT_UNUSED_ENTRY(230), + I40E_PTT_UNUSED_ENTRY(231), + I40E_PTT_UNUSED_ENTRY(232), + I40E_PTT_UNUSED_ENTRY(233), + I40E_PTT_UNUSED_ENTRY(234), + I40E_PTT_UNUSED_ENTRY(235), + I40E_PTT_UNUSED_ENTRY(236), + I40E_PTT_UNUSED_ENTRY(237), + I40E_PTT_UNUSED_ENTRY(238), + I40E_PTT_UNUSED_ENTRY(239), + + I40E_PTT_UNUSED_ENTRY(240), + I40E_PTT_UNUSED_ENTRY(241), + I40E_PTT_UNUSED_ENTRY(242), + I40E_PTT_UNUSED_ENTRY(243), + I40E_PTT_UNUSED_ENTRY(244), + I40E_PTT_UNUSED_ENTRY(245), + I40E_PTT_UNUSED_ENTRY(246), + 
I40E_PTT_UNUSED_ENTRY(247), + I40E_PTT_UNUSED_ENTRY(248), + I40E_PTT_UNUSED_ENTRY(249), + + I40E_PTT_UNUSED_ENTRY(250), + I40E_PTT_UNUSED_ENTRY(251), + I40E_PTT_UNUSED_ENTRY(252), + I40E_PTT_UNUSED_ENTRY(253), + I40E_PTT_UNUSED_ENTRY(254), + I40E_PTT_UNUSED_ENTRY(255) +}; + +/** + * i40e_init_shared_code - Initialize the shared code + * @hw: pointer to hardware structure + * + * This assigns the MAC type and PHY code and inits the NVM. + * Does not touch the hardware. This function must be called prior to any + * other function in the shared code. The i40e_hw structure should be + * memset to 0 prior to calling this function. The following fields in + * hw structure should be filled in prior to calling this function: + * hw_addr, back, device_id, vendor_id, subsystem_device_id, + * subsystem_vendor_id, and revision_id + **/ +i40e_status i40e_init_shared_code(struct i40e_hw *hw) +{ + i40e_status status = 0; + u32 port, ari, func_rid; + + i40e_set_mac_type(hw); + + switch (hw->mac.type) { + case I40E_MAC_XL710: + case I40E_MAC_X722: + break; + default: + return I40E_ERR_DEVICE_NOT_SUPPORTED; + } + + hw->phy.get_link_info = true; + + /* Determine port number and PF number*/ + port = (rd32(hw, I40E_PFGEN_PORTNUM) & I40E_PFGEN_PORTNUM_PORT_NUM_MASK) + >> I40E_PFGEN_PORTNUM_PORT_NUM_SHIFT; + hw->port = (u8)port; + ari = (rd32(hw, I40E_GLPCI_CAPSUP) & I40E_GLPCI_CAPSUP_ARI_EN_MASK) >> + I40E_GLPCI_CAPSUP_ARI_EN_SHIFT; + func_rid = rd32(hw, I40E_PF_FUNC_RID); + if (ari) + hw->pf_id = (u8)(func_rid & 0xff); + else + hw->pf_id = (u8)(func_rid & 0x7); + + if (hw->mac.type == I40E_MAC_X722) + hw->flags |= I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE | + I40E_HW_FLAG_NVM_READ_REQUIRES_LOCK; + + status = i40e_init_nvm(hw); + return status; +} + +/** + * i40e_aq_mac_address_read - Retrieve the MAC addresses + * @hw: pointer to the hw struct + * @flags: a return indicator of what addresses were added to the addr store + * @addrs: the requestor's mac addr store + * @cmd_details: pointer to command details structure or NULL + **/ +static i40e_status i40e_aq_mac_address_read(struct i40e_hw *hw, + u16 *flags, + struct i40e_aqc_mac_address_read_data *addrs, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_mac_address_read *cmd_data = + (struct i40e_aqc_mac_address_read *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_mac_address_read); + desc.flags |= cpu_to_le16(I40E_AQ_FLAG_BUF); + + status = i40e_asq_send_command(hw, &desc, addrs, + sizeof(*addrs), cmd_details); + *flags = le16_to_cpu(cmd_data->command_flags); + + return status; +} + +/** + * i40e_aq_mac_address_write - Change the MAC addresses + * @hw: pointer to the hw struct + * @flags: indicates which MAC to be written + * @mac_addr: address to write + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_mac_address_write(struct i40e_hw *hw, + u16 flags, u8 *mac_addr, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_mac_address_write *cmd_data = + (struct i40e_aqc_mac_address_write *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_mac_address_write); + cmd_data->command_flags = cpu_to_le16(flags); + cmd_data->mac_sah = cpu_to_le16((u16)mac_addr[0] << 8 | mac_addr[1]); + cmd_data->mac_sal = cpu_to_le32(((u32)mac_addr[2] << 24) | + ((u32)mac_addr[3] << 16) | + ((u32)mac_addr[4] << 8) | + mac_addr[5]); + + status = i40e_asq_send_command(hw, 
&desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_get_mac_addr - get MAC address + * @hw: pointer to the HW structure + * @mac_addr: pointer to MAC address + * + * Reads the adapter's MAC address from register + **/ +i40e_status i40e_get_mac_addr(struct i40e_hw *hw, u8 *mac_addr) +{ + struct i40e_aqc_mac_address_read_data addrs; + i40e_status status; + u16 flags = 0; + + status = i40e_aq_mac_address_read(hw, &flags, &addrs, NULL); + + if (flags & I40E_AQC_LAN_ADDR_VALID) + ether_addr_copy(mac_addr, addrs.pf_lan_mac); + + return status; +} + +/** + * i40e_get_port_mac_addr - get Port MAC address + * @hw: pointer to the HW structure + * @mac_addr: pointer to Port MAC address + * + * Reads the adapter's Port MAC address + **/ +i40e_status i40e_get_port_mac_addr(struct i40e_hw *hw, u8 *mac_addr) +{ + struct i40e_aqc_mac_address_read_data addrs; + i40e_status status; + u16 flags = 0; + + status = i40e_aq_mac_address_read(hw, &flags, &addrs, NULL); + if (status) + return status; + + if (flags & I40E_AQC_PORT_ADDR_VALID) + ether_addr_copy(mac_addr, addrs.port_mac); + else + status = I40E_ERR_INVALID_MAC_ADDR; + + return status; +} + +/** + * i40e_pre_tx_queue_cfg - pre tx queue configure + * @hw: pointer to the HW structure + * @queue: target PF queue index + * @enable: state change request + * + * Handles hw requirement to indicate intention to enable + * or disable target queue. + **/ +void i40e_pre_tx_queue_cfg(struct i40e_hw *hw, u32 queue, bool enable) +{ + u32 abs_queue_idx = hw->func_caps.base_queue + queue; + u32 reg_block = 0; + u32 reg_val; + + if (abs_queue_idx >= 128) { + reg_block = abs_queue_idx / 128; + abs_queue_idx %= 128; + } + + reg_val = rd32(hw, I40E_GLLAN_TXPRE_QDIS(reg_block)); + reg_val &= ~I40E_GLLAN_TXPRE_QDIS_QINDX_MASK; + reg_val |= (abs_queue_idx << I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT); + + if (enable) + reg_val |= I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_MASK; + else + reg_val |= I40E_GLLAN_TXPRE_QDIS_SET_QDIS_MASK; + + wr32(hw, I40E_GLLAN_TXPRE_QDIS(reg_block), reg_val); +} + +/** + * i40e_read_pba_string - Reads part number string from EEPROM + * @hw: pointer to hardware structure + * @pba_num: stores the part number string from the EEPROM + * @pba_num_size: part number string buffer length + * + * Reads the part number string from the EEPROM. 
+ **/ +i40e_status i40e_read_pba_string(struct i40e_hw *hw, u8 *pba_num, + u32 pba_num_size) +{ + i40e_status status = 0; + u16 pba_word = 0; + u16 pba_size = 0; + u16 pba_ptr = 0; + u16 i = 0; + + status = i40e_read_nvm_word(hw, I40E_SR_PBA_FLAGS, &pba_word); + if (status || (pba_word != 0xFAFA)) { + hw_dbg(hw, "Failed to read PBA flags or flag is invalid.\n"); + return status; + } + + status = i40e_read_nvm_word(hw, I40E_SR_PBA_BLOCK_PTR, &pba_ptr); + if (status) { + hw_dbg(hw, "Failed to read PBA Block pointer.\n"); + return status; + } + + status = i40e_read_nvm_word(hw, pba_ptr, &pba_size); + if (status) { + hw_dbg(hw, "Failed to read PBA Block size.\n"); + return status; + } + + /* Subtract one to get PBA word count (PBA Size word is included in + * total size) + */ + pba_size--; + if (pba_num_size < (((u32)pba_size * 2) + 1)) { + hw_dbg(hw, "Buffer to small for PBA data.\n"); + return I40E_ERR_PARAM; + } + + for (i = 0; i < pba_size; i++) { + status = i40e_read_nvm_word(hw, (pba_ptr + 1) + i, &pba_word); + if (status) { + hw_dbg(hw, "Failed to read PBA Block word %d.\n", i); + return status; + } + + pba_num[(i * 2)] = (pba_word >> 8) & 0xFF; + pba_num[(i * 2) + 1] = pba_word & 0xFF; + } + pba_num[(pba_size * 2)] = '\0'; + + return status; +} + +/** + * i40e_get_media_type - Gets media type + * @hw: pointer to the hardware structure + **/ +static enum i40e_media_type i40e_get_media_type(struct i40e_hw *hw) +{ + enum i40e_media_type media; + + switch (hw->phy.link_info.phy_type) { + case I40E_PHY_TYPE_10GBASE_SR: + case I40E_PHY_TYPE_10GBASE_LR: + case I40E_PHY_TYPE_1000BASE_SX: + case I40E_PHY_TYPE_1000BASE_LX: + case I40E_PHY_TYPE_40GBASE_SR4: + case I40E_PHY_TYPE_40GBASE_LR4: + case I40E_PHY_TYPE_25GBASE_LR: + case I40E_PHY_TYPE_25GBASE_SR: + media = I40E_MEDIA_TYPE_FIBER; + break; + case I40E_PHY_TYPE_100BASE_TX: + case I40E_PHY_TYPE_1000BASE_T: + case I40E_PHY_TYPE_10GBASE_T: + media = I40E_MEDIA_TYPE_BASET; + break; + case I40E_PHY_TYPE_10GBASE_CR1_CU: + case I40E_PHY_TYPE_40GBASE_CR4_CU: + case I40E_PHY_TYPE_10GBASE_CR1: + case I40E_PHY_TYPE_40GBASE_CR4: + case I40E_PHY_TYPE_10GBASE_SFPP_CU: + case I40E_PHY_TYPE_40GBASE_AOC: + case I40E_PHY_TYPE_10GBASE_AOC: + case I40E_PHY_TYPE_25GBASE_CR: + case I40E_PHY_TYPE_25GBASE_AOC: + case I40E_PHY_TYPE_25GBASE_ACC: + media = I40E_MEDIA_TYPE_DA; + break; + case I40E_PHY_TYPE_1000BASE_KX: + case I40E_PHY_TYPE_10GBASE_KX4: + case I40E_PHY_TYPE_10GBASE_KR: + case I40E_PHY_TYPE_40GBASE_KR4: + case I40E_PHY_TYPE_20GBASE_KR2: + case I40E_PHY_TYPE_25GBASE_KR: + media = I40E_MEDIA_TYPE_BACKPLANE; + break; + case I40E_PHY_TYPE_SGMII: + case I40E_PHY_TYPE_XAUI: + case I40E_PHY_TYPE_XFI: + case I40E_PHY_TYPE_XLAUI: + case I40E_PHY_TYPE_XLPPI: + default: + media = I40E_MEDIA_TYPE_UNKNOWN; + break; + } + + return media; +} + +/** + * i40e_poll_globr - Poll for Global Reset completion + * @hw: pointer to the hardware structure + * @retry_limit: how many times to retry before failure + **/ +static i40e_status i40e_poll_globr(struct i40e_hw *hw, + u32 retry_limit) +{ + u32 cnt, reg = 0; + + for (cnt = 0; cnt < retry_limit; cnt++) { + reg = rd32(hw, I40E_GLGEN_RSTAT); + if (!(reg & I40E_GLGEN_RSTAT_DEVSTATE_MASK)) + return 0; + msleep(100); + } + + hw_dbg(hw, "Global reset failed.\n"); + hw_dbg(hw, "I40E_GLGEN_RSTAT = 0x%x\n", reg); + + return I40E_ERR_RESET_FAILED; +} + +#define I40E_PF_RESET_WAIT_COUNT_A0 200 +#define I40E_PF_RESET_WAIT_COUNT 200 +/** + * i40e_pf_reset - Reset the PF + * @hw: pointer to the hardware structure + * + * Assuming someone 
else has triggered a global reset, + * assure the global reset is complete and then reset the PF + **/ +i40e_status i40e_pf_reset(struct i40e_hw *hw) +{ + u32 cnt = 0; + u32 cnt1 = 0; + u32 reg = 0; + u32 grst_del; + + /* Poll for Global Reset steady state in case of recent GRST. + * The grst delay value is in 100ms units, and we'll wait a + * couple counts longer to be sure we don't just miss the end. + */ + grst_del = (rd32(hw, I40E_GLGEN_RSTCTL) & + I40E_GLGEN_RSTCTL_GRSTDEL_MASK) >> + I40E_GLGEN_RSTCTL_GRSTDEL_SHIFT; + + /* It can take upto 15 secs for GRST steady state. + * Bump it to 16 secs max to be safe. + */ + grst_del = grst_del * 20; + + for (cnt = 0; cnt < grst_del; cnt++) { + reg = rd32(hw, I40E_GLGEN_RSTAT); + if (!(reg & I40E_GLGEN_RSTAT_DEVSTATE_MASK)) + break; + msleep(100); + } + if (reg & I40E_GLGEN_RSTAT_DEVSTATE_MASK) { + hw_dbg(hw, "Global reset polling failed to complete.\n"); + return I40E_ERR_RESET_FAILED; + } + + /* Now Wait for the FW to be ready */ + for (cnt1 = 0; cnt1 < I40E_PF_RESET_WAIT_COUNT; cnt1++) { + reg = rd32(hw, I40E_GLNVM_ULD); + reg &= (I40E_GLNVM_ULD_CONF_CORE_DONE_MASK | + I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK); + if (reg == (I40E_GLNVM_ULD_CONF_CORE_DONE_MASK | + I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK)) { + hw_dbg(hw, "Core and Global modules ready %d\n", cnt1); + break; + } + usleep_range(10000, 20000); + } + if (!(reg & (I40E_GLNVM_ULD_CONF_CORE_DONE_MASK | + I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK))) { + hw_dbg(hw, "wait for FW Reset complete timedout\n"); + hw_dbg(hw, "I40E_GLNVM_ULD = 0x%x\n", reg); + return I40E_ERR_RESET_FAILED; + } + + /* If there was a Global Reset in progress when we got here, + * we don't need to do the PF Reset + */ + if (!cnt) { + u32 reg2 = 0; + if (hw->revision_id == 0) + cnt = I40E_PF_RESET_WAIT_COUNT_A0; + else + cnt = I40E_PF_RESET_WAIT_COUNT; + reg = rd32(hw, I40E_PFGEN_CTRL); + wr32(hw, I40E_PFGEN_CTRL, + (reg | I40E_PFGEN_CTRL_PFSWR_MASK)); + for (; cnt; cnt--) { + reg = rd32(hw, I40E_PFGEN_CTRL); + if (!(reg & I40E_PFGEN_CTRL_PFSWR_MASK)) + break; + reg2 = rd32(hw, I40E_GLGEN_RSTAT); + if (reg2 & I40E_GLGEN_RSTAT_DEVSTATE_MASK) + break; + usleep_range(1000, 2000); + } + if (reg2 & I40E_GLGEN_RSTAT_DEVSTATE_MASK) { + if (i40e_poll_globr(hw, grst_del)) + return I40E_ERR_RESET_FAILED; + } else if (reg & I40E_PFGEN_CTRL_PFSWR_MASK) { + hw_dbg(hw, "PF reset polling failed to complete.\n"); + return I40E_ERR_RESET_FAILED; + } + } + + i40e_clear_pxe_mode(hw); + + return 0; +} + +/** + * i40e_clear_hw - clear out any left over hw state + * @hw: pointer to the hw struct + * + * Clear queues and interrupts, typically called at init time, + * but after the capabilities have been found so we know how many + * queues and msix vectors have been allocated. 
+ **/ +void i40e_clear_hw(struct i40e_hw *hw) +{ + u32 num_queues, base_queue; + u32 num_pf_int; + u32 num_vf_int; + u32 num_vfs; + u32 i, j; + u32 val; + u32 eol = 0x7ff; + + /* get number of interrupts, queues, and VFs */ + val = rd32(hw, I40E_GLPCI_CNF2); + num_pf_int = (val & I40E_GLPCI_CNF2_MSI_X_PF_N_MASK) >> + I40E_GLPCI_CNF2_MSI_X_PF_N_SHIFT; + num_vf_int = (val & I40E_GLPCI_CNF2_MSI_X_VF_N_MASK) >> + I40E_GLPCI_CNF2_MSI_X_VF_N_SHIFT; + + val = rd32(hw, I40E_PFLAN_QALLOC); + base_queue = (val & I40E_PFLAN_QALLOC_FIRSTQ_MASK) >> + I40E_PFLAN_QALLOC_FIRSTQ_SHIFT; + j = (val & I40E_PFLAN_QALLOC_LASTQ_MASK) >> + I40E_PFLAN_QALLOC_LASTQ_SHIFT; + if (val & I40E_PFLAN_QALLOC_VALID_MASK) + num_queues = (j - base_queue) + 1; + else + num_queues = 0; + + val = rd32(hw, I40E_PF_VT_PFALLOC); + i = (val & I40E_PF_VT_PFALLOC_FIRSTVF_MASK) >> + I40E_PF_VT_PFALLOC_FIRSTVF_SHIFT; + j = (val & I40E_PF_VT_PFALLOC_LASTVF_MASK) >> + I40E_PF_VT_PFALLOC_LASTVF_SHIFT; + if (val & I40E_PF_VT_PFALLOC_VALID_MASK) + num_vfs = (j - i) + 1; + else + num_vfs = 0; + + /* stop all the interrupts */ + wr32(hw, I40E_PFINT_ICR0_ENA, 0); + val = 0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; + for (i = 0; i < num_pf_int - 2; i++) + wr32(hw, I40E_PFINT_DYN_CTLN(i), val); + + /* Set the FIRSTQ_INDX field to 0x7FF in PFINT_LNKLSTx */ + val = eol << I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT; + wr32(hw, I40E_PFINT_LNKLST0, val); + for (i = 0; i < num_pf_int - 2; i++) + wr32(hw, I40E_PFINT_LNKLSTN(i), val); + val = eol << I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT; + for (i = 0; i < num_vfs; i++) + wr32(hw, I40E_VPINT_LNKLST0(i), val); + for (i = 0; i < num_vf_int - 2; i++) + wr32(hw, I40E_VPINT_LNKLSTN(i), val); + + /* warn the HW of the coming Tx disables */ + for (i = 0; i < num_queues; i++) { + u32 abs_queue_idx = base_queue + i; + u32 reg_block = 0; + + if (abs_queue_idx >= 128) { + reg_block = abs_queue_idx / 128; + abs_queue_idx %= 128; + } + + val = rd32(hw, I40E_GLLAN_TXPRE_QDIS(reg_block)); + val &= ~I40E_GLLAN_TXPRE_QDIS_QINDX_MASK; + val |= (abs_queue_idx << I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT); + val |= I40E_GLLAN_TXPRE_QDIS_SET_QDIS_MASK; + + wr32(hw, I40E_GLLAN_TXPRE_QDIS(reg_block), val); + } + udelay(400); + + /* stop all the queues */ + for (i = 0; i < num_queues; i++) { + wr32(hw, I40E_QINT_TQCTL(i), 0); + wr32(hw, I40E_QTX_ENA(i), 0); + wr32(hw, I40E_QINT_RQCTL(i), 0); + wr32(hw, I40E_QRX_ENA(i), 0); + } + + /* short wait for all queue disables to settle */ + udelay(50); +} + +/** + * i40e_clear_pxe_mode - clear pxe operations mode + * @hw: pointer to the hw struct + * + * Make sure all PXE mode settings are cleared, including things + * like descriptor fetch/write-back mode. 
+ **/ +void i40e_clear_pxe_mode(struct i40e_hw *hw) +{ + u32 reg; + + if (i40e_check_asq_alive(hw)) + i40e_aq_clear_pxe_mode(hw, NULL); + + /* Clear single descriptor fetch/write-back mode */ + reg = rd32(hw, I40E_GLLAN_RCTL_0); + + if (hw->revision_id == 0) { + /* As a work around clear PXE_MODE instead of setting it */ + wr32(hw, I40E_GLLAN_RCTL_0, (reg & (~I40E_GLLAN_RCTL_0_PXE_MODE_MASK))); + } else { + wr32(hw, I40E_GLLAN_RCTL_0, (reg | I40E_GLLAN_RCTL_0_PXE_MODE_MASK)); + } +} + +/** + * i40e_led_is_mine - helper to find matching led + * @hw: pointer to the hw struct + * @idx: index into GPIO registers + * + * returns: 0 if no match, otherwise the value of the GPIO_CTL register + */ +static u32 i40e_led_is_mine(struct i40e_hw *hw, int idx) +{ + u32 gpio_val = 0; + u32 port; + + if (!hw->func_caps.led[idx]) + return 0; + + gpio_val = rd32(hw, I40E_GLGEN_GPIO_CTL(idx)); + port = (gpio_val & I40E_GLGEN_GPIO_CTL_PRT_NUM_MASK) >> + I40E_GLGEN_GPIO_CTL_PRT_NUM_SHIFT; + + /* if PRT_NUM_NA is 1 then this LED is not port specific, OR + * if it is not our port then ignore + */ + if ((gpio_val & I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_MASK) || + (port != hw->port)) + return 0; + + return gpio_val; +} + +#define I40E_COMBINED_ACTIVITY 0xA +#define I40E_FILTER_ACTIVITY 0xE +#define I40E_LINK_ACTIVITY 0xC +#define I40E_MAC_ACTIVITY 0xD +#define I40E_LED0 22 + +/** + * i40e_led_get - return current on/off mode + * @hw: pointer to the hw struct + * + * The value returned is the 'mode' field as defined in the + * GPIO register definitions: 0x0 = off, 0xf = on, and other + * values are variations of possible behaviors relating to + * blink, link, and wire. + **/ +u32 i40e_led_get(struct i40e_hw *hw) +{ + u32 current_mode = 0; + u32 mode = 0; + int i; + + /* as per the documentation GPIO 22-29 are the LED + * GPIO pins named LED0..LED7 + */ + for (i = I40E_LED0; i <= I40E_GLGEN_GPIO_CTL_MAX_INDEX; i++) { + u32 gpio_val = i40e_led_is_mine(hw, i); + + if (!gpio_val) + continue; + + /* ignore gpio LED src mode entries related to the activity + * LEDs + */ + current_mode = ((gpio_val & I40E_GLGEN_GPIO_CTL_LED_MODE_MASK) + >> I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT); + switch (current_mode) { + case I40E_COMBINED_ACTIVITY: + case I40E_FILTER_ACTIVITY: + case I40E_MAC_ACTIVITY: + case I40E_LINK_ACTIVITY: + continue; + default: + break; + } + + mode = (gpio_val & I40E_GLGEN_GPIO_CTL_LED_MODE_MASK) >> + I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT; + break; + } + + return mode; +} + +/** + * i40e_led_set - set new on/off mode + * @hw: pointer to the hw struct + * @mode: 0=off, 0xf=on (else see manual for mode details) + * @blink: true if the LED should blink when on, false if steady + * + * if this function is used to turn on the blink it should + * be used to disable the blink when restoring the original state. 
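+ *
+ * The usual pairing is therefore: save the current mode with i40e_led_get(),
+ * turn the LED on with blink enabled, and later restore the saved mode with
+ * blink disabled. As a sketch:
+ *
+ *	u32 orig = i40e_led_get(hw);
+ *
+ *	i40e_led_set(hw, 0xf, true);
+ *	...
+ *	i40e_led_set(hw, orig, false);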
+ **/ +void i40e_led_set(struct i40e_hw *hw, u32 mode, bool blink) +{ + u32 current_mode = 0; + int i; + + if (mode & 0xfffffff0) + hw_dbg(hw, "invalid mode passed in %X\n", mode); + + /* as per the documentation GPIO 22-29 are the LED + * GPIO pins named LED0..LED7 + */ + for (i = I40E_LED0; i <= I40E_GLGEN_GPIO_CTL_MAX_INDEX; i++) { + u32 gpio_val = i40e_led_is_mine(hw, i); + + if (!gpio_val) + continue; + + /* ignore gpio LED src mode entries related to the activity + * LEDs + */ + current_mode = ((gpio_val & I40E_GLGEN_GPIO_CTL_LED_MODE_MASK) + >> I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT); + switch (current_mode) { + case I40E_COMBINED_ACTIVITY: + case I40E_FILTER_ACTIVITY: + case I40E_MAC_ACTIVITY: + case I40E_LINK_ACTIVITY: + continue; + default: + break; + } + + gpio_val &= ~I40E_GLGEN_GPIO_CTL_LED_MODE_MASK; + /* this & is a bit of paranoia, but serves as a range check */ + gpio_val |= ((mode << I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT) & + I40E_GLGEN_GPIO_CTL_LED_MODE_MASK); + + if (blink) + gpio_val |= BIT(I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT); + else + gpio_val &= ~BIT(I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT); + + wr32(hw, I40E_GLGEN_GPIO_CTL(i), gpio_val); + break; + } +} + +/* Admin command wrappers */ + +/** + * i40e_aq_get_phy_capabilities + * @hw: pointer to the hw struct + * @abilities: structure for PHY capabilities to be filled + * @qualified_modules: report Qualified Modules + * @report_init: report init capabilities (active are default) + * @cmd_details: pointer to command details structure or NULL + * + * Returns the various PHY abilities supported on the Port. + **/ +i40e_status i40e_aq_get_phy_capabilities(struct i40e_hw *hw, + bool qualified_modules, bool report_init, + struct i40e_aq_get_phy_abilities_resp *abilities, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + i40e_status status; + u16 abilities_size = sizeof(struct i40e_aq_get_phy_abilities_resp); + u16 max_delay = I40E_MAX_PHY_TIMEOUT, total_delay = 0; + + if (!abilities) + return I40E_ERR_PARAM; + + do { + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_phy_abilities); + + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + if (abilities_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + if (qualified_modules) + desc.params.external.param0 |= + cpu_to_le32(I40E_AQ_PHY_REPORT_QUALIFIED_MODULES); + + if (report_init) + desc.params.external.param0 |= + cpu_to_le32(I40E_AQ_PHY_REPORT_INITIAL_VALUES); + + status = i40e_asq_send_command(hw, &desc, abilities, + abilities_size, cmd_details); + + if (status) + break; + + if (hw->aq.asq_last_status == I40E_AQ_RC_EIO) { + status = I40E_ERR_UNKNOWN_PHY; + break; + } else if (hw->aq.asq_last_status == I40E_AQ_RC_EAGAIN) { + usleep_range(1000, 2000); + total_delay++; + status = I40E_ERR_TIMEOUT; + } + } while ((hw->aq.asq_last_status != I40E_AQ_RC_OK) && + (total_delay < max_delay)); + + if (status) + return status; + + if (report_init) { + if (hw->mac.type == I40E_MAC_XL710 && + hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR && + hw->aq.api_min_ver >= I40E_MINOR_VER_GET_LINK_INFO_XL710) { + status = i40e_aq_get_link_info(hw, true, NULL, NULL); + } else { + hw->phy.phy_types = le32_to_cpu(abilities->phy_type); + hw->phy.phy_types |= + ((u64)abilities->phy_type_ext << 32); + } + } + + return status; +} + +/** + * i40e_aq_set_phy_config + * @hw: pointer to the hw struct + * @config: structure with PHY configuration to be set + * @cmd_details: pointer to command details structure or NULL + * + * Set the various PHY 
configuration parameters + * supported on the Port.One or more of the Set PHY config parameters may be + * ignored in an MFP mode as the PF may not have the privilege to set some + * of the PHY Config parameters. This status will be indicated by the + * command response. + **/ +enum i40e_status_code i40e_aq_set_phy_config(struct i40e_hw *hw, + struct i40e_aq_set_phy_config *config, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aq_set_phy_config *cmd = + (struct i40e_aq_set_phy_config *)&desc.params.raw; + enum i40e_status_code status; + + if (!config) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_phy_config); + + *cmd = *config; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_set_fc + * @hw: pointer to the hw struct + * + * Set the requested flow control mode using set_phy_config. + **/ +enum i40e_status_code i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures, + bool atomic_restart) +{ + enum i40e_fc_mode fc_mode = hw->fc.requested_mode; + struct i40e_aq_get_phy_abilities_resp abilities; + struct i40e_aq_set_phy_config config; + enum i40e_status_code status; + u8 pause_mask = 0x0; + + *aq_failures = 0x0; + + switch (fc_mode) { + case I40E_FC_FULL: + pause_mask |= I40E_AQ_PHY_FLAG_PAUSE_TX; + pause_mask |= I40E_AQ_PHY_FLAG_PAUSE_RX; + break; + case I40E_FC_RX_PAUSE: + pause_mask |= I40E_AQ_PHY_FLAG_PAUSE_RX; + break; + case I40E_FC_TX_PAUSE: + pause_mask |= I40E_AQ_PHY_FLAG_PAUSE_TX; + break; + default: + break; + } + + /* Get the current phy config */ + status = i40e_aq_get_phy_capabilities(hw, false, false, &abilities, + NULL); + if (status) { + *aq_failures |= I40E_SET_FC_AQ_FAIL_GET; + return status; + } + + memset(&config, 0, sizeof(struct i40e_aq_set_phy_config)); + /* clear the old pause settings */ + config.abilities = abilities.abilities & ~(I40E_AQ_PHY_FLAG_PAUSE_TX) & + ~(I40E_AQ_PHY_FLAG_PAUSE_RX); + /* set the new abilities */ + config.abilities |= pause_mask; + /* If the abilities have changed, then set the new config */ + if (config.abilities != abilities.abilities) { + /* Auto restart link so settings take effect */ + if (atomic_restart) + config.abilities |= I40E_AQ_PHY_ENABLE_ATOMIC_LINK; + /* Copy over all the old settings */ + config.phy_type = abilities.phy_type; + config.phy_type_ext = abilities.phy_type_ext; + config.link_speed = abilities.link_speed; + config.eee_capability = abilities.eee_capability; + config.eeer = abilities.eeer_val; + config.low_power_ctrl = abilities.d3_lpan; + config.fec_config = abilities.fec_cfg_curr_mod_ext_info & + I40E_AQ_PHY_FEC_CONFIG_MASK; + status = i40e_aq_set_phy_config(hw, &config, NULL); + + if (status) + *aq_failures |= I40E_SET_FC_AQ_FAIL_SET; + } + /* Update the link info */ + status = i40e_update_link_info(hw); + if (status) { + /* Wait a little bit (on 40G cards it sometimes takes a really + * long time for link to come back from the atomic reset) + * and try once more + */ + msleep(1000); + status = i40e_update_link_info(hw); + } + if (status) + *aq_failures |= I40E_SET_FC_AQ_FAIL_UPDATE; + + return status; +} + +/** + * i40e_aq_clear_pxe_mode + * @hw: pointer to the hw struct + * @cmd_details: pointer to command details structure or NULL + * + * Tell the firmware that the driver is taking over from PXE + **/ +i40e_status i40e_aq_clear_pxe_mode(struct i40e_hw *hw, + struct i40e_asq_cmd_details *cmd_details) +{ + i40e_status status; + struct i40e_aq_desc desc; + struct i40e_aqc_clear_pxe 
*cmd = + (struct i40e_aqc_clear_pxe *)&desc.params.raw; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_clear_pxe_mode); + + cmd->rx_cnt = 0x2; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + wr32(hw, I40E_GLLAN_RCTL_0, 0x1); + + return status; +} + +/** + * i40e_aq_set_link_restart_an + * @hw: pointer to the hw struct + * @enable_link: if true: enable link, if false: disable link + * @cmd_details: pointer to command details structure or NULL + * + * Sets up the link and restarts the Auto-Negotiation over the link. + **/ +i40e_status i40e_aq_set_link_restart_an(struct i40e_hw *hw, + bool enable_link, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_link_restart_an *cmd = + (struct i40e_aqc_set_link_restart_an *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_link_restart_an); + + cmd->command = I40E_AQ_PHY_RESTART_AN; + if (enable_link) + cmd->command |= I40E_AQ_PHY_LINK_ENABLE; + else + cmd->command &= ~I40E_AQ_PHY_LINK_ENABLE; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_get_link_info + * @hw: pointer to the hw struct + * @enable_lse: enable/disable LinkStatusEvent reporting + * @link: pointer to link status structure - optional + * @cmd_details: pointer to command details structure or NULL + * + * Returns the link status of the adapter. + **/ +i40e_status i40e_aq_get_link_info(struct i40e_hw *hw, + bool enable_lse, struct i40e_link_status *link, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_get_link_status *resp = + (struct i40e_aqc_get_link_status *)&desc.params.raw; + struct i40e_link_status *hw_link_info = &hw->phy.link_info; + i40e_status status; + bool tx_pause, rx_pause; + u16 command_flags; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_link_status); + + if (enable_lse) + command_flags = I40E_AQ_LSE_ENABLE; + else + command_flags = I40E_AQ_LSE_DISABLE; + resp->command_flags = cpu_to_le16(command_flags); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (status) + goto aq_get_link_info_exit; + + /* save off old link status information */ + hw->phy.link_info_old = *hw_link_info; + + /* update link status */ + hw_link_info->phy_type = (enum i40e_aq_phy_type)resp->phy_type; + hw->phy.media_type = i40e_get_media_type(hw); + hw_link_info->link_speed = (enum i40e_aq_link_speed)resp->link_speed; + hw_link_info->link_info = resp->link_info; + hw_link_info->an_info = resp->an_info; + hw_link_info->fec_info = resp->config & (I40E_AQ_CONFIG_FEC_KR_ENA | + I40E_AQ_CONFIG_FEC_RS_ENA); + hw_link_info->ext_info = resp->ext_info; + hw_link_info->loopback = resp->loopback & I40E_AQ_LOOPBACK_MASK; + hw_link_info->max_frame_size = le16_to_cpu(resp->max_frame_size); + hw_link_info->pacing = resp->config & I40E_AQ_CONFIG_PACING_MASK; + + /* update fc info */ + tx_pause = !!(resp->an_info & I40E_AQ_LINK_PAUSE_TX); + rx_pause = !!(resp->an_info & I40E_AQ_LINK_PAUSE_RX); + if (tx_pause & rx_pause) + hw->fc.current_mode = I40E_FC_FULL; + else if (tx_pause) + hw->fc.current_mode = I40E_FC_TX_PAUSE; + else if (rx_pause) + hw->fc.current_mode = I40E_FC_RX_PAUSE; + else + hw->fc.current_mode = I40E_FC_NONE; + + if (resp->config & I40E_AQ_CONFIG_CRC_ENA) + hw_link_info->crc_enable = true; + else + hw_link_info->crc_enable = false; + + if (resp->command_flags & cpu_to_le16(I40E_AQ_LSE_IS_ENABLED)) + 
hw_link_info->lse_enable = true; + else + hw_link_info->lse_enable = false; + + if ((hw->mac.type == I40E_MAC_XL710) && + (hw->aq.fw_maj_ver < 4 || (hw->aq.fw_maj_ver == 4 && + hw->aq.fw_min_ver < 40)) && hw_link_info->phy_type == 0xE) + hw_link_info->phy_type = I40E_PHY_TYPE_10GBASE_SFPP_CU; + + if (hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR && + hw->aq.api_min_ver >= 7) { + __le32 tmp; + + memcpy(&tmp, resp->link_type, sizeof(tmp)); + hw->phy.phy_types = le32_to_cpu(tmp); + hw->phy.phy_types |= ((u64)resp->link_type_ext << 32); + } + + /* save link status information */ + if (link) + *link = *hw_link_info; + + /* flag cleared so helper functions don't call AQ again */ + hw->phy.get_link_info = false; + +aq_get_link_info_exit: + return status; +} + +/** + * i40e_aq_set_phy_int_mask + * @hw: pointer to the hw struct + * @mask: interrupt mask to be set + * @cmd_details: pointer to command details structure or NULL + * + * Set link interrupt mask. + **/ +i40e_status i40e_aq_set_phy_int_mask(struct i40e_hw *hw, + u16 mask, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_phy_int_mask *cmd = + (struct i40e_aqc_set_phy_int_mask *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_phy_int_mask); + + cmd->event_mask = cpu_to_le16(mask); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_set_phy_debug + * @hw: pointer to the hw struct + * @cmd_flags: debug command flags + * @cmd_details: pointer to command details structure or NULL + * + * Reset the external PHY. + **/ +i40e_status i40e_aq_set_phy_debug(struct i40e_hw *hw, u8 cmd_flags, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_phy_debug *cmd = + (struct i40e_aqc_set_phy_debug *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_phy_debug); + + cmd->command_flags = cmd_flags; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_add_vsi + * @hw: pointer to the hw struct + * @vsi_ctx: pointer to a vsi context struct + * @cmd_details: pointer to command details structure or NULL + * + * Add a VSI context to the hardware. 
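As a small illustration of i40e_aq_get_link_info() above, the sketch below fills a caller-owned struct i40e_link_status and tests I40E_AQ_LINK_UP; the function name is hypothetical and the i40e_prototype.h include is assumed.

#include "i40e_prototype.h"     /* assumed */

/* hypothetical helper: query link state directly via the admin queue */
static bool example_port_is_up(struct i40e_hw *hw)
{
        struct i40e_link_status link = {};

        /* enable_lse = true also turns on Link Status Event reporting */
        if (i40e_aq_get_link_info(hw, true, &link, NULL))
                return false;

        return !!(link.link_info & I40E_AQ_LINK_UP);
}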
+**/ +i40e_status i40e_aq_add_vsi(struct i40e_hw *hw, + struct i40e_vsi_context *vsi_ctx, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_get_update_vsi *cmd = + (struct i40e_aqc_add_get_update_vsi *)&desc.params.raw; + struct i40e_aqc_add_get_update_vsi_completion *resp = + (struct i40e_aqc_add_get_update_vsi_completion *) + &desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_add_vsi); + + cmd->uplink_seid = cpu_to_le16(vsi_ctx->uplink_seid); + cmd->connection_type = vsi_ctx->connection_type; + cmd->vf_id = vsi_ctx->vf_num; + cmd->vsi_flags = cpu_to_le16(vsi_ctx->flags); + + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + + status = i40e_asq_send_command(hw, &desc, &vsi_ctx->info, + sizeof(vsi_ctx->info), cmd_details); + + if (status) + goto aq_add_vsi_exit; + + vsi_ctx->seid = le16_to_cpu(resp->seid); + vsi_ctx->vsi_number = le16_to_cpu(resp->vsi_number); + vsi_ctx->vsis_allocated = le16_to_cpu(resp->vsi_used); + vsi_ctx->vsis_unallocated = le16_to_cpu(resp->vsi_free); + +aq_add_vsi_exit: + return status; +} + +/** + * i40e_aq_set_default_vsi + * @hw: pointer to the hw struct + * @seid: vsi number + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_set_default_vsi(struct i40e_hw *hw, + u16 seid, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_vsi_promiscuous_modes *cmd = + (struct i40e_aqc_set_vsi_promiscuous_modes *) + &desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_vsi_promiscuous_modes); + + cmd->promiscuous_flags = cpu_to_le16(I40E_AQC_SET_VSI_DEFAULT); + cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_DEFAULT); + cmd->seid = cpu_to_le16(seid); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_clear_default_vsi + * @hw: pointer to the hw struct + * @seid: vsi number + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_clear_default_vsi(struct i40e_hw *hw, + u16 seid, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_vsi_promiscuous_modes *cmd = + (struct i40e_aqc_set_vsi_promiscuous_modes *) + &desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_vsi_promiscuous_modes); + + cmd->promiscuous_flags = cpu_to_le16(0); + cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_DEFAULT); + cmd->seid = cpu_to_le16(seid); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_set_vsi_unicast_promiscuous + * @hw: pointer to the hw struct + * @seid: vsi number + * @set: set unicast promiscuous enable/disable + * @cmd_details: pointer to command details structure or NULL + * @rx_only_promisc: flag to decide if egress traffic gets mirrored in promisc + **/ +i40e_status i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw, + u16 seid, bool set, + struct i40e_asq_cmd_details *cmd_details, + bool rx_only_promisc) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_vsi_promiscuous_modes *cmd = + (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw; + i40e_status status; + u16 flags = 0; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_vsi_promiscuous_modes); + + if (set) { + flags |= I40E_AQC_SET_VSI_PROMISC_UNICAST; + if (rx_only_promisc && + (((hw->aq.api_maj_ver == 1) && 
(hw->aq.api_min_ver >= 5)) || + (hw->aq.api_maj_ver > 1))) + flags |= I40E_AQC_SET_VSI_PROMISC_TX; + } + + cmd->promiscuous_flags = cpu_to_le16(flags); + + cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_UNICAST); + if (((hw->aq.api_maj_ver >= 1) && (hw->aq.api_min_ver >= 5)) || + (hw->aq.api_maj_ver > 1)) + cmd->valid_flags |= cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_TX); + + cmd->seid = cpu_to_le16(seid); + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_set_vsi_multicast_promiscuous + * @hw: pointer to the hw struct + * @seid: vsi number + * @set: set multicast promiscuous enable/disable + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_set_vsi_multicast_promiscuous(struct i40e_hw *hw, + u16 seid, bool set, struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_vsi_promiscuous_modes *cmd = + (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw; + i40e_status status; + u16 flags = 0; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_vsi_promiscuous_modes); + + if (set) + flags |= I40E_AQC_SET_VSI_PROMISC_MULTICAST; + + cmd->promiscuous_flags = cpu_to_le16(flags); + + cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_MULTICAST); + + cmd->seid = cpu_to_le16(seid); + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_set_vsi_mc_promisc_on_vlan + * @hw: pointer to the hw struct + * @seid: vsi number + * @enable: set MAC L2 layer unicast promiscuous enable/disable for a given VLAN + * @vid: The VLAN tag filter - capture any multicast packet with this VLAN tag + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_set_vsi_mc_promisc_on_vlan(struct i40e_hw *hw, + u16 seid, bool enable, + u16 vid, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_vsi_promiscuous_modes *cmd = + (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw; + enum i40e_status_code status; + u16 flags = 0; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_vsi_promiscuous_modes); + + if (enable) + flags |= I40E_AQC_SET_VSI_PROMISC_MULTICAST; + + cmd->promiscuous_flags = cpu_to_le16(flags); + cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_MULTICAST); + cmd->seid = cpu_to_le16(seid); + cmd->vlan_tag = cpu_to_le16(vid | I40E_AQC_SET_VSI_VLAN_VALID); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_set_vsi_uc_promisc_on_vlan + * @hw: pointer to the hw struct + * @seid: vsi number + * @enable: set MAC L2 layer unicast promiscuous enable/disable for a given VLAN + * @vid: The VLAN tag filter - capture any unicast packet with this VLAN tag + * @cmd_details: pointer to command details structure or NULL + **/ +enum i40e_status_code i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw, + u16 seid, bool enable, + u16 vid, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_vsi_promiscuous_modes *cmd = + (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw; + enum i40e_status_code status; + u16 flags = 0; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_vsi_promiscuous_modes); + + if (enable) + flags |= I40E_AQC_SET_VSI_PROMISC_UNICAST; + + cmd->promiscuous_flags = cpu_to_le16(flags); + cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_UNICAST); + cmd->seid = 
cpu_to_le16(seid); + cmd->vlan_tag = cpu_to_le16(vid | I40E_AQC_SET_VSI_VLAN_VALID); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_set_vsi_bc_promisc_on_vlan + * @hw: pointer to the hw struct + * @seid: vsi number + * @enable: set broadcast promiscuous enable/disable for a given VLAN + * @vid: The VLAN tag filter - capture any broadcast packet with this VLAN tag + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_set_vsi_bc_promisc_on_vlan(struct i40e_hw *hw, + u16 seid, bool enable, u16 vid, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_vsi_promiscuous_modes *cmd = + (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw; + i40e_status status; + u16 flags = 0; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_vsi_promiscuous_modes); + + if (enable) + flags |= I40E_AQC_SET_VSI_PROMISC_BROADCAST; + + cmd->promiscuous_flags = cpu_to_le16(flags); + cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_BROADCAST); + cmd->seid = cpu_to_le16(seid); + cmd->vlan_tag = cpu_to_le16(vid | I40E_AQC_SET_VSI_VLAN_VALID); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_set_vsi_broadcast + * @hw: pointer to the hw struct + * @seid: vsi number + * @set_filter: true to set filter, false to clear filter + * @cmd_details: pointer to command details structure or NULL + * + * Set or clear the broadcast promiscuous flag (filter) for a given VSI. + **/ +i40e_status i40e_aq_set_vsi_broadcast(struct i40e_hw *hw, + u16 seid, bool set_filter, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_vsi_promiscuous_modes *cmd = + (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_vsi_promiscuous_modes); + + if (set_filter) + cmd->promiscuous_flags + |= cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_BROADCAST); + else + cmd->promiscuous_flags + &= cpu_to_le16(~I40E_AQC_SET_VSI_PROMISC_BROADCAST); + + cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_BROADCAST); + cmd->seid = cpu_to_le16(seid); + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_set_vsi_vlan_promisc - control the VLAN promiscuous setting + * @hw: pointer to the hw struct + * @seid: vsi number + * @enable: set MAC L2 layer unicast promiscuous enable/disable for a given VLAN + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_set_vsi_vlan_promisc(struct i40e_hw *hw, + u16 seid, bool enable, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_vsi_promiscuous_modes *cmd = + (struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw; + i40e_status status; + u16 flags = 0; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_vsi_promiscuous_modes); + if (enable) + flags |= I40E_AQC_SET_VSI_PROMISC_VLAN; + + cmd->promiscuous_flags = cpu_to_le16(flags); + cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_VLAN); + cmd->seid = cpu_to_le16(seid); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_get_vsi_params - get VSI configuration info + * @hw: pointer to the hw struct + * @vsi_ctx: pointer to a vsi context struct + * @cmd_details: pointer to command details structure or NULL + **/ 
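The promiscuous-mode setters above all drive the same AQ opcode; a compact sketch of enabling (or disabling) both the unicast and multicast variants for one VSI SEID follows. The wrapper name is invented and the prototypes are assumed to come from i40e_prototype.h.

#include "i40e_prototype.h"     /* assumed */

/* hypothetical wrapper: toggle unicast + multicast promiscuous on a VSI */
static i40e_status example_vsi_promisc(struct i40e_hw *hw, u16 vsi_seid,
                                       bool on)
{
        i40e_status status;

        /* last argument is rx_only_promisc, kept false in this sketch */
        status = i40e_aq_set_vsi_unicast_promiscuous(hw, vsi_seid, on,
                                                     NULL, false);
        if (status)
                return status;

        return i40e_aq_set_vsi_multicast_promiscuous(hw, vsi_seid, on, NULL);
}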
+i40e_status i40e_aq_get_vsi_params(struct i40e_hw *hw, + struct i40e_vsi_context *vsi_ctx, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_get_update_vsi *cmd = + (struct i40e_aqc_add_get_update_vsi *)&desc.params.raw; + struct i40e_aqc_add_get_update_vsi_completion *resp = + (struct i40e_aqc_add_get_update_vsi_completion *) + &desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_vsi_parameters); + + cmd->uplink_seid = cpu_to_le16(vsi_ctx->seid); + + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + + status = i40e_asq_send_command(hw, &desc, &vsi_ctx->info, + sizeof(vsi_ctx->info), NULL); + + if (status) + goto aq_get_vsi_params_exit; + + vsi_ctx->seid = le16_to_cpu(resp->seid); + vsi_ctx->vsi_number = le16_to_cpu(resp->vsi_number); + vsi_ctx->vsis_allocated = le16_to_cpu(resp->vsi_used); + vsi_ctx->vsis_unallocated = le16_to_cpu(resp->vsi_free); + +aq_get_vsi_params_exit: + return status; +} + +/** + * i40e_aq_update_vsi_params + * @hw: pointer to the hw struct + * @vsi_ctx: pointer to a vsi context struct + * @cmd_details: pointer to command details structure or NULL + * + * Update a VSI context. + **/ +i40e_status i40e_aq_update_vsi_params(struct i40e_hw *hw, + struct i40e_vsi_context *vsi_ctx, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_get_update_vsi *cmd = + (struct i40e_aqc_add_get_update_vsi *)&desc.params.raw; + struct i40e_aqc_add_get_update_vsi_completion *resp = + (struct i40e_aqc_add_get_update_vsi_completion *) + &desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_update_vsi_parameters); + cmd->uplink_seid = cpu_to_le16(vsi_ctx->seid); + + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + + status = i40e_asq_send_command(hw, &desc, &vsi_ctx->info, + sizeof(vsi_ctx->info), cmd_details); + + vsi_ctx->vsis_allocated = le16_to_cpu(resp->vsi_used); + vsi_ctx->vsis_unallocated = le16_to_cpu(resp->vsi_free); + + return status; +} + +/** + * i40e_aq_get_switch_config + * @hw: pointer to the hardware structure + * @buf: pointer to the result buffer + * @buf_size: length of input buffer + * @start_seid: seid to start for the report, 0 == beginning + * @cmd_details: pointer to command details structure or NULL + * + * Fill the buf with switch configuration returned from AdminQ command + **/ +i40e_status i40e_aq_get_switch_config(struct i40e_hw *hw, + struct i40e_aqc_get_switch_config_resp *buf, + u16 buf_size, u16 *start_seid, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_switch_seid *scfg = + (struct i40e_aqc_switch_seid *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_switch_config); + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + if (buf_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + scfg->seid = cpu_to_le16(*start_seid); + + status = i40e_asq_send_command(hw, &desc, buf, buf_size, cmd_details); + *start_seid = le16_to_cpu(scfg->seid); + + return status; +} + +/** + * i40e_aq_set_switch_config + * @hw: pointer to the hardware structure + * @flags: bit flag values to set + * @mode: cloud filter mode + * @valid_flags: which bit flags to set + * @mode: cloud filter mode + * @cmd_details: pointer to command details structure or NULL + * + * Set switch configuration bits + **/ +enum i40e_status_code 
i40e_aq_set_switch_config(struct i40e_hw *hw, + u16 flags, + u16 valid_flags, u8 mode, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_switch_config *scfg = + (struct i40e_aqc_set_switch_config *)&desc.params.raw; + enum i40e_status_code status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_switch_config); + scfg->flags = cpu_to_le16(flags); + scfg->valid_flags = cpu_to_le16(valid_flags); + scfg->mode = mode; + if (hw->flags & I40E_HW_FLAG_802_1AD_CAPABLE) { + scfg->switch_tag = cpu_to_le16(hw->switch_tag); + scfg->first_tag = cpu_to_le16(hw->first_tag); + scfg->second_tag = cpu_to_le16(hw->second_tag); + } + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_get_firmware_version + * @hw: pointer to the hw struct + * @fw_major_version: firmware major version + * @fw_minor_version: firmware minor version + * @fw_build: firmware build number + * @api_major_version: major queue version + * @api_minor_version: minor queue version + * @cmd_details: pointer to command details structure or NULL + * + * Get the firmware version from the admin queue commands + **/ +i40e_status i40e_aq_get_firmware_version(struct i40e_hw *hw, + u16 *fw_major_version, u16 *fw_minor_version, + u32 *fw_build, + u16 *api_major_version, u16 *api_minor_version, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_get_version *resp = + (struct i40e_aqc_get_version *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_version); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status) { + if (fw_major_version) + *fw_major_version = le16_to_cpu(resp->fw_major); + if (fw_minor_version) + *fw_minor_version = le16_to_cpu(resp->fw_minor); + if (fw_build) + *fw_build = le32_to_cpu(resp->fw_build); + if (api_major_version) + *api_major_version = le16_to_cpu(resp->api_major); + if (api_minor_version) + *api_minor_version = le16_to_cpu(resp->api_minor); + } + + return status; +} + +/** + * i40e_aq_send_driver_version + * @hw: pointer to the hw struct + * @dv: driver's major, minor version + * @cmd_details: pointer to command details structure or NULL + * + * Send the driver version to the firmware + **/ +i40e_status i40e_aq_send_driver_version(struct i40e_hw *hw, + struct i40e_driver_version *dv, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_driver_version *cmd = + (struct i40e_aqc_driver_version *)&desc.params.raw; + i40e_status status; + u16 len; + + if (dv == NULL) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_driver_version); + + desc.flags |= cpu_to_le16(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD); + cmd->driver_major_ver = dv->major_version; + cmd->driver_minor_ver = dv->minor_version; + cmd->driver_build_ver = dv->build_version; + cmd->driver_subbuild_ver = dv->subbuild_version; + + len = 0; + while (len < sizeof(dv->driver_string) && + (dv->driver_string[len] < 0x80) && + dv->driver_string[len]) + len++; + status = i40e_asq_send_command(hw, &desc, dv->driver_string, + len, cmd_details); + + return status; +} + +/** + * i40e_get_link_status - get status of the HW network link + * @hw: pointer to the hw struct + * @link_up: pointer to bool (true/false = linkup/linkdown) + * + * Variable link_up true if link is up, false if link is down. 
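A throwaway sketch of i40e_aq_get_firmware_version() from further up: every output pointer is optional, so a caller may pass NULL for fields it does not need. The function name below is illustrative, and hw_dbg() is used the same way as elsewhere in this file.

#include "i40e_prototype.h"     /* assumed */

/* hypothetical: log firmware and AQ API versions at probe time */
static void example_log_fw_version(struct i40e_hw *hw)
{
        u16 fw_major = 0, fw_minor = 0, api_major = 0, api_minor = 0;
        u32 fw_build = 0;

        if (i40e_aq_get_firmware_version(hw, &fw_major, &fw_minor, &fw_build,
                                         &api_major, &api_minor, NULL))
                return;

        hw_dbg(hw, "fw %u.%u build %u, AQ API %u.%u\n",
               fw_major, fw_minor, fw_build, api_major, api_minor);
}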
+ * The variable link_up is invalid if returned value of status != 0 + * + * Side effect: LinkStatusEvent reporting becomes enabled + **/ +i40e_status i40e_get_link_status(struct i40e_hw *hw, bool *link_up) +{ + i40e_status status = 0; + + if (hw->phy.get_link_info) { + status = i40e_update_link_info(hw); + + if (status) + i40e_debug(hw, I40E_DEBUG_LINK, "get link failed: status %d\n", + status); + } + + *link_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP; + + return status; +} + +/** + * i40e_update_link_info - update status of the HW network link + * @hw: pointer to the hw struct + **/ +i40e_status i40e_update_link_info(struct i40e_hw *hw) +{ + struct i40e_aq_get_phy_abilities_resp abilities; + i40e_status status = 0; + + status = i40e_aq_get_link_info(hw, true, NULL, NULL); + if (status) + return status; + + /* extra checking needed to ensure link info to user is timely */ + if ((hw->phy.link_info.link_info & I40E_AQ_MEDIA_AVAILABLE) && + ((hw->phy.link_info.link_info & I40E_AQ_LINK_UP) || + !(hw->phy.link_info_old.link_info & I40E_AQ_LINK_UP))) { + status = i40e_aq_get_phy_capabilities(hw, false, false, + &abilities, NULL); + if (status) + return status; + + hw->phy.link_info.req_fec_info = + abilities.fec_cfg_curr_mod_ext_info & + (I40E_AQ_REQUEST_FEC_KR | I40E_AQ_REQUEST_FEC_RS); + + memcpy(hw->phy.link_info.module_type, &abilities.module_type, + sizeof(hw->phy.link_info.module_type)); + } + + return status; +} + +/** + * i40e_aq_add_veb - Insert a VEB between the VSI and the MAC + * @hw: pointer to the hw struct + * @uplink_seid: the MAC or other gizmo SEID + * @downlink_seid: the VSI SEID + * @enabled_tc: bitmap of TCs to be enabled + * @default_port: true for default port VSI, false for control port + * @veb_seid: pointer to where to put the resulting VEB SEID + * @enable_stats: true to turn on VEB stats + * @cmd_details: pointer to command details structure or NULL + * + * This asks the FW to add a VEB between the uplink and downlink + * elements. If the uplink SEID is 0, this will be a floating VEB.
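i40e_get_link_status() and i40e_update_link_info() above combine the cached link info with an optional AQ refresh; a minimal polling sketch, with a hypothetical caller name, might look like this.

#include "i40e_prototype.h"     /* assumed */

/* hypothetical watchdog snippet: link_up is only valid when status == 0 */
static void example_watch_link(struct i40e_hw *hw)
{
        bool link_up = false;

        if (i40e_get_link_status(hw, &link_up))
                return;

        hw_dbg(hw, "link is %s\n", link_up ? "up" : "down");
}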
+ **/ +i40e_status i40e_aq_add_veb(struct i40e_hw *hw, u16 uplink_seid, + u16 downlink_seid, u8 enabled_tc, + bool default_port, u16 *veb_seid, + bool enable_stats, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_veb *cmd = + (struct i40e_aqc_add_veb *)&desc.params.raw; + struct i40e_aqc_add_veb_completion *resp = + (struct i40e_aqc_add_veb_completion *)&desc.params.raw; + i40e_status status; + u16 veb_flags = 0; + + /* SEIDs need to either both be set or both be 0 for floating VEB */ + if (!!uplink_seid != !!downlink_seid) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_veb); + + cmd->uplink_seid = cpu_to_le16(uplink_seid); + cmd->downlink_seid = cpu_to_le16(downlink_seid); + cmd->enable_tcs = enabled_tc; + if (!uplink_seid) + veb_flags |= I40E_AQC_ADD_VEB_FLOATING; + if (default_port) + veb_flags |= I40E_AQC_ADD_VEB_PORT_TYPE_DEFAULT; + else + veb_flags |= I40E_AQC_ADD_VEB_PORT_TYPE_DATA; + + /* reverse logic here: set the bitflag to disable the stats */ + if (!enable_stats) + veb_flags |= I40E_AQC_ADD_VEB_ENABLE_DISABLE_STATS; + + cmd->veb_flags = cpu_to_le16(veb_flags); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status && veb_seid) + *veb_seid = le16_to_cpu(resp->veb_seid); + + return status; +} + +/** + * i40e_aq_get_veb_parameters - Retrieve VEB parameters + * @hw: pointer to the hw struct + * @veb_seid: the SEID of the VEB to query + * @switch_id: the uplink switch id + * @floating: set to true if the VEB is floating + * @statistic_index: index of the stats counter block for this VEB + * @vebs_used: number of VEB's used by function + * @vebs_free: total VEB's not reserved by any function + * @cmd_details: pointer to command details structure or NULL + * + * This retrieves the parameters for a particular VEB, specified by + * uplink_seid, and returns them to the caller. 
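Tying the VEB description above to the i40e_aq_add_veb() signature, here is a hedged one-line wrapper; the TC bitmap, default-port and stats choices are arbitrary example values and the wrapper name is made up.

#include "i40e_prototype.h"     /* assumed */

/* hypothetical: add a non-floating VEB for TC0 and return its SEID */
static i40e_status example_add_veb(struct i40e_hw *hw, u16 uplink_seid,
                                   u16 vsi_seid, u16 *veb_seid)
{
        /* both SEIDs non-zero here; passing both as 0 creates a floating VEB */
        return i40e_aq_add_veb(hw, uplink_seid, vsi_seid, BIT(0),
                               true, veb_seid, true, NULL);
}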
+ **/ +i40e_status i40e_aq_get_veb_parameters(struct i40e_hw *hw, + u16 veb_seid, u16 *switch_id, + bool *floating, u16 *statistic_index, + u16 *vebs_used, u16 *vebs_free, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_get_veb_parameters_completion *cmd_resp = + (struct i40e_aqc_get_veb_parameters_completion *) + &desc.params.raw; + i40e_status status; + + if (veb_seid == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_veb_parameters); + cmd_resp->seid = cpu_to_le16(veb_seid); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + if (status) + goto get_veb_exit; + + if (switch_id) + *switch_id = le16_to_cpu(cmd_resp->switch_id); + if (statistic_index) + *statistic_index = le16_to_cpu(cmd_resp->statistic_index); + if (vebs_used) + *vebs_used = le16_to_cpu(cmd_resp->vebs_used); + if (vebs_free) + *vebs_free = le16_to_cpu(cmd_resp->vebs_free); + if (floating) { + u16 flags = le16_to_cpu(cmd_resp->veb_flags); + + if (flags & I40E_AQC_ADD_VEB_FLOATING) + *floating = true; + else + *floating = false; + } + +get_veb_exit: + return status; +} + +/** + * i40e_aq_add_macvlan + * @hw: pointer to the hw struct + * @seid: VSI for the mac address + * @mv_list: list of macvlans to be added + * @count: length of the list + * @cmd_details: pointer to command details structure or NULL + * + * Add MAC/VLAN addresses to the HW filtering + **/ +i40e_status i40e_aq_add_macvlan(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_add_macvlan_element_data *mv_list, + u16 count, struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_macvlan *cmd = + (struct i40e_aqc_macvlan *)&desc.params.raw; + i40e_status status; + u16 buf_size; + int i; + + if (count == 0 || !mv_list || !hw) + return I40E_ERR_PARAM; + + buf_size = count * sizeof(*mv_list); + + /* prep the rest of the request */ + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_macvlan); + cmd->num_addresses = cpu_to_le16(count); + cmd->seid[0] = cpu_to_le16(I40E_AQC_MACVLAN_CMD_SEID_VALID | seid); + cmd->seid[1] = 0; + cmd->seid[2] = 0; + + for (i = 0; i < count; i++) + if (is_multicast_ether_addr(mv_list[i].mac_addr)) + mv_list[i].flags |= + cpu_to_le16(I40E_AQC_MACVLAN_ADD_USE_SHARED_MAC); + + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buf_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, mv_list, buf_size, + cmd_details); + + return status; +} + +/** + * i40e_aq_remove_macvlan + * @hw: pointer to the hw struct + * @seid: VSI for the mac address + * @mv_list: list of macvlans to be removed + * @count: length of the list + * @cmd_details: pointer to command details structure or NULL + * + * Remove MAC/VLAN addresses from the HW filtering + **/ +i40e_status i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_remove_macvlan_element_data *mv_list, + u16 count, struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_macvlan *cmd = + (struct i40e_aqc_macvlan *)&desc.params.raw; + i40e_status status; + u16 buf_size; + + if (count == 0 || !mv_list || !hw) + return I40E_ERR_PARAM; + + buf_size = count * sizeof(*mv_list); + + /* prep the rest of the request */ + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_remove_macvlan); + cmd->num_addresses = cpu_to_le16(count); + cmd->seid[0] = cpu_to_le16(I40E_AQC_MACVLAN_CMD_SEID_VALID | seid); + cmd->seid[1] = 0; + 
cmd->seid[2] = 0; + + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (buf_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, mv_list, buf_size, + cmd_details); + + return status; +} + +/** + * i40e_mirrorrule_op - Internal helper function to add/delete mirror rule + * @hw: pointer to the hw struct + * @opcode: AQ opcode for add or delete mirror rule + * @sw_seid: Switch SEID (to which rule refers) + * @rule_type: Rule Type (ingress/egress/VLAN) + * @id: Destination VSI SEID or Rule ID + * @count: length of the list + * @mr_list: list of mirrored VSI SEIDs or VLAN IDs + * @cmd_details: pointer to command details structure or NULL + * @rule_id: Rule ID returned from FW + * @rule_used: Number of rules used in internal switch + * @rule_free: Number of rules free in internal switch + * + * Add/Delete a mirror rule to a specific switch. Mirror rules are supported for + * VEBs/VEPA elements only + **/ +static i40e_status i40e_mirrorrule_op(struct i40e_hw *hw, + u16 opcode, u16 sw_seid, u16 rule_type, u16 id, + u16 count, __le16 *mr_list, + struct i40e_asq_cmd_details *cmd_details, + u16 *rule_id, u16 *rules_used, u16 *rules_free) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_delete_mirror_rule *cmd = + (struct i40e_aqc_add_delete_mirror_rule *)&desc.params.raw; + struct i40e_aqc_add_delete_mirror_rule_completion *resp = + (struct i40e_aqc_add_delete_mirror_rule_completion *)&desc.params.raw; + i40e_status status; + u16 buf_size; + + buf_size = count * sizeof(*mr_list); + + /* prep the rest of the request */ + i40e_fill_default_direct_cmd_desc(&desc, opcode); + cmd->seid = cpu_to_le16(sw_seid); + cmd->rule_type = cpu_to_le16(rule_type & + I40E_AQC_MIRROR_RULE_TYPE_MASK); + cmd->num_entries = cpu_to_le16(count); + /* Dest VSI for add, rule_id for delete */ + cmd->destination = cpu_to_le16(id); + if (mr_list) { + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | + I40E_AQ_FLAG_RD)); + if (buf_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + } + + status = i40e_asq_send_command(hw, &desc, mr_list, buf_size, + cmd_details); + if (!status || + hw->aq.asq_last_status == I40E_AQ_RC_ENOSPC) { + if (rule_id) + *rule_id = le16_to_cpu(resp->rule_id); + if (rules_used) + *rules_used = le16_to_cpu(resp->mirror_rules_used); + if (rules_free) + *rules_free = le16_to_cpu(resp->mirror_rules_free); + } + return status; +} + +/** + * i40e_aq_add_mirrorrule - add a mirror rule + * @hw: pointer to the hw struct + * @sw_seid: Switch SEID (to which rule refers) + * @rule_type: Rule Type (ingress/egress/VLAN) + * @dest_vsi: SEID of VSI to which packets will be mirrored + * @count: length of the list + * @mr_list: list of mirrored VSI SEIDs or VLAN IDs + * @cmd_details: pointer to command details structure or NULL + * @rule_id: Rule ID returned from FW + * @rule_used: Number of rules used in internal switch + * @rule_free: Number of rules free in internal switch + * + * Add mirror rule. 
Mirror rules are supported for VEBs or VEPA elements only + **/ +i40e_status i40e_aq_add_mirrorrule(struct i40e_hw *hw, u16 sw_seid, + u16 rule_type, u16 dest_vsi, u16 count, __le16 *mr_list, + struct i40e_asq_cmd_details *cmd_details, + u16 *rule_id, u16 *rules_used, u16 *rules_free) +{ + if (!(rule_type == I40E_AQC_MIRROR_RULE_TYPE_ALL_INGRESS || + rule_type == I40E_AQC_MIRROR_RULE_TYPE_ALL_EGRESS)) { + if (count == 0 || !mr_list) + return I40E_ERR_PARAM; + } + + return i40e_mirrorrule_op(hw, i40e_aqc_opc_add_mirror_rule, sw_seid, + rule_type, dest_vsi, count, mr_list, + cmd_details, rule_id, rules_used, rules_free); +} + +/** + * i40e_aq_delete_mirrorrule - delete a mirror rule + * @hw: pointer to the hw struct + * @sw_seid: Switch SEID (to which rule refers) + * @rule_type: Rule Type (ingress/egress/VLAN) + * @count: length of the list + * @rule_id: Rule ID that is returned in the receive desc as part of + * add_mirrorrule. + * @mr_list: list of mirrored VLAN IDs to be removed + * @cmd_details: pointer to command details structure or NULL + * @rule_used: Number of rules used in internal switch + * @rule_free: Number of rules free in internal switch + * + * Delete a mirror rule. Mirror rules are supported for VEBs/VEPA elements only + **/ +i40e_status i40e_aq_delete_mirrorrule(struct i40e_hw *hw, u16 sw_seid, + u16 rule_type, u16 rule_id, u16 count, __le16 *mr_list, + struct i40e_asq_cmd_details *cmd_details, + u16 *rules_used, u16 *rules_free) +{ + /* Rule ID has to be valid except rule_type: INGRESS VLAN mirroring */ + if (rule_type == I40E_AQC_MIRROR_RULE_TYPE_VLAN) { + /* count and mr_list shall be valid for rule_type INGRESS VLAN + * mirroring. For other rule_type, count and rule_type should + * not matter. + */ + if (count == 0 || !mr_list) + return I40E_ERR_PARAM; + } + + return i40e_mirrorrule_op(hw, i40e_aqc_opc_delete_mirror_rule, sw_seid, + rule_type, rule_id, count, mr_list, + cmd_details, NULL, rules_used, rules_free); +} + +/** + * i40e_aq_send_msg_to_vf + * @hw: pointer to the hardware structure + * @vfid: VF id to send msg + * @v_opcode: opcodes for VF-PF communication + * @v_retval: return error code + * @msg: pointer to the msg buffer + * @msglen: msg length + * @cmd_details: pointer to command details + * + * send msg to vf + **/ +i40e_status i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid, + u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_pf_vf_message *cmd = + (struct i40e_aqc_pf_vf_message *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_send_msg_to_vf); + cmd->id = cpu_to_le32(vfid); + desc.cookie_high = cpu_to_le32(v_opcode); + desc.cookie_low = cpu_to_le32(v_retval); + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_SI); + if (msglen) { + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | + I40E_AQ_FLAG_RD)); + if (msglen > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + desc.datalen = cpu_to_le16(msglen); + } + status = i40e_asq_send_command(hw, &desc, msg, msglen, cmd_details); + + return status; +} + +/** + * i40e_aq_debug_read_register + * @hw: pointer to the hw struct + * @reg_addr: register address + * @reg_val: register value + * @cmd_details: pointer to command details structure or NULL + * + * Read the register using the admin queue commands + **/ +i40e_status i40e_aq_debug_read_register(struct i40e_hw *hw, + u32 reg_addr, u64 *reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + 
struct i40e_aq_desc desc; + struct i40e_aqc_debug_reg_read_write *cmd_resp = + (struct i40e_aqc_debug_reg_read_write *)&desc.params.raw; + i40e_status status; + + if (reg_val == NULL) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_debug_read_reg); + + cmd_resp->address = cpu_to_le32(reg_addr); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status) { + *reg_val = ((u64)le32_to_cpu(cmd_resp->value_high) << 32) | + (u64)le32_to_cpu(cmd_resp->value_low); + } + + return status; +} + +/** + * i40e_aq_debug_write_register + * @hw: pointer to the hw struct + * @reg_addr: register address + * @reg_val: register value + * @cmd_details: pointer to command details structure or NULL + * + * Write to a register using the admin queue commands + **/ +i40e_status i40e_aq_debug_write_register(struct i40e_hw *hw, + u32 reg_addr, u64 reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_debug_reg_read_write *cmd = + (struct i40e_aqc_debug_reg_read_write *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_debug_write_reg); + + cmd->address = cpu_to_le32(reg_addr); + cmd->value_high = cpu_to_le32((u32)(reg_val >> 32)); + cmd->value_low = cpu_to_le32((u32)(reg_val & 0xFFFFFFFF)); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_request_resource + * @hw: pointer to the hw struct + * @resource: resource id + * @access: access type + * @sdp_number: resource number + * @timeout: the maximum time in ms that the driver may hold the resource + * @cmd_details: pointer to command details structure or NULL + * + * requests common resource using the admin queue commands + **/ +i40e_status i40e_aq_request_resource(struct i40e_hw *hw, + enum i40e_aq_resources_ids resource, + enum i40e_aq_resource_access_type access, + u8 sdp_number, u64 *timeout, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_request_resource *cmd_resp = + (struct i40e_aqc_request_resource *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_request_resource); + + cmd_resp->resource_id = cpu_to_le16(resource); + cmd_resp->access_type = cpu_to_le16(access); + cmd_resp->resource_number = cpu_to_le32(sdp_number); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + /* The completion specifies the maximum time in ms that the driver + * may hold the resource in the Timeout field. + * If the resource is held by someone else, the command completes with + * busy return value and the timeout field indicates the maximum time + * the current owner of the resource has to free it. 
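The debug register read/write pair above addresses registers by absolute offset through the admin queue; a short helper with an assumed name that dumps a single register could look like the following.

#include "i40e_prototype.h"     /* assumed */

/* hypothetical: read one CSR via the AQ debug command and log it */
static void example_dump_reg(struct i40e_hw *hw, u32 reg_addr)
{
        u64 val = 0;

        if (i40e_aq_debug_read_register(hw, reg_addr, &val, NULL))
                return;

        hw_dbg(hw, "reg 0x%08x = 0x%016llx\n", reg_addr, val);
}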
+ */ + if (!status || hw->aq.asq_last_status == I40E_AQ_RC_EBUSY) + *timeout = le32_to_cpu(cmd_resp->timeout); + + return status; +} + +/** + * i40e_aq_release_resource + * @hw: pointer to the hw struct + * @resource: resource id + * @sdp_number: resource number + * @cmd_details: pointer to command details structure or NULL + * + * release common resource using the admin queue commands + **/ +i40e_status i40e_aq_release_resource(struct i40e_hw *hw, + enum i40e_aq_resources_ids resource, + u8 sdp_number, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_request_resource *cmd = + (struct i40e_aqc_request_resource *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_release_resource); + + cmd->resource_id = cpu_to_le16(resource); + cmd->resource_number = cpu_to_le32(sdp_number); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_read_nvm + * @hw: pointer to the hw struct + * @module_pointer: module pointer location in words from the NVM beginning + * @offset: byte offset from the module beginning + * @length: length of the section to be read (in bytes from the offset) + * @data: command buffer (size [bytes] = length) + * @last_command: tells if this is the last command in a series + * @cmd_details: pointer to command details structure or NULL + * + * Read the NVM using the admin queue commands + **/ +i40e_status i40e_aq_read_nvm(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 length, void *data, + bool last_command, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_nvm_update *cmd = + (struct i40e_aqc_nvm_update *)&desc.params.raw; + i40e_status status; + + /* In offset the highest byte must be zeroed. */ + if (offset & 0xFF000000) { + status = I40E_ERR_PARAM; + goto i40e_aq_read_nvm_exit; + } + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_read); + + /* If this is the last command in a series, set the proper flag. */ + if (last_command) + cmd->command_flags |= I40E_AQ_NVM_LAST_CMD; + cmd->module_pointer = module_pointer; + cmd->offset = cpu_to_le32(offset); + cmd->length = cpu_to_le16(length); + + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + if (length > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, data, length, cmd_details); + +i40e_aq_read_nvm_exit: + return status; +} + +/** + * i40e_aq_erase_nvm + * @hw: pointer to the hw struct + * @module_pointer: module pointer location in words from the NVM beginning + * @offset: offset in the module (expressed in 4 KB from module's beginning) + * @length: length of the section to be erased (expressed in 4 KB) + * @last_command: tells if this is the last command in a series + * @cmd_details: pointer to command details structure or NULL + * + * Erase the NVM sector using the admin queue commands + **/ +i40e_status i40e_aq_erase_nvm(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 length, bool last_command, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_nvm_update *cmd = + (struct i40e_aqc_nvm_update *)&desc.params.raw; + i40e_status status; + + /* In offset the highest byte must be zeroed. 
*/ + if (offset & 0xFF000000) { + status = I40E_ERR_PARAM; + goto i40e_aq_erase_nvm_exit; + } + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_erase); + + /* If this is the last command in a series, set the proper flag. */ + if (last_command) + cmd->command_flags |= I40E_AQ_NVM_LAST_CMD; + cmd->module_pointer = module_pointer; + cmd->offset = cpu_to_le32(offset); + cmd->length = cpu_to_le16(length); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + +i40e_aq_erase_nvm_exit: + return status; +} + +/** + * i40e_parse_discover_capabilities + * @hw: pointer to the hw struct + * @buff: pointer to a buffer containing device/function capability records + * @cap_count: number of capability records in the list + * @list_type_opc: type of capabilities list to parse + * + * Parse the device/function capabilities list. + **/ +static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff, + u32 cap_count, + enum i40e_admin_queue_opc list_type_opc) +{ + struct i40e_aqc_list_capabilities_element_resp *cap; + u32 valid_functions, num_functions; + u32 number, logical_id, phys_id; + struct i40e_hw_capabilities *p; + u16 id, ocp_cfg_word0; + i40e_status status; + u8 major_rev; + u32 i = 0; + + cap = (struct i40e_aqc_list_capabilities_element_resp *) buff; + + if (list_type_opc == i40e_aqc_opc_list_dev_capabilities) + p = &hw->dev_caps; + else if (list_type_opc == i40e_aqc_opc_list_func_capabilities) + p = &hw->func_caps; + else + return; + + for (i = 0; i < cap_count; i++, cap++) { + id = le16_to_cpu(cap->id); + number = le32_to_cpu(cap->number); + logical_id = le32_to_cpu(cap->logical_id); + phys_id = le32_to_cpu(cap->phys_id); + major_rev = cap->major_rev; + + switch (id) { + case I40E_AQ_CAP_ID_SWITCH_MODE: + p->switch_mode = number; + break; + case I40E_AQ_CAP_ID_MNG_MODE: + p->management_mode = number; + if (major_rev > 1) { + p->mng_protocols_over_mctp = logical_id; + i40e_debug(hw, I40E_DEBUG_INIT, + "HW Capability: Protocols over MCTP = %d\n", + p->mng_protocols_over_mctp); + } else { + p->mng_protocols_over_mctp = 0; + } + break; + case I40E_AQ_CAP_ID_NPAR_ACTIVE: + p->npar_enable = number; + break; + case I40E_AQ_CAP_ID_OS2BMC_CAP: + p->os2bmc = number; + break; + case I40E_AQ_CAP_ID_FUNCTIONS_VALID: + p->valid_functions = number; + break; + case I40E_AQ_CAP_ID_SRIOV: + if (number == 1) + p->sr_iov_1_1 = true; + break; + case I40E_AQ_CAP_ID_VF: + p->num_vfs = number; + p->vf_base_id = logical_id; + break; + case I40E_AQ_CAP_ID_VMDQ: + if (number == 1) + p->vmdq = true; + break; + case I40E_AQ_CAP_ID_8021QBG: + if (number == 1) + p->evb_802_1_qbg = true; + break; + case I40E_AQ_CAP_ID_8021QBR: + if (number == 1) + p->evb_802_1_qbh = true; + break; + case I40E_AQ_CAP_ID_VSI: + p->num_vsis = number; + break; + case I40E_AQ_CAP_ID_DCB: + if (number == 1) { + p->dcb = true; + p->enabled_tcmap = logical_id; + p->maxtc = phys_id; + } + break; + case I40E_AQ_CAP_ID_FCOE: + if (number == 1) + p->fcoe = true; + break; + case I40E_AQ_CAP_ID_ISCSI: + if (number == 1) + p->iscsi = true; + break; + case I40E_AQ_CAP_ID_RSS: + p->rss = true; + p->rss_table_size = number; + p->rss_table_entry_width = logical_id; + break; + case I40E_AQ_CAP_ID_RXQ: + p->num_rx_qp = number; + p->base_queue = phys_id; + break; + case I40E_AQ_CAP_ID_TXQ: + p->num_tx_qp = number; + p->base_queue = phys_id; + break; + case I40E_AQ_CAP_ID_MSIX: + p->num_msix_vectors = number; + i40e_debug(hw, I40E_DEBUG_INIT, + "HW Capability: MSIX vector count = %d\n", + p->num_msix_vectors); + break; + case 
I40E_AQ_CAP_ID_VF_MSIX: + p->num_msix_vectors_vf = number; + break; + case I40E_AQ_CAP_ID_FLEX10: + if (major_rev == 1) { + if (number == 1) { + p->flex10_enable = true; + p->flex10_capable = true; + } + } else { + /* Capability revision >= 2 */ + if (number & 1) + p->flex10_enable = true; + if (number & 2) + p->flex10_capable = true; + } + p->flex10_mode = logical_id; + p->flex10_status = phys_id; + break; + case I40E_AQ_CAP_ID_CEM: + if (number == 1) + p->mgmt_cem = true; + break; + case I40E_AQ_CAP_ID_IWARP: + if (number == 1) + p->iwarp = true; + break; + case I40E_AQ_CAP_ID_LED: + if (phys_id < I40E_HW_CAP_MAX_GPIO) + p->led[phys_id] = true; + break; + case I40E_AQ_CAP_ID_SDP: + if (phys_id < I40E_HW_CAP_MAX_GPIO) + p->sdp[phys_id] = true; + break; + case I40E_AQ_CAP_ID_MDIO: + if (number == 1) { + p->mdio_port_num = phys_id; + p->mdio_port_mode = logical_id; + } + break; + case I40E_AQ_CAP_ID_1588: + if (number == 1) + p->ieee_1588 = true; + break; + case I40E_AQ_CAP_ID_FLOW_DIRECTOR: + p->fd = true; + p->fd_filters_guaranteed = number; + p->fd_filters_best_effort = logical_id; + break; + case I40E_AQ_CAP_ID_WSR_PROT: + p->wr_csr_prot = (u64)number; + p->wr_csr_prot |= (u64)logical_id << 32; + break; + case I40E_AQ_CAP_ID_NVM_MGMT: + if (number & I40E_NVM_MGMT_SEC_REV_DISABLED) + p->sec_rev_disabled = true; + if (number & I40E_NVM_MGMT_UPDATE_DISABLED) + p->update_disabled = true; + break; + default: + break; + } + } + + if (p->fcoe) + i40e_debug(hw, I40E_DEBUG_ALL, "device is FCoE capable\n"); + + /* Software override ensuring FCoE is disabled if npar or mfp + * mode because it is not supported in these modes. + */ + if (p->npar_enable || p->flex10_enable) + p->fcoe = false; + + /* count the enabled ports (aka the "not disabled" ports) */ + hw->num_ports = 0; + for (i = 0; i < 4; i++) { + u32 port_cfg_reg = I40E_PRTGEN_CNF + (4 * i); + u64 port_cfg = 0; + + /* use AQ read to get the physical register offset instead + * of the port relative offset + */ + i40e_aq_debug_read_register(hw, port_cfg_reg, &port_cfg, NULL); + if (!(port_cfg & I40E_PRTGEN_CNF_PORT_DIS_MASK)) + hw->num_ports++; + } + + /* OCP cards case: if a mezz is removed the Ethernet port is at + * disabled state in PRTGEN_CNF register. Additional NVM read is + * needed in order to check if we are dealing with OCP card. + * Those cards have 4 PFs at minimum, so using PRTGEN_CNF for counting + * physical ports results in wrong partition id calculation and thus + * not supporting WoL. 
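The OCP note above relies on the acquire/read/release NVM pattern; a rough sketch of that flow for a single word is shown below, with the helper name invented and i40e_acquire_nvm()/i40e_release_nvm() assumed to be the NVM ownership routines referenced in this file.

#include "i40e_prototype.h"     /* assumed */

/* hypothetical: read one 16-bit word from flat NVM space via the AQ */
static i40e_status example_read_nvm_word(struct i40e_hw *hw, u32 byte_offset,
                                         u16 *word)
{
        i40e_status status;

        status = i40e_acquire_nvm(hw, I40E_RESOURCE_READ);
        if (status)
                return status;

        /* module_pointer 0, single command so last_command = true */
        status = i40e_aq_read_nvm(hw, 0, byte_offset, sizeof(*word),
                                  word, true, NULL);
        i40e_release_nvm(hw);

        return status;
}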
+ */ + if (hw->mac.type == I40E_MAC_X722) { + if (!i40e_acquire_nvm(hw, I40E_RESOURCE_READ)) { + status = i40e_aq_read_nvm(hw, I40E_SR_EMP_MODULE_PTR, + 2 * I40E_SR_OCP_CFG_WORD0, + sizeof(ocp_cfg_word0), + &ocp_cfg_word0, true, NULL); + if (!status && + (ocp_cfg_word0 & I40E_SR_OCP_ENABLED)) + hw->num_ports = 4; + i40e_release_nvm(hw); + } + } + + valid_functions = p->valid_functions; + num_functions = 0; + while (valid_functions) { + if (valid_functions & 1) + num_functions++; + valid_functions >>= 1; + } + + /* partition id is 1-based, and functions are evenly spread + * across the ports as partitions + */ + if (hw->num_ports != 0) { + hw->partition_id = (hw->pf_id / hw->num_ports) + 1; + hw->num_partitions = num_functions / hw->num_ports; + } + + /* additional HW specific goodies that might + * someday be HW version specific + */ + p->rx_buf_chain_len = I40E_MAX_CHAINED_RX_BUFFERS; +} + +/** + * i40e_aq_discover_capabilities + * @hw: pointer to the hw struct + * @buff: a virtual buffer to hold the capabilities + * @buff_size: Size of the virtual buffer + * @data_size: Size of the returned data, or buff size needed if AQ err==ENOMEM + * @list_type_opc: capabilities type to discover - pass in the command opcode + * @cmd_details: pointer to command details structure or NULL + * + * Get the device capabilities descriptions from the firmware + **/ +i40e_status i40e_aq_discover_capabilities(struct i40e_hw *hw, + void *buff, u16 buff_size, u16 *data_size, + enum i40e_admin_queue_opc list_type_opc, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aqc_list_capabilites *cmd; + struct i40e_aq_desc desc; + i40e_status status = 0; + + cmd = (struct i40e_aqc_list_capabilites *)&desc.params.raw; + + if (list_type_opc != i40e_aqc_opc_list_func_capabilities && + list_type_opc != i40e_aqc_opc_list_dev_capabilities) { + status = I40E_ERR_PARAM; + goto exit; + } + + i40e_fill_default_direct_cmd_desc(&desc, list_type_opc); + + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + *data_size = le16_to_cpu(desc.datalen); + + if (status) + goto exit; + + i40e_parse_discover_capabilities(hw, buff, le32_to_cpu(cmd->count), + list_type_opc); + +exit: + return status; +} + +/** + * i40e_aq_update_nvm + * @hw: pointer to the hw struct + * @module_pointer: module pointer location in words from the NVM beginning + * @offset: byte offset from the module beginning + * @length: length of the section to be written (in bytes from the offset) + * @data: command buffer (size [bytes] = length) + * @last_command: tells if this is the last command in a series + * @preservation_flags: Preservation mode flags + * @cmd_details: pointer to command details structure or NULL + * + * Update the NVM using the admin queue commands + **/ +i40e_status i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 length, void *data, + bool last_command, u8 preservation_flags, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_nvm_update *cmd = + (struct i40e_aqc_nvm_update *)&desc.params.raw; + i40e_status status; + + /* In offset the highest byte must be zeroed. */ + if (offset & 0xFF000000) { + status = I40E_ERR_PARAM; + goto i40e_aq_update_nvm_exit; + } + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_update); + + /* If this is the last command in a series, set the proper flag. 
*/ + if (last_command) + cmd->command_flags |= I40E_AQ_NVM_LAST_CMD; + if (hw->mac.type == I40E_MAC_X722) { + if (preservation_flags == I40E_NVM_PRESERVATION_FLAGS_SELECTED) + cmd->command_flags |= + (I40E_AQ_NVM_PRESERVATION_FLAGS_SELECTED << + I40E_AQ_NVM_PRESERVATION_FLAGS_SHIFT); + else if (preservation_flags == I40E_NVM_PRESERVATION_FLAGS_ALL) + cmd->command_flags |= + (I40E_AQ_NVM_PRESERVATION_FLAGS_ALL << + I40E_AQ_NVM_PRESERVATION_FLAGS_SHIFT); + } + cmd->module_pointer = module_pointer; + cmd->offset = cpu_to_le32(offset); + cmd->length = cpu_to_le16(length); + + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + if (length > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, data, length, cmd_details); + +i40e_aq_update_nvm_exit: + return status; +} + +/** + * i40e_aq_get_lldp_mib + * @hw: pointer to the hw struct + * @bridge_type: type of bridge requested + * @mib_type: Local, Remote or both Local and Remote MIBs + * @buff: pointer to a user supplied buffer to store the MIB block + * @buff_size: size of the buffer (in bytes) + * @local_len : length of the returned Local LLDP MIB + * @remote_len: length of the returned Remote LLDP MIB + * @cmd_details: pointer to command details structure or NULL + * + * Requests the complete LLDP MIB (entire packet). + **/ +i40e_status i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type, + u8 mib_type, void *buff, u16 buff_size, + u16 *local_len, u16 *remote_len, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_get_mib *cmd = + (struct i40e_aqc_lldp_get_mib *)&desc.params.raw; + struct i40e_aqc_lldp_get_mib *resp = + (struct i40e_aqc_lldp_get_mib *)&desc.params.raw; + i40e_status status; + + if (buff_size == 0 || !buff) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_get_mib); + /* Indirect Command */ + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + + cmd->type = mib_type & I40E_AQ_LLDP_MIB_TYPE_MASK; + cmd->type |= ((bridge_type << I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT) & + I40E_AQ_LLDP_BRIDGE_TYPE_MASK); + + desc.datalen = cpu_to_le16(buff_size); + + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + if (!status) { + if (local_len != NULL) + *local_len = le16_to_cpu(resp->local_len); + if (remote_len != NULL) + *remote_len = le16_to_cpu(resp->remote_len); + } + + return status; +} + +/** + * i40e_aq_cfg_lldp_mib_change_event + * @hw: pointer to the hw struct + * @enable_update: Enable or Disable event posting + * @cmd_details: pointer to command details structure or NULL + * + * Enable or Disable posting of an event on ARQ when LLDP MIB + * associated with the interface changes + **/ +i40e_status i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw, + bool enable_update, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_update_mib *cmd = + (struct i40e_aqc_lldp_update_mib *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_update_mib); + + if (!enable_update) + cmd->command |= I40E_AQ_LLDP_MIB_UPDATE_DISABLE; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_stop_lldp + * @hw: pointer to the hw struct + * @shutdown_agent: True if LLDP Agent needs to 
be Shutdown + * @cmd_details: pointer to command details structure or NULL + * + * Stop or Shutdown the embedded LLDP Agent + **/ +i40e_status i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_stop *cmd = + (struct i40e_aqc_lldp_stop *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_stop); + + if (shutdown_agent) + cmd->command |= I40E_AQ_LLDP_AGENT_SHUTDOWN; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_start_lldp + * @hw: pointer to the hw struct + * @cmd_details: pointer to command details structure or NULL + * + * Start the embedded LLDP Agent on all ports. + **/ +i40e_status i40e_aq_start_lldp(struct i40e_hw *hw, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_lldp_start *cmd = + (struct i40e_aqc_lldp_start *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_start); + + cmd->command = I40E_AQ_LLDP_AGENT_START; + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_set_dcb_parameters + * @hw: pointer to the hw struct + * @dcb_enable: True if DCB configuration needs to be applied + * @cmd_details: pointer to command details structure or NULL + * + **/ +enum i40e_status_code +i40e_aq_set_dcb_parameters(struct i40e_hw *hw, bool dcb_enable, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_set_dcb_parameters *cmd = + (struct i40e_aqc_set_dcb_parameters *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_dcb_parameters); + + if (dcb_enable) { + cmd->valid_flags = I40E_DCB_VALID; + cmd->command = I40E_AQ_DCB_SET_AGENT; + } + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_get_cee_dcb_config + * @hw: pointer to the hw struct + * @buff: response buffer that stores CEE operational configuration + * @buff_size: size of the buffer passed + * @cmd_details: pointer to command details structure or NULL + * + * Get CEE DCBX mode operational configuration from firmware + **/ +i40e_status i40e_aq_get_cee_dcb_config(struct i40e_hw *hw, + void *buff, u16 buff_size, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + i40e_status status; + + if (buff_size == 0 || !buff) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_cee_dcb_cfg); + + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + status = i40e_asq_send_command(hw, &desc, (void *)buff, buff_size, + cmd_details); + + return status; +} + +/** + * i40e_aq_add_udp_tunnel + * @hw: pointer to the hw struct + * @udp_port: the UDP port to add in Host byte order + * @protocol_index: protocol index type + * @filter_index: pointer to filter index + * @cmd_details: pointer to command details structure or NULL + * + * Note: Firmware expects the udp_port value to be in Little Endian format, + * and this function will call cpu_to_le16 to convert from Host byte order to + * Little Endian order.
+ **/ +i40e_status i40e_aq_add_udp_tunnel(struct i40e_hw *hw, + u16 udp_port, u8 protocol_index, + u8 *filter_index, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_udp_tunnel *cmd = + (struct i40e_aqc_add_udp_tunnel *)&desc.params.raw; + struct i40e_aqc_del_udp_tunnel_completion *resp = + (struct i40e_aqc_del_udp_tunnel_completion *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_udp_tunnel); + + cmd->udp_port = cpu_to_le16(udp_port); + cmd->protocol_type = protocol_index; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status && filter_index) + *filter_index = resp->index; + + return status; +} + +/** + * i40e_aq_del_udp_tunnel + * @hw: pointer to the hw struct + * @index: filter index + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_del_udp_tunnel(struct i40e_hw *hw, u8 index, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_remove_udp_tunnel *cmd = + (struct i40e_aqc_remove_udp_tunnel *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_del_udp_tunnel); + + cmd->index = index; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_delete_element - Delete switch element + * @hw: pointer to the hw struct + * @seid: the SEID to delete from the switch + * @cmd_details: pointer to command details structure or NULL + * + * This deletes a switch element from the switch. + **/ +i40e_status i40e_aq_delete_element(struct i40e_hw *hw, u16 seid, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_switch_seid *cmd = + (struct i40e_aqc_switch_seid *)&desc.params.raw; + i40e_status status; + + if (seid == 0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_delete_element); + + cmd->seid = cpu_to_le16(seid); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_dcb_updated - DCB Updated Command + * @hw: pointer to the hw struct + * @cmd_details: pointer to command details structure or NULL + * + * EMP will return when the shared RPB settings have been + * recomputed and modified. The retval field in the descriptor + * will be set to 0 when RPB is modified. 
+ **/ +i40e_status i40e_aq_dcb_updated(struct i40e_hw *hw, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_dcb_updated); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_tx_sched_cmd - generic Tx scheduler AQ command handler + * @hw: pointer to the hw struct + * @seid: seid for the physical port/switching component/vsi + * @buff: Indirect buffer to hold data parameters and response + * @buff_size: Indirect buffer size + * @opcode: Tx scheduler AQ command opcode + * @cmd_details: pointer to command details structure or NULL + * + * Generic command handler for Tx scheduler AQ commands + **/ +static i40e_status i40e_aq_tx_sched_cmd(struct i40e_hw *hw, u16 seid, + void *buff, u16 buff_size, + enum i40e_admin_queue_opc opcode, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_tx_sched_ind *cmd = + (struct i40e_aqc_tx_sched_ind *)&desc.params.raw; + i40e_status status; + bool cmd_param_flag = false; + + switch (opcode) { + case i40e_aqc_opc_configure_vsi_ets_sla_bw_limit: + case i40e_aqc_opc_configure_vsi_tc_bw: + case i40e_aqc_opc_enable_switching_comp_ets: + case i40e_aqc_opc_modify_switching_comp_ets: + case i40e_aqc_opc_disable_switching_comp_ets: + case i40e_aqc_opc_configure_switching_comp_ets_bw_limit: + case i40e_aqc_opc_configure_switching_comp_bw_config: + cmd_param_flag = true; + break; + case i40e_aqc_opc_query_vsi_bw_config: + case i40e_aqc_opc_query_vsi_ets_sla_config: + case i40e_aqc_opc_query_switching_comp_ets_config: + case i40e_aqc_opc_query_port_ets_config: + case i40e_aqc_opc_query_switching_comp_bw_config: + cmd_param_flag = false; + break; + default: + return I40E_ERR_PARAM; + } + + i40e_fill_default_direct_cmd_desc(&desc, opcode); + + /* Indirect command */ + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + if (cmd_param_flag) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_RD); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + desc.datalen = cpu_to_le16(buff_size); + + cmd->vsi_seid = cpu_to_le16(seid); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + + return status; +} + +/** + * i40e_aq_config_vsi_bw_limit - Configure VSI BW Limit + * @hw: pointer to the hw struct + * @seid: VSI seid + * @credit: BW limit credits (0 = disabled) + * @max_credit: Max BW limit credits + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_config_vsi_bw_limit(struct i40e_hw *hw, + u16 seid, u16 credit, u8 max_credit, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_configure_vsi_bw_limit *cmd = + (struct i40e_aqc_configure_vsi_bw_limit *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_configure_vsi_bw_limit); + + cmd->vsi_seid = cpu_to_le16(seid); + cmd->credit = cpu_to_le16(credit); + cmd->max_credit = max_credit; + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_config_vsi_tc_bw - Config VSI BW Allocation per TC + * @hw: pointer to the hw struct + * @seid: VSI seid + * @bw_data: Buffer holding enabled TCs, relative TC BW limit/credits + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_config_vsi_tc_bw(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_configure_vsi_tc_bw_data 
*bw_data, + struct i40e_asq_cmd_details *cmd_details) +{ + return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data), + i40e_aqc_opc_configure_vsi_tc_bw, + cmd_details); +} + +/** + * i40e_aq_config_switch_comp_ets - Enable/Disable/Modify ETS on the port + * @hw: pointer to the hw struct + * @seid: seid of the switching component connected to Physical Port + * @ets_data: Buffer holding ETS parameters + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_config_switch_comp_ets(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_configure_switching_comp_ets_data *ets_data, + enum i40e_admin_queue_opc opcode, + struct i40e_asq_cmd_details *cmd_details) +{ + return i40e_aq_tx_sched_cmd(hw, seid, (void *)ets_data, + sizeof(*ets_data), opcode, cmd_details); +} + +/** + * i40e_aq_config_switch_comp_bw_config - Config Switch comp BW Alloc per TC + * @hw: pointer to the hw struct + * @seid: seid of the switching component + * @bw_data: Buffer holding enabled TCs, relative/absolute TC BW limit/credits + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_config_switch_comp_bw_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_configure_switching_comp_bw_config_data *bw_data, + struct i40e_asq_cmd_details *cmd_details) +{ + return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data), + i40e_aqc_opc_configure_switching_comp_bw_config, + cmd_details); +} + +/** + * i40e_aq_query_vsi_bw_config - Query VSI BW configuration + * @hw: pointer to the hw struct + * @seid: seid of the VSI + * @bw_data: Buffer to hold VSI BW configuration + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_query_vsi_bw_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_query_vsi_bw_config_resp *bw_data, + struct i40e_asq_cmd_details *cmd_details) +{ + return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data), + i40e_aqc_opc_query_vsi_bw_config, + cmd_details); +} + +/** + * i40e_aq_query_vsi_ets_sla_config - Query VSI BW configuration per TC + * @hw: pointer to the hw struct + * @seid: seid of the VSI + * @bw_data: Buffer to hold VSI BW configuration per TC + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_query_vsi_ets_sla_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_query_vsi_ets_sla_config_resp *bw_data, + struct i40e_asq_cmd_details *cmd_details) +{ + return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data), + i40e_aqc_opc_query_vsi_ets_sla_config, + cmd_details); +} + +/** + * i40e_aq_query_switch_comp_ets_config - Query Switch comp BW config per TC + * @hw: pointer to the hw struct + * @seid: seid of the switching component + * @bw_data: Buffer to hold switching component's per TC BW config + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_query_switch_comp_ets_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_query_switching_comp_ets_config_resp *bw_data, + struct i40e_asq_cmd_details *cmd_details) +{ + return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data), + i40e_aqc_opc_query_switching_comp_ets_config, + cmd_details); +} + +/** + * i40e_aq_query_port_ets_config - Query Physical Port ETS configuration + * @hw: pointer to the hw struct + * @seid: seid of the VSI or switching component connected to Physical Port + * @bw_data: Buffer to hold current ETS configuration for the Physical Port + * @cmd_details: pointer to command details 
structure or NULL + **/ +i40e_status i40e_aq_query_port_ets_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_query_port_ets_config_resp *bw_data, + struct i40e_asq_cmd_details *cmd_details) +{ + return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data), + i40e_aqc_opc_query_port_ets_config, + cmd_details); +} + +/** + * i40e_aq_query_switch_comp_bw_config - Query Switch comp BW configuration + * @hw: pointer to the hw struct + * @seid: seid of the switching component + * @bw_data: Buffer to hold switching component's BW configuration + * @cmd_details: pointer to command details structure or NULL + **/ +i40e_status i40e_aq_query_switch_comp_bw_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_query_switching_comp_bw_config_resp *bw_data, + struct i40e_asq_cmd_details *cmd_details) +{ + return i40e_aq_tx_sched_cmd(hw, seid, (void *)bw_data, sizeof(*bw_data), + i40e_aqc_opc_query_switching_comp_bw_config, + cmd_details); +} + +/** + * i40e_validate_filter_settings + * @hw: pointer to the hardware structure + * @settings: Filter control settings + * + * Check and validate the filter control settings passed. + * The function checks for the valid filter/context sizes being + * passed for FCoE and PE. + * + * Returns 0 if the values passed are valid and within + * range else returns an error. + **/ +static i40e_status i40e_validate_filter_settings(struct i40e_hw *hw, + struct i40e_filter_control_settings *settings) +{ + u32 fcoe_cntx_size, fcoe_filt_size; + u32 pe_cntx_size, pe_filt_size; + u32 fcoe_fmax; + u32 val; + + /* Validate FCoE settings passed */ + switch (settings->fcoe_filt_num) { + case I40E_HASH_FILTER_SIZE_1K: + case I40E_HASH_FILTER_SIZE_2K: + case I40E_HASH_FILTER_SIZE_4K: + case I40E_HASH_FILTER_SIZE_8K: + case I40E_HASH_FILTER_SIZE_16K: + case I40E_HASH_FILTER_SIZE_32K: + fcoe_filt_size = I40E_HASH_FILTER_BASE_SIZE; + fcoe_filt_size <<= (u32)settings->fcoe_filt_num; + break; + default: + return I40E_ERR_PARAM; + } + + switch (settings->fcoe_cntx_num) { + case I40E_DMA_CNTX_SIZE_512: + case I40E_DMA_CNTX_SIZE_1K: + case I40E_DMA_CNTX_SIZE_2K: + case I40E_DMA_CNTX_SIZE_4K: + fcoe_cntx_size = I40E_DMA_CNTX_BASE_SIZE; + fcoe_cntx_size <<= (u32)settings->fcoe_cntx_num; + break; + default: + return I40E_ERR_PARAM; + } + + /* Validate PE settings passed */ + switch (settings->pe_filt_num) { + case I40E_HASH_FILTER_SIZE_1K: + case I40E_HASH_FILTER_SIZE_2K: + case I40E_HASH_FILTER_SIZE_4K: + case I40E_HASH_FILTER_SIZE_8K: + case I40E_HASH_FILTER_SIZE_16K: + case I40E_HASH_FILTER_SIZE_32K: + case I40E_HASH_FILTER_SIZE_64K: + case I40E_HASH_FILTER_SIZE_128K: + case I40E_HASH_FILTER_SIZE_256K: + case I40E_HASH_FILTER_SIZE_512K: + case I40E_HASH_FILTER_SIZE_1M: + pe_filt_size = I40E_HASH_FILTER_BASE_SIZE; + pe_filt_size <<= (u32)settings->pe_filt_num; + break; + default: + return I40E_ERR_PARAM; + } + + switch (settings->pe_cntx_num) { + case I40E_DMA_CNTX_SIZE_512: + case I40E_DMA_CNTX_SIZE_1K: + case I40E_DMA_CNTX_SIZE_2K: + case I40E_DMA_CNTX_SIZE_4K: + case I40E_DMA_CNTX_SIZE_8K: + case I40E_DMA_CNTX_SIZE_16K: + case I40E_DMA_CNTX_SIZE_32K: + case I40E_DMA_CNTX_SIZE_64K: + case I40E_DMA_CNTX_SIZE_128K: + case I40E_DMA_CNTX_SIZE_256K: + pe_cntx_size = I40E_DMA_CNTX_BASE_SIZE; + pe_cntx_size <<= (u32)settings->pe_cntx_num; + break; + default: + return I40E_ERR_PARAM; + } + + /* FCHSIZE + FCDSIZE should not be greater than PMFCOEFMAX */ + val = rd32(hw, I40E_GLHMC_FCOEFMAX); + fcoe_fmax = (val & I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_MASK) + >> 
I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_SHIFT; + if (fcoe_filt_size + fcoe_cntx_size > fcoe_fmax) + return I40E_ERR_INVALID_SIZE; + + return 0; +} + +/** + * i40e_set_filter_control + * @hw: pointer to the hardware structure + * @settings: Filter control settings + * + * Set the Queue Filters for PE/FCoE and enable filters required + * for a single PF. It is expected that these settings are programmed + * at the driver initialization time. + **/ +i40e_status i40e_set_filter_control(struct i40e_hw *hw, + struct i40e_filter_control_settings *settings) +{ + i40e_status ret = 0; + u32 hash_lut_size = 0; + u32 val; + + if (!settings) + return I40E_ERR_PARAM; + + /* Validate the input settings */ + ret = i40e_validate_filter_settings(hw, settings); + if (ret) + return ret; + + /* Read the PF Queue Filter control register */ + val = i40e_read_rx_ctl(hw, I40E_PFQF_CTL_0); + + /* Program required PE hash buckets for the PF */ + val &= ~I40E_PFQF_CTL_0_PEHSIZE_MASK; + val |= ((u32)settings->pe_filt_num << I40E_PFQF_CTL_0_PEHSIZE_SHIFT) & + I40E_PFQF_CTL_0_PEHSIZE_MASK; + /* Program required PE contexts for the PF */ + val &= ~I40E_PFQF_CTL_0_PEDSIZE_MASK; + val |= ((u32)settings->pe_cntx_num << I40E_PFQF_CTL_0_PEDSIZE_SHIFT) & + I40E_PFQF_CTL_0_PEDSIZE_MASK; + + /* Program required FCoE hash buckets for the PF */ + val &= ~I40E_PFQF_CTL_0_PFFCHSIZE_MASK; + val |= ((u32)settings->fcoe_filt_num << + I40E_PFQF_CTL_0_PFFCHSIZE_SHIFT) & + I40E_PFQF_CTL_0_PFFCHSIZE_MASK; + /* Program required FCoE DDP contexts for the PF */ + val &= ~I40E_PFQF_CTL_0_PFFCDSIZE_MASK; + val |= ((u32)settings->fcoe_cntx_num << + I40E_PFQF_CTL_0_PFFCDSIZE_SHIFT) & + I40E_PFQF_CTL_0_PFFCDSIZE_MASK; + + /* Program Hash LUT size for the PF */ + val &= ~I40E_PFQF_CTL_0_HASHLUTSIZE_MASK; + if (settings->hash_lut_size == I40E_HASH_LUT_SIZE_512) + hash_lut_size = 1; + val |= (hash_lut_size << I40E_PFQF_CTL_0_HASHLUTSIZE_SHIFT) & + I40E_PFQF_CTL_0_HASHLUTSIZE_MASK; + + /* Enable FDIR, Ethertype and MACVLAN filters for PF and VFs */ + if (settings->enable_fdir) + val |= I40E_PFQF_CTL_0_FD_ENA_MASK; + if (settings->enable_ethtype) + val |= I40E_PFQF_CTL_0_ETYPE_ENA_MASK; + if (settings->enable_macvlan) + val |= I40E_PFQF_CTL_0_MACVLAN_ENA_MASK; + + i40e_write_rx_ctl(hw, I40E_PFQF_CTL_0, val); + + return 0; +} + +/** + * i40e_aq_add_rem_control_packet_filter - Add or Remove Control Packet Filter + * @hw: pointer to the hw struct + * @mac_addr: MAC address to use in the filter + * @ethtype: Ethertype to use in the filter + * @flags: Flags that needs to be applied to the filter + * @vsi_seid: seid of the control VSI + * @queue: VSI queue number to send the packet to + * @is_add: Add control packet filter if True else remove + * @stats: Structure to hold information on control filter counts + * @cmd_details: pointer to command details structure or NULL + * + * This command will Add or Remove control packet filter for a control VSI. + * In return it will update the total number of perfect filter count in + * the stats member. 
+ **/ +i40e_status i40e_aq_add_rem_control_packet_filter(struct i40e_hw *hw, + u8 *mac_addr, u16 ethtype, u16 flags, + u16 vsi_seid, u16 queue, bool is_add, + struct i40e_control_filter_stats *stats, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_remove_control_packet_filter *cmd = + (struct i40e_aqc_add_remove_control_packet_filter *) + &desc.params.raw; + struct i40e_aqc_add_remove_control_packet_filter_completion *resp = + (struct i40e_aqc_add_remove_control_packet_filter_completion *) + &desc.params.raw; + i40e_status status; + + if (vsi_seid == 0) + return I40E_ERR_PARAM; + + if (is_add) { + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_add_control_packet_filter); + cmd->queue = cpu_to_le16(queue); + } else { + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_remove_control_packet_filter); + } + + if (mac_addr) + ether_addr_copy(cmd->mac, mac_addr); + + cmd->etype = cpu_to_le16(ethtype); + cmd->flags = cpu_to_le16(flags); + cmd->seid = cpu_to_le16(vsi_seid); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (!status && stats) { + stats->mac_etype_used = le16_to_cpu(resp->mac_etype_used); + stats->etype_used = le16_to_cpu(resp->etype_used); + stats->mac_etype_free = le16_to_cpu(resp->mac_etype_free); + stats->etype_free = le16_to_cpu(resp->etype_free); + } + + return status; +} + +/** + * i40e_add_filter_to_drop_tx_flow_control_frames- filter to drop flow control + * @hw: pointer to the hw struct + * @seid: VSI seid to add ethertype filter from + **/ +#define I40E_FLOW_CONTROL_ETHTYPE 0x8808 +void i40e_add_filter_to_drop_tx_flow_control_frames(struct i40e_hw *hw, + u16 seid) +{ + u16 flag = I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC | + I40E_AQC_ADD_CONTROL_PACKET_FLAGS_DROP | + I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TX; + u16 ethtype = I40E_FLOW_CONTROL_ETHTYPE; + i40e_status status; + + status = i40e_aq_add_rem_control_packet_filter(hw, NULL, ethtype, flag, + seid, 0, true, NULL, + NULL); + if (status) + hw_dbg(hw, "Ethtype Filter Add failed: Error pruning Tx flow control frames\n"); +} + +/** + * i40e_aq_alternate_read + * @hw: pointer to the hardware structure + * @reg_addr0: address of first dword to be read + * @reg_val0: pointer for data read from 'reg_addr0' + * @reg_addr1: address of second dword to be read + * @reg_val1: pointer for data read from 'reg_addr1' + * + * Read one or two dwords from alternate structure. Fields are indicated + * by 'reg_addr0' and 'reg_addr1' register numbers. If 'reg_val1' pointer + * is not passed then only register at 'reg_addr0' is read. 
+ * + **/ +static i40e_status i40e_aq_alternate_read(struct i40e_hw *hw, + u32 reg_addr0, u32 *reg_val0, + u32 reg_addr1, u32 *reg_val1) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_alternate_write *cmd_resp = + (struct i40e_aqc_alternate_write *)&desc.params.raw; + i40e_status status; + + if (!reg_val0) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_alternate_read); + cmd_resp->address0 = cpu_to_le32(reg_addr0); + cmd_resp->address1 = cpu_to_le32(reg_addr1); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL); + + if (!status) { + *reg_val0 = le32_to_cpu(cmd_resp->data0); + + if (reg_val1) + *reg_val1 = le32_to_cpu(cmd_resp->data1); + } + + return status; +} + +/** + * i40e_aq_resume_port_tx + * @hw: pointer to the hardware structure + * @cmd_details: pointer to command details structure or NULL + * + * Resume port's Tx traffic + **/ +i40e_status i40e_aq_resume_port_tx(struct i40e_hw *hw, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_resume_port_tx); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_set_pci_config_data - store PCI bus info + * @hw: pointer to hardware structure + * @link_status: the link status word from PCI config space + * + * Stores the PCI bus info (speed, width, type) within the i40e_hw structure + **/ +void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status) +{ + hw->bus.type = i40e_bus_type_pci_express; + + switch (link_status & PCI_EXP_LNKSTA_NLW) { + case PCI_EXP_LNKSTA_NLW_X1: + hw->bus.width = i40e_bus_width_pcie_x1; + break; + case PCI_EXP_LNKSTA_NLW_X2: + hw->bus.width = i40e_bus_width_pcie_x2; + break; + case PCI_EXP_LNKSTA_NLW_X4: + hw->bus.width = i40e_bus_width_pcie_x4; + break; + case PCI_EXP_LNKSTA_NLW_X8: + hw->bus.width = i40e_bus_width_pcie_x8; + break; + default: + hw->bus.width = i40e_bus_width_unknown; + break; + } + + switch (link_status & PCI_EXP_LNKSTA_CLS) { + case PCI_EXP_LNKSTA_CLS_2_5GB: + hw->bus.speed = i40e_bus_speed_2500; + break; + case PCI_EXP_LNKSTA_CLS_5_0GB: + hw->bus.speed = i40e_bus_speed_5000; + break; + case PCI_EXP_LNKSTA_CLS_8_0GB: + hw->bus.speed = i40e_bus_speed_8000; + break; + default: + hw->bus.speed = i40e_bus_speed_unknown; + break; + } +} + +/** + * i40e_aq_debug_dump + * @hw: pointer to the hardware structure + * @cluster_id: specific cluster to dump + * @table_id: table id within cluster + * @start_index: index of line in the block to read + * @buff_size: dump buffer size + * @buff: dump buffer + * @ret_buff_size: actual buffer size returned + * @ret_next_table: next block to read + * @ret_next_index: next index to read + * + * Dump internal FW/HW data for debug purposes. 
+ * + **/ +i40e_status i40e_aq_debug_dump(struct i40e_hw *hw, u8 cluster_id, + u8 table_id, u32 start_index, u16 buff_size, + void *buff, u16 *ret_buff_size, + u8 *ret_next_table, u32 *ret_next_index, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_debug_dump_internals *cmd = + (struct i40e_aqc_debug_dump_internals *)&desc.params.raw; + struct i40e_aqc_debug_dump_internals *resp = + (struct i40e_aqc_debug_dump_internals *)&desc.params.raw; + i40e_status status; + + if (buff_size == 0 || !buff) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_debug_dump_internals); + /* Indirect Command */ + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + cmd->cluster_id = cluster_id; + cmd->table_id = table_id; + cmd->idx = cpu_to_le32(start_index); + + desc.datalen = cpu_to_le16(buff_size); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + if (!status) { + if (ret_buff_size) + *ret_buff_size = le16_to_cpu(desc.datalen); + if (ret_next_table) + *ret_next_table = resp->table_id; + if (ret_next_index) + *ret_next_index = le32_to_cpu(resp->idx); + } + + return status; +} + +/** + * i40e_read_bw_from_alt_ram + * @hw: pointer to the hardware structure + * @max_bw: pointer for max_bw read + * @min_bw: pointer for min_bw read + * @min_valid: pointer for bool that is true if min_bw is a valid value + * @max_valid: pointer for bool that is true if max_bw is a valid value + * + * Read bw from the alternate ram for the given pf + **/ +i40e_status i40e_read_bw_from_alt_ram(struct i40e_hw *hw, + u32 *max_bw, u32 *min_bw, + bool *min_valid, bool *max_valid) +{ + i40e_status status; + u32 max_bw_addr, min_bw_addr; + + /* Calculate the address of the min/max bw registers */ + max_bw_addr = I40E_ALT_STRUCT_FIRST_PF_OFFSET + + I40E_ALT_STRUCT_MAX_BW_OFFSET + + (I40E_ALT_STRUCT_DWORDS_PER_PF * hw->pf_id); + min_bw_addr = I40E_ALT_STRUCT_FIRST_PF_OFFSET + + I40E_ALT_STRUCT_MIN_BW_OFFSET + + (I40E_ALT_STRUCT_DWORDS_PER_PF * hw->pf_id); + + /* Read the bandwidths from alt ram */ + status = i40e_aq_alternate_read(hw, max_bw_addr, max_bw, + min_bw_addr, min_bw); + + if (*min_bw & I40E_ALT_BW_VALID_MASK) + *min_valid = true; + else + *min_valid = false; + + if (*max_bw & I40E_ALT_BW_VALID_MASK) + *max_valid = true; + else + *max_valid = false; + + return status; +} + +/** + * i40e_aq_configure_partition_bw + * @hw: pointer to the hardware structure + * @bw_data: Buffer holding valid pfs and bw limits + * @cmd_details: pointer to command details + * + * Configure partitions guaranteed/max bw + **/ +i40e_status i40e_aq_configure_partition_bw(struct i40e_hw *hw, + struct i40e_aqc_configure_partition_bw_data *bw_data, + struct i40e_asq_cmd_details *cmd_details) +{ + i40e_status status; + struct i40e_aq_desc desc; + u16 bwd_size = sizeof(*bw_data); + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_configure_partition_bw); + + /* Indirect command */ + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_RD); + + if (bwd_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + desc.datalen = cpu_to_le16(bwd_size); + + status = i40e_asq_send_command(hw, &desc, bw_data, bwd_size, + cmd_details); + + return status; +} + +/** + * i40e_read_phy_register_clause22 + * @hw: pointer to the HW structure + * @reg: register address in the page + * @phy_adr: PHY address on 
MDIO interface + * @value: PHY register value + * + * Reads specified PHY register value + **/ +i40e_status i40e_read_phy_register_clause22(struct i40e_hw *hw, + u16 reg, u8 phy_addr, u16 *value) +{ + i40e_status status = I40E_ERR_TIMEOUT; + u8 port_num = (u8)hw->func_caps.mdio_port_num; + u32 command = 0; + u16 retry = 1000; + + command = (reg << I40E_GLGEN_MSCA_DEVADD_SHIFT) | + (phy_addr << I40E_GLGEN_MSCA_PHYADD_SHIFT) | + (I40E_MDIO_CLAUSE22_OPCODE_READ_MASK) | + (I40E_MDIO_CLAUSE22_STCODE_MASK) | + (I40E_GLGEN_MSCA_MDICMD_MASK); + wr32(hw, I40E_GLGEN_MSCA(port_num), command); + do { + command = rd32(hw, I40E_GLGEN_MSCA(port_num)); + if (!(command & I40E_GLGEN_MSCA_MDICMD_MASK)) { + status = 0; + break; + } + udelay(10); + retry--; + } while (retry); + + if (status) { + i40e_debug(hw, I40E_DEBUG_PHY, + "PHY: Can't write command to external PHY.\n"); + } else { + command = rd32(hw, I40E_GLGEN_MSRWD(port_num)); + *value = (command & I40E_GLGEN_MSRWD_MDIRDDATA_MASK) >> + I40E_GLGEN_MSRWD_MDIRDDATA_SHIFT; + } + + return status; +} + +/** + * i40e_write_phy_register_clause22 + * @hw: pointer to the HW structure + * @reg: register address in the page + * @phy_addr: PHY address on MDIO interface + * @value: PHY register value + * + * Writes specified PHY register value + **/ +i40e_status i40e_write_phy_register_clause22(struct i40e_hw *hw, + u16 reg, u8 phy_addr, u16 value) +{ + i40e_status status = I40E_ERR_TIMEOUT; + u8 port_num = (u8)hw->func_caps.mdio_port_num; + u32 command = 0; + u16 retry = 1000; + + command = value << I40E_GLGEN_MSRWD_MDIWRDATA_SHIFT; + wr32(hw, I40E_GLGEN_MSRWD(port_num), command); + + command = (reg << I40E_GLGEN_MSCA_DEVADD_SHIFT) | + (phy_addr << I40E_GLGEN_MSCA_PHYADD_SHIFT) | + (I40E_MDIO_CLAUSE22_OPCODE_WRITE_MASK) | + (I40E_MDIO_CLAUSE22_STCODE_MASK) | + (I40E_GLGEN_MSCA_MDICMD_MASK); + + wr32(hw, I40E_GLGEN_MSCA(port_num), command); + do { + command = rd32(hw, I40E_GLGEN_MSCA(port_num)); + if (!(command & I40E_GLGEN_MSCA_MDICMD_MASK)) { + status = 0; + break; + } + udelay(10); + retry--; + } while (retry); + + return status; +} + +/** + * i40e_read_phy_register_clause45 + * @hw: pointer to the HW structure + * @page: registers page number + * @reg: register address in the page + * @phy_addr: PHY address on MDIO interface + * @value: PHY register value + * + * Reads specified PHY register value + **/ +i40e_status i40e_read_phy_register_clause45(struct i40e_hw *hw, + u8 page, u16 reg, u8 phy_addr, u16 *value) +{ + i40e_status status = I40E_ERR_TIMEOUT; + u32 command = 0; + u16 retry = 1000; + u8 port_num = hw->func_caps.mdio_port_num; + + command = (reg << I40E_GLGEN_MSCA_MDIADD_SHIFT) | + (page << I40E_GLGEN_MSCA_DEVADD_SHIFT) | + (phy_addr << I40E_GLGEN_MSCA_PHYADD_SHIFT) | + (I40E_MDIO_CLAUSE45_OPCODE_ADDRESS_MASK) | + (I40E_MDIO_CLAUSE45_STCODE_MASK) | + (I40E_GLGEN_MSCA_MDICMD_MASK) | + (I40E_GLGEN_MSCA_MDIINPROGEN_MASK); + wr32(hw, I40E_GLGEN_MSCA(port_num), command); + do { + command = rd32(hw, I40E_GLGEN_MSCA(port_num)); + if (!(command & I40E_GLGEN_MSCA_MDICMD_MASK)) { + status = 0; + break; + } + usleep_range(10, 20); + retry--; + } while (retry); + + if (status) { + i40e_debug(hw, I40E_DEBUG_PHY, + "PHY: Can't write command to external PHY.\n"); + goto phy_read_end; + } + + command = (page << I40E_GLGEN_MSCA_DEVADD_SHIFT) | + (phy_addr << I40E_GLGEN_MSCA_PHYADD_SHIFT) | + (I40E_MDIO_CLAUSE45_OPCODE_READ_MASK) | + (I40E_MDIO_CLAUSE45_STCODE_MASK) | + (I40E_GLGEN_MSCA_MDICMD_MASK) | + (I40E_GLGEN_MSCA_MDIINPROGEN_MASK); + status = I40E_ERR_TIMEOUT; +
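/* The address frame issued above has completed; 'command' now carries the + * clause 45 read opcode, so write it to MSCA and poll MDICMD once more + * before pulling the result out of MSRWD. + */ +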
retry = 1000; + wr32(hw, I40E_GLGEN_MSCA(port_num), command); + do { + command = rd32(hw, I40E_GLGEN_MSCA(port_num)); + if (!(command & I40E_GLGEN_MSCA_MDICMD_MASK)) { + status = 0; + break; + } + usleep_range(10, 20); + retry--; + } while (retry); + + if (!status) { + command = rd32(hw, I40E_GLGEN_MSRWD(port_num)); + *value = (command & I40E_GLGEN_MSRWD_MDIRDDATA_MASK) >> + I40E_GLGEN_MSRWD_MDIRDDATA_SHIFT; + } else { + i40e_debug(hw, I40E_DEBUG_PHY, + "PHY: Can't read register value from external PHY.\n"); + } + +phy_read_end: + return status; +} + +/** + * i40e_write_phy_register_clause45 + * @hw: pointer to the HW structure + * @page: registers page number + * @reg: register address in the page + * @phy_addr: PHY address on MDIO interface + * @value: PHY register value + * + * Writes value to specified PHY register + **/ +i40e_status i40e_write_phy_register_clause45(struct i40e_hw *hw, + u8 page, u16 reg, u8 phy_addr, u16 value) +{ + i40e_status status = I40E_ERR_TIMEOUT; + u32 command = 0; + u16 retry = 1000; + u8 port_num = hw->func_caps.mdio_port_num; + + command = (reg << I40E_GLGEN_MSCA_MDIADD_SHIFT) | + (page << I40E_GLGEN_MSCA_DEVADD_SHIFT) | + (phy_addr << I40E_GLGEN_MSCA_PHYADD_SHIFT) | + (I40E_MDIO_CLAUSE45_OPCODE_ADDRESS_MASK) | + (I40E_MDIO_CLAUSE45_STCODE_MASK) | + (I40E_GLGEN_MSCA_MDICMD_MASK) | + (I40E_GLGEN_MSCA_MDIINPROGEN_MASK); + wr32(hw, I40E_GLGEN_MSCA(port_num), command); + do { + command = rd32(hw, I40E_GLGEN_MSCA(port_num)); + if (!(command & I40E_GLGEN_MSCA_MDICMD_MASK)) { + status = 0; + break; + } + usleep_range(10, 20); + retry--; + } while (retry); + if (status) { + i40e_debug(hw, I40E_DEBUG_PHY, + "PHY: Can't write command to external PHY.\n"); + goto phy_write_end; + } + + command = value << I40E_GLGEN_MSRWD_MDIWRDATA_SHIFT; + wr32(hw, I40E_GLGEN_MSRWD(port_num), command); + + command = (page << I40E_GLGEN_MSCA_DEVADD_SHIFT) | + (phy_addr << I40E_GLGEN_MSCA_PHYADD_SHIFT) | + (I40E_MDIO_CLAUSE45_OPCODE_WRITE_MASK) | + (I40E_MDIO_CLAUSE45_STCODE_MASK) | + (I40E_GLGEN_MSCA_MDICMD_MASK) | + (I40E_GLGEN_MSCA_MDIINPROGEN_MASK); + status = I40E_ERR_TIMEOUT; + retry = 1000; + wr32(hw, I40E_GLGEN_MSCA(port_num), command); + do { + command = rd32(hw, I40E_GLGEN_MSCA(port_num)); + if (!(command & I40E_GLGEN_MSCA_MDICMD_MASK)) { + status = 0; + break; + } + usleep_range(10, 20); + retry--; + } while (retry); + +phy_write_end: + return status; +} + +/** + * i40e_write_phy_register + * @hw: pointer to the HW structure + * @page: registers page number + * @reg: register address in the page + * @phy_addr: PHY address on MDIO interface + * @value: PHY register value + * + * Writes value to specified PHY register + **/ +i40e_status i40e_write_phy_register(struct i40e_hw *hw, + u8 page, u16 reg, u8 phy_addr, u16 value) +{ + i40e_status status; + + switch (hw->device_id) { + case I40E_DEV_ID_1G_BASE_T_X722: + status = i40e_write_phy_register_clause22(hw, reg, phy_addr, + value); + break; + case I40E_DEV_ID_10G_BASE_T: + case I40E_DEV_ID_10G_BASE_T4: + case I40E_DEV_ID_10G_BASE_T_X722: + case I40E_DEV_ID_25G_B: + case I40E_DEV_ID_25G_SFP28: + status = i40e_write_phy_register_clause45(hw, page, reg, + phy_addr, value); + break; + default: + status = I40E_ERR_UNKNOWN_PHY; + break; + } + + return status; +} + +/** + * i40e_read_phy_register + * @hw: pointer to the HW structure + * @page: registers page number + * @reg: register address in the page + * @phy_addr: PHY address on MDIO interface + * @value: PHY register value + * + * Reads specified PHY register value + **/ 
+i40e_status i40e_read_phy_register(struct i40e_hw *hw, + u8 page, u16 reg, u8 phy_addr, u16 *value) +{ + i40e_status status; + + switch (hw->device_id) { + case I40E_DEV_ID_1G_BASE_T_X722: + status = i40e_read_phy_register_clause22(hw, reg, phy_addr, + value); + break; + case I40E_DEV_ID_10G_BASE_T: + case I40E_DEV_ID_10G_BASE_T4: + case I40E_DEV_ID_10G_BASE_T_X722: + case I40E_DEV_ID_25G_B: + case I40E_DEV_ID_25G_SFP28: + status = i40e_read_phy_register_clause45(hw, page, reg, + phy_addr, value); + break; + default: + status = I40E_ERR_UNKNOWN_PHY; + break; + } + + return status; +} + +/** + * i40e_get_phy_address + * @hw: pointer to the HW structure + * @dev_num: PHY port number whose address we want + * @phy_addr: Returned PHY address + * + * Gets PHY address for current port + **/ +u8 i40e_get_phy_address(struct i40e_hw *hw, u8 dev_num) +{ + u8 port_num = hw->func_caps.mdio_port_num; + u32 reg_val = rd32(hw, I40E_GLGEN_MDIO_I2C_SEL(port_num)); + + return (u8)(reg_val >> ((dev_num + 1) * 5)) & 0x1f; +} + +/** + * i40e_blink_phy_link_led + * @hw: pointer to the HW structure + * @time: how long the LED will blink, in seconds + * @interval: gap between LED on and off in msecs + * + * Blinks PHY link LED + **/ +i40e_status i40e_blink_phy_link_led(struct i40e_hw *hw, + u32 time, u32 interval) +{ + i40e_status status = 0; + u32 i; + u16 led_ctl; + u16 gpio_led_port; + u16 led_reg; + u16 led_addr = I40E_PHY_LED_PROV_REG_1; + u8 phy_addr = 0; + u8 port_num; + + i = rd32(hw, I40E_PFGEN_PORTNUM); + port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK); + phy_addr = i40e_get_phy_address(hw, port_num); + + for (gpio_led_port = 0; gpio_led_port < 3; gpio_led_port++, + led_addr++) { + status = i40e_read_phy_register_clause45(hw, + I40E_PHY_COM_REG_PAGE, + led_addr, phy_addr, + &led_reg); + if (status) + goto phy_blinking_end; + led_ctl = led_reg; + if (led_reg & I40E_PHY_LED_LINK_MODE_MASK) { + led_reg = 0; + status = i40e_write_phy_register_clause45(hw, + I40E_PHY_COM_REG_PAGE, + led_addr, phy_addr, + led_reg); + if (status) + goto phy_blinking_end; + break; + } + } + + if (time > 0 && interval > 0) { + for (i = 0; i < time * 1000; i += interval) { + status = i40e_read_phy_register_clause45(hw, + I40E_PHY_COM_REG_PAGE, + led_addr, phy_addr, &led_reg); + if (status) + goto restore_config; + if (led_reg & I40E_PHY_LED_MANUAL_ON) + led_reg = 0; + else + led_reg = I40E_PHY_LED_MANUAL_ON; + status = i40e_write_phy_register_clause45(hw, + I40E_PHY_COM_REG_PAGE, + led_addr, phy_addr, led_reg); + if (status) + goto restore_config; + msleep(interval); + } + } + +restore_config: + status = i40e_write_phy_register_clause45(hw, + I40E_PHY_COM_REG_PAGE, + led_addr, phy_addr, led_ctl); + +phy_blinking_end: + return status; +} + +/** + * i40e_led_get_reg - read LED register + * @hw: pointer to the HW structure + * @led_addr: LED register address + * @reg_val: read register value + **/ +static enum i40e_status_code i40e_led_get_reg(struct i40e_hw *hw, u16 led_addr, + u32 *reg_val) +{ + enum i40e_status_code status; + u8 phy_addr = 0; + u8 port_num; + u32 i; + + *reg_val = 0; + if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) { + status = + i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL, + I40E_PHY_COM_REG_PAGE, + I40E_PHY_LED_PROV_REG_1, + reg_val, NULL); + } else { + i = rd32(hw, I40E_PFGEN_PORTNUM); + port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK); + phy_addr = i40e_get_phy_address(hw, port_num); + status = i40e_read_phy_register_clause45(hw, + I40E_PHY_COM_REG_PAGE, + led_addr, phy_addr, + (u16 
*)reg_val); + } + return status; +} + +/** + * i40e_led_set_reg - write LED register + * @hw: pointer to the HW structure + * @led_addr: LED register address + * @reg_val: register value to write + **/ +static enum i40e_status_code i40e_led_set_reg(struct i40e_hw *hw, u16 led_addr, + u32 reg_val) +{ + enum i40e_status_code status; + u8 phy_addr = 0; + u8 port_num; + u32 i; + + if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) { + status = + i40e_aq_set_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL, + I40E_PHY_COM_REG_PAGE, + I40E_PHY_LED_PROV_REG_1, + reg_val, NULL); + } else { + i = rd32(hw, I40E_PFGEN_PORTNUM); + port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK); + phy_addr = i40e_get_phy_address(hw, port_num); + status = i40e_write_phy_register_clause45(hw, + I40E_PHY_COM_REG_PAGE, + led_addr, phy_addr, + (u16)reg_val); + } + + return status; +} + +/** + * i40e_led_get_phy - return current on/off mode + * @hw: pointer to the hw struct + * @led_addr: address of led register to use + * @val: original value of register to use + * + **/ +i40e_status i40e_led_get_phy(struct i40e_hw *hw, u16 *led_addr, + u16 *val) +{ + i40e_status status = 0; + u16 gpio_led_port; + u8 phy_addr = 0; + u16 reg_val; + u16 temp_addr; + u8 port_num; + u32 i; + u32 reg_val_aq; + + if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) { + status = + i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL, + I40E_PHY_COM_REG_PAGE, + I40E_PHY_LED_PROV_REG_1, + &reg_val_aq, NULL); + if (status == I40E_SUCCESS) + *val = (u16)reg_val_aq; + return status; + } + temp_addr = I40E_PHY_LED_PROV_REG_1; + i = rd32(hw, I40E_PFGEN_PORTNUM); + port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK); + phy_addr = i40e_get_phy_address(hw, port_num); + + for (gpio_led_port = 0; gpio_led_port < 3; gpio_led_port++, + temp_addr++) { + status = i40e_read_phy_register_clause45(hw, + I40E_PHY_COM_REG_PAGE, + temp_addr, phy_addr, + &reg_val); + if (status) + return status; + *val = reg_val; + if (reg_val & I40E_PHY_LED_LINK_MODE_MASK) { + *led_addr = temp_addr; + break; + } + } + return status; +} + +/** + * i40e_led_set_phy + * @hw: pointer to the HW structure + * @on: true or false + * @led_addr: address of the LED register to use + * @mode: original val plus bit for set or ignore + * Set LEDs on or off when controlled by the PHY + * + **/ +i40e_status i40e_led_set_phy(struct i40e_hw *hw, bool on, + u16 led_addr, u32 mode) +{ + i40e_status status = 0; + u32 led_ctl = 0; + u32 led_reg = 0; + + status = i40e_led_get_reg(hw, led_addr, &led_reg); + if (status) + return status; + led_ctl = led_reg; + if (led_reg & I40E_PHY_LED_LINK_MODE_MASK) { + led_reg = 0; + status = i40e_led_set_reg(hw, led_addr, led_reg); + if (status) + return status; + } + status = i40e_led_get_reg(hw, led_addr, &led_reg); + if (status) + goto restore_config; + if (on) + led_reg = I40E_PHY_LED_MANUAL_ON; + else + led_reg = 0; + + status = i40e_led_set_reg(hw, led_addr, led_reg); + if (status) + goto restore_config; + if (mode & I40E_PHY_LED_MODE_ORIG) { + led_ctl = (mode & I40E_PHY_LED_MODE_MASK); + status = i40e_led_set_reg(hw, led_addr, led_ctl); + } + return status; + +restore_config: + status = i40e_led_set_reg(hw, led_addr, led_ctl); + return status; +} + +/** + * i40e_aq_rx_ctl_read_register - use FW to read from an Rx control register + * @hw: pointer to the hw struct + * @reg_addr: register address + * @reg_val: ptr to register value + * @cmd_details: pointer to command details structure or NULL + * + * Use the firmware to read the Rx control register, + * especially useful if the Rx unit is under 
heavy pressure + **/ +i40e_status i40e_aq_rx_ctl_read_register(struct i40e_hw *hw, + u32 reg_addr, u32 *reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_rx_ctl_reg_read_write *cmd_resp = + (struct i40e_aqc_rx_ctl_reg_read_write *)&desc.params.raw; + i40e_status status; + + if (!reg_val) + return I40E_ERR_PARAM; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_rx_ctl_reg_read); + + cmd_resp->address = cpu_to_le32(reg_addr); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + if (status == 0) + *reg_val = le32_to_cpu(cmd_resp->value); + + return status; +} + +/** + * i40e_read_rx_ctl - read from an Rx control register + * @hw: pointer to the hw struct + * @reg_addr: register address + **/ +u32 i40e_read_rx_ctl(struct i40e_hw *hw, u32 reg_addr) +{ + i40e_status status = 0; + bool use_register; + int retry = 5; + u32 val = 0; + + use_register = (((hw->aq.api_maj_ver == 1) && + (hw->aq.api_min_ver < 5)) || + (hw->mac.type == I40E_MAC_X722)); + if (!use_register) { +do_retry: + status = i40e_aq_rx_ctl_read_register(hw, reg_addr, &val, NULL); + if (hw->aq.asq_last_status == I40E_AQ_RC_EAGAIN && retry) { + usleep_range(1000, 2000); + retry--; + goto do_retry; + } + } + + /* if the AQ access failed, try the old-fashioned way */ + if (status || use_register) + val = rd32(hw, reg_addr); + + return val; +} + +/** + * i40e_aq_rx_ctl_write_register + * @hw: pointer to the hw struct + * @reg_addr: register address + * @reg_val: register value + * @cmd_details: pointer to command details structure or NULL + * + * Use the firmware to write to an Rx control register, + * especially useful if the Rx unit is under heavy pressure + **/ +i40e_status i40e_aq_rx_ctl_write_register(struct i40e_hw *hw, + u32 reg_addr, u32 reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_rx_ctl_reg_read_write *cmd = + (struct i40e_aqc_rx_ctl_reg_read_write *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_rx_ctl_reg_write); + + cmd->address = cpu_to_le32(reg_addr); + cmd->value = cpu_to_le32(reg_val); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_write_rx_ctl - write to an Rx control register + * @hw: pointer to the hw struct + * @reg_addr: register address + * @reg_val: register value + **/ +void i40e_write_rx_ctl(struct i40e_hw *hw, u32 reg_addr, u32 reg_val) +{ + i40e_status status = 0; + bool use_register; + int retry = 5; + + use_register = (((hw->aq.api_maj_ver == 1) && + (hw->aq.api_min_ver < 5)) || + (hw->mac.type == I40E_MAC_X722)); + if (!use_register) { +do_retry: + status = i40e_aq_rx_ctl_write_register(hw, reg_addr, + reg_val, NULL); + if (hw->aq.asq_last_status == I40E_AQ_RC_EAGAIN && retry) { + usleep_range(1000, 2000); + retry--; + goto do_retry; + } + } + + /* if the AQ access failed, try the old-fashioned way */ + if (status || use_register) + wr32(hw, reg_addr, reg_val); +} + +/** + * i40e_aq_set_phy_register + * @hw: pointer to the hw struct + * @phy_select: select which phy should be accessed + * @dev_addr: PHY device address + * @reg_addr: PHY register address + * @reg_val: new register value + * @cmd_details: pointer to command details structure or NULL + * + * Write the external PHY register. 
+ **/ +i40e_status i40e_aq_set_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_phy_register_access *cmd = + (struct i40e_aqc_phy_register_access *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_phy_register); + + cmd->phy_interface = phy_select; + cmd->dev_address = dev_addr; + cmd->reg_address = cpu_to_le32(reg_addr); + cmd->reg_value = cpu_to_le32(reg_val); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_get_phy_register + * @hw: pointer to the hw struct + * @phy_select: select which phy should be accessed + * @dev_addr: PHY device address + * @reg_addr: PHY register address + * @reg_val: read register value + * @cmd_details: pointer to command details structure or NULL + * + * Read the external PHY register. + **/ +i40e_status i40e_aq_get_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 *reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_phy_register_access *cmd = + (struct i40e_aqc_phy_register_access *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_phy_register); + + cmd->phy_interface = phy_select; + cmd->dev_address = dev_addr; + cmd->reg_address = cpu_to_le32(reg_addr); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + if (!status) + *reg_val = le32_to_cpu(cmd->reg_value); + + return status; +} + +/** + * i40e_aq_write_ddp - Write dynamic device personalization (ddp) + * @hw: pointer to the hw struct + * @buff: command buffer (size in bytes = buff_size) + * @buff_size: buffer size in bytes + * @track_id: package tracking id + * @error_offset: returns error offset + * @error_info: returns error information + * @cmd_details: pointer to command details structure or NULL + **/ +enum +i40e_status_code i40e_aq_write_ddp(struct i40e_hw *hw, void *buff, + u16 buff_size, u32 track_id, + u32 *error_offset, u32 *error_info, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_write_personalization_profile *cmd = + (struct i40e_aqc_write_personalization_profile *) + &desc.params.raw; + struct i40e_aqc_write_ddp_resp *resp; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_write_personalization_profile); + + desc.flags |= cpu_to_le16(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + + desc.datalen = cpu_to_le16(buff_size); + + cmd->profile_track_id = cpu_to_le32(track_id); + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + if (!status) { + resp = (struct i40e_aqc_write_ddp_resp *)&desc.params.raw; + if (error_offset) + *error_offset = le32_to_cpu(resp->error_offset); + if (error_info) + *error_info = le32_to_cpu(resp->error_info); + } + + return status; +} + +/** + * i40e_aq_get_ddp_list - Read dynamic device personalization (ddp) + * @hw: pointer to the hw struct + * @buff: command buffer (size in bytes = buff_size) + * @buff_size: buffer size in bytes + * @cmd_details: pointer to command details structure or NULL + **/ +enum +i40e_status_code i40e_aq_get_ddp_list(struct i40e_hw *hw, void *buff, + u16 buff_size, u8 flags, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct 
i40e_aqc_get_applied_profiles *cmd = + (struct i40e_aqc_get_applied_profiles *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_personalization_profile_list); + + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + if (buff_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + desc.datalen = cpu_to_le16(buff_size); + + cmd->flags = flags; + + status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details); + + return status; +} + +/** + * i40e_find_segment_in_package + * @segment_type: the segment type to search for (i.e., SEGMENT_TYPE_I40E) + * @pkg_hdr: pointer to the package header to be searched + * + * This function searches a package file for a particular segment type. On + * success it returns a pointer to the segment header, otherwise it will + * return NULL. + **/ +struct i40e_generic_seg_header * +i40e_find_segment_in_package(u32 segment_type, + struct i40e_package_header *pkg_hdr) +{ + struct i40e_generic_seg_header *segment; + u32 i; + + /* Search all package segments for the requested segment type */ + for (i = 0; i < pkg_hdr->segment_count; i++) { + segment = + (struct i40e_generic_seg_header *)((u8 *)pkg_hdr + + pkg_hdr->segment_offset[i]); + + if (segment->type == segment_type) + return segment; + } + + return NULL; +} + +/** + * i40e_write_profile + * @hw: pointer to the hardware structure + * @profile: pointer to the profile segment of the package to be downloaded + * @track_id: package tracking id + * + * Handles the download of a complete package. + */ +enum i40e_status_code +i40e_write_profile(struct i40e_hw *hw, struct i40e_profile_segment *profile, + u32 track_id) +{ + i40e_status status = 0; + struct i40e_section_table *sec_tbl; + struct i40e_profile_section_header *sec = NULL; + u32 dev_cnt; + u32 vendor_dev_id; + u32 *nvm; + u32 section_size = 0; + u32 offset = 0, info = 0; + u32 i; + + dev_cnt = profile->device_table_count; + + for (i = 0; i < dev_cnt; i++) { + vendor_dev_id = profile->device_table[i].vendor_dev_id; + if ((vendor_dev_id >> 16) == PCI_VENDOR_ID_INTEL) + if (hw->device_id == (vendor_dev_id & 0xFFFF)) + break; + } + if (i == dev_cnt) { + i40e_debug(hw, I40E_DEBUG_PACKAGE, "Device doesn't support DDP"); + return I40E_ERR_DEVICE_NOT_SUPPORTED; + } + + nvm = (u32 *)&profile->device_table[dev_cnt]; + sec_tbl = (struct i40e_section_table *)&nvm[nvm[0] + 1]; + + for (i = 0; i < sec_tbl->section_count; i++) { + sec = (struct i40e_profile_section_header *)((u8 *)profile + + sec_tbl->section_offset[i]); + + /* Skip 'AQ', 'note' and 'name' sections */ + if (sec->section.type != SECTION_TYPE_MMIO) + continue; + + section_size = sec->section.size + + sizeof(struct i40e_profile_section_header); + + /* Write profile */ + status = i40e_aq_write_ddp(hw, (void *)sec, (u16)section_size, + track_id, &offset, &info, NULL); + if (status) { + i40e_debug(hw, I40E_DEBUG_PACKAGE, + "Failed to write profile: offset %d, info %d", + offset, info); + break; + } + } + return status; +} + +/** + * i40e_add_pinfo_to_list + * @hw: pointer to the hardware structure + * @profile: pointer to the profile segment of the package + * @profile_info_sec: buffer for information section + * @track_id: package tracking id + * + * Register a profile to the list of loaded profiles. 
+ */ +enum i40e_status_code +i40e_add_pinfo_to_list(struct i40e_hw *hw, + struct i40e_profile_segment *profile, + u8 *profile_info_sec, u32 track_id) +{ + i40e_status status = 0; + struct i40e_profile_section_header *sec = NULL; + struct i40e_profile_info *pinfo; + u32 offset = 0, info = 0; + + sec = (struct i40e_profile_section_header *)profile_info_sec; + sec->tbl_size = 1; + sec->data_end = sizeof(struct i40e_profile_section_header) + + sizeof(struct i40e_profile_info); + sec->section.type = SECTION_TYPE_INFO; + sec->section.offset = sizeof(struct i40e_profile_section_header); + sec->section.size = sizeof(struct i40e_profile_info); + pinfo = (struct i40e_profile_info *)(profile_info_sec + + sec->section.offset); + pinfo->track_id = track_id; + pinfo->version = profile->version; + pinfo->op = I40E_DDP_ADD_TRACKID; + memcpy(pinfo->name, profile->name, I40E_DDP_NAME_SIZE); + + status = i40e_aq_write_ddp(hw, (void *)sec, sec->data_end, + track_id, &offset, &info, NULL); + + return status; +} + +/** + * i40e_aq_add_cloud_filters + * @hw: pointer to the hardware structure + * @seid: VSI seid to add cloud filters from + * @filters: Buffer which contains the filters to be added + * @filter_count: number of filters contained in the buffer + * + * Set the cloud filters for a given VSI. The contents of the + * i40e_aqc_cloud_filters_element_data are filled in by the caller + * of the function. + * + **/ +enum i40e_status_code +i40e_aq_add_cloud_filters(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_cloud_filters_element_data *filters, + u8 filter_count) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_remove_cloud_filters *cmd = + (struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw; + enum i40e_status_code status; + u16 buff_len; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_add_cloud_filters); + + buff_len = filter_count * sizeof(*filters); + desc.datalen = cpu_to_le16(buff_len); + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + cmd->num_filters = filter_count; + cmd->seid = cpu_to_le16(seid); + + status = i40e_asq_send_command(hw, &desc, filters, buff_len, NULL); + + return status; +} + +/** + * i40e_aq_add_cloud_filters_bb + * @hw: pointer to the hardware structure + * @seid: VSI seid to add cloud filters from + * @filters: Buffer which contains the filters in big buffer to be added + * @filter_count: number of filters contained in the buffer + * + * Set the big buffer cloud filters for a given VSI. The contents of the + * i40e_aqc_cloud_filters_element_bb are filled in by the caller of the + * function. 
+ * + **/ +enum i40e_status_code +i40e_aq_add_cloud_filters_bb(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_cloud_filters_element_bb *filters, + u8 filter_count) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_remove_cloud_filters *cmd = + (struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw; + i40e_status status; + u16 buff_len; + int i; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_add_cloud_filters); + + buff_len = filter_count * sizeof(*filters); + desc.datalen = cpu_to_le16(buff_len); + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + cmd->num_filters = filter_count; + cmd->seid = cpu_to_le16(seid); + cmd->big_buffer_flag = I40E_AQC_ADD_CLOUD_CMD_BB; + + for (i = 0; i < filter_count; i++) { + u16 tnl_type; + u32 ti; + + tnl_type = (le16_to_cpu(filters[i].element.flags) & + I40E_AQC_ADD_CLOUD_TNL_TYPE_MASK) >> + I40E_AQC_ADD_CLOUD_TNL_TYPE_SHIFT; + + /* Due to hardware eccentricities, the VNI for Geneve is shifted + * one more byte further than normally used for Tenant ID in + * other tunnel types. + */ + if (tnl_type == I40E_AQC_ADD_CLOUD_TNL_TYPE_GENEVE) { + ti = le32_to_cpu(filters[i].element.tenant_id); + filters[i].element.tenant_id = cpu_to_le32(ti << 8); + } + } + + status = i40e_asq_send_command(hw, &desc, filters, buff_len, NULL); + + return status; +} + +/** + * i40e_aq_rem_cloud_filters + * @hw: pointer to the hardware structure + * @seid: VSI seid to remove cloud filters from + * @filters: Buffer which contains the filters to be removed + * @filter_count: number of filters contained in the buffer + * + * Remove the cloud filters for a given VSI. The contents of the + * i40e_aqc_cloud_filters_element_data are filled in by the caller + * of the function. + * + **/ +enum i40e_status_code +i40e_aq_rem_cloud_filters(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_cloud_filters_element_data *filters, + u8 filter_count) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_remove_cloud_filters *cmd = + (struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw; + enum i40e_status_code status; + u16 buff_len; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_remove_cloud_filters); + + buff_len = filter_count * sizeof(*filters); + desc.datalen = cpu_to_le16(buff_len); + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + cmd->num_filters = filter_count; + cmd->seid = cpu_to_le16(seid); + + status = i40e_asq_send_command(hw, &desc, filters, buff_len, NULL); + + return status; +} + +/** + * i40e_aq_rem_cloud_filters_bb + * @hw: pointer to the hardware structure + * @seid: VSI seid to remove cloud filters from + * @filters: Buffer which contains the filters in big buffer to be removed + * @filter_count: number of filters contained in the buffer + * + * Remove the big buffer cloud filters for a given VSI. The contents of the + * i40e_aqc_cloud_filters_element_bb are filled in by the caller of the + * function. 
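+ *
+ * The same Geneve adjustment made on the add path is applied here: the
+ * VNI in element.tenant_id is shifted left by one byte before the command
+ * is sent, so a filter can be removed with the same element contents that
+ * were used when it was added.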
+ * + **/ +enum i40e_status_code +i40e_aq_rem_cloud_filters_bb(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_cloud_filters_element_bb *filters, + u8 filter_count) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_add_remove_cloud_filters *cmd = + (struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw; + i40e_status status; + u16 buff_len; + int i; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_remove_cloud_filters); + + buff_len = filter_count * sizeof(*filters); + desc.datalen = cpu_to_le16(buff_len); + desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD)); + cmd->num_filters = filter_count; + cmd->seid = cpu_to_le16(seid); + cmd->big_buffer_flag = I40E_AQC_ADD_CLOUD_CMD_BB; + + for (i = 0; i < filter_count; i++) { + u16 tnl_type; + u32 ti; + + tnl_type = (le16_to_cpu(filters[i].element.flags) & + I40E_AQC_ADD_CLOUD_TNL_TYPE_MASK) >> + I40E_AQC_ADD_CLOUD_TNL_TYPE_SHIFT; + + /* Due to hardware eccentricities, the VNI for Geneve is shifted + * one more byte further than normally used for Tenant ID in + * other tunnel types. + */ + if (tnl_type == I40E_AQC_ADD_CLOUD_TNL_TYPE_GENEVE) { + ti = le32_to_cpu(filters[i].element.tenant_id); + filters[i].element.tenant_id = cpu_to_le32(ti << 8); + } + } + + status = i40e_asq_send_command(hw, &desc, filters, buff_len, NULL); + + return status; +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c new file mode 100644 index 0000000..9fec728 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c @@ -0,0 +1,975 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include "i40e_adminq.h" +#include "i40e_prototype.h" +#include "i40e_dcb.h" + +/** + * i40e_get_dcbx_status + * @hw: pointer to the hw struct + * @status: Embedded DCBX Engine Status + * + * Get the DCBX status from the Firmware + **/ +i40e_status i40e_get_dcbx_status(struct i40e_hw *hw, u16 *status) +{ + u32 reg; + + if (!status) + return I40E_ERR_PARAM; + + reg = rd32(hw, I40E_PRTDCB_GENS); + *status = (u16)((reg & I40E_PRTDCB_GENS_DCBX_STATUS_MASK) >> + I40E_PRTDCB_GENS_DCBX_STATUS_SHIFT); + + return 0; +} + +/** + * i40e_parse_ieee_etscfg_tlv + * @tlv: IEEE 802.1Qaz ETS CFG TLV + * @dcbcfg: Local store to update ETS CFG data + * + * Parses IEEE 802.1Qaz ETS CFG TLV + **/ +static void i40e_parse_ieee_etscfg_tlv(struct i40e_lldp_org_tlv *tlv, + struct i40e_dcbx_config *dcbcfg) +{ + struct i40e_dcb_ets_config *etscfg; + u8 *buf = tlv->tlvinfo; + u16 offset = 0; + u8 priority; + int i; + + /* First Octet post subtype + * -------------------------- + * |will-|CBS | Re- | Max | + * |ing | |served| TCs | + * -------------------------- + * |1bit | 1bit|3 bits|3bits| + */ + etscfg = &dcbcfg->etscfg; + etscfg->willing = (u8)((buf[offset] & I40E_IEEE_ETS_WILLING_MASK) >> + I40E_IEEE_ETS_WILLING_SHIFT); + etscfg->cbs = (u8)((buf[offset] & I40E_IEEE_ETS_CBS_MASK) >> + I40E_IEEE_ETS_CBS_SHIFT); + etscfg->maxtcs = (u8)((buf[offset] & I40E_IEEE_ETS_MAXTC_MASK) >> + I40E_IEEE_ETS_MAXTC_SHIFT); + + /* Move offset to Priority Assignment Table */ + offset++; + + /* Priority Assignment Table (4 octets) + * Octets:| 1 | 2 | 3 | 4 | + * ----------------------------------------- + * |pri0|pri1|pri2|pri3|pri4|pri5|pri6|pri7| + * ----------------------------------------- + * Bits:|7 4|3 0|7 4|3 0|7 4|3 0|7 4|3 0| + * ----------------------------------------- + */ + for (i = 0; i < 4; i++) { + priority = (u8)((buf[offset] & I40E_IEEE_ETS_PRIO_1_MASK) >> + I40E_IEEE_ETS_PRIO_1_SHIFT); + etscfg->prioritytable[i * 2] = priority; + priority = (u8)((buf[offset] & I40E_IEEE_ETS_PRIO_0_MASK) >> + I40E_IEEE_ETS_PRIO_0_SHIFT); + etscfg->prioritytable[i * 2 + 1] = priority; + offset++; + } + + /* TC Bandwidth Table (8 octets) + * Octets:| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | + * --------------------------------- + * |tc0|tc1|tc2|tc3|tc4|tc5|tc6|tc7| + * --------------------------------- + */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) + etscfg->tcbwtable[i] = buf[offset++]; + + /* TSA Assignment Table (8 octets) + * Octets:| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | + * --------------------------------- + * |tc0|tc1|tc2|tc3|tc4|tc5|tc6|tc7| + * --------------------------------- + */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) + etscfg->tsatable[i] = buf[offset++]; +} + +/** + * i40e_parse_ieee_etsrec_tlv + * @tlv: IEEE 802.1Qaz ETS REC TLV + * @dcbcfg: Local store to update ETS REC data + * + * Parses IEEE 802.1Qaz ETS REC TLV + **/ +static void i40e_parse_ieee_etsrec_tlv(struct i40e_lldp_org_tlv *tlv, + struct i40e_dcbx_config *dcbcfg) +{ + u8 *buf = tlv->tlvinfo; + u16 offset = 0; + u8 priority; + int i; + + /* Move offset to priority table */ + offset++; + + /* Priority Assignment Table (4 octets) + * Octets:| 1 | 2 | 3 | 4 | + * ----------------------------------------- + * |pri0|pri1|pri2|pri3|pri4|pri5|pri6|pri7| + * ----------------------------------------- + * Bits:|7 4|3 0|7 4|3 0|7 4|3 0|7 4|3 0| + * ----------------------------------------- + */ + for (i = 0; i < 4; i++) { + priority = 
(u8)((buf[offset] & I40E_IEEE_ETS_PRIO_1_MASK) >> + I40E_IEEE_ETS_PRIO_1_SHIFT); + dcbcfg->etsrec.prioritytable[i*2] = priority; + priority = (u8)((buf[offset] & I40E_IEEE_ETS_PRIO_0_MASK) >> + I40E_IEEE_ETS_PRIO_0_SHIFT); + dcbcfg->etsrec.prioritytable[i*2 + 1] = priority; + offset++; + } + + /* TC Bandwidth Table (8 octets) + * Octets:| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | + * --------------------------------- + * |tc0|tc1|tc2|tc3|tc4|tc5|tc6|tc7| + * --------------------------------- + */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) + dcbcfg->etsrec.tcbwtable[i] = buf[offset++]; + + /* TSA Assignment Table (8 octets) + * Octets:| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | + * --------------------------------- + * |tc0|tc1|tc2|tc3|tc4|tc5|tc6|tc7| + * --------------------------------- + */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) + dcbcfg->etsrec.tsatable[i] = buf[offset++]; +} + +/** + * i40e_parse_ieee_pfccfg_tlv + * @tlv: IEEE 802.1Qaz PFC CFG TLV + * @dcbcfg: Local store to update PFC CFG data + * + * Parses IEEE 802.1Qaz PFC CFG TLV + **/ +static void i40e_parse_ieee_pfccfg_tlv(struct i40e_lldp_org_tlv *tlv, + struct i40e_dcbx_config *dcbcfg) +{ + u8 *buf = tlv->tlvinfo; + + /* ---------------------------------------- + * |will-|MBC | Re- | PFC | PFC Enable | + * |ing | |served| cap | | + * ----------------------------------------- + * |1bit | 1bit|2 bits|4bits| 1 octet | + */ + dcbcfg->pfc.willing = (u8)((buf[0] & I40E_IEEE_PFC_WILLING_MASK) >> + I40E_IEEE_PFC_WILLING_SHIFT); + dcbcfg->pfc.mbc = (u8)((buf[0] & I40E_IEEE_PFC_MBC_MASK) >> + I40E_IEEE_PFC_MBC_SHIFT); + dcbcfg->pfc.pfccap = (u8)((buf[0] & I40E_IEEE_PFC_CAP_MASK) >> + I40E_IEEE_PFC_CAP_SHIFT); + dcbcfg->pfc.pfcenable = buf[1]; +} + +/** + * i40e_parse_ieee_app_tlv + * @tlv: IEEE 802.1Qaz APP TLV + * @dcbcfg: Local store to update APP PRIO data + * + * Parses IEEE 802.1Qaz APP PRIO TLV + **/ +static void i40e_parse_ieee_app_tlv(struct i40e_lldp_org_tlv *tlv, + struct i40e_dcbx_config *dcbcfg) +{ + u16 typelength; + u16 offset = 0; + u16 length; + int i = 0; + u8 *buf; + + typelength = ntohs(tlv->typelength); + length = (u16)((typelength & I40E_LLDP_TLV_LEN_MASK) >> + I40E_LLDP_TLV_LEN_SHIFT); + buf = tlv->tlvinfo; + + /* The App priority table starts 5 octets after TLV header */ + length -= (sizeof(tlv->ouisubtype) + 1); + + /* Move offset to App Priority Table */ + offset++; + + /* Application Priority Table (3 octets) + * Octets:| 1 | 2 | 3 | + * ----------------------------------------- + * |Priority|Rsrvd| Sel | Protocol ID | + * ----------------------------------------- + * Bits:|23 21|20 19|18 16|15 0| + * ----------------------------------------- + */ + while (offset < length) { + dcbcfg->app[i].priority = (u8)((buf[offset] & + I40E_IEEE_APP_PRIO_MASK) >> + I40E_IEEE_APP_PRIO_SHIFT); + dcbcfg->app[i].selector = (u8)((buf[offset] & + I40E_IEEE_APP_SEL_MASK) >> + I40E_IEEE_APP_SEL_SHIFT); + dcbcfg->app[i].protocolid = (buf[offset + 1] << 0x8) | + buf[offset + 2]; + /* Move to next app */ + offset += 3; + i++; + if (i >= I40E_DCBX_MAX_APPS) + break; + } + + dcbcfg->numapps = i; +} + +/** + * i40e_parse_ieee_etsrec_tlv + * @tlv: IEEE 802.1Qaz TLV + * @dcbcfg: Local store to update ETS REC data + * + * Get the TLV subtype and send it to parsing function + * based on the subtype value + **/ +static void i40e_parse_ieee_tlv(struct i40e_lldp_org_tlv *tlv, + struct i40e_dcbx_config *dcbcfg) +{ + u32 ouisubtype; + u8 subtype; + + ouisubtype = ntohl(tlv->ouisubtype); + subtype = (u8)((ouisubtype & I40E_LLDP_TLV_SUBTYPE_MASK) >> + 
I40E_LLDP_TLV_SUBTYPE_SHIFT); + switch (subtype) { + case I40E_IEEE_SUBTYPE_ETS_CFG: + i40e_parse_ieee_etscfg_tlv(tlv, dcbcfg); + break; + case I40E_IEEE_SUBTYPE_ETS_REC: + i40e_parse_ieee_etsrec_tlv(tlv, dcbcfg); + break; + case I40E_IEEE_SUBTYPE_PFC_CFG: + i40e_parse_ieee_pfccfg_tlv(tlv, dcbcfg); + break; + case I40E_IEEE_SUBTYPE_APP_PRI: + i40e_parse_ieee_app_tlv(tlv, dcbcfg); + break; + default: + break; + } +} + +/** + * i40e_parse_cee_pgcfg_tlv + * @tlv: CEE DCBX PG CFG TLV + * @dcbcfg: Local store to update ETS CFG data + * + * Parses CEE DCBX PG CFG TLV + **/ +static void i40e_parse_cee_pgcfg_tlv(struct i40e_cee_feat_tlv *tlv, + struct i40e_dcbx_config *dcbcfg) +{ + struct i40e_dcb_ets_config *etscfg; + u8 *buf = tlv->tlvinfo; + u16 offset = 0; + u8 priority; + int i; + + etscfg = &dcbcfg->etscfg; + + if (tlv->en_will_err & I40E_CEE_FEAT_TLV_WILLING_MASK) + etscfg->willing = 1; + + etscfg->cbs = 0; + /* Priority Group Table (4 octets) + * Octets:| 1 | 2 | 3 | 4 | + * ----------------------------------------- + * |pri0|pri1|pri2|pri3|pri4|pri5|pri6|pri7| + * ----------------------------------------- + * Bits:|7 4|3 0|7 4|3 0|7 4|3 0|7 4|3 0| + * ----------------------------------------- + */ + for (i = 0; i < 4; i++) { + priority = (u8)((buf[offset] & I40E_CEE_PGID_PRIO_1_MASK) >> + I40E_CEE_PGID_PRIO_1_SHIFT); + etscfg->prioritytable[i * 2] = priority; + priority = (u8)((buf[offset] & I40E_CEE_PGID_PRIO_0_MASK) >> + I40E_CEE_PGID_PRIO_0_SHIFT); + etscfg->prioritytable[i * 2 + 1] = priority; + offset++; + } + + /* PG Percentage Table (8 octets) + * Octets:| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | + * --------------------------------- + * |pg0|pg1|pg2|pg3|pg4|pg5|pg6|pg7| + * --------------------------------- + */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) + etscfg->tcbwtable[i] = buf[offset++]; + + /* Number of TCs supported (1 octet) */ + etscfg->maxtcs = buf[offset]; +} + +/** + * i40e_parse_cee_pfccfg_tlv + * @tlv: CEE DCBX PFC CFG TLV + * @dcbcfg: Local store to update PFC CFG data + * + * Parses CEE DCBX PFC CFG TLV + **/ +static void i40e_parse_cee_pfccfg_tlv(struct i40e_cee_feat_tlv *tlv, + struct i40e_dcbx_config *dcbcfg) +{ + u8 *buf = tlv->tlvinfo; + + if (tlv->en_will_err & I40E_CEE_FEAT_TLV_WILLING_MASK) + dcbcfg->pfc.willing = 1; + + /* ------------------------ + * | PFC Enable | PFC TCs | + * ------------------------ + * | 1 octet | 1 octet | + */ + dcbcfg->pfc.pfcenable = buf[0]; + dcbcfg->pfc.pfccap = buf[1]; +} + +/** + * i40e_parse_cee_app_tlv + * @tlv: CEE DCBX APP TLV + * @dcbcfg: Local store to update APP PRIO data + * + * Parses CEE DCBX APP PRIO TLV + **/ +static void i40e_parse_cee_app_tlv(struct i40e_cee_feat_tlv *tlv, + struct i40e_dcbx_config *dcbcfg) +{ + u16 length, typelength, offset = 0; + struct i40e_cee_app_prio *app; + u8 i; + + typelength = ntohs(tlv->hdr.typelen); + length = (u16)((typelength & I40E_LLDP_TLV_LEN_MASK) >> + I40E_LLDP_TLV_LEN_SHIFT); + + dcbcfg->numapps = length / sizeof(*app); + + if (!dcbcfg->numapps) + return; + if (dcbcfg->numapps > I40E_DCBX_MAX_APPS) + dcbcfg->numapps = I40E_DCBX_MAX_APPS; + + for (i = 0; i < dcbcfg->numapps; i++) { + u8 up, selector; + + app = (struct i40e_cee_app_prio *)(tlv->tlvinfo + offset); + for (up = 0; up < I40E_MAX_USER_PRIORITY; up++) { + if (app->prio_map & BIT(up)) + break; + } + dcbcfg->app[i].priority = up; + + /* Get Selector from lower 2 bits, and convert to IEEE */ + selector = (app->upper_oui_sel & I40E_CEE_APP_SELECTOR_MASK); + switch (selector) { + case I40E_CEE_APP_SEL_ETHTYPE: + 
dcbcfg->app[i].selector = I40E_APP_SEL_ETHTYPE; + break; + case I40E_CEE_APP_SEL_TCPIP: + dcbcfg->app[i].selector = I40E_APP_SEL_TCPIP; + break; + default: + /* Keep selector as it is for unknown types */ + dcbcfg->app[i].selector = selector; + } + + dcbcfg->app[i].protocolid = ntohs(app->protocol); + /* Move to next app */ + offset += sizeof(*app); + } +} + +/** + * i40e_parse_cee_tlv + * @tlv: CEE DCBX TLV + * @dcbcfg: Local store to update DCBX config data + * + * Get the TLV subtype and send it to parsing function + * based on the subtype value + **/ +static void i40e_parse_cee_tlv(struct i40e_lldp_org_tlv *tlv, + struct i40e_dcbx_config *dcbcfg) +{ + u16 len, tlvlen, sublen, typelength; + struct i40e_cee_feat_tlv *sub_tlv; + u8 subtype, feat_tlv_count = 0; + u32 ouisubtype; + + ouisubtype = ntohl(tlv->ouisubtype); + subtype = (u8)((ouisubtype & I40E_LLDP_TLV_SUBTYPE_MASK) >> + I40E_LLDP_TLV_SUBTYPE_SHIFT); + /* Return if not CEE DCBX */ + if (subtype != I40E_CEE_DCBX_TYPE) + return; + + typelength = ntohs(tlv->typelength); + tlvlen = (u16)((typelength & I40E_LLDP_TLV_LEN_MASK) >> + I40E_LLDP_TLV_LEN_SHIFT); + len = sizeof(tlv->typelength) + sizeof(ouisubtype) + + sizeof(struct i40e_cee_ctrl_tlv); + /* Return if no CEE DCBX Feature TLVs */ + if (tlvlen <= len) + return; + + sub_tlv = (struct i40e_cee_feat_tlv *)((char *)tlv + len); + while (feat_tlv_count < I40E_CEE_MAX_FEAT_TYPE) { + typelength = ntohs(sub_tlv->hdr.typelen); + sublen = (u16)((typelength & + I40E_LLDP_TLV_LEN_MASK) >> + I40E_LLDP_TLV_LEN_SHIFT); + subtype = (u8)((typelength & I40E_LLDP_TLV_TYPE_MASK) >> + I40E_LLDP_TLV_TYPE_SHIFT); + switch (subtype) { + case I40E_CEE_SUBTYPE_PG_CFG: + i40e_parse_cee_pgcfg_tlv(sub_tlv, dcbcfg); + break; + case I40E_CEE_SUBTYPE_PFC_CFG: + i40e_parse_cee_pfccfg_tlv(sub_tlv, dcbcfg); + break; + case I40E_CEE_SUBTYPE_APP_PRI: + i40e_parse_cee_app_tlv(sub_tlv, dcbcfg); + break; + default: + return; /* Invalid Sub-type return */ + } + feat_tlv_count++; + /* Move to next sub TLV */ + sub_tlv = (struct i40e_cee_feat_tlv *)((char *)sub_tlv + + sizeof(sub_tlv->hdr.typelen) + + sublen); + } +} + +/** + * i40e_parse_org_tlv + * @tlv: Organization specific TLV + * @dcbcfg: Local store to update ETS REC data + * + * Currently only IEEE 802.1Qaz TLV is supported, all others + * will be returned + **/ +static void i40e_parse_org_tlv(struct i40e_lldp_org_tlv *tlv, + struct i40e_dcbx_config *dcbcfg) +{ + u32 ouisubtype; + u32 oui; + + ouisubtype = ntohl(tlv->ouisubtype); + oui = (u32)((ouisubtype & I40E_LLDP_TLV_OUI_MASK) >> + I40E_LLDP_TLV_OUI_SHIFT); + switch (oui) { + case I40E_IEEE_8021QAZ_OUI: + i40e_parse_ieee_tlv(tlv, dcbcfg); + break; + case I40E_CEE_DCBX_OUI: + i40e_parse_cee_tlv(tlv, dcbcfg); + break; + default: + break; + } +} + +/** + * i40e_lldp_to_dcb_config + * @lldpmib: LLDPDU to be parsed + * @dcbcfg: store for LLDPDU data + * + * Parse DCB configuration from the LLDPDU + **/ +i40e_status i40e_lldp_to_dcb_config(u8 *lldpmib, + struct i40e_dcbx_config *dcbcfg) +{ + i40e_status ret = 0; + struct i40e_lldp_org_tlv *tlv; + u16 type; + u16 length; + u16 typelength; + u16 offset = 0; + + if (!lldpmib || !dcbcfg) + return I40E_ERR_PARAM; + + /* set to the start of LLDPDU */ + lldpmib += ETH_HLEN; + tlv = (struct i40e_lldp_org_tlv *)lldpmib; + while (1) { + typelength = ntohs(tlv->typelength); + type = (u16)((typelength & I40E_LLDP_TLV_TYPE_MASK) >> + I40E_LLDP_TLV_TYPE_SHIFT); + length = (u16)((typelength & I40E_LLDP_TLV_LEN_MASK) >> + I40E_LLDP_TLV_LEN_SHIFT); + offset += sizeof(typelength) + 
length; + + /* END TLV or beyond LLDPDU size */ + if ((type == I40E_TLV_TYPE_END) || (offset > I40E_LLDPDU_SIZE)) + break; + + switch (type) { + case I40E_TLV_TYPE_ORG: + i40e_parse_org_tlv(tlv, dcbcfg); + break; + default: + break; + } + + /* Move to next TLV */ + tlv = (struct i40e_lldp_org_tlv *)((char *)tlv + + sizeof(tlv->typelength) + + length); + } + + return ret; +} + +/** + * i40e_aq_get_dcb_config + * @hw: pointer to the hw struct + * @mib_type: mib type for the query + * @bridgetype: bridge type for the query (remote) + * @dcbcfg: store for LLDPDU data + * + * Query DCB configuration from the Firmware + **/ +i40e_status i40e_aq_get_dcb_config(struct i40e_hw *hw, u8 mib_type, + u8 bridgetype, + struct i40e_dcbx_config *dcbcfg) +{ + i40e_status ret = 0; + struct i40e_virt_mem mem; + u8 *lldpmib; + + /* Allocate the LLDPDU */ + ret = i40e_allocate_virt_mem(hw, &mem, I40E_LLDPDU_SIZE); + if (ret) + return ret; + + lldpmib = (u8 *)mem.va; + ret = i40e_aq_get_lldp_mib(hw, bridgetype, mib_type, + (void *)lldpmib, I40E_LLDPDU_SIZE, + NULL, NULL, NULL); + if (ret) + goto free_mem; + + /* Parse LLDP MIB to get dcb configuration */ + ret = i40e_lldp_to_dcb_config(lldpmib, dcbcfg); + +free_mem: + i40e_free_virt_mem(hw, &mem); + return ret; +} + +/** + * i40e_cee_to_dcb_v1_config + * @cee_cfg: pointer to CEE v1 response configuration struct + * @dcbcfg: DCB configuration struct + * + * Convert CEE v1 configuration from firmware to DCB configuration + **/ +static void i40e_cee_to_dcb_v1_config( + struct i40e_aqc_get_cee_dcb_cfg_v1_resp *cee_cfg, + struct i40e_dcbx_config *dcbcfg) +{ + u16 status, tlv_status = le16_to_cpu(cee_cfg->tlv_status); + u16 app_prio = le16_to_cpu(cee_cfg->oper_app_prio); + u8 i, tc, err; + + /* CEE PG data to ETS config */ + dcbcfg->etscfg.maxtcs = cee_cfg->oper_num_tc; + + /* Note that the FW creates the oper_prio_tc nibbles reversed + * from those in the CEE Priority Group sub-TLV. + */ + for (i = 0; i < 4; i++) { + tc = (u8)((cee_cfg->oper_prio_tc[i] & + I40E_CEE_PGID_PRIO_0_MASK) >> + I40E_CEE_PGID_PRIO_0_SHIFT); + dcbcfg->etscfg.prioritytable[i * 2] = tc; + tc = (u8)((cee_cfg->oper_prio_tc[i] & + I40E_CEE_PGID_PRIO_1_MASK) >> + I40E_CEE_PGID_PRIO_1_SHIFT); + dcbcfg->etscfg.prioritytable[i*2 + 1] = tc; + } + + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) + dcbcfg->etscfg.tcbwtable[i] = cee_cfg->oper_tc_bw[i]; + + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + if (dcbcfg->etscfg.prioritytable[i] == I40E_CEE_PGID_STRICT) { + /* Map it to next empty TC */ + dcbcfg->etscfg.prioritytable[i] = + cee_cfg->oper_num_tc - 1; + dcbcfg->etscfg.tsatable[i] = I40E_IEEE_TSA_STRICT; + } else { + dcbcfg->etscfg.tsatable[i] = I40E_IEEE_TSA_ETS; + } + } + + /* CEE PFC data to ETS config */ + dcbcfg->pfc.pfcenable = cee_cfg->oper_pfc_en; + dcbcfg->pfc.pfccap = I40E_MAX_TRAFFIC_CLASS; + + status = (tlv_status & I40E_AQC_CEE_APP_STATUS_MASK) >> + I40E_AQC_CEE_APP_STATUS_SHIFT; + err = (status & I40E_TLV_STATUS_ERR) ? 
1 : 0; + /* Add APPs if Error is False */ + if (!err) { + /* CEE operating configuration supports FCoE/iSCSI/FIP only */ + dcbcfg->numapps = I40E_CEE_OPER_MAX_APPS; + + /* FCoE APP */ + dcbcfg->app[0].priority = + (app_prio & I40E_AQC_CEE_APP_FCOE_MASK) >> + I40E_AQC_CEE_APP_FCOE_SHIFT; + dcbcfg->app[0].selector = I40E_APP_SEL_ETHTYPE; + dcbcfg->app[0].protocolid = I40E_APP_PROTOID_FCOE; + + /* iSCSI APP */ + dcbcfg->app[1].priority = + (app_prio & I40E_AQC_CEE_APP_ISCSI_MASK) >> + I40E_AQC_CEE_APP_ISCSI_SHIFT; + dcbcfg->app[1].selector = I40E_APP_SEL_TCPIP; + dcbcfg->app[1].protocolid = I40E_APP_PROTOID_ISCSI; + + /* FIP APP */ + dcbcfg->app[2].priority = + (app_prio & I40E_AQC_CEE_APP_FIP_MASK) >> + I40E_AQC_CEE_APP_FIP_SHIFT; + dcbcfg->app[2].selector = I40E_APP_SEL_ETHTYPE; + dcbcfg->app[2].protocolid = I40E_APP_PROTOID_FIP; + } +} + +/** + * i40e_cee_to_dcb_config + * @cee_cfg: pointer to CEE configuration struct + * @dcbcfg: DCB configuration struct + * + * Convert CEE configuration from firmware to DCB configuration + **/ +static void i40e_cee_to_dcb_config( + struct i40e_aqc_get_cee_dcb_cfg_resp *cee_cfg, + struct i40e_dcbx_config *dcbcfg) +{ + u32 status, tlv_status = le32_to_cpu(cee_cfg->tlv_status); + u16 app_prio = le16_to_cpu(cee_cfg->oper_app_prio); + u8 i, tc, err, sync, oper; + + /* CEE PG data to ETS config */ + dcbcfg->etscfg.maxtcs = cee_cfg->oper_num_tc; + + /* Note that the FW creates the oper_prio_tc nibbles reversed + * from those in the CEE Priority Group sub-TLV. + */ + for (i = 0; i < 4; i++) { + tc = (u8)((cee_cfg->oper_prio_tc[i] & + I40E_CEE_PGID_PRIO_0_MASK) >> + I40E_CEE_PGID_PRIO_0_SHIFT); + dcbcfg->etscfg.prioritytable[i * 2] = tc; + tc = (u8)((cee_cfg->oper_prio_tc[i] & + I40E_CEE_PGID_PRIO_1_MASK) >> + I40E_CEE_PGID_PRIO_1_SHIFT); + dcbcfg->etscfg.prioritytable[i * 2 + 1] = tc; + } + + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) + dcbcfg->etscfg.tcbwtable[i] = cee_cfg->oper_tc_bw[i]; + + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + if (dcbcfg->etscfg.prioritytable[i] == I40E_CEE_PGID_STRICT) { + /* Map it to next empty TC */ + dcbcfg->etscfg.prioritytable[i] = + cee_cfg->oper_num_tc - 1; + dcbcfg->etscfg.tsatable[i] = I40E_IEEE_TSA_STRICT; + } else { + dcbcfg->etscfg.tsatable[i] = I40E_IEEE_TSA_ETS; + } + } + + /* CEE PFC data to ETS config */ + dcbcfg->pfc.pfcenable = cee_cfg->oper_pfc_en; + dcbcfg->pfc.pfccap = I40E_MAX_TRAFFIC_CLASS; + + i = 0; + status = (tlv_status & I40E_AQC_CEE_FCOE_STATUS_MASK) >> + I40E_AQC_CEE_FCOE_STATUS_SHIFT; + err = (status & I40E_TLV_STATUS_ERR) ? 1 : 0; + sync = (status & I40E_TLV_STATUS_SYNC) ? 1 : 0; + oper = (status & I40E_TLV_STATUS_OPER) ? 1 : 0; + /* Add FCoE APP if Error is False and Oper/Sync is True */ + if (!err && sync && oper) { + /* FCoE APP */ + dcbcfg->app[i].priority = + (app_prio & I40E_AQC_CEE_APP_FCOE_MASK) >> + I40E_AQC_CEE_APP_FCOE_SHIFT; + dcbcfg->app[i].selector = I40E_APP_SEL_ETHTYPE; + dcbcfg->app[i].protocolid = I40E_APP_PROTOID_FCOE; + i++; + } + + status = (tlv_status & I40E_AQC_CEE_ISCSI_STATUS_MASK) >> + I40E_AQC_CEE_ISCSI_STATUS_SHIFT; + err = (status & I40E_TLV_STATUS_ERR) ? 1 : 0; + sync = (status & I40E_TLV_STATUS_SYNC) ? 1 : 0; + oper = (status & I40E_TLV_STATUS_OPER) ? 
1 : 0; + /* Add iSCSI APP if Error is False and Oper/Sync is True */ + if (!err && sync && oper) { + /* iSCSI APP */ + dcbcfg->app[i].priority = + (app_prio & I40E_AQC_CEE_APP_ISCSI_MASK) >> + I40E_AQC_CEE_APP_ISCSI_SHIFT; + dcbcfg->app[i].selector = I40E_APP_SEL_TCPIP; + dcbcfg->app[i].protocolid = I40E_APP_PROTOID_ISCSI; + i++; + } + + status = (tlv_status & I40E_AQC_CEE_FIP_STATUS_MASK) >> + I40E_AQC_CEE_FIP_STATUS_SHIFT; + err = (status & I40E_TLV_STATUS_ERR) ? 1 : 0; + sync = (status & I40E_TLV_STATUS_SYNC) ? 1 : 0; + oper = (status & I40E_TLV_STATUS_OPER) ? 1 : 0; + /* Add FIP APP if Error is False and Oper/Sync is True */ + if (!err && sync && oper) { + /* FIP APP */ + dcbcfg->app[i].priority = + (app_prio & I40E_AQC_CEE_APP_FIP_MASK) >> + I40E_AQC_CEE_APP_FIP_SHIFT; + dcbcfg->app[i].selector = I40E_APP_SEL_ETHTYPE; + dcbcfg->app[i].protocolid = I40E_APP_PROTOID_FIP; + i++; + } + dcbcfg->numapps = i; +} + +/** + * i40e_get_ieee_dcb_config + * @hw: pointer to the hw struct + * + * Get IEEE mode DCB configuration from the Firmware + **/ +static i40e_status i40e_get_ieee_dcb_config(struct i40e_hw *hw) +{ + i40e_status ret = 0; + + /* IEEE mode */ + hw->local_dcbx_config.dcbx_mode = I40E_DCBX_MODE_IEEE; + /* Get Local DCB Config */ + ret = i40e_aq_get_dcb_config(hw, I40E_AQ_LLDP_MIB_LOCAL, 0, + &hw->local_dcbx_config); + if (ret) + goto out; + + /* Get Remote DCB Config */ + ret = i40e_aq_get_dcb_config(hw, I40E_AQ_LLDP_MIB_REMOTE, + I40E_AQ_LLDP_BRIDGE_TYPE_NEAREST_BRIDGE, + &hw->remote_dcbx_config); + /* Don't treat ENOENT as an error for Remote MIBs */ + if (hw->aq.asq_last_status == I40E_AQ_RC_ENOENT) + ret = 0; + +out: + return ret; +} + +/** + * i40e_get_dcb_config + * @hw: pointer to the hw struct + * + * Get DCB configuration from the Firmware + **/ +i40e_status i40e_get_dcb_config(struct i40e_hw *hw) +{ + i40e_status ret = 0; + struct i40e_aqc_get_cee_dcb_cfg_resp cee_cfg; + struct i40e_aqc_get_cee_dcb_cfg_v1_resp cee_v1_cfg; + + /* If Firmware version < v4.33 on X710/XL710, IEEE only */ + if ((hw->mac.type == I40E_MAC_XL710) && + (((hw->aq.fw_maj_ver == 4) && (hw->aq.fw_min_ver < 33)) || + (hw->aq.fw_maj_ver < 4))) + return i40e_get_ieee_dcb_config(hw); + + /* If Firmware version == v4.33 on X710/XL710, use old CEE struct */ + if ((hw->mac.type == I40E_MAC_XL710) && + ((hw->aq.fw_maj_ver == 4) && (hw->aq.fw_min_ver == 33))) { + ret = i40e_aq_get_cee_dcb_config(hw, &cee_v1_cfg, + sizeof(cee_v1_cfg), NULL); + if (!ret) { + /* CEE mode */ + hw->local_dcbx_config.dcbx_mode = I40E_DCBX_MODE_CEE; + hw->local_dcbx_config.tlv_status = + le16_to_cpu(cee_v1_cfg.tlv_status); + i40e_cee_to_dcb_v1_config(&cee_v1_cfg, + &hw->local_dcbx_config); + } + } else { + ret = i40e_aq_get_cee_dcb_config(hw, &cee_cfg, + sizeof(cee_cfg), NULL); + if (!ret) { + /* CEE mode */ + hw->local_dcbx_config.dcbx_mode = I40E_DCBX_MODE_CEE; + hw->local_dcbx_config.tlv_status = + le32_to_cpu(cee_cfg.tlv_status); + i40e_cee_to_dcb_config(&cee_cfg, + &hw->local_dcbx_config); + } + } + + /* CEE mode not enabled try querying IEEE data */ + if (hw->aq.asq_last_status == I40E_AQ_RC_ENOENT) + return i40e_get_ieee_dcb_config(hw); + + if (ret) + goto out; + + /* Get CEE DCB Desired Config */ + ret = i40e_aq_get_dcb_config(hw, I40E_AQ_LLDP_MIB_LOCAL, 0, + &hw->desired_dcbx_config); + if (ret) + goto out; + + /* Get Remote DCB Config */ + ret = i40e_aq_get_dcb_config(hw, I40E_AQ_LLDP_MIB_REMOTE, + I40E_AQ_LLDP_BRIDGE_TYPE_NEAREST_BRIDGE, + &hw->remote_dcbx_config); + /* Don't treat ENOENT as an error for Remote MIBs */ + if 
(hw->aq.asq_last_status == I40E_AQ_RC_ENOENT) + ret = 0; + +out: + return ret; +} + +/** + * i40e_init_dcb + * @hw: pointer to the hw struct + * + * Update DCB configuration from the Firmware + **/ +i40e_status i40e_init_dcb(struct i40e_hw *hw) +{ + i40e_status ret = 0; + struct i40e_lldp_variables lldp_cfg; + u8 adminstatus = 0; + + if (!hw->func_caps.dcb) + return ret; + + /* Read LLDP NVM area */ + ret = i40e_read_lldp_cfg(hw, &lldp_cfg); + if (ret) + return ret; + + /* Get the LLDP AdminStatus for the current port */ + adminstatus = lldp_cfg.adminstatus >> (hw->port * 4); + adminstatus &= 0xF; + + /* LLDP agent disabled */ + if (!adminstatus) { + hw->dcbx_status = I40E_DCBX_STATUS_DISABLED; + return ret; + } + + /* Get DCBX status */ + ret = i40e_get_dcbx_status(hw, &hw->dcbx_status); + if (ret) + return ret; + + /* Check the DCBX Status */ + switch (hw->dcbx_status) { + case I40E_DCBX_STATUS_DONE: + case I40E_DCBX_STATUS_IN_PROGRESS: + /* Get current DCBX configuration */ + ret = i40e_get_dcb_config(hw); + if (ret) + return ret; + break; + case I40E_DCBX_STATUS_DISABLED: + return ret; + case I40E_DCBX_STATUS_NOT_STARTED: + case I40E_DCBX_STATUS_MULTIPLE_PEERS: + default: + break; + } + + /* Configure the LLDP MIB change event */ + ret = i40e_aq_cfg_lldp_mib_change_event(hw, true, NULL); + if (ret) + return ret; + + return ret; +} + +/** + * i40e_read_lldp_cfg - read LLDP Configuration data from NVM + * @hw: pointer to the HW structure + * @lldp_cfg: pointer to hold lldp configuration variables + * + * Reads the LLDP configuration data from NVM + **/ +i40e_status i40e_read_lldp_cfg(struct i40e_hw *hw, + struct i40e_lldp_variables *lldp_cfg) +{ + i40e_status ret = 0; + u32 offset = (2 * I40E_NVM_LLDP_CFG_PTR); + + if (!lldp_cfg) + return I40E_ERR_PARAM; + + ret = i40e_acquire_nvm(hw, I40E_RESOURCE_READ); + if (ret) + goto err_lldp_cfg; + + ret = i40e_aq_read_nvm(hw, I40E_SR_EMP_MODULE_PTR, offset, + sizeof(struct i40e_lldp_variables), + (u8 *)lldp_cfg, + true, NULL); + i40e_release_nvm(hw); + +err_lldp_cfg: + return ret; +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.h b/drivers/net/ethernet/intel/i40e/i40e_dcb.h new file mode 100644 index 0000000..4f80638 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_DCB_H_ +#define _I40E_DCB_H_ + +#include "i40e_type.h" + +#define I40E_DCBX_STATUS_NOT_STARTED 0 +#define I40E_DCBX_STATUS_IN_PROGRESS 1 +#define I40E_DCBX_STATUS_DONE 2 +#define I40E_DCBX_STATUS_MULTIPLE_PEERS 3 +#define I40E_DCBX_STATUS_DISABLED 7 + +#define I40E_TLV_TYPE_END 0 +#define I40E_TLV_TYPE_ORG 127 + +#define I40E_IEEE_8021QAZ_OUI 0x0080C2 +#define I40E_IEEE_SUBTYPE_ETS_CFG 9 +#define I40E_IEEE_SUBTYPE_ETS_REC 10 +#define I40E_IEEE_SUBTYPE_PFC_CFG 11 +#define I40E_IEEE_SUBTYPE_APP_PRI 12 + +#define I40E_CEE_DCBX_OUI 0x001b21 +#define I40E_CEE_DCBX_TYPE 2 + +#define I40E_CEE_SUBTYPE_CTRL 1 +#define I40E_CEE_SUBTYPE_PG_CFG 2 +#define I40E_CEE_SUBTYPE_PFC_CFG 3 +#define I40E_CEE_SUBTYPE_APP_PRI 4 + +#define I40E_CEE_MAX_FEAT_TYPE 3 +/* Defines for LLDP TLV header */ +#define I40E_LLDP_TLV_LEN_SHIFT 0 +#define I40E_LLDP_TLV_LEN_MASK (0x01FF << I40E_LLDP_TLV_LEN_SHIFT) +#define I40E_LLDP_TLV_TYPE_SHIFT 9 +#define I40E_LLDP_TLV_TYPE_MASK (0x7F << I40E_LLDP_TLV_TYPE_SHIFT) +#define I40E_LLDP_TLV_SUBTYPE_SHIFT 0 +#define I40E_LLDP_TLV_SUBTYPE_MASK (0xFF << I40E_LLDP_TLV_SUBTYPE_SHIFT) +#define I40E_LLDP_TLV_OUI_SHIFT 8 +#define I40E_LLDP_TLV_OUI_MASK (0xFFFFFF << I40E_LLDP_TLV_OUI_SHIFT) + +/* Defines for IEEE ETS TLV */ +#define I40E_IEEE_ETS_MAXTC_SHIFT 0 +#define I40E_IEEE_ETS_MAXTC_MASK (0x7 << I40E_IEEE_ETS_MAXTC_SHIFT) +#define I40E_IEEE_ETS_CBS_SHIFT 6 +#define I40E_IEEE_ETS_CBS_MASK BIT(I40E_IEEE_ETS_CBS_SHIFT) +#define I40E_IEEE_ETS_WILLING_SHIFT 7 +#define I40E_IEEE_ETS_WILLING_MASK BIT(I40E_IEEE_ETS_WILLING_SHIFT) +#define I40E_IEEE_ETS_PRIO_0_SHIFT 0 +#define I40E_IEEE_ETS_PRIO_0_MASK (0x7 << I40E_IEEE_ETS_PRIO_0_SHIFT) +#define I40E_IEEE_ETS_PRIO_1_SHIFT 4 +#define I40E_IEEE_ETS_PRIO_1_MASK (0x7 << I40E_IEEE_ETS_PRIO_1_SHIFT) +#define I40E_CEE_PGID_PRIO_0_SHIFT 0 +#define I40E_CEE_PGID_PRIO_0_MASK (0xF << I40E_CEE_PGID_PRIO_0_SHIFT) +#define I40E_CEE_PGID_PRIO_1_SHIFT 4 +#define I40E_CEE_PGID_PRIO_1_MASK (0xF << I40E_CEE_PGID_PRIO_1_SHIFT) +#define I40E_CEE_PGID_STRICT 15 + +/* Defines for IEEE TSA types */ +#define I40E_IEEE_TSA_STRICT 0 +#define I40E_IEEE_TSA_ETS 2 + +/* Defines for IEEE PFC TLV */ +#define I40E_IEEE_PFC_CAP_SHIFT 0 +#define I40E_IEEE_PFC_CAP_MASK (0xF << I40E_IEEE_PFC_CAP_SHIFT) +#define I40E_IEEE_PFC_MBC_SHIFT 6 +#define I40E_IEEE_PFC_MBC_MASK BIT(I40E_IEEE_PFC_MBC_SHIFT) +#define I40E_IEEE_PFC_WILLING_SHIFT 7 +#define I40E_IEEE_PFC_WILLING_MASK BIT(I40E_IEEE_PFC_WILLING_SHIFT) + +/* Defines for IEEE APP TLV */ +#define I40E_IEEE_APP_SEL_SHIFT 0 +#define I40E_IEEE_APP_SEL_MASK (0x7 << I40E_IEEE_APP_SEL_SHIFT) +#define I40E_IEEE_APP_PRIO_SHIFT 5 +#define I40E_IEEE_APP_PRIO_MASK (0x7 << I40E_IEEE_APP_PRIO_SHIFT) + + +#pragma pack(1) + +/* IEEE 802.1AB LLDP Organization specific TLV */ +struct i40e_lldp_org_tlv { + __be16 typelength; + __be32 ouisubtype; + u8 tlvinfo[1]; +}; + +struct i40e_cee_tlv_hdr { + __be16 typelen; + u8 operver; + u8 maxver; +}; + +struct i40e_cee_ctrl_tlv { + struct i40e_cee_tlv_hdr hdr; + __be32 seqno; + __be32 ackno; +}; + +struct i40e_cee_feat_tlv { + struct i40e_cee_tlv_hdr hdr; + u8 en_will_err; /* Bits: |En|Will|Err|Reserved(5)| */ +#define I40E_CEE_FEAT_TLV_ENABLE_MASK 0x80 +#define I40E_CEE_FEAT_TLV_WILLING_MASK 0x40 +#define I40E_CEE_FEAT_TLV_ERR_MASK 0x20 + u8 subtype; + u8 tlvinfo[1]; +}; + +struct i40e_cee_app_prio { + __be16 protocol; + u8 upper_oui_sel; /* Bits: |Upper 
OUI(6)|Selector(2)| */ +#define I40E_CEE_APP_SELECTOR_MASK 0x03 + __be16 lower_oui; + u8 prio_map; +}; +#pragma pack() + +i40e_status i40e_get_dcbx_status(struct i40e_hw *hw, + u16 *status); +i40e_status i40e_lldp_to_dcb_config(u8 *lldpmib, + struct i40e_dcbx_config *dcbcfg); +i40e_status i40e_aq_get_dcb_config(struct i40e_hw *hw, u8 mib_type, + u8 bridgetype, + struct i40e_dcbx_config *dcbcfg); +i40e_status i40e_get_dcb_config(struct i40e_hw *hw); +i40e_status i40e_init_dcb(struct i40e_hw *hw); +#endif /* _I40E_DCB_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c b/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c new file mode 100644 index 0000000..502818e --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifdef CONFIG_I40E_DCB +#include "i40e.h" +#include + +/** + * i40e_get_pfc_delay - retrieve PFC Link Delay + * @hw: pointer to hardware struct + * @delay: holds the PFC Link delay value + * + * Returns PFC Link Delay from the PRTDCB_GENC.PFCLDA + **/ +static void i40e_get_pfc_delay(struct i40e_hw *hw, u16 *delay) +{ + u32 val; + + val = rd32(hw, I40E_PRTDCB_GENC); + *delay = (u16)((val & I40E_PRTDCB_GENC_PFCLDA_MASK) >> + I40E_PRTDCB_GENC_PFCLDA_SHIFT); +} + +/** + * i40e_dcbnl_ieee_getets - retrieve local IEEE ETS configuration + * @netdev: the corresponding netdev + * @ets: structure to hold the ETS information + * + * Returns local IEEE ETS configuration + **/ +static int i40e_dcbnl_ieee_getets(struct net_device *dev, + struct ieee_ets *ets) +{ + struct i40e_pf *pf = i40e_netdev_to_pf(dev); + struct i40e_dcbx_config *dcbxcfg; + struct i40e_hw *hw = &pf->hw; + + if (!(pf->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) + return -EINVAL; + + dcbxcfg = &hw->local_dcbx_config; + ets->willing = dcbxcfg->etscfg.willing; + ets->ets_cap = dcbxcfg->etscfg.maxtcs; + ets->cbs = dcbxcfg->etscfg.cbs; + memcpy(ets->tc_tx_bw, dcbxcfg->etscfg.tcbwtable, + sizeof(ets->tc_tx_bw)); + memcpy(ets->tc_rx_bw, dcbxcfg->etscfg.tcbwtable, + sizeof(ets->tc_rx_bw)); + memcpy(ets->tc_tsa, dcbxcfg->etscfg.tsatable, + sizeof(ets->tc_tsa)); + memcpy(ets->prio_tc, dcbxcfg->etscfg.prioritytable, + sizeof(ets->prio_tc)); + memcpy(ets->tc_reco_bw, dcbxcfg->etsrec.tcbwtable, + sizeof(ets->tc_reco_bw)); + memcpy(ets->tc_reco_tsa, dcbxcfg->etsrec.tsatable, + sizeof(ets->tc_reco_tsa)); + memcpy(ets->reco_prio_tc, dcbxcfg->etscfg.prioritytable, + sizeof(ets->reco_prio_tc)); 
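+	/* The parsed DCBX config keeps one operational ETS bandwidth table,
+	 * so it is reported for both Tx and Rx above; tc_reco_bw and
+	 * tc_reco_tsa come from the ETS recommendation TLV (etsrec), while
+	 * reco_prio_tc reuses the operational priority table.
+	 */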
+ + return 0; +} + +/** + * i40e_dcbnl_ieee_getpfc - retrieve local IEEE PFC configuration + * @netdev: the corresponding netdev + * @ets: structure to hold the PFC information + * + * Returns local IEEE PFC configuration + **/ +static int i40e_dcbnl_ieee_getpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct i40e_pf *pf = i40e_netdev_to_pf(dev); + struct i40e_dcbx_config *dcbxcfg; + struct i40e_hw *hw = &pf->hw; + int i; + + if (!(pf->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) + return -EINVAL; + + dcbxcfg = &hw->local_dcbx_config; + pfc->pfc_cap = dcbxcfg->pfc.pfccap; + pfc->pfc_en = dcbxcfg->pfc.pfcenable; + pfc->mbc = dcbxcfg->pfc.mbc; + i40e_get_pfc_delay(hw, &pfc->delay); + + /* Get Requests/Indicatiosn */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + pfc->requests[i] = pf->stats.priority_xoff_tx[i]; + pfc->indications[i] = pf->stats.priority_xoff_rx[i]; + } + + return 0; +} + +/** + * i40e_dcbnl_getdcbx - retrieve current DCBx capability + * @netdev: the corresponding netdev + * + * Returns DCBx capability features + **/ +static u8 i40e_dcbnl_getdcbx(struct net_device *dev) +{ + struct i40e_pf *pf = i40e_netdev_to_pf(dev); + + return pf->dcbx_cap; +} + +/** + * i40e_dcbnl_get_perm_hw_addr - MAC address used by DCBx + * @netdev: the corresponding netdev + * + * Returns the SAN MAC address used for LLDP exchange + **/ +static void i40e_dcbnl_get_perm_hw_addr(struct net_device *dev, + u8 *perm_addr) +{ + struct i40e_pf *pf = i40e_netdev_to_pf(dev); + int i, j; + + memset(perm_addr, 0xff, MAX_ADDR_LEN); + + for (i = 0; i < dev->addr_len; i++) + perm_addr[i] = pf->hw.mac.perm_addr[i]; + + for (j = 0; j < dev->addr_len; j++, i++) + perm_addr[i] = pf->hw.mac.san_addr[j]; +} + +static const struct dcbnl_rtnl_ops dcbnl_ops = { + .ieee_getets = i40e_dcbnl_ieee_getets, + .ieee_getpfc = i40e_dcbnl_ieee_getpfc, + .getdcbx = i40e_dcbnl_getdcbx, + .getpermhwaddr = i40e_dcbnl_get_perm_hw_addr, +}; + +/** + * i40e_dcbnl_set_all - set all the apps and ieee data from DCBx config + * @vsi: the corresponding vsi + * + * Set up all the IEEE APPs in the DCBNL App Table and generate event for + * other settings + **/ +void i40e_dcbnl_set_all(struct i40e_vsi *vsi) +{ + struct net_device *dev = vsi->netdev; + struct i40e_pf *pf = i40e_netdev_to_pf(dev); + struct i40e_dcbx_config *dcbxcfg; + struct i40e_hw *hw = &pf->hw; + struct dcb_app sapp; + u8 prio, tc_map; + int i; + + /* DCB not enabled */ + if (!(pf->flags & I40E_FLAG_DCB_ENABLED)) + return; + + /* MFP mode but not an iSCSI PF so return */ + if ((pf->flags & I40E_FLAG_MFP_ENABLED) && !(pf->hw.func_caps.iscsi)) + return; + + dcbxcfg = &hw->local_dcbx_config; + + /* Set up all the App TLVs if DCBx is negotiated */ + for (i = 0; i < dcbxcfg->numapps; i++) { + prio = dcbxcfg->app[i].priority; + tc_map = BIT(dcbxcfg->etscfg.prioritytable[prio]); + + /* Add APP only if the TC is enabled for this VSI */ + if (tc_map & vsi->tc_config.enabled_tc) { + sapp.selector = dcbxcfg->app[i].selector; + sapp.protocol = dcbxcfg->app[i].protocolid; + sapp.priority = prio; + dcb_ieee_setapp(dev, &sapp); + } + } + + /* Notify user-space of the changes */ + dcbnl_ieee_notify(dev, RTM_SETDCB, DCB_CMD_IEEE_SET, 0, 0); +} + +/** + * i40e_dcbnl_vsi_del_app - Delete APP for given VSI + * @vsi: the corresponding vsi + * @app: APP to delete + * + * Delete given APP from the DCBNL APP table for given + * VSI + **/ +static int i40e_dcbnl_vsi_del_app(struct i40e_vsi *vsi, + struct i40e_dcb_app_priority_table *app) +{ + struct net_device *dev = vsi->netdev; + struct dcb_app 
sapp; + + if (!dev) + return -EINVAL; + + sapp.selector = app->selector; + sapp.protocol = app->protocolid; + sapp.priority = app->priority; + return dcb_ieee_delapp(dev, &sapp); +} + +/** + * i40e_dcbnl_del_app - Delete APP on all VSIs + * @pf: the corresponding PF + * @app: APP to delete + * + * Delete given APP from all the VSIs for given PF + **/ +static void i40e_dcbnl_del_app(struct i40e_pf *pf, + struct i40e_dcb_app_priority_table *app) +{ + int v, err; + + for (v = 0; v < pf->num_alloc_vsi; v++) { + if (pf->vsi[v] && pf->vsi[v]->netdev) { + err = i40e_dcbnl_vsi_del_app(pf->vsi[v], app); + dev_dbg(&pf->pdev->dev, "Deleting app for VSI seid=%d err=%d sel=%d proto=0x%x prio=%d\n", + pf->vsi[v]->seid, err, app->selector, + app->protocolid, app->priority); + } + } +} + +/** + * i40e_dcbnl_find_app - Search APP in given DCB config + * @cfg: DCBX configuration data + * @app: APP to search for + * + * Find given APP in the DCB configuration + **/ +static bool i40e_dcbnl_find_app(struct i40e_dcbx_config *cfg, + struct i40e_dcb_app_priority_table *app) +{ + int i; + + for (i = 0; i < cfg->numapps; i++) { + if (app->selector == cfg->app[i].selector && + app->protocolid == cfg->app[i].protocolid && + app->priority == cfg->app[i].priority) + return true; + } + + return false; +} + +/** + * i40e_dcbnl_flush_apps - Delete all removed APPs + * @pf: the corresponding PF + * @old_cfg: old DCBX configuration data + * @new_cfg: new DCBX configuration data + * + * Find and delete all APPs that are not present in the passed + * DCB configuration + **/ +void i40e_dcbnl_flush_apps(struct i40e_pf *pf, + struct i40e_dcbx_config *old_cfg, + struct i40e_dcbx_config *new_cfg) +{ + struct i40e_dcb_app_priority_table app; + int i; + + /* MFP mode but not an iSCSI PF so return */ + if ((pf->flags & I40E_FLAG_MFP_ENABLED) && !(pf->hw.func_caps.iscsi)) + return; + + for (i = 0; i < old_cfg->numapps; i++) { + app = old_cfg->app[i]; + /* The APP is not available anymore delete it */ + if (!i40e_dcbnl_find_app(new_cfg, &app)) + i40e_dcbnl_del_app(pf, &app); + } +} + +/** + * i40e_dcbnl_setup - DCBNL setup + * @vsi: the corresponding vsi + * + * Set up DCBNL ops and initial APP TLVs + **/ +void i40e_dcbnl_setup(struct i40e_vsi *vsi) +{ + struct net_device *dev = vsi->netdev; + struct i40e_pf *pf = i40e_netdev_to_pf(dev); + + /* Not DCB capable */ + if (!(pf->flags & I40E_FLAG_DCB_CAPABLE)) + return; + + dev->dcbnl_ops = &dcbnl_ops; + + /* Set initial IEEE DCB settings */ + i40e_dcbnl_set_all(vsi); +} +#endif /* CONFIG_I40E_DCB */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c new file mode 100644 index 0000000..d494dca --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -0,0 +1,1838 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifdef CONFIG_DEBUG_FS + +#include +#include + +#include "i40e.h" + +static struct dentry *i40e_dbg_root; + +/** + * i40e_dbg_find_vsi - searches for the vsi with the given seid + * @pf - the PF structure to search for the vsi + * @seid - seid of the vsi it is searching for + **/ +static struct i40e_vsi *i40e_dbg_find_vsi(struct i40e_pf *pf, int seid) +{ + int i; + + if (seid < 0) + dev_info(&pf->pdev->dev, "%d: bad seid\n", seid); + else + for (i = 0; i < pf->num_alloc_vsi; i++) + if (pf->vsi[i] && (pf->vsi[i]->seid == seid)) + return pf->vsi[i]; + + return NULL; +} + +/** + * i40e_dbg_find_veb - searches for the veb with the given seid + * @pf - the PF structure to search for the veb + * @seid - seid of the veb it is searching for + **/ +static struct i40e_veb *i40e_dbg_find_veb(struct i40e_pf *pf, int seid) +{ + int i; + + for (i = 0; i < I40E_MAX_VEB; i++) + if (pf->veb[i] && pf->veb[i]->seid == seid) + return pf->veb[i]; + return NULL; +} + +/************************************************************** + * command + * The command entry in debugfs is for giving the driver commands + * to be executed - these may be for changing the internal switch + * setup, adding or removing filters, or other things. Many of + * these will be useful for some forms of unit testing. + **************************************************************/ +static char i40e_dbg_command_buf[256] = ""; + +/** + * i40e_dbg_command_read - read for command datum + * @filp: the opened file + * @buffer: where to write the data for the user to read + * @count: the size of the user's buffer + * @ppos: file position offset + **/ +static ssize_t i40e_dbg_command_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct i40e_pf *pf = filp->private_data; + int bytes_not_copied; + int buf_size = 256; + char *buf; + int len; + + /* don't allow partial reads */ + if (*ppos != 0) + return 0; + if (count < buf_size) + return -ENOSPC; + + buf = kzalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOSPC; + + len = snprintf(buf, buf_size, "%s: %s\n", + pf->vsi[pf->lan_vsi]->netdev->name, + i40e_dbg_command_buf); + + bytes_not_copied = copy_to_user(buffer, buf, len); + kfree(buf); + + if (bytes_not_copied) + return -EFAULT; + + *ppos = len; + return len; +} + +static char *i40e_filter_state_string[] = { + "INVALID", + "NEW", + "ACTIVE", + "FAILED", + "REMOVE", +}; + +/** + * i40e_dbg_dump_vsi_seid - handles dump vsi seid write into command datum + * @pf: the i40e_pf created in command write + * @seid: the seid the user put in + **/ +static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) +{ + struct rtnl_link_stats64 *nstat; + struct i40e_mac_filter *f; + struct i40e_vsi *vsi; + int i, bkt; + + vsi = i40e_dbg_find_vsi(pf, seid); + if (!vsi) { + dev_info(&pf->pdev->dev, + "dump %d: seid not found\n", seid); + return; + } + dev_info(&pf->pdev->dev, "vsi seid %d\n", seid); + if (vsi->netdev) { + struct net_device *nd = vsi->netdev; + + dev_info(&pf->pdev->dev, " netdev: name = %s, state = %lu, flags = 0x%08x\n", + nd->name, nd->state, nd->flags); + 
dev_info(&pf->pdev->dev, " features = 0x%08lx\n", + (unsigned long int)nd->features); + dev_info(&pf->pdev->dev, " hw_features = 0x%08lx\n", + (unsigned long int)nd->hw_features); + dev_info(&pf->pdev->dev, " vlan_features = 0x%08lx\n", + (unsigned long int)nd->vlan_features); + } + dev_info(&pf->pdev->dev, " active_vlans is %s\n", + vsi->active_vlans ? "" : ""); + dev_info(&pf->pdev->dev, + " flags = 0x%08lx, netdev_registered = %i, current_netdev_flags = 0x%04x\n", + vsi->flags, vsi->netdev_registered, vsi->current_netdev_flags); + for (i = 0; i < BITS_TO_LONGS(__I40E_VSI_STATE_SIZE__); i++) + dev_info(&pf->pdev->dev, + " state[%d] = %08lx\n", + i, vsi->state[i]); + if (vsi == pf->vsi[pf->lan_vsi]) + dev_info(&pf->pdev->dev, " MAC address: %pM SAN MAC: %pM Port MAC: %pM\n", + pf->hw.mac.addr, + pf->hw.mac.san_addr, + pf->hw.mac.port_addr); + hash_for_each(vsi->mac_filter_hash, bkt, f, hlist) { + dev_info(&pf->pdev->dev, + " mac_filter_hash: %pM vid=%d, state %s\n", + f->macaddr, f->vlan, + i40e_filter_state_string[f->state]); + } + dev_info(&pf->pdev->dev, " active_filters %u, promisc_threshold %u, overflow promisc %s\n", + vsi->active_filters, vsi->promisc_threshold, + (test_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state) ? + "ON" : "OFF")); + nstat = i40e_get_vsi_stats_struct(vsi); + dev_info(&pf->pdev->dev, + " net_stats: rx_packets = %lu, rx_bytes = %lu, rx_errors = %lu, rx_dropped = %lu\n", + (unsigned long int)nstat->rx_packets, + (unsigned long int)nstat->rx_bytes, + (unsigned long int)nstat->rx_errors, + (unsigned long int)nstat->rx_dropped); + dev_info(&pf->pdev->dev, + " net_stats: tx_packets = %lu, tx_bytes = %lu, tx_errors = %lu, tx_dropped = %lu\n", + (unsigned long int)nstat->tx_packets, + (unsigned long int)nstat->tx_bytes, + (unsigned long int)nstat->tx_errors, + (unsigned long int)nstat->tx_dropped); + dev_info(&pf->pdev->dev, + " net_stats: multicast = %lu, collisions = %lu\n", + (unsigned long int)nstat->multicast, + (unsigned long int)nstat->collisions); + dev_info(&pf->pdev->dev, + " net_stats: rx_length_errors = %lu, rx_over_errors = %lu, rx_crc_errors = %lu\n", + (unsigned long int)nstat->rx_length_errors, + (unsigned long int)nstat->rx_over_errors, + (unsigned long int)nstat->rx_crc_errors); + dev_info(&pf->pdev->dev, + " net_stats: rx_frame_errors = %lu, rx_fifo_errors = %lu, rx_missed_errors = %lu\n", + (unsigned long int)nstat->rx_frame_errors, + (unsigned long int)nstat->rx_fifo_errors, + (unsigned long int)nstat->rx_missed_errors); + dev_info(&pf->pdev->dev, + " net_stats: tx_aborted_errors = %lu, tx_carrier_errors = %lu, tx_fifo_errors = %lu\n", + (unsigned long int)nstat->tx_aborted_errors, + (unsigned long int)nstat->tx_carrier_errors, + (unsigned long int)nstat->tx_fifo_errors); + dev_info(&pf->pdev->dev, + " net_stats: tx_heartbeat_errors = %lu, tx_window_errors = %lu\n", + (unsigned long int)nstat->tx_heartbeat_errors, + (unsigned long int)nstat->tx_window_errors); + dev_info(&pf->pdev->dev, + " net_stats: rx_compressed = %lu, tx_compressed = %lu\n", + (unsigned long int)nstat->rx_compressed, + (unsigned long int)nstat->tx_compressed); + dev_info(&pf->pdev->dev, + " net_stats_offsets: rx_packets = %lu, rx_bytes = %lu, rx_errors = %lu, rx_dropped = %lu\n", + (unsigned long int)vsi->net_stats_offsets.rx_packets, + (unsigned long int)vsi->net_stats_offsets.rx_bytes, + (unsigned long int)vsi->net_stats_offsets.rx_errors, + (unsigned long int)vsi->net_stats_offsets.rx_dropped); + dev_info(&pf->pdev->dev, + " net_stats_offsets: tx_packets = %lu, tx_bytes = %lu, 
tx_errors = %lu, tx_dropped = %lu\n", + (unsigned long int)vsi->net_stats_offsets.tx_packets, + (unsigned long int)vsi->net_stats_offsets.tx_bytes, + (unsigned long int)vsi->net_stats_offsets.tx_errors, + (unsigned long int)vsi->net_stats_offsets.tx_dropped); + dev_info(&pf->pdev->dev, + " net_stats_offsets: multicast = %lu, collisions = %lu\n", + (unsigned long int)vsi->net_stats_offsets.multicast, + (unsigned long int)vsi->net_stats_offsets.collisions); + dev_info(&pf->pdev->dev, + " net_stats_offsets: rx_length_errors = %lu, rx_over_errors = %lu, rx_crc_errors = %lu\n", + (unsigned long int)vsi->net_stats_offsets.rx_length_errors, + (unsigned long int)vsi->net_stats_offsets.rx_over_errors, + (unsigned long int)vsi->net_stats_offsets.rx_crc_errors); + dev_info(&pf->pdev->dev, + " net_stats_offsets: rx_frame_errors = %lu, rx_fifo_errors = %lu, rx_missed_errors = %lu\n", + (unsigned long int)vsi->net_stats_offsets.rx_frame_errors, + (unsigned long int)vsi->net_stats_offsets.rx_fifo_errors, + (unsigned long int)vsi->net_stats_offsets.rx_missed_errors); + dev_info(&pf->pdev->dev, + " net_stats_offsets: tx_aborted_errors = %lu, tx_carrier_errors = %lu, tx_fifo_errors = %lu\n", + (unsigned long int)vsi->net_stats_offsets.tx_aborted_errors, + (unsigned long int)vsi->net_stats_offsets.tx_carrier_errors, + (unsigned long int)vsi->net_stats_offsets.tx_fifo_errors); + dev_info(&pf->pdev->dev, + " net_stats_offsets: tx_heartbeat_errors = %lu, tx_window_errors = %lu\n", + (unsigned long int)vsi->net_stats_offsets.tx_heartbeat_errors, + (unsigned long int)vsi->net_stats_offsets.tx_window_errors); + dev_info(&pf->pdev->dev, + " net_stats_offsets: rx_compressed = %lu, tx_compressed = %lu\n", + (unsigned long int)vsi->net_stats_offsets.rx_compressed, + (unsigned long int)vsi->net_stats_offsets.tx_compressed); + dev_info(&pf->pdev->dev, + " tx_restart = %d, tx_busy = %d, rx_buf_failed = %d, rx_page_failed = %d\n", + vsi->tx_restart, vsi->tx_busy, + vsi->rx_buf_failed, vsi->rx_page_failed); + rcu_read_lock(); + for (i = 0; i < vsi->num_queue_pairs; i++) { + struct i40e_ring *rx_ring = READ_ONCE(vsi->rx_rings[i]); + + if (!rx_ring) + continue; + + dev_info(&pf->pdev->dev, + " rx_rings[%i]: state = %lu, queue_index = %d, reg_idx = %d\n", + i, *rx_ring->state, + rx_ring->queue_index, + rx_ring->reg_idx); + dev_info(&pf->pdev->dev, + " rx_rings[%i]: rx_buf_len = %d\n", + i, rx_ring->rx_buf_len); + dev_info(&pf->pdev->dev, + " rx_rings[%i]: next_to_use = %d, next_to_clean = %d, ring_active = %i\n", + i, + rx_ring->next_to_use, + rx_ring->next_to_clean, + rx_ring->ring_active); + dev_info(&pf->pdev->dev, + " rx_rings[%i]: rx_stats: packets = %lld, bytes = %lld, non_eop_descs = %lld\n", + i, rx_ring->stats.packets, + rx_ring->stats.bytes, + rx_ring->rx_stats.non_eop_descs); + dev_info(&pf->pdev->dev, + " rx_rings[%i]: rx_stats: alloc_page_failed = %lld, alloc_buff_failed = %lld\n", + i, + rx_ring->rx_stats.alloc_page_failed, + rx_ring->rx_stats.alloc_buff_failed); + dev_info(&pf->pdev->dev, + " rx_rings[%i]: rx_stats: realloc_count = %lld, page_reuse_count = %lld\n", + i, + rx_ring->rx_stats.realloc_count, + rx_ring->rx_stats.page_reuse_count); + dev_info(&pf->pdev->dev, + " rx_rings[%i]: size = %i\n", + i, rx_ring->size); + dev_info(&pf->pdev->dev, + " rx_rings[%i]: itr_setting = %d (%s)\n", + i, rx_ring->itr_setting, + ITR_IS_DYNAMIC(rx_ring->itr_setting) ? 
"dynamic" : "fixed"); + } + for (i = 0; i < vsi->num_queue_pairs; i++) { + struct i40e_ring *tx_ring = READ_ONCE(vsi->tx_rings[i]); + + if (!tx_ring) + continue; + + dev_info(&pf->pdev->dev, + " tx_rings[%i]: state = %lu, queue_index = %d, reg_idx = %d\n", + i, *tx_ring->state, + tx_ring->queue_index, + tx_ring->reg_idx); + dev_info(&pf->pdev->dev, + " tx_rings[%i]: next_to_use = %d, next_to_clean = %d, ring_active = %i\n", + i, + tx_ring->next_to_use, + tx_ring->next_to_clean, + tx_ring->ring_active); + dev_info(&pf->pdev->dev, + " tx_rings[%i]: tx_stats: packets = %lld, bytes = %lld, restart_queue = %lld\n", + i, tx_ring->stats.packets, + tx_ring->stats.bytes, + tx_ring->tx_stats.restart_queue); + dev_info(&pf->pdev->dev, + " tx_rings[%i]: tx_stats: tx_busy = %lld, tx_done_old = %lld\n", + i, + tx_ring->tx_stats.tx_busy, + tx_ring->tx_stats.tx_done_old); + dev_info(&pf->pdev->dev, + " tx_rings[%i]: size = %i\n", + i, tx_ring->size); + dev_info(&pf->pdev->dev, + " tx_rings[%i]: DCB tc = %d\n", + i, tx_ring->dcb_tc); + dev_info(&pf->pdev->dev, + " tx_rings[%i]: itr_setting = %d (%s)\n", + i, tx_ring->itr_setting, + ITR_IS_DYNAMIC(tx_ring->itr_setting) ? "dynamic" : "fixed"); + } + rcu_read_unlock(); + dev_info(&pf->pdev->dev, + " work_limit = %d\n", + vsi->work_limit); + dev_info(&pf->pdev->dev, + " max_frame = %d, rx_buf_len = %d dtype = %d\n", + vsi->max_frame, vsi->rx_buf_len, 0); + dev_info(&pf->pdev->dev, + " num_q_vectors = %i, base_vector = %i\n", + vsi->num_q_vectors, vsi->base_vector); + dev_info(&pf->pdev->dev, + " seid = %d, id = %d, uplink_seid = %d\n", + vsi->seid, vsi->id, vsi->uplink_seid); + dev_info(&pf->pdev->dev, + " base_queue = %d, num_queue_pairs = %d, num_desc = %d\n", + vsi->base_queue, vsi->num_queue_pairs, vsi->num_desc); + dev_info(&pf->pdev->dev, " type = %i\n", vsi->type); + if (vsi->type == I40E_VSI_SRIOV) + dev_info(&pf->pdev->dev, " VF ID = %i\n", vsi->vf_id); + dev_info(&pf->pdev->dev, + " info: valid_sections = 0x%04x, switch_id = 0x%04x\n", + vsi->info.valid_sections, vsi->info.switch_id); + dev_info(&pf->pdev->dev, + " info: sw_reserved[] = 0x%02x 0x%02x\n", + vsi->info.sw_reserved[0], vsi->info.sw_reserved[1]); + dev_info(&pf->pdev->dev, + " info: sec_flags = 0x%02x, sec_reserved = 0x%02x\n", + vsi->info.sec_flags, vsi->info.sec_reserved); + dev_info(&pf->pdev->dev, + " info: pvid = 0x%04x, fcoe_pvid = 0x%04x, port_vlan_flags = 0x%02x\n", + vsi->info.pvid, vsi->info.fcoe_pvid, + vsi->info.port_vlan_flags); + dev_info(&pf->pdev->dev, + " info: pvlan_reserved[] = 0x%02x 0x%02x 0x%02x\n", + vsi->info.pvlan_reserved[0], vsi->info.pvlan_reserved[1], + vsi->info.pvlan_reserved[2]); + dev_info(&pf->pdev->dev, + " info: ingress_table = 0x%08x, egress_table = 0x%08x\n", + vsi->info.ingress_table, vsi->info.egress_table); + dev_info(&pf->pdev->dev, + " info: cas_pv_stag = 0x%04x, cas_pv_flags= 0x%02x, cas_pv_reserved = 0x%02x\n", + vsi->info.cas_pv_tag, vsi->info.cas_pv_flags, + vsi->info.cas_pv_reserved); + dev_info(&pf->pdev->dev, + " info: queue_mapping[0..7 ] = 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x\n", + vsi->info.queue_mapping[0], vsi->info.queue_mapping[1], + vsi->info.queue_mapping[2], vsi->info.queue_mapping[3], + vsi->info.queue_mapping[4], vsi->info.queue_mapping[5], + vsi->info.queue_mapping[6], vsi->info.queue_mapping[7]); + dev_info(&pf->pdev->dev, + " info: queue_mapping[8..15] = 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x\n", + vsi->info.queue_mapping[8], vsi->info.queue_mapping[9], + vsi->info.queue_mapping[10], 
vsi->info.queue_mapping[11], + vsi->info.queue_mapping[12], vsi->info.queue_mapping[13], + vsi->info.queue_mapping[14], vsi->info.queue_mapping[15]); + dev_info(&pf->pdev->dev, + " info: tc_mapping[] = 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x\n", + vsi->info.tc_mapping[0], vsi->info.tc_mapping[1], + vsi->info.tc_mapping[2], vsi->info.tc_mapping[3], + vsi->info.tc_mapping[4], vsi->info.tc_mapping[5], + vsi->info.tc_mapping[6], vsi->info.tc_mapping[7]); + dev_info(&pf->pdev->dev, + " info: queueing_opt_flags = 0x%02x queueing_opt_reserved[0..2] = 0x%02x 0x%02x 0x%02x\n", + vsi->info.queueing_opt_flags, + vsi->info.queueing_opt_reserved[0], + vsi->info.queueing_opt_reserved[1], + vsi->info.queueing_opt_reserved[2]); + dev_info(&pf->pdev->dev, + " info: up_enable_bits = 0x%02x\n", + vsi->info.up_enable_bits); + dev_info(&pf->pdev->dev, + " info: sched_reserved = 0x%02x, outer_up_table = 0x%04x\n", + vsi->info.sched_reserved, vsi->info.outer_up_table); + dev_info(&pf->pdev->dev, + " info: cmd_reserved[] = 0x%02x 0x%02x 0x%02x 0x0%02x 0x%02x 0x%02x 0x%02x 0x0%02x\n", + vsi->info.cmd_reserved[0], vsi->info.cmd_reserved[1], + vsi->info.cmd_reserved[2], vsi->info.cmd_reserved[3], + vsi->info.cmd_reserved[4], vsi->info.cmd_reserved[5], + vsi->info.cmd_reserved[6], vsi->info.cmd_reserved[7]); + dev_info(&pf->pdev->dev, + " info: qs_handle[] = 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x\n", + vsi->info.qs_handle[0], vsi->info.qs_handle[1], + vsi->info.qs_handle[2], vsi->info.qs_handle[3], + vsi->info.qs_handle[4], vsi->info.qs_handle[5], + vsi->info.qs_handle[6], vsi->info.qs_handle[7]); + dev_info(&pf->pdev->dev, + " info: stat_counter_idx = 0x%04x, sched_id = 0x%04x\n", + vsi->info.stat_counter_idx, vsi->info.sched_id); + dev_info(&pf->pdev->dev, + " info: resp_reserved[] = 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n", + vsi->info.resp_reserved[0], vsi->info.resp_reserved[1], + vsi->info.resp_reserved[2], vsi->info.resp_reserved[3], + vsi->info.resp_reserved[4], vsi->info.resp_reserved[5], + vsi->info.resp_reserved[6], vsi->info.resp_reserved[7], + vsi->info.resp_reserved[8], vsi->info.resp_reserved[9], + vsi->info.resp_reserved[10], vsi->info.resp_reserved[11]); + dev_info(&pf->pdev->dev, " idx = %d\n", vsi->idx); + dev_info(&pf->pdev->dev, + " tc_config: numtc = %d, enabled_tc = 0x%x\n", + vsi->tc_config.numtc, vsi->tc_config.enabled_tc); + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + dev_info(&pf->pdev->dev, + " tc_config: tc = %d, qoffset = %d, qcount = %d, netdev_tc = %d\n", + i, vsi->tc_config.tc_info[i].qoffset, + vsi->tc_config.tc_info[i].qcount, + vsi->tc_config.tc_info[i].netdev_tc); + } + dev_info(&pf->pdev->dev, + " bw: bw_limit = %d, bw_max_quanta = %d\n", + vsi->bw_limit, vsi->bw_max_quanta); + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + dev_info(&pf->pdev->dev, + " bw[%d]: ets_share_credits = %d, ets_limit_credits = %d, max_quanta = %d\n", + i, vsi->bw_ets_share_credits[i], + vsi->bw_ets_limit_credits[i], + vsi->bw_ets_max_quanta[i]); + } +} + +/** + * i40e_dbg_dump_aq_desc - handles dump aq_desc write into command datum + * @pf: the i40e_pf created in command write + **/ +static void i40e_dbg_dump_aq_desc(struct i40e_pf *pf) +{ + struct i40e_adminq_ring *ring; + struct i40e_hw *hw = &pf->hw; + char hdr[32]; + int i; + + snprintf(hdr, sizeof(hdr), "%s %s: ", + dev_driver_string(&pf->pdev->dev), + dev_name(&pf->pdev->dev)); + + /* first the send (command) ring, then the receive (event) ring */ + 
dev_info(&pf->pdev->dev, "AdminQ Tx Ring\n"); + ring = &(hw->aq.asq); + for (i = 0; i < ring->count; i++) { + struct i40e_aq_desc *d = I40E_ADMINQ_DESC(*ring, i); + + dev_info(&pf->pdev->dev, + " at[%02d] flags=0x%04x op=0x%04x dlen=0x%04x ret=0x%04x cookie_h=0x%08x cookie_l=0x%08x\n", + i, d->flags, d->opcode, d->datalen, d->retval, + d->cookie_high, d->cookie_low); + print_hex_dump(KERN_INFO, hdr, DUMP_PREFIX_NONE, + 16, 1, d->params.raw, 16, 0); + } + + dev_info(&pf->pdev->dev, "AdminQ Rx Ring\n"); + ring = &(hw->aq.arq); + for (i = 0; i < ring->count; i++) { + struct i40e_aq_desc *d = I40E_ADMINQ_DESC(*ring, i); + + dev_info(&pf->pdev->dev, + " ar[%02d] flags=0x%04x op=0x%04x dlen=0x%04x ret=0x%04x cookie_h=0x%08x cookie_l=0x%08x\n", + i, d->flags, d->opcode, d->datalen, d->retval, + d->cookie_high, d->cookie_low); + print_hex_dump(KERN_INFO, hdr, DUMP_PREFIX_NONE, + 16, 1, d->params.raw, 16, 0); + } +} + +/** + * i40e_dbg_dump_desc - handles dump desc write into command datum + * @cnt: number of arguments that the user supplied + * @vsi_seid: vsi id entered by user + * @ring_id: ring id entered by user + * @desc_n: descriptor number entered by user + * @pf: the i40e_pf created in command write + * @is_rx_ring: true if rx, false if tx + **/ +static void i40e_dbg_dump_desc(int cnt, int vsi_seid, int ring_id, int desc_n, + struct i40e_pf *pf, bool is_rx_ring) +{ + struct i40e_tx_desc *txd; + union i40e_rx_desc *rxd; + struct i40e_ring *ring; + struct i40e_vsi *vsi; + int i; + + vsi = i40e_dbg_find_vsi(pf, vsi_seid); + if (!vsi) { + dev_info(&pf->pdev->dev, "vsi %d not found\n", vsi_seid); + return; + } + if (ring_id >= vsi->num_queue_pairs || ring_id < 0) { + dev_info(&pf->pdev->dev, "ring %d not found\n", ring_id); + return; + } + if (!vsi->tx_rings || !vsi->tx_rings[0]->desc) { + dev_info(&pf->pdev->dev, + "descriptor rings have not been allocated for vsi %d\n", + vsi_seid); + return; + } + + ring = kmemdup(is_rx_ring + ? vsi->rx_rings[ring_id] : vsi->tx_rings[ring_id], + sizeof(*ring), GFP_KERNEL); + if (!ring) + return; + + if (cnt == 2) { + dev_info(&pf->pdev->dev, "vsi = %02i %s ring = %02i\n", + vsi_seid, is_rx_ring ? 
"rx" : "tx", ring_id); + for (i = 0; i < ring->count; i++) { + if (!is_rx_ring) { + txd = I40E_TX_DESC(ring, i); + dev_info(&pf->pdev->dev, + " d[%03x] = 0x%016llx 0x%016llx\n", + i, txd->buffer_addr, + txd->cmd_type_offset_bsz); + } else { + rxd = I40E_RX_DESC(ring, i); + dev_info(&pf->pdev->dev, + " d[%03x] = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n", + i, rxd->read.pkt_addr, + rxd->read.hdr_addr, + rxd->read.rsvd1, rxd->read.rsvd2); + } + } + } else if (cnt == 3) { + if (desc_n >= ring->count || desc_n < 0) { + dev_info(&pf->pdev->dev, + "descriptor %d not found\n", desc_n); + goto out; + } + if (!is_rx_ring) { + txd = I40E_TX_DESC(ring, desc_n); + dev_info(&pf->pdev->dev, + "vsi = %02i tx ring = %02i d[%03x] = 0x%016llx 0x%016llx\n", + vsi_seid, ring_id, desc_n, + txd->buffer_addr, txd->cmd_type_offset_bsz); + } else { + rxd = I40E_RX_DESC(ring, desc_n); + dev_info(&pf->pdev->dev, + "vsi = %02i rx ring = %02i d[%03x] = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n", + vsi_seid, ring_id, desc_n, + rxd->read.pkt_addr, rxd->read.hdr_addr, + rxd->read.rsvd1, rxd->read.rsvd2); + } + } else { + dev_info(&pf->pdev->dev, "dump desc rx/tx []\n"); + } + +out: + kfree(ring); +} + +/** + * i40e_dbg_dump_vsi_no_seid - handles dump vsi write into command datum + * @pf: the i40e_pf created in command write + **/ +static void i40e_dbg_dump_vsi_no_seid(struct i40e_pf *pf) +{ + int i; + + for (i = 0; i < pf->num_alloc_vsi; i++) + if (pf->vsi[i]) + dev_info(&pf->pdev->dev, "dump vsi[%d]: %d\n", + i, pf->vsi[i]->seid); +} + +/** + * i40e_dbg_dump_stats - handles dump stats write into command datum + * @pf: the i40e_pf created in command write + * @estats: the eth stats structure to be dumped + **/ +static void i40e_dbg_dump_eth_stats(struct i40e_pf *pf, + struct i40e_eth_stats *estats) +{ + dev_info(&pf->pdev->dev, " ethstats:\n"); + dev_info(&pf->pdev->dev, + " rx_bytes = \t%lld \trx_unicast = \t\t%lld \trx_multicast = \t%lld\n", + estats->rx_bytes, estats->rx_unicast, estats->rx_multicast); + dev_info(&pf->pdev->dev, + " rx_broadcast = \t%lld \trx_discards = \t\t%lld\n", + estats->rx_broadcast, estats->rx_discards); + dev_info(&pf->pdev->dev, + " rx_unknown_protocol = \t%lld \ttx_bytes = \t%lld\n", + estats->rx_unknown_protocol, estats->tx_bytes); + dev_info(&pf->pdev->dev, + " tx_unicast = \t%lld \ttx_multicast = \t\t%lld \ttx_broadcast = \t%lld\n", + estats->tx_unicast, estats->tx_multicast, estats->tx_broadcast); + dev_info(&pf->pdev->dev, + " tx_discards = \t%lld \ttx_errors = \t\t%lld\n", + estats->tx_discards, estats->tx_errors); +} + +/** + * i40e_dbg_dump_veb_seid - handles dump stats of a single given veb + * @pf: the i40e_pf created in command write + * @seid: the seid the user put in + **/ +static void i40e_dbg_dump_veb_seid(struct i40e_pf *pf, int seid) +{ + struct i40e_veb *veb; + + veb = i40e_dbg_find_veb(pf, seid); + if (!veb) { + dev_info(&pf->pdev->dev, "can't find veb %d\n", seid); + return; + } + dev_info(&pf->pdev->dev, + "veb idx=%d,%d stats_ic=%d seid=%d uplink=%d mode=%s\n", + veb->idx, veb->veb_idx, veb->stats_idx, veb->seid, + veb->uplink_seid, + veb->bridge_mode == BRIDGE_MODE_VEPA ? 
"VEPA" : "VEB"); + i40e_dbg_dump_eth_stats(pf, &veb->stats); +} + +/** + * i40e_dbg_dump_veb_all - dumps all known veb's stats + * @pf: the i40e_pf created in command write + **/ +static void i40e_dbg_dump_veb_all(struct i40e_pf *pf) +{ + struct i40e_veb *veb; + int i; + + for (i = 0; i < I40E_MAX_VEB; i++) { + veb = pf->veb[i]; + if (veb) + i40e_dbg_dump_veb_seid(pf, veb->seid); + } +} + +/** + * i40e_dbg_dump_vf - dump VF info + * @pf: the i40e_pf created in command write + * @vf_id: the vf_id from the user + **/ +static void i40e_dbg_dump_vf(struct i40e_pf *pf, int vf_id) +{ + struct i40e_vf *vf; + struct i40e_vsi *vsi; + + if (!pf->num_alloc_vfs) { + dev_info(&pf->pdev->dev, "no VFs allocated\n"); + } else if ((vf_id >= 0) && (vf_id < pf->num_alloc_vfs)) { + vf = &pf->vf[vf_id]; + vsi = pf->vsi[vf->lan_vsi_idx]; + dev_info(&pf->pdev->dev, "vf %2d: VSI id=%d, seid=%d, qps=%d\n", + vf_id, vf->lan_vsi_id, vsi->seid, vf->num_queue_pairs); + dev_info(&pf->pdev->dev, " num MDD=%lld, invalid msg=%lld, valid msg=%lld\n", + vf->num_mdd_events, + vf->num_invalid_msgs, + vf->num_valid_msgs); + } else { + dev_info(&pf->pdev->dev, "invalid VF id %d\n", vf_id); + } +} + +/** + * i40e_dbg_dump_vf_all - dump VF info for all VFs + * @pf: the i40e_pf created in command write + **/ +static void i40e_dbg_dump_vf_all(struct i40e_pf *pf) +{ + int i; + + if (!pf->num_alloc_vfs) + dev_info(&pf->pdev->dev, "no VFs enabled!\n"); + else + for (i = 0; i < pf->num_alloc_vfs; i++) + i40e_dbg_dump_vf(pf, i); +} + +#define I40E_MAX_DEBUG_OUT_BUFFER (4096*4) +/** + * i40e_dbg_command_write - write into command datum + * @filp: the opened file + * @buffer: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + **/ +static ssize_t i40e_dbg_command_write(struct file *filp, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct i40e_pf *pf = filp->private_data; + char *cmd_buf, *cmd_buf_tmp; + int bytes_not_copied; + struct i40e_vsi *vsi; + int vsi_seid; + int veb_seid; + int vf_id; + int cnt; + + /* don't allow partial writes */ + if (*ppos != 0) + return 0; + + cmd_buf = kzalloc(count + 1, GFP_KERNEL); + if (!cmd_buf) + return count; + bytes_not_copied = copy_from_user(cmd_buf, buffer, count); + if (bytes_not_copied) { + kfree(cmd_buf); + return -EFAULT; + } + cmd_buf[count] = '\0'; + + cmd_buf_tmp = strchr(cmd_buf, '\n'); + if (cmd_buf_tmp) { + *cmd_buf_tmp = '\0'; + count = cmd_buf_tmp - cmd_buf + 1; + } + + if (strncmp(cmd_buf, "add vsi", 7) == 0) { + vsi_seid = -1; + cnt = sscanf(&cmd_buf[7], "%i", &vsi_seid); + if (cnt == 0) { + /* default to PF VSI */ + vsi_seid = pf->vsi[pf->lan_vsi]->seid; + } else if (vsi_seid < 0) { + dev_info(&pf->pdev->dev, "add VSI %d: bad vsi seid\n", + vsi_seid); + goto command_write_done; + } + + /* By default we are in VEPA mode, if this is the first VF/VMDq + * VSI to be added switch to VEB mode. 
+ */ + if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) { + pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; + i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG); + } + + vsi = i40e_vsi_setup(pf, I40E_VSI_VMDQ2, vsi_seid, 0); + if (vsi) + dev_info(&pf->pdev->dev, "added VSI %d to relay %d\n", + vsi->seid, vsi->uplink_seid); + else + dev_info(&pf->pdev->dev, "'%s' failed\n", cmd_buf); + + } else if (strncmp(cmd_buf, "del vsi", 7) == 0) { + cnt = sscanf(&cmd_buf[7], "%i", &vsi_seid); + if (cnt != 1) { + dev_info(&pf->pdev->dev, + "del vsi: bad command string, cnt=%d\n", + cnt); + goto command_write_done; + } + vsi = i40e_dbg_find_vsi(pf, vsi_seid); + if (!vsi) { + dev_info(&pf->pdev->dev, "del VSI %d: seid not found\n", + vsi_seid); + goto command_write_done; + } + + dev_info(&pf->pdev->dev, "deleting VSI %d\n", vsi_seid); + i40e_vsi_release(vsi); + + } else if (strncmp(cmd_buf, "add relay", 9) == 0) { + struct i40e_veb *veb; + int uplink_seid, i; + + cnt = sscanf(&cmd_buf[9], "%i %i", &uplink_seid, &vsi_seid); + if (cnt != 2) { + dev_info(&pf->pdev->dev, + "add relay: bad command string, cnt=%d\n", + cnt); + goto command_write_done; + } else if (uplink_seid < 0) { + dev_info(&pf->pdev->dev, + "add relay %d: bad uplink seid\n", + uplink_seid); + goto command_write_done; + } + + vsi = i40e_dbg_find_vsi(pf, vsi_seid); + if (!vsi) { + dev_info(&pf->pdev->dev, + "add relay: VSI %d not found\n", vsi_seid); + goto command_write_done; + } + + for (i = 0; i < I40E_MAX_VEB; i++) + if (pf->veb[i] && pf->veb[i]->seid == uplink_seid) + break; + if (i >= I40E_MAX_VEB && uplink_seid != 0 && + uplink_seid != pf->mac_seid) { + dev_info(&pf->pdev->dev, + "add relay: relay uplink %d not found\n", + uplink_seid); + goto command_write_done; + } + + veb = i40e_veb_setup(pf, 0, uplink_seid, vsi_seid, + vsi->tc_config.enabled_tc); + if (veb) + dev_info(&pf->pdev->dev, "added relay %d\n", veb->seid); + else + dev_info(&pf->pdev->dev, "add relay failed\n"); + + } else if (strncmp(cmd_buf, "del relay", 9) == 0) { + int i; + cnt = sscanf(&cmd_buf[9], "%i", &veb_seid); + if (cnt != 1) { + dev_info(&pf->pdev->dev, + "del relay: bad command string, cnt=%d\n", + cnt); + goto command_write_done; + } else if (veb_seid < 0) { + dev_info(&pf->pdev->dev, + "del relay %d: bad relay seid\n", veb_seid); + goto command_write_done; + } + + /* find the veb */ + for (i = 0; i < I40E_MAX_VEB; i++) + if (pf->veb[i] && pf->veb[i]->seid == veb_seid) + break; + if (i >= I40E_MAX_VEB) { + dev_info(&pf->pdev->dev, + "del relay: relay %d not found\n", veb_seid); + goto command_write_done; + } + + dev_info(&pf->pdev->dev, "deleting relay %d\n", veb_seid); + i40e_veb_release(pf->veb[i]); + } else if (strncmp(cmd_buf, "add pvid", 8) == 0) { + i40e_status ret; + u16 vid; + unsigned int v; + + cnt = sscanf(&cmd_buf[8], "%i %u", &vsi_seid, &v); + if (cnt != 2) { + dev_info(&pf->pdev->dev, + "add pvid: bad command string, cnt=%d\n", cnt); + goto command_write_done; + } + + vsi = i40e_dbg_find_vsi(pf, vsi_seid); + if (!vsi) { + dev_info(&pf->pdev->dev, "add pvid: VSI %d not found\n", + vsi_seid); + goto command_write_done; + } + + vid = v; + ret = i40e_vsi_add_pvid(vsi, vid); + if (!ret) + dev_info(&pf->pdev->dev, + "add pvid: %d added to VSI %d\n", + vid, vsi_seid); + else + dev_info(&pf->pdev->dev, + "add pvid: %d to VSI %d failed, ret=%d\n", + vid, vsi_seid, ret); + + } else if (strncmp(cmd_buf, "del pvid", 8) == 0) { + + cnt = sscanf(&cmd_buf[8], "%i", &vsi_seid); + if (cnt != 1) { + dev_info(&pf->pdev->dev, + "del pvid: bad command string, cnt=%d\n", + cnt); + 
goto command_write_done; + } + + vsi = i40e_dbg_find_vsi(pf, vsi_seid); + if (!vsi) { + dev_info(&pf->pdev->dev, + "del pvid: VSI %d not found\n", vsi_seid); + goto command_write_done; + } + + i40e_vsi_remove_pvid(vsi); + dev_info(&pf->pdev->dev, + "del pvid: removed from VSI %d\n", vsi_seid); + + } else if (strncmp(cmd_buf, "dump", 4) == 0) { + if (strncmp(&cmd_buf[5], "switch", 6) == 0) { + i40e_fetch_switch_configuration(pf, true); + } else if (strncmp(&cmd_buf[5], "vsi", 3) == 0) { + cnt = sscanf(&cmd_buf[8], "%i", &vsi_seid); + if (cnt > 0) + i40e_dbg_dump_vsi_seid(pf, vsi_seid); + else + i40e_dbg_dump_vsi_no_seid(pf); + } else if (strncmp(&cmd_buf[5], "veb", 3) == 0) { + cnt = sscanf(&cmd_buf[8], "%i", &vsi_seid); + if (cnt > 0) + i40e_dbg_dump_veb_seid(pf, vsi_seid); + else + i40e_dbg_dump_veb_all(pf); + } else if (strncmp(&cmd_buf[5], "vf", 2) == 0) { + cnt = sscanf(&cmd_buf[7], "%i", &vf_id); + if (cnt > 0) + i40e_dbg_dump_vf(pf, vf_id); + else + i40e_dbg_dump_vf_all(pf); + } else if (strncmp(&cmd_buf[5], "desc", 4) == 0) { + int ring_id, desc_n; + if (strncmp(&cmd_buf[10], "rx", 2) == 0) { + cnt = sscanf(&cmd_buf[12], "%i %i %i", + &vsi_seid, &ring_id, &desc_n); + i40e_dbg_dump_desc(cnt, vsi_seid, ring_id, + desc_n, pf, true); + } else if (strncmp(&cmd_buf[10], "tx", 2) + == 0) { + cnt = sscanf(&cmd_buf[12], "%i %i %i", + &vsi_seid, &ring_id, &desc_n); + i40e_dbg_dump_desc(cnt, vsi_seid, ring_id, + desc_n, pf, false); + } else if (strncmp(&cmd_buf[10], "aq", 2) == 0) { + i40e_dbg_dump_aq_desc(pf); + } else { + dev_info(&pf->pdev->dev, + "dump desc tx []\n"); + dev_info(&pf->pdev->dev, + "dump desc rx []\n"); + dev_info(&pf->pdev->dev, "dump desc aq\n"); + } + } else if (strncmp(&cmd_buf[5], "reset stats", 11) == 0) { + dev_info(&pf->pdev->dev, + "core reset count: %d\n", pf->corer_count); + dev_info(&pf->pdev->dev, + "global reset count: %d\n", pf->globr_count); + dev_info(&pf->pdev->dev, + "emp reset count: %d\n", pf->empr_count); + dev_info(&pf->pdev->dev, + "pf reset count: %d\n", pf->pfr_count); + dev_info(&pf->pdev->dev, + "pf tx sluggish count: %d\n", + pf->tx_sluggish_count); + } else if (strncmp(&cmd_buf[5], "port", 4) == 0) { + struct i40e_aqc_query_port_ets_config_resp *bw_data; + struct i40e_dcbx_config *cfg = + &pf->hw.local_dcbx_config; + struct i40e_dcbx_config *r_cfg = + &pf->hw.remote_dcbx_config; + int i, ret; + u16 switch_id; + + bw_data = kzalloc(sizeof( + struct i40e_aqc_query_port_ets_config_resp), + GFP_KERNEL); + if (!bw_data) { + ret = -ENOMEM; + goto command_write_done; + } + + vsi = pf->vsi[pf->lan_vsi]; + switch_id = + le16_to_cpu(vsi->info.switch_id) & + I40E_AQ_VSI_SW_ID_MASK; + + ret = i40e_aq_query_port_ets_config(&pf->hw, + switch_id, + bw_data, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "Query Port ETS Config AQ command failed =0x%x\n", + pf->hw.aq.asq_last_status); + kfree(bw_data); + bw_data = NULL; + goto command_write_done; + } + dev_info(&pf->pdev->dev, + "port bw: tc_valid=0x%x tc_strict_prio=0x%x, tc_bw_max=0x%04x,0x%04x\n", + bw_data->tc_valid_bits, + bw_data->tc_strict_priority_bits, + le16_to_cpu(bw_data->tc_bw_max[0]), + le16_to_cpu(bw_data->tc_bw_max[1])); + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + dev_info(&pf->pdev->dev, "port bw: tc_bw_share=%d tc_bw_limit=%d\n", + bw_data->tc_bw_share_credits[i], + le16_to_cpu(bw_data->tc_bw_limits[i])); + } + + kfree(bw_data); + bw_data = NULL; + + dev_info(&pf->pdev->dev, + "port dcbx_mode=%d\n", cfg->dcbx_mode); + dev_info(&pf->pdev->dev, + "port ets_cfg: willing=%d cbs=%d, 
maxtcs=%d\n", + cfg->etscfg.willing, cfg->etscfg.cbs, + cfg->etscfg.maxtcs); + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + dev_info(&pf->pdev->dev, "port ets_cfg: %d prio_tc=%d tcbw=%d tctsa=%d\n", + i, cfg->etscfg.prioritytable[i], + cfg->etscfg.tcbwtable[i], + cfg->etscfg.tsatable[i]); + } + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + dev_info(&pf->pdev->dev, "port ets_rec: %d prio_tc=%d tcbw=%d tctsa=%d\n", + i, cfg->etsrec.prioritytable[i], + cfg->etsrec.tcbwtable[i], + cfg->etsrec.tsatable[i]); + } + dev_info(&pf->pdev->dev, + "port pfc_cfg: willing=%d mbc=%d, pfccap=%d pfcenable=0x%x\n", + cfg->pfc.willing, cfg->pfc.mbc, + cfg->pfc.pfccap, cfg->pfc.pfcenable); + dev_info(&pf->pdev->dev, + "port app_table: num_apps=%d\n", cfg->numapps); + for (i = 0; i < cfg->numapps; i++) { + dev_info(&pf->pdev->dev, "port app_table: %d prio=%d selector=%d protocol=0x%x\n", + i, cfg->app[i].priority, + cfg->app[i].selector, + cfg->app[i].protocolid); + } + /* Peer TLV DCBX data */ + dev_info(&pf->pdev->dev, + "remote port ets_cfg: willing=%d cbs=%d, maxtcs=%d\n", + r_cfg->etscfg.willing, + r_cfg->etscfg.cbs, r_cfg->etscfg.maxtcs); + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + dev_info(&pf->pdev->dev, "remote port ets_cfg: %d prio_tc=%d tcbw=%d tctsa=%d\n", + i, r_cfg->etscfg.prioritytable[i], + r_cfg->etscfg.tcbwtable[i], + r_cfg->etscfg.tsatable[i]); + } + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + dev_info(&pf->pdev->dev, "remote port ets_rec: %d prio_tc=%d tcbw=%d tctsa=%d\n", + i, r_cfg->etsrec.prioritytable[i], + r_cfg->etsrec.tcbwtable[i], + r_cfg->etsrec.tsatable[i]); + } + dev_info(&pf->pdev->dev, + "remote port pfc_cfg: willing=%d mbc=%d, pfccap=%d pfcenable=0x%x\n", + r_cfg->pfc.willing, + r_cfg->pfc.mbc, + r_cfg->pfc.pfccap, + r_cfg->pfc.pfcenable); + dev_info(&pf->pdev->dev, + "remote port app_table: num_apps=%d\n", + r_cfg->numapps); + for (i = 0; i < r_cfg->numapps; i++) { + dev_info(&pf->pdev->dev, "remote port app_table: %d prio=%d selector=%d protocol=0x%x\n", + i, r_cfg->app[i].priority, + r_cfg->app[i].selector, + r_cfg->app[i].protocolid); + } + } else if (strncmp(&cmd_buf[5], "debug fwdata", 12) == 0) { + int cluster_id, table_id; + int index, ret; + u16 buff_len = 4096; + u32 next_index; + u8 next_table; + u8 *buff; + u16 rlen; + + cnt = sscanf(&cmd_buf[18], "%i %i %i", + &cluster_id, &table_id, &index); + if (cnt != 3) { + dev_info(&pf->pdev->dev, + "dump debug fwdata \n"); + goto command_write_done; + } + + dev_info(&pf->pdev->dev, + "AQ debug dump fwdata params %x %x %x %x\n", + cluster_id, table_id, index, buff_len); + buff = kzalloc(buff_len, GFP_KERNEL); + if (!buff) + goto command_write_done; + + ret = i40e_aq_debug_dump(&pf->hw, cluster_id, table_id, + index, buff_len, buff, &rlen, + &next_table, &next_index, + NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "debug dump fwdata AQ Failed %d 0x%x\n", + ret, pf->hw.aq.asq_last_status); + kfree(buff); + buff = NULL; + goto command_write_done; + } + dev_info(&pf->pdev->dev, + "AQ debug dump fwdata rlen=0x%x next_table=0x%x next_index=0x%x\n", + rlen, next_table, next_index); + print_hex_dump(KERN_INFO, "AQ buffer WB: ", + DUMP_PREFIX_OFFSET, 16, 1, + buff, rlen, true); + kfree(buff); + buff = NULL; + } else { + dev_info(&pf->pdev->dev, + "dump desc tx [], dump desc rx [],\n"); + dev_info(&pf->pdev->dev, "dump switch\n"); + dev_info(&pf->pdev->dev, "dump vsi [seid]\n"); + dev_info(&pf->pdev->dev, "dump reset stats\n"); + dev_info(&pf->pdev->dev, "dump port\n"); + dev_info(&pf->pdev->dev, "dump vf 
[vf_id]\n"); + dev_info(&pf->pdev->dev, + "dump debug fwdata \n"); + } + } else if (strncmp(cmd_buf, "pfr", 3) == 0) { + dev_info(&pf->pdev->dev, "debugfs: forcing PFR\n"); + i40e_do_reset_safe(pf, BIT(__I40E_PF_RESET_REQUESTED)); + + } else if (strncmp(cmd_buf, "corer", 5) == 0) { + dev_info(&pf->pdev->dev, "debugfs: forcing CoreR\n"); + i40e_do_reset_safe(pf, BIT(__I40E_CORE_RESET_REQUESTED)); + + } else if (strncmp(cmd_buf, "globr", 5) == 0) { + dev_info(&pf->pdev->dev, "debugfs: forcing GlobR\n"); + i40e_do_reset_safe(pf, BIT(__I40E_GLOBAL_RESET_REQUESTED)); + + } else if (strncmp(cmd_buf, "empr", 4) == 0) { + dev_info(&pf->pdev->dev, "debugfs: forcing EMPR\n"); + i40e_do_reset_safe(pf, BIT(__I40E_EMP_RESET_REQUESTED)); + + } else if (strncmp(cmd_buf, "read", 4) == 0) { + u32 address; + u32 value; + + cnt = sscanf(&cmd_buf[4], "%i", &address); + if (cnt != 1) { + dev_info(&pf->pdev->dev, "read \n"); + goto command_write_done; + } + + /* check the range on address */ + if (address > (pf->ioremap_len - sizeof(u32))) { + dev_info(&pf->pdev->dev, "read reg address 0x%08x too large, max=0x%08lx\n", + address, (unsigned long int)(pf->ioremap_len - sizeof(u32))); + goto command_write_done; + } + + value = rd32(&pf->hw, address); + dev_info(&pf->pdev->dev, "read: 0x%08x = 0x%08x\n", + address, value); + + } else if (strncmp(cmd_buf, "write", 5) == 0) { + u32 address, value; + + cnt = sscanf(&cmd_buf[5], "%i %i", &address, &value); + if (cnt != 2) { + dev_info(&pf->pdev->dev, "write \n"); + goto command_write_done; + } + + /* check the range on address */ + if (address > (pf->ioremap_len - sizeof(u32))) { + dev_info(&pf->pdev->dev, "write reg address 0x%08x too large, max=0x%08lx\n", + address, (unsigned long int)(pf->ioremap_len - sizeof(u32))); + goto command_write_done; + } + wr32(&pf->hw, address, value); + value = rd32(&pf->hw, address); + dev_info(&pf->pdev->dev, "write: 0x%08x = 0x%08x\n", + address, value); + } else if (strncmp(cmd_buf, "clear_stats", 11) == 0) { + if (strncmp(&cmd_buf[12], "vsi", 3) == 0) { + cnt = sscanf(&cmd_buf[15], "%i", &vsi_seid); + if (cnt == 0) { + int i; + + for (i = 0; i < pf->num_alloc_vsi; i++) + i40e_vsi_reset_stats(pf->vsi[i]); + dev_info(&pf->pdev->dev, "vsi clear stats called for all vsi's\n"); + } else if (cnt == 1) { + vsi = i40e_dbg_find_vsi(pf, vsi_seid); + if (!vsi) { + dev_info(&pf->pdev->dev, + "clear_stats vsi: bad vsi %d\n", + vsi_seid); + goto command_write_done; + } + i40e_vsi_reset_stats(vsi); + dev_info(&pf->pdev->dev, + "vsi clear stats called for vsi %d\n", + vsi_seid); + } else { + dev_info(&pf->pdev->dev, "clear_stats vsi [seid]\n"); + } + } else if (strncmp(&cmd_buf[12], "port", 4) == 0) { + if (pf->hw.partition_id == 1) { + i40e_pf_reset_stats(pf); + dev_info(&pf->pdev->dev, "port stats cleared\n"); + } else { + dev_info(&pf->pdev->dev, "clear port stats not allowed on this port partition\n"); + } + } else { + dev_info(&pf->pdev->dev, "clear_stats vsi [seid] or clear_stats port\n"); + } + } else if (strncmp(cmd_buf, "send aq_cmd", 11) == 0) { + struct i40e_aq_desc *desc; + i40e_status ret; + + desc = kzalloc(sizeof(struct i40e_aq_desc), GFP_KERNEL); + if (!desc) + goto command_write_done; + cnt = sscanf(&cmd_buf[11], + "%hi %hi %hi %hi %i %i %i %i %i %i", + &desc->flags, + &desc->opcode, &desc->datalen, &desc->retval, + &desc->cookie_high, &desc->cookie_low, + &desc->params.internal.param0, + &desc->params.internal.param1, + &desc->params.internal.param2, + &desc->params.internal.param3); + if (cnt != 10) { + dev_info(&pf->pdev->dev, 
+ "send aq_cmd: bad command string, cnt=%d\n", + cnt); + kfree(desc); + desc = NULL; + goto command_write_done; + } + ret = i40e_asq_send_command(&pf->hw, desc, NULL, 0, NULL); + if (!ret) { + dev_info(&pf->pdev->dev, "AQ command sent Status : Success\n"); + } else if (ret == I40E_ERR_ADMIN_QUEUE_ERROR) { + dev_info(&pf->pdev->dev, + "AQ command send failed Opcode %x AQ Error: %d\n", + desc->opcode, pf->hw.aq.asq_last_status); + } else { + dev_info(&pf->pdev->dev, + "AQ command send failed Opcode %x Status: %d\n", + desc->opcode, ret); + } + dev_info(&pf->pdev->dev, + "AQ desc WB 0x%04x 0x%04x 0x%04x 0x%04x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n", + desc->flags, desc->opcode, desc->datalen, desc->retval, + desc->cookie_high, desc->cookie_low, + desc->params.internal.param0, + desc->params.internal.param1, + desc->params.internal.param2, + desc->params.internal.param3); + kfree(desc); + desc = NULL; + } else if (strncmp(cmd_buf, "send indirect aq_cmd", 20) == 0) { + struct i40e_aq_desc *desc; + i40e_status ret; + u16 buffer_len; + u8 *buff; + + desc = kzalloc(sizeof(struct i40e_aq_desc), GFP_KERNEL); + if (!desc) + goto command_write_done; + cnt = sscanf(&cmd_buf[20], + "%hi %hi %hi %hi %i %i %i %i %i %i %hi", + &desc->flags, + &desc->opcode, &desc->datalen, &desc->retval, + &desc->cookie_high, &desc->cookie_low, + &desc->params.internal.param0, + &desc->params.internal.param1, + &desc->params.internal.param2, + &desc->params.internal.param3, + &buffer_len); + if (cnt != 11) { + dev_info(&pf->pdev->dev, + "send indirect aq_cmd: bad command string, cnt=%d\n", + cnt); + kfree(desc); + desc = NULL; + goto command_write_done; + } + /* Just stub a buffer big enough in case user messed up */ + if (buffer_len == 0) + buffer_len = 1280; + + buff = kzalloc(buffer_len, GFP_KERNEL); + if (!buff) { + kfree(desc); + desc = NULL; + goto command_write_done; + } + desc->flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + ret = i40e_asq_send_command(&pf->hw, desc, buff, + buffer_len, NULL); + if (!ret) { + dev_info(&pf->pdev->dev, "AQ command sent Status : Success\n"); + } else if (ret == I40E_ERR_ADMIN_QUEUE_ERROR) { + dev_info(&pf->pdev->dev, + "AQ command send failed Opcode %x AQ Error: %d\n", + desc->opcode, pf->hw.aq.asq_last_status); + } else { + dev_info(&pf->pdev->dev, + "AQ command send failed Opcode %x Status: %d\n", + desc->opcode, ret); + } + dev_info(&pf->pdev->dev, + "AQ desc WB 0x%04x 0x%04x 0x%04x 0x%04x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n", + desc->flags, desc->opcode, desc->datalen, desc->retval, + desc->cookie_high, desc->cookie_low, + desc->params.internal.param0, + desc->params.internal.param1, + desc->params.internal.param2, + desc->params.internal.param3); + print_hex_dump(KERN_INFO, "AQ buffer WB: ", + DUMP_PREFIX_OFFSET, 16, 1, + buff, buffer_len, true); + kfree(buff); + buff = NULL; + kfree(desc); + desc = NULL; + } else if (strncmp(cmd_buf, "fd current cnt", 14) == 0) { + dev_info(&pf->pdev->dev, "FD current total filter count for this interface: %d\n", + i40e_get_current_fd_count(pf)); + } else if (strncmp(cmd_buf, "lldp", 4) == 0) { + if (strncmp(&cmd_buf[5], "stop", 4) == 0) { + int ret; + + ret = i40e_aq_stop_lldp(&pf->hw, false, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "Stop LLDP AQ command failed =0x%x\n", + pf->hw.aq.asq_last_status); + goto command_write_done; + } + ret = i40e_aq_add_rem_control_packet_filter(&pf->hw, + pf->hw.mac.addr, + I40E_ETH_P_LLDP, 0, + pf->vsi[pf->lan_vsi]->seid, + 0, true, NULL, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "%s: 
Add Control Packet Filter AQ command failed =0x%x\n", + __func__, pf->hw.aq.asq_last_status); + goto command_write_done; + } +#ifdef CONFIG_I40E_DCB + pf->dcbx_cap = DCB_CAP_DCBX_HOST | + DCB_CAP_DCBX_VER_IEEE; +#endif /* CONFIG_I40E_DCB */ + } else if (strncmp(&cmd_buf[5], "start", 5) == 0) { + int ret; + + ret = i40e_aq_add_rem_control_packet_filter(&pf->hw, + pf->hw.mac.addr, + I40E_ETH_P_LLDP, 0, + pf->vsi[pf->lan_vsi]->seid, + 0, false, NULL, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "%s: Remove Control Packet Filter AQ command failed =0x%x\n", + __func__, pf->hw.aq.asq_last_status); + /* Continue and start FW LLDP anyways */ + } + + ret = i40e_aq_start_lldp(&pf->hw, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "Start LLDP AQ command failed =0x%x\n", + pf->hw.aq.asq_last_status); + goto command_write_done; + } +#ifdef CONFIG_I40E_DCB + pf->dcbx_cap = DCB_CAP_DCBX_LLD_MANAGED | + DCB_CAP_DCBX_VER_IEEE; +#endif /* CONFIG_I40E_DCB */ + } else if (strncmp(&cmd_buf[5], + "get local", 9) == 0) { + u16 llen, rlen; + int ret; + u8 *buff; + + buff = kzalloc(I40E_LLDPDU_SIZE, GFP_KERNEL); + if (!buff) + goto command_write_done; + + ret = i40e_aq_get_lldp_mib(&pf->hw, 0, + I40E_AQ_LLDP_MIB_LOCAL, + buff, I40E_LLDPDU_SIZE, + &llen, &rlen, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "Get LLDP MIB (local) AQ command failed =0x%x\n", + pf->hw.aq.asq_last_status); + kfree(buff); + buff = NULL; + goto command_write_done; + } + dev_info(&pf->pdev->dev, "LLDP MIB (local)\n"); + print_hex_dump(KERN_INFO, "LLDP MIB (local): ", + DUMP_PREFIX_OFFSET, 16, 1, + buff, I40E_LLDPDU_SIZE, true); + kfree(buff); + buff = NULL; + } else if (strncmp(&cmd_buf[5], "get remote", 10) == 0) { + u16 llen, rlen; + int ret; + u8 *buff; + + buff = kzalloc(I40E_LLDPDU_SIZE, GFP_KERNEL); + if (!buff) + goto command_write_done; + + ret = i40e_aq_get_lldp_mib(&pf->hw, + I40E_AQ_LLDP_BRIDGE_TYPE_NEAREST_BRIDGE, + I40E_AQ_LLDP_MIB_REMOTE, + buff, I40E_LLDPDU_SIZE, + &llen, &rlen, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "Get LLDP MIB (remote) AQ command failed =0x%x\n", + pf->hw.aq.asq_last_status); + kfree(buff); + buff = NULL; + goto command_write_done; + } + dev_info(&pf->pdev->dev, "LLDP MIB (remote)\n"); + print_hex_dump(KERN_INFO, "LLDP MIB (remote): ", + DUMP_PREFIX_OFFSET, 16, 1, + buff, I40E_LLDPDU_SIZE, true); + kfree(buff); + buff = NULL; + } else if (strncmp(&cmd_buf[5], "event on", 8) == 0) { + int ret; + + ret = i40e_aq_cfg_lldp_mib_change_event(&pf->hw, + true, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "Config LLDP MIB Change Event (on) AQ command failed =0x%x\n", + pf->hw.aq.asq_last_status); + goto command_write_done; + } + } else if (strncmp(&cmd_buf[5], "event off", 9) == 0) { + int ret; + + ret = i40e_aq_cfg_lldp_mib_change_event(&pf->hw, + false, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "Config LLDP MIB Change Event (off) AQ command failed =0x%x\n", + pf->hw.aq.asq_last_status); + goto command_write_done; + } + } + } else if (strncmp(cmd_buf, "nvm read", 8) == 0) { + u16 buffer_len, bytes; + u16 module; + u32 offset; + u16 *buff; + int ret; + + cnt = sscanf(&cmd_buf[8], "%hx %x %hx", + &module, &offset, &buffer_len); + if (cnt == 0) { + module = 0; + offset = 0; + buffer_len = 0; + } else if (cnt == 1) { + offset = 0; + buffer_len = 0; + } else if (cnt == 2) { + buffer_len = 0; + } else if (cnt > 3) { + dev_info(&pf->pdev->dev, + "nvm read: bad command string, cnt=%d\n", cnt); + goto command_write_done; + } + + /* set the max length */ + buffer_len = min_t(u16, buffer_len, 
I40E_MAX_AQ_BUF_SIZE/2);
+
+		bytes = 2 * buffer_len;
+
+		/* read at least 1k bytes, no more than 4kB */
+		bytes = clamp(bytes, (u16)1024, (u16)I40E_MAX_AQ_BUF_SIZE);
+		buff = kzalloc(bytes, GFP_KERNEL);
+		if (!buff)
+			goto command_write_done;
+
+		ret = i40e_acquire_nvm(&pf->hw, I40E_RESOURCE_READ);
+		if (ret) {
+			dev_info(&pf->pdev->dev,
+				 "Failed Acquiring NVM resource for read err=%d status=0x%x\n",
+				 ret, pf->hw.aq.asq_last_status);
+			kfree(buff);
+			goto command_write_done;
+		}
+
+		ret = i40e_aq_read_nvm(&pf->hw, module, (2 * offset),
+				       bytes, (u8 *)buff, true, NULL);
+		i40e_release_nvm(&pf->hw);
+		if (ret) {
+			dev_info(&pf->pdev->dev,
+				 "Read NVM AQ failed err=%d status=0x%x\n",
+				 ret, pf->hw.aq.asq_last_status);
+		} else {
+			dev_info(&pf->pdev->dev,
+				 "Read NVM module=0x%x offset=0x%x words=%d\n",
+				 module, offset, buffer_len);
+			if (bytes)
+				print_hex_dump(KERN_INFO, "NVM Dump: ",
+					       DUMP_PREFIX_OFFSET, 16, 2,
+					       buff, bytes, true);
+		}
+		kfree(buff);
+		buff = NULL;
+	} else {
+		dev_info(&pf->pdev->dev, "unknown command '%s'\n", cmd_buf);
+		dev_info(&pf->pdev->dev, "available commands\n");
+		dev_info(&pf->pdev->dev, " add vsi [relay_seid]\n");
+		dev_info(&pf->pdev->dev, " del vsi [vsi_seid]\n");
+		dev_info(&pf->pdev->dev, " add relay <uplink_seid> <vsi_seid>\n");
+		dev_info(&pf->pdev->dev, " del relay <relay_seid>\n");
+		dev_info(&pf->pdev->dev, " add pvid <vsi_seid> <vid>\n");
+		dev_info(&pf->pdev->dev, " del pvid <vsi_seid>\n");
+		dev_info(&pf->pdev->dev, " dump switch\n");
+		dev_info(&pf->pdev->dev, " dump vsi [seid]\n");
+		dev_info(&pf->pdev->dev, " dump desc tx <vsi_seid> <ring_id> [<desc_n>]\n");
+		dev_info(&pf->pdev->dev, " dump desc rx <vsi_seid> <ring_id> [<desc_n>]\n");
+		dev_info(&pf->pdev->dev, " dump desc aq\n");
+		dev_info(&pf->pdev->dev, " dump reset stats\n");
+		dev_info(&pf->pdev->dev, " dump debug fwdata <cluster_id> <table_id> <index>\n");
+		dev_info(&pf->pdev->dev, " read <reg>\n");
+		dev_info(&pf->pdev->dev, " write <reg> <value>\n");
+		dev_info(&pf->pdev->dev, " clear_stats vsi [seid]\n");
+		dev_info(&pf->pdev->dev, " clear_stats port\n");
+		dev_info(&pf->pdev->dev, " pfr\n");
+		dev_info(&pf->pdev->dev, " corer\n");
+		dev_info(&pf->pdev->dev, " globr\n");
+		dev_info(&pf->pdev->dev, " send aq_cmd <flags> <opcode> <datalen> <retval> <cookie_h> <cookie_l> <param0> <param1> <param2> <param3>\n");
+		dev_info(&pf->pdev->dev, " send indirect aq_cmd <flags> <opcode> <datalen> <retval> <cookie_h> <cookie_l> <param0> <param1> <param2> <param3> <buffer_len>\n");
+		dev_info(&pf->pdev->dev, " fd current cnt");
+		dev_info(&pf->pdev->dev, " lldp start\n");
+		dev_info(&pf->pdev->dev, " lldp stop\n");
+		dev_info(&pf->pdev->dev, " lldp get local\n");
+		dev_info(&pf->pdev->dev, " lldp get remote\n");
+		dev_info(&pf->pdev->dev, " lldp event on\n");
+		dev_info(&pf->pdev->dev, " lldp event off\n");
+		dev_info(&pf->pdev->dev, " nvm read [module] [word_offset] [word_count]\n");
+	}
+
+command_write_done:
+	kfree(cmd_buf);
+	cmd_buf = NULL;
+	return count;
+}
+
+static const struct file_operations i40e_dbg_command_fops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = i40e_dbg_command_read,
+	.write = i40e_dbg_command_write,
+};
+
+/**************************************************************
+ * netdev_ops
+ * The netdev_ops entry in debugfs is for giving the driver commands
+ * to be executed from the netdev operations.
+ **************************************************************/ +static char i40e_dbg_netdev_ops_buf[256] = ""; + +/** + * i40e_dbg_netdev_ops - read for netdev_ops datum + * @filp: the opened file + * @buffer: where to write the data for the user to read + * @count: the size of the user's buffer + * @ppos: file position offset + **/ +static ssize_t i40e_dbg_netdev_ops_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct i40e_pf *pf = filp->private_data; + int bytes_not_copied; + int buf_size = 256; + char *buf; + int len; + + /* don't allow partal reads */ + if (*ppos != 0) + return 0; + if (count < buf_size) + return -ENOSPC; + + buf = kzalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOSPC; + + len = snprintf(buf, buf_size, "%s: %s\n", + pf->vsi[pf->lan_vsi]->netdev->name, + i40e_dbg_netdev_ops_buf); + + bytes_not_copied = copy_to_user(buffer, buf, len); + kfree(buf); + + if (bytes_not_copied) + return -EFAULT; + + *ppos = len; + return len; +} + +/** + * i40e_dbg_netdev_ops_write - write into netdev_ops datum + * @filp: the opened file + * @buffer: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + **/ +static ssize_t i40e_dbg_netdev_ops_write(struct file *filp, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct i40e_pf *pf = filp->private_data; + int bytes_not_copied; + struct i40e_vsi *vsi; + char *buf_tmp; + int vsi_seid; + int i, cnt; + + /* don't allow partial writes */ + if (*ppos != 0) + return 0; + if (count >= sizeof(i40e_dbg_netdev_ops_buf)) + return -ENOSPC; + + memset(i40e_dbg_netdev_ops_buf, 0, sizeof(i40e_dbg_netdev_ops_buf)); + bytes_not_copied = copy_from_user(i40e_dbg_netdev_ops_buf, + buffer, count); + if (bytes_not_copied) + return -EFAULT; + i40e_dbg_netdev_ops_buf[count] = '\0'; + + buf_tmp = strchr(i40e_dbg_netdev_ops_buf, '\n'); + if (buf_tmp) { + *buf_tmp = '\0'; + count = buf_tmp - i40e_dbg_netdev_ops_buf + 1; + } + + if (strncmp(i40e_dbg_netdev_ops_buf, "tx_timeout", 10) == 0) { + cnt = sscanf(&i40e_dbg_netdev_ops_buf[11], "%i", &vsi_seid); + if (cnt != 1) { + dev_info(&pf->pdev->dev, "tx_timeout \n"); + goto netdev_ops_write_done; + } + vsi = i40e_dbg_find_vsi(pf, vsi_seid); + if (!vsi) { + dev_info(&pf->pdev->dev, + "tx_timeout: VSI %d not found\n", vsi_seid); + } else if (!vsi->netdev) { + dev_info(&pf->pdev->dev, "tx_timeout: no netdev for VSI %d\n", + vsi_seid); + } else if (test_bit(__I40E_VSI_DOWN, vsi->state)) { + dev_info(&pf->pdev->dev, "tx_timeout: VSI %d not UP\n", + vsi_seid); + } else if (rtnl_trylock()) { + vsi->netdev->netdev_ops->ndo_tx_timeout(vsi->netdev); + rtnl_unlock(); + dev_info(&pf->pdev->dev, "tx_timeout called\n"); + } else { + dev_info(&pf->pdev->dev, "Could not acquire RTNL - please try again\n"); + } + } else if (strncmp(i40e_dbg_netdev_ops_buf, "change_mtu", 10) == 0) { + int mtu; + + cnt = sscanf(&i40e_dbg_netdev_ops_buf[11], "%i %i", + &vsi_seid, &mtu); + if (cnt != 2) { + dev_info(&pf->pdev->dev, "change_mtu \n"); + goto netdev_ops_write_done; + } + vsi = i40e_dbg_find_vsi(pf, vsi_seid); + if (!vsi) { + dev_info(&pf->pdev->dev, + "change_mtu: VSI %d not found\n", vsi_seid); + } else if (!vsi->netdev) { + dev_info(&pf->pdev->dev, "change_mtu: no netdev for VSI %d\n", + vsi_seid); + } else if (rtnl_trylock()) { + vsi->netdev->netdev_ops->ndo_change_mtu(vsi->netdev, + mtu); + rtnl_unlock(); + dev_info(&pf->pdev->dev, "change_mtu called\n"); + } else { + dev_info(&pf->pdev->dev, "Could not acquire RTNL - please try 
again\n"); + } + + } else if (strncmp(i40e_dbg_netdev_ops_buf, "set_rx_mode", 11) == 0) { + cnt = sscanf(&i40e_dbg_netdev_ops_buf[11], "%i", &vsi_seid); + if (cnt != 1) { + dev_info(&pf->pdev->dev, "set_rx_mode \n"); + goto netdev_ops_write_done; + } + vsi = i40e_dbg_find_vsi(pf, vsi_seid); + if (!vsi) { + dev_info(&pf->pdev->dev, + "set_rx_mode: VSI %d not found\n", vsi_seid); + } else if (!vsi->netdev) { + dev_info(&pf->pdev->dev, "set_rx_mode: no netdev for VSI %d\n", + vsi_seid); + } else if (rtnl_trylock()) { + vsi->netdev->netdev_ops->ndo_set_rx_mode(vsi->netdev); + rtnl_unlock(); + dev_info(&pf->pdev->dev, "set_rx_mode called\n"); + } else { + dev_info(&pf->pdev->dev, "Could not acquire RTNL - please try again\n"); + } + + } else if (strncmp(i40e_dbg_netdev_ops_buf, "napi", 4) == 0) { + cnt = sscanf(&i40e_dbg_netdev_ops_buf[4], "%i", &vsi_seid); + if (cnt != 1) { + dev_info(&pf->pdev->dev, "napi \n"); + goto netdev_ops_write_done; + } + vsi = i40e_dbg_find_vsi(pf, vsi_seid); + if (!vsi) { + dev_info(&pf->pdev->dev, "napi: VSI %d not found\n", + vsi_seid); + } else if (!vsi->netdev) { + dev_info(&pf->pdev->dev, "napi: no netdev for VSI %d\n", + vsi_seid); + } else { + for (i = 0; i < vsi->num_q_vectors; i++) + napi_schedule(&vsi->q_vectors[i]->napi); + dev_info(&pf->pdev->dev, "napi called\n"); + } + } else { + dev_info(&pf->pdev->dev, "unknown command '%s'\n", + i40e_dbg_netdev_ops_buf); + dev_info(&pf->pdev->dev, "available commands\n"); + dev_info(&pf->pdev->dev, " tx_timeout \n"); + dev_info(&pf->pdev->dev, " change_mtu \n"); + dev_info(&pf->pdev->dev, " set_rx_mode \n"); + dev_info(&pf->pdev->dev, " napi \n"); + } +netdev_ops_write_done: + return count; +} + +static const struct file_operations i40e_dbg_netdev_ops_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = i40e_dbg_netdev_ops_read, + .write = i40e_dbg_netdev_ops_write, +}; + +/** + * i40e_dbg_pf_init - setup the debugfs directory for the PF + * @pf: the PF that is starting up + **/ +void i40e_dbg_pf_init(struct i40e_pf *pf) +{ + struct dentry *pfile; + const char *name = pci_name(pf->pdev); + const struct device *dev = &pf->pdev->dev; + + pf->i40e_dbg_pf = debugfs_create_dir(name, i40e_dbg_root); + if (!pf->i40e_dbg_pf) + return; + + pfile = debugfs_create_file("command", 0600, pf->i40e_dbg_pf, pf, + &i40e_dbg_command_fops); + if (!pfile) + goto create_failed; + + pfile = debugfs_create_file("netdev_ops", 0600, pf->i40e_dbg_pf, pf, + &i40e_dbg_netdev_ops_fops); + if (!pfile) + goto create_failed; + + return; + +create_failed: + dev_info(dev, "debugfs dir/file for %s failed\n", name); + debugfs_remove_recursive(pf->i40e_dbg_pf); +} + +/** + * i40e_dbg_pf_exit - clear out the PF's debugfs entries + * @pf: the PF that is stopping + **/ +void i40e_dbg_pf_exit(struct i40e_pf *pf) +{ + debugfs_remove_recursive(pf->i40e_dbg_pf); + pf->i40e_dbg_pf = NULL; +} + +/** + * i40e_dbg_init - start up debugfs for the driver + **/ +void i40e_dbg_init(void) +{ + i40e_dbg_root = debugfs_create_dir(i40e_driver_name, NULL); + if (!i40e_dbg_root) + pr_info("init of debugfs failed\n"); +} + +/** + * i40e_dbg_exit - clean out the driver's debugfs entries + **/ +void i40e_dbg_exit(void) +{ + debugfs_remove_recursive(i40e_dbg_root); + i40e_dbg_root = NULL; +} + +#endif /* CONFIG_DEBUG_FS */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_devids.h b/drivers/net/ethernet/intel/i40e/i40e_devids.h new file mode 100644 index 0000000..ad6a66c --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_devids.h @@ -0,0 +1,56 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_DEVIDS_H_ +#define _I40E_DEVIDS_H_ + +/* Device IDs */ +#define I40E_DEV_ID_SFP_XL710 0x1572 +#define I40E_DEV_ID_QEMU 0x1574 +#define I40E_DEV_ID_KX_B 0x1580 +#define I40E_DEV_ID_KX_C 0x1581 +#define I40E_DEV_ID_QSFP_A 0x1583 +#define I40E_DEV_ID_QSFP_B 0x1584 +#define I40E_DEV_ID_QSFP_C 0x1585 +#define I40E_DEV_ID_10G_BASE_T 0x1586 +#define I40E_DEV_ID_20G_KR2 0x1587 +#define I40E_DEV_ID_20G_KR2_A 0x1588 +#define I40E_DEV_ID_10G_BASE_T4 0x1589 +#define I40E_DEV_ID_25G_B 0x158A +#define I40E_DEV_ID_25G_SFP28 0x158B +#define I40E_DEV_ID_KX_X722 0x37CE +#define I40E_DEV_ID_QSFP_X722 0x37CF +#define I40E_DEV_ID_SFP_X722 0x37D0 +#define I40E_DEV_ID_1G_BASE_T_X722 0x37D1 +#define I40E_DEV_ID_10G_BASE_T_X722 0x37D2 +#define I40E_DEV_ID_SFP_I_X722 0x37D3 + +#define i40e_is_40G_device(d) ((d) == I40E_DEV_ID_QSFP_A || \ + (d) == I40E_DEV_ID_QSFP_B || \ + (d) == I40E_DEV_ID_QSFP_C) + +#endif /* _I40E_DEVIDS_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_diag.c b/drivers/net/ethernet/intel/i40e/i40e_diag.c new file mode 100644 index 0000000..df3e604 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_diag.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include "i40e_diag.h" +#include "i40e_prototype.h" + +/** + * i40e_diag_reg_pattern_test + * @hw: pointer to the hw struct + * @reg: reg to be tested + * @mask: bits to be touched + **/ +static i40e_status i40e_diag_reg_pattern_test(struct i40e_hw *hw, + u32 reg, u32 mask) +{ + static const u32 patterns[] = { + 0x5A5A5A5A, 0xA5A5A5A5, 0x00000000, 0xFFFFFFFF + }; + u32 pat, val, orig_val; + int i; + + orig_val = rd32(hw, reg); + for (i = 0; i < ARRAY_SIZE(patterns); i++) { + pat = patterns[i]; + wr32(hw, reg, (pat & mask)); + val = rd32(hw, reg); + if ((val & mask) != (pat & mask)) { + i40e_debug(hw, I40E_DEBUG_DIAG, + "%s: reg pattern test failed - reg 0x%08x pat 0x%08x val 0x%08x\n", + __func__, reg, pat, val); + return I40E_ERR_DIAG_TEST_FAILED; + } + } + + wr32(hw, reg, orig_val); + val = rd32(hw, reg); + if (val != orig_val) { + i40e_debug(hw, I40E_DEBUG_DIAG, + "%s: reg restore test failed - reg 0x%08x orig_val 0x%08x val 0x%08x\n", + __func__, reg, orig_val, val); + return I40E_ERR_DIAG_TEST_FAILED; + } + + return 0; +} + +struct i40e_diag_reg_test_info i40e_reg_list[] = { + /* offset mask elements stride */ + {I40E_QTX_CTL(0), 0x0000FFBF, 1, + I40E_QTX_CTL(1) - I40E_QTX_CTL(0)}, + {I40E_PFINT_ITR0(0), 0x00000FFF, 3, + I40E_PFINT_ITR0(1) - I40E_PFINT_ITR0(0)}, + {I40E_PFINT_ITRN(0, 0), 0x00000FFF, 1, + I40E_PFINT_ITRN(0, 1) - I40E_PFINT_ITRN(0, 0)}, + {I40E_PFINT_ITRN(1, 0), 0x00000FFF, 1, + I40E_PFINT_ITRN(1, 1) - I40E_PFINT_ITRN(1, 0)}, + {I40E_PFINT_ITRN(2, 0), 0x00000FFF, 1, + I40E_PFINT_ITRN(2, 1) - I40E_PFINT_ITRN(2, 0)}, + {I40E_PFINT_STAT_CTL0, 0x0000000C, 1, 0}, + {I40E_PFINT_LNKLST0, 0x00001FFF, 1, 0}, + {I40E_PFINT_LNKLSTN(0), 0x000007FF, 1, + I40E_PFINT_LNKLSTN(1) - I40E_PFINT_LNKLSTN(0)}, + {I40E_QINT_TQCTL(0), 0x000000FF, 1, + I40E_QINT_TQCTL(1) - I40E_QINT_TQCTL(0)}, + {I40E_QINT_RQCTL(0), 0x000000FF, 1, + I40E_QINT_RQCTL(1) - I40E_QINT_RQCTL(0)}, + {I40E_PFINT_ICR0_ENA, 0xF7F20000, 1, 0}, + { 0 } +}; + +/** + * i40e_diag_reg_test + * @hw: pointer to the hw struct + * + * Perform registers diagnostic test + **/ +i40e_status i40e_diag_reg_test(struct i40e_hw *hw) +{ + i40e_status ret_code = 0; + u32 reg, mask; + u32 i, j; + + for (i = 0; i40e_reg_list[i].offset != 0 && + !ret_code; i++) { + + /* set actual reg range for dynamically allocated resources */ + if (i40e_reg_list[i].offset == I40E_QTX_CTL(0) && + hw->func_caps.num_tx_qp != 0) + i40e_reg_list[i].elements = hw->func_caps.num_tx_qp; + if ((i40e_reg_list[i].offset == I40E_PFINT_ITRN(0, 0) || + i40e_reg_list[i].offset == I40E_PFINT_ITRN(1, 0) || + i40e_reg_list[i].offset == I40E_PFINT_ITRN(2, 0) || + i40e_reg_list[i].offset == I40E_QINT_TQCTL(0) || + i40e_reg_list[i].offset == I40E_QINT_RQCTL(0)) && + hw->func_caps.num_msix_vectors != 0) + i40e_reg_list[i].elements = + hw->func_caps.num_msix_vectors - 1; + + /* test register access */ + mask = i40e_reg_list[i].mask; + for (j = 0; j < i40e_reg_list[i].elements && !ret_code; j++) { + reg = i40e_reg_list[i].offset + + (j * i40e_reg_list[i].stride); + ret_code = i40e_diag_reg_pattern_test(hw, reg, mask); + } + } + + return ret_code; +} + +/** + * i40e_diag_eeprom_test + * @hw: pointer to the hw struct + * + * Perform EEPROM diagnostic test + **/ +i40e_status i40e_diag_eeprom_test(struct i40e_hw *hw) +{ + i40e_status ret_code; + u16 reg_val; + + /* read NVM control word and if NVM valid, validate EEPROM checksum*/ + ret_code = 
i40e_read_nvm_word(hw, I40E_SR_NVM_CONTROL_WORD, &reg_val);
+	if (!ret_code &&
+	    ((reg_val & I40E_SR_CONTROL_WORD_1_MASK) ==
+	     BIT(I40E_SR_CONTROL_WORD_1_SHIFT)))
+		return i40e_validate_nvm_checksum(hw, NULL);
+	else
+		return I40E_ERR_DIAG_TEST_FAILED;
+}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_diag.h b/drivers/net/ethernet/intel/i40e/i40e_diag.h
new file mode 100644
index 0000000..be83417
--- /dev/null
+++ b/drivers/net/ethernet/intel/i40e/i40e_diag.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*******************************************************************************
+ *
+ * Intel Ethernet Controller XL710 Family Linux Driver
+ * Copyright(c) 2013 - 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Contact Information:
+ * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+ * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+ *
+ ******************************************************************************/
+
+#ifndef _I40E_DIAG_H_
+#define _I40E_DIAG_H_
+
+#include "i40e_type.h"
+
+enum i40e_lb_mode {
+	I40E_LB_MODE_NONE = 0x0,
+	I40E_LB_MODE_PHY_LOCAL = I40E_AQ_LB_PHY_LOCAL,
+	I40E_LB_MODE_PHY_REMOTE = I40E_AQ_LB_PHY_REMOTE,
+	I40E_LB_MODE_MAC_LOCAL = I40E_AQ_LB_MAC_LOCAL,
+};
+
+struct i40e_diag_reg_test_info {
+	u32 offset;	/* the base register */
+	u32 mask;	/* bits that can be tested */
+	u32 elements;	/* number of elements if array */
+	u32 stride;	/* bytes between each element */
+};
+
+extern struct i40e_diag_reg_test_info i40e_reg_list[];
+
+i40e_status i40e_diag_reg_test(struct i40e_hw *hw);
+i40e_status i40e_diag_eeprom_test(struct i40e_hw *hw);
+
+#endif /* _I40E_DIAG_H_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
new file mode 100644
index 0000000..b974482
--- /dev/null
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -0,0 +1,4717 @@
+// SPDX-License-Identifier: GPL-2.0
+/*******************************************************************************
+ *
+ * Intel Ethernet Controller XL710 Family Linux Driver
+ * Copyright(c) 2013 - 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +/* ethtool support for i40e */ + +#include "i40e.h" +#include "i40e_diag.h" + +struct i40e_stats { + char stat_string[ETH_GSTRING_LEN]; + int sizeof_stat; + int stat_offset; +}; + +#define I40E_STAT(_type, _name, _stat) { \ + .stat_string = _name, \ + .sizeof_stat = FIELD_SIZEOF(_type, _stat), \ + .stat_offset = offsetof(_type, _stat) \ +} + +#define I40E_NETDEV_STAT(_net_stat) \ + I40E_STAT(struct rtnl_link_stats64, #_net_stat, _net_stat) +#define I40E_PF_STAT(_name, _stat) \ + I40E_STAT(struct i40e_pf, _name, _stat) +#define I40E_VSI_STAT(_name, _stat) \ + I40E_STAT(struct i40e_vsi, _name, _stat) +#define I40E_VEB_STAT(_name, _stat) \ + I40E_STAT(struct i40e_veb, _name, _stat) + +static const struct i40e_stats i40e_gstrings_net_stats[] = { + I40E_NETDEV_STAT(rx_packets), + I40E_NETDEV_STAT(tx_packets), + I40E_NETDEV_STAT(rx_bytes), + I40E_NETDEV_STAT(tx_bytes), + I40E_NETDEV_STAT(rx_errors), + I40E_NETDEV_STAT(tx_errors), + I40E_NETDEV_STAT(rx_dropped), + I40E_NETDEV_STAT(tx_dropped), + I40E_NETDEV_STAT(collisions), + I40E_NETDEV_STAT(rx_length_errors), + I40E_NETDEV_STAT(rx_crc_errors), +}; + +static const struct i40e_stats i40e_gstrings_veb_stats[] = { + I40E_VEB_STAT("rx_bytes", stats.rx_bytes), + I40E_VEB_STAT("tx_bytes", stats.tx_bytes), + I40E_VEB_STAT("rx_unicast", stats.rx_unicast), + I40E_VEB_STAT("tx_unicast", stats.tx_unicast), + I40E_VEB_STAT("rx_multicast", stats.rx_multicast), + I40E_VEB_STAT("tx_multicast", stats.tx_multicast), + I40E_VEB_STAT("rx_broadcast", stats.rx_broadcast), + I40E_VEB_STAT("tx_broadcast", stats.tx_broadcast), + I40E_VEB_STAT("rx_discards", stats.rx_discards), + I40E_VEB_STAT("tx_discards", stats.tx_discards), + I40E_VEB_STAT("tx_errors", stats.tx_errors), + I40E_VEB_STAT("rx_unknown_protocol", stats.rx_unknown_protocol), +}; + +static const struct i40e_stats i40e_gstrings_misc_stats[] = { + I40E_VSI_STAT("rx_unicast", eth_stats.rx_unicast), + I40E_VSI_STAT("tx_unicast", eth_stats.tx_unicast), + I40E_VSI_STAT("rx_multicast", eth_stats.rx_multicast), + I40E_VSI_STAT("tx_multicast", eth_stats.tx_multicast), + I40E_VSI_STAT("rx_broadcast", eth_stats.rx_broadcast), + I40E_VSI_STAT("tx_broadcast", eth_stats.tx_broadcast), + I40E_VSI_STAT("rx_unknown_protocol", eth_stats.rx_unknown_protocol), + I40E_VSI_STAT("tx_linearize", tx_linearize), + I40E_VSI_STAT("tx_force_wb", tx_force_wb), + I40E_VSI_STAT("rx_alloc_fail", rx_buf_failed), + I40E_VSI_STAT("rx_pg_alloc_fail", rx_page_failed), +}; + +/* These PF_STATs might look like duplicates of some NETDEV_STATs, + * but they are separate. This device supports Virtualization, and + * as such might have several netdevs supporting VMDq and FCoE going + * through a single port. The NETDEV_STATs are for individual netdevs + * seen at the top of the stack, and the PF_STATs are for the physical + * function at the bottom of the stack hosting those netdevs. + * + * The PF_STATs are appended to the netdev stats only when ethtool -S + * is queried on the base PF netdev, not on the VMDq or FCoE netdev. 
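 + * + * Note: the stat strings emitted by i40e_get_strings() and the values + * emitted by i40e_get_ethtool_stats() below walk these same tables in + * the same order, so the two functions must be kept in sync.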
+ */ +static const struct i40e_stats i40e_gstrings_stats[] = { + I40E_PF_STAT("rx_bytes", stats.eth.rx_bytes), + I40E_PF_STAT("tx_bytes", stats.eth.tx_bytes), + I40E_PF_STAT("rx_unicast", stats.eth.rx_unicast), + I40E_PF_STAT("tx_unicast", stats.eth.tx_unicast), + I40E_PF_STAT("rx_multicast", stats.eth.rx_multicast), + I40E_PF_STAT("tx_multicast", stats.eth.tx_multicast), + I40E_PF_STAT("rx_broadcast", stats.eth.rx_broadcast), + I40E_PF_STAT("tx_broadcast", stats.eth.tx_broadcast), + I40E_PF_STAT("tx_errors", stats.eth.tx_errors), + I40E_PF_STAT("rx_dropped", stats.eth.rx_discards), + I40E_PF_STAT("tx_dropped_link_down", stats.tx_dropped_link_down), + I40E_PF_STAT("rx_crc_errors", stats.crc_errors), + I40E_PF_STAT("illegal_bytes", stats.illegal_bytes), + I40E_PF_STAT("mac_local_faults", stats.mac_local_faults), + I40E_PF_STAT("mac_remote_faults", stats.mac_remote_faults), + I40E_PF_STAT("tx_timeout", tx_timeout_count), + I40E_PF_STAT("rx_csum_bad", hw_csum_rx_error), + I40E_PF_STAT("rx_length_errors", stats.rx_length_errors), + I40E_PF_STAT("link_xon_rx", stats.link_xon_rx), + I40E_PF_STAT("link_xoff_rx", stats.link_xoff_rx), + I40E_PF_STAT("link_xon_tx", stats.link_xon_tx), + I40E_PF_STAT("link_xoff_tx", stats.link_xoff_tx), + I40E_PF_STAT("priority_xon_rx", stats.priority_xon_rx), + I40E_PF_STAT("priority_xoff_rx", stats.priority_xoff_rx), + I40E_PF_STAT("priority_xon_tx", stats.priority_xon_tx), + I40E_PF_STAT("priority_xoff_tx", stats.priority_xoff_tx), + I40E_PF_STAT("rx_size_64", stats.rx_size_64), + I40E_PF_STAT("rx_size_127", stats.rx_size_127), + I40E_PF_STAT("rx_size_255", stats.rx_size_255), + I40E_PF_STAT("rx_size_511", stats.rx_size_511), + I40E_PF_STAT("rx_size_1023", stats.rx_size_1023), + I40E_PF_STAT("rx_size_1522", stats.rx_size_1522), + I40E_PF_STAT("rx_size_big", stats.rx_size_big), + I40E_PF_STAT("tx_size_64", stats.tx_size_64), + I40E_PF_STAT("tx_size_127", stats.tx_size_127), + I40E_PF_STAT("tx_size_255", stats.tx_size_255), + I40E_PF_STAT("tx_size_511", stats.tx_size_511), + I40E_PF_STAT("tx_size_1023", stats.tx_size_1023), + I40E_PF_STAT("tx_size_1522", stats.tx_size_1522), + I40E_PF_STAT("tx_size_big", stats.tx_size_big), + I40E_PF_STAT("rx_undersize", stats.rx_undersize), + I40E_PF_STAT("rx_fragments", stats.rx_fragments), + I40E_PF_STAT("rx_oversize", stats.rx_oversize), + I40E_PF_STAT("rx_jabber", stats.rx_jabber), + I40E_PF_STAT("VF_admin_queue_requests", vf_aq_requests), + I40E_PF_STAT("arq_overflows", arq_overflows), + I40E_PF_STAT("rx_hwtstamp_cleared", rx_hwtstamp_cleared), + I40E_PF_STAT("tx_hwtstamp_skipped", tx_hwtstamp_skipped), + I40E_PF_STAT("fdir_flush_cnt", fd_flush_cnt), + I40E_PF_STAT("fdir_atr_match", stats.fd_atr_match), + I40E_PF_STAT("fdir_atr_tunnel_match", stats.fd_atr_tunnel_match), + I40E_PF_STAT("fdir_atr_status", stats.fd_atr_status), + I40E_PF_STAT("fdir_sb_match", stats.fd_sb_match), + I40E_PF_STAT("fdir_sb_status", stats.fd_sb_status), + + /* LPI stats */ + I40E_PF_STAT("tx_lpi_status", stats.tx_lpi_status), + I40E_PF_STAT("rx_lpi_status", stats.rx_lpi_status), + I40E_PF_STAT("tx_lpi_count", stats.tx_lpi_count), + I40E_PF_STAT("rx_lpi_count", stats.rx_lpi_count), +}; + +#define I40E_QUEUE_STATS_LEN(n) \ + (((struct i40e_netdev_priv *)netdev_priv((n)))->vsi->num_queue_pairs \ + * 2 /* Tx and Rx together */ \ + * (sizeof(struct i40e_queue_stats) / sizeof(u64))) +#define I40E_GLOBAL_STATS_LEN ARRAY_SIZE(i40e_gstrings_stats) +#define I40E_NETDEV_STATS_LEN ARRAY_SIZE(i40e_gstrings_net_stats) +#define I40E_MISC_STATS_LEN 
ARRAY_SIZE(i40e_gstrings_misc_stats) +#define I40E_VSI_STATS_LEN(n) (I40E_NETDEV_STATS_LEN + \ + I40E_MISC_STATS_LEN + \ + I40E_QUEUE_STATS_LEN((n))) +#define I40E_PFC_STATS_LEN ( \ + (FIELD_SIZEOF(struct i40e_pf, stats.priority_xoff_rx) + \ + FIELD_SIZEOF(struct i40e_pf, stats.priority_xon_rx) + \ + FIELD_SIZEOF(struct i40e_pf, stats.priority_xoff_tx) + \ + FIELD_SIZEOF(struct i40e_pf, stats.priority_xon_tx) + \ + FIELD_SIZEOF(struct i40e_pf, stats.priority_xon_2_xoff)) \ + / sizeof(u64)) +#define I40E_VEB_TC_STATS_LEN ( \ + (FIELD_SIZEOF(struct i40e_veb, tc_stats.tc_rx_packets) + \ + FIELD_SIZEOF(struct i40e_veb, tc_stats.tc_rx_bytes) + \ + FIELD_SIZEOF(struct i40e_veb, tc_stats.tc_tx_packets) + \ + FIELD_SIZEOF(struct i40e_veb, tc_stats.tc_tx_bytes)) \ + / sizeof(u64)) +#define I40E_VEB_STATS_LEN ARRAY_SIZE(i40e_gstrings_veb_stats) +#define I40E_VEB_STATS_TOTAL (I40E_VEB_STATS_LEN + I40E_VEB_TC_STATS_LEN) +#define I40E_PF_STATS_LEN(n) (I40E_GLOBAL_STATS_LEN + \ + I40E_PFC_STATS_LEN + \ + I40E_VSI_STATS_LEN((n))) + +enum i40e_ethtool_test_id { + I40E_ETH_TEST_REG = 0, + I40E_ETH_TEST_EEPROM, + I40E_ETH_TEST_INTR, + I40E_ETH_TEST_LINK, +}; + +static const char i40e_gstrings_test[][ETH_GSTRING_LEN] = { + "Register test (offline)", + "Eeprom test (offline)", + "Interrupt test (offline)", + "Link test (on/offline)" +}; + +#define I40E_TEST_LEN (sizeof(i40e_gstrings_test) / ETH_GSTRING_LEN) + +struct i40e_priv_flags { + char flag_string[ETH_GSTRING_LEN]; + u64 flag; + bool read_only; +}; + +#define I40E_PRIV_FLAG(_name, _flag, _read_only) { \ + .flag_string = _name, \ + .flag = _flag, \ + .read_only = _read_only, \ +} + +static const struct i40e_priv_flags i40e_gstrings_priv_flags[] = { + /* NOTE: MFP setting cannot be changed */ + I40E_PRIV_FLAG("MFP", I40E_FLAG_MFP_ENABLED, 1), + I40E_PRIV_FLAG("LinkPolling", I40E_FLAG_LINK_POLLING_ENABLED, 0), + I40E_PRIV_FLAG("flow-director-atr", I40E_FLAG_FD_ATR_ENABLED, 0), + I40E_PRIV_FLAG("veb-stats", I40E_FLAG_VEB_STATS_ENABLED, 0), + I40E_PRIV_FLAG("hw-atr-eviction", I40E_FLAG_HW_ATR_EVICT_ENABLED, 0), + I40E_PRIV_FLAG("link-down-on-close", + I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED, 0), + I40E_PRIV_FLAG("legacy-rx", I40E_FLAG_LEGACY_RX, 0), + I40E_PRIV_FLAG("disable-source-pruning", + I40E_FLAG_SOURCE_PRUNING_DISABLED, 0), + I40E_PRIV_FLAG("disable-fw-lldp", I40E_FLAG_DISABLE_FW_LLDP, 0), +}; + +#define I40E_PRIV_FLAGS_STR_LEN ARRAY_SIZE(i40e_gstrings_priv_flags) + +/* Private flags with a global effect, restricted to PF 0 */ +static const struct i40e_priv_flags i40e_gl_gstrings_priv_flags[] = { + I40E_PRIV_FLAG("vf-true-promisc-support", + I40E_FLAG_TRUE_PROMISC_SUPPORT, 0), +}; + +#define I40E_GL_PRIV_FLAGS_STR_LEN ARRAY_SIZE(i40e_gl_gstrings_priv_flags) + +/** + * i40e_partition_setting_complaint - generic complaint for MFP restriction + * @pf: the PF struct + **/ +static void i40e_partition_setting_complaint(struct i40e_pf *pf) +{ + dev_info(&pf->pdev->dev, + "The link settings are allowed to be changed only from the first partition of a given port. 
Please switch to the first partition in order to change the setting.\n"); +} + +/** + * i40e_phy_type_to_ethtool - convert the phy_types to ethtool link modes + * @pf: PF struct with phy_types + * @ks: ethtool link ksettings struct to fill out + * + **/ +static void i40e_phy_type_to_ethtool(struct i40e_pf *pf, + struct ethtool_link_ksettings *ks) +{ + struct i40e_link_status *hw_link_info = &pf->hw.phy.link_info; + u64 phy_types = pf->hw.phy.phy_types; + + ethtool_link_ksettings_zero_link_mode(ks, supported); + ethtool_link_ksettings_zero_link_mode(ks, advertising); + + if (phy_types & I40E_CAP_PHY_TYPE_SGMII) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseT_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_1GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 1000baseT_Full); + if (pf->hw_features & I40E_HW_100M_SGMII_CAPABLE) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100baseT_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 100baseT_Full); + } + } + if (phy_types & I40E_CAP_PHY_TYPE_XAUI || + phy_types & I40E_CAP_PHY_TYPE_XFI || + phy_types & I40E_CAP_PHY_TYPE_SFI || + phy_types & I40E_CAP_PHY_TYPE_10GBASE_SFPP_CU || + phy_types & I40E_CAP_PHY_TYPE_10GBASE_AOC) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseT_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseT_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_10GBASE_T) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseT_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseT_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_XLAUI || + phy_types & I40E_CAP_PHY_TYPE_XLPPI || + phy_types & I40E_CAP_PHY_TYPE_40GBASE_AOC) + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseCR4_Full); + if (phy_types & I40E_CAP_PHY_TYPE_40GBASE_CR4_CU || + phy_types & I40E_CAP_PHY_TYPE_40GBASE_CR4) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseCR4_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_40GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 40000baseCR4_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_100BASE_TX) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100baseT_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_100MB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 100baseT_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_1000BASE_T) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseT_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_1GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 1000baseT_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_40GBASE_SR4) + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseSR4_Full); + if (phy_types & I40E_CAP_PHY_TYPE_40GBASE_LR4) + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseLR4_Full); + if (phy_types & I40E_CAP_PHY_TYPE_40GBASE_KR4) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseLR4_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 40000baseLR4_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_20GBASE_KR2) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 20000baseKR2_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_20GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 20000baseKR2_Full); + } + if (phy_types & 
I40E_CAP_PHY_TYPE_10GBASE_KX4) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseKX4_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseKX4_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_10GBASE_KR && + !(pf->hw_features & I40E_HW_HAVE_CRT_RETIMER)) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseKR_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseKR_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_1000BASE_KX && + !(pf->hw_features & I40E_HW_HAVE_CRT_RETIMER)) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseKX_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_1GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 1000baseKX_Full); + } + /* need to add 25G PHY types */ + if (phy_types & I40E_CAP_PHY_TYPE_25GBASE_KR) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseKR_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_25GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 25000baseKR_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_25GBASE_CR) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseCR_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_25GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 25000baseCR_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_25GBASE_SR || + phy_types & I40E_CAP_PHY_TYPE_25GBASE_LR) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseSR_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_25GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 25000baseSR_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_25GBASE_AOC || + phy_types & I40E_CAP_PHY_TYPE_25GBASE_ACC) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseCR_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_25GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 25000baseCR_Full); + } + /* need to add new 10G PHY types */ + if (phy_types & I40E_CAP_PHY_TYPE_10GBASE_CR1 || + phy_types & I40E_CAP_PHY_TYPE_10GBASE_CR1_CU) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseCR_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseCR_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_10GBASE_SR) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseSR_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseSR_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_10GBASE_LR) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseLR_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseLR_Full); + } + if (phy_types & I40E_CAP_PHY_TYPE_1000BASE_SX || + phy_types & I40E_CAP_PHY_TYPE_1000BASE_LX || + phy_types & I40E_CAP_PHY_TYPE_1000BASE_T_OPTICAL) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseX_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_1GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 1000baseX_Full); + } + /* Autoneg PHY types */ + if (phy_types & I40E_CAP_PHY_TYPE_SGMII || + phy_types & I40E_CAP_PHY_TYPE_40GBASE_KR4 || + phy_types & I40E_CAP_PHY_TYPE_40GBASE_CR4_CU || + phy_types & I40E_CAP_PHY_TYPE_40GBASE_CR4 || + 
phy_types & I40E_CAP_PHY_TYPE_25GBASE_SR || + phy_types & I40E_CAP_PHY_TYPE_25GBASE_LR || + phy_types & I40E_CAP_PHY_TYPE_25GBASE_KR || + phy_types & I40E_CAP_PHY_TYPE_25GBASE_CR || + phy_types & I40E_CAP_PHY_TYPE_20GBASE_KR2 || + phy_types & I40E_CAP_PHY_TYPE_10GBASE_T || + phy_types & I40E_CAP_PHY_TYPE_10GBASE_SR || + phy_types & I40E_CAP_PHY_TYPE_10GBASE_LR || + phy_types & I40E_CAP_PHY_TYPE_10GBASE_KX4 || + phy_types & I40E_CAP_PHY_TYPE_10GBASE_KR || + phy_types & I40E_CAP_PHY_TYPE_10GBASE_CR1_CU || + phy_types & I40E_CAP_PHY_TYPE_10GBASE_CR1 || + phy_types & I40E_CAP_PHY_TYPE_1000BASE_T_OPTICAL || + phy_types & I40E_CAP_PHY_TYPE_1000BASE_T || + phy_types & I40E_CAP_PHY_TYPE_1000BASE_SX || + phy_types & I40E_CAP_PHY_TYPE_1000BASE_LX || + phy_types & I40E_CAP_PHY_TYPE_1000BASE_KX || + phy_types & I40E_CAP_PHY_TYPE_100BASE_TX) { + ethtool_link_ksettings_add_link_mode(ks, supported, + Autoneg); + ethtool_link_ksettings_add_link_mode(ks, advertising, + Autoneg); + } +} + +/** + * i40e_get_settings_link_up - Get the Link settings for when link is up + * @hw: hw structure + * @ks: ethtool ksettings to fill in + * @netdev: network interface device structure + * @pf: pointer to physical function struct + **/ +static void i40e_get_settings_link_up(struct i40e_hw *hw, + struct ethtool_link_ksettings *ks, + struct net_device *netdev, + struct i40e_pf *pf) +{ + struct i40e_link_status *hw_link_info = &hw->phy.link_info; + struct ethtool_link_ksettings cap_ksettings; + u32 link_speed = hw_link_info->link_speed; + + /* Initialize supported and advertised settings based on phy settings */ + switch (hw_link_info->phy_type) { + case I40E_PHY_TYPE_40GBASE_CR4: + case I40E_PHY_TYPE_40GBASE_CR4_CU: + ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseCR4_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 40000baseCR4_Full); + break; + case I40E_PHY_TYPE_XLAUI: + case I40E_PHY_TYPE_XLPPI: + case I40E_PHY_TYPE_40GBASE_AOC: + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseCR4_Full); + break; + case I40E_PHY_TYPE_40GBASE_SR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseSR4_Full); + break; + case I40E_PHY_TYPE_40GBASE_LR4: + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseLR4_Full); + break; + case I40E_PHY_TYPE_25GBASE_SR: + case I40E_PHY_TYPE_25GBASE_LR: + case I40E_PHY_TYPE_10GBASE_SR: + case I40E_PHY_TYPE_10GBASE_LR: + case I40E_PHY_TYPE_1000BASE_SX: + case I40E_PHY_TYPE_1000BASE_LX: + ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseSR_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 25000baseSR_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseSR_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseSR_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseLR_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseLR_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseX_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 1000baseX_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseT_Full); + if (hw_link_info->module_type[2] & + I40E_MODULE_TYPE_1000BASE_SX || + hw_link_info->module_type[2] & + 
I40E_MODULE_TYPE_1000BASE_LX) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseT_Full); + if (hw_link_info->requested_speeds & + I40E_LINK_SPEED_1GB) + ethtool_link_ksettings_add_link_mode( + ks, advertising, 1000baseT_Full); + } + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseT_Full); + break; + case I40E_PHY_TYPE_10GBASE_T: + case I40E_PHY_TYPE_1000BASE_T: + case I40E_PHY_TYPE_100BASE_TX: + ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseT_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseT_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 100baseT_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseT_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_1GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 1000baseT_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_100MB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 100baseT_Full); + break; + case I40E_PHY_TYPE_1000BASE_T_OPTICAL: + ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseT_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 1000baseT_Full); + break; + case I40E_PHY_TYPE_10GBASE_CR1_CU: + case I40E_PHY_TYPE_10GBASE_CR1: + ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseT_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseT_Full); + break; + case I40E_PHY_TYPE_XAUI: + case I40E_PHY_TYPE_XFI: + case I40E_PHY_TYPE_SFI: + case I40E_PHY_TYPE_10GBASE_SFPP_CU: + case I40E_PHY_TYPE_10GBASE_AOC: + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseT_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_10GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseT_Full); + break; + case I40E_PHY_TYPE_SGMII: + ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseT_Full); + if (hw_link_info->requested_speeds & I40E_LINK_SPEED_1GB) + ethtool_link_ksettings_add_link_mode(ks, advertising, + 1000baseT_Full); + if (pf->hw_features & I40E_HW_100M_SGMII_CAPABLE) { + ethtool_link_ksettings_add_link_mode(ks, supported, + 100baseT_Full); + if (hw_link_info->requested_speeds & + I40E_LINK_SPEED_100MB) + ethtool_link_ksettings_add_link_mode( + ks, advertising, 100baseT_Full); + } + break; + case I40E_PHY_TYPE_40GBASE_KR4: + case I40E_PHY_TYPE_25GBASE_KR: + case I40E_PHY_TYPE_20GBASE_KR2: + case I40E_PHY_TYPE_10GBASE_KR: + case I40E_PHY_TYPE_10GBASE_KX4: + case I40E_PHY_TYPE_1000BASE_KX: + ethtool_link_ksettings_add_link_mode(ks, supported, + 40000baseKR4_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseKR_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 20000baseKR2_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseKR_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseKX4_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 1000baseKX_Full); + 
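 + /* For backplane PHYs every KR/KX mode is reported as supported here; + * the set is trimmed after this switch by intersecting it with the + * NVM-reported PHY capabilities (i40e_phy_type_to_ethtool()). + */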
ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 40000baseKR4_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 25000baseKR_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 20000baseKR2_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseKR_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseKX4_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 1000baseKX_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); + break; + case I40E_PHY_TYPE_25GBASE_CR: + ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseCR_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 25000baseCR_Full); + break; + case I40E_PHY_TYPE_25GBASE_AOC: + case I40E_PHY_TYPE_25GBASE_ACC: + ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, supported, + 25000baseCR_Full); + + ethtool_link_ksettings_add_link_mode(ks, advertising, + 25000baseCR_Full); + ethtool_link_ksettings_add_link_mode(ks, supported, + 10000baseCR_Full); + ethtool_link_ksettings_add_link_mode(ks, advertising, + 10000baseCR_Full); + break; + default: + /* if we got here and link is up something bad is afoot */ + netdev_info(netdev, + "WARNING: Link is up but PHY type 0x%x is not recognized.\n", + hw_link_info->phy_type); + } + + /* Now that we've worked out everything that could be supported by the + * current PHY type, get what is supported by the NVM and intersect + * them to get what is truly supported + */ + memset(&cap_ksettings, 0, sizeof(struct ethtool_link_ksettings)); + i40e_phy_type_to_ethtool(pf, &cap_ksettings); + ethtool_intersect_link_masks(ks, &cap_ksettings); + + /* Set speed and duplex */ + switch (link_speed) { + case I40E_LINK_SPEED_40GB: + ks->base.speed = SPEED_40000; + break; + case I40E_LINK_SPEED_25GB: + ks->base.speed = SPEED_25000; + break; + case I40E_LINK_SPEED_20GB: + ks->base.speed = SPEED_20000; + break; + case I40E_LINK_SPEED_10GB: + ks->base.speed = SPEED_10000; + break; + case I40E_LINK_SPEED_1GB: + ks->base.speed = SPEED_1000; + break; + case I40E_LINK_SPEED_100MB: + ks->base.speed = SPEED_100; + break; + default: + break; + } + ks->base.duplex = DUPLEX_FULL; +} + +/** + * i40e_get_settings_link_down - Get the Link settings for when link is down + * @hw: hw structure + * @ks: ethtool ksettings to fill in + * @pf: pointer to physical function struct + * + * Reports link settings that can be determined when link is down + **/ +static void i40e_get_settings_link_down(struct i40e_hw *hw, + struct ethtool_link_ksettings *ks, + struct i40e_pf *pf) +{ + /* link is down and the driver needs to fall back on + * supported phy types to figure out what info to display + */ + i40e_phy_type_to_ethtool(pf, ks); + + /* With no link speed and duplex are unknown */ + ks->base.speed = SPEED_UNKNOWN; + ks->base.duplex = DUPLEX_UNKNOWN; +} + +/** + * i40e_get_link_ksettings - Get Link Speed and Duplex settings + * @netdev: network interface device structure + * @ks: ethtool ksettings + * + * Reports speed/duplex settings based on media_type + **/ +static int i40e_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *ks) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + 
struct i40e_pf *pf = np->vsi->back; + struct i40e_hw *hw = &pf->hw; + struct i40e_link_status *hw_link_info = &hw->phy.link_info; + bool link_up = hw_link_info->link_info & I40E_AQ_LINK_UP; + + ethtool_link_ksettings_zero_link_mode(ks, supported); + ethtool_link_ksettings_zero_link_mode(ks, advertising); + + if (link_up) + i40e_get_settings_link_up(hw, ks, netdev, pf); + else + i40e_get_settings_link_down(hw, ks, pf); + + /* Now set the settings that don't rely on link being up/down */ + /* Set autoneg settings */ + ks->base.autoneg = ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ? + AUTONEG_ENABLE : AUTONEG_DISABLE); + + /* Set media type settings */ + switch (hw->phy.media_type) { + case I40E_MEDIA_TYPE_BACKPLANE: + ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, supported, Backplane); + ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); + ethtool_link_ksettings_add_link_mode(ks, advertising, + Backplane); + ks->base.port = PORT_NONE; + break; + case I40E_MEDIA_TYPE_BASET: + ethtool_link_ksettings_add_link_mode(ks, supported, TP); + ethtool_link_ksettings_add_link_mode(ks, advertising, TP); + ks->base.port = PORT_TP; + break; + case I40E_MEDIA_TYPE_DA: + case I40E_MEDIA_TYPE_CX4: + ethtool_link_ksettings_add_link_mode(ks, supported, FIBRE); + ethtool_link_ksettings_add_link_mode(ks, advertising, FIBRE); + ks->base.port = PORT_DA; + break; + case I40E_MEDIA_TYPE_FIBER: + ethtool_link_ksettings_add_link_mode(ks, supported, FIBRE); + ks->base.port = PORT_FIBRE; + break; + case I40E_MEDIA_TYPE_UNKNOWN: + default: + ks->base.port = PORT_OTHER; + break; + } + + /* Set flow control settings */ + ethtool_link_ksettings_add_link_mode(ks, supported, Pause); + + switch (hw->fc.requested_mode) { + case I40E_FC_FULL: + ethtool_link_ksettings_add_link_mode(ks, advertising, Pause); + break; + case I40E_FC_TX_PAUSE: + ethtool_link_ksettings_add_link_mode(ks, advertising, + Asym_Pause); + break; + case I40E_FC_RX_PAUSE: + ethtool_link_ksettings_add_link_mode(ks, advertising, Pause); + ethtool_link_ksettings_add_link_mode(ks, advertising, + Asym_Pause); + break; + default: + ethtool_link_ksettings_del_link_mode(ks, advertising, Pause); + ethtool_link_ksettings_del_link_mode(ks, advertising, + Asym_Pause); + break; + } + + return 0; +} + +/** + * i40e_set_link_ksettings - Set Speed and Duplex + * @netdev: network interface device structure + * @ks: ethtool ksettings + * + * Set speed/duplex per media_types advertised/forced + **/ +static int i40e_set_link_ksettings(struct net_device *netdev, + const struct ethtool_link_ksettings *ks) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_aq_get_phy_abilities_resp abilities; + struct ethtool_link_ksettings safe_ks; + struct ethtool_link_ksettings copy_ks; + struct i40e_aq_set_phy_config config; + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi = np->vsi; + struct i40e_hw *hw = &pf->hw; + bool autoneg_changed = false; + i40e_status status = 0; + int timeout = 50; + int err = 0; + u8 autoneg; + + /* Changing port settings is not supported if this isn't the + * port's controlling PF + */ + if (hw->partition_id != 1) { + i40e_partition_setting_complaint(pf); + return -EOPNOTSUPP; + } + if (vsi != pf->vsi[pf->lan_vsi]) + return -EOPNOTSUPP; + if (hw->phy.media_type != I40E_MEDIA_TYPE_BASET && + hw->phy.media_type != I40E_MEDIA_TYPE_FIBER && + hw->phy.media_type != I40E_MEDIA_TYPE_BACKPLANE && + hw->phy.media_type != I40E_MEDIA_TYPE_DA && + 
hw->phy.link_info.link_info & I40E_AQ_LINK_UP) + return -EOPNOTSUPP; + if (hw->device_id == I40E_DEV_ID_KX_B || + hw->device_id == I40E_DEV_ID_KX_C || + hw->device_id == I40E_DEV_ID_20G_KR2 || + hw->device_id == I40E_DEV_ID_20G_KR2_A || + hw->device_id == I40E_DEV_ID_25G_B || + hw->device_id == I40E_DEV_ID_KX_X722) { + netdev_info(netdev, "Changing settings is not supported on backplane.\n"); + return -EOPNOTSUPP; + } + + /* copy the ksettings to copy_ks to avoid modifying the origin */ + memcpy(&copy_ks, ks, sizeof(struct ethtool_link_ksettings)); + + /* save autoneg out of ksettings */ + autoneg = copy_ks.base.autoneg; + + /* get our own copy of the bits to check against */ + memset(&safe_ks, 0, sizeof(struct ethtool_link_ksettings)); + safe_ks.base.cmd = copy_ks.base.cmd; + safe_ks.base.link_mode_masks_nwords = + copy_ks.base.link_mode_masks_nwords; + i40e_get_link_ksettings(netdev, &safe_ks); + + /* Get link modes supported by hardware and check against modes + * requested by the user. Return an error if unsupported mode was set. + */ + if (!bitmap_subset(copy_ks.link_modes.advertising, + safe_ks.link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS)) + return -EINVAL; + + /* set autoneg back to what it currently is */ + copy_ks.base.autoneg = safe_ks.base.autoneg; + + /* If copy_ks.base and safe_ks.base are not the same now, then they are + * trying to set something that we do not support. + */ + if (memcmp(&copy_ks.base, &safe_ks.base, + sizeof(struct ethtool_link_settings))) + return -EOPNOTSUPP; + + while (test_and_set_bit(__I40E_CONFIG_BUSY, pf->state)) { + timeout--; + if (!timeout) + return -EBUSY; + usleep_range(1000, 2000); + } + + /* Get the current phy config */ + status = i40e_aq_get_phy_capabilities(hw, false, false, &abilities, + NULL); + if (status) { + err = -EAGAIN; + goto done; + } + + /* Copy abilities to config in case autoneg is not + * set below + */ + memset(&config, 0, sizeof(struct i40e_aq_set_phy_config)); + config.abilities = abilities.abilities; + + /* Check autoneg */ + if (autoneg == AUTONEG_ENABLE) { + /* If autoneg was not already enabled */ + if (!(hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED)) { + /* If autoneg is not supported, return error */ + if (!ethtool_link_ksettings_test_link_mode(&safe_ks, + supported, + Autoneg)) { + netdev_info(netdev, "Autoneg not supported on this phy\n"); + err = -EINVAL; + goto done; + } + /* Autoneg is allowed to change */ + config.abilities = abilities.abilities | + I40E_AQ_PHY_ENABLE_AN; + autoneg_changed = true; + } + } else { + /* If autoneg is currently enabled */ + if (hw->phy.link_info.an_info & I40E_AQ_AN_COMPLETED) { + /* If autoneg is supported 10GBASE_T is the only PHY + * that can disable it, so otherwise return error + */ + if (ethtool_link_ksettings_test_link_mode(&safe_ks, + supported, + Autoneg) && + hw->phy.link_info.phy_type != + I40E_PHY_TYPE_10GBASE_T) { + netdev_info(netdev, "Autoneg cannot be disabled on this phy\n"); + err = -EINVAL; + goto done; + } + /* Autoneg is allowed to change */ + config.abilities = abilities.abilities & + ~I40E_AQ_PHY_ENABLE_AN; + autoneg_changed = true; + } + } + + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 100baseT_Full)) + config.link_speed |= I40E_LINK_SPEED_100MB; + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseT_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseX_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 1000baseKX_Full)) + config.link_speed |= I40E_LINK_SPEED_1GB; + if 
(ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseT_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseKX4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseKR_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseCR_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 10000baseSR_Full)) + config.link_speed |= I40E_LINK_SPEED_10GB; + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 20000baseKR2_Full)) + config.link_speed |= I40E_LINK_SPEED_20GB; + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 25000baseCR_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 25000baseKR_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 25000baseSR_Full)) + config.link_speed |= I40E_LINK_SPEED_25GB; + if (ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseKR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseCR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseSR4_Full) || + ethtool_link_ksettings_test_link_mode(ks, advertising, + 40000baseLR4_Full)) + config.link_speed |= I40E_LINK_SPEED_40GB; + + /* If speed didn't get set, set it to what it currently is. + * This is needed because if advertise is 0 (as it is when autoneg + * is disabled) then speed won't get set. + */ + if (!config.link_speed) + config.link_speed = abilities.link_speed; + if (autoneg_changed || abilities.link_speed != config.link_speed) { + /* copy over the rest of the abilities */ + config.phy_type = abilities.phy_type; + config.phy_type_ext = abilities.phy_type_ext; + config.eee_capability = abilities.eee_capability; + config.eeer = abilities.eeer_val; + config.low_power_ctrl = abilities.d3_lpan; + config.fec_config = abilities.fec_cfg_curr_mod_ext_info & + I40E_AQ_PHY_FEC_CONFIG_MASK; + + /* save the requested speeds */ + hw->phy.link_info.requested_speeds = config.link_speed; + /* set link and auto negotiation so changes take effect */ + config.abilities |= I40E_AQ_PHY_ENABLE_ATOMIC_LINK; + /* If link is up put link down */ + if (hw->phy.link_info.link_info & I40E_AQ_LINK_UP) { + /* Tell the OS link is going down, the link will go + * back up when fw says it is ready asynchronously + */ + i40e_print_link_message(vsi, false); + netif_carrier_off(netdev); + netif_tx_stop_all_queues(netdev); + } + + /* make the aq call */ + status = i40e_aq_set_phy_config(hw, &config, NULL); + if (status) { + netdev_info(netdev, + "Set phy config failed, err %s aq_err %s\n", + i40e_stat_str(hw, status), + i40e_aq_str(hw, hw->aq.asq_last_status)); + err = -EAGAIN; + goto done; + } + + status = i40e_update_link_info(hw); + if (status) + netdev_dbg(netdev, + "Updating link info failed with err %s aq_err %s\n", + i40e_stat_str(hw, status), + i40e_aq_str(hw, hw->aq.asq_last_status)); + + } else { + netdev_info(netdev, "Nothing changed, exiting without setting anything.\n"); + } + +done: + clear_bit(__I40E_CONFIG_BUSY, pf->state); + + return err; +} + +static int i40e_nway_reset(struct net_device *netdev) +{ + /* restart autonegotiation */ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_hw *hw = &pf->hw; + bool link_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP; + i40e_status ret = 0; + + ret = i40e_aq_set_link_restart_an(hw, link_up, NULL); + if (ret) { + netdev_info(netdev, "link restart failed, err %s aq_err %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, 
hw->aq.asq_last_status)); + return -EIO; + } + + return 0; +} + +/** + * i40e_get_pauseparam - Get Flow Control status + * Return tx/rx-pause status + **/ +static void i40e_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_hw *hw = &pf->hw; + struct i40e_link_status *hw_link_info = &hw->phy.link_info; + struct i40e_dcbx_config *dcbx_cfg = &hw->local_dcbx_config; + + pause->autoneg = + ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ? + AUTONEG_ENABLE : AUTONEG_DISABLE); + + /* PFC enabled so report LFC as off */ + if (dcbx_cfg->pfc.pfcenable) { + pause->rx_pause = 0; + pause->tx_pause = 0; + return; + } + + if (hw->fc.current_mode == I40E_FC_RX_PAUSE) { + pause->rx_pause = 1; + } else if (hw->fc.current_mode == I40E_FC_TX_PAUSE) { + pause->tx_pause = 1; + } else if (hw->fc.current_mode == I40E_FC_FULL) { + pause->rx_pause = 1; + pause->tx_pause = 1; + } +} + +/** + * i40e_set_pauseparam - Set Flow Control parameter + * @netdev: network interface device structure + * @pause: return tx/rx flow control status + **/ +static int i40e_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi = np->vsi; + struct i40e_hw *hw = &pf->hw; + struct i40e_link_status *hw_link_info = &hw->phy.link_info; + struct i40e_dcbx_config *dcbx_cfg = &hw->local_dcbx_config; + bool link_up = hw_link_info->link_info & I40E_AQ_LINK_UP; + i40e_status status; + u8 aq_failures; + int err = 0; + + /* Changing the port's flow control is not supported if this isn't the + * port's controlling PF + */ + if (hw->partition_id != 1) { + i40e_partition_setting_complaint(pf); + return -EOPNOTSUPP; + } + + if (vsi != pf->vsi[pf->lan_vsi]) + return -EOPNOTSUPP; + + if (pause->autoneg != ((hw_link_info->an_info & I40E_AQ_AN_COMPLETED) ? + AUTONEG_ENABLE : AUTONEG_DISABLE)) { + netdev_info(netdev, "To change autoneg please use: ethtool -s autoneg \n"); + return -EOPNOTSUPP; + } + + /* If we have link and don't have autoneg */ + if (!test_bit(__I40E_DOWN, pf->state) && + !(hw_link_info->an_info & I40E_AQ_AN_COMPLETED)) { + /* Send message that it might not necessarily work*/ + netdev_info(netdev, "Autoneg did not complete so changing settings may not result in an actual change.\n"); + } + + if (dcbx_cfg->pfc.pfcenable) { + netdev_info(netdev, + "Priority flow control enabled. 
Cannot set link flow control.\n"); + return -EOPNOTSUPP; + } + + if (pause->rx_pause && pause->tx_pause) + hw->fc.requested_mode = I40E_FC_FULL; + else if (pause->rx_pause && !pause->tx_pause) + hw->fc.requested_mode = I40E_FC_RX_PAUSE; + else if (!pause->rx_pause && pause->tx_pause) + hw->fc.requested_mode = I40E_FC_TX_PAUSE; + else if (!pause->rx_pause && !pause->tx_pause) + hw->fc.requested_mode = I40E_FC_NONE; + else + return -EINVAL; + + /* Tell the OS link is going down, the link will go back up when fw + * says it is ready asynchronously + */ + i40e_print_link_message(vsi, false); + netif_carrier_off(netdev); + netif_tx_stop_all_queues(netdev); + + /* Set the fc mode and only restart an if link is up*/ + status = i40e_set_fc(hw, &aq_failures, link_up); + + if (aq_failures & I40E_SET_FC_AQ_FAIL_GET) { + netdev_info(netdev, "Set fc failed on the get_phy_capabilities call with err %s aq_err %s\n", + i40e_stat_str(hw, status), + i40e_aq_str(hw, hw->aq.asq_last_status)); + err = -EAGAIN; + } + if (aq_failures & I40E_SET_FC_AQ_FAIL_SET) { + netdev_info(netdev, "Set fc failed on the set_phy_config call with err %s aq_err %s\n", + i40e_stat_str(hw, status), + i40e_aq_str(hw, hw->aq.asq_last_status)); + err = -EAGAIN; + } + if (aq_failures & I40E_SET_FC_AQ_FAIL_UPDATE) { + netdev_info(netdev, "Set fc failed on the get_link_info call with err %s aq_err %s\n", + i40e_stat_str(hw, status), + i40e_aq_str(hw, hw->aq.asq_last_status)); + err = -EAGAIN; + } + + if (!test_bit(__I40E_DOWN, pf->state)) { + /* Give it a little more time to try to come back */ + msleep(75); + if (!test_bit(__I40E_DOWN, pf->state)) + return i40e_nway_reset(netdev); + } + + return err; +} + +static u32 i40e_get_msglevel(struct net_device *netdev) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + u32 debug_mask = pf->hw.debug_mask; + + if (debug_mask) + netdev_info(netdev, "i40e debug_mask: 0x%08X\n", debug_mask); + + return pf->msg_enable; +} + +static void i40e_set_msglevel(struct net_device *netdev, u32 data) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + + if (I40E_DEBUG_USER & data) + pf->hw.debug_mask = data; + else + pf->msg_enable = data; +} + +static int i40e_get_regs_len(struct net_device *netdev) +{ + int reg_count = 0; + int i; + + for (i = 0; i40e_reg_list[i].offset != 0; i++) + reg_count += i40e_reg_list[i].elements; + + return reg_count * sizeof(u32); +} + +static void i40e_get_regs(struct net_device *netdev, struct ethtool_regs *regs, + void *p) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_hw *hw = &pf->hw; + u32 *reg_buf = p; + unsigned int i, j, ri; + u32 reg; + + /* Tell ethtool which driver-version-specific regs output we have. + * + * At some point, if we have ethtool doing special formatting of + * this data, it will rely on this version number to know how to + * interpret things. Hence, this needs to be updated if/when the + * diags register table is changed. 
+ */ + regs->version = 1; + + /* loop through the diags reg table for what to print */ + ri = 0; + for (i = 0; i40e_reg_list[i].offset != 0; i++) { + for (j = 0; j < i40e_reg_list[i].elements; j++) { + reg = i40e_reg_list[i].offset + + (j * i40e_reg_list[i].stride); + reg_buf[ri++] = rd32(hw, reg); + } + } + +} + +static int i40e_get_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, u8 *bytes) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_hw *hw = &np->vsi->back->hw; + struct i40e_pf *pf = np->vsi->back; + int ret_val = 0, len, offset; + u8 *eeprom_buff; + u16 i, sectors; + bool last; + u32 magic; + +#define I40E_NVM_SECTOR_SIZE 4096 + if (eeprom->len == 0) + return -EINVAL; + + /* check for NVMUpdate access method */ + magic = hw->vendor_id | (hw->device_id << 16); + if (eeprom->magic && eeprom->magic != magic) { + struct i40e_nvm_access *cmd = (struct i40e_nvm_access *)eeprom; + int errno = 0; + + /* make sure it is the right magic for NVMUpdate */ + if ((eeprom->magic >> 16) != hw->device_id) + errno = -EINVAL; + else if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state) || + test_bit(__I40E_RESET_INTR_RECEIVED, pf->state)) + errno = -EBUSY; + else + ret_val = i40e_nvmupd_command(hw, cmd, bytes, &errno); + + if ((errno || ret_val) && (hw->debug_mask & I40E_DEBUG_NVM)) + dev_info(&pf->pdev->dev, + "NVMUpdate read failed err=%d status=0x%x errno=%d module=%d offset=0x%x size=%d\n", + ret_val, hw->aq.asq_last_status, errno, + (u8)(cmd->config & I40E_NVM_MOD_PNT_MASK), + cmd->offset, cmd->data_size); + + return errno; + } + + /* normal ethtool get_eeprom support */ + eeprom->magic = hw->vendor_id | (hw->device_id << 16); + + eeprom_buff = kzalloc(eeprom->len, GFP_KERNEL); + if (!eeprom_buff) + return -ENOMEM; + + ret_val = i40e_acquire_nvm(hw, I40E_RESOURCE_READ); + if (ret_val) { + dev_info(&pf->pdev->dev, + "Failed Acquiring NVM resource for read err=%d status=0x%x\n", + ret_val, hw->aq.asq_last_status); + goto free_buff; + } + + sectors = eeprom->len / I40E_NVM_SECTOR_SIZE; + sectors += (eeprom->len % I40E_NVM_SECTOR_SIZE) ? 
1 : 0; + len = I40E_NVM_SECTOR_SIZE; + last = false; + for (i = 0; i < sectors; i++) { + if (i == (sectors - 1)) { + len = eeprom->len - (I40E_NVM_SECTOR_SIZE * i); + last = true; + } + offset = eeprom->offset + (I40E_NVM_SECTOR_SIZE * i), + ret_val = i40e_aq_read_nvm(hw, 0x0, offset, len, + (u8 *)eeprom_buff + (I40E_NVM_SECTOR_SIZE * i), + last, NULL); + if (ret_val && hw->aq.asq_last_status == I40E_AQ_RC_EPERM) { + dev_info(&pf->pdev->dev, + "read NVM failed, invalid offset 0x%x\n", + offset); + break; + } else if (ret_val && + hw->aq.asq_last_status == I40E_AQ_RC_EACCES) { + dev_info(&pf->pdev->dev, + "read NVM failed, access, offset 0x%x\n", + offset); + break; + } else if (ret_val) { + dev_info(&pf->pdev->dev, + "read NVM failed offset %d err=%d status=0x%x\n", + offset, ret_val, hw->aq.asq_last_status); + break; + } + } + + i40e_release_nvm(hw); + memcpy(bytes, (u8 *)eeprom_buff, eeprom->len); +free_buff: + kfree(eeprom_buff); + return ret_val; +} + +static int i40e_get_eeprom_len(struct net_device *netdev) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_hw *hw = &np->vsi->back->hw; + u32 val; + +#define X722_EEPROM_SCOPE_LIMIT 0x5B9FFF + if (hw->mac.type == I40E_MAC_X722) { + val = X722_EEPROM_SCOPE_LIMIT + 1; + return val; + } + val = (rd32(hw, I40E_GLPCI_LBARCTRL) + & I40E_GLPCI_LBARCTRL_FL_SIZE_MASK) + >> I40E_GLPCI_LBARCTRL_FL_SIZE_SHIFT; + /* register returns value in power of 2, 64Kbyte chunks. */ + val = (64 * 1024) * BIT(val); + return val; +} + +static int i40e_set_eeprom(struct net_device *netdev, + struct ethtool_eeprom *eeprom, u8 *bytes) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_hw *hw = &np->vsi->back->hw; + struct i40e_pf *pf = np->vsi->back; + struct i40e_nvm_access *cmd = (struct i40e_nvm_access *)eeprom; + int ret_val = 0; + int errno = 0; + u32 magic; + + /* normal ethtool set_eeprom is not supported */ + magic = hw->vendor_id | (hw->device_id << 16); + if (eeprom->magic == magic) + errno = -EOPNOTSUPP; + /* check for NVMUpdate access method */ + else if (!eeprom->magic || (eeprom->magic >> 16) != hw->device_id) + errno = -EINVAL; + else if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state) || + test_bit(__I40E_RESET_INTR_RECEIVED, pf->state)) + errno = -EBUSY; + else + ret_val = i40e_nvmupd_command(hw, cmd, bytes, &errno); + + if ((errno || ret_val) && (hw->debug_mask & I40E_DEBUG_NVM)) + dev_info(&pf->pdev->dev, + "NVMUpdate write failed err=%d status=0x%x errno=%d module=%d offset=0x%x size=%d\n", + ret_val, hw->aq.asq_last_status, errno, + (u8)(cmd->config & I40E_NVM_MOD_PNT_MASK), + cmd->offset, cmd->data_size); + + return errno; +} + +static void i40e_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + + strlcpy(drvinfo->driver, i40e_driver_name, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, i40e_driver_version_str, + sizeof(drvinfo->version)); + strlcpy(drvinfo->fw_version, i40e_nvm_version_str(&pf->hw), + sizeof(drvinfo->fw_version)); + strlcpy(drvinfo->bus_info, pci_name(pf->pdev), + sizeof(drvinfo->bus_info)); + drvinfo->n_priv_flags = I40E_PRIV_FLAGS_STR_LEN; + if (pf->hw.pf_id == 0) + drvinfo->n_priv_flags += I40E_GL_PRIV_FLAGS_STR_LEN; +} + +static void i40e_get_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi = 
pf->vsi[pf->lan_vsi]; + + ring->rx_max_pending = I40E_MAX_NUM_DESCRIPTORS; + ring->tx_max_pending = I40E_MAX_NUM_DESCRIPTORS; + ring->rx_mini_max_pending = 0; + ring->rx_jumbo_max_pending = 0; + ring->rx_pending = vsi->rx_rings[0]->count; + ring->tx_pending = vsi->tx_rings[0]->count; + ring->rx_mini_pending = 0; + ring->rx_jumbo_pending = 0; +} + +static bool i40e_active_tx_ring_index(struct i40e_vsi *vsi, u16 index) +{ + if (i40e_enabled_xdp_vsi(vsi)) { + return index < vsi->num_queue_pairs || + (index >= vsi->alloc_queue_pairs && + index < vsi->alloc_queue_pairs + vsi->num_queue_pairs); + } + + return index < vsi->num_queue_pairs; +} + +static int i40e_set_ringparam(struct net_device *netdev, + struct ethtool_ringparam *ring) +{ + struct i40e_ring *tx_rings = NULL, *rx_rings = NULL; + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_hw *hw = &np->vsi->back->hw; + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + u32 new_rx_count, new_tx_count; + u16 tx_alloc_queue_pairs; + int timeout = 50; + int i, err = 0; + + if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending)) + return -EINVAL; + + if (ring->tx_pending > I40E_MAX_NUM_DESCRIPTORS || + ring->tx_pending < I40E_MIN_NUM_DESCRIPTORS || + ring->rx_pending > I40E_MAX_NUM_DESCRIPTORS || + ring->rx_pending < I40E_MIN_NUM_DESCRIPTORS) { + netdev_info(netdev, + "Descriptors requested (Tx: %d / Rx: %d) out of range [%d-%d]\n", + ring->tx_pending, ring->rx_pending, + I40E_MIN_NUM_DESCRIPTORS, I40E_MAX_NUM_DESCRIPTORS); + return -EINVAL; + } + + new_tx_count = ALIGN(ring->tx_pending, I40E_REQ_DESCRIPTOR_MULTIPLE); + new_rx_count = ALIGN(ring->rx_pending, I40E_REQ_DESCRIPTOR_MULTIPLE); + + /* if nothing to do return success */ + if ((new_tx_count == vsi->tx_rings[0]->count) && + (new_rx_count == vsi->rx_rings[0]->count)) + return 0; + + while (test_and_set_bit(__I40E_CONFIG_BUSY, pf->state)) { + timeout--; + if (!timeout) + return -EBUSY; + usleep_range(1000, 2000); + } + + if (!netif_running(vsi->netdev)) { + /* simple case - set for the next time the netdev is started */ + for (i = 0; i < vsi->num_queue_pairs; i++) { + vsi->tx_rings[i]->count = new_tx_count; + vsi->rx_rings[i]->count = new_rx_count; + if (i40e_enabled_xdp_vsi(vsi)) + vsi->xdp_rings[i]->count = new_tx_count; + } + goto done; + } + + /* We can't just free everything and then setup again, + * because the ISRs in MSI-X mode get passed pointers + * to the Tx and Rx ring structs. + */ + + /* alloc updated Tx and XDP Tx resources */ + tx_alloc_queue_pairs = vsi->alloc_queue_pairs * + (i40e_enabled_xdp_vsi(vsi) ? 
2 : 1); + if (new_tx_count != vsi->tx_rings[0]->count) { + netdev_info(netdev, + "Changing Tx descriptor count from %d to %d.\n", + vsi->tx_rings[0]->count, new_tx_count); + tx_rings = kcalloc(tx_alloc_queue_pairs, + sizeof(struct i40e_ring), GFP_KERNEL); + if (!tx_rings) { + err = -ENOMEM; + goto done; + } + + for (i = 0; i < tx_alloc_queue_pairs; i++) { + if (!i40e_active_tx_ring_index(vsi, i)) + continue; + + tx_rings[i] = *vsi->tx_rings[i]; + tx_rings[i].count = new_tx_count; + /* the desc and bi pointers will be reallocated in the + * setup call + */ + tx_rings[i].desc = NULL; + tx_rings[i].rx_bi = NULL; + err = i40e_setup_tx_descriptors(&tx_rings[i]); + if (err) { + while (i) { + i--; + if (!i40e_active_tx_ring_index(vsi, i)) + continue; + i40e_free_tx_resources(&tx_rings[i]); + } + kfree(tx_rings); + tx_rings = NULL; + + goto done; + } + } + } + + /* alloc updated Rx resources */ + if (new_rx_count != vsi->rx_rings[0]->count) { + netdev_info(netdev, + "Changing Rx descriptor count from %d to %d\n", + vsi->rx_rings[0]->count, new_rx_count); + rx_rings = kcalloc(vsi->alloc_queue_pairs, + sizeof(struct i40e_ring), GFP_KERNEL); + if (!rx_rings) { + err = -ENOMEM; + goto free_tx; + } + + for (i = 0; i < vsi->num_queue_pairs; i++) { + struct i40e_ring *ring; + u16 unused; + + /* clone ring and setup updated count */ + rx_rings[i] = *vsi->rx_rings[i]; + rx_rings[i].count = new_rx_count; + /* the desc and bi pointers will be reallocated in the + * setup call + */ + rx_rings[i].desc = NULL; + rx_rings[i].rx_bi = NULL; + /* Clear cloned XDP RX-queue info before setup call */ + memset(&rx_rings[i].xdp_rxq, 0, sizeof(rx_rings[i].xdp_rxq)); + /* this is to allow wr32 to have something to write to + * during early allocation of Rx buffers + */ + rx_rings[i].tail = hw->hw_addr + I40E_PRTGEN_STATUS; + err = i40e_setup_rx_descriptors(&rx_rings[i]); + if (err) + goto rx_unwind; + + /* now allocate the Rx buffers to make sure the OS + * has enough memory, any failure here means abort + */ + ring = &rx_rings[i]; + unused = I40E_DESC_UNUSED(ring); + err = i40e_alloc_rx_buffers(ring, unused); +rx_unwind: + if (err) { + do { + i40e_free_rx_resources(&rx_rings[i]); + } while (i--); + kfree(rx_rings); + rx_rings = NULL; + + goto free_tx; + } + } + } + + /* Bring interface down, copy in the new ring info, + * then restore the interface + */ + i40e_down(vsi); + + if (tx_rings) { + for (i = 0; i < tx_alloc_queue_pairs; i++) { + if (i40e_active_tx_ring_index(vsi, i)) { + i40e_free_tx_resources(vsi->tx_rings[i]); + *vsi->tx_rings[i] = tx_rings[i]; + } + } + kfree(tx_rings); + tx_rings = NULL; + } + + if (rx_rings) { + for (i = 0; i < vsi->num_queue_pairs; i++) { + i40e_free_rx_resources(vsi->rx_rings[i]); + /* get the real tail offset */ + rx_rings[i].tail = vsi->rx_rings[i]->tail; + /* this is to fake out the allocation routine + * into thinking it has to realloc everything + * but the recycling logic will let us re-use + * the buffers allocated above + */ + rx_rings[i].next_to_use = 0; + rx_rings[i].next_to_clean = 0; + rx_rings[i].next_to_alloc = 0; + /* do a struct copy */ + *vsi->rx_rings[i] = rx_rings[i]; + } + kfree(rx_rings); + rx_rings = NULL; + } + + i40e_up(vsi); + +free_tx: + /* error cleanup if the Rx allocations failed after getting Tx */ + if (tx_rings) { + for (i = 0; i < tx_alloc_queue_pairs; i++) { + if (i40e_active_tx_ring_index(vsi, i)) + i40e_free_tx_resources(vsi->tx_rings[i]); + } + kfree(tx_rings); + tx_rings = NULL; + } + +done: + clear_bit(__I40E_CONFIG_BUSY, pf->state); + + return err; 
+} + +static int i40e_get_sset_count(struct net_device *netdev, int sset) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + + switch (sset) { + case ETH_SS_TEST: + return I40E_TEST_LEN; + case ETH_SS_STATS: + if (vsi == pf->vsi[pf->lan_vsi] && pf->hw.partition_id == 1) { + int len = I40E_PF_STATS_LEN(netdev); + + if ((pf->lan_veb != I40E_NO_VEB) && + (pf->flags & I40E_FLAG_VEB_STATS_ENABLED)) + len += I40E_VEB_STATS_TOTAL; + return len; + } else { + return I40E_VSI_STATS_LEN(netdev); + } + case ETH_SS_PRIV_FLAGS: + return I40E_PRIV_FLAGS_STR_LEN + + (pf->hw.pf_id == 0 ? I40E_GL_PRIV_FLAGS_STR_LEN : 0); + default: + return -EOPNOTSUPP; + } +} + +static void i40e_get_ethtool_stats(struct net_device *netdev, + struct ethtool_stats *stats, u64 *data) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_ring *tx_ring, *rx_ring; + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + unsigned int j; + int i = 0; + char *p; + struct rtnl_link_stats64 *net_stats = i40e_get_vsi_stats_struct(vsi); + unsigned int start; + + i40e_update_stats(vsi); + + for (j = 0; j < I40E_NETDEV_STATS_LEN; j++) { + p = (char *)net_stats + i40e_gstrings_net_stats[j].stat_offset; + data[i++] = (i40e_gstrings_net_stats[j].sizeof_stat == + sizeof(u64)) ? *(u64 *)p : *(u32 *)p; + } + for (j = 0; j < I40E_MISC_STATS_LEN; j++) { + p = (char *)vsi + i40e_gstrings_misc_stats[j].stat_offset; + data[i++] = (i40e_gstrings_misc_stats[j].sizeof_stat == + sizeof(u64)) ? *(u64 *)p : *(u32 *)p; + } + rcu_read_lock(); + for (j = 0; j < vsi->num_queue_pairs; j++) { + tx_ring = READ_ONCE(vsi->tx_rings[j]); + + if (!tx_ring) + continue; + + /* process Tx ring statistics */ + do { + start = u64_stats_fetch_begin_irq(&tx_ring->syncp); + data[i] = tx_ring->stats.packets; + data[i + 1] = tx_ring->stats.bytes; + } while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start)); + i += 2; + + /* Rx ring is the 2nd half of the queue pair */ + rx_ring = &tx_ring[1]; + do { + start = u64_stats_fetch_begin_irq(&rx_ring->syncp); + data[i] = rx_ring->stats.packets; + data[i + 1] = rx_ring->stats.bytes; + } while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start)); + i += 2; + } + rcu_read_unlock(); + if (vsi != pf->vsi[pf->lan_vsi] || pf->hw.partition_id != 1) + return; + + if ((pf->lan_veb != I40E_NO_VEB) && + (pf->flags & I40E_FLAG_VEB_STATS_ENABLED)) { + struct i40e_veb *veb = pf->veb[pf->lan_veb]; + + for (j = 0; j < I40E_VEB_STATS_LEN; j++) { + p = (char *)veb; + p += i40e_gstrings_veb_stats[j].stat_offset; + data[i++] = (i40e_gstrings_veb_stats[j].sizeof_stat == + sizeof(u64)) ? *(u64 *)p : *(u32 *)p; + } + for (j = 0; j < I40E_MAX_TRAFFIC_CLASS; j++) { + data[i++] = veb->tc_stats.tc_tx_packets[j]; + data[i++] = veb->tc_stats.tc_tx_bytes[j]; + data[i++] = veb->tc_stats.tc_rx_packets[j]; + data[i++] = veb->tc_stats.tc_rx_bytes[j]; + } + } + for (j = 0; j < I40E_GLOBAL_STATS_LEN; j++) { + p = (char *)pf + i40e_gstrings_stats[j].stat_offset; + data[i++] = (i40e_gstrings_stats[j].sizeof_stat == + sizeof(u64)) ? 
*(u64 *)p : *(u32 *)p; + } + for (j = 0; j < I40E_MAX_USER_PRIORITY; j++) { + data[i++] = pf->stats.priority_xon_tx[j]; + data[i++] = pf->stats.priority_xoff_tx[j]; + } + for (j = 0; j < I40E_MAX_USER_PRIORITY; j++) { + data[i++] = pf->stats.priority_xon_rx[j]; + data[i++] = pf->stats.priority_xoff_rx[j]; + } + for (j = 0; j < I40E_MAX_USER_PRIORITY; j++) + data[i++] = pf->stats.priority_xon_2_xoff[j]; +} + +static void i40e_get_strings(struct net_device *netdev, u32 stringset, + u8 *data) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + char *p = (char *)data; + unsigned int i; + + switch (stringset) { + case ETH_SS_TEST: + memcpy(data, i40e_gstrings_test, + I40E_TEST_LEN * ETH_GSTRING_LEN); + break; + case ETH_SS_STATS: + for (i = 0; i < I40E_NETDEV_STATS_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + i40e_gstrings_net_stats[i].stat_string); + p += ETH_GSTRING_LEN; + } + for (i = 0; i < I40E_MISC_STATS_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + i40e_gstrings_misc_stats[i].stat_string); + p += ETH_GSTRING_LEN; + } + for (i = 0; i < vsi->num_queue_pairs; i++) { + snprintf(p, ETH_GSTRING_LEN, "tx-%d.tx_packets", i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "tx-%d.tx_bytes", i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "rx-%d.rx_packets", i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, "rx-%d.rx_bytes", i); + p += ETH_GSTRING_LEN; + } + if (vsi != pf->vsi[pf->lan_vsi] || pf->hw.partition_id != 1) + return; + + if ((pf->lan_veb != I40E_NO_VEB) && + (pf->flags & I40E_FLAG_VEB_STATS_ENABLED)) { + for (i = 0; i < I40E_VEB_STATS_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "veb.%s", + i40e_gstrings_veb_stats[i].stat_string); + p += ETH_GSTRING_LEN; + } + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + snprintf(p, ETH_GSTRING_LEN, + "veb.tc_%d_tx_packets", i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, + "veb.tc_%d_tx_bytes", i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, + "veb.tc_%d_rx_packets", i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, + "veb.tc_%d_rx_bytes", i); + p += ETH_GSTRING_LEN; + } + } + for (i = 0; i < I40E_GLOBAL_STATS_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "port.%s", + i40e_gstrings_stats[i].stat_string); + p += ETH_GSTRING_LEN; + } + for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) { + snprintf(p, ETH_GSTRING_LEN, + "port.tx_priority_%d_xon", i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, + "port.tx_priority_%d_xoff", i); + p += ETH_GSTRING_LEN; + } + for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) { + snprintf(p, ETH_GSTRING_LEN, + "port.rx_priority_%d_xon", i); + p += ETH_GSTRING_LEN; + snprintf(p, ETH_GSTRING_LEN, + "port.rx_priority_%d_xoff", i); + p += ETH_GSTRING_LEN; + } + for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) { + snprintf(p, ETH_GSTRING_LEN, + "port.rx_priority_%d_xon_2_xoff", i); + p += ETH_GSTRING_LEN; + } + /* BUG_ON(p - data != I40E_STATS_LEN * ETH_GSTRING_LEN); */ + break; + case ETH_SS_PRIV_FLAGS: + for (i = 0; i < I40E_PRIV_FLAGS_STR_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + i40e_gstrings_priv_flags[i].flag_string); + p += ETH_GSTRING_LEN; + } + if (pf->hw.pf_id != 0) + break; + for (i = 0; i < I40E_GL_PRIV_FLAGS_STR_LEN; i++) { + snprintf(p, ETH_GSTRING_LEN, "%s", + i40e_gl_gstrings_priv_flags[i].flag_string); + p += ETH_GSTRING_LEN; + } + break; + default: + break; + } +} + +static int i40e_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *info) +{ + struct 
i40e_pf *pf = i40e_netdev_to_pf(dev); + + /* only report HW timestamping if PTP is enabled */ + if (!(pf->flags & I40E_FLAG_PTP)) + return ethtool_op_get_ts_info(dev, info); + + info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + if (pf->ptp_clock) + info->phc_index = ptp_clock_index(pf->ptp_clock); + else + info->phc_index = -1; + + info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON); + + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | + BIT(HWTSTAMP_FILTER_PTP_V2_L2_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_L2_SYNC) | + BIT(HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ); + + if (pf->hw_features & I40E_HW_PTP_L4_CAPABLE) + info->rx_filters |= BIT(HWTSTAMP_FILTER_PTP_V1_L4_SYNC) | + BIT(HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) | + BIT(HWTSTAMP_FILTER_PTP_V2_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_SYNC) | + BIT(HWTSTAMP_FILTER_PTP_V2_L4_SYNC) | + BIT(HWTSTAMP_FILTER_PTP_V2_DELAY_REQ) | + BIT(HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ); + + return 0; +} + +static int i40e_link_test(struct net_device *netdev, u64 *data) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + i40e_status status; + bool link_up = false; + + netif_info(pf, hw, netdev, "link test\n"); + status = i40e_get_link_status(&pf->hw, &link_up); + if (status) { + netif_err(pf, drv, netdev, "link query timed out, please retry test\n"); + *data = 1; + return *data; + } + + if (link_up) + *data = 0; + else + *data = 1; + + return *data; +} + +static int i40e_reg_test(struct net_device *netdev, u64 *data) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + + netif_info(pf, hw, netdev, "register test\n"); + *data = i40e_diag_reg_test(&pf->hw); + + return *data; +} + +static int i40e_eeprom_test(struct net_device *netdev, u64 *data) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + + netif_info(pf, hw, netdev, "eeprom test\n"); + *data = i40e_diag_eeprom_test(&pf->hw); + + /* forcebly clear the NVM Update state machine */ + pf->hw.nvmupd_state = I40E_NVMUPD_STATE_INIT; + + return *data; +} + +static int i40e_intr_test(struct net_device *netdev, u64 *data) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + u16 swc_old = pf->sw_int_count; + + netif_info(pf, hw, netdev, "interrupt test\n"); + wr32(&pf->hw, I40E_PFINT_DYN_CTL0, + (I40E_PFINT_DYN_CTL0_INTENA_MASK | + I40E_PFINT_DYN_CTL0_SWINT_TRIG_MASK | + I40E_PFINT_DYN_CTL0_ITR_INDX_MASK | + I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK | + I40E_PFINT_DYN_CTL0_SW_ITR_INDX_MASK)); + usleep_range(1000, 2000); + *data = (swc_old == pf->sw_int_count); + + return *data; +} + +static inline bool i40e_active_vfs(struct i40e_pf *pf) +{ + struct i40e_vf *vfs = pf->vf; + int i; + + for (i = 0; i < pf->num_alloc_vfs; i++) + if (test_bit(I40E_VF_STATE_ACTIVE, &vfs[i].vf_states)) + return true; + return false; +} + +static inline bool i40e_active_vmdqs(struct i40e_pf *pf) +{ + return !!i40e_find_vsi_by_type(pf, I40E_VSI_VMDQ2); +} + +static void i40e_diag_test(struct net_device *netdev, + struct ethtool_test *eth_test, u64 *data) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + bool if_running = netif_running(netdev); + struct i40e_pf *pf = np->vsi->back; + + if (eth_test->flags == ETH_TEST_FL_OFFLINE) { + /* Offline tests */ + 
netif_info(pf, drv, netdev, "offline testing starting\n"); + + set_bit(__I40E_TESTING, pf->state); + + if (i40e_active_vfs(pf) || i40e_active_vmdqs(pf)) { + dev_warn(&pf->pdev->dev, + "Please take active VFs and Netqueues offline and restart the adapter before running NIC diagnostics\n"); + data[I40E_ETH_TEST_REG] = 1; + data[I40E_ETH_TEST_EEPROM] = 1; + data[I40E_ETH_TEST_INTR] = 1; + data[I40E_ETH_TEST_LINK] = 1; + eth_test->flags |= ETH_TEST_FL_FAILED; + clear_bit(__I40E_TESTING, pf->state); + goto skip_ol_tests; + } + + /* If the device is online then take it offline */ + if (if_running) + /* indicate we're in test mode */ + i40e_close(netdev); + else + /* This reset does not affect link - if it is + * changed to a type of reset that does affect + * link then the following link test would have + * to be moved to before the reset + */ + i40e_do_reset(pf, BIT(__I40E_PF_RESET_REQUESTED), true); + + if (i40e_link_test(netdev, &data[I40E_ETH_TEST_LINK])) + eth_test->flags |= ETH_TEST_FL_FAILED; + + if (i40e_eeprom_test(netdev, &data[I40E_ETH_TEST_EEPROM])) + eth_test->flags |= ETH_TEST_FL_FAILED; + + if (i40e_intr_test(netdev, &data[I40E_ETH_TEST_INTR])) + eth_test->flags |= ETH_TEST_FL_FAILED; + + /* run reg test last, a reset is required after it */ + if (i40e_reg_test(netdev, &data[I40E_ETH_TEST_REG])) + eth_test->flags |= ETH_TEST_FL_FAILED; + + clear_bit(__I40E_TESTING, pf->state); + i40e_do_reset(pf, BIT(__I40E_PF_RESET_REQUESTED), true); + + if (if_running) + i40e_open(netdev); + } else { + /* Online tests */ + netif_info(pf, drv, netdev, "online testing starting\n"); + + if (i40e_link_test(netdev, &data[I40E_ETH_TEST_LINK])) + eth_test->flags |= ETH_TEST_FL_FAILED; + + /* Offline only tests, not run in online; pass by default */ + data[I40E_ETH_TEST_REG] = 0; + data[I40E_ETH_TEST_EEPROM] = 0; + data[I40E_ETH_TEST_INTR] = 0; + } + +skip_ol_tests: + + netif_info(pf, drv, netdev, "testing finished\n"); +} + +static void i40e_get_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_hw *hw = &pf->hw; + u16 wol_nvm_bits; + + /* NVM bit on means WoL disabled for the port */ + i40e_read_nvm_word(hw, I40E_SR_NVM_WAKE_ON_LAN, &wol_nvm_bits); + if ((BIT(hw->port) & wol_nvm_bits) || (hw->partition_id != 1)) { + wol->supported = 0; + wol->wolopts = 0; + } else { + wol->supported = WAKE_MAGIC; + wol->wolopts = (pf->wol_en ? WAKE_MAGIC : 0); + } +} + +/** + * i40e_set_wol - set the WakeOnLAN configuration + * @netdev: the netdev in question + * @wol: the ethtool WoL setting data + **/ +static int i40e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi = np->vsi; + struct i40e_hw *hw = &pf->hw; + u16 wol_nvm_bits; + + /* WoL not supported if this isn't the controlling PF on the port */ + if (hw->partition_id != 1) { + i40e_partition_setting_complaint(pf); + return -EOPNOTSUPP; + } + + if (vsi != pf->vsi[pf->lan_vsi]) + return -EOPNOTSUPP; + + /* NVM bit on means WoL disabled for the port */ + i40e_read_nvm_word(hw, I40E_SR_NVM_WAKE_ON_LAN, &wol_nvm_bits); + if (BIT(hw->port) & wol_nvm_bits) + return -EOPNOTSUPP; + + /* only magic packet is supported */ + if (wol->wolopts && (wol->wolopts != WAKE_MAGIC)) + return -EOPNOTSUPP; + + /* is this a new value? 
*/ + if (pf->wol_en != !!wol->wolopts) { + pf->wol_en = !!wol->wolopts; + device_set_wakeup_enable(&pf->pdev->dev, pf->wol_en); + } + + return 0; +} + +static int i40e_set_phys_id(struct net_device *netdev, + enum ethtool_phys_id_state state) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + i40e_status ret = 0; + struct i40e_pf *pf = np->vsi->back; + struct i40e_hw *hw = &pf->hw; + int blink_freq = 2; + u16 temp_status; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + if (!(pf->hw_features & I40E_HW_PHY_CONTROLS_LEDS)) { + pf->led_status = i40e_led_get(hw); + } else { + if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE)) + i40e_aq_set_phy_debug(hw, I40E_PHY_DEBUG_ALL, + NULL); + ret = i40e_led_get_phy(hw, &temp_status, + &pf->phy_led_val); + pf->led_status = temp_status; + } + return blink_freq; + case ETHTOOL_ID_ON: + if (!(pf->hw_features & I40E_HW_PHY_CONTROLS_LEDS)) + i40e_led_set(hw, 0xf, false); + else + ret = i40e_led_set_phy(hw, true, pf->led_status, 0); + break; + case ETHTOOL_ID_OFF: + if (!(pf->hw_features & I40E_HW_PHY_CONTROLS_LEDS)) + i40e_led_set(hw, 0x0, false); + else + ret = i40e_led_set_phy(hw, false, pf->led_status, 0); + break; + case ETHTOOL_ID_INACTIVE: + if (!(pf->hw_features & I40E_HW_PHY_CONTROLS_LEDS)) { + i40e_led_set(hw, pf->led_status, false); + } else { + ret = i40e_led_set_phy(hw, false, pf->led_status, + (pf->phy_led_val | + I40E_PHY_LED_MODE_ORIG)); + if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE)) + i40e_aq_set_phy_debug(hw, 0, NULL); + } + break; + default: + break; + } + if (ret) + return -ENOENT; + else + return 0; +} + +/* NOTE: i40e hardware uses a conversion factor of 2 for Interrupt + * Throttle Rate (ITR) ie. ITR(1) = 2us ITR(10) = 20 us, and also + * 125us (8000 interrupts per second) == ITR(62) + */ + +/** + * __i40e_get_coalesce - get per-queue coalesce settings + * @netdev: the netdev to check + * @ec: ethtool coalesce data structure + * @queue: which queue to pick + * + * Gets the per-queue settings for coalescence. Specifically Rx and Tx usecs + * are per queue. If queue is <0 then we default to queue 0 as the + * representative value. + **/ +static int __i40e_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec, + int queue) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_ring *rx_ring, *tx_ring; + struct i40e_vsi *vsi = np->vsi; + + ec->tx_max_coalesced_frames_irq = vsi->work_limit; + ec->rx_max_coalesced_frames_irq = vsi->work_limit; + + /* rx and tx usecs has per queue value. If user doesn't specify the + * queue, return queue 0's value to represent. + */ + if (queue < 0) + queue = 0; + else if (queue >= vsi->num_queue_pairs) + return -EINVAL; + + rx_ring = vsi->rx_rings[queue]; + tx_ring = vsi->tx_rings[queue]; + + if (ITR_IS_DYNAMIC(rx_ring->itr_setting)) + ec->use_adaptive_rx_coalesce = 1; + + if (ITR_IS_DYNAMIC(tx_ring->itr_setting)) + ec->use_adaptive_tx_coalesce = 1; + + ec->rx_coalesce_usecs = rx_ring->itr_setting & ~I40E_ITR_DYNAMIC; + ec->tx_coalesce_usecs = tx_ring->itr_setting & ~I40E_ITR_DYNAMIC; + + /* we use the _usecs_high to store/set the interrupt rate limit + * that the hardware supports, that almost but not quite + * fits the original intent of the ethtool variable, + * the rx_coalesce_usecs_high limits total interrupts + * per second from both tx/rx sources. 
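Per the NOTE above, the hardware's ITR registers count in 2-microsecond units, so a requested interval is effectively rounded down to an even number of microseconds (125 us becomes ITR(62), roughly 8000 interrupts per second). A small standalone illustration of that factor-of-2 conversion; the helper names are illustrative, not the driver's:

#include <stdio.h>

unsigned int itr_usecs_to_reg(unsigned int usecs)
{
        return usecs / 2;               /* the register counts 2-usec units */
}

unsigned int itr_reg_to_usecs(unsigned int reg)
{
        return reg * 2;
}

int main(void)
{
        unsigned int reg = itr_usecs_to_reg(125);

        /* prints "125us -> ITR(62) -> 124us" */
        printf("125us -> ITR(%u) -> %uus\n", reg, itr_reg_to_usecs(reg));
        return 0;
}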
+ */ + ec->rx_coalesce_usecs_high = vsi->int_rate_limit; + ec->tx_coalesce_usecs_high = vsi->int_rate_limit; + + return 0; +} + +/** + * i40e_get_coalesce - get a netdev's coalesce settings + * @netdev: the netdev to check + * @ec: ethtool coalesce data structure + * + * Gets the coalesce settings for a particular netdev. Note that if user has + * modified per-queue settings, this only guarantees to represent queue 0. See + * __i40e_get_coalesce for more details. + **/ +static int i40e_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec) +{ + return __i40e_get_coalesce(netdev, ec, -1); +} + +/** + * i40e_get_per_queue_coalesce - gets coalesce settings for particular queue + * @netdev: netdev structure + * @ec: ethtool's coalesce settings + * @queue: the particular queue to read + * + * Will read a specific queue's coalesce settings + **/ +static int i40e_get_per_queue_coalesce(struct net_device *netdev, u32 queue, + struct ethtool_coalesce *ec) +{ + return __i40e_get_coalesce(netdev, ec, queue); +} + +/** + * i40e_set_itr_per_queue - set ITR values for specific queue + * @vsi: the VSI to set values for + * @ec: coalesce settings from ethtool + * @queue: the queue to modify + * + * Change the ITR settings for a specific queue. + **/ +static void i40e_set_itr_per_queue(struct i40e_vsi *vsi, + struct ethtool_coalesce *ec, + int queue) +{ + struct i40e_ring *rx_ring = vsi->rx_rings[queue]; + struct i40e_ring *tx_ring = vsi->tx_rings[queue]; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + struct i40e_q_vector *q_vector; + u16 intrl; + + intrl = i40e_intrl_usec_to_reg(vsi->int_rate_limit); + + rx_ring->itr_setting = ITR_REG_ALIGN(ec->rx_coalesce_usecs); + tx_ring->itr_setting = ITR_REG_ALIGN(ec->tx_coalesce_usecs); + + if (ec->use_adaptive_rx_coalesce) + rx_ring->itr_setting |= I40E_ITR_DYNAMIC; + else + rx_ring->itr_setting &= ~I40E_ITR_DYNAMIC; + + if (ec->use_adaptive_tx_coalesce) + tx_ring->itr_setting |= I40E_ITR_DYNAMIC; + else + tx_ring->itr_setting &= ~I40E_ITR_DYNAMIC; + + q_vector = rx_ring->q_vector; + q_vector->rx.target_itr = ITR_TO_REG(rx_ring->itr_setting); + + q_vector = tx_ring->q_vector; + q_vector->tx.target_itr = ITR_TO_REG(tx_ring->itr_setting); + + /* The interrupt handler itself will take care of programming + * the Tx and Rx ITR values based on the values we have entered + * into the q_vector, no need to write the values now. + */ + + wr32(hw, I40E_PFINT_RATEN(q_vector->reg_idx), intrl); + i40e_flush(hw); +} + +/** + * __i40e_set_coalesce - set coalesce settings for particular queue + * @netdev: the netdev to change + * @ec: ethtool coalesce settings + * @queue: the queue to change + * + * Sets the coalesce settings for a particular queue. 
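The per-ring itr_setting manipulated above packs the interval and an "adaptive" flag into one word, which is why the code ORs in or masks off I40E_ITR_DYNAMIC before handing the value to ITR_TO_REG. A standalone sketch of that packing; the bit positions below are assumptions for illustration, not the driver's actual defines:

#include <stdbool.h>
#include <stdint.h>

#define ITR_ADAPTIVE    0x8000u         /* assumed flag bit, stands in for I40E_ITR_DYNAMIC */
#define ITR_USEC_MASK   0x7fffu         /* assumed value field */

uint16_t itr_encode(uint16_t usecs, bool adaptive)
{
        uint16_t setting = usecs & ITR_USEC_MASK;

        return adaptive ? (setting | ITR_ADAPTIVE) : setting;
}

bool itr_is_adaptive(uint16_t setting)
{
        return (setting & ITR_ADAPTIVE) != 0;
}

uint16_t itr_usecs(uint16_t setting)
{
        return setting & ITR_USEC_MASK; /* strip the flag, as "& ~I40E_ITR_DYNAMIC" does above */
}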
+ **/ +static int __i40e_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec, + int queue) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + u16 intrl_reg, cur_rx_itr, cur_tx_itr; + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + int i; + + if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq) + vsi->work_limit = ec->tx_max_coalesced_frames_irq; + + if (queue < 0) { + cur_rx_itr = vsi->rx_rings[0]->itr_setting; + cur_tx_itr = vsi->tx_rings[0]->itr_setting; + } else if (queue < vsi->num_queue_pairs) { + cur_rx_itr = vsi->rx_rings[queue]->itr_setting; + cur_tx_itr = vsi->tx_rings[queue]->itr_setting; + } else { + netif_info(pf, drv, netdev, "Invalid queue value, queue range is 0 - %d\n", + vsi->num_queue_pairs - 1); + return -EINVAL; + } + + cur_tx_itr &= ~I40E_ITR_DYNAMIC; + cur_rx_itr &= ~I40E_ITR_DYNAMIC; + + /* tx_coalesce_usecs_high is ignored, use rx-usecs-high instead */ + if (ec->tx_coalesce_usecs_high != vsi->int_rate_limit) { + netif_info(pf, drv, netdev, "tx-usecs-high is not used, please program rx-usecs-high\n"); + return -EINVAL; + } + + if (ec->rx_coalesce_usecs_high > INTRL_REG_TO_USEC(I40E_MAX_INTRL)) { + netif_info(pf, drv, netdev, "Invalid value, rx-usecs-high range is 0-%lu\n", + INTRL_REG_TO_USEC(I40E_MAX_INTRL)); + return -EINVAL; + } + + if (ec->rx_coalesce_usecs != cur_rx_itr && + ec->use_adaptive_rx_coalesce) { + netif_info(pf, drv, netdev, "RX interrupt moderation cannot be changed if adaptive-rx is enabled.\n"); + return -EINVAL; + } + + if (ec->rx_coalesce_usecs > I40E_MAX_ITR) { + netif_info(pf, drv, netdev, "Invalid value, rx-usecs range is 0-8160\n"); + return -EINVAL; + } + + if (ec->tx_coalesce_usecs != cur_tx_itr && + ec->use_adaptive_tx_coalesce) { + netif_info(pf, drv, netdev, "TX interrupt moderation cannot be changed if adaptive-tx is enabled.\n"); + return -EINVAL; + } + + if (ec->tx_coalesce_usecs > I40E_MAX_ITR) { + netif_info(pf, drv, netdev, "Invalid value, tx-usecs range is 0-8160\n"); + return -EINVAL; + } + + if (ec->use_adaptive_rx_coalesce && !cur_rx_itr) + ec->rx_coalesce_usecs = I40E_MIN_ITR; + + if (ec->use_adaptive_tx_coalesce && !cur_tx_itr) + ec->tx_coalesce_usecs = I40E_MIN_ITR; + + intrl_reg = i40e_intrl_usec_to_reg(ec->rx_coalesce_usecs_high); + vsi->int_rate_limit = INTRL_REG_TO_USEC(intrl_reg); + if (vsi->int_rate_limit != ec->rx_coalesce_usecs_high) { + netif_info(pf, drv, netdev, "Interrupt rate limit rounded down to %d\n", + vsi->int_rate_limit); + } + + /* rx and tx usecs has per queue value. If user doesn't specify the + * queue, apply to all queues. + */ + if (queue < 0) { + for (i = 0; i < vsi->num_queue_pairs; i++) + i40e_set_itr_per_queue(vsi, ec, i); + } else { + i40e_set_itr_per_queue(vsi, ec, queue); + } + + return 0; +} + +/** + * i40e_set_coalesce - set coalesce settings for every queue on the netdev + * @netdev: the netdev to change + * @ec: ethtool coalesce settings + * + * This will set each queue to the same coalesce settings. + **/ +static int i40e_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ec) +{ + return __i40e_set_coalesce(netdev, ec, -1); +} + +/** + * i40e_set_per_queue_coalesce - set specific queue's coalesce settings + * @netdev: the netdev to change + * @ec: ethtool's coalesce settings + * @queue: the queue to change + * + * Sets the specified queue's coalesce settings. 
+ **/ +static int i40e_set_per_queue_coalesce(struct net_device *netdev, u32 queue, + struct ethtool_coalesce *ec) +{ + return __i40e_set_coalesce(netdev, ec, queue); +} + +/** + * i40e_get_rss_hash_opts - Get RSS hash Input Set for each flow type + * @pf: pointer to the physical function struct + * @cmd: ethtool rxnfc command + * + * Returns Success if the flow is supported, else Invalid Input. + **/ +static int i40e_get_rss_hash_opts(struct i40e_pf *pf, struct ethtool_rxnfc *cmd) +{ + struct i40e_hw *hw = &pf->hw; + u8 flow_pctype = 0; + u64 i_set = 0; + + cmd->data = 0; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP; + break; + case UDP_V4_FLOW: + flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP; + break; + case TCP_V6_FLOW: + flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV6_TCP; + break; + case UDP_V6_FLOW: + flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV6_UDP; + break; + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + case AH_V4_FLOW: + case ESP_V4_FLOW: + case IPV4_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV6_FLOW: + /* Default is src/dest for IP, no matter the L4 hashing */ + cmd->data |= RXH_IP_SRC | RXH_IP_DST; + break; + default: + return -EINVAL; + } + + /* Read flow based hash input set register */ + if (flow_pctype) { + i_set = (u64)i40e_read_rx_ctl(hw, I40E_GLQF_HASH_INSET(0, + flow_pctype)) | + ((u64)i40e_read_rx_ctl(hw, I40E_GLQF_HASH_INSET(1, + flow_pctype)) << 32); + } + + /* Process bits of hash input set */ + if (i_set) { + if (i_set & I40E_L4_SRC_MASK) + cmd->data |= RXH_L4_B_0_1; + if (i_set & I40E_L4_DST_MASK) + cmd->data |= RXH_L4_B_2_3; + + if (cmd->flow_type == TCP_V4_FLOW || + cmd->flow_type == UDP_V4_FLOW) { + if (i_set & I40E_L3_SRC_MASK) + cmd->data |= RXH_IP_SRC; + if (i_set & I40E_L3_DST_MASK) + cmd->data |= RXH_IP_DST; + } else if (cmd->flow_type == TCP_V6_FLOW || + cmd->flow_type == UDP_V6_FLOW) { + if (i_set & I40E_L3_V6_SRC_MASK) + cmd->data |= RXH_IP_SRC; + if (i_set & I40E_L3_V6_DST_MASK) + cmd->data |= RXH_IP_DST; + } + } + + return 0; +} + +/** + * i40e_check_mask - Check whether a mask field is set + * @mask: the full mask value + * @field; mask of the field to check + * + * If the given mask is fully set, return positive value. If the mask for the + * field is fully unset, return zero. Otherwise return a negative error code. + **/ +static int i40e_check_mask(u64 mask, u64 field) +{ + u64 value = mask & field; + + if (value == field) + return 1; + else if (!value) + return 0; + else + return -1; +} + +/** + * i40e_parse_rx_flow_user_data - Deconstruct user-defined data + * @fsp: pointer to rx flow specification + * @data: pointer to userdef data structure for storage + * + * Read the user-defined data and deconstruct the value into a structure. No + * other code should read the user-defined data, so as to ensure that every + * place consistently reads the value correctly. + * + * The user-defined field is a 64bit Big Endian format value, which we + * deconstruct by reading bits or bit fields from it. Single bit flags shall + * be defined starting from the highest bits, while small bit field values + * shall be defined starting from the lowest bits. + * + * Returns 0 if the data is valid, and non-zero if the userdef data is invalid + * and the filter should be rejected. The data structure will always be + * modified even if FLOW_EXT is not set. 
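The user-defined field described above and parsed in the function that follows is a 64-bit big-endian value whose low 16 bits carry the flexible-filter word and whose next 16 bits carry the byte offset, matching the I40E_USERDEF_* masks in the body. A standalone pack/unpack sketch of that layout (helper names are illustrative):

#include <stdint.h>
#include <stdio.h>

uint64_t userdef_pack(uint16_t flex_word, uint16_t flex_offset)
{
        return (uint64_t)flex_word | ((uint64_t)flex_offset << 16);
}

void userdef_unpack(uint64_t value, uint16_t *flex_word, uint16_t *flex_offset)
{
        *flex_word   = value & 0xffff;          /* bits 15:0,  I40E_USERDEF_FLEX_WORD   */
        *flex_offset = (value >> 16) & 0xffff;  /* bits 31:16, I40E_USERDEF_FLEX_OFFSET */
}

int main(void)
{
        uint16_t word, off;

        userdef_unpack(userdef_pack(0xabcd, 18), &word, &off);
        printf("flex word 0x%04x at byte offset %u\n", word, off);
        return 0;
}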
+ * + **/ +static int i40e_parse_rx_flow_user_data(struct ethtool_rx_flow_spec *fsp, + struct i40e_rx_flow_userdef *data) +{ + u64 value, mask; + int valid; + + /* Zero memory first so it's always consistent. */ + memset(data, 0, sizeof(*data)); + + if (!(fsp->flow_type & FLOW_EXT)) + return 0; + + value = be64_to_cpu(*((__be64 *)fsp->h_ext.data)); + mask = be64_to_cpu(*((__be64 *)fsp->m_ext.data)); + +#define I40E_USERDEF_FLEX_WORD GENMASK_ULL(15, 0) +#define I40E_USERDEF_FLEX_OFFSET GENMASK_ULL(31, 16) +#define I40E_USERDEF_FLEX_FILTER GENMASK_ULL(31, 0) + + valid = i40e_check_mask(mask, I40E_USERDEF_FLEX_FILTER); + if (valid < 0) { + return -EINVAL; + } else if (valid) { + data->flex_word = value & I40E_USERDEF_FLEX_WORD; + data->flex_offset = + (value & I40E_USERDEF_FLEX_OFFSET) >> 16; + data->flex_filter = true; + } + + return 0; +} + +/** + * i40e_fill_rx_flow_user_data - Fill in user-defined data field + * @fsp: pointer to rx_flow specification + * + * Reads the userdef data structure and properly fills in the user defined + * fields of the rx_flow_spec. + **/ +static void i40e_fill_rx_flow_user_data(struct ethtool_rx_flow_spec *fsp, + struct i40e_rx_flow_userdef *data) +{ + u64 value = 0, mask = 0; + + if (data->flex_filter) { + value |= data->flex_word; + value |= (u64)data->flex_offset << 16; + mask |= I40E_USERDEF_FLEX_FILTER; + } + + if (value || mask) + fsp->flow_type |= FLOW_EXT; + + *((__be64 *)fsp->h_ext.data) = cpu_to_be64(value); + *((__be64 *)fsp->m_ext.data) = cpu_to_be64(mask); +} + +/** + * i40e_get_ethtool_fdir_all - Populates the rule count of a command + * @pf: Pointer to the physical function struct + * @cmd: The command to get or set Rx flow classification rules + * @rule_locs: Array of used rule locations + * + * This function populates both the total and actual rule count of + * the ethtool flow classification command + * + * Returns 0 on success or -EMSGSIZE if entry not found + **/ +static int i40e_get_ethtool_fdir_all(struct i40e_pf *pf, + struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct i40e_fdir_filter *rule; + struct hlist_node *node2; + int cnt = 0; + + /* report total rule count */ + cmd->data = i40e_get_fd_cnt_all(pf); + + hlist_for_each_entry_safe(rule, node2, + &pf->fdir_filter_list, fdir_node) { + if (cnt == cmd->rule_cnt) + return -EMSGSIZE; + + rule_locs[cnt] = rule->fd_id; + cnt++; + } + + cmd->rule_cnt = cnt; + + return 0; +} + +/** + * i40e_get_ethtool_fdir_entry - Look up a filter based on Rx flow + * @pf: Pointer to the physical function struct + * @cmd: The command to get or set Rx flow classification rules + * + * This function looks up a filter based on the Rx flow classification + * command and fills the flow spec info for it if found + * + * Returns 0 on success or -EINVAL if filter not found + **/ +static int i40e_get_ethtool_fdir_entry(struct i40e_pf *pf, + struct ethtool_rxnfc *cmd) +{ + struct ethtool_rx_flow_spec *fsp = + (struct ethtool_rx_flow_spec *)&cmd->fs; + struct i40e_rx_flow_userdef userdef = {0}; + struct i40e_fdir_filter *rule = NULL; + struct hlist_node *node2; + u64 input_set; + u16 index; + + hlist_for_each_entry_safe(rule, node2, + &pf->fdir_filter_list, fdir_node) { + if (fsp->location <= rule->fd_id) + break; + } + + if (!rule || fsp->location != rule->fd_id) + return -EINVAL; + + fsp->flow_type = rule->flow_type; + if (fsp->flow_type == IP_USER_FLOW) { + fsp->h_u.usr_ip4_spec.ip_ver = ETH_RX_NFC_IP4; + fsp->h_u.usr_ip4_spec.proto = 0; + fsp->m_u.usr_ip4_spec.proto = 0; + } + + /* Reverse the src and dest 
notion, since the HW views them from + * Tx perspective where as the user expects it from Rx filter view. + */ + fsp->h_u.tcp_ip4_spec.psrc = rule->dst_port; + fsp->h_u.tcp_ip4_spec.pdst = rule->src_port; + fsp->h_u.tcp_ip4_spec.ip4src = rule->dst_ip; + fsp->h_u.tcp_ip4_spec.ip4dst = rule->src_ip; + + switch (rule->flow_type) { + case SCTP_V4_FLOW: + index = I40E_FILTER_PCTYPE_NONF_IPV4_SCTP; + break; + case TCP_V4_FLOW: + index = I40E_FILTER_PCTYPE_NONF_IPV4_TCP; + break; + case UDP_V4_FLOW: + index = I40E_FILTER_PCTYPE_NONF_IPV4_UDP; + break; + case IP_USER_FLOW: + index = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER; + break; + default: + /* If we have stored a filter with a flow type not listed here + * it is almost certainly a driver bug. WARN(), and then + * assign the input_set as if all fields are enabled to avoid + * reading unassigned memory. + */ + WARN(1, "Missing input set index for flow_type %d\n", + rule->flow_type); + input_set = 0xFFFFFFFFFFFFFFFFULL; + goto no_input_set; + } + + input_set = i40e_read_fd_input_set(pf, index); + +no_input_set: + if (input_set & I40E_L3_SRC_MASK) + fsp->m_u.tcp_ip4_spec.ip4src = htonl(0xFFFFFFFF); + + if (input_set & I40E_L3_DST_MASK) + fsp->m_u.tcp_ip4_spec.ip4dst = htonl(0xFFFFFFFF); + + if (input_set & I40E_L4_SRC_MASK) + fsp->m_u.tcp_ip4_spec.psrc = htons(0xFFFF); + + if (input_set & I40E_L4_DST_MASK) + fsp->m_u.tcp_ip4_spec.pdst = htons(0xFFFF); + + if (rule->dest_ctl == I40E_FILTER_PROGRAM_DESC_DEST_DROP_PACKET) + fsp->ring_cookie = RX_CLS_FLOW_DISC; + else + fsp->ring_cookie = rule->q_index; + + if (rule->dest_vsi != pf->vsi[pf->lan_vsi]->id) { + struct i40e_vsi *vsi; + + vsi = i40e_find_vsi_from_id(pf, rule->dest_vsi); + if (vsi && vsi->type == I40E_VSI_SRIOV) { + /* VFs are zero-indexed by the driver, but ethtool + * expects them to be one-indexed, so add one here + */ + u64 ring_vf = vsi->vf_id + 1; + + ring_vf <<= ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; + fsp->ring_cookie |= ring_vf; + } + } + + if (rule->flex_filter) { + userdef.flex_filter = true; + userdef.flex_word = be16_to_cpu(rule->flex_word); + userdef.flex_offset = rule->flex_offset; + } + + i40e_fill_rx_flow_user_data(fsp, &userdef); + + return 0; +} + +/** + * i40e_get_rxnfc - command to get RX flow classification rules + * @netdev: network interface device structure + * @cmd: ethtool rxnfc command + * + * Returns Success if the command is supported. 
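The ring_cookie assembled above keeps the queue index in the low 32 bits and a one-based VF id in the upper bits shifted by ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF. A standalone sketch of that encoding; the shift of 32 used here is an assumption for illustration:

#include <stdint.h>

#define RING_VF_OFF     32      /* assumed value of ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF */

uint64_t cookie_encode(uint32_t queue, uint32_t vf_id)
{
        /* VFs are stored one-indexed so that 0 can mean "no VF, PF queue" */
        return (uint64_t)queue | ((uint64_t)(vf_id + 1) << RING_VF_OFF);
}

void cookie_decode(uint64_t cookie, uint32_t *queue, int *vf_id)
{
        *queue = (uint32_t)cookie;
        *vf_id = (int)(cookie >> RING_VF_OFF) - 1;      /* -1 when no VF was encoded */
}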
+ **/ +static int i40e_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + int ret = -EOPNOTSUPP; + + switch (cmd->cmd) { + case ETHTOOL_GRXRINGS: + cmd->data = vsi->rss_size; + ret = 0; + break; + case ETHTOOL_GRXFH: + ret = i40e_get_rss_hash_opts(pf, cmd); + break; + case ETHTOOL_GRXCLSRLCNT: + cmd->rule_cnt = pf->fdir_pf_active_filters; + /* report total rule count */ + cmd->data = i40e_get_fd_cnt_all(pf); + ret = 0; + break; + case ETHTOOL_GRXCLSRULE: + ret = i40e_get_ethtool_fdir_entry(pf, cmd); + break; + case ETHTOOL_GRXCLSRLALL: + ret = i40e_get_ethtool_fdir_all(pf, cmd, rule_locs); + break; + default: + break; + } + + return ret; +} + +/** + * i40e_get_rss_hash_bits - Read RSS Hash bits from register + * @nfc: pointer to user request + * @i_setc bits currently set + * + * Returns value of bits to be set per user request + **/ +static u64 i40e_get_rss_hash_bits(struct ethtool_rxnfc *nfc, u64 i_setc) +{ + u64 i_set = i_setc; + u64 src_l3 = 0, dst_l3 = 0; + + if (nfc->data & RXH_L4_B_0_1) + i_set |= I40E_L4_SRC_MASK; + else + i_set &= ~I40E_L4_SRC_MASK; + if (nfc->data & RXH_L4_B_2_3) + i_set |= I40E_L4_DST_MASK; + else + i_set &= ~I40E_L4_DST_MASK; + + if (nfc->flow_type == TCP_V6_FLOW || nfc->flow_type == UDP_V6_FLOW) { + src_l3 = I40E_L3_V6_SRC_MASK; + dst_l3 = I40E_L3_V6_DST_MASK; + } else if (nfc->flow_type == TCP_V4_FLOW || + nfc->flow_type == UDP_V4_FLOW) { + src_l3 = I40E_L3_SRC_MASK; + dst_l3 = I40E_L3_DST_MASK; + } else { + /* Any other flow type are not supported here */ + return i_set; + } + + if (nfc->data & RXH_IP_SRC) + i_set |= src_l3; + else + i_set &= ~src_l3; + if (nfc->data & RXH_IP_DST) + i_set |= dst_l3; + else + i_set &= ~dst_l3; + + return i_set; +} + +/** + * i40e_set_rss_hash_opt - Enable/Disable flow types for RSS hash + * @pf: pointer to the physical function struct + * @cmd: ethtool rxnfc command + * + * Returns Success if the flow input set is supported. 
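The 64-bit hash-enable and hash-input-set values handled in this area are exposed by the hardware as low/high 32-bit register pairs, which is why the reads above and the writes in the function below stitch two i40e_read_rx_ctl()/i40e_write_rx_ctl() calls together. The combining pattern in isolation:

#include <stdint.h>

uint64_t regpair_to_u64(uint32_t lo, uint32_t hi)
{
        return (uint64_t)lo | ((uint64_t)hi << 32);     /* ...(0) | ...(1) << 32 */
}

void u64_to_regpair(uint64_t val, uint32_t *lo, uint32_t *hi)
{
        *lo = (uint32_t)val;            /* written back to register index 0 */
        *hi = (uint32_t)(val >> 32);    /* written back to register index 1 */
}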
+ **/ +static int i40e_set_rss_hash_opt(struct i40e_pf *pf, struct ethtool_rxnfc *nfc) +{ + struct i40e_hw *hw = &pf->hw; + u64 hena = (u64)i40e_read_rx_ctl(hw, I40E_PFQF_HENA(0)) | + ((u64)i40e_read_rx_ctl(hw, I40E_PFQF_HENA(1)) << 32); + u8 flow_pctype = 0; + u64 i_set, i_setc; + + if (pf->flags & I40E_FLAG_MFP_ENABLED) { + dev_err(&pf->pdev->dev, + "Change of RSS hash input set is not supported when MFP mode is enabled\n"); + return -EOPNOTSUPP; + } + + /* RSS does not support anything other than hashing + * to queues on src and dst IPs and ports + */ + if (nfc->data & ~(RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)) + return -EINVAL; + + switch (nfc->flow_type) { + case TCP_V4_FLOW: + flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP; + if (pf->hw_features & I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE) + hena |= + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK); + break; + case TCP_V6_FLOW: + flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV6_TCP; + if (pf->hw_features & I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE) + hena |= + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK); + if (pf->hw_features & I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE) + hena |= + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK); + break; + case UDP_V4_FLOW: + flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP; + if (pf->hw_features & I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE) + hena |= + BIT_ULL(I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) | + BIT_ULL(I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP); + + hena |= BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV4); + break; + case UDP_V6_FLOW: + flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV6_UDP; + if (pf->hw_features & I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE) + hena |= + BIT_ULL(I40E_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | + BIT_ULL(I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP); + + hena |= BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6); + break; + case AH_ESP_V4_FLOW: + case AH_V4_FLOW: + case ESP_V4_FLOW: + case SCTP_V4_FLOW: + if ((nfc->data & RXH_L4_B_0_1) || + (nfc->data & RXH_L4_B_2_3)) + return -EINVAL; + hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_OTHER); + break; + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case SCTP_V6_FLOW: + if ((nfc->data & RXH_L4_B_0_1) || + (nfc->data & RXH_L4_B_2_3)) + return -EINVAL; + hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_OTHER); + break; + case IPV4_FLOW: + hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_OTHER) | + BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV4); + break; + case IPV6_FLOW: + hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_OTHER) | + BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6); + break; + default: + return -EINVAL; + } + + if (flow_pctype) { + i_setc = (u64)i40e_read_rx_ctl(hw, I40E_GLQF_HASH_INSET(0, + flow_pctype)) | + ((u64)i40e_read_rx_ctl(hw, I40E_GLQF_HASH_INSET(1, + flow_pctype)) << 32); + i_set = i40e_get_rss_hash_bits(nfc, i_setc); + i40e_write_rx_ctl(hw, I40E_GLQF_HASH_INSET(0, flow_pctype), + (u32)i_set); + i40e_write_rx_ctl(hw, I40E_GLQF_HASH_INSET(1, flow_pctype), + (u32)(i_set >> 32)); + hena |= BIT_ULL(flow_pctype); + } + + i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), (u32)hena); + i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), (u32)(hena >> 32)); + i40e_flush(hw); + + return 0; +} + +/** + * i40e_update_ethtool_fdir_entry - Updates the fdir filter entry + * @vsi: Pointer to the targeted VSI + * @input: The filter to update or NULL to indicate deletion + * @sw_idx: Software index to the filter + * @cmd: The command to get or set Rx flow classification rules + * + * This function updates (or deletes) a Flow Director entry from + * the hlist of the corresponding 
PF + * + * Returns 0 on success + **/ +static int i40e_update_ethtool_fdir_entry(struct i40e_vsi *vsi, + struct i40e_fdir_filter *input, + u16 sw_idx, + struct ethtool_rxnfc *cmd) +{ + struct i40e_fdir_filter *rule, *parent; + struct i40e_pf *pf = vsi->back; + struct hlist_node *node2; + int err = -EINVAL; + + parent = NULL; + rule = NULL; + + hlist_for_each_entry_safe(rule, node2, + &pf->fdir_filter_list, fdir_node) { + /* hash found, or no matching entry */ + if (rule->fd_id >= sw_idx) + break; + parent = rule; + } + + /* if there is an old rule occupying our place remove it */ + if (rule && (rule->fd_id == sw_idx)) { + /* Remove this rule, since we're either deleting it, or + * replacing it. + */ + err = i40e_add_del_fdir(vsi, rule, false); + hlist_del(&rule->fdir_node); + kfree(rule); + pf->fdir_pf_active_filters--; + } + + /* If we weren't given an input, this is a delete, so just return the + * error code indicating if there was an entry at the requested slot + */ + if (!input) + return err; + + /* Otherwise, install the new rule as requested */ + INIT_HLIST_NODE(&input->fdir_node); + + /* add filter to the list */ + if (parent) + hlist_add_behind(&input->fdir_node, &parent->fdir_node); + else + hlist_add_head(&input->fdir_node, + &pf->fdir_filter_list); + + /* update counts */ + pf->fdir_pf_active_filters++; + + return 0; +} + +/** + * i40e_prune_flex_pit_list - Cleanup unused entries in FLX_PIT table + * @pf: pointer to PF structure + * + * This function searches the list of filters and determines which FLX_PIT + * entries are still required. It will prune any entries which are no longer + * in use after the deletion. + **/ +static void i40e_prune_flex_pit_list(struct i40e_pf *pf) +{ + struct i40e_flex_pit *entry, *tmp; + struct i40e_fdir_filter *rule; + + /* First, we'll check the l3 table */ + list_for_each_entry_safe(entry, tmp, &pf->l3_flex_pit_list, list) { + bool found = false; + + hlist_for_each_entry(rule, &pf->fdir_filter_list, fdir_node) { + if (rule->flow_type != IP_USER_FLOW) + continue; + if (rule->flex_filter && + rule->flex_offset == entry->src_offset) { + found = true; + break; + } + } + + /* If we didn't find the filter, then we can prune this entry + * from the list. + */ + if (!found) { + list_del(&entry->list); + kfree(entry); + } + } + + /* Followed by the L4 table */ + list_for_each_entry_safe(entry, tmp, &pf->l4_flex_pit_list, list) { + bool found = false; + + hlist_for_each_entry(rule, &pf->fdir_filter_list, fdir_node) { + /* Skip this filter if it's L3, since we already + * checked those in the above loop + */ + if (rule->flow_type == IP_USER_FLOW) + continue; + if (rule->flex_filter && + rule->flex_offset == entry->src_offset) { + found = true; + break; + } + } + + /* If we didn't find the filter, then we can prune this entry + * from the list. 
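i40e_update_ethtool_fdir_entry() above keeps the filter list sorted by fd_id, replacing any entry that already occupies the requested slot. The same walk-to-parent-then-insert idea on a plain singly linked list, with hypothetical types instead of the kernel hlist API:

#include <stdlib.h>

struct filt {
        int id;                 /* plays the role of fd_id     */
        struct filt *next;      /* list kept sorted, ascending */
};

/* Insert a heap-allocated filter, replacing any entry with the same id. */
void insert_sorted(struct filt **head, struct filt *input)
{
        struct filt *cur = *head, *parent = NULL;

        while (cur && cur->id < input->id) {    /* find first entry >= new id */
                parent = cur;
                cur = cur->next;
        }
        if (cur && cur->id == input->id) {      /* replace the old rule */
                input->next = cur->next;
                free(cur);
        } else {
                input->next = cur;              /* plain insertion */
        }
        if (parent)
                parent->next = input;
        else
                *head = input;
}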
+ */ + if (!found) { + list_del(&entry->list); + kfree(entry); + } + } +} + +/** + * i40e_del_fdir_entry - Deletes a Flow Director filter entry + * @vsi: Pointer to the targeted VSI + * @cmd: The command to get or set Rx flow classification rules + * + * The function removes a Flow Director filter entry from the + * hlist of the corresponding PF + * + * Returns 0 on success + */ +static int i40e_del_fdir_entry(struct i40e_vsi *vsi, + struct ethtool_rxnfc *cmd) +{ + struct ethtool_rx_flow_spec *fsp = + (struct ethtool_rx_flow_spec *)&cmd->fs; + struct i40e_pf *pf = vsi->back; + int ret = 0; + + if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state) || + test_bit(__I40E_RESET_INTR_RECEIVED, pf->state)) + return -EBUSY; + + if (test_bit(__I40E_FD_FLUSH_REQUESTED, pf->state)) + return -EBUSY; + + ret = i40e_update_ethtool_fdir_entry(vsi, NULL, fsp->location, cmd); + + i40e_prune_flex_pit_list(pf); + + i40e_fdir_check_and_reenable(pf); + return ret; +} + +/** + * i40e_unused_pit_index - Find an unused PIT index for given list + * @pf: the PF data structure + * + * Find the first unused flexible PIT index entry. We search both the L3 and + * L4 flexible PIT lists so that the returned index is unique and unused by + * either currently programmed L3 or L4 filters. We use a bit field as storage + * to track which indexes are already used. + **/ +static u8 i40e_unused_pit_index(struct i40e_pf *pf) +{ + unsigned long available_index = 0xFF; + struct i40e_flex_pit *entry; + + /* We need to make sure that the new index isn't in use by either L3 + * or L4 filters so that IP_USER_FLOW filters can program both L3 and + * L4 to use the same index. + */ + + list_for_each_entry(entry, &pf->l4_flex_pit_list, list) + clear_bit(entry->pit_index, &available_index); + + list_for_each_entry(entry, &pf->l3_flex_pit_list, list) + clear_bit(entry->pit_index, &available_index); + + return find_first_bit(&available_index, 8); +} + +/** + * i40e_find_flex_offset - Find an existing flex src_offset + * @flex_pit_list: L3 or L4 flex PIT list + * @src_offset: new src_offset to find + * + * Searches the flex_pit_list for an existing offset. If no offset is + * currently programmed, then this will return an ERR_PTR if there is no space + * to add a new offset, otherwise it returns NULL. + **/ +static +struct i40e_flex_pit *i40e_find_flex_offset(struct list_head *flex_pit_list, + u16 src_offset) +{ + struct i40e_flex_pit *entry; + int size = 0; + + /* Search for the src_offset first. If we find a matching entry + * already programmed, we can simply re-use it. + */ + list_for_each_entry(entry, flex_pit_list, list) { + size++; + if (entry->src_offset == src_offset) + return entry; + } + + /* If we haven't found an entry yet, then the provided src offset has + * not yet been programmed. We will program the src offset later on, + * but we need to indicate whether there is enough space to do so + * here. We'll make use of ERR_PTR for this purpose. + */ + if (size >= I40E_FLEX_PIT_TABLE_SIZE) + return ERR_PTR(-ENOSPC); + + return NULL; +} + +/** + * i40e_add_flex_offset - Add src_offset to flex PIT table list + * @flex_pit_list: L3 or L4 flex PIT list + * @src_offset: new src_offset to add + * @pit_index: the PIT index to program + * + * This function programs the new src_offset to the list. It is expected that + * i40e_find_flex_offset has already been tried and returned NULL, indicating + * that this offset is not programmed, and that the list has enough space to + * store another offset. 
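i40e_unused_pit_index() above treats the eight PIT slots as a free-bit mask: start with all bits set, clear the bit for every index already used by an L3 or L4 entry, and take the lowest bit still set. The same idea without the kernel bitmap helpers:

/* used[]: PIT indexes (0..7) already taken by L3 or L4 entries */
int first_unused_pit_index(const unsigned char *used, int nused)
{
        unsigned int available = 0xff;          /* eight candidate indexes, all free */
        int i;

        for (i = 0; i < nused; i++)
                available &= ~(1u << used[i]);  /* clear the bit: index is taken */

        for (i = 0; i < 8; i++)
                if (available & (1u << i))
                        return i;               /* lowest index still free */
        return -1;                              /* table full */
}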
+ * + * Returns 0 on success, and negative value on error. + **/ +static int i40e_add_flex_offset(struct list_head *flex_pit_list, + u16 src_offset, + u8 pit_index) +{ + struct i40e_flex_pit *new_pit, *entry; + + new_pit = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!new_pit) + return -ENOMEM; + + new_pit->src_offset = src_offset; + new_pit->pit_index = pit_index; + + /* We need to insert this item such that the list is sorted by + * src_offset in ascending order. + */ + list_for_each_entry(entry, flex_pit_list, list) { + if (new_pit->src_offset < entry->src_offset) { + list_add_tail(&new_pit->list, &entry->list); + return 0; + } + + /* If we found an entry with our offset already programmed we + * can simply return here, after freeing the memory. However, + * if the pit_index does not match we need to report an error. + */ + if (new_pit->src_offset == entry->src_offset) { + int err = 0; + + /* If the PIT index is not the same we can't re-use + * the entry, so we must report an error. + */ + if (new_pit->pit_index != entry->pit_index) + err = -EINVAL; + + kfree(new_pit); + return err; + } + } + + /* If we reached here, then we haven't yet added the item. This means + * that we should add the item at the end of the list. + */ + list_add_tail(&new_pit->list, flex_pit_list); + return 0; +} + +/** + * __i40e_reprogram_flex_pit - Re-program specific FLX_PIT table + * @pf: Pointer to the PF structure + * @flex_pit_list: list of flexible src offsets in use + * #flex_pit_start: index to first entry for this section of the table + * + * In order to handle flexible data, the hardware uses a table of values + * called the FLX_PIT table. This table is used to indicate which sections of + * the input correspond to what PIT index values. Unfortunately, hardware is + * very restrictive about programming this table. Entries must be ordered by + * src_offset in ascending order, without duplicates. Additionally, unused + * entries must be set to the unused index value, and must have valid size and + * length according to the src_offset ordering. + * + * This function will reprogram the FLX_PIT register from a book-keeping + * structure that we guarantee is already ordered correctly, and has no more + * than 3 entries. + * + * To make things easier, we only support flexible values of one word length, + * rather than allowing variable length flexible values. + **/ +static void __i40e_reprogram_flex_pit(struct i40e_pf *pf, + struct list_head *flex_pit_list, + int flex_pit_start) +{ + struct i40e_flex_pit *entry = NULL; + u16 last_offset = 0; + int i = 0, j = 0; + + /* First, loop over the list of flex PIT entries, and reprogram the + * registers. + */ + list_for_each_entry(entry, flex_pit_list, list) { + /* We have to be careful when programming values for the + * largest SRC_OFFSET value. It is possible that adding + * additional empty values at the end would overflow the space + * for the SRC_OFFSET in the FLX_PIT register. To avoid this, + * we check here and add the empty values prior to adding the + * largest value. + * + * To determine this, we will use a loop from i+1 to 3, which + * will determine whether the unused entries would have valid + * SRC_OFFSET. Note that there cannot be extra entries past + * this value, because the only valid values would have been + * larger than I40E_MAX_FLEX_SRC_OFFSET, and thus would not + * have been added to the list in the first place. 
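A simplified standalone sketch of the table-filling rule described above: real offsets are programmed in ascending order and every unused trailing slot still receives a strictly increasing dummy offset so the hardware sees a well-formed table (the overflow guard for offsets near the maximum is omitted here):

/* offsets[]: the (at most three) programmed src offsets, sorted ascending */
void fill_flex_table(const unsigned short *offsets, int n, unsigned short table[3])
{
        unsigned short next = 0;        /* "start with 0" when the list is empty */
        int i;

        for (i = 0; i < 3 && i < n; i++) {
                table[i] = offsets[i];          /* real entries first, in order     */
                next = offsets[i] + 1;
        }
        for (; i < 3; i++, next++)
                table[i] = next;                /* padding keeps offsets increasing */
}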
+ */ + for (j = i + 1; j < 3; j++) { + u16 offset = entry->src_offset + j; + int index = flex_pit_start + i; + u32 value = I40E_FLEX_PREP_VAL(I40E_FLEX_DEST_UNUSED, + 1, + offset - 3); + + if (offset > I40E_MAX_FLEX_SRC_OFFSET) { + i40e_write_rx_ctl(&pf->hw, + I40E_PRTQF_FLX_PIT(index), + value); + i++; + } + } + + /* Now, we can program the actual value into the table */ + i40e_write_rx_ctl(&pf->hw, + I40E_PRTQF_FLX_PIT(flex_pit_start + i), + I40E_FLEX_PREP_VAL(entry->pit_index + 50, + 1, + entry->src_offset)); + i++; + } + + /* In order to program the last entries in the table, we need to + * determine the valid offset. If the list is empty, we'll just start + * with 0. Otherwise, we'll start with the last item offset and add 1. + * This ensures that all entries have valid sizes. If we don't do this + * correctly, the hardware will disable flexible field parsing. + */ + if (!list_empty(flex_pit_list)) + last_offset = list_prev_entry(entry, list)->src_offset + 1; + + for (; i < 3; i++, last_offset++) { + i40e_write_rx_ctl(&pf->hw, + I40E_PRTQF_FLX_PIT(flex_pit_start + i), + I40E_FLEX_PREP_VAL(I40E_FLEX_DEST_UNUSED, + 1, + last_offset)); + } +} + +/** + * i40e_reprogram_flex_pit - Reprogram all FLX_PIT tables after input set change + * @pf: pointer to the PF structure + * + * This function reprograms both the L3 and L4 FLX_PIT tables. See the + * internal helper function for implementation details. + **/ +static void i40e_reprogram_flex_pit(struct i40e_pf *pf) +{ + __i40e_reprogram_flex_pit(pf, &pf->l3_flex_pit_list, + I40E_FLEX_PIT_IDX_START_L3); + + __i40e_reprogram_flex_pit(pf, &pf->l4_flex_pit_list, + I40E_FLEX_PIT_IDX_START_L4); + + /* We also need to program the L3 and L4 GLQF ORT register */ + i40e_write_rx_ctl(&pf->hw, + I40E_GLQF_ORT(I40E_L3_GLQF_ORT_IDX), + I40E_ORT_PREP_VAL(I40E_FLEX_PIT_IDX_START_L3, + 3, 1)); + + i40e_write_rx_ctl(&pf->hw, + I40E_GLQF_ORT(I40E_L4_GLQF_ORT_IDX), + I40E_ORT_PREP_VAL(I40E_FLEX_PIT_IDX_START_L4, + 3, 1)); +} + +/** + * i40e_flow_str - Converts a flow_type into a human readable string + * @flow_type: the flow type from a flow specification + * + * Currently only flow types we support are included here, and the string + * value attempts to match what ethtool would use to configure this flow type. + **/ +static const char *i40e_flow_str(struct ethtool_rx_flow_spec *fsp) +{ + switch (fsp->flow_type & ~FLOW_EXT) { + case TCP_V4_FLOW: + return "tcp4"; + case UDP_V4_FLOW: + return "udp4"; + case SCTP_V4_FLOW: + return "sctp4"; + case IP_USER_FLOW: + return "ip4"; + default: + return "unknown"; + } +} + +/** + * i40e_pit_index_to_mask - Return the FLEX mask for a given PIT index + * @pit_index: PIT index to convert + * + * Returns the mask for a given PIT index. Will return 0 if the pit_index is + * of range. + **/ +static u64 i40e_pit_index_to_mask(int pit_index) +{ + switch (pit_index) { + case 0: + return I40E_FLEX_50_MASK; + case 1: + return I40E_FLEX_51_MASK; + case 2: + return I40E_FLEX_52_MASK; + case 3: + return I40E_FLEX_53_MASK; + case 4: + return I40E_FLEX_54_MASK; + case 5: + return I40E_FLEX_55_MASK; + case 6: + return I40E_FLEX_56_MASK; + case 7: + return I40E_FLEX_57_MASK; + default: + return 0; + } +} + +/** + * i40e_print_input_set - Show changes between two input sets + * @vsi: the vsi being configured + * @old: the old input set + * @new: the new input set + * + * Print the difference between old and new input sets by showing which series + * of words are toggled on or off. Only displays the bits we actually support + * changing. 
+ **/ +static void i40e_print_input_set(struct i40e_vsi *vsi, u64 old, u64 new) +{ + struct i40e_pf *pf = vsi->back; + bool old_value, new_value; + int i; + + old_value = !!(old & I40E_L3_SRC_MASK); + new_value = !!(new & I40E_L3_SRC_MASK); + if (old_value != new_value) + netif_info(pf, drv, vsi->netdev, "L3 source address: %s -> %s\n", + old_value ? "ON" : "OFF", + new_value ? "ON" : "OFF"); + + old_value = !!(old & I40E_L3_DST_MASK); + new_value = !!(new & I40E_L3_DST_MASK); + if (old_value != new_value) + netif_info(pf, drv, vsi->netdev, "L3 destination address: %s -> %s\n", + old_value ? "ON" : "OFF", + new_value ? "ON" : "OFF"); + + old_value = !!(old & I40E_L4_SRC_MASK); + new_value = !!(new & I40E_L4_SRC_MASK); + if (old_value != new_value) + netif_info(pf, drv, vsi->netdev, "L4 source port: %s -> %s\n", + old_value ? "ON" : "OFF", + new_value ? "ON" : "OFF"); + + old_value = !!(old & I40E_L4_DST_MASK); + new_value = !!(new & I40E_L4_DST_MASK); + if (old_value != new_value) + netif_info(pf, drv, vsi->netdev, "L4 destination port: %s -> %s\n", + old_value ? "ON" : "OFF", + new_value ? "ON" : "OFF"); + + old_value = !!(old & I40E_VERIFY_TAG_MASK); + new_value = !!(new & I40E_VERIFY_TAG_MASK); + if (old_value != new_value) + netif_info(pf, drv, vsi->netdev, "SCTP verification tag: %s -> %s\n", + old_value ? "ON" : "OFF", + new_value ? "ON" : "OFF"); + + /* Show change of flexible filter entries */ + for (i = 0; i < I40E_FLEX_INDEX_ENTRIES; i++) { + u64 flex_mask = i40e_pit_index_to_mask(i); + + old_value = !!(old & flex_mask); + new_value = !!(new & flex_mask); + if (old_value != new_value) + netif_info(pf, drv, vsi->netdev, "FLEX index %d: %s -> %s\n", + i, + old_value ? "ON" : "OFF", + new_value ? "ON" : "OFF"); + } + + netif_info(pf, drv, vsi->netdev, " Current input set: %0llx\n", + old); + netif_info(pf, drv, vsi->netdev, "Requested input set: %0llx\n", + new); +} + +/** + * i40e_check_fdir_input_set - Check that a given rx_flow_spec mask is valid + * @vsi: pointer to the targeted VSI + * @fsp: pointer to Rx flow specification + * @userdef: userdefined data from flow specification + * + * Ensures that a given ethtool_rx_flow_spec has a valid mask. Some support + * for partial matches exists with a few limitations. First, hardware only + * supports masking by word boundary (2 bytes) and not per individual bit. + * Second, hardware is limited to using one mask for a flow type and cannot + * use a separate mask for each filter. + * + * To support these limitations, if we already have a configured filter for + * the specified type, this function enforces that new filters of the type + * match the configured input set. Otherwise, if we do not have a filter of + * the specified type, we allow the input set to be updated to match the + * desired filter. + * + * To help ensure that administrators understand why filters weren't displayed + * as supported, we print a diagnostic message displaying how the input set + * would change and warning to delete the preexisting filters if required. + * + * Returns 0 on successful input set match, and a negative return code on + * failure. 
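The mask handling in the function below is strictly all-or-nothing per field: a field's mask must be fully set (match on it) or fully clear (ignore it), and anything partial is rejected with -EOPNOTSUPP. A tri-state helper in the same spirit as i40e_check_mask(), shown for a 32-bit field:

#include <stdint.h>

/* 1 = match on the field, 0 = ignore it, -1 = unsupported partial mask */
int field_mask_state(uint32_t mask)
{
        if (mask == 0xffffffffu)
                return 1;
        if (mask == 0)
                return 0;
        return -1;
}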
+ **/ +static int i40e_check_fdir_input_set(struct i40e_vsi *vsi, + struct ethtool_rx_flow_spec *fsp, + struct i40e_rx_flow_userdef *userdef) +{ + struct i40e_pf *pf = vsi->back; + struct ethtool_tcpip4_spec *tcp_ip4_spec; + struct ethtool_usrip4_spec *usr_ip4_spec; + u64 current_mask, new_mask; + bool new_flex_offset = false; + bool flex_l3 = false; + u16 *fdir_filter_count; + u16 index, src_offset = 0; + u8 pit_index = 0; + int err; + + switch (fsp->flow_type & ~FLOW_EXT) { + case SCTP_V4_FLOW: + index = I40E_FILTER_PCTYPE_NONF_IPV4_SCTP; + fdir_filter_count = &pf->fd_sctp4_filter_cnt; + break; + case TCP_V4_FLOW: + index = I40E_FILTER_PCTYPE_NONF_IPV4_TCP; + fdir_filter_count = &pf->fd_tcp4_filter_cnt; + break; + case UDP_V4_FLOW: + index = I40E_FILTER_PCTYPE_NONF_IPV4_UDP; + fdir_filter_count = &pf->fd_udp4_filter_cnt; + break; + case IP_USER_FLOW: + index = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER; + fdir_filter_count = &pf->fd_ip4_filter_cnt; + flex_l3 = true; + break; + default: + return -EOPNOTSUPP; + } + + /* Read the current input set from register memory. */ + current_mask = i40e_read_fd_input_set(pf, index); + new_mask = current_mask; + + /* Determine, if any, the required changes to the input set in order + * to support the provided mask. + * + * Hardware only supports masking at word (2 byte) granularity and does + * not support full bitwise masking. This implementation simplifies + * even further and only supports fully enabled or fully disabled + * masks for each field, even though we could split the ip4src and + * ip4dst fields. + */ + switch (fsp->flow_type & ~FLOW_EXT) { + case SCTP_V4_FLOW: + new_mask &= ~I40E_VERIFY_TAG_MASK; + /* Fall through */ + case TCP_V4_FLOW: + case UDP_V4_FLOW: + tcp_ip4_spec = &fsp->m_u.tcp_ip4_spec; + + /* IPv4 source address */ + if (tcp_ip4_spec->ip4src == htonl(0xFFFFFFFF)) + new_mask |= I40E_L3_SRC_MASK; + else if (!tcp_ip4_spec->ip4src) + new_mask &= ~I40E_L3_SRC_MASK; + else + return -EOPNOTSUPP; + + /* IPv4 destination address */ + if (tcp_ip4_spec->ip4dst == htonl(0xFFFFFFFF)) + new_mask |= I40E_L3_DST_MASK; + else if (!tcp_ip4_spec->ip4dst) + new_mask &= ~I40E_L3_DST_MASK; + else + return -EOPNOTSUPP; + + /* L4 source port */ + if (tcp_ip4_spec->psrc == htons(0xFFFF)) + new_mask |= I40E_L4_SRC_MASK; + else if (!tcp_ip4_spec->psrc) + new_mask &= ~I40E_L4_SRC_MASK; + else + return -EOPNOTSUPP; + + /* L4 destination port */ + if (tcp_ip4_spec->pdst == htons(0xFFFF)) + new_mask |= I40E_L4_DST_MASK; + else if (!tcp_ip4_spec->pdst) + new_mask &= ~I40E_L4_DST_MASK; + else + return -EOPNOTSUPP; + + /* Filtering on Type of Service is not supported. */ + if (tcp_ip4_spec->tos) + return -EOPNOTSUPP; + + break; + case IP_USER_FLOW: + usr_ip4_spec = &fsp->m_u.usr_ip4_spec; + + /* IPv4 source address */ + if (usr_ip4_spec->ip4src == htonl(0xFFFFFFFF)) + new_mask |= I40E_L3_SRC_MASK; + else if (!usr_ip4_spec->ip4src) + new_mask &= ~I40E_L3_SRC_MASK; + else + return -EOPNOTSUPP; + + /* IPv4 destination address */ + if (usr_ip4_spec->ip4dst == htonl(0xFFFFFFFF)) + new_mask |= I40E_L3_DST_MASK; + else if (!usr_ip4_spec->ip4dst) + new_mask &= ~I40E_L3_DST_MASK; + else + return -EOPNOTSUPP; + + /* First 4 bytes of L4 header */ + if (usr_ip4_spec->l4_4_bytes == htonl(0xFFFFFFFF)) + new_mask |= I40E_L4_SRC_MASK | I40E_L4_DST_MASK; + else if (!usr_ip4_spec->l4_4_bytes) + new_mask &= ~(I40E_L4_SRC_MASK | I40E_L4_DST_MASK); + else + return -EOPNOTSUPP; + + /* Filtering on Type of Service is not supported. 
*/ + if (usr_ip4_spec->tos) + return -EOPNOTSUPP; + + /* Filtering on IP version is not supported */ + if (usr_ip4_spec->ip_ver) + return -EINVAL; + + /* Filtering on L4 protocol is not supported */ + if (usr_ip4_spec->proto) + return -EINVAL; + + break; + default: + return -EOPNOTSUPP; + } + + /* First, clear all flexible filter entries */ + new_mask &= ~I40E_FLEX_INPUT_MASK; + + /* If we have a flexible filter, try to add this offset to the correct + * flexible filter PIT list. Once finished, we can update the mask. + * If the src_offset changed, we will get a new mask value which will + * trigger an input set change. + */ + if (userdef->flex_filter) { + struct i40e_flex_pit *l3_flex_pit = NULL, *flex_pit = NULL; + + /* Flexible offset must be even, since the flexible payload + * must be aligned on 2-byte boundary. + */ + if (userdef->flex_offset & 0x1) { + dev_warn(&pf->pdev->dev, + "Flexible data offset must be 2-byte aligned\n"); + return -EINVAL; + } + + src_offset = userdef->flex_offset >> 1; + + /* FLX_PIT source offset value is only so large */ + if (src_offset > I40E_MAX_FLEX_SRC_OFFSET) { + dev_warn(&pf->pdev->dev, + "Flexible data must reside within first 64 bytes of the packet payload\n"); + return -EINVAL; + } + + /* See if this offset has already been programmed. If we get + * an ERR_PTR, then the filter is not safe to add. Otherwise, + * if we get a NULL pointer, this means we will need to add + * the offset. + */ + flex_pit = i40e_find_flex_offset(&pf->l4_flex_pit_list, + src_offset); + if (IS_ERR(flex_pit)) + return PTR_ERR(flex_pit); + + /* IP_USER_FLOW filters match both L4 (ICMP) and L3 (unknown) + * packet types, and thus we need to program both L3 and L4 + * flexible values. These must have identical flexible index, + * as otherwise we can't correctly program the input set. So + * we'll find both an L3 and L4 index and make sure they are + * the same. + */ + if (flex_l3) { + l3_flex_pit = + i40e_find_flex_offset(&pf->l3_flex_pit_list, + src_offset); + if (IS_ERR(l3_flex_pit)) + return PTR_ERR(l3_flex_pit); + + if (flex_pit) { + /* If we already had a matching L4 entry, we + * need to make sure that the L3 entry we + * obtained uses the same index. + */ + if (l3_flex_pit) { + if (l3_flex_pit->pit_index != + flex_pit->pit_index) { + return -EINVAL; + } + } else { + new_flex_offset = true; + } + } else { + flex_pit = l3_flex_pit; + } + } + + /* If we didn't find an existing flex offset, we need to + * program a new one. However, we don't immediately program it + * here because we will wait to program until after we check + * that it is safe to change the input set. + */ + if (!flex_pit) { + new_flex_offset = true; + pit_index = i40e_unused_pit_index(pf); + } else { + pit_index = flex_pit->pit_index; + } + + /* Update the mask with the new offset */ + new_mask |= i40e_pit_index_to_mask(pit_index); + } + + /* If the mask and flexible filter offsets for this filter match the + * currently programmed values we don't need any input set change, so + * this filter is safe to install. 
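The flexible-payload checks above in isolation: the user's byte offset must be 2-byte aligned, is converted to a word offset, and must stay within the first 64 bytes of the payload quoted by the warning message (the precise bound is the driver's I40E_MAX_FLEX_SRC_OFFSET; 64 is used here as an approximation):

/* Convert a user-supplied byte offset into the 2-byte word offset the
 * hardware expects; reject unaligned or out-of-range values.
 */
int flex_byte_offset_to_word(unsigned int byte_off, unsigned int *word_off)
{
        if (byte_off & 0x1)
                return -1;              /* flexible payload is matched on 2-byte words */
        if (byte_off >= 64)
                return -1;              /* approximate bound; driver uses I40E_MAX_FLEX_SRC_OFFSET */
        *word_off = byte_off >> 1;
        return 0;
}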
+ */ + if (new_mask == current_mask && !new_flex_offset) + return 0; + + netif_info(pf, drv, vsi->netdev, "Input set change requested for %s flows:\n", + i40e_flow_str(fsp)); + i40e_print_input_set(vsi, current_mask, new_mask); + if (new_flex_offset) { + netif_info(pf, drv, vsi->netdev, "FLEX index %d: Offset -> %d", + pit_index, src_offset); + } + + /* Hardware input sets are global across multiple ports, so even the + * main port cannot change them when in MFP mode as this would impact + * any filters on the other ports. + */ + if (pf->flags & I40E_FLAG_MFP_ENABLED) { + netif_err(pf, drv, vsi->netdev, "Cannot change Flow Director input sets while MFP is enabled\n"); + return -EOPNOTSUPP; + } + + /* This filter requires us to update the input set. However, hardware + * only supports one input set per flow type, and does not support + * separate masks for each filter. This means that we can only support + * a single mask for all filters of a specific type. + * + * If we have preexisting filters, they obviously depend on the + * current programmed input set. Display a diagnostic message in this + * case explaining why the filter could not be accepted. + */ + if (*fdir_filter_count) { + netif_err(pf, drv, vsi->netdev, "Cannot change input set for %s flows until %d preexisting filters are removed\n", + i40e_flow_str(fsp), + *fdir_filter_count); + return -EOPNOTSUPP; + } + + i40e_write_fd_input_set(pf, index, new_mask); + + /* IP_USER_FLOW filters match both IPv4/Other and IPv4/Fragmented + * frames. If we're programming the input set for IPv4/Other, we also + * need to program the IPv4/Fragmented input set. Since we don't have + * separate support, we'll always assume and enforce that the two flow + * types must have matching input sets. + */ + if (index == I40E_FILTER_PCTYPE_NONF_IPV4_OTHER) + i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_FRAG_IPV4, + new_mask); + + /* Add the new offset and update table, if necessary */ + if (new_flex_offset) { + err = i40e_add_flex_offset(&pf->l4_flex_pit_list, src_offset, + pit_index); + if (err) + return err; + + if (flex_l3) { + err = i40e_add_flex_offset(&pf->l3_flex_pit_list, + src_offset, + pit_index); + if (err) + return err; + } + + i40e_reprogram_flex_pit(pf); + } + + return 0; +} + +/** + * i40e_match_fdir_filter - Return true of two filters match + * @a: pointer to filter struct + * @b: pointer to filter struct + * + * Returns true if the two filters match exactly the same criteria. I.e. they + * match the same flow type and have the same parameters. We don't need to + * check any input-set since all filters of the same flow type must use the + * same input set. + **/ +static bool i40e_match_fdir_filter(struct i40e_fdir_filter *a, + struct i40e_fdir_filter *b) +{ + /* The filters do not much if any of these criteria differ. */ + if (a->dst_ip != b->dst_ip || + a->src_ip != b->src_ip || + a->dst_port != b->dst_port || + a->src_port != b->src_port || + a->flow_type != b->flow_type || + a->ip4_proto != b->ip4_proto) + return false; + + return true; +} + +/** + * i40e_disallow_matching_filters - Check that new filters differ + * @vsi: pointer to the targeted VSI + * @input: new filter to check + * + * Due to hardware limitations, it is not possible for two filters that match + * similar criteria to be programmed at the same time. This is true for a few + * reasons: + * + * (a) all filters matching a particular flow type must use the same input + * set, that is they must match the same criteria. 
+ * (b) different flow types will never match the same packet, as the flow type + * is decided by hardware before checking which rules apply. + * (c) hardware has no way to distinguish which order filters apply in. + * + * Due to this, we can't really support using the location data to order + * filters in the hardware parsing. It is technically possible for the user to + * request two filters matching the same criteria but which select different + * queues. In this case, rather than keep both filters in the list, we reject + * the 2nd filter when the user requests adding it. + * + * This avoids needing to track location for programming the filter to + * hardware, and ensures that we avoid some strange scenarios involving + * deleting filters which match the same criteria. + **/ +static int i40e_disallow_matching_filters(struct i40e_vsi *vsi, + struct i40e_fdir_filter *input) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_fdir_filter *rule; + struct hlist_node *node2; + + /* Loop through every filter, and check that it doesn't match */ + hlist_for_each_entry_safe(rule, node2, + &pf->fdir_filter_list, fdir_node) { + /* Don't check the filters match if they share the same fd_id, + * since the new filter is actually just updating the target + * of the old filter. + */ + if (rule->fd_id == input->fd_id) + continue; + + /* If any filters match, then print a warning message to the + * kernel message buffer and bail out. + */ + if (i40e_match_fdir_filter(rule, input)) { + dev_warn(&pf->pdev->dev, + "Existing user defined filter %d already matches this flow.\n", + rule->fd_id); + return -EINVAL; + } + } + + return 0; +} + +/** + * i40e_add_fdir_ethtool - Add/Remove Flow Director filters + * @vsi: pointer to the targeted VSI + * @cmd: command to get or set RX flow classification rules + * + * Add Flow Director filters for a specific flow spec based on their + * protocol. Returns 0 if the filters were successfully added. + **/ +static int i40e_add_fdir_ethtool(struct i40e_vsi *vsi, + struct ethtool_rxnfc *cmd) +{ + struct i40e_rx_flow_userdef userdef; + struct ethtool_rx_flow_spec *fsp; + struct i40e_fdir_filter *input; + u16 dest_vsi = 0, q_index = 0; + struct i40e_pf *pf; + int ret = -EINVAL; + u8 dest_ctl; + + if (!vsi) + return -EINVAL; + pf = vsi->back; + + if (!(pf->flags & I40E_FLAG_FD_SB_ENABLED)) + return -EOPNOTSUPP; + + if (test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) + return -ENOSPC; + + if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state) || + test_bit(__I40E_RESET_INTR_RECEIVED, pf->state)) + return -EBUSY; + + if (test_bit(__I40E_FD_FLUSH_REQUESTED, pf->state)) + return -EBUSY; + + fsp = (struct ethtool_rx_flow_spec *)&cmd->fs; + + /* Parse the user-defined field */ + if (i40e_parse_rx_flow_user_data(fsp, &userdef)) + return -EINVAL; + + /* Extended MAC field is not supported */ + if (fsp->flow_type & FLOW_MAC_EXT) + return -EINVAL; + + ret = i40e_check_fdir_input_set(vsi, fsp, &userdef); + if (ret) + return ret; + + if (fsp->location >= (pf->hw.func_caps.fd_filters_best_effort + + pf->hw.func_caps.fd_filters_guaranteed)) { + return -EINVAL; + } + + /* ring_cookie is either the drop index, or is a mask of the queue + * index and VF id we wish to target. 
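+ *
+ * ethtool_get_flow_spec_ring() extracts the queue index from the cookie
+ * and ethtool_get_flow_spec_ring_vf() the VF id; a VF id of zero targets
+ * the PF's own VSI, otherwise the id is one-based and converted to a
+ * zero-based index below.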
+ */ + if (fsp->ring_cookie == RX_CLS_FLOW_DISC) { + dest_ctl = I40E_FILTER_PROGRAM_DESC_DEST_DROP_PACKET; + } else { + u32 ring = ethtool_get_flow_spec_ring(fsp->ring_cookie); + u8 vf = ethtool_get_flow_spec_ring_vf(fsp->ring_cookie); + + if (!vf) { + if (ring >= vsi->num_queue_pairs) + return -EINVAL; + dest_vsi = vsi->id; + } else { + /* VFs are zero-indexed, so we subtract one here */ + vf--; + + if (vf >= pf->num_alloc_vfs) + return -EINVAL; + if (ring >= pf->vf[vf].num_queue_pairs) + return -EINVAL; + dest_vsi = pf->vf[vf].lan_vsi_id; + } + dest_ctl = I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX; + q_index = ring; + } + + input = kzalloc(sizeof(*input), GFP_KERNEL); + + if (!input) + return -ENOMEM; + + input->fd_id = fsp->location; + input->q_index = q_index; + input->dest_vsi = dest_vsi; + input->dest_ctl = dest_ctl; + input->fd_status = I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID; + input->cnt_index = I40E_FD_SB_STAT_IDX(pf->hw.pf_id); + input->dst_ip = fsp->h_u.tcp_ip4_spec.ip4src; + input->src_ip = fsp->h_u.tcp_ip4_spec.ip4dst; + input->flow_type = fsp->flow_type & ~FLOW_EXT; + input->ip4_proto = fsp->h_u.usr_ip4_spec.proto; + + /* Reverse the src and dest notion, since the HW expects them to be from + * Tx perspective where as the input from user is from Rx filter view. + */ + input->dst_port = fsp->h_u.tcp_ip4_spec.psrc; + input->src_port = fsp->h_u.tcp_ip4_spec.pdst; + input->dst_ip = fsp->h_u.tcp_ip4_spec.ip4src; + input->src_ip = fsp->h_u.tcp_ip4_spec.ip4dst; + + if (userdef.flex_filter) { + input->flex_filter = true; + input->flex_word = cpu_to_be16(userdef.flex_word); + input->flex_offset = userdef.flex_offset; + } + + /* Avoid programming two filters with identical match criteria. */ + ret = i40e_disallow_matching_filters(vsi, input); + if (ret) + goto free_filter_memory; + + /* Add the input filter to the fdir_input_list, possibly replacing + * a previous filter. Do not free the input structure after adding it + * to the list as this would cause a use-after-free bug. + */ + i40e_update_ethtool_fdir_entry(vsi, input, fsp->location, NULL); + ret = i40e_add_del_fdir(vsi, input, true); + if (ret) + goto remove_sw_rule; + return 0; + +remove_sw_rule: + hlist_del(&input->fdir_node); + pf->fdir_pf_active_filters--; +free_filter_memory: + kfree(input); + return ret; +} + +/** + * i40e_set_rxnfc - command to set RX flow classification rules + * @netdev: network interface device structure + * @cmd: ethtool rxnfc command + * + * Returns Success if the command is supported. + **/ +static int i40e_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + int ret = -EOPNOTSUPP; + + switch (cmd->cmd) { + case ETHTOOL_SRXFH: + ret = i40e_set_rss_hash_opt(pf, cmd); + break; + case ETHTOOL_SRXCLSRLINS: + ret = i40e_add_fdir_ethtool(vsi, cmd); + break; + case ETHTOOL_SRXCLSRLDEL: + ret = i40e_del_fdir_entry(vsi, cmd); + break; + default: + break; + } + + return ret; +} + +/** + * i40e_max_channels - get Max number of combined channels supported + * @vsi: vsi pointer + **/ +static unsigned int i40e_max_channels(struct i40e_vsi *vsi) +{ + /* TODO: This code assumes DCB and FD is disabled for now. */ + return vsi->alloc_queue_pairs; +} + +/** + * i40e_get_channels - Get the current channels enabled and max supported etc. 
+ * @netdev: network interface device structure + * @ch: ethtool channels structure + * + * We don't support separate tx and rx queues as channels. The other count + * represents how many queues are being used for control. max_combined counts + * how many queue pairs we can support. They may not be mapped 1 to 1 with + * q_vectors since we support a lot more queue pairs than q_vectors. + **/ +static void i40e_get_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + + /* report maximum channels */ + ch->max_combined = i40e_max_channels(vsi); + + /* report info for other vector */ + ch->other_count = (pf->flags & I40E_FLAG_FD_SB_ENABLED) ? 1 : 0; + ch->max_other = ch->other_count; + + /* Note: This code assumes DCB is disabled for now. */ + ch->combined_count = vsi->num_queue_pairs; +} + +/** + * i40e_set_channels - Set the new channels count. + * @netdev: network interface device structure + * @ch: ethtool channels structure + * + * The new channels count may not be the same as requested by the user + * since it gets rounded down to a power of 2 value. + **/ +static int i40e_set_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + const u8 drop = I40E_FILTER_PROGRAM_DESC_DEST_DROP_PACKET; + struct i40e_netdev_priv *np = netdev_priv(dev); + unsigned int count = ch->combined_count; + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_fdir_filter *rule; + struct hlist_node *node2; + int new_count; + int err = 0; + + /* We do not support setting channels for any other VSI at present */ + if (vsi->type != I40E_VSI_MAIN) + return -EINVAL; + + /* We do not support setting channels via ethtool when TCs are + * configured through mqprio + */ + if (pf->flags & I40E_FLAG_TC_MQPRIO) + return -EINVAL; + + /* verify they are not requesting separate vectors */ + if (!count || ch->rx_count || ch->tx_count) + return -EINVAL; + + /* verify other_count has not changed */ + if (ch->other_count != ((pf->flags & I40E_FLAG_FD_SB_ENABLED) ? 1 : 0)) + return -EINVAL; + + /* verify the number of channels does not exceed hardware limits */ + if (count > i40e_max_channels(vsi)) + return -EINVAL; + + /* verify that the number of channels does not invalidate any current + * flow director rules + */ + hlist_for_each_entry_safe(rule, node2, + &pf->fdir_filter_list, fdir_node) { + if (rule->dest_ctl != drop && count <= rule->q_index) { + dev_warn(&pf->pdev->dev, + "Existing user defined filter %d assigns flow to queue %d\n", + rule->fd_id, rule->q_index); + err = -EINVAL; + } + } + + if (err) { + dev_err(&pf->pdev->dev, + "Existing filter rules must be deleted to reduce combined channel count to %d\n", + count); + return err; + } + + /* update feature limits from largest to smallest supported values */ + /* TODO: Flow director limit, DCB etc */ + + /* use rss_reconfig to rebuild with new queue count and update traffic + * class queue mapping + */ + new_count = i40e_reconfig_rss_queues(pf, count); + if (new_count > 0) + return 0; + else + return -EINVAL; +} + +/** + * i40e_get_rxfh_key_size - get the RSS hash key size + * @netdev: network interface device structure + * + * Returns the table size. 
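+ * (For this callback that is the RSS hash key size in bytes,
+ * I40E_HKEY_ARRAY_SIZE.)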
+ **/ +static u32 i40e_get_rxfh_key_size(struct net_device *netdev) +{ + return I40E_HKEY_ARRAY_SIZE; +} + +/** + * i40e_get_rxfh_indir_size - get the rx flow hash indirection table size + * @netdev: network interface device structure + * + * Returns the table size. + **/ +static u32 i40e_get_rxfh_indir_size(struct net_device *netdev) +{ + return I40E_HLUT_ARRAY_SIZE; +} + +/** + * i40e_get_rxfh - get the rx flow hash indirection table + * @netdev: network interface device structure + * @indir: indirection table + * @key: hash key + * @hfunc: hash function + * + * Reads the indirection table directly from the hardware. Returns 0 on + * success. + **/ +static int i40e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + u8 *lut, *seed = NULL; + int ret; + u16 i; + + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + + if (!indir) + return 0; + + seed = key; + lut = kzalloc(I40E_HLUT_ARRAY_SIZE, GFP_KERNEL); + if (!lut) + return -ENOMEM; + ret = i40e_get_rss(vsi, seed, lut, I40E_HLUT_ARRAY_SIZE); + if (ret) + goto out; + for (i = 0; i < I40E_HLUT_ARRAY_SIZE; i++) + indir[i] = (u32)(lut[i]); + +out: + kfree(lut); + + return ret; +} + +/** + * i40e_set_rxfh - set the rx flow hash indirection table + * @netdev: network interface device structure + * @indir: indirection table + * @key: hash key + * + * Returns -EINVAL if the table specifies an invalid queue id, otherwise + * returns 0 after programming the table. + **/ +static int i40e_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key, const u8 hfunc) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + u8 *seed = NULL; + u16 i; + + if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + return -EOPNOTSUPP; + + if (key) { + if (!vsi->rss_hkey_user) { + vsi->rss_hkey_user = kzalloc(I40E_HKEY_ARRAY_SIZE, + GFP_KERNEL); + if (!vsi->rss_hkey_user) + return -ENOMEM; + } + memcpy(vsi->rss_hkey_user, key, I40E_HKEY_ARRAY_SIZE); + seed = vsi->rss_hkey_user; + } + if (!vsi->rss_lut_user) { + vsi->rss_lut_user = kzalloc(I40E_HLUT_ARRAY_SIZE, GFP_KERNEL); + if (!vsi->rss_lut_user) + return -ENOMEM; + } + + /* Each 32 bits pointed by 'indir' is stored with a lut entry */ + if (indir) + for (i = 0; i < I40E_HLUT_ARRAY_SIZE; i++) + vsi->rss_lut_user[i] = (u8)(indir[i]); + else + i40e_fill_rss_lut(pf, vsi->rss_lut_user, I40E_HLUT_ARRAY_SIZE, + vsi->rss_size); + + return i40e_config_rss(vsi, seed, vsi->rss_lut_user, + I40E_HLUT_ARRAY_SIZE); +} + +/** + * i40e_get_priv_flags - report device private flags + * @dev: network interface device structure + * + * The get string set count and the string set should be matched for each + * flag returned. Add new strings for each flag to the i40e_gstrings_priv_flags + * array. + * + * Returns a u32 bitmap of flags. 
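+ *
+ * Per-PF flags occupy the low bits; the global flags, reported only on
+ * PF 0, follow immediately after them at bit positions
+ * I40E_PRIV_FLAGS_STR_LEN + j.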
+ **/ +static u32 i40e_get_priv_flags(struct net_device *dev) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + u32 i, j, ret_flags = 0; + + for (i = 0; i < I40E_PRIV_FLAGS_STR_LEN; i++) { + const struct i40e_priv_flags *priv_flags; + + priv_flags = &i40e_gstrings_priv_flags[i]; + + if (priv_flags->flag & pf->flags) + ret_flags |= BIT(i); + } + + if (pf->hw.pf_id != 0) + return ret_flags; + + for (j = 0; j < I40E_GL_PRIV_FLAGS_STR_LEN; j++) { + const struct i40e_priv_flags *priv_flags; + + priv_flags = &i40e_gl_gstrings_priv_flags[j]; + + if (priv_flags->flag & pf->flags) + ret_flags |= BIT(i + j); + } + + return ret_flags; +} + +/** + * i40e_set_priv_flags - set private flags + * @dev: network interface device structure + * @flags: bit flags to be set + **/ +static int i40e_set_priv_flags(struct net_device *dev, u32 flags) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + u64 orig_flags, new_flags, changed_flags; + u32 i, j; + + orig_flags = READ_ONCE(pf->flags); + new_flags = orig_flags; + + for (i = 0; i < I40E_PRIV_FLAGS_STR_LEN; i++) { + const struct i40e_priv_flags *priv_flags; + + priv_flags = &i40e_gstrings_priv_flags[i]; + + if (flags & BIT(i)) + new_flags |= priv_flags->flag; + else + new_flags &= ~(priv_flags->flag); + + /* If this is a read-only flag, it can't be changed */ + if (priv_flags->read_only && + ((orig_flags ^ new_flags) & ~BIT(i))) + return -EOPNOTSUPP; + } + + if (pf->hw.pf_id != 0) + goto flags_complete; + + for (j = 0; j < I40E_GL_PRIV_FLAGS_STR_LEN; j++) { + const struct i40e_priv_flags *priv_flags; + + priv_flags = &i40e_gl_gstrings_priv_flags[j]; + + if (flags & BIT(i + j)) + new_flags |= priv_flags->flag; + else + new_flags &= ~(priv_flags->flag); + + /* If this is a read-only flag, it can't be changed */ + if (priv_flags->read_only && + ((orig_flags ^ new_flags) & ~BIT(i))) + return -EOPNOTSUPP; + } + +flags_complete: + changed_flags = orig_flags ^ new_flags; + + /* Before we finalize any flag changes, we need to perform some + * checks to ensure that the changes are supported and safe. + */ + + /* ATR eviction is not supported on all devices */ + if ((new_flags & I40E_FLAG_HW_ATR_EVICT_ENABLED) && + !(pf->hw_features & I40E_HW_ATR_EVICT_CAPABLE)) + return -EOPNOTSUPP; + + /* If the driver detected FW LLDP was disabled on init, this flag could + * be set, however we do not support _changing_ the flag if NPAR is + * enabled or FW API version < 1.7. There are situations where older + * FW versions/NPAR enabled PFs could disable LLDP, however we _must_ + * not allow the user to enable/disable LLDP with this flag on + * unsupported FW versions. + */ + if (changed_flags & I40E_FLAG_DISABLE_FW_LLDP) { + if (!(pf->hw_features & I40E_HW_STOPPABLE_FW_LLDP)) { + dev_warn(&pf->pdev->dev, + "Device does not support changing FW LLDP\n"); + return -EOPNOTSUPP; + } + } + + /* Now that we've checked to ensure that the new flags are valid, load + * them into place. Since we only modify flags either (a) during + * initialization or (b) while holding the RTNL lock, we don't need + * anything fancy here. + */ + pf->flags = new_flags; + + /* Process any additional changes needed as a result of flag changes. + * The changed_flags value reflects the list of bits that were + * changed in the code above. 
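+ * (changed_flags is orig_flags XOR new_flags, so a set bit means the
+ * corresponding flag was toggled in either direction.)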
+ */ + + /* Flush current ATR settings if ATR was disabled */ + if ((changed_flags & I40E_FLAG_FD_ATR_ENABLED) && + !(pf->flags & I40E_FLAG_FD_ATR_ENABLED)) { + set_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state); + set_bit(__I40E_FD_FLUSH_REQUESTED, pf->state); + } + + if (changed_flags & I40E_FLAG_TRUE_PROMISC_SUPPORT) { + u16 sw_flags = 0, valid_flags = 0; + int ret; + + if (!(pf->flags & I40E_FLAG_TRUE_PROMISC_SUPPORT)) + sw_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC; + valid_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC; + ret = i40e_aq_set_switch_config(&pf->hw, sw_flags, valid_flags, + 0, NULL); + if (ret && pf->hw.aq.asq_last_status != I40E_AQ_RC_ESRCH) { + dev_info(&pf->pdev->dev, + "couldn't set switch config bits, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + /* not a fatal problem, just keep going */ + } + } + + if ((changed_flags & pf->flags & + I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED) && + (pf->flags & I40E_FLAG_MFP_ENABLED)) + dev_warn(&pf->pdev->dev, + "Turning on link-down-on-close flag may affect other partitions\n"); + + if (changed_flags & I40E_FLAG_DISABLE_FW_LLDP) { + if (pf->flags & I40E_FLAG_DISABLE_FW_LLDP) { + struct i40e_dcbx_config *dcbcfg; + int i; + + i40e_aq_stop_lldp(&pf->hw, true, NULL); + i40e_aq_set_dcb_parameters(&pf->hw, true, NULL); + /* reset local_dcbx_config to default */ + dcbcfg = &pf->hw.local_dcbx_config; + dcbcfg->etscfg.willing = 1; + dcbcfg->etscfg.maxtcs = 0; + dcbcfg->etscfg.tcbwtable[0] = 100; + for (i = 1; i < I40E_MAX_TRAFFIC_CLASS; i++) + dcbcfg->etscfg.tcbwtable[i] = 0; + for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) + dcbcfg->etscfg.prioritytable[i] = 0; + dcbcfg->etscfg.tsatable[0] = I40E_IEEE_TSA_ETS; + dcbcfg->pfc.willing = 1; + dcbcfg->pfc.pfccap = I40E_MAX_TRAFFIC_CLASS; + } else { + i40e_aq_start_lldp(&pf->hw, NULL); + } + } + + /* Issue reset to cause things to take effect, as additional bits + * are added we will need to create a mask of bits requiring reset + */ + if (changed_flags & (I40E_FLAG_VEB_STATS_ENABLED | + I40E_FLAG_LEGACY_RX | + I40E_FLAG_SOURCE_PRUNING_DISABLED | + I40E_FLAG_DISABLE_FW_LLDP)) + i40e_do_reset(pf, BIT(__I40E_PF_RESET_REQUESTED), true); + + return 0; +} + +/** + * i40e_get_module_info - get (Q)SFP+ module type info + * @netdev: network interface device structure + * @modinfo: module EEPROM size and layout information structure + **/ +static int i40e_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + u32 sff8472_comp = 0; + u32 sff8472_swap = 0; + u32 sff8636_rev = 0; + i40e_status status; + u32 type = 0; + + /* Check if firmware supports reading module EEPROM. */ + if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE)) { + netdev_err(vsi->netdev, "Module EEPROM memory read not supported. Please update the NVM image.\n"); + return -EINVAL; + } + + status = i40e_update_link_info(hw); + if (status) + return -EIO; + + if (hw->phy.link_info.phy_type == I40E_PHY_TYPE_EMPTY) { + netdev_err(vsi->netdev, "Cannot read module EEPROM memory. 
No module connected.\n"); + return -EINVAL; + } + + type = hw->phy.link_info.module_type[0]; + + switch (type) { + case I40E_MODULE_TYPE_SFP: + status = i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE, + I40E_I2C_EEPROM_DEV_ADDR, + I40E_MODULE_SFF_8472_COMP, + &sff8472_comp, NULL); + if (status) + return -EIO; + + status = i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE, + I40E_I2C_EEPROM_DEV_ADDR, + I40E_MODULE_SFF_8472_SWAP, + &sff8472_swap, NULL); + if (status) + return -EIO; + + /* Check if the module requires address swap to access + * the other EEPROM memory page. + */ + if (sff8472_swap & I40E_MODULE_SFF_ADDR_MODE) { + netdev_warn(vsi->netdev, "Module address swap to access page 0xA2 is not supported.\n"); + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; + } else if (sff8472_comp == 0x00) { + /* Module is not SFF-8472 compliant */ + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + } + break; + case I40E_MODULE_TYPE_QSFP_PLUS: + /* Read from memory page 0. */ + status = i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE, + 0, + I40E_MODULE_REVISION_ADDR, + &sff8636_rev, NULL); + if (status) + return -EIO; + /* Determine revision compliance byte */ + if (sff8636_rev > 0x02) { + /* Module is SFF-8636 compliant */ + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = I40E_MODULE_QSFP_MAX_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = I40E_MODULE_QSFP_MAX_LEN; + } + break; + case I40E_MODULE_TYPE_QSFP28: + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = I40E_MODULE_QSFP_MAX_LEN; + break; + default: + netdev_err(vsi->netdev, "Module type unrecognized\n"); + return -EINVAL; + } + return 0; +} + +/** + * i40e_get_module_eeprom - fills buffer with (Q)SFP+ module memory contents + * @netdev: network interface device structure + * @ee: EEPROM dump request structure + * @data: buffer to be filled with EEPROM contents + **/ +static int i40e_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, + u8 *data) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + bool is_sfp = false; + i40e_status status; + u32 value = 0; + int i; + + if (!ee || !ee->len || !data) + return -EINVAL; + + if (hw->phy.link_info.module_type[0] == I40E_MODULE_TYPE_SFP) + is_sfp = true; + + for (i = 0; i < ee->len; i++) { + u32 offset = i + ee->offset; + u32 addr = is_sfp ? I40E_I2C_EEPROM_DEV_ADDR : 0; + + /* Check if we need to access the other memory page */ + if (is_sfp) { + if (offset >= ETH_MODULE_SFF_8079_LEN) { + offset -= ETH_MODULE_SFF_8079_LEN; + addr = I40E_I2C_EEPROM_DEV_ADDR2; + } + } else { + while (offset >= ETH_MODULE_SFF_8436_LEN) { + /* Compute memory page number and offset. 
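+ * Offsets past the first ETH_MODULE_SFF_8436_LEN bytes fall into upper
+ * memory pages of ETH_MODULE_SFF_8436_LEN / 2 (128) bytes each; 'addr'
+ * carries the resulting page number for the register read below.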
*/ + offset -= ETH_MODULE_SFF_8436_LEN / 2; + addr++; + } + } + + status = i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE, + addr, offset, &value, NULL); + if (status) + return -EIO; + data[i] = value; + } + return 0; +} + +static const struct ethtool_ops i40e_ethtool_ops = { + .get_drvinfo = i40e_get_drvinfo, + .get_regs_len = i40e_get_regs_len, + .get_regs = i40e_get_regs, + .nway_reset = i40e_nway_reset, + .get_link = ethtool_op_get_link, + .get_wol = i40e_get_wol, + .set_wol = i40e_set_wol, + .set_eeprom = i40e_set_eeprom, + .get_eeprom_len = i40e_get_eeprom_len, + .get_eeprom = i40e_get_eeprom, + .get_ringparam = i40e_get_ringparam, + .set_ringparam = i40e_set_ringparam, + .get_pauseparam = i40e_get_pauseparam, + .set_pauseparam = i40e_set_pauseparam, + .get_msglevel = i40e_get_msglevel, + .set_msglevel = i40e_set_msglevel, + .get_rxnfc = i40e_get_rxnfc, + .set_rxnfc = i40e_set_rxnfc, + .self_test = i40e_diag_test, + .get_strings = i40e_get_strings, + .set_phys_id = i40e_set_phys_id, + .get_sset_count = i40e_get_sset_count, + .get_ethtool_stats = i40e_get_ethtool_stats, + .get_coalesce = i40e_get_coalesce, + .set_coalesce = i40e_set_coalesce, + .get_rxfh_key_size = i40e_get_rxfh_key_size, + .get_rxfh_indir_size = i40e_get_rxfh_indir_size, + .get_rxfh = i40e_get_rxfh, + .set_rxfh = i40e_set_rxfh, + .get_channels = i40e_get_channels, + .set_channels = i40e_set_channels, + .get_module_info = i40e_get_module_info, + .get_module_eeprom = i40e_get_module_eeprom, + .get_ts_info = i40e_get_ts_info, + .get_priv_flags = i40e_get_priv_flags, + .set_priv_flags = i40e_set_priv_flags, + .get_per_queue_coalesce = i40e_get_per_queue_coalesce, + .set_per_queue_coalesce = i40e_set_per_queue_coalesce, + .get_link_ksettings = i40e_get_link_ksettings, + .set_link_ksettings = i40e_set_link_ksettings, +}; + +void i40e_set_ethtool_ops(struct net_device *netdev) +{ + netdev->ethtool_ops = &i40e_ethtool_ops; +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_hmc.c b/drivers/net/ethernet/intel/i40e/i40e_hmc.c new file mode 100644 index 0000000..6d4b590 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_hmc.c @@ -0,0 +1,358 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include "i40e_osdep.h" +#include "i40e_register.h" +#include "i40e_status.h" +#include "i40e_alloc.h" +#include "i40e_hmc.h" +#include "i40e_type.h" + +/** + * i40e_add_sd_table_entry - Adds a segment descriptor to the table + * @hw: pointer to our hw struct + * @hmc_info: pointer to the HMC configuration information struct + * @sd_index: segment descriptor index to manipulate + * @type: what type of segment descriptor we're manipulating + * @direct_mode_sz: size to alloc in direct mode + **/ +i40e_status i40e_add_sd_table_entry(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 sd_index, + enum i40e_sd_entry_type type, + u64 direct_mode_sz) +{ + enum i40e_memory_type mem_type __attribute__((unused)); + struct i40e_hmc_sd_entry *sd_entry; + bool dma_mem_alloc_done = false; + struct i40e_dma_mem mem; + i40e_status ret_code = I40E_SUCCESS; + u64 alloc_len; + + if (NULL == hmc_info->sd_table.sd_entry) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_add_sd_table_entry: bad sd_entry\n"); + goto exit; + } + + if (sd_index >= hmc_info->sd_table.sd_cnt) { + ret_code = I40E_ERR_INVALID_SD_INDEX; + hw_dbg(hw, "i40e_add_sd_table_entry: bad sd_index\n"); + goto exit; + } + + sd_entry = &hmc_info->sd_table.sd_entry[sd_index]; + if (!sd_entry->valid) { + if (I40E_SD_TYPE_PAGED == type) { + mem_type = i40e_mem_pd; + alloc_len = I40E_HMC_PAGED_BP_SIZE; + } else { + mem_type = i40e_mem_bp_jumbo; + alloc_len = direct_mode_sz; + } + + /* allocate a 4K pd page or 2M backing page */ + ret_code = i40e_allocate_dma_mem(hw, &mem, mem_type, alloc_len, + I40E_HMC_PD_BP_BUF_ALIGNMENT); + if (ret_code) + goto exit; + dma_mem_alloc_done = true; + if (I40E_SD_TYPE_PAGED == type) { + ret_code = i40e_allocate_virt_mem(hw, + &sd_entry->u.pd_table.pd_entry_virt_mem, + sizeof(struct i40e_hmc_pd_entry) * 512); + if (ret_code) + goto exit; + sd_entry->u.pd_table.pd_entry = + (struct i40e_hmc_pd_entry *) + sd_entry->u.pd_table.pd_entry_virt_mem.va; + sd_entry->u.pd_table.pd_page_addr = mem; + } else { + sd_entry->u.bp.addr = mem; + sd_entry->u.bp.sd_pd_index = sd_index; + } + /* initialize the sd entry */ + hmc_info->sd_table.sd_entry[sd_index].entry_type = type; + + /* increment the ref count */ + I40E_INC_SD_REFCNT(&hmc_info->sd_table); + } + /* Increment backing page reference count */ + if (I40E_SD_TYPE_DIRECT == sd_entry->entry_type) + I40E_INC_BP_REFCNT(&sd_entry->u.bp); +exit: + if (ret_code) + if (dma_mem_alloc_done) + i40e_free_dma_mem(hw, &mem); + + return ret_code; +} + +/** + * i40e_add_pd_table_entry - Adds page descriptor to the specified table + * @hw: pointer to our HW structure + * @hmc_info: pointer to the HMC configuration information structure + * @pd_index: which page descriptor index to manipulate + * @rsrc_pg: if not NULL, use preallocated page instead of allocating new one. + * + * This function: + * 1. Initializes the pd entry + * 2. Adds pd_entry in the pd_table + * 3. Mark the entry valid in i40e_hmc_pd_entry structure + * 4. Initializes the pd_entry's ref count to 1 + * assumptions: + * 1. The memory for pd should be pinned down, physically contiguous and + * aligned on 4K boundary and zeroed memory. + * 2. It should be 4K in size. 
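+ *
+ * The page's physical address is written into the PD table with bit 0
+ * set as the valid bit (the 'page_desc' value below).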
+ **/ +i40e_status i40e_add_pd_table_entry(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 pd_index, + struct i40e_dma_mem *rsrc_pg) +{ + i40e_status ret_code = 0; + struct i40e_hmc_pd_table *pd_table; + struct i40e_hmc_pd_entry *pd_entry; + struct i40e_dma_mem mem; + struct i40e_dma_mem *page = &mem; + u32 sd_idx, rel_pd_idx; + u64 *pd_addr; + u64 page_desc; + + if (pd_index / I40E_HMC_PD_CNT_IN_SD >= hmc_info->sd_table.sd_cnt) { + ret_code = I40E_ERR_INVALID_PAGE_DESC_INDEX; + hw_dbg(hw, "i40e_add_pd_table_entry: bad pd_index\n"); + goto exit; + } + + /* find corresponding sd */ + sd_idx = (pd_index / I40E_HMC_PD_CNT_IN_SD); + if (I40E_SD_TYPE_PAGED != + hmc_info->sd_table.sd_entry[sd_idx].entry_type) + goto exit; + + rel_pd_idx = (pd_index % I40E_HMC_PD_CNT_IN_SD); + pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table; + pd_entry = &pd_table->pd_entry[rel_pd_idx]; + if (!pd_entry->valid) { + if (rsrc_pg) { + pd_entry->rsrc_pg = true; + page = rsrc_pg; + } else { + /* allocate a 4K backing page */ + ret_code = i40e_allocate_dma_mem(hw, page, i40e_mem_bp, + I40E_HMC_PAGED_BP_SIZE, + I40E_HMC_PD_BP_BUF_ALIGNMENT); + if (ret_code) + goto exit; + pd_entry->rsrc_pg = false; + } + + pd_entry->bp.addr = *page; + pd_entry->bp.sd_pd_index = pd_index; + pd_entry->bp.entry_type = I40E_SD_TYPE_PAGED; + /* Set page address and valid bit */ + page_desc = page->pa | 0x1; + + pd_addr = (u64 *)pd_table->pd_page_addr.va; + pd_addr += rel_pd_idx; + + /* Add the backing page physical address in the pd entry */ + memcpy(pd_addr, &page_desc, sizeof(u64)); + + pd_entry->sd_index = sd_idx; + pd_entry->valid = true; + I40E_INC_PD_REFCNT(pd_table); + } + I40E_INC_BP_REFCNT(&pd_entry->bp); +exit: + return ret_code; +} + +/** + * i40e_remove_pd_bp - remove a backing page from a page descriptor + * @hw: pointer to our HW structure + * @hmc_info: pointer to the HMC configuration information structure + * @idx: the page index + * @is_pf: distinguishes a VF from a PF + * + * This function: + * 1. Marks the entry in pd tabe (for paged address mode) or in sd table + * (for direct address mode) invalid. + * 2. Write to register PMPDINV to invalidate the backing page in FV cache + * 3. Decrement the ref count for the pd _entry + * assumptions: + * 1. Caller can deallocate the memory used by backing storage after this + * function returns. 
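+ * Note: a page that was handed in preallocated (pd_entry->rsrc_pg set)
+ * is not freed here; only pages allocated by i40e_add_pd_table_entry()
+ * are released.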
+ **/ +i40e_status i40e_remove_pd_bp(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx) +{ + i40e_status ret_code = 0; + struct i40e_hmc_pd_entry *pd_entry; + struct i40e_hmc_pd_table *pd_table; + struct i40e_hmc_sd_entry *sd_entry; + u32 sd_idx, rel_pd_idx; + u64 *pd_addr; + + /* calculate index */ + sd_idx = idx / I40E_HMC_PD_CNT_IN_SD; + rel_pd_idx = idx % I40E_HMC_PD_CNT_IN_SD; + if (sd_idx >= hmc_info->sd_table.sd_cnt) { + ret_code = I40E_ERR_INVALID_PAGE_DESC_INDEX; + hw_dbg(hw, "i40e_remove_pd_bp: bad idx\n"); + goto exit; + } + sd_entry = &hmc_info->sd_table.sd_entry[sd_idx]; + if (I40E_SD_TYPE_PAGED != sd_entry->entry_type) { + ret_code = I40E_ERR_INVALID_SD_TYPE; + hw_dbg(hw, "i40e_remove_pd_bp: wrong sd_entry type\n"); + goto exit; + } + /* get the entry and decrease its ref counter */ + pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table; + pd_entry = &pd_table->pd_entry[rel_pd_idx]; + I40E_DEC_BP_REFCNT(&pd_entry->bp); + if (pd_entry->bp.ref_cnt) + goto exit; + + /* mark the entry invalid */ + pd_entry->valid = false; + I40E_DEC_PD_REFCNT(pd_table); + pd_addr = (u64 *)pd_table->pd_page_addr.va; + pd_addr += rel_pd_idx; + memset(pd_addr, 0, sizeof(u64)); + I40E_INVALIDATE_PF_HMC_PD(hw, sd_idx, idx); + + /* free memory here */ + if (!pd_entry->rsrc_pg) + ret_code = i40e_free_dma_mem(hw, &pd_entry->bp.addr); + if (ret_code) + goto exit; + if (!pd_table->ref_cnt) + i40e_free_virt_mem(hw, &pd_table->pd_entry_virt_mem); +exit: + return ret_code; +} + +/** + * i40e_prep_remove_sd_bp - Prepares to remove a backing page from a sd entry + * @hmc_info: pointer to the HMC configuration information structure + * @idx: the page index + **/ +i40e_status i40e_prep_remove_sd_bp(struct i40e_hmc_info *hmc_info, + u32 idx) +{ + i40e_status ret_code = 0; + struct i40e_hmc_sd_entry *sd_entry; + + /* get the entry and decrease its ref counter */ + sd_entry = &hmc_info->sd_table.sd_entry[idx]; + I40E_DEC_BP_REFCNT(&sd_entry->u.bp); + if (sd_entry->u.bp.ref_cnt) { + ret_code = I40E_ERR_NOT_READY; + goto exit; + } + I40E_DEC_SD_REFCNT(&hmc_info->sd_table); + + /* mark the entry invalid */ + sd_entry->valid = false; +exit: + return ret_code; +} + +/** + * i40e_remove_sd_bp_new - Removes a backing page from a segment descriptor + * @hw: pointer to our hw struct + * @hmc_info: pointer to the HMC configuration information structure + * @idx: the page index + * @is_pf: used to distinguish between VF and PF + **/ +i40e_status i40e_remove_sd_bp_new(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx, bool is_pf) +{ + struct i40e_hmc_sd_entry *sd_entry; + + if (!is_pf) + return I40E_NOT_SUPPORTED; + + /* get the entry and decrease its ref counter */ + sd_entry = &hmc_info->sd_table.sd_entry[idx]; + I40E_CLEAR_PF_SD_ENTRY(hw, idx, I40E_SD_TYPE_DIRECT); + + return i40e_free_dma_mem(hw, &sd_entry->u.bp.addr); +} + +/** + * i40e_prep_remove_pd_page - Prepares to remove a PD page from sd entry. 
+ * @hmc_info: pointer to the HMC configuration information structure + * @idx: segment descriptor index to find the relevant page descriptor + **/ +i40e_status i40e_prep_remove_pd_page(struct i40e_hmc_info *hmc_info, + u32 idx) +{ + i40e_status ret_code = 0; + struct i40e_hmc_sd_entry *sd_entry; + + sd_entry = &hmc_info->sd_table.sd_entry[idx]; + + if (sd_entry->u.pd_table.ref_cnt) { + ret_code = I40E_ERR_NOT_READY; + goto exit; + } + + /* mark the entry invalid */ + sd_entry->valid = false; + + I40E_DEC_SD_REFCNT(&hmc_info->sd_table); +exit: + return ret_code; +} + +/** + * i40e_remove_pd_page_new - Removes a PD page from sd entry. + * @hw: pointer to our hw struct + * @hmc_info: pointer to the HMC configuration information structure + * @idx: segment descriptor index to find the relevant page descriptor + * @is_pf: used to distinguish between VF and PF + **/ +i40e_status i40e_remove_pd_page_new(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx, bool is_pf) +{ + struct i40e_hmc_sd_entry *sd_entry; + + if (!is_pf) + return I40E_NOT_SUPPORTED; + + sd_entry = &hmc_info->sd_table.sd_entry[idx]; + I40E_CLEAR_PF_SD_ENTRY(hw, idx, I40E_SD_TYPE_PAGED); + + return i40e_free_dma_mem(hw, &sd_entry->u.pd_table.pd_page_addr); +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_hmc.h b/drivers/net/ethernet/intel/i40e/i40e_hmc.h new file mode 100644 index 0000000..7b5fd33 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_hmc.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_HMC_H_ +#define _I40E_HMC_H_ + +#define I40E_HMC_MAX_BP_COUNT 512 + +/* forward-declare the HW struct for the compiler */ +struct i40e_hw; + +#define I40E_HMC_INFO_SIGNATURE 0x484D5347 /* HMSG */ +#define I40E_HMC_PD_CNT_IN_SD 512 +#define I40E_HMC_DIRECT_BP_SIZE 0x200000 /* 2M */ +#define I40E_HMC_PAGED_BP_SIZE 4096 +#define I40E_HMC_PD_BP_BUF_ALIGNMENT 4096 +#define I40E_FIRST_VF_FPM_ID 16 + +struct i40e_hmc_obj_info { + u64 base; /* base addr in FPM */ + u32 max_cnt; /* max count available for this hmc func */ + u32 cnt; /* count of objects driver actually wants to create */ + u64 size; /* size in bytes of one object */ +}; + +enum i40e_sd_entry_type { + I40E_SD_TYPE_INVALID = 0, + I40E_SD_TYPE_PAGED = 1, + I40E_SD_TYPE_DIRECT = 2 +}; + +struct i40e_hmc_bp { + enum i40e_sd_entry_type entry_type; + struct i40e_dma_mem addr; /* populate to be used by hw */ + u32 sd_pd_index; + u32 ref_cnt; +}; + +struct i40e_hmc_pd_entry { + struct i40e_hmc_bp bp; + u32 sd_index; + bool rsrc_pg; + bool valid; +}; + +struct i40e_hmc_pd_table { + struct i40e_dma_mem pd_page_addr; /* populate to be used by hw */ + struct i40e_hmc_pd_entry *pd_entry; /* [512] for sw book keeping */ + struct i40e_virt_mem pd_entry_virt_mem; /* virt mem for pd_entry */ + + u32 ref_cnt; + u32 sd_index; +}; + +struct i40e_hmc_sd_entry { + enum i40e_sd_entry_type entry_type; + bool valid; + + union { + struct i40e_hmc_pd_table pd_table; + struct i40e_hmc_bp bp; + } u; +}; + +struct i40e_hmc_sd_table { + struct i40e_virt_mem addr; /* used to track sd_entry allocations */ + u32 sd_cnt; + u32 ref_cnt; + struct i40e_hmc_sd_entry *sd_entry; /* (sd_cnt*512) entries max */ +}; + +struct i40e_hmc_info { + u32 signature; + /* equals to pci func num for PF and dynamically allocated for VFs */ + u8 hmc_fn_id; + u16 first_sd_index; /* index of the first available SD */ + + /* hmc objects */ + struct i40e_hmc_obj_info *hmc_obj; + struct i40e_virt_mem hmc_obj_virt_mem; + struct i40e_hmc_sd_table sd_table; +}; + +#define I40E_INC_SD_REFCNT(sd_table) ((sd_table)->ref_cnt++) +#define I40E_INC_PD_REFCNT(pd_table) ((pd_table)->ref_cnt++) +#define I40E_INC_BP_REFCNT(bp) ((bp)->ref_cnt++) + +#define I40E_DEC_SD_REFCNT(sd_table) ((sd_table)->ref_cnt--) +#define I40E_DEC_PD_REFCNT(pd_table) ((pd_table)->ref_cnt--) +#define I40E_DEC_BP_REFCNT(bp) ((bp)->ref_cnt--) + +/** + * I40E_SET_PF_SD_ENTRY - marks the sd entry as valid in the hardware + * @hw: pointer to our hw struct + * @pa: pointer to physical address + * @sd_index: segment descriptor index + * @type: if sd entry is direct or paged + **/ +#define I40E_SET_PF_SD_ENTRY(hw, pa, sd_index, type) \ +{ \ + u32 val1, val2, val3; \ + val1 = (u32)(upper_32_bits(pa)); \ + val2 = (u32)(pa) | (I40E_HMC_MAX_BP_COUNT << \ + I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) | \ + ((((type) == I40E_SD_TYPE_PAGED) ? 
0 : 1) << \ + I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT) | \ + BIT(I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT); \ + val3 = (sd_index) | BIT_ULL(I40E_PFHMC_SDCMD_PMSDWR_SHIFT); \ + wr32((hw), I40E_PFHMC_SDDATAHIGH, val1); \ + wr32((hw), I40E_PFHMC_SDDATALOW, val2); \ + wr32((hw), I40E_PFHMC_SDCMD, val3); \ +} + +/** + * I40E_CLEAR_PF_SD_ENTRY - marks the sd entry as invalid in the hardware + * @hw: pointer to our hw struct + * @sd_index: segment descriptor index + * @type: if sd entry is direct or paged + **/ +#define I40E_CLEAR_PF_SD_ENTRY(hw, sd_index, type) \ +{ \ + u32 val2, val3; \ + val2 = (I40E_HMC_MAX_BP_COUNT << \ + I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) | \ + ((((type) == I40E_SD_TYPE_PAGED) ? 0 : 1) << \ + I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT); \ + val3 = (sd_index) | BIT_ULL(I40E_PFHMC_SDCMD_PMSDWR_SHIFT); \ + wr32((hw), I40E_PFHMC_SDDATAHIGH, 0); \ + wr32((hw), I40E_PFHMC_SDDATALOW, val2); \ + wr32((hw), I40E_PFHMC_SDCMD, val3); \ +} + +/** + * I40E_INVALIDATE_PF_HMC_PD - Invalidates the pd cache in the hardware + * @hw: pointer to our hw struct + * @sd_idx: segment descriptor index + * @pd_idx: page descriptor index + **/ +#define I40E_INVALIDATE_PF_HMC_PD(hw, sd_idx, pd_idx) \ + wr32((hw), I40E_PFHMC_PDINV, \ + (((sd_idx) << I40E_PFHMC_PDINV_PMSDIDX_SHIFT) | \ + ((pd_idx) << I40E_PFHMC_PDINV_PMPDIDX_SHIFT))) + +/** + * I40E_FIND_SD_INDEX_LIMIT - finds segment descriptor index limit + * @hmc_info: pointer to the HMC configuration information structure + * @type: type of HMC resources we're searching + * @index: starting index for the object + * @cnt: number of objects we're trying to create + * @sd_idx: pointer to return index of the segment descriptor in question + * @sd_limit: pointer to return the maximum number of segment descriptors + * + * This function calculates the segment descriptor index and index limit + * for the resource defined by i40e_hmc_rsrc_type. + **/ +#define I40E_FIND_SD_INDEX_LIMIT(hmc_info, type, index, cnt, sd_idx, sd_limit)\ +{ \ + u64 fpm_addr, fpm_limit; \ + fpm_addr = (hmc_info)->hmc_obj[(type)].base + \ + (hmc_info)->hmc_obj[(type)].size * (index); \ + fpm_limit = fpm_addr + (hmc_info)->hmc_obj[(type)].size * (cnt);\ + *(sd_idx) = (u32)(fpm_addr / I40E_HMC_DIRECT_BP_SIZE); \ + *(sd_limit) = (u32)((fpm_limit - 1) / I40E_HMC_DIRECT_BP_SIZE); \ + /* add one more to the limit to correct our range */ \ + *(sd_limit) += 1; \ +} + +/** + * I40E_FIND_PD_INDEX_LIMIT - finds page descriptor index limit + * @hmc_info: pointer to the HMC configuration information struct + * @type: HMC resource type we're examining + * @idx: starting index for the object + * @cnt: number of objects we're trying to create + * @pd_index: pointer to return page descriptor index + * @pd_limit: pointer to return page descriptor index limit + * + * Calculates the page descriptor index and index limit for the resource + * defined by i40e_hmc_rsrc_type. 
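+ *
+ * Example: with 2MB direct backing pages, an object range starting at
+ * FPM address 0x1F0000 and ending at 0x410000 yields *sd_idx = 0 and
+ * *sd_limit = 3 (the limit is one past the last segment descriptor used).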
+ **/ +#define I40E_FIND_PD_INDEX_LIMIT(hmc_info, type, idx, cnt, pd_index, pd_limit)\ +{ \ + u64 fpm_adr, fpm_limit; \ + fpm_adr = (hmc_info)->hmc_obj[(type)].base + \ + (hmc_info)->hmc_obj[(type)].size * (idx); \ + fpm_limit = fpm_adr + (hmc_info)->hmc_obj[(type)].size * (cnt); \ + *(pd_index) = (u32)(fpm_adr / I40E_HMC_PAGED_BP_SIZE); \ + *(pd_limit) = (u32)((fpm_limit - 1) / I40E_HMC_PAGED_BP_SIZE); \ + /* add one more to the limit to correct our range */ \ + *(pd_limit) += 1; \ +} +i40e_status i40e_add_sd_table_entry(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 sd_index, + enum i40e_sd_entry_type type, + u64 direct_mode_sz); + +i40e_status i40e_add_pd_table_entry(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 pd_index, + struct i40e_dma_mem *rsrc_pg); +i40e_status i40e_remove_pd_bp(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx); +i40e_status i40e_prep_remove_sd_bp(struct i40e_hmc_info *hmc_info, + u32 idx); +i40e_status i40e_remove_sd_bp_new(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx, bool is_pf); +i40e_status i40e_prep_remove_pd_page(struct i40e_hmc_info *hmc_info, + u32 idx); +i40e_status i40e_remove_pd_page_new(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx, bool is_pf); + +#endif /* _I40E_HMC_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.c b/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.c new file mode 100644 index 0000000..cd40dc4 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.c @@ -0,0 +1,1143 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include "i40e_osdep.h" +#include "i40e_register.h" +#include "i40e_type.h" +#include "i40e_hmc.h" +#include "i40e_lan_hmc.h" +#include "i40e_prototype.h" + +/* lan specific interface functions */ + +/** + * i40e_align_l2obj_base - aligns base object pointer to 512 bytes + * @offset: base address offset needing alignment + * + * Aligns the layer 2 function private memory so it's 512-byte aligned. 
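+ * For example, an offset of 0x2010 is rounded up to 0x2200.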
+ **/ +static u64 i40e_align_l2obj_base(u64 offset) +{ + u64 aligned_offset = offset; + + if ((offset % I40E_HMC_L2OBJ_BASE_ALIGNMENT) > 0) + aligned_offset += (I40E_HMC_L2OBJ_BASE_ALIGNMENT - + (offset % I40E_HMC_L2OBJ_BASE_ALIGNMENT)); + + return aligned_offset; +} + +/** + * i40e_calculate_l2fpm_size - calculates layer 2 FPM memory size + * @txq_num: number of Tx queues needing backing context + * @rxq_num: number of Rx queues needing backing context + * @fcoe_cntx_num: amount of FCoE statefull contexts needing backing context + * @fcoe_filt_num: number of FCoE filters needing backing context + * + * Calculates the maximum amount of memory for the function required, based + * on the number of resources it must provide context for. + **/ +static u64 i40e_calculate_l2fpm_size(u32 txq_num, u32 rxq_num, + u32 fcoe_cntx_num, u32 fcoe_filt_num) +{ + u64 fpm_size = 0; + + fpm_size = txq_num * I40E_HMC_OBJ_SIZE_TXQ; + fpm_size = i40e_align_l2obj_base(fpm_size); + + fpm_size += (rxq_num * I40E_HMC_OBJ_SIZE_RXQ); + fpm_size = i40e_align_l2obj_base(fpm_size); + + fpm_size += (fcoe_cntx_num * I40E_HMC_OBJ_SIZE_FCOE_CNTX); + fpm_size = i40e_align_l2obj_base(fpm_size); + + fpm_size += (fcoe_filt_num * I40E_HMC_OBJ_SIZE_FCOE_FILT); + fpm_size = i40e_align_l2obj_base(fpm_size); + + return fpm_size; +} + +/** + * i40e_init_lan_hmc - initialize i40e_hmc_info struct + * @hw: pointer to the HW structure + * @txq_num: number of Tx queues needing backing context + * @rxq_num: number of Rx queues needing backing context + * @fcoe_cntx_num: amount of FCoE statefull contexts needing backing context + * @fcoe_filt_num: number of FCoE filters needing backing context + * + * This function will be called once per physical function initialization. + * It will fill out the i40e_hmc_obj_info structure for LAN objects based on + * the driver's provided input, as well as information from the HMC itself + * loaded from NVRAM. + * + * Assumptions: + * - HMC Resource Profile has been selected before calling this function. 
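+ *
+ * Per-object sizes are read from the GLHMC_*OBJSZ registers as power-of-two
+ * exponents (obj->size = BIT_ULL(size_exp)), and object base addresses are
+ * laid out back to back with 512-byte alignment between object types.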
+ **/ +i40e_status i40e_init_lan_hmc(struct i40e_hw *hw, u32 txq_num, + u32 rxq_num, u32 fcoe_cntx_num, + u32 fcoe_filt_num) +{ + struct i40e_hmc_obj_info *obj, *full_obj; + i40e_status ret_code = 0; + u64 l2fpm_size; + u32 size_exp; + + hw->hmc.signature = I40E_HMC_INFO_SIGNATURE; + hw->hmc.hmc_fn_id = hw->pf_id; + + /* allocate memory for hmc_obj */ + ret_code = i40e_allocate_virt_mem(hw, &hw->hmc.hmc_obj_virt_mem, + sizeof(struct i40e_hmc_obj_info) * I40E_HMC_LAN_MAX); + if (ret_code) + goto init_lan_hmc_out; + hw->hmc.hmc_obj = (struct i40e_hmc_obj_info *) + hw->hmc.hmc_obj_virt_mem.va; + + /* The full object will be used to create the LAN HMC SD */ + full_obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_FULL]; + full_obj->max_cnt = 0; + full_obj->cnt = 0; + full_obj->base = 0; + full_obj->size = 0; + + /* Tx queue context information */ + obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_TX]; + obj->max_cnt = rd32(hw, I40E_GLHMC_LANQMAX); + obj->cnt = txq_num; + obj->base = 0; + size_exp = rd32(hw, I40E_GLHMC_LANTXOBJSZ); + obj->size = BIT_ULL(size_exp); + + /* validate values requested by driver don't exceed HMC capacity */ + if (txq_num > obj->max_cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + hw_dbg(hw, "i40e_init_lan_hmc: Tx context: asks for 0x%x but max allowed is 0x%x, returns error %d\n", + txq_num, obj->max_cnt, ret_code); + goto init_lan_hmc_out; + } + + /* aggregate values into the full LAN object for later */ + full_obj->max_cnt += obj->max_cnt; + full_obj->cnt += obj->cnt; + + /* Rx queue context information */ + obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_RX]; + obj->max_cnt = rd32(hw, I40E_GLHMC_LANQMAX); + obj->cnt = rxq_num; + obj->base = hw->hmc.hmc_obj[I40E_HMC_LAN_TX].base + + (hw->hmc.hmc_obj[I40E_HMC_LAN_TX].cnt * + hw->hmc.hmc_obj[I40E_HMC_LAN_TX].size); + obj->base = i40e_align_l2obj_base(obj->base); + size_exp = rd32(hw, I40E_GLHMC_LANRXOBJSZ); + obj->size = BIT_ULL(size_exp); + + /* validate values requested by driver don't exceed HMC capacity */ + if (rxq_num > obj->max_cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + hw_dbg(hw, "i40e_init_lan_hmc: Rx context: asks for 0x%x but max allowed is 0x%x, returns error %d\n", + rxq_num, obj->max_cnt, ret_code); + goto init_lan_hmc_out; + } + + /* aggregate values into the full LAN object for later */ + full_obj->max_cnt += obj->max_cnt; + full_obj->cnt += obj->cnt; + + /* FCoE context information */ + obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX]; + obj->max_cnt = rd32(hw, I40E_GLHMC_FCOEMAX); + obj->cnt = fcoe_cntx_num; + obj->base = hw->hmc.hmc_obj[I40E_HMC_LAN_RX].base + + (hw->hmc.hmc_obj[I40E_HMC_LAN_RX].cnt * + hw->hmc.hmc_obj[I40E_HMC_LAN_RX].size); + obj->base = i40e_align_l2obj_base(obj->base); + size_exp = rd32(hw, I40E_GLHMC_FCOEDDPOBJSZ); + obj->size = BIT_ULL(size_exp); + + /* validate values requested by driver don't exceed HMC capacity */ + if (fcoe_cntx_num > obj->max_cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + hw_dbg(hw, "i40e_init_lan_hmc: FCoE context: asks for 0x%x but max allowed is 0x%x, returns error %d\n", + fcoe_cntx_num, obj->max_cnt, ret_code); + goto init_lan_hmc_out; + } + + /* aggregate values into the full LAN object for later */ + full_obj->max_cnt += obj->max_cnt; + full_obj->cnt += obj->cnt; + + /* FCoE filter information */ + obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_FILT]; + obj->max_cnt = rd32(hw, I40E_GLHMC_FCOEFMAX); + obj->cnt = fcoe_filt_num; + obj->base = hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX].base + + (hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX].cnt * + hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX].size); + obj->base 
= i40e_align_l2obj_base(obj->base); + size_exp = rd32(hw, I40E_GLHMC_FCOEFOBJSZ); + obj->size = BIT_ULL(size_exp); + + /* validate values requested by driver don't exceed HMC capacity */ + if (fcoe_filt_num > obj->max_cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + hw_dbg(hw, "i40e_init_lan_hmc: FCoE filter: asks for 0x%x but max allowed is 0x%x, returns error %d\n", + fcoe_filt_num, obj->max_cnt, ret_code); + goto init_lan_hmc_out; + } + + /* aggregate values into the full LAN object for later */ + full_obj->max_cnt += obj->max_cnt; + full_obj->cnt += obj->cnt; + + hw->hmc.first_sd_index = 0; + hw->hmc.sd_table.ref_cnt = 0; + l2fpm_size = i40e_calculate_l2fpm_size(txq_num, rxq_num, fcoe_cntx_num, + fcoe_filt_num); + if (NULL == hw->hmc.sd_table.sd_entry) { + hw->hmc.sd_table.sd_cnt = (u32) + (l2fpm_size + I40E_HMC_DIRECT_BP_SIZE - 1) / + I40E_HMC_DIRECT_BP_SIZE; + + /* allocate the sd_entry members in the sd_table */ + ret_code = i40e_allocate_virt_mem(hw, &hw->hmc.sd_table.addr, + (sizeof(struct i40e_hmc_sd_entry) * + hw->hmc.sd_table.sd_cnt)); + if (ret_code) + goto init_lan_hmc_out; + hw->hmc.sd_table.sd_entry = + (struct i40e_hmc_sd_entry *)hw->hmc.sd_table.addr.va; + } + /* store in the LAN full object for later */ + full_obj->size = l2fpm_size; + +init_lan_hmc_out: + return ret_code; +} + +/** + * i40e_remove_pd_page - Remove a page from the page descriptor table + * @hw: pointer to the HW structure + * @hmc_info: pointer to the HMC configuration information structure + * @idx: segment descriptor index to find the relevant page descriptor + * + * This function: + * 1. Marks the entry in pd table (for paged address mode) invalid + * 2. write to register PMPDINV to invalidate the backing page in FV cache + * 3. Decrement the ref count for pd_entry + * assumptions: + * 1. caller can deallocate the memory used by pd after this function + * returns. + **/ +static i40e_status i40e_remove_pd_page(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx) +{ + i40e_status ret_code = 0; + + if (!i40e_prep_remove_pd_page(hmc_info, idx)) + ret_code = i40e_remove_pd_page_new(hw, hmc_info, idx, true); + + return ret_code; +} + +/** + * i40e_remove_sd_bp - remove a backing page from a segment descriptor + * @hw: pointer to our HW structure + * @hmc_info: pointer to the HMC configuration information structure + * @idx: the page index + * + * This function: + * 1. Marks the entry in sd table (for direct address mode) invalid + * 2. write to register PMSDCMD, PMSDDATALOW(PMSDDATALOW.PMSDVALID set + * to 0) and PMSDDATAHIGH to invalidate the sd page + * 3. Decrement the ref count for the sd_entry + * assumptions: + * 1. caller can deallocate the memory used by backing storage after this + * function returns. + **/ +static i40e_status i40e_remove_sd_bp(struct i40e_hw *hw, + struct i40e_hmc_info *hmc_info, + u32 idx) +{ + i40e_status ret_code = 0; + + if (!i40e_prep_remove_sd_bp(hmc_info, idx)) + ret_code = i40e_remove_sd_bp_new(hw, hmc_info, idx, true); + + return ret_code; +} + +/** + * i40e_create_lan_hmc_object - allocate backing store for hmc objects + * @hw: pointer to the HW structure + * @info: pointer to i40e_hmc_create_obj_info struct + * + * This will allocate memory for PDs and backing pages and populate + * the sd and pd entries. 
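+ *
+ * If allocation fails part way through, the SDs and PDs created so far
+ * are unwound again via the exit_sd_error path.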
+ **/ +static i40e_status i40e_create_lan_hmc_object(struct i40e_hw *hw, + struct i40e_hmc_lan_create_obj_info *info) +{ + i40e_status ret_code = 0; + struct i40e_hmc_sd_entry *sd_entry; + u32 pd_idx1 = 0, pd_lmt1 = 0; + u32 pd_idx = 0, pd_lmt = 0; + bool pd_error = false; + u32 sd_idx, sd_lmt; + u64 sd_size; + u32 i, j; + + if (NULL == info) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_create_lan_hmc_object: bad info ptr\n"); + goto exit; + } + if (NULL == info->hmc_info) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_create_lan_hmc_object: bad hmc_info ptr\n"); + goto exit; + } + if (I40E_HMC_INFO_SIGNATURE != info->hmc_info->signature) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_create_lan_hmc_object: bad signature\n"); + goto exit; + } + + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_INDEX; + hw_dbg(hw, "i40e_create_lan_hmc_object: returns error %d\n", + ret_code); + goto exit; + } + if ((info->start_idx + info->count) > + info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + hw_dbg(hw, "i40e_create_lan_hmc_object: returns error %d\n", + ret_code); + goto exit; + } + + /* find sd index and limit */ + I40E_FIND_SD_INDEX_LIMIT(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, + &sd_idx, &sd_lmt); + if (sd_idx >= info->hmc_info->sd_table.sd_cnt || + sd_lmt > info->hmc_info->sd_table.sd_cnt) { + ret_code = I40E_ERR_INVALID_SD_INDEX; + goto exit; + } + /* find pd index */ + I40E_FIND_PD_INDEX_LIMIT(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, &pd_idx, + &pd_lmt); + + /* This is to cover for cases where you may not want to have an SD with + * the full 2M memory but something smaller. By not filling out any + * size, the function will default the SD size to be 2M. + */ + if (info->direct_mode_sz == 0) + sd_size = I40E_HMC_DIRECT_BP_SIZE; + else + sd_size = info->direct_mode_sz; + + /* check if all the sds are valid. If not, allocate a page and + * initialize it. + */ + for (j = sd_idx; j < sd_lmt; j++) { + /* update the sd table entry */ + ret_code = i40e_add_sd_table_entry(hw, info->hmc_info, j, + info->entry_type, + sd_size); + if (ret_code) + goto exit_sd_error; + sd_entry = &info->hmc_info->sd_table.sd_entry[j]; + if (I40E_SD_TYPE_PAGED == sd_entry->entry_type) { + /* check if all the pds in this sd are valid. If not, + * allocate a page and initialize it. 
+ */ + + /* find pd_idx and pd_lmt in this sd */ + pd_idx1 = max(pd_idx, (j * I40E_HMC_MAX_BP_COUNT)); + pd_lmt1 = min(pd_lmt, + ((j + 1) * I40E_HMC_MAX_BP_COUNT)); + for (i = pd_idx1; i < pd_lmt1; i++) { + /* update the pd table entry */ + ret_code = i40e_add_pd_table_entry(hw, + info->hmc_info, + i, NULL); + if (ret_code) { + pd_error = true; + break; + } + } + if (pd_error) { + /* remove the backing pages from pd_idx1 to i */ + while (i && (i > pd_idx1)) { + i40e_remove_pd_bp(hw, info->hmc_info, + (i - 1)); + i--; + } + } + } + if (!sd_entry->valid) { + sd_entry->valid = true; + switch (sd_entry->entry_type) { + case I40E_SD_TYPE_PAGED: + I40E_SET_PF_SD_ENTRY(hw, + sd_entry->u.pd_table.pd_page_addr.pa, + j, sd_entry->entry_type); + break; + case I40E_SD_TYPE_DIRECT: + I40E_SET_PF_SD_ENTRY(hw, sd_entry->u.bp.addr.pa, + j, sd_entry->entry_type); + break; + default: + ret_code = I40E_ERR_INVALID_SD_TYPE; + goto exit; + } + } + } + goto exit; + +exit_sd_error: + /* cleanup for sd entries from j to sd_idx */ + while (j && (j > sd_idx)) { + sd_entry = &info->hmc_info->sd_table.sd_entry[j - 1]; + switch (sd_entry->entry_type) { + case I40E_SD_TYPE_PAGED: + pd_idx1 = max(pd_idx, + ((j - 1) * I40E_HMC_MAX_BP_COUNT)); + pd_lmt1 = min(pd_lmt, (j * I40E_HMC_MAX_BP_COUNT)); + for (i = pd_idx1; i < pd_lmt1; i++) + i40e_remove_pd_bp(hw, info->hmc_info, i); + i40e_remove_pd_page(hw, info->hmc_info, (j - 1)); + break; + case I40E_SD_TYPE_DIRECT: + i40e_remove_sd_bp(hw, info->hmc_info, (j - 1)); + break; + default: + ret_code = I40E_ERR_INVALID_SD_TYPE; + break; + } + j--; + } +exit: + return ret_code; +} + +/** + * i40e_configure_lan_hmc - prepare the HMC backing store + * @hw: pointer to the hw structure + * @model: the model for the layout of the SD/PD tables + * + * - This function will be called once per physical function initialization. + * - This function will be called after i40e_init_lan_hmc() and before + * any LAN/FCoE HMC objects can be created. 
+ **/ +i40e_status i40e_configure_lan_hmc(struct i40e_hw *hw, + enum i40e_hmc_model model) +{ + struct i40e_hmc_lan_create_obj_info info; + i40e_status ret_code = 0; + u8 hmc_fn_id = hw->hmc.hmc_fn_id; + struct i40e_hmc_obj_info *obj; + + /* Initialize part of the create object info struct */ + info.hmc_info = &hw->hmc; + info.rsrc_type = I40E_HMC_LAN_FULL; + info.start_idx = 0; + info.direct_mode_sz = hw->hmc.hmc_obj[I40E_HMC_LAN_FULL].size; + + /* Build the SD entry for the LAN objects */ + switch (model) { + case I40E_HMC_MODEL_DIRECT_PREFERRED: + case I40E_HMC_MODEL_DIRECT_ONLY: + info.entry_type = I40E_SD_TYPE_DIRECT; + /* Make one big object, a single SD */ + info.count = 1; + ret_code = i40e_create_lan_hmc_object(hw, &info); + if (ret_code && (model == I40E_HMC_MODEL_DIRECT_PREFERRED)) + goto try_type_paged; + else if (ret_code) + goto configure_lan_hmc_out; + /* else clause falls through the break */ + break; + case I40E_HMC_MODEL_PAGED_ONLY: +try_type_paged: + info.entry_type = I40E_SD_TYPE_PAGED; + /* Make one big object in the PD table */ + info.count = 1; + ret_code = i40e_create_lan_hmc_object(hw, &info); + if (ret_code) + goto configure_lan_hmc_out; + break; + default: + /* unsupported type */ + ret_code = I40E_ERR_INVALID_SD_TYPE; + hw_dbg(hw, "i40e_configure_lan_hmc: Unknown SD type: %d\n", + ret_code); + goto configure_lan_hmc_out; + } + + /* Configure and program the FPM registers so objects can be created */ + + /* Tx contexts */ + obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_TX]; + wr32(hw, I40E_GLHMC_LANTXBASE(hmc_fn_id), + (u32)((obj->base & I40E_GLHMC_LANTXBASE_FPMLANTXBASE_MASK) / 512)); + wr32(hw, I40E_GLHMC_LANTXCNT(hmc_fn_id), obj->cnt); + + /* Rx contexts */ + obj = &hw->hmc.hmc_obj[I40E_HMC_LAN_RX]; + wr32(hw, I40E_GLHMC_LANRXBASE(hmc_fn_id), + (u32)((obj->base & I40E_GLHMC_LANRXBASE_FPMLANRXBASE_MASK) / 512)); + wr32(hw, I40E_GLHMC_LANRXCNT(hmc_fn_id), obj->cnt); + + /* FCoE contexts */ + obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_CTX]; + wr32(hw, I40E_GLHMC_FCOEDDPBASE(hmc_fn_id), + (u32)((obj->base & I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_MASK) / 512)); + wr32(hw, I40E_GLHMC_FCOEDDPCNT(hmc_fn_id), obj->cnt); + + /* FCoE filters */ + obj = &hw->hmc.hmc_obj[I40E_HMC_FCOE_FILT]; + wr32(hw, I40E_GLHMC_FCOEFBASE(hmc_fn_id), + (u32)((obj->base & I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_MASK) / 512)); + wr32(hw, I40E_GLHMC_FCOEFCNT(hmc_fn_id), obj->cnt); + +configure_lan_hmc_out: + return ret_code; +} + +/** + * i40e_delete_hmc_object - remove hmc objects + * @hw: pointer to the HW structure + * @info: pointer to i40e_hmc_delete_obj_info struct + * + * This will de-populate the SDs and PDs. It frees + * the memory for PDS and backing storage. After this function is returned, + * caller should deallocate memory allocated previously for + * book-keeping information about PDs and backing storage. 
+ **/ +static i40e_status i40e_delete_lan_hmc_object(struct i40e_hw *hw, + struct i40e_hmc_lan_delete_obj_info *info) +{ + i40e_status ret_code = 0; + struct i40e_hmc_pd_table *pd_table; + u32 pd_idx, pd_lmt, rel_pd_idx; + u32 sd_idx, sd_lmt; + u32 i, j; + + if (NULL == info) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_delete_hmc_object: bad info ptr\n"); + goto exit; + } + if (NULL == info->hmc_info) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_delete_hmc_object: bad info->hmc_info ptr\n"); + goto exit; + } + if (I40E_HMC_INFO_SIGNATURE != info->hmc_info->signature) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_delete_hmc_object: bad hmc_info->signature\n"); + goto exit; + } + + if (NULL == info->hmc_info->sd_table.sd_entry) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_delete_hmc_object: bad sd_entry\n"); + goto exit; + } + + if (NULL == info->hmc_info->hmc_obj) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_delete_hmc_object: bad hmc_info->hmc_obj\n"); + goto exit; + } + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_INDEX; + hw_dbg(hw, "i40e_delete_hmc_object: returns error %d\n", + ret_code); + goto exit; + } + + if ((info->start_idx + info->count) > + info->hmc_info->hmc_obj[info->rsrc_type].cnt) { + ret_code = I40E_ERR_INVALID_HMC_OBJ_COUNT; + hw_dbg(hw, "i40e_delete_hmc_object: returns error %d\n", + ret_code); + goto exit; + } + + I40E_FIND_PD_INDEX_LIMIT(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, &pd_idx, + &pd_lmt); + + for (j = pd_idx; j < pd_lmt; j++) { + sd_idx = j / I40E_HMC_PD_CNT_IN_SD; + + if (I40E_SD_TYPE_PAGED != + info->hmc_info->sd_table.sd_entry[sd_idx].entry_type) + continue; + + rel_pd_idx = j % I40E_HMC_PD_CNT_IN_SD; + + pd_table = + &info->hmc_info->sd_table.sd_entry[sd_idx].u.pd_table; + if (pd_table->pd_entry[rel_pd_idx].valid) { + ret_code = i40e_remove_pd_bp(hw, info->hmc_info, j); + if (ret_code) + goto exit; + } + } + + /* find sd index and limit */ + I40E_FIND_SD_INDEX_LIMIT(info->hmc_info, info->rsrc_type, + info->start_idx, info->count, + &sd_idx, &sd_lmt); + if (sd_idx >= info->hmc_info->sd_table.sd_cnt || + sd_lmt > info->hmc_info->sd_table.sd_cnt) { + ret_code = I40E_ERR_INVALID_SD_INDEX; + goto exit; + } + + for (i = sd_idx; i < sd_lmt; i++) { + if (!info->hmc_info->sd_table.sd_entry[i].valid) + continue; + switch (info->hmc_info->sd_table.sd_entry[i].entry_type) { + case I40E_SD_TYPE_DIRECT: + ret_code = i40e_remove_sd_bp(hw, info->hmc_info, i); + if (ret_code) + goto exit; + break; + case I40E_SD_TYPE_PAGED: + ret_code = i40e_remove_pd_page(hw, info->hmc_info, i); + if (ret_code) + goto exit; + break; + default: + break; + } + } +exit: + return ret_code; +} + +/** + * i40e_shutdown_lan_hmc - Remove HMC backing store, free allocated memory + * @hw: pointer to the hw structure + * + * This must be called by drivers as they are shutting down and being + * removed from the OS. 
+ **/ +i40e_status i40e_shutdown_lan_hmc(struct i40e_hw *hw) +{ + struct i40e_hmc_lan_delete_obj_info info; + i40e_status ret_code; + + info.hmc_info = &hw->hmc; + info.rsrc_type = I40E_HMC_LAN_FULL; + info.start_idx = 0; + info.count = 1; + + /* delete the object */ + ret_code = i40e_delete_lan_hmc_object(hw, &info); + + /* free the SD table entry for LAN */ + i40e_free_virt_mem(hw, &hw->hmc.sd_table.addr); + hw->hmc.sd_table.sd_cnt = 0; + hw->hmc.sd_table.sd_entry = NULL; + + /* free memory used for hmc_obj */ + i40e_free_virt_mem(hw, &hw->hmc.hmc_obj_virt_mem); + hw->hmc.hmc_obj = NULL; + + return ret_code; +} + +#define I40E_HMC_STORE(_struct, _ele) \ + offsetof(struct _struct, _ele), \ + FIELD_SIZEOF(struct _struct, _ele) + +struct i40e_context_ele { + u16 offset; + u16 size_of; + u16 width; + u16 lsb; +}; + +/* LAN Tx Queue Context */ +static struct i40e_context_ele i40e_hmc_txq_ce_info[] = { + /* Field Width LSB */ + {I40E_HMC_STORE(i40e_hmc_obj_txq, head), 13, 0 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, new_context), 1, 30 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, base), 57, 32 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, fc_ena), 1, 89 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, timesync_ena), 1, 90 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, fd_ena), 1, 91 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, alt_vlan_ena), 1, 92 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, cpuid), 8, 96 }, +/* line 1 */ + {I40E_HMC_STORE(i40e_hmc_obj_txq, thead_wb), 13, 0 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, head_wb_ena), 1, 32 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, qlen), 13, 33 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, tphrdesc_ena), 1, 46 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, tphrpacket_ena), 1, 47 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, tphwdesc_ena), 1, 48 + 128 }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, head_wb_addr), 64, 64 + 128 }, +/* line 7 */ + {I40E_HMC_STORE(i40e_hmc_obj_txq, crc), 32, 0 + (7 * 128) }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, rdylist), 10, 84 + (7 * 128) }, + {I40E_HMC_STORE(i40e_hmc_obj_txq, rdylist_act), 1, 94 + (7 * 128) }, + { 0 } +}; + +/* LAN Rx Queue Context */ +static struct i40e_context_ele i40e_hmc_rxq_ce_info[] = { + /* Field Width LSB */ + { I40E_HMC_STORE(i40e_hmc_obj_rxq, head), 13, 0 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, cpuid), 8, 13 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, base), 57, 32 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, qlen), 13, 89 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, dbuff), 7, 102 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, hbuff), 5, 109 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, dtype), 2, 114 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, dsize), 1, 116 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, crcstrip), 1, 117 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, fc_ena), 1, 118 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, l2tsel), 1, 119 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, hsplit_0), 4, 120 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, hsplit_1), 2, 124 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, showiv), 1, 127 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, rxmax), 14, 174 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, tphrdesc_ena), 1, 193 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, tphwdesc_ena), 1, 194 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, tphdata_ena), 1, 195 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, tphhead_ena), 1, 196 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, lrxqthresh), 3, 198 }, + { I40E_HMC_STORE(i40e_hmc_obj_rxq, prefena), 1, 201 }, + { 0 } +}; + +/** + * i40e_write_byte - replace HMC context byte + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be read from + * 
@src: the struct to be read from + **/ +static void i40e_write_byte(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *src) +{ + u8 src_byte, dest_byte, mask; + u8 *from, *dest; + u16 shift_width; + + /* copy from the next struct field */ + from = src + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + mask = (u8)(BIT(ce_info->width) - 1); + + src_byte = *from; + src_byte &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_byte <<= shift_width; + + /* get the current bits from the target bit string */ + dest = hmc_bits + (ce_info->lsb / 8); + + memcpy(&dest_byte, dest, sizeof(dest_byte)); + + dest_byte &= ~mask; /* get the bits not changing */ + dest_byte |= src_byte; /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_byte, sizeof(dest_byte)); +} + +/** + * i40e_write_word - replace HMC context word + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be read from + * @src: the struct to be read from + **/ +static void i40e_write_word(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *src) +{ + u16 src_word, mask; + u8 *from, *dest; + u16 shift_width; + __le16 dest_word; + + /* copy from the next struct field */ + from = src + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + mask = BIT(ce_info->width) - 1; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_word = *(u16 *)from; + src_word &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_word <<= shift_width; + + /* get the current bits from the target bit string */ + dest = hmc_bits + (ce_info->lsb / 8); + + memcpy(&dest_word, dest, sizeof(dest_word)); + + dest_word &= ~(cpu_to_le16(mask)); /* get the bits not changing */ + dest_word |= cpu_to_le16(src_word); /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_word, sizeof(dest_word)); +} + +/** + * i40e_write_dword - replace HMC context dword + * @hmc_bits: pointer to the HMC memory + * @ce_info: a description of the struct to be read from + * @src: the struct to be read from + **/ +static void i40e_write_dword(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *src) +{ + u32 src_dword, mask; + u8 *from, *dest; + u16 shift_width; + __le32 dest_dword; + + /* copy from the next struct field */ + from = src + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + + /* if the field width is exactly 32 on an x86 machine, then the shift + * operation will not work because the SHL instructions count is masked + * to 5 bits so the shift will do nothing + */ + if (ce_info->width < 32) + mask = BIT(ce_info->width) - 1; + else + mask = ~(u32)0; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_dword = *(u32 *)from; + src_dword &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_dword <<= shift_width; + + /* get the current bits from the target bit string */ + dest = hmc_bits + (ce_info->lsb / 8); + + memcpy(&dest_dword, dest, sizeof(dest_dword)); + + dest_dword &= ~(cpu_to_le32(mask)); /* get the bits not changing */ + dest_dword |= cpu_to_le32(src_dword); /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_dword, sizeof(dest_dword)); +} + +/** + * i40e_write_qword - replace HMC context qword + * @hmc_bits: pointer to 
the HMC memory + * @ce_info: a description of the struct to be read from + * @src: the struct to be read from + **/ +static void i40e_write_qword(u8 *hmc_bits, + struct i40e_context_ele *ce_info, + u8 *src) +{ + u64 src_qword, mask; + u8 *from, *dest; + u16 shift_width; + __le64 dest_qword; + + /* copy from the next struct field */ + from = src + ce_info->offset; + + /* prepare the bits and mask */ + shift_width = ce_info->lsb % 8; + + /* if the field width is exactly 64 on an x86 machine, then the shift + * operation will not work because the SHL instructions count is masked + * to 6 bits so the shift will do nothing + */ + if (ce_info->width < 64) + mask = BIT_ULL(ce_info->width) - 1; + else + mask = ~(u64)0; + + /* don't swizzle the bits until after the mask because the mask bits + * will be in a different bit position on big endian machines + */ + src_qword = *(u64 *)from; + src_qword &= mask; + + /* shift to correct alignment */ + mask <<= shift_width; + src_qword <<= shift_width; + + /* get the current bits from the target bit string */ + dest = hmc_bits + (ce_info->lsb / 8); + + memcpy(&dest_qword, dest, sizeof(dest_qword)); + + dest_qword &= ~(cpu_to_le64(mask)); /* get the bits not changing */ + dest_qword |= cpu_to_le64(src_qword); /* add in the new bits */ + + /* put it all back */ + memcpy(dest, &dest_qword, sizeof(dest_qword)); +} + +/** + * i40e_clear_hmc_context - zero out the HMC context bits + * @hw: the hardware struct + * @context_bytes: pointer to the context bit array (DMA memory) + * @hmc_type: the type of HMC resource + **/ +static i40e_status i40e_clear_hmc_context(struct i40e_hw *hw, + u8 *context_bytes, + enum i40e_hmc_lan_rsrc_type hmc_type) +{ + /* clean the bit array */ + memset(context_bytes, 0, (u32)hw->hmc.hmc_obj[hmc_type].size); + + return 0; +} + +/** + * i40e_set_hmc_context - replace HMC context bits + * @context_bytes: pointer to the context bit array + * @ce_info: a description of the struct to be filled + * @dest: the struct to be filled + **/ +static i40e_status i40e_set_hmc_context(u8 *context_bytes, + struct i40e_context_ele *ce_info, + u8 *dest) +{ + int f; + + for (f = 0; ce_info[f].width != 0; f++) { + + /* we have to deal with each element of the HMC using the + * correct size so that we are correct regardless of the + * endianness of the machine + */ + switch (ce_info[f].size_of) { + case 1: + i40e_write_byte(context_bytes, &ce_info[f], dest); + break; + case 2: + i40e_write_word(context_bytes, &ce_info[f], dest); + break; + case 4: + i40e_write_dword(context_bytes, &ce_info[f], dest); + break; + case 8: + i40e_write_qword(context_bytes, &ce_info[f], dest); + break; + } + } + + return 0; +} + +/** + * i40e_hmc_get_object_va - retrieves an object's virtual address + * @hmc_info: pointer to i40e_hmc_info struct + * @object_base: pointer to u64 to get the va + * @rsrc_type: the hmc resource type + * @obj_idx: hmc object index + * + * This function retrieves the object's virtual address from the object + * base pointer. This function is used for LAN Queue contexts. 
+ **/ +static +i40e_status i40e_hmc_get_object_va(struct i40e_hmc_info *hmc_info, + u8 **object_base, + enum i40e_hmc_lan_rsrc_type rsrc_type, + u32 obj_idx) +{ + u32 obj_offset_in_sd, obj_offset_in_pd; + i40e_status ret_code = 0; + struct i40e_hmc_sd_entry *sd_entry; + struct i40e_hmc_pd_entry *pd_entry; + u32 pd_idx, pd_lmt, rel_pd_idx; + u64 obj_offset_in_fpm; + u32 sd_idx, sd_lmt; + + if (NULL == hmc_info) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_hmc_get_object_va: bad hmc_info ptr\n"); + goto exit; + } + if (NULL == hmc_info->hmc_obj) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_hmc_get_object_va: bad hmc_info->hmc_obj ptr\n"); + goto exit; + } + if (NULL == object_base) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_hmc_get_object_va: bad object_base ptr\n"); + goto exit; + } + if (I40E_HMC_INFO_SIGNATURE != hmc_info->signature) { + ret_code = I40E_ERR_BAD_PTR; + hw_dbg(hw, "i40e_hmc_get_object_va: bad hmc_info->signature\n"); + goto exit; + } + if (obj_idx >= hmc_info->hmc_obj[rsrc_type].cnt) { + hw_dbg(hw, "i40e_hmc_get_object_va: returns error %d\n", + ret_code); + ret_code = I40E_ERR_INVALID_HMC_OBJ_INDEX; + goto exit; + } + /* find sd index and limit */ + I40E_FIND_SD_INDEX_LIMIT(hmc_info, rsrc_type, obj_idx, 1, + &sd_idx, &sd_lmt); + + sd_entry = &hmc_info->sd_table.sd_entry[sd_idx]; + obj_offset_in_fpm = hmc_info->hmc_obj[rsrc_type].base + + hmc_info->hmc_obj[rsrc_type].size * obj_idx; + + if (I40E_SD_TYPE_PAGED == sd_entry->entry_type) { + I40E_FIND_PD_INDEX_LIMIT(hmc_info, rsrc_type, obj_idx, 1, + &pd_idx, &pd_lmt); + rel_pd_idx = pd_idx % I40E_HMC_PD_CNT_IN_SD; + pd_entry = &sd_entry->u.pd_table.pd_entry[rel_pd_idx]; + obj_offset_in_pd = (u32)(obj_offset_in_fpm % + I40E_HMC_PAGED_BP_SIZE); + *object_base = (u8 *)pd_entry->bp.addr.va + obj_offset_in_pd; + } else { + obj_offset_in_sd = (u32)(obj_offset_in_fpm % + I40E_HMC_DIRECT_BP_SIZE); + *object_base = (u8 *)sd_entry->u.bp.addr.va + obj_offset_in_sd; + } +exit: + return ret_code; +} + +/** + * i40e_clear_lan_tx_queue_context - clear the HMC context for the queue + * @hw: the hardware struct + * @queue: the queue we care about + **/ +i40e_status i40e_clear_lan_tx_queue_context(struct i40e_hw *hw, + u16 queue) +{ + i40e_status err; + u8 *context_bytes; + + err = i40e_hmc_get_object_va(&hw->hmc, &context_bytes, + I40E_HMC_LAN_TX, queue); + if (err < 0) + return err; + + return i40e_clear_hmc_context(hw, context_bytes, I40E_HMC_LAN_TX); +} + +/** + * i40e_set_lan_tx_queue_context - set the HMC context for the queue + * @hw: the hardware struct + * @queue: the queue we care about + * @s: the struct to be filled + **/ +i40e_status i40e_set_lan_tx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_txq *s) +{ + i40e_status err; + u8 *context_bytes; + + err = i40e_hmc_get_object_va(&hw->hmc, &context_bytes, + I40E_HMC_LAN_TX, queue); + if (err < 0) + return err; + + return i40e_set_hmc_context(context_bytes, + i40e_hmc_txq_ce_info, (u8 *)s); +} + +/** + * i40e_clear_lan_rx_queue_context - clear the HMC context for the queue + * @hw: the hardware struct + * @queue: the queue we care about + **/ +i40e_status i40e_clear_lan_rx_queue_context(struct i40e_hw *hw, + u16 queue) +{ + i40e_status err; + u8 *context_bytes; + + err = i40e_hmc_get_object_va(&hw->hmc, &context_bytes, + I40E_HMC_LAN_RX, queue); + if (err < 0) + return err; + + return i40e_clear_hmc_context(hw, context_bytes, I40E_HMC_LAN_RX); +} + +/** + * i40e_set_lan_rx_queue_context - set the HMC context for the queue + * @hw: the 
hardware struct + * @queue: the queue we care about + * @s: the struct to be filled + **/ +i40e_status i40e_set_lan_rx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_rxq *s) +{ + i40e_status err; + u8 *context_bytes; + + err = i40e_hmc_get_object_va(&hw->hmc, &context_bytes, + I40E_HMC_LAN_RX, queue); + if (err < 0) + return err; + + return i40e_set_hmc_context(context_bytes, + i40e_hmc_rxq_ce_info, (u8 *)s); +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.h b/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.h new file mode 100644 index 0000000..79e1396 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_LAN_HMC_H_ +#define _I40E_LAN_HMC_H_ + +/* forward-declare the HW struct for the compiler */ +struct i40e_hw; + +/* HMC element context information */ + +/* Rx queue context data + * + * The sizes of the variables may be larger than needed due to crossing byte + * boundaries. If we do not have the width of the variable set to the correct + * size then we could end up shifting bits off the top of the variable when the + * variable is at the top of a byte and crosses over into the next byte. + */ +struct i40e_hmc_obj_rxq { + u16 head; + u16 cpuid; /* bigger than needed, see above for reason */ + u64 base; + u16 qlen; +#define I40E_RXQ_CTX_DBUFF_SHIFT 7 + u16 dbuff; /* bigger than needed, see above for reason */ +#define I40E_RXQ_CTX_HBUFF_SHIFT 6 + u16 hbuff; /* bigger than needed, see above for reason */ + u8 dtype; + u8 dsize; + u8 crcstrip; + u8 fc_ena; + u8 l2tsel; + u8 hsplit_0; + u8 hsplit_1; + u8 showiv; + u32 rxmax; /* bigger than needed, see above for reason */ + u8 tphrdesc_ena; + u8 tphwdesc_ena; + u8 tphdata_ena; + u8 tphhead_ena; + u16 lrxqthresh; /* bigger than needed, see above for reason */ + u8 prefena; /* NOTE: normally must be set to 1 at init */ +}; + +/* Tx queue context data +* +* The sizes of the variables may be larger than needed due to crossing byte +* boundaries. If we do not have the width of the variable set to the correct +* size then we could end up shifting bits off the top of the variable when the +* variable is at the top of a byte and crosses over into the next byte. 
+*/ +struct i40e_hmc_obj_txq { + u16 head; + u8 new_context; + u64 base; + u8 fc_ena; + u8 timesync_ena; + u8 fd_ena; + u8 alt_vlan_ena; + u16 thead_wb; + u8 cpuid; + u8 head_wb_ena; + u16 qlen; + u8 tphrdesc_ena; + u8 tphrpacket_ena; + u8 tphwdesc_ena; + u64 head_wb_addr; + u32 crc; + u16 rdylist; + u8 rdylist_act; +}; + +/* for hsplit_0 field of Rx HMC context */ +enum i40e_hmc_obj_rx_hsplit_0 { + I40E_HMC_OBJ_RX_HSPLIT_0_NO_SPLIT = 0, + I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_L2 = 1, + I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_IP = 2, + I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_TCP_UDP = 4, + I40E_HMC_OBJ_RX_HSPLIT_0_SPLIT_SCTP = 8, +}; + +/* fcoe_cntx and fcoe_filt are for debugging purpose only */ +struct i40e_hmc_obj_fcoe_cntx { + u32 rsv[32]; +}; + +struct i40e_hmc_obj_fcoe_filt { + u32 rsv[8]; +}; + +/* Context sizes for LAN objects */ +enum i40e_hmc_lan_object_size { + I40E_HMC_LAN_OBJ_SZ_8 = 0x3, + I40E_HMC_LAN_OBJ_SZ_16 = 0x4, + I40E_HMC_LAN_OBJ_SZ_32 = 0x5, + I40E_HMC_LAN_OBJ_SZ_64 = 0x6, + I40E_HMC_LAN_OBJ_SZ_128 = 0x7, + I40E_HMC_LAN_OBJ_SZ_256 = 0x8, + I40E_HMC_LAN_OBJ_SZ_512 = 0x9, +}; + +#define I40E_HMC_L2OBJ_BASE_ALIGNMENT 512 +#define I40E_HMC_OBJ_SIZE_TXQ 128 +#define I40E_HMC_OBJ_SIZE_RXQ 32 +#define I40E_HMC_OBJ_SIZE_FCOE_CNTX 64 +#define I40E_HMC_OBJ_SIZE_FCOE_FILT 64 + +enum i40e_hmc_lan_rsrc_type { + I40E_HMC_LAN_FULL = 0, + I40E_HMC_LAN_TX = 1, + I40E_HMC_LAN_RX = 2, + I40E_HMC_FCOE_CTX = 3, + I40E_HMC_FCOE_FILT = 4, + I40E_HMC_LAN_MAX = 5 +}; + +enum i40e_hmc_model { + I40E_HMC_MODEL_DIRECT_PREFERRED = 0, + I40E_HMC_MODEL_DIRECT_ONLY = 1, + I40E_HMC_MODEL_PAGED_ONLY = 2, + I40E_HMC_MODEL_UNKNOWN, +}; + +struct i40e_hmc_lan_create_obj_info { + struct i40e_hmc_info *hmc_info; + u32 rsrc_type; + u32 start_idx; + u32 count; + enum i40e_sd_entry_type entry_type; + u64 direct_mode_sz; +}; + +struct i40e_hmc_lan_delete_obj_info { + struct i40e_hmc_info *hmc_info; + u32 rsrc_type; + u32 start_idx; + u32 count; +}; + +i40e_status i40e_init_lan_hmc(struct i40e_hw *hw, u32 txq_num, + u32 rxq_num, u32 fcoe_cntx_num, + u32 fcoe_filt_num); +i40e_status i40e_configure_lan_hmc(struct i40e_hw *hw, + enum i40e_hmc_model model); +i40e_status i40e_shutdown_lan_hmc(struct i40e_hw *hw); + +i40e_status i40e_clear_lan_tx_queue_context(struct i40e_hw *hw, + u16 queue); +i40e_status i40e_set_lan_tx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_txq *s); +i40e_status i40e_clear_lan_rx_queue_context(struct i40e_hw *hw, + u16 queue); +i40e_status i40e_set_lan_rx_queue_context(struct i40e_hw *hw, + u16 queue, + struct i40e_hmc_obj_rxq *s); + +#endif /* _I40E_LAN_HMC_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c new file mode 100644 index 0000000..1622999 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -0,0 +1,14484 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include +#include +#include +#include + +/* Local includes */ +#include "i40e.h" +#include "i40e_diag.h" +#include +/* All i40e tracepoints are defined by the include below, which + * must be included exactly once across the whole kernel with + * CREATE_TRACE_POINTS defined + */ +#define CREATE_TRACE_POINTS +#include "i40e_trace.h" + +const char i40e_driver_name[] = "i40e"; +static const char i40e_driver_string[] = + "Intel(R) Ethernet Connection XL710 Network Driver"; + +#define DRV_KERN "-k" + +#define DRV_VERSION_MAJOR 2 +#define DRV_VERSION_MINOR 3 +#define DRV_VERSION_BUILD 2 +#define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \ + __stringify(DRV_VERSION_MINOR) "." \ + __stringify(DRV_VERSION_BUILD) DRV_KERN +const char i40e_driver_version_str[] = DRV_VERSION; +static const char i40e_copyright[] = "Copyright (c) 2013 - 2014 Intel Corporation."; + +/* a bit of forward declarations */ +static void i40e_vsi_reinit_locked(struct i40e_vsi *vsi); +static void i40e_handle_reset_warning(struct i40e_pf *pf, bool lock_acquired); +static int i40e_add_vsi(struct i40e_vsi *vsi); +static int i40e_add_veb(struct i40e_veb *veb, struct i40e_vsi *vsi); +static int i40e_setup_pf_switch(struct i40e_pf *pf, bool reinit); +static int i40e_setup_misc_vector(struct i40e_pf *pf); +static void i40e_determine_queue_usage(struct i40e_pf *pf); +static int i40e_setup_pf_filter_control(struct i40e_pf *pf); +static void i40e_prep_for_reset(struct i40e_pf *pf, bool lock_acquired); +static int i40e_reset(struct i40e_pf *pf); +static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired); +static void i40e_fdir_sb_setup(struct i40e_pf *pf); +static int i40e_veb_get_bw_info(struct i40e_veb *veb); +static int i40e_get_capabilities(struct i40e_pf *pf, + enum i40e_admin_queue_opc list_type); + + +/* i40e_pci_tbl - PCI Device ID Table + * + * Last entry must be all 0s + * + * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, + * Class, Class Mask, private data (not used) } + */ +static const struct pci_device_id i40e_pci_tbl[] = { + {PCI_VDEVICE(INTEL, I40E_DEV_ID_SFP_XL710), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_QEMU), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_KX_B), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_KX_C), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_QSFP_A), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_QSFP_B), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_QSFP_C), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_BASE_T), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_BASE_T4), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_KX_X722), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_QSFP_X722), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_SFP_X722), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_1G_BASE_T_X722), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_BASE_T_X722), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_SFP_I_X722), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_20G_KR2), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_20G_KR2_A), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_25G_B), 0}, + {PCI_VDEVICE(INTEL, I40E_DEV_ID_25G_SFP28), 0}, + /* required last entry */ + {0, } +}; +MODULE_DEVICE_TABLE(pci, i40e_pci_tbl); + +#define I40E_MAX_VF_COUNT 128 
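For reference, the DRV_VERSION string above is built with two-step macro stringification: the outer macro expands its argument first (so DRV_VERSION_MAJOR becomes 2) and only then turns it into a string literal. A minimal, self-contained userspace sketch of that expansion follows; the __stringify pair mirrors the kernel's include/linux/stringify.h and the numbers are the DRV_VERSION_* values defined above, but the snippet is an illustration only, not code from the patch.

#include <stdio.h>

/* Two-level stringification: __stringify() expands its argument before
 * __stringify_1() turns it into a string literal, so DRV_VERSION_MAJOR
 * becomes "2" rather than the literal text "DRV_VERSION_MAJOR".
 */
#define __stringify_1(x)  #x
#define __stringify(x)    __stringify_1(x)

#define DRV_KERN          "-k"
#define DRV_VERSION_MAJOR 2
#define DRV_VERSION_MINOR 3
#define DRV_VERSION_BUILD 2
#define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
                    __stringify(DRV_VERSION_MINOR) "." \
                    __stringify(DRV_VERSION_BUILD) DRV_KERN

int main(void)
{
	printf("%s\n", DRV_VERSION);	/* prints "2.3.2-k" */
	return 0;
}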
+static int debug = -1; +module_param(debug, uint, 0); +MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all), Debug mask (0x8XXXXXXX)"); + +MODULE_AUTHOR("Intel Corporation, "); +MODULE_DESCRIPTION("Intel(R) Ethernet Connection XL710 Network Driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); + +static struct workqueue_struct *i40e_wq; + +/** + * i40e_allocate_dma_mem_d - OS specific memory alloc for shared code + * @hw: pointer to the HW structure + * @mem: ptr to mem struct to fill out + * @size: size of memory requested + * @alignment: what to align the allocation to + **/ +int i40e_allocate_dma_mem_d(struct i40e_hw *hw, struct i40e_dma_mem *mem, + u64 size, u32 alignment) +{ + struct i40e_pf *pf = (struct i40e_pf *)hw->back; + + mem->size = ALIGN(size, alignment); + mem->va = dma_zalloc_coherent(&pf->pdev->dev, mem->size, + &mem->pa, GFP_KERNEL); + if (!mem->va) + return -ENOMEM; + + return 0; +} + +/** + * i40e_free_dma_mem_d - OS specific memory free for shared code + * @hw: pointer to the HW structure + * @mem: ptr to mem struct to free + **/ +int i40e_free_dma_mem_d(struct i40e_hw *hw, struct i40e_dma_mem *mem) +{ + struct i40e_pf *pf = (struct i40e_pf *)hw->back; + + dma_free_coherent(&pf->pdev->dev, mem->size, mem->va, mem->pa); + mem->va = NULL; + mem->pa = 0; + mem->size = 0; + + return 0; +} + +/** + * i40e_allocate_virt_mem_d - OS specific memory alloc for shared code + * @hw: pointer to the HW structure + * @mem: ptr to mem struct to fill out + * @size: size of memory requested + **/ +int i40e_allocate_virt_mem_d(struct i40e_hw *hw, struct i40e_virt_mem *mem, + u32 size) +{ + mem->size = size; + mem->va = kzalloc(size, GFP_KERNEL); + + if (!mem->va) + return -ENOMEM; + + return 0; +} + +/** + * i40e_free_virt_mem_d - OS specific memory free for shared code + * @hw: pointer to the HW structure + * @mem: ptr to mem struct to free + **/ +int i40e_free_virt_mem_d(struct i40e_hw *hw, struct i40e_virt_mem *mem) +{ + /* it's ok to kfree a NULL pointer */ + kfree(mem->va); + mem->va = NULL; + mem->size = 0; + + return 0; +} + +/** + * i40e_get_lump - find a lump of free generic resource + * @pf: board private structure + * @pile: the pile of resource to search + * @needed: the number of items needed + * @id: an owner id to stick on the items assigned + * + * Returns the base item index of the lump, or negative for error + * + * The search_hint trick and lack of advanced fit-finding only work + * because we're highly likely to have all the same size lump requests. + * Linear search time and any fragmentation should be minimal. + **/ +static int i40e_get_lump(struct i40e_pf *pf, struct i40e_lump_tracking *pile, + u16 needed, u16 id) +{ + int ret = -ENOMEM; + int i, j; + + if (!pile || needed == 0 || id >= I40E_PILE_VALID_BIT) { + dev_info(&pf->pdev->dev, + "param err: pile=%s needed=%d id=0x%04x\n", + pile ? "" : "", needed, id); + return -EINVAL; + } + + /* start the linear search with an imperfect hint */ + i = pile->search_hint; + while (i < pile->num_entries) { + /* skip already allocated entries */ + if (pile->list[i] & I40E_PILE_VALID_BIT) { + i++; + continue; + } + + /* do we have enough in this lump? 
*/ + for (j = 0; (j < needed) && ((i+j) < pile->num_entries); j++) { + if (pile->list[i+j] & I40E_PILE_VALID_BIT) + break; + } + + if (j == needed) { + /* there was enough, so assign it to the requestor */ + for (j = 0; j < needed; j++) + pile->list[i+j] = id | I40E_PILE_VALID_BIT; + ret = i; + pile->search_hint = i + j; + break; + } + + /* not enough, so skip over it and continue looking */ + i += j; + } + + return ret; +} + +/** + * i40e_put_lump - return a lump of generic resource + * @pile: the pile of resource to search + * @index: the base item index + * @id: the owner id of the items assigned + * + * Returns the count of items in the lump + **/ +static int i40e_put_lump(struct i40e_lump_tracking *pile, u16 index, u16 id) +{ + int valid_id = (id | I40E_PILE_VALID_BIT); + int count = 0; + int i; + + if (!pile || index >= pile->num_entries) + return -EINVAL; + + for (i = index; + i < pile->num_entries && pile->list[i] == valid_id; + i++) { + pile->list[i] = 0; + count++; + } + + if (count && index < pile->search_hint) + pile->search_hint = index; + + return count; +} + +/** + * i40e_find_vsi_from_id - searches for the vsi with the given id + * @pf - the pf structure to search for the vsi + * @id - id of the vsi it is searching for + **/ +struct i40e_vsi *i40e_find_vsi_from_id(struct i40e_pf *pf, u16 id) +{ + int i; + + for (i = 0; i < pf->num_alloc_vsi; i++) + if (pf->vsi[i] && (pf->vsi[i]->id == id)) + return pf->vsi[i]; + + return NULL; +} + +/** + * i40e_service_event_schedule - Schedule the service task to wake up + * @pf: board private structure + * + * If not already scheduled, this puts the task into the work queue + **/ +void i40e_service_event_schedule(struct i40e_pf *pf) +{ + if (!test_bit(__I40E_DOWN, pf->state) && + !test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) + queue_work(i40e_wq, &pf->service_task); +} + +/** + * i40e_tx_timeout - Respond to a Tx Hang + * @netdev: network interface device structure + * + * If any port has noticed a Tx timeout, it is likely that the whole + * device is munged, not just the one netdev port, so go for the full + * reset. 
+ **/ +static void i40e_tx_timeout(struct net_device *netdev) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_ring *tx_ring = NULL; + unsigned int i, hung_queue = 0; + u32 head, val; + + pf->tx_timeout_count++; + + /* find the stopped queue the same way the stack does */ + for (i = 0; i < netdev->num_tx_queues; i++) { + struct netdev_queue *q; + unsigned long trans_start; + + q = netdev_get_tx_queue(netdev, i); + trans_start = q->trans_start; + if (netif_xmit_stopped(q) && + time_after(jiffies, + (trans_start + netdev->watchdog_timeo))) { + hung_queue = i; + break; + } + } + + if (i == netdev->num_tx_queues) { + netdev_info(netdev, "tx_timeout: no netdev hung queue found\n"); + } else { + /* now that we have an index, find the tx_ring struct */ + for (i = 0; i < vsi->num_queue_pairs; i++) { + if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) { + if (hung_queue == + vsi->tx_rings[i]->queue_index) { + tx_ring = vsi->tx_rings[i]; + break; + } + } + } + } + + if (time_after(jiffies, (pf->tx_timeout_last_recovery + HZ*20))) + pf->tx_timeout_recovery_level = 1; /* reset after some time */ + else if (time_before(jiffies, + (pf->tx_timeout_last_recovery + netdev->watchdog_timeo))) + return; /* don't do any new action before the next timeout */ + + if (tx_ring) { + head = i40e_get_head(tx_ring); + /* Read interrupt register */ + if (pf->flags & I40E_FLAG_MSIX_ENABLED) + val = rd32(&pf->hw, + I40E_PFINT_DYN_CTLN(tx_ring->q_vector->v_idx + + tx_ring->vsi->base_vector - 1)); + else + val = rd32(&pf->hw, I40E_PFINT_DYN_CTL0); + + netdev_info(netdev, "tx_timeout: VSI_seid: %d, Q %d, NTC: 0x%x, HWB: 0x%x, NTU: 0x%x, TAIL: 0x%x, INT: 0x%x\n", + vsi->seid, hung_queue, tx_ring->next_to_clean, + head, tx_ring->next_to_use, + readl(tx_ring->tail), val); + } + + pf->tx_timeout_last_recovery = jiffies; + netdev_info(netdev, "tx_timeout recovery level %d, hung_queue %d\n", + pf->tx_timeout_recovery_level, hung_queue); + + switch (pf->tx_timeout_recovery_level) { + case 1: + set_bit(__I40E_PF_RESET_REQUESTED, pf->state); + break; + case 2: + set_bit(__I40E_CORE_RESET_REQUESTED, pf->state); + break; + case 3: + set_bit(__I40E_GLOBAL_RESET_REQUESTED, pf->state); + break; + default: + netdev_err(netdev, "tx_timeout recovery unsuccessful\n"); + break; + } + + i40e_service_event_schedule(pf); + pf->tx_timeout_recovery_level++; +} + +/** + * i40e_get_vsi_stats_struct - Get System Network Statistics + * @vsi: the VSI we care about + * + * Returns the address of the device statistics structure. + * The statistics are actually updated from the service task. + **/ +struct rtnl_link_stats64 *i40e_get_vsi_stats_struct(struct i40e_vsi *vsi) +{ + return &vsi->net_stats; +} + +/** + * i40e_get_netdev_stats_struct_tx - populate stats from a Tx ring + * @ring: Tx ring to get statistics from + * @stats: statistics entry to be updated + **/ +static void i40e_get_netdev_stats_struct_tx(struct i40e_ring *ring, + struct rtnl_link_stats64 *stats) +{ + u64 bytes, packets; + unsigned int start; + + do { + start = u64_stats_fetch_begin_irq(&ring->syncp); + packets = ring->stats.packets; + bytes = ring->stats.bytes; + } while (u64_stats_fetch_retry_irq(&ring->syncp, start)); + + stats->tx_packets += packets; + stats->tx_bytes += bytes; +} + +/** + * i40e_get_netdev_stats_struct - Get statistics for netdev interface + * @netdev: network interface device structure + * + * Returns the address of the device statistics structure. 
+ * The statistics are actually updated from the service task. + **/ +static void i40e_get_netdev_stats_struct(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_ring *tx_ring, *rx_ring; + struct i40e_vsi *vsi = np->vsi; + struct rtnl_link_stats64 *vsi_stats = i40e_get_vsi_stats_struct(vsi); + int i; + + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return; + + if (!vsi->tx_rings) + return; + + rcu_read_lock(); + for (i = 0; i < vsi->num_queue_pairs; i++) { + u64 bytes, packets; + unsigned int start; + + tx_ring = READ_ONCE(vsi->tx_rings[i]); + if (!tx_ring) + continue; + i40e_get_netdev_stats_struct_tx(tx_ring, stats); + + rx_ring = &tx_ring[1]; + + do { + start = u64_stats_fetch_begin_irq(&rx_ring->syncp); + packets = rx_ring->stats.packets; + bytes = rx_ring->stats.bytes; + } while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start)); + + stats->rx_packets += packets; + stats->rx_bytes += bytes; + + if (i40e_enabled_xdp_vsi(vsi)) + i40e_get_netdev_stats_struct_tx(&rx_ring[1], stats); + } + rcu_read_unlock(); + + /* following stats updated by i40e_watchdog_subtask() */ + stats->multicast = vsi_stats->multicast; + stats->tx_errors = vsi_stats->tx_errors; + stats->tx_dropped = vsi_stats->tx_dropped; + stats->rx_errors = vsi_stats->rx_errors; + stats->rx_dropped = vsi_stats->rx_dropped; + stats->rx_crc_errors = vsi_stats->rx_crc_errors; + stats->rx_length_errors = vsi_stats->rx_length_errors; +} + +/** + * i40e_vsi_reset_stats - Resets all stats of the given vsi + * @vsi: the VSI to have its stats reset + **/ +void i40e_vsi_reset_stats(struct i40e_vsi *vsi) +{ + struct rtnl_link_stats64 *ns; + int i; + + if (!vsi) + return; + + ns = i40e_get_vsi_stats_struct(vsi); + memset(ns, 0, sizeof(*ns)); + memset(&vsi->net_stats_offsets, 0, sizeof(vsi->net_stats_offsets)); + memset(&vsi->eth_stats, 0, sizeof(vsi->eth_stats)); + memset(&vsi->eth_stats_offsets, 0, sizeof(vsi->eth_stats_offsets)); + if (vsi->rx_rings && vsi->rx_rings[0]) { + for (i = 0; i < vsi->num_queue_pairs; i++) { + memset(&vsi->rx_rings[i]->stats, 0, + sizeof(vsi->rx_rings[i]->stats)); + memset(&vsi->rx_rings[i]->rx_stats, 0, + sizeof(vsi->rx_rings[i]->rx_stats)); + memset(&vsi->tx_rings[i]->stats, 0, + sizeof(vsi->tx_rings[i]->stats)); + memset(&vsi->tx_rings[i]->tx_stats, 0, + sizeof(vsi->tx_rings[i]->tx_stats)); + } + } + vsi->stat_offsets_loaded = false; +} + +/** + * i40e_pf_reset_stats - Reset all of the stats for the given PF + * @pf: the PF to be reset + **/ +void i40e_pf_reset_stats(struct i40e_pf *pf) +{ + int i; + + memset(&pf->stats, 0, sizeof(pf->stats)); + memset(&pf->stats_offsets, 0, sizeof(pf->stats_offsets)); + pf->stat_offsets_loaded = false; + + for (i = 0; i < I40E_MAX_VEB; i++) { + if (pf->veb[i]) { + memset(&pf->veb[i]->stats, 0, + sizeof(pf->veb[i]->stats)); + memset(&pf->veb[i]->stats_offsets, 0, + sizeof(pf->veb[i]->stats_offsets)); + pf->veb[i]->stat_offsets_loaded = false; + } + } + pf->hw_csum_rx_error = 0; +} + +/** + * i40e_stat_update48 - read and update a 48 bit stat from the chip + * @hw: ptr to the hardware info + * @hireg: the high 32 bit reg to read + * @loreg: the low 32 bit reg to read + * @offset_loaded: has the initial offset been loaded yet + * @offset: ptr to current offset value + * @stat: ptr to the stat + * + * Since the device stats are not reset at PFReset, they likely will not + * be zeroed when the driver starts. 
We'll save the first values read + * and use them as offsets to be subtracted from the raw values in order + * to report stats that count from zero. In the process, we also manage + * the potential roll-over. + **/ +static void i40e_stat_update48(struct i40e_hw *hw, u32 hireg, u32 loreg, + bool offset_loaded, u64 *offset, u64 *stat) +{ + u64 new_data; + + if (hw->device_id == I40E_DEV_ID_QEMU) { + new_data = rd32(hw, loreg); + new_data |= ((u64)(rd32(hw, hireg) & 0xFFFF)) << 32; + } else { + new_data = rd64(hw, loreg); + } + if (!offset_loaded) + *offset = new_data; + if (likely(new_data >= *offset)) + *stat = new_data - *offset; + else + *stat = (new_data + BIT_ULL(48)) - *offset; + *stat &= 0xFFFFFFFFFFFFULL; +} + +/** + * i40e_stat_update32 - read and update a 32 bit stat from the chip + * @hw: ptr to the hardware info + * @reg: the hw reg to read + * @offset_loaded: has the initial offset been loaded yet + * @offset: ptr to current offset value + * @stat: ptr to the stat + **/ +static void i40e_stat_update32(struct i40e_hw *hw, u32 reg, + bool offset_loaded, u64 *offset, u64 *stat) +{ + u32 new_data; + + new_data = rd32(hw, reg); + if (!offset_loaded) + *offset = new_data; + if (likely(new_data >= *offset)) + *stat = (u32)(new_data - *offset); + else + *stat = (u32)((new_data + BIT_ULL(32)) - *offset); +} + +/** + * i40e_stat_update_and_clear32 - read and clear hw reg, update a 32 bit stat + * @hw: ptr to the hardware info + * @reg: the hw reg to read and clear + * @stat: ptr to the stat + **/ +static void i40e_stat_update_and_clear32(struct i40e_hw *hw, u32 reg, u64 *stat) +{ + u32 new_data = rd32(hw, reg); + + wr32(hw, reg, 1); /* must write a nonzero value to clear register */ + *stat += new_data; +} + +/** + * i40e_update_eth_stats - Update VSI-specific ethernet statistics counters. 
+ * @vsi: the VSI to be updated + **/ +void i40e_update_eth_stats(struct i40e_vsi *vsi) +{ + int stat_idx = le16_to_cpu(vsi->info.stat_counter_idx); + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + struct i40e_eth_stats *oes; + struct i40e_eth_stats *es; /* device's eth stats */ + + es = &vsi->eth_stats; + oes = &vsi->eth_stats_offsets; + + /* Gather up the stats that the hw collects */ + i40e_stat_update32(hw, I40E_GLV_TEPC(stat_idx), + vsi->stat_offsets_loaded, + &oes->tx_errors, &es->tx_errors); + i40e_stat_update32(hw, I40E_GLV_RDPC(stat_idx), + vsi->stat_offsets_loaded, + &oes->rx_discards, &es->rx_discards); + i40e_stat_update32(hw, I40E_GLV_RUPP(stat_idx), + vsi->stat_offsets_loaded, + &oes->rx_unknown_protocol, &es->rx_unknown_protocol); + i40e_stat_update32(hw, I40E_GLV_TEPC(stat_idx), + vsi->stat_offsets_loaded, + &oes->tx_errors, &es->tx_errors); + + i40e_stat_update48(hw, I40E_GLV_GORCH(stat_idx), + I40E_GLV_GORCL(stat_idx), + vsi->stat_offsets_loaded, + &oes->rx_bytes, &es->rx_bytes); + i40e_stat_update48(hw, I40E_GLV_UPRCH(stat_idx), + I40E_GLV_UPRCL(stat_idx), + vsi->stat_offsets_loaded, + &oes->rx_unicast, &es->rx_unicast); + i40e_stat_update48(hw, I40E_GLV_MPRCH(stat_idx), + I40E_GLV_MPRCL(stat_idx), + vsi->stat_offsets_loaded, + &oes->rx_multicast, &es->rx_multicast); + i40e_stat_update48(hw, I40E_GLV_BPRCH(stat_idx), + I40E_GLV_BPRCL(stat_idx), + vsi->stat_offsets_loaded, + &oes->rx_broadcast, &es->rx_broadcast); + + i40e_stat_update48(hw, I40E_GLV_GOTCH(stat_idx), + I40E_GLV_GOTCL(stat_idx), + vsi->stat_offsets_loaded, + &oes->tx_bytes, &es->tx_bytes); + i40e_stat_update48(hw, I40E_GLV_UPTCH(stat_idx), + I40E_GLV_UPTCL(stat_idx), + vsi->stat_offsets_loaded, + &oes->tx_unicast, &es->tx_unicast); + i40e_stat_update48(hw, I40E_GLV_MPTCH(stat_idx), + I40E_GLV_MPTCL(stat_idx), + vsi->stat_offsets_loaded, + &oes->tx_multicast, &es->tx_multicast); + i40e_stat_update48(hw, I40E_GLV_BPTCH(stat_idx), + I40E_GLV_BPTCL(stat_idx), + vsi->stat_offsets_loaded, + &oes->tx_broadcast, &es->tx_broadcast); + vsi->stat_offsets_loaded = true; +} + +/** + * i40e_update_veb_stats - Update Switch component statistics + * @veb: the VEB being updated + **/ +static void i40e_update_veb_stats(struct i40e_veb *veb) +{ + struct i40e_pf *pf = veb->pf; + struct i40e_hw *hw = &pf->hw; + struct i40e_eth_stats *oes; + struct i40e_eth_stats *es; /* device's eth stats */ + struct i40e_veb_tc_stats *veb_oes; + struct i40e_veb_tc_stats *veb_es; + int i, idx = 0; + + idx = veb->stats_idx; + es = &veb->stats; + oes = &veb->stats_offsets; + veb_es = &veb->tc_stats; + veb_oes = &veb->tc_stats_offsets; + + /* Gather up the stats that the hw collects */ + i40e_stat_update32(hw, I40E_GLSW_TDPC(idx), + veb->stat_offsets_loaded, + &oes->tx_discards, &es->tx_discards); + if (hw->revision_id > 0) + i40e_stat_update32(hw, I40E_GLSW_RUPP(idx), + veb->stat_offsets_loaded, + &oes->rx_unknown_protocol, + &es->rx_unknown_protocol); + i40e_stat_update48(hw, I40E_GLSW_GORCH(idx), I40E_GLSW_GORCL(idx), + veb->stat_offsets_loaded, + &oes->rx_bytes, &es->rx_bytes); + i40e_stat_update48(hw, I40E_GLSW_UPRCH(idx), I40E_GLSW_UPRCL(idx), + veb->stat_offsets_loaded, + &oes->rx_unicast, &es->rx_unicast); + i40e_stat_update48(hw, I40E_GLSW_MPRCH(idx), I40E_GLSW_MPRCL(idx), + veb->stat_offsets_loaded, + &oes->rx_multicast, &es->rx_multicast); + i40e_stat_update48(hw, I40E_GLSW_BPRCH(idx), I40E_GLSW_BPRCL(idx), + veb->stat_offsets_loaded, + &oes->rx_broadcast, &es->rx_broadcast); + + i40e_stat_update48(hw, 
I40E_GLSW_GOTCH(idx), I40E_GLSW_GOTCL(idx), + veb->stat_offsets_loaded, + &oes->tx_bytes, &es->tx_bytes); + i40e_stat_update48(hw, I40E_GLSW_UPTCH(idx), I40E_GLSW_UPTCL(idx), + veb->stat_offsets_loaded, + &oes->tx_unicast, &es->tx_unicast); + i40e_stat_update48(hw, I40E_GLSW_MPTCH(idx), I40E_GLSW_MPTCL(idx), + veb->stat_offsets_loaded, + &oes->tx_multicast, &es->tx_multicast); + i40e_stat_update48(hw, I40E_GLSW_BPTCH(idx), I40E_GLSW_BPTCL(idx), + veb->stat_offsets_loaded, + &oes->tx_broadcast, &es->tx_broadcast); + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + i40e_stat_update48(hw, I40E_GLVEBTC_RPCH(i, idx), + I40E_GLVEBTC_RPCL(i, idx), + veb->stat_offsets_loaded, + &veb_oes->tc_rx_packets[i], + &veb_es->tc_rx_packets[i]); + i40e_stat_update48(hw, I40E_GLVEBTC_RBCH(i, idx), + I40E_GLVEBTC_RBCL(i, idx), + veb->stat_offsets_loaded, + &veb_oes->tc_rx_bytes[i], + &veb_es->tc_rx_bytes[i]); + i40e_stat_update48(hw, I40E_GLVEBTC_TPCH(i, idx), + I40E_GLVEBTC_TPCL(i, idx), + veb->stat_offsets_loaded, + &veb_oes->tc_tx_packets[i], + &veb_es->tc_tx_packets[i]); + i40e_stat_update48(hw, I40E_GLVEBTC_TBCH(i, idx), + I40E_GLVEBTC_TBCL(i, idx), + veb->stat_offsets_loaded, + &veb_oes->tc_tx_bytes[i], + &veb_es->tc_tx_bytes[i]); + } + veb->stat_offsets_loaded = true; +} + +/** + * i40e_update_vsi_stats - Update the vsi statistics counters. + * @vsi: the VSI to be updated + * + * There are a few instances where we store the same stat in a + * couple of different structs. This is partly because we have + * the netdev stats that need to be filled out, which is slightly + * different from the "eth_stats" defined by the chip and used in + * VF communications. We sort it out here. + **/ +static void i40e_update_vsi_stats(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + struct rtnl_link_stats64 *ons; + struct rtnl_link_stats64 *ns; /* netdev stats */ + struct i40e_eth_stats *oes; + struct i40e_eth_stats *es; /* device's eth stats */ + u32 tx_restart, tx_busy; + struct i40e_ring *p; + u32 rx_page, rx_buf; + u64 bytes, packets; + unsigned int start; + u64 tx_linearize; + u64 tx_force_wb; + u64 rx_p, rx_b; + u64 tx_p, tx_b; + u16 q; + + if (test_bit(__I40E_VSI_DOWN, vsi->state) || + test_bit(__I40E_CONFIG_BUSY, pf->state)) + return; + + ns = i40e_get_vsi_stats_struct(vsi); + ons = &vsi->net_stats_offsets; + es = &vsi->eth_stats; + oes = &vsi->eth_stats_offsets; + + /* Gather up the netdev and vsi stats that the driver collects + * on the fly during packet processing + */ + rx_b = rx_p = 0; + tx_b = tx_p = 0; + tx_restart = tx_busy = tx_linearize = tx_force_wb = 0; + rx_page = 0; + rx_buf = 0; + rcu_read_lock(); + for (q = 0; q < vsi->num_queue_pairs; q++) { + /* locate Tx ring */ + p = READ_ONCE(vsi->tx_rings[q]); + + do { + start = u64_stats_fetch_begin_irq(&p->syncp); + packets = p->stats.packets; + bytes = p->stats.bytes; + } while (u64_stats_fetch_retry_irq(&p->syncp, start)); + tx_b += bytes; + tx_p += packets; + tx_restart += p->tx_stats.restart_queue; + tx_busy += p->tx_stats.tx_busy; + tx_linearize += p->tx_stats.tx_linearize; + tx_force_wb += p->tx_stats.tx_force_wb; + + /* Rx queue is part of the same block as Tx queue */ + p = &p[1]; + do { + start = u64_stats_fetch_begin_irq(&p->syncp); + packets = p->stats.packets; + bytes = p->stats.bytes; + } while (u64_stats_fetch_retry_irq(&p->syncp, start)); + rx_b += bytes; + rx_p += packets; + rx_buf += p->rx_stats.alloc_buff_failed; + rx_page += p->rx_stats.alloc_page_failed; + } + rcu_read_unlock(); + vsi->tx_restart = tx_restart; + vsi->tx_busy 
= tx_busy; + vsi->tx_linearize = tx_linearize; + vsi->tx_force_wb = tx_force_wb; + vsi->rx_page_failed = rx_page; + vsi->rx_buf_failed = rx_buf; + + ns->rx_packets = rx_p; + ns->rx_bytes = rx_b; + ns->tx_packets = tx_p; + ns->tx_bytes = tx_b; + + /* update netdev stats from eth stats */ + i40e_update_eth_stats(vsi); + ons->tx_errors = oes->tx_errors; + ns->tx_errors = es->tx_errors; + ons->multicast = oes->rx_multicast; + ns->multicast = es->rx_multicast; + ons->rx_dropped = oes->rx_discards; + ns->rx_dropped = es->rx_discards; + ons->tx_dropped = oes->tx_discards; + ns->tx_dropped = es->tx_discards; + + /* pull in a couple PF stats if this is the main vsi */ + if (vsi == pf->vsi[pf->lan_vsi]) { + ns->rx_crc_errors = pf->stats.crc_errors; + ns->rx_errors = pf->stats.crc_errors + pf->stats.illegal_bytes; + ns->rx_length_errors = pf->stats.rx_length_errors; + } +} + +/** + * i40e_update_pf_stats - Update the PF statistics counters. + * @pf: the PF to be updated + **/ +static void i40e_update_pf_stats(struct i40e_pf *pf) +{ + struct i40e_hw_port_stats *osd = &pf->stats_offsets; + struct i40e_hw_port_stats *nsd = &pf->stats; + struct i40e_hw *hw = &pf->hw; + u32 val; + int i; + + i40e_stat_update48(hw, I40E_GLPRT_GORCH(hw->port), + I40E_GLPRT_GORCL(hw->port), + pf->stat_offsets_loaded, + &osd->eth.rx_bytes, &nsd->eth.rx_bytes); + i40e_stat_update48(hw, I40E_GLPRT_GOTCH(hw->port), + I40E_GLPRT_GOTCL(hw->port), + pf->stat_offsets_loaded, + &osd->eth.tx_bytes, &nsd->eth.tx_bytes); + i40e_stat_update32(hw, I40E_GLPRT_RDPC(hw->port), + pf->stat_offsets_loaded, + &osd->eth.rx_discards, + &nsd->eth.rx_discards); + i40e_stat_update48(hw, I40E_GLPRT_UPRCH(hw->port), + I40E_GLPRT_UPRCL(hw->port), + pf->stat_offsets_loaded, + &osd->eth.rx_unicast, + &nsd->eth.rx_unicast); + i40e_stat_update48(hw, I40E_GLPRT_MPRCH(hw->port), + I40E_GLPRT_MPRCL(hw->port), + pf->stat_offsets_loaded, + &osd->eth.rx_multicast, + &nsd->eth.rx_multicast); + i40e_stat_update48(hw, I40E_GLPRT_BPRCH(hw->port), + I40E_GLPRT_BPRCL(hw->port), + pf->stat_offsets_loaded, + &osd->eth.rx_broadcast, + &nsd->eth.rx_broadcast); + i40e_stat_update48(hw, I40E_GLPRT_UPTCH(hw->port), + I40E_GLPRT_UPTCL(hw->port), + pf->stat_offsets_loaded, + &osd->eth.tx_unicast, + &nsd->eth.tx_unicast); + i40e_stat_update48(hw, I40E_GLPRT_MPTCH(hw->port), + I40E_GLPRT_MPTCL(hw->port), + pf->stat_offsets_loaded, + &osd->eth.tx_multicast, + &nsd->eth.tx_multicast); + i40e_stat_update48(hw, I40E_GLPRT_BPTCH(hw->port), + I40E_GLPRT_BPTCL(hw->port), + pf->stat_offsets_loaded, + &osd->eth.tx_broadcast, + &nsd->eth.tx_broadcast); + + i40e_stat_update32(hw, I40E_GLPRT_TDOLD(hw->port), + pf->stat_offsets_loaded, + &osd->tx_dropped_link_down, + &nsd->tx_dropped_link_down); + + i40e_stat_update32(hw, I40E_GLPRT_CRCERRS(hw->port), + pf->stat_offsets_loaded, + &osd->crc_errors, &nsd->crc_errors); + + i40e_stat_update32(hw, I40E_GLPRT_ILLERRC(hw->port), + pf->stat_offsets_loaded, + &osd->illegal_bytes, &nsd->illegal_bytes); + + i40e_stat_update32(hw, I40E_GLPRT_MLFC(hw->port), + pf->stat_offsets_loaded, + &osd->mac_local_faults, + &nsd->mac_local_faults); + i40e_stat_update32(hw, I40E_GLPRT_MRFC(hw->port), + pf->stat_offsets_loaded, + &osd->mac_remote_faults, + &nsd->mac_remote_faults); + + i40e_stat_update32(hw, I40E_GLPRT_RLEC(hw->port), + pf->stat_offsets_loaded, + &osd->rx_length_errors, + &nsd->rx_length_errors); + + i40e_stat_update32(hw, I40E_GLPRT_LXONRXC(hw->port), + pf->stat_offsets_loaded, + &osd->link_xon_rx, &nsd->link_xon_rx); + i40e_stat_update32(hw, 
I40E_GLPRT_LXONTXC(hw->port), + pf->stat_offsets_loaded, + &osd->link_xon_tx, &nsd->link_xon_tx); + i40e_stat_update32(hw, I40E_GLPRT_LXOFFRXC(hw->port), + pf->stat_offsets_loaded, + &osd->link_xoff_rx, &nsd->link_xoff_rx); + i40e_stat_update32(hw, I40E_GLPRT_LXOFFTXC(hw->port), + pf->stat_offsets_loaded, + &osd->link_xoff_tx, &nsd->link_xoff_tx); + + for (i = 0; i < 8; i++) { + i40e_stat_update32(hw, I40E_GLPRT_PXOFFRXC(hw->port, i), + pf->stat_offsets_loaded, + &osd->priority_xoff_rx[i], + &nsd->priority_xoff_rx[i]); + i40e_stat_update32(hw, I40E_GLPRT_PXONRXC(hw->port, i), + pf->stat_offsets_loaded, + &osd->priority_xon_rx[i], + &nsd->priority_xon_rx[i]); + i40e_stat_update32(hw, I40E_GLPRT_PXONTXC(hw->port, i), + pf->stat_offsets_loaded, + &osd->priority_xon_tx[i], + &nsd->priority_xon_tx[i]); + i40e_stat_update32(hw, I40E_GLPRT_PXOFFTXC(hw->port, i), + pf->stat_offsets_loaded, + &osd->priority_xoff_tx[i], + &nsd->priority_xoff_tx[i]); + i40e_stat_update32(hw, + I40E_GLPRT_RXON2OFFCNT(hw->port, i), + pf->stat_offsets_loaded, + &osd->priority_xon_2_xoff[i], + &nsd->priority_xon_2_xoff[i]); + } + + i40e_stat_update48(hw, I40E_GLPRT_PRC64H(hw->port), + I40E_GLPRT_PRC64L(hw->port), + pf->stat_offsets_loaded, + &osd->rx_size_64, &nsd->rx_size_64); + i40e_stat_update48(hw, I40E_GLPRT_PRC127H(hw->port), + I40E_GLPRT_PRC127L(hw->port), + pf->stat_offsets_loaded, + &osd->rx_size_127, &nsd->rx_size_127); + i40e_stat_update48(hw, I40E_GLPRT_PRC255H(hw->port), + I40E_GLPRT_PRC255L(hw->port), + pf->stat_offsets_loaded, + &osd->rx_size_255, &nsd->rx_size_255); + i40e_stat_update48(hw, I40E_GLPRT_PRC511H(hw->port), + I40E_GLPRT_PRC511L(hw->port), + pf->stat_offsets_loaded, + &osd->rx_size_511, &nsd->rx_size_511); + i40e_stat_update48(hw, I40E_GLPRT_PRC1023H(hw->port), + I40E_GLPRT_PRC1023L(hw->port), + pf->stat_offsets_loaded, + &osd->rx_size_1023, &nsd->rx_size_1023); + i40e_stat_update48(hw, I40E_GLPRT_PRC1522H(hw->port), + I40E_GLPRT_PRC1522L(hw->port), + pf->stat_offsets_loaded, + &osd->rx_size_1522, &nsd->rx_size_1522); + i40e_stat_update48(hw, I40E_GLPRT_PRC9522H(hw->port), + I40E_GLPRT_PRC9522L(hw->port), + pf->stat_offsets_loaded, + &osd->rx_size_big, &nsd->rx_size_big); + + i40e_stat_update48(hw, I40E_GLPRT_PTC64H(hw->port), + I40E_GLPRT_PTC64L(hw->port), + pf->stat_offsets_loaded, + &osd->tx_size_64, &nsd->tx_size_64); + i40e_stat_update48(hw, I40E_GLPRT_PTC127H(hw->port), + I40E_GLPRT_PTC127L(hw->port), + pf->stat_offsets_loaded, + &osd->tx_size_127, &nsd->tx_size_127); + i40e_stat_update48(hw, I40E_GLPRT_PTC255H(hw->port), + I40E_GLPRT_PTC255L(hw->port), + pf->stat_offsets_loaded, + &osd->tx_size_255, &nsd->tx_size_255); + i40e_stat_update48(hw, I40E_GLPRT_PTC511H(hw->port), + I40E_GLPRT_PTC511L(hw->port), + pf->stat_offsets_loaded, + &osd->tx_size_511, &nsd->tx_size_511); + i40e_stat_update48(hw, I40E_GLPRT_PTC1023H(hw->port), + I40E_GLPRT_PTC1023L(hw->port), + pf->stat_offsets_loaded, + &osd->tx_size_1023, &nsd->tx_size_1023); + i40e_stat_update48(hw, I40E_GLPRT_PTC1522H(hw->port), + I40E_GLPRT_PTC1522L(hw->port), + pf->stat_offsets_loaded, + &osd->tx_size_1522, &nsd->tx_size_1522); + i40e_stat_update48(hw, I40E_GLPRT_PTC9522H(hw->port), + I40E_GLPRT_PTC9522L(hw->port), + pf->stat_offsets_loaded, + &osd->tx_size_big, &nsd->tx_size_big); + + i40e_stat_update32(hw, I40E_GLPRT_RUC(hw->port), + pf->stat_offsets_loaded, + &osd->rx_undersize, &nsd->rx_undersize); + i40e_stat_update32(hw, I40E_GLPRT_RFC(hw->port), + pf->stat_offsets_loaded, + &osd->rx_fragments, &nsd->rx_fragments); + 
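+ /* Note: the i40e_stat_update{32,48}() helpers latch the current
+  * hardware value as a baseline on the first pass (while
+  * stat_offsets_loaded is still false) and report each counter
+  * relative to that baseline afterwards, handling 32/48-bit
+  * register wrap (see i40e_stat_update48()).
+  */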
i40e_stat_update32(hw, I40E_GLPRT_ROC(hw->port), + pf->stat_offsets_loaded, + &osd->rx_oversize, &nsd->rx_oversize); + i40e_stat_update32(hw, I40E_GLPRT_RJC(hw->port), + pf->stat_offsets_loaded, + &osd->rx_jabber, &nsd->rx_jabber); + + /* FDIR stats */ + i40e_stat_update_and_clear32(hw, + I40E_GLQF_PCNT(I40E_FD_ATR_STAT_IDX(hw->pf_id)), + &nsd->fd_atr_match); + i40e_stat_update_and_clear32(hw, + I40E_GLQF_PCNT(I40E_FD_SB_STAT_IDX(hw->pf_id)), + &nsd->fd_sb_match); + i40e_stat_update_and_clear32(hw, + I40E_GLQF_PCNT(I40E_FD_ATR_TUNNEL_STAT_IDX(hw->pf_id)), + &nsd->fd_atr_tunnel_match); + + val = rd32(hw, I40E_PRTPM_EEE_STAT); + nsd->tx_lpi_status = + (val & I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_MASK) >> + I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_SHIFT; + nsd->rx_lpi_status = + (val & I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_MASK) >> + I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_SHIFT; + i40e_stat_update32(hw, I40E_PRTPM_TLPIC, + pf->stat_offsets_loaded, + &osd->tx_lpi_count, &nsd->tx_lpi_count); + i40e_stat_update32(hw, I40E_PRTPM_RLPIC, + pf->stat_offsets_loaded, + &osd->rx_lpi_count, &nsd->rx_lpi_count); + + if (pf->flags & I40E_FLAG_FD_SB_ENABLED && + !test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) + nsd->fd_sb_status = true; + else + nsd->fd_sb_status = false; + + if (pf->flags & I40E_FLAG_FD_ATR_ENABLED && + !test_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state)) + nsd->fd_atr_status = true; + else + nsd->fd_atr_status = false; + + pf->stat_offsets_loaded = true; +} + +/** + * i40e_update_stats - Update the various statistics counters. + * @vsi: the VSI to be updated + * + * Update the various stats for this VSI and its related entities. + **/ +void i40e_update_stats(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + + if (vsi == pf->vsi[pf->lan_vsi]) + i40e_update_pf_stats(pf); + + i40e_update_vsi_stats(vsi); +} + +/** + * i40e_find_filter - Search VSI filter list for specific mac/vlan filter + * @vsi: the VSI to be searched + * @macaddr: the MAC address + * @vlan: the vlan + * + * Returns ptr to the filter object or NULL + **/ +static struct i40e_mac_filter *i40e_find_filter(struct i40e_vsi *vsi, + const u8 *macaddr, s16 vlan) +{ + struct i40e_mac_filter *f; + u64 key; + + if (!vsi || !macaddr) + return NULL; + + key = i40e_addr_to_hkey(macaddr); + hash_for_each_possible(vsi->mac_filter_hash, f, hlist, key) { + if ((ether_addr_equal(macaddr, f->macaddr)) && + (vlan == f->vlan)) + return f; + } + return NULL; +} + +/** + * i40e_find_mac - Find a mac addr in the macvlan filters list + * @vsi: the VSI to be searched + * @macaddr: the MAC address we are searching for + * + * Returns the first filter with the provided MAC address or NULL if + * MAC address was not found + **/ +struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, const u8 *macaddr) +{ + struct i40e_mac_filter *f; + u64 key; + + if (!vsi || !macaddr) + return NULL; + + key = i40e_addr_to_hkey(macaddr); + hash_for_each_possible(vsi->mac_filter_hash, f, hlist, key) { + if ((ether_addr_equal(macaddr, f->macaddr))) + return f; + } + return NULL; +} + +/** + * i40e_is_vsi_in_vlan - Check if VSI is in vlan mode + * @vsi: the VSI to be searched + * + * Returns true if VSI is in vlan mode or false otherwise + **/ +bool i40e_is_vsi_in_vlan(struct i40e_vsi *vsi) +{ + /* If we have a PVID, always operate in VLAN mode */ + if (vsi->info.pvid) + return true; + + /* We need to operate in VLAN mode whenever we have any filters with + * a VLAN other than I40E_VLAN_ALL. We could check the table each + * time, incurring search cost repeatedly. 
However, we can notice two + * things: + * + * 1) the only place where we can gain a VLAN filter is in + * i40e_add_filter. + * + * 2) the only place where filters are actually removed is in + * i40e_sync_filters_subtask. + * + * Thus, we can simply use a boolean value, has_vlan_filters which we + * will set to true when we add a VLAN filter in i40e_add_filter. Then + * we have to perform the full search after deleting filters in + * i40e_sync_filters_subtask, but we already have to search + * filters here and can perform the check at the same time. This + * results in avoiding embedding a loop for VLAN mode inside another + * loop over all the filters, and should maintain correctness as noted + * above. + */ + return vsi->has_vlan_filter; +} + +/** + * i40e_correct_mac_vlan_filters - Correct non-VLAN filters if necessary + * @vsi: the VSI to configure + * @tmp_add_list: list of filters ready to be added + * @tmp_del_list: list of filters ready to be deleted + * @vlan_filters: the number of active VLAN filters + * + * Update VLAN=0 and VLAN=-1 (I40E_VLAN_ANY) filters properly so that they + * behave as expected. If we have any active VLAN filters remaining or about + * to be added then we need to update non-VLAN filters to be marked as VLAN=0 + * so that they only match against untagged traffic. If we no longer have any + * active VLAN filters, we need to make all non-VLAN filters marked as VLAN=-1 + * so that they match against both tagged and untagged traffic. In this way, + * we ensure that we correctly receive the desired traffic. This ensures that + * when we have an active VLAN we will receive only untagged traffic and + * traffic matching active VLANs. If we have no active VLANs then we will + * operate in non-VLAN mode and receive all traffic, tagged or untagged. + * + * Finally, in a similar fashion, this function also corrects filters when + * there is an active PVID assigned to this VSI. + * + * In case of memory allocation failure return -ENOMEM. Otherwise, return 0. + * + * This function is only expected to be called from within + * i40e_sync_vsi_filters. + * + * NOTE: This function expects to be called while under the + * mac_filter_hash_lock + */ +static int i40e_correct_mac_vlan_filters(struct i40e_vsi *vsi, + struct hlist_head *tmp_add_list, + struct hlist_head *tmp_del_list, + int vlan_filters) +{ + s16 pvid = le16_to_cpu(vsi->info.pvid); + struct i40e_mac_filter *f, *add_head; + struct i40e_new_mac_filter *new; + struct hlist_node *h; + int bkt, new_vlan; + + /* To determine if a particular filter needs to be replaced we + * have the three following conditions: + * + * a) if we have a PVID assigned, then all filters which are + * not marked as VLAN=PVID must be replaced with filters that + * are. 
+ * b) otherwise, if we have any active VLANS, all filters + * which are marked as VLAN=-1 must be replaced with + * filters marked as VLAN=0 + * c) finally, if we do not have any active VLANS, all filters + * which are marked as VLAN=0 must be replaced with filters + * marked as VLAN=-1 + */ + + /* Update the filters about to be added in place */ + hlist_for_each_entry(new, tmp_add_list, hlist) { + if (pvid && new->f->vlan != pvid) + new->f->vlan = pvid; + else if (vlan_filters && new->f->vlan == I40E_VLAN_ANY) + new->f->vlan = 0; + else if (!vlan_filters && new->f->vlan == 0) + new->f->vlan = I40E_VLAN_ANY; + } + + /* Update the remaining active filters */ + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + /* Combine the checks for whether a filter needs to be changed + * and then determine the new VLAN inside the if block, in + * order to avoid duplicating code for adding the new filter + * then deleting the old filter. + */ + if ((pvid && f->vlan != pvid) || + (vlan_filters && f->vlan == I40E_VLAN_ANY) || + (!vlan_filters && f->vlan == 0)) { + /* Determine the new vlan we will be adding */ + if (pvid) + new_vlan = pvid; + else if (vlan_filters) + new_vlan = 0; + else + new_vlan = I40E_VLAN_ANY; + + /* Create the new filter */ + add_head = i40e_add_filter(vsi, f->macaddr, new_vlan); + if (!add_head) + return -ENOMEM; + + /* Create a temporary i40e_new_mac_filter */ + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + return -ENOMEM; + + new->f = add_head; + new->state = add_head->state; + + /* Add the new filter to the tmp list */ + hlist_add_head(&new->hlist, tmp_add_list); + + /* Put the original filter into the delete list */ + f->state = I40E_FILTER_REMOVE; + hash_del(&f->hlist); + hlist_add_head(&f->hlist, tmp_del_list); + } + } + + vsi->has_vlan_filter = !!vlan_filters; + + return 0; +} + +/** + * i40e_rm_default_mac_filter - Remove the default MAC filter set by NVM + * @vsi: the PF Main VSI - inappropriate for any other VSI + * @macaddr: the MAC address + * + * Remove whatever filter the firmware set up so the driver can manage + * its own filtering intelligently. + **/ +static void i40e_rm_default_mac_filter(struct i40e_vsi *vsi, u8 *macaddr) +{ + struct i40e_aqc_remove_macvlan_element_data element; + struct i40e_pf *pf = vsi->back; + + /* Only appropriate for the PF main VSI */ + if (vsi->type != I40E_VSI_MAIN) + return; + + memset(&element, 0, sizeof(element)); + ether_addr_copy(element.mac_addr, macaddr); + element.vlan_tag = 0; + /* Ignore error returns, some firmware does it this way... */ + element.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH; + i40e_aq_remove_macvlan(&pf->hw, vsi->seid, &element, 1, NULL); + + memset(&element, 0, sizeof(element)); + ether_addr_copy(element.mac_addr, macaddr); + element.vlan_tag = 0; + /* ...and some firmware does it this way. */ + element.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH | + I40E_AQC_MACVLAN_DEL_IGNORE_VLAN; + i40e_aq_remove_macvlan(&pf->hw, vsi->seid, &element, 1, NULL); +} + +/** + * i40e_add_filter - Add a mac/vlan filter to the VSI + * @vsi: the VSI to be searched + * @macaddr: the MAC address + * @vlan: the vlan + * + * Returns ptr to the filter object or NULL when no memory available. + * + * NOTE: This function is expected to be called with mac_filter_hash_lock + * being held. 
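+ *
+ * A minimal caller sketch (taking the lock is the caller's
+ * responsibility, cf. i40e_vsi_add_vlan()):
+ *
+ *	spin_lock_bh(&vsi->mac_filter_hash_lock);
+ *	f = i40e_add_filter(vsi, macaddr, I40E_VLAN_ANY);
+ *	spin_unlock_bh(&vsi->mac_filter_hash_lock);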
+ **/ +struct i40e_mac_filter *i40e_add_filter(struct i40e_vsi *vsi, + const u8 *macaddr, s16 vlan) +{ + struct i40e_mac_filter *f; + u64 key; + + if (!vsi || !macaddr) + return NULL; + + f = i40e_find_filter(vsi, macaddr, vlan); + if (!f) { + f = kzalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return NULL; + + /* Update the boolean indicating if we need to function in + * VLAN mode. + */ + if (vlan >= 0) + vsi->has_vlan_filter = true; + + ether_addr_copy(f->macaddr, macaddr); + f->vlan = vlan; + f->state = I40E_FILTER_NEW; + INIT_HLIST_NODE(&f->hlist); + + key = i40e_addr_to_hkey(macaddr); + hash_add(vsi->mac_filter_hash, &f->hlist, key); + + vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; + set_bit(__I40E_MACVLAN_SYNC_PENDING, vsi->back->state); + } + + /* If we're asked to add a filter that has been marked for removal, it + * is safe to simply restore it to active state. __i40e_del_filter + * will have simply deleted any filters which were previously marked + * NEW or FAILED, so if it is currently marked REMOVE it must have + * previously been ACTIVE. Since we haven't yet run the sync filters + * task, just restore this filter to the ACTIVE state so that the + * sync task leaves it in place + */ + if (f->state == I40E_FILTER_REMOVE) + f->state = I40E_FILTER_ACTIVE; + + return f; +} + +/** + * __i40e_del_filter - Remove a specific filter from the VSI + * @vsi: VSI to remove from + * @f: the filter to remove from the list + * + * This function should be called instead of i40e_del_filter only if you know + * the exact filter you will remove already, such as via i40e_find_filter or + * i40e_find_mac. + * + * NOTE: This function is expected to be called with mac_filter_hash_lock + * being held. + * ANOTHER NOTE: This function MUST be called from within the context of + * the "safe" variants of any list iterators, e.g. list_for_each_entry_safe() + * instead of list_for_each_entry(). + **/ +void __i40e_del_filter(struct i40e_vsi *vsi, struct i40e_mac_filter *f) +{ + if (!f) + return; + + /* If the filter was never added to firmware then we can just delete it + * directly and we don't want to set the status to remove or else an + * admin queue command will unnecessarily fire. + */ + if ((f->state == I40E_FILTER_FAILED) || + (f->state == I40E_FILTER_NEW)) { + hash_del(&f->hlist); + kfree(f); + } else { + f->state = I40E_FILTER_REMOVE; + } + + vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; + set_bit(__I40E_MACVLAN_SYNC_PENDING, vsi->state); +} + +/** + * i40e_del_filter - Remove a MAC/VLAN filter from the VSI + * @vsi: the VSI to be searched + * @macaddr: the MAC address + * @vlan: the VLAN + * + * NOTE: This function is expected to be called with mac_filter_hash_lock + * being held. + * ANOTHER NOTE: This function MUST be called from within the context of + * the "safe" variants of any list iterators, e.g. list_for_each_entry_safe() + * instead of list_for_each_entry(). + **/ +void i40e_del_filter(struct i40e_vsi *vsi, const u8 *macaddr, s16 vlan) +{ + struct i40e_mac_filter *f; + + if (!vsi || !macaddr) + return; + + f = i40e_find_filter(vsi, macaddr, vlan); + __i40e_del_filter(vsi, f); +} + +/** + * i40e_add_mac_filter - Add a MAC filter for all active VLANs + * @vsi: the VSI to be searched + * @macaddr: the mac address to be filtered + * + * If we're not in VLAN mode, just add the filter to I40E_VLAN_ANY. Otherwise, + * go through all the macvlan filters and add a macvlan filter for each + * unique vlan that already exists. If a PVID has been assigned, instead only + * add the macaddr to that VLAN. 
+ * + * Returns last filter added on success, else NULL + **/ +struct i40e_mac_filter *i40e_add_mac_filter(struct i40e_vsi *vsi, + const u8 *macaddr) +{ + struct i40e_mac_filter *f, *add = NULL; + struct hlist_node *h; + int bkt; + + if (vsi->info.pvid) + return i40e_add_filter(vsi, macaddr, + le16_to_cpu(vsi->info.pvid)); + + if (!i40e_is_vsi_in_vlan(vsi)) + return i40e_add_filter(vsi, macaddr, I40E_VLAN_ANY); + + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + if (f->state == I40E_FILTER_REMOVE) + continue; + add = i40e_add_filter(vsi, macaddr, f->vlan); + if (!add) + return NULL; + } + + return add; +} + +/** + * i40e_del_mac_filter - Remove a MAC filter from all VLANs + * @vsi: the VSI to be searched + * @macaddr: the mac address to be removed + * + * Removes a given MAC address from a VSI regardless of what VLAN it has been + * associated with. + * + * Returns 0 for success, or error + **/ +int i40e_del_mac_filter(struct i40e_vsi *vsi, const u8 *macaddr) +{ + struct i40e_mac_filter *f; + struct hlist_node *h; + bool found = false; + int bkt; + + WARN(!spin_is_locked(&vsi->mac_filter_hash_lock), + "Missing mac_filter_hash_lock\n"); + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + if (ether_addr_equal(macaddr, f->macaddr)) { + __i40e_del_filter(vsi, f); + found = true; + } + } + + if (found) + return 0; + else + return -ENOENT; +} + +/** + * i40e_set_mac - NDO callback to set mac address + * @netdev: network interface device structure + * @p: pointer to an address structure + * + * Returns 0 on success, negative on failure + **/ +static int i40e_set_mac(struct net_device *netdev, void *p) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + struct sockaddr *addr = p; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + if (ether_addr_equal(netdev->dev_addr, addr->sa_data)) { + netdev_info(netdev, "already using mac address %pM\n", + addr->sa_data); + return 0; + } + + if (test_bit(__I40E_VSI_DOWN, vsi->back->state) || + test_bit(__I40E_RESET_RECOVERY_PENDING, vsi->back->state)) + return -EADDRNOTAVAIL; + + if (ether_addr_equal(hw->mac.addr, addr->sa_data)) + netdev_info(netdev, "returning to hw mac address %pM\n", + hw->mac.addr); + else + netdev_info(netdev, "set new mac address %pM\n", addr->sa_data); + + /* Copy the address first, so that we avoid a possible race with + * .set_rx_mode(). If we copy after changing the address in the filter + * list, we might open ourselves to a narrow race window where + * .set_rx_mode could delete our dev_addr filter and prevent traffic + * from passing. 
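+ *
+ * The filter swap below is still done under mac_filter_hash_lock,
+ * and the LAA write to firmware for the main VSI is best effort:
+ * a failure there is only logged.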
+ */ + ether_addr_copy(netdev->dev_addr, addr->sa_data); + + spin_lock_bh(&vsi->mac_filter_hash_lock); + i40e_del_mac_filter(vsi, netdev->dev_addr); + i40e_add_mac_filter(vsi, addr->sa_data); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + if (vsi->type == I40E_VSI_MAIN) { + i40e_status ret; + + ret = i40e_aq_mac_address_write(&vsi->back->hw, + I40E_AQC_WRITE_TYPE_LAA_WOL, + addr->sa_data, NULL); + if (ret) + netdev_info(netdev, "Ignoring error from firmware on LAA update, status %s, AQ ret %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + } + + /* schedule our worker thread which will take care of + * applying the new filter changes + */ + i40e_service_event_schedule(vsi->back); + return 0; +} + +/** + * i40e_config_rss_aq - Prepare for RSS using AQ commands + * @vsi: vsi structure + * @seed: RSS hash seed + **/ +static int i40e_config_rss_aq(struct i40e_vsi *vsi, const u8 *seed, + u8 *lut, u16 lut_size) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + int ret = 0; + + if (seed) { + struct i40e_aqc_get_set_rss_key_data *seed_dw = + (struct i40e_aqc_get_set_rss_key_data *)seed; + ret = i40e_aq_set_rss_key(hw, vsi->id, seed_dw); + if (ret) { + dev_info(&pf->pdev->dev, + "Cannot set RSS key, err %s aq_err %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + return ret; + } + } + if (lut) { + bool pf_lut = vsi->type == I40E_VSI_MAIN ? true : false; + + ret = i40e_aq_set_rss_lut(hw, vsi->id, pf_lut, lut, lut_size); + if (ret) { + dev_info(&pf->pdev->dev, + "Cannot set RSS lut, err %s aq_err %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + return ret; + } + } + return ret; +} + +/** + * i40e_vsi_config_rss - Prepare for VSI(VMDq) RSS if used + * @vsi: VSI structure + **/ +static int i40e_vsi_config_rss(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + u8 seed[I40E_HKEY_ARRAY_SIZE]; + u8 *lut; + int ret; + + if (!(pf->hw_features & I40E_HW_RSS_AQ_CAPABLE)) + return 0; + if (!vsi->rss_size) + vsi->rss_size = min_t(int, pf->alloc_rss_size, + vsi->num_queue_pairs); + if (!vsi->rss_size) + return -EINVAL; + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); + if (!lut) + return -ENOMEM; + + /* Use the user configured hash keys and lookup table if there is one, + * otherwise use default + */ + if (vsi->rss_lut_user) + memcpy(lut, vsi->rss_lut_user, vsi->rss_table_size); + else + i40e_fill_rss_lut(pf, lut, vsi->rss_table_size, vsi->rss_size); + if (vsi->rss_hkey_user) + memcpy(seed, vsi->rss_hkey_user, I40E_HKEY_ARRAY_SIZE); + else + netdev_rss_key_fill((void *)seed, I40E_HKEY_ARRAY_SIZE); + ret = i40e_config_rss_aq(vsi, seed, lut, vsi->rss_table_size); + kfree(lut); + return ret; +} + +/** + * i40e_vsi_setup_queue_map_mqprio - Prepares mqprio based tc_config + * @vsi: the VSI being configured, + * @ctxt: VSI context structure + * @enabled_tc: number of traffic classes to enable + * + * Prepares VSI tc_config to have queue configurations based on MQPRIO options. + **/ +static int i40e_vsi_setup_queue_map_mqprio(struct i40e_vsi *vsi, + struct i40e_vsi_context *ctxt, + u8 enabled_tc) +{ + u16 qcount = 0, max_qcount, qmap, sections = 0; + int i, override_q, pow, num_qps, ret; + u8 netdev_tc = 0, offset = 0; + + if (vsi->type != I40E_VSI_MAIN) + return -EINVAL; + sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; + sections |= I40E_AQ_VSI_PROP_SCHED_VALID; + vsi->tc_config.numtc = vsi->mqprio_qopt.qopt.num_tc; + vsi->tc_config.enabled_tc = enabled_tc ? 
enabled_tc : 1; + num_qps = vsi->mqprio_qopt.qopt.count[0]; + + /* find the next higher power-of-2 of num queue pairs */ + pow = ilog2(num_qps); + if (!is_power_of_2(num_qps)) + pow++; + qmap = (offset << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) | + (pow << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT); + + /* Setup queue offset/count for all TCs for given VSI */ + max_qcount = vsi->mqprio_qopt.qopt.count[0]; + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + /* See if the given TC is enabled for the given VSI */ + if (vsi->tc_config.enabled_tc & BIT(i)) { + offset = vsi->mqprio_qopt.qopt.offset[i]; + qcount = vsi->mqprio_qopt.qopt.count[i]; + if (qcount > max_qcount) + max_qcount = qcount; + vsi->tc_config.tc_info[i].qoffset = offset; + vsi->tc_config.tc_info[i].qcount = qcount; + vsi->tc_config.tc_info[i].netdev_tc = netdev_tc++; + } else { + /* TC is not enabled so set the offset to + * default queue and allocate one queue + * for the given TC. + */ + vsi->tc_config.tc_info[i].qoffset = 0; + vsi->tc_config.tc_info[i].qcount = 1; + vsi->tc_config.tc_info[i].netdev_tc = 0; + } + } + + /* Set actual Tx/Rx queue pairs */ + vsi->num_queue_pairs = offset + qcount; + + /* Setup queue TC[0].qmap for given VSI context */ + ctxt->info.tc_mapping[0] = cpu_to_le16(qmap); + ctxt->info.mapping_flags |= cpu_to_le16(I40E_AQ_VSI_QUE_MAP_CONTIG); + ctxt->info.queue_mapping[0] = cpu_to_le16(vsi->base_queue); + ctxt->info.valid_sections |= cpu_to_le16(sections); + + /* Reconfigure RSS for main VSI with max queue count */ + vsi->rss_size = max_qcount; + ret = i40e_vsi_config_rss(vsi); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "Failed to reconfig rss for num_queues (%u)\n", + max_qcount); + return ret; + } + vsi->reconfig_rss = true; + dev_dbg(&vsi->back->pdev->dev, + "Reconfigured rss with num_queues (%u)\n", max_qcount); + + /* Find queue count available for channel VSIs and starting offset + * for channel VSIs + */ + override_q = vsi->mqprio_qopt.qopt.count[0]; + if (override_q && override_q < vsi->num_queue_pairs) { + vsi->cnt_q_avail = vsi->num_queue_pairs - override_q; + vsi->next_base_queue = override_q; + } + return 0; +} + +/** + * i40e_vsi_setup_queue_map - Setup a VSI queue map based on enabled_tc + * @vsi: the VSI being setup + * @ctxt: VSI context structure + * @enabled_tc: Enabled TCs bitmap + * @is_add: True if called before Add VSI + * + * Setup VSI queue mapping for enabled traffic classes. + **/ +static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, + struct i40e_vsi_context *ctxt, + u8 enabled_tc, + bool is_add) +{ + struct i40e_pf *pf = vsi->back; + u16 sections = 0; + u8 netdev_tc = 0; + u16 numtc = 1; + u16 qcount; + u8 offset; + u16 qmap; + int i; + u16 num_tc_qps = 0; + + sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; + offset = 0; + + /* Number of queues per enabled TC */ + num_tc_qps = vsi->alloc_queue_pairs; + if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) { + /* Find numtc from enabled TC bitmap */ + for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + if (enabled_tc & BIT(i)) /* TC is enabled */ + numtc++; + } + if (!numtc) { + dev_warn(&pf->pdev->dev, "DCB is enabled but no TC enabled, forcing TC0\n"); + numtc = 1; + } + num_tc_qps = num_tc_qps / numtc; + num_tc_qps = min_t(int, num_tc_qps, + i40e_pf_get_max_q_per_tc(pf)); + } + + vsi->tc_config.numtc = numtc; + vsi->tc_config.enabled_tc = enabled_tc ? 
enabled_tc : 1; + + /* Do not allow use more TC queue pairs than MSI-X vectors exist */ + if (pf->flags & I40E_FLAG_MSIX_ENABLED) + num_tc_qps = min_t(int, num_tc_qps, pf->num_lan_msix); + + /* Setup queue offset/count for all TCs for given VSI */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + /* See if the given TC is enabled for the given VSI */ + if (vsi->tc_config.enabled_tc & BIT(i)) { + /* TC is enabled */ + int pow, num_qps; + + switch (vsi->type) { + case I40E_VSI_MAIN: + if (!(pf->flags & (I40E_FLAG_FD_SB_ENABLED | + I40E_FLAG_FD_ATR_ENABLED)) || + vsi->tc_config.enabled_tc != 1) { + qcount = min_t(int, pf->alloc_rss_size, + num_tc_qps); + break; + } + case I40E_VSI_FDIR: + case I40E_VSI_SRIOV: + case I40E_VSI_VMDQ2: + default: + qcount = num_tc_qps; + WARN_ON(i != 0); + break; + } + vsi->tc_config.tc_info[i].qoffset = offset; + vsi->tc_config.tc_info[i].qcount = qcount; + + /* find the next higher power-of-2 of num queue pairs */ + num_qps = qcount; + pow = 0; + while (num_qps && (BIT_ULL(pow) < qcount)) { + pow++; + num_qps >>= 1; + } + + vsi->tc_config.tc_info[i].netdev_tc = netdev_tc++; + qmap = + (offset << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) | + (pow << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT); + + offset += qcount; + } else { + /* TC is not enabled so set the offset to + * default queue and allocate one queue + * for the given TC. + */ + vsi->tc_config.tc_info[i].qoffset = 0; + vsi->tc_config.tc_info[i].qcount = 1; + vsi->tc_config.tc_info[i].netdev_tc = 0; + + qmap = 0; + } + ctxt->info.tc_mapping[i] = cpu_to_le16(qmap); + } + + /* Set actual Tx/Rx queue pairs */ + vsi->num_queue_pairs = offset; + if ((vsi->type == I40E_VSI_MAIN) && (numtc == 1)) { + if (vsi->req_queue_pairs > 0) + vsi->num_queue_pairs = vsi->req_queue_pairs; + else if (pf->flags & I40E_FLAG_MSIX_ENABLED) + vsi->num_queue_pairs = pf->num_lan_msix; + } + + /* Scheduler section valid can only be set for ADD VSI */ + if (is_add) { + sections |= I40E_AQ_VSI_PROP_SCHED_VALID; + + ctxt->info.up_enable_bits = enabled_tc; + } + if (vsi->type == I40E_VSI_SRIOV) { + ctxt->info.mapping_flags |= + cpu_to_le16(I40E_AQ_VSI_QUE_MAP_NONCONTIG); + for (i = 0; i < vsi->num_queue_pairs; i++) + ctxt->info.queue_mapping[i] = + cpu_to_le16(vsi->base_queue + i); + } else { + ctxt->info.mapping_flags |= + cpu_to_le16(I40E_AQ_VSI_QUE_MAP_CONTIG); + ctxt->info.queue_mapping[0] = cpu_to_le16(vsi->base_queue); + } + ctxt->info.valid_sections |= cpu_to_le16(sections); +} + +/** + * i40e_addr_sync - Callback for dev_(mc|uc)_sync to add address + * @netdev: the netdevice + * @addr: address to add + * + * Called by __dev_(mc|uc)_sync when an address needs to be added. We call + * __dev_(uc|mc)_sync from .set_rx_mode and guarantee to hold the hash lock. + */ +static int i40e_addr_sync(struct net_device *netdev, const u8 *addr) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + + if (i40e_add_mac_filter(vsi, addr)) + return 0; + else + return -ENOMEM; +} + +/** + * i40e_addr_unsync - Callback for dev_(mc|uc)_sync to remove address + * @netdev: the netdevice + * @addr: address to add + * + * Called by __dev_(mc|uc)_sync when an address needs to be removed. We call + * __dev_(uc|mc)_sync from .set_rx_mode and guarantee to hold the hash lock. 
+ */ +static int i40e_addr_unsync(struct net_device *netdev, const u8 *addr) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + + /* Under some circumstances, we might receive a request to delete + * our own device address from our uc list. Because we store the + * device address in the VSI's MAC/VLAN filter list, we need to ignore + * such requests and not delete our device address from this list. + */ + if (ether_addr_equal(addr, netdev->dev_addr)) + return 0; + + i40e_del_mac_filter(vsi, addr); + + return 0; +} + +/** + * i40e_set_rx_mode - NDO callback to set the netdev filters + * @netdev: network interface device structure + **/ +static void i40e_set_rx_mode(struct net_device *netdev) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + + spin_lock_bh(&vsi->mac_filter_hash_lock); + + __dev_uc_sync(netdev, i40e_addr_sync, i40e_addr_unsync); + __dev_mc_sync(netdev, i40e_addr_sync, i40e_addr_unsync); + + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + /* check for other flag changes */ + if (vsi->current_netdev_flags != vsi->netdev->flags) { + vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; + set_bit(__I40E_MACVLAN_SYNC_PENDING, vsi->back->state); + } +} + +/** + * i40e_undo_del_filter_entries - Undo the changes made to MAC filter entries + * @vsi: Pointer to VSI struct + * @from: Pointer to list which contains MAC filter entries - changes to + * those entries needs to be undone. + * + * MAC filter entries from this list were slated for deletion. + **/ +static void i40e_undo_del_filter_entries(struct i40e_vsi *vsi, + struct hlist_head *from) +{ + struct i40e_mac_filter *f; + struct hlist_node *h; + + hlist_for_each_entry_safe(f, h, from, hlist) { + u64 key = i40e_addr_to_hkey(f->macaddr); + + /* Move the element back into MAC filter list*/ + hlist_del(&f->hlist); + hash_add(vsi->mac_filter_hash, &f->hlist, key); + } +} + +/** + * i40e_undo_add_filter_entries - Undo the changes made to MAC filter entries + * @vsi: Pointer to vsi struct + * @from: Pointer to list which contains MAC filter entries - changes to + * those entries needs to be undone. + * + * MAC filter entries from this list were slated for addition. + **/ +static void i40e_undo_add_filter_entries(struct i40e_vsi *vsi, + struct hlist_head *from) +{ + struct i40e_new_mac_filter *new; + struct hlist_node *h; + + hlist_for_each_entry_safe(new, h, from, hlist) { + /* We can simply free the wrapper structure */ + hlist_del(&new->hlist); + kfree(new); + } +} + +/** + * i40e_next_entry - Get the next non-broadcast filter from a list + * @next: pointer to filter in list + * + * Returns the next non-broadcast filter in the list. Required so that we + * ignore broadcast filters within the list, since these are not handled via + * the normal firmware update path. + */ +static +struct i40e_new_mac_filter *i40e_next_filter(struct i40e_new_mac_filter *next) +{ + hlist_for_each_entry_continue(next, hlist) { + if (!is_broadcast_ether_addr(next->f->macaddr)) + return next; + } + + return NULL; +} + +/** + * i40e_update_filter_state - Update filter state based on return data + * from firmware + * @count: Number of filters added + * @add_list: return data from fw + * @head: pointer to first filter in current batch + * + * MAC filter entries from list were slated to be added to device. Returns + * number of successful filters. Note that 0 does NOT mean success! 
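+ *
+ * Callers are expected to compare the return value with the number of
+ * filters they submitted (see i40e_aqc_add_filters()); a short count
+ * means some entries still carried I40E_AQC_MM_ERR_NO_RES and were
+ * marked I40E_FILTER_FAILED.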
+ **/ +static int +i40e_update_filter_state(int count, + struct i40e_aqc_add_macvlan_element_data *add_list, + struct i40e_new_mac_filter *add_head) +{ + int retval = 0; + int i; + + for (i = 0; i < count; i++) { + /* Always check status of each filter. We don't need to check + * the firmware return status because we pre-set the filter + * status to I40E_AQC_MM_ERR_NO_RES when sending the filter + * request to the adminq. Thus, if it no longer matches then + * we know the filter is active. + */ + if (add_list[i].match_method == I40E_AQC_MM_ERR_NO_RES) { + add_head->state = I40E_FILTER_FAILED; + } else { + add_head->state = I40E_FILTER_ACTIVE; + retval++; + } + + add_head = i40e_next_filter(add_head); + if (!add_head) + break; + } + + return retval; +} + +/** + * i40e_aqc_del_filters - Request firmware to delete a set of filters + * @vsi: ptr to the VSI + * @vsi_name: name to display in messages + * @list: the list of filters to send to firmware + * @num_del: the number of filters to delete + * @retval: Set to -EIO on failure to delete + * + * Send a request to firmware via AdminQ to delete a set of filters. Uses + * *retval instead of a return value so that success does not force ret_val to + * be set to 0. This ensures that a sequence of calls to this function + * preserve the previous value of *retval on successful delete. + */ +static +void i40e_aqc_del_filters(struct i40e_vsi *vsi, const char *vsi_name, + struct i40e_aqc_remove_macvlan_element_data *list, + int num_del, int *retval) +{ + struct i40e_hw *hw = &vsi->back->hw; + i40e_status aq_ret; + int aq_err; + + aq_ret = i40e_aq_remove_macvlan(hw, vsi->seid, list, num_del, NULL); + aq_err = hw->aq.asq_last_status; + + /* Explicitly ignore and do not report when firmware returns ENOENT */ + if (aq_ret && !(aq_err == I40E_AQ_RC_ENOENT)) { + *retval = -EIO; + dev_info(&vsi->back->pdev->dev, + "ignoring delete macvlan error on %s, err %s, aq_err %s\n", + vsi_name, i40e_stat_str(hw, aq_ret), + i40e_aq_str(hw, aq_err)); + } +} + +/** + * i40e_aqc_add_filters - Request firmware to add a set of filters + * @vsi: ptr to the VSI + * @vsi_name: name to display in messages + * @list: the list of filters to send to firmware + * @add_head: Position in the add hlist + * @num_add: the number of filters to add + * + * Send a request to firmware via AdminQ to add a chunk of filters. Will set + * __I40E_VSI_OVERFLOW_PROMISC bit in vsi->state if the firmware has run out of + * space for more filters. + */ +static +void i40e_aqc_add_filters(struct i40e_vsi *vsi, const char *vsi_name, + struct i40e_aqc_add_macvlan_element_data *list, + struct i40e_new_mac_filter *add_head, + int num_add) +{ + struct i40e_hw *hw = &vsi->back->hw; + int aq_err, fcnt; + + i40e_aq_add_macvlan(hw, vsi->seid, list, num_add, NULL); + aq_err = hw->aq.asq_last_status; + fcnt = i40e_update_filter_state(num_add, list, add_head); + + if (fcnt != num_add) { + set_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); + dev_warn(&vsi->back->pdev->dev, + "Error %s adding RX filters on %s, promiscuous mode forced on\n", + i40e_aq_str(hw, aq_err), + vsi_name); + } +} + +/** + * i40e_aqc_broadcast_filter - Set promiscuous broadcast flags + * @vsi: pointer to the VSI + * @f: filter data + * + * This function sets or clears the promiscuous broadcast flags for VLAN + * filters in order to properly receive broadcast frames. Assumes that only + * broadcast filters are passed. 
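+ * Note that on any AQ error this helper also forces the VSI into
+ * overflow promiscuous mode (__I40E_VSI_OVERFLOW_PROMISC).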
+ * + * Returns status indicating success or failure; + **/ +static i40e_status +i40e_aqc_broadcast_filter(struct i40e_vsi *vsi, const char *vsi_name, + struct i40e_mac_filter *f) +{ + bool enable = f->state == I40E_FILTER_NEW; + struct i40e_hw *hw = &vsi->back->hw; + i40e_status aq_ret; + + if (f->vlan == I40E_VLAN_ANY) { + aq_ret = i40e_aq_set_vsi_broadcast(hw, + vsi->seid, + enable, + NULL); + } else { + aq_ret = i40e_aq_set_vsi_bc_promisc_on_vlan(hw, + vsi->seid, + enable, + f->vlan, + NULL); + } + + if (aq_ret) { + set_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); + dev_warn(&vsi->back->pdev->dev, + "Error %s, forcing overflow promiscuous on %s\n", + i40e_aq_str(hw, hw->aq.asq_last_status), + vsi_name); + } + + return aq_ret; +} + +/** + * i40e_set_promiscuous - set promiscuous mode + * @pf: board private structure + * @promisc: promisc on or off + * + * There are different ways of setting promiscuous mode on a PF depending on + * what state/environment we're in. This identifies and sets it appropriately. + * Returns 0 on success. + **/ +static int i40e_set_promiscuous(struct i40e_pf *pf, bool promisc) +{ + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + struct i40e_hw *hw = &pf->hw; + i40e_status aq_ret; + + if (vsi->type == I40E_VSI_MAIN && + pf->lan_veb != I40E_NO_VEB && + !(pf->flags & I40E_FLAG_MFP_ENABLED)) { + /* set defport ON for Main VSI instead of true promisc + * this way we will get all unicast/multicast and VLAN + * promisc behavior but will not get VF or VMDq traffic + * replicated on the Main VSI. + */ + if (promisc) + aq_ret = i40e_aq_set_default_vsi(hw, + vsi->seid, + NULL); + else + aq_ret = i40e_aq_clear_default_vsi(hw, + vsi->seid, + NULL); + if (aq_ret) { + dev_info(&pf->pdev->dev, + "Set default VSI failed, err %s, aq_err %s\n", + i40e_stat_str(hw, aq_ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + } + } else { + aq_ret = i40e_aq_set_vsi_unicast_promiscuous( + hw, + vsi->seid, + promisc, NULL, + true); + if (aq_ret) { + dev_info(&pf->pdev->dev, + "set unicast promisc failed, err %s, aq_err %s\n", + i40e_stat_str(hw, aq_ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + } + aq_ret = i40e_aq_set_vsi_multicast_promiscuous( + hw, + vsi->seid, + promisc, NULL); + if (aq_ret) { + dev_info(&pf->pdev->dev, + "set multicast promisc failed, err %s, aq_err %s\n", + i40e_stat_str(hw, aq_ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + } + } + + if (!aq_ret) + pf->cur_promisc = promisc; + + return aq_ret; +} + +/** + * i40e_sync_vsi_filters - Update the VSI filter list to the HW + * @vsi: ptr to the VSI + * + * Push any outstanding VSI filter changes through the AdminQ. 
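+ *
+ * Roughly: filters marked for change are moved onto temporary
+ * add/delete lists under mac_filter_hash_lock, the deletes and then
+ * the adds are pushed to firmware in asq_buf_size-sized chunks, and
+ * the promiscuous/overflow-promiscuous state is reconciled with the
+ * result.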
+ * + * Returns 0 or error value + **/ +int i40e_sync_vsi_filters(struct i40e_vsi *vsi) +{ + struct hlist_head tmp_add_list, tmp_del_list; + struct i40e_mac_filter *f; + struct i40e_new_mac_filter *new, *add_head = NULL; + struct i40e_hw *hw = &vsi->back->hw; + bool old_overflow, new_overflow; + unsigned int failed_filters = 0; + unsigned int vlan_filters = 0; + char vsi_name[16] = "PF"; + int filter_list_len = 0; + i40e_status aq_ret = 0; + u32 changed_flags = 0; + struct hlist_node *h; + struct i40e_pf *pf; + int num_add = 0; + int num_del = 0; + int retval = 0; + u16 cmd_flags; + int list_size; + int bkt; + + /* empty array typed pointers, kcalloc later */ + struct i40e_aqc_add_macvlan_element_data *add_list; + struct i40e_aqc_remove_macvlan_element_data *del_list; + + while (test_and_set_bit(__I40E_VSI_SYNCING_FILTERS, vsi->state)) + usleep_range(1000, 2000); + pf = vsi->back; + + old_overflow = test_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); + + if (vsi->netdev) { + changed_flags = vsi->current_netdev_flags ^ vsi->netdev->flags; + vsi->current_netdev_flags = vsi->netdev->flags; + } + + INIT_HLIST_HEAD(&tmp_add_list); + INIT_HLIST_HEAD(&tmp_del_list); + + if (vsi->type == I40E_VSI_SRIOV) + snprintf(vsi_name, sizeof(vsi_name) - 1, "VF %d", vsi->vf_id); + else if (vsi->type != I40E_VSI_MAIN) + snprintf(vsi_name, sizeof(vsi_name) - 1, "vsi %d", vsi->seid); + + if (vsi->flags & I40E_VSI_FLAG_FILTER_CHANGED) { + vsi->flags &= ~I40E_VSI_FLAG_FILTER_CHANGED; + + spin_lock_bh(&vsi->mac_filter_hash_lock); + /* Create a list of filters to delete. */ + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + if (f->state == I40E_FILTER_REMOVE) { + /* Move the element into temporary del_list */ + hash_del(&f->hlist); + hlist_add_head(&f->hlist, &tmp_del_list); + + /* Avoid counting removed filters */ + continue; + } + if (f->state == I40E_FILTER_NEW) { + /* Create a temporary i40e_new_mac_filter */ + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + goto err_no_memory_locked; + + /* Store pointer to the real filter */ + new->f = f; + new->state = f->state; + + /* Add it to the hash list */ + hlist_add_head(&new->hlist, &tmp_add_list); + } + + /* Count the number of active (current and new) VLAN + * filters we have now. Does not count filters which + * are marked for deletion. + */ + if (f->vlan > 0) + vlan_filters++; + } + + retval = i40e_correct_mac_vlan_filters(vsi, + &tmp_add_list, + &tmp_del_list, + vlan_filters); + if (retval) + goto err_no_memory_locked; + + spin_unlock_bh(&vsi->mac_filter_hash_lock); + } + + /* Now process 'del_list' outside the lock */ + if (!hlist_empty(&tmp_del_list)) { + filter_list_len = hw->aq.asq_buf_size / + sizeof(struct i40e_aqc_remove_macvlan_element_data); + list_size = filter_list_len * + sizeof(struct i40e_aqc_remove_macvlan_element_data); + del_list = kzalloc(list_size, GFP_ATOMIC); + if (!del_list) + goto err_no_memory; + + hlist_for_each_entry_safe(f, h, &tmp_del_list, hlist) { + cmd_flags = 0; + + /* handle broadcast filters by updating the broadcast + * promiscuous flag and release filter list. 
+ */ + if (is_broadcast_ether_addr(f->macaddr)) { + i40e_aqc_broadcast_filter(vsi, vsi_name, f); + + hlist_del(&f->hlist); + kfree(f); + continue; + } + + /* add to delete list */ + ether_addr_copy(del_list[num_del].mac_addr, f->macaddr); + if (f->vlan == I40E_VLAN_ANY) { + del_list[num_del].vlan_tag = 0; + cmd_flags |= I40E_AQC_MACVLAN_DEL_IGNORE_VLAN; + } else { + del_list[num_del].vlan_tag = + cpu_to_le16((u16)(f->vlan)); + } + + cmd_flags |= I40E_AQC_MACVLAN_DEL_PERFECT_MATCH; + del_list[num_del].flags = cmd_flags; + num_del++; + + /* flush a full buffer */ + if (num_del == filter_list_len) { + i40e_aqc_del_filters(vsi, vsi_name, del_list, + num_del, &retval); + memset(del_list, 0, list_size); + num_del = 0; + } + /* Release memory for MAC filter entries which were + * synced up with HW. + */ + hlist_del(&f->hlist); + kfree(f); + } + + if (num_del) { + i40e_aqc_del_filters(vsi, vsi_name, del_list, + num_del, &retval); + } + + kfree(del_list); + del_list = NULL; + } + + if (!hlist_empty(&tmp_add_list)) { + /* Do all the adds now. */ + filter_list_len = hw->aq.asq_buf_size / + sizeof(struct i40e_aqc_add_macvlan_element_data); + list_size = filter_list_len * + sizeof(struct i40e_aqc_add_macvlan_element_data); + add_list = kzalloc(list_size, GFP_ATOMIC); + if (!add_list) + goto err_no_memory; + + num_add = 0; + hlist_for_each_entry_safe(new, h, &tmp_add_list, hlist) { + /* handle broadcast filters by updating the broadcast + * promiscuous flag instead of adding a MAC filter. + */ + if (is_broadcast_ether_addr(new->f->macaddr)) { + if (i40e_aqc_broadcast_filter(vsi, vsi_name, + new->f)) + new->state = I40E_FILTER_FAILED; + else + new->state = I40E_FILTER_ACTIVE; + continue; + } + + /* add to add array */ + if (num_add == 0) + add_head = new; + cmd_flags = 0; + ether_addr_copy(add_list[num_add].mac_addr, + new->f->macaddr); + if (new->f->vlan == I40E_VLAN_ANY) { + add_list[num_add].vlan_tag = 0; + cmd_flags |= I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; + } else { + add_list[num_add].vlan_tag = + cpu_to_le16((u16)(new->f->vlan)); + } + add_list[num_add].queue_number = 0; + /* set invalid match method for later detection */ + add_list[num_add].match_method = I40E_AQC_MM_ERR_NO_RES; + cmd_flags |= I40E_AQC_MACVLAN_ADD_PERFECT_MATCH; + add_list[num_add].flags = cpu_to_le16(cmd_flags); + num_add++; + + /* flush a full buffer */ + if (num_add == filter_list_len) { + i40e_aqc_add_filters(vsi, vsi_name, add_list, + add_head, num_add); + memset(add_list, 0, list_size); + num_add = 0; + } + } + if (num_add) { + i40e_aqc_add_filters(vsi, vsi_name, add_list, add_head, + num_add); + } + /* Now move all of the filters from the temp add list back to + * the VSI's list. + */ + spin_lock_bh(&vsi->mac_filter_hash_lock); + hlist_for_each_entry_safe(new, h, &tmp_add_list, hlist) { + /* Only update the state if we're still NEW */ + if (new->f->state == I40E_FILTER_NEW) + new->f->state = new->state; + hlist_del(&new->hlist); + kfree(new); + } + spin_unlock_bh(&vsi->mac_filter_hash_lock); + kfree(add_list); + add_list = NULL; + } + + /* Determine the number of active and failed filters. */ + spin_lock_bh(&vsi->mac_filter_hash_lock); + vsi->active_filters = 0; + hash_for_each(vsi->mac_filter_hash, bkt, f, hlist) { + if (f->state == I40E_FILTER_ACTIVE) + vsi->active_filters++; + else if (f->state == I40E_FILTER_FAILED) + failed_filters++; + } + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + /* Check if we are able to exit overflow promiscuous mode. 
We can + * safely exit if we didn't just enter, we no longer have any failed + * filters, and we have reduced filters below the threshold value. + */ + if (old_overflow && !failed_filters && + vsi->active_filters < vsi->promisc_threshold) { + dev_info(&pf->pdev->dev, + "filter logjam cleared on %s, leaving overflow promiscuous mode\n", + vsi_name); + clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); + vsi->promisc_threshold = 0; + } + + /* if the VF is not trusted do not do promisc */ + if ((vsi->type == I40E_VSI_SRIOV) && !pf->vf[vsi->vf_id].trusted) { + clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); + goto out; + } + + new_overflow = test_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); + + /* If we are entering overflow promiscuous, we need to calculate a new + * threshold for when we are safe to exit + */ + if (!old_overflow && new_overflow) + vsi->promisc_threshold = (vsi->active_filters * 3) / 4; + + /* check for changes in promiscuous modes */ + if (changed_flags & IFF_ALLMULTI) { + bool cur_multipromisc; + + cur_multipromisc = !!(vsi->current_netdev_flags & IFF_ALLMULTI); + aq_ret = i40e_aq_set_vsi_multicast_promiscuous(&vsi->back->hw, + vsi->seid, + cur_multipromisc, + NULL); + if (aq_ret) { + retval = i40e_aq_rc_to_posix(aq_ret, + hw->aq.asq_last_status); + dev_info(&pf->pdev->dev, + "set multi promisc failed on %s, err %s aq_err %s\n", + vsi_name, + i40e_stat_str(hw, aq_ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + } + } + + if ((changed_flags & IFF_PROMISC) || old_overflow != new_overflow) { + bool cur_promisc; + + cur_promisc = (!!(vsi->current_netdev_flags & IFF_PROMISC) || + new_overflow); + aq_ret = i40e_set_promiscuous(pf, cur_promisc); + if (aq_ret) { + retval = i40e_aq_rc_to_posix(aq_ret, + hw->aq.asq_last_status); + dev_info(&pf->pdev->dev, + "Setting promiscuous %s failed on %s, err %s aq_err %s\n", + cur_promisc ? 
"on" : "off", + vsi_name, + i40e_stat_str(hw, aq_ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + } + } +out: + /* if something went wrong then set the changed flag so we try again */ + if (retval) + vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; + + clear_bit(__I40E_VSI_SYNCING_FILTERS, vsi->state); + return retval; + +err_no_memory: + /* Restore elements on the temporary add and delete lists */ + spin_lock_bh(&vsi->mac_filter_hash_lock); +err_no_memory_locked: + i40e_undo_del_filter_entries(vsi, &tmp_del_list); + i40e_undo_add_filter_entries(vsi, &tmp_add_list); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; + clear_bit(__I40E_VSI_SYNCING_FILTERS, vsi->state); + return -ENOMEM; +} + +/** + * i40e_sync_filters_subtask - Sync the VSI filter list with HW + * @pf: board private structure + **/ +static void i40e_sync_filters_subtask(struct i40e_pf *pf) +{ + int v; + + if (!pf) + return; + if (!test_and_clear_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state)) + return; + + for (v = 0; v < pf->num_alloc_vsi; v++) { + if (pf->vsi[v] && + (pf->vsi[v]->flags & I40E_VSI_FLAG_FILTER_CHANGED)) { + int ret = i40e_sync_vsi_filters(pf->vsi[v]); + + if (ret) { + /* come back and try again later */ + set_bit(__I40E_MACVLAN_SYNC_PENDING, + pf->state); + break; + } + } + } +} + +/** + * i40e_max_xdp_frame_size - returns the maximum allowed frame size for XDP + * @vsi: the vsi + **/ +static int i40e_max_xdp_frame_size(struct i40e_vsi *vsi) +{ + if (PAGE_SIZE >= 8192 || (vsi->back->flags & I40E_FLAG_LEGACY_RX)) + return I40E_RXBUFFER_2048; + else + return I40E_RXBUFFER_3072; +} + +/** + * i40e_change_mtu - NDO callback to change the Maximum Transfer Unit + * @netdev: network interface device structure + * @new_mtu: new value for maximum frame size + * + * Returns 0 on success, negative on failure + **/ +static int i40e_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + + if (i40e_enabled_xdp_vsi(vsi)) { + int frame_size = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; + + if (frame_size > i40e_max_xdp_frame_size(vsi)) + return -EINVAL; + } + + netdev_info(netdev, "changing MTU from %d to %d\n", + netdev->mtu, new_mtu); + netdev->mtu = new_mtu; + if (netif_running(netdev)) + i40e_vsi_reinit_locked(vsi); + set_bit(__I40E_CLIENT_SERVICE_REQUESTED, pf->state); + set_bit(__I40E_CLIENT_L2_CHANGE, pf->state); + return 0; +} + +/** + * i40e_ioctl - Access the hwtstamp interface + * @netdev: network interface device structure + * @ifr: interface request data + * @cmd: ioctl command + **/ +int i40e_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + + switch (cmd) { + case SIOCGHWTSTAMP: + return i40e_ptp_get_ts_config(pf, ifr); + case SIOCSHWTSTAMP: + return i40e_ptp_set_ts_config(pf, ifr); + default: + return -EOPNOTSUPP; + } +} + +/** + * i40e_vlan_stripping_enable - Turn on vlan stripping for the VSI + * @vsi: the vsi being adjusted + **/ +void i40e_vlan_stripping_enable(struct i40e_vsi *vsi) +{ + struct i40e_vsi_context ctxt; + i40e_status ret; + + if ((vsi->info.valid_sections & + cpu_to_le16(I40E_AQ_VSI_PROP_VLAN_VALID)) && + ((vsi->info.port_vlan_flags & I40E_AQ_VSI_PVLAN_MODE_MASK) == 0)) + return; /* already enabled */ + + vsi->info.valid_sections = cpu_to_le16(I40E_AQ_VSI_PROP_VLAN_VALID); + vsi->info.port_vlan_flags = 
I40E_AQ_VSI_PVLAN_MODE_ALL | + I40E_AQ_VSI_PVLAN_EMOD_STR_BOTH; + + ctxt.seid = vsi->seid; + ctxt.info = vsi->info; + ret = i40e_aq_update_vsi_params(&vsi->back->hw, &ctxt, NULL); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "update vlan stripping failed, err %s aq_err %s\n", + i40e_stat_str(&vsi->back->hw, ret), + i40e_aq_str(&vsi->back->hw, + vsi->back->hw.aq.asq_last_status)); + } +} + +/** + * i40e_vlan_stripping_disable - Turn off vlan stripping for the VSI + * @vsi: the vsi being adjusted + **/ +void i40e_vlan_stripping_disable(struct i40e_vsi *vsi) +{ + struct i40e_vsi_context ctxt; + i40e_status ret; + + if ((vsi->info.valid_sections & + cpu_to_le16(I40E_AQ_VSI_PROP_VLAN_VALID)) && + ((vsi->info.port_vlan_flags & I40E_AQ_VSI_PVLAN_EMOD_MASK) == + I40E_AQ_VSI_PVLAN_EMOD_MASK)) + return; /* already disabled */ + + vsi->info.valid_sections = cpu_to_le16(I40E_AQ_VSI_PROP_VLAN_VALID); + vsi->info.port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | + I40E_AQ_VSI_PVLAN_EMOD_NOTHING; + + ctxt.seid = vsi->seid; + ctxt.info = vsi->info; + ret = i40e_aq_update_vsi_params(&vsi->back->hw, &ctxt, NULL); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "update vlan stripping failed, err %s aq_err %s\n", + i40e_stat_str(&vsi->back->hw, ret), + i40e_aq_str(&vsi->back->hw, + vsi->back->hw.aq.asq_last_status)); + } +} + +/** + * i40e_add_vlan_all_mac - Add a MAC/VLAN filter for each existing MAC address + * @vsi: the vsi being configured + * @vid: vlan id to be added (0 = untagged only , -1 = any) + * + * This is a helper function for adding a new MAC/VLAN filter with the + * specified VLAN for each existing MAC address already in the hash table. + * This function does *not* perform any accounting to update filters based on + * VLAN mode. + * + * NOTE: this function expects to be called while under the + * mac_filter_hash_lock + **/ +int i40e_add_vlan_all_mac(struct i40e_vsi *vsi, s16 vid) +{ + struct i40e_mac_filter *f, *add_f; + struct hlist_node *h; + int bkt; + + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + if (f->state == I40E_FILTER_REMOVE) + continue; + add_f = i40e_add_filter(vsi, f->macaddr, vid); + if (!add_f) { + dev_info(&vsi->back->pdev->dev, + "Could not add vlan filter %d for %pM\n", + vid, f->macaddr); + return -ENOMEM; + } + } + + return 0; +} + +/** + * i40e_vsi_add_vlan - Add VSI membership for given VLAN + * @vsi: the VSI being configured + * @vid: VLAN id to be added + **/ +int i40e_vsi_add_vlan(struct i40e_vsi *vsi, u16 vid) +{ + int err; + + if (vsi->info.pvid) + return -EINVAL; + + /* The network stack will attempt to add VID=0, with the intention to + * receive priority tagged packets with a VLAN of 0. Our HW receives + * these packets by default when configured to receive untagged + * packets, so we don't need to add a filter for this case. + * Additionally, HW interprets adding a VID=0 filter as meaning to + * receive *only* tagged traffic and stops receiving untagged traffic. 
+ * Thus, we do not want to actually add a filter for VID=0 + */ + if (!vid) + return 0; + + /* Locked once because all functions invoked below iterates list*/ + spin_lock_bh(&vsi->mac_filter_hash_lock); + err = i40e_add_vlan_all_mac(vsi, vid); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + if (err) + return err; + + /* schedule our worker thread which will take care of + * applying the new filter changes + */ + i40e_service_event_schedule(vsi->back); + return 0; +} + +/** + * i40e_rm_vlan_all_mac - Remove MAC/VLAN pair for all MAC with the given VLAN + * @vsi: the vsi being configured + * @vid: vlan id to be removed (0 = untagged only , -1 = any) + * + * This function should be used to remove all VLAN filters which match the + * given VID. It does not schedule the service event and does not take the + * mac_filter_hash_lock so it may be combined with other operations under + * a single invocation of the mac_filter_hash_lock. + * + * NOTE: this function expects to be called while under the + * mac_filter_hash_lock + */ +void i40e_rm_vlan_all_mac(struct i40e_vsi *vsi, s16 vid) +{ + struct i40e_mac_filter *f; + struct hlist_node *h; + int bkt; + + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + if (f->vlan == vid) + __i40e_del_filter(vsi, f); + } +} + +/** + * i40e_vsi_kill_vlan - Remove VSI membership for given VLAN + * @vsi: the VSI being configured + * @vid: VLAN id to be removed + **/ +void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, u16 vid) +{ + if (!vid || vsi->info.pvid) + return; + + spin_lock_bh(&vsi->mac_filter_hash_lock); + i40e_rm_vlan_all_mac(vsi, vid); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + /* schedule our worker thread which will take care of + * applying the new filter changes + */ + i40e_service_event_schedule(vsi->back); +} + +/** + * i40e_vlan_rx_add_vid - Add a vlan id filter to HW offload + * @netdev: network interface to be adjusted + * @vid: vlan id to be added + * + * net_device_ops implementation for adding vlan ids + **/ +static int i40e_vlan_rx_add_vid(struct net_device *netdev, + __always_unused __be16 proto, u16 vid) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + int ret = 0; + + if (vid >= VLAN_N_VID) + return -EINVAL; + + ret = i40e_vsi_add_vlan(vsi, vid); + if (!ret) + set_bit(vid, vsi->active_vlans); + + return ret; +} + +/** + * i40e_vlan_rx_kill_vid - Remove a vlan id filter from HW offload + * @netdev: network interface to be adjusted + * @vid: vlan id to be removed + * + * net_device_ops implementation for removing vlan ids + **/ +static int i40e_vlan_rx_kill_vid(struct net_device *netdev, + __always_unused __be16 proto, u16 vid) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + + /* return code is ignored as there is nothing a user + * can do about failure to remove and a log message was + * already printed from the other function + */ + i40e_vsi_kill_vlan(vsi, vid); + + clear_bit(vid, vsi->active_vlans); + + return 0; +} + +/** + * i40e_restore_vlan - Reinstate vlans when vsi/netdev comes back up + * @vsi: the vsi being brought back up + **/ +static void i40e_restore_vlan(struct i40e_vsi *vsi) +{ + u16 vid; + + if (!vsi->netdev) + return; + + if (vsi->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) + i40e_vlan_stripping_enable(vsi); + else + i40e_vlan_stripping_disable(vsi); + + for_each_set_bit(vid, vsi->active_vlans, VLAN_N_VID) + i40e_vlan_rx_add_vid(vsi->netdev, htons(ETH_P_8021Q), + vid); +} + +/** + * i40e_vsi_add_pvid - Add pvid for the VSI 
+ * @vsi: the vsi being adjusted + * @vid: the vlan id to set as a PVID + **/ +int i40e_vsi_add_pvid(struct i40e_vsi *vsi, u16 vid) +{ + struct i40e_vsi_context ctxt; + i40e_status ret; + + vsi->info.valid_sections = cpu_to_le16(I40E_AQ_VSI_PROP_VLAN_VALID); + vsi->info.pvid = cpu_to_le16(vid); + vsi->info.port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_TAGGED | + I40E_AQ_VSI_PVLAN_INSERT_PVID | + I40E_AQ_VSI_PVLAN_EMOD_STR; + + ctxt.seid = vsi->seid; + ctxt.info = vsi->info; + ret = i40e_aq_update_vsi_params(&vsi->back->hw, &ctxt, NULL); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "add pvid failed, err %s aq_err %s\n", + i40e_stat_str(&vsi->back->hw, ret), + i40e_aq_str(&vsi->back->hw, + vsi->back->hw.aq.asq_last_status)); + return -ENOENT; + } + + return 0; +} + +/** + * i40e_vsi_remove_pvid - Remove the pvid from the VSI + * @vsi: the vsi being adjusted + * + * Just use the vlan_rx_register() service to put it back to normal + **/ +void i40e_vsi_remove_pvid(struct i40e_vsi *vsi) +{ + i40e_vlan_stripping_disable(vsi); + + vsi->info.pvid = 0; +} + +/** + * i40e_vsi_setup_tx_resources - Allocate VSI Tx queue resources + * @vsi: ptr to the VSI + * + * If this function returns with an error, then it's possible one or + * more of the rings is populated (while the rest are not). It is the + * callers duty to clean those orphaned rings. + * + * Return 0 on success, negative on failure + **/ +static int i40e_vsi_setup_tx_resources(struct i40e_vsi *vsi) +{ + int i, err = 0; + + for (i = 0; i < vsi->num_queue_pairs && !err; i++) + err = i40e_setup_tx_descriptors(vsi->tx_rings[i]); + + if (!i40e_enabled_xdp_vsi(vsi)) + return err; + + for (i = 0; i < vsi->num_queue_pairs && !err; i++) + err = i40e_setup_tx_descriptors(vsi->xdp_rings[i]); + + return err; +} + +/** + * i40e_vsi_free_tx_resources - Free Tx resources for VSI queues + * @vsi: ptr to the VSI + * + * Free VSI's transmit software resources + **/ +static void i40e_vsi_free_tx_resources(struct i40e_vsi *vsi) +{ + int i; + + if (vsi->tx_rings) { + for (i = 0; i < vsi->num_queue_pairs; i++) + if (vsi->tx_rings[i] && vsi->tx_rings[i]->desc) + i40e_free_tx_resources(vsi->tx_rings[i]); + } + + if (vsi->xdp_rings) { + for (i = 0; i < vsi->num_queue_pairs; i++) + if (vsi->xdp_rings[i] && vsi->xdp_rings[i]->desc) + i40e_free_tx_resources(vsi->xdp_rings[i]); + } +} + +/** + * i40e_vsi_setup_rx_resources - Allocate VSI queues Rx resources + * @vsi: ptr to the VSI + * + * If this function returns with an error, then it's possible one or + * more of the rings is populated (while the rest are not). It is the + * callers duty to clean those orphaned rings. + * + * Return 0 on success, negative on failure + **/ +static int i40e_vsi_setup_rx_resources(struct i40e_vsi *vsi) +{ + int i, err = 0; + + for (i = 0; i < vsi->num_queue_pairs && !err; i++) + err = i40e_setup_rx_descriptors(vsi->rx_rings[i]); + return err; +} + +/** + * i40e_vsi_free_rx_resources - Free Rx Resources for VSI queues + * @vsi: ptr to the VSI + * + * Free all receive software resources + **/ +static void i40e_vsi_free_rx_resources(struct i40e_vsi *vsi) +{ + int i; + + if (!vsi->rx_rings) + return; + + for (i = 0; i < vsi->num_queue_pairs; i++) + if (vsi->rx_rings[i] && vsi->rx_rings[i]->desc) + i40e_free_rx_resources(vsi->rx_rings[i]); +} + +/** + * i40e_config_xps_tx_ring - Configure XPS for a Tx ring + * @ring: The Tx ring to configure + * + * This enables/disables XPS for a given Tx descriptor ring + * based on the TCs enabled for the VSI that ring belongs to. 
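+ *
+ * XPS (Transmit Packet Steering) steers packets submitted by a given CPU
+ * to a particular Tx queue.  The queue is tied to the CPU returned by
+ * cpumask_local_spread() for this ring's vector, and the mapping is set
+ * only once so that any user-provided XPS configuration is preserved.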
+ **/ +static void i40e_config_xps_tx_ring(struct i40e_ring *ring) +{ + int cpu; + + if (!ring->q_vector || !ring->netdev || ring->ch) + return; + + /* We only initialize XPS once, so as not to overwrite user settings */ + if (test_and_set_bit(__I40E_TX_XPS_INIT_DONE, ring->state)) + return; + + cpu = cpumask_local_spread(ring->q_vector->v_idx, -1); + netif_set_xps_queue(ring->netdev, get_cpu_mask(cpu), + ring->queue_index); +} + +/** + * i40e_configure_tx_ring - Configure a transmit ring context and rest + * @ring: The Tx ring to configure + * + * Configure the Tx descriptor ring in the HMC context. + **/ +static int i40e_configure_tx_ring(struct i40e_ring *ring) +{ + struct i40e_vsi *vsi = ring->vsi; + u16 pf_q = vsi->base_queue + ring->queue_index; + struct i40e_hw *hw = &vsi->back->hw; + struct i40e_hmc_obj_txq tx_ctx; + i40e_status err = 0; + u32 qtx_ctl = 0; + + /* some ATR related tx ring init */ + if (vsi->back->flags & I40E_FLAG_FD_ATR_ENABLED) { + ring->atr_sample_rate = vsi->back->atr_sample_rate; + ring->atr_count = 0; + } else { + ring->atr_sample_rate = 0; + } + + /* configure XPS */ + i40e_config_xps_tx_ring(ring); + + /* clear the context structure first */ + memset(&tx_ctx, 0, sizeof(tx_ctx)); + + tx_ctx.new_context = 1; + tx_ctx.base = (ring->dma / 128); + tx_ctx.qlen = ring->count; + tx_ctx.fd_ena = !!(vsi->back->flags & (I40E_FLAG_FD_SB_ENABLED | + I40E_FLAG_FD_ATR_ENABLED)); + tx_ctx.timesync_ena = !!(vsi->back->flags & I40E_FLAG_PTP); + /* FDIR VSI tx ring can still use RS bit and writebacks */ + if (vsi->type != I40E_VSI_FDIR) + tx_ctx.head_wb_ena = 1; + tx_ctx.head_wb_addr = ring->dma + + (ring->count * sizeof(struct i40e_tx_desc)); + + /* As part of VSI creation/update, FW allocates certain + * Tx arbitration queue sets for each TC enabled for + * the VSI. The FW returns the handles to these queue + * sets as part of the response buffer to Add VSI, + * Update VSI, etc. AQ commands. It is expected that + * these queue set handles be associated with the Tx + * queues by the driver as part of the TX queue context + * initialization. This has to be done regardless of + * DCB as by default everything is mapped to TC0. 
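+ *
+ * The handle is taken from the per-channel info when the ring belongs to
+ * a channel (ring->ch is set), otherwise from the VSI's own qs_handle
+ * array, indexed in both cases by this ring's traffic class.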
+ */ + + if (ring->ch) + tx_ctx.rdylist = + le16_to_cpu(ring->ch->info.qs_handle[ring->dcb_tc]); + + else + tx_ctx.rdylist = le16_to_cpu(vsi->info.qs_handle[ring->dcb_tc]); + + tx_ctx.rdylist_act = 0; + + /* clear the context in the HMC */ + err = i40e_clear_lan_tx_queue_context(hw, pf_q); + if (err) { + dev_info(&vsi->back->pdev->dev, + "Failed to clear LAN Tx queue context on Tx ring %d (pf_q %d), error: %d\n", + ring->queue_index, pf_q, err); + return -ENOMEM; + } + + /* set the context in the HMC */ + err = i40e_set_lan_tx_queue_context(hw, pf_q, &tx_ctx); + if (err) { + dev_info(&vsi->back->pdev->dev, + "Failed to set LAN Tx queue context on Tx ring %d (pf_q %d, error: %d\n", + ring->queue_index, pf_q, err); + return -ENOMEM; + } + + /* Now associate this queue with this PCI function */ + if (ring->ch) { + if (ring->ch->type == I40E_VSI_VMDQ2) + qtx_ctl = I40E_QTX_CTL_VM_QUEUE; + else + return -EINVAL; + + qtx_ctl |= (ring->ch->vsi_number << + I40E_QTX_CTL_VFVM_INDX_SHIFT) & + I40E_QTX_CTL_VFVM_INDX_MASK; + } else { + if (vsi->type == I40E_VSI_VMDQ2) { + qtx_ctl = I40E_QTX_CTL_VM_QUEUE; + qtx_ctl |= ((vsi->id) << I40E_QTX_CTL_VFVM_INDX_SHIFT) & + I40E_QTX_CTL_VFVM_INDX_MASK; + } else { + qtx_ctl = I40E_QTX_CTL_PF_QUEUE; + } + } + + qtx_ctl |= ((hw->pf_id << I40E_QTX_CTL_PF_INDX_SHIFT) & + I40E_QTX_CTL_PF_INDX_MASK); + wr32(hw, I40E_QTX_CTL(pf_q), qtx_ctl); + i40e_flush(hw); + + /* cache tail off for easier writes later */ + ring->tail = hw->hw_addr + I40E_QTX_TAIL(pf_q); + + return 0; +} + +/** + * i40e_configure_rx_ring - Configure a receive ring context + * @ring: The Rx ring to configure + * + * Configure the Rx descriptor ring in the HMC context. + **/ +static int i40e_configure_rx_ring(struct i40e_ring *ring) +{ + struct i40e_vsi *vsi = ring->vsi; + u32 chain_len = vsi->back->hw.func_caps.rx_buf_chain_len; + u16 pf_q = vsi->base_queue + ring->queue_index; + struct i40e_hw *hw = &vsi->back->hw; + struct i40e_hmc_obj_rxq rx_ctx; + i40e_status err = 0; + + bitmap_zero(ring->state, __I40E_RING_STATE_NBITS); + + /* clear the context structure first */ + memset(&rx_ctx, 0, sizeof(rx_ctx)); + + ring->rx_buf_len = vsi->rx_buf_len; + + rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len, + BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT)); + + rx_ctx.base = (ring->dma / 128); + rx_ctx.qlen = ring->count; + + /* use 32 byte descriptors */ + rx_ctx.dsize = 1; + + /* descriptor type is always zero + * rx_ctx.dtype = 0; + */ + rx_ctx.hsplit_0 = 0; + + rx_ctx.rxmax = min_t(u16, vsi->max_frame, chain_len * ring->rx_buf_len); + if (hw->revision_id == 0) + rx_ctx.lrxqthresh = 0; + else + rx_ctx.lrxqthresh = 1; + rx_ctx.crcstrip = 1; + rx_ctx.l2tsel = 1; + /* this controls whether VLAN is stripped from inner headers */ + rx_ctx.showiv = 0; + /* set the prefena field to 1 because the manual says to */ + rx_ctx.prefena = 1; + + /* clear the context in the HMC */ + err = i40e_clear_lan_rx_queue_context(hw, pf_q); + if (err) { + dev_info(&vsi->back->pdev->dev, + "Failed to clear LAN Rx queue context on Rx ring %d (pf_q %d), error: %d\n", + ring->queue_index, pf_q, err); + return -ENOMEM; + } + + /* set the context in the HMC */ + err = i40e_set_lan_rx_queue_context(hw, pf_q, &rx_ctx); + if (err) { + dev_info(&vsi->back->pdev->dev, + "Failed to set LAN Rx queue context on Rx ring %d (pf_q %d), error: %d\n", + ring->queue_index, pf_q, err); + return -ENOMEM; + } + + /* configure Rx buffer alignment */ + if (!vsi->netdev || (vsi->back->flags & I40E_FLAG_LEGACY_RX)) + clear_ring_build_skb_enabled(ring); + else + 
set_ring_build_skb_enabled(ring); + + /* cache tail for quicker writes, and clear the reg before use */ + ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q); + writel(0, ring->tail); + + i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); + + return 0; +} + +/** + * i40e_vsi_configure_tx - Configure the VSI for Tx + * @vsi: VSI structure describing this set of rings and resources + * + * Configure the Tx VSI for operation. + **/ +static int i40e_vsi_configure_tx(struct i40e_vsi *vsi) +{ + int err = 0; + u16 i; + + for (i = 0; (i < vsi->num_queue_pairs) && !err; i++) + err = i40e_configure_tx_ring(vsi->tx_rings[i]); + + if (!i40e_enabled_xdp_vsi(vsi)) + return err; + + for (i = 0; (i < vsi->num_queue_pairs) && !err; i++) + err = i40e_configure_tx_ring(vsi->xdp_rings[i]); + + return err; +} + +/** + * i40e_vsi_configure_rx - Configure the VSI for Rx + * @vsi: the VSI being configured + * + * Configure the Rx VSI for operation. + **/ +static int i40e_vsi_configure_rx(struct i40e_vsi *vsi) +{ + int err = 0; + u16 i; + + if (!vsi->netdev || (vsi->back->flags & I40E_FLAG_LEGACY_RX)) { + vsi->max_frame = I40E_MAX_RXBUFFER; + vsi->rx_buf_len = I40E_RXBUFFER_2048; +#if (PAGE_SIZE < 8192) + } else if (!I40E_2K_TOO_SMALL_WITH_PADDING && + (vsi->netdev->mtu <= ETH_DATA_LEN)) { + vsi->max_frame = I40E_RXBUFFER_1536 - NET_IP_ALIGN; + vsi->rx_buf_len = I40E_RXBUFFER_1536 - NET_IP_ALIGN; +#endif + } else { + vsi->max_frame = I40E_MAX_RXBUFFER; + vsi->rx_buf_len = (PAGE_SIZE < 8192) ? I40E_RXBUFFER_3072 : + I40E_RXBUFFER_2048; + } + + /* set up individual rings */ + for (i = 0; i < vsi->num_queue_pairs && !err; i++) + err = i40e_configure_rx_ring(vsi->rx_rings[i]); + + return err; +} + +/** + * i40e_vsi_config_dcb_rings - Update rings to reflect DCB TC + * @vsi: ptr to the VSI + **/ +static void i40e_vsi_config_dcb_rings(struct i40e_vsi *vsi) +{ + struct i40e_ring *tx_ring, *rx_ring; + u16 qoffset, qcount; + int i, n; + + if (!(vsi->back->flags & I40E_FLAG_DCB_ENABLED)) { + /* Reset the TC information */ + for (i = 0; i < vsi->num_queue_pairs; i++) { + rx_ring = vsi->rx_rings[i]; + tx_ring = vsi->tx_rings[i]; + rx_ring->dcb_tc = 0; + tx_ring->dcb_tc = 0; + } + return; + } + + for (n = 0; n < I40E_MAX_TRAFFIC_CLASS; n++) { + if (!(vsi->tc_config.enabled_tc & BIT_ULL(n))) + continue; + + qoffset = vsi->tc_config.tc_info[n].qoffset; + qcount = vsi->tc_config.tc_info[n].qcount; + for (i = qoffset; i < (qoffset + qcount); i++) { + rx_ring = vsi->rx_rings[i]; + tx_ring = vsi->tx_rings[i]; + rx_ring->dcb_tc = n; + tx_ring->dcb_tc = n; + } + } +} + +/** + * i40e_set_vsi_rx_mode - Call set_rx_mode on a VSI + * @vsi: ptr to the VSI + **/ +static void i40e_set_vsi_rx_mode(struct i40e_vsi *vsi) +{ + if (vsi->netdev) + i40e_set_rx_mode(vsi->netdev); +} + +/** + * i40e_fdir_filter_restore - Restore the Sideband Flow Director filters + * @vsi: Pointer to the targeted VSI + * + * This function replays the hlist on the hw where all the SB Flow Director + * filters were saved. 
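+ *
+ * The per-protocol filter counters are cleared up front; replaying the
+ * saved filters through i40e_add_del_fdir() is expected to rebuild the
+ * counts as each filter is programmed back into the hardware.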
+ **/ +static void i40e_fdir_filter_restore(struct i40e_vsi *vsi) +{ + struct i40e_fdir_filter *filter; + struct i40e_pf *pf = vsi->back; + struct hlist_node *node; + + if (!(pf->flags & I40E_FLAG_FD_SB_ENABLED)) + return; + + /* Reset FDir counters as we're replaying all existing filters */ + pf->fd_tcp4_filter_cnt = 0; + pf->fd_udp4_filter_cnt = 0; + pf->fd_sctp4_filter_cnt = 0; + pf->fd_ip4_filter_cnt = 0; + + hlist_for_each_entry_safe(filter, node, + &pf->fdir_filter_list, fdir_node) { + i40e_add_del_fdir(vsi, filter, true); + } +} + +/** + * i40e_vsi_configure - Set up the VSI for action + * @vsi: the VSI being configured + **/ +static int i40e_vsi_configure(struct i40e_vsi *vsi) +{ + int err; + + i40e_set_vsi_rx_mode(vsi); + i40e_restore_vlan(vsi); + i40e_vsi_config_dcb_rings(vsi); + err = i40e_vsi_configure_tx(vsi); + if (!err) + err = i40e_vsi_configure_rx(vsi); + + return err; +} + +/** + * i40e_vsi_configure_msix - MSIX mode Interrupt Config in the HW + * @vsi: the VSI being configured + **/ +static void i40e_vsi_configure_msix(struct i40e_vsi *vsi) +{ + bool has_xdp = i40e_enabled_xdp_vsi(vsi); + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + u16 vector; + int i, q; + u32 qp; + + /* The interrupt indexing is offset by 1 in the PFINT_ITRn + * and PFINT_LNKLSTn registers, e.g.: + * PFINT_ITRn[0..n-1] gets msix-1..msix-n (qpair interrupts) + */ + qp = vsi->base_queue; + vector = vsi->base_vector; + for (i = 0; i < vsi->num_q_vectors; i++, vector++) { + struct i40e_q_vector *q_vector = vsi->q_vectors[i]; + + q_vector->rx.next_update = jiffies + 1; + q_vector->rx.target_itr = + ITR_TO_REG(vsi->rx_rings[i]->itr_setting); + wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, vector - 1), + q_vector->rx.target_itr); + q_vector->rx.current_itr = q_vector->rx.target_itr; + + q_vector->tx.next_update = jiffies + 1; + q_vector->tx.target_itr = + ITR_TO_REG(vsi->tx_rings[i]->itr_setting); + wr32(hw, I40E_PFINT_ITRN(I40E_TX_ITR, vector - 1), + q_vector->tx.target_itr); + q_vector->tx.current_itr = q_vector->tx.target_itr; + + wr32(hw, I40E_PFINT_RATEN(vector - 1), + i40e_intrl_usec_to_reg(vsi->int_rate_limit)); + + /* Linked list for the queuepairs assigned to this vector */ + wr32(hw, I40E_PFINT_LNKLSTN(vector - 1), qp); + for (q = 0; q < q_vector->num_ringpairs; q++) { + u32 nextqp = has_xdp ? 
qp + vsi->alloc_queue_pairs : qp; + u32 val; + + val = I40E_QINT_RQCTL_CAUSE_ENA_MASK | + (I40E_RX_ITR << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | + (vector << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | + (nextqp << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_TX << + I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT); + + wr32(hw, I40E_QINT_RQCTL(qp), val); + + if (has_xdp) { + val = I40E_QINT_TQCTL_CAUSE_ENA_MASK | + (I40E_TX_ITR << I40E_QINT_TQCTL_ITR_INDX_SHIFT) | + (vector << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | + (qp << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_TX << + I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT); + + wr32(hw, I40E_QINT_TQCTL(nextqp), val); + } + + val = I40E_QINT_TQCTL_CAUSE_ENA_MASK | + (I40E_TX_ITR << I40E_QINT_TQCTL_ITR_INDX_SHIFT) | + (vector << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | + ((qp + 1) << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << + I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT); + + /* Terminate the linked list */ + if (q == (q_vector->num_ringpairs - 1)) + val |= (I40E_QUEUE_END_OF_LIST << + I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT); + + wr32(hw, I40E_QINT_TQCTL(qp), val); + qp++; + } + } + + i40e_flush(hw); +} + +/** + * i40e_enable_misc_int_causes - enable the non-queue interrupts + * @hw: ptr to the hardware info + **/ +static void i40e_enable_misc_int_causes(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + u32 val; + + /* clear things first */ + wr32(hw, I40E_PFINT_ICR0_ENA, 0); /* disable all */ + rd32(hw, I40E_PFINT_ICR0); /* read to clear */ + + val = I40E_PFINT_ICR0_ENA_ECC_ERR_MASK | + I40E_PFINT_ICR0_ENA_MAL_DETECT_MASK | + I40E_PFINT_ICR0_ENA_GRST_MASK | + I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_MASK | + I40E_PFINT_ICR0_ENA_GPIO_MASK | + I40E_PFINT_ICR0_ENA_HMC_ERR_MASK | + I40E_PFINT_ICR0_ENA_VFLR_MASK | + I40E_PFINT_ICR0_ENA_ADMINQ_MASK; + + if (pf->flags & I40E_FLAG_IWARP_ENABLED) + val |= I40E_PFINT_ICR0_ENA_PE_CRITERR_MASK; + + if (pf->flags & I40E_FLAG_PTP) + val |= I40E_PFINT_ICR0_ENA_TIMESYNC_MASK; + + wr32(hw, I40E_PFINT_ICR0_ENA, val); + + /* SW_ITR_IDX = 0, but don't change INTENA */ + wr32(hw, I40E_PFINT_DYN_CTL0, I40E_PFINT_DYN_CTL0_SW_ITR_INDX_MASK | + I40E_PFINT_DYN_CTL0_INTENA_MSK_MASK); + + /* OTHER_ITR_IDX = 0 */ + wr32(hw, I40E_PFINT_STAT_CTL0, 0); +} + +/** + * i40e_configure_msi_and_legacy - Legacy mode interrupt config in the HW + * @vsi: the VSI being configured + **/ +static void i40e_configure_msi_and_legacy(struct i40e_vsi *vsi) +{ + u32 nextqp = i40e_enabled_xdp_vsi(vsi) ? 
vsi->alloc_queue_pairs : 0; + struct i40e_q_vector *q_vector = vsi->q_vectors[0]; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + u32 val; + + /* set the ITR configuration */ + q_vector->rx.next_update = jiffies + 1; + q_vector->rx.target_itr = ITR_TO_REG(vsi->rx_rings[0]->itr_setting); + wr32(hw, I40E_PFINT_ITR0(I40E_RX_ITR), q_vector->rx.target_itr); + q_vector->rx.current_itr = q_vector->rx.target_itr; + q_vector->tx.next_update = jiffies + 1; + q_vector->tx.target_itr = ITR_TO_REG(vsi->tx_rings[0]->itr_setting); + wr32(hw, I40E_PFINT_ITR0(I40E_TX_ITR), q_vector->tx.target_itr); + q_vector->tx.current_itr = q_vector->tx.target_itr; + + i40e_enable_misc_int_causes(pf); + + /* FIRSTQ_INDX = 0, FIRSTQ_TYPE = 0 (rx) */ + wr32(hw, I40E_PFINT_LNKLST0, 0); + + /* Associate the queue pair to the vector and enable the queue int */ + val = I40E_QINT_RQCTL_CAUSE_ENA_MASK | + (I40E_RX_ITR << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | + (nextqp << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT)| + (I40E_QUEUE_TYPE_TX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT); + + wr32(hw, I40E_QINT_RQCTL(0), val); + + if (i40e_enabled_xdp_vsi(vsi)) { + val = I40E_QINT_TQCTL_CAUSE_ENA_MASK | + (I40E_TX_ITR << I40E_QINT_TQCTL_ITR_INDX_SHIFT)| + (I40E_QUEUE_TYPE_TX + << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT); + + wr32(hw, I40E_QINT_TQCTL(nextqp), val); + } + + val = I40E_QINT_TQCTL_CAUSE_ENA_MASK | + (I40E_TX_ITR << I40E_QINT_TQCTL_ITR_INDX_SHIFT) | + (I40E_QUEUE_END_OF_LIST << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT); + + wr32(hw, I40E_QINT_TQCTL(0), val); + i40e_flush(hw); +} + +/** + * i40e_irq_dynamic_disable_icr0 - Disable default interrupt generation for icr0 + * @pf: board private structure + **/ +void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + + wr32(hw, I40E_PFINT_DYN_CTL0, + I40E_ITR_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT); + i40e_flush(hw); +} + +/** + * i40e_irq_dynamic_enable_icr0 - Enable default interrupt generation for icr0 + * @pf: board private structure + **/ +void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + u32 val; + + val = I40E_PFINT_DYN_CTL0_INTENA_MASK | + I40E_PFINT_DYN_CTL0_CLEARPBA_MASK | + (I40E_ITR_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT); + + wr32(hw, I40E_PFINT_DYN_CTL0, val); + i40e_flush(hw); +} + +/** + * i40e_msix_clean_rings - MSIX mode Interrupt Handler + * @irq: interrupt number + * @data: pointer to a q_vector + **/ +static irqreturn_t i40e_msix_clean_rings(int irq, void *data) +{ + struct i40e_q_vector *q_vector = data; + + if (!q_vector->tx.ring && !q_vector->rx.ring) + return IRQ_HANDLED; + + napi_schedule_irqoff(&q_vector->napi); + + return IRQ_HANDLED; +} + +/** + * i40e_irq_affinity_notify - Callback for affinity changes + * @notify: context as to what irq was changed + * @mask: the new affinity mask + * + * This is a callback function used by the irq_set_affinity_notifier function + * so that we may register to receive changes to the irq affinity masks. + **/ +static void i40e_irq_affinity_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct i40e_q_vector *q_vector = + container_of(notify, struct i40e_q_vector, affinity_notify); + + cpumask_copy(&q_vector->affinity_mask, mask); +} + +/** + * i40e_irq_affinity_release - Callback for affinity notifier release + * @ref: internal core kernel usage + * + * This is a callback function used by the irq_set_affinity_notifier function + * to inform the current notification subscriber that they will no longer + * receive notifications. 
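+ *
+ * The driver keeps no per-notifier state, so the body is intentionally
+ * empty; the callback exists only because the affinity-notifier API
+ * expects a release handler for its kref.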
+ **/ +static void i40e_irq_affinity_release(struct kref *ref) {} + +/** + * i40e_vsi_request_irq_msix - Initialize MSI-X interrupts + * @vsi: the VSI being configured + * @basename: name for the vector + * + * Allocates MSI-X vectors and requests interrupts from the kernel. + **/ +static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename) +{ + int q_vectors = vsi->num_q_vectors; + struct i40e_pf *pf = vsi->back; + int base = vsi->base_vector; + int rx_int_idx = 0; + int tx_int_idx = 0; + int vector, err; + int irq_num; + int cpu; + + for (vector = 0; vector < q_vectors; vector++) { + struct i40e_q_vector *q_vector = vsi->q_vectors[vector]; + + irq_num = pf->msix_entries[base + vector].vector; + + if (q_vector->tx.ring && q_vector->rx.ring) { + snprintf(q_vector->name, sizeof(q_vector->name) - 1, + "%s-%s-%d", basename, "TxRx", rx_int_idx++); + tx_int_idx++; + } else if (q_vector->rx.ring) { + snprintf(q_vector->name, sizeof(q_vector->name) - 1, + "%s-%s-%d", basename, "rx", rx_int_idx++); + } else if (q_vector->tx.ring) { + snprintf(q_vector->name, sizeof(q_vector->name) - 1, + "%s-%s-%d", basename, "tx", tx_int_idx++); + } else { + /* skip this unused q_vector */ + continue; + } + err = request_irq(irq_num, + vsi->irq_handler, + 0, + q_vector->name, + q_vector); + if (err) { + dev_info(&pf->pdev->dev, + "MSIX request_irq failed, error: %d\n", err); + goto free_queue_irqs; + } + + /* register for affinity change notifications */ + q_vector->affinity_notify.notify = i40e_irq_affinity_notify; + q_vector->affinity_notify.release = i40e_irq_affinity_release; + irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify); + /* Spread affinity hints out across online CPUs. + * + * get_cpu_mask returns a static constant mask with + * a permanent lifetime so it's ok to pass to + * irq_set_affinity_hint without making a copy. 
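+ *
+ * The hint is advisory only; if user space later moves the interrupt,
+ * the affinity notifier registered above updates
+ * q_vector->affinity_mask to match.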
+ */ + cpu = cpumask_local_spread(q_vector->v_idx, -1); + irq_set_affinity_hint(irq_num, get_cpu_mask(cpu)); + } + + vsi->irqs_ready = true; + return 0; + +free_queue_irqs: + while (vector) { + vector--; + irq_num = pf->msix_entries[base + vector].vector; + irq_set_affinity_notifier(irq_num, NULL); + irq_set_affinity_hint(irq_num, NULL); + free_irq(irq_num, &vsi->q_vectors[vector]); + } + return err; +} + +/** + * i40e_vsi_disable_irq - Mask off queue interrupt generation on the VSI + * @vsi: the VSI being un-configured + **/ +static void i40e_vsi_disable_irq(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + int base = vsi->base_vector; + int i; + + /* disable interrupt causation from each queue */ + for (i = 0; i < vsi->num_queue_pairs; i++) { + u32 val; + + val = rd32(hw, I40E_QINT_TQCTL(vsi->tx_rings[i]->reg_idx)); + val &= ~I40E_QINT_TQCTL_CAUSE_ENA_MASK; + wr32(hw, I40E_QINT_TQCTL(vsi->tx_rings[i]->reg_idx), val); + + val = rd32(hw, I40E_QINT_RQCTL(vsi->rx_rings[i]->reg_idx)); + val &= ~I40E_QINT_RQCTL_CAUSE_ENA_MASK; + wr32(hw, I40E_QINT_RQCTL(vsi->rx_rings[i]->reg_idx), val); + + if (!i40e_enabled_xdp_vsi(vsi)) + continue; + wr32(hw, I40E_QINT_TQCTL(vsi->xdp_rings[i]->reg_idx), 0); + } + + /* disable each interrupt */ + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { + for (i = vsi->base_vector; + i < (vsi->num_q_vectors + vsi->base_vector); i++) + wr32(hw, I40E_PFINT_DYN_CTLN(i - 1), 0); + + i40e_flush(hw); + for (i = 0; i < vsi->num_q_vectors; i++) + synchronize_irq(pf->msix_entries[i + base].vector); + } else { + /* Legacy and MSI mode - this stops all interrupt handling */ + wr32(hw, I40E_PFINT_ICR0_ENA, 0); + wr32(hw, I40E_PFINT_DYN_CTL0, 0); + i40e_flush(hw); + synchronize_irq(pf->pdev->irq); + } +} + +/** + * i40e_vsi_enable_irq - Enable IRQ for the given VSI + * @vsi: the VSI being configured + **/ +static int i40e_vsi_enable_irq(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + int i; + + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { + for (i = 0; i < vsi->num_q_vectors; i++) + i40e_irq_dynamic_enable(vsi, i); + } else { + i40e_irq_dynamic_enable_icr0(pf); + } + + i40e_flush(&pf->hw); + return 0; +} + +/** + * i40e_free_misc_vector - Free the vector that handles non-queue events + * @pf: board private structure + **/ +static void i40e_free_misc_vector(struct i40e_pf *pf) +{ + /* Disable ICR 0 */ + wr32(&pf->hw, I40E_PFINT_ICR0_ENA, 0); + i40e_flush(&pf->hw); + + if (pf->flags & I40E_FLAG_MSIX_ENABLED && pf->msix_entries) { + synchronize_irq(pf->msix_entries[0].vector); + free_irq(pf->msix_entries[0].vector, pf); + clear_bit(__I40E_MISC_IRQ_REQUESTED, pf->state); + } +} + +/** + * i40e_intr - MSI/Legacy and non-queue interrupt handler + * @irq: interrupt number + * @data: pointer to a q_vector + * + * This is the handler used for all MSI/Legacy interrupts, and deals + * with both queue and non-queue interrupts. This is also used in + * MSIX mode to handle the non-queue interrupts. 
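+ *
+ * Causes that need deferred handling (AdminQ, malicious driver detect,
+ * VFLR, resets) are masked out of ICR0_ENA and flagged in pf->state for
+ * the service task, while the single queue-pair cause is handed to NAPI.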
+ **/ +static irqreturn_t i40e_intr(int irq, void *data) +{ + struct i40e_pf *pf = (struct i40e_pf *)data; + struct i40e_hw *hw = &pf->hw; + irqreturn_t ret = IRQ_NONE; + u32 icr0, icr0_remaining; + u32 val, ena_mask; + + icr0 = rd32(hw, I40E_PFINT_ICR0); + ena_mask = rd32(hw, I40E_PFINT_ICR0_ENA); + + /* if sharing a legacy IRQ, we might get called w/o an intr pending */ + if ((icr0 & I40E_PFINT_ICR0_INTEVENT_MASK) == 0) + goto enable_intr; + + /* if interrupt but no bits showing, must be SWINT */ + if (((icr0 & ~I40E_PFINT_ICR0_INTEVENT_MASK) == 0) || + (icr0 & I40E_PFINT_ICR0_SWINT_MASK)) + pf->sw_int_count++; + + if ((pf->flags & I40E_FLAG_IWARP_ENABLED) && + (icr0 & I40E_PFINT_ICR0_ENA_PE_CRITERR_MASK)) { + ena_mask &= ~I40E_PFINT_ICR0_ENA_PE_CRITERR_MASK; + dev_dbg(&pf->pdev->dev, "cleared PE_CRITERR\n"); + set_bit(__I40E_CORE_RESET_REQUESTED, pf->state); + } + + /* only q0 is used in MSI/Legacy mode, and none are used in MSIX */ + if (icr0 & I40E_PFINT_ICR0_QUEUE_0_MASK) { + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + struct i40e_q_vector *q_vector = vsi->q_vectors[0]; + + /* We do not have a way to disarm Queue causes while leaving + * interrupt enabled for all other causes, ideally + * interrupt should be disabled while we are in NAPI but + * this is not a performance path and napi_schedule() + * can deal with rescheduling. + */ + if (!test_bit(__I40E_DOWN, pf->state)) + napi_schedule_irqoff(&q_vector->napi); + } + + if (icr0 & I40E_PFINT_ICR0_ADMINQ_MASK) { + ena_mask &= ~I40E_PFINT_ICR0_ENA_ADMINQ_MASK; + set_bit(__I40E_ADMINQ_EVENT_PENDING, pf->state); + i40e_debug(&pf->hw, I40E_DEBUG_NVM, "AdminQ event\n"); + } + + if (icr0 & I40E_PFINT_ICR0_MAL_DETECT_MASK) { + ena_mask &= ~I40E_PFINT_ICR0_ENA_MAL_DETECT_MASK; + set_bit(__I40E_MDD_EVENT_PENDING, pf->state); + } + + if (icr0 & I40E_PFINT_ICR0_VFLR_MASK) { + ena_mask &= ~I40E_PFINT_ICR0_ENA_VFLR_MASK; + set_bit(__I40E_VFLR_EVENT_PENDING, pf->state); + } + + if (icr0 & I40E_PFINT_ICR0_GRST_MASK) { + if (!test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) + set_bit(__I40E_RESET_INTR_RECEIVED, pf->state); + ena_mask &= ~I40E_PFINT_ICR0_ENA_GRST_MASK; + val = rd32(hw, I40E_GLGEN_RSTAT); + val = (val & I40E_GLGEN_RSTAT_RESET_TYPE_MASK) + >> I40E_GLGEN_RSTAT_RESET_TYPE_SHIFT; + if (val == I40E_RESET_CORER) { + pf->corer_count++; + } else if (val == I40E_RESET_GLOBR) { + pf->globr_count++; + } else if (val == I40E_RESET_EMPR) { + pf->empr_count++; + set_bit(__I40E_EMP_RESET_INTR_RECEIVED, pf->state); + } + } + + if (icr0 & I40E_PFINT_ICR0_HMC_ERR_MASK) { + icr0 &= ~I40E_PFINT_ICR0_HMC_ERR_MASK; + dev_info(&pf->pdev->dev, "HMC error interrupt\n"); + dev_info(&pf->pdev->dev, "HMC error info 0x%x, HMC error data 0x%x\n", + rd32(hw, I40E_PFHMC_ERRORINFO), + rd32(hw, I40E_PFHMC_ERRORDATA)); + } + + if (icr0 & I40E_PFINT_ICR0_TIMESYNC_MASK) { + u32 prttsyn_stat = rd32(hw, I40E_PRTTSYN_STAT_0); + + if (prttsyn_stat & I40E_PRTTSYN_STAT_0_TXTIME_MASK) { + icr0 &= ~I40E_PFINT_ICR0_ENA_TIMESYNC_MASK; + i40e_ptp_tx_hwtstamp(pf); + } + } + + /* If a critical error is pending we have no choice but to reset the + * device. + * Report and mask out any remaining unexpected interrupts. 
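+ *
+ * Only PE_CRITERR, PCI_EXCEPTION and ECC_ERR escalate to a PF reset
+ * request; any other unexpected cause is merely logged and disabled.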
+ */ + icr0_remaining = icr0 & ena_mask; + if (icr0_remaining) { + dev_info(&pf->pdev->dev, "unhandled interrupt icr0=0x%08x\n", + icr0_remaining); + if ((icr0_remaining & I40E_PFINT_ICR0_PE_CRITERR_MASK) || + (icr0_remaining & I40E_PFINT_ICR0_PCI_EXCEPTION_MASK) || + (icr0_remaining & I40E_PFINT_ICR0_ECC_ERR_MASK)) { + dev_info(&pf->pdev->dev, "device will be reset\n"); + set_bit(__I40E_PF_RESET_REQUESTED, pf->state); + i40e_service_event_schedule(pf); + } + ena_mask &= ~icr0_remaining; + } + ret = IRQ_HANDLED; + +enable_intr: + /* re-enable interrupt causes */ + wr32(hw, I40E_PFINT_ICR0_ENA, ena_mask); + if (!test_bit(__I40E_DOWN, pf->state)) { + i40e_service_event_schedule(pf); + i40e_irq_dynamic_enable_icr0(pf); + } + + return ret; +} + +/** + * i40e_clean_fdir_tx_irq - Reclaim resources after transmit completes + * @tx_ring: tx ring to clean + * @budget: how many cleans we're allowed + * + * Returns true if there's any budget left (e.g. the clean is finished) + **/ +static bool i40e_clean_fdir_tx_irq(struct i40e_ring *tx_ring, int budget) +{ + struct i40e_vsi *vsi = tx_ring->vsi; + u16 i = tx_ring->next_to_clean; + struct i40e_tx_buffer *tx_buf; + struct i40e_tx_desc *tx_desc; + + tx_buf = &tx_ring->tx_bi[i]; + tx_desc = I40E_TX_DESC(tx_ring, i); + i -= tx_ring->count; + + do { + struct i40e_tx_desc *eop_desc = tx_buf->next_to_watch; + + /* if next_to_watch is not set then there is no work pending */ + if (!eop_desc) + break; + + /* prevent any other reads prior to eop_desc */ + smp_rmb(); + + /* if the descriptor isn't done, no work yet to do */ + if (!(eop_desc->cmd_type_offset_bsz & + cpu_to_le64(I40E_TX_DESC_DTYPE_DESC_DONE))) + break; + + /* clear next_to_watch to prevent false hangs */ + tx_buf->next_to_watch = NULL; + + tx_desc->buffer_addr = 0; + tx_desc->cmd_type_offset_bsz = 0; + /* move past filter desc */ + tx_buf++; + tx_desc++; + i++; + if (unlikely(!i)) { + i -= tx_ring->count; + tx_buf = tx_ring->tx_bi; + tx_desc = I40E_TX_DESC(tx_ring, 0); + } + /* unmap skb header data */ + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), + DMA_TO_DEVICE); + if (tx_buf->tx_flags & I40E_TX_FLAGS_FD_SB) + kfree(tx_buf->raw_buf); + + tx_buf->raw_buf = NULL; + tx_buf->tx_flags = 0; + tx_buf->next_to_watch = NULL; + dma_unmap_len_set(tx_buf, len, 0); + tx_desc->buffer_addr = 0; + tx_desc->cmd_type_offset_bsz = 0; + + /* move us past the eop_desc for start of next FD desc */ + tx_buf++; + tx_desc++; + i++; + if (unlikely(!i)) { + i -= tx_ring->count; + tx_buf = tx_ring->tx_bi; + tx_desc = I40E_TX_DESC(tx_ring, 0); + } + + /* update budget accounting */ + budget--; + } while (likely(budget)); + + i += tx_ring->count; + tx_ring->next_to_clean = i; + + if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) + i40e_irq_dynamic_enable(vsi, tx_ring->q_vector->v_idx); + + return budget > 0; +} + +/** + * i40e_fdir_clean_ring - Interrupt Handler for FDIR SB ring + * @irq: interrupt number + * @data: pointer to a q_vector + **/ +static irqreturn_t i40e_fdir_clean_ring(int irq, void *data) +{ + struct i40e_q_vector *q_vector = data; + struct i40e_vsi *vsi; + + if (!q_vector->tx.ring) + return IRQ_HANDLED; + + vsi = q_vector->tx.ring->vsi; + i40e_clean_fdir_tx_irq(q_vector->tx.ring, vsi->work_limit); + + return IRQ_HANDLED; +} + +/** + * i40e_map_vector_to_qp - Assigns the queue pair to the vector + * @vsi: the VSI being configured + * @v_idx: vector index + * @qp_idx: queue pair index + **/ +static void i40e_map_vector_to_qp(struct i40e_vsi *vsi, int v_idx, int 
qp_idx) +{ + struct i40e_q_vector *q_vector = vsi->q_vectors[v_idx]; + struct i40e_ring *tx_ring = vsi->tx_rings[qp_idx]; + struct i40e_ring *rx_ring = vsi->rx_rings[qp_idx]; + + tx_ring->q_vector = q_vector; + tx_ring->next = q_vector->tx.ring; + q_vector->tx.ring = tx_ring; + q_vector->tx.count++; + + /* Place XDP Tx ring in the same q_vector ring list as regular Tx */ + if (i40e_enabled_xdp_vsi(vsi)) { + struct i40e_ring *xdp_ring = vsi->xdp_rings[qp_idx]; + + xdp_ring->q_vector = q_vector; + xdp_ring->next = q_vector->tx.ring; + q_vector->tx.ring = xdp_ring; + q_vector->tx.count++; + } + + rx_ring->q_vector = q_vector; + rx_ring->next = q_vector->rx.ring; + q_vector->rx.ring = rx_ring; + q_vector->rx.count++; +} + +/** + * i40e_vsi_map_rings_to_vectors - Maps descriptor rings to vectors + * @vsi: the VSI being configured + * + * This function maps descriptor rings to the queue-specific vectors + * we were allotted through the MSI-X enabling code. Ideally, we'd have + * one vector per queue pair, but on a constrained vector budget, we + * group the queue pairs as "efficiently" as possible. + **/ +static void i40e_vsi_map_rings_to_vectors(struct i40e_vsi *vsi) +{ + int qp_remaining = vsi->num_queue_pairs; + int q_vectors = vsi->num_q_vectors; + int num_ringpairs; + int v_start = 0; + int qp_idx = 0; + + /* If we don't have enough vectors for a 1-to-1 mapping, we'll have to + * group them so there are multiple queues per vector. + * It is also important to go through all the vectors available to be + * sure that if we don't use all the vectors, that the remaining vectors + * are cleared. This is especially important when decreasing the + * number of queues in use. + */ + for (; v_start < q_vectors; v_start++) { + struct i40e_q_vector *q_vector = vsi->q_vectors[v_start]; + + num_ringpairs = DIV_ROUND_UP(qp_remaining, q_vectors - v_start); + + q_vector->num_ringpairs = num_ringpairs; + q_vector->reg_idx = q_vector->v_idx + vsi->base_vector - 1; + + q_vector->rx.count = 0; + q_vector->tx.count = 0; + q_vector->rx.ring = NULL; + q_vector->tx.ring = NULL; + + while (num_ringpairs--) { + i40e_map_vector_to_qp(vsi, v_start, qp_idx); + qp_idx++; + qp_remaining--; + } + } +} + +/** + * i40e_vsi_request_irq - Request IRQ from the OS + * @vsi: the VSI being configured + * @basename: name for the vector + **/ +static int i40e_vsi_request_irq(struct i40e_vsi *vsi, char *basename) +{ + struct i40e_pf *pf = vsi->back; + int err; + + if (pf->flags & I40E_FLAG_MSIX_ENABLED) + err = i40e_vsi_request_irq_msix(vsi, basename); + else if (pf->flags & I40E_FLAG_MSI_ENABLED) + err = request_irq(pf->pdev->irq, i40e_intr, 0, + pf->int_name, pf); + else + err = request_irq(pf->pdev->irq, i40e_intr, IRQF_SHARED, + pf->int_name, pf); + + if (err) + dev_info(&pf->pdev->dev, "request_irq failed, Error %d\n", err); + + return err; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +/** + * i40e_netpoll - A Polling 'interrupt' handler + * @netdev: network interface device structure + * + * This is used by netconsole to send skbs without having to re-enable + * interrupts. It's not called while the normal interrupt routine is executing. 
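+ *
+ * In MSI-X mode the handler of every queue vector is invoked directly;
+ * in MSI/legacy mode the shared i40e_intr() handler is called instead.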
+ **/ +static void i40e_netpoll(struct net_device *netdev) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + int i; + + /* if interface is down do nothing */ + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return; + + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { + for (i = 0; i < vsi->num_q_vectors; i++) + i40e_msix_clean_rings(0, vsi->q_vectors[i]); + } else { + i40e_intr(pf->pdev->irq, netdev); + } +} +#endif + +#define I40E_QTX_ENA_WAIT_COUNT 50 + +/** + * i40e_pf_txq_wait - Wait for a PF's Tx queue to be enabled or disabled + * @pf: the PF being configured + * @pf_q: the PF queue + * @enable: enable or disable state of the queue + * + * This routine will wait for the given Tx queue of the PF to reach the + * enabled or disabled state. + * Returns -ETIMEDOUT in case of failing to reach the requested state after + * multiple retries; else will return 0 in case of success. + **/ +static int i40e_pf_txq_wait(struct i40e_pf *pf, int pf_q, bool enable) +{ + int i; + u32 tx_reg; + + for (i = 0; i < I40E_QUEUE_WAIT_RETRY_LIMIT; i++) { + tx_reg = rd32(&pf->hw, I40E_QTX_ENA(pf_q)); + if (enable == !!(tx_reg & I40E_QTX_ENA_QENA_STAT_MASK)) + break; + + usleep_range(10, 20); + } + if (i >= I40E_QUEUE_WAIT_RETRY_LIMIT) + return -ETIMEDOUT; + + return 0; +} + +/** + * i40e_control_tx_q - Start or stop a particular Tx queue + * @pf: the PF structure + * @pf_q: the PF queue to configure + * @enable: start or stop the queue + * + * This function enables or disables a single queue. Note that any delay + * required after the operation is expected to be handled by the caller of + * this function. + **/ +static void i40e_control_tx_q(struct i40e_pf *pf, int pf_q, bool enable) +{ + struct i40e_hw *hw = &pf->hw; + u32 tx_reg; + int i; + + /* warn the TX unit of coming changes */ + i40e_pre_tx_queue_cfg(&pf->hw, pf_q, enable); + if (!enable) + usleep_range(10, 20); + + for (i = 0; i < I40E_QTX_ENA_WAIT_COUNT; i++) { + tx_reg = rd32(hw, I40E_QTX_ENA(pf_q)); + if (((tx_reg >> I40E_QTX_ENA_QENA_REQ_SHIFT) & 1) == + ((tx_reg >> I40E_QTX_ENA_QENA_STAT_SHIFT) & 1)) + break; + usleep_range(1000, 2000); + } + + /* Skip if the queue is already in the requested state */ + if (enable == !!(tx_reg & I40E_QTX_ENA_QENA_STAT_MASK)) + return; + + /* turn on/off the queue */ + if (enable) { + wr32(hw, I40E_QTX_HEAD(pf_q), 0); + tx_reg |= I40E_QTX_ENA_QENA_REQ_MASK; + } else { + tx_reg &= ~I40E_QTX_ENA_QENA_REQ_MASK; + } + + wr32(hw, I40E_QTX_ENA(pf_q), tx_reg); +} + +/** + * i40e_control_wait_tx_q - Start/stop Tx queue and wait for completion + * @seid: VSI SEID + * @pf: the PF structure + * @pf_q: the PF queue to configure + * @is_xdp: true if the queue is used for XDP + * @enable: start or stop the queue + **/ +static int i40e_control_wait_tx_q(int seid, struct i40e_pf *pf, int pf_q, + bool is_xdp, bool enable) +{ + int ret; + + i40e_control_tx_q(pf, pf_q, enable); + + /* wait for the change to finish */ + ret = i40e_pf_txq_wait(pf, pf_q, enable); + if (ret) { + dev_info(&pf->pdev->dev, + "VSI seid %d %sTx ring %d %sable timeout\n", + seid, (is_xdp ? "XDP " : ""), pf_q, + (enable ? 
"en" : "dis")); + } + + return ret; +} + +/** + * i40e_vsi_control_tx - Start or stop a VSI's rings + * @vsi: the VSI being configured + * @enable: start or stop the rings + **/ +static int i40e_vsi_control_tx(struct i40e_vsi *vsi, bool enable) +{ + struct i40e_pf *pf = vsi->back; + int i, pf_q, ret = 0; + + pf_q = vsi->base_queue; + for (i = 0; i < vsi->num_queue_pairs; i++, pf_q++) { + ret = i40e_control_wait_tx_q(vsi->seid, pf, + pf_q, + false /*is xdp*/, enable); + if (ret) + break; + + if (!i40e_enabled_xdp_vsi(vsi)) + continue; + + ret = i40e_control_wait_tx_q(vsi->seid, pf, + pf_q + vsi->alloc_queue_pairs, + true /*is xdp*/, enable); + if (ret) + break; + } + + return ret; +} + +/** + * i40e_pf_rxq_wait - Wait for a PF's Rx queue to be enabled or disabled + * @pf: the PF being configured + * @pf_q: the PF queue + * @enable: enable or disable state of the queue + * + * This routine will wait for the given Rx queue of the PF to reach the + * enabled or disabled state. + * Returns -ETIMEDOUT in case of failing to reach the requested state after + * multiple retries; else will return 0 in case of success. + **/ +static int i40e_pf_rxq_wait(struct i40e_pf *pf, int pf_q, bool enable) +{ + int i; + u32 rx_reg; + + for (i = 0; i < I40E_QUEUE_WAIT_RETRY_LIMIT; i++) { + rx_reg = rd32(&pf->hw, I40E_QRX_ENA(pf_q)); + if (enable == !!(rx_reg & I40E_QRX_ENA_QENA_STAT_MASK)) + break; + + usleep_range(10, 20); + } + if (i >= I40E_QUEUE_WAIT_RETRY_LIMIT) + return -ETIMEDOUT; + + return 0; +} + +/** + * i40e_control_rx_q - Start or stop a particular Rx queue + * @pf: the PF structure + * @pf_q: the PF queue to configure + * @enable: start or stop the queue + * + * This function enables or disables a single queue. Note that any delay + * required after the operation is expected to be handled by the caller of + * this function. + **/ +static void i40e_control_rx_q(struct i40e_pf *pf, int pf_q, bool enable) +{ + struct i40e_hw *hw = &pf->hw; + u32 rx_reg; + int i; + + for (i = 0; i < I40E_QTX_ENA_WAIT_COUNT; i++) { + rx_reg = rd32(hw, I40E_QRX_ENA(pf_q)); + if (((rx_reg >> I40E_QRX_ENA_QENA_REQ_SHIFT) & 1) == + ((rx_reg >> I40E_QRX_ENA_QENA_STAT_SHIFT) & 1)) + break; + usleep_range(1000, 2000); + } + + /* Skip if the queue is already in the requested state */ + if (enable == !!(rx_reg & I40E_QRX_ENA_QENA_STAT_MASK)) + return; + + /* turn on/off the queue */ + if (enable) + rx_reg |= I40E_QRX_ENA_QENA_REQ_MASK; + else + rx_reg &= ~I40E_QRX_ENA_QENA_REQ_MASK; + + wr32(hw, I40E_QRX_ENA(pf_q), rx_reg); +} + +/** + * i40e_vsi_control_rx - Start or stop a VSI's rings + * @vsi: the VSI being configured + * @enable: start or stop the rings + **/ +static int i40e_vsi_control_rx(struct i40e_vsi *vsi, bool enable) +{ + struct i40e_pf *pf = vsi->back; + int i, pf_q, ret = 0; + + pf_q = vsi->base_queue; + for (i = 0; i < vsi->num_queue_pairs; i++, pf_q++) { + i40e_control_rx_q(pf, pf_q, enable); + + /* wait for the change to finish */ + ret = i40e_pf_rxq_wait(pf, pf_q, enable); + if (ret) { + dev_info(&pf->pdev->dev, + "VSI seid %d Rx ring %d %sable timeout\n", + vsi->seid, pf_q, (enable ? "en" : "dis")); + break; + } + } + + /* Due to HW errata, on Rx disable only, the register can indicate done + * before it really is. 
Needs 50ms to be sure + */ + if (!enable) + mdelay(50); + + return ret; +} + +/** + * i40e_vsi_start_rings - Start a VSI's rings + * @vsi: the VSI being configured + **/ +int i40e_vsi_start_rings(struct i40e_vsi *vsi) +{ + int ret = 0; + + /* do rx first for enable and last for disable */ + ret = i40e_vsi_control_rx(vsi, true); + if (ret) + return ret; + ret = i40e_vsi_control_tx(vsi, true); + + return ret; +} + +/** + * i40e_vsi_stop_rings - Stop a VSI's rings + * @vsi: the VSI being configured + **/ +void i40e_vsi_stop_rings(struct i40e_vsi *vsi) +{ + /* When port TX is suspended, don't wait */ + if (test_bit(__I40E_PORT_SUSPENDED, vsi->back->state)) + return i40e_vsi_stop_rings_no_wait(vsi); + + /* do rx first for enable and last for disable + * Ignore return value, we need to shutdown whatever we can + */ + i40e_vsi_control_tx(vsi, false); + i40e_vsi_control_rx(vsi, false); +} + +/** + * i40e_vsi_stop_rings_no_wait - Stop a VSI's rings and do not delay + * @vsi: the VSI being shutdown + * + * This function stops all the rings for a VSI but does not delay to verify + * that rings have been disabled. It is expected that the caller is shutting + * down multiple VSIs at once and will delay together for all the VSIs after + * initiating the shutdown. This is particularly useful for shutting down lots + * of VFs together. Otherwise, a large delay can be incurred while configuring + * each VSI in serial. + **/ +void i40e_vsi_stop_rings_no_wait(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + int i, pf_q; + + pf_q = vsi->base_queue; + for (i = 0; i < vsi->num_queue_pairs; i++, pf_q++) { + i40e_control_tx_q(pf, pf_q, false); + i40e_control_rx_q(pf, pf_q, false); + } +} + +/** + * i40e_vsi_free_irq - Free the irq association with the OS + * @vsi: the VSI being configured + **/ +static void i40e_vsi_free_irq(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + int base = vsi->base_vector; + u32 val, qp; + int i; + + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { + if (!vsi->q_vectors) + return; + + if (!vsi->irqs_ready) + return; + + vsi->irqs_ready = false; + for (i = 0; i < vsi->num_q_vectors; i++) { + int irq_num; + u16 vector; + + vector = i + base; + irq_num = pf->msix_entries[vector].vector; + + /* free only the irqs that were actually requested */ + if (!vsi->q_vectors[i] || + !vsi->q_vectors[i]->num_ringpairs) + continue; + + /* clear the affinity notifier in the IRQ descriptor */ + irq_set_affinity_notifier(irq_num, NULL); + /* remove our suggested affinity mask for this IRQ */ + irq_set_affinity_hint(irq_num, NULL); + synchronize_irq(irq_num); + free_irq(irq_num, vsi->q_vectors[i]); + + /* Tear down the interrupt queue link list + * + * We know that they come in pairs and always + * the Rx first, then the Tx. To clear the + * link list, stick the EOL value into the + * next_q field of the registers. 
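+ *
+ * The list head (PFINT_LNKLSTN) is terminated first, then the chain is
+ * walked via the NEXTQ_INDX field of each QINT_TQCTL entry, clearing the
+ * cause enable and MSI-X routing for every queue along the way.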
+ */ + val = rd32(hw, I40E_PFINT_LNKLSTN(vector - 1)); + qp = (val & I40E_PFINT_LNKLSTN_FIRSTQ_INDX_MASK) + >> I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT; + val |= I40E_QUEUE_END_OF_LIST + << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT; + wr32(hw, I40E_PFINT_LNKLSTN(vector - 1), val); + + while (qp != I40E_QUEUE_END_OF_LIST) { + u32 next; + + val = rd32(hw, I40E_QINT_RQCTL(qp)); + + val &= ~(I40E_QINT_RQCTL_MSIX_INDX_MASK | + I40E_QINT_RQCTL_MSIX0_INDX_MASK | + I40E_QINT_RQCTL_CAUSE_ENA_MASK | + I40E_QINT_RQCTL_INTEVENT_MASK); + + val |= (I40E_QINT_RQCTL_ITR_INDX_MASK | + I40E_QINT_RQCTL_NEXTQ_INDX_MASK); + + wr32(hw, I40E_QINT_RQCTL(qp), val); + + val = rd32(hw, I40E_QINT_TQCTL(qp)); + + next = (val & I40E_QINT_TQCTL_NEXTQ_INDX_MASK) + >> I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT; + + val &= ~(I40E_QINT_TQCTL_MSIX_INDX_MASK | + I40E_QINT_TQCTL_MSIX0_INDX_MASK | + I40E_QINT_TQCTL_CAUSE_ENA_MASK | + I40E_QINT_TQCTL_INTEVENT_MASK); + + val |= (I40E_QINT_TQCTL_ITR_INDX_MASK | + I40E_QINT_TQCTL_NEXTQ_INDX_MASK); + + wr32(hw, I40E_QINT_TQCTL(qp), val); + qp = next; + } + } + } else { + free_irq(pf->pdev->irq, pf); + + val = rd32(hw, I40E_PFINT_LNKLST0); + qp = (val & I40E_PFINT_LNKLSTN_FIRSTQ_INDX_MASK) + >> I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT; + val |= I40E_QUEUE_END_OF_LIST + << I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT; + wr32(hw, I40E_PFINT_LNKLST0, val); + + val = rd32(hw, I40E_QINT_RQCTL(qp)); + val &= ~(I40E_QINT_RQCTL_MSIX_INDX_MASK | + I40E_QINT_RQCTL_MSIX0_INDX_MASK | + I40E_QINT_RQCTL_CAUSE_ENA_MASK | + I40E_QINT_RQCTL_INTEVENT_MASK); + + val |= (I40E_QINT_RQCTL_ITR_INDX_MASK | + I40E_QINT_RQCTL_NEXTQ_INDX_MASK); + + wr32(hw, I40E_QINT_RQCTL(qp), val); + + val = rd32(hw, I40E_QINT_TQCTL(qp)); + + val &= ~(I40E_QINT_TQCTL_MSIX_INDX_MASK | + I40E_QINT_TQCTL_MSIX0_INDX_MASK | + I40E_QINT_TQCTL_CAUSE_ENA_MASK | + I40E_QINT_TQCTL_INTEVENT_MASK); + + val |= (I40E_QINT_TQCTL_ITR_INDX_MASK | + I40E_QINT_TQCTL_NEXTQ_INDX_MASK); + + wr32(hw, I40E_QINT_TQCTL(qp), val); + } +} + +/** + * i40e_free_q_vector - Free memory allocated for specific interrupt vector + * @vsi: the VSI being configured + * @v_idx: Index of vector to be freed + * + * This function frees the memory allocated to the q_vector. In addition if + * NAPI is enabled it will delete any references to the NAPI struct prior + * to freeing the q_vector. + **/ +static void i40e_free_q_vector(struct i40e_vsi *vsi, int v_idx) +{ + struct i40e_q_vector *q_vector = vsi->q_vectors[v_idx]; + struct i40e_ring *ring; + + if (!q_vector) + return; + + /* disassociate q_vector from rings */ + i40e_for_each_ring(ring, q_vector->tx) + ring->q_vector = NULL; + + i40e_for_each_ring(ring, q_vector->rx) + ring->q_vector = NULL; + + /* only VSI w/ an associated netdev is set up w/ NAPI */ + if (vsi->netdev) + netif_napi_del(&q_vector->napi); + + vsi->q_vectors[v_idx] = NULL; + + kfree_rcu(q_vector, rcu); +} + +/** + * i40e_vsi_free_q_vectors - Free memory allocated for interrupt vectors + * @vsi: the VSI being un-configured + * + * This frees the memory allocated to the q_vectors and + * deletes references to the NAPI struct. 
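+ *
+ * Each q_vector is released with kfree_rcu() by i40e_free_q_vector() so
+ * that a reader still inside an RCU section does not see freed memory.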
+ **/ +static void i40e_vsi_free_q_vectors(struct i40e_vsi *vsi) +{ + int v_idx; + + for (v_idx = 0; v_idx < vsi->num_q_vectors; v_idx++) + i40e_free_q_vector(vsi, v_idx); +} + +/** + * i40e_reset_interrupt_capability - Disable interrupt setup in OS + * @pf: board private structure + **/ +static void i40e_reset_interrupt_capability(struct i40e_pf *pf) +{ + /* If we're in Legacy mode, the interrupt was cleaned in vsi_close */ + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { + pci_disable_msix(pf->pdev); + kfree(pf->msix_entries); + pf->msix_entries = NULL; + kfree(pf->irq_pile); + pf->irq_pile = NULL; + } else if (pf->flags & I40E_FLAG_MSI_ENABLED) { + pci_disable_msi(pf->pdev); + } + pf->flags &= ~(I40E_FLAG_MSIX_ENABLED | I40E_FLAG_MSI_ENABLED); +} + +/** + * i40e_clear_interrupt_scheme - Clear the current interrupt scheme settings + * @pf: board private structure + * + * We go through and clear interrupt specific resources and reset the structure + * to pre-load conditions + **/ +static void i40e_clear_interrupt_scheme(struct i40e_pf *pf) +{ + int i; + + i40e_free_misc_vector(pf); + + i40e_put_lump(pf->irq_pile, pf->iwarp_base_vector, + I40E_IWARP_IRQ_PILE_ID); + + i40e_put_lump(pf->irq_pile, 0, I40E_PILE_VALID_BIT-1); + for (i = 0; i < pf->num_alloc_vsi; i++) + if (pf->vsi[i]) + i40e_vsi_free_q_vectors(pf->vsi[i]); + i40e_reset_interrupt_capability(pf); +} + +/** + * i40e_napi_enable_all - Enable NAPI for all q_vectors in the VSI + * @vsi: the VSI being configured + **/ +static void i40e_napi_enable_all(struct i40e_vsi *vsi) +{ + int q_idx; + + if (!vsi->netdev) + return; + + for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++) { + struct i40e_q_vector *q_vector = vsi->q_vectors[q_idx]; + + if (q_vector->rx.ring || q_vector->tx.ring) + napi_enable(&q_vector->napi); + } +} + +/** + * i40e_napi_disable_all - Disable NAPI for all q_vectors in the VSI + * @vsi: the VSI being configured + **/ +static void i40e_napi_disable_all(struct i40e_vsi *vsi) +{ + int q_idx; + + if (!vsi->netdev) + return; + + for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++) { + struct i40e_q_vector *q_vector = vsi->q_vectors[q_idx]; + + if (q_vector->rx.ring || q_vector->tx.ring) + napi_disable(&q_vector->napi); + } +} + +/** + * i40e_vsi_close - Shut down a VSI + * @vsi: the vsi to be quelled + **/ +static void i40e_vsi_close(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + if (!test_and_set_bit(__I40E_VSI_DOWN, vsi->state)) + i40e_down(vsi); + i40e_vsi_free_irq(vsi); + i40e_vsi_free_tx_resources(vsi); + i40e_vsi_free_rx_resources(vsi); + vsi->current_netdev_flags = 0; + set_bit(__I40E_CLIENT_SERVICE_REQUESTED, pf->state); + if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) + set_bit(__I40E_CLIENT_RESET, pf->state); +} + +/** + * i40e_quiesce_vsi - Pause a given VSI + * @vsi: the VSI being paused + **/ +static void i40e_quiesce_vsi(struct i40e_vsi *vsi) +{ + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return; + + set_bit(__I40E_VSI_NEEDS_RESTART, vsi->state); + if (vsi->netdev && netif_running(vsi->netdev)) + vsi->netdev->netdev_ops->ndo_stop(vsi->netdev); + else + i40e_vsi_close(vsi); +} + +/** + * i40e_unquiesce_vsi - Resume a given VSI + * @vsi: the VSI being resumed + **/ +static void i40e_unquiesce_vsi(struct i40e_vsi *vsi) +{ + if (!test_and_clear_bit(__I40E_VSI_NEEDS_RESTART, vsi->state)) + return; + + if (vsi->netdev && netif_running(vsi->netdev)) + vsi->netdev->netdev_ops->ndo_open(vsi->netdev); + else + i40e_vsi_open(vsi); /* this clears the DOWN bit */ +} + +/** + * 
i40e_pf_quiesce_all_vsi - Pause all VSIs on a PF + * @pf: the PF + **/ +static void i40e_pf_quiesce_all_vsi(struct i40e_pf *pf) +{ + int v; + + for (v = 0; v < pf->num_alloc_vsi; v++) { + if (pf->vsi[v]) + i40e_quiesce_vsi(pf->vsi[v]); + } +} + +/** + * i40e_pf_unquiesce_all_vsi - Resume all VSIs on a PF + * @pf: the PF + **/ +static void i40e_pf_unquiesce_all_vsi(struct i40e_pf *pf) +{ + int v; + + for (v = 0; v < pf->num_alloc_vsi; v++) { + if (pf->vsi[v]) + i40e_unquiesce_vsi(pf->vsi[v]); + } +} + +/** + * i40e_vsi_wait_queues_disabled - Wait for VSI's queues to be disabled + * @vsi: the VSI being configured + * + * Wait until all queues on a given VSI have been disabled. + **/ +int i40e_vsi_wait_queues_disabled(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + int i, pf_q, ret; + + pf_q = vsi->base_queue; + for (i = 0; i < vsi->num_queue_pairs; i++, pf_q++) { + /* Check and wait for the Tx queue */ + ret = i40e_pf_txq_wait(pf, pf_q, false); + if (ret) { + dev_info(&pf->pdev->dev, + "VSI seid %d Tx ring %d disable timeout\n", + vsi->seid, pf_q); + return ret; + } + + if (!i40e_enabled_xdp_vsi(vsi)) + goto wait_rx; + + /* Check and wait for the XDP Tx queue */ + ret = i40e_pf_txq_wait(pf, pf_q + vsi->alloc_queue_pairs, + false); + if (ret) { + dev_info(&pf->pdev->dev, + "VSI seid %d XDP Tx ring %d disable timeout\n", + vsi->seid, pf_q); + return ret; + } +wait_rx: + /* Check and wait for the Rx queue */ + ret = i40e_pf_rxq_wait(pf, pf_q, false); + if (ret) { + dev_info(&pf->pdev->dev, + "VSI seid %d Rx ring %d disable timeout\n", + vsi->seid, pf_q); + return ret; + } + } + + return 0; +} + +#ifdef CONFIG_I40E_DCB +/** + * i40e_pf_wait_queues_disabled - Wait for all queues of PF VSIs to be disabled + * @pf: the PF + * + * This function waits for the queues to be in disabled state for all the + * VSIs that are managed by this PF. + **/ +static int i40e_pf_wait_queues_disabled(struct i40e_pf *pf) +{ + int v, ret = 0; + + for (v = 0; v < pf->hw.func_caps.num_vsis; v++) { + if (pf->vsi[v]) { + ret = i40e_vsi_wait_queues_disabled(pf->vsi[v]); + if (ret) + break; + } + } + + return ret; +} + +#endif + +/** + * i40e_get_iscsi_tc_map - Return TC map for iSCSI APP + * @pf: pointer to PF + * + * Get TC map for ISCSI PF type that will include iSCSI TC + * and LAN TC. 
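+ *
+ * TC0 is always included; the iSCSI TC is found by matching
+ * I40E_APP_SEL_TCPIP / I40E_APP_PROTOID_ISCSI in the local DCBX APP
+ * table and mapping that entry's priority through the ETS priority table.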
+ **/ +static u8 i40e_get_iscsi_tc_map(struct i40e_pf *pf) +{ + struct i40e_dcb_app_priority_table app; + struct i40e_hw *hw = &pf->hw; + u8 enabled_tc = 1; /* TC0 is always enabled */ + u8 tc, i; + /* Get the iSCSI APP TLV */ + struct i40e_dcbx_config *dcbcfg = &hw->local_dcbx_config; + + for (i = 0; i < dcbcfg->numapps; i++) { + app = dcbcfg->app[i]; + if (app.selector == I40E_APP_SEL_TCPIP && + app.protocolid == I40E_APP_PROTOID_ISCSI) { + tc = dcbcfg->etscfg.prioritytable[app.priority]; + enabled_tc |= BIT(tc); + break; + } + } + + return enabled_tc; +} + +/** + * i40e_dcb_get_num_tc - Get the number of TCs from DCBx config + * @dcbcfg: the corresponding DCBx configuration structure + * + * Return the number of TCs from given DCBx configuration + **/ +static u8 i40e_dcb_get_num_tc(struct i40e_dcbx_config *dcbcfg) +{ + int i, tc_unused = 0; + u8 num_tc = 0; + u8 ret = 0; + + /* Scan the ETS Config Priority Table to find + * traffic class enabled for a given priority + * and create a bitmask of enabled TCs + */ + for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) + num_tc |= BIT(dcbcfg->etscfg.prioritytable[i]); + + /* Now scan the bitmask to check for + * contiguous TCs starting with TC0 + */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + if (num_tc & BIT(i)) { + if (!tc_unused) { + ret++; + } else { + pr_err("Non-contiguous TC - Disabling DCB\n"); + return 1; + } + } else { + tc_unused = 1; + } + } + + /* There is always at least TC0 */ + if (!ret) + ret = 1; + + return ret; +} + +/** + * i40e_dcb_get_enabled_tc - Get enabled traffic classes + * @dcbcfg: the corresponding DCBx configuration structure + * + * Query the current DCB configuration and return the number of + * traffic classes enabled from the given DCBX config + **/ +static u8 i40e_dcb_get_enabled_tc(struct i40e_dcbx_config *dcbcfg) +{ + u8 num_tc = i40e_dcb_get_num_tc(dcbcfg); + u8 enabled_tc = 1; + u8 i; + + for (i = 0; i < num_tc; i++) + enabled_tc |= BIT(i); + + return enabled_tc; +} + +/** + * i40e_mqprio_get_enabled_tc - Get enabled traffic classes + * @pf: PF being queried + * + * Query the current MQPRIO configuration and return the number of + * traffic classes enabled. 
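+ *
+ * Note that the returned value is a bitmap of enabled TCs with TC0
+ * always set, not a count; mqprio TCs are contiguous, so bits
+ * 0..num_tc-1 end up set.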
+ **/ +static u8 i40e_mqprio_get_enabled_tc(struct i40e_pf *pf) +{ + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + u8 num_tc = vsi->mqprio_qopt.qopt.num_tc; + u8 enabled_tc = 1, i; + + for (i = 1; i < num_tc; i++) + enabled_tc |= BIT(i); + return enabled_tc; +} + +/** + * i40e_pf_get_num_tc - Get enabled traffic classes for PF + * @pf: PF being queried + * + * Return number of traffic classes enabled for the given PF + **/ +static u8 i40e_pf_get_num_tc(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + u8 i, enabled_tc = 1; + u8 num_tc = 0; + struct i40e_dcbx_config *dcbcfg = &hw->local_dcbx_config; + + if (pf->flags & I40E_FLAG_TC_MQPRIO) + return pf->vsi[pf->lan_vsi]->mqprio_qopt.qopt.num_tc; + + /* If neither MQPRIO nor DCB is enabled, then always use single TC */ + if (!(pf->flags & I40E_FLAG_DCB_ENABLED)) + return 1; + + /* SFP mode will be enabled for all TCs on port */ + if (!(pf->flags & I40E_FLAG_MFP_ENABLED)) + return i40e_dcb_get_num_tc(dcbcfg); + + /* MFP mode return count of enabled TCs for this PF */ + if (pf->hw.func_caps.iscsi) + enabled_tc = i40e_get_iscsi_tc_map(pf); + else + return 1; /* Only TC0 */ + + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + if (enabled_tc & BIT(i)) + num_tc++; + } + return num_tc; +} + +/** + * i40e_pf_get_pf_tc_map - Get bitmap for enabled traffic classes + * @pf: PF being queried + * + * Return a bitmap for enabled traffic classes for this PF. + **/ +static u8 i40e_pf_get_tc_map(struct i40e_pf *pf) +{ + if (pf->flags & I40E_FLAG_TC_MQPRIO) + return i40e_mqprio_get_enabled_tc(pf); + + /* If neither MQPRIO nor DCB is enabled for this PF then just return + * default TC + */ + if (!(pf->flags & I40E_FLAG_DCB_ENABLED)) + return I40E_DEFAULT_TRAFFIC_CLASS; + + /* SFP mode we want PF to be enabled for all TCs */ + if (!(pf->flags & I40E_FLAG_MFP_ENABLED)) + return i40e_dcb_get_enabled_tc(&pf->hw.local_dcbx_config); + + /* MFP enabled and iSCSI PF type */ + if (pf->hw.func_caps.iscsi) + return i40e_get_iscsi_tc_map(pf); + else + return I40E_DEFAULT_TRAFFIC_CLASS; +} + +/** + * i40e_vsi_get_bw_info - Query VSI BW Information + * @vsi: the VSI being queried + * + * Returns 0 on success, negative value on failure + **/ +static int i40e_vsi_get_bw_info(struct i40e_vsi *vsi) +{ + struct i40e_aqc_query_vsi_ets_sla_config_resp bw_ets_config = {0}; + struct i40e_aqc_query_vsi_bw_config_resp bw_config = {0}; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + i40e_status ret; + u32 tc_bw_max; + int i; + + /* Get the VSI level BW configuration */ + ret = i40e_aq_query_vsi_bw_config(hw, vsi->seid, &bw_config, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "couldn't get PF vsi bw config, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + return -EINVAL; + } + + /* Get the VSI level BW configuration per TC */ + ret = i40e_aq_query_vsi_ets_sla_config(hw, vsi->seid, &bw_ets_config, + NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "couldn't get PF vsi ets bw config, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + return -EINVAL; + } + + if (bw_config.tc_valid_bits != bw_ets_config.tc_valid_bits) { + dev_info(&pf->pdev->dev, + "Enabled TCs mismatch from querying VSI BW info 0x%08x 0x%08x\n", + bw_config.tc_valid_bits, + bw_ets_config.tc_valid_bits); + /* Still continuing */ + } + + vsi->bw_limit = le16_to_cpu(bw_config.port_bw_limit); + vsi->bw_max_quanta = bw_config.max_bw; + tc_bw_max = le16_to_cpu(bw_ets_config.tc_bw_max[0]) 
| + (le16_to_cpu(bw_ets_config.tc_bw_max[1]) << 16); + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + vsi->bw_ets_share_credits[i] = bw_ets_config.share_credits[i]; + vsi->bw_ets_limit_credits[i] = + le16_to_cpu(bw_ets_config.credits[i]); + /* 3 bits out of 4 for each TC */ + vsi->bw_ets_max_quanta[i] = (u8)((tc_bw_max >> (i*4)) & 0x7); + } + + return 0; +} + +/** + * i40e_vsi_configure_bw_alloc - Configure VSI BW allocation per TC + * @vsi: the VSI being configured + * @enabled_tc: TC bitmap + * @bw_credits: BW shared credits per TC + * + * Returns 0 on success, negative value on failure + **/ +static int i40e_vsi_configure_bw_alloc(struct i40e_vsi *vsi, u8 enabled_tc, + u8 *bw_share) +{ + struct i40e_aqc_configure_vsi_tc_bw_data bw_data; + i40e_status ret; + int i; + + if (vsi->back->flags & I40E_FLAG_TC_MQPRIO) + return 0; + if (!vsi->mqprio_qopt.qopt.hw) { + ret = i40e_set_bw_limit(vsi, vsi->seid, 0); + if (ret) + dev_info(&vsi->back->pdev->dev, + "Failed to reset tx rate for vsi->seid %u\n", + vsi->seid); + return ret; + } + bw_data.tc_valid_bits = enabled_tc; + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) + bw_data.tc_bw_credits[i] = bw_share[i]; + + ret = i40e_aq_config_vsi_tc_bw(&vsi->back->hw, vsi->seid, &bw_data, + NULL); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "AQ command Config VSI BW allocation per TC failed = %d\n", + vsi->back->hw.aq.asq_last_status); + return -EINVAL; + } + + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) + vsi->info.qs_handle[i] = bw_data.qs_handles[i]; + + return 0; +} + +/** + * i40e_vsi_config_netdev_tc - Setup the netdev TC configuration + * @vsi: the VSI being configured + * @enabled_tc: TC map to be enabled + * + **/ +static void i40e_vsi_config_netdev_tc(struct i40e_vsi *vsi, u8 enabled_tc) +{ + struct net_device *netdev = vsi->netdev; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + u8 netdev_tc = 0; + int i; + struct i40e_dcbx_config *dcbcfg = &hw->local_dcbx_config; + + if (!netdev) + return; + + if (!enabled_tc) { + netdev_reset_tc(netdev); + return; + } + + /* Set up actual enabled TCs on the VSI */ + if (netdev_set_num_tc(netdev, vsi->tc_config.numtc)) + return; + + /* set per TC queues for the VSI */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + /* Only set TC queues for enabled tcs + * + * e.g. For a VSI that has TC0 and TC3 enabled the + * enabled_tc bitmap would be 0x00001001; the driver + * will set the numtc for netdev as 2 that will be + * referenced by the netdev layer as TC 0 and 1. 
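/*
 * Illustrative sketch (not part of the patch): the comment above describes
 * how a sparse enabled_tc bitmap -- e.g. TC0 and TC3, shown as 0x00001001 in
 * the comment, i.e. binary 1001 or hex 0x09 -- is exposed to the netdev
 * layer as a contiguous set of TCs.  A minimal user-space model of that
 * renumbering; the loop bound of 8 stands in for I40E_MAX_TRAFFIC_CLASS and
 * the bitmap value is made up for the demo.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t enabled_tc = 0x09;	/* HW TC0 and TC3 enabled */
	int netdev_tc = 0;

	for (int i = 0; i < 8; i++) {
		if (!(enabled_tc & (1u << i)))
			continue;
		/* each enabled HW TC gets the next contiguous netdev TC index */
		printf("HW TC%d -> netdev TC%d\n", i, netdev_tc++);
	}
	printf("netdev num_tc = %d\n", netdev_tc);	/* prints 2 */
	return 0;
}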
+ */ + if (vsi->tc_config.enabled_tc & BIT(i)) + netdev_set_tc_queue(netdev, + vsi->tc_config.tc_info[i].netdev_tc, + vsi->tc_config.tc_info[i].qcount, + vsi->tc_config.tc_info[i].qoffset); + } + + if (pf->flags & I40E_FLAG_TC_MQPRIO) + return; + + /* Assign UP2TC map for the VSI */ + for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) { + /* Get the actual TC# for the UP */ + u8 ets_tc = dcbcfg->etscfg.prioritytable[i]; + /* Get the mapped netdev TC# for the UP */ + netdev_tc = vsi->tc_config.tc_info[ets_tc].netdev_tc; + netdev_set_prio_tc_map(netdev, i, netdev_tc); + } +} + +/** + * i40e_vsi_update_queue_map - Update our copy of VSi info with new queue map + * @vsi: the VSI being configured + * @ctxt: the ctxt buffer returned from AQ VSI update param command + **/ +static void i40e_vsi_update_queue_map(struct i40e_vsi *vsi, + struct i40e_vsi_context *ctxt) +{ + /* copy just the sections touched not the entire info + * since not all sections are valid as returned by + * update vsi params + */ + vsi->info.mapping_flags = ctxt->info.mapping_flags; + memcpy(&vsi->info.queue_mapping, + &ctxt->info.queue_mapping, sizeof(vsi->info.queue_mapping)); + memcpy(&vsi->info.tc_mapping, ctxt->info.tc_mapping, + sizeof(vsi->info.tc_mapping)); +} + +/** + * i40e_vsi_config_tc - Configure VSI Tx Scheduler for given TC map + * @vsi: VSI to be configured + * @enabled_tc: TC bitmap + * + * This configures a particular VSI for TCs that are mapped to the + * given TC bitmap. It uses default bandwidth share for TCs across + * VSIs to configure TC for a particular VSI. + * + * NOTE: + * It is expected that the VSI queues have been quisced before calling + * this function. + **/ +static int i40e_vsi_config_tc(struct i40e_vsi *vsi, u8 enabled_tc) +{ + u8 bw_share[I40E_MAX_TRAFFIC_CLASS] = {0}; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + struct i40e_vsi_context ctxt; + int ret = 0; + int i; + + /* Check if enabled_tc is same as existing or new TCs */ + if (vsi->tc_config.enabled_tc == enabled_tc && + vsi->mqprio_qopt.mode != TC_MQPRIO_MODE_CHANNEL) + return ret; + + /* Enable ETS TCs with equal BW Share for now across all VSIs */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + if (enabled_tc & BIT(i)) + bw_share[i] = 1; + } + + ret = i40e_vsi_configure_bw_alloc(vsi, enabled_tc, bw_share); + if (ret) { + struct i40e_aqc_query_vsi_bw_config_resp bw_config = {0}; + + dev_info(&pf->pdev->dev, + "Failed configuring TC map %d for VSI %d\n", + enabled_tc, vsi->seid); + ret = i40e_aq_query_vsi_bw_config(hw, vsi->seid, + &bw_config, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "Failed querying vsi bw info, err %s aq_err %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + goto out; + } + if ((bw_config.tc_valid_bits & enabled_tc) != enabled_tc) { + u8 valid_tc = bw_config.tc_valid_bits & enabled_tc; + + if (!valid_tc) + valid_tc = bw_config.tc_valid_bits; + /* Always enable TC0, no matter what */ + valid_tc |= 1; + dev_info(&pf->pdev->dev, + "Requested tc 0x%x, but FW reports 0x%x as valid. 
Attempting to use 0x%x.\n", + enabled_tc, bw_config.tc_valid_bits, valid_tc); + enabled_tc = valid_tc; + } + + ret = i40e_vsi_configure_bw_alloc(vsi, enabled_tc, bw_share); + if (ret) { + dev_err(&pf->pdev->dev, + "Unable to configure TC map %d for VSI %d\n", + enabled_tc, vsi->seid); + goto out; + } + } + + /* Update Queue Pairs Mapping for currently enabled UPs */ + ctxt.seid = vsi->seid; + ctxt.pf_num = vsi->back->hw.pf_id; + ctxt.vf_num = 0; + ctxt.uplink_seid = vsi->uplink_seid; + ctxt.info = vsi->info; + if (vsi->back->flags & I40E_FLAG_TC_MQPRIO) { + ret = i40e_vsi_setup_queue_map_mqprio(vsi, &ctxt, enabled_tc); + if (ret) + goto out; + } else { + i40e_vsi_setup_queue_map(vsi, &ctxt, enabled_tc, false); + } + + /* On destroying the qdisc, reset vsi->rss_size, as number of enabled + * queues changed. + */ + if (!vsi->mqprio_qopt.qopt.hw && vsi->reconfig_rss) { + vsi->rss_size = min_t(int, vsi->back->alloc_rss_size, + vsi->num_queue_pairs); + ret = i40e_vsi_config_rss(vsi); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "Failed to reconfig rss for num_queues\n"); + return ret; + } + vsi->reconfig_rss = false; + } + if (vsi->back->flags & I40E_FLAG_IWARP_ENABLED) { + ctxt.info.valid_sections |= + cpu_to_le16(I40E_AQ_VSI_PROP_QUEUE_OPT_VALID); + ctxt.info.queueing_opt_flags |= I40E_AQ_VSI_QUE_OPT_TCP_ENA; + } + + /* Update the VSI after updating the VSI queue-mapping + * information + */ + ret = i40e_aq_update_vsi_params(hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "Update vsi tc config failed, err %s aq_err %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + goto out; + } + /* update the local VSI info with updated queue map */ + i40e_vsi_update_queue_map(vsi, &ctxt); + vsi->info.valid_sections = 0; + + /* Update current VSI BW information */ + ret = i40e_vsi_get_bw_info(vsi); + if (ret) { + dev_info(&pf->pdev->dev, + "Failed updating vsi bw info, err %s aq_err %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + goto out; + } + + /* Update the netdev TC setup */ + i40e_vsi_config_netdev_tc(vsi, enabled_tc); +out: + return ret; +} + +/** + * i40e_get_link_speed - Returns link speed for the interface + * @vsi: VSI to be configured + * + **/ +static int i40e_get_link_speed(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + + switch (pf->hw.phy.link_info.link_speed) { + case I40E_LINK_SPEED_40GB: + return 40000; + case I40E_LINK_SPEED_25GB: + return 25000; + case I40E_LINK_SPEED_20GB: + return 20000; + case I40E_LINK_SPEED_10GB: + return 10000; + case I40E_LINK_SPEED_1GB: + return 1000; + default: + return -EINVAL; + } +} + +/** + * i40e_set_bw_limit - setup BW limit for Tx traffic based on max_tx_rate + * @vsi: VSI to be configured + * @seid: seid of the channel/VSI + * @max_tx_rate: max TX rate to be configured as BW limit + * + * Helper function to set BW limit for a given VSI + **/ +int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate) +{ + struct i40e_pf *pf = vsi->back; + u64 credits = 0; + int speed = 0; + int ret = 0; + + speed = i40e_get_link_speed(vsi); + if (max_tx_rate > speed) { + dev_err(&pf->pdev->dev, + "Invalid max tx rate %llu specified for VSI seid %d.", + max_tx_rate, seid); + return -EINVAL; + } + if (max_tx_rate && max_tx_rate < 50) { + dev_warn(&pf->pdev->dev, + "Setting max tx rate to minimum usable value of 50Mbps.\n"); + max_tx_rate = 50; + } + + /* Tx rate credits are in values of 50Mbps, 0 is disabled */ + credits = max_tx_rate; + do_div(credits, 
I40E_BW_CREDIT_DIVISOR); + ret = i40e_aq_config_vsi_bw_limit(&pf->hw, seid, credits, + I40E_MAX_BW_INACTIVE_ACCUM, NULL); + if (ret) + dev_err(&pf->pdev->dev, + "Failed set tx rate (%llu Mbps) for vsi->seid %u, err %s aq_err %s\n", + max_tx_rate, seid, i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + return ret; +} + +/** + * i40e_remove_queue_channels - Remove queue channels for the TCs + * @vsi: VSI to be configured + * + * Remove queue channels for the TCs + **/ +static void i40e_remove_queue_channels(struct i40e_vsi *vsi) +{ + enum i40e_admin_queue_err last_aq_status; + struct i40e_cloud_filter *cfilter; + struct i40e_channel *ch, *ch_tmp; + struct i40e_pf *pf = vsi->back; + struct hlist_node *node; + int ret, i; + + /* Reset rss size that was stored when reconfiguring rss for + * channel VSIs with non-power-of-2 queue count. + */ + vsi->current_rss_size = 0; + + /* perform cleanup for channels if they exist */ + if (list_empty(&vsi->ch_list)) + return; + + list_for_each_entry_safe(ch, ch_tmp, &vsi->ch_list, list) { + struct i40e_vsi *p_vsi; + + list_del(&ch->list); + p_vsi = ch->parent_vsi; + if (!p_vsi || !ch->initialized) { + kfree(ch); + continue; + } + /* Reset queue contexts */ + for (i = 0; i < ch->num_queue_pairs; i++) { + struct i40e_ring *tx_ring, *rx_ring; + u16 pf_q; + + pf_q = ch->base_queue + i; + tx_ring = vsi->tx_rings[pf_q]; + tx_ring->ch = NULL; + + rx_ring = vsi->rx_rings[pf_q]; + rx_ring->ch = NULL; + } + + /* Reset BW configured for this VSI via mqprio */ + ret = i40e_set_bw_limit(vsi, ch->seid, 0); + if (ret) + dev_info(&vsi->back->pdev->dev, + "Failed to reset tx rate for ch->seid %u\n", + ch->seid); + + /* delete cloud filters associated with this channel */ + hlist_for_each_entry_safe(cfilter, node, + &pf->cloud_filter_list, cloud_node) { + if (cfilter->seid != ch->seid) + continue; + + hash_del(&cfilter->cloud_node); + if (cfilter->dst_port) + ret = i40e_add_del_cloud_filter_big_buf(vsi, + cfilter, + false); + else + ret = i40e_add_del_cloud_filter(vsi, cfilter, + false); + last_aq_status = pf->hw.aq.asq_last_status; + if (ret) + dev_info(&pf->pdev->dev, + "Failed to delete cloud filter, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, last_aq_status)); + kfree(cfilter); + } + + /* delete VSI from FW */ + ret = i40e_aq_delete_element(&vsi->back->hw, ch->seid, + NULL); + if (ret) + dev_err(&vsi->back->pdev->dev, + "unable to remove channel (%d) for parent VSI(%d)\n", + ch->seid, p_vsi->seid); + kfree(ch); + } + INIT_LIST_HEAD(&vsi->ch_list); +} + +/** + * i40e_is_any_channel - channel exist or not + * @vsi: ptr to VSI to which channels are associated with + * + * Returns true or false if channel(s) exist for associated VSI or not + **/ +static bool i40e_is_any_channel(struct i40e_vsi *vsi) +{ + struct i40e_channel *ch, *ch_tmp; + + list_for_each_entry_safe(ch, ch_tmp, &vsi->ch_list, list) { + if (ch->initialized) + return true; + } + + return false; +} + +/** + * i40e_get_max_queues_for_channel + * @vsi: ptr to VSI to which channels are associated with + * + * Helper function which returns max value among the queue counts set on the + * channels/TCs created. 
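/*
 * Illustrative sketch (not part of the patch): i40e_set_bw_limit() above
 * turns a Mbps cap into scheduler credits and enforces a 50 Mbps floor.
 * The 50 Mbps credit size is taken from the driver's own comments
 * ("Tx rate credits are in values of 50Mbps"); the macro value is an
 * assumption here, the rest is plain integer arithmetic.
 */
#include <stdint.h>
#include <stdio.h>

#define BW_CREDIT_MBPS 50ULL	/* assumed value behind I40E_BW_CREDIT_DIVISOR */

static uint64_t tx_rate_to_credits(uint64_t max_tx_rate_mbps)
{
	if (max_tx_rate_mbps && max_tx_rate_mbps < BW_CREDIT_MBPS)
		max_tx_rate_mbps = BW_CREDIT_MBPS;	/* minimum usable rate */
	return max_tx_rate_mbps / BW_CREDIT_MBPS;	/* 0 leaves the limit off */
}

int main(void)
{
	/* e.g. a 425 Mbps cap becomes 8 credits of 50 Mbps each */
	printf("%llu\n", (unsigned long long)tx_rate_to_credits(425));
	return 0;
}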
+ **/ +static int i40e_get_max_queues_for_channel(struct i40e_vsi *vsi) +{ + struct i40e_channel *ch, *ch_tmp; + int max = 0; + + list_for_each_entry_safe(ch, ch_tmp, &vsi->ch_list, list) { + if (!ch->initialized) + continue; + if (ch->num_queue_pairs > max) + max = ch->num_queue_pairs; + } + + return max; +} + +/** + * i40e_validate_num_queues - validate num_queues w.r.t channel + * @pf: ptr to PF device + * @num_queues: number of queues + * @vsi: the parent VSI + * @reconfig_rss: indicates should the RSS be reconfigured or not + * + * This function validates number of queues in the context of new channel + * which is being established and determines if RSS should be reconfigured + * or not for parent VSI. + **/ +static int i40e_validate_num_queues(struct i40e_pf *pf, int num_queues, + struct i40e_vsi *vsi, bool *reconfig_rss) +{ + int max_ch_queues; + + if (!reconfig_rss) + return -EINVAL; + + *reconfig_rss = false; + if (vsi->current_rss_size) { + if (num_queues > vsi->current_rss_size) { + dev_dbg(&pf->pdev->dev, + "Error: num_queues (%d) > vsi's current_size(%d)\n", + num_queues, vsi->current_rss_size); + return -EINVAL; + } else if ((num_queues < vsi->current_rss_size) && + (!is_power_of_2(num_queues))) { + dev_dbg(&pf->pdev->dev, + "Error: num_queues (%d) < vsi's current_size(%d), but not power of 2\n", + num_queues, vsi->current_rss_size); + return -EINVAL; + } + } + + if (!is_power_of_2(num_queues)) { + /* Find the max num_queues configured for channel if channel + * exist. + * if channel exist, then enforce 'num_queues' to be more than + * max ever queues configured for channel. + */ + max_ch_queues = i40e_get_max_queues_for_channel(vsi); + if (num_queues < max_ch_queues) { + dev_dbg(&pf->pdev->dev, + "Error: num_queues (%d) < max queues configured for channel(%d)\n", + num_queues, max_ch_queues); + return -EINVAL; + } + *reconfig_rss = true; + } + + return 0; +} + +/** + * i40e_vsi_reconfig_rss - reconfig RSS based on specified rss_size + * @vsi: the VSI being setup + * @rss_size: size of RSS, accordingly LUT gets reprogrammed + * + * This function reconfigures RSS by reprogramming LUTs using 'rss_size' + **/ +static int i40e_vsi_reconfig_rss(struct i40e_vsi *vsi, u16 rss_size) +{ + struct i40e_pf *pf = vsi->back; + u8 seed[I40E_HKEY_ARRAY_SIZE]; + struct i40e_hw *hw = &pf->hw; + int local_rss_size; + u8 *lut; + int ret; + + if (!vsi->rss_size) + return -EINVAL; + + if (rss_size > vsi->rss_size) + return -EINVAL; + + local_rss_size = min_t(int, vsi->rss_size, rss_size); + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); + if (!lut) + return -ENOMEM; + + /* Ignoring user configured lut if there is one */ + i40e_fill_rss_lut(pf, lut, vsi->rss_table_size, local_rss_size); + + /* Use user configured hash key if there is one, otherwise + * use default. + */ + if (vsi->rss_hkey_user) + memcpy(seed, vsi->rss_hkey_user, I40E_HKEY_ARRAY_SIZE); + else + netdev_rss_key_fill((void *)seed, I40E_HKEY_ARRAY_SIZE); + + ret = i40e_config_rss(vsi, seed, lut, vsi->rss_table_size); + if (ret) { + dev_info(&pf->pdev->dev, + "Cannot set RSS lut, err %s aq_err %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + kfree(lut); + return ret; + } + kfree(lut); + + /* Do the update w.r.t. 
storing rss_size */ + if (!vsi->orig_rss_size) + vsi->orig_rss_size = vsi->rss_size; + vsi->current_rss_size = local_rss_size; + + return ret; +} + +/** + * i40e_channel_setup_queue_map - Setup a channel queue map + * @pf: ptr to PF device + * @vsi: the VSI being setup + * @ctxt: VSI context structure + * @ch: ptr to channel structure + * + * Setup queue map for a specific channel + **/ +static void i40e_channel_setup_queue_map(struct i40e_pf *pf, + struct i40e_vsi_context *ctxt, + struct i40e_channel *ch) +{ + u16 qcount, qmap, sections = 0; + u8 offset = 0; + int pow; + + sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; + sections |= I40E_AQ_VSI_PROP_SCHED_VALID; + + qcount = min_t(int, ch->num_queue_pairs, pf->num_lan_msix); + ch->num_queue_pairs = qcount; + + /* find the next higher power-of-2 of num queue pairs */ + pow = ilog2(qcount); + if (!is_power_of_2(qcount)) + pow++; + + qmap = (offset << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) | + (pow << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT); + + /* Setup queue TC[0].qmap for given VSI context */ + ctxt->info.tc_mapping[0] = cpu_to_le16(qmap); + + ctxt->info.up_enable_bits = 0x1; /* TC0 enabled */ + ctxt->info.mapping_flags |= cpu_to_le16(I40E_AQ_VSI_QUE_MAP_CONTIG); + ctxt->info.queue_mapping[0] = cpu_to_le16(ch->base_queue); + ctxt->info.valid_sections |= cpu_to_le16(sections); +} + +/** + * i40e_add_channel - add a channel by adding VSI + * @pf: ptr to PF device + * @uplink_seid: underlying HW switching element (VEB) ID + * @ch: ptr to channel structure + * + * Add a channel (VSI) using add_vsi and queue_map + **/ +static int i40e_add_channel(struct i40e_pf *pf, u16 uplink_seid, + struct i40e_channel *ch) +{ + struct i40e_hw *hw = &pf->hw; + struct i40e_vsi_context ctxt; + u8 enabled_tc = 0x1; /* TC0 enabled */ + int ret; + + if (ch->type != I40E_VSI_VMDQ2) { + dev_info(&pf->pdev->dev, + "add new vsi failed, ch->type %d\n", ch->type); + return -EINVAL; + } + + memset(&ctxt, 0, sizeof(ctxt)); + ctxt.pf_num = hw->pf_id; + ctxt.vf_num = 0; + ctxt.uplink_seid = uplink_seid; + ctxt.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + if (ch->type == I40E_VSI_VMDQ2) + ctxt.flags = I40E_AQ_VSI_TYPE_VMDQ2; + + if (pf->flags & I40E_FLAG_VEB_MODE_ENABLED) { + ctxt.info.valid_sections |= + cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID); + ctxt.info.switch_id = + cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB); + } + + /* Set queue map for a given VSI context */ + i40e_channel_setup_queue_map(pf, &ctxt, ch); + + /* Now time to create VSI */ + ret = i40e_aq_add_vsi(hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "add new vsi failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + return -ENOENT; + } + + /* Success, update channel */ + ch->enabled_tc = enabled_tc; + ch->seid = ctxt.seid; + ch->vsi_number = ctxt.vsi_number; + ch->stat_counter_idx = cpu_to_le16(ctxt.info.stat_counter_idx); + + /* copy just the sections touched not the entire info + * since not all sections are valid as returned by + * update vsi params + */ + ch->info.mapping_flags = ctxt.info.mapping_flags; + memcpy(&ch->info.queue_mapping, + &ctxt.info.queue_mapping, sizeof(ctxt.info.queue_mapping)); + memcpy(&ch->info.tc_mapping, ctxt.info.tc_mapping, + sizeof(ctxt.info.tc_mapping)); + + return 0; +} + +static int i40e_channel_config_bw(struct i40e_vsi *vsi, struct i40e_channel *ch, + u8 *bw_share) +{ + struct i40e_aqc_configure_vsi_tc_bw_data bw_data; + i40e_status ret; + int i; + + bw_data.tc_valid_bits = ch->enabled_tc; + for (i = 0; i 
< I40E_MAX_TRAFFIC_CLASS; i++) + bw_data.tc_bw_credits[i] = bw_share[i]; + + ret = i40e_aq_config_vsi_tc_bw(&vsi->back->hw, ch->seid, + &bw_data, NULL); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "Config VSI BW allocation per TC failed, aq_err: %d for new_vsi->seid %u\n", + vsi->back->hw.aq.asq_last_status, ch->seid); + return -EINVAL; + } + + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) + ch->info.qs_handle[i] = bw_data.qs_handles[i]; + + return 0; +} + +/** + * i40e_channel_config_tx_ring - config TX ring associated with new channel + * @pf: ptr to PF device + * @vsi: the VSI being setup + * @ch: ptr to channel structure + * + * Configure TX rings associated with channel (VSI) since queues are being + * from parent VSI. + **/ +static int i40e_channel_config_tx_ring(struct i40e_pf *pf, + struct i40e_vsi *vsi, + struct i40e_channel *ch) +{ + i40e_status ret; + int i; + u8 bw_share[I40E_MAX_TRAFFIC_CLASS] = {0}; + + /* Enable ETS TCs with equal BW Share for now across all VSIs */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + if (ch->enabled_tc & BIT(i)) + bw_share[i] = 1; + } + + /* configure BW for new VSI */ + ret = i40e_channel_config_bw(vsi, ch, bw_share); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "Failed configuring TC map %d for channel (seid %u)\n", + ch->enabled_tc, ch->seid); + return ret; + } + + for (i = 0; i < ch->num_queue_pairs; i++) { + struct i40e_ring *tx_ring, *rx_ring; + u16 pf_q; + + pf_q = ch->base_queue + i; + + /* Get to TX ring ptr of main VSI, for re-setup TX queue + * context + */ + tx_ring = vsi->tx_rings[pf_q]; + tx_ring->ch = ch; + + /* Get the RX ring ptr */ + rx_ring = vsi->rx_rings[pf_q]; + rx_ring->ch = ch; + } + + return 0; +} + +/** + * i40e_setup_hw_channel - setup new channel + * @pf: ptr to PF device + * @vsi: the VSI being setup + * @ch: ptr to channel structure + * @uplink_seid: underlying HW switching element (VEB) ID + * @type: type of channel to be created (VMDq2/VF) + * + * Setup new channel (VSI) based on specified type (VMDq2/VF) + * and configures TX rings accordingly + **/ +static inline int i40e_setup_hw_channel(struct i40e_pf *pf, + struct i40e_vsi *vsi, + struct i40e_channel *ch, + u16 uplink_seid, u8 type) +{ + int ret; + + ch->initialized = false; + ch->base_queue = vsi->next_base_queue; + ch->type = type; + + /* Proceed with creation of channel (VMDq2) VSI */ + ret = i40e_add_channel(pf, uplink_seid, ch); + if (ret) { + dev_info(&pf->pdev->dev, + "failed to add_channel using uplink_seid %u\n", + uplink_seid); + return ret; + } + + /* Mark the successful creation of channel */ + ch->initialized = true; + + /* Reconfigure TX queues using QTX_CTL register */ + ret = i40e_channel_config_tx_ring(pf, vsi, ch); + if (ret) { + dev_info(&pf->pdev->dev, + "failed to configure TX rings for channel %u\n", + ch->seid); + return ret; + } + + /* update 'next_base_queue' */ + vsi->next_base_queue = vsi->next_base_queue + ch->num_queue_pairs; + dev_dbg(&pf->pdev->dev, + "Added channel: vsi_seid %u, vsi_number %u, stat_counter_idx %u, num_queue_pairs %u, pf->next_base_queue %d\n", + ch->seid, ch->vsi_number, ch->stat_counter_idx, + ch->num_queue_pairs, + vsi->next_base_queue); + return ret; +} + +/** + * i40e_setup_channel - setup new channel using uplink element + * @pf: ptr to PF device + * @type: type of channel to be created (VMDq2/VF) + * @uplink_seid: underlying HW switching element (VEB) ID + * @ch: ptr to channel structure + * + * Setup new channel (VSI) based on specified type (VMDq2/VF) + * and uplink switching element 
(uplink_seid) + **/ +static bool i40e_setup_channel(struct i40e_pf *pf, struct i40e_vsi *vsi, + struct i40e_channel *ch) +{ + u8 vsi_type; + u16 seid; + int ret; + + if (vsi->type == I40E_VSI_MAIN) { + vsi_type = I40E_VSI_VMDQ2; + } else { + dev_err(&pf->pdev->dev, "unsupported parent vsi type(%d)\n", + vsi->type); + return false; + } + + /* underlying switching element */ + seid = pf->vsi[pf->lan_vsi]->uplink_seid; + + /* create channel (VSI), configure TX rings */ + ret = i40e_setup_hw_channel(pf, vsi, ch, seid, vsi_type); + if (ret) { + dev_err(&pf->pdev->dev, "failed to setup hw_channel\n"); + return false; + } + + return ch->initialized ? true : false; +} + +/** + * i40e_validate_and_set_switch_mode - sets up switch mode correctly + * @vsi: ptr to VSI which has PF backing + * + * Sets up switch mode correctly if it needs to be changed and perform + * what are allowed modes. + **/ +static int i40e_validate_and_set_switch_mode(struct i40e_vsi *vsi) +{ + u8 mode; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + int ret; + + ret = i40e_get_capabilities(pf, i40e_aqc_opc_list_dev_capabilities); + if (ret) + return -EINVAL; + + if (hw->dev_caps.switch_mode) { + /* if switch mode is set, support mode2 (non-tunneled for + * cloud filter) for now + */ + u32 switch_mode = hw->dev_caps.switch_mode & + I40E_SWITCH_MODE_MASK; + if (switch_mode >= I40E_CLOUD_FILTER_MODE1) { + if (switch_mode == I40E_CLOUD_FILTER_MODE2) + return 0; + dev_err(&pf->pdev->dev, + "Invalid switch_mode (%d), only non-tunneled mode for cloud filter is supported\n", + hw->dev_caps.switch_mode); + return -EINVAL; + } + } + + /* Set Bit 7 to be valid */ + mode = I40E_AQ_SET_SWITCH_BIT7_VALID; + + /* Set L4type for TCP support */ + mode |= I40E_AQ_SET_SWITCH_L4_TYPE_TCP; + + /* Set cloud filter mode */ + mode |= I40E_AQ_SET_SWITCH_MODE_NON_TUNNEL; + + /* Prep mode field for set_switch_config */ + ret = i40e_aq_set_switch_config(hw, pf->last_sw_conf_flags, + pf->last_sw_conf_valid_flags, + mode, NULL); + if (ret && hw->aq.asq_last_status != I40E_AQ_RC_ESRCH) + dev_err(&pf->pdev->dev, + "couldn't set switch config bits, err %s aq_err %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, + hw->aq.asq_last_status)); + + return ret; +} + +/** + * i40e_create_queue_channel - function to create channel + * @vsi: VSI to be configured + * @ch: ptr to channel (it contains channel specific params) + * + * This function creates channel (VSI) using num_queues specified by user, + * reconfigs RSS if needed. + **/ +int i40e_create_queue_channel(struct i40e_vsi *vsi, + struct i40e_channel *ch) +{ + struct i40e_pf *pf = vsi->back; + bool reconfig_rss; + int err; + + if (!ch) + return -EINVAL; + + if (!ch->num_queue_pairs) { + dev_err(&pf->pdev->dev, "Invalid num_queues requested: %d\n", + ch->num_queue_pairs); + return -EINVAL; + } + + /* validate user requested num_queues for channel */ + err = i40e_validate_num_queues(pf, ch->num_queue_pairs, vsi, + &reconfig_rss); + if (err) { + dev_info(&pf->pdev->dev, "Failed to validate num_queues (%d)\n", + ch->num_queue_pairs); + return -EINVAL; + } + + /* By default we are in VEPA mode, if this is the first VF/VMDq + * VSI to be added switch to VEB mode. + */ + if ((!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) || + (!i40e_is_any_channel(vsi))) { + if (!is_power_of_2(vsi->tc_config.tc_info[0].qcount)) { + dev_dbg(&pf->pdev->dev, + "Failed to create channel. 
Override queues (%u) not power of 2\n", + vsi->tc_config.tc_info[0].qcount); + return -EINVAL; + } + + if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) { + pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; + + if (vsi->type == I40E_VSI_MAIN) { + if (pf->flags & I40E_FLAG_TC_MQPRIO) + i40e_do_reset(pf, I40E_PF_RESET_FLAG, + true); + else + i40e_do_reset_safe(pf, + I40E_PF_RESET_FLAG); + } + } + /* now onwards for main VSI, number of queues will be value + * of TC0's queue count + */ + } + + /* By this time, vsi->cnt_q_avail shall be set to non-zero and + * it should be more than num_queues + */ + if (!vsi->cnt_q_avail || vsi->cnt_q_avail < ch->num_queue_pairs) { + dev_dbg(&pf->pdev->dev, + "Error: cnt_q_avail (%u) less than num_queues %d\n", + vsi->cnt_q_avail, ch->num_queue_pairs); + return -EINVAL; + } + + /* reconfig_rss only if vsi type is MAIN_VSI */ + if (reconfig_rss && (vsi->type == I40E_VSI_MAIN)) { + err = i40e_vsi_reconfig_rss(vsi, ch->num_queue_pairs); + if (err) { + dev_info(&pf->pdev->dev, + "Error: unable to reconfig rss for num_queues (%u)\n", + ch->num_queue_pairs); + return -EINVAL; + } + } + + if (!i40e_setup_channel(pf, vsi, ch)) { + dev_info(&pf->pdev->dev, "Failed to setup channel\n"); + return -EINVAL; + } + + dev_info(&pf->pdev->dev, + "Setup channel (id:%u) utilizing num_queues %d\n", + ch->seid, ch->num_queue_pairs); + + /* configure VSI for BW limit */ + if (ch->max_tx_rate) { + u64 credits = ch->max_tx_rate; + + if (i40e_set_bw_limit(vsi, ch->seid, ch->max_tx_rate)) + return -EINVAL; + + do_div(credits, I40E_BW_CREDIT_DIVISOR); + dev_dbg(&pf->pdev->dev, + "Set tx rate of %llu Mbps (count of 50Mbps %llu) for vsi->seid %u\n", + ch->max_tx_rate, + credits, + ch->seid); + } + + /* in case of VF, this will be main SRIOV VSI */ + ch->parent_vsi = vsi; + + /* and update main_vsi's count for queue_available to use */ + vsi->cnt_q_avail -= ch->num_queue_pairs; + + return 0; +} + +/** + * i40e_configure_queue_channels - Add queue channel for the given TCs + * @vsi: VSI to be configured + * + * Configures queue channel mapping to the given TCs + **/ +static int i40e_configure_queue_channels(struct i40e_vsi *vsi) +{ + struct i40e_channel *ch; + u64 max_rate = 0; + int ret = 0, i; + + /* Create app vsi with the TCs. 
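/*
 * Illustrative sketch (not part of the patch): the surrounding
 * i40e_configure_queue_channels() walks the enabled TCs and turns each TC's
 * mqprio count/offset pair plus its max_rate (handed in by tc in bytes per
 * second) into a channel.  A user-space model of that derivation; the
 * bytes/s-to-Mbit/s divisor of 125000 follows from the units named in the
 * comments and stands in for I40E_BW_MBPS_DIVISOR, whose exact value is
 * assumed here.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_TC 8
#define BYTES_PER_SEC_PER_MBPS 125000ULL	/* 1 Mbit/s = 125000 bytes/s */

struct demo_mqprio {
	uint8_t  num_tc;
	uint16_t count[DEMO_MAX_TC];	/* queues per TC */
	uint16_t offset[DEMO_MAX_TC];	/* first queue of each TC */
	uint64_t max_rate[DEMO_MAX_TC];	/* bytes per second */
};

int main(void)
{
	/* hypothetical two-TC request, TC1 capped at 1000 Mbit/s */
	struct demo_mqprio q = {
		.num_tc   = 2,
		.count    = { 4, 4 },
		.offset   = { 0, 4 },
		.max_rate = { 0, 125000000ULL },
	};

	for (int i = 1; i < q.num_tc; i++)
		printf("TC%d: %u queues at offset %u, cap %llu Mbps\n",
		       i, q.count[i], q.offset[i],
		       (unsigned long long)(q.max_rate[i] / BYTES_PER_SEC_PER_MBPS));
	return 0;
}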
Main VSI with TC0 is already set up */ + vsi->tc_seid_map[0] = vsi->seid; + for (i = 1; i < I40E_MAX_TRAFFIC_CLASS; i++) { + if (vsi->tc_config.enabled_tc & BIT(i)) { + ch = kzalloc(sizeof(*ch), GFP_KERNEL); + if (!ch) { + ret = -ENOMEM; + goto err_free; + } + + INIT_LIST_HEAD(&ch->list); + ch->num_queue_pairs = + vsi->tc_config.tc_info[i].qcount; + ch->base_queue = + vsi->tc_config.tc_info[i].qoffset; + + /* Bandwidth limit through tc interface is in bytes/s, + * change to Mbit/s + */ + max_rate = vsi->mqprio_qopt.max_rate[i]; + do_div(max_rate, I40E_BW_MBPS_DIVISOR); + ch->max_tx_rate = max_rate; + + list_add_tail(&ch->list, &vsi->ch_list); + + ret = i40e_create_queue_channel(vsi, ch); + if (ret) { + dev_err(&vsi->back->pdev->dev, + "Failed creating queue channel with TC%d: queues %d\n", + i, ch->num_queue_pairs); + goto err_free; + } + vsi->tc_seid_map[i] = ch->seid; + } + } + return ret; + +err_free: + i40e_remove_queue_channels(vsi); + return ret; +} + +/** + * i40e_veb_config_tc - Configure TCs for given VEB + * @veb: given VEB + * @enabled_tc: TC bitmap + * + * Configures given TC bitmap for VEB (switching) element + **/ +int i40e_veb_config_tc(struct i40e_veb *veb, u8 enabled_tc) +{ + struct i40e_aqc_configure_switching_comp_bw_config_data bw_data = {0}; + struct i40e_pf *pf = veb->pf; + int ret = 0; + int i; + + /* No TCs or already enabled TCs just return */ + if (!enabled_tc || veb->enabled_tc == enabled_tc) + return ret; + + bw_data.tc_valid_bits = enabled_tc; + /* bw_data.absolute_credits is not set (relative) */ + + /* Enable ETS TCs with equal BW Share for now */ + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + if (enabled_tc & BIT(i)) + bw_data.tc_bw_share_credits[i] = 1; + } + + ret = i40e_aq_config_switch_comp_bw_config(&pf->hw, veb->seid, + &bw_data, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "VEB bw config failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + goto out; + } + + /* Update the BW information */ + ret = i40e_veb_get_bw_info(veb); + if (ret) { + dev_info(&pf->pdev->dev, + "Failed getting veb bw config, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + } + +out: + return ret; +} + +#ifdef CONFIG_I40E_DCB +/** + * i40e_dcb_reconfigure - Reconfigure all VEBs and VSIs + * @pf: PF struct + * + * Reconfigure VEB/VSIs on a given PF; it is assumed that + * the caller would've quiesce all the VSIs before calling + * this function + **/ +static void i40e_dcb_reconfigure(struct i40e_pf *pf) +{ + u8 tc_map = 0; + int ret; + u8 v; + + /* Enable the TCs available on PF to all VEBs */ + tc_map = i40e_pf_get_tc_map(pf); + for (v = 0; v < I40E_MAX_VEB; v++) { + if (!pf->veb[v]) + continue; + ret = i40e_veb_config_tc(pf->veb[v], tc_map); + if (ret) { + dev_info(&pf->pdev->dev, + "Failed configuring TC for VEB seid=%d\n", + pf->veb[v]->seid); + /* Will try to configure as many components */ + } + } + + /* Update each VSI */ + for (v = 0; v < pf->num_alloc_vsi; v++) { + if (!pf->vsi[v]) + continue; + + /* - Enable all TCs for the LAN VSI + * - For all others keep them at TC0 for now + */ + if (v == pf->lan_vsi) + tc_map = i40e_pf_get_tc_map(pf); + else + tc_map = I40E_DEFAULT_TRAFFIC_CLASS; + + ret = i40e_vsi_config_tc(pf->vsi[v], tc_map); + if (ret) { + dev_info(&pf->pdev->dev, + "Failed configuring TC for VSI seid=%d\n", + pf->vsi[v]->seid); + /* Will try to configure as many components */ + } else { + /* Re-configure VSI vectors based on updated TC map 
*/ + i40e_vsi_map_rings_to_vectors(pf->vsi[v]); + if (pf->vsi[v]->netdev) + i40e_dcbnl_set_all(pf->vsi[v]); + } + } +} + +/** + * i40e_resume_port_tx - Resume port Tx + * @pf: PF struct + * + * Resume a port's Tx and issue a PF reset in case of failure to + * resume. + **/ +static int i40e_resume_port_tx(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + int ret; + + ret = i40e_aq_resume_port_tx(hw, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "Resume Port Tx failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + /* Schedule PF reset to recover */ + set_bit(__I40E_PF_RESET_REQUESTED, pf->state); + i40e_service_event_schedule(pf); + } + + return ret; +} + +/** + * i40e_init_pf_dcb - Initialize DCB configuration + * @pf: PF being configured + * + * Query the current DCB configuration and cache it + * in the hardware structure + **/ +static int i40e_init_pf_dcb(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + int err = 0; + + /* Do not enable DCB for SW1 and SW2 images even if the FW is capable + * Also do not enable DCBx if FW LLDP agent is disabled + */ + if ((pf->hw_features & I40E_HW_NO_DCB_SUPPORT) || + (pf->flags & I40E_FLAG_DISABLE_FW_LLDP)) + goto out; + + /* Get the initial DCB configuration */ + err = i40e_init_dcb(hw); + if (!err) { + /* Device/Function is not DCBX capable */ + if ((!hw->func_caps.dcb) || + (hw->dcbx_status == I40E_DCBX_STATUS_DISABLED)) { + dev_info(&pf->pdev->dev, + "DCBX offload is not supported or is disabled for this PF.\n"); + } else { + /* When status is not DISABLED then DCBX in FW */ + pf->dcbx_cap = DCB_CAP_DCBX_LLD_MANAGED | + DCB_CAP_DCBX_VER_IEEE; + + pf->flags |= I40E_FLAG_DCB_CAPABLE; + /* Enable DCB tagging only when more than one TC + * or explicitly disable if only one TC + */ + if (i40e_dcb_get_num_tc(&hw->local_dcbx_config) > 1) + pf->flags |= I40E_FLAG_DCB_ENABLED; + else + pf->flags &= ~I40E_FLAG_DCB_ENABLED; + dev_dbg(&pf->pdev->dev, + "DCBX offload is supported for this PF.\n"); + } + } else if (pf->hw.aq.asq_last_status == I40E_AQ_RC_EPERM) { + dev_info(&pf->pdev->dev, "FW LLDP disabled for this PF.\n"); + pf->flags |= I40E_FLAG_DISABLE_FW_LLDP; + } else { + dev_info(&pf->pdev->dev, + "Query for DCB configuration failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, err), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + } + +out: + return err; +} +#endif /* CONFIG_I40E_DCB */ +#define SPEED_SIZE 14 +#define FC_SIZE 8 +/** + * i40e_print_link_message - print link up or down + * @vsi: the VSI for which link needs a message + */ +void i40e_print_link_message(struct i40e_vsi *vsi, bool isup) +{ + enum i40e_aq_link_speed new_speed; + struct i40e_pf *pf = vsi->back; + char *speed = "Unknown"; + char *fc = "Unknown"; + char *fec = ""; + char *req_fec = ""; + char *an = ""; + + new_speed = pf->hw.phy.link_info.link_speed; + + if ((vsi->current_isup == isup) && (vsi->current_speed == new_speed)) + return; + vsi->current_isup = isup; + vsi->current_speed = new_speed; + if (!isup) { + netdev_info(vsi->netdev, "NIC Link is Down\n"); + return; + } + + /* Warn user if link speed on NPAR enabled partition is not at + * least 10GB + */ + if (pf->hw.func_caps.npar_enable && + (pf->hw.phy.link_info.link_speed == I40E_LINK_SPEED_1GB || + pf->hw.phy.link_info.link_speed == I40E_LINK_SPEED_100MB)) + netdev_warn(vsi->netdev, + "The partition detected link speed that is less than 10Gbps\n"); + + switch (pf->hw.phy.link_info.link_speed) { + case I40E_LINK_SPEED_40GB: + speed = "40 
G"; + break; + case I40E_LINK_SPEED_20GB: + speed = "20 G"; + break; + case I40E_LINK_SPEED_25GB: + speed = "25 G"; + break; + case I40E_LINK_SPEED_10GB: + speed = "10 G"; + break; + case I40E_LINK_SPEED_1GB: + speed = "1000 M"; + break; + case I40E_LINK_SPEED_100MB: + speed = "100 M"; + break; + default: + break; + } + + switch (pf->hw.fc.current_mode) { + case I40E_FC_FULL: + fc = "RX/TX"; + break; + case I40E_FC_TX_PAUSE: + fc = "TX"; + break; + case I40E_FC_RX_PAUSE: + fc = "RX"; + break; + default: + fc = "None"; + break; + } + + if (pf->hw.phy.link_info.link_speed == I40E_LINK_SPEED_25GB) { + req_fec = ", Requested FEC: None"; + fec = ", FEC: None"; + an = ", Autoneg: False"; + + if (pf->hw.phy.link_info.an_info & I40E_AQ_AN_COMPLETED) + an = ", Autoneg: True"; + + if (pf->hw.phy.link_info.fec_info & + I40E_AQ_CONFIG_FEC_KR_ENA) + fec = ", FEC: CL74 FC-FEC/BASE-R"; + else if (pf->hw.phy.link_info.fec_info & + I40E_AQ_CONFIG_FEC_RS_ENA) + fec = ", FEC: CL108 RS-FEC"; + + /* 'CL108 RS-FEC' should be displayed when RS is requested, or + * both RS and FC are requested + */ + if (vsi->back->hw.phy.link_info.req_fec_info & + (I40E_AQ_REQUEST_FEC_KR | I40E_AQ_REQUEST_FEC_RS)) { + if (vsi->back->hw.phy.link_info.req_fec_info & + I40E_AQ_REQUEST_FEC_RS) + req_fec = ", Requested FEC: CL108 RS-FEC"; + else + req_fec = ", Requested FEC: CL74 FC-FEC/BASE-R"; + } + } + + netdev_info(vsi->netdev, "NIC Link is Up, %sbps Full Duplex%s%s%s, Flow Control: %s\n", + speed, req_fec, fec, an, fc); +} + +/** + * i40e_up_complete - Finish the last steps of bringing up a connection + * @vsi: the VSI being configured + **/ +static int i40e_up_complete(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + int err; + + if (pf->flags & I40E_FLAG_MSIX_ENABLED) + i40e_vsi_configure_msix(vsi); + else + i40e_configure_msi_and_legacy(vsi); + + /* start rings */ + err = i40e_vsi_start_rings(vsi); + if (err) + return err; + + clear_bit(__I40E_VSI_DOWN, vsi->state); + i40e_napi_enable_all(vsi); + i40e_vsi_enable_irq(vsi); + + if ((pf->hw.phy.link_info.link_info & I40E_AQ_LINK_UP) && + (vsi->netdev)) { + i40e_print_link_message(vsi, true); + netif_tx_start_all_queues(vsi->netdev); + netif_carrier_on(vsi->netdev); + } + + /* replay FDIR SB filters */ + if (vsi->type == I40E_VSI_FDIR) { + /* reset fd counters */ + pf->fd_add_err = 0; + pf->fd_atr_cnt = 0; + i40e_fdir_filter_restore(vsi); + } + + /* On the next run of the service_task, notify any clients of the new + * opened netdev + */ + set_bit(__I40E_CLIENT_SERVICE_REQUESTED, pf->state); + i40e_service_event_schedule(pf); + + return 0; +} + +/** + * i40e_vsi_reinit_locked - Reset the VSI + * @vsi: the VSI being configured + * + * Rebuild the ring structs after some configuration + * has changed, e.g. MTU size. 
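/*
 * Illustrative sketch (not part of the patch): for 25G links,
 * i40e_print_link_message() above picks the "Requested FEC" string from
 * req_fec_info and, per its comment, reports CL108 RS-FEC whenever RS is
 * requested, even if KR is requested as well.  The bit values below are
 * made-up stand-ins for the real I40E_AQ_REQUEST_FEC_* definitions, which
 * are not shown in this hunk.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_REQ_FEC_KR 0x04	/* assumed stand-in bit */
#define DEMO_REQ_FEC_RS 0x08	/* assumed stand-in bit */

static const char *requested_fec_str(uint8_t req_fec_info)
{
	if (!(req_fec_info & (DEMO_REQ_FEC_KR | DEMO_REQ_FEC_RS)))
		return "None";
	/* RS takes precedence whenever it is requested */
	return (req_fec_info & DEMO_REQ_FEC_RS) ? "CL108 RS-FEC"
						: "CL74 FC-FEC/BASE-R";
}

int main(void)
{
	printf("%s\n", requested_fec_str(DEMO_REQ_FEC_KR | DEMO_REQ_FEC_RS));
	return 0;
}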
+ **/ +static void i40e_vsi_reinit_locked(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + + WARN_ON(in_interrupt()); + while (test_and_set_bit(__I40E_CONFIG_BUSY, pf->state)) + usleep_range(1000, 2000); + i40e_down(vsi); + + i40e_up(vsi); + clear_bit(__I40E_CONFIG_BUSY, pf->state); +} + +/** + * i40e_up - Bring the connection back up after being down + * @vsi: the VSI being configured + **/ +int i40e_up(struct i40e_vsi *vsi) +{ + int err; + + err = i40e_vsi_configure(vsi); + if (!err) + err = i40e_up_complete(vsi); + + return err; +} + +/** + * i40e_force_link_state - Force the link status + * @pf: board private structure + * @is_up: whether the link state should be forced up or down + **/ +static i40e_status i40e_force_link_state(struct i40e_pf *pf, bool is_up) +{ + struct i40e_aq_get_phy_abilities_resp abilities; + struct i40e_aq_set_phy_config config = {0}; + struct i40e_hw *hw = &pf->hw; + i40e_status err; + u64 mask; + + /* Get the current phy config */ + err = i40e_aq_get_phy_capabilities(hw, false, false, &abilities, + NULL); + if (err) { + dev_err(&pf->pdev->dev, + "failed to get phy cap., ret = %s last_status = %s\n", + i40e_stat_str(hw, err), + i40e_aq_str(hw, hw->aq.asq_last_status)); + return err; + } + + /* If link needs to go up, but was not forced to go down, + * no need for a flap + */ + if (is_up && abilities.phy_type != 0) + return I40E_SUCCESS; + + /* To force link we need to set bits for all supported PHY types, + * but there are now more than 32, so we need to split the bitmap + * across two fields. + */ + mask = I40E_PHY_TYPES_BITMASK; + config.phy_type = is_up ? cpu_to_le32((u32)(mask & 0xffffffff)) : 0; + config.phy_type_ext = is_up ? (u8)((mask >> 32) & 0xff) : 0; + /* Copy the old settings, except of phy_type */ + config.abilities = abilities.abilities; + config.link_speed = abilities.link_speed; + config.eee_capability = abilities.eee_capability; + config.eeer = abilities.eeer_val; + config.low_power_ctrl = abilities.d3_lpan; + err = i40e_aq_set_phy_config(hw, &config, NULL); + + if (err) { + dev_err(&pf->pdev->dev, + "set phy config ret = %s last_status = %s\n", + i40e_stat_str(&pf->hw, err), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + return err; + } + + /* Update the link info */ + err = i40e_update_link_info(hw); + if (err) { + /* Wait a little bit (on 40G cards it sometimes takes a really + * long time for link to come back from the atomic reset) + * and try once more + */ + msleep(1000); + i40e_update_link_info(hw); + } + + i40e_aq_set_link_restart_an(hw, true, NULL); + + return I40E_SUCCESS; +} + +/** + * i40e_down - Shutdown the connection processing + * @vsi: the VSI being stopped + **/ +void i40e_down(struct i40e_vsi *vsi) +{ + int i; + + /* It is assumed that the caller of this function + * sets the vsi->state __I40E_VSI_DOWN bit. 
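/*
 * Illustrative sketch (not part of the patch): i40e_force_link_state()
 * above notes that more than 32 PHY types are supported, so the 64-bit
 * I40E_PHY_TYPES_BITMASK is split across the 32-bit phy_type field and the
 * 8-bit phy_type_ext field.  A minimal model of that split with a made-up
 * mask value.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mask = 0x000000ab12345678ULL;	/* pretend PHY type bitmap */
	uint32_t phy_type     = (uint32_t)(mask & 0xffffffff);
	uint8_t  phy_type_ext = (uint8_t)((mask >> 32) & 0xff);

	printf("phy_type=0x%08x phy_type_ext=0x%02x\n",
	       (unsigned int)phy_type, (unsigned int)phy_type_ext);
	return 0;
}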
+ */
+	if (vsi->netdev) {
+		netif_carrier_off(vsi->netdev);
+		netif_tx_disable(vsi->netdev);
+	}
+	i40e_vsi_disable_irq(vsi);
+	i40e_vsi_stop_rings(vsi);
+	if (vsi->type == I40E_VSI_MAIN &&
+	    vsi->back->flags & I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED)
+		i40e_force_link_state(vsi->back, false);
+	i40e_napi_disable_all(vsi);
+
+	for (i = 0; i < vsi->num_queue_pairs; i++) {
+		i40e_clean_tx_ring(vsi->tx_rings[i]);
+		if (i40e_enabled_xdp_vsi(vsi))
+			i40e_clean_tx_ring(vsi->xdp_rings[i]);
+		i40e_clean_rx_ring(vsi->rx_rings[i]);
+	}
+
+}
+
+/**
+ * i40e_validate_mqprio_qopt - validate queue mapping info
+ * @vsi: the VSI being configured
+ * @mqprio_qopt: queue parameters
+ **/
+static int i40e_validate_mqprio_qopt(struct i40e_vsi *vsi,
+				     struct tc_mqprio_qopt_offload *mqprio_qopt)
+{
+	u64 sum_max_rate = 0;
+	u64 max_rate = 0;
+	int i;
+
+	if (mqprio_qopt->qopt.offset[0] != 0 ||
+	    mqprio_qopt->qopt.num_tc < 1 ||
+	    mqprio_qopt->qopt.num_tc > I40E_MAX_TRAFFIC_CLASS)
+		return -EINVAL;
+	for (i = 0; ; i++) {
+		if (!mqprio_qopt->qopt.count[i])
+			return -EINVAL;
+		if (mqprio_qopt->min_rate[i]) {
+			dev_err(&vsi->back->pdev->dev,
+				"Invalid min tx rate (greater than 0) specified\n");
+			return -EINVAL;
+		}
+		max_rate = mqprio_qopt->max_rate[i];
+		do_div(max_rate, I40E_BW_MBPS_DIVISOR);
+		sum_max_rate += max_rate;
+
+		if (i >= mqprio_qopt->qopt.num_tc - 1)
+			break;
+		if (mqprio_qopt->qopt.offset[i + 1] !=
+		    (mqprio_qopt->qopt.offset[i] + mqprio_qopt->qopt.count[i]))
+			return -EINVAL;
+	}
+	if (vsi->num_queue_pairs <
+	    (mqprio_qopt->qopt.offset[i] + mqprio_qopt->qopt.count[i])) {
+		return -EINVAL;
+	}
+	if (sum_max_rate > i40e_get_link_speed(vsi)) {
+		dev_err(&vsi->back->pdev->dev,
+			"Invalid max tx rate specified\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * i40e_vsi_set_default_tc_config - set default values for tc configuration
+ * @vsi: the VSI being configured
+ **/
+static void i40e_vsi_set_default_tc_config(struct i40e_vsi *vsi)
+{
+	u16 qcount;
+	int i;
+
+	/* Only TC0 is enabled */
+	vsi->tc_config.numtc = 1;
+	vsi->tc_config.enabled_tc = 1;
+	qcount = min_t(int, vsi->alloc_queue_pairs,
+		       i40e_pf_get_max_q_per_tc(vsi->back));
+	for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
+		/* For the TC that is not enabled set the offset to the default
+		 * queue and allocate one queue for the given TC.
+ */ + vsi->tc_config.tc_info[i].qoffset = 0; + if (i == 0) + vsi->tc_config.tc_info[i].qcount = qcount; + else + vsi->tc_config.tc_info[i].qcount = 1; + vsi->tc_config.tc_info[i].netdev_tc = 0; + } +} + +/** + * i40e_setup_tc - configure multiple traffic classes + * @netdev: net device to configure + * @type_data: tc offload data + **/ +static int i40e_setup_tc(struct net_device *netdev, void *type_data) +{ + struct tc_mqprio_qopt_offload *mqprio_qopt = type_data; + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + u8 enabled_tc = 0, num_tc, hw; + bool need_reset = false; + int ret = -EINVAL; + u16 mode; + int i; + + num_tc = mqprio_qopt->qopt.num_tc; + hw = mqprio_qopt->qopt.hw; + mode = mqprio_qopt->mode; + if (!hw) { + pf->flags &= ~I40E_FLAG_TC_MQPRIO; + memcpy(&vsi->mqprio_qopt, mqprio_qopt, sizeof(*mqprio_qopt)); + goto config_tc; + } + + /* Check if MFP enabled */ + if (pf->flags & I40E_FLAG_MFP_ENABLED) { + netdev_info(netdev, + "Configuring TC not supported in MFP mode\n"); + return ret; + } + switch (mode) { + case TC_MQPRIO_MODE_DCB: + pf->flags &= ~I40E_FLAG_TC_MQPRIO; + + /* Check if DCB enabled to continue */ + if (!(pf->flags & I40E_FLAG_DCB_ENABLED)) { + netdev_info(netdev, + "DCB is not enabled for adapter\n"); + return ret; + } + + /* Check whether tc count is within enabled limit */ + if (num_tc > i40e_pf_get_num_tc(pf)) { + netdev_info(netdev, + "TC count greater than enabled on link for adapter\n"); + return ret; + } + break; + case TC_MQPRIO_MODE_CHANNEL: + if (pf->flags & I40E_FLAG_DCB_ENABLED) { + netdev_info(netdev, + "Full offload of TC Mqprio options is not supported when DCB is enabled\n"); + return ret; + } + if (!(pf->flags & I40E_FLAG_MSIX_ENABLED)) + return ret; + ret = i40e_validate_mqprio_qopt(vsi, mqprio_qopt); + if (ret) + return ret; + memcpy(&vsi->mqprio_qopt, mqprio_qopt, + sizeof(*mqprio_qopt)); + pf->flags |= I40E_FLAG_TC_MQPRIO; + pf->flags &= ~I40E_FLAG_DCB_ENABLED; + break; + default: + return -EINVAL; + } + +config_tc: + /* Generate TC map for number of tc requested */ + for (i = 0; i < num_tc; i++) + enabled_tc |= BIT(i); + + /* Requesting same TC configuration as already enabled */ + if (enabled_tc == vsi->tc_config.enabled_tc && + mode != TC_MQPRIO_MODE_CHANNEL) + return 0; + + /* Quiesce VSI queues */ + i40e_quiesce_vsi(vsi); + + if (!hw && !(pf->flags & I40E_FLAG_TC_MQPRIO)) + i40e_remove_queue_channels(vsi); + + /* Configure VSI for enabled TCs */ + ret = i40e_vsi_config_tc(vsi, enabled_tc); + if (ret) { + netdev_info(netdev, "Failed configuring TC for VSI seid=%d\n", + vsi->seid); + need_reset = true; + goto exit; + } + + if (pf->flags & I40E_FLAG_TC_MQPRIO) { + if (vsi->mqprio_qopt.max_rate[0]) { + u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0]; + + do_div(max_tx_rate, I40E_BW_MBPS_DIVISOR); + ret = i40e_set_bw_limit(vsi, vsi->seid, max_tx_rate); + if (!ret) { + u64 credits = max_tx_rate; + + do_div(credits, I40E_BW_CREDIT_DIVISOR); + dev_dbg(&vsi->back->pdev->dev, + "Set tx rate of %llu Mbps (count of 50Mbps %llu) for vsi->seid %u\n", + max_tx_rate, + credits, + vsi->seid); + } else { + need_reset = true; + goto exit; + } + } + ret = i40e_configure_queue_channels(vsi); + if (ret) { + netdev_info(netdev, + "Failed configuring queue channels\n"); + need_reset = true; + goto exit; + } + } + +exit: + /* Reset the configuration data to defaults, only TC0 is enabled */ + if (need_reset) { + i40e_vsi_set_default_tc_config(vsi); + need_reset = false; + } + + /* Unquiesce 
VSI */ + i40e_unquiesce_vsi(vsi); + return ret; +} + +/** + * i40e_set_cld_element - sets cloud filter element data + * @filter: cloud filter rule + * @cld: ptr to cloud filter element data + * + * This is helper function to copy data into cloud filter element + **/ +static inline void +i40e_set_cld_element(struct i40e_cloud_filter *filter, + struct i40e_aqc_cloud_filters_element_data *cld) +{ + int i, j; + u32 ipa; + + memset(cld, 0, sizeof(*cld)); + ether_addr_copy(cld->outer_mac, filter->dst_mac); + ether_addr_copy(cld->inner_mac, filter->src_mac); + + if (filter->n_proto != ETH_P_IP && filter->n_proto != ETH_P_IPV6) + return; + + if (filter->n_proto == ETH_P_IPV6) { +#define IPV6_MAX_INDEX (ARRAY_SIZE(filter->dst_ipv6) - 1) + for (i = 0, j = 0; i < ARRAY_SIZE(filter->dst_ipv6); + i++, j += 2) { + ipa = be32_to_cpu(filter->dst_ipv6[IPV6_MAX_INDEX - i]); + ipa = cpu_to_le32(ipa); + memcpy(&cld->ipaddr.raw_v6.data[j], &ipa, sizeof(ipa)); + } + } else { + ipa = be32_to_cpu(filter->dst_ipv4); + memcpy(&cld->ipaddr.v4.data, &ipa, sizeof(ipa)); + } + + cld->inner_vlan = cpu_to_le16(ntohs(filter->vlan_id)); + + /* tenant_id is not supported by FW now, once the support is enabled + * fill the cld->tenant_id with cpu_to_le32(filter->tenant_id) + */ + if (filter->tenant_id) + return; +} + +/** + * i40e_add_del_cloud_filter - Add/del cloud filter + * @vsi: pointer to VSI + * @filter: cloud filter rule + * @add: if true, add, if false, delete + * + * Add or delete a cloud filter for a specific flow spec. + * Returns 0 if the filter were successfully added. + **/ +int i40e_add_del_cloud_filter(struct i40e_vsi *vsi, + struct i40e_cloud_filter *filter, bool add) +{ + struct i40e_aqc_cloud_filters_element_data cld_filter; + struct i40e_pf *pf = vsi->back; + int ret; + static const u16 flag_table[128] = { + [I40E_CLOUD_FILTER_FLAGS_OMAC] = + I40E_AQC_ADD_CLOUD_FILTER_OMAC, + [I40E_CLOUD_FILTER_FLAGS_IMAC] = + I40E_AQC_ADD_CLOUD_FILTER_IMAC, + [I40E_CLOUD_FILTER_FLAGS_IMAC_IVLAN] = + I40E_AQC_ADD_CLOUD_FILTER_IMAC_IVLAN, + [I40E_CLOUD_FILTER_FLAGS_IMAC_TEN_ID] = + I40E_AQC_ADD_CLOUD_FILTER_IMAC_TEN_ID, + [I40E_CLOUD_FILTER_FLAGS_OMAC_TEN_ID_IMAC] = + I40E_AQC_ADD_CLOUD_FILTER_OMAC_TEN_ID_IMAC, + [I40E_CLOUD_FILTER_FLAGS_IMAC_IVLAN_TEN_ID] = + I40E_AQC_ADD_CLOUD_FILTER_IMAC_IVLAN_TEN_ID, + [I40E_CLOUD_FILTER_FLAGS_IIP] = + I40E_AQC_ADD_CLOUD_FILTER_IIP, + }; + + if (filter->flags >= ARRAY_SIZE(flag_table)) + return I40E_ERR_CONFIG; + + /* copy element needed to add cloud filter from filter */ + i40e_set_cld_element(filter, &cld_filter); + + if (filter->tunnel_type != I40E_CLOUD_TNL_TYPE_NONE) + cld_filter.flags = cpu_to_le16(filter->tunnel_type << + I40E_AQC_ADD_CLOUD_TNL_TYPE_SHIFT); + + if (filter->n_proto == ETH_P_IPV6) + cld_filter.flags |= cpu_to_le16(flag_table[filter->flags] | + I40E_AQC_ADD_CLOUD_FLAGS_IPV6); + else + cld_filter.flags |= cpu_to_le16(flag_table[filter->flags] | + I40E_AQC_ADD_CLOUD_FLAGS_IPV4); + + if (add) + ret = i40e_aq_add_cloud_filters(&pf->hw, filter->seid, + &cld_filter, 1); + else + ret = i40e_aq_rem_cloud_filters(&pf->hw, filter->seid, + &cld_filter, 1); + if (ret) + dev_dbg(&pf->pdev->dev, + "Failed to %s cloud filter using l4 port %u, err %d aq_err %d\n", + add ? "add" : "delete", filter->dst_port, ret, + pf->hw.aq.asq_last_status); + else + dev_info(&pf->pdev->dev, + "%s cloud filter for VSI: %d\n", + add ? 
"Added" : "Deleted", filter->seid); + return ret; +} + +/** + * i40e_add_del_cloud_filter_big_buf - Add/del cloud filter using big_buf + * @vsi: pointer to VSI + * @filter: cloud filter rule + * @add: if true, add, if false, delete + * + * Add or delete a cloud filter for a specific flow spec using big buffer. + * Returns 0 if the filter were successfully added. + **/ +int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, + struct i40e_cloud_filter *filter, + bool add) +{ + struct i40e_aqc_cloud_filters_element_bb cld_filter; + struct i40e_pf *pf = vsi->back; + int ret; + + /* Both (src/dst) valid mac_addr are not supported */ + if ((is_valid_ether_addr(filter->dst_mac) && + is_valid_ether_addr(filter->src_mac)) || + (is_multicast_ether_addr(filter->dst_mac) && + is_multicast_ether_addr(filter->src_mac))) + return -EOPNOTSUPP; + + /* Big buffer cloud filter needs 'L4 port' to be non-zero. Also, UDP + * ports are not supported via big buffer now. + */ + if (!filter->dst_port || filter->ip_proto == IPPROTO_UDP) + return -EOPNOTSUPP; + + /* adding filter using src_port/src_ip is not supported at this stage */ + if (filter->src_port || filter->src_ipv4 || + !ipv6_addr_any(&filter->ip.v6.src_ip6)) + return -EOPNOTSUPP; + + /* copy element needed to add cloud filter from filter */ + i40e_set_cld_element(filter, &cld_filter.element); + + if (is_valid_ether_addr(filter->dst_mac) || + is_valid_ether_addr(filter->src_mac) || + is_multicast_ether_addr(filter->dst_mac) || + is_multicast_ether_addr(filter->src_mac)) { + /* MAC + IP : unsupported mode */ + if (filter->dst_ipv4) + return -EOPNOTSUPP; + + /* since we validated that L4 port must be valid before + * we get here, start with respective "flags" value + * and update if vlan is present or not + */ + cld_filter.element.flags = + cpu_to_le16(I40E_AQC_ADD_CLOUD_FILTER_MAC_PORT); + + if (filter->vlan_id) { + cld_filter.element.flags = + cpu_to_le16(I40E_AQC_ADD_CLOUD_FILTER_MAC_VLAN_PORT); + } + + } else if (filter->dst_ipv4 || + !ipv6_addr_any(&filter->ip.v6.dst_ip6)) { + cld_filter.element.flags = + cpu_to_le16(I40E_AQC_ADD_CLOUD_FILTER_IP_PORT); + if (filter->n_proto == ETH_P_IPV6) + cld_filter.element.flags |= + cpu_to_le16(I40E_AQC_ADD_CLOUD_FLAGS_IPV6); + else + cld_filter.element.flags |= + cpu_to_le16(I40E_AQC_ADD_CLOUD_FLAGS_IPV4); + } else { + dev_err(&pf->pdev->dev, + "either mac or ip has to be valid for cloud filter\n"); + return -EINVAL; + } + + /* Now copy L4 port in Byte 6..7 in general fields */ + cld_filter.general_fields[I40E_AQC_ADD_CLOUD_FV_FLU_0X16_WORD0] = + be16_to_cpu(filter->dst_port); + + if (add) { + /* Validate current device switch mode, change if necessary */ + ret = i40e_validate_and_set_switch_mode(vsi); + if (ret) { + dev_err(&pf->pdev->dev, + "failed to set switch mode, ret %d\n", + ret); + return ret; + } + + ret = i40e_aq_add_cloud_filters_bb(&pf->hw, filter->seid, + &cld_filter, 1); + } else { + ret = i40e_aq_rem_cloud_filters_bb(&pf->hw, filter->seid, + &cld_filter, 1); + } + + if (ret) + dev_dbg(&pf->pdev->dev, + "Failed to %s cloud filter(big buffer) err %d aq_err %d\n", + add ? "add" : "delete", ret, pf->hw.aq.asq_last_status); + else + dev_info(&pf->pdev->dev, + "%s cloud filter for VSI: %d, L4 port: %d\n", + add ? 
"add" : "delete", filter->seid, + ntohs(filter->dst_port)); + return ret; +} + +/** + * i40e_parse_cls_flower - Parse tc flower filters provided by kernel + * @vsi: Pointer to VSI + * @cls_flower: Pointer to struct tc_cls_flower_offload + * @filter: Pointer to cloud filter structure + * + **/ +static int i40e_parse_cls_flower(struct i40e_vsi *vsi, + struct tc_cls_flower_offload *f, + struct i40e_cloud_filter *filter) +{ + u16 n_proto_mask = 0, n_proto_key = 0, addr_type = 0; + struct i40e_pf *pf = vsi->back; + u8 field_flags = 0; + + if (f->dissector->used_keys & + ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_VLAN) | + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_PORTS) | + BIT(FLOW_DISSECTOR_KEY_ENC_KEYID))) { + dev_err(&pf->pdev->dev, "Unsupported key used: 0x%x\n", + f->dissector->used_keys); + return -EOPNOTSUPP; + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_dissector_key_keyid *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + f->key); + + struct flow_dissector_key_keyid *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + f->mask); + + if (mask->keyid != 0) + field_flags |= I40E_CLOUD_FIELD_TEN_ID; + + filter->tenant_id = be32_to_cpu(key->keyid); + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_dissector_key_basic *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_BASIC, + f->key); + + struct flow_dissector_key_basic *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_BASIC, + f->mask); + + n_proto_key = ntohs(key->n_proto); + n_proto_mask = ntohs(mask->n_proto); + + if (n_proto_key == ETH_P_ALL) { + n_proto_key = 0; + n_proto_mask = 0; + } + filter->n_proto = n_proto_key & n_proto_mask; + filter->ip_proto = key->ip_proto; + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct flow_dissector_key_eth_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + f->key); + + struct flow_dissector_key_eth_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + f->mask); + + /* use is_broadcast and is_zero to check for all 0xf or 0 */ + if (!is_zero_ether_addr(mask->dst)) { + if (is_broadcast_ether_addr(mask->dst)) { + field_flags |= I40E_CLOUD_FIELD_OMAC; + } else { + dev_err(&pf->pdev->dev, "Bad ether dest mask %pM\n", + mask->dst); + return I40E_ERR_CONFIG; + } + } + + if (!is_zero_ether_addr(mask->src)) { + if (is_broadcast_ether_addr(mask->src)) { + field_flags |= I40E_CLOUD_FIELD_IMAC; + } else { + dev_err(&pf->pdev->dev, "Bad ether src mask %pM\n", + mask->src); + return I40E_ERR_CONFIG; + } + } + ether_addr_copy(filter->dst_mac, key->dst); + ether_addr_copy(filter->src_mac, key->src); + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_VLAN)) { + struct flow_dissector_key_vlan *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_VLAN, + f->key); + struct flow_dissector_key_vlan *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_VLAN, + f->mask); + + if (mask->vlan_id) { + if (mask->vlan_id == VLAN_VID_MASK) { + field_flags |= I40E_CLOUD_FIELD_IVLAN; + + } else { + dev_err(&pf->pdev->dev, "Bad vlan mask 0x%04x\n", + mask->vlan_id); + return I40E_ERR_CONFIG; + } + } + + filter->vlan_id = cpu_to_be16(key->vlan_id); + } 
+ + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CONTROL)) { + struct flow_dissector_key_control *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_CONTROL, + f->key); + + addr_type = key->addr_type; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_dissector_key_ipv4_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + f->key); + struct flow_dissector_key_ipv4_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + f->mask); + + if (mask->dst) { + if (mask->dst == cpu_to_be32(0xffffffff)) { + field_flags |= I40E_CLOUD_FIELD_IIP; + } else { + mask->dst = be32_to_cpu(mask->dst); + dev_err(&pf->pdev->dev, "Bad ip dst mask %pI4\n", + &mask->dst); + return I40E_ERR_CONFIG; + } + } + + if (mask->src) { + if (mask->src == cpu_to_be32(0xffffffff)) { + field_flags |= I40E_CLOUD_FIELD_IIP; + } else { + mask->src = be32_to_cpu(mask->src); + dev_err(&pf->pdev->dev, "Bad ip src mask %pI4\n", + &mask->src); + return I40E_ERR_CONFIG; + } + } + + if (field_flags & I40E_CLOUD_FIELD_TEN_ID) { + dev_err(&pf->pdev->dev, "Tenant id not allowed for ip filter\n"); + return I40E_ERR_CONFIG; + } + filter->dst_ipv4 = key->dst; + filter->src_ipv4 = key->src; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_dissector_key_ipv6_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + f->key); + struct flow_dissector_key_ipv6_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + f->mask); + + /* src and dest IPV6 address should not be LOOPBACK + * (0:0:0:0:0:0:0:1), which can be represented as ::1 + */ + if (ipv6_addr_loopback(&key->dst) || + ipv6_addr_loopback(&key->src)) { + dev_err(&pf->pdev->dev, + "Bad ipv6, addr is LOOPBACK\n"); + return I40E_ERR_CONFIG; + } + if (!ipv6_addr_any(&mask->dst) || !ipv6_addr_any(&mask->src)) + field_flags |= I40E_CLOUD_FIELD_IIP; + + memcpy(&filter->src_ipv6, &key->src.s6_addr32, + sizeof(filter->src_ipv6)); + memcpy(&filter->dst_ipv6, &key->dst.s6_addr32, + sizeof(filter->dst_ipv6)); + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_PORTS)) { + struct flow_dissector_key_ports *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_PORTS, + f->key); + struct flow_dissector_key_ports *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_PORTS, + f->mask); + + if (mask->src) { + if (mask->src == cpu_to_be16(0xffff)) { + field_flags |= I40E_CLOUD_FIELD_IIP; + } else { + dev_err(&pf->pdev->dev, "Bad src port mask 0x%04x\n", + be16_to_cpu(mask->src)); + return I40E_ERR_CONFIG; + } + } + + if (mask->dst) { + if (mask->dst == cpu_to_be16(0xffff)) { + field_flags |= I40E_CLOUD_FIELD_IIP; + } else { + dev_err(&pf->pdev->dev, "Bad dst port mask 0x%04x\n", + be16_to_cpu(mask->dst)); + return I40E_ERR_CONFIG; + } + } + + filter->dst_port = key->dst; + filter->src_port = key->src; + + switch (filter->ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + break; + default: + dev_err(&pf->pdev->dev, + "Only UDP and TCP transport are supported\n"); + return -EINVAL; + } + } + filter->flags = field_flags; + return 0; +} + +/** + * i40e_handle_tclass: Forward to a traffic class on the device + * @vsi: Pointer to VSI + * @tc: traffic class index on the device + * @filter: Pointer to cloud filter structure + * + **/ +static int i40e_handle_tclass(struct i40e_vsi *vsi, u32 tc, + struct i40e_cloud_filter *filter) +{ + struct i40e_channel *ch, 
*ch_tmp; + + /* direct to a traffic class on the same device */ + if (tc == 0) { + filter->seid = vsi->seid; + return 0; + } else if (vsi->tc_config.enabled_tc & BIT(tc)) { + if (!filter->dst_port) { + dev_err(&vsi->back->pdev->dev, + "Specify destination port to direct to traffic class that is not default\n"); + return -EINVAL; + } + if (list_empty(&vsi->ch_list)) + return -EINVAL; + list_for_each_entry_safe(ch, ch_tmp, &vsi->ch_list, + list) { + if (ch->seid == vsi->tc_seid_map[tc]) + filter->seid = ch->seid; + } + return 0; + } + dev_err(&vsi->back->pdev->dev, "TC is not enabled\n"); + return -EINVAL; +} + +/** + * i40e_configure_clsflower - Configure tc flower filters + * @vsi: Pointer to VSI + * @cls_flower: Pointer to struct tc_cls_flower_offload + * + **/ +static int i40e_configure_clsflower(struct i40e_vsi *vsi, + struct tc_cls_flower_offload *cls_flower) +{ + int tc = tc_classid_to_hwtc(vsi->netdev, cls_flower->classid); + struct i40e_cloud_filter *filter = NULL; + struct i40e_pf *pf = vsi->back; + int err = 0; + + if (tc < 0) { + dev_err(&vsi->back->pdev->dev, "Invalid traffic class\n"); + return -EOPNOTSUPP; + } + + if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state) || + test_bit(__I40E_RESET_INTR_RECEIVED, pf->state)) + return -EBUSY; + + if (pf->fdir_pf_active_filters || + (!hlist_empty(&pf->fdir_filter_list))) { + dev_err(&vsi->back->pdev->dev, + "Flow Director Sideband filters exists, turn ntuple off to configure cloud filters\n"); + return -EINVAL; + } + + if (vsi->back->flags & I40E_FLAG_FD_SB_ENABLED) { + dev_err(&vsi->back->pdev->dev, + "Disable Flow Director Sideband, configuring Cloud filters via tc-flower\n"); + vsi->back->flags &= ~I40E_FLAG_FD_SB_ENABLED; + vsi->back->flags |= I40E_FLAG_FD_SB_TO_CLOUD_FILTER; + } + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return -ENOMEM; + + filter->cookie = cls_flower->cookie; + + err = i40e_parse_cls_flower(vsi, cls_flower, filter); + if (err < 0) + goto err; + + err = i40e_handle_tclass(vsi, tc, filter); + if (err < 0) + goto err; + + /* Add cloud filter */ + if (filter->dst_port) + err = i40e_add_del_cloud_filter_big_buf(vsi, filter, true); + else + err = i40e_add_del_cloud_filter(vsi, filter, true); + + if (err) { + dev_err(&pf->pdev->dev, + "Failed to add cloud filter, err %s\n", + i40e_stat_str(&pf->hw, err)); + goto err; + } + + /* add filter to the ordered list */ + INIT_HLIST_NODE(&filter->cloud_node); + + hlist_add_head(&filter->cloud_node, &pf->cloud_filter_list); + + pf->num_cloud_filters++; + + return err; +err: + kfree(filter); + return err; +} + +/** + * i40e_find_cloud_filter - Find the could filter in the list + * @vsi: Pointer to VSI + * @cookie: filter specific cookie + * + **/ +static struct i40e_cloud_filter *i40e_find_cloud_filter(struct i40e_vsi *vsi, + unsigned long *cookie) +{ + struct i40e_cloud_filter *filter = NULL; + struct hlist_node *node2; + + hlist_for_each_entry_safe(filter, node2, + &vsi->back->cloud_filter_list, cloud_node) + if (!memcmp(cookie, &filter->cookie, sizeof(filter->cookie))) + return filter; + return NULL; +} + +/** + * i40e_delete_clsflower - Remove tc flower filters + * @vsi: Pointer to VSI + * @cls_flower: Pointer to struct tc_cls_flower_offload + * + **/ +static int i40e_delete_clsflower(struct i40e_vsi *vsi, + struct tc_cls_flower_offload *cls_flower) +{ + struct i40e_cloud_filter *filter = NULL; + struct i40e_pf *pf = vsi->back; + int err = 0; + + filter = i40e_find_cloud_filter(vsi, &cls_flower->cookie); + + if (!filter) + return -EINVAL; + + 
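/* Unlink the filter from the PF list first; the memory is freed below
+  * regardless of whether the HW/FW delete succeeds.
+  */
+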
hash_del(&filter->cloud_node); + + if (filter->dst_port) + err = i40e_add_del_cloud_filter_big_buf(vsi, filter, false); + else + err = i40e_add_del_cloud_filter(vsi, filter, false); + + kfree(filter); + if (err) { + dev_err(&pf->pdev->dev, + "Failed to delete cloud filter, err %s\n", + i40e_stat_str(&pf->hw, err)); + return i40e_aq_rc_to_posix(err, pf->hw.aq.asq_last_status); + } + + pf->num_cloud_filters--; + if (!pf->num_cloud_filters) + if ((pf->flags & I40E_FLAG_FD_SB_TO_CLOUD_FILTER) && + !(pf->flags & I40E_FLAG_FD_SB_INACTIVE)) { + pf->flags |= I40E_FLAG_FD_SB_ENABLED; + pf->flags &= ~I40E_FLAG_FD_SB_TO_CLOUD_FILTER; + pf->flags &= ~I40E_FLAG_FD_SB_INACTIVE; + } + return 0; +} + +/** + * i40e_setup_tc_cls_flower - flower classifier offloads + * @netdev: net device to configure + * @type_data: offload data + **/ +static int i40e_setup_tc_cls_flower(struct i40e_netdev_priv *np, + struct tc_cls_flower_offload *cls_flower) +{ + struct i40e_vsi *vsi = np->vsi; + + switch (cls_flower->command) { + case TC_CLSFLOWER_REPLACE: + return i40e_configure_clsflower(vsi, cls_flower); + case TC_CLSFLOWER_DESTROY: + return i40e_delete_clsflower(vsi, cls_flower); + case TC_CLSFLOWER_STATS: + return -EOPNOTSUPP; + default: + return -EINVAL; + } +} + +static int i40e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct i40e_netdev_priv *np = cb_priv; + + if (!tc_cls_can_offload_and_chain0(np->vsi->netdev, type_data)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return i40e_setup_tc_cls_flower(np, type_data); + + default: + return -EOPNOTSUPP; + } +} + +static int i40e_setup_tc_block(struct net_device *dev, + struct tc_block_offload *f) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + + if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + switch (f->command) { + case TC_BLOCK_BIND: + return tcf_block_cb_register(f->block, i40e_setup_tc_block_cb, + np, np); + case TC_BLOCK_UNBIND: + tcf_block_cb_unregister(f->block, i40e_setup_tc_block_cb, np); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int __i40e_setup_tc(struct net_device *netdev, enum tc_setup_type type, + void *type_data) +{ + switch (type) { + case TC_SETUP_QDISC_MQPRIO: + return i40e_setup_tc(netdev, type_data); + case TC_SETUP_BLOCK: + return i40e_setup_tc_block(netdev, type_data); + default: + return -EOPNOTSUPP; + } +} + +/** + * i40e_open - Called when a network interface is made active + * @netdev: network interface device structure + * + * The open entry point is called when a network interface is made + * active by the system (IFF_UP). At this point all resources needed + * for transmit and receive operations are allocated, the interrupt + * handler is registered with the OS, the netdev watchdog subtask is + * enabled, and the stack is notified that the interface is ready. 
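+ * The PHY link is requested up before the VSI is opened, and the global
+ * TSO mask registers are programmed once the VSI is up.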
+ * + * Returns 0 on success, negative value on failure + **/ +int i40e_open(struct net_device *netdev) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + int err; + + /* disallow open during test or if eeprom is broken */ + if (test_bit(__I40E_TESTING, pf->state) || + test_bit(__I40E_BAD_EEPROM, pf->state)) + return -EBUSY; + + netif_carrier_off(netdev); + + if (i40e_force_link_state(pf, true)) + return -EAGAIN; + + err = i40e_vsi_open(vsi); + if (err) + return err; + + /* configure global TSO hardware offload settings */ + wr32(&pf->hw, I40E_GLLAN_TSOMSK_F, be32_to_cpu(TCP_FLAG_PSH | + TCP_FLAG_FIN) >> 16); + wr32(&pf->hw, I40E_GLLAN_TSOMSK_M, be32_to_cpu(TCP_FLAG_PSH | + TCP_FLAG_FIN | + TCP_FLAG_CWR) >> 16); + wr32(&pf->hw, I40E_GLLAN_TSOMSK_L, be32_to_cpu(TCP_FLAG_CWR) >> 16); + + udp_tunnel_get_rx_info(netdev); + + return 0; +} + +/** + * i40e_vsi_open - + * @vsi: the VSI to open + * + * Finish initialization of the VSI. + * + * Returns 0 on success, negative value on failure + * + * Note: expects to be called while under rtnl_lock() + **/ +int i40e_vsi_open(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + char int_name[I40E_INT_NAME_STR_LEN]; + int err; + + /* allocate descriptors */ + err = i40e_vsi_setup_tx_resources(vsi); + if (err) + goto err_setup_tx; + err = i40e_vsi_setup_rx_resources(vsi); + if (err) + goto err_setup_rx; + + err = i40e_vsi_configure(vsi); + if (err) + goto err_setup_rx; + + if (vsi->netdev) { + snprintf(int_name, sizeof(int_name) - 1, "%s-%s", + dev_driver_string(&pf->pdev->dev), vsi->netdev->name); + err = i40e_vsi_request_irq(vsi, int_name); + if (err) + goto err_setup_rx; + + /* Notify the stack of the actual queue counts. */ + err = netif_set_real_num_tx_queues(vsi->netdev, + vsi->num_queue_pairs); + if (err) + goto err_set_queues; + + err = netif_set_real_num_rx_queues(vsi->netdev, + vsi->num_queue_pairs); + if (err) + goto err_set_queues; + + } else if (vsi->type == I40E_VSI_FDIR) { + snprintf(int_name, sizeof(int_name) - 1, "%s-%s:fdir", + dev_driver_string(&pf->pdev->dev), + dev_name(&pf->pdev->dev)); + err = i40e_vsi_request_irq(vsi, int_name); + + } else { + err = -EINVAL; + goto err_setup_rx; + } + + err = i40e_up_complete(vsi); + if (err) + goto err_up_complete; + + return 0; + +err_up_complete: + i40e_down(vsi); +err_set_queues: + i40e_vsi_free_irq(vsi); +err_setup_rx: + i40e_vsi_free_rx_resources(vsi); +err_setup_tx: + i40e_vsi_free_tx_resources(vsi); + if (vsi == pf->vsi[pf->lan_vsi]) + i40e_do_reset(pf, I40E_PF_RESET_FLAG, true); + + return err; +} + +/** + * i40e_fdir_filter_exit - Cleans up the Flow Director accounting + * @pf: Pointer to PF + * + * This function destroys the hlist where all the Flow Director + * filters were saved. 
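+ * It also frees the L3/L4 flexible payload (flex PIT) entries and
+ * restores the default input set for the common IPv4 flow types.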
+ **/ +static void i40e_fdir_filter_exit(struct i40e_pf *pf) +{ + struct i40e_fdir_filter *filter; + struct i40e_flex_pit *pit_entry, *tmp; + struct hlist_node *node2; + + hlist_for_each_entry_safe(filter, node2, + &pf->fdir_filter_list, fdir_node) { + hlist_del(&filter->fdir_node); + kfree(filter); + } + + list_for_each_entry_safe(pit_entry, tmp, &pf->l3_flex_pit_list, list) { + list_del(&pit_entry->list); + kfree(pit_entry); + } + INIT_LIST_HEAD(&pf->l3_flex_pit_list); + + list_for_each_entry_safe(pit_entry, tmp, &pf->l4_flex_pit_list, list) { + list_del(&pit_entry->list); + kfree(pit_entry); + } + INIT_LIST_HEAD(&pf->l4_flex_pit_list); + + pf->fdir_pf_active_filters = 0; + pf->fd_tcp4_filter_cnt = 0; + pf->fd_udp4_filter_cnt = 0; + pf->fd_sctp4_filter_cnt = 0; + pf->fd_ip4_filter_cnt = 0; + + /* Reprogram the default input set for TCP/IPv4 */ + i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_TCP, + I40E_L3_SRC_MASK | I40E_L3_DST_MASK | + I40E_L4_SRC_MASK | I40E_L4_DST_MASK); + + /* Reprogram the default input set for UDP/IPv4 */ + i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_UDP, + I40E_L3_SRC_MASK | I40E_L3_DST_MASK | + I40E_L4_SRC_MASK | I40E_L4_DST_MASK); + + /* Reprogram the default input set for SCTP/IPv4 */ + i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_SCTP, + I40E_L3_SRC_MASK | I40E_L3_DST_MASK | + I40E_L4_SRC_MASK | I40E_L4_DST_MASK); + + /* Reprogram the default input set for Other/IPv4 */ + i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_OTHER, + I40E_L3_SRC_MASK | I40E_L3_DST_MASK); + + i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_FRAG_IPV4, + I40E_L3_SRC_MASK | I40E_L3_DST_MASK); +} + +/** + * i40e_cloud_filter_exit - Cleans up the cloud filters + * @pf: Pointer to PF + * + * This function destroys the hlist where all the cloud filters + * were saved. + **/ +static void i40e_cloud_filter_exit(struct i40e_pf *pf) +{ + struct i40e_cloud_filter *cfilter; + struct hlist_node *node; + + hlist_for_each_entry_safe(cfilter, node, + &pf->cloud_filter_list, cloud_node) { + hlist_del(&cfilter->cloud_node); + kfree(cfilter); + } + pf->num_cloud_filters = 0; + + if ((pf->flags & I40E_FLAG_FD_SB_TO_CLOUD_FILTER) && + !(pf->flags & I40E_FLAG_FD_SB_INACTIVE)) { + pf->flags |= I40E_FLAG_FD_SB_ENABLED; + pf->flags &= ~I40E_FLAG_FD_SB_TO_CLOUD_FILTER; + pf->flags &= ~I40E_FLAG_FD_SB_INACTIVE; + } +} + +/** + * i40e_close - Disables a network interface + * @netdev: network interface device structure + * + * The close entry point is called when an interface is de-activated + * by the OS. The hardware is still under the driver's control, but + * this netdev interface is disabled. + * + * Returns 0, this is not allowed to fail + **/ +int i40e_close(struct net_device *netdev) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + + i40e_vsi_close(vsi); + + return 0; +} + +/** + * i40e_do_reset - Start a PF or Core Reset sequence + * @pf: board private structure + * @reset_flags: which reset is requested + * @lock_acquired: indicates whether or not the lock has been acquired + * before this function was called. + * + * The essential difference in resets is that the PF Reset + * doesn't clear the packet buffers, doesn't reset the PE + * firmware, and doesn't bother the other PFs on the chip. 
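+ * Only the largest reset that was requested is performed: a Global Reset
+ * is the most disruptive, then a Core Reset, then a PF Reset.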
+ **/ +void i40e_do_reset(struct i40e_pf *pf, u32 reset_flags, bool lock_acquired) +{ + u32 val; + + WARN_ON(in_interrupt()); + + + /* do the biggest reset indicated */ + if (reset_flags & BIT_ULL(__I40E_GLOBAL_RESET_REQUESTED)) { + + /* Request a Global Reset + * + * This will start the chip's countdown to the actual full + * chip reset event, and a warning interrupt to be sent + * to all PFs, including the requestor. Our handler + * for the warning interrupt will deal with the shutdown + * and recovery of the switch setup. + */ + dev_dbg(&pf->pdev->dev, "GlobalR requested\n"); + val = rd32(&pf->hw, I40E_GLGEN_RTRIG); + val |= I40E_GLGEN_RTRIG_GLOBR_MASK; + wr32(&pf->hw, I40E_GLGEN_RTRIG, val); + + } else if (reset_flags & BIT_ULL(__I40E_CORE_RESET_REQUESTED)) { + + /* Request a Core Reset + * + * Same as Global Reset, except does *not* include the MAC/PHY + */ + dev_dbg(&pf->pdev->dev, "CoreR requested\n"); + val = rd32(&pf->hw, I40E_GLGEN_RTRIG); + val |= I40E_GLGEN_RTRIG_CORER_MASK; + wr32(&pf->hw, I40E_GLGEN_RTRIG, val); + i40e_flush(&pf->hw); + + } else if (reset_flags & I40E_PF_RESET_FLAG) { + + /* Request a PF Reset + * + * Resets only the PF-specific registers + * + * This goes directly to the tear-down and rebuild of + * the switch, since we need to do all the recovery as + * for the Core Reset. + */ + dev_dbg(&pf->pdev->dev, "PFR requested\n"); + i40e_handle_reset_warning(pf, lock_acquired); + + } else if (reset_flags & BIT_ULL(__I40E_REINIT_REQUESTED)) { + int v; + + /* Find the VSI(s) that requested a re-init */ + dev_info(&pf->pdev->dev, + "VSI reinit requested\n"); + for (v = 0; v < pf->num_alloc_vsi; v++) { + struct i40e_vsi *vsi = pf->vsi[v]; + + if (vsi != NULL && + test_and_clear_bit(__I40E_VSI_REINIT_REQUESTED, + vsi->state)) + i40e_vsi_reinit_locked(pf->vsi[v]); + } + } else if (reset_flags & BIT_ULL(__I40E_DOWN_REQUESTED)) { + int v; + + /* Find the VSI(s) that needs to be brought down */ + dev_info(&pf->pdev->dev, "VSI down requested\n"); + for (v = 0; v < pf->num_alloc_vsi; v++) { + struct i40e_vsi *vsi = pf->vsi[v]; + + if (vsi != NULL && + test_and_clear_bit(__I40E_VSI_DOWN_REQUESTED, + vsi->state)) { + set_bit(__I40E_VSI_DOWN, vsi->state); + i40e_down(vsi); + } + } + } else { + dev_info(&pf->pdev->dev, + "bad reset request 0x%08x\n", reset_flags); + } +} + +#ifdef CONFIG_I40E_DCB +/** + * i40e_dcb_need_reconfig - Check if DCB needs reconfig + * @pf: board private structure + * @old_cfg: current DCB config + * @new_cfg: new DCB config + **/ +bool i40e_dcb_need_reconfig(struct i40e_pf *pf, + struct i40e_dcbx_config *old_cfg, + struct i40e_dcbx_config *new_cfg) +{ + bool need_reconfig = false; + + /* Check if ETS configuration has changed */ + if (memcmp(&new_cfg->etscfg, + &old_cfg->etscfg, + sizeof(new_cfg->etscfg))) { + /* If Priority Table has changed reconfig is needed */ + if (memcmp(&new_cfg->etscfg.prioritytable, + &old_cfg->etscfg.prioritytable, + sizeof(new_cfg->etscfg.prioritytable))) { + need_reconfig = true; + dev_dbg(&pf->pdev->dev, "ETS UP2TC changed.\n"); + } + + if (memcmp(&new_cfg->etscfg.tcbwtable, + &old_cfg->etscfg.tcbwtable, + sizeof(new_cfg->etscfg.tcbwtable))) + dev_dbg(&pf->pdev->dev, "ETS TC BW Table changed.\n"); + + if (memcmp(&new_cfg->etscfg.tsatable, + &old_cfg->etscfg.tsatable, + sizeof(new_cfg->etscfg.tsatable))) + dev_dbg(&pf->pdev->dev, "ETS TSA Table changed.\n"); + } + + /* Check if PFC configuration has changed */ + if (memcmp(&new_cfg->pfc, + &old_cfg->pfc, + sizeof(new_cfg->pfc))) { + need_reconfig = true; + 
dev_dbg(&pf->pdev->dev, "PFC config change detected.\n"); + } + + /* Check if APP Table has changed */ + if (memcmp(&new_cfg->app, + &old_cfg->app, + sizeof(new_cfg->app))) { + need_reconfig = true; + dev_dbg(&pf->pdev->dev, "APP Table change detected.\n"); + } + + dev_dbg(&pf->pdev->dev, "dcb need_reconfig=%d\n", need_reconfig); + return need_reconfig; +} + +/** + * i40e_handle_lldp_event - Handle LLDP Change MIB event + * @pf: board private structure + * @e: event info posted on ARQ + **/ +static int i40e_handle_lldp_event(struct i40e_pf *pf, + struct i40e_arq_event_info *e) +{ + struct i40e_aqc_lldp_get_mib *mib = + (struct i40e_aqc_lldp_get_mib *)&e->desc.params.raw; + struct i40e_hw *hw = &pf->hw; + struct i40e_dcbx_config tmp_dcbx_cfg; + bool need_reconfig = false; + int ret = 0; + u8 type; + + /* Not DCB capable or capability disabled */ + if (!(pf->flags & I40E_FLAG_DCB_CAPABLE)) + return ret; + + /* Ignore if event is not for Nearest Bridge */ + type = ((mib->type >> I40E_AQ_LLDP_BRIDGE_TYPE_SHIFT) + & I40E_AQ_LLDP_BRIDGE_TYPE_MASK); + dev_dbg(&pf->pdev->dev, "LLDP event mib bridge type 0x%x\n", type); + if (type != I40E_AQ_LLDP_BRIDGE_TYPE_NEAREST_BRIDGE) + return ret; + + /* Check MIB Type and return if event for Remote MIB update */ + type = mib->type & I40E_AQ_LLDP_MIB_TYPE_MASK; + dev_dbg(&pf->pdev->dev, + "LLDP event mib type %s\n", type ? "remote" : "local"); + if (type == I40E_AQ_LLDP_MIB_REMOTE) { + /* Update the remote cached instance and return */ + ret = i40e_aq_get_dcb_config(hw, I40E_AQ_LLDP_MIB_REMOTE, + I40E_AQ_LLDP_BRIDGE_TYPE_NEAREST_BRIDGE, + &hw->remote_dcbx_config); + goto exit; + } + + /* Store the old configuration */ + tmp_dcbx_cfg = hw->local_dcbx_config; + + /* Reset the old DCBx configuration data */ + memset(&hw->local_dcbx_config, 0, sizeof(hw->local_dcbx_config)); + /* Get updated DCBX data from firmware */ + ret = i40e_get_dcb_config(&pf->hw); + if (ret) { + dev_info(&pf->pdev->dev, + "Failed querying DCB configuration data from firmware, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + goto exit; + } + + /* No change detected in DCBX configs */ + if (!memcmp(&tmp_dcbx_cfg, &hw->local_dcbx_config, + sizeof(tmp_dcbx_cfg))) { + dev_dbg(&pf->pdev->dev, "No change detected in DCBX configuration.\n"); + goto exit; + } + + need_reconfig = i40e_dcb_need_reconfig(pf, &tmp_dcbx_cfg, + &hw->local_dcbx_config); + + i40e_dcbnl_flush_apps(pf, &tmp_dcbx_cfg, &hw->local_dcbx_config); + + if (!need_reconfig) + goto exit; + + /* Enable DCB tagging only when more than one TC */ + if (i40e_dcb_get_num_tc(&hw->local_dcbx_config) > 1) + pf->flags |= I40E_FLAG_DCB_ENABLED; + else + pf->flags &= ~I40E_FLAG_DCB_ENABLED; + + set_bit(__I40E_PORT_SUSPENDED, pf->state); + /* Reconfiguration needed quiesce all VSIs */ + i40e_pf_quiesce_all_vsi(pf); + + /* Changes in configuration update VEB/VSI */ + i40e_dcb_reconfigure(pf); + + ret = i40e_resume_port_tx(pf); + + clear_bit(__I40E_PORT_SUSPENDED, pf->state); + /* In case of error no point in resuming VSIs */ + if (ret) + goto exit; + + /* Wait for the PF's queues to be disabled */ + ret = i40e_pf_wait_queues_disabled(pf); + if (ret) { + /* Schedule PF reset to recover */ + set_bit(__I40E_PF_RESET_REQUESTED, pf->state); + i40e_service_event_schedule(pf); + } else { + i40e_pf_unquiesce_all_vsi(pf); + set_bit(__I40E_CLIENT_SERVICE_REQUESTED, pf->state); + set_bit(__I40E_CLIENT_L2_CHANGE, pf->state); + } + +exit: + return ret; +} +#endif /* CONFIG_I40E_DCB */ + +/** + * 
i40e_do_reset_safe - Protected reset path for userland calls. + * @pf: board private structure + * @reset_flags: which reset is requested + * + **/ +void i40e_do_reset_safe(struct i40e_pf *pf, u32 reset_flags) +{ + rtnl_lock(); + i40e_do_reset(pf, reset_flags, true); + rtnl_unlock(); +} + +/** + * i40e_handle_lan_overflow_event - Handler for LAN queue overflow event + * @pf: board private structure + * @e: event info posted on ARQ + * + * Handler for LAN Queue Overflow Event generated by the firmware for PF + * and VF queues + **/ +static void i40e_handle_lan_overflow_event(struct i40e_pf *pf, + struct i40e_arq_event_info *e) +{ + struct i40e_aqc_lan_overflow *data = + (struct i40e_aqc_lan_overflow *)&e->desc.params.raw; + u32 queue = le32_to_cpu(data->prtdcb_rupto); + u32 qtx_ctl = le32_to_cpu(data->otx_ctl); + struct i40e_hw *hw = &pf->hw; + struct i40e_vf *vf; + u16 vf_id; + + dev_dbg(&pf->pdev->dev, "overflow Rx Queue Number = %d QTX_CTL=0x%08x\n", + queue, qtx_ctl); + + /* Queue belongs to VF, find the VF and issue VF reset */ + if (((qtx_ctl & I40E_QTX_CTL_PFVF_Q_MASK) + >> I40E_QTX_CTL_PFVF_Q_SHIFT) == I40E_QTX_CTL_VF_QUEUE) { + vf_id = (u16)((qtx_ctl & I40E_QTX_CTL_VFVM_INDX_MASK) + >> I40E_QTX_CTL_VFVM_INDX_SHIFT); + vf_id -= hw->func_caps.vf_base_id; + vf = &pf->vf[vf_id]; + i40e_vc_notify_vf_reset(vf); + /* Allow VF to process pending reset notification */ + msleep(20); + i40e_reset_vf(vf, false); + } +} + +/** + * i40e_get_cur_guaranteed_fd_count - Get the consumed guaranteed FD filters + * @pf: board private structure + **/ +u32 i40e_get_cur_guaranteed_fd_count(struct i40e_pf *pf) +{ + u32 val, fcnt_prog; + + val = rd32(&pf->hw, I40E_PFQF_FDSTAT); + fcnt_prog = (val & I40E_PFQF_FDSTAT_GUARANT_CNT_MASK); + return fcnt_prog; +} + +/** + * i40e_get_current_fd_count - Get total FD filters programmed for this PF + * @pf: board private structure + **/ +u32 i40e_get_current_fd_count(struct i40e_pf *pf) +{ + u32 val, fcnt_prog; + + val = rd32(&pf->hw, I40E_PFQF_FDSTAT); + fcnt_prog = (val & I40E_PFQF_FDSTAT_GUARANT_CNT_MASK) + + ((val & I40E_PFQF_FDSTAT_BEST_CNT_MASK) >> + I40E_PFQF_FDSTAT_BEST_CNT_SHIFT); + return fcnt_prog; +} + +/** + * i40e_get_global_fd_count - Get total FD filters programmed on device + * @pf: board private structure + **/ +u32 i40e_get_global_fd_count(struct i40e_pf *pf) +{ + u32 val, fcnt_prog; + + val = rd32(&pf->hw, I40E_GLQF_FDCNT_0); + fcnt_prog = (val & I40E_GLQF_FDCNT_0_GUARANT_CNT_MASK) + + ((val & I40E_GLQF_FDCNT_0_BESTCNT_MASK) >> + I40E_GLQF_FDCNT_0_BESTCNT_SHIFT); + return fcnt_prog; +} + +/** + * i40e_reenable_fdir_sb - Restore FDir SB capability + * @pf: board private structure + **/ +static void i40e_reenable_fdir_sb(struct i40e_pf *pf) +{ + if (test_and_clear_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) + if ((pf->flags & I40E_FLAG_FD_SB_ENABLED) && + (I40E_DEBUG_FD & pf->hw.debug_mask)) + dev_info(&pf->pdev->dev, "FD Sideband/ntuple is being enabled since we have space in the table now\n"); +} + +/** + * i40e_reenable_fdir_atr - Restore FDir ATR capability + * @pf: board private structure + **/ +static void i40e_reenable_fdir_atr(struct i40e_pf *pf) +{ + if (test_and_clear_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state)) { + /* ATR uses the same filtering logic as SB rules. It only + * functions properly if the input set mask is at the default + * settings. It is safe to restore the default input set + * because there are no active TCPv4 filter rules. 
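+ * Only the TCP/IPv4 input set is restored here because ATR operates
+ * on TCP flows.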
+ */ + i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_TCP, + I40E_L3_SRC_MASK | I40E_L3_DST_MASK | + I40E_L4_SRC_MASK | I40E_L4_DST_MASK); + + if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) && + (I40E_DEBUG_FD & pf->hw.debug_mask)) + dev_info(&pf->pdev->dev, "ATR is being enabled since we have space in the table and there are no conflicting ntuple rules\n"); + } +} + +/** + * i40e_delete_invalid_filter - Delete an invalid FDIR filter + * @pf: board private structure + * @filter: FDir filter to remove + */ +static void i40e_delete_invalid_filter(struct i40e_pf *pf, + struct i40e_fdir_filter *filter) +{ + /* Update counters */ + pf->fdir_pf_active_filters--; + pf->fd_inv = 0; + + switch (filter->flow_type) { + case TCP_V4_FLOW: + pf->fd_tcp4_filter_cnt--; + break; + case UDP_V4_FLOW: + pf->fd_udp4_filter_cnt--; + break; + case SCTP_V4_FLOW: + pf->fd_sctp4_filter_cnt--; + break; + case IP_USER_FLOW: + switch (filter->ip4_proto) { + case IPPROTO_TCP: + pf->fd_tcp4_filter_cnt--; + break; + case IPPROTO_UDP: + pf->fd_udp4_filter_cnt--; + break; + case IPPROTO_SCTP: + pf->fd_sctp4_filter_cnt--; + break; + case IPPROTO_IP: + pf->fd_ip4_filter_cnt--; + break; + } + break; + } + + /* Remove the filter from the list and free memory */ + hlist_del(&filter->fdir_node); + kfree(filter); +} + +/** + * i40e_fdir_check_and_reenable - Function to reenabe FD ATR or SB if disabled + * @pf: board private structure + **/ +void i40e_fdir_check_and_reenable(struct i40e_pf *pf) +{ + struct i40e_fdir_filter *filter; + u32 fcnt_prog, fcnt_avail; + struct hlist_node *node; + + if (test_bit(__I40E_FD_FLUSH_REQUESTED, pf->state)) + return; + + /* Check if we have enough room to re-enable FDir SB capability. */ + fcnt_prog = i40e_get_global_fd_count(pf); + fcnt_avail = pf->fdir_pf_filter_count; + if ((fcnt_prog < (fcnt_avail - I40E_FDIR_BUFFER_HEAD_ROOM)) || + (pf->fd_add_err == 0) || + (i40e_get_current_atr_cnt(pf) < pf->fd_atr_cnt)) + i40e_reenable_fdir_sb(pf); + + /* We should wait for even more space before re-enabling ATR. + * Additionally, we cannot enable ATR as long as we still have TCP SB + * rules active. + */ + if ((fcnt_prog < (fcnt_avail - I40E_FDIR_BUFFER_HEAD_ROOM_FOR_ATR)) && + (pf->fd_tcp4_filter_cnt == 0)) + i40e_reenable_fdir_atr(pf); + + /* if hw had a problem adding a filter, delete it */ + if (pf->fd_inv > 0) { + hlist_for_each_entry_safe(filter, node, + &pf->fdir_filter_list, fdir_node) + if (filter->fd_id == pf->fd_inv) + i40e_delete_invalid_filter(pf, filter); + } +} + +#define I40E_MIN_FD_FLUSH_INTERVAL 10 +#define I40E_MIN_FD_FLUSH_SB_ATR_UNSTABLE 30 +/** + * i40e_fdir_flush_and_replay - Function to flush all FD filters and replay SB + * @pf: board private structure + **/ +static void i40e_fdir_flush_and_replay(struct i40e_pf *pf) +{ + unsigned long min_flush_time; + int flush_wait_retry = 50; + bool disable_atr = false; + int fd_room; + int reg; + + if (!time_after(jiffies, pf->fd_flush_timestamp + + (I40E_MIN_FD_FLUSH_INTERVAL * HZ))) + return; + + /* If the flush is happening too quick and we have mostly SB rules we + * should not re-enable ATR for some time. 
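+ * i40e_fdir_check_and_reenable() will re-enable ATR later, once the
+ * filter table has room again and no TCP sideband rules remain.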
+ */ + min_flush_time = pf->fd_flush_timestamp + + (I40E_MIN_FD_FLUSH_SB_ATR_UNSTABLE * HZ); + fd_room = pf->fdir_pf_filter_count - pf->fdir_pf_active_filters; + + if (!(time_after(jiffies, min_flush_time)) && + (fd_room < I40E_FDIR_BUFFER_HEAD_ROOM_FOR_ATR)) { + if (I40E_DEBUG_FD & pf->hw.debug_mask) + dev_info(&pf->pdev->dev, "ATR disabled, not enough FD filter space.\n"); + disable_atr = true; + } + + pf->fd_flush_timestamp = jiffies; + set_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state); + /* flush all filters */ + wr32(&pf->hw, I40E_PFQF_CTL_1, + I40E_PFQF_CTL_1_CLEARFDTABLE_MASK); + i40e_flush(&pf->hw); + pf->fd_flush_cnt++; + pf->fd_add_err = 0; + do { + /* Check FD flush status every 5-6msec */ + usleep_range(5000, 6000); + reg = rd32(&pf->hw, I40E_PFQF_CTL_1); + if (!(reg & I40E_PFQF_CTL_1_CLEARFDTABLE_MASK)) + break; + } while (flush_wait_retry--); + if (reg & I40E_PFQF_CTL_1_CLEARFDTABLE_MASK) { + dev_warn(&pf->pdev->dev, "FD table did not flush, needs more time\n"); + } else { + /* replay sideband filters */ + i40e_fdir_filter_restore(pf->vsi[pf->lan_vsi]); + if (!disable_atr && !pf->fd_tcp4_filter_cnt) + clear_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state); + clear_bit(__I40E_FD_FLUSH_REQUESTED, pf->state); + if (I40E_DEBUG_FD & pf->hw.debug_mask) + dev_info(&pf->pdev->dev, "FD Filter table flushed and FD-SB replayed.\n"); + } +} + +/** + * i40e_get_current_atr_count - Get the count of total FD ATR filters programmed + * @pf: board private structure + **/ +u32 i40e_get_current_atr_cnt(struct i40e_pf *pf) +{ + return i40e_get_current_fd_count(pf) - pf->fdir_pf_active_filters; +} + +/* We can see up to 256 filter programming desc in transit if the filters are + * being applied really fast; before we see the first + * filter miss error on Rx queue 0. Accumulating enough error messages before + * reacting will make sure we don't cause flush too often. + */ +#define I40E_MAX_FD_PROGRAM_ERROR 256 + +/** + * i40e_fdir_reinit_subtask - Worker thread to reinit FDIR filter table + * @pf: board private structure + **/ +static void i40e_fdir_reinit_subtask(struct i40e_pf *pf) +{ + + /* if interface is down do nothing */ + if (test_bit(__I40E_DOWN, pf->state)) + return; + + if (test_bit(__I40E_FD_FLUSH_REQUESTED, pf->state)) + i40e_fdir_flush_and_replay(pf); + + i40e_fdir_check_and_reenable(pf); + +} + +/** + * i40e_vsi_link_event - notify VSI of a link event + * @vsi: vsi to be notified + * @link_up: link up or down + **/ +static void i40e_vsi_link_event(struct i40e_vsi *vsi, bool link_up) +{ + if (!vsi || test_bit(__I40E_VSI_DOWN, vsi->state)) + return; + + switch (vsi->type) { + case I40E_VSI_MAIN: + if (!vsi->netdev || !vsi->netdev_registered) + break; + + if (link_up) { + netif_carrier_on(vsi->netdev); + netif_tx_wake_all_queues(vsi->netdev); + } else { + netif_carrier_off(vsi->netdev); + netif_tx_stop_all_queues(vsi->netdev); + } + break; + + case I40E_VSI_SRIOV: + case I40E_VSI_VMDQ2: + case I40E_VSI_CTRL: + case I40E_VSI_IWARP: + case I40E_VSI_MIRROR: + default: + /* there is no notification for other VSIs */ + break; + } +} + +/** + * i40e_veb_link_event - notify elements on the veb of a link event + * @veb: veb to be notified + * @link_up: link up or down + **/ +static void i40e_veb_link_event(struct i40e_veb *veb, bool link_up) +{ + struct i40e_pf *pf; + int i; + + if (!veb || !veb->pf) + return; + pf = veb->pf; + + /* depth first... */ + for (i = 0; i < I40E_MAX_VEB; i++) + if (pf->veb[i] && (pf->veb[i]->uplink_seid == veb->seid)) + i40e_veb_link_event(pf->veb[i], link_up); + + /* ... 
now the local VSIs */ + for (i = 0; i < pf->num_alloc_vsi; i++) + if (pf->vsi[i] && (pf->vsi[i]->uplink_seid == veb->seid)) + i40e_vsi_link_event(pf->vsi[i], link_up); +} + +/** + * i40e_link_event - Update netif_carrier status + * @pf: board private structure + **/ +static void i40e_link_event(struct i40e_pf *pf) +{ + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + u8 new_link_speed, old_link_speed; + i40e_status status; + bool new_link, old_link; + + /* save off old link status information */ + pf->hw.phy.link_info_old = pf->hw.phy.link_info; + + /* set this to force the get_link_status call to refresh state */ + pf->hw.phy.get_link_info = true; + + old_link = (pf->hw.phy.link_info_old.link_info & I40E_AQ_LINK_UP); + + status = i40e_get_link_status(&pf->hw, &new_link); + + /* On success, disable temp link polling */ + if (status == I40E_SUCCESS) { + clear_bit(__I40E_TEMP_LINK_POLLING, pf->state); + } else { + /* Enable link polling temporarily until i40e_get_link_status + * returns I40E_SUCCESS + */ + set_bit(__I40E_TEMP_LINK_POLLING, pf->state); + dev_dbg(&pf->pdev->dev, "couldn't get link state, status: %d\n", + status); + return; + } + + old_link_speed = pf->hw.phy.link_info_old.link_speed; + new_link_speed = pf->hw.phy.link_info.link_speed; + + if (new_link == old_link && + new_link_speed == old_link_speed && + (test_bit(__I40E_VSI_DOWN, vsi->state) || + new_link == netif_carrier_ok(vsi->netdev))) + return; + + i40e_print_link_message(vsi, new_link); + + /* Notify the base of the switch tree connected to + * the link. Floating VEBs are not notified. + */ + if (pf->lan_veb != I40E_NO_VEB && pf->veb[pf->lan_veb]) + i40e_veb_link_event(pf->veb[pf->lan_veb], new_link); + else + i40e_vsi_link_event(vsi, new_link); + + if (pf->vf) + i40e_vc_notify_link_state(pf); + + if (pf->flags & I40E_FLAG_PTP) + i40e_ptp_set_increment(pf); +} + +/** + * i40e_watchdog_subtask - periodic checks not using event driven response + * @pf: board private structure + **/ +static void i40e_watchdog_subtask(struct i40e_pf *pf) +{ + int i; + + /* if interface is down do nothing */ + if (test_bit(__I40E_DOWN, pf->state) || + test_bit(__I40E_CONFIG_BUSY, pf->state)) + return; + + /* make sure we don't do these things too often */ + if (time_before(jiffies, (pf->service_timer_previous + + pf->service_timer_period))) + return; + pf->service_timer_previous = jiffies; + + if ((pf->flags & I40E_FLAG_LINK_POLLING_ENABLED) || + test_bit(__I40E_TEMP_LINK_POLLING, pf->state)) + i40e_link_event(pf); + + /* Update the stats for active netdevs so the network stack + * can look at updated numbers whenever it cares to + */ + for (i = 0; i < pf->num_alloc_vsi; i++) + if (pf->vsi[i] && pf->vsi[i]->netdev) + i40e_update_stats(pf->vsi[i]); + + if (pf->flags & I40E_FLAG_VEB_STATS_ENABLED) { + /* Update the stats for the active switching components */ + for (i = 0; i < I40E_MAX_VEB; i++) + if (pf->veb[i]) + i40e_update_veb_stats(pf->veb[i]); + } + + i40e_ptp_rx_hang(pf); + i40e_ptp_tx_hang(pf); +} + +/** + * i40e_reset_subtask - Set up for resetting the device and driver + * @pf: board private structure + **/ +static void i40e_reset_subtask(struct i40e_pf *pf) +{ + u32 reset_flags = 0; + + if (test_bit(__I40E_REINIT_REQUESTED, pf->state)) { + reset_flags |= BIT(__I40E_REINIT_REQUESTED); + clear_bit(__I40E_REINIT_REQUESTED, pf->state); + } + if (test_bit(__I40E_PF_RESET_REQUESTED, pf->state)) { + reset_flags |= BIT(__I40E_PF_RESET_REQUESTED); + clear_bit(__I40E_PF_RESET_REQUESTED, pf->state); + } + if 
(test_bit(__I40E_CORE_RESET_REQUESTED, pf->state)) { + reset_flags |= BIT(__I40E_CORE_RESET_REQUESTED); + clear_bit(__I40E_CORE_RESET_REQUESTED, pf->state); + } + if (test_bit(__I40E_GLOBAL_RESET_REQUESTED, pf->state)) { + reset_flags |= BIT(__I40E_GLOBAL_RESET_REQUESTED); + clear_bit(__I40E_GLOBAL_RESET_REQUESTED, pf->state); + } + if (test_bit(__I40E_DOWN_REQUESTED, pf->state)) { + reset_flags |= BIT(__I40E_DOWN_REQUESTED); + clear_bit(__I40E_DOWN_REQUESTED, pf->state); + } + + /* If there's a recovery already waiting, it takes + * precedence before starting a new reset sequence. + */ + if (test_bit(__I40E_RESET_INTR_RECEIVED, pf->state)) { + i40e_prep_for_reset(pf, false); + i40e_reset(pf); + i40e_rebuild(pf, false, false); + } + + /* If we're already down or resetting, just bail */ + if (reset_flags && + !test_bit(__I40E_DOWN, pf->state) && + !test_bit(__I40E_CONFIG_BUSY, pf->state)) { + i40e_do_reset(pf, reset_flags, false); + } +} + +/** + * i40e_handle_link_event - Handle link event + * @pf: board private structure + * @e: event info posted on ARQ + **/ +static void i40e_handle_link_event(struct i40e_pf *pf, + struct i40e_arq_event_info *e) +{ + struct i40e_aqc_get_link_status *status = + (struct i40e_aqc_get_link_status *)&e->desc.params.raw; + + /* Do a new status request to re-enable LSE reporting + * and load new status information into the hw struct + * This completely ignores any state information + * in the ARQ event info, instead choosing to always + * issue the AQ update link status command. + */ + i40e_link_event(pf); + + /* Check if module meets thermal requirements */ + if (status->phy_type == I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP) { + dev_err(&pf->pdev->dev, + "Rx/Tx is disabled on this device because the module does not meet thermal requirements.\n"); + dev_err(&pf->pdev->dev, + "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n"); + } else { + /* check for unqualified module, if link is down, suppress + * the message if link was forced to be down. 
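+ * (i.e. when I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED is set).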
+ */ + if ((status->link_info & I40E_AQ_MEDIA_AVAILABLE) && + (!(status->an_info & I40E_AQ_QUALIFIED_MODULE)) && + (!(status->link_info & I40E_AQ_LINK_UP)) && + (!(pf->flags & I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED))) { + dev_err(&pf->pdev->dev, + "Rx/Tx is disabled on this device because an unsupported SFP module type was detected.\n"); + dev_err(&pf->pdev->dev, + "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n"); + } + } +} + +/** + * i40e_clean_adminq_subtask - Clean the AdminQ rings + * @pf: board private structure + **/ +static void i40e_clean_adminq_subtask(struct i40e_pf *pf) +{ + struct i40e_arq_event_info event; + struct i40e_hw *hw = &pf->hw; + u16 pending, i = 0; + i40e_status ret; + u16 opcode; + u32 oldval; + u32 val; + + /* Do not run clean AQ when PF reset fails */ + if (test_bit(__I40E_RESET_FAILED, pf->state)) + return; + + /* check for error indications */ + val = rd32(&pf->hw, pf->hw.aq.arq.len); + oldval = val; + if (val & I40E_PF_ARQLEN_ARQVFE_MASK) { + if (hw->debug_mask & I40E_DEBUG_AQ) + dev_info(&pf->pdev->dev, "ARQ VF Error detected\n"); + val &= ~I40E_PF_ARQLEN_ARQVFE_MASK; + } + if (val & I40E_PF_ARQLEN_ARQOVFL_MASK) { + if (hw->debug_mask & I40E_DEBUG_AQ) + dev_info(&pf->pdev->dev, "ARQ Overflow Error detected\n"); + val &= ~I40E_PF_ARQLEN_ARQOVFL_MASK; + pf->arq_overflows++; + } + if (val & I40E_PF_ARQLEN_ARQCRIT_MASK) { + if (hw->debug_mask & I40E_DEBUG_AQ) + dev_info(&pf->pdev->dev, "ARQ Critical Error detected\n"); + val &= ~I40E_PF_ARQLEN_ARQCRIT_MASK; + } + if (oldval != val) + wr32(&pf->hw, pf->hw.aq.arq.len, val); + + val = rd32(&pf->hw, pf->hw.aq.asq.len); + oldval = val; + if (val & I40E_PF_ATQLEN_ATQVFE_MASK) { + if (pf->hw.debug_mask & I40E_DEBUG_AQ) + dev_info(&pf->pdev->dev, "ASQ VF Error detected\n"); + val &= ~I40E_PF_ATQLEN_ATQVFE_MASK; + } + if (val & I40E_PF_ATQLEN_ATQOVFL_MASK) { + if (pf->hw.debug_mask & I40E_DEBUG_AQ) + dev_info(&pf->pdev->dev, "ASQ Overflow Error detected\n"); + val &= ~I40E_PF_ATQLEN_ATQOVFL_MASK; + } + if (val & I40E_PF_ATQLEN_ATQCRIT_MASK) { + if (pf->hw.debug_mask & I40E_DEBUG_AQ) + dev_info(&pf->pdev->dev, "ASQ Critical Error detected\n"); + val &= ~I40E_PF_ATQLEN_ATQCRIT_MASK; + } + if (oldval != val) + wr32(&pf->hw, pf->hw.aq.asq.len, val); + + event.buf_len = I40E_MAX_AQ_BUF_SIZE; + event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL); + if (!event.msg_buf) + return; + + do { + ret = i40e_clean_arq_element(hw, &event, &pending); + if (ret == I40E_ERR_ADMIN_QUEUE_NO_WORK) + break; + else if (ret) { + dev_info(&pf->pdev->dev, "ARQ event error %d\n", ret); + break; + } + + opcode = le16_to_cpu(event.desc.opcode); + switch (opcode) { + + case i40e_aqc_opc_get_link_status: + i40e_handle_link_event(pf, &event); + break; + case i40e_aqc_opc_send_msg_to_pf: + ret = i40e_vc_process_vf_msg(pf, + le16_to_cpu(event.desc.retval), + le32_to_cpu(event.desc.cookie_high), + le32_to_cpu(event.desc.cookie_low), + event.msg_buf, + event.msg_len); + break; + case i40e_aqc_opc_lldp_update_mib: + dev_dbg(&pf->pdev->dev, "ARQ: Update LLDP MIB event received\n"); +#ifdef CONFIG_I40E_DCB + rtnl_lock(); + ret = i40e_handle_lldp_event(pf, &event); + rtnl_unlock(); +#endif /* CONFIG_I40E_DCB */ + break; + case i40e_aqc_opc_event_lan_overflow: + dev_dbg(&pf->pdev->dev, "ARQ LAN queue overflow event received\n"); + i40e_handle_lan_overflow_event(pf, &event); + break; + case i40e_aqc_opc_send_msg_to_peer: + dev_info(&pf->pdev->dev, "ARQ: Msg from other pf\n"); + break; + case i40e_aqc_opc_nvm_erase: 
+ case i40e_aqc_opc_nvm_update: + case i40e_aqc_opc_oem_post_update: + i40e_debug(&pf->hw, I40E_DEBUG_NVM, + "ARQ NVM operation 0x%04x completed\n", + opcode); + break; + default: + dev_info(&pf->pdev->dev, + "ARQ: Unknown event 0x%04x ignored\n", + opcode); + break; + } + } while (i++ < pf->adminq_work_limit); + + if (i < pf->adminq_work_limit) + clear_bit(__I40E_ADMINQ_EVENT_PENDING, pf->state); + + /* re-enable Admin queue interrupt cause */ + val = rd32(hw, I40E_PFINT_ICR0_ENA); + val |= I40E_PFINT_ICR0_ENA_ADMINQ_MASK; + wr32(hw, I40E_PFINT_ICR0_ENA, val); + i40e_flush(hw); + + kfree(event.msg_buf); +} + +/** + * i40e_verify_eeprom - make sure eeprom is good to use + * @pf: board private structure + **/ +static void i40e_verify_eeprom(struct i40e_pf *pf) +{ + int err; + + err = i40e_diag_eeprom_test(&pf->hw); + if (err) { + /* retry in case of garbage read */ + err = i40e_diag_eeprom_test(&pf->hw); + if (err) { + dev_info(&pf->pdev->dev, "eeprom check failed (%d), Tx/Rx traffic disabled\n", + err); + set_bit(__I40E_BAD_EEPROM, pf->state); + } + } + + if (!err && test_bit(__I40E_BAD_EEPROM, pf->state)) { + dev_info(&pf->pdev->dev, "eeprom check passed, Tx/Rx traffic enabled\n"); + clear_bit(__I40E_BAD_EEPROM, pf->state); + } +} + +/** + * i40e_enable_pf_switch_lb + * @pf: pointer to the PF structure + * + * enable switch loop back or die - no point in a return value + **/ +static void i40e_enable_pf_switch_lb(struct i40e_pf *pf) +{ + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + struct i40e_vsi_context ctxt; + int ret; + + ctxt.seid = pf->main_vsi_seid; + ctxt.pf_num = pf->hw.pf_id; + ctxt.vf_num = 0; + ret = i40e_aq_get_vsi_params(&pf->hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "couldn't get PF vsi config, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + return; + } + ctxt.flags = I40E_AQ_VSI_TYPE_PF; + ctxt.info.valid_sections = cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID); + ctxt.info.switch_id |= cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB); + + ret = i40e_aq_update_vsi_params(&vsi->back->hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "update vsi switch failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + } +} + +/** + * i40e_disable_pf_switch_lb + * @pf: pointer to the PF structure + * + * disable switch loop back or die - no point in a return value + **/ +static void i40e_disable_pf_switch_lb(struct i40e_pf *pf) +{ + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + struct i40e_vsi_context ctxt; + int ret; + + ctxt.seid = pf->main_vsi_seid; + ctxt.pf_num = pf->hw.pf_id; + ctxt.vf_num = 0; + ret = i40e_aq_get_vsi_params(&pf->hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "couldn't get PF vsi config, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + return; + } + ctxt.flags = I40E_AQ_VSI_TYPE_PF; + ctxt.info.valid_sections = cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID); + ctxt.info.switch_id &= ~cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB); + + ret = i40e_aq_update_vsi_params(&vsi->back->hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "update vsi switch failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + } +} + +/** + * i40e_config_bridge_mode - Configure the HW bridge mode + * @veb: pointer to the bridge instance + * + * Configure the loop back mode for the LAN VSI that is downlink to the + * 
specified HW bridge instance. It is expected this function is called + * when a new HW bridge is instantiated. + **/ +static void i40e_config_bridge_mode(struct i40e_veb *veb) +{ + struct i40e_pf *pf = veb->pf; + + if (pf->hw.debug_mask & I40E_DEBUG_LAN) + dev_info(&pf->pdev->dev, "enabling bridge mode: %s\n", + veb->bridge_mode == BRIDGE_MODE_VEPA ? "VEPA" : "VEB"); + if (veb->bridge_mode & BRIDGE_MODE_VEPA) + i40e_disable_pf_switch_lb(pf); + else + i40e_enable_pf_switch_lb(pf); +} + +/** + * i40e_reconstitute_veb - rebuild the VEB and anything connected to it + * @veb: pointer to the VEB instance + * + * This is a recursive function that first builds the attached VSIs then + * recurses in to build the next layer of VEB. We track the connections + * through our own index numbers because the seid's from the HW could + * change across the reset. + **/ +static int i40e_reconstitute_veb(struct i40e_veb *veb) +{ + struct i40e_vsi *ctl_vsi = NULL; + struct i40e_pf *pf = veb->pf; + int v, veb_idx; + int ret; + + /* build VSI that owns this VEB, temporarily attached to base VEB */ + for (v = 0; v < pf->num_alloc_vsi && !ctl_vsi; v++) { + if (pf->vsi[v] && + pf->vsi[v]->veb_idx == veb->idx && + pf->vsi[v]->flags & I40E_VSI_FLAG_VEB_OWNER) { + ctl_vsi = pf->vsi[v]; + break; + } + } + if (!ctl_vsi) { + dev_info(&pf->pdev->dev, + "missing owner VSI for veb_idx %d\n", veb->idx); + ret = -ENOENT; + goto end_reconstitute; + } + if (ctl_vsi != pf->vsi[pf->lan_vsi]) + ctl_vsi->uplink_seid = pf->vsi[pf->lan_vsi]->uplink_seid; + ret = i40e_add_vsi(ctl_vsi); + if (ret) { + dev_info(&pf->pdev->dev, + "rebuild of veb_idx %d owner VSI failed: %d\n", + veb->idx, ret); + goto end_reconstitute; + } + i40e_vsi_reset_stats(ctl_vsi); + + /* create the VEB in the switch and move the VSI onto the VEB */ + ret = i40e_add_veb(veb, ctl_vsi); + if (ret) + goto end_reconstitute; + + if (pf->flags & I40E_FLAG_VEB_MODE_ENABLED) + veb->bridge_mode = BRIDGE_MODE_VEB; + else + veb->bridge_mode = BRIDGE_MODE_VEPA; + i40e_config_bridge_mode(veb); + + /* create the remaining VSIs attached to this VEB */ + for (v = 0; v < pf->num_alloc_vsi; v++) { + if (!pf->vsi[v] || pf->vsi[v] == ctl_vsi) + continue; + + if (pf->vsi[v]->veb_idx == veb->idx) { + struct i40e_vsi *vsi = pf->vsi[v]; + + vsi->uplink_seid = veb->seid; + ret = i40e_add_vsi(vsi); + if (ret) { + dev_info(&pf->pdev->dev, + "rebuild of vsi_idx %d failed: %d\n", + v, ret); + goto end_reconstitute; + } + i40e_vsi_reset_stats(vsi); + } + } + + /* create any VEBs attached to this VEB - RECURSION */ + for (veb_idx = 0; veb_idx < I40E_MAX_VEB; veb_idx++) { + if (pf->veb[veb_idx] && pf->veb[veb_idx]->veb_idx == veb->idx) { + pf->veb[veb_idx]->uplink_seid = veb->seid; + ret = i40e_reconstitute_veb(pf->veb[veb_idx]); + if (ret) + break; + } + } + +end_reconstitute: + return ret; +} + +/** + * i40e_get_capabilities - get info about the HW + * @pf: the PF struct + **/ +static int i40e_get_capabilities(struct i40e_pf *pf, + enum i40e_admin_queue_opc list_type) +{ + struct i40e_aqc_list_capabilities_element_resp *cap_buf; + u16 data_size; + int buf_len; + int err; + + buf_len = 40 * sizeof(struct i40e_aqc_list_capabilities_element_resp); + do { + cap_buf = kzalloc(buf_len, GFP_KERNEL); + if (!cap_buf) + return -ENOMEM; + + /* this loads the data into the hw struct for us */ + err = i40e_aq_discover_capabilities(&pf->hw, cap_buf, buf_len, + &data_size, list_type, + NULL); + /* data loaded, buffer no longer needed */ + kfree(cap_buf); + + if (pf->hw.aq.asq_last_status == I40E_AQ_RC_ENOMEM) 
{ + /* retry with a larger buffer */ + buf_len = data_size; + } else if (pf->hw.aq.asq_last_status != I40E_AQ_RC_OK) { + dev_info(&pf->pdev->dev, + "capability discovery failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, err), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + return -ENODEV; + } + } while (err); + + if (pf->hw.debug_mask & I40E_DEBUG_USER) { + if (list_type == i40e_aqc_opc_list_func_capabilities) { + dev_info(&pf->pdev->dev, + "pf=%d, num_vfs=%d, msix_pf=%d, msix_vf=%d, fd_g=%d, fd_b=%d, pf_max_q=%d num_vsi=%d\n", + pf->hw.pf_id, pf->hw.func_caps.num_vfs, + pf->hw.func_caps.num_msix_vectors, + pf->hw.func_caps.num_msix_vectors_vf, + pf->hw.func_caps.fd_filters_guaranteed, + pf->hw.func_caps.fd_filters_best_effort, + pf->hw.func_caps.num_tx_qp, + pf->hw.func_caps.num_vsis); + } else if (list_type == i40e_aqc_opc_list_dev_capabilities) { + dev_info(&pf->pdev->dev, + "switch_mode=0x%04x, function_valid=0x%08x\n", + pf->hw.dev_caps.switch_mode, + pf->hw.dev_caps.valid_functions); + dev_info(&pf->pdev->dev, + "SR-IOV=%d, num_vfs for all function=%u\n", + pf->hw.dev_caps.sr_iov_1_1, + pf->hw.dev_caps.num_vfs); + dev_info(&pf->pdev->dev, + "num_vsis=%u, num_rx:%u, num_tx=%u\n", + pf->hw.dev_caps.num_vsis, + pf->hw.dev_caps.num_rx_qp, + pf->hw.dev_caps.num_tx_qp); + } + } + if (list_type == i40e_aqc_opc_list_func_capabilities) { +#define DEF_NUM_VSI (1 + (pf->hw.func_caps.fcoe ? 1 : 0) \ + + pf->hw.func_caps.num_vfs) + if (pf->hw.revision_id == 0 && + pf->hw.func_caps.num_vsis < DEF_NUM_VSI) { + dev_info(&pf->pdev->dev, + "got num_vsis %d, setting num_vsis to %d\n", + pf->hw.func_caps.num_vsis, DEF_NUM_VSI); + pf->hw.func_caps.num_vsis = DEF_NUM_VSI; + } + } + return 0; +} + +static int i40e_vsi_clear(struct i40e_vsi *vsi); + +/** + * i40e_fdir_sb_setup - initialize the Flow Director resources for Sideband + * @pf: board private structure + **/ +static void i40e_fdir_sb_setup(struct i40e_pf *pf) +{ + struct i40e_vsi *vsi; + + /* quick workaround for an NVM issue that leaves a critical register + * uninitialized + */ + if (!rd32(&pf->hw, I40E_GLQF_HKEY(0))) { + static const u32 hkey[] = { + 0xe640d33f, 0xcdfe98ab, 0x73fa7161, 0x0d7a7d36, + 0xeacb7d61, 0xaa4f05b6, 0x9c5c89ed, 0xfc425ddb, + 0xa4654832, 0xfc7461d4, 0x8f827619, 0xf5c63c21, + 0x95b3a76d}; + int i; + + for (i = 0; i <= I40E_GLQF_HKEY_MAX_INDEX; i++) + wr32(&pf->hw, I40E_GLQF_HKEY(i), hkey[i]); + } + + if (!(pf->flags & I40E_FLAG_FD_SB_ENABLED)) + return; + + /* find existing VSI and see if it needs configuring */ + vsi = i40e_find_vsi_by_type(pf, I40E_VSI_FDIR); + + /* create a new VSI if none exists */ + if (!vsi) { + vsi = i40e_vsi_setup(pf, I40E_VSI_FDIR, + pf->vsi[pf->lan_vsi]->seid, 0); + if (!vsi) { + dev_info(&pf->pdev->dev, "Couldn't create FDir VSI\n"); + pf->flags &= ~I40E_FLAG_FD_SB_ENABLED; + pf->flags |= I40E_FLAG_FD_SB_INACTIVE; + return; + } + } + + i40e_vsi_setup_irqhandler(vsi, i40e_fdir_clean_ring); +} + +/** + * i40e_fdir_teardown - release the Flow Director resources + * @pf: board private structure + **/ +static void i40e_fdir_teardown(struct i40e_pf *pf) +{ + struct i40e_vsi *vsi; + + i40e_fdir_filter_exit(pf); + vsi = i40e_find_vsi_by_type(pf, I40E_VSI_FDIR); + if (vsi) + i40e_vsi_release(vsi); +} + +/** + * i40e_rebuild_cloud_filters - Rebuilds cloud filters for VSIs + * @vsi: PF main vsi + * @seid: seid of main or channel VSIs + * + * Rebuilds cloud filters associated with main VSI and channel VSIs if they + * existed before reset + **/ +static int i40e_rebuild_cloud_filters(struct 
i40e_vsi *vsi, u16 seid) +{ + struct i40e_cloud_filter *cfilter; + struct i40e_pf *pf = vsi->back; + struct hlist_node *node; + i40e_status ret; + + /* Add cloud filters back if they exist */ + hlist_for_each_entry_safe(cfilter, node, &pf->cloud_filter_list, + cloud_node) { + if (cfilter->seid != seid) + continue; + + if (cfilter->dst_port) + ret = i40e_add_del_cloud_filter_big_buf(vsi, cfilter, + true); + else + ret = i40e_add_del_cloud_filter(vsi, cfilter, true); + + if (ret) { + dev_dbg(&pf->pdev->dev, + "Failed to rebuild cloud filter, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + return ret; + } + } + return 0; +} + +/** + * i40e_rebuild_channels - Rebuilds channel VSIs if they existed before reset + * @vsi: PF main vsi + * + * Rebuilds channel VSIs if they existed before reset + **/ +static int i40e_rebuild_channels(struct i40e_vsi *vsi) +{ + struct i40e_channel *ch, *ch_tmp; + i40e_status ret; + + if (list_empty(&vsi->ch_list)) + return 0; + + list_for_each_entry_safe(ch, ch_tmp, &vsi->ch_list, list) { + if (!ch->initialized) + break; + /* Proceed with creation of channel (VMDq2) VSI */ + ret = i40e_add_channel(vsi->back, vsi->uplink_seid, ch); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "failed to rebuild channels using uplink_seid %u\n", + vsi->uplink_seid); + return ret; + } + /* Reconfigure TX queues using QTX_CTL register */ + ret = i40e_channel_config_tx_ring(vsi->back, vsi, ch); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "failed to configure TX rings for channel %u\n", + ch->seid); + return ret; + } + /* update 'next_base_queue' */ + vsi->next_base_queue = vsi->next_base_queue + + ch->num_queue_pairs; + if (ch->max_tx_rate) { + u64 credits = ch->max_tx_rate; + + if (i40e_set_bw_limit(vsi, ch->seid, + ch->max_tx_rate)) + return -EINVAL; + + do_div(credits, I40E_BW_CREDIT_DIVISOR); + dev_dbg(&vsi->back->pdev->dev, + "Set tx rate of %llu Mbps (count of 50Mbps %llu) for vsi->seid %u\n", + ch->max_tx_rate, + credits, + ch->seid); + } + ret = i40e_rebuild_cloud_filters(vsi, ch->seid); + if (ret) { + dev_dbg(&vsi->back->pdev->dev, + "Failed to rebuild cloud filters for channel VSI %u\n", + ch->seid); + return ret; + } + } + return 0; +} + +/** + * i40e_prep_for_reset - prep for the core to reset + * @pf: board private structure + * @lock_acquired: indicates whether or not the lock has been acquired + * before this function was called. + * + * Close up the VFs and other things in prep for PF Reset. 
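+ * The VSIs are quiesced, then the AdminQ and the LAN HMC are shut down
+ * so that firmware sees a clean teardown before the reset is issued.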
+ **/ +static void i40e_prep_for_reset(struct i40e_pf *pf, bool lock_acquired) +{ + struct i40e_hw *hw = &pf->hw; + i40e_status ret = 0; + u32 v; + + clear_bit(__I40E_RESET_INTR_RECEIVED, pf->state); + if (test_and_set_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) + return; + if (i40e_check_asq_alive(&pf->hw)) + i40e_vc_notify_reset(pf); + + dev_dbg(&pf->pdev->dev, "Tearing down internal switch for reset\n"); + + /* quiesce the VSIs and their queues that are not already DOWN */ + /* pf_quiesce_all_vsi modifies netdev structures -rtnl_lock needed */ + if (!lock_acquired) + rtnl_lock(); + i40e_pf_quiesce_all_vsi(pf); + if (!lock_acquired) + rtnl_unlock(); + + for (v = 0; v < pf->num_alloc_vsi; v++) { + if (pf->vsi[v]) + pf->vsi[v]->seid = 0; + } + + i40e_shutdown_adminq(&pf->hw); + + /* call shutdown HMC */ + if (hw->hmc.hmc_obj) { + ret = i40e_shutdown_lan_hmc(hw); + if (ret) + dev_warn(&pf->pdev->dev, + "shutdown_lan_hmc failed: %d\n", ret); + } +} + +/** + * i40e_send_version - update firmware with driver version + * @pf: PF struct + */ +static void i40e_send_version(struct i40e_pf *pf) +{ + struct i40e_driver_version dv; + + dv.major_version = DRV_VERSION_MAJOR; + dv.minor_version = DRV_VERSION_MINOR; + dv.build_version = DRV_VERSION_BUILD; + dv.subbuild_version = 0; + strlcpy(dv.driver_string, DRV_VERSION, sizeof(dv.driver_string)); + i40e_aq_send_driver_version(&pf->hw, &dv, NULL); +} + +/** + * i40e_get_oem_version - get OEM specific version information + * @hw: pointer to the hardware structure + **/ +static void i40e_get_oem_version(struct i40e_hw *hw) +{ + u16 block_offset = 0xffff; + u16 block_length = 0; + u16 capabilities = 0; + u16 gen_snap = 0; + u16 release = 0; + +#define I40E_SR_NVM_OEM_VERSION_PTR 0x1B +#define I40E_NVM_OEM_LENGTH_OFFSET 0x00 +#define I40E_NVM_OEM_CAPABILITIES_OFFSET 0x01 +#define I40E_NVM_OEM_GEN_OFFSET 0x02 +#define I40E_NVM_OEM_RELEASE_OFFSET 0x03 +#define I40E_NVM_OEM_CAPABILITIES_MASK 0x000F +#define I40E_NVM_OEM_LENGTH 3 + + /* Check if pointer to OEM version block is valid. */ + i40e_read_nvm_word(hw, I40E_SR_NVM_OEM_VERSION_PTR, &block_offset); + if (block_offset == 0xffff) + return; + + /* Check if OEM version block has correct length. */ + i40e_read_nvm_word(hw, block_offset + I40E_NVM_OEM_LENGTH_OFFSET, + &block_length); + if (block_length < I40E_NVM_OEM_LENGTH) + return; + + /* Check if OEM version format is as expected. */ + i40e_read_nvm_word(hw, block_offset + I40E_NVM_OEM_CAPABILITIES_OFFSET, + &capabilities); + if ((capabilities & I40E_NVM_OEM_CAPABILITIES_MASK) != 0) + return; + + i40e_read_nvm_word(hw, block_offset + I40E_NVM_OEM_GEN_OFFSET, + &gen_snap); + i40e_read_nvm_word(hw, block_offset + I40E_NVM_OEM_RELEASE_OFFSET, + &release); + hw->nvm.oem_ver = (gen_snap << I40E_OEM_SNAP_SHIFT) | release; + hw->nvm.eetrack = I40E_OEM_EETRACK_ID; +} + +/** + * i40e_reset - wait for core reset to finish reset, reset pf if corer not seen + * @pf: board private structure + **/ +static int i40e_reset(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + i40e_status ret; + + ret = i40e_pf_reset(hw); + if (ret) { + dev_info(&pf->pdev->dev, "PF reset failed, %d\n", ret); + set_bit(__I40E_RESET_FAILED, pf->state); + clear_bit(__I40E_RESET_RECOVERY_PENDING, pf->state); + } else { + pf->pfr_count++; + } + return ret; +} + +/** + * i40e_rebuild - rebuild using a saved config + * @pf: board private structure + * @reinit: if the Main VSI needs to re-initialized. 
+ * @lock_acquired: indicates whether or not the lock has been acquired + * before this function was called. + **/ +static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) +{ + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + struct i40e_hw *hw = &pf->hw; + u8 set_fc_aq_fail = 0; + i40e_status ret; + u32 val; + int v; + + if (test_bit(__I40E_DOWN, pf->state)) + goto clear_recovery; + dev_dbg(&pf->pdev->dev, "Rebuilding internal switch\n"); + + /* rebuild the basics for the AdminQ, HMC, and initial HW switch */ + ret = i40e_init_adminq(&pf->hw); + if (ret) { + dev_info(&pf->pdev->dev, "Rebuild AdminQ failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + goto clear_recovery; + } + i40e_get_oem_version(&pf->hw); + + if (test_bit(__I40E_EMP_RESET_INTR_RECEIVED, pf->state) && + ((hw->aq.fw_maj_ver == 4 && hw->aq.fw_min_ver <= 33) || + hw->aq.fw_maj_ver < 4) && hw->mac.type == I40E_MAC_XL710) { + /* The following delay is necessary for 4.33 firmware and older + * to recover after EMP reset. 200 ms should suffice but we + * put here 300 ms to be sure that FW is ready to operate + * after reset. + */ + mdelay(300); + } + + /* re-verify the eeprom if we just had an EMP reset */ + if (test_and_clear_bit(__I40E_EMP_RESET_INTR_RECEIVED, pf->state)) + i40e_verify_eeprom(pf); + + i40e_clear_pxe_mode(hw); + ret = i40e_get_capabilities(pf, i40e_aqc_opc_list_func_capabilities); + if (ret) + goto end_core_reset; + + ret = i40e_init_lan_hmc(hw, hw->func_caps.num_tx_qp, + hw->func_caps.num_rx_qp, 0, 0); + if (ret) { + dev_info(&pf->pdev->dev, "init_lan_hmc failed: %d\n", ret); + goto end_core_reset; + } + ret = i40e_configure_lan_hmc(hw, I40E_HMC_MODEL_DIRECT_ONLY); + if (ret) { + dev_info(&pf->pdev->dev, "configure_lan_hmc failed: %d\n", ret); + goto end_core_reset; + } + + /* Enable FW to write a default DCB config on link-up */ + i40e_aq_set_dcb_parameters(hw, true, NULL); + +#ifdef CONFIG_I40E_DCB + ret = i40e_init_pf_dcb(pf); + if (ret) { + dev_info(&pf->pdev->dev, "DCB init failed %d, disabled\n", ret); + pf->flags &= ~I40E_FLAG_DCB_CAPABLE; + /* Continue without DCB enabled */ + } +#endif /* CONFIG_I40E_DCB */ + /* do basic switch setup */ + if (!lock_acquired) + rtnl_lock(); + ret = i40e_setup_pf_switch(pf, reinit); + if (ret) + goto end_unlock; + + /* The driver only wants link up/down and module qualification + * reports from firmware. Note the negative logic. + */ + ret = i40e_aq_set_phy_int_mask(&pf->hw, + ~(I40E_AQ_EVENT_LINK_UPDOWN | + I40E_AQ_EVENT_MEDIA_NA | + I40E_AQ_EVENT_MODULE_QUAL_FAIL), NULL); + if (ret) + dev_info(&pf->pdev->dev, "set phy mask fail, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + + /* make sure our flow control settings are restored */ + ret = i40e_set_fc(&pf->hw, &set_fc_aq_fail, true); + if (ret) + dev_dbg(&pf->pdev->dev, "setting flow control: ret = %s last_status = %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + + /* Rebuild the VSIs and VEBs that existed before reset. + * They are still in our local switch element arrays, so only + * need to rebuild the switch model in the HW. + * + * If there were VEBs but the reconstitution failed, we'll try + * try to recover minimal use by getting the basic PF VSI working. 
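The i40e_aq_set_phy_int_mask() call above uses negative logic: bits set in the mask suppress events, so the driver complements the set of events it wants reported. A small sketch of building such a mask follows; the bit values are invented for illustration and do not match the real I40E_AQ_EVENT_* definitions.

#include <stdint.h>
#include <stdio.h>

#define EVENT_LINK_UPDOWN      0x0002u	/* assumed bit positions */
#define EVENT_MEDIA_NA         0x0010u
#define EVENT_MODULE_QUAL_FAIL 0x0020u

int main(void)
{
	uint16_t wanted = EVENT_LINK_UPDOWN | EVENT_MEDIA_NA |
			  EVENT_MODULE_QUAL_FAIL;
	uint16_t mask = (uint16_t)~wanted;	/* 1 = do not report this event */

	printf("wanted 0x%04x -> mask 0x%04x\n",
	       (unsigned)wanted, (unsigned)mask);
	return 0;
}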
+ */ + if (vsi->uplink_seid != pf->mac_seid) { + dev_dbg(&pf->pdev->dev, "attempting to rebuild switch\n"); + /* find the one VEB connected to the MAC, and find orphans */ + for (v = 0; v < I40E_MAX_VEB; v++) { + if (!pf->veb[v]) + continue; + + if (pf->veb[v]->uplink_seid == pf->mac_seid || + pf->veb[v]->uplink_seid == 0) { + ret = i40e_reconstitute_veb(pf->veb[v]); + + if (!ret) + continue; + + /* If Main VEB failed, we're in deep doodoo, + * so give up rebuilding the switch and set up + * for minimal rebuild of PF VSI. + * If orphan failed, we'll report the error + * but try to keep going. + */ + if (pf->veb[v]->uplink_seid == pf->mac_seid) { + dev_info(&pf->pdev->dev, + "rebuild of switch failed: %d, will try to set up simple PF connection\n", + ret); + vsi->uplink_seid = pf->mac_seid; + break; + } else if (pf->veb[v]->uplink_seid == 0) { + dev_info(&pf->pdev->dev, + "rebuild of orphan VEB failed: %d\n", + ret); + } + } + } + } + + if (vsi->uplink_seid == pf->mac_seid) { + dev_dbg(&pf->pdev->dev, "attempting to rebuild PF VSI\n"); + /* no VEB, so rebuild only the Main VSI */ + ret = i40e_add_vsi(vsi); + if (ret) { + dev_info(&pf->pdev->dev, + "rebuild of Main VSI failed: %d\n", ret); + goto end_unlock; + } + } + + if (vsi->mqprio_qopt.max_rate[0]) { + u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0]; + u64 credits = 0; + + do_div(max_tx_rate, I40E_BW_MBPS_DIVISOR); + ret = i40e_set_bw_limit(vsi, vsi->seid, max_tx_rate); + if (ret) + goto end_unlock; + + credits = max_tx_rate; + do_div(credits, I40E_BW_CREDIT_DIVISOR); + dev_dbg(&vsi->back->pdev->dev, + "Set tx rate of %llu Mbps (count of 50Mbps %llu) for vsi->seid %u\n", + max_tx_rate, + credits, + vsi->seid); + } + + ret = i40e_rebuild_cloud_filters(vsi, vsi->seid); + if (ret) + goto end_unlock; + + /* PF Main VSI is rebuild by now, go ahead and rebuild channel VSIs + * for this main VSI if they exist + */ + ret = i40e_rebuild_channels(vsi); + if (ret) + goto end_unlock; + + /* Reconfigure hardware for allowing smaller MSS in the case + * of TSO, so that we avoid the MDD being fired and causing + * a reset in the case of small MSS+TSO. + */ +#define I40E_REG_MSS 0x000E64DC +#define I40E_REG_MSS_MIN_MASK 0x3FF0000 +#define I40E_64BYTE_MSS 0x400000 + val = rd32(hw, I40E_REG_MSS); + if ((val & I40E_REG_MSS_MIN_MASK) > I40E_64BYTE_MSS) { + val &= ~I40E_REG_MSS_MIN_MASK; + val |= I40E_64BYTE_MSS; + wr32(hw, I40E_REG_MSS, val); + } + + if (pf->hw_features & I40E_HW_RESTART_AUTONEG) { + msleep(75); + ret = i40e_aq_set_link_restart_an(&pf->hw, true, NULL); + if (ret) + dev_info(&pf->pdev->dev, "link restart failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + } + /* reinit the misc interrupt */ + if (pf->flags & I40E_FLAG_MSIX_ENABLED) + ret = i40e_setup_misc_vector(pf); + + /* Add a filter to drop all Flow control frames from any VSI from being + * transmitted. By doing so we stop a malicious VF from sending out + * PAUSE or PFC frames and potentially controlling traffic for other + * PF/VF VSIs. + * The FW can still send Flow control frames if enabled. 
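The MSS fix-up above is a read-modify-write of one bit-field inside a 32-bit register. The sketch below mimics it on an ordinary variable instead of rd32()/wr32(); the mask and replacement value mirror the I40E_REG_MSS_MIN_MASK and I40E_64BYTE_MSS constants used in the driver, and the starting register value is made up.

#include <stdint.h>
#include <stdio.h>

#define REG_MSS_MIN_MASK 0x3FF0000u
#define REG_64BYTE_MSS   0x400000u

int main(void)
{
	uint32_t val = 0x1230000u;	/* pretend value read from the register */

	if ((val & REG_MSS_MIN_MASK) > REG_64BYTE_MSS) {
		val &= ~REG_MSS_MIN_MASK;	/* clear the min-MSS field */
		val |= REG_64BYTE_MSS;		/* force the 64-byte minimum */
	}
	printf("new register value: 0x%08x\n", (unsigned)val);
	return 0;
}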
+ */ + i40e_add_filter_to_drop_tx_flow_control_frames(&pf->hw, + pf->main_vsi_seid); + + /* restart the VSIs that were rebuilt and running before the reset */ + i40e_pf_unquiesce_all_vsi(pf); + + /* Release the RTNL lock before we start resetting VFs */ + if (!lock_acquired) + rtnl_unlock(); + + /* Restore promiscuous settings */ + ret = i40e_set_promiscuous(pf, pf->cur_promisc); + if (ret) + dev_warn(&pf->pdev->dev, + "Failed to restore promiscuous setting: %s, err %s aq_err %s\n", + pf->cur_promisc ? "on" : "off", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + + i40e_reset_all_vfs(pf, true); + + /* tell the firmware that we're starting */ + i40e_send_version(pf); + + /* We've already released the lock, so don't do it again */ + goto end_core_reset; + +end_unlock: + if (!lock_acquired) + rtnl_unlock(); +end_core_reset: + clear_bit(__I40E_RESET_FAILED, pf->state); +clear_recovery: + clear_bit(__I40E_RESET_RECOVERY_PENDING, pf->state); +} + +/** + * i40e_reset_and_rebuild - reset and rebuild using a saved config + * @pf: board private structure + * @reinit: if the Main VSI needs to re-initialized. + * @lock_acquired: indicates whether or not the lock has been acquired + * before this function was called. + **/ +static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit, + bool lock_acquired) +{ + int ret; + /* Now we wait for GRST to settle out. + * We don't have to delete the VEBs or VSIs from the hw switch + * because the reset will make them disappear. + */ + ret = i40e_reset(pf); + if (!ret) + i40e_rebuild(pf, reinit, lock_acquired); +} + +/** + * i40e_handle_reset_warning - prep for the PF to reset, reset and rebuild + * @pf: board private structure + * + * Close up the VFs and other things in prep for a Core Reset, + * then get ready to rebuild the world. + * @lock_acquired: indicates whether or not the lock has been acquired + * before this function was called. 
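The reset path above passes a lock_acquired flag so that rtnl_lock() is only taken when the caller does not already hold it. A minimal pthread-based sketch of that "caller may already own the lock" convention follows; the mutex and function names are invented stand-ins, with a pthread mutex in place of the rtnl lock.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;

static void quiesce_all(bool lock_acquired)
{
	if (!lock_acquired)
		pthread_mutex_lock(&cfg_lock);

	printf("quiescing under the configuration lock\n");

	if (!lock_acquired)
		pthread_mutex_unlock(&cfg_lock);
}

int main(void)
{
	/* a caller that already owns the lock passes lock_acquired = true */
	pthread_mutex_lock(&cfg_lock);
	quiesce_all(true);
	pthread_mutex_unlock(&cfg_lock);

	/* a caller without the lock lets the callee take it */
	quiesce_all(false);
	return 0;
}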
+ **/ +static void i40e_handle_reset_warning(struct i40e_pf *pf, bool lock_acquired) +{ + i40e_prep_for_reset(pf, lock_acquired); + i40e_reset_and_rebuild(pf, false, lock_acquired); +} + +/** + * i40e_handle_mdd_event + * @pf: pointer to the PF structure + * + * Called from the MDD irq handler to identify possibly malicious vfs + **/ +static void i40e_handle_mdd_event(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + bool mdd_detected = false; + bool pf_mdd_detected = false; + struct i40e_vf *vf; + u32 reg; + int i; + + if (!test_bit(__I40E_MDD_EVENT_PENDING, pf->state)) + return; + + /* find what triggered the MDD event */ + reg = rd32(hw, I40E_GL_MDET_TX); + if (reg & I40E_GL_MDET_TX_VALID_MASK) { + u8 pf_num = (reg & I40E_GL_MDET_TX_PF_NUM_MASK) >> + I40E_GL_MDET_TX_PF_NUM_SHIFT; + u16 vf_num = (reg & I40E_GL_MDET_TX_VF_NUM_MASK) >> + I40E_GL_MDET_TX_VF_NUM_SHIFT; + u8 event = (reg & I40E_GL_MDET_TX_EVENT_MASK) >> + I40E_GL_MDET_TX_EVENT_SHIFT; + u16 queue = ((reg & I40E_GL_MDET_TX_QUEUE_MASK) >> + I40E_GL_MDET_TX_QUEUE_SHIFT) - + pf->hw.func_caps.base_queue; + if (netif_msg_tx_err(pf)) + dev_info(&pf->pdev->dev, "Malicious Driver Detection event 0x%02x on TX queue %d PF number 0x%02x VF number 0x%02x\n", + event, queue, pf_num, vf_num); + wr32(hw, I40E_GL_MDET_TX, 0xffffffff); + mdd_detected = true; + } + reg = rd32(hw, I40E_GL_MDET_RX); + if (reg & I40E_GL_MDET_RX_VALID_MASK) { + u8 func = (reg & I40E_GL_MDET_RX_FUNCTION_MASK) >> + I40E_GL_MDET_RX_FUNCTION_SHIFT; + u8 event = (reg & I40E_GL_MDET_RX_EVENT_MASK) >> + I40E_GL_MDET_RX_EVENT_SHIFT; + u16 queue = ((reg & I40E_GL_MDET_RX_QUEUE_MASK) >> + I40E_GL_MDET_RX_QUEUE_SHIFT) - + pf->hw.func_caps.base_queue; + if (netif_msg_rx_err(pf)) + dev_info(&pf->pdev->dev, "Malicious Driver Detection event 0x%02x on RX queue %d of function 0x%02x\n", + event, queue, func); + wr32(hw, I40E_GL_MDET_RX, 0xffffffff); + mdd_detected = true; + } + + if (mdd_detected) { + reg = rd32(hw, I40E_PF_MDET_TX); + if (reg & I40E_PF_MDET_TX_VALID_MASK) { + wr32(hw, I40E_PF_MDET_TX, 0xFFFF); + dev_info(&pf->pdev->dev, "TX driver issue detected, PF reset issued\n"); + pf_mdd_detected = true; + } + reg = rd32(hw, I40E_PF_MDET_RX); + if (reg & I40E_PF_MDET_RX_VALID_MASK) { + wr32(hw, I40E_PF_MDET_RX, 0xFFFF); + dev_info(&pf->pdev->dev, "RX driver issue detected, PF reset issued\n"); + pf_mdd_detected = true; + } + /* Queue belongs to the PF, initiate a reset */ + if (pf_mdd_detected) { + set_bit(__I40E_PF_RESET_REQUESTED, pf->state); + i40e_service_event_schedule(pf); + } + } + + /* see if one of the VFs needs its hand slapped */ + for (i = 0; i < pf->num_alloc_vfs && mdd_detected; i++) { + vf = &(pf->vf[i]); + reg = rd32(hw, I40E_VP_MDET_TX(i)); + if (reg & I40E_VP_MDET_TX_VALID_MASK) { + wr32(hw, I40E_VP_MDET_TX(i), 0xFFFF); + vf->num_mdd_events++; + dev_info(&pf->pdev->dev, "TX driver issue detected on VF %d\n", + i); + } + + reg = rd32(hw, I40E_VP_MDET_RX(i)); + if (reg & I40E_VP_MDET_RX_VALID_MASK) { + wr32(hw, I40E_VP_MDET_RX(i), 0xFFFF); + vf->num_mdd_events++; + dev_info(&pf->pdev->dev, "RX driver issue detected on VF %d\n", + i); + } + + if (vf->num_mdd_events > I40E_DEFAULT_NUM_MDD_EVENTS_ALLOWED) { + dev_info(&pf->pdev->dev, + "Too many MDD events on VF %d, disabled\n", i); + dev_info(&pf->pdev->dev, + "Use PF Control I/F to re-enable the VF\n"); + set_bit(I40E_VF_STATE_DISABLED, &vf->vf_states); + } + } + + /* re-enable mdd interrupt cause */ + clear_bit(__I40E_MDD_EVENT_PENDING, pf->state); + reg = rd32(hw, I40E_PFINT_ICR0_ENA); + reg |= 
I40E_PFINT_ICR0_ENA_MAL_DETECT_MASK; + wr32(hw, I40E_PFINT_ICR0_ENA, reg); + i40e_flush(hw); +} + +static const char *i40e_tunnel_name(struct i40e_udp_port_config *port) +{ + switch (port->type) { + case UDP_TUNNEL_TYPE_VXLAN: + return "vxlan"; + case UDP_TUNNEL_TYPE_GENEVE: + return "geneve"; + default: + return "unknown"; + } +} + +/** + * i40e_sync_udp_filters - Trigger a sync event for existing UDP filters + * @pf: board private structure + **/ +static void i40e_sync_udp_filters(struct i40e_pf *pf) +{ + int i; + + /* loop through and set pending bit for all active UDP filters */ + for (i = 0; i < I40E_MAX_PF_UDP_OFFLOAD_PORTS; i++) { + if (pf->udp_ports[i].port) + pf->pending_udp_bitmap |= BIT_ULL(i); + } + + set_bit(__I40E_UDP_FILTER_SYNC_PENDING, pf->state); +} + +/** + * i40e_sync_udp_filters_subtask - Sync the VSI filter list with HW + * @pf: board private structure + **/ +static void i40e_sync_udp_filters_subtask(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + i40e_status ret; + u16 port; + int i; + + if (!test_and_clear_bit(__I40E_UDP_FILTER_SYNC_PENDING, pf->state)) + return; + + for (i = 0; i < I40E_MAX_PF_UDP_OFFLOAD_PORTS; i++) { + if (pf->pending_udp_bitmap & BIT_ULL(i)) { + pf->pending_udp_bitmap &= ~BIT_ULL(i); + port = pf->udp_ports[i].port; + if (port) + ret = i40e_aq_add_udp_tunnel(hw, port, + pf->udp_ports[i].type, + NULL, NULL); + else + ret = i40e_aq_del_udp_tunnel(hw, i, NULL); + + if (ret) { + dev_info(&pf->pdev->dev, + "%s %s port %d, index %d failed, err %s aq_err %s\n", + i40e_tunnel_name(&pf->udp_ports[i]), + port ? "add" : "delete", + port, i, + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + pf->udp_ports[i].port = 0; + } + } + } +} + +/** + * i40e_service_task - Run the driver's async subtasks + * @work: pointer to work_struct containing our data + **/ +static void i40e_service_task(struct work_struct *work) +{ + struct i40e_pf *pf = container_of(work, + struct i40e_pf, + service_task); + unsigned long start_time = jiffies; + + /* don't bother with service tasks if a reset is in progress */ + if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) + return; + + if (test_and_set_bit(__I40E_SERVICE_SCHED, pf->state)) + return; + + i40e_detect_recover_hung(pf->vsi[pf->lan_vsi]); + i40e_sync_filters_subtask(pf); + i40e_reset_subtask(pf); + i40e_handle_mdd_event(pf); + i40e_vc_process_vflr_event(pf); + i40e_watchdog_subtask(pf); + i40e_fdir_reinit_subtask(pf); + if (test_and_clear_bit(__I40E_CLIENT_RESET, pf->state)) { + /* Client subtask will reopen next time through. */ + i40e_notify_client_of_netdev_close(pf->vsi[pf->lan_vsi], true); + } else { + i40e_client_subtask(pf); + if (test_and_clear_bit(__I40E_CLIENT_L2_CHANGE, + pf->state)) + i40e_notify_client_of_l2_param_changes( + pf->vsi[pf->lan_vsi]); + } + i40e_sync_filters_subtask(pf); + i40e_sync_udp_filters_subtask(pf); + i40e_clean_adminq_subtask(pf); + + /* flush memory to make sure state is correct before next watchdog */ + smp_mb__before_atomic(); + clear_bit(__I40E_SERVICE_SCHED, pf->state); + + /* If the tasks have taken longer than one timer cycle or there + * is more work to be done, reschedule the service task now + * rather than wait for the timer to tick again. 
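i40e_sync_udp_filters() above defers work by setting one bit per active port in pending_udp_bitmap and raising a state flag that the subtask later tests, clears, and drains. A compact sketch of that defer-then-drain pattern follows, with a plain uint64_t bitmap and simplified sizes standing in for the driver's fields.

#include <stdint.h>
#include <stdio.h>

#define MAX_PORTS 16	/* assumed, mirrors I40E_MAX_PF_UDP_OFFLOAD_PORTS */

static uint16_t ports[MAX_PORTS] = { 4789, 0, 6081 };	/* example table */
static uint64_t pending_bitmap;

static void mark_all_pending(void)
{
	for (int i = 0; i < MAX_PORTS; i++)
		if (ports[i])
			pending_bitmap |= 1ULL << i;
}

static void drain_pending(void)
{
	for (int i = 0; i < MAX_PORTS; i++) {
		if (!(pending_bitmap & (1ULL << i)))
			continue;
		pending_bitmap &= ~(1ULL << i);
		printf("sync port %u at index %d\n", (unsigned)ports[i], i);
	}
}

int main(void)
{
	mark_all_pending();
	drain_pending();
	return 0;
}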
+ */ + if (time_after(jiffies, (start_time + pf->service_timer_period)) || + test_bit(__I40E_ADMINQ_EVENT_PENDING, pf->state) || + test_bit(__I40E_MDD_EVENT_PENDING, pf->state) || + test_bit(__I40E_VFLR_EVENT_PENDING, pf->state)) + i40e_service_event_schedule(pf); +} + +/** + * i40e_service_timer - timer callback + * @data: pointer to PF struct + **/ +static void i40e_service_timer(struct timer_list *t) +{ + struct i40e_pf *pf = from_timer(pf, t, service_timer); + + mod_timer(&pf->service_timer, + round_jiffies(jiffies + pf->service_timer_period)); + i40e_service_event_schedule(pf); +} + +/** + * i40e_set_num_rings_in_vsi - Determine number of rings in the VSI + * @vsi: the VSI being configured + **/ +static int i40e_set_num_rings_in_vsi(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + + switch (vsi->type) { + case I40E_VSI_MAIN: + vsi->alloc_queue_pairs = pf->num_lan_qps; + vsi->num_desc = ALIGN(I40E_DEFAULT_NUM_DESCRIPTORS, + I40E_REQ_DESCRIPTOR_MULTIPLE); + if (pf->flags & I40E_FLAG_MSIX_ENABLED) + vsi->num_q_vectors = pf->num_lan_msix; + else + vsi->num_q_vectors = 1; + + break; + + case I40E_VSI_FDIR: + vsi->alloc_queue_pairs = 1; + vsi->num_desc = ALIGN(I40E_FDIR_RING_COUNT, + I40E_REQ_DESCRIPTOR_MULTIPLE); + vsi->num_q_vectors = pf->num_fdsb_msix; + break; + + case I40E_VSI_VMDQ2: + vsi->alloc_queue_pairs = pf->num_vmdq_qps; + vsi->num_desc = ALIGN(I40E_DEFAULT_NUM_DESCRIPTORS, + I40E_REQ_DESCRIPTOR_MULTIPLE); + vsi->num_q_vectors = pf->num_vmdq_msix; + break; + + case I40E_VSI_SRIOV: + vsi->alloc_queue_pairs = pf->num_vf_qps; + vsi->num_desc = ALIGN(I40E_DEFAULT_NUM_DESCRIPTORS, + I40E_REQ_DESCRIPTOR_MULTIPLE); + break; + + default: + WARN_ON(1); + return -ENODATA; + } + + return 0; +} + +/** + * i40e_vsi_alloc_arrays - Allocate queue and vector pointer arrays for the vsi + * @vsi: VSI pointer + * @alloc_qvectors: a bool to specify if q_vectors need to be allocated. + * + * On error: returns error code (negative) + * On success: returns 0 + **/ +static int i40e_vsi_alloc_arrays(struct i40e_vsi *vsi, bool alloc_qvectors) +{ + struct i40e_ring **next_rings; + int size; + int ret = 0; + + /* allocate memory for both Tx, XDP Tx and Rx ring pointers */ + size = sizeof(struct i40e_ring *) * vsi->alloc_queue_pairs * + (i40e_enabled_xdp_vsi(vsi) ? 3 : 2); + vsi->tx_rings = kzalloc(size, GFP_KERNEL); + if (!vsi->tx_rings) + return -ENOMEM; + next_rings = vsi->tx_rings + vsi->alloc_queue_pairs; + if (i40e_enabled_xdp_vsi(vsi)) { + vsi->xdp_rings = next_rings; + next_rings += vsi->alloc_queue_pairs; + } + vsi->rx_rings = next_rings; + + if (alloc_qvectors) { + /* allocate memory for q_vector pointers */ + size = sizeof(struct i40e_q_vector *) * vsi->num_q_vectors; + vsi->q_vectors = kzalloc(size, GFP_KERNEL); + if (!vsi->q_vectors) { + ret = -ENOMEM; + goto err_vectors; + } + } + return ret; + +err_vectors: + kfree(vsi->tx_rings); + return ret; +} + +/** + * i40e_vsi_mem_alloc - Allocates the next available struct vsi in the PF + * @pf: board private structure + * @type: type of VSI + * + * On error: returns error code (negative) + * On success: returns vsi index in PF (positive) + **/ +static int i40e_vsi_mem_alloc(struct i40e_pf *pf, enum i40e_vsi_type type) +{ + int ret = -ENODEV; + struct i40e_vsi *vsi; + int vsi_idx; + int i; + + /* Need to protect the allocation of the VSIs at the PF level */ + mutex_lock(&pf->switch_mutex); + + /* VSI list may be fragmented if VSI creation/destruction has + * been happening. 
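i40e_vsi_alloc_arrays() above makes one allocation and carves it into the Tx, optional XDP Tx, and Rx ring-pointer arrays by advancing a cursor. The sketch below reproduces that carving in userspace with calloc; "struct ring" is a stand-in for struct i40e_ring and the 3-vs-2 multiplier mirrors the XDP check.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct ring { int id; };

int main(void)
{
	unsigned int pairs = 4;
	bool xdp_enabled = true;
	unsigned int sets = xdp_enabled ? 3 : 2;	/* Tx, XDP Tx, Rx */
	struct ring **tx, **xdp = NULL, **rx, **next;

	tx = calloc((size_t)pairs * sets, sizeof(*tx));
	if (!tx)
		return 1;

	next = tx + pairs;
	if (xdp_enabled) {
		xdp = next;		/* XDP array starts right after Tx */
		next += pairs;
	}
	rx = next;			/* Rx array is last */

	printf("tx=%p xdp=%p rx=%p\n", (void *)tx, (void *)xdp, (void *)rx);
	free(tx);	/* one free releases all three arrays */
	return 0;
}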
We can afford to do a quick scan to look + * for any free VSIs in the list. + * + * find next empty vsi slot, looping back around if necessary + */ + i = pf->next_vsi; + while (i < pf->num_alloc_vsi && pf->vsi[i]) + i++; + if (i >= pf->num_alloc_vsi) { + i = 0; + while (i < pf->next_vsi && pf->vsi[i]) + i++; + } + + if (i < pf->num_alloc_vsi && !pf->vsi[i]) { + vsi_idx = i; /* Found one! */ + } else { + ret = -ENODEV; + goto unlock_pf; /* out of VSI slots! */ + } + pf->next_vsi = ++i; + + vsi = kzalloc(sizeof(*vsi), GFP_KERNEL); + if (!vsi) { + ret = -ENOMEM; + goto unlock_pf; + } + vsi->type = type; + vsi->back = pf; + set_bit(__I40E_VSI_DOWN, vsi->state); + vsi->flags = 0; + vsi->idx = vsi_idx; + vsi->int_rate_limit = 0; + vsi->rss_table_size = (vsi->type == I40E_VSI_MAIN) ? + pf->rss_table_size : 64; + vsi->netdev_registered = false; + vsi->work_limit = I40E_DEFAULT_IRQ_WORK; + hash_init(vsi->mac_filter_hash); + vsi->irqs_ready = false; + + ret = i40e_set_num_rings_in_vsi(vsi); + if (ret) + goto err_rings; + + ret = i40e_vsi_alloc_arrays(vsi, true); + if (ret) + goto err_rings; + + /* Setup default MSIX irq handler for VSI */ + i40e_vsi_setup_irqhandler(vsi, i40e_msix_clean_rings); + + /* Initialize VSI lock */ + spin_lock_init(&vsi->mac_filter_hash_lock); + pf->vsi[vsi_idx] = vsi; + ret = vsi_idx; + goto unlock_pf; + +err_rings: + pf->next_vsi = i - 1; + kfree(vsi); +unlock_pf: + mutex_unlock(&pf->switch_mutex); + return ret; +} + +/** + * i40e_vsi_free_arrays - Free queue and vector pointer arrays for the VSI + * @type: VSI pointer + * @free_qvectors: a bool to specify if q_vectors need to be freed. + * + * On error: returns error code (negative) + * On success: returns 0 + **/ +static void i40e_vsi_free_arrays(struct i40e_vsi *vsi, bool free_qvectors) +{ + /* free the ring and vector containers */ + if (free_qvectors) { + kfree(vsi->q_vectors); + vsi->q_vectors = NULL; + } + kfree(vsi->tx_rings); + vsi->tx_rings = NULL; + vsi->rx_rings = NULL; + vsi->xdp_rings = NULL; +} + +/** + * i40e_clear_rss_config_user - clear the user configured RSS hash keys + * and lookup table + * @vsi: Pointer to VSI structure + */ +static void i40e_clear_rss_config_user(struct i40e_vsi *vsi) +{ + if (!vsi) + return; + + kfree(vsi->rss_hkey_user); + vsi->rss_hkey_user = NULL; + + kfree(vsi->rss_lut_user); + vsi->rss_lut_user = NULL; +} + +/** + * i40e_vsi_clear - Deallocate the VSI provided + * @vsi: the VSI being un-configured + **/ +static int i40e_vsi_clear(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf; + + if (!vsi) + return 0; + + if (!vsi->back) + goto free_vsi; + pf = vsi->back; + + mutex_lock(&pf->switch_mutex); + if (!pf->vsi[vsi->idx]) { + dev_err(&pf->pdev->dev, "pf->vsi[%d] is NULL, just free vsi[%d](type %d)\n", + vsi->idx, vsi->idx, vsi->type); + goto unlock_vsi; + } + + if (pf->vsi[vsi->idx] != vsi) { + dev_err(&pf->pdev->dev, + "pf->vsi[%d](type %d) != vsi[%d](type %d): no free!\n", + pf->vsi[vsi->idx]->idx, + pf->vsi[vsi->idx]->type, + vsi->idx, vsi->type); + goto unlock_vsi; + } + + /* updates the PF for this cleared vsi */ + i40e_put_lump(pf->qp_pile, vsi->base_queue, vsi->idx); + i40e_put_lump(pf->irq_pile, vsi->base_vector, vsi->idx); + + i40e_vsi_free_arrays(vsi, true); + i40e_clear_rss_config_user(vsi); + + pf->vsi[vsi->idx] = NULL; + if (vsi->idx < pf->next_vsi) + pf->next_vsi = vsi->idx; + +unlock_vsi: + mutex_unlock(&pf->switch_mutex); +free_vsi: + kfree(vsi); + + return 0; +} + +/** + * i40e_vsi_clear_rings - Deallocates the Rx and Tx rings for the provided VSI + * @vsi: the VSI 
being cleaned + **/ +static void i40e_vsi_clear_rings(struct i40e_vsi *vsi) +{ + int i; + + if (vsi->tx_rings && vsi->tx_rings[0]) { + for (i = 0; i < vsi->alloc_queue_pairs; i++) { + kfree_rcu(vsi->tx_rings[i], rcu); + vsi->tx_rings[i] = NULL; + vsi->rx_rings[i] = NULL; + if (vsi->xdp_rings) + vsi->xdp_rings[i] = NULL; + } + } +} + +/** + * i40e_alloc_rings - Allocates the Rx and Tx rings for the provided VSI + * @vsi: the VSI being configured + **/ +static int i40e_alloc_rings(struct i40e_vsi *vsi) +{ + int i, qpv = i40e_enabled_xdp_vsi(vsi) ? 3 : 2; + struct i40e_pf *pf = vsi->back; + struct i40e_ring *ring; + + /* Set basic values in the rings to be used later during open() */ + for (i = 0; i < vsi->alloc_queue_pairs; i++) { + /* allocate space for both Tx and Rx in one shot */ + ring = kcalloc(qpv, sizeof(struct i40e_ring), GFP_KERNEL); + if (!ring) + goto err_out; + + ring->queue_index = i; + ring->reg_idx = vsi->base_queue + i; + ring->ring_active = false; + ring->vsi = vsi; + ring->netdev = vsi->netdev; + ring->dev = &pf->pdev->dev; + ring->count = vsi->num_desc; + ring->size = 0; + ring->dcb_tc = 0; + if (vsi->back->hw_features & I40E_HW_WB_ON_ITR_CAPABLE) + ring->flags = I40E_TXR_FLAGS_WB_ON_ITR; + ring->itr_setting = pf->tx_itr_default; + vsi->tx_rings[i] = ring++; + + if (!i40e_enabled_xdp_vsi(vsi)) + goto setup_rx; + + ring->queue_index = vsi->alloc_queue_pairs + i; + ring->reg_idx = vsi->base_queue + ring->queue_index; + ring->ring_active = false; + ring->vsi = vsi; + ring->netdev = NULL; + ring->dev = &pf->pdev->dev; + ring->count = vsi->num_desc; + ring->size = 0; + ring->dcb_tc = 0; + if (vsi->back->hw_features & I40E_HW_WB_ON_ITR_CAPABLE) + ring->flags = I40E_TXR_FLAGS_WB_ON_ITR; + set_ring_xdp(ring); + ring->itr_setting = pf->tx_itr_default; + vsi->xdp_rings[i] = ring++; + +setup_rx: + ring->queue_index = i; + ring->reg_idx = vsi->base_queue + i; + ring->ring_active = false; + ring->vsi = vsi; + ring->netdev = vsi->netdev; + ring->dev = &pf->pdev->dev; + ring->count = vsi->num_desc; + ring->size = 0; + ring->dcb_tc = 0; + ring->itr_setting = pf->rx_itr_default; + vsi->rx_rings[i] = ring; + } + + return 0; + +err_out: + i40e_vsi_clear_rings(vsi); + return -ENOMEM; +} + +/** + * i40e_reserve_msix_vectors - Reserve MSI-X vectors in the kernel + * @pf: board private structure + * @vectors: the number of MSI-X vectors to request + * + * Returns the number of vectors reserved, or error + **/ +static int i40e_reserve_msix_vectors(struct i40e_pf *pf, int vectors) +{ + vectors = pci_enable_msix_range(pf->pdev, pf->msix_entries, + I40E_MIN_MSIX, vectors); + if (vectors < 0) { + dev_info(&pf->pdev->dev, + "MSI-X vector reservation failed: %d\n", vectors); + vectors = 0; + } + + return vectors; +} + +/** + * i40e_init_msix - Setup the MSIX capability + * @pf: board private structure + * + * Work with the OS to set up the MSIX vectors needed. + * + * Returns the number of vectors reserved or negative on failure + **/ +static int i40e_init_msix(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + int cpus, extra_vectors; + int vectors_left; + int v_budget, i; + int v_actual; + int iwarp_requested = 0; + + if (!(pf->flags & I40E_FLAG_MSIX_ENABLED)) + return -ENODEV; + + /* The number of vectors we'll request will be comprised of: + * - Add 1 for "other" cause for Admin Queue events, etc. + * - The number of LAN queue pairs + * - Queues being used for RSS. + * We don't need as many as max_rss_size vectors. 
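i40e_reserve_msix_vectors() above wraps pci_enable_msix_range(), whose contract is to grant anything between a minimum and the requested count. The sketch below simulates that contract in plain C; the "available" pool is invented for the example and the kernel call returns a negative errno rather than -1.

#include <stdio.h>

static int reserve_vector_range(int available, int min, int wanted)
{
	if (available < min)
		return -1;	/* stand-in for the kernel's negative errno */
	return available < wanted ? available : wanted;
}

int main(void)
{
	int got = reserve_vector_range(24 /* assumed free vectors */, 2, 64);

	if (got < 0)
		printf("reservation failed, fall back to MSI/legacy\n");
	else
		printf("got %d vectors (may be fewer than requested)\n", got);
	return 0;
}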
+ * use rss_size instead in the calculation since that + * is governed by number of cpus in the system. + * - assumes symmetric Tx/Rx pairing + * - The number of VMDq pairs + * - The CPU count within the NUMA node if iWARP is enabled + * Once we count this up, try the request. + * + * If we can't get what we want, we'll simplify to nearly nothing + * and try again. If that still fails, we punt. + */ + vectors_left = hw->func_caps.num_msix_vectors; + v_budget = 0; + + /* reserve one vector for miscellaneous handler */ + if (vectors_left) { + v_budget++; + vectors_left--; + } + + /* reserve some vectors for the main PF traffic queues. Initially we + * only reserve at most 50% of the available vectors, in the case that + * the number of online CPUs is large. This ensures that we can enable + * extra features as well. Once we've enabled the other features, we + * will use any remaining vectors to reach as close as we can to the + * number of online CPUs. + */ + cpus = num_online_cpus(); + pf->num_lan_msix = min_t(int, cpus, vectors_left / 2); + vectors_left -= pf->num_lan_msix; + + /* reserve one vector for sideband flow director */ + if (pf->flags & I40E_FLAG_FD_SB_ENABLED) { + if (vectors_left) { + pf->num_fdsb_msix = 1; + v_budget++; + vectors_left--; + } else { + pf->num_fdsb_msix = 0; + } + } + + /* can we reserve enough for iWARP? */ + if (pf->flags & I40E_FLAG_IWARP_ENABLED) { + iwarp_requested = pf->num_iwarp_msix; + + if (!vectors_left) + pf->num_iwarp_msix = 0; + else if (vectors_left < pf->num_iwarp_msix) + pf->num_iwarp_msix = 1; + v_budget += pf->num_iwarp_msix; + vectors_left -= pf->num_iwarp_msix; + } + + /* any vectors left over go for VMDq support */ + if (pf->flags & I40E_FLAG_VMDQ_ENABLED) { + int vmdq_vecs_wanted = pf->num_vmdq_vsis * pf->num_vmdq_qps; + int vmdq_vecs = min_t(int, vectors_left, vmdq_vecs_wanted); + + if (!vectors_left) { + pf->num_vmdq_msix = 0; + pf->num_vmdq_qps = 0; + } else { + /* if we're short on vectors for what's desired, we limit + * the queues per vmdq. If this is still more than are + * available, the user will need to change the number of + * queues/vectors used by the PF later with the ethtool + * channels command + */ + if (vmdq_vecs < vmdq_vecs_wanted) + pf->num_vmdq_qps = 1; + pf->num_vmdq_msix = pf->num_vmdq_qps; + + v_budget += vmdq_vecs; + vectors_left -= vmdq_vecs; + } + } + + /* On systems with a large number of SMP cores, we previously limited + * the number of vectors for num_lan_msix to be at most 50% of the + * available vectors, to allow for other features. Now, we add back + * the remaining vectors. However, we ensure that the total + * num_lan_msix will not exceed num_online_cpus(). To do this, we + * calculate the number of vectors we can add without going over the + * cap of CPUs. For systems with a small number of CPUs this will be + * zero. + */ + extra_vectors = min_t(int, cpus - pf->num_lan_msix, vectors_left); + pf->num_lan_msix += extra_vectors; + vectors_left -= extra_vectors; + + WARN(vectors_left < 0, + "Calculation of remaining vectors underflowed. 
This is an accounting bug when determining total MSI-X vectors.\n"); + + v_budget += pf->num_lan_msix; + pf->msix_entries = kcalloc(v_budget, sizeof(struct msix_entry), + GFP_KERNEL); + if (!pf->msix_entries) + return -ENOMEM; + + for (i = 0; i < v_budget; i++) + pf->msix_entries[i].entry = i; + v_actual = i40e_reserve_msix_vectors(pf, v_budget); + + if (v_actual < I40E_MIN_MSIX) { + pf->flags &= ~I40E_FLAG_MSIX_ENABLED; + kfree(pf->msix_entries); + pf->msix_entries = NULL; + pci_disable_msix(pf->pdev); + return -ENODEV; + + } else if (v_actual == I40E_MIN_MSIX) { + /* Adjust for minimal MSIX use */ + pf->num_vmdq_vsis = 0; + pf->num_vmdq_qps = 0; + pf->num_lan_qps = 1; + pf->num_lan_msix = 1; + + } else if (v_actual != v_budget) { + /* If we have limited resources, we will start with no vectors + * for the special features and then allocate vectors to some + * of these features based on the policy and at the end disable + * the features that did not get any vectors. + */ + int vec; + + dev_info(&pf->pdev->dev, + "MSI-X vector limit reached with %d, wanted %d, attempting to redistribute vectors\n", + v_actual, v_budget); + /* reserve the misc vector */ + vec = v_actual - 1; + + /* Scale vector usage down */ + pf->num_vmdq_msix = 1; /* force VMDqs to only one vector */ + pf->num_vmdq_vsis = 1; + pf->num_vmdq_qps = 1; + + /* partition out the remaining vectors */ + switch (vec) { + case 2: + pf->num_lan_msix = 1; + break; + case 3: + if (pf->flags & I40E_FLAG_IWARP_ENABLED) { + pf->num_lan_msix = 1; + pf->num_iwarp_msix = 1; + } else { + pf->num_lan_msix = 2; + } + break; + default: + if (pf->flags & I40E_FLAG_IWARP_ENABLED) { + pf->num_iwarp_msix = min_t(int, (vec / 3), + iwarp_requested); + pf->num_vmdq_vsis = min_t(int, (vec / 3), + I40E_DEFAULT_NUM_VMDQ_VSI); + } else { + pf->num_vmdq_vsis = min_t(int, (vec / 2), + I40E_DEFAULT_NUM_VMDQ_VSI); + } + if (pf->flags & I40E_FLAG_FD_SB_ENABLED) { + pf->num_fdsb_msix = 1; + vec--; + } + pf->num_lan_msix = min_t(int, + (vec - (pf->num_iwarp_msix + pf->num_vmdq_vsis)), + pf->num_lan_msix); + pf->num_lan_qps = pf->num_lan_msix; + break; + } + } + + if ((pf->flags & I40E_FLAG_FD_SB_ENABLED) && + (pf->num_fdsb_msix == 0)) { + dev_info(&pf->pdev->dev, "Sideband Flowdir disabled, not enough MSI-X vectors\n"); + pf->flags &= ~I40E_FLAG_FD_SB_ENABLED; + pf->flags |= I40E_FLAG_FD_SB_INACTIVE; + } + if ((pf->flags & I40E_FLAG_VMDQ_ENABLED) && + (pf->num_vmdq_msix == 0)) { + dev_info(&pf->pdev->dev, "VMDq disabled, not enough MSI-X vectors\n"); + pf->flags &= ~I40E_FLAG_VMDQ_ENABLED; + } + + if ((pf->flags & I40E_FLAG_IWARP_ENABLED) && + (pf->num_iwarp_msix == 0)) { + dev_info(&pf->pdev->dev, "IWARP disabled, not enough MSI-X vectors\n"); + pf->flags &= ~I40E_FLAG_IWARP_ENABLED; + } + i40e_debug(&pf->hw, I40E_DEBUG_INIT, + "MSI-X vector distribution: PF %d, VMDq %d, FDSB %d, iWARP %d\n", + pf->num_lan_msix, + pf->num_vmdq_msix * pf->num_vmdq_vsis, + pf->num_fdsb_msix, + pf->num_iwarp_msix); + + return v_actual; +} + +/** + * i40e_vsi_alloc_q_vector - Allocate memory for a single interrupt vector + * @vsi: the VSI being configured + * @v_idx: index of the vector in the vsi struct + * @cpu: cpu to be used on affinity_mask + * + * We allocate one q_vector. If allocation fails we return -ENOMEM. 
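When fewer vectors are granted than budgeted, the code above scales each feature down in a fixed priority order. The sketch below captures that idea in a deliberately simplified form; the split policy is illustrative only and coarser than the driver's actual switch statement.

#include <stdio.h>

struct vec_plan { int misc, lan, vmdq, fdsb; };

static struct vec_plan redistribute(int granted, int want_fdsb)
{
	struct vec_plan p = { .misc = 1 };	/* misc vector always reserved */
	int left = granted - p.misc;

	if (want_fdsb && left > 1) {	/* sideband flow director gets one */
		p.fdsb = 1;
		left--;
	}
	if (left > 1) {			/* keep VMDq alive with one vector */
		p.vmdq = 1;
		left--;
	}
	p.lan = left > 0 ? left : 0;	/* LAN queues take the remainder */
	return p;
}

int main(void)
{
	struct vec_plan p = redistribute(5, 1);

	printf("misc=%d lan=%d vmdq=%d fdsb=%d\n",
	       p.misc, p.lan, p.vmdq, p.fdsb);
	return 0;
}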
+ **/ +static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx, int cpu) +{ + struct i40e_q_vector *q_vector; + + /* allocate q_vector */ + q_vector = kzalloc(sizeof(struct i40e_q_vector), GFP_KERNEL); + if (!q_vector) + return -ENOMEM; + + q_vector->vsi = vsi; + q_vector->v_idx = v_idx; + cpumask_copy(&q_vector->affinity_mask, cpu_possible_mask); + + if (vsi->netdev) + netif_napi_add(vsi->netdev, &q_vector->napi, + i40e_napi_poll, NAPI_POLL_WEIGHT); + + /* tie q_vector and vsi together */ + vsi->q_vectors[v_idx] = q_vector; + + return 0; +} + +/** + * i40e_vsi_alloc_q_vectors - Allocate memory for interrupt vectors + * @vsi: the VSI being configured + * + * We allocate one q_vector per queue interrupt. If allocation fails we + * return -ENOMEM. + **/ +static int i40e_vsi_alloc_q_vectors(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + int err, v_idx, num_q_vectors, current_cpu; + + /* if not MSIX, give the one vector only to the LAN VSI */ + if (pf->flags & I40E_FLAG_MSIX_ENABLED) + num_q_vectors = vsi->num_q_vectors; + else if (vsi == pf->vsi[pf->lan_vsi]) + num_q_vectors = 1; + else + return -EINVAL; + + current_cpu = cpumask_first(cpu_online_mask); + + for (v_idx = 0; v_idx < num_q_vectors; v_idx++) { + err = i40e_vsi_alloc_q_vector(vsi, v_idx, current_cpu); + if (err) + goto err_out; + current_cpu = cpumask_next(current_cpu, cpu_online_mask); + if (unlikely(current_cpu >= nr_cpu_ids)) + current_cpu = cpumask_first(cpu_online_mask); + } + + return 0; + +err_out: + while (v_idx--) + i40e_free_q_vector(vsi, v_idx); + + return err; +} + +/** + * i40e_init_interrupt_scheme - Determine proper interrupt scheme + * @pf: board private structure to initialize + **/ +static int i40e_init_interrupt_scheme(struct i40e_pf *pf) +{ + int vectors = 0; + ssize_t size; + + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { + vectors = i40e_init_msix(pf); + if (vectors < 0) { + pf->flags &= ~(I40E_FLAG_MSIX_ENABLED | + I40E_FLAG_IWARP_ENABLED | + I40E_FLAG_RSS_ENABLED | + I40E_FLAG_DCB_CAPABLE | + I40E_FLAG_DCB_ENABLED | + I40E_FLAG_SRIOV_ENABLED | + I40E_FLAG_FD_SB_ENABLED | + I40E_FLAG_FD_ATR_ENABLED | + I40E_FLAG_VMDQ_ENABLED); + pf->flags |= I40E_FLAG_FD_SB_INACTIVE; + + /* rework the queue expectations without MSIX */ + i40e_determine_queue_usage(pf); + } + } + + if (!(pf->flags & I40E_FLAG_MSIX_ENABLED) && + (pf->flags & I40E_FLAG_MSI_ENABLED)) { + dev_info(&pf->pdev->dev, "MSI-X not available, trying MSI\n"); + vectors = pci_enable_msi(pf->pdev); + if (vectors < 0) { + dev_info(&pf->pdev->dev, "MSI init failed - %d\n", + vectors); + pf->flags &= ~I40E_FLAG_MSI_ENABLED; + } + vectors = 1; /* one MSI or Legacy vector */ + } + + if (!(pf->flags & (I40E_FLAG_MSIX_ENABLED | I40E_FLAG_MSI_ENABLED))) + dev_info(&pf->pdev->dev, "MSI-X and MSI not available, falling back to Legacy IRQ\n"); + + /* set up vector assignment tracking */ + size = sizeof(struct i40e_lump_tracking) + (sizeof(u16) * vectors); + pf->irq_pile = kzalloc(size, GFP_KERNEL); + if (!pf->irq_pile) + return -ENOMEM; + + pf->irq_pile->num_entries = vectors; + pf->irq_pile->search_hint = 0; + + /* track first vector for misc interrupts, ignore return */ + (void)i40e_get_lump(pf, pf->irq_pile, 1, I40E_PILE_VALID_BIT - 1); + + return 0; +} + +/** + * i40e_restore_interrupt_scheme - Restore the interrupt scheme + * @pf: private board data structure + * + * Restore the interrupt scheme that was cleared when we suspended the + * device. This should be called during resume to re-allocate the q_vectors + * and reacquire IRQs. 
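i40e_vsi_alloc_q_vectors() above hands each q_vector the next online CPU and wraps back to the first when the mask runs out. A plain sketch of that wrap-around walk follows; a fixed CPU count replaces cpu_online_mask and the cpumask helpers.

#include <stdio.h>

int main(void)
{
	int online_cpus = 4;		/* assumed number of online CPUs */
	int num_q_vectors = 6;
	int cpu = 0;			/* cpumask_first() equivalent */

	for (int v = 0; v < num_q_vectors; v++) {
		printf("q_vector %d -> cpu %d\n", v, cpu);
		cpu++;				/* cpumask_next() equivalent */
		if (cpu >= online_cpus)		/* wrapped past the last CPU */
			cpu = 0;
	}
	return 0;
}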
+ */ +static int i40e_restore_interrupt_scheme(struct i40e_pf *pf) +{ + int err, i; + + /* We cleared the MSI and MSI-X flags when disabling the old interrupt + * scheme. We need to re-enabled them here in order to attempt to + * re-acquire the MSI or MSI-X vectors + */ + pf->flags |= (I40E_FLAG_MSIX_ENABLED | I40E_FLAG_MSI_ENABLED); + + err = i40e_init_interrupt_scheme(pf); + if (err) + return err; + + /* Now that we've re-acquired IRQs, we need to remap the vectors and + * rings together again. + */ + for (i = 0; i < pf->num_alloc_vsi; i++) { + if (pf->vsi[i]) { + err = i40e_vsi_alloc_q_vectors(pf->vsi[i]); + if (err) + goto err_unwind; + i40e_vsi_map_rings_to_vectors(pf->vsi[i]); + } + } + + err = i40e_setup_misc_vector(pf); + if (err) + goto err_unwind; + + if (pf->flags & I40E_FLAG_IWARP_ENABLED) + i40e_client_update_msix_info(pf); + + return 0; + +err_unwind: + while (i--) { + if (pf->vsi[i]) + i40e_vsi_free_q_vectors(pf->vsi[i]); + } + + return err; +} + +/** + * i40e_setup_misc_vector - Setup the misc vector to handle non queue events + * @pf: board private structure + * + * This sets up the handler for MSIX 0, which is used to manage the + * non-queue interrupts, e.g. AdminQ and errors. This is not used + * when in MSI or Legacy interrupt mode. + **/ +static int i40e_setup_misc_vector(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + int err = 0; + + /* Only request the IRQ once, the first time through. */ + if (!test_and_set_bit(__I40E_MISC_IRQ_REQUESTED, pf->state)) { + err = request_irq(pf->msix_entries[0].vector, + i40e_intr, 0, pf->int_name, pf); + if (err) { + clear_bit(__I40E_MISC_IRQ_REQUESTED, pf->state); + dev_info(&pf->pdev->dev, + "request_irq for %s failed: %d\n", + pf->int_name, err); + return -EFAULT; + } + } + + i40e_enable_misc_int_causes(pf); + + /* associate no queues to the misc vector */ + wr32(hw, I40E_PFINT_LNKLST0, I40E_QUEUE_END_OF_LIST); + wr32(hw, I40E_PFINT_ITR0(I40E_RX_ITR), I40E_ITR_8K); + + i40e_flush(hw); + + i40e_irq_dynamic_enable_icr0(pf); + + return err; +} + +/** + * i40e_get_rss_aq - Get RSS keys and lut by using AQ commands + * @vsi: Pointer to vsi structure + * @seed: Buffter to store the hash keys + * @lut: Buffer to store the lookup table entries + * @lut_size: Size of buffer to store the lookup table entries + * + * Return 0 on success, negative on failure + */ +static int i40e_get_rss_aq(struct i40e_vsi *vsi, const u8 *seed, + u8 *lut, u16 lut_size) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + int ret = 0; + + if (seed) { + ret = i40e_aq_get_rss_key(hw, vsi->id, + (struct i40e_aqc_get_set_rss_key_data *)seed); + if (ret) { + dev_info(&pf->pdev->dev, + "Cannot get RSS key, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + return ret; + } + } + + if (lut) { + bool pf_lut = vsi->type == I40E_VSI_MAIN ? 
true : false; + + ret = i40e_aq_get_rss_lut(hw, vsi->id, pf_lut, lut, lut_size); + if (ret) { + dev_info(&pf->pdev->dev, + "Cannot get RSS lut, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + return ret; + } + } + + return ret; +} + +/** + * i40e_config_rss_reg - Configure RSS keys and lut by writing registers + * @vsi: Pointer to vsi structure + * @seed: RSS hash seed + * @lut: Lookup table + * @lut_size: Lookup table size + * + * Returns 0 on success, negative on failure + **/ +static int i40e_config_rss_reg(struct i40e_vsi *vsi, const u8 *seed, + const u8 *lut, u16 lut_size) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + u16 vf_id = vsi->vf_id; + u8 i; + + /* Fill out hash function seed */ + if (seed) { + u32 *seed_dw = (u32 *)seed; + + if (vsi->type == I40E_VSI_MAIN) { + for (i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) + wr32(hw, I40E_PFQF_HKEY(i), seed_dw[i]); + } else if (vsi->type == I40E_VSI_SRIOV) { + for (i = 0; i <= I40E_VFQF_HKEY1_MAX_INDEX; i++) + wr32(hw, I40E_VFQF_HKEY1(i, vf_id), seed_dw[i]); + } else { + dev_err(&pf->pdev->dev, "Cannot set RSS seed - invalid VSI type\n"); + } + } + + if (lut) { + u32 *lut_dw = (u32 *)lut; + + if (vsi->type == I40E_VSI_MAIN) { + if (lut_size != I40E_HLUT_ARRAY_SIZE) + return -EINVAL; + for (i = 0; i <= I40E_PFQF_HLUT_MAX_INDEX; i++) + wr32(hw, I40E_PFQF_HLUT(i), lut_dw[i]); + } else if (vsi->type == I40E_VSI_SRIOV) { + if (lut_size != I40E_VF_HLUT_ARRAY_SIZE) + return -EINVAL; + for (i = 0; i <= I40E_VFQF_HLUT_MAX_INDEX; i++) + wr32(hw, I40E_VFQF_HLUT1(i, vf_id), lut_dw[i]); + } else { + dev_err(&pf->pdev->dev, "Cannot set RSS LUT - invalid VSI type\n"); + } + } + i40e_flush(hw); + + return 0; +} + +/** + * i40e_get_rss_reg - Get the RSS keys and lut by reading registers + * @vsi: Pointer to VSI structure + * @seed: Buffer to store the keys + * @lut: Buffer to store the lookup table entries + * @lut_size: Size of buffer to store the lookup table entries + * + * Returns 0 on success, negative on failure + */ +static int i40e_get_rss_reg(struct i40e_vsi *vsi, u8 *seed, + u8 *lut, u16 lut_size) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + u16 i; + + if (seed) { + u32 *seed_dw = (u32 *)seed; + + for (i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) + seed_dw[i] = i40e_read_rx_ctl(hw, I40E_PFQF_HKEY(i)); + } + if (lut) { + u32 *lut_dw = (u32 *)lut; + + if (lut_size != I40E_HLUT_ARRAY_SIZE) + return -EINVAL; + for (i = 0; i <= I40E_PFQF_HLUT_MAX_INDEX; i++) + lut_dw[i] = rd32(hw, I40E_PFQF_HLUT(i)); + } + + return 0; +} + +/** + * i40e_config_rss - Configure RSS keys and lut + * @vsi: Pointer to VSI structure + * @seed: RSS hash seed + * @lut: Lookup table + * @lut_size: Lookup table size + * + * Returns 0 on success, negative on failure + */ +int i40e_config_rss(struct i40e_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size) +{ + struct i40e_pf *pf = vsi->back; + + if (pf->hw_features & I40E_HW_RSS_AQ_CAPABLE) + return i40e_config_rss_aq(vsi, seed, lut, lut_size); + else + return i40e_config_rss_reg(vsi, seed, lut, lut_size); +} + +/** + * i40e_get_rss - Get RSS keys and lut + * @vsi: Pointer to VSI structure + * @seed: Buffer to store the keys + * @lut: Buffer to store the lookup table entries + * lut_size: Size of buffer to store the lookup table entries + * + * Returns 0 on success, negative on failure + */ +int i40e_get_rss(struct i40e_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size) +{ + struct i40e_pf *pf = vsi->back; + + if (pf->hw_features & 
I40E_HW_RSS_AQ_CAPABLE) + return i40e_get_rss_aq(vsi, seed, lut, lut_size); + else + return i40e_get_rss_reg(vsi, seed, lut, lut_size); +} + +/** + * i40e_fill_rss_lut - Fill the RSS lookup table with default values + * @pf: Pointer to board private structure + * @lut: Lookup table + * @rss_table_size: Lookup table size + * @rss_size: Range of queue number for hashing + */ +void i40e_fill_rss_lut(struct i40e_pf *pf, u8 *lut, + u16 rss_table_size, u16 rss_size) +{ + u16 i; + + for (i = 0; i < rss_table_size; i++) + lut[i] = i % rss_size; +} + +/** + * i40e_pf_config_rss - Prepare for RSS if used + * @pf: board private structure + **/ +static int i40e_pf_config_rss(struct i40e_pf *pf) +{ + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + u8 seed[I40E_HKEY_ARRAY_SIZE]; + u8 *lut; + struct i40e_hw *hw = &pf->hw; + u32 reg_val; + u64 hena; + int ret; + + /* By default we enable TCP/UDP with IPv4/IPv6 ptypes */ + hena = (u64)i40e_read_rx_ctl(hw, I40E_PFQF_HENA(0)) | + ((u64)i40e_read_rx_ctl(hw, I40E_PFQF_HENA(1)) << 32); + hena |= i40e_pf_get_default_rss_hena(pf); + + i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), (u32)hena); + i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), (u32)(hena >> 32)); + + /* Determine the RSS table size based on the hardware capabilities */ + reg_val = i40e_read_rx_ctl(hw, I40E_PFQF_CTL_0); + reg_val = (pf->rss_table_size == 512) ? + (reg_val | I40E_PFQF_CTL_0_HASHLUTSIZE_512) : + (reg_val & ~I40E_PFQF_CTL_0_HASHLUTSIZE_512); + i40e_write_rx_ctl(hw, I40E_PFQF_CTL_0, reg_val); + + /* Determine the RSS size of the VSI */ + if (!vsi->rss_size) { + u16 qcount; + /* If the firmware does something weird during VSI init, we + * could end up with zero TCs. Check for that to avoid + * divide-by-zero. It probably won't pass traffic, but it also + * won't panic. + */ + qcount = vsi->num_queue_pairs / + (vsi->tc_config.numtc ? vsi->tc_config.numtc : 1); + vsi->rss_size = min_t(int, pf->alloc_rss_size, qcount); + } + if (!vsi->rss_size) + return -EINVAL; + + lut = kzalloc(vsi->rss_table_size, GFP_KERNEL); + if (!lut) + return -ENOMEM; + + /* Use user configured lut if there is one, otherwise use default */ + if (vsi->rss_lut_user) + memcpy(lut, vsi->rss_lut_user, vsi->rss_table_size); + else + i40e_fill_rss_lut(pf, lut, vsi->rss_table_size, vsi->rss_size); + + /* Use user configured hash key if there is one, otherwise + * use default. + */ + if (vsi->rss_hkey_user) + memcpy(seed, vsi->rss_hkey_user, I40E_HKEY_ARRAY_SIZE); + else + netdev_rss_key_fill((void *)seed, I40E_HKEY_ARRAY_SIZE); + ret = i40e_config_rss(vsi, seed, lut, vsi->rss_table_size); + kfree(lut); + + return ret; +} + +/** + * i40e_reconfig_rss_queues - change number of queues for rss and rebuild + * @pf: board private structure + * @queue_count: the requested queue count for rss. + * + * returns 0 if rss is not enabled, if enabled returns the final rss queue + * count which may be different from the requested queue count. + * Note: expects to be called while under rtnl_lock() + **/ +int i40e_reconfig_rss_queues(struct i40e_pf *pf, int queue_count) +{ + struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi]; + int new_rss_size; + + if (!(pf->flags & I40E_FLAG_RSS_ENABLED)) + return 0; + + new_rss_size = min_t(int, queue_count, pf->rss_size_max); + + if (queue_count != vsi->num_queue_pairs) { + u16 qcount; + + vsi->req_queue_pairs = queue_count; + i40e_prep_for_reset(pf, true); + + pf->alloc_rss_size = new_rss_size; + + i40e_reset_and_rebuild(pf, true, true); + + /* Discard the user configured hash keys and lut, if less + * queues are enabled. 
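i40e_fill_rss_lut() above spreads hash buckets over the active queues with a simple modulo. The sketch below fills and prints a small table the same way; the table and queue counts are example values rather than the hardware's 128- or 512-entry sizes.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	enum { TABLE_SIZE = 16 };	/* example size, not the HW table size */
	uint8_t lut[TABLE_SIZE];
	uint16_t rss_size = 6;		/* number of queues to hash across */

	for (uint16_t i = 0; i < TABLE_SIZE; i++)
		lut[i] = (uint8_t)(i % rss_size);	/* bucket -> queue */

	for (uint16_t i = 0; i < TABLE_SIZE; i++)
		printf("%u%c", (unsigned)lut[i],
		       i == TABLE_SIZE - 1 ? '\n' : ' ');
	return 0;
}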
+ */ + if (queue_count < vsi->rss_size) { + i40e_clear_rss_config_user(vsi); + dev_dbg(&pf->pdev->dev, + "discard user configured hash keys and lut\n"); + } + + /* Reset vsi->rss_size, as number of enabled queues changed */ + qcount = vsi->num_queue_pairs / vsi->tc_config.numtc; + vsi->rss_size = min_t(int, pf->alloc_rss_size, qcount); + + i40e_pf_config_rss(pf); + } + dev_info(&pf->pdev->dev, "User requested queue count/HW max RSS count: %d/%d\n", + vsi->req_queue_pairs, pf->rss_size_max); + return pf->alloc_rss_size; +} + +/** + * i40e_get_partition_bw_setting - Retrieve BW settings for this PF partition + * @pf: board private structure + **/ +i40e_status i40e_get_partition_bw_setting(struct i40e_pf *pf) +{ + i40e_status status; + bool min_valid, max_valid; + u32 max_bw, min_bw; + + status = i40e_read_bw_from_alt_ram(&pf->hw, &max_bw, &min_bw, + &min_valid, &max_valid); + + if (!status) { + if (min_valid) + pf->min_bw = min_bw; + if (max_valid) + pf->max_bw = max_bw; + } + + return status; +} + +/** + * i40e_set_partition_bw_setting - Set BW settings for this PF partition + * @pf: board private structure + **/ +i40e_status i40e_set_partition_bw_setting(struct i40e_pf *pf) +{ + struct i40e_aqc_configure_partition_bw_data bw_data; + i40e_status status; + + /* Set the valid bit for this PF */ + bw_data.pf_valid_bits = cpu_to_le16(BIT(pf->hw.pf_id)); + bw_data.max_bw[pf->hw.pf_id] = pf->max_bw & I40E_ALT_BW_VALUE_MASK; + bw_data.min_bw[pf->hw.pf_id] = pf->min_bw & I40E_ALT_BW_VALUE_MASK; + + /* Set the new bandwidths */ + status = i40e_aq_configure_partition_bw(&pf->hw, &bw_data, NULL); + + return status; +} + +/** + * i40e_commit_partition_bw_setting - Commit BW settings for this PF partition + * @pf: board private structure + **/ +i40e_status i40e_commit_partition_bw_setting(struct i40e_pf *pf) +{ + /* Commit temporary BW setting to permanent NVM image */ + enum i40e_admin_queue_err last_aq_status; + i40e_status ret; + u16 nvm_word; + + if (pf->hw.partition_id != 1) { + dev_info(&pf->pdev->dev, + "Commit BW only works on partition 1! 
This is partition %d", + pf->hw.partition_id); + ret = I40E_NOT_SUPPORTED; + goto bw_commit_out; + } + + /* Acquire NVM for read access */ + ret = i40e_acquire_nvm(&pf->hw, I40E_RESOURCE_READ); + last_aq_status = pf->hw.aq.asq_last_status; + if (ret) { + dev_info(&pf->pdev->dev, + "Cannot acquire NVM for read access, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, last_aq_status)); + goto bw_commit_out; + } + + /* Read word 0x10 of NVM - SW compatibility word 1 */ + ret = i40e_aq_read_nvm(&pf->hw, + I40E_SR_NVM_CONTROL_WORD, + 0x10, sizeof(nvm_word), &nvm_word, + false, NULL); + /* Save off last admin queue command status before releasing + * the NVM + */ + last_aq_status = pf->hw.aq.asq_last_status; + i40e_release_nvm(&pf->hw); + if (ret) { + dev_info(&pf->pdev->dev, "NVM read error, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, last_aq_status)); + goto bw_commit_out; + } + + /* Wait a bit for NVM release to complete */ + msleep(50); + + /* Acquire NVM for write access */ + ret = i40e_acquire_nvm(&pf->hw, I40E_RESOURCE_WRITE); + last_aq_status = pf->hw.aq.asq_last_status; + if (ret) { + dev_info(&pf->pdev->dev, + "Cannot acquire NVM for write access, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, last_aq_status)); + goto bw_commit_out; + } + /* Write it back out unchanged to initiate update NVM, + * which will force a write of the shadow (alt) RAM to + * the NVM - thus storing the bandwidth values permanently. + */ + ret = i40e_aq_update_nvm(&pf->hw, + I40E_SR_NVM_CONTROL_WORD, + 0x10, sizeof(nvm_word), + &nvm_word, true, 0, NULL); + /* Save off last admin queue command status before releasing + * the NVM + */ + last_aq_status = pf->hw.aq.asq_last_status; + i40e_release_nvm(&pf->hw); + if (ret) + dev_info(&pf->pdev->dev, + "BW settings NOT SAVED, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, last_aq_status)); +bw_commit_out: + + return ret; +} + +/** + * i40e_sw_init - Initialize general software structures (struct i40e_pf) + * @pf: board private structure to initialize + * + * i40e_sw_init initializes the Adapter private data structure. + * Fields are initialized based on PCI device information and + * OS network device settings (MTU size). 
+ **/ +static int i40e_sw_init(struct i40e_pf *pf) +{ + int err = 0; + int size; + + /* Set default capability flags */ + pf->flags = I40E_FLAG_RX_CSUM_ENABLED | + I40E_FLAG_MSI_ENABLED | + I40E_FLAG_MSIX_ENABLED; + + /* Set default ITR */ + pf->rx_itr_default = I40E_ITR_RX_DEF; + pf->tx_itr_default = I40E_ITR_TX_DEF; + + /* Depending on PF configurations, it is possible that the RSS + * maximum might end up larger than the available queues + */ + pf->rss_size_max = BIT(pf->hw.func_caps.rss_table_entry_width); + pf->alloc_rss_size = 1; + pf->rss_table_size = pf->hw.func_caps.rss_table_size; + pf->rss_size_max = min_t(int, pf->rss_size_max, + pf->hw.func_caps.num_tx_qp); + if (pf->hw.func_caps.rss) { + pf->flags |= I40E_FLAG_RSS_ENABLED; + pf->alloc_rss_size = min_t(int, pf->rss_size_max, + num_online_cpus()); + } + + /* MFP mode enabled */ + if (pf->hw.func_caps.npar_enable || pf->hw.func_caps.flex10_enable) { + pf->flags |= I40E_FLAG_MFP_ENABLED; + dev_info(&pf->pdev->dev, "MFP mode Enabled\n"); + if (i40e_get_partition_bw_setting(pf)) { + dev_warn(&pf->pdev->dev, + "Could not get partition bw settings\n"); + } else { + dev_info(&pf->pdev->dev, + "Partition BW Min = %8.8x, Max = %8.8x\n", + pf->min_bw, pf->max_bw); + + /* nudge the Tx scheduler */ + i40e_set_partition_bw_setting(pf); + } + } + + if ((pf->hw.func_caps.fd_filters_guaranteed > 0) || + (pf->hw.func_caps.fd_filters_best_effort > 0)) { + pf->flags |= I40E_FLAG_FD_ATR_ENABLED; + pf->atr_sample_rate = I40E_DEFAULT_ATR_SAMPLE_RATE; + if (pf->flags & I40E_FLAG_MFP_ENABLED && + pf->hw.num_partitions > 1) + dev_info(&pf->pdev->dev, + "Flow Director Sideband mode Disabled in MFP mode\n"); + else + pf->flags |= I40E_FLAG_FD_SB_ENABLED; + pf->fdir_pf_filter_count = + pf->hw.func_caps.fd_filters_guaranteed; + pf->hw.fdir_shared_filter_count = + pf->hw.func_caps.fd_filters_best_effort; + } + + if (pf->hw.mac.type == I40E_MAC_X722) { + pf->hw_features |= (I40E_HW_RSS_AQ_CAPABLE | + I40E_HW_128_QP_RSS_CAPABLE | + I40E_HW_ATR_EVICT_CAPABLE | + I40E_HW_WB_ON_ITR_CAPABLE | + I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE | + I40E_HW_NO_PCI_LINK_CHECK | + I40E_HW_USE_SET_LLDP_MIB | + I40E_HW_GENEVE_OFFLOAD_CAPABLE | + I40E_HW_PTP_L4_CAPABLE | + I40E_HW_WOL_MC_MAGIC_PKT_WAKE | + I40E_HW_OUTER_UDP_CSUM_CAPABLE); + +#define I40E_FDEVICT_PCTYPE_DEFAULT 0xc03 + if (rd32(&pf->hw, I40E_GLQF_FDEVICTENA(1)) != + I40E_FDEVICT_PCTYPE_DEFAULT) { + dev_warn(&pf->pdev->dev, + "FD EVICT PCTYPES are not right, disable FD HW EVICT\n"); + pf->hw_features &= ~I40E_HW_ATR_EVICT_CAPABLE; + } + } else if ((pf->hw.aq.api_maj_ver > 1) || + ((pf->hw.aq.api_maj_ver == 1) && + (pf->hw.aq.api_min_ver > 4))) { + /* Supported in FW API version higher than 1.4 */ + pf->hw_features |= I40E_HW_GENEVE_OFFLOAD_CAPABLE; + } + + /* Enable HW ATR eviction if possible */ + if (pf->hw_features & I40E_HW_ATR_EVICT_CAPABLE) + pf->flags |= I40E_FLAG_HW_ATR_EVICT_ENABLED; + + if ((pf->hw.mac.type == I40E_MAC_XL710) && + (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver < 33)) || + (pf->hw.aq.fw_maj_ver < 4))) { + pf->hw_features |= I40E_HW_RESTART_AUTONEG; + /* No DCB support for FW < v4.33 */ + pf->hw_features |= I40E_HW_NO_DCB_SUPPORT; + } + + /* Disable FW LLDP if FW < v4.3 */ + if ((pf->hw.mac.type == I40E_MAC_XL710) && + (((pf->hw.aq.fw_maj_ver == 4) && (pf->hw.aq.fw_min_ver < 3)) || + (pf->hw.aq.fw_maj_ver < 4))) + pf->hw_features |= I40E_HW_STOP_FW_LLDP; + + /* Use the FW Set LLDP MIB API if FW > v4.40 */ + if ((pf->hw.mac.type == I40E_MAC_XL710) && + (((pf->hw.aq.fw_maj_ver == 4) && 
(pf->hw.aq.fw_min_ver >= 40)) || + (pf->hw.aq.fw_maj_ver >= 5))) + pf->hw_features |= I40E_HW_USE_SET_LLDP_MIB; + + /* Enable PTP L4 if FW > v6.0 */ + if (pf->hw.mac.type == I40E_MAC_XL710 && + pf->hw.aq.fw_maj_ver >= 6) + pf->hw_features |= I40E_HW_PTP_L4_CAPABLE; + + if (pf->hw.func_caps.vmdq && num_online_cpus() != 1) { + pf->num_vmdq_vsis = I40E_DEFAULT_NUM_VMDQ_VSI; + pf->flags |= I40E_FLAG_VMDQ_ENABLED; + pf->num_vmdq_qps = i40e_default_queues_per_vmdq(pf); + } + + if (pf->hw.func_caps.iwarp && num_online_cpus() != 1) { + pf->flags |= I40E_FLAG_IWARP_ENABLED; + /* IWARP needs one extra vector for CQP just like MISC.*/ + pf->num_iwarp_msix = (int)num_online_cpus() + 1; + } + /* Stopping the FW LLDP engine is only supported on the + * XL710 with a FW ver >= 1.7. Also, stopping FW LLDP + * engine is not supported if NPAR is functioning on this + * part + */ + if (pf->hw.mac.type == I40E_MAC_XL710 && + !pf->hw.func_caps.npar_enable && + (pf->hw.aq.api_maj_ver > 1 || + (pf->hw.aq.api_maj_ver == 1 && pf->hw.aq.api_min_ver > 6))) + pf->hw_features |= I40E_HW_STOPPABLE_FW_LLDP; + +#ifdef CONFIG_PCI_IOV + if (pf->hw.func_caps.num_vfs && pf->hw.partition_id == 1) { + pf->num_vf_qps = I40E_DEFAULT_QUEUES_PER_VF; + pf->flags |= I40E_FLAG_SRIOV_ENABLED; + pf->num_req_vfs = min_t(int, + pf->hw.func_caps.num_vfs, + I40E_MAX_VF_COUNT); + } +#endif /* CONFIG_PCI_IOV */ + pf->eeprom_version = 0xDEAD; + pf->lan_veb = I40E_NO_VEB; + pf->lan_vsi = I40E_NO_VSI; + + /* By default FW has this off for performance reasons */ + pf->flags &= ~I40E_FLAG_VEB_STATS_ENABLED; + + /* set up queue assignment tracking */ + size = sizeof(struct i40e_lump_tracking) + + (sizeof(u16) * pf->hw.func_caps.num_tx_qp); + pf->qp_pile = kzalloc(size, GFP_KERNEL); + if (!pf->qp_pile) { + err = -ENOMEM; + goto sw_init_done; + } + pf->qp_pile->num_entries = pf->hw.func_caps.num_tx_qp; + pf->qp_pile->search_hint = 0; + + pf->tx_timeout_recovery_level = 1; + + mutex_init(&pf->switch_mutex); + +sw_init_done: + return err; +} + +/** + * i40e_set_ntuple - set the ntuple feature flag and take action + * @pf: board private structure to initialize + * @features: the feature set that the stack is suggesting + * + * returns a bool to indicate if reset needs to happen + **/ +bool i40e_set_ntuple(struct i40e_pf *pf, netdev_features_t features) +{ + bool need_reset = false; + + /* Check if Flow Director n-tuple support was enabled or disabled. If + * the state changed, we need to reset. + */ + if (features & NETIF_F_NTUPLE) { + /* Enable filters and mark for reset */ + if (!(pf->flags & I40E_FLAG_FD_SB_ENABLED)) + need_reset = true; + /* enable FD_SB only if there is MSI-X vector and no cloud + * filters exist + */ + if (pf->num_fdsb_msix > 0 && !pf->num_cloud_filters) { + pf->flags |= I40E_FLAG_FD_SB_ENABLED; + pf->flags &= ~I40E_FLAG_FD_SB_INACTIVE; + } + } else { + /* turn off filters, mark for reset and clear SW filter list */ + if (pf->flags & I40E_FLAG_FD_SB_ENABLED) { + need_reset = true; + i40e_fdir_filter_exit(pf); + } + pf->flags &= ~I40E_FLAG_FD_SB_ENABLED; + clear_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state); + pf->flags |= I40E_FLAG_FD_SB_INACTIVE; + + /* reset fd counters */ + pf->fd_add_err = 0; + pf->fd_atr_cnt = 0; + /* if ATR was auto disabled it can be re-enabled. 
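The qp_pile setup above sizes a single allocation as the tracking header plus a trailing array of u16 entries. The sketch below shows the same header-plus-trailing-array idiom in userspace; struct lump_tracking is a simplified stand-in for struct i40e_lump_tracking.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct lump_tracking {
	uint16_t num_entries;
	uint16_t search_hint;
	uint16_t list[];	/* trailing per-entry array */
};

int main(void)
{
	uint16_t num_tx_qp = 64;	/* example queue count */
	struct lump_tracking *pile;

	/* one zeroed allocation covers the header and all entries */
	pile = calloc(1, sizeof(*pile) + sizeof(uint16_t) * num_tx_qp);
	if (!pile)
		return 1;
	pile->num_entries = num_tx_qp;
	pile->search_hint = 0;

	printf("pile tracks %u entries\n", (unsigned)pile->num_entries);
	free(pile);
	return 0;
}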
*/ + if (test_and_clear_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state)) + if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) && + (I40E_DEBUG_FD & pf->hw.debug_mask)) + dev_info(&pf->pdev->dev, "ATR re-enabled.\n"); + } + return need_reset; +} + +/** + * i40e_clear_rss_lut - clear the rx hash lookup table + * @vsi: the VSI being configured + **/ +static void i40e_clear_rss_lut(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + u16 vf_id = vsi->vf_id; + u8 i; + + if (vsi->type == I40E_VSI_MAIN) { + for (i = 0; i <= I40E_PFQF_HLUT_MAX_INDEX; i++) + wr32(hw, I40E_PFQF_HLUT(i), 0); + } else if (vsi->type == I40E_VSI_SRIOV) { + for (i = 0; i <= I40E_VFQF_HLUT_MAX_INDEX; i++) + i40e_write_rx_ctl(hw, I40E_VFQF_HLUT1(i, vf_id), 0); + } else { + dev_err(&pf->pdev->dev, "Cannot set RSS LUT - invalid VSI type\n"); + } +} + +/** + * i40e_set_features - set the netdev feature flags + * @netdev: ptr to the netdev being adjusted + * @features: the feature set that the stack is suggesting + * Note: expects to be called while under rtnl_lock() + **/ +static int i40e_set_features(struct net_device *netdev, + netdev_features_t features) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + bool need_reset; + + if (features & NETIF_F_RXHASH && !(netdev->features & NETIF_F_RXHASH)) + i40e_pf_config_rss(pf); + else if (!(features & NETIF_F_RXHASH) && + netdev->features & NETIF_F_RXHASH) + i40e_clear_rss_lut(vsi); + + if (features & NETIF_F_HW_VLAN_CTAG_RX) + i40e_vlan_stripping_enable(vsi); + else + i40e_vlan_stripping_disable(vsi); + + if (!(features & NETIF_F_HW_TC) && pf->num_cloud_filters) { + dev_err(&pf->pdev->dev, + "Offloaded tc filters active, can't turn hw_tc_offload off"); + return -EINVAL; + } + + need_reset = i40e_set_ntuple(pf, features); + + if (need_reset) + i40e_do_reset(pf, I40E_PF_RESET_FLAG, true); + + return 0; +} + +/** + * i40e_get_udp_port_idx - Lookup a possibly offloaded for Rx UDP port + * @pf: board private structure + * @port: The UDP port to look up + * + * Returns the index number or I40E_MAX_PF_UDP_OFFLOAD_PORTS if port not found + **/ +static u8 i40e_get_udp_port_idx(struct i40e_pf *pf, u16 port) +{ + u8 i; + + for (i = 0; i < I40E_MAX_PF_UDP_OFFLOAD_PORTS; i++) { + if (pf->udp_ports[i].port == port) + return i; + } + + return i; +} + +/** + * i40e_udp_tunnel_add - Get notifications about UDP tunnel ports that come up + * @netdev: This physical port's netdev + * @ti: Tunnel endpoint information + **/ +static void i40e_udp_tunnel_add(struct net_device *netdev, + struct udp_tunnel_info *ti) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + u16 port = ntohs(ti->port); + u8 next_idx; + u8 idx; + + idx = i40e_get_udp_port_idx(pf, port); + + /* Check if port already exists */ + if (idx < I40E_MAX_PF_UDP_OFFLOAD_PORTS) { + netdev_info(netdev, "port %d already offloaded\n", port); + return; + } + + /* Now check if there is space to add the new port */ + next_idx = i40e_get_udp_port_idx(pf, 0); + + if (next_idx == I40E_MAX_PF_UDP_OFFLOAD_PORTS) { + netdev_info(netdev, "maximum number of offloaded UDP ports reached, not adding port %d\n", + port); + return; + } + + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + pf->udp_ports[next_idx].type = I40E_AQC_TUNNEL_TYPE_VXLAN; + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (!(pf->hw_features & I40E_HW_GENEVE_OFFLOAD_CAPABLE)) + return; + pf->udp_ports[next_idx].type = 
I40E_AQC_TUNNEL_TYPE_NGE; + break; + default: + return; + } + + /* New port: add it and mark its index in the bitmap */ + pf->udp_ports[next_idx].port = port; + pf->pending_udp_bitmap |= BIT_ULL(next_idx); + set_bit(__I40E_UDP_FILTER_SYNC_PENDING, pf->state); +} + +/** + * i40e_udp_tunnel_del - Get notifications about UDP tunnel ports that go away + * @netdev: This physical port's netdev + * @ti: Tunnel endpoint information + **/ +static void i40e_udp_tunnel_del(struct net_device *netdev, + struct udp_tunnel_info *ti) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + u16 port = ntohs(ti->port); + u8 idx; + + idx = i40e_get_udp_port_idx(pf, port); + + /* Check if port already exists */ + if (idx >= I40E_MAX_PF_UDP_OFFLOAD_PORTS) + goto not_found; + + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + if (pf->udp_ports[idx].type != I40E_AQC_TUNNEL_TYPE_VXLAN) + goto not_found; + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (pf->udp_ports[idx].type != I40E_AQC_TUNNEL_TYPE_NGE) + goto not_found; + break; + default: + goto not_found; + } + + /* if port exists, set it to 0 (mark for deletion) + * and make it pending + */ + pf->udp_ports[idx].port = 0; + pf->pending_udp_bitmap |= BIT_ULL(idx); + set_bit(__I40E_UDP_FILTER_SYNC_PENDING, pf->state); + + return; +not_found: + netdev_warn(netdev, "UDP port %d was not found, not deleting\n", + port); +} + +static int i40e_get_phys_port_id(struct net_device *netdev, + struct netdev_phys_item_id *ppid) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_hw *hw = &pf->hw; + + if (!(pf->hw_features & I40E_HW_PORT_ID_VALID)) + return -EOPNOTSUPP; + + ppid->id_len = min_t(int, sizeof(hw->mac.port_addr), sizeof(ppid->id)); + memcpy(ppid->id, hw->mac.port_addr, ppid->id_len); + + return 0; +} + +/** + * i40e_ndo_fdb_add - add an entry to the hardware database + * @ndm: the input from the stack + * @tb: pointer to array of nladdr (unused) + * @dev: the net device pointer + * @addr: the MAC address entry being added + * @flags: instructions from stack about fdb operation + */ +static int i40e_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid, + u16 flags) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + struct i40e_pf *pf = np->vsi->back; + int err = 0; + + if (!(pf->flags & I40E_FLAG_SRIOV_ENABLED)) + return -EOPNOTSUPP; + + if (vid) { + pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name); + return -EINVAL; + } + + /* Hardware does not support aging addresses so if a + * ndm_state is given only allow permanent addresses + */ + if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { + netdev_info(dev, "FDB only supports static addresses\n"); + return -EINVAL; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_add_excl(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_add_excl(dev, addr); + else + err = -EINVAL; + + /* Only return duplicate errors if NLM_F_EXCL is set */ + if (err == -EEXIST && !(flags & NLM_F_EXCL)) + err = 0; + + return err; +} + +/** + * i40e_ndo_bridge_setlink - Set the hardware bridge mode + * @dev: the netdev being configured + * @nlh: RTNL message + * + * Inserts a new hardware bridge if not already created and + * enables the bridging mode requested (VEB or VEPA). 
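/*
 * Editor's note -- illustrative sketch, not part of this patch. The UDP
 * tunnel add/del handlers earlier in this hunk never touch the admin queue
 * directly: they only edit a small port table and set a bit in
 * pf->pending_udp_bitmap, and a service task pushes the delta to firmware
 * later. The standalone model below uses invented names (port_table, pending,
 * MAX_PORTS) to show that "record now, sync later" bookkeeping, assuming a
 * 64-bit pending bitmap.
 */
#include <stdint.h>

#define MAX_PORTS 16

static uint16_t port_table[MAX_PORTS];
static uint64_t pending;	/* bit i set => slot i needs a firmware sync */

static int port_idx(uint16_t port)	/* returns MAX_PORTS when not found */
{
	int i;

	for (i = 0; i < MAX_PORTS; i++)
		if (port_table[i] == port)
			return i;
	return i;
}

static void port_add(uint16_t port)
{
	int free_slot = port_idx(0);	/* a zero entry marks a free slot */

	if (port_idx(port) < MAX_PORTS || free_slot == MAX_PORTS)
		return;			/* duplicate, or table is full */
	port_table[free_slot] = port;
	pending |= 1ULL << free_slot;	/* defer the actual HW update */
}

static void port_del(uint16_t port)
{
	int idx = port_idx(port);

	if (idx == MAX_PORTS)
		return;			/* nothing to delete */
	port_table[idx] = 0;		/* zero means "remove on next sync" */
	pending |= 1ULL << idx;
}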
If the + * hardware bridge has already been inserted and the request + * is to change the mode then that requires a PF reset to + * allow rebuild of the components with required hardware + * bridge mode enabled. + * + * Note: expects to be called while under rtnl_lock() + **/ +static int i40e_ndo_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, + u16 flags) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_veb *veb = NULL; + struct nlattr *attr, *br_spec; + int i, rem; + + /* Only for PF VSI for now */ + if (vsi->seid != pf->vsi[pf->lan_vsi]->seid) + return -EOPNOTSUPP; + + /* Find the HW bridge for PF VSI */ + for (i = 0; i < I40E_MAX_VEB && !veb; i++) { + if (pf->veb[i] && pf->veb[i]->seid == vsi->uplink_seid) + veb = pf->veb[i]; + } + + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + + nla_for_each_nested(attr, br_spec, rem) { + __u16 mode; + + if (nla_type(attr) != IFLA_BRIDGE_MODE) + continue; + + mode = nla_get_u16(attr); + if ((mode != BRIDGE_MODE_VEPA) && + (mode != BRIDGE_MODE_VEB)) + return -EINVAL; + + /* Insert a new HW bridge */ + if (!veb) { + veb = i40e_veb_setup(pf, 0, vsi->uplink_seid, vsi->seid, + vsi->tc_config.enabled_tc); + if (veb) { + veb->bridge_mode = mode; + i40e_config_bridge_mode(veb); + } else { + /* No Bridge HW offload available */ + return -ENOENT; + } + break; + } else if (mode != veb->bridge_mode) { + /* Existing HW bridge but different mode needs reset */ + veb->bridge_mode = mode; + /* TODO: If no VFs or VMDq VSIs, disallow VEB mode */ + if (mode == BRIDGE_MODE_VEB) + pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; + else + pf->flags &= ~I40E_FLAG_VEB_MODE_ENABLED; + i40e_do_reset(pf, I40E_PF_RESET_FLAG, true); + break; + } + } + + return 0; +} + +/** + * i40e_ndo_bridge_getlink - Get the hardware bridge mode + * @skb: skb buff + * @pid: process id + * @seq: RTNL message seq # + * @dev: the netdev being configured + * @filter_mask: unused + * @nlflags: netlink flags passed in + * + * Return the mode in which the hardware bridge is operating in + * i.e VEB or VEPA. + **/ +static int i40e_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, + u32 __always_unused filter_mask, + int nlflags) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_veb *veb = NULL; + int i; + + /* Only for PF VSI for now */ + if (vsi->seid != pf->vsi[pf->lan_vsi]->seid) + return -EOPNOTSUPP; + + /* Find the HW bridge for the PF VSI */ + for (i = 0; i < I40E_MAX_VEB && !veb; i++) { + if (pf->veb[i] && pf->veb[i]->seid == vsi->uplink_seid) + veb = pf->veb[i]; + } + + if (!veb) + return 0; + + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, veb->bridge_mode, + 0, 0, nlflags, filter_mask, NULL); +} + +/** + * i40e_features_check - Validate encapsulated packet conforms to limits + * @skb: skb buff + * @dev: This physical port's netdev + * @features: Offload features that the stack believes apply + **/ +static netdev_features_t i40e_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + size_t len; + + /* No point in doing any of this if neither checksum nor GSO are + * being requested for this frame. We can rule out both by just + * checking for CHECKSUM_PARTIAL + */ + if (skb->ip_summed != CHECKSUM_PARTIAL) + return features; + + /* We cannot support GSO if the MSS is going to be less than + * 64 bytes. 
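/*
 * Editor's note -- illustrative sketch, not part of this patch. The setlink
 * handler above only forces a PF reset when an existing hardware bridge has
 * to flip between VEB and VEPA; creating the first bridge simply records the
 * requested mode. The toy helper below (invented names: enum br_mode,
 * struct sw_bridge, bridge_set_mode) isolates that decision.
 */
enum br_mode { BR_NONE, BR_VEPA, BR_VEB };

struct sw_bridge {
	enum br_mode mode;	/* BR_NONE until a bridge exists */
};

/* Returns 1 when the caller must reset/rebuild, 0 when not, -1 on bad mode */
static int bridge_set_mode(struct sw_bridge *br, enum br_mode want)
{
	if (want != BR_VEPA && want != BR_VEB)
		return -1;
	if (br->mode == BR_NONE) {	/* first bridge: just adopt the mode */
		br->mode = want;
		return 0;
	}
	if (br->mode == want)		/* no change requested */
		return 0;
	br->mode = want;		/* VEB <-> VEPA flip needs a rebuild */
	return 1;
}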
If it is then we need to drop support for GSO. + */ + if (skb_is_gso(skb) && (skb_shinfo(skb)->gso_size < 64)) + features &= ~NETIF_F_GSO_MASK; + + /* MACLEN can support at most 63 words */ + len = skb_network_header(skb) - skb->data; + if (len & ~(63 * 2)) + goto out_err; + + /* IPLEN and EIPLEN can support at most 127 dwords */ + len = skb_transport_header(skb) - skb_network_header(skb); + if (len & ~(127 * 4)) + goto out_err; + + if (skb->encapsulation) { + /* L4TUNLEN can support 127 words */ + len = skb_inner_network_header(skb) - skb_transport_header(skb); + if (len & ~(127 * 2)) + goto out_err; + + /* IPLEN can support at most 127 dwords */ + len = skb_inner_transport_header(skb) - + skb_inner_network_header(skb); + if (len & ~(127 * 4)) + goto out_err; + } + + /* No need to validate L4LEN as TCP is the only protocol with a + * a flexible value and we support all possible values supported + * by TCP, which is at most 15 dwords + */ + + return features; +out_err: + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); +} + +/** + * i40e_xdp_setup - add/remove an XDP program + * @vsi: VSI to changed + * @prog: XDP program + **/ +static int i40e_xdp_setup(struct i40e_vsi *vsi, + struct bpf_prog *prog) +{ + int frame_size = vsi->netdev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; + struct i40e_pf *pf = vsi->back; + struct bpf_prog *old_prog; + bool need_reset; + int i; + + /* Don't allow frames that span over multiple buffers */ + if (frame_size > vsi->rx_buf_len) + return -EINVAL; + + if (!i40e_enabled_xdp_vsi(vsi) && !prog) + return 0; + + /* When turning XDP on->off/off->on we reset and rebuild the rings. */ + need_reset = (i40e_enabled_xdp_vsi(vsi) != !!prog); + + if (need_reset) + i40e_prep_for_reset(pf, true); + + old_prog = xchg(&vsi->xdp_prog, prog); + + if (need_reset) + i40e_reset_and_rebuild(pf, true, true); + + for (i = 0; i < vsi->num_queue_pairs; i++) + WRITE_ONCE(vsi->rx_rings[i]->xdp_prog, vsi->xdp_prog); + + if (old_prog) + bpf_prog_put(old_prog); + + return 0; +} + +/** + * i40e_xdp - implements ndo_bpf for i40e + * @dev: netdevice + * @xdp: XDP command + **/ +static int i40e_xdp(struct net_device *dev, + struct netdev_bpf *xdp) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + struct i40e_vsi *vsi = np->vsi; + + if (vsi->type != I40E_VSI_MAIN) + return -EINVAL; + + switch (xdp->command) { + case XDP_SETUP_PROG: + return i40e_xdp_setup(vsi, xdp->prog); + case XDP_QUERY_PROG: + xdp->prog_attached = i40e_enabled_xdp_vsi(vsi); + xdp->prog_id = vsi->xdp_prog ? 
vsi->xdp_prog->aux->id : 0; + return 0; + default: + return -EINVAL; + } +} + +static const struct net_device_ops i40e_netdev_ops = { + .ndo_open = i40e_open, + .ndo_stop = i40e_close, + .ndo_start_xmit = i40e_lan_xmit_frame, + .ndo_get_stats64 = i40e_get_netdev_stats_struct, + .ndo_set_rx_mode = i40e_set_rx_mode, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_mac_address = i40e_set_mac, + .ndo_change_mtu = i40e_change_mtu, + .ndo_do_ioctl = i40e_ioctl, + .ndo_tx_timeout = i40e_tx_timeout, + .ndo_vlan_rx_add_vid = i40e_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = i40e_vlan_rx_kill_vid, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = i40e_netpoll, +#endif + .ndo_setup_tc = __i40e_setup_tc, + .ndo_set_features = i40e_set_features, + .ndo_set_vf_mac = i40e_ndo_set_vf_mac, + .ndo_set_vf_vlan = i40e_ndo_set_vf_port_vlan, + .ndo_set_vf_rate = i40e_ndo_set_vf_bw, + .ndo_get_vf_config = i40e_ndo_get_vf_config, + .ndo_set_vf_link_state = i40e_ndo_set_vf_link_state, + .ndo_set_vf_spoofchk = i40e_ndo_set_vf_spoofchk, + .ndo_set_vf_trust = i40e_ndo_set_vf_trust, + .ndo_udp_tunnel_add = i40e_udp_tunnel_add, + .ndo_udp_tunnel_del = i40e_udp_tunnel_del, + .ndo_get_phys_port_id = i40e_get_phys_port_id, + .ndo_fdb_add = i40e_ndo_fdb_add, + .ndo_features_check = i40e_features_check, + .ndo_bridge_getlink = i40e_ndo_bridge_getlink, + .ndo_bridge_setlink = i40e_ndo_bridge_setlink, + .ndo_bpf = i40e_xdp, + .ndo_xdp_xmit = i40e_xdp_xmit, + .ndo_xdp_flush = i40e_xdp_flush, +}; + +/** + * i40e_config_netdev - Setup the netdev flags + * @vsi: the VSI being configured + * + * Returns 0 on success, negative value on failure + **/ +static int i40e_config_netdev(struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + struct i40e_netdev_priv *np; + struct net_device *netdev; + u8 broadcast[ETH_ALEN]; + u8 mac_addr[ETH_ALEN]; + int etherdev_size; + netdev_features_t hw_enc_features; + netdev_features_t hw_features; + + etherdev_size = sizeof(struct i40e_netdev_priv); + netdev = alloc_etherdev_mq(etherdev_size, vsi->alloc_queue_pairs); + if (!netdev) + return -ENOMEM; + + vsi->netdev = netdev; + np = netdev_priv(netdev); + np->vsi = vsi; + + hw_enc_features = NETIF_F_SG | + NETIF_F_IP_CSUM | + NETIF_F_IPV6_CSUM | + NETIF_F_HIGHDMA | + NETIF_F_SOFT_FEATURES | + NETIF_F_TSO | + NETIF_F_TSO_ECN | + NETIF_F_TSO6 | + NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM | + NETIF_F_GSO_PARTIAL | + NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM | + NETIF_F_SCTP_CRC | + NETIF_F_RXHASH | + NETIF_F_RXCSUM | + 0; + + if (!(pf->hw_features & I40E_HW_OUTER_UDP_CSUM_CAPABLE)) + netdev->gso_partial_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; + + netdev->gso_partial_features |= NETIF_F_GSO_GRE_CSUM; + + netdev->hw_enc_features |= hw_enc_features; + + /* record features VLANs can make use of */ + netdev->vlan_features |= hw_enc_features | NETIF_F_TSO_MANGLEID; + + if (!(pf->flags & I40E_FLAG_MFP_ENABLED)) + netdev->hw_features |= NETIF_F_NTUPLE | NETIF_F_HW_TC; + + hw_features = hw_enc_features | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX; + + netdev->hw_features |= hw_features; + + netdev->features |= hw_features | NETIF_F_HW_VLAN_CTAG_FILTER; + netdev->hw_enc_features |= NETIF_F_TSO_MANGLEID; + + if (vsi->type == I40E_VSI_MAIN) { + SET_NETDEV_DEV(netdev, &pf->pdev->dev); + ether_addr_copy(mac_addr, hw->mac.perm_addr); + /* The following steps are necessary for two reasons. 
First, + * some older NVM configurations load a default MAC-VLAN + * filter that will accept any tagged packet, and we want to + * replace this with a normal filter. Additionally, it is + * possible our MAC address was provided by the platform using + * Open Firmware or similar. + * + * Thus, we need to remove the default filter and install one + * specific to the MAC address. + */ + i40e_rm_default_mac_filter(vsi, mac_addr); + spin_lock_bh(&vsi->mac_filter_hash_lock); + i40e_add_mac_filter(vsi, mac_addr); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + } else { + /* Relate the VSI_VMDQ name to the VSI_MAIN name. Note that we + * are still limited by IFNAMSIZ, but we're adding 'v%d\0' to + * the end, which is 4 bytes long, so force truncation of the + * original name by IFNAMSIZ - 4 + */ + snprintf(netdev->name, IFNAMSIZ, "%.*sv%%d", + IFNAMSIZ - 4, + pf->vsi[pf->lan_vsi]->netdev->name); + random_ether_addr(mac_addr); + + spin_lock_bh(&vsi->mac_filter_hash_lock); + i40e_add_mac_filter(vsi, mac_addr); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + } + + /* Add the broadcast filter so that we initially will receive + * broadcast packets. Note that when a new VLAN is first added the + * driver will convert all filters marked I40E_VLAN_ANY into VLAN + * specific filters as part of transitioning into "vlan" operation. + * When more VLANs are added, the driver will copy each existing MAC + * filter and add it for the new VLAN. + * + * Broadcast filters are handled specially by + * i40e_sync_filters_subtask, as the driver must to set the broadcast + * promiscuous bit instead of adding this directly as a MAC/VLAN + * filter. The subtask will update the correct broadcast promiscuous + * bits as VLANs become active or inactive. + */ + eth_broadcast_addr(broadcast); + spin_lock_bh(&vsi->mac_filter_hash_lock); + i40e_add_mac_filter(vsi, broadcast); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + ether_addr_copy(netdev->dev_addr, mac_addr); + ether_addr_copy(netdev->perm_addr, mac_addr); + + netdev->priv_flags |= IFF_UNICAST_FLT; + netdev->priv_flags |= IFF_SUPP_NOFCS; + /* Setup netdev TC information */ + i40e_vsi_config_netdev_tc(vsi, vsi->tc_config.enabled_tc); + + netdev->netdev_ops = &i40e_netdev_ops; + netdev->watchdog_timeo = 5 * HZ; + i40e_set_ethtool_ops(netdev); + + /* MTU range: 68 - 9706 */ + netdev->min_mtu = ETH_MIN_MTU; + netdev->max_mtu = I40E_MAX_RXBUFFER - I40E_PACKET_HDR_PAD; + + return 0; +} + +/** + * i40e_vsi_delete - Delete a VSI from the switch + * @vsi: the VSI being removed + * + * Returns 0 on success, negative value on failure + **/ +static void i40e_vsi_delete(struct i40e_vsi *vsi) +{ + /* remove default VSI is not allowed */ + if (vsi == vsi->back->vsi[vsi->back->lan_vsi]) + return; + + i40e_aq_delete_element(&vsi->back->hw, vsi->seid, NULL); +} + +/** + * i40e_is_vsi_uplink_mode_veb - Check if the VSI's uplink bridge mode is VEB + * @vsi: the VSI being queried + * + * Returns 1 if HW bridge mode is VEB and return 0 in case of VEPA mode + **/ +int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi) +{ + struct i40e_veb *veb; + struct i40e_pf *pf = vsi->back; + + /* Uplink is not a bridge so default to VEB */ + if (vsi->veb_idx == I40E_NO_VEB) + return 1; + + veb = pf->veb[vsi->veb_idx]; + if (!veb) { + dev_info(&pf->pdev->dev, + "There is no veb associated with the bridge\n"); + return -ENOENT; + } + + /* Uplink is a bridge in VEPA mode */ + if (veb->bridge_mode & BRIDGE_MODE_VEPA) { + return 0; + } else { + /* Uplink is a bridge in VEB mode */ + return 1; + } + + /* VEPA 
is now default bridge, so return 0 */ + return 0; +} + +/** + * i40e_add_vsi - Add a VSI to the switch + * @vsi: the VSI being configured + * + * This initializes a VSI context depending on the VSI type to be added and + * passes it down to the add_vsi aq command. + **/ +static int i40e_add_vsi(struct i40e_vsi *vsi) +{ + int ret = -ENODEV; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + struct i40e_vsi_context ctxt; + struct i40e_mac_filter *f; + struct hlist_node *h; + int bkt; + + u8 enabled_tc = 0x1; /* TC0 enabled */ + int f_count = 0; + + memset(&ctxt, 0, sizeof(ctxt)); + switch (vsi->type) { + case I40E_VSI_MAIN: + /* The PF's main VSI is already setup as part of the + * device initialization, so we'll not bother with + * the add_vsi call, but we will retrieve the current + * VSI context. + */ + ctxt.seid = pf->main_vsi_seid; + ctxt.pf_num = pf->hw.pf_id; + ctxt.vf_num = 0; + ret = i40e_aq_get_vsi_params(&pf->hw, &ctxt, NULL); + ctxt.flags = I40E_AQ_VSI_TYPE_PF; + if (ret) { + dev_info(&pf->pdev->dev, + "couldn't get PF vsi config, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + return -ENOENT; + } + vsi->info = ctxt.info; + vsi->info.valid_sections = 0; + + vsi->seid = ctxt.seid; + vsi->id = ctxt.vsi_number; + + enabled_tc = i40e_pf_get_tc_map(pf); + + /* Source pruning is enabled by default, so the flag is + * negative logic - if it's set, we need to fiddle with + * the VSI to disable source pruning. + */ + if (pf->flags & I40E_FLAG_SOURCE_PRUNING_DISABLED) { + memset(&ctxt, 0, sizeof(ctxt)); + ctxt.seid = pf->main_vsi_seid; + ctxt.pf_num = pf->hw.pf_id; + ctxt.vf_num = 0; + ctxt.info.valid_sections |= + cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID); + ctxt.info.switch_id = + cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_LOCAL_LB); + ret = i40e_aq_update_vsi_params(hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "update vsi failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + ret = -ENOENT; + goto err; + } + } + + /* MFP mode setup queue map and update VSI */ + if ((pf->flags & I40E_FLAG_MFP_ENABLED) && + !(pf->hw.func_caps.iscsi)) { /* NIC type PF */ + memset(&ctxt, 0, sizeof(ctxt)); + ctxt.seid = pf->main_vsi_seid; + ctxt.pf_num = pf->hw.pf_id; + ctxt.vf_num = 0; + i40e_vsi_setup_queue_map(vsi, &ctxt, enabled_tc, false); + ret = i40e_aq_update_vsi_params(hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "update vsi failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + ret = -ENOENT; + goto err; + } + /* update the local VSI info queue map */ + i40e_vsi_update_queue_map(vsi, &ctxt); + vsi->info.valid_sections = 0; + } else { + /* Default/Main VSI is only enabled for TC0 + * reconfigure it to enable all TCs that are + * available on the port in SFP mode. + * For MFP case the iSCSI PF would use this + * flow to enable LAN+iSCSI TC. 
+ */ + ret = i40e_vsi_config_tc(vsi, enabled_tc); + if (ret) { + /* Single TC condition is not fatal, + * message and continue + */ + dev_info(&pf->pdev->dev, + "failed to configure TCs for main VSI tc_map 0x%08x, err %s aq_err %s\n", + enabled_tc, + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + } + } + break; + + case I40E_VSI_FDIR: + ctxt.pf_num = hw->pf_id; + ctxt.vf_num = 0; + ctxt.uplink_seid = vsi->uplink_seid; + ctxt.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + ctxt.flags = I40E_AQ_VSI_TYPE_PF; + if ((pf->flags & I40E_FLAG_VEB_MODE_ENABLED) && + (i40e_is_vsi_uplink_mode_veb(vsi))) { + ctxt.info.valid_sections |= + cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID); + ctxt.info.switch_id = + cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB); + } + i40e_vsi_setup_queue_map(vsi, &ctxt, enabled_tc, true); + break; + + case I40E_VSI_VMDQ2: + ctxt.pf_num = hw->pf_id; + ctxt.vf_num = 0; + ctxt.uplink_seid = vsi->uplink_seid; + ctxt.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + ctxt.flags = I40E_AQ_VSI_TYPE_VMDQ2; + + /* This VSI is connected to VEB so the switch_id + * should be set to zero by default. + */ + if (i40e_is_vsi_uplink_mode_veb(vsi)) { + ctxt.info.valid_sections |= + cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID); + ctxt.info.switch_id = + cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB); + } + + /* Setup the VSI tx/rx queue map for TC0 only for now */ + i40e_vsi_setup_queue_map(vsi, &ctxt, enabled_tc, true); + break; + + case I40E_VSI_SRIOV: + ctxt.pf_num = hw->pf_id; + ctxt.vf_num = vsi->vf_id + hw->func_caps.vf_base_id; + ctxt.uplink_seid = vsi->uplink_seid; + ctxt.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + ctxt.flags = I40E_AQ_VSI_TYPE_VF; + + /* This VSI is connected to VEB so the switch_id + * should be set to zero by default. 
+ */ + if (i40e_is_vsi_uplink_mode_veb(vsi)) { + ctxt.info.valid_sections |= + cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID); + ctxt.info.switch_id = + cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB); + } + + if (vsi->back->flags & I40E_FLAG_IWARP_ENABLED) { + ctxt.info.valid_sections |= + cpu_to_le16(I40E_AQ_VSI_PROP_QUEUE_OPT_VALID); + ctxt.info.queueing_opt_flags |= + (I40E_AQ_VSI_QUE_OPT_TCP_ENA | + I40E_AQ_VSI_QUE_OPT_RSS_LUT_VSI); + } + + ctxt.info.valid_sections |= cpu_to_le16(I40E_AQ_VSI_PROP_VLAN_VALID); + ctxt.info.port_vlan_flags |= I40E_AQ_VSI_PVLAN_MODE_ALL; + if (pf->vf[vsi->vf_id].spoofchk) { + ctxt.info.valid_sections |= + cpu_to_le16(I40E_AQ_VSI_PROP_SECURITY_VALID); + ctxt.info.sec_flags |= + (I40E_AQ_VSI_SEC_FLAG_ENABLE_VLAN_CHK | + I40E_AQ_VSI_SEC_FLAG_ENABLE_MAC_CHK); + } + /* Setup the VSI tx/rx queue map for TC0 only for now */ + i40e_vsi_setup_queue_map(vsi, &ctxt, enabled_tc, true); + break; + + case I40E_VSI_IWARP: + /* send down message to iWARP */ + break; + + default: + return -ENODEV; + } + + if (vsi->type != I40E_VSI_MAIN) { + ret = i40e_aq_add_vsi(hw, &ctxt, NULL); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "add vsi failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + ret = -ENOENT; + goto err; + } + vsi->info = ctxt.info; + vsi->info.valid_sections = 0; + vsi->seid = ctxt.seid; + vsi->id = ctxt.vsi_number; + } + + vsi->active_filters = 0; + clear_bit(__I40E_VSI_OVERFLOW_PROMISC, vsi->state); + spin_lock_bh(&vsi->mac_filter_hash_lock); + /* If macvlan filters already exist, force them to get loaded */ + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + f->state = I40E_FILTER_NEW; + f_count++; + } + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + if (f_count) { + vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; + set_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state); + } + + /* Update VSI BW information */ + ret = i40e_vsi_get_bw_info(vsi); + if (ret) { + dev_info(&pf->pdev->dev, + "couldn't get vsi bw info, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + /* VSI is already added so not tearing that up */ + ret = 0; + } + +err: + return ret; +} + +/** + * i40e_vsi_release - Delete a VSI and free its resources + * @vsi: the VSI being removed + * + * Returns 0 on success or < 0 on error + **/ +int i40e_vsi_release(struct i40e_vsi *vsi) +{ + struct i40e_mac_filter *f; + struct hlist_node *h; + struct i40e_veb *veb = NULL; + struct i40e_pf *pf; + u16 uplink_seid; + int i, n, bkt; + + pf = vsi->back; + + /* release of a VEB-owner or last VSI is not allowed */ + if (vsi->flags & I40E_VSI_FLAG_VEB_OWNER) { + dev_info(&pf->pdev->dev, "VSI %d has existing VEB %d\n", + vsi->seid, vsi->uplink_seid); + return -ENODEV; + } + if (vsi == pf->vsi[pf->lan_vsi] && + !test_bit(__I40E_DOWN, pf->state)) { + dev_info(&pf->pdev->dev, "Can't remove PF VSI\n"); + return -ENODEV; + } + + uplink_seid = vsi->uplink_seid; + if (vsi->type != I40E_VSI_SRIOV) { + if (vsi->netdev_registered) { + vsi->netdev_registered = false; + if (vsi->netdev) { + /* results in a call to i40e_close() */ + unregister_netdev(vsi->netdev); + } + } else { + i40e_vsi_close(vsi); + } + i40e_vsi_disable_irq(vsi); + } + + spin_lock_bh(&vsi->mac_filter_hash_lock); + + /* clear the sync flag on all filters */ + if (vsi->netdev) { + __dev_uc_unsync(vsi->netdev, NULL); + __dev_mc_unsync(vsi->netdev, NULL); + } + + /* make sure any remaining filters are marked for deletion */ + 
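/*
 * Editor's note -- illustrative sketch, not part of this patch. MAC/VLAN
 * filters in this driver are not pushed to hardware from the hot path:
 * callers only flip a per-filter state (new / remove) and raise a
 * sync-pending flag, and i40e_sync_vsi_filters() applies the whole batch
 * later, as in the release path just below. The toy list here (invented
 * names: struct flt, flt_del, flt_sync) models that two-phase pattern.
 */
#include <stdbool.h>
#include <stdlib.h>

enum flt_state { FLT_ACTIVE, FLT_NEW, FLT_REMOVE };

struct flt {
	struct flt *next;
	enum flt_state state;
};

static bool sync_pending;

static void flt_del(struct flt *f)
{
	f->state = FLT_REMOVE;		/* phase 1: just mark it */
	sync_pending = true;
}

static void flt_sync(struct flt **head)	/* phase 2: apply the whole batch */
{
	struct flt **pp = head;

	while (*pp) {
		struct flt *f = *pp;

		if (f->state == FLT_REMOVE) {	/* HW removal would go here */
			*pp = f->next;
			free(f);
			continue;
		}
		f->state = FLT_ACTIVE;		/* HW add would go here */
		pp = &f->next;
	}
	sync_pending = false;
}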
hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) + __i40e_del_filter(vsi, f); + + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + i40e_sync_vsi_filters(vsi); + + i40e_vsi_delete(vsi); + i40e_vsi_free_q_vectors(vsi); + if (vsi->netdev) { + free_netdev(vsi->netdev); + vsi->netdev = NULL; + } + i40e_vsi_clear_rings(vsi); + i40e_vsi_clear(vsi); + + /* If this was the last thing on the VEB, except for the + * controlling VSI, remove the VEB, which puts the controlling + * VSI onto the next level down in the switch. + * + * Well, okay, there's one more exception here: don't remove + * the orphan VEBs yet. We'll wait for an explicit remove request + * from up the network stack. + */ + for (n = 0, i = 0; i < pf->num_alloc_vsi; i++) { + if (pf->vsi[i] && + pf->vsi[i]->uplink_seid == uplink_seid && + (pf->vsi[i]->flags & I40E_VSI_FLAG_VEB_OWNER) == 0) { + n++; /* count the VSIs */ + } + } + for (i = 0; i < I40E_MAX_VEB; i++) { + if (!pf->veb[i]) + continue; + if (pf->veb[i]->uplink_seid == uplink_seid) + n++; /* count the VEBs */ + if (pf->veb[i]->seid == uplink_seid) + veb = pf->veb[i]; + } + if (n == 0 && veb && veb->uplink_seid != 0) + i40e_veb_release(veb); + + return 0; +} + +/** + * i40e_vsi_setup_vectors - Set up the q_vectors for the given VSI + * @vsi: ptr to the VSI + * + * This should only be called after i40e_vsi_mem_alloc() which allocates the + * corresponding SW VSI structure and initializes num_queue_pairs for the + * newly allocated VSI. + * + * Returns 0 on success or negative on failure + **/ +static int i40e_vsi_setup_vectors(struct i40e_vsi *vsi) +{ + int ret = -ENOENT; + struct i40e_pf *pf = vsi->back; + + if (vsi->q_vectors[0]) { + dev_info(&pf->pdev->dev, "VSI %d has existing q_vectors\n", + vsi->seid); + return -EEXIST; + } + + if (vsi->base_vector) { + dev_info(&pf->pdev->dev, "VSI %d has non-zero base vector %d\n", + vsi->seid, vsi->base_vector); + return -EEXIST; + } + + ret = i40e_vsi_alloc_q_vectors(vsi); + if (ret) { + dev_info(&pf->pdev->dev, + "failed to allocate %d q_vector for VSI %d, ret=%d\n", + vsi->num_q_vectors, vsi->seid, ret); + vsi->num_q_vectors = 0; + goto vector_setup_out; + } + + /* In Legacy mode, we do not have to get any other vector since we + * piggyback on the misc/ICR0 for queue interrupts. + */ + if (!(pf->flags & I40E_FLAG_MSIX_ENABLED)) + return ret; + if (vsi->num_q_vectors) + vsi->base_vector = i40e_get_lump(pf, pf->irq_pile, + vsi->num_q_vectors, vsi->idx); + if (vsi->base_vector < 0) { + dev_info(&pf->pdev->dev, + "failed to get tracking for %d vectors for VSI %d, err=%d\n", + vsi->num_q_vectors, vsi->seid, vsi->base_vector); + i40e_vsi_free_q_vectors(vsi); + ret = -ENOENT; + goto vector_setup_out; + } + +vector_setup_out: + return ret; +} + +/** + * i40e_vsi_reinit_setup - return and reallocate resources for a VSI + * @vsi: pointer to the vsi. + * + * This re-allocates a vsi's queue resources. + * + * Returns pointer to the successfully allocated and configured VSI sw struct + * on success, otherwise returns NULL on failure. + **/ +static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi) +{ + u16 alloc_queue_pairs; + struct i40e_pf *pf; + u8 enabled_tc; + int ret; + + if (!vsi) + return NULL; + + pf = vsi->back; + + i40e_put_lump(pf->qp_pile, vsi->base_queue, vsi->idx); + i40e_vsi_clear_rings(vsi); + + i40e_vsi_free_arrays(vsi, false); + i40e_set_num_rings_in_vsi(vsi); + ret = i40e_vsi_alloc_arrays(vsi, false); + if (ret) + goto err_vsi; + + alloc_queue_pairs = vsi->alloc_queue_pairs * + (i40e_enabled_xdp_vsi(vsi) ? 
2 : 1); + + ret = i40e_get_lump(pf, pf->qp_pile, alloc_queue_pairs, vsi->idx); + if (ret < 0) { + dev_info(&pf->pdev->dev, + "failed to get tracking for %d queues for VSI %d err %d\n", + alloc_queue_pairs, vsi->seid, ret); + goto err_vsi; + } + vsi->base_queue = ret; + + /* Update the FW view of the VSI. Force a reset of TC and queue + * layout configurations. + */ + enabled_tc = pf->vsi[pf->lan_vsi]->tc_config.enabled_tc; + pf->vsi[pf->lan_vsi]->tc_config.enabled_tc = 0; + pf->vsi[pf->lan_vsi]->seid = pf->main_vsi_seid; + i40e_vsi_config_tc(pf->vsi[pf->lan_vsi], enabled_tc); + if (vsi->type == I40E_VSI_MAIN) + i40e_rm_default_mac_filter(vsi, pf->hw.mac.perm_addr); + + /* assign it some queues */ + ret = i40e_alloc_rings(vsi); + if (ret) + goto err_rings; + + /* map all of the rings to the q_vectors */ + i40e_vsi_map_rings_to_vectors(vsi); + return vsi; + +err_rings: + i40e_vsi_free_q_vectors(vsi); + if (vsi->netdev_registered) { + vsi->netdev_registered = false; + unregister_netdev(vsi->netdev); + free_netdev(vsi->netdev); + vsi->netdev = NULL; + } + i40e_aq_delete_element(&pf->hw, vsi->seid, NULL); +err_vsi: + i40e_vsi_clear(vsi); + return NULL; +} + +/** + * i40e_vsi_setup - Set up a VSI by a given type + * @pf: board private structure + * @type: VSI type + * @uplink_seid: the switch element to link to + * @param1: usage depends upon VSI type. For VF types, indicates VF id + * + * This allocates the sw VSI structure and its queue resources, then add a VSI + * to the identified VEB. + * + * Returns pointer to the successfully allocated and configure VSI sw struct on + * success, otherwise returns NULL on failure. + **/ +struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type, + u16 uplink_seid, u32 param1) +{ + struct i40e_vsi *vsi = NULL; + struct i40e_veb *veb = NULL; + u16 alloc_queue_pairs; + int ret, i; + int v_idx; + + /* The requested uplink_seid must be either + * - the PF's port seid + * no VEB is needed because this is the PF + * or this is a Flow Director special case VSI + * - seid of an existing VEB + * - seid of a VSI that owns an existing VEB + * - seid of a VSI that doesn't own a VEB + * a new VEB is created and the VSI becomes the owner + * - seid of the PF VSI, which is what creates the first VEB + * this is a special case of the previous + * + * Find which uplink_seid we were given and create a new VEB if needed + */ + for (i = 0; i < I40E_MAX_VEB; i++) { + if (pf->veb[i] && pf->veb[i]->seid == uplink_seid) { + veb = pf->veb[i]; + break; + } + } + + if (!veb && uplink_seid != pf->mac_seid) { + + for (i = 0; i < pf->num_alloc_vsi; i++) { + if (pf->vsi[i] && pf->vsi[i]->seid == uplink_seid) { + vsi = pf->vsi[i]; + break; + } + } + if (!vsi) { + dev_info(&pf->pdev->dev, "no such uplink_seid %d\n", + uplink_seid); + return NULL; + } + + if (vsi->uplink_seid == pf->mac_seid) + veb = i40e_veb_setup(pf, 0, pf->mac_seid, vsi->seid, + vsi->tc_config.enabled_tc); + else if ((vsi->flags & I40E_VSI_FLAG_VEB_OWNER) == 0) + veb = i40e_veb_setup(pf, 0, vsi->uplink_seid, vsi->seid, + vsi->tc_config.enabled_tc); + if (veb) { + if (vsi->seid != pf->vsi[pf->lan_vsi]->seid) { + dev_info(&vsi->back->pdev->dev, + "New VSI creation error, uplink seid of LAN VSI expected.\n"); + return NULL; + } + /* We come up by default in VEPA mode if SRIOV is not + * already enabled, in which case we can't force VEPA + * mode. 
+ */ + if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) { + veb->bridge_mode = BRIDGE_MODE_VEPA; + pf->flags &= ~I40E_FLAG_VEB_MODE_ENABLED; + } + i40e_config_bridge_mode(veb); + } + for (i = 0; i < I40E_MAX_VEB && !veb; i++) { + if (pf->veb[i] && pf->veb[i]->seid == vsi->uplink_seid) + veb = pf->veb[i]; + } + if (!veb) { + dev_info(&pf->pdev->dev, "couldn't add VEB\n"); + return NULL; + } + + vsi->flags |= I40E_VSI_FLAG_VEB_OWNER; + uplink_seid = veb->seid; + } + + /* get vsi sw struct */ + v_idx = i40e_vsi_mem_alloc(pf, type); + if (v_idx < 0) + goto err_alloc; + vsi = pf->vsi[v_idx]; + if (!vsi) + goto err_alloc; + vsi->type = type; + vsi->veb_idx = (veb ? veb->idx : I40E_NO_VEB); + + if (type == I40E_VSI_MAIN) + pf->lan_vsi = v_idx; + else if (type == I40E_VSI_SRIOV) + vsi->vf_id = param1; + /* assign it some queues */ + alloc_queue_pairs = vsi->alloc_queue_pairs * + (i40e_enabled_xdp_vsi(vsi) ? 2 : 1); + + ret = i40e_get_lump(pf, pf->qp_pile, alloc_queue_pairs, vsi->idx); + if (ret < 0) { + dev_info(&pf->pdev->dev, + "failed to get tracking for %d queues for VSI %d err=%d\n", + alloc_queue_pairs, vsi->seid, ret); + goto err_vsi; + } + vsi->base_queue = ret; + + /* get a VSI from the hardware */ + vsi->uplink_seid = uplink_seid; + ret = i40e_add_vsi(vsi); + if (ret) + goto err_vsi; + + switch (vsi->type) { + /* setup the netdev if needed */ + case I40E_VSI_MAIN: + case I40E_VSI_VMDQ2: + ret = i40e_config_netdev(vsi); + if (ret) + goto err_netdev; + ret = register_netdev(vsi->netdev); + if (ret) + goto err_netdev; + vsi->netdev_registered = true; + netif_carrier_off(vsi->netdev); +#ifdef CONFIG_I40E_DCB + /* Setup DCB netlink interface */ + i40e_dcbnl_setup(vsi); +#endif /* CONFIG_I40E_DCB */ + /* fall through */ + + case I40E_VSI_FDIR: + /* set up vectors and rings if needed */ + ret = i40e_vsi_setup_vectors(vsi); + if (ret) + goto err_msix; + + ret = i40e_alloc_rings(vsi); + if (ret) + goto err_rings; + + /* map all of the rings to the q_vectors */ + i40e_vsi_map_rings_to_vectors(vsi); + + i40e_vsi_reset_stats(vsi); + break; + + default: + /* no netdev or rings for the other VSI types */ + break; + } + + if ((pf->hw_features & I40E_HW_RSS_AQ_CAPABLE) && + (vsi->type == I40E_VSI_VMDQ2)) { + ret = i40e_vsi_config_rss(vsi); + } + return vsi; + +err_rings: + i40e_vsi_free_q_vectors(vsi); +err_msix: + if (vsi->netdev_registered) { + vsi->netdev_registered = false; + unregister_netdev(vsi->netdev); + free_netdev(vsi->netdev); + vsi->netdev = NULL; + } +err_netdev: + i40e_aq_delete_element(&pf->hw, vsi->seid, NULL); +err_vsi: + i40e_vsi_clear(vsi); +err_alloc: + return NULL; +} + +/** + * i40e_veb_get_bw_info - Query VEB BW information + * @veb: the veb to query + * + * Query the Tx scheduler BW configuration data for given VEB + **/ +static int i40e_veb_get_bw_info(struct i40e_veb *veb) +{ + struct i40e_aqc_query_switching_comp_ets_config_resp ets_data; + struct i40e_aqc_query_switching_comp_bw_config_resp bw_data; + struct i40e_pf *pf = veb->pf; + struct i40e_hw *hw = &pf->hw; + u32 tc_bw_max; + int ret = 0; + int i; + + ret = i40e_aq_query_switch_comp_bw_config(hw, veb->seid, + &bw_data, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "query veb bw config failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, hw->aq.asq_last_status)); + goto out; + } + + ret = i40e_aq_query_switch_comp_ets_config(hw, veb->seid, + &ets_data, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "query veb bw ets config failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + 
i40e_aq_str(&pf->hw, hw->aq.asq_last_status)); + goto out; + } + + veb->bw_limit = le16_to_cpu(ets_data.port_bw_limit); + veb->bw_max_quanta = ets_data.tc_bw_max; + veb->is_abs_credits = bw_data.absolute_credits_enable; + veb->enabled_tc = ets_data.tc_valid_bits; + tc_bw_max = le16_to_cpu(bw_data.tc_bw_max[0]) | + (le16_to_cpu(bw_data.tc_bw_max[1]) << 16); + for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + veb->bw_tc_share_credits[i] = bw_data.tc_bw_share_credits[i]; + veb->bw_tc_limit_credits[i] = + le16_to_cpu(bw_data.tc_bw_limits[i]); + veb->bw_tc_max_quanta[i] = ((tc_bw_max >> (i*4)) & 0x7); + } + +out: + return ret; +} + +/** + * i40e_veb_mem_alloc - Allocates the next available struct veb in the PF + * @pf: board private structure + * + * On error: returns error code (negative) + * On success: returns vsi index in PF (positive) + **/ +static int i40e_veb_mem_alloc(struct i40e_pf *pf) +{ + int ret = -ENOENT; + struct i40e_veb *veb; + int i; + + /* Need to protect the allocation of switch elements at the PF level */ + mutex_lock(&pf->switch_mutex); + + /* VEB list may be fragmented if VEB creation/destruction has + * been happening. We can afford to do a quick scan to look + * for any free slots in the list. + * + * find next empty veb slot, looping back around if necessary + */ + i = 0; + while ((i < I40E_MAX_VEB) && (pf->veb[i] != NULL)) + i++; + if (i >= I40E_MAX_VEB) { + ret = -ENOMEM; + goto err_alloc_veb; /* out of VEB slots! */ + } + + veb = kzalloc(sizeof(*veb), GFP_KERNEL); + if (!veb) { + ret = -ENOMEM; + goto err_alloc_veb; + } + veb->pf = pf; + veb->idx = i; + veb->enabled_tc = 1; + + pf->veb[i] = veb; + ret = i; +err_alloc_veb: + mutex_unlock(&pf->switch_mutex); + return ret; +} + +/** + * i40e_switch_branch_release - Delete a branch of the switch tree + * @branch: where to start deleting + * + * This uses recursion to find the tips of the branch to be + * removed, deleting until we get back to and can delete this VEB. + **/ +static void i40e_switch_branch_release(struct i40e_veb *branch) +{ + struct i40e_pf *pf = branch->pf; + u16 branch_seid = branch->seid; + u16 veb_idx = branch->idx; + int i; + + /* release any VEBs on this VEB - RECURSION */ + for (i = 0; i < I40E_MAX_VEB; i++) { + if (!pf->veb[i]) + continue; + if (pf->veb[i]->uplink_seid == branch->seid) + i40e_switch_branch_release(pf->veb[i]); + } + + /* Release the VSIs on this VEB, but not the owner VSI. + * + * NOTE: Removing the last VSI on a VEB has the SIDE EFFECT of removing + * the VEB itself, so don't use (*branch) after this loop. + */ + for (i = 0; i < pf->num_alloc_vsi; i++) { + if (!pf->vsi[i]) + continue; + if (pf->vsi[i]->uplink_seid == branch_seid && + (pf->vsi[i]->flags & I40E_VSI_FLAG_VEB_OWNER) == 0) { + i40e_vsi_release(pf->vsi[i]); + } + } + + /* There's one corner case where the VEB might not have been + * removed, so double check it here and remove it if needed. + * This case happens if the veb was created from the debugfs + * commands and no VSIs were added to it. 
+ */ + if (pf->veb[veb_idx]) + i40e_veb_release(pf->veb[veb_idx]); +} + +/** + * i40e_veb_clear - remove veb struct + * @veb: the veb to remove + **/ +static void i40e_veb_clear(struct i40e_veb *veb) +{ + if (!veb) + return; + + if (veb->pf) { + struct i40e_pf *pf = veb->pf; + + mutex_lock(&pf->switch_mutex); + if (pf->veb[veb->idx] == veb) + pf->veb[veb->idx] = NULL; + mutex_unlock(&pf->switch_mutex); + } + + kfree(veb); +} + +/** + * i40e_veb_release - Delete a VEB and free its resources + * @veb: the VEB being removed + **/ +void i40e_veb_release(struct i40e_veb *veb) +{ + struct i40e_vsi *vsi = NULL; + struct i40e_pf *pf; + int i, n = 0; + + pf = veb->pf; + + /* find the remaining VSI and check for extras */ + for (i = 0; i < pf->num_alloc_vsi; i++) { + if (pf->vsi[i] && pf->vsi[i]->uplink_seid == veb->seid) { + n++; + vsi = pf->vsi[i]; + } + } + if (n != 1) { + dev_info(&pf->pdev->dev, + "can't remove VEB %d with %d VSIs left\n", + veb->seid, n); + return; + } + + /* move the remaining VSI to uplink veb */ + vsi->flags &= ~I40E_VSI_FLAG_VEB_OWNER; + if (veb->uplink_seid) { + vsi->uplink_seid = veb->uplink_seid; + if (veb->uplink_seid == pf->mac_seid) + vsi->veb_idx = I40E_NO_VEB; + else + vsi->veb_idx = veb->veb_idx; + } else { + /* floating VEB */ + vsi->uplink_seid = pf->vsi[pf->lan_vsi]->uplink_seid; + vsi->veb_idx = pf->vsi[pf->lan_vsi]->veb_idx; + } + + i40e_aq_delete_element(&pf->hw, veb->seid, NULL); + i40e_veb_clear(veb); +} + +/** + * i40e_add_veb - create the VEB in the switch + * @veb: the VEB to be instantiated + * @vsi: the controlling VSI + **/ +static int i40e_add_veb(struct i40e_veb *veb, struct i40e_vsi *vsi) +{ + struct i40e_pf *pf = veb->pf; + bool enable_stats = !!(pf->flags & I40E_FLAG_VEB_STATS_ENABLED); + int ret; + + ret = i40e_aq_add_veb(&pf->hw, veb->uplink_seid, vsi->seid, + veb->enabled_tc, false, + &veb->seid, enable_stats, NULL); + + /* get a VEB from the hardware */ + if (ret) { + dev_info(&pf->pdev->dev, + "couldn't add VEB, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + return -EPERM; + } + + /* get statistics counter */ + ret = i40e_aq_get_veb_parameters(&pf->hw, veb->seid, NULL, NULL, + &veb->stats_idx, NULL, NULL, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "couldn't get VEB statistics idx, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + return -EPERM; + } + ret = i40e_veb_get_bw_info(veb); + if (ret) { + dev_info(&pf->pdev->dev, + "couldn't get VEB bw info, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + i40e_aq_delete_element(&pf->hw, veb->seid, NULL); + return -ENOENT; + } + + vsi->uplink_seid = veb->seid; + vsi->veb_idx = veb->idx; + vsi->flags |= I40E_VSI_FLAG_VEB_OWNER; + + return 0; +} + +/** + * i40e_veb_setup - Set up a VEB + * @pf: board private structure + * @flags: VEB setup flags + * @uplink_seid: the switch element to link to + * @vsi_seid: the initial VSI seid + * @enabled_tc: Enabled TC bit-map + * + * This allocates the sw VEB structure and links it into the switch + * It is possible and legal for this to be a duplicate of an already + * existing VEB. It is also possible for both uplink and vsi seids + * to be zero, in order to create a floating VEB. + * + * Returns pointer to the successfully allocated VEB sw struct on + * success, otherwise returns NULL on failure. 
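/*
 * Editor's note -- illustrative sketch, not part of this patch. As the
 * kernel-doc above says, VEB setup accepts either two non-zero SEIDs (a
 * relay attached between an uplink and a VSI) or two zero SEIDs (a
 * "floating" VEB); mixing a zero and a non-zero SEID is rejected up front.
 * The helper below (invented name: veb_seids_valid) states that rule
 * directly; for example, veb_seids_valid(0, 5) is false.
 */
#include <stdbool.h>
#include <stdint.h>

static bool veb_seids_valid(uint16_t uplink_seid, uint16_t vsi_seid)
{
	/* both zero (floating VEB) or both non-zero (attached VEB) */
	return (uplink_seid == 0) == (vsi_seid == 0);
}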
+ **/ +struct i40e_veb *i40e_veb_setup(struct i40e_pf *pf, u16 flags, + u16 uplink_seid, u16 vsi_seid, + u8 enabled_tc) +{ + struct i40e_veb *veb, *uplink_veb = NULL; + int vsi_idx, veb_idx; + int ret; + + /* if one seid is 0, the other must be 0 to create a floating relay */ + if ((uplink_seid == 0 || vsi_seid == 0) && + (uplink_seid + vsi_seid != 0)) { + dev_info(&pf->pdev->dev, + "one, not both seid's are 0: uplink=%d vsi=%d\n", + uplink_seid, vsi_seid); + return NULL; + } + + /* make sure there is such a vsi and uplink */ + for (vsi_idx = 0; vsi_idx < pf->num_alloc_vsi; vsi_idx++) + if (pf->vsi[vsi_idx] && pf->vsi[vsi_idx]->seid == vsi_seid) + break; + if (vsi_idx >= pf->num_alloc_vsi && vsi_seid != 0) { + dev_info(&pf->pdev->dev, "vsi seid %d not found\n", + vsi_seid); + return NULL; + } + + if (uplink_seid && uplink_seid != pf->mac_seid) { + for (veb_idx = 0; veb_idx < I40E_MAX_VEB; veb_idx++) { + if (pf->veb[veb_idx] && + pf->veb[veb_idx]->seid == uplink_seid) { + uplink_veb = pf->veb[veb_idx]; + break; + } + } + if (!uplink_veb) { + dev_info(&pf->pdev->dev, + "uplink seid %d not found\n", uplink_seid); + return NULL; + } + } + + /* get veb sw struct */ + veb_idx = i40e_veb_mem_alloc(pf); + if (veb_idx < 0) + goto err_alloc; + veb = pf->veb[veb_idx]; + veb->flags = flags; + veb->uplink_seid = uplink_seid; + veb->veb_idx = (uplink_veb ? uplink_veb->idx : I40E_NO_VEB); + veb->enabled_tc = (enabled_tc ? enabled_tc : 0x1); + + /* create the VEB in the switch */ + ret = i40e_add_veb(veb, pf->vsi[vsi_idx]); + if (ret) + goto err_veb; + if (vsi_idx == pf->lan_vsi) + pf->lan_veb = veb->idx; + + return veb; + +err_veb: + i40e_veb_clear(veb); +err_alloc: + return NULL; +} + +/** + * i40e_setup_pf_switch_element - set PF vars based on switch type + * @pf: board private structure + * @ele: element we are building info from + * @num_reported: total number of elements + * @printconfig: should we print the contents + * + * helper function to assist in extracting a few useful SEID values. + **/ +static void i40e_setup_pf_switch_element(struct i40e_pf *pf, + struct i40e_aqc_switch_config_element_resp *ele, + u16 num_reported, bool printconfig) +{ + u16 downlink_seid = le16_to_cpu(ele->downlink_seid); + u16 uplink_seid = le16_to_cpu(ele->uplink_seid); + u8 element_type = ele->element_type; + u16 seid = le16_to_cpu(ele->seid); + + if (printconfig) + dev_info(&pf->pdev->dev, + "type=%d seid=%d uplink=%d downlink=%d\n", + element_type, seid, uplink_seid, downlink_seid); + + switch (element_type) { + case I40E_SWITCH_ELEMENT_TYPE_MAC: + pf->mac_seid = seid; + break; + case I40E_SWITCH_ELEMENT_TYPE_VEB: + /* Main VEB? 
*/ + if (uplink_seid != pf->mac_seid) + break; + if (pf->lan_veb == I40E_NO_VEB) { + int v; + + /* find existing or else empty VEB */ + for (v = 0; v < I40E_MAX_VEB; v++) { + if (pf->veb[v] && (pf->veb[v]->seid == seid)) { + pf->lan_veb = v; + break; + } + } + if (pf->lan_veb == I40E_NO_VEB) { + v = i40e_veb_mem_alloc(pf); + if (v < 0) + break; + pf->lan_veb = v; + } + } + + pf->veb[pf->lan_veb]->seid = seid; + pf->veb[pf->lan_veb]->uplink_seid = pf->mac_seid; + pf->veb[pf->lan_veb]->pf = pf; + pf->veb[pf->lan_veb]->veb_idx = I40E_NO_VEB; + break; + case I40E_SWITCH_ELEMENT_TYPE_VSI: + if (num_reported != 1) + break; + /* This is immediately after a reset so we can assume this is + * the PF's VSI + */ + pf->mac_seid = uplink_seid; + pf->pf_seid = downlink_seid; + pf->main_vsi_seid = seid; + if (printconfig) + dev_info(&pf->pdev->dev, + "pf_seid=%d main_vsi_seid=%d\n", + pf->pf_seid, pf->main_vsi_seid); + break; + case I40E_SWITCH_ELEMENT_TYPE_PF: + case I40E_SWITCH_ELEMENT_TYPE_VF: + case I40E_SWITCH_ELEMENT_TYPE_EMP: + case I40E_SWITCH_ELEMENT_TYPE_BMC: + case I40E_SWITCH_ELEMENT_TYPE_PE: + case I40E_SWITCH_ELEMENT_TYPE_PA: + /* ignore these for now */ + break; + default: + dev_info(&pf->pdev->dev, "unknown element type=%d seid=%d\n", + element_type, seid); + break; + } +} + +/** + * i40e_fetch_switch_configuration - Get switch config from firmware + * @pf: board private structure + * @printconfig: should we print the contents + * + * Get the current switch configuration from the device and + * extract a few useful SEID values. + **/ +int i40e_fetch_switch_configuration(struct i40e_pf *pf, bool printconfig) +{ + struct i40e_aqc_get_switch_config_resp *sw_config; + u16 next_seid = 0; + int ret = 0; + u8 *aq_buf; + int i; + + aq_buf = kzalloc(I40E_AQ_LARGE_BUF, GFP_KERNEL); + if (!aq_buf) + return -ENOMEM; + + sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf; + do { + u16 num_reported, num_total; + + ret = i40e_aq_get_switch_config(&pf->hw, sw_config, + I40E_AQ_LARGE_BUF, + &next_seid, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "get switch config failed err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + kfree(aq_buf); + return -ENOENT; + } + + num_reported = le16_to_cpu(sw_config->header.num_reported); + num_total = le16_to_cpu(sw_config->header.num_total); + + if (printconfig) + dev_info(&pf->pdev->dev, + "header: %d reported %d total\n", + num_reported, num_total); + + for (i = 0; i < num_reported; i++) { + struct i40e_aqc_switch_config_element_resp *ele = + &sw_config->element[i]; + + i40e_setup_pf_switch_element(pf, ele, num_reported, + printconfig); + } + } while (next_seid != 0); + + kfree(aq_buf); + return ret; +} + +/** + * i40e_setup_pf_switch - Setup the HW switch on startup or after reset + * @pf: board private structure + * @reinit: if the Main VSI needs to re-initialized. + * + * Returns 0 on success, negative value on failure + **/ +static int i40e_setup_pf_switch(struct i40e_pf *pf, bool reinit) +{ + u16 flags = 0; + int ret; + + /* find out what's out there already */ + ret = i40e_fetch_switch_configuration(pf, false); + if (ret) { + dev_info(&pf->pdev->dev, + "couldn't fetch switch config, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + return ret; + } + i40e_pf_reset_stats(pf); + + /* set the switch config bit for the whole device to + * support limited promisc or true promisc + * when user requests promisc. 
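/*
 * Editor's note -- illustrative sketch, not part of this patch. The switch
 * configuration fetch above reads firmware data in pages: each admin-queue
 * call hands back a "next seid" cursor, and the loop repeats until that
 * cursor comes back as zero. The model below uses an invented fetch_page()
 * callback and placeholder types (struct page_result, handle_element) that
 * are not driver symbols.
 */
#include <stddef.h>
#include <stdint.h>

struct page_result {
	uint16_t next_cursor;	/* 0 => no more pages */
	size_t count;		/* elements reported in this page */
};

typedef int (*fetch_page_t)(uint16_t cursor, struct page_result *out);
typedef void (*handle_element_t)(size_t idx);

static int fetch_all(fetch_page_t fetch_page, handle_element_t handle_element)
{
	uint16_t cursor = 0;

	do {
		struct page_result res;
		int err = fetch_page(cursor, &res);

		if (err)
			return err;		/* propagate AQ-style errors */
		for (size_t i = 0; i < res.count; i++)
			handle_element(i);	/* parse one reported element */
		cursor = res.next_cursor;
	} while (cursor != 0);

	return 0;
}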
The default is limited + * promisc. + */ + + if ((pf->hw.pf_id == 0) && + !(pf->flags & I40E_FLAG_TRUE_PROMISC_SUPPORT)) { + flags = I40E_AQ_SET_SWITCH_CFG_PROMISC; + pf->last_sw_conf_flags = flags; + } + + if (pf->hw.pf_id == 0) { + u16 valid_flags; + + valid_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC; + ret = i40e_aq_set_switch_config(&pf->hw, flags, valid_flags, 0, + NULL); + if (ret && pf->hw.aq.asq_last_status != I40E_AQ_RC_ESRCH) { + dev_info(&pf->pdev->dev, + "couldn't set switch config bits, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + /* not a fatal problem, just keep going */ + } + pf->last_sw_conf_valid_flags = valid_flags; + } + + /* first time setup */ + if (pf->lan_vsi == I40E_NO_VSI || reinit) { + struct i40e_vsi *vsi = NULL; + u16 uplink_seid; + + /* Set up the PF VSI associated with the PF's main VSI + * that is already in the HW switch + */ + if (pf->lan_veb != I40E_NO_VEB && pf->veb[pf->lan_veb]) + uplink_seid = pf->veb[pf->lan_veb]->seid; + else + uplink_seid = pf->mac_seid; + if (pf->lan_vsi == I40E_NO_VSI) + vsi = i40e_vsi_setup(pf, I40E_VSI_MAIN, uplink_seid, 0); + else if (reinit) + vsi = i40e_vsi_reinit_setup(pf->vsi[pf->lan_vsi]); + if (!vsi) { + dev_info(&pf->pdev->dev, "setup of MAIN VSI failed\n"); + i40e_cloud_filter_exit(pf); + i40e_fdir_teardown(pf); + return -EAGAIN; + } + } else { + /* force a reset of TC and queue layout configurations */ + u8 enabled_tc = pf->vsi[pf->lan_vsi]->tc_config.enabled_tc; + + pf->vsi[pf->lan_vsi]->tc_config.enabled_tc = 0; + pf->vsi[pf->lan_vsi]->seid = pf->main_vsi_seid; + i40e_vsi_config_tc(pf->vsi[pf->lan_vsi], enabled_tc); + } + i40e_vlan_stripping_disable(pf->vsi[pf->lan_vsi]); + + i40e_fdir_sb_setup(pf); + + /* Setup static PF queue filter control settings */ + ret = i40e_setup_pf_filter_control(pf); + if (ret) { + dev_info(&pf->pdev->dev, "setup_pf_filter_control failed: %d\n", + ret); + /* Failure here should not stop continuing other steps */ + } + + /* enable RSS in the HW, even for only one queue, as the stack can use + * the hash + */ + if ((pf->flags & I40E_FLAG_RSS_ENABLED)) + i40e_pf_config_rss(pf); + + /* fill in link information and enable LSE reporting */ + i40e_link_event(pf); + + /* Initialize user-specific link properties */ + pf->fc_autoneg_status = ((pf->hw.phy.link_info.an_info & + I40E_AQ_AN_COMPLETED) ? true : false); + + i40e_ptp_init(pf); + + /* repopulate tunnel port filters */ + i40e_sync_udp_filters(pf); + + return ret; +} + +/** + * i40e_determine_queue_usage - Work out queue distribution + * @pf: board private structure + **/ +static void i40e_determine_queue_usage(struct i40e_pf *pf) +{ + int queues_left; + int q_max; + + pf->num_lan_qps = 0; + + /* Find the max queues to be put into basic use. We'll always be + * using TC0, whether or not DCB is running, and TC0 will get the + * big RSS set. 
+ */ + queues_left = pf->hw.func_caps.num_tx_qp; + + if ((queues_left == 1) || + !(pf->flags & I40E_FLAG_MSIX_ENABLED)) { + /* one qp for PF, no queues for anything else */ + queues_left = 0; + pf->alloc_rss_size = pf->num_lan_qps = 1; + + /* make sure all the fancies are disabled */ + pf->flags &= ~(I40E_FLAG_RSS_ENABLED | + I40E_FLAG_IWARP_ENABLED | + I40E_FLAG_FD_SB_ENABLED | + I40E_FLAG_FD_ATR_ENABLED | + I40E_FLAG_DCB_CAPABLE | + I40E_FLAG_DCB_ENABLED | + I40E_FLAG_SRIOV_ENABLED | + I40E_FLAG_VMDQ_ENABLED); + pf->flags |= I40E_FLAG_FD_SB_INACTIVE; + } else if (!(pf->flags & (I40E_FLAG_RSS_ENABLED | + I40E_FLAG_FD_SB_ENABLED | + I40E_FLAG_FD_ATR_ENABLED | + I40E_FLAG_DCB_CAPABLE))) { + /* one qp for PF */ + pf->alloc_rss_size = pf->num_lan_qps = 1; + queues_left -= pf->num_lan_qps; + + pf->flags &= ~(I40E_FLAG_RSS_ENABLED | + I40E_FLAG_IWARP_ENABLED | + I40E_FLAG_FD_SB_ENABLED | + I40E_FLAG_FD_ATR_ENABLED | + I40E_FLAG_DCB_ENABLED | + I40E_FLAG_VMDQ_ENABLED); + pf->flags |= I40E_FLAG_FD_SB_INACTIVE; + } else { + /* Not enough queues for all TCs */ + if ((pf->flags & I40E_FLAG_DCB_CAPABLE) && + (queues_left < I40E_MAX_TRAFFIC_CLASS)) { + pf->flags &= ~(I40E_FLAG_DCB_CAPABLE | + I40E_FLAG_DCB_ENABLED); + dev_info(&pf->pdev->dev, "not enough queues for DCB. DCB is disabled.\n"); + } + + /* limit lan qps to the smaller of qps, cpus or msix */ + q_max = max_t(int, pf->rss_size_max, num_online_cpus()); + q_max = min_t(int, q_max, pf->hw.func_caps.num_tx_qp); + q_max = min_t(int, q_max, pf->hw.func_caps.num_msix_vectors); + pf->num_lan_qps = q_max; + + queues_left -= pf->num_lan_qps; + } + + if (pf->flags & I40E_FLAG_FD_SB_ENABLED) { + if (queues_left > 1) { + queues_left -= 1; /* save 1 queue for FD */ + } else { + pf->flags &= ~I40E_FLAG_FD_SB_ENABLED; + pf->flags |= I40E_FLAG_FD_SB_INACTIVE; + dev_info(&pf->pdev->dev, "not enough queues for Flow Director. Flow Director feature is disabled\n"); + } + } + + if ((pf->flags & I40E_FLAG_SRIOV_ENABLED) && + pf->num_vf_qps && pf->num_req_vfs && queues_left) { + pf->num_req_vfs = min_t(int, pf->num_req_vfs, + (queues_left / pf->num_vf_qps)); + queues_left -= (pf->num_req_vfs * pf->num_vf_qps); + } + + if ((pf->flags & I40E_FLAG_VMDQ_ENABLED) && + pf->num_vmdq_vsis && pf->num_vmdq_qps && queues_left) { + pf->num_vmdq_vsis = min_t(int, pf->num_vmdq_vsis, + (queues_left / pf->num_vmdq_qps)); + queues_left -= (pf->num_vmdq_vsis * pf->num_vmdq_qps); + } + + pf->queues_left = queues_left; + dev_dbg(&pf->pdev->dev, + "qs_avail=%d FD SB=%d lan_qs=%d lan_tc0=%d vf=%d*%d vmdq=%d*%d, remaining=%d\n", + pf->hw.func_caps.num_tx_qp, + !!(pf->flags & I40E_FLAG_FD_SB_ENABLED), + pf->num_lan_qps, pf->alloc_rss_size, pf->num_req_vfs, + pf->num_vf_qps, pf->num_vmdq_vsis, pf->num_vmdq_qps, + queues_left); +} + +/** + * i40e_setup_pf_filter_control - Setup PF static filter control + * @pf: PF to be setup + * + * i40e_setup_pf_filter_control sets up a PF's initial filter control + * settings. If PE/FCoE are enabled then it will also set the per PF + * based filter sizes required for them. It also enables Flow director, + * ethertype and macvlan type filter settings for the pf. 
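/*
 * Editor's note -- illustrative sketch, not part of this patch. The queue
 * accounting above is a simple budget: start from the PF's Tx queue count,
 * bound the LAN share by RSS size, online CPUs and MSI-X vectors, optionally
 * hold one queue back for the Flow Director sideband, and let VFs/VMDq split
 * whatever remains. The standalone helper below (invented names throughout)
 * mirrors that arithmetic.
 */
static int imin(int a, int b) { return a < b ? a : b; }
static int imax(int a, int b) { return a > b ? a : b; }

struct q_budget {
	int lan_qps;
	int fd_qps;
	int leftover;
};

static struct q_budget split_queues(int total_qps, int rss_max, int cpus,
				    int msix, int want_fd)
{
	struct q_budget b = { 0 };
	int lan = imax(rss_max, cpus);	/* upper bound on useful LAN queues */

	lan = imin(lan, total_qps);
	lan = imin(lan, msix);		/* each queue pair needs a vector */
	b.lan_qps = lan;
	b.leftover = total_qps - lan;

	if (want_fd && b.leftover > 1) {	/* keep one queue for FD sideband */
		b.fd_qps = 1;
		b.leftover -= 1;
	}
	return b;
}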
+ * + * Returns 0 on success, negative on failure + **/ +static int i40e_setup_pf_filter_control(struct i40e_pf *pf) +{ + struct i40e_filter_control_settings *settings = &pf->filter_settings; + + settings->hash_lut_size = I40E_HASH_LUT_SIZE_128; + + /* Flow Director is enabled */ + if (pf->flags & (I40E_FLAG_FD_SB_ENABLED | I40E_FLAG_FD_ATR_ENABLED)) + settings->enable_fdir = true; + + /* Ethtype and MACVLAN filters enabled for PF */ + settings->enable_ethtype = true; + settings->enable_macvlan = true; + + if (i40e_set_filter_control(&pf->hw, settings)) + return -ENOENT; + + return 0; +} + +#define INFO_STRING_LEN 255 +#define REMAIN(__x) (INFO_STRING_LEN - (__x)) +static void i40e_print_features(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + char *buf; + int i; + + buf = kmalloc(INFO_STRING_LEN, GFP_KERNEL); + if (!buf) + return; + + i = snprintf(buf, INFO_STRING_LEN, "Features: PF-id[%d]", hw->pf_id); +#ifdef CONFIG_PCI_IOV + i += snprintf(&buf[i], REMAIN(i), " VFs: %d", pf->num_req_vfs); +#endif + i += snprintf(&buf[i], REMAIN(i), " VSIs: %d QP: %d", + pf->hw.func_caps.num_vsis, + pf->vsi[pf->lan_vsi]->num_queue_pairs); + if (pf->flags & I40E_FLAG_RSS_ENABLED) + i += snprintf(&buf[i], REMAIN(i), " RSS"); + if (pf->flags & I40E_FLAG_FD_ATR_ENABLED) + i += snprintf(&buf[i], REMAIN(i), " FD_ATR"); + if (pf->flags & I40E_FLAG_FD_SB_ENABLED) { + i += snprintf(&buf[i], REMAIN(i), " FD_SB"); + i += snprintf(&buf[i], REMAIN(i), " NTUPLE"); + } + if (pf->flags & I40E_FLAG_DCB_CAPABLE) + i += snprintf(&buf[i], REMAIN(i), " DCB"); + i += snprintf(&buf[i], REMAIN(i), " VxLAN"); + i += snprintf(&buf[i], REMAIN(i), " Geneve"); + if (pf->flags & I40E_FLAG_PTP) + i += snprintf(&buf[i], REMAIN(i), " PTP"); + if (pf->flags & I40E_FLAG_VEB_MODE_ENABLED) + i += snprintf(&buf[i], REMAIN(i), " VEB"); + else + i += snprintf(&buf[i], REMAIN(i), " VEPA"); + + dev_info(&pf->pdev->dev, "%s\n", buf); + kfree(buf); + WARN_ON(i > INFO_STRING_LEN); +} + +/** + * i40e_get_platform_mac_addr - get platform-specific MAC address + * @pdev: PCI device information struct + * @pf: board private structure + * + * Look up the MAC address for the device. First we'll try + * eth_platform_get_mac_address, which will check Open Firmware, or arch + * specific fallback. Otherwise, we'll default to the stored value in + * firmware. + **/ +static void i40e_get_platform_mac_addr(struct pci_dev *pdev, struct i40e_pf *pf) +{ + if (eth_platform_get_mac_address(&pdev->dev, pf->hw.mac.addr)) + i40e_get_mac_addr(&pf->hw, pf->hw.mac.addr); +} + +/** + * i40e_probe - Device initialization routine + * @pdev: PCI device information struct + * @ent: entry in i40e_pci_tbl + * + * i40e_probe initializes a PF identified by a pci_dev structure. + * The OS initialization, configuring of the PF private structure, + * and a hardware reset occur. 
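/*
 * Editor's note -- illustrative sketch, not part of this patch. The feature
 * banner above is assembled by repeatedly appending with snprintf() and a
 * shrinking "remaining space" argument; because snprintf() returns the length
 * it would have written, the running offset can pass the buffer size, which
 * is why the driver finishes with a WARN_ON() instead of assuming success.
 * The demo below (invented names: BANNER_LEN, banner_demo; REMAIN here is a
 * clamped variant, not the driver's macro) shows the same bounded-append
 * idiom and returns 1 when the banner was truncated.
 */
#include <stdio.h>

#define BANNER_LEN 64
#define REMAIN(off) ((off) < BANNER_LEN ? BANNER_LEN - (off) : 0)

static int banner_demo(void)
{
	char buf[BANNER_LEN];
	int i = 0;

	i += snprintf(&buf[i], REMAIN(i), "Features:");
	i += snprintf(&buf[i], REMAIN(i), " RSS");
	i += snprintf(&buf[i], REMAIN(i), " FD_ATR");

	/* i is the ideal length; truncation happened if it passed the buffer */
	return i >= BANNER_LEN;
}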
+ * + * Returns 0 on success, negative on failure + **/ +static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct i40e_aq_get_phy_abilities_resp abilities; + struct i40e_pf *pf; + struct i40e_hw *hw; + static u16 pfs_found; + u16 wol_nvm_bits; + u16 link_status; + int err; + u32 val; + u32 i; + u8 set_fc_aq_fail; + + err = pci_enable_device_mem(pdev); + if (err) + return err; + + /* set up for high or low dma */ + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (err) { + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, + "DMA configuration failed: 0x%x\n", err); + goto err_dma; + } + } + + /* set up pci connections */ + err = pci_request_mem_regions(pdev, i40e_driver_name); + if (err) { + dev_info(&pdev->dev, + "pci_request_selected_regions failed %d\n", err); + goto err_pci_reg; + } + + pci_enable_pcie_error_reporting(pdev); + pci_set_master(pdev); + + /* Now that we have a PCI connection, we need to do the + * low level device setup. This is primarily setting up + * the Admin Queue structures and then querying for the + * device's current profile information. + */ + pf = kzalloc(sizeof(*pf), GFP_KERNEL); + if (!pf) { + err = -ENOMEM; + goto err_pf_alloc; + } + pf->next_vsi = 0; + pf->pdev = pdev; + set_bit(__I40E_DOWN, pf->state); + + hw = &pf->hw; + hw->back = pf; + + pf->ioremap_len = min_t(int, pci_resource_len(pdev, 0), + I40E_MAX_CSR_SPACE); + + hw->hw_addr = ioremap(pci_resource_start(pdev, 0), pf->ioremap_len); + if (!hw->hw_addr) { + err = -EIO; + dev_info(&pdev->dev, "ioremap(0x%04x, 0x%04x) failed: 0x%x\n", + (unsigned int)pci_resource_start(pdev, 0), + pf->ioremap_len, err); + goto err_ioremap; + } + hw->vendor_id = pdev->vendor; + hw->device_id = pdev->device; + pci_read_config_byte(pdev, PCI_REVISION_ID, &hw->revision_id); + hw->subsystem_vendor_id = pdev->subsystem_vendor; + hw->subsystem_device_id = pdev->subsystem_device; + hw->bus.device = PCI_SLOT(pdev->devfn); + hw->bus.func = PCI_FUNC(pdev->devfn); + hw->bus.bus_id = pdev->bus->number; + pf->instance = pfs_found; + + /* Select something other than the 802.1ad ethertype for the + * switch to use internally and drop on ingress. 
+ */ + hw->switch_tag = 0xffff; + hw->first_tag = ETH_P_8021AD; + hw->second_tag = ETH_P_8021Q; + + INIT_LIST_HEAD(&pf->l3_flex_pit_list); + INIT_LIST_HEAD(&pf->l4_flex_pit_list); + + /* set up the locks for the AQ, do this only once in probe + * and destroy them only once in remove + */ + mutex_init(&hw->aq.asq_mutex); + mutex_init(&hw->aq.arq_mutex); + + pf->msg_enable = netif_msg_init(debug, + NETIF_MSG_DRV | + NETIF_MSG_PROBE | + NETIF_MSG_LINK); + if (debug < -1) + pf->hw.debug_mask = debug; + + /* do a special CORER for clearing PXE mode once at init */ + if (hw->revision_id == 0 && + (rd32(hw, I40E_GLLAN_RCTL_0) & I40E_GLLAN_RCTL_0_PXE_MODE_MASK)) { + wr32(hw, I40E_GLGEN_RTRIG, I40E_GLGEN_RTRIG_CORER_MASK); + i40e_flush(hw); + msleep(200); + pf->corer_count++; + + i40e_clear_pxe_mode(hw); + } + + /* Reset here to make sure all is clean and to define PF 'n' */ + i40e_clear_hw(hw); + err = i40e_pf_reset(hw); + if (err) { + dev_info(&pdev->dev, "Initial pf_reset failed: %d\n", err); + goto err_pf_reset; + } + pf->pfr_count++; + + hw->aq.num_arq_entries = I40E_AQ_LEN; + hw->aq.num_asq_entries = I40E_AQ_LEN; + hw->aq.arq_buf_size = I40E_MAX_AQ_BUF_SIZE; + hw->aq.asq_buf_size = I40E_MAX_AQ_BUF_SIZE; + pf->adminq_work_limit = I40E_AQ_WORK_LIMIT; + + snprintf(pf->int_name, sizeof(pf->int_name) - 1, + "%s-%s:misc", + dev_driver_string(&pf->pdev->dev), dev_name(&pdev->dev)); + + err = i40e_init_shared_code(hw); + if (err) { + dev_warn(&pdev->dev, "unidentified MAC or BLANK NVM: %d\n", + err); + goto err_pf_reset; + } + + /* set up a default setting for link flow control */ + pf->hw.fc.requested_mode = I40E_FC_NONE; + + err = i40e_init_adminq(hw); + if (err) { + if (err == I40E_ERR_FIRMWARE_API_VERSION) + dev_info(&pdev->dev, + "The driver for the device stopped because the NVM image is newer than expected. You must install the most recent version of the network driver.\n"); + else + dev_info(&pdev->dev, + "The driver for the device stopped because the device firmware failed to init. Try updating your NVM image.\n"); + + goto err_pf_reset; + } + i40e_get_oem_version(hw); + + /* provide nvm, fw, api versions */ + dev_info(&pdev->dev, "fw %d.%d.%05d api %d.%d nvm %s\n", + hw->aq.fw_maj_ver, hw->aq.fw_min_ver, hw->aq.fw_build, + hw->aq.api_maj_ver, hw->aq.api_min_ver, + i40e_nvm_version_str(hw)); + + if (hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR && + hw->aq.api_min_ver > I40E_FW_MINOR_VERSION(hw)) + dev_info(&pdev->dev, + "The driver for the device detected a newer version of the NVM image than expected. Please install the most recent version of the network driver.\n"); + else if (hw->aq.api_maj_ver == 1 && hw->aq.api_min_ver < 4) + dev_info(&pdev->dev, + "The driver for the device detected an older version of the NVM image than expected. Please update the NVM image.\n"); + + i40e_verify_eeprom(pf); + + /* Rev 0 hardware was never productized */ + if (hw->revision_id < 1) + dev_warn(&pdev->dev, "This device is a pre-production adapter/LOM. Please be aware there may be issues with your hardware. 
If you are experiencing problems please contact your Intel or hardware representative who provided you with this hardware.\n"); + + i40e_clear_pxe_mode(hw); + err = i40e_get_capabilities(pf, i40e_aqc_opc_list_func_capabilities); + if (err) + goto err_adminq_setup; + + err = i40e_sw_init(pf); + if (err) { + dev_info(&pdev->dev, "sw_init failed: %d\n", err); + goto err_sw_init; + } + + err = i40e_init_lan_hmc(hw, hw->func_caps.num_tx_qp, + hw->func_caps.num_rx_qp, 0, 0); + if (err) { + dev_info(&pdev->dev, "init_lan_hmc failed: %d\n", err); + goto err_init_lan_hmc; + } + + err = i40e_configure_lan_hmc(hw, I40E_HMC_MODEL_DIRECT_ONLY); + if (err) { + dev_info(&pdev->dev, "configure_lan_hmc failed: %d\n", err); + err = -ENOENT; + goto err_configure_lan_hmc; + } + + /* Disable LLDP for NICs that have firmware versions lower than v4.3. + * Ignore error return codes because if it was already disabled via + * hardware settings this will fail + */ + if (pf->hw_features & I40E_HW_STOP_FW_LLDP) { + dev_info(&pdev->dev, "Stopping firmware LLDP agent.\n"); + i40e_aq_stop_lldp(hw, true, NULL); + } + + /* allow a platform config to override the HW addr */ + i40e_get_platform_mac_addr(pdev, pf); + + if (!is_valid_ether_addr(hw->mac.addr)) { + dev_info(&pdev->dev, "invalid MAC address %pM\n", hw->mac.addr); + err = -EIO; + goto err_mac_addr; + } + dev_info(&pdev->dev, "MAC address: %pM\n", hw->mac.addr); + ether_addr_copy(hw->mac.perm_addr, hw->mac.addr); + i40e_get_port_mac_addr(hw, hw->mac.port_addr); + if (is_valid_ether_addr(hw->mac.port_addr)) + pf->hw_features |= I40E_HW_PORT_ID_VALID; + + pci_set_drvdata(pdev, pf); + pci_save_state(pdev); + + /* Enable FW to write default DCB config on link-up */ + i40e_aq_set_dcb_parameters(hw, true, NULL); + +#ifdef CONFIG_I40E_DCB + err = i40e_init_pf_dcb(pf); + if (err) { + dev_info(&pdev->dev, "DCB init failed %d, disabled\n", err); + pf->flags &= ~(I40E_FLAG_DCB_CAPABLE | I40E_FLAG_DCB_ENABLED); + /* Continue without DCB enabled */ + } +#endif /* CONFIG_I40E_DCB */ + + /* set up periodic task facility */ + timer_setup(&pf->service_timer, i40e_service_timer, 0); + pf->service_timer_period = HZ; + + INIT_WORK(&pf->service_task, i40e_service_task); + clear_bit(__I40E_SERVICE_SCHED, pf->state); + + /* NVM bit on means WoL disabled for the port */ + i40e_read_nvm_word(hw, I40E_SR_NVM_WAKE_ON_LAN, &wol_nvm_bits); + if (BIT (hw->port) & wol_nvm_bits || hw->partition_id != 1) + pf->wol_en = false; + else + pf->wol_en = true; + device_set_wakeup_enable(&pf->pdev->dev, pf->wol_en); + + /* set up the main switch operations */ + i40e_determine_queue_usage(pf); + err = i40e_init_interrupt_scheme(pf); + if (err) + goto err_switch_setup; + + /* The number of VSIs reported by the FW is the minimum guaranteed + * to us; HW supports far more and we share the remaining pool with + * the other PFs. We allocate space for more than the guarantee with + * the understanding that we might not get them all later. + */ + if (pf->hw.func_caps.num_vsis < I40E_MIN_VSI_ALLOC) + pf->num_alloc_vsi = I40E_MIN_VSI_ALLOC; + else + pf->num_alloc_vsi = pf->hw.func_caps.num_vsis; + + /* Set up the *vsi struct and our local tracking of the MAIN PF vsi. 
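+ * pf->vsi is an array of pf->num_alloc_vsi VSI pointers, sized from the
+ * firmware's guaranteed VSI count but never below I40E_MIN_VSI_ALLOC;
+ * pf->lan_vsi is the index of the main LAN VSI within it.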
*/ + pf->vsi = kcalloc(pf->num_alloc_vsi, sizeof(struct i40e_vsi *), + GFP_KERNEL); + if (!pf->vsi) { + err = -ENOMEM; + goto err_switch_setup; + } + +#ifdef CONFIG_PCI_IOV + /* prep for VF support */ + if ((pf->flags & I40E_FLAG_SRIOV_ENABLED) && + (pf->flags & I40E_FLAG_MSIX_ENABLED) && + !test_bit(__I40E_BAD_EEPROM, pf->state)) { + if (pci_num_vf(pdev)) + pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; + } +#endif + err = i40e_setup_pf_switch(pf, false); + if (err) { + dev_info(&pdev->dev, "setup_pf_switch failed: %d\n", err); + goto err_vsis; + } + INIT_LIST_HEAD(&pf->vsi[pf->lan_vsi]->ch_list); + + /* Make sure flow control is set according to current settings */ + err = i40e_set_fc(hw, &set_fc_aq_fail, true); + if (set_fc_aq_fail & I40E_SET_FC_AQ_FAIL_GET) + dev_dbg(&pf->pdev->dev, + "Set fc with err %s aq_err %s on get_phy_cap\n", + i40e_stat_str(hw, err), + i40e_aq_str(hw, hw->aq.asq_last_status)); + if (set_fc_aq_fail & I40E_SET_FC_AQ_FAIL_SET) + dev_dbg(&pf->pdev->dev, + "Set fc with err %s aq_err %s on set_phy_config\n", + i40e_stat_str(hw, err), + i40e_aq_str(hw, hw->aq.asq_last_status)); + if (set_fc_aq_fail & I40E_SET_FC_AQ_FAIL_UPDATE) + dev_dbg(&pf->pdev->dev, + "Set fc with err %s aq_err %s on get_link_info\n", + i40e_stat_str(hw, err), + i40e_aq_str(hw, hw->aq.asq_last_status)); + + /* if FDIR VSI was set up, start it now */ + for (i = 0; i < pf->num_alloc_vsi; i++) { + if (pf->vsi[i] && pf->vsi[i]->type == I40E_VSI_FDIR) { + i40e_vsi_open(pf->vsi[i]); + break; + } + } + + /* The driver only wants link up/down and module qualification + * reports from firmware. Note the negative logic. + */ + err = i40e_aq_set_phy_int_mask(&pf->hw, + ~(I40E_AQ_EVENT_LINK_UPDOWN | + I40E_AQ_EVENT_MEDIA_NA | + I40E_AQ_EVENT_MODULE_QUAL_FAIL), NULL); + if (err) + dev_info(&pf->pdev->dev, "set phy mask fail, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, err), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + + /* Reconfigure hardware for allowing smaller MSS in the case + * of TSO, so that we avoid the MDD being fired and causing + * a reset in the case of small MSS+TSO. + */ + val = rd32(hw, I40E_REG_MSS); + if ((val & I40E_REG_MSS_MIN_MASK) > I40E_64BYTE_MSS) { + val &= ~I40E_REG_MSS_MIN_MASK; + val |= I40E_64BYTE_MSS; + wr32(hw, I40E_REG_MSS, val); + } + + if (pf->hw_features & I40E_HW_RESTART_AUTONEG) { + msleep(75); + err = i40e_aq_set_link_restart_an(&pf->hw, true, NULL); + if (err) + dev_info(&pf->pdev->dev, "link restart failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, err), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + } + /* The main driver is (mostly) up and happy. We need to set this state + * before setting up the misc vector or we get a race and the vector + * ends up disabled forever. + */ + clear_bit(__I40E_DOWN, pf->state); + + /* In case of MSIX we are going to setup the misc vector right here + * to handle admin queue events etc. In case of legacy and MSI + * the misc functionality and queue processing is combined in + * the same vector and that gets setup at open. 
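+ * (So i40e_setup_misc_vector() below only runs when
+ * I40E_FLAG_MSIX_ENABLED is set; with legacy or MSI interrupts the
+ * shared vector is requested later, when the interface is opened.)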
+ */ + if (pf->flags & I40E_FLAG_MSIX_ENABLED) { + err = i40e_setup_misc_vector(pf); + if (err) { + dev_info(&pdev->dev, + "setup of misc vector failed: %d\n", err); + goto err_vsis; + } + } + +#ifdef CONFIG_PCI_IOV + /* prep for VF support */ + if ((pf->flags & I40E_FLAG_SRIOV_ENABLED) && + (pf->flags & I40E_FLAG_MSIX_ENABLED) && + !test_bit(__I40E_BAD_EEPROM, pf->state)) { + /* disable link interrupts for VFs */ + val = rd32(hw, I40E_PFGEN_PORTMDIO_NUM); + val &= ~I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_MASK; + wr32(hw, I40E_PFGEN_PORTMDIO_NUM, val); + i40e_flush(hw); + + if (pci_num_vf(pdev)) { + dev_info(&pdev->dev, + "Active VFs found, allocating resources.\n"); + err = i40e_alloc_vfs(pf, pci_num_vf(pdev)); + if (err) + dev_info(&pdev->dev, + "Error %d allocating resources for existing VFs\n", + err); + } + } +#endif /* CONFIG_PCI_IOV */ + + if (pf->flags & I40E_FLAG_IWARP_ENABLED) { + pf->iwarp_base_vector = i40e_get_lump(pf, pf->irq_pile, + pf->num_iwarp_msix, + I40E_IWARP_IRQ_PILE_ID); + if (pf->iwarp_base_vector < 0) { + dev_info(&pdev->dev, + "failed to get tracking for %d vectors for IWARP err=%d\n", + pf->num_iwarp_msix, pf->iwarp_base_vector); + pf->flags &= ~I40E_FLAG_IWARP_ENABLED; + } + } + + i40e_dbg_pf_init(pf); + + /* tell the firmware that we're starting */ + i40e_send_version(pf); + + /* since everything's happy, start the service_task timer */ + mod_timer(&pf->service_timer, + round_jiffies(jiffies + pf->service_timer_period)); + + /* add this PF to client device list and launch a client service task */ + if (pf->flags & I40E_FLAG_IWARP_ENABLED) { + err = i40e_lan_add_device(pf); + if (err) + dev_info(&pdev->dev, "Failed to add PF to client API service list: %d\n", + err); + } + +#define PCI_SPEED_SIZE 8 +#define PCI_WIDTH_SIZE 8 + /* Devices on the IOSF bus do not have this information + * and will report PCI Gen 1 x 1 by default so don't bother + * checking them. 
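+ * For everything else the code reads PCI_EXP_LNKSTA, logs the negotiated
+ * speed and width, and warns when the link is narrower than x8 or slower
+ * than 8.0 GT/s, since a constrained slot can limit throughput.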
+ */ + if (!(pf->hw_features & I40E_HW_NO_PCI_LINK_CHECK)) { + char speed[PCI_SPEED_SIZE] = "Unknown"; + char width[PCI_WIDTH_SIZE] = "Unknown"; + + /* Get the negotiated link width and speed from PCI config + * space + */ + pcie_capability_read_word(pf->pdev, PCI_EXP_LNKSTA, + &link_status); + + i40e_set_pci_config_data(hw, link_status); + + switch (hw->bus.speed) { + case i40e_bus_speed_8000: + strncpy(speed, "8.0", PCI_SPEED_SIZE); break; + case i40e_bus_speed_5000: + strncpy(speed, "5.0", PCI_SPEED_SIZE); break; + case i40e_bus_speed_2500: + strncpy(speed, "2.5", PCI_SPEED_SIZE); break; + default: + break; + } + switch (hw->bus.width) { + case i40e_bus_width_pcie_x8: + strncpy(width, "8", PCI_WIDTH_SIZE); break; + case i40e_bus_width_pcie_x4: + strncpy(width, "4", PCI_WIDTH_SIZE); break; + case i40e_bus_width_pcie_x2: + strncpy(width, "2", PCI_WIDTH_SIZE); break; + case i40e_bus_width_pcie_x1: + strncpy(width, "1", PCI_WIDTH_SIZE); break; + default: + break; + } + + dev_info(&pdev->dev, "PCI-Express: Speed %sGT/s Width x%s\n", + speed, width); + + if (hw->bus.width < i40e_bus_width_pcie_x8 || + hw->bus.speed < i40e_bus_speed_8000) { + dev_warn(&pdev->dev, "PCI-Express bandwidth available for this device may be insufficient for optimal performance.\n"); + dev_warn(&pdev->dev, "Please move the device to a different PCI-e link with more lanes and/or higher transfer rate.\n"); + } + } + + /* get the requested speeds from the fw */ + err = i40e_aq_get_phy_capabilities(hw, false, false, &abilities, NULL); + if (err) + dev_dbg(&pf->pdev->dev, "get requested speeds ret = %s last_status = %s\n", + i40e_stat_str(&pf->hw, err), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + pf->hw.phy.link_info.requested_speeds = abilities.link_speed; + + /* get the supported phy types from the fw */ + err = i40e_aq_get_phy_capabilities(hw, false, true, &abilities, NULL); + if (err) + dev_dbg(&pf->pdev->dev, "get supported phy types ret = %s last_status = %s\n", + i40e_stat_str(&pf->hw, err), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + + /* Add a filter to drop all Flow control frames from any VSI from being + * transmitted. By doing so we stop a malicious VF from sending out + * PAUSE or PFC frames and potentially controlling traffic for other + * PF/VF VSIs. + * The FW can still send Flow control frames if enabled. 
+ */ + i40e_add_filter_to_drop_tx_flow_control_frames(&pf->hw, + pf->main_vsi_seid); + + if ((pf->hw.device_id == I40E_DEV_ID_10G_BASE_T) || + (pf->hw.device_id == I40E_DEV_ID_10G_BASE_T4)) + pf->hw_features |= I40E_HW_PHY_CONTROLS_LEDS; + if (pf->hw.device_id == I40E_DEV_ID_SFP_I_X722) + pf->hw_features |= I40E_HW_HAVE_CRT_RETIMER; + /* print a string summarizing features */ + i40e_print_features(pf); + + return 0; + + /* Unwind what we've done if something failed in the setup */ +err_vsis: + set_bit(__I40E_DOWN, pf->state); + i40e_clear_interrupt_scheme(pf); + kfree(pf->vsi); +err_switch_setup: + i40e_reset_interrupt_capability(pf); + del_timer_sync(&pf->service_timer); +err_mac_addr: +err_configure_lan_hmc: + (void)i40e_shutdown_lan_hmc(hw); +err_init_lan_hmc: + kfree(pf->qp_pile); +err_sw_init: +err_adminq_setup: +err_pf_reset: + iounmap(hw->hw_addr); +err_ioremap: + kfree(pf); +err_pf_alloc: + pci_disable_pcie_error_reporting(pdev); + pci_release_mem_regions(pdev); +err_pci_reg: +err_dma: + pci_disable_device(pdev); + return err; +} + +/** + * i40e_remove - Device removal routine + * @pdev: PCI device information struct + * + * i40e_remove is called by the PCI subsystem to alert the driver + * that is should release a PCI device. This could be caused by a + * Hot-Plug event, or because the driver is going to be removed from + * memory. + **/ +static void i40e_remove(struct pci_dev *pdev) +{ + struct i40e_pf *pf = pci_get_drvdata(pdev); + struct i40e_hw *hw = &pf->hw; + i40e_status ret_code; + int i; + + i40e_dbg_pf_exit(pf); + + i40e_ptp_stop(pf); + + /* Disable RSS in hw */ + i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), 0); + i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), 0); + + /* no more scheduling of any task */ + set_bit(__I40E_SUSPENDED, pf->state); + set_bit(__I40E_DOWN, pf->state); + if (pf->service_timer.function) + del_timer_sync(&pf->service_timer); + if (pf->service_task.func) + cancel_work_sync(&pf->service_task); + + /* Client close must be called explicitly here because the timer + * has been stopped. + */ + i40e_notify_client_of_netdev_close(pf->vsi[pf->lan_vsi], false); + + if (pf->flags & I40E_FLAG_SRIOV_ENABLED) { + i40e_free_vfs(pf); + pf->flags &= ~I40E_FLAG_SRIOV_ENABLED; + } + + i40e_fdir_teardown(pf); + + /* If there is a switch structure or any orphans, remove them. + * This will leave only the PF's VSI remaining. + */ + for (i = 0; i < I40E_MAX_VEB; i++) { + if (!pf->veb[i]) + continue; + + if (pf->veb[i]->uplink_seid == pf->mac_seid || + pf->veb[i]->uplink_seid == 0) + i40e_switch_branch_release(pf->veb[i]); + } + + /* Now we can shutdown the PF's VSI, just before we kill + * adminq and hmc. 
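+ * Teardown order matters: the LAN VSI and any attached clients go first,
+ * then the HMC, then the admin queue, and the AQ mutexes are destroyed
+ * only after the admin queue has been shut down.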
+ */ + if (pf->vsi[pf->lan_vsi]) + i40e_vsi_release(pf->vsi[pf->lan_vsi]); + + i40e_cloud_filter_exit(pf); + + /* remove attached clients */ + if (pf->flags & I40E_FLAG_IWARP_ENABLED) { + ret_code = i40e_lan_del_device(pf); + if (ret_code) + dev_warn(&pdev->dev, "Failed to delete client device: %d\n", + ret_code); + } + + /* shutdown and destroy the HMC */ + if (hw->hmc.hmc_obj) { + ret_code = i40e_shutdown_lan_hmc(hw); + if (ret_code) + dev_warn(&pdev->dev, + "Failed to destroy the HMC resources: %d\n", + ret_code); + } + + /* shutdown the adminq */ + i40e_shutdown_adminq(hw); + + /* destroy the locks only once, here */ + mutex_destroy(&hw->aq.arq_mutex); + mutex_destroy(&hw->aq.asq_mutex); + + /* Clear all dynamic memory lists of rings, q_vectors, and VSIs */ + i40e_clear_interrupt_scheme(pf); + for (i = 0; i < pf->num_alloc_vsi; i++) { + if (pf->vsi[i]) { + i40e_vsi_clear_rings(pf->vsi[i]); + i40e_vsi_clear(pf->vsi[i]); + pf->vsi[i] = NULL; + } + } + + for (i = 0; i < I40E_MAX_VEB; i++) { + kfree(pf->veb[i]); + pf->veb[i] = NULL; + } + + kfree(pf->qp_pile); + kfree(pf->vsi); + + iounmap(hw->hw_addr); + kfree(pf); + pci_release_mem_regions(pdev); + + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); +} + +/** + * i40e_pci_error_detected - warning that something funky happened in PCI land + * @pdev: PCI device information struct + * + * Called to warn that something happened and the error handling steps + * are in progress. Allows the driver to quiesce things, be ready for + * remediation. + **/ +static pci_ers_result_t i40e_pci_error_detected(struct pci_dev *pdev, + enum pci_channel_state error) +{ + struct i40e_pf *pf = pci_get_drvdata(pdev); + + dev_info(&pdev->dev, "%s: error %d\n", __func__, error); + + if (!pf) { + dev_info(&pdev->dev, + "Cannot recover - error happened during device probe\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + + /* shutdown all operations */ + if (!test_bit(__I40E_SUSPENDED, pf->state)) + i40e_prep_for_reset(pf, false); + + /* Request a slot reset */ + return PCI_ERS_RESULT_NEED_RESET; +} + +/** + * i40e_pci_error_slot_reset - a PCI slot reset just happened + * @pdev: PCI device information struct + * + * Called to find if the driver can work with the device now that + * the pci slot has been reset. If a basic connection seems good + * (registers are readable and have sane content) then return a + * happy little PCI_ERS_RESULT_xxx. 
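+ *
+ * The sanity check used below is a read of I40E_GLGEN_RTRIG after
+ * re-enabling the device: zero yields PCI_ERS_RESULT_RECOVERED,
+ * anything else PCI_ERS_RESULT_DISCONNECT.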
+ **/ +static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev) +{ + struct i40e_pf *pf = pci_get_drvdata(pdev); + pci_ers_result_t result; + int err; + u32 reg; + + dev_dbg(&pdev->dev, "%s\n", __func__); + if (pci_enable_device_mem(pdev)) { + dev_info(&pdev->dev, + "Cannot re-enable PCI device after reset.\n"); + result = PCI_ERS_RESULT_DISCONNECT; + } else { + pci_set_master(pdev); + pci_restore_state(pdev); + pci_save_state(pdev); + pci_wake_from_d3(pdev, false); + + reg = rd32(&pf->hw, I40E_GLGEN_RTRIG); + if (reg == 0) + result = PCI_ERS_RESULT_RECOVERED; + else + result = PCI_ERS_RESULT_DISCONNECT; + } + + err = pci_cleanup_aer_uncorrect_error_status(pdev); + if (err) { + dev_info(&pdev->dev, + "pci_cleanup_aer_uncorrect_error_status failed 0x%0x\n", + err); + /* non-fatal, continue */ + } + + return result; +} + +/** + * i40e_pci_error_reset_prepare - prepare device driver for pci reset + * @pdev: PCI device information struct + */ +static void i40e_pci_error_reset_prepare(struct pci_dev *pdev) +{ + struct i40e_pf *pf = pci_get_drvdata(pdev); + + i40e_prep_for_reset(pf, false); +} + +/** + * i40e_pci_error_reset_done - pci reset done, device driver reset can begin + * @pdev: PCI device information struct + */ +static void i40e_pci_error_reset_done(struct pci_dev *pdev) +{ + struct i40e_pf *pf = pci_get_drvdata(pdev); + + i40e_reset_and_rebuild(pf, false, false); +} + +/** + * i40e_pci_error_resume - restart operations after PCI error recovery + * @pdev: PCI device information struct + * + * Called to allow the driver to bring things back up after PCI error + * and/or reset recovery has finished. + **/ +static void i40e_pci_error_resume(struct pci_dev *pdev) +{ + struct i40e_pf *pf = pci_get_drvdata(pdev); + + dev_dbg(&pdev->dev, "%s\n", __func__); + if (test_bit(__I40E_SUSPENDED, pf->state)) + return; + + i40e_handle_reset_warning(pf, false); +} + +/** + * i40e_enable_mc_magic_wake - enable multicast magic packet wake up + * using the mac_address_write admin q function + * @pf: pointer to i40e_pf struct + **/ +static void i40e_enable_mc_magic_wake(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + i40e_status ret; + u8 mac_addr[6]; + u16 flags = 0; + + /* Get current MAC address in case it's an LAA */ + if (pf->vsi[pf->lan_vsi] && pf->vsi[pf->lan_vsi]->netdev) { + ether_addr_copy(mac_addr, + pf->vsi[pf->lan_vsi]->netdev->dev_addr); + } else { + dev_err(&pf->pdev->dev, + "Failed to retrieve MAC address; using default\n"); + ether_addr_copy(mac_addr, hw->mac.addr); + } + + /* The FW expects the mac address write cmd to first be called with + * one of these flags before calling it again with the multicast + * enable flags. 
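+ * Concretely: the first write uses I40E_AQC_WRITE_TYPE_LAA_WOL (or
+ * I40E_AQC_WRITE_TYPE_LAA_ONLY on a flex10 function whose partition is
+ * not 1), and the second uses I40E_AQC_MC_MAG_EN |
+ * I40E_AQC_WOL_PRESERVE_ON_PFR | I40E_AQC_WRITE_TYPE_UPDATE_MC_MAG.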
+ */ + flags = I40E_AQC_WRITE_TYPE_LAA_WOL; + + if (hw->func_caps.flex10_enable && hw->partition_id != 1) + flags = I40E_AQC_WRITE_TYPE_LAA_ONLY; + + ret = i40e_aq_mac_address_write(hw, flags, mac_addr, NULL); + if (ret) { + dev_err(&pf->pdev->dev, + "Failed to update MAC address registers; cannot enable Multicast Magic packet wake up"); + return; + } + + flags = I40E_AQC_MC_MAG_EN + | I40E_AQC_WOL_PRESERVE_ON_PFR + | I40E_AQC_WRITE_TYPE_UPDATE_MC_MAG; + ret = i40e_aq_mac_address_write(hw, flags, mac_addr, NULL); + if (ret) + dev_err(&pf->pdev->dev, + "Failed to enable Multicast Magic Packet wake up\n"); +} + +/** + * i40e_shutdown - PCI callback for shutting down + * @pdev: PCI device information struct + **/ +static void i40e_shutdown(struct pci_dev *pdev) +{ + struct i40e_pf *pf = pci_get_drvdata(pdev); + struct i40e_hw *hw = &pf->hw; + + set_bit(__I40E_SUSPENDED, pf->state); + set_bit(__I40E_DOWN, pf->state); + rtnl_lock(); + i40e_prep_for_reset(pf, true); + rtnl_unlock(); + + wr32(hw, I40E_PFPM_APM, (pf->wol_en ? I40E_PFPM_APM_APME_MASK : 0)); + wr32(hw, I40E_PFPM_WUFC, (pf->wol_en ? I40E_PFPM_WUFC_MAG_MASK : 0)); + + del_timer_sync(&pf->service_timer); + cancel_work_sync(&pf->service_task); + i40e_cloud_filter_exit(pf); + i40e_fdir_teardown(pf); + + /* Client close must be called explicitly here because the timer + * has been stopped. + */ + i40e_notify_client_of_netdev_close(pf->vsi[pf->lan_vsi], false); + + if (pf->wol_en && (pf->hw_features & I40E_HW_WOL_MC_MAGIC_PKT_WAKE)) + i40e_enable_mc_magic_wake(pf); + + i40e_prep_for_reset(pf, false); + + wr32(hw, I40E_PFPM_APM, + (pf->wol_en ? I40E_PFPM_APM_APME_MASK : 0)); + wr32(hw, I40E_PFPM_WUFC, + (pf->wol_en ? I40E_PFPM_WUFC_MAG_MASK : 0)); + + i40e_clear_interrupt_scheme(pf); + + if (system_state == SYSTEM_POWER_OFF) { + pci_wake_from_d3(pdev, pf->wol_en); + pci_set_power_state(pdev, PCI_D3hot); + } +} + +/** + * i40e_suspend - PM callback for moving to D3 + * @dev: generic device information structure + **/ +static int __maybe_unused i40e_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct i40e_pf *pf = pci_get_drvdata(pdev); + struct i40e_hw *hw = &pf->hw; + + /* If we're already suspended, then there is nothing to do */ + if (test_and_set_bit(__I40E_SUSPENDED, pf->state)) + return 0; + + set_bit(__I40E_DOWN, pf->state); + + /* Ensure service task will not be running */ + del_timer_sync(&pf->service_timer); + cancel_work_sync(&pf->service_task); + + /* Client close must be called explicitly here because the timer + * has been stopped. + */ + i40e_notify_client_of_netdev_close(pf->vsi[pf->lan_vsi], false); + + if (pf->wol_en && (pf->hw_features & I40E_HW_WOL_MC_MAGIC_PKT_WAKE)) + i40e_enable_mc_magic_wake(pf); + + /* Since we're going to destroy queues during the + * i40e_clear_interrupt_scheme() we should hold the RTNL lock for this + * whole section + */ + rtnl_lock(); + + i40e_prep_for_reset(pf, true); + + wr32(hw, I40E_PFPM_APM, (pf->wol_en ? I40E_PFPM_APM_APME_MASK : 0)); + wr32(hw, I40E_PFPM_WUFC, (pf->wol_en ? I40E_PFPM_WUFC_MAG_MASK : 0)); + + /* Clear the interrupt scheme and release our IRQs so that the system + * can safely hibernate even when there are a large number of CPUs. + * Otherwise hibernation might fail when mapping all the vectors back + * to CPU0. 
+ */ + i40e_clear_interrupt_scheme(pf); + + rtnl_unlock(); + + return 0; +} + +/** + * i40e_resume - PM callback for waking up from D3 + * @dev: generic device information structure + **/ +static int __maybe_unused i40e_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct i40e_pf *pf = pci_get_drvdata(pdev); + int err; + + /* If we're not suspended, then there is nothing to do */ + if (!test_bit(__I40E_SUSPENDED, pf->state)) + return 0; + + /* We need to hold the RTNL lock prior to restoring interrupt schemes, + * since we're going to be restoring queues + */ + rtnl_lock(); + + /* We cleared the interrupt scheme when we suspended, so we need to + * restore it now to resume device functionality. + */ + err = i40e_restore_interrupt_scheme(pf); + if (err) { + dev_err(&pdev->dev, "Cannot restore interrupt scheme: %d\n", + err); + } + + clear_bit(__I40E_DOWN, pf->state); + i40e_reset_and_rebuild(pf, false, true); + + rtnl_unlock(); + + /* Clear suspended state last after everything is recovered */ + clear_bit(__I40E_SUSPENDED, pf->state); + + /* Restart the service task */ + mod_timer(&pf->service_timer, + round_jiffies(jiffies + pf->service_timer_period)); + + return 0; +} + +static const struct pci_error_handlers i40e_err_handler = { + .error_detected = i40e_pci_error_detected, + .slot_reset = i40e_pci_error_slot_reset, + .reset_prepare = i40e_pci_error_reset_prepare, + .reset_done = i40e_pci_error_reset_done, + .resume = i40e_pci_error_resume, +}; + +static SIMPLE_DEV_PM_OPS(i40e_pm_ops, i40e_suspend, i40e_resume); + +static struct pci_driver i40e_driver = { + .name = i40e_driver_name, + .id_table = i40e_pci_tbl, + .probe = i40e_probe, + .remove = i40e_remove, + .driver = { + .pm = &i40e_pm_ops, + }, + .shutdown = i40e_shutdown, + .err_handler = &i40e_err_handler, + .sriov_configure = i40e_pci_sriov_configure, +}; + +/** + * i40e_init_module - Driver registration routine + * + * i40e_init_module is the first routine called when the driver is + * loaded. All it does is register with the PCI subsystem. + **/ +static int __init i40e_init_module(void) +{ + pr_info("%s: %s - version %s\n", i40e_driver_name, + i40e_driver_string, i40e_driver_version_str); + pr_info("%s: %s\n", i40e_driver_name, i40e_copyright); + + /* There is no need to throttle the number of active tasks because + * each device limits its own task using a state bit for scheduling + * the service task, and the device tasks do not interfere with each + * other, so we don't set a max task limit. We must set WQ_MEM_RECLAIM + * since we need to be able to guarantee forward progress even under + * memory pressure. + */ + i40e_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, i40e_driver_name); + if (!i40e_wq) { + pr_err("%s: Failed to create workqueue\n", i40e_driver_name); + return -ENOMEM; + } + + i40e_dbg_init(); + return pci_register_driver(&i40e_driver); +} +module_init(i40e_init_module); + +/** + * i40e_exit_module - Driver exit cleanup routine + * + * i40e_exit_module is called just before the driver is removed + * from memory. 
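+ *
+ * It unregisters the PCI driver, destroys the i40e_wq workqueue and
+ * tears down the debugfs entries created by i40e_init_module().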
+ **/ +static void __exit i40e_exit_module(void) +{ + pci_unregister_driver(&i40e_driver); + destroy_workqueue(i40e_wq); + i40e_dbg_exit(); +} +module_exit(i40e_exit_module); diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c new file mode 100644 index 0000000..ba9687c --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c @@ -0,0 +1,1590 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include "i40e_prototype.h" + +/** + * i40e_init_nvm_ops - Initialize NVM function pointers + * @hw: pointer to the HW structure + * + * Setup the function pointers and the NVM info structure. Should be called + * once per NVM initialization, e.g. inside the i40e_init_shared_code(). + * Please notice that the NVM term is used here (& in all methods covered + * in this file) as an equivalent of the FLASH part mapped into the SR. + * We are accessing FLASH always thru the Shadow RAM. + **/ +i40e_status i40e_init_nvm(struct i40e_hw *hw) +{ + struct i40e_nvm_info *nvm = &hw->nvm; + i40e_status ret_code = 0; + u32 fla, gens; + u8 sr_size; + + /* The SR size is stored regardless of the nvm programming mode + * as the blank mode may be used in the factory line. + */ + gens = rd32(hw, I40E_GLNVM_GENS); + sr_size = ((gens & I40E_GLNVM_GENS_SR_SIZE_MASK) >> + I40E_GLNVM_GENS_SR_SIZE_SHIFT); + /* Switching to words (sr_size contains power of 2KB) */ + nvm->sr_size = BIT(sr_size) * I40E_SR_WORDS_IN_1KB; + + /* Check if we are in the normal or blank NVM programming mode */ + fla = rd32(hw, I40E_GLNVM_FLA); + if (fla & I40E_GLNVM_FLA_LOCKED_MASK) { /* Normal programming mode */ + /* Max NVM timeout */ + nvm->timeout = I40E_MAX_NVM_TIMEOUT; + nvm->blank_nvm_mode = false; + } else { /* Blank programming mode */ + nvm->blank_nvm_mode = true; + ret_code = I40E_ERR_NVM_BLANK_MODE; + i40e_debug(hw, I40E_DEBUG_NVM, "NVM init error: unsupported blank mode.\n"); + } + + return ret_code; +} + +/** + * i40e_acquire_nvm - Generic request for acquiring the NVM ownership + * @hw: pointer to the HW structure + * @access: NVM access type (read or write) + * + * This function will request NVM ownership for reading + * via the proper Admin Command. 
+ **/ +i40e_status i40e_acquire_nvm(struct i40e_hw *hw, + enum i40e_aq_resource_access_type access) +{ + i40e_status ret_code = 0; + u64 gtime, timeout; + u64 time_left = 0; + + if (hw->nvm.blank_nvm_mode) + goto i40e_i40e_acquire_nvm_exit; + + ret_code = i40e_aq_request_resource(hw, I40E_NVM_RESOURCE_ID, access, + 0, &time_left, NULL); + /* Reading the Global Device Timer */ + gtime = rd32(hw, I40E_GLVFGEN_TIMER); + + /* Store the timeout */ + hw->nvm.hw_semaphore_timeout = I40E_MS_TO_GTIME(time_left) + gtime; + + if (ret_code) + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM acquire type %d failed time_left=%llu ret=%d aq_err=%d\n", + access, time_left, ret_code, hw->aq.asq_last_status); + + if (ret_code && time_left) { + /* Poll until the current NVM owner timeouts */ + timeout = I40E_MS_TO_GTIME(I40E_MAX_NVM_TIMEOUT) + gtime; + while ((gtime < timeout) && time_left) { + usleep_range(10000, 20000); + gtime = rd32(hw, I40E_GLVFGEN_TIMER); + ret_code = i40e_aq_request_resource(hw, + I40E_NVM_RESOURCE_ID, + access, 0, &time_left, + NULL); + if (!ret_code) { + hw->nvm.hw_semaphore_timeout = + I40E_MS_TO_GTIME(time_left) + gtime; + break; + } + } + if (ret_code) { + hw->nvm.hw_semaphore_timeout = 0; + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM acquire timed out, wait %llu ms before trying again. status=%d aq_err=%d\n", + time_left, ret_code, hw->aq.asq_last_status); + } + } + +i40e_i40e_acquire_nvm_exit: + return ret_code; +} + +/** + * i40e_release_nvm - Generic request for releasing the NVM ownership + * @hw: pointer to the HW structure + * + * This function will release NVM resource via the proper Admin Command. + **/ +void i40e_release_nvm(struct i40e_hw *hw) +{ + i40e_status ret_code = I40E_SUCCESS; + u32 total_delay = 0; + + if (hw->nvm.blank_nvm_mode) + return; + + ret_code = i40e_aq_release_resource(hw, I40E_NVM_RESOURCE_ID, 0, NULL); + + /* there are some rare cases when trying to release the resource + * results in an admin Q timeout, so handle them correctly + */ + while ((ret_code == I40E_ERR_ADMIN_QUEUE_TIMEOUT) && + (total_delay < hw->aq.asq_cmd_timeout)) { + usleep_range(1000, 2000); + ret_code = i40e_aq_release_resource(hw, + I40E_NVM_RESOURCE_ID, + 0, NULL); + total_delay++; + } +} + +/** + * i40e_poll_sr_srctl_done_bit - Polls the GLNVM_SRCTL done bit + * @hw: pointer to the HW structure + * + * Polls the SRCTL Shadow RAM register done bit. + **/ +static i40e_status i40e_poll_sr_srctl_done_bit(struct i40e_hw *hw) +{ + i40e_status ret_code = I40E_ERR_TIMEOUT; + u32 srctl, wait_cnt; + + /* Poll the I40E_GLNVM_SRCTL until the done bit is set */ + for (wait_cnt = 0; wait_cnt < I40E_SRRD_SRCTL_ATTEMPTS; wait_cnt++) { + srctl = rd32(hw, I40E_GLNVM_SRCTL); + if (srctl & I40E_GLNVM_SRCTL_DONE_MASK) { + ret_code = 0; + break; + } + udelay(5); + } + if (ret_code == I40E_ERR_TIMEOUT) + i40e_debug(hw, I40E_DEBUG_NVM, "Done bit in GLNVM_SRCTL not set"); + return ret_code; +} + +/** + * i40e_read_nvm_word_srctl - Reads Shadow RAM via SRCTL register + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @data: word read from the Shadow RAM + * + * Reads one 16 bit word from the Shadow RAM using the GLNVM_SRCTL register. 
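+ *
+ * The register sequence is: poll the DONE bit, write the word offset and
+ * the START bit into GLNVM_SRCTL, poll DONE again, then read the result
+ * from the RDDATA field of GLNVM_SRDATA.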
+ **/ +static i40e_status i40e_read_nvm_word_srctl(struct i40e_hw *hw, u16 offset, + u16 *data) +{ + i40e_status ret_code = I40E_ERR_TIMEOUT; + u32 sr_reg; + + if (offset >= hw->nvm.sr_size) { + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM read error: offset %d beyond Shadow RAM limit %d\n", + offset, hw->nvm.sr_size); + ret_code = I40E_ERR_PARAM; + goto read_nvm_exit; + } + + /* Poll the done bit first */ + ret_code = i40e_poll_sr_srctl_done_bit(hw); + if (!ret_code) { + /* Write the address and start reading */ + sr_reg = ((u32)offset << I40E_GLNVM_SRCTL_ADDR_SHIFT) | + BIT(I40E_GLNVM_SRCTL_START_SHIFT); + wr32(hw, I40E_GLNVM_SRCTL, sr_reg); + + /* Poll I40E_GLNVM_SRCTL until the done bit is set */ + ret_code = i40e_poll_sr_srctl_done_bit(hw); + if (!ret_code) { + sr_reg = rd32(hw, I40E_GLNVM_SRDATA); + *data = (u16)((sr_reg & + I40E_GLNVM_SRDATA_RDDATA_MASK) + >> I40E_GLNVM_SRDATA_RDDATA_SHIFT); + } + } + if (ret_code) + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM read error: Couldn't access Shadow RAM address: 0x%x\n", + offset); + +read_nvm_exit: + return ret_code; +} + +/** + * i40e_read_nvm_aq - Read Shadow RAM. + * @hw: pointer to the HW structure. + * @module_pointer: module pointer location in words from the NVM beginning + * @offset: offset in words from module start + * @words: number of words to write + * @data: buffer with words to write to the Shadow RAM + * @last_command: tells the AdminQ that this is the last command + * + * Writes a 16 bit words buffer to the Shadow RAM using the admin command. + **/ +static i40e_status i40e_read_nvm_aq(struct i40e_hw *hw, + u8 module_pointer, u32 offset, + u16 words, void *data, + bool last_command) +{ + i40e_status ret_code = I40E_ERR_NVM; + struct i40e_asq_cmd_details cmd_details; + + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + /* Here we are checking the SR limit only for the flat memory model. + * We cannot do it for the module-based model, as we did not acquire + * the NVM resource yet (we cannot get the module pointer value). + * Firmware will check the module-based model. 
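+ * The checks below therefore only reject requests that run past sr_size,
+ * ask for more than one sector (I40E_SR_SECTOR_SIZE_IN_WORDS) per AQ
+ * command, or straddle a sector boundary; everything else is handed to
+ * i40e_aq_read_nvm() with the offset and length converted to bytes.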
+ */ + if ((offset + words) > hw->nvm.sr_size) + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM write error: offset %d beyond Shadow RAM limit %d\n", + (offset + words), hw->nvm.sr_size); + else if (words > I40E_SR_SECTOR_SIZE_IN_WORDS) + /* We can write only up to 4KB (one sector), in one AQ write */ + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM write fail error: tried to write %d words, limit is %d.\n", + words, I40E_SR_SECTOR_SIZE_IN_WORDS); + else if (((offset + (words - 1)) / I40E_SR_SECTOR_SIZE_IN_WORDS) + != (offset / I40E_SR_SECTOR_SIZE_IN_WORDS)) + /* A single write cannot spread over two sectors */ + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM write error: cannot spread over two sectors in a single write offset=%d words=%d\n", + offset, words); + else + ret_code = i40e_aq_read_nvm(hw, module_pointer, + 2 * offset, /*bytes*/ + 2 * words, /*bytes*/ + data, last_command, &cmd_details); + + return ret_code; +} + +/** + * i40e_read_nvm_word_aq - Reads Shadow RAM via AQ + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @data: word read from the Shadow RAM + * + * Reads one 16 bit word from the Shadow RAM using the AdminQ + **/ +static i40e_status i40e_read_nvm_word_aq(struct i40e_hw *hw, u16 offset, + u16 *data) +{ + i40e_status ret_code = I40E_ERR_TIMEOUT; + + ret_code = i40e_read_nvm_aq(hw, 0x0, offset, 1, data, true); + *data = le16_to_cpu(*(__le16 *)data); + + return ret_code; +} + +/** + * __i40e_read_nvm_word - Reads nvm word, assumes caller does the locking + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @data: word read from the Shadow RAM + * + * Reads one 16 bit word from the Shadow RAM. + * + * Do not use this function except in cases where the nvm lock is already + * taken via i40e_acquire_nvm(). + **/ +static i40e_status __i40e_read_nvm_word(struct i40e_hw *hw, + u16 offset, u16 *data) +{ + if (hw->flags & I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE) + return i40e_read_nvm_word_aq(hw, offset, data); + + return i40e_read_nvm_word_srctl(hw, offset, data); +} + +/** + * i40e_read_nvm_word - Reads nvm word and acquire lock if necessary + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @data: word read from the Shadow RAM + * + * Reads one 16 bit word from the Shadow RAM. + **/ +i40e_status i40e_read_nvm_word(struct i40e_hw *hw, u16 offset, + u16 *data) +{ + i40e_status ret_code = 0; + + if (hw->flags & I40E_HW_FLAG_NVM_READ_REQUIRES_LOCK) + ret_code = i40e_acquire_nvm(hw, I40E_RESOURCE_READ); + if (ret_code) + return ret_code; + + ret_code = __i40e_read_nvm_word(hw, offset, data); + + if (hw->flags & I40E_HW_FLAG_NVM_READ_REQUIRES_LOCK) + i40e_release_nvm(hw); + + return ret_code; +} + +/** + * i40e_read_nvm_buffer_srctl - Reads Shadow RAM buffer via SRCTL register + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF). + * @words: (in) number of words to read; (out) number of words actually read + * @data: words read from the Shadow RAM + * + * Reads 16 bit words (data buffer) from the SR using the i40e_read_nvm_srrd() + * method. The buffer read is preceded by the NVM ownership take + * and followed by the release. 
+ **/ +static i40e_status i40e_read_nvm_buffer_srctl(struct i40e_hw *hw, u16 offset, + u16 *words, u16 *data) +{ + i40e_status ret_code = 0; + u16 index, word; + + /* Loop thru the selected region */ + for (word = 0; word < *words; word++) { + index = offset + word; + ret_code = i40e_read_nvm_word_srctl(hw, index, &data[word]); + if (ret_code) + break; + } + + /* Update the number of words read from the Shadow RAM */ + *words = word; + + return ret_code; +} + +/** + * i40e_read_nvm_buffer_aq - Reads Shadow RAM buffer via AQ + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF). + * @words: (in) number of words to read; (out) number of words actually read + * @data: words read from the Shadow RAM + * + * Reads 16 bit words (data buffer) from the SR using the i40e_read_nvm_aq() + * method. The buffer read is preceded by the NVM ownership take + * and followed by the release. + **/ +static i40e_status i40e_read_nvm_buffer_aq(struct i40e_hw *hw, u16 offset, + u16 *words, u16 *data) +{ + i40e_status ret_code; + u16 read_size; + bool last_cmd = false; + u16 words_read = 0; + u16 i = 0; + + do { + /* Calculate number of bytes we should read in this step. + * FVL AQ do not allow to read more than one page at a time or + * to cross page boundaries. + */ + if (offset % I40E_SR_SECTOR_SIZE_IN_WORDS) + read_size = min(*words, + (u16)(I40E_SR_SECTOR_SIZE_IN_WORDS - + (offset % I40E_SR_SECTOR_SIZE_IN_WORDS))); + else + read_size = min((*words - words_read), + I40E_SR_SECTOR_SIZE_IN_WORDS); + + /* Check if this is last command, if so set proper flag */ + if ((words_read + read_size) >= *words) + last_cmd = true; + + ret_code = i40e_read_nvm_aq(hw, 0x0, offset, read_size, + data + words_read, last_cmd); + if (ret_code) + goto read_nvm_buffer_aq_exit; + + /* Increment counter for words already read and move offset to + * new read location + */ + words_read += read_size; + offset += read_size; + } while (words_read < *words); + + for (i = 0; i < *words; i++) + data[i] = le16_to_cpu(((__le16 *)data)[i]); + +read_nvm_buffer_aq_exit: + *words = words_read; + return ret_code; +} + +/** + * __i40e_read_nvm_buffer - Reads nvm buffer, caller must acquire lock + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF). + * @words: (in) number of words to read; (out) number of words actually read + * @data: words read from the Shadow RAM + * + * Reads 16 bit words (data buffer) from the SR using the i40e_read_nvm_srrd() + * method. + **/ +static i40e_status __i40e_read_nvm_buffer(struct i40e_hw *hw, + u16 offset, u16 *words, + u16 *data) +{ + if (hw->flags & I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE) + return i40e_read_nvm_buffer_aq(hw, offset, words, data); + + return i40e_read_nvm_buffer_srctl(hw, offset, words, data); +} + +/** + * i40e_write_nvm_aq - Writes Shadow RAM. + * @hw: pointer to the HW structure. + * @module_pointer: module pointer location in words from the NVM beginning + * @offset: offset in words from module start + * @words: number of words to write + * @data: buffer with words to write to the Shadow RAM + * @last_command: tells the AdminQ that this is the last command + * + * Writes a 16 bit words buffer to the Shadow RAM using the admin command. 
+ **/ +static i40e_status i40e_write_nvm_aq(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 words, void *data, + bool last_command) +{ + i40e_status ret_code = I40E_ERR_NVM; + struct i40e_asq_cmd_details cmd_details; + + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + /* Here we are checking the SR limit only for the flat memory model. + * We cannot do it for the module-based model, as we did not acquire + * the NVM resource yet (we cannot get the module pointer value). + * Firmware will check the module-based model. + */ + if ((offset + words) > hw->nvm.sr_size) + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM write error: offset %d beyond Shadow RAM limit %d\n", + (offset + words), hw->nvm.sr_size); + else if (words > I40E_SR_SECTOR_SIZE_IN_WORDS) + /* We can write only up to 4KB (one sector), in one AQ write */ + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM write fail error: tried to write %d words, limit is %d.\n", + words, I40E_SR_SECTOR_SIZE_IN_WORDS); + else if (((offset + (words - 1)) / I40E_SR_SECTOR_SIZE_IN_WORDS) + != (offset / I40E_SR_SECTOR_SIZE_IN_WORDS)) + /* A single write cannot spread over two sectors */ + i40e_debug(hw, I40E_DEBUG_NVM, + "NVM write error: cannot spread over two sectors in a single write offset=%d words=%d\n", + offset, words); + else + ret_code = i40e_aq_update_nvm(hw, module_pointer, + 2 * offset, /*bytes*/ + 2 * words, /*bytes*/ + data, last_command, 0, + &cmd_details); + + return ret_code; +} + +/** + * i40e_calc_nvm_checksum - Calculates and returns the checksum + * @hw: pointer to hardware structure + * @checksum: pointer to the checksum + * + * This function calculates SW Checksum that covers the whole 64kB shadow RAM + * except the VPD and PCIe ALT Auto-load modules. The structure and size of VPD + * is customer specific and unknown. Therefore, this function skips all maximum + * possible size of VPD (1kB). 
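+ *
+ * The loop below walks the shadow RAM one sector
+ * (I40E_SR_SECTOR_SIZE_IN_WORDS words) at a time, skipping the checksum
+ * word itself plus the VPD and PCIe ALT auto-load module ranges, and the
+ * result is I40E_SR_SW_CHECKSUM_BASE minus the running 16-bit sum.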
+ **/ +static i40e_status i40e_calc_nvm_checksum(struct i40e_hw *hw, + u16 *checksum) +{ + i40e_status ret_code; + struct i40e_virt_mem vmem; + u16 pcie_alt_module = 0; + u16 checksum_local = 0; + u16 vpd_module = 0; + u16 *data; + u16 i = 0; + + ret_code = i40e_allocate_virt_mem(hw, &vmem, + I40E_SR_SECTOR_SIZE_IN_WORDS * sizeof(u16)); + if (ret_code) + goto i40e_calc_nvm_checksum_exit; + data = (u16 *)vmem.va; + + /* read pointer to VPD area */ + ret_code = __i40e_read_nvm_word(hw, I40E_SR_VPD_PTR, &vpd_module); + if (ret_code) { + ret_code = I40E_ERR_NVM_CHECKSUM; + goto i40e_calc_nvm_checksum_exit; + } + + /* read pointer to PCIe Alt Auto-load module */ + ret_code = __i40e_read_nvm_word(hw, I40E_SR_PCIE_ALT_AUTO_LOAD_PTR, + &pcie_alt_module); + if (ret_code) { + ret_code = I40E_ERR_NVM_CHECKSUM; + goto i40e_calc_nvm_checksum_exit; + } + + /* Calculate SW checksum that covers the whole 64kB shadow RAM + * except the VPD and PCIe ALT Auto-load modules + */ + for (i = 0; i < hw->nvm.sr_size; i++) { + /* Read SR page */ + if ((i % I40E_SR_SECTOR_SIZE_IN_WORDS) == 0) { + u16 words = I40E_SR_SECTOR_SIZE_IN_WORDS; + + ret_code = __i40e_read_nvm_buffer(hw, i, &words, data); + if (ret_code) { + ret_code = I40E_ERR_NVM_CHECKSUM; + goto i40e_calc_nvm_checksum_exit; + } + } + + /* Skip Checksum word */ + if (i == I40E_SR_SW_CHECKSUM_WORD) + continue; + /* Skip VPD module (convert byte size to word count) */ + if ((i >= (u32)vpd_module) && + (i < ((u32)vpd_module + + (I40E_SR_VPD_MODULE_MAX_SIZE / 2)))) { + continue; + } + /* Skip PCIe ALT module (convert byte size to word count) */ + if ((i >= (u32)pcie_alt_module) && + (i < ((u32)pcie_alt_module + + (I40E_SR_PCIE_ALT_MODULE_MAX_SIZE / 2)))) { + continue; + } + + checksum_local += data[i % I40E_SR_SECTOR_SIZE_IN_WORDS]; + } + + *checksum = (u16)I40E_SR_SW_CHECKSUM_BASE - checksum_local; + +i40e_calc_nvm_checksum_exit: + i40e_free_virt_mem(hw, &vmem); + return ret_code; +} + +/** + * i40e_update_nvm_checksum - Updates the NVM checksum + * @hw: pointer to hardware structure + * + * NVM ownership must be acquired before calling this function and released + * on ARQ completion event reception by caller. + * This function will commit SR to NVM. + **/ +i40e_status i40e_update_nvm_checksum(struct i40e_hw *hw) +{ + i40e_status ret_code; + u16 checksum; + __le16 le_sum; + + ret_code = i40e_calc_nvm_checksum(hw, &checksum); + if (!ret_code) { + le_sum = cpu_to_le16(checksum); + ret_code = i40e_write_nvm_aq(hw, 0x00, I40E_SR_SW_CHECKSUM_WORD, + 1, &le_sum, true); + } + + return ret_code; +} + +/** + * i40e_validate_nvm_checksum - Validate EEPROM checksum + * @hw: pointer to hardware structure + * @checksum: calculated checksum + * + * Performs checksum calculation and validates the NVM SW checksum. If the + * caller does not need checksum, the value can be NULL. + **/ +i40e_status i40e_validate_nvm_checksum(struct i40e_hw *hw, + u16 *checksum) +{ + i40e_status ret_code = 0; + u16 checksum_sr = 0; + u16 checksum_local = 0; + + /* We must acquire the NVM lock in order to correctly synchronize the + * NVM accesses across multiple PFs. Without doing so it is possible + * for one of the PFs to read invalid data potentially indicating that + * the checksum is invalid. 
+ */ + ret_code = i40e_acquire_nvm(hw, I40E_RESOURCE_READ); + if (ret_code) + return ret_code; + ret_code = i40e_calc_nvm_checksum(hw, &checksum_local); + __i40e_read_nvm_word(hw, I40E_SR_SW_CHECKSUM_WORD, &checksum_sr); + i40e_release_nvm(hw); + if (ret_code) + return ret_code; + + /* Verify read checksum from EEPROM is the same as + * calculated checksum + */ + if (checksum_local != checksum_sr) + ret_code = I40E_ERR_NVM_CHECKSUM; + + /* If the user cares, return the calculated checksum */ + if (checksum) + *checksum = checksum_local; + + return ret_code; +} + +static i40e_status i40e_nvmupd_state_init(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno); +static i40e_status i40e_nvmupd_state_reading(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno); +static i40e_status i40e_nvmupd_state_writing(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *errno); +static enum i40e_nvmupd_cmd i40e_nvmupd_validate_command(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + int *perrno); +static i40e_status i40e_nvmupd_nvm_erase(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + int *perrno); +static i40e_status i40e_nvmupd_nvm_write(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno); +static i40e_status i40e_nvmupd_nvm_read(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno); +static i40e_status i40e_nvmupd_exec_aq(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno); +static i40e_status i40e_nvmupd_get_aq_result(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno); +static i40e_status i40e_nvmupd_get_aq_event(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno); +static inline u8 i40e_nvmupd_get_module(u32 val) +{ + return (u8)(val & I40E_NVM_MOD_PNT_MASK); +} +static inline u8 i40e_nvmupd_get_transaction(u32 val) +{ + return (u8)((val & I40E_NVM_TRANS_MASK) >> I40E_NVM_TRANS_SHIFT); +} + +static inline u8 i40e_nvmupd_get_preservation_flags(u32 val) +{ + return (u8)((val & I40E_NVM_PRESERVATION_FLAGS_MASK) >> + I40E_NVM_PRESERVATION_FLAGS_SHIFT); +} + +static const char * const i40e_nvm_update_state_str[] = { + "I40E_NVMUPD_INVALID", + "I40E_NVMUPD_READ_CON", + "I40E_NVMUPD_READ_SNT", + "I40E_NVMUPD_READ_LCB", + "I40E_NVMUPD_READ_SA", + "I40E_NVMUPD_WRITE_ERA", + "I40E_NVMUPD_WRITE_CON", + "I40E_NVMUPD_WRITE_SNT", + "I40E_NVMUPD_WRITE_LCB", + "I40E_NVMUPD_WRITE_SA", + "I40E_NVMUPD_CSUM_CON", + "I40E_NVMUPD_CSUM_SA", + "I40E_NVMUPD_CSUM_LCB", + "I40E_NVMUPD_STATUS", + "I40E_NVMUPD_EXEC_AQ", + "I40E_NVMUPD_GET_AQ_RESULT", + "I40E_NVMUPD_GET_AQ_EVENT", +}; + +/** + * i40e_nvmupd_command - Process an NVM update command + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * Dispatches command depending on what update state is current + **/ +i40e_status i40e_nvmupd_command(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + i40e_status status; + enum i40e_nvmupd_cmd upd_cmd; + + /* assume success */ + *perrno = 0; + + /* early check for status command and debug msgs */ + upd_cmd = i40e_nvmupd_validate_command(hw, cmd, perrno); + + i40e_debug(hw, I40E_DEBUG_NVM, "%s state %d nvm_release_on_hold %d opc 0x%04x cmd 0x%08x config 0x%08x offset 0x%08x data_size 0x%08x\n", + i40e_nvm_update_state_str[upd_cmd], + hw->nvmupd_state, + hw->nvm_release_on_done, hw->nvm_wait_opcode, + 
cmd->command, cmd->config, cmd->offset, cmd->data_size); + + if (upd_cmd == I40E_NVMUPD_INVALID) { + *perrno = -EFAULT; + i40e_debug(hw, I40E_DEBUG_NVM, + "i40e_nvmupd_validate_command returns %d errno %d\n", + upd_cmd, *perrno); + } + + /* a status request returns immediately rather than + * going into the state machine + */ + if (upd_cmd == I40E_NVMUPD_STATUS) { + if (!cmd->data_size) { + *perrno = -EFAULT; + return I40E_ERR_BUF_TOO_SHORT; + } + + bytes[0] = hw->nvmupd_state; + + if (cmd->data_size >= 4) { + bytes[1] = 0; + *((u16 *)&bytes[2]) = hw->nvm_wait_opcode; + } + + /* Clear error status on read */ + if (hw->nvmupd_state == I40E_NVMUPD_STATE_ERROR) + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + + return 0; + } + + /* Clear status even it is not read and log */ + if (hw->nvmupd_state == I40E_NVMUPD_STATE_ERROR) { + i40e_debug(hw, I40E_DEBUG_NVM, + "Clearing I40E_NVMUPD_STATE_ERROR state without reading\n"); + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + } + + /* Acquire lock to prevent race condition where adminq_task + * can execute after i40e_nvmupd_nvm_read/write but before state + * variables (nvm_wait_opcode, nvm_release_on_done) are updated. + * + * During NVMUpdate, it is observed that lock could be held for + * ~5ms for most commands. However lock is held for ~60ms for + * NVMUPD_CSUM_LCB command. + */ + mutex_lock(&hw->aq.arq_mutex); + switch (hw->nvmupd_state) { + case I40E_NVMUPD_STATE_INIT: + status = i40e_nvmupd_state_init(hw, cmd, bytes, perrno); + break; + + case I40E_NVMUPD_STATE_READING: + status = i40e_nvmupd_state_reading(hw, cmd, bytes, perrno); + break; + + case I40E_NVMUPD_STATE_WRITING: + status = i40e_nvmupd_state_writing(hw, cmd, bytes, perrno); + break; + + case I40E_NVMUPD_STATE_INIT_WAIT: + case I40E_NVMUPD_STATE_WRITE_WAIT: + /* if we need to stop waiting for an event, clear + * the wait info and return before doing anything else + */ + if (cmd->offset == 0xffff) { + i40e_nvmupd_clear_wait_state(hw); + status = 0; + break; + } + + status = I40E_ERR_NOT_READY; + *perrno = -EBUSY; + break; + + default: + /* invalid state, should never happen */ + i40e_debug(hw, I40E_DEBUG_NVM, + "NVMUPD: no such state %d\n", hw->nvmupd_state); + status = I40E_NOT_SUPPORTED; + *perrno = -ESRCH; + break; + } + + mutex_unlock(&hw->aq.arq_mutex); + return status; +} + +/** + * i40e_nvmupd_state_init - Handle NVM update state Init + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * Process legitimate commands of the Init state and conditionally set next + * state. Reject all other commands. 
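+ *
+ * From the Init state: READ_SA completes in place, READ_SNT moves to
+ * READING, WRITE_SNT moves to WRITE_WAIT, and WRITE_ERA/WRITE_SA/CSUM_SA
+ * move to INIT_WAIT with nvm_release_on_done set so the NVM lock is
+ * dropped when the awaited admin queue event arrives.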
+ **/ +static i40e_status i40e_nvmupd_state_init(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + i40e_status status = 0; + enum i40e_nvmupd_cmd upd_cmd; + + upd_cmd = i40e_nvmupd_validate_command(hw, cmd, perrno); + + switch (upd_cmd) { + case I40E_NVMUPD_READ_SA: + status = i40e_acquire_nvm(hw, I40E_RESOURCE_READ); + if (status) { + *perrno = i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status); + } else { + status = i40e_nvmupd_nvm_read(hw, cmd, bytes, perrno); + i40e_release_nvm(hw); + } + break; + + case I40E_NVMUPD_READ_SNT: + status = i40e_acquire_nvm(hw, I40E_RESOURCE_READ); + if (status) { + *perrno = i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status); + } else { + status = i40e_nvmupd_nvm_read(hw, cmd, bytes, perrno); + if (status) + i40e_release_nvm(hw); + else + hw->nvmupd_state = I40E_NVMUPD_STATE_READING; + } + break; + + case I40E_NVMUPD_WRITE_ERA: + status = i40e_acquire_nvm(hw, I40E_RESOURCE_WRITE); + if (status) { + *perrno = i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status); + } else { + status = i40e_nvmupd_nvm_erase(hw, cmd, perrno); + if (status) { + i40e_release_nvm(hw); + } else { + hw->nvm_release_on_done = true; + hw->nvm_wait_opcode = i40e_aqc_opc_nvm_erase; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT_WAIT; + } + } + break; + + case I40E_NVMUPD_WRITE_SA: + status = i40e_acquire_nvm(hw, I40E_RESOURCE_WRITE); + if (status) { + *perrno = i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status); + } else { + status = i40e_nvmupd_nvm_write(hw, cmd, bytes, perrno); + if (status) { + i40e_release_nvm(hw); + } else { + hw->nvm_release_on_done = true; + hw->nvm_wait_opcode = i40e_aqc_opc_nvm_update; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT_WAIT; + } + } + break; + + case I40E_NVMUPD_WRITE_SNT: + status = i40e_acquire_nvm(hw, I40E_RESOURCE_WRITE); + if (status) { + *perrno = i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status); + } else { + status = i40e_nvmupd_nvm_write(hw, cmd, bytes, perrno); + if (status) { + i40e_release_nvm(hw); + } else { + hw->nvm_wait_opcode = i40e_aqc_opc_nvm_update; + hw->nvmupd_state = I40E_NVMUPD_STATE_WRITE_WAIT; + } + } + break; + + case I40E_NVMUPD_CSUM_SA: + status = i40e_acquire_nvm(hw, I40E_RESOURCE_WRITE); + if (status) { + *perrno = i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status); + } else { + status = i40e_update_nvm_checksum(hw); + if (status) { + *perrno = hw->aq.asq_last_status ? + i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status) : + -EIO; + i40e_release_nvm(hw); + } else { + hw->nvm_release_on_done = true; + hw->nvm_wait_opcode = i40e_aqc_opc_nvm_update; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT_WAIT; + } + } + break; + + case I40E_NVMUPD_EXEC_AQ: + status = i40e_nvmupd_exec_aq(hw, cmd, bytes, perrno); + break; + + case I40E_NVMUPD_GET_AQ_RESULT: + status = i40e_nvmupd_get_aq_result(hw, cmd, bytes, perrno); + break; + + case I40E_NVMUPD_GET_AQ_EVENT: + status = i40e_nvmupd_get_aq_event(hw, cmd, bytes, perrno); + break; + + default: + i40e_debug(hw, I40E_DEBUG_NVM, + "NVMUPD: bad cmd %s in init state\n", + i40e_nvm_update_state_str[upd_cmd]); + status = I40E_ERR_NVM; + *perrno = -ESRCH; + break; + } + return status; +} + +/** + * i40e_nvmupd_state_reading - Handle NVM update state Reading + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * NVM ownership is already held. Process legitimate commands and set any + * change in state; reject all other commands. 
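+ *
+ * Only READ_SA, READ_CON and READ_LCB are accepted here; READ_LCB also
+ * releases the NVM and returns the state machine to Init.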
+ **/ +static i40e_status i40e_nvmupd_state_reading(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + i40e_status status = 0; + enum i40e_nvmupd_cmd upd_cmd; + + upd_cmd = i40e_nvmupd_validate_command(hw, cmd, perrno); + + switch (upd_cmd) { + case I40E_NVMUPD_READ_SA: + case I40E_NVMUPD_READ_CON: + status = i40e_nvmupd_nvm_read(hw, cmd, bytes, perrno); + break; + + case I40E_NVMUPD_READ_LCB: + status = i40e_nvmupd_nvm_read(hw, cmd, bytes, perrno); + i40e_release_nvm(hw); + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + break; + + default: + i40e_debug(hw, I40E_DEBUG_NVM, + "NVMUPD: bad cmd %s in reading state.\n", + i40e_nvm_update_state_str[upd_cmd]); + status = I40E_NOT_SUPPORTED; + *perrno = -ESRCH; + break; + } + return status; +} + +/** + * i40e_nvmupd_state_writing - Handle NVM update state Writing + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * NVM ownership is already held. Process legitimate commands and set any + * change in state; reject all other commands + **/ +static i40e_status i40e_nvmupd_state_writing(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + i40e_status status = 0; + enum i40e_nvmupd_cmd upd_cmd; + bool retry_attempt = false; + + upd_cmd = i40e_nvmupd_validate_command(hw, cmd, perrno); + +retry: + switch (upd_cmd) { + case I40E_NVMUPD_WRITE_CON: + status = i40e_nvmupd_nvm_write(hw, cmd, bytes, perrno); + if (!status) { + hw->nvm_wait_opcode = i40e_aqc_opc_nvm_update; + hw->nvmupd_state = I40E_NVMUPD_STATE_WRITE_WAIT; + } + break; + + case I40E_NVMUPD_WRITE_LCB: + status = i40e_nvmupd_nvm_write(hw, cmd, bytes, perrno); + if (status) { + *perrno = hw->aq.asq_last_status ? + i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status) : + -EIO; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + } else { + hw->nvm_release_on_done = true; + hw->nvm_wait_opcode = i40e_aqc_opc_nvm_update; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT_WAIT; + } + break; + + case I40E_NVMUPD_CSUM_CON: + /* Assumes the caller has acquired the nvm */ + status = i40e_update_nvm_checksum(hw); + if (status) { + *perrno = hw->aq.asq_last_status ? + i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status) : + -EIO; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + } else { + hw->nvm_wait_opcode = i40e_aqc_opc_nvm_update; + hw->nvmupd_state = I40E_NVMUPD_STATE_WRITE_WAIT; + } + break; + + case I40E_NVMUPD_CSUM_LCB: + /* Assumes the caller has acquired the nvm */ + status = i40e_update_nvm_checksum(hw); + if (status) { + *perrno = hw->aq.asq_last_status ? + i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status) : + -EIO; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + } else { + hw->nvm_release_on_done = true; + hw->nvm_wait_opcode = i40e_aqc_opc_nvm_update; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT_WAIT; + } + break; + + default: + i40e_debug(hw, I40E_DEBUG_NVM, + "NVMUPD: bad cmd %s in writing state.\n", + i40e_nvm_update_state_str[upd_cmd]); + status = I40E_NOT_SUPPORTED; + *perrno = -ESRCH; + break; + } + + /* In some circumstances, a multi-write transaction takes longer + * than the default 3 minute timeout on the write semaphore. If + * the write failed with an EBUSY status, this is likely the problem, + * so here we try to reacquire the semaphore then retry the write. + * We only do one retry, then give up. 
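+	 * The retry is only attempted once the global timer shows that the
+	 * original semaphore timeout has actually expired.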
+ */ + if (status && (hw->aq.asq_last_status == I40E_AQ_RC_EBUSY) && + !retry_attempt) { + i40e_status old_status = status; + u32 old_asq_status = hw->aq.asq_last_status; + u32 gtime; + + gtime = rd32(hw, I40E_GLVFGEN_TIMER); + if (gtime >= hw->nvm.hw_semaphore_timeout) { + i40e_debug(hw, I40E_DEBUG_ALL, + "NVMUPD: write semaphore expired (%d >= %lld), retrying\n", + gtime, hw->nvm.hw_semaphore_timeout); + i40e_release_nvm(hw); + status = i40e_acquire_nvm(hw, I40E_RESOURCE_WRITE); + if (status) { + i40e_debug(hw, I40E_DEBUG_ALL, + "NVMUPD: write semaphore reacquire failed aq_err = %d\n", + hw->aq.asq_last_status); + status = old_status; + hw->aq.asq_last_status = old_asq_status; + } else { + retry_attempt = true; + goto retry; + } + } + } + + return status; +} + +/** + * i40e_nvmupd_clear_wait_state - clear wait state on hw + * @hw: pointer to the hardware structure + **/ +void i40e_nvmupd_clear_wait_state(struct i40e_hw *hw) +{ + i40e_debug(hw, I40E_DEBUG_NVM, + "NVMUPD: clearing wait on opcode 0x%04x\n", + hw->nvm_wait_opcode); + + if (hw->nvm_release_on_done) { + i40e_release_nvm(hw); + hw->nvm_release_on_done = false; + } + hw->nvm_wait_opcode = 0; + + if (hw->aq.arq_last_status) { + hw->nvmupd_state = I40E_NVMUPD_STATE_ERROR; + return; + } + + switch (hw->nvmupd_state) { + case I40E_NVMUPD_STATE_INIT_WAIT: + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + break; + + case I40E_NVMUPD_STATE_WRITE_WAIT: + hw->nvmupd_state = I40E_NVMUPD_STATE_WRITING; + break; + + default: + break; + } +} + +/** + * i40e_nvmupd_check_wait_event - handle NVM update operation events + * @hw: pointer to the hardware structure + * @opcode: the event that just happened + **/ +void i40e_nvmupd_check_wait_event(struct i40e_hw *hw, u16 opcode, + struct i40e_aq_desc *desc) +{ + u32 aq_desc_len = sizeof(struct i40e_aq_desc); + + if (opcode == hw->nvm_wait_opcode) { + memcpy(&hw->nvm_aq_event_desc, desc, aq_desc_len); + i40e_nvmupd_clear_wait_state(hw); + } +} + +/** + * i40e_nvmupd_validate_command - Validate given command + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @perrno: pointer to return error code + * + * Return one of the valid command types or I40E_NVMUPD_INVALID + **/ +static enum i40e_nvmupd_cmd i40e_nvmupd_validate_command(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + int *perrno) +{ + enum i40e_nvmupd_cmd upd_cmd; + u8 module, transaction; + + /* anything that doesn't match a recognized case is an error */ + upd_cmd = I40E_NVMUPD_INVALID; + + transaction = i40e_nvmupd_get_transaction(cmd->config); + module = i40e_nvmupd_get_module(cmd->config); + + /* limits on data size */ + if ((cmd->data_size < 1) || + (cmd->data_size > I40E_NVMUPD_MAX_DATA)) { + i40e_debug(hw, I40E_DEBUG_NVM, + "i40e_nvmupd_validate_command data_size %d\n", + cmd->data_size); + *perrno = -EFAULT; + return I40E_NVMUPD_INVALID; + } + + switch (cmd->command) { + case I40E_NVM_READ: + switch (transaction) { + case I40E_NVM_CON: + upd_cmd = I40E_NVMUPD_READ_CON; + break; + case I40E_NVM_SNT: + upd_cmd = I40E_NVMUPD_READ_SNT; + break; + case I40E_NVM_LCB: + upd_cmd = I40E_NVMUPD_READ_LCB; + break; + case I40E_NVM_SA: + upd_cmd = I40E_NVMUPD_READ_SA; + break; + case I40E_NVM_EXEC: + if (module == 0xf) + upd_cmd = I40E_NVMUPD_STATUS; + else if (module == 0) + upd_cmd = I40E_NVMUPD_GET_AQ_RESULT; + break; + case I40E_NVM_AQE: + upd_cmd = I40E_NVMUPD_GET_AQ_EVENT; + break; + } + break; + + case I40E_NVM_WRITE: + switch (transaction) { + case I40E_NVM_CON: + upd_cmd = I40E_NVMUPD_WRITE_CON; + 
break; + case I40E_NVM_SNT: + upd_cmd = I40E_NVMUPD_WRITE_SNT; + break; + case I40E_NVM_LCB: + upd_cmd = I40E_NVMUPD_WRITE_LCB; + break; + case I40E_NVM_SA: + upd_cmd = I40E_NVMUPD_WRITE_SA; + break; + case I40E_NVM_ERA: + upd_cmd = I40E_NVMUPD_WRITE_ERA; + break; + case I40E_NVM_CSUM: + upd_cmd = I40E_NVMUPD_CSUM_CON; + break; + case (I40E_NVM_CSUM|I40E_NVM_SA): + upd_cmd = I40E_NVMUPD_CSUM_SA; + break; + case (I40E_NVM_CSUM|I40E_NVM_LCB): + upd_cmd = I40E_NVMUPD_CSUM_LCB; + break; + case I40E_NVM_EXEC: + if (module == 0) + upd_cmd = I40E_NVMUPD_EXEC_AQ; + break; + } + break; + } + + return upd_cmd; +} + +/** + * i40e_nvmupd_exec_aq - Run an AQ command + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * cmd structure contains identifiers and data buffer + **/ +static i40e_status i40e_nvmupd_exec_aq(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + struct i40e_asq_cmd_details cmd_details; + i40e_status status; + struct i40e_aq_desc *aq_desc; + u32 buff_size = 0; + u8 *buff = NULL; + u32 aq_desc_len; + u32 aq_data_len; + + i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__); + if (cmd->offset == 0xffff) + return 0; + + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + aq_desc_len = sizeof(struct i40e_aq_desc); + memset(&hw->nvm_wb_desc, 0, aq_desc_len); + + /* get the aq descriptor */ + if (cmd->data_size < aq_desc_len) { + i40e_debug(hw, I40E_DEBUG_NVM, + "NVMUPD: not enough aq desc bytes for exec, size %d < %d\n", + cmd->data_size, aq_desc_len); + *perrno = -EINVAL; + return I40E_ERR_PARAM; + } + aq_desc = (struct i40e_aq_desc *)bytes; + + /* if data buffer needed, make sure it's ready */ + aq_data_len = cmd->data_size - aq_desc_len; + buff_size = max_t(u32, aq_data_len, le16_to_cpu(aq_desc->datalen)); + if (buff_size) { + if (!hw->nvm_buff.va) { + status = i40e_allocate_virt_mem(hw, &hw->nvm_buff, + hw->aq.asq_buf_size); + if (status) + i40e_debug(hw, I40E_DEBUG_NVM, + "NVMUPD: i40e_allocate_virt_mem for exec buff failed, %d\n", + status); + } + + if (hw->nvm_buff.va) { + buff = hw->nvm_buff.va; + memcpy(buff, &bytes[aq_desc_len], aq_data_len); + } + } + + if (cmd->offset) + memset(&hw->nvm_aq_event_desc, 0, aq_desc_len); + + /* and away we go! */ + status = i40e_asq_send_command(hw, aq_desc, buff, + buff_size, &cmd_details); + if (status) { + i40e_debug(hw, I40E_DEBUG_NVM, + "i40e_nvmupd_exec_aq err %s aq_err %s\n", + i40e_stat_str(hw, status), + i40e_aq_str(hw, hw->aq.asq_last_status)); + *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); + return status; + } + + /* should we wait for a followup event? 
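+	 * For EXEC_AQ the caller passes the AQ opcode of the expected
+	 * completion event in cmd->offset; zero means no event is expected.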
*/ + if (cmd->offset) { + hw->nvm_wait_opcode = cmd->offset; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT_WAIT; + } + + return status; +} + +/** + * i40e_nvmupd_get_aq_result - Get the results from the previous exec_aq + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * cmd structure contains identifiers and data buffer + **/ +static i40e_status i40e_nvmupd_get_aq_result(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + u32 aq_total_len; + u32 aq_desc_len; + int remainder; + u8 *buff; + + i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__); + + aq_desc_len = sizeof(struct i40e_aq_desc); + aq_total_len = aq_desc_len + le16_to_cpu(hw->nvm_wb_desc.datalen); + + /* check offset range */ + if (cmd->offset > aq_total_len) { + i40e_debug(hw, I40E_DEBUG_NVM, "%s: offset too big %d > %d\n", + __func__, cmd->offset, aq_total_len); + *perrno = -EINVAL; + return I40E_ERR_PARAM; + } + + /* check copylength range */ + if (cmd->data_size > (aq_total_len - cmd->offset)) { + int new_len = aq_total_len - cmd->offset; + + i40e_debug(hw, I40E_DEBUG_NVM, "%s: copy length %d too big, trimming to %d\n", + __func__, cmd->data_size, new_len); + cmd->data_size = new_len; + } + + remainder = cmd->data_size; + if (cmd->offset < aq_desc_len) { + u32 len = aq_desc_len - cmd->offset; + + len = min(len, cmd->data_size); + i40e_debug(hw, I40E_DEBUG_NVM, "%s: aq_desc bytes %d to %d\n", + __func__, cmd->offset, cmd->offset + len); + + buff = ((u8 *)&hw->nvm_wb_desc) + cmd->offset; + memcpy(bytes, buff, len); + + bytes += len; + remainder -= len; + buff = hw->nvm_buff.va; + } else { + buff = hw->nvm_buff.va + (cmd->offset - aq_desc_len); + } + + if (remainder > 0) { + int start_byte = buff - (u8 *)hw->nvm_buff.va; + + i40e_debug(hw, I40E_DEBUG_NVM, "%s: databuf bytes %d to %d\n", + __func__, start_byte, start_byte + remainder); + memcpy(bytes, buff, remainder); + } + + return 0; +} + +/** + * i40e_nvmupd_get_aq_event - Get the Admin Queue event from previous exec_aq + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * cmd structure contains identifiers and data buffer + **/ +static i40e_status i40e_nvmupd_get_aq_event(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + u32 aq_total_len; + u32 aq_desc_len; + + i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__); + + aq_desc_len = sizeof(struct i40e_aq_desc); + aq_total_len = aq_desc_len + le16_to_cpu(hw->nvm_aq_event_desc.datalen); + + /* check copylength range */ + if (cmd->data_size > aq_total_len) { + i40e_debug(hw, I40E_DEBUG_NVM, + "%s: copy length %d too big, trimming to %d\n", + __func__, cmd->data_size, aq_total_len); + cmd->data_size = aq_total_len; + } + + memcpy(bytes, &hw->nvm_aq_event_desc, cmd->data_size); + + return 0; +} + +/** + * i40e_nvmupd_nvm_read - Read NVM + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * cmd structure contains identifiers and data buffer + **/ +static i40e_status i40e_nvmupd_nvm_read(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + struct i40e_asq_cmd_details cmd_details; + i40e_status status; + u8 module, transaction; + bool last; + + transaction = 
i40e_nvmupd_get_transaction(cmd->config); + module = i40e_nvmupd_get_module(cmd->config); + last = (transaction == I40E_NVM_LCB) || (transaction == I40E_NVM_SA); + + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + status = i40e_aq_read_nvm(hw, module, cmd->offset, (u16)cmd->data_size, + bytes, last, &cmd_details); + if (status) { + i40e_debug(hw, I40E_DEBUG_NVM, + "i40e_nvmupd_nvm_read mod 0x%x off 0x%x len 0x%x\n", + module, cmd->offset, cmd->data_size); + i40e_debug(hw, I40E_DEBUG_NVM, + "i40e_nvmupd_nvm_read status %d aq %d\n", + status, hw->aq.asq_last_status); + *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); + } + + return status; +} + +/** + * i40e_nvmupd_nvm_erase - Erase an NVM module + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @perrno: pointer to return error code + * + * module, offset, data_size and data are in cmd structure + **/ +static i40e_status i40e_nvmupd_nvm_erase(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + int *perrno) +{ + i40e_status status = 0; + struct i40e_asq_cmd_details cmd_details; + u8 module, transaction; + bool last; + + transaction = i40e_nvmupd_get_transaction(cmd->config); + module = i40e_nvmupd_get_module(cmd->config); + last = (transaction & I40E_NVM_LCB); + + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + status = i40e_aq_erase_nvm(hw, module, cmd->offset, (u16)cmd->data_size, + last, &cmd_details); + if (status) { + i40e_debug(hw, I40E_DEBUG_NVM, + "i40e_nvmupd_nvm_erase mod 0x%x off 0x%x len 0x%x\n", + module, cmd->offset, cmd->data_size); + i40e_debug(hw, I40E_DEBUG_NVM, + "i40e_nvmupd_nvm_erase status %d aq %d\n", + status, hw->aq.asq_last_status); + *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); + } + + return status; +} + +/** + * i40e_nvmupd_nvm_write - Write NVM + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * module, offset, data_size and data are in cmd structure + **/ +static i40e_status i40e_nvmupd_nvm_write(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + i40e_status status = 0; + struct i40e_asq_cmd_details cmd_details; + u8 module, transaction; + u8 preservation_flags; + bool last; + + transaction = i40e_nvmupd_get_transaction(cmd->config); + module = i40e_nvmupd_get_module(cmd->config); + last = (transaction & I40E_NVM_LCB); + preservation_flags = i40e_nvmupd_get_preservation_flags(cmd->config); + + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + status = i40e_aq_update_nvm(hw, module, cmd->offset, + (u16)cmd->data_size, bytes, last, + preservation_flags, &cmd_details); + if (status) { + i40e_debug(hw, I40E_DEBUG_NVM, + "i40e_nvmupd_nvm_write mod 0x%x off 0x%x len 0x%x\n", + module, cmd->offset, cmd->data_size); + i40e_debug(hw, I40E_DEBUG_NVM, + "i40e_nvmupd_nvm_write status %d aq %d\n", + status, hw->aq.asq_last_status); + *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); + } + + return status; +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_osdep.h b/drivers/net/ethernet/intel/i40e/i40e_osdep.h new file mode 100644 index 0000000..9c3c3b0 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_osdep.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * 
Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_OSDEP_H_ +#define _I40E_OSDEP_H_ + +#include +#include +#include +#include +#include +#include + +/* get readq/writeq support for 32 bit kernels, use the low-first version */ +#include + +/* File to be the magic between shared code and + * actual OS primitives + */ + +#define hw_dbg(hw, S, A...) do {} while (0) + +#define wr32(a, reg, value) writel((value), ((a)->hw_addr + (reg))) +#define rd32(a, reg) readl((a)->hw_addr + (reg)) + +#define wr64(a, reg, value) writeq((value), ((a)->hw_addr + (reg))) +#define rd64(a, reg) readq((a)->hw_addr + (reg)) +#define i40e_flush(a) readl((a)->hw_addr + I40E_GLGEN_STAT) + +/* memory allocation tracking */ +struct i40e_dma_mem { + void *va; + dma_addr_t pa; + u32 size; +}; + +#define i40e_allocate_dma_mem(h, m, unused, s, a) \ + i40e_allocate_dma_mem_d(h, m, s, a) +#define i40e_free_dma_mem(h, m) i40e_free_dma_mem_d(h, m) + +struct i40e_virt_mem { + void *va; + u32 size; +}; + +#define i40e_allocate_virt_mem(h, m, s) i40e_allocate_virt_mem_d(h, m, s) +#define i40e_free_virt_mem(h, m) i40e_free_virt_mem_d(h, m) + +#define i40e_debug(h, m, s, ...) \ +do { \ + if (((m) & (h)->debug_mask)) \ + pr_info("i40e %02x:%02x.%x " s, \ + (h)->bus.bus_id, (h)->bus.device, \ + (h)->bus.func, ##__VA_ARGS__); \ +} while (0) + +typedef enum i40e_status_code i40e_status; +#endif /* _I40E_OSDEP_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h new file mode 100644 index 0000000..2ec2418 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -0,0 +1,460 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_PROTOTYPE_H_ +#define _I40E_PROTOTYPE_H_ + +#include "i40e_type.h" +#include "i40e_alloc.h" +#include + +/* Prototypes for shared code functions that are not in + * the standard function pointer structures. These are + * mostly because they are needed even before the init + * has happened and will assist in the early SW and FW + * setup. + */ + +/* adminq functions */ +i40e_status i40e_init_adminq(struct i40e_hw *hw); +i40e_status i40e_shutdown_adminq(struct i40e_hw *hw); +void i40e_adminq_init_ring_data(struct i40e_hw *hw); +i40e_status i40e_clean_arq_element(struct i40e_hw *hw, + struct i40e_arq_event_info *e, + u16 *events_pending); +i40e_status i40e_asq_send_command(struct i40e_hw *hw, + struct i40e_aq_desc *desc, + void *buff, /* can be NULL */ + u16 buff_size, + struct i40e_asq_cmd_details *cmd_details); + +/* debug function for adminq */ +void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask, + void *desc, void *buffer, u16 buf_len); + +void i40e_idle_aq(struct i40e_hw *hw); +bool i40e_check_asq_alive(struct i40e_hw *hw); +i40e_status i40e_aq_queue_shutdown(struct i40e_hw *hw, bool unloading); +const char *i40e_aq_str(struct i40e_hw *hw, enum i40e_admin_queue_err aq_err); +const char *i40e_stat_str(struct i40e_hw *hw, i40e_status stat_err); + +i40e_status i40e_aq_get_rss_lut(struct i40e_hw *hw, u16 seid, + bool pf_lut, u8 *lut, u16 lut_size); +i40e_status i40e_aq_set_rss_lut(struct i40e_hw *hw, u16 seid, + bool pf_lut, u8 *lut, u16 lut_size); +i40e_status i40e_aq_get_rss_key(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_get_set_rss_key_data *key); +i40e_status i40e_aq_set_rss_key(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_get_set_rss_key_data *key); + +u32 i40e_led_get(struct i40e_hw *hw); +void i40e_led_set(struct i40e_hw *hw, u32 mode, bool blink); +i40e_status i40e_led_set_phy(struct i40e_hw *hw, bool on, + u16 led_addr, u32 mode); +i40e_status i40e_led_get_phy(struct i40e_hw *hw, u16 *led_addr, + u16 *val); +i40e_status i40e_blink_phy_link_led(struct i40e_hw *hw, + u32 time, u32 interval); + +/* admin send queue commands */ + +i40e_status i40e_aq_get_firmware_version(struct i40e_hw *hw, + u16 *fw_major_version, u16 *fw_minor_version, + u32 *fw_build, + u16 *api_major_version, u16 *api_minor_version, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_debug_write_register(struct i40e_hw *hw, + u32 reg_addr, u64 reg_val, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_debug_read_register(struct i40e_hw *hw, + u32 reg_addr, u64 *reg_val, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_set_phy_debug(struct i40e_hw *hw, u8 cmd_flags, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_set_default_vsi(struct i40e_hw *hw, u16 vsi_id, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_clear_default_vsi(struct i40e_hw *hw, u16 vsi_id, + struct i40e_asq_cmd_details *cmd_details); +enum i40e_status_code i40e_aq_get_phy_capabilities(struct i40e_hw *hw, + bool qualified_modules, bool report_init, + struct i40e_aq_get_phy_abilities_resp *abilities, + struct i40e_asq_cmd_details *cmd_details); +enum i40e_status_code i40e_aq_set_phy_config(struct i40e_hw *hw, + struct i40e_aq_set_phy_config *config, + struct i40e_asq_cmd_details *cmd_details); +enum 
i40e_status_code i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures, + bool atomic_reset); +i40e_status i40e_aq_set_phy_int_mask(struct i40e_hw *hw, u16 mask, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_clear_pxe_mode(struct i40e_hw *hw, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_set_link_restart_an(struct i40e_hw *hw, + bool enable_link, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_get_link_info(struct i40e_hw *hw, + bool enable_lse, struct i40e_link_status *link, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_set_local_advt_reg(struct i40e_hw *hw, + u64 advt_reg, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_send_driver_version(struct i40e_hw *hw, + struct i40e_driver_version *dv, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_add_vsi(struct i40e_hw *hw, + struct i40e_vsi_context *vsi_ctx, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_set_vsi_broadcast(struct i40e_hw *hw, + u16 vsi_id, bool set_filter, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw, + u16 vsi_id, bool set, struct i40e_asq_cmd_details *cmd_details, + bool rx_only_promisc); +i40e_status i40e_aq_set_vsi_multicast_promiscuous(struct i40e_hw *hw, + u16 vsi_id, bool set, struct i40e_asq_cmd_details *cmd_details); +enum i40e_status_code i40e_aq_set_vsi_mc_promisc_on_vlan(struct i40e_hw *hw, + u16 seid, bool enable, + u16 vid, + struct i40e_asq_cmd_details *cmd_details); +enum i40e_status_code i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw, + u16 seid, bool enable, + u16 vid, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_set_vsi_bc_promisc_on_vlan(struct i40e_hw *hw, + u16 seid, bool enable, u16 vid, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_set_vsi_vlan_promisc(struct i40e_hw *hw, + u16 seid, bool enable, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_get_vsi_params(struct i40e_hw *hw, + struct i40e_vsi_context *vsi_ctx, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_update_vsi_params(struct i40e_hw *hw, + struct i40e_vsi_context *vsi_ctx, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_add_veb(struct i40e_hw *hw, u16 uplink_seid, + u16 downlink_seid, u8 enabled_tc, + bool default_port, u16 *pveb_seid, + bool enable_stats, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_get_veb_parameters(struct i40e_hw *hw, + u16 veb_seid, u16 *switch_id, bool *floating, + u16 *statistic_index, u16 *vebs_used, + u16 *vebs_free, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_add_macvlan(struct i40e_hw *hw, u16 vsi_id, + struct i40e_aqc_add_macvlan_element_data *mv_list, + u16 count, struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 vsi_id, + struct i40e_aqc_remove_macvlan_element_data *mv_list, + u16 count, struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_add_mirrorrule(struct i40e_hw *hw, u16 sw_seid, + u16 rule_type, u16 dest_vsi, u16 count, __le16 *mr_list, + struct i40e_asq_cmd_details *cmd_details, + u16 *rule_id, u16 *rules_used, u16 *rules_free); +i40e_status i40e_aq_delete_mirrorrule(struct i40e_hw *hw, u16 sw_seid, + u16 rule_type, u16 rule_id, u16 count, __le16 *mr_list, + struct i40e_asq_cmd_details *cmd_details, + u16 *rules_used, u16 *rules_free); + +i40e_status i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid, + u32 
v_opcode, u32 v_retval, u8 *msg, u16 msglen, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_get_switch_config(struct i40e_hw *hw, + struct i40e_aqc_get_switch_config_resp *buf, + u16 buf_size, u16 *start_seid, + struct i40e_asq_cmd_details *cmd_details); +enum i40e_status_code i40e_aq_set_switch_config(struct i40e_hw *hw, + u16 flags, + u16 valid_flags, u8 mode, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_request_resource(struct i40e_hw *hw, + enum i40e_aq_resources_ids resource, + enum i40e_aq_resource_access_type access, + u8 sdp_number, u64 *timeout, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_release_resource(struct i40e_hw *hw, + enum i40e_aq_resources_ids resource, + u8 sdp_number, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_read_nvm(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 length, void *data, + bool last_command, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_erase_nvm(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 length, bool last_command, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_discover_capabilities(struct i40e_hw *hw, + void *buff, u16 buff_size, u16 *data_size, + enum i40e_admin_queue_opc list_type_opc, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 length, void *data, + bool last_command, u8 preservation_flags, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type, + u8 mib_type, void *buff, u16 buff_size, + u16 *local_len, u16 *remote_len, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw, + bool enable_update, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_set_dcb_parameters(struct i40e_hw *hw, + bool dcb_enable, + struct i40e_asq_cmd_details + *cmd_details); +i40e_status i40e_aq_start_lldp(struct i40e_hw *hw, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_get_cee_dcb_config(struct i40e_hw *hw, + void *buff, u16 buff_size, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_add_udp_tunnel(struct i40e_hw *hw, + u16 udp_port, u8 protocol_index, + u8 *filter_index, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_del_udp_tunnel(struct i40e_hw *hw, u8 index, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_delete_element(struct i40e_hw *hw, u16 seid, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_mac_address_write(struct i40e_hw *hw, + u16 flags, u8 *mac_addr, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_config_vsi_bw_limit(struct i40e_hw *hw, + u16 seid, u16 credit, u8 max_credit, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_dcb_updated(struct i40e_hw *hw, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_config_switch_comp_bw_limit(struct i40e_hw *hw, + u16 seid, u16 credit, u8 max_bw, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_config_vsi_tc_bw(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_configure_vsi_tc_bw_data *bw_data, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_config_switch_comp_ets(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_configure_switching_comp_ets_data *ets_data, + enum 
i40e_admin_queue_opc opcode, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_config_switch_comp_bw_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_configure_switching_comp_bw_config_data *bw_data, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_query_vsi_bw_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_query_vsi_bw_config_resp *bw_data, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_query_vsi_ets_sla_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_query_vsi_ets_sla_config_resp *bw_data, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_query_switch_comp_ets_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_query_switching_comp_ets_config_resp *bw_data, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_query_port_ets_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_query_port_ets_config_resp *bw_data, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_query_switch_comp_bw_config(struct i40e_hw *hw, + u16 seid, + struct i40e_aqc_query_switching_comp_bw_config_resp *bw_data, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_resume_port_tx(struct i40e_hw *hw, + struct i40e_asq_cmd_details *cmd_details); +enum i40e_status_code +i40e_aq_add_cloud_filters_bb(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_cloud_filters_element_bb *filters, + u8 filter_count); +enum i40e_status_code +i40e_aq_add_cloud_filters(struct i40e_hw *hw, u16 vsi, + struct i40e_aqc_cloud_filters_element_data *filters, + u8 filter_count); +enum i40e_status_code +i40e_aq_rem_cloud_filters(struct i40e_hw *hw, u16 vsi, + struct i40e_aqc_cloud_filters_element_data *filters, + u8 filter_count); +enum i40e_status_code +i40e_aq_rem_cloud_filters_bb(struct i40e_hw *hw, u16 seid, + struct i40e_aqc_cloud_filters_element_bb *filters, + u8 filter_count); +i40e_status i40e_read_lldp_cfg(struct i40e_hw *hw, + struct i40e_lldp_variables *lldp_cfg); +/* i40e_common */ +i40e_status i40e_init_shared_code(struct i40e_hw *hw); +i40e_status i40e_pf_reset(struct i40e_hw *hw); +void i40e_clear_hw(struct i40e_hw *hw); +void i40e_clear_pxe_mode(struct i40e_hw *hw); +i40e_status i40e_get_link_status(struct i40e_hw *hw, bool *link_up); +i40e_status i40e_update_link_info(struct i40e_hw *hw); +i40e_status i40e_get_mac_addr(struct i40e_hw *hw, u8 *mac_addr); +i40e_status i40e_read_bw_from_alt_ram(struct i40e_hw *hw, + u32 *max_bw, u32 *min_bw, bool *min_valid, + bool *max_valid); +i40e_status i40e_aq_configure_partition_bw(struct i40e_hw *hw, + struct i40e_aqc_configure_partition_bw_data *bw_data, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_get_port_mac_addr(struct i40e_hw *hw, u8 *mac_addr); +i40e_status i40e_read_pba_string(struct i40e_hw *hw, u8 *pba_num, + u32 pba_num_size); +i40e_status i40e_validate_mac_addr(u8 *mac_addr); +void i40e_pre_tx_queue_cfg(struct i40e_hw *hw, u32 queue, bool enable); +/* prototype for functions used for NVM access */ +i40e_status i40e_init_nvm(struct i40e_hw *hw); +i40e_status i40e_acquire_nvm(struct i40e_hw *hw, + enum i40e_aq_resource_access_type access); +void i40e_release_nvm(struct i40e_hw *hw); +i40e_status i40e_read_nvm_word(struct i40e_hw *hw, u16 offset, + u16 *data); +i40e_status i40e_update_nvm_checksum(struct i40e_hw *hw); +i40e_status i40e_validate_nvm_checksum(struct i40e_hw *hw, + u16 *checksum); +i40e_status i40e_nvmupd_command(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *); +void 
i40e_nvmupd_check_wait_event(struct i40e_hw *hw, u16 opcode, + struct i40e_aq_desc *desc); +void i40e_nvmupd_clear_wait_state(struct i40e_hw *hw); +void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status); + +extern struct i40e_rx_ptype_decoded i40e_ptype_lookup[]; + +static inline struct i40e_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype) +{ + return i40e_ptype_lookup[ptype]; +} + +/** + * i40e_virtchnl_link_speed - Convert AdminQ link_speed to virtchnl definition + * @link_speed: the speed to convert + * + * Returns the link_speed in terms of the virtchnl interface, for use in + * converting link_speed as reported by the AdminQ into the format used for + * talking to virtchnl devices. If we can't represent the link speed properly, + * report LINK_SPEED_UNKNOWN. + **/ +static inline enum virtchnl_link_speed +i40e_virtchnl_link_speed(enum i40e_aq_link_speed link_speed) +{ + switch (link_speed) { + case I40E_LINK_SPEED_100MB: + return VIRTCHNL_LINK_SPEED_100MB; + case I40E_LINK_SPEED_1GB: + return VIRTCHNL_LINK_SPEED_1GB; + case I40E_LINK_SPEED_10GB: + return VIRTCHNL_LINK_SPEED_10GB; + case I40E_LINK_SPEED_40GB: + return VIRTCHNL_LINK_SPEED_40GB; + case I40E_LINK_SPEED_20GB: + return VIRTCHNL_LINK_SPEED_20GB; + case I40E_LINK_SPEED_25GB: + return VIRTCHNL_LINK_SPEED_25GB; + case I40E_LINK_SPEED_UNKNOWN: + default: + return VIRTCHNL_LINK_SPEED_UNKNOWN; + } +} + +/* prototype for functions used for SW locks */ + +/* i40e_common for VF drivers*/ +void i40e_vf_parse_hw_config(struct i40e_hw *hw, + struct virtchnl_vf_resource *msg); +i40e_status i40e_vf_reset(struct i40e_hw *hw); +i40e_status i40e_aq_send_msg_to_pf(struct i40e_hw *hw, + enum virtchnl_ops v_opcode, + i40e_status v_retval, + u8 *msg, u16 msglen, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_set_filter_control(struct i40e_hw *hw, + struct i40e_filter_control_settings *settings); +i40e_status i40e_aq_add_rem_control_packet_filter(struct i40e_hw *hw, + u8 *mac_addr, u16 ethtype, u16 flags, + u16 vsi_seid, u16 queue, bool is_add, + struct i40e_control_filter_stats *stats, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_debug_dump(struct i40e_hw *hw, u8 cluster_id, + u8 table_id, u32 start_index, u16 buff_size, + void *buff, u16 *ret_buff_size, + u8 *ret_next_table, u32 *ret_next_index, + struct i40e_asq_cmd_details *cmd_details); +void i40e_add_filter_to_drop_tx_flow_control_frames(struct i40e_hw *hw, + u16 vsi_seid); +i40e_status i40e_aq_rx_ctl_read_register(struct i40e_hw *hw, + u32 reg_addr, u32 *reg_val, + struct i40e_asq_cmd_details *cmd_details); +u32 i40e_read_rx_ctl(struct i40e_hw *hw, u32 reg_addr); +i40e_status i40e_aq_rx_ctl_write_register(struct i40e_hw *hw, + u32 reg_addr, u32 reg_val, + struct i40e_asq_cmd_details *cmd_details); +void i40e_write_rx_ctl(struct i40e_hw *hw, u32 reg_addr, u32 reg_val); +i40e_status i40e_aq_set_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 reg_val, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_get_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 *reg_val, + struct i40e_asq_cmd_details *cmd_details); + +i40e_status i40e_read_phy_register_clause22(struct i40e_hw *hw, + u16 reg, u8 phy_addr, u16 *value); +i40e_status i40e_write_phy_register_clause22(struct i40e_hw *hw, + u16 reg, u8 phy_addr, u16 value); +i40e_status i40e_read_phy_register_clause45(struct i40e_hw *hw, + u8 page, u16 reg, u8 phy_addr, u16 *value); +i40e_status 
i40e_write_phy_register_clause45(struct i40e_hw *hw, + u8 page, u16 reg, u8 phy_addr, u16 value); +i40e_status i40e_read_phy_register(struct i40e_hw *hw, u8 page, u16 reg, + u8 phy_addr, u16 *value); +i40e_status i40e_write_phy_register(struct i40e_hw *hw, u8 page, u16 reg, + u8 phy_addr, u16 value); +u8 i40e_get_phy_address(struct i40e_hw *hw, u8 dev_num); +i40e_status i40e_blink_phy_link_led(struct i40e_hw *hw, + u32 time, u32 interval); +i40e_status i40e_aq_write_ddp(struct i40e_hw *hw, void *buff, + u16 buff_size, u32 track_id, + u32 *error_offset, u32 *error_info, + struct i40e_asq_cmd_details * + cmd_details); +i40e_status i40e_aq_get_ddp_list(struct i40e_hw *hw, void *buff, + u16 buff_size, u8 flags, + struct i40e_asq_cmd_details * + cmd_details); +struct i40e_generic_seg_header * +i40e_find_segment_in_package(u32 segment_type, + struct i40e_package_header *pkg_header); +enum i40e_status_code +i40e_write_profile(struct i40e_hw *hw, struct i40e_profile_segment *i40e_seg, + u32 track_id); +enum i40e_status_code +i40e_add_pinfo_to_list(struct i40e_hw *hw, + struct i40e_profile_segment *profile, + u8 *profile_info_sec, u32 track_id); +#endif /* _I40E_PROTOTYPE_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c new file mode 100644 index 0000000..5b47dd1 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -0,0 +1,819 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include "i40e.h" +#include + +/* The XL710 timesync is very much like Intel's 82599 design when it comes to + * the fundamental clock design. However, the clock operations are much simpler + * in the XL710 because the device supports a full 64 bits of nanoseconds. + * Because the field is so wide, we can forgo the cycle counter and just + * operate with the nanosecond field directly without fear of overflow. + * + * Much like the 82599, the update period is dependent upon the link speed: + * At 40Gb link or no link, the period is 1.6ns. + * At 10Gb link, the period is multiplied by 2. (3.2ns) + * At 1Gb link, the period is multiplied by 20. (32ns) + * 1588 functionality is not supported at 100Mbps. 
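+ *
+ * The increment constants below appear to encode this period as nanoseconds
+ * per tick in 32.32 fixed point; for example, the 40Gb value 0x0199999999
+ * divided by 2^32 works out to roughly 1.6 ns.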
+ */
+#define I40E_PTP_40GB_INCVAL 0x0199999999ULL
+#define I40E_PTP_10GB_INCVAL 0x0333333333ULL
+#define I40E_PTP_1GB_INCVAL 0x2000000000ULL
+
+#define I40E_PRTTSYN_CTL1_TSYNTYPE_V1 BIT(I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT)
+#define I40E_PRTTSYN_CTL1_TSYNTYPE_V2 (2 << \
+				       I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT)
+
+/**
+ * i40e_ptp_read - Read the PHC time from the device
+ * @pf: Board private structure
+ * @ts: timespec structure to hold the current time value
+ *
+ * This function reads the PRTTSYN_TIME registers and stores them in a
+ * timespec. However, since the registers are 64 bits of nanoseconds, we must
+ * convert the result to a timespec before we can return.
+ **/
+static void i40e_ptp_read(struct i40e_pf *pf, struct timespec64 *ts)
+{
+	struct i40e_hw *hw = &pf->hw;
+	u32 hi, lo;
+	u64 ns;
+
+	/* The timer latches on the lowest register read. */
+	lo = rd32(hw, I40E_PRTTSYN_TIME_L);
+	hi = rd32(hw, I40E_PRTTSYN_TIME_H);
+
+	ns = (((u64)hi) << 32) | lo;
+
+	*ts = ns_to_timespec64(ns);
+}
+
+/**
+ * i40e_ptp_write - Write the PHC time to the device
+ * @pf: Board private structure
+ * @ts: timespec structure that holds the new time value
+ *
+ * This function writes the PRTTSYN_TIME registers with the user value. Since
+ * we receive a timespec from the stack, we must convert that timespec into
+ * nanoseconds before programming the registers.
+ **/
+static void i40e_ptp_write(struct i40e_pf *pf, const struct timespec64 *ts)
+{
+	struct i40e_hw *hw = &pf->hw;
+	u64 ns = timespec64_to_ns(ts);
+
+	/* The timer will not update until the high register is written, so
+	 * write the low register first.
+	 */
+	wr32(hw, I40E_PRTTSYN_TIME_L, ns & 0xFFFFFFFF);
+	wr32(hw, I40E_PRTTSYN_TIME_H, ns >> 32);
+}
+
+/**
+ * i40e_ptp_convert_to_hwtstamp - Convert device clock to system time
+ * @hwtstamps: Timestamp structure to update
+ * @timestamp: Timestamp from the hardware
+ *
+ * We need to convert the NIC clock value into a hwtstamp which can be used by
+ * the upper level timestamping functions. Since the timestamp is simply a 64-
+ * bit nanosecond value, we can call ns_to_ktime directly to handle this.
+ **/
+static void i40e_ptp_convert_to_hwtstamp(struct skb_shared_hwtstamps *hwtstamps,
+					 u64 timestamp)
+{
+	memset(hwtstamps, 0, sizeof(*hwtstamps));
+
+	hwtstamps->hwtstamp = ns_to_ktime(timestamp);
+}
+
+/**
+ * i40e_ptp_adjfreq - Adjust the PHC frequency
+ * @ptp: The PTP clock structure
+ * @ppb: Parts per billion adjustment from the base
+ *
+ * Adjust the frequency of the PHC by the indicated parts per billion from the
+ * base frequency.
+ **/
+static int i40e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
+{
+	struct i40e_pf *pf = container_of(ptp, struct i40e_pf, ptp_caps);
+	struct i40e_hw *hw = &pf->hw;
+	u64 adj, freq, diff;
+	int neg_adj = 0;
+
+	if (ppb < 0) {
+		neg_adj = 1;
+		ppb = -ppb;
+	}
+
+	smp_mb(); /* Force any pending update before accessing. */
+	adj = READ_ONCE(pf->ptp_base_adj);
+
+	freq = adj;
+	freq *= ppb;
+	diff = div_u64(freq, 1000000000ULL);
+
+	if (neg_adj)
+		adj -= diff;
+	else
+		adj += diff;
+
+	wr32(hw, I40E_PRTTSYN_INC_L, adj & 0xFFFFFFFF);
+	wr32(hw, I40E_PRTTSYN_INC_H, adj >> 32);
+
+	return 0;
+}
+
+/**
+ * i40e_ptp_adjtime - Adjust the PHC time
+ * @ptp: The PTP clock structure
+ * @delta: Offset in nanoseconds to adjust the PHC time by
+ *
+ * Adjust the current time of the PHC by the indicated nanosecond delta.
+ **/ +static int i40e_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + struct i40e_pf *pf = container_of(ptp, struct i40e_pf, ptp_caps); + struct timespec64 now; + + mutex_lock(&pf->tmreg_lock); + + i40e_ptp_read(pf, &now); + timespec64_add_ns(&now, delta); + i40e_ptp_write(pf, (const struct timespec64 *)&now); + + mutex_unlock(&pf->tmreg_lock); + + return 0; +} + +/** + * i40e_ptp_gettime - Get the time of the PHC + * @ptp: The PTP clock structure + * @ts: timespec structure to hold the current time value + * + * Read the device clock and return the correct value on ns, after converting it + * into a timespec struct. + **/ +static int i40e_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) +{ + struct i40e_pf *pf = container_of(ptp, struct i40e_pf, ptp_caps); + + mutex_lock(&pf->tmreg_lock); + i40e_ptp_read(pf, ts); + mutex_unlock(&pf->tmreg_lock); + + return 0; +} + +/** + * i40e_ptp_settime - Set the time of the PHC + * @ptp: The PTP clock structure + * @ts: timespec structure that holds the new time value + * + * Set the device clock to the user input value. The conversion from timespec + * to ns happens in the write function. + **/ +static int i40e_ptp_settime(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + struct i40e_pf *pf = container_of(ptp, struct i40e_pf, ptp_caps); + + mutex_lock(&pf->tmreg_lock); + i40e_ptp_write(pf, ts); + mutex_unlock(&pf->tmreg_lock); + + return 0; +} + +/** + * i40e_ptp_feature_enable - Enable/disable ancillary features of the PHC subsystem + * @ptp: The PTP clock structure + * @rq: The requested feature to change + * @on: Enable/disable flag + * + * The XL710 does not support any of the ancillary features of the PHC + * subsystem, so this function may just return. + **/ +static int i40e_ptp_feature_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, int on) +{ + return -EOPNOTSUPP; +} + +/** + * i40e_ptp_update_latch_events - Read I40E_PRTTSYN_STAT_1 and latch events + * @pf: the PF data structure + * + * This function reads I40E_PRTTSYN_STAT_1 and updates the corresponding timers + * for noticed latch events. This allows the driver to keep track of the first + * time a latch event was noticed which will be used to help clear out Rx + * timestamps for packets that got dropped or lost. + * + * This function will return the current value of I40E_PRTTSYN_STAT_1 and is + * expected to be called only while under the ptp_rx_lock. + **/ +static u32 i40e_ptp_get_rx_events(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + u32 prttsyn_stat, new_latch_events; + int i; + + prttsyn_stat = rd32(hw, I40E_PRTTSYN_STAT_1); + new_latch_events = prttsyn_stat & ~pf->latch_event_flags; + + /* Update the jiffies time for any newly latched timestamp. This + * ensures that we store the time that we first discovered a timestamp + * was latched by the hardware. The service task will later determine + * if we should free the latch and drop that timestamp should too much + * time pass. This flow ensures that we only update jiffies for new + * events latched since the last time we checked, and not all events + * currently latched, so that the service task accounting remains + * accurate. 
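+	 * Latches that stay set for more than a second are cleared by the
+	 * service task in i40e_ptp_rx_hang().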
+	 */
+	for (i = 0; i < 4; i++) {
+		if (new_latch_events & BIT(i))
+			pf->latch_events[i] = jiffies;
+	}
+
+	/* Finally, we store the current status of the Rx timestamp latches */
+	pf->latch_event_flags = prttsyn_stat;
+
+	return prttsyn_stat;
+}
+
+/**
+ * i40e_ptp_rx_hang - Detect error case when Rx timestamp registers are hung
+ * @pf: The PF private data structure
+ *
+ * This watchdog task is scheduled to detect the error case where hardware has
+ * dropped an Rx packet that was timestamped when the ring is full. The
+ * particular error is rare, but it leaves the device unable to timestamp any
+ * future packets.
+ **/
+void i40e_ptp_rx_hang(struct i40e_pf *pf)
+{
+	struct i40e_hw *hw = &pf->hw;
+	unsigned int i, cleared = 0;
+
+	/* Since we cannot turn off the Rx timestamp logic if the device is
+	 * configured for Tx timestamping, we check if Rx timestamping is
+	 * configured. We don't want to spuriously warn about Rx timestamp
+	 * hangs if we don't care about the timestamps.
+	 */
+	if (!(pf->flags & I40E_FLAG_PTP) || !pf->ptp_rx)
+		return;
+
+	spin_lock_bh(&pf->ptp_rx_lock);
+
+	/* Update current latch times for Rx events */
+	i40e_ptp_get_rx_events(pf);
+
+	/* Check all the currently latched Rx events and see whether they have
+	 * been latched for over a second. It is assumed that any timestamp
+	 * should have been cleared within this time, or else it was captured
+	 * for a dropped frame that the driver never received. Thus, we will
+	 * clear any timestamp that has been latched for over 1 second.
+	 */
+	for (i = 0; i < 4; i++) {
+		if ((pf->latch_event_flags & BIT(i)) &&
+		    time_is_before_jiffies(pf->latch_events[i] + HZ)) {
+			rd32(hw, I40E_PRTTSYN_RXTIME_H(i));
+			pf->latch_event_flags &= ~BIT(i);
+			cleared++;
+		}
+	}
+
+	spin_unlock_bh(&pf->ptp_rx_lock);
+
+	/* Log a warning if more than 2 timestamps got dropped in the same
+	 * check. We don't want to warn about all drops because it can occur
+	 * in normal scenarios such as PTP frames on multicast addresses we
+	 * aren't listening to. However, the administrator should know if this
+	 * is the reason packets aren't receiving timestamps.
+	 */
+	if (cleared > 2)
+		dev_dbg(&pf->pdev->dev,
+			"Dropped %d missed RXTIME timestamp events\n",
+			cleared);
+
+	/* Finally, update the rx_hwtstamp_cleared counter */
+	pf->rx_hwtstamp_cleared += cleared;
+}
+
+/**
+ * i40e_ptp_tx_hang - Detect error case when Tx timestamp register is hung
+ * @pf: The PF private data structure
+ *
+ * This watchdog task is run periodically to make sure that we clear the Tx
+ * timestamp logic if we don't obtain a timestamp in a reasonable amount of
+ * time. It is not expected in the normal case, but if it occurs it permanently
+ * prevents timestamps of future packets from being taken.
+ **/
+void i40e_ptp_tx_hang(struct i40e_pf *pf)
+{
+	if (!(pf->flags & I40E_FLAG_PTP) || !pf->ptp_tx)
+		return;
+
+	/* Nothing to do if we're not already waiting for a timestamp */
+	if (!test_bit(__I40E_PTP_TX_IN_PROGRESS, pf->state))
+		return;
+
+	/* We already have a handler routine which is run when we are notified
+	 * of a Tx timestamp in the hardware. If we don't get an interrupt
+	 * within a second it is reasonable to assume that we never will.
+ */ + if (time_is_before_jiffies(pf->ptp_tx_start + HZ)) { + dev_kfree_skb_any(pf->ptp_tx_skb); + pf->ptp_tx_skb = NULL; + clear_bit_unlock(__I40E_PTP_TX_IN_PROGRESS, pf->state); + pf->tx_hwtstamp_timeouts++; + } +} + +/** + * i40e_ptp_tx_hwtstamp - Utility function which returns the Tx timestamp + * @pf: Board private structure + * + * Read the value of the Tx timestamp from the registers, convert it into a + * value consumable by the stack, and store that result into the shhwtstamps + * struct before returning it up the stack. + **/ +void i40e_ptp_tx_hwtstamp(struct i40e_pf *pf) +{ + struct skb_shared_hwtstamps shhwtstamps; + struct sk_buff *skb = pf->ptp_tx_skb; + struct i40e_hw *hw = &pf->hw; + u32 hi, lo; + u64 ns; + + if (!(pf->flags & I40E_FLAG_PTP) || !pf->ptp_tx) + return; + + /* don't attempt to timestamp if we don't have an skb */ + if (!pf->ptp_tx_skb) + return; + + lo = rd32(hw, I40E_PRTTSYN_TXTIME_L); + hi = rd32(hw, I40E_PRTTSYN_TXTIME_H); + + ns = (((u64)hi) << 32) | lo; + i40e_ptp_convert_to_hwtstamp(&shhwtstamps, ns); + + /* Clear the bit lock as soon as possible after reading the register, + * and prior to notifying the stack via skb_tstamp_tx(). Otherwise + * applications might wake up and attempt to request another transmit + * timestamp prior to the bit lock being cleared. + */ + pf->ptp_tx_skb = NULL; + clear_bit_unlock(__I40E_PTP_TX_IN_PROGRESS, pf->state); + + /* Notify the stack and free the skb after we've unlocked */ + skb_tstamp_tx(skb, &shhwtstamps); + dev_kfree_skb_any(skb); +} + +/** + * i40e_ptp_rx_hwtstamp - Utility function which checks for an Rx timestamp + * @pf: Board private structure + * @skb: Particular skb to send timestamp with + * @index: Index into the receive timestamp registers for the timestamp + * + * The XL710 receives a notification in the receive descriptor with an offset + * into the set of RXTIME registers where the timestamp is for that skb. This + * function goes and fetches the receive timestamp from that offset, if a valid + * one exists. The RXTIME registers are in ns, so we must convert the result + * first. + **/ +void i40e_ptp_rx_hwtstamp(struct i40e_pf *pf, struct sk_buff *skb, u8 index) +{ + u32 prttsyn_stat, hi, lo; + struct i40e_hw *hw; + u64 ns; + + /* Since we cannot turn off the Rx timestamp logic if the device is + * doing Tx timestamping, check if Rx timestamping is configured. + */ + if (!(pf->flags & I40E_FLAG_PTP) || !pf->ptp_rx) + return; + + hw = &pf->hw; + + spin_lock_bh(&pf->ptp_rx_lock); + + /* Get current Rx events and update latch times */ + prttsyn_stat = i40e_ptp_get_rx_events(pf); + + /* TODO: Should we warn about missing Rx timestamp event? */ + if (!(prttsyn_stat & BIT(index))) { + spin_unlock_bh(&pf->ptp_rx_lock); + return; + } + + /* Clear the latched event since we're about to read its register */ + pf->latch_event_flags &= ~BIT(index); + + lo = rd32(hw, I40E_PRTTSYN_RXTIME_L(index)); + hi = rd32(hw, I40E_PRTTSYN_RXTIME_H(index)); + + spin_unlock_bh(&pf->ptp_rx_lock); + + ns = (((u64)hi) << 32) | lo; + + i40e_ptp_convert_to_hwtstamp(skb_hwtstamps(skb), ns); +} + +/** + * i40e_ptp_set_increment - Utility function to update clock increment rate + * @pf: Board private structure + * + * During a link change, the DMA frequency that drives the 1588 logic will + * change. In order to keep the PRTTSYN_TIME registers in units of nanoseconds, + * we must update the increment value per clock tick. 
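+ * The per-speed increment values are the I40E_PTP_*_INCVAL constants defined
+ * at the top of this file; at 100Mb the increment is set to zero, which
+ * effectively stops the clock.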
+ **/
+void i40e_ptp_set_increment(struct i40e_pf *pf)
+{
+	struct i40e_link_status *hw_link_info;
+	struct i40e_hw *hw = &pf->hw;
+	u64 incval;
+
+	hw_link_info = &hw->phy.link_info;
+
+	i40e_aq_get_link_info(&pf->hw, true, NULL, NULL);
+
+	switch (hw_link_info->link_speed) {
+	case I40E_LINK_SPEED_10GB:
+		incval = I40E_PTP_10GB_INCVAL;
+		break;
+	case I40E_LINK_SPEED_1GB:
+		incval = I40E_PTP_1GB_INCVAL;
+		break;
+	case I40E_LINK_SPEED_100MB:
+	{
+		static int warn_once;
+
+		if (!warn_once) {
+			dev_warn(&pf->pdev->dev,
+				 "1588 functionality is not supported at 100 Mbps. Stopping the PHC.\n");
+			warn_once++;
+		}
+		incval = 0;
+		break;
+	}
+	case I40E_LINK_SPEED_40GB:
+	default:
+		incval = I40E_PTP_40GB_INCVAL;
+		break;
+	}
+
+	/* Write the new increment value into the increment register. The
+	 * hardware will not update the clock until both registers have been
+	 * written.
+	 */
+	wr32(hw, I40E_PRTTSYN_INC_L, incval & 0xFFFFFFFF);
+	wr32(hw, I40E_PRTTSYN_INC_H, incval >> 32);
+
+	/* Update the base adjustment value. */
+	WRITE_ONCE(pf->ptp_base_adj, incval);
+	smp_mb(); /* Force the above update. */
+}
+
+/**
+ * i40e_ptp_get_ts_config - ioctl interface to read the HW timestamping config
+ * @pf: Board private structure
+ * @ifr: ioctl data
+ *
+ * Obtain the current hardware timestamping settings as requested. To do this,
+ * keep a shadow copy of the timestamp settings rather than attempting to
+ * deconstruct them from the registers.
+ **/
+int i40e_ptp_get_ts_config(struct i40e_pf *pf, struct ifreq *ifr)
+{
+	struct hwtstamp_config *config = &pf->tstamp_config;
+
+	if (!(pf->flags & I40E_FLAG_PTP))
+		return -EOPNOTSUPP;
+
+	return copy_to_user(ifr->ifr_data, config, sizeof(*config)) ?
+		-EFAULT : 0;
+}
+
+/**
+ * i40e_ptp_set_timestamp_mode - setup hardware for requested timestamp mode
+ * @pf: Board private structure
+ * @config: hwtstamp settings requested or saved
+ *
+ * Control hardware registers to enter the specific mode requested by the
+ * user. Also used during reset path to ensure that timestamp settings are
+ * maintained.
+ *
+ * Note: modifies config in place, and may update the requested mode to be
+ * more broad if the specific filter is not directly supported.
+ **/
+static int i40e_ptp_set_timestamp_mode(struct i40e_pf *pf,
+				       struct hwtstamp_config *config)
+{
+	struct i40e_hw *hw = &pf->hw;
+	u32 tsyntype, regval;
+
+	/* Reserved for future extensions. */
+	if (config->flags)
+		return -EINVAL;
+
+	switch (config->tx_type) {
+	case HWTSTAMP_TX_OFF:
+		pf->ptp_tx = false;
+		break;
+	case HWTSTAMP_TX_ON:
+		pf->ptp_tx = true;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	switch (config->rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		pf->ptp_rx = false;
+		/* We set the type to V1, but do not enable UDP packet
+		 * recognition. In this way, we should be as close to
+		 * disabling PTP Rx timestamps as possible, since V1 packets
+		 * are always UDP and L2 packets are a V2 feature.
+		 */
+		tsyntype = I40E_PRTTSYN_CTL1_TSYNTYPE_V1;
+		break;
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+		if (!(pf->hw_features & I40E_HW_PTP_L4_CAPABLE))
+			return -ERANGE;
+		pf->ptp_rx = true;
+		tsyntype = I40E_PRTTSYN_CTL1_V1MESSTYPE0_MASK |
+			   I40E_PRTTSYN_CTL1_TSYNTYPE_V1 |
+			   I40E_PRTTSYN_CTL1_UDP_ENA_MASK;
+		config->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+		if (!(pf->hw_features & I40E_HW_PTP_L4_CAPABLE))
+			return -ERANGE;
+		/* fall through */
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+		pf->ptp_rx = true;
+		tsyntype = I40E_PRTTSYN_CTL1_V2MESSTYPE0_MASK |
+			   I40E_PRTTSYN_CTL1_TSYNTYPE_V2;
+		if (pf->hw_features & I40E_HW_PTP_L4_CAPABLE) {
+			tsyntype |= I40E_PRTTSYN_CTL1_UDP_ENA_MASK;
+			config->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+		} else {
+			config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
+		}
+		break;
+	case HWTSTAMP_FILTER_NTP_ALL:
+	case HWTSTAMP_FILTER_ALL:
+	default:
+		return -ERANGE;
+	}
+
+	/* Clear out all 1588-related registers to clear and unlatch them. */
+	spin_lock_bh(&pf->ptp_rx_lock);
+	rd32(hw, I40E_PRTTSYN_STAT_0);
+	rd32(hw, I40E_PRTTSYN_TXTIME_H);
+	rd32(hw, I40E_PRTTSYN_RXTIME_H(0));
+	rd32(hw, I40E_PRTTSYN_RXTIME_H(1));
+	rd32(hw, I40E_PRTTSYN_RXTIME_H(2));
+	rd32(hw, I40E_PRTTSYN_RXTIME_H(3));
+	pf->latch_event_flags = 0;
+	spin_unlock_bh(&pf->ptp_rx_lock);
+
+	/* Enable/disable the Tx timestamp interrupt based on user input. */
+	regval = rd32(hw, I40E_PRTTSYN_CTL0);
+	if (pf->ptp_tx)
+		regval |= I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_MASK;
+	else
+		regval &= ~I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_MASK;
+	wr32(hw, I40E_PRTTSYN_CTL0, regval);
+
+	regval = rd32(hw, I40E_PFINT_ICR0_ENA);
+	if (pf->ptp_tx)
+		regval |= I40E_PFINT_ICR0_ENA_TIMESYNC_MASK;
+	else
+		regval &= ~I40E_PFINT_ICR0_ENA_TIMESYNC_MASK;
+	wr32(hw, I40E_PFINT_ICR0_ENA, regval);
+
+	/* Although there is no simple on/off switch for Rx, we "disable" Rx
+	 * timestamps by setting to V1 only mode and clearing the UDP
+	 * recognition. This ought to disable all PTP Rx timestamps as V1
+	 * packets are always over UDP. Note that software is configured to
+	 * ignore Rx timestamps via the pf->ptp_rx flag.
+	 */
+	regval = rd32(hw, I40E_PRTTSYN_CTL1);
+	/* clear everything but the enable bit */
+	regval &= I40E_PRTTSYN_CTL1_TSYNENA_MASK;
+	/* now enable bits for desired Rx timestamps */
+	regval |= tsyntype;
+	wr32(hw, I40E_PRTTSYN_CTL1, regval);
+
+	return 0;
+}
+
+/**
+ * i40e_ptp_set_ts_config - ioctl interface to control the HW timestamping
+ * @pf: Board private structure
+ * @ifr: ioctl data
+ *
+ * Respond to the user filter requests and make the appropriate hardware
+ * changes here. The XL710 cannot support splitting of the Tx/Rx timestamping
+ * logic, so keep track in software of whether to indicate these timestamps
+ * or not.
+ *
+ * It is permissible to "upgrade" the user request to a broader filter, as long
+ * as the user receives the timestamps they care about and the user is notified
+ * the filter has been broadened.
+ **/ +int i40e_ptp_set_ts_config(struct i40e_pf *pf, struct ifreq *ifr) +{ + struct hwtstamp_config config; + int err; + + if (!(pf->flags & I40E_FLAG_PTP)) + return -EOPNOTSUPP; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + err = i40e_ptp_set_timestamp_mode(pf, &config); + if (err) + return err; + + /* save these settings for future reference */ + pf->tstamp_config = config; + + return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? + -EFAULT : 0; +} + +/** + * i40e_ptp_create_clock - Create PTP clock device for userspace + * @pf: Board private structure + * + * This function creates a new PTP clock device. It only creates one if we + * don't already have one, so it is safe to call. Will return error if it + * can't create one, but success if we already have a device. Should be used + * by i40e_ptp_init to create clock initially, and prevent global resets from + * creating new clock devices. + **/ +static long i40e_ptp_create_clock(struct i40e_pf *pf) +{ + /* no need to create a clock device if we already have one */ + if (!IS_ERR_OR_NULL(pf->ptp_clock)) + return 0; + + strncpy(pf->ptp_caps.name, i40e_driver_name, sizeof(pf->ptp_caps.name)); + pf->ptp_caps.owner = THIS_MODULE; + pf->ptp_caps.max_adj = 999999999; + pf->ptp_caps.n_ext_ts = 0; + pf->ptp_caps.pps = 0; + pf->ptp_caps.adjfreq = i40e_ptp_adjfreq; + pf->ptp_caps.adjtime = i40e_ptp_adjtime; + pf->ptp_caps.gettime64 = i40e_ptp_gettime; + pf->ptp_caps.settime64 = i40e_ptp_settime; + pf->ptp_caps.enable = i40e_ptp_feature_enable; + + /* Attempt to register the clock before enabling the hardware. */ + pf->ptp_clock = ptp_clock_register(&pf->ptp_caps, &pf->pdev->dev); + if (IS_ERR(pf->ptp_clock)) + return PTR_ERR(pf->ptp_clock); + + /* clear the hwtstamp settings here during clock create, instead of + * during regular init, so that we can maintain settings across a + * reset or suspend. + */ + pf->tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; + pf->tstamp_config.tx_type = HWTSTAMP_TX_OFF; + + return 0; +} + +/** + * i40e_ptp_init - Initialize the 1588 support after device probe or reset + * @pf: Board private structure + * + * This function sets device up for 1588 support. The first time it is run, it + * will create a PHC clock device. It does not create a clock device if one + * already exists. It also reconfigures the device after a reset. + **/ +void i40e_ptp_init(struct i40e_pf *pf) +{ + struct net_device *netdev = pf->vsi[pf->lan_vsi]->netdev; + struct i40e_hw *hw = &pf->hw; + u32 pf_id; + long err; + + /* Only one PF is assigned to control 1588 logic per port. Do not + * enable any support for PFs not assigned via PRTTSYN_CTL0.PF_ID + */ + pf_id = (rd32(hw, I40E_PRTTSYN_CTL0) & I40E_PRTTSYN_CTL0_PF_ID_MASK) >> + I40E_PRTTSYN_CTL0_PF_ID_SHIFT; + if (hw->pf_id != pf_id) { + pf->flags &= ~I40E_FLAG_PTP; + dev_info(&pf->pdev->dev, "%s: PTP not supported on %s\n", + __func__, + netdev->name); + return; + } + + mutex_init(&pf->tmreg_lock); + spin_lock_init(&pf->ptp_rx_lock); + + /* ensure we have a clock device */ + err = i40e_ptp_create_clock(pf); + if (err) { + pf->ptp_clock = NULL; + dev_err(&pf->pdev->dev, "%s: ptp_clock_register failed\n", + __func__); + } else if (pf->ptp_clock) { + struct timespec64 ts; + u32 regval; + + if (pf->hw.debug_mask & I40E_DEBUG_LAN) + dev_info(&pf->pdev->dev, "PHC enabled\n"); + pf->flags |= I40E_FLAG_PTP; + + /* Ensure the clocks are running. 
*/ + regval = rd32(hw, I40E_PRTTSYN_CTL0); + regval |= I40E_PRTTSYN_CTL0_TSYNENA_MASK; + wr32(hw, I40E_PRTTSYN_CTL0, regval); + regval = rd32(hw, I40E_PRTTSYN_CTL1); + regval |= I40E_PRTTSYN_CTL1_TSYNENA_MASK; + wr32(hw, I40E_PRTTSYN_CTL1, regval); + + /* Set the increment value per clock tick. */ + i40e_ptp_set_increment(pf); + + /* reset timestamping mode */ + i40e_ptp_set_timestamp_mode(pf, &pf->tstamp_config); + + /* Set the clock value. */ + ts = ktime_to_timespec64(ktime_get_real()); + i40e_ptp_settime(&pf->ptp_caps, &ts); + } +} + +/** + * i40e_ptp_stop - Disable the driver/hardware support and unregister the PHC + * @pf: Board private structure + * + * This function handles the cleanup work required from the initialization by + * clearing out the important information and unregistering the PHC. + **/ +void i40e_ptp_stop(struct i40e_pf *pf) +{ + pf->flags &= ~I40E_FLAG_PTP; + pf->ptp_tx = false; + pf->ptp_rx = false; + + if (pf->ptp_tx_skb) { + dev_kfree_skb_any(pf->ptp_tx_skb); + pf->ptp_tx_skb = NULL; + clear_bit_unlock(__I40E_PTP_TX_IN_PROGRESS, pf->state); + } + + if (pf->ptp_clock) { + ptp_clock_unregister(pf->ptp_clock); + pf->ptp_clock = NULL; + dev_info(&pf->pdev->dev, "%s: removed PHC on %s\n", __func__, + pf->vsi[pf->lan_vsi]->netdev->name); + } +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_register.h b/drivers/net/ethernet/intel/i40e/i40e_register.h new file mode 100644 index 0000000..b3e206e --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_register.h @@ -0,0 +1,5354 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_REGISTER_H_ +#define _I40E_REGISTER_H_ + +#define I40E_GL_ARQBAH 0x000801C0 /* Reset: EMPR */ +#define I40E_GL_ARQBAH_ARQBAH_SHIFT 0 +#define I40E_GL_ARQBAH_ARQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_ARQBAH_ARQBAH_SHIFT) +#define I40E_GL_ARQBAL 0x000800C0 /* Reset: EMPR */ +#define I40E_GL_ARQBAL_ARQBAL_SHIFT 0 +#define I40E_GL_ARQBAL_ARQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_ARQBAL_ARQBAL_SHIFT) +#define I40E_GL_ARQH 0x000803C0 /* Reset: EMPR */ +#define I40E_GL_ARQH_ARQH_SHIFT 0 +#define I40E_GL_ARQH_ARQH_MASK I40E_MASK(0x3FF, I40E_GL_ARQH_ARQH_SHIFT) +#define I40E_GL_ARQT 0x000804C0 /* Reset: EMPR */ +#define I40E_GL_ARQT_ARQT_SHIFT 0 +#define I40E_GL_ARQT_ARQT_MASK I40E_MASK(0x3FF, I40E_GL_ARQT_ARQT_SHIFT) +#define I40E_GL_ATQBAH 0x00080140 /* Reset: EMPR */ +#define I40E_GL_ATQBAH_ATQBAH_SHIFT 0 +#define I40E_GL_ATQBAH_ATQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_ATQBAH_ATQBAH_SHIFT) +#define I40E_GL_ATQBAL 0x00080040 /* Reset: EMPR */ +#define I40E_GL_ATQBAL_ATQBAL_SHIFT 0 +#define I40E_GL_ATQBAL_ATQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_ATQBAL_ATQBAL_SHIFT) +#define I40E_GL_ATQH 0x00080340 /* Reset: EMPR */ +#define I40E_GL_ATQH_ATQH_SHIFT 0 +#define I40E_GL_ATQH_ATQH_MASK I40E_MASK(0x3FF, I40E_GL_ATQH_ATQH_SHIFT) +#define I40E_GL_ATQLEN 0x00080240 /* Reset: EMPR */ +#define I40E_GL_ATQLEN_ATQLEN_SHIFT 0 +#define I40E_GL_ATQLEN_ATQLEN_MASK I40E_MASK(0x3FF, I40E_GL_ATQLEN_ATQLEN_SHIFT) +#define I40E_GL_ATQLEN_ATQVFE_SHIFT 28 +#define I40E_GL_ATQLEN_ATQVFE_MASK I40E_MASK(0x1, I40E_GL_ATQLEN_ATQVFE_SHIFT) +#define I40E_GL_ATQLEN_ATQOVFL_SHIFT 29 +#define I40E_GL_ATQLEN_ATQOVFL_MASK I40E_MASK(0x1, I40E_GL_ATQLEN_ATQOVFL_SHIFT) +#define I40E_GL_ATQLEN_ATQCRIT_SHIFT 30 +#define I40E_GL_ATQLEN_ATQCRIT_MASK I40E_MASK(0x1, I40E_GL_ATQLEN_ATQCRIT_SHIFT) +#define I40E_GL_ATQLEN_ATQENABLE_SHIFT 31 +#define I40E_GL_ATQLEN_ATQENABLE_MASK I40E_MASK(0x1, I40E_GL_ATQLEN_ATQENABLE_SHIFT) +#define I40E_GL_ATQT 0x00080440 /* Reset: EMPR */ +#define I40E_GL_ATQT_ATQT_SHIFT 0 +#define I40E_GL_ATQT_ATQT_MASK I40E_MASK(0x3FF, I40E_GL_ATQT_ATQT_SHIFT) +#define I40E_PF_ARQBAH 0x00080180 /* Reset: EMPR */ +#define I40E_PF_ARQBAH_ARQBAH_SHIFT 0 +#define I40E_PF_ARQBAH_ARQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_PF_ARQBAH_ARQBAH_SHIFT) +#define I40E_PF_ARQBAL 0x00080080 /* Reset: EMPR */ +#define I40E_PF_ARQBAL_ARQBAL_SHIFT 0 +#define I40E_PF_ARQBAL_ARQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_PF_ARQBAL_ARQBAL_SHIFT) +#define I40E_PF_ARQH 0x00080380 /* Reset: EMPR */ +#define I40E_PF_ARQH_ARQH_SHIFT 0 +#define I40E_PF_ARQH_ARQH_MASK I40E_MASK(0x3FF, I40E_PF_ARQH_ARQH_SHIFT) +#define I40E_PF_ARQLEN 0x00080280 /* Reset: EMPR */ +#define I40E_PF_ARQLEN_ARQLEN_SHIFT 0 +#define I40E_PF_ARQLEN_ARQLEN_MASK I40E_MASK(0x3FF, I40E_PF_ARQLEN_ARQLEN_SHIFT) +#define I40E_PF_ARQLEN_ARQVFE_SHIFT 28 +#define I40E_PF_ARQLEN_ARQVFE_MASK I40E_MASK(0x1, I40E_PF_ARQLEN_ARQVFE_SHIFT) +#define I40E_PF_ARQLEN_ARQOVFL_SHIFT 29 +#define I40E_PF_ARQLEN_ARQOVFL_MASK I40E_MASK(0x1, I40E_PF_ARQLEN_ARQOVFL_SHIFT) +#define I40E_PF_ARQLEN_ARQCRIT_SHIFT 30 +#define I40E_PF_ARQLEN_ARQCRIT_MASK I40E_MASK(0x1, I40E_PF_ARQLEN_ARQCRIT_SHIFT) +#define I40E_PF_ARQLEN_ARQENABLE_SHIFT 31 +#define I40E_PF_ARQLEN_ARQENABLE_MASK I40E_MASK(0x1, I40E_PF_ARQLEN_ARQENABLE_SHIFT) +#define I40E_PF_ARQT 0x00080480 /* Reset: EMPR */ +#define I40E_PF_ARQT_ARQT_SHIFT 0 +#define I40E_PF_ARQT_ARQT_MASK I40E_MASK(0x3FF, 
I40E_PF_ARQT_ARQT_SHIFT) +#define I40E_PF_ATQBAH 0x00080100 /* Reset: EMPR */ +#define I40E_PF_ATQBAH_ATQBAH_SHIFT 0 +#define I40E_PF_ATQBAH_ATQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_PF_ATQBAH_ATQBAH_SHIFT) +#define I40E_PF_ATQBAL 0x00080000 /* Reset: EMPR */ +#define I40E_PF_ATQBAL_ATQBAL_SHIFT 0 +#define I40E_PF_ATQBAL_ATQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_PF_ATQBAL_ATQBAL_SHIFT) +#define I40E_PF_ATQH 0x00080300 /* Reset: EMPR */ +#define I40E_PF_ATQH_ATQH_SHIFT 0 +#define I40E_PF_ATQH_ATQH_MASK I40E_MASK(0x3FF, I40E_PF_ATQH_ATQH_SHIFT) +#define I40E_PF_ATQLEN 0x00080200 /* Reset: EMPR */ +#define I40E_PF_ATQLEN_ATQLEN_SHIFT 0 +#define I40E_PF_ATQLEN_ATQLEN_MASK I40E_MASK(0x3FF, I40E_PF_ATQLEN_ATQLEN_SHIFT) +#define I40E_PF_ATQLEN_ATQVFE_SHIFT 28 +#define I40E_PF_ATQLEN_ATQVFE_MASK I40E_MASK(0x1, I40E_PF_ATQLEN_ATQVFE_SHIFT) +#define I40E_PF_ATQLEN_ATQOVFL_SHIFT 29 +#define I40E_PF_ATQLEN_ATQOVFL_MASK I40E_MASK(0x1, I40E_PF_ATQLEN_ATQOVFL_SHIFT) +#define I40E_PF_ATQLEN_ATQCRIT_SHIFT 30 +#define I40E_PF_ATQLEN_ATQCRIT_MASK I40E_MASK(0x1, I40E_PF_ATQLEN_ATQCRIT_SHIFT) +#define I40E_PF_ATQLEN_ATQENABLE_SHIFT 31 +#define I40E_PF_ATQLEN_ATQENABLE_MASK I40E_MASK(0x1, I40E_PF_ATQLEN_ATQENABLE_SHIFT) +#define I40E_PF_ATQT 0x00080400 /* Reset: EMPR */ +#define I40E_PF_ATQT_ATQT_SHIFT 0 +#define I40E_PF_ATQT_ATQT_MASK I40E_MASK(0x3FF, I40E_PF_ATQT_ATQT_SHIFT) +#define I40E_VF_ARQBAH(_VF) (0x00081400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ARQBAH_MAX_INDEX 127 +#define I40E_VF_ARQBAH_ARQBAH_SHIFT 0 +#define I40E_VF_ARQBAH_ARQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAH_ARQBAH_SHIFT) +#define I40E_VF_ARQBAL(_VF) (0x00080C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ARQBAL_MAX_INDEX 127 +#define I40E_VF_ARQBAL_ARQBAL_SHIFT 0 +#define I40E_VF_ARQBAL_ARQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAL_ARQBAL_SHIFT) +#define I40E_VF_ARQH(_VF) (0x00082400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ARQH_MAX_INDEX 127 +#define I40E_VF_ARQH_ARQH_SHIFT 0 +#define I40E_VF_ARQH_ARQH_MASK I40E_MASK(0x3FF, I40E_VF_ARQH_ARQH_SHIFT) +#define I40E_VF_ARQLEN(_VF) (0x00081C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ARQLEN_MAX_INDEX 127 +#define I40E_VF_ARQLEN_ARQLEN_SHIFT 0 +#define I40E_VF_ARQLEN_ARQLEN_MASK I40E_MASK(0x3FF, I40E_VF_ARQLEN_ARQLEN_SHIFT) +#define I40E_VF_ARQLEN_ARQVFE_SHIFT 28 +#define I40E_VF_ARQLEN_ARQVFE_MASK I40E_MASK(0x1, I40E_VF_ARQLEN_ARQVFE_SHIFT) +#define I40E_VF_ARQLEN_ARQOVFL_SHIFT 29 +#define I40E_VF_ARQLEN_ARQOVFL_MASK I40E_MASK(0x1, I40E_VF_ARQLEN_ARQOVFL_SHIFT) +#define I40E_VF_ARQLEN_ARQCRIT_SHIFT 30 +#define I40E_VF_ARQLEN_ARQCRIT_MASK I40E_MASK(0x1, I40E_VF_ARQLEN_ARQCRIT_SHIFT) +#define I40E_VF_ARQLEN_ARQENABLE_SHIFT 31 +#define I40E_VF_ARQLEN_ARQENABLE_MASK I40E_MASK(0x1, I40E_VF_ARQLEN_ARQENABLE_SHIFT) +#define I40E_VF_ARQT(_VF) (0x00082C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ARQT_MAX_INDEX 127 +#define I40E_VF_ARQT_ARQT_SHIFT 0 +#define I40E_VF_ARQT_ARQT_MASK I40E_MASK(0x3FF, I40E_VF_ARQT_ARQT_SHIFT) +#define I40E_VF_ATQBAH(_VF) (0x00081000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ATQBAH_MAX_INDEX 127 +#define I40E_VF_ATQBAH_ATQBAH_SHIFT 0 +#define I40E_VF_ATQBAH_ATQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAH_ATQBAH_SHIFT) +#define I40E_VF_ATQBAL(_VF) (0x00080800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ATQBAL_MAX_INDEX 127 +#define I40E_VF_ATQBAL_ATQBAL_SHIFT 0 +#define I40E_VF_ATQBAL_ATQBAL_MASK 
I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAL_ATQBAL_SHIFT) +#define I40E_VF_ATQH(_VF) (0x00082000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ATQH_MAX_INDEX 127 +#define I40E_VF_ATQH_ATQH_SHIFT 0 +#define I40E_VF_ATQH_ATQH_MASK I40E_MASK(0x3FF, I40E_VF_ATQH_ATQH_SHIFT) +#define I40E_VF_ATQLEN(_VF) (0x00081800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ATQLEN_MAX_INDEX 127 +#define I40E_VF_ATQLEN_ATQLEN_SHIFT 0 +#define I40E_VF_ATQLEN_ATQLEN_MASK I40E_MASK(0x3FF, I40E_VF_ATQLEN_ATQLEN_SHIFT) +#define I40E_VF_ATQLEN_ATQVFE_SHIFT 28 +#define I40E_VF_ATQLEN_ATQVFE_MASK I40E_MASK(0x1, I40E_VF_ATQLEN_ATQVFE_SHIFT) +#define I40E_VF_ATQLEN_ATQOVFL_SHIFT 29 +#define I40E_VF_ATQLEN_ATQOVFL_MASK I40E_MASK(0x1, I40E_VF_ATQLEN_ATQOVFL_SHIFT) +#define I40E_VF_ATQLEN_ATQCRIT_SHIFT 30 +#define I40E_VF_ATQLEN_ATQCRIT_MASK I40E_MASK(0x1, I40E_VF_ATQLEN_ATQCRIT_SHIFT) +#define I40E_VF_ATQLEN_ATQENABLE_SHIFT 31 +#define I40E_VF_ATQLEN_ATQENABLE_MASK I40E_MASK(0x1, I40E_VF_ATQLEN_ATQENABLE_SHIFT) +#define I40E_VF_ATQT(_VF) (0x00082800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_VF_ATQT_MAX_INDEX 127 +#define I40E_VF_ATQT_ATQT_SHIFT 0 +#define I40E_VF_ATQT_ATQT_MASK I40E_MASK(0x3FF, I40E_VF_ATQT_ATQT_SHIFT) +#define I40E_PRT_L2TAGSEN 0x001C0B20 /* Reset: CORER */ +#define I40E_PRT_L2TAGSEN_ENABLE_SHIFT 0 +#define I40E_PRT_L2TAGSEN_ENABLE_MASK I40E_MASK(0xFF, I40E_PRT_L2TAGSEN_ENABLE_SHIFT) +#define I40E_PFCM_LAN_ERRDATA 0x0010C080 /* Reset: PFR */ +#define I40E_PFCM_LAN_ERRDATA_ERROR_CODE_SHIFT 0 +#define I40E_PFCM_LAN_ERRDATA_ERROR_CODE_MASK I40E_MASK(0xF, I40E_PFCM_LAN_ERRDATA_ERROR_CODE_SHIFT) +#define I40E_PFCM_LAN_ERRDATA_Q_TYPE_SHIFT 4 +#define I40E_PFCM_LAN_ERRDATA_Q_TYPE_MASK I40E_MASK(0x7, I40E_PFCM_LAN_ERRDATA_Q_TYPE_SHIFT) +#define I40E_PFCM_LAN_ERRDATA_Q_NUM_SHIFT 8 +#define I40E_PFCM_LAN_ERRDATA_Q_NUM_MASK I40E_MASK(0xFFF, I40E_PFCM_LAN_ERRDATA_Q_NUM_SHIFT) +#define I40E_PFCM_LAN_ERRINFO 0x0010C000 /* Reset: PFR */ +#define I40E_PFCM_LAN_ERRINFO_ERROR_VALID_SHIFT 0 +#define I40E_PFCM_LAN_ERRINFO_ERROR_VALID_MASK I40E_MASK(0x1, I40E_PFCM_LAN_ERRINFO_ERROR_VALID_SHIFT) +#define I40E_PFCM_LAN_ERRINFO_ERROR_INST_SHIFT 4 +#define I40E_PFCM_LAN_ERRINFO_ERROR_INST_MASK I40E_MASK(0x7, I40E_PFCM_LAN_ERRINFO_ERROR_INST_SHIFT) +#define I40E_PFCM_LAN_ERRINFO_DBL_ERROR_CNT_SHIFT 8 +#define I40E_PFCM_LAN_ERRINFO_DBL_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_LAN_ERRINFO_DBL_ERROR_CNT_SHIFT) +#define I40E_PFCM_LAN_ERRINFO_RLU_ERROR_CNT_SHIFT 16 +#define I40E_PFCM_LAN_ERRINFO_RLU_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_LAN_ERRINFO_RLU_ERROR_CNT_SHIFT) +#define I40E_PFCM_LAN_ERRINFO_RLS_ERROR_CNT_SHIFT 24 +#define I40E_PFCM_LAN_ERRINFO_RLS_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_LAN_ERRINFO_RLS_ERROR_CNT_SHIFT) +#define I40E_PFCM_LANCTXCTL 0x0010C300 /* Reset: CORER */ +#define I40E_PFCM_LANCTXCTL_QUEUE_NUM_SHIFT 0 +#define I40E_PFCM_LANCTXCTL_QUEUE_NUM_MASK I40E_MASK(0xFFF, I40E_PFCM_LANCTXCTL_QUEUE_NUM_SHIFT) +#define I40E_PFCM_LANCTXCTL_SUB_LINE_SHIFT 12 +#define I40E_PFCM_LANCTXCTL_SUB_LINE_MASK I40E_MASK(0x7, I40E_PFCM_LANCTXCTL_SUB_LINE_SHIFT) +#define I40E_PFCM_LANCTXCTL_QUEUE_TYPE_SHIFT 15 +#define I40E_PFCM_LANCTXCTL_QUEUE_TYPE_MASK I40E_MASK(0x3, I40E_PFCM_LANCTXCTL_QUEUE_TYPE_SHIFT) +#define I40E_PFCM_LANCTXCTL_OP_CODE_SHIFT 17 +#define I40E_PFCM_LANCTXCTL_OP_CODE_MASK I40E_MASK(0x3, I40E_PFCM_LANCTXCTL_OP_CODE_SHIFT) +#define I40E_PFCM_LANCTXDATA(_i) (0x0010C100 + ((_i) * 128)) /* _i=0...3 */ /* Reset: CORER */ +#define 
I40E_PFCM_LANCTXDATA_MAX_INDEX 3 +#define I40E_PFCM_LANCTXDATA_DATA_SHIFT 0 +#define I40E_PFCM_LANCTXDATA_DATA_MASK I40E_MASK(0xFFFFFFFF, I40E_PFCM_LANCTXDATA_DATA_SHIFT) +#define I40E_PFCM_LANCTXSTAT 0x0010C380 /* Reset: CORER */ +#define I40E_PFCM_LANCTXSTAT_CTX_DONE_SHIFT 0 +#define I40E_PFCM_LANCTXSTAT_CTX_DONE_MASK I40E_MASK(0x1, I40E_PFCM_LANCTXSTAT_CTX_DONE_SHIFT) +#define I40E_PFCM_LANCTXSTAT_CTX_MISS_SHIFT 1 +#define I40E_PFCM_LANCTXSTAT_CTX_MISS_MASK I40E_MASK(0x1, I40E_PFCM_LANCTXSTAT_CTX_MISS_SHIFT) +#define I40E_VFCM_PE_ERRDATA1(_VF) (0x00138800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFCM_PE_ERRDATA1_MAX_INDEX 127 +#define I40E_VFCM_PE_ERRDATA1_ERROR_CODE_SHIFT 0 +#define I40E_VFCM_PE_ERRDATA1_ERROR_CODE_MASK I40E_MASK(0xF, I40E_VFCM_PE_ERRDATA1_ERROR_CODE_SHIFT) +#define I40E_VFCM_PE_ERRDATA1_Q_TYPE_SHIFT 4 +#define I40E_VFCM_PE_ERRDATA1_Q_TYPE_MASK I40E_MASK(0x7, I40E_VFCM_PE_ERRDATA1_Q_TYPE_SHIFT) +#define I40E_VFCM_PE_ERRDATA1_Q_NUM_SHIFT 8 +#define I40E_VFCM_PE_ERRDATA1_Q_NUM_MASK I40E_MASK(0x3FFFF, I40E_VFCM_PE_ERRDATA1_Q_NUM_SHIFT) +#define I40E_VFCM_PE_ERRINFO1(_VF) (0x00138400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFCM_PE_ERRINFO1_MAX_INDEX 127 +#define I40E_VFCM_PE_ERRINFO1_ERROR_VALID_SHIFT 0 +#define I40E_VFCM_PE_ERRINFO1_ERROR_VALID_MASK I40E_MASK(0x1, I40E_VFCM_PE_ERRINFO1_ERROR_VALID_SHIFT) +#define I40E_VFCM_PE_ERRINFO1_ERROR_INST_SHIFT 4 +#define I40E_VFCM_PE_ERRINFO1_ERROR_INST_MASK I40E_MASK(0x7, I40E_VFCM_PE_ERRINFO1_ERROR_INST_SHIFT) +#define I40E_VFCM_PE_ERRINFO1_DBL_ERROR_CNT_SHIFT 8 +#define I40E_VFCM_PE_ERRINFO1_DBL_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO1_DBL_ERROR_CNT_SHIFT) +#define I40E_VFCM_PE_ERRINFO1_RLU_ERROR_CNT_SHIFT 16 +#define I40E_VFCM_PE_ERRINFO1_RLU_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO1_RLU_ERROR_CNT_SHIFT) +#define I40E_VFCM_PE_ERRINFO1_RLS_ERROR_CNT_SHIFT 24 +#define I40E_VFCM_PE_ERRINFO1_RLS_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO1_RLS_ERROR_CNT_SHIFT) +#define I40E_GLDCB_GENC 0x00083044 /* Reset: CORER */ +#define I40E_GLDCB_GENC_PCIRTT_SHIFT 0 +#define I40E_GLDCB_GENC_PCIRTT_MASK I40E_MASK(0xFFFF, I40E_GLDCB_GENC_PCIRTT_SHIFT) +#define I40E_GLDCB_RUPTI 0x00122618 /* Reset: CORER */ +#define I40E_GLDCB_RUPTI_PFCTIMEOUT_UP_SHIFT 0 +#define I40E_GLDCB_RUPTI_PFCTIMEOUT_UP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLDCB_RUPTI_PFCTIMEOUT_UP_SHIFT) +#define I40E_PRTDCB_FCCFG 0x001E4640 /* Reset: GLOBR */ +#define I40E_PRTDCB_FCCFG_TFCE_SHIFT 3 +#define I40E_PRTDCB_FCCFG_TFCE_MASK I40E_MASK(0x3, I40E_PRTDCB_FCCFG_TFCE_SHIFT) +#define I40E_PRTDCB_FCRTV 0x001E4600 /* Reset: GLOBR */ +#define I40E_PRTDCB_FCRTV_FC_REFRESH_TH_SHIFT 0 +#define I40E_PRTDCB_FCRTV_FC_REFRESH_TH_MASK I40E_MASK(0xFFFF, I40E_PRTDCB_FCRTV_FC_REFRESH_TH_SHIFT) +#define I40E_PRTDCB_FCTTVN(_i) (0x001E4580 + ((_i) * 32)) /* _i=0...3 */ /* Reset: GLOBR */ +#define I40E_PRTDCB_FCTTVN_MAX_INDEX 3 +#define I40E_PRTDCB_FCTTVN_TTV_2N_SHIFT 0 +#define I40E_PRTDCB_FCTTVN_TTV_2N_MASK I40E_MASK(0xFFFF, I40E_PRTDCB_FCTTVN_TTV_2N_SHIFT) +#define I40E_PRTDCB_FCTTVN_TTV_2N_P1_SHIFT 16 +#define I40E_PRTDCB_FCTTVN_TTV_2N_P1_MASK I40E_MASK(0xFFFF, I40E_PRTDCB_FCTTVN_TTV_2N_P1_SHIFT) +#define I40E_PRTDCB_GENC 0x00083000 /* Reset: CORER */ +#define I40E_PRTDCB_GENC_RESERVED_1_SHIFT 0 +#define I40E_PRTDCB_GENC_RESERVED_1_MASK I40E_MASK(0x3, I40E_PRTDCB_GENC_RESERVED_1_SHIFT) +#define I40E_PRTDCB_GENC_NUMTC_SHIFT 2 +#define I40E_PRTDCB_GENC_NUMTC_MASK I40E_MASK(0xF, I40E_PRTDCB_GENC_NUMTC_SHIFT) +#define 
I40E_PRTDCB_GENC_FCOEUP_SHIFT 6 +#define I40E_PRTDCB_GENC_FCOEUP_MASK I40E_MASK(0x7, I40E_PRTDCB_GENC_FCOEUP_SHIFT) +#define I40E_PRTDCB_GENC_FCOEUP_VALID_SHIFT 9 +#define I40E_PRTDCB_GENC_FCOEUP_VALID_MASK I40E_MASK(0x1, I40E_PRTDCB_GENC_FCOEUP_VALID_SHIFT) +#define I40E_PRTDCB_GENC_PFCLDA_SHIFT 16 +#define I40E_PRTDCB_GENC_PFCLDA_MASK I40E_MASK(0xFFFF, I40E_PRTDCB_GENC_PFCLDA_SHIFT) +#define I40E_PRTDCB_GENS 0x00083020 /* Reset: CORER */ +#define I40E_PRTDCB_GENS_DCBX_STATUS_SHIFT 0 +#define I40E_PRTDCB_GENS_DCBX_STATUS_MASK I40E_MASK(0x7, I40E_PRTDCB_GENS_DCBX_STATUS_SHIFT) +#define I40E_PRTDCB_MFLCN 0x001E2400 /* Reset: GLOBR */ +#define I40E_PRTDCB_MFLCN_PMCF_SHIFT 0 +#define I40E_PRTDCB_MFLCN_PMCF_MASK I40E_MASK(0x1, I40E_PRTDCB_MFLCN_PMCF_SHIFT) +#define I40E_PRTDCB_MFLCN_DPF_SHIFT 1 +#define I40E_PRTDCB_MFLCN_DPF_MASK I40E_MASK(0x1, I40E_PRTDCB_MFLCN_DPF_SHIFT) +#define I40E_PRTDCB_MFLCN_RPFCM_SHIFT 2 +#define I40E_PRTDCB_MFLCN_RPFCM_MASK I40E_MASK(0x1, I40E_PRTDCB_MFLCN_RPFCM_SHIFT) +#define I40E_PRTDCB_MFLCN_RFCE_SHIFT 3 +#define I40E_PRTDCB_MFLCN_RFCE_MASK I40E_MASK(0x1, I40E_PRTDCB_MFLCN_RFCE_SHIFT) +#define I40E_PRTDCB_MFLCN_RPFCE_SHIFT 4 +#define I40E_PRTDCB_MFLCN_RPFCE_MASK I40E_MASK(0xFF, I40E_PRTDCB_MFLCN_RPFCE_SHIFT) +#define I40E_PRTDCB_RETSC 0x001223E0 /* Reset: CORER */ +#define I40E_PRTDCB_RETSC_ETS_MODE_SHIFT 0 +#define I40E_PRTDCB_RETSC_ETS_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_RETSC_ETS_MODE_SHIFT) +#define I40E_PRTDCB_RETSC_NON_ETS_MODE_SHIFT 1 +#define I40E_PRTDCB_RETSC_NON_ETS_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_RETSC_NON_ETS_MODE_SHIFT) +#define I40E_PRTDCB_RETSC_ETS_MAX_EXP_SHIFT 2 +#define I40E_PRTDCB_RETSC_ETS_MAX_EXP_MASK I40E_MASK(0xF, I40E_PRTDCB_RETSC_ETS_MAX_EXP_SHIFT) +#define I40E_PRTDCB_RETSC_LLTC_SHIFT 8 +#define I40E_PRTDCB_RETSC_LLTC_MASK I40E_MASK(0xFF, I40E_PRTDCB_RETSC_LLTC_SHIFT) +#define I40E_PRTDCB_RETSTCC(_i) (0x00122180 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTDCB_RETSTCC_MAX_INDEX 7 +#define I40E_PRTDCB_RETSTCC_BWSHARE_SHIFT 0 +#define I40E_PRTDCB_RETSTCC_BWSHARE_MASK I40E_MASK(0x7F, I40E_PRTDCB_RETSTCC_BWSHARE_SHIFT) +#define I40E_PRTDCB_RETSTCC_UPINTC_MODE_SHIFT 30 +#define I40E_PRTDCB_RETSTCC_UPINTC_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_RETSTCC_UPINTC_MODE_SHIFT) +#define I40E_PRTDCB_RETSTCC_ETSTC_SHIFT 31 +#define I40E_PRTDCB_RETSTCC_ETSTC_MASK I40E_MASK(0x1, I40E_PRTDCB_RETSTCC_ETSTC_SHIFT) +#define I40E_PRTDCB_RPPMC 0x001223A0 /* Reset: CORER */ +#define I40E_PRTDCB_RPPMC_LANRPPM_SHIFT 0 +#define I40E_PRTDCB_RPPMC_LANRPPM_MASK I40E_MASK(0xFF, I40E_PRTDCB_RPPMC_LANRPPM_SHIFT) +#define I40E_PRTDCB_RPPMC_RDMARPPM_SHIFT 8 +#define I40E_PRTDCB_RPPMC_RDMARPPM_MASK I40E_MASK(0xFF, I40E_PRTDCB_RPPMC_RDMARPPM_SHIFT) +#define I40E_PRTDCB_RPPMC_RX_FIFO_SIZE_SHIFT 16 +#define I40E_PRTDCB_RPPMC_RX_FIFO_SIZE_MASK I40E_MASK(0xFF, I40E_PRTDCB_RPPMC_RX_FIFO_SIZE_SHIFT) +#define I40E_PRTDCB_RUP 0x001C0B00 /* Reset: CORER */ +#define I40E_PRTDCB_RUP_NOVLANUP_SHIFT 0 +#define I40E_PRTDCB_RUP_NOVLANUP_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP_NOVLANUP_SHIFT) +#define I40E_PRTDCB_RUP2TC 0x001C09A0 /* Reset: CORER */ +#define I40E_PRTDCB_RUP2TC_UP0TC_SHIFT 0 +#define I40E_PRTDCB_RUP2TC_UP0TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP0TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP1TC_SHIFT 3 +#define I40E_PRTDCB_RUP2TC_UP1TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP1TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP2TC_SHIFT 6 +#define I40E_PRTDCB_RUP2TC_UP2TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP2TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP3TC_SHIFT 9 
+#define I40E_PRTDCB_RUP2TC_UP3TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP3TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP4TC_SHIFT 12 +#define I40E_PRTDCB_RUP2TC_UP4TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP4TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP5TC_SHIFT 15 +#define I40E_PRTDCB_RUP2TC_UP5TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP5TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP6TC_SHIFT 18 +#define I40E_PRTDCB_RUP2TC_UP6TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP6TC_SHIFT) +#define I40E_PRTDCB_RUP2TC_UP7TC_SHIFT 21 +#define I40E_PRTDCB_RUP2TC_UP7TC_MASK I40E_MASK(0x7, I40E_PRTDCB_RUP2TC_UP7TC_SHIFT) +#define I40E_PRTDCB_RUPTQ(_i) (0x00122400 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTDCB_RUPTQ_MAX_INDEX 7 +#define I40E_PRTDCB_RUPTQ_RXQNUM_SHIFT 0 +#define I40E_PRTDCB_RUPTQ_RXQNUM_MASK I40E_MASK(0x3FFF, I40E_PRTDCB_RUPTQ_RXQNUM_SHIFT) +#define I40E_PRTDCB_TC2PFC 0x001C0980 /* Reset: CORER */ +#define I40E_PRTDCB_TC2PFC_TC2PFC_SHIFT 0 +#define I40E_PRTDCB_TC2PFC_TC2PFC_MASK I40E_MASK(0xFF, I40E_PRTDCB_TC2PFC_TC2PFC_SHIFT) +#define I40E_PRTDCB_TCMSTC(_i) (0x000A0040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTDCB_TCMSTC_MAX_INDEX 7 +#define I40E_PRTDCB_TCMSTC_MSTC_SHIFT 0 +#define I40E_PRTDCB_TCMSTC_MSTC_MASK I40E_MASK(0xFFFFF, I40E_PRTDCB_TCMSTC_MSTC_SHIFT) +#define I40E_PRTDCB_TCPMC 0x000A21A0 /* Reset: CORER */ +#define I40E_PRTDCB_TCPMC_CPM_SHIFT 0 +#define I40E_PRTDCB_TCPMC_CPM_MASK I40E_MASK(0x1FFF, I40E_PRTDCB_TCPMC_CPM_SHIFT) +#define I40E_PRTDCB_TCPMC_LLTC_SHIFT 13 +#define I40E_PRTDCB_TCPMC_LLTC_MASK I40E_MASK(0xFF, I40E_PRTDCB_TCPMC_LLTC_SHIFT) +#define I40E_PRTDCB_TCPMC_TCPM_MODE_SHIFT 30 +#define I40E_PRTDCB_TCPMC_TCPM_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_TCPMC_TCPM_MODE_SHIFT) +#define I40E_PRTDCB_TCWSTC(_i) (0x000A2040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTDCB_TCWSTC_MAX_INDEX 7 +#define I40E_PRTDCB_TCWSTC_MSTC_SHIFT 0 +#define I40E_PRTDCB_TCWSTC_MSTC_MASK I40E_MASK(0xFFFFF, I40E_PRTDCB_TCWSTC_MSTC_SHIFT) +#define I40E_PRTDCB_TDPMC 0x000A0180 /* Reset: CORER */ +#define I40E_PRTDCB_TDPMC_DPM_SHIFT 0 +#define I40E_PRTDCB_TDPMC_DPM_MASK I40E_MASK(0xFF, I40E_PRTDCB_TDPMC_DPM_SHIFT) +#define I40E_PRTDCB_TDPMC_TCPM_MODE_SHIFT 30 +#define I40E_PRTDCB_TDPMC_TCPM_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_TDPMC_TCPM_MODE_SHIFT) +#define I40E_PRTDCB_TETSC_TCB 0x000AE060 /* Reset: CORER */ +#define I40E_PRTDCB_TETSC_TCB_EN_LL_STRICT_PRIORITY_SHIFT 0 +#define I40E_PRTDCB_TETSC_TCB_EN_LL_STRICT_PRIORITY_MASK I40E_MASK(0x1, I40E_PRTDCB_TETSC_TCB_EN_LL_STRICT_PRIORITY_SHIFT) +#define I40E_PRTDCB_TETSC_TCB_LLTC_SHIFT 8 +#define I40E_PRTDCB_TETSC_TCB_LLTC_MASK I40E_MASK(0xFF, I40E_PRTDCB_TETSC_TCB_LLTC_SHIFT) +#define I40E_PRTDCB_TETSC_TPB 0x00098060 /* Reset: CORER */ +#define I40E_PRTDCB_TETSC_TPB_EN_LL_STRICT_PRIORITY_SHIFT 0 +#define I40E_PRTDCB_TETSC_TPB_EN_LL_STRICT_PRIORITY_MASK I40E_MASK(0x1, I40E_PRTDCB_TETSC_TPB_EN_LL_STRICT_PRIORITY_SHIFT) +#define I40E_PRTDCB_TETSC_TPB_LLTC_SHIFT 8 +#define I40E_PRTDCB_TETSC_TPB_LLTC_MASK I40E_MASK(0xFF, I40E_PRTDCB_TETSC_TPB_LLTC_SHIFT) +#define I40E_PRTDCB_TFCS 0x001E4560 /* Reset: GLOBR */ +#define I40E_PRTDCB_TFCS_TXOFF_SHIFT 0 +#define I40E_PRTDCB_TFCS_TXOFF_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF0_SHIFT 8 +#define I40E_PRTDCB_TFCS_TXOFF0_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF0_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF1_SHIFT 9 +#define I40E_PRTDCB_TFCS_TXOFF1_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF1_SHIFT) +#define 
I40E_PRTDCB_TFCS_TXOFF2_SHIFT 10 +#define I40E_PRTDCB_TFCS_TXOFF2_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF2_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF3_SHIFT 11 +#define I40E_PRTDCB_TFCS_TXOFF3_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF3_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF4_SHIFT 12 +#define I40E_PRTDCB_TFCS_TXOFF4_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF4_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF5_SHIFT 13 +#define I40E_PRTDCB_TFCS_TXOFF5_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF5_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF6_SHIFT 14 +#define I40E_PRTDCB_TFCS_TXOFF6_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF6_SHIFT) +#define I40E_PRTDCB_TFCS_TXOFF7_SHIFT 15 +#define I40E_PRTDCB_TFCS_TXOFF7_MASK I40E_MASK(0x1, I40E_PRTDCB_TFCS_TXOFF7_SHIFT) +#define I40E_PRTDCB_TPFCTS(_i) (0x001E4660 + ((_i) * 32)) /* _i=0...7 */ /* Reset: GLOBR */ +#define I40E_PRTDCB_TPFCTS_MAX_INDEX 7 +#define I40E_PRTDCB_TPFCTS_PFCTIMER_SHIFT 0 +#define I40E_PRTDCB_TPFCTS_PFCTIMER_MASK I40E_MASK(0x3FFF, I40E_PRTDCB_TPFCTS_PFCTIMER_SHIFT) +#define I40E_GLFCOE_RCTL 0x00269B94 /* Reset: CORER */ +#define I40E_GLFCOE_RCTL_FCOEVER_SHIFT 0 +#define I40E_GLFCOE_RCTL_FCOEVER_MASK I40E_MASK(0xF, I40E_GLFCOE_RCTL_FCOEVER_SHIFT) +#define I40E_GLFCOE_RCTL_SAVBAD_SHIFT 4 +#define I40E_GLFCOE_RCTL_SAVBAD_MASK I40E_MASK(0x1, I40E_GLFCOE_RCTL_SAVBAD_SHIFT) +#define I40E_GLFCOE_RCTL_ICRC_SHIFT 5 +#define I40E_GLFCOE_RCTL_ICRC_MASK I40E_MASK(0x1, I40E_GLFCOE_RCTL_ICRC_SHIFT) +#define I40E_GLFCOE_RCTL_MAX_SIZE_SHIFT 16 +#define I40E_GLFCOE_RCTL_MAX_SIZE_MASK I40E_MASK(0x3FFF, I40E_GLFCOE_RCTL_MAX_SIZE_SHIFT) +#define I40E_GL_FWSTS 0x00083048 /* Reset: POR */ +#define I40E_GL_FWSTS_FWS0B_SHIFT 0 +#define I40E_GL_FWSTS_FWS0B_MASK I40E_MASK(0xFF, I40E_GL_FWSTS_FWS0B_SHIFT) +#define I40E_GL_FWSTS_FWRI_SHIFT 9 +#define I40E_GL_FWSTS_FWRI_MASK I40E_MASK(0x1, I40E_GL_FWSTS_FWRI_SHIFT) +#define I40E_GL_FWSTS_FWS1B_SHIFT 16 +#define I40E_GL_FWSTS_FWS1B_MASK I40E_MASK(0xFF, I40E_GL_FWSTS_FWS1B_SHIFT) +#define I40E_GLGEN_CLKSTAT 0x000B8184 /* Reset: POR */ +#define I40E_GLGEN_CLKSTAT_CLKMODE_SHIFT 0 +#define I40E_GLGEN_CLKSTAT_CLKMODE_MASK I40E_MASK(0x1, I40E_GLGEN_CLKSTAT_CLKMODE_SHIFT) +#define I40E_GLGEN_CLKSTAT_U_CLK_SPEED_SHIFT 4 +#define I40E_GLGEN_CLKSTAT_U_CLK_SPEED_MASK I40E_MASK(0x3, I40E_GLGEN_CLKSTAT_U_CLK_SPEED_SHIFT) +#define I40E_GLGEN_CLKSTAT_P0_CLK_SPEED_SHIFT 8 +#define I40E_GLGEN_CLKSTAT_P0_CLK_SPEED_MASK I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P0_CLK_SPEED_SHIFT) +#define I40E_GLGEN_CLKSTAT_P1_CLK_SPEED_SHIFT 12 +#define I40E_GLGEN_CLKSTAT_P1_CLK_SPEED_MASK I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P1_CLK_SPEED_SHIFT) +#define I40E_GLGEN_CLKSTAT_P2_CLK_SPEED_SHIFT 16 +#define I40E_GLGEN_CLKSTAT_P2_CLK_SPEED_MASK I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P2_CLK_SPEED_SHIFT) +#define I40E_GLGEN_CLKSTAT_P3_CLK_SPEED_SHIFT 20 +#define I40E_GLGEN_CLKSTAT_P3_CLK_SPEED_MASK I40E_MASK(0x7, I40E_GLGEN_CLKSTAT_P3_CLK_SPEED_SHIFT) +#define I40E_GLGEN_GPIO_CTL(_i) (0x00088100 + ((_i) * 4)) /* _i=0...29 */ /* Reset: POR */ +#define I40E_GLGEN_GPIO_CTL_MAX_INDEX 29 +#define I40E_GLGEN_GPIO_CTL_PRT_NUM_SHIFT 0 +#define I40E_GLGEN_GPIO_CTL_PRT_NUM_MASK I40E_MASK(0x3, I40E_GLGEN_GPIO_CTL_PRT_NUM_SHIFT) +#define I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_SHIFT 3 +#define I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_PRT_NUM_NA_SHIFT) +#define I40E_GLGEN_GPIO_CTL_PIN_DIR_SHIFT 4 +#define I40E_GLGEN_GPIO_CTL_PIN_DIR_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_PIN_DIR_SHIFT) +#define I40E_GLGEN_GPIO_CTL_TRI_CTL_SHIFT 5 +#define I40E_GLGEN_GPIO_CTL_TRI_CTL_MASK 
I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_TRI_CTL_SHIFT) +#define I40E_GLGEN_GPIO_CTL_OUT_CTL_SHIFT 6 +#define I40E_GLGEN_GPIO_CTL_OUT_CTL_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_OUT_CTL_SHIFT) +#define I40E_GLGEN_GPIO_CTL_PIN_FUNC_SHIFT 7 +#define I40E_GLGEN_GPIO_CTL_PIN_FUNC_MASK I40E_MASK(0x7, I40E_GLGEN_GPIO_CTL_PIN_FUNC_SHIFT) +#define I40E_GLGEN_GPIO_CTL_LED_INVRT_SHIFT 10 +#define I40E_GLGEN_GPIO_CTL_LED_INVRT_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_LED_INVRT_SHIFT) +#define I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT 11 +#define I40E_GLGEN_GPIO_CTL_LED_BLINK_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_LED_BLINK_SHIFT) +#define I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT 12 +#define I40E_GLGEN_GPIO_CTL_LED_MODE_MASK I40E_MASK(0x1F, I40E_GLGEN_GPIO_CTL_LED_MODE_SHIFT) +#define I40E_GLGEN_GPIO_CTL_INT_MODE_SHIFT 17 +#define I40E_GLGEN_GPIO_CTL_INT_MODE_MASK I40E_MASK(0x3, I40E_GLGEN_GPIO_CTL_INT_MODE_SHIFT) +#define I40E_GLGEN_GPIO_CTL_OUT_DEFAULT_SHIFT 19 +#define I40E_GLGEN_GPIO_CTL_OUT_DEFAULT_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_CTL_OUT_DEFAULT_SHIFT) +#define I40E_GLGEN_GPIO_CTL_PHY_PIN_NAME_SHIFT 20 +#define I40E_GLGEN_GPIO_CTL_PHY_PIN_NAME_MASK I40E_MASK(0x3F, I40E_GLGEN_GPIO_CTL_PHY_PIN_NAME_SHIFT) +#define I40E_GLGEN_GPIO_CTL_PRT_BIT_MAP_SHIFT 26 +#define I40E_GLGEN_GPIO_CTL_PRT_BIT_MAP_MASK I40E_MASK(0xF, I40E_GLGEN_GPIO_CTL_PRT_BIT_MAP_SHIFT) +#define I40E_GLGEN_GPIO_SET 0x00088184 /* Reset: POR */ +#define I40E_GLGEN_GPIO_SET_GPIO_INDX_SHIFT 0 +#define I40E_GLGEN_GPIO_SET_GPIO_INDX_MASK I40E_MASK(0x1F, I40E_GLGEN_GPIO_SET_GPIO_INDX_SHIFT) +#define I40E_GLGEN_GPIO_SET_SDP_DATA_SHIFT 5 +#define I40E_GLGEN_GPIO_SET_SDP_DATA_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_SET_SDP_DATA_SHIFT) +#define I40E_GLGEN_GPIO_SET_DRIVE_SDP_SHIFT 6 +#define I40E_GLGEN_GPIO_SET_DRIVE_SDP_MASK I40E_MASK(0x1, I40E_GLGEN_GPIO_SET_DRIVE_SDP_SHIFT) +#define I40E_GLGEN_GPIO_STAT 0x0008817C /* Reset: POR */ +#define I40E_GLGEN_GPIO_STAT_GPIO_VALUE_SHIFT 0 +#define I40E_GLGEN_GPIO_STAT_GPIO_VALUE_MASK I40E_MASK(0x3FFFFFFF, I40E_GLGEN_GPIO_STAT_GPIO_VALUE_SHIFT) +#define I40E_GLGEN_GPIO_TRANSIT 0x00088180 /* Reset: POR */ +#define I40E_GLGEN_GPIO_TRANSIT_GPIO_TRANSITION_SHIFT 0 +#define I40E_GLGEN_GPIO_TRANSIT_GPIO_TRANSITION_MASK I40E_MASK(0x3FFFFFFF, I40E_GLGEN_GPIO_TRANSIT_GPIO_TRANSITION_SHIFT) +#define I40E_GLGEN_I2CCMD(_i) (0x000881E0 + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_I2CCMD_MAX_INDEX 3 +#define I40E_GLGEN_I2CCMD_DATA_SHIFT 0 +#define I40E_GLGEN_I2CCMD_DATA_MASK I40E_MASK(0xFFFF, I40E_GLGEN_I2CCMD_DATA_SHIFT) +#define I40E_GLGEN_I2CCMD_REGADD_SHIFT 16 +#define I40E_GLGEN_I2CCMD_REGADD_MASK I40E_MASK(0xFF, I40E_GLGEN_I2CCMD_REGADD_SHIFT) +#define I40E_GLGEN_I2CCMD_PHYADD_SHIFT 24 +#define I40E_GLGEN_I2CCMD_PHYADD_MASK I40E_MASK(0x7, I40E_GLGEN_I2CCMD_PHYADD_SHIFT) +#define I40E_GLGEN_I2CCMD_OP_SHIFT 27 +#define I40E_GLGEN_I2CCMD_OP_MASK I40E_MASK(0x1, I40E_GLGEN_I2CCMD_OP_SHIFT) +#define I40E_GLGEN_I2CCMD_RESET_SHIFT 28 +#define I40E_GLGEN_I2CCMD_RESET_MASK I40E_MASK(0x1, I40E_GLGEN_I2CCMD_RESET_SHIFT) +#define I40E_GLGEN_I2CCMD_R_SHIFT 29 +#define I40E_GLGEN_I2CCMD_R_MASK I40E_MASK(0x1, I40E_GLGEN_I2CCMD_R_SHIFT) +#define I40E_GLGEN_I2CCMD_E_SHIFT 31 +#define I40E_GLGEN_I2CCMD_E_MASK I40E_MASK(0x1, I40E_GLGEN_I2CCMD_E_SHIFT) +#define I40E_GLGEN_I2CPARAMS(_i) (0x000881AC + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_I2CPARAMS_MAX_INDEX 3 +#define I40E_GLGEN_I2CPARAMS_WRITE_TIME_SHIFT 0 +#define I40E_GLGEN_I2CPARAMS_WRITE_TIME_MASK I40E_MASK(0x1F, 
I40E_GLGEN_I2CPARAMS_WRITE_TIME_SHIFT) +#define I40E_GLGEN_I2CPARAMS_READ_TIME_SHIFT 5 +#define I40E_GLGEN_I2CPARAMS_READ_TIME_MASK I40E_MASK(0x7, I40E_GLGEN_I2CPARAMS_READ_TIME_SHIFT) +#define I40E_GLGEN_I2CPARAMS_I2CBB_EN_SHIFT 8 +#define I40E_GLGEN_I2CPARAMS_I2CBB_EN_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_I2CBB_EN_SHIFT) +#define I40E_GLGEN_I2CPARAMS_CLK_SHIFT 9 +#define I40E_GLGEN_I2CPARAMS_CLK_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_SHIFT) +#define I40E_GLGEN_I2CPARAMS_DATA_OUT_SHIFT 10 +#define I40E_GLGEN_I2CPARAMS_DATA_OUT_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_DATA_OUT_SHIFT) +#define I40E_GLGEN_I2CPARAMS_DATA_OE_N_SHIFT 11 +#define I40E_GLGEN_I2CPARAMS_DATA_OE_N_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_DATA_OE_N_SHIFT) +#define I40E_GLGEN_I2CPARAMS_DATA_IN_SHIFT 12 +#define I40E_GLGEN_I2CPARAMS_DATA_IN_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_DATA_IN_SHIFT) +#define I40E_GLGEN_I2CPARAMS_CLK_OE_N_SHIFT 13 +#define I40E_GLGEN_I2CPARAMS_CLK_OE_N_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_OE_N_SHIFT) +#define I40E_GLGEN_I2CPARAMS_CLK_IN_SHIFT 14 +#define I40E_GLGEN_I2CPARAMS_CLK_IN_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_IN_SHIFT) +#define I40E_GLGEN_I2CPARAMS_CLK_STRETCH_DIS_SHIFT 15 +#define I40E_GLGEN_I2CPARAMS_CLK_STRETCH_DIS_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_CLK_STRETCH_DIS_SHIFT) +#define I40E_GLGEN_I2CPARAMS_I2C_DATA_ORDER_SHIFT 31 +#define I40E_GLGEN_I2CPARAMS_I2C_DATA_ORDER_MASK I40E_MASK(0x1, I40E_GLGEN_I2CPARAMS_I2C_DATA_ORDER_SHIFT) +#define I40E_GLGEN_LED_CTL 0x00088178 /* Reset: POR */ +#define I40E_GLGEN_LED_CTL_GLOBAL_BLINK_MODE_SHIFT 0 +#define I40E_GLGEN_LED_CTL_GLOBAL_BLINK_MODE_MASK I40E_MASK(0x1, I40E_GLGEN_LED_CTL_GLOBAL_BLINK_MODE_SHIFT) +#define I40E_GLGEN_MDIO_CTRL(_i) (0x000881D0 + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_MDIO_CTRL_MAX_INDEX 3 +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD2_SHIFT 0 +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD2_MASK I40E_MASK(0x1FFFF, I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD2_SHIFT) +#define I40E_GLGEN_MDIO_CTRL_CONTMDC_SHIFT 17 +#define I40E_GLGEN_MDIO_CTRL_CONTMDC_MASK I40E_MASK(0x1, I40E_GLGEN_MDIO_CTRL_CONTMDC_SHIFT) +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD1_SHIFT 18 +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD1_MASK I40E_MASK(0x7FF, I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD1_SHIFT) +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD0_SHIFT 29 +#define I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD0_MASK I40E_MASK(0x7, I40E_GLGEN_MDIO_CTRL_LEGACY_RSVD0_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL(_i) (0x000881C0 + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_MDIO_I2C_SEL_MAX_INDEX 3 +#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_I2C_SEL_SHIFT 0 +#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_I2C_SEL_MASK I40E_MASK(0x1, I40E_GLGEN_MDIO_I2C_SEL_MDIO_I2C_SEL_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_PHY_PORT_NUM_SHIFT 1 +#define I40E_GLGEN_MDIO_I2C_SEL_PHY_PORT_NUM_MASK I40E_MASK(0xF, I40E_GLGEN_MDIO_I2C_SEL_PHY_PORT_NUM_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_PHY0_ADDRESS_SHIFT 5 +#define I40E_GLGEN_MDIO_I2C_SEL_PHY0_ADDRESS_MASK I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY0_ADDRESS_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_PHY1_ADDRESS_SHIFT 10 +#define I40E_GLGEN_MDIO_I2C_SEL_PHY1_ADDRESS_MASK I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY1_ADDRESS_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_PHY2_ADDRESS_SHIFT 15 +#define I40E_GLGEN_MDIO_I2C_SEL_PHY2_ADDRESS_MASK I40E_MASK(0x1F, I40E_GLGEN_MDIO_I2C_SEL_PHY2_ADDRESS_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_PHY3_ADDRESS_SHIFT 20 +#define I40E_GLGEN_MDIO_I2C_SEL_PHY3_ADDRESS_MASK I40E_MASK(0x1F, 
I40E_GLGEN_MDIO_I2C_SEL_PHY3_ADDRESS_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_IF_MODE_SHIFT 25 +#define I40E_GLGEN_MDIO_I2C_SEL_MDIO_IF_MODE_MASK I40E_MASK(0xF, I40E_GLGEN_MDIO_I2C_SEL_MDIO_IF_MODE_SHIFT) +#define I40E_GLGEN_MDIO_I2C_SEL_EN_FAST_MODE_SHIFT 31 +#define I40E_GLGEN_MDIO_I2C_SEL_EN_FAST_MODE_MASK I40E_MASK(0x1, I40E_GLGEN_MDIO_I2C_SEL_EN_FAST_MODE_SHIFT) +#define I40E_GLGEN_MSCA(_i) (0x0008818C + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_MSCA_MAX_INDEX 3 +#define I40E_GLGEN_MSCA_MDIADD_SHIFT 0 +#define I40E_GLGEN_MSCA_MDIADD_MASK I40E_MASK(0xFFFF, I40E_GLGEN_MSCA_MDIADD_SHIFT) +#define I40E_GLGEN_MSCA_DEVADD_SHIFT 16 +#define I40E_GLGEN_MSCA_DEVADD_MASK I40E_MASK(0x1F, I40E_GLGEN_MSCA_DEVADD_SHIFT) +#define I40E_GLGEN_MSCA_PHYADD_SHIFT 21 +#define I40E_GLGEN_MSCA_PHYADD_MASK I40E_MASK(0x1F, I40E_GLGEN_MSCA_PHYADD_SHIFT) +#define I40E_GLGEN_MSCA_OPCODE_SHIFT 26 +#define I40E_GLGEN_MSCA_OPCODE_MASK I40E_MASK(0x3, I40E_GLGEN_MSCA_OPCODE_SHIFT) +#define I40E_GLGEN_MSCA_STCODE_SHIFT 28 +#define I40E_GLGEN_MSCA_STCODE_MASK I40E_MASK(0x3, I40E_GLGEN_MSCA_STCODE_SHIFT) +#define I40E_GLGEN_MSCA_MDICMD_SHIFT 30 +#define I40E_GLGEN_MSCA_MDICMD_MASK I40E_MASK(0x1, I40E_GLGEN_MSCA_MDICMD_SHIFT) +#define I40E_GLGEN_MSCA_MDIINPROGEN_SHIFT 31 +#define I40E_GLGEN_MSCA_MDIINPROGEN_MASK I40E_MASK(0x1, I40E_GLGEN_MSCA_MDIINPROGEN_SHIFT) +#define I40E_GLGEN_MSRWD(_i) (0x0008819C + ((_i) * 4)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_GLGEN_MSRWD_MAX_INDEX 3 +#define I40E_GLGEN_MSRWD_MDIWRDATA_SHIFT 0 +#define I40E_GLGEN_MSRWD_MDIWRDATA_MASK I40E_MASK(0xFFFF, I40E_GLGEN_MSRWD_MDIWRDATA_SHIFT) +#define I40E_GLGEN_MSRWD_MDIRDDATA_SHIFT 16 +#define I40E_GLGEN_MSRWD_MDIRDDATA_MASK I40E_MASK(0xFFFF, I40E_GLGEN_MSRWD_MDIRDDATA_SHIFT) +#define I40E_GLGEN_PCIFCNCNT 0x001C0AB4 /* Reset: PCIR */ +#define I40E_GLGEN_PCIFCNCNT_PCIPFCNT_SHIFT 0 +#define I40E_GLGEN_PCIFCNCNT_PCIPFCNT_MASK I40E_MASK(0x1F, I40E_GLGEN_PCIFCNCNT_PCIPFCNT_SHIFT) +#define I40E_GLGEN_PCIFCNCNT_PCIVFCNT_SHIFT 16 +#define I40E_GLGEN_PCIFCNCNT_PCIVFCNT_MASK I40E_MASK(0xFF, I40E_GLGEN_PCIFCNCNT_PCIVFCNT_SHIFT) +#define I40E_GLGEN_RSTAT 0x000B8188 /* Reset: POR */ +#define I40E_GLGEN_RSTAT_DEVSTATE_SHIFT 0 +#define I40E_GLGEN_RSTAT_DEVSTATE_MASK I40E_MASK(0x3, I40E_GLGEN_RSTAT_DEVSTATE_SHIFT) +#define I40E_GLGEN_RSTAT_RESET_TYPE_SHIFT 2 +#define I40E_GLGEN_RSTAT_RESET_TYPE_MASK I40E_MASK(0x3, I40E_GLGEN_RSTAT_RESET_TYPE_SHIFT) +#define I40E_GLGEN_RSTAT_CORERCNT_SHIFT 4 +#define I40E_GLGEN_RSTAT_CORERCNT_MASK I40E_MASK(0x3, I40E_GLGEN_RSTAT_CORERCNT_SHIFT) +#define I40E_GLGEN_RSTAT_GLOBRCNT_SHIFT 6 +#define I40E_GLGEN_RSTAT_GLOBRCNT_MASK I40E_MASK(0x3, I40E_GLGEN_RSTAT_GLOBRCNT_SHIFT) +#define I40E_GLGEN_RSTAT_EMPRCNT_SHIFT 8 +#define I40E_GLGEN_RSTAT_EMPRCNT_MASK I40E_MASK(0x3, I40E_GLGEN_RSTAT_EMPRCNT_SHIFT) +#define I40E_GLGEN_RSTAT_TIME_TO_RST_SHIFT 10 +#define I40E_GLGEN_RSTAT_TIME_TO_RST_MASK I40E_MASK(0x3F, I40E_GLGEN_RSTAT_TIME_TO_RST_SHIFT) +#define I40E_GLGEN_RSTCTL 0x000B8180 /* Reset: POR */ +#define I40E_GLGEN_RSTCTL_GRSTDEL_SHIFT 0 +#define I40E_GLGEN_RSTCTL_GRSTDEL_MASK I40E_MASK(0x3F, I40E_GLGEN_RSTCTL_GRSTDEL_SHIFT) +#define I40E_GLGEN_RSTCTL_ECC_RST_ENA_SHIFT 8 +#define I40E_GLGEN_RSTCTL_ECC_RST_ENA_MASK I40E_MASK(0x1, I40E_GLGEN_RSTCTL_ECC_RST_ENA_SHIFT) +#define I40E_GLGEN_RTRIG 0x000B8190 /* Reset: CORER */ +#define I40E_GLGEN_RTRIG_CORER_SHIFT 0 +#define I40E_GLGEN_RTRIG_CORER_MASK I40E_MASK(0x1, I40E_GLGEN_RTRIG_CORER_SHIFT) +#define I40E_GLGEN_RTRIG_GLOBR_SHIFT 1 +#define 
I40E_GLGEN_RTRIG_GLOBR_MASK I40E_MASK(0x1, I40E_GLGEN_RTRIG_GLOBR_SHIFT) +#define I40E_GLGEN_RTRIG_EMPFWR_SHIFT 2 +#define I40E_GLGEN_RTRIG_EMPFWR_MASK I40E_MASK(0x1, I40E_GLGEN_RTRIG_EMPFWR_SHIFT) +#define I40E_GLGEN_STAT 0x000B612C /* Reset: POR */ +#define I40E_GLGEN_STAT_HWRSVD0_SHIFT 0 +#define I40E_GLGEN_STAT_HWRSVD0_MASK I40E_MASK(0x3, I40E_GLGEN_STAT_HWRSVD0_SHIFT) +#define I40E_GLGEN_STAT_DCBEN_SHIFT 2 +#define I40E_GLGEN_STAT_DCBEN_MASK I40E_MASK(0x1, I40E_GLGEN_STAT_DCBEN_SHIFT) +#define I40E_GLGEN_STAT_VTEN_SHIFT 3 +#define I40E_GLGEN_STAT_VTEN_MASK I40E_MASK(0x1, I40E_GLGEN_STAT_VTEN_SHIFT) +#define I40E_GLGEN_STAT_FCOEN_SHIFT 4 +#define I40E_GLGEN_STAT_FCOEN_MASK I40E_MASK(0x1, I40E_GLGEN_STAT_FCOEN_SHIFT) +#define I40E_GLGEN_STAT_EVBEN_SHIFT 5 +#define I40E_GLGEN_STAT_EVBEN_MASK I40E_MASK(0x1, I40E_GLGEN_STAT_EVBEN_SHIFT) +#define I40E_GLGEN_STAT_HWRSVD1_SHIFT 6 +#define I40E_GLGEN_STAT_HWRSVD1_MASK I40E_MASK(0x3, I40E_GLGEN_STAT_HWRSVD1_SHIFT) +#define I40E_GLGEN_VFLRSTAT(_i) (0x00092600 + ((_i) * 4)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLGEN_VFLRSTAT_MAX_INDEX 3 +#define I40E_GLGEN_VFLRSTAT_VFLRE_SHIFT 0 +#define I40E_GLGEN_VFLRSTAT_VFLRE_MASK I40E_MASK(0xFFFFFFFF, I40E_GLGEN_VFLRSTAT_VFLRE_SHIFT) +#define I40E_GLVFGEN_TIMER 0x000881BC /* Reset: CORER */ +#define I40E_GLVFGEN_TIMER_GTIME_SHIFT 0 +#define I40E_GLVFGEN_TIMER_GTIME_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVFGEN_TIMER_GTIME_SHIFT) +#define I40E_PFGEN_CTRL 0x00092400 /* Reset: PFR */ +#define I40E_PFGEN_CTRL_PFSWR_SHIFT 0 +#define I40E_PFGEN_CTRL_PFSWR_MASK I40E_MASK(0x1, I40E_PFGEN_CTRL_PFSWR_SHIFT) +#define I40E_PFGEN_DRUN 0x00092500 /* Reset: CORER */ +#define I40E_PFGEN_DRUN_DRVUNLD_SHIFT 0 +#define I40E_PFGEN_DRUN_DRVUNLD_MASK I40E_MASK(0x1, I40E_PFGEN_DRUN_DRVUNLD_SHIFT) +#define I40E_PFGEN_PORTNUM 0x001C0480 /* Reset: CORER */ +#define I40E_PFGEN_PORTNUM_PORT_NUM_SHIFT 0 +#define I40E_PFGEN_PORTNUM_PORT_NUM_MASK I40E_MASK(0x3, I40E_PFGEN_PORTNUM_PORT_NUM_SHIFT) +#define I40E_PFGEN_STATE 0x00088000 /* Reset: CORER */ +#define I40E_PFGEN_STATE_RESERVED_0_SHIFT 0 +#define I40E_PFGEN_STATE_RESERVED_0_MASK I40E_MASK(0x1, I40E_PFGEN_STATE_RESERVED_0_SHIFT) +#define I40E_PFGEN_STATE_PFFCEN_SHIFT 1 +#define I40E_PFGEN_STATE_PFFCEN_MASK I40E_MASK(0x1, I40E_PFGEN_STATE_PFFCEN_SHIFT) +#define I40E_PFGEN_STATE_PFLINKEN_SHIFT 2 +#define I40E_PFGEN_STATE_PFLINKEN_MASK I40E_MASK(0x1, I40E_PFGEN_STATE_PFLINKEN_SHIFT) +#define I40E_PFGEN_STATE_PFSCEN_SHIFT 3 +#define I40E_PFGEN_STATE_PFSCEN_MASK I40E_MASK(0x1, I40E_PFGEN_STATE_PFSCEN_SHIFT) +#define I40E_PRTGEN_CNF 0x000B8120 /* Reset: POR */ +#define I40E_PRTGEN_CNF_PORT_DIS_SHIFT 0 +#define I40E_PRTGEN_CNF_PORT_DIS_MASK I40E_MASK(0x1, I40E_PRTGEN_CNF_PORT_DIS_SHIFT) +#define I40E_PRTGEN_CNF_ALLOW_PORT_DIS_SHIFT 1 +#define I40E_PRTGEN_CNF_ALLOW_PORT_DIS_MASK I40E_MASK(0x1, I40E_PRTGEN_CNF_ALLOW_PORT_DIS_SHIFT) +#define I40E_PRTGEN_CNF_EMP_PORT_DIS_SHIFT 2 +#define I40E_PRTGEN_CNF_EMP_PORT_DIS_MASK I40E_MASK(0x1, I40E_PRTGEN_CNF_EMP_PORT_DIS_SHIFT) +#define I40E_PRTGEN_CNF2 0x000B8160 /* Reset: POR */ +#define I40E_PRTGEN_CNF2_ACTIVATE_PORT_LINK_SHIFT 0 +#define I40E_PRTGEN_CNF2_ACTIVATE_PORT_LINK_MASK I40E_MASK(0x1, I40E_PRTGEN_CNF2_ACTIVATE_PORT_LINK_SHIFT) +#define I40E_PRTGEN_STATUS 0x000B8100 /* Reset: POR */ +#define I40E_PRTGEN_STATUS_PORT_VALID_SHIFT 0 +#define I40E_PRTGEN_STATUS_PORT_VALID_MASK I40E_MASK(0x1, I40E_PRTGEN_STATUS_PORT_VALID_SHIFT) +#define I40E_PRTGEN_STATUS_PORT_ACTIVE_SHIFT 1 +#define I40E_PRTGEN_STATUS_PORT_ACTIVE_MASK I40E_MASK(0x1, 
I40E_PRTGEN_STATUS_PORT_ACTIVE_SHIFT) +#define I40E_VFGEN_RSTAT1(_VF) (0x00074400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFGEN_RSTAT1_MAX_INDEX 127 +#define I40E_VFGEN_RSTAT1_VFR_STATE_SHIFT 0 +#define I40E_VFGEN_RSTAT1_VFR_STATE_MASK I40E_MASK(0x3, I40E_VFGEN_RSTAT1_VFR_STATE_SHIFT) +#define I40E_VPGEN_VFRSTAT(_VF) (0x00091C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VPGEN_VFRSTAT_MAX_INDEX 127 +#define I40E_VPGEN_VFRSTAT_VFRD_SHIFT 0 +#define I40E_VPGEN_VFRSTAT_VFRD_MASK I40E_MASK(0x1, I40E_VPGEN_VFRSTAT_VFRD_SHIFT) +#define I40E_VPGEN_VFRTRIG(_VF) (0x00091800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VPGEN_VFRTRIG_MAX_INDEX 127 +#define I40E_VPGEN_VFRTRIG_VFSWR_SHIFT 0 +#define I40E_VPGEN_VFRTRIG_VFSWR_MASK I40E_MASK(0x1, I40E_VPGEN_VFRTRIG_VFSWR_SHIFT) +#define I40E_VSIGEN_RSTAT(_VSI) (0x00090800 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_VSIGEN_RSTAT_MAX_INDEX 383 +#define I40E_VSIGEN_RSTAT_VMRD_SHIFT 0 +#define I40E_VSIGEN_RSTAT_VMRD_MASK I40E_MASK(0x1, I40E_VSIGEN_RSTAT_VMRD_SHIFT) +#define I40E_VSIGEN_RTRIG(_VSI) (0x00090000 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_VSIGEN_RTRIG_MAX_INDEX 383 +#define I40E_VSIGEN_RTRIG_VMSWR_SHIFT 0 +#define I40E_VSIGEN_RTRIG_VMSWR_MASK I40E_MASK(0x1, I40E_VSIGEN_RTRIG_VMSWR_SHIFT) +#define I40E_GLHMC_FCOEDDPBASE(_i) (0x000C6600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FCOEDDPBASE_MAX_INDEX 15 +#define I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_SHIFT 0 +#define I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_FCOEDDPBASE_FPMFCOEDDPBASE_SHIFT) +#define I40E_GLHMC_FCOEDDPCNT(_i) (0x000C6700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FCOEDDPCNT_MAX_INDEX 15 +#define I40E_GLHMC_FCOEDDPCNT_FPMFCOEDDPCNT_SHIFT 0 +#define I40E_GLHMC_FCOEDDPCNT_FPMFCOEDDPCNT_MASK I40E_MASK(0xFFFFF, I40E_GLHMC_FCOEDDPCNT_FPMFCOEDDPCNT_SHIFT) +#define I40E_GLHMC_FCOEDDPOBJSZ 0x000C2010 /* Reset: CORER */ +#define I40E_GLHMC_FCOEDDPOBJSZ_PMFCOEDDPOBJSZ_SHIFT 0 +#define I40E_GLHMC_FCOEDDPOBJSZ_PMFCOEDDPOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_FCOEDDPOBJSZ_PMFCOEDDPOBJSZ_SHIFT) +#define I40E_GLHMC_FCOEFBASE(_i) (0x000C6800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FCOEFBASE_MAX_INDEX 15 +#define I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_SHIFT 0 +#define I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_FCOEFBASE_FPMFCOEFBASE_SHIFT) +#define I40E_GLHMC_FCOEFCNT(_i) (0x000C6900 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FCOEFCNT_MAX_INDEX 15 +#define I40E_GLHMC_FCOEFCNT_FPMFCOEFCNT_SHIFT 0 +#define I40E_GLHMC_FCOEFCNT_FPMFCOEFCNT_MASK I40E_MASK(0x7FFFFF, I40E_GLHMC_FCOEFCNT_FPMFCOEFCNT_SHIFT) +#define I40E_GLHMC_FCOEFMAX 0x000C20D0 /* Reset: CORER */ +#define I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_SHIFT 0 +#define I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_MASK I40E_MASK(0xFFFF, I40E_GLHMC_FCOEFMAX_PMFCOEFMAX_SHIFT) +#define I40E_GLHMC_FCOEFOBJSZ 0x000C2018 /* Reset: CORER */ +#define I40E_GLHMC_FCOEFOBJSZ_PMFCOEFOBJSZ_SHIFT 0 +#define I40E_GLHMC_FCOEFOBJSZ_PMFCOEFOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_FCOEFOBJSZ_PMFCOEFOBJSZ_SHIFT) +#define I40E_GLHMC_FCOEMAX 0x000C2014 /* Reset: CORER */ +#define I40E_GLHMC_FCOEMAX_PMFCOEMAX_SHIFT 0 +#define I40E_GLHMC_FCOEMAX_PMFCOEMAX_MASK I40E_MASK(0x1FFF, I40E_GLHMC_FCOEMAX_PMFCOEMAX_SHIFT) +#define I40E_GLHMC_FSIAVBASE(_i) (0x000C5600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define 
I40E_GLHMC_FSIAVBASE_MAX_INDEX 15 +#define I40E_GLHMC_FSIAVBASE_FPMFSIAVBASE_SHIFT 0 +#define I40E_GLHMC_FSIAVBASE_FPMFSIAVBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_FSIAVBASE_FPMFSIAVBASE_SHIFT) +#define I40E_GLHMC_FSIAVCNT(_i) (0x000C5700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FSIAVCNT_MAX_INDEX 15 +#define I40E_GLHMC_FSIAVCNT_FPMFSIAVCNT_SHIFT 0 +#define I40E_GLHMC_FSIAVCNT_FPMFSIAVCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_FSIAVCNT_FPMFSIAVCNT_SHIFT) +#define I40E_GLHMC_FSIAVCNT_RSVD_SHIFT 29 +#define I40E_GLHMC_FSIAVCNT_RSVD_MASK I40E_MASK(0x7, I40E_GLHMC_FSIAVCNT_RSVD_SHIFT) +#define I40E_GLHMC_FSIAVMAX 0x000C2068 /* Reset: CORER */ +#define I40E_GLHMC_FSIAVMAX_PMFSIAVMAX_SHIFT 0 +#define I40E_GLHMC_FSIAVMAX_PMFSIAVMAX_MASK I40E_MASK(0x1FFFF, I40E_GLHMC_FSIAVMAX_PMFSIAVMAX_SHIFT) +#define I40E_GLHMC_FSIAVOBJSZ 0x000C2064 /* Reset: CORER */ +#define I40E_GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_SHIFT 0 +#define I40E_GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_FSIAVOBJSZ_PMFSIAVOBJSZ_SHIFT) +#define I40E_GLHMC_FSIMCBASE(_i) (0x000C6000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FSIMCBASE_MAX_INDEX 15 +#define I40E_GLHMC_FSIMCBASE_FPMFSIMCBASE_SHIFT 0 +#define I40E_GLHMC_FSIMCBASE_FPMFSIMCBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_FSIMCBASE_FPMFSIMCBASE_SHIFT) +#define I40E_GLHMC_FSIMCCNT(_i) (0x000C6100 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_FSIMCCNT_MAX_INDEX 15 +#define I40E_GLHMC_FSIMCCNT_FPMFSIMCSZ_SHIFT 0 +#define I40E_GLHMC_FSIMCCNT_FPMFSIMCSZ_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_FSIMCCNT_FPMFSIMCSZ_SHIFT) +#define I40E_GLHMC_FSIMCMAX 0x000C2060 /* Reset: CORER */ +#define I40E_GLHMC_FSIMCMAX_PMFSIMCMAX_SHIFT 0 +#define I40E_GLHMC_FSIMCMAX_PMFSIMCMAX_MASK I40E_MASK(0x3FFF, I40E_GLHMC_FSIMCMAX_PMFSIMCMAX_SHIFT) +#define I40E_GLHMC_FSIMCOBJSZ 0x000C205c /* Reset: CORER */ +#define I40E_GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_SHIFT 0 +#define I40E_GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_FSIMCOBJSZ_PMFSIMCOBJSZ_SHIFT) +#define I40E_GLHMC_LANQMAX 0x000C2008 /* Reset: CORER */ +#define I40E_GLHMC_LANQMAX_PMLANQMAX_SHIFT 0 +#define I40E_GLHMC_LANQMAX_PMLANQMAX_MASK I40E_MASK(0x7FF, I40E_GLHMC_LANQMAX_PMLANQMAX_SHIFT) +#define I40E_GLHMC_LANRXBASE(_i) (0x000C6400 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_LANRXBASE_MAX_INDEX 15 +#define I40E_GLHMC_LANRXBASE_FPMLANRXBASE_SHIFT 0 +#define I40E_GLHMC_LANRXBASE_FPMLANRXBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_LANRXBASE_FPMLANRXBASE_SHIFT) +#define I40E_GLHMC_LANRXCNT(_i) (0x000C6500 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_LANRXCNT_MAX_INDEX 15 +#define I40E_GLHMC_LANRXCNT_FPMLANRXCNT_SHIFT 0 +#define I40E_GLHMC_LANRXCNT_FPMLANRXCNT_MASK I40E_MASK(0x7FF, I40E_GLHMC_LANRXCNT_FPMLANRXCNT_SHIFT) +#define I40E_GLHMC_LANRXOBJSZ 0x000C200c /* Reset: CORER */ +#define I40E_GLHMC_LANRXOBJSZ_PMLANRXOBJSZ_SHIFT 0 +#define I40E_GLHMC_LANRXOBJSZ_PMLANRXOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_LANRXOBJSZ_PMLANRXOBJSZ_SHIFT) +#define I40E_GLHMC_LANTXBASE(_i) (0x000C6200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_LANTXBASE_MAX_INDEX 15 +#define I40E_GLHMC_LANTXBASE_FPMLANTXBASE_SHIFT 0 +#define I40E_GLHMC_LANTXBASE_FPMLANTXBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_LANTXBASE_FPMLANTXBASE_SHIFT) +#define I40E_GLHMC_LANTXBASE_RSVD_SHIFT 24 +#define I40E_GLHMC_LANTXBASE_RSVD_MASK I40E_MASK(0xFF, I40E_GLHMC_LANTXBASE_RSVD_SHIFT) +#define I40E_GLHMC_LANTXCNT(_i) (0x000C6300 + ((_i) * 4)) /* 
_i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_LANTXCNT_MAX_INDEX 15 +#define I40E_GLHMC_LANTXCNT_FPMLANTXCNT_SHIFT 0 +#define I40E_GLHMC_LANTXCNT_FPMLANTXCNT_MASK I40E_MASK(0x7FF, I40E_GLHMC_LANTXCNT_FPMLANTXCNT_SHIFT) +#define I40E_GLHMC_LANTXOBJSZ 0x000C2004 /* Reset: CORER */ +#define I40E_GLHMC_LANTXOBJSZ_PMLANTXOBJSZ_SHIFT 0 +#define I40E_GLHMC_LANTXOBJSZ_PMLANTXOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_LANTXOBJSZ_PMLANTXOBJSZ_SHIFT) +#define I40E_GLHMC_PFASSIGN(_i) (0x000C0c00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PFASSIGN_MAX_INDEX 15 +#define I40E_GLHMC_PFASSIGN_PMFCNPFASSIGN_SHIFT 0 +#define I40E_GLHMC_PFASSIGN_PMFCNPFASSIGN_MASK I40E_MASK(0xF, I40E_GLHMC_PFASSIGN_PMFCNPFASSIGN_SHIFT) +#define I40E_GLHMC_SDPART(_i) (0x000C0800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_SDPART_MAX_INDEX 15 +#define I40E_GLHMC_SDPART_PMSDBASE_SHIFT 0 +#define I40E_GLHMC_SDPART_PMSDBASE_MASK I40E_MASK(0xFFF, I40E_GLHMC_SDPART_PMSDBASE_SHIFT) +#define I40E_GLHMC_SDPART_PMSDSIZE_SHIFT 16 +#define I40E_GLHMC_SDPART_PMSDSIZE_MASK I40E_MASK(0x1FFF, I40E_GLHMC_SDPART_PMSDSIZE_SHIFT) +#define I40E_PFHMC_ERRORDATA 0x000C0500 /* Reset: PFR */ +#define I40E_PFHMC_ERRORDATA_HMC_ERROR_DATA_SHIFT 0 +#define I40E_PFHMC_ERRORDATA_HMC_ERROR_DATA_MASK I40E_MASK(0x3FFFFFFF, I40E_PFHMC_ERRORDATA_HMC_ERROR_DATA_SHIFT) +#define I40E_PFHMC_ERRORINFO 0x000C0400 /* Reset: PFR */ +#define I40E_PFHMC_ERRORINFO_PMF_INDEX_SHIFT 0 +#define I40E_PFHMC_ERRORINFO_PMF_INDEX_MASK I40E_MASK(0x1F, I40E_PFHMC_ERRORINFO_PMF_INDEX_SHIFT) +#define I40E_PFHMC_ERRORINFO_PMF_ISVF_SHIFT 7 +#define I40E_PFHMC_ERRORINFO_PMF_ISVF_MASK I40E_MASK(0x1, I40E_PFHMC_ERRORINFO_PMF_ISVF_SHIFT) +#define I40E_PFHMC_ERRORINFO_HMC_ERROR_TYPE_SHIFT 8 +#define I40E_PFHMC_ERRORINFO_HMC_ERROR_TYPE_MASK I40E_MASK(0xF, I40E_PFHMC_ERRORINFO_HMC_ERROR_TYPE_SHIFT) +#define I40E_PFHMC_ERRORINFO_HMC_OBJECT_TYPE_SHIFT 16 +#define I40E_PFHMC_ERRORINFO_HMC_OBJECT_TYPE_MASK I40E_MASK(0x1F, I40E_PFHMC_ERRORINFO_HMC_OBJECT_TYPE_SHIFT) +#define I40E_PFHMC_ERRORINFO_ERROR_DETECTED_SHIFT 31 +#define I40E_PFHMC_ERRORINFO_ERROR_DETECTED_MASK I40E_MASK(0x1, I40E_PFHMC_ERRORINFO_ERROR_DETECTED_SHIFT) +#define I40E_PFHMC_PDINV 0x000C0300 /* Reset: PFR */ +#define I40E_PFHMC_PDINV_PMSDIDX_SHIFT 0 +#define I40E_PFHMC_PDINV_PMSDIDX_MASK I40E_MASK(0xFFF, I40E_PFHMC_PDINV_PMSDIDX_SHIFT) +#define I40E_PFHMC_PDINV_PMPDIDX_SHIFT 16 +#define I40E_PFHMC_PDINV_PMPDIDX_MASK I40E_MASK(0x1FF, I40E_PFHMC_PDINV_PMPDIDX_SHIFT) +#define I40E_PFHMC_SDCMD 0x000C0000 /* Reset: PFR */ +#define I40E_PFHMC_SDCMD_PMSDIDX_SHIFT 0 +#define I40E_PFHMC_SDCMD_PMSDIDX_MASK I40E_MASK(0xFFF, I40E_PFHMC_SDCMD_PMSDIDX_SHIFT) +#define I40E_PFHMC_SDCMD_PMSDWR_SHIFT 31 +#define I40E_PFHMC_SDCMD_PMSDWR_MASK I40E_MASK(0x1, I40E_PFHMC_SDCMD_PMSDWR_SHIFT) +#define I40E_PFHMC_SDDATAHIGH 0x000C0200 /* Reset: PFR */ +#define I40E_PFHMC_SDDATAHIGH_PMSDDATAHIGH_SHIFT 0 +#define I40E_PFHMC_SDDATAHIGH_PMSDDATAHIGH_MASK I40E_MASK(0xFFFFFFFF, I40E_PFHMC_SDDATAHIGH_PMSDDATAHIGH_SHIFT) +#define I40E_PFHMC_SDDATALOW 0x000C0100 /* Reset: PFR */ +#define I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT 0 +#define I40E_PFHMC_SDDATALOW_PMSDVALID_MASK I40E_MASK(0x1, I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT 1 +#define I40E_PFHMC_SDDATALOW_PMSDTYPE_MASK I40E_MASK(0x1, I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT 2 +#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_MASK I40E_MASK(0x3FF, I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) 
+#define I40E_PFHMC_SDDATALOW_PMSDDATALOW_SHIFT 12 +#define I40E_PFHMC_SDDATALOW_PMSDDATALOW_MASK I40E_MASK(0xFFFFF, I40E_PFHMC_SDDATALOW_PMSDDATALOW_SHIFT) +#define I40E_GL_GP_FUSE(_i) (0x0009400C + ((_i) * 4)) /* _i=0...28 */ /* Reset: POR */ +#define I40E_GL_GP_FUSE_MAX_INDEX 28 +#define I40E_GL_GP_FUSE_GL_GP_FUSE_SHIFT 0 +#define I40E_GL_GP_FUSE_GL_GP_FUSE_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_GP_FUSE_GL_GP_FUSE_SHIFT) +#define I40E_GL_UFUSE 0x00094008 /* Reset: POR */ +#define I40E_GL_UFUSE_FOUR_PORT_ENABLE_SHIFT 1 +#define I40E_GL_UFUSE_FOUR_PORT_ENABLE_MASK I40E_MASK(0x1, I40E_GL_UFUSE_FOUR_PORT_ENABLE_SHIFT) +#define I40E_GL_UFUSE_NIC_ID_SHIFT 2 +#define I40E_GL_UFUSE_NIC_ID_MASK I40E_MASK(0x1, I40E_GL_UFUSE_NIC_ID_SHIFT) +#define I40E_GL_UFUSE_ULT_LOCKOUT_SHIFT 10 +#define I40E_GL_UFUSE_ULT_LOCKOUT_MASK I40E_MASK(0x1, I40E_GL_UFUSE_ULT_LOCKOUT_SHIFT) +#define I40E_GL_UFUSE_CLS_LOCKOUT_SHIFT 11 +#define I40E_GL_UFUSE_CLS_LOCKOUT_MASK I40E_MASK(0x1, I40E_GL_UFUSE_CLS_LOCKOUT_SHIFT) +#define I40E_EMPINT_GPIO_ENA 0x00088188 /* Reset: POR */ +#define I40E_EMPINT_GPIO_ENA_GPIO0_ENA_SHIFT 0 +#define I40E_EMPINT_GPIO_ENA_GPIO0_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO0_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO1_ENA_SHIFT 1 +#define I40E_EMPINT_GPIO_ENA_GPIO1_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO1_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO2_ENA_SHIFT 2 +#define I40E_EMPINT_GPIO_ENA_GPIO2_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO2_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO3_ENA_SHIFT 3 +#define I40E_EMPINT_GPIO_ENA_GPIO3_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO3_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO4_ENA_SHIFT 4 +#define I40E_EMPINT_GPIO_ENA_GPIO4_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO4_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO5_ENA_SHIFT 5 +#define I40E_EMPINT_GPIO_ENA_GPIO5_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO5_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO6_ENA_SHIFT 6 +#define I40E_EMPINT_GPIO_ENA_GPIO6_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO6_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO7_ENA_SHIFT 7 +#define I40E_EMPINT_GPIO_ENA_GPIO7_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO7_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO8_ENA_SHIFT 8 +#define I40E_EMPINT_GPIO_ENA_GPIO8_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO8_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO9_ENA_SHIFT 9 +#define I40E_EMPINT_GPIO_ENA_GPIO9_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO9_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO10_ENA_SHIFT 10 +#define I40E_EMPINT_GPIO_ENA_GPIO10_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO10_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO11_ENA_SHIFT 11 +#define I40E_EMPINT_GPIO_ENA_GPIO11_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO11_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO12_ENA_SHIFT 12 +#define I40E_EMPINT_GPIO_ENA_GPIO12_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO12_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO13_ENA_SHIFT 13 +#define I40E_EMPINT_GPIO_ENA_GPIO13_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO13_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO14_ENA_SHIFT 14 +#define I40E_EMPINT_GPIO_ENA_GPIO14_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO14_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO15_ENA_SHIFT 15 +#define I40E_EMPINT_GPIO_ENA_GPIO15_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO15_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO16_ENA_SHIFT 16 +#define I40E_EMPINT_GPIO_ENA_GPIO16_ENA_MASK I40E_MASK(0x1, 
I40E_EMPINT_GPIO_ENA_GPIO16_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO17_ENA_SHIFT 17 +#define I40E_EMPINT_GPIO_ENA_GPIO17_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO17_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO18_ENA_SHIFT 18 +#define I40E_EMPINT_GPIO_ENA_GPIO18_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO18_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO19_ENA_SHIFT 19 +#define I40E_EMPINT_GPIO_ENA_GPIO19_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO19_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO20_ENA_SHIFT 20 +#define I40E_EMPINT_GPIO_ENA_GPIO20_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO20_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO21_ENA_SHIFT 21 +#define I40E_EMPINT_GPIO_ENA_GPIO21_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO21_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO22_ENA_SHIFT 22 +#define I40E_EMPINT_GPIO_ENA_GPIO22_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO22_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO23_ENA_SHIFT 23 +#define I40E_EMPINT_GPIO_ENA_GPIO23_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO23_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO24_ENA_SHIFT 24 +#define I40E_EMPINT_GPIO_ENA_GPIO24_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO24_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO25_ENA_SHIFT 25 +#define I40E_EMPINT_GPIO_ENA_GPIO25_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO25_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO26_ENA_SHIFT 26 +#define I40E_EMPINT_GPIO_ENA_GPIO26_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO26_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO27_ENA_SHIFT 27 +#define I40E_EMPINT_GPIO_ENA_GPIO27_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO27_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO28_ENA_SHIFT 28 +#define I40E_EMPINT_GPIO_ENA_GPIO28_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO28_ENA_SHIFT) +#define I40E_EMPINT_GPIO_ENA_GPIO29_ENA_SHIFT 29 +#define I40E_EMPINT_GPIO_ENA_GPIO29_ENA_MASK I40E_MASK(0x1, I40E_EMPINT_GPIO_ENA_GPIO29_ENA_SHIFT) +#define I40E_PFGEN_PORTMDIO_NUM 0x0003F100 /* Reset: CORER */ +#define I40E_PFGEN_PORTMDIO_NUM_PORT_NUM_SHIFT 0 +#define I40E_PFGEN_PORTMDIO_NUM_PORT_NUM_MASK I40E_MASK(0x3, I40E_PFGEN_PORTMDIO_NUM_PORT_NUM_SHIFT) +#define I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_SHIFT 4 +#define I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_MASK I40E_MASK(0x1, I40E_PFGEN_PORTMDIO_NUM_VFLINK_STAT_ENA_SHIFT) +#define I40E_PFINT_AEQCTL 0x00038700 /* Reset: CORER */ +#define I40E_PFINT_AEQCTL_MSIX_INDX_SHIFT 0 +#define I40E_PFINT_AEQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_PFINT_AEQCTL_MSIX_INDX_SHIFT) +#define I40E_PFINT_AEQCTL_ITR_INDX_SHIFT 11 +#define I40E_PFINT_AEQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_AEQCTL_ITR_INDX_SHIFT) +#define I40E_PFINT_AEQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_PFINT_AEQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_PFINT_AEQCTL_MSIX0_INDX_SHIFT) +#define I40E_PFINT_AEQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_PFINT_AEQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_PFINT_AEQCTL_CAUSE_ENA_SHIFT) +#define I40E_PFINT_AEQCTL_INTEVENT_SHIFT 31 +#define I40E_PFINT_AEQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_AEQCTL_INTEVENT_SHIFT) +#define I40E_PFINT_CEQCTL(_INTPF) (0x00036800 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: CORER */ +#define I40E_PFINT_CEQCTL_MAX_INDEX 511 +#define I40E_PFINT_CEQCTL_MSIX_INDX_SHIFT 0 +#define I40E_PFINT_CEQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_PFINT_CEQCTL_MSIX_INDX_SHIFT) +#define I40E_PFINT_CEQCTL_ITR_INDX_SHIFT 11 +#define I40E_PFINT_CEQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_CEQCTL_ITR_INDX_SHIFT) +#define 
I40E_PFINT_CEQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_PFINT_CEQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_PFINT_CEQCTL_MSIX0_INDX_SHIFT) +#define I40E_PFINT_CEQCTL_NEXTQ_INDX_SHIFT 16 +#define I40E_PFINT_CEQCTL_NEXTQ_INDX_MASK I40E_MASK(0x7FF, I40E_PFINT_CEQCTL_NEXTQ_INDX_SHIFT) +#define I40E_PFINT_CEQCTL_NEXTQ_TYPE_SHIFT 27 +#define I40E_PFINT_CEQCTL_NEXTQ_TYPE_MASK I40E_MASK(0x3, I40E_PFINT_CEQCTL_NEXTQ_TYPE_SHIFT) +#define I40E_PFINT_CEQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_PFINT_CEQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_PFINT_CEQCTL_CAUSE_ENA_SHIFT) +#define I40E_PFINT_CEQCTL_INTEVENT_SHIFT 31 +#define I40E_PFINT_CEQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_CEQCTL_INTEVENT_SHIFT) +#define I40E_GLINT_CTL 0x0003F800 /* Reset: CORER */ +#define I40E_GLINT_CTL_DIS_AUTOMASK_PF0_SHIFT 0 +#define I40E_GLINT_CTL_DIS_AUTOMASK_PF0_MASK I40E_MASK(0x1, I40E_GLINT_CTL_DIS_AUTOMASK_PF0_SHIFT) +#define I40E_GLINT_CTL_DIS_AUTOMASK_VF0_SHIFT 1 +#define I40E_GLINT_CTL_DIS_AUTOMASK_VF0_MASK I40E_MASK(0x1, I40E_GLINT_CTL_DIS_AUTOMASK_VF0_SHIFT) +#define I40E_GLINT_CTL_DIS_AUTOMASK_N_SHIFT 2 +#define I40E_GLINT_CTL_DIS_AUTOMASK_N_MASK I40E_MASK(0x1, I40E_GLINT_CTL_DIS_AUTOMASK_N_SHIFT) +#define I40E_PFINT_DYN_CTL0 0x00038480 /* Reset: PFR */ +#define I40E_PFINT_DYN_CTL0_INTENA_SHIFT 0 +#define I40E_PFINT_DYN_CTL0_INTENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_INTENA_SHIFT) +#define I40E_PFINT_DYN_CTL0_CLEARPBA_SHIFT 1 +#define I40E_PFINT_DYN_CTL0_CLEARPBA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_CLEARPBA_SHIFT) +#define I40E_PFINT_DYN_CTL0_SWINT_TRIG_SHIFT 2 +#define I40E_PFINT_DYN_CTL0_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_SWINT_TRIG_SHIFT) +#define I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT 3 +#define I40E_PFINT_DYN_CTL0_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT) +#define I40E_PFINT_DYN_CTL0_INTERVAL_SHIFT 5 +#define I40E_PFINT_DYN_CTL0_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_DYN_CTL0_INTERVAL_SHIFT) +#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT) +#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_SHIFT 25 +#define I40E_PFINT_DYN_CTL0_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTL0_SW_ITR_INDX_SHIFT) +#define I40E_PFINT_DYN_CTL0_INTENA_MSK_SHIFT 31 +#define I40E_PFINT_DYN_CTL0_INTENA_MSK_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_INTENA_MSK_SHIFT) +#define I40E_PFINT_DYN_CTLN(_INTPF) (0x00034800 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: PFR */ +#define I40E_PFINT_DYN_CTLN_MAX_INDEX 511 +#define I40E_PFINT_DYN_CTLN_INTENA_SHIFT 0 +#define I40E_PFINT_DYN_CTLN_INTENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_INTENA_SHIFT) +#define I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT 1 +#define I40E_PFINT_DYN_CTLN_CLEARPBA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT) +#define I40E_PFINT_DYN_CTLN_SWINT_TRIG_SHIFT 2 +#define I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SWINT_TRIG_SHIFT) +#define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT 3 +#define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) +#define I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT 5 +#define I40E_PFINT_DYN_CTLN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT) +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT) +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT 25 +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK I40E_MASK(0x3, 
I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT) +#define I40E_PFINT_DYN_CTLN_INTENA_MSK_SHIFT 31 +#define I40E_PFINT_DYN_CTLN_INTENA_MSK_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_INTENA_MSK_SHIFT) +#define I40E_PFINT_GPIO_ENA 0x00088080 /* Reset: CORER */ +#define I40E_PFINT_GPIO_ENA_GPIO0_ENA_SHIFT 0 +#define I40E_PFINT_GPIO_ENA_GPIO0_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO0_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO1_ENA_SHIFT 1 +#define I40E_PFINT_GPIO_ENA_GPIO1_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO1_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO2_ENA_SHIFT 2 +#define I40E_PFINT_GPIO_ENA_GPIO2_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO2_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO3_ENA_SHIFT 3 +#define I40E_PFINT_GPIO_ENA_GPIO3_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO3_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO4_ENA_SHIFT 4 +#define I40E_PFINT_GPIO_ENA_GPIO4_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO4_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO5_ENA_SHIFT 5 +#define I40E_PFINT_GPIO_ENA_GPIO5_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO5_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO6_ENA_SHIFT 6 +#define I40E_PFINT_GPIO_ENA_GPIO6_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO6_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO7_ENA_SHIFT 7 +#define I40E_PFINT_GPIO_ENA_GPIO7_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO7_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO8_ENA_SHIFT 8 +#define I40E_PFINT_GPIO_ENA_GPIO8_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO8_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO9_ENA_SHIFT 9 +#define I40E_PFINT_GPIO_ENA_GPIO9_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO9_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO10_ENA_SHIFT 10 +#define I40E_PFINT_GPIO_ENA_GPIO10_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO10_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO11_ENA_SHIFT 11 +#define I40E_PFINT_GPIO_ENA_GPIO11_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO11_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO12_ENA_SHIFT 12 +#define I40E_PFINT_GPIO_ENA_GPIO12_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO12_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO13_ENA_SHIFT 13 +#define I40E_PFINT_GPIO_ENA_GPIO13_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO13_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO14_ENA_SHIFT 14 +#define I40E_PFINT_GPIO_ENA_GPIO14_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO14_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO15_ENA_SHIFT 15 +#define I40E_PFINT_GPIO_ENA_GPIO15_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO15_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO16_ENA_SHIFT 16 +#define I40E_PFINT_GPIO_ENA_GPIO16_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO16_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO17_ENA_SHIFT 17 +#define I40E_PFINT_GPIO_ENA_GPIO17_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO17_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO18_ENA_SHIFT 18 +#define I40E_PFINT_GPIO_ENA_GPIO18_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO18_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO19_ENA_SHIFT 19 +#define I40E_PFINT_GPIO_ENA_GPIO19_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO19_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO20_ENA_SHIFT 20 +#define I40E_PFINT_GPIO_ENA_GPIO20_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO20_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO21_ENA_SHIFT 21 +#define I40E_PFINT_GPIO_ENA_GPIO21_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO21_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO22_ENA_SHIFT 22 +#define I40E_PFINT_GPIO_ENA_GPIO22_ENA_MASK I40E_MASK(0x1, 
I40E_PFINT_GPIO_ENA_GPIO22_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO23_ENA_SHIFT 23 +#define I40E_PFINT_GPIO_ENA_GPIO23_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO23_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO24_ENA_SHIFT 24 +#define I40E_PFINT_GPIO_ENA_GPIO24_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO24_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO25_ENA_SHIFT 25 +#define I40E_PFINT_GPIO_ENA_GPIO25_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO25_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO26_ENA_SHIFT 26 +#define I40E_PFINT_GPIO_ENA_GPIO26_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO26_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO27_ENA_SHIFT 27 +#define I40E_PFINT_GPIO_ENA_GPIO27_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO27_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO28_ENA_SHIFT 28 +#define I40E_PFINT_GPIO_ENA_GPIO28_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO28_ENA_SHIFT) +#define I40E_PFINT_GPIO_ENA_GPIO29_ENA_SHIFT 29 +#define I40E_PFINT_GPIO_ENA_GPIO29_ENA_MASK I40E_MASK(0x1, I40E_PFINT_GPIO_ENA_GPIO29_ENA_SHIFT) +#define I40E_PFINT_ICR0 0x00038780 /* Reset: CORER */ +#define I40E_PFINT_ICR0_INTEVENT_SHIFT 0 +#define I40E_PFINT_ICR0_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_INTEVENT_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_0_SHIFT 1 +#define I40E_PFINT_ICR0_QUEUE_0_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_0_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_1_SHIFT 2 +#define I40E_PFINT_ICR0_QUEUE_1_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_1_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_2_SHIFT 3 +#define I40E_PFINT_ICR0_QUEUE_2_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_2_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_3_SHIFT 4 +#define I40E_PFINT_ICR0_QUEUE_3_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_3_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_4_SHIFT 5 +#define I40E_PFINT_ICR0_QUEUE_4_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_4_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_5_SHIFT 6 +#define I40E_PFINT_ICR0_QUEUE_5_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_5_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_6_SHIFT 7 +#define I40E_PFINT_ICR0_QUEUE_6_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_6_SHIFT) +#define I40E_PFINT_ICR0_QUEUE_7_SHIFT 8 +#define I40E_PFINT_ICR0_QUEUE_7_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_QUEUE_7_SHIFT) +#define I40E_PFINT_ICR0_ECC_ERR_SHIFT 16 +#define I40E_PFINT_ICR0_ECC_ERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ECC_ERR_SHIFT) +#define I40E_PFINT_ICR0_MAL_DETECT_SHIFT 19 +#define I40E_PFINT_ICR0_MAL_DETECT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_MAL_DETECT_SHIFT) +#define I40E_PFINT_ICR0_GRST_SHIFT 20 +#define I40E_PFINT_ICR0_GRST_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_GRST_SHIFT) +#define I40E_PFINT_ICR0_PCI_EXCEPTION_SHIFT 21 +#define I40E_PFINT_ICR0_PCI_EXCEPTION_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_PCI_EXCEPTION_SHIFT) +#define I40E_PFINT_ICR0_GPIO_SHIFT 22 +#define I40E_PFINT_ICR0_GPIO_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_GPIO_SHIFT) +#define I40E_PFINT_ICR0_TIMESYNC_SHIFT 23 +#define I40E_PFINT_ICR0_TIMESYNC_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_TIMESYNC_SHIFT) +#define I40E_PFINT_ICR0_STORM_DETECT_SHIFT 24 +#define I40E_PFINT_ICR0_STORM_DETECT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_STORM_DETECT_SHIFT) +#define I40E_PFINT_ICR0_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_PFINT_ICR0_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_LINK_STAT_CHANGE_SHIFT) +#define I40E_PFINT_ICR0_HMC_ERR_SHIFT 26 +#define I40E_PFINT_ICR0_HMC_ERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_HMC_ERR_SHIFT) +#define I40E_PFINT_ICR0_PE_CRITERR_SHIFT 28 +#define I40E_PFINT_ICR0_PE_CRITERR_MASK I40E_MASK(0x1, 
I40E_PFINT_ICR0_PE_CRITERR_SHIFT) +#define I40E_PFINT_ICR0_VFLR_SHIFT 29 +#define I40E_PFINT_ICR0_VFLR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_VFLR_SHIFT) +#define I40E_PFINT_ICR0_ADMINQ_SHIFT 30 +#define I40E_PFINT_ICR0_ADMINQ_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ADMINQ_SHIFT) +#define I40E_PFINT_ICR0_SWINT_SHIFT 31 +#define I40E_PFINT_ICR0_SWINT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_SWINT_SHIFT) +#define I40E_PFINT_ICR0_ENA 0x00038800 /* Reset: CORER */ +#define I40E_PFINT_ICR0_ENA_ECC_ERR_SHIFT 16 +#define I40E_PFINT_ICR0_ENA_ECC_ERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_ECC_ERR_SHIFT) +#define I40E_PFINT_ICR0_ENA_MAL_DETECT_SHIFT 19 +#define I40E_PFINT_ICR0_ENA_MAL_DETECT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_MAL_DETECT_SHIFT) +#define I40E_PFINT_ICR0_ENA_GRST_SHIFT 20 +#define I40E_PFINT_ICR0_ENA_GRST_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_GRST_SHIFT) +#define I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_SHIFT 21 +#define I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_SHIFT) +#define I40E_PFINT_ICR0_ENA_GPIO_SHIFT 22 +#define I40E_PFINT_ICR0_ENA_GPIO_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_GPIO_SHIFT) +#define I40E_PFINT_ICR0_ENA_TIMESYNC_SHIFT 23 +#define I40E_PFINT_ICR0_ENA_TIMESYNC_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_TIMESYNC_SHIFT) +#define I40E_PFINT_ICR0_ENA_STORM_DETECT_SHIFT 24 +#define I40E_PFINT_ICR0_ENA_STORM_DETECT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_STORM_DETECT_SHIFT) +#define I40E_PFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_PFINT_ICR0_ENA_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT) +#define I40E_PFINT_ICR0_ENA_HMC_ERR_SHIFT 26 +#define I40E_PFINT_ICR0_ENA_HMC_ERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_HMC_ERR_SHIFT) +#define I40E_PFINT_ICR0_ENA_PE_CRITERR_SHIFT 28 +#define I40E_PFINT_ICR0_ENA_PE_CRITERR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_PE_CRITERR_SHIFT) +#define I40E_PFINT_ICR0_ENA_VFLR_SHIFT 29 +#define I40E_PFINT_ICR0_ENA_VFLR_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_VFLR_SHIFT) +#define I40E_PFINT_ICR0_ENA_ADMINQ_SHIFT 30 +#define I40E_PFINT_ICR0_ENA_ADMINQ_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_ADMINQ_SHIFT) +#define I40E_PFINT_ICR0_ENA_RSVD_SHIFT 31 +#define I40E_PFINT_ICR0_ENA_RSVD_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_ENA_RSVD_SHIFT) +#define I40E_PFINT_ITR0(_i) (0x00038000 + ((_i) * 128)) /* _i=0...2 */ /* Reset: PFR */ +#define I40E_PFINT_ITR0_MAX_INDEX 2 +#define I40E_PFINT_ITR0_INTERVAL_SHIFT 0 +#define I40E_PFINT_ITR0_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_ITR0_INTERVAL_SHIFT) +#define I40E_PFINT_ITRN(_i, _INTPF) (0x00030000 + ((_i) * 2048 + (_INTPF) * 4)) /* _i=0...2, _INTPF=0...511 */ /* Reset: PFR */ +#define I40E_PFINT_ITRN_MAX_INDEX 2 +#define I40E_PFINT_ITRN_INTERVAL_SHIFT 0 +#define I40E_PFINT_ITRN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_ITRN_INTERVAL_SHIFT) +#define I40E_PFINT_LNKLST0 0x00038500 /* Reset: PFR */ +#define I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT 0 +#define I40E_PFINT_LNKLST0_FIRSTQ_INDX_MASK I40E_MASK(0x7FF, I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT) +#define I40E_PFINT_LNKLST0_FIRSTQ_TYPE_SHIFT 11 +#define I40E_PFINT_LNKLST0_FIRSTQ_TYPE_MASK I40E_MASK(0x3, I40E_PFINT_LNKLST0_FIRSTQ_TYPE_SHIFT) +#define I40E_PFINT_LNKLSTN(_INTPF) (0x00035000 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: PFR */ +#define I40E_PFINT_LNKLSTN_MAX_INDEX 511 +#define I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT 0 +#define I40E_PFINT_LNKLSTN_FIRSTQ_INDX_MASK I40E_MASK(0x7FF, I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) +#define I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT 11 +#define 
I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_MASK I40E_MASK(0x3, I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT) +#define I40E_PFINT_RATE0 0x00038580 /* Reset: PFR */ +#define I40E_PFINT_RATE0_INTERVAL_SHIFT 0 +#define I40E_PFINT_RATE0_INTERVAL_MASK I40E_MASK(0x3F, I40E_PFINT_RATE0_INTERVAL_SHIFT) +#define I40E_PFINT_RATE0_INTRL_ENA_SHIFT 6 +#define I40E_PFINT_RATE0_INTRL_ENA_MASK I40E_MASK(0x1, I40E_PFINT_RATE0_INTRL_ENA_SHIFT) +#define I40E_PFINT_RATEN(_INTPF) (0x00035800 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: PFR */ +#define I40E_PFINT_RATEN_MAX_INDEX 511 +#define I40E_PFINT_RATEN_INTERVAL_SHIFT 0 +#define I40E_PFINT_RATEN_INTERVAL_MASK I40E_MASK(0x3F, I40E_PFINT_RATEN_INTERVAL_SHIFT) +#define I40E_PFINT_RATEN_INTRL_ENA_SHIFT 6 +#define I40E_PFINT_RATEN_INTRL_ENA_MASK I40E_MASK(0x1, I40E_PFINT_RATEN_INTRL_ENA_SHIFT) +#define I40E_PFINT_STAT_CTL0 0x00038400 /* Reset: CORER */ +#define I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT 2 +#define I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT) +#define I40E_QINT_RQCTL(_Q) (0x0003A000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: CORER */ +#define I40E_QINT_RQCTL_MAX_INDEX 1535 +#define I40E_QINT_RQCTL_MSIX_INDX_SHIFT 0 +#define I40E_QINT_RQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_QINT_RQCTL_MSIX_INDX_SHIFT) +#define I40E_QINT_RQCTL_ITR_INDX_SHIFT 11 +#define I40E_QINT_RQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_QINT_RQCTL_ITR_INDX_SHIFT) +#define I40E_QINT_RQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_QINT_RQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_QINT_RQCTL_MSIX0_INDX_SHIFT) +#define I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT 16 +#define I40E_QINT_RQCTL_NEXTQ_INDX_MASK I40E_MASK(0x7FF, I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) +#define I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT 27 +#define I40E_QINT_RQCTL_NEXTQ_TYPE_MASK I40E_MASK(0x3, I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) +#define I40E_QINT_RQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_QINT_RQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_QINT_RQCTL_CAUSE_ENA_SHIFT) +#define I40E_QINT_RQCTL_INTEVENT_SHIFT 31 +#define I40E_QINT_RQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_QINT_RQCTL_INTEVENT_SHIFT) +#define I40E_QINT_TQCTL(_Q) (0x0003C000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: CORER */ +#define I40E_QINT_TQCTL_MAX_INDEX 1535 +#define I40E_QINT_TQCTL_MSIX_INDX_SHIFT 0 +#define I40E_QINT_TQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_QINT_TQCTL_MSIX_INDX_SHIFT) +#define I40E_QINT_TQCTL_ITR_INDX_SHIFT 11 +#define I40E_QINT_TQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_QINT_TQCTL_ITR_INDX_SHIFT) +#define I40E_QINT_TQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_QINT_TQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_QINT_TQCTL_MSIX0_INDX_SHIFT) +#define I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT 16 +#define I40E_QINT_TQCTL_NEXTQ_INDX_MASK I40E_MASK(0x7FF, I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) +#define I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT 27 +#define I40E_QINT_TQCTL_NEXTQ_TYPE_MASK I40E_MASK(0x3, I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) +#define I40E_QINT_TQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_QINT_TQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_QINT_TQCTL_CAUSE_ENA_SHIFT) +#define I40E_QINT_TQCTL_INTEVENT_SHIFT 31 +#define I40E_QINT_TQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_QINT_TQCTL_INTEVENT_SHIFT) +#define I40E_VFINT_DYN_CTL0(_VF) (0x0002A400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFINT_DYN_CTL0_MAX_INDEX 127 +#define I40E_VFINT_DYN_CTL0_INTENA_SHIFT 0 +#define I40E_VFINT_DYN_CTL0_INTENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_INTENA_SHIFT) +#define I40E_VFINT_DYN_CTL0_CLEARPBA_SHIFT 1 +#define I40E_VFINT_DYN_CTL0_CLEARPBA_MASK I40E_MASK(0x1, 
I40E_VFINT_DYN_CTL0_CLEARPBA_SHIFT) +#define I40E_VFINT_DYN_CTL0_SWINT_TRIG_SHIFT 2 +#define I40E_VFINT_DYN_CTL0_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_SWINT_TRIG_SHIFT) +#define I40E_VFINT_DYN_CTL0_ITR_INDX_SHIFT 3 +#define I40E_VFINT_DYN_CTL0_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTL0_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTL0_INTERVAL_SHIFT 5 +#define I40E_VFINT_DYN_CTL0_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_DYN_CTL0_INTERVAL_SHIFT) +#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_SW_ITR_INDX_ENA_SHIFT) +#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_SHIFT 25 +#define I40E_VFINT_DYN_CTL0_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTL0_SW_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTL0_INTENA_MSK_SHIFT 31 +#define I40E_VFINT_DYN_CTL0_INTENA_MSK_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_INTENA_MSK_SHIFT) +#define I40E_VFINT_DYN_CTLN(_INTVF) (0x00024800 + ((_INTVF) * 4)) /* _i=0...511 */ /* Reset: VFR */ +#define I40E_VFINT_DYN_CTLN_MAX_INDEX 511 +#define I40E_VFINT_DYN_CTLN_INTENA_SHIFT 0 +#define I40E_VFINT_DYN_CTLN_INTENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_INTENA_SHIFT) +#define I40E_VFINT_DYN_CTLN_CLEARPBA_SHIFT 1 +#define I40E_VFINT_DYN_CTLN_CLEARPBA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_CLEARPBA_SHIFT) +#define I40E_VFINT_DYN_CTLN_SWINT_TRIG_SHIFT 2 +#define I40E_VFINT_DYN_CTLN_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_SWINT_TRIG_SHIFT) +#define I40E_VFINT_DYN_CTLN_ITR_INDX_SHIFT 3 +#define I40E_VFINT_DYN_CTLN_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTLN_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTLN_INTERVAL_SHIFT 5 +#define I40E_VFINT_DYN_CTLN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_DYN_CTLN_INTERVAL_SHIFT) +#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT) +#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_SHIFT 25 +#define I40E_VFINT_DYN_CTLN_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTLN_SW_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTLN_INTENA_MSK_SHIFT 31 +#define I40E_VFINT_DYN_CTLN_INTENA_MSK_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_INTENA_MSK_SHIFT) +#define I40E_VFINT_ICR0(_VF) (0x0002BC00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VFINT_ICR0_MAX_INDEX 127 +#define I40E_VFINT_ICR0_INTEVENT_SHIFT 0 +#define I40E_VFINT_ICR0_INTEVENT_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_INTEVENT_SHIFT) +#define I40E_VFINT_ICR0_QUEUE_0_SHIFT 1 +#define I40E_VFINT_ICR0_QUEUE_0_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_0_SHIFT) +#define I40E_VFINT_ICR0_QUEUE_1_SHIFT 2 +#define I40E_VFINT_ICR0_QUEUE_1_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_1_SHIFT) +#define I40E_VFINT_ICR0_QUEUE_2_SHIFT 3 +#define I40E_VFINT_ICR0_QUEUE_2_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_2_SHIFT) +#define I40E_VFINT_ICR0_QUEUE_3_SHIFT 4 +#define I40E_VFINT_ICR0_QUEUE_3_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_QUEUE_3_SHIFT) +#define I40E_VFINT_ICR0_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_VFINT_ICR0_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_LINK_STAT_CHANGE_SHIFT) +#define I40E_VFINT_ICR0_ADMINQ_SHIFT 30 +#define I40E_VFINT_ICR0_ADMINQ_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ADMINQ_SHIFT) +#define I40E_VFINT_ICR0_SWINT_SHIFT 31 +#define I40E_VFINT_ICR0_SWINT_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_SWINT_SHIFT) +#define I40E_VFINT_ICR0_ENA(_VF) (0x0002C000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VFINT_ICR0_ENA_MAX_INDEX 127 +#define 
I40E_VFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_VFINT_ICR0_ENA_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA_LINK_STAT_CHANGE_SHIFT) +#define I40E_VFINT_ICR0_ENA_ADMINQ_SHIFT 30 +#define I40E_VFINT_ICR0_ENA_ADMINQ_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA_ADMINQ_SHIFT) +#define I40E_VFINT_ICR0_ENA_RSVD_SHIFT 31 +#define I40E_VFINT_ICR0_ENA_RSVD_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA_RSVD_SHIFT) +#define I40E_VFINT_ITR0(_i, _VF) (0x00028000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...2, _VF=0...127 */ /* Reset: VFR */ +#define I40E_VFINT_ITR0_MAX_INDEX 2 +#define I40E_VFINT_ITR0_INTERVAL_SHIFT 0 +#define I40E_VFINT_ITR0_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_ITR0_INTERVAL_SHIFT) +#define I40E_VFINT_ITRN(_i, _INTVF) (0x00020000 + ((_i) * 2048 + (_INTVF) * 4)) /* _i=0...2, _INTVF=0...511 */ /* Reset: VFR */ +#define I40E_VFINT_ITRN_MAX_INDEX 2 +#define I40E_VFINT_ITRN_INTERVAL_SHIFT 0 +#define I40E_VFINT_ITRN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_ITRN_INTERVAL_SHIFT) +#define I40E_VFINT_STAT_CTL0(_VF) (0x0002A000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VFINT_STAT_CTL0_MAX_INDEX 127 +#define I40E_VFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT 2 +#define I40E_VFINT_STAT_CTL0_OTHER_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_STAT_CTL0_OTHER_ITR_INDX_SHIFT) +#define I40E_VPINT_AEQCTL(_VF) (0x0002B800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VPINT_AEQCTL_MAX_INDEX 127 +#define I40E_VPINT_AEQCTL_MSIX_INDX_SHIFT 0 +#define I40E_VPINT_AEQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_VPINT_AEQCTL_MSIX_INDX_SHIFT) +#define I40E_VPINT_AEQCTL_ITR_INDX_SHIFT 11 +#define I40E_VPINT_AEQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_VPINT_AEQCTL_ITR_INDX_SHIFT) +#define I40E_VPINT_AEQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_VPINT_AEQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_VPINT_AEQCTL_MSIX0_INDX_SHIFT) +#define I40E_VPINT_AEQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_VPINT_AEQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_VPINT_AEQCTL_CAUSE_ENA_SHIFT) +#define I40E_VPINT_AEQCTL_INTEVENT_SHIFT 31 +#define I40E_VPINT_AEQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_VPINT_AEQCTL_INTEVENT_SHIFT) +#define I40E_VPINT_CEQCTL(_INTVF) (0x00026800 + ((_INTVF) * 4)) /* _i=0...511 */ /* Reset: CORER */ +#define I40E_VPINT_CEQCTL_MAX_INDEX 511 +#define I40E_VPINT_CEQCTL_MSIX_INDX_SHIFT 0 +#define I40E_VPINT_CEQCTL_MSIX_INDX_MASK I40E_MASK(0xFF, I40E_VPINT_CEQCTL_MSIX_INDX_SHIFT) +#define I40E_VPINT_CEQCTL_ITR_INDX_SHIFT 11 +#define I40E_VPINT_CEQCTL_ITR_INDX_MASK I40E_MASK(0x3, I40E_VPINT_CEQCTL_ITR_INDX_SHIFT) +#define I40E_VPINT_CEQCTL_MSIX0_INDX_SHIFT 13 +#define I40E_VPINT_CEQCTL_MSIX0_INDX_MASK I40E_MASK(0x7, I40E_VPINT_CEQCTL_MSIX0_INDX_SHIFT) +#define I40E_VPINT_CEQCTL_NEXTQ_INDX_SHIFT 16 +#define I40E_VPINT_CEQCTL_NEXTQ_INDX_MASK I40E_MASK(0x7FF, I40E_VPINT_CEQCTL_NEXTQ_INDX_SHIFT) +#define I40E_VPINT_CEQCTL_NEXTQ_TYPE_SHIFT 27 +#define I40E_VPINT_CEQCTL_NEXTQ_TYPE_MASK I40E_MASK(0x3, I40E_VPINT_CEQCTL_NEXTQ_TYPE_SHIFT) +#define I40E_VPINT_CEQCTL_CAUSE_ENA_SHIFT 30 +#define I40E_VPINT_CEQCTL_CAUSE_ENA_MASK I40E_MASK(0x1, I40E_VPINT_CEQCTL_CAUSE_ENA_SHIFT) +#define I40E_VPINT_CEQCTL_INTEVENT_SHIFT 31 +#define I40E_VPINT_CEQCTL_INTEVENT_MASK I40E_MASK(0x1, I40E_VPINT_CEQCTL_INTEVENT_SHIFT) +#define I40E_VPINT_LNKLST0(_VF) (0x0002A800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VPINT_LNKLST0_MAX_INDEX 127 +#define I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT 0 +#define I40E_VPINT_LNKLST0_FIRSTQ_INDX_MASK I40E_MASK(0x7FF, I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT) +#define 
I40E_VPINT_LNKLST0_FIRSTQ_TYPE_SHIFT 11 +#define I40E_VPINT_LNKLST0_FIRSTQ_TYPE_MASK I40E_MASK(0x3, I40E_VPINT_LNKLST0_FIRSTQ_TYPE_SHIFT) +#define I40E_VPINT_LNKLSTN(_INTVF) (0x00025000 + ((_INTVF) * 4)) /* _i=0...511 */ /* Reset: VFR */ +#define I40E_VPINT_LNKLSTN_MAX_INDEX 511 +#define I40E_VPINT_LNKLSTN_FIRSTQ_INDX_SHIFT 0 +#define I40E_VPINT_LNKLSTN_FIRSTQ_INDX_MASK I40E_MASK(0x7FF, I40E_VPINT_LNKLSTN_FIRSTQ_INDX_SHIFT) +#define I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT 11 +#define I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_MASK I40E_MASK(0x3, I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT) +#define I40E_VPINT_RATE0(_VF) (0x0002AC00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VPINT_RATE0_MAX_INDEX 127 +#define I40E_VPINT_RATE0_INTERVAL_SHIFT 0 +#define I40E_VPINT_RATE0_INTERVAL_MASK I40E_MASK(0x3F, I40E_VPINT_RATE0_INTERVAL_SHIFT) +#define I40E_VPINT_RATE0_INTRL_ENA_SHIFT 6 +#define I40E_VPINT_RATE0_INTRL_ENA_MASK I40E_MASK(0x1, I40E_VPINT_RATE0_INTRL_ENA_SHIFT) +#define I40E_VPINT_RATEN(_INTVF) (0x00025800 + ((_INTVF) * 4)) /* _i=0...511 */ /* Reset: VFR */ +#define I40E_VPINT_RATEN_MAX_INDEX 511 +#define I40E_VPINT_RATEN_INTERVAL_SHIFT 0 +#define I40E_VPINT_RATEN_INTERVAL_MASK I40E_MASK(0x3F, I40E_VPINT_RATEN_INTERVAL_SHIFT) +#define I40E_VPINT_RATEN_INTRL_ENA_SHIFT 6 +#define I40E_VPINT_RATEN_INTRL_ENA_MASK I40E_MASK(0x1, I40E_VPINT_RATEN_INTRL_ENA_SHIFT) +#define I40E_GL_RDPU_CNTRL 0x00051060 /* Reset: CORER */ +#define I40E_GL_RDPU_CNTRL_RX_PAD_EN_SHIFT 0 +#define I40E_GL_RDPU_CNTRL_RX_PAD_EN_MASK I40E_MASK(0x1, I40E_GL_RDPU_CNTRL_RX_PAD_EN_SHIFT) +#define I40E_GL_RDPU_CNTRL_ECO_SHIFT 1 +#define I40E_GL_RDPU_CNTRL_ECO_MASK I40E_MASK(0x7FFFFFFF, I40E_GL_RDPU_CNTRL_ECO_SHIFT) +#define I40E_GLLAN_RCTL_0 0x0012A500 /* Reset: CORER */ +#define I40E_GLLAN_RCTL_0_PXE_MODE_SHIFT 0 +#define I40E_GLLAN_RCTL_0_PXE_MODE_MASK I40E_MASK(0x1, I40E_GLLAN_RCTL_0_PXE_MODE_SHIFT) +#define I40E_GLLAN_TSOMSK_F 0x000442D8 /* Reset: CORER */ +#define I40E_GLLAN_TSOMSK_F_TCPMSKF_SHIFT 0 +#define I40E_GLLAN_TSOMSK_F_TCPMSKF_MASK I40E_MASK(0xFFF, I40E_GLLAN_TSOMSK_F_TCPMSKF_SHIFT) +#define I40E_GLLAN_TSOMSK_L 0x000442E0 /* Reset: CORER */ +#define I40E_GLLAN_TSOMSK_L_TCPMSKL_SHIFT 0 +#define I40E_GLLAN_TSOMSK_L_TCPMSKL_MASK I40E_MASK(0xFFF, I40E_GLLAN_TSOMSK_L_TCPMSKL_SHIFT) +#define I40E_GLLAN_TSOMSK_M 0x000442DC /* Reset: CORER */ +#define I40E_GLLAN_TSOMSK_M_TCPMSKM_SHIFT 0 +#define I40E_GLLAN_TSOMSK_M_TCPMSKM_MASK I40E_MASK(0xFFF, I40E_GLLAN_TSOMSK_M_TCPMSKM_SHIFT) +#define I40E_GLLAN_TXPRE_QDIS(_i) (0x000e6500 + ((_i) * 4)) /* _i=0...11 */ /* Reset: CORER */ +#define I40E_GLLAN_TXPRE_QDIS_MAX_INDEX 11 +#define I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT 0 +#define I40E_GLLAN_TXPRE_QDIS_QINDX_MASK I40E_MASK(0x7FF, I40E_GLLAN_TXPRE_QDIS_QINDX_SHIFT) +#define I40E_GLLAN_TXPRE_QDIS_QDIS_STAT_SHIFT 16 +#define I40E_GLLAN_TXPRE_QDIS_QDIS_STAT_MASK I40E_MASK(0x1, I40E_GLLAN_TXPRE_QDIS_QDIS_STAT_SHIFT) +#define I40E_GLLAN_TXPRE_QDIS_SET_QDIS_SHIFT 30 +#define I40E_GLLAN_TXPRE_QDIS_SET_QDIS_MASK I40E_MASK(0x1, I40E_GLLAN_TXPRE_QDIS_SET_QDIS_SHIFT) +#define I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_SHIFT 31 +#define I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_MASK I40E_MASK(0x1, I40E_GLLAN_TXPRE_QDIS_CLEAR_QDIS_SHIFT) +#define I40E_PFLAN_QALLOC 0x001C0400 /* Reset: CORER */ +#define I40E_PFLAN_QALLOC_FIRSTQ_SHIFT 0 +#define I40E_PFLAN_QALLOC_FIRSTQ_MASK I40E_MASK(0x7FF, I40E_PFLAN_QALLOC_FIRSTQ_SHIFT) +#define I40E_PFLAN_QALLOC_LASTQ_SHIFT 16 +#define I40E_PFLAN_QALLOC_LASTQ_MASK I40E_MASK(0x7FF, I40E_PFLAN_QALLOC_LASTQ_SHIFT) +#define 
I40E_PFLAN_QALLOC_VALID_SHIFT 31 +#define I40E_PFLAN_QALLOC_VALID_MASK I40E_MASK(0x1, I40E_PFLAN_QALLOC_VALID_SHIFT) +#define I40E_QRX_ENA(_Q) (0x00120000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: PFR */ +#define I40E_QRX_ENA_MAX_INDEX 1535 +#define I40E_QRX_ENA_QENA_REQ_SHIFT 0 +#define I40E_QRX_ENA_QENA_REQ_MASK I40E_MASK(0x1, I40E_QRX_ENA_QENA_REQ_SHIFT) +#define I40E_QRX_ENA_FAST_QDIS_SHIFT 1 +#define I40E_QRX_ENA_FAST_QDIS_MASK I40E_MASK(0x1, I40E_QRX_ENA_FAST_QDIS_SHIFT) +#define I40E_QRX_ENA_QENA_STAT_SHIFT 2 +#define I40E_QRX_ENA_QENA_STAT_MASK I40E_MASK(0x1, I40E_QRX_ENA_QENA_STAT_SHIFT) +#define I40E_QRX_TAIL(_Q) (0x00128000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: CORER */ +#define I40E_QRX_TAIL_MAX_INDEX 1535 +#define I40E_QRX_TAIL_TAIL_SHIFT 0 +#define I40E_QRX_TAIL_TAIL_MASK I40E_MASK(0x1FFF, I40E_QRX_TAIL_TAIL_SHIFT) +#define I40E_QTX_CTL(_Q) (0x00104000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: CORER */ +#define I40E_QTX_CTL_MAX_INDEX 1535 +#define I40E_QTX_CTL_PFVF_Q_SHIFT 0 +#define I40E_QTX_CTL_PFVF_Q_MASK I40E_MASK(0x3, I40E_QTX_CTL_PFVF_Q_SHIFT) +#define I40E_QTX_CTL_PF_INDX_SHIFT 2 +#define I40E_QTX_CTL_PF_INDX_MASK I40E_MASK(0xF, I40E_QTX_CTL_PF_INDX_SHIFT) +#define I40E_QTX_CTL_VFVM_INDX_SHIFT 7 +#define I40E_QTX_CTL_VFVM_INDX_MASK I40E_MASK(0x1FF, I40E_QTX_CTL_VFVM_INDX_SHIFT) +#define I40E_QTX_ENA(_Q) (0x00100000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: PFR */ +#define I40E_QTX_ENA_MAX_INDEX 1535 +#define I40E_QTX_ENA_QENA_REQ_SHIFT 0 +#define I40E_QTX_ENA_QENA_REQ_MASK I40E_MASK(0x1, I40E_QTX_ENA_QENA_REQ_SHIFT) +#define I40E_QTX_ENA_FAST_QDIS_SHIFT 1 +#define I40E_QTX_ENA_FAST_QDIS_MASK I40E_MASK(0x1, I40E_QTX_ENA_FAST_QDIS_SHIFT) +#define I40E_QTX_ENA_QENA_STAT_SHIFT 2 +#define I40E_QTX_ENA_QENA_STAT_MASK I40E_MASK(0x1, I40E_QTX_ENA_QENA_STAT_SHIFT) +#define I40E_QTX_HEAD(_Q) (0x000E4000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: CORER */ +#define I40E_QTX_HEAD_MAX_INDEX 1535 +#define I40E_QTX_HEAD_HEAD_SHIFT 0 +#define I40E_QTX_HEAD_HEAD_MASK I40E_MASK(0x1FFF, I40E_QTX_HEAD_HEAD_SHIFT) +#define I40E_QTX_HEAD_RS_PENDING_SHIFT 16 +#define I40E_QTX_HEAD_RS_PENDING_MASK I40E_MASK(0x1, I40E_QTX_HEAD_RS_PENDING_SHIFT) +#define I40E_QTX_TAIL(_Q) (0x00108000 + ((_Q) * 4)) /* _i=0...1535 */ /* Reset: PFR */ +#define I40E_QTX_TAIL_MAX_INDEX 1535 +#define I40E_QTX_TAIL_TAIL_SHIFT 0 +#define I40E_QTX_TAIL_TAIL_MASK I40E_MASK(0x1FFF, I40E_QTX_TAIL_TAIL_SHIFT) +#define I40E_VPLAN_MAPENA(_VF) (0x00074000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VPLAN_MAPENA_MAX_INDEX 127 +#define I40E_VPLAN_MAPENA_TXRX_ENA_SHIFT 0 +#define I40E_VPLAN_MAPENA_TXRX_ENA_MASK I40E_MASK(0x1, I40E_VPLAN_MAPENA_TXRX_ENA_SHIFT) +#define I40E_VPLAN_QTABLE(_i, _VF) (0x00070000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...15, _VF=0...127 */ /* Reset: VFR */ +#define I40E_VPLAN_QTABLE_MAX_INDEX 15 +#define I40E_VPLAN_QTABLE_QINDEX_SHIFT 0 +#define I40E_VPLAN_QTABLE_QINDEX_MASK I40E_MASK(0x7FF, I40E_VPLAN_QTABLE_QINDEX_SHIFT) +#define I40E_VSILAN_QBASE(_VSI) (0x0020C800 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: PFR */ +#define I40E_VSILAN_QBASE_MAX_INDEX 383 +#define I40E_VSILAN_QBASE_VSIBASE_SHIFT 0 +#define I40E_VSILAN_QBASE_VSIBASE_MASK I40E_MASK(0x7FF, I40E_VSILAN_QBASE_VSIBASE_SHIFT) +#define I40E_VSILAN_QBASE_VSIQTABLE_ENA_SHIFT 11 +#define I40E_VSILAN_QBASE_VSIQTABLE_ENA_MASK I40E_MASK(0x1, I40E_VSILAN_QBASE_VSIQTABLE_ENA_SHIFT) +#define I40E_VSILAN_QTABLE(_i, _VSI) (0x00200000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...7, _VSI=0...383 */ /* Reset: PFR */ +#define 
I40E_VSILAN_QTABLE_MAX_INDEX 7 +#define I40E_VSILAN_QTABLE_QINDEX_0_SHIFT 0 +#define I40E_VSILAN_QTABLE_QINDEX_0_MASK I40E_MASK(0x7FF, I40E_VSILAN_QTABLE_QINDEX_0_SHIFT) +#define I40E_VSILAN_QTABLE_QINDEX_1_SHIFT 16 +#define I40E_VSILAN_QTABLE_QINDEX_1_MASK I40E_MASK(0x7FF, I40E_VSILAN_QTABLE_QINDEX_1_SHIFT) +#define I40E_PRTGL_SAH 0x001E2140 /* Reset: GLOBR */ +#define I40E_PRTGL_SAH_FC_SAH_SHIFT 0 +#define I40E_PRTGL_SAH_FC_SAH_MASK I40E_MASK(0xFFFF, I40E_PRTGL_SAH_FC_SAH_SHIFT) +#define I40E_PRTGL_SAH_MFS_SHIFT 16 +#define I40E_PRTGL_SAH_MFS_MASK I40E_MASK(0xFFFF, I40E_PRTGL_SAH_MFS_SHIFT) +#define I40E_PRTGL_SAL 0x001E2120 /* Reset: GLOBR */ +#define I40E_PRTGL_SAL_FC_SAL_SHIFT 0 +#define I40E_PRTGL_SAL_FC_SAL_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTGL_SAL_FC_SAL_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP 0x001E30E0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_MASK I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GCP_HSEC_CTL_RX_ENABLE_GCP_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP 0x001E3260 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_MASK I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_ENABLE_GPP_HSEC_CTL_RX_ENABLE_GPP_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP 0x001E32E0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_MASK I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_ENABLE_PPP_HSEC_CTL_RX_ENABLE_PPP_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL 0x001E3360 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_MASK I40E_MASK(0x1, I40E_PRTMAC_HSEC_CTL_RX_FORWARD_CONTROL_HSEC_CTL_RX_FORWARD_CONTROL_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1 0x001E3110 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_HSEC_CTL_RX_PAUSE_DA_UCAST_PART1_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2 0x001E3120 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_HSEC_CTL_RX_PAUSE_DA_UCAST_PART2_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE 0x001E30C0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_MASK I40E_MASK(0x1FF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_ENABLE_HSEC_CTL_RX_PAUSE_ENABLE_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1 0x001E3140 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART1_HSEC_CTL_RX_PAUSE_SA_PART1_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2 0x001E3150 /* Reset: GLOBR */ +#define 
I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_RX_PAUSE_SA_PART2_HSEC_CTL_RX_PAUSE_SA_PART2_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE 0x001E30D0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_MASK I40E_MASK(0x1FF, I40E_PRTMAC_HSEC_CTL_TX_PAUSE_ENABLE_HSEC_CTL_TX_PAUSE_ENABLE_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA(_i) (0x001E3370 + ((_i) * 16)) /* _i=0...8 */ /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_MAX_INDEX 8 +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_TX_PAUSE_QUANTA_HSEC_CTL_TX_PAUSE_QUANTA_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER(_i) (0x001E3400 + ((_i) * 16)) /* _i=0...8 */ /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_MAX_INDEX 8 +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_HSEC_CTL_TX_PAUSE_REFRESH_TIMER_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART1 0x001E34B0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTMAC_HSEC_CTL_TX_SA_PART1_HSEC_CTL_TX_SA_PART1_SHIFT) +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART2 0x001E34C0 /* Reset: GLOBR */ +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_SHIFT 0 +#define I40E_PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_HSEC_CTL_TX_SA_PART2_HSEC_CTL_TX_SA_PART2_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A 0x0008C480 /* Reset: GLOBR */ +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE3_SHIFT 0 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE3_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE3_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE2_SHIFT 2 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE2_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE2_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE1_SHIFT 4 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE1_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE1_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE0_SHIFT 6 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE0_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_TX_LANE0_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE3_SHIFT 8 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE3_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE3_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE2_SHIFT 10 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE2_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE2_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE1_SHIFT 12 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE1_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE1_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE0_SHIFT 14 +#define I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE0_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_A_SWAP_RX_LANE0_SHIFT) +#define 
I40E_PRTMAC_PCS_XAUI_SWAP_B 0x0008C484 /* Reset: GLOBR */ +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE3_SHIFT 0 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE3_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE3_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE2_SHIFT 2 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE2_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE2_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE1_SHIFT 4 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE1_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE1_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE0_SHIFT 6 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE0_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_TX_LANE0_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE3_SHIFT 8 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE3_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE3_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE2_SHIFT 10 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE2_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE2_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE1_SHIFT 12 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE1_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE1_SHIFT) +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE0_SHIFT 14 +#define I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE0_MASK I40E_MASK(0x3, I40E_PRTMAC_PCS_XAUI_SWAP_B_SWAP_RX_LANE0_SHIFT) +#define I40E_GL_FWRESETCNT 0x00083100 /* Reset: POR */ +#define I40E_GL_FWRESETCNT_FWRESETCNT_SHIFT 0 +#define I40E_GL_FWRESETCNT_FWRESETCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FWRESETCNT_FWRESETCNT_SHIFT) +#define I40E_GL_MNG_FWSM 0x000B6134 /* Reset: POR */ +#define I40E_GL_MNG_FWSM_FW_MODES_SHIFT 0 +#define I40E_GL_MNG_FWSM_FW_MODES_MASK I40E_MASK(0x3, I40E_GL_MNG_FWSM_FW_MODES_SHIFT) +#define I40E_GL_MNG_FWSM_EEP_RELOAD_IND_SHIFT 10 +#define I40E_GL_MNG_FWSM_EEP_RELOAD_IND_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_EEP_RELOAD_IND_SHIFT) +#define I40E_GL_MNG_FWSM_CRC_ERROR_MODULE_SHIFT 11 +#define I40E_GL_MNG_FWSM_CRC_ERROR_MODULE_MASK I40E_MASK(0xF, I40E_GL_MNG_FWSM_CRC_ERROR_MODULE_SHIFT) +#define I40E_GL_MNG_FWSM_FW_STATUS_VALID_SHIFT 15 +#define I40E_GL_MNG_FWSM_FW_STATUS_VALID_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_FW_STATUS_VALID_SHIFT) +#define I40E_GL_MNG_FWSM_RESET_CNT_SHIFT 16 +#define I40E_GL_MNG_FWSM_RESET_CNT_MASK I40E_MASK(0x7, I40E_GL_MNG_FWSM_RESET_CNT_SHIFT) +#define I40E_GL_MNG_FWSM_EXT_ERR_IND_SHIFT 19 +#define I40E_GL_MNG_FWSM_EXT_ERR_IND_MASK I40E_MASK(0x3F, I40E_GL_MNG_FWSM_EXT_ERR_IND_SHIFT) +#define I40E_GL_MNG_FWSM_PHY_SERDES0_CONFIG_ERR_SHIFT 26 +#define I40E_GL_MNG_FWSM_PHY_SERDES0_CONFIG_ERR_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES0_CONFIG_ERR_SHIFT) +#define I40E_GL_MNG_FWSM_PHY_SERDES1_CONFIG_ERR_SHIFT 27 +#define I40E_GL_MNG_FWSM_PHY_SERDES1_CONFIG_ERR_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES1_CONFIG_ERR_SHIFT) +#define I40E_GL_MNG_FWSM_PHY_SERDES2_CONFIG_ERR_SHIFT 28 +#define I40E_GL_MNG_FWSM_PHY_SERDES2_CONFIG_ERR_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES2_CONFIG_ERR_SHIFT) +#define I40E_GL_MNG_FWSM_PHY_SERDES3_CONFIG_ERR_SHIFT 29 +#define I40E_GL_MNG_FWSM_PHY_SERDES3_CONFIG_ERR_MASK I40E_MASK(0x1, I40E_GL_MNG_FWSM_PHY_SERDES3_CONFIG_ERR_SHIFT) +#define I40E_GL_MNG_HWARB_CTRL 0x000B6130 /* Reset: POR */ +#define I40E_GL_MNG_HWARB_CTRL_NCSI_ARB_EN_SHIFT 0 +#define I40E_GL_MNG_HWARB_CTRL_NCSI_ARB_EN_MASK I40E_MASK(0x1, I40E_GL_MNG_HWARB_CTRL_NCSI_ARB_EN_SHIFT) +#define 
I40E_PRT_MNG_FTFT_DATA(_i) (0x000852A0 + ((_i) * 32)) /* _i=0...31 */ /* Reset: POR */ +#define I40E_PRT_MNG_FTFT_DATA_MAX_INDEX 31 +#define I40E_PRT_MNG_FTFT_DATA_DWORD_SHIFT 0 +#define I40E_PRT_MNG_FTFT_DATA_DWORD_MASK I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_FTFT_DATA_DWORD_SHIFT) +#define I40E_PRT_MNG_FTFT_LENGTH 0x00085260 /* Reset: POR */ +#define I40E_PRT_MNG_FTFT_LENGTH_LENGTH_SHIFT 0 +#define I40E_PRT_MNG_FTFT_LENGTH_LENGTH_MASK I40E_MASK(0xFF, I40E_PRT_MNG_FTFT_LENGTH_LENGTH_SHIFT) +#define I40E_PRT_MNG_FTFT_MASK(_i) (0x00085160 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */ +#define I40E_PRT_MNG_FTFT_MASK_MAX_INDEX 7 +#define I40E_PRT_MNG_FTFT_MASK_MASK_SHIFT 0 +#define I40E_PRT_MNG_FTFT_MASK_MASK_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_FTFT_MASK_MASK_SHIFT) +#define I40E_PRT_MNG_MANC 0x00256A20 /* Reset: POR */ +#define I40E_PRT_MNG_MANC_FLOW_CONTROL_DISCARD_SHIFT 0 +#define I40E_PRT_MNG_MANC_FLOW_CONTROL_DISCARD_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_FLOW_CONTROL_DISCARD_SHIFT) +#define I40E_PRT_MNG_MANC_NCSI_DISCARD_SHIFT 1 +#define I40E_PRT_MNG_MANC_NCSI_DISCARD_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_NCSI_DISCARD_SHIFT) +#define I40E_PRT_MNG_MANC_RCV_TCO_EN_SHIFT 17 +#define I40E_PRT_MNG_MANC_RCV_TCO_EN_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_RCV_TCO_EN_SHIFT) +#define I40E_PRT_MNG_MANC_RCV_ALL_SHIFT 19 +#define I40E_PRT_MNG_MANC_RCV_ALL_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_RCV_ALL_SHIFT) +#define I40E_PRT_MNG_MANC_FIXED_NET_TYPE_SHIFT 25 +#define I40E_PRT_MNG_MANC_FIXED_NET_TYPE_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_FIXED_NET_TYPE_SHIFT) +#define I40E_PRT_MNG_MANC_NET_TYPE_SHIFT 26 +#define I40E_PRT_MNG_MANC_NET_TYPE_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_NET_TYPE_SHIFT) +#define I40E_PRT_MNG_MANC_EN_BMC2OS_SHIFT 28 +#define I40E_PRT_MNG_MANC_EN_BMC2OS_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_EN_BMC2OS_SHIFT) +#define I40E_PRT_MNG_MANC_EN_BMC2NET_SHIFT 29 +#define I40E_PRT_MNG_MANC_EN_BMC2NET_MASK I40E_MASK(0x1, I40E_PRT_MNG_MANC_EN_BMC2NET_SHIFT) +#define I40E_PRT_MNG_MAVTV(_i) (0x00255900 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */ +#define I40E_PRT_MNG_MAVTV_MAX_INDEX 7 +#define I40E_PRT_MNG_MAVTV_VID_SHIFT 0 +#define I40E_PRT_MNG_MAVTV_VID_MASK I40E_MASK(0xFFF, I40E_PRT_MNG_MAVTV_VID_SHIFT) +#define I40E_PRT_MNG_MDEF(_i) (0x00255D00 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */ +#define I40E_PRT_MNG_MDEF_MAX_INDEX 7 +#define I40E_PRT_MNG_MDEF_MAC_EXACT_AND_SHIFT 0 +#define I40E_PRT_MNG_MDEF_MAC_EXACT_AND_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_MAC_EXACT_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_BROADCAST_AND_SHIFT 4 +#define I40E_PRT_MNG_MDEF_BROADCAST_AND_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_BROADCAST_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_VLAN_AND_SHIFT 5 +#define I40E_PRT_MNG_MDEF_VLAN_AND_MASK I40E_MASK(0xFF, I40E_PRT_MNG_MDEF_VLAN_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_IPV4_ADDRESS_AND_SHIFT 13 +#define I40E_PRT_MNG_MDEF_IPV4_ADDRESS_AND_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_IPV4_ADDRESS_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_IPV6_ADDRESS_AND_SHIFT 17 +#define I40E_PRT_MNG_MDEF_IPV6_ADDRESS_AND_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_IPV6_ADDRESS_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_MAC_EXACT_OR_SHIFT 21 +#define I40E_PRT_MNG_MDEF_MAC_EXACT_OR_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_MAC_EXACT_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_BROADCAST_OR_SHIFT 25 +#define I40E_PRT_MNG_MDEF_BROADCAST_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_BROADCAST_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_MULTICAST_AND_SHIFT 26 +#define I40E_PRT_MNG_MDEF_MULTICAST_AND_MASK I40E_MASK(0x1, 
I40E_PRT_MNG_MDEF_MULTICAST_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_ARP_REQUEST_OR_SHIFT 27 +#define I40E_PRT_MNG_MDEF_ARP_REQUEST_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_ARP_REQUEST_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_ARP_RESPONSE_OR_SHIFT 28 +#define I40E_PRT_MNG_MDEF_ARP_RESPONSE_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_ARP_RESPONSE_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_SHIFT 29 +#define I40E_PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_NEIGHBOR_DISCOVERY_134_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_PORT_0X298_OR_SHIFT 30 +#define I40E_PRT_MNG_MDEF_PORT_0X298_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_PORT_0X298_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_PORT_0X26F_OR_SHIFT 31 +#define I40E_PRT_MNG_MDEF_PORT_0X26F_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_PORT_0X26F_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT(_i) (0x00255F00 + ((_i) * 32)) /* _i=0...7 */ /* Reset: POR */ +#define I40E_PRT_MNG_MDEF_EXT_MAX_INDEX 7 +#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_SHIFT 0 +#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_AND_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_SHIFT 4 +#define I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_MASK I40E_MASK(0xF, I40E_PRT_MNG_MDEF_EXT_L2_ETHERTYPE_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_FLEX_PORT_OR_SHIFT 8 +#define I40E_PRT_MNG_MDEF_EXT_FLEX_PORT_OR_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_MDEF_EXT_FLEX_PORT_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_FLEX_TCO_SHIFT 24 +#define I40E_PRT_MNG_MDEF_EXT_FLEX_TCO_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_FLEX_TCO_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_SHIFT 25 +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_135_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_SHIFT 26 +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_136_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_SHIFT 27 +#define I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_NEIGHBOR_DISCOVERY_137_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_ICMP_OR_SHIFT 28 +#define I40E_PRT_MNG_MDEF_EXT_ICMP_OR_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_ICMP_OR_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_MLD_SHIFT 29 +#define I40E_PRT_MNG_MDEF_EXT_MLD_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_MLD_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_SHIFT 30 +#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_APPLY_TO_NETWORK_TRAFFIC_SHIFT) +#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_SHIFT 31 +#define I40E_PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_MASK I40E_MASK(0x1, I40E_PRT_MNG_MDEF_EXT_APPLY_TO_HOST_TRAFFIC_SHIFT) +#define I40E_PRT_MNG_MDEFVSI(_i) (0x00256580 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_PRT_MNG_MDEFVSI_MAX_INDEX 3 +#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2N_SHIFT 0 +#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2N_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_MDEFVSI_MDEFVSI_2N_SHIFT) +#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2NP1_SHIFT 16 +#define I40E_PRT_MNG_MDEFVSI_MDEFVSI_2NP1_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_MDEFVSI_MDEFVSI_2NP1_SHIFT) +#define I40E_PRT_MNG_METF(_i) (0x00256780 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_PRT_MNG_METF_MAX_INDEX 3 +#define I40E_PRT_MNG_METF_ETYPE_SHIFT 0 +#define I40E_PRT_MNG_METF_ETYPE_MASK I40E_MASK(0xFFFF, 
I40E_PRT_MNG_METF_ETYPE_SHIFT) +#define I40E_PRT_MNG_METF_POLARITY_SHIFT 30 +#define I40E_PRT_MNG_METF_POLARITY_MASK I40E_MASK(0x1, I40E_PRT_MNG_METF_POLARITY_SHIFT) +#define I40E_PRT_MNG_MFUTP(_i) (0x00254E00 + ((_i) * 32)) /* _i=0...15 */ /* Reset: POR */ +#define I40E_PRT_MNG_MFUTP_MAX_INDEX 15 +#define I40E_PRT_MNG_MFUTP_MFUTP_N_SHIFT 0 +#define I40E_PRT_MNG_MFUTP_MFUTP_N_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_MFUTP_MFUTP_N_SHIFT) +#define I40E_PRT_MNG_MFUTP_UDP_SHIFT 16 +#define I40E_PRT_MNG_MFUTP_UDP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MFUTP_UDP_SHIFT) +#define I40E_PRT_MNG_MFUTP_TCP_SHIFT 17 +#define I40E_PRT_MNG_MFUTP_TCP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MFUTP_TCP_SHIFT) +#define I40E_PRT_MNG_MFUTP_SOURCE_DESTINATION_SHIFT 18 +#define I40E_PRT_MNG_MFUTP_SOURCE_DESTINATION_MASK I40E_MASK(0x1, I40E_PRT_MNG_MFUTP_SOURCE_DESTINATION_SHIFT) +#define I40E_PRT_MNG_MIPAF4(_i) (0x00256280 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_PRT_MNG_MIPAF4_MAX_INDEX 3 +#define I40E_PRT_MNG_MIPAF4_MIPAF_SHIFT 0 +#define I40E_PRT_MNG_MIPAF4_MIPAF_MASK I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_MIPAF4_MIPAF_SHIFT) +#define I40E_PRT_MNG_MIPAF6(_i) (0x00254200 + ((_i) * 32)) /* _i=0...15 */ /* Reset: POR */ +#define I40E_PRT_MNG_MIPAF6_MAX_INDEX 15 +#define I40E_PRT_MNG_MIPAF6_MIPAF_SHIFT 0 +#define I40E_PRT_MNG_MIPAF6_MIPAF_MASK I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_MIPAF6_MIPAF_SHIFT) +#define I40E_PRT_MNG_MMAH(_i) (0x00256380 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_PRT_MNG_MMAH_MAX_INDEX 3 +#define I40E_PRT_MNG_MMAH_MMAH_SHIFT 0 +#define I40E_PRT_MNG_MMAH_MMAH_MASK I40E_MASK(0xFFFF, I40E_PRT_MNG_MMAH_MMAH_SHIFT) +#define I40E_PRT_MNG_MMAL(_i) (0x00256480 + ((_i) * 32)) /* _i=0...3 */ /* Reset: POR */ +#define I40E_PRT_MNG_MMAL_MAX_INDEX 3 +#define I40E_PRT_MNG_MMAL_MMAL_SHIFT 0 +#define I40E_PRT_MNG_MMAL_MMAL_MASK I40E_MASK(0xFFFFFFFF, I40E_PRT_MNG_MMAL_MMAL_SHIFT) +#define I40E_PRT_MNG_MNGONLY 0x00256A60 /* Reset: POR */ +#define I40E_PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_SHIFT 0 +#define I40E_PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_MASK I40E_MASK(0xFF, I40E_PRT_MNG_MNGONLY_EXCLUSIVE_TO_MANAGEABILITY_SHIFT) +#define I40E_PRT_MNG_MSFM 0x00256AA0 /* Reset: POR */ +#define I40E_PRT_MNG_MSFM_PORT_26F_UDP_SHIFT 0 +#define I40E_PRT_MNG_MSFM_PORT_26F_UDP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_26F_UDP_SHIFT) +#define I40E_PRT_MNG_MSFM_PORT_26F_TCP_SHIFT 1 +#define I40E_PRT_MNG_MSFM_PORT_26F_TCP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_26F_TCP_SHIFT) +#define I40E_PRT_MNG_MSFM_PORT_298_UDP_SHIFT 2 +#define I40E_PRT_MNG_MSFM_PORT_298_UDP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_298_UDP_SHIFT) +#define I40E_PRT_MNG_MSFM_PORT_298_TCP_SHIFT 3 +#define I40E_PRT_MNG_MSFM_PORT_298_TCP_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_PORT_298_TCP_SHIFT) +#define I40E_PRT_MNG_MSFM_IPV6_0_MASK_SHIFT 4 +#define I40E_PRT_MNG_MSFM_IPV6_0_MASK_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_0_MASK_SHIFT) +#define I40E_PRT_MNG_MSFM_IPV6_1_MASK_SHIFT 5 +#define I40E_PRT_MNG_MSFM_IPV6_1_MASK_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_1_MASK_SHIFT) +#define I40E_PRT_MNG_MSFM_IPV6_2_MASK_SHIFT 6 +#define I40E_PRT_MNG_MSFM_IPV6_2_MASK_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_2_MASK_SHIFT) +#define I40E_PRT_MNG_MSFM_IPV6_3_MASK_SHIFT 7 +#define I40E_PRT_MNG_MSFM_IPV6_3_MASK_MASK I40E_MASK(0x1, I40E_PRT_MNG_MSFM_IPV6_3_MASK_SHIFT) +#define I40E_MSIX_PBA(_i) (0x00001000 + ((_i) * 4)) /* _i=0...5 */ /* Reset: FLR */ +#define I40E_MSIX_PBA_MAX_INDEX 5 +#define I40E_MSIX_PBA_PENBIT_SHIFT 0 +#define 
I40E_MSIX_PBA_PENBIT_MASK I40E_MASK(0xFFFFFFFF, I40E_MSIX_PBA_PENBIT_SHIFT) +#define I40E_MSIX_TADD(_i) (0x00000000 + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */ +#define I40E_MSIX_TADD_MAX_INDEX 128 +#define I40E_MSIX_TADD_MSIXTADD10_SHIFT 0 +#define I40E_MSIX_TADD_MSIXTADD10_MASK I40E_MASK(0x3, I40E_MSIX_TADD_MSIXTADD10_SHIFT) +#define I40E_MSIX_TADD_MSIXTADD_SHIFT 2 +#define I40E_MSIX_TADD_MSIXTADD_MASK I40E_MASK(0x3FFFFFFF, I40E_MSIX_TADD_MSIXTADD_SHIFT) +#define I40E_MSIX_TMSG(_i) (0x00000008 + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */ +#define I40E_MSIX_TMSG_MAX_INDEX 128 +#define I40E_MSIX_TMSG_MSIXTMSG_SHIFT 0 +#define I40E_MSIX_TMSG_MSIXTMSG_MASK I40E_MASK(0xFFFFFFFF, I40E_MSIX_TMSG_MSIXTMSG_SHIFT) +#define I40E_MSIX_TUADD(_i) (0x00000004 + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */ +#define I40E_MSIX_TUADD_MAX_INDEX 128 +#define I40E_MSIX_TUADD_MSIXTUADD_SHIFT 0 +#define I40E_MSIX_TUADD_MSIXTUADD_MASK I40E_MASK(0xFFFFFFFF, I40E_MSIX_TUADD_MSIXTUADD_SHIFT) +#define I40E_MSIX_TVCTRL(_i) (0x0000000C + ((_i) * 16)) /* _i=0...128 */ /* Reset: FLR */ +#define I40E_MSIX_TVCTRL_MAX_INDEX 128 +#define I40E_MSIX_TVCTRL_MASK_SHIFT 0 +#define I40E_MSIX_TVCTRL_MASK_MASK I40E_MASK(0x1, I40E_MSIX_TVCTRL_MASK_SHIFT) +#define I40E_VFMSIX_PBA1(_i) (0x00002000 + ((_i) * 4)) /* _i=0...19 */ /* Reset: VFLR */ +#define I40E_VFMSIX_PBA1_MAX_INDEX 19 +#define I40E_VFMSIX_PBA1_PENBIT_SHIFT 0 +#define I40E_VFMSIX_PBA1_PENBIT_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_PBA1_PENBIT_SHIFT) +#define I40E_VFMSIX_TADD1(_i) (0x00002100 + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TADD1_MAX_INDEX 639 +#define I40E_VFMSIX_TADD1_MSIXTADD10_SHIFT 0 +#define I40E_VFMSIX_TADD1_MSIXTADD10_MASK I40E_MASK(0x3, I40E_VFMSIX_TADD1_MSIXTADD10_SHIFT) +#define I40E_VFMSIX_TADD1_MSIXTADD_SHIFT 2 +#define I40E_VFMSIX_TADD1_MSIXTADD_MASK I40E_MASK(0x3FFFFFFF, I40E_VFMSIX_TADD1_MSIXTADD_SHIFT) +#define I40E_VFMSIX_TMSG1(_i) (0x00002108 + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TMSG1_MAX_INDEX 639 +#define I40E_VFMSIX_TMSG1_MSIXTMSG_SHIFT 0 +#define I40E_VFMSIX_TMSG1_MSIXTMSG_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TMSG1_MSIXTMSG_SHIFT) +#define I40E_VFMSIX_TUADD1(_i) (0x00002104 + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TUADD1_MAX_INDEX 639 +#define I40E_VFMSIX_TUADD1_MSIXTUADD_SHIFT 0 +#define I40E_VFMSIX_TUADD1_MSIXTUADD_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TUADD1_MSIXTUADD_SHIFT) +#define I40E_VFMSIX_TVCTRL1(_i) (0x0000210C + ((_i) * 16)) /* _i=0...639 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TVCTRL1_MAX_INDEX 639 +#define I40E_VFMSIX_TVCTRL1_MASK_SHIFT 0 +#define I40E_VFMSIX_TVCTRL1_MASK_MASK I40E_MASK(0x1, I40E_VFMSIX_TVCTRL1_MASK_SHIFT) +#define I40E_GLNVM_FLA 0x000B6108 /* Reset: POR */ +#define I40E_GLNVM_FLA_FL_SCK_SHIFT 0 +#define I40E_GLNVM_FLA_FL_SCK_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_SCK_SHIFT) +#define I40E_GLNVM_FLA_FL_CE_SHIFT 1 +#define I40E_GLNVM_FLA_FL_CE_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_CE_SHIFT) +#define I40E_GLNVM_FLA_FL_SI_SHIFT 2 +#define I40E_GLNVM_FLA_FL_SI_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_SI_SHIFT) +#define I40E_GLNVM_FLA_FL_SO_SHIFT 3 +#define I40E_GLNVM_FLA_FL_SO_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_SO_SHIFT) +#define I40E_GLNVM_FLA_FL_REQ_SHIFT 4 +#define I40E_GLNVM_FLA_FL_REQ_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_REQ_SHIFT) +#define I40E_GLNVM_FLA_FL_GNT_SHIFT 5 +#define I40E_GLNVM_FLA_FL_GNT_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_GNT_SHIFT) +#define I40E_GLNVM_FLA_LOCKED_SHIFT 6 +#define 
I40E_GLNVM_FLA_LOCKED_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_LOCKED_SHIFT) +#define I40E_GLNVM_FLA_FL_SADDR_SHIFT 18 +#define I40E_GLNVM_FLA_FL_SADDR_MASK I40E_MASK(0x7FF, I40E_GLNVM_FLA_FL_SADDR_SHIFT) +#define I40E_GLNVM_FLA_FL_BUSY_SHIFT 30 +#define I40E_GLNVM_FLA_FL_BUSY_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_BUSY_SHIFT) +#define I40E_GLNVM_FLA_FL_DER_SHIFT 31 +#define I40E_GLNVM_FLA_FL_DER_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_FL_DER_SHIFT) +#define I40E_GLNVM_FLASHID 0x000B6104 /* Reset: POR */ +#define I40E_GLNVM_FLASHID_FLASHID_SHIFT 0 +#define I40E_GLNVM_FLASHID_FLASHID_MASK I40E_MASK(0xFFFFFF, I40E_GLNVM_FLASHID_FLASHID_SHIFT) +#define I40E_GLNVM_FLASHID_FLEEP_PERF_SHIFT 31 +#define I40E_GLNVM_FLASHID_FLEEP_PERF_MASK I40E_MASK(0x1, I40E_GLNVM_FLASHID_FLEEP_PERF_SHIFT) +#define I40E_GLNVM_GENS 0x000B6100 /* Reset: POR */ +#define I40E_GLNVM_GENS_NVM_PRES_SHIFT 0 +#define I40E_GLNVM_GENS_NVM_PRES_MASK I40E_MASK(0x1, I40E_GLNVM_GENS_NVM_PRES_SHIFT) +#define I40E_GLNVM_GENS_SR_SIZE_SHIFT 5 +#define I40E_GLNVM_GENS_SR_SIZE_MASK I40E_MASK(0x7, I40E_GLNVM_GENS_SR_SIZE_SHIFT) +#define I40E_GLNVM_GENS_BANK1VAL_SHIFT 8 +#define I40E_GLNVM_GENS_BANK1VAL_MASK I40E_MASK(0x1, I40E_GLNVM_GENS_BANK1VAL_SHIFT) +#define I40E_GLNVM_GENS_ALT_PRST_SHIFT 23 +#define I40E_GLNVM_GENS_ALT_PRST_MASK I40E_MASK(0x1, I40E_GLNVM_GENS_ALT_PRST_SHIFT) +#define I40E_GLNVM_GENS_FL_AUTO_RD_SHIFT 25 +#define I40E_GLNVM_GENS_FL_AUTO_RD_MASK I40E_MASK(0x1, I40E_GLNVM_GENS_FL_AUTO_RD_SHIFT) +#define I40E_GLNVM_PROTCSR(_i) (0x000B6010 + ((_i) * 4)) /* _i=0...59 */ /* Reset: POR */ +#define I40E_GLNVM_PROTCSR_MAX_INDEX 59 +#define I40E_GLNVM_PROTCSR_ADDR_BLOCK_SHIFT 0 +#define I40E_GLNVM_PROTCSR_ADDR_BLOCK_MASK I40E_MASK(0xFFFFFF, I40E_GLNVM_PROTCSR_ADDR_BLOCK_SHIFT) +#define I40E_GLNVM_SRCTL 0x000B6110 /* Reset: POR */ +#define I40E_GLNVM_SRCTL_SRBUSY_SHIFT 0 +#define I40E_GLNVM_SRCTL_SRBUSY_MASK I40E_MASK(0x1, I40E_GLNVM_SRCTL_SRBUSY_SHIFT) +#define I40E_GLNVM_SRCTL_ADDR_SHIFT 14 +#define I40E_GLNVM_SRCTL_ADDR_MASK I40E_MASK(0x7FFF, I40E_GLNVM_SRCTL_ADDR_SHIFT) +#define I40E_GLNVM_SRCTL_WRITE_SHIFT 29 +#define I40E_GLNVM_SRCTL_WRITE_MASK I40E_MASK(0x1, I40E_GLNVM_SRCTL_WRITE_SHIFT) +#define I40E_GLNVM_SRCTL_START_SHIFT 30 +#define I40E_GLNVM_SRCTL_START_MASK I40E_MASK(0x1, I40E_GLNVM_SRCTL_START_SHIFT) +#define I40E_GLNVM_SRCTL_DONE_SHIFT 31 +#define I40E_GLNVM_SRCTL_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_SRCTL_DONE_SHIFT) +#define I40E_GLNVM_SRDATA 0x000B6114 /* Reset: POR */ +#define I40E_GLNVM_SRDATA_WRDATA_SHIFT 0 +#define I40E_GLNVM_SRDATA_WRDATA_MASK I40E_MASK(0xFFFF, I40E_GLNVM_SRDATA_WRDATA_SHIFT) +#define I40E_GLNVM_SRDATA_RDDATA_SHIFT 16 +#define I40E_GLNVM_SRDATA_RDDATA_MASK I40E_MASK(0xFFFF, I40E_GLNVM_SRDATA_RDDATA_SHIFT) +#define I40E_GLNVM_ULD 0x000B6008 /* Reset: POR */ +#define I40E_GLNVM_ULD_CONF_PCIR_DONE_SHIFT 0 +#define I40E_GLNVM_ULD_CONF_PCIR_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIR_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_PCIRTL_DONE_SHIFT 1 +#define I40E_GLNVM_ULD_CONF_PCIRTL_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIRTL_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_LCB_DONE_SHIFT 2 +#define I40E_GLNVM_ULD_CONF_LCB_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_LCB_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_CORE_DONE_SHIFT 3 +#define I40E_GLNVM_ULD_CONF_CORE_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_CORE_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_GLOBAL_DONE_SHIFT 4 +#define I40E_GLNVM_ULD_CONF_GLOBAL_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_GLOBAL_DONE_SHIFT) +#define 
I40E_GLNVM_ULD_CONF_POR_DONE_SHIFT 5 +#define I40E_GLNVM_ULD_CONF_POR_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_POR_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_PCIE_ANA_DONE_SHIFT 6 +#define I40E_GLNVM_ULD_CONF_PCIE_ANA_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIE_ANA_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_PHY_ANA_DONE_SHIFT 7 +#define I40E_GLNVM_ULD_CONF_PHY_ANA_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PHY_ANA_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_EMP_DONE_SHIFT 8 +#define I40E_GLNVM_ULD_CONF_EMP_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_EMP_DONE_SHIFT) +#define I40E_GLNVM_ULD_CONF_PCIALT_DONE_SHIFT 9 +#define I40E_GLNVM_ULD_CONF_PCIALT_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CONF_PCIALT_DONE_SHIFT) +#define I40E_GLPCI_BYTCTH 0x0009C484 /* Reset: PCIR */ +#define I40E_GLPCI_BYTCTH_PCI_COUNT_BW_BCT_SHIFT 0 +#define I40E_GLPCI_BYTCTH_PCI_COUNT_BW_BCT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_BYTCTH_PCI_COUNT_BW_BCT_SHIFT) +#define I40E_GLPCI_BYTCTL 0x0009C488 /* Reset: PCIR */ +#define I40E_GLPCI_BYTCTL_PCI_COUNT_BW_BCT_SHIFT 0 +#define I40E_GLPCI_BYTCTL_PCI_COUNT_BW_BCT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_BYTCTL_PCI_COUNT_BW_BCT_SHIFT) +#define I40E_GLPCI_CAPCTRL 0x000BE4A4 /* Reset: PCIR */ +#define I40E_GLPCI_CAPCTRL_VPD_EN_SHIFT 0 +#define I40E_GLPCI_CAPCTRL_VPD_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPCTRL_VPD_EN_SHIFT) +#define I40E_GLPCI_CAPSUP 0x000BE4A8 /* Reset: PCIR */ +#define I40E_GLPCI_CAPSUP_PCIE_VER_SHIFT 0 +#define I40E_GLPCI_CAPSUP_PCIE_VER_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_PCIE_VER_SHIFT) +#define I40E_GLPCI_CAPSUP_LTR_EN_SHIFT 2 +#define I40E_GLPCI_CAPSUP_LTR_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_LTR_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_TPH_EN_SHIFT 3 +#define I40E_GLPCI_CAPSUP_TPH_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_TPH_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_ARI_EN_SHIFT 4 +#define I40E_GLPCI_CAPSUP_ARI_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ARI_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_IOV_EN_SHIFT 5 +#define I40E_GLPCI_CAPSUP_IOV_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_IOV_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_ACS_EN_SHIFT 6 +#define I40E_GLPCI_CAPSUP_ACS_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ACS_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_SEC_EN_SHIFT 7 +#define I40E_GLPCI_CAPSUP_SEC_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_SEC_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_ECRC_GEN_EN_SHIFT 16 +#define I40E_GLPCI_CAPSUP_ECRC_GEN_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ECRC_GEN_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_ECRC_CHK_EN_SHIFT 17 +#define I40E_GLPCI_CAPSUP_ECRC_CHK_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_ECRC_CHK_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_IDO_EN_SHIFT 18 +#define I40E_GLPCI_CAPSUP_IDO_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_IDO_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_MSI_MASK_SHIFT 19 +#define I40E_GLPCI_CAPSUP_MSI_MASK_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_MSI_MASK_SHIFT) +#define I40E_GLPCI_CAPSUP_CSR_CONF_EN_SHIFT 20 +#define I40E_GLPCI_CAPSUP_CSR_CONF_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_CSR_CONF_EN_SHIFT) +#define I40E_GLPCI_CAPSUP_LOAD_SUBSYS_ID_SHIFT 30 +#define I40E_GLPCI_CAPSUP_LOAD_SUBSYS_ID_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_LOAD_SUBSYS_ID_SHIFT) +#define I40E_GLPCI_CAPSUP_LOAD_DEV_ID_SHIFT 31 +#define I40E_GLPCI_CAPSUP_LOAD_DEV_ID_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_LOAD_DEV_ID_SHIFT) +#define I40E_GLPCI_CNF 0x000BE4C0 /* Reset: POR */ +#define I40E_GLPCI_CNF_FLEX10_SHIFT 1 +#define I40E_GLPCI_CNF_FLEX10_MASK I40E_MASK(0x1, I40E_GLPCI_CNF_FLEX10_SHIFT) +#define I40E_GLPCI_CNF_WAKE_PIN_EN_SHIFT 2 +#define 
I40E_GLPCI_CNF_WAKE_PIN_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CNF_WAKE_PIN_EN_SHIFT) +#define I40E_GLPCI_CNF2 0x000BE494 /* Reset: PCIR */ +#define I40E_GLPCI_CNF2_RO_DIS_SHIFT 0 +#define I40E_GLPCI_CNF2_RO_DIS_MASK I40E_MASK(0x1, I40E_GLPCI_CNF2_RO_DIS_SHIFT) +#define I40E_GLPCI_CNF2_CACHELINE_SIZE_SHIFT 1 +#define I40E_GLPCI_CNF2_CACHELINE_SIZE_MASK I40E_MASK(0x1, I40E_GLPCI_CNF2_CACHELINE_SIZE_SHIFT) +#define I40E_GLPCI_CNF2_MSI_X_PF_N_SHIFT 2 +#define I40E_GLPCI_CNF2_MSI_X_PF_N_MASK I40E_MASK(0x7FF, I40E_GLPCI_CNF2_MSI_X_PF_N_SHIFT) +#define I40E_GLPCI_CNF2_MSI_X_VF_N_SHIFT 13 +#define I40E_GLPCI_CNF2_MSI_X_VF_N_MASK I40E_MASK(0x7FF, I40E_GLPCI_CNF2_MSI_X_VF_N_SHIFT) +#define I40E_GLPCI_DREVID 0x0009C480 /* Reset: PCIR */ +#define I40E_GLPCI_DREVID_DEFAULT_REVID_SHIFT 0 +#define I40E_GLPCI_DREVID_DEFAULT_REVID_MASK I40E_MASK(0xFF, I40E_GLPCI_DREVID_DEFAULT_REVID_SHIFT) +#define I40E_GLPCI_GSCL_1 0x0009C48C /* Reset: PCIR */ +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_0_SHIFT 0 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_0_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_0_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_1_SHIFT 1 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_1_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_1_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_2_SHIFT 2 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_2_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_2_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_3_SHIFT 3 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_EN_3_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_EN_3_SHIFT) +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_0_SHIFT 4 +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_0_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_0_SHIFT) +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_1_SHIFT 5 +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_1_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_1_SHIFT) +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_2_SHIFT 6 +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_2_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_2_SHIFT) +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_3_SHIFT 7 +#define I40E_GLPCI_GSCL_1_LBC_ENABLE_3_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_LBC_ENABLE_3_SHIFT) +#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EN_SHIFT 8 +#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EN_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EN_SHIFT) +#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EV_SHIFT 9 +#define I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EV_MASK I40E_MASK(0x1F, I40E_GLPCI_GSCL_1_PCI_COUNT_LAT_EV_SHIFT) +#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EN_SHIFT 14 +#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EN_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EN_SHIFT) +#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EV_SHIFT 15 +#define I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EV_MASK I40E_MASK(0x1F, I40E_GLPCI_GSCL_1_PCI_COUNT_BW_EV_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_64_BIT_EN_SHIFT 28 +#define I40E_GLPCI_GSCL_1_GIO_64_BIT_EN_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_64_BIT_EN_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_RESET_SHIFT 29 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_RESET_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_RESET_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_STOP_SHIFT 30 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_STOP_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_STOP_SHIFT) +#define I40E_GLPCI_GSCL_1_GIO_COUNT_START_SHIFT 31 +#define I40E_GLPCI_GSCL_1_GIO_COUNT_START_MASK I40E_MASK(0x1, I40E_GLPCI_GSCL_1_GIO_COUNT_START_SHIFT) +#define I40E_GLPCI_GSCL_2 0x0009C490 /* Reset: PCIR */ +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_0_SHIFT 0 +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_0_MASK I40E_MASK(0xFF, 
I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_0_SHIFT) +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_1_SHIFT 8 +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_1_MASK I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_1_SHIFT) +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_2_SHIFT 16 +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_2_MASK I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_2_SHIFT) +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_3_SHIFT 24 +#define I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_3_MASK I40E_MASK(0xFF, I40E_GLPCI_GSCL_2_GIO_EVENT_NUM_3_SHIFT) +#define I40E_GLPCI_GSCL_5_8(_i) (0x0009C494 + ((_i) * 4)) /* _i=0...3 */ /* Reset: PCIR */ +#define I40E_GLPCI_GSCL_5_8_MAX_INDEX 3 +#define I40E_GLPCI_GSCL_5_8_LBC_THRESHOLD_N_SHIFT 0 +#define I40E_GLPCI_GSCL_5_8_LBC_THRESHOLD_N_MASK I40E_MASK(0xFFFF, I40E_GLPCI_GSCL_5_8_LBC_THRESHOLD_N_SHIFT) +#define I40E_GLPCI_GSCL_5_8_LBC_TIMER_N_SHIFT 16 +#define I40E_GLPCI_GSCL_5_8_LBC_TIMER_N_MASK I40E_MASK(0xFFFF, I40E_GLPCI_GSCL_5_8_LBC_TIMER_N_SHIFT) +#define I40E_GLPCI_GSCN_0_3(_i) (0x0009C4A4 + ((_i) * 4)) /* _i=0...3 */ /* Reset: PCIR */ +#define I40E_GLPCI_GSCN_0_3_MAX_INDEX 3 +#define I40E_GLPCI_GSCN_0_3_EVENT_COUNTER_SHIFT 0 +#define I40E_GLPCI_GSCN_0_3_EVENT_COUNTER_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_GSCN_0_3_EVENT_COUNTER_SHIFT) +#define I40E_GLPCI_LBARCTRL 0x000BE484 /* Reset: POR */ +#define I40E_GLPCI_LBARCTRL_PREFBAR_SHIFT 0 +#define I40E_GLPCI_LBARCTRL_PREFBAR_MASK I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_PREFBAR_SHIFT) +#define I40E_GLPCI_LBARCTRL_BAR32_SHIFT 1 +#define I40E_GLPCI_LBARCTRL_BAR32_MASK I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_BAR32_SHIFT) +#define I40E_GLPCI_LBARCTRL_FLASH_EXPOSE_SHIFT 3 +#define I40E_GLPCI_LBARCTRL_FLASH_EXPOSE_MASK I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_FLASH_EXPOSE_SHIFT) +#define I40E_GLPCI_LBARCTRL_RSVD_4_SHIFT 4 +#define I40E_GLPCI_LBARCTRL_RSVD_4_MASK I40E_MASK(0x3, I40E_GLPCI_LBARCTRL_RSVD_4_SHIFT) +#define I40E_GLPCI_LBARCTRL_FL_SIZE_SHIFT 6 +#define I40E_GLPCI_LBARCTRL_FL_SIZE_MASK I40E_MASK(0x7, I40E_GLPCI_LBARCTRL_FL_SIZE_SHIFT) +#define I40E_GLPCI_LBARCTRL_RSVD_10_SHIFT 10 +#define I40E_GLPCI_LBARCTRL_RSVD_10_MASK I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_RSVD_10_SHIFT) +#define I40E_GLPCI_LBARCTRL_EXROM_SIZE_SHIFT 11 +#define I40E_GLPCI_LBARCTRL_EXROM_SIZE_MASK I40E_MASK(0x7, I40E_GLPCI_LBARCTRL_EXROM_SIZE_SHIFT) +#define I40E_GLPCI_LINKCAP 0x000BE4AC /* Reset: PCIR */ +#define I40E_GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_SHIFT 0 +#define I40E_GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_MASK I40E_MASK(0x3F, I40E_GLPCI_LINKCAP_LINK_SPEEDS_VECTOR_SHIFT) +#define I40E_GLPCI_LINKCAP_MAX_PAYLOAD_SHIFT 6 +#define I40E_GLPCI_LINKCAP_MAX_PAYLOAD_MASK I40E_MASK(0x7, I40E_GLPCI_LINKCAP_MAX_PAYLOAD_SHIFT) +#define I40E_GLPCI_LINKCAP_MAX_LINK_WIDTH_SHIFT 9 +#define I40E_GLPCI_LINKCAP_MAX_LINK_WIDTH_MASK I40E_MASK(0xF, I40E_GLPCI_LINKCAP_MAX_LINK_WIDTH_SHIFT) +#define I40E_GLPCI_PCIERR 0x000BE4FC /* Reset: PCIR */ +#define I40E_GLPCI_PCIERR_PCIE_ERR_REP_SHIFT 0 +#define I40E_GLPCI_PCIERR_PCIE_ERR_REP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_PCIERR_PCIE_ERR_REP_SHIFT) +#define I40E_GLPCI_PKTCT 0x0009C4BC /* Reset: PCIR */ +#define I40E_GLPCI_PKTCT_PCI_COUNT_BW_PCT_SHIFT 0 +#define I40E_GLPCI_PKTCT_PCI_COUNT_BW_PCT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_PKTCT_PCI_COUNT_BW_PCT_SHIFT) +#define I40E_GLPCI_PM_MUX_NPQ 0x0009C4F4 /* Reset: PCIR */ +#define I40E_GLPCI_PM_MUX_NPQ_NPQ_NUM_PORT_SEL_SHIFT 0 +#define I40E_GLPCI_PM_MUX_NPQ_NPQ_NUM_PORT_SEL_MASK I40E_MASK(0x7, I40E_GLPCI_PM_MUX_NPQ_NPQ_NUM_PORT_SEL_SHIFT) +#define I40E_GLPCI_PM_MUX_NPQ_INNER_NPQ_SEL_SHIFT 16 +#define 
I40E_GLPCI_PM_MUX_NPQ_INNER_NPQ_SEL_MASK I40E_MASK(0x1F, I40E_GLPCI_PM_MUX_NPQ_INNER_NPQ_SEL_SHIFT) +#define I40E_GLPCI_PM_MUX_PFB 0x0009C4F0 /* Reset: PCIR */ +#define I40E_GLPCI_PM_MUX_PFB_PFB_PORT_SEL_SHIFT 0 +#define I40E_GLPCI_PM_MUX_PFB_PFB_PORT_SEL_MASK I40E_MASK(0x1F, I40E_GLPCI_PM_MUX_PFB_PFB_PORT_SEL_SHIFT) +#define I40E_GLPCI_PM_MUX_PFB_INNER_PORT_SEL_SHIFT 16 +#define I40E_GLPCI_PM_MUX_PFB_INNER_PORT_SEL_MASK I40E_MASK(0x7, I40E_GLPCI_PM_MUX_PFB_INNER_PORT_SEL_SHIFT) +#define I40E_GLPCI_PMSUP 0x000BE4B0 /* Reset: PCIR */ +#define I40E_GLPCI_PMSUP_ASPM_SUP_SHIFT 0 +#define I40E_GLPCI_PMSUP_ASPM_SUP_MASK I40E_MASK(0x3, I40E_GLPCI_PMSUP_ASPM_SUP_SHIFT) +#define I40E_GLPCI_PMSUP_L0S_EXIT_LAT_SHIFT 2 +#define I40E_GLPCI_PMSUP_L0S_EXIT_LAT_MASK I40E_MASK(0x7, I40E_GLPCI_PMSUP_L0S_EXIT_LAT_SHIFT) +#define I40E_GLPCI_PMSUP_L1_EXIT_LAT_SHIFT 5 +#define I40E_GLPCI_PMSUP_L1_EXIT_LAT_MASK I40E_MASK(0x7, I40E_GLPCI_PMSUP_L1_EXIT_LAT_SHIFT) +#define I40E_GLPCI_PMSUP_L0S_ACC_LAT_SHIFT 8 +#define I40E_GLPCI_PMSUP_L0S_ACC_LAT_MASK I40E_MASK(0x7, I40E_GLPCI_PMSUP_L0S_ACC_LAT_SHIFT) +#define I40E_GLPCI_PMSUP_L1_ACC_LAT_SHIFT 11 +#define I40E_GLPCI_PMSUP_L1_ACC_LAT_MASK I40E_MASK(0x7, I40E_GLPCI_PMSUP_L1_ACC_LAT_SHIFT) +#define I40E_GLPCI_PMSUP_SLOT_CLK_SHIFT 14 +#define I40E_GLPCI_PMSUP_SLOT_CLK_MASK I40E_MASK(0x1, I40E_GLPCI_PMSUP_SLOT_CLK_SHIFT) +#define I40E_GLPCI_PMSUP_OBFF_SUP_SHIFT 15 +#define I40E_GLPCI_PMSUP_OBFF_SUP_MASK I40E_MASK(0x3, I40E_GLPCI_PMSUP_OBFF_SUP_SHIFT) +#define I40E_GLPCI_PQ_MAX_USED_SPC 0x0009C4EC /* Reset: PCIR */ +#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_12_SHIFT 0 +#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_12_MASK I40E_MASK(0xFF, I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_12_SHIFT) +#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_13_SHIFT 8 +#define I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_13_MASK I40E_MASK(0xFF, I40E_GLPCI_PQ_MAX_USED_SPC_GLPCI_PQ_MAX_USED_SPC_13_SHIFT) +#define I40E_GLPCI_PWRDATA 0x000BE490 /* Reset: PCIR */ +#define I40E_GLPCI_PWRDATA_D0_POWER_SHIFT 0 +#define I40E_GLPCI_PWRDATA_D0_POWER_MASK I40E_MASK(0xFF, I40E_GLPCI_PWRDATA_D0_POWER_SHIFT) +#define I40E_GLPCI_PWRDATA_COMM_POWER_SHIFT 8 +#define I40E_GLPCI_PWRDATA_COMM_POWER_MASK I40E_MASK(0xFF, I40E_GLPCI_PWRDATA_COMM_POWER_SHIFT) +#define I40E_GLPCI_PWRDATA_D3_POWER_SHIFT 16 +#define I40E_GLPCI_PWRDATA_D3_POWER_MASK I40E_MASK(0xFF, I40E_GLPCI_PWRDATA_D3_POWER_SHIFT) +#define I40E_GLPCI_PWRDATA_DATA_SCALE_SHIFT 24 +#define I40E_GLPCI_PWRDATA_DATA_SCALE_MASK I40E_MASK(0x3, I40E_GLPCI_PWRDATA_DATA_SCALE_SHIFT) +#define I40E_GLPCI_REVID 0x000BE4B4 /* Reset: PCIR */ +#define I40E_GLPCI_REVID_NVM_REVID_SHIFT 0 +#define I40E_GLPCI_REVID_NVM_REVID_MASK I40E_MASK(0xFF, I40E_GLPCI_REVID_NVM_REVID_SHIFT) +#define I40E_GLPCI_SERH 0x000BE49C /* Reset: PCIR */ +#define I40E_GLPCI_SERH_SER_NUM_H_SHIFT 0 +#define I40E_GLPCI_SERH_SER_NUM_H_MASK I40E_MASK(0xFFFF, I40E_GLPCI_SERH_SER_NUM_H_SHIFT) +#define I40E_GLPCI_SERL 0x000BE498 /* Reset: PCIR */ +#define I40E_GLPCI_SERL_SER_NUM_L_SHIFT 0 +#define I40E_GLPCI_SERL_SER_NUM_L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_SERL_SER_NUM_L_SHIFT) +#define I40E_GLPCI_SPARE_BITS_0 0x0009C4F8 /* Reset: PCIR */ +#define I40E_GLPCI_SPARE_BITS_0_SPARE_BITS_SHIFT 0 +#define I40E_GLPCI_SPARE_BITS_0_SPARE_BITS_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_SPARE_BITS_0_SPARE_BITS_SHIFT) +#define I40E_GLPCI_SPARE_BITS_1 0x0009C4FC /* Reset: PCIR */ +#define I40E_GLPCI_SPARE_BITS_1_SPARE_BITS_SHIFT 0 +#define 
I40E_GLPCI_SPARE_BITS_1_SPARE_BITS_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPCI_SPARE_BITS_1_SPARE_BITS_SHIFT) +#define I40E_GLPCI_SUBVENID 0x000BE48C /* Reset: PCIR */ +#define I40E_GLPCI_SUBVENID_SUB_VEN_ID_SHIFT 0 +#define I40E_GLPCI_SUBVENID_SUB_VEN_ID_MASK I40E_MASK(0xFFFF, I40E_GLPCI_SUBVENID_SUB_VEN_ID_SHIFT) +#define I40E_GLPCI_UPADD 0x000BE4F8 /* Reset: PCIR */ +#define I40E_GLPCI_UPADD_ADDRESS_SHIFT 1 +#define I40E_GLPCI_UPADD_ADDRESS_MASK I40E_MASK(0x7FFFFFFF, I40E_GLPCI_UPADD_ADDRESS_SHIFT) +#define I40E_GLPCI_VENDORID 0x000BE518 /* Reset: PCIR */ +#define I40E_GLPCI_VENDORID_VENDORID_SHIFT 0 +#define I40E_GLPCI_VENDORID_VENDORID_MASK I40E_MASK(0xFFFF, I40E_GLPCI_VENDORID_VENDORID_SHIFT) +#define I40E_GLPCI_VFSUP 0x000BE4B8 /* Reset: PCIR */ +#define I40E_GLPCI_VFSUP_VF_PREFETCH_SHIFT 0 +#define I40E_GLPCI_VFSUP_VF_PREFETCH_MASK I40E_MASK(0x1, I40E_GLPCI_VFSUP_VF_PREFETCH_SHIFT) +#define I40E_GLPCI_VFSUP_VR_BAR_TYPE_SHIFT 1 +#define I40E_GLPCI_VFSUP_VR_BAR_TYPE_MASK I40E_MASK(0x1, I40E_GLPCI_VFSUP_VR_BAR_TYPE_SHIFT) +#define I40E_GLTPH_CTRL 0x000BE480 /* Reset: PCIR */ +#define I40E_GLTPH_CTRL_DESC_PH_SHIFT 9 +#define I40E_GLTPH_CTRL_DESC_PH_MASK I40E_MASK(0x3, I40E_GLTPH_CTRL_DESC_PH_SHIFT) +#define I40E_GLTPH_CTRL_DATA_PH_SHIFT 11 +#define I40E_GLTPH_CTRL_DATA_PH_MASK I40E_MASK(0x3, I40E_GLTPH_CTRL_DATA_PH_SHIFT) +#define I40E_PF_FUNC_RID 0x0009C000 /* Reset: PCIR */ +#define I40E_PF_FUNC_RID_FUNCTION_NUMBER_SHIFT 0 +#define I40E_PF_FUNC_RID_FUNCTION_NUMBER_MASK I40E_MASK(0x7, I40E_PF_FUNC_RID_FUNCTION_NUMBER_SHIFT) +#define I40E_PF_FUNC_RID_DEVICE_NUMBER_SHIFT 3 +#define I40E_PF_FUNC_RID_DEVICE_NUMBER_MASK I40E_MASK(0x1F, I40E_PF_FUNC_RID_DEVICE_NUMBER_SHIFT) +#define I40E_PF_FUNC_RID_BUS_NUMBER_SHIFT 8 +#define I40E_PF_FUNC_RID_BUS_NUMBER_MASK I40E_MASK(0xFF, I40E_PF_FUNC_RID_BUS_NUMBER_SHIFT) +#define I40E_PF_PCI_CIAA 0x0009C080 /* Reset: FLR */ +#define I40E_PF_PCI_CIAA_ADDRESS_SHIFT 0 +#define I40E_PF_PCI_CIAA_ADDRESS_MASK I40E_MASK(0xFFF, I40E_PF_PCI_CIAA_ADDRESS_SHIFT) +#define I40E_PF_PCI_CIAA_VF_NUM_SHIFT 12 +#define I40E_PF_PCI_CIAA_VF_NUM_MASK I40E_MASK(0x7F, I40E_PF_PCI_CIAA_VF_NUM_SHIFT) +#define I40E_PF_PCI_CIAD 0x0009C100 /* Reset: FLR */ +#define I40E_PF_PCI_CIAD_DATA_SHIFT 0 +#define I40E_PF_PCI_CIAD_DATA_MASK I40E_MASK(0xFFFFFFFF, I40E_PF_PCI_CIAD_DATA_SHIFT) +#define I40E_PFPCI_CLASS 0x000BE400 /* Reset: PCIR */ +#define I40E_PFPCI_CLASS_STORAGE_CLASS_SHIFT 0 +#define I40E_PFPCI_CLASS_STORAGE_CLASS_MASK I40E_MASK(0x1, I40E_PFPCI_CLASS_STORAGE_CLASS_SHIFT) +#define I40E_PFPCI_CLASS_RESERVED_1_SHIFT 1 +#define I40E_PFPCI_CLASS_RESERVED_1_MASK I40E_MASK(0x1, I40E_PFPCI_CLASS_RESERVED_1_SHIFT) +#define I40E_PFPCI_CLASS_PF_IS_LAN_SHIFT 2 +#define I40E_PFPCI_CLASS_PF_IS_LAN_MASK I40E_MASK(0x1, I40E_PFPCI_CLASS_PF_IS_LAN_SHIFT) +#define I40E_PFPCI_CNF 0x000BE000 /* Reset: PCIR */ +#define I40E_PFPCI_CNF_MSI_EN_SHIFT 2 +#define I40E_PFPCI_CNF_MSI_EN_MASK I40E_MASK(0x1, I40E_PFPCI_CNF_MSI_EN_SHIFT) +#define I40E_PFPCI_CNF_EXROM_DIS_SHIFT 3 +#define I40E_PFPCI_CNF_EXROM_DIS_MASK I40E_MASK(0x1, I40E_PFPCI_CNF_EXROM_DIS_SHIFT) +#define I40E_PFPCI_CNF_IO_BAR_SHIFT 4 +#define I40E_PFPCI_CNF_IO_BAR_MASK I40E_MASK(0x1, I40E_PFPCI_CNF_IO_BAR_SHIFT) +#define I40E_PFPCI_CNF_INT_PIN_SHIFT 5 +#define I40E_PFPCI_CNF_INT_PIN_MASK I40E_MASK(0x3, I40E_PFPCI_CNF_INT_PIN_SHIFT) +#define I40E_PFPCI_DEVID 0x000BE080 /* Reset: PCIR */ +#define I40E_PFPCI_DEVID_PF_DEV_ID_SHIFT 0 +#define I40E_PFPCI_DEVID_PF_DEV_ID_MASK I40E_MASK(0xFFFF, I40E_PFPCI_DEVID_PF_DEV_ID_SHIFT) +#define 
I40E_PFPCI_DEVID_VF_DEV_ID_SHIFT 16 +#define I40E_PFPCI_DEVID_VF_DEV_ID_MASK I40E_MASK(0xFFFF, I40E_PFPCI_DEVID_VF_DEV_ID_SHIFT) +#define I40E_PFPCI_FACTPS 0x0009C180 /* Reset: FLR */ +#define I40E_PFPCI_FACTPS_FUNC_POWER_STATE_SHIFT 0 +#define I40E_PFPCI_FACTPS_FUNC_POWER_STATE_MASK I40E_MASK(0x3, I40E_PFPCI_FACTPS_FUNC_POWER_STATE_SHIFT) +#define I40E_PFPCI_FACTPS_FUNC_AUX_EN_SHIFT 3 +#define I40E_PFPCI_FACTPS_FUNC_AUX_EN_MASK I40E_MASK(0x1, I40E_PFPCI_FACTPS_FUNC_AUX_EN_SHIFT) +#define I40E_PFPCI_FUNC 0x000BE200 /* Reset: POR */ +#define I40E_PFPCI_FUNC_FUNC_DIS_SHIFT 0 +#define I40E_PFPCI_FUNC_FUNC_DIS_MASK I40E_MASK(0x1, I40E_PFPCI_FUNC_FUNC_DIS_SHIFT) +#define I40E_PFPCI_FUNC_ALLOW_FUNC_DIS_SHIFT 1 +#define I40E_PFPCI_FUNC_ALLOW_FUNC_DIS_MASK I40E_MASK(0x1, I40E_PFPCI_FUNC_ALLOW_FUNC_DIS_SHIFT) +#define I40E_PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_SHIFT 2 +#define I40E_PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_MASK I40E_MASK(0x1, I40E_PFPCI_FUNC_DIS_FUNC_ON_PORT_DIS_SHIFT) +#define I40E_PFPCI_FUNC2 0x000BE180 /* Reset: PCIR */ +#define I40E_PFPCI_FUNC2_EMP_FUNC_DIS_SHIFT 0 +#define I40E_PFPCI_FUNC2_EMP_FUNC_DIS_MASK I40E_MASK(0x1, I40E_PFPCI_FUNC2_EMP_FUNC_DIS_SHIFT) +#define I40E_PFPCI_ICAUSE 0x0009C200 /* Reset: PFR */ +#define I40E_PFPCI_ICAUSE_PCIE_ERR_CAUSE_SHIFT 0 +#define I40E_PFPCI_ICAUSE_PCIE_ERR_CAUSE_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPCI_ICAUSE_PCIE_ERR_CAUSE_SHIFT) +#define I40E_PFPCI_IENA 0x0009C280 /* Reset: PFR */ +#define I40E_PFPCI_IENA_PCIE_ERR_EN_SHIFT 0 +#define I40E_PFPCI_IENA_PCIE_ERR_EN_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPCI_IENA_PCIE_ERR_EN_SHIFT) +#define I40E_PFPCI_PF_FLUSH_DONE 0x0009C800 /* Reset: PCIR */ +#define I40E_PFPCI_PF_FLUSH_DONE_FLUSH_DONE_SHIFT 0 +#define I40E_PFPCI_PF_FLUSH_DONE_FLUSH_DONE_MASK I40E_MASK(0x1, I40E_PFPCI_PF_FLUSH_DONE_FLUSH_DONE_SHIFT) +#define I40E_PFPCI_PM 0x000BE300 /* Reset: POR */ +#define I40E_PFPCI_PM_PME_EN_SHIFT 0 +#define I40E_PFPCI_PM_PME_EN_MASK I40E_MASK(0x1, I40E_PFPCI_PM_PME_EN_SHIFT) +#define I40E_PFPCI_STATUS1 0x000BE280 /* Reset: POR */ +#define I40E_PFPCI_STATUS1_FUNC_VALID_SHIFT 0 +#define I40E_PFPCI_STATUS1_FUNC_VALID_MASK I40E_MASK(0x1, I40E_PFPCI_STATUS1_FUNC_VALID_SHIFT) +#define I40E_PFPCI_SUBSYSID 0x000BE100 /* Reset: PCIR */ +#define I40E_PFPCI_SUBSYSID_PF_SUBSYS_ID_SHIFT 0 +#define I40E_PFPCI_SUBSYSID_PF_SUBSYS_ID_MASK I40E_MASK(0xFFFF, I40E_PFPCI_SUBSYSID_PF_SUBSYS_ID_SHIFT) +#define I40E_PFPCI_SUBSYSID_VF_SUBSYS_ID_SHIFT 16 +#define I40E_PFPCI_SUBSYSID_VF_SUBSYS_ID_MASK I40E_MASK(0xFFFF, I40E_PFPCI_SUBSYSID_VF_SUBSYS_ID_SHIFT) +#define I40E_PFPCI_VF_FLUSH_DONE 0x0000E400 /* Reset: PCIR */ +#define I40E_PFPCI_VF_FLUSH_DONE_FLUSH_DONE_SHIFT 0 +#define I40E_PFPCI_VF_FLUSH_DONE_FLUSH_DONE_MASK I40E_MASK(0x1, I40E_PFPCI_VF_FLUSH_DONE_FLUSH_DONE_SHIFT) +#define I40E_PFPCI_VF_FLUSH_DONE1(_VF) (0x0009C600 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: PCIR */ +#define I40E_PFPCI_VF_FLUSH_DONE1_MAX_INDEX 127 +#define I40E_PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_SHIFT 0 +#define I40E_PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_MASK I40E_MASK(0x1, I40E_PFPCI_VF_FLUSH_DONE1_FLUSH_DONE_SHIFT) +#define I40E_PFPCI_VM_FLUSH_DONE 0x0009C880 /* Reset: PCIR */ +#define I40E_PFPCI_VM_FLUSH_DONE_FLUSH_DONE_SHIFT 0 +#define I40E_PFPCI_VM_FLUSH_DONE_FLUSH_DONE_MASK I40E_MASK(0x1, I40E_PFPCI_VM_FLUSH_DONE_FLUSH_DONE_SHIFT) +#define I40E_PFPCI_VMINDEX 0x0009C300 /* Reset: PCIR */ +#define I40E_PFPCI_VMINDEX_VMINDEX_SHIFT 0 +#define I40E_PFPCI_VMINDEX_VMINDEX_MASK I40E_MASK(0x1FF, I40E_PFPCI_VMINDEX_VMINDEX_SHIFT) +#define I40E_PFPCI_VMPEND 0x0009C380 /* Reset: PCIR 
*/ +#define I40E_PFPCI_VMPEND_PENDING_SHIFT 0 +#define I40E_PFPCI_VMPEND_PENDING_MASK I40E_MASK(0x1, I40E_PFPCI_VMPEND_PENDING_SHIFT) +#define I40E_PRTPM_EEE_STAT 0x001E4320 /* Reset: GLOBR */ +#define I40E_PRTPM_EEE_STAT_EEE_NEG_SHIFT 29 +#define I40E_PRTPM_EEE_STAT_EEE_NEG_MASK I40E_MASK(0x1, I40E_PRTPM_EEE_STAT_EEE_NEG_SHIFT) +#define I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_SHIFT 30 +#define I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_MASK I40E_MASK(0x1, I40E_PRTPM_EEE_STAT_RX_LPI_STATUS_SHIFT) +#define I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_SHIFT 31 +#define I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_MASK I40E_MASK(0x1, I40E_PRTPM_EEE_STAT_TX_LPI_STATUS_SHIFT) +#define I40E_PRTPM_EEEC 0x001E4380 /* Reset: GLOBR */ +#define I40E_PRTPM_EEEC_TW_WAKE_MIN_SHIFT 16 +#define I40E_PRTPM_EEEC_TW_WAKE_MIN_MASK I40E_MASK(0x3F, I40E_PRTPM_EEEC_TW_WAKE_MIN_SHIFT) +#define I40E_PRTPM_EEEC_TX_LU_LPI_DLY_SHIFT 24 +#define I40E_PRTPM_EEEC_TX_LU_LPI_DLY_MASK I40E_MASK(0x3, I40E_PRTPM_EEEC_TX_LU_LPI_DLY_SHIFT) +#define I40E_PRTPM_EEEC_TEEE_DLY_SHIFT 26 +#define I40E_PRTPM_EEEC_TEEE_DLY_MASK I40E_MASK(0x3F, I40E_PRTPM_EEEC_TEEE_DLY_SHIFT) +#define I40E_PRTPM_EEEFWD 0x001E4400 /* Reset: GLOBR */ +#define I40E_PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_SHIFT 31 +#define I40E_PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_MASK I40E_MASK(0x1, I40E_PRTPM_EEEFWD_EEE_FW_CONFIG_DONE_SHIFT) +#define I40E_PRTPM_EEER 0x001E4360 /* Reset: GLOBR */ +#define I40E_PRTPM_EEER_TW_SYSTEM_SHIFT 0 +#define I40E_PRTPM_EEER_TW_SYSTEM_MASK I40E_MASK(0xFFFF, I40E_PRTPM_EEER_TW_SYSTEM_SHIFT) +#define I40E_PRTPM_EEER_TX_LPI_EN_SHIFT 16 +#define I40E_PRTPM_EEER_TX_LPI_EN_MASK I40E_MASK(0x1, I40E_PRTPM_EEER_TX_LPI_EN_SHIFT) +#define I40E_PRTPM_EEETXC 0x001E43E0 /* Reset: GLOBR */ +#define I40E_PRTPM_EEETXC_TW_PHY_SHIFT 0 +#define I40E_PRTPM_EEETXC_TW_PHY_MASK I40E_MASK(0xFFFF, I40E_PRTPM_EEETXC_TW_PHY_SHIFT) +#define I40E_PRTPM_GC 0x000B8140 /* Reset: POR */ +#define I40E_PRTPM_GC_EMP_LINK_ON_SHIFT 0 +#define I40E_PRTPM_GC_EMP_LINK_ON_MASK I40E_MASK(0x1, I40E_PRTPM_GC_EMP_LINK_ON_SHIFT) +#define I40E_PRTPM_GC_MNG_VETO_SHIFT 1 +#define I40E_PRTPM_GC_MNG_VETO_MASK I40E_MASK(0x1, I40E_PRTPM_GC_MNG_VETO_SHIFT) +#define I40E_PRTPM_GC_RATD_SHIFT 2 +#define I40E_PRTPM_GC_RATD_MASK I40E_MASK(0x1, I40E_PRTPM_GC_RATD_SHIFT) +#define I40E_PRTPM_GC_LCDMP_SHIFT 3 +#define I40E_PRTPM_GC_LCDMP_MASK I40E_MASK(0x1, I40E_PRTPM_GC_LCDMP_SHIFT) +#define I40E_PRTPM_GC_LPLU_ASSERTED_SHIFT 31 +#define I40E_PRTPM_GC_LPLU_ASSERTED_MASK I40E_MASK(0x1, I40E_PRTPM_GC_LPLU_ASSERTED_SHIFT) +#define I40E_PRTPM_RLPIC 0x001E43A0 /* Reset: GLOBR */ +#define I40E_PRTPM_RLPIC_ERLPIC_SHIFT 0 +#define I40E_PRTPM_RLPIC_ERLPIC_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTPM_RLPIC_ERLPIC_SHIFT) +#define I40E_PRTPM_TLPIC 0x001E43C0 /* Reset: GLOBR */ +#define I40E_PRTPM_TLPIC_ETLPIC_SHIFT 0 +#define I40E_PRTPM_TLPIC_ETLPIC_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTPM_TLPIC_ETLPIC_SHIFT) +#define I40E_GL_PRS_FVBM(_i) (0x00269760 + ((_i) * 4)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GL_PRS_FVBM_MAX_INDEX 3 +#define I40E_GL_PRS_FVBM_FV_BYTE_INDX_SHIFT 0 +#define I40E_GL_PRS_FVBM_FV_BYTE_INDX_MASK I40E_MASK(0x7F, I40E_GL_PRS_FVBM_FV_BYTE_INDX_SHIFT) +#define I40E_GL_PRS_FVBM_RULE_BUS_INDX_SHIFT 8 +#define I40E_GL_PRS_FVBM_RULE_BUS_INDX_MASK I40E_MASK(0x3F, I40E_GL_PRS_FVBM_RULE_BUS_INDX_SHIFT) +#define I40E_GL_PRS_FVBM_MSK_ENA_SHIFT 31 +#define I40E_GL_PRS_FVBM_MSK_ENA_MASK I40E_MASK(0x1, I40E_GL_PRS_FVBM_MSK_ENA_SHIFT) +#define I40E_GLRPB_DPSS 0x000AC828 /* Reset: CORER */ +#define I40E_GLRPB_DPSS_DPS_TCN_SHIFT 0 +#define 
I40E_GLRPB_DPSS_DPS_TCN_MASK I40E_MASK(0xFFFFF, I40E_GLRPB_DPSS_DPS_TCN_SHIFT) +#define I40E_GLRPB_GHW 0x000AC830 /* Reset: CORER */ +#define I40E_GLRPB_GHW_GHW_SHIFT 0 +#define I40E_GLRPB_GHW_GHW_MASK I40E_MASK(0xFFFFF, I40E_GLRPB_GHW_GHW_SHIFT) +#define I40E_GLRPB_GLW 0x000AC834 /* Reset: CORER */ +#define I40E_GLRPB_GLW_GLW_SHIFT 0 +#define I40E_GLRPB_GLW_GLW_MASK I40E_MASK(0xFFFFF, I40E_GLRPB_GLW_GLW_SHIFT) +#define I40E_GLRPB_PHW 0x000AC844 /* Reset: CORER */ +#define I40E_GLRPB_PHW_PHW_SHIFT 0 +#define I40E_GLRPB_PHW_PHW_MASK I40E_MASK(0xFFFFF, I40E_GLRPB_PHW_PHW_SHIFT) +#define I40E_GLRPB_PLW 0x000AC848 /* Reset: CORER */ +#define I40E_GLRPB_PLW_PLW_SHIFT 0 +#define I40E_GLRPB_PLW_PLW_MASK I40E_MASK(0xFFFFF, I40E_GLRPB_PLW_PLW_SHIFT) +#define I40E_PRTRPB_DHW(_i) (0x000AC100 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTRPB_DHW_MAX_INDEX 7 +#define I40E_PRTRPB_DHW_DHW_TCN_SHIFT 0 +#define I40E_PRTRPB_DHW_DHW_TCN_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_DHW_DHW_TCN_SHIFT) +#define I40E_PRTRPB_DLW(_i) (0x000AC220 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTRPB_DLW_MAX_INDEX 7 +#define I40E_PRTRPB_DLW_DLW_TCN_SHIFT 0 +#define I40E_PRTRPB_DLW_DLW_TCN_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_DLW_DLW_TCN_SHIFT) +#define I40E_PRTRPB_DPS(_i) (0x000AC320 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTRPB_DPS_MAX_INDEX 7 +#define I40E_PRTRPB_DPS_DPS_TCN_SHIFT 0 +#define I40E_PRTRPB_DPS_DPS_TCN_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_DPS_DPS_TCN_SHIFT) +#define I40E_PRTRPB_SHT(_i) (0x000AC480 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTRPB_SHT_MAX_INDEX 7 +#define I40E_PRTRPB_SHT_SHT_TCN_SHIFT 0 +#define I40E_PRTRPB_SHT_SHT_TCN_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_SHT_SHT_TCN_SHIFT) +#define I40E_PRTRPB_SHW 0x000AC580 /* Reset: CORER */ +#define I40E_PRTRPB_SHW_SHW_SHIFT 0 +#define I40E_PRTRPB_SHW_SHW_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_SHW_SHW_SHIFT) +#define I40E_PRTRPB_SLT(_i) (0x000AC5A0 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTRPB_SLT_MAX_INDEX 7 +#define I40E_PRTRPB_SLT_SLT_TCN_SHIFT 0 +#define I40E_PRTRPB_SLT_SLT_TCN_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_SLT_SLT_TCN_SHIFT) +#define I40E_PRTRPB_SLW 0x000AC6A0 /* Reset: CORER */ +#define I40E_PRTRPB_SLW_SLW_SHIFT 0 +#define I40E_PRTRPB_SLW_SLW_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_SLW_SLW_SHIFT) +#define I40E_PRTRPB_SPS 0x000AC7C0 /* Reset: CORER */ +#define I40E_PRTRPB_SPS_SPS_SHIFT 0 +#define I40E_PRTRPB_SPS_SPS_MASK I40E_MASK(0xFFFFF, I40E_PRTRPB_SPS_SPS_SHIFT) +#define I40E_GLQF_CTL 0x00269BA4 /* Reset: CORER */ +#define I40E_GLQF_CTL_HTOEP_SHIFT 1 +#define I40E_GLQF_CTL_HTOEP_MASK I40E_MASK(0x1, I40E_GLQF_CTL_HTOEP_SHIFT) +#define I40E_GLQF_CTL_HTOEP_FCOE_SHIFT 2 +#define I40E_GLQF_CTL_HTOEP_FCOE_MASK I40E_MASK(0x1, I40E_GLQF_CTL_HTOEP_FCOE_SHIFT) +#define I40E_GLQF_CTL_PCNT_ALLOC_SHIFT 3 +#define I40E_GLQF_CTL_PCNT_ALLOC_MASK I40E_MASK(0x7, I40E_GLQF_CTL_PCNT_ALLOC_SHIFT) +#define I40E_GLQF_CTL_FD_AUTO_PCTYPE_SHIFT 6 +#define I40E_GLQF_CTL_FD_AUTO_PCTYPE_MASK I40E_MASK(0x1, I40E_GLQF_CTL_FD_AUTO_PCTYPE_SHIFT) +#define I40E_GLQF_CTL_RSVD_SHIFT 7 +#define I40E_GLQF_CTL_RSVD_MASK I40E_MASK(0x1, I40E_GLQF_CTL_RSVD_SHIFT) +#define I40E_GLQF_CTL_MAXPEBLEN_SHIFT 8 +#define I40E_GLQF_CTL_MAXPEBLEN_MASK I40E_MASK(0x7, I40E_GLQF_CTL_MAXPEBLEN_SHIFT) +#define I40E_GLQF_CTL_MAXFCBLEN_SHIFT 11 +#define I40E_GLQF_CTL_MAXFCBLEN_MASK I40E_MASK(0x7, I40E_GLQF_CTL_MAXFCBLEN_SHIFT) +#define I40E_GLQF_CTL_MAXFDBLEN_SHIFT 14 +#define I40E_GLQF_CTL_MAXFDBLEN_MASK 
I40E_MASK(0x7, I40E_GLQF_CTL_MAXFDBLEN_SHIFT) +#define I40E_GLQF_CTL_FDBEST_SHIFT 17 +#define I40E_GLQF_CTL_FDBEST_MASK I40E_MASK(0xFF, I40E_GLQF_CTL_FDBEST_SHIFT) +#define I40E_GLQF_CTL_PROGPRIO_SHIFT 25 +#define I40E_GLQF_CTL_PROGPRIO_MASK I40E_MASK(0x1, I40E_GLQF_CTL_PROGPRIO_SHIFT) +#define I40E_GLQF_CTL_INVALPRIO_SHIFT 26 +#define I40E_GLQF_CTL_INVALPRIO_MASK I40E_MASK(0x1, I40E_GLQF_CTL_INVALPRIO_SHIFT) +#define I40E_GLQF_CTL_IGNORE_IP_SHIFT 27 +#define I40E_GLQF_CTL_IGNORE_IP_MASK I40E_MASK(0x1, I40E_GLQF_CTL_IGNORE_IP_SHIFT) +#define I40E_GLQF_FDCNT_0 0x00269BAC /* Reset: CORER */ +#define I40E_GLQF_FDCNT_0_GUARANT_CNT_SHIFT 0 +#define I40E_GLQF_FDCNT_0_GUARANT_CNT_MASK I40E_MASK(0x1FFF, I40E_GLQF_FDCNT_0_GUARANT_CNT_SHIFT) +#define I40E_GLQF_FDCNT_0_BESTCNT_SHIFT 13 +#define I40E_GLQF_FDCNT_0_BESTCNT_MASK I40E_MASK(0x1FFF, I40E_GLQF_FDCNT_0_BESTCNT_SHIFT) +#define I40E_GLQF_HKEY(_i) (0x00270140 + ((_i) * 4)) /* _i=0...12 */ /* Reset: CORER */ +#define I40E_GLQF_HKEY_MAX_INDEX 12 +#define I40E_GLQF_HKEY_KEY_0_SHIFT 0 +#define I40E_GLQF_HKEY_KEY_0_MASK I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_0_SHIFT) +#define I40E_GLQF_HKEY_KEY_1_SHIFT 8 +#define I40E_GLQF_HKEY_KEY_1_MASK I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_1_SHIFT) +#define I40E_GLQF_HKEY_KEY_2_SHIFT 16 +#define I40E_GLQF_HKEY_KEY_2_MASK I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_2_SHIFT) +#define I40E_GLQF_HKEY_KEY_3_SHIFT 24 +#define I40E_GLQF_HKEY_KEY_3_MASK I40E_MASK(0xFF, I40E_GLQF_HKEY_KEY_3_SHIFT) +#define I40E_GLQF_HSYM(_i) (0x00269D00 + ((_i) * 4)) /* _i=0...63 */ /* Reset: CORER */ +#define I40E_GLQF_HSYM_MAX_INDEX 63 +#define I40E_GLQF_HSYM_SYMH_ENA_SHIFT 0 +#define I40E_GLQF_HSYM_SYMH_ENA_MASK I40E_MASK(0x1, I40E_GLQF_HSYM_SYMH_ENA_SHIFT) +#define I40E_GLQF_PCNT(_i) (0x00266800 + ((_i) * 4)) /* _i=0...511 */ /* Reset: CORER */ +#define I40E_GLQF_PCNT_MAX_INDEX 511 +#define I40E_GLQF_PCNT_PCNT_SHIFT 0 +#define I40E_GLQF_PCNT_PCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLQF_PCNT_PCNT_SHIFT) +#define I40E_GLQF_SWAP(_i, _j) (0x00267E00 + ((_i) * 4 + (_j) * 8)) /* _i=0...1, _j=0...63 */ /* Reset: CORER */ +#define I40E_GLQF_SWAP_MAX_INDEX 1 +#define I40E_GLQF_SWAP_OFF0_SRC0_SHIFT 0 +#define I40E_GLQF_SWAP_OFF0_SRC0_MASK I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF0_SRC0_SHIFT) +#define I40E_GLQF_SWAP_OFF0_SRC1_SHIFT 6 +#define I40E_GLQF_SWAP_OFF0_SRC1_MASK I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF0_SRC1_SHIFT) +#define I40E_GLQF_SWAP_FLEN0_SHIFT 12 +#define I40E_GLQF_SWAP_FLEN0_MASK I40E_MASK(0xF, I40E_GLQF_SWAP_FLEN0_SHIFT) +#define I40E_GLQF_SWAP_OFF1_SRC0_SHIFT 16 +#define I40E_GLQF_SWAP_OFF1_SRC0_MASK I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF1_SRC0_SHIFT) +#define I40E_GLQF_SWAP_OFF1_SRC1_SHIFT 22 +#define I40E_GLQF_SWAP_OFF1_SRC1_MASK I40E_MASK(0x3F, I40E_GLQF_SWAP_OFF1_SRC1_SHIFT) +#define I40E_GLQF_SWAP_FLEN1_SHIFT 28 +#define I40E_GLQF_SWAP_FLEN1_MASK I40E_MASK(0xF, I40E_GLQF_SWAP_FLEN1_SHIFT) +#define I40E_PFQF_CTL_0 0x001C0AC0 /* Reset: CORER */ +#define I40E_PFQF_CTL_0_PEHSIZE_SHIFT 0 +#define I40E_PFQF_CTL_0_PEHSIZE_MASK I40E_MASK(0x1F, I40E_PFQF_CTL_0_PEHSIZE_SHIFT) +#define I40E_PFQF_CTL_0_PEDSIZE_SHIFT 5 +#define I40E_PFQF_CTL_0_PEDSIZE_MASK I40E_MASK(0x1F, I40E_PFQF_CTL_0_PEDSIZE_SHIFT) +#define I40E_PFQF_CTL_0_PFFCHSIZE_SHIFT 10 +#define I40E_PFQF_CTL_0_PFFCHSIZE_MASK I40E_MASK(0xF, I40E_PFQF_CTL_0_PFFCHSIZE_SHIFT) +#define I40E_PFQF_CTL_0_PFFCDSIZE_SHIFT 14 +#define I40E_PFQF_CTL_0_PFFCDSIZE_MASK I40E_MASK(0x3, I40E_PFQF_CTL_0_PFFCDSIZE_SHIFT) +#define I40E_PFQF_CTL_0_HASHLUTSIZE_SHIFT 16 +#define I40E_PFQF_CTL_0_HASHLUTSIZE_MASK I40E_MASK(0x1, 
I40E_PFQF_CTL_0_HASHLUTSIZE_SHIFT) +#define I40E_PFQF_CTL_0_FD_ENA_SHIFT 17 +#define I40E_PFQF_CTL_0_FD_ENA_MASK I40E_MASK(0x1, I40E_PFQF_CTL_0_FD_ENA_SHIFT) +#define I40E_PFQF_CTL_0_ETYPE_ENA_SHIFT 18 +#define I40E_PFQF_CTL_0_ETYPE_ENA_MASK I40E_MASK(0x1, I40E_PFQF_CTL_0_ETYPE_ENA_SHIFT) +#define I40E_PFQF_CTL_0_MACVLAN_ENA_SHIFT 19 +#define I40E_PFQF_CTL_0_MACVLAN_ENA_MASK I40E_MASK(0x1, I40E_PFQF_CTL_0_MACVLAN_ENA_SHIFT) +#define I40E_PFQF_CTL_0_VFFCHSIZE_SHIFT 20 +#define I40E_PFQF_CTL_0_VFFCHSIZE_MASK I40E_MASK(0xF, I40E_PFQF_CTL_0_VFFCHSIZE_SHIFT) +#define I40E_PFQF_CTL_0_VFFCDSIZE_SHIFT 24 +#define I40E_PFQF_CTL_0_VFFCDSIZE_MASK I40E_MASK(0x3, I40E_PFQF_CTL_0_VFFCDSIZE_SHIFT) +#define I40E_PFQF_CTL_1 0x00245D80 /* Reset: CORER */ +#define I40E_PFQF_CTL_1_CLEARFDTABLE_SHIFT 0 +#define I40E_PFQF_CTL_1_CLEARFDTABLE_MASK I40E_MASK(0x1, I40E_PFQF_CTL_1_CLEARFDTABLE_SHIFT) +#define I40E_PFQF_FDALLOC 0x00246280 /* Reset: CORER */ +#define I40E_PFQF_FDALLOC_FDALLOC_SHIFT 0 +#define I40E_PFQF_FDALLOC_FDALLOC_MASK I40E_MASK(0xFF, I40E_PFQF_FDALLOC_FDALLOC_SHIFT) +#define I40E_PFQF_FDALLOC_FDBEST_SHIFT 8 +#define I40E_PFQF_FDALLOC_FDBEST_MASK I40E_MASK(0xFF, I40E_PFQF_FDALLOC_FDBEST_SHIFT) +#define I40E_PFQF_FDSTAT 0x00246380 /* Reset: CORER */ +#define I40E_PFQF_FDSTAT_GUARANT_CNT_SHIFT 0 +#define I40E_PFQF_FDSTAT_GUARANT_CNT_MASK I40E_MASK(0x1FFF, I40E_PFQF_FDSTAT_GUARANT_CNT_SHIFT) +#define I40E_PFQF_FDSTAT_BEST_CNT_SHIFT 16 +#define I40E_PFQF_FDSTAT_BEST_CNT_MASK I40E_MASK(0x1FFF, I40E_PFQF_FDSTAT_BEST_CNT_SHIFT) +#define I40E_PFQF_HENA(_i) (0x00245900 + ((_i) * 128)) /* _i=0...1 */ /* Reset: CORER */ +#define I40E_PFQF_HENA_MAX_INDEX 1 +#define I40E_PFQF_HENA_PTYPE_ENA_SHIFT 0 +#define I40E_PFQF_HENA_PTYPE_ENA_MASK I40E_MASK(0xFFFFFFFF, I40E_PFQF_HENA_PTYPE_ENA_SHIFT) +#define I40E_PFQF_HKEY(_i) (0x00244800 + ((_i) * 128)) /* _i=0...12 */ /* Reset: CORER */ +#define I40E_PFQF_HKEY_MAX_INDEX 12 +#define I40E_PFQF_HKEY_KEY_0_SHIFT 0 +#define I40E_PFQF_HKEY_KEY_0_MASK I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_0_SHIFT) +#define I40E_PFQF_HKEY_KEY_1_SHIFT 8 +#define I40E_PFQF_HKEY_KEY_1_MASK I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_1_SHIFT) +#define I40E_PFQF_HKEY_KEY_2_SHIFT 16 +#define I40E_PFQF_HKEY_KEY_2_MASK I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_2_SHIFT) +#define I40E_PFQF_HKEY_KEY_3_SHIFT 24 +#define I40E_PFQF_HKEY_KEY_3_MASK I40E_MASK(0xFF, I40E_PFQF_HKEY_KEY_3_SHIFT) +#define I40E_PFQF_HLUT(_i) (0x00240000 + ((_i) * 128)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_PFQF_HLUT_MAX_INDEX 127 +#define I40E_PFQF_HLUT_LUT0_SHIFT 0 +#define I40E_PFQF_HLUT_LUT0_MASK I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT0_SHIFT) +#define I40E_PFQF_HLUT_LUT1_SHIFT 8 +#define I40E_PFQF_HLUT_LUT1_MASK I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT1_SHIFT) +#define I40E_PFQF_HLUT_LUT2_SHIFT 16 +#define I40E_PFQF_HLUT_LUT2_MASK I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT2_SHIFT) +#define I40E_PFQF_HLUT_LUT3_SHIFT 24 +#define I40E_PFQF_HLUT_LUT3_MASK I40E_MASK(0x3F, I40E_PFQF_HLUT_LUT3_SHIFT) +#define I40E_PRTQF_CTL_0 0x00256E60 /* Reset: CORER */ +#define I40E_PRTQF_CTL_0_HSYM_ENA_SHIFT 0 +#define I40E_PRTQF_CTL_0_HSYM_ENA_MASK I40E_MASK(0x1, I40E_PRTQF_CTL_0_HSYM_ENA_SHIFT) +#define I40E_PRTQF_FD_FLXINSET(_i) (0x00253800 + ((_i) * 32)) /* _i=0...63 */ /* Reset: CORER */ +#define I40E_PRTQF_FD_FLXINSET_MAX_INDEX 63 +#define I40E_PRTQF_FD_FLXINSET_INSET_SHIFT 0 +#define I40E_PRTQF_FD_FLXINSET_INSET_MASK I40E_MASK(0xFF, I40E_PRTQF_FD_FLXINSET_INSET_SHIFT) +#define I40E_PRTQF_FD_INSET(_i, _j) (0x00250000 + ((_i) * 64 + (_j) * 32)) /* _i=0...63, 
_j=0...1 */ /* Reset: CORER */ +#define I40E_PRTQF_FD_INSET_MAX_INDEX 63 +#define I40E_PRTQF_FD_INSET_INSET_SHIFT 0 +#define I40E_PRTQF_FD_INSET_INSET_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTQF_FD_INSET_INSET_SHIFT) +#define I40E_PRTQF_FD_MSK(_i, _j) (0x00252000 + ((_i) * 64 + (_j) * 32)) /* _i=0...63, _j=0...1 */ /* Reset: CORER */ +#define I40E_PRTQF_FD_MSK_MAX_INDEX 63 +#define I40E_PRTQF_FD_MSK_MASK_SHIFT 0 +#define I40E_PRTQF_FD_MSK_MASK_MASK I40E_MASK(0xFFFF, I40E_PRTQF_FD_MSK_MASK_SHIFT) +#define I40E_PRTQF_FD_MSK_OFFSET_SHIFT 16 +#define I40E_PRTQF_FD_MSK_OFFSET_MASK I40E_MASK(0x3F, I40E_PRTQF_FD_MSK_OFFSET_SHIFT) +#define I40E_PRTQF_FLX_PIT(_i) (0x00255200 + ((_i) * 32)) /* _i=0...8 */ /* Reset: CORER */ +#define I40E_PRTQF_FLX_PIT_MAX_INDEX 8 +#define I40E_PRTQF_FLX_PIT_SOURCE_OFF_SHIFT 0 +#define I40E_PRTQF_FLX_PIT_SOURCE_OFF_MASK I40E_MASK(0x1F, I40E_PRTQF_FLX_PIT_SOURCE_OFF_SHIFT) +#define I40E_PRTQF_FLX_PIT_FSIZE_SHIFT 5 +#define I40E_PRTQF_FLX_PIT_FSIZE_MASK I40E_MASK(0x1F, I40E_PRTQF_FLX_PIT_FSIZE_SHIFT) +#define I40E_PRTQF_FLX_PIT_DEST_OFF_SHIFT 10 +#define I40E_PRTQF_FLX_PIT_DEST_OFF_MASK I40E_MASK(0x3F, I40E_PRTQF_FLX_PIT_DEST_OFF_SHIFT) +#define I40E_VFQF_HENA1(_i, _VF) (0x00230800 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...1, _VF=0...127 */ /* Reset: CORER */ +#define I40E_VFQF_HENA1_MAX_INDEX 1 +#define I40E_VFQF_HENA1_PTYPE_ENA_SHIFT 0 +#define I40E_VFQF_HENA1_PTYPE_ENA_MASK I40E_MASK(0xFFFFFFFF, I40E_VFQF_HENA1_PTYPE_ENA_SHIFT) +#define I40E_VFQF_HKEY1(_i, _VF) (0x00228000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...12, _VF=0...127 */ /* Reset: CORER */ +#define I40E_VFQF_HKEY1_MAX_INDEX 12 +#define I40E_VFQF_HKEY1_KEY_0_SHIFT 0 +#define I40E_VFQF_HKEY1_KEY_0_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_0_SHIFT) +#define I40E_VFQF_HKEY1_KEY_1_SHIFT 8 +#define I40E_VFQF_HKEY1_KEY_1_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_1_SHIFT) +#define I40E_VFQF_HKEY1_KEY_2_SHIFT 16 +#define I40E_VFQF_HKEY1_KEY_2_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_2_SHIFT) +#define I40E_VFQF_HKEY1_KEY_3_SHIFT 24 +#define I40E_VFQF_HKEY1_KEY_3_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY1_KEY_3_SHIFT) +#define I40E_VFQF_HLUT1(_i, _VF) (0x00220000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...15, _VF=0...127 */ /* Reset: CORER */ +#define I40E_VFQF_HLUT1_MAX_INDEX 15 +#define I40E_VFQF_HLUT1_LUT0_SHIFT 0 +#define I40E_VFQF_HLUT1_LUT0_MASK I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT0_SHIFT) +#define I40E_VFQF_HLUT1_LUT1_SHIFT 8 +#define I40E_VFQF_HLUT1_LUT1_MASK I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT1_SHIFT) +#define I40E_VFQF_HLUT1_LUT2_SHIFT 16 +#define I40E_VFQF_HLUT1_LUT2_MASK I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT2_SHIFT) +#define I40E_VFQF_HLUT1_LUT3_SHIFT 24 +#define I40E_VFQF_HLUT1_LUT3_MASK I40E_MASK(0xF, I40E_VFQF_HLUT1_LUT3_SHIFT) +#define I40E_VFQF_HREGION1(_i, _VF) (0x0022E000 + ((_i) * 1024 + (_VF) * 4)) /* _i=0...7, _VF=0...127 */ /* Reset: CORER */ +#define I40E_VFQF_HREGION1_MAX_INDEX 7 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_0_SHIFT 0 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_0_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_0_SHIFT) +#define I40E_VFQF_HREGION1_REGION_0_SHIFT 1 +#define I40E_VFQF_HREGION1_REGION_0_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_0_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_1_SHIFT 4 +#define
I40E_VFQF_HREGION1_OVERRIDE_ENA_1_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_1_SHIFT) +#define I40E_VFQF_HREGION1_REGION_1_SHIFT 5 +#define I40E_VFQF_HREGION1_REGION_1_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_1_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_2_SHIFT 8 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_2_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_2_SHIFT) +#define I40E_VFQF_HREGION1_REGION_2_SHIFT 9 +#define I40E_VFQF_HREGION1_REGION_2_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_2_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_3_SHIFT 12 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_3_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_3_SHIFT) +#define I40E_VFQF_HREGION1_REGION_3_SHIFT 13 +#define I40E_VFQF_HREGION1_REGION_3_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_3_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_4_SHIFT 16 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_4_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_4_SHIFT) +#define I40E_VFQF_HREGION1_REGION_4_SHIFT 17 +#define I40E_VFQF_HREGION1_REGION_4_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_4_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_5_SHIFT 20 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_5_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_5_SHIFT) +#define I40E_VFQF_HREGION1_REGION_5_SHIFT 21 +#define I40E_VFQF_HREGION1_REGION_5_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_5_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_6_SHIFT 24 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_6_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_6_SHIFT) +#define I40E_VFQF_HREGION1_REGION_6_SHIFT 25 +#define I40E_VFQF_HREGION1_REGION_6_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_6_SHIFT) +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_7_SHIFT 28 +#define I40E_VFQF_HREGION1_OVERRIDE_ENA_7_MASK I40E_MASK(0x1, I40E_VFQF_HREGION1_OVERRIDE_ENA_7_SHIFT) +#define I40E_VFQF_HREGION1_REGION_7_SHIFT 29 +#define I40E_VFQF_HREGION1_REGION_7_MASK I40E_MASK(0x7, I40E_VFQF_HREGION1_REGION_7_SHIFT) +#define I40E_VPQF_CTL(_VF) (0x001C0000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VPQF_CTL_MAX_INDEX 127 +#define I40E_VPQF_CTL_PEHSIZE_SHIFT 0 +#define I40E_VPQF_CTL_PEHSIZE_MASK I40E_MASK(0x1F, I40E_VPQF_CTL_PEHSIZE_SHIFT) +#define I40E_VPQF_CTL_PEDSIZE_SHIFT 5 +#define I40E_VPQF_CTL_PEDSIZE_MASK I40E_MASK(0x1F, I40E_VPQF_CTL_PEDSIZE_SHIFT) +#define I40E_VPQF_CTL_FCHSIZE_SHIFT 10 +#define I40E_VPQF_CTL_FCHSIZE_MASK I40E_MASK(0xF, I40E_VPQF_CTL_FCHSIZE_SHIFT) +#define I40E_VPQF_CTL_FCDSIZE_SHIFT 14 +#define I40E_VPQF_CTL_FCDSIZE_MASK I40E_MASK(0x3, I40E_VPQF_CTL_FCDSIZE_SHIFT) +#define I40E_VSIQF_CTL(_VSI) (0x0020D800 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: PFR */ +#define I40E_VSIQF_CTL_MAX_INDEX 383 +#define I40E_VSIQF_CTL_FCOE_ENA_SHIFT 0 +#define I40E_VSIQF_CTL_FCOE_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_FCOE_ENA_SHIFT) +#define I40E_VSIQF_CTL_PETCP_ENA_SHIFT 1 +#define I40E_VSIQF_CTL_PETCP_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_PETCP_ENA_SHIFT) +#define I40E_VSIQF_CTL_PEUUDP_ENA_SHIFT 2 +#define I40E_VSIQF_CTL_PEUUDP_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_PEUUDP_ENA_SHIFT) +#define I40E_VSIQF_CTL_PEMUDP_ENA_SHIFT 3 +#define I40E_VSIQF_CTL_PEMUDP_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_PEMUDP_ENA_SHIFT) +#define I40E_VSIQF_CTL_PEUFRAG_ENA_SHIFT 4 +#define I40E_VSIQF_CTL_PEUFRAG_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_PEUFRAG_ENA_SHIFT) +#define I40E_VSIQF_CTL_PEMFRAG_ENA_SHIFT 5 +#define I40E_VSIQF_CTL_PEMFRAG_ENA_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_PEMFRAG_ENA_SHIFT) +#define I40E_VSIQF_TCREGION(_i, _VSI) 
(0x00206000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...3, _VSI=0...383 */ /* Reset: PFR */ +#define I40E_VSIQF_TCREGION_MAX_INDEX 3 +#define I40E_VSIQF_TCREGION_TC_OFFSET_SHIFT 0 +#define I40E_VSIQF_TCREGION_TC_OFFSET_MASK I40E_MASK(0x1FF, I40E_VSIQF_TCREGION_TC_OFFSET_SHIFT) +#define I40E_VSIQF_TCREGION_TC_SIZE_SHIFT 9 +#define I40E_VSIQF_TCREGION_TC_SIZE_MASK I40E_MASK(0x7, I40E_VSIQF_TCREGION_TC_SIZE_SHIFT) +#define I40E_VSIQF_TCREGION_TC_OFFSET2_SHIFT 16 +#define I40E_VSIQF_TCREGION_TC_OFFSET2_MASK I40E_MASK(0x1FF, I40E_VSIQF_TCREGION_TC_OFFSET2_SHIFT) +#define I40E_VSIQF_TCREGION_TC_SIZE2_SHIFT 25 +#define I40E_VSIQF_TCREGION_TC_SIZE2_MASK I40E_MASK(0x7, I40E_VSIQF_TCREGION_TC_SIZE2_SHIFT) +#define I40E_GL_FCOECRC(_i) (0x00314d80 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOECRC_MAX_INDEX 143 +#define I40E_GL_FCOECRC_FCOECRC_SHIFT 0 +#define I40E_GL_FCOECRC_FCOECRC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOECRC_FCOECRC_SHIFT) +#define I40E_GL_FCOEDDPC(_i) (0x00314480 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDDPC_MAX_INDEX 143 +#define I40E_GL_FCOEDDPC_FCOEDDPC_SHIFT 0 +#define I40E_GL_FCOEDDPC_FCOEDDPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDDPC_FCOEDDPC_SHIFT) +#define I40E_GL_FCOEDIFEC(_i) (0x00318480 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDIFEC_MAX_INDEX 143 +#define I40E_GL_FCOEDIFEC_FCOEDIFRC_SHIFT 0 +#define I40E_GL_FCOEDIFEC_FCOEDIFRC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIFEC_FCOEDIFRC_SHIFT) +#define I40E_GL_FCOEDIFTCL(_i) (0x00354000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDIFTCL_MAX_INDEX 143 +#define I40E_GL_FCOEDIFTCL_FCOEDIFTC_SHIFT 0 +#define I40E_GL_FCOEDIFTCL_FCOEDIFTC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIFTCL_FCOEDIFTC_SHIFT) +#define I40E_GL_FCOEDIXEC(_i) (0x0034c000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDIXEC_MAX_INDEX 143 +#define I40E_GL_FCOEDIXEC_FCOEDIXEC_SHIFT 0 +#define I40E_GL_FCOEDIXEC_FCOEDIXEC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIXEC_FCOEDIXEC_SHIFT) +#define I40E_GL_FCOEDIXVC(_i) (0x00350000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDIXVC_MAX_INDEX 143 +#define I40E_GL_FCOEDIXVC_FCOEDIXVC_SHIFT 0 +#define I40E_GL_FCOEDIXVC_FCOEDIXVC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDIXVC_FCOEDIXVC_SHIFT) +#define I40E_GL_FCOEDWRCH(_i) (0x00320004 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDWRCH_MAX_INDEX 143 +#define I40E_GL_FCOEDWRCH_FCOEDWRCH_SHIFT 0 +#define I40E_GL_FCOEDWRCH_FCOEDWRCH_MASK I40E_MASK(0xFFFF, I40E_GL_FCOEDWRCH_FCOEDWRCH_SHIFT) +#define I40E_GL_FCOEDWRCL(_i) (0x00320000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDWRCL_MAX_INDEX 143 +#define I40E_GL_FCOEDWRCL_FCOEDWRCL_SHIFT 0 +#define I40E_GL_FCOEDWRCL_FCOEDWRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDWRCL_FCOEDWRCL_SHIFT) +#define I40E_GL_FCOEDWTCH(_i) (0x00348084 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDWTCH_MAX_INDEX 143 +#define I40E_GL_FCOEDWTCH_FCOEDWTCH_SHIFT 0 +#define I40E_GL_FCOEDWTCH_FCOEDWTCH_MASK I40E_MASK(0xFFFF, I40E_GL_FCOEDWTCH_FCOEDWTCH_SHIFT) +#define I40E_GL_FCOEDWTCL(_i) (0x00348080 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEDWTCL_MAX_INDEX 143 +#define I40E_GL_FCOEDWTCL_FCOEDWTCL_SHIFT 0 +#define I40E_GL_FCOEDWTCL_FCOEDWTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEDWTCL_FCOEDWTCL_SHIFT) +#define I40E_GL_FCOELAST(_i) (0x00314000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define 
I40E_GL_FCOELAST_MAX_INDEX 143 +#define I40E_GL_FCOELAST_FCOELAST_SHIFT 0 +#define I40E_GL_FCOELAST_FCOELAST_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOELAST_FCOELAST_SHIFT) +#define I40E_GL_FCOEPRC(_i) (0x00315200 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEPRC_MAX_INDEX 143 +#define I40E_GL_FCOEPRC_FCOEPRC_SHIFT 0 +#define I40E_GL_FCOEPRC_FCOEPRC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEPRC_FCOEPRC_SHIFT) +#define I40E_GL_FCOEPTC(_i) (0x00344C00 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOEPTC_MAX_INDEX 143 +#define I40E_GL_FCOEPTC_FCOEPTC_SHIFT 0 +#define I40E_GL_FCOEPTC_FCOEPTC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOEPTC_FCOEPTC_SHIFT) +#define I40E_GL_FCOERPDC(_i) (0x00324000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_FCOERPDC_MAX_INDEX 143 +#define I40E_GL_FCOERPDC_FCOERPDC_SHIFT 0 +#define I40E_GL_FCOERPDC_FCOERPDC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_FCOERPDC_FCOERPDC_SHIFT) +#define I40E_GL_RXERR1_L(_i) (0x00318000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_RXERR1_L_MAX_INDEX 143 +#define I40E_GL_RXERR1_L_FCOEDIFRC_SHIFT 0 +#define I40E_GL_RXERR1_L_FCOEDIFRC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_RXERR1_L_FCOEDIFRC_SHIFT) +#define I40E_GL_RXERR2_L(_i) (0x0031c000 + ((_i) * 8)) /* _i=0...143 */ /* Reset: CORER */ +#define I40E_GL_RXERR2_L_MAX_INDEX 143 +#define I40E_GL_RXERR2_L_FCOEDIXAC_SHIFT 0 +#define I40E_GL_RXERR2_L_FCOEDIXAC_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_RXERR2_L_FCOEDIXAC_SHIFT) +#define I40E_GLPRT_BPRCH(_i) (0x003005E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_BPRCH_MAX_INDEX 3 +#define I40E_GLPRT_BPRCH_BPRCH_SHIFT 0 +#define I40E_GLPRT_BPRCH_BPRCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_BPRCH_BPRCH_SHIFT) +#define I40E_GLPRT_BPRCL(_i) (0x003005E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_BPRCL_MAX_INDEX 3 +#define I40E_GLPRT_BPRCL_BPRCL_SHIFT 0 +#define I40E_GLPRT_BPRCL_BPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_BPRCL_BPRCL_SHIFT) +#define I40E_GLPRT_BPTCH(_i) (0x00300A04 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_BPTCH_MAX_INDEX 3 +#define I40E_GLPRT_BPTCH_BPTCH_SHIFT 0 +#define I40E_GLPRT_BPTCH_BPTCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_BPTCH_BPTCH_SHIFT) +#define I40E_GLPRT_BPTCL(_i) (0x00300A00 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_BPTCL_MAX_INDEX 3 +#define I40E_GLPRT_BPTCL_BPTCL_SHIFT 0 +#define I40E_GLPRT_BPTCL_BPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_BPTCL_BPTCL_SHIFT) +#define I40E_GLPRT_CRCERRS(_i) (0x00300080 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_CRCERRS_MAX_INDEX 3 +#define I40E_GLPRT_CRCERRS_CRCERRS_SHIFT 0 +#define I40E_GLPRT_CRCERRS_CRCERRS_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_CRCERRS_CRCERRS_SHIFT) +#define I40E_GLPRT_GORCH(_i) (0x00300004 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_GORCH_MAX_INDEX 3 +#define I40E_GLPRT_GORCH_GORCH_SHIFT 0 +#define I40E_GLPRT_GORCH_GORCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_GORCH_GORCH_SHIFT) +#define I40E_GLPRT_GORCL(_i) (0x00300000 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_GORCL_MAX_INDEX 3 +#define I40E_GLPRT_GORCL_GORCL_SHIFT 0 +#define I40E_GLPRT_GORCL_GORCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_GORCL_GORCL_SHIFT) +#define I40E_GLPRT_GOTCH(_i) (0x00300684 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_GOTCH_MAX_INDEX 3 +#define I40E_GLPRT_GOTCH_GOTCH_SHIFT 0 +#define I40E_GLPRT_GOTCH_GOTCH_MASK I40E_MASK(0xFFFF, 
I40E_GLPRT_GOTCH_GOTCH_SHIFT) +#define I40E_GLPRT_GOTCL(_i) (0x00300680 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_GOTCL_MAX_INDEX 3 +#define I40E_GLPRT_GOTCL_GOTCL_SHIFT 0 +#define I40E_GLPRT_GOTCL_GOTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_GOTCL_GOTCL_SHIFT) +#define I40E_GLPRT_ILLERRC(_i) (0x003000E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_ILLERRC_MAX_INDEX 3 +#define I40E_GLPRT_ILLERRC_ILLERRC_SHIFT 0 +#define I40E_GLPRT_ILLERRC_ILLERRC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_ILLERRC_ILLERRC_SHIFT) +#define I40E_GLPRT_LDPC(_i) (0x00300620 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_LDPC_MAX_INDEX 3 +#define I40E_GLPRT_LDPC_LDPC_SHIFT 0 +#define I40E_GLPRT_LDPC_LDPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LDPC_LDPC_SHIFT) +#define I40E_GLPRT_LXOFFRXC(_i) (0x00300160 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_LXOFFRXC_MAX_INDEX 3 +#define I40E_GLPRT_LXOFFRXC_LXOFFRXCNT_SHIFT 0 +#define I40E_GLPRT_LXOFFRXC_LXOFFRXCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXOFFRXC_LXOFFRXCNT_SHIFT) +#define I40E_GLPRT_LXOFFTXC(_i) (0x003009A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_LXOFFTXC_MAX_INDEX 3 +#define I40E_GLPRT_LXOFFTXC_LXOFFTXC_SHIFT 0 +#define I40E_GLPRT_LXOFFTXC_LXOFFTXC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXOFFTXC_LXOFFTXC_SHIFT) +#define I40E_GLPRT_LXONRXC(_i) (0x00300140 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_LXONRXC_MAX_INDEX 3 +#define I40E_GLPRT_LXONRXC_LXONRXCNT_SHIFT 0 +#define I40E_GLPRT_LXONRXC_LXONRXCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXONRXC_LXONRXCNT_SHIFT) +#define I40E_GLPRT_LXONTXC(_i) (0x00300980 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_LXONTXC_MAX_INDEX 3 +#define I40E_GLPRT_LXONTXC_LXONTXC_SHIFT 0 +#define I40E_GLPRT_LXONTXC_LXONTXC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_LXONTXC_LXONTXC_SHIFT) +#define I40E_GLPRT_MLFC(_i) (0x00300020 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MLFC_MAX_INDEX 3 +#define I40E_GLPRT_MLFC_MLFC_SHIFT 0 +#define I40E_GLPRT_MLFC_MLFC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MLFC_MLFC_SHIFT) +#define I40E_GLPRT_MPRCH(_i) (0x003005C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MPRCH_MAX_INDEX 3 +#define I40E_GLPRT_MPRCH_MPRCH_SHIFT 0 +#define I40E_GLPRT_MPRCH_MPRCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_MPRCH_MPRCH_SHIFT) +#define I40E_GLPRT_MPRCL(_i) (0x003005C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MPRCL_MAX_INDEX 3 +#define I40E_GLPRT_MPRCL_MPRCL_SHIFT 0 +#define I40E_GLPRT_MPRCL_MPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MPRCL_MPRCL_SHIFT) +#define I40E_GLPRT_MPTCH(_i) (0x003009E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MPTCH_MAX_INDEX 3 +#define I40E_GLPRT_MPTCH_MPTCH_SHIFT 0 +#define I40E_GLPRT_MPTCH_MPTCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_MPTCH_MPTCH_SHIFT) +#define I40E_GLPRT_MPTCL(_i) (0x003009E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MPTCL_MAX_INDEX 3 +#define I40E_GLPRT_MPTCL_MPTCL_SHIFT 0 +#define I40E_GLPRT_MPTCL_MPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MPTCL_MPTCL_SHIFT) +#define I40E_GLPRT_MRFC(_i) (0x00300040 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_MRFC_MAX_INDEX 3 +#define I40E_GLPRT_MRFC_MRFC_SHIFT 0 +#define I40E_GLPRT_MRFC_MRFC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_MRFC_MRFC_SHIFT) +#define I40E_GLPRT_PRC1023H(_i) (0x00300504 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define 
I40E_GLPRT_PRC1023H_MAX_INDEX 3 +#define I40E_GLPRT_PRC1023H_PRC1023H_SHIFT 0 +#define I40E_GLPRT_PRC1023H_PRC1023H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC1023H_PRC1023H_SHIFT) +#define I40E_GLPRT_PRC1023L(_i) (0x00300500 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC1023L_MAX_INDEX 3 +#define I40E_GLPRT_PRC1023L_PRC1023L_SHIFT 0 +#define I40E_GLPRT_PRC1023L_PRC1023L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC1023L_PRC1023L_SHIFT) +#define I40E_GLPRT_PRC127H(_i) (0x003004A4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC127H_MAX_INDEX 3 +#define I40E_GLPRT_PRC127H_PRC127H_SHIFT 0 +#define I40E_GLPRT_PRC127H_PRC127H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC127H_PRC127H_SHIFT) +#define I40E_GLPRT_PRC127L(_i) (0x003004A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC127L_MAX_INDEX 3 +#define I40E_GLPRT_PRC127L_PRC127L_SHIFT 0 +#define I40E_GLPRT_PRC127L_PRC127L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC127L_PRC127L_SHIFT) +#define I40E_GLPRT_PRC1522H(_i) (0x00300524 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC1522H_MAX_INDEX 3 +#define I40E_GLPRT_PRC1522H_PRC1522H_SHIFT 0 +#define I40E_GLPRT_PRC1522H_PRC1522H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC1522H_PRC1522H_SHIFT) +#define I40E_GLPRT_PRC1522L(_i) (0x00300520 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC1522L_MAX_INDEX 3 +#define I40E_GLPRT_PRC1522L_PRC1522L_SHIFT 0 +#define I40E_GLPRT_PRC1522L_PRC1522L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC1522L_PRC1522L_SHIFT) +#define I40E_GLPRT_PRC255H(_i) (0x003004C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC255H_MAX_INDEX 3 +#define I40E_GLPRT_PRC255H_PRTPRC255H_SHIFT 0 +#define I40E_GLPRT_PRC255H_PRTPRC255H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC255H_PRTPRC255H_SHIFT) +#define I40E_GLPRT_PRC255L(_i) (0x003004C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC255L_MAX_INDEX 3 +#define I40E_GLPRT_PRC255L_PRC255L_SHIFT 0 +#define I40E_GLPRT_PRC255L_PRC255L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC255L_PRC255L_SHIFT) +#define I40E_GLPRT_PRC511H(_i) (0x003004E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC511H_MAX_INDEX 3 +#define I40E_GLPRT_PRC511H_PRC511H_SHIFT 0 +#define I40E_GLPRT_PRC511H_PRC511H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC511H_PRC511H_SHIFT) +#define I40E_GLPRT_PRC511L(_i) (0x003004E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC511L_MAX_INDEX 3 +#define I40E_GLPRT_PRC511L_PRC511L_SHIFT 0 +#define I40E_GLPRT_PRC511L_PRC511L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC511L_PRC511L_SHIFT) +#define I40E_GLPRT_PRC64H(_i) (0x00300484 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC64H_MAX_INDEX 3 +#define I40E_GLPRT_PRC64H_PRC64H_SHIFT 0 +#define I40E_GLPRT_PRC64H_PRC64H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC64H_PRC64H_SHIFT) +#define I40E_GLPRT_PRC64L(_i) (0x00300480 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC64L_MAX_INDEX 3 +#define I40E_GLPRT_PRC64L_PRC64L_SHIFT 0 +#define I40E_GLPRT_PRC64L_PRC64L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC64L_PRC64L_SHIFT) +#define I40E_GLPRT_PRC9522H(_i) (0x00300544 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PRC9522H_MAX_INDEX 3 +#define I40E_GLPRT_PRC9522H_PRC1522H_SHIFT 0 +#define I40E_GLPRT_PRC9522H_PRC1522H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PRC9522H_PRC1522H_SHIFT) +#define I40E_GLPRT_PRC9522L(_i) (0x00300540 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define 
I40E_GLPRT_PRC9522L_MAX_INDEX 3 +#define I40E_GLPRT_PRC9522L_PRC1522L_SHIFT 0 +#define I40E_GLPRT_PRC9522L_PRC1522L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PRC9522L_PRC1522L_SHIFT) +#define I40E_GLPRT_PTC1023H(_i) (0x00300724 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC1023H_MAX_INDEX 3 +#define I40E_GLPRT_PTC1023H_PTC1023H_SHIFT 0 +#define I40E_GLPRT_PTC1023H_PTC1023H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC1023H_PTC1023H_SHIFT) +#define I40E_GLPRT_PTC1023L(_i) (0x00300720 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC1023L_MAX_INDEX 3 +#define I40E_GLPRT_PTC1023L_PTC1023L_SHIFT 0 +#define I40E_GLPRT_PTC1023L_PTC1023L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC1023L_PTC1023L_SHIFT) +#define I40E_GLPRT_PTC127H(_i) (0x003006C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC127H_MAX_INDEX 3 +#define I40E_GLPRT_PTC127H_PTC127H_SHIFT 0 +#define I40E_GLPRT_PTC127H_PTC127H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC127H_PTC127H_SHIFT) +#define I40E_GLPRT_PTC127L(_i) (0x003006C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC127L_MAX_INDEX 3 +#define I40E_GLPRT_PTC127L_PTC127L_SHIFT 0 +#define I40E_GLPRT_PTC127L_PTC127L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC127L_PTC127L_SHIFT) +#define I40E_GLPRT_PTC1522H(_i) (0x00300744 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC1522H_MAX_INDEX 3 +#define I40E_GLPRT_PTC1522H_PTC1522H_SHIFT 0 +#define I40E_GLPRT_PTC1522H_PTC1522H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC1522H_PTC1522H_SHIFT) +#define I40E_GLPRT_PTC1522L(_i) (0x00300740 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC1522L_MAX_INDEX 3 +#define I40E_GLPRT_PTC1522L_PTC1522L_SHIFT 0 +#define I40E_GLPRT_PTC1522L_PTC1522L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC1522L_PTC1522L_SHIFT) +#define I40E_GLPRT_PTC255H(_i) (0x003006E4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC255H_MAX_INDEX 3 +#define I40E_GLPRT_PTC255H_PTC255H_SHIFT 0 +#define I40E_GLPRT_PTC255H_PTC255H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC255H_PTC255H_SHIFT) +#define I40E_GLPRT_PTC255L(_i) (0x003006E0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC255L_MAX_INDEX 3 +#define I40E_GLPRT_PTC255L_PTC255L_SHIFT 0 +#define I40E_GLPRT_PTC255L_PTC255L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC255L_PTC255L_SHIFT) +#define I40E_GLPRT_PTC511H(_i) (0x00300704 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC511H_MAX_INDEX 3 +#define I40E_GLPRT_PTC511H_PTC511H_SHIFT 0 +#define I40E_GLPRT_PTC511H_PTC511H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC511H_PTC511H_SHIFT) +#define I40E_GLPRT_PTC511L(_i) (0x00300700 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC511L_MAX_INDEX 3 +#define I40E_GLPRT_PTC511L_PTC511L_SHIFT 0 +#define I40E_GLPRT_PTC511L_PTC511L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC511L_PTC511L_SHIFT) +#define I40E_GLPRT_PTC64H(_i) (0x003006A4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC64H_MAX_INDEX 3 +#define I40E_GLPRT_PTC64H_PTC64H_SHIFT 0 +#define I40E_GLPRT_PTC64H_PTC64H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC64H_PTC64H_SHIFT) +#define I40E_GLPRT_PTC64L(_i) (0x003006A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC64L_MAX_INDEX 3 +#define I40E_GLPRT_PTC64L_PTC64L_SHIFT 0 +#define I40E_GLPRT_PTC64L_PTC64L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC64L_PTC64L_SHIFT) +#define I40E_GLPRT_PTC9522H(_i) (0x00300764 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define 
I40E_GLPRT_PTC9522H_MAX_INDEX 3 +#define I40E_GLPRT_PTC9522H_PTC9522H_SHIFT 0 +#define I40E_GLPRT_PTC9522H_PTC9522H_MASK I40E_MASK(0xFFFF, I40E_GLPRT_PTC9522H_PTC9522H_SHIFT) +#define I40E_GLPRT_PTC9522L(_i) (0x00300760 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_PTC9522L_MAX_INDEX 3 +#define I40E_GLPRT_PTC9522L_PTC9522L_SHIFT 0 +#define I40E_GLPRT_PTC9522L_PTC9522L_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PTC9522L_PTC9522L_SHIFT) +#define I40E_GLPRT_PXOFFRXC(_i, _j) (0x00300280 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */ +#define I40E_GLPRT_PXOFFRXC_MAX_INDEX 3 +#define I40E_GLPRT_PXOFFRXC_PRPXOFFRXCNT_SHIFT 0 +#define I40E_GLPRT_PXOFFRXC_PRPXOFFRXCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXOFFRXC_PRPXOFFRXCNT_SHIFT) +#define I40E_GLPRT_PXOFFTXC(_i, _j) (0x00300880 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */ +#define I40E_GLPRT_PXOFFTXC_MAX_INDEX 3 +#define I40E_GLPRT_PXOFFTXC_PRPXOFFTXCNT_SHIFT 0 +#define I40E_GLPRT_PXOFFTXC_PRPXOFFTXCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXOFFTXC_PRPXOFFTXCNT_SHIFT) +#define I40E_GLPRT_PXONRXC(_i, _j) (0x00300180 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */ +#define I40E_GLPRT_PXONRXC_MAX_INDEX 3 +#define I40E_GLPRT_PXONRXC_PRPXONRXCNT_SHIFT 0 +#define I40E_GLPRT_PXONRXC_PRPXONRXCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXONRXC_PRPXONRXCNT_SHIFT) +#define I40E_GLPRT_PXONTXC(_i, _j) (0x00300780 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */ +#define I40E_GLPRT_PXONTXC_MAX_INDEX 3 +#define I40E_GLPRT_PXONTXC_PRPXONTXC_SHIFT 0 +#define I40E_GLPRT_PXONTXC_PRPXONTXC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_PXONTXC_PRPXONTXC_SHIFT) +#define I40E_GLPRT_RDPC(_i) (0x00300600 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RDPC_MAX_INDEX 3 +#define I40E_GLPRT_RDPC_RDPC_SHIFT 0 +#define I40E_GLPRT_RDPC_RDPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RDPC_RDPC_SHIFT) +#define I40E_GLPRT_RFC(_i) (0x00300560 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RFC_MAX_INDEX 3 +#define I40E_GLPRT_RFC_RFC_SHIFT 0 +#define I40E_GLPRT_RFC_RFC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RFC_RFC_SHIFT) +#define I40E_GLPRT_RJC(_i) (0x00300580 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RJC_MAX_INDEX 3 +#define I40E_GLPRT_RJC_RJC_SHIFT 0 +#define I40E_GLPRT_RJC_RJC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RJC_RJC_SHIFT) +#define I40E_GLPRT_RLEC(_i) (0x003000A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RLEC_MAX_INDEX 3 +#define I40E_GLPRT_RLEC_RLEC_SHIFT 0 +#define I40E_GLPRT_RLEC_RLEC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RLEC_RLEC_SHIFT) +#define I40E_GLPRT_ROC(_i) (0x00300120 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_ROC_MAX_INDEX 3 +#define I40E_GLPRT_ROC_ROC_SHIFT 0 +#define I40E_GLPRT_ROC_ROC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_ROC_ROC_SHIFT) +#define I40E_GLPRT_RUC(_i) (0x00300100 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RUC_MAX_INDEX 3 +#define I40E_GLPRT_RUC_RUC_SHIFT 0 +#define I40E_GLPRT_RUC_RUC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RUC_RUC_SHIFT) +#define I40E_GLPRT_RUPP(_i) (0x00300660 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_RUPP_MAX_INDEX 3 +#define I40E_GLPRT_RUPP_RUPP_SHIFT 0 +#define I40E_GLPRT_RUPP_RUPP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RUPP_RUPP_SHIFT) +#define I40E_GLPRT_RXON2OFFCNT(_i, _j) (0x00300380 + ((_i) * 8 + (_j) * 32)) /* _i=0...3, _j=0...7 */ /* Reset: CORER */ +#define 
I40E_GLPRT_RXON2OFFCNT_MAX_INDEX 3 +#define I40E_GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_SHIFT 0 +#define I40E_GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_RXON2OFFCNT_PRRXON2OFFCNT_SHIFT) +#define I40E_GLPRT_TDOLD(_i) (0x00300A20 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_TDOLD_MAX_INDEX 3 +#define I40E_GLPRT_TDOLD_GLPRT_TDOLD_SHIFT 0 +#define I40E_GLPRT_TDOLD_GLPRT_TDOLD_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_TDOLD_GLPRT_TDOLD_SHIFT) +#define I40E_GLPRT_UPRCH(_i) (0x003005A4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_UPRCH_MAX_INDEX 3 +#define I40E_GLPRT_UPRCH_UPRCH_SHIFT 0 +#define I40E_GLPRT_UPRCH_UPRCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_UPRCH_UPRCH_SHIFT) +#define I40E_GLPRT_UPRCL(_i) (0x003005A0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_UPRCL_MAX_INDEX 3 +#define I40E_GLPRT_UPRCL_UPRCL_SHIFT 0 +#define I40E_GLPRT_UPRCL_UPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_UPRCL_UPRCL_SHIFT) +#define I40E_GLPRT_UPTCH(_i) (0x003009C4 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_UPTCH_MAX_INDEX 3 +#define I40E_GLPRT_UPTCH_UPTCH_SHIFT 0 +#define I40E_GLPRT_UPTCH_UPTCH_MASK I40E_MASK(0xFFFF, I40E_GLPRT_UPTCH_UPTCH_SHIFT) +#define I40E_GLPRT_UPTCL(_i) (0x003009C0 + ((_i) * 8)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_GLPRT_UPTCL_MAX_INDEX 3 +#define I40E_GLPRT_UPTCL_VUPTCH_SHIFT 0 +#define I40E_GLPRT_UPTCL_VUPTCH_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPRT_UPTCL_VUPTCH_SHIFT) +#define I40E_GLSW_BPRCH(_i) (0x00370104 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_BPRCH_MAX_INDEX 15 +#define I40E_GLSW_BPRCH_BPRCH_SHIFT 0 +#define I40E_GLSW_BPRCH_BPRCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_BPRCH_BPRCH_SHIFT) +#define I40E_GLSW_BPRCL(_i) (0x00370100 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_BPRCL_MAX_INDEX 15 +#define I40E_GLSW_BPRCL_BPRCL_SHIFT 0 +#define I40E_GLSW_BPRCL_BPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_BPRCL_BPRCL_SHIFT) +#define I40E_GLSW_BPTCH(_i) (0x00340104 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_BPTCH_MAX_INDEX 15 +#define I40E_GLSW_BPTCH_BPTCH_SHIFT 0 +#define I40E_GLSW_BPTCH_BPTCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_BPTCH_BPTCH_SHIFT) +#define I40E_GLSW_BPTCL(_i) (0x00340100 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_BPTCL_MAX_INDEX 15 +#define I40E_GLSW_BPTCL_BPTCL_SHIFT 0 +#define I40E_GLSW_BPTCL_BPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_BPTCL_BPTCL_SHIFT) +#define I40E_GLSW_GORCH(_i) (0x0035C004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_GORCH_MAX_INDEX 15 +#define I40E_GLSW_GORCH_GORCH_SHIFT 0 +#define I40E_GLSW_GORCH_GORCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_GORCH_GORCH_SHIFT) +#define I40E_GLSW_GORCL(_i) (0x0035c000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_GORCL_MAX_INDEX 15 +#define I40E_GLSW_GORCL_GORCL_SHIFT 0 +#define I40E_GLSW_GORCL_GORCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_GORCL_GORCL_SHIFT) +#define I40E_GLSW_GOTCH(_i) (0x0032C004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_GOTCH_MAX_INDEX 15 +#define I40E_GLSW_GOTCH_GOTCH_SHIFT 0 +#define I40E_GLSW_GOTCH_GOTCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_GOTCH_GOTCH_SHIFT) +#define I40E_GLSW_GOTCL(_i) (0x0032c000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_GOTCL_MAX_INDEX 15 +#define I40E_GLSW_GOTCL_GOTCL_SHIFT 0 +#define I40E_GLSW_GOTCL_GOTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_GOTCL_GOTCL_SHIFT) +#define I40E_GLSW_MPRCH(_i) 
(0x00370084 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_MPRCH_MAX_INDEX 15 +#define I40E_GLSW_MPRCH_MPRCH_SHIFT 0 +#define I40E_GLSW_MPRCH_MPRCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_MPRCH_MPRCH_SHIFT) +#define I40E_GLSW_MPRCL(_i) (0x00370080 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_MPRCL_MAX_INDEX 15 +#define I40E_GLSW_MPRCL_MPRCL_SHIFT 0 +#define I40E_GLSW_MPRCL_MPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_MPRCL_MPRCL_SHIFT) +#define I40E_GLSW_MPTCH(_i) (0x00340084 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_MPTCH_MAX_INDEX 15 +#define I40E_GLSW_MPTCH_MPTCH_SHIFT 0 +#define I40E_GLSW_MPTCH_MPTCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_MPTCH_MPTCH_SHIFT) +#define I40E_GLSW_MPTCL(_i) (0x00340080 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_MPTCL_MAX_INDEX 15 +#define I40E_GLSW_MPTCL_MPTCL_SHIFT 0 +#define I40E_GLSW_MPTCL_MPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_MPTCL_MPTCL_SHIFT) +#define I40E_GLSW_RUPP(_i) (0x00370180 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_RUPP_MAX_INDEX 15 +#define I40E_GLSW_RUPP_RUPP_SHIFT 0 +#define I40E_GLSW_RUPP_RUPP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_RUPP_RUPP_SHIFT) +#define I40E_GLSW_TDPC(_i) (0x00348000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_TDPC_MAX_INDEX 15 +#define I40E_GLSW_TDPC_TDPC_SHIFT 0 +#define I40E_GLSW_TDPC_TDPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_TDPC_TDPC_SHIFT) +#define I40E_GLSW_UPRCH(_i) (0x00370004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_UPRCH_MAX_INDEX 15 +#define I40E_GLSW_UPRCH_UPRCH_SHIFT 0 +#define I40E_GLSW_UPRCH_UPRCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_UPRCH_UPRCH_SHIFT) +#define I40E_GLSW_UPRCL(_i) (0x00370000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_UPRCL_MAX_INDEX 15 +#define I40E_GLSW_UPRCL_UPRCL_SHIFT 0 +#define I40E_GLSW_UPRCL_UPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_UPRCL_UPRCL_SHIFT) +#define I40E_GLSW_UPTCH(_i) (0x00340004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_UPTCH_MAX_INDEX 15 +#define I40E_GLSW_UPTCH_UPTCH_SHIFT 0 +#define I40E_GLSW_UPTCH_UPTCH_MASK I40E_MASK(0xFFFF, I40E_GLSW_UPTCH_UPTCH_SHIFT) +#define I40E_GLSW_UPTCL(_i) (0x00340000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLSW_UPTCL_MAX_INDEX 15 +#define I40E_GLSW_UPTCL_UPTCL_SHIFT 0 +#define I40E_GLSW_UPTCL_UPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLSW_UPTCL_UPTCL_SHIFT) +#define I40E_GLV_BPRCH(_i) (0x0036D804 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_BPRCH_MAX_INDEX 383 +#define I40E_GLV_BPRCH_BPRCH_SHIFT 0 +#define I40E_GLV_BPRCH_BPRCH_MASK I40E_MASK(0xFFFF, I40E_GLV_BPRCH_BPRCH_SHIFT) +#define I40E_GLV_BPRCL(_i) (0x0036d800 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_BPRCL_MAX_INDEX 383 +#define I40E_GLV_BPRCL_BPRCL_SHIFT 0 +#define I40E_GLV_BPRCL_BPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_BPRCL_BPRCL_SHIFT) +#define I40E_GLV_BPTCH(_i) (0x0033D804 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_BPTCH_MAX_INDEX 383 +#define I40E_GLV_BPTCH_BPTCH_SHIFT 0 +#define I40E_GLV_BPTCH_BPTCH_MASK I40E_MASK(0xFFFF, I40E_GLV_BPTCH_BPTCH_SHIFT) +#define I40E_GLV_BPTCL(_i) (0x0033d800 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_BPTCL_MAX_INDEX 383 +#define I40E_GLV_BPTCL_BPTCL_SHIFT 0 +#define I40E_GLV_BPTCL_BPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_BPTCL_BPTCL_SHIFT) +#define I40E_GLV_GORCH(_i) (0x00358004 + ((_i) * 8)) /* _i=0...383 */ /* 
Reset: CORER */ +#define I40E_GLV_GORCH_MAX_INDEX 383 +#define I40E_GLV_GORCH_GORCH_SHIFT 0 +#define I40E_GLV_GORCH_GORCH_MASK I40E_MASK(0xFFFF, I40E_GLV_GORCH_GORCH_SHIFT) +#define I40E_GLV_GORCL(_i) (0x00358000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_GORCL_MAX_INDEX 383 +#define I40E_GLV_GORCL_GORCL_SHIFT 0 +#define I40E_GLV_GORCL_GORCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_GORCL_GORCL_SHIFT) +#define I40E_GLV_GOTCH(_i) (0x00328004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_GOTCH_MAX_INDEX 383 +#define I40E_GLV_GOTCH_GOTCH_SHIFT 0 +#define I40E_GLV_GOTCH_GOTCH_MASK I40E_MASK(0xFFFF, I40E_GLV_GOTCH_GOTCH_SHIFT) +#define I40E_GLV_GOTCL(_i) (0x00328000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_GOTCL_MAX_INDEX 383 +#define I40E_GLV_GOTCL_GOTCL_SHIFT 0 +#define I40E_GLV_GOTCL_GOTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_GOTCL_GOTCL_SHIFT) +#define I40E_GLV_MPRCH(_i) (0x0036CC04 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_MPRCH_MAX_INDEX 383 +#define I40E_GLV_MPRCH_MPRCH_SHIFT 0 +#define I40E_GLV_MPRCH_MPRCH_MASK I40E_MASK(0xFFFF, I40E_GLV_MPRCH_MPRCH_SHIFT) +#define I40E_GLV_MPRCL(_i) (0x0036cc00 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_MPRCL_MAX_INDEX 383 +#define I40E_GLV_MPRCL_MPRCL_SHIFT 0 +#define I40E_GLV_MPRCL_MPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_MPRCL_MPRCL_SHIFT) +#define I40E_GLV_MPTCH(_i) (0x0033CC04 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_MPTCH_MAX_INDEX 383 +#define I40E_GLV_MPTCH_MPTCH_SHIFT 0 +#define I40E_GLV_MPTCH_MPTCH_MASK I40E_MASK(0xFFFF, I40E_GLV_MPTCH_MPTCH_SHIFT) +#define I40E_GLV_MPTCL(_i) (0x0033cc00 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_MPTCL_MAX_INDEX 383 +#define I40E_GLV_MPTCL_MPTCL_SHIFT 0 +#define I40E_GLV_MPTCL_MPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_MPTCL_MPTCL_SHIFT) +#define I40E_GLV_RDPC(_i) (0x00310000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_RDPC_MAX_INDEX 383 +#define I40E_GLV_RDPC_RDPC_SHIFT 0 +#define I40E_GLV_RDPC_RDPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_RDPC_RDPC_SHIFT) +#define I40E_GLV_RUPP(_i) (0x0036E400 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_RUPP_MAX_INDEX 383 +#define I40E_GLV_RUPP_RUPP_SHIFT 0 +#define I40E_GLV_RUPP_RUPP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_RUPP_RUPP_SHIFT) +#define I40E_GLV_TEPC(_i) (0x00344000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_TEPC_MAX_INDEX 383 +#define I40E_GLV_TEPC_TEPC_SHIFT 0 +#define I40E_GLV_TEPC_TEPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_TEPC_TEPC_SHIFT) +#define I40E_GLV_UPRCH(_i) (0x0036C004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_UPRCH_MAX_INDEX 383 +#define I40E_GLV_UPRCH_UPRCH_SHIFT 0 +#define I40E_GLV_UPRCH_UPRCH_MASK I40E_MASK(0xFFFF, I40E_GLV_UPRCH_UPRCH_SHIFT) +#define I40E_GLV_UPRCL(_i) (0x0036c000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_UPRCL_MAX_INDEX 383 +#define I40E_GLV_UPRCL_UPRCL_SHIFT 0 +#define I40E_GLV_UPRCL_UPRCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_UPRCL_UPRCL_SHIFT) +#define I40E_GLV_UPTCH(_i) (0x0033C004 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_UPTCH_MAX_INDEX 383 +#define I40E_GLV_UPTCH_GLVUPTCH_SHIFT 0 +#define I40E_GLV_UPTCH_GLVUPTCH_MASK I40E_MASK(0xFFFF, I40E_GLV_UPTCH_GLVUPTCH_SHIFT) +#define I40E_GLV_UPTCL(_i) (0x0033c000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_UPTCL_MAX_INDEX 383 +#define 
I40E_GLV_UPTCL_UPTCL_SHIFT 0 +#define I40E_GLV_UPTCL_UPTCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_UPTCL_UPTCL_SHIFT) +#define I40E_GLVEBTC_RBCH(_i, _j) (0x00364004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_RBCH_MAX_INDEX 7 +#define I40E_GLVEBTC_RBCH_TCBCH_SHIFT 0 +#define I40E_GLVEBTC_RBCH_TCBCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBTC_RBCH_TCBCH_SHIFT) +#define I40E_GLVEBTC_RBCL(_i, _j) (0x00364000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_RBCL_MAX_INDEX 7 +#define I40E_GLVEBTC_RBCL_TCBCL_SHIFT 0 +#define I40E_GLVEBTC_RBCL_TCBCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_RBCL_TCBCL_SHIFT) +#define I40E_GLVEBTC_RPCH(_i, _j) (0x00368004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_RPCH_MAX_INDEX 7 +#define I40E_GLVEBTC_RPCH_TCPCH_SHIFT 0 +#define I40E_GLVEBTC_RPCH_TCPCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBTC_RPCH_TCPCH_SHIFT) +#define I40E_GLVEBTC_RPCL(_i, _j) (0x00368000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_RPCL_MAX_INDEX 7 +#define I40E_GLVEBTC_RPCL_TCPCL_SHIFT 0 +#define I40E_GLVEBTC_RPCL_TCPCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_RPCL_TCPCL_SHIFT) +#define I40E_GLVEBTC_TBCH(_i, _j) (0x00334004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_TBCH_MAX_INDEX 7 +#define I40E_GLVEBTC_TBCH_TCBCH_SHIFT 0 +#define I40E_GLVEBTC_TBCH_TCBCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBTC_TBCH_TCBCH_SHIFT) +#define I40E_GLVEBTC_TBCL(_i, _j) (0x00334000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_TBCL_MAX_INDEX 7 +#define I40E_GLVEBTC_TBCL_TCBCL_SHIFT 0 +#define I40E_GLVEBTC_TBCL_TCBCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_TBCL_TCBCL_SHIFT) +#define I40E_GLVEBTC_TPCH(_i, _j) (0x00338004 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_TPCH_MAX_INDEX 7 +#define I40E_GLVEBTC_TPCH_TCPCH_SHIFT 0 +#define I40E_GLVEBTC_TPCH_TCPCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBTC_TPCH_TCPCH_SHIFT) +#define I40E_GLVEBTC_TPCL(_i, _j) (0x00338000 + ((_i) * 8 + (_j) * 64)) /* _i=0...7, _j=0...15 */ /* Reset: CORER */ +#define I40E_GLVEBTC_TPCL_MAX_INDEX 7 +#define I40E_GLVEBTC_TPCL_TCPCL_SHIFT 0 +#define I40E_GLVEBTC_TPCL_TCPCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBTC_TPCL_TCPCL_SHIFT) +#define I40E_GLVEBVL_BPCH(_i) (0x00374804 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_BPCH_MAX_INDEX 127 +#define I40E_GLVEBVL_BPCH_VLBPCH_SHIFT 0 +#define I40E_GLVEBVL_BPCH_VLBPCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBVL_BPCH_VLBPCH_SHIFT) +#define I40E_GLVEBVL_BPCL(_i) (0x00374800 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_BPCL_MAX_INDEX 127 +#define I40E_GLVEBVL_BPCL_VLBPCL_SHIFT 0 +#define I40E_GLVEBVL_BPCL_VLBPCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_BPCL_VLBPCL_SHIFT) +#define I40E_GLVEBVL_GORCH(_i) (0x00360004 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_GORCH_MAX_INDEX 127 +#define I40E_GLVEBVL_GORCH_VLBCH_SHIFT 0 +#define I40E_GLVEBVL_GORCH_VLBCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBVL_GORCH_VLBCH_SHIFT) +#define I40E_GLVEBVL_GORCL(_i) (0x00360000 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_GORCL_MAX_INDEX 127 +#define I40E_GLVEBVL_GORCL_VLBCL_SHIFT 0 +#define I40E_GLVEBVL_GORCL_VLBCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_GORCL_VLBCL_SHIFT) +#define I40E_GLVEBVL_GOTCH(_i) (0x00330004 + ((_i) * 8)) /* _i=0...127 
*/ /* Reset: CORER */ +#define I40E_GLVEBVL_GOTCH_MAX_INDEX 127 +#define I40E_GLVEBVL_GOTCH_VLBCH_SHIFT 0 +#define I40E_GLVEBVL_GOTCH_VLBCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBVL_GOTCH_VLBCH_SHIFT) +#define I40E_GLVEBVL_GOTCL(_i) (0x00330000 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_GOTCL_MAX_INDEX 127 +#define I40E_GLVEBVL_GOTCL_VLBCL_SHIFT 0 +#define I40E_GLVEBVL_GOTCL_VLBCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_GOTCL_VLBCL_SHIFT) +#define I40E_GLVEBVL_MPCH(_i) (0x00374404 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_MPCH_MAX_INDEX 127 +#define I40E_GLVEBVL_MPCH_VLMPCH_SHIFT 0 +#define I40E_GLVEBVL_MPCH_VLMPCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBVL_MPCH_VLMPCH_SHIFT) +#define I40E_GLVEBVL_MPCL(_i) (0x00374400 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_MPCL_MAX_INDEX 127 +#define I40E_GLVEBVL_MPCL_VLMPCL_SHIFT 0 +#define I40E_GLVEBVL_MPCL_VLMPCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_MPCL_VLMPCL_SHIFT) +#define I40E_GLVEBVL_UPCH(_i) (0x00374004 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_UPCH_MAX_INDEX 127 +#define I40E_GLVEBVL_UPCH_VLUPCH_SHIFT 0 +#define I40E_GLVEBVL_UPCH_VLUPCH_MASK I40E_MASK(0xFFFF, I40E_GLVEBVL_UPCH_VLUPCH_SHIFT) +#define I40E_GLVEBVL_UPCL(_i) (0x00374000 + ((_i) * 8)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_GLVEBVL_UPCL_MAX_INDEX 127 +#define I40E_GLVEBVL_UPCL_VLUPCL_SHIFT 0 +#define I40E_GLVEBVL_UPCL_VLUPCL_MASK I40E_MASK(0xFFFFFFFF, I40E_GLVEBVL_UPCL_VLUPCL_SHIFT) +#define I40E_GL_MTG_FLU_MSK_H 0x00269F4C /* Reset: CORER */ +#define I40E_GL_MTG_FLU_MSK_H_MASK_HIGH_SHIFT 0 +#define I40E_GL_MTG_FLU_MSK_H_MASK_HIGH_MASK I40E_MASK(0xFFFF, I40E_GL_MTG_FLU_MSK_H_MASK_HIGH_SHIFT) +#define I40E_GL_SWR_DEF_ACT(_i) (0x00270200 + ((_i) * 4)) /* _i=0...35 */ /* Reset: CORER */ +#define I40E_GL_SWR_DEF_ACT_MAX_INDEX 35 +#define I40E_GL_SWR_DEF_ACT_DEF_ACTION_SHIFT 0 +#define I40E_GL_SWR_DEF_ACT_DEF_ACTION_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_SWR_DEF_ACT_DEF_ACTION_SHIFT) +#define I40E_GL_SWR_DEF_ACT_EN(_i) (0x0026CFB8 + ((_i) * 4)) /* _i=0...1 */ /* Reset: CORER */ +#define I40E_GL_SWR_DEF_ACT_EN_MAX_INDEX 1 +#define I40E_GL_SWR_DEF_ACT_EN_DEF_ACT_EN_BITMAP_SHIFT 0 +#define I40E_GL_SWR_DEF_ACT_EN_DEF_ACT_EN_BITMAP_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_SWR_DEF_ACT_EN_DEF_ACT_EN_BITMAP_SHIFT) +#define I40E_PRTTSYN_ADJ 0x001E4280 /* Reset: GLOBR */ +#define I40E_PRTTSYN_ADJ_TSYNADJ_SHIFT 0 +#define I40E_PRTTSYN_ADJ_TSYNADJ_MASK I40E_MASK(0x7FFFFFFF, I40E_PRTTSYN_ADJ_TSYNADJ_SHIFT) +#define I40E_PRTTSYN_ADJ_SIGN_SHIFT 31 +#define I40E_PRTTSYN_ADJ_SIGN_MASK I40E_MASK(0x1, I40E_PRTTSYN_ADJ_SIGN_SHIFT) +#define I40E_PRTTSYN_AUX_0(_i) (0x001E42A0 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_AUX_0_MAX_INDEX 1 +#define I40E_PRTTSYN_AUX_0_OUT_ENA_SHIFT 0 +#define I40E_PRTTSYN_AUX_0_OUT_ENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_AUX_0_OUT_ENA_SHIFT) +#define I40E_PRTTSYN_AUX_0_OUTMOD_SHIFT 1 +#define I40E_PRTTSYN_AUX_0_OUTMOD_MASK I40E_MASK(0x3, I40E_PRTTSYN_AUX_0_OUTMOD_SHIFT) +#define I40E_PRTTSYN_AUX_0_OUTLVL_SHIFT 3 +#define I40E_PRTTSYN_AUX_0_OUTLVL_MASK I40E_MASK(0x1, I40E_PRTTSYN_AUX_0_OUTLVL_SHIFT) +#define I40E_PRTTSYN_AUX_0_PULSEW_SHIFT 8 +#define I40E_PRTTSYN_AUX_0_PULSEW_MASK I40E_MASK(0xF, I40E_PRTTSYN_AUX_0_PULSEW_SHIFT) +#define I40E_PRTTSYN_AUX_0_EVNTLVL_SHIFT 16 +#define I40E_PRTTSYN_AUX_0_EVNTLVL_MASK I40E_MASK(0x3, I40E_PRTTSYN_AUX_0_EVNTLVL_SHIFT) +#define I40E_PRTTSYN_AUX_1(_i) (0x001E42E0 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR 
*/ +#define I40E_PRTTSYN_AUX_1_MAX_INDEX 1 +#define I40E_PRTTSYN_AUX_1_INSTNT_SHIFT 0 +#define I40E_PRTTSYN_AUX_1_INSTNT_MASK I40E_MASK(0x1, I40E_PRTTSYN_AUX_1_INSTNT_SHIFT) +#define I40E_PRTTSYN_AUX_1_SAMPLE_TIME_SHIFT 1 +#define I40E_PRTTSYN_AUX_1_SAMPLE_TIME_MASK I40E_MASK(0x1, I40E_PRTTSYN_AUX_1_SAMPLE_TIME_SHIFT) +#define I40E_PRTTSYN_CLKO(_i) (0x001E4240 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_CLKO_MAX_INDEX 1 +#define I40E_PRTTSYN_CLKO_TSYNCLKO_SHIFT 0 +#define I40E_PRTTSYN_CLKO_TSYNCLKO_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_CLKO_TSYNCLKO_SHIFT) +#define I40E_PRTTSYN_CTL0 0x001E4200 /* Reset: GLOBR */ +#define I40E_PRTTSYN_CTL0_CLEAR_TSYNTIMER_SHIFT 0 +#define I40E_PRTTSYN_CTL0_CLEAR_TSYNTIMER_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL0_CLEAR_TSYNTIMER_SHIFT) +#define I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_SHIFT 1 +#define I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_SHIFT) +#define I40E_PRTTSYN_CTL0_EVENT_INT_ENA_SHIFT 2 +#define I40E_PRTTSYN_CTL0_EVENT_INT_ENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL0_EVENT_INT_ENA_SHIFT) +#define I40E_PRTTSYN_CTL0_TGT_INT_ENA_SHIFT 3 +#define I40E_PRTTSYN_CTL0_TGT_INT_ENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL0_TGT_INT_ENA_SHIFT) +#define I40E_PRTTSYN_CTL0_PF_ID_SHIFT 8 +#define I40E_PRTTSYN_CTL0_PF_ID_MASK I40E_MASK(0xF, I40E_PRTTSYN_CTL0_PF_ID_SHIFT) +#define I40E_PRTTSYN_CTL0_TSYNACT_SHIFT 12 +#define I40E_PRTTSYN_CTL0_TSYNACT_MASK I40E_MASK(0x3, I40E_PRTTSYN_CTL0_TSYNACT_SHIFT) +#define I40E_PRTTSYN_CTL0_TSYNENA_SHIFT 31 +#define I40E_PRTTSYN_CTL0_TSYNENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL0_TSYNENA_SHIFT) +#define I40E_PRTTSYN_CTL1 0x00085020 /* Reset: CORER */ +#define I40E_PRTTSYN_CTL1_V1MESSTYPE0_SHIFT 0 +#define I40E_PRTTSYN_CTL1_V1MESSTYPE0_MASK I40E_MASK(0xFF, I40E_PRTTSYN_CTL1_V1MESSTYPE0_SHIFT) +#define I40E_PRTTSYN_CTL1_V1MESSTYPE1_SHIFT 8 +#define I40E_PRTTSYN_CTL1_V1MESSTYPE1_MASK I40E_MASK(0xFF, I40E_PRTTSYN_CTL1_V1MESSTYPE1_SHIFT) +#define I40E_PRTTSYN_CTL1_V2MESSTYPE0_SHIFT 16 +#define I40E_PRTTSYN_CTL1_V2MESSTYPE0_MASK I40E_MASK(0xF, I40E_PRTTSYN_CTL1_V2MESSTYPE0_SHIFT) +#define I40E_PRTTSYN_CTL1_V2MESSTYPE1_SHIFT 20 +#define I40E_PRTTSYN_CTL1_V2MESSTYPE1_MASK I40E_MASK(0xF, I40E_PRTTSYN_CTL1_V2MESSTYPE1_SHIFT) +#define I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT 24 +#define I40E_PRTTSYN_CTL1_TSYNTYPE_MASK I40E_MASK(0x3, I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT) +#define I40E_PRTTSYN_CTL1_UDP_ENA_SHIFT 26 +#define I40E_PRTTSYN_CTL1_UDP_ENA_MASK I40E_MASK(0x3, I40E_PRTTSYN_CTL1_UDP_ENA_SHIFT) +#define I40E_PRTTSYN_CTL1_TSYNENA_SHIFT 31 +#define I40E_PRTTSYN_CTL1_TSYNENA_MASK I40E_MASK(0x1, I40E_PRTTSYN_CTL1_TSYNENA_SHIFT) +#define I40E_PRTTSYN_EVNT_H(_i) (0x001E40C0 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_EVNT_H_MAX_INDEX 1 +#define I40E_PRTTSYN_EVNT_H_TSYNEVNT_H_SHIFT 0 +#define I40E_PRTTSYN_EVNT_H_TSYNEVNT_H_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_EVNT_H_TSYNEVNT_H_SHIFT) +#define I40E_PRTTSYN_EVNT_L(_i) (0x001E4080 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_EVNT_L_MAX_INDEX 1 +#define I40E_PRTTSYN_EVNT_L_TSYNEVNT_L_SHIFT 0 +#define I40E_PRTTSYN_EVNT_L_TSYNEVNT_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_EVNT_L_TSYNEVNT_L_SHIFT) +#define I40E_PRTTSYN_INC_H 0x001E4060 /* Reset: GLOBR */ +#define I40E_PRTTSYN_INC_H_TSYNINC_H_SHIFT 0 +#define I40E_PRTTSYN_INC_H_TSYNINC_H_MASK I40E_MASK(0x3F, I40E_PRTTSYN_INC_H_TSYNINC_H_SHIFT) +#define I40E_PRTTSYN_INC_L 0x001E4040 /* Reset: GLOBR */ +#define I40E_PRTTSYN_INC_L_TSYNINC_L_SHIFT 0 
+#define I40E_PRTTSYN_INC_L_TSYNINC_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_INC_L_TSYNINC_L_SHIFT) +#define I40E_PRTTSYN_RXTIME_H(_i) (0x00085040 + ((_i) * 32)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_PRTTSYN_RXTIME_H_MAX_INDEX 3 +#define I40E_PRTTSYN_RXTIME_H_RXTIEM_H_SHIFT 0 +#define I40E_PRTTSYN_RXTIME_H_RXTIEM_H_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_RXTIME_H_RXTIEM_H_SHIFT) +#define I40E_PRTTSYN_RXTIME_L(_i) (0x000850C0 + ((_i) * 32)) /* _i=0...3 */ /* Reset: CORER */ +#define I40E_PRTTSYN_RXTIME_L_MAX_INDEX 3 +#define I40E_PRTTSYN_RXTIME_L_RXTIEM_L_SHIFT 0 +#define I40E_PRTTSYN_RXTIME_L_RXTIEM_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_RXTIME_L_RXTIEM_L_SHIFT) +#define I40E_PRTTSYN_STAT_0 0x001E4220 /* Reset: GLOBR */ +#define I40E_PRTTSYN_STAT_0_EVENT0_SHIFT 0 +#define I40E_PRTTSYN_STAT_0_EVENT0_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_EVENT0_SHIFT) +#define I40E_PRTTSYN_STAT_0_EVENT1_SHIFT 1 +#define I40E_PRTTSYN_STAT_0_EVENT1_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_EVENT1_SHIFT) +#define I40E_PRTTSYN_STAT_0_TGT0_SHIFT 2 +#define I40E_PRTTSYN_STAT_0_TGT0_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_TGT0_SHIFT) +#define I40E_PRTTSYN_STAT_0_TGT1_SHIFT 3 +#define I40E_PRTTSYN_STAT_0_TGT1_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_TGT1_SHIFT) +#define I40E_PRTTSYN_STAT_0_TXTIME_SHIFT 4 +#define I40E_PRTTSYN_STAT_0_TXTIME_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_0_TXTIME_SHIFT) +#define I40E_PRTTSYN_STAT_1 0x00085140 /* Reset: CORER */ +#define I40E_PRTTSYN_STAT_1_RXT0_SHIFT 0 +#define I40E_PRTTSYN_STAT_1_RXT0_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT0_SHIFT) +#define I40E_PRTTSYN_STAT_1_RXT1_SHIFT 1 +#define I40E_PRTTSYN_STAT_1_RXT1_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT1_SHIFT) +#define I40E_PRTTSYN_STAT_1_RXT2_SHIFT 2 +#define I40E_PRTTSYN_STAT_1_RXT2_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT2_SHIFT) +#define I40E_PRTTSYN_STAT_1_RXT3_SHIFT 3 +#define I40E_PRTTSYN_STAT_1_RXT3_MASK I40E_MASK(0x1, I40E_PRTTSYN_STAT_1_RXT3_SHIFT) +#define I40E_PRTTSYN_TGT_H(_i) (0x001E4180 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_TGT_H_MAX_INDEX 1 +#define I40E_PRTTSYN_TGT_H_TSYNTGTT_H_SHIFT 0 +#define I40E_PRTTSYN_TGT_H_TSYNTGTT_H_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TGT_H_TSYNTGTT_H_SHIFT) +#define I40E_PRTTSYN_TGT_L(_i) (0x001E4140 + ((_i) * 32)) /* _i=0...1 */ /* Reset: GLOBR */ +#define I40E_PRTTSYN_TGT_L_MAX_INDEX 1 +#define I40E_PRTTSYN_TGT_L_TSYNTGTT_L_SHIFT 0 +#define I40E_PRTTSYN_TGT_L_TSYNTGTT_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TGT_L_TSYNTGTT_L_SHIFT) +#define I40E_PRTTSYN_TIME_H 0x001E4120 /* Reset: GLOBR */ +#define I40E_PRTTSYN_TIME_H_TSYNTIME_H_SHIFT 0 +#define I40E_PRTTSYN_TIME_H_TSYNTIME_H_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TIME_H_TSYNTIME_H_SHIFT) +#define I40E_PRTTSYN_TIME_L 0x001E4100 /* Reset: GLOBR */ +#define I40E_PRTTSYN_TIME_L_TSYNTIME_L_SHIFT 0 +#define I40E_PRTTSYN_TIME_L_TSYNTIME_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TIME_L_TSYNTIME_L_SHIFT) +#define I40E_PRTTSYN_TXTIME_H 0x001E41E0 /* Reset: GLOBR */ +#define I40E_PRTTSYN_TXTIME_H_TXTIEM_H_SHIFT 0 +#define I40E_PRTTSYN_TXTIME_H_TXTIEM_H_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TXTIME_H_TXTIEM_H_SHIFT) +#define I40E_PRTTSYN_TXTIME_L 0x001E41C0 /* Reset: GLOBR */ +#define I40E_PRTTSYN_TXTIME_L_TXTIEM_L_SHIFT 0 +#define I40E_PRTTSYN_TXTIME_L_TXTIEM_L_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTTSYN_TXTIME_L_TXTIEM_L_SHIFT) +#define I40E_GL_MDET_RX 0x0012A510 /* Reset: CORER */ +#define I40E_GL_MDET_RX_FUNCTION_SHIFT 0 +#define I40E_GL_MDET_RX_FUNCTION_MASK I40E_MASK(0xFF, 
I40E_GL_MDET_RX_FUNCTION_SHIFT) +#define I40E_GL_MDET_RX_EVENT_SHIFT 8 +#define I40E_GL_MDET_RX_EVENT_MASK I40E_MASK(0x1FF, I40E_GL_MDET_RX_EVENT_SHIFT) +#define I40E_GL_MDET_RX_QUEUE_SHIFT 17 +#define I40E_GL_MDET_RX_QUEUE_MASK I40E_MASK(0x3FFF, I40E_GL_MDET_RX_QUEUE_SHIFT) +#define I40E_GL_MDET_RX_VALID_SHIFT 31 +#define I40E_GL_MDET_RX_VALID_MASK I40E_MASK(0x1, I40E_GL_MDET_RX_VALID_SHIFT) +#define I40E_GL_MDET_TX 0x000E6480 /* Reset: CORER */ +#define I40E_GL_MDET_TX_QUEUE_SHIFT 0 +#define I40E_GL_MDET_TX_QUEUE_MASK I40E_MASK(0xFFF, I40E_GL_MDET_TX_QUEUE_SHIFT) +#define I40E_GL_MDET_TX_VF_NUM_SHIFT 12 +#define I40E_GL_MDET_TX_VF_NUM_MASK I40E_MASK(0x1FF, I40E_GL_MDET_TX_VF_NUM_SHIFT) +#define I40E_GL_MDET_TX_PF_NUM_SHIFT 21 +#define I40E_GL_MDET_TX_PF_NUM_MASK I40E_MASK(0xF, I40E_GL_MDET_TX_PF_NUM_SHIFT) +#define I40E_GL_MDET_TX_EVENT_SHIFT 25 +#define I40E_GL_MDET_TX_EVENT_MASK I40E_MASK(0x1F, I40E_GL_MDET_TX_EVENT_SHIFT) +#define I40E_GL_MDET_TX_VALID_SHIFT 31 +#define I40E_GL_MDET_TX_VALID_MASK I40E_MASK(0x1, I40E_GL_MDET_TX_VALID_SHIFT) +#define I40E_PF_MDET_RX 0x0012A400 /* Reset: CORER */ +#define I40E_PF_MDET_RX_VALID_SHIFT 0 +#define I40E_PF_MDET_RX_VALID_MASK I40E_MASK(0x1, I40E_PF_MDET_RX_VALID_SHIFT) +#define I40E_PF_MDET_TX 0x000E6400 /* Reset: CORER */ +#define I40E_PF_MDET_TX_VALID_SHIFT 0 +#define I40E_PF_MDET_TX_VALID_MASK I40E_MASK(0x1, I40E_PF_MDET_TX_VALID_SHIFT) +#define I40E_PF_VT_PFALLOC 0x001C0500 /* Reset: CORER */ +#define I40E_PF_VT_PFALLOC_FIRSTVF_SHIFT 0 +#define I40E_PF_VT_PFALLOC_FIRSTVF_MASK I40E_MASK(0xFF, I40E_PF_VT_PFALLOC_FIRSTVF_SHIFT) +#define I40E_PF_VT_PFALLOC_LASTVF_SHIFT 8 +#define I40E_PF_VT_PFALLOC_LASTVF_MASK I40E_MASK(0xFF, I40E_PF_VT_PFALLOC_LASTVF_SHIFT) +#define I40E_PF_VT_PFALLOC_VALID_SHIFT 31 +#define I40E_PF_VT_PFALLOC_VALID_MASK I40E_MASK(0x1, I40E_PF_VT_PFALLOC_VALID_SHIFT) +#define I40E_VP_MDET_RX(_VF) (0x0012A000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VP_MDET_RX_MAX_INDEX 127 +#define I40E_VP_MDET_RX_VALID_SHIFT 0 +#define I40E_VP_MDET_RX_VALID_MASK I40E_MASK(0x1, I40E_VP_MDET_RX_VALID_SHIFT) +#define I40E_VP_MDET_TX(_VF) (0x000E6000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_VP_MDET_TX_MAX_INDEX 127 +#define I40E_VP_MDET_TX_VALID_SHIFT 0 +#define I40E_VP_MDET_TX_VALID_MASK I40E_MASK(0x1, I40E_VP_MDET_TX_VALID_SHIFT) +#define I40E_GLPM_WUMC 0x0006C800 /* Reset: POR */ +#define I40E_GLPM_WUMC_NOTCO_SHIFT 0 +#define I40E_GLPM_WUMC_NOTCO_MASK I40E_MASK(0x1, I40E_GLPM_WUMC_NOTCO_SHIFT) +#define I40E_GLPM_WUMC_SRST_PIN_VAL_SHIFT 1 +#define I40E_GLPM_WUMC_SRST_PIN_VAL_MASK I40E_MASK(0x1, I40E_GLPM_WUMC_SRST_PIN_VAL_SHIFT) +#define I40E_GLPM_WUMC_ROL_MODE_SHIFT 2 +#define I40E_GLPM_WUMC_ROL_MODE_MASK I40E_MASK(0x1, I40E_GLPM_WUMC_ROL_MODE_SHIFT) +#define I40E_GLPM_WUMC_RESERVED_4_SHIFT 3 +#define I40E_GLPM_WUMC_RESERVED_4_MASK I40E_MASK(0x1FFF, I40E_GLPM_WUMC_RESERVED_4_SHIFT) +#define I40E_GLPM_WUMC_MNG_WU_PF_SHIFT 16 +#define I40E_GLPM_WUMC_MNG_WU_PF_MASK I40E_MASK(0xFFFF, I40E_GLPM_WUMC_MNG_WU_PF_SHIFT) +#define I40E_PFPM_APM 0x000B8080 /* Reset: POR */ +#define I40E_PFPM_APM_APME_SHIFT 0 +#define I40E_PFPM_APM_APME_MASK I40E_MASK(0x1, I40E_PFPM_APM_APME_SHIFT) +#define I40E_PFPM_FHFT_LENGTH(_i) (0x0006A000 + ((_i) * 128)) /* _i=0...7 */ /* Reset: POR */ +#define I40E_PFPM_FHFT_LENGTH_MAX_INDEX 7 +#define I40E_PFPM_FHFT_LENGTH_LENGTH_SHIFT 0 +#define I40E_PFPM_FHFT_LENGTH_LENGTH_MASK I40E_MASK(0xFF, I40E_PFPM_FHFT_LENGTH_LENGTH_SHIFT) +#define I40E_PFPM_WUC 0x0006B200 /* Reset: POR */ 
+#define I40E_PFPM_WUC_EN_APM_D0_SHIFT 5 +#define I40E_PFPM_WUC_EN_APM_D0_MASK I40E_MASK(0x1, I40E_PFPM_WUC_EN_APM_D0_SHIFT) +#define I40E_PFPM_WUFC 0x0006B400 /* Reset: POR */ +#define I40E_PFPM_WUFC_LNKC_SHIFT 0 +#define I40E_PFPM_WUFC_LNKC_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_LNKC_SHIFT) +#define I40E_PFPM_WUFC_MAG_SHIFT 1 +#define I40E_PFPM_WUFC_MAG_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_MAG_SHIFT) +#define I40E_PFPM_WUFC_MNG_SHIFT 3 +#define I40E_PFPM_WUFC_MNG_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_MNG_SHIFT) +#define I40E_PFPM_WUFC_FLX0_ACT_SHIFT 4 +#define I40E_PFPM_WUFC_FLX0_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX0_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX1_ACT_SHIFT 5 +#define I40E_PFPM_WUFC_FLX1_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX1_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX2_ACT_SHIFT 6 +#define I40E_PFPM_WUFC_FLX2_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX2_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX3_ACT_SHIFT 7 +#define I40E_PFPM_WUFC_FLX3_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX3_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX4_ACT_SHIFT 8 +#define I40E_PFPM_WUFC_FLX4_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX4_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX5_ACT_SHIFT 9 +#define I40E_PFPM_WUFC_FLX5_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX5_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX6_ACT_SHIFT 10 +#define I40E_PFPM_WUFC_FLX6_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX6_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX7_ACT_SHIFT 11 +#define I40E_PFPM_WUFC_FLX7_ACT_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX7_ACT_SHIFT) +#define I40E_PFPM_WUFC_FLX0_SHIFT 16 +#define I40E_PFPM_WUFC_FLX0_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX0_SHIFT) +#define I40E_PFPM_WUFC_FLX1_SHIFT 17 +#define I40E_PFPM_WUFC_FLX1_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX1_SHIFT) +#define I40E_PFPM_WUFC_FLX2_SHIFT 18 +#define I40E_PFPM_WUFC_FLX2_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX2_SHIFT) +#define I40E_PFPM_WUFC_FLX3_SHIFT 19 +#define I40E_PFPM_WUFC_FLX3_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX3_SHIFT) +#define I40E_PFPM_WUFC_FLX4_SHIFT 20 +#define I40E_PFPM_WUFC_FLX4_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX4_SHIFT) +#define I40E_PFPM_WUFC_FLX5_SHIFT 21 +#define I40E_PFPM_WUFC_FLX5_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX5_SHIFT) +#define I40E_PFPM_WUFC_FLX6_SHIFT 22 +#define I40E_PFPM_WUFC_FLX6_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX6_SHIFT) +#define I40E_PFPM_WUFC_FLX7_SHIFT 23 +#define I40E_PFPM_WUFC_FLX7_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FLX7_SHIFT) +#define I40E_PFPM_WUFC_FW_RST_WK_SHIFT 31 +#define I40E_PFPM_WUFC_FW_RST_WK_MASK I40E_MASK(0x1, I40E_PFPM_WUFC_FW_RST_WK_SHIFT) +#define I40E_PFPM_WUS 0x0006B600 /* Reset: POR */ +#define I40E_PFPM_WUS_LNKC_SHIFT 0 +#define I40E_PFPM_WUS_LNKC_MASK I40E_MASK(0x1, I40E_PFPM_WUS_LNKC_SHIFT) +#define I40E_PFPM_WUS_MAG_SHIFT 1 +#define I40E_PFPM_WUS_MAG_MASK I40E_MASK(0x1, I40E_PFPM_WUS_MAG_SHIFT) +#define I40E_PFPM_WUS_PME_STATUS_SHIFT 2 +#define I40E_PFPM_WUS_PME_STATUS_MASK I40E_MASK(0x1, I40E_PFPM_WUS_PME_STATUS_SHIFT) +#define I40E_PFPM_WUS_MNG_SHIFT 3 +#define I40E_PFPM_WUS_MNG_MASK I40E_MASK(0x1, I40E_PFPM_WUS_MNG_SHIFT) +#define I40E_PFPM_WUS_FLX0_SHIFT 16 +#define I40E_PFPM_WUS_FLX0_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX0_SHIFT) +#define I40E_PFPM_WUS_FLX1_SHIFT 17 +#define I40E_PFPM_WUS_FLX1_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX1_SHIFT) +#define I40E_PFPM_WUS_FLX2_SHIFT 18 +#define I40E_PFPM_WUS_FLX2_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX2_SHIFT) +#define I40E_PFPM_WUS_FLX3_SHIFT 19 +#define I40E_PFPM_WUS_FLX3_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX3_SHIFT) +#define I40E_PFPM_WUS_FLX4_SHIFT 20 +#define 
I40E_PFPM_WUS_FLX4_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX4_SHIFT) +#define I40E_PFPM_WUS_FLX5_SHIFT 21 +#define I40E_PFPM_WUS_FLX5_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX5_SHIFT) +#define I40E_PFPM_WUS_FLX6_SHIFT 22 +#define I40E_PFPM_WUS_FLX6_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX6_SHIFT) +#define I40E_PFPM_WUS_FLX7_SHIFT 23 +#define I40E_PFPM_WUS_FLX7_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FLX7_SHIFT) +#define I40E_PFPM_WUS_FW_RST_WK_SHIFT 31 +#define I40E_PFPM_WUS_FW_RST_WK_MASK I40E_MASK(0x1, I40E_PFPM_WUS_FW_RST_WK_SHIFT) +#define I40E_PRTPM_FHFHR 0x0006C000 /* Reset: POR */ +#define I40E_PRTPM_FHFHR_UNICAST_SHIFT 0 +#define I40E_PRTPM_FHFHR_UNICAST_MASK I40E_MASK(0x1, I40E_PRTPM_FHFHR_UNICAST_SHIFT) +#define I40E_PRTPM_FHFHR_MULTICAST_SHIFT 1 +#define I40E_PRTPM_FHFHR_MULTICAST_MASK I40E_MASK(0x1, I40E_PRTPM_FHFHR_MULTICAST_SHIFT) +#define I40E_PRTPM_SAH(_i) (0x001E44C0 + ((_i) * 32)) /* _i=0...3 */ /* Reset: PFR */ +#define I40E_PRTPM_SAH_MAX_INDEX 3 +#define I40E_PRTPM_SAH_PFPM_SAH_SHIFT 0 +#define I40E_PRTPM_SAH_PFPM_SAH_MASK I40E_MASK(0xFFFF, I40E_PRTPM_SAH_PFPM_SAH_SHIFT) +#define I40E_PRTPM_SAH_PF_NUM_SHIFT 26 +#define I40E_PRTPM_SAH_PF_NUM_MASK I40E_MASK(0xF, I40E_PRTPM_SAH_PF_NUM_SHIFT) +#define I40E_PRTPM_SAH_MC_MAG_EN_SHIFT 30 +#define I40E_PRTPM_SAH_MC_MAG_EN_MASK I40E_MASK(0x1, I40E_PRTPM_SAH_MC_MAG_EN_SHIFT) +#define I40E_PRTPM_SAH_AV_SHIFT 31 +#define I40E_PRTPM_SAH_AV_MASK I40E_MASK(0x1, I40E_PRTPM_SAH_AV_SHIFT) +#define I40E_PRTPM_SAL(_i) (0x001E4440 + ((_i) * 32)) /* _i=0...3 */ /* Reset: PFR */ +#define I40E_PRTPM_SAL_MAX_INDEX 3 +#define I40E_PRTPM_SAL_PFPM_SAL_SHIFT 0 +#define I40E_PRTPM_SAL_PFPM_SAL_MASK I40E_MASK(0xFFFFFFFF, I40E_PRTPM_SAL_PFPM_SAL_SHIFT) +#define I40E_VF_ARQBAH1 0x00006000 /* Reset: EMPR */ +#define I40E_VF_ARQBAH1_ARQBAH_SHIFT 0 +#define I40E_VF_ARQBAH1_ARQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAH1_ARQBAH_SHIFT) +#define I40E_VF_ARQBAL1 0x00006C00 /* Reset: EMPR */ +#define I40E_VF_ARQBAL1_ARQBAL_SHIFT 0 +#define I40E_VF_ARQBAL1_ARQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ARQBAL1_ARQBAL_SHIFT) +#define I40E_VF_ARQH1 0x00007400 /* Reset: EMPR */ +#define I40E_VF_ARQH1_ARQH_SHIFT 0 +#define I40E_VF_ARQH1_ARQH_MASK I40E_MASK(0x3FF, I40E_VF_ARQH1_ARQH_SHIFT) +#define I40E_VF_ARQLEN1 0x00008000 /* Reset: EMPR */ +#define I40E_VF_ARQLEN1_ARQLEN_SHIFT 0 +#define I40E_VF_ARQLEN1_ARQLEN_MASK I40E_MASK(0x3FF, I40E_VF_ARQLEN1_ARQLEN_SHIFT) +#define I40E_VF_ARQLEN1_ARQVFE_SHIFT 28 +#define I40E_VF_ARQLEN1_ARQVFE_MASK I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQVFE_SHIFT) +#define I40E_VF_ARQLEN1_ARQOVFL_SHIFT 29 +#define I40E_VF_ARQLEN1_ARQOVFL_MASK I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQOVFL_SHIFT) +#define I40E_VF_ARQLEN1_ARQCRIT_SHIFT 30 +#define I40E_VF_ARQLEN1_ARQCRIT_MASK I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQCRIT_SHIFT) +#define I40E_VF_ARQLEN1_ARQENABLE_SHIFT 31 +#define I40E_VF_ARQLEN1_ARQENABLE_MASK I40E_MASK(0x1, I40E_VF_ARQLEN1_ARQENABLE_SHIFT) +#define I40E_VF_ARQT1 0x00007000 /* Reset: EMPR */ +#define I40E_VF_ARQT1_ARQT_SHIFT 0 +#define I40E_VF_ARQT1_ARQT_MASK I40E_MASK(0x3FF, I40E_VF_ARQT1_ARQT_SHIFT) +#define I40E_VF_ATQBAH1 0x00007800 /* Reset: EMPR */ +#define I40E_VF_ATQBAH1_ATQBAH_SHIFT 0 +#define I40E_VF_ATQBAH1_ATQBAH_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAH1_ATQBAH_SHIFT) +#define I40E_VF_ATQBAL1 0x00007C00 /* Reset: EMPR */ +#define I40E_VF_ATQBAL1_ATQBAL_SHIFT 0 +#define I40E_VF_ATQBAL1_ATQBAL_MASK I40E_MASK(0xFFFFFFFF, I40E_VF_ATQBAL1_ATQBAL_SHIFT) +#define I40E_VF_ATQH1 0x00006400 /* Reset: EMPR */ +#define I40E_VF_ATQH1_ATQH_SHIFT 0 +#define 
I40E_VF_ATQH1_ATQH_MASK I40E_MASK(0x3FF, I40E_VF_ATQH1_ATQH_SHIFT) +#define I40E_VF_ATQLEN1 0x00006800 /* Reset: EMPR */ +#define I40E_VF_ATQLEN1_ATQLEN_SHIFT 0 +#define I40E_VF_ATQLEN1_ATQLEN_MASK I40E_MASK(0x3FF, I40E_VF_ATQLEN1_ATQLEN_SHIFT) +#define I40E_VF_ATQLEN1_ATQVFE_SHIFT 28 +#define I40E_VF_ATQLEN1_ATQVFE_MASK I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQVFE_SHIFT) +#define I40E_VF_ATQLEN1_ATQOVFL_SHIFT 29 +#define I40E_VF_ATQLEN1_ATQOVFL_MASK I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQOVFL_SHIFT) +#define I40E_VF_ATQLEN1_ATQCRIT_SHIFT 30 +#define I40E_VF_ATQLEN1_ATQCRIT_MASK I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQCRIT_SHIFT) +#define I40E_VF_ATQLEN1_ATQENABLE_SHIFT 31 +#define I40E_VF_ATQLEN1_ATQENABLE_MASK I40E_MASK(0x1, I40E_VF_ATQLEN1_ATQENABLE_SHIFT) +#define I40E_VF_ATQT1 0x00008400 /* Reset: EMPR */ +#define I40E_VF_ATQT1_ATQT_SHIFT 0 +#define I40E_VF_ATQT1_ATQT_MASK I40E_MASK(0x3FF, I40E_VF_ATQT1_ATQT_SHIFT) +#define I40E_VFGEN_RSTAT 0x00008800 /* Reset: VFR */ +#define I40E_VFGEN_RSTAT_VFR_STATE_SHIFT 0 +#define I40E_VFGEN_RSTAT_VFR_STATE_MASK I40E_MASK(0x3, I40E_VFGEN_RSTAT_VFR_STATE_SHIFT) +#define I40E_VFINT_DYN_CTL01 0x00005C00 /* Reset: VFR */ +#define I40E_VFINT_DYN_CTL01_INTENA_SHIFT 0 +#define I40E_VFINT_DYN_CTL01_INTENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_INTENA_SHIFT) +#define I40E_VFINT_DYN_CTL01_CLEARPBA_SHIFT 1 +#define I40E_VFINT_DYN_CTL01_CLEARPBA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_CLEARPBA_SHIFT) +#define I40E_VFINT_DYN_CTL01_SWINT_TRIG_SHIFT 2 +#define I40E_VFINT_DYN_CTL01_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_SWINT_TRIG_SHIFT) +#define I40E_VFINT_DYN_CTL01_ITR_INDX_SHIFT 3 +#define I40E_VFINT_DYN_CTL01_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTL01_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTL01_INTERVAL_SHIFT 5 +#define I40E_VFINT_DYN_CTL01_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_DYN_CTL01_INTERVAL_SHIFT) +#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_SW_ITR_INDX_ENA_SHIFT) +#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_SHIFT 25 +#define I40E_VFINT_DYN_CTL01_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTL01_SW_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTL01_INTENA_MSK_SHIFT 31 +#define I40E_VFINT_DYN_CTL01_INTENA_MSK_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_INTENA_MSK_SHIFT) +#define I40E_VFINT_DYN_CTLN1(_INTVF) (0x00003800 + ((_INTVF) * 4)) /* _i=0...15 */ /* Reset: VFR */ +#define I40E_VFINT_DYN_CTLN1_MAX_INDEX 15 +#define I40E_VFINT_DYN_CTLN1_INTENA_SHIFT 0 +#define I40E_VFINT_DYN_CTLN1_INTENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_INTENA_SHIFT) +#define I40E_VFINT_DYN_CTLN1_CLEARPBA_SHIFT 1 +#define I40E_VFINT_DYN_CTLN1_CLEARPBA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_CLEARPBA_SHIFT) +#define I40E_VFINT_DYN_CTLN1_SWINT_TRIG_SHIFT 2 +#define I40E_VFINT_DYN_CTLN1_SWINT_TRIG_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_SWINT_TRIG_SHIFT) +#define I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT 3 +#define I40E_VFINT_DYN_CTLN1_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT) +#define I40E_VFINT_DYN_CTLN1_INTERVAL_SHIFT 5 +#define I40E_VFINT_DYN_CTLN1_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_DYN_CTLN1_INTERVAL_SHIFT) +#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_SHIFT 24 +#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_SHIFT) +#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_SHIFT 25 +#define I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_DYN_CTLN1_SW_ITR_INDX_SHIFT) +#define 
I40E_VFINT_DYN_CTLN1_INTENA_MSK_SHIFT 31 +#define I40E_VFINT_DYN_CTLN1_INTENA_MSK_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_INTENA_MSK_SHIFT) +#define I40E_VFINT_ICR0_ENA1 0x00005000 /* Reset: CORER */ +#define I40E_VFINT_ICR0_ENA1_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_VFINT_ICR0_ENA1_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA1_LINK_STAT_CHANGE_SHIFT) +#define I40E_VFINT_ICR0_ENA1_ADMINQ_SHIFT 30 +#define I40E_VFINT_ICR0_ENA1_ADMINQ_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA1_ADMINQ_SHIFT) +#define I40E_VFINT_ICR0_ENA1_RSVD_SHIFT 31 +#define I40E_VFINT_ICR0_ENA1_RSVD_MASK I40E_MASK(0x1, I40E_VFINT_ICR0_ENA1_RSVD_SHIFT) +#define I40E_VFINT_ICR01 0x00004800 /* Reset: CORER */ +#define I40E_VFINT_ICR01_INTEVENT_SHIFT 0 +#define I40E_VFINT_ICR01_INTEVENT_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_INTEVENT_SHIFT) +#define I40E_VFINT_ICR01_QUEUE_0_SHIFT 1 +#define I40E_VFINT_ICR01_QUEUE_0_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_0_SHIFT) +#define I40E_VFINT_ICR01_QUEUE_1_SHIFT 2 +#define I40E_VFINT_ICR01_QUEUE_1_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_1_SHIFT) +#define I40E_VFINT_ICR01_QUEUE_2_SHIFT 3 +#define I40E_VFINT_ICR01_QUEUE_2_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_2_SHIFT) +#define I40E_VFINT_ICR01_QUEUE_3_SHIFT 4 +#define I40E_VFINT_ICR01_QUEUE_3_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_QUEUE_3_SHIFT) +#define I40E_VFINT_ICR01_LINK_STAT_CHANGE_SHIFT 25 +#define I40E_VFINT_ICR01_LINK_STAT_CHANGE_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_LINK_STAT_CHANGE_SHIFT) +#define I40E_VFINT_ICR01_ADMINQ_SHIFT 30 +#define I40E_VFINT_ICR01_ADMINQ_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_ADMINQ_SHIFT) +#define I40E_VFINT_ICR01_SWINT_SHIFT 31 +#define I40E_VFINT_ICR01_SWINT_MASK I40E_MASK(0x1, I40E_VFINT_ICR01_SWINT_SHIFT) +#define I40E_VFINT_ITR01(_i) (0x00004C00 + ((_i) * 4)) /* _i=0...2 */ /* Reset: VFR */ +#define I40E_VFINT_ITR01_MAX_INDEX 2 +#define I40E_VFINT_ITR01_INTERVAL_SHIFT 0 +#define I40E_VFINT_ITR01_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_ITR01_INTERVAL_SHIFT) +#define I40E_VFINT_ITRN1(_i, _INTVF) (0x00002800 + ((_i) * 64 + (_INTVF) * 4)) /* _i=0...2, _INTVF=0...15 */ /* Reset: VFR */ +#define I40E_VFINT_ITRN1_MAX_INDEX 2 +#define I40E_VFINT_ITRN1_INTERVAL_SHIFT 0 +#define I40E_VFINT_ITRN1_INTERVAL_MASK I40E_MASK(0xFFF, I40E_VFINT_ITRN1_INTERVAL_SHIFT) +#define I40E_VFINT_STAT_CTL01 0x00005400 /* Reset: CORER */ +#define I40E_VFINT_STAT_CTL01_OTHER_ITR_INDX_SHIFT 2 +#define I40E_VFINT_STAT_CTL01_OTHER_ITR_INDX_MASK I40E_MASK(0x3, I40E_VFINT_STAT_CTL01_OTHER_ITR_INDX_SHIFT) +#define I40E_QRX_TAIL1(_Q) (0x00002000 + ((_Q) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_QRX_TAIL1_MAX_INDEX 15 +#define I40E_QRX_TAIL1_TAIL_SHIFT 0 +#define I40E_QRX_TAIL1_TAIL_MASK I40E_MASK(0x1FFF, I40E_QRX_TAIL1_TAIL_SHIFT) +#define I40E_QTX_TAIL1(_Q) (0x00000000 + ((_Q) * 4)) /* _i=0...15 */ /* Reset: PFR */ +#define I40E_QTX_TAIL1_MAX_INDEX 15 +#define I40E_QTX_TAIL1_TAIL_SHIFT 0 +#define I40E_QTX_TAIL1_TAIL_MASK I40E_MASK(0x1FFF, I40E_QTX_TAIL1_TAIL_SHIFT) +#define I40E_VFMSIX_PBA 0x00002000 /* Reset: VFLR */ +#define I40E_VFMSIX_PBA_PENBIT_SHIFT 0 +#define I40E_VFMSIX_PBA_PENBIT_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_PBA_PENBIT_SHIFT) +#define I40E_VFMSIX_TADD(_i) (0x00000000 + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TADD_MAX_INDEX 16 +#define I40E_VFMSIX_TADD_MSIXTADD10_SHIFT 0 +#define I40E_VFMSIX_TADD_MSIXTADD10_MASK I40E_MASK(0x3, I40E_VFMSIX_TADD_MSIXTADD10_SHIFT) +#define I40E_VFMSIX_TADD_MSIXTADD_SHIFT 2 +#define I40E_VFMSIX_TADD_MSIXTADD_MASK 
I40E_MASK(0x3FFFFFFF, I40E_VFMSIX_TADD_MSIXTADD_SHIFT) +#define I40E_VFMSIX_TMSG(_i) (0x00000008 + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TMSG_MAX_INDEX 16 +#define I40E_VFMSIX_TMSG_MSIXTMSG_SHIFT 0 +#define I40E_VFMSIX_TMSG_MSIXTMSG_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TMSG_MSIXTMSG_SHIFT) +#define I40E_VFMSIX_TUADD(_i) (0x00000004 + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TUADD_MAX_INDEX 16 +#define I40E_VFMSIX_TUADD_MSIXTUADD_SHIFT 0 +#define I40E_VFMSIX_TUADD_MSIXTUADD_MASK I40E_MASK(0xFFFFFFFF, I40E_VFMSIX_TUADD_MSIXTUADD_SHIFT) +#define I40E_VFMSIX_TVCTRL(_i) (0x0000000C + ((_i) * 16)) /* _i=0...16 */ /* Reset: VFLR */ +#define I40E_VFMSIX_TVCTRL_MAX_INDEX 16 +#define I40E_VFMSIX_TVCTRL_MASK_SHIFT 0 +#define I40E_VFMSIX_TVCTRL_MASK_MASK I40E_MASK(0x1, I40E_VFMSIX_TVCTRL_MASK_SHIFT) +#define I40E_VFCM_PE_ERRDATA 0x0000DC00 /* Reset: VFR */ +#define I40E_VFCM_PE_ERRDATA_ERROR_CODE_SHIFT 0 +#define I40E_VFCM_PE_ERRDATA_ERROR_CODE_MASK I40E_MASK(0xF, I40E_VFCM_PE_ERRDATA_ERROR_CODE_SHIFT) +#define I40E_VFCM_PE_ERRDATA_Q_TYPE_SHIFT 4 +#define I40E_VFCM_PE_ERRDATA_Q_TYPE_MASK I40E_MASK(0x7, I40E_VFCM_PE_ERRDATA_Q_TYPE_SHIFT) +#define I40E_VFCM_PE_ERRDATA_Q_NUM_SHIFT 8 +#define I40E_VFCM_PE_ERRDATA_Q_NUM_MASK I40E_MASK(0x3FFFF, I40E_VFCM_PE_ERRDATA_Q_NUM_SHIFT) +#define I40E_VFCM_PE_ERRINFO 0x0000D800 /* Reset: VFR */ +#define I40E_VFCM_PE_ERRINFO_ERROR_VALID_SHIFT 0 +#define I40E_VFCM_PE_ERRINFO_ERROR_VALID_MASK I40E_MASK(0x1, I40E_VFCM_PE_ERRINFO_ERROR_VALID_SHIFT) +#define I40E_VFCM_PE_ERRINFO_ERROR_INST_SHIFT 4 +#define I40E_VFCM_PE_ERRINFO_ERROR_INST_MASK I40E_MASK(0x7, I40E_VFCM_PE_ERRINFO_ERROR_INST_SHIFT) +#define I40E_VFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT 8 +#define I40E_VFCM_PE_ERRINFO_DBL_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT) +#define I40E_VFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT 16 +#define I40E_VFCM_PE_ERRINFO_RLU_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT) +#define I40E_VFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT 24 +#define I40E_VFCM_PE_ERRINFO_RLS_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_VFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT) +#define I40E_VFQF_HENA(_i) (0x0000C400 + ((_i) * 4)) /* _i=0...1 */ /* Reset: CORER */ +#define I40E_VFQF_HENA_MAX_INDEX 1 +#define I40E_VFQF_HENA_PTYPE_ENA_SHIFT 0 +#define I40E_VFQF_HENA_PTYPE_ENA_MASK I40E_MASK(0xFFFFFFFF, I40E_VFQF_HENA_PTYPE_ENA_SHIFT) +#define I40E_VFQF_HKEY(_i) (0x0000CC00 + ((_i) * 4)) /* _i=0...12 */ /* Reset: CORER */ +#define I40E_VFQF_HKEY_MAX_INDEX 12 +#define I40E_VFQF_HKEY_KEY_0_SHIFT 0 +#define I40E_VFQF_HKEY_KEY_0_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_0_SHIFT) +#define I40E_VFQF_HKEY_KEY_1_SHIFT 8 +#define I40E_VFQF_HKEY_KEY_1_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_1_SHIFT) +#define I40E_VFQF_HKEY_KEY_2_SHIFT 16 +#define I40E_VFQF_HKEY_KEY_2_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_2_SHIFT) +#define I40E_VFQF_HKEY_KEY_3_SHIFT 24 +#define I40E_VFQF_HKEY_KEY_3_MASK I40E_MASK(0xFF, I40E_VFQF_HKEY_KEY_3_SHIFT) +#define I40E_VFQF_HLUT(_i) (0x0000D000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_VFQF_HLUT_MAX_INDEX 15 +#define I40E_VFQF_HLUT_LUT0_SHIFT 0 +#define I40E_VFQF_HLUT_LUT0_MASK I40E_MASK(0xF, I40E_VFQF_HLUT_LUT0_SHIFT) +#define I40E_VFQF_HLUT_LUT1_SHIFT 8 +#define I40E_VFQF_HLUT_LUT1_MASK I40E_MASK(0xF, I40E_VFQF_HLUT_LUT1_SHIFT) +#define I40E_VFQF_HLUT_LUT2_SHIFT 16 +#define I40E_VFQF_HLUT_LUT2_MASK I40E_MASK(0xF, I40E_VFQF_HLUT_LUT2_SHIFT) +#define I40E_VFQF_HLUT_LUT3_SHIFT 24 +#define 
I40E_VFQF_HLUT_LUT3_MASK I40E_MASK(0xF, I40E_VFQF_HLUT_LUT3_SHIFT) +#define I40E_VFQF_HREGION(_i) (0x0000D400 + ((_i) * 4)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_VFQF_HREGION_MAX_INDEX 7 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_0_SHIFT 0 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_0_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_0_SHIFT) +#define I40E_VFQF_HREGION_REGION_0_SHIFT 1 +#define I40E_VFQF_HREGION_REGION_0_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_0_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_1_SHIFT 4 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_1_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_1_SHIFT) +#define I40E_VFQF_HREGION_REGION_1_SHIFT 5 +#define I40E_VFQF_HREGION_REGION_1_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_1_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_2_SHIFT 8 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_2_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_2_SHIFT) +#define I40E_VFQF_HREGION_REGION_2_SHIFT 9 +#define I40E_VFQF_HREGION_REGION_2_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_2_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_3_SHIFT 12 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_3_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_3_SHIFT) +#define I40E_VFQF_HREGION_REGION_3_SHIFT 13 +#define I40E_VFQF_HREGION_REGION_3_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_3_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_4_SHIFT 16 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_4_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_4_SHIFT) +#define I40E_VFQF_HREGION_REGION_4_SHIFT 17 +#define I40E_VFQF_HREGION_REGION_4_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_4_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_5_SHIFT 20 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_5_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_5_SHIFT) +#define I40E_VFQF_HREGION_REGION_5_SHIFT 21 +#define I40E_VFQF_HREGION_REGION_5_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_5_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_6_SHIFT 24 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_6_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_6_SHIFT) +#define I40E_VFQF_HREGION_REGION_6_SHIFT 25 +#define I40E_VFQF_HREGION_REGION_6_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_6_SHIFT) +#define I40E_VFQF_HREGION_OVERRIDE_ENA_7_SHIFT 28 +#define I40E_VFQF_HREGION_OVERRIDE_ENA_7_MASK I40E_MASK(0x1, I40E_VFQF_HREGION_OVERRIDE_ENA_7_SHIFT) +#define I40E_VFQF_HREGION_REGION_7_SHIFT 29 +#define I40E_VFQF_HREGION_REGION_7_MASK I40E_MASK(0x7, I40E_VFQF_HREGION_REGION_7_SHIFT) + +#define I40E_MNGSB_FDCRC 0x000B7050 /* Reset: POR */ +#define I40E_MNGSB_FDCRC_CRC_RES_SHIFT 0 +#define I40E_MNGSB_FDCRC_CRC_RES_MASK I40E_MASK(0xFF, I40E_MNGSB_FDCRC_CRC_RES_SHIFT) +#define I40E_MNGSB_FDCS 0x000B7040 /* Reset: POR */ +#define I40E_MNGSB_FDCS_CRC_CONT_SHIFT 2 +#define I40E_MNGSB_FDCS_CRC_CONT_MASK I40E_MASK(0x1, I40E_MNGSB_FDCS_CRC_CONT_SHIFT) +#define I40E_MNGSB_FDCS_CRC_SEED_EN_SHIFT 3 +#define I40E_MNGSB_FDCS_CRC_SEED_EN_MASK I40E_MASK(0x1, I40E_MNGSB_FDCS_CRC_SEED_EN_SHIFT) +#define I40E_MNGSB_FDCS_CRC_WR_INH_SHIFT 4 +#define I40E_MNGSB_FDCS_CRC_WR_INH_MASK I40E_MASK(0x1, I40E_MNGSB_FDCS_CRC_WR_INH_SHIFT) +#define I40E_MNGSB_FDCS_CRC_SEED_SHIFT 8 +#define I40E_MNGSB_FDCS_CRC_SEED_MASK I40E_MASK(0xFF, I40E_MNGSB_FDCS_CRC_SEED_SHIFT) +#define I40E_MNGSB_FDS 0x000B7048 /* Reset: POR */ +#define I40E_MNGSB_FDS_START_BC_SHIFT 0 +#define I40E_MNGSB_FDS_START_BC_MASK I40E_MASK(0xFFF, I40E_MNGSB_FDS_START_BC_SHIFT) +#define I40E_MNGSB_FDS_LAST_BC_SHIFT 16 +#define I40E_MNGSB_FDS_LAST_BC_MASK I40E_MASK(0xFFF, 
I40E_MNGSB_FDS_LAST_BC_SHIFT) + +#define I40E_GL_VF_CTRL_RX(_VF) (0x00083600 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_GL_VF_CTRL_RX_MAX_INDEX 127 +#define I40E_GL_VF_CTRL_RX_AQ_RX_EN_SHIFT 0 +#define I40E_GL_VF_CTRL_RX_AQ_RX_EN_MASK I40E_MASK(0x1, I40E_GL_VF_CTRL_RX_AQ_RX_EN_SHIFT) +#define I40E_GL_VF_CTRL_TX(_VF) (0x00083400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: EMPR */ +#define I40E_GL_VF_CTRL_TX_MAX_INDEX 127 +#define I40E_GL_VF_CTRL_TX_AQ_TX_EN_SHIFT 0 +#define I40E_GL_VF_CTRL_TX_AQ_TX_EN_MASK I40E_MASK(0x1, I40E_GL_VF_CTRL_TX_AQ_TX_EN_SHIFT) + +#define I40E_GLCM_LAN_CACHESIZE 0x0010C4D8 /* Reset: CORER */ +#define I40E_GLCM_LAN_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLCM_LAN_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFFF, I40E_GLCM_LAN_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLCM_LAN_CACHESIZE_SETS_SHIFT 12 +#define I40E_GLCM_LAN_CACHESIZE_SETS_MASK I40E_MASK(0xF, I40E_GLCM_LAN_CACHESIZE_SETS_SHIFT) +#define I40E_GLCM_LAN_CACHESIZE_WAYS_SHIFT 16 +#define I40E_GLCM_LAN_CACHESIZE_WAYS_MASK I40E_MASK(0x3FF, I40E_GLCM_LAN_CACHESIZE_WAYS_SHIFT) +#define I40E_GLCM_PE_CACHESIZE 0x00138FE4 /* Reset: CORER */ +#define I40E_GLCM_PE_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLCM_PE_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFFF, I40E_GLCM_PE_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLCM_PE_CACHESIZE_SETS_SHIFT 12 +#define I40E_GLCM_PE_CACHESIZE_SETS_MASK I40E_MASK(0xF, I40E_GLCM_PE_CACHESIZE_SETS_SHIFT) +#define I40E_GLCM_PE_CACHESIZE_WAYS_SHIFT 16 +#define I40E_GLCM_PE_CACHESIZE_WAYS_MASK I40E_MASK(0x1FF, I40E_GLCM_PE_CACHESIZE_WAYS_SHIFT) +#define I40E_PFCM_PE_ERRDATA 0x00138D00 /* Reset: PFR */ +#define I40E_PFCM_PE_ERRDATA_ERROR_CODE_SHIFT 0 +#define I40E_PFCM_PE_ERRDATA_ERROR_CODE_MASK I40E_MASK(0xF, I40E_PFCM_PE_ERRDATA_ERROR_CODE_SHIFT) +#define I40E_PFCM_PE_ERRDATA_Q_TYPE_SHIFT 4 +#define I40E_PFCM_PE_ERRDATA_Q_TYPE_MASK I40E_MASK(0x7, I40E_PFCM_PE_ERRDATA_Q_TYPE_SHIFT) +#define I40E_PFCM_PE_ERRDATA_Q_NUM_SHIFT 8 +#define I40E_PFCM_PE_ERRDATA_Q_NUM_MASK I40E_MASK(0x3FFFF, I40E_PFCM_PE_ERRDATA_Q_NUM_SHIFT) +#define I40E_PFCM_PE_ERRINFO 0x00138C80 /* Reset: PFR */ +#define I40E_PFCM_PE_ERRINFO_ERROR_VALID_SHIFT 0 +#define I40E_PFCM_PE_ERRINFO_ERROR_VALID_MASK I40E_MASK(0x1, I40E_PFCM_PE_ERRINFO_ERROR_VALID_SHIFT) +#define I40E_PFCM_PE_ERRINFO_ERROR_INST_SHIFT 4 +#define I40E_PFCM_PE_ERRINFO_ERROR_INST_MASK I40E_MASK(0x7, I40E_PFCM_PE_ERRINFO_ERROR_INST_SHIFT) +#define I40E_PFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT 8 +#define I40E_PFCM_PE_ERRINFO_DBL_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_PE_ERRINFO_DBL_ERROR_CNT_SHIFT) +#define I40E_PFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT 16 +#define I40E_PFCM_PE_ERRINFO_RLU_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_PE_ERRINFO_RLU_ERROR_CNT_SHIFT) +#define I40E_PFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT 24 +#define I40E_PFCM_PE_ERRINFO_RLS_ERROR_CNT_MASK I40E_MASK(0xFF, I40E_PFCM_PE_ERRINFO_RLS_ERROR_CNT_SHIFT) + +#define I40E_PRTDCB_TFMSTC(_i) (0x000A0040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PRTDCB_TFMSTC_MAX_INDEX 7 +#define I40E_PRTDCB_TFMSTC_MSTC_SHIFT 0 +#define I40E_PRTDCB_TFMSTC_MSTC_MASK I40E_MASK(0xFFFFF, I40E_PRTDCB_TFMSTC_MSTC_SHIFT) +#define I40E_GL_FWSTS_FWROWD_SHIFT 8 +#define I40E_GL_FWSTS_FWROWD_MASK I40E_MASK(0x1, I40E_GL_FWSTS_FWROWD_SHIFT) +#define I40E_GLFOC_CACHESIZE 0x000AA0DC /* Reset: CORER */ +#define I40E_GLFOC_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLFOC_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFF, I40E_GLFOC_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLFOC_CACHESIZE_SETS_SHIFT 8 +#define 
I40E_GLFOC_CACHESIZE_SETS_MASK I40E_MASK(0xFFF, I40E_GLFOC_CACHESIZE_SETS_SHIFT) +#define I40E_GLFOC_CACHESIZE_WAYS_SHIFT 20 +#define I40E_GLFOC_CACHESIZE_WAYS_MASK I40E_MASK(0xF, I40E_GLFOC_CACHESIZE_WAYS_SHIFT) +#define I40E_GLHMC_APBVTINUSEBASE(_i) (0x000C4a00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_APBVTINUSEBASE_MAX_INDEX 15 +#define I40E_GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT 0 +#define I40E_GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_APBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT) +#define I40E_GLHMC_CEQPART(_i) (0x001312C0 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_CEQPART_MAX_INDEX 15 +#define I40E_GLHMC_CEQPART_PMCEQBASE_SHIFT 0 +#define I40E_GLHMC_CEQPART_PMCEQBASE_MASK I40E_MASK(0xFF, I40E_GLHMC_CEQPART_PMCEQBASE_SHIFT) +#define I40E_GLHMC_CEQPART_PMCEQSIZE_SHIFT 16 +#define I40E_GLHMC_CEQPART_PMCEQSIZE_MASK I40E_MASK(0x1FF, I40E_GLHMC_CEQPART_PMCEQSIZE_SHIFT) +#define I40E_GLHMC_DBCQMAX 0x000C20F0 /* Reset: CORER */ +#define I40E_GLHMC_DBCQMAX_GLHMC_DBCQMAX_SHIFT 0 +#define I40E_GLHMC_DBCQMAX_GLHMC_DBCQMAX_MASK I40E_MASK(0x3FFFF, I40E_GLHMC_DBCQMAX_GLHMC_DBCQMAX_SHIFT) +#define I40E_GLHMC_DBCQPART(_i) (0x00131240 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_DBCQPART_MAX_INDEX 15 +#define I40E_GLHMC_DBCQPART_PMDBCQBASE_SHIFT 0 +#define I40E_GLHMC_DBCQPART_PMDBCQBASE_MASK I40E_MASK(0x3FFF, I40E_GLHMC_DBCQPART_PMDBCQBASE_SHIFT) +#define I40E_GLHMC_DBCQPART_PMDBCQSIZE_SHIFT 16 +#define I40E_GLHMC_DBCQPART_PMDBCQSIZE_MASK I40E_MASK(0x7FFF, I40E_GLHMC_DBCQPART_PMDBCQSIZE_SHIFT) +#define I40E_GLHMC_DBQPMAX 0x000C20EC /* Reset: CORER */ +#define I40E_GLHMC_DBQPMAX_GLHMC_DBQPMAX_SHIFT 0 +#define I40E_GLHMC_DBQPMAX_GLHMC_DBQPMAX_MASK I40E_MASK(0x7FFFF, I40E_GLHMC_DBQPMAX_GLHMC_DBQPMAX_SHIFT) +#define I40E_GLHMC_DBQPPART(_i) (0x00138D80 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_DBQPPART_MAX_INDEX 15 +#define I40E_GLHMC_DBQPPART_PMDBQPBASE_SHIFT 0 +#define I40E_GLHMC_DBQPPART_PMDBQPBASE_MASK I40E_MASK(0x3FFF, I40E_GLHMC_DBQPPART_PMDBQPBASE_SHIFT) +#define I40E_GLHMC_DBQPPART_PMDBQPSIZE_SHIFT 16 +#define I40E_GLHMC_DBQPPART_PMDBQPSIZE_MASK I40E_MASK(0x7FFF, I40E_GLHMC_DBQPPART_PMDBQPSIZE_SHIFT) +#define I40E_GLHMC_PEARPBASE(_i) (0x000C4800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEARPBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEARPBASE_FPMPEARPBASE_SHIFT 0 +#define I40E_GLHMC_PEARPBASE_FPMPEARPBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEARPBASE_FPMPEARPBASE_SHIFT) +#define I40E_GLHMC_PEARPCNT(_i) (0x000C4900 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEARPCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEARPCNT_FPMPEARPCNT_SHIFT 0 +#define I40E_GLHMC_PEARPCNT_FPMPEARPCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEARPCNT_FPMPEARPCNT_SHIFT) +#define I40E_GLHMC_PEARPMAX 0x000C2038 /* Reset: CORER */ +#define I40E_GLHMC_PEARPMAX_PMPEARPMAX_SHIFT 0 +#define I40E_GLHMC_PEARPMAX_PMPEARPMAX_MASK I40E_MASK(0x1FFFF, I40E_GLHMC_PEARPMAX_PMPEARPMAX_SHIFT) +#define I40E_GLHMC_PEARPOBJSZ 0x000C2034 /* Reset: CORER */ +#define I40E_GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_SHIFT 0 +#define I40E_GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_MASK I40E_MASK(0x7, I40E_GLHMC_PEARPOBJSZ_PMPEARPOBJSZ_SHIFT) +#define I40E_GLHMC_PECQBASE(_i) (0x000C4200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PECQBASE_MAX_INDEX 15 +#define I40E_GLHMC_PECQBASE_FPMPECQBASE_SHIFT 0 +#define I40E_GLHMC_PECQBASE_FPMPECQBASE_MASK I40E_MASK(0xFFFFFF, 
I40E_GLHMC_PECQBASE_FPMPECQBASE_SHIFT) +#define I40E_GLHMC_PECQCNT(_i) (0x000C4300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PECQCNT_MAX_INDEX 15 +#define I40E_GLHMC_PECQCNT_FPMPECQCNT_SHIFT 0 +#define I40E_GLHMC_PECQCNT_FPMPECQCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PECQCNT_FPMPECQCNT_SHIFT) +#define I40E_GLHMC_PECQOBJSZ 0x000C2020 /* Reset: CORER */ +#define I40E_GLHMC_PECQOBJSZ_PMPECQOBJSZ_SHIFT 0 +#define I40E_GLHMC_PECQOBJSZ_PMPECQOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PECQOBJSZ_PMPECQOBJSZ_SHIFT) +#define I40E_GLHMC_PEHTCNT(_i) (0x000C4700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEHTCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEHTCNT_FPMPEHTCNT_SHIFT 0 +#define I40E_GLHMC_PEHTCNT_FPMPEHTCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEHTCNT_FPMPEHTCNT_SHIFT) +#define I40E_GLHMC_PEHTEBASE(_i) (0x000C4600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEHTEBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEHTEBASE_FPMPEHTEBASE_SHIFT 0 +#define I40E_GLHMC_PEHTEBASE_FPMPEHTEBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEHTEBASE_FPMPEHTEBASE_SHIFT) +#define I40E_GLHMC_PEHTEOBJSZ 0x000C202c /* Reset: CORER */ +#define I40E_GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_SHIFT 0 +#define I40E_GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PEHTEOBJSZ_PMPEHTEOBJSZ_SHIFT) +#define I40E_GLHMC_PEHTMAX 0x000C2030 /* Reset: CORER */ +#define I40E_GLHMC_PEHTMAX_PMPEHTMAX_SHIFT 0 +#define I40E_GLHMC_PEHTMAX_PMPEHTMAX_MASK I40E_MASK(0x1FFFFF, I40E_GLHMC_PEHTMAX_PMPEHTMAX_SHIFT) +#define I40E_GLHMC_PEMRBASE(_i) (0x000C4c00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEMRBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEMRBASE_FPMPEMRBASE_SHIFT 0 +#define I40E_GLHMC_PEMRBASE_FPMPEMRBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEMRBASE_FPMPEMRBASE_SHIFT) +#define I40E_GLHMC_PEMRCNT(_i) (0x000C4d00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEMRCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEMRCNT_FPMPEMRSZ_SHIFT 0 +#define I40E_GLHMC_PEMRCNT_FPMPEMRSZ_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEMRCNT_FPMPEMRSZ_SHIFT) +#define I40E_GLHMC_PEMRMAX 0x000C2040 /* Reset: CORER */ +#define I40E_GLHMC_PEMRMAX_PMPEMRMAX_SHIFT 0 +#define I40E_GLHMC_PEMRMAX_PMPEMRMAX_MASK I40E_MASK(0x7FFFFF, I40E_GLHMC_PEMRMAX_PMPEMRMAX_SHIFT) +#define I40E_GLHMC_PEMROBJSZ 0x000C203c /* Reset: CORER */ +#define I40E_GLHMC_PEMROBJSZ_PMPEMROBJSZ_SHIFT 0 +#define I40E_GLHMC_PEMROBJSZ_PMPEMROBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PEMROBJSZ_PMPEMROBJSZ_SHIFT) +#define I40E_GLHMC_PEPBLBASE(_i) (0x000C5800 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEPBLBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEPBLBASE_FPMPEPBLBASE_SHIFT 0 +#define I40E_GLHMC_PEPBLBASE_FPMPEPBLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEPBLBASE_FPMPEPBLBASE_SHIFT) +#define I40E_GLHMC_PEPBLCNT(_i) (0x000C5900 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEPBLCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEPBLCNT_FPMPEPBLCNT_SHIFT 0 +#define I40E_GLHMC_PEPBLCNT_FPMPEPBLCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEPBLCNT_FPMPEPBLCNT_SHIFT) +#define I40E_GLHMC_PEPBLMAX 0x000C206c /* Reset: CORER */ +#define I40E_GLHMC_PEPBLMAX_PMPEPBLMAX_SHIFT 0 +#define I40E_GLHMC_PEPBLMAX_PMPEPBLMAX_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEPBLMAX_PMPEPBLMAX_SHIFT) +#define I40E_GLHMC_PEPFFIRSTSD 0x000C20E4 /* Reset: CORER */ +#define I40E_GLHMC_PEPFFIRSTSD_GLHMC_PEPFFIRSTSD_SHIFT 0 +#define I40E_GLHMC_PEPFFIRSTSD_GLHMC_PEPFFIRSTSD_MASK I40E_MASK(0xFFF, 
I40E_GLHMC_PEPFFIRSTSD_GLHMC_PEPFFIRSTSD_SHIFT) +#define I40E_GLHMC_PEQ1BASE(_i) (0x000C5200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEQ1BASE_MAX_INDEX 15 +#define I40E_GLHMC_PEQ1BASE_FPMPEQ1BASE_SHIFT 0 +#define I40E_GLHMC_PEQ1BASE_FPMPEQ1BASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEQ1BASE_FPMPEQ1BASE_SHIFT) +#define I40E_GLHMC_PEQ1CNT(_i) (0x000C5300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEQ1CNT_MAX_INDEX 15 +#define I40E_GLHMC_PEQ1CNT_FPMPEQ1CNT_SHIFT 0 +#define I40E_GLHMC_PEQ1CNT_FPMPEQ1CNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEQ1CNT_FPMPEQ1CNT_SHIFT) +#define I40E_GLHMC_PEQ1FLBASE(_i) (0x000C5400 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEQ1FLBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_SHIFT 0 +#define I40E_GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEQ1FLBASE_FPMPEQ1FLBASE_SHIFT) +#define I40E_GLHMC_PEQ1FLMAX 0x000C2058 /* Reset: CORER */ +#define I40E_GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_SHIFT 0 +#define I40E_GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_MASK I40E_MASK(0x3FFFFFF, I40E_GLHMC_PEQ1FLMAX_PMPEQ1FLMAX_SHIFT) +#define I40E_GLHMC_PEQ1MAX 0x000C2054 /* Reset: CORER */ +#define I40E_GLHMC_PEQ1MAX_PMPEQ1MAX_SHIFT 0 +#define I40E_GLHMC_PEQ1MAX_PMPEQ1MAX_MASK I40E_MASK(0x3FFFFFF, I40E_GLHMC_PEQ1MAX_PMPEQ1MAX_SHIFT) +#define I40E_GLHMC_PEQ1OBJSZ 0x000C2050 /* Reset: CORER */ +#define I40E_GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_SHIFT 0 +#define I40E_GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PEQ1OBJSZ_PMPEQ1OBJSZ_SHIFT) +#define I40E_GLHMC_PEQPBASE(_i) (0x000C4000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEQPBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEQPBASE_FPMPEQPBASE_SHIFT 0 +#define I40E_GLHMC_PEQPBASE_FPMPEQPBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEQPBASE_FPMPEQPBASE_SHIFT) +#define I40E_GLHMC_PEQPCNT(_i) (0x000C4100 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEQPCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEQPCNT_FPMPEQPCNT_SHIFT 0 +#define I40E_GLHMC_PEQPCNT_FPMPEQPCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEQPCNT_FPMPEQPCNT_SHIFT) +#define I40E_GLHMC_PEQPOBJSZ 0x000C201c /* Reset: CORER */ +#define I40E_GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_SHIFT 0 +#define I40E_GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PEQPOBJSZ_PMPEQPOBJSZ_SHIFT) +#define I40E_GLHMC_PESRQBASE(_i) (0x000C4400 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PESRQBASE_MAX_INDEX 15 +#define I40E_GLHMC_PESRQBASE_FPMPESRQBASE_SHIFT 0 +#define I40E_GLHMC_PESRQBASE_FPMPESRQBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PESRQBASE_FPMPESRQBASE_SHIFT) +#define I40E_GLHMC_PESRQCNT(_i) (0x000C4500 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PESRQCNT_MAX_INDEX 15 +#define I40E_GLHMC_PESRQCNT_FPMPESRQCNT_SHIFT 0 +#define I40E_GLHMC_PESRQCNT_FPMPESRQCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PESRQCNT_FPMPESRQCNT_SHIFT) +#define I40E_GLHMC_PESRQMAX 0x000C2028 /* Reset: CORER */ +#define I40E_GLHMC_PESRQMAX_PMPESRQMAX_SHIFT 0 +#define I40E_GLHMC_PESRQMAX_PMPESRQMAX_MASK I40E_MASK(0xFFFF, I40E_GLHMC_PESRQMAX_PMPESRQMAX_SHIFT) +#define I40E_GLHMC_PESRQOBJSZ 0x000C2024 /* Reset: CORER */ +#define I40E_GLHMC_PESRQOBJSZ_PMPESRQOBJSZ_SHIFT 0 +#define I40E_GLHMC_PESRQOBJSZ_PMPESRQOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PESRQOBJSZ_PMPESRQOBJSZ_SHIFT) +#define I40E_GLHMC_PETIMERBASE(_i) (0x000C5A00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PETIMERBASE_MAX_INDEX 15 +#define 
I40E_GLHMC_PETIMERBASE_FPMPETIMERBASE_SHIFT 0 +#define I40E_GLHMC_PETIMERBASE_FPMPETIMERBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PETIMERBASE_FPMPETIMERBASE_SHIFT) +#define I40E_GLHMC_PETIMERCNT(_i) (0x000C5B00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PETIMERCNT_MAX_INDEX 15 +#define I40E_GLHMC_PETIMERCNT_FPMPETIMERCNT_SHIFT 0 +#define I40E_GLHMC_PETIMERCNT_FPMPETIMERCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PETIMERCNT_FPMPETIMERCNT_SHIFT) +#define I40E_GLHMC_PETIMERMAX 0x000C2084 /* Reset: CORER */ +#define I40E_GLHMC_PETIMERMAX_PMPETIMERMAX_SHIFT 0 +#define I40E_GLHMC_PETIMERMAX_PMPETIMERMAX_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PETIMERMAX_PMPETIMERMAX_SHIFT) +#define I40E_GLHMC_PETIMEROBJSZ 0x000C2080 /* Reset: CORER */ +#define I40E_GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_SHIFT 0 +#define I40E_GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PETIMEROBJSZ_PMPETIMEROBJSZ_SHIFT) +#define I40E_GLHMC_PEXFBASE(_i) (0x000C4e00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEXFBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEXFBASE_FPMPEXFBASE_SHIFT 0 +#define I40E_GLHMC_PEXFBASE_FPMPEXFBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEXFBASE_FPMPEXFBASE_SHIFT) +#define I40E_GLHMC_PEXFCNT(_i) (0x000C4f00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEXFCNT_MAX_INDEX 15 +#define I40E_GLHMC_PEXFCNT_FPMPEXFCNT_SHIFT 0 +#define I40E_GLHMC_PEXFCNT_FPMPEXFCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_PEXFCNT_FPMPEXFCNT_SHIFT) +#define I40E_GLHMC_PEXFFLBASE(_i) (0x000C5000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PEXFFLBASE_MAX_INDEX 15 +#define I40E_GLHMC_PEXFFLBASE_FPMPEXFFLBASE_SHIFT 0 +#define I40E_GLHMC_PEXFFLBASE_FPMPEXFFLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_PEXFFLBASE_FPMPEXFFLBASE_SHIFT) +#define I40E_GLHMC_PEXFFLMAX 0x000C204c /* Reset: CORER */ +#define I40E_GLHMC_PEXFFLMAX_PMPEXFFLMAX_SHIFT 0 +#define I40E_GLHMC_PEXFFLMAX_PMPEXFFLMAX_MASK I40E_MASK(0x1FFFFFF, I40E_GLHMC_PEXFFLMAX_PMPEXFFLMAX_SHIFT) +#define I40E_GLHMC_PEXFMAX 0x000C2048 /* Reset: CORER */ +#define I40E_GLHMC_PEXFMAX_PMPEXFMAX_SHIFT 0 +#define I40E_GLHMC_PEXFMAX_PMPEXFMAX_MASK I40E_MASK(0x3FFFFFF, I40E_GLHMC_PEXFMAX_PMPEXFMAX_SHIFT) +#define I40E_GLHMC_PEXFOBJSZ 0x000C2044 /* Reset: CORER */ +#define I40E_GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_SHIFT 0 +#define I40E_GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_MASK I40E_MASK(0xF, I40E_GLHMC_PEXFOBJSZ_PMPEXFOBJSZ_SHIFT) +#define I40E_GLHMC_PFPESDPART(_i) (0x000C0880 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLHMC_PFPESDPART_MAX_INDEX 15 +#define I40E_GLHMC_PFPESDPART_PMSDBASE_SHIFT 0 +#define I40E_GLHMC_PFPESDPART_PMSDBASE_MASK I40E_MASK(0xFFF, I40E_GLHMC_PFPESDPART_PMSDBASE_SHIFT) +#define I40E_GLHMC_PFPESDPART_PMSDSIZE_SHIFT 16 +#define I40E_GLHMC_PFPESDPART_PMSDSIZE_MASK I40E_MASK(0x1FFF, I40E_GLHMC_PFPESDPART_PMSDSIZE_SHIFT) +#define I40E_GLHMC_VFAPBVTINUSEBASE(_i) (0x000Cca00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFAPBVTINUSEBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT 0 +#define I40E_GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFAPBVTINUSEBASE_FPMAPBINUSEBASE_SHIFT) +#define I40E_GLHMC_VFCEQPART(_i) (0x00132240 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFCEQPART_MAX_INDEX 31 +#define I40E_GLHMC_VFCEQPART_PMCEQBASE_SHIFT 0 +#define I40E_GLHMC_VFCEQPART_PMCEQBASE_MASK I40E_MASK(0xFF, I40E_GLHMC_VFCEQPART_PMCEQBASE_SHIFT) +#define I40E_GLHMC_VFCEQPART_PMCEQSIZE_SHIFT 
16 +#define I40E_GLHMC_VFCEQPART_PMCEQSIZE_MASK I40E_MASK(0x1FF, I40E_GLHMC_VFCEQPART_PMCEQSIZE_SHIFT) +#define I40E_GLHMC_VFDBCQPART(_i) (0x00132140 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFDBCQPART_MAX_INDEX 31 +#define I40E_GLHMC_VFDBCQPART_PMDBCQBASE_SHIFT 0 +#define I40E_GLHMC_VFDBCQPART_PMDBCQBASE_MASK I40E_MASK(0x3FFF, I40E_GLHMC_VFDBCQPART_PMDBCQBASE_SHIFT) +#define I40E_GLHMC_VFDBCQPART_PMDBCQSIZE_SHIFT 16 +#define I40E_GLHMC_VFDBCQPART_PMDBCQSIZE_MASK I40E_MASK(0x7FFF, I40E_GLHMC_VFDBCQPART_PMDBCQSIZE_SHIFT) +#define I40E_GLHMC_VFDBQPPART(_i) (0x00138E00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFDBQPPART_MAX_INDEX 31 +#define I40E_GLHMC_VFDBQPPART_PMDBQPBASE_SHIFT 0 +#define I40E_GLHMC_VFDBQPPART_PMDBQPBASE_MASK I40E_MASK(0x3FFF, I40E_GLHMC_VFDBQPPART_PMDBQPBASE_SHIFT) +#define I40E_GLHMC_VFDBQPPART_PMDBQPSIZE_SHIFT 16 +#define I40E_GLHMC_VFDBQPPART_PMDBQPSIZE_MASK I40E_MASK(0x7FFF, I40E_GLHMC_VFDBQPPART_PMDBQPSIZE_SHIFT) +#define I40E_GLHMC_VFFSIAVBASE(_i) (0x000Cd600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFFSIAVBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFFSIAVBASE_FPMFSIAVBASE_SHIFT 0 +#define I40E_GLHMC_VFFSIAVBASE_FPMFSIAVBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFFSIAVBASE_FPMFSIAVBASE_SHIFT) +#define I40E_GLHMC_VFFSIAVCNT(_i) (0x000Cd700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFFSIAVCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFFSIAVCNT_FPMFSIAVCNT_SHIFT 0 +#define I40E_GLHMC_VFFSIAVCNT_FPMFSIAVCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFFSIAVCNT_FPMFSIAVCNT_SHIFT) +#define I40E_GLHMC_VFPDINV(_i) (0x000C8300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPDINV_MAX_INDEX 31 +#define I40E_GLHMC_VFPDINV_PMSDIDX_SHIFT 0 +#define I40E_GLHMC_VFPDINV_PMSDIDX_MASK I40E_MASK(0xFFF, I40E_GLHMC_VFPDINV_PMSDIDX_SHIFT) +#define I40E_GLHMC_VFPDINV_PMSDPARTSEL_SHIFT 15 +#define I40E_GLHMC_VFPDINV_PMSDPARTSEL_MASK I40E_MASK(0x1, I40E_GLHMC_VFPDINV_PMSDPARTSEL_SHIFT) +#define I40E_GLHMC_VFPDINV_PMPDIDX_SHIFT 16 +#define I40E_GLHMC_VFPDINV_PMPDIDX_MASK I40E_MASK(0x1FF, I40E_GLHMC_VFPDINV_PMPDIDX_SHIFT) +#define I40E_GLHMC_VFPEARPBASE(_i) (0x000Cc800 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEARPBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEARPBASE_FPMPEARPBASE_SHIFT 0 +#define I40E_GLHMC_VFPEARPBASE_FPMPEARPBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEARPBASE_FPMPEARPBASE_SHIFT) +#define I40E_GLHMC_VFPEARPCNT(_i) (0x000Cc900 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEARPCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEARPCNT_FPMPEARPCNT_SHIFT 0 +#define I40E_GLHMC_VFPEARPCNT_FPMPEARPCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEARPCNT_FPMPEARPCNT_SHIFT) +#define I40E_GLHMC_VFPECQBASE(_i) (0x000Cc200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPECQBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPECQBASE_FPMPECQBASE_SHIFT 0 +#define I40E_GLHMC_VFPECQBASE_FPMPECQBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPECQBASE_FPMPECQBASE_SHIFT) +#define I40E_GLHMC_VFPECQCNT(_i) (0x000Cc300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPECQCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPECQCNT_FPMPECQCNT_SHIFT 0 +#define I40E_GLHMC_VFPECQCNT_FPMPECQCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPECQCNT_FPMPECQCNT_SHIFT) +#define I40E_GLHMC_VFPEHTCNT(_i) (0x000Cc700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEHTCNT_MAX_INDEX 31 +#define 
I40E_GLHMC_VFPEHTCNT_FPMPEHTCNT_SHIFT 0 +#define I40E_GLHMC_VFPEHTCNT_FPMPEHTCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEHTCNT_FPMPEHTCNT_SHIFT) +#define I40E_GLHMC_VFPEHTEBASE(_i) (0x000Cc600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEHTEBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEHTEBASE_FPMPEHTEBASE_SHIFT 0 +#define I40E_GLHMC_VFPEHTEBASE_FPMPEHTEBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEHTEBASE_FPMPEHTEBASE_SHIFT) +#define I40E_GLHMC_VFPEMRBASE(_i) (0x000Ccc00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEMRBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEMRBASE_FPMPEMRBASE_SHIFT 0 +#define I40E_GLHMC_VFPEMRBASE_FPMPEMRBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEMRBASE_FPMPEMRBASE_SHIFT) +#define I40E_GLHMC_VFPEMRCNT(_i) (0x000Ccd00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEMRCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEMRCNT_FPMPEMRSZ_SHIFT 0 +#define I40E_GLHMC_VFPEMRCNT_FPMPEMRSZ_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEMRCNT_FPMPEMRSZ_SHIFT) +#define I40E_GLHMC_VFPEPBLBASE(_i) (0x000Cd800 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEPBLBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEPBLBASE_FPMPEPBLBASE_SHIFT 0 +#define I40E_GLHMC_VFPEPBLBASE_FPMPEPBLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEPBLBASE_FPMPEPBLBASE_SHIFT) +#define I40E_GLHMC_VFPEPBLCNT(_i) (0x000Cd900 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEPBLCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEPBLCNT_FPMPEPBLCNT_SHIFT 0 +#define I40E_GLHMC_VFPEPBLCNT_FPMPEPBLCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEPBLCNT_FPMPEPBLCNT_SHIFT) +#define I40E_GLHMC_VFPEQ1BASE(_i) (0x000Cd200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEQ1BASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEQ1BASE_FPMPEQ1BASE_SHIFT 0 +#define I40E_GLHMC_VFPEQ1BASE_FPMPEQ1BASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEQ1BASE_FPMPEQ1BASE_SHIFT) +#define I40E_GLHMC_VFPEQ1CNT(_i) (0x000Cd300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEQ1CNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEQ1CNT_FPMPEQ1CNT_SHIFT 0 +#define I40E_GLHMC_VFPEQ1CNT_FPMPEQ1CNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEQ1CNT_FPMPEQ1CNT_SHIFT) +#define I40E_GLHMC_VFPEQ1FLBASE(_i) (0x000Cd400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEQ1FLBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_SHIFT 0 +#define I40E_GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEQ1FLBASE_FPMPEQ1FLBASE_SHIFT) +#define I40E_GLHMC_VFPEQPBASE(_i) (0x000Cc000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEQPBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEQPBASE_FPMPEQPBASE_SHIFT 0 +#define I40E_GLHMC_VFPEQPBASE_FPMPEQPBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEQPBASE_FPMPEQPBASE_SHIFT) +#define I40E_GLHMC_VFPEQPCNT(_i) (0x000Cc100 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEQPCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEQPCNT_FPMPEQPCNT_SHIFT 0 +#define I40E_GLHMC_VFPEQPCNT_FPMPEQPCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEQPCNT_FPMPEQPCNT_SHIFT) +#define I40E_GLHMC_VFPESRQBASE(_i) (0x000Cc400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPESRQBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPESRQBASE_FPMPESRQBASE_SHIFT 0 +#define I40E_GLHMC_VFPESRQBASE_FPMPESRQBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPESRQBASE_FPMPESRQBASE_SHIFT) +#define I40E_GLHMC_VFPESRQCNT(_i) (0x000Cc500 + ((_i) * 4)) /* _i=0...31 */ /* Reset: 
CORER */ +#define I40E_GLHMC_VFPESRQCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPESRQCNT_FPMPESRQCNT_SHIFT 0 +#define I40E_GLHMC_VFPESRQCNT_FPMPESRQCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPESRQCNT_FPMPESRQCNT_SHIFT) +#define I40E_GLHMC_VFPETIMERBASE(_i) (0x000CDA00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPETIMERBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPETIMERBASE_FPMPETIMERBASE_SHIFT 0 +#define I40E_GLHMC_VFPETIMERBASE_FPMPETIMERBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPETIMERBASE_FPMPETIMERBASE_SHIFT) +#define I40E_GLHMC_VFPETIMERCNT(_i) (0x000CDB00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPETIMERCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPETIMERCNT_FPMPETIMERCNT_SHIFT 0 +#define I40E_GLHMC_VFPETIMERCNT_FPMPETIMERCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPETIMERCNT_FPMPETIMERCNT_SHIFT) +#define I40E_GLHMC_VFPEXFBASE(_i) (0x000Cce00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEXFBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEXFBASE_FPMPEXFBASE_SHIFT 0 +#define I40E_GLHMC_VFPEXFBASE_FPMPEXFBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEXFBASE_FPMPEXFBASE_SHIFT) +#define I40E_GLHMC_VFPEXFCNT(_i) (0x000Ccf00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEXFCNT_MAX_INDEX 31 +#define I40E_GLHMC_VFPEXFCNT_FPMPEXFCNT_SHIFT 0 +#define I40E_GLHMC_VFPEXFCNT_FPMPEXFCNT_MASK I40E_MASK(0x1FFFFFFF, I40E_GLHMC_VFPEXFCNT_FPMPEXFCNT_SHIFT) +#define I40E_GLHMC_VFPEXFFLBASE(_i) (0x000Cd000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFPEXFFLBASE_MAX_INDEX 31 +#define I40E_GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_SHIFT 0 +#define I40E_GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_MASK I40E_MASK(0xFFFFFF, I40E_GLHMC_VFPEXFFLBASE_FPMPEXFFLBASE_SHIFT) +#define I40E_GLHMC_VFSDPART(_i) (0x000C8800 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLHMC_VFSDPART_MAX_INDEX 31 +#define I40E_GLHMC_VFSDPART_PMSDBASE_SHIFT 0 +#define I40E_GLHMC_VFSDPART_PMSDBASE_MASK I40E_MASK(0xFFF, I40E_GLHMC_VFSDPART_PMSDBASE_SHIFT) +#define I40E_GLHMC_VFSDPART_PMSDSIZE_SHIFT 16 +#define I40E_GLHMC_VFSDPART_PMSDSIZE_MASK I40E_MASK(0x1FFF, I40E_GLHMC_VFSDPART_PMSDSIZE_SHIFT) +#define I40E_GLPBLOC_CACHESIZE 0x000A80BC /* Reset: CORER */ +#define I40E_GLPBLOC_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLPBLOC_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFF, I40E_GLPBLOC_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLPBLOC_CACHESIZE_SETS_SHIFT 8 +#define I40E_GLPBLOC_CACHESIZE_SETS_MASK I40E_MASK(0xFFF, I40E_GLPBLOC_CACHESIZE_SETS_SHIFT) +#define I40E_GLPBLOC_CACHESIZE_WAYS_SHIFT 20 +#define I40E_GLPBLOC_CACHESIZE_WAYS_MASK I40E_MASK(0xF, I40E_GLPBLOC_CACHESIZE_WAYS_SHIFT) +#define I40E_GLPDOC_CACHESIZE 0x000D0088 /* Reset: CORER */ +#define I40E_GLPDOC_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLPDOC_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFF, I40E_GLPDOC_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLPDOC_CACHESIZE_SETS_SHIFT 8 +#define I40E_GLPDOC_CACHESIZE_SETS_MASK I40E_MASK(0xFFF, I40E_GLPDOC_CACHESIZE_SETS_SHIFT) +#define I40E_GLPDOC_CACHESIZE_WAYS_SHIFT 20 +#define I40E_GLPDOC_CACHESIZE_WAYS_MASK I40E_MASK(0xF, I40E_GLPDOC_CACHESIZE_WAYS_SHIFT) +#define I40E_GLPEOC_CACHESIZE 0x000A60E8 /* Reset: CORER */ +#define I40E_GLPEOC_CACHESIZE_WORD_SIZE_SHIFT 0 +#define I40E_GLPEOC_CACHESIZE_WORD_SIZE_MASK I40E_MASK(0xFF, I40E_GLPEOC_CACHESIZE_WORD_SIZE_SHIFT) +#define I40E_GLPEOC_CACHESIZE_SETS_SHIFT 8 +#define I40E_GLPEOC_CACHESIZE_SETS_MASK I40E_MASK(0xFFF, I40E_GLPEOC_CACHESIZE_SETS_SHIFT) +#define I40E_GLPEOC_CACHESIZE_WAYS_SHIFT 20 
+#define I40E_GLPEOC_CACHESIZE_WAYS_MASK I40E_MASK(0xF, I40E_GLPEOC_CACHESIZE_WAYS_SHIFT) +#define I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT 15 +#define I40E_PFHMC_PDINV_PMSDPARTSEL_MASK I40E_MASK(0x1, I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT) +#define I40E_PFHMC_SDCMD_PMSDPARTSEL_SHIFT 15 +#define I40E_PFHMC_SDCMD_PMSDPARTSEL_MASK I40E_MASK(0x1, I40E_PFHMC_SDCMD_PMSDPARTSEL_SHIFT) +#define I40E_GL_PPRS_SPARE 0x000856E0 /* Reset: CORER */ +#define I40E_GL_PPRS_SPARE_GL_PPRS_SPARE_SHIFT 0 +#define I40E_GL_PPRS_SPARE_GL_PPRS_SPARE_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_PPRS_SPARE_GL_PPRS_SPARE_SHIFT) +#define I40E_GL_TLAN_SPARE 0x000E64E0 /* Reset: CORER */ +#define I40E_GL_TLAN_SPARE_GL_TLAN_SPARE_SHIFT 0 +#define I40E_GL_TLAN_SPARE_GL_TLAN_SPARE_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_TLAN_SPARE_GL_TLAN_SPARE_SHIFT) +#define I40E_GL_TUPM_SPARE 0x000a2230 /* Reset: CORER */ +#define I40E_GL_TUPM_SPARE_GL_TUPM_SPARE_SHIFT 0 +#define I40E_GL_TUPM_SPARE_GL_TUPM_SPARE_MASK I40E_MASK(0xFFFFFFFF, I40E_GL_TUPM_SPARE_GL_TUPM_SPARE_SHIFT) +#define I40E_GLGEN_CAR_DEBUG 0x000B81C0 /* Reset: POR */ +#define I40E_GLGEN_CAR_DEBUG_CAR_UPPER_CORE_CLK_EN_SHIFT 0 +#define I40E_GLGEN_CAR_DEBUG_CAR_UPPER_CORE_CLK_EN_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_UPPER_CORE_CLK_EN_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_HIU_CLK_EN_SHIFT 1 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_HIU_CLK_EN_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_HIU_CLK_EN_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PE_CLK_EN_SHIFT 2 +#define I40E_GLGEN_CAR_DEBUG_CAR_PE_CLK_EN_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PE_CLK_EN_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_PRIM_CLK_ACTIVE_SHIFT 3 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_PRIM_CLK_ACTIVE_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_PRIM_CLK_ACTIVE_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CDC_PE_ACTIVE_SHIFT 4 +#define I40E_GLGEN_CAR_DEBUG_CDC_PE_ACTIVE_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CDC_PE_ACTIVE_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_PRST_RESET_N_SHIFT 5 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_PRST_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_PRST_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_SCLR_RESET_N_SHIFT 6 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_SCLR_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_SCLR_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IB_RESET_N_SHIFT 7 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IB_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IB_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IMIB_RESET_N_SHIFT 8 +#define I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IMIB_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_PCIE_RAW_IMIB_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_EMP_RESET_N_SHIFT 9 +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_EMP_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_RAW_EMP_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_GLOBAL_RESET_N_SHIFT 10 +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_GLOBAL_RESET_N_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_RAW_GLOBAL_RESET_N_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_LAN_POWER_GOOD_SHIFT 11 +#define I40E_GLGEN_CAR_DEBUG_CAR_RAW_LAN_POWER_GOOD_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CAR_RAW_LAN_POWER_GOOD_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_CDC_IOSF_PRIMERY_RST_B_SHIFT 12 +#define I40E_GLGEN_CAR_DEBUG_CDC_IOSF_PRIMERY_RST_B_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_CDC_IOSF_PRIMERY_RST_B_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_GBE_GLOBALRST_B_SHIFT 13 +#define 
I40E_GLGEN_CAR_DEBUG_GBE_GLOBALRST_B_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_GBE_GLOBALRST_B_SHIFT) +#define I40E_GLGEN_CAR_DEBUG_FLEEP_AL_GLOBR_DONE_SHIFT 14 +#define I40E_GLGEN_CAR_DEBUG_FLEEP_AL_GLOBR_DONE_MASK I40E_MASK(0x1, I40E_GLGEN_CAR_DEBUG_FLEEP_AL_GLOBR_DONE_SHIFT) +#define I40E_GLGEN_MISC_SPARE 0x000880E0 /* Reset: POR */ +#define I40E_GLGEN_MISC_SPARE_GLGEN_MISC_SPARE_SHIFT 0 +#define I40E_GLGEN_MISC_SPARE_GLGEN_MISC_SPARE_MASK I40E_MASK(0xFFFFFFFF, I40E_GLGEN_MISC_SPARE_GLGEN_MISC_SPARE_SHIFT) +#define I40E_GL_UFUSE_SOC 0x000BE550 /* Reset: POR */ +#define I40E_GL_UFUSE_SOC_PORT_MODE_SHIFT 0 +#define I40E_GL_UFUSE_SOC_PORT_MODE_MASK I40E_MASK(0x3, I40E_GL_UFUSE_SOC_PORT_MODE_SHIFT) +#define I40E_GL_UFUSE_SOC_NIC_ID_SHIFT 2 +#define I40E_GL_UFUSE_SOC_NIC_ID_MASK I40E_MASK(0x1, I40E_GL_UFUSE_SOC_NIC_ID_SHIFT) +#define I40E_GL_UFUSE_SOC_SPARE_FUSES_SHIFT 3 +#define I40E_GL_UFUSE_SOC_SPARE_FUSES_MASK I40E_MASK(0x1FFF, I40E_GL_UFUSE_SOC_SPARE_FUSES_SHIFT) +#define I40E_PFINT_DYN_CTL0_WB_ON_ITR_SHIFT 30 +#define I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTL0_WB_ON_ITR_SHIFT) +#define I40E_PFINT_DYN_CTLN_WB_ON_ITR_SHIFT 30 +#define I40E_PFINT_DYN_CTLN_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_WB_ON_ITR_SHIFT) +#define I40E_VFINT_DYN_CTL0_WB_ON_ITR_SHIFT 30 +#define I40E_VFINT_DYN_CTL0_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL0_WB_ON_ITR_SHIFT) +#define I40E_VFINT_DYN_CTLN_WB_ON_ITR_SHIFT 30 +#define I40E_VFINT_DYN_CTLN_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN_WB_ON_ITR_SHIFT) +#define I40E_VPLAN_QBASE(_VF) (0x00074800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VPLAN_QBASE_MAX_INDEX 127 +#define I40E_VPLAN_QBASE_VFFIRSTQ_SHIFT 0 +#define I40E_VPLAN_QBASE_VFFIRSTQ_MASK I40E_MASK(0x7FF, I40E_VPLAN_QBASE_VFFIRSTQ_SHIFT) +#define I40E_VPLAN_QBASE_VFNUMQ_SHIFT 11 +#define I40E_VPLAN_QBASE_VFNUMQ_MASK I40E_MASK(0xFF, I40E_VPLAN_QBASE_VFNUMQ_SHIFT) +#define I40E_VPLAN_QBASE_VFQTABLE_ENA_SHIFT 31 +#define I40E_VPLAN_QBASE_VFQTABLE_ENA_MASK I40E_MASK(0x1, I40E_VPLAN_QBASE_VFQTABLE_ENA_SHIFT) +#define I40E_PRTMAC_LINK_DOWN_COUNTER 0x001E2440 /* Reset: GLOBR */ +#define I40E_PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_SHIFT 0 +#define I40E_PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_MASK I40E_MASK(0xFFFF, I40E_PRTMAC_LINK_DOWN_COUNTER_LINK_DOWN_COUNTER_SHIFT) +#define I40E_GLNVM_AL_REQ 0x000B6164 /* Reset: POR */ +#define I40E_GLNVM_AL_REQ_POR_SHIFT 0 +#define I40E_GLNVM_AL_REQ_POR_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_POR_SHIFT) +#define I40E_GLNVM_AL_REQ_PCIE_IMIB_SHIFT 1 +#define I40E_GLNVM_AL_REQ_PCIE_IMIB_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_PCIE_IMIB_SHIFT) +#define I40E_GLNVM_AL_REQ_GLOBR_SHIFT 2 +#define I40E_GLNVM_AL_REQ_GLOBR_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_GLOBR_SHIFT) +#define I40E_GLNVM_AL_REQ_CORER_SHIFT 3 +#define I40E_GLNVM_AL_REQ_CORER_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_CORER_SHIFT) +#define I40E_GLNVM_AL_REQ_PE_SHIFT 4 +#define I40E_GLNVM_AL_REQ_PE_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_PE_SHIFT) +#define I40E_GLNVM_AL_REQ_PCIE_IMIB_ASSERT_SHIFT 5 +#define I40E_GLNVM_AL_REQ_PCIE_IMIB_ASSERT_MASK I40E_MASK(0x1, I40E_GLNVM_AL_REQ_PCIE_IMIB_ASSERT_SHIFT) +#define I40E_GLNVM_ALTIMERS 0x000B6140 /* Reset: POR */ +#define I40E_GLNVM_ALTIMERS_PCI_ALTIMER_SHIFT 0 +#define I40E_GLNVM_ALTIMERS_PCI_ALTIMER_MASK I40E_MASK(0xFFF, I40E_GLNVM_ALTIMERS_PCI_ALTIMER_SHIFT) +#define I40E_GLNVM_ALTIMERS_GEN_ALTIMER_SHIFT 12 +#define I40E_GLNVM_ALTIMERS_GEN_ALTIMER_MASK I40E_MASK(0xFFFFF, 
I40E_GLNVM_ALTIMERS_GEN_ALTIMER_SHIFT) +#define I40E_GLNVM_FLA 0x000B6108 /* Reset: POR */ +#define I40E_GLNVM_FLA_LOCKED_SHIFT 6 +#define I40E_GLNVM_FLA_LOCKED_MASK I40E_MASK(0x1, I40E_GLNVM_FLA_LOCKED_SHIFT) + +#define I40E_GLNVM_ULD 0x000B6008 /* Reset: POR */ +#define I40E_GLNVM_ULD_PCIER_DONE_SHIFT 0 +#define I40E_GLNVM_ULD_PCIER_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_PCIER_DONE_SHIFT) +#define I40E_GLNVM_ULD_PCIER_DONE_1_SHIFT 1 +#define I40E_GLNVM_ULD_PCIER_DONE_1_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_PCIER_DONE_1_SHIFT) +#define I40E_GLNVM_ULD_CORER_DONE_SHIFT 3 +#define I40E_GLNVM_ULD_CORER_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_CORER_DONE_SHIFT) +#define I40E_GLNVM_ULD_GLOBR_DONE_SHIFT 4 +#define I40E_GLNVM_ULD_GLOBR_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_GLOBR_DONE_SHIFT) +#define I40E_GLNVM_ULD_POR_DONE_SHIFT 5 +#define I40E_GLNVM_ULD_POR_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_POR_DONE_SHIFT) +#define I40E_GLNVM_ULD_POR_DONE_1_SHIFT 8 +#define I40E_GLNVM_ULD_POR_DONE_1_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_POR_DONE_1_SHIFT) +#define I40E_GLNVM_ULD_PCIER_DONE_2_SHIFT 9 +#define I40E_GLNVM_ULD_PCIER_DONE_2_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_PCIER_DONE_2_SHIFT) +#define I40E_GLNVM_ULD_PE_DONE_SHIFT 10 +#define I40E_GLNVM_ULD_PE_DONE_MASK I40E_MASK(0x1, I40E_GLNVM_ULD_PE_DONE_SHIFT) +#define I40E_GLNVM_ULT 0x000B6154 /* Reset: POR */ +#define I40E_GLNVM_ULT_CONF_PCIR_AE_SHIFT 0 +#define I40E_GLNVM_ULT_CONF_PCIR_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_PCIR_AE_SHIFT) +#define I40E_GLNVM_ULT_CONF_PCIRTL_AE_SHIFT 1 +#define I40E_GLNVM_ULT_CONF_PCIRTL_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_PCIRTL_AE_SHIFT) +#define I40E_GLNVM_ULT_RESERVED_1_SHIFT 2 +#define I40E_GLNVM_ULT_RESERVED_1_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_RESERVED_1_SHIFT) +#define I40E_GLNVM_ULT_CONF_CORE_AE_SHIFT 3 +#define I40E_GLNVM_ULT_CONF_CORE_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_CORE_AE_SHIFT) +#define I40E_GLNVM_ULT_CONF_GLOBAL_AE_SHIFT 4 +#define I40E_GLNVM_ULT_CONF_GLOBAL_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_GLOBAL_AE_SHIFT) +#define I40E_GLNVM_ULT_CONF_POR_AE_SHIFT 5 +#define I40E_GLNVM_ULT_CONF_POR_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_POR_AE_SHIFT) +#define I40E_GLNVM_ULT_RESERVED_2_SHIFT 6 +#define I40E_GLNVM_ULT_RESERVED_2_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_RESERVED_2_SHIFT) +#define I40E_GLNVM_ULT_RESERVED_3_SHIFT 7 +#define I40E_GLNVM_ULT_RESERVED_3_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_RESERVED_3_SHIFT) +#define I40E_GLNVM_ULT_CONF_EMP_AE_SHIFT 8 +#define I40E_GLNVM_ULT_CONF_EMP_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_EMP_AE_SHIFT) +#define I40E_GLNVM_ULT_CONF_PCIALT_AE_SHIFT 9 +#define I40E_GLNVM_ULT_CONF_PCIALT_AE_MASK I40E_MASK(0x1, I40E_GLNVM_ULT_CONF_PCIALT_AE_SHIFT) +#define I40E_GLNVM_ULT_RESERVED_4_SHIFT 10 +#define I40E_GLNVM_ULT_RESERVED_4_MASK I40E_MASK(0x3FFFFF, I40E_GLNVM_ULT_RESERVED_4_SHIFT) +#define I40E_MEM_INIT_DONE_STAT 0x000B615C /* Reset: POR */ +#define I40E_MEM_INIT_DONE_STAT_CMLAN_MEM_INIT_DONE_SHIFT 0 +#define I40E_MEM_INIT_DONE_STAT_CMLAN_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_CMLAN_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_PMAT_MEM_INIT_DONE_SHIFT 1 +#define I40E_MEM_INIT_DONE_STAT_PMAT_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_PMAT_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_RCU_MEM_INIT_DONE_SHIFT 2 +#define I40E_MEM_INIT_DONE_STAT_RCU_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RCU_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_TDPU_MEM_INIT_DONE_SHIFT 3 +#define 
I40E_MEM_INIT_DONE_STAT_TDPU_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TDPU_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_TLAN_MEM_INIT_DONE_SHIFT 4 +#define I40E_MEM_INIT_DONE_STAT_TLAN_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TLAN_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_RLAN_MEM_INIT_DONE_SHIFT 5 +#define I40E_MEM_INIT_DONE_STAT_RLAN_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RLAN_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_RDPU_MEM_INIT_DONE_SHIFT 6 +#define I40E_MEM_INIT_DONE_STAT_RDPU_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RDPU_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_PPRS_MEM_INIT_DONE_SHIFT 7 +#define I40E_MEM_INIT_DONE_STAT_PPRS_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_PPRS_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_RPB_MEM_INIT_DONE_SHIFT 8 +#define I40E_MEM_INIT_DONE_STAT_RPB_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RPB_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_TPB_MEM_INIT_DONE_SHIFT 9 +#define I40E_MEM_INIT_DONE_STAT_TPB_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TPB_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_FOC_MEM_INIT_DONE_SHIFT 10 +#define I40E_MEM_INIT_DONE_STAT_FOC_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_FOC_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_TSCD_MEM_INIT_DONE_SHIFT 11 +#define I40E_MEM_INIT_DONE_STAT_TSCD_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TSCD_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_TCB_MEM_INIT_DONE_SHIFT 12 +#define I40E_MEM_INIT_DONE_STAT_TCB_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_TCB_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_RCB_MEM_INIT_DONE_SHIFT 13 +#define I40E_MEM_INIT_DONE_STAT_RCB_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_RCB_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_WUC_MEM_INIT_DONE_SHIFT 14 +#define I40E_MEM_INIT_DONE_STAT_WUC_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_WUC_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_STAT_MEM_INIT_DONE_SHIFT 15 +#define I40E_MEM_INIT_DONE_STAT_STAT_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_STAT_MEM_INIT_DONE_SHIFT) +#define I40E_MEM_INIT_DONE_STAT_ITR_MEM_INIT_DONE_SHIFT 16 +#define I40E_MEM_INIT_DONE_STAT_ITR_MEM_INIT_DONE_MASK I40E_MASK(0x1, I40E_MEM_INIT_DONE_STAT_ITR_MEM_INIT_DONE_SHIFT) +#define I40E_MNGSB_DADD 0x000B7030 /* Reset: POR */ +#define I40E_MNGSB_DADD_ADDR_SHIFT 0 +#define I40E_MNGSB_DADD_ADDR_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_DADD_ADDR_SHIFT) +#define I40E_MNGSB_DCNT 0x000B7034 /* Reset: POR */ +#define I40E_MNGSB_DCNT_BYTE_CNT_SHIFT 0 +#define I40E_MNGSB_DCNT_BYTE_CNT_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_DCNT_BYTE_CNT_SHIFT) +#define I40E_MNGSB_MSGCTL 0x000B7020 /* Reset: POR */ +#define I40E_MNGSB_MSGCTL_HDR_DWS_SHIFT 0 +#define I40E_MNGSB_MSGCTL_HDR_DWS_MASK I40E_MASK(0x3, I40E_MNGSB_MSGCTL_HDR_DWS_SHIFT) +#define I40E_MNGSB_MSGCTL_EXP_RDW_SHIFT 8 +#define I40E_MNGSB_MSGCTL_EXP_RDW_MASK I40E_MASK(0x1FF, I40E_MNGSB_MSGCTL_EXP_RDW_SHIFT) +#define I40E_MNGSB_MSGCTL_MSG_MODE_SHIFT 26 +#define I40E_MNGSB_MSGCTL_MSG_MODE_MASK I40E_MASK(0x3, I40E_MNGSB_MSGCTL_MSG_MODE_SHIFT) +#define I40E_MNGSB_MSGCTL_TOKEN_MODE_SHIFT 28 +#define I40E_MNGSB_MSGCTL_TOKEN_MODE_MASK I40E_MASK(0x3, I40E_MNGSB_MSGCTL_TOKEN_MODE_SHIFT) +#define I40E_MNGSB_MSGCTL_BARCLR_SHIFT 30 +#define I40E_MNGSB_MSGCTL_BARCLR_MASK I40E_MASK(0x1, I40E_MNGSB_MSGCTL_BARCLR_SHIFT) 
+#define I40E_MNGSB_MSGCTL_CMDV_SHIFT 31 +#define I40E_MNGSB_MSGCTL_CMDV_MASK I40E_MASK(0x1, I40E_MNGSB_MSGCTL_CMDV_SHIFT) +#define I40E_MNGSB_RDATA 0x000B7300 /* Reset: POR */ +#define I40E_MNGSB_RDATA_DATA_SHIFT 0 +#define I40E_MNGSB_RDATA_DATA_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_RDATA_DATA_SHIFT) +#define I40E_MNGSB_RHDR0 0x000B72FC /* Reset: POR */ +#define I40E_MNGSB_RHDR0_DESTINATION_SHIFT 0 +#define I40E_MNGSB_RHDR0_DESTINATION_MASK I40E_MASK(0xFF, I40E_MNGSB_RHDR0_DESTINATION_SHIFT) +#define I40E_MNGSB_RHDR0_SOURCE_SHIFT 8 +#define I40E_MNGSB_RHDR0_SOURCE_MASK I40E_MASK(0xFF, I40E_MNGSB_RHDR0_SOURCE_SHIFT) +#define I40E_MNGSB_RHDR0_OPCODE_SHIFT 16 +#define I40E_MNGSB_RHDR0_OPCODE_MASK I40E_MASK(0xFF, I40E_MNGSB_RHDR0_OPCODE_SHIFT) +#define I40E_MNGSB_RHDR0_TAG_SHIFT 24 +#define I40E_MNGSB_RHDR0_TAG_MASK I40E_MASK(0x7, I40E_MNGSB_RHDR0_TAG_SHIFT) +#define I40E_MNGSB_RHDR0_RESPONSE_SHIFT 27 +#define I40E_MNGSB_RHDR0_RESPONSE_MASK I40E_MASK(0x7, I40E_MNGSB_RHDR0_RESPONSE_SHIFT) +#define I40E_MNGSB_RHDR0_EH_SHIFT 31 +#define I40E_MNGSB_RHDR0_EH_MASK I40E_MASK(0x1, I40E_MNGSB_RHDR0_EH_SHIFT) +#define I40E_MNGSB_RSPCTL 0x000B7024 /* Reset: POR */ +#define I40E_MNGSB_RSPCTL_DMA_MSG_DWORDS_SHIFT 0 +#define I40E_MNGSB_RSPCTL_DMA_MSG_DWORDS_MASK I40E_MASK(0x1FF, I40E_MNGSB_RSPCTL_DMA_MSG_DWORDS_SHIFT) +#define I40E_MNGSB_RSPCTL_RSP_MODE_SHIFT 26 +#define I40E_MNGSB_RSPCTL_RSP_MODE_MASK I40E_MASK(0x3, I40E_MNGSB_RSPCTL_RSP_MODE_SHIFT) +#define I40E_MNGSB_RSPCTL_RSP_BAD_LEN_SHIFT 30 +#define I40E_MNGSB_RSPCTL_RSP_BAD_LEN_MASK I40E_MASK(0x1, I40E_MNGSB_RSPCTL_RSP_BAD_LEN_SHIFT) +#define I40E_MNGSB_RSPCTL_RSP_ERR_SHIFT 31 +#define I40E_MNGSB_RSPCTL_RSP_ERR_MASK I40E_MASK(0x1, I40E_MNGSB_RSPCTL_RSP_ERR_SHIFT) +#define I40E_MNGSB_WDATA 0x000B7100 /* Reset: POR */ +#define I40E_MNGSB_WDATA_DATA_SHIFT 0 +#define I40E_MNGSB_WDATA_DATA_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_WDATA_DATA_SHIFT) +#define I40E_MNGSB_WHDR0 0x000B70F4 /* Reset: POR */ +#define I40E_MNGSB_WHDR0_RAW_DEST_SHIFT 0 +#define I40E_MNGSB_WHDR0_RAW_DEST_MASK I40E_MASK(0xFF, I40E_MNGSB_WHDR0_RAW_DEST_SHIFT) +#define I40E_MNGSB_WHDR0_DEST_SEL_SHIFT 12 +#define I40E_MNGSB_WHDR0_DEST_SEL_MASK I40E_MASK(0xF, I40E_MNGSB_WHDR0_DEST_SEL_SHIFT) +#define I40E_MNGSB_WHDR0_OPCODE_SEL_SHIFT 16 +#define I40E_MNGSB_WHDR0_OPCODE_SEL_MASK I40E_MASK(0xFF, I40E_MNGSB_WHDR0_OPCODE_SEL_SHIFT) +#define I40E_MNGSB_WHDR0_TAG_SHIFT 24 +#define I40E_MNGSB_WHDR0_TAG_MASK I40E_MASK(0x7F, I40E_MNGSB_WHDR0_TAG_SHIFT) +#define I40E_MNGSB_WHDR1 0x000B70F8 /* Reset: POR */ +#define I40E_MNGSB_WHDR1_ADDR_SHIFT 0 +#define I40E_MNGSB_WHDR1_ADDR_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_WHDR1_ADDR_SHIFT) +#define I40E_MNGSB_WHDR2 0x000B70FC /* Reset: POR */ +#define I40E_MNGSB_WHDR2_LENGTH_SHIFT 0 +#define I40E_MNGSB_WHDR2_LENGTH_MASK I40E_MASK(0xFFFFFFFF, I40E_MNGSB_WHDR2_LENGTH_SHIFT) + +#define I40E_GLPCI_CAPSUP_WAKUP_EN_SHIFT 21 +#define I40E_GLPCI_CAPSUP_WAKUP_EN_MASK I40E_MASK(0x1, I40E_GLPCI_CAPSUP_WAKUP_EN_SHIFT) + +#define I40E_GLPCI_CUR_CLNT_COMMON 0x0009CA18 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_CLNT_COMMON_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_CLNT_COMMON_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_CLNT_COMMON_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_CLNT_COMMON_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_CLNT_COMMON_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_CLNT_COMMON_OSR_SHIFT) +#define I40E_GLPCI_CUR_CLNT_PIPEMON 0x0009CA20 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_CLNT_PIPEMON_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_CLNT_PIPEMON_DATA_LINES_MASK 
I40E_MASK(0xFFFF, I40E_GLPCI_CUR_CLNT_PIPEMON_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_MNG_ALWD 0x0009c514 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_MNG_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_MNG_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_MNG_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_MNG_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_MNG_RSVD 0x0009c594 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_MNG_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_MNG_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_MNG_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_MNG_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_MNG_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_PMAT_ALWD 0x0009c510 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_PMAT_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_PMAT_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_PMAT_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_PMAT_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_PMAT_RSVD 0x0009c590 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_PMAT_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_PMAT_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_PMAT_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_PMAT_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_PMAT_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_RLAN_ALWD 0x0009c500 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_RLAN_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_RLAN_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_RLAN_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_RLAN_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_RLAN_RSVD 0x0009c580 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_RLAN_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_RLAN_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_RLAN_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_RLAN_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RLAN_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_RXPE_ALWD 0x0009c508 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_RXPE_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_RXPE_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_RXPE_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_RXPE_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_RXPE_RSVD 0x0009c588 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_RXPE_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_RXPE_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_RXPE_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_RXPE_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_RXPE_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TDPU_ALWD 0x0009c518 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TDPU_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TDPU_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TDPU_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TDPU_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TDPU_RSVD 0x0009c598 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TDPU_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TDPU_RSVD_DATA_LINES_MASK 
I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TDPU_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TDPU_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TDPU_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TLAN_ALWD 0x0009c504 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TLAN_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TLAN_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TLAN_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TLAN_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TLAN_RSVD 0x0009c584 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TLAN_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TLAN_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TLAN_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TLAN_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TLAN_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TXPE_ALWD 0x0009c50C /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TXPE_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TXPE_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TXPE_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TXPE_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_ALWD_OSR_SHIFT) +#define I40E_GLPCI_CUR_TXPE_RSVD 0x0009c58c /* Reset: PCIR */ +#define I40E_GLPCI_CUR_TXPE_RSVD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_TXPE_RSVD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_RSVD_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_TXPE_RSVD_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_TXPE_RSVD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_TXPE_RSVD_OSR_SHIFT) +#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON 0x0009CA28 /* Reset: PCIR */ +#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_WATMK_CLNT_COMMON_DATA_LINES_SHIFT) +#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_OSR_SHIFT 16 +#define I40E_GLPCI_CUR_WATMK_CLNT_COMMON_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_CUR_WATMK_CLNT_COMMON_OSR_SHIFT) + +#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT 4 +#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_MASK I40E_MASK(0x3, I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT) +#define I40E_GLPCI_LBARCTRL_VF_PE_DB_SIZE_SHIFT 10 +#define I40E_GLPCI_LBARCTRL_VF_PE_DB_SIZE_MASK I40E_MASK(0x1, I40E_GLPCI_LBARCTRL_VF_PE_DB_SIZE_SHIFT) +#define I40E_GLPCI_NPQ_CFG 0x0009CA00 /* Reset: PCIR */ +#define I40E_GLPCI_NPQ_CFG_EXTEND_TO_SHIFT 0 +#define I40E_GLPCI_NPQ_CFG_EXTEND_TO_MASK I40E_MASK(0x1, I40E_GLPCI_NPQ_CFG_EXTEND_TO_SHIFT) +#define I40E_GLPCI_NPQ_CFG_SMALL_TO_SHIFT 1 +#define I40E_GLPCI_NPQ_CFG_SMALL_TO_MASK I40E_MASK(0x1, I40E_GLPCI_NPQ_CFG_SMALL_TO_SHIFT) +#define I40E_GLPCI_NPQ_CFG_WEIGHT_AVG_SHIFT 2 +#define I40E_GLPCI_NPQ_CFG_WEIGHT_AVG_MASK I40E_MASK(0xF, I40E_GLPCI_NPQ_CFG_WEIGHT_AVG_SHIFT) +#define I40E_GLPCI_NPQ_CFG_NPQ_SPARE_SHIFT 6 +#define I40E_GLPCI_NPQ_CFG_NPQ_SPARE_MASK I40E_MASK(0x3FF, I40E_GLPCI_NPQ_CFG_NPQ_SPARE_SHIFT) +#define I40E_GLPCI_NPQ_CFG_NPQ_ERR_STAT_SHIFT 16 +#define I40E_GLPCI_NPQ_CFG_NPQ_ERR_STAT_MASK I40E_MASK(0xF, I40E_GLPCI_NPQ_CFG_NPQ_ERR_STAT_SHIFT) +#define I40E_GLPCI_WATMK_CLNT_PIPEMON 0x0009CA30 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_CLNT_PIPEMON_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_MNG_ALWD 0x0009CB14 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_MNG_ALWD_DATA_LINES_SHIFT 0 
+#define I40E_GLPCI_WATMK_MNG_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_MNG_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_MNG_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_MNG_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_MNG_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_PMAT_ALWD 0x0009CB10 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_PMAT_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_PMAT_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_PMAT_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_PMAT_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_PMAT_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_PMAT_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_RLAN_ALWD 0x0009CB00 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_RLAN_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_RLAN_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RLAN_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_RLAN_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_RLAN_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RLAN_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_RXPE_ALWD 0x0009CB08 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_RXPE_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_RXPE_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RXPE_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_RXPE_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_RXPE_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_RXPE_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_TLAN_ALWD 0x0009CB04 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_TLAN_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_TLAN_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TLAN_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_TLAN_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_TLAN_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TLAN_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_TPDU_ALWD 0x0009CB18 /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_TPDU_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_TPDU_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TPDU_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_TPDU_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_TPDU_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TPDU_ALWD_OSR_SHIFT) +#define I40E_GLPCI_WATMK_TXPE_ALWD 0x0009CB0c /* Reset: PCIR */ +#define I40E_GLPCI_WATMK_TXPE_ALWD_DATA_LINES_SHIFT 0 +#define I40E_GLPCI_WATMK_TXPE_ALWD_DATA_LINES_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TXPE_ALWD_DATA_LINES_SHIFT) +#define I40E_GLPCI_WATMK_TXPE_ALWD_OSR_SHIFT 16 +#define I40E_GLPCI_WATMK_TXPE_ALWD_OSR_MASK I40E_MASK(0xFFFF, I40E_GLPCI_WATMK_TXPE_ALWD_OSR_SHIFT) +#define I40E_GLPE_CPUSTATUS0 0x0000D040 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT 0 +#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT) +#define I40E_GLPE_CPUSTATUS1 0x0000D044 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT 0 +#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT) +#define I40E_GLPE_CPUSTATUS2 0x0000D048 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT 0 +#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT) +#define I40E_GLPE_CPUTRIG0 0x0000D060 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT 0 +#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_MASK I40E_MASK(0xFFFF, I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT) +#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT 17 +#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_MASK 
I40E_MASK(0x1, I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT) +#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT 18 +#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_MASK I40E_MASK(0x1, I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT) +#define I40E_GLPE_DUAL40_RUPM 0x0000DA04 /* Reset: PE_CORER */ +#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT 0 +#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_MASK I40E_MASK(0x1, I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT) +#define I40E_GLPE_PFAEQEDROPCNT(_i) (0x00131440 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFAEQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT) +#define I40E_GLPE_PFCEQEDROPCNT(_i) (0x001313C0 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFCEQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT) +#define I40E_GLPE_PFCQEDROPCNT(_i) (0x00131340 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFCQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT) +#define I40E_GLPE_RUPM_CQPPOOL 0x0000DACC /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_MASK I40E_MASK(0xFF, I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT) +#define I40E_GLPE_RUPM_FLRPOOL 0x0000DAC4 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_MASK I40E_MASK(0xFF, I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT) +#define I40E_GLPE_RUPM_GCTL 0x0000DA00 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT 0 +#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_MASK I40E_MASK(0xFF, I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT 26 +#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT 27 +#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT 28 +#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT 29 +#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT 30 +#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT 31 +#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT) +#define I40E_GLPE_RUPM_PTXPOOL 0x0000DAC8 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_MASK I40E_MASK(0xFF, I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT) +#define I40E_GLPE_RUPM_PUSHPOOL 0x0000DAC0 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_MASK I40E_MASK(0xFF, I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT) +#define I40E_GLPE_RUPM_TXHOST_EN 0x0000DA08 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT 0 +#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_MASK I40E_MASK(0x1, I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT) +#define 
I40E_GLPE_VFAEQEDROPCNT(_i) (0x00132540 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFAEQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT) +#define I40E_GLPE_VFCEQEDROPCNT(_i) (0x00132440 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFCEQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT) +#define I40E_GLPE_VFCQEDROPCNT(_i) (0x00132340 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFCQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT) +#define I40E_GLPE_VFFLMOBJCTRL(_i) (0x0000D400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMOBJCTRL_MAX_INDEX 31 +#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT 0 +#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_MASK I40E_MASK(0x7, I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT) +#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT 8 +#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_MASK I40E_MASK(0x7, I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT) +#define I40E_GLPE_VFFLMQ1ALLOCERR(_i) (0x0000C700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMQ1ALLOCERR_MAX_INDEX 31 +#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_GLPE_VFFLMXMITALLOCERR(_i) (0x0000C600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMXMITALLOCERR_MAX_INDEX 31 +#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_MASK I40E_MASK(0xFFFF, I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_GLPE_VFUDACTRL(_i) (0x0000C000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFUDACTRL_MAX_INDEX 31 +#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT 0 +#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_MASK I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT 1 +#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_MASK I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT 2 +#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_MASK I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT 3 +#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_MASK I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT 4 +#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_MASK I40E_MASK(0x1, I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT) +#define I40E_GLPE_VFUDAUCFBQPN(_i) (0x0000C100 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFUDAUCFBQPN_MAX_INDEX 31 +#define I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT 0 +#define I40E_GLPE_VFUDAUCFBQPN_QPN_MASK I40E_MASK(0x3FFFF, I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT) +#define I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT 31 +#define I40E_GLPE_VFUDAUCFBQPN_VALID_MASK I40E_MASK(0x1, I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT) +#define I40E_PFPE_AEQALLOC 0x00131180 /* Reset: PFR */ +#define I40E_PFPE_AEQALLOC_AECOUNT_SHIFT 0 +#define I40E_PFPE_AEQALLOC_AECOUNT_MASK 
I40E_MASK(0xFFFFFFFF, I40E_PFPE_AEQALLOC_AECOUNT_SHIFT) +#define I40E_PFPE_CCQPHIGH 0x00008200 /* Reset: PFR */ +#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0 +#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT) +#define I40E_PFPE_CCQPLOW 0x00008180 /* Reset: PFR */ +#define I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT 0 +#define I40E_PFPE_CCQPLOW_PECCQPLOW_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT) +#define I40E_PFPE_CCQPSTATUS 0x00008100 /* Reset: PFR */ +#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT 0 +#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_MASK I40E_MASK(0x1, I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT) +#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4 +#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_MASK I40E_MASK(0x7, I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT) +#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16 +#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_MASK I40E_MASK(0x3F, I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT) +#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT 31 +#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_MASK I40E_MASK(0x1, I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT) +#define I40E_PFPE_CQACK 0x00131100 /* Reset: PFR */ +#define I40E_PFPE_CQACK_PECQID_SHIFT 0 +#define I40E_PFPE_CQACK_PECQID_MASK I40E_MASK(0x1FFFF, I40E_PFPE_CQACK_PECQID_SHIFT) +#define I40E_PFPE_CQARM 0x00131080 /* Reset: PFR */ +#define I40E_PFPE_CQARM_PECQID_SHIFT 0 +#define I40E_PFPE_CQARM_PECQID_MASK I40E_MASK(0x1FFFF, I40E_PFPE_CQARM_PECQID_SHIFT) +#define I40E_PFPE_CQPDB 0x00008000 /* Reset: PFR */ +#define I40E_PFPE_CQPDB_WQHEAD_SHIFT 0 +#define I40E_PFPE_CQPDB_WQHEAD_MASK I40E_MASK(0x7FF, I40E_PFPE_CQPDB_WQHEAD_SHIFT) +#define I40E_PFPE_CQPERRCODES 0x00008880 /* Reset: PFR */ +#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0 +#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_MASK I40E_MASK(0xFFFF, I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT) +#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK I40E_MASK(0xFFFF, I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT) +#define I40E_PFPE_CQPTAIL 0x00008080 /* Reset: PFR */ +#define I40E_PFPE_CQPTAIL_WQTAIL_SHIFT 0 +#define I40E_PFPE_CQPTAIL_WQTAIL_MASK I40E_MASK(0x7FF, I40E_PFPE_CQPTAIL_WQTAIL_SHIFT) +#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31 +#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_MASK I40E_MASK(0x1, I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT) +#define I40E_PFPE_FLMQ1ALLOCERR 0x00008980 /* Reset: PFR */ +#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_MASK I40E_MASK(0xFFFF, I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_PFPE_FLMXMITALLOCERR 0x00008900 /* Reset: PFR */ +#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_MASK I40E_MASK(0xFFFF, I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_PFPE_IPCONFIG0 0x00008280 /* Reset: PFR */ +#define I40E_PFPE_IPCONFIG0_PEIPID_SHIFT 0 +#define I40E_PFPE_IPCONFIG0_PEIPID_MASK I40E_MASK(0xFFFF, I40E_PFPE_IPCONFIG0_PEIPID_SHIFT) +#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16 +#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_MASK I40E_MASK(0x1, I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT) +#define I40E_PFPE_MRTEIDXMASK 0x00008600 /* Reset: PFR */ +#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK I40E_MASK(0x1F, I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT) +#define I40E_PFPE_RCVUNEXPECTEDERROR 0x00008680 /* Reset: PFR */ +#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0 +#define 
I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK I40E_MASK(0xFFFFFF, I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_PFPE_TCPNOWTIMER 0x00008580 /* Reset: PFR */ +#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0 +#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_MASK I40E_MASK(0xFFFFFFFF, I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT) +#define I40E_PFPE_UDACTRL 0x00008700 /* Reset: PFR */ +#define I40E_PFPE_UDACTRL_IPV4MCFRAGRESBP_SHIFT 0 +#define I40E_PFPE_UDACTRL_IPV4MCFRAGRESBP_MASK I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV4MCFRAGRESBP_SHIFT) +#define I40E_PFPE_UDACTRL_IPV4UCFRAGRESBP_SHIFT 1 +#define I40E_PFPE_UDACTRL_IPV4UCFRAGRESBP_MASK I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV4UCFRAGRESBP_SHIFT) +#define I40E_PFPE_UDACTRL_IPV6MCFRAGRESBP_SHIFT 2 +#define I40E_PFPE_UDACTRL_IPV6MCFRAGRESBP_MASK I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV6MCFRAGRESBP_SHIFT) +#define I40E_PFPE_UDACTRL_IPV6UCFRAGRESBP_SHIFT 3 +#define I40E_PFPE_UDACTRL_IPV6UCFRAGRESBP_MASK I40E_MASK(0x1, I40E_PFPE_UDACTRL_IPV6UCFRAGRESBP_SHIFT) +#define I40E_PFPE_UDACTRL_UDPMCFRAGRESFAIL_SHIFT 4 +#define I40E_PFPE_UDACTRL_UDPMCFRAGRESFAIL_MASK I40E_MASK(0x1, I40E_PFPE_UDACTRL_UDPMCFRAGRESFAIL_SHIFT) +#define I40E_PFPE_UDAUCFBQPN 0x00008780 /* Reset: PFR */ +#define I40E_PFPE_UDAUCFBQPN_QPN_SHIFT 0 +#define I40E_PFPE_UDAUCFBQPN_QPN_MASK I40E_MASK(0x3FFFF, I40E_PFPE_UDAUCFBQPN_QPN_SHIFT) +#define I40E_PFPE_UDAUCFBQPN_VALID_SHIFT 31 +#define I40E_PFPE_UDAUCFBQPN_VALID_MASK I40E_MASK(0x1, I40E_PFPE_UDAUCFBQPN_VALID_SHIFT) +#define I40E_PFPE_WQEALLOC 0x00138C00 /* Reset: PFR */ +#define I40E_PFPE_WQEALLOC_PEQPID_SHIFT 0 +#define I40E_PFPE_WQEALLOC_PEQPID_MASK I40E_MASK(0x3FFFF, I40E_PFPE_WQEALLOC_PEQPID_SHIFT) +#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20 +#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_MASK I40E_MASK(0xFFF, I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT) +#define I40E_PRTDCB_RLPMC 0x0001F140 /* Reset: PE_CORER */ +#define I40E_PRTDCB_RLPMC_TC2PFC_SHIFT 0 +#define I40E_PRTDCB_RLPMC_TC2PFC_MASK I40E_MASK(0xFF, I40E_PRTDCB_RLPMC_TC2PFC_SHIFT) +#define I40E_PRTDCB_TCMSTC_RLPM(_i) (0x0001F040 + ((_i) * 32)) /* _i=0...7 */ /* Reset: PE_CORER */ +#define I40E_PRTDCB_TCMSTC_RLPM_MAX_INDEX 7 +#define I40E_PRTDCB_TCMSTC_RLPM_MSTC_SHIFT 0 +#define I40E_PRTDCB_TCMSTC_RLPM_MSTC_MASK I40E_MASK(0xFFFFF, I40E_PRTDCB_TCMSTC_RLPM_MSTC_SHIFT) +#define I40E_PRTDCB_TCPMC_RLPM 0x0001F1A0 /* Reset: PE_CORER */ +#define I40E_PRTDCB_TCPMC_RLPM_CPM_SHIFT 0 +#define I40E_PRTDCB_TCPMC_RLPM_CPM_MASK I40E_MASK(0x1FFF, I40E_PRTDCB_TCPMC_RLPM_CPM_SHIFT) +#define I40E_PRTDCB_TCPMC_RLPM_LLTC_SHIFT 13 +#define I40E_PRTDCB_TCPMC_RLPM_LLTC_MASK I40E_MASK(0xFF, I40E_PRTDCB_TCPMC_RLPM_LLTC_SHIFT) +#define I40E_PRTDCB_TCPMC_RLPM_TCPM_MODE_SHIFT 30 +#define I40E_PRTDCB_TCPMC_RLPM_TCPM_MODE_MASK I40E_MASK(0x1, I40E_PRTDCB_TCPMC_RLPM_TCPM_MODE_SHIFT) +#define I40E_PRTE_RUPM_TCCNTR03 0x0000DAE0 /* Reset: PE_CORER */ +#define I40E_PRTE_RUPM_TCCNTR03_TC0COUNT_SHIFT 0 +#define I40E_PRTE_RUPM_TCCNTR03_TC0COUNT_MASK I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC0COUNT_SHIFT) +#define I40E_PRTE_RUPM_TCCNTR03_TC1COUNT_SHIFT 8 +#define I40E_PRTE_RUPM_TCCNTR03_TC1COUNT_MASK I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC1COUNT_SHIFT) +#define I40E_PRTE_RUPM_TCCNTR03_TC2COUNT_SHIFT 16 +#define I40E_PRTE_RUPM_TCCNTR03_TC2COUNT_MASK I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC2COUNT_SHIFT) +#define I40E_PRTE_RUPM_TCCNTR03_TC3COUNT_SHIFT 24 +#define I40E_PRTE_RUPM_TCCNTR03_TC3COUNT_MASK I40E_MASK(0xFF, I40E_PRTE_RUPM_TCCNTR03_TC3COUNT_SHIFT) +#define I40E_PRTPE_RUPM_CNTR 0x0000DB20 /* Reset: PE_CORER 
*/ +#define I40E_PRTPE_RUPM_CNTR_COUNT_SHIFT 0 +#define I40E_PRTPE_RUPM_CNTR_COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_CNTR_COUNT_SHIFT) +#define I40E_PRTPE_RUPM_CTL 0x0000DA40 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_CTL_LLTC_SHIFT 13 +#define I40E_PRTPE_RUPM_CTL_LLTC_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_CTL_LLTC_SHIFT) +#define I40E_PRTPE_RUPM_CTL_RUPM_MODE_SHIFT 30 +#define I40E_PRTPE_RUPM_CTL_RUPM_MODE_MASK I40E_MASK(0x1, I40E_PRTPE_RUPM_CTL_RUPM_MODE_SHIFT) +#define I40E_PRTPE_RUPM_PFCCTL 0x0000DA60 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_PFCCTL_TC2PFC_SHIFT 0 +#define I40E_PRTPE_RUPM_PFCCTL_TC2PFC_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCCTL_TC2PFC_SHIFT) +#define I40E_PRTPE_RUPM_PFCPC 0x0000DA80 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_PFCPC_PORTOFFTH_SHIFT 0 +#define I40E_PRTPE_RUPM_PFCPC_PORTOFFTH_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCPC_PORTOFFTH_SHIFT) +#define I40E_PRTPE_RUPM_PFCTCC 0x0000DAA0 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_PFCTCC_TCOFFTH_SHIFT 0 +#define I40E_PRTPE_RUPM_PFCTCC_TCOFFTH_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCTCC_TCOFFTH_SHIFT) +#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_TH_SHIFT 16 +#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_TH_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PFCTCC_LL_PRI_TH_SHIFT) +#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_EN_SHIFT 31 +#define I40E_PRTPE_RUPM_PFCTCC_LL_PRI_EN_MASK I40E_MASK(0x1, I40E_PRTPE_RUPM_PFCTCC_LL_PRI_EN_SHIFT) +#define I40E_PRTPE_RUPM_PTCTCCNTR47 0x0000DB60 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC4COUNT_SHIFT 0 +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC4COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC4COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC5COUNT_SHIFT 8 +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC5COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC5COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC6COUNT_SHIFT 16 +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC6COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC6COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC7COUNT_SHIFT 24 +#define I40E_PRTPE_RUPM_PTCTCCNTR47_TC7COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTCTCCNTR47_TC7COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTXTCCNTR03 0x0000DB40 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC0COUNT_SHIFT 0 +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC0COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC0COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC1COUNT_SHIFT 8 +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC1COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC1COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC2COUNT_SHIFT 16 +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC2COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC2COUNT_SHIFT) +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC3COUNT_SHIFT 24 +#define I40E_PRTPE_RUPM_PTXTCCNTR03_TC3COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_PTXTCCNTR03_TC3COUNT_SHIFT) +#define I40E_PRTPE_RUPM_TCCNTR47 0x0000DB00 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_TCCNTR47_TC4COUNT_SHIFT 0 +#define I40E_PRTPE_RUPM_TCCNTR47_TC4COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC4COUNT_SHIFT) +#define I40E_PRTPE_RUPM_TCCNTR47_TC5COUNT_SHIFT 8 +#define I40E_PRTPE_RUPM_TCCNTR47_TC5COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC5COUNT_SHIFT) +#define I40E_PRTPE_RUPM_TCCNTR47_TC6COUNT_SHIFT 16 +#define I40E_PRTPE_RUPM_TCCNTR47_TC6COUNT_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_TCCNTR47_TC6COUNT_SHIFT) +#define I40E_PRTPE_RUPM_TCCNTR47_TC7COUNT_SHIFT 24 +#define I40E_PRTPE_RUPM_TCCNTR47_TC7COUNT_MASK I40E_MASK(0xFF, 
I40E_PRTPE_RUPM_TCCNTR47_TC7COUNT_SHIFT) +#define I40E_PRTPE_RUPM_THRES 0x0000DA20 /* Reset: PE_CORER */ +#define I40E_PRTPE_RUPM_THRES_MINSPADSPERTC_SHIFT 0 +#define I40E_PRTPE_RUPM_THRES_MINSPADSPERTC_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_THRES_MINSPADSPERTC_SHIFT) +#define I40E_PRTPE_RUPM_THRES_MAXSPADS_SHIFT 8 +#define I40E_PRTPE_RUPM_THRES_MAXSPADS_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_THRES_MAXSPADS_SHIFT) +#define I40E_PRTPE_RUPM_THRES_MAXSPADSPERTC_SHIFT 16 +#define I40E_PRTPE_RUPM_THRES_MAXSPADSPERTC_MASK I40E_MASK(0xFF, I40E_PRTPE_RUPM_THRES_MAXSPADSPERTC_SHIFT) +#define I40E_VFPE_AEQALLOC(_VF) (0x00130C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_AEQALLOC_MAX_INDEX 127 +#define I40E_VFPE_AEQALLOC_AECOUNT_SHIFT 0 +#define I40E_VFPE_AEQALLOC_AECOUNT_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_AEQALLOC_AECOUNT_SHIFT) +#define I40E_VFPE_CCQPHIGH(_VF) (0x00001000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPHIGH_MAX_INDEX 127 +#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0 +#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT) +#define I40E_VFPE_CCQPLOW(_VF) (0x00000C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPLOW_MAX_INDEX 127 +#define I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT 0 +#define I40E_VFPE_CCQPLOW_PECCQPLOW_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT) +#define I40E_VFPE_CCQPSTATUS(_VF) (0x00000800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPSTATUS_MAX_INDEX 127 +#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT 0 +#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_MASK I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT) +#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4 +#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_MASK I40E_MASK(0x7, I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT) +#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16 +#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_MASK I40E_MASK(0x3F, I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT) +#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT 31 +#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_MASK I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT) +#define I40E_VFPE_CQACK(_VF) (0x00130800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQACK_MAX_INDEX 127 +#define I40E_VFPE_CQACK_PECQID_SHIFT 0 +#define I40E_VFPE_CQACK_PECQID_MASK I40E_MASK(0x1FFFF, I40E_VFPE_CQACK_PECQID_SHIFT) +#define I40E_VFPE_CQARM(_VF) (0x00130400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQARM_MAX_INDEX 127 +#define I40E_VFPE_CQARM_PECQID_SHIFT 0 +#define I40E_VFPE_CQARM_PECQID_MASK I40E_MASK(0x1FFFF, I40E_VFPE_CQARM_PECQID_SHIFT) +#define I40E_VFPE_CQPDB(_VF) (0x00000000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPDB_MAX_INDEX 127 +#define I40E_VFPE_CQPDB_WQHEAD_SHIFT 0 +#define I40E_VFPE_CQPDB_WQHEAD_MASK I40E_MASK(0x7FF, I40E_VFPE_CQPDB_WQHEAD_SHIFT) +#define I40E_VFPE_CQPERRCODES(_VF) (0x00001800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPERRCODES_MAX_INDEX 127 +#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0 +#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_MASK I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT) +#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT) +#define I40E_VFPE_CQPTAIL(_VF) (0x00000400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPTAIL_MAX_INDEX 127 +#define 
I40E_VFPE_CQPTAIL_WQTAIL_SHIFT 0 +#define I40E_VFPE_CQPTAIL_WQTAIL_MASK I40E_MASK(0x7FF, I40E_VFPE_CQPTAIL_WQTAIL_SHIFT) +#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31 +#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_MASK I40E_MASK(0x1, I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT) +#define I40E_VFPE_IPCONFIG0(_VF) (0x00001400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_IPCONFIG0_MAX_INDEX 127 +#define I40E_VFPE_IPCONFIG0_PEIPID_SHIFT 0 +#define I40E_VFPE_IPCONFIG0_PEIPID_MASK I40E_MASK(0xFFFF, I40E_VFPE_IPCONFIG0_PEIPID_SHIFT) +#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16 +#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_MASK I40E_MASK(0x1, I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT) +#define I40E_VFPE_MRTEIDXMASK(_VF) (0x00003000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_MRTEIDXMASK_MAX_INDEX 127 +#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK I40E_MASK(0x1F, I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT) +#define I40E_VFPE_RCVUNEXPECTEDERROR(_VF) (0x00003400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_RCVUNEXPECTEDERROR_MAX_INDEX 127 +#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK I40E_MASK(0xFFFFFF, I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_VFPE_TCPNOWTIMER(_VF) (0x00002C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_TCPNOWTIMER_MAX_INDEX 127 +#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0 +#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT) +#define I40E_VFPE_WQEALLOC(_VF) (0x00138000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_WQEALLOC_MAX_INDEX 127 +#define I40E_VFPE_WQEALLOC_PEQPID_SHIFT 0 +#define I40E_VFPE_WQEALLOC_PEQPID_MASK I40E_MASK(0x3FFFF, I40E_VFPE_WQEALLOC_PEQPID_SHIFT) +#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20 +#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_MASK I40E_MASK(0xFFF, I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT) +#define I40E_GLPES_PFIP4RXDISCARD(_i) (0x00010600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXDISCARD_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0 +#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT) +#define I40E_GLPES_PFIP4RXFRAGSHI(_i) (0x00010804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP4RXFRAGSLO(_i) (0x00010800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP4RXMCOCTSHI(_i) (0x00010A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXMCOCTSLO(_i) (0x00010A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCOCTSLO_MAX_INDEX 15 
+#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXMCPKTSHI(_i) (0x00010C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXMCPKTSLO(_i) (0x00010C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXOCTSHI(_i) (0x00010204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXOCTSLO(_i) (0x00010200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXPKTSHI(_i) (0x00010404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXPKTSLO(_i) (0x00010400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXTRUNC(_i) (0x00010700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXTRUNC_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0 +#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT) +#define I40E_GLPES_PFIP4TXFRAGSHI(_i) (0x00011E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP4TXFRAGSLO(_i) (0x00011E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP4TXMCOCTSHI(_i) (0x00012004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXMCOCTSLO(_i) (0x00012000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define 
I40E_GLPES_PFIP4TXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXMCPKTSHI(_i) (0x00012204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXMCPKTSLO(_i) (0x00012200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXNOROUTE(_i) (0x00012E00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXNOROUTE_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0 +#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT) +#define I40E_GLPES_PFIP4TXOCTSHI(_i) (0x00011A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXOCTSLO(_i) (0x00011A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXPKTSHI(_i) (0x00011C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXPKTSLO(_i) (0x00011C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXDISCARD(_i) (0x00011200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXDISCARD_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0 +#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT) +#define I40E_GLPES_PFIP6RXFRAGSHI(_i) (0x00011404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP6RXFRAGSLO(_i) (0x00011400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP6RXMCOCTSHI(_i) (0x00011604 + ((_i) * 8)) /* _i=0...15 */ /* 
Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXMCOCTSLO(_i) (0x00011600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXMCPKTSHI(_i) (0x00011804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXMCPKTSLO(_i) (0x00011800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXOCTSHI(_i) (0x00010E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXOCTSLO(_i) (0x00010E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXPKTSHI(_i) (0x00011004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXPKTSLO(_i) (0x00011000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXTRUNC(_i) (0x00011300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXTRUNC_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0 +#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT) +#define I40E_GLPES_PFIP6TXFRAGSHI(_i) (0x00012804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP6TXFRAGSLO(_i) (0x00012800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP6TXMCOCTSHI(_i) (0x00012A04 + ((_i) * 8)) 
/* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXMCOCTSLO(_i) (0x00012A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXMCPKTSHI(_i) (0x00012C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXMCPKTSLO(_i) (0x00012C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXNOROUTE(_i) (0x00012F00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXNOROUTE_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0 +#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT) +#define I40E_GLPES_PFIP6TXOCTSHI(_i) (0x00012404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXOCTSLO(_i) (0x00012400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXPKTSHI(_i) (0x00012604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXPKTSLO(_i) (0x00012600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT) +#define I40E_GLPES_PFRDMARXRDSHI(_i) (0x00013E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXRDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_PFRDMARXRDSLO(_i) (0x00013E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXRDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_PFRDMARXSNDSHI(_i) (0x00014004 
+ ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXSNDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_PFRDMARXSNDSLO(_i) (0x00014000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXSNDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_PFRDMARXWRSHI(_i) (0x00013C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXWRSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_PFRDMARXWRSLO(_i) (0x00013C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXWRSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_PFRDMATXRDSHI(_i) (0x00014404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXRDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_PFRDMATXRDSLO(_i) (0x00014400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXRDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_PFRDMATXSNDSHI(_i) (0x00014604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXSNDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_PFRDMATXSNDSLO(_i) (0x00014600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXSNDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_PFRDMATXWRSHI(_i) (0x00014204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXWRSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_PFRDMATXWRSLO(_i) (0x00014200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXWRSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_PFRDMAVBNDHI(_i) (0x00014804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVBNDHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0 +#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT) +#define I40E_GLPES_PFRDMAVBNDLO(_i) (0x00014800 + ((_i) * 8)) /* _i=0...15 */ /* 
Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVBNDLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0 +#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT) +#define I40E_GLPES_PFRDMAVINVHI(_i) (0x00014A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVINVHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT 0 +#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT) +#define I40E_GLPES_PFRDMAVINVLO(_i) (0x00014A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVINVLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT 0 +#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT) +#define I40E_GLPES_PFRXVLANERR(_i) (0x00010000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRXVLANERR_MAX_INDEX 15 +#define I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT 0 +#define I40E_GLPES_PFRXVLANERR_RXVLANERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT) +#define I40E_GLPES_PFTCPRTXSEG(_i) (0x00013600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRTXSEG_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT 0 +#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT) +#define I40E_GLPES_PFTCPRXOPTERR(_i) (0x00013200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXOPTERR_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0 +#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT) +#define I40E_GLPES_PFTCPRXPROTOERR(_i) (0x00013300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXPROTOERR_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0 +#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT) +#define I40E_GLPES_PFTCPRXSEGSHI(_i) (0x00013004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXSEGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0 +#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT) +#define I40E_GLPES_PFTCPRXSEGSLO(_i) (0x00013000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXSEGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0 +#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT) +#define I40E_GLPES_PFTCPTXSEGHI(_i) (0x00013404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPTXSEGHI_MAX_INDEX 15 +#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0 +#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT) +#define I40E_GLPES_PFTCPTXSEGLO(_i) (0x00013400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPTXSEGLO_MAX_INDEX 15 +#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0 +#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT) +#define I40E_GLPES_PFUDPRXPKTSHI(_i) (0x00013804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPRXPKTSHI_MAX_INDEX 15 +#define 
I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT) +#define I40E_GLPES_PFUDPRXPKTSLO(_i) (0x00013800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPRXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT) +#define I40E_GLPES_PFUDPTXPKTSHI(_i) (0x00013A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPTXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT) +#define I40E_GLPES_PFUDPTXPKTSLO(_i) (0x00013A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPTXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT) +#define I40E_GLPES_RDMARXMULTFPDUSHI 0x0001E014 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT 0 +#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT) +#define I40E_GLPES_RDMARXMULTFPDUSLO 0x0001E010 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT 0 +#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT) +#define I40E_GLPES_RDMARXOOODDPHI 0x0001E01C /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT 0 +#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT) +#define I40E_GLPES_RDMARXOOODDPLO 0x0001E018 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT 0 +#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT) +#define I40E_GLPES_RDMARXOOONOMARK 0x0001E004 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT 0 +#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT) +#define I40E_GLPES_RDMARXUNALIGN 0x0001E000 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT 0 +#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT) +#define I40E_GLPES_TCPRXFOURHOLEHI 0x0001E044 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXFOURHOLELO 0x0001E040 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT) +#define I40E_GLPES_TCPRXONEHOLEHI 0x0001E02C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXONEHOLELO 0x0001E028 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT 0 +#define 
I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT) +#define I40E_GLPES_TCPRXPUREACKHI 0x0001E024 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT 0 +#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT) +#define I40E_GLPES_TCPRXPUREACKSLO 0x0001E020 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT 0 +#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT) +#define I40E_GLPES_TCPRXTHREEHOLEHI 0x0001E03C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXTHREEHOLELO 0x0001E038 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT) +#define I40E_GLPES_TCPRXTWOHOLEHI 0x0001E034 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXTWOHOLELO 0x0001E030 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT) +#define I40E_GLPES_TCPTXRETRANSFASTHI 0x0001E04C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT 0 +#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT) +#define I40E_GLPES_TCPTXRETRANSFASTLO 0x0001E048 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT 0 +#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT) +#define I40E_GLPES_TCPTXTOUTSFASTHI 0x0001E054 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT) +#define I40E_GLPES_TCPTXTOUTSFASTLO 0x0001E050 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT) +#define I40E_GLPES_TCPTXTOUTSHI 0x0001E05C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT) +#define I40E_GLPES_TCPTXTOUTSLO 0x0001E058 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXDISCARD(_i) (0x00018600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXDISCARD_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0 +#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT) +#define 
I40E_GLPES_VFIP4RXFRAGSHI(_i) (0x00018804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP4RXFRAGSLO(_i) (0x00018800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP4RXMCOCTSHI(_i) (0x00018A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXMCOCTSLO(_i) (0x00018A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXMCPKTSHI(_i) (0x00018C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXMCPKTSLO(_i) (0x00018C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXOCTSHI(_i) (0x00018204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXOCTSLO(_i) (0x00018200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXPKTSHI(_i) (0x00018404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXPKTSLO(_i) (0x00018400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXTRUNC(_i) (0x00018700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXTRUNC_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0 +#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_MASK I40E_MASK(0xFFFFFFFF, 
I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT) +#define I40E_GLPES_VFIP4TXFRAGSHI(_i) (0x00019E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP4TXFRAGSLO(_i) (0x00019E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP4TXMCOCTSHI(_i) (0x0001A004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXMCOCTSLO(_i) (0x0001A000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXMCPKTSHI(_i) (0x0001A204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXMCPKTSLO(_i) (0x0001A200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXNOROUTE(_i) (0x0001AE00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXNOROUTE_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0 +#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT) +#define I40E_GLPES_VFIP4TXOCTSHI(_i) (0x00019A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXOCTSLO(_i) (0x00019A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXPKTSHI(_i) (0x00019C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXPKTSLO(_i) (0x00019C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0 +#define 
I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXDISCARD(_i) (0x00019200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXDISCARD_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0 +#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT) +#define I40E_GLPES_VFIP6RXFRAGSHI(_i) (0x00019404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP6RXFRAGSLO(_i) (0x00019400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP6RXMCOCTSHI(_i) (0x00019604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXMCOCTSLO(_i) (0x00019600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXMCPKTSHI(_i) (0x00019804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXMCPKTSLO(_i) (0x00019800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXOCTSHI(_i) (0x00018E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXOCTSLO(_i) (0x00018E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXPKTSHI(_i) (0x00019004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXPKTSLO(_i) (0x00019000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXPKTSLO_MAX_INDEX 31 +#define 
I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXTRUNC(_i) (0x00019300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXTRUNC_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0 +#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT) +#define I40E_GLPES_VFIP6TXFRAGSHI(_i) (0x0001A804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP6TXFRAGSLO(_i) (0x0001A800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP6TXMCOCTSHI(_i) (0x0001AA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXMCOCTSLO(_i) (0x0001AA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXMCPKTSHI(_i) (0x0001AC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXMCPKTSLO(_i) (0x0001AC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXNOROUTE(_i) (0x0001AF00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXNOROUTE_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0 +#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT) +#define I40E_GLPES_VFIP6TXOCTSHI(_i) (0x0001A404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXOCTSLO(_i) (0x0001A400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXPKTSHI(_i) (0x0001A604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define 
I40E_GLPES_VFIP6TXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXPKTSLO(_i) (0x0001A600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT) +#define I40E_GLPES_VFRDMARXRDSHI(_i) (0x0001BE04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXRDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_VFRDMARXRDSLO(_i) (0x0001BE00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXRDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_VFRDMARXSNDSHI(_i) (0x0001C004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXSNDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_VFRDMARXSNDSLO(_i) (0x0001C000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXSNDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_VFRDMARXWRSHI(_i) (0x0001BC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXWRSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_VFRDMARXWRSLO(_i) (0x0001BC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXWRSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_VFRDMATXRDSHI(_i) (0x0001C404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXRDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_VFRDMATXRDSLO(_i) (0x0001C400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXRDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_VFRDMATXSNDSHI(_i) (0x0001C604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXSNDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_VFRDMATXSNDSLO(_i) (0x0001C600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define 
I40E_GLPES_VFRDMATXSNDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_VFRDMATXWRSHI(_i) (0x0001C204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXWRSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_VFRDMATXWRSLO(_i) (0x0001C200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXWRSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_VFRDMAVBNDHI(_i) (0x0001C804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVBNDHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0 +#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT) +#define I40E_GLPES_VFRDMAVBNDLO(_i) (0x0001C800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVBNDLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0 +#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT) +#define I40E_GLPES_VFRDMAVINVHI(_i) (0x0001CA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVINVHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT 0 +#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT) +#define I40E_GLPES_VFRDMAVINVLO(_i) (0x0001CA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVINVLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT 0 +#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT) +#define I40E_GLPES_VFRXVLANERR(_i) (0x00018000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRXVLANERR_MAX_INDEX 31 +#define I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT 0 +#define I40E_GLPES_VFRXVLANERR_RXVLANERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT) +#define I40E_GLPES_VFTCPRTXSEG(_i) (0x0001B600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRTXSEG_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT 0 +#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT) +#define I40E_GLPES_VFTCPRXOPTERR(_i) (0x0001B200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXOPTERR_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0 +#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT) +#define I40E_GLPES_VFTCPRXPROTOERR(_i) (0x0001B300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXPROTOERR_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0 +#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_MASK I40E_MASK(0xFFFFFF, I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT) +#define I40E_GLPES_VFTCPRXSEGSHI(_i) (0x0001B004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXSEGSHI_MAX_INDEX 31 +#define 
I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0 +#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT) +#define I40E_GLPES_VFTCPRXSEGSLO(_i) (0x0001B000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXSEGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0 +#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT) +#define I40E_GLPES_VFTCPTXSEGHI(_i) (0x0001B404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPTXSEGHI_MAX_INDEX 31 +#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0 +#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT) +#define I40E_GLPES_VFTCPTXSEGLO(_i) (0x0001B400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPTXSEGLO_MAX_INDEX 31 +#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0 +#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT) +#define I40E_GLPES_VFUDPRXPKTSHI(_i) (0x0001B804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPRXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT) +#define I40E_GLPES_VFUDPRXPKTSLO(_i) (0x0001B800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPRXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT) +#define I40E_GLPES_VFUDPTXPKTSHI(_i) (0x0001BA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPTXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_MASK I40E_MASK(0xFFFF, I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT) +#define I40E_GLPES_VFUDPTXPKTSLO(_i) (0x0001BA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPTXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_MASK I40E_MASK(0xFFFFFFFF, I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT) +#define I40E_GLGEN_PME_TO 0x000B81BC /* Reset: POR */ +#define I40E_GLGEN_PME_TO_PME_TO_FOR_PE_SHIFT 0 +#define I40E_GLGEN_PME_TO_PME_TO_FOR_PE_MASK I40E_MASK(0x1, I40E_GLGEN_PME_TO_PME_TO_FOR_PE_SHIFT) +#define I40E_GLQF_APBVT(_i) (0x00260000 + ((_i) * 4)) /* _i=0...2047 */ /* Reset: CORER */ +#define I40E_GLQF_APBVT_MAX_INDEX 2047 +#define I40E_GLQF_APBVT_APBVT_SHIFT 0 +#define I40E_GLQF_APBVT_APBVT_MASK I40E_MASK(0xFFFFFFFF, I40E_GLQF_APBVT_APBVT_SHIFT) +#define I40E_GLQF_FD_PCTYPES(_i) (0x00268000 + ((_i) * 4)) /* _i=0...63 */ /* Reset: POR */ +#define I40E_GLQF_FD_PCTYPES_MAX_INDEX 63 +#define I40E_GLQF_FD_PCTYPES_FD_PCTYPE_SHIFT 0 +#define I40E_GLQF_FD_PCTYPES_FD_PCTYPE_MASK I40E_MASK(0x3F, I40E_GLQF_FD_PCTYPES_FD_PCTYPE_SHIFT) +#define I40E_GLQF_FD_MSK(_i, _j) (0x00267200 + ((_i) * 4 + (_j) * 8)) /* _i=0...1, _j=0...63 */ /* Reset: CORER */ +#define I40E_GLQF_FD_MSK_MAX_INDEX 1 +#define I40E_GLQF_FD_MSK_MASK_SHIFT 0 +#define I40E_GLQF_FD_MSK_MASK_MASK I40E_MASK(0xFFFF, I40E_GLQF_FD_MSK_MASK_SHIFT) +#define I40E_GLQF_FD_MSK_OFFSET_SHIFT 16 +#define I40E_GLQF_FD_MSK_OFFSET_MASK I40E_MASK(0x3F, I40E_GLQF_FD_MSK_OFFSET_SHIFT) +#define I40E_GLQF_HASH_INSET(_i, 
_j) (0x00267600 + ((_i) * 4 + (_j) * 8)) /* _i=0...1, _j=0...63 */ /* Reset: CORER */ +#define I40E_GLQF_HASH_INSET_MAX_INDEX 1 +#define I40E_GLQF_HASH_INSET_INSET_SHIFT 0 +#define I40E_GLQF_HASH_INSET_INSET_MASK I40E_MASK(0xFFFFFFFF, I40E_GLQF_HASH_INSET_INSET_SHIFT) +#define I40E_GLQF_HASH_MSK(_i, _j) (0x00267A00 + ((_i) * 4 + (_j) * 8)) /* _i=0...1, _j=0...63 */ /* Reset: CORER */ +#define I40E_GLQF_HASH_MSK_MAX_INDEX 1 +#define I40E_GLQF_HASH_MSK_MASK_SHIFT 0 +#define I40E_GLQF_HASH_MSK_MASK_MASK I40E_MASK(0xFFFF, I40E_GLQF_HASH_MSK_MASK_SHIFT) +#define I40E_GLQF_HASH_MSK_OFFSET_SHIFT 16 +#define I40E_GLQF_HASH_MSK_OFFSET_MASK I40E_MASK(0x3F, I40E_GLQF_HASH_MSK_OFFSET_SHIFT) +#define I40E_GLQF_ORT(_i) (0x00268900 + ((_i) * 4)) /* _i=0...63 */ /* Reset: CORER */ +#define I40E_GLQF_ORT_MAX_INDEX 63 +#define I40E_GLQF_ORT_PIT_INDX_SHIFT 0 +#define I40E_GLQF_ORT_PIT_INDX_MASK I40E_MASK(0x1F, I40E_GLQF_ORT_PIT_INDX_SHIFT) +#define I40E_GLQF_ORT_FIELD_CNT_SHIFT 5 +#define I40E_GLQF_ORT_FIELD_CNT_MASK I40E_MASK(0x3, I40E_GLQF_ORT_FIELD_CNT_SHIFT) +#define I40E_GLQF_ORT_FLX_PAYLOAD_SHIFT 7 +#define I40E_GLQF_ORT_FLX_PAYLOAD_MASK I40E_MASK(0x1, I40E_GLQF_ORT_FLX_PAYLOAD_SHIFT) +#define I40E_GLQF_PIT(_i) (0x00268C80 + ((_i) * 4)) /* _i=0...23 */ /* Reset: CORER */ +#define I40E_GLQF_PIT_MAX_INDEX 23 +#define I40E_GLQF_PIT_SOURCE_OFF_SHIFT 0 +#define I40E_GLQF_PIT_SOURCE_OFF_MASK I40E_MASK(0x1F, I40E_GLQF_PIT_SOURCE_OFF_SHIFT) +#define I40E_GLQF_PIT_FSIZE_SHIFT 5 +#define I40E_GLQF_PIT_FSIZE_MASK I40E_MASK(0x1F, I40E_GLQF_PIT_FSIZE_SHIFT) +#define I40E_GLQF_PIT_DEST_OFF_SHIFT 10 +#define I40E_GLQF_PIT_DEST_OFF_MASK I40E_MASK(0x3F, I40E_GLQF_PIT_DEST_OFF_SHIFT) +#define I40E_GLQF_FDEVICTENA(_i) (0x00270384 + ((_i) * 4)) /* _i=0...1 */ /* Reset: CORER */ +#define I40E_GLQF_FDEVICTENA_MAX_INDEX 1 +#define I40E_GLQF_FDEVICTENA_GLQF_FDEVICTENA_SHIFT 0 +#define I40E_GLQF_FDEVICTENA_GLQF_FDEVICTENA_MASK I40E_MASK(0xFFFFFFFF, I40E_GLQF_FDEVICTENA_GLQF_FDEVICTENA_SHIFT) +#define I40E_GLQF_FDEVICTFLAG 0x00270280 /* Reset: CORER */ +#define I40E_GLQF_FDEVICTFLAG_TX_FLAGS_SHIFT 0 +#define I40E_GLQF_FDEVICTFLAG_TX_FLAGS_MASK I40E_MASK(0xFF, I40E_GLQF_FDEVICTFLAG_TX_FLAGS_SHIFT) +#define I40E_GLQF_FDEVICTFLAG_RX_FLAGS_SHIFT 8 +#define I40E_GLQF_FDEVICTFLAG_RX_FLAGS_MASK I40E_MASK(0xFF, I40E_GLQF_FDEVICTFLAG_RX_FLAGS_SHIFT) +#define I40E_PFQF_CTL_2 0x00270300 /* Reset: CORER */ +#define I40E_PFQF_CTL_2_PEHSIZE_SHIFT 0 +#define I40E_PFQF_CTL_2_PEHSIZE_MASK I40E_MASK(0x1F, I40E_PFQF_CTL_2_PEHSIZE_SHIFT) +#define I40E_PFQF_CTL_2_PEDSIZE_SHIFT 5 +#define I40E_PFQF_CTL_2_PEDSIZE_MASK I40E_MASK(0x1F, I40E_PFQF_CTL_2_PEDSIZE_SHIFT) +/* Redefined for X722 family */ +#define I40E_X722_PFQF_HLUT(_i) (0x00240000 + ((_i) * 128)) /* _i=0...127 */ /* Reset: CORER */ +#define I40E_X722_PFQF_HLUT_MAX_INDEX 127 +#define I40E_X722_PFQF_HLUT_LUT0_SHIFT 0 +#define I40E_X722_PFQF_HLUT_LUT0_MASK I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT0_SHIFT) +#define I40E_X722_PFQF_HLUT_LUT1_SHIFT 8 +#define I40E_X722_PFQF_HLUT_LUT1_MASK I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT1_SHIFT) +#define I40E_X722_PFQF_HLUT_LUT2_SHIFT 16 +#define I40E_X722_PFQF_HLUT_LUT2_MASK I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT2_SHIFT) +#define I40E_X722_PFQF_HLUT_LUT3_SHIFT 24 +#define I40E_X722_PFQF_HLUT_LUT3_MASK I40E_MASK(0x7F, I40E_X722_PFQF_HLUT_LUT3_SHIFT) +#define I40E_PFQF_HREGION(_i) (0x00245400 + ((_i) * 128)) /* _i=0...7 */ /* Reset: CORER */ +#define I40E_PFQF_HREGION_MAX_INDEX 7 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_0_SHIFT 0 +#define 
I40E_PFQF_HREGION_OVERRIDE_ENA_0_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_0_SHIFT) +#define I40E_PFQF_HREGION_REGION_0_SHIFT 1 +#define I40E_PFQF_HREGION_REGION_0_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_0_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_1_SHIFT 4 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_1_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_1_SHIFT) +#define I40E_PFQF_HREGION_REGION_1_SHIFT 5 +#define I40E_PFQF_HREGION_REGION_1_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_1_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_2_SHIFT 8 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_2_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_2_SHIFT) +#define I40E_PFQF_HREGION_REGION_2_SHIFT 9 +#define I40E_PFQF_HREGION_REGION_2_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_2_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_3_SHIFT 12 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_3_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_3_SHIFT) +#define I40E_PFQF_HREGION_REGION_3_SHIFT 13 +#define I40E_PFQF_HREGION_REGION_3_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_3_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_4_SHIFT 16 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_4_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_4_SHIFT) +#define I40E_PFQF_HREGION_REGION_4_SHIFT 17 +#define I40E_PFQF_HREGION_REGION_4_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_4_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_5_SHIFT 20 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_5_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_5_SHIFT) +#define I40E_PFQF_HREGION_REGION_5_SHIFT 21 +#define I40E_PFQF_HREGION_REGION_5_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_5_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_6_SHIFT 24 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_6_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_6_SHIFT) +#define I40E_PFQF_HREGION_REGION_6_SHIFT 25 +#define I40E_PFQF_HREGION_REGION_6_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_6_SHIFT) +#define I40E_PFQF_HREGION_OVERRIDE_ENA_7_SHIFT 28 +#define I40E_PFQF_HREGION_OVERRIDE_ENA_7_MASK I40E_MASK(0x1, I40E_PFQF_HREGION_OVERRIDE_ENA_7_SHIFT) +#define I40E_PFQF_HREGION_REGION_7_SHIFT 29 +#define I40E_PFQF_HREGION_REGION_7_MASK I40E_MASK(0x7, I40E_PFQF_HREGION_REGION_7_SHIFT) +#define I40E_VSIQF_CTL_RSS_LUT_TYPE_SHIFT 8 +#define I40E_VSIQF_CTL_RSS_LUT_TYPE_MASK I40E_MASK(0x1, I40E_VSIQF_CTL_RSS_LUT_TYPE_SHIFT) +#define I40E_VSIQF_HKEY(_i, _VSI) (0x002A0000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...12, _VSI=0...383 */ /* Reset: CORER */ +#define I40E_VSIQF_HKEY_MAX_INDEX 12 +#define I40E_VSIQF_HKEY_KEY_0_SHIFT 0 +#define I40E_VSIQF_HKEY_KEY_0_MASK I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_0_SHIFT) +#define I40E_VSIQF_HKEY_KEY_1_SHIFT 8 +#define I40E_VSIQF_HKEY_KEY_1_MASK I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_1_SHIFT) +#define I40E_VSIQF_HKEY_KEY_2_SHIFT 16 +#define I40E_VSIQF_HKEY_KEY_2_MASK I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_2_SHIFT) +#define I40E_VSIQF_HKEY_KEY_3_SHIFT 24 +#define I40E_VSIQF_HKEY_KEY_3_MASK I40E_MASK(0xFF, I40E_VSIQF_HKEY_KEY_3_SHIFT) +#define I40E_VSIQF_HLUT(_i, _VSI) (0x00220000 + ((_i) * 2048 + (_VSI) * 4)) /* _i=0...15, _VSI=0...383 */ /* Reset: CORER */ +#define I40E_VSIQF_HLUT_MAX_INDEX 15 +#define I40E_VSIQF_HLUT_LUT0_SHIFT 0 +#define I40E_VSIQF_HLUT_LUT0_MASK I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT0_SHIFT) +#define I40E_VSIQF_HLUT_LUT1_SHIFT 8 +#define I40E_VSIQF_HLUT_LUT1_MASK I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT1_SHIFT) +#define I40E_VSIQF_HLUT_LUT2_SHIFT 16 +#define I40E_VSIQF_HLUT_LUT2_MASK I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT2_SHIFT) +#define 
I40E_VSIQF_HLUT_LUT3_SHIFT 24 +#define I40E_VSIQF_HLUT_LUT3_MASK I40E_MASK(0xF, I40E_VSIQF_HLUT_LUT3_SHIFT) +#define I40E_GLGEN_STAT_CLEAR 0x00390004 /* Reset: CORER */ +#define I40E_GLGEN_STAT_CLEAR_GLGEN_STAT_CLEAR_SHIFT 0 +#define I40E_GLGEN_STAT_CLEAR_GLGEN_STAT_CLEAR_MASK I40E_MASK(0x1, I40E_GLGEN_STAT_CLEAR_GLGEN_STAT_CLEAR_SHIFT) +#define I40E_GLGEN_STAT_HALT 0x00390000 /* Reset: CORER */ +#define I40E_GLGEN_STAT_HALT_HALT_CELLS_SHIFT 0 +#define I40E_GLGEN_STAT_HALT_HALT_CELLS_MASK I40E_MASK(0x3FFFFFFF, I40E_GLGEN_STAT_HALT_HALT_CELLS_SHIFT) +#define I40E_VFINT_DYN_CTL01_WB_ON_ITR_SHIFT 30 +#define I40E_VFINT_DYN_CTL01_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTL01_WB_ON_ITR_SHIFT) +#define I40E_VFINT_DYN_CTLN1_WB_ON_ITR_SHIFT 30 +#define I40E_VFINT_DYN_CTLN1_WB_ON_ITR_MASK I40E_MASK(0x1, I40E_VFINT_DYN_CTLN1_WB_ON_ITR_SHIFT) +#define I40E_VFPE_AEQALLOC1 0x0000A400 /* Reset: VFR */ +#define I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT 0 +#define I40E_VFPE_AEQALLOC1_AECOUNT_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT) +#define I40E_VFPE_CCQPHIGH1 0x00009800 /* Reset: VFR */ +#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT 0 +#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT) +#define I40E_VFPE_CCQPLOW1 0x0000AC00 /* Reset: VFR */ +#define I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT 0 +#define I40E_VFPE_CCQPLOW1_PECCQPLOW_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT) +#define I40E_VFPE_CCQPSTATUS1 0x0000B800 /* Reset: VFR */ +#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT 0 +#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_MASK I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT 4 +#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_MASK I40E_MASK(0x7, I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT 16 +#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_MASK I40E_MASK(0x3F, I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT 31 +#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_MASK I40E_MASK(0x1, I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT) +#define I40E_VFPE_CQACK1 0x0000B000 /* Reset: VFR */ +#define I40E_VFPE_CQACK1_PECQID_SHIFT 0 +#define I40E_VFPE_CQACK1_PECQID_MASK I40E_MASK(0x1FFFF, I40E_VFPE_CQACK1_PECQID_SHIFT) +#define I40E_VFPE_CQARM1 0x0000B400 /* Reset: VFR */ +#define I40E_VFPE_CQARM1_PECQID_SHIFT 0 +#define I40E_VFPE_CQARM1_PECQID_MASK I40E_MASK(0x1FFFF, I40E_VFPE_CQARM1_PECQID_SHIFT) +#define I40E_VFPE_CQPDB1 0x0000BC00 /* Reset: VFR */ +#define I40E_VFPE_CQPDB1_WQHEAD_SHIFT 0 +#define I40E_VFPE_CQPDB1_WQHEAD_MASK I40E_MASK(0x7FF, I40E_VFPE_CQPDB1_WQHEAD_SHIFT) +#define I40E_VFPE_CQPERRCODES1 0x00009C00 /* Reset: VFR */ +#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT 0 +#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_MASK I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT) +#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_MASK I40E_MASK(0xFFFF, I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT) +#define I40E_VFPE_CQPTAIL1 0x0000A000 /* Reset: VFR */ +#define I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT 0 +#define I40E_VFPE_CQPTAIL1_WQTAIL_MASK I40E_MASK(0x7FF, I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT) +#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT 31 +#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_MASK I40E_MASK(0x1, I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT) +#define I40E_VFPE_IPCONFIG01 0x00008C00 /* Reset: VFR */ +#define I40E_VFPE_IPCONFIG01_PEIPID_SHIFT 0 +#define 
I40E_VFPE_IPCONFIG01_PEIPID_MASK I40E_MASK(0xFFFF, I40E_VFPE_IPCONFIG01_PEIPID_SHIFT) +#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT 16 +#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_MASK I40E_MASK(0x1, I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT) +#define I40E_VFPE_MRTEIDXMASK1 0x00009000 /* Reset: VFR */ +#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_MASK I40E_MASK(0x1F, I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT) +#define I40E_VFPE_RCVUNEXPECTEDERROR1 0x00009400 /* Reset: VFR */ +#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_MASK I40E_MASK(0xFFFFFF, I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_VFPE_TCPNOWTIMER1 0x0000A800 /* Reset: VFR */ +#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT 0 +#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_MASK I40E_MASK(0xFFFFFFFF, I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT) +#define I40E_VFPE_WQEALLOC1 0x0000C000 /* Reset: VFR */ +#define I40E_VFPE_WQEALLOC1_PEQPID_SHIFT 0 +#define I40E_VFPE_WQEALLOC1_PEQPID_MASK I40E_MASK(0x3FFFF, I40E_VFPE_WQEALLOC1_PEQPID_SHIFT) +#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT 20 +#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_MASK I40E_MASK(0xFFF, I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT) +#endif /* _I40E_REGISTER_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_status.h b/drivers/net/ethernet/intel/i40e/i40e_status.h new file mode 100644 index 0000000..10c86f6 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_status.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_STATUS_H_ +#define _I40E_STATUS_H_ + +/* Error Codes */ +enum i40e_status_code { + I40E_SUCCESS = 0, + I40E_ERR_NVM = -1, + I40E_ERR_NVM_CHECKSUM = -2, + I40E_ERR_PHY = -3, + I40E_ERR_CONFIG = -4, + I40E_ERR_PARAM = -5, + I40E_ERR_MAC_TYPE = -6, + I40E_ERR_UNKNOWN_PHY = -7, + I40E_ERR_LINK_SETUP = -8, + I40E_ERR_ADAPTER_STOPPED = -9, + I40E_ERR_INVALID_MAC_ADDR = -10, + I40E_ERR_DEVICE_NOT_SUPPORTED = -11, + I40E_ERR_MASTER_REQUESTS_PENDING = -12, + I40E_ERR_INVALID_LINK_SETTINGS = -13, + I40E_ERR_AUTONEG_NOT_COMPLETE = -14, + I40E_ERR_RESET_FAILED = -15, + I40E_ERR_SWFW_SYNC = -16, + I40E_ERR_NO_AVAILABLE_VSI = -17, + I40E_ERR_NO_MEMORY = -18, + I40E_ERR_BAD_PTR = -19, + I40E_ERR_RING_FULL = -20, + I40E_ERR_INVALID_PD_ID = -21, + I40E_ERR_INVALID_QP_ID = -22, + I40E_ERR_INVALID_CQ_ID = -23, + I40E_ERR_INVALID_CEQ_ID = -24, + I40E_ERR_INVALID_AEQ_ID = -25, + I40E_ERR_INVALID_SIZE = -26, + I40E_ERR_INVALID_ARP_INDEX = -27, + I40E_ERR_INVALID_FPM_FUNC_ID = -28, + I40E_ERR_QP_INVALID_MSG_SIZE = -29, + I40E_ERR_QP_TOOMANY_WRS_POSTED = -30, + I40E_ERR_INVALID_FRAG_COUNT = -31, + I40E_ERR_QUEUE_EMPTY = -32, + I40E_ERR_INVALID_ALIGNMENT = -33, + I40E_ERR_FLUSHED_QUEUE = -34, + I40E_ERR_INVALID_PUSH_PAGE_INDEX = -35, + I40E_ERR_INVALID_IMM_DATA_SIZE = -36, + I40E_ERR_TIMEOUT = -37, + I40E_ERR_OPCODE_MISMATCH = -38, + I40E_ERR_CQP_COMPL_ERROR = -39, + I40E_ERR_INVALID_VF_ID = -40, + I40E_ERR_INVALID_HMCFN_ID = -41, + I40E_ERR_BACKING_PAGE_ERROR = -42, + I40E_ERR_NO_PBLCHUNKS_AVAILABLE = -43, + I40E_ERR_INVALID_PBLE_INDEX = -44, + I40E_ERR_INVALID_SD_INDEX = -45, + I40E_ERR_INVALID_PAGE_DESC_INDEX = -46, + I40E_ERR_INVALID_SD_TYPE = -47, + I40E_ERR_MEMCPY_FAILED = -48, + I40E_ERR_INVALID_HMC_OBJ_INDEX = -49, + I40E_ERR_INVALID_HMC_OBJ_COUNT = -50, + I40E_ERR_INVALID_SRQ_ARM_LIMIT = -51, + I40E_ERR_SRQ_ENABLED = -52, + I40E_ERR_ADMIN_QUEUE_ERROR = -53, + I40E_ERR_ADMIN_QUEUE_TIMEOUT = -54, + I40E_ERR_BUF_TOO_SHORT = -55, + I40E_ERR_ADMIN_QUEUE_FULL = -56, + I40E_ERR_ADMIN_QUEUE_NO_WORK = -57, + I40E_ERR_BAD_IWARP_CQE = -58, + I40E_ERR_NVM_BLANK_MODE = -59, + I40E_ERR_NOT_IMPLEMENTED = -60, + I40E_ERR_PE_DOORBELL_NOT_ENABLED = -61, + I40E_ERR_DIAG_TEST_FAILED = -62, + I40E_ERR_NOT_READY = -63, + I40E_NOT_SUPPORTED = -64, + I40E_ERR_FIRMWARE_API_VERSION = -65, + I40E_ERR_ADMIN_QUEUE_CRITICAL_ERROR = -66, +}; + +#endif /* _I40E_STATUS_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_trace.h b/drivers/net/ethernet/intel/i40e/i40e_trace.h new file mode 100644 index 0000000..410ba13 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_trace.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel(R) 40-10 Gigabit Ethernet Connection Network Driver + * Copyright(c) 2013 - 2017 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +/* Modeled on trace-events-sample.h */ + +/* The trace subsystem name for i40e will be "i40e". + * + * This file is named i40e_trace.h. + * + * Since this include file's name is different from the trace + * subsystem name, we'll have to define TRACE_INCLUDE_FILE at the end + * of this file. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM i40e + +/* See trace-events-sample.h for a detailed description of why this + * guard clause is different from most normal include files. + */ +#if !defined(_I40E_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _I40E_TRACE_H_ + +#include + +/** + * i40e_trace() macro enables shared code to refer to trace points + * like: + * + * trace_i40e{,vf}_example(args...) + * + * ... as: + * + * i40e_trace(example, args...) + * + * ... to resolve to the PF or VF version of the tracepoint without + * ifdefs, and to allow tracepoints to be disabled entirely at build + * time. + * + * Trace point should always be referred to in the driver via this + * macro. + * + * Similarly, i40e_trace_enabled(trace_name) wraps references to + * trace_i40e{,vf}__enabled() functions. + */ +#define _I40E_TRACE_NAME(trace_name) (trace_ ## i40e ## _ ## trace_name) +#define I40E_TRACE_NAME(trace_name) _I40E_TRACE_NAME(trace_name) + +#define i40e_trace(trace_name, args...) I40E_TRACE_NAME(trace_name)(args) + +#define i40e_trace_enabled(trace_name) I40E_TRACE_NAME(trace_name##_enabled)() + +/* Events common to PF and VF. Corresponding versions will be defined + * for both, named trace_i40e_* and trace_i40evf_*. The i40e_trace() + * macro above will select the right trace point name for the driver + * being built from shared code. + */ + +/* Events related to a vsi & ring */ +DECLARE_EVENT_CLASS( + i40e_tx_template, + + TP_PROTO(struct i40e_ring *ring, + struct i40e_tx_desc *desc, + struct i40e_tx_buffer *buf), + + TP_ARGS(ring, desc, buf), + + /* The convention here is to make the first fields in the + * TP_STRUCT match the TP_PROTO exactly. This enables the use + * of the args struct generated by the tplist tool (from the + * bcc-tools package) to be used for those fields. To access + * fields other than the tracepoint args will require the + * tplist output to be adjusted. 
+ */ + TP_STRUCT__entry( + __field(void*, ring) + __field(void*, desc) + __field(void*, buf) + __string(devname, ring->netdev->name) + ), + + TP_fast_assign( + __entry->ring = ring; + __entry->desc = desc; + __entry->buf = buf; + __assign_str(devname, ring->netdev->name); + ), + + TP_printk( + "netdev: %s ring: %p desc: %p buf %p", + __get_str(devname), __entry->ring, + __entry->desc, __entry->buf) +); + +DEFINE_EVENT( + i40e_tx_template, i40e_clean_tx_irq, + TP_PROTO(struct i40e_ring *ring, + struct i40e_tx_desc *desc, + struct i40e_tx_buffer *buf), + + TP_ARGS(ring, desc, buf)); + +DEFINE_EVENT( + i40e_tx_template, i40e_clean_tx_irq_unmap, + TP_PROTO(struct i40e_ring *ring, + struct i40e_tx_desc *desc, + struct i40e_tx_buffer *buf), + + TP_ARGS(ring, desc, buf)); + +DECLARE_EVENT_CLASS( + i40e_rx_template, + + TP_PROTO(struct i40e_ring *ring, + union i40e_32byte_rx_desc *desc, + struct sk_buff *skb), + + TP_ARGS(ring, desc, skb), + + TP_STRUCT__entry( + __field(void*, ring) + __field(void*, desc) + __field(void*, skb) + __string(devname, ring->netdev->name) + ), + + TP_fast_assign( + __entry->ring = ring; + __entry->desc = desc; + __entry->skb = skb; + __assign_str(devname, ring->netdev->name); + ), + + TP_printk( + "netdev: %s ring: %p desc: %p skb %p", + __get_str(devname), __entry->ring, + __entry->desc, __entry->skb) +); + +DEFINE_EVENT( + i40e_rx_template, i40e_clean_rx_irq, + TP_PROTO(struct i40e_ring *ring, + union i40e_32byte_rx_desc *desc, + struct sk_buff *skb), + + TP_ARGS(ring, desc, skb)); + +DEFINE_EVENT( + i40e_rx_template, i40e_clean_rx_irq_rx, + TP_PROTO(struct i40e_ring *ring, + union i40e_32byte_rx_desc *desc, + struct sk_buff *skb), + + TP_ARGS(ring, desc, skb)); + +DECLARE_EVENT_CLASS( + i40e_xmit_template, + + TP_PROTO(struct sk_buff *skb, + struct i40e_ring *ring), + + TP_ARGS(skb, ring), + + TP_STRUCT__entry( + __field(void*, skb) + __field(void*, ring) + __string(devname, ring->netdev->name) + ), + + TP_fast_assign( + __entry->skb = skb; + __entry->ring = ring; + __assign_str(devname, ring->netdev->name); + ), + + TP_printk( + "netdev: %s skb: %p ring: %p", + __get_str(devname), __entry->skb, + __entry->ring) +); + +DEFINE_EVENT( + i40e_xmit_template, i40e_xmit_frame_ring, + TP_PROTO(struct sk_buff *skb, + struct i40e_ring *ring), + + TP_ARGS(skb, ring)); + +DEFINE_EVENT( + i40e_xmit_template, i40e_xmit_frame_ring_drop, + TP_PROTO(struct sk_buff *skb, + struct i40e_ring *ring), + + TP_ARGS(skb, ring)); + +/* Events unique to the PF. */ + +#endif /* _I40E_TRACE_H_ */ +/* This must be outside ifdef _I40E_TRACE_H */ + +/* This trace include file is not located in the .../include/trace + * with the kernel tracepoint definitions, because we're a loadable + * module. + */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE i40e_trace +#include diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c new file mode 100644 index 0000000..f174c72 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -0,0 +1,3715 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
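A minimal illustration of how the i40e_trace() wrapper defined in i40e_trace.h above resolves at a PF call site; the expansion below is implied by the token-pasting macros and the i40e_clean_tx_irq DEFINE_EVENT, and is shown here only as a reading aid, not as part of the patch:

/* Illustrative expansion for the PF build of the shared code:
 *
 *   i40e_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf);
 *     -> I40E_TRACE_NAME(clean_tx_irq)(tx_ring, tx_desc, tx_buf)
 *     -> (trace_i40e_clean_tx_irq)(tx_ring, tx_desc, tx_buf);
 *
 *   i40e_trace_enabled(clean_tx_irq)
 *     -> (trace_i40e_clean_tx_irq_enabled)()
 *
 * The VF counterpart (i40evf_trace.h, not in this hunk) pastes "i40evf"
 * instead, yielding trace_i40evf_clean_tx_irq() from the same call site.
 */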
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include <linux/prefetch.h> +#include <net/busy_poll.h> +#include <linux/bpf_trace.h> +#include <net/xdp.h> +#include "i40e.h" +#include "i40e_trace.h" +#include "i40e_prototype.h" + +static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, + u32 td_tag) +{ + return cpu_to_le64(I40E_TX_DESC_DTYPE_DATA | + ((u64)td_cmd << I40E_TXD_QW1_CMD_SHIFT) | + ((u64)td_offset << I40E_TXD_QW1_OFFSET_SHIFT) | + ((u64)size << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) | + ((u64)td_tag << I40E_TXD_QW1_L2TAG1_SHIFT)); +} + +#define I40E_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS) +/** + * i40e_fdir - Generate a Flow Director descriptor based on fdata + * @tx_ring: Tx ring to send buffer on + * @fdata: Flow director filter data + * @add: Indicate if we are adding a rule or deleting one + * + **/ +static void i40e_fdir(struct i40e_ring *tx_ring, + struct i40e_fdir_filter *fdata, bool add) +{ + struct i40e_filter_program_desc *fdir_desc; + struct i40e_pf *pf = tx_ring->vsi->back; + u32 flex_ptype, dtype_cmd; + u16 i; + + /* grab the next descriptor */ + i = tx_ring->next_to_use; + fdir_desc = I40E_TX_FDIRDESC(tx_ring, i); + + i++; + tx_ring->next_to_use = (i < tx_ring->count) ? i : 0; + + flex_ptype = I40E_TXD_FLTR_QW0_QINDEX_MASK & + (fdata->q_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT); + + flex_ptype |= I40E_TXD_FLTR_QW0_FLEXOFF_MASK & + (fdata->flex_off << I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT); + + flex_ptype |= I40E_TXD_FLTR_QW0_PCTYPE_MASK & + (fdata->pctype << I40E_TXD_FLTR_QW0_PCTYPE_SHIFT); + + flex_ptype |= I40E_TXD_FLTR_QW0_PCTYPE_MASK & + (fdata->flex_offset << I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT); + + /* Use LAN VSI Id if not programmed by user */ + flex_ptype |= I40E_TXD_FLTR_QW0_DEST_VSI_MASK & + ((u32)(fdata->dest_vsi ? : pf->vsi[pf->lan_vsi]->id) << + I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT); + + dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG; + + dtype_cmd |= add ?
+ I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE << + I40E_TXD_FLTR_QW1_PCMD_SHIFT : + I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE << + I40E_TXD_FLTR_QW1_PCMD_SHIFT; + + dtype_cmd |= I40E_TXD_FLTR_QW1_DEST_MASK & + (fdata->dest_ctl << I40E_TXD_FLTR_QW1_DEST_SHIFT); + + dtype_cmd |= I40E_TXD_FLTR_QW1_FD_STATUS_MASK & + (fdata->fd_status << I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT); + + if (fdata->cnt_index) { + dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK; + dtype_cmd |= I40E_TXD_FLTR_QW1_CNTINDEX_MASK & + ((u32)fdata->cnt_index << + I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT); + } + + fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype); + fdir_desc->rsvd = cpu_to_le32(0); + fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd); + fdir_desc->fd_id = cpu_to_le32(fdata->fd_id); +} + +#define I40E_FD_CLEAN_DELAY 10 +/** + * i40e_program_fdir_filter - Program a Flow Director filter + * @fdir_data: Packet data that will be filter parameters + * @raw_packet: the pre-allocated packet buffer for FDir + * @pf: The PF pointer + * @add: True for add/update, False for remove + **/ +static int i40e_program_fdir_filter(struct i40e_fdir_filter *fdir_data, + u8 *raw_packet, struct i40e_pf *pf, + bool add) +{ + struct i40e_tx_buffer *tx_buf, *first; + struct i40e_tx_desc *tx_desc; + struct i40e_ring *tx_ring; + struct i40e_vsi *vsi; + struct device *dev; + dma_addr_t dma; + u32 td_cmd = 0; + u16 i; + + /* find existing FDIR VSI */ + vsi = i40e_find_vsi_by_type(pf, I40E_VSI_FDIR); + if (!vsi) + return -ENOENT; + + tx_ring = vsi->tx_rings[0]; + dev = tx_ring->dev; + + /* we need two descriptors to add/del a filter and we can wait */ + for (i = I40E_FD_CLEAN_DELAY; I40E_DESC_UNUSED(tx_ring) < 2; i--) { + if (!i) + return -EAGAIN; + msleep_interruptible(1); + } + + dma = dma_map_single(dev, raw_packet, + I40E_FDIR_MAX_RAW_PACKET_SIZE, DMA_TO_DEVICE); + if (dma_mapping_error(dev, dma)) + goto dma_fail; + + /* grab the next descriptor */ + i = tx_ring->next_to_use; + first = &tx_ring->tx_bi[i]; + i40e_fdir(tx_ring, fdir_data, add); + + /* Now program a dummy descriptor */ + i = tx_ring->next_to_use; + tx_desc = I40E_TX_DESC(tx_ring, i); + tx_buf = &tx_ring->tx_bi[i]; + + tx_ring->next_to_use = ((i + 1) < tx_ring->count) ? i + 1 : 0; + + memset(tx_buf, 0, sizeof(struct i40e_tx_buffer)); + + /* record length, and DMA address */ + dma_unmap_len_set(tx_buf, len, I40E_FDIR_MAX_RAW_PACKET_SIZE); + dma_unmap_addr_set(tx_buf, dma, dma); + + tx_desc->buffer_addr = cpu_to_le64(dma); + td_cmd = I40E_TXD_CMD | I40E_TX_DESC_CMD_DUMMY; + + tx_buf->tx_flags = I40E_TX_FLAGS_FD_SB; + tx_buf->raw_buf = (void *)raw_packet; + + tx_desc->cmd_type_offset_bsz = + build_ctob(td_cmd, 0, I40E_FDIR_MAX_RAW_PACKET_SIZE, 0); + + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. 
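As a reading aid (not part of the patch), a sketch of the ring state i40e_program_fdir_filter() has assembled by the time it reaches the write barrier below; the slot numbering assumes next_to_use was n on entry and that no wrap occurred:

/* Illustrative ring layout for one filter request:
 *
 *   slot n     filter-program descriptor, written by i40e_fdir() above,
 *              carrying PCTYPE, queue index, destination VSI and the
 *              add/remove command
 *   slot n+1   dummy DATA descriptor, pointing at the DMA-mapped raw_packet
 *              and flagged I40E_TXD_CMD | I40E_TX_DESC_CMD_DUMMY; completion
 *              is tracked through first->next_to_watch
 *
 * wmb() orders both descriptor writes before the tail bump, and
 * writel(tx_ring->next_to_use, tx_ring->tail) lets the hardware fetch them.
 */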
+ */ + wmb(); + + /* Mark the data descriptor to be watched */ + first->next_to_watch = tx_desc; + + writel(tx_ring->next_to_use, tx_ring->tail); + return 0; + +dma_fail: + return -1; +} + +#define IP_HEADER_OFFSET 14 +#define I40E_UDPIP_DUMMY_PACKET_LEN 42 +/** + * i40e_add_del_fdir_udpv4 - Add/Remove UDPv4 filters + * @vsi: pointer to the targeted VSI + * @fd_data: the flow director data required for the FDir descriptor + * @add: true adds a filter, false removes it + * + * Returns 0 if the filters were successfully added or removed + **/ +static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi, + struct i40e_fdir_filter *fd_data, + bool add) +{ + struct i40e_pf *pf = vsi->back; + struct udphdr *udp; + struct iphdr *ip; + u8 *raw_packet; + int ret; + static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0, + 0x45, 0, 0, 0x1c, 0, 0, 0x40, 0, 0x40, 0x11, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL); + if (!raw_packet) + return -ENOMEM; + memcpy(raw_packet, packet, I40E_UDPIP_DUMMY_PACKET_LEN); + + ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET); + udp = (struct udphdr *)(raw_packet + IP_HEADER_OFFSET + + sizeof(struct iphdr)); + + ip->daddr = fd_data->dst_ip; + udp->dest = fd_data->dst_port; + ip->saddr = fd_data->src_ip; + udp->source = fd_data->src_port; + + if (fd_data->flex_filter) { + u8 *payload = raw_packet + I40E_UDPIP_DUMMY_PACKET_LEN; + __be16 pattern = fd_data->flex_word; + u16 off = fd_data->flex_offset; + + *((__force __be16 *)(payload + off)) = pattern; + } + + fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP; + ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add); + if (ret) { + dev_info(&pf->pdev->dev, + "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n", + fd_data->pctype, fd_data->fd_id, ret); + /* Free the packet buffer since it wasn't added to the ring */ + kfree(raw_packet); + return -EOPNOTSUPP; + } else if (I40E_DEBUG_FD & pf->hw.debug_mask) { + if (add) + dev_info(&pf->pdev->dev, + "Filter OK for PCTYPE %d loc = %d\n", + fd_data->pctype, fd_data->fd_id); + else + dev_info(&pf->pdev->dev, + "Filter deleted for PCTYPE %d loc = %d\n", + fd_data->pctype, fd_data->fd_id); + } + + if (add) + pf->fd_udp4_filter_cnt++; + else + pf->fd_udp4_filter_cnt--; + + return 0; +} + +#define I40E_TCPIP_DUMMY_PACKET_LEN 54 +/** + * i40e_add_del_fdir_tcpv4 - Add/Remove TCPv4 filters + * @vsi: pointer to the targeted VSI + * @fd_data: the flow director data required for the FDir descriptor + * @add: true adds a filter, false removes it + * + * Returns 0 if the filters were successfully added or removed + **/ +static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi, + struct i40e_fdir_filter *fd_data, + bool add) +{ + struct i40e_pf *pf = vsi->back; + struct tcphdr *tcp; + struct iphdr *ip; + u8 *raw_packet; + int ret; + /* Dummy packet */ + static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0, + 0x45, 0, 0, 0x28, 0, 0, 0x40, 0, 0x40, 0x6, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80, 0x11, + 0x0, 0x72, 0, 0, 0, 0}; + + raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL); + if (!raw_packet) + return -ENOMEM; + memcpy(raw_packet, packet, I40E_TCPIP_DUMMY_PACKET_LEN); + + ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET); + tcp = (struct tcphdr *)(raw_packet + IP_HEADER_OFFSET + + sizeof(struct iphdr)); + + ip->daddr = fd_data->dst_ip; + tcp->dest = fd_data->dst_port; + ip->saddr = fd_data->src_ip; + tcp->source = 
fd_data->src_port; + + if (fd_data->flex_filter) { + u8 *payload = raw_packet + I40E_TCPIP_DUMMY_PACKET_LEN; + __be16 pattern = fd_data->flex_word; + u16 off = fd_data->flex_offset; + + *((__force __be16 *)(payload + off)) = pattern; + } + + fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP; + ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add); + if (ret) { + dev_info(&pf->pdev->dev, + "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n", + fd_data->pctype, fd_data->fd_id, ret); + /* Free the packet buffer since it wasn't added to the ring */ + kfree(raw_packet); + return -EOPNOTSUPP; + } else if (I40E_DEBUG_FD & pf->hw.debug_mask) { + if (add) + dev_info(&pf->pdev->dev, "Filter OK for PCTYPE %d loc = %d)\n", + fd_data->pctype, fd_data->fd_id); + else + dev_info(&pf->pdev->dev, + "Filter deleted for PCTYPE %d loc = %d\n", + fd_data->pctype, fd_data->fd_id); + } + + if (add) { + pf->fd_tcp4_filter_cnt++; + if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) && + I40E_DEBUG_FD & pf->hw.debug_mask) + dev_info(&pf->pdev->dev, "Forcing ATR off, sideband rules for TCP/IPv4 flow being applied\n"); + set_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state); + } else { + pf->fd_tcp4_filter_cnt--; + } + + return 0; +} + +#define I40E_SCTPIP_DUMMY_PACKET_LEN 46 +/** + * i40e_add_del_fdir_sctpv4 - Add/Remove SCTPv4 Flow Director filters for + * a specific flow spec + * @vsi: pointer to the targeted VSI + * @fd_data: the flow director data required for the FDir descriptor + * @add: true adds a filter, false removes it + * + * Returns 0 if the filters were successfully added or removed + **/ +static int i40e_add_del_fdir_sctpv4(struct i40e_vsi *vsi, + struct i40e_fdir_filter *fd_data, + bool add) +{ + struct i40e_pf *pf = vsi->back; + struct sctphdr *sctp; + struct iphdr *ip; + u8 *raw_packet; + int ret; + /* Dummy packet */ + static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0, + 0x45, 0, 0, 0x20, 0, 0, 0x40, 0, 0x40, 0x84, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL); + if (!raw_packet) + return -ENOMEM; + memcpy(raw_packet, packet, I40E_SCTPIP_DUMMY_PACKET_LEN); + + ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET); + sctp = (struct sctphdr *)(raw_packet + IP_HEADER_OFFSET + + sizeof(struct iphdr)); + + ip->daddr = fd_data->dst_ip; + sctp->dest = fd_data->dst_port; + ip->saddr = fd_data->src_ip; + sctp->source = fd_data->src_port; + + if (fd_data->flex_filter) { + u8 *payload = raw_packet + I40E_SCTPIP_DUMMY_PACKET_LEN; + __be16 pattern = fd_data->flex_word; + u16 off = fd_data->flex_offset; + + *((__force __be16 *)(payload + off)) = pattern; + } + + fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_SCTP; + ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add); + if (ret) { + dev_info(&pf->pdev->dev, + "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n", + fd_data->pctype, fd_data->fd_id, ret); + /* Free the packet buffer since it wasn't added to the ring */ + kfree(raw_packet); + return -EOPNOTSUPP; + } else if (I40E_DEBUG_FD & pf->hw.debug_mask) { + if (add) + dev_info(&pf->pdev->dev, + "Filter OK for PCTYPE %d loc = %d\n", + fd_data->pctype, fd_data->fd_id); + else + dev_info(&pf->pdev->dev, + "Filter deleted for PCTYPE %d loc = %d\n", + fd_data->pctype, fd_data->fd_id); + } + + if (add) + pf->fd_sctp4_filter_cnt++; + else + pf->fd_sctp4_filter_cnt--; + + return 0; +} + +#define I40E_IP_DUMMY_PACKET_LEN 34 +/** + * i40e_add_del_fdir_ipv4 - Add/Remove IPv4 Flow 
Director filters for + * a specific flow spec + * @vsi: pointer to the targeted VSI + * @fd_data: the flow director data required for the FDir descriptor + * @add: true adds a filter, false removes it + * + * Returns 0 if the filters were successfully added or removed + **/ +static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi, + struct i40e_fdir_filter *fd_data, + bool add) +{ + struct i40e_pf *pf = vsi->back; + struct iphdr *ip; + u8 *raw_packet; + int ret; + int i; + static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0, + 0x45, 0, 0, 0x14, 0, 0, 0x40, 0, 0x40, 0x10, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0}; + + for (i = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER; + i <= I40E_FILTER_PCTYPE_FRAG_IPV4; i++) { + raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL); + if (!raw_packet) + return -ENOMEM; + memcpy(raw_packet, packet, I40E_IP_DUMMY_PACKET_LEN); + ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET); + + ip->saddr = fd_data->src_ip; + ip->daddr = fd_data->dst_ip; + ip->protocol = 0; + + if (fd_data->flex_filter) { + u8 *payload = raw_packet + I40E_IP_DUMMY_PACKET_LEN; + __be16 pattern = fd_data->flex_word; + u16 off = fd_data->flex_offset; + + *((__force __be16 *)(payload + off)) = pattern; + } + + fd_data->pctype = i; + ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add); + if (ret) { + dev_info(&pf->pdev->dev, + "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n", + fd_data->pctype, fd_data->fd_id, ret); + /* The packet buffer wasn't added to the ring so we + * need to free it now. + */ + kfree(raw_packet); + return -EOPNOTSUPP; + } else if (I40E_DEBUG_FD & pf->hw.debug_mask) { + if (add) + dev_info(&pf->pdev->dev, + "Filter OK for PCTYPE %d loc = %d\n", + fd_data->pctype, fd_data->fd_id); + else + dev_info(&pf->pdev->dev, + "Filter deleted for PCTYPE %d loc = %d\n", + fd_data->pctype, fd_data->fd_id); + } + } + + if (add) + pf->fd_ip4_filter_cnt++; + else + pf->fd_ip4_filter_cnt--; + + return 0; +} + +/** + * i40e_add_del_fdir - Build raw packets to add/del fdir filter + * @vsi: pointer to the targeted VSI + * @cmd: command to get or set RX flow classification rules + * @add: true adds a filter, false removes it + * + **/ +int i40e_add_del_fdir(struct i40e_vsi *vsi, + struct i40e_fdir_filter *input, bool add) +{ + struct i40e_pf *pf = vsi->back; + int ret; + + switch (input->flow_type & ~FLOW_EXT) { + case TCP_V4_FLOW: + ret = i40e_add_del_fdir_tcpv4(vsi, input, add); + break; + case UDP_V4_FLOW: + ret = i40e_add_del_fdir_udpv4(vsi, input, add); + break; + case SCTP_V4_FLOW: + ret = i40e_add_del_fdir_sctpv4(vsi, input, add); + break; + case IP_USER_FLOW: + switch (input->ip4_proto) { + case IPPROTO_TCP: + ret = i40e_add_del_fdir_tcpv4(vsi, input, add); + break; + case IPPROTO_UDP: + ret = i40e_add_del_fdir_udpv4(vsi, input, add); + break; + case IPPROTO_SCTP: + ret = i40e_add_del_fdir_sctpv4(vsi, input, add); + break; + case IPPROTO_IP: + ret = i40e_add_del_fdir_ipv4(vsi, input, add); + break; + default: + /* We cannot support masking based on protocol */ + dev_info(&pf->pdev->dev, "Unsupported IPv4 protocol 0x%02x\n", + input->ip4_proto); + return -EINVAL; + } + break; + default: + dev_info(&pf->pdev->dev, "Unsupported flow type 0x%02x\n", + input->flow_type); + return -EINVAL; + } + + /* The buffer allocated here will be normally be freed by + * i40e_clean_fdir_tx_irq() as it reclaims resources after transmit + * completion. In the event of an error adding the buffer to the FDIR + * ring, it will immediately be freed. 
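For context, a hedged sketch (not part of the patch) of how a rule reaches the i40e_add_del_fdir() dispatcher above; the interface name is a placeholder and the route through the driver's .set_rxnfc ethtool hook is assumed rather than shown in this hunk:

/* Illustrative only:
 *
 *   # ethtool -N <ifname> flow-type tcp4 dst-ip 192.168.1.10 dst-port 80 action 2
 *
 * arrives as an i40e_fdir_filter with flow_type == TCP_V4_FLOW, so the switch
 * above calls i40e_add_del_fdir_tcpv4(vsi, input, true), which patches the
 * addresses and ports into its dummy TCP/IPv4 packet and programs it with
 * i40e_program_fdir_filter().
 */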
It may also be freed by + * i40e_clean_tx_ring() when closing the VSI. + */ + return ret; +} + +/** + * i40e_fd_handle_status - check the Programming Status for FD + * @rx_ring: the Rx ring for this descriptor + * @rx_desc: the Rx descriptor for programming Status, not a packet descriptor. + * @prog_id: the id originally used for programming + * + * This is used to verify if the FD programming or invalidation + * requested by SW to the HW is successful or not and take actions accordingly. + **/ +static void i40e_fd_handle_status(struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc, u8 prog_id) +{ + struct i40e_pf *pf = rx_ring->vsi->back; + struct pci_dev *pdev = pf->pdev; + u32 fcnt_prog, fcnt_avail; + u32 error; + u64 qw; + + qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >> + I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT; + + if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) { + pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id); + if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) || + (I40E_DEBUG_FD & pf->hw.debug_mask)) + dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n", + pf->fd_inv); + + /* Check if the programming error is for ATR. + * If so, auto disable ATR and set a state for + * flush in progress. Next time we come here if flush is in + * progress do nothing, once flush is complete the state will + * be cleared. + */ + if (test_bit(__I40E_FD_FLUSH_REQUESTED, pf->state)) + return; + + pf->fd_add_err++; + /* store the current atr filter count */ + pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf); + + if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) && + test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) { + /* These set_bit() calls aren't atomic with the + * test_bit() here, but worse case we potentially + * disable ATR and queue a flush right after SB + * support is re-enabled. That shouldn't cause an + * issue in practice + */ + set_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state); + set_bit(__I40E_FD_FLUSH_REQUESTED, pf->state); + } + + /* filter programming failed most likely due to table full */ + fcnt_prog = i40e_get_global_fd_count(pf); + fcnt_avail = pf->fdir_pf_filter_count; + /* If ATR is running fcnt_prog can quickly change, + * if we are very close to full, it makes sense to disable + * FD ATR/SB and then re-enable it when there is room. 
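A condensed summary (illustrative, not part of the patch) of the auto-disable decisions i40e_fd_handle_status() makes around this point:

/* FD_TBL_FULL programming error:
 *   fd_id != 0 (sideband rule)            -> warn that the ntuple filter
 *                                            could not be added
 *   fd_id == 0 (ATR) and sideband already
 *   auto-disabled                         -> auto-disable ATR and request a
 *                                            filter-table flush
 * Global filter count near the limit (the check below):
 *   fcnt_prog >= fcnt_avail - I40E_FDIR_BUFFER_FULL_MARGIN and sideband
 *   still enabled                         -> set __I40E_FD_SB_AUTO_DISABLED,
 *                                            so new ntuple rules will not be
 *                                            added
 */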
+ */ + if (fcnt_prog >= (fcnt_avail - I40E_FDIR_BUFFER_FULL_MARGIN)) { + if ((pf->flags & I40E_FLAG_FD_SB_ENABLED) && + !test_and_set_bit(__I40E_FD_SB_AUTO_DISABLED, + pf->state)) + if (I40E_DEBUG_FD & pf->hw.debug_mask) + dev_warn(&pdev->dev, "FD filter space full, new ntuple rules will not be added\n"); + } + } else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) { + if (I40E_DEBUG_FD & pf->hw.debug_mask) + dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n", + rx_desc->wb.qword0.hi_dword.fd_id); + } +} + +/** + * i40e_unmap_and_free_tx_resource - Release a Tx buffer + * @ring: the ring that owns the buffer + * @tx_buffer: the buffer to free + **/ +static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring, + struct i40e_tx_buffer *tx_buffer) +{ + if (tx_buffer->skb) { + if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB) + kfree(tx_buffer->raw_buf); + else if (ring_is_xdp(ring)) + page_frag_free(tx_buffer->raw_buf); + else + dev_kfree_skb_any(tx_buffer->skb); + if (dma_unmap_len(tx_buffer, len)) + dma_unmap_single(ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + } else if (dma_unmap_len(tx_buffer, len)) { + dma_unmap_page(ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + } + + tx_buffer->next_to_watch = NULL; + tx_buffer->skb = NULL; + dma_unmap_len_set(tx_buffer, len, 0); + /* tx_buffer must be completely set up in the transmit path */ +} + +/** + * i40e_clean_tx_ring - Free any empty Tx buffers + * @tx_ring: ring to be cleaned + **/ +void i40e_clean_tx_ring(struct i40e_ring *tx_ring) +{ + unsigned long bi_size; + u16 i; + + /* ring already cleared, nothing to do */ + if (!tx_ring->tx_bi) + return; + + /* Free all the Tx ring sk_buffs */ + for (i = 0; i < tx_ring->count; i++) + i40e_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]); + + bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count; + memset(tx_ring->tx_bi, 0, bi_size); + + /* Zero out the descriptor ring */ + memset(tx_ring->desc, 0, tx_ring->size); + + tx_ring->next_to_use = 0; + tx_ring->next_to_clean = 0; + + if (!tx_ring->netdev) + return; + + /* cleanup Tx queue statistics */ + netdev_tx_reset_queue(txring_txq(tx_ring)); +} + +/** + * i40e_free_tx_resources - Free Tx resources per queue + * @tx_ring: Tx descriptor ring for a specific queue + * + * Free all transmit software resources + **/ +void i40e_free_tx_resources(struct i40e_ring *tx_ring) +{ + i40e_clean_tx_ring(tx_ring); + kfree(tx_ring->tx_bi); + tx_ring->tx_bi = NULL; + + if (tx_ring->desc) { + dma_free_coherent(tx_ring->dev, tx_ring->size, + tx_ring->desc, tx_ring->dma); + tx_ring->desc = NULL; + } +} + +/** + * i40e_get_tx_pending - how many tx descriptors not processed + * @tx_ring: the ring of descriptors + * @in_sw: use SW variables + * + * Since there is no access to the ring head register + * in XL710, we need to use our local copies + **/ +u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw) +{ + u32 head, tail; + + if (!in_sw) { + head = i40e_get_head(ring); + tail = readl(ring->tail); + } else { + head = ring->next_to_clean; + tail = ring->next_to_use; + } + + if (head != tail) + return (head < tail) ? + tail - head : (tail + ring->count - head); + + return 0; +} + +/** + * i40e_detect_recover_hung - Function to detect and recover hung_queues + * @vsi: pointer to vsi struct with tx queues + * + * VSI has netdev and netdev has TX queues. 
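A compact sketch (not part of the patch) of the two-pass scheme the function below implements for each Tx ring, using the prev_pkt_ctr field it maintains:

/* Illustrative only, one ring across two successive calls:
 *
 *   call N:   ring has pending work
 *             -> tx_stats.prev_pkt_ctr = stats.packets (& INT_MAX)
 *   call N+1: stats.packets is unchanged, so prev_pkt_ctr still matches
 *             -> the queue is treated as stalled
 *             -> i40e_force_wb(vsi, tx_ring->q_vector) raises a SW interrupt
 *                so the ring gets serviced and can recover
 *
 * prev_pkt_ctr is reset to -1 whenever no work is pending, so idle rings
 * never trigger a forced writeback.
 */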
This function is to check each of + * those TX queues if they are hung, trigger recovery by issuing SW interrupt. + **/ +void i40e_detect_recover_hung(struct i40e_vsi *vsi) +{ + struct i40e_ring *tx_ring = NULL; + struct net_device *netdev; + unsigned int i; + int packets; + + if (!vsi) + return; + + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return; + + netdev = vsi->netdev; + if (!netdev) + return; + + if (!netif_carrier_ok(netdev)) + return; + + for (i = 0; i < vsi->num_queue_pairs; i++) { + tx_ring = vsi->tx_rings[i]; + if (tx_ring && tx_ring->desc) { + /* If packet counter has not changed the queue is + * likely stalled, so force an interrupt for this + * queue. + * + * prev_pkt_ctr would be negative if there was no + * pending work. + */ + packets = tx_ring->stats.packets & INT_MAX; + if (tx_ring->tx_stats.prev_pkt_ctr == packets) { + i40e_force_wb(vsi, tx_ring->q_vector); + continue; + } + + /* Memory barrier between read of packet count and call + * to i40e_get_tx_pending() + */ + smp_rmb(); + tx_ring->tx_stats.prev_pkt_ctr = + i40e_get_tx_pending(tx_ring, true) ? packets : -1; + } + } +} + +#define WB_STRIDE 4 + +/** + * i40e_clean_tx_irq - Reclaim resources after transmit completes + * @vsi: the VSI we care about + * @tx_ring: Tx ring to clean + * @napi_budget: Used to determine if we are in netpoll + * + * Returns true if there's any budget left (e.g. the clean is finished) + **/ +static bool i40e_clean_tx_irq(struct i40e_vsi *vsi, + struct i40e_ring *tx_ring, int napi_budget) +{ + u16 i = tx_ring->next_to_clean; + struct i40e_tx_buffer *tx_buf; + struct i40e_tx_desc *tx_head; + struct i40e_tx_desc *tx_desc; + unsigned int total_bytes = 0, total_packets = 0; + unsigned int budget = vsi->work_limit; + + tx_buf = &tx_ring->tx_bi[i]; + tx_desc = I40E_TX_DESC(tx_ring, i); + i -= tx_ring->count; + + tx_head = I40E_TX_DESC(tx_ring, i40e_get_head(tx_ring)); + + do { + struct i40e_tx_desc *eop_desc = tx_buf->next_to_watch; + + /* if next_to_watch is not set then there is no work pending */ + if (!eop_desc) + break; + + /* prevent any other reads prior to eop_desc */ + smp_rmb(); + + i40e_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf); + /* we have caught up to head, no work left to do */ + if (tx_head == tx_desc) + break; + + /* clear next_to_watch to prevent false hangs */ + tx_buf->next_to_watch = NULL; + + /* update the statistics for this packet */ + total_bytes += tx_buf->bytecount; + total_packets += tx_buf->gso_segs; + + /* free the skb/XDP data */ + if (ring_is_xdp(tx_ring)) + page_frag_free(tx_buf->raw_buf); + else + napi_consume_skb(tx_buf->skb, napi_budget); + + /* unmap skb header data */ + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), + DMA_TO_DEVICE); + + /* clear tx_buffer data */ + tx_buf->skb = NULL; + dma_unmap_len_set(tx_buf, len, 0); + + /* unmap remaining buffers */ + while (tx_desc != eop_desc) { + i40e_trace(clean_tx_irq_unmap, + tx_ring, tx_desc, tx_buf); + + tx_buf++; + tx_desc++; + i++; + if (unlikely(!i)) { + i -= tx_ring->count; + tx_buf = tx_ring->tx_bi; + tx_desc = I40E_TX_DESC(tx_ring, 0); + } + + /* unmap any remaining paged data */ + if (dma_unmap_len(tx_buf, len)) { + dma_unmap_page(tx_ring->dev, + dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), + DMA_TO_DEVICE); + dma_unmap_len_set(tx_buf, len, 0); + } + } + + /* move us one more past the eop_desc for start of next pkt */ + tx_buf++; + tx_desc++; + i++; + if (unlikely(!i)) { + i -= tx_ring->count; + tx_buf = tx_ring->tx_bi; + tx_desc = 
I40E_TX_DESC(tx_ring, 0); + } + + prefetch(tx_desc); + + /* update budget accounting */ + budget--; + } while (likely(budget)); + + i += tx_ring->count; + tx_ring->next_to_clean = i; + u64_stats_update_begin(&tx_ring->syncp); + tx_ring->stats.bytes += total_bytes; + tx_ring->stats.packets += total_packets; + u64_stats_update_end(&tx_ring->syncp); + tx_ring->q_vector->tx.total_bytes += total_bytes; + tx_ring->q_vector->tx.total_packets += total_packets; + + if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) { + /* check to see if there are < 4 descriptors + * waiting to be written back, then kick the hardware to force + * them to be written back in case we stay in NAPI. + * In this mode on X722 we do not enable Interrupt. + */ + unsigned int j = i40e_get_tx_pending(tx_ring, false); + + if (budget && + ((j / WB_STRIDE) == 0) && (j > 0) && + !test_bit(__I40E_VSI_DOWN, vsi->state) && + (I40E_DESC_UNUSED(tx_ring) != tx_ring->count)) + tx_ring->arm_wb = true; + } + + if (ring_is_xdp(tx_ring)) + return !!budget; + + /* notify netdev of completed buffers */ + netdev_tx_completed_queue(txring_txq(tx_ring), + total_packets, total_bytes); + +#define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2)) + if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) && + (I40E_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) { + /* Make sure that anybody stopping the queue after this + * sees the new next_to_clean. + */ + smp_mb(); + if (__netif_subqueue_stopped(tx_ring->netdev, + tx_ring->queue_index) && + !test_bit(__I40E_VSI_DOWN, vsi->state)) { + netif_wake_subqueue(tx_ring->netdev, + tx_ring->queue_index); + ++tx_ring->tx_stats.restart_queue; + } + } + + return !!budget; +} + +/** + * i40e_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled + * @vsi: the VSI we care about + * @q_vector: the vector on which to enable writeback + * + **/ +static void i40e_enable_wb_on_itr(struct i40e_vsi *vsi, + struct i40e_q_vector *q_vector) +{ + u16 flags = q_vector->tx.ring[0].flags; + u32 val; + + if (!(flags & I40E_TXR_FLAGS_WB_ON_ITR)) + return; + + if (q_vector->arm_wb_state) + return; + + if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) { + val = I40E_PFINT_DYN_CTLN_WB_ON_ITR_MASK | + I40E_PFINT_DYN_CTLN_ITR_INDX_MASK; /* set noitr */ + + wr32(&vsi->back->hw, + I40E_PFINT_DYN_CTLN(q_vector->reg_idx), + val); + } else { + val = I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK | + I40E_PFINT_DYN_CTL0_ITR_INDX_MASK; /* set noitr */ + + wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val); + } + q_vector->arm_wb_state = true; +} + +/** + * i40e_force_wb - Issue SW Interrupt so HW does a wb + * @vsi: the VSI we care about + * @q_vector: the vector on which to force writeback + * + **/ +void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector) +{ + if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) { + u32 val = I40E_PFINT_DYN_CTLN_INTENA_MASK | + I40E_PFINT_DYN_CTLN_ITR_INDX_MASK | /* set noitr */ + I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK | + I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK; + /* allow 00 to be written to the index */ + + wr32(&vsi->back->hw, + I40E_PFINT_DYN_CTLN(q_vector->reg_idx), val); + } else { + u32 val = I40E_PFINT_DYN_CTL0_INTENA_MASK | + I40E_PFINT_DYN_CTL0_ITR_INDX_MASK | /* set noitr */ + I40E_PFINT_DYN_CTL0_SWINT_TRIG_MASK | + I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK; + /* allow 00 to be written to the index */ + + wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val); + } +} + +static inline bool i40e_container_is_rx(struct i40e_q_vector *q_vector, + struct i40e_ring_container *rc) +{ + return &q_vector->rx 
== rc; +} + +static inline unsigned int i40e_itr_divisor(struct i40e_q_vector *q_vector) +{ + unsigned int divisor; + + switch (q_vector->vsi->back->hw.phy.link_info.link_speed) { + case I40E_LINK_SPEED_40GB: + divisor = I40E_ITR_ADAPTIVE_MIN_INC * 1024; + break; + case I40E_LINK_SPEED_25GB: + case I40E_LINK_SPEED_20GB: + divisor = I40E_ITR_ADAPTIVE_MIN_INC * 512; + break; + default: + case I40E_LINK_SPEED_10GB: + divisor = I40E_ITR_ADAPTIVE_MIN_INC * 256; + break; + case I40E_LINK_SPEED_1GB: + case I40E_LINK_SPEED_100MB: + divisor = I40E_ITR_ADAPTIVE_MIN_INC * 32; + break; + } + + return divisor; +} + +/** + * i40e_update_itr - update the dynamic ITR value based on statistics + * @q_vector: structure containing interrupt and ring information + * @rc: structure containing ring performance data + * + * Stores a new ITR value based on packets and byte + * counts during the last interrupt. The advantage of per interrupt + * computation is faster updates and more accurate ITR for the current + * traffic pattern. Constants in this function were computed + * based on theoretical maximum wire speed and thresholds were set based + * on testing data as well as attempting to minimize response time + * while increasing bulk throughput. + **/ +static void i40e_update_itr(struct i40e_q_vector *q_vector, + struct i40e_ring_container *rc) +{ + unsigned int avg_wire_size, packets, bytes, itr; + unsigned long next_update = jiffies; + + /* If we don't have any rings just leave ourselves set for maximum + * possible latency so we take ourselves out of the equation. + */ + if (!rc->ring || !ITR_IS_DYNAMIC(rc->ring->itr_setting)) + return; + + /* For Rx we want to push the delay up and default to low latency. + * for Tx we want to pull the delay down and default to high latency. + */ + itr = i40e_container_is_rx(q_vector, rc) ? + I40E_ITR_ADAPTIVE_MIN_USECS | I40E_ITR_ADAPTIVE_LATENCY : + I40E_ITR_ADAPTIVE_MAX_USECS | I40E_ITR_ADAPTIVE_LATENCY; + + /* If we didn't update within up to 1 - 2 jiffies we can assume + * that either packets are coming in so slow there hasn't been + * any work, or that there is so much work that NAPI is dealing + * with interrupt moderation and we don't need to do anything. + */ + if (time_after(next_update, rc->next_update)) + goto clear_counts; + + /* If itr_countdown is set it means we programmed an ITR within + * the last 4 interrupt cycles. This has a side effect of us + * potentially firing an early interrupt. In order to work around + * this we need to throw out any data received for a few + * interrupts following the update. + */ + if (q_vector->itr_countdown) { + itr = rc->target_itr; + goto clear_counts; + } + + packets = rc->total_packets; + bytes = rc->total_bytes; + + if (i40e_container_is_rx(q_vector, rc)) { + /* If Rx there are 1 to 4 packets and bytes are less than + * 9000 assume insufficient data to use bulk rate limiting + * approach unless Tx is already in bulk rate limiting. We + * are likely latency driven. + */ + if (packets && packets < 4 && bytes < 9000 && + (q_vector->tx.target_itr & I40E_ITR_ADAPTIVE_LATENCY)) { + itr = I40E_ITR_ADAPTIVE_LATENCY; + goto adjust_by_size; + } + } else if (packets < 4) { + /* If we have Tx and Rx ITR maxed and Tx ITR is running in + * bulk mode and we are receiving 4 or fewer packets just + * reset the ITR_ADAPTIVE_LATENCY bit for latency mode so + * that the Rx can relax. 
+ */ + if (rc->target_itr == I40E_ITR_ADAPTIVE_MAX_USECS && + (q_vector->rx.target_itr & I40E_ITR_MASK) == + I40E_ITR_ADAPTIVE_MAX_USECS) + goto clear_counts; + } else if (packets > 32) { + /* If we have processed over 32 packets in a single interrupt + * for Tx assume we need to switch over to "bulk" mode. + */ + rc->target_itr &= ~I40E_ITR_ADAPTIVE_LATENCY; + } + + /* We have no packets to actually measure against. This means + * either one of the other queues on this vector is active or + * we are a Tx queue doing TSO with too high of an interrupt rate. + * + * Between 4 and 56 we can assume that our current interrupt delay + * is only slightly too low. As such we should increase it by a small + * fixed amount. + */ + if (packets < 56) { + itr = rc->target_itr + I40E_ITR_ADAPTIVE_MIN_INC; + if ((itr & I40E_ITR_MASK) > I40E_ITR_ADAPTIVE_MAX_USECS) { + itr &= I40E_ITR_ADAPTIVE_LATENCY; + itr += I40E_ITR_ADAPTIVE_MAX_USECS; + } + goto clear_counts; + } + + if (packets <= 256) { + itr = min(q_vector->tx.current_itr, q_vector->rx.current_itr); + itr &= I40E_ITR_MASK; + + /* Between 56 and 112 is our "goldilocks" zone where we are + * working out "just right". Just report that our current + * ITR is good for us. + */ + if (packets <= 112) + goto clear_counts; + + /* If packet count is 128 or greater we are likely looking + * at a slight overrun of the delay we want. Try halving + * our delay to see if that will cut the number of packets + * in half per interrupt. + */ + itr /= 2; + itr &= I40E_ITR_MASK; + if (itr < I40E_ITR_ADAPTIVE_MIN_USECS) + itr = I40E_ITR_ADAPTIVE_MIN_USECS; + + goto clear_counts; + } + + /* The paths below assume we are dealing with a bulk ITR since + * number of packets is greater than 256. We are just going to have + * to compute a value and try to bring the count under control, + * though for smaller packet sizes there isn't much we can do as + * NAPI polling will likely be kicking in sooner rather than later. + */ + itr = I40E_ITR_ADAPTIVE_BULK; + +adjust_by_size: + /* If packet counts are 256 or greater we can assume we have a gross + * overestimation of what the rate should be. Instead of trying to fine + * tune it just use the formula below to try and dial in an exact value + * give the current packet size of the frame. + */ + avg_wire_size = bytes / packets; + + /* The following is a crude approximation of: + * wmem_default / (size + overhead) = desired_pkts_per_int + * rate / bits_per_byte / (size + ethernet overhead) = pkt_rate + * (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value + * + * Assuming wmem_default is 212992 and overhead is 640 bytes per + * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the + * formula down to + * + * (170 * (size + 24)) / (size + 640) = ITR + * + * We first do some math on the packet size and then finally bitshift + * by 8 after rounding up. We also have to account for PCIe link speed + * difference as ITR scales based on this. 
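+	 * Worked example (assumes I40E_ITR_ADAPTIVE_MIN_INC divides 4 evenly):
+	 * in bulk mode a <= 60 byte average frame maps to 4096 below; at 40GB
+	 * link speed the divisor is I40E_ITR_ADAPTIVE_MIN_INC * 1024, so the
+	 * final addition contributes
+	 * DIV_ROUND_UP(4096, MIN_INC * 1024) * MIN_INC == 4 usecs, which is
+	 * the ~250K ints/sec starting point named in the first bucket below.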
+ */ + if (avg_wire_size <= 60) { + /* Start at 250k ints/sec */ + avg_wire_size = 4096; + } else if (avg_wire_size <= 380) { + /* 250K ints/sec to 60K ints/sec */ + avg_wire_size *= 40; + avg_wire_size += 1696; + } else if (avg_wire_size <= 1084) { + /* 60K ints/sec to 36K ints/sec */ + avg_wire_size *= 15; + avg_wire_size += 11452; + } else if (avg_wire_size <= 1980) { + /* 36K ints/sec to 30K ints/sec */ + avg_wire_size *= 5; + avg_wire_size += 22420; + } else { + /* plateau at a limit of 30K ints/sec */ + avg_wire_size = 32256; + } + + /* If we are in low latency mode halve our delay which doubles the + * rate to somewhere between 100K to 16K ints/sec + */ + if (itr & I40E_ITR_ADAPTIVE_LATENCY) + avg_wire_size /= 2; + + /* Resultant value is 256 times larger than it needs to be. This + * gives us room to adjust the value as needed to either increase + * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc. + * + * Use addition as we have already recorded the new latency flag + * for the ITR value. + */ + itr += DIV_ROUND_UP(avg_wire_size, i40e_itr_divisor(q_vector)) * + I40E_ITR_ADAPTIVE_MIN_INC; + + if ((itr & I40E_ITR_MASK) > I40E_ITR_ADAPTIVE_MAX_USECS) { + itr &= I40E_ITR_ADAPTIVE_LATENCY; + itr += I40E_ITR_ADAPTIVE_MAX_USECS; + } + +clear_counts: + /* write back value */ + rc->target_itr = itr; + + /* next update should occur within next jiffy */ + rc->next_update = next_update + 1; + + rc->total_bytes = 0; + rc->total_packets = 0; +} + +/** + * i40e_reuse_rx_page - page flip buffer and store it back on the ring + * @rx_ring: rx descriptor ring to store buffers on + * @old_buff: donor buffer to have page reused + * + * Synchronizes page for reuse by the adapter + **/ +static void i40e_reuse_rx_page(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *old_buff) +{ + struct i40e_rx_buffer *new_buff; + u16 nta = rx_ring->next_to_alloc; + + new_buff = &rx_ring->rx_bi[nta]; + + /* update, and store next to alloc */ + nta++; + rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; + + /* transfer page from old buffer to new buffer */ + new_buff->dma = old_buff->dma; + new_buff->page = old_buff->page; + new_buff->page_offset = old_buff->page_offset; + new_buff->pagecnt_bias = old_buff->pagecnt_bias; +} + +/** + * i40e_rx_is_programming_status - check for programming status descriptor + * @qw: qword representing status_error_len in CPU ordering + * + * The value of in the descriptor length field indicate if this + * is a programming status descriptor for flow director or FCoE + * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise + * it is a packet descriptor. + **/ +static inline bool i40e_rx_is_programming_status(u64 qw) +{ + /* The Rx filter programming status and SPH bit occupy the same + * spot in the descriptor. Since we don't support packet split we + * can just reuse the bit as an indication that this is a + * programming status descriptor. + */ + return qw & I40E_RXD_QW1_LENGTH_SPH_MASK; +} + +/** + * i40e_clean_programming_status - clean the programming status descriptor + * @rx_ring: the rx ring that has this descriptor + * @rx_desc: the rx descriptor written back by HW + * @qw: qword representing status_error_len in CPU ordering + * + * Flow director should handle FD_FILTER_STATUS to check its filter programming + * status being successful or not and take actions accordingly. FCoE should + * handle its context/filter programming/invalidation status and take actions. 
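+ * In this implementation only the Flow Director case is handled below:
+ * when the programming status id is
+ * I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS the descriptor is passed to
+ * i40e_fd_handle_status(); any other id is silently ignored.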
+ * + **/ +static void i40e_clean_programming_status(struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc, + u64 qw) +{ + struct i40e_rx_buffer *rx_buffer; + u32 ntc = rx_ring->next_to_clean; + u8 id; + + /* fetch, update, and store next to clean */ + rx_buffer = &rx_ring->rx_bi[ntc++]; + ntc = (ntc < rx_ring->count) ? ntc : 0; + rx_ring->next_to_clean = ntc; + + prefetch(I40E_RX_DESC(rx_ring, ntc)); + + /* place unused page back on the ring */ + i40e_reuse_rx_page(rx_ring, rx_buffer); + rx_ring->rx_stats.page_reuse_count++; + + /* clear contents of buffer_info */ + rx_buffer->page = NULL; + + id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >> + I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT; + + if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS) + i40e_fd_handle_status(rx_ring, rx_desc, id); +} + +/** + * i40e_setup_tx_descriptors - Allocate the Tx descriptors + * @tx_ring: the tx ring to set up + * + * Return 0 on success, negative on error + **/ +int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring) +{ + struct device *dev = tx_ring->dev; + int bi_size; + + if (!dev) + return -ENOMEM; + + /* warn if we are about to overwrite the pointer */ + WARN_ON(tx_ring->tx_bi); + bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count; + tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL); + if (!tx_ring->tx_bi) + goto err; + + u64_stats_init(&tx_ring->syncp); + + /* round up to nearest 4K */ + tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc); + /* add u32 for head writeback, align after this takes care of + * guaranteeing this is at least one cache line in size + */ + tx_ring->size += sizeof(u32); + tx_ring->size = ALIGN(tx_ring->size, 4096); + tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size, + &tx_ring->dma, GFP_KERNEL); + if (!tx_ring->desc) { + dev_info(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n", + tx_ring->size); + goto err; + } + + tx_ring->next_to_use = 0; + tx_ring->next_to_clean = 0; + tx_ring->tx_stats.prev_pkt_ctr = -1; + return 0; + +err: + kfree(tx_ring->tx_bi); + tx_ring->tx_bi = NULL; + return -ENOMEM; +} + +/** + * i40e_clean_rx_ring - Free Rx buffers + * @rx_ring: ring to be cleaned + **/ +void i40e_clean_rx_ring(struct i40e_ring *rx_ring) +{ + unsigned long bi_size; + u16 i; + + /* ring already cleared, nothing to do */ + if (!rx_ring->rx_bi) + return; + + if (rx_ring->skb) { + dev_kfree_skb(rx_ring->skb); + rx_ring->skb = NULL; + } + + /* Free all the Rx ring sk_buffs */ + for (i = 0; i < rx_ring->count; i++) { + struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; + + if (!rx_bi->page) + continue; + + /* Invalidate cache lines that may have been written to by + * device so that we avoid corrupting memory. 
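+		 * The explicit sync is needed because the unmap below passes
+		 * I40E_RX_DMA_ATTR, which is expected to include
+		 * DMA_ATTR_SKIP_CPU_SYNC, so dma_unmap_page_attrs() itself will
+		 * not perform this sync.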
+ */ + dma_sync_single_range_for_cpu(rx_ring->dev, + rx_bi->dma, + rx_bi->page_offset, + rx_ring->rx_buf_len, + DMA_FROM_DEVICE); + + /* free resources associated with mapping */ + dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma, + i40e_rx_pg_size(rx_ring), + DMA_FROM_DEVICE, + I40E_RX_DMA_ATTR); + + __page_frag_cache_drain(rx_bi->page, rx_bi->pagecnt_bias); + + rx_bi->page = NULL; + rx_bi->page_offset = 0; + } + + bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count; + memset(rx_ring->rx_bi, 0, bi_size); + + /* Zero out the descriptor ring */ + memset(rx_ring->desc, 0, rx_ring->size); + + rx_ring->next_to_alloc = 0; + rx_ring->next_to_clean = 0; + rx_ring->next_to_use = 0; +} + +/** + * i40e_free_rx_resources - Free Rx resources + * @rx_ring: ring to clean the resources from + * + * Free all receive software resources + **/ +void i40e_free_rx_resources(struct i40e_ring *rx_ring) +{ + i40e_clean_rx_ring(rx_ring); + if (rx_ring->vsi->type == I40E_VSI_MAIN) + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + rx_ring->xdp_prog = NULL; + kfree(rx_ring->rx_bi); + rx_ring->rx_bi = NULL; + + if (rx_ring->desc) { + dma_free_coherent(rx_ring->dev, rx_ring->size, + rx_ring->desc, rx_ring->dma); + rx_ring->desc = NULL; + } +} + +/** + * i40e_setup_rx_descriptors - Allocate Rx descriptors + * @rx_ring: Rx descriptor ring (for a specific queue) to setup + * + * Returns 0 on success, negative on failure + **/ +int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) +{ + struct device *dev = rx_ring->dev; + int err = -ENOMEM; + int bi_size; + + /* warn if we are about to overwrite the pointer */ + WARN_ON(rx_ring->rx_bi); + bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count; + rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL); + if (!rx_ring->rx_bi) + goto err; + + u64_stats_init(&rx_ring->syncp); + + /* Round up to nearest 4K */ + rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc); + rx_ring->size = ALIGN(rx_ring->size, 4096); + rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size, + &rx_ring->dma, GFP_KERNEL); + + if (!rx_ring->desc) { + dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n", + rx_ring->size); + goto err; + } + + rx_ring->next_to_alloc = 0; + rx_ring->next_to_clean = 0; + rx_ring->next_to_use = 0; + + /* XDP RX-queue info only needed for RX rings exposed to XDP */ + if (rx_ring->vsi->type == I40E_VSI_MAIN) { + err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, + rx_ring->queue_index); + if (err < 0) + goto err; + } + + rx_ring->xdp_prog = rx_ring->vsi->xdp_prog; + + return 0; +err: + kfree(rx_ring->rx_bi); + rx_ring->rx_bi = NULL; + return err; +} + +/** + * i40e_release_rx_desc - Store the new tail and head values + * @rx_ring: ring to bump + * @val: new head index + **/ +static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val) +{ + rx_ring->next_to_use = val; + + /* update next to alloc since we have filled the ring */ + rx_ring->next_to_alloc = val; + + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). + */ + wmb(); + writel(val, rx_ring->tail); +} + +/** + * i40e_rx_offset - Return expected offset into page to access data + * @rx_ring: Ring we are requesting offset of + * + * Returns the offset value for ring into the data buffer. + */ +static inline unsigned int i40e_rx_offset(struct i40e_ring *rx_ring) +{ + return ring_uses_build_skb(rx_ring) ? 
I40E_SKB_PAD : 0; +} + +/** + * i40e_alloc_mapped_page - recycle or make a new page + * @rx_ring: ring to use + * @bi: rx_buffer struct to modify + * + * Returns true if the page was successfully allocated or + * reused. + **/ +static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *bi) +{ + struct page *page = bi->page; + dma_addr_t dma; + + /* since we are recycling buffers we should seldom need to alloc */ + if (likely(page)) { + rx_ring->rx_stats.page_reuse_count++; + return true; + } + + /* alloc new page for storage */ + page = dev_alloc_pages(i40e_rx_pg_order(rx_ring)); + if (unlikely(!page)) { + rx_ring->rx_stats.alloc_page_failed++; + return false; + } + + /* map page for use */ + dma = dma_map_page_attrs(rx_ring->dev, page, 0, + i40e_rx_pg_size(rx_ring), + DMA_FROM_DEVICE, + I40E_RX_DMA_ATTR); + + /* if mapping failed free memory back to system since + * there isn't much point in holding memory we can't use + */ + if (dma_mapping_error(rx_ring->dev, dma)) { + __free_pages(page, i40e_rx_pg_order(rx_ring)); + rx_ring->rx_stats.alloc_page_failed++; + return false; + } + + bi->dma = dma; + bi->page = page; + bi->page_offset = i40e_rx_offset(rx_ring); + page_ref_add(page, USHRT_MAX - 1); + bi->pagecnt_bias = USHRT_MAX; + + return true; +} + +/** + * i40e_receive_skb - Send a completed packet up the stack + * @rx_ring: rx ring in play + * @skb: packet to send up + * @vlan_tag: vlan tag for packet + **/ +static void i40e_receive_skb(struct i40e_ring *rx_ring, + struct sk_buff *skb, u16 vlan_tag) +{ + struct i40e_q_vector *q_vector = rx_ring->q_vector; + + if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) && + (vlan_tag & VLAN_VID_MASK)) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + + napi_gro_receive(&q_vector->napi, skb); +} + +/** + * i40e_alloc_rx_buffers - Replace used receive buffers + * @rx_ring: ring to place buffers on + * @cleaned_count: number of buffers to replace + * + * Returns false if all allocations were successful, true if any fail + **/ +bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count) +{ + u16 ntu = rx_ring->next_to_use; + union i40e_rx_desc *rx_desc; + struct i40e_rx_buffer *bi; + + /* do nothing if no valid netdev defined */ + if (!rx_ring->netdev || !cleaned_count) + return false; + + rx_desc = I40E_RX_DESC(rx_ring, ntu); + bi = &rx_ring->rx_bi[ntu]; + + do { + if (!i40e_alloc_mapped_page(rx_ring, bi)) + goto no_buffers; + + /* sync the buffer for use by the device */ + dma_sync_single_range_for_device(rx_ring->dev, bi->dma, + bi->page_offset, + rx_ring->rx_buf_len, + DMA_FROM_DEVICE); + + /* Refresh the desc even if buffer_addrs didn't change + * because each write-back erases this info. 
+ */ + rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset); + + rx_desc++; + bi++; + ntu++; + if (unlikely(ntu == rx_ring->count)) { + rx_desc = I40E_RX_DESC(rx_ring, 0); + bi = rx_ring->rx_bi; + ntu = 0; + } + + /* clear the status bits for the next_to_use descriptor */ + rx_desc->wb.qword1.status_error_len = 0; + + cleaned_count--; + } while (cleaned_count); + + if (rx_ring->next_to_use != ntu) + i40e_release_rx_desc(rx_ring, ntu); + + return false; + +no_buffers: + if (rx_ring->next_to_use != ntu) + i40e_release_rx_desc(rx_ring, ntu); + + /* make sure to come back via polling to try again after + * allocation failure + */ + return true; +} + +/** + * i40e_rx_checksum - Indicate in skb if hw indicated a good cksum + * @vsi: the VSI we care about + * @skb: skb currently being received and modified + * @rx_desc: the receive descriptor + **/ +static inline void i40e_rx_checksum(struct i40e_vsi *vsi, + struct sk_buff *skb, + union i40e_rx_desc *rx_desc) +{ + struct i40e_rx_ptype_decoded decoded; + u32 rx_error, rx_status; + bool ipv4, ipv6; + u8 ptype; + u64 qword; + + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT; + rx_error = (qword & I40E_RXD_QW1_ERROR_MASK) >> + I40E_RXD_QW1_ERROR_SHIFT; + rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >> + I40E_RXD_QW1_STATUS_SHIFT; + decoded = decode_rx_desc_ptype(ptype); + + skb->ip_summed = CHECKSUM_NONE; + + skb_checksum_none_assert(skb); + + /* Rx csum enabled and ip headers found? */ + if (!(vsi->netdev->features & NETIF_F_RXCSUM)) + return; + + /* did the hardware decode the packet and checksum? */ + if (!(rx_status & BIT(I40E_RX_DESC_STATUS_L3L4P_SHIFT))) + return; + + /* both known and outer_ip must be set for the below code to work */ + if (!(decoded.known && decoded.outer_ip)) + return; + + ipv4 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) && + (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4); + ipv6 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) && + (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6); + + if (ipv4 && + (rx_error & (BIT(I40E_RX_DESC_ERROR_IPE_SHIFT) | + BIT(I40E_RX_DESC_ERROR_EIPE_SHIFT)))) + goto checksum_fail; + + /* likely incorrect csum if alternate IP extension headers found */ + if (ipv6 && + rx_status & BIT(I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT)) + /* don't increment checksum err here, non-fatal err */ + return; + + /* there was some L4 error, count error and punt packet to the stack */ + if (rx_error & BIT(I40E_RX_DESC_ERROR_L4E_SHIFT)) + goto checksum_fail; + + /* handle packets that were not able to be checksummed due + * to arrival speed, in this case the stack can compute + * the csum. + */ + if (rx_error & BIT(I40E_RX_DESC_ERROR_PPRS_SHIFT)) + return; + + /* If there is an outer header present that might contain a checksum + * we need to bump the checksum level by 1 to reflect the fact that + * we are indicating we validated the inner checksum. 
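+	 * For example, for a VXLAN or GRE encapsulated TCP segment, raising
+	 * csum_level to 1 extends CHECKSUM_UNNECESSARY to cover one more (the
+	 * inner) checksum than the stack would otherwise assume was verified.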
+ */ + if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT) + skb->csum_level = 1; + + /* Only report checksum unnecessary for TCP, UDP, or SCTP */ + switch (decoded.inner_prot) { + case I40E_RX_PTYPE_INNER_PROT_TCP: + case I40E_RX_PTYPE_INNER_PROT_UDP: + case I40E_RX_PTYPE_INNER_PROT_SCTP: + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* fall though */ + default: + break; + } + + return; + +checksum_fail: + vsi->back->hw_csum_rx_error++; +} + +/** + * i40e_ptype_to_htype - get a hash type + * @ptype: the ptype value from the descriptor + * + * Returns a hash type to be used by skb_set_hash + **/ +static inline int i40e_ptype_to_htype(u8 ptype) +{ + struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype); + + if (!decoded.known) + return PKT_HASH_TYPE_NONE; + + if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && + decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4) + return PKT_HASH_TYPE_L4; + else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && + decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3) + return PKT_HASH_TYPE_L3; + else + return PKT_HASH_TYPE_L2; +} + +/** + * i40e_rx_hash - set the hash value in the skb + * @ring: descriptor ring + * @rx_desc: specific descriptor + **/ +static inline void i40e_rx_hash(struct i40e_ring *ring, + union i40e_rx_desc *rx_desc, + struct sk_buff *skb, + u8 rx_ptype) +{ + u32 hash; + const __le64 rss_mask = + cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH << + I40E_RX_DESC_STATUS_FLTSTAT_SHIFT); + + if (!(ring->netdev->features & NETIF_F_RXHASH)) + return; + + if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) { + hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); + skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype)); + } +} + +/** + * i40e_process_skb_fields - Populate skb header fields from Rx descriptor + * @rx_ring: rx descriptor ring packet is being transacted on + * @rx_desc: pointer to the EOP Rx descriptor + * @skb: pointer to current skb being populated + * @rx_ptype: the packet type decoded by hardware + * + * This function checks the ring, descriptor, and packet information in + * order to populate the hash, checksum, VLAN, protocol, and + * other fields within the skb. + **/ +static inline +void i40e_process_skb_fields(struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc, struct sk_buff *skb, + u8 rx_ptype) +{ + u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + u32 rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >> + I40E_RXD_QW1_STATUS_SHIFT; + u32 tsynvalid = rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK; + u32 tsyn = (rx_status & I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >> + I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT; + + if (unlikely(tsynvalid)) + i40e_ptp_rx_hwtstamp(rx_ring->vsi->back, skb, tsyn); + + i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype); + + i40e_rx_checksum(rx_ring->vsi, skb, rx_desc); + + skb_record_rx_queue(skb, rx_ring->queue_index); + + /* modifies the skb - consumes the enet header */ + skb->protocol = eth_type_trans(skb, rx_ring->netdev); +} + +/** + * i40e_cleanup_headers - Correct empty headers + * @rx_ring: rx descriptor ring packet is being transacted on + * @skb: pointer to current skb being fixed + * @rx_desc: pointer to the EOP Rx descriptor + * + * Also address the case where we are pulling data in on pages only + * and as such no data is present in the skb header. + * + * In addition if skb is not at least 60 bytes we need to pad it so that + * it is large enough to qualify as a valid Ethernet frame. 
+ * + * Returns true if an error was encountered and skb was freed. + **/ +static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb, + union i40e_rx_desc *rx_desc) + +{ + /* XDP packets use error pointer so abort at this point */ + if (IS_ERR(skb)) + return true; + + /* ERR_MASK will only have valid bits if EOP set, and + * what we are doing here is actually checking + * I40E_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in + * the error field + */ + if (unlikely(i40e_test_staterr(rx_desc, + BIT(I40E_RXD_QW1_ERROR_SHIFT)))) { + dev_kfree_skb_any(skb); + return true; + } + + /* if eth_skb_pad returns an error the skb was freed */ + if (eth_skb_pad(skb)) + return true; + + return false; +} + +/** + * i40e_page_is_reusable - check if any reuse is possible + * @page: page struct to check + * + * A page is not reusable if it was allocated under low memory + * conditions, or it's not in the same NUMA node as this CPU. + */ +static inline bool i40e_page_is_reusable(struct page *page) +{ + return (page_to_nid(page) == numa_mem_id()) && + !page_is_pfmemalloc(page); +} + +/** + * i40e_can_reuse_rx_page - Determine if this page can be reused by + * the adapter for another receive + * + * @rx_buffer: buffer containing the page + * + * If page is reusable, rx_buffer->page_offset is adjusted to point to + * an unused region in the page. + * + * For small pages, @truesize will be a constant value, half the size + * of the memory at page. We'll attempt to alternate between high and + * low halves of the page, with one half ready for use by the hardware + * and the other half being consumed by the stack. We use the page + * ref count to determine whether the stack has finished consuming the + * portion of this page that was passed up with a previous packet. If + * the page ref count is >1, we'll assume the "other" half page is + * still busy, and this page cannot be reused. + * + * For larger pages, @truesize will be the actual space used by the + * received packet (adjusted upward to an even multiple of the cache + * line size). This will advance through the page by the amount + * actually consumed by the received packets while there is still + * space for a buffer. Each region of larger pages will be used at + * most once, after which the page will not be reused. + * + * In either case, if the page is reusable its refcount is increased. + **/ +static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer) +{ + unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; + struct page *page = rx_buffer->page; + + /* Is any reuse possible? */ + if (unlikely(!i40e_page_is_reusable(page))) + return false; + +#if (PAGE_SIZE < 8192) + /* if we are only owner of page we can reuse it */ + if (unlikely((page_count(page) - pagecnt_bias) > 1)) + return false; +#else +#define I40E_LAST_OFFSET \ + (SKB_WITH_OVERHEAD(PAGE_SIZE) - I40E_RXBUFFER_2048) + if (rx_buffer->page_offset > I40E_LAST_OFFSET) + return false; +#endif + + /* If we have drained the page fragment pool we need to update + * the pagecnt_bias and page count so that we fully restock the + * number of references the driver holds. 
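+	 * Illustrative flow: i40e_alloc_mapped_page() takes USHRT_MAX - 1
+	 * extra page references and sets pagecnt_bias to USHRT_MAX; each
+	 * buffer handed up the stack decrements the cheap pagecnt_bias instead
+	 * of the atomic page refcount, and the reuse check above compares
+	 * page_count() with the remaining bias. Once the bias drains to 1,
+	 * the block below re-adds USHRT_MAX - 1 references and resets the
+	 * bias.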
+ */ + if (unlikely(pagecnt_bias == 1)) { + page_ref_add(page, USHRT_MAX - 1); + rx_buffer->pagecnt_bias = USHRT_MAX; + } + + return true; +} + +/** + * i40e_add_rx_frag - Add contents of Rx buffer to sk_buff + * @rx_ring: rx descriptor ring to transact packets on + * @rx_buffer: buffer containing page to add + * @skb: sk_buff to place the data into + * @size: packet length from rx_desc + * + * This function will add the data contained in rx_buffer->page to the skb. + * It will just attach the page as a frag to the skb. + * + * The function will then update the page offset. + **/ +static void i40e_add_rx_frag(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *rx_buffer, + struct sk_buff *skb, + unsigned int size) +{ +#if (PAGE_SIZE < 8192) + unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2; +#else + unsigned int truesize = SKB_DATA_ALIGN(size + i40e_rx_offset(rx_ring)); +#endif + + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, + rx_buffer->page_offset, size, truesize); + + /* page is being used so we must update the page offset */ +#if (PAGE_SIZE < 8192) + rx_buffer->page_offset ^= truesize; +#else + rx_buffer->page_offset += truesize; +#endif +} + +/** + * i40e_get_rx_buffer - Fetch Rx buffer and synchronize data for use + * @rx_ring: rx descriptor ring to transact packets on + * @size: size of buffer to add to skb + * + * This function will pull an Rx buffer from the ring and synchronize it + * for use by the CPU. + */ +static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring, + const unsigned int size) +{ + struct i40e_rx_buffer *rx_buffer; + + rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean]; + prefetchw(rx_buffer->page); + + /* we are reusing so sync this buffer for CPU use */ + dma_sync_single_range_for_cpu(rx_ring->dev, + rx_buffer->dma, + rx_buffer->page_offset, + size, + DMA_FROM_DEVICE); + + /* We have pulled a buffer for use, so decrement pagecnt_bias */ + rx_buffer->pagecnt_bias--; + + return rx_buffer; +} + +/** + * i40e_construct_skb - Allocate skb and populate it + * @rx_ring: rx descriptor ring to transact packets on + * @rx_buffer: rx buffer to pull data from + * @xdp: xdp_buff pointing to the data + * + * This function allocates an skb. It then populates it with the page + * data from the current receive descriptor, taking care to set up the + * skb correctly. 
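+ * Note the contrast with i40e_build_skb() further down: here up to
+ * I40E_RX_HDR_SIZE bytes of headers are copied into a freshly allocated
+ * skb and any remaining payload is attached as a page fragment, whereas
+ * i40e_build_skb() wraps an skb directly around the existing buffer and
+ * copies nothing.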
+ */ +static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *rx_buffer, + struct xdp_buff *xdp) +{ + unsigned int size = xdp->data_end - xdp->data; +#if (PAGE_SIZE < 8192) + unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2; +#else + unsigned int truesize = SKB_DATA_ALIGN(size); +#endif + unsigned int headlen; + struct sk_buff *skb; + + /* prefetch first cache line of first page */ + prefetch(xdp->data); +#if L1_CACHE_BYTES < 128 + prefetch(xdp->data + L1_CACHE_BYTES); +#endif + + /* allocate a skb to store the frags */ + skb = __napi_alloc_skb(&rx_ring->q_vector->napi, + I40E_RX_HDR_SIZE, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) + return NULL; + + /* Determine available headroom for copy */ + headlen = size; + if (headlen > I40E_RX_HDR_SIZE) + headlen = eth_get_headlen(xdp->data, I40E_RX_HDR_SIZE); + + /* align pull length to size of long to optimize memcpy performance */ + memcpy(__skb_put(skb, headlen), xdp->data, + ALIGN(headlen, sizeof(long))); + + /* update all of the pointers */ + size -= headlen; + if (size) { + skb_add_rx_frag(skb, 0, rx_buffer->page, + rx_buffer->page_offset + headlen, + size, truesize); + + /* buffer is used by skb, update page_offset */ +#if (PAGE_SIZE < 8192) + rx_buffer->page_offset ^= truesize; +#else + rx_buffer->page_offset += truesize; +#endif + } else { + /* buffer is unused, reset bias back to rx_buffer */ + rx_buffer->pagecnt_bias++; + } + + return skb; +} + +/** + * i40e_build_skb - Build skb around an existing buffer + * @rx_ring: Rx descriptor ring to transact packets on + * @rx_buffer: Rx buffer to pull data from + * @xdp: xdp_buff pointing to the data + * + * This function builds an skb around an existing Rx buffer, taking care + * to set up the skb correctly and avoid any memcpy overhead. + */ +static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *rx_buffer, + struct xdp_buff *xdp) +{ + unsigned int size = xdp->data_end - xdp->data; +#if (PAGE_SIZE < 8192) + unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2; +#else + unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + + SKB_DATA_ALIGN(I40E_SKB_PAD + size); +#endif + struct sk_buff *skb; + + /* prefetch first cache line of first page */ + prefetch(xdp->data); +#if L1_CACHE_BYTES < 128 + prefetch(xdp->data + L1_CACHE_BYTES); +#endif + /* build an skb around the page buffer */ + skb = build_skb(xdp->data_hard_start, truesize); + if (unlikely(!skb)) + return NULL; + + /* update pointers within the skb to store the data */ + skb_reserve(skb, I40E_SKB_PAD); + __skb_put(skb, size); + + /* buffer is used by skb, update page_offset */ +#if (PAGE_SIZE < 8192) + rx_buffer->page_offset ^= truesize; +#else + rx_buffer->page_offset += truesize; +#endif + + return skb; +} + +/** + * i40e_put_rx_buffer - Clean up used buffer and either recycle or free + * @rx_ring: rx descriptor ring to transact packets on + * @rx_buffer: rx buffer to pull data from + * + * This function will clean up the contents of the rx_buffer. It will + * either recycle the buffer or unmap it and free the associated resources. 
+ */ +static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *rx_buffer) +{ + if (i40e_can_reuse_rx_page(rx_buffer)) { + /* hand second half of page back to the ring */ + i40e_reuse_rx_page(rx_ring, rx_buffer); + rx_ring->rx_stats.page_reuse_count++; + } else { + /* we are not reusing the buffer so unmap it */ + dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, + i40e_rx_pg_size(rx_ring), + DMA_FROM_DEVICE, I40E_RX_DMA_ATTR); + __page_frag_cache_drain(rx_buffer->page, + rx_buffer->pagecnt_bias); + } + + /* clear contents of buffer_info */ + rx_buffer->page = NULL; +} + +/** + * i40e_is_non_eop - process handling of non-EOP buffers + * @rx_ring: Rx ring being processed + * @rx_desc: Rx descriptor for current buffer + * @skb: Current socket buffer containing buffer in progress + * + * This function updates next to clean. If the buffer is an EOP buffer + * this function exits returning false, otherwise it will place the + * sk_buff in the next buffer to be chained and return true indicating + * that this is in fact a non-EOP buffer. + **/ +static bool i40e_is_non_eop(struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc, + struct sk_buff *skb) +{ + u32 ntc = rx_ring->next_to_clean + 1; + + /* fetch, update, and store next to clean */ + ntc = (ntc < rx_ring->count) ? ntc : 0; + rx_ring->next_to_clean = ntc; + + prefetch(I40E_RX_DESC(rx_ring, ntc)); + + /* if we are the last buffer then there is nothing else to do */ +#define I40E_RXD_EOF BIT(I40E_RX_DESC_STATUS_EOF_SHIFT) + if (likely(i40e_test_staterr(rx_desc, I40E_RXD_EOF))) + return false; + + rx_ring->rx_stats.non_eop_descs++; + + return true; +} + +#define I40E_XDP_PASS 0 +#define I40E_XDP_CONSUMED 1 +#define I40E_XDP_TX 2 + +static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, + struct i40e_ring *xdp_ring); + +/** + * i40e_run_xdp - run an XDP program + * @rx_ring: Rx ring being processed + * @xdp: XDP buffer containing the frame + **/ +static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring, + struct xdp_buff *xdp) +{ + int err, result = I40E_XDP_PASS; + struct i40e_ring *xdp_ring; + struct bpf_prog *xdp_prog; + u32 act; + + rcu_read_lock(); + xdp_prog = READ_ONCE(rx_ring->xdp_prog); + + if (!xdp_prog) + goto xdp_out; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index]; + result = i40e_xmit_xdp_ring(xdp, xdp_ring); + break; + case XDP_REDIRECT: + err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); + result = !err ? 
I40E_XDP_TX : I40E_XDP_CONSUMED; + break; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, act); + /* fallthrough -- handle aborts by dropping packet */ + case XDP_DROP: + result = I40E_XDP_CONSUMED; + break; + } +xdp_out: + rcu_read_unlock(); + return ERR_PTR(-result); +} + +/** + * i40e_rx_buffer_flip - adjusted rx_buffer to point to an unused region + * @rx_ring: Rx ring + * @rx_buffer: Rx buffer to adjust + * @size: Size of adjustment + **/ +static void i40e_rx_buffer_flip(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *rx_buffer, + unsigned int size) +{ +#if (PAGE_SIZE < 8192) + unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2; + + rx_buffer->page_offset ^= truesize; +#else + unsigned int truesize = SKB_DATA_ALIGN(i40e_rx_offset(rx_ring) + size); + + rx_buffer->page_offset += truesize; +#endif +} + +static inline void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring) +{ + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. + */ + wmb(); + writel_relaxed(xdp_ring->next_to_use, xdp_ring->tail); +} + +/** + * i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf + * @rx_ring: rx descriptor ring to transact packets on + * @budget: Total limit on number of packets to process + * + * This function provides a "bounce buffer" approach to Rx interrupt + * processing. The advantage to this is that on systems that have + * expensive overhead for IOMMU access this provides a means of avoiding + * it by maintaining the mapping of the page to the system. + * + * Returns amount of work completed + **/ +static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) +{ + unsigned int total_rx_bytes = 0, total_rx_packets = 0; + struct sk_buff *skb = rx_ring->skb; + u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); + bool failure = false, xdp_xmit = false; + struct xdp_buff xdp; + + xdp.rxq = &rx_ring->xdp_rxq; + + while (likely(total_rx_packets < (unsigned int)budget)) { + struct i40e_rx_buffer *rx_buffer; + union i40e_rx_desc *rx_desc; + unsigned int size; + u16 vlan_tag; + u8 rx_ptype; + u64 qword; + + /* return some buffers to hardware, one at a time is too slow */ + if (cleaned_count >= I40E_RX_BUFFER_WRITE) { + failure = failure || + i40e_alloc_rx_buffers(rx_ring, cleaned_count); + cleaned_count = 0; + } + + rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean); + + /* status_error_len will always be zero for unused descriptors + * because it's cleared in cleanup, and overlaps with hdr_addr + * which is always zero because packet split isn't used, if the + * hardware wrote DD then the length will be non-zero + */ + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + + /* This memory barrier is needed to keep us from reading + * any other fields out of the rx_desc until we have + * verified the descriptor has been written back. 
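+		 * dma_rmb() is sufficient here (instead of a full rmb()) because
+		 * the only ordering required is between reads of device-written
+		 * descriptor memory: the status/length check above must be
+		 * observed before the remaining descriptor fields are read below.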
+ */ + dma_rmb(); + + if (unlikely(i40e_rx_is_programming_status(qword))) { + i40e_clean_programming_status(rx_ring, rx_desc, qword); + cleaned_count++; + continue; + } + size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> + I40E_RXD_QW1_LENGTH_PBUF_SHIFT; + if (!size) + break; + + i40e_trace(clean_rx_irq, rx_ring, rx_desc, skb); + rx_buffer = i40e_get_rx_buffer(rx_ring, size); + + /* retrieve a buffer from the ring */ + if (!skb) { + xdp.data = page_address(rx_buffer->page) + + rx_buffer->page_offset; + xdp_set_data_meta_invalid(&xdp); + xdp.data_hard_start = xdp.data - + i40e_rx_offset(rx_ring); + xdp.data_end = xdp.data + size; + + skb = i40e_run_xdp(rx_ring, &xdp); + } + + if (IS_ERR(skb)) { + if (PTR_ERR(skb) == -I40E_XDP_TX) { + xdp_xmit = true; + i40e_rx_buffer_flip(rx_ring, rx_buffer, size); + } else { + rx_buffer->pagecnt_bias++; + } + total_rx_bytes += size; + total_rx_packets++; + } else if (skb) { + i40e_add_rx_frag(rx_ring, rx_buffer, skb, size); + } else if (ring_uses_build_skb(rx_ring)) { + skb = i40e_build_skb(rx_ring, rx_buffer, &xdp); + } else { + skb = i40e_construct_skb(rx_ring, rx_buffer, &xdp); + } + + /* exit if we failed to retrieve a buffer */ + if (!skb) { + rx_ring->rx_stats.alloc_buff_failed++; + rx_buffer->pagecnt_bias++; + break; + } + + i40e_put_rx_buffer(rx_ring, rx_buffer); + cleaned_count++; + + if (i40e_is_non_eop(rx_ring, rx_desc, skb)) + continue; + + if (i40e_cleanup_headers(rx_ring, skb, rx_desc)) { + skb = NULL; + continue; + } + + /* probably a little skewed due to removing CRC */ + total_rx_bytes += skb->len; + + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> + I40E_RXD_QW1_PTYPE_SHIFT; + + /* populate checksum, VLAN, and protocol */ + i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); + + vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ? + le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0; + + i40e_trace(clean_rx_irq_rx, rx_ring, rx_desc, skb); + i40e_receive_skb(rx_ring, skb, vlan_tag); + skb = NULL; + + /* update budget accounting */ + total_rx_packets++; + } + + if (xdp_xmit) { + struct i40e_ring *xdp_ring = + rx_ring->vsi->xdp_rings[rx_ring->queue_index]; + + i40e_xdp_ring_update_tail(xdp_ring); + xdp_do_flush_map(); + } + + rx_ring->skb = skb; + + u64_stats_update_begin(&rx_ring->syncp); + rx_ring->stats.packets += total_rx_packets; + rx_ring->stats.bytes += total_rx_bytes; + u64_stats_update_end(&rx_ring->syncp); + rx_ring->q_vector->rx.total_packets += total_rx_packets; + rx_ring->q_vector->rx.total_bytes += total_rx_bytes; + + /* guarantee a trip back through this routine if there was a failure */ + return failure ? budget : (int)total_rx_packets; +} + +static inline u32 i40e_buildreg_itr(const int type, u16 itr) +{ + u32 val; + + /* We don't bother with setting the CLEARPBA bit as the data sheet + * points out doing so is "meaningless since it was already + * auto-cleared". The auto-clearing happens when the interrupt is + * asserted. + * + * Hardware errata 28 for also indicates that writing to a + * xxINT_DYN_CTLx CSR with INTENA_MSK (bit 31) set to 0 will clear + * an event in the PBA anyway so we need to rely on the automask + * to hold pending events for us until the interrupt is re-enabled + * + * The itr value is reported in microseconds, and the register + * value is recorded in 2 microsecond units. For this reason we + * only need to shift by the interval shift - 1 instead of the + * full value. 
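+	 * Worked example: an ITR of 50 usecs corresponds to an interval field
+	 * of 25 (2 usec units) placed at the interval shift, and
+	 * (50 >> 1) << INTERVAL_SHIFT equals 50 << (INTERVAL_SHIFT - 1), which
+	 * is exactly what the expression below computes.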
+ */ + itr &= I40E_ITR_MASK; + + val = I40E_PFINT_DYN_CTLN_INTENA_MASK | + (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) | + (itr << (I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT - 1)); + + return val; +} + +/* a small macro to shorten up some long lines */ +#define INTREG I40E_PFINT_DYN_CTLN + +/* The act of updating the ITR will cause it to immediately trigger. In order + * to prevent this from throwing off adaptive update statistics we defer the + * update so that it can only happen so often. So after either Tx or Rx are + * updated we make the adaptive scheme wait until either the ITR completely + * expires via the next_update expiration or we have been through at least + * 3 interrupts. + */ +#define ITR_COUNTDOWN_START 3 + +/** + * i40e_update_enable_itr - Update itr and re-enable MSIX interrupt + * @vsi: the VSI we care about + * @q_vector: q_vector for which itr is being updated and interrupt enabled + * + **/ +static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, + struct i40e_q_vector *q_vector) +{ + struct i40e_hw *hw = &vsi->back->hw; + u32 intval; + + /* If we don't have MSIX, then we only need to re-enable icr0 */ + if (!(vsi->back->flags & I40E_FLAG_MSIX_ENABLED)) { + i40e_irq_dynamic_enable_icr0(vsi->back); + return; + } + + /* These will do nothing if dynamic updates are not enabled */ + i40e_update_itr(q_vector, &q_vector->tx); + i40e_update_itr(q_vector, &q_vector->rx); + + /* This block of logic allows us to get away with only updating + * one ITR value with each interrupt. The idea is to perform a + * pseudo-lazy update with the following criteria. + * + * 1. Rx is given higher priority than Tx if both are in same state + * 2. If we must reduce an ITR that is given highest priority. + * 3. We then give priority to increasing ITR based on amount. + */ + if (q_vector->rx.target_itr < q_vector->rx.current_itr) { + /* Rx ITR needs to be reduced, this is highest priority */ + intval = i40e_buildreg_itr(I40E_RX_ITR, + q_vector->rx.target_itr); + q_vector->rx.current_itr = q_vector->rx.target_itr; + q_vector->itr_countdown = ITR_COUNTDOWN_START; + } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) || + ((q_vector->rx.target_itr - q_vector->rx.current_itr) < + (q_vector->tx.target_itr - q_vector->tx.current_itr))) { + /* Tx ITR needs to be reduced, this is second priority + * Tx ITR needs to be increased more than Rx, fourth priority + */ + intval = i40e_buildreg_itr(I40E_TX_ITR, + q_vector->tx.target_itr); + q_vector->tx.current_itr = q_vector->tx.target_itr; + q_vector->itr_countdown = ITR_COUNTDOWN_START; + } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) { + /* Rx ITR needs to be increased, third priority */ + intval = i40e_buildreg_itr(I40E_RX_ITR, + q_vector->rx.target_itr); + q_vector->rx.current_itr = q_vector->rx.target_itr; + q_vector->itr_countdown = ITR_COUNTDOWN_START; + } else { + /* No ITR update, lowest priority */ + intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); + if (q_vector->itr_countdown) + q_vector->itr_countdown--; + } + + if (!test_bit(__I40E_VSI_DOWN, vsi->state)) + wr32(hw, INTREG(q_vector->reg_idx), intval); +} + +/** + * i40e_napi_poll - NAPI polling Rx/Tx cleanup routine + * @napi: napi struct with our devices info in it + * @budget: amount of work driver is allowed to do this pass, in packets + * + * This function will clean all queues associated with a q_vector. 
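+ * The Rx budget is split evenly across this vector's Rx rings
+ * (budget_per_ring below), while Tx rings are cleaned first using the
+ * VSI work limit, since Tx completion work is comparatively cheap.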
+ * + * Returns the amount of work done + **/ +int i40e_napi_poll(struct napi_struct *napi, int budget) +{ + struct i40e_q_vector *q_vector = + container_of(napi, struct i40e_q_vector, napi); + struct i40e_vsi *vsi = q_vector->vsi; + struct i40e_ring *ring; + bool clean_complete = true; + bool arm_wb = false; + int budget_per_ring; + int work_done = 0; + + if (test_bit(__I40E_VSI_DOWN, vsi->state)) { + napi_complete(napi); + return 0; + } + + /* Since the actual Tx work is minimal, we can give the Tx a larger + * budget and be more aggressive about cleaning up the Tx descriptors. + */ + i40e_for_each_ring(ring, q_vector->tx) { + if (!i40e_clean_tx_irq(vsi, ring, budget)) { + clean_complete = false; + continue; + } + arm_wb |= ring->arm_wb; + ring->arm_wb = false; + } + + /* Handle case where we are called by netpoll with a budget of 0 */ + if (budget <= 0) + goto tx_only; + + /* We attempt to distribute budget to each Rx queue fairly, but don't + * allow the budget to go below 1 because that would exit polling early. + */ + budget_per_ring = max(budget/q_vector->num_ringpairs, 1); + + i40e_for_each_ring(ring, q_vector->rx) { + int cleaned = i40e_clean_rx_irq(ring, budget_per_ring); + + work_done += cleaned; + /* if we clean as many as budgeted, we must not be done */ + if (cleaned >= budget_per_ring) + clean_complete = false; + } + + /* If work not completed, return budget and polling will return */ + if (!clean_complete) { + int cpu_id = smp_processor_id(); + + /* It is possible that the interrupt affinity has changed but, + * if the cpu is pegged at 100%, polling will never exit while + * traffic continues and the interrupt will be stuck on this + * cpu. We check to make sure affinity is correct before we + * continue to poll, otherwise we must stop polling so the + * interrupt can move to the correct cpu. 
+ */ + if (!cpumask_test_cpu(cpu_id, &q_vector->affinity_mask)) { + /* Tell napi that we are done polling */ + napi_complete_done(napi, work_done); + + /* Force an interrupt */ + i40e_force_wb(vsi, q_vector); + + /* Return budget-1 so that polling stops */ + return budget - 1; + } +tx_only: + if (arm_wb) { + q_vector->tx.ring[0].tx_stats.tx_force_wb++; + i40e_enable_wb_on_itr(vsi, q_vector); + } + return budget; + } + + if (vsi->back->flags & I40E_TXR_FLAGS_WB_ON_ITR) + q_vector->arm_wb_state = false; + + /* Work is done so exit the polling mode and re-enable the interrupt */ + napi_complete_done(napi, work_done); + + i40e_update_enable_itr(vsi, q_vector); + + return min(work_done, budget - 1); +} + +/** + * i40e_atr - Add a Flow Director ATR filter + * @tx_ring: ring to add programming descriptor to + * @skb: send buffer + * @tx_flags: send tx flags + **/ +static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb, + u32 tx_flags) +{ + struct i40e_filter_program_desc *fdir_desc; + struct i40e_pf *pf = tx_ring->vsi->back; + union { + unsigned char *network; + struct iphdr *ipv4; + struct ipv6hdr *ipv6; + } hdr; + struct tcphdr *th; + unsigned int hlen; + u32 flex_ptype, dtype_cmd; + int l4_proto; + u16 i; + + /* make sure ATR is enabled */ + if (!(pf->flags & I40E_FLAG_FD_ATR_ENABLED)) + return; + + if (test_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state)) + return; + + /* if sampling is disabled do nothing */ + if (!tx_ring->atr_sample_rate) + return; + + /* Currently only IPv4/IPv6 with TCP is supported */ + if (!(tx_flags & (I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6))) + return; + + /* snag network header to get L4 type and address */ + hdr.network = (tx_flags & I40E_TX_FLAGS_UDP_TUNNEL) ? + skb_inner_network_header(skb) : skb_network_header(skb); + + /* Note: tx_flags gets modified to reflect inner protocols in + * tx_enable_csum function if encap is enabled. + */ + if (tx_flags & I40E_TX_FLAGS_IPV4) { + /* access ihl as u8 to avoid unaligned access on ia64 */ + hlen = (hdr.network[0] & 0x0F) << 2; + l4_proto = hdr.ipv4->protocol; + } else { + /* find the start of the innermost ipv6 header */ + unsigned int inner_hlen = hdr.network - skb->data; + unsigned int h_offset = inner_hlen; + + /* this function updates h_offset to the end of the header */ + l4_proto = + ipv6_find_hdr(skb, &h_offset, IPPROTO_TCP, NULL, NULL); + /* hlen will contain our best estimate of the tcp header */ + hlen = h_offset - inner_hlen; + } + + if (l4_proto != IPPROTO_TCP) + return; + + th = (struct tcphdr *)(hdr.network + hlen); + + /* Due to lack of space, no more new filters can be programmed */ + if (th->syn && test_bit(__I40E_FD_ATR_AUTO_DISABLED, pf->state)) + return; + if (pf->flags & I40E_FLAG_HW_ATR_EVICT_ENABLED) { + /* HW ATR eviction will take care of removing filters on FIN + * and RST packets. + */ + if (th->fin || th->rst) + return; + } + + tx_ring->atr_count++; + + /* sample on all syn/fin/rst packets or once every atr sample rate */ + if (!th->fin && + !th->syn && + !th->rst && + (tx_ring->atr_count < tx_ring->atr_sample_rate)) + return; + + tx_ring->atr_count = 0; + + /* grab the next descriptor */ + i = tx_ring->next_to_use; + fdir_desc = I40E_TX_FDIRDESC(tx_ring, i); + + i++; + tx_ring->next_to_use = (i < tx_ring->count) ? i : 0; + + flex_ptype = (tx_ring->queue_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT) & + I40E_TXD_FLTR_QW0_QINDEX_MASK; + flex_ptype |= (tx_flags & I40E_TX_FLAGS_IPV4) ? 
+ (I40E_FILTER_PCTYPE_NONF_IPV4_TCP << + I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) : + (I40E_FILTER_PCTYPE_NONF_IPV6_TCP << + I40E_TXD_FLTR_QW0_PCTYPE_SHIFT); + + flex_ptype |= tx_ring->vsi->id << I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT; + + dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG; + + dtype_cmd |= (th->fin || th->rst) ? + (I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE << + I40E_TXD_FLTR_QW1_PCMD_SHIFT) : + (I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE << + I40E_TXD_FLTR_QW1_PCMD_SHIFT); + + dtype_cmd |= I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX << + I40E_TXD_FLTR_QW1_DEST_SHIFT; + + dtype_cmd |= I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID << + I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT; + + dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK; + if (!(tx_flags & I40E_TX_FLAGS_UDP_TUNNEL)) + dtype_cmd |= + ((u32)I40E_FD_ATR_STAT_IDX(pf->hw.pf_id) << + I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) & + I40E_TXD_FLTR_QW1_CNTINDEX_MASK; + else + dtype_cmd |= + ((u32)I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id) << + I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) & + I40E_TXD_FLTR_QW1_CNTINDEX_MASK; + + if (pf->flags & I40E_FLAG_HW_ATR_EVICT_ENABLED) + dtype_cmd |= I40E_TXD_FLTR_QW1_ATR_MASK; + + fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype); + fdir_desc->rsvd = cpu_to_le32(0); + fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd); + fdir_desc->fd_id = cpu_to_le32(0); +} + +/** + * i40e_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW + * @skb: send buffer + * @tx_ring: ring to send buffer on + * @flags: the tx flags to be set + * + * Checks the skb and set up correspondingly several generic transmit flags + * related to VLAN tagging for the HW, such as VLAN, DCB, etc. + * + * Returns error code indicate the frame should be dropped upon error and the + * otherwise returns 0 to indicate the flags has been set properly. + **/ +static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb, + struct i40e_ring *tx_ring, + u32 *flags) +{ + __be16 protocol = skb->protocol; + u32 tx_flags = 0; + + if (protocol == htons(ETH_P_8021Q) && + !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) { + /* When HW VLAN acceleration is turned off by the user the + * stack sets the protocol to 8021q so that the driver + * can take any steps required to support the SW only + * VLAN handling. In our case the driver doesn't need + * to take any further steps so just set the protocol + * to the encapsulated ethertype. 
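 *
 * Editor's note (added): the VLAN information gathered below is packed into
 * the upper half of tx_flags, with the flag bits kept in the low word:
 *
 *	tx_flags |= ((u32)tci << I40E_TX_FLAGS_VLAN_SHIFT) | I40E_TX_FLAGS_HW_VLAN;
 *
 * As a worked example, VLAN ID 100 with priority (PCP) 5 has a TCI of
 * (5 << 13) | 100 = 0xa064, so a hardware-offloaded tag ends up as
 * tx_flags = 0xa0640002.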
+ */ + skb->protocol = vlan_get_protocol(skb); + goto out; + } + + /* if we have a HW VLAN tag being added, default to the HW one */ + if (skb_vlan_tag_present(skb)) { + tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT; + tx_flags |= I40E_TX_FLAGS_HW_VLAN; + /* else if it is a SW VLAN, check the next protocol and store the tag */ + } else if (protocol == htons(ETH_P_8021Q)) { + struct vlan_hdr *vhdr, _vhdr; + + vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(_vhdr), &_vhdr); + if (!vhdr) + return -EINVAL; + + protocol = vhdr->h_vlan_encapsulated_proto; + tx_flags |= ntohs(vhdr->h_vlan_TCI) << I40E_TX_FLAGS_VLAN_SHIFT; + tx_flags |= I40E_TX_FLAGS_SW_VLAN; + } + + if (!(tx_ring->vsi->back->flags & I40E_FLAG_DCB_ENABLED)) + goto out; + + /* Insert 802.1p priority into VLAN header */ + if ((tx_flags & (I40E_TX_FLAGS_HW_VLAN | I40E_TX_FLAGS_SW_VLAN)) || + (skb->priority != TC_PRIO_CONTROL)) { + tx_flags &= ~I40E_TX_FLAGS_VLAN_PRIO_MASK; + tx_flags |= (skb->priority & 0x7) << + I40E_TX_FLAGS_VLAN_PRIO_SHIFT; + if (tx_flags & I40E_TX_FLAGS_SW_VLAN) { + struct vlan_ethhdr *vhdr; + int rc; + + rc = skb_cow_head(skb, 0); + if (rc < 0) + return rc; + vhdr = (struct vlan_ethhdr *)skb->data; + vhdr->h_vlan_TCI = htons(tx_flags >> + I40E_TX_FLAGS_VLAN_SHIFT); + } else { + tx_flags |= I40E_TX_FLAGS_HW_VLAN; + } + } + +out: + *flags = tx_flags; + return 0; +} + +/** + * i40e_tso - set up the tso context descriptor + * @first: pointer to first Tx buffer for xmit + * @hdr_len: ptr to the size of the packet header + * @cd_type_cmd_tso_mss: Quad Word 1 + * + * Returns 0 if no TSO can happen, 1 if tso is going, or error + **/ +static int i40e_tso(struct i40e_tx_buffer *first, u8 *hdr_len, + u64 *cd_type_cmd_tso_mss) +{ + struct sk_buff *skb = first->skb; + u64 cd_cmd, cd_tso_len, cd_mss; + union { + struct iphdr *v4; + struct ipv6hdr *v6; + unsigned char *hdr; + } ip; + union { + struct tcphdr *tcp; + struct udphdr *udp; + unsigned char *hdr; + } l4; + u32 paylen, l4_offset; + u16 gso_segs, gso_size; + int err; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + return 0; + + if (!skb_is_gso(skb)) + return 0; + + err = skb_cow_head(skb, 0); + if (err < 0) + return err; + + ip.hdr = skb_network_header(skb); + l4.hdr = skb_transport_header(skb); + + /* initialize outer IP header fields */ + if (ip.v4->version == 4) { + ip.v4->tot_len = 0; + ip.v4->check = 0; + } else { + ip.v6->payload_len = 0; + } + + if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPXIP4 | + SKB_GSO_IPXIP6 | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM)) { + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) { + l4.udp->len = 0; + + /* determine offset of outer transport header */ + l4_offset = l4.hdr - skb->data; + + /* remove payload length from outer checksum */ + paylen = skb->len - l4_offset; + csum_replace_by_diff(&l4.udp->check, + (__force __wsum)htonl(paylen)); + } + + /* reset pointers to inner headers */ + ip.hdr = skb_inner_network_header(skb); + l4.hdr = skb_inner_transport_header(skb); + + /* initialize inner IP header fields */ + if (ip.v4->version == 4) { + ip.v4->tot_len = 0; + ip.v4->check = 0; + } else { + ip.v6->payload_len = 0; + } + } + + /* determine offset of inner transport header */ + l4_offset = l4.hdr - skb->data; + + /* remove payload length from inner checksum */ + paylen = skb->len - l4_offset; + csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen)); + + /* compute length of segmentation header */ + 
*hdr_len = (l4.tcp->doff * 4) + l4_offset; + + /* pull values out of skb_shinfo */ + gso_size = skb_shinfo(skb)->gso_size; + gso_segs = skb_shinfo(skb)->gso_segs; + + /* update GSO size and bytecount with header size */ + first->gso_segs = gso_segs; + first->bytecount += (first->gso_segs - 1) * *hdr_len; + + /* find the field values */ + cd_cmd = I40E_TX_CTX_DESC_TSO; + cd_tso_len = skb->len - *hdr_len; + cd_mss = gso_size; + *cd_type_cmd_tso_mss |= (cd_cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) | + (cd_tso_len << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | + (cd_mss << I40E_TXD_CTX_QW1_MSS_SHIFT); + return 1; +} + +/** + * i40e_tsyn - set up the tsyn context descriptor + * @tx_ring: ptr to the ring to send + * @skb: ptr to the skb we're sending + * @tx_flags: the collected send information + * @cd_type_cmd_tso_mss: Quad Word 1 + * + * Returns 0 if no Tx timestamp can happen and 1 if the timestamp will happen + **/ +static int i40e_tsyn(struct i40e_ring *tx_ring, struct sk_buff *skb, + u32 tx_flags, u64 *cd_type_cmd_tso_mss) +{ + struct i40e_pf *pf; + + if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) + return 0; + + /* Tx timestamps cannot be sampled when doing TSO */ + if (tx_flags & I40E_TX_FLAGS_TSO) + return 0; + + /* only timestamp the outbound packet if the user has requested it and + * we are not already transmitting a packet to be timestamped + */ + pf = i40e_netdev_to_pf(tx_ring->netdev); + if (!(pf->flags & I40E_FLAG_PTP)) + return 0; + + if (pf->ptp_tx && + !test_and_set_bit_lock(__I40E_PTP_TX_IN_PROGRESS, pf->state)) { + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + pf->ptp_tx_start = jiffies; + pf->ptp_tx_skb = skb_get(skb); + } else { + pf->tx_hwtstamp_skipped++; + return 0; + } + + *cd_type_cmd_tso_mss |= (u64)I40E_TX_CTX_DESC_TSYN << + I40E_TXD_CTX_QW1_CMD_SHIFT; + + return 1; +} + +/** + * i40e_tx_enable_csum - Enable Tx checksum offloads + * @skb: send buffer + * @tx_flags: pointer to Tx flags currently set + * @td_cmd: Tx descriptor command bits to set + * @td_offset: Tx descriptor header offsets to set + * @tx_ring: Tx descriptor ring + * @cd_tunneling: ptr to context desc bits + **/ +static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags, + u32 *td_cmd, u32 *td_offset, + struct i40e_ring *tx_ring, + u32 *cd_tunneling) +{ + union { + struct iphdr *v4; + struct ipv6hdr *v6; + unsigned char *hdr; + } ip; + union { + struct tcphdr *tcp; + struct udphdr *udp; + unsigned char *hdr; + } l4; + unsigned char *exthdr; + u32 offset, cmd = 0; + __be16 frag_off; + u8 l4_proto = 0; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + return 0; + + ip.hdr = skb_network_header(skb); + l4.hdr = skb_transport_header(skb); + + /* compute outer L2 header size */ + offset = ((ip.hdr - skb->data) / 2) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; + + if (skb->encapsulation) { + u32 tunnel = 0; + /* define outer network header type */ + if (*tx_flags & I40E_TX_FLAGS_IPV4) { + tunnel |= (*tx_flags & I40E_TX_FLAGS_TSO) ? 
+ I40E_TX_CTX_EXT_IP_IPV4 : + I40E_TX_CTX_EXT_IP_IPV4_NO_CSUM; + + l4_proto = ip.v4->protocol; + } else if (*tx_flags & I40E_TX_FLAGS_IPV6) { + tunnel |= I40E_TX_CTX_EXT_IP_IPV6; + + exthdr = ip.hdr + sizeof(*ip.v6); + l4_proto = ip.v6->nexthdr; + if (l4.hdr != exthdr) + ipv6_skip_exthdr(skb, exthdr - skb->data, + &l4_proto, &frag_off); + } + + /* define outer transport */ + switch (l4_proto) { + case IPPROTO_UDP: + tunnel |= I40E_TXD_CTX_UDP_TUNNELING; + *tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL; + break; + case IPPROTO_GRE: + tunnel |= I40E_TXD_CTX_GRE_TUNNELING; + *tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL; + break; + case IPPROTO_IPIP: + case IPPROTO_IPV6: + *tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL; + l4.hdr = skb_inner_network_header(skb); + break; + default: + if (*tx_flags & I40E_TX_FLAGS_TSO) + return -1; + + skb_checksum_help(skb); + return 0; + } + + /* compute outer L3 header size */ + tunnel |= ((l4.hdr - ip.hdr) / 4) << + I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT; + + /* switch IP header pointer from outer to inner header */ + ip.hdr = skb_inner_network_header(skb); + + /* compute tunnel header size */ + tunnel |= ((ip.hdr - l4.hdr) / 2) << + I40E_TXD_CTX_QW0_NATLEN_SHIFT; + + /* indicate if we need to offload outer UDP header */ + if ((*tx_flags & I40E_TX_FLAGS_TSO) && + !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) + tunnel |= I40E_TXD_CTX_QW0_L4T_CS_MASK; + + /* record tunnel offload values */ + *cd_tunneling |= tunnel; + + /* switch L4 header pointer from outer to inner */ + l4.hdr = skb_inner_transport_header(skb); + l4_proto = 0; + + /* reset type as we transition from outer to inner headers */ + *tx_flags &= ~(I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6); + if (ip.v4->version == 4) + *tx_flags |= I40E_TX_FLAGS_IPV4; + if (ip.v6->version == 6) + *tx_flags |= I40E_TX_FLAGS_IPV6; + } + + /* Enable IP checksum offloads */ + if (*tx_flags & I40E_TX_FLAGS_IPV4) { + l4_proto = ip.v4->protocol; + /* the stack computes the IP header already, the only time we + * need the hardware to recompute it is in the case of TSO. + */ + cmd |= (*tx_flags & I40E_TX_FLAGS_TSO) ? 
+ I40E_TX_DESC_CMD_IIPT_IPV4_CSUM : + I40E_TX_DESC_CMD_IIPT_IPV4; + } else if (*tx_flags & I40E_TX_FLAGS_IPV6) { + cmd |= I40E_TX_DESC_CMD_IIPT_IPV6; + + exthdr = ip.hdr + sizeof(*ip.v6); + l4_proto = ip.v6->nexthdr; + if (l4.hdr != exthdr) + ipv6_skip_exthdr(skb, exthdr - skb->data, + &l4_proto, &frag_off); + } + + /* compute inner L3 header size */ + offset |= ((l4.hdr - ip.hdr) / 4) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; + + /* Enable L4 checksum offloads */ + switch (l4_proto) { + case IPPROTO_TCP: + /* enable checksum offloads */ + cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP; + offset |= l4.tcp->doff << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; + break; + case IPPROTO_SCTP: + /* enable SCTP checksum offload */ + cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP; + offset |= (sizeof(struct sctphdr) >> 2) << + I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; + break; + case IPPROTO_UDP: + /* enable UDP checksum offload */ + cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP; + offset |= (sizeof(struct udphdr) >> 2) << + I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; + break; + default: + if (*tx_flags & I40E_TX_FLAGS_TSO) + return -1; + skb_checksum_help(skb); + return 0; + } + + *td_cmd |= cmd; + *td_offset |= offset; + + return 1; +} + +/** + * i40e_create_tx_ctx Build the Tx context descriptor + * @tx_ring: ring to create the descriptor on + * @cd_type_cmd_tso_mss: Quad Word 1 + * @cd_tunneling: Quad Word 0 - bits 0-31 + * @cd_l2tag2: Quad Word 0 - bits 32-63 + **/ +static void i40e_create_tx_ctx(struct i40e_ring *tx_ring, + const u64 cd_type_cmd_tso_mss, + const u32 cd_tunneling, const u32 cd_l2tag2) +{ + struct i40e_tx_context_desc *context_desc; + int i = tx_ring->next_to_use; + + if ((cd_type_cmd_tso_mss == I40E_TX_DESC_DTYPE_CONTEXT) && + !cd_tunneling && !cd_l2tag2) + return; + + /* grab the next descriptor */ + context_desc = I40E_TX_CTXTDESC(tx_ring, i); + + i++; + tx_ring->next_to_use = (i < tx_ring->count) ? i : 0; + + /* cpu_to_le32 and assign to struct fields */ + context_desc->tunneling_params = cpu_to_le32(cd_tunneling); + context_desc->l2tag2 = cpu_to_le16(cd_l2tag2); + context_desc->rsvd = cpu_to_le16(0); + context_desc->type_cmd_tso_mss = cpu_to_le64(cd_type_cmd_tso_mss); +} + +/** + * __i40e_maybe_stop_tx - 2nd level check for tx stop conditions + * @tx_ring: the ring to be checked + * @size: the size buffer we want to assure is available + * + * Returns -EBUSY if a stop is needed, else 0 + **/ +int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size) +{ + netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index); + /* Memory barrier before checking head and tail */ + smp_mb(); + + /* Check again in a case another CPU has just made room available. */ + if (likely(I40E_DESC_UNUSED(tx_ring) < size)) + return -EBUSY; + + /* A reprieve! - use start_queue because it doesn't call schedule */ + netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index); + ++tx_ring->tx_stats.restart_queue; + return 0; +} + +/** + * __i40e_chk_linearize - Check if there are more than 8 buffers per packet + * @skb: send buffer + * + * Note: Our HW can't DMA more than 8 buffers to build a packet on the wire + * and so we need to figure out the cases where we need to linearize the skb. + * + * For TSO we need to count the TSO header and segment payload separately. + * As such we need to check cases where we have 7 fragments or more as we + * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for + * the segment payload in the first descriptor, and another 7 for the + * fragments. 
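 *
 * Editor's worked example (added): with gso_size = 7000 and an skb built
 * from 1000-byte page fragments, any six consecutive fragments hold only
 * 6000 bytes -- less than one segment -- so the walk below detects the
 * shortfall and the skb is linearized; with 4096-byte fragments the same
 * six-fragment window holds 24576 bytes and the check passes.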
+ **/ +bool __i40e_chk_linearize(struct sk_buff *skb) +{ + const struct skb_frag_struct *frag, *stale; + int nr_frags, sum; + + /* no need to check if number of frags is less than 7 */ + nr_frags = skb_shinfo(skb)->nr_frags; + if (nr_frags < (I40E_MAX_BUFFER_TXD - 1)) + return false; + + /* We need to walk through the list and validate that each group + * of 6 fragments totals at least gso_size. + */ + nr_frags -= I40E_MAX_BUFFER_TXD - 2; + frag = &skb_shinfo(skb)->frags[0]; + + /* Initialize size to the negative value of gso_size minus 1. We + * use this as the worst case scenerio in which the frag ahead + * of us only provides one byte which is why we are limited to 6 + * descriptors for a single transmit as the header and previous + * fragment are already consuming 2 descriptors. + */ + sum = 1 - skb_shinfo(skb)->gso_size; + + /* Add size of frags 0 through 4 to create our initial sum */ + sum += skb_frag_size(frag++); + sum += skb_frag_size(frag++); + sum += skb_frag_size(frag++); + sum += skb_frag_size(frag++); + sum += skb_frag_size(frag++); + + /* Walk through fragments adding latest fragment, testing it, and + * then removing stale fragments from the sum. + */ + for (stale = &skb_shinfo(skb)->frags[0];; stale++) { + int stale_size = skb_frag_size(stale); + + sum += skb_frag_size(frag++); + + /* The stale fragment may present us with a smaller + * descriptor than the actual fragment size. To account + * for that we need to remove all the data on the front and + * figure out what the remainder would be in the last + * descriptor associated with the fragment. + */ + if (stale_size > I40E_MAX_DATA_PER_TXD) { + int align_pad = -(stale->page_offset) & + (I40E_MAX_READ_REQ_SIZE - 1); + + sum -= align_pad; + stale_size -= align_pad; + + do { + sum -= I40E_MAX_DATA_PER_TXD_ALIGNED; + stale_size -= I40E_MAX_DATA_PER_TXD_ALIGNED; + } while (stale_size > I40E_MAX_DATA_PER_TXD); + } + + /* if sum is negative we failed to make sufficient progress */ + if (sum < 0) + return true; + + if (!nr_frags--) + break; + + sum -= stale_size; + } + + return false; +} + +/** + * i40e_tx_map - Build the Tx descriptor + * @tx_ring: ring to send buffer on + * @skb: send buffer + * @first: first buffer info buffer to use + * @tx_flags: collected send information + * @hdr_len: size of the packet header + * @td_cmd: the command field in the descriptor + * @td_offset: offset for checksum or crc + * + * Returns 0 on success, -1 on failure to DMA + **/ +static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb, + struct i40e_tx_buffer *first, u32 tx_flags, + const u8 hdr_len, u32 td_cmd, u32 td_offset) +{ + unsigned int data_len = skb->data_len; + unsigned int size = skb_headlen(skb); + struct skb_frag_struct *frag; + struct i40e_tx_buffer *tx_bi; + struct i40e_tx_desc *tx_desc; + u16 i = tx_ring->next_to_use; + u32 td_tag = 0; + dma_addr_t dma; + u16 desc_count = 1; + + if (tx_flags & I40E_TX_FLAGS_HW_VLAN) { + td_cmd |= I40E_TX_DESC_CMD_IL2TAG1; + td_tag = (tx_flags & I40E_TX_FLAGS_VLAN_MASK) >> + I40E_TX_FLAGS_VLAN_SHIFT; + } + + first->tx_flags = tx_flags; + + dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE); + + tx_desc = I40E_TX_DESC(tx_ring, i); + tx_bi = first; + + for (frag = &skb_shinfo(skb)->frags[0];; frag++) { + unsigned int max_data = I40E_MAX_DATA_PER_TXD_ALIGNED; + + if (dma_mapping_error(tx_ring->dev, dma)) + goto dma_error; + + /* record length, and DMA address */ + dma_unmap_len_set(tx_bi, len, size); + dma_unmap_addr_set(tx_bi, dma, dma); + + /* align size to 
end of page */ + max_data += -dma & (I40E_MAX_READ_REQ_SIZE - 1); + tx_desc->buffer_addr = cpu_to_le64(dma); + + while (unlikely(size > I40E_MAX_DATA_PER_TXD)) { + tx_desc->cmd_type_offset_bsz = + build_ctob(td_cmd, td_offset, + max_data, td_tag); + + tx_desc++; + i++; + desc_count++; + + if (i == tx_ring->count) { + tx_desc = I40E_TX_DESC(tx_ring, 0); + i = 0; + } + + dma += max_data; + size -= max_data; + + max_data = I40E_MAX_DATA_PER_TXD_ALIGNED; + tx_desc->buffer_addr = cpu_to_le64(dma); + } + + if (likely(!data_len)) + break; + + tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset, + size, td_tag); + + tx_desc++; + i++; + desc_count++; + + if (i == tx_ring->count) { + tx_desc = I40E_TX_DESC(tx_ring, 0); + i = 0; + } + + size = skb_frag_size(frag); + data_len -= size; + + dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size, + DMA_TO_DEVICE); + + tx_bi = &tx_ring->tx_bi[i]; + } + + netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount); + + i++; + if (i == tx_ring->count) + i = 0; + + tx_ring->next_to_use = i; + + i40e_maybe_stop_tx(tx_ring, DESC_NEEDED); + + /* write last descriptor with EOP bit */ + td_cmd |= I40E_TX_DESC_CMD_EOP; + + /* We OR these values together to check both against 4 (WB_STRIDE) + * below. This is safe since we don't re-use desc_count afterwards. + */ + desc_count |= ++tx_ring->packet_stride; + + if (desc_count >= WB_STRIDE) { + /* write last descriptor with RS bit set */ + td_cmd |= I40E_TX_DESC_CMD_RS; + tx_ring->packet_stride = 0; + } + + tx_desc->cmd_type_offset_bsz = + build_ctob(td_cmd, td_offset, size, td_tag); + + /* Force memory writes to complete before letting h/w know there + * are new descriptors to fetch. + * + * We also use this memory barrier to make certain all of the + * status bits have been updated before next_to_watch is written. 
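 *
 * Editor's note (added): without this barrier the tail write further down
 * could become visible to the device before the descriptor contents
 * themselves, allowing hardware to fetch stale or partially written
 * descriptors.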
+ */ + wmb(); + + /* set next_to_watch value indicating a packet is present */ + first->next_to_watch = tx_desc; + + /* notify HW of packet */ + if (netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more) { + writel(i, tx_ring->tail); + + /* we need this if more than one processor can write to our tail + * at a time, it synchronizes IO on IA64/Altix systems + */ + mmiowb(); + } + + return 0; + +dma_error: + dev_info(tx_ring->dev, "TX DMA map failed\n"); + + /* clear dma mappings for failed tx_bi map */ + for (;;) { + tx_bi = &tx_ring->tx_bi[i]; + i40e_unmap_and_free_tx_resource(tx_ring, tx_bi); + if (tx_bi == first) + break; + if (i == 0) + i = tx_ring->count; + i--; + } + + tx_ring->next_to_use = i; + + return -1; +} + +/** + * i40e_xmit_xdp_ring - transmits an XDP buffer to an XDP Tx ring + * @xdp: data to transmit + * @xdp_ring: XDP Tx ring + **/ +static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, + struct i40e_ring *xdp_ring) +{ + u32 size = xdp->data_end - xdp->data; + u16 i = xdp_ring->next_to_use; + struct i40e_tx_buffer *tx_bi; + struct i40e_tx_desc *tx_desc; + dma_addr_t dma; + + if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) { + xdp_ring->tx_stats.tx_busy++; + return I40E_XDP_CONSUMED; + } + + dma = dma_map_single(xdp_ring->dev, xdp->data, size, DMA_TO_DEVICE); + if (dma_mapping_error(xdp_ring->dev, dma)) + return I40E_XDP_CONSUMED; + + tx_bi = &xdp_ring->tx_bi[i]; + tx_bi->bytecount = size; + tx_bi->gso_segs = 1; + tx_bi->raw_buf = xdp->data; + + /* record length, and DMA address */ + dma_unmap_len_set(tx_bi, len, size); + dma_unmap_addr_set(tx_bi, dma, dma); + + tx_desc = I40E_TX_DESC(xdp_ring, i); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC + | I40E_TXD_CMD, + 0, size, 0); + + /* Make certain all of the status bits have been updated + * before next_to_watch is written. 
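 *
 * Editor's note (added): unlike the regular transmit path above, this
 * routine does not write the tail register itself; the doorbell is
 * deferred to i40e_xdp_ring_update_tail() (see i40e_xdp_flush() further
 * down) so that several XDP_TX frames queued during one NAPI poll can
 * share a single tail write.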
+ */ + smp_wmb(); + + i++; + if (i == xdp_ring->count) + i = 0; + + tx_bi->next_to_watch = tx_desc; + xdp_ring->next_to_use = i; + + return I40E_XDP_TX; +} + +/** + * i40e_xmit_frame_ring - Sends buffer on Tx ring + * @skb: send buffer + * @tx_ring: ring to send buffer on + * + * Returns NETDEV_TX_OK if sent, else an error code + **/ +static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb, + struct i40e_ring *tx_ring) +{ + u64 cd_type_cmd_tso_mss = I40E_TX_DESC_DTYPE_CONTEXT; + u32 cd_tunneling = 0, cd_l2tag2 = 0; + struct i40e_tx_buffer *first; + u32 td_offset = 0; + u32 tx_flags = 0; + __be16 protocol; + u32 td_cmd = 0; + u8 hdr_len = 0; + int tso, count; + int tsyn; + + /* prefetch the data, we'll need it later */ + prefetch(skb->data); + + i40e_trace(xmit_frame_ring, skb, tx_ring); + + count = i40e_xmit_descriptor_count(skb); + if (i40e_chk_linearize(skb, count)) { + if (__skb_linearize(skb)) { + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + count = i40e_txd_use_count(skb->len); + tx_ring->tx_stats.tx_linearize++; + } + + /* need: 1 descriptor per page * PAGE_SIZE/I40E_MAX_DATA_PER_TXD, + * + 1 desc for skb_head_len/I40E_MAX_DATA_PER_TXD, + * + 4 desc gap to avoid the cache line where head is, + * + 1 desc for context descriptor, + * otherwise try next time + */ + if (i40e_maybe_stop_tx(tx_ring, count + 4 + 1)) { + tx_ring->tx_stats.tx_busy++; + return NETDEV_TX_BUSY; + } + + /* record the location of the first descriptor for this packet */ + first = &tx_ring->tx_bi[tx_ring->next_to_use]; + first->skb = skb; + first->bytecount = skb->len; + first->gso_segs = 1; + + /* prepare the xmit flags */ + if (i40e_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags)) + goto out_drop; + + /* obtain protocol of skb */ + protocol = vlan_get_protocol(skb); + + /* setup IPv4/IPv6 offloads */ + if (protocol == htons(ETH_P_IP)) + tx_flags |= I40E_TX_FLAGS_IPV4; + else if (protocol == htons(ETH_P_IPV6)) + tx_flags |= I40E_TX_FLAGS_IPV6; + + tso = i40e_tso(first, &hdr_len, &cd_type_cmd_tso_mss); + + if (tso < 0) + goto out_drop; + else if (tso) + tx_flags |= I40E_TX_FLAGS_TSO; + + /* Always offload the checksum, since it's in the data descriptor */ + tso = i40e_tx_enable_csum(skb, &tx_flags, &td_cmd, &td_offset, + tx_ring, &cd_tunneling); + if (tso < 0) + goto out_drop; + + tsyn = i40e_tsyn(tx_ring, skb, tx_flags, &cd_type_cmd_tso_mss); + + if (tsyn) + tx_flags |= I40E_TX_FLAGS_TSYN; + + skb_tx_timestamp(skb); + + /* always enable CRC insertion offload */ + td_cmd |= I40E_TX_DESC_CMD_ICRC; + + i40e_create_tx_ctx(tx_ring, cd_type_cmd_tso_mss, + cd_tunneling, cd_l2tag2); + + /* Add Flow Director ATR if it's enabled. + * + * NOTE: this must always be directly before the data descriptor. 
+ */ + i40e_atr(tx_ring, skb, tx_flags); + + if (i40e_tx_map(tx_ring, skb, first, tx_flags, hdr_len, + td_cmd, td_offset)) + goto cleanup_tx_tstamp; + + return NETDEV_TX_OK; + +out_drop: + i40e_trace(xmit_frame_ring_drop, first->skb, tx_ring); + dev_kfree_skb_any(first->skb); + first->skb = NULL; +cleanup_tx_tstamp: + if (unlikely(tx_flags & I40E_TX_FLAGS_TSYN)) { + struct i40e_pf *pf = i40e_netdev_to_pf(tx_ring->netdev); + + dev_kfree_skb_any(pf->ptp_tx_skb); + pf->ptp_tx_skb = NULL; + clear_bit_unlock(__I40E_PTP_TX_IN_PROGRESS, pf->state); + } + + return NETDEV_TX_OK; +} + +/** + * i40e_lan_xmit_frame - Selects the correct VSI and Tx queue to send buffer + * @skb: send buffer + * @netdev: network interface device structure + * + * Returns NETDEV_TX_OK if sent, else an error code + **/ +netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_ring *tx_ring = vsi->tx_rings[skb->queue_mapping]; + + /* hardware can't handle really short frames, hardware padding works + * beyond this point + */ + if (skb_put_padto(skb, I40E_MIN_TX_LEN)) + return NETDEV_TX_OK; + + return i40e_xmit_frame_ring(skb, tx_ring); +} + +/** + * i40e_xdp_xmit - Implements ndo_xdp_xmit + * @dev: netdev + * @xdp: XDP buffer + * + * Returns Zero if sent, else an error code + **/ +int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + unsigned int queue_index = smp_processor_id(); + struct i40e_vsi *vsi = np->vsi; + int err; + + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return -ENETDOWN; + + if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) + return -ENXIO; + + err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]); + if (err != I40E_XDP_TX) + return -ENOSPC; + + return 0; +} + +/** + * i40e_xdp_flush - Implements ndo_xdp_flush + * @dev: netdev + **/ +void i40e_xdp_flush(struct net_device *dev) +{ + struct i40e_netdev_priv *np = netdev_priv(dev); + unsigned int queue_index = smp_processor_id(); + struct i40e_vsi *vsi = np->vsi; + + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return; + + if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) + return; + + i40e_xdp_ring_update_tail(vsi->xdp_rings[queue_index]); +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h new file mode 100644 index 0000000..3043483 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -0,0 +1,601 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_TXRX_H_ +#define _I40E_TXRX_H_ + +#include + +/* Interrupt Throttling and Rate Limiting Goodies */ +#define I40E_DEFAULT_IRQ_WORK 256 + +/* The datasheet for the X710 and XL710 indicate that the maximum value for + * the ITR is 8160usec which is then called out as 0xFF0 with a 2usec + * resolution. 8160 is 0x1FE0 when written out in hex. So instead of storing + * the register value which is divided by 2 lets use the actual values and + * avoid an excessive amount of translation. + */ +#define I40E_ITR_DYNAMIC 0x8000 /* use top bit as a flag */ +#define I40E_ITR_MASK 0x1FFE /* mask for ITR register value */ +#define I40E_MIN_ITR 2 /* reg uses 2 usec resolution */ +#define I40E_ITR_100K 10 /* all values below must be even */ +#define I40E_ITR_50K 20 +#define I40E_ITR_20K 50 +#define I40E_ITR_18K 60 +#define I40E_ITR_8K 122 +#define I40E_MAX_ITR 8160 /* maximum value as per datasheet */ +#define ITR_TO_REG(setting) ((setting) & ~I40E_ITR_DYNAMIC) +#define ITR_REG_ALIGN(setting) __ALIGN_MASK(setting, ~I40E_ITR_MASK) +#define ITR_IS_DYNAMIC(setting) (!!((setting) & I40E_ITR_DYNAMIC)) + +#define I40E_ITR_RX_DEF (I40E_ITR_20K | I40E_ITR_DYNAMIC) +#define I40E_ITR_TX_DEF (I40E_ITR_20K | I40E_ITR_DYNAMIC) + +/* 0x40 is the enable bit for interrupt rate limiting, and must be set if + * the value of the rate limit is non-zero + */ +#define INTRL_ENA BIT(6) +#define I40E_MAX_INTRL 0x3B /* reg uses 4 usec resolution */ +#define INTRL_REG_TO_USEC(intrl) ((intrl & ~INTRL_ENA) << 2) + +/** + * i40e_intrl_usec_to_reg - convert interrupt rate limit to register + * @intrl: interrupt rate limit to convert + * + * This function converts a decimal interrupt rate limit to the appropriate + * register format expected by the firmware when setting interrupt rate limit. + */ +static inline u16 i40e_intrl_usec_to_reg(int intrl) +{ + if (intrl >> 2) + return ((intrl >> 2) | INTRL_ENA); + else + return 0; +} +#define I40E_INTRL_8K 125 /* 8000 ints/sec */ +#define I40E_INTRL_62K 16 /* 62500 ints/sec */ +#define I40E_INTRL_83K 12 /* 83333 ints/sec */ + +#define I40E_QUEUE_END_OF_LIST 0x7FF + +/* this enum matches hardware bits and is meant to be used by DYN_CTLN + * registers and QINT registers or more generally anywhere in the manual + * mentioning ITR_INDX, ITR_NONE cannot be used as an index 'n' into any + * register but instead is a special value meaning "don't update" ITR0/1/2. 
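 *
 * Editor's worked example (added): with the definitions above,
 * I40E_ITR_RX_DEF = I40E_ITR_20K | I40E_ITR_DYNAMIC = 50 | 0x8000 = 0x8032,
 * so ITR_IS_DYNAMIC() is true and ITR_TO_REG() yields 50, i.e. a 50 usec
 * interval (roughly 20000 interrupts per second).  For the rate limiter,
 * i40e_intrl_usec_to_reg(I40E_INTRL_8K) = (125 >> 2) | INTRL_ENA = 0x5f,
 * and INTRL_REG_TO_USEC(0x5f) converts back to 124 usec because of the
 * 4 usec register resolution.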
+ */ +enum i40e_dyn_idx_t { + I40E_IDX_ITR0 = 0, + I40E_IDX_ITR1 = 1, + I40E_IDX_ITR2 = 2, + I40E_ITR_NONE = 3 /* ITR_NONE must not be used as an index */ +}; + +/* these are indexes into ITRN registers */ +#define I40E_RX_ITR I40E_IDX_ITR0 +#define I40E_TX_ITR I40E_IDX_ITR1 +#define I40E_PE_ITR I40E_IDX_ITR2 + +/* Supported RSS offloads */ +#define I40E_DEFAULT_RSS_HENA ( \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_UDP) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_SCTP) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_OTHER) | \ + BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV4) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_UDP) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_TCP) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_SCTP) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_OTHER) | \ + BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6) | \ + BIT_ULL(I40E_FILTER_PCTYPE_L2_PAYLOAD)) + +#define I40E_DEFAULT_RSS_HENA_EXPANDED (I40E_DEFAULT_RSS_HENA | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \ + BIT_ULL(I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP)) + +#define i40e_pf_get_default_rss_hena(pf) \ + (((pf)->hw_features & I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE) ? \ + I40E_DEFAULT_RSS_HENA_EXPANDED : I40E_DEFAULT_RSS_HENA) + +/* Supported Rx Buffer Sizes (a multiple of 128) */ +#define I40E_RXBUFFER_256 256 +#define I40E_RXBUFFER_1536 1536 /* 128B aligned standard Ethernet frame */ +#define I40E_RXBUFFER_2048 2048 +#define I40E_RXBUFFER_3072 3072 /* Used for large frames w/ padding */ +#define I40E_MAX_RXBUFFER 9728 /* largest size for single descriptor */ + +/* NOTE: netdev_alloc_skb reserves up to 64 bytes, NET_IP_ALIGN means we + * reserve 2 more, and skb_shared_info adds an additional 384 bytes more, + * this adds up to 512 bytes of extra data meaning the smallest allocation + * we could have is 1K. + * i.e. RXBUFFER_256 --> 960 byte skb (size-1024 slab) + * i.e. RXBUFFER_512 --> 1216 byte skb (size-2048 slab) + */ +#define I40E_RX_HDR_SIZE I40E_RXBUFFER_256 +#define I40E_PACKET_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2)) +#define i40e_rx_desc i40e_32byte_rx_desc + +#define I40E_RX_DMA_ATTR \ + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + +/* Attempt to maximize the headroom available for incoming frames. We + * use a 2K buffer for receives and need 1536/1534 to store the data for + * the frame. This leaves us with 512 bytes of room. From that we need + * to deduct the space needed for the shared info and the padding needed + * to IP align the frame. + * + * Note: For cache line sizes 256 or larger this value is going to end + * up negative. In these cases we should fall back to the legacy + * receive path. + */ +#if (PAGE_SIZE < 8192) +#define I40E_2K_TOO_SMALL_WITH_PADDING \ +((NET_SKB_PAD + I40E_RXBUFFER_1536) > SKB_WITH_OVERHEAD(I40E_RXBUFFER_2048)) + +static inline int i40e_compute_pad(int rx_buf_len) +{ + int page_size, pad_size; + + page_size = ALIGN(rx_buf_len, PAGE_SIZE / 2); + pad_size = SKB_WITH_OVERHEAD(page_size) - rx_buf_len; + + return pad_size; +} + +static inline int i40e_skb_pad(void) +{ + int rx_buf_len; + + /* If a 2K buffer cannot handle a standard Ethernet frame then + * optimize padding for a 3K buffer instead of a 1.5K buffer. 
+ * + * For a 3K buffer we need to add enough padding to allow for + * tailroom due to NET_IP_ALIGN possibly shifting us out of + * cache-line alignment. + */ + if (I40E_2K_TOO_SMALL_WITH_PADDING) + rx_buf_len = I40E_RXBUFFER_3072 + SKB_DATA_ALIGN(NET_IP_ALIGN); + else + rx_buf_len = I40E_RXBUFFER_1536; + + /* if needed make room for NET_IP_ALIGN */ + rx_buf_len -= NET_IP_ALIGN; + + return i40e_compute_pad(rx_buf_len); +} + +#define I40E_SKB_PAD i40e_skb_pad() +#else +#define I40E_2K_TOO_SMALL_WITH_PADDING false +#define I40E_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN) +#endif + +/** + * i40e_test_staterr - tests bits in Rx descriptor status and error fields + * @rx_desc: pointer to receive descriptor (in le64 format) + * @stat_err_bits: value to mask + * + * This function does some fast chicanery in order to return the + * value of the mask which is really only used for boolean tests. + * The status_error_len doesn't need to be shifted because it begins + * at offset zero. + */ +static inline bool i40e_test_staterr(union i40e_rx_desc *rx_desc, + const u64 stat_err_bits) +{ + return !!(rx_desc->wb.qword1.status_error_len & + cpu_to_le64(stat_err_bits)); +} + +/* How many Rx Buffers do we bundle into one write to the hardware ? */ +#define I40E_RX_BUFFER_WRITE 32 /* Must be power of 2 */ +#define I40E_RX_INCREMENT(r, i) \ + do { \ + (i)++; \ + if ((i) == (r)->count) \ + i = 0; \ + r->next_to_clean = i; \ + } while (0) + +#define I40E_RX_NEXT_DESC(r, i, n) \ + do { \ + (i)++; \ + if ((i) == (r)->count) \ + i = 0; \ + (n) = I40E_RX_DESC((r), (i)); \ + } while (0) + +#define I40E_RX_NEXT_DESC_PREFETCH(r, i, n) \ + do { \ + I40E_RX_NEXT_DESC((r), (i), (n)); \ + prefetch((n)); \ + } while (0) + +#define I40E_MAX_BUFFER_TXD 8 +#define I40E_MIN_TX_LEN 17 + +/* The size limit for a transmit buffer in a descriptor is (16K - 1). + * In order to align with the read requests we will align the value to + * the nearest 4K which represents our maximum read request size. + */ +#define I40E_MAX_READ_REQ_SIZE 4096 +#define I40E_MAX_DATA_PER_TXD (16 * 1024 - 1) +#define I40E_MAX_DATA_PER_TXD_ALIGNED \ + (I40E_MAX_DATA_PER_TXD & ~(I40E_MAX_READ_REQ_SIZE - 1)) + +/** + * i40e_txd_use_count - estimate the number of descriptors needed for Tx + * @size: transmit request size in bytes + * + * Due to hardware alignment restrictions (4K alignment), we need to + * assume that we can have no more than 12K of data per descriptor, even + * though each descriptor can take up to 16K - 1 bytes of aligned memory. + * Thus, we need to divide by 12K. But division is slow! Instead, + * we decompose the operation into shifts and one relatively cheap + * multiply operation. + * + * To divide by 12K, we first divide by 4K, then divide by 3: + * To divide by 4K, shift right by 12 bits + * To divide by 3, multiply by 85, then divide by 256 + * (Divide by 256 is done by shifting right by 8 bits) + * Finally, we add one to round up. Because 256 isn't an exact multiple of + * 3, we'll underestimate near each multiple of 12K. This is actually more + * accurate as we have 4K - 1 of wiggle room that we can fit into the last + * segment. For our purposes this is accurate out to 1M which is orders of + * magnitude greater than our largest possible GSO size. 
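 *
 * Editor's worked example (added): for size = 30000 bytes the steps above
 * give 30000 >> 12 = 7, then 7 * 85 = 595, then 595 >> 8 = 2, plus one
 * for rounding = 3 descriptors, which matches the exact
 * DIV_ROUND_UP(30000, 12288) = 3.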
+ * + * This would then be implemented as: + * return (((size >> 12) * 85) >> 8) + 1; + * + * Since multiplication and division are commutative, we can reorder + * operations into: + * return ((size * 85) >> 20) + 1; + */ +static inline unsigned int i40e_txd_use_count(unsigned int size) +{ + return ((size * 85) >> 20) + 1; +} + +/* Tx Descriptors needed, worst case */ +#define DESC_NEEDED (MAX_SKB_FRAGS + 6) +#define I40E_MIN_DESC_PENDING 4 + +#define I40E_TX_FLAGS_HW_VLAN BIT(1) +#define I40E_TX_FLAGS_SW_VLAN BIT(2) +#define I40E_TX_FLAGS_TSO BIT(3) +#define I40E_TX_FLAGS_IPV4 BIT(4) +#define I40E_TX_FLAGS_IPV6 BIT(5) +#define I40E_TX_FLAGS_FCCRC BIT(6) +#define I40E_TX_FLAGS_FSO BIT(7) +#define I40E_TX_FLAGS_TSYN BIT(8) +#define I40E_TX_FLAGS_FD_SB BIT(9) +#define I40E_TX_FLAGS_UDP_TUNNEL BIT(10) +#define I40E_TX_FLAGS_VLAN_MASK 0xffff0000 +#define I40E_TX_FLAGS_VLAN_PRIO_MASK 0xe0000000 +#define I40E_TX_FLAGS_VLAN_PRIO_SHIFT 29 +#define I40E_TX_FLAGS_VLAN_SHIFT 16 + +struct i40e_tx_buffer { + struct i40e_tx_desc *next_to_watch; + union { + struct sk_buff *skb; + void *raw_buf; + }; + unsigned int bytecount; + unsigned short gso_segs; + + DEFINE_DMA_UNMAP_ADDR(dma); + DEFINE_DMA_UNMAP_LEN(len); + u32 tx_flags; +}; + +struct i40e_rx_buffer { + dma_addr_t dma; + struct page *page; +#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) + __u32 page_offset; +#else + __u16 page_offset; +#endif + __u16 pagecnt_bias; +}; + +struct i40e_queue_stats { + u64 packets; + u64 bytes; +}; + +struct i40e_tx_queue_stats { + u64 restart_queue; + u64 tx_busy; + u64 tx_done_old; + u64 tx_linearize; + u64 tx_force_wb; + int prev_pkt_ctr; +}; + +struct i40e_rx_queue_stats { + u64 non_eop_descs; + u64 alloc_page_failed; + u64 alloc_buff_failed; + u64 page_reuse_count; + u64 realloc_count; +}; + +enum i40e_ring_state_t { + __I40E_TX_FDIR_INIT_DONE, + __I40E_TX_XPS_INIT_DONE, + __I40E_RING_STATE_NBITS /* must be last */ +}; + +/* some useful defines for virtchannel interface, which + * is the only remaining user of header split + */ +#define I40E_RX_DTYPE_NO_SPLIT 0 +#define I40E_RX_DTYPE_HEADER_SPLIT 1 +#define I40E_RX_DTYPE_SPLIT_ALWAYS 2 +#define I40E_RX_SPLIT_L2 0x1 +#define I40E_RX_SPLIT_IP 0x2 +#define I40E_RX_SPLIT_TCP_UDP 0x4 +#define I40E_RX_SPLIT_SCTP 0x8 + +/* struct that defines a descriptor ring, associated with a VSI */ +struct i40e_ring { + struct i40e_ring *next; /* pointer to next ring in q_vector */ + void *desc; /* Descriptor ring memory */ + struct device *dev; /* Used for DMA mapping */ + struct net_device *netdev; /* netdev ring maps to */ + struct bpf_prog *xdp_prog; + union { + struct i40e_tx_buffer *tx_bi; + struct i40e_rx_buffer *rx_bi; + }; + DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS); + u16 queue_index; /* Queue number of ring */ + u8 dcb_tc; /* Traffic class of ring */ + u8 __iomem *tail; + + /* high bit set means dynamic, use accessor routines to read/write. + * hardware only supports 2us resolution for the ITR registers. + * these values always store the USER setting, and must be converted + * before programming to a register. 
+ */ + u16 itr_setting; + + u16 count; /* Number of descriptors */ + u16 reg_idx; /* HW register index of the ring */ + u16 rx_buf_len; + + /* used in interrupt processing */ + u16 next_to_use; + u16 next_to_clean; + + u8 atr_sample_rate; + u8 atr_count; + + bool ring_active; /* is ring online or not */ + bool arm_wb; /* do something to arm write back */ + u8 packet_stride; + + u16 flags; +#define I40E_TXR_FLAGS_WB_ON_ITR BIT(0) +#define I40E_RXR_FLAGS_BUILD_SKB_ENABLED BIT(1) +#define I40E_TXR_FLAGS_XDP BIT(2) + + /* stats structs */ + struct i40e_queue_stats stats; + struct u64_stats_sync syncp; + union { + struct i40e_tx_queue_stats tx_stats; + struct i40e_rx_queue_stats rx_stats; + }; + + unsigned int size; /* length of descriptor ring in bytes */ + dma_addr_t dma; /* physical address of ring */ + + struct i40e_vsi *vsi; /* Backreference to associated VSI */ + struct i40e_q_vector *q_vector; /* Backreference to associated vector */ + + struct rcu_head rcu; /* to avoid race on free */ + u16 next_to_alloc; + struct sk_buff *skb; /* When i40e_clean_rx_ring_irq() must + * return before it sees the EOP for + * the current packet, we save that skb + * here and resume receiving this + * packet the next time + * i40e_clean_rx_ring_irq() is called + * for this ring. + */ + + struct i40e_channel *ch; + struct xdp_rxq_info xdp_rxq; +} ____cacheline_internodealigned_in_smp; + +static inline bool ring_uses_build_skb(struct i40e_ring *ring) +{ + return !!(ring->flags & I40E_RXR_FLAGS_BUILD_SKB_ENABLED); +} + +static inline void set_ring_build_skb_enabled(struct i40e_ring *ring) +{ + ring->flags |= I40E_RXR_FLAGS_BUILD_SKB_ENABLED; +} + +static inline void clear_ring_build_skb_enabled(struct i40e_ring *ring) +{ + ring->flags &= ~I40E_RXR_FLAGS_BUILD_SKB_ENABLED; +} + +static inline bool ring_is_xdp(struct i40e_ring *ring) +{ + return !!(ring->flags & I40E_TXR_FLAGS_XDP); +} + +static inline void set_ring_xdp(struct i40e_ring *ring) +{ + ring->flags |= I40E_TXR_FLAGS_XDP; +} + +#define I40E_ITR_ADAPTIVE_MIN_INC 0x0002 +#define I40E_ITR_ADAPTIVE_MIN_USECS 0x0002 +#define I40E_ITR_ADAPTIVE_MAX_USECS 0x007e +#define I40E_ITR_ADAPTIVE_LATENCY 0x8000 +#define I40E_ITR_ADAPTIVE_BULK 0x0000 +#define ITR_IS_BULK(x) (!((x) & I40E_ITR_ADAPTIVE_LATENCY)) + +struct i40e_ring_container { + struct i40e_ring *ring; /* pointer to linked list of ring(s) */ + unsigned long next_update; /* jiffies value of next update */ + unsigned int total_bytes; /* total bytes processed this int */ + unsigned int total_packets; /* total packets processed this int */ + u16 count; + u16 target_itr; /* target ITR setting for ring(s) */ + u16 current_itr; /* current ITR setting for ring(s) */ +}; + +/* iterator for handling rings in ring container */ +#define i40e_for_each_ring(pos, head) \ + for (pos = (head).ring; pos != NULL; pos = pos->next) + +static inline unsigned int i40e_rx_pg_order(struct i40e_ring *ring) +{ +#if (PAGE_SIZE < 8192) + if (ring->rx_buf_len > (PAGE_SIZE / 2)) + return 1; +#endif + return 0; +} + +#define i40e_rx_pg_size(_ring) (PAGE_SIZE << i40e_rx_pg_order(_ring)) + +bool i40e_alloc_rx_buffers(struct i40e_ring *rxr, u16 cleaned_count); +netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev); +void i40e_clean_tx_ring(struct i40e_ring *tx_ring); +void i40e_clean_rx_ring(struct i40e_ring *rx_ring); +int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring); +int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring); +void i40e_free_tx_resources(struct i40e_ring *tx_ring); +void 
i40e_free_rx_resources(struct i40e_ring *rx_ring); +int i40e_napi_poll(struct napi_struct *napi, int budget); +void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector); +u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw); +void i40e_detect_recover_hung(struct i40e_vsi *vsi); +int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); +bool __i40e_chk_linearize(struct sk_buff *skb); +int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp); +void i40e_xdp_flush(struct net_device *dev); + +/** + * i40e_get_head - Retrieve head from head writeback + * @tx_ring: tx ring to fetch head of + * + * Returns value of Tx ring head based on value stored + * in head write-back location + **/ +static inline u32 i40e_get_head(struct i40e_ring *tx_ring) +{ + void *head = (struct i40e_tx_desc *)tx_ring->desc + tx_ring->count; + + return le32_to_cpu(*(volatile __le32 *)head); +} + +/** + * i40e_xmit_descriptor_count - calculate number of Tx descriptors needed + * @skb: send buffer + * @tx_ring: ring to send buffer on + * + * Returns number of data descriptors needed for this skb. Returns 0 to indicate + * there is not enough descriptors available in this ring since we need at least + * one descriptor. + **/ +static inline int i40e_xmit_descriptor_count(struct sk_buff *skb) +{ + const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0]; + unsigned int nr_frags = skb_shinfo(skb)->nr_frags; + int count = 0, size = skb_headlen(skb); + + for (;;) { + count += i40e_txd_use_count(size); + + if (!nr_frags--) + break; + + size = skb_frag_size(frag++); + } + + return count; +} + +/** + * i40e_maybe_stop_tx - 1st level check for Tx stop conditions + * @tx_ring: the ring to be checked + * @size: the size buffer we want to assure is available + * + * Returns 0 if stop is not needed + **/ +static inline int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size) +{ + if (likely(I40E_DESC_UNUSED(tx_ring) >= size)) + return 0; + return __i40e_maybe_stop_tx(tx_ring, size); +} + +/** + * i40e_chk_linearize - Check if there are more than 8 fragments per packet + * @skb: send buffer + * @count: number of buffers used + * + * Note: Our HW can't scatter-gather more than 8 fragments to build + * a packet on the wire and so we need to figure out the cases where we + * need to linearize the skb. + **/ +static inline bool i40e_chk_linearize(struct sk_buff *skb, int count) +{ + /* Both TSO and single send will work if count is less than 8 */ + if (likely(count < I40E_MAX_BUFFER_TXD)) + return false; + + if (skb_is_gso(skb)) + return __i40e_chk_linearize(skb); + + /* we can support up to 8 data buffers for a single send */ + return count != I40E_MAX_BUFFER_TXD; +} + +/** + * txring_txq - Find the netdev Tx ring based on the i40e Tx ring + * @ring: Tx ring to find the netdev equivalent of + **/ +static inline struct netdev_queue *txring_txq(const struct i40e_ring *ring) +{ + return netdev_get_tx_queue(ring->netdev, ring->queue_index); +} +#endif /* _I40E_TXRX_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h new file mode 100644 index 0000000..bfb8009 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -0,0 +1,1595 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2015 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_TYPE_H_ +#define _I40E_TYPE_H_ + +#include "i40e_status.h" +#include "i40e_osdep.h" +#include "i40e_register.h" +#include "i40e_adminq.h" +#include "i40e_hmc.h" +#include "i40e_lan_hmc.h" +#include "i40e_devids.h" + +/* I40E_MASK is a macro used on 32 bit registers */ +#define I40E_MASK(mask, shift) ((u32)(mask) << (shift)) + +#define I40E_MAX_VSI_QP 16 +#define I40E_MAX_VF_VSI 4 +#define I40E_MAX_CHAINED_RX_BUFFERS 5 +#define I40E_MAX_PF_UDP_OFFLOAD_PORTS 16 + +/* Max default timeout in ms, */ +#define I40E_MAX_NVM_TIMEOUT 18000 + +/* Max timeout in ms for the phy to respond */ +#define I40E_MAX_PHY_TIMEOUT 500 + +/* Switch from ms to the 1usec global time (this is the GTIME resolution) */ +#define I40E_MS_TO_GTIME(time) ((time) * 1000) + +/* forward declaration */ +struct i40e_hw; +typedef void (*I40E_ADMINQ_CALLBACK)(struct i40e_hw *, struct i40e_aq_desc *); + +/* Data type manipulation macros. */ + +#define I40E_DESC_UNUSED(R) \ + ((((R)->next_to_clean > (R)->next_to_use) ? 
0 : (R)->count) + \ + (R)->next_to_clean - (R)->next_to_use - 1) + +/* bitfields for Tx queue mapping in QTX_CTL */ +#define I40E_QTX_CTL_VF_QUEUE 0x0 +#define I40E_QTX_CTL_VM_QUEUE 0x1 +#define I40E_QTX_CTL_PF_QUEUE 0x2 + +/* debug masks - set these bits in hw->debug_mask to control output */ +enum i40e_debug_mask { + I40E_DEBUG_INIT = 0x00000001, + I40E_DEBUG_RELEASE = 0x00000002, + + I40E_DEBUG_LINK = 0x00000010, + I40E_DEBUG_PHY = 0x00000020, + I40E_DEBUG_HMC = 0x00000040, + I40E_DEBUG_NVM = 0x00000080, + I40E_DEBUG_LAN = 0x00000100, + I40E_DEBUG_FLOW = 0x00000200, + I40E_DEBUG_DCB = 0x00000400, + I40E_DEBUG_DIAG = 0x00000800, + I40E_DEBUG_FD = 0x00001000, + I40E_DEBUG_PACKAGE = 0x00002000, + I40E_DEBUG_IWARP = 0x00F00000, + I40E_DEBUG_AQ_MESSAGE = 0x01000000, + I40E_DEBUG_AQ_DESCRIPTOR = 0x02000000, + I40E_DEBUG_AQ_DESC_BUFFER = 0x04000000, + I40E_DEBUG_AQ_COMMAND = 0x06000000, + I40E_DEBUG_AQ = 0x0F000000, + + I40E_DEBUG_USER = 0xF0000000, + + I40E_DEBUG_ALL = 0xFFFFFFFF +}; + +#define I40E_MDIO_CLAUSE22_STCODE_MASK I40E_MASK(1, \ + I40E_GLGEN_MSCA_STCODE_SHIFT) +#define I40E_MDIO_CLAUSE22_OPCODE_WRITE_MASK I40E_MASK(1, \ + I40E_GLGEN_MSCA_OPCODE_SHIFT) +#define I40E_MDIO_CLAUSE22_OPCODE_READ_MASK I40E_MASK(2, \ + I40E_GLGEN_MSCA_OPCODE_SHIFT) + +#define I40E_MDIO_CLAUSE45_STCODE_MASK I40E_MASK(0, \ + I40E_GLGEN_MSCA_STCODE_SHIFT) +#define I40E_MDIO_CLAUSE45_OPCODE_ADDRESS_MASK I40E_MASK(0, \ + I40E_GLGEN_MSCA_OPCODE_SHIFT) +#define I40E_MDIO_CLAUSE45_OPCODE_WRITE_MASK I40E_MASK(1, \ + I40E_GLGEN_MSCA_OPCODE_SHIFT) +#define I40E_MDIO_CLAUSE45_OPCODE_READ_INC_ADDR_MASK I40E_MASK(2, \ + I40E_GLGEN_MSCA_OPCODE_SHIFT) +#define I40E_MDIO_CLAUSE45_OPCODE_READ_MASK I40E_MASK(3, \ + I40E_GLGEN_MSCA_OPCODE_SHIFT) + +#define I40E_PHY_COM_REG_PAGE 0x1E +#define I40E_PHY_LED_LINK_MODE_MASK 0xF0 +#define I40E_PHY_LED_MANUAL_ON 0x100 +#define I40E_PHY_LED_PROV_REG_1 0xC430 +#define I40E_PHY_LED_MODE_MASK 0xFFFF +#define I40E_PHY_LED_MODE_ORIG 0x80000000 + +/* These are structs for managing the hardware information and the operations. + * The structures of function pointers are filled out at init time when we + * know for sure exactly which hardware we're working with. This gives us the + * flexibility of using the same main driver code but adapting to slightly + * different hardware needs as new parts are developed. For this architecture, + * the Firmware and AdminQ are intended to insulate the driver from most of the + * future changes, but these structures will also do part of the job. 
+ */ +enum i40e_mac_type { + I40E_MAC_UNKNOWN = 0, + I40E_MAC_XL710, + I40E_MAC_VF, + I40E_MAC_X722, + I40E_MAC_X722_VF, + I40E_MAC_GENERIC, +}; + +enum i40e_media_type { + I40E_MEDIA_TYPE_UNKNOWN = 0, + I40E_MEDIA_TYPE_FIBER, + I40E_MEDIA_TYPE_BASET, + I40E_MEDIA_TYPE_BACKPLANE, + I40E_MEDIA_TYPE_CX4, + I40E_MEDIA_TYPE_DA, + I40E_MEDIA_TYPE_VIRTUAL +}; + +enum i40e_fc_mode { + I40E_FC_NONE = 0, + I40E_FC_RX_PAUSE, + I40E_FC_TX_PAUSE, + I40E_FC_FULL, + I40E_FC_PFC, + I40E_FC_DEFAULT +}; + +enum i40e_set_fc_aq_failures { + I40E_SET_FC_AQ_FAIL_NONE = 0, + I40E_SET_FC_AQ_FAIL_GET = 1, + I40E_SET_FC_AQ_FAIL_SET = 2, + I40E_SET_FC_AQ_FAIL_UPDATE = 4, + I40E_SET_FC_AQ_FAIL_SET_UPDATE = 6 +}; + +enum i40e_vsi_type { + I40E_VSI_MAIN = 0, + I40E_VSI_VMDQ1 = 1, + I40E_VSI_VMDQ2 = 2, + I40E_VSI_CTRL = 3, + I40E_VSI_FCOE = 4, + I40E_VSI_MIRROR = 5, + I40E_VSI_SRIOV = 6, + I40E_VSI_FDIR = 7, + I40E_VSI_IWARP = 8, + I40E_VSI_TYPE_UNKNOWN +}; + +enum i40e_queue_type { + I40E_QUEUE_TYPE_RX = 0, + I40E_QUEUE_TYPE_TX, + I40E_QUEUE_TYPE_PE_CEQ, + I40E_QUEUE_TYPE_UNKNOWN +}; + +struct i40e_link_status { + enum i40e_aq_phy_type phy_type; + enum i40e_aq_link_speed link_speed; + u8 link_info; + u8 an_info; + u8 req_fec_info; + u8 fec_info; + u8 ext_info; + u8 loopback; + /* is Link Status Event notification to SW enabled */ + bool lse_enable; + u16 max_frame_size; + bool crc_enable; + u8 pacing; + u8 requested_speeds; + u8 module_type[3]; + /* 1st byte: module identifier */ +#define I40E_MODULE_TYPE_SFP 0x03 +#define I40E_MODULE_TYPE_QSFP 0x0D + /* 2nd byte: ethernet compliance codes for 10/40G */ +#define I40E_MODULE_TYPE_40G_ACTIVE 0x01 +#define I40E_MODULE_TYPE_40G_LR4 0x02 +#define I40E_MODULE_TYPE_40G_SR4 0x04 +#define I40E_MODULE_TYPE_40G_CR4 0x08 +#define I40E_MODULE_TYPE_10G_BASE_SR 0x10 +#define I40E_MODULE_TYPE_10G_BASE_LR 0x20 +#define I40E_MODULE_TYPE_10G_BASE_LRM 0x40 +#define I40E_MODULE_TYPE_10G_BASE_ER 0x80 + /* 3rd byte: ethernet compliance codes for 1G */ +#define I40E_MODULE_TYPE_1000BASE_SX 0x01 +#define I40E_MODULE_TYPE_1000BASE_LX 0x02 +#define I40E_MODULE_TYPE_1000BASE_CX 0x04 +#define I40E_MODULE_TYPE_1000BASE_T 0x08 +}; + +struct i40e_phy_info { + struct i40e_link_status link_info; + struct i40e_link_status link_info_old; + bool get_link_info; + enum i40e_media_type media_type; + /* all the phy types the NVM is capable of */ + u64 phy_types; +}; + +#define I40E_CAP_PHY_TYPE_SGMII BIT_ULL(I40E_PHY_TYPE_SGMII) +#define I40E_CAP_PHY_TYPE_1000BASE_KX BIT_ULL(I40E_PHY_TYPE_1000BASE_KX) +#define I40E_CAP_PHY_TYPE_10GBASE_KX4 BIT_ULL(I40E_PHY_TYPE_10GBASE_KX4) +#define I40E_CAP_PHY_TYPE_10GBASE_KR BIT_ULL(I40E_PHY_TYPE_10GBASE_KR) +#define I40E_CAP_PHY_TYPE_40GBASE_KR4 BIT_ULL(I40E_PHY_TYPE_40GBASE_KR4) +#define I40E_CAP_PHY_TYPE_XAUI BIT_ULL(I40E_PHY_TYPE_XAUI) +#define I40E_CAP_PHY_TYPE_XFI BIT_ULL(I40E_PHY_TYPE_XFI) +#define I40E_CAP_PHY_TYPE_SFI BIT_ULL(I40E_PHY_TYPE_SFI) +#define I40E_CAP_PHY_TYPE_XLAUI BIT_ULL(I40E_PHY_TYPE_XLAUI) +#define I40E_CAP_PHY_TYPE_XLPPI BIT_ULL(I40E_PHY_TYPE_XLPPI) +#define I40E_CAP_PHY_TYPE_40GBASE_CR4_CU BIT_ULL(I40E_PHY_TYPE_40GBASE_CR4_CU) +#define I40E_CAP_PHY_TYPE_10GBASE_CR1_CU BIT_ULL(I40E_PHY_TYPE_10GBASE_CR1_CU) +#define I40E_CAP_PHY_TYPE_10GBASE_AOC BIT_ULL(I40E_PHY_TYPE_10GBASE_AOC) +#define I40E_CAP_PHY_TYPE_40GBASE_AOC BIT_ULL(I40E_PHY_TYPE_40GBASE_AOC) +#define I40E_CAP_PHY_TYPE_100BASE_TX BIT_ULL(I40E_PHY_TYPE_100BASE_TX) +#define I40E_CAP_PHY_TYPE_1000BASE_T BIT_ULL(I40E_PHY_TYPE_1000BASE_T) +#define I40E_CAP_PHY_TYPE_10GBASE_T 
BIT_ULL(I40E_PHY_TYPE_10GBASE_T) +#define I40E_CAP_PHY_TYPE_10GBASE_SR BIT_ULL(I40E_PHY_TYPE_10GBASE_SR) +#define I40E_CAP_PHY_TYPE_10GBASE_LR BIT_ULL(I40E_PHY_TYPE_10GBASE_LR) +#define I40E_CAP_PHY_TYPE_10GBASE_SFPP_CU BIT_ULL(I40E_PHY_TYPE_10GBASE_SFPP_CU) +#define I40E_CAP_PHY_TYPE_10GBASE_CR1 BIT_ULL(I40E_PHY_TYPE_10GBASE_CR1) +#define I40E_CAP_PHY_TYPE_40GBASE_CR4 BIT_ULL(I40E_PHY_TYPE_40GBASE_CR4) +#define I40E_CAP_PHY_TYPE_40GBASE_SR4 BIT_ULL(I40E_PHY_TYPE_40GBASE_SR4) +#define I40E_CAP_PHY_TYPE_40GBASE_LR4 BIT_ULL(I40E_PHY_TYPE_40GBASE_LR4) +#define I40E_CAP_PHY_TYPE_1000BASE_SX BIT_ULL(I40E_PHY_TYPE_1000BASE_SX) +#define I40E_CAP_PHY_TYPE_1000BASE_LX BIT_ULL(I40E_PHY_TYPE_1000BASE_LX) +#define I40E_CAP_PHY_TYPE_1000BASE_T_OPTICAL \ + BIT_ULL(I40E_PHY_TYPE_1000BASE_T_OPTICAL) +#define I40E_CAP_PHY_TYPE_20GBASE_KR2 BIT_ULL(I40E_PHY_TYPE_20GBASE_KR2) +/* Defining the macro I40E_TYPE_OFFSET to implement a bit shift for some + * PHY types. There is an unused bit (31) in the I40E_CAP_PHY_TYPE_* bit + * fields but no corresponding gap in the i40e_aq_phy_type enumeration. So, + * a shift is needed to adjust for this with values larger than 31. The + * only affected values are I40E_PHY_TYPE_25GBASE_*. + */ +#define I40E_PHY_TYPE_OFFSET 1 +#define I40E_CAP_PHY_TYPE_25GBASE_KR BIT_ULL(I40E_PHY_TYPE_25GBASE_KR + \ + I40E_PHY_TYPE_OFFSET) +#define I40E_CAP_PHY_TYPE_25GBASE_CR BIT_ULL(I40E_PHY_TYPE_25GBASE_CR + \ + I40E_PHY_TYPE_OFFSET) +#define I40E_CAP_PHY_TYPE_25GBASE_SR BIT_ULL(I40E_PHY_TYPE_25GBASE_SR + \ + I40E_PHY_TYPE_OFFSET) +#define I40E_CAP_PHY_TYPE_25GBASE_LR BIT_ULL(I40E_PHY_TYPE_25GBASE_LR + \ + I40E_PHY_TYPE_OFFSET) +#define I40E_CAP_PHY_TYPE_25GBASE_AOC BIT_ULL(I40E_PHY_TYPE_25GBASE_AOC + \ + I40E_PHY_TYPE_OFFSET) +#define I40E_CAP_PHY_TYPE_25GBASE_ACC BIT_ULL(I40E_PHY_TYPE_25GBASE_ACC + \ + I40E_PHY_TYPE_OFFSET) +#define I40E_HW_CAP_MAX_GPIO 30 +/* Capabilities of a PF or a VF or the whole device */ +struct i40e_hw_capabilities { + u32 switch_mode; +#define I40E_NVM_IMAGE_TYPE_EVB 0x0 +#define I40E_NVM_IMAGE_TYPE_CLOUD 0x2 +#define I40E_NVM_IMAGE_TYPE_UDP_CLOUD 0x3 + + /* Cloud filter modes: + * Mode1: Filter on L4 port only + * Mode2: Filter for non-tunneled traffic + * Mode3: Filter for tunnel traffic + */ +#define I40E_CLOUD_FILTER_MODE1 0x6 +#define I40E_CLOUD_FILTER_MODE2 0x7 +#define I40E_CLOUD_FILTER_MODE3 0x8 +#define I40E_SWITCH_MODE_MASK 0xF + + u32 management_mode; + u32 mng_protocols_over_mctp; +#define I40E_MNG_PROTOCOL_PLDM 0x2 +#define I40E_MNG_PROTOCOL_OEM_COMMANDS 0x4 +#define I40E_MNG_PROTOCOL_NCSI 0x8 + u32 npar_enable; + u32 os2bmc; + u32 valid_functions; + bool sr_iov_1_1; + bool vmdq; + bool evb_802_1_qbg; /* Edge Virtual Bridging */ + bool evb_802_1_qbh; /* Bridge Port Extension */ + bool dcb; + bool fcoe; + bool iscsi; /* Indicates iSCSI enabled */ + bool flex10_enable; + bool flex10_capable; + u32 flex10_mode; +#define I40E_FLEX10_MODE_UNKNOWN 0x0 +#define I40E_FLEX10_MODE_DCC 0x1 +#define I40E_FLEX10_MODE_DCI 0x2 + + u32 flex10_status; +#define I40E_FLEX10_STATUS_DCC_ERROR 0x1 +#define I40E_FLEX10_STATUS_VC_MODE 0x2 + + bool sec_rev_disabled; + bool update_disabled; +#define I40E_NVM_MGMT_SEC_REV_DISABLED 0x1 +#define I40E_NVM_MGMT_UPDATE_DISABLED 0x2 + + bool mgmt_cem; + bool ieee_1588; + bool iwarp; + bool fd; + u32 fd_filters_guaranteed; + u32 fd_filters_best_effort; + bool rss; + u32 rss_table_size; + u32 rss_table_entry_width; + bool led[I40E_HW_CAP_MAX_GPIO]; + bool sdp[I40E_HW_CAP_MAX_GPIO]; + u32 nvm_image_type; + u32 num_flow_director_filters; 
+ u32 num_vfs; + u32 vf_base_id; + u32 num_vsis; + u32 num_rx_qp; + u32 num_tx_qp; + u32 base_queue; + u32 num_msix_vectors; + u32 num_msix_vectors_vf; + u32 led_pin_num; + u32 sdp_pin_num; + u32 mdio_port_num; + u32 mdio_port_mode; + u8 rx_buf_chain_len; + u32 enabled_tcmap; + u32 maxtc; + u64 wr_csr_prot; +}; + +struct i40e_mac_info { + enum i40e_mac_type type; + u8 addr[ETH_ALEN]; + u8 perm_addr[ETH_ALEN]; + u8 san_addr[ETH_ALEN]; + u8 port_addr[ETH_ALEN]; + u16 max_fcoeq; +}; + +enum i40e_aq_resources_ids { + I40E_NVM_RESOURCE_ID = 1 +}; + +enum i40e_aq_resource_access_type { + I40E_RESOURCE_READ = 1, + I40E_RESOURCE_WRITE +}; + +struct i40e_nvm_info { + u64 hw_semaphore_timeout; /* usec global time (GTIME resolution) */ + u32 timeout; /* [ms] */ + u16 sr_size; /* Shadow RAM size in words */ + bool blank_nvm_mode; /* is NVM empty (no FW present)*/ + u16 version; /* NVM package version */ + u32 eetrack; /* NVM data version */ + u32 oem_ver; /* OEM version info */ +}; + +/* definitions used in NVM update support */ + +enum i40e_nvmupd_cmd { + I40E_NVMUPD_INVALID, + I40E_NVMUPD_READ_CON, + I40E_NVMUPD_READ_SNT, + I40E_NVMUPD_READ_LCB, + I40E_NVMUPD_READ_SA, + I40E_NVMUPD_WRITE_ERA, + I40E_NVMUPD_WRITE_CON, + I40E_NVMUPD_WRITE_SNT, + I40E_NVMUPD_WRITE_LCB, + I40E_NVMUPD_WRITE_SA, + I40E_NVMUPD_CSUM_CON, + I40E_NVMUPD_CSUM_SA, + I40E_NVMUPD_CSUM_LCB, + I40E_NVMUPD_STATUS, + I40E_NVMUPD_EXEC_AQ, + I40E_NVMUPD_GET_AQ_RESULT, + I40E_NVMUPD_GET_AQ_EVENT, +}; + +enum i40e_nvmupd_state { + I40E_NVMUPD_STATE_INIT, + I40E_NVMUPD_STATE_READING, + I40E_NVMUPD_STATE_WRITING, + I40E_NVMUPD_STATE_INIT_WAIT, + I40E_NVMUPD_STATE_WRITE_WAIT, + I40E_NVMUPD_STATE_ERROR +}; + +/* nvm_access definition and its masks/shifts need to be accessible to + * application, core driver, and shared code. Where is the right file? 
+ */ +#define I40E_NVM_READ 0xB +#define I40E_NVM_WRITE 0xC + +#define I40E_NVM_MOD_PNT_MASK 0xFF + +#define I40E_NVM_TRANS_SHIFT 8 +#define I40E_NVM_TRANS_MASK (0xf << I40E_NVM_TRANS_SHIFT) +#define I40E_NVM_PRESERVATION_FLAGS_SHIFT 12 +#define I40E_NVM_PRESERVATION_FLAGS_MASK \ + (0x3 << I40E_NVM_PRESERVATION_FLAGS_SHIFT) +#define I40E_NVM_PRESERVATION_FLAGS_SELECTED 0x01 +#define I40E_NVM_PRESERVATION_FLAGS_ALL 0x02 +#define I40E_NVM_CON 0x0 +#define I40E_NVM_SNT 0x1 +#define I40E_NVM_LCB 0x2 +#define I40E_NVM_SA (I40E_NVM_SNT | I40E_NVM_LCB) +#define I40E_NVM_ERA 0x4 +#define I40E_NVM_CSUM 0x8 +#define I40E_NVM_AQE 0xe +#define I40E_NVM_EXEC 0xf + +#define I40E_NVM_ADAPT_SHIFT 16 +#define I40E_NVM_ADAPT_MASK (0xffff << I40E_NVM_ADAPT_SHIFT) + +#define I40E_NVMUPD_MAX_DATA 4096 +#define I40E_NVMUPD_IFACE_TIMEOUT 2 /* seconds */ + +struct i40e_nvm_access { + u32 command; + u32 config; + u32 offset; /* in bytes */ + u32 data_size; /* in bytes */ + u8 data[1]; +}; + +/* (Q)SFP module access definitions */ +#define I40E_I2C_EEPROM_DEV_ADDR 0xA0 +#define I40E_I2C_EEPROM_DEV_ADDR2 0xA2 +#define I40E_MODULE_TYPE_ADDR 0x00 +#define I40E_MODULE_REVISION_ADDR 0x01 +#define I40E_MODULE_SFF_8472_COMP 0x5E +#define I40E_MODULE_SFF_8472_SWAP 0x5C +#define I40E_MODULE_SFF_ADDR_MODE 0x04 +#define I40E_MODULE_TYPE_QSFP_PLUS 0x0D +#define I40E_MODULE_TYPE_QSFP28 0x11 +#define I40E_MODULE_QSFP_MAX_LEN 640 + +/* PCI bus types */ +enum i40e_bus_type { + i40e_bus_type_unknown = 0, + i40e_bus_type_pci, + i40e_bus_type_pcix, + i40e_bus_type_pci_express, + i40e_bus_type_reserved +}; + +/* PCI bus speeds */ +enum i40e_bus_speed { + i40e_bus_speed_unknown = 0, + i40e_bus_speed_33 = 33, + i40e_bus_speed_66 = 66, + i40e_bus_speed_100 = 100, + i40e_bus_speed_120 = 120, + i40e_bus_speed_133 = 133, + i40e_bus_speed_2500 = 2500, + i40e_bus_speed_5000 = 5000, + i40e_bus_speed_8000 = 8000, + i40e_bus_speed_reserved +}; + +/* PCI bus widths */ +enum i40e_bus_width { + i40e_bus_width_unknown = 0, + i40e_bus_width_pcie_x1 = 1, + i40e_bus_width_pcie_x2 = 2, + i40e_bus_width_pcie_x4 = 4, + i40e_bus_width_pcie_x8 = 8, + i40e_bus_width_32 = 32, + i40e_bus_width_64 = 64, + i40e_bus_width_reserved +}; + +/* Bus parameters */ +struct i40e_bus_info { + enum i40e_bus_speed speed; + enum i40e_bus_width width; + enum i40e_bus_type type; + + u16 func; + u16 device; + u16 lan_id; + u16 bus_id; +}; + +/* Flow control (FC) parameters */ +struct i40e_fc_info { + enum i40e_fc_mode current_mode; /* FC mode in effect */ + enum i40e_fc_mode requested_mode; /* FC mode requested by caller */ +}; + +#define I40E_MAX_TRAFFIC_CLASS 8 +#define I40E_MAX_USER_PRIORITY 8 +#define I40E_DCBX_MAX_APPS 32 +#define I40E_LLDPDU_SIZE 1500 +#define I40E_TLV_STATUS_OPER 0x1 +#define I40E_TLV_STATUS_SYNC 0x2 +#define I40E_TLV_STATUS_ERR 0x4 +#define I40E_CEE_OPER_MAX_APPS 3 +#define I40E_APP_PROTOID_FCOE 0x8906 +#define I40E_APP_PROTOID_ISCSI 0x0cbc +#define I40E_APP_PROTOID_FIP 0x8914 +#define I40E_APP_SEL_ETHTYPE 0x1 +#define I40E_APP_SEL_TCPIP 0x2 +#define I40E_CEE_APP_SEL_ETHTYPE 0x0 +#define I40E_CEE_APP_SEL_TCPIP 0x1 + +/* CEE or IEEE 802.1Qaz ETS Configuration data */ +struct i40e_dcb_ets_config { + u8 willing; + u8 cbs; + u8 maxtcs; + u8 prioritytable[I40E_MAX_TRAFFIC_CLASS]; + u8 tcbwtable[I40E_MAX_TRAFFIC_CLASS]; + u8 tsatable[I40E_MAX_TRAFFIC_CLASS]; +}; + +/* CEE or IEEE 802.1Qaz PFC Configuration data */ +struct i40e_dcb_pfc_config { + u8 willing; + u8 mbc; + u8 pfccap; + u8 pfcenable; +}; + +/* CEE or IEEE 802.1Qaz Application Priority data */ 
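Because the command/config words above are shared between the update tool, the core driver and the shared code, a small packing helper makes the layout explicit. The sketch below is illustrative only (the helper name is hypothetical and not part of the patch); it assumes the consumer decodes config with the same MOD_PNT, TRANS and PRESERVATION masks defined here:

```c
/* Hypothetical helper (not part of the patch): pack an NVM update request
 * into the command/config words of struct i40e_nvm_access using the masks
 * defined above. */
static inline void i40e_nvm_access_fill(struct i40e_nvm_access *cmd,
					u32 command, u8 module_pointer,
					u32 transaction, u32 preservation,
					u32 offset, u32 data_size)
{
	cmd->command = command;		/* I40E_NVM_READ or I40E_NVM_WRITE */
	cmd->config = module_pointer & I40E_NVM_MOD_PNT_MASK;
	cmd->config |= (transaction << I40E_NVM_TRANS_SHIFT) &
		       I40E_NVM_TRANS_MASK;
	cmd->config |= (preservation << I40E_NVM_PRESERVATION_FLAGS_SHIFT) &
		       I40E_NVM_PRESERVATION_FLAGS_MASK;
	cmd->offset = offset;		/* in bytes */
	cmd->data_size = data_size;	/* in bytes, <= I40E_NVMUPD_MAX_DATA */
}
```

For instance, a "write whole image, preserve all settings" request might use command = I40E_NVM_WRITE, transaction = I40E_NVM_SA and preservation = I40E_NVM_PRESERVATION_FLAGS_ALL.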
+struct i40e_dcb_app_priority_table { + u8 priority; + u8 selector; + u16 protocolid; +}; + +struct i40e_dcbx_config { + u8 dcbx_mode; +#define I40E_DCBX_MODE_CEE 0x1 +#define I40E_DCBX_MODE_IEEE 0x2 + u8 app_mode; +#define I40E_DCBX_APPS_NON_WILLING 0x1 + u32 numapps; + u32 tlv_status; /* CEE mode TLV status */ + struct i40e_dcb_ets_config etscfg; + struct i40e_dcb_ets_config etsrec; + struct i40e_dcb_pfc_config pfc; + struct i40e_dcb_app_priority_table app[I40E_DCBX_MAX_APPS]; +}; + +/* Port hardware description */ +struct i40e_hw { + u8 __iomem *hw_addr; + void *back; + + /* subsystem structs */ + struct i40e_phy_info phy; + struct i40e_mac_info mac; + struct i40e_bus_info bus; + struct i40e_nvm_info nvm; + struct i40e_fc_info fc; + + /* pci info */ + u16 device_id; + u16 vendor_id; + u16 subsystem_device_id; + u16 subsystem_vendor_id; + u8 revision_id; + u8 port; + bool adapter_stopped; + + /* capabilities for entire device and PCI func */ + struct i40e_hw_capabilities dev_caps; + struct i40e_hw_capabilities func_caps; + + /* Flow Director shared filter space */ + u16 fdir_shared_filter_count; + + /* device profile info */ + u8 pf_id; + u16 main_vsi_seid; + + /* for multi-function MACs */ + u16 partition_id; + u16 num_partitions; + u16 num_ports; + + /* Closest numa node to the device */ + u16 numa_node; + + /* Admin Queue info */ + struct i40e_adminq_info aq; + + /* state of nvm update process */ + enum i40e_nvmupd_state nvmupd_state; + struct i40e_aq_desc nvm_wb_desc; + struct i40e_aq_desc nvm_aq_event_desc; + struct i40e_virt_mem nvm_buff; + bool nvm_release_on_done; + u16 nvm_wait_opcode; + + /* HMC info */ + struct i40e_hmc_info hmc; /* HMC info struct */ + + /* LLDP/DCBX Status */ + u16 dcbx_status; + + /* DCBX info */ + struct i40e_dcbx_config local_dcbx_config; /* Oper/Local Cfg */ + struct i40e_dcbx_config remote_dcbx_config; /* Peer Cfg */ + struct i40e_dcbx_config desired_dcbx_config; /* CEE Desired Cfg */ + +#define I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE BIT_ULL(0) +#define I40E_HW_FLAG_802_1AD_CAPABLE BIT_ULL(1) +#define I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE BIT_ULL(2) +#define I40E_HW_FLAG_NVM_READ_REQUIRES_LOCK BIT_ULL(3) + u64 flags; + + /* Used in set switch config AQ command */ + u16 switch_tag; + u16 first_tag; + u16 second_tag; + + /* debug mask */ + u32 debug_mask; + char err_str[16]; +}; + +static inline bool i40e_is_vf(struct i40e_hw *hw) +{ + return (hw->mac.type == I40E_MAC_VF || + hw->mac.type == I40E_MAC_X722_VF); +} + +struct i40e_driver_version { + u8 major_version; + u8 minor_version; + u8 build_version; + u8 subbuild_version; + u8 driver_string[32]; +}; + +/* RX Descriptors */ +union i40e_16byte_rx_desc { + struct { + __le64 pkt_addr; /* Packet buffer address */ + __le64 hdr_addr; /* Header buffer address */ + } read; + struct { + struct { + struct { + union { + __le16 mirroring_status; + __le16 fcoe_ctx_id; + } mirr_fcoe; + __le16 l2tag1; + } lo_dword; + union { + __le32 rss; /* RSS Hash */ + __le32 fd_id; /* Flow director filter id */ + __le32 fcoe_param; /* FCoE DDP Context id */ + } hi_dword; + } qword0; + struct { + /* ext status/error/pktype/length */ + __le64 status_error_len; + } qword1; + } wb; /* writeback */ +}; + +union i40e_32byte_rx_desc { + struct { + __le64 pkt_addr; /* Packet buffer address */ + __le64 hdr_addr; /* Header buffer address */ + /* bit 0 of hdr_buffer_addr is DD bit */ + __le64 rsvd1; + __le64 rsvd2; + } read; + struct { + struct { + struct { + union { + __le16 mirroring_status; + __le16 fcoe_ctx_id; + } mirr_fcoe; + __le16 l2tag1; 
+ } lo_dword; + union { + __le32 rss; /* RSS Hash */ + __le32 fcoe_param; /* FCoE DDP Context id */ + /* Flow director filter id in case of + * Programming status desc WB + */ + __le32 fd_id; + } hi_dword; + } qword0; + struct { + /* status/error/pktype/length */ + __le64 status_error_len; + } qword1; + struct { + __le16 ext_status; /* extended status */ + __le16 rsvd; + __le16 l2tag2_1; + __le16 l2tag2_2; + } qword2; + struct { + union { + __le32 flex_bytes_lo; + __le32 pe_status; + } lo_dword; + union { + __le32 flex_bytes_hi; + __le32 fd_id; + } hi_dword; + } qword3; + } wb; /* writeback */ +}; + +enum i40e_rx_desc_status_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_DESC_STATUS_DD_SHIFT = 0, + I40E_RX_DESC_STATUS_EOF_SHIFT = 1, + I40E_RX_DESC_STATUS_L2TAG1P_SHIFT = 2, + I40E_RX_DESC_STATUS_L3L4P_SHIFT = 3, + I40E_RX_DESC_STATUS_CRCP_SHIFT = 4, + I40E_RX_DESC_STATUS_TSYNINDX_SHIFT = 5, /* 2 BITS */ + I40E_RX_DESC_STATUS_TSYNVALID_SHIFT = 7, + /* Note: Bit 8 is reserved in X710 and XL710 */ + I40E_RX_DESC_STATUS_EXT_UDP_0_SHIFT = 8, + I40E_RX_DESC_STATUS_UMBCAST_SHIFT = 9, /* 2 BITS */ + I40E_RX_DESC_STATUS_FLM_SHIFT = 11, + I40E_RX_DESC_STATUS_FLTSTAT_SHIFT = 12, /* 2 BITS */ + I40E_RX_DESC_STATUS_LPBK_SHIFT = 14, + I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT = 15, + I40E_RX_DESC_STATUS_RESERVED_SHIFT = 16, /* 2 BITS */ + /* Note: For non-tunnel packets INT_UDP_0 is the right status for + * UDP header + */ + I40E_RX_DESC_STATUS_INT_UDP_0_SHIFT = 18, + I40E_RX_DESC_STATUS_LAST /* this entry must be last!!! */ +}; + +#define I40E_RXD_QW1_STATUS_SHIFT 0 +#define I40E_RXD_QW1_STATUS_MASK ((BIT(I40E_RX_DESC_STATUS_LAST) - 1) \ + << I40E_RXD_QW1_STATUS_SHIFT) + +#define I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT I40E_RX_DESC_STATUS_TSYNINDX_SHIFT +#define I40E_RXD_QW1_STATUS_TSYNINDX_MASK (0x3UL << \ + I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT) + +#define I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT I40E_RX_DESC_STATUS_TSYNVALID_SHIFT +#define I40E_RXD_QW1_STATUS_TSYNVALID_MASK \ + BIT_ULL(I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT) + +enum i40e_rx_desc_fltstat_values { + I40E_RX_DESC_FLTSTAT_NO_DATA = 0, + I40E_RX_DESC_FLTSTAT_RSV_FD_ID = 1, /* 16byte desc? 
FD_ID : RSV */ + I40E_RX_DESC_FLTSTAT_RSV = 2, + I40E_RX_DESC_FLTSTAT_RSS_HASH = 3, +}; + +#define I40E_RXD_QW1_ERROR_SHIFT 19 +#define I40E_RXD_QW1_ERROR_MASK (0xFFUL << I40E_RXD_QW1_ERROR_SHIFT) + +enum i40e_rx_desc_error_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_DESC_ERROR_RXE_SHIFT = 0, + I40E_RX_DESC_ERROR_RECIPE_SHIFT = 1, + I40E_RX_DESC_ERROR_HBO_SHIFT = 2, + I40E_RX_DESC_ERROR_L3L4E_SHIFT = 3, /* 3 BITS */ + I40E_RX_DESC_ERROR_IPE_SHIFT = 3, + I40E_RX_DESC_ERROR_L4E_SHIFT = 4, + I40E_RX_DESC_ERROR_EIPE_SHIFT = 5, + I40E_RX_DESC_ERROR_OVERSIZE_SHIFT = 6, + I40E_RX_DESC_ERROR_PPRS_SHIFT = 7 +}; + +enum i40e_rx_desc_error_l3l4e_fcoe_masks { + I40E_RX_DESC_ERROR_L3L4E_NONE = 0, + I40E_RX_DESC_ERROR_L3L4E_PROT = 1, + I40E_RX_DESC_ERROR_L3L4E_FC = 2, + I40E_RX_DESC_ERROR_L3L4E_DMAC_ERR = 3, + I40E_RX_DESC_ERROR_L3L4E_DMAC_WARN = 4 +}; + +#define I40E_RXD_QW1_PTYPE_SHIFT 30 +#define I40E_RXD_QW1_PTYPE_MASK (0xFFULL << I40E_RXD_QW1_PTYPE_SHIFT) + +/* Packet type non-ip values */ +enum i40e_rx_l2_ptype { + I40E_RX_PTYPE_L2_RESERVED = 0, + I40E_RX_PTYPE_L2_MAC_PAY2 = 1, + I40E_RX_PTYPE_L2_TIMESYNC_PAY2 = 2, + I40E_RX_PTYPE_L2_FIP_PAY2 = 3, + I40E_RX_PTYPE_L2_OUI_PAY2 = 4, + I40E_RX_PTYPE_L2_MACCNTRL_PAY2 = 5, + I40E_RX_PTYPE_L2_LLDP_PAY2 = 6, + I40E_RX_PTYPE_L2_ECP_PAY2 = 7, + I40E_RX_PTYPE_L2_EVB_PAY2 = 8, + I40E_RX_PTYPE_L2_QCN_PAY2 = 9, + I40E_RX_PTYPE_L2_EAPOL_PAY2 = 10, + I40E_RX_PTYPE_L2_ARP = 11, + I40E_RX_PTYPE_L2_FCOE_PAY3 = 12, + I40E_RX_PTYPE_L2_FCOE_FCDATA_PAY3 = 13, + I40E_RX_PTYPE_L2_FCOE_FCRDY_PAY3 = 14, + I40E_RX_PTYPE_L2_FCOE_FCRSP_PAY3 = 15, + I40E_RX_PTYPE_L2_FCOE_FCOTHER_PA = 16, + I40E_RX_PTYPE_L2_FCOE_VFT_PAY3 = 17, + I40E_RX_PTYPE_L2_FCOE_VFT_FCDATA = 18, + I40E_RX_PTYPE_L2_FCOE_VFT_FCRDY = 19, + I40E_RX_PTYPE_L2_FCOE_VFT_FCRSP = 20, + I40E_RX_PTYPE_L2_FCOE_VFT_FCOTHER = 21, + I40E_RX_PTYPE_GRENAT4_MAC_PAY3 = 58, + I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4 = 87, + I40E_RX_PTYPE_GRENAT6_MAC_PAY3 = 124, + I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4 = 153 +}; + +struct i40e_rx_ptype_decoded { + u32 ptype:8; + u32 known:1; + u32 outer_ip:1; + u32 outer_ip_ver:1; + u32 outer_frag:1; + u32 tunnel_type:3; + u32 tunnel_end_prot:2; + u32 tunnel_end_frag:1; + u32 inner_prot:4; + u32 payload_layer:3; +}; + +enum i40e_rx_ptype_outer_ip { + I40E_RX_PTYPE_OUTER_L2 = 0, + I40E_RX_PTYPE_OUTER_IP = 1 +}; + +enum i40e_rx_ptype_outer_ip_ver { + I40E_RX_PTYPE_OUTER_NONE = 0, + I40E_RX_PTYPE_OUTER_IPV4 = 0, + I40E_RX_PTYPE_OUTER_IPV6 = 1 +}; + +enum i40e_rx_ptype_outer_fragmented { + I40E_RX_PTYPE_NOT_FRAG = 0, + I40E_RX_PTYPE_FRAG = 1 +}; + +enum i40e_rx_ptype_tunnel_type { + I40E_RX_PTYPE_TUNNEL_NONE = 0, + I40E_RX_PTYPE_TUNNEL_IP_IP = 1, + I40E_RX_PTYPE_TUNNEL_IP_GRENAT = 2, + I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC = 3, + I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN = 4, +}; + +enum i40e_rx_ptype_tunnel_end_prot { + I40E_RX_PTYPE_TUNNEL_END_NONE = 0, + I40E_RX_PTYPE_TUNNEL_END_IPV4 = 1, + I40E_RX_PTYPE_TUNNEL_END_IPV6 = 2, +}; + +enum i40e_rx_ptype_inner_prot { + I40E_RX_PTYPE_INNER_PROT_NONE = 0, + I40E_RX_PTYPE_INNER_PROT_UDP = 1, + I40E_RX_PTYPE_INNER_PROT_TCP = 2, + I40E_RX_PTYPE_INNER_PROT_SCTP = 3, + I40E_RX_PTYPE_INNER_PROT_ICMP = 4, + I40E_RX_PTYPE_INNER_PROT_TIMESYNC = 5 +}; + +enum i40e_rx_ptype_payload_layer { + I40E_RX_PTYPE_PAYLOAD_LAYER_NONE = 0, + I40E_RX_PTYPE_PAYLOAD_LAYER_PAY2 = 1, + I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3 = 2, + I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4 = 3, +}; + +#define I40E_RXD_QW1_LENGTH_PBUF_SHIFT 38 +#define I40E_RXD_QW1_LENGTH_PBUF_MASK 
(0x3FFFULL << \ + I40E_RXD_QW1_LENGTH_PBUF_SHIFT) + +#define I40E_RXD_QW1_LENGTH_HBUF_SHIFT 52 +#define I40E_RXD_QW1_LENGTH_HBUF_MASK (0x7FFULL << \ + I40E_RXD_QW1_LENGTH_HBUF_SHIFT) + +#define I40E_RXD_QW1_LENGTH_SPH_SHIFT 63 +#define I40E_RXD_QW1_LENGTH_SPH_MASK BIT_ULL(I40E_RXD_QW1_LENGTH_SPH_SHIFT) + +enum i40e_rx_desc_ext_status_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT = 0, + I40E_RX_DESC_EXT_STATUS_L2TAG3P_SHIFT = 1, + I40E_RX_DESC_EXT_STATUS_FLEXBL_SHIFT = 2, /* 2 BITS */ + I40E_RX_DESC_EXT_STATUS_FLEXBH_SHIFT = 4, /* 2 BITS */ + I40E_RX_DESC_EXT_STATUS_FDLONGB_SHIFT = 9, + I40E_RX_DESC_EXT_STATUS_FCOELONGB_SHIFT = 10, + I40E_RX_DESC_EXT_STATUS_PELONGB_SHIFT = 11, +}; + +enum i40e_rx_desc_pe_status_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_DESC_PE_STATUS_QPID_SHIFT = 0, /* 18 BITS */ + I40E_RX_DESC_PE_STATUS_L4PORT_SHIFT = 0, /* 16 BITS */ + I40E_RX_DESC_PE_STATUS_IPINDEX_SHIFT = 16, /* 8 BITS */ + I40E_RX_DESC_PE_STATUS_QPIDHIT_SHIFT = 24, + I40E_RX_DESC_PE_STATUS_APBVTHIT_SHIFT = 25, + I40E_RX_DESC_PE_STATUS_PORTV_SHIFT = 26, + I40E_RX_DESC_PE_STATUS_URG_SHIFT = 27, + I40E_RX_DESC_PE_STATUS_IPFRAG_SHIFT = 28, + I40E_RX_DESC_PE_STATUS_IPOPT_SHIFT = 29 +}; + +#define I40E_RX_PROG_STATUS_DESC_LENGTH_SHIFT 38 +#define I40E_RX_PROG_STATUS_DESC_LENGTH 0x2000000 + +#define I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT 2 +#define I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK (0x7UL << \ + I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT) + +#define I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT 19 +#define I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK (0x3FUL << \ + I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT) + +enum i40e_rx_prog_status_desc_status_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_PROG_STATUS_DESC_DD_SHIFT = 0, + I40E_RX_PROG_STATUS_DESC_PROG_ID_SHIFT = 2 /* 3 BITS */ +}; + +enum i40e_rx_prog_status_desc_prog_id_masks { + I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS = 1, + I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_PROG_STATUS = 2, + I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_INVL_STATUS = 4, +}; + +enum i40e_rx_prog_status_desc_error_bits { + /* Note: These are predefined bit offsets */ + I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT = 0, + I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT = 1, + I40E_RX_PROG_STATUS_DESC_FCOE_TBL_FULL_SHIFT = 2, + I40E_RX_PROG_STATUS_DESC_FCOE_CONFLICT_SHIFT = 3 +}; + +/* TX Descriptor */ +struct i40e_tx_desc { + __le64 buffer_addr; /* Address of descriptor's data buf */ + __le64 cmd_type_offset_bsz; +}; + +#define I40E_TXD_QW1_DTYPE_SHIFT 0 +#define I40E_TXD_QW1_DTYPE_MASK (0xFUL << I40E_TXD_QW1_DTYPE_SHIFT) + +enum i40e_tx_desc_dtype_value { + I40E_TX_DESC_DTYPE_DATA = 0x0, + I40E_TX_DESC_DTYPE_NOP = 0x1, /* same as Context desc */ + I40E_TX_DESC_DTYPE_CONTEXT = 0x1, + I40E_TX_DESC_DTYPE_FCOE_CTX = 0x2, + I40E_TX_DESC_DTYPE_FILTER_PROG = 0x8, + I40E_TX_DESC_DTYPE_DDP_CTX = 0x9, + I40E_TX_DESC_DTYPE_FLEX_DATA = 0xB, + I40E_TX_DESC_DTYPE_FLEX_CTX_1 = 0xC, + I40E_TX_DESC_DTYPE_FLEX_CTX_2 = 0xD, + I40E_TX_DESC_DTYPE_DESC_DONE = 0xF +}; + +#define I40E_TXD_QW1_CMD_SHIFT 4 +#define I40E_TXD_QW1_CMD_MASK (0x3FFUL << I40E_TXD_QW1_CMD_SHIFT) + +enum i40e_tx_desc_cmd_bits { + I40E_TX_DESC_CMD_EOP = 0x0001, + I40E_TX_DESC_CMD_RS = 0x0002, + I40E_TX_DESC_CMD_ICRC = 0x0004, + I40E_TX_DESC_CMD_IL2TAG1 = 0x0008, + I40E_TX_DESC_CMD_DUMMY = 0x0010, + I40E_TX_DESC_CMD_IIPT_NONIP = 0x0000, /* 2 BITS */ + I40E_TX_DESC_CMD_IIPT_IPV6 = 0x0020, /* 2 BITS */ + I40E_TX_DESC_CMD_IIPT_IPV4 = 0x0040, /* 2 BITS */ + 
I40E_TX_DESC_CMD_IIPT_IPV4_CSUM = 0x0060, /* 2 BITS */ + I40E_TX_DESC_CMD_FCOET = 0x0080, + I40E_TX_DESC_CMD_L4T_EOFT_UNK = 0x0000, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_TCP = 0x0100, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_SCTP = 0x0200, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_UDP = 0x0300, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_EOF_N = 0x0000, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_EOF_T = 0x0100, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_EOF_NI = 0x0200, /* 2 BITS */ + I40E_TX_DESC_CMD_L4T_EOFT_EOF_A = 0x0300, /* 2 BITS */ +}; + +#define I40E_TXD_QW1_OFFSET_SHIFT 16 +#define I40E_TXD_QW1_OFFSET_MASK (0x3FFFFULL << \ + I40E_TXD_QW1_OFFSET_SHIFT) + +enum i40e_tx_desc_length_fields { + /* Note: These are predefined bit offsets */ + I40E_TX_DESC_LENGTH_MACLEN_SHIFT = 0, /* 7 BITS */ + I40E_TX_DESC_LENGTH_IPLEN_SHIFT = 7, /* 7 BITS */ + I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT = 14 /* 4 BITS */ +}; + +#define I40E_TXD_QW1_TX_BUF_SZ_SHIFT 34 +#define I40E_TXD_QW1_TX_BUF_SZ_MASK (0x3FFFULL << \ + I40E_TXD_QW1_TX_BUF_SZ_SHIFT) + +#define I40E_TXD_QW1_L2TAG1_SHIFT 48 +#define I40E_TXD_QW1_L2TAG1_MASK (0xFFFFULL << I40E_TXD_QW1_L2TAG1_SHIFT) + +/* Context descriptors */ +struct i40e_tx_context_desc { + __le32 tunneling_params; + __le16 l2tag2; + __le16 rsvd; + __le64 type_cmd_tso_mss; +}; + +#define I40E_TXD_CTX_QW1_DTYPE_SHIFT 0 +#define I40E_TXD_CTX_QW1_DTYPE_MASK (0xFUL << I40E_TXD_CTX_QW1_DTYPE_SHIFT) + +#define I40E_TXD_CTX_QW1_CMD_SHIFT 4 +#define I40E_TXD_CTX_QW1_CMD_MASK (0xFFFFUL << I40E_TXD_CTX_QW1_CMD_SHIFT) + +enum i40e_tx_ctx_desc_cmd_bits { + I40E_TX_CTX_DESC_TSO = 0x01, + I40E_TX_CTX_DESC_TSYN = 0x02, + I40E_TX_CTX_DESC_IL2TAG2 = 0x04, + I40E_TX_CTX_DESC_IL2TAG2_IL2H = 0x08, + I40E_TX_CTX_DESC_SWTCH_NOTAG = 0x00, + I40E_TX_CTX_DESC_SWTCH_UPLINK = 0x10, + I40E_TX_CTX_DESC_SWTCH_LOCAL = 0x20, + I40E_TX_CTX_DESC_SWTCH_VSI = 0x30, + I40E_TX_CTX_DESC_SWPE = 0x40 +}; + +#define I40E_TXD_CTX_QW1_TSO_LEN_SHIFT 30 +#define I40E_TXD_CTX_QW1_TSO_LEN_MASK (0x3FFFFULL << \ + I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) + +#define I40E_TXD_CTX_QW1_MSS_SHIFT 50 +#define I40E_TXD_CTX_QW1_MSS_MASK (0x3FFFULL << \ + I40E_TXD_CTX_QW1_MSS_SHIFT) + +#define I40E_TXD_CTX_QW1_VSI_SHIFT 50 +#define I40E_TXD_CTX_QW1_VSI_MASK (0x1FFULL << I40E_TXD_CTX_QW1_VSI_SHIFT) + +#define I40E_TXD_CTX_QW0_EXT_IP_SHIFT 0 +#define I40E_TXD_CTX_QW0_EXT_IP_MASK (0x3ULL << \ + I40E_TXD_CTX_QW0_EXT_IP_SHIFT) + +enum i40e_tx_ctx_desc_eipt_offload { + I40E_TX_CTX_EXT_IP_NONE = 0x0, + I40E_TX_CTX_EXT_IP_IPV6 = 0x1, + I40E_TX_CTX_EXT_IP_IPV4_NO_CSUM = 0x2, + I40E_TX_CTX_EXT_IP_IPV4 = 0x3 +}; + +#define I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT 2 +#define I40E_TXD_CTX_QW0_EXT_IPLEN_MASK (0x3FULL << \ + I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT) + +#define I40E_TXD_CTX_QW0_NATT_SHIFT 9 +#define I40E_TXD_CTX_QW0_NATT_MASK (0x3ULL << I40E_TXD_CTX_QW0_NATT_SHIFT) + +#define I40E_TXD_CTX_UDP_TUNNELING BIT_ULL(I40E_TXD_CTX_QW0_NATT_SHIFT) +#define I40E_TXD_CTX_GRE_TUNNELING (0x2ULL << I40E_TXD_CTX_QW0_NATT_SHIFT) + +#define I40E_TXD_CTX_QW0_EIP_NOINC_SHIFT 11 +#define I40E_TXD_CTX_QW0_EIP_NOINC_MASK \ + BIT_ULL(I40E_TXD_CTX_QW0_EIP_NOINC_SHIFT) + +#define I40E_TXD_CTX_EIP_NOINC_IPID_CONST I40E_TXD_CTX_QW0_EIP_NOINC_MASK + +#define I40E_TXD_CTX_QW0_NATLEN_SHIFT 12 +#define I40E_TXD_CTX_QW0_NATLEN_MASK (0X7FULL << \ + I40E_TXD_CTX_QW0_NATLEN_SHIFT) + +#define I40E_TXD_CTX_QW0_DECTTL_SHIFT 19 +#define I40E_TXD_CTX_QW0_DECTTL_MASK (0xFULL << \ + I40E_TXD_CTX_QW0_DECTTL_SHIFT) + +#define I40E_TXD_CTX_QW0_L4T_CS_SHIFT 23 +#define I40E_TXD_CTX_QW0_L4T_CS_MASK 
BIT_ULL(I40E_TXD_CTX_QW0_L4T_CS_SHIFT) +struct i40e_filter_program_desc { + __le32 qindex_flex_ptype_vsi; + __le32 rsvd; + __le32 dtype_cmd_cntindex; + __le32 fd_id; +}; +#define I40E_TXD_FLTR_QW0_QINDEX_SHIFT 0 +#define I40E_TXD_FLTR_QW0_QINDEX_MASK (0x7FFUL << \ + I40E_TXD_FLTR_QW0_QINDEX_SHIFT) +#define I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT 11 +#define I40E_TXD_FLTR_QW0_FLEXOFF_MASK (0x7UL << \ + I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT) +#define I40E_TXD_FLTR_QW0_PCTYPE_SHIFT 17 +#define I40E_TXD_FLTR_QW0_PCTYPE_MASK (0x3FUL << \ + I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) + +/* Packet Classifier Types for filters */ +enum i40e_filter_pctype { + /* Note: Values 0-28 are reserved for future use. + * Value 29, 30, 32 are not supported on XL710 and X710. + */ + I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP = 29, + I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP = 30, + I40E_FILTER_PCTYPE_NONF_IPV4_UDP = 31, + I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK = 32, + I40E_FILTER_PCTYPE_NONF_IPV4_TCP = 33, + I40E_FILTER_PCTYPE_NONF_IPV4_SCTP = 34, + I40E_FILTER_PCTYPE_NONF_IPV4_OTHER = 35, + I40E_FILTER_PCTYPE_FRAG_IPV4 = 36, + /* Note: Values 37-38 are reserved for future use. + * Value 39, 40, 42 are not supported on XL710 and X710. + */ + I40E_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP = 39, + I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP = 40, + I40E_FILTER_PCTYPE_NONF_IPV6_UDP = 41, + I40E_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK = 42, + I40E_FILTER_PCTYPE_NONF_IPV6_TCP = 43, + I40E_FILTER_PCTYPE_NONF_IPV6_SCTP = 44, + I40E_FILTER_PCTYPE_NONF_IPV6_OTHER = 45, + I40E_FILTER_PCTYPE_FRAG_IPV6 = 46, + /* Note: Value 47 is reserved for future use */ + I40E_FILTER_PCTYPE_FCOE_OX = 48, + I40E_FILTER_PCTYPE_FCOE_RX = 49, + I40E_FILTER_PCTYPE_FCOE_OTHER = 50, + /* Note: Values 51-62 are reserved for future use */ + I40E_FILTER_PCTYPE_L2_PAYLOAD = 63, +}; + +enum i40e_filter_program_desc_dest { + I40E_FILTER_PROGRAM_DESC_DEST_DROP_PACKET = 0x0, + I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX = 0x1, + I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_OTHER = 0x2, +}; + +enum i40e_filter_program_desc_fd_status { + I40E_FILTER_PROGRAM_DESC_FD_STATUS_NONE = 0x0, + I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID = 0x1, + I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID_4FLEX_BYTES = 0x2, + I40E_FILTER_PROGRAM_DESC_FD_STATUS_8FLEX_BYTES = 0x3, +}; + +#define I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT 23 +#define I40E_TXD_FLTR_QW0_DEST_VSI_MASK (0x1FFUL << \ + I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT) + +#define I40E_TXD_FLTR_QW1_CMD_SHIFT 4 +#define I40E_TXD_FLTR_QW1_CMD_MASK (0xFFFFULL << \ + I40E_TXD_FLTR_QW1_CMD_SHIFT) + +#define I40E_TXD_FLTR_QW1_PCMD_SHIFT (0x0ULL + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_PCMD_MASK (0x7ULL << I40E_TXD_FLTR_QW1_PCMD_SHIFT) + +enum i40e_filter_program_desc_pcmd { + I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE = 0x1, + I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE = 0x2, +}; + +#define I40E_TXD_FLTR_QW1_DEST_SHIFT (0x3ULL + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_DEST_MASK (0x3ULL << I40E_TXD_FLTR_QW1_DEST_SHIFT) + +#define I40E_TXD_FLTR_QW1_CNT_ENA_SHIFT (0x7ULL + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_CNT_ENA_MASK BIT_ULL(I40E_TXD_FLTR_QW1_CNT_ENA_SHIFT) + +#define I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT (0x9ULL + \ + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_FD_STATUS_MASK (0x3ULL << \ + I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT) + +#define I40E_TXD_FLTR_QW1_ATR_SHIFT (0xEULL + \ + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_ATR_MASK BIT_ULL(I40E_TXD_FLTR_QW1_ATR_SHIFT) + +#define 
I40E_TXD_FLTR_QW1_ATR_SHIFT (0xEULL + \ + I40E_TXD_FLTR_QW1_CMD_SHIFT) +#define I40E_TXD_FLTR_QW1_ATR_MASK BIT_ULL(I40E_TXD_FLTR_QW1_ATR_SHIFT) + +#define I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT 20 +#define I40E_TXD_FLTR_QW1_CNTINDEX_MASK (0x1FFUL << \ + I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) + +enum i40e_filter_type { + I40E_FLOW_DIRECTOR_FLTR = 0, + I40E_PE_QUAD_HASH_FLTR = 1, + I40E_ETHERTYPE_FLTR, + I40E_FCOE_CTX_FLTR, + I40E_MAC_VLAN_FLTR, + I40E_HASH_FLTR +}; + +struct i40e_vsi_context { + u16 seid; + u16 uplink_seid; + u16 vsi_number; + u16 vsis_allocated; + u16 vsis_unallocated; + u16 flags; + u8 pf_num; + u8 vf_num; + u8 connection_type; + struct i40e_aqc_vsi_properties_data info; +}; + +struct i40e_veb_context { + u16 seid; + u16 uplink_seid; + u16 veb_number; + u16 vebs_allocated; + u16 vebs_unallocated; + u16 flags; + struct i40e_aqc_get_veb_parameters_completion info; +}; + +/* Statistics collected by each port, VSI, VEB, and S-channel */ +struct i40e_eth_stats { + u64 rx_bytes; /* gorc */ + u64 rx_unicast; /* uprc */ + u64 rx_multicast; /* mprc */ + u64 rx_broadcast; /* bprc */ + u64 rx_discards; /* rdpc */ + u64 rx_unknown_protocol; /* rupp */ + u64 tx_bytes; /* gotc */ + u64 tx_unicast; /* uptc */ + u64 tx_multicast; /* mptc */ + u64 tx_broadcast; /* bptc */ + u64 tx_discards; /* tdpc */ + u64 tx_errors; /* tepc */ +}; + +/* Statistics collected per VEB per TC */ +struct i40e_veb_tc_stats { + u64 tc_rx_packets[I40E_MAX_TRAFFIC_CLASS]; + u64 tc_rx_bytes[I40E_MAX_TRAFFIC_CLASS]; + u64 tc_tx_packets[I40E_MAX_TRAFFIC_CLASS]; + u64 tc_tx_bytes[I40E_MAX_TRAFFIC_CLASS]; +}; + +/* Statistics collected by the MAC */ +struct i40e_hw_port_stats { + /* eth stats collected by the port */ + struct i40e_eth_stats eth; + + /* additional port specific stats */ + u64 tx_dropped_link_down; /* tdold */ + u64 crc_errors; /* crcerrs */ + u64 illegal_bytes; /* illerrc */ + u64 error_bytes; /* errbc */ + u64 mac_local_faults; /* mlfc */ + u64 mac_remote_faults; /* mrfc */ + u64 rx_length_errors; /* rlec */ + u64 link_xon_rx; /* lxonrxc */ + u64 link_xoff_rx; /* lxoffrxc */ + u64 priority_xon_rx[8]; /* pxonrxc[8] */ + u64 priority_xoff_rx[8]; /* pxoffrxc[8] */ + u64 link_xon_tx; /* lxontxc */ + u64 link_xoff_tx; /* lxofftxc */ + u64 priority_xon_tx[8]; /* pxontxc[8] */ + u64 priority_xoff_tx[8]; /* pxofftxc[8] */ + u64 priority_xon_2_xoff[8]; /* pxon2offc[8] */ + u64 rx_size_64; /* prc64 */ + u64 rx_size_127; /* prc127 */ + u64 rx_size_255; /* prc255 */ + u64 rx_size_511; /* prc511 */ + u64 rx_size_1023; /* prc1023 */ + u64 rx_size_1522; /* prc1522 */ + u64 rx_size_big; /* prc9522 */ + u64 rx_undersize; /* ruc */ + u64 rx_fragments; /* rfc */ + u64 rx_oversize; /* roc */ + u64 rx_jabber; /* rjc */ + u64 tx_size_64; /* ptc64 */ + u64 tx_size_127; /* ptc127 */ + u64 tx_size_255; /* ptc255 */ + u64 tx_size_511; /* ptc511 */ + u64 tx_size_1023; /* ptc1023 */ + u64 tx_size_1522; /* ptc1522 */ + u64 tx_size_big; /* ptc9522 */ + u64 mac_short_packet_dropped; /* mspdc */ + u64 checksum_error; /* xec */ + /* flow director stats */ + u64 fd_atr_match; + u64 fd_sb_match; + u64 fd_atr_tunnel_match; + u32 fd_atr_status; + u32 fd_sb_status; + /* EEE LPI */ + u32 tx_lpi_status; + u32 rx_lpi_status; + u64 tx_lpi_count; /* etlpic */ + u64 rx_lpi_count; /* erlpic */ +}; + +/* Checksum and Shadow RAM pointers */ +#define I40E_SR_NVM_CONTROL_WORD 0x00 +#define I40E_SR_EMP_MODULE_PTR 0x0F +#define I40E_SR_PBA_FLAGS 0x15 +#define I40E_SR_PBA_BLOCK_PTR 0x16 +#define I40E_SR_BOOT_CONFIG_PTR 0x17 +#define I40E_NVM_OEM_VER_OFF 0x83 
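The filter programming descriptor defined above is just a set of shifts and masks; a short sketch shows how they compose. The helper below is illustrative and not part of the patch (its name and the subset of fields it programs are assumptions); it builds the two control words for an "add flow director filter, report the FD_ID on match, steer to a queue" request:

```c
/* Illustrative sketch, not part of the patch: compose the QW0/QW1 control
 * words of a filter programming descriptor from the shift/mask macros
 * defined above. Assumes a companion data descriptor carries the raw
 * packet used to program the filter. */
static void i40e_fill_fdir_prog_desc(struct i40e_filter_program_desc *fdir_desc,
				     enum i40e_filter_pctype pctype,
				     u16 queue_index, u32 fd_id)
{
	u32 flex_ptype, dtype_cmd;

	/* QW0: receive queue index and packet classifier type */
	flex_ptype = (queue_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT) &
		     I40E_TXD_FLTR_QW0_QINDEX_MASK;
	flex_ptype |= (pctype << I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) &
		      I40E_TXD_FLTR_QW0_PCTYPE_MASK;

	/* QW1: filter-programming descriptor type, add/update command,
	 * direct matches to the queue index, write FD_ID on writeback
	 */
	dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;
	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
		     I40E_TXD_FLTR_QW1_PCMD_SHIFT;
	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX <<
		     I40E_TXD_FLTR_QW1_DEST_SHIFT;
	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID <<
		     I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT;

	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
	fdir_desc->rsvd = cpu_to_le32(0);
	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd);
	fdir_desc->fd_id = cpu_to_le32(fd_id);
}
```

The remaining QW0 fields (FLEXOFF, DEST_VSI) and the QW1 counter-enable/counter-index bits follow the same shift-and-mask pattern.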
+#define I40E_SR_NVM_DEV_STARTER_VERSION 0x18 +#define I40E_SR_NVM_WAKE_ON_LAN 0x19 +#define I40E_SR_ALTERNATE_SAN_MAC_ADDRESS_PTR 0x27 +#define I40E_SR_NVM_EETRACK_LO 0x2D +#define I40E_SR_NVM_EETRACK_HI 0x2E +#define I40E_SR_VPD_PTR 0x2F +#define I40E_SR_PCIE_ALT_AUTO_LOAD_PTR 0x3E +#define I40E_SR_SW_CHECKSUM_WORD 0x3F + +/* Auxiliary field, mask and shift definition for Shadow RAM and NVM Flash */ +#define I40E_SR_VPD_MODULE_MAX_SIZE 1024 +#define I40E_SR_PCIE_ALT_MODULE_MAX_SIZE 1024 +#define I40E_SR_CONTROL_WORD_1_SHIFT 0x06 +#define I40E_SR_CONTROL_WORD_1_MASK (0x03 << I40E_SR_CONTROL_WORD_1_SHIFT) +#define I40E_PTR_TYPE BIT(15) +#define I40E_SR_OCP_CFG_WORD0 0x2B +#define I40E_SR_OCP_ENABLED BIT(15) + +/* Shadow RAM related */ +#define I40E_SR_SECTOR_SIZE_IN_WORDS 0x800 +#define I40E_SR_WORDS_IN_1KB 512 +/* Checksum should be calculated such that after adding all the words, + * including the checksum word itself, the sum should be 0xBABA. + */ +#define I40E_SR_SW_CHECKSUM_BASE 0xBABA + +#define I40E_SRRD_SRCTL_ATTEMPTS 100000 + +enum i40e_switch_element_types { + I40E_SWITCH_ELEMENT_TYPE_MAC = 1, + I40E_SWITCH_ELEMENT_TYPE_PF = 2, + I40E_SWITCH_ELEMENT_TYPE_VF = 3, + I40E_SWITCH_ELEMENT_TYPE_EMP = 4, + I40E_SWITCH_ELEMENT_TYPE_BMC = 6, + I40E_SWITCH_ELEMENT_TYPE_PE = 16, + I40E_SWITCH_ELEMENT_TYPE_VEB = 17, + I40E_SWITCH_ELEMENT_TYPE_PA = 18, + I40E_SWITCH_ELEMENT_TYPE_VSI = 19, +}; + +/* Supported EtherType filters */ +enum i40e_ether_type_index { + I40E_ETHER_TYPE_1588 = 0, + I40E_ETHER_TYPE_FIP = 1, + I40E_ETHER_TYPE_OUI_EXTENDED = 2, + I40E_ETHER_TYPE_MAC_CONTROL = 3, + I40E_ETHER_TYPE_LLDP = 4, + I40E_ETHER_TYPE_EVB_PROTOCOL1 = 5, + I40E_ETHER_TYPE_EVB_PROTOCOL2 = 6, + I40E_ETHER_TYPE_QCN_CNM = 7, + I40E_ETHER_TYPE_8021X = 8, + I40E_ETHER_TYPE_ARP = 9, + I40E_ETHER_TYPE_RSV1 = 10, + I40E_ETHER_TYPE_RSV2 = 11, +}; + +/* Filter context base size is 1K */ +#define I40E_HASH_FILTER_BASE_SIZE 1024 +/* Supported Hash filter values */ +enum i40e_hash_filter_size { + I40E_HASH_FILTER_SIZE_1K = 0, + I40E_HASH_FILTER_SIZE_2K = 1, + I40E_HASH_FILTER_SIZE_4K = 2, + I40E_HASH_FILTER_SIZE_8K = 3, + I40E_HASH_FILTER_SIZE_16K = 4, + I40E_HASH_FILTER_SIZE_32K = 5, + I40E_HASH_FILTER_SIZE_64K = 6, + I40E_HASH_FILTER_SIZE_128K = 7, + I40E_HASH_FILTER_SIZE_256K = 8, + I40E_HASH_FILTER_SIZE_512K = 9, + I40E_HASH_FILTER_SIZE_1M = 10, +}; + +/* DMA context base size is 0.5K */ +#define I40E_DMA_CNTX_BASE_SIZE 512 +/* Supported DMA context values */ +enum i40e_dma_cntx_size { + I40E_DMA_CNTX_SIZE_512 = 0, + I40E_DMA_CNTX_SIZE_1K = 1, + I40E_DMA_CNTX_SIZE_2K = 2, + I40E_DMA_CNTX_SIZE_4K = 3, + I40E_DMA_CNTX_SIZE_8K = 4, + I40E_DMA_CNTX_SIZE_16K = 5, + I40E_DMA_CNTX_SIZE_32K = 6, + I40E_DMA_CNTX_SIZE_64K = 7, + I40E_DMA_CNTX_SIZE_128K = 8, + I40E_DMA_CNTX_SIZE_256K = 9, +}; + +/* Supported Hash look up table (LUT) sizes */ +enum i40e_hash_lut_size { + I40E_HASH_LUT_SIZE_128 = 0, + I40E_HASH_LUT_SIZE_512 = 1, +}; + +/* Structure to hold a per PF filter control settings */ +struct i40e_filter_control_settings { + /* number of PE Quad Hash filter buckets */ + enum i40e_hash_filter_size pe_filt_num; + /* number of PE Quad Hash contexts */ + enum i40e_dma_cntx_size pe_cntx_num; + /* number of FCoE filter buckets */ + enum i40e_hash_filter_size fcoe_filt_num; + /* number of FCoE DDP contexts */ + enum i40e_dma_cntx_size fcoe_cntx_num; + /* size of the Hash LUT */ + enum i40e_hash_lut_size hash_lut_size; + /* enable FDIR filters for PF and its VFs */ + bool enable_fdir; + /* enable Ethertype filters for PF 
and its VFs */ + bool enable_ethtype; + /* enable MAC/VLAN filters for PF and its VFs */ + bool enable_macvlan; +}; + +/* Structure to hold device level control filter counts */ +struct i40e_control_filter_stats { + u16 mac_etype_used; /* Used perfect match MAC/EtherType filters */ + u16 etype_used; /* Used perfect EtherType filters */ + u16 mac_etype_free; /* Un-used perfect match MAC/EtherType filters */ + u16 etype_free; /* Un-used perfect EtherType filters */ +}; + +enum i40e_reset_type { + I40E_RESET_POR = 0, + I40E_RESET_CORER = 1, + I40E_RESET_GLOBR = 2, + I40E_RESET_EMPR = 3, +}; + +/* IEEE 802.1AB LLDP Agent Variables from NVM */ +#define I40E_NVM_LLDP_CFG_PTR 0xD +struct i40e_lldp_variables { + u16 length; + u16 adminstatus; + u16 msgfasttx; + u16 msgtxinterval; + u16 txparams; + u16 timers; + u16 crc8; +}; + +/* Offsets into Alternate Ram */ +#define I40E_ALT_STRUCT_FIRST_PF_OFFSET 0 /* in dwords */ +#define I40E_ALT_STRUCT_DWORDS_PER_PF 64 /* in dwords */ +#define I40E_ALT_STRUCT_OUTER_VLAN_TAG_OFFSET 0xD /* in dwords */ +#define I40E_ALT_STRUCT_USER_PRIORITY_OFFSET 0xC /* in dwords */ +#define I40E_ALT_STRUCT_MIN_BW_OFFSET 0xE /* in dwords */ +#define I40E_ALT_STRUCT_MAX_BW_OFFSET 0xF /* in dwords */ + +/* Alternate Ram Bandwidth Masks */ +#define I40E_ALT_BW_VALUE_MASK 0xFF +#define I40E_ALT_BW_RELATIVE_MASK 0x40000000 +#define I40E_ALT_BW_VALID_MASK 0x80000000 + +/* RSS Hash Table Size */ +#define I40E_PFQF_CTL_0_HASHLUTSIZE_512 0x00010000 + +/* INPUT SET MASK for RSS, flow director, and flexible payload */ +#define I40E_L3_SRC_SHIFT 47 +#define I40E_L3_SRC_MASK (0x3ULL << I40E_L3_SRC_SHIFT) +#define I40E_L3_V6_SRC_SHIFT 43 +#define I40E_L3_V6_SRC_MASK (0xFFULL << I40E_L3_V6_SRC_SHIFT) +#define I40E_L3_DST_SHIFT 35 +#define I40E_L3_DST_MASK (0x3ULL << I40E_L3_DST_SHIFT) +#define I40E_L3_V6_DST_SHIFT 35 +#define I40E_L3_V6_DST_MASK (0xFFULL << I40E_L3_V6_DST_SHIFT) +#define I40E_L4_SRC_SHIFT 34 +#define I40E_L4_SRC_MASK (0x1ULL << I40E_L4_SRC_SHIFT) +#define I40E_L4_DST_SHIFT 33 +#define I40E_L4_DST_MASK (0x1ULL << I40E_L4_DST_SHIFT) +#define I40E_VERIFY_TAG_SHIFT 31 +#define I40E_VERIFY_TAG_MASK (0x3ULL << I40E_VERIFY_TAG_SHIFT) + +#define I40E_FLEX_50_SHIFT 13 +#define I40E_FLEX_50_MASK (0x1ULL << I40E_FLEX_50_SHIFT) +#define I40E_FLEX_51_SHIFT 12 +#define I40E_FLEX_51_MASK (0x1ULL << I40E_FLEX_51_SHIFT) +#define I40E_FLEX_52_SHIFT 11 +#define I40E_FLEX_52_MASK (0x1ULL << I40E_FLEX_52_SHIFT) +#define I40E_FLEX_53_SHIFT 10 +#define I40E_FLEX_53_MASK (0x1ULL << I40E_FLEX_53_SHIFT) +#define I40E_FLEX_54_SHIFT 9 +#define I40E_FLEX_54_MASK (0x1ULL << I40E_FLEX_54_SHIFT) +#define I40E_FLEX_55_SHIFT 8 +#define I40E_FLEX_55_MASK (0x1ULL << I40E_FLEX_55_SHIFT) +#define I40E_FLEX_56_SHIFT 7 +#define I40E_FLEX_56_MASK (0x1ULL << I40E_FLEX_56_SHIFT) +#define I40E_FLEX_57_SHIFT 6 +#define I40E_FLEX_57_MASK (0x1ULL << I40E_FLEX_57_SHIFT) + +/* Version format for Dynamic Device Personalization(DDP) */ +struct i40e_ddp_version { + u8 major; + u8 minor; + u8 update; + u8 draft; +}; + +#define I40E_DDP_NAME_SIZE 32 + +/* Package header */ +struct i40e_package_header { + struct i40e_ddp_version version; + u32 segment_count; + u32 segment_offset[1]; +}; + +/* Generic segment header */ +struct i40e_generic_seg_header { +#define SEGMENT_TYPE_METADATA 0x00000001 +#define SEGMENT_TYPE_NOTES 0x00000002 +#define SEGMENT_TYPE_I40E 0x00000011 +#define SEGMENT_TYPE_X722 0x00000012 + u32 type; + struct i40e_ddp_version version; + u32 size; + char name[I40E_DDP_NAME_SIZE]; +}; + +struct 
i40e_metadata_segment { + struct i40e_generic_seg_header header; + struct i40e_ddp_version version; + u32 track_id; + char name[I40E_DDP_NAME_SIZE]; +}; + +struct i40e_device_id_entry { + u32 vendor_dev_id; + u32 sub_vendor_dev_id; +}; + +struct i40e_profile_segment { + struct i40e_generic_seg_header header; + struct i40e_ddp_version version; + char name[I40E_DDP_NAME_SIZE]; + u32 device_table_count; + struct i40e_device_id_entry device_table[1]; +}; + +struct i40e_section_table { + u32 section_count; + u32 section_offset[1]; +}; + +struct i40e_profile_section_header { + u16 tbl_size; + u16 data_end; + struct { +#define SECTION_TYPE_INFO 0x00000010 +#define SECTION_TYPE_MMIO 0x00000800 +#define SECTION_TYPE_AQ 0x00000801 +#define SECTION_TYPE_NOTE 0x80000000 +#define SECTION_TYPE_NAME 0x80000001 + u32 type; + u32 offset; + u32 size; + } section; +}; + +struct i40e_profile_info { + u32 track_id; + struct i40e_ddp_version version; + u8 op; +#define I40E_DDP_ADD_TRACKID 0x01 +#define I40E_DDP_REMOVE_TRACKID 0x02 + u8 reserved[7]; + u8 name[I40E_DDP_NAME_SIZE]; +}; +#endif /* _I40E_TYPE_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c new file mode 100644 index 0000000..35173cb --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -0,0 +1,4284 @@ +// SPDX-License-Identifier: GPL-2.0 +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#include "i40e.h" + +/*********************notification routines***********************/ + +/** + * i40e_vc_vf_broadcast + * @pf: pointer to the PF structure + * @opcode: operation code + * @retval: return value + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * send a message to all VFs on a given PF + **/ +static void i40e_vc_vf_broadcast(struct i40e_pf *pf, + enum virtchnl_ops v_opcode, + i40e_status v_retval, u8 *msg, + u16 msglen) +{ + struct i40e_hw *hw = &pf->hw; + struct i40e_vf *vf = pf->vf; + int i; + + for (i = 0; i < pf->num_alloc_vfs; i++, vf++) { + int abs_vf_id = vf->vf_id + (int)hw->func_caps.vf_base_id; + /* Not all vfs are enabled so skip the ones that are not */ + if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states) && + !test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) + continue; + + /* Ignore return value on purpose - a given VF may fail, but + * we need to keep going and send to all of them + */ + i40e_aq_send_msg_to_vf(hw, abs_vf_id, v_opcode, v_retval, + msg, msglen, NULL); + } +} + +/** + * i40e_vc_notify_vf_link_state + * @vf: pointer to the VF structure + * + * send a link status message to a single VF + **/ +static void i40e_vc_notify_vf_link_state(struct i40e_vf *vf) +{ + struct virtchnl_pf_event pfe; + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + struct i40e_link_status *ls = &pf->hw.phy.link_info; + int abs_vf_id = vf->vf_id + (int)hw->func_caps.vf_base_id; + + pfe.event = VIRTCHNL_EVENT_LINK_CHANGE; + pfe.severity = PF_EVENT_SEVERITY_INFO; + if (vf->link_forced) { + pfe.event_data.link_event.link_status = vf->link_up; + pfe.event_data.link_event.link_speed = + (vf->link_up ? 
VIRTCHNL_LINK_SPEED_40GB : 0); + } else { + pfe.event_data.link_event.link_status = + ls->link_info & I40E_AQ_LINK_UP; + pfe.event_data.link_event.link_speed = + i40e_virtchnl_link_speed(ls->link_speed); + } + i40e_aq_send_msg_to_vf(hw, abs_vf_id, VIRTCHNL_OP_EVENT, + 0, (u8 *)&pfe, sizeof(pfe), NULL); +} + +/** + * i40e_vc_notify_link_state + * @pf: pointer to the PF structure + * + * send a link status message to all VFs on a given PF + **/ +void i40e_vc_notify_link_state(struct i40e_pf *pf) +{ + int i; + + for (i = 0; i < pf->num_alloc_vfs; i++) + i40e_vc_notify_vf_link_state(&pf->vf[i]); +} + +/** + * i40e_vc_notify_reset + * @pf: pointer to the PF structure + * + * indicate a pending reset to all VFs on a given PF + **/ +void i40e_vc_notify_reset(struct i40e_pf *pf) +{ + struct virtchnl_pf_event pfe; + + pfe.event = VIRTCHNL_EVENT_RESET_IMPENDING; + pfe.severity = PF_EVENT_SEVERITY_CERTAIN_DOOM; + i40e_vc_vf_broadcast(pf, VIRTCHNL_OP_EVENT, 0, + (u8 *)&pfe, sizeof(struct virtchnl_pf_event)); +} + +/** + * i40e_vc_notify_vf_reset + * @vf: pointer to the VF structure + * + * indicate a pending reset to the given VF + **/ +void i40e_vc_notify_vf_reset(struct i40e_vf *vf) +{ + struct virtchnl_pf_event pfe; + int abs_vf_id; + + /* validate the request */ + if (!vf || vf->vf_id >= vf->pf->num_alloc_vfs) + return; + + /* verify if the VF is in either init or active before proceeding */ + if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states) && + !test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) + return; + + abs_vf_id = vf->vf_id + (int)vf->pf->hw.func_caps.vf_base_id; + + pfe.event = VIRTCHNL_EVENT_RESET_IMPENDING; + pfe.severity = PF_EVENT_SEVERITY_CERTAIN_DOOM; + i40e_aq_send_msg_to_vf(&vf->pf->hw, abs_vf_id, VIRTCHNL_OP_EVENT, + 0, (u8 *)&pfe, + sizeof(struct virtchnl_pf_event), NULL); +} +/***********************misc routines*****************************/ + +/** + * i40e_vc_disable_vf + * @vf: pointer to the VF info + * + * Disable the VF through a SW reset. + **/ +static inline void i40e_vc_disable_vf(struct i40e_vf *vf) +{ + int i; + + i40e_vc_notify_vf_reset(vf); + + /* We want to ensure that an actual reset occurs initiated after this + * function was called. However, we do not want to wait forever, so + * we'll give a reasonable time and print a message if we failed to + * ensure a reset. 
+ */ + for (i = 0; i < 20; i++) { + if (i40e_reset_vf(vf, false)) + return; + usleep_range(10000, 20000); + } + + dev_warn(&vf->pf->pdev->dev, + "Failed to initiate reset for VF %d after 200 milliseconds\n", + vf->vf_id); +} + +/** + * i40e_vc_isvalid_vsi_id + * @vf: pointer to the VF info + * @vsi_id: VF relative VSI id + * + * check for the valid VSI id + **/ +static inline bool i40e_vc_isvalid_vsi_id(struct i40e_vf *vf, u16 vsi_id) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = i40e_find_vsi_from_id(pf, vsi_id); + + return (vsi && (vsi->vf_id == vf->vf_id)); +} + +/** + * i40e_vc_isvalid_queue_id + * @vf: pointer to the VF info + * @vsi_id: vsi id + * @qid: vsi relative queue id + * + * check for the valid queue id + **/ +static inline bool i40e_vc_isvalid_queue_id(struct i40e_vf *vf, u16 vsi_id, + u8 qid) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = i40e_find_vsi_from_id(pf, vsi_id); + + return (vsi && (qid < vsi->alloc_queue_pairs)); +} + +/** + * i40e_vc_isvalid_vector_id + * @vf: pointer to the VF info + * @vector_id: VF relative vector id + * + * check for the valid vector id + **/ +static inline bool i40e_vc_isvalid_vector_id(struct i40e_vf *vf, u8 vector_id) +{ + struct i40e_pf *pf = vf->pf; + + return vector_id < pf->hw.func_caps.num_msix_vectors_vf; +} + +/***********************vf resource mgmt routines*****************/ + +/** + * i40e_vc_get_pf_queue_id + * @vf: pointer to the VF info + * @vsi_id: id of VSI as provided by the FW + * @vsi_queue_id: vsi relative queue id + * + * return PF relative queue id + **/ +static u16 i40e_vc_get_pf_queue_id(struct i40e_vf *vf, u16 vsi_id, + u8 vsi_queue_id) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = i40e_find_vsi_from_id(pf, vsi_id); + u16 pf_queue_id = I40E_QUEUE_END_OF_LIST; + + if (!vsi) + return pf_queue_id; + + if (le16_to_cpu(vsi->info.mapping_flags) & + I40E_AQ_VSI_QUE_MAP_NONCONTIG) + pf_queue_id = + le16_to_cpu(vsi->info.queue_mapping[vsi_queue_id]); + else + pf_queue_id = le16_to_cpu(vsi->info.queue_mapping[0]) + + vsi_queue_id; + + return pf_queue_id; +} + +/** + * i40e_get_real_pf_qid + * @vf: pointer to the VF info + * @vsi_id: vsi id + * @queue_id: queue number + * + * wrapper function to get pf_queue_id handling ADq code as well + **/ +static u16 i40e_get_real_pf_qid(struct i40e_vf *vf, u16 vsi_id, u16 queue_id) +{ + int i; + + if (vf->adq_enabled) { + /* Although VF considers all the queues(can be 1 to 16) as its + * own but they may actually belong to different VSIs(up to 4). + * We need to find which queues belongs to which VSI. + */ + for (i = 0; i < vf->num_tc; i++) { + if (queue_id < vf->ch[i].num_qps) { + vsi_id = vf->ch[i].vsi_id; + break; + } + /* find right queue id which is relative to a + * given VSI. 
+ */ + queue_id -= vf->ch[i].num_qps; + } + } + + return i40e_vc_get_pf_queue_id(vf, vsi_id, queue_id); +} + +/** + * i40e_config_irq_link_list + * @vf: pointer to the VF info + * @vsi_id: id of VSI as given by the FW + * @vecmap: irq map info + * + * configure irq link list from the map + **/ +static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id, + struct virtchnl_vector_map *vecmap) +{ + unsigned long linklistmap = 0, tempmap; + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + u16 vsi_queue_id, pf_queue_id; + enum i40e_queue_type qtype; + u16 next_q, vector_id, size; + u32 reg, reg_idx; + u16 itr_idx = 0; + + vector_id = vecmap->vector_id; + /* setup the head */ + if (0 == vector_id) + reg_idx = I40E_VPINT_LNKLST0(vf->vf_id); + else + reg_idx = I40E_VPINT_LNKLSTN( + ((pf->hw.func_caps.num_msix_vectors_vf - 1) * vf->vf_id) + + (vector_id - 1)); + + if (vecmap->rxq_map == 0 && vecmap->txq_map == 0) { + /* Special case - No queues mapped on this vector */ + wr32(hw, reg_idx, I40E_VPINT_LNKLST0_FIRSTQ_INDX_MASK); + goto irq_list_done; + } + tempmap = vecmap->rxq_map; + for_each_set_bit(vsi_queue_id, &tempmap, I40E_MAX_VSI_QP) { + linklistmap |= (BIT(I40E_VIRTCHNL_SUPPORTED_QTYPES * + vsi_queue_id)); + } + + tempmap = vecmap->txq_map; + for_each_set_bit(vsi_queue_id, &tempmap, I40E_MAX_VSI_QP) { + linklistmap |= (BIT(I40E_VIRTCHNL_SUPPORTED_QTYPES * + vsi_queue_id + 1)); + } + + size = I40E_MAX_VSI_QP * I40E_VIRTCHNL_SUPPORTED_QTYPES; + next_q = find_first_bit(&linklistmap, size); + if (unlikely(next_q == size)) + goto irq_list_done; + + vsi_queue_id = next_q / I40E_VIRTCHNL_SUPPORTED_QTYPES; + qtype = next_q % I40E_VIRTCHNL_SUPPORTED_QTYPES; + pf_queue_id = i40e_get_real_pf_qid(vf, vsi_id, vsi_queue_id); + reg = ((qtype << I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT) | pf_queue_id); + + wr32(hw, reg_idx, reg); + + while (next_q < size) { + switch (qtype) { + case I40E_QUEUE_TYPE_RX: + reg_idx = I40E_QINT_RQCTL(pf_queue_id); + itr_idx = vecmap->rxitr_idx; + break; + case I40E_QUEUE_TYPE_TX: + reg_idx = I40E_QINT_TQCTL(pf_queue_id); + itr_idx = vecmap->txitr_idx; + break; + default: + break; + } + + next_q = find_next_bit(&linklistmap, size, next_q + 1); + if (next_q < size) { + vsi_queue_id = next_q / I40E_VIRTCHNL_SUPPORTED_QTYPES; + qtype = next_q % I40E_VIRTCHNL_SUPPORTED_QTYPES; + pf_queue_id = i40e_get_real_pf_qid(vf, + vsi_id, + vsi_queue_id); + } else { + pf_queue_id = I40E_QUEUE_END_OF_LIST; + qtype = 0; + } + + /* format for the RQCTL & TQCTL regs is same */ + reg = (vector_id) | + (qtype << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | + (pf_queue_id << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | + BIT(I40E_QINT_RQCTL_CAUSE_ENA_SHIFT) | + (itr_idx << I40E_QINT_RQCTL_ITR_INDX_SHIFT); + wr32(hw, reg_idx, reg); + } + + /* if the vf is running in polling mode and using interrupt zero, + * need to disable auto-mask on enabling zero interrupt for VFs. + */ + if ((vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_POLLING) && + (vector_id == 0)) { + reg = rd32(hw, I40E_GLINT_CTL); + if (!(reg & I40E_GLINT_CTL_DIS_AUTOMASK_VF0_MASK)) { + reg |= I40E_GLINT_CTL_DIS_AUTOMASK_VF0_MASK; + wr32(hw, I40E_GLINT_CTL, reg); + } + } + +irq_list_done: + i40e_flush(hw); +} + +/** + * i40e_release_iwarp_qvlist + * @vf: pointer to the VF. 
+ * + **/ +static void i40e_release_iwarp_qvlist(struct i40e_vf *vf) +{ + struct i40e_pf *pf = vf->pf; + struct virtchnl_iwarp_qvlist_info *qvlist_info = vf->qvlist_info; + u32 msix_vf; + u32 i; + + if (!vf->qvlist_info) + return; + + msix_vf = pf->hw.func_caps.num_msix_vectors_vf; + for (i = 0; i < qvlist_info->num_vectors; i++) { + struct virtchnl_iwarp_qv_info *qv_info; + u32 next_q_index, next_q_type; + struct i40e_hw *hw = &pf->hw; + u32 v_idx, reg_idx, reg; + + qv_info = &qvlist_info->qv_info[i]; + if (!qv_info) + continue; + v_idx = qv_info->v_idx; + if (qv_info->ceq_idx != I40E_QUEUE_INVALID_IDX) { + /* Figure out the queue after CEQ and make that the + * first queue. + */ + reg_idx = (msix_vf - 1) * vf->vf_id + qv_info->ceq_idx; + reg = rd32(hw, I40E_VPINT_CEQCTL(reg_idx)); + next_q_index = (reg & I40E_VPINT_CEQCTL_NEXTQ_INDX_MASK) + >> I40E_VPINT_CEQCTL_NEXTQ_INDX_SHIFT; + next_q_type = (reg & I40E_VPINT_CEQCTL_NEXTQ_TYPE_MASK) + >> I40E_VPINT_CEQCTL_NEXTQ_TYPE_SHIFT; + + reg_idx = ((msix_vf - 1) * vf->vf_id) + (v_idx - 1); + reg = (next_q_index & + I40E_VPINT_LNKLSTN_FIRSTQ_INDX_MASK) | + (next_q_type << + I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + + wr32(hw, I40E_VPINT_LNKLSTN(reg_idx), reg); + } + } + kfree(vf->qvlist_info); + vf->qvlist_info = NULL; +} + +/** + * i40e_config_iwarp_qvlist + * @vf: pointer to the VF info + * @qvlist_info: queue and vector list + * + * Return 0 on success or < 0 on error + **/ +static int i40e_config_iwarp_qvlist(struct i40e_vf *vf, + struct virtchnl_iwarp_qvlist_info *qvlist_info) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + struct virtchnl_iwarp_qv_info *qv_info; + u32 v_idx, i, reg_idx, reg; + u32 next_q_idx, next_q_type; + u32 msix_vf, size; + + size = sizeof(struct virtchnl_iwarp_qvlist_info) + + (sizeof(struct virtchnl_iwarp_qv_info) * + (qvlist_info->num_vectors - 1)); + vf->qvlist_info = kzalloc(size, GFP_KERNEL); + if (!vf->qvlist_info) + return -ENOMEM; + + vf->qvlist_info->num_vectors = qvlist_info->num_vectors; + + msix_vf = pf->hw.func_caps.num_msix_vectors_vf; + for (i = 0; i < qvlist_info->num_vectors; i++) { + qv_info = &qvlist_info->qv_info[i]; + if (!qv_info) + continue; + v_idx = qv_info->v_idx; + + /* Validate vector id belongs to this vf */ + if (!i40e_vc_isvalid_vector_id(vf, v_idx)) + goto err; + + vf->qvlist_info->qv_info[i] = *qv_info; + + reg_idx = ((msix_vf - 1) * vf->vf_id) + (v_idx - 1); + /* We might be sharing the interrupt, so get the first queue + * index and type, push it down the list by adding the new + * queue on top. Also link it with the new queue in CEQCTL. 
+ */ + reg = rd32(hw, I40E_VPINT_LNKLSTN(reg_idx)); + next_q_idx = ((reg & I40E_VPINT_LNKLSTN_FIRSTQ_INDX_MASK) >> + I40E_VPINT_LNKLSTN_FIRSTQ_INDX_SHIFT); + next_q_type = ((reg & I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_MASK) >> + I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + + if (qv_info->ceq_idx != I40E_QUEUE_INVALID_IDX) { + reg_idx = (msix_vf - 1) * vf->vf_id + qv_info->ceq_idx; + reg = (I40E_VPINT_CEQCTL_CAUSE_ENA_MASK | + (v_idx << I40E_VPINT_CEQCTL_MSIX_INDX_SHIFT) | + (qv_info->itr_idx << I40E_VPINT_CEQCTL_ITR_INDX_SHIFT) | + (next_q_type << I40E_VPINT_CEQCTL_NEXTQ_TYPE_SHIFT) | + (next_q_idx << I40E_VPINT_CEQCTL_NEXTQ_INDX_SHIFT)); + wr32(hw, I40E_VPINT_CEQCTL(reg_idx), reg); + + reg_idx = ((msix_vf - 1) * vf->vf_id) + (v_idx - 1); + reg = (qv_info->ceq_idx & + I40E_VPINT_LNKLSTN_FIRSTQ_INDX_MASK) | + (I40E_QUEUE_TYPE_PE_CEQ << + I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + wr32(hw, I40E_VPINT_LNKLSTN(reg_idx), reg); + } + + if (qv_info->aeq_idx != I40E_QUEUE_INVALID_IDX) { + reg = (I40E_VPINT_AEQCTL_CAUSE_ENA_MASK | + (v_idx << I40E_VPINT_AEQCTL_MSIX_INDX_SHIFT) | + (qv_info->itr_idx << I40E_VPINT_AEQCTL_ITR_INDX_SHIFT)); + + wr32(hw, I40E_VPINT_AEQCTL(vf->vf_id), reg); + } + } + + return 0; +err: + kfree(vf->qvlist_info); + vf->qvlist_info = NULL; + return -EINVAL; +} + +/** + * i40e_config_vsi_tx_queue + * @vf: pointer to the VF info + * @vsi_id: id of VSI as provided by the FW + * @vsi_queue_id: vsi relative queue index + * @info: config. info + * + * configure tx queue + **/ +static int i40e_config_vsi_tx_queue(struct i40e_vf *vf, u16 vsi_id, + u16 vsi_queue_id, + struct virtchnl_txq_info *info) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + struct i40e_hmc_obj_txq tx_ctx; + struct i40e_vsi *vsi; + u16 pf_queue_id; + u32 qtx_ctl; + int ret = 0; + + if (!i40e_vc_isvalid_vsi_id(vf, info->vsi_id)) { + ret = -ENOENT; + goto error_context; + } + pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id); + vsi = i40e_find_vsi_from_id(pf, vsi_id); + if (!vsi) { + ret = -ENOENT; + goto error_context; + } + + /* clear the context structure first */ + memset(&tx_ctx, 0, sizeof(struct i40e_hmc_obj_txq)); + + /* only set the required fields */ + tx_ctx.base = info->dma_ring_addr / 128; + tx_ctx.qlen = info->ring_len; + tx_ctx.rdylist = le16_to_cpu(vsi->info.qs_handle[0]); + tx_ctx.rdylist_act = 0; + tx_ctx.head_wb_ena = info->headwb_enabled; + tx_ctx.head_wb_addr = info->dma_headwb_addr; + + /* clear the context in the HMC */ + ret = i40e_clear_lan_tx_queue_context(hw, pf_queue_id); + if (ret) { + dev_err(&pf->pdev->dev, + "Failed to clear VF LAN Tx queue context %d, error: %d\n", + pf_queue_id, ret); + ret = -ENOENT; + goto error_context; + } + + /* set the context in the HMC */ + ret = i40e_set_lan_tx_queue_context(hw, pf_queue_id, &tx_ctx); + if (ret) { + dev_err(&pf->pdev->dev, + "Failed to set VF LAN Tx queue context %d error: %d\n", + pf_queue_id, ret); + ret = -ENOENT; + goto error_context; + } + + /* associate this queue with the PCI VF function */ + qtx_ctl = I40E_QTX_CTL_VF_QUEUE; + qtx_ctl |= ((hw->pf_id << I40E_QTX_CTL_PF_INDX_SHIFT) + & I40E_QTX_CTL_PF_INDX_MASK); + qtx_ctl |= (((vf->vf_id + hw->func_caps.vf_base_id) + << I40E_QTX_CTL_VFVM_INDX_SHIFT) + & I40E_QTX_CTL_VFVM_INDX_MASK); + wr32(hw, I40E_QTX_CTL(pf_queue_id), qtx_ctl); + i40e_flush(hw); + +error_context: + return ret; +} + +/** + * i40e_config_vsi_rx_queue + * @vf: pointer to the VF info + * @vsi_id: id of VSI as provided by the FW + * @vsi_queue_id: vsi relative queue index + * @info: config. 
info + * + * configure rx queue + **/ +static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, + u16 vsi_queue_id, + struct virtchnl_rxq_info *info) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + struct i40e_hmc_obj_rxq rx_ctx; + u16 pf_queue_id; + int ret = 0; + + pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id); + + /* clear the context structure first */ + memset(&rx_ctx, 0, sizeof(struct i40e_hmc_obj_rxq)); + + /* only set the required fields */ + rx_ctx.base = info->dma_ring_addr / 128; + rx_ctx.qlen = info->ring_len; + + if (info->splithdr_enabled) { + rx_ctx.hsplit_0 = I40E_RX_SPLIT_L2 | + I40E_RX_SPLIT_IP | + I40E_RX_SPLIT_TCP_UDP | + I40E_RX_SPLIT_SCTP; + /* header length validation */ + if (info->hdr_size > ((2 * 1024) - 64)) { + ret = -EINVAL; + goto error_param; + } + rx_ctx.hbuff = info->hdr_size >> I40E_RXQ_CTX_HBUFF_SHIFT; + + /* set split mode 10b */ + rx_ctx.dtype = I40E_RX_DTYPE_HEADER_SPLIT; + } + + /* databuffer length validation */ + if (info->databuffer_size > ((16 * 1024) - 128)) { + ret = -EINVAL; + goto error_param; + } + rx_ctx.dbuff = info->databuffer_size >> I40E_RXQ_CTX_DBUFF_SHIFT; + + /* max pkt. length validation */ + if (info->max_pkt_size >= (16 * 1024) || info->max_pkt_size < 64) { + ret = -EINVAL; + goto error_param; + } + rx_ctx.rxmax = info->max_pkt_size; + + /* enable 32bytes desc always */ + rx_ctx.dsize = 1; + + /* default values */ + rx_ctx.lrxqthresh = 1; + rx_ctx.crcstrip = 1; + rx_ctx.prefena = 1; + rx_ctx.l2tsel = 1; + + /* clear the context in the HMC */ + ret = i40e_clear_lan_rx_queue_context(hw, pf_queue_id); + if (ret) { + dev_err(&pf->pdev->dev, + "Failed to clear VF LAN Rx queue context %d, error: %d\n", + pf_queue_id, ret); + ret = -ENOENT; + goto error_param; + } + + /* set the context in the HMC */ + ret = i40e_set_lan_rx_queue_context(hw, pf_queue_id, &rx_ctx); + if (ret) { + dev_err(&pf->pdev->dev, + "Failed to set VF LAN Rx queue context %d error: %d\n", + pf_queue_id, ret); + ret = -ENOENT; + goto error_param; + } + +error_param: + return ret; +} + +/** + * i40e_alloc_vsi_res + * @vf: pointer to the VF info + * @idx: VSI index, applies only for ADq mode, zero otherwise + * + * alloc VF vsi context & resources + **/ +static int i40e_alloc_vsi_res(struct i40e_vf *vf, u8 idx) +{ + struct i40e_mac_filter *f = NULL; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi; + u64 max_tx_rate = 0; + int ret = 0; + + vsi = i40e_vsi_setup(pf, I40E_VSI_SRIOV, pf->vsi[pf->lan_vsi]->seid, + vf->vf_id); + + if (!vsi) { + dev_err(&pf->pdev->dev, + "add vsi failed for VF %d, aq_err %d\n", + vf->vf_id, pf->hw.aq.asq_last_status); + ret = -ENOENT; + goto error_alloc_vsi_res; + } + + if (!idx) { + u64 hena = i40e_pf_get_default_rss_hena(pf); + u8 broadcast[ETH_ALEN]; + + vf->lan_vsi_idx = vsi->idx; + vf->lan_vsi_id = vsi->id; + /* If the port VLAN has been configured and then the + * VF driver was removed then the VSI port VLAN + * configuration was destroyed. Check if there is + * a port VLAN and restore the VSI configuration if + * needed. 
+ */ + if (vf->port_vlan_id) + i40e_vsi_add_pvid(vsi, vf->port_vlan_id); + + spin_lock_bh(&vsi->mac_filter_hash_lock); + if (is_valid_ether_addr(vf->default_lan_addr.addr)) { + f = i40e_add_mac_filter(vsi, + vf->default_lan_addr.addr); + if (!f) + dev_info(&pf->pdev->dev, + "Could not add MAC filter %pM for VF %d\n", + vf->default_lan_addr.addr, vf->vf_id); + } + eth_broadcast_addr(broadcast); + f = i40e_add_mac_filter(vsi, broadcast); + if (!f) + dev_info(&pf->pdev->dev, + "Could not allocate VF broadcast filter\n"); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + wr32(&pf->hw, I40E_VFQF_HENA1(0, vf->vf_id), (u32)hena); + wr32(&pf->hw, I40E_VFQF_HENA1(1, vf->vf_id), (u32)(hena >> 32)); + /* program mac filter only for VF VSI */ + ret = i40e_sync_vsi_filters(vsi); + if (ret) + dev_err(&pf->pdev->dev, "Unable to program ucast filters\n"); + } + + /* storing VSI index and id for ADq and don't apply the mac filter */ + if (vf->adq_enabled) { + vf->ch[idx].vsi_idx = vsi->idx; + vf->ch[idx].vsi_id = vsi->id; + } + + /* Set VF bandwidth if specified */ + if (vf->tx_rate) { + max_tx_rate = vf->tx_rate; + } else if (vf->ch[idx].max_tx_rate) { + max_tx_rate = vf->ch[idx].max_tx_rate; + } + + if (max_tx_rate) { + max_tx_rate = div_u64(max_tx_rate, I40E_BW_CREDIT_DIVISOR); + ret = i40e_aq_config_vsi_bw_limit(&pf->hw, vsi->seid, + max_tx_rate, 0, NULL); + if (ret) + dev_err(&pf->pdev->dev, "Unable to set tx rate, VF %d, error code %d.\n", + vf->vf_id, ret); + } + +error_alloc_vsi_res: + return ret; +} + +/** + * i40e_map_pf_queues_to_vsi + * @vf: pointer to the VF info + * + * PF maps LQPs to a VF by programming VSILAN_QTABLE & VPLAN_QTABLE. This + * function takes care of first part VSILAN_QTABLE, mapping pf queues to VSI. + **/ +static void i40e_map_pf_queues_to_vsi(struct i40e_vf *vf) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + u32 reg, num_tc = 1; /* VF has at least one traffic class */ + u16 vsi_id, qps; + int i, j; + + if (vf->adq_enabled) + num_tc = vf->num_tc; + + for (i = 0; i < num_tc; i++) { + if (vf->adq_enabled) { + qps = vf->ch[i].num_qps; + vsi_id = vf->ch[i].vsi_id; + } else { + qps = pf->vsi[vf->lan_vsi_idx]->alloc_queue_pairs; + vsi_id = vf->lan_vsi_id; + } + + for (j = 0; j < 7; j++) { + if (j * 2 >= qps) { + /* end of list */ + reg = 0x07FF07FF; + } else { + u16 qid = i40e_vc_get_pf_queue_id(vf, + vsi_id, + j * 2); + reg = qid; + qid = i40e_vc_get_pf_queue_id(vf, vsi_id, + (j * 2) + 1); + reg |= qid << 16; + } + i40e_write_rx_ctl(hw, + I40E_VSILAN_QTABLE(j, vsi_id), + reg); + } + } +} + +/** + * i40e_map_pf_to_vf_queues + * @vf: pointer to the VF info + * + * PF maps LQPs to a VF by programming VSILAN_QTABLE & VPLAN_QTABLE. This + * function takes care of the second part VPLAN_QTABLE & completes VF mappings. 
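+ * Each VPLAN_QTABLE entry translates one VF-relative queue index into the
+ * absolute PF queue id returned by i40e_vc_get_pf_queue_id(). When ADq is
+ * enabled, the queues of all traffic-class VSIs are written back to back,
+ * which is why total_qps keeps counting across the outer TC loop.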
+ **/ +static void i40e_map_pf_to_vf_queues(struct i40e_vf *vf) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + u32 reg, total_qps = 0; + u32 qps, num_tc = 1; /* VF has at least one traffic class */ + u16 vsi_id, qid; + int i, j; + + if (vf->adq_enabled) + num_tc = vf->num_tc; + + for (i = 0; i < num_tc; i++) { + if (vf->adq_enabled) { + qps = vf->ch[i].num_qps; + vsi_id = vf->ch[i].vsi_id; + } else { + qps = pf->vsi[vf->lan_vsi_idx]->alloc_queue_pairs; + vsi_id = vf->lan_vsi_id; + } + + for (j = 0; j < qps; j++) { + qid = i40e_vc_get_pf_queue_id(vf, vsi_id, j); + + reg = (qid & I40E_VPLAN_QTABLE_QINDEX_MASK); + wr32(hw, I40E_VPLAN_QTABLE(total_qps, vf->vf_id), + reg); + total_qps++; + } + } +} + +/** + * i40e_enable_vf_mappings + * @vf: pointer to the VF info + * + * enable VF mappings + **/ +static void i40e_enable_vf_mappings(struct i40e_vf *vf) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + u32 reg; + + /* Tell the hardware we're using noncontiguous mapping. HW requires + * that VF queues be mapped using this method, even when they are + * contiguous in real life + */ + i40e_write_rx_ctl(hw, I40E_VSILAN_QBASE(vf->lan_vsi_id), + I40E_VSILAN_QBASE_VSIQTABLE_ENA_MASK); + + /* enable VF vplan_qtable mappings */ + reg = I40E_VPLAN_MAPENA_TXRX_ENA_MASK; + wr32(hw, I40E_VPLAN_MAPENA(vf->vf_id), reg); + + i40e_map_pf_to_vf_queues(vf); + i40e_map_pf_queues_to_vsi(vf); + + i40e_flush(hw); +} + +/** + * i40e_disable_vf_mappings + * @vf: pointer to the VF info + * + * disable VF mappings + **/ +static void i40e_disable_vf_mappings(struct i40e_vf *vf) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + int i; + + /* disable qp mappings */ + wr32(hw, I40E_VPLAN_MAPENA(vf->vf_id), 0); + for (i = 0; i < I40E_MAX_VSI_QP; i++) + wr32(hw, I40E_VPLAN_QTABLE(i, vf->vf_id), + I40E_QUEUE_END_OF_LIST); + i40e_flush(hw); +} + +/** + * i40e_free_vf_res + * @vf: pointer to the VF info + * + * free VF resources + **/ +static void i40e_free_vf_res(struct i40e_vf *vf) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + u32 reg_idx, reg; + int i, j, msix_vf; + + /* Start by disabling VF's configuration API to prevent the OS from + * accessing the VF's VSI after it's freed / invalidated. + */ + clear_bit(I40E_VF_STATE_INIT, &vf->vf_states); + + /* It's possible the VF had requeuested more queues than the default so + * do the accounting here when we're about to free them. 
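+	 * (Only the surplus above I40E_DEFAULT_QUEUES_PER_VF is handed back
+	 * to pf->queues_left, mirroring the adjustment made in
+	 * i40e_alloc_vf_res() when the extra queues were granted.)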
+ */ + if (vf->num_queue_pairs > I40E_DEFAULT_QUEUES_PER_VF) { + pf->queues_left += vf->num_queue_pairs - + I40E_DEFAULT_QUEUES_PER_VF; + } + + /* free vsi & disconnect it from the parent uplink */ + if (vf->lan_vsi_idx) { + i40e_vsi_release(pf->vsi[vf->lan_vsi_idx]); + vf->lan_vsi_idx = 0; + vf->lan_vsi_id = 0; + vf->num_mac = 0; + } + + /* do the accounting and remove additional ADq VSI's */ + if (vf->adq_enabled && vf->ch[0].vsi_idx) { + for (j = 0; j < vf->num_tc; j++) { + /* At this point VSI0 is already released so don't + * release it again and only clear their values in + * structure variables + */ + if (j) + i40e_vsi_release(pf->vsi[vf->ch[j].vsi_idx]); + vf->ch[j].vsi_idx = 0; + vf->ch[j].vsi_id = 0; + } + } + msix_vf = pf->hw.func_caps.num_msix_vectors_vf; + + /* disable interrupts so the VF starts in a known state */ + for (i = 0; i < msix_vf; i++) { + /* format is same for both registers */ + if (0 == i) + reg_idx = I40E_VFINT_DYN_CTL0(vf->vf_id); + else + reg_idx = I40E_VFINT_DYN_CTLN(((msix_vf - 1) * + (vf->vf_id)) + + (i - 1)); + wr32(hw, reg_idx, I40E_VFINT_DYN_CTLN_CLEARPBA_MASK); + i40e_flush(hw); + } + + /* clear the irq settings */ + for (i = 0; i < msix_vf; i++) { + /* format is same for both registers */ + if (0 == i) + reg_idx = I40E_VPINT_LNKLST0(vf->vf_id); + else + reg_idx = I40E_VPINT_LNKLSTN(((msix_vf - 1) * + (vf->vf_id)) + + (i - 1)); + reg = (I40E_VPINT_LNKLSTN_FIRSTQ_TYPE_MASK | + I40E_VPINT_LNKLSTN_FIRSTQ_INDX_MASK); + wr32(hw, reg_idx, reg); + i40e_flush(hw); + } + /* reset some of the state variables keeping track of the resources */ + vf->num_queue_pairs = 0; + clear_bit(I40E_VF_STATE_MC_PROMISC, &vf->vf_states); + clear_bit(I40E_VF_STATE_UC_PROMISC, &vf->vf_states); +} + +/** + * i40e_alloc_vf_res + * @vf: pointer to the VF info + * + * allocate VF resources + **/ +static int i40e_alloc_vf_res(struct i40e_vf *vf) +{ + struct i40e_pf *pf = vf->pf; + int total_queue_pairs = 0; + int ret, idx; + + if (vf->num_req_queues && + vf->num_req_queues <= pf->queues_left + I40E_DEFAULT_QUEUES_PER_VF) + pf->num_vf_qps = vf->num_req_queues; + else + pf->num_vf_qps = I40E_DEFAULT_QUEUES_PER_VF; + + /* allocate hw vsi context & associated resources */ + ret = i40e_alloc_vsi_res(vf, 0); + if (ret) + goto error_alloc; + total_queue_pairs += pf->vsi[vf->lan_vsi_idx]->alloc_queue_pairs; + + /* allocate additional VSIs based on tc information for ADq */ + if (vf->adq_enabled) { + if (pf->queues_left >= + (I40E_MAX_VF_QUEUES - I40E_DEFAULT_QUEUES_PER_VF)) { + /* TC 0 always belongs to VF VSI */ + for (idx = 1; idx < vf->num_tc; idx++) { + ret = i40e_alloc_vsi_res(vf, idx); + if (ret) + goto error_alloc; + } + /* send correct number of queues */ + total_queue_pairs = I40E_MAX_VF_QUEUES; + } else { + dev_info(&pf->pdev->dev, "VF %d: Not enough queues to allocate, disabling ADq\n", + vf->vf_id); + vf->adq_enabled = false; + } + } + + /* We account for each VF to get a default number of queue pairs. If + * the VF has now requested more, we need to account for that to make + * certain we never request more queues than we actually have left in + * HW. 
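+	 * (The request was already clamped above: num_req_queues is honoured
+	 * only if it fits within pf->queues_left plus the default, and the
+	 * ADq path checks pf->queues_left explicitly, so this subtraction
+	 * should not drive pf->queues_left negative.)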
+ */ + if (total_queue_pairs > I40E_DEFAULT_QUEUES_PER_VF) + pf->queues_left -= + total_queue_pairs - I40E_DEFAULT_QUEUES_PER_VF; + + if (vf->trusted) + set_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + else + clear_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); + + /* store the total qps number for the runtime + * VF req validation + */ + vf->num_queue_pairs = total_queue_pairs; + + /* VF is now completely initialized */ + set_bit(I40E_VF_STATE_INIT, &vf->vf_states); + +error_alloc: + if (ret) + i40e_free_vf_res(vf); + + return ret; +} + +#define VF_DEVICE_STATUS 0xAA +#define VF_TRANS_PENDING_MASK 0x20 +/** + * i40e_quiesce_vf_pci + * @vf: pointer to the VF structure + * + * Wait for VF PCI transactions to be cleared after reset. Returns -EIO + * if the transactions never clear. + **/ +static int i40e_quiesce_vf_pci(struct i40e_vf *vf) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + int vf_abs_id, i; + u32 reg; + + vf_abs_id = vf->vf_id + hw->func_caps.vf_base_id; + + wr32(hw, I40E_PF_PCI_CIAA, + VF_DEVICE_STATUS | (vf_abs_id << I40E_PF_PCI_CIAA_VF_NUM_SHIFT)); + for (i = 0; i < 100; i++) { + reg = rd32(hw, I40E_PF_PCI_CIAD); + if ((reg & VF_TRANS_PENDING_MASK) == 0) + return 0; + udelay(1); + } + return -EIO; +} + +/** + * i40e_trigger_vf_reset + * @vf: pointer to the VF structure + * @flr: VFLR was issued or not + * + * Trigger hardware to start a reset for a particular VF. Expects the caller + * to wait the proper amount of time to allow hardware to reset the VF before + * it cleans up and restores VF functionality. + **/ +static void i40e_trigger_vf_reset(struct i40e_vf *vf, bool flr) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + u32 reg, reg_idx, bit_idx; + + /* warn the VF */ + clear_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states); + + /* Disable VF's configuration API during reset. The flag is re-enabled + * in i40e_alloc_vf_res(), when it's safe again to access VF's VSI. + * It's normally disabled in i40e_free_vf_res(), but it's safer + * to do it earlier to give some time to finish to any VF config + * functions that may still be running at this point. + */ + clear_bit(I40E_VF_STATE_INIT, &vf->vf_states); + + /* In the case of a VFLR, the HW has already reset the VF and we + * just need to clean up, so don't hit the VFRTRIG register. + */ + if (!flr) { + /* reset VF using VPGEN_VFRTRIG reg */ + reg = rd32(hw, I40E_VPGEN_VFRTRIG(vf->vf_id)); + reg |= I40E_VPGEN_VFRTRIG_VFSWR_MASK; + wr32(hw, I40E_VPGEN_VFRTRIG(vf->vf_id), reg); + i40e_flush(hw); + } + /* clear the VFLR bit in GLGEN_VFLRSTAT */ + reg_idx = (hw->func_caps.vf_base_id + vf->vf_id) / 32; + bit_idx = (hw->func_caps.vf_base_id + vf->vf_id) % 32; + wr32(hw, I40E_GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx)); + i40e_flush(hw); + + if (i40e_quiesce_vf_pci(vf)) + dev_err(&pf->pdev->dev, "VF %d PCI transactions stuck\n", + vf->vf_id); +} + +/** + * i40e_cleanup_reset_vf + * @vf: pointer to the VF structure + * + * Cleanup a VF after the hardware reset is finished. Expects the caller to + * have verified whether the reset is finished properly, and ensure the + * minimum amount of wait time has passed. + **/ +static void i40e_cleanup_reset_vf(struct i40e_vf *vf) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + u32 reg; + + /* free VF resources to begin resetting the VSI state */ + i40e_free_vf_res(vf); + + /* Enable hardware by clearing the reset bit in the VPGEN_VFRTRIG reg. + * By doing this we allow HW to access VF memory at any point. 
If we + * did it any sooner, HW could access memory while it was being freed + * in i40e_free_vf_res(), causing an IOMMU fault. + * + * On the other hand, this needs to be done ASAP, because the VF driver + * is waiting for this to happen and may report a timeout. It's + * harmless, but it gets logged into Guest OS kernel log, so best avoid + * it. + */ + reg = rd32(hw, I40E_VPGEN_VFRTRIG(vf->vf_id)); + reg &= ~I40E_VPGEN_VFRTRIG_VFSWR_MASK; + wr32(hw, I40E_VPGEN_VFRTRIG(vf->vf_id), reg); + + /* reallocate VF resources to finish resetting the VSI state */ + if (!i40e_alloc_vf_res(vf)) { + int abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; + i40e_enable_vf_mappings(vf); + set_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states); + clear_bit(I40E_VF_STATE_DISABLED, &vf->vf_states); + /* Do not notify the client during VF init */ + if (!test_and_clear_bit(I40E_VF_STATE_PRE_ENABLE, + &vf->vf_states)) + i40e_notify_client_of_vf_reset(pf, abs_vf_id); + vf->num_vlan = 0; + } + + /* Tell the VF driver the reset is done. This needs to be done only + * after VF has been fully initialized, because the VF driver may + * request resources immediately after setting this flag. + */ + wr32(hw, I40E_VFGEN_RSTAT1(vf->vf_id), VIRTCHNL_VFR_VFACTIVE); +} + +/** + * i40e_reset_vf + * @vf: pointer to the VF structure + * @flr: VFLR was issued or not + * + * Returns true if the VF is reset, false otherwise. + **/ +bool i40e_reset_vf(struct i40e_vf *vf, bool flr) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + bool rsd = false; + u32 reg; + int i; + + /* If the VFs have been disabled, this means something else is + * resetting the VF, so we shouldn't continue. + */ + if (test_and_set_bit(__I40E_VF_DISABLE, pf->state)) + return false; + + i40e_trigger_vf_reset(vf, flr); + + /* poll VPGEN_VFRSTAT reg to make sure + * that reset is complete + */ + for (i = 0; i < 10; i++) { + /* VF reset requires driver to first reset the VF and then + * poll the status register to make sure that the reset + * completed successfully. Due to internal HW FIFO flushes, + * we must wait 10ms before the register will be valid. + */ + usleep_range(10000, 20000); + reg = rd32(hw, I40E_VPGEN_VFRSTAT(vf->vf_id)); + if (reg & I40E_VPGEN_VFRSTAT_VFRD_MASK) { + rsd = true; + break; + } + } + + if (flr) + usleep_range(10000, 20000); + + if (!rsd) + dev_err(&pf->pdev->dev, "VF reset check timeout on VF %d\n", + vf->vf_id); + usleep_range(10000, 20000); + + /* On initial reset, we don't have any queues to disable */ + if (vf->lan_vsi_idx != 0) + i40e_vsi_stop_rings(pf->vsi[vf->lan_vsi_idx]); + + i40e_cleanup_reset_vf(vf); + + i40e_flush(hw); + clear_bit(__I40E_VF_DISABLE, pf->state); + + return true; +} + +/** + * i40e_reset_all_vfs + * @pf: pointer to the PF structure + * @flr: VFLR was issued or not + * + * Reset all allocated VFs in one go. First, tell the hardware to reset each + * VF, then do all the waiting in one chunk, and finally finish restoring each + * VF after the wait. This is useful during PF routines which need to reset + * all VFs, as otherwise it must perform these resets in a serialized fashion. + * + * Returns true if any VFs were reset, and false otherwise. 
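+ *
+ * Roughly, the sequence is:
+ *  1) trigger the reset on every VF (i40e_trigger_vf_reset)
+ *  2) poll VPGEN_VFRSTAT for all VFs in one shared wait loop
+ *  3) stop and wait for every VF's rings, then a single 50ms RX delay
+ *  4) finish each VF with i40e_cleanup_reset_vf()
+ * so the mandatory hardware delays are paid once for the whole set instead
+ * of once per VF as i40e_reset_vf() would.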
+ **/ +bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) +{ + struct i40e_hw *hw = &pf->hw; + struct i40e_vf *vf; + int i, v; + u32 reg; + + /* If we don't have any VFs, then there is nothing to reset */ + if (!pf->num_alloc_vfs) + return false; + + /* If VFs have been disabled, there is no need to reset */ + if (test_and_set_bit(__I40E_VF_DISABLE, pf->state)) + return false; + + /* Begin reset on all VFs at once */ + for (v = 0; v < pf->num_alloc_vfs; v++) + i40e_trigger_vf_reset(&pf->vf[v], flr); + + /* HW requires some time to make sure it can flush the FIFO for a VF + * when it resets it. Poll the VPGEN_VFRSTAT register for each VF in + * sequence to make sure that it has completed. We'll keep track of + * the VFs using a simple iterator that increments once that VF has + * finished resetting. + */ + for (i = 0, v = 0; i < 10 && v < pf->num_alloc_vfs; i++) { + usleep_range(10000, 20000); + + /* Check each VF in sequence, beginning with the VF to fail + * the previous check. + */ + while (v < pf->num_alloc_vfs) { + vf = &pf->vf[v]; + reg = rd32(hw, I40E_VPGEN_VFRSTAT(vf->vf_id)); + if (!(reg & I40E_VPGEN_VFRSTAT_VFRD_MASK)) + break; + + /* If the current VF has finished resetting, move on + * to the next VF in sequence. + */ + v++; + } + } + + if (flr) + usleep_range(10000, 20000); + + /* Display a warning if at least one VF didn't manage to reset in + * time, but continue on with the operation. + */ + if (v < pf->num_alloc_vfs) + dev_err(&pf->pdev->dev, "VF reset check timeout on VF %d\n", + pf->vf[v].vf_id); + usleep_range(10000, 20000); + + /* Begin disabling all the rings associated with VFs, but do not wait + * between each VF. + */ + for (v = 0; v < pf->num_alloc_vfs; v++) { + /* On initial reset, we don't have any queues to disable */ + if (pf->vf[v].lan_vsi_idx == 0) + continue; + + i40e_vsi_stop_rings_no_wait(pf->vsi[pf->vf[v].lan_vsi_idx]); + } + + /* Now that we've notified HW to disable all of the VF rings, wait + * until they finish. + */ + for (v = 0; v < pf->num_alloc_vfs; v++) { + /* On initial reset, we don't have any queues to disable */ + if (pf->vf[v].lan_vsi_idx == 0) + continue; + + i40e_vsi_wait_queues_disabled(pf->vsi[pf->vf[v].lan_vsi_idx]); + } + + /* Hw may need up to 50ms to finish disabling the RX queues. We + * minimize the wait by delaying only once for all VFs. + */ + mdelay(50); + + /* Finish the reset on each VF */ + for (v = 0; v < pf->num_alloc_vfs; v++) + i40e_cleanup_reset_vf(&pf->vf[v]); + + i40e_flush(hw); + clear_bit(__I40E_VF_DISABLE, pf->state); + + return true; +} + +/** + * i40e_free_vfs + * @pf: pointer to the PF structure + * + * free VF resources + **/ +void i40e_free_vfs(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + u32 reg_idx, bit_idx; + int i, tmp, vf_id; + + if (!pf->vf) + return; + while (test_and_set_bit(__I40E_VF_DISABLE, pf->state)) + usleep_range(1000, 2000); + + i40e_notify_client_of_vf_enable(pf, 0); + + /* Amortize wait time by stopping all VFs at the same time */ + for (i = 0; i < pf->num_alloc_vfs; i++) { + if (test_bit(I40E_VF_STATE_INIT, &pf->vf[i].vf_states)) + continue; + + i40e_vsi_stop_rings_no_wait(pf->vsi[pf->vf[i].lan_vsi_idx]); + } + + for (i = 0; i < pf->num_alloc_vfs; i++) { + if (test_bit(I40E_VF_STATE_INIT, &pf->vf[i].vf_states)) + continue; + + i40e_vsi_wait_queues_disabled(pf->vsi[pf->vf[i].lan_vsi_idx]); + } + + /* Disable IOV before freeing resources. This lets any VF drivers + * running in the host get themselves cleaned up before we yank + * the carpet out from underneath their feet. 
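+	 * (pci_disable_sriov() is only called when pci_vfs_assigned() says no
+	 * VF is passed through to a guest; otherwise SR-IOV is left enabled
+	 * and only the PF-side bookkeeping below is torn down.)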
+ */ + if (!pci_vfs_assigned(pf->pdev)) + pci_disable_sriov(pf->pdev); + else + dev_warn(&pf->pdev->dev, "VFs are assigned - not disabling SR-IOV\n"); + + /* free up VF resources */ + tmp = pf->num_alloc_vfs; + pf->num_alloc_vfs = 0; + for (i = 0; i < tmp; i++) { + if (test_bit(I40E_VF_STATE_INIT, &pf->vf[i].vf_states)) + i40e_free_vf_res(&pf->vf[i]); + /* disable qp mappings */ + i40e_disable_vf_mappings(&pf->vf[i]); + } + + kfree(pf->vf); + pf->vf = NULL; + + /* This check is for when the driver is unloaded while VFs are + * assigned. Setting the number of VFs to 0 through sysfs is caught + * before this function ever gets called. + */ + if (!pci_vfs_assigned(pf->pdev)) { + /* Acknowledge VFLR for all VFS. Without this, VFs will fail to + * work correctly when SR-IOV gets re-enabled. + */ + for (vf_id = 0; vf_id < tmp; vf_id++) { + reg_idx = (hw->func_caps.vf_base_id + vf_id) / 32; + bit_idx = (hw->func_caps.vf_base_id + vf_id) % 32; + wr32(hw, I40E_GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx)); + } + } + clear_bit(__I40E_VF_DISABLE, pf->state); +} + +#ifdef CONFIG_PCI_IOV +/** + * i40e_alloc_vfs + * @pf: pointer to the PF structure + * @num_alloc_vfs: number of VFs to allocate + * + * allocate VF resources + **/ +int i40e_alloc_vfs(struct i40e_pf *pf, u16 num_alloc_vfs) +{ + struct i40e_vf *vfs; + int i, ret = 0; + + /* Disable interrupt 0 so we don't try to handle the VFLR. */ + i40e_irq_dynamic_disable_icr0(pf); + + /* Check to see if we're just allocating resources for extant VFs */ + if (pci_num_vf(pf->pdev) != num_alloc_vfs) { + ret = pci_enable_sriov(pf->pdev, num_alloc_vfs); + if (ret) { + pf->flags &= ~I40E_FLAG_VEB_MODE_ENABLED; + pf->num_alloc_vfs = 0; + goto err_iov; + } + } + /* allocate memory */ + vfs = kcalloc(num_alloc_vfs, sizeof(struct i40e_vf), GFP_KERNEL); + if (!vfs) { + ret = -ENOMEM; + goto err_alloc; + } + pf->vf = vfs; + + /* apply default profile */ + for (i = 0; i < num_alloc_vfs; i++) { + vfs[i].pf = pf; + vfs[i].parent_type = I40E_SWITCH_ELEMENT_TYPE_VEB; + vfs[i].vf_id = i; + + /* assign default capabilities */ + set_bit(I40E_VIRTCHNL_VF_CAP_L2, &vfs[i].vf_caps); + vfs[i].spoofchk = true; + + set_bit(I40E_VF_STATE_PRE_ENABLE, &vfs[i].vf_states); + + } + pf->num_alloc_vfs = num_alloc_vfs; + + /* VF resources get allocated during reset */ + i40e_reset_all_vfs(pf, false); + + i40e_notify_client_of_vf_enable(pf, num_alloc_vfs); + +err_alloc: + if (ret) + i40e_free_vfs(pf); +err_iov: + /* Re-enable interrupt 0. */ + i40e_irq_dynamic_enable_icr0(pf); + return ret; +} + +#endif +/** + * i40e_pci_sriov_enable + * @pdev: pointer to a pci_dev structure + * @num_vfs: number of VFs to allocate + * + * Enable or change the number of VFs + **/ +static int i40e_pci_sriov_enable(struct pci_dev *pdev, int num_vfs) +{ +#ifdef CONFIG_PCI_IOV + struct i40e_pf *pf = pci_get_drvdata(pdev); + int pre_existing_vfs = pci_num_vf(pdev); + int err = 0; + + if (test_bit(__I40E_TESTING, pf->state)) { + dev_warn(&pdev->dev, + "Cannot enable SR-IOV virtual functions while the device is undergoing diagnostic testing\n"); + err = -EPERM; + goto err_out; + } + + if (pre_existing_vfs && pre_existing_vfs != num_vfs) + i40e_free_vfs(pf); + else if (pre_existing_vfs && pre_existing_vfs == num_vfs) + goto out; + + if (num_vfs > pf->num_req_vfs) { + dev_warn(&pdev->dev, "Unable to enable %d VFs. 
Limited to %d VFs due to device resource constraints.\n", + num_vfs, pf->num_req_vfs); + err = -EPERM; + goto err_out; + } + + dev_info(&pdev->dev, "Allocating %d VFs.\n", num_vfs); + err = i40e_alloc_vfs(pf, num_vfs); + if (err) { + dev_warn(&pdev->dev, "Failed to enable SR-IOV: %d\n", err); + goto err_out; + } + +out: + return num_vfs; + +err_out: + return err; +#endif + return 0; +} + +/** + * i40e_pci_sriov_configure + * @pdev: pointer to a pci_dev structure + * @num_vfs: number of VFs to allocate + * + * Enable or change the number of VFs. Called when the user updates the number + * of VFs in sysfs. + **/ +int i40e_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct i40e_pf *pf = pci_get_drvdata(pdev); + + if (num_vfs) { + if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) { + pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; + i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG); + } + return i40e_pci_sriov_enable(pdev, num_vfs); + } + + if (!pci_vfs_assigned(pf->pdev)) { + i40e_free_vfs(pf); + pf->flags &= ~I40E_FLAG_VEB_MODE_ENABLED; + i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG); + } else { + dev_warn(&pdev->dev, "Unable to free VFs because some are assigned to VMs.\n"); + return -EINVAL; + } + return 0; +} + +/***********************virtual channel routines******************/ + +/** + * i40e_vc_send_msg_to_vf + * @vf: pointer to the VF info + * @v_opcode: virtual channel opcode + * @v_retval: virtual channel return value + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * send msg to VF + **/ +static int i40e_vc_send_msg_to_vf(struct i40e_vf *vf, u32 v_opcode, + u32 v_retval, u8 *msg, u16 msglen) +{ + struct i40e_pf *pf; + struct i40e_hw *hw; + int abs_vf_id; + i40e_status aq_ret; + + /* validate the request */ + if (!vf || vf->vf_id >= vf->pf->num_alloc_vfs) + return -EINVAL; + + pf = vf->pf; + hw = &pf->hw; + abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; + + /* single place to detect unsuccessful return values */ + if (v_retval) { + vf->num_invalid_msgs++; + dev_info(&pf->pdev->dev, "VF %d failed opcode %d, retval: %d\n", + vf->vf_id, v_opcode, v_retval); + if (vf->num_invalid_msgs > + I40E_DEFAULT_NUM_INVALID_MSGS_ALLOWED) { + dev_err(&pf->pdev->dev, + "Number of invalid messages exceeded for VF %d\n", + vf->vf_id); + dev_err(&pf->pdev->dev, "Use PF Control I/F to enable the VF\n"); + set_bit(I40E_VF_STATE_DISABLED, &vf->vf_states); + } + } else { + vf->num_valid_msgs++; + /* reset the invalid counter, if a valid message is received. 
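+		 * Only consecutive failures count toward
+		 * I40E_DEFAULT_NUM_INVALID_MSGS_ALLOWED; once that threshold
+		 * is hit the VF is flagged I40E_VF_STATE_DISABLED above until
+		 * it is re-enabled via the PF control interface.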
*/ + vf->num_invalid_msgs = 0; + } + + aq_ret = i40e_aq_send_msg_to_vf(hw, abs_vf_id, v_opcode, v_retval, + msg, msglen, NULL); + if (aq_ret) { + dev_info(&pf->pdev->dev, + "Unable to send the message to VF %d aq_err %d\n", + vf->vf_id, pf->hw.aq.asq_last_status); + return -EIO; + } + + return 0; +} + +/** + * i40e_vc_send_resp_to_vf + * @vf: pointer to the VF info + * @opcode: operation code + * @retval: return value + * + * send resp msg to VF + **/ +static int i40e_vc_send_resp_to_vf(struct i40e_vf *vf, + enum virtchnl_ops opcode, + i40e_status retval) +{ + return i40e_vc_send_msg_to_vf(vf, opcode, retval, NULL, 0); +} + +/** + * i40e_vc_get_version_msg + * @vf: pointer to the VF info + * + * called from the VF to request the API version used by the PF + **/ +static int i40e_vc_get_version_msg(struct i40e_vf *vf, u8 *msg) +{ + struct virtchnl_version_info info = { + VIRTCHNL_VERSION_MAJOR, VIRTCHNL_VERSION_MINOR + }; + + vf->vf_ver = *(struct virtchnl_version_info *)msg; + /* VFs running the 1.0 API expect to get 1.0 back or they will cry. */ + if (VF_IS_V10(&vf->vf_ver)) + info.minor = VIRTCHNL_VERSION_MINOR_NO_VF_CAPS; + return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_VERSION, + I40E_SUCCESS, (u8 *)&info, + sizeof(struct virtchnl_version_info)); +} + +/** + * i40e_del_qch - delete all the additional VSIs created as a part of ADq + * @vf: pointer to VF structure + **/ +static void i40e_del_qch(struct i40e_vf *vf) +{ + struct i40e_pf *pf = vf->pf; + int i; + + /* first element in the array belongs to primary VF VSI and we shouldn't + * delete it. We should however delete the rest of the VSIs created + */ + for (i = 1; i < vf->num_tc; i++) { + if (vf->ch[i].vsi_idx) { + i40e_vsi_release(pf->vsi[vf->ch[i].vsi_idx]); + vf->ch[i].vsi_idx = 0; + vf->ch[i].vsi_id = 0; + } + } +} + +/** + * i40e_vc_get_vf_resources_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * called from the VF to request its resources + **/ +static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) +{ + struct virtchnl_vf_resource *vfres = NULL; + struct i40e_pf *pf = vf->pf; + i40e_status aq_ret = 0; + struct i40e_vsi *vsi; + int num_vsis = 1; + int len = 0; + int ret; + + if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + len = (sizeof(struct virtchnl_vf_resource) + + sizeof(struct virtchnl_vsi_resource) * num_vsis); + + vfres = kzalloc(len, GFP_KERNEL); + if (!vfres) { + aq_ret = I40E_ERR_NO_MEMORY; + len = 0; + goto err; + } + if (VF_IS_V11(&vf->vf_ver)) + vf->driver_caps = *(u32 *)msg; + else + vf->driver_caps = VIRTCHNL_VF_OFFLOAD_L2 | + VIRTCHNL_VF_OFFLOAD_RSS_REG | + VIRTCHNL_VF_OFFLOAD_VLAN; + + vfres->vf_cap_flags = VIRTCHNL_VF_OFFLOAD_L2; + vsi = pf->vsi[vf->lan_vsi_idx]; + if (!vsi->info.pvid) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN; + + if (i40e_vf_client_capable(pf, vf->vf_id) && + (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_IWARP)) { + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_IWARP; + set_bit(I40E_VF_STATE_IWARPENA, &vf->vf_states); + } else { + clear_bit(I40E_VF_STATE_IWARPENA, &vf->vf_states); + } + + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PF) { + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PF; + } else { + if ((pf->hw_features & I40E_HW_RSS_AQ_CAPABLE) && + (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_AQ)) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_AQ; + else + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_REG; + } + + if (pf->hw_features & 
I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE) { + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2) + vfres->vf_cap_flags |= + VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2; + } + + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ENCAP) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ENCAP; + + if ((pf->hw_features & I40E_HW_OUTER_UDP_CSUM_CAPABLE) && + (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM)) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM; + + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_POLLING) { + if (pf->flags & I40E_FLAG_MFP_ENABLED) { + dev_err(&pf->pdev->dev, + "VF %d requested polling mode: this feature is supported only when the device is running in single function per port (SFP) mode\n", + vf->vf_id); + aq_ret = I40E_ERR_PARAM; + goto err; + } + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RX_POLLING; + } + + if (pf->hw_features & I40E_HW_WB_ON_ITR_CAPABLE) { + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR) + vfres->vf_cap_flags |= + VIRTCHNL_VF_OFFLOAD_WB_ON_ITR; + } + + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_REQ_QUEUES) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_REQ_QUEUES; + + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ADQ; + + vfres->num_vsis = num_vsis; + vfres->num_queue_pairs = vf->num_queue_pairs; + vfres->max_vectors = pf->hw.func_caps.num_msix_vectors_vf; + vfres->rss_key_size = I40E_HKEY_ARRAY_SIZE; + vfres->rss_lut_size = I40E_VF_HLUT_ARRAY_SIZE; + + if (vf->lan_vsi_idx) { + vfres->vsi_res[0].vsi_id = vf->lan_vsi_id; + vfres->vsi_res[0].vsi_type = VIRTCHNL_VSI_SRIOV; + vfres->vsi_res[0].num_queue_pairs = vsi->alloc_queue_pairs; + /* VFs only use TC 0 */ + vfres->vsi_res[0].qset_handle + = le16_to_cpu(vsi->info.qs_handle[0]); + ether_addr_copy(vfres->vsi_res[0].default_mac_addr, + vf->default_lan_addr.addr); + } + set_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states); + +err: + /* send the response back to the VF */ + ret = i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_VF_RESOURCES, + aq_ret, (u8 *)vfres, len); + + kfree(vfres); + return ret; +} + +/** + * i40e_vc_reset_vf_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * called from the VF to reset itself, + * unlike other virtchnl messages, PF driver + * doesn't send the response back to the VF + **/ +static void i40e_vc_reset_vf_msg(struct i40e_vf *vf) +{ + if (test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) + i40e_reset_vf(vf, false); +} + +/** + * i40e_getnum_vf_vsi_vlan_filters + * @vsi: pointer to the vsi + * + * called to get the number of VLANs offloaded on this VF + **/ +static inline int i40e_getnum_vf_vsi_vlan_filters(struct i40e_vsi *vsi) +{ + struct i40e_mac_filter *f; + int num_vlans = 0, bkt; + + hash_for_each(vsi->mac_filter_hash, bkt, f, hlist) { + if (f->vlan >= 0 && f->vlan <= I40E_MAX_VLANID) + num_vlans++; + } + + return num_vlans; +} + +/** + * i40e_vc_config_promiscuous_mode_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * called from the VF to configure the promiscuous mode of + * VF vsis + **/ +static int i40e_vc_config_promiscuous_mode_msg(struct i40e_vf *vf, + u8 *msg, u16 msglen) +{ + struct virtchnl_promisc_info *info = + (struct virtchnl_promisc_info *)msg; + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + struct i40e_mac_filter *f; + i40e_status aq_ret = 0; + bool allmulti = false; + struct i40e_vsi *vsi; + bool alluni = false; + int aq_err = 0; + int bkt; + + vsi = i40e_find_vsi_from_id(pf, info->vsi_id); + if 
(!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) || + !i40e_vc_isvalid_vsi_id(vf, info->vsi_id) || + !vsi) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + if (!test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) { + dev_err(&pf->pdev->dev, + "Unprivileged VF %d is attempting to configure promiscuous mode\n", + vf->vf_id); + /* Lie to the VF on purpose. */ + aq_ret = 0; + goto error_param; + } + /* Multicast promiscuous handling*/ + if (info->flags & FLAG_VF_MULTICAST_PROMISC) + allmulti = true; + + if (vf->port_vlan_id) { + aq_ret = i40e_aq_set_vsi_mc_promisc_on_vlan(hw, vsi->seid, + allmulti, + vf->port_vlan_id, + NULL); + } else if (i40e_getnum_vf_vsi_vlan_filters(vsi)) { + hash_for_each(vsi->mac_filter_hash, bkt, f, hlist) { + if (f->vlan < 0 || f->vlan > I40E_MAX_VLANID) + continue; + aq_ret = i40e_aq_set_vsi_mc_promisc_on_vlan(hw, + vsi->seid, + allmulti, + f->vlan, + NULL); + aq_err = pf->hw.aq.asq_last_status; + if (aq_ret) { + dev_err(&pf->pdev->dev, + "Could not add VLAN %d to multicast promiscuous domain err %s aq_err %s\n", + f->vlan, + i40e_stat_str(&pf->hw, aq_ret), + i40e_aq_str(&pf->hw, aq_err)); + break; + } + } + } else { + aq_ret = i40e_aq_set_vsi_multicast_promiscuous(hw, vsi->seid, + allmulti, NULL); + aq_err = pf->hw.aq.asq_last_status; + if (aq_ret) { + dev_err(&pf->pdev->dev, + "VF %d failed to set multicast promiscuous mode err %s aq_err %s\n", + vf->vf_id, + i40e_stat_str(&pf->hw, aq_ret), + i40e_aq_str(&pf->hw, aq_err)); + goto error_param; + } + } + + if (!aq_ret) { + dev_info(&pf->pdev->dev, + "VF %d successfully set multicast promiscuous mode\n", + vf->vf_id); + if (allmulti) + set_bit(I40E_VF_STATE_MC_PROMISC, &vf->vf_states); + else + clear_bit(I40E_VF_STATE_MC_PROMISC, &vf->vf_states); + } + + if (info->flags & FLAG_VF_UNICAST_PROMISC) + alluni = true; + if (vf->port_vlan_id) { + aq_ret = i40e_aq_set_vsi_uc_promisc_on_vlan(hw, vsi->seid, + alluni, + vf->port_vlan_id, + NULL); + } else if (i40e_getnum_vf_vsi_vlan_filters(vsi)) { + hash_for_each(vsi->mac_filter_hash, bkt, f, hlist) { + if (f->vlan < 0 || f->vlan > I40E_MAX_VLANID) + continue; + aq_ret = i40e_aq_set_vsi_uc_promisc_on_vlan(hw, + vsi->seid, + alluni, + f->vlan, + NULL); + aq_err = pf->hw.aq.asq_last_status; + if (aq_ret) + dev_err(&pf->pdev->dev, + "Could not add VLAN %d to Unicast promiscuous domain err %s aq_err %s\n", + f->vlan, + i40e_stat_str(&pf->hw, aq_ret), + i40e_aq_str(&pf->hw, aq_err)); + } + } else { + aq_ret = i40e_aq_set_vsi_unicast_promiscuous(hw, vsi->seid, + alluni, NULL, + true); + aq_err = pf->hw.aq.asq_last_status; + if (aq_ret) { + dev_err(&pf->pdev->dev, + "VF %d failed to set unicast promiscuous mode %8.8x err %s aq_err %s\n", + vf->vf_id, info->flags, + i40e_stat_str(&pf->hw, aq_ret), + i40e_aq_str(&pf->hw, aq_err)); + goto error_param; + } + } + + if (!aq_ret) { + dev_info(&pf->pdev->dev, + "VF %d successfully set unicast promiscuous mode\n", + vf->vf_id); + if (alluni) + set_bit(I40E_VF_STATE_UC_PROMISC, &vf->vf_states); + else + clear_bit(I40E_VF_STATE_UC_PROMISC, &vf->vf_states); + } + +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, + VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE, + aq_ret); +} + +/** + * i40e_vc_config_queues_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * called from the VF to configure the rx/tx + * queues + **/ +static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_vsi_queue_config_info *qci = + (struct 
virtchnl_vsi_queue_config_info *)msg; + struct virtchnl_queue_pair_info *qpi; + struct i40e_pf *pf = vf->pf; + u16 vsi_id, vsi_queue_id = 0; + i40e_status aq_ret = 0; + int i, j = 0, idx = 0; + + vsi_id = qci->vsi_id; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if (!i40e_vc_isvalid_vsi_id(vf, vsi_id)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + for (i = 0; i < qci->num_queue_pairs; i++) { + qpi = &qci->qpair[i]; + + if (!vf->adq_enabled) { + vsi_queue_id = qpi->txq.queue_id; + + if (qpi->txq.vsi_id != qci->vsi_id || + qpi->rxq.vsi_id != qci->vsi_id || + qpi->rxq.queue_id != vsi_queue_id) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + } + + if (!i40e_vc_isvalid_queue_id(vf, vsi_id, vsi_queue_id)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if (i40e_config_vsi_rx_queue(vf, vsi_id, vsi_queue_id, + &qpi->rxq) || + i40e_config_vsi_tx_queue(vf, vsi_id, vsi_queue_id, + &qpi->txq)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + /* For ADq there can be up to 4 VSIs with max 4 queues each. + * VF does not know about these additional VSIs and all + * it cares is about its own queues. PF configures these queues + * to its appropriate VSIs based on TC mapping + **/ + if (vf->adq_enabled) { + if (j == (vf->ch[idx].num_qps - 1)) { + idx++; + j = 0; /* resetting the queue count */ + vsi_queue_id = 0; + } else { + j++; + vsi_queue_id++; + } + vsi_id = vf->ch[idx].vsi_id; + } + } + /* set vsi num_queue_pairs in use to num configured by VF */ + if (!vf->adq_enabled) { + pf->vsi[vf->lan_vsi_idx]->num_queue_pairs = + qci->num_queue_pairs; + } else { + for (i = 0; i < vf->num_tc; i++) + pf->vsi[vf->ch[i].vsi_idx]->num_queue_pairs = + vf->ch[i].num_qps; + } + +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_CONFIG_VSI_QUEUES, + aq_ret); +} + +/** + * i40e_validate_queue_map + * @vsi_id: vsi id + * @queuemap: Tx or Rx queue map + * + * check if Tx or Rx queue map is valid + **/ +static int i40e_validate_queue_map(struct i40e_vf *vf, u16 vsi_id, + unsigned long queuemap) +{ + u16 vsi_queue_id, queue_id; + + for_each_set_bit(vsi_queue_id, &queuemap, I40E_MAX_VSI_QP) { + if (vf->adq_enabled) { + vsi_id = vf->ch[vsi_queue_id / I40E_MAX_VF_VSI].vsi_id; + queue_id = (vsi_queue_id % I40E_DEFAULT_QUEUES_PER_VF); + } else { + queue_id = vsi_queue_id; + } + + if (!i40e_vc_isvalid_queue_id(vf, vsi_id, queue_id)) + return -EINVAL; + } + + return 0; +} + +/** + * i40e_vc_config_irq_map_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * called from the VF to configure the irq to + * queue map + **/ +static int i40e_vc_config_irq_map_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_irq_map_info *irqmap_info = + (struct virtchnl_irq_map_info *)msg; + struct virtchnl_vector_map *map; + u16 vsi_id, vector_id; + i40e_status aq_ret = 0; + int i; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + for (i = 0; i < irqmap_info->num_vectors; i++) { + map = &irqmap_info->vecmap[i]; + vector_id = map->vector_id; + vsi_id = map->vsi_id; + /* validate msg params */ + if (!i40e_vc_isvalid_vector_id(vf, vector_id) || + !i40e_vc_isvalid_vsi_id(vf, vsi_id)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if (i40e_validate_queue_map(vf, vsi_id, map->rxq_map)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if (i40e_validate_queue_map(vf, 
vsi_id, map->txq_map)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + i40e_config_irq_link_list(vf, vsi_id, map); + } +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_CONFIG_IRQ_MAP, + aq_ret); +} + +/** + * i40e_vc_enable_queues_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * called from the VF to enable all or specific queue(s) + **/ +static int i40e_vc_enable_queues_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_queue_select *vqs = + (struct virtchnl_queue_select *)msg; + struct i40e_pf *pf = vf->pf; + u16 vsi_id = vqs->vsi_id; + i40e_status aq_ret = 0; + int i; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if (!i40e_vc_isvalid_vsi_id(vf, vsi_id)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if ((0 == vqs->rx_queues) && (0 == vqs->tx_queues)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if (i40e_vsi_start_rings(pf->vsi[vf->lan_vsi_idx])) + aq_ret = I40E_ERR_TIMEOUT; + + /* need to start the rings for additional ADq VSI's as well */ + if (vf->adq_enabled) { + /* zero belongs to LAN VSI */ + for (i = 1; i < vf->num_tc; i++) { + if (i40e_vsi_start_rings(pf->vsi[vf->ch[i].vsi_idx])) + aq_ret = I40E_ERR_TIMEOUT; + } + } + +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_ENABLE_QUEUES, + aq_ret); +} + +/** + * i40e_vc_disable_queues_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * called from the VF to disable all or specific + * queue(s) + **/ +static int i40e_vc_disable_queues_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_queue_select *vqs = + (struct virtchnl_queue_select *)msg; + struct i40e_pf *pf = vf->pf; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if (!i40e_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if ((0 == vqs->rx_queues) && (0 == vqs->tx_queues)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + i40e_vsi_stop_rings(pf->vsi[vf->lan_vsi_idx]); + +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DISABLE_QUEUES, + aq_ret); +} + +/** + * i40e_vc_request_queues_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * VFs get a default number of queues but can use this message to request a + * different number. If the request is successful, PF will reset the VF and + * return 0. If unsuccessful, PF will send message informing VF of number of + * available queues and return result of sending VF a message. + **/ +static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg, int msglen) +{ + struct virtchnl_vf_res_request *vfres = + (struct virtchnl_vf_res_request *)msg; + int req_pairs = vfres->num_queue_pairs; + int cur_pairs = vf->num_queue_pairs; + struct i40e_pf *pf = vf->pf; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) + return -EINVAL; + + if (req_pairs <= 0) { + dev_err(&pf->pdev->dev, + "VF %d tried to request %d queues. 
Ignoring.\n", + vf->vf_id, req_pairs); + } else if (req_pairs > I40E_MAX_VF_QUEUES) { + dev_err(&pf->pdev->dev, + "VF %d tried to request more than %d queues.\n", + vf->vf_id, + I40E_MAX_VF_QUEUES); + vfres->num_queue_pairs = I40E_MAX_VF_QUEUES; + } else if (req_pairs - cur_pairs > pf->queues_left) { + dev_warn(&pf->pdev->dev, + "VF %d requested %d more queues, but only %d left.\n", + vf->vf_id, + req_pairs - cur_pairs, + pf->queues_left); + vfres->num_queue_pairs = pf->queues_left + cur_pairs; + } else { + /* successful request */ + vf->num_req_queues = req_pairs; + i40e_vc_notify_vf_reset(vf); + i40e_reset_vf(vf, false); + return 0; + } + + return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, 0, + (u8 *)vfres, sizeof(*vfres)); +} + +/** + * i40e_vc_get_stats_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * called from the VF to get vsi stats + **/ +static int i40e_vc_get_stats_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_queue_select *vqs = + (struct virtchnl_queue_select *)msg; + struct i40e_pf *pf = vf->pf; + struct i40e_eth_stats stats; + i40e_status aq_ret = 0; + struct i40e_vsi *vsi; + + memset(&stats, 0, sizeof(struct i40e_eth_stats)); + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if (!i40e_vc_isvalid_vsi_id(vf, vqs->vsi_id)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + vsi = pf->vsi[vf->lan_vsi_idx]; + if (!vsi) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + i40e_update_eth_stats(vsi); + stats = vsi->eth_stats; + +error_param: + /* send the response back to the VF */ + return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_STATS, aq_ret, + (u8 *)&stats, sizeof(stats)); +} + +/* If the VF is not trusted restrict the number of MAC/VLAN it can program */ +#define I40E_VC_MAX_MAC_ADDR_PER_VF 12 +#define I40E_VC_MAX_VLAN_PER_VF 8 + +/** + * i40e_check_vf_permission + * @vf: pointer to the VF info + * @al: MAC address list from virtchnl + * + * Check that the given list of MAC addresses is allowed. Will return -EPERM + * if any address in the list is not valid. Checks the following conditions: + * + * 1) broadcast and zero addresses are never valid + * 2) unicast addresses are not allowed if the VMM has administratively set + * the VF MAC address, unless the VF is marked as privileged. + * 3) There is enough space to add all the addresses. + * + * Note that to guarantee consistency, it is expected this function be called + * while holding the mac_filter_hash_lock, as otherwise the current number of + * addresses might not be accurate. + **/ +static inline int i40e_check_vf_permission(struct i40e_vf *vf, + struct virtchnl_ether_addr_list *al) +{ + struct i40e_pf *pf = vf->pf; + int i; + + /* If this VF is not privileged, then we can't add more than a limited + * number of addresses. Check to make sure that the additions do not + * push us over the limit. 
+ */ + if (!test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps) && + (vf->num_mac + al->num_elements) > I40E_VC_MAX_MAC_ADDR_PER_VF) { + dev_err(&pf->pdev->dev, + "Cannot add more MAC addresses, VF is not trusted, switch the VF to trusted to add more functionality\n"); + return -EPERM; + } + + for (i = 0; i < al->num_elements; i++) { + u8 *addr = al->list[i].addr; + + if (is_broadcast_ether_addr(addr) || + is_zero_ether_addr(addr)) { + dev_err(&pf->pdev->dev, "invalid VF MAC addr %pM\n", + addr); + return I40E_ERR_INVALID_MAC_ADDR; + } + + /* If the host VMM administrator has set the VF MAC address + * administratively via the ndo_set_vf_mac command then deny + * permission to the VF to add or delete unicast MAC addresses. + * Unless the VF is privileged and then it can do whatever. + * The VF may request to set the MAC address filter already + * assigned to it so do not return an error in that case. + */ + if (!test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps) && + !is_multicast_ether_addr(addr) && vf->pf_set_mac && + !ether_addr_equal(addr, vf->default_lan_addr.addr)) { + dev_err(&pf->pdev->dev, + "VF attempting to override administratively set MAC address, reload the VF driver to resume normal operation\n"); + return -EPERM; + } + } + + return 0; +} + +/** + * i40e_vc_add_mac_addr_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * add guest mac address filter + **/ +static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_ether_addr_list *al = + (struct virtchnl_ether_addr_list *)msg; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + u16 vsi_id = al->vsi_id; + i40e_status ret = 0; + int i; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) || + !i40e_vc_isvalid_vsi_id(vf, vsi_id)) { + ret = I40E_ERR_PARAM; + goto error_param; + } + + vsi = pf->vsi[vf->lan_vsi_idx]; + + /* Lock once, because all function inside for loop accesses VSI's + * MAC filter list which needs to be protected using same lock. 
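+	 * (i40e_check_vf_permission() reads vf->num_mac and expects to run
+	 * under this same mac_filter_hash_lock, so the permission check and
+	 * the additions below see a consistent per-VF filter count.)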
+ */ + spin_lock_bh(&vsi->mac_filter_hash_lock); + + ret = i40e_check_vf_permission(vf, al); + if (ret) { + spin_unlock_bh(&vsi->mac_filter_hash_lock); + goto error_param; + } + + /* add new addresses to the list */ + for (i = 0; i < al->num_elements; i++) { + struct i40e_mac_filter *f; + + f = i40e_find_mac(vsi, al->list[i].addr); + if (!f) { + f = i40e_add_mac_filter(vsi, al->list[i].addr); + + if (!f) { + dev_err(&pf->pdev->dev, + "Unable to add MAC filter %pM for VF %d\n", + al->list[i].addr, vf->vf_id); + ret = I40E_ERR_PARAM; + spin_unlock_bh(&vsi->mac_filter_hash_lock); + goto error_param; + } else { + vf->num_mac++; + } + } + } + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + /* program the updated filter list */ + ret = i40e_sync_vsi_filters(vsi); + if (ret) + dev_err(&pf->pdev->dev, "Unable to program VF %d MAC filters, error %d\n", + vf->vf_id, ret); + +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_ADD_ETH_ADDR, + ret); +} + +/** + * i40e_vc_del_mac_addr_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * remove guest mac address filter + **/ +static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_ether_addr_list *al = + (struct virtchnl_ether_addr_list *)msg; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + u16 vsi_id = al->vsi_id; + i40e_status ret = 0; + int i; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) || + !i40e_vc_isvalid_vsi_id(vf, vsi_id)) { + ret = I40E_ERR_PARAM; + goto error_param; + } + + for (i = 0; i < al->num_elements; i++) { + if (is_broadcast_ether_addr(al->list[i].addr) || + is_zero_ether_addr(al->list[i].addr)) { + dev_err(&pf->pdev->dev, "Invalid MAC addr %pM for VF %d\n", + al->list[i].addr, vf->vf_id); + ret = I40E_ERR_INVALID_MAC_ADDR; + goto error_param; + } + } + vsi = pf->vsi[vf->lan_vsi_idx]; + + spin_lock_bh(&vsi->mac_filter_hash_lock); + /* delete addresses from the list */ + for (i = 0; i < al->num_elements; i++) + if (i40e_del_mac_filter(vsi, al->list[i].addr)) { + ret = I40E_ERR_INVALID_MAC_ADDR; + spin_unlock_bh(&vsi->mac_filter_hash_lock); + goto error_param; + } else { + vf->num_mac--; + } + + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + /* program the updated filter list */ + ret = i40e_sync_vsi_filters(vsi); + if (ret) + dev_err(&pf->pdev->dev, "Unable to program VF %d MAC filters, error %d\n", + vf->vf_id, ret); + +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DEL_ETH_ADDR, + ret); +} + +/** + * i40e_vc_add_vlan_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * program guest vlan id + **/ +static int i40e_vc_add_vlan_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_vlan_filter_list *vfl = + (struct virtchnl_vlan_filter_list *)msg; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + u16 vsi_id = vfl->vsi_id; + i40e_status aq_ret = 0; + int i; + + if ((vf->num_vlan >= I40E_VC_MAX_VLAN_PER_VF) && + !test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) { + dev_err(&pf->pdev->dev, + "VF is not trusted, switch the VF to trusted to add more VLAN addresses\n"); + goto error_param; + } + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) || + !i40e_vc_isvalid_vsi_id(vf, vsi_id)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + for (i = 0; i < vfl->num_elements; i++) { + if (vfl->vlan_id[i] > I40E_MAX_VLANID) { + aq_ret = 
I40E_ERR_PARAM; + dev_err(&pf->pdev->dev, + "invalid VF VLAN id %d\n", vfl->vlan_id[i]); + goto error_param; + } + } + vsi = pf->vsi[vf->lan_vsi_idx]; + if (vsi->info.pvid) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + i40e_vlan_stripping_enable(vsi); + for (i = 0; i < vfl->num_elements; i++) { + /* add new VLAN filter */ + int ret = i40e_vsi_add_vlan(vsi, vfl->vlan_id[i]); + if (!ret) + vf->num_vlan++; + + if (test_bit(I40E_VF_STATE_UC_PROMISC, &vf->vf_states)) + i40e_aq_set_vsi_uc_promisc_on_vlan(&pf->hw, vsi->seid, + true, + vfl->vlan_id[i], + NULL); + if (test_bit(I40E_VF_STATE_MC_PROMISC, &vf->vf_states)) + i40e_aq_set_vsi_mc_promisc_on_vlan(&pf->hw, vsi->seid, + true, + vfl->vlan_id[i], + NULL); + + if (ret) + dev_err(&pf->pdev->dev, + "Unable to add VLAN filter %d for VF %d, error %d\n", + vfl->vlan_id[i], vf->vf_id, ret); + } + +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_ADD_VLAN, aq_ret); +} + +/** + * i40e_vc_remove_vlan_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * remove programmed guest vlan id + **/ +static int i40e_vc_remove_vlan_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_vlan_filter_list *vfl = + (struct virtchnl_vlan_filter_list *)msg; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + u16 vsi_id = vfl->vsi_id; + i40e_status aq_ret = 0; + int i; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) || + !i40e_vc_isvalid_vsi_id(vf, vsi_id)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + for (i = 0; i < vfl->num_elements; i++) { + if (vfl->vlan_id[i] > I40E_MAX_VLANID) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + } + + vsi = pf->vsi[vf->lan_vsi_idx]; + if (vsi->info.pvid) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + for (i = 0; i < vfl->num_elements; i++) { + i40e_vsi_kill_vlan(vsi, vfl->vlan_id[i]); + vf->num_vlan--; + + if (test_bit(I40E_VF_STATE_UC_PROMISC, &vf->vf_states)) + i40e_aq_set_vsi_uc_promisc_on_vlan(&pf->hw, vsi->seid, + false, + vfl->vlan_id[i], + NULL); + if (test_bit(I40E_VF_STATE_MC_PROMISC, &vf->vf_states)) + i40e_aq_set_vsi_mc_promisc_on_vlan(&pf->hw, vsi->seid, + false, + vfl->vlan_id[i], + NULL); + } + +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DEL_VLAN, aq_ret); +} + +/** + * i40e_vc_iwarp_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * called from the VF for the iwarp msgs + **/ +static int i40e_vc_iwarp_msg(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct i40e_pf *pf = vf->pf; + int abs_vf_id = vf->vf_id + pf->hw.func_caps.vf_base_id; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) || + !test_bit(I40E_VF_STATE_IWARPENA, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + i40e_notify_client_of_vf_msg(pf->vsi[pf->lan_vsi], abs_vf_id, + msg, msglen); + +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_IWARP, + aq_ret); +} + +/** + * i40e_vc_iwarp_qvmap_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * @config: config qvmap or release it + * + * called from the VF for the iwarp msgs + **/ +static int i40e_vc_iwarp_qvmap_msg(struct i40e_vf *vf, u8 *msg, u16 msglen, + bool config) +{ + struct virtchnl_iwarp_qvlist_info *qvlist_info = + (struct virtchnl_iwarp_qvlist_info *)msg; + i40e_status aq_ret = 
0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) || + !test_bit(I40E_VF_STATE_IWARPENA, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto error_param; + } + + if (config) { + if (i40e_config_iwarp_qvlist(vf, qvlist_info)) + aq_ret = I40E_ERR_PARAM; + } else { + i40e_release_iwarp_qvlist(vf); + } + +error_param: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, + config ? VIRTCHNL_OP_CONFIG_IWARP_IRQ_MAP : + VIRTCHNL_OP_RELEASE_IWARP_IRQ_MAP, + aq_ret); +} + +/** + * i40e_vc_config_rss_key + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * Configure the VF's RSS key + **/ +static int i40e_vc_config_rss_key(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_rss_key *vrk = + (struct virtchnl_rss_key *)msg; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + u16 vsi_id = vrk->vsi_id; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) || + !i40e_vc_isvalid_vsi_id(vf, vsi_id) || + (vrk->key_len != I40E_HKEY_ARRAY_SIZE)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + vsi = pf->vsi[vf->lan_vsi_idx]; + aq_ret = i40e_config_rss(vsi, vrk->key, NULL, 0); +err: + /* send the response to the VF */ + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_KEY, + aq_ret); +} + +/** + * i40e_vc_config_rss_lut + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * Configure the VF's RSS LUT + **/ +static int i40e_vc_config_rss_lut(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_rss_lut *vrl = + (struct virtchnl_rss_lut *)msg; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + u16 vsi_id = vrl->vsi_id; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) || + !i40e_vc_isvalid_vsi_id(vf, vsi_id) || + (vrl->lut_entries != I40E_VF_HLUT_ARRAY_SIZE)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + vsi = pf->vsi[vf->lan_vsi_idx]; + aq_ret = i40e_config_rss(vsi, NULL, vrl->lut, I40E_VF_HLUT_ARRAY_SIZE); + /* send the response to the VF */ +err: + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_LUT, + aq_ret); +} + +/** + * i40e_vc_get_rss_hena + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * Return the RSS HENA bits allowed by the hardware + **/ +static int i40e_vc_get_rss_hena(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_rss_hena *vrh = NULL; + struct i40e_pf *pf = vf->pf; + i40e_status aq_ret = 0; + int len = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + len = sizeof(struct virtchnl_rss_hena); + + vrh = kzalloc(len, GFP_KERNEL); + if (!vrh) { + aq_ret = I40E_ERR_NO_MEMORY; + len = 0; + goto err; + } + vrh->hena = i40e_pf_get_default_rss_hena(pf); +err: + /* send the response back to the VF */ + aq_ret = i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_RSS_HENA_CAPS, + aq_ret, (u8 *)vrh, len); + kfree(vrh); + return aq_ret; +} + +/** + * i40e_vc_set_rss_hena + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * Set the RSS HENA bits for the VF + **/ +static int i40e_vc_set_rss_hena(struct i40e_vf *vf, u8 *msg, u16 msglen) +{ + struct virtchnl_rss_hena *vrh = + (struct virtchnl_rss_hena *)msg; + struct i40e_pf *pf = vf->pf; + struct i40e_hw *hw = &pf->hw; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + 
i40e_write_rx_ctl(hw, I40E_VFQF_HENA1(0, vf->vf_id), (u32)vrh->hena); + i40e_write_rx_ctl(hw, I40E_VFQF_HENA1(1, vf->vf_id), + (u32)(vrh->hena >> 32)); + + /* send the response to the VF */ +err: + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_SET_RSS_HENA, aq_ret); +} + +/** + * i40e_vc_enable_vlan_stripping + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * Enable vlan header stripping for the VF + **/ +static int i40e_vc_enable_vlan_stripping(struct i40e_vf *vf, u8 *msg, + u16 msglen) +{ + struct i40e_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + i40e_vlan_stripping_enable(vsi); + + /* send the response to the VF */ +err: + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, + aq_ret); +} + +/** + * i40e_vc_disable_vlan_stripping + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * Disable vlan header stripping for the VF + **/ +static int i40e_vc_disable_vlan_stripping(struct i40e_vf *vf, u8 *msg, + u16 msglen) +{ + struct i40e_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + i40e_vlan_stripping_disable(vsi); + + /* send the response to the VF */ +err: + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, + aq_ret); +} + +/** + * i40e_validate_cloud_filter + * @mask: mask for TC filter + * @data: data for TC filter + * + * This function validates cloud filter programmed as TC filter for ADq + **/ +static int i40e_validate_cloud_filter(struct i40e_vf *vf, + struct virtchnl_filter *tc_filter) +{ + struct virtchnl_l4_spec mask = tc_filter->mask.tcp_spec; + struct virtchnl_l4_spec data = tc_filter->data.tcp_spec; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + struct i40e_mac_filter *f; + struct hlist_node *h; + bool found = false; + int bkt; + + if (!tc_filter->action) { + dev_info(&pf->pdev->dev, + "VF %d: Currently ADq doesn't support Drop Action\n", + vf->vf_id); + goto err; + } + + /* action_meta is TC number here to which the filter is applied */ + if (!tc_filter->action_meta || + tc_filter->action_meta > I40E_MAX_VF_VSI) { + dev_info(&pf->pdev->dev, "VF %d: Invalid TC number %u\n", + vf->vf_id, tc_filter->action_meta); + goto err; + } + + /* Check filter if it's programmed for advanced mode or basic mode. + * There are two ADq modes (for VF only), + * 1. Basic mode: intended to allow as many filter options as possible + * to be added to a VF in Non-trusted mode. Main goal is + * to add filters to its own MAC and VLAN id. + * 2. Advanced mode: is for allowing filters to be applied other than + * its own MAC or VLAN. This mode requires the VF to be + * Trusted. 
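[Illustrative aside, not part of the patch: i40e_validate_cloud_filter() below follows a mask/data convention -- each field of the TC filter is validated only when the corresponding mask byte selects it. A simplified standalone model of that pattern; struct l4_spec and filter_fields_ok() are invented for the sketch and cover only two of the fields the driver checks.]

	#include <stdbool.h>
	#include <stdint.h>
	#include <string.h>

	struct l4_spec {			/* simplified mirror of a mask/data pair */
		uint8_t  dst_mac[6];
		uint16_t dst_port;
	};

	/* Validate only the fields the mask actually selects. */
	bool filter_fields_ok(const struct l4_spec *mask, const struct l4_spec *data)
	{
		static const uint8_t zero_mac[6];

		if (mask->dst_mac[0] && !memcmp(data->dst_mac, zero_mac, 6))
			return false;	/* MAC selected but all-zero */
		if (mask->dst_port && !data->dst_port)
			return false;	/* port selected but zero */
		return true;
	}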
+ */ + if (mask.dst_mac[0] && !mask.dst_ip[0]) { + vsi = pf->vsi[vf->lan_vsi_idx]; + f = i40e_find_mac(vsi, data.dst_mac); + + if (!f) { + dev_info(&pf->pdev->dev, + "Destination MAC %pM doesn't belong to VF %d\n", + data.dst_mac, vf->vf_id); + goto err; + } + + if (mask.vlan_id) { + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, + hlist) { + if (f->vlan == ntohs(data.vlan_id)) { + found = true; + break; + } + } + if (!found) { + dev_info(&pf->pdev->dev, + "VF %d doesn't have any VLAN id %u\n", + vf->vf_id, ntohs(data.vlan_id)); + goto err; + } + } + } else { + /* Check if VF is trusted */ + if (!test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) { + dev_err(&pf->pdev->dev, + "VF %d not trusted, make VF trusted to add advanced mode ADq cloud filters\n", + vf->vf_id); + return I40E_ERR_CONFIG; + } + } + + if (mask.dst_mac[0] & data.dst_mac[0]) { + if (is_broadcast_ether_addr(data.dst_mac) || + is_zero_ether_addr(data.dst_mac)) { + dev_info(&pf->pdev->dev, "VF %d: Invalid Dest MAC addr %pM\n", + vf->vf_id, data.dst_mac); + goto err; + } + } + + if (mask.src_mac[0] & data.src_mac[0]) { + if (is_broadcast_ether_addr(data.src_mac) || + is_zero_ether_addr(data.src_mac)) { + dev_info(&pf->pdev->dev, "VF %d: Invalid Source MAC addr %pM\n", + vf->vf_id, data.src_mac); + goto err; + } + } + + if (mask.dst_port & data.dst_port) { + if (!data.dst_port || be16_to_cpu(data.dst_port) > 0xFFFF) { + dev_info(&pf->pdev->dev, "VF %d: Invalid Dest port\n", + vf->vf_id); + goto err; + } + } + + if (mask.src_port & data.src_port) { + if (!data.src_port || be16_to_cpu(data.src_port) > 0xFFFF) { + dev_info(&pf->pdev->dev, "VF %d: Invalid Source port\n", + vf->vf_id); + goto err; + } + } + + if (tc_filter->flow_type != VIRTCHNL_TCP_V6_FLOW && + tc_filter->flow_type != VIRTCHNL_TCP_V4_FLOW) { + dev_info(&pf->pdev->dev, "VF %d: Invalid Flow type\n", + vf->vf_id); + goto err; + } + + if (mask.vlan_id & data.vlan_id) { + if (ntohs(data.vlan_id) > I40E_MAX_VLANID) { + dev_info(&pf->pdev->dev, "VF %d: invalid VLAN ID\n", + vf->vf_id); + goto err; + } + } + + return I40E_SUCCESS; +err: + return I40E_ERR_CONFIG; +} + +/** + * i40e_find_vsi_from_seid - searches for the vsi with the given seid + * @vf: pointer to the VF info + * @seid - seid of the vsi it is searching for + **/ +static struct i40e_vsi *i40e_find_vsi_from_seid(struct i40e_vf *vf, u16 seid) +{ + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + int i; + + for (i = 0; i < vf->num_tc ; i++) { + vsi = i40e_find_vsi_from_id(pf, vf->ch[i].vsi_id); + if (vsi && vsi->seid == seid) + return vsi; + } + return NULL; +} + +/** + * i40e_del_all_cloud_filters + * @vf: pointer to the VF info + * + * This function deletes all cloud filters + **/ +static void i40e_del_all_cloud_filters(struct i40e_vf *vf) +{ + struct i40e_cloud_filter *cfilter = NULL; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + struct hlist_node *node; + int ret; + + hlist_for_each_entry_safe(cfilter, node, + &vf->cloud_filter_list, cloud_node) { + vsi = i40e_find_vsi_from_seid(vf, cfilter->seid); + + if (!vsi) { + dev_err(&pf->pdev->dev, "VF %d: no VSI found for matching %u seid, can't delete cloud filter\n", + vf->vf_id, cfilter->seid); + continue; + } + + if (cfilter->dst_port) + ret = i40e_add_del_cloud_filter_big_buf(vsi, cfilter, + false); + else + ret = i40e_add_del_cloud_filter(vsi, cfilter, false); + if (ret) + dev_err(&pf->pdev->dev, + "VF %d: Failed to delete cloud filter, err %s aq_err %s\n", + vf->vf_id, i40e_stat_str(&pf->hw, ret), + 
i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + + hlist_del(&cfilter->cloud_node); + kfree(cfilter); + vf->num_cloud_filters--; + } +} + +/** + * i40e_vc_del_cloud_filter + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * This function deletes a cloud filter programmed as TC filter for ADq + **/ +static int i40e_vc_del_cloud_filter(struct i40e_vf *vf, u8 *msg) +{ + struct virtchnl_filter *vcf = (struct virtchnl_filter *)msg; + struct virtchnl_l4_spec mask = vcf->mask.tcp_spec; + struct virtchnl_l4_spec tcf = vcf->data.tcp_spec; + struct i40e_cloud_filter cfilter, *cf = NULL; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + struct hlist_node *node; + i40e_status aq_ret = 0; + int i, ret; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + if (!vf->adq_enabled) { + dev_info(&pf->pdev->dev, + "VF %d: ADq not enabled, can't apply cloud filter\n", + vf->vf_id); + aq_ret = I40E_ERR_PARAM; + goto err; + } + + if (i40e_validate_cloud_filter(vf, vcf)) { + dev_info(&pf->pdev->dev, + "VF %d: Invalid input, can't apply cloud filter\n", + vf->vf_id); + aq_ret = I40E_ERR_PARAM; + goto err; + } + + memset(&cfilter, 0, sizeof(cfilter)); + /* parse destination mac address */ + for (i = 0; i < ETH_ALEN; i++) + cfilter.dst_mac[i] = mask.dst_mac[i] & tcf.dst_mac[i]; + + /* parse source mac address */ + for (i = 0; i < ETH_ALEN; i++) + cfilter.src_mac[i] = mask.src_mac[i] & tcf.src_mac[i]; + + cfilter.vlan_id = mask.vlan_id & tcf.vlan_id; + cfilter.dst_port = mask.dst_port & tcf.dst_port; + cfilter.src_port = mask.src_port & tcf.src_port; + + switch (vcf->flow_type) { + case VIRTCHNL_TCP_V4_FLOW: + cfilter.n_proto = ETH_P_IP; + if (mask.dst_ip[0] & tcf.dst_ip[0]) + memcpy(&cfilter.ip.v4.dst_ip, tcf.dst_ip, + ARRAY_SIZE(tcf.dst_ip)); + else if (mask.src_ip[0] & tcf.dst_ip[0]) + memcpy(&cfilter.ip.v4.src_ip, tcf.src_ip, + ARRAY_SIZE(tcf.dst_ip)); + break; + case VIRTCHNL_TCP_V6_FLOW: + cfilter.n_proto = ETH_P_IPV6; + if (mask.dst_ip[3] & tcf.dst_ip[3]) + memcpy(&cfilter.ip.v6.dst_ip6, tcf.dst_ip, + sizeof(cfilter.ip.v6.dst_ip6)); + if (mask.src_ip[3] & tcf.src_ip[3]) + memcpy(&cfilter.ip.v6.src_ip6, tcf.src_ip, + sizeof(cfilter.ip.v6.src_ip6)); + break; + default: + /* TC filter can be configured based on different combinations + * and in this case IP is not a part of filter config + */ + dev_info(&pf->pdev->dev, "VF %d: Flow type not configured\n", + vf->vf_id); + } + + /* get the vsi to which the tc belongs to */ + vsi = pf->vsi[vf->ch[vcf->action_meta].vsi_idx]; + cfilter.seid = vsi->seid; + cfilter.flags = vcf->field_flags; + + /* Deleting TC filter */ + if (tcf.dst_port) + ret = i40e_add_del_cloud_filter_big_buf(vsi, &cfilter, false); + else + ret = i40e_add_del_cloud_filter(vsi, &cfilter, false); + if (ret) { + dev_err(&pf->pdev->dev, + "VF %d: Failed to delete cloud filter, err %s aq_err %s\n", + vf->vf_id, i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + goto err; + } + + hlist_for_each_entry_safe(cf, node, + &vf->cloud_filter_list, cloud_node) { + if (cf->seid != cfilter.seid) + continue; + if (mask.dst_port) + if (cfilter.dst_port != cf->dst_port) + continue; + if (mask.dst_mac[0]) + if (!ether_addr_equal(cf->src_mac, cfilter.src_mac)) + continue; + /* for ipv4 data to be valid, only first byte of mask is set */ + if (cfilter.n_proto == ETH_P_IP && mask.dst_ip[0]) + if (memcmp(&cfilter.ip.v4.dst_ip, &cf->ip.v4.dst_ip, + ARRAY_SIZE(tcf.dst_ip))) + continue; + /* for ipv6, 
mask is set for all sixteen bytes (4 words) */ + if (cfilter.n_proto == ETH_P_IPV6 && mask.dst_ip[3]) + if (memcmp(&cfilter.ip.v6.dst_ip6, &cf->ip.v6.dst_ip6, + sizeof(cfilter.ip.v6.src_ip6))) + continue; + if (mask.vlan_id) + if (cfilter.vlan_id != cf->vlan_id) + continue; + + hlist_del(&cf->cloud_node); + kfree(cf); + vf->num_cloud_filters--; + } + +err: + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DEL_CLOUD_FILTER, + aq_ret); +} + +/** + * i40e_vc_add_cloud_filter + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * + * This function adds a cloud filter programmed as TC filter for ADq + **/ +static int i40e_vc_add_cloud_filter(struct i40e_vf *vf, u8 *msg) +{ + struct virtchnl_filter *vcf = (struct virtchnl_filter *)msg; + struct virtchnl_l4_spec mask = vcf->mask.tcp_spec; + struct virtchnl_l4_spec tcf = vcf->data.tcp_spec; + struct i40e_cloud_filter *cfilter = NULL; + struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = NULL; + i40e_status aq_ret = 0; + int i, ret; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + if (!vf->adq_enabled) { + dev_info(&pf->pdev->dev, + "VF %d: ADq is not enabled, can't apply cloud filter\n", + vf->vf_id); + aq_ret = I40E_ERR_PARAM; + goto err; + } + + if (i40e_validate_cloud_filter(vf, vcf)) { + dev_info(&pf->pdev->dev, + "VF %d: Invalid input/s, can't apply cloud filter\n", + vf->vf_id); + aq_ret = I40E_ERR_PARAM; + goto err; + } + + cfilter = kzalloc(sizeof(*cfilter), GFP_KERNEL); + if (!cfilter) + return -ENOMEM; + + /* parse destination mac address */ + for (i = 0; i < ETH_ALEN; i++) + cfilter->dst_mac[i] = mask.dst_mac[i] & tcf.dst_mac[i]; + + /* parse source mac address */ + for (i = 0; i < ETH_ALEN; i++) + cfilter->src_mac[i] = mask.src_mac[i] & tcf.src_mac[i]; + + cfilter->vlan_id = mask.vlan_id & tcf.vlan_id; + cfilter->dst_port = mask.dst_port & tcf.dst_port; + cfilter->src_port = mask.src_port & tcf.src_port; + + switch (vcf->flow_type) { + case VIRTCHNL_TCP_V4_FLOW: + cfilter->n_proto = ETH_P_IP; + if (mask.dst_ip[0] & tcf.dst_ip[0]) + memcpy(&cfilter->ip.v4.dst_ip, tcf.dst_ip, + ARRAY_SIZE(tcf.dst_ip)); + else if (mask.src_ip[0] & tcf.dst_ip[0]) + memcpy(&cfilter->ip.v4.src_ip, tcf.src_ip, + ARRAY_SIZE(tcf.dst_ip)); + break; + case VIRTCHNL_TCP_V6_FLOW: + cfilter->n_proto = ETH_P_IPV6; + if (mask.dst_ip[3] & tcf.dst_ip[3]) + memcpy(&cfilter->ip.v6.dst_ip6, tcf.dst_ip, + sizeof(cfilter->ip.v6.dst_ip6)); + if (mask.src_ip[3] & tcf.src_ip[3]) + memcpy(&cfilter->ip.v6.src_ip6, tcf.src_ip, + sizeof(cfilter->ip.v6.src_ip6)); + break; + default: + /* TC filter can be configured based on different combinations + * and in this case IP is not a part of filter config + */ + dev_info(&pf->pdev->dev, "VF %d: Flow type not configured\n", + vf->vf_id); + } + + /* get the VSI to which the TC belongs to */ + vsi = pf->vsi[vf->ch[vcf->action_meta].vsi_idx]; + cfilter->seid = vsi->seid; + cfilter->flags = vcf->field_flags; + + /* Adding cloud filter programmed as TC filter */ + if (tcf.dst_port) + ret = i40e_add_del_cloud_filter_big_buf(vsi, cfilter, true); + else + ret = i40e_add_del_cloud_filter(vsi, cfilter, true); + if (ret) { + dev_err(&pf->pdev->dev, + "VF %d: Failed to add cloud filter, err %s aq_err %s\n", + vf->vf_id, i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); + goto err; + } + + INIT_HLIST_NODE(&cfilter->cloud_node); + hlist_add_head(&cfilter->cloud_node, &vf->cloud_filter_list); + vf->num_cloud_filters++; +err: + return 
i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_ADD_CLOUD_FILTER, + aq_ret); +} + +/** + * i40e_vc_add_qch_msg: Add queue channel and enable ADq + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + **/ +static int i40e_vc_add_qch_msg(struct i40e_vf *vf, u8 *msg) +{ + struct virtchnl_tc_info *tci = + (struct virtchnl_tc_info *)msg; + struct i40e_pf *pf = vf->pf; + struct i40e_link_status *ls = &pf->hw.phy.link_info; + int i, adq_request_qps = 0, speed = 0; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + /* ADq cannot be applied if spoof check is ON */ + if (vf->spoofchk) { + dev_err(&pf->pdev->dev, + "Spoof check is ON, turn it OFF to enable ADq\n"); + aq_ret = I40E_ERR_PARAM; + goto err; + } + + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADQ)) { + dev_err(&pf->pdev->dev, + "VF %d attempting to enable ADq, but hasn't properly negotiated that capability\n", + vf->vf_id); + aq_ret = I40E_ERR_PARAM; + goto err; + } + + /* max number of traffic classes for VF currently capped at 4 */ + if (!tci->num_tc || tci->num_tc > I40E_MAX_VF_VSI) { + dev_err(&pf->pdev->dev, + "VF %d trying to set %u TCs, valid range 1-4 TCs per VF\n", + vf->vf_id, tci->num_tc); + aq_ret = I40E_ERR_PARAM; + goto err; + } + + /* validate queues for each TC */ + for (i = 0; i < tci->num_tc; i++) + if (!tci->list[i].count || + tci->list[i].count > I40E_DEFAULT_QUEUES_PER_VF) { + dev_err(&pf->pdev->dev, + "VF %d: TC %d trying to set %u queues, valid range 1-4 queues per TC\n", + vf->vf_id, i, tci->list[i].count); + aq_ret = I40E_ERR_PARAM; + goto err; + } + + /* need Max VF queues but already have default number of queues */ + adq_request_qps = I40E_MAX_VF_QUEUES - I40E_DEFAULT_QUEUES_PER_VF; + + if (pf->queues_left < adq_request_qps) { + dev_err(&pf->pdev->dev, + "No queues left to allocate to VF %d\n", + vf->vf_id); + aq_ret = I40E_ERR_PARAM; + goto err; + } else { + /* we need to allocate max VF queues to enable ADq so as to + * make sure ADq enabled VF always gets back queues when it + * goes through a reset. + */ + vf->num_queue_pairs = I40E_MAX_VF_QUEUES; + } + + /* get link speed in MB to validate rate limit */ + switch (ls->link_speed) { + case VIRTCHNL_LINK_SPEED_100MB: + speed = SPEED_100; + break; + case VIRTCHNL_LINK_SPEED_1GB: + speed = SPEED_1000; + break; + case VIRTCHNL_LINK_SPEED_10GB: + speed = SPEED_10000; + break; + case VIRTCHNL_LINK_SPEED_20GB: + speed = SPEED_20000; + break; + case VIRTCHNL_LINK_SPEED_25GB: + speed = SPEED_25000; + break; + case VIRTCHNL_LINK_SPEED_40GB: + speed = SPEED_40000; + break; + default: + dev_err(&pf->pdev->dev, + "Cannot detect link speed\n"); + aq_ret = I40E_ERR_PARAM; + goto err; + } + + /* parse data from the queue channel info */ + vf->num_tc = tci->num_tc; + for (i = 0; i < vf->num_tc; i++) { + if (tci->list[i].max_tx_rate) { + if (tci->list[i].max_tx_rate > speed) { + dev_err(&pf->pdev->dev, + "Invalid max tx rate %llu specified for VF %d.", + tci->list[i].max_tx_rate, + vf->vf_id); + aq_ret = I40E_ERR_PARAM; + goto err; + } else { + vf->ch[i].max_tx_rate = + tci->list[i].max_tx_rate; + } + } + vf->ch[i].num_qps = tci->list[i].count; + } + + /* set this flag only after making sure all inputs are sane */ + vf->adq_enabled = true; + /* num_req_queues is set when user changes number of queues via ethtool + * and this causes issue for default VSI(which depends on this variable) + * when ADq is enabled, hence reset it. 
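[Illustrative aside, not part of the patch: i40e_vc_add_qch_msg() above converts the negotiated virtchnl link speed to Mbps and rejects any per-TC max_tx_rate that exceeds it. A standalone sketch of that check; the enum and function names are invented, and the Mbps values mirror the SPEED_* constants used in the hunk above.]

	#include <stdbool.h>

	/* Placeholder link-speed enum; the returned values are in Mbps. */
	enum link_speed { LS_100MB, LS_1GB, LS_10GB, LS_20GB, LS_25GB, LS_40GB, LS_UNKNOWN };

	int link_speed_mbps(enum link_speed ls)
	{
		switch (ls) {
		case LS_100MB: return 100;
		case LS_1GB:   return 1000;
		case LS_10GB:  return 10000;
		case LS_20GB:  return 20000;
		case LS_25GB:  return 25000;
		case LS_40GB:  return 40000;
		default:       return 0;	/* unknown speed: cannot validate a limit */
		}
	}

	/* A per-TC max_tx_rate (in Mbps) is accepted only if it fits under the link. */
	bool tx_rate_ok(unsigned long long max_tx_rate, enum link_speed ls)
	{
		int speed = link_speed_mbps(ls);

		return speed != 0 && max_tx_rate <= (unsigned long long)speed;
	}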
+ */ + vf->num_req_queues = 0; + + /* reset the VF in order to allocate resources */ + i40e_vc_notify_vf_reset(vf); + i40e_reset_vf(vf, false); + + return I40E_SUCCESS; + + /* send the response to the VF */ +err: + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_ENABLE_CHANNELS, + aq_ret); +} + +/** + * i40e_vc_del_qch_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + **/ +static int i40e_vc_del_qch_msg(struct i40e_vf *vf, u8 *msg) +{ + struct i40e_pf *pf = vf->pf; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + if (vf->adq_enabled) { + i40e_del_all_cloud_filters(vf); + i40e_del_qch(vf); + vf->adq_enabled = false; + vf->num_tc = 0; + dev_info(&pf->pdev->dev, + "Deleting Queue Channels and cloud filters for ADq on VF %d\n", + vf->vf_id); + } else { + dev_info(&pf->pdev->dev, "VF %d trying to delete queue channels but ADq isn't enabled\n", + vf->vf_id); + aq_ret = I40E_ERR_PARAM; + } + + /* reset the VF in order to allocate resources */ + i40e_vc_notify_vf_reset(vf); + i40e_reset_vf(vf, false); + + return I40E_SUCCESS; + +err: + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DISABLE_CHANNELS, + aq_ret); +} + +/** + * i40e_vc_process_vf_msg + * @pf: pointer to the PF structure + * @vf_id: source VF id + * @msg: pointer to the msg buffer + * @msglen: msg length + * @msghndl: msg handle + * + * called from the common aeq/arq handler to + * process request from VF + **/ +int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, + u32 v_retval, u8 *msg, u16 msglen) +{ + struct i40e_hw *hw = &pf->hw; + int local_vf_id = vf_id - (s16)hw->func_caps.vf_base_id; + struct i40e_vf *vf; + int ret; + + pf->vf_aq_requests++; + if (local_vf_id >= pf->num_alloc_vfs) + return -EINVAL; + vf = &(pf->vf[local_vf_id]); + + /* Check if VF is disabled. 
*/ + if (test_bit(I40E_VF_STATE_DISABLED, &vf->vf_states)) + return I40E_ERR_PARAM; + + /* perform basic checks on the msg */ + ret = virtchnl_vc_validate_vf_msg(&vf->vf_ver, v_opcode, msg, msglen); + + /* perform additional checks specific to this driver */ + if (v_opcode == VIRTCHNL_OP_CONFIG_RSS_KEY) { + struct virtchnl_rss_key *vrk = (struct virtchnl_rss_key *)msg; + + if (vrk->key_len != I40E_HKEY_ARRAY_SIZE) + ret = -EINVAL; + } else if (v_opcode == VIRTCHNL_OP_CONFIG_RSS_LUT) { + struct virtchnl_rss_lut *vrl = (struct virtchnl_rss_lut *)msg; + + if (vrl->lut_entries != I40E_VF_HLUT_ARRAY_SIZE) + ret = -EINVAL; + } + + if (ret) { + i40e_vc_send_resp_to_vf(vf, v_opcode, I40E_ERR_PARAM); + dev_err(&pf->pdev->dev, "Invalid message from VF %d, opcode %d, len %d\n", + local_vf_id, v_opcode, msglen); + switch (ret) { + case VIRTCHNL_ERR_PARAM: + return -EPERM; + default: + return -EINVAL; + } + } + + switch (v_opcode) { + case VIRTCHNL_OP_VERSION: + ret = i40e_vc_get_version_msg(vf, msg); + break; + case VIRTCHNL_OP_GET_VF_RESOURCES: + ret = i40e_vc_get_vf_resources_msg(vf, msg); + i40e_vc_notify_vf_link_state(vf); + break; + case VIRTCHNL_OP_RESET_VF: + i40e_vc_reset_vf_msg(vf); + ret = 0; + break; + case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE: + ret = i40e_vc_config_promiscuous_mode_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_CONFIG_VSI_QUEUES: + ret = i40e_vc_config_queues_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_CONFIG_IRQ_MAP: + ret = i40e_vc_config_irq_map_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_ENABLE_QUEUES: + ret = i40e_vc_enable_queues_msg(vf, msg, msglen); + i40e_vc_notify_vf_link_state(vf); + break; + case VIRTCHNL_OP_DISABLE_QUEUES: + ret = i40e_vc_disable_queues_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_ADD_ETH_ADDR: + ret = i40e_vc_add_mac_addr_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_DEL_ETH_ADDR: + ret = i40e_vc_del_mac_addr_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_ADD_VLAN: + ret = i40e_vc_add_vlan_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_DEL_VLAN: + ret = i40e_vc_remove_vlan_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_GET_STATS: + ret = i40e_vc_get_stats_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_IWARP: + ret = i40e_vc_iwarp_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_CONFIG_IWARP_IRQ_MAP: + ret = i40e_vc_iwarp_qvmap_msg(vf, msg, msglen, true); + break; + case VIRTCHNL_OP_RELEASE_IWARP_IRQ_MAP: + ret = i40e_vc_iwarp_qvmap_msg(vf, msg, msglen, false); + break; + case VIRTCHNL_OP_CONFIG_RSS_KEY: + ret = i40e_vc_config_rss_key(vf, msg, msglen); + break; + case VIRTCHNL_OP_CONFIG_RSS_LUT: + ret = i40e_vc_config_rss_lut(vf, msg, msglen); + break; + case VIRTCHNL_OP_GET_RSS_HENA_CAPS: + ret = i40e_vc_get_rss_hena(vf, msg, msglen); + break; + case VIRTCHNL_OP_SET_RSS_HENA: + ret = i40e_vc_set_rss_hena(vf, msg, msglen); + break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + ret = i40e_vc_enable_vlan_stripping(vf, msg, msglen); + break; + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: + ret = i40e_vc_disable_vlan_stripping(vf, msg, msglen); + break; + case VIRTCHNL_OP_REQUEST_QUEUES: + ret = i40e_vc_request_queues_msg(vf, msg, msglen); + break; + case VIRTCHNL_OP_ENABLE_CHANNELS: + ret = i40e_vc_add_qch_msg(vf, msg); + break; + case VIRTCHNL_OP_DISABLE_CHANNELS: + ret = i40e_vc_del_qch_msg(vf, msg); + break; + case VIRTCHNL_OP_ADD_CLOUD_FILTER: + ret = i40e_vc_add_cloud_filter(vf, msg); + break; + case VIRTCHNL_OP_DEL_CLOUD_FILTER: + ret = i40e_vc_del_cloud_filter(vf, msg); + break; + case VIRTCHNL_OP_UNKNOWN: + 
default: + dev_err(&pf->pdev->dev, "Unsupported opcode %d from VF %d\n", + v_opcode, local_vf_id); + ret = i40e_vc_send_resp_to_vf(vf, v_opcode, + I40E_ERR_NOT_IMPLEMENTED); + break; + } + + return ret; +} + +/** + * i40e_vc_process_vflr_event + * @pf: pointer to the PF structure + * + * called from the vlfr irq handler to + * free up VF resources and state variables + **/ +int i40e_vc_process_vflr_event(struct i40e_pf *pf) +{ + struct i40e_hw *hw = &pf->hw; + u32 reg, reg_idx, bit_idx; + struct i40e_vf *vf; + int vf_id; + + if (!test_bit(__I40E_VFLR_EVENT_PENDING, pf->state)) + return 0; + + /* Re-enable the VFLR interrupt cause here, before looking for which + * VF got reset. Otherwise, if another VF gets a reset while the + * first one is being processed, that interrupt will be lost, and + * that VF will be stuck in reset forever. + */ + reg = rd32(hw, I40E_PFINT_ICR0_ENA); + reg |= I40E_PFINT_ICR0_ENA_VFLR_MASK; + wr32(hw, I40E_PFINT_ICR0_ENA, reg); + i40e_flush(hw); + + clear_bit(__I40E_VFLR_EVENT_PENDING, pf->state); + for (vf_id = 0; vf_id < pf->num_alloc_vfs; vf_id++) { + reg_idx = (hw->func_caps.vf_base_id + vf_id) / 32; + bit_idx = (hw->func_caps.vf_base_id + vf_id) % 32; + /* read GLGEN_VFLRSTAT register to find out the flr VFs */ + vf = &pf->vf[vf_id]; + reg = rd32(hw, I40E_GLGEN_VFLRSTAT(reg_idx)); + if (reg & BIT(bit_idx)) + /* i40e_reset_vf will clear the bit in GLGEN_VFLRSTAT */ + i40e_reset_vf(vf, true); + } + + return 0; +} + +/** + * i40e_ndo_set_vf_mac + * @netdev: network interface device structure + * @vf_id: VF identifier + * @mac: mac address + * + * program VF mac address + **/ +int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_mac_filter *f; + struct i40e_vf *vf; + int ret = 0; + struct hlist_node *h; + int bkt; + u8 i; + + /* validate the request */ + if (vf_id >= pf->num_alloc_vfs) { + dev_err(&pf->pdev->dev, + "Invalid VF Identifier %d\n", vf_id); + ret = -EINVAL; + goto error_param; + } + + vf = &(pf->vf[vf_id]); + vsi = pf->vsi[vf->lan_vsi_idx]; + + /* When the VF is resetting wait until it is done. + * It can take up to 200 milliseconds, + * but wait for up to 300 milliseconds to be safe. + */ + for (i = 0; i < 15; i++) { + if (test_bit(I40E_VF_STATE_INIT, &vf->vf_states)) + break; + msleep(20); + } + if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states)) { + dev_err(&pf->pdev->dev, "VF %d still in reset. Try again.\n", + vf_id); + ret = -EAGAIN; + goto error_param; + } + + if (is_multicast_ether_addr(mac)) { + dev_err(&pf->pdev->dev, + "Invalid Ethernet address %pM for VF %d\n", mac, vf_id); + ret = -EINVAL; + goto error_param; + } + + /* Lock once because below invoked function add/del_filter requires + * mac_filter_hash_lock to be held + */ + spin_lock_bh(&vsi->mac_filter_hash_lock); + + /* delete the temporary mac address */ + if (!is_zero_ether_addr(vf->default_lan_addr.addr)) + i40e_del_mac_filter(vsi, vf->default_lan_addr.addr); + + /* Delete all the filters for this VSI - we're going to kill it + * anyway. 
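[Illustrative aside, not part of the patch: i40e_vc_process_vflr_event() above locates reset VFs by mapping each VF to a (register, bit) pair in the 32-bit GLGEN_VFLRSTAT words. A standalone sketch of that indexing; the names and the NUM_VFS bound are invented, and the driver additionally offsets vf_id by the function's vf_base_id before dividing.]

	#include <stdint.h>
	#include <stdio.h>

	#define NUM_VFS 48

	/* Hypothetical snapshot of the per-32-VF status words, zero-initialized. */
	uint32_t vflr_stat[(NUM_VFS + 31) / 32];

	/* Map each VF index to a (word, bit) pair and act on the VFs whose bit is set. */
	void scan_vflr(void)
	{
		for (unsigned int vf_id = 0; vf_id < NUM_VFS; vf_id++) {
			unsigned int reg_idx = vf_id / 32;	/* which 32-bit status word */
			unsigned int bit_idx = vf_id % 32;	/* which bit inside it      */

			if (vflr_stat[reg_idx] & (1u << bit_idx))
				printf("VF %u was reset by the hardware\n", vf_id);
		}
	}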
+ */ + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) + __i40e_del_filter(vsi, f); + + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + /* program mac filter */ + if (i40e_sync_vsi_filters(vsi)) { + dev_err(&pf->pdev->dev, "Unable to program ucast filters\n"); + ret = -EIO; + goto error_param; + } + ether_addr_copy(vf->default_lan_addr.addr, mac); + + if (is_zero_ether_addr(mac)) { + vf->pf_set_mac = false; + dev_info(&pf->pdev->dev, "Removing MAC on VF %d\n", vf_id); + } else { + vf->pf_set_mac = true; + dev_info(&pf->pdev->dev, "Setting MAC %pM on VF %d\n", + mac, vf_id); + } + + /* Force the VF driver stop so it has to reload with new MAC address */ + i40e_vc_disable_vf(vf); + dev_info(&pf->pdev->dev, "Reload the VF driver to make this change effective.\n"); + +error_param: + return ret; +} + +/** + * i40e_vsi_has_vlans - True if VSI has configured VLANs + * @vsi: pointer to the vsi + * + * Check if a VSI has configured any VLANs. False if we have a port VLAN or if + * we have no configured VLANs. Do not call while holding the + * mac_filter_hash_lock. + */ +static bool i40e_vsi_has_vlans(struct i40e_vsi *vsi) +{ + bool have_vlans; + + /* If we have a port VLAN, then the VSI cannot have any VLANs + * configured, as all MAC/VLAN filters will be assigned to the PVID. + */ + if (vsi->info.pvid) + return false; + + /* Since we don't have a PVID, we know that if the device is in VLAN + * mode it must be because of a VLAN filter configured on this VSI. + */ + spin_lock_bh(&vsi->mac_filter_hash_lock); + have_vlans = i40e_is_vsi_in_vlan(vsi); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + return have_vlans; +} + +/** + * i40e_ndo_set_vf_port_vlan + * @netdev: network interface device structure + * @vf_id: VF identifier + * @vlan_id: mac address + * @qos: priority setting + * @vlan_proto: vlan protocol + * + * program VF vlan id and/or qos + **/ +int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, + u16 vlan_id, u8 qos, __be16 vlan_proto) +{ + u16 vlanprio = vlan_id | (qos << I40E_VLAN_PRIORITY_SHIFT); + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi; + struct i40e_vf *vf; + int ret = 0; + + /* validate the request */ + if (vf_id >= pf->num_alloc_vfs) { + dev_err(&pf->pdev->dev, "Invalid VF Identifier %d\n", vf_id); + ret = -EINVAL; + goto error_pvid; + } + + if ((vlan_id > I40E_MAX_VLANID) || (qos > 7)) { + dev_err(&pf->pdev->dev, "Invalid VF Parameters\n"); + ret = -EINVAL; + goto error_pvid; + } + + if (vlan_proto != htons(ETH_P_8021Q)) { + dev_err(&pf->pdev->dev, "VF VLAN protocol is not supported\n"); + ret = -EPROTONOSUPPORT; + goto error_pvid; + } + + vf = &(pf->vf[vf_id]); + vsi = pf->vsi[vf->lan_vsi_idx]; + if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states)) { + dev_err(&pf->pdev->dev, "VF %d still in reset. Try again.\n", + vf_id); + ret = -EAGAIN; + goto error_pvid; + } + + if (le16_to_cpu(vsi->info.pvid) == vlanprio) + /* duplicate request, so just return success */ + goto error_pvid; + + if (i40e_vsi_has_vlans(vsi)) { + dev_err(&pf->pdev->dev, + "VF %d has already configured VLAN filters and the administrator is requesting a port VLAN override.\nPlease unload and reload the VF driver for this change to take effect.\n", + vf_id); + /* Administrator Error - knock the VF offline until he does + * the right thing by reconfiguring his network correctly + * and then reloading the VF driver. + */ + i40e_vc_disable_vf(vf); + /* During reset the VF got a new VSI, so refresh the pointer. 
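[Illustrative aside, not part of the patch: i40e_ndo_set_vf_port_vlan() above packs the VLAN id and QoS priority into a single 16-bit port-VLAN value (vlanprio = vlan_id | qos << 12). A standalone pack/unpack sketch; the macro values match I40E_VLAN_PRIORITY_SHIFT, I40E_VLAN_MASK and I40E_PRIORITY_MASK defined later in this patch, while the function names are invented.]

	#include <stdint.h>

	#define VLAN_PRIO_SHIFT 12	/* same shift as I40E_VLAN_PRIORITY_SHIFT */
	#define VLAN_VID_MASK   0x0FFFu
	#define VLAN_PRIO_MASK  0x7000u

	/* Pack a VLAN id plus QoS priority (0..7) into one 16-bit PVID value. */
	uint16_t pack_pvid(uint16_t vlan_id, uint8_t qos)
	{
		return (uint16_t)((vlan_id & VLAN_VID_MASK) |
				  ((uint16_t)qos << VLAN_PRIO_SHIFT));
	}

	/* Unpack the two fields again, mirroring how ivi->vlan and ivi->qos are read. */
	uint16_t pvid_vlan(uint16_t pvid) { return pvid & VLAN_VID_MASK; }
	uint8_t  pvid_qos(uint16_t pvid)  { return (uint8_t)((pvid & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT); }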
*/ + vsi = pf->vsi[vf->lan_vsi_idx]; + } + + /* Locked once because multiple functions below iterate list */ + spin_lock_bh(&vsi->mac_filter_hash_lock); + + /* Check for condition where there was already a port VLAN ID + * filter set and now it is being deleted by setting it to zero. + * Additionally check for the condition where there was a port + * VLAN but now there is a new and different port VLAN being set. + * Before deleting all the old VLAN filters we must add new ones + * with -1 (I40E_VLAN_ANY) or otherwise we're left with all our + * MAC addresses deleted. + */ + if ((!(vlan_id || qos) || + vlanprio != le16_to_cpu(vsi->info.pvid)) && + vsi->info.pvid) { + ret = i40e_add_vlan_all_mac(vsi, I40E_VLAN_ANY); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "add VF VLAN failed, ret=%d aq_err=%d\n", ret, + vsi->back->hw.aq.asq_last_status); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + goto error_pvid; + } + } + + if (vsi->info.pvid) { + /* remove all filters on the old VLAN */ + i40e_rm_vlan_all_mac(vsi, (le16_to_cpu(vsi->info.pvid) & + VLAN_VID_MASK)); + } + + spin_unlock_bh(&vsi->mac_filter_hash_lock); + if (vlan_id || qos) + ret = i40e_vsi_add_pvid(vsi, vlanprio); + else + i40e_vsi_remove_pvid(vsi); + spin_lock_bh(&vsi->mac_filter_hash_lock); + + if (vlan_id) { + dev_info(&pf->pdev->dev, "Setting VLAN %d, QOS 0x%x on VF %d\n", + vlan_id, qos, vf_id); + + /* add new VLAN filter for each MAC */ + ret = i40e_add_vlan_all_mac(vsi, vlan_id); + if (ret) { + dev_info(&vsi->back->pdev->dev, + "add VF VLAN failed, ret=%d aq_err=%d\n", ret, + vsi->back->hw.aq.asq_last_status); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + goto error_pvid; + } + + /* remove the previously added non-VLAN MAC filters */ + i40e_rm_vlan_all_mac(vsi, I40E_VLAN_ANY); + } + + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + /* Schedule the worker thread to take care of applying changes */ + i40e_service_event_schedule(vsi->back); + + if (ret) { + dev_err(&pf->pdev->dev, "Unable to update VF vsi context\n"); + goto error_pvid; + } + + /* The Port VLAN needs to be saved across resets the same as the + * default LAN MAC address. + */ + vf->port_vlan_id = le16_to_cpu(vsi->info.pvid); + ret = 0; + +error_pvid: + return ret; +} + +/** + * i40e_ndo_set_vf_bw + * @netdev: network interface device structure + * @vf_id: VF identifier + * @tx_rate: Tx rate + * + * configure VF Tx rate + **/ +int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, + int max_tx_rate) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_vsi *vsi; + struct i40e_vf *vf; + int ret = 0; + + /* validate the request */ + if (vf_id >= pf->num_alloc_vfs) { + dev_err(&pf->pdev->dev, "Invalid VF Identifier %d.\n", vf_id); + ret = -EINVAL; + goto error; + } + + if (min_tx_rate) { + dev_err(&pf->pdev->dev, "Invalid min tx rate (%d) (greater than 0) specified for VF %d.\n", + min_tx_rate, vf_id); + return -EINVAL; + } + + vf = &(pf->vf[vf_id]); + vsi = pf->vsi[vf->lan_vsi_idx]; + if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states)) { + dev_err(&pf->pdev->dev, "VF %d still in reset. 
Try again.\n", + vf_id); + ret = -EAGAIN; + goto error; + } + + ret = i40e_set_bw_limit(vsi, vsi->seid, max_tx_rate); + if (ret) + goto error; + + vf->tx_rate = max_tx_rate; +error: + return ret; +} + +/** + * i40e_ndo_get_vf_config + * @netdev: network interface device structure + * @vf_id: VF identifier + * @ivi: VF configuration structure + * + * return VF configuration + **/ +int i40e_ndo_get_vf_config(struct net_device *netdev, + int vf_id, struct ifla_vf_info *ivi) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_vf *vf; + int ret = 0; + + /* validate the request */ + if (vf_id >= pf->num_alloc_vfs) { + dev_err(&pf->pdev->dev, "Invalid VF Identifier %d\n", vf_id); + ret = -EINVAL; + goto error_param; + } + + vf = &(pf->vf[vf_id]); + /* first vsi is always the LAN vsi */ + vsi = pf->vsi[vf->lan_vsi_idx]; + if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states)) { + dev_err(&pf->pdev->dev, "VF %d still in reset. Try again.\n", + vf_id); + ret = -EAGAIN; + goto error_param; + } + + ivi->vf = vf_id; + + ether_addr_copy(ivi->mac, vf->default_lan_addr.addr); + + ivi->max_tx_rate = vf->tx_rate; + ivi->min_tx_rate = 0; + ivi->vlan = le16_to_cpu(vsi->info.pvid) & I40E_VLAN_MASK; + ivi->qos = (le16_to_cpu(vsi->info.pvid) & I40E_PRIORITY_MASK) >> + I40E_VLAN_PRIORITY_SHIFT; + if (vf->link_forced == false) + ivi->linkstate = IFLA_VF_LINK_STATE_AUTO; + else if (vf->link_up == true) + ivi->linkstate = IFLA_VF_LINK_STATE_ENABLE; + else + ivi->linkstate = IFLA_VF_LINK_STATE_DISABLE; + ivi->spoofchk = vf->spoofchk; + ivi->trusted = vf->trusted; + ret = 0; + +error_param: + return ret; +} + +/** + * i40e_ndo_set_vf_link_state + * @netdev: network interface device structure + * @vf_id: VF identifier + * @link: required link state + * + * Set the link state of a specified VF, regardless of physical link state + **/ +int i40e_ndo_set_vf_link_state(struct net_device *netdev, int vf_id, int link) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct virtchnl_pf_event pfe; + struct i40e_hw *hw = &pf->hw; + struct i40e_vf *vf; + int abs_vf_id; + int ret = 0; + + /* validate the request */ + if (vf_id >= pf->num_alloc_vfs) { + dev_err(&pf->pdev->dev, "Invalid VF Identifier %d\n", vf_id); + ret = -EINVAL; + goto error_out; + } + + vf = &pf->vf[vf_id]; + abs_vf_id = vf->vf_id + hw->func_caps.vf_base_id; + + pfe.event = VIRTCHNL_EVENT_LINK_CHANGE; + pfe.severity = PF_EVENT_SEVERITY_INFO; + + switch (link) { + case IFLA_VF_LINK_STATE_AUTO: + vf->link_forced = false; + pfe.event_data.link_event.link_status = + pf->hw.phy.link_info.link_info & I40E_AQ_LINK_UP; + pfe.event_data.link_event.link_speed = + (enum virtchnl_link_speed) + pf->hw.phy.link_info.link_speed; + break; + case IFLA_VF_LINK_STATE_ENABLE: + vf->link_forced = true; + vf->link_up = true; + pfe.event_data.link_event.link_status = true; + pfe.event_data.link_event.link_speed = I40E_LINK_SPEED_40GB; + break; + case IFLA_VF_LINK_STATE_DISABLE: + vf->link_forced = true; + vf->link_up = false; + pfe.event_data.link_event.link_status = false; + pfe.event_data.link_event.link_speed = 0; + break; + default: + ret = -EINVAL; + goto error_out; + } + /* Notify the VF of its new link state */ + i40e_aq_send_msg_to_vf(hw, abs_vf_id, VIRTCHNL_OP_EVENT, + 0, (u8 *)&pfe, sizeof(pfe), NULL); + +error_out: + return ret; +} + +/** + * i40e_ndo_set_vf_spoofchk + * @netdev: network interface device structure + * @vf_id: VF identifier + * 
@enable: flag to enable or disable feature + * + * Enable or disable VF spoof checking + **/ +int i40e_ndo_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool enable) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_vsi_context ctxt; + struct i40e_hw *hw = &pf->hw; + struct i40e_vf *vf; + int ret = 0; + + /* validate the request */ + if (vf_id >= pf->num_alloc_vfs) { + dev_err(&pf->pdev->dev, "Invalid VF Identifier %d\n", vf_id); + ret = -EINVAL; + goto out; + } + + vf = &(pf->vf[vf_id]); + if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states)) { + dev_err(&pf->pdev->dev, "VF %d still in reset. Try again.\n", + vf_id); + ret = -EAGAIN; + goto out; + } + + if (enable == vf->spoofchk) + goto out; + + vf->spoofchk = enable; + memset(&ctxt, 0, sizeof(ctxt)); + ctxt.seid = pf->vsi[vf->lan_vsi_idx]->seid; + ctxt.pf_num = pf->hw.pf_id; + ctxt.info.valid_sections = cpu_to_le16(I40E_AQ_VSI_PROP_SECURITY_VALID); + if (enable) + ctxt.info.sec_flags |= (I40E_AQ_VSI_SEC_FLAG_ENABLE_VLAN_CHK | + I40E_AQ_VSI_SEC_FLAG_ENABLE_MAC_CHK); + ret = i40e_aq_update_vsi_params(hw, &ctxt, NULL); + if (ret) { + dev_err(&pf->pdev->dev, "Error %d updating VSI parameters\n", + ret); + ret = -EIO; + } +out: + return ret; +} + +/** + * i40e_ndo_set_vf_trust + * @netdev: network interface device structure of the pf + * @vf_id: VF identifier + * @setting: trust setting + * + * Enable or disable VF trust setting + **/ +int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_pf *pf = np->vsi->back; + struct i40e_vf *vf; + int ret = 0; + + /* validate the request */ + if (vf_id >= pf->num_alloc_vfs) { + dev_err(&pf->pdev->dev, "Invalid VF Identifier %d\n", vf_id); + return -EINVAL; + } + + if (pf->flags & I40E_FLAG_MFP_ENABLED) { + dev_err(&pf->pdev->dev, "Trusted VF not supported in MFP mode.\n"); + return -EINVAL; + } + + vf = &pf->vf[vf_id]; + + if (setting == vf->trusted) + goto out; + + vf->trusted = setting; + i40e_vc_disable_vf(vf); + dev_info(&pf->pdev->dev, "VF %u is now %strusted\n", + vf_id, setting ? "" : "un"); + + if (vf->adq_enabled) { + if (!vf->trusted) { + dev_info(&pf->pdev->dev, + "VF %u no longer Trusted, deleting all cloud filters\n", + vf_id); + i40e_del_all_cloud_filters(vf); + } + } + +out: + return ret; +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h new file mode 100644 index 0000000..57f727b --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/******************************************************************************* + * + * Intel Ethernet Controller XL710 Family Linux Driver + * Copyright(c) 2013 - 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . 
+ * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Contact Information: + * e1000-devel Mailing List + * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 + * + ******************************************************************************/ + +#ifndef _I40E_VIRTCHNL_PF_H_ +#define _I40E_VIRTCHNL_PF_H_ + +#include "i40e.h" + +#define I40E_MAX_VLANID 4095 + +#define I40E_VIRTCHNL_SUPPORTED_QTYPES 2 + +#define I40E_DEFAULT_NUM_MDD_EVENTS_ALLOWED 3 +#define I40E_DEFAULT_NUM_INVALID_MSGS_ALLOWED 10 + +#define I40E_VLAN_PRIORITY_SHIFT 12 +#define I40E_VLAN_MASK 0xFFF +#define I40E_PRIORITY_MASK 0x7000 + +/* Various queue ctrls */ +enum i40e_queue_ctrl { + I40E_QUEUE_CTRL_UNKNOWN = 0, + I40E_QUEUE_CTRL_ENABLE, + I40E_QUEUE_CTRL_ENABLECHECK, + I40E_QUEUE_CTRL_DISABLE, + I40E_QUEUE_CTRL_DISABLECHECK, + I40E_QUEUE_CTRL_FASTDISABLE, + I40E_QUEUE_CTRL_FASTDISABLECHECK, +}; + +/* VF states */ +enum i40e_vf_states { + I40E_VF_STATE_INIT = 0, + I40E_VF_STATE_ACTIVE, + I40E_VF_STATE_IWARPENA, + I40E_VF_STATE_DISABLED, + I40E_VF_STATE_MC_PROMISC, + I40E_VF_STATE_UC_PROMISC, + I40E_VF_STATE_PRE_ENABLE, +}; + +/* VF capabilities */ +enum i40e_vf_capabilities { + I40E_VIRTCHNL_VF_CAP_PRIVILEGE = 0, + I40E_VIRTCHNL_VF_CAP_L2, + I40E_VIRTCHNL_VF_CAP_IWARP, +}; + +/* In ADq, max 4 VSI's can be allocated per VF including primary VF VSI. + * These variables are used to store indices, id's and number of queues + * for each VSI including that of primary VF VSI. Each Traffic class is + * termed as channel and each channel can in-turn have 4 queues which + * means max 16 queues overall per VF. + */ +struct i40evf_channel { + u16 vsi_idx; /* index in PF struct for all channel VSIs */ + u16 vsi_id; /* VSI ID used by firmware */ + u16 num_qps; /* number of queue pairs requested by user */ + u64 max_tx_rate; /* bandwidth rate allocation for VSIs */ +}; + +/* VF information structure */ +struct i40e_vf { + struct i40e_pf *pf; + + /* VF id in the PF space */ + s16 vf_id; + /* all VF vsis connect to the same parent */ + enum i40e_switch_element_types parent_type; + struct virtchnl_version_info vf_ver; + u32 driver_caps; /* reported by VF driver */ + + /* VF Port Extender (PE) stag if used */ + u16 stag; + + struct virtchnl_ether_addr default_lan_addr; + u16 port_vlan_id; + bool pf_set_mac; /* The VMM admin set the VF MAC address */ + bool trusted; + + /* VSI indices - actual VSI pointers are maintained in the PF structure + * When assigned, these will be non-zero, because VSI 0 is always + * the main LAN VSI for the PF. + */ + u16 lan_vsi_idx; /* index into PF struct */ + u16 lan_vsi_id; /* ID as used by firmware */ + + u8 num_queue_pairs; /* num of qps assigned to VF vsis */ + u8 num_req_queues; /* num of requested qps */ + u64 num_mdd_events; /* num of mdd events detected */ + /* num of continuous malformed or invalid msgs detected */ + u64 num_invalid_msgs; + u64 num_valid_msgs; /* num of valid msgs detected */ + + unsigned long vf_caps; /* vf's adv. 
capabilities */ + unsigned long vf_states; /* vf's runtime states */ + unsigned int tx_rate; /* Tx bandwidth limit in Mbps */ + bool link_forced; + bool link_up; /* only valid if VF link is forced */ + bool spoofchk; + u16 num_mac; + u16 num_vlan; + + /* ADq related variables */ + bool adq_enabled; /* flag to enable adq */ + u8 num_tc; + struct i40evf_channel ch[I40E_MAX_VF_VSI]; + struct hlist_head cloud_filter_list; + u16 num_cloud_filters; + + /* RDMA Client */ + struct virtchnl_iwarp_qvlist_info *qvlist_info; +}; + +void i40e_free_vfs(struct i40e_pf *pf); +int i40e_pci_sriov_configure(struct pci_dev *dev, int num_vfs); +int i40e_alloc_vfs(struct i40e_pf *pf, u16 num_alloc_vfs); +int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, + u32 v_retval, u8 *msg, u16 msglen); +int i40e_vc_process_vflr_event(struct i40e_pf *pf); +bool i40e_reset_vf(struct i40e_vf *vf, bool flr); +bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr); +void i40e_vc_notify_vf_reset(struct i40e_vf *vf); + +/* VF configuration related iplink handlers */ +int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac); +int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, + u16 vlan_id, u8 qos, __be16 vlan_proto); +int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate, + int max_tx_rate); +int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting); +int i40e_ndo_get_vf_config(struct net_device *netdev, + int vf_id, struct ifla_vf_info *ivi); +int i40e_ndo_set_vf_link_state(struct net_device *netdev, int vf_id, int link); +int i40e_ndo_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool enable); + +void i40e_vc_notify_link_state(struct i40e_pf *pf); +void i40e_vc_notify_reset(struct i40e_pf *pf); + +#endif /* _I40E_VIRTCHNL_PF_H_ */ diff --git a/drivers/net/ethernet/mellanox/Kconfig b/drivers/net/ethernet/mellanox/Kconfig new file mode 100644 index 0000000..872548c --- /dev/null +++ b/drivers/net/ethernet/mellanox/Kconfig @@ -0,0 +1,25 @@ +# +# Mellanox driver configuration +# + +config NET_VENDOR_MELLANOX + bool "Mellanox devices" + default y + depends on PCI || I2C + ---help--- + If you have a network (Ethernet or RDMA) device belonging to this + class, say Y. + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about Mellanox cards. If you say Y, you will be asked + for your specific card in the following questions. + +if NET_VENDOR_MELLANOX + +source "drivers/net/ethernet/mellanox/mlx4/Kconfig" +source "drivers/net/ethernet/mellanox/mlx5/core/Kconfig" +source "drivers/net/ethernet/mellanox/mlxsw/Kconfig" +source "drivers/net/ethernet/mellanox/mlxfw/Kconfig" + +endif # NET_VENDOR_MELLANOX diff --git a/drivers/net/ethernet/mellanox/Makefile b/drivers/net/ethernet/mellanox/Makefile new file mode 100644 index 0000000..016aa26 --- /dev/null +++ b/drivers/net/ethernet/mellanox/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Mellanox device drivers. 
+# + +obj-$(CONFIG_MLX4_CORE) += mlx4/ +obj-$(CONFIG_MLX5_CORE) += mlx5/core/ +obj-$(CONFIG_MLXSW_CORE) += mlxsw/ +obj-$(CONFIG_MLXFW) += mlxfw/ diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig new file mode 100644 index 0000000..36054e6 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig @@ -0,0 +1,48 @@ +# +# Mellanox driver configuration +# + +config MLX4_EN + tristate "Mellanox Technologies 1/10/40Gbit Ethernet support" + depends on MAY_USE_DEVLINK + depends on PCI + select MLX4_CORE + imply PTP_1588_CLOCK + ---help--- + This driver supports Mellanox Technologies ConnectX Ethernet + devices. + +config MLX4_EN_DCB + bool "Data Center Bridging (DCB) Support" + default y + depends on MLX4_EN && DCB + ---help--- + Say Y here if you want to use Data Center Bridging (DCB) in the + driver. + If set to N, will not be able to configure QoS and ratelimit attributes. + This flag is depended on the kernel's DCB support. + + If unsure, set to Y + +config MLX4_CORE + tristate + depends on PCI + default n + +config MLX4_DEBUG + bool "Verbose debugging output" if (MLX4_CORE && EXPERT) + depends on MLX4_CORE + default y + ---help--- + This option causes debugging code to be compiled into the + mlx4_core driver. The output can be turned on via the + debug_level module parameter (which can also be set after + the driver is loaded through sysfs). + +config MLX4_CORE_GEN2 + bool "Support for old gen2 Mellanox PCI IDs" if (MLX4_CORE) + depends on MLX4_CORE + default y + ---help--- + Say Y here if you want to use old gen2 Mellanox devices in the + driver. diff --git a/drivers/net/ethernet/mellanox/mlx4/Makefile b/drivers/net/ethernet/mellanox/mlx4/Makefile new file mode 100644 index 0000000..16b10d0 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MLX4_CORE) += mlx4_core.o + +mlx4_core-y := alloc.o catas.o cmd.o cq.o eq.o fw.o fw_qos.o icm.o intf.o \ + main.o mcg.o mr.o pd.o port.o profile.o qp.o reset.o sense.o \ + srq.o resource_tracker.o + +obj-$(CONFIG_MLX4_EN) += mlx4_en.o + +mlx4_en-y := en_main.o en_tx.o en_rx.o en_ethtool.o en_port.o en_cq.o \ + en_resources.o en_netdev.o en_selftest.o en_clock.o +mlx4_en-$(CONFIG_MLX4_EN_DCB) += en_dcb_nl.o diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c new file mode 100644 index 0000000..6dabd98 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "mlx4.h" + +u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap) +{ + u32 obj; + + spin_lock(&bitmap->lock); + + obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last); + if (obj >= bitmap->max) { + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) + & bitmap->mask; + obj = find_first_zero_bit(bitmap->table, bitmap->max); + } + + if (obj < bitmap->max) { + set_bit(obj, bitmap->table); + bitmap->last = (obj + 1); + if (bitmap->last == bitmap->max) + bitmap->last = 0; + obj |= bitmap->top; + } else + obj = -1; + + if (obj != -1) + --bitmap->avail; + + spin_unlock(&bitmap->lock); + + return obj; +} + +void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr) +{ + mlx4_bitmap_free_range(bitmap, obj, 1, use_rr); +} + +static unsigned long find_aligned_range(unsigned long *bitmap, + u32 start, u32 nbits, + int len, int align, u32 skip_mask) +{ + unsigned long end, i; + +again: + start = ALIGN(start, align); + + while ((start < nbits) && (test_bit(start, bitmap) || + (start & skip_mask))) + start += align; + + if (start >= nbits) + return -1; + + end = start+len; + if (end > nbits) + return -1; + + for (i = start + 1; i < end; i++) { + if (test_bit(i, bitmap) || ((u32)i & skip_mask)) { + start = i + 1; + goto again; + } + } + + return start; +} + +u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, + int align, u32 skip_mask) +{ + u32 obj; + + if (likely(cnt == 1 && align == 1 && !skip_mask)) + return mlx4_bitmap_alloc(bitmap); + + spin_lock(&bitmap->lock); + + obj = find_aligned_range(bitmap->table, bitmap->last, + bitmap->max, cnt, align, skip_mask); + if (obj >= bitmap->max) { + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) + & bitmap->mask; + obj = find_aligned_range(bitmap->table, 0, bitmap->max, + cnt, align, skip_mask); + } + + if (obj < bitmap->max) { + bitmap_set(bitmap->table, obj, cnt); + if (obj == bitmap->last) { + bitmap->last = (obj + cnt); + if (bitmap->last >= bitmap->max) + bitmap->last = 0; + } + obj |= bitmap->top; + } else + obj = -1; + + if (obj != -1) + bitmap->avail -= cnt; + + spin_unlock(&bitmap->lock); + + return obj; +} + +u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap) +{ + return bitmap->avail; +} + +static u32 mlx4_bitmap_masked_value(struct mlx4_bitmap *bitmap, u32 obj) +{ + return obj & (bitmap->max + bitmap->reserved_top - 1); +} + +void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt, + int use_rr) +{ + obj &= bitmap->max + bitmap->reserved_top - 1; + + spin_lock(&bitmap->lock); + if (!use_rr) { + bitmap->last = min(bitmap->last, obj); + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) + & bitmap->mask; + } + bitmap_clear(bitmap->table, obj, cnt); + bitmap->avail += cnt; + spin_unlock(&bitmap->lock); +} + +int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, + u32 reserved_bot, u32 reserved_top) +{ + /* num must be a power of 2 */ + if (num != roundup_pow_of_two(num)) 
+ return -EINVAL; + + bitmap->last = 0; + bitmap->top = 0; + bitmap->max = num - reserved_top; + bitmap->mask = mask; + bitmap->reserved_top = reserved_top; + bitmap->avail = num - reserved_top - reserved_bot; + bitmap->effective_len = bitmap->avail; + spin_lock_init(&bitmap->lock); + bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) * + sizeof(long), GFP_KERNEL); + if (!bitmap->table) + return -ENOMEM; + + bitmap_set(bitmap->table, 0, reserved_bot); + + return 0; +} + +void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap) +{ + kfree(bitmap->table); +} + +struct mlx4_zone_allocator { + struct list_head entries; + struct list_head prios; + u32 last_uid; + u32 mask; + /* protect the zone_allocator from concurrent accesses */ + spinlock_t lock; + enum mlx4_zone_alloc_flags flags; +}; + +struct mlx4_zone_entry { + struct list_head list; + struct list_head prio_list; + u32 uid; + struct mlx4_zone_allocator *allocator; + struct mlx4_bitmap *bitmap; + int use_rr; + int priority; + int offset; + enum mlx4_zone_flags flags; +}; + +struct mlx4_zone_allocator *mlx4_zone_allocator_create(enum mlx4_zone_alloc_flags flags) +{ + struct mlx4_zone_allocator *zones = kmalloc(sizeof(*zones), GFP_KERNEL); + + if (NULL == zones) + return NULL; + + INIT_LIST_HEAD(&zones->entries); + INIT_LIST_HEAD(&zones->prios); + spin_lock_init(&zones->lock); + zones->last_uid = 0; + zones->mask = 0; + zones->flags = flags; + + return zones; +} + +int mlx4_zone_add_one(struct mlx4_zone_allocator *zone_alloc, + struct mlx4_bitmap *bitmap, + u32 flags, + int priority, + int offset, + u32 *puid) +{ + u32 mask = mlx4_bitmap_masked_value(bitmap, (u32)-1); + struct mlx4_zone_entry *it; + struct mlx4_zone_entry *zone = kmalloc(sizeof(*zone), GFP_KERNEL); + + if (NULL == zone) + return -ENOMEM; + + zone->flags = flags; + zone->bitmap = bitmap; + zone->use_rr = (flags & MLX4_ZONE_USE_RR) ? 
MLX4_USE_RR : 0; + zone->priority = priority; + zone->offset = offset; + + spin_lock(&zone_alloc->lock); + + zone->uid = zone_alloc->last_uid++; + zone->allocator = zone_alloc; + + if (zone_alloc->mask < mask) + zone_alloc->mask = mask; + + list_for_each_entry(it, &zone_alloc->prios, prio_list) + if (it->priority >= priority) + break; + + if (&it->prio_list == &zone_alloc->prios || it->priority > priority) + list_add_tail(&zone->prio_list, &it->prio_list); + list_add_tail(&zone->list, &it->list); + + spin_unlock(&zone_alloc->lock); + + *puid = zone->uid; + + return 0; +} + +/* Should be called under a lock */ +static void __mlx4_zone_remove_one_entry(struct mlx4_zone_entry *entry) +{ + struct mlx4_zone_allocator *zone_alloc = entry->allocator; + + if (!list_empty(&entry->prio_list)) { + /* Check if we need to add an alternative node to the prio list */ + if (!list_is_last(&entry->list, &zone_alloc->entries)) { + struct mlx4_zone_entry *next = list_first_entry(&entry->list, + typeof(*next), + list); + + if (next->priority == entry->priority) + list_add_tail(&next->prio_list, &entry->prio_list); + } + + list_del(&entry->prio_list); + } + + list_del(&entry->list); + + if (zone_alloc->flags & MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP) { + u32 mask = 0; + struct mlx4_zone_entry *it; + + list_for_each_entry(it, &zone_alloc->prios, prio_list) { + u32 cur_mask = mlx4_bitmap_masked_value(it->bitmap, (u32)-1); + + if (mask < cur_mask) + mask = cur_mask; + } + zone_alloc->mask = mask; + } +} + +void mlx4_zone_allocator_destroy(struct mlx4_zone_allocator *zone_alloc) +{ + struct mlx4_zone_entry *zone, *tmp; + + spin_lock(&zone_alloc->lock); + + list_for_each_entry_safe(zone, tmp, &zone_alloc->entries, list) { + list_del(&zone->list); + list_del(&zone->prio_list); + kfree(zone); + } + + spin_unlock(&zone_alloc->lock); + kfree(zone_alloc); +} + +/* Should be called under a lock */ +static u32 __mlx4_alloc_from_zone(struct mlx4_zone_entry *zone, int count, + int align, u32 skip_mask, u32 *puid) +{ + u32 uid; + u32 res; + struct mlx4_zone_allocator *zone_alloc = zone->allocator; + struct mlx4_zone_entry *curr_node; + + res = mlx4_bitmap_alloc_range(zone->bitmap, count, + align, skip_mask); + + if (res != (u32)-1) { + res += zone->offset; + uid = zone->uid; + goto out; + } + + list_for_each_entry(curr_node, &zone_alloc->prios, prio_list) { + if (unlikely(curr_node->priority == zone->priority)) + break; + } + + if (zone->flags & MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO) { + struct mlx4_zone_entry *it = curr_node; + + list_for_each_entry_continue_reverse(it, &zone_alloc->entries, list) { + res = mlx4_bitmap_alloc_range(it->bitmap, count, + align, skip_mask); + if (res != (u32)-1) { + res += it->offset; + uid = it->uid; + goto out; + } + } + } + + if (zone->flags & MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO) { + struct mlx4_zone_entry *it = curr_node; + + list_for_each_entry_from(it, &zone_alloc->entries, list) { + if (unlikely(it == zone)) + continue; + + if (unlikely(it->priority != curr_node->priority)) + break; + + res = mlx4_bitmap_alloc_range(it->bitmap, count, + align, skip_mask); + if (res != (u32)-1) { + res += it->offset; + uid = it->uid; + goto out; + } + } + } + + if (zone->flags & MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO) { + if (list_is_last(&curr_node->prio_list, &zone_alloc->prios)) + goto out; + + curr_node = list_first_entry(&curr_node->prio_list, + typeof(*curr_node), + prio_list); + + list_for_each_entry_from(curr_node, &zone_alloc->entries, list) { + res = mlx4_bitmap_alloc_range(curr_node->bitmap, count, + align, 
skip_mask); + if (res != (u32)-1) { + res += curr_node->offset; + uid = curr_node->uid; + goto out; + } + } + } + +out: + if (NULL != puid && res != (u32)-1) + *puid = uid; + return res; +} + +/* Should be called under a lock */ +static void __mlx4_free_from_zone(struct mlx4_zone_entry *zone, u32 obj, + u32 count) +{ + mlx4_bitmap_free_range(zone->bitmap, obj - zone->offset, count, zone->use_rr); +} + +/* Should be called under a lock */ +static struct mlx4_zone_entry *__mlx4_find_zone_by_uid( + struct mlx4_zone_allocator *zones, u32 uid) +{ + struct mlx4_zone_entry *zone; + + list_for_each_entry(zone, &zones->entries, list) { + if (zone->uid == uid) + return zone; + } + + return NULL; +} + +struct mlx4_bitmap *mlx4_zone_get_bitmap(struct mlx4_zone_allocator *zones, u32 uid) +{ + struct mlx4_zone_entry *zone; + struct mlx4_bitmap *bitmap; + + spin_lock(&zones->lock); + + zone = __mlx4_find_zone_by_uid(zones, uid); + + bitmap = zone == NULL ? NULL : zone->bitmap; + + spin_unlock(&zones->lock); + + return bitmap; +} + +int mlx4_zone_remove_one(struct mlx4_zone_allocator *zones, u32 uid) +{ + struct mlx4_zone_entry *zone; + int res = 0; + + spin_lock(&zones->lock); + + zone = __mlx4_find_zone_by_uid(zones, uid); + + if (NULL == zone) { + res = -1; + goto out; + } + + __mlx4_zone_remove_one_entry(zone); + +out: + spin_unlock(&zones->lock); + kfree(zone); + + return res; +} + +/* Should be called under a lock */ +static struct mlx4_zone_entry *__mlx4_find_zone_by_uid_unique( + struct mlx4_zone_allocator *zones, u32 obj) +{ + struct mlx4_zone_entry *zone, *zone_candidate = NULL; + u32 dist = (u32)-1; + + /* Search for the smallest zone that this obj could be + * allocated from. This is done in order to handle + * situations when small bitmaps are allocated from bigger + * bitmaps (and the allocated space is marked as reserved in + * the bigger bitmap. 
+ */ + list_for_each_entry(zone, &zones->entries, list) { + if (obj >= zone->offset) { + u32 mobj = (obj - zone->offset) & zones->mask; + + if (mobj < zone->bitmap->max) { + u32 curr_dist = zone->bitmap->effective_len; + + if (curr_dist < dist) { + dist = curr_dist; + zone_candidate = zone; + } + } + } + } + + return zone_candidate; +} + +u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int count, + int align, u32 skip_mask, u32 *puid) +{ + struct mlx4_zone_entry *zone; + int res = -1; + + spin_lock(&zones->lock); + + zone = __mlx4_find_zone_by_uid(zones, uid); + + if (NULL == zone) + goto out; + + res = __mlx4_alloc_from_zone(zone, count, align, skip_mask, puid); + +out: + spin_unlock(&zones->lock); + + return res; +} + +u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones, u32 uid, u32 obj, u32 count) +{ + struct mlx4_zone_entry *zone; + int res = 0; + + spin_lock(&zones->lock); + + zone = __mlx4_find_zone_by_uid(zones, uid); + + if (NULL == zone) { + res = -1; + goto out; + } + + __mlx4_free_from_zone(zone, obj, count); + +out: + spin_unlock(&zones->lock); + + return res; +} + +u32 mlx4_zone_free_entries_unique(struct mlx4_zone_allocator *zones, u32 obj, u32 count) +{ + struct mlx4_zone_entry *zone; + int res; + + if (!(zones->flags & MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP)) + return -EFAULT; + + spin_lock(&zones->lock); + + zone = __mlx4_find_zone_by_uid_unique(zones, obj); + + if (NULL == zone) { + res = -1; + goto out; + } + + __mlx4_free_from_zone(zone, obj, count); + res = 0; + +out: + spin_unlock(&zones->lock); + + return res; +} + +static int mlx4_buf_direct_alloc(struct mlx4_dev *dev, int size, + struct mlx4_buf *buf) +{ + dma_addr_t t; + + buf->nbufs = 1; + buf->npages = 1; + buf->page_shift = get_order(size) + PAGE_SHIFT; + buf->direct.buf = + dma_zalloc_coherent(&dev->persist->pdev->dev, + size, &t, GFP_KERNEL); + if (!buf->direct.buf) + return -ENOMEM; + + buf->direct.map = t; + + while (t & ((1 << buf->page_shift) - 1)) { + --buf->page_shift; + buf->npages *= 2; + } + + return 0; +} + +/* Handling for queue buffers -- we allocate a bunch of memory and + * register it in a memory region at HCA virtual address 0. If the + * requested size is > max_direct, we split the allocation into + * multiple pages, so we don't require too much contiguous memory. 
+ */ +int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, + struct mlx4_buf *buf) +{ + if (size <= max_direct) { + return mlx4_buf_direct_alloc(dev, size, buf); + } else { + dma_addr_t t; + int i; + + buf->direct.buf = NULL; + buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE; + buf->npages = buf->nbufs; + buf->page_shift = PAGE_SHIFT; + buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), + GFP_KERNEL); + if (!buf->page_list) + return -ENOMEM; + + for (i = 0; i < buf->nbufs; ++i) { + buf->page_list[i].buf = + dma_zalloc_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, &t, GFP_KERNEL); + if (!buf->page_list[i].buf) + goto err_free; + + buf->page_list[i].map = t; + } + } + + return 0; + +err_free: + mlx4_buf_free(dev, size, buf); + + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(mlx4_buf_alloc); + +void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) +{ + if (buf->nbufs == 1) { + dma_free_coherent(&dev->persist->pdev->dev, size, + buf->direct.buf, buf->direct.map); + } else { + int i; + + for (i = 0; i < buf->nbufs; ++i) + if (buf->page_list[i].buf) + dma_free_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, + buf->page_list[i].buf, + buf->page_list[i].map); + kfree(buf->page_list); + } +} +EXPORT_SYMBOL_GPL(mlx4_buf_free); + +static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device) +{ + struct mlx4_db_pgdir *pgdir; + + pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL); + if (!pgdir) + return NULL; + + bitmap_fill(pgdir->order1, MLX4_DB_PER_PAGE / 2); + pgdir->bits[0] = pgdir->order0; + pgdir->bits[1] = pgdir->order1; + pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE, + &pgdir->db_dma, GFP_KERNEL); + if (!pgdir->db_page) { + kfree(pgdir); + return NULL; + } + + return pgdir; +} + +static int mlx4_alloc_db_from_pgdir(struct mlx4_db_pgdir *pgdir, + struct mlx4_db *db, int order) +{ + int o; + int i; + + for (o = order; o <= 1; ++o) { + i = find_first_bit(pgdir->bits[o], MLX4_DB_PER_PAGE >> o); + if (i < MLX4_DB_PER_PAGE >> o) + goto found; + } + + return -ENOMEM; + +found: + clear_bit(i, pgdir->bits[o]); + + i <<= o; + + if (o > order) + set_bit(i ^ 1, pgdir->bits[order]); + + db->u.pgdir = pgdir; + db->index = i; + db->db = pgdir->db_page + db->index; + db->dma = pgdir->db_dma + db->index * 4; + db->order = order; + + return 0; +} + +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_db_pgdir *pgdir; + int ret = 0; + + mutex_lock(&priv->pgdir_mutex); + + list_for_each_entry(pgdir, &priv->pgdir_list, list) + if (!mlx4_alloc_db_from_pgdir(pgdir, db, order)) + goto out; + + pgdir = mlx4_alloc_db_pgdir(&dev->persist->pdev->dev); + if (!pgdir) { + ret = -ENOMEM; + goto out; + } + + list_add(&pgdir->list, &priv->pgdir_list); + + /* This should never fail -- we just allocated an empty page: */ + WARN_ON(mlx4_alloc_db_from_pgdir(pgdir, db, order)); + +out: + mutex_unlock(&priv->pgdir_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_db_alloc); + +void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int o; + int i; + + mutex_lock(&priv->pgdir_mutex); + + o = db->order; + i = db->index; + + if (db->order == 0 && test_bit(i ^ 1, db->u.pgdir->order0)) { + clear_bit(i ^ 1, db->u.pgdir->order0); + ++o; + } + i >>= o; + set_bit(i, db->u.pgdir->bits[o]); + + if (bitmap_full(db->u.pgdir->order1, MLX4_DB_PER_PAGE / 2)) { + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + db->u.pgdir->db_page, db->u.pgdir->db_dma); 
+ list_del(&db->u.pgdir->list); + kfree(db->u.pgdir); + } + + mutex_unlock(&priv->pgdir_mutex); +} +EXPORT_SYMBOL_GPL(mlx4_db_free); + +int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, + int size) +{ + int err; + + err = mlx4_db_alloc(dev, &wqres->db, 1); + if (err) + return err; + + *wqres->db.db = 0; + + err = mlx4_buf_direct_alloc(dev, size, &wqres->buf); + if (err) + goto err_db; + + err = mlx4_mtt_init(dev, wqres->buf.npages, wqres->buf.page_shift, + &wqres->mtt); + if (err) + goto err_buf; + + err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf); + if (err) + goto err_mtt; + + return 0; + +err_mtt: + mlx4_mtt_cleanup(dev, &wqres->mtt); +err_buf: + mlx4_buf_free(dev, size, &wqres->buf); +err_db: + mlx4_db_free(dev, &wqres->db); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_alloc_hwq_res); + +void mlx4_free_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, + int size) +{ + mlx4_mtt_cleanup(dev, &wqres->mtt); + mlx4_buf_free(dev, size, &wqres->buf); + mlx4_db_free(dev, &wqres->db); +} +EXPORT_SYMBOL_GPL(mlx4_free_hwq_res); diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c new file mode 100644 index 0000000..e2b6b0c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/workqueue.h> +#include <linux/module.h> + +#include "mlx4.h" + +enum { + MLX4_CATAS_POLL_INTERVAL = 5 * HZ, +}; + + + +int mlx4_internal_err_reset = 1; +module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); +MODULE_PARM_DESC(internal_err_reset, + "Reset device on internal errors if non-zero (default 1)"); + +static int read_vendor_id(struct mlx4_dev *dev) +{ + u16 vendor_id = 0; + int ret; + + ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id); + if (ret) { + mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret); + return ret; + } + + if (vendor_id == 0xffff) { + mlx4_err(dev, "PCI can't be accessed to read vendor id\n"); + return -EINVAL; + } + + return 0; +} + +static int mlx4_reset_master(struct mlx4_dev *dev) +{ + int err = 0; + + if (mlx4_is_master(dev)) + mlx4_report_internal_err_comm_event(dev); + + if (!pci_channel_offline(dev->persist->pdev)) { + err = read_vendor_id(dev); + /* If PCI can't be accessed to read vendor ID we assume that its + * link was disabled and chip was already reset. + */ + if (err) + return 0; + + err = mlx4_reset(dev); + if (err) + mlx4_err(dev, "Fail to reset HCA\n"); + } + + return err; +} + +static int mlx4_reset_slave(struct mlx4_dev *dev) +{ +#define COM_CHAN_RST_REQ_OFFSET 0x10 +#define COM_CHAN_RST_ACK_OFFSET 0x08 + + u32 comm_flags; + u32 rst_req; + u32 rst_ack; + unsigned long end; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (pci_channel_offline(dev->persist->pdev)) + return 0; + + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + if (comm_flags == 0xffffffff) { + mlx4_err(dev, "VF reset is not needed\n"); + return 0; + } + + if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) { + mlx4_err(dev, "VF reset is not supported\n"); + return -EOPNOTSUPP; + } + + rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> + COM_CHAN_RST_REQ_OFFSET; + rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> + COM_CHAN_RST_ACK_OFFSET; + if (rst_req != rst_ack) { + mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n"); + return -EIO; + } + + rst_req ^= 1; + mlx4_warn(dev, "VF is sending reset request to Firmware\n"); + comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET; + __raw_writel((__force u32)cpu_to_be32(comm_flags), + (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS); + /* Make sure that our comm channel write doesn't + * get mixed in with writes from another CPU. + */ + mmiowb(); + + end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies; + while (time_before(jiffies, end)) { + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> + COM_CHAN_RST_ACK_OFFSET; + + /* Reading rst_req again since the communication channel can + * be reset at any time by the PF and all its bits will be + * set to zero. + */ + rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> + COM_CHAN_RST_REQ_OFFSET; + + if (rst_ack == rst_req) { + mlx4_warn(dev, "VF Reset succeed\n"); + return 0; + } + cond_resched(); + } + mlx4_err(dev, "Fail to send reset over the communication channel\n"); + return -ETIMEDOUT; +} + +int mlx4_comm_internal_err(u32 slave_read) +{ + return (u32)COMM_CHAN_EVENT_INTERNAL_ERR == + (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ?
1 : 0; +} + +void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) +{ + int err; + struct mlx4_dev *dev; + + if (!mlx4_internal_err_reset) + return; + + mutex_lock(&persist->device_state_mutex); + if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + goto out; + + dev = persist->dev; + mlx4_err(dev, "device is going to be reset\n"); + if (mlx4_is_slave(dev)) + err = mlx4_reset_slave(dev); + else + err = mlx4_reset_master(dev); + + if (!err) { + mlx4_err(dev, "device was reset successfully\n"); + } else { + /* EEH could have disabled the PCI channel during reset. That's + * recoverable and the PCI error flow will handle it. + */ + if (!pci_channel_offline(dev->persist->pdev)) + BUG_ON(1); + } + dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; + mutex_unlock(&persist->device_state_mutex); + + /* At that step HW was already reset, now notify clients */ + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); + mlx4_cmd_wake_completions(dev); + return; + +out: + mutex_unlock(&persist->device_state_mutex); +} + +static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) +{ + int err = 0; + + mlx4_enter_error_state(persist); + mutex_lock(&persist->interface_state_mutex); + if (persist->interface_state & MLX4_INTERFACE_STATE_UP && + !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) { + err = mlx4_restart_one(persist->pdev); + mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", + err); + } + mutex_unlock(&persist->interface_state_mutex); +} + +static void dump_err_buf(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + int i; + + mlx4_err(dev, "Internal error detected:\n"); + for (i = 0; i < priv->fw.catas_size; ++i) + mlx4_err(dev, " buf[%02x]: %08x\n", + i, swab32(readl(priv->catas_err.map + i))); +} + +static void poll_catas(struct timer_list *t) +{ + struct mlx4_priv *priv = from_timer(priv, t, catas_err.timer); + struct mlx4_dev *dev = &priv->dev; + u32 slave_read; + + if (mlx4_is_slave(dev)) { + slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); + if (mlx4_comm_internal_err(slave_read)) { + mlx4_warn(dev, "Internal error detected on the communication channel\n"); + goto internal_err; + } + } else if (readl(priv->catas_err.map)) { + dump_err_buf(dev); + goto internal_err; + } + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mlx4_warn(dev, "Internal error mark was detected on device\n"); + goto internal_err; + } + + mod_timer(&priv->catas_err.timer, + round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); + return; + +internal_err: + if (mlx4_internal_err_reset) + queue_work(dev->persist->catas_wq, &dev->persist->catas_work); +} + +static void catas_reset(struct work_struct *work) +{ + struct mlx4_dev_persistent *persist = + container_of(work, struct mlx4_dev_persistent, + catas_work); + + mlx4_handle_error_state(persist); +} + +void mlx4_start_catas_poll(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + phys_addr_t addr; + + INIT_LIST_HEAD(&priv->catas_err.list); + timer_setup(&priv->catas_err.timer, poll_catas, 0); + priv->catas_err.map = NULL; + + if (!mlx4_is_slave(dev)) { + addr = pci_resource_start(dev->persist->pdev, + priv->fw.catas_bar) + + priv->fw.catas_offset; + + priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); + if (!priv->catas_err.map) { + mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", + (unsigned long long)addr); + return; + } + } + + priv->catas_err.timer.expires = + round_jiffies(jiffies + 
MLX4_CATAS_POLL_INTERVAL); + add_timer(&priv->catas_err.timer); +} + +void mlx4_stop_catas_poll(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + del_timer_sync(&priv->catas_err.timer); + + if (priv->catas_err.map) { + iounmap(priv->catas_err.map); + priv->catas_err.map = NULL; + } + + if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION) + flush_workqueue(dev->persist->catas_wq); +} + +int mlx4_catas_init(struct mlx4_dev *dev) +{ + INIT_WORK(&dev->persist->catas_work, catas_reset); + dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health"); + if (!dev->persist->catas_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_catas_end(struct mlx4_dev *dev) +{ + if (dev->persist->catas_wq) { + destroy_workqueue(dev->persist->catas_wq); + dev->persist->catas_wq = NULL; + } +} diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c new file mode 100644 index 0000000..6a9086d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -0,0 +1,3431 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/pci.h> +#include <linux/errno.h> + +#include <linux/mlx4/cmd.h> +#include <linux/mlx4/device.h> +#include <linux/semaphore.h> +#include <rdma/ib_smi.h> +#include <linux/delay.h> +#include <linux/etherdevice.h> + +#include <asm/io.h> + +#include "mlx4.h" +#include "fw.h" +#include "fw_qos.h" +#include "mlx4_stats.h" + +#define CMD_POLL_TOKEN 0xffff +#define INBOX_MASK 0xffffffffffffff00ULL + +#define CMD_CHAN_VER 1 +#define CMD_CHAN_IF_REV 1 + +enum { + /* command completed successfully: */ + CMD_STAT_OK = 0x00, + /* Internal error (such as a bus error) occurred while processing command: */ + CMD_STAT_INTERNAL_ERR = 0x01, + /* Operation/command not supported or opcode modifier not supported: */ + CMD_STAT_BAD_OP = 0x02, + /* Parameter not supported or parameter out of range: */ + CMD_STAT_BAD_PARAM = 0x03, + /* System not enabled or bad system state: */ + CMD_STAT_BAD_SYS_STATE = 0x04, + /* Attempt to access reserved or unallocaterd resource: */ + CMD_STAT_BAD_RESOURCE = 0x05, + /* Requested resource is currently executing a command, or is otherwise busy: */ + CMD_STAT_RESOURCE_BUSY = 0x06, + /* Required capability exceeds device limits: */ + CMD_STAT_EXCEED_LIM = 0x08, + /* Resource is not in the appropriate state or ownership: */ + CMD_STAT_BAD_RES_STATE = 0x09, + /* Index out of range: */ + CMD_STAT_BAD_INDEX = 0x0a, + /* FW image corrupted: */ + CMD_STAT_BAD_NVMEM = 0x0b, + /* Error in ICM mapping (e.g. not enough auxiliary ICM pages to execute command): */ + CMD_STAT_ICM_ERROR = 0x0c, + /* Attempt to modify a QP/EE which is not in the presumed state: */ + CMD_STAT_BAD_QP_STATE = 0x10, + /* Bad segment parameters (Address/Size): */ + CMD_STAT_BAD_SEG_PARAM = 0x20, + /* Memory Region has Memory Windows bound to: */ + CMD_STAT_REG_BOUND = 0x21, + /* HCA local attached memory not present: */ + CMD_STAT_LAM_NOT_PRE = 0x22, + /* Bad management packet (silently discarded): */ + CMD_STAT_BAD_PKT = 0x30, + /* More outstanding CQEs in CQ than new CQ size: */ + CMD_STAT_BAD_SIZE = 0x40, + /* Multi Function device support required: */ + CMD_STAT_MULTI_FUNC_REQ = 0x50, +}; + +enum { + HCR_IN_PARAM_OFFSET = 0x00, + HCR_IN_MODIFIER_OFFSET = 0x08, + HCR_OUT_PARAM_OFFSET = 0x0c, + HCR_TOKEN_OFFSET = 0x14, + HCR_STATUS_OFFSET = 0x18, + + HCR_OPMOD_SHIFT = 12, + HCR_T_BIT = 21, + HCR_E_BIT = 22, + HCR_GO_BIT = 23 +}; + +enum { + GO_BIT_TIMEOUT_MSECS = 10000 +}; + +enum mlx4_vlan_transition { + MLX4_VLAN_TRANSITION_VST_VST = 0, + MLX4_VLAN_TRANSITION_VST_VGT = 1, + MLX4_VLAN_TRANSITION_VGT_VST = 2, + MLX4_VLAN_TRANSITION_VGT_VGT = 3, +}; + + +struct mlx4_cmd_context { + struct completion done; + int result; + int next; + u64 out_param; + u16 token; + u8 fw_status; +}; + +static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr_cmd *in_vhcr); + +static int mlx4_status_to_errno(u8 status) +{ + static const int trans_table[] = { + [CMD_STAT_INTERNAL_ERR] = -EIO, + [CMD_STAT_BAD_OP] = -EPERM, + [CMD_STAT_BAD_PARAM] = -EINVAL, + [CMD_STAT_BAD_SYS_STATE] = -ENXIO, + [CMD_STAT_BAD_RESOURCE] = -EBADF, + [CMD_STAT_RESOURCE_BUSY] = -EBUSY, + [CMD_STAT_EXCEED_LIM] = -ENOMEM, + [CMD_STAT_BAD_RES_STATE] = -EBADF, + [CMD_STAT_BAD_INDEX] = -EBADF, + [CMD_STAT_BAD_NVMEM] = -EFAULT, + [CMD_STAT_ICM_ERROR] = -ENFILE, + [CMD_STAT_BAD_QP_STATE] = -EINVAL, + [CMD_STAT_BAD_SEG_PARAM] = -EFAULT, + [CMD_STAT_REG_BOUND] = -EBUSY, + [CMD_STAT_LAM_NOT_PRE] = -EAGAIN, + [CMD_STAT_BAD_PKT] = -EINVAL, + [CMD_STAT_BAD_SIZE] = -ENOMEM, + [CMD_STAT_MULTI_FUNC_REQ] = -EACCES, + }; + + if (status >= ARRAY_SIZE(trans_table) || + (status != CMD_STAT_OK && trans_table[status] == 0)) + return -EIO; + + return
trans_table[status]; +} + +static u8 mlx4_errno_to_status(int errno) +{ + switch (errno) { + case -EPERM: + return CMD_STAT_BAD_OP; + case -EINVAL: + return CMD_STAT_BAD_PARAM; + case -ENXIO: + return CMD_STAT_BAD_SYS_STATE; + case -EBUSY: + return CMD_STAT_RESOURCE_BUSY; + case -ENOMEM: + return CMD_STAT_EXCEED_LIM; + case -ENFILE: + return CMD_STAT_ICM_ERROR; + default: + return CMD_STAT_INTERNAL_ERR; + } +} + +static int mlx4_internal_err_ret_value(struct mlx4_dev *dev, u16 op, + u8 op_modifier) +{ + switch (op) { + case MLX4_CMD_UNMAP_ICM: + case MLX4_CMD_UNMAP_ICM_AUX: + case MLX4_CMD_UNMAP_FA: + case MLX4_CMD_2RST_QP: + case MLX4_CMD_HW2SW_EQ: + case MLX4_CMD_HW2SW_CQ: + case MLX4_CMD_HW2SW_SRQ: + case MLX4_CMD_HW2SW_MPT: + case MLX4_CMD_CLOSE_HCA: + case MLX4_QP_FLOW_STEERING_DETACH: + case MLX4_CMD_FREE_RES: + case MLX4_CMD_CLOSE_PORT: + return CMD_STAT_OK; + + case MLX4_CMD_QP_ATTACH: + /* On Detach case return success */ + if (op_modifier == 0) + return CMD_STAT_OK; + return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + + default: + return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + } +} + +static int mlx4_closing_cmd_fatal_error(u16 op, u8 fw_status) +{ + /* Any error during the closing commands below is considered fatal */ + if (op == MLX4_CMD_CLOSE_HCA || + op == MLX4_CMD_HW2SW_EQ || + op == MLX4_CMD_HW2SW_CQ || + op == MLX4_CMD_2RST_QP || + op == MLX4_CMD_HW2SW_SRQ || + op == MLX4_CMD_SYNC_TPT || + op == MLX4_CMD_UNMAP_ICM || + op == MLX4_CMD_UNMAP_ICM_AUX || + op == MLX4_CMD_UNMAP_FA) + return 1; + /* Error on MLX4_CMD_HW2SW_MPT is fatal except when fw status equals + * CMD_STAT_REG_BOUND. + * This status indicates that memory region has memory windows bound to it + * which may result from invalid user space usage and is not fatal. + */ + if (op == MLX4_CMD_HW2SW_MPT && fw_status != CMD_STAT_REG_BOUND) + return 1; + return 0; +} + +static int mlx4_cmd_reset_flow(struct mlx4_dev *dev, u16 op, u8 op_modifier, + int err) +{ + /* Only if reset flow is really active return code is based on + * command, otherwise current error code is returned. + */ + if (mlx4_internal_err_reset) { + mlx4_enter_error_state(dev->persist); + err = mlx4_internal_err_ret_value(dev, op, op_modifier); + } + + return err; +} + +static int comm_pending(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u32 status = readl(&priv->mfunc.comm->slave_read); + + return (swab32(status) >> 31) != priv->cmd.comm_toggle; +} + +static int mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u32 val; + + /* To avoid writing to unknown addresses after the device state was + * changed to internal error and the function was rest, + * check the INTERNAL_ERROR flag which is updated under + * device_state_mutex lock. 
+ */ + mutex_lock(&dev->persist->device_state_mutex); + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mutex_unlock(&dev->persist->device_state_mutex); + return -EIO; + } + + priv->cmd.comm_toggle ^= 1; + val = param | (cmd << 16) | (priv->cmd.comm_toggle << 31); + __raw_writel((__force u32) cpu_to_be32(val), + &priv->mfunc.comm->slave_write); + mmiowb(); + mutex_unlock(&dev->persist->device_state_mutex); + return 0; +} + +static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param, + unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + unsigned long end; + int err = 0; + int ret_from_pending = 0; + + /* First, verify that the master reports correct status */ + if (comm_pending(dev)) { + mlx4_warn(dev, "Communication channel is not idle - my toggle is %d (cmd:0x%x)\n", + priv->cmd.comm_toggle, cmd); + return -EAGAIN; + } + + /* Write command */ + down(&priv->cmd.poll_sem); + if (mlx4_comm_cmd_post(dev, cmd, param)) { + /* Only in case the device state is INTERNAL_ERROR, + * mlx4_comm_cmd_post returns with an error + */ + err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + goto out; + } + + end = msecs_to_jiffies(timeout) + jiffies; + while (comm_pending(dev) && time_before(jiffies, end)) + cond_resched(); + ret_from_pending = comm_pending(dev); + if (ret_from_pending) { + /* check if the slave is trying to boot in the middle of + * FLR process. The only non-zero result in the RESET command + * is MLX4_DELAY_RESET_SLAVE*/ + if ((MLX4_COMM_CMD_RESET == cmd)) { + err = MLX4_DELAY_RESET_SLAVE; + goto out; + } else { + mlx4_warn(dev, "Communication channel command 0x%x timed out\n", + cmd); + err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + } + } + + if (err) + mlx4_enter_error_state(dev->persist); +out: + up(&priv->cmd.poll_sem); + return err; +} + +static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 vhcr_cmd, + u16 param, u16 op, unsigned long timeout) +{ + struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; + struct mlx4_cmd_context *context; + unsigned long end; + int err = 0; + + down(&cmd->event_sem); + + spin_lock(&cmd->context_lock); + BUG_ON(cmd->free_head < 0); + context = &cmd->context[cmd->free_head]; + context->token += cmd->token_mask + 1; + cmd->free_head = context->next; + spin_unlock(&cmd->context_lock); + + reinit_completion(&context->done); + + if (mlx4_comm_cmd_post(dev, vhcr_cmd, param)) { + /* Only in case the device state is INTERNAL_ERROR, + * mlx4_comm_cmd_post returns with an error + */ + err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + goto out; + } + + if (!wait_for_completion_timeout(&context->done, + msecs_to_jiffies(timeout))) { + mlx4_warn(dev, "communication channel command 0x%x (op=0x%x) timed out\n", + vhcr_cmd, op); + goto out_reset; + } + + err = context->result; + if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) { + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + vhcr_cmd, context->fw_status); + if (mlx4_closing_cmd_fatal_error(op, context->fw_status)) + goto out_reset; + } + + /* wait for comm channel ready + * this is necessary for prevention the race + * when switching between event to polling mode + * Skipping this section in case the device is in FATAL_ERROR state, + * In this state, no commands are sent via the comm channel until + * the device has returned from reset. 
+ */ + if (!(dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) { + end = msecs_to_jiffies(timeout) + jiffies; + while (comm_pending(dev) && time_before(jiffies, end)) + cond_resched(); + } + goto out; + +out_reset: + err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + mlx4_enter_error_state(dev->persist); +out: + spin_lock(&cmd->context_lock); + context->next = cmd->free_head; + cmd->free_head = context - cmd->context; + spin_unlock(&cmd->context_lock); + + up(&cmd->event_sem); + return err; +} + +int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, + u16 op, unsigned long timeout) +{ + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + + if (mlx4_priv(dev)->cmd.use_events) + return mlx4_comm_cmd_wait(dev, cmd, param, op, timeout); + return mlx4_comm_cmd_poll(dev, cmd, param, timeout); +} + +static int cmd_pending(struct mlx4_dev *dev) +{ + u32 status; + + if (pci_channel_offline(dev->persist->pdev)) + return -EIO; + + status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); + + return (status & swab32(1 << HCR_GO_BIT)) || + (mlx4_priv(dev)->cmd.toggle == + !!(status & swab32(1 << HCR_T_BIT))); +} + +static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, + u32 in_modifier, u8 op_modifier, u16 op, u16 token, + int event) +{ + struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; + u32 __iomem *hcr = cmd->hcr; + int ret = -EIO; + unsigned long end; + + mutex_lock(&dev->persist->device_state_mutex); + /* To avoid writing to unknown addresses after the device state was + * changed to internal error and the chip was reset, + * check the INTERNAL_ERROR flag which is updated under + * device_state_mutex lock. + */ + if (pci_channel_offline(dev->persist->pdev) || + (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + goto out; + } + + end = jiffies; + if (event) + end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS); + + while (cmd_pending(dev)) { + if (pci_channel_offline(dev->persist->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + goto out; + } + + if (time_after_eq(jiffies, end)) { + mlx4_err(dev, "%s:cmd_pending failed\n", __func__); + goto out; + } + cond_resched(); + } + + /* + * We use writel (instead of something like memcpy_toio) + * because writes of less than 32 bits to the HCR don't work + * (and some architectures such as ia64 implement memcpy_toio + * in terms of writeb). + */ + __raw_writel((__force u32) cpu_to_be32(in_param >> 32), hcr + 0); + __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), hcr + 1); + __raw_writel((__force u32) cpu_to_be32(in_modifier), hcr + 2); + __raw_writel((__force u32) cpu_to_be32(out_param >> 32), hcr + 3); + __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), hcr + 4); + __raw_writel((__force u32) cpu_to_be32(token << 16), hcr + 5); + + /* __raw_writel may not order writes. */ + wmb(); + + __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) | + (cmd->toggle << HCR_T_BIT) | + (event ? (1 << HCR_E_BIT) : 0) | + (op_modifier << HCR_OPMOD_SHIFT) | + op), hcr + 6); + + /* + * Make sure that our HCR writes don't get mixed in with + * writes from another CPU starting a FW command. 
+ */ + mmiowb(); + + cmd->toggle = cmd->toggle ^ 1; + + ret = 0; + +out: + if (ret) + mlx4_warn(dev, "Could not post command 0x%x: ret=%d, in_param=0x%llx, in_mod=0x%x, op_mod=0x%x\n", + op, ret, in_param, in_modifier, op_modifier); + mutex_unlock(&dev->persist->device_state_mutex); + + return ret; +} + +static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vhcr_cmd *vhcr = priv->mfunc.vhcr; + int ret; + + mutex_lock(&priv->cmd.slave_cmd_mutex); + + vhcr->in_param = cpu_to_be64(in_param); + vhcr->out_param = out_param ? cpu_to_be64(*out_param) : 0; + vhcr->in_modifier = cpu_to_be32(in_modifier); + vhcr->opcode = cpu_to_be16((((u16) op_modifier) << 12) | (op & 0xfff)); + vhcr->token = cpu_to_be16(CMD_POLL_TOKEN); + vhcr->status = 0; + vhcr->flags = !!(priv->cmd.use_events) << 6; + + if (mlx4_is_master(dev)) { + ret = mlx4_master_process_vhcr(dev, dev->caps.function, vhcr); + if (!ret) { + if (out_is_imm) { + if (out_param) + *out_param = + be64_to_cpu(vhcr->out_param); + else { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); + vhcr->status = CMD_STAT_BAD_PARAM; + } + } + ret = mlx4_status_to_errno(vhcr->status); + } + if (ret && + dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + ret = mlx4_internal_err_ret_value(dev, op, op_modifier); + } else { + ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0, op, + MLX4_COMM_TIME + timeout); + if (!ret) { + if (out_is_imm) { + if (out_param) + *out_param = + be64_to_cpu(vhcr->out_param); + else { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); + vhcr->status = CMD_STAT_BAD_PARAM; + } + } + ret = mlx4_status_to_errno(vhcr->status); + } else { + if (dev->persist->state & + MLX4_DEVICE_STATE_INTERNAL_ERROR) + ret = mlx4_internal_err_ret_value(dev, op, + op_modifier); + else + mlx4_err(dev, "failed execution of VHCR_POST command opcode 0x%x\n", op); + } + } + + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return ret; +} + +static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + void __iomem *hcr = priv->cmd.hcr; + int err = 0; + unsigned long end; + u32 stat; + + down(&priv->cmd.poll_sem); + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + /* + * Device is going through error recovery + * and cannot accept commands. + */ + err = mlx4_internal_err_ret_value(dev, op, op_modifier); + goto out; + } + + if (out_is_imm && !out_param) { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); + err = -EINVAL; + goto out; + } + + err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, + in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0); + if (err) + goto out_reset; + + end = msecs_to_jiffies(timeout) + jiffies; + while (cmd_pending(dev) && time_before(jiffies, end)) { + if (pci_channel_offline(dev->persist->pdev)) { + /* + * Device is going through error recovery + * and cannot accept commands. 
+ */ + err = -EIO; + goto out_reset; + } + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = mlx4_internal_err_ret_value(dev, op, op_modifier); + goto out; + } + + cond_resched(); + } + + if (cmd_pending(dev)) { + mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", + op); + err = -EIO; + goto out_reset; + } + + if (out_is_imm) + *out_param = + (u64) be32_to_cpu((__force __be32) + __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 | + (u64) be32_to_cpu((__force __be32) + __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4)); + stat = be32_to_cpu((__force __be32) + __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24; + err = mlx4_status_to_errno(stat); + if (err) { + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, stat); + if (mlx4_closing_cmd_fatal_error(op, stat)) + goto out_reset; + goto out; + } + +out_reset: + if (err) + err = mlx4_cmd_reset_flow(dev, op, op_modifier, err); +out: + up(&priv->cmd.poll_sem); + return err; +} + +void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_context *context = + &priv->cmd.context[token & priv->cmd.token_mask]; + + /* previously timed out command completing at long last */ + if (token != context->token) + return; + + context->fw_status = status; + context->result = mlx4_status_to_errno(status); + context->out_param = out_param; + + complete(&context->done); +} + +static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout) +{ + struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; + struct mlx4_cmd_context *context; + long ret_wait; + int err = 0; + + down(&cmd->event_sem); + + spin_lock(&cmd->context_lock); + BUG_ON(cmd->free_head < 0); + context = &cmd->context[cmd->free_head]; + context->token += cmd->token_mask + 1; + cmd->free_head = context->next; + spin_unlock(&cmd->context_lock); + + if (out_is_imm && !out_param) { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", + op); + err = -EINVAL; + goto out; + } + + reinit_completion(&context->done); + + err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, + in_modifier, op_modifier, op, context->token, 1); + if (err) + goto out_reset; + + if (op == MLX4_CMD_SENSE_PORT) { + ret_wait = + wait_for_completion_interruptible_timeout(&context->done, + msecs_to_jiffies(timeout)); + if (ret_wait < 0) { + context->fw_status = 0; + context->out_param = 0; + context->result = 0; + } + } else { + ret_wait = (long)wait_for_completion_timeout(&context->done, + msecs_to_jiffies(timeout)); + } + if (!ret_wait) { + mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", + op); + if (op == MLX4_CMD_NOP) { + err = -EBUSY; + goto out; + } else { + err = -EIO; + goto out_reset; + } + } + + err = context->result; + if (err) { + /* Since we do not want to have this error message always + * displayed at driver start when there are ConnectX2 HCAs + * on the host, we deprecate the error message for this + * specific command/input_mod/opcode_mod/fw-status to be debug. 
+ */ + if (op == MLX4_CMD_SET_PORT && + (in_modifier == 1 || in_modifier == 2) && + op_modifier == MLX4_SET_PORT_IB_OPCODE && + context->fw_status == CMD_STAT_BAD_SIZE) + mlx4_dbg(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); + else + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + err = mlx4_internal_err_ret_value(dev, op, op_modifier); + else if (mlx4_closing_cmd_fatal_error(op, context->fw_status)) + goto out_reset; + + goto out; + } + + if (out_is_imm) + *out_param = context->out_param; + +out_reset: + if (err) + err = mlx4_cmd_reset_flow(dev, op, op_modifier, err); +out: + spin_lock(&cmd->context_lock); + context->next = cmd->free_head; + cmd->free_head = context - cmd->context; + spin_unlock(&cmd->context_lock); + + up(&cmd->event_sem); + return err; +} + +int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout, int native) +{ + if (pci_channel_offline(dev->persist->pdev)) + return mlx4_cmd_reset_flow(dev, op, op_modifier, -EIO); + + if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { + int ret; + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + return mlx4_internal_err_ret_value(dev, op, + op_modifier); + down_read(&mlx4_priv(dev)->cmd.switch_sem); + if (mlx4_priv(dev)->cmd.use_events) + ret = mlx4_cmd_wait(dev, in_param, out_param, + out_is_imm, in_modifier, + op_modifier, op, timeout); + else + ret = mlx4_cmd_poll(dev, in_param, out_param, + out_is_imm, in_modifier, + op_modifier, op, timeout); + + up_read(&mlx4_priv(dev)->cmd.switch_sem); + return ret; + } + return mlx4_slave_cmd(dev, in_param, out_param, out_is_imm, + in_modifier, op_modifier, op, timeout); +} +EXPORT_SYMBOL_GPL(__mlx4_cmd); + + +int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + +static int mlx4_ACCESS_MEM(struct mlx4_dev *dev, u64 master_addr, + int slave, u64 slave_addr, + int size, int is_read) +{ + u64 in_param; + u64 out_param; + + if ((slave_addr & 0xfff) | (master_addr & 0xfff) | + (slave & ~0x7f) | (size & 0xff)) { + mlx4_err(dev, "Bad access mem params - slave_addr:0x%llx master_addr:0x%llx slave_id:%d size:%d\n", + slave_addr, master_addr, slave, size); + return -EINVAL; + } + + if (is_read) { + in_param = (u64) slave | slave_addr; + out_param = (u64) dev->caps.function | master_addr; + } else { + in_param = (u64) dev->caps.function | master_addr; + out_param = (u64) slave | slave_addr; + } + + return mlx4_cmd_imm(dev, in_param, &out_param, size, 0, + MLX4_CMD_ACCESS_MEM, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} + +static int query_pkey_block(struct mlx4_dev *dev, u8 port, u16 index, u16 *pkey, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox) +{ + struct ib_smp *in_mad = (struct ib_smp *)(inbox->buf); + struct ib_smp *out_mad = (struct ib_smp *)(outbox->buf); + int err; + int i; + + if (index & 0x1f) + return -EINVAL; + + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) + return err; + + for (i = 0; i < 32; ++i) + pkey[i] = be16_to_cpu(((__be16 *) out_mad->data)[i]); + + return err; +} + +static int get_full_pkey_table(struct mlx4_dev *dev, u8 port, u16 *table, + struct mlx4_cmd_mailbox *inbox, + 
struct mlx4_cmd_mailbox *outbox) +{ + int i; + int err; + + for (i = 0; i < dev->caps.pkey_table_len[port]; i += 32) { + err = query_pkey_block(dev, port, i, table + i, inbox, outbox); + if (err) + return err; + } + + return 0; +} +#define PORT_CAPABILITY_LOCATION_IN_SMP 20 +#define PORT_STATE_OFFSET 32 + +static enum ib_port_state vf_port_state(struct mlx4_dev *dev, int port, int vf) +{ + if (mlx4_get_slave_port_state(dev, vf, port) == SLAVE_PORT_UP) + return IB_PORT_ACTIVE; + else + return IB_PORT_DOWN; +} + +static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct ib_smp *smp = inbox->buf; + u32 index; + u8 port, slave_port; + u8 opcode_modifier; + u16 *table; + int err; + int vidx, pidx; + int network_view; + struct mlx4_priv *priv = mlx4_priv(dev); + struct ib_smp *outsmp = outbox->buf; + __be16 *outtab = (__be16 *)(outsmp->data); + __be32 slave_cap_mask; + __be64 slave_node_guid; + + slave_port = vhcr->in_modifier; + port = mlx4_slave_convert_port(dev, slave, slave_port); + + /* network-view bit is for driver use only, and should not be passed to FW */ + opcode_modifier = vhcr->op_modifier & ~0x8; /* clear netw view bit */ + network_view = !!(vhcr->op_modifier & 0x8); + + if (smp->base_version == 1 && + smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && + smp->class_version == 1) { + /* host view is paravirtualized */ + if (!network_view && smp->method == IB_MGMT_METHOD_GET) { + if (smp->attr_id == IB_SMP_ATTR_PKEY_TABLE) { + index = be32_to_cpu(smp->attr_mod); + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + table = kcalloc((dev->caps.pkey_table_len[port] / 32) + 1, + sizeof(*table) * 32, GFP_KERNEL); + + if (!table) + return -ENOMEM; + /* need to get the full pkey table because the paravirtualized + * pkeys may be scattered among several pkey blocks. 
+ */ + err = get_full_pkey_table(dev, port, table, inbox, outbox); + if (!err) { + for (vidx = index * 32; vidx < (index + 1) * 32; ++vidx) { + pidx = priv->virt2phys_pkey[slave][port - 1][vidx]; + outtab[vidx % 32] = cpu_to_be16(table[pidx]); + } + } + kfree(table); + return err; + } + if (smp->attr_id == IB_SMP_ATTR_PORT_INFO) { + /*get the slave specific caps:*/ + /*do the command */ + smp->attr_mod = cpu_to_be32(port); + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, + port, opcode_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + /* modify the response for slaves */ + if (!err && slave != mlx4_master_func_num(dev)) { + u8 *state = outsmp->data + PORT_STATE_OFFSET; + + *state = (*state & 0xf0) | vf_port_state(dev, port, slave); + slave_cap_mask = priv->mfunc.master.slave_state[slave].ib_cap_mask[port]; + memcpy(outsmp->data + PORT_CAPABILITY_LOCATION_IN_SMP, &slave_cap_mask, 4); + } + return err; + } + if (smp->attr_id == IB_SMP_ATTR_GUID_INFO) { + __be64 guid = mlx4_get_admin_guid(dev, slave, + port); + + /* set the PF admin guid to the FW/HW burned + * GUID, if it wasn't yet set + */ + if (slave == 0 && guid == 0) { + smp->attr_mod = 0; + err = mlx4_cmd_box(dev, + inbox->dma, + outbox->dma, + vhcr->in_modifier, + opcode_modifier, + vhcr->op, + MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) + return err; + mlx4_set_admin_guid(dev, + *(__be64 *)outsmp-> + data, slave, port); + } else { + memcpy(outsmp->data, &guid, 8); + } + + /* clean all other gids */ + memset(outsmp->data + 8, 0, 56); + return 0; + } + if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) { + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, + port, opcode_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + if (!err) { + slave_node_guid = mlx4_get_slave_node_guid(dev, slave); + memcpy(outsmp->data + 12, &slave_node_guid, 8); + } + return err; + } + } + } + + /* Non-privileged VFs are only allowed "host" view LID-routed 'Get' MADs. + * These are the MADs used by ib verbs (such as ib_query_gids). + */ + if (slave != mlx4_master_func_num(dev) && + !mlx4_vf_smi_enabled(dev, slave, port)) { + if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && + smp->method == IB_MGMT_METHOD_GET) || network_view) { + mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n", + slave, smp->mgmt_class, smp->method, + network_view ? "Network" : "Host", + be16_to_cpu(smp->attr_id)); + return -EPERM; + } + } + + return mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, opcode_modifier, + vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); +} + +static int mlx4_CMD_EPERM_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return -EPERM; +} + +int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u64 in_param; + u64 out_param; + int err; + + in_param = cmd->has_inbox ? (u64) inbox->dma : vhcr->in_param; + out_param = cmd->has_outbox ? 
(u64) outbox->dma : vhcr->out_param; + if (cmd->encode_slave_id) { + in_param &= 0xffffffffffffff00ll; + in_param |= slave; + } + + err = __mlx4_cmd(dev, in_param, &out_param, cmd->out_is_imm, + vhcr->in_modifier, vhcr->op_modifier, vhcr->op, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + if (cmd->out_is_imm) + vhcr->out_param = out_param; + + return err; +} + +static struct mlx4_cmd_info cmd_info[] = { + { + .opcode = MLX4_CMD_QUERY_FW, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_FW_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_HCA, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_QUERY_DEV_CAP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_DEV_CAP_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_FUNC_CAP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_FUNC_CAP_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_ADAPTER, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_INIT_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT_PORT_wrapper + }, + { + .opcode = MLX4_CMD_CLOSE_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CLOSE_PORT_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_PORT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_PORT_wrapper + }, + { + .opcode = MLX4_CMD_SET_PORT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_PORT_wrapper + }, + { + .opcode = MLX4_CMD_MAP_EQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MAP_EQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_EQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_EQ_wrapper + }, + { + .opcode = MLX4_CMD_HW_HEALTH_CHECK, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_NOP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_CONFIG_DEV, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CONFIG_DEV_wrapper + }, + { + .opcode = MLX4_CMD_ALLOC_RES, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_ALLOC_RES_wrapper + }, + { + .opcode = MLX4_CMD_FREE_RES, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_FREE_RES_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_MPT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = 
mlx4_SW2HW_MPT_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_MPT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_MPT_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_MPT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_MPT_wrapper + }, + { + .opcode = MLX4_CMD_READ_MTT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_WRITE_MTT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_WRITE_MTT_wrapper + }, + { + .opcode = MLX4_CMD_SYNC_TPT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_HW2SW_EQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_HW2SW_EQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_EQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_QUERY_EQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_CQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_CQ_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_CQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_CQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_CQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_CQ_wrapper + }, + { + .opcode = MLX4_CMD_MODIFY_CQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MODIFY_CQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_SRQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_SRQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_SRQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_ARM_SRQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_ARM_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_RST2INIT_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_RST2INIT_QP_wrapper + }, + { + .opcode = MLX4_CMD_INIT2INIT_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT2INIT_QP_wrapper + }, + { + .opcode = MLX4_CMD_INIT2RTR_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT2RTR_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTR2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = 
NULL, + .wrapper = mlx4_RTR2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTS2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_RTS2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQERR2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQERR2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_2ERR_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTS2SQD_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQD2SQD_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQD2SQD_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQD2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQD2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_2RST_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_2RST_QP_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_QP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_SUSPEND_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_UNSUSPEND_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GEN_QP_wrapper + }, + { + .opcode = MLX4_CMD_UPDATE_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_UPDATE_QP_wrapper + }, + { + .opcode = MLX4_CMD_GET_OP_REQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper, + }, + { + .opcode = MLX4_CMD_ALLOCATE_VPP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper, + }, + { + .opcode = MLX4_CMD_SET_VPORT_QOS, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper, + }, + { + .opcode = MLX4_CMD_CONF_SPECIAL_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, /* XXX verify: only demux can do this */ + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_MAD_IFC, + .has_inbox = true, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MAD_IFC_wrapper + }, + { + .opcode = MLX4_CMD_MAD_DEMUX, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_IF_STAT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_IF_STAT_wrapper + }, + { + .opcode = MLX4_CMD_ACCESS_REG, + 
.has_inbox = true, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_ACCESS_REG_wrapper, + }, + { + .opcode = MLX4_CMD_CONGESTION_CTRL_OPCODE, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper, + }, + /* Native multicast commands are not available for guests */ + { + .opcode = MLX4_CMD_QP_ATTACH, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QP_ATTACH_wrapper + }, + { + .opcode = MLX4_CMD_PROMISC, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_PROMISC_wrapper + }, + /* Ethernet specific commands */ + { + .opcode = MLX4_CMD_SET_VLAN_FLTR, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_VLAN_FLTR_wrapper + }, + { + .opcode = MLX4_CMD_SET_MCAST_FLTR, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_MCAST_FLTR_wrapper + }, + { + .opcode = MLX4_CMD_DUMP_ETH_STATS, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_DUMP_ETH_STATS_wrapper + }, + { + .opcode = MLX4_CMD_INFORM_FLR_DONE, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + /* flow steering commands */ + { + .opcode = MLX4_QP_FLOW_STEERING_ATTACH, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QP_FLOW_STEERING_ATTACH_wrapper + }, + { + .opcode = MLX4_QP_FLOW_STEERING_DETACH, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QP_FLOW_STEERING_DETACH_wrapper + }, + { + .opcode = MLX4_FLOW_STEERING_IB_UC_QP_RANGE, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, + { + .opcode = MLX4_CMD_VIRT_PORT_MAP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, +}; + +static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr_cmd *in_vhcr) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_info *cmd = NULL; + struct mlx4_vhcr_cmd *vhcr_cmd = in_vhcr ? 
in_vhcr : priv->mfunc.vhcr; + struct mlx4_vhcr *vhcr; + struct mlx4_cmd_mailbox *inbox = NULL; + struct mlx4_cmd_mailbox *outbox = NULL; + u64 in_param; + u64 out_param; + int ret = 0; + int i; + int err = 0; + + /* Create sw representation of Virtual HCR */ + vhcr = kzalloc(sizeof(struct mlx4_vhcr), GFP_KERNEL); + if (!vhcr) + return -ENOMEM; + + /* DMA in the vHCR */ + if (!in_vhcr) { + ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave, + priv->mfunc.master.slave_state[slave].vhcr_dma, + ALIGN(sizeof(struct mlx4_vhcr_cmd), + MLX4_ACCESS_MEM_ALIGN), 1); + if (ret) { + if (!(dev->persist->state & + MLX4_DEVICE_STATE_INTERNAL_ERROR)) + mlx4_err(dev, "%s: Failed reading vhcr ret: 0x%x\n", + __func__, ret); + kfree(vhcr); + return ret; + } + } + + /* Fill SW VHCR fields */ + vhcr->in_param = be64_to_cpu(vhcr_cmd->in_param); + vhcr->out_param = be64_to_cpu(vhcr_cmd->out_param); + vhcr->in_modifier = be32_to_cpu(vhcr_cmd->in_modifier); + vhcr->token = be16_to_cpu(vhcr_cmd->token); + vhcr->op = be16_to_cpu(vhcr_cmd->opcode) & 0xfff; + vhcr->op_modifier = (u8) (be16_to_cpu(vhcr_cmd->opcode) >> 12); + vhcr->e_bit = vhcr_cmd->flags & (1 << 6); + + /* Lookup command */ + for (i = 0; i < ARRAY_SIZE(cmd_info); ++i) { + if (vhcr->op == cmd_info[i].opcode) { + cmd = &cmd_info[i]; + break; + } + } + if (!cmd) { + mlx4_err(dev, "Unknown command:0x%x accepted from slave:%d\n", + vhcr->op, slave); + vhcr_cmd->status = CMD_STAT_BAD_PARAM; + goto out_status; + } + + /* Read inbox */ + if (cmd->has_inbox) { + vhcr->in_param &= INBOX_MASK; + inbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inbox)) { + vhcr_cmd->status = CMD_STAT_BAD_SIZE; + inbox = NULL; + goto out_status; + } + + ret = mlx4_ACCESS_MEM(dev, inbox->dma, slave, + vhcr->in_param, + MLX4_MAILBOX_SIZE, 1); + if (ret) { + if (!(dev->persist->state & + MLX4_DEVICE_STATE_INTERNAL_ERROR)) + mlx4_err(dev, "%s: Failed reading inbox (cmd:0x%x)\n", + __func__, cmd->opcode); + vhcr_cmd->status = CMD_STAT_INTERNAL_ERR; + goto out_status; + } + } + + /* Apply permission and bound checks if applicable */ + if (cmd->verify && cmd->verify(dev, slave, vhcr, inbox)) { + mlx4_warn(dev, "Command:0x%x from slave: %d failed protection checks for resource_id:%d\n", + vhcr->op, slave, vhcr->in_modifier); + vhcr_cmd->status = CMD_STAT_BAD_OP; + goto out_status; + } + + /* Allocate outbox */ + if (cmd->has_outbox) { + outbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outbox)) { + vhcr_cmd->status = CMD_STAT_BAD_SIZE; + outbox = NULL; + goto out_status; + } + } + + /* Execute the command! */ + if (cmd->wrapper) { + err = cmd->wrapper(dev, slave, vhcr, inbox, outbox, + cmd); + if (cmd->out_is_imm) + vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param); + } else { + in_param = cmd->has_inbox ? (u64) inbox->dma : + vhcr->in_param; + out_param = cmd->has_outbox ? 
(u64) outbox->dma : + vhcr->out_param; + err = __mlx4_cmd(dev, in_param, &out_param, + cmd->out_is_imm, vhcr->in_modifier, + vhcr->op_modifier, vhcr->op, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + if (cmd->out_is_imm) { + vhcr->out_param = out_param; + vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param); + } + } + + if (err) { + if (!(dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) { + if (vhcr->op == MLX4_CMD_ALLOC_RES && + (vhcr->in_modifier & 0xff) == RES_COUNTER && + err == -EDQUOT) + mlx4_dbg(dev, + "Unable to allocate counter for slave %d (%d)\n", + slave, err); + else + mlx4_warn(dev, "vhcr command:0x%x slave:%d failed with error:%d, status %d\n", + vhcr->op, slave, vhcr->errno, err); + } + vhcr_cmd->status = mlx4_errno_to_status(err); + goto out_status; + } + + + /* Write outbox if command completed successfully */ + if (cmd->has_outbox && !vhcr_cmd->status) { + ret = mlx4_ACCESS_MEM(dev, outbox->dma, slave, + vhcr->out_param, + MLX4_MAILBOX_SIZE, MLX4_CMD_WRAPPED); + if (ret) { + /* If we failed to write back the outbox after the + *command was successfully executed, we must fail this + * slave, as it is now in undefined state */ + if (!(dev->persist->state & + MLX4_DEVICE_STATE_INTERNAL_ERROR)) + mlx4_err(dev, "%s:Failed writing outbox\n", __func__); + goto out; + } + } + +out_status: + /* DMA back vhcr result */ + if (!in_vhcr) { + ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave, + priv->mfunc.master.slave_state[slave].vhcr_dma, + ALIGN(sizeof(struct mlx4_vhcr), + MLX4_ACCESS_MEM_ALIGN), + MLX4_CMD_WRAPPED); + if (ret) + mlx4_err(dev, "%s:Failed writing vhcr result\n", + __func__); + else if (vhcr->e_bit && + mlx4_GEN_EQE(dev, slave, &priv->mfunc.master.cmd_eqe)) + mlx4_warn(dev, "Failed to generate command completion eqe for slave %d\n", + slave); + } + +out: + kfree(vhcr); + mlx4_free_cmd_mailbox(dev, inbox); + mlx4_free_cmd_mailbox(dev, outbox); + return ret; +} + +static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv, + int slave, int port) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_vport_state *vp_admin; + struct mlx4_vf_immed_vlan_work *work; + struct mlx4_dev *dev = &(priv->dev); + int err; + int admin_vlan_ix = NO_INDX; + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + + if (vp_oper->state.default_vlan == vp_admin->default_vlan && + vp_oper->state.default_qos == vp_admin->default_qos && + vp_oper->state.vlan_proto == vp_admin->vlan_proto && + vp_oper->state.link_state == vp_admin->link_state && + vp_oper->state.qos_vport == vp_admin->qos_vport) + return 0; + + if (!(priv->mfunc.master.slave_state[slave].active && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP)) { + /* even if the UPDATE_QP command isn't supported, we still want + * to set this VF link according to the admin directive + */ + vp_oper->state.link_state = vp_admin->link_state; + return -1; + } + + mlx4_dbg(dev, "updating immediately admin params slave %d port %d\n", + slave, port); + mlx4_dbg(dev, "vlan %d QoS %d link down %d\n", + vp_admin->default_vlan, vp_admin->default_qos, + vp_admin->link_state); + + work = kzalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return -ENOMEM; + + if (vp_oper->state.default_vlan != vp_admin->default_vlan) { + if (MLX4_VGT != vp_admin->default_vlan) { + err = __mlx4_register_vlan(&priv->dev, port, + vp_admin->default_vlan, + &admin_vlan_ix); + if (err) { + kfree(work); + mlx4_warn(&priv->dev, + "No vlan resources slave %d, port %d\n", + 
slave, port); + return err; + } + } else { + admin_vlan_ix = NO_INDX; + } + work->flags |= MLX4_VF_IMMED_VLAN_FLAG_VLAN; + mlx4_dbg(&priv->dev, + "alloc vlan %d idx %d slave %d port %d\n", + (int)(vp_admin->default_vlan), + admin_vlan_ix, slave, port); + } + + /* save original vlan ix and vlan id */ + work->orig_vlan_id = vp_oper->state.default_vlan; + work->orig_vlan_ix = vp_oper->vlan_idx; + + /* handle new qos */ + if (vp_oper->state.default_qos != vp_admin->default_qos) + work->flags |= MLX4_VF_IMMED_VLAN_FLAG_QOS; + + if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_VLAN) + vp_oper->vlan_idx = admin_vlan_ix; + + vp_oper->state.default_vlan = vp_admin->default_vlan; + vp_oper->state.default_qos = vp_admin->default_qos; + vp_oper->state.vlan_proto = vp_admin->vlan_proto; + vp_oper->state.link_state = vp_admin->link_state; + vp_oper->state.qos_vport = vp_admin->qos_vport; + + if (vp_admin->link_state == IFLA_VF_LINK_STATE_DISABLE) + work->flags |= MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE; + + /* iterate over QPs owned by this slave, using UPDATE_QP */ + work->port = port; + work->slave = slave; + work->qos = vp_oper->state.default_qos; + work->qos_vport = vp_oper->state.qos_vport; + work->vlan_id = vp_oper->state.default_vlan; + work->vlan_ix = vp_oper->vlan_idx; + work->vlan_proto = vp_oper->state.vlan_proto; + work->priv = priv; + INIT_WORK(&work->work, mlx4_vf_immed_vlan_work_handler); + queue_work(priv->mfunc.master.comm_wq, &work->work); + + return 0; +} + +static void mlx4_set_default_port_qos(struct mlx4_dev *dev, int port) +{ + struct mlx4_qos_manager *port_qos_ctl; + struct mlx4_priv *priv = mlx4_priv(dev); + + port_qos_ctl = &priv->mfunc.master.qos_ctl[port]; + bitmap_zero(port_qos_ctl->priority_bm, MLX4_NUM_UP); + + /* Enable only default prio at PF init routine */ + set_bit(MLX4_DEFAULT_QOS_PRIO, port_qos_ctl->priority_bm); +} + +static void mlx4_allocate_port_vpps(struct mlx4_dev *dev, int port) +{ + int i; + int err; + int num_vfs; + u16 available_vpp; + u8 vpp_param[MLX4_NUM_UP]; + struct mlx4_qos_manager *port_qos; + struct mlx4_priv *priv = mlx4_priv(dev); + + err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param); + if (err) { + mlx4_info(dev, "Failed query available VPPs\n"); + return; + } + + port_qos = &priv->mfunc.master.qos_ctl[port]; + num_vfs = (available_vpp / + bitmap_weight(port_qos->priority_bm, MLX4_NUM_UP)); + + for (i = 0; i < MLX4_NUM_UP; i++) { + if (test_bit(i, port_qos->priority_bm)) + vpp_param[i] = num_vfs; + } + + err = mlx4_ALLOCATE_VPP_set(dev, port, vpp_param); + if (err) { + mlx4_info(dev, "Failed allocating VPPs\n"); + return; + } + + /* Query actual allocated VPP, just to make sure */ + err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param); + if (err) { + mlx4_info(dev, "Failed query available VPPs\n"); + return; + } + + port_qos->num_of_qos_vfs = num_vfs; + mlx4_dbg(dev, "Port %d Available VPPs %d\n", port, available_vpp); + + for (i = 0; i < MLX4_NUM_UP; i++) + mlx4_dbg(dev, "Port %d UP %d Allocated %d VPPs\n", port, i, + vpp_param[i]); +} + +static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave) +{ + int port, err; + struct mlx4_vport_state *vp_admin; + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_slave_state *slave_state = + &priv->mfunc.master.slave_state[slave]; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports( + &priv->dev, slave); + int min_port = find_first_bit(actv_ports.ports, + priv->dev.caps.num_ports) + 1; + int max_port = min_port - 1 + + bitmap_weight(actv_ports.ports, 
priv->dev.caps.num_ports); + + for (port = min_port; port <= max_port; port++) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + priv->mfunc.master.vf_oper[slave].smi_enabled[port] = + priv->mfunc.master.vf_admin[slave].enable_smi[port]; + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + if (vp_admin->vlan_proto != htons(ETH_P_8021AD) || + slave_state->vst_qinq_supported) { + vp_oper->state.vlan_proto = vp_admin->vlan_proto; + vp_oper->state.default_vlan = vp_admin->default_vlan; + vp_oper->state.default_qos = vp_admin->default_qos; + } + vp_oper->state.link_state = vp_admin->link_state; + vp_oper->state.mac = vp_admin->mac; + vp_oper->state.spoofchk = vp_admin->spoofchk; + vp_oper->state.tx_rate = vp_admin->tx_rate; + vp_oper->state.qos_vport = vp_admin->qos_vport; + vp_oper->state.guid = vp_admin->guid; + + if (MLX4_VGT != vp_admin->default_vlan) { + err = __mlx4_register_vlan(&priv->dev, port, + vp_admin->default_vlan, &(vp_oper->vlan_idx)); + if (err) { + vp_oper->vlan_idx = NO_INDX; + vp_oper->state.default_vlan = MLX4_VGT; + vp_oper->state.vlan_proto = htons(ETH_P_8021Q); + mlx4_warn(&priv->dev, + "No vlan resources slave %d, port %d\n", + slave, port); + return err; + } + mlx4_dbg(&priv->dev, "alloc vlan %d idx %d slave %d port %d\n", + (int)(vp_oper->state.default_vlan), + vp_oper->vlan_idx, slave, port); + } + if (vp_admin->spoofchk) { + vp_oper->mac_idx = __mlx4_register_mac(&priv->dev, + port, + vp_admin->mac); + if (0 > vp_oper->mac_idx) { + err = vp_oper->mac_idx; + vp_oper->mac_idx = NO_INDX; + mlx4_warn(&priv->dev, + "No mac resources slave %d, port %d\n", + slave, port); + return err; + } + mlx4_dbg(&priv->dev, "alloc mac %llx idx %d slave %d port %d\n", + vp_oper->state.mac, vp_oper->mac_idx, slave, port); + } + } + return 0; +} + +static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int slave) +{ + int port; + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports( + &priv->dev, slave); + int min_port = find_first_bit(actv_ports.ports, + priv->dev.caps.num_ports) + 1; + int max_port = min_port - 1 + + bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports); + + + for (port = min_port; port <= max_port; port++) { + if (!test_bit(port - 1, actv_ports.ports)) + continue; + priv->mfunc.master.vf_oper[slave].smi_enabled[port] = + MLX4_VF_SMI_DISABLED; + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + if (NO_INDX != vp_oper->vlan_idx) { + __mlx4_unregister_vlan(&priv->dev, + port, vp_oper->state.default_vlan); + vp_oper->vlan_idx = NO_INDX; + } + if (NO_INDX != vp_oper->mac_idx) { + __mlx4_unregister_mac(&priv->dev, port, vp_oper->state.mac); + vp_oper->mac_idx = NO_INDX; + } + } + return; +} + +static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, + u16 param, u8 toggle) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + u32 reply; + u8 is_going_down = 0; + int i; + unsigned long flags; + + slave_state[slave].comm_toggle ^= 1; + reply = (u32) slave_state[slave].comm_toggle << 31; + if (toggle != slave_state[slave].comm_toggle) { + mlx4_warn(dev, "Incorrect toggle %d from slave %d. 
*** MASTER STATE COMPROMISED ***\n", + toggle, slave); + goto reset_slave; + } + if (cmd == MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "Received reset from slave:%d\n", slave); + slave_state[slave].active = false; + slave_state[slave].old_vlan_api = false; + slave_state[slave].vst_qinq_supported = false; + mlx4_master_deactivate_admin_state(priv, slave); + for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) { + slave_state[slave].event_eq[i].eqn = -1; + slave_state[slave].event_eq[i].token = 0; + } + /*check if we are in the middle of FLR process, + if so return "retry" status to the slave*/ + if (MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) + goto inform_slave_state; + + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, slave); + + /* write the version in the event field */ + reply |= mlx4_comm_get_version(); + + goto reset_slave; + } + /*command from slave in the middle of FLR*/ + if (cmd != MLX4_COMM_CMD_RESET && + MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) { + mlx4_warn(dev, "slave:%d is Trying to run cmd(0x%x) in the middle of FLR\n", + slave, cmd); + return; + } + + switch (cmd) { + case MLX4_COMM_CMD_VHCR0: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_RESET) + goto reset_slave; + slave_state[slave].vhcr_dma = ((u64) param) << 48; + priv->mfunc.master.slave_state[slave].cookie = 0; + break; + case MLX4_COMM_CMD_VHCR1: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR0) + goto reset_slave; + slave_state[slave].vhcr_dma |= ((u64) param) << 32; + break; + case MLX4_COMM_CMD_VHCR2: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR1) + goto reset_slave; + slave_state[slave].vhcr_dma |= ((u64) param) << 16; + break; + case MLX4_COMM_CMD_VHCR_EN: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR2) + goto reset_slave; + slave_state[slave].vhcr_dma |= param; + if (mlx4_master_activate_admin_state(priv, slave)) + goto reset_slave; + slave_state[slave].active = true; + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_INIT, slave); + break; + case MLX4_COMM_CMD_VHCR_POST: + if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) && + (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) { + mlx4_warn(dev, "slave:%d is out of sync, cmd=0x%x, last command=0x%x, reset is needed\n", + slave, cmd, slave_state[slave].last_cmd); + goto reset_slave; + } + + mutex_lock(&priv->cmd.slave_cmd_mutex); + if (mlx4_master_process_vhcr(dev, slave, NULL)) { + mlx4_err(dev, "Failed processing vhcr for slave:%d, resetting slave\n", + slave); + mutex_unlock(&priv->cmd.slave_cmd_mutex); + goto reset_slave; + } + mutex_unlock(&priv->cmd.slave_cmd_mutex); + break; + default: + mlx4_warn(dev, "Bad comm cmd:%d from slave:%d\n", cmd, slave); + goto reset_slave; + } + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + if (!slave_state[slave].is_slave_going_down) + slave_state[slave].last_cmd = cmd; + else + is_going_down = 1; + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + if (is_going_down) { + mlx4_warn(dev, "Slave is going down aborting command(%d) executing from slave:%d\n", + cmd, slave); + return; + } + __raw_writel((__force u32) cpu_to_be32(reply), + &priv->mfunc.comm[slave].slave_read); + mmiowb(); + + return; + +reset_slave: + /* cleanup any slave resources */ + if (dev->persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_delete_all_resources_for_slave(dev, slave); + + if (cmd != MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "Turn on internal error to force reset, slave=%d, cmd=0x%x\n", + slave, cmd); + /* Turn on internal error letting 
slave reset itself immeditaly, + * otherwise it might take till timeout on command is passed + */ + reply |= ((u32)COMM_CHAN_EVENT_INTERNAL_ERR); + } + + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + if (!slave_state[slave].is_slave_going_down) + slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET; + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + /*with slave in the middle of flr, no need to clean resources again.*/ +inform_slave_state: + memset(&slave_state[slave].event_eq, 0, + sizeof(struct mlx4_slave_event_eq_info)); + __raw_writel((__force u32) cpu_to_be32(reply), + &priv->mfunc.comm[slave].slave_read); + wmb(); +} + +/* master command processing */ +void mlx4_master_comm_channel(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = + container_of(work, + struct mlx4_mfunc_master_ctx, + comm_work); + struct mlx4_mfunc *mfunc = + container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = + container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + __be32 *bit_vec; + u32 comm_cmd; + u32 vec; + int i, j, slave; + int toggle; + int served = 0; + int reported = 0; + u32 slt; + + bit_vec = master->comm_arm_bit_vector; + for (i = 0; i < COMM_CHANNEL_BIT_ARRAY_SIZE; i++) { + vec = be32_to_cpu(bit_vec[i]); + for (j = 0; j < 32; j++) { + if (!(vec & (1 << j))) + continue; + ++reported; + slave = (i * 32) + j; + comm_cmd = swab32(readl( + &mfunc->comm[slave].slave_write)); + slt = swab32(readl(&mfunc->comm[slave].slave_read)) + >> 31; + toggle = comm_cmd >> 31; + if (toggle != slt) { + if (master->slave_state[slave].comm_toggle + != slt) { + pr_info("slave %d out of sync. read toggle %d, state toggle %d. Resynching.\n", + slave, slt, + master->slave_state[slave].comm_toggle); + master->slave_state[slave].comm_toggle = + slt; + } + mlx4_master_do_cmd(dev, slave, + comm_cmd >> 16 & 0xff, + comm_cmd & 0xffff, toggle); + ++served; + } + } + } + + if (reported && reported != served) + mlx4_warn(dev, "Got command event with bitmask from %d slaves but %d were served\n", + reported, served); + + if (mlx4_ARM_COMM_CHANNEL(dev)) + mlx4_warn(dev, "Failed to arm comm channel events\n"); +} + +static int sync_toggles(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u32 wr_toggle; + u32 rd_toggle; + unsigned long end; + + wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)); + if (wr_toggle == 0xffffffff) + end = jiffies + msecs_to_jiffies(30000); + else + end = jiffies + msecs_to_jiffies(5000); + + while (time_before(jiffies, end)) { + rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)); + if (wr_toggle == 0xffffffff || rd_toggle == 0xffffffff) { + /* PCI might be offline */ + + /* If device removal has been requested, + * do not continue retrying. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_NOWAIT) { + mlx4_warn(dev, + "communication channel is offline\n"); + return -EIO; + } + + msleep(100); + wr_toggle = swab32(readl(&priv->mfunc.comm-> + slave_write)); + continue; + } + + if (rd_toggle >> 31 == wr_toggle >> 31) { + priv->cmd.comm_toggle = rd_toggle >> 31; + return 0; + } + + cond_resched(); + } + + /* + * we could reach here if for example the previous VM using this + * function misbehaved and left the channel with unsynced state. 
We + * should fix this here and give this VM a chance to use a properly + * synced channel + */ + mlx4_warn(dev, "recovering from previously mis-behaved VM\n"); + __raw_writel((__force u32) 0, &priv->mfunc.comm->slave_read); + __raw_writel((__force u32) 0, &priv->mfunc.comm->slave_write); + priv->cmd.comm_toggle = 0; + + return 0; +} + +int mlx4_multi_func_init(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state; + int i, j, err, port; + + if (mlx4_is_master(dev)) + priv->mfunc.comm = + ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.comm_bar) + + priv->fw.comm_base, MLX4_COMM_PAGESIZE); + else + priv->mfunc.comm = + ioremap(pci_resource_start(dev->persist->pdev, 2) + + MLX4_SLAVE_COMM_BASE, MLX4_COMM_PAGESIZE); + if (!priv->mfunc.comm) { + mlx4_err(dev, "Couldn't map communication vector\n"); + goto err_vhcr; + } + + if (mlx4_is_master(dev)) { + struct mlx4_vf_oper_state *vf_oper; + struct mlx4_vf_admin_state *vf_admin; + + priv->mfunc.master.slave_state = + kzalloc(dev->num_slaves * + sizeof(struct mlx4_slave_state), GFP_KERNEL); + if (!priv->mfunc.master.slave_state) + goto err_comm; + + priv->mfunc.master.vf_admin = + kzalloc(dev->num_slaves * + sizeof(struct mlx4_vf_admin_state), GFP_KERNEL); + if (!priv->mfunc.master.vf_admin) + goto err_comm_admin; + + priv->mfunc.master.vf_oper = + kzalloc(dev->num_slaves * + sizeof(struct mlx4_vf_oper_state), GFP_KERNEL); + if (!priv->mfunc.master.vf_oper) + goto err_comm_oper; + + for (i = 0; i < dev->num_slaves; ++i) { + vf_admin = &priv->mfunc.master.vf_admin[i]; + vf_oper = &priv->mfunc.master.vf_oper[i]; + s_state = &priv->mfunc.master.slave_state[i]; + s_state->last_cmd = MLX4_COMM_CMD_RESET; + s_state->vst_qinq_supported = false; + mutex_init(&priv->mfunc.master.gen_eqe_mutex[i]); + for (j = 0; j < MLX4_EVENT_TYPES_NUM; ++j) + s_state->event_eq[j].eqn = -1; + __raw_writel((__force u32) 0, + &priv->mfunc.comm[i].slave_write); + __raw_writel((__force u32) 0, + &priv->mfunc.comm[i].slave_read); + mmiowb(); + for (port = 1; port <= MLX4_MAX_PORTS; port++) { + struct mlx4_vport_state *admin_vport; + struct mlx4_vport_state *oper_vport; + + s_state->vlan_filter[port] = + kzalloc(sizeof(struct mlx4_vlan_fltr), + GFP_KERNEL); + if (!s_state->vlan_filter[port]) { + if (--port) + kfree(s_state->vlan_filter[port]); + goto err_slaves; + } + + admin_vport = &vf_admin->vport[port]; + oper_vport = &vf_oper->vport[port].state; + INIT_LIST_HEAD(&s_state->mcast_filters[port]); + admin_vport->default_vlan = MLX4_VGT; + oper_vport->default_vlan = MLX4_VGT; + admin_vport->qos_vport = + MLX4_VPP_DEFAULT_VPORT; + oper_vport->qos_vport = MLX4_VPP_DEFAULT_VPORT; + admin_vport->vlan_proto = htons(ETH_P_8021Q); + oper_vport->vlan_proto = htons(ETH_P_8021Q); + vf_oper->vport[port].vlan_idx = NO_INDX; + vf_oper->vport[port].mac_idx = NO_INDX; + mlx4_set_random_admin_guid(dev, i, port); + } + spin_lock_init(&s_state->lock); + } + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP) { + for (port = 1; port <= dev->caps.num_ports; port++) { + if (mlx4_is_eth(dev, port)) { + mlx4_set_default_port_qos(dev, port); + mlx4_allocate_port_vpps(dev, port); + } + } + } + + memset(&priv->mfunc.master.cmd_eqe, 0, sizeof(struct mlx4_eqe)); + priv->mfunc.master.cmd_eqe.type = MLX4_EVENT_TYPE_CMD; + INIT_WORK(&priv->mfunc.master.comm_work, + mlx4_master_comm_channel); + INIT_WORK(&priv->mfunc.master.slave_event_work, + mlx4_gen_slave_eqe); + INIT_WORK(&priv->mfunc.master.slave_flr_event_work, + mlx4_master_handle_slave_flr); 
+ spin_lock_init(&priv->mfunc.master.slave_state_lock); + spin_lock_init(&priv->mfunc.master.slave_eq.event_lock); + priv->mfunc.master.comm_wq = + create_singlethread_workqueue("mlx4_comm"); + if (!priv->mfunc.master.comm_wq) + goto err_slaves; + + if (mlx4_init_resource_tracker(dev)) + goto err_thread; + + } else { + err = sync_toggles(dev); + if (err) { + mlx4_err(dev, "Couldn't sync toggles\n"); + goto err_comm; + } + } + return 0; + +err_thread: + flush_workqueue(priv->mfunc.master.comm_wq); + destroy_workqueue(priv->mfunc.master.comm_wq); +err_slaves: + while (i--) { + for (port = 1; port <= MLX4_MAX_PORTS; port++) + kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]); + } + kfree(priv->mfunc.master.vf_oper); +err_comm_oper: + kfree(priv->mfunc.master.vf_admin); +err_comm_admin: + kfree(priv->mfunc.master.slave_state); +err_comm: + iounmap(priv->mfunc.comm); + priv->mfunc.comm = NULL; +err_vhcr: + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + priv->mfunc.vhcr, + priv->mfunc.vhcr_dma); + priv->mfunc.vhcr = NULL; + return -ENOMEM; +} + +int mlx4_cmd_init(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int flags = 0; + + if (!priv->cmd.initialized) { + init_rwsem(&priv->cmd.switch_sem); + mutex_init(&priv->cmd.slave_cmd_mutex); + sema_init(&priv->cmd.poll_sem, 1); + priv->cmd.use_events = 0; + priv->cmd.toggle = 1; + priv->cmd.initialized = 1; + flags |= MLX4_CMD_CLEANUP_STRUCT; + } + + if (!mlx4_is_slave(dev) && !priv->cmd.hcr) { + priv->cmd.hcr = ioremap(pci_resource_start(dev->persist->pdev, + 0) + MLX4_HCR_BASE, MLX4_HCR_SIZE); + if (!priv->cmd.hcr) { + mlx4_err(dev, "Couldn't map command register\n"); + goto err; + } + flags |= MLX4_CMD_CLEANUP_HCR; + } + + if (mlx4_is_mfunc(dev) && !priv->mfunc.vhcr) { + priv->mfunc.vhcr = dma_alloc_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, + &priv->mfunc.vhcr_dma, + GFP_KERNEL); + if (!priv->mfunc.vhcr) + goto err; + + flags |= MLX4_CMD_CLEANUP_VHCR; + } + + if (!priv->cmd.pool) { + priv->cmd.pool = dma_pool_create("mlx4_cmd", + &dev->persist->pdev->dev, + MLX4_MAILBOX_SIZE, + MLX4_MAILBOX_SIZE, 0); + if (!priv->cmd.pool) + goto err; + + flags |= MLX4_CMD_CLEANUP_POOL; + } + + return 0; + +err: + mlx4_cmd_cleanup(dev, flags); + return -ENOMEM; +} + +void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int slave; + u32 slave_read; + + /* If the comm channel has not yet been initialized, + * skip reporting the internal error event to all + * the communication channels. + */ + if (!priv->mfunc.comm) + return; + + /* Report an internal error event to all + * communication channels. + */ + for (slave = 0; slave < dev->num_slaves; slave++) { + slave_read = swab32(readl(&priv->mfunc.comm[slave].slave_read)); + slave_read |= (u32)COMM_CHAN_EVENT_INTERNAL_ERR; + __raw_writel((__force u32)cpu_to_be32(slave_read), + &priv->mfunc.comm[slave].slave_read); + /* Make sure that our comm channel write doesn't + * get mixed in with writes from another CPU. 
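mlx4_report_internal_err_comm_event() above applies a single read-modify-write to every slave's slave_read word: the 32-bit register is read as big-endian, converted to CPU order, OR'ed with the internal-error flag, and written back in big-endian so the slave's next poll sees the event. A user-space approximation of that byte-order handling, operating on an in-memory word instead of a mapped register and using a placeholder flag value:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>	/* htonl/ntohl stand in for the driver's be32 helpers */

#define INTERNAL_ERR_FLAG (1u << 17)	/* placeholder bit; the driver uses COMM_CHAN_EVENT_INTERNAL_ERR */

/* 'reg' holds a big-endian value, as the comm-channel registers do. */
static void set_internal_err(uint32_t *reg)
{
	uint32_t host = ntohl(*reg);	/* like swab32(readl(...)) on a little-endian host */

	host |= INTERNAL_ERR_FLAG;	/* mark the channel as broken */
	*reg = htonl(host);		/* like __raw_writel(cpu_to_be32(...), ...) */
}

int main(void)
{
	uint32_t reg = htonl(0x1234);

	set_internal_err(&reg);
	printf("0x%08x\n", ntohl(reg));	/* prints 0x00021234 */
	return 0;
}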
+ */ + mmiowb(); + } +} + +void mlx4_multi_func_cleanup(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, port; + + if (mlx4_is_master(dev)) { + flush_workqueue(priv->mfunc.master.comm_wq); + destroy_workqueue(priv->mfunc.master.comm_wq); + for (i = 0; i < dev->num_slaves; i++) { + for (port = 1; port <= MLX4_MAX_PORTS; port++) + kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]); + } + kfree(priv->mfunc.master.slave_state); + kfree(priv->mfunc.master.vf_admin); + kfree(priv->mfunc.master.vf_oper); + dev->num_slaves = 0; + } + + iounmap(priv->mfunc.comm); + priv->mfunc.comm = NULL; +} + +void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (priv->cmd.pool && (cleanup_mask & MLX4_CMD_CLEANUP_POOL)) { + dma_pool_destroy(priv->cmd.pool); + priv->cmd.pool = NULL; + } + + if (!mlx4_is_slave(dev) && priv->cmd.hcr && + (cleanup_mask & MLX4_CMD_CLEANUP_HCR)) { + iounmap(priv->cmd.hcr); + priv->cmd.hcr = NULL; + } + if (mlx4_is_mfunc(dev) && priv->mfunc.vhcr && + (cleanup_mask & MLX4_CMD_CLEANUP_VHCR)) { + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + priv->mfunc.vhcr, priv->mfunc.vhcr_dma); + priv->mfunc.vhcr = NULL; + } + if (priv->cmd.initialized && (cleanup_mask & MLX4_CMD_CLEANUP_STRUCT)) + priv->cmd.initialized = 0; +} + +/* + * Switch to using events to issue FW commands (can only be called + * after event queue for command events has been initialized). + */ +int mlx4_cmd_use_events(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + int err = 0; + + priv->cmd.context = kmalloc(priv->cmd.max_cmds * + sizeof(struct mlx4_cmd_context), + GFP_KERNEL); + if (!priv->cmd.context) + return -ENOMEM; + + down_write(&priv->cmd.switch_sem); + for (i = 0; i < priv->cmd.max_cmds; ++i) { + priv->cmd.context[i].token = i; + priv->cmd.context[i].next = i + 1; + /* To support fatal error flow, initialize all + * cmd contexts to allow simulating completions + * with complete() at any time. 
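The context array set up here doubles as an intrusive free list (each entry's next field indexes the following free entry and -1 terminates the chain), and token_mask is rounded up to one less than the next power of two so a command token can later be recovered from an EQE with a single AND. A minimal user-space sketch of those two ideas; the names are placeholders, and the real driver does this under its semaphores and locks:

#include <stdio.h>

#define MAX_CMDS 10	/* stands in for priv->cmd.max_cmds */

struct ctx {
	int token;
	int next;
};

int main(void)
{
	struct ctx ctx[MAX_CMDS];
	int free_head, mask, i;

	/* Chain every context into a free list, as mlx4_cmd_use_events() does. */
	for (i = 0; i < MAX_CMDS; ++i) {
		ctx[i].token = i;
		ctx[i].next = i + 1;
	}
	ctx[MAX_CMDS - 1].next = -1;
	free_head = 0;

	/* token_mask: the smallest (2^n - 1) covering every valid token. */
	for (mask = 1; mask < MAX_CMDS; mask <<= 1)
		; /* nothing */
	--mask;

	/* Pop one context, as the event-driven command path would. */
	i = free_head;
	free_head = ctx[i].next;
	printf("mask=0x%x, got token %d, next free %d\n",
	       mask, ctx[i].token, free_head);
	return 0;
}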
+ */ + init_completion(&priv->cmd.context[i].done); + } + + priv->cmd.context[priv->cmd.max_cmds - 1].next = -1; + priv->cmd.free_head = 0; + + sema_init(&priv->cmd.event_sem, priv->cmd.max_cmds); + + for (priv->cmd.token_mask = 1; + priv->cmd.token_mask < priv->cmd.max_cmds; + priv->cmd.token_mask <<= 1) + ; /* nothing */ + --priv->cmd.token_mask; + + down(&priv->cmd.poll_sem); + priv->cmd.use_events = 1; + up_write(&priv->cmd.switch_sem); + + return err; +} + +/* + * Switch back to polling (used when shutting down the device) + */ +void mlx4_cmd_use_polling(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + down_write(&priv->cmd.switch_sem); + priv->cmd.use_events = 0; + + for (i = 0; i < priv->cmd.max_cmds; ++i) + down(&priv->cmd.event_sem); + + kfree(priv->cmd.context); + + up(&priv->cmd.poll_sem); + up_write(&priv->cmd.switch_sem); +} + +struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *mailbox; + + mailbox = kmalloc(sizeof(*mailbox), GFP_KERNEL); + if (!mailbox) + return ERR_PTR(-ENOMEM); + + mailbox->buf = dma_pool_zalloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL, + &mailbox->dma); + if (!mailbox->buf) { + kfree(mailbox); + return ERR_PTR(-ENOMEM); + } + + return mailbox; +} +EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox); + +void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox) +{ + if (!mailbox) + return; + + dma_pool_free(mlx4_priv(dev)->cmd.pool, mailbox->buf, mailbox->dma); + kfree(mailbox); +} +EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox); + +u32 mlx4_comm_get_version(void) +{ + return ((u32) CMD_CHAN_IF_REV << 8) | (u32) CMD_CHAN_VER; +} + +static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf) +{ + if ((vf < 0) || (vf >= dev->persist->num_vfs)) { + mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n", + vf, dev->persist->num_vfs); + return -EINVAL; + } + + return vf+1; +} + +int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave) +{ + if (slave < 1 || slave > dev->persist->num_vfs) { + mlx4_err(dev, + "Bad slave number:%d (number of activated slaves: %lu)\n", + slave, dev->num_slaves); + return -EINVAL; + } + return slave - 1; +} + +void mlx4_cmd_wake_completions(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_context *context; + int i; + + spin_lock(&priv->cmd.context_lock); + if (priv->cmd.context) { + for (i = 0; i < priv->cmd.max_cmds; ++i) { + context = &priv->cmd.context[i]; + context->fw_status = CMD_STAT_INTERNAL_ERR; + context->result = + mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + complete(&context->done); + } + } + spin_unlock(&priv->cmd.context_lock); +} + +struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave) +{ + struct mlx4_active_ports actv_ports; + int vf; + + bitmap_zero(actv_ports.ports, MLX4_MAX_PORTS); + + if (slave == 0) { + bitmap_fill(actv_ports.ports, dev->caps.num_ports); + return actv_ports; + } + + vf = mlx4_get_vf_indx(dev, slave); + if (vf < 0) + return actv_ports; + + bitmap_set(actv_ports.ports, dev->dev_vfs[vf].min_port - 1, + min((int)dev->dev_vfs[mlx4_get_vf_indx(dev, slave)].n_ports, + dev->caps.num_ports)); + + return actv_ports; +} +EXPORT_SYMBOL_GPL(mlx4_get_active_ports); + +int mlx4_slave_convert_port(struct mlx4_dev *dev, int slave, int port) +{ + unsigned n; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + unsigned m = bitmap_weight(actv_ports.ports, dev->caps.num_ports); + + if (port <= 0 || port > m) + return -EINVAL; + + n = 
find_first_bit(actv_ports.ports, dev->caps.num_ports); + if (port <= n) + port = n + 1; + + return port; +} +EXPORT_SYMBOL_GPL(mlx4_slave_convert_port); + +int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + if (test_bit(port - 1, actv_ports.ports)) + return port - + find_first_bit(actv_ports.ports, dev->caps.num_ports); + + return -1; +} +EXPORT_SYMBOL_GPL(mlx4_phys_to_slave_port); + +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev, + int port) +{ + unsigned i; + struct mlx4_slaves_pport slaves_pport; + + bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX); + + if (port <= 0 || port > dev->caps.num_ports) + return slaves_pport; + + for (i = 0; i < dev->persist->num_vfs + 1; i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, i); + if (test_bit(port - 1, actv_ports.ports)) + set_bit(i, slaves_pport.slaves); + } + + return slaves_pport; +} +EXPORT_SYMBOL_GPL(mlx4_phys_to_slaves_pport); + +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv( + struct mlx4_dev *dev, + const struct mlx4_active_ports *crit_ports) +{ + unsigned i; + struct mlx4_slaves_pport slaves_pport; + + bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX); + + for (i = 0; i < dev->persist->num_vfs + 1; i++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, i); + if (bitmap_equal(crit_ports->ports, actv_ports.ports, + dev->caps.num_ports)) + set_bit(i, slaves_pport.slaves); + } + + return slaves_pport; +} +EXPORT_SYMBOL_GPL(mlx4_phys_to_slaves_pport_actv); + +static int mlx4_slaves_closest_port(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + int min_port = find_first_bit(actv_ports.ports, dev->caps.num_ports) + + 1; + int max_port = min_port + + bitmap_weight(actv_ports.ports, dev->caps.num_ports); + + if (port < min_port) + port = min_port; + else if (port >= max_port) + port = max_port - 1; + + return port; +} + +static int mlx4_set_vport_qos(struct mlx4_priv *priv, int slave, int port, + int max_tx_rate) +{ + int i; + int err; + struct mlx4_qos_manager *port_qos; + struct mlx4_dev *dev = &priv->dev; + struct mlx4_vport_qos_param vpp_qos[MLX4_NUM_UP]; + + port_qos = &priv->mfunc.master.qos_ctl[port]; + memset(vpp_qos, 0, sizeof(struct mlx4_vport_qos_param) * MLX4_NUM_UP); + + if (slave > port_qos->num_of_qos_vfs) { + mlx4_info(dev, "No available VPP resources for this VF\n"); + return -EINVAL; + } + + /* Query for default QoS values from Vport 0 is needed */ + err = mlx4_SET_VPORT_QOS_get(dev, port, 0, vpp_qos); + if (err) { + mlx4_info(dev, "Failed to query Vport 0 QoS values\n"); + return err; + } + + for (i = 0; i < MLX4_NUM_UP; i++) { + if (test_bit(i, port_qos->priority_bm) && max_tx_rate) { + vpp_qos[i].max_avg_bw = max_tx_rate; + vpp_qos[i].enable = 1; + } else { + /* if user supplied tx_rate == 0, meaning no rate limit + * configuration is required. so we are leaving the + * value of max_avg_bw as queried from Vport 0. 
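Nearly every mlx4_set_vf_*() entry point below first runs the caller's port through mlx4_slaves_closest_port(), defined just above: it derives the VF's contiguous port range from its active-port bitmap and clamps the requested port into that range. The same arithmetic reduced to a stand-alone sketch over a plain bitmask (user-space C, GCC builtins standing in for the kernel bitmap helpers):

#include <stdio.h>
#include <strings.h>

/* Clamp 'port' (1-based) into the range covered by 'active', a mask in which
 * bit (p - 1) is set when port p belongs to this VF; ffs() already returns a
 * 1-based index, so it plays the role of find_first_bit() + 1.
 */
static int closest_port(unsigned int active, int port)
{
	int min_port = ffs(active);
	int max_port = min_port + __builtin_popcount(active) - 1;

	if (port < min_port)
		port = min_port;
	else if (port > max_port)
		port = max_port;
	return port;
}

int main(void)
{
	/* A VF that owns only port 2: every request is folded onto port 2. */
	printf("%d %d\n", closest_port(0x2, 1), closest_port(0x2, 2));
	return 0;
}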
+ */ + vpp_qos[i].enable = 0; + } + } + + err = mlx4_SET_VPORT_QOS_set(dev, port, slave, vpp_qos); + if (err) { + mlx4_info(dev, "Failed to set Vport %d QoS values\n", slave); + return err; + } + + return 0; +} + +static bool mlx4_is_vf_vst_and_prio_qos(struct mlx4_dev *dev, int port, + struct mlx4_vport_state *vf_admin) +{ + struct mlx4_qos_manager *info; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!mlx4_is_master(dev) || + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP)) + return false; + + info = &priv->mfunc.master.qos_ctl[port]; + + if (vf_admin->default_vlan != MLX4_VGT && + test_bit(vf_admin->default_qos, info->priority_bm)) + return true; + + return false; +} + +static bool mlx4_valid_vf_state_change(struct mlx4_dev *dev, int port, + struct mlx4_vport_state *vf_admin, + int vlan, int qos) +{ + struct mlx4_vport_state dummy_admin = {0}; + + if (!mlx4_is_vf_vst_and_prio_qos(dev, port, vf_admin) || + !vf_admin->tx_rate) + return true; + + dummy_admin.default_qos = qos; + dummy_admin.default_vlan = vlan; + + /* VF wants to move to other VST state which is valid with current + * rate limit. Either differnt default vlan in VST or other + * supported QoS priority. Otherwise we don't allow this change when + * the TX rate is still configured. + */ + if (mlx4_is_vf_vst_and_prio_qos(dev, port, &dummy_admin)) + return true; + + mlx4_info(dev, "Cannot change VF state to %s while rate is set\n", + (vlan == MLX4_VGT) ? "VGT" : "VST"); + + if (vlan != MLX4_VGT) + mlx4_info(dev, "VST priority %d not supported for QoS\n", qos); + + mlx4_info(dev, "Please set rate to 0 prior to this VF state change\n"); + + return false; +} + +int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + int slave; + + if (!mlx4_is_master(dev)) + return -EPROTONOSUPPORT; + + if (is_multicast_ether_addr(mac)) + return -EINVAL; + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + port = mlx4_slaves_closest_port(dev, slave, port); + s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; + + if (s_info->spoofchk && is_zero_ether_addr(mac)) { + mlx4_info(dev, "MAC invalidation is not allowed when spoofchk is on\n"); + return -EPERM; + } + + s_info->mac = mlx4_mac_to_u64(mac); + mlx4_info(dev, "default mac on vf %d port %d to %llX will take effect only after vf restart\n", + vf, port, s_info->mac); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_mac); + + +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos, + __be16 proto) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *vf_admin; + struct mlx4_slave_state *slave_state; + struct mlx4_vport_oper_state *vf_oper; + int slave; + + if ((!mlx4_is_master(dev)) || + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_VLAN_CONTROL)) + return -EPROTONOSUPPORT; + + if ((vlan > 4095) || (qos > 7)) + return -EINVAL; + + if (proto == htons(ETH_P_8021AD) && + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP)) + return -EPROTONOSUPPORT; + + if (proto != htons(ETH_P_8021Q) && + proto != htons(ETH_P_8021AD)) + return -EINVAL; + + if ((proto == htons(ETH_P_8021AD)) && + ((vlan == 0) || (vlan == MLX4_VGT))) + return -EINVAL; + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + slave_state = &priv->mfunc.master.slave_state[slave]; + if ((proto == htons(ETH_P_8021AD)) && (slave_state->active) && + (!slave_state->vst_qinq_supported)) { + mlx4_err(dev, "vf %d does not support VST QinQ mode\n", vf); + 
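The checks mlx4_set_vf_vlan() performs before this point boil down to: the VLAN id must fit in 12 bits, the QoS priority in 3, the protocol must be 802.1Q or 802.1AD, and 802.1AD (VST QinQ) additionally requires device and slave support and forbids VLAN 0 or the VGT sentinel; a request of vlan 0 with qos 0 is later treated as a switch back to VGT. A condensed restatement of those argument checks as a stand-alone sketch (the qinq_capable flag folds together the device capability and the slave's vst_qinq_supported bit):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define VLAN_VGT 4095	/* stands in for MLX4_VGT */

static int check_vf_vlan_args(int vlan, int qos, bool is_8021ad, bool qinq_capable)
{
	if (vlan > 4095 || qos > 7)
		return -EINVAL;
	if (is_8021ad && (vlan == 0 || vlan == VLAN_VGT))
		return -EINVAL;
	if (is_8021ad && !qinq_capable)
		return -EPROTONOSUPPORT;
	return 0;
}

int main(void)
{
	printf("%d %d\n",
	       check_vf_vlan_args(100, 3, false, false),	/* plain 802.1Q VST: accepted */
	       check_vf_vlan_args(100, 3, true, false));	/* QinQ without support: rejected */
	return 0;
}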
return -EPROTONOSUPPORT; + } + port = mlx4_slaves_closest_port(dev, slave, port); + vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + vf_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + + if (!mlx4_valid_vf_state_change(dev, port, vf_admin, vlan, qos)) + return -EPERM; + + if ((0 == vlan) && (0 == qos)) + vf_admin->default_vlan = MLX4_VGT; + else + vf_admin->default_vlan = vlan; + vf_admin->default_qos = qos; + vf_admin->vlan_proto = proto; + + /* If rate was configured prior to VST, we saved the configured rate + * in vf_admin->rate and now, if priority supported we enforce the QoS + */ + if (mlx4_is_vf_vst_and_prio_qos(dev, port, vf_admin) && + vf_admin->tx_rate) + vf_admin->qos_vport = slave; + + /* Try to activate new vf state without restart, + * this option is not supported while moving to VST QinQ mode. + */ + if ((proto == htons(ETH_P_8021AD) && + vf_oper->state.vlan_proto != proto) || + mlx4_master_immediate_activate_vlan_qos(priv, slave, port)) + mlx4_info(dev, + "updating vf %d port %d config will take effect on next VF restart\n", + vf, port); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_vlan); + +int mlx4_set_vf_rate(struct mlx4_dev *dev, int port, int vf, int min_tx_rate, + int max_tx_rate) +{ + int err; + int slave; + struct mlx4_vport_state *vf_admin; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!mlx4_is_master(dev) || + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP)) + return -EPROTONOSUPPORT; + + if (min_tx_rate) { + mlx4_info(dev, "Minimum BW share not supported\n"); + return -EPROTONOSUPPORT; + } + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + port = mlx4_slaves_closest_port(dev, slave, port); + vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + + err = mlx4_set_vport_qos(priv, slave, port, max_tx_rate); + if (err) { + mlx4_info(dev, "vf %d failed to set rate %d\n", vf, + max_tx_rate); + return err; + } + + vf_admin->tx_rate = max_tx_rate; + /* if VF is not in supported mode (VST with supported prio), + * we do not change vport configuration for its QPs, but save + * the rate, so it will be enforced when it moves to supported + * mode next time. + */ + if (!mlx4_is_vf_vst_and_prio_qos(dev, port, vf_admin)) { + mlx4_info(dev, + "rate set for VF %d when not in valid state\n", vf); + + if (vf_admin->default_vlan != MLX4_VGT) + mlx4_info(dev, "VST priority not supported by QoS\n"); + else + mlx4_info(dev, "VF in VGT mode (needed VST)\n"); + + mlx4_info(dev, + "rate %d take affect when VF moves to valid state\n", + max_tx_rate); + return 0; + } + + /* If user sets rate 0 assigning default vport for its QPs */ + vf_admin->qos_vport = max_tx_rate ? 
slave : MLX4_VPP_DEFAULT_VPORT; + + if (priv->mfunc.master.slave_state[slave].active && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP) + mlx4_master_immediate_activate_vlan_qos(priv, slave, port); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_rate); + + /* mlx4_get_slave_default_vlan - + * return true if VST ( default vlan) + * if VST, will return vlan & qos (if not NULL) + */ +bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, + u16 *vlan, u8 *qos) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_priv *priv; + + priv = mlx4_priv(dev); + port = mlx4_slaves_closest_port(dev, slave, port); + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + + if (MLX4_VGT != vp_oper->state.default_vlan) { + if (vlan) + *vlan = vp_oper->state.default_vlan; + if (qos) + *qos = vp_oper->state.default_qos; + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(mlx4_get_slave_default_vlan); + +int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + int slave; + u8 mac[ETH_ALEN]; + + if ((!mlx4_is_master(dev)) || + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FSM)) + return -EPROTONOSUPPORT; + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + port = mlx4_slaves_closest_port(dev, slave, port); + s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; + + mlx4_u64_to_mac(mac, s_info->mac); + if (setting && !is_valid_ether_addr(mac)) { + mlx4_info(dev, "Illegal MAC with spoofchk\n"); + return -EPERM; + } + + s_info->spoofchk = setting; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_spoofchk); + +int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_info *ivf) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + int slave; + + if (!mlx4_is_master(dev)) + return -EPROTONOSUPPORT; + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; + ivf->vf = vf; + + /* need to convert it to a func */ + ivf->mac[0] = ((s_info->mac >> (5*8)) & 0xff); + ivf->mac[1] = ((s_info->mac >> (4*8)) & 0xff); + ivf->mac[2] = ((s_info->mac >> (3*8)) & 0xff); + ivf->mac[3] = ((s_info->mac >> (2*8)) & 0xff); + ivf->mac[4] = ((s_info->mac >> (1*8)) & 0xff); + ivf->mac[5] = ((s_info->mac) & 0xff); + + ivf->vlan = s_info->default_vlan; + ivf->qos = s_info->default_qos; + ivf->vlan_proto = s_info->vlan_proto; + + if (mlx4_is_vf_vst_and_prio_qos(dev, port, s_info)) + ivf->max_tx_rate = s_info->tx_rate; + else + ivf->max_tx_rate = 0; + + ivf->min_tx_rate = 0; + ivf->spoofchk = s_info->spoofchk; + ivf->linkstate = s_info->link_state; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_get_vf_config); + +int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vport_state *s_info; + int slave; + u8 link_stat_event; + + slave = mlx4_get_slave_indx(dev, vf); + if (slave < 0) + return -EINVAL; + + port = mlx4_slaves_closest_port(dev, slave, port); + switch (link_state) { + case IFLA_VF_LINK_STATE_AUTO: + /* get current link state */ + if (!priv->sense.do_sense_port[port]) + link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE; + else + link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN; + break; + + case IFLA_VF_LINK_STATE_ENABLE: + link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE; + break; + + case IFLA_VF_LINK_STATE_DISABLE: + link_stat_event = 
MLX4_PORT_CHANGE_SUBTYPE_DOWN; + break; + + default: + mlx4_warn(dev, "unknown value for link_state %02x on slave %d port %d\n", + link_state, slave, port); + return -EINVAL; + }; + s_info = &priv->mfunc.master.vf_admin[slave].vport[port]; + s_info->link_state = link_state; + + /* send event */ + mlx4_gen_port_state_change_eqe(dev, slave, port, link_stat_event); + + if (mlx4_master_immediate_activate_vlan_qos(priv, slave, port)) + mlx4_dbg(dev, + "updating vf %d port %d no link state HW enforcement\n", + vf, port); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state); + +int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index, + struct mlx4_counter *counter_stats, int reset) +{ + struct mlx4_cmd_mailbox *mailbox = NULL; + struct mlx4_counter *tmp_counter; + int err; + u32 if_stat_in_mod; + + if (!counter_stats) + return -EINVAL; + + if (counter_index == MLX4_SINK_COUNTER_INDEX(dev)) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memset(mailbox->buf, 0, sizeof(struct mlx4_counter)); + if_stat_in_mod = counter_index; + if (reset) + if_stat_in_mod |= MLX4_QUERY_IF_STAT_RESET; + err = mlx4_cmd_box(dev, 0, mailbox->dma, + if_stat_in_mod, 0, + MLX4_CMD_QUERY_IF_STAT, + MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) { + mlx4_dbg(dev, "%s: failed to read statistics for counter index %d\n", + __func__, counter_index); + goto if_stat_out; + } + tmp_counter = (struct mlx4_counter *)mailbox->buf; + counter_stats->counter_mode = tmp_counter->counter_mode; + if (counter_stats->counter_mode == 0) { + counter_stats->rx_frames = + cpu_to_be64(be64_to_cpu(counter_stats->rx_frames) + + be64_to_cpu(tmp_counter->rx_frames)); + counter_stats->tx_frames = + cpu_to_be64(be64_to_cpu(counter_stats->tx_frames) + + be64_to_cpu(tmp_counter->tx_frames)); + counter_stats->rx_bytes = + cpu_to_be64(be64_to_cpu(counter_stats->rx_bytes) + + be64_to_cpu(tmp_counter->rx_bytes)); + counter_stats->tx_bytes = + cpu_to_be64(be64_to_cpu(counter_stats->tx_bytes) + + be64_to_cpu(tmp_counter->tx_bytes)); + } + +if_stat_out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_get_counter_stats); + +int mlx4_get_vf_stats(struct mlx4_dev *dev, int port, int vf_idx, + struct ifla_vf_stats *vf_stats) +{ + struct mlx4_counter tmp_vf_stats; + int slave; + int err = 0; + + if (!vf_stats) + return -EINVAL; + + if (!mlx4_is_master(dev)) + return -EPROTONOSUPPORT; + + slave = mlx4_get_slave_indx(dev, vf_idx); + if (slave < 0) + return -EINVAL; + + port = mlx4_slaves_closest_port(dev, slave, port); + err = mlx4_calc_vf_counters(dev, slave, port, &tmp_vf_stats); + if (!err && tmp_vf_stats.counter_mode == 0) { + vf_stats->rx_packets = be64_to_cpu(tmp_vf_stats.rx_frames); + vf_stats->tx_packets = be64_to_cpu(tmp_vf_stats.tx_frames); + vf_stats->rx_bytes = be64_to_cpu(tmp_vf_stats.rx_bytes); + vf_stats->tx_bytes = be64_to_cpu(tmp_vf_stats.tx_bytes); + } + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_get_vf_stats); + +int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS) + return 0; + + return priv->mfunc.master.vf_oper[slave].smi_enabled[port] == + MLX4_VF_SMI_ENABLED; +} +EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled); + +int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave == mlx4_master_func_num(dev)) + return 1; + + if 
(slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS) + return 0; + + return priv->mfunc.master.vf_admin[slave].enable_smi[port] == + MLX4_VF_SMI_ENABLED; +} +EXPORT_SYMBOL_GPL(mlx4_vf_get_enable_smi_admin); + +int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, + int enabled) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_active_ports actv_ports = mlx4_get_active_ports( + &priv->dev, slave); + int min_port = find_first_bit(actv_ports.ports, + priv->dev.caps.num_ports) + 1; + int max_port = min_port - 1 + + bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports); + + if (slave == mlx4_master_func_num(dev)) + return 0; + + if (slave < 1 || slave >= dev->num_slaves || + port < 1 || port > MLX4_MAX_PORTS || + enabled < 0 || enabled > 1) + return -EINVAL; + + if (min_port == max_port && dev->caps.num_ports > 1) { + mlx4_info(dev, "SMI access disallowed for single ported VFs\n"); + return -EPROTONOSUPPORT; + } + + priv->mfunc.master.vf_admin[slave].enable_smi[port] = enabled; + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_vf_set_enable_smi_admin); diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c new file mode 100644 index 0000000..d8e9a32 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
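The cq.c hunks that follow build the CQ context largely by packing log-sizes and flag bits into big-endian words: mlx4_cq_alloc() stores ilog2(nent) in the top byte of logsize_usrpage next to the UAR index, sets bit 18 for a collapsed CQ and bit 19 when timestamping is enabled, and encodes the MTT page size as page_shift minus the 4 KB ICM page shift. A compact host-order sketch of that packing (names hypothetical; the driver applies cpu_to_be32() when it writes the mailbox):

#include <stdint.h>
#include <stdio.h>

/* Integer log2 of a power-of-two entry count, like the kernel's ilog2(). */
static unsigned int ilog2_u32(uint32_t v)
{
	unsigned int l = 0;

	while (v >>= 1)
		l++;
	return l;
}

static uint32_t pack_logsize_usrpage(uint32_t nent, uint32_t uar_index)
{
	return (ilog2_u32(nent) << 24) | uar_index;
}

static uint32_t pack_cq_flags(int collapsed, int timestamp_en)
{
	uint32_t flags = (uint32_t)!!collapsed << 18;	/* MLX4_CQ_FLAG_CC position */

	if (timestamp_en)
		flags |= 1u << 19;
	return flags;
}

int main(void)
{
	printf("0x%08x 0x%08x\n",
	       pack_logsize_usrpage(1024, 5), pack_cq_flags(1, 1));
	return 0;
}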
+ */ + +#include +#include + +#include +#include + +#include "mlx4.h" +#include "icm.h" + +#define MLX4_CQ_STATUS_OK ( 0 << 28) +#define MLX4_CQ_STATUS_OVERFLOW ( 9 << 28) +#define MLX4_CQ_STATUS_WRITE_FAIL (10 << 28) +#define MLX4_CQ_FLAG_CC ( 1 << 18) +#define MLX4_CQ_FLAG_OI ( 1 << 17) +#define MLX4_CQ_STATE_ARMED ( 9 << 8) +#define MLX4_CQ_STATE_ARMED_SOL ( 6 << 8) +#define MLX4_EQ_STATE_FIRED (10 << 8) + +#define TASKLET_MAX_TIME 2 +#define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME) + +void mlx4_cq_tasklet_cb(unsigned long data) +{ + unsigned long flags; + unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES; + struct mlx4_eq_tasklet *ctx = (struct mlx4_eq_tasklet *)data; + struct mlx4_cq *mcq, *temp; + + spin_lock_irqsave(&ctx->lock, flags); + list_splice_tail_init(&ctx->list, &ctx->process_list); + spin_unlock_irqrestore(&ctx->lock, flags); + + list_for_each_entry_safe(mcq, temp, &ctx->process_list, tasklet_ctx.list) { + list_del_init(&mcq->tasklet_ctx.list); + mcq->tasklet_ctx.comp(mcq); + if (refcount_dec_and_test(&mcq->refcount)) + complete(&mcq->free); + if (time_after(jiffies, end)) + break; + } + + if (!list_empty(&ctx->process_list)) + tasklet_schedule(&ctx->task); +} + +static void mlx4_add_cq_to_tasklet(struct mlx4_cq *cq) +{ + struct mlx4_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv; + unsigned long flags; + bool kick; + + spin_lock_irqsave(&tasklet_ctx->lock, flags); + /* When migrating CQs between EQs will be implemented, please note + * that you need to sync this point. It is possible that + * while migrating a CQ, completions on the old EQs could + * still arrive. + */ + if (list_empty_careful(&cq->tasklet_ctx.list)) { + refcount_inc(&cq->refcount); + kick = list_empty(&tasklet_ctx->list); + list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list); + if (kick) + tasklet_schedule(&tasklet_ctx->task); + } + spin_unlock_irqrestore(&tasklet_ctx->lock, flags); +} + +void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn) +{ + struct mlx4_cq *cq; + + rcu_read_lock(); + cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree, + cqn & (dev->caps.num_cqs - 1)); + rcu_read_unlock(); + + if (!cq) { + mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn); + return; + } + + /* Acessing the CQ outside of rcu_read_lock is safe, because + * the CQ is freed only after interrupt handling is completed. + */ + ++cq->arm_sn; + + cq->comp(cq); +} + +void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type) +{ + struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table; + struct mlx4_cq *cq; + + rcu_read_lock(); + cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1)); + rcu_read_unlock(); + + if (!cq) { + mlx4_dbg(dev, "Async event for bogus CQ %08x\n", cqn); + return; + } + + /* Acessing the CQ outside of rcu_read_lock is safe, because + * the CQ is freed only after interrupt handling is completed. + */ + cq->event(cq, event_type); +} + +static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int cq_num) +{ + return mlx4_cmd(dev, mailbox->dma, cq_num, 0, + MLX4_CMD_SW2HW_CQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); +} + +static int mlx4_MODIFY_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int cq_num, u32 opmod) +{ + return mlx4_cmd(dev, mailbox->dma, cq_num, opmod, MLX4_CMD_MODIFY_CQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int cq_num) +{ + return mlx4_cmd_box(dev, 0, mailbox ? 
mailbox->dma : 0, + cq_num, mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq, + u16 count, u16 period) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_cq_context *cq_context; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cq_context = mailbox->buf; + cq_context->cq_max_count = cpu_to_be16(count); + cq_context->cq_period = cpu_to_be16(period); + + err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 1); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_cq_modify); + +int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq, + int entries, struct mlx4_mtt *mtt) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_cq_context *cq_context; + u64 mtt_addr; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cq_context = mailbox->buf; + cq_context->logsize_usrpage = cpu_to_be32(ilog2(entries) << 24); + cq_context->log_page_size = mtt->page_shift - 12; + mtt_addr = mlx4_mtt_addr(dev, mtt); + cq_context->mtt_base_addr_h = mtt_addr >> 32; + cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff); + + err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 0); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_cq_resize); + +int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + int err; + + *cqn = mlx4_bitmap_alloc(&cq_table->bitmap); + if (*cqn == -1) + return -ENOMEM; + + err = mlx4_table_get(dev, &cq_table->table, *cqn); + if (err) + goto err_out; + + err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn); + if (err) + goto err_put; + return 0; + +err_put: + mlx4_table_put(dev, &cq_table->table, *cqn); + +err_out: + mlx4_bitmap_free(&cq_table->bitmap, *cqn, MLX4_NO_RR); + return err; +} + +static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn, u8 usage) +{ + u32 in_modifier = RES_CQ | (((u32)usage & 3) << 30); + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, in_modifier, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + return err; + else { + *cqn = get_param_l(&out_param); + return 0; + } + } + return __mlx4_cq_alloc_icm(dev, cqn); +} + +void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + + mlx4_table_put(dev, &cq_table->cmpt_table, cqn); + mlx4_table_put(dev, &cq_table->table, cqn); + mlx4_bitmap_free(&cq_table->bitmap, cqn, MLX4_NO_RR); +} + +static void mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn) +{ + u64 in_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, cqn); + err = mlx4_cmd(dev, in_param, RES_CQ, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed freeing cq:%d\n", cqn); + } else + __mlx4_cq_free_icm(dev, cqn); +} + +int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, + struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec, + struct mlx4_cq *cq, unsigned vector, int collapsed, + int timestamp_en) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_cq_context *cq_context; + u64 mtt_addr; + int err; + + if (vector >= 
dev->caps.num_comp_vectors) + return -EINVAL; + + cq->vector = vector; + + err = mlx4_cq_alloc_icm(dev, &cq->cqn, cq->usage); + if (err) + return err; + + spin_lock(&cq_table->lock); + err = radix_tree_insert(&cq_table->tree, cq->cqn, cq); + spin_unlock(&cq_table->lock); + if (err) + goto err_icm; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_radix; + } + + cq_context = mailbox->buf; + cq_context->flags = cpu_to_be32(!!collapsed << 18); + if (timestamp_en) + cq_context->flags |= cpu_to_be32(1 << 19); + + cq_context->logsize_usrpage = + cpu_to_be32((ilog2(nent) << 24) | + mlx4_to_hw_uar_index(dev, uar->index)); + cq_context->comp_eqn = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn; + cq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; + + mtt_addr = mlx4_mtt_addr(dev, mtt); + cq_context->mtt_base_addr_h = mtt_addr >> 32; + cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff); + cq_context->db_rec_addr = cpu_to_be64(db_rec); + + err = mlx4_SW2HW_CQ(dev, mailbox, cq->cqn); + mlx4_free_cmd_mailbox(dev, mailbox); + if (err) + goto err_radix; + + cq->cons_index = 0; + cq->arm_sn = 1; + cq->uar = uar; + refcount_set(&cq->refcount, 1); + init_completion(&cq->free); + cq->comp = mlx4_add_cq_to_tasklet; + cq->tasklet_ctx.priv = + &priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].tasklet_ctx; + INIT_LIST_HEAD(&cq->tasklet_ctx.list); + + + cq->irq = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].irq; + return 0; + +err_radix: + spin_lock(&cq_table->lock); + radix_tree_delete(&cq_table->tree, cq->cqn); + spin_unlock(&cq_table->lock); + +err_icm: + mlx4_cq_free_icm(dev, cq->cqn); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_cq_alloc); + +void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + int err; + + err = mlx4_HW2SW_CQ(dev, NULL, cq->cqn); + if (err) + mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn); + + spin_lock(&cq_table->lock); + radix_tree_delete(&cq_table->tree, cq->cqn); + spin_unlock(&cq_table->lock); + + synchronize_irq(priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq->vector)].irq); + if (priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq->vector)].irq != + priv->eq_table.eq[MLX4_EQ_ASYNC].irq) + synchronize_irq(priv->eq_table.eq[MLX4_EQ_ASYNC].irq); + + if (refcount_dec_and_test(&cq->refcount)) + complete(&cq->free); + wait_for_completion(&cq->free); + + mlx4_cq_free_icm(dev, cq->cqn); +} +EXPORT_SYMBOL_GPL(mlx4_cq_free); + +int mlx4_init_cq_table(struct mlx4_dev *dev) +{ + struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table; + int err; + + spin_lock_init(&cq_table->lock); + INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC); + if (mlx4_is_slave(dev)) + return 0; + + err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs, + dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0); + if (err) + return err; + + return 0; +} + +void mlx4_cleanup_cq_table(struct mlx4_dev *dev) +{ + if (mlx4_is_slave(dev)) + return; + /* Nothing to do to clean up radix_tree */ + mlx4_bitmap_cleanup(&mlx4_priv(dev)->cq_table.bitmap); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c new file mode 100644 index 0000000..0247885 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "mlx4_en.h" + +/* mlx4_en_read_clock - read raw cycle counter (to be used by time counter) + */ +static u64 mlx4_en_read_clock(const struct cyclecounter *tc) +{ + struct mlx4_en_dev *mdev = + container_of(tc, struct mlx4_en_dev, cycles); + struct mlx4_dev *dev = mdev->dev; + + return mlx4_read_clock(dev) & tc->mask; +} + +u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe) +{ + u64 hi, lo; + struct mlx4_ts_cqe *ts_cqe = (struct mlx4_ts_cqe *)cqe; + + lo = (u64)be16_to_cpu(ts_cqe->timestamp_lo); + hi = ((u64)be32_to_cpu(ts_cqe->timestamp_hi) + !lo) << 16; + + return hi | lo; +} + +void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev, + struct skb_shared_hwtstamps *hwts, + u64 timestamp) +{ + unsigned int seq; + u64 nsec; + + do { + seq = read_seqbegin(&mdev->clock_lock); + nsec = timecounter_cyc2time(&mdev->clock, timestamp); + } while (read_seqretry(&mdev->clock_lock, seq)); + + memset(hwts, 0, sizeof(struct skb_shared_hwtstamps)); + hwts->hwtstamp = ns_to_ktime(nsec); +} + +/** + * mlx4_en_remove_timestamp - disable PTP device + * @mdev: board private structure + * + * Stop the PTP support. + **/ +void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev) +{ + if (mdev->ptp_clock) { + ptp_clock_unregister(mdev->ptp_clock); + mdev->ptp_clock = NULL; + mlx4_info(mdev, "removed PHC\n"); + } +} + +#define MLX4_EN_WRAP_AROUND_SEC 10UL +/* By scheduling the overflow check every 5 seconds, we have a reasonably + * good chance we wont miss a wrap around. + * TOTO: Use a timer instead of a work queue to increase the guarantee. 
+ */ +#define MLX4_EN_OVERFLOW_PERIOD (MLX4_EN_WRAP_AROUND_SEC * HZ / 2) + +void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev) +{ + bool timeout = time_is_before_jiffies(mdev->last_overflow_check + + MLX4_EN_OVERFLOW_PERIOD); + unsigned long flags; + + if (timeout) { + write_seqlock_irqsave(&mdev->clock_lock, flags); + timecounter_read(&mdev->clock); + write_sequnlock_irqrestore(&mdev->clock_lock, flags); + mdev->last_overflow_check = jiffies; + } +} + +/** + * mlx4_en_phc_adjfreq - adjust the frequency of the hardware clock + * @ptp: ptp clock structure + * @delta: Desired frequency change in parts per billion + * + * Adjust the frequency of the PHC cycle counter by the indicated delta from + * the base frequency. + **/ +static int mlx4_en_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta) +{ + u64 adj; + u32 diff, mult; + int neg_adj = 0; + unsigned long flags; + struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev, + ptp_clock_info); + + if (delta < 0) { + neg_adj = 1; + delta = -delta; + } + mult = mdev->nominal_c_mult; + adj = mult; + adj *= delta; + diff = div_u64(adj, 1000000000ULL); + + write_seqlock_irqsave(&mdev->clock_lock, flags); + timecounter_read(&mdev->clock); + mdev->cycles.mult = neg_adj ? mult - diff : mult + diff; + write_sequnlock_irqrestore(&mdev->clock_lock, flags); + + return 0; +} + +/** + * mlx4_en_phc_adjtime - Shift the time of the hardware clock + * @ptp: ptp clock structure + * @delta: Desired change in nanoseconds + * + * Adjust the timer by resetting the timecounter structure. + **/ +static int mlx4_en_phc_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev, + ptp_clock_info); + unsigned long flags; + + write_seqlock_irqsave(&mdev->clock_lock, flags); + timecounter_adjtime(&mdev->clock, delta); + write_sequnlock_irqrestore(&mdev->clock_lock, flags); + + return 0; +} + +/** + * mlx4_en_phc_gettime - Reads the current time from the hardware clock + * @ptp: ptp clock structure + * @ts: timespec structure to hold the current time value + * + * Read the timecounter and return the correct value in ns after converting + * it into a struct timespec. + **/ +static int mlx4_en_phc_gettime(struct ptp_clock_info *ptp, + struct timespec64 *ts) +{ + struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev, + ptp_clock_info); + unsigned long flags; + u64 ns; + + write_seqlock_irqsave(&mdev->clock_lock, flags); + ns = timecounter_read(&mdev->clock); + write_sequnlock_irqrestore(&mdev->clock_lock, flags); + + *ts = ns_to_timespec64(ns); + + return 0; +} + +/** + * mlx4_en_phc_settime - Set the current time on the hardware clock + * @ptp: ptp clock structure + * @ts: timespec containing the new time for the cycle counter + * + * Reset the timecounter to use a new base value instead of the kernel + * wall timer value. 
+ **/ +static int mlx4_en_phc_settime(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev, + ptp_clock_info); + u64 ns = timespec64_to_ns(ts); + unsigned long flags; + + /* reset the timecounter */ + write_seqlock_irqsave(&mdev->clock_lock, flags); + timecounter_init(&mdev->clock, &mdev->cycles, ns); + write_sequnlock_irqrestore(&mdev->clock_lock, flags); + + return 0; +} + +/** + * mlx4_en_phc_enable - enable or disable an ancillary feature + * @ptp: ptp clock structure + * @request: Desired resource to enable or disable + * @on: Caller passes one to enable or zero to disable + * + * Enable (or disable) ancillary features of the PHC subsystem. + * Currently, no ancillary features are supported. + **/ +static int mlx4_en_phc_enable(struct ptp_clock_info __always_unused *ptp, + struct ptp_clock_request __always_unused *request, + int __always_unused on) +{ + return -EOPNOTSUPP; +} + +static const struct ptp_clock_info mlx4_en_ptp_clock_info = { + .owner = THIS_MODULE, + .max_adj = 100000000, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .n_pins = 0, + .pps = 0, + .adjfreq = mlx4_en_phc_adjfreq, + .adjtime = mlx4_en_phc_adjtime, + .gettime64 = mlx4_en_phc_gettime, + .settime64 = mlx4_en_phc_settime, + .enable = mlx4_en_phc_enable, +}; + + +/* This function calculates the max shift that enables the user range + * of MLX4_EN_WRAP_AROUND_SEC values in the cycles register. + */ +static u32 freq_to_shift(u16 freq) +{ + u32 freq_khz = freq * 1000; + u64 max_val_cycles = freq_khz * 1000 * MLX4_EN_WRAP_AROUND_SEC; + u64 max_val_cycles_rounded = 1ULL << fls64(max_val_cycles - 1); + /* calculate max possible multiplier in order to fit in 64bit */ + u64 max_mul = div64_u64(ULLONG_MAX, max_val_cycles_rounded); + + /* This comes from the reverse of clocksource_khz2mult */ + return ilog2(div_u64(max_mul * freq_khz, 1000000)); +} + +void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) +{ + struct mlx4_dev *dev = mdev->dev; + unsigned long flags; + + /* mlx4_en_init_timestamp is called for each netdev. + * mdev->ptp_clock is common for all ports, skip initialization if + * was done for other port. + */ + if (mdev->ptp_clock) + return; + + seqlock_init(&mdev->clock_lock); + + memset(&mdev->cycles, 0, sizeof(mdev->cycles)); + mdev->cycles.read = mlx4_en_read_clock; + mdev->cycles.mask = CLOCKSOURCE_MASK(48); + mdev->cycles.shift = freq_to_shift(dev->caps.hca_core_clock); + mdev->cycles.mult = + clocksource_khz2mult(1000 * dev->caps.hca_core_clock, mdev->cycles.shift); + mdev->nominal_c_mult = mdev->cycles.mult; + + write_seqlock_irqsave(&mdev->clock_lock, flags); + timecounter_init(&mdev->clock, &mdev->cycles, + ktime_to_ns(ktime_get_real())); + write_sequnlock_irqrestore(&mdev->clock_lock, flags); + + /* Configure the PHC */ + mdev->ptp_clock_info = mlx4_en_ptp_clock_info; + snprintf(mdev->ptp_clock_info.name, 16, "mlx4 ptp"); + + mdev->ptp_clock = ptp_clock_register(&mdev->ptp_clock_info, + &mdev->pdev->dev); + if (IS_ERR(mdev->ptp_clock)) { + mdev->ptp_clock = NULL; + mlx4_err(mdev, "ptp_clock_register failed\n"); + } else if (mdev->ptp_clock) { + mlx4_info(mdev, "registered PHC clock\n"); + } + +} diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c new file mode 100644 index 0000000..1e487ac --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include + +#include "mlx4_en.h" + +static void mlx4_en_cq_event(struct mlx4_cq *cq, enum mlx4_event event) +{ + return; +} + + +int mlx4_en_create_cq(struct mlx4_en_priv *priv, + struct mlx4_en_cq **pcq, + int entries, int ring, enum cq_type mode, + int node) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_cq *cq; + int err; + + cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, node); + if (!cq) { + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + en_err(priv, "Failed to allocate CQ structure\n"); + return -ENOMEM; + } + } + + cq->size = entries; + cq->buf_size = cq->size * mdev->dev->caps.cqe_size; + + cq->ring = ring; + cq->type = mode; + cq->vector = mdev->dev->caps.num_comp_vectors; + + /* Allocate HW buffers on provided NUMA node. + * dev->numa_node is used in mtt range allocation flow. 
+ */ + set_dev_node(&mdev->dev->persist->pdev->dev, node); + err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres, + cq->buf_size); + set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node); + if (err) + goto err_cq; + + cq->buf = (struct mlx4_cqe *)cq->wqres.buf.direct.buf; + *pcq = cq; + + return 0; + +err_cq: + kfree(cq); + *pcq = NULL; + return err; +} + +int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, + int cq_idx) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + int timestamp_en = 0; + bool assigned_eq = false; + + cq->dev = mdev->pndev[priv->port]; + cq->mcq.set_ci_db = cq->wqres.db.db; + cq->mcq.arm_db = cq->wqres.db.db + 1; + *cq->mcq.set_ci_db = 0; + *cq->mcq.arm_db = 0; + memset(cq->buf, 0, cq->buf_size); + + if (cq->type == RX) { + if (!mlx4_is_eq_vector_valid(mdev->dev, priv->port, + cq->vector)) { + cq->vector = cpumask_first(priv->rx_ring[cq->ring]->affinity_mask); + + err = mlx4_assign_eq(mdev->dev, priv->port, + &cq->vector); + if (err) { + mlx4_err(mdev, "Failed assigning an EQ to CQ vector %d\n", + cq->vector); + goto free_eq; + } + + assigned_eq = true; + } + + cq->irq_desc = + irq_to_desc(mlx4_eq_get_irq(mdev->dev, + cq->vector)); + } else { + /* For TX we use the same irq per + ring we assigned for the RX */ + struct mlx4_en_cq *rx_cq; + + cq_idx = cq_idx % priv->rx_ring_num; + rx_cq = priv->rx_cq[cq_idx]; + cq->vector = rx_cq->vector; + } + + if (cq->type == RX) + cq->size = priv->rx_ring[cq->ring]->actual_size; + + if ((cq->type != RX && priv->hwtstamp_config.tx_type) || + (cq->type == RX && priv->hwtstamp_config.rx_filter)) + timestamp_en = 1; + + cq->mcq.usage = MLX4_RES_USAGE_DRIVER; + err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, + &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq, + cq->vector, 0, timestamp_en); + if (err) + goto free_eq; + + cq->mcq.event = mlx4_en_cq_event; + + switch (cq->type) { + case TX: + cq->mcq.comp = mlx4_en_tx_irq; + netif_tx_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq, + NAPI_POLL_WEIGHT); + napi_enable(&cq->napi); + break; + case RX: + cq->mcq.comp = mlx4_en_rx_irq; + netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64); + napi_enable(&cq->napi); + break; + case TX_XDP: + /* nothing regarding napi, it's shared with rx ring */ + cq->xdp_busy = false; + break; + } + + return 0; + +free_eq: + if (assigned_eq) + mlx4_release_eq(mdev->dev, cq->vector); + cq->vector = mdev->dev->caps.num_comp_vectors; + return err; +} + +void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_cq *cq = *pcq; + + mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size); + if (mlx4_is_eq_vector_valid(mdev->dev, priv->port, cq->vector) && + cq->type == RX) + mlx4_release_eq(priv->mdev->dev, cq->vector); + cq->vector = 0; + cq->buf_size = 0; + cq->buf = NULL; + kfree(cq); + *pcq = NULL; +} + +void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) +{ + if (cq->type != TX_XDP) { + napi_disable(&cq->napi); + netif_napi_del(&cq->napi); + } + + mlx4_cq_free(priv->mdev->dev, &cq->mcq); +} + +/* Set rx cq moderation parameters */ +int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) +{ + return mlx4_cq_modify(priv->mdev->dev, &cq->mcq, + cq->moder_cnt, cq->moder_time); +} + +void mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) +{ + mlx4_cq_arm(&cq->mcq, MLX4_CQ_DB_REQ_NOT, priv->mdev->uar_map, + &priv->mdev->uar_lock); +} + + diff --git 
a/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c new file mode 100644 index 0000000..752a724 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c @@ -0,0 +1,753 @@ +/* + * Copyright (c) 2011 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "mlx4_en.h" +#include "fw_qos.h" + +enum { + MLX4_CEE_STATE_DOWN = 0, + MLX4_CEE_STATE_UP = 1, +}; + +/* Definitions for QCN + */ + +struct mlx4_congestion_control_mb_prio_802_1_qau_params { + __be32 modify_enable_high; + __be32 modify_enable_low; + __be32 reserved1; + __be32 extended_enable; + __be32 rppp_max_rps; + __be32 rpg_time_reset; + __be32 rpg_byte_reset; + __be32 rpg_threshold; + __be32 rpg_max_rate; + __be32 rpg_ai_rate; + __be32 rpg_hai_rate; + __be32 rpg_gd; + __be32 rpg_min_dec_fac; + __be32 rpg_min_rate; + __be32 max_time_rise; + __be32 max_byte_rise; + __be32 max_qdelta; + __be32 min_qoffset; + __be32 gd_coefficient; + __be32 reserved2[5]; + __be32 cp_sample_base; + __be32 reserved3[39]; +}; + +struct mlx4_congestion_control_mb_prio_802_1_qau_statistics { + __be64 rppp_rp_centiseconds; + __be32 reserved1; + __be32 ignored_cnm; + __be32 rppp_created_rps; + __be32 estimated_total_rate; + __be32 max_active_rate_limiter_index; + __be32 dropped_cnms_busy_fw; + __be32 reserved2; + __be32 cnms_handled_successfully; + __be32 min_total_limiters_rate; + __be32 max_total_limiters_rate; + __be32 reserved3[4]; +}; + +static u8 mlx4_en_dcbnl_getcap(struct net_device *dev, int capid, u8 *cap) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + switch (capid) { + case DCB_CAP_ATTR_PFC: + *cap = true; + break; + case DCB_CAP_ATTR_DCBX: + *cap = priv->dcbx_cap; + break; + case DCB_CAP_ATTR_PFC_TCS: + *cap = 1 << mlx4_max_tc(priv->mdev->dev); + break; + default: + *cap = false; + break; + } + + return 0; +} + +static u8 mlx4_en_dcbnl_getpfcstate(struct net_device *netdev) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + + return priv->cee_config.pfc_state; +} + +static void mlx4_en_dcbnl_setpfcstate(struct net_device *netdev, u8 state) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + + priv->cee_config.pfc_state = state; +} + +static 
void mlx4_en_dcbnl_get_pfc_cfg(struct net_device *netdev, int priority, + u8 *setting) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + + *setting = priv->cee_config.dcb_pfc[priority]; +} + +static void mlx4_en_dcbnl_set_pfc_cfg(struct net_device *netdev, int priority, + u8 setting) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + + priv->cee_config.dcb_pfc[priority] = setting; + priv->cee_config.pfc_state = true; +} + +static int mlx4_en_dcbnl_getnumtcs(struct net_device *netdev, int tcid, u8 *num) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + + if (!(priv->flags & MLX4_EN_FLAG_DCB_ENABLED)) + return -EINVAL; + + if (tcid == DCB_NUMTCS_ATTR_PFC) + *num = mlx4_max_tc(priv->mdev->dev); + else + *num = 0; + + return 0; +} + +static u8 mlx4_en_dcbnl_set_all(struct net_device *netdev) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + struct mlx4_en_port_profile *prof = priv->prof; + struct mlx4_en_dev *mdev = priv->mdev; + u8 tx_pause, tx_ppp, rx_pause, rx_ppp; + + if (!(priv->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return 1; + + if (priv->cee_config.pfc_state) { + int tc; + rx_ppp = prof->rx_ppp; + tx_ppp = prof->tx_ppp; + + for (tc = 0; tc < CEE_DCBX_MAX_PRIO; tc++) { + u8 tc_mask = 1 << tc; + + switch (priv->cee_config.dcb_pfc[tc]) { + case pfc_disabled: + tx_ppp &= ~tc_mask; + rx_ppp &= ~tc_mask; + break; + case pfc_enabled_full: + tx_ppp |= tc_mask; + rx_ppp |= tc_mask; + break; + case pfc_enabled_tx: + tx_ppp |= tc_mask; + rx_ppp &= ~tc_mask; + break; + case pfc_enabled_rx: + tx_ppp &= ~tc_mask; + rx_ppp |= tc_mask; + break; + default: + break; + } + } + rx_pause = !!(rx_ppp || tx_ppp) ? 0 : prof->rx_pause; + tx_pause = !!(rx_ppp || tx_ppp) ? 0 : prof->tx_pause; + } else { + rx_ppp = 0; + tx_ppp = 0; + rx_pause = prof->rx_pause; + tx_pause = prof->tx_pause; + } + + if (mlx4_SET_PORT_general(mdev->dev, priv->port, + priv->rx_skb_size + ETH_FCS_LEN, + tx_pause, tx_ppp, rx_pause, rx_ppp)) { + en_err(priv, "Failed setting pause params\n"); + return 1; + } + + prof->tx_ppp = tx_ppp; + prof->rx_ppp = rx_ppp; + prof->tx_pause = tx_pause; + prof->rx_pause = rx_pause; + + return 0; +} + +static u8 mlx4_en_dcbnl_get_state(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (priv->flags & MLX4_EN_FLAG_DCB_ENABLED) + return MLX4_CEE_STATE_UP; + + return MLX4_CEE_STATE_DOWN; +} + +static u8 mlx4_en_dcbnl_set_state(struct net_device *dev, u8 state) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int num_tcs = 0; + + if (!(priv->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return 1; + + if (!!(state) == !!(priv->flags & MLX4_EN_FLAG_DCB_ENABLED)) + return 0; + + if (state) { + priv->flags |= MLX4_EN_FLAG_DCB_ENABLED; + num_tcs = IEEE_8021QAZ_MAX_TCS; + } else { + priv->flags &= ~MLX4_EN_FLAG_DCB_ENABLED; + } + + if (mlx4_en_alloc_tx_queue_per_tc(dev, num_tcs)) + return 1; + + return 0; +} + +/* On success returns a non-zero 802.1p user priority bitmap + * otherwise returns 0 as the invalid user priority bitmap to + * indicate an error. 
+ */ +static int mlx4_en_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + struct dcb_app app = { + .selector = idtype, + .protocol = id, + }; + if (!(priv->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return 0; + + return dcb_getapp(netdev, &app); +} + +static int mlx4_en_dcbnl_setapp(struct net_device *netdev, u8 idtype, + u16 id, u8 up) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + struct dcb_app app; + + if (!(priv->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) + return -EINVAL; + + memset(&app, 0, sizeof(struct dcb_app)); + app.selector = idtype; + app.protocol = id; + app.priority = up; + + return dcb_setapp(netdev, &app); +} + +static int mlx4_en_dcbnl_ieee_getets(struct net_device *dev, + struct ieee_ets *ets) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct ieee_ets *my_ets = &priv->ets; + + if (!my_ets) + return -EINVAL; + + ets->ets_cap = IEEE_8021QAZ_MAX_TCS; + ets->cbs = my_ets->cbs; + memcpy(ets->tc_tx_bw, my_ets->tc_tx_bw, sizeof(ets->tc_tx_bw)); + memcpy(ets->tc_tsa, my_ets->tc_tsa, sizeof(ets->tc_tsa)); + memcpy(ets->prio_tc, my_ets->prio_tc, sizeof(ets->prio_tc)); + + return 0; +} + +static int mlx4_en_ets_validate(struct mlx4_en_priv *priv, struct ieee_ets *ets) +{ + int i; + int total_ets_bw = 0; + int has_ets_tc = 0; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + if (ets->prio_tc[i] >= MLX4_EN_NUM_UP_HIGH) { + en_err(priv, "Bad priority in UP <=> TC mapping. TC: %d, UP: %d\n", + i, ets->prio_tc[i]); + return -EINVAL; + } + + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_VENDOR: + case IEEE_8021QAZ_TSA_STRICT: + break; + case IEEE_8021QAZ_TSA_ETS: + has_ets_tc = 1; + total_ets_bw += ets->tc_tx_bw[i]; + break; + default: + en_err(priv, "TC[%d]: Not supported TSA: %d\n", + i, ets->tc_tsa[i]); + return -EOPNOTSUPP; + } + } + + if (has_ets_tc && total_ets_bw != MLX4_EN_BW_MAX) { + en_err(priv, "Bad ETS BW sum: %d. 
Should be exactly 100%%\n", + total_ets_bw); + return -EINVAL; + } + + return 0; +} + +static int mlx4_en_config_port_scheduler(struct mlx4_en_priv *priv, + struct ieee_ets *ets, u16 *ratelimit) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int num_strict = 0; + int i; + __u8 tc_tx_bw[IEEE_8021QAZ_MAX_TCS] = { 0 }; + __u8 pg[IEEE_8021QAZ_MAX_TCS] = { 0 }; + + ets = ets ?: &priv->ets; + ratelimit = ratelimit ?: priv->maxrate; + + /* higher TC means higher priority => lower pg */ + for (i = IEEE_8021QAZ_MAX_TCS - 1; i >= 0; i--) { + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_VENDOR: + pg[i] = MLX4_EN_TC_VENDOR; + tc_tx_bw[i] = MLX4_EN_BW_MAX; + break; + case IEEE_8021QAZ_TSA_STRICT: + pg[i] = num_strict++; + tc_tx_bw[i] = MLX4_EN_BW_MAX; + break; + case IEEE_8021QAZ_TSA_ETS: + pg[i] = MLX4_EN_TC_ETS; + tc_tx_bw[i] = ets->tc_tx_bw[i] ?: MLX4_EN_BW_MIN; + break; + } + } + + return mlx4_SET_PORT_SCHEDULER(mdev->dev, priv->port, tc_tx_bw, pg, + ratelimit); +} + +static int +mlx4_en_dcbnl_ieee_setets(struct net_device *dev, struct ieee_ets *ets) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + err = mlx4_en_ets_validate(priv, ets); + if (err) + return err; + + err = mlx4_SET_PORT_PRIO2TC(mdev->dev, priv->port, ets->prio_tc); + if (err) + return err; + + err = mlx4_en_config_port_scheduler(priv, ets, NULL); + if (err) + return err; + + memcpy(&priv->ets, ets, sizeof(priv->ets)); + + return 0; +} + +static int mlx4_en_dcbnl_ieee_getpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + pfc->pfc_cap = IEEE_8021QAZ_MAX_TCS; + pfc->pfc_en = priv->prof->tx_ppp; + + return 0; +} + +static int mlx4_en_dcbnl_ieee_setpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_port_profile *prof = priv->prof; + struct mlx4_en_dev *mdev = priv->mdev; + u32 tx_pause, tx_ppp, rx_pause, rx_ppp; + int err; + + en_dbg(DRV, priv, "cap: 0x%x en: 0x%x mbc: 0x%x delay: %d\n", + pfc->pfc_cap, + pfc->pfc_en, + pfc->mbc, + pfc->delay); + + rx_pause = prof->rx_pause && !pfc->pfc_en; + tx_pause = prof->tx_pause && !pfc->pfc_en; + rx_ppp = pfc->pfc_en; + tx_ppp = pfc->pfc_en; + + err = mlx4_SET_PORT_general(mdev->dev, priv->port, + priv->rx_skb_size + ETH_FCS_LEN, + tx_pause, tx_ppp, rx_pause, rx_ppp); + if (err) { + en_err(priv, "Failed setting pause params\n"); + return err; + } + + mlx4_en_update_pfc_stats_bitmap(mdev->dev, &priv->stats_bitmap, + rx_ppp, rx_pause, tx_ppp, tx_pause); + + prof->tx_ppp = tx_ppp; + prof->rx_ppp = rx_ppp; + prof->rx_pause = rx_pause; + prof->tx_pause = tx_pause; + + return err; +} + +static u8 mlx4_en_dcbnl_getdcbx(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + return priv->dcbx_cap; +} + +static u8 mlx4_en_dcbnl_setdcbx(struct net_device *dev, u8 mode) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct ieee_ets ets = {0}; + struct ieee_pfc pfc = {0}; + + if (mode == priv->dcbx_cap) + return 0; + + if ((mode & DCB_CAP_DCBX_LLD_MANAGED) || + ((mode & DCB_CAP_DCBX_VER_IEEE) && + (mode & DCB_CAP_DCBX_VER_CEE)) || + !(mode & DCB_CAP_DCBX_HOST)) + goto err; + + priv->dcbx_cap = mode; + + ets.ets_cap = IEEE_8021QAZ_MAX_TCS; + pfc.pfc_cap = IEEE_8021QAZ_MAX_TCS; + + if (mode & DCB_CAP_DCBX_VER_IEEE) { + if (mlx4_en_dcbnl_ieee_setets(dev, &ets)) + goto err; + if (mlx4_en_dcbnl_ieee_setpfc(dev, &pfc)) + goto err; + } else if (mode & DCB_CAP_DCBX_VER_CEE) { + if 
(mlx4_en_dcbnl_set_all(dev)) + goto err; + } else { + if (mlx4_en_dcbnl_ieee_setets(dev, &ets)) + goto err; + if (mlx4_en_dcbnl_ieee_setpfc(dev, &pfc)) + goto err; + if (mlx4_en_alloc_tx_queue_per_tc(dev, 0)) + goto err; + } + + return 0; +err: + return 1; +} + +#define MLX4_RATELIMIT_UNITS_IN_KB 100000 /* rate-limit HW unit in Kbps */ +static int mlx4_en_dcbnl_ieee_getmaxrate(struct net_device *dev, + struct ieee_maxrate *maxrate) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + maxrate->tc_maxrate[i] = + priv->maxrate[i] * MLX4_RATELIMIT_UNITS_IN_KB; + + return 0; +} + +static int mlx4_en_dcbnl_ieee_setmaxrate(struct net_device *dev, + struct ieee_maxrate *maxrate) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + u16 tmp[IEEE_8021QAZ_MAX_TCS]; + int i, err; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + /* Convert from Kbps into HW units, rounding result up. + * Setting to 0, means unlimited BW. + */ + tmp[i] = div_u64(maxrate->tc_maxrate[i] + + MLX4_RATELIMIT_UNITS_IN_KB - 1, + MLX4_RATELIMIT_UNITS_IN_KB); + } + + err = mlx4_en_config_port_scheduler(priv, NULL, tmp); + if (err) + return err; + + memcpy(priv->maxrate, tmp, sizeof(priv->maxrate)); + + return 0; +} + +#define RPG_ENABLE_BIT 31 +#define CN_TAG_BIT 30 + +static int mlx4_en_dcbnl_ieee_getqcn(struct net_device *dev, + struct ieee_qcn *qcn) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_congestion_control_mb_prio_802_1_qau_params *hw_qcn; + struct mlx4_cmd_mailbox *mailbox_out = NULL; + u64 mailbox_in_dma = 0; + u32 inmod = 0; + int i, err; + + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QCN)) + return -EOPNOTSUPP; + + mailbox_out = mlx4_alloc_cmd_mailbox(priv->mdev->dev); + if (IS_ERR(mailbox_out)) + return -ENOMEM; + hw_qcn = + (struct mlx4_congestion_control_mb_prio_802_1_qau_params *) + mailbox_out->buf; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + inmod = priv->port | ((1 << i) << 8) | + (MLX4_CTRL_ALGO_802_1_QAU_REACTION_POINT << 16); + err = mlx4_cmd_box(priv->mdev->dev, mailbox_in_dma, + mailbox_out->dma, + inmod, MLX4_CONGESTION_CONTROL_GET_PARAMS, + MLX4_CMD_CONGESTION_CTRL_OPCODE, + MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) { + mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_out); + return err; + } + + qcn->rpg_enable[i] = + be32_to_cpu(hw_qcn->extended_enable) >> RPG_ENABLE_BIT; + qcn->rppp_max_rps[i] = + be32_to_cpu(hw_qcn->rppp_max_rps); + qcn->rpg_time_reset[i] = + be32_to_cpu(hw_qcn->rpg_time_reset); + qcn->rpg_byte_reset[i] = + be32_to_cpu(hw_qcn->rpg_byte_reset); + qcn->rpg_threshold[i] = + be32_to_cpu(hw_qcn->rpg_threshold); + qcn->rpg_max_rate[i] = + be32_to_cpu(hw_qcn->rpg_max_rate); + qcn->rpg_ai_rate[i] = + be32_to_cpu(hw_qcn->rpg_ai_rate); + qcn->rpg_hai_rate[i] = + be32_to_cpu(hw_qcn->rpg_hai_rate); + qcn->rpg_gd[i] = + be32_to_cpu(hw_qcn->rpg_gd); + qcn->rpg_min_dec_fac[i] = + be32_to_cpu(hw_qcn->rpg_min_dec_fac); + qcn->rpg_min_rate[i] = + be32_to_cpu(hw_qcn->rpg_min_rate); + qcn->cndd_state_machine[i] = + priv->cndd_state[i]; + } + mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_out); + return 0; +} + +static int mlx4_en_dcbnl_ieee_setqcn(struct net_device *dev, + struct ieee_qcn *qcn) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_congestion_control_mb_prio_802_1_qau_params *hw_qcn; + struct mlx4_cmd_mailbox *mailbox_in = NULL; + u64 mailbox_in_dma = 0; + u32 inmod = 0; + int i, err; +#define MODIFY_ENABLE_HIGH_MASK 0xc0000000 +#define MODIFY_ENABLE_LOW_MASK 0xffc00000 
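+/* Each QCN parameter has a modify-enable bit, and the firmware only applies
+ * a parameter whose bit is set.  modify_enable_high/low are therefore
+ * written with the masks above before every SET_PARAMS command issued in
+ * the per-priority loop below.
+ */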
+ + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QCN)) + return -EOPNOTSUPP; + + mailbox_in = mlx4_alloc_cmd_mailbox(priv->mdev->dev); + if (IS_ERR(mailbox_in)) + return -ENOMEM; + + mailbox_in_dma = mailbox_in->dma; + hw_qcn = + (struct mlx4_congestion_control_mb_prio_802_1_qau_params *)mailbox_in->buf; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + inmod = priv->port | ((1 << i) << 8) | + (MLX4_CTRL_ALGO_802_1_QAU_REACTION_POINT << 16); + + /* Before updating QCN parameter, + * need to set it's modify enable bit to 1 + */ + + hw_qcn->modify_enable_high = cpu_to_be32( + MODIFY_ENABLE_HIGH_MASK); + hw_qcn->modify_enable_low = cpu_to_be32(MODIFY_ENABLE_LOW_MASK); + + hw_qcn->extended_enable = cpu_to_be32(qcn->rpg_enable[i] << RPG_ENABLE_BIT); + hw_qcn->rppp_max_rps = cpu_to_be32(qcn->rppp_max_rps[i]); + hw_qcn->rpg_time_reset = cpu_to_be32(qcn->rpg_time_reset[i]); + hw_qcn->rpg_byte_reset = cpu_to_be32(qcn->rpg_byte_reset[i]); + hw_qcn->rpg_threshold = cpu_to_be32(qcn->rpg_threshold[i]); + hw_qcn->rpg_max_rate = cpu_to_be32(qcn->rpg_max_rate[i]); + hw_qcn->rpg_ai_rate = cpu_to_be32(qcn->rpg_ai_rate[i]); + hw_qcn->rpg_hai_rate = cpu_to_be32(qcn->rpg_hai_rate[i]); + hw_qcn->rpg_gd = cpu_to_be32(qcn->rpg_gd[i]); + hw_qcn->rpg_min_dec_fac = cpu_to_be32(qcn->rpg_min_dec_fac[i]); + hw_qcn->rpg_min_rate = cpu_to_be32(qcn->rpg_min_rate[i]); + priv->cndd_state[i] = qcn->cndd_state_machine[i]; + if (qcn->cndd_state_machine[i] == DCB_CNDD_INTERIOR_READY) + hw_qcn->extended_enable |= cpu_to_be32(1 << CN_TAG_BIT); + + err = mlx4_cmd(priv->mdev->dev, mailbox_in_dma, inmod, + MLX4_CONGESTION_CONTROL_SET_PARAMS, + MLX4_CMD_CONGESTION_CTRL_OPCODE, + MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) { + mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_in); + return err; + } + } + mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_in); + return 0; +} + +static int mlx4_en_dcbnl_ieee_getqcnstats(struct net_device *dev, + struct ieee_qcn_stats *qcn_stats) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_congestion_control_mb_prio_802_1_qau_statistics *hw_qcn_stats; + struct mlx4_cmd_mailbox *mailbox_out = NULL; + u64 mailbox_in_dma = 0; + u32 inmod = 0; + int i, err; + + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QCN)) + return -EOPNOTSUPP; + + mailbox_out = mlx4_alloc_cmd_mailbox(priv->mdev->dev); + if (IS_ERR(mailbox_out)) + return -ENOMEM; + + hw_qcn_stats = + (struct mlx4_congestion_control_mb_prio_802_1_qau_statistics *) + mailbox_out->buf; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + inmod = priv->port | ((1 << i) << 8) | + (MLX4_CTRL_ALGO_802_1_QAU_REACTION_POINT << 16); + err = mlx4_cmd_box(priv->mdev->dev, mailbox_in_dma, + mailbox_out->dma, inmod, + MLX4_CONGESTION_CONTROL_GET_STATISTICS, + MLX4_CMD_CONGESTION_CTRL_OPCODE, + MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) { + mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_out); + return err; + } + qcn_stats->rppp_rp_centiseconds[i] = + be64_to_cpu(hw_qcn_stats->rppp_rp_centiseconds); + qcn_stats->rppp_created_rps[i] = + be32_to_cpu(hw_qcn_stats->rppp_created_rps); + } + mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_out); + return 0; +} + +const struct dcbnl_rtnl_ops mlx4_en_dcbnl_ops = { + .ieee_getets = mlx4_en_dcbnl_ieee_getets, + .ieee_setets = mlx4_en_dcbnl_ieee_setets, + .ieee_getmaxrate = mlx4_en_dcbnl_ieee_getmaxrate, + .ieee_setmaxrate = mlx4_en_dcbnl_ieee_setmaxrate, + .ieee_getqcn = mlx4_en_dcbnl_ieee_getqcn, + .ieee_setqcn = mlx4_en_dcbnl_ieee_setqcn, + .ieee_getqcnstats = 
mlx4_en_dcbnl_ieee_getqcnstats, + .ieee_getpfc = mlx4_en_dcbnl_ieee_getpfc, + .ieee_setpfc = mlx4_en_dcbnl_ieee_setpfc, + + .getstate = mlx4_en_dcbnl_get_state, + .setstate = mlx4_en_dcbnl_set_state, + .getpfccfg = mlx4_en_dcbnl_get_pfc_cfg, + .setpfccfg = mlx4_en_dcbnl_set_pfc_cfg, + .setall = mlx4_en_dcbnl_set_all, + .getcap = mlx4_en_dcbnl_getcap, + .getnumtcs = mlx4_en_dcbnl_getnumtcs, + .getpfcstate = mlx4_en_dcbnl_getpfcstate, + .setpfcstate = mlx4_en_dcbnl_setpfcstate, + .getapp = mlx4_en_dcbnl_getapp, + .setapp = mlx4_en_dcbnl_setapp, + + .getdcbx = mlx4_en_dcbnl_getdcbx, + .setdcbx = mlx4_en_dcbnl_setdcbx, +}; + +const struct dcbnl_rtnl_ops mlx4_en_dcbnl_pfc_ops = { + .ieee_getpfc = mlx4_en_dcbnl_ieee_getpfc, + .ieee_setpfc = mlx4_en_dcbnl_ieee_setpfc, + + .setstate = mlx4_en_dcbnl_set_state, + .getpfccfg = mlx4_en_dcbnl_get_pfc_cfg, + .setpfccfg = mlx4_en_dcbnl_set_pfc_cfg, + .setall = mlx4_en_dcbnl_set_all, + .getnumtcs = mlx4_en_dcbnl_getnumtcs, + .getpfcstate = mlx4_en_dcbnl_getpfcstate, + .setpfcstate = mlx4_en_dcbnl_setpfcstate, + .getapp = mlx4_en_dcbnl_getapp, + .setapp = mlx4_en_dcbnl_setapp, + + .getdcbx = mlx4_en_dcbnl_getdcbx, + .setdcbx = mlx4_en_dcbnl_setdcbx, +}; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c new file mode 100644 index 0000000..f11b450 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -0,0 +1,2160 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx4_en.h" +#include "en_port.h" + +#define EN_ETHTOOL_QP_ATTACH (1ull << 63) +#define EN_ETHTOOL_SHORT_MASK cpu_to_be16(0xffff) +#define EN_ETHTOOL_WORD_MASK cpu_to_be32(0xffffffff) + +static int mlx4_en_moderation_update(struct mlx4_en_priv *priv) +{ + int i, t; + int err = 0; + + for (t = 0 ; t < MLX4_EN_NUM_TX_TYPES; t++) { + for (i = 0; i < priv->tx_ring_num[t]; i++) { + priv->tx_cq[t][i]->moder_cnt = priv->tx_frames; + priv->tx_cq[t][i]->moder_time = priv->tx_usecs; + if (priv->port_up) { + err = mlx4_en_set_cq_moder(priv, + priv->tx_cq[t][i]); + if (err) + return err; + } + } + } + + if (priv->adaptive_rx_coal) + return 0; + + for (i = 0; i < priv->rx_ring_num; i++) { + priv->rx_cq[i]->moder_cnt = priv->rx_frames; + priv->rx_cq[i]->moder_time = priv->rx_usecs; + priv->last_moder_time[i] = MLX4_EN_AUTO_CONF; + if (priv->port_up) { + err = mlx4_en_set_cq_moder(priv, priv->rx_cq[i]); + if (err) + return err; + } + } + + return err; +} + +static void +mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, DRV_VERSION, + sizeof(drvinfo->version)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", + (u16) (mdev->dev->caps.fw_ver >> 32), + (u16) ((mdev->dev->caps.fw_ver >> 16) & 0xffff), + (u16) (mdev->dev->caps.fw_ver & 0xffff)); + strlcpy(drvinfo->bus_info, pci_name(mdev->dev->persist->pdev), + sizeof(drvinfo->bus_info)); +} + +static const char mlx4_en_priv_flags[][ETH_GSTRING_LEN] = { + "blueflame", + "phv-bit" +}; + +static const char main_strings[][ETH_GSTRING_LEN] = { + /* main statistics */ + "rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_errors", + "tx_errors", "rx_dropped", "tx_dropped", "multicast", "collisions", + "rx_length_errors", "rx_over_errors", "rx_crc_errors", + "rx_frame_errors", "rx_fifo_errors", "rx_missed_errors", + "tx_aborted_errors", "tx_carrier_errors", "tx_fifo_errors", + "tx_heartbeat_errors", "tx_window_errors", + + /* port statistics */ + "tso_packets", + "xmit_more", + "queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_pages", + "rx_csum_good", "rx_csum_none", "rx_csum_complete", "tx_chksum_offload", + + /* pf statistics */ + "pf_rx_packets", + "pf_rx_bytes", + "pf_tx_packets", + "pf_tx_bytes", + + /* priority flow control statistics rx */ + "rx_pause_prio_0", "rx_pause_duration_prio_0", + "rx_pause_transition_prio_0", + "rx_pause_prio_1", "rx_pause_duration_prio_1", + "rx_pause_transition_prio_1", + "rx_pause_prio_2", "rx_pause_duration_prio_2", + "rx_pause_transition_prio_2", + "rx_pause_prio_3", "rx_pause_duration_prio_3", + "rx_pause_transition_prio_3", + "rx_pause_prio_4", "rx_pause_duration_prio_4", + "rx_pause_transition_prio_4", + "rx_pause_prio_5", "rx_pause_duration_prio_5", + "rx_pause_transition_prio_5", + "rx_pause_prio_6", "rx_pause_duration_prio_6", + "rx_pause_transition_prio_6", + "rx_pause_prio_7", "rx_pause_duration_prio_7", + "rx_pause_transition_prio_7", + + /* flow control statistics rx */ + "rx_pause", "rx_pause_duration", "rx_pause_transition", + + /* priority flow control statistics tx */ + "tx_pause_prio_0", "tx_pause_duration_prio_0", + "tx_pause_transition_prio_0", + "tx_pause_prio_1", "tx_pause_duration_prio_1", + "tx_pause_transition_prio_1", + "tx_pause_prio_2", 
"tx_pause_duration_prio_2", + "tx_pause_transition_prio_2", + "tx_pause_prio_3", "tx_pause_duration_prio_3", + "tx_pause_transition_prio_3", + "tx_pause_prio_4", "tx_pause_duration_prio_4", + "tx_pause_transition_prio_4", + "tx_pause_prio_5", "tx_pause_duration_prio_5", + "tx_pause_transition_prio_5", + "tx_pause_prio_6", "tx_pause_duration_prio_6", + "tx_pause_transition_prio_6", + "tx_pause_prio_7", "tx_pause_duration_prio_7", + "tx_pause_transition_prio_7", + + /* flow control statistics tx */ + "tx_pause", "tx_pause_duration", "tx_pause_transition", + + /* packet statistics */ + "rx_multicast_packets", + "rx_broadcast_packets", + "rx_jabbers", + "rx_in_range_length_error", + "rx_out_range_length_error", + "tx_multicast_packets", + "tx_broadcast_packets", + "rx_prio_0_packets", "rx_prio_0_bytes", + "rx_prio_1_packets", "rx_prio_1_bytes", + "rx_prio_2_packets", "rx_prio_2_bytes", + "rx_prio_3_packets", "rx_prio_3_bytes", + "rx_prio_4_packets", "rx_prio_4_bytes", + "rx_prio_5_packets", "rx_prio_5_bytes", + "rx_prio_6_packets", "rx_prio_6_bytes", + "rx_prio_7_packets", "rx_prio_7_bytes", + "rx_novlan_packets", "rx_novlan_bytes", + "tx_prio_0_packets", "tx_prio_0_bytes", + "tx_prio_1_packets", "tx_prio_1_bytes", + "tx_prio_2_packets", "tx_prio_2_bytes", + "tx_prio_3_packets", "tx_prio_3_bytes", + "tx_prio_4_packets", "tx_prio_4_bytes", + "tx_prio_5_packets", "tx_prio_5_bytes", + "tx_prio_6_packets", "tx_prio_6_bytes", + "tx_prio_7_packets", "tx_prio_7_bytes", + "tx_novlan_packets", "tx_novlan_bytes", + + /* xdp statistics */ + "rx_xdp_drop", + "rx_xdp_tx", + "rx_xdp_tx_full", + + /* phy statistics */ + "rx_packets_phy", "rx_bytes_phy", + "tx_packets_phy", "tx_bytes_phy", +}; + +static const char mlx4_en_test_names[][ETH_GSTRING_LEN]= { + "Interrupt Test", + "Link Test", + "Speed Test", + "Register Test", + "Loopback Test", +}; + +static u32 mlx4_en_get_msglevel(struct net_device *dev) +{ + return ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable; +} + +static void mlx4_en_set_msglevel(struct net_device *dev, u32 val) +{ + ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable = val; +} + +static void mlx4_en_get_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + struct mlx4_caps *caps = &priv->mdev->dev->caps; + int err = 0; + u64 config = 0; + u64 mask; + + if ((priv->port < 1) || (priv->port > 2)) { + en_err(priv, "Failed to get WoL information\n"); + return; + } + + mask = (priv->port == 1) ? MLX4_DEV_CAP_FLAG_WOL_PORT1 : + MLX4_DEV_CAP_FLAG_WOL_PORT2; + + if (!(caps->flags & mask)) { + wol->supported = 0; + wol->wolopts = 0; + return; + } + + if (caps->wol_port[priv->port]) + wol->supported = WAKE_MAGIC; + else + wol->supported = 0; + + err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); + if (err) { + en_err(priv, "Failed to get WoL information\n"); + return; + } + + if ((config & MLX4_EN_WOL_ENABLED) && (config & MLX4_EN_WOL_MAGIC)) + wol->wolopts = WAKE_MAGIC; + else + wol->wolopts = 0; +} + +static int mlx4_en_set_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + u64 config = 0; + int err = 0; + u64 mask; + + if ((priv->port < 1) || (priv->port > 2)) + return -EOPNOTSUPP; + + mask = (priv->port == 1) ? 
MLX4_DEV_CAP_FLAG_WOL_PORT1 : + MLX4_DEV_CAP_FLAG_WOL_PORT2; + + if (!(priv->mdev->dev->caps.flags & mask)) + return -EOPNOTSUPP; + + if (wol->supported & ~WAKE_MAGIC) + return -EINVAL; + + err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); + if (err) { + en_err(priv, "Failed to get WoL info, unable to modify\n"); + return err; + } + + if (wol->wolopts & WAKE_MAGIC) { + config |= MLX4_EN_WOL_DO_MODIFY | MLX4_EN_WOL_ENABLED | + MLX4_EN_WOL_MAGIC; + } else { + config &= ~(MLX4_EN_WOL_ENABLED | MLX4_EN_WOL_MAGIC); + config |= MLX4_EN_WOL_DO_MODIFY; + } + + err = mlx4_wol_write(priv->mdev->dev, config, priv->port); + if (err) + en_err(priv, "Failed to set WoL information\n"); + + return err; +} + +struct bitmap_iterator { + unsigned long *stats_bitmap; + unsigned int count; + unsigned int iterator; + bool advance_array; /* if set, force no increments */ +}; + +static inline void bitmap_iterator_init(struct bitmap_iterator *h, + unsigned long *stats_bitmap, + int count) +{ + h->iterator = 0; + h->advance_array = !bitmap_empty(stats_bitmap, count); + h->count = h->advance_array ? bitmap_weight(stats_bitmap, count) + : count; + h->stats_bitmap = stats_bitmap; +} + +static inline int bitmap_iterator_test(struct bitmap_iterator *h) +{ + return !h->advance_array ? 1 : test_bit(h->iterator, h->stats_bitmap); +} + +static inline int bitmap_iterator_inc(struct bitmap_iterator *h) +{ + return h->iterator++; +} + +static inline unsigned int +bitmap_iterator_count(struct bitmap_iterator *h) +{ + return h->count; +} + +static int mlx4_en_get_sset_count(struct net_device *dev, int sset) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct bitmap_iterator it; + + bitmap_iterator_init(&it, priv->stats_bitmap.bitmap, NUM_ALL_STATS); + + switch (sset) { + case ETH_SS_STATS: + return bitmap_iterator_count(&it) + + (priv->tx_ring_num[TX] * 2) + + (priv->rx_ring_num * (3 + NUM_XDP_STATS)); + case ETH_SS_TEST: + return MLX4_EN_NUM_SELF_TEST - !(priv->mdev->dev->caps.flags + & MLX4_DEV_CAP_FLAG_UC_LOOPBACK) * 2; + case ETH_SS_PRIV_FLAGS: + return ARRAY_SIZE(mlx4_en_priv_flags); + default: + return -EOPNOTSUPP; + } +} + +static void mlx4_en_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, uint64_t *data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int index = 0; + int i; + struct bitmap_iterator it; + + bitmap_iterator_init(&it, priv->stats_bitmap.bitmap, NUM_ALL_STATS); + + spin_lock_bh(&priv->stats_lock); + + mlx4_en_fold_software_stats(dev); + + for (i = 0; i < NUM_MAIN_STATS; i++, bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + data[index++] = ((unsigned long *)&dev->stats)[i]; + + for (i = 0; i < NUM_PORT_STATS; i++, bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + data[index++] = ((unsigned long *)&priv->port_stats)[i]; + + for (i = 0; i < NUM_PF_STATS; i++, bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + data[index++] = + ((unsigned long *)&priv->pf_stats)[i]; + + for (i = 0; i < NUM_FLOW_PRIORITY_STATS_RX; + i++, bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + data[index++] = + ((u64 *)&priv->rx_priority_flowstats)[i]; + + for (i = 0; i < NUM_FLOW_STATS_RX; i++, bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + data[index++] = ((u64 *)&priv->rx_flowstats)[i]; + + for (i = 0; i < NUM_FLOW_PRIORITY_STATS_TX; + i++, bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + data[index++] = + ((u64 *)&priv->tx_priority_flowstats)[i]; + + for (i = 0; i < NUM_FLOW_STATS_TX; i++, bitmap_iterator_inc(&it)) 
+ if (bitmap_iterator_test(&it)) + data[index++] = ((u64 *)&priv->tx_flowstats)[i]; + + for (i = 0; i < NUM_PKT_STATS; i++, bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + data[index++] = ((unsigned long *)&priv->pkstats)[i]; + + for (i = 0; i < NUM_XDP_STATS; i++, bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + data[index++] = ((unsigned long *)&priv->xdp_stats)[i]; + + for (i = 0; i < NUM_PHY_STATS; i++, bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + data[index++] = ((unsigned long *)&priv->phy_stats)[i]; + + for (i = 0; i < priv->tx_ring_num[TX]; i++) { + data[index++] = priv->tx_ring[TX][i]->packets; + data[index++] = priv->tx_ring[TX][i]->bytes; + } + for (i = 0; i < priv->rx_ring_num; i++) { + data[index++] = priv->rx_ring[i]->packets; + data[index++] = priv->rx_ring[i]->bytes; + data[index++] = priv->rx_ring[i]->dropped; + data[index++] = priv->rx_ring[i]->xdp_drop; + data[index++] = priv->rx_ring[i]->xdp_tx; + data[index++] = priv->rx_ring[i]->xdp_tx_full; + } + spin_unlock_bh(&priv->stats_lock); + +} + +static void mlx4_en_self_test(struct net_device *dev, + struct ethtool_test *etest, u64 *buf) +{ + mlx4_en_ex_selftest(dev, &etest->flags, buf); +} + +static void mlx4_en_get_strings(struct net_device *dev, + uint32_t stringset, uint8_t *data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int index = 0; + int i, strings = 0; + struct bitmap_iterator it; + + bitmap_iterator_init(&it, priv->stats_bitmap.bitmap, NUM_ALL_STATS); + + switch (stringset) { + case ETH_SS_TEST: + for (i = 0; i < MLX4_EN_NUM_SELF_TEST - 2; i++) + strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]); + if (priv->mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UC_LOOPBACK) + for (; i < MLX4_EN_NUM_SELF_TEST; i++) + strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]); + break; + + case ETH_SS_STATS: + /* Add main counters */ + for (i = 0; i < NUM_MAIN_STATS; i++, strings++, + bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[strings]); + + for (i = 0; i < NUM_PORT_STATS; i++, strings++, + bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[strings]); + + for (i = 0; i < NUM_PF_STATS; i++, strings++, + bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[strings]); + + for (i = 0; i < NUM_FLOW_STATS; i++, strings++, + bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[strings]); + + for (i = 0; i < NUM_PKT_STATS; i++, strings++, + bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[strings]); + + for (i = 0; i < NUM_XDP_STATS; i++, strings++, + bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[strings]); + + for (i = 0; i < NUM_PHY_STATS; i++, strings++, + bitmap_iterator_inc(&it)) + if (bitmap_iterator_test(&it)) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[strings]); + + for (i = 0; i < priv->tx_ring_num[TX]; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_bytes", i); + } + for (i = 0; i < priv->rx_ring_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_bytes", i); + sprintf(data + 
(index++) * ETH_GSTRING_LEN, + "rx%d_dropped", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_xdp_drop", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_xdp_tx", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_xdp_tx_full", i); + } + break; + case ETH_SS_PRIV_FLAGS: + for (i = 0; i < ARRAY_SIZE(mlx4_en_priv_flags); i++) + strcpy(data + i * ETH_GSTRING_LEN, + mlx4_en_priv_flags[i]); + break; + + } +} + +static u32 mlx4_en_autoneg_get(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + u32 autoneg = AUTONEG_DISABLE; + + if ((mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP) && + (priv->port_state.flags & MLX4_EN_PORT_ANE)) + autoneg = AUTONEG_ENABLE; + + return autoneg; +} + +static void ptys2ethtool_update_supported_port(unsigned long *mask, + struct mlx4_ptys_reg *ptys_reg) +{ + u32 eth_proto = be32_to_cpu(ptys_reg->eth_proto_cap); + + if (eth_proto & (MLX4_PROT_MASK(MLX4_10GBASE_T) + | MLX4_PROT_MASK(MLX4_1000BASE_T) + | MLX4_PROT_MASK(MLX4_100BASE_TX))) { + __set_bit(ETHTOOL_LINK_MODE_TP_BIT, mask); + } else if (eth_proto & (MLX4_PROT_MASK(MLX4_10GBASE_CR) + | MLX4_PROT_MASK(MLX4_10GBASE_SR) + | MLX4_PROT_MASK(MLX4_56GBASE_SR4) + | MLX4_PROT_MASK(MLX4_40GBASE_CR4) + | MLX4_PROT_MASK(MLX4_40GBASE_SR4) + | MLX4_PROT_MASK(MLX4_1000BASE_CX_SGMII))) { + __set_bit(ETHTOOL_LINK_MODE_FIBRE_BIT, mask); + } else if (eth_proto & (MLX4_PROT_MASK(MLX4_56GBASE_KR4) + | MLX4_PROT_MASK(MLX4_40GBASE_KR4) + | MLX4_PROT_MASK(MLX4_20GBASE_KR2) + | MLX4_PROT_MASK(MLX4_10GBASE_KR) + | MLX4_PROT_MASK(MLX4_10GBASE_KX4) + | MLX4_PROT_MASK(MLX4_1000BASE_KX))) { + __set_bit(ETHTOOL_LINK_MODE_Backplane_BIT, mask); + } +} + +static u32 ptys_get_active_port(struct mlx4_ptys_reg *ptys_reg) +{ + u32 eth_proto = be32_to_cpu(ptys_reg->eth_proto_oper); + + if (!eth_proto) /* link down */ + eth_proto = be32_to_cpu(ptys_reg->eth_proto_cap); + + if (eth_proto & (MLX4_PROT_MASK(MLX4_10GBASE_T) + | MLX4_PROT_MASK(MLX4_1000BASE_T) + | MLX4_PROT_MASK(MLX4_100BASE_TX))) { + return PORT_TP; + } + + if (eth_proto & (MLX4_PROT_MASK(MLX4_10GBASE_SR) + | MLX4_PROT_MASK(MLX4_56GBASE_SR4) + | MLX4_PROT_MASK(MLX4_40GBASE_SR4) + | MLX4_PROT_MASK(MLX4_1000BASE_CX_SGMII))) { + return PORT_FIBRE; + } + + if (eth_proto & (MLX4_PROT_MASK(MLX4_10GBASE_CR) + | MLX4_PROT_MASK(MLX4_56GBASE_CR4) + | MLX4_PROT_MASK(MLX4_40GBASE_CR4))) { + return PORT_DA; + } + + if (eth_proto & (MLX4_PROT_MASK(MLX4_56GBASE_KR4) + | MLX4_PROT_MASK(MLX4_40GBASE_KR4) + | MLX4_PROT_MASK(MLX4_20GBASE_KR2) + | MLX4_PROT_MASK(MLX4_10GBASE_KR) + | MLX4_PROT_MASK(MLX4_10GBASE_KX4) + | MLX4_PROT_MASK(MLX4_1000BASE_KX))) { + return PORT_NONE; + } + return PORT_OTHER; +} + +#define MLX4_LINK_MODES_SZ \ + (FIELD_SIZEOF(struct mlx4_ptys_reg, eth_proto_cap) * 8) + +enum ethtool_report { + SUPPORTED = 0, + ADVERTISED = 1, +}; + +struct ptys2ethtool_config { + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + __ETHTOOL_DECLARE_LINK_MODE_MASK(advertised); + u32 speed; +}; + +static unsigned long *ptys2ethtool_link_mode(struct ptys2ethtool_config *cfg, + enum ethtool_report report) +{ + switch (report) { + case SUPPORTED: + return cfg->supported; + case ADVERTISED: + return cfg->advertised; + } + return NULL; +} + +#define MLX4_BUILD_PTYS2ETHTOOL_CONFIG(reg_, speed_, ...) 
\ + ({ \ + struct ptys2ethtool_config *cfg; \ + const unsigned int modes[] = { __VA_ARGS__ }; \ + unsigned int i; \ + cfg = &ptys2ethtool_map[reg_]; \ + cfg->speed = speed_; \ + bitmap_zero(cfg->supported, \ + __ETHTOOL_LINK_MODE_MASK_NBITS); \ + bitmap_zero(cfg->advertised, \ + __ETHTOOL_LINK_MODE_MASK_NBITS); \ + for (i = 0 ; i < ARRAY_SIZE(modes) ; ++i) { \ + __set_bit(modes[i], cfg->supported); \ + __set_bit(modes[i], cfg->advertised); \ + } \ + }) + +/* Translates mlx4 link mode to equivalent ethtool Link modes/speed */ +static struct ptys2ethtool_config ptys2ethtool_map[MLX4_LINK_MODES_SZ]; + +void __init mlx4_en_init_ptys2ethtool_map(void) +{ + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_100BASE_TX, SPEED_100, + ETHTOOL_LINK_MODE_100baseT_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_1000BASE_T, SPEED_1000, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_1000BASE_CX_SGMII, SPEED_1000, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_1000BASE_KX, SPEED_1000, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_T, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseT_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_CX4, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_KX4, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_KR, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_CR, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_SR, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_20GBASE_KR2, SPEED_20000, + ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT, + ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_40GBASE_CR4, SPEED_40000, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_40GBASE_KR4, SPEED_40000, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_40GBASE_SR4, SPEED_40000, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_56GBASE_KR4, SPEED_56000, + ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_56GBASE_CR4, SPEED_56000, + ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT); + MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_56GBASE_SR4, SPEED_56000, + ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT); +}; + +static void ptys2ethtool_update_link_modes(unsigned long *link_modes, + u32 eth_proto, + enum ethtool_report report) +{ + int i; + for (i = 0; i < MLX4_LINK_MODES_SZ; i++) { + if (eth_proto & MLX4_PROT_MASK(i)) + bitmap_or(link_modes, link_modes, + ptys2ethtool_link_mode(&ptys2ethtool_map[i], + report), + __ETHTOOL_LINK_MODE_MASK_NBITS); + } +} + +static u32 ethtool2ptys_link_modes(const unsigned long *link_modes, + enum ethtool_report report) +{ + int i; + u32 ptys_modes = 0; + + for (i = 0; i < MLX4_LINK_MODES_SZ; i++) { + if (bitmap_intersects( + ptys2ethtool_link_mode(&ptys2ethtool_map[i], + report), + link_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS)) + ptys_modes |= 1 << i; + } + return ptys_modes; +} + +/* Convert actual speed (SPEED_XXX) to ptys link modes */ +static u32 speed2ptys_link_modes(u32 speed) +{ + int i; + u32 ptys_modes = 0; + + for (i = 0; i < MLX4_LINK_MODES_SZ; i++) { + if (ptys2ethtool_map[i].speed == speed) + ptys_modes |= 1 << i; + } + return ptys_modes; +} + +static int 
+ethtool_get_ptys_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_ptys_reg ptys_reg; + u32 eth_proto; + int ret; + + memset(&ptys_reg, 0, sizeof(ptys_reg)); + ptys_reg.local_port = priv->port; + ptys_reg.proto_mask = MLX4_PTYS_EN; + ret = mlx4_ACCESS_PTYS_REG(priv->mdev->dev, + MLX4_ACCESS_REG_QUERY, &ptys_reg); + if (ret) { + en_warn(priv, "Failed to run mlx4_ACCESS_PTYS_REG status(%x)", + ret); + return ret; + } + en_dbg(DRV, priv, "ptys_reg.proto_mask %x\n", + ptys_reg.proto_mask); + en_dbg(DRV, priv, "ptys_reg.eth_proto_cap %x\n", + be32_to_cpu(ptys_reg.eth_proto_cap)); + en_dbg(DRV, priv, "ptys_reg.eth_proto_admin %x\n", + be32_to_cpu(ptys_reg.eth_proto_admin)); + en_dbg(DRV, priv, "ptys_reg.eth_proto_oper %x\n", + be32_to_cpu(ptys_reg.eth_proto_oper)); + en_dbg(DRV, priv, "ptys_reg.eth_proto_lp_adv %x\n", + be32_to_cpu(ptys_reg.eth_proto_lp_adv)); + + /* reset supported/advertising masks */ + ethtool_link_ksettings_zero_link_mode(link_ksettings, supported); + ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising); + + ptys2ethtool_update_supported_port(link_ksettings->link_modes.supported, + &ptys_reg); + + eth_proto = be32_to_cpu(ptys_reg.eth_proto_cap); + ptys2ethtool_update_link_modes(link_ksettings->link_modes.supported, + eth_proto, SUPPORTED); + + eth_proto = be32_to_cpu(ptys_reg.eth_proto_admin); + ptys2ethtool_update_link_modes(link_ksettings->link_modes.advertising, + eth_proto, ADVERTISED); + + ethtool_link_ksettings_add_link_mode(link_ksettings, supported, + Pause); + ethtool_link_ksettings_add_link_mode(link_ksettings, supported, + Asym_Pause); + + if (priv->prof->tx_pause) + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, Pause); + if (priv->prof->tx_pause ^ priv->prof->rx_pause) + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, Asym_Pause); + + link_ksettings->base.port = ptys_get_active_port(&ptys_reg); + + if (mlx4_en_autoneg_get(dev)) { + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, Autoneg); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, Autoneg); + } + + link_ksettings->base.autoneg + = (priv->port_state.flags & MLX4_EN_PORT_ANC) ? 
+ AUTONEG_ENABLE : AUTONEG_DISABLE; + + eth_proto = be32_to_cpu(ptys_reg.eth_proto_lp_adv); + + ethtool_link_ksettings_zero_link_mode(link_ksettings, lp_advertising); + ptys2ethtool_update_link_modes( + link_ksettings->link_modes.lp_advertising, + eth_proto, ADVERTISED); + if (priv->port_state.flags & MLX4_EN_PORT_ANC) + ethtool_link_ksettings_add_link_mode(link_ksettings, + lp_advertising, Autoneg); + + link_ksettings->base.phy_address = 0; + link_ksettings->base.mdio_support = 0; + link_ksettings->base.eth_tp_mdix = ETH_TP_MDI_INVALID; + link_ksettings->base.eth_tp_mdix_ctrl = ETH_TP_MDI_AUTO; + + return ret; +} + +static void +ethtool_get_default_link_ksettings( + struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int trans_type; + + link_ksettings->base.autoneg = AUTONEG_DISABLE; + + ethtool_link_ksettings_zero_link_mode(link_ksettings, supported); + ethtool_link_ksettings_add_link_mode(link_ksettings, supported, + 10000baseT_Full); + + ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising); + ethtool_link_ksettings_add_link_mode(link_ksettings, advertising, + 10000baseT_Full); + + trans_type = priv->port_state.transceiver; + if (trans_type > 0 && trans_type <= 0xC) { + link_ksettings->base.port = PORT_FIBRE; + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, FIBRE); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, FIBRE); + } else if (trans_type == 0x80 || trans_type == 0) { + link_ksettings->base.port = PORT_TP; + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, TP); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, TP); + } else { + link_ksettings->base.port = -1; + } +} + +static int +mlx4_en_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int ret = -EINVAL; + + if (mlx4_en_QUERY_PORT(priv->mdev, priv->port)) + return -ENOMEM; + + en_dbg(DRV, priv, "query port state.flags ANC(%x) ANE(%x)\n", + priv->port_state.flags & MLX4_EN_PORT_ANC, + priv->port_state.flags & MLX4_EN_PORT_ANE); + + if (priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL) + ret = ethtool_get_ptys_link_ksettings(dev, link_ksettings); + if (ret) /* ETH PROT CRTL is not supported or PTYS CMD failed */ + ethtool_get_default_link_ksettings(dev, link_ksettings); + + if (netif_carrier_ok(dev)) { + link_ksettings->base.speed = priv->port_state.link_speed; + link_ksettings->base.duplex = DUPLEX_FULL; + } else { + link_ksettings->base.speed = SPEED_UNKNOWN; + link_ksettings->base.duplex = DUPLEX_UNKNOWN; + } + return 0; +} + +/* Calculate PTYS admin according ethtool speed (SPEED_XXX) */ +static __be32 speed_set_ptys_admin(struct mlx4_en_priv *priv, u32 speed, + __be32 proto_cap) +{ + __be32 proto_admin = 0; + + if (!speed) { /* Speed = 0 ==> Reset Link modes */ + proto_admin = proto_cap; + en_info(priv, "Speed was set to 0, Reset advertised Link Modes to default (%x)\n", + be32_to_cpu(proto_cap)); + } else { + u32 ptys_link_modes = speed2ptys_link_modes(speed); + + proto_admin = cpu_to_be32(ptys_link_modes) & proto_cap; + en_info(priv, "Setting Speed to %d\n", speed); + } + return proto_admin; +} + +static int +mlx4_en_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_ptys_reg ptys_reg; + __be32 proto_admin; + u8 cur_autoneg; + int ret; + + u32 
ptys_adv = ethtool2ptys_link_modes( + link_ksettings->link_modes.advertising, ADVERTISED); + const int speed = link_ksettings->base.speed; + + en_dbg(DRV, priv, + "Set Speed=%d adv={%*pbl} autoneg=%d duplex=%d\n", + speed, __ETHTOOL_LINK_MODE_MASK_NBITS, + link_ksettings->link_modes.advertising, + link_ksettings->base.autoneg, + link_ksettings->base.duplex); + + if (!(priv->mdev->dev->caps.flags2 & + MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL) || + (link_ksettings->base.duplex == DUPLEX_HALF)) + return -EINVAL; + + memset(&ptys_reg, 0, sizeof(ptys_reg)); + ptys_reg.local_port = priv->port; + ptys_reg.proto_mask = MLX4_PTYS_EN; + ret = mlx4_ACCESS_PTYS_REG(priv->mdev->dev, + MLX4_ACCESS_REG_QUERY, &ptys_reg); + if (ret) { + en_warn(priv, "Failed to QUERY mlx4_ACCESS_PTYS_REG status(%x)\n", + ret); + return 0; + } + + cur_autoneg = ptys_reg.flags & MLX4_PTYS_AN_DISABLE_ADMIN ? + AUTONEG_DISABLE : AUTONEG_ENABLE; + + if (link_ksettings->base.autoneg == AUTONEG_DISABLE) { + proto_admin = speed_set_ptys_admin(priv, speed, + ptys_reg.eth_proto_cap); + if ((be32_to_cpu(proto_admin) & + (MLX4_PROT_MASK(MLX4_1000BASE_CX_SGMII) | + MLX4_PROT_MASK(MLX4_1000BASE_KX))) && + (ptys_reg.flags & MLX4_PTYS_AN_DISABLE_CAP)) + ptys_reg.flags |= MLX4_PTYS_AN_DISABLE_ADMIN; + } else { + proto_admin = cpu_to_be32(ptys_adv); + ptys_reg.flags &= ~MLX4_PTYS_AN_DISABLE_ADMIN; + } + + proto_admin &= ptys_reg.eth_proto_cap; + if (!proto_admin) { + en_warn(priv, "Not supported link mode(s) requested, check supported link modes.\n"); + return -EINVAL; /* nothing to change due to bad input */ + } + + if ((proto_admin == ptys_reg.eth_proto_admin) && + ((ptys_reg.flags & MLX4_PTYS_AN_DISABLE_CAP) && + (link_ksettings->base.autoneg == cur_autoneg))) + return 0; /* Nothing to change */ + + en_dbg(DRV, priv, "mlx4_ACCESS_PTYS_REG SET: ptys_reg.eth_proto_admin = 0x%x\n", + be32_to_cpu(proto_admin)); + + ptys_reg.eth_proto_admin = proto_admin; + ret = mlx4_ACCESS_PTYS_REG(priv->mdev->dev, MLX4_ACCESS_REG_WRITE, + &ptys_reg); + if (ret) { + en_warn(priv, "Failed to write mlx4_ACCESS_PTYS_REG eth_proto_admin(0x%x) status(0x%x)", + be32_to_cpu(ptys_reg.eth_proto_admin), ret); + return ret; + } + + mutex_lock(&priv->mdev->state_lock); + if (priv->port_up) { + en_warn(priv, "Port link mode changed, restarting port...\n"); + mlx4_en_stop_port(dev, 1); + if (mlx4_en_start_port(dev)) + en_err(priv, "Failed restarting port %d\n", priv->port); + } + mutex_unlock(&priv->mdev->state_lock); + return 0; +} + +static int mlx4_en_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + coal->tx_coalesce_usecs = priv->tx_usecs; + coal->tx_max_coalesced_frames = priv->tx_frames; + coal->tx_max_coalesced_frames_irq = priv->tx_work_limit; + + coal->rx_coalesce_usecs = priv->rx_usecs; + coal->rx_max_coalesced_frames = priv->rx_frames; + + coal->pkt_rate_low = priv->pkt_rate_low; + coal->rx_coalesce_usecs_low = priv->rx_usecs_low; + coal->pkt_rate_high = priv->pkt_rate_high; + coal->rx_coalesce_usecs_high = priv->rx_usecs_high; + coal->rate_sample_interval = priv->sample_interval; + coal->use_adaptive_rx_coalesce = priv->adaptive_rx_coal; + + return 0; +} + +static int mlx4_en_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (!coal->tx_max_coalesced_frames_irq) + return -EINVAL; + + if (coal->tx_coalesce_usecs > MLX4_EN_MAX_COAL_TIME || + coal->rx_coalesce_usecs > MLX4_EN_MAX_COAL_TIME || + 
coal->rx_coalesce_usecs_low > MLX4_EN_MAX_COAL_TIME || + coal->rx_coalesce_usecs_high > MLX4_EN_MAX_COAL_TIME) { + netdev_info(dev, "%s: maximum coalesce time supported is %d usecs\n", + __func__, MLX4_EN_MAX_COAL_TIME); + return -ERANGE; + } + + if (coal->tx_max_coalesced_frames > MLX4_EN_MAX_COAL_PKTS || + coal->rx_max_coalesced_frames > MLX4_EN_MAX_COAL_PKTS) { + netdev_info(dev, "%s: maximum coalesced frames supported is %d\n", + __func__, MLX4_EN_MAX_COAL_PKTS); + return -ERANGE; + } + + priv->rx_frames = (coal->rx_max_coalesced_frames == + MLX4_EN_AUTO_CONF) ? + MLX4_EN_RX_COAL_TARGET : + coal->rx_max_coalesced_frames; + priv->rx_usecs = (coal->rx_coalesce_usecs == + MLX4_EN_AUTO_CONF) ? + MLX4_EN_RX_COAL_TIME : + coal->rx_coalesce_usecs; + + /* Setting TX coalescing parameters */ + if (coal->tx_coalesce_usecs != priv->tx_usecs || + coal->tx_max_coalesced_frames != priv->tx_frames) { + priv->tx_usecs = coal->tx_coalesce_usecs; + priv->tx_frames = coal->tx_max_coalesced_frames; + } + + /* Set adaptive coalescing params */ + priv->pkt_rate_low = coal->pkt_rate_low; + priv->rx_usecs_low = coal->rx_coalesce_usecs_low; + priv->pkt_rate_high = coal->pkt_rate_high; + priv->rx_usecs_high = coal->rx_coalesce_usecs_high; + priv->sample_interval = coal->rate_sample_interval; + priv->adaptive_rx_coal = coal->use_adaptive_rx_coalesce; + priv->tx_work_limit = coal->tx_max_coalesced_frames_irq; + + return mlx4_en_moderation_update(priv); +} + +static int mlx4_en_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + u8 tx_pause, tx_ppp, rx_pause, rx_ppp; + int err; + + if (pause->autoneg) + return -EINVAL; + + tx_pause = !!(pause->tx_pause); + rx_pause = !!(pause->rx_pause); + rx_ppp = priv->prof->rx_ppp && !(tx_pause || rx_pause); + tx_ppp = priv->prof->tx_ppp && !(tx_pause || rx_pause); + + err = mlx4_SET_PORT_general(mdev->dev, priv->port, + priv->rx_skb_size + ETH_FCS_LEN, + tx_pause, tx_ppp, rx_pause, rx_ppp); + if (err) { + en_err(priv, "Failed setting pause params, err = %d\n", err); + return err; + } + + mlx4_en_update_pfc_stats_bitmap(mdev->dev, &priv->stats_bitmap, + rx_ppp, rx_pause, tx_ppp, tx_pause); + + priv->prof->tx_pause = tx_pause; + priv->prof->rx_pause = rx_pause; + priv->prof->tx_ppp = tx_ppp; + priv->prof->rx_ppp = rx_ppp; + + return err; +} + +static void mlx4_en_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + pause->tx_pause = priv->prof->tx_pause; + pause->rx_pause = priv->prof->rx_pause; +} + +static int mlx4_en_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *param) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_port_profile new_prof; + struct mlx4_en_priv *tmp; + u32 rx_size, tx_size; + int port_up = 0; + int err = 0; + + if (param->rx_jumbo_pending || param->rx_mini_pending) + return -EINVAL; + + if (param->rx_pending < MLX4_EN_MIN_RX_SIZE) { + en_warn(priv, "%s: rx_pending (%d) < min (%d)\n", + __func__, param->rx_pending, + MLX4_EN_MIN_RX_SIZE); + return -EINVAL; + } + if (param->tx_pending < MLX4_EN_MIN_TX_SIZE) { + en_warn(priv, "%s: tx_pending (%d) < min (%lu)\n", + __func__, param->tx_pending, + MLX4_EN_MIN_TX_SIZE); + return -EINVAL; + } + + rx_size = roundup_pow_of_two(param->rx_pending); + tx_size = roundup_pow_of_two(param->tx_pending); + + if (rx_size == (priv->port_up ? 
priv->rx_ring[0]->actual_size : + priv->rx_ring[0]->size) && + tx_size == priv->tx_ring[TX][0]->size) + return 0; + + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + mutex_lock(&mdev->state_lock); + memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile)); + new_prof.tx_ring_size = tx_size; + new_prof.rx_ring_size = rx_size; + err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true); + if (err) + goto out; + + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + mlx4_en_safe_replace_resources(priv, tmp); + + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port\n"); + } + + err = mlx4_en_moderation_update(priv); +out: + kfree(tmp); + mutex_unlock(&mdev->state_lock); + return err; +} + +static void mlx4_en_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *param) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + memset(param, 0, sizeof(*param)); + param->rx_max_pending = MLX4_EN_MAX_RX_SIZE; + param->tx_max_pending = MLX4_EN_MAX_TX_SIZE; + param->rx_pending = priv->port_up ? + priv->rx_ring[0]->actual_size : priv->rx_ring[0]->size; + param->tx_pending = priv->tx_ring[TX][0]->size; +} + +static u32 mlx4_en_get_rxfh_indir_size(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + return rounddown_pow_of_two(priv->rx_ring_num); +} + +static u32 mlx4_en_get_rxfh_key_size(struct net_device *netdev) +{ + return MLX4_EN_RSS_KEY_SIZE; +} + +static int mlx4_en_check_rxfh_func(struct net_device *dev, u8 hfunc) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + /* check if requested function is supported by the device */ + if (hfunc == ETH_RSS_HASH_TOP) { + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) + return -EINVAL; + if (!(dev->features & NETIF_F_RXHASH)) + en_warn(priv, "Toeplitz hash function should be used in conjunction with RX hashing for optimal performance\n"); + return 0; + } else if (hfunc == ETH_RSS_HASH_XOR) { + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_XOR)) + return -EINVAL; + if (dev->features & NETIF_F_RXHASH) + en_warn(priv, "Enabling both XOR Hash function and RX Hashing can limit RPS functionality\n"); + return 0; + } + + return -EINVAL; +} + +static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key, + u8 *hfunc) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + u32 n = mlx4_en_get_rxfh_indir_size(dev); + u32 i, rss_rings; + int err = 0; + + rss_rings = priv->prof->rss_rings ?: n; + rss_rings = rounddown_pow_of_two(rss_rings); + + for (i = 0; i < n; i++) { + if (!ring_index) + break; + ring_index[i] = i % rss_rings; + } + if (key) + memcpy(key, priv->rss_key, MLX4_EN_RSS_KEY_SIZE); + if (hfunc) + *hfunc = priv->rss_hash_fn; + return err; +} + +static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index, + const u8 *key, const u8 hfunc) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + u32 n = mlx4_en_get_rxfh_indir_size(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int port_up = 0; + int err = 0; + int i; + int rss_rings = 0; + + /* Calculate RSS table size and make sure flows are spread evenly + * between rings + */ + for (i = 0; i < n; i++) { + if (!ring_index) + break; + if (i > 0 && !ring_index[i] && !rss_rings) + rss_rings = i; + + if (ring_index[i] != (i % (rss_rings ?: n))) + return -EINVAL; + } + + if (!rss_rings) + rss_rings = n; + + /* RSS table size must be an order of 2 */ + if (!is_power_of_2(rss_rings)) + return -EINVAL; + + if (hfunc 
!= ETH_RSS_HASH_NO_CHANGE) { + err = mlx4_en_check_rxfh_func(dev, hfunc); + if (err) + return err; + } + + mutex_lock(&mdev->state_lock); + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + if (ring_index) + priv->prof->rss_rings = rss_rings; + if (key) + memcpy(priv->rss_key, key, MLX4_EN_RSS_KEY_SIZE); + if (hfunc != ETH_RSS_HASH_NO_CHANGE) + priv->rss_hash_fn = hfunc; + + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port\n"); + } + + mutex_unlock(&mdev->state_lock); + return err; +} + +#define all_zeros_or_all_ones(field) \ + ((field) == 0 || (field) == (__force typeof(field))-1) + +static int mlx4_en_validate_flow(struct net_device *dev, + struct ethtool_rxnfc *cmd) +{ + struct ethtool_usrip4_spec *l3_mask; + struct ethtool_tcpip4_spec *l4_mask; + struct ethhdr *eth_mask; + + if (cmd->fs.location >= MAX_NUM_OF_FS_RULES) + return -EINVAL; + + if (cmd->fs.flow_type & FLOW_MAC_EXT) { + /* dest mac mask must be ff:ff:ff:ff:ff:ff */ + if (!is_broadcast_ether_addr(cmd->fs.m_ext.h_dest)) + return -EINVAL; + } + + switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + if (cmd->fs.m_u.tcp_ip4_spec.tos) + return -EINVAL; + l4_mask = &cmd->fs.m_u.tcp_ip4_spec; + /* don't allow mask which isn't all 0 or 1 */ + if (!all_zeros_or_all_ones(l4_mask->ip4src) || + !all_zeros_or_all_ones(l4_mask->ip4dst) || + !all_zeros_or_all_ones(l4_mask->psrc) || + !all_zeros_or_all_ones(l4_mask->pdst)) + return -EINVAL; + break; + case IP_USER_FLOW: + l3_mask = &cmd->fs.m_u.usr_ip4_spec; + if (l3_mask->l4_4_bytes || l3_mask->tos || l3_mask->proto || + cmd->fs.h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4 || + (!l3_mask->ip4src && !l3_mask->ip4dst) || + !all_zeros_or_all_ones(l3_mask->ip4src) || + !all_zeros_or_all_ones(l3_mask->ip4dst)) + return -EINVAL; + break; + case ETHER_FLOW: + eth_mask = &cmd->fs.m_u.ether_spec; + /* source mac mask must not be set */ + if (!is_zero_ether_addr(eth_mask->h_source)) + return -EINVAL; + + /* dest mac mask must be ff:ff:ff:ff:ff:ff */ + if (!is_broadcast_ether_addr(eth_mask->h_dest)) + return -EINVAL; + + if (!all_zeros_or_all_ones(eth_mask->h_proto)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if ((cmd->fs.flow_type & FLOW_EXT)) { + if (cmd->fs.m_ext.vlan_etype || + !((cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK)) == + 0 || + (cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK)) == + cpu_to_be16(VLAN_VID_MASK))) + return -EINVAL; + + if (cmd->fs.m_ext.vlan_tci) { + if (be16_to_cpu(cmd->fs.h_ext.vlan_tci) >= VLAN_N_VID) + return -EINVAL; + + } + } + + return 0; +} + +static int mlx4_en_ethtool_add_mac_rule(struct ethtool_rxnfc *cmd, + struct list_head *rule_list_h, + struct mlx4_spec_list *spec_l2, + unsigned char *mac) +{ + int err = 0; + __be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16); + + spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN); + memcpy(spec_l2->eth.dst_mac, mac, ETH_ALEN); + + if ((cmd->fs.flow_type & FLOW_EXT) && + (cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK))) { + spec_l2->eth.vlan_id = cmd->fs.h_ext.vlan_tci; + spec_l2->eth.vlan_id_msk = cpu_to_be16(VLAN_VID_MASK); + } + + list_add_tail(&spec_l2->list, rule_list_h); + + return err; +} + +static int mlx4_en_ethtool_add_mac_rule_by_ipv4(struct mlx4_en_priv *priv, + struct ethtool_rxnfc *cmd, + struct list_head *rule_list_h, + struct mlx4_spec_list *spec_l2, + __be32 ipv4_dst) +{ +#ifdef CONFIG_INET + unsigned char 
mac[ETH_ALEN]; + + if (!ipv4_is_multicast(ipv4_dst)) { + if (cmd->fs.flow_type & FLOW_MAC_EXT) + memcpy(&mac, cmd->fs.h_ext.h_dest, ETH_ALEN); + else + memcpy(&mac, priv->dev->dev_addr, ETH_ALEN); + } else { + ip_eth_mc_map(ipv4_dst, mac); + } + + return mlx4_en_ethtool_add_mac_rule(cmd, rule_list_h, spec_l2, &mac[0]); +#else + return -EINVAL; +#endif +} + +static int add_ip_rule(struct mlx4_en_priv *priv, + struct ethtool_rxnfc *cmd, + struct list_head *list_h) +{ + int err; + struct mlx4_spec_list *spec_l2 = NULL; + struct mlx4_spec_list *spec_l3 = NULL; + struct ethtool_usrip4_spec *l3_mask = &cmd->fs.m_u.usr_ip4_spec; + + spec_l3 = kzalloc(sizeof(*spec_l3), GFP_KERNEL); + spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL); + if (!spec_l2 || !spec_l3) { + err = -ENOMEM; + goto free_spec; + } + + err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h, spec_l2, + cmd->fs.h_u. + usr_ip4_spec.ip4dst); + if (err) + goto free_spec; + spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4; + spec_l3->ipv4.src_ip = cmd->fs.h_u.usr_ip4_spec.ip4src; + if (l3_mask->ip4src) + spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK; + spec_l3->ipv4.dst_ip = cmd->fs.h_u.usr_ip4_spec.ip4dst; + if (l3_mask->ip4dst) + spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK; + list_add_tail(&spec_l3->list, list_h); + + return 0; + +free_spec: + kfree(spec_l2); + kfree(spec_l3); + return err; +} + +static int add_tcp_udp_rule(struct mlx4_en_priv *priv, + struct ethtool_rxnfc *cmd, + struct list_head *list_h, int proto) +{ + int err; + struct mlx4_spec_list *spec_l2 = NULL; + struct mlx4_spec_list *spec_l3 = NULL; + struct mlx4_spec_list *spec_l4 = NULL; + struct ethtool_tcpip4_spec *l4_mask = &cmd->fs.m_u.tcp_ip4_spec; + + spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL); + spec_l3 = kzalloc(sizeof(*spec_l3), GFP_KERNEL); + spec_l4 = kzalloc(sizeof(*spec_l4), GFP_KERNEL); + if (!spec_l2 || !spec_l3 || !spec_l4) { + err = -ENOMEM; + goto free_spec; + } + + spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4; + + if (proto == TCP_V4_FLOW) { + err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h, + spec_l2, + cmd->fs.h_u. + tcp_ip4_spec.ip4dst); + if (err) + goto free_spec; + spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP; + spec_l3->ipv4.src_ip = cmd->fs.h_u.tcp_ip4_spec.ip4src; + spec_l3->ipv4.dst_ip = cmd->fs.h_u.tcp_ip4_spec.ip4dst; + spec_l4->tcp_udp.src_port = cmd->fs.h_u.tcp_ip4_spec.psrc; + spec_l4->tcp_udp.dst_port = cmd->fs.h_u.tcp_ip4_spec.pdst; + } else { + err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h, + spec_l2, + cmd->fs.h_u. 
+ udp_ip4_spec.ip4dst); + if (err) + goto free_spec; + spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP; + spec_l3->ipv4.src_ip = cmd->fs.h_u.udp_ip4_spec.ip4src; + spec_l3->ipv4.dst_ip = cmd->fs.h_u.udp_ip4_spec.ip4dst; + spec_l4->tcp_udp.src_port = cmd->fs.h_u.udp_ip4_spec.psrc; + spec_l4->tcp_udp.dst_port = cmd->fs.h_u.udp_ip4_spec.pdst; + } + + if (l4_mask->ip4src) + spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK; + if (l4_mask->ip4dst) + spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK; + + if (l4_mask->psrc) + spec_l4->tcp_udp.src_port_msk = EN_ETHTOOL_SHORT_MASK; + if (l4_mask->pdst) + spec_l4->tcp_udp.dst_port_msk = EN_ETHTOOL_SHORT_MASK; + + list_add_tail(&spec_l3->list, list_h); + list_add_tail(&spec_l4->list, list_h); + + return 0; + +free_spec: + kfree(spec_l2); + kfree(spec_l3); + kfree(spec_l4); + return err; +} + +static int mlx4_en_ethtool_to_net_trans_rule(struct net_device *dev, + struct ethtool_rxnfc *cmd, + struct list_head *rule_list_h) +{ + int err; + struct ethhdr *eth_spec; + struct mlx4_spec_list *spec_l2; + struct mlx4_en_priv *priv = netdev_priv(dev); + + err = mlx4_en_validate_flow(dev, cmd); + if (err) + return err; + + switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { + case ETHER_FLOW: + spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL); + if (!spec_l2) + return -ENOMEM; + + eth_spec = &cmd->fs.h_u.ether_spec; + mlx4_en_ethtool_add_mac_rule(cmd, rule_list_h, spec_l2, + &eth_spec->h_dest[0]); + spec_l2->eth.ether_type = eth_spec->h_proto; + if (eth_spec->h_proto) + spec_l2->eth.ether_type_enable = 1; + break; + case IP_USER_FLOW: + err = add_ip_rule(priv, cmd, rule_list_h); + break; + case TCP_V4_FLOW: + err = add_tcp_udp_rule(priv, cmd, rule_list_h, TCP_V4_FLOW); + break; + case UDP_V4_FLOW: + err = add_tcp_udp_rule(priv, cmd, rule_list_h, UDP_V4_FLOW); + break; + } + + return err; +} + +static int mlx4_en_flow_replace(struct net_device *dev, + struct ethtool_rxnfc *cmd) +{ + int err; + struct mlx4_en_priv *priv = netdev_priv(dev); + struct ethtool_flow_id *loc_rule; + struct mlx4_spec_list *spec, *tmp_spec; + u32 qpn; + u64 reg_id; + + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .allow_loopback = 1, + .promisc_mode = MLX4_FS_REGULAR, + }; + + rule.port = priv->port; + rule.priority = MLX4_DOMAIN_ETHTOOL | cmd->fs.location; + INIT_LIST_HEAD(&rule.list); + + /* Allow direct QP attaches if the EN_ETHTOOL_QP_ATTACH flag is set */ + if (cmd->fs.ring_cookie == RX_CLS_FLOW_DISC) + qpn = priv->drop_qp.qpn; + else if (cmd->fs.ring_cookie & EN_ETHTOOL_QP_ATTACH) { + qpn = cmd->fs.ring_cookie & (EN_ETHTOOL_QP_ATTACH - 1); + } else { + if (cmd->fs.ring_cookie >= priv->rx_ring_num) { + en_warn(priv, "rxnfc: RX ring (%llu) doesn't exist\n", + cmd->fs.ring_cookie); + return -EINVAL; + } + qpn = priv->rss_map.qps[cmd->fs.ring_cookie].qpn; + if (!qpn) { + en_warn(priv, "rxnfc: RX ring (%llu) is inactive\n", + cmd->fs.ring_cookie); + return -EINVAL; + } + } + rule.qpn = qpn; + err = mlx4_en_ethtool_to_net_trans_rule(dev, cmd, &rule.list); + if (err) + goto out_free_list; + + loc_rule = &priv->ethtool_rules[cmd->fs.location]; + if (loc_rule->id) { + err = mlx4_flow_detach(priv->mdev->dev, loc_rule->id); + if (err) { + en_err(priv, "Fail to detach network rule at location %d. 
registration id = %llx\n", + cmd->fs.location, loc_rule->id); + goto out_free_list; + } + loc_rule->id = 0; + memset(&loc_rule->flow_spec, 0, + sizeof(struct ethtool_rx_flow_spec)); + list_del(&loc_rule->list); + } + err = mlx4_flow_attach(priv->mdev->dev, &rule, &reg_id); + if (err) { + en_err(priv, "Fail to attach network rule at location %d\n", + cmd->fs.location); + goto out_free_list; + } + loc_rule->id = reg_id; + memcpy(&loc_rule->flow_spec, &cmd->fs, + sizeof(struct ethtool_rx_flow_spec)); + list_add_tail(&loc_rule->list, &priv->ethtool_list); + +out_free_list: + list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) { + list_del(&spec->list); + kfree(spec); + } + return err; +} + +static int mlx4_en_flow_detach(struct net_device *dev, + struct ethtool_rxnfc *cmd) +{ + int err = 0; + struct ethtool_flow_id *rule; + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (cmd->fs.location >= MAX_NUM_OF_FS_RULES) + return -EINVAL; + + rule = &priv->ethtool_rules[cmd->fs.location]; + if (!rule->id) { + err = -ENOENT; + goto out; + } + + err = mlx4_flow_detach(priv->mdev->dev, rule->id); + if (err) { + en_err(priv, "Fail to detach network rule at location %d. registration id = 0x%llx\n", + cmd->fs.location, rule->id); + goto out; + } + rule->id = 0; + memset(&rule->flow_spec, 0, sizeof(struct ethtool_rx_flow_spec)); + list_del(&rule->list); +out: + return err; + +} + +static int mlx4_en_get_flow(struct net_device *dev, struct ethtool_rxnfc *cmd, + int loc) +{ + int err = 0; + struct ethtool_flow_id *rule; + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (loc < 0 || loc >= MAX_NUM_OF_FS_RULES) + return -EINVAL; + + rule = &priv->ethtool_rules[loc]; + if (rule->id) + memcpy(&cmd->fs, &rule->flow_spec, + sizeof(struct ethtool_rx_flow_spec)); + else + err = -ENOENT; + + return err; +} + +static int mlx4_en_get_num_flows(struct mlx4_en_priv *priv) +{ + + int i, res = 0; + for (i = 0; i < MAX_NUM_OF_FS_RULES; i++) { + if (priv->ethtool_rules[i].id) + res++; + } + return res; + +} + +static int mlx4_en_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, + u32 *rule_locs) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + int i = 0, priority = 0; + + if ((cmd->cmd == ETHTOOL_GRXCLSRLCNT || + cmd->cmd == ETHTOOL_GRXCLSRULE || + cmd->cmd == ETHTOOL_GRXCLSRLALL) && + (mdev->dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED || !priv->port_up)) + return -EINVAL; + + switch (cmd->cmd) { + case ETHTOOL_GRXRINGS: + cmd->data = priv->rx_ring_num; + break; + case ETHTOOL_GRXCLSRLCNT: + cmd->rule_cnt = mlx4_en_get_num_flows(priv); + break; + case ETHTOOL_GRXCLSRULE: + err = mlx4_en_get_flow(dev, cmd, cmd->fs.location); + break; + case ETHTOOL_GRXCLSRLALL: + while ((!err || err == -ENOENT) && priority < cmd->rule_cnt) { + err = mlx4_en_get_flow(dev, cmd, i); + if (!err) + rule_locs[priority++] = i; + i++; + } + err = 0; + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static int mlx4_en_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) +{ + int err = 0; + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + if (mdev->dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED || !priv->port_up) + return -EINVAL; + + switch (cmd->cmd) { + case ETHTOOL_SRXCLSRLINS: + err = mlx4_en_flow_replace(dev, cmd); + break; + case ETHTOOL_SRXCLSRLDEL: + err = mlx4_en_flow_detach(dev, cmd); + break; + default: + en_warn(priv, "Unsupported ethtool command. 
(%d)\n", cmd->cmd); + return -EINVAL; + } + + return err; +} + +static int mlx4_en_get_max_num_rx_rings(struct net_device *dev) +{ + return min_t(int, num_online_cpus(), MAX_RX_RINGS); +} + +static void mlx4_en_get_channels(struct net_device *dev, + struct ethtool_channels *channel) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + channel->max_rx = mlx4_en_get_max_num_rx_rings(dev); + channel->max_tx = priv->mdev->profile.max_num_tx_rings_p_up; + + channel->rx_count = priv->rx_ring_num; + channel->tx_count = priv->tx_ring_num[TX] / + priv->prof->num_up; +} + +static int mlx4_en_set_channels(struct net_device *dev, + struct ethtool_channels *channel) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_port_profile new_prof; + struct mlx4_en_priv *tmp; + int port_up = 0; + int xdp_count; + int err = 0; + u8 up; + + if (!channel->tx_count || !channel->rx_count) + return -EINVAL; + + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + mutex_lock(&mdev->state_lock); + xdp_count = priv->tx_ring_num[TX_XDP] ? channel->rx_count : 0; + if (channel->tx_count * priv->prof->num_up + xdp_count > + priv->mdev->profile.max_num_tx_rings_p_up * priv->prof->num_up) { + err = -EINVAL; + en_err(priv, + "Total number of TX and XDP rings (%d) exceeds the maximum supported (%d)\n", + channel->tx_count * priv->prof->num_up + xdp_count, + MAX_TX_RINGS); + goto out; + } + + memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile)); + new_prof.num_tx_rings_p_up = channel->tx_count; + new_prof.tx_ring_num[TX] = channel->tx_count * priv->prof->num_up; + new_prof.tx_ring_num[TX_XDP] = xdp_count; + new_prof.rx_ring_num = channel->rx_count; + + err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true); + if (err) + goto out; + + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + mlx4_en_safe_replace_resources(priv, tmp); + + netif_set_real_num_rx_queues(dev, priv->rx_ring_num); + + up = (priv->prof->num_up == MLX4_EN_NUM_UP_LOW) ? 
+ 0 : priv->prof->num_up; + mlx4_en_setup_tc(dev, up); + + en_warn(priv, "Using %d TX rings\n", priv->tx_ring_num[TX]); + en_warn(priv, "Using %d RX rings\n", priv->rx_ring_num); + + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port\n"); + } + + err = mlx4_en_moderation_update(priv); +out: + mutex_unlock(&mdev->state_lock); + kfree(tmp); + return err; +} + +static int mlx4_en_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *info) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int ret; + + ret = ethtool_op_get_ts_info(dev, info); + if (ret) + return ret; + + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) { + info->so_timestamping |= + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + info->tx_types = + (1 << HWTSTAMP_TX_OFF) | + (1 << HWTSTAMP_TX_ON); + + info->rx_filters = + (1 << HWTSTAMP_FILTER_NONE) | + (1 << HWTSTAMP_FILTER_ALL); + + if (mdev->ptp_clock) + info->phc_index = ptp_clock_index(mdev->ptp_clock); + } + + return ret; +} + +static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + bool bf_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_BLUEFLAME); + bool bf_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_BLUEFLAME); + bool phv_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_PHV); + bool phv_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_PHV); + int i; + int ret = 0; + + if (bf_enabled_new != bf_enabled_old) { + int t; + + if (bf_enabled_new) { + bool bf_supported = true; + + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) + for (i = 0; i < priv->tx_ring_num[t]; i++) + bf_supported &= + priv->tx_ring[t][i]->bf_alloced; + + if (!bf_supported) { + en_err(priv, "BlueFlame is not supported\n"); + return -EINVAL; + } + + priv->pflags |= MLX4_EN_PRIV_FLAGS_BLUEFLAME; + } else { + priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME; + } + + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) + for (i = 0; i < priv->tx_ring_num[t]; i++) + priv->tx_ring[t][i]->bf_enabled = + bf_enabled_new; + + en_info(priv, "BlueFlame %s\n", + bf_enabled_new ? "Enabled" : "Disabled"); + } + + if (phv_enabled_new != phv_enabled_old) { + ret = set_phv_bit(mdev->dev, priv->port, (int)phv_enabled_new); + if (ret) + return ret; + else if (phv_enabled_new) + priv->pflags |= MLX4_EN_PRIV_FLAGS_PHV; + else + priv->pflags &= ~MLX4_EN_PRIV_FLAGS_PHV; + en_info(priv, "PHV bit %s\n", + phv_enabled_new ? 
"Enabled" : "Disabled"); + } + return 0; +} + +static u32 mlx4_en_get_priv_flags(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + return priv->pflags; +} + +static int mlx4_en_get_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + void *data) +{ + const struct mlx4_en_priv *priv = netdev_priv(dev); + int ret = 0; + + switch (tuna->id) { + case ETHTOOL_TX_COPYBREAK: + *(u32 *)data = priv->prof->inline_thold; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mlx4_en_set_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int val, ret = 0; + + switch (tuna->id) { + case ETHTOOL_TX_COPYBREAK: + val = *(u32 *)data; + if (val < MIN_PKT_LEN || val > MAX_INLINE) + ret = -EINVAL; + else + priv->prof->inline_thold = val; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int mlx4_en_get_module_info(struct net_device *dev, + struct ethtool_modinfo *modinfo) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int ret; + u8 data[4]; + + /* Read first 2 bytes to get Module & REV ID */ + ret = mlx4_get_module_info(mdev->dev, priv->port, + 0/*offset*/, 2/*size*/, data); + if (ret < 2) + return -EIO; + + switch (data[0] /* identifier */) { + case MLX4_MODULE_ID_QSFP: + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; + break; + case MLX4_MODULE_ID_QSFP_PLUS: + if (data[1] >= 0x3) { /* revision id */ + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; + } + break; + case MLX4_MODULE_ID_QSFP28: + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN; + break; + case MLX4_MODULE_ID_SFP: + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlx4_en_get_module_eeprom(struct net_device *dev, + struct ethtool_eeprom *ee, + u8 *data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int offset = ee->offset; + int i = 0, ret; + + if (ee->len == 0) + return -EINVAL; + + memset(data, 0, ee->len); + + while (i < ee->len) { + en_dbg(DRV, priv, + "mlx4_get_module_info i(%d) offset(%d) len(%d)\n", + i, offset, ee->len - i); + + ret = mlx4_get_module_info(mdev->dev, priv->port, + offset, ee->len - i, data + i); + + if (!ret) /* Done reading */ + return 0; + + if (ret < 0) { + en_err(priv, + "mlx4_get_module_info i(%d) offset(%d) bytes_to_read(%d) - FAILED (0x%x)\n", + i, offset, ee->len - i, ret); + return 0; + } + + i += ret; + offset += ret; + } + return 0; +} + +static int mlx4_en_set_phys_id(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + int err; + u16 beacon_duration; + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_BEACON)) + return -EOPNOTSUPP; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + beacon_duration = PORT_BEACON_MAX_LIMIT; + break; + case ETHTOOL_ID_INACTIVE: + beacon_duration = 0; + break; + default: + return -EOPNOTSUPP; + } + + err = mlx4_SET_PORT_BEACON(mdev->dev, priv->port, beacon_duration); + return err; +} + +const struct ethtool_ops mlx4_en_ethtool_ops = { + .get_drvinfo = mlx4_en_get_drvinfo, 
+ .get_link_ksettings = mlx4_en_get_link_ksettings, + .set_link_ksettings = mlx4_en_set_link_ksettings, + .get_link = ethtool_op_get_link, + .get_strings = mlx4_en_get_strings, + .get_sset_count = mlx4_en_get_sset_count, + .get_ethtool_stats = mlx4_en_get_ethtool_stats, + .self_test = mlx4_en_self_test, + .set_phys_id = mlx4_en_set_phys_id, + .get_wol = mlx4_en_get_wol, + .set_wol = mlx4_en_set_wol, + .get_msglevel = mlx4_en_get_msglevel, + .set_msglevel = mlx4_en_set_msglevel, + .get_coalesce = mlx4_en_get_coalesce, + .set_coalesce = mlx4_en_set_coalesce, + .get_pauseparam = mlx4_en_get_pauseparam, + .set_pauseparam = mlx4_en_set_pauseparam, + .get_ringparam = mlx4_en_get_ringparam, + .set_ringparam = mlx4_en_set_ringparam, + .get_rxnfc = mlx4_en_get_rxnfc, + .set_rxnfc = mlx4_en_set_rxnfc, + .get_rxfh_indir_size = mlx4_en_get_rxfh_indir_size, + .get_rxfh_key_size = mlx4_en_get_rxfh_key_size, + .get_rxfh = mlx4_en_get_rxfh, + .set_rxfh = mlx4_en_set_rxfh, + .get_channels = mlx4_en_get_channels, + .set_channels = mlx4_en_set_channels, + .get_ts_info = mlx4_en_get_ts_info, + .set_priv_flags = mlx4_en_set_priv_flags, + .get_priv_flags = mlx4_en_get_priv_flags, + .get_tunable = mlx4_en_get_tunable, + .set_tunable = mlx4_en_set_tunable, + .get_module_info = mlx4_en_get_module_info, + .get_module_eeprom = mlx4_en_get_module_eeprom +}; + + + + + diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c new file mode 100644 index 0000000..d25e16d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "mlx4_en.h" + +MODULE_AUTHOR("Liran Liss, Yevgeny Petrilin"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA Ethernet driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +static const char mlx4_en_version[] = + DRV_NAME ": Mellanox ConnectX HCA Ethernet driver v" + DRV_VERSION "\n"; + +#define MLX4_EN_PARM_INT(X, def_val, desc) \ + static unsigned int X = def_val;\ + module_param(X , uint, 0444); \ + MODULE_PARM_DESC(X, desc); + + +/* + * Device scope module parameters + */ + +/* Enable RSS UDP traffic */ +MLX4_EN_PARM_INT(udp_rss, 1, + "Enable RSS for incoming UDP traffic or disabled (0)"); + +/* Priority pausing */ +MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]." + " Per priority bit mask"); +MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]." + " Per priority bit mask"); + +MLX4_EN_PARM_INT(inline_thold, MAX_INLINE, + "Threshold for using inline data (range: 17-104, default: 104)"); + +#define MAX_PFC_TX 0xff +#define MAX_PFC_RX 0xff + +void en_print(const char *level, const struct mlx4_en_priv *priv, + const char *format, ...) +{ + va_list args; + struct va_format vaf; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + if (priv->registered) + printk("%s%s: %s: %pV", + level, DRV_NAME, priv->dev->name, &vaf); + else + printk("%s%s: %s: Port %d: %pV", + level, DRV_NAME, dev_name(&priv->mdev->pdev->dev), + priv->port, &vaf); + va_end(args); +} + +void mlx4_en_update_loopback_state(struct net_device *dev, + netdev_features_t features) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (features & NETIF_F_LOOPBACK) + priv->ctrl_flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + else + priv->ctrl_flags &= cpu_to_be32(~MLX4_WQE_CTRL_FORCE_LOOPBACK); + + priv->flags &= ~(MLX4_EN_FLAG_RX_FILTER_NEEDED| + MLX4_EN_FLAG_ENABLE_HW_LOOPBACK); + + /* Drop the packet if SRIOV is not enabled + * and not performing the selftest or flb disabled + */ + if (mlx4_is_mfunc(priv->mdev->dev) && + !(features & NETIF_F_LOOPBACK) && !priv->validate_loopback) + priv->flags |= MLX4_EN_FLAG_RX_FILTER_NEEDED; + + /* Set dmac in Tx WQE if we are in SRIOV mode or if loopback selftest + * is requested + */ + if (mlx4_is_mfunc(priv->mdev->dev) || priv->validate_loopback) + priv->flags |= MLX4_EN_FLAG_ENABLE_HW_LOOPBACK; + + mutex_lock(&priv->mdev->state_lock); + if ((priv->mdev->dev->caps.flags2 & + MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB) && + priv->rss_map.indir_qp && priv->rss_map.indir_qp->qpn) { + int i; + int err = 0; + int loopback = !!(features & NETIF_F_LOOPBACK); + + for (i = 0; i < priv->rx_ring_num; i++) { + int ret; + + ret = mlx4_en_change_mcast_lb(priv, + &priv->rss_map.qps[i], + loopback); + if (!err) + err = ret; + } + if (err) + mlx4_warn(priv->mdev, "failed to change mcast loopback\n"); + } + mutex_unlock(&priv->mdev->state_lock); +} + +static void mlx4_en_get_profile(struct mlx4_en_dev *mdev) +{ + struct mlx4_en_profile *params = &mdev->profile; + int i; + + params->udp_rss = udp_rss; + params->max_num_tx_rings_p_up = mlx4_low_memory_profile() ? 
+ MLX4_EN_MIN_TX_RING_P_UP : + min_t(int, num_online_cpus(), MLX4_EN_MAX_TX_RING_P_UP); + + if (params->udp_rss && !(mdev->dev->caps.flags + & MLX4_DEV_CAP_FLAG_UDP_RSS)) { + mlx4_warn(mdev, "UDP RSS is not supported on this device\n"); + params->udp_rss = 0; + } + for (i = 1; i <= MLX4_MAX_PORTS; i++) { + params->prof[i].rx_pause = !(pfcrx || pfctx); + params->prof[i].rx_ppp = pfcrx; + params->prof[i].tx_pause = !(pfcrx || pfctx); + params->prof[i].tx_ppp = pfctx; + params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE; + params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE; + params->prof[i].num_up = MLX4_EN_NUM_UP_LOW; + params->prof[i].num_tx_rings_p_up = params->max_num_tx_rings_p_up; + params->prof[i].tx_ring_num[TX] = params->max_num_tx_rings_p_up * + params->prof[i].num_up; + params->prof[i].rss_rings = 0; + params->prof[i].inline_thold = inline_thold; + } +} + +static void *mlx4_en_get_netdev(struct mlx4_dev *dev, void *ctx, u8 port) +{ + struct mlx4_en_dev *endev = ctx; + + return endev->pndev[port]; +} + +static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, + enum mlx4_dev_event event, unsigned long port) +{ + struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr; + struct mlx4_en_priv *priv; + + switch (event) { + case MLX4_DEV_EVENT_PORT_UP: + case MLX4_DEV_EVENT_PORT_DOWN: + if (!mdev->pndev[port]) + return; + priv = netdev_priv(mdev->pndev[port]); + /* To prevent races, we poll the link state in a separate + task rather than changing it here */ + priv->link_state = event; + queue_work(mdev->workqueue, &priv->linkstate_task); + break; + + case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: + mlx4_err(mdev, "Internal error detected, restarting device\n"); + break; + + case MLX4_DEV_EVENT_SLAVE_INIT: + case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + break; + default: + if (port < 1 || port > dev->caps.num_ports || + !mdev->pndev[port]) + return; + mlx4_warn(mdev, "Unhandled event %d for port %d\n", event, + (int) port); + } +} + +static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr) +{ + struct mlx4_en_dev *mdev = endev_ptr; + int i; + + mutex_lock(&mdev->state_lock); + mdev->device_up = false; + mutex_unlock(&mdev->state_lock); + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + if (mdev->pndev[i]) + mlx4_en_destroy_netdev(mdev->pndev[i]); + + flush_workqueue(mdev->workqueue); + destroy_workqueue(mdev->workqueue); + (void) mlx4_mr_free(dev, &mdev->mr); + iounmap(mdev->uar_map); + mlx4_uar_free(dev, &mdev->priv_uar); + mlx4_pd_free(dev, mdev->priv_pdn); + if (mdev->nb.notifier_call) + unregister_netdevice_notifier(&mdev->nb); + kfree(mdev); +} + +static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx) +{ + int i; + struct mlx4_en_dev *mdev = ctx; + + /* Create a netdev for each port */ + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { + mlx4_info(mdev, "Activating port:%d\n", i); + if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) + mdev->pndev[i] = NULL; + } + + /* register notifier */ + mdev->nb.notifier_call = mlx4_en_netdev_event; + if (register_netdevice_notifier(&mdev->nb)) { + mdev->nb.notifier_call = NULL; + mlx4_err(mdev, "Failed to create notifier\n"); + } +} + +static void *mlx4_en_add(struct mlx4_dev *dev) +{ + struct mlx4_en_dev *mdev; + int i; + + printk_once(KERN_INFO "%s", mlx4_en_version); + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) + goto err_free_res; + + if (mlx4_pd_alloc(dev, &mdev->priv_pdn)) + goto err_free_dev; + + if (mlx4_uar_alloc(dev, &mdev->priv_uar)) + goto err_pd; + + mdev->uar_map = ioremap((phys_addr_t) 
mdev->priv_uar.pfn << PAGE_SHIFT, + PAGE_SIZE); + if (!mdev->uar_map) + goto err_uar; + spin_lock_init(&mdev->uar_lock); + + mdev->dev = dev; + mdev->dma_device = &dev->persist->pdev->dev; + mdev->pdev = dev->persist->pdev; + mdev->device_up = false; + + mdev->LSO_support = !!(dev->caps.flags & (1 << 15)); + if (!mdev->LSO_support) + mlx4_warn(mdev, "LSO not supported, please upgrade to later FW version to enable LSO\n"); + + if (mlx4_mr_alloc(mdev->dev, mdev->priv_pdn, 0, ~0ull, + MLX4_PERM_LOCAL_WRITE | MLX4_PERM_LOCAL_READ, + 0, 0, &mdev->mr)) { + mlx4_err(mdev, "Failed allocating memory region\n"); + goto err_map; + } + if (mlx4_mr_enable(mdev->dev, &mdev->mr)) { + mlx4_err(mdev, "Failed enabling memory region\n"); + goto err_mr; + } + + /* Build device profile according to supplied module parameters */ + mlx4_en_get_profile(mdev); + + /* Configure which ports to start according to module parameters */ + mdev->port_cnt = 0; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + mdev->port_cnt++; + + /* Set default number of RX rings*/ + mlx4_en_set_num_rx_rings(mdev); + + /* Create our own workqueue for reset/multicast tasks + * Note: we cannot use the shared workqueue because of deadlocks caused + * by the rtnl lock */ + mdev->workqueue = create_singlethread_workqueue("mlx4_en"); + if (!mdev->workqueue) + goto err_mr; + + /* At this stage all non-port specific tasks are complete: + * mark the card state as up */ + mutex_init(&mdev->state_lock); + mdev->device_up = true; + + return mdev; + +err_mr: + (void) mlx4_mr_free(dev, &mdev->mr); +err_map: + if (mdev->uar_map) + iounmap(mdev->uar_map); +err_uar: + mlx4_uar_free(dev, &mdev->priv_uar); +err_pd: + mlx4_pd_free(dev, mdev->priv_pdn); +err_free_dev: + kfree(mdev); +err_free_res: + return NULL; +} + +static struct mlx4_interface mlx4_en_interface = { + .add = mlx4_en_add, + .remove = mlx4_en_remove, + .event = mlx4_en_event, + .get_dev = mlx4_en_get_netdev, + .protocol = MLX4_PROT_ETH, + .activate = mlx4_en_activate, +}; + +static void mlx4_en_verify_params(void) +{ + if (pfctx > MAX_PFC_TX) { + pr_warn("mlx4_en: WARNING: illegal module parameter pfctx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n", + pfctx, MAX_PFC_TX); + pfctx = 0; + } + + if (pfcrx > MAX_PFC_RX) { + pr_warn("mlx4_en: WARNING: illegal module parameter pfcrx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n", + pfcrx, MAX_PFC_RX); + pfcrx = 0; + } + + if (inline_thold < MIN_PKT_LEN || inline_thold > MAX_INLINE) { + pr_warn("mlx4_en: WARNING: illegal module parameter inline_thold %d - should be in range %d-%d, will be changed to default (%d)\n", + inline_thold, MIN_PKT_LEN, MAX_INLINE, MAX_INLINE); + inline_thold = MAX_INLINE; + } +} + +static int __init mlx4_en_init(void) +{ + mlx4_en_verify_params(); + mlx4_en_init_ptys2ethtool_map(); + + return mlx4_register_interface(&mlx4_en_interface); +} + +static void __exit mlx4_en_cleanup(void) +{ + mlx4_unregister_interface(&mlx4_en_interface); +} + +module_init(mlx4_en_init); +module_exit(mlx4_en_cleanup); + diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c new file mode 100644 index 0000000..9670b33 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -0,0 +1,3677 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mlx4_en.h" +#include "en_port.h" + +#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \ + XDP_PACKET_HEADROOM)) + +int mlx4_en_setup_tc(struct net_device *dev, u8 up) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int i; + unsigned int offset = 0; + + if (up && up != MLX4_EN_NUM_UP_HIGH) + return -EINVAL; + + netdev_set_num_tc(dev, up); + netif_set_real_num_tx_queues(dev, priv->tx_ring_num[TX]); + /* Partition Tx queues evenly amongst UP's */ + for (i = 0; i < up; i++) { + netdev_set_tc_queue(dev, i, priv->num_tx_rings_p_up, offset); + offset += priv->num_tx_rings_p_up; + } + +#ifdef CONFIG_MLX4_EN_DCB + if (!mlx4_is_slave(priv->mdev->dev)) { + if (up) { + if (priv->dcbx_cap) + priv->flags |= MLX4_EN_FLAG_DCB_ENABLED; + } else { + priv->flags &= ~MLX4_EN_FLAG_DCB_ENABLED; + priv->cee_config.pfc_state = false; + } + } +#endif /* CONFIG_MLX4_EN_DCB */ + + return 0; +} + +int mlx4_en_alloc_tx_queue_per_tc(struct net_device *dev, u8 tc) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_port_profile new_prof; + struct mlx4_en_priv *tmp; + int port_up = 0; + int err = 0; + + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + mutex_lock(&mdev->state_lock); + memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile)); + new_prof.num_up = (tc == 0) ? 
MLX4_EN_NUM_UP_LOW : + MLX4_EN_NUM_UP_HIGH; + new_prof.tx_ring_num[TX] = new_prof.num_tx_rings_p_up * + new_prof.num_up; + err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true); + if (err) + goto out; + + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + mlx4_en_safe_replace_resources(priv, tmp); + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) { + en_err(priv, "Failed starting port for setup TC\n"); + goto out; + } + } + + err = mlx4_en_setup_tc(dev, tc); +out: + mutex_unlock(&mdev->state_lock); + kfree(tmp); + return err; +} + +static int __mlx4_en_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + struct tc_mqprio_qopt *mqprio = type_data; + + if (type != TC_SETUP_QDISC_MQPRIO) + return -EOPNOTSUPP; + + if (mqprio->num_tc && mqprio->num_tc != MLX4_EN_NUM_UP_HIGH) + return -EINVAL; + + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + + return mlx4_en_alloc_tx_queue_per_tc(dev, mqprio->num_tc); +} + +#ifdef CONFIG_RFS_ACCEL + +struct mlx4_en_filter { + struct list_head next; + struct work_struct work; + + u8 ip_proto; + __be32 src_ip; + __be32 dst_ip; + __be16 src_port; + __be16 dst_port; + + int rxq_index; + struct mlx4_en_priv *priv; + u32 flow_id; /* RFS infrastructure id */ + int id; /* mlx4_en driver id */ + u64 reg_id; /* Flow steering API id */ + u8 activated; /* Used to prevent expiry before filter + * is attached + */ + struct hlist_node filter_chain; +}; + +static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv); + +static enum mlx4_net_trans_rule_id mlx4_ip_proto_to_trans_rule_id(u8 ip_proto) +{ + switch (ip_proto) { + case IPPROTO_UDP: + return MLX4_NET_TRANS_RULE_ID_UDP; + case IPPROTO_TCP: + return MLX4_NET_TRANS_RULE_ID_TCP; + default: + return MLX4_NET_TRANS_RULE_NUM; + } +}; + +/* Must not acquire state_lock, as its corresponding work_sync + * is done under it. 
+ */ +static void mlx4_en_filter_work(struct work_struct *work) +{ + struct mlx4_en_filter *filter = container_of(work, + struct mlx4_en_filter, + work); + struct mlx4_en_priv *priv = filter->priv; + struct mlx4_spec_list spec_tcp_udp = { + .id = mlx4_ip_proto_to_trans_rule_id(filter->ip_proto), + { + .tcp_udp = { + .dst_port = filter->dst_port, + .dst_port_msk = (__force __be16)-1, + .src_port = filter->src_port, + .src_port_msk = (__force __be16)-1, + }, + }, + }; + struct mlx4_spec_list spec_ip = { + .id = MLX4_NET_TRANS_RULE_ID_IPV4, + { + .ipv4 = { + .dst_ip = filter->dst_ip, + .dst_ip_msk = (__force __be32)-1, + .src_ip = filter->src_ip, + .src_ip_msk = (__force __be32)-1, + }, + }, + }; + struct mlx4_spec_list spec_eth = { + .id = MLX4_NET_TRANS_RULE_ID_ETH, + }; + struct mlx4_net_trans_rule rule = { + .list = LIST_HEAD_INIT(rule.list), + .queue_mode = MLX4_NET_TRANS_Q_LIFO, + .exclusive = 1, + .allow_loopback = 1, + .promisc_mode = MLX4_FS_REGULAR, + .port = priv->port, + .priority = MLX4_DOMAIN_RFS, + }; + int rc; + __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); + + if (spec_tcp_udp.id >= MLX4_NET_TRANS_RULE_NUM) { + en_warn(priv, "RFS: ignoring unsupported ip protocol (%d)\n", + filter->ip_proto); + goto ignore; + } + list_add_tail(&spec_eth.list, &rule.list); + list_add_tail(&spec_ip.list, &rule.list); + list_add_tail(&spec_tcp_udp.list, &rule.list); + + rule.qpn = priv->rss_map.qps[filter->rxq_index].qpn; + memcpy(spec_eth.eth.dst_mac, priv->dev->dev_addr, ETH_ALEN); + memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN); + + filter->activated = 0; + + if (filter->reg_id) { + rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id); + if (rc && rc != -ENOENT) + en_err(priv, "Error detaching flow. rc = %d\n", rc); + } + + rc = mlx4_flow_attach(priv->mdev->dev, &rule, &filter->reg_id); + if (rc) + en_err(priv, "Error attaching flow. err = %d\n", rc); + +ignore: + mlx4_en_filter_rfs_expire(priv); + + filter->activated = 1; +} + +static inline struct hlist_head * +filter_hash_bucket(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip, + __be16 src_port, __be16 dst_port) +{ + unsigned long l; + int bucket_idx; + + l = (__force unsigned long)src_port | + ((__force unsigned long)dst_port << 2); + l ^= (__force unsigned long)(src_ip ^ dst_ip); + + bucket_idx = hash_long(l, MLX4_EN_FILTER_HASH_SHIFT); + + return &priv->filter_hash[bucket_idx]; +} + +static struct mlx4_en_filter * +mlx4_en_filter_alloc(struct mlx4_en_priv *priv, int rxq_index, __be32 src_ip, + __be32 dst_ip, u8 ip_proto, __be16 src_port, + __be16 dst_port, u32 flow_id) +{ + struct mlx4_en_filter *filter = NULL; + + filter = kzalloc(sizeof(struct mlx4_en_filter), GFP_ATOMIC); + if (!filter) + return NULL; + + filter->priv = priv; + filter->rxq_index = rxq_index; + INIT_WORK(&filter->work, mlx4_en_filter_work); + + filter->src_ip = src_ip; + filter->dst_ip = dst_ip; + filter->ip_proto = ip_proto; + filter->src_port = src_port; + filter->dst_port = dst_port; + + filter->flow_id = flow_id; + + filter->id = priv->last_filter_id++ % RPS_NO_FILTER; + + list_add_tail(&filter->next, &priv->filters); + hlist_add_head(&filter->filter_chain, + filter_hash_bucket(priv, src_ip, dst_ip, src_port, + dst_port)); + + return filter; +} + +static void mlx4_en_filter_free(struct mlx4_en_filter *filter) +{ + struct mlx4_en_priv *priv = filter->priv; + int rc; + + list_del(&filter->next); + + rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id); + if (rc && rc != -ENOENT) + en_err(priv, "Error detaching flow. 
rc = %d\n", rc); + + kfree(filter); +} + +static inline struct mlx4_en_filter * +mlx4_en_filter_find(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip, + u8 ip_proto, __be16 src_port, __be16 dst_port) +{ + struct mlx4_en_filter *filter; + struct mlx4_en_filter *ret = NULL; + + hlist_for_each_entry(filter, + filter_hash_bucket(priv, src_ip, dst_ip, + src_port, dst_port), + filter_chain) { + if (filter->src_ip == src_ip && + filter->dst_ip == dst_ip && + filter->ip_proto == ip_proto && + filter->src_port == src_port && + filter->dst_port == dst_port) { + ret = filter; + break; + } + } + + return ret; +} + +static int +mlx4_en_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, + u16 rxq_index, u32 flow_id) +{ + struct mlx4_en_priv *priv = netdev_priv(net_dev); + struct mlx4_en_filter *filter; + const struct iphdr *ip; + const __be16 *ports; + u8 ip_proto; + __be32 src_ip; + __be32 dst_ip; + __be16 src_port; + __be16 dst_port; + int nhoff = skb_network_offset(skb); + int ret = 0; + + if (skb->protocol != htons(ETH_P_IP)) + return -EPROTONOSUPPORT; + + ip = (const struct iphdr *)(skb->data + nhoff); + if (ip_is_fragment(ip)) + return -EPROTONOSUPPORT; + + if ((ip->protocol != IPPROTO_TCP) && (ip->protocol != IPPROTO_UDP)) + return -EPROTONOSUPPORT; + ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl); + + ip_proto = ip->protocol; + src_ip = ip->saddr; + dst_ip = ip->daddr; + src_port = ports[0]; + dst_port = ports[1]; + + spin_lock_bh(&priv->filters_lock); + filter = mlx4_en_filter_find(priv, src_ip, dst_ip, ip_proto, + src_port, dst_port); + if (filter) { + if (filter->rxq_index == rxq_index) + goto out; + + filter->rxq_index = rxq_index; + } else { + filter = mlx4_en_filter_alloc(priv, rxq_index, + src_ip, dst_ip, ip_proto, + src_port, dst_port, flow_id); + if (!filter) { + ret = -ENOMEM; + goto err; + } + } + + queue_work(priv->mdev->workqueue, &filter->work); + +out: + ret = filter->id; +err: + spin_unlock_bh(&priv->filters_lock); + + return ret; +} + +void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv) +{ + struct mlx4_en_filter *filter, *tmp; + LIST_HEAD(del_list); + + spin_lock_bh(&priv->filters_lock); + list_for_each_entry_safe(filter, tmp, &priv->filters, next) { + list_move(&filter->next, &del_list); + hlist_del(&filter->filter_chain); + } + spin_unlock_bh(&priv->filters_lock); + + list_for_each_entry_safe(filter, tmp, &del_list, next) { + cancel_work_sync(&filter->work); + mlx4_en_filter_free(filter); + } +} + +static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv) +{ + struct mlx4_en_filter *filter = NULL, *tmp, *last_filter = NULL; + LIST_HEAD(del_list); + int i = 0; + + spin_lock_bh(&priv->filters_lock); + list_for_each_entry_safe(filter, tmp, &priv->filters, next) { + if (i > MLX4_EN_FILTER_EXPIRY_QUOTA) + break; + + if (filter->activated && + !work_pending(&filter->work) && + rps_may_expire_flow(priv->dev, + filter->rxq_index, filter->flow_id, + filter->id)) { + list_move(&filter->next, &del_list); + hlist_del(&filter->filter_chain); + } else + last_filter = filter; + + i++; + } + + if (last_filter && (&last_filter->next != priv->filters.next)) + list_move(&priv->filters, &last_filter->next); + + spin_unlock_bh(&priv->filters_lock); + + list_for_each_entry_safe(filter, tmp, &del_list, next) + mlx4_en_filter_free(filter); +} +#endif + +static int mlx4_en_vlan_rx_add_vid(struct net_device *dev, + __be16 proto, u16 vid) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err; + int idx; + + 
en_dbg(HW, priv, "adding VLAN:%d\n", vid); + + set_bit(vid, priv->active_vlans); + + /* Add VID to port VLAN filter */ + mutex_lock(&mdev->state_lock); + if (mdev->device_up && priv->port_up) { + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); + if (err) { + en_err(priv, "Failed configuring VLAN filter\n"); + goto out; + } + } + err = mlx4_register_vlan(mdev->dev, priv->port, vid, &idx); + if (err) + en_dbg(HW, priv, "Failed adding vlan %d\n", vid); + +out: + mutex_unlock(&mdev->state_lock); + return err; +} + +static int mlx4_en_vlan_rx_kill_vid(struct net_device *dev, + __be16 proto, u16 vid) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + + en_dbg(HW, priv, "Killing VID:%d\n", vid); + + clear_bit(vid, priv->active_vlans); + + /* Remove VID from port VLAN filter */ + mutex_lock(&mdev->state_lock); + mlx4_unregister_vlan(mdev->dev, priv->port, vid); + + if (mdev->device_up && priv->port_up) { + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); + if (err) + en_err(priv, "Failed configuring VLAN filter\n"); + } + mutex_unlock(&mdev->state_lock); + + return err; +} + +static void mlx4_en_u64_to_mac(unsigned char dst_mac[ETH_ALEN + 2], u64 src_mac) +{ + int i; + for (i = ETH_ALEN - 1; i >= 0; --i) { + dst_mac[i] = src_mac & 0xff; + src_mac >>= 8; + } + memset(&dst_mac[ETH_ALEN], 0, 2); +} + + +static int mlx4_en_tunnel_steer_add(struct mlx4_en_priv *priv, unsigned char *addr, + int qpn, u64 *reg_id) +{ + int err; + + if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN || + priv->mdev->dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) + return 0; /* do nothing */ + + err = mlx4_tunnel_steer_add(priv->mdev->dev, addr, priv->port, qpn, + MLX4_DOMAIN_NIC, reg_id); + if (err) { + en_err(priv, "failed to add vxlan steering rule, err %d\n", err); + return err; + } + en_dbg(DRV, priv, "added vxlan steering rule, mac %pM reg_id %llx\n", addr, *reg_id); + return 0; +} + + +static int mlx4_en_uc_steer_add(struct mlx4_en_priv *priv, + unsigned char *mac, int *qpn, u64 *reg_id) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_dev *dev = mdev->dev; + int err; + + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_B0: { + struct mlx4_qp qp; + u8 gid[16] = {0}; + + qp.qpn = *qpn; + memcpy(&gid[10], mac, ETH_ALEN); + gid[5] = priv->port; + + err = mlx4_unicast_attach(dev, &qp, gid, 0, MLX4_PROT_ETH); + break; + } + case MLX4_STEERING_MODE_DEVICE_MANAGED: { + struct mlx4_spec_list spec_eth = { {NULL} }; + __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); + + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .allow_loopback = 1, + .promisc_mode = MLX4_FS_REGULAR, + .priority = MLX4_DOMAIN_NIC, + }; + + rule.port = priv->port; + rule.qpn = *qpn; + INIT_LIST_HEAD(&rule.list); + + spec_eth.id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec_eth.eth.dst_mac, mac, ETH_ALEN); + memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN); + list_add_tail(&spec_eth.list, &rule.list); + + err = mlx4_flow_attach(dev, &rule, reg_id); + break; + } + default: + return -EINVAL; + } + if (err) + en_warn(priv, "Failed Attaching Unicast\n"); + + return err; +} + +static void mlx4_en_uc_steer_release(struct mlx4_en_priv *priv, + unsigned char *mac, int qpn, u64 reg_id) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_dev *dev = mdev->dev; + + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_B0: { + struct mlx4_qp qp; + u8 gid[16] = {0}; + + qp.qpn = qpn; + 
memcpy(&gid[10], mac, ETH_ALEN); + gid[5] = priv->port; + + mlx4_unicast_detach(dev, &qp, gid, MLX4_PROT_ETH); + break; + } + case MLX4_STEERING_MODE_DEVICE_MANAGED: { + mlx4_flow_detach(dev, reg_id); + break; + } + default: + en_err(priv, "Invalid steering mode.\n"); + } +} + +static int mlx4_en_get_qp(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_dev *dev = mdev->dev; + int index = 0; + int err = 0; + int *qpn = &priv->base_qpn; + u64 mac = mlx4_mac_to_u64(priv->dev->dev_addr); + + en_dbg(DRV, priv, "Registering MAC: %pM for adding\n", + priv->dev->dev_addr); + index = mlx4_register_mac(dev, priv->port, mac); + if (index < 0) { + err = index; + en_err(priv, "Failed adding MAC: %pM\n", + priv->dev->dev_addr); + return err; + } + + en_info(priv, "Steering Mode %d\n", dev->caps.steering_mode); + + if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) { + int base_qpn = mlx4_get_base_qpn(dev, priv->port); + *qpn = base_qpn + index; + return 0; + } + + err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP, + MLX4_RES_USAGE_DRIVER); + en_dbg(DRV, priv, "Reserved qp %d\n", *qpn); + if (err) { + en_err(priv, "Failed to reserve qp for mac registration\n"); + mlx4_unregister_mac(dev, priv->port, mac); + return err; + } + + return 0; +} + +static void mlx4_en_put_qp(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_dev *dev = mdev->dev; + int qpn = priv->base_qpn; + + if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) { + u64 mac = mlx4_mac_to_u64(priv->dev->dev_addr); + en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n", + priv->dev->dev_addr); + mlx4_unregister_mac(dev, priv->port, mac); + } else { + en_dbg(DRV, priv, "Releasing qp: port %d, qpn %d\n", + priv->port, qpn); + mlx4_qp_release_range(dev, qpn, 1); + priv->flags &= ~MLX4_EN_FLAG_FORCE_PROMISC; + } +} + +static int mlx4_en_replace_mac(struct mlx4_en_priv *priv, int qpn, + unsigned char *new_mac, unsigned char *prev_mac) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_dev *dev = mdev->dev; + int err = 0; + u64 new_mac_u64 = mlx4_mac_to_u64(new_mac); + + if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0) { + struct hlist_head *bucket; + unsigned int mac_hash; + struct mlx4_mac_entry *entry; + struct hlist_node *tmp; + u64 prev_mac_u64 = mlx4_mac_to_u64(prev_mac); + + bucket = &priv->mac_hash[prev_mac[MLX4_EN_MAC_HASH_IDX]]; + hlist_for_each_entry_safe(entry, tmp, bucket, hlist) { + if (ether_addr_equal_64bits(entry->mac, prev_mac)) { + mlx4_en_uc_steer_release(priv, entry->mac, + qpn, entry->reg_id); + mlx4_unregister_mac(dev, priv->port, + prev_mac_u64); + hlist_del_rcu(&entry->hlist); + synchronize_rcu(); + memcpy(entry->mac, new_mac, ETH_ALEN); + entry->reg_id = 0; + mac_hash = new_mac[MLX4_EN_MAC_HASH_IDX]; + hlist_add_head_rcu(&entry->hlist, + &priv->mac_hash[mac_hash]); + mlx4_register_mac(dev, priv->port, new_mac_u64); + err = mlx4_en_uc_steer_add(priv, new_mac, + &qpn, + &entry->reg_id); + if (err) + return err; + if (priv->tunnel_reg_id) { + mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id); + priv->tunnel_reg_id = 0; + } + err = mlx4_en_tunnel_steer_add(priv, new_mac, qpn, + &priv->tunnel_reg_id); + return err; + } + } + return -EINVAL; + } + + return __mlx4_replace_mac(dev, priv->port, qpn, new_mac_u64); +} + +static void mlx4_en_update_user_mac(struct mlx4_en_priv *priv, + unsigned char new_mac[ETH_ALEN + 2]) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + if (!(mdev->dev->caps.flags2 & 
MLX4_DEV_CAP_FLAG2_USER_MAC_EN)) + return; + + err = mlx4_SET_PORT_user_mac(mdev->dev, priv->port, new_mac); + if (err) + en_err(priv, "Failed to pass user MAC(%pM) to Firmware for port %d, with error %d\n", + new_mac, priv->port, err); +} + +static int mlx4_en_do_set_mac(struct mlx4_en_priv *priv, + unsigned char new_mac[ETH_ALEN + 2]) +{ + int err = 0; + + if (priv->port_up) { + /* Remove old MAC and insert the new one */ + err = mlx4_en_replace_mac(priv, priv->base_qpn, + new_mac, priv->current_mac); + if (err) + en_err(priv, "Failed changing HW MAC address\n"); + } else + en_dbg(HW, priv, "Port is down while registering mac, exiting...\n"); + + if (!err) + memcpy(priv->current_mac, new_mac, sizeof(priv->current_mac)); + + return err; +} + +static int mlx4_en_set_mac(struct net_device *dev, void *addr) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct sockaddr *saddr = addr; + unsigned char new_mac[ETH_ALEN + 2]; + int err; + + if (!is_valid_ether_addr(saddr->sa_data)) + return -EADDRNOTAVAIL; + + mutex_lock(&mdev->state_lock); + memcpy(new_mac, saddr->sa_data, ETH_ALEN); + err = mlx4_en_do_set_mac(priv, new_mac); + if (err) + goto out; + + memcpy(dev->dev_addr, saddr->sa_data, ETH_ALEN); + mlx4_en_update_user_mac(priv, new_mac); +out: + mutex_unlock(&mdev->state_lock); + + return err; +} + +static void mlx4_en_clear_list(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_mc_list *tmp, *mc_to_del; + + list_for_each_entry_safe(mc_to_del, tmp, &priv->mc_list, list) { + list_del(&mc_to_del->list); + kfree(mc_to_del); + } +} + +static void mlx4_en_cache_mclist(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct netdev_hw_addr *ha; + struct mlx4_en_mc_list *tmp; + + mlx4_en_clear_list(dev); + netdev_for_each_mc_addr(ha, dev) { + tmp = kzalloc(sizeof(struct mlx4_en_mc_list), GFP_ATOMIC); + if (!tmp) { + mlx4_en_clear_list(dev); + return; + } + memcpy(tmp->addr, ha->addr, ETH_ALEN); + list_add_tail(&tmp->list, &priv->mc_list); + } +} + +static void update_mclist_flags(struct mlx4_en_priv *priv, + struct list_head *dst, + struct list_head *src) +{ + struct mlx4_en_mc_list *dst_tmp, *src_tmp, *new_mc; + bool found; + + /* Find all the entries that should be removed from dst, + * These are the entries that are not found in src + */ + list_for_each_entry(dst_tmp, dst, list) { + found = false; + list_for_each_entry(src_tmp, src, list) { + if (ether_addr_equal(dst_tmp->addr, src_tmp->addr)) { + found = true; + break; + } + } + if (!found) + dst_tmp->action = MCLIST_REM; + } + + /* Add entries that exist in src but not in dst + * mark them as need to add + */ + list_for_each_entry(src_tmp, src, list) { + found = false; + list_for_each_entry(dst_tmp, dst, list) { + if (ether_addr_equal(dst_tmp->addr, src_tmp->addr)) { + dst_tmp->action = MCLIST_NONE; + found = true; + break; + } + } + if (!found) { + new_mc = kmemdup(src_tmp, + sizeof(struct mlx4_en_mc_list), + GFP_KERNEL); + if (!new_mc) + return; + + new_mc->action = MCLIST_ADD; + list_add_tail(&new_mc->list, dst); + } + } +} + +static void mlx4_en_set_rx_mode(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (!priv->port_up) + return; + + queue_work(priv->mdev->workqueue, &priv->rx_mode_task); +} + +static void mlx4_en_set_promisc_mode(struct mlx4_en_priv *priv, + struct mlx4_en_dev *mdev) +{ + int err = 0; + + if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) { + if (netif_msg_rx_status(priv)) + 
en_warn(priv, "Entering promiscuous mode\n"); + priv->flags |= MLX4_EN_FLAG_PROMISC; + + /* Enable promiscouos mode */ + switch (mdev->dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + err = mlx4_flow_steer_promisc_add(mdev->dev, + priv->port, + priv->base_qpn, + MLX4_FS_ALL_DEFAULT); + if (err) + en_err(priv, "Failed enabling promiscuous mode\n"); + priv->flags |= MLX4_EN_FLAG_MC_PROMISC; + break; + + case MLX4_STEERING_MODE_B0: + err = mlx4_unicast_promisc_add(mdev->dev, + priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed enabling unicast promiscuous mode\n"); + + /* Add the default qp number as multicast + * promisc + */ + if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) { + err = mlx4_multicast_promisc_add(mdev->dev, + priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed enabling multicast promiscuous mode\n"); + priv->flags |= MLX4_EN_FLAG_MC_PROMISC; + } + break; + + case MLX4_STEERING_MODE_A0: + err = mlx4_SET_PORT_qpn_calc(mdev->dev, + priv->port, + priv->base_qpn, + 1); + if (err) + en_err(priv, "Failed enabling promiscuous mode\n"); + break; + } + + /* Disable port multicast filter (unconditionally) */ + err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, + 0, MLX4_MCAST_DISABLE); + if (err) + en_err(priv, "Failed disabling multicast filter\n"); + } +} + +static void mlx4_en_clear_promisc_mode(struct mlx4_en_priv *priv, + struct mlx4_en_dev *mdev) +{ + int err = 0; + + if (netif_msg_rx_status(priv)) + en_warn(priv, "Leaving promiscuous mode\n"); + priv->flags &= ~MLX4_EN_FLAG_PROMISC; + + /* Disable promiscouos mode */ + switch (mdev->dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + err = mlx4_flow_steer_promisc_remove(mdev->dev, + priv->port, + MLX4_FS_ALL_DEFAULT); + if (err) + en_err(priv, "Failed disabling promiscuous mode\n"); + priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; + break; + + case MLX4_STEERING_MODE_B0: + err = mlx4_unicast_promisc_remove(mdev->dev, + priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed disabling unicast promiscuous mode\n"); + /* Disable Multicast promisc */ + if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { + err = mlx4_multicast_promisc_remove(mdev->dev, + priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed disabling multicast promiscuous mode\n"); + priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; + } + break; + + case MLX4_STEERING_MODE_A0: + err = mlx4_SET_PORT_qpn_calc(mdev->dev, + priv->port, + priv->base_qpn, 0); + if (err) + en_err(priv, "Failed disabling promiscuous mode\n"); + break; + } +} + +static void mlx4_en_do_multicast(struct mlx4_en_priv *priv, + struct net_device *dev, + struct mlx4_en_dev *mdev) +{ + struct mlx4_en_mc_list *mclist, *tmp; + u64 mcast_addr = 0; + u8 mc_list[16] = {0}; + int err = 0; + + /* Enable/disable the multicast filter according to IFF_ALLMULTI */ + if (dev->flags & IFF_ALLMULTI) { + err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, + 0, MLX4_MCAST_DISABLE); + if (err) + en_err(priv, "Failed disabling multicast filter\n"); + + /* Add the default qp number as multicast promisc */ + if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) { + switch (mdev->dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + err = mlx4_flow_steer_promisc_add(mdev->dev, + priv->port, + priv->base_qpn, + MLX4_FS_MC_DEFAULT); + break; + + case MLX4_STEERING_MODE_B0: + err = mlx4_multicast_promisc_add(mdev->dev, + priv->base_qpn, + priv->port); + break; + + case MLX4_STEERING_MODE_A0: + break; + } + if (err) + en_err(priv, "Failed entering 
multicast promisc mode\n"); + priv->flags |= MLX4_EN_FLAG_MC_PROMISC; + } + } else { + /* Disable Multicast promisc */ + if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { + switch (mdev->dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + err = mlx4_flow_steer_promisc_remove(mdev->dev, + priv->port, + MLX4_FS_MC_DEFAULT); + break; + + case MLX4_STEERING_MODE_B0: + err = mlx4_multicast_promisc_remove(mdev->dev, + priv->base_qpn, + priv->port); + break; + + case MLX4_STEERING_MODE_A0: + break; + } + if (err) + en_err(priv, "Failed disabling multicast promiscuous mode\n"); + priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; + } + + err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, + 0, MLX4_MCAST_DISABLE); + if (err) + en_err(priv, "Failed disabling multicast filter\n"); + + /* Flush mcast filter and init it with broadcast address */ + mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST, + 1, MLX4_MCAST_CONFIG); + + /* Update multicast list - we cache all addresses so they won't + * change while HW is updated holding the command semaphor */ + netif_addr_lock_bh(dev); + mlx4_en_cache_mclist(dev); + netif_addr_unlock_bh(dev); + list_for_each_entry(mclist, &priv->mc_list, list) { + mcast_addr = mlx4_mac_to_u64(mclist->addr); + mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, + mcast_addr, 0, MLX4_MCAST_CONFIG); + } + err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, + 0, MLX4_MCAST_ENABLE); + if (err) + en_err(priv, "Failed enabling multicast filter\n"); + + update_mclist_flags(priv, &priv->curr_list, &priv->mc_list); + list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) { + if (mclist->action == MCLIST_REM) { + /* detach this address and delete from list */ + memcpy(&mc_list[10], mclist->addr, ETH_ALEN); + mc_list[5] = priv->port; + err = mlx4_multicast_detach(mdev->dev, + priv->rss_map.indir_qp, + mc_list, + MLX4_PROT_ETH, + mclist->reg_id); + if (err) + en_err(priv, "Fail to detach multicast address\n"); + + if (mclist->tunnel_reg_id) { + err = mlx4_flow_detach(priv->mdev->dev, mclist->tunnel_reg_id); + if (err) + en_err(priv, "Failed to detach multicast address\n"); + } + + /* remove from list */ + list_del(&mclist->list); + kfree(mclist); + } else if (mclist->action == MCLIST_ADD) { + /* attach the address */ + memcpy(&mc_list[10], mclist->addr, ETH_ALEN); + /* needed for B0 steering support */ + mc_list[5] = priv->port; + err = mlx4_multicast_attach(mdev->dev, + priv->rss_map.indir_qp, + mc_list, + priv->port, 0, + MLX4_PROT_ETH, + &mclist->reg_id); + if (err) + en_err(priv, "Fail to attach multicast address\n"); + + err = mlx4_en_tunnel_steer_add(priv, &mc_list[10], priv->base_qpn, + &mclist->tunnel_reg_id); + if (err) + en_err(priv, "Failed to attach multicast address\n"); + } + } + } +} + +static void mlx4_en_do_uc_filter(struct mlx4_en_priv *priv, + struct net_device *dev, + struct mlx4_en_dev *mdev) +{ + struct netdev_hw_addr *ha; + struct mlx4_mac_entry *entry; + struct hlist_node *tmp; + bool found; + u64 mac; + int err = 0; + struct hlist_head *bucket; + unsigned int i; + int removed = 0; + u32 prev_flags; + + /* Note that we do not need to protect our mac_hash traversal with rcu, + * since all modification code is protected by mdev->state_lock + */ + + /* find what to remove */ + for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) { + bucket = &priv->mac_hash[i]; + hlist_for_each_entry_safe(entry, tmp, bucket, hlist) { + found = false; + netdev_for_each_uc_addr(ha, dev) { + if (ether_addr_equal_64bits(entry->mac, + ha->addr)) { + found = true; + break; + } + } + + /* MAC 
address of the port is not in uc list */ + if (ether_addr_equal_64bits(entry->mac, + priv->current_mac)) + found = true; + + if (!found) { + mac = mlx4_mac_to_u64(entry->mac); + mlx4_en_uc_steer_release(priv, entry->mac, + priv->base_qpn, + entry->reg_id); + mlx4_unregister_mac(mdev->dev, priv->port, mac); + + hlist_del_rcu(&entry->hlist); + kfree_rcu(entry, rcu); + en_dbg(DRV, priv, "Removed MAC %pM on port:%d\n", + entry->mac, priv->port); + ++removed; + } + } + } + + /* if we didn't remove anything, there is no use in trying to add + * again once we are in a forced promisc mode state + */ + if ((priv->flags & MLX4_EN_FLAG_FORCE_PROMISC) && 0 == removed) + return; + + prev_flags = priv->flags; + priv->flags &= ~MLX4_EN_FLAG_FORCE_PROMISC; + + /* find what to add */ + netdev_for_each_uc_addr(ha, dev) { + found = false; + bucket = &priv->mac_hash[ha->addr[MLX4_EN_MAC_HASH_IDX]]; + hlist_for_each_entry(entry, bucket, hlist) { + if (ether_addr_equal_64bits(entry->mac, ha->addr)) { + found = true; + break; + } + } + + if (!found) { + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + en_err(priv, "Failed adding MAC %pM on port:%d (out of memory)\n", + ha->addr, priv->port); + priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC; + break; + } + mac = mlx4_mac_to_u64(ha->addr); + memcpy(entry->mac, ha->addr, ETH_ALEN); + err = mlx4_register_mac(mdev->dev, priv->port, mac); + if (err < 0) { + en_err(priv, "Failed registering MAC %pM on port %d: %d\n", + ha->addr, priv->port, err); + kfree(entry); + priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC; + break; + } + err = mlx4_en_uc_steer_add(priv, ha->addr, + &priv->base_qpn, + &entry->reg_id); + if (err) { + en_err(priv, "Failed adding MAC %pM on port %d: %d\n", + ha->addr, priv->port, err); + mlx4_unregister_mac(mdev->dev, priv->port, mac); + kfree(entry); + priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC; + break; + } else { + unsigned int mac_hash; + en_dbg(DRV, priv, "Added MAC %pM on port:%d\n", + ha->addr, priv->port); + mac_hash = ha->addr[MLX4_EN_MAC_HASH_IDX]; + bucket = &priv->mac_hash[mac_hash]; + hlist_add_head_rcu(&entry->hlist, bucket); + } + } + } + + if (priv->flags & MLX4_EN_FLAG_FORCE_PROMISC) { + en_warn(priv, "Forcing promiscuous mode on port:%d\n", + priv->port); + } else if (prev_flags & MLX4_EN_FLAG_FORCE_PROMISC) { + en_warn(priv, "Stop forcing promiscuous mode on port:%d\n", + priv->port); + } +} + +static void mlx4_en_do_set_rx_mode(struct work_struct *work) +{ + struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, + rx_mode_task); + struct mlx4_en_dev *mdev = priv->mdev; + struct net_device *dev = priv->dev; + + mutex_lock(&mdev->state_lock); + if (!mdev->device_up) { + en_dbg(HW, priv, "Card is not up, ignoring rx mode change.\n"); + goto out; + } + if (!priv->port_up) { + en_dbg(HW, priv, "Port is down, ignoring rx mode change.\n"); + goto out; + } + + if (!netif_carrier_ok(dev)) { + if (!mlx4_en_QUERY_PORT(mdev, priv->port)) { + if (priv->port_state.link_state) { + priv->last_link_state = MLX4_DEV_EVENT_PORT_UP; + netif_carrier_on(dev); + en_dbg(LINK, priv, "Link Up\n"); + } + } + } + + if (dev->priv_flags & IFF_UNICAST_FLT) + mlx4_en_do_uc_filter(priv, dev, mdev); + + /* Promsicuous mode: disable all filters */ + if ((dev->flags & IFF_PROMISC) || + (priv->flags & MLX4_EN_FLAG_FORCE_PROMISC)) { + mlx4_en_set_promisc_mode(priv, mdev); + goto out; + } + + /* Not in promiscuous mode */ + if (priv->flags & MLX4_EN_FLAG_PROMISC) + mlx4_en_clear_promisc_mode(priv, mdev); + + mlx4_en_do_multicast(priv, dev, mdev); +out: + 
mutex_unlock(&mdev->state_lock); +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void mlx4_en_netpoll(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_cq *cq; + int i; + + for (i = 0; i < priv->tx_ring_num[TX]; i++) { + cq = priv->tx_cq[TX][i]; + napi_schedule(&cq->napi); + } +} +#endif + +static int mlx4_en_set_rss_steer_rules(struct mlx4_en_priv *priv) +{ + u64 reg_id; + int err = 0; + int *qpn = &priv->base_qpn; + struct mlx4_mac_entry *entry; + + err = mlx4_en_uc_steer_add(priv, priv->dev->dev_addr, qpn, &reg_id); + if (err) + return err; + + err = mlx4_en_tunnel_steer_add(priv, priv->dev->dev_addr, *qpn, + &priv->tunnel_reg_id); + if (err) + goto tunnel_err; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + err = -ENOMEM; + goto alloc_err; + } + + memcpy(entry->mac, priv->dev->dev_addr, sizeof(entry->mac)); + memcpy(priv->current_mac, entry->mac, sizeof(priv->current_mac)); + entry->reg_id = reg_id; + hlist_add_head_rcu(&entry->hlist, + &priv->mac_hash[entry->mac[MLX4_EN_MAC_HASH_IDX]]); + + return 0; + +alloc_err: + if (priv->tunnel_reg_id) + mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id); + +tunnel_err: + mlx4_en_uc_steer_release(priv, priv->dev->dev_addr, *qpn, reg_id); + return err; +} + +static void mlx4_en_delete_rss_steer_rules(struct mlx4_en_priv *priv) +{ + u64 mac; + unsigned int i; + int qpn = priv->base_qpn; + struct hlist_head *bucket; + struct hlist_node *tmp; + struct mlx4_mac_entry *entry; + + for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) { + bucket = &priv->mac_hash[i]; + hlist_for_each_entry_safe(entry, tmp, bucket, hlist) { + mac = mlx4_mac_to_u64(entry->mac); + en_dbg(DRV, priv, "Registering MAC:%pM for deleting\n", + entry->mac); + mlx4_en_uc_steer_release(priv, entry->mac, + qpn, entry->reg_id); + + mlx4_unregister_mac(priv->mdev->dev, priv->port, mac); + hlist_del_rcu(&entry->hlist); + kfree_rcu(entry, rcu); + } + } + + if (priv->tunnel_reg_id) { + mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id); + priv->tunnel_reg_id = 0; + } +} + +static void mlx4_en_tx_timeout(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int i; + + if (netif_msg_timer(priv)) + en_warn(priv, "Tx timeout called on port:%d\n", priv->port); + + for (i = 0; i < priv->tx_ring_num[TX]; i++) { + struct mlx4_en_tx_ring *tx_ring = priv->tx_ring[TX][i]; + + if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, i))) + continue; + en_warn(priv, "TX timeout on queue: %d, QP: 0x%x, CQ: 0x%x, Cons: 0x%x, Prod: 0x%x\n", + i, tx_ring->qpn, tx_ring->sp_cqn, + tx_ring->cons, tx_ring->prod); + } + + priv->port_stats.tx_timeout++; + en_dbg(DRV, priv, "Scheduling watchdog\n"); + queue_work(mdev->workqueue, &priv->watchdog_task); +} + + +static void +mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + spin_lock_bh(&priv->stats_lock); + mlx4_en_fold_software_stats(dev); + netdev_stats_to_stats64(stats, &dev->stats); + spin_unlock_bh(&priv->stats_lock); +} + +static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv) +{ + struct mlx4_en_cq *cq; + int i, t; + + /* If we haven't received a specific coalescing setting + * (module param), we set the moderation parameters as follows: + * - moder_cnt is set to the number of mtu sized packets to + * satisfy our coalescing target. + * - moder_time is set to a fixed value.
+ */ + priv->rx_frames = MLX4_EN_RX_COAL_TARGET; + priv->rx_usecs = MLX4_EN_RX_COAL_TIME; + priv->tx_frames = MLX4_EN_TX_COAL_PKTS; + priv->tx_usecs = MLX4_EN_TX_COAL_TIME; + en_dbg(INTR, priv, "Default coalescing params for mtu:%d - rx_frames:%d rx_usecs:%d\n", + priv->dev->mtu, priv->rx_frames, priv->rx_usecs); + + /* Setup cq moderation params */ + for (i = 0; i < priv->rx_ring_num; i++) { + cq = priv->rx_cq[i]; + cq->moder_cnt = priv->rx_frames; + cq->moder_time = priv->rx_usecs; + priv->last_moder_time[i] = MLX4_EN_AUTO_CONF; + priv->last_moder_packets[i] = 0; + priv->last_moder_bytes[i] = 0; + } + + for (t = 0 ; t < MLX4_EN_NUM_TX_TYPES; t++) { + for (i = 0; i < priv->tx_ring_num[t]; i++) { + cq = priv->tx_cq[t][i]; + cq->moder_cnt = priv->tx_frames; + cq->moder_time = priv->tx_usecs; + } + } + + /* Reset auto-moderation params */ + priv->pkt_rate_low = MLX4_EN_RX_RATE_LOW; + priv->rx_usecs_low = MLX4_EN_RX_COAL_TIME_LOW; + priv->pkt_rate_high = MLX4_EN_RX_RATE_HIGH; + priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH; + priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL; + priv->adaptive_rx_coal = 1; + priv->last_moder_jiffies = 0; + priv->last_moder_tx_packets = 0; +} + +static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv) +{ + unsigned long period = (unsigned long) (jiffies - priv->last_moder_jiffies); + u32 pkt_rate_high, pkt_rate_low; + struct mlx4_en_cq *cq; + unsigned long packets; + unsigned long rate; + unsigned long avg_pkt_size; + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long rx_pkt_diff; + int moder_time; + int ring, err; + + if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ) + return; + + pkt_rate_low = READ_ONCE(priv->pkt_rate_low); + pkt_rate_high = READ_ONCE(priv->pkt_rate_high); + + for (ring = 0; ring < priv->rx_ring_num; ring++) { + rx_packets = READ_ONCE(priv->rx_ring[ring]->packets); + rx_bytes = READ_ONCE(priv->rx_ring[ring]->bytes); + + rx_pkt_diff = rx_packets - priv->last_moder_packets[ring]; + packets = rx_pkt_diff; + rate = packets * HZ / period; + avg_pkt_size = packets ? 
(rx_bytes - + priv->last_moder_bytes[ring]) / packets : 0; + + /* Apply auto-moderation only when packet rate + * exceeds a rate that it matters */ + if (rate > (MLX4_EN_RX_RATE_THRESH / priv->rx_ring_num) && + avg_pkt_size > MLX4_EN_AVG_PKT_SMALL) { + if (rate <= pkt_rate_low) + moder_time = priv->rx_usecs_low; + else if (rate >= pkt_rate_high) + moder_time = priv->rx_usecs_high; + else + moder_time = (rate - pkt_rate_low) * + (priv->rx_usecs_high - priv->rx_usecs_low) / + (pkt_rate_high - pkt_rate_low) + + priv->rx_usecs_low; + } else { + moder_time = priv->rx_usecs_low; + } + + cq = priv->rx_cq[ring]; + if (moder_time != priv->last_moder_time[ring] || + cq->moder_cnt != priv->rx_frames) { + priv->last_moder_time[ring] = moder_time; + cq->moder_time = moder_time; + cq->moder_cnt = priv->rx_frames; + err = mlx4_en_set_cq_moder(priv, cq); + if (err) + en_err(priv, "Failed modifying moderation for cq:%d\n", + ring); + } + priv->last_moder_packets[ring] = rx_packets; + priv->last_moder_bytes[ring] = rx_bytes; + } + + priv->last_moder_jiffies = jiffies; +} + +static void mlx4_en_do_get_stats(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, + stats_task); + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + mutex_lock(&mdev->state_lock); + if (mdev->device_up) { + if (priv->port_up) { + err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0); + if (err) + en_dbg(HW, priv, "Could not update stats\n"); + + mlx4_en_auto_moderation(priv); + } + + queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); + } + if (mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port]) { + mlx4_en_do_set_mac(priv, priv->current_mac); + mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port] = 0; + } + mutex_unlock(&mdev->state_lock); +} + +/* mlx4_en_service_task - Run service task for tasks that needed to be done + * periodically + */ +static void mlx4_en_service_task(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv, + service_task); + struct mlx4_en_dev *mdev = priv->mdev; + + mutex_lock(&mdev->state_lock); + if (mdev->device_up) { + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) + mlx4_en_ptp_overflow_check(mdev); + + mlx4_en_recover_from_oom(priv); + queue_delayed_work(mdev->workqueue, &priv->service_task, + SERVICE_TASK_DELAY); + } + mutex_unlock(&mdev->state_lock); +} + +static void mlx4_en_linkstate(struct work_struct *work) +{ + struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, + linkstate_task); + struct mlx4_en_dev *mdev = priv->mdev; + int linkstate = priv->link_state; + + mutex_lock(&mdev->state_lock); + /* If observable port state changed set carrier state and + * report to system log */ + if (priv->last_link_state != linkstate) { + if (linkstate == MLX4_DEV_EVENT_PORT_DOWN) { + en_info(priv, "Link Down\n"); + netif_carrier_off(priv->dev); + } else { + en_info(priv, "Link Up\n"); + netif_carrier_on(priv->dev); + } + } + priv->last_link_state = linkstate; + mutex_unlock(&mdev->state_lock); +} + +static int mlx4_en_init_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) +{ + struct mlx4_en_rx_ring *ring = priv->rx_ring[ring_idx]; + int numa_node = priv->mdev->dev->numa_node; + + if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_set_cpu(cpumask_local_spread(ring_idx, numa_node), + ring->affinity_mask); + return 0; +} + +static void 
mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) +{ + free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask); +} + +static void mlx4_en_init_recycle_ring(struct mlx4_en_priv *priv, + int tx_ring_idx) +{ + struct mlx4_en_tx_ring *tx_ring = priv->tx_ring[TX_XDP][tx_ring_idx]; + int rr_index = tx_ring_idx; + + tx_ring->free_tx_desc = mlx4_en_recycle_tx_desc; + tx_ring->recycle_ring = priv->rx_ring[rr_index]; + en_dbg(DRV, priv, "Set tx_ring[%d][%d]->recycle_ring = rx_ring[%d]\n", + TX_XDP, tx_ring_idx, rr_index); +} + +int mlx4_en_start_port(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_cq *cq; + struct mlx4_en_tx_ring *tx_ring; + int rx_index = 0; + int err = 0; + int i, t; + int j; + u8 mc_list[16] = {0}; + + if (priv->port_up) { + en_dbg(DRV, priv, "start port called while port already up\n"); + return 0; + } + + INIT_LIST_HEAD(&priv->mc_list); + INIT_LIST_HEAD(&priv->curr_list); + INIT_LIST_HEAD(&priv->ethtool_list); + memset(&priv->ethtool_rules[0], 0, + sizeof(struct ethtool_flow_id) * MAX_NUM_OF_FS_RULES); + + /* Calculate Rx buf size */ + dev->mtu = min(dev->mtu, priv->max_mtu); + mlx4_en_calc_rx_buf(dev); + en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_skb_size); + + /* Configure rx cq's and rings */ + err = mlx4_en_activate_rx_rings(priv); + if (err) { + en_err(priv, "Failed to activate RX rings\n"); + return err; + } + for (i = 0; i < priv->rx_ring_num; i++) { + cq = priv->rx_cq[i]; + + err = mlx4_en_init_affinity_hint(priv, i); + if (err) { + en_err(priv, "Failed preparing IRQ affinity hint\n"); + goto cq_err; + } + + err = mlx4_en_activate_cq(priv, cq, i); + if (err) { + en_err(priv, "Failed activating Rx CQ\n"); + mlx4_en_free_affinity_hint(priv, i); + goto cq_err; + } + + for (j = 0; j < cq->size; j++) { + struct mlx4_cqe *cqe = NULL; + + cqe = mlx4_en_get_cqe(cq->buf, j, priv->cqe_size) + + priv->cqe_factor; + cqe->owner_sr_opcode = MLX4_CQE_OWNER_MASK; + } + + err = mlx4_en_set_cq_moder(priv, cq); + if (err) { + en_err(priv, "Failed setting cq moderation parameters\n"); + mlx4_en_deactivate_cq(priv, cq); + mlx4_en_free_affinity_hint(priv, i); + goto cq_err; + } + mlx4_en_arm_cq(priv, cq); + priv->rx_ring[i]->cqn = cq->mcq.cqn; + ++rx_index; + } + + /* Set qp number */ + en_dbg(DRV, priv, "Getting qp number for port %d\n", priv->port); + err = mlx4_en_get_qp(priv); + if (err) { + en_err(priv, "Failed getting eth qp\n"); + goto cq_err; + } + mdev->mac_removed[priv->port] = 0; + + priv->counter_index = + mlx4_get_default_counter_index(mdev->dev, priv->port); + + err = mlx4_en_config_rss_steer(priv); + if (err) { + en_err(priv, "Failed configuring rss steering\n"); + goto mac_err; + } + + err = mlx4_en_create_drop_qp(priv); + if (err) + goto rss_err; + + /* Configure tx cq's and rings */ + for (t = 0 ; t < MLX4_EN_NUM_TX_TYPES; t++) { + u8 num_tx_rings_p_up = t == TX ? 
+ priv->num_tx_rings_p_up : priv->tx_ring_num[t]; + + for (i = 0; i < priv->tx_ring_num[t]; i++) { + /* Configure cq */ + cq = priv->tx_cq[t][i]; + err = mlx4_en_activate_cq(priv, cq, i); + if (err) { + en_err(priv, "Failed allocating Tx CQ\n"); + goto tx_err; + } + err = mlx4_en_set_cq_moder(priv, cq); + if (err) { + en_err(priv, "Failed setting cq moderation parameters\n"); + mlx4_en_deactivate_cq(priv, cq); + goto tx_err; + } + en_dbg(DRV, priv, + "Resetting index of collapsed CQ:%d to -1\n", i); + cq->buf->wqe_index = cpu_to_be16(0xffff); + + /* Configure ring */ + tx_ring = priv->tx_ring[t][i]; + err = mlx4_en_activate_tx_ring(priv, tx_ring, + cq->mcq.cqn, + i / num_tx_rings_p_up); + if (err) { + en_err(priv, "Failed allocating Tx ring\n"); + mlx4_en_deactivate_cq(priv, cq); + goto tx_err; + } + if (t != TX_XDP) { + tx_ring->tx_queue = netdev_get_tx_queue(dev, i); + tx_ring->recycle_ring = NULL; + + /* Arm CQ for TX completions */ + mlx4_en_arm_cq(priv, cq); + + } else { + mlx4_en_init_tx_xdp_ring_descs(priv, tx_ring); + mlx4_en_init_recycle_ring(priv, i); + /* XDP TX CQ should never be armed */ + } + + /* Set initial ownership of all Tx TXBBs to SW (1) */ + for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE) + *((u32 *)(tx_ring->buf + j)) = 0xffffffff; + } + } + + /* Configure port */ + err = mlx4_SET_PORT_general(mdev->dev, priv->port, + priv->rx_skb_size + ETH_FCS_LEN, + priv->prof->tx_pause, + priv->prof->tx_ppp, + priv->prof->rx_pause, + priv->prof->rx_ppp); + if (err) { + en_err(priv, "Failed setting port general configurations for port %d, with error %d\n", + priv->port, err); + goto tx_err; + } + + err = mlx4_SET_PORT_user_mtu(mdev->dev, priv->port, dev->mtu); + if (err) { + en_err(priv, "Failed to pass user MTU(%d) to Firmware for port %d, with error %d\n", + dev->mtu, priv->port, err); + goto tx_err; + } + + /* Set default qp number */ + err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0); + if (err) { + en_err(priv, "Failed setting default qp numbers\n"); + goto tx_err; + } + + if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + err = mlx4_SET_PORT_VXLAN(mdev->dev, priv->port, VXLAN_STEER_BY_OUTER_MAC, 1); + if (err) { + en_err(priv, "Failed setting port L2 tunnel configuration, err %d\n", + err); + goto tx_err; + } + } + + /* Init port */ + en_dbg(HW, priv, "Initializing port\n"); + err = mlx4_INIT_PORT(mdev->dev, priv->port); + if (err) { + en_err(priv, "Failed Initializing port\n"); + goto tx_err; + } + + /* Set Unicast and VXLAN steering rules */ + if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0 && + mlx4_en_set_rss_steer_rules(priv)) + mlx4_warn(mdev, "Failed setting steering rules\n"); + + /* Attach rx QP to bradcast address */ + eth_broadcast_addr(&mc_list[10]); + mc_list[5] = priv->port; /* needed for B0 steering support */ + if (mlx4_multicast_attach(mdev->dev, priv->rss_map.indir_qp, mc_list, + priv->port, 0, MLX4_PROT_ETH, + &priv->broadcast_id)) + mlx4_warn(mdev, "Failed Attaching Broadcast\n"); + + /* Must redo promiscuous mode setup. 
*/ + priv->flags &= ~(MLX4_EN_FLAG_PROMISC | MLX4_EN_FLAG_MC_PROMISC); + + /* Schedule multicast task to populate multicast list */ + queue_work(mdev->workqueue, &priv->rx_mode_task); + + if (priv->mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + udp_tunnel_get_rx_info(dev); + + priv->port_up = true; + + /* Process all completions if exist to prevent + * the queues freezing if they are full + */ + for (i = 0; i < priv->rx_ring_num; i++) { + local_bh_disable(); + napi_schedule(&priv->rx_cq[i]->napi); + local_bh_enable(); + } + + netif_tx_start_all_queues(dev); + netif_device_attach(dev); + + return 0; + +tx_err: + if (t == MLX4_EN_NUM_TX_TYPES) { + t--; + i = priv->tx_ring_num[t]; + } + while (t >= 0) { + while (i--) { + mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[t][i]); + mlx4_en_deactivate_cq(priv, priv->tx_cq[t][i]); + } + if (!t--) + break; + i = priv->tx_ring_num[t]; + } + mlx4_en_destroy_drop_qp(priv); +rss_err: + mlx4_en_release_rss_steer(priv); +mac_err: + mlx4_en_put_qp(priv); +cq_err: + while (rx_index--) { + mlx4_en_deactivate_cq(priv, priv->rx_cq[rx_index]); + mlx4_en_free_affinity_hint(priv, rx_index); + } + for (i = 0; i < priv->rx_ring_num; i++) + mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]); + + return err; /* need to close devices */ +} + + +void mlx4_en_stop_port(struct net_device *dev, int detach) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_mc_list *mclist, *tmp; + struct ethtool_flow_id *flow, *tmp_flow; + int i, t; + u8 mc_list[16] = {0}; + + if (!priv->port_up) { + en_dbg(DRV, priv, "stop port called while port already down\n"); + return; + } + + /* close port*/ + mlx4_CLOSE_PORT(mdev->dev, priv->port); + + /* Synchronize with tx routine */ + netif_tx_lock_bh(dev); + if (detach) + netif_device_detach(dev); + netif_tx_stop_all_queues(dev); + netif_tx_unlock_bh(dev); + + netif_tx_disable(dev); + + spin_lock_bh(&priv->stats_lock); + mlx4_en_fold_software_stats(dev); + /* Set port as not active */ + priv->port_up = false; + spin_unlock_bh(&priv->stats_lock); + + priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev); + + /* Promsicuous mode */ + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + priv->flags &= ~(MLX4_EN_FLAG_PROMISC | + MLX4_EN_FLAG_MC_PROMISC); + mlx4_flow_steer_promisc_remove(mdev->dev, + priv->port, + MLX4_FS_ALL_DEFAULT); + mlx4_flow_steer_promisc_remove(mdev->dev, + priv->port, + MLX4_FS_MC_DEFAULT); + } else if (priv->flags & MLX4_EN_FLAG_PROMISC) { + priv->flags &= ~MLX4_EN_FLAG_PROMISC; + + /* Disable promiscouos mode */ + mlx4_unicast_promisc_remove(mdev->dev, priv->base_qpn, + priv->port); + + /* Disable Multicast promisc */ + if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { + mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn, + priv->port); + priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; + } + } + + /* Detach All multicasts */ + eth_broadcast_addr(&mc_list[10]); + mc_list[5] = priv->port; /* needed for B0 steering support */ + mlx4_multicast_detach(mdev->dev, priv->rss_map.indir_qp, mc_list, + MLX4_PROT_ETH, priv->broadcast_id); + list_for_each_entry(mclist, &priv->curr_list, list) { + memcpy(&mc_list[10], mclist->addr, ETH_ALEN); + mc_list[5] = priv->port; + mlx4_multicast_detach(mdev->dev, priv->rss_map.indir_qp, + mc_list, MLX4_PROT_ETH, mclist->reg_id); + if (mclist->tunnel_reg_id) + mlx4_flow_detach(mdev->dev, mclist->tunnel_reg_id); + } + mlx4_en_clear_list(dev); + list_for_each_entry_safe(mclist, tmp, 
&priv->curr_list, list) { + list_del(&mclist->list); + kfree(mclist); + } + + /* Flush multicast filter */ + mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 1, MLX4_MCAST_CONFIG); + + /* Remove flow steering rules for the port*/ + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + ASSERT_RTNL(); + list_for_each_entry_safe(flow, tmp_flow, + &priv->ethtool_list, list) { + mlx4_flow_detach(mdev->dev, flow->id); + list_del(&flow->list); + } + } + + mlx4_en_destroy_drop_qp(priv); + + /* Free TX Rings */ + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) { + for (i = 0; i < priv->tx_ring_num[t]; i++) { + mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[t][i]); + mlx4_en_deactivate_cq(priv, priv->tx_cq[t][i]); + } + } + msleep(10); + + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) + for (i = 0; i < priv->tx_ring_num[t]; i++) + mlx4_en_free_tx_buf(dev, priv->tx_ring[t][i]); + + if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0) + mlx4_en_delete_rss_steer_rules(priv); + + /* Free RSS qps */ + mlx4_en_release_rss_steer(priv); + + /* Unregister Mac address for the port */ + mlx4_en_put_qp(priv); + if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN)) + mdev->mac_removed[priv->port] = 1; + + /* Free RX Rings */ + for (i = 0; i < priv->rx_ring_num; i++) { + struct mlx4_en_cq *cq = priv->rx_cq[i]; + + napi_synchronize(&cq->napi); + mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]); + mlx4_en_deactivate_cq(priv, cq); + + mlx4_en_free_affinity_hint(priv, i); + } +} + +static void mlx4_en_restart(struct work_struct *work) +{ + struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, + watchdog_task); + struct mlx4_en_dev *mdev = priv->mdev; + struct net_device *dev = priv->dev; + + en_dbg(DRV, priv, "Watchdog task called for port %d\n", priv->port); + + rtnl_lock(); + mutex_lock(&mdev->state_lock); + if (priv->port_up) { + mlx4_en_stop_port(dev, 1); + if (mlx4_en_start_port(dev)) + en_err(priv, "Failed restarting port %d\n", priv->port); + } + mutex_unlock(&mdev->state_lock); + rtnl_unlock(); +} + +static void mlx4_en_clear_stats(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_tx_ring **tx_ring; + int i; + + if (!mlx4_is_slave(mdev->dev)) + if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1)) + en_dbg(HW, priv, "Failed dumping statistics\n"); + + memset(&priv->pstats, 0, sizeof(priv->pstats)); + memset(&priv->pkstats, 0, sizeof(priv->pkstats)); + memset(&priv->port_stats, 0, sizeof(priv->port_stats)); + memset(&priv->rx_flowstats, 0, sizeof(priv->rx_flowstats)); + memset(&priv->tx_flowstats, 0, sizeof(priv->tx_flowstats)); + memset(&priv->rx_priority_flowstats, 0, + sizeof(priv->rx_priority_flowstats)); + memset(&priv->tx_priority_flowstats, 0, + sizeof(priv->tx_priority_flowstats)); + memset(&priv->pf_stats, 0, sizeof(priv->pf_stats)); + + tx_ring = priv->tx_ring[TX]; + for (i = 0; i < priv->tx_ring_num[TX]; i++) { + tx_ring[i]->bytes = 0; + tx_ring[i]->packets = 0; + tx_ring[i]->tx_csum = 0; + tx_ring[i]->tx_dropped = 0; + tx_ring[i]->queue_stopped = 0; + tx_ring[i]->wake_queue = 0; + tx_ring[i]->tso_packets = 0; + tx_ring[i]->xmit_more = 0; + } + for (i = 0; i < priv->rx_ring_num; i++) { + priv->rx_ring[i]->bytes = 0; + priv->rx_ring[i]->packets = 0; + priv->rx_ring[i]->csum_ok = 0; + priv->rx_ring[i]->csum_none = 0; + priv->rx_ring[i]->csum_complete = 0; + } +} + +static int mlx4_en_open(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct 
mlx4_en_dev *mdev = priv->mdev; + int err = 0; + + mutex_lock(&mdev->state_lock); + + if (!mdev->device_up) { + en_err(priv, "Cannot open - device down/disabled\n"); + err = -EBUSY; + goto out; + } + + /* Reset HW statistics and SW counters */ + mlx4_en_clear_stats(dev); + + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port:%d\n", priv->port); + +out: + mutex_unlock(&mdev->state_lock); + return err; +} + + +static int mlx4_en_close(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + en_dbg(IFDOWN, priv, "Close port called\n"); + + mutex_lock(&mdev->state_lock); + + mlx4_en_stop_port(dev, 0); + netif_carrier_off(dev); + + mutex_unlock(&mdev->state_lock); + return 0; +} + +static void mlx4_en_free_resources(struct mlx4_en_priv *priv) +{ + int i, t; + +#ifdef CONFIG_RFS_ACCEL + priv->dev->rx_cpu_rmap = NULL; +#endif + + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) { + for (i = 0; i < priv->tx_ring_num[t]; i++) { + if (priv->tx_ring[t] && priv->tx_ring[t][i]) + mlx4_en_destroy_tx_ring(priv, + &priv->tx_ring[t][i]); + if (priv->tx_cq[t] && priv->tx_cq[t][i]) + mlx4_en_destroy_cq(priv, &priv->tx_cq[t][i]); + } + kfree(priv->tx_ring[t]); + kfree(priv->tx_cq[t]); + } + + for (i = 0; i < priv->rx_ring_num; i++) { + if (priv->rx_ring[i]) + mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i], + priv->prof->rx_ring_size, priv->stride); + if (priv->rx_cq[i]) + mlx4_en_destroy_cq(priv, &priv->rx_cq[i]); + } + +} + +static int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) +{ + struct mlx4_en_port_profile *prof = priv->prof; + int i, t; + int node; + + /* Create tx Rings */ + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) { + for (i = 0; i < priv->tx_ring_num[t]; i++) { + node = cpu_to_node(i % num_online_cpus()); + if (mlx4_en_create_cq(priv, &priv->tx_cq[t][i], + prof->tx_ring_size, i, t, node)) + goto err; + + if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[t][i], + prof->tx_ring_size, + TXBB_SIZE, node, i)) + goto err; + } + } + + /* Create rx Rings */ + for (i = 0; i < priv->rx_ring_num; i++) { + node = cpu_to_node(i % num_online_cpus()); + if (mlx4_en_create_cq(priv, &priv->rx_cq[i], + prof->rx_ring_size, i, RX, node)) + goto err; + + if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i], + prof->rx_ring_size, priv->stride, + node, i)) + goto err; + + } + +#ifdef CONFIG_RFS_ACCEL + priv->dev->rx_cpu_rmap = mlx4_get_cpu_rmap(priv->mdev->dev, priv->port); +#endif + + return 0; + +err: + en_err(priv, "Failed to allocate NIC resources\n"); + for (i = 0; i < priv->rx_ring_num; i++) { + if (priv->rx_ring[i]) + mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i], + prof->rx_ring_size, + priv->stride); + if (priv->rx_cq[i]) + mlx4_en_destroy_cq(priv, &priv->rx_cq[i]); + } + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) { + for (i = 0; i < priv->tx_ring_num[t]; i++) { + if (priv->tx_ring[t][i]) + mlx4_en_destroy_tx_ring(priv, + &priv->tx_ring[t][i]); + if (priv->tx_cq[t][i]) + mlx4_en_destroy_cq(priv, &priv->tx_cq[t][i]); + } + } + return -ENOMEM; +} + + +static int mlx4_en_copy_priv(struct mlx4_en_priv *dst, + struct mlx4_en_priv *src, + struct mlx4_en_port_profile *prof) +{ + int t; + + memcpy(&dst->hwtstamp_config, &prof->hwtstamp_config, + sizeof(dst->hwtstamp_config)); + dst->num_tx_rings_p_up = prof->num_tx_rings_p_up; + dst->rx_ring_num = prof->rx_ring_num; + dst->flags = prof->flags; + dst->mdev = src->mdev; + dst->port = src->port; + dst->dev = src->dev; + dst->prof = prof; + dst->stride = 
roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + + DS_SIZE * MLX4_EN_MAX_RX_FRAGS); + + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) { + dst->tx_ring_num[t] = prof->tx_ring_num[t]; + if (!dst->tx_ring_num[t]) + continue; + + dst->tx_ring[t] = kzalloc(sizeof(struct mlx4_en_tx_ring *) * + MAX_TX_RINGS, GFP_KERNEL); + if (!dst->tx_ring[t]) + goto err_free_tx; + + dst->tx_cq[t] = kzalloc(sizeof(struct mlx4_en_cq *) * + MAX_TX_RINGS, GFP_KERNEL); + if (!dst->tx_cq[t]) { + kfree(dst->tx_ring[t]); + goto err_free_tx; + } + } + + return 0; + +err_free_tx: + while (t--) { + kfree(dst->tx_ring[t]); + kfree(dst->tx_cq[t]); + } + return -ENOMEM; +} + +static void mlx4_en_update_priv(struct mlx4_en_priv *dst, + struct mlx4_en_priv *src) +{ + int t; + memcpy(dst->rx_ring, src->rx_ring, + sizeof(struct mlx4_en_rx_ring *) * src->rx_ring_num); + memcpy(dst->rx_cq, src->rx_cq, + sizeof(struct mlx4_en_cq *) * src->rx_ring_num); + memcpy(&dst->hwtstamp_config, &src->hwtstamp_config, + sizeof(dst->hwtstamp_config)); + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) { + dst->tx_ring_num[t] = src->tx_ring_num[t]; + dst->tx_ring[t] = src->tx_ring[t]; + dst->tx_cq[t] = src->tx_cq[t]; + } + dst->num_tx_rings_p_up = src->num_tx_rings_p_up; + dst->rx_ring_num = src->rx_ring_num; + memcpy(dst->prof, src->prof, sizeof(struct mlx4_en_port_profile)); +} + +int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv, + struct mlx4_en_priv *tmp, + struct mlx4_en_port_profile *prof, + bool carry_xdp_prog) +{ + struct bpf_prog *xdp_prog; + int i, t; + + mlx4_en_copy_priv(tmp, priv, prof); + + if (mlx4_en_alloc_resources(tmp)) { + en_warn(priv, + "%s: Resource allocation failed, using previous configuration\n", + __func__); + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) { + kfree(tmp->tx_ring[t]); + kfree(tmp->tx_cq[t]); + } + return -ENOMEM; + } + + /* All rx_rings has the same xdp_prog. Pick the first one. 
*/ + xdp_prog = rcu_dereference_protected( + priv->rx_ring[0]->xdp_prog, + lockdep_is_held(&priv->mdev->state_lock)); + + if (xdp_prog && carry_xdp_prog) { + xdp_prog = bpf_prog_add(xdp_prog, tmp->rx_ring_num); + if (IS_ERR(xdp_prog)) { + mlx4_en_free_resources(tmp); + return PTR_ERR(xdp_prog); + } + for (i = 0; i < tmp->rx_ring_num; i++) + rcu_assign_pointer(tmp->rx_ring[i]->xdp_prog, + xdp_prog); + } + + return 0; +} + +void mlx4_en_safe_replace_resources(struct mlx4_en_priv *priv, + struct mlx4_en_priv *tmp) +{ + mlx4_en_free_resources(priv); + mlx4_en_update_priv(priv, tmp); +} + +void mlx4_en_destroy_netdev(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port); + + /* Unregister device - this will close the port if it was up */ + if (priv->registered) { + devlink_port_type_clear(mlx4_get_devlink_port(mdev->dev, + priv->port)); + unregister_netdev(dev); + } + + if (priv->allocated) + mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE); + + cancel_delayed_work(&priv->stats_task); + cancel_delayed_work(&priv->service_task); + /* flush any pending task for this netdev */ + flush_workqueue(mdev->workqueue); + + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) + mlx4_en_remove_timestamp(mdev); + + /* Detach the netdev so tasks would not attempt to access it */ + mutex_lock(&mdev->state_lock); + mdev->pndev[priv->port] = NULL; + mdev->upper[priv->port] = NULL; + +#ifdef CONFIG_RFS_ACCEL + mlx4_en_cleanup_filters(priv); +#endif + + mlx4_en_free_resources(priv); + mutex_unlock(&mdev->state_lock); + + free_netdev(dev); +} + +static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (mtu > MLX4_EN_MAX_XDP_MTU) { + en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n", + mtu, MLX4_EN_MAX_XDP_MTU); + return false; + } + + return true; +} + +static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + + en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n", + dev->mtu, new_mtu); + + if (priv->tx_ring_num[TX_XDP] && + !mlx4_en_check_xdp_mtu(dev, new_mtu)) + return -EOPNOTSUPP; + + dev->mtu = new_mtu; + + if (netif_running(dev)) { + mutex_lock(&mdev->state_lock); + if (!mdev->device_up) { + /* NIC is probably restarting - let watchdog task reset + * the port */ + en_dbg(DRV, priv, "Change MTU called with card down!?\n"); + } else { + mlx4_en_stop_port(dev, 1); + err = mlx4_en_start_port(dev); + if (err) { + en_err(priv, "Failed restarting port:%d\n", + priv->port); + queue_work(mdev->workqueue, &priv->watchdog_task); + } + } + mutex_unlock(&mdev->state_lock); + } + return 0; +} + +static int mlx4_en_hwtstamp_set(struct net_device *dev, struct ifreq *ifr) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct hwtstamp_config config; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + /* reserved for future extensions */ + if (config.flags) + return -EINVAL; + + /* device doesn't support time stamping */ + if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)) + return -EINVAL; + + /* TX HW timestamp */ + switch (config.tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + break; + default: + return -ERANGE; + } + + /* RX HW timestamp */ + switch (config.rx_filter) { + case 
HWTSTAMP_FILTER_NONE: + break; + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: + config.rx_filter = HWTSTAMP_FILTER_ALL; + break; + default: + return -ERANGE; + } + + if (mlx4_en_reset_config(dev, config, dev->features)) { + config.tx_type = HWTSTAMP_TX_OFF; + config.rx_filter = HWTSTAMP_FILTER_NONE; + } + + return copy_to_user(ifr->ifr_data, &config, + sizeof(config)) ? -EFAULT : 0; +} + +static int mlx4_en_hwtstamp_get(struct net_device *dev, struct ifreq *ifr) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + return copy_to_user(ifr->ifr_data, &priv->hwtstamp_config, + sizeof(priv->hwtstamp_config)) ? -EFAULT : 0; +} + +static int mlx4_en_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + switch (cmd) { + case SIOCSHWTSTAMP: + return mlx4_en_hwtstamp_set(dev, ifr); + case SIOCGHWTSTAMP: + return mlx4_en_hwtstamp_get(dev, ifr); + default: + return -EOPNOTSUPP; + } +} + +static netdev_features_t mlx4_en_fix_features(struct net_device *netdev, + netdev_features_t features) +{ + struct mlx4_en_priv *en_priv = netdev_priv(netdev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + /* Since there is no support for separate RX C-TAG/S-TAG vlan accel + * enable/disable make sure S-TAG flag is always in same state as + * C-TAG. + */ + if (features & NETIF_F_HW_VLAN_CTAG_RX && + !(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) + features |= NETIF_F_HW_VLAN_STAG_RX; + else + features &= ~NETIF_F_HW_VLAN_STAG_RX; + + return features; +} + +static int mlx4_en_set_features(struct net_device *netdev, + netdev_features_t features) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + bool reset = false; + int ret = 0; + + if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_RXFCS)) { + en_info(priv, "Turn %s RX-FCS\n", + (features & NETIF_F_RXFCS) ? "ON" : "OFF"); + reset = true; + } + + if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_RXALL)) { + u8 ignore_fcs_value = (features & NETIF_F_RXALL) ? 1 : 0; + + en_info(priv, "Turn %s RX-ALL\n", + ignore_fcs_value ? "ON" : "OFF"); + ret = mlx4_SET_PORT_fcs_check(priv->mdev->dev, + priv->port, ignore_fcs_value); + if (ret) + return ret; + } + + if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_CTAG_RX)) { + en_info(priv, "Turn %s RX vlan strip offload\n", + (features & NETIF_F_HW_VLAN_CTAG_RX) ? "ON" : "OFF"); + reset = true; + } + + if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_CTAG_TX)) + en_info(priv, "Turn %s TX vlan strip offload\n", + (features & NETIF_F_HW_VLAN_CTAG_TX) ? "ON" : "OFF"); + + if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_STAG_TX)) + en_info(priv, "Turn %s TX S-VLAN strip offload\n", + (features & NETIF_F_HW_VLAN_STAG_TX) ? "ON" : "OFF"); + + if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_LOOPBACK)) { + en_info(priv, "Turn %s loopback\n", + (features & NETIF_F_LOOPBACK) ? 
"ON" : "OFF"); + mlx4_en_update_loopback_state(netdev, features); + } + + if (reset) { + ret = mlx4_en_reset_config(netdev, priv->hwtstamp_config, + features); + if (ret) + return ret; + } + + return 0; +} + +static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac); +} + +static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos, + vlan_proto); +} + +static int mlx4_en_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, + int max_tx_rate) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_set_vf_rate(mdev->dev, en_priv->port, vf, min_tx_rate, + max_tx_rate); +} + +static int mlx4_en_set_vf_spoofchk(struct net_device *dev, int vf, bool setting) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_set_vf_spoofchk(mdev->dev, en_priv->port, vf, setting); +} + +static int mlx4_en_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivf) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_get_vf_config(mdev->dev, en_priv->port, vf, ivf); +} + +static int mlx4_en_set_vf_link_state(struct net_device *dev, int vf, int link_state) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_set_vf_link_state(mdev->dev, en_priv->port, vf, link_state); +} + +static int mlx4_en_get_vf_stats(struct net_device *dev, int vf, + struct ifla_vf_stats *vf_stats) +{ + struct mlx4_en_priv *en_priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + return mlx4_get_vf_stats(mdev->dev, en_priv->port, vf, vf_stats); +} + +#define PORT_ID_BYTE_LEN 8 +static int mlx4_en_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_dev *mdev = priv->mdev->dev; + int i; + u64 phys_port_id = mdev->caps.phys_port_id[priv->port]; + + if (!phys_port_id) + return -EOPNOTSUPP; + + ppid->id_len = sizeof(phys_port_id); + for (i = PORT_ID_BYTE_LEN - 1; i >= 0; --i) { + ppid->id[i] = phys_port_id & 0xff; + phys_port_id >>= 8; + } + return 0; +} + +static void mlx4_en_add_vxlan_offloads(struct work_struct *work) +{ + int ret; + struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, + vxlan_add_task); + + ret = mlx4_config_vxlan_port(priv->mdev->dev, priv->vxlan_port); + if (ret) + goto out; + + ret = mlx4_SET_PORT_VXLAN(priv->mdev->dev, priv->port, + VXLAN_STEER_BY_OUTER_MAC, 1); +out: + if (ret) { + en_err(priv, "failed setting L2 tunnel configuration ret %d\n", ret); + return; + } + + /* set offloads */ + priv->dev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | + NETIF_F_RXCSUM | + NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM | + NETIF_F_GSO_PARTIAL; +} + +static void mlx4_en_del_vxlan_offloads(struct work_struct *work) +{ + int ret; + struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, + vxlan_del_task); + /* unset offloads */ + priv->dev->hw_enc_features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | + 
NETIF_F_RXCSUM | + NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM | + NETIF_F_GSO_PARTIAL); + + ret = mlx4_SET_PORT_VXLAN(priv->mdev->dev, priv->port, + VXLAN_STEER_BY_OUTER_MAC, 0); + if (ret) + en_err(priv, "failed setting L2 tunnel configuration ret %d\n", ret); + + priv->vxlan_port = 0; +} + +static void mlx4_en_add_vxlan_port(struct net_device *dev, + struct udp_tunnel_info *ti) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + __be16 port = ti->port; + __be16 current_port; + + if (ti->type != UDP_TUNNEL_TYPE_VXLAN) + return; + + if (ti->sa_family != AF_INET) + return; + + if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + return; + + current_port = priv->vxlan_port; + if (current_port && current_port != port) { + en_warn(priv, "vxlan port %d configured, can't add port %d\n", + ntohs(current_port), ntohs(port)); + return; + } + + priv->vxlan_port = port; + queue_work(priv->mdev->workqueue, &priv->vxlan_add_task); +} + +static void mlx4_en_del_vxlan_port(struct net_device *dev, + struct udp_tunnel_info *ti) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + __be16 port = ti->port; + __be16 current_port; + + if (ti->type != UDP_TUNNEL_TYPE_VXLAN) + return; + + if (ti->sa_family != AF_INET) + return; + + if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + return; + + current_port = priv->vxlan_port; + if (current_port != port) { + en_dbg(DRV, priv, "vxlan port %d isn't configured, ignoring\n", ntohs(port)); + return; + } + + queue_work(priv->mdev->workqueue, &priv->vxlan_del_task); +} + +static netdev_features_t mlx4_en_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + features = vlan_features_check(skb, features); + features = vxlan_features_check(skb, features); + + /* The ConnectX-3 doesn't support outer IPv6 checksums but it does + * support inner IPv6 checksums and segmentation so we need to + * strip that feature if this is an IPv6 encapsulated frame. 
+ */ + if (skb->encapsulation && + (skb->ip_summed == CHECKSUM_PARTIAL)) { + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (!priv->vxlan_port || + (ip_hdr(skb)->version != 4) || + (udp_hdr(skb)->dest != priv->vxlan_port)) + features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); + } + + return features; +} + +static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 maxrate) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_tx_ring *tx_ring = priv->tx_ring[TX][queue_index]; + struct mlx4_update_qp_params params; + int err; + + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT)) + return -EOPNOTSUPP; + + /* rate provided to us in Mbs, check if it fits into 12 bits, if not use Gbs */ + if (maxrate >> 12) { + params.rate_unit = MLX4_QP_RATE_LIMIT_GBS; + params.rate_val = maxrate / 1000; + } else if (maxrate) { + params.rate_unit = MLX4_QP_RATE_LIMIT_MBS; + params.rate_val = maxrate; + } else { /* zero serves to revoke the QP rate-limitation */ + params.rate_unit = 0; + params.rate_val = 0; + } + + err = mlx4_update_qp(priv->mdev->dev, tx_ring->qpn, MLX4_UPDATE_QP_RATE_LIMIT, + ¶ms); + return err; +} + +static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_port_profile new_prof; + struct bpf_prog *old_prog; + struct mlx4_en_priv *tmp; + int tx_changed = 0; + int xdp_ring_num; + int port_up = 0; + int err; + int i; + + xdp_ring_num = prog ? priv->rx_ring_num : 0; + + /* No need to reconfigure buffers when simply swapping the + * program for a new one. + */ + if (priv->tx_ring_num[TX_XDP] == xdp_ring_num) { + if (prog) { + prog = bpf_prog_add(prog, priv->rx_ring_num - 1); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + mutex_lock(&mdev->state_lock); + for (i = 0; i < priv->rx_ring_num; i++) { + old_prog = rcu_dereference_protected( + priv->rx_ring[i]->xdp_prog, + lockdep_is_held(&mdev->state_lock)); + rcu_assign_pointer(priv->rx_ring[i]->xdp_prog, prog); + if (old_prog) + bpf_prog_put(old_prog); + } + mutex_unlock(&mdev->state_lock); + return 0; + } + + if (!mlx4_en_check_xdp_mtu(dev, dev->mtu)) + return -EOPNOTSUPP; + + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + if (prog) { + prog = bpf_prog_add(prog, priv->rx_ring_num - 1); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out; + } + } + + mutex_lock(&mdev->state_lock); + memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile)); + new_prof.tx_ring_num[TX_XDP] = xdp_ring_num; + + if (priv->tx_ring_num[TX] + xdp_ring_num > MAX_TX_RINGS) { + tx_changed = 1; + new_prof.tx_ring_num[TX] = + MAX_TX_RINGS - ALIGN(xdp_ring_num, priv->prof->num_up); + en_warn(priv, "Reducing the number of TX rings, to not exceed the max total rings number.\n"); + } + + err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, false); + if (err) { + if (prog) + bpf_prog_sub(prog, priv->rx_ring_num - 1); + goto unlock_out; + } + + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + mlx4_en_safe_replace_resources(priv, tmp); + if (tx_changed) + netif_set_real_num_tx_queues(dev, priv->tx_ring_num[TX]); + + for (i = 0; i < priv->rx_ring_num; i++) { + old_prog = rcu_dereference_protected( + priv->rx_ring[i]->xdp_prog, + lockdep_is_held(&mdev->state_lock)); + rcu_assign_pointer(priv->rx_ring[i]->xdp_prog, prog); + if (old_prog) + bpf_prog_put(old_prog); + } + + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) { + 
en_err(priv, "Failed starting port %d for XDP change\n", + priv->port); + queue_work(mdev->workqueue, &priv->watchdog_task); + } + } + +unlock_out: + mutex_unlock(&mdev->state_lock); +out: + kfree(tmp); + return err; +} + +static u32 mlx4_xdp_query(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + const struct bpf_prog *xdp_prog; + u32 prog_id = 0; + + if (!priv->tx_ring_num[TX_XDP]) + return prog_id; + + mutex_lock(&mdev->state_lock); + xdp_prog = rcu_dereference_protected( + priv->rx_ring[0]->xdp_prog, + lockdep_is_held(&mdev->state_lock)); + if (xdp_prog) + prog_id = xdp_prog->aux->id; + mutex_unlock(&mdev->state_lock); + + return prog_id; +} + +static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) +{ + switch (xdp->command) { + case XDP_SETUP_PROG: + return mlx4_xdp_set(dev, xdp->prog); + case XDP_QUERY_PROG: + xdp->prog_id = mlx4_xdp_query(dev); + xdp->prog_attached = !!xdp->prog_id; + return 0; + default: + return -EINVAL; + } +} + +static const struct net_device_ops mlx4_netdev_ops = { + .ndo_open = mlx4_en_open, + .ndo_stop = mlx4_en_close, + .ndo_start_xmit = mlx4_en_xmit, + .ndo_select_queue = mlx4_en_select_queue, + .ndo_get_stats64 = mlx4_en_get_stats64, + .ndo_set_rx_mode = mlx4_en_set_rx_mode, + .ndo_set_mac_address = mlx4_en_set_mac, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = mlx4_en_change_mtu, + .ndo_do_ioctl = mlx4_en_ioctl, + .ndo_tx_timeout = mlx4_en_tx_timeout, + .ndo_vlan_rx_add_vid = mlx4_en_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = mlx4_en_vlan_rx_kill_vid, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = mlx4_en_netpoll, +#endif + .ndo_set_features = mlx4_en_set_features, + .ndo_fix_features = mlx4_en_fix_features, + .ndo_setup_tc = __mlx4_en_setup_tc, +#ifdef CONFIG_RFS_ACCEL + .ndo_rx_flow_steer = mlx4_en_filter_rfs, +#endif + .ndo_get_phys_port_id = mlx4_en_get_phys_port_id, + .ndo_udp_tunnel_add = mlx4_en_add_vxlan_port, + .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, + .ndo_features_check = mlx4_en_features_check, + .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, + .ndo_bpf = mlx4_xdp, +}; + +static const struct net_device_ops mlx4_netdev_ops_master = { + .ndo_open = mlx4_en_open, + .ndo_stop = mlx4_en_close, + .ndo_start_xmit = mlx4_en_xmit, + .ndo_select_queue = mlx4_en_select_queue, + .ndo_get_stats64 = mlx4_en_get_stats64, + .ndo_set_rx_mode = mlx4_en_set_rx_mode, + .ndo_set_mac_address = mlx4_en_set_mac, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = mlx4_en_change_mtu, + .ndo_tx_timeout = mlx4_en_tx_timeout, + .ndo_vlan_rx_add_vid = mlx4_en_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = mlx4_en_vlan_rx_kill_vid, + .ndo_set_vf_mac = mlx4_en_set_vf_mac, + .ndo_set_vf_vlan = mlx4_en_set_vf_vlan, + .ndo_set_vf_rate = mlx4_en_set_vf_rate, + .ndo_set_vf_spoofchk = mlx4_en_set_vf_spoofchk, + .ndo_set_vf_link_state = mlx4_en_set_vf_link_state, + .ndo_get_vf_stats = mlx4_en_get_vf_stats, + .ndo_get_vf_config = mlx4_en_get_vf_config, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = mlx4_en_netpoll, +#endif + .ndo_set_features = mlx4_en_set_features, + .ndo_fix_features = mlx4_en_fix_features, + .ndo_setup_tc = __mlx4_en_setup_tc, +#ifdef CONFIG_RFS_ACCEL + .ndo_rx_flow_steer = mlx4_en_filter_rfs, +#endif + .ndo_get_phys_port_id = mlx4_en_get_phys_port_id, + .ndo_udp_tunnel_add = mlx4_en_add_vxlan_port, + .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, + .ndo_features_check = mlx4_en_features_check, + .ndo_set_tx_maxrate = 
mlx4_en_set_tx_maxrate, + .ndo_bpf = mlx4_xdp, +}; + +struct mlx4_en_bond { + struct work_struct work; + struct mlx4_en_priv *priv; + int is_bonded; + struct mlx4_port_map port_map; +}; + +static void mlx4_en_bond_work(struct work_struct *work) +{ + struct mlx4_en_bond *bond = container_of(work, + struct mlx4_en_bond, + work); + int err = 0; + struct mlx4_dev *dev = bond->priv->mdev->dev; + + if (bond->is_bonded) { + if (!mlx4_is_bonded(dev)) { + err = mlx4_bond(dev); + if (err) + en_err(bond->priv, "Fail to bond device\n"); + } + if (!err) { + err = mlx4_port_map_set(dev, &bond->port_map); + if (err) + en_err(bond->priv, "Fail to set port map [%d][%d]: %d\n", + bond->port_map.port1, + bond->port_map.port2, + err); + } + } else if (mlx4_is_bonded(dev)) { + err = mlx4_unbond(dev); + if (err) + en_err(bond->priv, "Fail to unbond device\n"); + } + dev_put(bond->priv->dev); + kfree(bond); +} + +static int mlx4_en_queue_bond_work(struct mlx4_en_priv *priv, int is_bonded, + u8 v2p_p1, u8 v2p_p2) +{ + struct mlx4_en_bond *bond = NULL; + + bond = kzalloc(sizeof(*bond), GFP_ATOMIC); + if (!bond) + return -ENOMEM; + + INIT_WORK(&bond->work, mlx4_en_bond_work); + bond->priv = priv; + bond->is_bonded = is_bonded; + bond->port_map.port1 = v2p_p1; + bond->port_map.port2 = v2p_p2; + dev_hold(priv->dev); + queue_work(priv->mdev->workqueue, &bond->work); + return 0; +} + +int mlx4_en_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + u8 port = 0; + struct mlx4_en_dev *mdev; + struct mlx4_dev *dev; + int i, num_eth_ports = 0; + bool do_bond = true; + struct mlx4_en_priv *priv; + u8 v2p_port1 = 0; + u8 v2p_port2 = 0; + + if (!net_eq(dev_net(ndev), &init_net)) + return NOTIFY_DONE; + + mdev = container_of(this, struct mlx4_en_dev, nb); + dev = mdev->dev; + + /* Go into this mode only when two network devices set on two ports + * of the same mlx4 device are slaves of the same bonding master + */ + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { + ++num_eth_ports; + if (!port && (mdev->pndev[i] == ndev)) + port = i; + mdev->upper[i] = mdev->pndev[i] ? + netdev_master_upper_dev_get(mdev->pndev[i]) : NULL; + /* condition not met: network device is a slave */ + if (!mdev->upper[i]) + do_bond = false; + if (num_eth_ports < 2) + continue; + /* condition not met: same master */ + if (mdev->upper[i] != mdev->upper[i-1]) + do_bond = false; + } + /* condition not met: 2 salves */ + do_bond = (num_eth_ports == 2) ? 
do_bond : false; + + /* handle only events that come with enough info */ + if ((do_bond && (event != NETDEV_BONDING_INFO)) || !port) + return NOTIFY_DONE; + + priv = netdev_priv(ndev); + if (do_bond) { + struct netdev_notifier_bonding_info *notifier_info = ptr; + struct netdev_bonding_info *bonding_info = + ¬ifier_info->bonding_info; + + /* required mode 1, 2 or 4 */ + if ((bonding_info->master.bond_mode != BOND_MODE_ACTIVEBACKUP) && + (bonding_info->master.bond_mode != BOND_MODE_XOR) && + (bonding_info->master.bond_mode != BOND_MODE_8023AD)) + do_bond = false; + + /* require exactly 2 slaves */ + if (bonding_info->master.num_slaves != 2) + do_bond = false; + + /* calc v2p */ + if (do_bond) { + if (bonding_info->master.bond_mode == + BOND_MODE_ACTIVEBACKUP) { + /* in active-backup mode virtual ports are + * mapped to the physical port of the active + * slave */ + if (bonding_info->slave.state == + BOND_STATE_BACKUP) { + if (port == 1) { + v2p_port1 = 2; + v2p_port2 = 2; + } else { + v2p_port1 = 1; + v2p_port2 = 1; + } + } else { /* BOND_STATE_ACTIVE */ + if (port == 1) { + v2p_port1 = 1; + v2p_port2 = 1; + } else { + v2p_port1 = 2; + v2p_port2 = 2; + } + } + } else { /* Active-Active */ + /* in active-active mode a virtual port is + * mapped to the native physical port if and only + * if the physical port is up */ + __s8 link = bonding_info->slave.link; + + if (port == 1) + v2p_port2 = 2; + else + v2p_port1 = 1; + if ((link == BOND_LINK_UP) || + (link == BOND_LINK_FAIL)) { + if (port == 1) + v2p_port1 = 1; + else + v2p_port2 = 2; + } else { /* BOND_LINK_DOWN || BOND_LINK_BACK */ + if (port == 1) + v2p_port1 = 2; + else + v2p_port2 = 1; + } + } + } + } + + mlx4_en_queue_bond_work(priv, do_bond, + v2p_port1, v2p_port2); + + return NOTIFY_DONE; +} + +void mlx4_en_update_pfc_stats_bitmap(struct mlx4_dev *dev, + struct mlx4_en_stats_bitmap *stats_bitmap, + u8 rx_ppp, u8 rx_pause, + u8 tx_ppp, u8 tx_pause) +{ + int last_i = NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PF_STATS; + + if (!mlx4_is_slave(dev) && + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN)) { + mutex_lock(&stats_bitmap->mutex); + bitmap_clear(stats_bitmap->bitmap, last_i, NUM_FLOW_STATS); + + if (rx_ppp) + bitmap_set(stats_bitmap->bitmap, last_i, + NUM_FLOW_PRIORITY_STATS_RX); + last_i += NUM_FLOW_PRIORITY_STATS_RX; + + if (rx_pause && !(rx_ppp)) + bitmap_set(stats_bitmap->bitmap, last_i, + NUM_FLOW_STATS_RX); + last_i += NUM_FLOW_STATS_RX; + + if (tx_ppp) + bitmap_set(stats_bitmap->bitmap, last_i, + NUM_FLOW_PRIORITY_STATS_TX); + last_i += NUM_FLOW_PRIORITY_STATS_TX; + + if (tx_pause && !(tx_ppp)) + bitmap_set(stats_bitmap->bitmap, last_i, + NUM_FLOW_STATS_TX); + last_i += NUM_FLOW_STATS_TX; + + mutex_unlock(&stats_bitmap->mutex); + } +} + +void mlx4_en_set_stats_bitmap(struct mlx4_dev *dev, + struct mlx4_en_stats_bitmap *stats_bitmap, + u8 rx_ppp, u8 rx_pause, + u8 tx_ppp, u8 tx_pause) +{ + int last_i = 0; + + mutex_init(&stats_bitmap->mutex); + bitmap_zero(stats_bitmap->bitmap, NUM_ALL_STATS); + + if (mlx4_is_slave(dev)) { + bitmap_set(stats_bitmap->bitmap, last_i + + MLX4_FIND_NETDEV_STAT(rx_packets), 1); + bitmap_set(stats_bitmap->bitmap, last_i + + MLX4_FIND_NETDEV_STAT(tx_packets), 1); + bitmap_set(stats_bitmap->bitmap, last_i + + MLX4_FIND_NETDEV_STAT(rx_bytes), 1); + bitmap_set(stats_bitmap->bitmap, last_i + + MLX4_FIND_NETDEV_STAT(tx_bytes), 1); + bitmap_set(stats_bitmap->bitmap, last_i + + MLX4_FIND_NETDEV_STAT(rx_dropped), 1); + bitmap_set(stats_bitmap->bitmap, last_i + + MLX4_FIND_NETDEV_STAT(tx_dropped), 1); + } 
else { + bitmap_set(stats_bitmap->bitmap, last_i, NUM_MAIN_STATS); + } + last_i += NUM_MAIN_STATS; + + bitmap_set(stats_bitmap->bitmap, last_i, NUM_PORT_STATS); + last_i += NUM_PORT_STATS; + + if (mlx4_is_master(dev)) + bitmap_set(stats_bitmap->bitmap, last_i, + NUM_PF_STATS); + last_i += NUM_PF_STATS; + + mlx4_en_update_pfc_stats_bitmap(dev, stats_bitmap, + rx_ppp, rx_pause, + tx_ppp, tx_pause); + last_i += NUM_FLOW_STATS; + + if (!mlx4_is_slave(dev)) + bitmap_set(stats_bitmap->bitmap, last_i, NUM_PKT_STATS); + last_i += NUM_PKT_STATS; + + bitmap_set(stats_bitmap->bitmap, last_i, NUM_XDP_STATS); + last_i += NUM_XDP_STATS; + + if (!mlx4_is_slave(dev)) + bitmap_set(stats_bitmap->bitmap, last_i, NUM_PHY_STATS); + last_i += NUM_PHY_STATS; +} + +int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, + struct mlx4_en_port_profile *prof) +{ + struct net_device *dev; + struct mlx4_en_priv *priv; + int i, t; + int err; + + dev = alloc_etherdev_mqs(sizeof(struct mlx4_en_priv), + MAX_TX_RINGS, MAX_RX_RINGS); + if (dev == NULL) + return -ENOMEM; + + netif_set_real_num_tx_queues(dev, prof->tx_ring_num[TX]); + netif_set_real_num_rx_queues(dev, prof->rx_ring_num); + + SET_NETDEV_DEV(dev, &mdev->dev->persist->pdev->dev); + dev->dev_port = port - 1; + + /* + * Initialize driver private data + */ + + priv = netdev_priv(dev); + memset(priv, 0, sizeof(struct mlx4_en_priv)); + priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev); + spin_lock_init(&priv->stats_lock); + INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode); + INIT_WORK(&priv->watchdog_task, mlx4_en_restart); + INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate); + INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats); + INIT_DELAYED_WORK(&priv->service_task, mlx4_en_service_task); + INIT_WORK(&priv->vxlan_add_task, mlx4_en_add_vxlan_offloads); + INIT_WORK(&priv->vxlan_del_task, mlx4_en_del_vxlan_offloads); +#ifdef CONFIG_RFS_ACCEL + INIT_LIST_HEAD(&priv->filters); + spin_lock_init(&priv->filters_lock); +#endif + + priv->dev = dev; + priv->mdev = mdev; + priv->ddev = &mdev->pdev->dev; + priv->prof = prof; + priv->port = port; + priv->port_up = false; + priv->flags = prof->flags; + priv->pflags = MLX4_EN_PRIV_FLAGS_BLUEFLAME; + priv->ctrl_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE | + MLX4_WQE_CTRL_SOLICITED); + priv->num_tx_rings_p_up = mdev->profile.max_num_tx_rings_p_up; + priv->tx_work_limit = MLX4_EN_DEFAULT_TX_WORK; + netdev_rss_key_fill(priv->rss_key, sizeof(priv->rss_key)); + + for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) { + priv->tx_ring_num[t] = prof->tx_ring_num[t]; + if (!priv->tx_ring_num[t]) + continue; + + priv->tx_ring[t] = kzalloc(sizeof(struct mlx4_en_tx_ring *) * + MAX_TX_RINGS, GFP_KERNEL); + if (!priv->tx_ring[t]) { + err = -ENOMEM; + goto out; + } + priv->tx_cq[t] = kzalloc(sizeof(struct mlx4_en_cq *) * + MAX_TX_RINGS, GFP_KERNEL); + if (!priv->tx_cq[t]) { + err = -ENOMEM; + goto out; + } + } + priv->rx_ring_num = prof->rx_ring_num; + priv->cqe_factor = (mdev->dev->caps.cqe_size == 64) ? 
1 : 0; + priv->cqe_size = mdev->dev->caps.cqe_size; + priv->mac_index = -1; + priv->msg_enable = MLX4_EN_MSG_LEVEL; +#ifdef CONFIG_MLX4_EN_DCB + if (!mlx4_is_slave(priv->mdev->dev)) { + u8 prio; + + for (prio = 0; prio < IEEE_8021QAZ_MAX_TCS; ++prio) { + priv->ets.prio_tc[prio] = prio; + priv->ets.tc_tsa[prio] = IEEE_8021QAZ_TSA_VENDOR; + } + + priv->dcbx_cap = DCB_CAP_DCBX_VER_CEE | DCB_CAP_DCBX_HOST | + DCB_CAP_DCBX_VER_IEEE; + priv->flags |= MLX4_EN_DCB_ENABLED; + priv->cee_config.pfc_state = false; + + for (i = 0; i < MLX4_EN_NUM_UP_HIGH; i++) + priv->cee_config.dcb_pfc[i] = pfc_disabled; + + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETS_CFG) { + dev->dcbnl_ops = &mlx4_en_dcbnl_ops; + } else { + en_info(priv, "enabling only PFC DCB ops\n"); + dev->dcbnl_ops = &mlx4_en_dcbnl_pfc_ops; + } + } +#endif + + for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) + INIT_HLIST_HEAD(&priv->mac_hash[i]); + + /* Query for default mac and max mtu */ + priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port]; + + if (mdev->dev->caps.rx_checksum_flags_port[priv->port] & + MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP) + priv->flags |= MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP; + + /* Set default MAC */ + dev->addr_len = ETH_ALEN; + mlx4_en_u64_to_mac(dev->dev_addr, mdev->dev->caps.def_mac[priv->port]); + if (!is_valid_ether_addr(dev->dev_addr)) { + en_err(priv, "Port: %d, invalid mac burned: %pM, quiting\n", + priv->port, dev->dev_addr); + err = -EINVAL; + goto out; + } else if (mlx4_is_slave(priv->mdev->dev) && + (priv->mdev->dev->port_random_macs & 1 << priv->port)) { + /* Random MAC was assigned in mlx4_slave_cap + * in mlx4_core module + */ + dev->addr_assign_type |= NET_ADDR_RANDOM; + en_warn(priv, "Assigned random MAC address %pM\n", dev->dev_addr); + } + + memcpy(priv->current_mac, dev->dev_addr, sizeof(priv->current_mac)); + + priv->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + + DS_SIZE * MLX4_EN_MAX_RX_FRAGS); + err = mlx4_en_alloc_resources(priv); + if (err) + goto out; + + /* Initialize time stamping config */ + priv->hwtstamp_config.flags = 0; + priv->hwtstamp_config.tx_type = HWTSTAMP_TX_OFF; + priv->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; + + /* Allocate page for receive rings */ + err = mlx4_alloc_hwq_res(mdev->dev, &priv->res, + MLX4_EN_PAGE_SIZE); + if (err) { + en_err(priv, "Failed to allocate page for rx qps\n"); + goto out; + } + priv->allocated = 1; + + /* + * Initialize netdev entry points + */ + if (mlx4_is_master(priv->mdev->dev)) + dev->netdev_ops = &mlx4_netdev_ops_master; + else + dev->netdev_ops = &mlx4_netdev_ops; + dev->watchdog_timeo = MLX4_EN_WATCHDOG_TIMEOUT; + netif_set_real_num_tx_queues(dev, priv->tx_ring_num[TX]); + netif_set_real_num_rx_queues(dev, priv->rx_ring_num); + + dev->ethtool_ops = &mlx4_en_ethtool_ops; + + /* + * Set driver features + */ + dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + if (mdev->LSO_support) + dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6; + + dev->vlan_features = dev->hw_features; + + dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_RXHASH; + dev->features = dev->hw_features | NETIF_F_HIGHDMA | + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_CTAG_FILTER; + dev->hw_features |= NETIF_F_LOOPBACK | + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; + + if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) { + dev->features |= NETIF_F_HW_VLAN_STAG_RX | + NETIF_F_HW_VLAN_STAG_FILTER; + dev->hw_features |= NETIF_F_HW_VLAN_STAG_RX; + } + + if (mlx4_is_slave(mdev->dev)) { + bool 
vlan_offload_disabled; + int phv; + + err = get_phv_bit(mdev->dev, port, &phv); + if (!err && phv) { + dev->hw_features |= NETIF_F_HW_VLAN_STAG_TX; + priv->pflags |= MLX4_EN_PRIV_FLAGS_PHV; + } + err = mlx4_get_is_vlan_offload_disabled(mdev->dev, port, + &vlan_offload_disabled); + if (!err && vlan_offload_disabled) { + dev->hw_features &= ~(NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_STAG_TX | + NETIF_F_HW_VLAN_STAG_RX); + dev->features &= ~(NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_STAG_TX | + NETIF_F_HW_VLAN_STAG_RX); + } + } else { + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN && + !(mdev->dev->caps.flags2 & + MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) + dev->hw_features |= NETIF_F_HW_VLAN_STAG_TX; + } + + if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) + dev->hw_features |= NETIF_F_RXFCS; + + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_IGNORE_FCS) + dev->hw_features |= NETIF_F_RXALL; + + if (mdev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED && + mdev->dev->caps.dmfs_high_steer_mode != MLX4_STEERING_DMFS_A0_STATIC) + dev->hw_features |= NETIF_F_NTUPLE; + + if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0) + dev->priv_flags |= IFF_UNICAST_FLT; + + /* Setting a default hash function value */ + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP) { + priv->rss_hash_fn = ETH_RSS_HASH_TOP; + } else if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_XOR) { + priv->rss_hash_fn = ETH_RSS_HASH_XOR; + } else { + en_warn(priv, + "No RSS hash capabilities exposed, using Toeplitz\n"); + priv->rss_hash_fn = ETH_RSS_HASH_TOP; + } + + if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + dev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM | + NETIF_F_GSO_PARTIAL; + dev->features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM | + NETIF_F_GSO_PARTIAL; + dev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM; + } + + /* MTU range: 46 - hw-specific max */ + dev->min_mtu = MLX4_EN_MIN_MTU; + dev->max_mtu = priv->max_mtu; + + mdev->pndev[port] = dev; + mdev->upper[port] = NULL; + + netif_carrier_off(dev); + mlx4_en_set_default_moderation(priv); + + en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num[TX]); + en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num); + + mlx4_en_update_loopback_state(priv->dev, priv->dev->features); + + /* Configure port */ + mlx4_en_calc_rx_buf(dev); + err = mlx4_SET_PORT_general(mdev->dev, priv->port, + priv->rx_skb_size + ETH_FCS_LEN, + prof->tx_pause, prof->tx_ppp, + prof->rx_pause, prof->rx_ppp); + if (err) { + en_err(priv, "Failed setting port general configurations for port %d, with error %d\n", + priv->port, err); + goto out; + } + + if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + err = mlx4_SET_PORT_VXLAN(mdev->dev, priv->port, VXLAN_STEER_BY_OUTER_MAC, 1); + if (err) { + en_err(priv, "Failed setting port L2 tunnel configuration, err %d\n", + err); + goto out; + } + } + + /* Init port */ + en_warn(priv, "Initializing port\n"); + err = mlx4_INIT_PORT(mdev->dev, priv->port); + if (err) { + en_err(priv, "Failed Initializing port\n"); + goto out; + } + queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); + + /* Initialize time stamp mechanism */ + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) + mlx4_en_init_timestamp(mdev); + + queue_delayed_work(mdev->workqueue, &priv->service_task, + SERVICE_TASK_DELAY); + + mlx4_en_set_stats_bitmap(mdev->dev, 
&priv->stats_bitmap, + mdev->profile.prof[priv->port].rx_ppp, + mdev->profile.prof[priv->port].rx_pause, + mdev->profile.prof[priv->port].tx_ppp, + mdev->profile.prof[priv->port].tx_pause); + + err = register_netdev(dev); + if (err) { + en_err(priv, "Netdev registration failed for port %d\n", port); + goto out; + } + + priv->registered = 1; + devlink_port_type_eth_set(mlx4_get_devlink_port(mdev->dev, priv->port), + dev); + + return 0; + +out: + mlx4_en_destroy_netdev(dev); + return err; +} + +int mlx4_en_reset_config(struct net_device *dev, + struct hwtstamp_config ts_config, + netdev_features_t features) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_port_profile new_prof; + struct mlx4_en_priv *tmp; + int port_up = 0; + int err = 0; + + if (priv->hwtstamp_config.tx_type == ts_config.tx_type && + priv->hwtstamp_config.rx_filter == ts_config.rx_filter && + !DEV_FEATURE_CHANGED(dev, features, NETIF_F_HW_VLAN_CTAG_RX) && + !DEV_FEATURE_CHANGED(dev, features, NETIF_F_RXFCS)) + return 0; /* Nothing to change */ + + if (DEV_FEATURE_CHANGED(dev, features, NETIF_F_HW_VLAN_CTAG_RX) && + (features & NETIF_F_HW_VLAN_CTAG_RX) && + (priv->hwtstamp_config.rx_filter != HWTSTAMP_FILTER_NONE)) { + en_warn(priv, "Can't turn ON rx vlan offload while time-stamping rx filter is ON\n"); + return -EINVAL; + } + + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + mutex_lock(&mdev->state_lock); + + memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile)); + memcpy(&new_prof.hwtstamp_config, &ts_config, sizeof(ts_config)); + + err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true); + if (err) + goto out; + + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev, 1); + } + + mlx4_en_safe_replace_resources(priv, tmp); + + if (DEV_FEATURE_CHANGED(dev, features, NETIF_F_HW_VLAN_CTAG_RX)) { + if (features & NETIF_F_HW_VLAN_CTAG_RX) + dev->features |= NETIF_F_HW_VLAN_CTAG_RX; + else + dev->features &= ~NETIF_F_HW_VLAN_CTAG_RX; + } else if (ts_config.rx_filter == HWTSTAMP_FILTER_NONE) { + /* RX time-stamping is OFF, update the RX vlan offload + * to the latest wanted state + */ + if (dev->wanted_features & NETIF_F_HW_VLAN_CTAG_RX) + dev->features |= NETIF_F_HW_VLAN_CTAG_RX; + else + dev->features &= ~NETIF_F_HW_VLAN_CTAG_RX; + } + + if (DEV_FEATURE_CHANGED(dev, features, NETIF_F_RXFCS)) { + if (features & NETIF_F_RXFCS) + dev->features |= NETIF_F_RXFCS; + else + dev->features &= ~NETIF_F_RXFCS; + } + + /* RX vlan offload and RX time-stamping can't co-exist ! + * Regardless of the caller's choice, + * Turn Off RX vlan offload in case of time-stamping is ON + */ + if (ts_config.rx_filter != HWTSTAMP_FILTER_NONE) { + if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) + en_warn(priv, "Turning off RX vlan offload since RX time-stamping is ON\n"); + dev->features &= ~NETIF_F_HW_VLAN_CTAG_RX; + } + + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) + en_err(priv, "Failed starting port\n"); + } + +out: + mutex_unlock(&mdev->state_lock); + kfree(tmp); + if (!err) + netdev_features_change(dev); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c new file mode 100644 index 0000000..0158b88 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#include + +#include +#include + +#include "en_port.h" +#include "mlx4_en.h" + + +int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_vlan_fltr_mbox *filter; + int i; + int j; + int index = 0; + u32 entry; + int err = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + filter = mailbox->buf; + for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) { + entry = 0; + for (j = 0; j < 32; j++) + if (test_bit(index++, priv->active_vlans)) + entry |= 1 << j; + filter->entry[i] = cpu_to_be32(entry); + } + err = mlx4_cmd(dev, mailbox->dma, priv->port, 0, MLX4_CMD_SET_VLAN_FLTR, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) +{ + struct mlx4_en_query_port_context *qport_context; + struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]); + struct mlx4_en_port_state *state = &priv->port_state; + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + qport_context = mailbox->buf; + + /* This command is always accessed from Ethtool context + * already synchronized, no need in locking */ + state->link_state = !!(qport_context->link_up & MLX4_EN_LINK_UP_MASK); + switch (qport_context->link_speed & MLX4_EN_SPEED_MASK) { + case MLX4_EN_100M_SPEED: + state->link_speed = SPEED_100; + break; + case MLX4_EN_1G_SPEED: + state->link_speed = SPEED_1000; + break; + case MLX4_EN_10G_SPEED_XAUI: + case MLX4_EN_10G_SPEED_XFI: + state->link_speed = SPEED_10000; + break; + case MLX4_EN_20G_SPEED: + state->link_speed = SPEED_20000; + break; + case MLX4_EN_40G_SPEED: + state->link_speed = SPEED_40000; + break; + case MLX4_EN_56G_SPEED: + state->link_speed = SPEED_56000; + break; + default: + state->link_speed = -1; + break; + } + + state->transceiver = qport_context->transceiver; + + state->flags = 0; /* Reset and recalculate the port flags */ + state->flags |= (qport_context->link_up & MLX4_EN_ANC_MASK) ? 
+ MLX4_EN_PORT_ANC : 0; + state->flags |= (qport_context->autoneg & MLX4_EN_AUTONEG_MASK) ? + MLX4_EN_PORT_ANE : 0; + +out: + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return err; +} + +/* Each counter set is located in struct mlx4_en_stat_out_mbox + * with a const offset between its prio components. + * This function runs over a counter set and sum all of it's prio components. + */ +static unsigned long en_stats_adder(__be64 *start, __be64 *next, int num) +{ + __be64 *curr = start; + unsigned long ret = 0; + int i; + int offset = next - start; + + for (i = 0; i < num; i++) { + ret += be64_to_cpu(*curr); + curr += offset; + } + + return ret; +} + +void mlx4_en_fold_software_stats(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + unsigned long packets, bytes; + int i; + + if (!priv->port_up || mlx4_is_master(mdev->dev)) + return; + + packets = 0; + bytes = 0; + for (i = 0; i < priv->rx_ring_num; i++) { + const struct mlx4_en_rx_ring *ring = priv->rx_ring[i]; + + packets += READ_ONCE(ring->packets); + bytes += READ_ONCE(ring->bytes); + } + dev->stats.rx_packets = packets; + dev->stats.rx_bytes = bytes; + + packets = 0; + bytes = 0; + for (i = 0; i < priv->tx_ring_num[TX]; i++) { + const struct mlx4_en_tx_ring *ring = priv->tx_ring[TX][i]; + + packets += READ_ONCE(ring->packets); + bytes += READ_ONCE(ring->bytes); + } + dev->stats.tx_packets = packets; + dev->stats.tx_bytes = bytes; +} + +int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) +{ + struct mlx4_counter tmp_counter_stats; + struct mlx4_en_stat_out_mbox *mlx4_en_stats; + struct mlx4_en_stat_out_flow_control_mbox *flowstats; + struct net_device *dev = mdev->pndev[port]; + struct mlx4_en_priv *priv = netdev_priv(dev); + struct net_device_stats *stats = &dev->stats; + struct mlx4_cmd_mailbox *mailbox, *mailbox_priority; + u64 in_mod = reset << 8 | port; + int err; + int i, counter_index; + unsigned long sw_tx_dropped = 0; + unsigned long sw_rx_dropped = 0; + + mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + mailbox_priority = mlx4_alloc_cmd_mailbox(mdev->dev); + if (IS_ERR(mailbox_priority)) { + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + return PTR_ERR(mailbox_priority); + } + + err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0, + MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + if (err) + goto out; + + mlx4_en_stats = mailbox->buf; + + memset(&tmp_counter_stats, 0, sizeof(tmp_counter_stats)); + counter_index = mlx4_get_default_counter_index(mdev->dev, port); + err = mlx4_get_counter_stats(mdev->dev, counter_index, + &tmp_counter_stats, reset); + + /* 0xffs indicates invalid value */ + memset(mailbox_priority->buf, 0xff, + sizeof(*flowstats) * MLX4_NUM_PRIORITIES); + + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) { + memset(mailbox_priority->buf, 0, + sizeof(*flowstats) * MLX4_NUM_PRIORITIES); + err = mlx4_cmd_box(mdev->dev, 0, mailbox_priority->dma, + in_mod | MLX4_DUMP_ETH_STATS_FLOW_CONTROL, + 0, MLX4_CMD_DUMP_ETH_STATS, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + goto out; + } + + flowstats = mailbox_priority->buf; + + spin_lock_bh(&priv->stats_lock); + + mlx4_en_fold_software_stats(dev); + + priv->port_stats.rx_chksum_good = 0; + priv->port_stats.rx_chksum_none = 0; + priv->port_stats.rx_chksum_complete = 0; + priv->port_stats.rx_alloc_pages = 0; + priv->xdp_stats.rx_xdp_drop = 0; + priv->xdp_stats.rx_xdp_tx = 0; + 
priv->xdp_stats.rx_xdp_tx_full = 0; + for (i = 0; i < priv->rx_ring_num; i++) { + const struct mlx4_en_rx_ring *ring = priv->rx_ring[i]; + + sw_rx_dropped += READ_ONCE(ring->dropped); + priv->port_stats.rx_chksum_good += READ_ONCE(ring->csum_ok); + priv->port_stats.rx_chksum_none += READ_ONCE(ring->csum_none); + priv->port_stats.rx_chksum_complete += READ_ONCE(ring->csum_complete); + priv->port_stats.rx_alloc_pages += READ_ONCE(ring->rx_alloc_pages); + priv->xdp_stats.rx_xdp_drop += READ_ONCE(ring->xdp_drop); + priv->xdp_stats.rx_xdp_tx += READ_ONCE(ring->xdp_tx); + priv->xdp_stats.rx_xdp_tx_full += READ_ONCE(ring->xdp_tx_full); + } + priv->port_stats.tx_chksum_offload = 0; + priv->port_stats.queue_stopped = 0; + priv->port_stats.wake_queue = 0; + priv->port_stats.tso_packets = 0; + priv->port_stats.xmit_more = 0; + + for (i = 0; i < priv->tx_ring_num[TX]; i++) { + const struct mlx4_en_tx_ring *ring = priv->tx_ring[TX][i]; + + sw_tx_dropped += READ_ONCE(ring->tx_dropped); + priv->port_stats.tx_chksum_offload += READ_ONCE(ring->tx_csum); + priv->port_stats.queue_stopped += READ_ONCE(ring->queue_stopped); + priv->port_stats.wake_queue += READ_ONCE(ring->wake_queue); + priv->port_stats.tso_packets += READ_ONCE(ring->tso_packets); + priv->port_stats.xmit_more += READ_ONCE(ring->xmit_more); + } + + if (!mlx4_is_slave(mdev->dev)) { + struct mlx4_en_phy_stats *p_stats = &priv->phy_stats; + + p_stats->rx_packets_phy = + en_stats_adder(&mlx4_en_stats->RTOT_prio_0, + &mlx4_en_stats->RTOT_prio_1, + NUM_PRIORITIES); + p_stats->tx_packets_phy = + en_stats_adder(&mlx4_en_stats->TTOT_prio_0, + &mlx4_en_stats->TTOT_prio_1, + NUM_PRIORITIES); + p_stats->rx_bytes_phy = + en_stats_adder(&mlx4_en_stats->ROCT_prio_0, + &mlx4_en_stats->ROCT_prio_1, + NUM_PRIORITIES); + p_stats->tx_bytes_phy = + en_stats_adder(&mlx4_en_stats->TOCT_prio_0, + &mlx4_en_stats->TOCT_prio_1, + NUM_PRIORITIES); + if (mlx4_is_master(mdev->dev)) { + stats->rx_packets = p_stats->rx_packets_phy; + stats->tx_packets = p_stats->tx_packets_phy; + stats->rx_bytes = p_stats->rx_bytes_phy; + stats->tx_bytes = p_stats->tx_bytes_phy; + } + } + + /* net device stats */ + stats->rx_errors = be64_to_cpu(mlx4_en_stats->PCS) + + be32_to_cpu(mlx4_en_stats->RJBBR) + + be32_to_cpu(mlx4_en_stats->RCRC) + + be32_to_cpu(mlx4_en_stats->RRUNT) + + be64_to_cpu(mlx4_en_stats->RInRangeLengthErr) + + be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr) + + be32_to_cpu(mlx4_en_stats->RSHORT) + + en_stats_adder(&mlx4_en_stats->RGIANT_prio_0, + &mlx4_en_stats->RGIANT_prio_1, + NUM_PRIORITIES); + stats->tx_errors = en_stats_adder(&mlx4_en_stats->TGIANT_prio_0, + &mlx4_en_stats->TGIANT_prio_1, + NUM_PRIORITIES); + stats->multicast = en_stats_adder(&mlx4_en_stats->MCAST_prio_0, + &mlx4_en_stats->MCAST_prio_1, + NUM_PRIORITIES); + stats->rx_dropped = be32_to_cpu(mlx4_en_stats->RDROP) + + sw_rx_dropped; + stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength); + stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC); + stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); + stats->tx_dropped = be32_to_cpu(mlx4_en_stats->TDROP) + + sw_tx_dropped; + + /* RX stats */ + priv->pkstats.rx_multicast_packets = stats->multicast; + priv->pkstats.rx_broadcast_packets = + en_stats_adder(&mlx4_en_stats->RBCAST_prio_0, + &mlx4_en_stats->RBCAST_prio_1, + NUM_PRIORITIES); + priv->pkstats.rx_jabbers = be32_to_cpu(mlx4_en_stats->RJBBR); + priv->pkstats.rx_in_range_length_error = + be64_to_cpu(mlx4_en_stats->RInRangeLengthErr); + priv->pkstats.rx_out_range_length_error = + 
be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr); + + /* Tx stats */ + priv->pkstats.tx_multicast_packets = + en_stats_adder(&mlx4_en_stats->TMCAST_prio_0, + &mlx4_en_stats->TMCAST_prio_1, + NUM_PRIORITIES); + priv->pkstats.tx_broadcast_packets = + en_stats_adder(&mlx4_en_stats->TBCAST_prio_0, + &mlx4_en_stats->TBCAST_prio_1, + NUM_PRIORITIES); + + priv->pkstats.rx_prio[0][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0); + priv->pkstats.rx_prio[0][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_0); + priv->pkstats.rx_prio[1][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1); + priv->pkstats.rx_prio[1][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_1); + priv->pkstats.rx_prio[2][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2); + priv->pkstats.rx_prio[2][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_2); + priv->pkstats.rx_prio[3][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3); + priv->pkstats.rx_prio[3][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_3); + priv->pkstats.rx_prio[4][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4); + priv->pkstats.rx_prio[4][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_4); + priv->pkstats.rx_prio[5][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5); + priv->pkstats.rx_prio[5][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_5); + priv->pkstats.rx_prio[6][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6); + priv->pkstats.rx_prio[6][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_6); + priv->pkstats.rx_prio[7][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7); + priv->pkstats.rx_prio[7][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_7); + priv->pkstats.rx_prio[8][0] = be64_to_cpu(mlx4_en_stats->RTOT_novlan); + priv->pkstats.rx_prio[8][1] = be64_to_cpu(mlx4_en_stats->ROCT_novlan); + priv->pkstats.tx_prio[0][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0); + priv->pkstats.tx_prio[0][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_0); + priv->pkstats.tx_prio[1][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1); + priv->pkstats.tx_prio[1][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_1); + priv->pkstats.tx_prio[2][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2); + priv->pkstats.tx_prio[2][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_2); + priv->pkstats.tx_prio[3][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3); + priv->pkstats.tx_prio[3][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_3); + priv->pkstats.tx_prio[4][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4); + priv->pkstats.tx_prio[4][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_4); + priv->pkstats.tx_prio[5][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5); + priv->pkstats.tx_prio[5][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_5); + priv->pkstats.tx_prio[6][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6); + priv->pkstats.tx_prio[6][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_6); + priv->pkstats.tx_prio[7][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7); + priv->pkstats.tx_prio[7][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_7); + priv->pkstats.tx_prio[8][0] = be64_to_cpu(mlx4_en_stats->TTOT_novlan); + priv->pkstats.tx_prio[8][1] = be64_to_cpu(mlx4_en_stats->TOCT_novlan); + + if (tmp_counter_stats.counter_mode == 0) { + priv->pf_stats.rx_bytes = be64_to_cpu(tmp_counter_stats.rx_bytes); + priv->pf_stats.tx_bytes = be64_to_cpu(tmp_counter_stats.tx_bytes); + priv->pf_stats.rx_packets = be64_to_cpu(tmp_counter_stats.rx_frames); + priv->pf_stats.tx_packets = be64_to_cpu(tmp_counter_stats.tx_frames); + } + + for (i = 0; i < MLX4_NUM_PRIORITIES; i++) { + priv->rx_priority_flowstats[i].rx_pause = + be64_to_cpu(flowstats[i].rx_pause); + priv->rx_priority_flowstats[i].rx_pause_duration = + be64_to_cpu(flowstats[i].rx_pause_duration); + 
priv->rx_priority_flowstats[i].rx_pause_transition = + be64_to_cpu(flowstats[i].rx_pause_transition); + priv->tx_priority_flowstats[i].tx_pause = + be64_to_cpu(flowstats[i].tx_pause); + priv->tx_priority_flowstats[i].tx_pause_duration = + be64_to_cpu(flowstats[i].tx_pause_duration); + priv->tx_priority_flowstats[i].tx_pause_transition = + be64_to_cpu(flowstats[i].tx_pause_transition); + } + + /* if pfc is not in use, all priorities counters have the same value */ + priv->rx_flowstats.rx_pause = + be64_to_cpu(flowstats[0].rx_pause); + priv->rx_flowstats.rx_pause_duration = + be64_to_cpu(flowstats[0].rx_pause_duration); + priv->rx_flowstats.rx_pause_transition = + be64_to_cpu(flowstats[0].rx_pause_transition); + priv->tx_flowstats.tx_pause = + be64_to_cpu(flowstats[0].tx_pause); + priv->tx_flowstats.tx_pause_duration = + be64_to_cpu(flowstats[0].tx_pause_duration); + priv->tx_flowstats.tx_pause_transition = + be64_to_cpu(flowstats[0].tx_pause_transition); + + spin_unlock_bh(&priv->stats_lock); + +out: + mlx4_free_cmd_mailbox(mdev->dev, mailbox); + mlx4_free_cmd_mailbox(mdev->dev, mailbox_priority); + return err; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.h b/drivers/net/ethernet/mellanox/mlx4/en_port.h new file mode 100644 index 0000000..930f961 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_port.h @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef _MLX4_EN_PORT_H_ +#define _MLX4_EN_PORT_H_ + + +#define SET_PORT_PROMISC_SHIFT 31 +#define SET_PORT_MC_PROMISC_SHIFT 30 + +#define MLX4_EN_NUM_TC 8 + +#define VLAN_FLTR_SIZE 128 +struct mlx4_set_vlan_fltr_mbox { + __be32 entry[VLAN_FLTR_SIZE]; +}; + + +enum { + MLX4_MCAST_CONFIG = 0, + MLX4_MCAST_DISABLE = 1, + MLX4_MCAST_ENABLE = 2, +}; + +enum mlx4_link_mode { + MLX4_1000BASE_CX_SGMII = 0, + MLX4_1000BASE_KX = 1, + MLX4_10GBASE_CX4 = 2, + MLX4_10GBASE_KX4 = 3, + MLX4_10GBASE_KR = 4, + MLX4_20GBASE_KR2 = 5, + MLX4_40GBASE_CR4 = 6, + MLX4_40GBASE_KR4 = 7, + MLX4_56GBASE_KR4 = 8, + MLX4_10GBASE_CR = 12, + MLX4_10GBASE_SR = 13, + MLX4_40GBASE_SR4 = 15, + MLX4_56GBASE_CR4 = 17, + MLX4_56GBASE_SR4 = 18, + MLX4_100BASE_TX = 24, + MLX4_1000BASE_T = 25, + MLX4_10GBASE_T = 26, +}; + +#define MLX4_PROT_MASK(link_mode) (1< +#include +#include + +#include "mlx4_en.h" + +void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, + int is_tx, int rss, int qpn, int cqn, + int user_prio, struct mlx4_qp_context *context) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct net_device *dev = priv->dev; + + memset(context, 0, sizeof(*context)); + context->flags = cpu_to_be32(7 << 16 | rss << MLX4_RSS_QPC_FLAG_OFFSET); + context->pd = cpu_to_be32(mdev->priv_pdn); + context->mtu_msgmax = 0xff; + if (!is_tx && !rss) + context->rq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); + if (is_tx) { + context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP) + context->params2 |= cpu_to_be32(MLX4_QP_BIT_FPP); + + } else { + context->sq_size_stride = ilog2(TXBB_SIZE) - 4; + } + context->usr_page = cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev, + mdev->priv_uar.index)); + context->local_qpn = cpu_to_be32(qpn); + context->pri_path.ackto = 1 & 0x07; + context->pri_path.sched_queue = 0x83 | (priv->port - 1) << 6; + /* force user priority per tx ring */ + if (user_prio >= 0 && priv->prof->num_up == MLX4_EN_NUM_UP_HIGH) { + context->pri_path.sched_queue |= user_prio << 3; + context->pri_path.feup = MLX4_FEUP_FORCE_ETH_UP; + } + context->pri_path.counter_index = priv->counter_index; + context->cqn_send = cpu_to_be32(cqn); + context->cqn_recv = cpu_to_be32(cqn); + if (!rss && + (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK) && + context->pri_path.counter_index != + MLX4_SINK_COUNTER_INDEX(mdev->dev)) { + /* disable multicast loopback to qp with same counter */ + if (!(dev->features & NETIF_F_LOOPBACK)) + context->pri_path.fl |= MLX4_FL_ETH_SRC_CHECK_MC_LB; + context->pri_path.control |= MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER; + } + context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2); + if (!(dev->features & NETIF_F_HW_VLAN_CTAG_RX)) + context->param3 |= cpu_to_be32(1 << 30); + + if (!is_tx && !rss && + (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)) { + en_dbg(HW, priv, "Setting RX qp %x tunnel mode to RX tunneled & non-tunneled\n", qpn); + context->srqn = cpu_to_be32(7 << 28); /* this fills bits 30:28 */ + } +} + +int mlx4_en_change_mcast_lb(struct mlx4_en_priv *priv, struct mlx4_qp *qp, + int loopback) +{ + int ret; + struct mlx4_update_qp_params qp_params; + + memset(&qp_params, 0, sizeof(qp_params)); + if (!loopback) + qp_params.flags = MLX4_UPDATE_QP_PARAMS_FLAGS_ETH_CHECK_MC_LB; + + ret = mlx4_update_qp(priv->mdev->dev, qp->qpn, + MLX4_UPDATE_QP_ETH_SRC_CHECK_MC_LB, + &qp_params); + + return ret; +} + +void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event) +{ + 
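+	/* No-op: mlx4_en does not act on QP async events; this stub only
+	 * provides the qp->event callback expected by the mlx4 QP API.
+	 */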
return; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c new file mode 100644 index 0000000..5c613c6 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -0,0 +1,1264 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif + +#include "mlx4_en.h" + +static int mlx4_alloc_page(struct mlx4_en_priv *priv, + struct mlx4_en_rx_alloc *frag, + gfp_t gfp) +{ + struct page *page; + dma_addr_t dma; + + page = alloc_page(gfp); + if (unlikely(!page)) + return -ENOMEM; + dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE, priv->dma_dir); + if (unlikely(dma_mapping_error(priv->ddev, dma))) { + __free_page(page); + return -ENOMEM; + } + frag->page = page; + frag->dma = dma; + frag->page_offset = priv->rx_headroom; + return 0; +} + +static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring, + struct mlx4_en_rx_desc *rx_desc, + struct mlx4_en_rx_alloc *frags, + gfp_t gfp) +{ + int i; + + for (i = 0; i < priv->num_frags; i++, frags++) { + if (!frags->page) { + if (mlx4_alloc_page(priv, frags, gfp)) + return -ENOMEM; + ring->rx_alloc_pages++; + } + rx_desc->data[i].addr = cpu_to_be64(frags->dma + + frags->page_offset); + } + return 0; +} + +static void mlx4_en_free_frag(const struct mlx4_en_priv *priv, + struct mlx4_en_rx_alloc *frag) +{ + if (frag->page) { + dma_unmap_page(priv->ddev, frag->dma, + PAGE_SIZE, priv->dma_dir); + __free_page(frag->page); + } + /* We need to clear all fields, otherwise a change of priv->log_rx_info + * could lead to see garbage later in frag->page. 
+ */ + memset(frag, 0, sizeof(*frag)); +} + +static void mlx4_en_init_rx_desc(const struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring, int index) +{ + struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index; + int possible_frags; + int i; + + /* Set size and memtype fields */ + for (i = 0; i < priv->num_frags; i++) { + rx_desc->data[i].byte_count = + cpu_to_be32(priv->frag_info[i].frag_size); + rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key); + } + + /* If the number of used fragments does not fill up the ring stride, + * remaining (unused) fragments must be padded with null address/size + * and a special memory key */ + possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE; + for (i = priv->num_frags; i < possible_frags; i++) { + rx_desc->data[i].byte_count = 0; + rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD); + rx_desc->data[i].addr = 0; + } +} + +static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring, int index, + gfp_t gfp) +{ + struct mlx4_en_rx_desc *rx_desc = ring->buf + + (index << ring->log_stride); + struct mlx4_en_rx_alloc *frags = ring->rx_info + + (index << priv->log_rx_info); + if (likely(ring->page_cache.index > 0)) { + /* XDP uses a single page per frame */ + if (!frags->page) { + ring->page_cache.index--; + frags->page = ring->page_cache.buf[ring->page_cache.index].page; + frags->dma = ring->page_cache.buf[ring->page_cache.index].dma; + } + frags->page_offset = XDP_PACKET_HEADROOM; + rx_desc->data[0].addr = cpu_to_be64(frags->dma + + XDP_PACKET_HEADROOM); + return 0; + } + + return mlx4_en_alloc_frags(priv, ring, rx_desc, frags, gfp); +} + +static bool mlx4_en_is_ring_empty(const struct mlx4_en_rx_ring *ring) +{ + return ring->prod == ring->cons; +} + +static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring) +{ + *ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff); +} + +/* slow path */ +static void mlx4_en_free_rx_desc(const struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring, + int index) +{ + struct mlx4_en_rx_alloc *frags; + int nr; + + frags = ring->rx_info + (index << priv->log_rx_info); + for (nr = 0; nr < priv->num_frags; nr++) { + en_dbg(DRV, priv, "Freeing fragment:%d\n", nr); + mlx4_en_free_frag(priv, frags + nr); + } +} + +/* Function not in fast-path */ +static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) +{ + struct mlx4_en_rx_ring *ring; + int ring_ind; + int buf_ind; + int new_size; + + for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) { + for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { + ring = priv->rx_ring[ring_ind]; + + if (mlx4_en_prepare_rx_desc(priv, ring, + ring->actual_size, + GFP_KERNEL)) { + if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { + en_err(priv, "Failed to allocate enough rx buffers\n"); + return -ENOMEM; + } else { + new_size = rounddown_pow_of_two(ring->actual_size); + en_warn(priv, "Only %d buffers allocated reducing ring size to %d\n", + ring->actual_size, new_size); + goto reduce_rings; + } + } + ring->actual_size++; + ring->prod++; + } + } + return 0; + +reduce_rings: + for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { + ring = priv->rx_ring[ring_ind]; + while (ring->actual_size > new_size) { + ring->actual_size--; + ring->prod--; + mlx4_en_free_rx_desc(priv, ring, ring->actual_size); + } + } + + return 0; +} + +static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring) +{ + int index; + + en_dbg(DRV, priv, "Freeing Rx buf - 
cons:%d prod:%d\n", + ring->cons, ring->prod); + + /* Unmap and free Rx buffers */ + for (index = 0; index < ring->size; index++) { + en_dbg(DRV, priv, "Processing descriptor:%d\n", index); + mlx4_en_free_rx_desc(priv, ring, index); + } + ring->cons = 0; + ring->prod = 0; +} + +void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev) +{ + int i; + int num_of_eqs; + int num_rx_rings; + struct mlx4_dev *dev = mdev->dev; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { + num_of_eqs = max_t(int, MIN_RX_RINGS, + min_t(int, + mlx4_get_eqs_per_port(mdev->dev, i), + DEF_RX_RINGS)); + + num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS : + min_t(int, num_of_eqs, num_online_cpus()); + mdev->profile.prof[i].rx_ring_num = + rounddown_pow_of_two(num_rx_rings); + } +} + +int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring **pring, + u32 size, u16 stride, int node, int queue_index) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_rx_ring *ring; + int err = -ENOMEM; + int tmp; + + ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node); + if (!ring) { + ring = kzalloc(sizeof(*ring), GFP_KERNEL); + if (!ring) { + en_err(priv, "Failed to allocate RX ring structure\n"); + return -ENOMEM; + } + } + + ring->prod = 0; + ring->cons = 0; + ring->size = size; + ring->size_mask = size - 1; + ring->stride = stride; + ring->log_stride = ffs(ring->stride) - 1; + ring->buf_size = ring->size * ring->stride + TXBB_SIZE; + + if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index) < 0) + goto err_ring; + + tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS * + sizeof(struct mlx4_en_rx_alloc)); + ring->rx_info = kvzalloc_node(tmp, GFP_KERNEL, node); + if (!ring->rx_info) { + err = -ENOMEM; + goto err_xdp_info; + } + + en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d\n", + ring->rx_info, tmp); + + /* Allocate HW buffers on provided NUMA node */ + set_dev_node(&mdev->dev->persist->pdev->dev, node); + err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); + set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node); + if (err) + goto err_info; + + ring->buf = ring->wqres.buf.direct.buf; + + ring->hwtstamp_rx_filter = priv->hwtstamp_config.rx_filter; + + *pring = ring; + return 0; + +err_info: + kvfree(ring->rx_info); + ring->rx_info = NULL; +err_xdp_info: + xdp_rxq_info_unreg(&ring->xdp_rxq); +err_ring: + kfree(ring); + *pring = NULL; + + return err; +} + +int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv) +{ + struct mlx4_en_rx_ring *ring; + int i; + int ring_ind; + int err; + int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + + DS_SIZE * priv->num_frags); + + for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { + ring = priv->rx_ring[ring_ind]; + + ring->prod = 0; + ring->cons = 0; + ring->actual_size = 0; + ring->cqn = priv->rx_cq[ring_ind]->mcq.cqn; + + ring->stride = stride; + if (ring->stride <= TXBB_SIZE) { + /* Stamp first unused send wqe */ + __be32 *ptr = (__be32 *)ring->buf; + __be32 stamp = cpu_to_be32(1 << STAMP_SHIFT); + *ptr = stamp; + /* Move pointer to start of rx section */ + ring->buf += TXBB_SIZE; + } + + ring->log_stride = ffs(ring->stride) - 1; + ring->buf_size = ring->size * ring->stride; + + memset(ring->buf, 0, ring->buf_size); + mlx4_en_update_rx_prod_db(ring); + + /* Initialize all descriptors */ + for (i = 0; i < ring->size; i++) + mlx4_en_init_rx_desc(priv, ring, i); + } + err = mlx4_en_fill_rx_buffers(priv); + if (err) + goto err_buffers; + + for (ring_ind = 0; ring_ind < 
priv->rx_ring_num; ring_ind++) { + ring = priv->rx_ring[ring_ind]; + + ring->size_mask = ring->actual_size - 1; + mlx4_en_update_rx_prod_db(ring); + } + + return 0; + +err_buffers: + for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) + mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]); + + ring_ind = priv->rx_ring_num - 1; + while (ring_ind >= 0) { + if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE) + priv->rx_ring[ring_ind]->buf -= TXBB_SIZE; + ring_ind--; + } + return err; +} + +/* We recover from out of memory by scheduling our napi poll + * function (mlx4_en_process_cq), which tries to allocate + * all missing RX buffers (call to mlx4_en_refill_rx_buffers). + */ +void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv) +{ + int ring; + + if (!priv->port_up) + return; + + for (ring = 0; ring < priv->rx_ring_num; ring++) { + if (mlx4_en_is_ring_empty(priv->rx_ring[ring])) { + local_bh_disable(); + napi_reschedule(&priv->rx_cq[ring]->napi); + local_bh_enable(); + } + } +} + +/* When the rx ring is running in page-per-packet mode, a released frame can go + * directly into a small cache, to avoid unmapping or touching the page + * allocator. In bpf prog performance scenarios, buffers are either forwarded + * or dropped, never converted to skbs, so every page can come directly from + * this cache when it is sized to be a multiple of the napi budget. + */ +bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring, + struct mlx4_en_rx_alloc *frame) +{ + struct mlx4_en_page_cache *cache = &ring->page_cache; + + if (cache->index >= MLX4_EN_CACHE_SIZE) + return false; + + cache->buf[cache->index].page = frame->page; + cache->buf[cache->index].dma = frame->dma; + cache->index++; + return true; +} + +void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring **pring, + u32 size, u16 stride) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_rx_ring *ring = *pring; + struct bpf_prog *old_prog; + + old_prog = rcu_dereference_protected( + ring->xdp_prog, + lockdep_is_held(&mdev->state_lock)); + if (old_prog) + bpf_prog_put(old_prog); + xdp_rxq_info_unreg(&ring->xdp_rxq); + mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE); + kvfree(ring->rx_info); + ring->rx_info = NULL; + kfree(ring); + *pring = NULL; +} + +void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring) +{ + int i; + + for (i = 0; i < ring->page_cache.index; i++) { + dma_unmap_page(priv->ddev, ring->page_cache.buf[i].dma, + PAGE_SIZE, priv->dma_dir); + put_page(ring->page_cache.buf[i].page); + } + ring->page_cache.index = 0; + mlx4_en_free_rx_buf(priv, ring); + if (ring->stride <= TXBB_SIZE) + ring->buf -= TXBB_SIZE; +} + + +static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_rx_alloc *frags, + struct sk_buff *skb, + int length) +{ + const struct mlx4_en_frag_info *frag_info = priv->frag_info; + unsigned int truesize = 0; + int nr, frag_size; + struct page *page; + dma_addr_t dma; + bool release; + + /* Collect used fragments while replacing them in the HW descriptors */ + for (nr = 0;; frags++) { + frag_size = min_t(int, length, frag_info->frag_size); + + page = frags->page; + if (unlikely(!page)) + goto fail; + + dma = frags->dma; + dma_sync_single_range_for_cpu(priv->ddev, dma, frags->page_offset, + frag_size, priv->dma_dir); + + __skb_fill_page_desc(skb, nr, page, frags->page_offset, + frag_size); + + truesize += frag_info->frag_stride; + if (frag_info->frag_stride == PAGE_SIZE / 2) { + frags->page_offset ^= PAGE_SIZE 
/ 2; + release = page_count(page) != 1 || + page_is_pfmemalloc(page) || + page_to_nid(page) != numa_mem_id(); + } else { + u32 sz_align = ALIGN(frag_size, SMP_CACHE_BYTES); + + frags->page_offset += sz_align; + release = frags->page_offset + frag_info->frag_size > PAGE_SIZE; + } + if (release) { + dma_unmap_page(priv->ddev, dma, PAGE_SIZE, priv->dma_dir); + frags->page = NULL; + } else { + page_ref_inc(page); + } + + nr++; + length -= frag_size; + if (!length) + break; + frag_info++; + } + skb->truesize += truesize; + return nr; + +fail: + while (nr > 0) { + nr--; + __skb_frag_unref(skb_shinfo(skb)->frags + nr); + } + return 0; +} + +static void validate_loopback(struct mlx4_en_priv *priv, void *va) +{ + const unsigned char *data = va + ETH_HLEN; + int i; + + for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++) { + if (data[i] != (unsigned char)i) + return; + } + /* Loopback found */ + priv->loopback_ok = 1; +} + +static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring) +{ + u32 missing = ring->actual_size - (ring->prod - ring->cons); + + /* Try to batch allocations, but not too much. */ + if (missing < 8) + return; + do { + if (mlx4_en_prepare_rx_desc(priv, ring, + ring->prod & ring->size_mask, + GFP_ATOMIC | __GFP_MEMALLOC)) + break; + ring->prod++; + } while (likely(--missing)); + + mlx4_en_update_rx_prod_db(ring); +} + +/* When hardware doesn't strip the vlan, we need to calculate the checksum + * over it and add it to the hardware's checksum calculation + */ +static inline __wsum get_fixed_vlan_csum(__wsum hw_checksum, + struct vlan_hdr *vlanh) +{ + return csum_add(hw_checksum, *(__wsum *)vlanh); +} + +/* Although the stack expects checksum which doesn't include the pseudo + * header, the HW adds it. To address that, we are subtracting the pseudo + * header checksum from the checksum value provided by the HW. + */ +static int get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb, + struct iphdr *iph) +{ + __u16 length_for_csum = 0; + __wsum csum_pseudo_header = 0; + __u8 ipproto = iph->protocol; + + if (unlikely(ipproto == IPPROTO_SCTP)) + return -1; + + length_for_csum = (be16_to_cpu(iph->tot_len) - (iph->ihl << 2)); + csum_pseudo_header = csum_tcpudp_nofold(iph->saddr, iph->daddr, + length_for_csum, ipproto, 0); + skb->csum = csum_sub(hw_checksum, csum_pseudo_header); + return 0; +} + +#if IS_ENABLED(CONFIG_IPV6) +/* In IPv6 packets, besides subtracting the pseudo header checksum, + * we also compute/add the IP header checksum which + * is not added by the HW. + */ +static int get_fixed_ipv6_csum(__wsum hw_checksum, struct sk_buff *skb, + struct ipv6hdr *ipv6h) +{ + __u8 nexthdr = ipv6h->nexthdr; + __wsum csum_pseudo_hdr = 0; + + if (unlikely(nexthdr == IPPROTO_FRAGMENT || + nexthdr == IPPROTO_HOPOPTS || + nexthdr == IPPROTO_SCTP)) + return -1; + hw_checksum = csum_add(hw_checksum, (__force __wsum)htons(nexthdr)); + + csum_pseudo_hdr = csum_partial(&ipv6h->saddr, + sizeof(ipv6h->saddr) + sizeof(ipv6h->daddr), 0); + csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ipv6h->payload_len); + csum_pseudo_hdr = csum_add(csum_pseudo_hdr, + (__force __wsum)htons(nexthdr)); + + skb->csum = csum_sub(hw_checksum, csum_pseudo_hdr); + skb->csum = csum_add(skb->csum, csum_partial(ipv6h, sizeof(struct ipv6hdr), 0)); + return 0; +} +#endif + +/* We reach this function only after checking that any of + * the (IPv4 | IPv6) bits are set in cqe->status. 
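+ * It folds the VLAN header into the HW checksum when the HW did not
+ * strip it, then subtracts the pseudo-header so the result can be used
+ * as CHECKSUM_COMPLETE by the caller.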
+ */ +static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va, + netdev_features_t dev_features) +{ + __wsum hw_checksum = 0; + + void *hdr = (u8 *)va + sizeof(struct ethhdr); + + hw_checksum = csum_unfold((__force __sum16)cqe->checksum); + + if (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK) && + !(dev_features & NETIF_F_HW_VLAN_CTAG_RX)) { + hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr); + hdr += sizeof(struct vlan_hdr); + } + +#if IS_ENABLED(CONFIG_IPV6) + if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) + return get_fixed_ipv6_csum(hw_checksum, skb, hdr); +#endif + return get_fixed_ipv4_csum(hw_checksum, skb, hdr); +} + +#if IS_ENABLED(CONFIG_IPV6) +#define MLX4_CQE_STATUS_IP_ANY (MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPV6) +#else +#define MLX4_CQE_STATUS_IP_ANY (MLX4_CQE_STATUS_IPV4) +#endif + +int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int factor = priv->cqe_factor; + struct mlx4_en_rx_ring *ring; + struct bpf_prog *xdp_prog; + int cq_ring = cq->ring; + bool doorbell_pending; + struct mlx4_cqe *cqe; + struct xdp_buff xdp; + int polled = 0; + int index; + + if (unlikely(!priv->port_up || budget <= 0)) + return 0; + + ring = priv->rx_ring[cq_ring]; + + /* Protect accesses to: ring->xdp_prog, priv->mac_hash list */ + rcu_read_lock(); + xdp_prog = rcu_dereference(ring->xdp_prog); + xdp.rxq = &ring->xdp_rxq; + doorbell_pending = 0; + + /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx + * descriptor offset can be deduced from the CQE index instead of + * reading 'cqe->index' */ + index = cq->mcq.cons_index & ring->size_mask; + cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor; + + /* Process all completed CQEs */ + while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, + cq->mcq.cons_index & cq->size)) { + struct mlx4_en_rx_alloc *frags; + enum pkt_hash_types hash_type; + struct sk_buff *skb; + unsigned int length; + int ip_summed; + void *va; + int nr; + + frags = ring->rx_info + (index << priv->log_rx_info); + va = page_address(frags[0].page) + frags[0].page_offset; + prefetchw(va); + /* + * make sure we read the CQE after we read the ownership bit + */ + dma_rmb(); + + /* Drop packet on bad receive or bad checksum */ + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR)) { + en_err(priv, "CQE completed in error - vendor syndrom:%d syndrom:%d\n", + ((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome, + ((struct mlx4_err_cqe *)cqe)->syndrome); + goto next; + } + if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) { + en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n"); + goto next; + } + + /* Check if we need to drop the packet if SRIOV is not enabled + * and not performing the selftest or flb disabled + */ + if (priv->flags & MLX4_EN_FLAG_RX_FILTER_NEEDED) { + const struct ethhdr *ethh = va; + dma_addr_t dma; + /* Get pointer to first fragment since we haven't + * skb yet and cast it to ethhdr struct + */ + dma = frags[0].dma + frags[0].page_offset; + dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh), + DMA_FROM_DEVICE); + + if (is_multicast_ether_addr(ethh->h_dest)) { + struct mlx4_mac_entry *entry; + struct hlist_head *bucket; + unsigned int mac_hash; + + /* Drop the packet, since HW loopback-ed it */ + mac_hash = ethh->h_source[MLX4_EN_MAC_HASH_IDX]; + bucket = &priv->mac_hash[mac_hash]; + hlist_for_each_entry_rcu(entry, bucket, hlist) { + if (ether_addr_equal_64bits(entry->mac, + 
ethh->h_source)) + goto next; + } + } + } + + if (unlikely(priv->validate_loopback)) { + validate_loopback(priv, va); + goto next; + } + + /* + * Packet is OK - process it. + */ + length = be32_to_cpu(cqe->byte_cnt); + length -= ring->fcs_del; + + /* A bpf program gets first chance to drop the packet. It may + * read bytes but not past the end of the frag. + */ + if (xdp_prog) { + dma_addr_t dma; + void *orig_data; + u32 act; + + dma = frags[0].dma + frags[0].page_offset; + dma_sync_single_for_cpu(priv->ddev, dma, + priv->frag_info[0].frag_size, + DMA_FROM_DEVICE); + + xdp.data_hard_start = va - frags[0].page_offset; + xdp.data = va; + xdp_set_data_meta_invalid(&xdp); + xdp.data_end = xdp.data + length; + orig_data = xdp.data; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + if (xdp.data != orig_data) { + length = xdp.data_end - xdp.data; + frags[0].page_offset = xdp.data - + xdp.data_hard_start; + va = xdp.data; + } + + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + if (likely(!mlx4_en_xmit_frame(ring, frags, priv, + length, cq_ring, + &doorbell_pending))) { + frags[0].page = NULL; + goto next; + } + trace_xdp_exception(dev, xdp_prog, act); + goto xdp_drop_no_cnt; /* Drop on xmit failure */ + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + case XDP_DROP: + ring->xdp_drop++; +xdp_drop_no_cnt: + goto next; + } + } + + ring->bytes += length; + ring->packets++; + + skb = napi_get_frags(&cq->napi); + if (unlikely(!skb)) + goto next; + + if (unlikely(ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL)) { + u64 timestamp = mlx4_en_get_cqe_ts(cqe); + + mlx4_en_fill_hwtstamps(priv->mdev, skb_hwtstamps(skb), + timestamp); + } + skb_record_rx_queue(skb, cq_ring); + + if (likely(dev->features & NETIF_F_RXCSUM)) { + if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP | + MLX4_CQE_STATUS_UDP)) && + (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) && + cqe->checksum == cpu_to_be16(0xffff)) { + bool l2_tunnel; + + l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) && + (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL)); + ip_summed = CHECKSUM_UNNECESSARY; + hash_type = PKT_HASH_TYPE_L4; + if (l2_tunnel) + skb->csum_level = 1; + ring->csum_ok++; + } else { + if (!(priv->flags & MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP && + (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IP_ANY)))) + goto csum_none; + if (check_csum(cqe, skb, va, dev->features)) + goto csum_none; + ip_summed = CHECKSUM_COMPLETE; + hash_type = PKT_HASH_TYPE_L3; + ring->csum_complete++; + } + } else { +csum_none: + ip_summed = CHECKSUM_NONE; + hash_type = PKT_HASH_TYPE_L3; + ring->csum_none++; + } + skb->ip_summed = ip_summed; + if (dev->features & NETIF_F_RXHASH) + skb_set_hash(skb, + be32_to_cpu(cqe->immed_rss_invalid), + hash_type); + + if ((cqe->vlan_my_qpn & + cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK)) && + (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), + be16_to_cpu(cqe->sl_vid)); + else if ((cqe->vlan_my_qpn & + cpu_to_be32(MLX4_CQE_SVLAN_PRESENT_MASK)) && + (dev->features & NETIF_F_HW_VLAN_STAG_RX)) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), + be16_to_cpu(cqe->sl_vid)); + + nr = mlx4_en_complete_rx_desc(priv, frags, skb, length); + if (likely(nr)) { + skb_shinfo(skb)->nr_frags = nr; + skb->len = length; + skb->data_len = length; + napi_gro_frags(&cq->napi); + } else { + skb->vlan_tci = 0; + skb_clear_hash(skb); + } +next: + ++cq->mcq.cons_index; + index = (cq->mcq.cons_index) & ring->size_mask; + cqe = 
mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor; + if (unlikely(++polled == budget)) + break; + } + + rcu_read_unlock(); + + if (likely(polled)) { + if (doorbell_pending) { + priv->tx_cq[TX_XDP][cq_ring]->xdp_busy = true; + mlx4_en_xmit_doorbell(priv->tx_ring[TX_XDP][cq_ring]); + } + + mlx4_cq_set_ci(&cq->mcq); + wmb(); /* ensure HW sees CQ consumer before we post new buffers */ + ring->cons = cq->mcq.cons_index; + } + AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled); + + mlx4_en_refill_rx_buffers(priv, ring); + + return polled; +} + + +void mlx4_en_rx_irq(struct mlx4_cq *mcq) +{ + struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); + struct mlx4_en_priv *priv = netdev_priv(cq->dev); + + if (likely(priv->port_up)) + napi_schedule_irqoff(&cq->napi); + else + mlx4_en_arm_cq(priv, cq); +} + +/* Rx CQ polling - called by NAPI */ +int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) +{ + struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi); + struct net_device *dev = cq->dev; + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_cq *xdp_tx_cq = NULL; + bool clean_complete = true; + int done; + + if (priv->tx_ring_num[TX_XDP]) { + xdp_tx_cq = priv->tx_cq[TX_XDP][cq->ring]; + if (xdp_tx_cq->xdp_busy) { + clean_complete = mlx4_en_process_tx_cq(dev, xdp_tx_cq, + budget); + xdp_tx_cq->xdp_busy = !clean_complete; + } + } + + done = mlx4_en_process_rx_cq(dev, cq, budget); + + /* If we used up all the quota - we're probably not done yet... */ + if (done == budget || !clean_complete) { + const struct cpumask *aff; + struct irq_data *idata; + int cpu_curr; + + /* in case we got here because of !clean_complete */ + done = budget; + + INC_PERF_COUNTER(priv->pstats.napi_quota); + + cpu_curr = smp_processor_id(); + idata = irq_desc_get_irq_data(cq->irq_desc); + aff = irq_data_get_affinity_mask(idata); + + if (likely(cpumask_test_cpu(cpu_curr, aff))) + return budget; + + /* Current cpu is not according to smp_irq_affinity - + * probably affinity changed. Need to stop this NAPI + * poll, and restart it on the right CPU. + * Try to avoid returning a too small value (like 0), + * to not fool net_rx_action() and its netdev_budget + */ + if (done) + done--; + } + /* Done for now */ + if (likely(napi_complete_done(napi, done))) + mlx4_en_arm_cq(priv, cq); + return done; +} + +void mlx4_en_calc_rx_buf(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu); + int i = 0; + + /* bpf requires buffers to be set up as 1 packet per page. + * This only works when num_frags == 1. + */ + if (priv->tx_ring_num[TX_XDP]) { + priv->frag_info[0].frag_size = eff_mtu; + /* This will gain efficient xdp frame recycling at the + * expense of more costly truesize accounting + */ + priv->frag_info[0].frag_stride = PAGE_SIZE; + priv->dma_dir = PCI_DMA_BIDIRECTIONAL; + priv->rx_headroom = XDP_PACKET_HEADROOM; + i = 1; + } else { + int frag_size_max = 2048, buf_size = 0; + + /* should not happen, right ? 
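+		 * If eff_mtu cannot be carried by MLX4_EN_MAX_RX_FRAGS chunks of
+		 * 2048 bytes, fall back to page-sized fragments.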
*/ + if (eff_mtu > PAGE_SIZE + (MLX4_EN_MAX_RX_FRAGS - 1) * 2048) + frag_size_max = PAGE_SIZE; + + while (buf_size < eff_mtu) { + int frag_stride, frag_size = eff_mtu - buf_size; + int pad, nb; + + if (i < MLX4_EN_MAX_RX_FRAGS - 1) + frag_size = min(frag_size, frag_size_max); + + priv->frag_info[i].frag_size = frag_size; + frag_stride = ALIGN(frag_size, SMP_CACHE_BYTES); + /* We can only pack 2 1536-bytes frames in on 4K page + * Therefore, each frame would consume more bytes (truesize) + */ + nb = PAGE_SIZE / frag_stride; + pad = (PAGE_SIZE - nb * frag_stride) / nb; + pad &= ~(SMP_CACHE_BYTES - 1); + priv->frag_info[i].frag_stride = frag_stride + pad; + + buf_size += frag_size; + i++; + } + priv->dma_dir = PCI_DMA_FROMDEVICE; + priv->rx_headroom = 0; + } + + priv->num_frags = i; + priv->rx_skb_size = eff_mtu; + priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct mlx4_en_rx_alloc)); + + en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d num_frags:%d):\n", + eff_mtu, priv->num_frags); + for (i = 0; i < priv->num_frags; i++) { + en_dbg(DRV, + priv, + " frag:%d - size:%d stride:%d\n", + i, + priv->frag_info[i].frag_size, + priv->frag_info[i].frag_stride); + } +} + +/* RSS related functions */ + +static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn, + struct mlx4_en_rx_ring *ring, + enum mlx4_qp_state *state, + struct mlx4_qp *qp) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_qp_context *context; + int err = 0; + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + err = mlx4_qp_alloc(mdev->dev, qpn, qp); + if (err) { + en_err(priv, "Failed to allocate qp #%x\n", qpn); + goto out; + } + qp->event = mlx4_en_sqp_event; + + memset(context, 0, sizeof(*context)); + mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride, 0, 0, + qpn, ring->cqn, -1, context); + context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma); + + /* Cancel FCS removal if FW allows */ + if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) { + context->param3 |= cpu_to_be32(1 << 29); + if (priv->dev->features & NETIF_F_RXFCS) + ring->fcs_del = 0; + else + ring->fcs_del = ETH_FCS_LEN; + } else + ring->fcs_del = 0; + + err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state); + if (err) { + mlx4_qp_remove(mdev->dev, qp); + mlx4_qp_free(mdev->dev, qp); + } + mlx4_en_update_rx_prod_db(ring); +out: + kfree(context); + return err; +} + +int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) +{ + int err; + u32 qpn; + + err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, + MLX4_RESERVE_A0_QP, + MLX4_RES_USAGE_DRIVER); + if (err) { + en_err(priv, "Failed reserving drop qpn\n"); + return err; + } + err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp); + if (err) { + en_err(priv, "Failed allocating drop qp\n"); + mlx4_qp_release_range(priv->mdev->dev, qpn, 1); + return err; + } + + return 0; +} + +void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv) +{ + u32 qpn; + + qpn = priv->drop_qp.qpn; + mlx4_qp_remove(priv->mdev->dev, &priv->drop_qp); + mlx4_qp_free(priv->mdev->dev, &priv->drop_qp); + mlx4_qp_release_range(priv->mdev->dev, qpn, 1); +} + +/* Allocate rx qp's and configure them according to rss map */ +int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_rss_map *rss_map = &priv->rss_map; + struct mlx4_qp_context context; + struct mlx4_rss_context *rss_context; + int rss_rings; + void *ptr; + u8 rss_mask = (MLX4_RSS_IPV4 | MLX4_RSS_TCP_IPV4 | MLX4_RSS_IPV6 | + 
MLX4_RSS_TCP_IPV6); + int i, qpn; + int err = 0; + int good_qps = 0; + u8 flags; + + en_dbg(DRV, priv, "Configuring rss steering\n"); + + flags = priv->rx_ring_num == 1 ? MLX4_RESERVE_A0_QP : 0; + err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num, + priv->rx_ring_num, + &rss_map->base_qpn, flags, + MLX4_RES_USAGE_DRIVER); + if (err) { + en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num); + return err; + } + + for (i = 0; i < priv->rx_ring_num; i++) { + qpn = rss_map->base_qpn + i; + err = mlx4_en_config_rss_qp(priv, qpn, priv->rx_ring[i], + &rss_map->state[i], + &rss_map->qps[i]); + if (err) + goto rss_err; + + ++good_qps; + } + + if (priv->rx_ring_num == 1) { + rss_map->indir_qp = &rss_map->qps[0]; + priv->base_qpn = rss_map->indir_qp->qpn; + en_info(priv, "Optimized Non-RSS steering\n"); + return 0; + } + + rss_map->indir_qp = kzalloc(sizeof(*rss_map->indir_qp), GFP_KERNEL); + if (!rss_map->indir_qp) { + err = -ENOMEM; + goto rss_err; + } + + /* Configure RSS indirection qp */ + err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, rss_map->indir_qp); + if (err) { + en_err(priv, "Failed to allocate RSS indirection QP\n"); + goto rss_err; + } + + rss_map->indir_qp->event = mlx4_en_sqp_event; + mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn, + priv->rx_ring[0]->cqn, -1, &context); + + if (!priv->prof->rss_rings || priv->prof->rss_rings > priv->rx_ring_num) + rss_rings = priv->rx_ring_num; + else + rss_rings = priv->prof->rss_rings; + + ptr = ((void *) &context) + offsetof(struct mlx4_qp_context, pri_path) + + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH; + rss_context = ptr; + rss_context->base_qpn = cpu_to_be32(ilog2(rss_rings) << 24 | + (rss_map->base_qpn)); + rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn); + if (priv->mdev->profile.udp_rss) { + rss_mask |= MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6; + rss_context->base_qpn_udp = rss_context->default_qpn; + } + + if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + en_info(priv, "Setting RSS context tunnel type to RSS on inner headers\n"); + rss_mask |= MLX4_RSS_BY_INNER_HEADERS; + } + + rss_context->flags = rss_mask; + rss_context->hash_fn = MLX4_RSS_HASH_TOP; + if (priv->rss_hash_fn == ETH_RSS_HASH_XOR) { + rss_context->hash_fn = MLX4_RSS_HASH_XOR; + } else if (priv->rss_hash_fn == ETH_RSS_HASH_TOP) { + rss_context->hash_fn = MLX4_RSS_HASH_TOP; + memcpy(rss_context->rss_key, priv->rss_key, + MLX4_EN_RSS_KEY_SIZE); + } else { + en_err(priv, "Unknown RSS hash function requested\n"); + err = -EINVAL; + goto indir_err; + } + + err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context, + rss_map->indir_qp, &rss_map->indir_state); + if (err) + goto indir_err; + + return 0; + +indir_err: + mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state, + MLX4_QP_STATE_RST, NULL, 0, 0, rss_map->indir_qp); + mlx4_qp_remove(mdev->dev, rss_map->indir_qp); + mlx4_qp_free(mdev->dev, rss_map->indir_qp); + kfree(rss_map->indir_qp); + rss_map->indir_qp = NULL; +rss_err: + for (i = 0; i < good_qps; i++) { + mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i], + MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]); + mlx4_qp_remove(mdev->dev, &rss_map->qps[i]); + mlx4_qp_free(mdev->dev, &rss_map->qps[i]); + } + mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num); + return err; +} + +void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_rss_map *rss_map = &priv->rss_map; + int i; + + if (priv->rx_ring_num > 1) { + mlx4_qp_modify(mdev->dev, NULL, 
rss_map->indir_state, + MLX4_QP_STATE_RST, NULL, 0, 0, + rss_map->indir_qp); + mlx4_qp_remove(mdev->dev, rss_map->indir_qp); + mlx4_qp_free(mdev->dev, rss_map->indir_qp); + kfree(rss_map->indir_qp); + rss_map->indir_qp = NULL; + } + + for (i = 0; i < priv->rx_ring_num; i++) { + mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i], + MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]); + mlx4_qp_remove(mdev->dev, &rss_map->qps[i]); + mlx4_qp_free(mdev->dev, &rss_map->qps[i]); + } + mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c new file mode 100644 index 0000000..946d9db --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include + +#include "mlx4_en.h" + + +static int mlx4_en_test_registers(struct mlx4_en_priv *priv) +{ + return mlx4_cmd(priv->mdev->dev, 0, 0, 0, MLX4_CMD_HW_HEALTH_CHECK, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +static int mlx4_en_test_loopback_xmit(struct mlx4_en_priv *priv) +{ + struct sk_buff *skb; + struct ethhdr *ethh; + unsigned char *packet; + unsigned int packet_size = MLX4_LOOPBACK_TEST_PAYLOAD; + unsigned int i; + int err; + + + /* build the pkt before xmit */ + skb = netdev_alloc_skb(priv->dev, MLX4_LOOPBACK_TEST_PAYLOAD + ETH_HLEN + NET_IP_ALIGN); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, NET_IP_ALIGN); + + ethh = skb_put(skb, sizeof(struct ethhdr)); + packet = skb_put(skb, packet_size); + memcpy(ethh->h_dest, priv->dev->dev_addr, ETH_ALEN); + eth_zero_addr(ethh->h_source); + ethh->h_proto = htons(ETH_P_ARP); + skb_reset_mac_header(skb); + for (i = 0; i < packet_size; ++i) /* fill our packet */ + packet[i] = (unsigned char)(i & 0xff); + + /* xmit the pkt */ + err = mlx4_en_xmit(skb, priv->dev); + return err; +} + +static int mlx4_en_test_loopback(struct mlx4_en_priv *priv) +{ + u32 loopback_ok = 0; + int i; + + priv->loopback_ok = 0; + priv->validate_loopback = 1; + + mlx4_en_update_loopback_state(priv->dev, priv->dev->features); + + /* xmit */ + if (mlx4_en_test_loopback_xmit(priv)) { + en_err(priv, "Transmitting loopback packet failed\n"); + goto mlx4_en_test_loopback_exit; + } + + /* polling for result */ + for (i = 0; i < MLX4_EN_LOOPBACK_RETRIES; ++i) { + msleep(MLX4_EN_LOOPBACK_TIMEOUT); + if (priv->loopback_ok) { + loopback_ok = 1; + break; + } + } + if (!loopback_ok) + en_err(priv, "Loopback packet didn't arrive\n"); + +mlx4_en_test_loopback_exit: + + priv->validate_loopback = 0; + + mlx4_en_update_loopback_state(priv->dev, priv->dev->features); + return !loopback_ok; +} + +static int mlx4_en_test_interrupts(struct mlx4_en_priv *priv) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int err = 0; + int i = 0; + + err = mlx4_test_async(mdev->dev); + /* When not in MSI_X or slave, test only async */ + if (!(mdev->dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(mdev->dev)) + return err; + + /* A loop over all completion vectors of current port, + * for each vector check whether it works by mapping command + * completions to that vector and performing a NOP command + */ + for (i = 0; i < priv->rx_ring_num; i++) { + err = mlx4_test_interrupt(mdev->dev, priv->rx_cq[i]->vector); + if (err) + break; + } + + return err; +} + +static int mlx4_en_test_link(struct mlx4_en_priv *priv) +{ + if (mlx4_en_QUERY_PORT(priv->mdev, priv->port)) + return -ENOMEM; + if (priv->port_state.link_state == 1) + return 0; + else + return 1; +} + +static int mlx4_en_test_speed(struct mlx4_en_priv *priv) +{ + + if (mlx4_en_QUERY_PORT(priv->mdev, priv->port)) + return -ENOMEM; + + /* The device supports 100M, 1G, 10G, 20G, 40G and 56G speed */ + if (priv->port_state.link_speed != SPEED_100 && + priv->port_state.link_speed != SPEED_1000 && + priv->port_state.link_speed != SPEED_10000 && + priv->port_state.link_speed != SPEED_20000 && + priv->port_state.link_speed != SPEED_40000 && + priv->port_state.link_speed != SPEED_56000) + return priv->port_state.link_speed; + + return 0; +} + + +void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int i, carrier_ok; + + memset(buf, 0, sizeof(u64) * MLX4_EN_NUM_SELF_TEST); + + if (*flags & ETH_TEST_FL_OFFLINE) { + /* disable the 
interface */ + carrier_ok = netif_carrier_ok(dev); + + netif_carrier_off(dev); + /* Wait until all tx queues are empty. + * there should not be any additional incoming traffic + * since we turned the carrier off */ + msleep(200); + + if (priv->mdev->dev->caps.flags & + MLX4_DEV_CAP_FLAG_UC_LOOPBACK) { + buf[3] = mlx4_en_test_registers(priv); + if (priv->port_up && dev->mtu >= MLX4_SELFTEST_LB_MIN_MTU) + buf[4] = mlx4_en_test_loopback(priv); + } + + if (carrier_ok) + netif_carrier_on(dev); + + } + buf[0] = mlx4_en_test_interrupts(priv); + buf[1] = mlx4_en_test_link(priv); + buf[2] = mlx4_en_test_speed(priv); + + for (i = 0; i < MLX4_EN_NUM_SELF_TEST; i++) { + if (buf[i]) + *flags |= ETH_TEST_FL_FAILED; + } +} diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c new file mode 100644 index 0000000..6b68537 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -0,0 +1,1184 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx4_en.h" + +int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring **pring, u32 size, + u16 stride, int node, int queue_index) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_tx_ring *ring; + int tmp; + int err; + + ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node); + if (!ring) { + ring = kzalloc(sizeof(*ring), GFP_KERNEL); + if (!ring) { + en_err(priv, "Failed allocating TX ring\n"); + return -ENOMEM; + } + } + + ring->size = size; + ring->size_mask = size - 1; + ring->sp_stride = stride; + ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS; + + tmp = size * sizeof(struct mlx4_en_tx_info); + ring->tx_info = kvmalloc_node(tmp, GFP_KERNEL, node); + if (!ring->tx_info) { + err = -ENOMEM; + goto err_ring; + } + + en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n", + ring->tx_info, tmp); + + ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node); + if (!ring->bounce_buf) { + ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL); + if (!ring->bounce_buf) { + err = -ENOMEM; + goto err_info; + } + } + ring->buf_size = ALIGN(size * ring->sp_stride, MLX4_EN_PAGE_SIZE); + + /* Allocate HW buffers on provided NUMA node */ + set_dev_node(&mdev->dev->persist->pdev->dev, node); + err = mlx4_alloc_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size); + set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node); + if (err) { + en_err(priv, "Failed allocating hwq resources\n"); + goto err_bounce; + } + + ring->buf = ring->sp_wqres.buf.direct.buf; + + en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n", + ring, ring->buf, ring->size, ring->buf_size, + (unsigned long long) ring->sp_wqres.buf.direct.map); + + err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn, + MLX4_RESERVE_ETH_BF_QP, + MLX4_RES_USAGE_DRIVER); + if (err) { + en_err(priv, "failed reserving qp for TX ring\n"); + goto err_hwq_res; + } + + err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->sp_qp); + if (err) { + en_err(priv, "Failed allocating qp %d\n", ring->qpn); + goto err_reserve; + } + ring->sp_qp.event = mlx4_en_sqp_event; + + err = mlx4_bf_alloc(mdev->dev, &ring->bf, node); + if (err) { + en_dbg(DRV, priv, "working without blueflame (%d)\n", err); + ring->bf.uar = &mdev->priv_uar; + ring->bf.uar->map = mdev->uar_map; + ring->bf_enabled = false; + ring->bf_alloced = false; + priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME; + } else { + ring->bf_alloced = true; + ring->bf_enabled = !!(priv->pflags & + MLX4_EN_PRIV_FLAGS_BLUEFLAME); + } + + ring->hwtstamp_tx_type = priv->hwtstamp_config.tx_type; + ring->queue_index = queue_index; + + if (queue_index < priv->num_tx_rings_p_up) + cpumask_set_cpu(cpumask_local_spread(queue_index, + priv->mdev->dev->numa_node), + &ring->sp_affinity_mask); + + *pring = ring; + return 0; + +err_reserve: + mlx4_qp_release_range(mdev->dev, ring->qpn, 1); +err_hwq_res: + mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size); +err_bounce: + kfree(ring->bounce_buf); + ring->bounce_buf = NULL; +err_info: + kvfree(ring->tx_info); + ring->tx_info = NULL; +err_ring: + kfree(ring); + *pring = NULL; + return err; +} + +void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring **pring) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_tx_ring *ring = *pring; + en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn); + + if 
(ring->bf_alloced) + mlx4_bf_free(mdev->dev, &ring->bf); + mlx4_qp_remove(mdev->dev, &ring->sp_qp); + mlx4_qp_free(mdev->dev, &ring->sp_qp); + mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1); + mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size); + kfree(ring->bounce_buf); + ring->bounce_buf = NULL; + kvfree(ring->tx_info); + ring->tx_info = NULL; + kfree(ring); + *pring = NULL; +} + +int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int cq, int user_prio) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + ring->sp_cqn = cq; + ring->prod = 0; + ring->cons = 0xffffffff; + ring->last_nr_txbb = 1; + memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info)); + memset(ring->buf, 0, ring->buf_size); + ring->free_tx_desc = mlx4_en_free_tx_desc; + + ring->sp_qp_state = MLX4_QP_STATE_RST; + ring->doorbell_qpn = cpu_to_be32(ring->sp_qp.qpn << 8); + ring->mr_key = cpu_to_be32(mdev->mr.key); + + mlx4_en_fill_qp_context(priv, ring->size, ring->sp_stride, 1, 0, ring->qpn, + ring->sp_cqn, user_prio, &ring->sp_context); + if (ring->bf_alloced) + ring->sp_context.usr_page = + cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev, + ring->bf.uar->index)); + + err = mlx4_qp_to_ready(mdev->dev, &ring->sp_wqres.mtt, &ring->sp_context, + &ring->sp_qp, &ring->sp_qp_state); + if (!cpumask_empty(&ring->sp_affinity_mask)) + netif_set_xps_queue(priv->dev, &ring->sp_affinity_mask, + ring->queue_index); + + return err; +} + +void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring) +{ + struct mlx4_en_dev *mdev = priv->mdev; + + mlx4_qp_modify(mdev->dev, NULL, ring->sp_qp_state, + MLX4_QP_STATE_RST, NULL, 0, 0, &ring->sp_qp); +} + +static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring) +{ + return ring->prod - ring->cons > ring->full_size; +} + +static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, int index, + u8 owner) +{ + __be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT)); + struct mlx4_en_tx_desc *tx_desc = ring->buf + (index << LOG_TXBB_SIZE); + struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; + void *end = ring->buf + ring->buf_size; + __be32 *ptr = (__be32 *)tx_desc; + int i; + + /* Optimize the common case when there are no wraparounds */ + if (likely((void *)tx_desc + + (tx_info->nr_txbb << LOG_TXBB_SIZE) <= end)) { + /* Stamp the freed descriptor */ + for (i = 0; i < tx_info->nr_txbb << LOG_TXBB_SIZE; + i += STAMP_STRIDE) { + *ptr = stamp; + ptr += STAMP_DWORDS; + } + } else { + /* Stamp the freed descriptor */ + for (i = 0; i < tx_info->nr_txbb << LOG_TXBB_SIZE; + i += STAMP_STRIDE) { + *ptr = stamp; + ptr += STAMP_DWORDS; + if ((void *)ptr >= end) { + ptr = ring->buf; + stamp ^= cpu_to_be32(0x80000000); + } + } + } +} + + +u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u64 timestamp, + int napi_mode) +{ + struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; + struct mlx4_en_tx_desc *tx_desc = ring->buf + (index << LOG_TXBB_SIZE); + struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset; + void *end = ring->buf + ring->buf_size; + struct sk_buff *skb = tx_info->skb; + int nr_maps = tx_info->nr_maps; + int i; + + /* We do not touch skb here, so prefetch skb->users location + * to speedup consume_skb() + */ + prefetchw(&skb->users); + + if (unlikely(timestamp)) { + struct skb_shared_hwtstamps hwts; + + mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp); + 
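+		/* deliver the HW TX timestamp to the socket (SO_TIMESTAMPING) */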
skb_tstamp_tx(skb, &hwts); + } + + if (!tx_info->inl) { + if (tx_info->linear) + dma_unmap_single(priv->ddev, + tx_info->map0_dma, + tx_info->map0_byte_count, + PCI_DMA_TODEVICE); + else + dma_unmap_page(priv->ddev, + tx_info->map0_dma, + tx_info->map0_byte_count, + PCI_DMA_TODEVICE); + /* Optimize the common case when there are no wraparounds */ + if (likely((void *)tx_desc + + (tx_info->nr_txbb << LOG_TXBB_SIZE) <= end)) { + for (i = 1; i < nr_maps; i++) { + data++; + dma_unmap_page(priv->ddev, + (dma_addr_t)be64_to_cpu(data->addr), + be32_to_cpu(data->byte_count), + PCI_DMA_TODEVICE); + } + } else { + if ((void *)data >= end) + data = ring->buf + ((void *)data - end); + + for (i = 1; i < nr_maps; i++) { + data++; + /* Check for wraparound before unmapping */ + if ((void *) data >= end) + data = ring->buf; + dma_unmap_page(priv->ddev, + (dma_addr_t)be64_to_cpu(data->addr), + be32_to_cpu(data->byte_count), + PCI_DMA_TODEVICE); + } + } + } + napi_consume_skb(skb, napi_mode); + + return tx_info->nr_txbb; +} + +u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u64 timestamp, + int napi_mode) +{ + struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; + struct mlx4_en_rx_alloc frame = { + .page = tx_info->page, + .dma = tx_info->map0_dma, + }; + + if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) { + dma_unmap_page(priv->ddev, tx_info->map0_dma, + PAGE_SIZE, priv->dma_dir); + put_page(tx_info->page); + } + + return tx_info->nr_txbb; +} + +int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int cnt = 0; + + /* Skip last polled descriptor */ + ring->cons += ring->last_nr_txbb; + en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n", + ring->cons, ring->prod); + + if ((u32) (ring->prod - ring->cons) > ring->size) { + if (netif_msg_tx_err(priv)) + en_warn(priv, "Tx consumer passed producer!\n"); + return 0; + } + + while (ring->cons != ring->prod) { + ring->last_nr_txbb = ring->free_tx_desc(priv, ring, + ring->cons & ring->size_mask, + 0, 0 /* Non-NAPI caller */); + ring->cons += ring->last_nr_txbb; + cnt++; + } + + if (ring->tx_queue) + netdev_tx_reset_queue(ring->tx_queue); + + if (cnt) + en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt); + + return cnt; +} + +bool mlx4_en_process_tx_cq(struct net_device *dev, + struct mlx4_en_cq *cq, int napi_budget) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_cq *mcq = &cq->mcq; + struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->type][cq->ring]; + struct mlx4_cqe *cqe; + u16 index, ring_index, stamp_index; + u32 txbbs_skipped = 0; + u32 txbbs_stamp = 0; + u32 cons_index = mcq->cons_index; + int size = cq->size; + u32 size_mask = ring->size_mask; + struct mlx4_cqe *buf = cq->buf; + u32 packets = 0; + u32 bytes = 0; + int factor = priv->cqe_factor; + int done = 0; + int budget = priv->tx_work_limit; + u32 last_nr_txbb; + u32 ring_cons; + + if (unlikely(!priv->port_up)) + return true; + + netdev_txq_bql_complete_prefetchw(ring->tx_queue); + + index = cons_index & size_mask; + cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor; + last_nr_txbb = READ_ONCE(ring->last_nr_txbb); + ring_cons = READ_ONCE(ring->cons); + ring_index = ring_cons & size_mask; + stamp_index = ring_index; + + /* Process all completed CQEs */ + while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, + cons_index & size) && (done < budget)) { + u16 new_index; + + /* + * make sure we read the CQE after we read the + 
* ownership bit + */ + dma_rmb(); + + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR)) { + struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe; + + en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n", + cqe_err->vendor_err_syndrome, + cqe_err->syndrome); + } + + /* Skip over last polled CQE */ + new_index = be16_to_cpu(cqe->wqe_index) & size_mask; + + do { + u64 timestamp = 0; + + txbbs_skipped += last_nr_txbb; + ring_index = (ring_index + last_nr_txbb) & size_mask; + + if (unlikely(ring->tx_info[ring_index].ts_requested)) + timestamp = mlx4_en_get_cqe_ts(cqe); + + /* free next descriptor */ + last_nr_txbb = ring->free_tx_desc( + priv, ring, ring_index, + timestamp, napi_budget); + + mlx4_en_stamp_wqe(priv, ring, stamp_index, + !!((ring_cons + txbbs_stamp) & + ring->size)); + stamp_index = ring_index; + txbbs_stamp = txbbs_skipped; + packets++; + bytes += ring->tx_info[ring_index].nr_bytes; + } while ((++done < budget) && (ring_index != new_index)); + + ++cons_index; + index = cons_index & size_mask; + cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor; + } + + /* + * To prevent CQ overflow we first update CQ consumer and only then + * the ring consumer. + */ + mcq->cons_index = cons_index; + mlx4_cq_set_ci(mcq); + wmb(); + + /* we want to dirty this cache line once */ + WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb); + WRITE_ONCE(ring->cons, ring_cons + txbbs_skipped); + + if (cq->type == TX_XDP) + return done < budget; + + netdev_tx_completed_queue(ring->tx_queue, packets, bytes); + + /* Wakeup Tx queue if this stopped, and ring is not full. + */ + if (netif_tx_queue_stopped(ring->tx_queue) && + !mlx4_en_is_tx_ring_full(ring)) { + netif_tx_wake_queue(ring->tx_queue); + ring->wake_queue++; + } + + return done < budget; +} + +void mlx4_en_tx_irq(struct mlx4_cq *mcq) +{ + struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq); + struct mlx4_en_priv *priv = netdev_priv(cq->dev); + + if (likely(priv->port_up)) + napi_schedule_irqoff(&cq->napi); + else + mlx4_en_arm_cq(priv, cq); +} + +/* TX CQ polling - called by NAPI */ +int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget) +{ + struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi); + struct net_device *dev = cq->dev; + struct mlx4_en_priv *priv = netdev_priv(dev); + bool clean_complete; + + clean_complete = mlx4_en_process_tx_cq(dev, cq, budget); + if (!clean_complete) + return budget; + + napi_complete(napi); + mlx4_en_arm_cq(priv, cq); + + return 0; +} + +static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + u32 index, + unsigned int desc_size) +{ + u32 copy = (ring->size - index) << LOG_TXBB_SIZE; + int i; + + for (i = desc_size - copy - 4; i >= 0; i -= 4) { + if ((i & (TXBB_SIZE - 1)) == 0) + wmb(); + + *((u32 *) (ring->buf + i)) = + *((u32 *) (ring->bounce_buf + copy + i)); + } + + for (i = copy - 4; i >= 4 ; i -= 4) { + if ((i & (TXBB_SIZE - 1)) == 0) + wmb(); + + *((u32 *)(ring->buf + (index << LOG_TXBB_SIZE) + i)) = + *((u32 *) (ring->bounce_buf + i)); + } + + /* Return real descriptor location */ + return ring->buf + (index << LOG_TXBB_SIZE); +} + +/* Decide if skb can be inlined in tx descriptor to avoid dma mapping + * + * It seems strange we do not simply use skb_copy_bits(). 
+ * This would allow to inline all skbs iff skb->len <= inline_thold + * + * Note that caller already checked skb was not a gso packet + */ +static bool is_inline(int inline_thold, const struct sk_buff *skb, + const struct skb_shared_info *shinfo, + void **pfrag) +{ + void *ptr; + + if (skb->len > inline_thold || !inline_thold) + return false; + + if (shinfo->nr_frags == 1) { + ptr = skb_frag_address_safe(&shinfo->frags[0]); + if (unlikely(!ptr)) + return false; + *pfrag = ptr; + return true; + } + if (shinfo->nr_frags) + return false; + return true; +} + +static int inline_size(const struct sk_buff *skb) +{ + if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg) + <= MLX4_INLINE_ALIGN) + return ALIGN(skb->len + CTRL_SIZE + + sizeof(struct mlx4_wqe_inline_seg), 16); + else + return ALIGN(skb->len + CTRL_SIZE + 2 * + sizeof(struct mlx4_wqe_inline_seg), 16); +} + +static int get_real_size(const struct sk_buff *skb, + const struct skb_shared_info *shinfo, + struct net_device *dev, + int *lso_header_size, + bool *inline_ok, + void **pfrag) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int real_size; + + if (shinfo->gso_size) { + *inline_ok = false; + if (skb->encapsulation) + *lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb); + else + *lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb); + real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE + + ALIGN(*lso_header_size + 4, DS_SIZE); + if (unlikely(*lso_header_size != skb_headlen(skb))) { + /* We add a segment for the skb linear buffer only if + * it contains data */ + if (*lso_header_size < skb_headlen(skb)) + real_size += DS_SIZE; + else { + if (netif_msg_tx_err(priv)) + en_warn(priv, "Non-linear headers\n"); + return 0; + } + } + } else { + *lso_header_size = 0; + *inline_ok = is_inline(priv->prof->inline_thold, skb, + shinfo, pfrag); + + if (*inline_ok) + real_size = inline_size(skb); + else + real_size = CTRL_SIZE + + (shinfo->nr_frags + 1) * DS_SIZE; + } + + return real_size; +} + +static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, + const struct sk_buff *skb, + const struct skb_shared_info *shinfo, + void *fragptr) +{ + struct mlx4_wqe_inline_seg *inl = &tx_desc->inl; + int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof(*inl); + unsigned int hlen = skb_headlen(skb); + + if (skb->len <= spc) { + if (likely(skb->len >= MIN_PKT_LEN)) { + inl->byte_count = cpu_to_be32(1 << 31 | skb->len); + } else { + inl->byte_count = cpu_to_be32(1 << 31 | MIN_PKT_LEN); + memset(((void *)(inl + 1)) + skb->len, 0, + MIN_PKT_LEN - skb->len); + } + skb_copy_from_linear_data(skb, inl + 1, hlen); + if (shinfo->nr_frags) + memcpy(((void *)(inl + 1)) + hlen, fragptr, + skb_frag_size(&shinfo->frags[0])); + + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + if (hlen <= spc) { + skb_copy_from_linear_data(skb, inl + 1, hlen); + if (hlen < spc) { + memcpy(((void *)(inl + 1)) + hlen, + fragptr, spc - hlen); + fragptr += spc - hlen; + } + inl = (void *) (inl + 1) + spc; + memcpy(((void *)(inl + 1)), fragptr, skb->len - spc); + } else { + skb_copy_from_linear_data(skb, inl + 1, spc); + inl = (void *) (inl + 1) + spc; + skb_copy_from_linear_data_offset(skb, spc, inl + 1, + hlen - spc); + if (shinfo->nr_frags) + memcpy(((void *)(inl + 1)) + hlen - spc, + fragptr, + skb_frag_size(&shinfo->frags[0])); + } + + dma_wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc)); + } +} + +u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, 
select_queue_fallback_t fallback) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + u16 rings_p_up = priv->num_tx_rings_p_up; + + if (netdev_get_num_tc(dev)) + return skb_tx_hash(dev, skb); + + return fallback(dev, skb) % rings_p_up; +} + +static void mlx4_bf_copy(void __iomem *dst, const void *src, + unsigned int bytecnt) +{ + __iowrite64_copy(dst, src, bytecnt / 8); +} + +void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring) +{ + wmb(); + /* Since there is no iowrite*_native() that writes the + * value as is, without byteswapping - using the one + * the doesn't do byteswapping in the relevant arch + * endianness. + */ +#if defined(__LITTLE_ENDIAN) + iowrite32( +#else + iowrite32be( +#endif + (__force u32)ring->doorbell_qpn, + ring->bf.uar->map + MLX4_SEND_DOORBELL); +} + +static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring, + struct mlx4_en_tx_desc *tx_desc, + union mlx4_wqe_qpn_vlan qpn_vlan, + int desc_size, int bf_index, + __be32 op_own, bool bf_ok, + bool send_doorbell) +{ + tx_desc->ctrl.qpn_vlan = qpn_vlan; + + if (bf_ok) { + op_own |= htonl((bf_index & 0xffff) << 8); + /* Ensure new descriptor hits memory + * before setting ownership of this descriptor to HW + */ + dma_wmb(); + tx_desc->ctrl.owner_opcode = op_own; + + wmb(); + + mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl, + desc_size); + + wmb(); + + ring->bf.offset ^= ring->bf.buf_size; + } else { + /* Ensure new descriptor hits memory + * before setting ownership of this descriptor to HW + */ + dma_wmb(); + tx_desc->ctrl.owner_opcode = op_own; + if (send_doorbell) + mlx4_en_xmit_doorbell(ring); + else + ring->xmit_more++; + } +} + +static bool mlx4_en_build_dma_wqe(struct mlx4_en_priv *priv, + struct skb_shared_info *shinfo, + struct mlx4_wqe_data_seg *data, + struct sk_buff *skb, + int lso_header_size, + __be32 mr_key, + struct mlx4_en_tx_info *tx_info) +{ + struct device *ddev = priv->ddev; + dma_addr_t dma = 0; + u32 byte_count = 0; + int i_frag; + + /* Map fragments if any */ + for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) { + const struct skb_frag_struct *frag; + + frag = &shinfo->frags[i_frag]; + byte_count = skb_frag_size(frag); + dma = skb_frag_dma_map(ddev, frag, + 0, byte_count, + DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) + goto tx_drop_unmap; + + data->addr = cpu_to_be64(dma); + data->lkey = mr_key; + dma_wmb(); + data->byte_count = cpu_to_be32(byte_count); + --data; + } + + /* Map linear part if needed */ + if (tx_info->linear) { + byte_count = skb_headlen(skb) - lso_header_size; + + dma = dma_map_single(ddev, skb->data + + lso_header_size, byte_count, + PCI_DMA_TODEVICE); + if (dma_mapping_error(ddev, dma)) + goto tx_drop_unmap; + + data->addr = cpu_to_be64(dma); + data->lkey = mr_key; + dma_wmb(); + data->byte_count = cpu_to_be32(byte_count); + } + /* tx completion can avoid cache line miss for common cases */ + tx_info->map0_dma = dma; + tx_info->map0_byte_count = byte_count; + + return true; + +tx_drop_unmap: + en_err(priv, "DMA mapping error\n"); + + while (++i_frag < shinfo->nr_frags) { + ++data; + dma_unmap_page(ddev, (dma_addr_t)be64_to_cpu(data->addr), + be32_to_cpu(data->byte_count), + PCI_DMA_TODEVICE); + } + + return false; +} + +netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + struct mlx4_en_priv *priv = netdev_priv(dev); + union mlx4_wqe_qpn_vlan qpn_vlan = {}; + struct mlx4_en_tx_ring *ring; + struct mlx4_en_tx_desc *tx_desc; + struct mlx4_wqe_data_seg *data; + struct 
mlx4_en_tx_info *tx_info; + int tx_ind; + int nr_txbb; + int desc_size; + int real_size; + u32 index, bf_index; + __be32 op_own; + int lso_header_size; + void *fragptr = NULL; + bool bounce = false; + bool send_doorbell; + bool stop_queue; + bool inline_ok; + u8 data_offset; + u32 ring_cons; + bool bf_ok; + + tx_ind = skb_get_queue_mapping(skb); + ring = priv->tx_ring[TX][tx_ind]; + + if (unlikely(!priv->port_up)) + goto tx_drop; + + /* fetch ring->cons far ahead before needing it to avoid stall */ + ring_cons = READ_ONCE(ring->cons); + + real_size = get_real_size(skb, shinfo, dev, &lso_header_size, + &inline_ok, &fragptr); + if (unlikely(!real_size)) + goto tx_drop_count; + + /* Align descriptor to TXBB size */ + desc_size = ALIGN(real_size, TXBB_SIZE); + nr_txbb = desc_size >> LOG_TXBB_SIZE; + if (unlikely(nr_txbb > MAX_DESC_TXBBS)) { + if (netif_msg_tx_err(priv)) + en_warn(priv, "Oversized header or SG list\n"); + goto tx_drop_count; + } + + bf_ok = ring->bf_enabled; + if (skb_vlan_tag_present(skb)) { + u16 vlan_proto; + + qpn_vlan.vlan_tag = cpu_to_be16(skb_vlan_tag_get(skb)); + vlan_proto = be16_to_cpu(skb->vlan_proto); + if (vlan_proto == ETH_P_8021AD) + qpn_vlan.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN; + else if (vlan_proto == ETH_P_8021Q) + qpn_vlan.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN; + else + qpn_vlan.ins_vlan = 0; + bf_ok = false; + } + + netdev_txq_bql_enqueue_prefetchw(ring->tx_queue); + + /* Track current inflight packets for performance analysis */ + AVG_PERF_COUNTER(priv->pstats.inflight_avg, + (u32)(ring->prod - ring_cons - 1)); + + /* Packet is good - grab an index and transmit it */ + index = ring->prod & ring->size_mask; + bf_index = ring->prod; + + /* See if we have enough space for whole descriptor TXBB for setting + * SW ownership on next descriptor; if not, use a bounce buffer. 
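get_real_size() returns the descriptor size in bytes; mlx4_en_xmit() then rounds it up to whole TXBBs and, once it has picked the producer index, checks whether those TXBBs still fit before the end of the ring or whether the descriptor has to be staged in the bounce buffer. A small stand-alone illustration of that arithmetic, assuming 16-byte ctrl and data segments, 64-byte TXBBs and a toy 1024-TXBB ring.

/* Sizing of a non-LSO, non-inline descriptor: one ctrl segment plus one data
 * segment per fragment and one for the linear part.  Constants are assumed
 * values for the illustration only. */
#include <stdio.h>

#define CTRL_SIZE      16
#define DS_SIZE        16
#define TXBB_SIZE      64
#define LOG_TXBB_SIZE  6
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
    unsigned int ring_size = 1024;       /* TXBBs, power of two */
    unsigned int size_mask = ring_size - 1;
    unsigned int prod = 1023;            /* producer one TXBB short of the end */

    for (unsigned int nr_frags = 0; nr_frags <= 4; nr_frags++) {
        unsigned int real_size = CTRL_SIZE + (nr_frags + 1) * DS_SIZE;
        unsigned int desc_size = ALIGN_UP(real_size, TXBB_SIZE);
        unsigned int nr_txbb = desc_size >> LOG_TXBB_SIZE;
        unsigned int index = prod & size_mask;

        printf("%u frags: real_size=%3u desc_size=%3u nr_txbb=%u -> %s\n",
               nr_frags, real_size, desc_size, nr_txbb,
               index + nr_txbb <= ring_size ? "fits in place"
                                            : "needs bounce buffer");
    }
    return 0;
}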
*/ + if (likely(index + nr_txbb <= ring->size)) + tx_desc = ring->buf + (index << LOG_TXBB_SIZE); + else { + tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf; + bounce = true; + bf_ok = false; + } + + /* Save skb in tx_info ring */ + tx_info = &ring->tx_info[index]; + tx_info->skb = skb; + tx_info->nr_txbb = nr_txbb; + + if (!lso_header_size) { + data = &tx_desc->data; + data_offset = offsetof(struct mlx4_en_tx_desc, data); + } else { + int lso_align = ALIGN(lso_header_size + 4, DS_SIZE); + + data = (void *)&tx_desc->lso + lso_align; + data_offset = offsetof(struct mlx4_en_tx_desc, lso) + lso_align; + } + + /* valid only for none inline segments */ + tx_info->data_offset = data_offset; + + tx_info->inl = inline_ok; + + tx_info->linear = lso_header_size < skb_headlen(skb) && !inline_ok; + + tx_info->nr_maps = shinfo->nr_frags + tx_info->linear; + data += tx_info->nr_maps - 1; + + if (!tx_info->inl) + if (!mlx4_en_build_dma_wqe(priv, shinfo, data, skb, + lso_header_size, ring->mr_key, + tx_info)) + goto tx_drop_count; + + /* + * For timestamping add flag to skb_shinfo and + * set flag for further reference + */ + tx_info->ts_requested = 0; + if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON && + shinfo->tx_flags & SKBTX_HW_TSTAMP)) { + shinfo->tx_flags |= SKBTX_IN_PROGRESS; + tx_info->ts_requested = 1; + } + + /* Prepare ctrl segement apart opcode+ownership, which depends on + * whether LSO is used */ + tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; + if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { + if (!skb->encapsulation) + tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | + MLX4_WQE_CTRL_TCP_UDP_CSUM); + else + tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM); + ring->tx_csum++; + } + + if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) { + struct ethhdr *ethh; + + /* Copy dst mac address to wqe. This allows loopback in eSwitch, + * so that VFs and PF can communicate with each other + */ + ethh = (struct ethhdr *)skb->data; + tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest); + tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2)); + } + + /* Handle LSO (TSO) packets */ + if (lso_header_size) { + int i; + + /* Mark opcode as LSO */ + op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) | + ((ring->prod & ring->size) ? + cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); + + /* Fill in the LSO prefix */ + tx_desc->lso.mss_hdr_size = cpu_to_be32( + shinfo->gso_size << 16 | lso_header_size); + + /* Copy headers; + * note that we already verified that it is linear */ + memcpy(tx_desc->lso.header, skb->data, lso_header_size); + + ring->tso_packets++; + + i = shinfo->gso_segs; + tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size; + ring->packets += i; + } else { + /* Normal (Non LSO) packet */ + op_own = cpu_to_be32(MLX4_OPCODE_SEND) | + ((ring->prod & ring->size) ? + cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); + tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + ring->packets++; + } + ring->bytes += tx_info->nr_bytes; + netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes); + AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len); + + if (tx_info->inl) + build_inline_wqe(tx_desc, skb, shinfo, fragptr); + + if (skb->encapsulation) { + union { + struct iphdr *v4; + struct ipv6hdr *v6; + unsigned char *hdr; + } ip; + u8 proto; + + ip.hdr = skb_inner_network_header(skb); + proto = (ip.v4->version == 4) ? 
ip.v4->protocol : + ip.v6->nexthdr; + + if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) + op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_ILP); + else + op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP); + } + + ring->prod += nr_txbb; + + /* If we used a bounce buffer then copy descriptor back into place */ + if (unlikely(bounce)) + tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size); + + skb_tx_timestamp(skb); + + /* Check available TXBBs And 2K spare for prefetch */ + stop_queue = mlx4_en_is_tx_ring_full(ring); + if (unlikely(stop_queue)) { + netif_tx_stop_queue(ring->tx_queue); + ring->queue_stopped++; + } + send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue); + + real_size = (real_size / 16) & 0x3f; + + bf_ok &= desc_size <= MAX_BF && send_doorbell; + + if (bf_ok) + qpn_vlan.bf_qpn = ring->doorbell_qpn | cpu_to_be32(real_size); + else + qpn_vlan.fence_size = real_size; + + mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, desc_size, bf_index, + op_own, bf_ok, send_doorbell); + + if (unlikely(stop_queue)) { + /* If queue was emptied after the if (stop_queue) , and before + * the netif_tx_stop_queue() - need to wake the queue, + * or else it will remain stopped forever. + * Need a memory barrier to make sure ring->cons was not + * updated before queue was stopped. + */ + smp_rmb(); + + ring_cons = READ_ONCE(ring->cons); + if (unlikely(!mlx4_en_is_tx_ring_full(ring))) { + netif_tx_wake_queue(ring->tx_queue); + ring->wake_queue++; + } + } + return NETDEV_TX_OK; + +tx_drop_count: + ring->tx_dropped++; +tx_drop: + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; +} + +#define MLX4_EN_XDP_TX_NRTXBB 1 +#define MLX4_EN_XDP_TX_REAL_SZ (((CTRL_SIZE + MLX4_EN_XDP_TX_NRTXBB * DS_SIZE) \ + / 16) & 0x3f) + +void mlx4_en_init_tx_xdp_ring_descs(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring) +{ + int i; + + for (i = 0; i < ring->size; i++) { + struct mlx4_en_tx_info *tx_info = &ring->tx_info[i]; + struct mlx4_en_tx_desc *tx_desc = ring->buf + + (i << LOG_TXBB_SIZE); + + tx_info->map0_byte_count = PAGE_SIZE; + tx_info->nr_txbb = MLX4_EN_XDP_TX_NRTXBB; + tx_info->data_offset = offsetof(struct mlx4_en_tx_desc, data); + tx_info->ts_requested = 0; + tx_info->nr_maps = 1; + tx_info->linear = 1; + tx_info->inl = 0; + + tx_desc->data.lkey = ring->mr_key; + tx_desc->ctrl.qpn_vlan.fence_size = MLX4_EN_XDP_TX_REAL_SZ; + tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; + } +} + +netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring, + struct mlx4_en_rx_alloc *frame, + struct mlx4_en_priv *priv, unsigned int length, + int tx_ind, bool *doorbell_pending) +{ + struct mlx4_en_tx_desc *tx_desc; + struct mlx4_en_tx_info *tx_info; + struct mlx4_wqe_data_seg *data; + struct mlx4_en_tx_ring *ring; + dma_addr_t dma; + __be32 op_own; + int index; + + if (unlikely(!priv->port_up)) + goto tx_drop; + + ring = priv->tx_ring[TX_XDP][tx_ind]; + + if (unlikely(mlx4_en_is_tx_ring_full(ring))) + goto tx_drop_count; + + index = ring->prod & ring->size_mask; + tx_info = &ring->tx_info[index]; + + /* Track current inflight packets for performance analysis */ + AVG_PERF_COUNTER(priv->pstats.inflight_avg, + (u32)(ring->prod - READ_ONCE(ring->cons) - 1)); + + tx_desc = ring->buf + (index << LOG_TXBB_SIZE); + data = &tx_desc->data; + + dma = frame->dma; + + tx_info->page = frame->page; + frame->page = NULL; + tx_info->map0_dma = dma; + tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN); + + dma_sync_single_range_for_device(priv->ddev, dma, frame->page_offset, + length, PCI_DMA_TODEVICE); + + 
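MLX4_EN_XDP_TX_REAL_SZ above encodes the fixed XDP descriptor size in the 16-byte units expected by the fence_size field, which is the same (real_size / 16) & 0x3f encoding mlx4_en_xmit() applies to regular descriptors. A stand-alone check of that encoding, assuming 16-byte ctrl and data segments and 64-byte TXBBs.

/* An XDP TX descriptor is one ctrl segment plus a single data segment, so it
 * always occupies exactly one TXBB.  Segment sizes are assumed values. */
#include <stdio.h>

#define CTRL_SIZE 16
#define DS_SIZE   16
#define TXBB_SIZE 64

int main(void)
{
    unsigned int real_size = CTRL_SIZE + 1 * DS_SIZE;

    /* fence_size is expressed in 16-byte units, masked to 6 bits, mirroring
     * the MLX4_EN_XDP_TX_REAL_SZ definition. */
    unsigned int fence_size = (real_size / 16) & 0x3f;
    unsigned int nr_txbb = (real_size + TXBB_SIZE - 1) / TXBB_SIZE;

    printf("XDP desc: %u bytes, fence_size=%u (16B units), %u TXBB\n",
           real_size, fence_size, nr_txbb);
    return 0;
}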
data->addr = cpu_to_be64(dma + frame->page_offset); + dma_wmb(); + data->byte_count = cpu_to_be32(length); + + /* tx completion can avoid cache line miss for common cases */ + + op_own = cpu_to_be32(MLX4_OPCODE_SEND) | + ((ring->prod & ring->size) ? + cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); + + rx_ring->xdp_tx++; + AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, length); + + ring->prod += MLX4_EN_XDP_TX_NRTXBB; + + /* Ensure new descriptor hits memory + * before setting ownership of this descriptor to HW + */ + dma_wmb(); + tx_desc->ctrl.owner_opcode = op_own; + ring->xmit_more++; + + *doorbell_pending = true; + + return NETDEV_TX_OK; + +tx_drop_count: + rx_ring->xdp_tx_full++; + *doorbell_pending = true; +tx_drop: + return NETDEV_TX_BUSY; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c new file mode 100644 index 0000000..6f57c05 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -0,0 +1,1565 @@ +/* + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "mlx4.h" +#include "fw.h" + +enum { + MLX4_IRQNAME_SIZE = 32 +}; + +enum { + MLX4_NUM_ASYNC_EQE = 0x100, + MLX4_NUM_SPARE_EQE = 0x80, + MLX4_EQ_ENTRY_SIZE = 0x20 +}; + +#define MLX4_EQ_STATUS_OK ( 0 << 28) +#define MLX4_EQ_STATUS_WRITE_FAIL (10 << 28) +#define MLX4_EQ_OWNER_SW ( 0 << 24) +#define MLX4_EQ_OWNER_HW ( 1 << 24) +#define MLX4_EQ_FLAG_EC ( 1 << 18) +#define MLX4_EQ_FLAG_OI ( 1 << 17) +#define MLX4_EQ_STATE_ARMED ( 9 << 8) +#define MLX4_EQ_STATE_FIRED (10 << 8) +#define MLX4_EQ_STATE_ALWAYS_ARMED (11 << 8) + +#define MLX4_ASYNC_EVENT_MASK ((1ull << MLX4_EVENT_TYPE_PATH_MIG) | \ + (1ull << MLX4_EVENT_TYPE_COMM_EST) | \ + (1ull << MLX4_EVENT_TYPE_SQ_DRAINED) | \ + (1ull << MLX4_EVENT_TYPE_CQ_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_WQ_CATAS_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_EEC_CATAS_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_PATH_MIG_FAILED) | \ + (1ull << MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_WQ_ACCESS_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_PORT_CHANGE) | \ + (1ull << MLX4_EVENT_TYPE_ECC_DETECT) | \ + (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR) | \ + (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE) | \ + (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT) | \ + (1ull << MLX4_EVENT_TYPE_CMD) | \ + (1ull << MLX4_EVENT_TYPE_OP_REQUIRED) | \ + (1ull << MLX4_EVENT_TYPE_COMM_CHANNEL) | \ + (1ull << MLX4_EVENT_TYPE_FLR_EVENT) | \ + (1ull << MLX4_EVENT_TYPE_FATAL_WARNING)) + +static u64 get_async_ev_mask(struct mlx4_dev *dev) +{ + u64 async_ev_mask = MLX4_ASYNC_EVENT_MASK; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) + async_ev_mask |= (1ull << MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT); + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT) + async_ev_mask |= (1ull << MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT); + + return async_ev_mask; +} + +static void eq_set_ci(struct mlx4_eq *eq, int req_not) +{ + __raw_writel((__force u32) cpu_to_be32((eq->cons_index & 0xffffff) | + req_not << 31), + eq->doorbell); + /* We still want ordering, just not swabbing, so add a barrier */ + mb(); +} + +static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor, + u8 eqe_size) +{ + /* (entry & (eq->nent - 1)) gives us a cyclic array */ + unsigned long offset = (entry & (eq->nent - 1)) * eqe_size; + /* CX3 is capable of extending the EQE from 32 to 64 bytes with + * strides of 64B,128B and 256B. + * When 64B EQE is used, the first (in the lower addresses) + * 32 bytes in the 64 byte EQE are reserved and the next 32 bytes + * contain the legacy EQE information. + * In all other cases, the first 32B contains the legacy EQE info. + */ + return eq->page_list[offset / PAGE_SIZE].buf + (offset + (eqe_factor ? MLX4_EQ_ENTRY_SIZE : 0)) % PAGE_SIZE; +} + +static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor, u8 size) +{ + struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor, size); + return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe; +} + +static struct mlx4_eqe *next_slave_event_eqe(struct mlx4_slave_event_eq *slave_eq) +{ + struct mlx4_eqe *eqe = + &slave_eq->event_eqe[slave_eq->cons & (SLAVE_EVENT_EQ_SIZE - 1)]; + return (!!(eqe->owner & 0x80) ^ + !!(slave_eq->cons & SLAVE_EVENT_EQ_SIZE)) ? 
+ eqe : NULL; +} + +void mlx4_gen_slave_eqe(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = + container_of(work, struct mlx4_mfunc_master_ctx, + slave_event_work); + struct mlx4_mfunc *mfunc = + container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + struct mlx4_slave_event_eq *slave_eq = &mfunc->master.slave_eq; + struct mlx4_eqe *eqe; + u8 slave; + int i, phys_port, slave_port; + + for (eqe = next_slave_event_eqe(slave_eq); eqe; + eqe = next_slave_event_eqe(slave_eq)) { + slave = eqe->slave_id; + + if (eqe->type == MLX4_EVENT_TYPE_PORT_CHANGE && + eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN && + mlx4_is_bonded(dev)) { + struct mlx4_port_cap port_cap; + + if (!mlx4_QUERY_PORT(dev, 1, &port_cap) && port_cap.link_state) + goto consume; + + if (!mlx4_QUERY_PORT(dev, 2, &port_cap) && port_cap.link_state) + goto consume; + } + /* All active slaves need to receive the event */ + if (slave == ALL_SLAVES) { + for (i = 0; i <= dev->persist->num_vfs; i++) { + phys_port = 0; + if (eqe->type == MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT && + eqe->subtype == MLX4_DEV_PMC_SUBTYPE_PORT_INFO) { + phys_port = eqe->event.port_mgmt_change.port; + slave_port = mlx4_phys_to_slave_port(dev, i, phys_port); + if (slave_port < 0) /* VF doesn't have this port */ + continue; + eqe->event.port_mgmt_change.port = slave_port; + } + if (mlx4_GEN_EQE(dev, i, eqe)) + mlx4_warn(dev, "Failed to generate event for slave %d\n", + i); + if (phys_port) + eqe->event.port_mgmt_change.port = phys_port; + } + } else { + if (mlx4_GEN_EQE(dev, slave, eqe)) + mlx4_warn(dev, "Failed to generate event for slave %d\n", + slave); + } +consume: + ++slave_eq->cons; + } +} + + +static void slave_event(struct mlx4_dev *dev, u8 slave, struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq *slave_eq = &priv->mfunc.master.slave_eq; + struct mlx4_eqe *s_eqe; + unsigned long flags; + + spin_lock_irqsave(&slave_eq->event_lock, flags); + s_eqe = &slave_eq->event_eqe[slave_eq->prod & (SLAVE_EVENT_EQ_SIZE - 1)]; + if ((!!(s_eqe->owner & 0x80)) ^ + (!!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE))) { + mlx4_warn(dev, "Master failed to generate an EQE for slave: %d. No free EQE on slave events queue\n", + slave); + spin_unlock_irqrestore(&slave_eq->event_lock, flags); + return; + } + + memcpy(s_eqe, eqe, sizeof(struct mlx4_eqe) - 1); + s_eqe->slave_id = slave; + /* ensure all information is written before setting the ownersip bit */ + dma_wmb(); + s_eqe->owner = !!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE) ? 
0x0 : 0x80; + ++slave_eq->prod; + + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.slave_event_work); + spin_unlock_irqrestore(&slave_eq->event_lock, flags); +} + +static void mlx4_slave_event(struct mlx4_dev *dev, int slave, + struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (slave < 0 || slave > dev->persist->num_vfs || + slave == dev->caps.function || + !priv->mfunc.master.slave_state[slave].active) + return; + + slave_event(dev, slave, eqe); +} + +#if defined(CONFIG_SMP) +static void mlx4_set_eq_affinity_hint(struct mlx4_priv *priv, int vec) +{ + int hint_err; + struct mlx4_dev *dev = &priv->dev; + struct mlx4_eq *eq = &priv->eq_table.eq[vec]; + + if (!eq->affinity_mask || cpumask_empty(eq->affinity_mask)) + return; + + hint_err = irq_set_affinity_hint(eq->irq, eq->affinity_mask); + if (hint_err) + mlx4_warn(dev, "irq_set_affinity_hint failed, err %d\n", hint_err); +} +#endif + +int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_eqe eqe; + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave = &priv->mfunc.master.slave_state[slave]; + + if (!s_slave->active) + return 0; + + memset(&eqe, 0, sizeof(eqe)); + + eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; + eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE; + eqe.event.port_mgmt_change.port = mlx4_phys_to_slave_port(dev, slave, port); + + return mlx4_GEN_EQE(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_pkey_eqe); + +int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_eqe eqe; + + /*don't send if we don't have the that slave */ + if (dev->persist->num_vfs < slave) + return 0; + memset(&eqe, 0, sizeof(eqe)); + + eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; + eqe.subtype = MLX4_DEV_PMC_SUBTYPE_GUID_INFO; + eqe.event.port_mgmt_change.port = mlx4_phys_to_slave_port(dev, slave, port); + + return mlx4_GEN_EQE(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_guid_change_eqe); + +int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, + u8 port_subtype_change) +{ + struct mlx4_eqe eqe; + u8 slave_port = mlx4_phys_to_slave_port(dev, slave, port); + + /*don't send if we don't have the that slave */ + if (dev->persist->num_vfs < slave) + return 0; + memset(&eqe, 0, sizeof(eqe)); + + eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE; + eqe.subtype = port_subtype_change; + eqe.event.port_change.port = cpu_to_be32(slave_port << 28); + + mlx4_dbg(dev, "%s: sending: %d to slave: %d on port: %d\n", __func__, + port_subtype_change, slave, port); + return mlx4_GEN_EQE(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_port_state_change_eqe); + +enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + + if (slave >= dev->num_slaves || port > dev->caps.num_ports || + port <= 0 || !test_bit(port - 1, actv_ports.ports)) { + pr_err("%s: Error: asking for slave:%d, port:%d\n", + __func__, slave, port); + return SLAVE_PORT_DOWN; + } + return s_state[slave].port_state[port]; +} +EXPORT_SYMBOL(mlx4_get_slave_port_state); + +static int mlx4_set_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, + enum slave_port_state state) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, 
slave); + + if (slave >= dev->num_slaves || port > dev->caps.num_ports || + port <= 0 || !test_bit(port - 1, actv_ports.ports)) { + pr_err("%s: Error: asking for slave:%d, port:%d\n", + __func__, slave, port); + return -1; + } + s_state[slave].port_state[port] = state; + + return 0; +} + +static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event) +{ + int i; + enum slave_port_gen_event gen_event; + struct mlx4_slaves_pport slaves_pport = mlx4_phys_to_slaves_pport(dev, + port); + + for (i = 0; i < dev->persist->num_vfs + 1; i++) + if (test_bit(i, slaves_pport.slaves)) + set_and_calc_slave_port_state(dev, i, port, + event, &gen_event); +} +/************************************************************************** + The function get as input the new event to that port, + and according to the prev state change the slave's port state. + The events are: + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + MLX4_PORT_STATE_DEV_EVENT_PORT_UP + MLX4_PORT_STATE_IB_EVENT_GID_VALID + MLX4_PORT_STATE_IB_EVENT_GID_INVALID +***************************************************************************/ +int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, + u8 port, int event, + enum slave_port_gen_event *gen_event) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *ctx = NULL; + unsigned long flags; + int ret = -1; + struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); + enum slave_port_state cur_state = + mlx4_get_slave_port_state(dev, slave, port); + + *gen_event = SLAVE_PORT_GEN_EVENT_NONE; + + if (slave >= dev->num_slaves || port > dev->caps.num_ports || + port <= 0 || !test_bit(port - 1, actv_ports.ports)) { + pr_err("%s: Error: asking for slave:%d, port:%d\n", + __func__, slave, port); + return ret; + } + + ctx = &priv->mfunc.master.slave_state[slave]; + spin_lock_irqsave(&ctx->lock, flags); + + switch (cur_state) { + case SLAVE_PORT_DOWN: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_UP == event) + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PENDING_UP); + break; + case SLAVE_PENDING_UP: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PORT_DOWN); + else if (MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID == event) { + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PORT_UP); + *gen_event = SLAVE_PORT_GEN_EVENT_UP; + } + break; + case SLAVE_PORT_UP: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) { + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PORT_DOWN); + *gen_event = SLAVE_PORT_GEN_EVENT_DOWN; + } else if (MLX4_PORT_STATE_IB_EVENT_GID_INVALID == + event) { + mlx4_set_slave_port_state(dev, slave, port, + SLAVE_PENDING_UP); + *gen_event = SLAVE_PORT_GEN_EVENT_DOWN; + } + break; + default: + pr_err("%s: BUG!!! 
UNKNOWN state: slave:%d, port:%d\n", + __func__, slave, port); + goto out; + } + ret = mlx4_get_slave_port_state(dev, slave, port); + +out: + spin_unlock_irqrestore(&ctx->lock, flags); + return ret; +} + +EXPORT_SYMBOL(set_and_calc_slave_port_state); + +int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr) +{ + struct mlx4_eqe eqe; + + memset(&eqe, 0, sizeof(eqe)); + + eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT; + eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PORT_INFO; + eqe.event.port_mgmt_change.port = port; + eqe.event.port_mgmt_change.params.port_info.changed_attr = + cpu_to_be32((u32) attr); + + slave_event(dev, ALL_SLAVES, &eqe); + return 0; +} +EXPORT_SYMBOL(mlx4_gen_slaves_port_mgt_ev); + +void mlx4_master_handle_slave_flr(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = + container_of(work, struct mlx4_mfunc_master_ctx, + slave_flr_event_work); + struct mlx4_mfunc *mfunc = + container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = + container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + int i; + int err; + unsigned long flags; + + mlx4_dbg(dev, "mlx4_handle_slave_flr\n"); + + for (i = 0 ; i < dev->num_slaves; i++) { + + if (MLX4_COMM_CMD_FLR == slave_state[i].last_cmd) { + mlx4_dbg(dev, "mlx4_handle_slave_flr: clean slave: %d\n", + i); + /* In case of 'Reset flow' FLR can be generated for + * a slave before mlx4_load_one is done. + * make sure interface is up before trying to delete + * slave resources which weren't allocated yet. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_UP) + mlx4_delete_all_resources_for_slave(dev, i); + /*return the slave to running mode*/ + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + slave_state[i].last_cmd = MLX4_COMM_CMD_RESET; + slave_state[i].is_slave_going_down = 0; + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + /*notify the FW:*/ + err = mlx4_cmd(dev, 0, i, 0, MLX4_CMD_INFORM_FLR_DONE, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed to notify FW on FLR done (slave:%d)\n", + i); + } + } +} + +static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_eqe *eqe; + int cqn; + int eqes_found = 0; + int set_ci = 0; + int port; + int slave = 0; + int ret; + u32 flr_slave; + u8 update_slave_state; + int i; + enum slave_port_gen_event gen_event; + unsigned long flags; + struct mlx4_vport_state *s_info; + int eqe_size = dev->caps.eqe_size; + + while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor, eqe_size))) { + /* + * Make sure we read EQ entry contents after we've + * checked the ownership bit. 
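next_eqe_sw() decides ownership purely by parity: the lap bit of the consumer index (cons_index & nent) is compared with the high bit of the EQE owner byte, so an entry left over from the previous lap never passes and no write-back to the entry is needed; the TX completion loop earlier applies the same idea to CQEs through XNOR of the owner bit and cons_index & size. A stand-alone restatement of that test with a toy depth of 8.

/* The same expression next_eqe_sw() evaluates, on a toy 8-entry queue. */
#include <stdio.h>
#include <stdint.h>

#define NENT 8u   /* toy EQ depth (eq->nent), power of two */

static int eqe_is_sw_owned(uint8_t owner, unsigned int cons_index)
{
    /* The entry belongs to software only when the owner-bit parity matches
     * the lap parity of the consumer index. */
    return (!!(owner & 0x80) ^ !!(cons_index & NENT)) == 0;
}

int main(void)
{
    for (unsigned int ci = 6; ci < 6 + 2 * NENT; ci += NENT / 2)
        printf("cons_index=%2u (lap %u): owner=0x00 -> %s, owner=0x80 -> %s\n",
               ci, ci / NENT,
               eqe_is_sw_owned(0x00, ci) ? "sw" : "hw",
               eqe_is_sw_owned(0x80, ci) ? "sw" : "hw");
    return 0;
}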
+ */ + dma_rmb(); + + switch (eqe->type) { + case MLX4_EVENT_TYPE_COMP: + cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff; + mlx4_cq_completion(dev, cqn); + break; + + case MLX4_EVENT_TYPE_PATH_MIG: + case MLX4_EVENT_TYPE_COMM_EST: + case MLX4_EVENT_TYPE_SQ_DRAINED: + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + mlx4_dbg(dev, "event %d arrived\n", eqe->type); + if (mlx4_is_master(dev)) { + /* forward only to slave owning the QP */ + ret = mlx4_get_slave_from_resource_id(dev, + RES_QP, + be32_to_cpu(eqe->event.qp.qpn) + & 0xffffff, &slave); + if (ret && ret != -ENOENT) { + mlx4_dbg(dev, "QP event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + + if (!ret && slave != dev->caps.function) { + mlx4_slave_event(dev, slave, eqe); + break; + } + + } + mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & + 0xffffff, eqe->type); + break; + + case MLX4_EVENT_TYPE_SRQ_LIMIT: + mlx4_dbg(dev, "%s: MLX4_EVENT_TYPE_SRQ_LIMIT. srq_no=0x%x, eq 0x%x\n", + __func__, be32_to_cpu(eqe->event.srq.srqn), + eq->eqn); + case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: + if (mlx4_is_master(dev)) { + /* forward only to slave owning the SRQ */ + ret = mlx4_get_slave_from_resource_id(dev, + RES_SRQ, + be32_to_cpu(eqe->event.srq.srqn) + & 0xffffff, + &slave); + if (ret && ret != -ENOENT) { + mlx4_warn(dev, "SRQ event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + if (eqe->type == + MLX4_EVENT_TYPE_SRQ_CATAS_ERROR) + mlx4_warn(dev, "%s: slave:%d, srq_no:0x%x, event: %02x(%02x)\n", + __func__, slave, + be32_to_cpu(eqe->event.srq.srqn), + eqe->type, eqe->subtype); + + if (!ret && slave != dev->caps.function) { + if (eqe->type == + MLX4_EVENT_TYPE_SRQ_CATAS_ERROR) + mlx4_warn(dev, "%s: sending event %02x(%02x) to slave:%d\n", + __func__, eqe->type, + eqe->subtype, slave); + mlx4_slave_event(dev, slave, eqe); + break; + } + } + mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & + 0xffffff, eqe->type); + break; + + case MLX4_EVENT_TYPE_CMD: + mlx4_cmd_event(dev, + be16_to_cpu(eqe->event.cmd.token), + eqe->event.cmd.status, + be64_to_cpu(eqe->event.cmd.out_param)); + break; + + case MLX4_EVENT_TYPE_PORT_CHANGE: { + struct mlx4_slaves_pport slaves_port; + port = be32_to_cpu(eqe->event.port_change.port) >> 28; + slaves_port = mlx4_phys_to_slaves_pport(dev, port); + if (eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN) { + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN, + port); + mlx4_priv(dev)->sense.do_sense_port[port] = 1; + if (!mlx4_is_master(dev)) + break; + for (i = 0; i < dev->persist->num_vfs + 1; + i++) { + int reported_port = mlx4_is_bonded(dev) ? 
1 : mlx4_phys_to_slave_port(dev, i, port); + + if (!test_bit(i, slaves_port.slaves) && !mlx4_is_bonded(dev)) + continue; + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) { + if (i == mlx4_master_func_num(dev)) + continue; + mlx4_dbg(dev, "%s: Sending MLX4_PORT_CHANGE_SUBTYPE_DOWN to slave: %d, port:%d\n", + __func__, i, port); + s_info = &priv->mfunc.master.vf_oper[i].vport[port].state; + if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state) { + eqe->event.port_change.port = + cpu_to_be32( + (be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF) + | (reported_port << 28)); + mlx4_slave_event(dev, i, eqe); + } + } else { /* IB port */ + set_and_calc_slave_port_state(dev, i, port, + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + &gen_event); + /*we can be in pending state, then do not send port_down event*/ + if (SLAVE_PORT_GEN_EVENT_DOWN == gen_event) { + if (i == mlx4_master_func_num(dev)) + continue; + eqe->event.port_change.port = + cpu_to_be32( + (be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF) + | (mlx4_phys_to_slave_port(dev, i, port) << 28)); + mlx4_slave_event(dev, i, eqe); + } + } + } + } else { + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP, port); + + mlx4_priv(dev)->sense.do_sense_port[port] = 0; + + if (!mlx4_is_master(dev)) + break; + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) + for (i = 0; + i < dev->persist->num_vfs + 1; + i++) { + int reported_port = mlx4_is_bonded(dev) ? 1 : mlx4_phys_to_slave_port(dev, i, port); + + if (!test_bit(i, slaves_port.slaves) && !mlx4_is_bonded(dev)) + continue; + if (i == mlx4_master_func_num(dev)) + continue; + s_info = &priv->mfunc.master.vf_oper[i].vport[port].state; + if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state) { + eqe->event.port_change.port = + cpu_to_be32( + (be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF) + | (reported_port << 28)); + mlx4_slave_event(dev, i, eqe); + } + } + else /* IB port */ + /* port-up event will be sent to a slave when the + * slave's alias-guid is set. This is done in alias_GUID.c + */ + set_all_slave_state(dev, port, MLX4_DEV_EVENT_PORT_UP); + } + break; + } + + case MLX4_EVENT_TYPE_CQ_ERROR: + mlx4_warn(dev, "CQ %s on CQN %06x\n", + eqe->event.cq_err.syndrome == 1 ? 
+ "overrun" : "access violation", + be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff); + if (mlx4_is_master(dev)) { + ret = mlx4_get_slave_from_resource_id(dev, + RES_CQ, + be32_to_cpu(eqe->event.cq_err.cqn) + & 0xffffff, &slave); + if (ret && ret != -ENOENT) { + mlx4_dbg(dev, "CQ event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + + if (!ret && slave != dev->caps.function) { + mlx4_slave_event(dev, slave, eqe); + break; + } + } + mlx4_cq_event(dev, + be32_to_cpu(eqe->event.cq_err.cqn) + & 0xffffff, + eqe->type); + break; + + case MLX4_EVENT_TYPE_EQ_OVERFLOW: + mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn); + break; + + case MLX4_EVENT_TYPE_OP_REQUIRED: + atomic_inc(&priv->opreq_count); + /* FW commands can't be executed from interrupt context + * working in deferred task + */ + queue_work(mlx4_wq, &priv->opreq_task); + break; + + case MLX4_EVENT_TYPE_COMM_CHANNEL: + if (!mlx4_is_master(dev)) { + mlx4_warn(dev, "Received comm channel event for non master device\n"); + break; + } + memcpy(&priv->mfunc.master.comm_arm_bit_vector, + eqe->event.comm_channel_arm.bit_vec, + sizeof(eqe->event.comm_channel_arm.bit_vec)); + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.comm_work); + break; + + case MLX4_EVENT_TYPE_FLR_EVENT: + flr_slave = be32_to_cpu(eqe->event.flr_event.slave_id); + if (!mlx4_is_master(dev)) { + mlx4_warn(dev, "Non-master function received FLR event\n"); + break; + } + + mlx4_dbg(dev, "FLR event for slave: %d\n", flr_slave); + + if (flr_slave >= dev->num_slaves) { + mlx4_warn(dev, + "Got FLR for unknown function: %d\n", + flr_slave); + update_slave_state = 0; + } else + update_slave_state = 1; + + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); + if (update_slave_state) { + priv->mfunc.master.slave_state[flr_slave].active = false; + priv->mfunc.master.slave_state[flr_slave].last_cmd = MLX4_COMM_CMD_FLR; + priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1; + } + spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, + flr_slave); + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.slave_flr_event_work); + break; + + case MLX4_EVENT_TYPE_FATAL_WARNING: + if (eqe->subtype == MLX4_FATAL_WARNING_SUBTYPE_WARMING) { + if (mlx4_is_master(dev)) + for (i = 0; i < dev->num_slaves; i++) { + mlx4_dbg(dev, "%s: Sending MLX4_FATAL_WARNING_SUBTYPE_WARMING to slave: %d\n", + __func__, i); + if (i == dev->caps.function) + continue; + mlx4_slave_event(dev, i, eqe); + } + mlx4_err(dev, "Temperature Threshold was reached! Threshold: %d celsius degrees; Current Temperature: %d\n", + be16_to_cpu(eqe->event.warming.warning_threshold), + be16_to_cpu(eqe->event.warming.current_temperature)); + } else + mlx4_warn(dev, "Unhandled event FATAL WARNING (%02x), subtype %02x on EQ %d at index %u. owner=%x, nent=0x%x, slave=%x, ownership=%s\n", + eqe->type, eqe->subtype, eq->eqn, + eq->cons_index, eqe->owner, eq->nent, + eqe->slave_id, + !!(eqe->owner & 0x80) ^ + !!(eq->cons_index & eq->nent) ? 
"HW" : "SW"); + + break; + + case MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT: + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_MGMT_CHANGE, + (unsigned long) eqe); + break; + + case MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT: + switch (eqe->subtype) { + case MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_BAD_CABLE: + mlx4_warn(dev, "Bad cable detected on port %u\n", + eqe->event.bad_cable.port); + break; + case MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_UNSUPPORTED_CABLE: + mlx4_warn(dev, "Unsupported cable detected\n"); + break; + default: + mlx4_dbg(dev, + "Unhandled recoverable error event detected: %02x(%02x) on EQ %d at index %u. owner=%x, nent=0x%x, ownership=%s\n", + eqe->type, eqe->subtype, eq->eqn, + eq->cons_index, eqe->owner, eq->nent, + !!(eqe->owner & 0x80) ^ + !!(eq->cons_index & eq->nent) ? "HW" : "SW"); + break; + } + break; + + case MLX4_EVENT_TYPE_EEC_CATAS_ERROR: + case MLX4_EVENT_TYPE_ECC_DETECT: + default: + mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u. owner=%x, nent=0x%x, slave=%x, ownership=%s\n", + eqe->type, eqe->subtype, eq->eqn, + eq->cons_index, eqe->owner, eq->nent, + eqe->slave_id, + !!(eqe->owner & 0x80) ^ + !!(eq->cons_index & eq->nent) ? "HW" : "SW"); + break; + }; + + ++eq->cons_index; + eqes_found = 1; + ++set_ci; + + /* + * The HCA will think the queue has overflowed if we + * don't tell it we've been processing events. We + * create our EQs with MLX4_NUM_SPARE_EQE extra + * entries, so we must update our consumer index at + * least that often. + */ + if (unlikely(set_ci >= MLX4_NUM_SPARE_EQE)) { + eq_set_ci(eq, 0); + set_ci = 0; + } + } + + eq_set_ci(eq, 1); + + return eqes_found; +} + +static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr) +{ + struct mlx4_dev *dev = dev_ptr; + struct mlx4_priv *priv = mlx4_priv(dev); + int work = 0; + int i; + + writel(priv->eq_table.clr_mask, priv->eq_table.clr_int); + + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) + work |= mlx4_eq_int(dev, &priv->eq_table.eq[i]); + + return IRQ_RETVAL(work); +} + +static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr) +{ + struct mlx4_eq *eq = eq_ptr; + struct mlx4_dev *dev = eq->dev; + + mlx4_eq_int(dev, eq); + + /* MSI-X vectors always belong to us */ + return IRQ_HANDLED; +} + +int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq_info *event_eq = + priv->mfunc.master.slave_state[slave].event_eq; + u32 in_modifier = vhcr->in_modifier; + u32 eqn = in_modifier & 0x3FF; + u64 in_param = vhcr->in_param; + int err = 0; + int i; + + if (slave == dev->caps.function) + err = mlx4_cmd(dev, in_param, (in_modifier & 0x80000000) | eqn, + 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + if (!err) + for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) + if (in_param & (1LL << i)) + event_eq[i].eqn = in_modifier >> 31 ? 
-1 : eqn; + + return err; +} + +static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap, + int eq_num) +{ + return mlx4_cmd(dev, event_mask, (unmap << 31) | eq_num, + 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); +} + +static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int eq_num) +{ + return mlx4_cmd(dev, mailbox->dma, eq_num, 0, + MLX4_CMD_SW2HW_EQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); +} + +static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, int eq_num) +{ + return mlx4_cmd(dev, 0, eq_num, 1, MLX4_CMD_HW2SW_EQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +static int mlx4_num_eq_uar(struct mlx4_dev *dev) +{ + /* + * Each UAR holds 4 EQ doorbells. To figure out how many UARs + * we need to map, take the difference of highest index and + * the lowest index we'll use and add 1. + */ + return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 - + dev->caps.reserved_eqs / 4 + 1; +} + +static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int index; + + index = eq->eqn / 4 - dev->caps.reserved_eqs / 4; + + if (!priv->eq_table.uar_map[index]) { + priv->eq_table.uar_map[index] = + ioremap( + pci_resource_start(dev->persist->pdev, 2) + + ((eq->eqn / 4) << (dev->uar_page_shift)), + (1 << (dev->uar_page_shift))); + if (!priv->eq_table.uar_map[index]) { + mlx4_err(dev, "Couldn't map EQ doorbell for EQN 0x%06x\n", + eq->eqn); + return NULL; + } + } + + return priv->eq_table.uar_map[index] + 0x800 + 8 * (eq->eqn % 4); +} + +static void mlx4_unmap_uar(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + for (i = 0; i < mlx4_num_eq_uar(dev); ++i) + if (priv->eq_table.uar_map[i]) { + iounmap(priv->eq_table.uar_map[i]); + priv->eq_table.uar_map[i] = NULL; + } +} + +static int mlx4_create_eq(struct mlx4_dev *dev, int nent, + u8 intr, struct mlx4_eq *eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_eq_context *eq_context; + int npages; + u64 *dma_list = NULL; + dma_addr_t t; + u64 mtt_addr; + int err = -ENOMEM; + int i; + + eq->dev = dev; + eq->nent = roundup_pow_of_two(max(nent, 2)); + /* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with + * strides of 64B,128B and 256B. 
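mlx4_create_eq() rounds the requested entry count up to a power of two and sizes the coherent buffer in whole pages, so switching from 32-byte to 64-byte EQEs simply doubles the page count. A stand-alone version of that sizing, assuming 4 KiB pages and the MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE request used for the async EQ.

/* EQ buffer sizing: entries rounded up to a power of two, buffer rounded up
 * to whole pages.  The page size is an assumed value for the illustration. */
#include <stdio.h>

#define TOY_PAGE_SIZE 4096u

static unsigned int roundup_pow_of_two(unsigned int x)
{
    unsigned int r = 1;

    while (r < x)
        r <<= 1;
    return r;
}

int main(void)
{
    unsigned int eqe_sizes[] = { 32, 64 };     /* legacy vs. 64-byte EQEs */
    unsigned int nent_req = 0x100 + 0x80;      /* async EQEs + spare EQEs */

    for (unsigned int i = 0; i < 2; i++) {
        unsigned int nent = roundup_pow_of_two(nent_req > 2 ? nent_req : 2);
        unsigned int bytes = nent * eqe_sizes[i];
        unsigned int npages = (bytes + TOY_PAGE_SIZE - 1) / TOY_PAGE_SIZE;

        printf("eqe_size=%2uB: nent=%u -> %u bytes -> %u pages\n",
               eqe_sizes[i], nent, bytes, npages);
    }
    return 0;
}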
+ */ + npages = PAGE_ALIGN(eq->nent * dev->caps.eqe_size) / PAGE_SIZE; + + eq->page_list = kmalloc_array(npages, sizeof(*eq->page_list), + GFP_KERNEL); + if (!eq->page_list) + goto err_out; + + for (i = 0; i < npages; ++i) + eq->page_list[i].buf = NULL; + + dma_list = kmalloc_array(npages, sizeof(*dma_list), GFP_KERNEL); + if (!dma_list) + goto err_out_free; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + goto err_out_free; + eq_context = mailbox->buf; + + for (i = 0; i < npages; ++i) { + eq->page_list[i].buf = dma_alloc_coherent(&dev->persist-> + pdev->dev, + PAGE_SIZE, &t, + GFP_KERNEL); + if (!eq->page_list[i].buf) + goto err_out_free_pages; + + dma_list[i] = t; + eq->page_list[i].map = t; + + memset(eq->page_list[i].buf, 0, PAGE_SIZE); + } + + eq->eqn = mlx4_bitmap_alloc(&priv->eq_table.bitmap); + if (eq->eqn == -1) + goto err_out_free_pages; + + eq->doorbell = mlx4_get_eq_uar(dev, eq); + if (!eq->doorbell) { + err = -ENOMEM; + goto err_out_free_eq; + } + + err = mlx4_mtt_init(dev, npages, PAGE_SHIFT, &eq->mtt); + if (err) + goto err_out_free_eq; + + err = mlx4_write_mtt(dev, &eq->mtt, 0, npages, dma_list); + if (err) + goto err_out_free_mtt; + + eq_context->flags = cpu_to_be32(MLX4_EQ_STATUS_OK | + MLX4_EQ_STATE_ARMED); + eq_context->log_eq_size = ilog2(eq->nent); + eq_context->intr = intr; + eq_context->log_page_size = PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT; + + mtt_addr = mlx4_mtt_addr(dev, &eq->mtt); + eq_context->mtt_base_addr_h = mtt_addr >> 32; + eq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff); + + err = mlx4_SW2HW_EQ(dev, mailbox, eq->eqn); + if (err) { + mlx4_warn(dev, "SW2HW_EQ failed (%d)\n", err); + goto err_out_free_mtt; + } + + kfree(dma_list); + mlx4_free_cmd_mailbox(dev, mailbox); + + eq->cons_index = 0; + + INIT_LIST_HEAD(&eq->tasklet_ctx.list); + INIT_LIST_HEAD(&eq->tasklet_ctx.process_list); + spin_lock_init(&eq->tasklet_ctx.lock); + tasklet_init(&eq->tasklet_ctx.task, mlx4_cq_tasklet_cb, + (unsigned long)&eq->tasklet_ctx); + + return err; + +err_out_free_mtt: + mlx4_mtt_cleanup(dev, &eq->mtt); + +err_out_free_eq: + mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR); + +err_out_free_pages: + for (i = 0; i < npages; ++i) + if (eq->page_list[i].buf) + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + eq->page_list[i].buf, + eq->page_list[i].map); + + mlx4_free_cmd_mailbox(dev, mailbox); + +err_out_free: + kfree(eq->page_list); + kfree(dma_list); + +err_out: + return err; +} + +static void mlx4_free_eq(struct mlx4_dev *dev, + struct mlx4_eq *eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + int i; + /* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with + * strides of 64B,128B and 256B + */ + int npages = PAGE_ALIGN(dev->caps.eqe_size * eq->nent) / PAGE_SIZE; + + err = mlx4_HW2SW_EQ(dev, eq->eqn); + if (err) + mlx4_warn(dev, "HW2SW_EQ failed (%d)\n", err); + + synchronize_irq(eq->irq); + tasklet_disable(&eq->tasklet_ctx.task); + + mlx4_mtt_cleanup(dev, &eq->mtt); + for (i = 0; i < npages; ++i) + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + eq->page_list[i].buf, + eq->page_list[i].map); + + kfree(eq->page_list); + mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR); +} + +static void mlx4_free_irqs(struct mlx4_dev *dev) +{ + struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table; + int i; + + if (eq_table->have_irq) + free_irq(dev->persist->pdev->irq, dev); + + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) + if (eq_table->eq[i].have_irq) { + 
free_cpumask_var(eq_table->eq[i].affinity_mask); +#if defined(CONFIG_SMP) + irq_set_affinity_hint(eq_table->eq[i].irq, NULL); +#endif + free_irq(eq_table->eq[i].irq, eq_table->eq + i); + eq_table->eq[i].have_irq = 0; + } + + kfree(eq_table->irq_names); +} + +static int mlx4_map_clr_int(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->clr_base = ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.clr_int_bar) + + priv->fw.clr_int_base, MLX4_CLR_INT_SIZE); + if (!priv->clr_base) { + mlx4_err(dev, "Couldn't map interrupt clear register, aborting\n"); + return -ENOMEM; + } + + return 0; +} + +static void mlx4_unmap_clr_int(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + iounmap(priv->clr_base); +} + +int mlx4_alloc_eq_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->eq_table.eq = kcalloc(dev->caps.num_eqs - dev->caps.reserved_eqs, + sizeof(*priv->eq_table.eq), GFP_KERNEL); + if (!priv->eq_table.eq) + return -ENOMEM; + + return 0; +} + +void mlx4_free_eq_table(struct mlx4_dev *dev) +{ + kfree(mlx4_priv(dev)->eq_table.eq); +} + +int mlx4_init_eq_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + int i; + + priv->eq_table.uar_map = kcalloc(mlx4_num_eq_uar(dev), + sizeof(*priv->eq_table.uar_map), + GFP_KERNEL); + if (!priv->eq_table.uar_map) { + err = -ENOMEM; + goto err_out_free; + } + + err = mlx4_bitmap_init(&priv->eq_table.bitmap, + roundup_pow_of_two(dev->caps.num_eqs), + dev->caps.num_eqs - 1, + dev->caps.reserved_eqs, + roundup_pow_of_two(dev->caps.num_eqs) - + dev->caps.num_eqs); + if (err) + goto err_out_free; + + for (i = 0; i < mlx4_num_eq_uar(dev); ++i) + priv->eq_table.uar_map[i] = NULL; + + if (!mlx4_is_slave(dev)) { + err = mlx4_map_clr_int(dev); + if (err) + goto err_out_bitmap; + + priv->eq_table.clr_mask = + swab32(1 << (priv->eq_table.inta_pin & 31)); + priv->eq_table.clr_int = priv->clr_base + + (priv->eq_table.inta_pin < 32 ? 4 : 0); + } + + priv->eq_table.irq_names = + kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1), + GFP_KERNEL); + if (!priv->eq_table.irq_names) { + err = -ENOMEM; + goto err_out_clr_int; + } + + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) { + if (i == MLX4_EQ_ASYNC) { + err = mlx4_create_eq(dev, + MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE, + 0, &priv->eq_table.eq[MLX4_EQ_ASYNC]); + } else { + struct mlx4_eq *eq = &priv->eq_table.eq[i]; +#ifdef CONFIG_RFS_ACCEL + int port = find_first_bit(eq->actv_ports.ports, + dev->caps.num_ports) + 1; + + if (port <= dev->caps.num_ports) { + struct mlx4_port_info *info = + &mlx4_priv(dev)->port[port]; + + if (!info->rmap) { + info->rmap = alloc_irq_cpu_rmap( + mlx4_get_eqs_per_port(dev, port)); + if (!info->rmap) { + mlx4_warn(dev, "Failed to allocate cpu rmap\n"); + err = -ENOMEM; + goto err_out_unmap; + } + } + + err = irq_cpu_rmap_add( + info->rmap, eq->irq); + if (err) + mlx4_warn(dev, "Failed adding irq rmap\n"); + } +#endif + err = mlx4_create_eq(dev, dev->quotas.cq + + MLX4_NUM_SPARE_EQE, + (dev->flags & MLX4_FLAG_MSI_X) ? 
+ i + 1 - !!(i > MLX4_EQ_ASYNC) : 0, + eq); + } + if (err) + goto err_out_unmap; + } + + if (dev->flags & MLX4_FLAG_MSI_X) { + const char *eq_name; + + snprintf(priv->eq_table.irq_names + + MLX4_EQ_ASYNC * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, + "mlx4-async@pci:%s", + pci_name(dev->persist->pdev)); + eq_name = priv->eq_table.irq_names + + MLX4_EQ_ASYNC * MLX4_IRQNAME_SIZE; + + err = request_irq(priv->eq_table.eq[MLX4_EQ_ASYNC].irq, + mlx4_msi_x_interrupt, 0, eq_name, + priv->eq_table.eq + MLX4_EQ_ASYNC); + if (err) + goto err_out_unmap; + + priv->eq_table.eq[MLX4_EQ_ASYNC].have_irq = 1; + } else { + snprintf(priv->eq_table.irq_names, + MLX4_IRQNAME_SIZE, + DRV_NAME "@pci:%s", + pci_name(dev->persist->pdev)); + err = request_irq(dev->persist->pdev->irq, mlx4_interrupt, + IRQF_SHARED, priv->eq_table.irq_names, dev); + if (err) + goto err_out_unmap; + + priv->eq_table.have_irq = 1; + } + + err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, + priv->eq_table.eq[MLX4_EQ_ASYNC].eqn); + if (err) + mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n", + priv->eq_table.eq[MLX4_EQ_ASYNC].eqn, err); + + /* arm ASYNC eq */ + eq_set_ci(&priv->eq_table.eq[MLX4_EQ_ASYNC], 1); + + return 0; + +err_out_unmap: + while (i > 0) + mlx4_free_eq(dev, &priv->eq_table.eq[--i]); +#ifdef CONFIG_RFS_ACCEL + for (i = 1; i <= dev->caps.num_ports; i++) { + if (mlx4_priv(dev)->port[i].rmap) { + free_irq_cpu_rmap(mlx4_priv(dev)->port[i].rmap); + mlx4_priv(dev)->port[i].rmap = NULL; + } + } +#endif + mlx4_free_irqs(dev); + +err_out_clr_int: + if (!mlx4_is_slave(dev)) + mlx4_unmap_clr_int(dev); + +err_out_bitmap: + mlx4_unmap_uar(dev); + mlx4_bitmap_cleanup(&priv->eq_table.bitmap); + +err_out_free: + kfree(priv->eq_table.uar_map); + + return err; +} + +void mlx4_cleanup_eq_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 1, + priv->eq_table.eq[MLX4_EQ_ASYNC].eqn); + +#ifdef CONFIG_RFS_ACCEL + for (i = 1; i <= dev->caps.num_ports; i++) { + if (mlx4_priv(dev)->port[i].rmap) { + free_irq_cpu_rmap(mlx4_priv(dev)->port[i].rmap); + mlx4_priv(dev)->port[i].rmap = NULL; + } + } +#endif + mlx4_free_irqs(dev); + + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) + mlx4_free_eq(dev, &priv->eq_table.eq[i]); + + if (!mlx4_is_slave(dev)) + mlx4_unmap_clr_int(dev); + + mlx4_unmap_uar(dev); + mlx4_bitmap_cleanup(&priv->eq_table.bitmap); + + kfree(priv->eq_table.uar_map); +} + +/* A test that verifies that we can accept interrupts + * on the vector allocated for asynchronous events + */ +int mlx4_test_async(struct mlx4_dev *dev) +{ + return mlx4_NOP(dev); +} +EXPORT_SYMBOL(mlx4_test_async); + +/* A test that verifies that we can accept interrupts + * on the given irq vector of the tested port. + * Interrupts are checked using the NOP command. 
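The interrupt names live in a single flat buffer with one fixed MLX4_IRQNAME_SIZE slot per vector, which is why later code can address a name as irq_names + vector * MLX4_IRQNAME_SIZE (or, equivalently, vector << 5 for a 32-byte slot). A stand-alone sketch of that layout; the vector count and the PCI device name used here are only placeholders.

/* One 32-byte name slot per vector in a flat buffer, as in mlx4_init_eq_table().
 * The slot assignment and the PCI name below are placeholders. */
#include <stdio.h>
#include <stdlib.h>

#define IRQNAME_SIZE 32   /* mirrors MLX4_IRQNAME_SIZE */

int main(void)
{
    int num_vectors = 4;  /* toy: one async EQ plus three completion EQs */
    char *irq_names = calloc(num_vectors, IRQNAME_SIZE);

    if (!irq_names)
        return 1;

    snprintf(irq_names, IRQNAME_SIZE, "mlx4-async@pci:%s", "0000:03:00.0");
    for (int i = 1; i < num_vectors; i++)
        snprintf(irq_names + i * IRQNAME_SIZE, IRQNAME_SIZE,
                 "mlx4-%d@%s", i, "0000:03:00.0");

    for (int i = 0; i < num_vectors; i++)
        printf("vector %d -> \"%s\"\n", i, irq_names + i * IRQNAME_SIZE);

    free(irq_names);
    return 0;
}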
+ */ +int mlx4_test_interrupt(struct mlx4_dev *dev, int vector) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + /* Temporary use polling for command completions */ + mlx4_cmd_use_polling(dev); + + /* Map the new eq to handle all asynchronous events */ + err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, + priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn); + if (err) { + mlx4_warn(dev, "Failed mapping eq for interrupt test\n"); + goto out; + } + + /* Go back to using events */ + mlx4_cmd_use_events(dev); + err = mlx4_NOP(dev); + + /* Return to default */ + mlx4_cmd_use_polling(dev); +out: + mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0, + priv->eq_table.eq[MLX4_EQ_ASYNC].eqn); + mlx4_cmd_use_events(dev); + + return err; +} +EXPORT_SYMBOL(mlx4_test_interrupt); + +bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + vector = MLX4_CQ_TO_EQ_VECTOR(vector); + if (vector < 0 || (vector >= dev->caps.num_comp_vectors + 1) || + (vector == MLX4_EQ_ASYNC)) + return false; + + return test_bit(port - 1, priv->eq_table.eq[vector].actv_ports.ports); +} +EXPORT_SYMBOL(mlx4_is_eq_vector_valid); + +u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + unsigned int i; + unsigned int sum = 0; + + for (i = 0; i < dev->caps.num_comp_vectors + 1; i++) + sum += !!test_bit(port - 1, + priv->eq_table.eq[i].actv_ports.ports); + + return sum; +} +EXPORT_SYMBOL(mlx4_get_eqs_per_port); + +int mlx4_is_eq_shared(struct mlx4_dev *dev, int vector) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + vector = MLX4_CQ_TO_EQ_VECTOR(vector); + if (vector <= 0 || (vector >= dev->caps.num_comp_vectors + 1)) + return -EINVAL; + + return !!(bitmap_weight(priv->eq_table.eq[vector].actv_ports.ports, + dev->caps.num_ports) > 1); +} +EXPORT_SYMBOL(mlx4_is_eq_shared); + +struct cpu_rmap *mlx4_get_cpu_rmap(struct mlx4_dev *dev, int port) +{ + return mlx4_priv(dev)->port[port].rmap; +} +EXPORT_SYMBOL(mlx4_get_cpu_rmap); + +int mlx4_assign_eq(struct mlx4_dev *dev, u8 port, int *vector) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err = 0, i = 0; + u32 min_ref_count_val = (u32)-1; + int requested_vector = MLX4_CQ_TO_EQ_VECTOR(*vector); + int *prequested_vector = NULL; + + + mutex_lock(&priv->msix_ctl.pool_lock); + if (requested_vector < (dev->caps.num_comp_vectors + 1) && + (requested_vector >= 0) && + (requested_vector != MLX4_EQ_ASYNC)) { + if (test_bit(port - 1, + priv->eq_table.eq[requested_vector].actv_ports.ports)) { + prequested_vector = &requested_vector; + } else { + struct mlx4_eq *eq; + + for (i = 1; i < port; + requested_vector += mlx4_get_eqs_per_port(dev, i++)) + ; + + eq = &priv->eq_table.eq[requested_vector]; + if (requested_vector < dev->caps.num_comp_vectors + 1 && + test_bit(port - 1, eq->actv_ports.ports)) { + prequested_vector = &requested_vector; + } + } + } + + if (!prequested_vector) { + requested_vector = -1; + for (i = 0; min_ref_count_val && i < dev->caps.num_comp_vectors + 1; + i++) { + struct mlx4_eq *eq = &priv->eq_table.eq[i]; + + if (min_ref_count_val > eq->ref_count && + test_bit(port - 1, eq->actv_ports.ports)) { + min_ref_count_val = eq->ref_count; + requested_vector = i; + } + } + + if (requested_vector < 0) { + err = -ENOSPC; + goto err_unlock; + } + + prequested_vector = &requested_vector; + } + + if (!test_bit(*prequested_vector, priv->msix_ctl.pool_bm) && + dev->flags & MLX4_FLAG_MSI_X) { + set_bit(*prequested_vector, priv->msix_ctl.pool_bm); + 
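When the caller did not request a usable vector, mlx4_assign_eq() falls back to the completion EQ with the fewest users among those serving the port, stopping early once a completely unused one turns up. A stand-alone model of that selection loop over a toy EQ table.

/* Simplified model of the least-loaded fallback in mlx4_assign_eq().
 * The EQ table contents are made up for the illustration. */
#include <stdio.h>
#include <stdint.h>

#define NUM_EQS 5

struct toy_eq {
    unsigned int ref_count;   /* CQs already attached to this EQ */
    uint32_t     port_mask;   /* bit (port - 1) set if the EQ serves the port */
};

static int pick_least_loaded(const struct toy_eq *eqs, int n, unsigned int port)
{
    unsigned int min_ref = (unsigned int)-1;
    int best = -1;

    /* Stop early once min_ref hits zero, like the driver's loop condition. */
    for (int i = 0; min_ref && i < n; i++) {
        if (min_ref > eqs[i].ref_count &&
            (eqs[i].port_mask & (1u << (port - 1)))) {
            min_ref = eqs[i].ref_count;
            best = i;
        }
    }
    return best;   /* -1 maps to -ENOSPC in the driver */
}

int main(void)
{
    struct toy_eq eqs[NUM_EQS] = {
        { 3, 0x1 }, { 1, 0x1 }, { 0, 0x2 }, { 2, 0x3 }, { 5, 0x1 },
    };

    printf("port 1 -> EQ %d\n", pick_least_loaded(eqs, NUM_EQS, 1)); /* 1 */
    printf("port 2 -> EQ %d\n", pick_least_loaded(eqs, NUM_EQS, 2)); /* 2 */
    return 0;
}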
snprintf(priv->eq_table.irq_names + + *prequested_vector * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, "mlx4-%d@%s", + *prequested_vector, dev_name(&dev->persist->pdev->dev)); + + err = request_irq(priv->eq_table.eq[*prequested_vector].irq, + mlx4_msi_x_interrupt, 0, + &priv->eq_table.irq_names[*prequested_vector << 5], + priv->eq_table.eq + *prequested_vector); + + if (err) { + clear_bit(*prequested_vector, priv->msix_ctl.pool_bm); + *prequested_vector = -1; + } else { +#if defined(CONFIG_SMP) + mlx4_set_eq_affinity_hint(priv, *prequested_vector); +#endif + eq_set_ci(&priv->eq_table.eq[*prequested_vector], 1); + priv->eq_table.eq[*prequested_vector].have_irq = 1; + } + } + + if (!err && *prequested_vector >= 0) + priv->eq_table.eq[*prequested_vector].ref_count++; + +err_unlock: + mutex_unlock(&priv->msix_ctl.pool_lock); + + if (!err && *prequested_vector >= 0) + *vector = MLX4_EQ_TO_CQ_VECTOR(*prequested_vector); + else + *vector = 0; + + return err; +} +EXPORT_SYMBOL(mlx4_assign_eq); + +int mlx4_eq_get_irq(struct mlx4_dev *dev, int cq_vec) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq_vec)].irq; +} +EXPORT_SYMBOL(mlx4_eq_get_irq); + +void mlx4_release_eq(struct mlx4_dev *dev, int vec) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int eq_vec = MLX4_CQ_TO_EQ_VECTOR(vec); + + mutex_lock(&priv->msix_ctl.pool_lock); + priv->eq_table.eq[eq_vec].ref_count--; + + /* once we allocated EQ, we don't release it because it might be binded + * to cpu_rmap. + */ + mutex_unlock(&priv->msix_ctl.pool_lock); +} +EXPORT_SYMBOL(mlx4_release_eq); + diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c new file mode 100644 index 0000000..de6b3d4 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -0,0 +1,3073 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "fw.h" +#include "icm.h" + +enum { + MLX4_COMMAND_INTERFACE_MIN_REV = 2, + MLX4_COMMAND_INTERFACE_MAX_REV = 3, + MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS = 3, +}; + +extern void __buggy_use_of_MLX4_GET(void); +extern void __buggy_use_of_MLX4_PUT(void); + +static bool enable_qos; +module_param(enable_qos, bool, 0444); +MODULE_PARM_DESC(enable_qos, "Enable Enhanced QoS support (default: off)"); + +#define MLX4_GET(dest, source, offset) \ + do { \ + void *__p = (char *) (source) + (offset); \ + __be64 val; \ + switch (sizeof(dest)) { \ + case 1: (dest) = *(u8 *) __p; break; \ + case 2: (dest) = be16_to_cpup(__p); break; \ + case 4: (dest) = be32_to_cpup(__p); break; \ + case 8: val = get_unaligned((__be64 *)__p); \ + (dest) = be64_to_cpu(val); break; \ + default: __buggy_use_of_MLX4_GET(); \ + } \ + } while (0) + +#define MLX4_PUT(dest, source, offset) \ + do { \ + void *__d = ((char *) (dest) + (offset)); \ + switch (sizeof(source)) { \ + case 1: *(u8 *) __d = (source); break; \ + case 2: *(__be16 *) __d = cpu_to_be16(source); break; \ + case 4: *(__be32 *) __d = cpu_to_be32(source); break; \ + case 8: *(__be64 *) __d = cpu_to_be64(source); break; \ + default: __buggy_use_of_MLX4_PUT(); \ + } \ + } while (0) + +static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) +{ + static const char *fname[] = { + [ 0] = "RC transport", + [ 1] = "UC transport", + [ 2] = "UD transport", + [ 3] = "XRC transport", + [ 6] = "SRQ support", + [ 7] = "IPoIB checksum offload", + [ 8] = "P_Key violation counter", + [ 9] = "Q_Key violation counter", + [12] = "Dual Port Different Protocol (DPDP) support", + [15] = "Big LSO headers", + [16] = "MW support", + [17] = "APM support", + [18] = "Atomic ops support", + [19] = "Raw multicast support", + [20] = "Address vector port checking support", + [21] = "UD multicast support", + [30] = "IBoE support", + [32] = "Unicast loopback support", + [34] = "FCS header control", + [37] = "Wake On LAN (port1) support", + [38] = "Wake On LAN (port2) support", + [40] = "UDP RSS support", + [41] = "Unicast VEP steering support", + [42] = "Multicast VEP steering support", + [48] = "Counters support", + [52] = "RSS IP fragments support", + [53] = "Port ETS Scheduler support", + [55] = "Port link type sensing support", + [59] = "Port management change event support", + [61] = "64 byte EQE support", + [62] = "64 byte CQE support", + }; + int i; + + mlx4_dbg(dev, "DEV_CAP flags:\n"); + for (i = 0; i < ARRAY_SIZE(fname); ++i) + if (fname[i] && (flags & (1LL << i))) + mlx4_dbg(dev, " %s\n", fname[i]); +} + +static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) +{ + static const char * const fname[] = { + [0] = "RSS support", + [1] = "RSS Toeplitz Hash Function support", + [2] = "RSS XOR Hash Function support", + [3] = "Device managed flow steering support", + [4] = "Automatic MAC reassignment support", + [5] = "Time stamping support", + [6] = "VST (control vlan insertion/stripping) support", + [7] = "FSM (MAC anti-spoofing) support", + [8] = "Dynamic QP updates support", + [9] = "Device managed flow steering IPoIB support", + [10] = "TCP/IP offloads/flow-steering for VXLAN support", + [11] = "MAD DEMUX (Secure-Host) support", + [12] = "Large cache line (>64B) CQE stride support", + [13] = "Large cache line (>64B) EQE stride support", + [14] = "Ethernet protocol control support", + [15] = "Ethernet Backplane autoneg support", + [16] = "CONFIG DEV support", + [17] = "Asymmetric EQs support", + [18] 
= "More than 80 VFs support", + [19] = "Performance optimized for limited rule configuration flow steering support", + [20] = "Recoverable error events support", + [21] = "Port Remap support", + [22] = "QCN support", + [23] = "QP rate limiting support", + [24] = "Ethernet Flow control statistics support", + [25] = "Granular QoS per VF support", + [26] = "Port ETS Scheduler support", + [27] = "Port beacon support", + [28] = "RX-ALL support", + [29] = "802.1ad offload support", + [31] = "Modifying loopback source checks using UPDATE_QP support", + [32] = "Loopback source checks support", + [33] = "RoCEv2 support", + [34] = "DMFS Sniffer support (UC & MC)", + [35] = "Diag counters per port", + [36] = "QinQ VST mode support", + [37] = "sl to vl mapping table change event support", + [38] = "user MAC support", + }; + int i; + + for (i = 0; i < ARRAY_SIZE(fname); ++i) + if (fname[i] && (flags & (1LL << i))) + mlx4_dbg(dev, " %s\n", fname[i]); +} + +int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *inbox; + int err = 0; + +#define MOD_STAT_CFG_IN_SIZE 0x100 + +#define MOD_STAT_CFG_PG_SZ_M_OFFSET 0x002 +#define MOD_STAT_CFG_PG_SZ_OFFSET 0x003 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + MLX4_PUT(inbox, cfg->log_pg_sz, MOD_STAT_CFG_PG_SZ_OFFSET); + MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET); + + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u8 in_modifier; + u8 field; + u16 field16; + int err; + +#define QUERY_FUNC_BUS_OFFSET 0x00 +#define QUERY_FUNC_DEVICE_OFFSET 0x01 +#define QUERY_FUNC_FUNCTION_OFFSET 0x01 +#define QUERY_FUNC_PHYSICAL_FUNCTION_OFFSET 0x03 +#define QUERY_FUNC_RSVD_EQS_OFFSET 0x04 +#define QUERY_FUNC_MAX_EQ_OFFSET 0x06 +#define QUERY_FUNC_RSVD_UARS_OFFSET 0x0b + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + in_modifier = slave; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, in_modifier, 0, + MLX4_CMD_QUERY_FUNC, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + goto out; + + MLX4_GET(field, outbox, QUERY_FUNC_BUS_OFFSET); + func->bus = field & 0xf; + MLX4_GET(field, outbox, QUERY_FUNC_DEVICE_OFFSET); + func->device = field & 0xf1; + MLX4_GET(field, outbox, QUERY_FUNC_FUNCTION_OFFSET); + func->function = field & 0x7; + MLX4_GET(field, outbox, QUERY_FUNC_PHYSICAL_FUNCTION_OFFSET); + func->physical_function = field & 0xf; + MLX4_GET(field16, outbox, QUERY_FUNC_RSVD_EQS_OFFSET); + func->rsvd_eqs = field16 & 0xffff; + MLX4_GET(field16, outbox, QUERY_FUNC_MAX_EQ_OFFSET); + func->max_eq = field16 & 0xffff; + MLX4_GET(field, outbox, QUERY_FUNC_RSVD_UARS_OFFSET); + func->rsvd_uars = field & 0x0f; + + mlx4_dbg(dev, "Bus: %d, Device: %d, Function: %d, Physical function: %d, Max EQs: %d, Reserved EQs: %d, Reserved UARs: %d\n", + func->bus, func->device, func->function, func->physical_function, + func->max_eq, func->rsvd_eqs, func->rsvd_uars); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int mlx4_activate_vst_qinq(struct mlx4_priv *priv, int slave, int port) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_vport_state *vp_admin; + int err; + + vp_oper = 
&priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + + if (vp_admin->default_vlan != vp_oper->state.default_vlan) { + err = __mlx4_register_vlan(&priv->dev, port, + vp_admin->default_vlan, + &vp_oper->vlan_idx); + if (err) { + vp_oper->vlan_idx = NO_INDX; + mlx4_warn(&priv->dev, + "No vlan resources slave %d, port %d\n", + slave, port); + return err; + } + mlx4_dbg(&priv->dev, "alloc vlan %d idx %d slave %d port %d\n", + (int)(vp_oper->state.default_vlan), + vp_oper->vlan_idx, slave, port); + } + vp_oper->state.vlan_proto = vp_admin->vlan_proto; + vp_oper->state.default_vlan = vp_admin->default_vlan; + vp_oper->state.default_qos = vp_admin->default_qos; + + return 0; +} + +static int mlx4_handle_vst_qinq(struct mlx4_priv *priv, int slave, int port) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_slave_state *slave_state; + struct mlx4_vport_state *vp_admin; + int err; + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port]; + slave_state = &priv->mfunc.master.slave_state[slave]; + + if ((vp_admin->vlan_proto != htons(ETH_P_8021AD)) || + (!slave_state->active)) + return 0; + + if (vp_oper->state.vlan_proto == vp_admin->vlan_proto && + vp_oper->state.default_vlan == vp_admin->default_vlan && + vp_oper->state.default_qos == vp_admin->default_qos) + return 0; + + if (!slave_state->vst_qinq_supported) { + /* Warn and revert the request to set vst QinQ mode */ + vp_admin->vlan_proto = vp_oper->state.vlan_proto; + vp_admin->default_vlan = vp_oper->state.default_vlan; + vp_admin->default_qos = vp_oper->state.default_qos; + + mlx4_warn(&priv->dev, + "Slave %d does not support VST QinQ mode\n", slave); + return 0; + } + + err = mlx4_activate_vst_qinq(priv, slave, port); + return err; +} + +int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u8 field, port; + u32 size, proxy_qp, qkey; + int err = 0; + struct mlx4_func func; + +#define QUERY_FUNC_CAP_FLAGS_OFFSET 0x0 +#define QUERY_FUNC_CAP_NUM_PORTS_OFFSET 0x1 +#define QUERY_FUNC_CAP_PF_BHVR_OFFSET 0x4 +#define QUERY_FUNC_CAP_FMR_OFFSET 0x8 +#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP 0x10 +#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP 0x14 +#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP 0x18 +#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP 0x20 +#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP 0x24 +#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP 0x28 +#define QUERY_FUNC_CAP_MAX_EQ_OFFSET 0x2c +#define QUERY_FUNC_CAP_RESERVED_EQ_OFFSET 0x30 +#define QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET 0x48 + +#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET 0x50 +#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET 0x54 +#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET 0x58 +#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET 0x60 +#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET 0x64 +#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET 0x68 + +#define QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET 0x6c + +#define QUERY_FUNC_CAP_FMR_FLAG 0x80 +#define QUERY_FUNC_CAP_FLAG_RDMA 0x40 +#define QUERY_FUNC_CAP_FLAG_ETH 0x80 +#define QUERY_FUNC_CAP_FLAG_QUOTAS 0x10 +#define QUERY_FUNC_CAP_FLAG_RESD_LKEY 0x08 +#define QUERY_FUNC_CAP_FLAG_VALID_MAILBOX 0x04 + +#define QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG (1UL << 31) +#define QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG (1UL << 30) + +/* when opcode modifier = 1 */ +#define 
QUERY_FUNC_CAP_PHYS_PORT_OFFSET 0x3 +#define QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET 0x4 +#define QUERY_FUNC_CAP_FLAGS0_OFFSET 0x8 +#define QUERY_FUNC_CAP_FLAGS1_OFFSET 0xc + +#define QUERY_FUNC_CAP_QP0_TUNNEL 0x10 +#define QUERY_FUNC_CAP_QP0_PROXY 0x14 +#define QUERY_FUNC_CAP_QP1_TUNNEL 0x18 +#define QUERY_FUNC_CAP_QP1_PROXY 0x1c +#define QUERY_FUNC_CAP_PHYS_PORT_ID 0x28 + +#define QUERY_FUNC_CAP_FLAGS1_FORCE_MAC 0x40 +#define QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN 0x80 +#define QUERY_FUNC_CAP_FLAGS1_NIC_INFO 0x10 +#define QUERY_FUNC_CAP_VF_ENABLE_QP0 0x08 + +#define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 +#define QUERY_FUNC_CAP_PHV_BIT 0x40 +#define QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE 0x20 + +#define QUERY_FUNC_CAP_SUPPORTS_VST_QINQ BIT(30) +#define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS BIT(31) + + if (vhcr->op_modifier == 1) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, slave); + int converted_port = mlx4_slave_convert_port( + dev, slave, vhcr->in_modifier); + struct mlx4_vport_oper_state *vp_oper; + + if (converted_port < 0) + return -EINVAL; + + vhcr->in_modifier = converted_port; + /* phys-port = logical-port */ + field = vhcr->in_modifier - + find_first_bit(actv_ports.ports, dev->caps.num_ports); + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); + + port = vhcr->in_modifier; + proxy_qp = dev->phys_caps.base_proxy_sqpn + 8 * slave + port - 1; + + /* Set nic_info bit to mark new fields support */ + field = QUERY_FUNC_CAP_FLAGS1_NIC_INFO; + + if (mlx4_vf_smi_enabled(dev, slave, port) && + !mlx4_get_parav_qkey(dev, proxy_qp, &qkey)) { + field |= QUERY_FUNC_CAP_VF_ENABLE_QP0; + MLX4_PUT(outbox->buf, qkey, + QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); + } + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET); + + /* size is now the QP number */ + size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + port - 1; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_TUNNEL); + + size += 2; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_TUNNEL); + + MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP0_PROXY); + proxy_qp += 2; + MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP1_PROXY); + + MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier], + QUERY_FUNC_CAP_PHYS_PORT_ID); + + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + err = mlx4_handle_vst_qinq(priv, slave, port); + if (err) + return err; + + field = 0; + if (dev->caps.phv_bit[port]) + field |= QUERY_FUNC_CAP_PHV_BIT; + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) + field |= QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE; + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS0_OFFSET); + + } else if (vhcr->op_modifier == 0) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, slave); + struct mlx4_slave_state *slave_state = + &priv->mfunc.master.slave_state[slave]; + + /* enable rdma and ethernet interfaces, new quota locations, + * and reserved lkey + */ + field = (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA | + QUERY_FUNC_CAP_FLAG_QUOTAS | QUERY_FUNC_CAP_FLAG_VALID_MAILBOX | + QUERY_FUNC_CAP_FLAG_RESD_LKEY); + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS_OFFSET); + + field = min( + bitmap_weight(actv_ports.ports, dev->caps.num_ports), + dev->caps.num_ports); + MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); + + size = dev->caps.function_caps; /* set PF behaviours */ + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_PF_BHVR_OFFSET); + + field = 0; /* protected FMR support not available as yet */ + MLX4_PUT(outbox->buf, field, 
QUERY_FUNC_CAP_FMR_OFFSET); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET); + size = dev->caps.num_qps; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET); + size = dev->caps.num_srqs; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET); + size = dev->caps.num_cqs; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP); + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) || + mlx4_QUERY_FUNC(dev, &func, slave)) { + size = vhcr->in_modifier & + QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS ? + dev->caps.num_eqs : + rounddown_pow_of_two(dev->caps.num_eqs); + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET); + size = dev->caps.reserved_eqs; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET); + } else { + size = vhcr->in_modifier & + QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS ? + func.max_eq : + rounddown_pow_of_two(func.max_eq); + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET); + size = func.rsvd_eqs; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET); + } + + size = priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET); + size = dev->caps.num_mpts; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP); + + size = priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[slave]; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET); + size = dev->caps.num_mtts; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP); + + size = dev->caps.num_mgms + dev->caps.num_amgms; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET); + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP); + + size = QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG | + QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG; + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET); + + size = dev->caps.reserved_lkey + ((slave << 8) & 0xFF00); + MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET); + + if (vhcr->in_modifier & QUERY_FUNC_CAP_SUPPORTS_VST_QINQ) + slave_state->vst_qinq_supported = true; + + } else + err = -EINVAL; + + return err; +} + +int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port, + struct mlx4_func_cap *func_cap) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u8 field, op_modifier; + u32 size, qkey; + int err = 0, quotas = 0; + u32 in_modifier; + u32 slave_caps; + + op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */ + slave_caps = QUERY_FUNC_CAP_SUPPORTS_VST_QINQ | + QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS; + in_modifier = op_modifier ? 
gen_or_port : slave_caps; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, in_modifier, op_modifier, + MLX4_CMD_QUERY_FUNC_CAP, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + goto out; + + outbox = mailbox->buf; + + if (!op_modifier) { + MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS_OFFSET); + if (!(field & (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA))) { + mlx4_err(dev, "The host supports neither eth nor rdma interfaces\n"); + err = -EPROTONOSUPPORT; + goto out; + } + func_cap->flags = field; + quotas = !!(func_cap->flags & QUERY_FUNC_CAP_FLAG_QUOTAS); + + MLX4_GET(field, outbox, QUERY_FUNC_CAP_NUM_PORTS_OFFSET); + func_cap->num_ports = field; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_PF_BHVR_OFFSET); + func_cap->pf_context_behaviour = size; + + if (quotas) { + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET); + func_cap->qp_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET); + func_cap->srq_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET); + func_cap->cq_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET); + func_cap->mpt_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET); + func_cap->mtt_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET); + func_cap->mcg_quota = size & 0xFFFFFF; + + } else { + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP); + func_cap->qp_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP); + func_cap->srq_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP); + func_cap->cq_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP); + func_cap->mpt_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP); + func_cap->mtt_quota = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP); + func_cap->mcg_quota = size & 0xFFFFFF; + } + MLX4_GET(size, outbox, QUERY_FUNC_CAP_MAX_EQ_OFFSET); + func_cap->max_eq = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET); + func_cap->reserved_eq = size & 0xFFFFFF; + + if (func_cap->flags & QUERY_FUNC_CAP_FLAG_RESD_LKEY) { + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET); + func_cap->reserved_lkey = size; + } else { + func_cap->reserved_lkey = 0; + } + + func_cap->extra_flags = 0; + + /* Mailbox data from 0x6c and onward should only be treated if + * QUERY_FUNC_CAP_FLAG_VALID_MAILBOX is set in func_cap->flags + */ + if (func_cap->flags & QUERY_FUNC_CAP_FLAG_VALID_MAILBOX) { + MLX4_GET(size, outbox, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET); + if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG) + func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_BF_RES_QP; + if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG) + func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_A0_RES_QP; + } + + goto out; + } + + /* logical port query */ + if (gen_or_port > dev->caps.num_ports) { + err = -EINVAL; + goto out; + } + + MLX4_GET(func_cap->flags1, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET); + if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_ETH) { + if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN) { + mlx4_err(dev, "VLAN is enforced on this port\n"); + err = -EPROTONOSUPPORT; + goto out; + } + + if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_MAC) { + mlx4_err(dev, 
"Force mac is enabled on this port\n"); + err = -EPROTONOSUPPORT; + goto out; + } + } else if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_IB) { + MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET); + if (field & QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID) { + mlx4_err(dev, "phy_wqe_gid is enforced on this ib port\n"); + err = -EPROTONOSUPPORT; + goto out; + } + } + + MLX4_GET(field, outbox, QUERY_FUNC_CAP_PHYS_PORT_OFFSET); + func_cap->physical_port = field; + if (func_cap->physical_port != gen_or_port) { + err = -EINVAL; + goto out; + } + + if (func_cap->flags1 & QUERY_FUNC_CAP_VF_ENABLE_QP0) { + MLX4_GET(qkey, outbox, QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); + func_cap->spec_qps.qp0_qkey = qkey; + } else { + func_cap->spec_qps.qp0_qkey = 0; + } + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL); + func_cap->spec_qps.qp0_tunnel = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_PROXY); + func_cap->spec_qps.qp0_proxy = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_TUNNEL); + func_cap->spec_qps.qp1_tunnel = size & 0xFFFFFF; + + MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_PROXY); + func_cap->spec_qps.qp1_proxy = size & 0xFFFFFF; + + if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_NIC_INFO) + MLX4_GET(func_cap->phys_port_id, outbox, + QUERY_FUNC_CAP_PHYS_PORT_ID); + + MLX4_GET(func_cap->flags0, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET); + + /* All other resources are allocated by the master, but we still report + * 'num' and 'reserved' capabilities as follows: + * - num remains the maximum resource index + * - 'num - reserved' is the total available objects of a resource, but + * resource indices may be less than 'reserved' + * TODO: set per-resource quotas */ + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + +static void disable_unsupported_roce_caps(void *buf); + +int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u8 field; + u32 field32, flags, ext_flags; + u16 size; + u16 stat_rate; + int err; + int i; + +#define QUERY_DEV_CAP_OUT_SIZE 0x100 +#define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET 0x10 +#define QUERY_DEV_CAP_MAX_QP_SZ_OFFSET 0x11 +#define QUERY_DEV_CAP_RSVD_QP_OFFSET 0x12 +#define QUERY_DEV_CAP_MAX_QP_OFFSET 0x13 +#define QUERY_DEV_CAP_RSVD_SRQ_OFFSET 0x14 +#define QUERY_DEV_CAP_MAX_SRQ_OFFSET 0x15 +#define QUERY_DEV_CAP_RSVD_EEC_OFFSET 0x16 +#define QUERY_DEV_CAP_MAX_EEC_OFFSET 0x17 +#define QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET 0x19 +#define QUERY_DEV_CAP_RSVD_CQ_OFFSET 0x1a +#define QUERY_DEV_CAP_MAX_CQ_OFFSET 0x1b +#define QUERY_DEV_CAP_MAX_MPT_OFFSET 0x1d +#define QUERY_DEV_CAP_RSVD_EQ_OFFSET 0x1e +#define QUERY_DEV_CAP_MAX_EQ_OFFSET 0x1f +#define QUERY_DEV_CAP_RSVD_MTT_OFFSET 0x20 +#define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET 0x21 +#define QUERY_DEV_CAP_RSVD_MRW_OFFSET 0x22 +#define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET 0x23 +#define QUERY_DEV_CAP_NUM_SYS_EQ_OFFSET 0x26 +#define QUERY_DEV_CAP_MAX_AV_OFFSET 0x27 +#define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET 0x29 +#define QUERY_DEV_CAP_MAX_RES_QP_OFFSET 0x2b +#define QUERY_DEV_CAP_MAX_GSO_OFFSET 0x2d +#define QUERY_DEV_CAP_RSS_OFFSET 0x2e +#define QUERY_DEV_CAP_MAX_RDMA_OFFSET 0x2f +#define QUERY_DEV_CAP_RSZ_SRQ_OFFSET 0x33 +#define QUERY_DEV_CAP_PORT_BEACON_OFFSET 0x34 +#define QUERY_DEV_CAP_ACK_DELAY_OFFSET 0x35 +#define QUERY_DEV_CAP_MTU_WIDTH_OFFSET 0x36 +#define QUERY_DEV_CAP_VL_PORT_OFFSET 0x37 +#define QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET 0x38 +#define QUERY_DEV_CAP_MAX_GID_OFFSET 0x3b +#define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET 
0x3c +#define QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET 0x3e +#define QUERY_DEV_CAP_MAX_PKEY_OFFSET 0x3f +#define QUERY_DEV_CAP_EXT_FLAGS_OFFSET 0x40 +#define QUERY_DEV_CAP_WOL_OFFSET 0x43 +#define QUERY_DEV_CAP_FLAGS_OFFSET 0x44 +#define QUERY_DEV_CAP_RSVD_UAR_OFFSET 0x48 +#define QUERY_DEV_CAP_UAR_SZ_OFFSET 0x49 +#define QUERY_DEV_CAP_PAGE_SZ_OFFSET 0x4b +#define QUERY_DEV_CAP_BF_OFFSET 0x4c +#define QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET 0x4d +#define QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET 0x4e +#define QUERY_DEV_CAP_LOG_MAX_BF_PAGES_OFFSET 0x4f +#define QUERY_DEV_CAP_MAX_SG_SQ_OFFSET 0x51 +#define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET 0x52 +#define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET 0x55 +#define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET 0x56 +#define QUERY_DEV_CAP_USER_MAC_EN_OFFSET 0x5C +#define QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET 0x5D +#define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET 0x61 +#define QUERY_DEV_CAP_RSVD_MCG_OFFSET 0x62 +#define QUERY_DEV_CAP_MAX_MCG_OFFSET 0x63 +#define QUERY_DEV_CAP_RSVD_PD_OFFSET 0x64 +#define QUERY_DEV_CAP_MAX_PD_OFFSET 0x65 +#define QUERY_DEV_CAP_RSVD_XRC_OFFSET 0x66 +#define QUERY_DEV_CAP_MAX_XRC_OFFSET 0x67 +#define QUERY_DEV_CAP_MAX_COUNTERS_OFFSET 0x68 +#define QUERY_DEV_CAP_PORT_FLOWSTATS_COUNTERS_OFFSET 0x70 +#define QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET 0x70 +#define QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET 0x74 +#define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET 0x76 +#define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET 0x77 +#define QUERY_DEV_CAP_SL2VL_EVENT_OFFSET 0x78 +#define QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE 0x7a +#define QUERY_DEV_CAP_ECN_QCN_VER_OFFSET 0x7b +#define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80 +#define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82 +#define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET 0x84 +#define QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET 0x86 +#define QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET 0x88 +#define QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET 0x8a +#define QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET 0x8c +#define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET 0x8e +#define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET 0x90 +#define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET 0x92 +#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET 0x94 +#define QUERY_DEV_CAP_CONFIG_DEV_OFFSET 0x94 +#define QUERY_DEV_CAP_PHV_EN_OFFSET 0x96 +#define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98 +#define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 +#define QUERY_DEV_CAP_ETH_BACKPL_OFFSET 0x9c +#define QUERY_DEV_CAP_DIAG_RPRT_PER_PORT 0x9c +#define QUERY_DEV_CAP_FW_REASSIGN_MAC 0x9d +#define QUERY_DEV_CAP_VXLAN 0x9e +#define QUERY_DEV_CAP_MAD_DEMUX_OFFSET 0xb0 +#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET 0xa8 +#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET 0xac +#define QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET 0xcc +#define QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET 0xd0 +#define QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET 0xd2 + + + dev_cap->flags2 = 0; + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + goto out; + + if (mlx4_is_mfunc(dev)) + disable_unsupported_roce_caps(outbox); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET); + dev_cap->reserved_qps = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET); + dev_cap->max_qps = 1 << (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_SRQ_OFFSET); + dev_cap->reserved_srqs = 1 << (field >> 4); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_OFFSET); + dev_cap->max_srqs = 1 
<< (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET); + dev_cap->max_cq_sz = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_CQ_OFFSET); + dev_cap->reserved_cqs = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_OFFSET); + dev_cap->max_cqs = 1 << (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET); + dev_cap->max_mpts = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET); + dev_cap->reserved_eqs = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET); + dev_cap->max_eqs = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET); + dev_cap->reserved_mtts = 1 << (field >> 4); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MRW_OFFSET); + dev_cap->reserved_mrws = 1 << (field & 0xf); + MLX4_GET(size, outbox, QUERY_DEV_CAP_NUM_SYS_EQ_OFFSET); + dev_cap->num_sys_eqs = size & 0xfff; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET); + dev_cap->max_requester_per_qp = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET); + dev_cap->max_responder_per_qp = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GSO_OFFSET); + field &= 0x1f; + if (!field) + dev_cap->max_gso_sz = 0; + else + dev_cap->max_gso_sz = 1 << field; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSS_OFFSET); + if (field & 0x20) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_XOR; + if (field & 0x10) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_TOP; + field &= 0xf; + if (field) { + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS; + dev_cap->max_rss_tbl_sz = 1 << field; + } else + dev_cap->max_rss_tbl_sz = 0; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET); + dev_cap->max_rdma_global = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET); + dev_cap->local_ca_ack_delay = field & 0x1f; + MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET); + dev_cap->num_ports = field & 0xf; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET); + dev_cap->max_msg_sz = 1 << (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_PORT_FLOWSTATS_COUNTERS_OFFSET); + if (field & 0x10) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN; + MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN; + dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f; + if (field & 0x20) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER; + MLX4_GET(field, outbox, QUERY_DEV_CAP_PORT_BEACON_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_BEACON; + MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_IPOIB; + MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET); + dev_cap->fs_max_num_qp_per_entry = field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_SL2VL_EVENT_OFFSET); + if (field & (1 << 5)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT; + MLX4_GET(field, outbox, QUERY_DEV_CAP_ECN_QCN_VER_OFFSET); + if (field & 0x1) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_QCN; + MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET); + dev_cap->stat_rate_support = stat_rate; + MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_TS; + MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); + dev_cap->flags = flags | (u64)ext_flags << 
32; + MLX4_GET(field, outbox, QUERY_DEV_CAP_WOL_OFFSET); + dev_cap->wol_port[1] = !!(field & 0x20); + dev_cap->wol_port[2] = !!(field & 0x40); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET); + dev_cap->reserved_uars = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET); + dev_cap->uar_size = 1 << ((field & 0x3f) + 20); + MLX4_GET(field, outbox, QUERY_DEV_CAP_PAGE_SZ_OFFSET); + dev_cap->min_page_sz = 1 << field; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_BF_OFFSET); + if (field & 0x80) { + MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET); + dev_cap->bf_reg_size = 1 << (field & 0x1f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET); + if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) + field = 3; + dev_cap->bf_regs_per_page = 1 << (field & 0x3f); + } else { + dev_cap->bf_reg_size = 0; + } + + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_SQ_OFFSET); + dev_cap->max_sq_sg = field; + MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET); + dev_cap->max_sq_desc_sz = size; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_USER_MAC_EN_OFFSET); + if (field & (1 << 2)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_USER_MAC_EN; + MLX4_GET(field, outbox, QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET); + if (field & 0x1) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET); + dev_cap->max_qp_per_mcg = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET); + dev_cap->reserved_mgms = field & 0xf; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MCG_OFFSET); + dev_cap->max_mcgs = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_PD_OFFSET); + dev_cap->reserved_pds = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET); + dev_cap->max_pds = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET); + dev_cap->reserved_xrcds = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_XRC_OFFSET); + dev_cap->max_xrcds = 1 << (field & 0x1f); + + MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET); + dev_cap->rdmarc_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET); + dev_cap->qpc_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET); + dev_cap->aux_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET); + dev_cap->altc_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET); + dev_cap->eqc_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET); + dev_cap->cqc_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET); + dev_cap->srq_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET); + dev_cap->cmpt_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET); + dev_cap->mtt_entry_sz = size; + MLX4_GET(size, outbox, QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET); + dev_cap->dmpt_entry_sz = size; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET); + dev_cap->max_srq_sz = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET); + dev_cap->max_qp_sz = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET); + dev_cap->resize_srq = field & 1; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET); + dev_cap->max_rq_sg = field; + MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET); + dev_cap->max_rq_desc_sz = size; + MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE); + if (field & (1 << 4)) + dev_cap->flags2 |= 
MLX4_DEV_CAP_FLAG2_QOS_VPP; + if (field & (1 << 5)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL; + if (field & (1 << 6)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + if (field & (1 << 7)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + MLX4_GET(dev_cap->bmme_flags, outbox, + QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + if (dev_cap->bmme_flags & MLX4_FLAG_ROCE_V1_V2) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ROCE_V1_V2; + if (dev_cap->bmme_flags & MLX4_FLAG_PORT_REMAP) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_REMAP; + MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET); + if (field & 0x20) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV; + if (field & (1 << 2)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_IGNORE_FCS; + MLX4_GET(field, outbox, QUERY_DEV_CAP_PHV_EN_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PHV_EN; + if (field & 0x40) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN; + + MLX4_GET(dev_cap->reserved_lkey, outbox, + QUERY_DEV_CAP_RSVD_LKEY_OFFSET); + MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET); + if (field32 & (1 << 0)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP; + if (field32 & (1 << 7)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT; + MLX4_GET(field32, outbox, QUERY_DEV_CAP_DIAG_RPRT_PER_PORT); + if (field32 & (1 << 17)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT; + MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC); + if (field & 1<<6) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN; + MLX4_GET(field, outbox, QUERY_DEV_CAP_VXLAN); + if (field & 1<<3) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS; + if (field & (1 << 5)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETS_CFG; + MLX4_GET(dev_cap->max_icm_sz, outbox, + QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET); + if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS) + MLX4_GET(dev_cap->max_counters, outbox, + QUERY_DEV_CAP_MAX_COUNTERS_OFFSET); + + MLX4_GET(field32, outbox, + QUERY_DEV_CAP_MAD_DEMUX_OFFSET); + if (field32 & (1 << 0)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX; + + MLX4_GET(dev_cap->dmfs_high_rate_qpn_base, outbox, + QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET); + dev_cap->dmfs_high_rate_qpn_base &= MGM_QPN_MASK; + MLX4_GET(dev_cap->dmfs_high_rate_qpn_range, outbox, + QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET); + dev_cap->dmfs_high_rate_qpn_range &= MGM_QPN_MASK; + + MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET); + dev_cap->rl_caps.num_rates = size; + if (dev_cap->rl_caps.num_rates) { + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT; + MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET); + dev_cap->rl_caps.max_val = size & 0xfff; + dev_cap->rl_caps.max_unit = size >> 14; + MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET); + dev_cap->rl_caps.min_val = size & 0xfff; + dev_cap->rl_caps.min_unit = size >> 14; + } + + MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + if (field32 & (1 << 16)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP; + if (field32 & (1 << 18)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB; + if (field32 & (1 << 19)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_LB_SRC_CHK; + if (field32 & (1 << 26)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VLAN_CONTROL; + if (field32 & (1 << 20)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FSM; + if (field32 & (1 << 21)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_80_VFS; + + for (i = 1; i <= dev_cap->num_ports; i++) { + err = mlx4_QUERY_PORT(dev, i, dev_cap->port_cap + i); + 
if (err) + goto out; + } + + /* + * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then + * we can't use any EQs whose doorbell falls on that page, + * even if the EQ itself isn't reserved. + */ + if (dev_cap->num_sys_eqs == 0) + dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4, + dev_cap->reserved_eqs); + else + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SYS_EQS; + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +void mlx4_dev_cap_dump(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) +{ + if (dev_cap->bf_reg_size > 0) + mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n", + dev_cap->bf_reg_size, dev_cap->bf_regs_per_page); + else + mlx4_dbg(dev, "BlueFlame not available\n"); + + mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n", + dev_cap->bmme_flags, dev_cap->reserved_lkey); + mlx4_dbg(dev, "Max ICM size %lld MB\n", + (unsigned long long) dev_cap->max_icm_sz >> 20); + mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n", + dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz); + mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n", + dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz); + mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n", + dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz); + mlx4_dbg(dev, "Num sys EQs: %d, max EQs: %d, reserved EQs: %d, entry size: %d\n", + dev_cap->num_sys_eqs, dev_cap->max_eqs, dev_cap->reserved_eqs, + dev_cap->eqc_entry_sz); + mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n", + dev_cap->reserved_mrws, dev_cap->reserved_mtts); + mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n", + dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars); + mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n", + dev_cap->max_pds, dev_cap->reserved_mgms); + mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n", + dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz); + mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n", + dev_cap->local_ca_ack_delay, 128 << dev_cap->port_cap[1].ib_mtu, + dev_cap->port_cap[1].max_port_width); + mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n", + dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg); + mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n", + dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg); + mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz); + mlx4_dbg(dev, "Max counters: %d\n", dev_cap->max_counters); + mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz); + mlx4_dbg(dev, "DMFS high rate steer QPn base: %d\n", + dev_cap->dmfs_high_rate_qpn_base); + mlx4_dbg(dev, "DMFS high rate steer QPn range: %d\n", + dev_cap->dmfs_high_rate_qpn_range); + + if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT) { + struct mlx4_rate_limit_caps *rl_caps = &dev_cap->rl_caps; + + mlx4_dbg(dev, "QP Rate-Limit: #rates %d, unit/val max %d/%d, min %d/%d\n", + rl_caps->num_rates, rl_caps->max_unit, rl_caps->max_val, + rl_caps->min_unit, rl_caps->min_val); + } + + dump_dev_cap_flags(dev, dev_cap->flags); + dump_dev_cap_flags2(dev, dev_cap->flags2); +} + +int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_cap) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u8 field; + u32 field32; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + err = mlx4_cmd_box(dev, 0, mailbox->dma, 
0, 0, MLX4_CMD_QUERY_DEV_CAP, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + if (err) + goto out; + + MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET); + port_cap->max_vl = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET); + port_cap->ib_mtu = field >> 4; + port_cap->max_port_width = field & 0xf; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET); + port_cap->max_gids = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET); + port_cap->max_pkeys = 1 << (field & 0xf); + } else { +#define QUERY_PORT_SUPPORTED_TYPE_OFFSET 0x00 +#define QUERY_PORT_MTU_OFFSET 0x01 +#define QUERY_PORT_ETH_MTU_OFFSET 0x02 +#define QUERY_PORT_WIDTH_OFFSET 0x06 +#define QUERY_PORT_MAX_GID_PKEY_OFFSET 0x07 +#define QUERY_PORT_MAX_MACVLAN_OFFSET 0x0a +#define QUERY_PORT_MAX_VL_OFFSET 0x0b +#define QUERY_PORT_MAC_OFFSET 0x10 +#define QUERY_PORT_TRANS_VENDOR_OFFSET 0x18 +#define QUERY_PORT_WAVELENGTH_OFFSET 0x1c +#define QUERY_PORT_TRANS_CODE_OFFSET 0x20 + + err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + goto out; + + MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET); + port_cap->link_state = (field & 0x80) >> 7; + port_cap->supported_port_types = field & 3; + port_cap->suggested_type = (field >> 3) & 1; + port_cap->default_sense = (field >> 4) & 1; + port_cap->dmfs_optimized_state = (field >> 5) & 1; + MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET); + port_cap->ib_mtu = field & 0xf; + MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET); + port_cap->max_port_width = field & 0xf; + MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET); + port_cap->max_gids = 1 << (field >> 4); + port_cap->max_pkeys = 1 << (field & 0xf); + MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET); + port_cap->max_vl = field & 0xf; + port_cap->max_tc_eth = field >> 4; + MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET); + port_cap->log_max_macs = field & 0xf; + port_cap->log_max_vlans = field >> 4; + MLX4_GET(port_cap->eth_mtu, outbox, QUERY_PORT_ETH_MTU_OFFSET); + MLX4_GET(port_cap->def_mac, outbox, QUERY_PORT_MAC_OFFSET); + MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET); + port_cap->trans_type = field32 >> 24; + port_cap->vendor_oui = field32 & 0xffffff; + MLX4_GET(port_cap->wavelength, outbox, QUERY_PORT_WAVELENGTH_OFFSET); + MLX4_GET(port_cap->trans_code, outbox, QUERY_PORT_TRANS_CODE_OFFSET); + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +#define DEV_CAP_EXT_2_FLAG_PFC_COUNTERS (1 << 28) +#define DEV_CAP_EXT_2_FLAG_VLAN_CONTROL (1 << 26) +#define DEV_CAP_EXT_2_FLAG_80_VFS (1 << 21) +#define DEV_CAP_EXT_2_FLAG_FSM (1 << 20) + +int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u64 flags; + int err = 0; + u8 field; + u16 field16; + u32 bmme_flags, field32; + int real_port; + int slave_port; + int first_port; + struct mlx4_active_ports actv_ports; + + err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + + disable_unsupported_roce_caps(outbox->buf); + /* add port mng change event capability and disable mw type 1 + * unconditionally to slaves + */ + MLX4_GET(flags, outbox->buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + flags |= MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV; + flags &= ~MLX4_DEV_CAP_FLAG_MEM_WINDOW; + actv_ports = mlx4_get_active_ports(dev, 
slave); + first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports); + for (slave_port = 0, real_port = first_port; + real_port < first_port + + bitmap_weight(actv_ports.ports, dev->caps.num_ports); + ++real_port, ++slave_port) { + if (flags & (MLX4_DEV_CAP_FLAG_WOL_PORT1 << real_port)) + flags |= MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port; + else + flags &= ~(MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port); + } + for (; slave_port < dev->caps.num_ports; ++slave_port) + flags &= ~(MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port); + + /* Not exposing RSS IP fragments to guests */ + flags &= ~MLX4_DEV_CAP_FLAG_RSS_IP_FRAG; + MLX4_PUT(outbox->buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_VL_PORT_OFFSET); + field &= ~0x0F; + field |= bitmap_weight(actv_ports.ports, dev->caps.num_ports) & 0x0F; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_VL_PORT_OFFSET); + + /* For guests, disable timestamp */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET); + + /* For guests, disable vxlan tunneling and QoS support */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_VXLAN); + field &= 0xd7; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_VXLAN); + + /* For guests, disable port BEACON */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_PORT_BEACON_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_PORT_BEACON_OFFSET); + + /* For guests, report Blueflame disabled */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_BF_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET); + + /* For guests, disable mw type 2 and port remap*/ + MLX4_GET(bmme_flags, outbox->buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN; + bmme_flags &= ~MLX4_FLAG_PORT_REMAP; + MLX4_PUT(outbox->buf, bmme_flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + + /* turn off device-managed steering capability if not enabled */ + if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) { + MLX4_GET(field, outbox->buf, + QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, + QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET); + } + + /* turn off ipoib managed steering for guests */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET); + field &= ~0x80; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET); + + /* turn off host side virt features (VST, FSM, etc) for guests */ + MLX4_GET(field32, outbox->buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + field32 &= ~(DEV_CAP_EXT_2_FLAG_VLAN_CONTROL | DEV_CAP_EXT_2_FLAG_80_VFS | + DEV_CAP_EXT_2_FLAG_FSM | DEV_CAP_EXT_2_FLAG_PFC_COUNTERS); + MLX4_PUT(outbox->buf, field32, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + + /* turn off QCN for guests */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_ECN_QCN_VER_OFFSET); + field &= 0xfe; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_ECN_QCN_VER_OFFSET); + + /* turn off QP max-rate limiting for guests */ + field16 = 0; + MLX4_PUT(outbox->buf, field16, QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET); + + /* turn off QoS per VF support for guests */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE); + field &= 0xef; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE); + + /* turn off ignore FCS feature for guests */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CONFIG_DEV_OFFSET); + field &= 0xfb; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CONFIG_DEV_OFFSET); + + return 0; 
+} + +static void disable_unsupported_roce_caps(void *buf) +{ + u32 flags; + + MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + flags &= ~(1UL << 31); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + flags &= ~(1UL << 24); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + MLX4_GET(flags, buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + flags &= ~(MLX4_FLAG_ROCE_V1_V2); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); +} + +int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 def_mac; + u8 port_type; + u16 short_field; + int err; + int admin_link_state; + int port = mlx4_slave_convert_port(dev, slave, + vhcr->in_modifier & 0xFF); + +#define MLX4_VF_PORT_NO_LINK_SENSE_MASK 0xE0 +#define MLX4_PORT_LINK_UP_MASK 0x80 +#define QUERY_PORT_CUR_MAX_PKEY_OFFSET 0x0c +#define QUERY_PORT_CUR_MAX_GID_OFFSET 0x0e + + if (port < 0) + return -EINVAL; + + /* Protect against untrusted guests: enforce that this is the + * QUERY_PORT general query. + */ + if (vhcr->op_modifier || vhcr->in_modifier & ~0xFF) + return -EINVAL; + + vhcr->in_modifier = port; + + err = mlx4_cmd_box(dev, 0, outbox->dma, vhcr->in_modifier, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + if (!err && dev->caps.function != slave) { + def_mac = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.mac; + MLX4_PUT(outbox->buf, def_mac, QUERY_PORT_MAC_OFFSET); + + /* get port type - currently only eth is enabled */ + MLX4_GET(port_type, outbox->buf, + QUERY_PORT_SUPPORTED_TYPE_OFFSET); + + /* No link sensing allowed */ + port_type &= MLX4_VF_PORT_NO_LINK_SENSE_MASK; + /* set port type to currently operating port type */ + port_type |= (dev->caps.port_type[vhcr->in_modifier] & 0x3); + + admin_link_state = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.link_state; + if (IFLA_VF_LINK_STATE_ENABLE == admin_link_state) + port_type |= MLX4_PORT_LINK_UP_MASK; + else if (IFLA_VF_LINK_STATE_DISABLE == admin_link_state) + port_type &= ~MLX4_PORT_LINK_UP_MASK; + else if (IFLA_VF_LINK_STATE_AUTO == admin_link_state && mlx4_is_bonded(dev)) { + int other_port = (port == 1) ? 
2 : 1; + struct mlx4_port_cap port_cap; + + err = mlx4_QUERY_PORT(dev, other_port, &port_cap); + if (err) + goto out; + port_type |= (port_cap.link_state << 7); + } + + MLX4_PUT(outbox->buf, port_type, + QUERY_PORT_SUPPORTED_TYPE_OFFSET); + + if (dev->caps.port_type[vhcr->in_modifier] == MLX4_PORT_TYPE_ETH) + short_field = mlx4_get_slave_num_gids(dev, slave, port); + else + short_field = 1; /* slave max gids */ + MLX4_PUT(outbox->buf, short_field, + QUERY_PORT_CUR_MAX_GID_OFFSET); + + short_field = dev->caps.pkey_table_len[vhcr->in_modifier]; + MLX4_PUT(outbox->buf, short_field, + QUERY_PORT_CUR_MAX_PKEY_OFFSET); + } +out: + return err; +} + +int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port, + int *gid_tbl_len, int *pkey_tbl_len) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + u16 field; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + outbox = mailbox->buf; + + MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_GID_OFFSET); + *gid_tbl_len = field; + + MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_PKEY_OFFSET); + *pkey_tbl_len = field; + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_get_slave_pkey_gid_tbl_len); + +int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_icm_iter iter; + __be64 *pages; + int lg; + int nent = 0; + int i; + int err = 0; + int ts = 0, tc = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + pages = mailbox->buf; + + for (mlx4_icm_first(icm, &iter); + !mlx4_icm_last(&iter); + mlx4_icm_next(&iter)) { + /* + * We have to pass pages that are aligned to their + * size, so find the least significant 1 in the + * address or size and use that as our log2 size. 
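+ * For example, a 1 MB chunk whose address is aligned only to 256 KB
+ * gives lg = 18, so it is passed to the firmware as four 256 KB pages.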
+ */ + lg = ffs(mlx4_icm_addr(&iter) | mlx4_icm_size(&iter)) - 1; + if (lg < MLX4_ICM_PAGE_SHIFT) { + mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx)\n", + MLX4_ICM_PAGE_SIZE, + (unsigned long long) mlx4_icm_addr(&iter), + mlx4_icm_size(&iter)); + err = -EINVAL; + goto out; + } + + for (i = 0; i < mlx4_icm_size(&iter) >> lg; ++i) { + if (virt != -1) { + pages[nent * 2] = cpu_to_be64(virt); + virt += 1ULL << lg; + } + + pages[nent * 2 + 1] = + cpu_to_be64((mlx4_icm_addr(&iter) + (i << lg)) | + (lg - MLX4_ICM_PAGE_SHIFT)); + ts += 1 << (lg - 10); + ++tc; + + if (++nent == MLX4_MAILBOX_SIZE / 16) { + err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + if (err) + goto out; + nent = 0; + } + } + } + + if (nent) + err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + goto out; + + switch (op) { + case MLX4_CMD_MAP_FA: + mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW\n", tc, ts); + break; + case MLX4_CMD_MAP_ICM_AUX: + mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux\n", tc, ts); + break; + case MLX4_CMD_MAP_ICM: + mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM\n", + tc, ts, (unsigned long long) virt - (ts << 10)); + break; + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm) +{ + return mlx4_map_cmd(dev, MLX4_CMD_MAP_FA, icm, -1); +} + +int mlx4_UNMAP_FA(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + + +int mlx4_RUN_FW(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} + +int mlx4_QUERY_FW(struct mlx4_dev *dev) +{ + struct mlx4_fw *fw = &mlx4_priv(dev)->fw; + struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + int err = 0; + u64 fw_ver; + u16 cmd_if_rev; + u8 lg; + +#define QUERY_FW_OUT_SIZE 0x100 +#define QUERY_FW_VER_OFFSET 0x00 +#define QUERY_FW_PPF_ID 0x09 +#define QUERY_FW_CMD_IF_REV_OFFSET 0x0a +#define QUERY_FW_MAX_CMD_OFFSET 0x0f +#define QUERY_FW_ERR_START_OFFSET 0x30 +#define QUERY_FW_ERR_SIZE_OFFSET 0x38 +#define QUERY_FW_ERR_BAR_OFFSET 0x3c + +#define QUERY_FW_SIZE_OFFSET 0x00 +#define QUERY_FW_CLR_INT_BASE_OFFSET 0x20 +#define QUERY_FW_CLR_INT_BAR_OFFSET 0x28 + +#define QUERY_FW_COMM_BASE_OFFSET 0x40 +#define QUERY_FW_COMM_BAR_OFFSET 0x48 + +#define QUERY_FW_CLOCK_OFFSET 0x50 +#define QUERY_FW_CLOCK_BAR 0x58 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + goto out; + + MLX4_GET(fw_ver, outbox, QUERY_FW_VER_OFFSET); + /* + * FW subminor version is at more significant bits than minor + * version, so swap here. 
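+ * After the swap, bits 47:32 hold the major version, bits 31:16 the minor
+ * and bits 15:0 the subminor, matching the major.minor.subminor prints below.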
+ */ + dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) | + ((fw_ver & 0xffff0000ull) >> 16) | + ((fw_ver & 0x0000ffffull) << 16); + + MLX4_GET(lg, outbox, QUERY_FW_PPF_ID); + dev->caps.function = lg; + + if (mlx4_is_slave(dev)) + goto out; + + + MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET); + if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV || + cmd_if_rev > MLX4_COMMAND_INTERFACE_MAX_REV) { + mlx4_err(dev, "Installed FW has unsupported command interface revision %d\n", + cmd_if_rev); + mlx4_err(dev, "(Installed FW version is %d.%d.%03d)\n", + (int) (dev->caps.fw_ver >> 32), + (int) (dev->caps.fw_ver >> 16) & 0xffff, + (int) dev->caps.fw_ver & 0xffff); + mlx4_err(dev, "This driver version supports only revisions %d to %d\n", + MLX4_COMMAND_INTERFACE_MIN_REV, MLX4_COMMAND_INTERFACE_MAX_REV); + err = -ENODEV; + goto out; + } + + if (cmd_if_rev < MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS) + dev->flags |= MLX4_FLAG_OLD_PORT_CMDS; + + MLX4_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); + cmd->max_cmds = 1 << lg; + + mlx4_dbg(dev, "FW version %d.%d.%03d (cmd intf rev %d), max commands %d\n", + (int) (dev->caps.fw_ver >> 32), + (int) (dev->caps.fw_ver >> 16) & 0xffff, + (int) dev->caps.fw_ver & 0xffff, + cmd_if_rev, cmd->max_cmds); + + MLX4_GET(fw->catas_offset, outbox, QUERY_FW_ERR_START_OFFSET); + MLX4_GET(fw->catas_size, outbox, QUERY_FW_ERR_SIZE_OFFSET); + MLX4_GET(fw->catas_bar, outbox, QUERY_FW_ERR_BAR_OFFSET); + fw->catas_bar = (fw->catas_bar >> 6) * 2; + + mlx4_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x, BAR %d\n", + (unsigned long long) fw->catas_offset, fw->catas_size, fw->catas_bar); + + MLX4_GET(fw->fw_pages, outbox, QUERY_FW_SIZE_OFFSET); + MLX4_GET(fw->clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET); + MLX4_GET(fw->clr_int_bar, outbox, QUERY_FW_CLR_INT_BAR_OFFSET); + fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2; + + MLX4_GET(fw->comm_base, outbox, QUERY_FW_COMM_BASE_OFFSET); + MLX4_GET(fw->comm_bar, outbox, QUERY_FW_COMM_BAR_OFFSET); + fw->comm_bar = (fw->comm_bar >> 6) * 2; + mlx4_dbg(dev, "Communication vector bar:%d offset:0x%llx\n", + fw->comm_bar, fw->comm_base); + mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages >> 2); + + MLX4_GET(fw->clock_offset, outbox, QUERY_FW_CLOCK_OFFSET); + MLX4_GET(fw->clock_bar, outbox, QUERY_FW_CLOCK_BAR); + fw->clock_bar = (fw->clock_bar >> 6) * 2; + mlx4_dbg(dev, "Internal clock bar:%d offset:0x%llx\n", + fw->clock_bar, fw->clock_offset); + + /* + * Round up number of system pages needed in case + * MLX4_ICM_PAGE_SIZE < PAGE_SIZE. 
+ */ + fw->fw_pages = + ALIGN(fw->fw_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >> + (PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT); + + mlx4_dbg(dev, "Clear int @ %llx, BAR %d\n", + (unsigned long long) fw->clr_int_base, fw->clr_int_bar); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u8 *outbuf; + int err; + + outbuf = outbox->buf; + err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_FW, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + + /* for slaves, set pci PPF ID to invalid and zero out everything + * else except FW version */ + outbuf[0] = outbuf[1] = 0; + memset(&outbuf[8], 0, QUERY_FW_OUT_SIZE - 8); + outbuf[QUERY_FW_PPF_ID] = MLX4_INVALID_SLAVE_ID; + + return 0; +} + +static void get_board_id(void *vsd, char *board_id) +{ + int i; + +#define VSD_OFFSET_SIG1 0x00 +#define VSD_OFFSET_SIG2 0xde +#define VSD_OFFSET_MLX_BOARD_ID 0xd0 +#define VSD_OFFSET_TS_BOARD_ID 0x20 + +#define VSD_SIGNATURE_TOPSPIN 0x5ad + + memset(board_id, 0, MLX4_BOARD_ID_LEN); + + if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN && + be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) { + strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MLX4_BOARD_ID_LEN); + } else { + /* + * The board ID is a string but the firmware byte + * swaps each 4-byte word before passing it back to + * us. Therefore we need to swab it before printing. + */ + u32 *bid_u32 = (u32 *)board_id; + + for (i = 0; i < 4; ++i) { + u32 *addr; + u32 val; + + addr = (u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4); + val = get_unaligned(addr); + val = swab32(val); + put_unaligned(val, &bid_u32[i]); + } + } +} + +int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + int err; + +#define QUERY_ADAPTER_OUT_SIZE 0x100 +#define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10 +#define QUERY_ADAPTER_VSD_OFFSET 0x20 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + goto out; + + MLX4_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET); + + get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4, + adapter->board_id); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) +{ + struct mlx4_cmd_mailbox *mailbox; + __be32 *inbox; + int err; + static const u8 a0_dmfs_hw_steering[] = { + [MLX4_STEERING_DMFS_A0_DEFAULT] = 0, + [MLX4_STEERING_DMFS_A0_DYNAMIC] = 1, + [MLX4_STEERING_DMFS_A0_STATIC] = 2, + [MLX4_STEERING_DMFS_A0_DISABLE] = 3 + }; + +#define INIT_HCA_IN_SIZE 0x200 +#define INIT_HCA_VERSION_OFFSET 0x000 +#define INIT_HCA_VERSION 2 +#define INIT_HCA_VXLAN_OFFSET 0x0c +#define INIT_HCA_CACHELINE_SZ_OFFSET 0x0e +#define INIT_HCA_FLAGS_OFFSET 0x014 +#define INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET 0x018 +#define INIT_HCA_QPC_OFFSET 0x020 +#define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) +#define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17) +#define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28) +#define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f) +#define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30) +#define 
INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37) +#define INIT_HCA_EQE_CQE_OFFSETS (INIT_HCA_QPC_OFFSET + 0x38) +#define INIT_HCA_EQE_CQE_STRIDE_OFFSET (INIT_HCA_QPC_OFFSET + 0x3b) +#define INIT_HCA_ALTC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40) +#define INIT_HCA_AUXC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50) +#define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60) +#define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67) +#define INIT_HCA_NUM_SYS_EQS_OFFSET (INIT_HCA_QPC_OFFSET + 0x6a) +#define INIT_HCA_RDMARC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70) +#define INIT_HCA_LOG_RD_OFFSET (INIT_HCA_QPC_OFFSET + 0x77) +#define INIT_HCA_MCAST_OFFSET 0x0c0 +#define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00) +#define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12) +#define INIT_HCA_LOG_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16) +#define INIT_HCA_UC_STEERING_OFFSET (INIT_HCA_MCAST_OFFSET + 0x18) +#define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) +#define INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN 0x6 +#define INIT_HCA_FS_PARAM_OFFSET 0x1d0 +#define INIT_HCA_FS_BASE_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x00) +#define INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x12) +#define INIT_HCA_FS_A0_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x18) +#define INIT_HCA_FS_LOG_TABLE_SZ_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x1b) +#define INIT_HCA_FS_ETH_BITS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x21) +#define INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22) +#define INIT_HCA_FS_IB_BITS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x25) +#define INIT_HCA_FS_IB_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x26) +#define INIT_HCA_TPT_OFFSET 0x0f0 +#define INIT_HCA_DMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) +#define INIT_HCA_TPT_MW_OFFSET (INIT_HCA_TPT_OFFSET + 0x08) +#define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b) +#define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10) +#define INIT_HCA_CMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x18) +#define INIT_HCA_UAR_OFFSET 0x120 +#define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a) +#define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b) + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + *((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION; + + *((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = + ((ilog2(cache_line_size()) - 4) << 5) | (1 << 4); + +#if defined(__LITTLE_ENDIAN) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1); +#elif defined(__BIG_ENDIAN) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1); +#else +#error Host endianness not defined +#endif + /* Check port for UD address vector: */ + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1); + + /* Enable IPoIB checksumming if we can: */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 3); + + /* Enable QoS support if module parameter set */ + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETS_CFG && enable_qos) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2); + + /* enable counters */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 4); + + /* Enable RSS spread to fragmented IP packets when supported */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_RSS_IP_FRAG) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 13); + + /* CX3 is capable of extending 
CQEs/EQEs from 32 to 64 bytes */ + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_EQE) { + *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 29); + dev->caps.eqe_size = 64; + dev->caps.eqe_factor = 1; + } else { + dev->caps.eqe_size = 32; + dev->caps.eqe_factor = 0; + } + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) { + *(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30); + dev->caps.cqe_size = 64; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } else { + dev->caps.cqe_size = 32; + } + + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) && + (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) { + dev->caps.eqe_size = cache_line_size(); + dev->caps.cqe_size = cache_line_size(); + dev->caps.eqe_factor = 0; + MLX4_PUT(inbox, (u8)((ilog2(dev->caps.eqe_size) - 5) << 4 | + (ilog2(dev->caps.eqe_size) - 5)), + INIT_HCA_EQE_CQE_STRIDE_OFFSET); + + /* User still need to know to support CQE > 32B */ + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT) + *(inbox + INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET / 4) |= cpu_to_be32(1 << 31); + + /* QPC/EEC/CQC/EQC/RDMARC attributes */ + + MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET); + MLX4_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET); + MLX4_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET); + MLX4_PUT(inbox, param->altc_base, INIT_HCA_ALTC_BASE_OFFSET); + MLX4_PUT(inbox, param->auxc_base, INIT_HCA_AUXC_BASE_OFFSET); + MLX4_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET); + MLX4_PUT(inbox, param->num_sys_eqs, INIT_HCA_NUM_SYS_EQS_OFFSET); + MLX4_PUT(inbox, param->rdmarc_base, INIT_HCA_RDMARC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET); + + /* steering attributes */ + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= + cpu_to_be32(1 << + INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN); + + MLX4_PUT(inbox, param->mc_base, INIT_HCA_FS_BASE_OFFSET); + MLX4_PUT(inbox, param->log_mc_entry_sz, + INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET); + MLX4_PUT(inbox, param->log_mc_table_sz, + INIT_HCA_FS_LOG_TABLE_SZ_OFFSET); + /* Enable Ethernet flow steering + * with udp unicast and tcp unicast + */ + if (dev->caps.dmfs_high_steer_mode != + MLX4_STEERING_DMFS_A0_STATIC) + MLX4_PUT(inbox, + (u8)(MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN), + INIT_HCA_FS_ETH_BITS_OFFSET); + MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR, + INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET); + /* Enable IPoIB flow steering + * with udp unicast and tcp unicast + */ + MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN), + INIT_HCA_FS_IB_BITS_OFFSET); + MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR, + INIT_HCA_FS_IB_NUM_ADDRS_OFFSET); + + if (dev->caps.dmfs_high_steer_mode != + MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) + MLX4_PUT(inbox, + ((u8)(a0_dmfs_hw_steering[dev->caps.dmfs_high_steer_mode] + << 6)), + INIT_HCA_FS_A0_OFFSET); + } else { + MLX4_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); + MLX4_PUT(inbox, param->log_mc_entry_sz, + INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); + MLX4_PUT(inbox, param->log_mc_hash_sz, + INIT_HCA_LOG_MC_HASH_SZ_OFFSET); + 
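+ /* log2 size of the multicast group (MGM) table used for hash-based steering */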
MLX4_PUT(inbox, param->log_mc_table_sz, + INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); + if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0) + MLX4_PUT(inbox, (u8) (1 << 3), + INIT_HCA_UC_STEERING_OFFSET); + } + + /* TPT attributes */ + + MLX4_PUT(inbox, param->dmpt_base, INIT_HCA_DMPT_BASE_OFFSET); + MLX4_PUT(inbox, param->mw_enabled, INIT_HCA_TPT_MW_OFFSET); + MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET); + MLX4_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET); + MLX4_PUT(inbox, param->cmpt_base, INIT_HCA_CMPT_BASE_OFFSET); + + /* UAR attributes */ + + MLX4_PUT(inbox, param->uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET); + MLX4_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); + + /* set parser VXLAN attributes */ + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS) { + u8 parser_params = 0; + MLX4_PUT(inbox, parser_params, INIT_HCA_VXLAN_OFFSET); + } + + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, + MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + + if (err) + mlx4_err(dev, "INIT_HCA returns %d\n", err); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_QUERY_HCA(struct mlx4_dev *dev, + struct mlx4_init_hca_param *param) +{ + struct mlx4_cmd_mailbox *mailbox; + __be32 *outbox; + u32 dword_field; + int err; + u8 byte_field; + static const u8 a0_dmfs_query_hw_steering[] = { + [0] = MLX4_STEERING_DMFS_A0_DEFAULT, + [1] = MLX4_STEERING_DMFS_A0_DYNAMIC, + [2] = MLX4_STEERING_DMFS_A0_STATIC, + [3] = MLX4_STEERING_DMFS_A0_DISABLE + }; + +#define QUERY_HCA_GLOBAL_CAPS_OFFSET 0x04 +#define QUERY_HCA_CORE_CLOCK_OFFSET 0x0c + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, + MLX4_CMD_QUERY_HCA, + MLX4_CMD_TIME_CLASS_B, + !mlx4_is_slave(dev)); + if (err) + goto out; + + MLX4_GET(param->global_caps, outbox, QUERY_HCA_GLOBAL_CAPS_OFFSET); + MLX4_GET(param->hca_core_clock, outbox, QUERY_HCA_CORE_CLOCK_OFFSET); + + /* QPC/EEC/CQC/EQC/RDMARC attributes */ + + MLX4_GET(param->qpc_base, outbox, INIT_HCA_QPC_BASE_OFFSET); + MLX4_GET(param->log_num_qps, outbox, INIT_HCA_LOG_QP_OFFSET); + MLX4_GET(param->srqc_base, outbox, INIT_HCA_SRQC_BASE_OFFSET); + MLX4_GET(param->log_num_srqs, outbox, INIT_HCA_LOG_SRQ_OFFSET); + MLX4_GET(param->cqc_base, outbox, INIT_HCA_CQC_BASE_OFFSET); + MLX4_GET(param->log_num_cqs, outbox, INIT_HCA_LOG_CQ_OFFSET); + MLX4_GET(param->altc_base, outbox, INIT_HCA_ALTC_BASE_OFFSET); + MLX4_GET(param->auxc_base, outbox, INIT_HCA_AUXC_BASE_OFFSET); + MLX4_GET(param->eqc_base, outbox, INIT_HCA_EQC_BASE_OFFSET); + MLX4_GET(param->log_num_eqs, outbox, INIT_HCA_LOG_EQ_OFFSET); + MLX4_GET(param->num_sys_eqs, outbox, INIT_HCA_NUM_SYS_EQS_OFFSET); + MLX4_GET(param->rdmarc_base, outbox, INIT_HCA_RDMARC_BASE_OFFSET); + MLX4_GET(param->log_rd_per_qp, outbox, INIT_HCA_LOG_RD_OFFSET); + + MLX4_GET(dword_field, outbox, INIT_HCA_FLAGS_OFFSET); + if (dword_field & (1 << INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN)) { + param->steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED; + } else { + MLX4_GET(byte_field, outbox, INIT_HCA_UC_STEERING_OFFSET); + if (byte_field & 0x8) + param->steering_mode = MLX4_STEERING_MODE_B0; + else + param->steering_mode = MLX4_STEERING_MODE_A0; + } + + if (dword_field & (1 << 13)) + param->rss_ip_frags = 1; + + /* steering attributes */ + if (param->steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { + MLX4_GET(param->mc_base, outbox, INIT_HCA_FS_BASE_OFFSET); + MLX4_GET(param->log_mc_entry_sz, outbox, + 
INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET); + MLX4_GET(param->log_mc_table_sz, outbox, + INIT_HCA_FS_LOG_TABLE_SZ_OFFSET); + MLX4_GET(byte_field, outbox, + INIT_HCA_FS_A0_OFFSET); + param->dmfs_high_steer_mode = + a0_dmfs_query_hw_steering[(byte_field >> 6) & 3]; + } else { + MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET); + MLX4_GET(param->log_mc_entry_sz, outbox, + INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); + MLX4_GET(param->log_mc_hash_sz, outbox, + INIT_HCA_LOG_MC_HASH_SZ_OFFSET); + MLX4_GET(param->log_mc_table_sz, outbox, + INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); + } + + /* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */ + MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_OFFSETS); + if (byte_field & 0x20) /* 64-bytes eqe enabled */ + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED; + if (byte_field & 0x40) /* 64-bytes cqe enabled */ + param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED; + + /* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */ + MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_STRIDE_OFFSET); + if (byte_field) { + param->dev_cap_enabled |= MLX4_DEV_CAP_EQE_STRIDE_ENABLED; + param->dev_cap_enabled |= MLX4_DEV_CAP_CQE_STRIDE_ENABLED; + param->cqe_size = 1 << ((byte_field & + MLX4_CQE_SIZE_MASK_STRIDE) + 5); + param->eqe_size = 1 << (((byte_field & + MLX4_EQE_SIZE_MASK_STRIDE) >> 4) + 5); + } + + /* TPT attributes */ + + MLX4_GET(param->dmpt_base, outbox, INIT_HCA_DMPT_BASE_OFFSET); + MLX4_GET(param->mw_enabled, outbox, INIT_HCA_TPT_MW_OFFSET); + MLX4_GET(param->log_mpt_sz, outbox, INIT_HCA_LOG_MPT_SZ_OFFSET); + MLX4_GET(param->mtt_base, outbox, INIT_HCA_MTT_BASE_OFFSET); + MLX4_GET(param->cmpt_base, outbox, INIT_HCA_CMPT_BASE_OFFSET); + + /* UAR attributes */ + + MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET); + MLX4_GET(param->log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET); + + /* phv_check enable */ + MLX4_GET(byte_field, outbox, INIT_HCA_CACHELINE_SZ_OFFSET); + if (byte_field & 0x2) + param->phv_check_en = 1; +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + +static int mlx4_hca_core_clock_update(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *mailbox; + __be32 *outbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_warn(dev, "hca_core_clock mailbox allocation failed\n"); + return PTR_ERR(mailbox); + } + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, + MLX4_CMD_QUERY_HCA, + MLX4_CMD_TIME_CLASS_B, + !mlx4_is_slave(dev)); + if (err) { + mlx4_warn(dev, "hca_core_clock update failed\n"); + goto out; + } + + MLX4_GET(dev->caps.hca_core_clock, outbox, QUERY_HCA_CORE_CLOCK_OFFSET); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + +/* for IB-type ports only in SRIOV mode. 
Checks that both proxy QP0 + * and real QP0 are active, so that the paravirtualized QP0 is ready + * to operate */ +static int check_qp0_state(struct mlx4_dev *dev, int function, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + /* irrelevant if not infiniband */ + if (priv->mfunc.master.qp0_state[port].proxy_qp0_active && + priv->mfunc.master.qp0_state[port].qp0_active) + return 1; + return 0; +} + +int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int port = mlx4_slave_convert_port(dev, slave, vhcr->in_modifier); + int err; + + if (port < 0) + return -EINVAL; + + if (priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port)) + return 0; + + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { + /* Enable port only if it was previously disabled */ + if (!priv->mfunc.master.init_port_ref[port]) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + } + priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); + } else { + if (slave == mlx4_master_func_num(dev)) { + if (check_qp0_state(dev, slave, port) && + !priv->mfunc.master.qp0_state[port].port_active) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + priv->mfunc.master.qp0_state[port].port_active = 1; + priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); + } + } else + priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port); + } + ++priv->mfunc.master.init_port_ref[port]; + return 0; +} + +int mlx4_INIT_PORT(struct mlx4_dev *dev, int port) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *inbox; + int err; + u32 flags; + u16 field; + + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { +#define INIT_PORT_IN_SIZE 256 +#define INIT_PORT_FLAGS_OFFSET 0x00 +#define INIT_PORT_FLAG_SIG (1 << 18) +#define INIT_PORT_FLAG_NG (1 << 17) +#define INIT_PORT_FLAG_G0 (1 << 16) +#define INIT_PORT_VL_SHIFT 4 +#define INIT_PORT_PORT_WIDTH_SHIFT 8 +#define INIT_PORT_MTU_OFFSET 0x04 +#define INIT_PORT_MAX_GID_OFFSET 0x06 +#define INIT_PORT_MAX_PKEY_OFFSET 0x0a +#define INIT_PORT_GUID0_OFFSET 0x10 +#define INIT_PORT_NODE_GUID_OFFSET 0x18 +#define INIT_PORT_SI_GUID_OFFSET 0x20 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + flags = 0; + flags |= (dev->caps.vl_cap[port] & 0xf) << INIT_PORT_VL_SHIFT; + flags |= (dev->caps.port_width_cap[port] & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT; + MLX4_PUT(inbox, flags, INIT_PORT_FLAGS_OFFSET); + + field = 128 << dev->caps.ib_mtu_cap[port]; + MLX4_PUT(inbox, field, INIT_PORT_MTU_OFFSET); + field = dev->caps.gid_table_len[port]; + MLX4_PUT(inbox, field, INIT_PORT_MAX_GID_OFFSET); + field = dev->caps.pkey_table_len[port]; + MLX4_PUT(inbox, field, INIT_PORT_MAX_PKEY_OFFSET); + + err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + } else + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + + if (!err) + mlx4_hca_core_clock_update(dev); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_INIT_PORT); + +int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + 
struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int port = mlx4_slave_convert_port(dev, slave, vhcr->in_modifier); + int err; + + if (port < 0) + return -EINVAL; + + if (!(priv->mfunc.master.slave_state[slave].init_port_mask & + (1 << port))) + return 0; + + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { + if (priv->mfunc.master.init_port_ref[port] == 1) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + } + priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); + } else { + /* infiniband port */ + if (slave == mlx4_master_func_num(dev)) { + if (!priv->mfunc.master.qp0_state[port].qp0_active && + priv->mfunc.master.qp0_state[port].port_active) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); + priv->mfunc.master.qp0_state[port].port_active = 0; + } + } else + priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); + } + --priv->mfunc.master.init_port_ref[port]; + return 0; +} + +int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port) +{ + return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} +EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT); + +int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic) +{ + return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, + MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); +} + +struct mlx4_config_dev { + __be32 update_flags; + __be32 rsvd1[3]; + __be16 vxlan_udp_dport; + __be16 rsvd2; + __be16 roce_v2_entropy; + __be16 roce_v2_udp_dport; + __be32 roce_flags; + __be32 rsvd4[25]; + __be16 rsvd5; + u8 rsvd6; + u8 rx_checksum_val; +}; + +#define MLX4_VXLAN_UDP_DPORT (1 << 0) +#define MLX4_ROCE_V2_UDP_DPORT BIT(3) +#define MLX4_DISABLE_RX_PORT BIT(18) + +static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev) +{ + int err; + struct mlx4_cmd_mailbox *mailbox; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, config_dev, sizeof(*config_dev)); + + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_CONFIG_DEV, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int mlx4_CONFIG_DEV_get(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev) +{ + int err; + struct mlx4_cmd_mailbox *mailbox; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 1, MLX4_CMD_CONFIG_DEV, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + if (!err) + memcpy(config_dev, mailbox->buf, sizeof(*config_dev)); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +/* Conversion between the HW values and the actual functionality. + * The value represented by the array index, + * and the functionality determined by the flags. 
+ */ +static const u8 config_dev_csum_flags[] = { + [0] = 0, + [1] = MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP, + [2] = MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP | + MLX4_RX_CSUM_MODE_L4, + [3] = MLX4_RX_CSUM_MODE_L4 | + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP | + MLX4_RX_CSUM_MODE_MULTI_VLAN +}; + +int mlx4_config_dev_retrieval(struct mlx4_dev *dev, + struct mlx4_config_dev_params *params) +{ + struct mlx4_config_dev config_dev = {0}; + int err; + u8 csum_mask; + +#define CONFIG_DEV_RX_CSUM_MODE_MASK 0x7 +#define CONFIG_DEV_RX_CSUM_MODE_PORT1_BIT_OFFSET 0 +#define CONFIG_DEV_RX_CSUM_MODE_PORT2_BIT_OFFSET 4 + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CONFIG_DEV)) + return -EOPNOTSUPP; + + err = mlx4_CONFIG_DEV_get(dev, &config_dev); + if (err) + return err; + + csum_mask = (config_dev.rx_checksum_val >> CONFIG_DEV_RX_CSUM_MODE_PORT1_BIT_OFFSET) & + CONFIG_DEV_RX_CSUM_MODE_MASK; + + if (csum_mask >= ARRAY_SIZE(config_dev_csum_flags)) + return -EINVAL; + params->rx_csum_flags_port_1 = config_dev_csum_flags[csum_mask]; + + csum_mask = (config_dev.rx_checksum_val >> CONFIG_DEV_RX_CSUM_MODE_PORT2_BIT_OFFSET) & + CONFIG_DEV_RX_CSUM_MODE_MASK; + + if (csum_mask >= ARRAY_SIZE(config_dev_csum_flags)) + return -EINVAL; + params->rx_csum_flags_port_2 = config_dev_csum_flags[csum_mask]; + + params->vxlan_udp_dport = be16_to_cpu(config_dev.vxlan_udp_dport); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_config_dev_retrieval); + +int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port) +{ + struct mlx4_config_dev config_dev; + + memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_VXLAN_UDP_DPORT); + config_dev.vxlan_udp_dport = udp_port; + + return mlx4_CONFIG_DEV_set(dev, &config_dev); +} +EXPORT_SYMBOL_GPL(mlx4_config_vxlan_port); + +#define CONFIG_DISABLE_RX_PORT BIT(15) +int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis) +{ + struct mlx4_config_dev config_dev; + + memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_DISABLE_RX_PORT); + if (dis) + config_dev.roce_flags = + cpu_to_be32(CONFIG_DISABLE_RX_PORT); + + return mlx4_CONFIG_DEV_set(dev, &config_dev); +} + +int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port) +{ + struct mlx4_config_dev config_dev; + + memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_ROCE_V2_UDP_DPORT); + config_dev.roce_v2_udp_dport = cpu_to_be16(udp_port); + + return mlx4_CONFIG_DEV_set(dev, &config_dev); +} +EXPORT_SYMBOL_GPL(mlx4_config_roce_v2_port); + +int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2) +{ + struct mlx4_cmd_mailbox *mailbox; + struct { + __be32 v_port1; + __be32 v_port2; + } *v2p; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + + v2p = mailbox->buf; + v2p->v_port1 = cpu_to_be32(port1); + v2p->v_port2 = cpu_to_be32(port2); + + err = mlx4_cmd(dev, mailbox->dma, 0, + MLX4_SET_PORT_VIRT2PHY, MLX4_CMD_VIRT_PORT_MAP, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + + +int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) +{ + int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0, + MLX4_CMD_SET_ICM_SIZE, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (ret) + return ret; + + /* + * Round up number of system pages needed in case + * MLX4_ICM_PAGE_SIZE < PAGE_SIZE. 
+ */ + *aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >> + (PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT); + + return 0; +} + +int mlx4_NOP(struct mlx4_dev *dev) +{ + /* Input modifier of 0x1f means "finish as soon as possible." */ + return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); +} + +int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier, + const u32 offset[], + u32 value[], size_t array_len, u8 port) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + size_t i; + int ret; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + outbox = mailbox->buf; + + ret = mlx4_cmd_box(dev, 0, mailbox->dma, port, op_modifier, + MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (ret) + goto out; + + for (i = 0; i < array_len; i++) { + if (offset[i] > MLX4_MAILBOX_SIZE) { + ret = -EINVAL; + goto out; + } + + MLX4_GET(value[i], outbox, offset[i]); + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} +EXPORT_SYMBOL(mlx4_query_diag_counters); + +int mlx4_get_phys_port_id(struct mlx4_dev *dev) +{ + u8 port; + u32 *outbox; + struct mlx4_cmd_mailbox *mailbox; + u32 in_mod; + u32 guid_hi, guid_lo; + int err, ret = 0; +#define MOD_STAT_CFG_PORT_OFFSET 8 +#define MOD_STAT_CFG_GUID_H 0X14 +#define MOD_STAT_CFG_GUID_L 0X1c + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + for (port = 1; port <= dev->caps.num_ports; port++) { + in_mod = port << MOD_STAT_CFG_PORT_OFFSET; + err = mlx4_cmd_box(dev, 0, mailbox->dma, in_mod, 0x2, + MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Fail to get port %d uplink guid\n", + port); + ret = err; + } else { + MLX4_GET(guid_hi, outbox, MOD_STAT_CFG_GUID_H); + MLX4_GET(guid_lo, outbox, MOD_STAT_CFG_GUID_L); + dev->caps.phys_port_id[port] = (u64)guid_lo | + (u64)guid_hi << 32; + } + } + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} + +#define MLX4_WOL_SETUP_MODE (5 << 28) +int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port) +{ + u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; + + return mlx4_cmd_imm(dev, 0, config, in_mod, 0x3, + MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); +} +EXPORT_SYMBOL_GPL(mlx4_wol_read); + +int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port) +{ + u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; + + return mlx4_cmd(dev, config, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} +EXPORT_SYMBOL_GPL(mlx4_wol_write); + +enum { + ADD_TO_MCG = 0x26, +}; + + +void mlx4_opreq_action(struct work_struct *work) +{ + struct mlx4_priv *priv = container_of(work, struct mlx4_priv, + opreq_task); + struct mlx4_dev *dev = &priv->dev; + int num_tasks = atomic_read(&priv->opreq_count); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 *outbox; + u32 modifier; + u16 token; + u16 type; + int err; + u32 num_qps; + struct mlx4_qp qp; + int i; + u8 rem_mcg; + u8 prot; + +#define GET_OP_REQ_MODIFIER_OFFSET 0x08 +#define GET_OP_REQ_TOKEN_OFFSET 0x14 +#define GET_OP_REQ_TYPE_OFFSET 0x1a +#define GET_OP_REQ_DATA_OFFSET 0x20 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_err(dev, "Failed to allocate mailbox for GET_OP_REQ\n"); + return; + } + outbox = mailbox->buf; + + while (num_tasks) { + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, + MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + 
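+ /* could not fetch the pending FW request; stop processing and clean up */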
mlx4_err(dev, "Failed to retrieve required operation: %d\n", + err); + return; + } + MLX4_GET(modifier, outbox, GET_OP_REQ_MODIFIER_OFFSET); + MLX4_GET(token, outbox, GET_OP_REQ_TOKEN_OFFSET); + MLX4_GET(type, outbox, GET_OP_REQ_TYPE_OFFSET); + type &= 0xfff; + + switch (type) { + case ADD_TO_MCG: + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_warn(dev, "ADD MCG operation is not supported in DEVICE_MANAGED steering mode\n"); + err = EPERM; + break; + } + mgm = (struct mlx4_mgm *)((u8 *)(outbox) + + GET_OP_REQ_DATA_OFFSET); + num_qps = be32_to_cpu(mgm->members_count) & + MGM_QPN_MASK; + rem_mcg = ((u8 *)(&mgm->members_count))[0] & 1; + prot = ((u8 *)(&mgm->members_count))[0] >> 6; + + for (i = 0; i < num_qps; i++) { + qp.qpn = be32_to_cpu(mgm->qp[i]); + if (rem_mcg) + err = mlx4_multicast_detach(dev, &qp, + mgm->gid, + prot, 0); + else + err = mlx4_multicast_attach(dev, &qp, + mgm->gid, + mgm->gid[5] + , 0, prot, + NULL); + if (err) + break; + } + break; + default: + mlx4_warn(dev, "Bad type for required operation\n"); + err = EINVAL; + break; + } + err = mlx4_cmd(dev, 0, ((u32) err | + (__force u32)cpu_to_be32(token) << 16), + 1, MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to acknowledge required request: %d\n", + err); + goto out; + } + memset(outbox, 0, 0xffc); + num_tasks = atomic_dec_return(&priv->opreq_count); + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); +} + +static int mlx4_check_smp_firewall_active(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox) +{ +#define MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET 0x10 +#define MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET 0x20 +#define MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET 0x40 +#define MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET 0x70 + + u32 set_attr_mask, getresp_attr_mask; + u32 trap_attr_mask, traprepress_attr_mask; + + MLX4_GET(set_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall set_attribute_mask = 0x%x\n", + set_attr_mask); + + MLX4_GET(getresp_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall getresp_attribute_mask = 0x%x\n", + getresp_attr_mask); + + MLX4_GET(trap_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall trap_attribute_mask = 0x%x\n", + trap_attr_mask); + + MLX4_GET(traprepress_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall traprepress_attribute_mask = 0x%x\n", + traprepress_attr_mask); + + if (set_attr_mask && getresp_attr_mask && trap_attr_mask && + traprepress_attr_mask) + return 1; + + return 0; +} + +int mlx4_config_mad_demux(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + /* Check if mad_demux is supported */ + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_MAD_DEMUX)) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_warn(dev, "Failed to allocate mailbox for cmd MAD_DEMUX"); + return -ENOMEM; + } + + /* Query mad_demux to find out which MADs are handled by internal sma */ + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0x01 /* subn mgmt class */, + MLX4_CMD_MAD_DEMUX_QUERY_RESTR, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) { + mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: query restrictions failed (%d)\n", + err); + goto out; + } + + if (mlx4_check_smp_firewall_active(dev, mailbox)) + dev->flags |= MLX4_FLAG_SECURE_HOST; + + /* Config mad_demux to handle all MADs 
returned by the query above */ + err = mlx4_cmd(dev, mailbox->dma, 0x01 /* subn mgmt class */, + MLX4_CMD_MAD_DEMUX_CONFIG, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) { + mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: configure failed (%d)\n", err); + goto out; + } + + if (dev->flags & MLX4_FLAG_SECURE_HOST) + mlx4_warn(dev, "HCA operating in secure-host mode. SMP firewall activated.\n"); +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +/* Access Reg commands */ +enum mlx4_access_reg_masks { + MLX4_ACCESS_REG_STATUS_MASK = 0x7f, + MLX4_ACCESS_REG_METHOD_MASK = 0x7f, + MLX4_ACCESS_REG_LEN_MASK = 0x7ff +}; + +struct mlx4_access_reg { + __be16 constant1; + u8 status; + u8 resrvd1; + __be16 reg_id; + u8 method; + u8 constant2; + __be32 resrvd2[2]; + __be16 len_const; + __be16 resrvd3; +#define MLX4_ACCESS_REG_HEADER_SIZE (20) + u8 reg_data[MLX4_MAILBOX_SIZE-MLX4_ACCESS_REG_HEADER_SIZE]; +} __attribute__((__packed__)); + +/** + * mlx4_ACCESS_REG - Generic access reg command. + * @dev: mlx4_dev. + * @reg_id: register ID to access. + * @method: Access method Read/Write. + * @reg_len: register length to Read/Write in bytes. + * @reg_data: reg_data pointer to Read/Write From/To. + * + * Access ConnectX registers FW command. + * Returns 0 on success and copies outbox mlx4_access_reg data + * field into reg_data or a negative error code. + */ +static int mlx4_ACCESS_REG(struct mlx4_dev *dev, u16 reg_id, + enum mlx4_access_reg_method method, + u16 reg_len, void *reg_data) +{ + struct mlx4_cmd_mailbox *inbox, *outbox; + struct mlx4_access_reg *inbuf, *outbuf; + int err; + + inbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inbox)) + return PTR_ERR(inbox); + + outbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outbox)) { + mlx4_free_cmd_mailbox(dev, inbox); + return PTR_ERR(outbox); + } + + inbuf = inbox->buf; + outbuf = outbox->buf; + + inbuf->constant1 = cpu_to_be16(0x1<<11 | 0x4); + inbuf->constant2 = 0x1; + inbuf->reg_id = cpu_to_be16(reg_id); + inbuf->method = method & MLX4_ACCESS_REG_METHOD_MASK; + + reg_len = min(reg_len, (u16)(sizeof(inbuf->reg_data))); + inbuf->len_const = + cpu_to_be16(((reg_len/4 + 1) & MLX4_ACCESS_REG_LEN_MASK) | + ((0x3) << 12)); + + memcpy(inbuf->reg_data, reg_data, reg_len); + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, 0, 0, + MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_WRAPPED); + if (err) + goto out; + + if (outbuf->status & MLX4_ACCESS_REG_STATUS_MASK) { + err = outbuf->status & MLX4_ACCESS_REG_STATUS_MASK; + mlx4_err(dev, + "MLX4_CMD_ACCESS_REG(%x) returned REG status (%x)\n", + reg_id, err); + goto out; + } + + memcpy(reg_data, outbuf->reg_data, reg_len); +out: + mlx4_free_cmd_mailbox(dev, inbox); + mlx4_free_cmd_mailbox(dev, outbox); + return err; +} + +/* ConnectX registers IDs */ +enum mlx4_reg_id { + MLX4_REG_ID_PTYS = 0x5004, +}; + +/** + * mlx4_ACCESS_PTYS_REG - Access PTYs (Port Type and Speed) + * register + * @dev: mlx4_dev. + * @method: Access method Read/Write. + * @ptys_reg: PTYS register data pointer. + * + * Access ConnectX PTYS register, to Read/Write Port Type/Speed + * configuration + * Returns 0 on success or a negative error code. 
+ */ +int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev, + enum mlx4_access_reg_method method, + struct mlx4_ptys_reg *ptys_reg) +{ + return mlx4_ACCESS_REG(dev, MLX4_REG_ID_PTYS, + method, sizeof(*ptys_reg), ptys_reg); +} +EXPORT_SYMBOL_GPL(mlx4_ACCESS_PTYS_REG); + +int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_access_reg *inbuf = inbox->buf; + u8 method = inbuf->method & MLX4_ACCESS_REG_METHOD_MASK; + u16 reg_id = be16_to_cpu(inbuf->reg_id); + + if (slave != mlx4_master_func_num(dev) && + method == MLX4_ACCESS_REG_WRITE) + return -EPERM; + + if (reg_id == MLX4_REG_ID_PTYS) { + struct mlx4_ptys_reg *ptys_reg = + (struct mlx4_ptys_reg *)inbuf->reg_data; + + ptys_reg->local_port = + mlx4_slave_convert_port(dev, slave, + ptys_reg->local_port); + } + + return mlx4_cmd_box(dev, inbox->dma, outbox->dma, vhcr->in_modifier, + 0, MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); +} + +static int mlx4_SET_PORT_phv_bit(struct mlx4_dev *dev, u8 port, u8 phv_bit) +{ +#define SET_PORT_GEN_PHV_VALID 0x10 +#define SET_PORT_GEN_PHV_EN 0x80 + + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + + context->flags2 |= SET_PORT_GEN_PHV_VALID; + if (phv_bit) + context->phv_en |= SET_PORT_GEN_PHV_EN; + + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv) +{ + int err; + struct mlx4_func_cap func_cap; + + memset(&func_cap, 0, sizeof(func_cap)); + err = mlx4_QUERY_FUNC_CAP(dev, port, &func_cap); + if (!err) + *phv = func_cap.flags0 & QUERY_FUNC_CAP_PHV_BIT; + return err; +} +EXPORT_SYMBOL(get_phv_bit); + +int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val) +{ + int ret; + + if (mlx4_is_slave(dev)) + return -EPERM; + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN && + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) { + ret = mlx4_SET_PORT_phv_bit(dev, port, new_val); + if (!ret) + dev->caps.phv_bit[port] = new_val; + return ret; + } + + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(set_phv_bit); + +int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, + bool *vlan_offload_disabled) +{ + struct mlx4_func_cap func_cap; + int err; + + memset(&func_cap, 0, sizeof(func_cap)); + err = mlx4_QUERY_FUNC_CAP(dev, port, &func_cap); + if (!err) + *vlan_offload_disabled = + !!(func_cap.flags0 & + QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE); + return err; +} +EXPORT_SYMBOL(mlx4_get_is_vlan_offload_disabled); + +void mlx4_replace_zero_macs(struct mlx4_dev *dev) +{ + int i; + u8 mac_addr[ETH_ALEN]; + + dev->port_random_macs = 0; + for (i = 1; i <= dev->caps.num_ports; ++i) + if (!dev->caps.def_mac[i] && + dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) { + eth_random_addr(mac_addr); + dev->port_random_macs |= 1 << i; + dev->caps.def_mac[i] = mlx4_mac_to_u64(mac_addr); + } +} +EXPORT_SYMBOL_GPL(mlx4_replace_zero_macs); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h new file mode 100644 index 0000000..cd6399c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -0,0 +1,255 @@ 
+/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_FW_H +#define MLX4_FW_H + +#include "mlx4.h" +#include "icm.h" + +struct mlx4_mod_stat_cfg { + u8 log_pg_sz; + u8 log_pg_sz_m; +}; + +struct mlx4_port_cap { + u8 link_state; + u8 supported_port_types; + u8 suggested_type; + u8 default_sense; + u8 log_max_macs; + u8 log_max_vlans; + int ib_mtu; + int max_port_width; + int max_vl; + int max_tc_eth; + int max_gids; + int max_pkeys; + u64 def_mac; + u16 eth_mtu; + int trans_type; + int vendor_oui; + u16 wavelength; + u64 trans_code; + u8 dmfs_optimized_state; +}; + +struct mlx4_dev_cap { + int max_srq_sz; + int max_qp_sz; + int reserved_qps; + int max_qps; + int reserved_srqs; + int max_srqs; + int max_cq_sz; + int reserved_cqs; + int max_cqs; + int max_mpts; + int reserved_eqs; + int max_eqs; + int num_sys_eqs; + int reserved_mtts; + int reserved_mrws; + int max_requester_per_qp; + int max_responder_per_qp; + int max_rdma_global; + int local_ca_ack_delay; + int num_ports; + u32 max_msg_sz; + u16 stat_rate_support; + int fs_log_max_ucast_qp_range_size; + int fs_max_num_qp_per_entry; + u64 flags; + u64 flags2; + int reserved_uars; + int uar_size; + int min_page_sz; + int bf_reg_size; + int bf_regs_per_page; + int max_sq_sg; + int max_sq_desc_sz; + int max_rq_sg; + int max_rq_desc_sz; + int max_qp_per_mcg; + int reserved_mgms; + int max_mcgs; + int reserved_pds; + int max_pds; + int reserved_xrcds; + int max_xrcds; + int qpc_entry_sz; + int rdmarc_entry_sz; + int altc_entry_sz; + int aux_entry_sz; + int srq_entry_sz; + int cqc_entry_sz; + int eqc_entry_sz; + int dmpt_entry_sz; + int cmpt_entry_sz; + int mtt_entry_sz; + int resize_srq; + u32 bmme_flags; + u32 reserved_lkey; + u64 max_icm_sz; + int max_gso_sz; + int max_rss_tbl_sz; + u32 max_counters; + u32 dmfs_high_rate_qpn_base; + u32 dmfs_high_rate_qpn_range; + struct mlx4_rate_limit_caps rl_caps; + struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1]; + bool wol_port[MLX4_MAX_PORTS + 1]; +}; + +struct mlx4_func_cap { + u8 num_ports; + u8 flags; + 
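+ /* the fields below are filled in from the QUERY_FUNC_CAP firmware command */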
u32 pf_context_behaviour; + int qp_quota; + int cq_quota; + int srq_quota; + int mpt_quota; + int mtt_quota; + int max_eq; + int reserved_eq; + int mcg_quota; + struct mlx4_spec_qps spec_qps; + u32 reserved_lkey; + u8 physical_port; + u8 flags0; + u8 flags1; + u64 phys_port_id; + u32 extra_flags; +}; + +struct mlx4_func { + int bus; + int device; + int function; + int physical_function; + int rsvd_eqs; + int max_eq; + int rsvd_uars; +}; + +struct mlx4_adapter { + char board_id[MLX4_BOARD_ID_LEN]; + u8 inta_pin; +}; + +struct mlx4_init_hca_param { + u64 qpc_base; + u64 rdmarc_base; + u64 auxc_base; + u64 altc_base; + u64 srqc_base; + u64 cqc_base; + u64 eqc_base; + u64 mc_base; + u64 dmpt_base; + u64 cmpt_base; + u64 mtt_base; + u64 global_caps; + u16 log_mc_entry_sz; + u16 log_mc_hash_sz; + u16 hca_core_clock; /* Internal Clock Frequency (in MHz) */ + u8 log_num_qps; + u8 log_num_srqs; + u8 log_num_cqs; + u8 log_num_eqs; + u16 num_sys_eqs; + u8 log_rd_per_qp; + u8 log_mc_table_sz; + u8 log_mpt_sz; + u8 log_uar_sz; + u8 mw_enabled; /* Enable memory windows */ + u8 uar_page_sz; /* log pg sz in 4k chunks */ + u8 steering_mode; /* for QUERY_HCA */ + u8 dmfs_high_steer_mode; /* for QUERY_HCA */ + u64 dev_cap_enabled; + u16 cqe_size; /* For use only when CQE stride feature enabled */ + u16 eqe_size; /* For use only when EQE stride feature enabled */ + u8 rss_ip_frags; + u8 phv_check_en; /* for QUERY_HCA */ +}; + +struct mlx4_init_ib_param { + int port_width; + int vl_cap; + int mtu_cap; + u16 gid_cap; + u16 pkey_cap; + int set_guid0; + u64 guid0; + int set_node_guid; + u64 node_guid; + int set_si_guid; + u64 si_guid; +}; + +struct mlx4_set_ib_param { + int set_si_guid; + int reset_qkey_viol; + u64 si_guid; + u32 cap_mask; +}; + +void mlx4_dev_cap_dump(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap); +int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap); +int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_cap); +int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port, + struct mlx4_func_cap *func_cap); +int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave); +int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm); +int mlx4_UNMAP_FA(struct mlx4_dev *dev); +int mlx4_RUN_FW(struct mlx4_dev *dev); +int mlx4_QUERY_FW(struct mlx4_dev *dev); +int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter); +int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param); +int mlx4_QUERY_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param); +int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic); +int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt); +int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages); +int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); +int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); +int mlx4_NOP(struct mlx4_dev *dev); +int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg); +void mlx4_opreq_action(struct work_struct *work); + +#endif /* MLX4_FW_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.c b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c new file mode 100644 index 0000000..3a09d71 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2004, 
2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "fw_qos.h" +#include "fw.h" + +enum { + /* allocate vpp opcode modifiers */ + MLX4_ALLOCATE_VPP_ALLOCATE = 0x0, + MLX4_ALLOCATE_VPP_QUERY = 0x1 +}; + +enum { + /* set vport qos opcode modifiers */ + MLX4_SET_VPORT_QOS_SET = 0x0, + MLX4_SET_VPORT_QOS_QUERY = 0x1 +}; + +struct mlx4_set_port_prio2tc_context { + u8 prio2tc[4]; +}; + +struct mlx4_port_scheduler_tc_cfg_be { + __be16 pg; + __be16 bw_precentage; + __be16 max_bw_units; /* 3-100Mbps, 4-1Gbps, other values - reserved */ + __be16 max_bw_value; +}; + +struct mlx4_set_port_scheduler_context { + struct mlx4_port_scheduler_tc_cfg_be tc[MLX4_NUM_TC]; +}; + +/* Granular Qos (per VF) section */ +struct mlx4_alloc_vpp_param { + __be32 available_vpp; + __be32 vpp_p_up[MLX4_NUM_UP]; +}; + +struct mlx4_prio_qos_param { + __be32 bw_share; + __be32 max_avg_bw; + __be32 reserved; + __be32 enable; + __be32 reserved1[4]; +}; + +struct mlx4_set_vport_context { + __be32 reserved[8]; + struct mlx4_prio_qos_param qos_p_up[MLX4_NUM_UP]; +}; + +int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_prio2tc_context *context; + int err; + u32 in_mod; + int i; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + context = mailbox->buf; + + for (i = 0; i < MLX4_NUM_UP; i += 2) + context->prio2tc[i >> 1] = prio2tc[i] << 4 | prio2tc[i + 1]; + + in_mod = MLX4_SET_PORT_PRIO2TC << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_PRIO2TC); + +int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw, + u8 *pg, u16 *ratelimit) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_scheduler_context *context; + int err; + u32 in_mod; + int i; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + context = mailbox->buf; + + for (i = 0; i < MLX4_NUM_TC; i++) { + 
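+ /* Pick the rate-limit encoding per TC: values up to MLX4_MAX_100M_UNITS_VAL are programmed in 100 Mbps units, larger values are divided by 10 and programmed in 1 Gbps units; a zero entry falls back to the default rate limit. */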
struct mlx4_port_scheduler_tc_cfg_be *tc = &context->tc[i]; + u16 r; + + if (ratelimit && ratelimit[i]) { + if (ratelimit[i] <= MLX4_MAX_100M_UNITS_VAL) { + r = ratelimit[i]; + tc->max_bw_units = + htons(MLX4_RATELIMIT_100M_UNITS); + } else { + r = ratelimit[i] / 10; + tc->max_bw_units = + htons(MLX4_RATELIMIT_1G_UNITS); + } + tc->max_bw_value = htons(r); + } else { + tc->max_bw_value = htons(MLX4_RATELIMIT_DEFAULT); + tc->max_bw_units = htons(MLX4_RATELIMIT_1G_UNITS); + } + + tc->pg = htons(pg[i]); + tc->bw_precentage = htons(tc_tx_bw[i]); + } + + in_mod = MLX4_SET_PORT_SCHEDULER << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER); + +int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port, + u16 *available_vpp, u8 *vpp_p_up) +{ + int i; + int err; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_alloc_vpp_param *out_param; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + out_param = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, port, + MLX4_ALLOCATE_VPP_QUERY, + MLX4_CMD_ALLOCATE_VPP, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + goto out; + + /* Total number of supported VPPs */ + *available_vpp = (u16)be32_to_cpu(out_param->available_vpp); + + for (i = 0; i < MLX4_NUM_UP; i++) + vpp_p_up[i] = (u8)be32_to_cpu(out_param->vpp_p_up[i]); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} +EXPORT_SYMBOL(mlx4_ALLOCATE_VPP_get); + +int mlx4_ALLOCATE_VPP_set(struct mlx4_dev *dev, u8 port, u8 *vpp_p_up) +{ + int i; + int err; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_alloc_vpp_param *in_param; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + in_param = mailbox->buf; + + for (i = 0; i < MLX4_NUM_UP; i++) + in_param->vpp_p_up[i] = cpu_to_be32(vpp_p_up[i]); + + err = mlx4_cmd(dev, mailbox->dma, port, + MLX4_ALLOCATE_VPP_ALLOCATE, + MLX4_CMD_ALLOCATE_VPP, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_ALLOCATE_VPP_set); + +int mlx4_SET_VPORT_QOS_get(struct mlx4_dev *dev, u8 port, u8 vport, + struct mlx4_vport_qos_param *out_param) +{ + int i; + int err; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_vport_context *ctx; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ctx = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, (vport << 8) | port, + MLX4_SET_VPORT_QOS_QUERY, + MLX4_CMD_SET_VPORT_QOS, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + goto out; + + for (i = 0; i < MLX4_NUM_UP; i++) { + out_param[i].bw_share = be32_to_cpu(ctx->qos_p_up[i].bw_share); + out_param[i].max_avg_bw = + be32_to_cpu(ctx->qos_p_up[i].max_avg_bw); + out_param[i].enable = + !!(be32_to_cpu(ctx->qos_p_up[i].enable) & 31); + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} +EXPORT_SYMBOL(mlx4_SET_VPORT_QOS_get); + +int mlx4_SET_VPORT_QOS_set(struct mlx4_dev *dev, u8 port, u8 vport, + struct mlx4_vport_qos_param *in_param) +{ + int i; + int err; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_vport_context *ctx; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ctx = mailbox->buf; + + for (i = 0; i < MLX4_NUM_UP; i++) { + ctx->qos_p_up[i].bw_share = cpu_to_be32(in_param[i].bw_share); + 
ctx->qos_p_up[i].max_avg_bw =
+ cpu_to_be32(in_param[i].max_avg_bw);
+ ctx->qos_p_up[i].enable =
+ cpu_to_be32(in_param[i].enable << 31);
+ }
+
+ err = mlx4_cmd(dev, mailbox->dma, (vport << 8) | port,
+ MLX4_SET_VPORT_QOS_SET,
+ MLX4_CMD_SET_VPORT_QOS,
+ MLX4_CMD_TIME_CLASS_A,
+ MLX4_CMD_NATIVE);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+EXPORT_SYMBOL(mlx4_SET_VPORT_QOS_set);
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.h b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
new file mode 100644
index 0000000..5829975
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies.
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_FW_QOS_H
+#define MLX4_FW_QOS_H
+
+#include
+#include
+
+#define MLX4_NUM_UP 8
+#define MLX4_NUM_TC 8
+
+/* Default supported priorities for VPP allocation */
+#define MLX4_DEFAULT_QOS_PRIO (0)
+
+/* Derived from FW feature definition, 0 is the default vport for all QPs */
+#define MLX4_VPP_DEFAULT_VPORT (0)
+
+struct mlx4_vport_qos_param {
+ u32 bw_share;
+ u32 max_avg_bw;
+ u8 enable;
+};
+
+/**
+ * mlx4_SET_PORT_PRIO2TC - This routine maps user priorities to traffic
+ * classes of a given port and device.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @prio2tc: Array of the TC associated with each priority.
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc);
+
+/**
+ * mlx4_SET_PORT_SCHEDULER - This routine configures the arbitration between
+ * traffic classes (ETS) and configured rate limit for traffic classes.
+ * tc_tx_bw, pg and ratelimit are arrays where each index represents a TC.
+ * The description for those parameters below refers to a single TC.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @tc_tx_bw: The percentage of the bandwidth allocated for traffic class
+ * within a TC group. The sum of the bw_percentage of all the traffic
+ * classes within a TC group must equal 100% for correct operation.
+ * @pg: The TC group the traffic class is associated with.
+ * @ratelimit: The maximal bandwidth allowed for the use by this traffic class.
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
+ u8 *pg, u16 *ratelimit);
+/**
+ * mlx4_ALLOCATE_VPP_get - Query port VPP available resources and allocation.
+ * Before distribution of VPPs to priorities, only available_vpp is returned.
+ * After initialization it returns the distribution of VPPs among priorities.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @available_vpp: Pointer to variable where number of available VPPs is stored
+ * @vpp_p_up: Distribution of VPPs to priorities is stored in this array
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port,
+ u16 *available_vpp, u8 *vpp_p_up);
+/**
+ * mlx4_ALLOCATE_VPP_set - Distribution of VPPs among different priorities.
+ * The total number of VPPs assigned to all priorities for a port must not
+ * exceed the value reported by available_vpp in mlx4_ALLOCATE_VPP_get.
+ * VPP allocation is allowed only after the port type has been set,
+ * and while no QPs are open for this port.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @vpp_p_up: Allocation of VPPs to different priorities.
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_ALLOCATE_VPP_set(struct mlx4_dev *dev, u8 port, u8 *vpp_p_up);
+
+/**
+ * mlx4_SET_VPORT_QOS_get - Query QoS properties of a Vport.
+ * Each priority allowed for the Vport is assigned with a share of the BW,
+ * and a BW limitation. This command queries the current QoS values.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @vport: Vport id.
+ * @out_param: Array of mlx4_vport_qos_param that will contain the values.
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_SET_VPORT_QOS_get(struct mlx4_dev *dev, u8 port, u8 vport,
+ struct mlx4_vport_qos_param *out_param);
+
+/**
+ * mlx4_SET_VPORT_QOS_set - Set QoS properties of a Vport.
+ * QoS parameters can be modified at any time, but must be initialized
+ * before any QP is associated with the VPort.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @vport: Vport id.
+ * @in_param: Array of mlx4_vport_qos_param which holds the requested values.
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_SET_VPORT_QOS_set(struct mlx4_dev *dev, u8 port, u8 vport,
+ struct mlx4_vport_qos_param *in_param);
+
+#endif /* MLX4_FW_QOS_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
new file mode 100644
index 0000000..685337d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +#include "mlx4.h" +#include "icm.h" +#include "fw.h" + +/* + * We allocate in page size (default 4KB on many archs) chunks to avoid high + * order memory allocations in fragmented/high usage memory situation. + */ +enum { + MLX4_ICM_ALLOC_SIZE = PAGE_SIZE, + MLX4_TABLE_CHUNK_SIZE = PAGE_SIZE, +}; + +static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk) +{ + int i; + + if (chunk->nsg > 0) + pci_unmap_sg(dev->persist->pdev, chunk->mem, chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + for (i = 0; i < chunk->npages; ++i) + __free_pages(sg_page(&chunk->mem[i]), + get_order(chunk->mem[i].length)); +} + +static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk) +{ + int i; + + for (i = 0; i < chunk->npages; ++i) + dma_free_coherent(&dev->persist->pdev->dev, + chunk->mem[i].length, + lowmem_page_address(sg_page(&chunk->mem[i])), + sg_dma_address(&chunk->mem[i])); +} + +void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent) +{ + struct mlx4_icm_chunk *chunk, *tmp; + + if (!icm) + return; + + list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { + if (coherent) + mlx4_free_icm_coherent(dev, chunk); + else + mlx4_free_icm_pages(dev, chunk); + + kfree(chunk); + } + + kfree(icm); +} + +static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, + gfp_t gfp_mask, int node) +{ + struct page *page; + + page = alloc_pages_node(node, gfp_mask, order); + if (!page) { + page = alloc_pages(gfp_mask, order); + if (!page) + return -ENOMEM; + } + + sg_set_page(mem, page, PAGE_SIZE << order, 0); + return 0; +} + +static int mlx4_alloc_icm_coherent(struct device *dev, struct scatterlist *mem, + int order, gfp_t gfp_mask) +{ + void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, + &sg_dma_address(mem), gfp_mask); + if (!buf) + return -ENOMEM; + + if (offset_in_page(buf)) { + dma_free_coherent(dev, PAGE_SIZE << order, + buf, sg_dma_address(mem)); + return -ENOMEM; + } + + sg_set_buf(mem, buf, PAGE_SIZE << order); + sg_dma_len(mem) = PAGE_SIZE << order; + return 0; +} + +struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, + gfp_t gfp_mask, int coherent) +{ + struct mlx4_icm *icm; + struct mlx4_icm_chunk *chunk = NULL; + int 
cur_order; + int ret; + + /* We use sg_set_buf for coherent allocs, which assumes low memory */ + BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); + + icm = kmalloc_node(sizeof(*icm), + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN), + dev->numa_node); + if (!icm) { + icm = kmalloc(sizeof(*icm), + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!icm) + return NULL; + } + + icm->refcount = 0; + INIT_LIST_HEAD(&icm->chunk_list); + + cur_order = get_order(MLX4_ICM_ALLOC_SIZE); + + while (npages > 0) { + if (!chunk) { + chunk = kmalloc_node(sizeof(*chunk), + gfp_mask & ~(__GFP_HIGHMEM | + __GFP_NOWARN), + dev->numa_node); + if (!chunk) { + chunk = kmalloc(sizeof(*chunk), + gfp_mask & ~(__GFP_HIGHMEM | + __GFP_NOWARN)); + if (!chunk) + goto fail; + } + + sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN); + chunk->npages = 0; + chunk->nsg = 0; + list_add_tail(&chunk->list, &icm->chunk_list); + } + + while (1 << cur_order > npages) + --cur_order; + + if (coherent) + ret = mlx4_alloc_icm_coherent(&dev->persist->pdev->dev, + &chunk->mem[chunk->npages], + cur_order, gfp_mask); + else + ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages], + cur_order, gfp_mask, + dev->numa_node); + + if (ret) { + if (--cur_order < 0) + goto fail; + else + continue; + } + + ++chunk->npages; + + if (coherent) + ++chunk->nsg; + else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { + chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + if (chunk->nsg <= 0) + goto fail; + } + + if (chunk->npages == MLX4_ICM_CHUNK_LEN) + chunk = NULL; + + npages -= 1 << cur_order; + } + + if (!coherent && chunk) { + chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + if (chunk->nsg <= 0) + goto fail; + } + + return icm; + +fail: + mlx4_free_icm(dev, icm, coherent); + return NULL; +} + +static int mlx4_MAP_ICM(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt) +{ + return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt); +} + +static int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count) +{ + return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + +int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm) +{ + return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM_AUX, icm, -1); +} + +int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) +{ + u32 i = (obj & (table->num_obj - 1)) / + (MLX4_TABLE_CHUNK_SIZE / table->obj_size); + int ret = 0; + + mutex_lock(&table->mutex); + + if (table->icm[i]) { + ++table->icm[i]->refcount; + goto out; + } + + table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT, + (table->lowmem ? 
GFP_KERNEL : GFP_HIGHUSER) | + __GFP_NOWARN, table->coherent); + if (!table->icm[i]) { + ret = -ENOMEM; + goto out; + } + + if (mlx4_MAP_ICM(dev, table->icm[i], table->virt + + (u64) i * MLX4_TABLE_CHUNK_SIZE)) { + mlx4_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + ret = -ENOMEM; + goto out; + } + + ++table->icm[i]->refcount; + +out: + mutex_unlock(&table->mutex); + return ret; +} + +void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) +{ + u32 i; + u64 offset; + + i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); + + mutex_lock(&table->mutex); + + if (--table->icm[i]->refcount == 0) { + offset = (u64) i * MLX4_TABLE_CHUNK_SIZE; + mlx4_UNMAP_ICM(dev, table->virt + offset, + MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], table->coherent); + table->icm[i] = NULL; + } + + mutex_unlock(&table->mutex); +} + +void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, + dma_addr_t *dma_handle) +{ + int offset, dma_offset, i; + u64 idx; + struct mlx4_icm_chunk *chunk; + struct mlx4_icm *icm; + struct page *page = NULL; + + if (!table->lowmem) + return NULL; + + mutex_lock(&table->mutex); + + idx = (u64) (obj & (table->num_obj - 1)) * table->obj_size; + icm = table->icm[idx / MLX4_TABLE_CHUNK_SIZE]; + dma_offset = offset = idx % MLX4_TABLE_CHUNK_SIZE; + + if (!icm) + goto out; + + list_for_each_entry(chunk, &icm->chunk_list, list) { + for (i = 0; i < chunk->npages; ++i) { + if (dma_handle && dma_offset >= 0) { + if (sg_dma_len(&chunk->mem[i]) > dma_offset) + *dma_handle = sg_dma_address(&chunk->mem[i]) + + dma_offset; + dma_offset -= sg_dma_len(&chunk->mem[i]); + } + /* + * DMA mapping can merge pages but not split them, + * so if we found the page, dma_handle has already + * been assigned to. + */ + if (chunk->mem[i].length > offset) { + page = sg_page(&chunk->mem[i]); + goto out; + } + offset -= chunk->mem[i].length; + } + } + +out: + mutex_unlock(&table->mutex); + return page ? 
lowmem_page_address(page) + offset : NULL; +} + +int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u32 start, u32 end) +{ + int inc = MLX4_TABLE_CHUNK_SIZE / table->obj_size; + int err; + u32 i; + + for (i = start; i <= end; i += inc) { + err = mlx4_table_get(dev, table, i); + if (err) + goto fail; + } + + return 0; + +fail: + while (i > start) { + i -= inc; + mlx4_table_put(dev, table, i); + } + + return err; +} + +void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u32 start, u32 end) +{ + u32 i; + + for (i = start; i <= end; i += MLX4_TABLE_CHUNK_SIZE / table->obj_size) + mlx4_table_put(dev, table, i); +} + +int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u64 virt, int obj_size, u32 nobj, int reserved, + int use_lowmem, int use_coherent) +{ + int obj_per_chunk; + int num_icm; + unsigned chunk_size; + int i; + u64 size; + + obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size; + if (WARN_ON(!obj_per_chunk)) + return -EINVAL; + num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk; + + table->icm = kvzalloc(num_icm * sizeof(*table->icm), GFP_KERNEL); + if (!table->icm) + return -ENOMEM; + table->virt = virt; + table->num_icm = num_icm; + table->num_obj = nobj; + table->obj_size = obj_size; + table->lowmem = use_lowmem; + table->coherent = use_coherent; + mutex_init(&table->mutex); + + size = (u64) nobj * obj_size; + for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) { + chunk_size = MLX4_TABLE_CHUNK_SIZE; + if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > size) + chunk_size = PAGE_ALIGN(size - + i * MLX4_TABLE_CHUNK_SIZE); + + table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT, + (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) | + __GFP_NOWARN, use_coherent); + if (!table->icm[i]) + goto err; + if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) { + mlx4_free_icm(dev, table->icm[i], use_coherent); + table->icm[i] = NULL; + goto err; + } + + /* + * Add a reference to this ICM chunk so that it never + * gets freed (since it contains reserved firmware objects). + */ + ++table->icm[i]->refcount; + } + + return 0; + +err: + for (i = 0; i < num_icm; ++i) + if (table->icm[i]) { + mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE, + MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], use_coherent); + } + + kvfree(table->icm); + + return -ENOMEM; +} + +void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table) +{ + int i; + + for (i = 0; i < table->num_icm; ++i) + if (table->icm[i]) { + mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE, + MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], table->coherent); + } + + kvfree(table->icm); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h new file mode 100644 index 0000000..c9169a4 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/icm.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_ICM_H +#define MLX4_ICM_H + +#include +#include +#include + +#define MLX4_ICM_CHUNK_LEN \ + ((256 - sizeof(struct list_head) - 2 * sizeof(int)) / \ + (sizeof(struct scatterlist))) + +enum { + MLX4_ICM_PAGE_SHIFT = 12, + MLX4_ICM_PAGE_SIZE = 1 << MLX4_ICM_PAGE_SHIFT, +}; + +struct mlx4_icm_chunk { + struct list_head list; + int npages; + int nsg; + struct scatterlist mem[MLX4_ICM_CHUNK_LEN]; +}; + +struct mlx4_icm { + struct list_head chunk_list; + int refcount; +}; + +struct mlx4_icm_iter { + struct mlx4_icm *icm; + struct mlx4_icm_chunk *chunk; + int page_idx; +}; + +struct mlx4_dev; + +struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, + gfp_t gfp_mask, int coherent); +void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); + +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); +void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); +int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u32 start, u32 end); +void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u32 start, u32 end); +int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, + u64 virt, int obj_size, u32 nobj, int reserved, + int use_lowmem, int use_coherent); +void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table); +void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, dma_addr_t *dma_handle); + +static inline void mlx4_icm_first(struct mlx4_icm *icm, + struct mlx4_icm_iter *iter) +{ + iter->icm = icm; + iter->chunk = list_empty(&icm->chunk_list) ? 
+ NULL : list_entry(icm->chunk_list.next, + struct mlx4_icm_chunk, list); + iter->page_idx = 0; +} + +static inline int mlx4_icm_last(struct mlx4_icm_iter *iter) +{ + return !iter->chunk; +} + +static inline void mlx4_icm_next(struct mlx4_icm_iter *iter) +{ + if (++iter->page_idx >= iter->chunk->nsg) { + if (iter->chunk->list.next == &iter->icm->chunk_list) { + iter->chunk = NULL; + return; + } + + iter->chunk = list_entry(iter->chunk->list.next, + struct mlx4_icm_chunk, list); + iter->page_idx = 0; + } +} + +static inline dma_addr_t mlx4_icm_addr(struct mlx4_icm_iter *iter) +{ + return sg_dma_address(&iter->chunk->mem[iter->page_idx]); +} + +static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter) +{ + return sg_dma_len(&iter->chunk->mem[iter->page_idx]); +} + +int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); +int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); + +#endif /* MLX4_ICM_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c new file mode 100644 index 0000000..65482f0 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include + +#include "mlx4.h" + +struct mlx4_device_context { + struct list_head list; + struct list_head bond_list; + struct mlx4_interface *intf; + void *context; +}; + +static LIST_HEAD(intf_list); +static LIST_HEAD(dev_list); +static DEFINE_MUTEX(intf_mutex); + +static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv) +{ + struct mlx4_device_context *dev_ctx; + + dev_ctx = kmalloc(sizeof(*dev_ctx), GFP_KERNEL); + if (!dev_ctx) + return; + + dev_ctx->intf = intf; + dev_ctx->context = intf->add(&priv->dev); + + if (dev_ctx->context) { + spin_lock_irq(&priv->ctx_lock); + list_add_tail(&dev_ctx->list, &priv->ctx_list); + spin_unlock_irq(&priv->ctx_lock); + if (intf->activate) + intf->activate(&priv->dev, dev_ctx->context); + } else + kfree(dev_ctx); + +} + +static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv) +{ + struct mlx4_device_context *dev_ctx; + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf == intf) { + spin_lock_irq(&priv->ctx_lock); + list_del(&dev_ctx->list); + spin_unlock_irq(&priv->ctx_lock); + + intf->remove(&priv->dev, dev_ctx->context); + kfree(dev_ctx); + return; + } +} + +int mlx4_register_interface(struct mlx4_interface *intf) +{ + struct mlx4_priv *priv; + + if (!intf->add || !intf->remove) + return -EINVAL; + + mutex_lock(&intf_mutex); + + list_add_tail(&intf->list, &intf_list); + list_for_each_entry(priv, &dev_list, dev_list) { + if (mlx4_is_mfunc(&priv->dev) && (intf->flags & MLX4_INTFF_BONDING)) { + mlx4_dbg(&priv->dev, + "SRIOV, disabling HA mode for intf proto %d\n", intf->protocol); + intf->flags &= ~MLX4_INTFF_BONDING; + } + mlx4_add_device(intf, priv); + } + + mutex_unlock(&intf_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_register_interface); + +void mlx4_unregister_interface(struct mlx4_interface *intf) +{ + struct mlx4_priv *priv; + + mutex_lock(&intf_mutex); + + list_for_each_entry(priv, &dev_list, dev_list) + mlx4_remove_device(intf, priv); + + list_del(&intf->list); + + mutex_unlock(&intf_mutex); +} +EXPORT_SYMBOL_GPL(mlx4_unregister_interface); + +int mlx4_do_bond(struct mlx4_dev *dev, bool enable) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_device_context *dev_ctx = NULL, *temp_dev_ctx; + unsigned long flags; + int ret; + LIST_HEAD(bond_list); + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)) + return -EOPNOTSUPP; + + ret = mlx4_disable_rx_port_check(dev, enable); + if (ret) { + mlx4_err(dev, "Fail to %s rx port check\n", + enable ? "enable" : "disable"); + return ret; + } + if (enable) { + dev->flags |= MLX4_FLAG_BONDED; + } else { + ret = mlx4_virt2phy_port_map(dev, 1, 2); + if (ret) { + mlx4_err(dev, "Fail to reset port map\n"); + return ret; + } + dev->flags &= ~MLX4_FLAG_BONDED; + } + + spin_lock_irqsave(&priv->ctx_lock, flags); + list_for_each_entry_safe(dev_ctx, temp_dev_ctx, &priv->ctx_list, list) { + if (dev_ctx->intf->flags & MLX4_INTFF_BONDING) { + list_add_tail(&dev_ctx->bond_list, &bond_list); + list_del(&dev_ctx->list); + } + } + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &bond_list, bond_list) { + dev_ctx->intf->remove(dev, dev_ctx->context); + dev_ctx->context = dev_ctx->intf->add(dev); + + spin_lock_irqsave(&priv->ctx_lock, flags); + list_add_tail(&dev_ctx->list, &priv->ctx_list); + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + mlx4_dbg(dev, "Interface for protocol %d restarted with bonded mode %s\n", + dev_ctx->intf->protocol, enable ? 
+ "enabled" : "disabled"); + } + return 0; +} + +void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, + unsigned long param) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_device_context *dev_ctx; + unsigned long flags; + + spin_lock_irqsave(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf->event) + dev_ctx->intf->event(dev, dev_ctx->context, type, param); + + spin_unlock_irqrestore(&priv->ctx_lock, flags); +} + +int mlx4_register_device(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_interface *intf; + + mutex_lock(&intf_mutex); + + dev->persist->interface_state |= MLX4_INTERFACE_STATE_UP; + list_add_tail(&priv->dev_list, &dev_list); + list_for_each_entry(intf, &intf_list, list) + mlx4_add_device(intf, priv); + + mutex_unlock(&intf_mutex); + mlx4_start_catas_poll(dev); + + return 0; +} + +void mlx4_unregister_device(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_interface *intf; + + if (!(dev->persist->interface_state & MLX4_INTERFACE_STATE_UP)) + return; + + mlx4_stop_catas_poll(dev); + if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION && + mlx4_is_slave(dev)) { + /* In mlx4_remove_one on a VF */ + u32 slave_read = + swab32(readl(&mlx4_priv(dev)->mfunc.comm->slave_read)); + + if (mlx4_comm_internal_err(slave_read)) { + mlx4_dbg(dev, "%s: comm channel is down, entering error state.\n", + __func__); + mlx4_enter_error_state(dev->persist); + } + } + mutex_lock(&intf_mutex); + + list_for_each_entry(intf, &intf_list, list) + mlx4_remove_device(intf, priv); + + list_del(&priv->dev_list); + dev->persist->interface_state &= ~MLX4_INTERFACE_STATE_UP; + + mutex_unlock(&intf_mutex); +} + +void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_device_context *dev_ctx; + unsigned long flags; + void *result = NULL; + + spin_lock_irqsave(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_dev) { + result = dev_ctx->intf->get_dev(dev, dev_ctx->context, port); + break; + } + + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + return result; +} +EXPORT_SYMBOL_GPL(mlx4_get_protocol_dev); + +struct devlink_port *mlx4_get_devlink_port(struct mlx4_dev *dev, int port) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + + return &info->devlink_port; +} +EXPORT_SYMBOL_GPL(mlx4_get_devlink_port); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c new file mode 100644 index 0000000..60172a3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -0,0 +1,4204 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "mlx4.h" +#include "fw.h" +#include "icm.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +struct workqueue_struct *mlx4_wq; + +#ifdef CONFIG_MLX4_DEBUG + +int mlx4_debug_level = 0; +module_param_named(debug_level, mlx4_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); + +#endif /* CONFIG_MLX4_DEBUG */ + +#ifdef CONFIG_PCI_MSI + +static int msi_x = 1; +module_param(msi_x, int, 0444); +MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); + +#else /* CONFIG_PCI_MSI */ + +#define msi_x (0) + +#endif /* CONFIG_PCI_MSI */ + +static uint8_t num_vfs[3] = {0, 0, 0}; +static int num_vfs_argc; +module_param_array(num_vfs, byte , &num_vfs_argc, 0444); +MODULE_PARM_DESC(num_vfs, "enable #num_vfs functions if num_vfs > 0\n" + "num_vfs=port1,port2,port1+2"); + +static uint8_t probe_vf[3] = {0, 0, 0}; +static int probe_vfs_argc; +module_param_array(probe_vf, byte, &probe_vfs_argc, 0444); +MODULE_PARM_DESC(probe_vf, "number of vfs to probe by pf driver (num_vfs > 0)\n" + "probe_vf=port1,port2,port1+2"); + +static int mlx4_log_num_mgm_entry_size = MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; +module_param_named(log_num_mgm_entry_size, + mlx4_log_num_mgm_entry_size, int, 0444); +MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num" + " of qp per mcg, for example:" + " 10 gives 248.range: 7 <=" + " log_num_mgm_entry_size <= 12." + " To activate device managed" + " flow steering when available, set to -1"); + +static bool enable_64b_cqe_eqe = true; +module_param(enable_64b_cqe_eqe, bool, 0444); +MODULE_PARM_DESC(enable_64b_cqe_eqe, + "Enable 64 byte CQEs/EQEs when the FW supports this (default: True)"); + +static bool enable_4k_uar; +module_param(enable_4k_uar, bool, 0444); +MODULE_PARM_DESC(enable_4k_uar, + "Enable using 4K UAR. 
Should not be enabled if have VFs which do not support 4K UARs (default: false)"); + +#define PF_CONTEXT_BEHAVIOUR_MASK (MLX4_FUNC_CAP_64B_EQE_CQE | \ + MLX4_FUNC_CAP_EQE_CQE_STRIDE | \ + MLX4_FUNC_CAP_DMFS_A0_STATIC) + +#define RESET_PERSIST_MASK_FLAGS (MLX4_FLAG_SRIOV) + +static char mlx4_version[] = + DRV_NAME ": Mellanox ConnectX core driver v" + DRV_VERSION "\n"; + +static const struct mlx4_profile default_profile = { + .num_qp = 1 << 18, + .num_srq = 1 << 16, + .rdmarc_per_qp = 1 << 4, + .num_cq = 1 << 16, + .num_mcg = 1 << 13, + .num_mpt = 1 << 19, + .num_mtt = 1 << 20, /* It is really num mtt segements */ +}; + +static const struct mlx4_profile low_mem_profile = { + .num_qp = 1 << 17, + .num_srq = 1 << 6, + .rdmarc_per_qp = 1 << 4, + .num_cq = 1 << 8, + .num_mcg = 1 << 8, + .num_mpt = 1 << 9, + .num_mtt = 1 << 7, +}; + +static int log_num_mac = 7; +module_param_named(log_num_mac, log_num_mac, int, 0444); +MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)"); + +static int log_num_vlan; +module_param_named(log_num_vlan, log_num_vlan, int, 0444); +MODULE_PARM_DESC(log_num_vlan, "Log2 max number of VLANs per ETH port (0-7)"); +/* Log2 max number of VLANs per ETH port (0-7) */ +#define MLX4_LOG_NUM_VLANS 7 +#define MLX4_MIN_LOG_NUM_VLANS 0 +#define MLX4_MIN_LOG_NUM_MAC 1 + +static bool use_prio; +module_param_named(use_prio, use_prio, bool, 0444); +MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports (deprecated)"); + +int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); +module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); +MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)"); + +static int port_type_array[2] = {MLX4_PORT_TYPE_NONE, MLX4_PORT_TYPE_NONE}; +static int arr_argc = 2; +module_param_array(port_type_array, int, &arr_argc, 0444); +MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is default " + "1 for IB, 2 for Ethernet"); + +struct mlx4_port_config { + struct list_head list; + enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; + struct pci_dev *pdev; +}; + +static atomic_t pf_loading = ATOMIC_INIT(0); + +static inline void mlx4_set_num_reserved_uars(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap) +{ + /* The reserved_uars is calculated by system page size unit. 
+ * Therefore, adjustment is added when the uar page size is less + * than the system page size + */ + dev->caps.reserved_uars = + max_t(int, + mlx4_get_num_reserved_uar(dev), + dev_cap->reserved_uars / + (1 << (PAGE_SHIFT - dev->uar_page_shift))); +} + +int mlx4_check_port_params(struct mlx4_dev *dev, + enum mlx4_port_type *port_type) +{ + int i; + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { + for (i = 0; i < dev->caps.num_ports - 1; i++) { + if (port_type[i] != port_type[i + 1]) { + mlx4_err(dev, "Only same port types supported on this HCA, aborting\n"); + return -EINVAL; + } + } + } + + for (i = 0; i < dev->caps.num_ports; i++) { + if (!(port_type[i] & dev->caps.supported_type[i+1])) { + mlx4_err(dev, "Requested port type for port %d is not supported on this HCA\n", + i + 1); + return -EINVAL; + } + } + return 0; +} + +static void mlx4_set_port_mask(struct mlx4_dev *dev) +{ + int i; + + for (i = 1; i <= dev->caps.num_ports; ++i) + dev->caps.port_mask[i] = dev->caps.port_type[i]; +} + +enum { + MLX4_QUERY_FUNC_NUM_SYS_EQS = 1 << 0, +}; + +static int mlx4_query_func(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) +{ + int err = 0; + struct mlx4_func func; + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) { + err = mlx4_QUERY_FUNC(dev, &func, 0); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + return err; + } + dev_cap->max_eqs = func.max_eq; + dev_cap->reserved_eqs = func.rsvd_eqs; + dev_cap->reserved_uars = func.rsvd_uars; + err |= MLX4_QUERY_FUNC_NUM_SYS_EQS; + } + return err; +} + +static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev) +{ + struct mlx4_caps *dev_cap = &dev->caps; + + /* FW not supporting or cancelled by user */ + if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) || + !(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) + return; + + /* Must have 64B CQE_EQE enabled by FW to use bigger stride + * When FW has NCSI it may decide not to report 64B CQE/EQEs + */ + if (!(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_EQE) || + !(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_CQE)) { + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + return; + } + + if (cache_line_size() == 128 || cache_line_size() == 256) { + mlx4_dbg(dev, "Enabling CQE stride cacheLine supported\n"); + /* Changing the real data inside CQE size to 32B */ + dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; + dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; + + if (mlx4_is_master(dev)) + dev_cap->function_caps |= MLX4_FUNC_CAP_EQE_CQE_STRIDE; + } else { + if (cache_line_size() != 32 && cache_line_size() != 64) + mlx4_dbg(dev, "Disabling CQE stride, cacheLine size unsupported\n"); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + } +} + +static int _mlx4_dev_port(struct mlx4_dev *dev, int port, + struct mlx4_port_cap *port_cap) +{ + dev->caps.vl_cap[port] = port_cap->max_vl; + dev->caps.ib_mtu_cap[port] = port_cap->ib_mtu; + dev->phys_caps.gid_phys_table_len[port] = port_cap->max_gids; + dev->phys_caps.pkey_phys_table_len[port] = port_cap->max_pkeys; + /* set gid and pkey table operating lengths by default + * to non-sriov values + */ + dev->caps.gid_table_len[port] = port_cap->max_gids; + dev->caps.pkey_table_len[port] = port_cap->max_pkeys; + dev->caps.port_width_cap[port] = port_cap->max_port_width; + dev->caps.eth_mtu_cap[port] = port_cap->eth_mtu; + dev->caps.max_tc_eth = port_cap->max_tc_eth; + dev->caps.def_mac[port] = port_cap->def_mac; + 
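+ /* Cache the port type hints reported by QUERY_PORT; they are used
+ * later when choosing between IB and Ethernet and when deciding
+ * whether automatic link sensing is allowed.
+ */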
dev->caps.supported_type[port] = port_cap->supported_port_types; + dev->caps.suggested_type[port] = port_cap->suggested_type; + dev->caps.default_sense[port] = port_cap->default_sense; + dev->caps.trans_type[port] = port_cap->trans_type; + dev->caps.vendor_oui[port] = port_cap->vendor_oui; + dev->caps.wavelength[port] = port_cap->wavelength; + dev->caps.trans_code[port] = port_cap->trans_code; + + return 0; +} + +static int mlx4_dev_port(struct mlx4_dev *dev, int port, + struct mlx4_port_cap *port_cap) +{ + int err = 0; + + err = mlx4_QUERY_PORT(dev, port, port_cap); + + if (err) + mlx4_err(dev, "QUERY_PORT command failed.\n"); + + return err; +} + +static inline void mlx4_enable_ignore_fcs(struct mlx4_dev *dev) +{ + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_IGNORE_FCS)) + return; + + if (mlx4_is_mfunc(dev)) { + mlx4_dbg(dev, "SRIOV mode - Disabling Ignore FCS"); + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_IGNORE_FCS; + return; + } + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP)) { + mlx4_dbg(dev, + "Keep FCS is not supported - Disabling Ignore FCS"); + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_IGNORE_FCS; + return; + } +} + +#define MLX4_A0_STEERING_TABLE_SIZE 256 +static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) +{ + int err; + int i; + + err = mlx4_QUERY_DEV_CAP(dev, dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n"); + return err; + } + mlx4_dev_cap_dump(dev, dev_cap); + + if (dev_cap->min_page_sz > PAGE_SIZE) { + mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n", + dev_cap->min_page_sz, PAGE_SIZE); + return -ENODEV; + } + if (dev_cap->num_ports > MLX4_MAX_PORTS) { + mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n", + dev_cap->num_ports, MLX4_MAX_PORTS); + return -ENODEV; + } + + if (dev_cap->uar_size > pci_resource_len(dev->persist->pdev, 2)) { + mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n", + dev_cap->uar_size, + (unsigned long long) + pci_resource_len(dev->persist->pdev, 2)); + return -ENODEV; + } + + dev->caps.num_ports = dev_cap->num_ports; + dev->caps.num_sys_eqs = dev_cap->num_sys_eqs; + dev->phys_caps.num_phys_eqs = dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS ? + dev->caps.num_sys_eqs : + MLX4_MAX_EQ_NUM; + for (i = 1; i <= dev->caps.num_ports; ++i) { + err = _mlx4_dev_port(dev, i, dev_cap->port_cap + i); + if (err) { + mlx4_err(dev, "QUERY_PORT command failed, aborting\n"); + return err; + } + } + + dev->caps.uar_page_size = PAGE_SIZE; + dev->caps.num_uars = dev_cap->uar_size / PAGE_SIZE; + dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay; + dev->caps.bf_reg_size = dev_cap->bf_reg_size; + dev->caps.bf_regs_per_page = dev_cap->bf_regs_per_page; + dev->caps.max_sq_sg = dev_cap->max_sq_sg; + dev->caps.max_rq_sg = dev_cap->max_rq_sg; + dev->caps.max_wqes = dev_cap->max_qp_sz; + dev->caps.max_qp_init_rdma = dev_cap->max_requester_per_qp; + dev->caps.max_srq_wqes = dev_cap->max_srq_sz; + dev->caps.max_srq_sge = dev_cap->max_rq_sg - 1; + dev->caps.reserved_srqs = dev_cap->reserved_srqs; + dev->caps.max_sq_desc_sz = dev_cap->max_sq_desc_sz; + dev->caps.max_rq_desc_sz = dev_cap->max_rq_desc_sz; + /* + * Subtract 1 from the limit because we need to allocate a + * spare CQE so the HCA HW can tell the difference between an + * empty CQ and a full CQ. 
+ */ + dev->caps.max_cqes = dev_cap->max_cq_sz - 1; + dev->caps.reserved_cqs = dev_cap->reserved_cqs; + dev->caps.reserved_eqs = dev_cap->reserved_eqs; + dev->caps.reserved_mtts = dev_cap->reserved_mtts; + dev->caps.reserved_mrws = dev_cap->reserved_mrws; + + dev->caps.reserved_pds = dev_cap->reserved_pds; + dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? + dev_cap->reserved_xrcds : 0; + dev->caps.max_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? + dev_cap->max_xrcds : 0; + dev->caps.mtt_entry_sz = dev_cap->mtt_entry_sz; + + dev->caps.max_msg_sz = dev_cap->max_msg_sz; + dev->caps.page_size_cap = ~(u32) (dev_cap->min_page_sz - 1); + dev->caps.flags = dev_cap->flags; + dev->caps.flags2 = dev_cap->flags2; + dev->caps.bmme_flags = dev_cap->bmme_flags; + dev->caps.reserved_lkey = dev_cap->reserved_lkey; + dev->caps.stat_rate_support = dev_cap->stat_rate_support; + dev->caps.max_gso_sz = dev_cap->max_gso_sz; + dev->caps.max_rss_tbl_sz = dev_cap->max_rss_tbl_sz; + dev->caps.wol_port[1] = dev_cap->wol_port[1]; + dev->caps.wol_port[2] = dev_cap->wol_port[2]; + + /* Save uar page shift */ + if (!mlx4_is_slave(dev)) { + /* Virtual PCI function needs to determine UAR page size from + * firmware. Only master PCI function can set the uar page size + */ + if (enable_4k_uar || !dev->persist->num_vfs) + dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT; + else + dev->uar_page_shift = PAGE_SHIFT; + + mlx4_set_num_reserved_uars(dev, dev_cap); + } + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN) { + struct mlx4_init_hca_param hca_param; + + memset(&hca_param, 0, sizeof(hca_param)); + err = mlx4_QUERY_HCA(dev, &hca_param); + /* Turn off PHV_EN flag in case phv_check_en is set. + * phv_check_en is a HW check that parse the packet and verify + * phv bit was reported correctly in the wqe. To allow QinQ + * PHV_EN flag should be set and phv_check_en must be cleared + * otherwise QinQ packets will be drop by the HW. + */ + if (err || hca_param.phv_check_en) + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_PHV_EN; + } + + /* Sense port always allowed on supported devices for ConnectX-1 and -2 */ + if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT) + dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; + /* Don't do sense port on multifunction devices (for now at least) */ + if (mlx4_is_mfunc(dev)) + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; + + if (mlx4_low_memory_profile()) { + dev->caps.log_num_macs = MLX4_MIN_LOG_NUM_MAC; + dev->caps.log_num_vlans = MLX4_MIN_LOG_NUM_VLANS; + } else { + dev->caps.log_num_macs = log_num_mac; + dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS; + } + + for (i = 1; i <= dev->caps.num_ports; ++i) { + dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE; + if (dev->caps.supported_type[i]) { + /* if only ETH is supported - assign ETH */ + if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_ETH) + dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; + /* if only IB is supported, assign IB */ + else if (dev->caps.supported_type[i] == + MLX4_PORT_TYPE_IB) + dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; + else { + /* if IB and ETH are supported, we set the port + * type according to user selection of port type; + * if user selected none, take the FW hint */ + if (port_type_array[i - 1] == MLX4_PORT_TYPE_NONE) + dev->caps.port_type[i] = dev->caps.suggested_type[i] ? + MLX4_PORT_TYPE_ETH : MLX4_PORT_TYPE_IB; + else + dev->caps.port_type[i] = port_type_array[i - 1]; + } + } + /* + * Link sensing is allowed on the port if 3 conditions are true: + * 1. 
Both protocols are supported on the port. + * 2. Different types are supported on the port + * 3. FW declared that it supports link sensing + */ + mlx4_priv(dev)->sense.sense_allowed[i] = + ((dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO) && + (dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && + (dev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)); + + /* + * If "default_sense" bit is set, we move the port to "AUTO" mode + * and perform sense_port FW command to try and set the correct + * port type from beginning + */ + if (mlx4_priv(dev)->sense.sense_allowed[i] && dev->caps.default_sense[i]) { + enum mlx4_port_type sensed_port = MLX4_PORT_TYPE_NONE; + dev->caps.possible_type[i] = MLX4_PORT_TYPE_AUTO; + mlx4_SENSE_PORT(dev, i, &sensed_port); + if (sensed_port != MLX4_PORT_TYPE_NONE) + dev->caps.port_type[i] = sensed_port; + } else { + dev->caps.possible_type[i] = dev->caps.port_type[i]; + } + + if (dev->caps.log_num_macs > dev_cap->port_cap[i].log_max_macs) { + dev->caps.log_num_macs = dev_cap->port_cap[i].log_max_macs; + mlx4_warn(dev, "Requested number of MACs is too much for port %d, reducing to %d\n", + i, 1 << dev->caps.log_num_macs); + } + if (dev->caps.log_num_vlans > dev_cap->port_cap[i].log_max_vlans) { + dev->caps.log_num_vlans = dev_cap->port_cap[i].log_max_vlans; + mlx4_warn(dev, "Requested number of VLANs is too much for port %d, reducing to %d\n", + i, 1 << dev->caps.log_num_vlans); + } + } + + if (mlx4_is_master(dev) && (dev->caps.num_ports == 2) && + (port_type_array[0] == MLX4_PORT_TYPE_IB) && + (port_type_array[1] == MLX4_PORT_TYPE_ETH)) { + mlx4_warn(dev, + "Granular QoS per VF not supported with IB/Eth configuration\n"); + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_QOS_VPP; + } + + dev->caps.max_counters = dev_cap->max_counters; + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps; + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] = + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] = + (1 << dev->caps.log_num_macs) * + (1 << dev->caps.log_num_vlans) * + dev->caps.num_ports; + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH; + + if (dev_cap->dmfs_high_rate_qpn_base > 0 && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN) + dev->caps.dmfs_high_rate_qpn_base = dev_cap->dmfs_high_rate_qpn_base; + else + dev->caps.dmfs_high_rate_qpn_base = + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; + + if (dev_cap->dmfs_high_rate_qpn_range > 0 && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN) { + dev->caps.dmfs_high_rate_qpn_range = dev_cap->dmfs_high_rate_qpn_range; + dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DEFAULT; + dev->caps.flags2 |= MLX4_DEV_CAP_FLAG2_FS_A0; + } else { + dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_NOT_SUPPORTED; + dev->caps.dmfs_high_rate_qpn_base = + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; + dev->caps.dmfs_high_rate_qpn_range = MLX4_A0_STEERING_TABLE_SIZE; + } + + dev->caps.rl_caps = dev_cap->rl_caps; + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_RSS_RAW_ETH] = + dev->caps.dmfs_high_rate_qpn_range; + + dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] + + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH]; + + dev->caps.sqp_demux = (mlx4_is_master(dev)) ? 
MLX4_MAX_NUM_SLAVES : 0; + + if (!enable_64b_cqe_eqe && !mlx4_is_slave(dev)) { + if (dev_cap->flags & + (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) { + mlx4_warn(dev, "64B EQEs/CQEs supported by the device but not enabled\n"); + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE; + dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE; + } + + if (dev_cap->flags2 & + (MLX4_DEV_CAP_FLAG2_CQE_STRIDE | + MLX4_DEV_CAP_FLAG2_EQE_STRIDE)) { + mlx4_warn(dev, "Disabling EQE/CQE stride per user request\n"); + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE; + dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE; + } + } + + if ((dev->caps.flags & + (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) && + mlx4_is_master(dev)) + dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE; + + if (!mlx4_is_slave(dev)) { + mlx4_enable_cqe_eqe_stride(dev); + dev->caps.alloc_res_qp_mask = + (dev->caps.bf_reg_size ? MLX4_RESERVE_ETH_BF_QP : 0) | + MLX4_RESERVE_A0_QP; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETS_CFG) && + dev->caps.flags & MLX4_DEV_CAP_FLAG_SET_ETH_SCHED) { + mlx4_warn(dev, "Old device ETS support detected\n"); + mlx4_warn(dev, "Consider upgrading device FW.\n"); + dev->caps.flags2 |= MLX4_DEV_CAP_FLAG2_ETS_CFG; + } + + } else { + dev->caps.alloc_res_qp_mask = 0; + } + + mlx4_enable_ignore_fcs(dev); + + return 0; +} + +/*The function checks if there are live vf, return the num of them*/ +static int mlx4_how_many_lives_vf(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state; + int i; + int ret = 0; + + for (i = 1/*the ppf is 0*/; i < dev->num_slaves; ++i) { + s_state = &priv->mfunc.master.slave_state[i]; + if (s_state->active && s_state->last_cmd != + MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "%s: slave: %d is still active\n", + __func__, i); + ret++; + } + } + return ret; +} + +int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey) +{ + u32 qk = MLX4_RESERVED_QKEY_BASE; + + if (qpn >= dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX || + qpn < dev->phys_caps.base_proxy_sqpn) + return -EINVAL; + + if (qpn >= dev->phys_caps.base_tunnel_sqpn) + /* tunnel qp */ + qk += qpn - dev->phys_caps.base_tunnel_sqpn; + else + qk += qpn - dev->phys_caps.base_proxy_sqpn; + *qkey = qk; + return 0; +} +EXPORT_SYMBOL(mlx4_get_parav_qkey); + +void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val) +{ + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + if (!mlx4_is_master(dev)) + return; + + priv->virt2phys_pkey[slave][port - 1][i] = val; +} +EXPORT_SYMBOL(mlx4_sync_pkey_table); + +void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid) +{ + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + if (!mlx4_is_master(dev)) + return; + + priv->slave_node_guids[slave] = guid; +} +EXPORT_SYMBOL(mlx4_put_slave_node_guid); + +__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + if (!mlx4_is_master(dev)) + return 0; + + return priv->slave_node_guids[slave]; +} +EXPORT_SYMBOL(mlx4_get_slave_node_guid); + +int mlx4_is_slave_active(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave; + + if (!mlx4_is_master(dev)) + return 0; + + s_slave = &priv->mfunc.master.slave_state[slave]; + return !!s_slave->active; +} +EXPORT_SYMBOL(mlx4_is_slave_active); + +void mlx4_handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl 
*ctrl, + struct _rule_hw *eth_header) +{ + if (is_multicast_ether_addr(eth_header->eth.dst_mac) || + is_broadcast_ether_addr(eth_header->eth.dst_mac)) { + struct mlx4_net_trans_rule_hw_eth *eth = + (struct mlx4_net_trans_rule_hw_eth *)eth_header; + struct _rule_hw *next_rule = (struct _rule_hw *)(eth + 1); + bool last_rule = next_rule->size == 0 && next_rule->id == 0 && + next_rule->rsvd == 0; + + if (last_rule) + ctrl->prio = cpu_to_be16(MLX4_DOMAIN_NIC); + } +} +EXPORT_SYMBOL(mlx4_handle_eth_header_mcast_prio); + +static void slave_adjust_steering_mode(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap, + struct mlx4_init_hca_param *hca_param) +{ + dev->caps.steering_mode = hca_param->steering_mode; + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { + dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; + dev->caps.fs_log_max_ucast_qp_range_size = + dev_cap->fs_log_max_ucast_qp_range_size; + } else + dev->caps.num_qp_per_mgm = + 4 * ((1 << hca_param->log_mc_entry_sz)/16 - 2); + + mlx4_dbg(dev, "Steering mode is: %s\n", + mlx4_steering_mode_str(dev->caps.steering_mode)); +} + +static void mlx4_slave_destroy_special_qp_cap(struct mlx4_dev *dev) +{ + kfree(dev->caps.spec_qps); + dev->caps.spec_qps = NULL; +} + +static int mlx4_slave_special_qp_cap(struct mlx4_dev *dev) +{ + struct mlx4_func_cap *func_cap = NULL; + struct mlx4_caps *caps = &dev->caps; + int i, err = 0; + + func_cap = kzalloc(sizeof(*func_cap), GFP_KERNEL); + caps->spec_qps = kcalloc(caps->num_ports, sizeof(*caps->spec_qps), GFP_KERNEL); + + if (!func_cap || !caps->spec_qps) { + mlx4_err(dev, "Failed to allocate memory for special qps cap\n"); + err = -ENOMEM; + goto err_mem; + } + + for (i = 1; i <= caps->num_ports; ++i) { + err = mlx4_QUERY_FUNC_CAP(dev, i, func_cap); + if (err) { + mlx4_err(dev, "QUERY_FUNC_CAP port command failed for port %d, aborting (%d)\n", + i, err); + goto err_mem; + } + caps->spec_qps[i - 1] = func_cap->spec_qps; + caps->port_mask[i] = caps->port_type[i]; + caps->phys_port_id[i] = func_cap->phys_port_id; + err = mlx4_get_slave_pkey_gid_tbl_len(dev, i, + &caps->gid_table_len[i], + &caps->pkey_table_len[i]); + if (err) { + mlx4_err(dev, "QUERY_PORT command failed for port %d, aborting (%d)\n", + i, err); + goto err_mem; + } + } + +err_mem: + if (err) + mlx4_slave_destroy_special_qp_cap(dev); + kfree(func_cap); + return err; +} + +static int mlx4_slave_cap(struct mlx4_dev *dev) +{ + int err; + u32 page_size; + struct mlx4_dev_cap *dev_cap = NULL; + struct mlx4_func_cap *func_cap = NULL; + struct mlx4_init_hca_param *hca_param = NULL; + + hca_param = kzalloc(sizeof(*hca_param), GFP_KERNEL); + func_cap = kzalloc(sizeof(*func_cap), GFP_KERNEL); + dev_cap = kzalloc(sizeof(*dev_cap), GFP_KERNEL); + if (!hca_param || !func_cap || !dev_cap) { + mlx4_err(dev, "Failed to allocate memory for slave_cap\n"); + err = -ENOMEM; + goto free_mem; + } + + err = mlx4_QUERY_HCA(dev, hca_param); + if (err) { + mlx4_err(dev, "QUERY_HCA command failed, aborting\n"); + goto free_mem; + } + + /* fail if the hca has an unknown global capability + * at this time global_caps should be always zeroed + */ + if (hca_param->global_caps) { + mlx4_err(dev, "Unknown hca global capabilities\n"); + err = -EINVAL; + goto free_mem; + } + + dev->caps.hca_core_clock = hca_param->hca_core_clock; + + dev->caps.max_qp_dest_rdma = 1 << hca_param->log_rd_per_qp; + err = mlx4_dev_cap(dev, dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n"); + goto free_mem; + } + + err = 
mlx4_QUERY_FW(dev); + if (err) + mlx4_err(dev, "QUERY_FW command failed: could not get FW version\n"); + + page_size = ~dev->caps.page_size_cap + 1; + mlx4_warn(dev, "HCA minimum page size:%d\n", page_size); + if (page_size > PAGE_SIZE) { + mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n", + page_size, PAGE_SIZE); + err = -ENODEV; + goto free_mem; + } + + /* Set uar_page_shift for VF */ + dev->uar_page_shift = hca_param->uar_page_sz + 12; + + /* Make sure the master uar page size is valid */ + if (dev->uar_page_shift > PAGE_SHIFT) { + mlx4_err(dev, + "Invalid configuration: uar page size is larger than system page size\n"); + err = -ENODEV; + goto free_mem; + } + + /* Set reserved_uars based on the uar_page_shift */ + mlx4_set_num_reserved_uars(dev, dev_cap); + + /* Although uar page size in FW differs from system page size, + * upper software layers (mlx4_ib, mlx4_en and part of mlx4_core) + * still works with assumption that uar page size == system page size + */ + dev->caps.uar_page_size = PAGE_SIZE; + + err = mlx4_QUERY_FUNC_CAP(dev, 0, func_cap); + if (err) { + mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d)\n", + err); + goto free_mem; + } + + if ((func_cap->pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) != + PF_CONTEXT_BEHAVIOUR_MASK) { + mlx4_err(dev, "Unknown pf context behaviour %x known flags %x\n", + func_cap->pf_context_behaviour, + PF_CONTEXT_BEHAVIOUR_MASK); + err = -EINVAL; + goto free_mem; + } + + dev->caps.num_ports = func_cap->num_ports; + dev->quotas.qp = func_cap->qp_quota; + dev->quotas.srq = func_cap->srq_quota; + dev->quotas.cq = func_cap->cq_quota; + dev->quotas.mpt = func_cap->mpt_quota; + dev->quotas.mtt = func_cap->mtt_quota; + dev->caps.num_qps = 1 << hca_param->log_num_qps; + dev->caps.num_srqs = 1 << hca_param->log_num_srqs; + dev->caps.num_cqs = 1 << hca_param->log_num_cqs; + dev->caps.num_mpts = 1 << hca_param->log_mpt_sz; + dev->caps.num_eqs = func_cap->max_eq; + dev->caps.reserved_eqs = func_cap->reserved_eq; + dev->caps.reserved_lkey = func_cap->reserved_lkey; + dev->caps.num_pds = MLX4_NUM_PDS; + dev->caps.num_mgms = 0; + dev->caps.num_amgms = 0; + + if (dev->caps.num_ports > MLX4_MAX_PORTS) { + mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n", + dev->caps.num_ports, MLX4_MAX_PORTS); + err = -ENODEV; + goto free_mem; + } + + mlx4_replace_zero_macs(dev); + + err = mlx4_slave_special_qp_cap(dev); + if (err) { + mlx4_err(dev, "Set special QP caps failed. 
aborting\n"); + goto free_mem; + } + + if (dev->caps.uar_page_size * (dev->caps.num_uars - + dev->caps.reserved_uars) > + pci_resource_len(dev->persist->pdev, + 2)) { + mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n", + dev->caps.uar_page_size * dev->caps.num_uars, + (unsigned long long) + pci_resource_len(dev->persist->pdev, 2)); + err = -ENOMEM; + goto err_mem; + } + + if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) { + dev->caps.eqe_size = 64; + dev->caps.eqe_factor = 1; + } else { + dev->caps.eqe_size = 32; + dev->caps.eqe_factor = 0; + } + + if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { + dev->caps.cqe_size = 64; + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } else { + dev->caps.cqe_size = 32; + } + + if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_EQE_STRIDE_ENABLED) { + dev->caps.eqe_size = hca_param->eqe_size; + dev->caps.eqe_factor = 0; + } + + if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_CQE_STRIDE_ENABLED) { + dev->caps.cqe_size = hca_param->cqe_size; + /* User still need to know when CQE > 32B */ + dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; + } + + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; + mlx4_warn(dev, "Timestamping is not supported in slave mode\n"); + + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_USER_MAC_EN; + mlx4_dbg(dev, "User MAC FW update is not supported in slave mode\n"); + + slave_adjust_steering_mode(dev, dev_cap, hca_param); + mlx4_dbg(dev, "RSS support for IP fragments is %s\n", + hca_param->rss_ip_frags ? "on" : "off"); + + if (func_cap->extra_flags & MLX4_QUERY_FUNC_FLAGS_BF_RES_QP && + dev->caps.bf_reg_size) + dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_ETH_BF_QP; + + if (func_cap->extra_flags & MLX4_QUERY_FUNC_FLAGS_A0_RES_QP) + dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_A0_QP; + +err_mem: + if (err) + mlx4_slave_destroy_special_qp_cap(dev); +free_mem: + kfree(hca_param); + kfree(func_cap); + kfree(dev_cap); + return err; +} + +static void mlx4_request_modules(struct mlx4_dev *dev) +{ + int port; + int has_ib_port = false; + int has_eth_port = false; +#define EN_DRV_NAME "mlx4_en" +#define IB_DRV_NAME "mlx4_ib" + + for (port = 1; port <= dev->caps.num_ports; port++) { + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB) + has_ib_port = true; + else if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) + has_eth_port = true; + } + + if (has_eth_port) + request_module_nowait(EN_DRV_NAME); + if (has_ib_port || (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) + request_module_nowait(IB_DRV_NAME); +} + +/* + * Change the port configuration of the device. + * Every user of this function must hold the port mutex. 
+ */ +int mlx4_change_port_types(struct mlx4_dev *dev, + enum mlx4_port_type *port_types) +{ + int err = 0; + int change = 0; + int port; + + for (port = 0; port < dev->caps.num_ports; port++) { + /* Change the port type only if the new type is different + * from the current, and not set to Auto */ + if (port_types[port] != dev->caps.port_type[port + 1]) + change = 1; + } + if (change) { + mlx4_unregister_device(dev); + for (port = 1; port <= dev->caps.num_ports; port++) { + mlx4_CLOSE_PORT(dev, port); + dev->caps.port_type[port] = port_types[port - 1]; + err = mlx4_SET_PORT(dev, port, -1); + if (err) { + mlx4_err(dev, "Failed to set port %d, aborting\n", + port); + goto out; + } + } + mlx4_set_port_mask(dev); + err = mlx4_register_device(dev); + if (err) { + mlx4_err(dev, "Failed to register device\n"); + goto out; + } + mlx4_request_modules(dev); + } + +out: + return err; +} + +static ssize_t show_port_type(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_attr); + struct mlx4_dev *mdev = info->dev; + char type[8]; + + sprintf(type, "%s", + (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB) ? + "ib" : "eth"); + if (mdev->caps.possible_type[info->port] == MLX4_PORT_TYPE_AUTO) + sprintf(buf, "auto (%s)\n", type); + else + sprintf(buf, "%s\n", type); + + return strlen(buf); +} + +static int __set_port_type(struct mlx4_port_info *info, + enum mlx4_port_type port_type) +{ + struct mlx4_dev *mdev = info->dev; + struct mlx4_priv *priv = mlx4_priv(mdev); + enum mlx4_port_type types[MLX4_MAX_PORTS]; + enum mlx4_port_type new_types[MLX4_MAX_PORTS]; + int i; + int err = 0; + + if ((port_type & mdev->caps.supported_type[info->port]) != port_type) { + mlx4_err(mdev, + "Requested port type for port %d is not supported on this HCA\n", + info->port); + err = -EINVAL; + goto err_sup; + } + + mlx4_stop_sense(mdev); + mutex_lock(&priv->port_mutex); + info->tmp_type = port_type; + + /* Possible type is always the one that was delivered */ + mdev->caps.possible_type[info->port] = info->tmp_type; + + for (i = 0; i < mdev->caps.num_ports; i++) { + types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type : + mdev->caps.possible_type[i+1]; + if (types[i] == MLX4_PORT_TYPE_AUTO) + types[i] = mdev->caps.port_type[i+1]; + } + + if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) && + !(mdev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)) { + for (i = 1; i <= mdev->caps.num_ports; i++) { + if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) { + mdev->caps.possible_type[i] = mdev->caps.port_type[i]; + err = -EINVAL; + } + } + } + if (err) { + mlx4_err(mdev, "Auto sensing is not supported on this HCA. 
Set only 'eth' or 'ib' for both ports (should be the same)\n"); + goto out; + } + + mlx4_do_sense_ports(mdev, new_types, types); + + err = mlx4_check_port_params(mdev, new_types); + if (err) + goto out; + + /* We are about to apply the changes after the configuration + * was verified, no need to remember the temporary types + * any more */ + for (i = 0; i < mdev->caps.num_ports; i++) + priv->port[i + 1].tmp_type = 0; + + err = mlx4_change_port_types(mdev, new_types); + +out: + mlx4_start_sense(mdev); + mutex_unlock(&priv->port_mutex); +err_sup: + return err; +} + +static ssize_t set_port_type(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_attr); + struct mlx4_dev *mdev = info->dev; + enum mlx4_port_type port_type; + static DEFINE_MUTEX(set_port_type_mutex); + int err; + + mutex_lock(&set_port_type_mutex); + + if (!strcmp(buf, "ib\n")) { + port_type = MLX4_PORT_TYPE_IB; + } else if (!strcmp(buf, "eth\n")) { + port_type = MLX4_PORT_TYPE_ETH; + } else if (!strcmp(buf, "auto\n")) { + port_type = MLX4_PORT_TYPE_AUTO; + } else { + mlx4_err(mdev, "%s is not supported port type\n", buf); + err = -EINVAL; + goto err_out; + } + + err = __set_port_type(info, port_type); + +err_out: + mutex_unlock(&set_port_type_mutex); + + return err ? err : count; +} + +enum ibta_mtu { + IB_MTU_256 = 1, + IB_MTU_512 = 2, + IB_MTU_1024 = 3, + IB_MTU_2048 = 4, + IB_MTU_4096 = 5 +}; + +static inline int int_to_ibta_mtu(int mtu) +{ + switch (mtu) { + case 256: return IB_MTU_256; + case 512: return IB_MTU_512; + case 1024: return IB_MTU_1024; + case 2048: return IB_MTU_2048; + case 4096: return IB_MTU_4096; + default: return -1; + } +} + +static inline int ibta_mtu_to_int(enum ibta_mtu mtu) +{ + switch (mtu) { + case IB_MTU_256: return 256; + case IB_MTU_512: return 512; + case IB_MTU_1024: return 1024; + case IB_MTU_2048: return 2048; + case IB_MTU_4096: return 4096; + default: return -1; + } +} + +static ssize_t show_port_ib_mtu(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_mtu_attr); + struct mlx4_dev *mdev = info->dev; + + if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) + mlx4_warn(mdev, "port level mtu is only used for IB ports\n"); + + sprintf(buf, "%d\n", + ibta_mtu_to_int(mdev->caps.port_ib_mtu[info->port])); + return strlen(buf); +} + +static ssize_t set_port_ib_mtu(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info, + port_mtu_attr); + struct mlx4_dev *mdev = info->dev; + struct mlx4_priv *priv = mlx4_priv(mdev); + int err, port, mtu, ibta_mtu = -1; + + if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) { + mlx4_warn(mdev, "port level mtu is only used for IB ports\n"); + return -EINVAL; + } + + err = kstrtoint(buf, 0, &mtu); + if (!err) + ibta_mtu = int_to_ibta_mtu(mtu); + + if (err || ibta_mtu < 0) { + mlx4_err(mdev, "%s is invalid IBTA mtu\n", buf); + return -EINVAL; + } + + mdev->caps.port_ib_mtu[info->port] = ibta_mtu; + + mlx4_stop_sense(mdev); + mutex_lock(&priv->port_mutex); + mlx4_unregister_device(mdev); + for (port = 1; port <= mdev->caps.num_ports; port++) { + mlx4_CLOSE_PORT(mdev, port); + err = mlx4_SET_PORT(mdev, port, -1); + if (err) { + mlx4_err(mdev, "Failed to set port %d, aborting\n", + port); + goto err_set_port; + } + } + err = 
mlx4_register_device(mdev); +err_set_port: + mutex_unlock(&priv->port_mutex); + mlx4_start_sense(mdev); + return err ? err : count; +} + +/* bond for multi-function device */ +#define MAX_MF_BOND_ALLOWED_SLAVES 63 +static int mlx4_mf_bond(struct mlx4_dev *dev) +{ + int err = 0; + int nvfs; + struct mlx4_slaves_pport slaves_port1; + struct mlx4_slaves_pport slaves_port2; + DECLARE_BITMAP(slaves_port_1_2, MLX4_MFUNC_MAX); + + slaves_port1 = mlx4_phys_to_slaves_pport(dev, 1); + slaves_port2 = mlx4_phys_to_slaves_pport(dev, 2); + bitmap_and(slaves_port_1_2, + slaves_port1.slaves, slaves_port2.slaves, + dev->persist->num_vfs + 1); + + /* only single port vfs are allowed */ + if (bitmap_weight(slaves_port_1_2, dev->persist->num_vfs + 1) > 1) { + mlx4_warn(dev, "HA mode unsupported for dual ported VFs\n"); + return -EINVAL; + } + + /* number of virtual functions is number of total functions minus one + * physical function for each port. + */ + nvfs = bitmap_weight(slaves_port1.slaves, dev->persist->num_vfs + 1) + + bitmap_weight(slaves_port2.slaves, dev->persist->num_vfs + 1) - 2; + + /* limit on maximum allowed VFs */ + if (nvfs > MAX_MF_BOND_ALLOWED_SLAVES) { + mlx4_warn(dev, "HA mode is not supported for %d VFs (max %d are allowed)\n", + nvfs, MAX_MF_BOND_ALLOWED_SLAVES); + return -EINVAL; + } + + if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_warn(dev, "HA mode unsupported for NON DMFS steering\n"); + return -EINVAL; + } + + err = mlx4_bond_mac_table(dev); + if (err) + return err; + err = mlx4_bond_vlan_table(dev); + if (err) + goto err1; + err = mlx4_bond_fs_rules(dev); + if (err) + goto err2; + + return 0; +err2: + (void)mlx4_unbond_vlan_table(dev); +err1: + (void)mlx4_unbond_mac_table(dev); + return err; +} + +static int mlx4_mf_unbond(struct mlx4_dev *dev) +{ + int ret, ret1; + + ret = mlx4_unbond_fs_rules(dev); + if (ret) + mlx4_warn(dev, "multifunction unbond for flow rules failed (%d)\n", ret); + ret1 = mlx4_unbond_mac_table(dev); + if (ret1) { + mlx4_warn(dev, "multifunction unbond for MAC table failed (%d)\n", ret1); + ret = ret1; + } + ret1 = mlx4_unbond_vlan_table(dev); + if (ret1) { + mlx4_warn(dev, "multifunction unbond for VLAN table failed (%d)\n", ret1); + ret = ret1; + } + return ret; +} + +int mlx4_bond(struct mlx4_dev *dev) +{ + int ret = 0; + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->bond_mutex); + + if (!mlx4_is_bonded(dev)) { + ret = mlx4_do_bond(dev, true); + if (ret) + mlx4_err(dev, "Failed to bond device: %d\n", ret); + if (!ret && mlx4_is_master(dev)) { + ret = mlx4_mf_bond(dev); + if (ret) { + mlx4_err(dev, "bond for multifunction failed\n"); + mlx4_do_bond(dev, false); + } + } + } + + mutex_unlock(&priv->bond_mutex); + if (!ret) + mlx4_dbg(dev, "Device is bonded\n"); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_bond); + +int mlx4_unbond(struct mlx4_dev *dev) +{ + int ret = 0; + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->bond_mutex); + + if (mlx4_is_bonded(dev)) { + int ret2 = 0; + + ret = mlx4_do_bond(dev, false); + if (ret) + mlx4_err(dev, "Failed to unbond device: %d\n", ret); + if (mlx4_is_master(dev)) + ret2 = mlx4_mf_unbond(dev); + if (ret2) { + mlx4_warn(dev, "Failed to unbond device for multifunction (%d)\n", ret2); + ret = ret2; + } + } + + mutex_unlock(&priv->bond_mutex); + if (!ret) + mlx4_dbg(dev, "Device is unbonded\n"); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_unbond); + + +int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p) +{ + u8 port1 = v2p->port1; + u8 port2 = 
v2p->port2; + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)) + return -EOPNOTSUPP; + + mutex_lock(&priv->bond_mutex); + + /* zero means keep current mapping for this port */ + if (port1 == 0) + port1 = priv->v2p.port1; + if (port2 == 0) + port2 = priv->v2p.port2; + + if ((port1 < 1) || (port1 > MLX4_MAX_PORTS) || + (port2 < 1) || (port2 > MLX4_MAX_PORTS) || + (port1 == 2 && port2 == 1)) { + /* besides boundary checks cross mapping makes + * no sense and therefore not allowed */ + err = -EINVAL; + } else if ((port1 == priv->v2p.port1) && + (port2 == priv->v2p.port2)) { + err = 0; + } else { + err = mlx4_virt2phy_port_map(dev, port1, port2); + if (!err) { + mlx4_dbg(dev, "port map changed: [%d][%d]\n", + port1, port2); + priv->v2p.port1 = port1; + priv->v2p.port2 = port2; + } else { + mlx4_err(dev, "Failed to change port mape: %d\n", err); + } + } + + mutex_unlock(&priv->bond_mutex); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_port_map_set); + +static int mlx4_load_fw(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages, + GFP_HIGHUSER | __GFP_NOWARN, 0); + if (!priv->fw.fw_icm) { + mlx4_err(dev, "Couldn't allocate FW area, aborting\n"); + return -ENOMEM; + } + + err = mlx4_MAP_FA(dev, priv->fw.fw_icm); + if (err) { + mlx4_err(dev, "MAP_FA command failed, aborting\n"); + goto err_free; + } + + err = mlx4_RUN_FW(dev); + if (err) { + mlx4_err(dev, "RUN_FW command failed, aborting\n"); + goto err_unmap_fa; + } + + return 0; + +err_unmap_fa: + mlx4_UNMAP_FA(dev); + +err_free: + mlx4_free_icm(dev, priv->fw.fw_icm, 0); + return err; +} + +static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, + int cmpt_entry_sz) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + int num_eqs; + + err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table, + cmpt_base + + ((u64) (MLX4_CMPT_TYPE_QP * + cmpt_entry_sz) << MLX4_CMPT_SHIFT), + cmpt_entry_sz, dev->caps.num_qps, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + 0, 0); + if (err) + goto err; + + err = mlx4_init_icm_table(dev, &priv->srq_table.cmpt_table, + cmpt_base + + ((u64) (MLX4_CMPT_TYPE_SRQ * + cmpt_entry_sz) << MLX4_CMPT_SHIFT), + cmpt_entry_sz, dev->caps.num_srqs, + dev->caps.reserved_srqs, 0, 0); + if (err) + goto err_qp; + + err = mlx4_init_icm_table(dev, &priv->cq_table.cmpt_table, + cmpt_base + + ((u64) (MLX4_CMPT_TYPE_CQ * + cmpt_entry_sz) << MLX4_CMPT_SHIFT), + cmpt_entry_sz, dev->caps.num_cqs, + dev->caps.reserved_cqs, 0, 0); + if (err) + goto err_srq; + + num_eqs = dev->phys_caps.num_phys_eqs; + err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table, + cmpt_base + + ((u64) (MLX4_CMPT_TYPE_EQ * + cmpt_entry_sz) << MLX4_CMPT_SHIFT), + cmpt_entry_sz, num_eqs, num_eqs, 0, 0); + if (err) + goto err_cq; + + return 0; + +err_cq: + mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); + +err_srq: + mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); + +err_qp: + mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); + +err: + return err; +} + +static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, + struct mlx4_init_hca_param *init_hca, u64 icm_size) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 aux_pages; + int num_eqs; + int err; + + err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages); + if (err) { + mlx4_err(dev, "SET_ICM_SIZE command failed, aborting\n"); + return err; + } + + mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux 
memory\n", + (unsigned long long) icm_size >> 10, + (unsigned long long) aux_pages << 2); + + priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages, + GFP_HIGHUSER | __GFP_NOWARN, 0); + if (!priv->fw.aux_icm) { + mlx4_err(dev, "Couldn't allocate aux memory, aborting\n"); + return -ENOMEM; + } + + err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm); + if (err) { + mlx4_err(dev, "MAP_ICM_AUX command failed, aborting\n"); + goto err_free_aux; + } + + err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz); + if (err) { + mlx4_err(dev, "Failed to map cMPT context memory, aborting\n"); + goto err_unmap_aux; + } + + + num_eqs = dev->phys_caps.num_phys_eqs; + err = mlx4_init_icm_table(dev, &priv->eq_table.table, + init_hca->eqc_base, dev_cap->eqc_entry_sz, + num_eqs, num_eqs, 0, 0); + if (err) { + mlx4_err(dev, "Failed to map EQ context memory, aborting\n"); + goto err_unmap_cmpt; + } + + /* + * Reserved MTT entries must be aligned up to a cacheline + * boundary, since the FW will write to them, while the driver + * writes to all other MTT entries. (The variable + * dev->caps.mtt_entry_sz below is really the MTT segment + * size, not the raw entry size) + */ + dev->caps.reserved_mtts = + ALIGN(dev->caps.reserved_mtts * dev->caps.mtt_entry_sz, + dma_get_cache_alignment()) / dev->caps.mtt_entry_sz; + + err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table, + init_hca->mtt_base, + dev->caps.mtt_entry_sz, + dev->caps.num_mtts, + dev->caps.reserved_mtts, 1, 0); + if (err) { + mlx4_err(dev, "Failed to map MTT context memory, aborting\n"); + goto err_unmap_eq; + } + + err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table, + init_hca->dmpt_base, + dev_cap->dmpt_entry_sz, + dev->caps.num_mpts, + dev->caps.reserved_mrws, 1, 1); + if (err) { + mlx4_err(dev, "Failed to map dMPT context memory, aborting\n"); + goto err_unmap_mtt; + } + + err = mlx4_init_icm_table(dev, &priv->qp_table.qp_table, + init_hca->qpc_base, + dev_cap->qpc_entry_sz, + dev->caps.num_qps, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + 0, 0); + if (err) { + mlx4_err(dev, "Failed to map QP context memory, aborting\n"); + goto err_unmap_dmpt; + } + + err = mlx4_init_icm_table(dev, &priv->qp_table.auxc_table, + init_hca->auxc_base, + dev_cap->aux_entry_sz, + dev->caps.num_qps, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + 0, 0); + if (err) { + mlx4_err(dev, "Failed to map AUXC context memory, aborting\n"); + goto err_unmap_qp; + } + + err = mlx4_init_icm_table(dev, &priv->qp_table.altc_table, + init_hca->altc_base, + dev_cap->altc_entry_sz, + dev->caps.num_qps, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + 0, 0); + if (err) { + mlx4_err(dev, "Failed to map ALTC context memory, aborting\n"); + goto err_unmap_auxc; + } + + err = mlx4_init_icm_table(dev, &priv->qp_table.rdmarc_table, + init_hca->rdmarc_base, + dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift, + dev->caps.num_qps, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + 0, 0); + if (err) { + mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n"); + goto err_unmap_altc; + } + + err = mlx4_init_icm_table(dev, &priv->cq_table.table, + init_hca->cqc_base, + dev_cap->cqc_entry_sz, + dev->caps.num_cqs, + dev->caps.reserved_cqs, 0, 0); + if (err) { + mlx4_err(dev, "Failed to map CQ context memory, aborting\n"); + goto err_unmap_rdmarc; + } + + err = mlx4_init_icm_table(dev, &priv->srq_table.table, + init_hca->srqc_base, + dev_cap->srq_entry_sz, + dev->caps.num_srqs, + dev->caps.reserved_srqs, 0, 0); + if (err) { + mlx4_err(dev, "Failed to map 
SRQ context memory, aborting\n"); + goto err_unmap_cq; + } + + /* + * For flow steering device managed mode it is required to use + * mlx4_init_icm_table. For B0 steering mode it's not strictly + * required, but for simplicity just map the whole multicast + * group table now. The table isn't very big and it's a lot + * easier than trying to track ref counts. + */ + err = mlx4_init_icm_table(dev, &priv->mcg_table.table, + init_hca->mc_base, + mlx4_get_mgm_entry_size(dev), + dev->caps.num_mgms + dev->caps.num_amgms, + dev->caps.num_mgms + dev->caps.num_amgms, + 0, 0); + if (err) { + mlx4_err(dev, "Failed to map MCG context memory, aborting\n"); + goto err_unmap_srq; + } + + return 0; + +err_unmap_srq: + mlx4_cleanup_icm_table(dev, &priv->srq_table.table); + +err_unmap_cq: + mlx4_cleanup_icm_table(dev, &priv->cq_table.table); + +err_unmap_rdmarc: + mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table); + +err_unmap_altc: + mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table); + +err_unmap_auxc: + mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table); + +err_unmap_qp: + mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); + +err_unmap_dmpt: + mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); + +err_unmap_mtt: + mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); + +err_unmap_eq: + mlx4_cleanup_icm_table(dev, &priv->eq_table.table); + +err_unmap_cmpt: + mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); + +err_unmap_aux: + mlx4_UNMAP_ICM_AUX(dev); + +err_free_aux: + mlx4_free_icm(dev, priv->fw.aux_icm, 0); + + return err; +} + +static void mlx4_free_icms(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mlx4_cleanup_icm_table(dev, &priv->mcg_table.table); + mlx4_cleanup_icm_table(dev, &priv->srq_table.table); + mlx4_cleanup_icm_table(dev, &priv->cq_table.table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); + mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); + mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); + mlx4_cleanup_icm_table(dev, &priv->eq_table.table); + mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); + + mlx4_UNMAP_ICM_AUX(dev); + mlx4_free_icm(dev, priv->fw.aux_icm, 0); +} + +static void mlx4_slave_exit(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->cmd.slave_cmd_mutex); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_CMD_NA_OP, + MLX4_COMM_TIME)) + mlx4_warn(dev, "Failed to close slave function\n"); + mutex_unlock(&priv->cmd.slave_cmd_mutex); +} + +static int map_bf_area(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + resource_size_t bf_start; + resource_size_t bf_len; + int err = 0; + + if (!dev->caps.bf_reg_size) + return -ENXIO; + + bf_start = pci_resource_start(dev->persist->pdev, 2) + + (dev->caps.num_uars << PAGE_SHIFT); + bf_len = pci_resource_len(dev->persist->pdev, 2) - + (dev->caps.num_uars << PAGE_SHIFT); + priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len); + if (!priv->bf_mapping) 
+ err = -ENOMEM; + + return err; +} + +static void unmap_bf_area(struct mlx4_dev *dev) +{ + if (mlx4_priv(dev)->bf_mapping) + io_mapping_free(mlx4_priv(dev)->bf_mapping); +} + +u64 mlx4_read_clock(struct mlx4_dev *dev) +{ + u32 clockhi, clocklo, clockhi1; + u64 cycles; + int i; + struct mlx4_priv *priv = mlx4_priv(dev); + + for (i = 0; i < 10; i++) { + clockhi = swab32(readl(priv->clock_mapping)); + clocklo = swab32(readl(priv->clock_mapping + 4)); + clockhi1 = swab32(readl(priv->clock_mapping)); + if (clockhi == clockhi1) + break; + } + + cycles = (u64) clockhi << 32 | (u64) clocklo; + + return cycles; +} +EXPORT_SYMBOL_GPL(mlx4_read_clock); + + +static int map_internal_clock(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->clock_mapping = + ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.clock_bar) + + priv->fw.clock_offset, MLX4_CLOCK_SIZE); + + if (!priv->clock_mapping) + return -ENOMEM; + + return 0; +} + +int mlx4_get_internal_clock_params(struct mlx4_dev *dev, + struct mlx4_clock_params *params) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (mlx4_is_slave(dev)) + return -EOPNOTSUPP; + + if (!params) + return -EINVAL; + + params->bar = priv->fw.clock_bar; + params->offset = priv->fw.clock_offset; + params->size = MLX4_CLOCK_SIZE; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_get_internal_clock_params); + +static void unmap_internal_clock(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (priv->clock_mapping) + iounmap(priv->clock_mapping); +} + +static void mlx4_close_hca(struct mlx4_dev *dev) +{ + unmap_internal_clock(dev); + unmap_bf_area(dev); + if (mlx4_is_slave(dev)) + mlx4_slave_exit(dev); + else { + mlx4_CLOSE_HCA(dev, 0); + mlx4_free_icms(dev); + } +} + +static void mlx4_close_fw(struct mlx4_dev *dev) +{ + if (!mlx4_is_slave(dev)) { + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); + } +} + +static int mlx4_comm_check_offline(struct mlx4_dev *dev) +{ +#define COMM_CHAN_OFFLINE_OFFSET 0x09 + + u32 comm_flags; + u32 offline_bit; + unsigned long end; + struct mlx4_priv *priv = mlx4_priv(dev); + + end = msecs_to_jiffies(MLX4_COMM_OFFLINE_TIME_OUT) + jiffies; + while (time_before(jiffies, end)) { + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + offline_bit = (comm_flags & + (u32)(1 << COMM_CHAN_OFFLINE_OFFSET)); + if (!offline_bit) + return 0; + + /* If device removal has been requested, + * do not continue retrying. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_NOWAIT) + break; + + /* There are cases as part of AER/Reset flow that PF needs + * around 100 msec to load. We therefore sleep for 100 msec + * to allow other tasks to make use of that CPU during this + * time interval. 
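+ * The loop keeps polling the comm channel flags until the offline
+ * bit clears, the MLX4_COMM_OFFLINE_TIME_OUT window expires, or
+ * device removal is requested via MLX4_INTERFACE_STATE_NOWAIT.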
+ */ + msleep(100); + } + mlx4_err(dev, "Communication channel is offline.\n"); + return -EIO; +} + +static void mlx4_reset_vf_support(struct mlx4_dev *dev) +{ +#define COMM_CHAN_RST_OFFSET 0x1e + + struct mlx4_priv *priv = mlx4_priv(dev); + u32 comm_rst; + u32 comm_caps; + + comm_caps = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_CAPS)); + comm_rst = (comm_caps & (u32)(1 << COMM_CHAN_RST_OFFSET)); + + if (comm_rst) + dev->caps.vf_caps |= MLX4_VF_CAP_FLAG_RESET; +} + +static int mlx4_init_slave(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 dma = (u64) priv->mfunc.vhcr_dma; + int ret_from_reset = 0; + u32 slave_read; + u32 cmd_channel_ver; + + if (atomic_read(&pf_loading)) { + mlx4_warn(dev, "PF is not ready - Deferring probe\n"); + return -EPROBE_DEFER; + } + + mutex_lock(&priv->cmd.slave_cmd_mutex); + priv->cmd.max_cmds = 1; + if (mlx4_comm_check_offline(dev)) { + mlx4_err(dev, "PF is not responsive, skipping initialization\n"); + goto err_offline; + } + + mlx4_reset_vf_support(dev); + mlx4_warn(dev, "Sending reset\n"); + ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, + MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME); + /* if we are in the middle of flr the slave will try + * NUM_OF_RESET_RETRIES times before leaving.*/ + if (ret_from_reset) { + if (MLX4_DELAY_RESET_SLAVE == ret_from_reset) { + mlx4_warn(dev, "slave is currently in the middle of FLR - Deferring probe\n"); + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return -EPROBE_DEFER; + } else + goto err; + } + + /* check the driver version - the slave I/F revision + * must match the master's */ + slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); + cmd_channel_ver = mlx4_comm_get_version(); + + if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) != + MLX4_COMM_GET_IF_REV(slave_read)) { + mlx4_err(dev, "slave driver version is not supported by the master\n"); + goto err; + } + + mlx4_warn(dev, "Sending vhcr0\n"); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48, + MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32, + MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16, + MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, + MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME)) + goto err; + + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return 0; + +err: + mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_CMD_NA_OP, 0); +err_offline: + mutex_unlock(&priv->cmd.slave_cmd_mutex); + return -EIO; +} + +static void mlx4_parav_master_pf_caps(struct mlx4_dev *dev) +{ + int i; + + for (i = 1; i <= dev->caps.num_ports; i++) { + if (dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) + dev->caps.gid_table_len[i] = + mlx4_get_slave_num_gids(dev, 0, i); + else + dev->caps.gid_table_len[i] = 1; + dev->caps.pkey_table_len[i] = + dev->phys_caps.pkey_phys_table_len[i] - 1; + } +} + +static int choose_log_fs_mgm_entry_size(int qp_per_entry) +{ + int i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; + + for (i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE; + i++) { + if (qp_per_entry <= 4 * ((1 << i) / 16 - 2)) + break; + } + + return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? 
i : -1; +} + +static const char *dmfs_high_rate_steering_mode_str(int dmfs_high_steer_mode) +{ + switch (dmfs_high_steer_mode) { + case MLX4_STEERING_DMFS_A0_DEFAULT: + return "default performance"; + + case MLX4_STEERING_DMFS_A0_DYNAMIC: + return "dynamic hybrid mode"; + + case MLX4_STEERING_DMFS_A0_STATIC: + return "performance optimized for limited rule configuration (static)"; + + case MLX4_STEERING_DMFS_A0_DISABLE: + return "disabled performance optimized steering"; + + case MLX4_STEERING_DMFS_A0_NOT_SUPPORTED: + return "performance optimized steering not supported"; + + default: + return "Unrecognized mode"; + } +} + +#define MLX4_DMFS_A0_STEERING (1UL << 2) + +static void choose_steering_mode(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap) +{ + if (mlx4_log_num_mgm_entry_size <= 0) { + if ((-mlx4_log_num_mgm_entry_size) & MLX4_DMFS_A0_STEERING) { + if (dev->caps.dmfs_high_steer_mode == + MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) + mlx4_err(dev, "DMFS high rate mode not supported\n"); + else + dev->caps.dmfs_high_steer_mode = + MLX4_STEERING_DMFS_A0_STATIC; + } + } + + if (mlx4_log_num_mgm_entry_size <= 0 && + dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN && + (!mlx4_is_mfunc(dev) || + (dev_cap->fs_max_num_qp_per_entry >= + (dev->persist->num_vfs + 1))) && + choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >= + MLX4_MIN_MGM_LOG_ENTRY_SIZE) { + dev->oper_log_mgm_entry_size = + choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry); + dev->caps.steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED; + dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry; + dev->caps.fs_log_max_ucast_qp_range_size = + dev_cap->fs_log_max_ucast_qp_range_size; + } else { + if (dev->caps.dmfs_high_steer_mode != + MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) + dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DISABLE; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER && + dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) + dev->caps.steering_mode = MLX4_STEERING_MODE_B0; + else { + dev->caps.steering_mode = MLX4_STEERING_MODE_A0; + + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER || + dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) + mlx4_warn(dev, "Must have both UC_STEER and MC_STEER flags set to use B0 steering - falling back to A0 steering mode\n"); + } + dev->oper_log_mgm_entry_size = + mlx4_log_num_mgm_entry_size > 0 ? + mlx4_log_num_mgm_entry_size : + MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE; + dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev); + } + mlx4_dbg(dev, "Steering mode is: %s, oper_log_mgm_entry_size = %d, modparam log_num_mgm_entry_size = %d\n", + mlx4_steering_mode_str(dev->caps.steering_mode), + dev->oper_log_mgm_entry_size, + mlx4_log_num_mgm_entry_size); +} + +static void choose_tunnel_offload_mode(struct mlx4_dev *dev, + struct mlx4_dev_cap *dev_cap) +{ + if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && + dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS) + dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_VXLAN; + else + dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_NONE; + + mlx4_dbg(dev, "Tunneling offload mode is: %s\n", (dev->caps.tunnel_offload_mode + == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) ? 
"vxlan" : "none"); +} + +static int mlx4_validate_optimized_steering(struct mlx4_dev *dev) +{ + int i; + struct mlx4_port_cap port_cap; + + if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) + return -EINVAL; + + for (i = 1; i <= dev->caps.num_ports; i++) { + if (mlx4_dev_port(dev, i, &port_cap)) { + mlx4_err(dev, + "QUERY_DEV_CAP command failed, can't veify DMFS high rate steering.\n"); + } else if ((dev->caps.dmfs_high_steer_mode != + MLX4_STEERING_DMFS_A0_DEFAULT) && + (port_cap.dmfs_optimized_state == + !!(dev->caps.dmfs_high_steer_mode == + MLX4_STEERING_DMFS_A0_DISABLE))) { + mlx4_err(dev, + "DMFS high rate steer mode differ, driver requested %s but %s in FW.\n", + dmfs_high_rate_steering_mode_str( + dev->caps.dmfs_high_steer_mode), + (port_cap.dmfs_optimized_state ? + "enabled" : "disabled")); + } + } + + return 0; +} + +static int mlx4_init_fw(struct mlx4_dev *dev) +{ + struct mlx4_mod_stat_cfg mlx4_cfg; + int err = 0; + + if (!mlx4_is_slave(dev)) { + err = mlx4_QUERY_FW(dev); + if (err) { + if (err == -EACCES) + mlx4_info(dev, "non-primary physical function, skipping\n"); + else + mlx4_err(dev, "QUERY_FW command failed, aborting\n"); + return err; + } + + err = mlx4_load_fw(dev); + if (err) { + mlx4_err(dev, "Failed to start FW, aborting\n"); + return err; + } + + mlx4_cfg.log_pg_sz_m = 1; + mlx4_cfg.log_pg_sz = 0; + err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); + if (err) + mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); + } + + return err; +} + +static int mlx4_init_hca(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_adapter adapter; + struct mlx4_dev_cap dev_cap; + struct mlx4_profile profile; + struct mlx4_init_hca_param init_hca; + u64 icm_size; + struct mlx4_config_dev_params params; + int err; + + if (!mlx4_is_slave(dev)) { + err = mlx4_dev_cap(dev, &dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n"); + return err; + } + + choose_steering_mode(dev, &dev_cap); + choose_tunnel_offload_mode(dev, &dev_cap); + + if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC && + mlx4_is_master(dev)) + dev->caps.function_caps |= MLX4_FUNC_CAP_DMFS_A0_STATIC; + + err = mlx4_get_phys_port_id(dev); + if (err) + mlx4_err(dev, "Fail to get physical port id\n"); + + if (mlx4_is_master(dev)) + mlx4_parav_master_pf_caps(dev); + + if (mlx4_low_memory_profile()) { + mlx4_info(dev, "Running from within kdump kernel. 
Using low memory profile\n"); + profile = low_mem_profile; + } else { + profile = default_profile; + } + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) + profile.num_mcg = MLX4_FS_NUM_MCG; + + icm_size = mlx4_make_profile(dev, &profile, &dev_cap, + &init_hca); + if ((long long) icm_size < 0) { + err = icm_size; + return err; + } + + dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1; + + if (enable_4k_uar || !dev->persist->num_vfs) { + init_hca.log_uar_sz = ilog2(dev->caps.num_uars) + + PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT; + init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12; + } else { + init_hca.log_uar_sz = ilog2(dev->caps.num_uars); + init_hca.uar_page_sz = PAGE_SHIFT - 12; + } + + init_hca.mw_enabled = 0; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || + dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) + init_hca.mw_enabled = INIT_HCA_TPT_MW_ENABLE; + + err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size); + if (err) + return err; + + err = mlx4_INIT_HCA(dev, &init_hca); + if (err) { + mlx4_err(dev, "INIT_HCA command failed, aborting\n"); + goto err_free_icm; + } + + if (dev_cap.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) { + err = mlx4_query_func(dev, &dev_cap); + if (err < 0) { + mlx4_err(dev, "QUERY_FUNC command failed, aborting.\n"); + goto err_close; + } else if (err & MLX4_QUERY_FUNC_NUM_SYS_EQS) { + dev->caps.num_eqs = dev_cap.max_eqs; + dev->caps.reserved_eqs = dev_cap.reserved_eqs; + dev->caps.reserved_uars = dev_cap.reserved_uars; + } + } + + /* + * If TS is supported by FW + * read HCA frequency by QUERY_HCA command + */ + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) { + memset(&init_hca, 0, sizeof(init_hca)); + err = mlx4_QUERY_HCA(dev, &init_hca); + if (err) { + mlx4_err(dev, "QUERY_HCA command failed, disable timestamp\n"); + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; + } else { + dev->caps.hca_core_clock = + init_hca.hca_core_clock; + } + + /* In case we got HCA frequency 0 - disable timestamping + * to avoid dividing by zero + */ + if (!dev->caps.hca_core_clock) { + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; + mlx4_err(dev, + "HCA frequency is 0 - timestamping is not supported\n"); + } else if (map_internal_clock(dev)) { + /* + * Map internal clock, + * in case of failure disable timestamping + */ + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; + mlx4_err(dev, "Failed to map internal clock. 
Timestamping is not supported\n"); + } + } + + if (dev->caps.dmfs_high_steer_mode != + MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) { + if (mlx4_validate_optimized_steering(dev)) + mlx4_warn(dev, "Optimized steering validation failed\n"); + + if (dev->caps.dmfs_high_steer_mode == + MLX4_STEERING_DMFS_A0_DISABLE) { + dev->caps.dmfs_high_rate_qpn_base = + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; + dev->caps.dmfs_high_rate_qpn_range = + MLX4_A0_STEERING_TABLE_SIZE; + } + + mlx4_info(dev, "DMFS high rate steer mode is: %s\n", + dmfs_high_rate_steering_mode_str( + dev->caps.dmfs_high_steer_mode)); + } + } else { + err = mlx4_init_slave(dev); + if (err) { + if (err != -EPROBE_DEFER) + mlx4_err(dev, "Failed to initialize slave\n"); + return err; + } + + err = mlx4_slave_cap(dev); + if (err) { + mlx4_err(dev, "Failed to obtain slave caps\n"); + goto err_close; + } + } + + if (map_bf_area(dev)) + mlx4_dbg(dev, "Failed to map blue flame area\n"); + + /*Only the master set the ports, all the rest got it from it.*/ + if (!mlx4_is_slave(dev)) + mlx4_set_port_mask(dev); + + err = mlx4_QUERY_ADAPTER(dev, &adapter); + if (err) { + mlx4_err(dev, "QUERY_ADAPTER command failed, aborting\n"); + goto unmap_bf; + } + + /* Query CONFIG_DEV parameters */ + err = mlx4_config_dev_retrieval(dev, ¶ms); + if (err && err != -EOPNOTSUPP) { + mlx4_err(dev, "Failed to query CONFIG_DEV parameters\n"); + } else if (!err) { + dev->caps.rx_checksum_flags_port[1] = params.rx_csum_flags_port_1; + dev->caps.rx_checksum_flags_port[2] = params.rx_csum_flags_port_2; + } + priv->eq_table.inta_pin = adapter.inta_pin; + memcpy(dev->board_id, adapter.board_id, sizeof(dev->board_id)); + + return 0; + +unmap_bf: + unmap_internal_clock(dev); + unmap_bf_area(dev); + + if (mlx4_is_slave(dev)) + mlx4_slave_destroy_special_qp_cap(dev); + +err_close: + if (mlx4_is_slave(dev)) + mlx4_slave_exit(dev); + else + mlx4_CLOSE_HCA(dev, 0); + +err_free_icm: + if (!mlx4_is_slave(dev)) + mlx4_free_icms(dev); + + return err; +} + +static int mlx4_init_counters_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int nent_pow2; + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) + return -ENOENT; + + if (!dev->caps.max_counters) + return -ENOSPC; + + nent_pow2 = roundup_pow_of_two(dev->caps.max_counters); + /* reserve last counter index for sink counter */ + return mlx4_bitmap_init(&priv->counters_bitmap, nent_pow2, + nent_pow2 - 1, 0, + nent_pow2 - dev->caps.max_counters + 1); +} + +static void mlx4_cleanup_counters_table(struct mlx4_dev *dev) +{ + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) + return; + + if (!dev->caps.max_counters) + return; + + mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap); +} + +static void mlx4_cleanup_default_counters(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int port; + + for (port = 0; port < dev->caps.num_ports; port++) + if (priv->def_counter[port] != -1) + mlx4_counter_free(dev, priv->def_counter[port]); +} + +static int mlx4_allocate_default_counters(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int port, err = 0; + u32 idx; + + for (port = 0; port < dev->caps.num_ports; port++) + priv->def_counter[port] = -1; + + for (port = 0; port < dev->caps.num_ports; port++) { + err = mlx4_counter_alloc(dev, &idx, MLX4_RES_USAGE_DRIVER); + + if (!err || err == -ENOSPC) { + priv->def_counter[port] = idx; + } else if (err == -ENOENT) { + err = 0; + continue; + } else if (mlx4_is_slave(dev) && err == -EINVAL) { + priv->def_counter[port] = 
MLX4_SINK_COUNTER_INDEX(dev); + mlx4_warn(dev, "can't allocate counter from old PF driver, using index %d\n", + MLX4_SINK_COUNTER_INDEX(dev)); + err = 0; + } else { + mlx4_err(dev, "%s: failed to allocate default counter port %d err %d\n", + __func__, port + 1, err); + mlx4_cleanup_default_counters(dev); + return err; + } + + mlx4_dbg(dev, "%s: default counter index %d for port %d\n", + __func__, priv->def_counter[port], port + 1); + } + + return err; +} + +int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) + return -ENOENT; + + *idx = mlx4_bitmap_alloc(&priv->counters_bitmap); + if (*idx == -1) { + *idx = MLX4_SINK_COUNTER_INDEX(dev); + return -ENOSPC; + } + + return 0; +} + +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx, u8 usage) +{ + u32 in_modifier = RES_COUNTER | (((u32)usage & 3) << 30); + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, in_modifier, + RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + *idx = get_param_l(&out_param); + + return err; + } + return __mlx4_counter_alloc(dev, idx); +} +EXPORT_SYMBOL_GPL(mlx4_counter_alloc); + +static int __mlx4_clear_if_stat(struct mlx4_dev *dev, + u8 counter_index) +{ + struct mlx4_cmd_mailbox *if_stat_mailbox; + int err; + u32 if_stat_in_mod = (counter_index & 0xff) | MLX4_QUERY_IF_STAT_RESET; + + if_stat_mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(if_stat_mailbox)) + return PTR_ERR(if_stat_mailbox); + + err = mlx4_cmd_box(dev, 0, if_stat_mailbox->dma, if_stat_in_mod, 0, + MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, if_stat_mailbox); + return err; +} + +void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx) +{ + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)) + return; + + if (idx == MLX4_SINK_COUNTER_INDEX(dev)) + return; + + __mlx4_clear_if_stat(dev, idx); + + mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx, MLX4_USE_RR); + return; +} + +void mlx4_counter_free(struct mlx4_dev *dev, u32 idx) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, idx); + mlx4_cmd(dev, in_param, RES_COUNTER, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + return; + } + __mlx4_counter_free(dev, idx); +} +EXPORT_SYMBOL_GPL(mlx4_counter_free); + +int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return priv->def_counter[port - 1]; +} +EXPORT_SYMBOL_GPL(mlx4_get_default_counter_index); + +void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->mfunc.master.vf_admin[entry].vport[port].guid = guid; +} +EXPORT_SYMBOL_GPL(mlx4_set_admin_guid); + +__be64 mlx4_get_admin_guid(struct mlx4_dev *dev, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return priv->mfunc.master.vf_admin[entry].vport[port].guid; +} +EXPORT_SYMBOL_GPL(mlx4_get_admin_guid); + +void mlx4_set_random_admin_guid(struct mlx4_dev *dev, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + __be64 guid; + + /* hw GUID */ + if (entry == 0) + return; + + get_random_bytes((char *)&guid, sizeof(guid)); + guid &= ~(cpu_to_be64(1ULL << 56)); + guid |= cpu_to_be64(1ULL << 57); + priv->mfunc.master.vf_admin[entry].vport[port].guid = guid; +} + +static int mlx4_setup_hca(struct mlx4_dev 
*dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + int port; + __be32 ib_port_default_caps; + + err = mlx4_init_uar_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize user access region table, aborting\n"); + return err; + } + + err = mlx4_uar_alloc(dev, &priv->driver_uar); + if (err) { + mlx4_err(dev, "Failed to allocate driver access region, aborting\n"); + goto err_uar_table_free; + } + + priv->kar = ioremap((phys_addr_t) priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + if (!priv->kar) { + mlx4_err(dev, "Couldn't map kernel access region, aborting\n"); + err = -ENOMEM; + goto err_uar_free; + } + + err = mlx4_init_pd_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize protection domain table, aborting\n"); + goto err_kar_unmap; + } + + err = mlx4_init_xrcd_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize reliable connection domain table, aborting\n"); + goto err_pd_table_free; + } + + err = mlx4_init_mr_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize memory region table, aborting\n"); + goto err_xrcd_table_free; + } + + if (!mlx4_is_slave(dev)) { + err = mlx4_init_mcg_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize multicast group table, aborting\n"); + goto err_mr_table_free; + } + err = mlx4_config_mad_demux(dev); + if (err) { + mlx4_err(dev, "Failed in config_mad_demux, aborting\n"); + goto err_mcg_table_free; + } + } + + err = mlx4_init_eq_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize event queue table, aborting\n"); + goto err_mcg_table_free; + } + + err = mlx4_cmd_use_events(dev); + if (err) { + mlx4_err(dev, "Failed to switch to event-driven firmware commands, aborting\n"); + goto err_eq_table_free; + } + + err = mlx4_NOP(dev); + if (err) { + if (dev->flags & MLX4_FLAG_MSI_X) { + mlx4_warn(dev, "NOP command failed to generate MSI-X interrupt IRQ %d)\n", + priv->eq_table.eq[MLX4_EQ_ASYNC].irq); + mlx4_warn(dev, "Trying again without MSI-X\n"); + } else { + mlx4_err(dev, "NOP command failed to generate interrupt (IRQ %d), aborting\n", + priv->eq_table.eq[MLX4_EQ_ASYNC].irq); + mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n"); + } + + goto err_cmd_poll; + } + + mlx4_dbg(dev, "NOP command IRQ test passed\n"); + + err = mlx4_init_cq_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize completion queue table, aborting\n"); + goto err_cmd_poll; + } + + err = mlx4_init_srq_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize shared receive queue table, aborting\n"); + goto err_cq_table_free; + } + + err = mlx4_init_qp_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize queue pair table, aborting\n"); + goto err_srq_table_free; + } + + if (!mlx4_is_slave(dev)) { + err = mlx4_init_counters_table(dev); + if (err && err != -ENOENT) { + mlx4_err(dev, "Failed to initialize counters table, aborting\n"); + goto err_qp_table_free; + } + } + + err = mlx4_allocate_default_counters(dev); + if (err) { + mlx4_err(dev, "Failed to allocate default counters, aborting\n"); + goto err_counters_table_free; + } + + if (!mlx4_is_slave(dev)) { + for (port = 1; port <= dev->caps.num_ports; port++) { + ib_port_default_caps = 0; + err = mlx4_get_port_ib_caps(dev, port, + &ib_port_default_caps); + if (err) + mlx4_warn(dev, "failed to get port %d default ib capabilities (%d). 
Continuing with caps = 0\n", + port, err); + dev->caps.ib_port_def_cap[port] = ib_port_default_caps; + + /* initialize per-slave default ib port capabilities */ + if (mlx4_is_master(dev)) { + int i; + for (i = 0; i < dev->num_slaves; i++) { + if (i == mlx4_master_func_num(dev)) + continue; + priv->mfunc.master.slave_state[i].ib_cap_mask[port] = + ib_port_default_caps; + } + } + + if (mlx4_is_mfunc(dev)) + dev->caps.port_ib_mtu[port] = IB_MTU_2048; + else + dev->caps.port_ib_mtu[port] = IB_MTU_4096; + + err = mlx4_SET_PORT(dev, port, mlx4_is_master(dev) ? + dev->caps.pkey_table_len[port] : -1); + if (err) { + mlx4_err(dev, "Failed to set port %d, aborting\n", + port); + goto err_default_countes_free; + } + } + } + + return 0; + +err_default_countes_free: + mlx4_cleanup_default_counters(dev); + +err_counters_table_free: + if (!mlx4_is_slave(dev)) + mlx4_cleanup_counters_table(dev); + +err_qp_table_free: + mlx4_cleanup_qp_table(dev); + +err_srq_table_free: + mlx4_cleanup_srq_table(dev); + +err_cq_table_free: + mlx4_cleanup_cq_table(dev); + +err_cmd_poll: + mlx4_cmd_use_polling(dev); + +err_eq_table_free: + mlx4_cleanup_eq_table(dev); + +err_mcg_table_free: + if (!mlx4_is_slave(dev)) + mlx4_cleanup_mcg_table(dev); + +err_mr_table_free: + mlx4_cleanup_mr_table(dev); + +err_xrcd_table_free: + mlx4_cleanup_xrcd_table(dev); + +err_pd_table_free: + mlx4_cleanup_pd_table(dev); + +err_kar_unmap: + iounmap(priv->kar); + +err_uar_free: + mlx4_uar_free(dev, &priv->driver_uar); + +err_uar_table_free: + mlx4_cleanup_uar_table(dev); + return err; +} + +static int mlx4_init_affinity_hint(struct mlx4_dev *dev, int port, int eqn) +{ + int requested_cpu = 0; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_eq *eq; + int off = 0; + int i; + + if (eqn > dev->caps.num_comp_vectors) + return -EINVAL; + + for (i = 1; i < port; i++) + off += mlx4_get_eqs_per_port(dev, i); + + requested_cpu = eqn - off - !!(eqn > MLX4_EQ_ASYNC); + + /* Meaning EQs are shared, and this call comes from the second port */ + if (requested_cpu < 0) + return 0; + + eq = &priv->eq_table.eq[eqn]; + + if (!zalloc_cpumask_var(&eq->affinity_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_set_cpu(requested_cpu, eq->affinity_mask); + + return 0; +} + +static void mlx4_enable_msi_x(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct msix_entry *entries; + int i; + int port = 0; + + if (msi_x) { + int nreq = min3(dev->caps.num_ports * + (int)num_online_cpus() + 1, + dev->caps.num_eqs - dev->caps.reserved_eqs, + MAX_MSIX); + + entries = kcalloc(nreq, sizeof(*entries), GFP_KERNEL); + if (!entries) + goto no_msi; + + for (i = 0; i < nreq; ++i) + entries[i].entry = i; + + nreq = pci_enable_msix_range(dev->persist->pdev, entries, 2, + nreq); + + if (nreq < 0 || nreq < MLX4_EQ_ASYNC) { + kfree(entries); + goto no_msi; + } + /* 1 is reserved for events (asyncrounous EQ) */ + dev->caps.num_comp_vectors = nreq - 1; + + priv->eq_table.eq[MLX4_EQ_ASYNC].irq = entries[0].vector; + bitmap_zero(priv->eq_table.eq[MLX4_EQ_ASYNC].actv_ports.ports, + dev->caps.num_ports); + + for (i = 0; i < dev->caps.num_comp_vectors + 1; i++) { + if (i == MLX4_EQ_ASYNC) + continue; + + priv->eq_table.eq[i].irq = + entries[i + 1 - !!(i > MLX4_EQ_ASYNC)].vector; + + if (MLX4_IS_LEGACY_EQ_MODE(dev->caps)) { + bitmap_fill(priv->eq_table.eq[i].actv_ports.ports, + dev->caps.num_ports); + /* We don't set affinity hint when there + * aren't enough EQs + */ + } else { + set_bit(port, + priv->eq_table.eq[i].actv_ports.ports); + if 
(mlx4_init_affinity_hint(dev, port + 1, i)) + mlx4_warn(dev, "Couldn't init hint cpumask for EQ %d\n", + i); + } + /* We divide the Eqs evenly between the two ports. + * (dev->caps.num_comp_vectors / dev->caps.num_ports) + * refers to the number of Eqs per port + * (i.e eqs_per_port). Theoretically, we would like to + * write something like (i + 1) % eqs_per_port == 0. + * However, since there's an asynchronous Eq, we have + * to skip over it by comparing this condition to + * !!((i + 1) > MLX4_EQ_ASYNC). + */ + if ((dev->caps.num_comp_vectors > dev->caps.num_ports) && + ((i + 1) % + (dev->caps.num_comp_vectors / dev->caps.num_ports)) == + !!((i + 1) > MLX4_EQ_ASYNC)) + /* If dev->caps.num_comp_vectors < dev->caps.num_ports, + * everything is shared anyway. + */ + port++; + } + + dev->flags |= MLX4_FLAG_MSI_X; + + kfree(entries); + return; + } + +no_msi: + dev->caps.num_comp_vectors = 1; + + BUG_ON(MLX4_EQ_ASYNC >= 2); + for (i = 0; i < 2; ++i) { + priv->eq_table.eq[i].irq = dev->persist->pdev->irq; + if (i != MLX4_EQ_ASYNC) { + bitmap_fill(priv->eq_table.eq[i].actv_ports.ports, + dev->caps.num_ports); + } + } +} + +static int mlx4_init_port_info(struct mlx4_dev *dev, int port) +{ + struct devlink *devlink = priv_to_devlink(mlx4_priv(dev)); + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + int err; + + err = devlink_port_register(devlink, &info->devlink_port, port); + if (err) + return err; + + info->dev = dev; + info->port = port; + if (!mlx4_is_slave(dev)) { + mlx4_init_mac_table(dev, &info->mac_table); + mlx4_init_vlan_table(dev, &info->vlan_table); + mlx4_init_roce_gid_table(dev, &info->gid_table); + info->base_qpn = mlx4_get_base_qpn(dev, port); + } + + sprintf(info->dev_name, "mlx4_port%d", port); + info->port_attr.attr.name = info->dev_name; + if (mlx4_is_mfunc(dev)) { + info->port_attr.attr.mode = 0444; + } else { + info->port_attr.attr.mode = 0644; + info->port_attr.store = set_port_type; + } + info->port_attr.show = show_port_type; + sysfs_attr_init(&info->port_attr.attr); + + err = device_create_file(&dev->persist->pdev->dev, &info->port_attr); + if (err) { + mlx4_err(dev, "Failed to create file for port %d\n", port); + devlink_port_unregister(&info->devlink_port); + info->port = -1; + return err; + } + + sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port); + info->port_mtu_attr.attr.name = info->dev_mtu_name; + if (mlx4_is_mfunc(dev)) { + info->port_mtu_attr.attr.mode = 0444; + } else { + info->port_mtu_attr.attr.mode = 0644; + info->port_mtu_attr.store = set_port_ib_mtu; + } + info->port_mtu_attr.show = show_port_ib_mtu; + sysfs_attr_init(&info->port_mtu_attr.attr); + + err = device_create_file(&dev->persist->pdev->dev, + &info->port_mtu_attr); + if (err) { + mlx4_err(dev, "Failed to create mtu file for port %d\n", port); + device_remove_file(&info->dev->persist->pdev->dev, + &info->port_attr); + devlink_port_unregister(&info->devlink_port); + info->port = -1; + return err; + } + + return 0; +} + +static void mlx4_cleanup_port_info(struct mlx4_port_info *info) +{ + if (info->port < 0) + return; + + device_remove_file(&info->dev->persist->pdev->dev, &info->port_attr); + device_remove_file(&info->dev->persist->pdev->dev, + &info->port_mtu_attr); + devlink_port_unregister(&info->devlink_port); + +#ifdef CONFIG_RFS_ACCEL + free_irq_cpu_rmap(info->rmap); + info->rmap = NULL; +#endif +} + +static int mlx4_init_steering(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int num_entries = dev->caps.num_ports; + int i, j; + + priv->steer = 
kzalloc(sizeof(struct mlx4_steer) * num_entries, GFP_KERNEL); + if (!priv->steer) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) + for (j = 0; j < MLX4_NUM_STEERS; j++) { + INIT_LIST_HEAD(&priv->steer[i].promisc_qps[j]); + INIT_LIST_HEAD(&priv->steer[i].steer_entries[j]); + } + return 0; +} + +static void mlx4_clear_steering(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_steer_index *entry, *tmp_entry; + struct mlx4_promisc_qp *pqp, *tmp_pqp; + int num_entries = dev->caps.num_ports; + int i, j; + + for (i = 0; i < num_entries; i++) { + for (j = 0; j < MLX4_NUM_STEERS; j++) { + list_for_each_entry_safe(pqp, tmp_pqp, + &priv->steer[i].promisc_qps[j], + list) { + list_del(&pqp->list); + kfree(pqp); + } + list_for_each_entry_safe(entry, tmp_entry, + &priv->steer[i].steer_entries[j], + list) { + list_del(&entry->list); + list_for_each_entry_safe(pqp, tmp_pqp, + &entry->duplicates, + list) { + list_del(&pqp->list); + kfree(pqp); + } + kfree(entry); + } + } + } + kfree(priv->steer); +} + +static int extended_func_num(struct pci_dev *pdev) +{ + return PCI_SLOT(pdev->devfn) * 8 + PCI_FUNC(pdev->devfn); +} + +#define MLX4_OWNER_BASE 0x8069c +#define MLX4_OWNER_SIZE 4 + +static int mlx4_get_ownership(struct mlx4_dev *dev) +{ + void __iomem *owner; + u32 ret; + + if (pci_channel_offline(dev->persist->pdev)) + return -EIO; + + owner = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_OWNER_BASE, + MLX4_OWNER_SIZE); + if (!owner) { + mlx4_err(dev, "Failed to obtain ownership bit\n"); + return -ENOMEM; + } + + ret = readl(owner); + iounmap(owner); + return (int) !!ret; +} + +static void mlx4_free_ownership(struct mlx4_dev *dev) +{ + void __iomem *owner; + + if (pci_channel_offline(dev->persist->pdev)) + return; + + owner = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_OWNER_BASE, + MLX4_OWNER_SIZE); + if (!owner) { + mlx4_err(dev, "Failed to obtain ownership bit\n"); + return; + } + writel(0, owner); + msleep(1000); + iounmap(owner); +} + +#define SRIOV_VALID_STATE(flags) (!!((flags) & MLX4_FLAG_SRIOV) ==\ + !!((flags) & MLX4_FLAG_MASTER)) + +static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev, + u8 total_vfs, int existing_vfs, int reset_flow) +{ + u64 dev_flags = dev->flags; + int err = 0; + int fw_enabled_sriov_vfs = min(pci_sriov_get_totalvfs(pdev), + MLX4_MAX_NUM_VF); + + if (reset_flow) { + dev->dev_vfs = kcalloc(total_vfs, sizeof(*dev->dev_vfs), + GFP_KERNEL); + if (!dev->dev_vfs) + goto free_mem; + return dev_flags; + } + + atomic_inc(&pf_loading); + if (dev->flags & MLX4_FLAG_SRIOV) { + if (existing_vfs != total_vfs) { + mlx4_err(dev, "SR-IOV was already enabled, but with num_vfs (%d) different than requested (%d)\n", + existing_vfs, total_vfs); + total_vfs = existing_vfs; + } + } + + dev->dev_vfs = kzalloc(total_vfs * sizeof(*dev->dev_vfs), GFP_KERNEL); + if (NULL == dev->dev_vfs) { + mlx4_err(dev, "Failed to allocate memory for VFs\n"); + goto disable_sriov; + } + + if (!(dev->flags & MLX4_FLAG_SRIOV)) { + if (total_vfs > fw_enabled_sriov_vfs) { + mlx4_err(dev, "requested vfs (%d) > available vfs (%d). 
Continuing without SR_IOV\n", + total_vfs, fw_enabled_sriov_vfs); + err = -ENOMEM; + goto disable_sriov; + } + mlx4_warn(dev, "Enabling SR-IOV with %d VFs\n", total_vfs); + err = pci_enable_sriov(pdev, total_vfs); + } + if (err) { + mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d)\n", + err); + goto disable_sriov; + } else { + mlx4_warn(dev, "Running in master mode\n"); + dev_flags |= MLX4_FLAG_SRIOV | + MLX4_FLAG_MASTER; + dev_flags &= ~MLX4_FLAG_SLAVE; + dev->persist->num_vfs = total_vfs; + } + return dev_flags; + +disable_sriov: + atomic_dec(&pf_loading); +free_mem: + dev->persist->num_vfs = 0; + kfree(dev->dev_vfs); + dev->dev_vfs = NULL; + return dev_flags & ~MLX4_FLAG_MASTER; +} + +enum { + MLX4_DEV_CAP_CHECK_NUM_VFS_ABOVE_64 = -1, +}; + +static int mlx4_check_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, + int *nvfs) +{ + int requested_vfs = nvfs[0] + nvfs[1] + nvfs[2]; + /* Checking for 64 VFs as a limitation of CX2 */ + if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_80_VFS) && + requested_vfs >= 64) { + mlx4_err(dev, "Requested %d VFs, but FW does not support more than 64\n", + requested_vfs); + return MLX4_DEV_CAP_CHECK_NUM_VFS_ABOVE_64; + } + return 0; +} + +static int mlx4_pci_enable_device(struct mlx4_dev *dev) +{ + struct pci_dev *pdev = dev->persist->pdev; + int err = 0; + + mutex_lock(&dev->persist->pci_status_mutex); + if (dev->persist->pci_status == MLX4_PCI_STATUS_DISABLED) { + err = pci_enable_device(pdev); + if (!err) + dev->persist->pci_status = MLX4_PCI_STATUS_ENABLED; + } + mutex_unlock(&dev->persist->pci_status_mutex); + + return err; +} + +static void mlx4_pci_disable_device(struct mlx4_dev *dev) +{ + struct pci_dev *pdev = dev->persist->pdev; + + mutex_lock(&dev->persist->pci_status_mutex); + if (dev->persist->pci_status == MLX4_PCI_STATUS_ENABLED) { + pci_disable_device(pdev); + dev->persist->pci_status = MLX4_PCI_STATUS_DISABLED; + } + mutex_unlock(&dev->persist->pci_status_mutex); +} + +static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, + int total_vfs, int *nvfs, struct mlx4_priv *priv, + int reset_flow) +{ + struct mlx4_dev *dev; + unsigned sum = 0; + int err; + int port; + int i; + struct mlx4_dev_cap *dev_cap = NULL; + int existing_vfs = 0; + + dev = &priv->dev; + + INIT_LIST_HEAD(&priv->ctx_list); + spin_lock_init(&priv->ctx_lock); + + mutex_init(&priv->port_mutex); + mutex_init(&priv->bond_mutex); + + INIT_LIST_HEAD(&priv->pgdir_list); + mutex_init(&priv->pgdir_mutex); + spin_lock_init(&priv->cmd.context_lock); + + INIT_LIST_HEAD(&priv->bf_list); + mutex_init(&priv->bf_mutex); + + dev->rev_id = pdev->revision; + dev->numa_node = dev_to_node(&pdev->dev); + + /* Detect if this device is a virtual function */ + if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { + mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); + dev->flags |= MLX4_FLAG_SLAVE; + } else { + /* We reset the device and enable SRIOV only for physical + * devices. Try to claim ownership on the device; + * if already taken, skip -- do not allow multiple PFs */ + err = mlx4_get_ownership(dev); + if (err) { + if (err < 0) + return err; + else { + mlx4_warn(dev, "Multiple PFs not yet supported - Skipping PF\n"); + return -EINVAL; + } + } + + atomic_set(&priv->opreq_count, 0); + INIT_WORK(&priv->opreq_task, mlx4_opreq_action); + + /* + * Now reset the HCA before we touch the PCI capabilities or + * attempt a firmware command, since a boot ROM may have left + * the HCA in an undefined state. 
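+ * Only the PF path reaches this reset; a VF instance took the
+ * MLX4_FLAG_SLAVE branch above and does not reset the HCA itself.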
+ */ + err = mlx4_reset(dev); + if (err) { + mlx4_err(dev, "Failed to reset HCA, aborting\n"); + goto err_sriov; + } + + if (total_vfs) { + dev->flags = MLX4_FLAG_MASTER; + existing_vfs = pci_num_vf(pdev); + if (existing_vfs) + dev->flags |= MLX4_FLAG_SRIOV; + dev->persist->num_vfs = total_vfs; + } + } + + /* on load remove any previous indication of internal error, + * device is up. + */ + dev->persist->state = MLX4_DEVICE_STATE_UP; + +slave_start: + err = mlx4_cmd_init(dev); + if (err) { + mlx4_err(dev, "Failed to init command interface, aborting\n"); + goto err_sriov; + } + + /* In slave functions, the communication channel must be initialized + * before posting commands. Also, init num_slaves before calling + * mlx4_init_hca */ + if (mlx4_is_mfunc(dev)) { + if (mlx4_is_master(dev)) { + dev->num_slaves = MLX4_MAX_NUM_SLAVES; + + } else { + dev->num_slaves = 0; + err = mlx4_multi_func_init(dev); + if (err) { + mlx4_err(dev, "Failed to init slave mfunc interface, aborting\n"); + goto err_cmd; + } + } + } + + err = mlx4_init_fw(dev); + if (err) { + mlx4_err(dev, "Failed to init fw, aborting.\n"); + goto err_mfunc; + } + + if (mlx4_is_master(dev)) { + /* when we hit the goto slave_start below, dev_cap already initialized */ + if (!dev_cap) { + dev_cap = kzalloc(sizeof(*dev_cap), GFP_KERNEL); + + if (!dev_cap) { + err = -ENOMEM; + goto err_fw; + } + + err = mlx4_QUERY_DEV_CAP(dev, dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + goto err_fw; + } + + if (mlx4_check_dev_cap(dev, dev_cap, nvfs)) + goto err_fw; + + if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) { + u64 dev_flags = mlx4_enable_sriov(dev, pdev, + total_vfs, + existing_vfs, + reset_flow); + + mlx4_close_fw(dev); + mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); + dev->flags = dev_flags; + if (!SRIOV_VALID_STATE(dev->flags)) { + mlx4_err(dev, "Invalid SRIOV state\n"); + goto err_sriov; + } + err = mlx4_reset(dev); + if (err) { + mlx4_err(dev, "Failed to reset HCA, aborting.\n"); + goto err_sriov; + } + goto slave_start; + } + } else { + /* Legacy mode FW requires SRIOV to be enabled before + * doing QUERY_DEV_CAP, since max_eq's value is different if + * SRIOV is enabled. 
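+ * This branch runs on the second pass through slave_start, after
+ * mlx4_enable_sriov() above, so the capabilities are re-queried
+ * with SR-IOV active.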
+ */ + memset(dev_cap, 0, sizeof(*dev_cap)); + err = mlx4_QUERY_DEV_CAP(dev, dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); + goto err_fw; + } + + if (mlx4_check_dev_cap(dev, dev_cap, nvfs)) + goto err_fw; + } + } + + err = mlx4_init_hca(dev); + if (err) { + if (err == -EACCES) { + /* Not primary Physical function + * Running in slave mode */ + mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); + /* We're not a PF */ + if (dev->flags & MLX4_FLAG_SRIOV) { + if (!existing_vfs) + pci_disable_sriov(pdev); + if (mlx4_is_master(dev) && !reset_flow) + atomic_dec(&pf_loading); + dev->flags &= ~MLX4_FLAG_SRIOV; + } + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + dev->flags |= MLX4_FLAG_SLAVE; + dev->flags &= ~MLX4_FLAG_MASTER; + goto slave_start; + } else + goto err_fw; + } + + if (mlx4_is_master(dev) && (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) { + u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, + existing_vfs, reset_flow); + + if ((dev->flags ^ dev_flags) & (MLX4_FLAG_MASTER | MLX4_FLAG_SLAVE)) { + mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_VHCR); + dev->flags = dev_flags; + err = mlx4_cmd_init(dev); + if (err) { + /* Only VHCR is cleaned up, so could still + * send FW commands + */ + mlx4_err(dev, "Failed to init VHCR command interface, aborting\n"); + goto err_close; + } + } else { + dev->flags = dev_flags; + } + + if (!SRIOV_VALID_STATE(dev->flags)) { + mlx4_err(dev, "Invalid SRIOV state\n"); + goto err_close; + } + } + + /* check if the device is functioning at its maximum possible speed. + * No return code for this call, just warn the user in case of PCI + * express device capabilities are under-satisfied by the bus. + */ + if (!mlx4_is_slave(dev)) + pcie_print_link_status(dev->persist->pdev); + + /* In master functions, the communication channel must be initialized + * after obtaining its address from fw */ + if (mlx4_is_master(dev)) { + if (dev->caps.num_ports < 2 && + num_vfs_argc > 1) { + err = -EINVAL; + mlx4_err(dev, + "Error: Trying to configure VFs on port 2, but HCA has only %d physical ports\n", + dev->caps.num_ports); + goto err_close; + } + memcpy(dev->persist->nvfs, nvfs, sizeof(dev->persist->nvfs)); + + for (i = 0; + i < sizeof(dev->persist->nvfs)/ + sizeof(dev->persist->nvfs[0]); i++) { + unsigned j; + + for (j = 0; j < dev->persist->nvfs[i]; ++sum, ++j) { + dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1; + dev->dev_vfs[sum].n_ports = i < 2 ? 
1 : + dev->caps.num_ports; + } + } + + /* In master functions, the communication channel + * must be initialized after obtaining its address from fw + */ + err = mlx4_multi_func_init(dev); + if (err) { + mlx4_err(dev, "Failed to init master mfunc interface, aborting.\n"); + goto err_close; + } + } + + err = mlx4_alloc_eq_table(dev); + if (err) + goto err_master_mfunc; + + bitmap_zero(priv->msix_ctl.pool_bm, MAX_MSIX); + mutex_init(&priv->msix_ctl.pool_lock); + + mlx4_enable_msi_x(dev); + if ((mlx4_is_mfunc(dev)) && + !(dev->flags & MLX4_FLAG_MSI_X)) { + err = -EOPNOTSUPP; + mlx4_err(dev, "INTx is not supported in multi-function mode, aborting\n"); + goto err_free_eq; + } + + if (!mlx4_is_slave(dev)) { + err = mlx4_init_steering(dev); + if (err) + goto err_disable_msix; + } + + mlx4_init_quotas(dev); + + err = mlx4_setup_hca(dev); + if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X) && + !mlx4_is_mfunc(dev)) { + dev->flags &= ~MLX4_FLAG_MSI_X; + dev->caps.num_comp_vectors = 1; + pci_disable_msix(pdev); + err = mlx4_setup_hca(dev); + } + + if (err) + goto err_steer; + + /* When PF resources are ready arm its comm channel to enable + * getting commands + */ + if (mlx4_is_master(dev)) { + err = mlx4_ARM_COMM_CHANNEL(dev); + if (err) { + mlx4_err(dev, " Failed to arm comm channel eq: %x\n", + err); + goto err_steer; + } + } + + for (port = 1; port <= dev->caps.num_ports; port++) { + err = mlx4_init_port_info(dev, port); + if (err) + goto err_port; + } + + priv->v2p.port1 = 1; + priv->v2p.port2 = 2; + + err = mlx4_register_device(dev); + if (err) + goto err_port; + + mlx4_request_modules(dev); + + mlx4_sense_init(dev); + mlx4_start_sense(dev); + + priv->removed = 0; + + if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow) + atomic_dec(&pf_loading); + + kfree(dev_cap); + return 0; + +err_port: + for (--port; port >= 1; --port) + mlx4_cleanup_port_info(&priv->port[port]); + + mlx4_cleanup_default_counters(dev); + if (!mlx4_is_slave(dev)) + mlx4_cleanup_counters_table(dev); + mlx4_cleanup_qp_table(dev); + mlx4_cleanup_srq_table(dev); + mlx4_cleanup_cq_table(dev); + mlx4_cmd_use_polling(dev); + mlx4_cleanup_eq_table(dev); + mlx4_cleanup_mcg_table(dev); + mlx4_cleanup_mr_table(dev); + mlx4_cleanup_xrcd_table(dev); + mlx4_cleanup_pd_table(dev); + mlx4_cleanup_uar_table(dev); + +err_steer: + if (!mlx4_is_slave(dev)) + mlx4_clear_steering(dev); + +err_disable_msix: + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + +err_free_eq: + mlx4_free_eq_table(dev); + +err_master_mfunc: + if (mlx4_is_master(dev)) { + mlx4_free_resource_tracker(dev, RES_TR_FREE_STRUCTS_ONLY); + mlx4_multi_func_cleanup(dev); + } + + if (mlx4_is_slave(dev)) + mlx4_slave_destroy_special_qp_cap(dev); + +err_close: + mlx4_close_hca(dev); + +err_fw: + mlx4_close_fw(dev); + +err_mfunc: + if (mlx4_is_slave(dev)) + mlx4_multi_func_cleanup(dev); + +err_cmd: + mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); + +err_sriov: + if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) { + pci_disable_sriov(pdev); + dev->flags &= ~MLX4_FLAG_SRIOV; + } + + if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow) + atomic_dec(&pf_loading); + + kfree(priv->dev.dev_vfs); + + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + + kfree(dev_cap); + return err; +} + +static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, + struct mlx4_priv *priv) +{ + int err; + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = { + 
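+ /* param_map maps the position of each num_vfs/probe_vf module
+ * parameter value to an nvfs[]/prb_vf[] slot, keyed by how many
+ * values were given: slots 0 and 1 hold single-port VFs for ports
+ * 1 and 2, slot 2 holds dual-port VFs, so a single value applies
+ * to both ports.
+ */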
{2, 0, 0}, {0, 1, 2}, {0, 1, 2} }; + unsigned total_vfs = 0; + unsigned int i; + + pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); + + err = mlx4_pci_enable_device(&priv->dev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); + return err; + } + + /* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS + * per port, we must limit the number of VFs to 63 (since their are + * 128 MACs) + */ + for (i = 0; i < ARRAY_SIZE(nvfs) && i < num_vfs_argc; + total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) { + nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i]; + if (nvfs[i] < 0) { + dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + for (i = 0; i < ARRAY_SIZE(prb_vf) && i < probe_vfs_argc; + i++) { + prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i]; + if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) { + dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n"); + err = -EINVAL; + goto err_disable_pdev; + } + } + if (total_vfs > MLX4_MAX_NUM_VF) { + dev_err(&pdev->dev, + "Requested more VF's (%d) than allowed by hw (%d)\n", + total_vfs, MLX4_MAX_NUM_VF); + err = -EINVAL; + goto err_disable_pdev; + } + + for (i = 0; i < MLX4_MAX_PORTS; i++) { + if (nvfs[i] + nvfs[2] > MLX4_MAX_NUM_VF_P_PORT) { + dev_err(&pdev->dev, + "Requested more VF's (%d) for port (%d) than allowed by driver (%d)\n", + nvfs[i] + nvfs[2], i + 1, + MLX4_MAX_NUM_VF_P_PORT); + err = -EINVAL; + goto err_disable_pdev; + } + } + + /* Check for BARs. */ + if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) && + !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n", + pci_dev_data, pci_resource_flags(pdev, 0)); + err = -ENODEV; + goto err_disable_pdev; + } + if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing UAR, aborting\n"); + err = -ENODEV; + goto err_disable_pdev; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); + goto err_disable_pdev; + } + + pci_set_master(pdev); + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n"); + goto err_release_regions; + } + } + + /* Allow large DMA segments, up to the firmware limit of 1 GB */ + dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); + /* Detect if this device is a virtual function */ + if (pci_dev_data & MLX4_PCI_DEV_IS_VF) { + /* When acting as pf, we normally skip vfs unless explicitly + * requested to probe them. 
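+ * extended_func_num() gives this VF's position among the enabled
+ * VFs; the function is bound only if that position falls within
+ * the corresponding probe_vf quota.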
+ */ + if (total_vfs) { + unsigned vfs_offset = 0; + + for (i = 0; i < ARRAY_SIZE(nvfs) && + vfs_offset + nvfs[i] < extended_func_num(pdev); + vfs_offset += nvfs[i], i++) + ; + if (i == ARRAY_SIZE(nvfs)) { + err = -ENODEV; + goto err_release_regions; + } + if ((extended_func_num(pdev) - vfs_offset) + > prb_vf[i]) { + dev_warn(&pdev->dev, "Skipping virtual function:%d\n", + extended_func_num(pdev)); + err = -ENODEV; + goto err_release_regions; + } + } + } + + err = mlx4_catas_init(&priv->dev); + if (err) + goto err_release_regions; + + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 0); + if (err) + goto err_catas; + + return 0; + +err_catas: + mlx4_catas_end(&priv->dev); + +err_release_regions: + pci_release_regions(pdev); + +err_disable_pdev: + mlx4_pci_disable_device(&priv->dev); + return err; +} + +static int mlx4_devlink_port_type_set(struct devlink_port *devlink_port, + enum devlink_port_type port_type) +{ + struct mlx4_port_info *info = container_of(devlink_port, + struct mlx4_port_info, + devlink_port); + enum mlx4_port_type mlx4_port_type; + + switch (port_type) { + case DEVLINK_PORT_TYPE_AUTO: + mlx4_port_type = MLX4_PORT_TYPE_AUTO; + break; + case DEVLINK_PORT_TYPE_ETH: + mlx4_port_type = MLX4_PORT_TYPE_ETH; + break; + case DEVLINK_PORT_TYPE_IB: + mlx4_port_type = MLX4_PORT_TYPE_IB; + break; + default: + return -EOPNOTSUPP; + } + + return __set_port_type(info, mlx4_port_type); +} + +static const struct devlink_ops mlx4_devlink_ops = { + .port_type_set = mlx4_devlink_port_type_set, +}; + +static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct devlink *devlink; + struct mlx4_priv *priv; + struct mlx4_dev *dev; + int ret; + + printk_once(KERN_INFO "%s", mlx4_version); + + devlink = devlink_alloc(&mlx4_devlink_ops, sizeof(*priv)); + if (!devlink) + return -ENOMEM; + priv = devlink_priv(devlink); + + dev = &priv->dev; + dev->persist = kzalloc(sizeof(*dev->persist), GFP_KERNEL); + if (!dev->persist) { + ret = -ENOMEM; + goto err_devlink_free; + } + dev->persist->pdev = pdev; + dev->persist->dev = dev; + pci_set_drvdata(pdev, dev->persist); + priv->pci_dev_data = id->driver_data; + mutex_init(&dev->persist->device_state_mutex); + mutex_init(&dev->persist->interface_state_mutex); + mutex_init(&dev->persist->pci_status_mutex); + + ret = devlink_register(devlink, &pdev->dev); + if (ret) + goto err_persist_free; + + ret = __mlx4_init_one(pdev, id->driver_data, priv); + if (ret) + goto err_devlink_unregister; + + pci_save_state(pdev); + return 0; + +err_devlink_unregister: + devlink_unregister(devlink); +err_persist_free: + kfree(dev->persist); +err_devlink_free: + devlink_free(devlink); + return ret; +} + +static void mlx4_clean_dev(struct mlx4_dev *dev) +{ + struct mlx4_dev_persistent *persist = dev->persist; + struct mlx4_priv *priv = mlx4_priv(dev); + unsigned long flags = (dev->flags & RESET_PERSIST_MASK_FLAGS); + + memset(priv, 0, sizeof(*priv)); + priv->dev.persist = persist; + priv->dev.flags = flags; +} + +static void mlx4_unload_one(struct pci_dev *pdev) +{ + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; + struct mlx4_priv *priv = mlx4_priv(dev); + int pci_dev_data; + int p, i; + + if (priv->removed) + return; + + /* saving current ports type for further use */ + for (i = 0; i < dev->caps.num_ports; i++) { + dev->persist->curr_port_type[i] = dev->caps.port_type[i + 1]; + dev->persist->curr_port_poss_type[i] = dev->caps. 
+ possible_type[i + 1]; + } + + pci_dev_data = priv->pci_dev_data; + + mlx4_stop_sense(dev); + mlx4_unregister_device(dev); + + for (p = 1; p <= dev->caps.num_ports; p++) { + mlx4_cleanup_port_info(&priv->port[p]); + mlx4_CLOSE_PORT(dev, p); + } + + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev, + RES_TR_FREE_SLAVES_ONLY); + + mlx4_cleanup_default_counters(dev); + if (!mlx4_is_slave(dev)) + mlx4_cleanup_counters_table(dev); + mlx4_cleanup_qp_table(dev); + mlx4_cleanup_srq_table(dev); + mlx4_cleanup_cq_table(dev); + mlx4_cmd_use_polling(dev); + mlx4_cleanup_eq_table(dev); + mlx4_cleanup_mcg_table(dev); + mlx4_cleanup_mr_table(dev); + mlx4_cleanup_xrcd_table(dev); + mlx4_cleanup_pd_table(dev); + + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev, + RES_TR_FREE_STRUCTS_ONLY); + + iounmap(priv->kar); + mlx4_uar_free(dev, &priv->driver_uar); + mlx4_cleanup_uar_table(dev); + if (!mlx4_is_slave(dev)) + mlx4_clear_steering(dev); + mlx4_free_eq_table(dev); + if (mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); + mlx4_close_hca(dev); + mlx4_close_fw(dev); + if (mlx4_is_slave(dev)) + mlx4_multi_func_cleanup(dev); + mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); + + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + + mlx4_slave_destroy_special_qp_cap(dev); + kfree(dev->dev_vfs); + + mlx4_clean_dev(dev); + priv->pci_dev_data = pci_dev_data; + priv->removed = 1; +} + +static void mlx4_remove_one(struct pci_dev *pdev) +{ + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; + struct mlx4_priv *priv = mlx4_priv(dev); + struct devlink *devlink = priv_to_devlink(priv); + int active_vfs = 0; + + if (mlx4_is_slave(dev)) + persist->interface_state |= MLX4_INTERFACE_STATE_NOWAIT; + + mutex_lock(&persist->interface_state_mutex); + persist->interface_state |= MLX4_INTERFACE_STATE_DELETION; + mutex_unlock(&persist->interface_state_mutex); + + /* Disabling SR-IOV is not allowed while there are active vf's */ + if (mlx4_is_master(dev) && dev->flags & MLX4_FLAG_SRIOV) { + active_vfs = mlx4_how_many_lives_vf(dev); + if (active_vfs) { + pr_warn("Removing PF when there are active VF's !!\n"); + pr_warn("Will not disable SR-IOV.\n"); + } + } + + /* device marked to be under deletion running now without the lock + * letting other tasks to be terminated + */ + if (persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_unload_one(pdev); + else + mlx4_info(dev, "%s: interface is down\n", __func__); + mlx4_catas_end(dev); + if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) { + mlx4_warn(dev, "Disabling SR-IOV\n"); + pci_disable_sriov(pdev); + } + + pci_release_regions(pdev); + mlx4_pci_disable_device(dev); + devlink_unregister(devlink); + kfree(dev->persist); + devlink_free(devlink); +} + +static int restore_current_port_types(struct mlx4_dev *dev, + enum mlx4_port_type *types, + enum mlx4_port_type *poss_types) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err, i; + + mlx4_stop_sense(dev); + + mutex_lock(&priv->port_mutex); + for (i = 0; i < dev->caps.num_ports; i++) + dev->caps.possible_type[i + 1] = poss_types[i]; + err = mlx4_change_port_types(dev, types); + mlx4_start_sense(dev); + mutex_unlock(&priv->port_mutex); + + return err; +} + +int mlx4_restart_one(struct pci_dev *pdev) +{ + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; + struct mlx4_priv *priv = mlx4_priv(dev); + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int 
pci_dev_data, err, total_vfs; + + pci_dev_data = priv->pci_dev_data; + total_vfs = dev->persist->num_vfs; + memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs)); + + mlx4_unload_one(pdev); + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 1); + if (err) { + mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n", + __func__, pci_name(pdev), err); + return err; + } + + err = restore_current_port_types(dev, dev->persist->curr_port_type, + dev->persist->curr_port_poss_type); + if (err) + mlx4_err(dev, "could not restore original port types (%d)\n", + err); + + return err; +} + +#define MLX_SP(id) { PCI_VDEVICE(MELLANOX, id), MLX4_PCI_DEV_FORCE_SENSE_PORT } +#define MLX_VF(id) { PCI_VDEVICE(MELLANOX, id), MLX4_PCI_DEV_IS_VF } +#define MLX_GN(id) { PCI_VDEVICE(MELLANOX, id), 0 } + +static const struct pci_device_id mlx4_pci_table[] = { +#ifdef CONFIG_MLX4_CORE_GEN2 + /* MT25408 "Hermon" */ + MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_SDR), /* SDR */ + MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_DDR), /* DDR */ + MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_QDR), /* QDR */ + MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_DDR_GEN2), /* DDR Gen2 */ + MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_QDR_GEN2), /* QDR Gen2 */ + MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_EN), /* EN 10GigE */ + MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_EN_GEN2), /* EN 10GigE Gen2 */ + /* MT25458 ConnectX EN 10GBASE-T */ + MLX_SP(PCI_DEVICE_ID_MELLANOX_CONNECTX_EN), + MLX_SP(PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_T_GEN2), /* Gen2 */ + /* MT26468 ConnectX EN 10GigE PCIe Gen2*/ + MLX_SP(PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_GEN2), + /* MT26438 ConnectX EN 40GigE PCIe Gen2 5GT/s */ + MLX_SP(PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_5_GEN2), + /* MT26478 ConnectX2 40GigE PCIe Gen2 */ + MLX_SP(PCI_DEVICE_ID_MELLANOX_CONNECTX2), + /* MT25400 Family [ConnectX-2] */ + MLX_VF(0x1002), /* Virtual Function */ +#endif /* CONFIG_MLX4_CORE_GEN2 */ + /* MT27500 Family [ConnectX-3] */ + MLX_GN(PCI_DEVICE_ID_MELLANOX_CONNECTX3), + MLX_VF(0x1004), /* Virtual Function */ + MLX_GN(0x1005), /* MT27510 Family */ + MLX_GN(0x1006), /* MT27511 Family */ + MLX_GN(PCI_DEVICE_ID_MELLANOX_CONNECTX3_PRO), /* MT27520 Family */ + MLX_GN(0x1008), /* MT27521 Family */ + MLX_GN(0x1009), /* MT27530 Family */ + MLX_GN(0x100a), /* MT27531 Family */ + MLX_GN(0x100b), /* MT27540 Family */ + MLX_GN(0x100c), /* MT27541 Family */ + MLX_GN(0x100d), /* MT27550 Family */ + MLX_GN(0x100e), /* MT27551 Family */ + MLX_GN(0x100f), /* MT27560 Family */ + MLX_GN(0x1010), /* MT27561 Family */ + + /* + * See the mellanox_check_broken_intx_masking() quirk when + * adding devices + */ + + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mlx4_pci_table); + +static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + + mlx4_err(persist->dev, "mlx4_pci_err_detected was called\n"); + mlx4_enter_error_state(persist); + + mutex_lock(&persist->interface_state_mutex); + if (persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_unload_one(pdev); + + mutex_unlock(&persist->interface_state_mutex); + if (state == pci_channel_io_perm_failure) + return PCI_ERS_RESULT_DISCONNECT; + + mlx4_pci_disable_device(persist->dev); + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) +{ + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; + int err; + + mlx4_err(dev, "mlx4_pci_slot_reset was called\n"); + err = 
mlx4_pci_enable_device(dev); + if (err) { + mlx4_err(dev, "Can not re-enable device, err=%d\n", err); + return PCI_ERS_RESULT_DISCONNECT; + } + + pci_set_master(pdev); + pci_restore_state(pdev); + pci_save_state(pdev); + return PCI_ERS_RESULT_RECOVERED; +} + +static void mlx4_pci_resume(struct pci_dev *pdev) +{ + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; + struct mlx4_priv *priv = mlx4_priv(dev); + int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; + int total_vfs; + int err; + + mlx4_err(dev, "%s was called\n", __func__); + total_vfs = dev->persist->num_vfs; + memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs)); + + mutex_lock(&persist->interface_state_mutex); + if (!(persist->interface_state & MLX4_INTERFACE_STATE_UP)) { + err = mlx4_load_one(pdev, priv->pci_dev_data, total_vfs, nvfs, + priv, 1); + if (err) { + mlx4_err(dev, "%s: mlx4_load_one failed, err=%d\n", + __func__, err); + goto end; + } + + err = restore_current_port_types(dev, dev->persist-> + curr_port_type, dev->persist-> + curr_port_poss_type); + if (err) + mlx4_err(dev, "could not restore original port types (%d)\n", err); + } +end: + mutex_unlock(&persist->interface_state_mutex); + +} + +static void mlx4_shutdown(struct pci_dev *pdev) +{ + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + + mlx4_info(persist->dev, "mlx4_shutdown was called\n"); + mutex_lock(&persist->interface_state_mutex); + if (persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_unload_one(pdev); + mutex_unlock(&persist->interface_state_mutex); +} + +static const struct pci_error_handlers mlx4_err_handler = { + .error_detected = mlx4_pci_err_detected, + .slot_reset = mlx4_pci_slot_reset, + .resume = mlx4_pci_resume, +}; + +static struct pci_driver mlx4_driver = { + .name = DRV_NAME, + .id_table = mlx4_pci_table, + .probe = mlx4_init_one, + .shutdown = mlx4_shutdown, + .remove = mlx4_remove_one, + .err_handler = &mlx4_err_handler, +}; + +static int __init mlx4_verify_params(void) +{ + if ((log_num_mac < 0) || (log_num_mac > 7)) { + pr_warn("mlx4_core: bad num_mac: %d\n", log_num_mac); + return -1; + } + + if (log_num_vlan != 0) + pr_warn("mlx4_core: log_num_vlan - obsolete module param, using %d\n", + MLX4_LOG_NUM_VLANS); + + if (use_prio != 0) + pr_warn("mlx4_core: use_prio - obsolete module param, ignored\n"); + + if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) { + pr_warn("mlx4_core: bad log_mtts_per_seg: %d\n", + log_mtts_per_seg); + return -1; + } + + /* Check if module param for ports type has legal combination */ + if (port_type_array[0] == false && port_type_array[1] == true) { + pr_warn("Module parameter configuration ETH/IB is not supported. Switching to default configuration IB/IB\n"); + port_type_array[0] = true; + } + + if (mlx4_log_num_mgm_entry_size < -7 || + (mlx4_log_num_mgm_entry_size > 0 && + (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE || + mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE))) { + pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-7..0 or %d..%d)\n", + mlx4_log_num_mgm_entry_size, + MLX4_MIN_MGM_LOG_ENTRY_SIZE, + MLX4_MAX_MGM_LOG_ENTRY_SIZE); + return -1; + } + + return 0; +} + +static int __init mlx4_init(void) +{ + int ret; + + if (mlx4_verify_params()) + return -EINVAL; + + + mlx4_wq = create_singlethread_workqueue("mlx4"); + if (!mlx4_wq) + return -ENOMEM; + + ret = pci_register_driver(&mlx4_driver); + if (ret < 0) + destroy_workqueue(mlx4_wq); + return ret < 0 ? 
ret : 0; +} + +static void __exit mlx4_cleanup(void) +{ + pci_unregister_driver(&mlx4_driver); + destroy_workqueue(mlx4_wq); +} + +module_init(mlx4_init); +module_exit(mlx4_cleanup); diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c new file mode 100644 index 0000000..4c5306d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c @@ -0,0 +1,1647 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include +#include +#include + +#include "mlx4.h" + +int mlx4_get_mgm_entry_size(struct mlx4_dev *dev) +{ + return 1 << dev->oper_log_mgm_entry_size; +} + +int mlx4_get_qp_per_mgm(struct mlx4_dev *dev) +{ + return 4 * (mlx4_get_mgm_entry_size(dev) / 16 - 2); +} + +static int mlx4_QP_FLOW_STEERING_ATTACH(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox, + u32 size, + u64 *reg_id) +{ + u64 imm; + int err = 0; + + err = mlx4_cmd_imm(dev, mailbox->dma, &imm, size, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + return err; + *reg_id = imm; + + return err; +} + +static int mlx4_QP_FLOW_STEERING_DETACH(struct mlx4_dev *dev, u64 regid) +{ + int err = 0; + + err = mlx4_cmd(dev, regid, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + return err; +} + +static int mlx4_READ_ENTRY(struct mlx4_dev *dev, int index, + struct mlx4_cmd_mailbox *mailbox) +{ + return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, MLX4_CMD_READ_MCG, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} + +static int mlx4_WRITE_ENTRY(struct mlx4_dev *dev, int index, + struct mlx4_cmd_mailbox *mailbox) +{ + return mlx4_cmd(dev, mailbox->dma, index, 0, MLX4_CMD_WRITE_MCG, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} + +static int mlx4_WRITE_PROMISC(struct mlx4_dev *dev, u8 port, u8 steer, + struct mlx4_cmd_mailbox *mailbox) +{ + u32 in_mod; + + in_mod = (u32) port << 16 | steer << 1; + return mlx4_cmd(dev, mailbox->dma, in_mod, 0x1, + MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); +} + +static int mlx4_GID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + u16 *hash, u8 op_mod) +{ + u64 imm; + int err; + + err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, op_mod, + MLX4_CMD_MGID_HASH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + if (!err) + *hash = imm; + + return err; +} + +static struct mlx4_promisc_qp *get_promisc_qp(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_promisc_qp *pqp; + + if (port < 1 || port > dev->caps.num_ports) + return NULL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) { + if (pqp->qpn == qpn) + return pqp; + } + /* not found */ + return NULL; +} + +/* + * Add new entry to steering data structure. 
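+ * (the entry tracks a single MGM index for the given port and
+ * steer type).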
+ * All promisc QPs should be added as well + */ +static int new_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 members_count; + struct mlx4_steer_index *new_entry; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp = NULL; + u32 prot; + int err; + + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + + INIT_LIST_HEAD(&new_entry->duplicates); + new_entry->index = index; + list_add_tail(&new_entry->list, &s_steer->steer_entries[steer]); + + /* If the given qpn is also a promisc qp, + * it should be inserted to duplicates list + */ + pqp = get_promisc_qp(dev, port, steer, qpn); + if (pqp) { + dqp = kmalloc(sizeof(*dqp), GFP_KERNEL); + if (!dqp) { + err = -ENOMEM; + goto out_alloc; + } + dqp->qpn = qpn; + list_add_tail(&dqp->list, &new_entry->duplicates); + } + + /* if no promisc qps for this vep, we are done */ + if (list_empty(&s_steer->promisc_qps[steer])) + return 0; + + /* now need to add all the promisc qps to the new + * steering entry, as they should also receive the packets + * destined to this address */ + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = -ENOMEM; + goto out_alloc; + } + mgm = mailbox->buf; + + err = mlx4_READ_ENTRY(dev, index, mailbox); + if (err) + goto out_mailbox; + + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + prot = be32_to_cpu(mgm->members_count) >> 30; + list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) { + /* don't add already existing qpn */ + if (pqp->qpn == qpn) + continue; + if (members_count == dev->caps.num_qp_per_mgm) { + /* out of space */ + err = -ENOMEM; + goto out_mailbox; + } + + /* add the qpn */ + mgm->qp[members_count++] = cpu_to_be32(pqp->qpn & MGM_QPN_MASK); + } + /* update the qps count and update the entry with all the promisc qps*/ + mgm->members_count = cpu_to_be32(members_count | (prot << 30)); + err = mlx4_WRITE_ENTRY(dev, index, mailbox); + +out_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); + if (!err) + return 0; +out_alloc: + if (dqp) { + list_del(&dqp->list); + kfree(dqp); + } + list_del(&new_entry->list); + kfree(new_entry); + return err; +} + +/* update the data structures with existing steering entry */ +static int existing_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_steer_index *tmp_entry, *entry = NULL; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp; + + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + pqp = get_promisc_qp(dev, port, steer, qpn); + if (!pqp) + return 0; /* nothing to do */ + + list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) { + if (tmp_entry->index == index) { + entry = tmp_entry; + break; + } + } + if (unlikely(!entry)) { + mlx4_warn(dev, "Steering entry at index %x is not registered\n", index); + return -EINVAL; + } + + /* the given qpn is listed as a promisc qpn + * we need to add it as a duplicate to this entry + * for future references */ + list_for_each_entry(dqp, &entry->duplicates, list) { + if (qpn == dqp->qpn) + return 0; /* qp is already duplicated */ + } + + /* add the qp as a duplicate on this index */ + dqp = 
kmalloc(sizeof(*dqp), GFP_KERNEL); + if (!dqp) + return -ENOMEM; + dqp->qpn = qpn; + list_add_tail(&dqp->list, &entry->duplicates); + + return 0; +} + +/* Check whether a qpn is a duplicate on steering entry + * If so, it should not be removed from mgm */ +static bool check_duplicate_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_steer_index *tmp_entry, *entry = NULL; + struct mlx4_promisc_qp *dqp, *tmp_dqp; + + if (port < 1 || port > dev->caps.num_ports) + return NULL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + /* if qp is not promisc, it cannot be duplicated */ + if (!get_promisc_qp(dev, port, steer, qpn)) + return false; + + /* The qp is promisc qp so it is a duplicate on this index + * Find the index entry, and remove the duplicate */ + list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) { + if (tmp_entry->index == index) { + entry = tmp_entry; + break; + } + } + if (unlikely(!entry)) { + mlx4_warn(dev, "Steering entry for index %x is not registered\n", index); + return false; + } + list_for_each_entry_safe(dqp, tmp_dqp, &entry->duplicates, list) { + if (dqp->qpn == qpn) { + list_del(&dqp->list); + kfree(dqp); + } + } + return true; +} + +/* Returns true if all the QPs != tqpn contained in this entry + * are Promisc QPs. Returns false otherwise. + */ +static bool promisc_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 tqpn, + u32 *members_count) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 m_count; + bool ret = false; + int i; + + if (port < 1 || port > dev->caps.num_ports) + return false; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return false; + mgm = mailbox->buf; + + if (mlx4_READ_ENTRY(dev, index, mailbox)) + goto out; + m_count = be32_to_cpu(mgm->members_count) & 0xffffff; + if (members_count) + *members_count = m_count; + + for (i = 0; i < m_count; i++) { + u32 qpn = be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK; + if (!get_promisc_qp(dev, port, steer, qpn) && qpn != tqpn) { + /* the qp is not promisc, the entry can't be removed */ + goto out; + } + } + ret = true; +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} + +/* IF a steering entry contains only promisc QPs, it can be removed. */ +static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, + unsigned int index, u32 tqpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_steer_index *entry = NULL, *tmp_entry; + u32 members_count; + bool ret = false; + + if (port < 1 || port > dev->caps.num_ports) + return NULL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + if (!promisc_steering_entry(dev, port, steer, index, + tqpn, &members_count)) + goto out; + + /* All the qps currently registered for this entry are promiscuous, + * Checking for duplicates */ + ret = true; + list_for_each_entry_safe(entry, tmp_entry, &s_steer->steer_entries[steer], list) { + if (entry->index == index) { + if (list_empty(&entry->duplicates) || + members_count == 1) { + struct mlx4_promisc_qp *pqp, *tmp_pqp; + /* If there is only 1 entry in duplicates then + * this is the QP we want to delete, going over + * the list and deleting the entry. 
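+ * Only the driver's bookkeeping is released here; the hardware MGM
+ * entry itself is cleared by the caller once this function reports
+ * that the entry can be removed.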
+ */ + list_del(&entry->list); + list_for_each_entry_safe(pqp, tmp_pqp, + &entry->duplicates, + list) { + list_del(&pqp->list); + kfree(pqp); + } + kfree(entry); + } else { + /* This entry contains duplicates so it shouldn't be removed */ + ret = false; + goto out; + } + } + } + +out: + return ret; +} + +static int add_promisc_qp(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, u32 qpn) +{ + struct mlx4_steer *s_steer; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + struct mlx4_steer_index *entry; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp; + u32 members_count; + u32 prot; + int i; + bool found; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + + mutex_lock(&priv->mcg_table.mutex); + + if (get_promisc_qp(dev, port, steer, qpn)) { + err = 0; /* Noting to do, already exists */ + goto out_mutex; + } + + pqp = kmalloc(sizeof(*pqp), GFP_KERNEL); + if (!pqp) { + err = -ENOMEM; + goto out_mutex; + } + pqp->qpn = qpn; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = -ENOMEM; + goto out_alloc; + } + mgm = mailbox->buf; + + if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) { + /* The promisc QP needs to be added for each one of the steering + * entries. If it already exists, needs to be added as + * a duplicate for this entry. + */ + list_for_each_entry(entry, + &s_steer->steer_entries[steer], + list) { + err = mlx4_READ_ENTRY(dev, entry->index, mailbox); + if (err) + goto out_mailbox; + + members_count = be32_to_cpu(mgm->members_count) & + 0xffffff; + prot = be32_to_cpu(mgm->members_count) >> 30; + found = false; + for (i = 0; i < members_count; i++) { + if ((be32_to_cpu(mgm->qp[i]) & + MGM_QPN_MASK) == qpn) { + /* Entry already exists. + * Add to duplicates. 
+ */ + dqp = kmalloc(sizeof(*dqp), GFP_KERNEL); + if (!dqp) { + err = -ENOMEM; + goto out_mailbox; + } + dqp->qpn = qpn; + list_add_tail(&dqp->list, + &entry->duplicates); + found = true; + } + } + if (!found) { + /* Need to add the qpn to mgm */ + if (members_count == + dev->caps.num_qp_per_mgm) { + /* entry is full */ + err = -ENOMEM; + goto out_mailbox; + } + mgm->qp[members_count++] = + cpu_to_be32(qpn & MGM_QPN_MASK); + mgm->members_count = + cpu_to_be32(members_count | + (prot << 30)); + err = mlx4_WRITE_ENTRY(dev, entry->index, + mailbox); + if (err) + goto out_mailbox; + } + } + } + + /* add the new qpn to list of promisc qps */ + list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]); + /* now need to add all the promisc qps to default entry */ + memset(mgm, 0, sizeof(*mgm)); + members_count = 0; + list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) { + if (members_count == dev->caps.num_qp_per_mgm) { + /* entry is full */ + err = -ENOMEM; + goto out_list; + } + mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK); + } + mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30); + + err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox); + if (err) + goto out_list; + + mlx4_free_cmd_mailbox(dev, mailbox); + mutex_unlock(&priv->mcg_table.mutex); + return 0; + +out_list: + list_del(&pqp->list); +out_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); +out_alloc: + kfree(pqp); +out_mutex: + mutex_unlock(&priv->mcg_table.mutex); + return err; +} + +static int remove_promisc_qp(struct mlx4_dev *dev, u8 port, + enum mlx4_steer_type steer, u32 qpn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_steer *s_steer; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + struct mlx4_steer_index *entry, *tmp_entry; + struct mlx4_promisc_qp *pqp; + struct mlx4_promisc_qp *dqp; + u32 members_count; + bool found; + bool back_to_list = false; + int i; + int err; + + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + + s_steer = &mlx4_priv(dev)->steer[port - 1]; + mutex_lock(&priv->mcg_table.mutex); + + pqp = get_promisc_qp(dev, port, steer, qpn); + if (unlikely(!pqp)) { + mlx4_warn(dev, "QP %x is not promiscuous QP\n", qpn); + /* nothing to do */ + err = 0; + goto out_mutex; + } + + /*remove from list of promisc qps */ + list_del(&pqp->list); + + /* set the default entry not to include the removed one */ + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = -ENOMEM; + back_to_list = true; + goto out_list; + } + mgm = mailbox->buf; + members_count = 0; + list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) + mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK); + mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30); + + err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox); + if (err) + goto out_mailbox; + + if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) { + /* Remove the QP from all the steering entries */ + list_for_each_entry_safe(entry, tmp_entry, + &s_steer->steer_entries[steer], + list) { + found = false; + list_for_each_entry(dqp, &entry->duplicates, list) { + if (dqp->qpn == qpn) { + found = true; + break; + } + } + if (found) { + /* A duplicate, no need to change the MGM, + * only update the duplicates list + */ + list_del(&dqp->list); + kfree(dqp); + } else { + int loc = -1; + + err = mlx4_READ_ENTRY(dev, + entry->index, + mailbox); + if (err) + goto out_mailbox; + members_count = + be32_to_cpu(mgm->members_count) & + 0xffffff; + if (!members_count) { + mlx4_warn(dev, "QP 
%06x wasn't found in entry %x mcount=0. deleting entry...\n", + qpn, entry->index); + list_del(&entry->list); + kfree(entry); + continue; + } + + for (i = 0; i < members_count; ++i) + if ((be32_to_cpu(mgm->qp[i]) & + MGM_QPN_MASK) == qpn) { + loc = i; + break; + } + + if (loc < 0) { + mlx4_err(dev, "QP %06x wasn't found in entry %d\n", + qpn, entry->index); + err = -EINVAL; + goto out_mailbox; + } + + /* Copy the last QP in this MGM + * over removed QP + */ + mgm->qp[loc] = mgm->qp[members_count - 1]; + mgm->qp[members_count - 1] = 0; + mgm->members_count = + cpu_to_be32(--members_count | + (MLX4_PROT_ETH << 30)); + + err = mlx4_WRITE_ENTRY(dev, + entry->index, + mailbox); + if (err) + goto out_mailbox; + } + } + } + +out_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); +out_list: + if (back_to_list) + list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]); + else + kfree(pqp); +out_mutex: + mutex_unlock(&priv->mcg_table.mutex); + return err; +} + +/* + * Caller must hold MCG table semaphore. gid and mgm parameters must + * be properly aligned for command interface. + * + * Returns 0 unless a firmware command error occurs. + * + * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1 + * and *mgm holds MGM entry. + * + * if GID is found in AMGM, *index = index in AMGM, *prev = index of + * previous entry in hash chain and *mgm holds AMGM entry. + * + * If no AMGM exists for given gid, *index = -1, *prev = index of last + * entry in hash chain and *mgm holds end of hash chain. + */ +static int find_entry(struct mlx4_dev *dev, u8 port, + u8 *gid, enum mlx4_protocol prot, + struct mlx4_cmd_mailbox *mgm_mailbox, + int *prev, int *index) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm = mgm_mailbox->buf; + u8 *mgid; + int err; + u16 hash; + u8 op_mod = (prot == MLX4_PROT_ETH) ? + !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) : 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + mgid = mailbox->buf; + + memcpy(mgid, gid, 16); + + err = mlx4_GID_HASH(dev, mailbox, &hash, op_mod); + mlx4_free_cmd_mailbox(dev, mailbox); + if (err) + return err; + + if (0) + mlx4_dbg(dev, "Hash for %pI6 is %04x\n", gid, hash); + + *index = hash; + *prev = -1; + + do { + err = mlx4_READ_ENTRY(dev, *index, mgm_mailbox); + if (err) + return err; + + if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) { + if (*index != hash) { + mlx4_err(dev, "Found zero MGID in AMGM\n"); + err = -EINVAL; + } + return err; + } + + if (!memcmp(mgm->gid, gid, 16) && + be32_to_cpu(mgm->members_count) >> 30 == prot) + return err; + + *prev = *index; + *index = be32_to_cpu(mgm->next_gid_index) >> 6; + } while (*index); + + *index = -1; + return err; +} + +static const u8 __promisc_mode[] = { + [MLX4_FS_REGULAR] = 0x0, + [MLX4_FS_ALL_DEFAULT] = 0x1, + [MLX4_FS_MC_DEFAULT] = 0x3, + [MLX4_FS_MIRROR_RX_PORT] = 0x4, + [MLX4_FS_MIRROR_SX_PORT] = 0x5, + [MLX4_FS_UC_SNIFFER] = 0x6, + [MLX4_FS_MC_SNIFFER] = 0x7, +}; + +int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev, + enum mlx4_net_trans_promisc_mode flow_type) +{ + if (flow_type >= MLX4_FS_MODE_NUM) { + mlx4_err(dev, "Invalid flow type. type = %d\n", flow_type); + return -EINVAL; + } + return __promisc_mode[flow_type]; +} +EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_mode); + +static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl, + struct mlx4_net_trans_rule_hw_ctrl *hw) +{ + u8 flags = 0; + + flags = ctrl->queue_mode == MLX4_NET_TRANS_Q_LIFO ? 1 : 0; + flags |= ctrl->exclusive ? 
(1 << 2) : 0; + flags |= ctrl->allow_loopback ? (1 << 3) : 0; + + hw->flags = flags; + hw->type = __promisc_mode[ctrl->promisc_mode]; + hw->prio = cpu_to_be16(ctrl->priority); + hw->port = ctrl->port; + hw->qpn = cpu_to_be32(ctrl->qpn); +} + +const u16 __sw_id_hw[] = { + [MLX4_NET_TRANS_RULE_ID_ETH] = 0xE001, + [MLX4_NET_TRANS_RULE_ID_IB] = 0xE005, + [MLX4_NET_TRANS_RULE_ID_IPV6] = 0xE003, + [MLX4_NET_TRANS_RULE_ID_IPV4] = 0xE002, + [MLX4_NET_TRANS_RULE_ID_TCP] = 0xE004, + [MLX4_NET_TRANS_RULE_ID_UDP] = 0xE006, + [MLX4_NET_TRANS_RULE_ID_VXLAN] = 0xE008 +}; + +int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev, + enum mlx4_net_trans_rule_id id) +{ + if (id >= MLX4_NET_TRANS_RULE_NUM) { + mlx4_err(dev, "Invalid network rule id. id = %d\n", id); + return -EINVAL; + } + return __sw_id_hw[id]; +} +EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_id); + +static const int __rule_hw_sz[] = { + [MLX4_NET_TRANS_RULE_ID_ETH] = + sizeof(struct mlx4_net_trans_rule_hw_eth), + [MLX4_NET_TRANS_RULE_ID_IB] = + sizeof(struct mlx4_net_trans_rule_hw_ib), + [MLX4_NET_TRANS_RULE_ID_IPV6] = 0, + [MLX4_NET_TRANS_RULE_ID_IPV4] = + sizeof(struct mlx4_net_trans_rule_hw_ipv4), + [MLX4_NET_TRANS_RULE_ID_TCP] = + sizeof(struct mlx4_net_trans_rule_hw_tcp_udp), + [MLX4_NET_TRANS_RULE_ID_UDP] = + sizeof(struct mlx4_net_trans_rule_hw_tcp_udp), + [MLX4_NET_TRANS_RULE_ID_VXLAN] = + sizeof(struct mlx4_net_trans_rule_hw_vxlan) +}; + +int mlx4_hw_rule_sz(struct mlx4_dev *dev, + enum mlx4_net_trans_rule_id id) +{ + if (id >= MLX4_NET_TRANS_RULE_NUM) { + mlx4_err(dev, "Invalid network rule id. id = %d\n", id); + return -EINVAL; + } + + return __rule_hw_sz[id]; +} +EXPORT_SYMBOL_GPL(mlx4_hw_rule_sz); + +static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec, + struct _rule_hw *rule_hw) +{ + if (mlx4_hw_rule_sz(dev, spec->id) < 0) + return -EINVAL; + memset(rule_hw, 0, mlx4_hw_rule_sz(dev, spec->id)); + rule_hw->id = cpu_to_be16(__sw_id_hw[spec->id]); + rule_hw->size = mlx4_hw_rule_sz(dev, spec->id) >> 2; + + switch (spec->id) { + case MLX4_NET_TRANS_RULE_ID_ETH: + memcpy(rule_hw->eth.dst_mac, spec->eth.dst_mac, ETH_ALEN); + memcpy(rule_hw->eth.dst_mac_msk, spec->eth.dst_mac_msk, + ETH_ALEN); + memcpy(rule_hw->eth.src_mac, spec->eth.src_mac, ETH_ALEN); + memcpy(rule_hw->eth.src_mac_msk, spec->eth.src_mac_msk, + ETH_ALEN); + if (spec->eth.ether_type_enable) { + rule_hw->eth.ether_type_enable = 1; + rule_hw->eth.ether_type = spec->eth.ether_type; + } + rule_hw->eth.vlan_tag = spec->eth.vlan_id; + rule_hw->eth.vlan_tag_msk = spec->eth.vlan_id_msk; + break; + + case MLX4_NET_TRANS_RULE_ID_IB: + rule_hw->ib.l3_qpn = spec->ib.l3_qpn; + rule_hw->ib.qpn_mask = spec->ib.qpn_msk; + memcpy(&rule_hw->ib.dst_gid, &spec->ib.dst_gid, 16); + memcpy(&rule_hw->ib.dst_gid_msk, &spec->ib.dst_gid_msk, 16); + break; + + case MLX4_NET_TRANS_RULE_ID_IPV6: + return -EOPNOTSUPP; + + case MLX4_NET_TRANS_RULE_ID_IPV4: + rule_hw->ipv4.src_ip = spec->ipv4.src_ip; + rule_hw->ipv4.src_ip_msk = spec->ipv4.src_ip_msk; + rule_hw->ipv4.dst_ip = spec->ipv4.dst_ip; + rule_hw->ipv4.dst_ip_msk = spec->ipv4.dst_ip_msk; + break; + + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + rule_hw->tcp_udp.dst_port = spec->tcp_udp.dst_port; + rule_hw->tcp_udp.dst_port_msk = spec->tcp_udp.dst_port_msk; + rule_hw->tcp_udp.src_port = spec->tcp_udp.src_port; + rule_hw->tcp_udp.src_port_msk = spec->tcp_udp.src_port_msk; + break; + + case MLX4_NET_TRANS_RULE_ID_VXLAN: + rule_hw->vxlan.vni = + cpu_to_be32(be32_to_cpu(spec->vxlan.vni) << 8); + 
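+ /* The VNI is a 24-bit field carried in the upper bytes of the
+ * dword, hence the << 8 applied to the value above and to the
+ * mask below.
+ */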
rule_hw->vxlan.vni_mask = + cpu_to_be32(be32_to_cpu(spec->vxlan.vni_mask) << 8); + break; + + default: + return -EINVAL; + } + + return __rule_hw_sz[spec->id]; +} + +static void mlx4_err_rule(struct mlx4_dev *dev, char *str, + struct mlx4_net_trans_rule *rule) +{ +#define BUF_SIZE 256 + struct mlx4_spec_list *cur; + char buf[BUF_SIZE]; + int len = 0; + + mlx4_err(dev, "%s", str); + len += snprintf(buf + len, BUF_SIZE - len, + "port = %d prio = 0x%x qp = 0x%x ", + rule->port, rule->priority, rule->qpn); + + list_for_each_entry(cur, &rule->list, list) { + switch (cur->id) { + case MLX4_NET_TRANS_RULE_ID_ETH: + len += snprintf(buf + len, BUF_SIZE - len, + "dmac = %pM ", &cur->eth.dst_mac); + if (cur->eth.ether_type) + len += snprintf(buf + len, BUF_SIZE - len, + "ethertype = 0x%x ", + be16_to_cpu(cur->eth.ether_type)); + if (cur->eth.vlan_id) + len += snprintf(buf + len, BUF_SIZE - len, + "vlan-id = %d ", + be16_to_cpu(cur->eth.vlan_id)); + break; + + case MLX4_NET_TRANS_RULE_ID_IPV4: + if (cur->ipv4.src_ip) + len += snprintf(buf + len, BUF_SIZE - len, + "src-ip = %pI4 ", + &cur->ipv4.src_ip); + if (cur->ipv4.dst_ip) + len += snprintf(buf + len, BUF_SIZE - len, + "dst-ip = %pI4 ", + &cur->ipv4.dst_ip); + break; + + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + if (cur->tcp_udp.src_port) + len += snprintf(buf + len, BUF_SIZE - len, + "src-port = %d ", + be16_to_cpu(cur->tcp_udp.src_port)); + if (cur->tcp_udp.dst_port) + len += snprintf(buf + len, BUF_SIZE - len, + "dst-port = %d ", + be16_to_cpu(cur->tcp_udp.dst_port)); + break; + + case MLX4_NET_TRANS_RULE_ID_IB: + len += snprintf(buf + len, BUF_SIZE - len, + "dst-gid = %pI6\n", cur->ib.dst_gid); + len += snprintf(buf + len, BUF_SIZE - len, + "dst-gid-mask = %pI6\n", + cur->ib.dst_gid_msk); + break; + + case MLX4_NET_TRANS_RULE_ID_VXLAN: + len += snprintf(buf + len, BUF_SIZE - len, + "VNID = %d ", be32_to_cpu(cur->vxlan.vni)); + break; + case MLX4_NET_TRANS_RULE_ID_IPV6: + break; + + default: + break; + } + } + len += snprintf(buf + len, BUF_SIZE - len, "\n"); + mlx4_err(dev, "%s", buf); + + if (len >= BUF_SIZE) + mlx4_err(dev, "Network rule error message was truncated, print buffer is too small\n"); +} + +int mlx4_flow_attach(struct mlx4_dev *dev, + struct mlx4_net_trans_rule *rule, u64 *reg_id) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_spec_list *cur; + u32 size = 0; + int ret; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (!mlx4_qp_lookup(dev, rule->qpn)) { + mlx4_err_rule(dev, "QP doesn't exist\n", rule); + ret = -EINVAL; + goto out; + } + + trans_rule_ctrl_to_hw(rule, mailbox->buf); + + size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); + + list_for_each_entry(cur, &rule->list, list) { + ret = parse_trans_rule(dev, cur, mailbox->buf + size); + if (ret < 0) + goto out; + + size += ret; + } + + ret = mlx4_QP_FLOW_STEERING_ATTACH(dev, mailbox, size >> 2, reg_id); + if (ret == -ENOMEM) { + mlx4_err_rule(dev, + "mcg table is full. 
Fail to register network rule\n", + rule); + } else if (ret) { + if (ret == -ENXIO) { + if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_err_rule(dev, + "DMFS is not enabled, " + "failed to register network rule.\n", + rule); + else + mlx4_err_rule(dev, + "Rule exceeds the dmfs_high_rate_mode limitations, " + "failed to register network rule.\n", + rule); + + } else { + mlx4_err_rule(dev, "Fail to register network rule.\n", rule); + } + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_flow_attach); + +int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id) +{ + int err; + + err = mlx4_QP_FLOW_STEERING_DETACH(dev, reg_id); + if (err) + mlx4_err(dev, "Fail to detach network rule. registration id = 0x%llx\n", + reg_id); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_flow_detach); + +int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr, + int port, int qpn, u16 prio, u64 *reg_id) +{ + int err; + struct mlx4_spec_list spec_eth_outer = { {NULL} }; + struct mlx4_spec_list spec_vxlan = { {NULL} }; + struct mlx4_spec_list spec_eth_inner = { {NULL} }; + + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .allow_loopback = 1, + .promisc_mode = MLX4_FS_REGULAR, + }; + + __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); + + rule.port = port; + rule.qpn = qpn; + rule.priority = prio; + INIT_LIST_HEAD(&rule.list); + + spec_eth_outer.id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec_eth_outer.eth.dst_mac, addr, ETH_ALEN); + memcpy(spec_eth_outer.eth.dst_mac_msk, &mac_mask, ETH_ALEN); + + spec_vxlan.id = MLX4_NET_TRANS_RULE_ID_VXLAN; /* any vxlan header */ + spec_eth_inner.id = MLX4_NET_TRANS_RULE_ID_ETH; /* any inner eth header */ + + list_add_tail(&spec_eth_outer.list, &rule.list); + list_add_tail(&spec_vxlan.list, &rule.list); + list_add_tail(&spec_eth_inner.list, &rule.list); + + err = mlx4_flow_attach(dev, &rule, reg_id); + return err; +} +EXPORT_SYMBOL(mlx4_tunnel_steer_add); + +int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, + u32 max_range_qpn) +{ + int err; + u64 in_param; + + in_param = ((u64) min_range_qpn) << 32; + in_param |= ((u64) max_range_qpn) & 0xFFFFFFFF; + + err = mlx4_cmd(dev, in_param, 0, 0, + MLX4_FLOW_STEERING_IB_UC_QP_RANGE, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_FLOW_STEERING_IB_UC_QP_RANGE); + +int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot, + enum mlx4_steer_type steer) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 members_count; + int index = -1, prev; + int link = 0; + int i; + int err; + u8 port = gid[5]; + u8 new_entry = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mgm = mailbox->buf; + + mutex_lock(&priv->mcg_table.mutex); + err = find_entry(dev, port, gid, prot, + mailbox, &prev, &index); + if (err) + goto out; + + if (index != -1) { + if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) { + new_entry = 1; + memcpy(mgm->gid, gid, 16); + } + } else { + link = 1; + + index = mlx4_bitmap_alloc(&priv->mcg_table.bitmap); + if (index == -1) { + mlx4_err(dev, "No AMGM entries left\n"); + err = -ENOMEM; + goto out; + } + index += dev->caps.num_mgms; + + new_entry = 1; + memset(mgm, 0, sizeof(*mgm)); + memcpy(mgm->gid, gid, 16); + } + + members_count = be32_to_cpu(mgm->members_count) & 
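/*
 * Illustrative sketch only, not from the original sources: the calling
 * pattern mlx4_flow_attach() above expects, mirroring mlx4_tunnel_steer_add().
 * The function name example_attach_dmac_rule() and its arguments are invented
 * for the example; dev, port, qpn and dmac are assumed to come from the caller.
 *
 *	static int example_attach_dmac_rule(struct mlx4_dev *dev, int port,
 *					    int qpn, const u8 *dmac, u64 *reg_id)
 *	{
 *		struct mlx4_spec_list spec_eth = { {NULL} };
 *		struct mlx4_net_trans_rule rule = {
 *			.queue_mode	= MLX4_NET_TRANS_Q_FIFO,
 *			.exclusive	= 0,
 *			.allow_loopback	= 1,
 *			.promisc_mode	= MLX4_FS_REGULAR,
 *			.priority	= MLX4_DOMAIN_NIC,
 *		};
 *		__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
 *
 *		rule.port = port;
 *		rule.qpn = qpn;
 *		INIT_LIST_HEAD(&rule.list);
 *
 *		// one L2 spec: steer frames whose destination MAC matches dmac
 *		spec_eth.id = MLX4_NET_TRANS_RULE_ID_ETH;
 *		memcpy(spec_eth.eth.dst_mac, dmac, ETH_ALEN);
 *		memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
 *		list_add_tail(&spec_eth.list, &rule.list);
 *
 *		// the returned reg_id must be kept by the caller and later
 *		// passed to mlx4_flow_detach()
 *		return mlx4_flow_attach(dev, &rule, reg_id);
 *	}
 */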
0xffffff; + if (members_count == dev->caps.num_qp_per_mgm) { + mlx4_err(dev, "MGM at index %x is full\n", index); + err = -ENOMEM; + goto out; + } + + for (i = 0; i < members_count; ++i) + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) { + mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn); + err = 0; + goto out; + } + + if (block_mcast_loopback) + mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) | + (1U << MGM_BLCK_LB_BIT)); + else + mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK); + + mgm->members_count = cpu_to_be32(members_count | (u32) prot << 30); + + err = mlx4_WRITE_ENTRY(dev, index, mailbox); + if (err) + goto out; + + if (!link) + goto out; + + err = mlx4_READ_ENTRY(dev, prev, mailbox); + if (err) + goto out; + + mgm->next_gid_index = cpu_to_be32(index << 6); + + err = mlx4_WRITE_ENTRY(dev, prev, mailbox); + if (err) + goto out; + +out: + if (prot == MLX4_PROT_ETH && index != -1) { + /* manage the steering entry for promisc mode */ + if (new_entry) + err = new_steering_entry(dev, port, steer, + index, qp->qpn); + else + err = existing_steering_entry(dev, port, steer, + index, qp->qpn); + } + if (err && link && index != -1) { + if (index < dev->caps.num_mgms) + mlx4_warn(dev, "Got AMGM index %d < %d\n", + index, dev->caps.num_mgms); + else + mlx4_bitmap_free(&priv->mcg_table.bitmap, + index - dev->caps.num_mgms, MLX4_USE_RR); + } + mutex_unlock(&priv->mcg_table.mutex); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot, enum mlx4_steer_type steer) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 members_count; + int prev, index; + int i, loc = -1; + int err; + u8 port = gid[5]; + bool removed_entry = false; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + mgm = mailbox->buf; + + mutex_lock(&priv->mcg_table.mutex); + + err = find_entry(dev, port, gid, prot, + mailbox, &prev, &index); + if (err) + goto out; + + if (index == -1) { + mlx4_err(dev, "MGID %pI6 not found\n", gid); + err = -EINVAL; + goto out; + } + + /* If this QP is also a promisc QP, it shouldn't be removed only if + * at least one none promisc QP is also attached to this MCG + */ + if (prot == MLX4_PROT_ETH && + check_duplicate_entry(dev, port, steer, index, qp->qpn) && + !promisc_steering_entry(dev, port, steer, index, qp->qpn, NULL)) + goto out; + + members_count = be32_to_cpu(mgm->members_count) & 0xffffff; + for (i = 0; i < members_count; ++i) + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) { + loc = i; + break; + } + + if (loc == -1) { + mlx4_err(dev, "QP %06x not found in MGM\n", qp->qpn); + err = -EINVAL; + goto out; + } + + /* copy the last QP in this MGM over removed QP */ + mgm->qp[loc] = mgm->qp[members_count - 1]; + mgm->qp[members_count - 1] = 0; + mgm->members_count = cpu_to_be32(--members_count | (u32) prot << 30); + + if (prot == MLX4_PROT_ETH) + removed_entry = can_remove_steering_entry(dev, port, steer, + index, qp->qpn); + if (members_count && (prot != MLX4_PROT_ETH || !removed_entry)) { + err = mlx4_WRITE_ENTRY(dev, index, mailbox); + goto out; + } + + /* We are going to delete the entry, members count should be 0 */ + mgm->members_count = cpu_to_be32((u32) prot << 30); + + if (prev == -1) { + /* Remove entry from MGM */ + int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6; + if (amgm_index) { + err = 
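/*
 * Worked example, not in the original source, of the MGM word packing used by
 * mlx4_qp_attach_common() above: each mgm->qp[] word carries the QPN in its
 * low 24 bits plus an optional block-loopback flag, and mgm->members_count
 * keeps the member count in its low 24 bits with the protocol in the top bits.
 *
 *	qpn = 0x123456 with block_mcast_loopback set:
 *		(0x123456 & MGM_QPN_MASK) | (1U << MGM_BLCK_LB_BIT)
 *			= 0x00123456 | 0x40000000 = 0x40123456
 *	members_count = 3, prot = MLX4_PROT_ETH (assumed here to have value 1):
 *		3 | ((u32)prot << 30) = 0x40000003
 *
 *	Both words are stored big-endian via cpu_to_be32() before
 *	mlx4_WRITE_ENTRY() pushes the mailbox to the device.
 */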
mlx4_READ_ENTRY(dev, amgm_index, mailbox); + if (err) + goto out; + } else + memset(mgm->gid, 0, 16); + + err = mlx4_WRITE_ENTRY(dev, index, mailbox); + if (err) + goto out; + + if (amgm_index) { + if (amgm_index < dev->caps.num_mgms) + mlx4_warn(dev, "MGM entry %d had AMGM index %d < %d\n", + index, amgm_index, dev->caps.num_mgms); + else + mlx4_bitmap_free(&priv->mcg_table.bitmap, + amgm_index - dev->caps.num_mgms, MLX4_USE_RR); + } + } else { + /* Remove entry from AMGM */ + int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6; + err = mlx4_READ_ENTRY(dev, prev, mailbox); + if (err) + goto out; + + mgm->next_gid_index = cpu_to_be32(cur_next_index << 6); + + err = mlx4_WRITE_ENTRY(dev, prev, mailbox); + if (err) + goto out; + + if (index < dev->caps.num_mgms) + mlx4_warn(dev, "entry %d had next AMGM index %d < %d\n", + prev, index, dev->caps.num_mgms); + else + mlx4_bitmap_free(&priv->mcg_table.bitmap, + index - dev->caps.num_mgms, MLX4_USE_RR); + } + +out: + mutex_unlock(&priv->mcg_table.mutex); + + mlx4_free_cmd_mailbox(dev, mailbox); + if (err && dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + /* In case device is under an error, return success as a closing command */ + err = 0; + return err; +} + +static int mlx4_QP_ATTACH(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], u8 attach, u8 block_loopback, + enum mlx4_protocol prot) +{ + struct mlx4_cmd_mailbox *mailbox; + int err = 0; + int qpn; + + if (!mlx4_is_mfunc(dev)) + return -EBADF; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, gid, 16); + qpn = qp->qpn; + qpn |= (prot << 28); + if (attach && block_loopback) + qpn |= (1 << 31); + + err = mlx4_cmd(dev, mailbox->dma, qpn, attach, + MLX4_CMD_QP_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + if (err && !attach && + dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + err = 0; + return err; +} + +int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], u8 port, + int block_mcast_loopback, + enum mlx4_protocol prot, u64 *reg_id) +{ + struct mlx4_spec_list spec = { {NULL} }; + __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16); + + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .promisc_mode = MLX4_FS_REGULAR, + .priority = MLX4_DOMAIN_NIC, + }; + + rule.allow_loopback = !block_mcast_loopback; + rule.port = port; + rule.qpn = qp->qpn; + INIT_LIST_HEAD(&rule.list); + + switch (prot) { + case MLX4_PROT_ETH: + spec.id = MLX4_NET_TRANS_RULE_ID_ETH; + memcpy(spec.eth.dst_mac, &gid[10], ETH_ALEN); + memcpy(spec.eth.dst_mac_msk, &mac_mask, ETH_ALEN); + break; + + case MLX4_PROT_IB_IPV6: + spec.id = MLX4_NET_TRANS_RULE_ID_IB; + memcpy(spec.ib.dst_gid, gid, 16); + memset(&spec.ib.dst_gid_msk, 0xff, 16); + break; + default: + return -EINVAL; + } + list_add_tail(&spec.list, &rule.list); + + return mlx4_flow_attach(dev, &rule, reg_id); +} + +int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + u8 port, int block_mcast_loopback, + enum mlx4_protocol prot, u64 *reg_id) +{ + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_A0: + if (prot == MLX4_PROT_ETH) + return 0; + + case MLX4_STEERING_MODE_B0: + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_MC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 1, + block_mcast_loopback, prot); + return mlx4_qp_attach_common(dev, qp, gid, + block_mcast_loopback, prot, + 
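/*
 * Illustrative sketch, not from the original sources: the attach/detach
 * pairing expected from callers of mlx4_multicast_attach(). The reg_id
 * produced in device-managed (DMFS) steering mode must be kept by the caller
 * and handed back on detach; dev, qp, mgid and port are assumed to exist in
 * the caller.
 *
 *	u64 reg_id = 0;
 *	int err;
 *
 *	err = mlx4_multicast_attach(dev, &qp, mgid, port,
 *				    0,	// do not block multicast loopback
 *				    MLX4_PROT_ETH, &reg_id);
 *	if (err)
 *		return err;
 *	// ... traffic for mgid is now delivered to the QP ...
 *	err = mlx4_multicast_detach(dev, &qp, mgid, MLX4_PROT_ETH, reg_id);
 */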
MLX4_MC_STEER); + + case MLX4_STEERING_MODE_DEVICE_MANAGED: + return mlx4_trans_to_dmfs_attach(dev, qp, gid, port, + block_mcast_loopback, + prot, reg_id); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_GPL(mlx4_multicast_attach); + +int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot, u64 reg_id) +{ + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_A0: + if (prot == MLX4_PROT_ETH) + return 0; + + case MLX4_STEERING_MODE_B0: + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_MC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot); + + return mlx4_qp_detach_common(dev, qp, gid, prot, + MLX4_MC_STEER); + + case MLX4_STEERING_MODE_DEVICE_MANAGED: + return mlx4_flow_detach(dev, reg_id); + + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_GPL(mlx4_multicast_detach); + +int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, + u32 qpn, enum mlx4_net_trans_promisc_mode mode) +{ + struct mlx4_net_trans_rule rule = { + .queue_mode = MLX4_NET_TRANS_Q_FIFO, + .exclusive = 0, + .allow_loopback = 1, + }; + + u64 *regid_p; + + switch (mode) { + case MLX4_FS_ALL_DEFAULT: + regid_p = &dev->regid_promisc_array[port]; + break; + case MLX4_FS_MC_DEFAULT: + regid_p = &dev->regid_allmulti_array[port]; + break; + default: + return -1; + } + + if (*regid_p != 0) + return -1; + + rule.promisc_mode = mode; + rule.port = port; + rule.qpn = qpn; + INIT_LIST_HEAD(&rule.list); + mlx4_err(dev, "going promisc on %x\n", port); + + return mlx4_flow_attach(dev, &rule, regid_p); +} +EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_add); + +int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port, + enum mlx4_net_trans_promisc_mode mode) +{ + int ret; + u64 *regid_p; + + switch (mode) { + case MLX4_FS_ALL_DEFAULT: + regid_p = &dev->regid_promisc_array[port]; + break; + case MLX4_FS_MC_DEFAULT: + regid_p = &dev->regid_allmulti_array[port]; + break; + default: + return -1; + } + + if (*regid_p == 0) + return -1; + + ret = mlx4_flow_detach(dev, *regid_p); + if (ret == 0) + *regid_p = 0; + + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_remove); + +int mlx4_unicast_attach(struct mlx4_dev *dev, + struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot) +{ + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_UC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 1, + block_mcast_loopback, prot); + + return mlx4_qp_attach_common(dev, qp, gid, block_mcast_loopback, + prot, MLX4_UC_STEER); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_attach); + +int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], enum mlx4_protocol prot) +{ + if (prot == MLX4_PROT_ETH) + gid[7] |= (MLX4_UC_STEER << 1); + + if (mlx4_is_mfunc(dev)) + return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot); + + return mlx4_qp_detach_common(dev, qp, gid, prot, MLX4_UC_STEER); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_detach); + +int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u32 qpn = (u32) vhcr->in_param & 0xffffffff; + int port = mlx4_slave_convert_port(dev, slave, vhcr->in_param >> 62); + enum mlx4_steer_type steer = vhcr->in_modifier; + + if (port < 0) + return -EINVAL; + + /* Promiscuous unicast is not allowed in mfunc */ + if (mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER) + return 0; + + if (vhcr->op_modifier) + return add_promisc_qp(dev, port, steer, 
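/*
 * Illustrative sketch, not from the original sources: how an Ethernet driver
 * is expected to enter and leave promiscuous mode through the helpers above
 * when the device uses MLX4_STEERING_MODE_DEVICE_MANAGED; dev, port and
 * base_qpn are assumed to come from the caller.
 *
 *	// catch all traffic on this port and deliver it to base_qpn
 *	err = mlx4_flow_steer_promisc_add(dev, port, base_qpn,
 *					  MLX4_FS_ALL_DEFAULT);
 *	...
 *	// detach the registration id stored in dev->regid_promisc_array[port]
 *	err = mlx4_flow_steer_promisc_remove(dev, port, MLX4_FS_ALL_DEFAULT);
 */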
qpn); + else + return remove_promisc_qp(dev, port, steer, qpn); +} + +static int mlx4_PROMISC(struct mlx4_dev *dev, u32 qpn, + enum mlx4_steer_type steer, u8 add, u8 port) +{ + return mlx4_cmd(dev, (u64) qpn | (u64) port << 62, (u32) steer, add, + MLX4_CMD_PROMISC, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); +} + +int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 1, port); + + return add_promisc_qp(dev, port, MLX4_MC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_add); + +int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 0, port); + + return remove_promisc_qp(dev, port, MLX4_MC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_remove); + +int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 1, port); + + return add_promisc_qp(dev, port, MLX4_UC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_add); + +int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 0, port); + + return remove_promisc_qp(dev, port, MLX4_UC_STEER, qpn); +} +EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_remove); + +int mlx4_init_mcg_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + /* No need for mcg_table when fw managed the mcg table*/ + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) + return 0; + err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms, + dev->caps.num_amgms - 1, 0, 0); + if (err) + return err; + + mutex_init(&priv->mcg_table.mutex); + + return 0; +} + +void mlx4_cleanup_mcg_table(struct mlx4_dev *dev) +{ + if (dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED) + mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h new file mode 100644 index 0000000..c68da19 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -0,0 +1,1482 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_H +#define MLX4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "fw_qos.h" + +#define DRV_NAME "mlx4_core" +#define PFX DRV_NAME ": " +#define DRV_VERSION "4.0-0" + +#define MLX4_FS_UDP_UC_EN (1 << 1) +#define MLX4_FS_TCP_UC_EN (1 << 2) +#define MLX4_FS_NUM_OF_L2_ADDR 8 +#define MLX4_FS_MGM_LOG_ENTRY_SIZE 7 +#define MLX4_FS_NUM_MCG (1 << 17) + +#define INIT_HCA_TPT_MW_ENABLE (1 << 7) + +#define MLX4_QUERY_IF_STAT_RESET BIT(31) + +enum { + MLX4_HCR_BASE = 0x80680, + MLX4_HCR_SIZE = 0x0001c, + MLX4_CLR_INT_SIZE = 0x00008, + MLX4_SLAVE_COMM_BASE = 0x0, + MLX4_COMM_PAGESIZE = 0x1000, + MLX4_CLOCK_SIZE = 0x00008, + MLX4_COMM_CHAN_CAPS = 0x8, + MLX4_COMM_CHAN_FLAGS = 0xc +}; + +enum { + MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE = 10, + MLX4_MIN_MGM_LOG_ENTRY_SIZE = 7, + MLX4_MAX_MGM_LOG_ENTRY_SIZE = 12, + MLX4_MAX_QP_PER_MGM = 4 * ((1 << MLX4_MAX_MGM_LOG_ENTRY_SIZE) / 16 - 2), + MLX4_MTT_ENTRY_PER_SEG = 8, +}; + +enum { + MLX4_NUM_PDS = 1 << 15 +}; + +enum { + MLX4_CMPT_TYPE_QP = 0, + MLX4_CMPT_TYPE_SRQ = 1, + MLX4_CMPT_TYPE_CQ = 2, + MLX4_CMPT_TYPE_EQ = 3, + MLX4_CMPT_NUM_TYPE +}; + +enum { + MLX4_CMPT_SHIFT = 24, + MLX4_NUM_CMPTS = MLX4_CMPT_NUM_TYPE << MLX4_CMPT_SHIFT +}; + +enum mlx4_mpt_state { + MLX4_MPT_DISABLED = 0, + MLX4_MPT_EN_HW, + MLX4_MPT_EN_SW +}; + +#define MLX4_COMM_TIME 10000 +#define MLX4_COMM_OFFLINE_TIME_OUT 30000 +#define MLX4_COMM_CMD_NA_OP 0x0 + + +enum { + MLX4_COMM_CMD_RESET, + MLX4_COMM_CMD_VHCR0, + MLX4_COMM_CMD_VHCR1, + MLX4_COMM_CMD_VHCR2, + MLX4_COMM_CMD_VHCR_EN, + MLX4_COMM_CMD_VHCR_POST, + MLX4_COMM_CMD_FLR = 254 +}; + +enum { + MLX4_VF_SMI_DISABLED, + MLX4_VF_SMI_ENABLED +}; + +/*The flag indicates that the slave should delay the RESET cmd*/ +#define MLX4_DELAY_RESET_SLAVE 0xbbbbbbb +/*indicates how many retries will be done if we are in the middle of FLR*/ +#define NUM_OF_RESET_RETRIES 10 +#define SLEEP_TIME_IN_RESET (2 * 1000) +enum mlx4_resource { + RES_QP, + RES_CQ, + RES_SRQ, + RES_XRCD, + RES_MPT, + RES_MTT, + RES_MAC, + RES_VLAN, + RES_NPORT_ID, + RES_COUNTER, + RES_FS_RULE, + RES_EQ, + MLX4_NUM_OF_RESOURCE_TYPE +}; + +enum mlx4_alloc_mode { + RES_OP_RESERVE, + RES_OP_RESERVE_AND_MAP, + RES_OP_MAP_ICM, +}; + +enum mlx4_res_tracker_free_type { + RES_TR_FREE_ALL, + RES_TR_FREE_SLAVES_ONLY, + RES_TR_FREE_STRUCTS_ONLY, +}; + +/* + *Virtual HCR structures. + * mlx4_vhcr is the sw representation, in machine endianness + * + * mlx4_vhcr_cmd is the formalized structure, the one that is passed + * to FW to go through communication channel. 
+ * It is big endian, and has the same structure as the physical HCR + * used by command interface + */ +struct mlx4_vhcr { + u64 in_param; + u64 out_param; + u32 in_modifier; + u32 errno; + u16 op; + u16 token; + u8 op_modifier; + u8 e_bit; +}; + +struct mlx4_vhcr_cmd { + __be64 in_param; + __be32 in_modifier; + u32 reserved1; + __be64 out_param; + __be16 token; + u16 reserved; + u8 status; + u8 flags; + __be16 opcode; +}; + +struct mlx4_cmd_info { + u16 opcode; + bool has_inbox; + bool has_outbox; + bool out_is_imm; + bool encode_slave_id; + int (*verify)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox); + int (*wrapper)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +}; + +#ifdef CONFIG_MLX4_DEBUG +extern int mlx4_debug_level; +#else /* CONFIG_MLX4_DEBUG */ +#define mlx4_debug_level (0) +#endif /* CONFIG_MLX4_DEBUG */ + +#define mlx4_dbg(mdev, format, ...) \ +do { \ + if (mlx4_debug_level) \ + dev_printk(KERN_DEBUG, \ + &(mdev)->persist->pdev->dev, format, \ + ##__VA_ARGS__); \ +} while (0) + +#define mlx4_err(mdev, format, ...) \ + dev_err(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) +#define mlx4_info(mdev, format, ...) \ + dev_info(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) +#define mlx4_warn(mdev, format, ...) \ + dev_warn(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) + +extern int log_mtts_per_seg; +extern int mlx4_internal_err_reset; + +#define MLX4_MAX_NUM_SLAVES (min(MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF, \ + MLX4_MFUNC_MAX)) +#define ALL_SLAVES 0xff + +struct mlx4_bitmap { + u32 last; + u32 top; + u32 max; + u32 reserved_top; + u32 mask; + u32 avail; + u32 effective_len; + spinlock_t lock; + unsigned long *table; +}; + +struct mlx4_buddy { + unsigned long **bits; + unsigned int *num_free; + u32 max_order; + spinlock_t lock; +}; + +struct mlx4_icm; + +struct mlx4_icm_table { + u64 virt; + int num_icm; + u32 num_obj; + int obj_size; + int lowmem; + int coherent; + struct mutex mutex; + struct mlx4_icm **icm; +}; + +#define MLX4_MPT_FLAG_SW_OWNS (0xfUL << 28) +#define MLX4_MPT_FLAG_FREE (0x3UL << 28) +#define MLX4_MPT_FLAG_MIO (1 << 17) +#define MLX4_MPT_FLAG_BIND_ENABLE (1 << 15) +#define MLX4_MPT_FLAG_PHYSICAL (1 << 9) +#define MLX4_MPT_FLAG_REGION (1 << 8) + +#define MLX4_MPT_PD_MASK (0x1FFFFUL) +#define MLX4_MPT_PD_VF_MASK (0xFE0000UL) +#define MLX4_MPT_PD_FLAG_FAST_REG (1 << 27) +#define MLX4_MPT_PD_FLAG_RAE (1 << 28) +#define MLX4_MPT_PD_FLAG_EN_INV (3 << 24) + +#define MLX4_MPT_QP_FLAG_BOUND_QP (1 << 7) + +#define MLX4_MPT_STATUS_SW 0xF0 +#define MLX4_MPT_STATUS_HW 0x00 + +#define MLX4_CQE_SIZE_MASK_STRIDE 0x3 +#define MLX4_EQE_SIZE_MASK_STRIDE 0x30 + +#define MLX4_EQ_ASYNC 0 +#define MLX4_EQ_TO_CQ_VECTOR(vector) ((vector) - \ + !!((int)(vector) >= MLX4_EQ_ASYNC)) +#define MLX4_CQ_TO_EQ_VECTOR(vector) ((vector) + \ + !!((int)(vector) >= MLX4_EQ_ASYNC)) + +/* + * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. + */ +struct mlx4_mpt_entry { + __be32 flags; + __be32 qpn; + __be32 key; + __be32 pd_flags; + __be64 start; + __be64 length; + __be32 lkey; + __be32 win_cnt; + u8 reserved1[3]; + u8 mtt_rep; + __be64 mtt_addr; + __be32 mtt_sz; + __be32 entity_size; + __be32 first_byte_offset; +} __packed; + +/* + * Must be packed because start is 64 bits but only aligned to 32 bits. 
+ */ +struct mlx4_eq_context { + __be32 flags; + u16 reserved1[3]; + __be16 page_offset; + u8 log_eq_size; + u8 reserved2[4]; + u8 eq_period; + u8 reserved3; + u8 eq_max_count; + u8 reserved4[3]; + u8 intr; + u8 log_page_size; + u8 reserved5[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + u32 reserved6[2]; + __be32 consumer_index; + __be32 producer_index; + u32 reserved7[4]; +}; + +struct mlx4_cq_context { + __be32 flags; + u16 reserved1[3]; + __be16 page_offset; + __be32 logsize_usrpage; + __be16 cq_period; + __be16 cq_max_count; + u8 reserved2[3]; + u8 comp_eqn; + u8 log_page_size; + u8 reserved3[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + __be32 last_notified_index; + __be32 solicit_producer_index; + __be32 consumer_index; + __be32 producer_index; + u32 reserved4[2]; + __be64 db_rec_addr; +}; + +struct mlx4_srq_context { + __be32 state_logsize_srqn; + u8 logstride; + u8 reserved1; + __be16 xrcd; + __be32 pg_offset_cqn; + u32 reserved2; + u8 log_page_size; + u8 reserved3[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + __be32 pd; + __be16 limit_watermark; + __be16 wqe_cnt; + u16 reserved4; + __be16 wqe_counter; + u32 reserved5; + __be64 db_rec_addr; +}; + +struct mlx4_eq_tasklet { + struct list_head list; + struct list_head process_list; + struct tasklet_struct task; + /* lock on completion tasklet list */ + spinlock_t lock; +}; + +struct mlx4_eq { + struct mlx4_dev *dev; + void __iomem *doorbell; + int eqn; + u32 cons_index; + u16 irq; + u16 have_irq; + int nent; + struct mlx4_buf_list *page_list; + struct mlx4_mtt mtt; + struct mlx4_eq_tasklet tasklet_ctx; + struct mlx4_active_ports actv_ports; + u32 ref_count; + cpumask_var_t affinity_mask; +}; + +struct mlx4_slave_eqe { + u8 type; + u8 port; + u32 param; +}; + +struct mlx4_slave_event_eq_info { + int eqn; + u16 token; +}; + +struct mlx4_profile { + int num_qp; + int rdmarc_per_qp; + int num_srq; + int num_cq; + int num_mcg; + int num_mpt; + unsigned num_mtt; +}; + +struct mlx4_fw { + u64 clr_int_base; + u64 catas_offset; + u64 comm_base; + u64 clock_offset; + struct mlx4_icm *fw_icm; + struct mlx4_icm *aux_icm; + u32 catas_size; + u16 fw_pages; + u8 clr_int_bar; + u8 catas_bar; + u8 comm_bar; + u8 clock_bar; +}; + +struct mlx4_comm { + u32 slave_write; + u32 slave_read; +}; + +enum { + MLX4_MCAST_CONFIG = 0, + MLX4_MCAST_DISABLE = 1, + MLX4_MCAST_ENABLE = 2, +}; + +#define VLAN_FLTR_SIZE 128 + +struct mlx4_vlan_fltr { + __be32 entry[VLAN_FLTR_SIZE]; +}; + +struct mlx4_mcast_entry { + struct list_head list; + u64 addr; +}; + +struct mlx4_promisc_qp { + struct list_head list; + u32 qpn; +}; + +struct mlx4_steer_index { + struct list_head list; + unsigned int index; + struct list_head duplicates; +}; + +#define MLX4_EVENT_TYPES_NUM 64 + +struct mlx4_slave_state { + u8 comm_toggle; + u8 last_cmd; + u8 init_port_mask; + bool active; + bool old_vlan_api; + bool vst_qinq_supported; + u8 function; + dma_addr_t vhcr_dma; + u16 user_mtu[MLX4_MAX_PORTS + 1]; + u16 mtu[MLX4_MAX_PORTS + 1]; + __be32 ib_cap_mask[MLX4_MAX_PORTS + 1]; + struct mlx4_slave_eqe eq[MLX4_MFUNC_MAX_EQES]; + struct list_head mcast_filters[MLX4_MAX_PORTS + 1]; + struct mlx4_vlan_fltr *vlan_filter[MLX4_MAX_PORTS + 1]; + /* event type to eq number lookup */ + struct mlx4_slave_event_eq_info event_eq[MLX4_EVENT_TYPES_NUM]; + u16 eq_pi; + u16 eq_ci; + spinlock_t lock; + /*initialized via the kzalloc*/ + u8 is_slave_going_down; + u32 cookie; + enum slave_port_state port_state[MLX4_MAX_PORTS + 1]; +}; + +#define MLX4_VGT 4095 +#define NO_INDX (-1) + +struct 
mlx4_vport_state { + u64 mac; + u16 default_vlan; + u8 default_qos; + __be16 vlan_proto; + u32 tx_rate; + bool spoofchk; + u32 link_state; + u8 qos_vport; + __be64 guid; +}; + +struct mlx4_vf_admin_state { + struct mlx4_vport_state vport[MLX4_MAX_PORTS + 1]; + u8 enable_smi[MLX4_MAX_PORTS + 1]; +}; + +struct mlx4_vport_oper_state { + struct mlx4_vport_state state; + int mac_idx; + int vlan_idx; +}; + +struct mlx4_vf_oper_state { + struct mlx4_vport_oper_state vport[MLX4_MAX_PORTS + 1]; + u8 smi_enabled[MLX4_MAX_PORTS + 1]; +}; + +struct slave_list { + struct mutex mutex; + struct list_head res_list[MLX4_NUM_OF_RESOURCE_TYPE]; +}; + +struct resource_allocator { + spinlock_t alloc_lock; /* protect quotas */ + union { + int res_reserved; + int res_port_rsvd[MLX4_MAX_PORTS]; + }; + union { + int res_free; + int res_port_free[MLX4_MAX_PORTS]; + }; + int *quota; + int *allocated; + int *guaranteed; +}; + +struct mlx4_resource_tracker { + spinlock_t lock; + /* tree for each resources */ + struct rb_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE]; + /* num_of_slave's lists, one per slave */ + struct slave_list *slave_list; + struct resource_allocator res_alloc[MLX4_NUM_OF_RESOURCE_TYPE]; +}; + +#define SLAVE_EVENT_EQ_SIZE 128 +struct mlx4_slave_event_eq { + u32 eqn; + u32 cons; + u32 prod; + spinlock_t event_lock; + struct mlx4_eqe event_eqe[SLAVE_EVENT_EQ_SIZE]; +}; + +struct mlx4_qos_manager { + int num_of_qos_vfs; + DECLARE_BITMAP(priority_bm, MLX4_NUM_UP); +}; + +struct mlx4_master_qp0_state { + int proxy_qp0_active; + int qp0_active; + int port_active; +}; + +struct mlx4_mfunc_master_ctx { + struct mlx4_slave_state *slave_state; + struct mlx4_vf_admin_state *vf_admin; + struct mlx4_vf_oper_state *vf_oper; + struct mlx4_master_qp0_state qp0_state[MLX4_MAX_PORTS + 1]; + int init_port_ref[MLX4_MAX_PORTS + 1]; + u16 max_mtu[MLX4_MAX_PORTS + 1]; + u16 max_user_mtu[MLX4_MAX_PORTS + 1]; + u8 pptx; + u8 pprx; + int disable_mcast_ref[MLX4_MAX_PORTS + 1]; + struct mlx4_resource_tracker res_tracker; + struct workqueue_struct *comm_wq; + struct work_struct comm_work; + struct work_struct slave_event_work; + struct work_struct slave_flr_event_work; + spinlock_t slave_state_lock; + __be32 comm_arm_bit_vector[4]; + struct mlx4_eqe cmd_eqe; + struct mlx4_slave_event_eq slave_eq; + struct mutex gen_eqe_mutex[MLX4_MFUNC_MAX]; + struct mlx4_qos_manager qos_ctl[MLX4_MAX_PORTS + 1]; +}; + +struct mlx4_mfunc { + struct mlx4_comm __iomem *comm; + struct mlx4_vhcr_cmd *vhcr; + dma_addr_t vhcr_dma; + + struct mlx4_mfunc_master_ctx master; +}; + +#define MGM_QPN_MASK 0x00FFFFFF +#define MGM_BLCK_LB_BIT 30 + +struct mlx4_mgm { + __be32 next_gid_index; + __be32 members_count; + u32 reserved[2]; + u8 gid[16]; + __be32 qp[MLX4_MAX_QP_PER_MGM]; +}; + +struct mlx4_cmd { + struct dma_pool *pool; + void __iomem *hcr; + struct mutex slave_cmd_mutex; + struct semaphore poll_sem; + struct semaphore event_sem; + struct rw_semaphore switch_sem; + int max_cmds; + spinlock_t context_lock; + int free_head; + struct mlx4_cmd_context *context; + u16 token_mask; + u8 use_events; + u8 toggle; + u8 comm_toggle; + u8 initialized; +}; + +enum { + MLX4_VF_IMMED_VLAN_FLAG_VLAN = 1 << 0, + MLX4_VF_IMMED_VLAN_FLAG_QOS = 1 << 1, + MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE = 1 << 2, +}; +struct mlx4_vf_immed_vlan_work { + struct work_struct work; + struct mlx4_priv *priv; + int flags; + int slave; + int vlan_ix; + int orig_vlan_ix; + u8 port; + u8 qos; + u8 qos_vport; + u16 vlan_id; + u16 orig_vlan_id; + __be16 vlan_proto; +}; + + +struct mlx4_uar_table { + 
struct mlx4_bitmap bitmap; +}; + +struct mlx4_mr_table { + struct mlx4_bitmap mpt_bitmap; + struct mlx4_buddy mtt_buddy; + u64 mtt_base; + u64 mpt_base; + struct mlx4_icm_table mtt_table; + struct mlx4_icm_table dmpt_table; +}; + +struct mlx4_cq_table { + struct mlx4_bitmap bitmap; + spinlock_t lock; + struct radix_tree_root tree; + struct mlx4_icm_table table; + struct mlx4_icm_table cmpt_table; +}; + +struct mlx4_eq_table { + struct mlx4_bitmap bitmap; + char *irq_names; + void __iomem *clr_int; + void __iomem **uar_map; + u32 clr_mask; + struct mlx4_eq *eq; + struct mlx4_icm_table table; + struct mlx4_icm_table cmpt_table; + int have_irq; + u8 inta_pin; +}; + +struct mlx4_srq_table { + struct mlx4_bitmap bitmap; + spinlock_t lock; + struct radix_tree_root tree; + struct mlx4_icm_table table; + struct mlx4_icm_table cmpt_table; +}; + +enum mlx4_qp_table_zones { + MLX4_QP_TABLE_ZONE_GENERAL, + MLX4_QP_TABLE_ZONE_RSS, + MLX4_QP_TABLE_ZONE_RAW_ETH, + MLX4_QP_TABLE_ZONE_NUM +}; + +struct mlx4_qp_table { + struct mlx4_bitmap *bitmap_gen; + struct mlx4_zone_allocator *zones; + u32 zones_uids[MLX4_QP_TABLE_ZONE_NUM]; + u32 rdmarc_base; + int rdmarc_shift; + spinlock_t lock; + struct mlx4_icm_table qp_table; + struct mlx4_icm_table auxc_table; + struct mlx4_icm_table altc_table; + struct mlx4_icm_table rdmarc_table; + struct mlx4_icm_table cmpt_table; +}; + +struct mlx4_mcg_table { + struct mutex mutex; + struct mlx4_bitmap bitmap; + struct mlx4_icm_table table; +}; + +struct mlx4_catas_err { + u32 __iomem *map; + struct timer_list timer; + struct list_head list; +}; + +#define MLX4_MAX_MAC_NUM 128 +#define MLX4_MAC_TABLE_SIZE (MLX4_MAX_MAC_NUM << 3) + +struct mlx4_mac_table { + __be64 entries[MLX4_MAX_MAC_NUM]; + int refs[MLX4_MAX_MAC_NUM]; + bool is_dup[MLX4_MAX_MAC_NUM]; + struct mutex mutex; + int total; + int max; +}; + +#define MLX4_ROCE_GID_ENTRY_SIZE 16 + +struct mlx4_roce_gid_entry { + u8 raw[MLX4_ROCE_GID_ENTRY_SIZE]; +}; + +struct mlx4_roce_gid_table { + struct mlx4_roce_gid_entry roce_gids[MLX4_ROCE_MAX_GIDS]; + struct mutex mutex; +}; + +#define MLX4_MAX_VLAN_NUM 128 +#define MLX4_VLAN_TABLE_SIZE (MLX4_MAX_VLAN_NUM << 2) + +struct mlx4_vlan_table { + __be32 entries[MLX4_MAX_VLAN_NUM]; + int refs[MLX4_MAX_VLAN_NUM]; + int is_dup[MLX4_MAX_VLAN_NUM]; + struct mutex mutex; + int total; + int max; +}; + +#define SET_PORT_GEN_ALL_VALID (MLX4_FLAG_V_MTU_MASK | \ + MLX4_FLAG_V_PPRX_MASK | \ + MLX4_FLAG_V_PPTX_MASK) +#define SET_PORT_PROMISC_SHIFT 31 +#define SET_PORT_MC_PROMISC_SHIFT 30 + +enum { + MCAST_DIRECT_ONLY = 0, + MCAST_DIRECT = 1, + MCAST_DEFAULT = 2 +}; + + +struct mlx4_set_port_general_context { + u16 reserved1; + u8 flags2; + u8 flags; + union { + u8 ignore_fcs; + u8 roce_mode; + }; + u8 reserved2; + __be16 mtu; + u8 pptx; + u8 pfctx; + u16 reserved3; + u8 pprx; + u8 pfcrx; + u16 reserved4; + u32 reserved5; + u8 phv_en; + u8 reserved6[5]; + __be16 user_mtu; + u16 reserved7; + u8 user_mac[6]; +}; + +struct mlx4_set_port_rqp_calc_context { + __be32 base_qpn; + u8 rererved; + u8 n_mac; + u8 n_vlan; + u8 n_prio; + u8 reserved2[3]; + u8 mac_miss; + u8 intra_no_vlan; + u8 no_vlan; + u8 intra_vlan_miss; + u8 vlan_miss; + u8 reserved3[3]; + u8 no_vlan_prio; + __be32 promisc; + __be32 mcast; +}; + +struct mlx4_port_info { + struct mlx4_dev *dev; + int port; + char dev_name[16]; + struct device_attribute port_attr; + enum mlx4_port_type tmp_type; + char dev_mtu_name[16]; + struct device_attribute port_mtu_attr; + struct mlx4_mac_table mac_table; + struct mlx4_vlan_table vlan_table; + 
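/*
 * Worked arithmetic, not in the original header, for the table sizes above:
 *
 *	MLX4_MAC_TABLE_SIZE  = MLX4_MAX_MAC_NUM  << 3 = 128 * 8 = 1024 bytes,
 *		one __be64 entry per MAC address;
 *	MLX4_VLAN_TABLE_SIZE = MLX4_MAX_VLAN_NUM << 2 = 128 * 4 =  512 bytes,
 *		one __be32 entry per VLAN.
 *
 * refs[] and is_dup[] are host-side bookkeeping only (reference counting and
 * the duplicate tracking used by the mac/vlan table bonding helpers); they are
 * not part of the table image sent to the device.
 */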
struct mlx4_roce_gid_table gid_table; + int base_qpn; + struct cpu_rmap *rmap; + struct devlink_port devlink_port; +}; + +struct mlx4_sense { + struct mlx4_dev *dev; + u8 do_sense_port[MLX4_MAX_PORTS + 1]; + u8 sense_allowed[MLX4_MAX_PORTS + 1]; + struct delayed_work sense_poll; +}; + +struct mlx4_msix_ctl { + DECLARE_BITMAP(pool_bm, MAX_MSIX); + struct mutex pool_lock; +}; + +struct mlx4_steer { + struct list_head promisc_qps[MLX4_NUM_STEERS]; + struct list_head steer_entries[MLX4_NUM_STEERS]; +}; + +enum { + MLX4_PCI_DEV_IS_VF = 1 << 0, + MLX4_PCI_DEV_FORCE_SENSE_PORT = 1 << 1, +}; + +enum { + MLX4_NO_RR = 0, + MLX4_USE_RR = 1, +}; + +struct mlx4_priv { + struct mlx4_dev dev; + + struct list_head dev_list; + struct list_head ctx_list; + spinlock_t ctx_lock; + + int pci_dev_data; + int removed; + + struct list_head pgdir_list; + struct mutex pgdir_mutex; + + struct mlx4_fw fw; + struct mlx4_cmd cmd; + struct mlx4_mfunc mfunc; + + struct mlx4_bitmap pd_bitmap; + struct mlx4_bitmap xrcd_bitmap; + struct mlx4_uar_table uar_table; + struct mlx4_mr_table mr_table; + struct mlx4_cq_table cq_table; + struct mlx4_eq_table eq_table; + struct mlx4_srq_table srq_table; + struct mlx4_qp_table qp_table; + struct mlx4_mcg_table mcg_table; + struct mlx4_bitmap counters_bitmap; + int def_counter[MLX4_MAX_PORTS]; + + struct mlx4_catas_err catas_err; + + void __iomem *clr_base; + + struct mlx4_uar driver_uar; + void __iomem *kar; + struct mlx4_port_info port[MLX4_MAX_PORTS + 1]; + struct mlx4_sense sense; + struct mutex port_mutex; + struct mlx4_msix_ctl msix_ctl; + struct mlx4_steer *steer; + struct list_head bf_list; + struct mutex bf_mutex; + struct io_mapping *bf_mapping; + void __iomem *clock_mapping; + int reserved_mtts; + int fs_hash_mode; + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct mlx4_port_map v2p; /* cached port mapping configuration */ + struct mutex bond_mutex; /* for bond mode */ + __be64 slave_node_guids[MLX4_MFUNC_MAX]; + + atomic_t opreq_count; + struct work_struct opreq_task; +}; + +static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev) +{ + return container_of(dev, struct mlx4_priv, dev); +} + +#define MLX4_SENSE_RANGE (HZ * 3) + +extern struct workqueue_struct *mlx4_wq; + +u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap); +void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr); +u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, + int align, u32 skip_mask); +void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt, + int use_rr); +u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap); +int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, + u32 reserved_bot, u32 resetrved_top); +void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap); + +int mlx4_reset(struct mlx4_dev *dev); + +int mlx4_alloc_eq_table(struct mlx4_dev *dev); +void mlx4_free_eq_table(struct mlx4_dev *dev); + +int mlx4_init_pd_table(struct mlx4_dev *dev); +int mlx4_init_xrcd_table(struct mlx4_dev *dev); +int mlx4_init_uar_table(struct mlx4_dev *dev); +int mlx4_init_mr_table(struct mlx4_dev *dev); +int mlx4_init_eq_table(struct mlx4_dev *dev); +int mlx4_init_cq_table(struct mlx4_dev *dev); +int mlx4_init_qp_table(struct mlx4_dev *dev); +int mlx4_init_srq_table(struct mlx4_dev *dev); +int mlx4_init_mcg_table(struct mlx4_dev *dev); + +void mlx4_cleanup_pd_table(struct mlx4_dev *dev); +void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev); +void mlx4_cleanup_uar_table(struct mlx4_dev *dev); +void mlx4_cleanup_mr_table(struct 
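/*
 * Illustrative sketch, not from the original sources, of the mlx4_bitmap
 * allocator declared above, following the same init/alloc/free pattern used by
 * mlx4_init_mcg_table() in mcg.c; the object count of 256 is an arbitrary
 * example value.
 *
 *	struct mlx4_bitmap bitmap;
 *	u32 obj;
 *	int err;
 *
 *	// 256 objects, mask = 256 - 1, nothing reserved at bottom or top
 *	err = mlx4_bitmap_init(&bitmap, 256, 256 - 1, 0, 0);
 *	if (err)
 *		return err;
 *
 *	obj = mlx4_bitmap_alloc(&bitmap);	// returns (u32)-1 when exhausted
 *	if (obj != (u32)-1)
 *		mlx4_bitmap_free(&bitmap, obj, MLX4_NO_RR);
 *
 *	mlx4_bitmap_cleanup(&bitmap);
 */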
mlx4_dev *dev); +void mlx4_cleanup_eq_table(struct mlx4_dev *dev); +void mlx4_cleanup_cq_table(struct mlx4_dev *dev); +void mlx4_cleanup_qp_table(struct mlx4_dev *dev); +void mlx4_cleanup_srq_table(struct mlx4_dev *dev); +void mlx4_cleanup_mcg_table(struct mlx4_dev *dev); +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn); +void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn); +int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn); +void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn); +int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn); +void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn); +int __mlx4_mpt_reserve(struct mlx4_dev *dev); +void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index); +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index); +void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index); +u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order); +void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order); + +int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SYNC_TPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_CONFIG_DEV_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, + int *base, u8 flags); +void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); +int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac); +void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac); +int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list); +int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx); +int mlx4_calc_vf_counters(struct mlx4_dev *dev, int slave, int port, + struct mlx4_counter *data); +int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn); +void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); + +void mlx4_start_catas_poll(struct mlx4_dev *dev); +void mlx4_stop_catas_poll(struct mlx4_dev *dev); +int mlx4_catas_init(struct mlx4_dev *dev); +void mlx4_catas_end(struct mlx4_dev *dev); +int mlx4_restart_one(struct pci_dev *pdev); +int mlx4_register_device(struct mlx4_dev *dev); +void mlx4_unregister_device(struct mlx4_dev 
*dev); +void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, + unsigned long param); + +struct mlx4_dev_cap; +struct mlx4_init_hca_param; + +u64 mlx4_make_profile(struct mlx4_dev *dev, + struct mlx4_profile *request, + struct mlx4_dev_cap *dev_cap, + struct mlx4_init_hca_param *init_hca); +void mlx4_master_comm_channel(struct work_struct *work); +void mlx4_gen_slave_eqe(struct work_struct *work); +void mlx4_master_handle_slave_flr(struct work_struct *work); + +int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_COMM_INT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct 
mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_2ERR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTS2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe); + +enum { + MLX4_CMD_CLEANUP_STRUCT = 1UL << 0, + MLX4_CMD_CLEANUP_POOL = 1UL << 1, + MLX4_CMD_CLEANUP_HCR = 1UL << 2, + MLX4_CMD_CLEANUP_VHCR = 1UL << 3, + MLX4_CMD_CLEANUP_ALL = (MLX4_CMD_CLEANUP_VHCR << 1) - 1 +}; + +int mlx4_cmd_init(struct mlx4_dev *dev); +void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask); +int mlx4_multi_func_init(struct mlx4_dev *dev); +int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev); +void mlx4_multi_func_cleanup(struct mlx4_dev *dev); +void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param); +int mlx4_cmd_use_events(struct mlx4_dev *dev); +void mlx4_cmd_use_polling(struct mlx4_dev *dev); + +int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, + u16 op, unsigned long timeout); + +void mlx4_cq_tasklet_cb(unsigned long data); +void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn); +void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type); + +void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type); + +void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type); + +void mlx4_enter_error_state(struct mlx4_dev_persistent *persist); +int mlx4_comm_internal_err(u32 slave_read); + +int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, + enum mlx4_port_type *type); +void mlx4_do_sense_ports(struct mlx4_dev *dev, + enum mlx4_port_type *stype, + enum mlx4_port_type *defaults); +void mlx4_start_sense(struct mlx4_dev *dev); +void mlx4_stop_sense(struct mlx4_dev *dev); +void mlx4_sense_init(struct mlx4_dev *dev); +int 
mlx4_check_port_params(struct mlx4_dev *dev, + enum mlx4_port_type *port_type); +int mlx4_change_port_types(struct mlx4_dev *dev, + enum mlx4_port_type *port_types); + +void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table); +void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table); +void mlx4_init_roce_gid_table(struct mlx4_dev *dev, + struct mlx4_roce_gid_table *table); +void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); +int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); +int mlx4_bond_vlan_table(struct mlx4_dev *dev); +int mlx4_unbond_vlan_table(struct mlx4_dev *dev); +int mlx4_bond_mac_table(struct mlx4_dev *dev); +int mlx4_unbond_mac_table(struct mlx4_dev *dev); + +int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz); +/* resource tracker functions*/ +int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev, + enum mlx4_resource resource_type, + u64 resource_id, int *slave); +void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave_id); +void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave); +int mlx4_init_resource_tracker(struct mlx4_dev *dev); + +void mlx4_free_resource_tracker(struct mlx4_dev *dev, + enum mlx4_res_tracker_free_type type); + +int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps); + +int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port, + int *gid_tbl_len, int *pkey_tbl_len); + +int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot, enum mlx4_steer_type steer); +int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot, + enum mlx4_steer_type steer); +int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], u8 
port, + int block_mcast_loopback, + enum mlx4_protocol prot, u64 *reg_id); +int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_common_set_vlan_fltr(struct mlx4_dev *dev, int function, + int port, void *buf); +int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_PKEY_TABLE_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_get_mgm_entry_size(struct mlx4_dev *dev); +int mlx4_get_qp_per_mgm(struct mlx4_dev *dev); + +static inline void set_param_l(u64 *arg, u32 val) +{ + *arg = (*arg & 0xffffffff00000000ULL) | (u64) val; +} + +static inline void set_param_h(u64 *arg, u32 val) +{ + *arg = (*arg & 0xffffffff) | ((u64) val << 32); +} + +static inline u32 get_param_l(u64 *arg) +{ + return (u32) (*arg & 0xffffffff); +} + +static inline u32 get_param_h(u64 *arg) +{ + return (u32)(*arg >> 32); +} + +static inline spinlock_t *mlx4_tlock(struct mlx4_dev *dev) +{ + return &mlx4_priv(dev)->mfunc.master.res_tracker.lock; +} + +#define NOT_MASKED_PD_BITS 17 + +void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work); + +void mlx4_init_quotas(struct mlx4_dev *dev); + +/* for VFs, replace zero MACs with randomly-generated MACs at driver start */ +void mlx4_replace_zero_macs(struct mlx4_dev *dev); +int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port); +/* Returns the VF index of slave */ +int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave); +int mlx4_config_mad_demux(struct mlx4_dev *dev); +int mlx4_do_bond(struct mlx4_dev *dev, bool enable); +int mlx4_bond_fs_rules(struct mlx4_dev *dev); +int mlx4_unbond_fs_rules(struct mlx4_dev *dev); + +enum mlx4_zone_flags { + MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO = 1UL << 0, + MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO = 1UL << 1, + MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO = 1UL << 2, + MLX4_ZONE_USE_RR = 1UL << 3, +}; + +enum mlx4_zone_alloc_flags { + /* No two objects could overlap between zones. UID + * could be left unused. If this flag is given and + * two overlapped zones are used, an object will be free'd + * from the smallest possible matching zone. 
+ */
+	MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP = 1UL << 0,
+};
+
+struct mlx4_zone_allocator;
+
+/* Create a new zone allocator */
+struct mlx4_zone_allocator *mlx4_zone_allocator_create(enum mlx4_zone_alloc_flags flags);
+
+/* Attach a mlx4_bitmap <bitmap> of priority <priority> to the zone allocator
+ * <zone_alloc>. Allocating an object from this zone adds an offset <offset>.
+ * Similarly, when searching for an object to free, this offset is taken into
+ * account. The use_rr mlx4_ib parameter for allocating objects from this <bitmap>
+ * is given through the MLX4_ZONE_USE_RR flag in <flags>.
+ * When an allocation fails, <zone_alloc> tries to allocate from other zones
+ * according to the policy set by <flags>. <puid> is the unique identifier
+ * assigned to this zone.
+ */
+int mlx4_zone_add_one(struct mlx4_zone_allocator *zone_alloc,
+		      struct mlx4_bitmap *bitmap,
+		      u32 flags,
+		      int priority,
+		      int offset,
+		      u32 *puid);
+
+/* Remove the bitmap indicated by <uid> from <zone_alloc> */
+int mlx4_zone_remove_one(struct mlx4_zone_allocator *zone_alloc, u32 uid);
+
+/* Delete the zone allocator <zone_alloc> */
+void mlx4_zone_allocator_destroy(struct mlx4_zone_allocator *zone_alloc);
+
+/* Allocate <count> objects with align <align> and skip_mask <skip_mask>
+ * from the mlx4_bitmap whose uid is <uid>. The bitmap which we actually
+ * allocated from is returned in <puid>. If the allocation fails, a negative
+ * number is returned. Otherwise, the offset of the first object is returned.
+ */
+u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int count,
+			    int align, u32 skip_mask, u32 *puid);
+
+/* Free <count> objects, starting from <obj> of the uid <uid> from zone_allocator
+ * <zones>.
+ */
+u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones,
+			   u32 uid, u32 obj, u32 count);
+
+/* If <zones> was allocated with MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP, instead of
+ * specifying the uid when freeing an object, the zone allocator can figure it
+ * out by itself. Other parameters are similar to mlx4_zone_free.
+ */
+u32 mlx4_zone_free_entries_unique(struct mlx4_zone_allocator *zones, u32 obj, u32 count);
+
+/* Returns a pointer to the mlx4_bitmap that was attached to <zones> with <uid> */
+struct mlx4_bitmap *mlx4_zone_get_bitmap(struct mlx4_zone_allocator *zones, u32 uid);
+
+#endif /* MLX4_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
new file mode 100644
index 0000000..ace6545
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -0,0 +1,847 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _MLX4_EN_H_ +#define _MLX4_EN_H_ + +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_MLX4_EN_DCB +#include +#endif +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "en_port.h" +#include "mlx4_stats.h" + +#define DRV_NAME "mlx4_en" +#define DRV_VERSION "4.0-0" + +#define MLX4_EN_MSG_LEVEL (NETIF_MSG_LINK | NETIF_MSG_IFDOWN) + +/* + * Device constants + */ + + +#define MLX4_EN_PAGE_SHIFT 12 +#define MLX4_EN_PAGE_SIZE (1 << MLX4_EN_PAGE_SHIFT) +#define DEF_RX_RINGS 16 +#define MAX_RX_RINGS 128 +#define MIN_RX_RINGS 4 +#define LOG_TXBB_SIZE 6 +#define TXBB_SIZE BIT(LOG_TXBB_SIZE) +#define HEADROOM (2048 / TXBB_SIZE + 1) +#define STAMP_STRIDE 64 +#define STAMP_DWORDS (STAMP_STRIDE / 4) +#define STAMP_SHIFT 31 +#define STAMP_VAL 0x7fffffff +#define STATS_DELAY (HZ / 4) +#define SERVICE_TASK_DELAY (HZ / 4) +#define MAX_NUM_OF_FS_RULES 256 + +#define MLX4_EN_FILTER_HASH_SHIFT 4 +#define MLX4_EN_FILTER_EXPIRY_QUOTA 60 + +/* Typical TSO descriptor with 16 gather entries is 352 bytes... */ +#define MAX_DESC_SIZE 512 +#define MAX_DESC_TXBBS (MAX_DESC_SIZE / TXBB_SIZE) + +/* + * OS related constants and tunables + */ + +#define MLX4_EN_PRIV_FLAGS_BLUEFLAME 1 +#define MLX4_EN_PRIV_FLAGS_PHV 2 + +#define MLX4_EN_WATCHDOG_TIMEOUT (15 * HZ) + +/* Use the maximum between 16384 and a single page */ +#define MLX4_EN_ALLOC_SIZE PAGE_ALIGN(16384) + +#define MLX4_EN_MAX_RX_FRAGS 4 + +/* Maximum ring sizes */ +#define MLX4_EN_MAX_TX_SIZE 8192 +#define MLX4_EN_MAX_RX_SIZE 8192 + +/* Minimum ring size for our page-allocation scheme to work */ +#define MLX4_EN_MIN_RX_SIZE (MLX4_EN_ALLOC_SIZE / SMP_CACHE_BYTES) +#define MLX4_EN_MIN_TX_SIZE (4096 / TXBB_SIZE) + +#define MLX4_EN_SMALL_PKT_SIZE 64 +#define MLX4_EN_MIN_TX_RING_P_UP 1 +#define MLX4_EN_MAX_TX_RING_P_UP 32 +#define MLX4_EN_NUM_UP_LOW 1 +#define MLX4_EN_NUM_UP_HIGH 8 +#define MLX4_EN_DEF_RX_RING_SIZE 1024 +#define MLX4_EN_DEF_TX_RING_SIZE MLX4_EN_DEF_RX_RING_SIZE +#define MAX_TX_RINGS (MLX4_EN_MAX_TX_RING_P_UP * \ + MLX4_EN_NUM_UP_HIGH) + +#define MLX4_EN_DEFAULT_TX_WORK 256 + +/* Target number of packets to coalesce with interrupt moderation */ +#define MLX4_EN_RX_COAL_TARGET 44 +#define MLX4_EN_RX_COAL_TIME 0x10 + +#define MLX4_EN_TX_COAL_PKTS 16 +#define MLX4_EN_TX_COAL_TIME 0x10 + +#define MLX4_EN_MAX_COAL_PKTS U16_MAX +#define MLX4_EN_MAX_COAL_TIME U16_MAX + +#define MLX4_EN_RX_RATE_LOW 400000 +#define MLX4_EN_RX_COAL_TIME_LOW 0 +#define MLX4_EN_RX_RATE_HIGH 450000 +#define MLX4_EN_RX_COAL_TIME_HIGH 128 +#define MLX4_EN_RX_SIZE_THRESH 1024 +#define MLX4_EN_RX_RATE_THRESH (1000000 / MLX4_EN_RX_COAL_TIME_HIGH) +#define MLX4_EN_SAMPLE_INTERVAL 0 +#define MLX4_EN_AVG_PKT_SMALL 256 + +#define MLX4_EN_AUTO_CONF 0xffff + +#define MLX4_EN_DEF_RX_PAUSE 1 +#define MLX4_EN_DEF_TX_PAUSE 1 + +/* Interval between successive polls in the Tx routine when polling is used + instead of interrupts (in per-core Tx rings) - should be power of 2 */ +#define MLX4_EN_TX_POLL_MODER 16 +#define MLX4_EN_TX_POLL_TIMEOUT (HZ / 4) + +#define SMALL_PACKET_SIZE (256 - NET_IP_ALIGN) +#define HEADER_COPY_SIZE (128 - NET_IP_ALIGN) +#define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN) +#define 
PREAMBLE_LEN 8 +#define MLX4_SELFTEST_LB_MIN_MTU (MLX4_LOOPBACK_TEST_PAYLOAD + NET_IP_ALIGN + \ + ETH_HLEN + PREAMBLE_LEN) + +#define MLX4_EN_MIN_MTU 46 +/* VLAN_HLEN is added twice,to support skb vlan tagged with multiple + * headers. (For example: ETH_P_8021Q and ETH_P_8021AD). + */ +#define MLX4_EN_EFF_MTU(mtu) ((mtu) + ETH_HLEN + (2 * VLAN_HLEN)) +#define ETH_BCAST 0xffffffffffffULL + +#define MLX4_EN_LOOPBACK_RETRIES 5 +#define MLX4_EN_LOOPBACK_TIMEOUT 100 + +#ifdef MLX4_EN_PERF_STAT +/* Number of samples to 'average' */ +#define AVG_SIZE 128 +#define AVG_FACTOR 1024 + +#define INC_PERF_COUNTER(cnt) (++(cnt)) +#define ADD_PERF_COUNTER(cnt, add) ((cnt) += (add)) +#define AVG_PERF_COUNTER(cnt, sample) \ + ((cnt) = ((cnt) * (AVG_SIZE - 1) + (sample) * AVG_FACTOR) / AVG_SIZE) +#define GET_PERF_COUNTER(cnt) (cnt) +#define GET_AVG_PERF_COUNTER(cnt) ((cnt) / AVG_FACTOR) + +#else + +#define INC_PERF_COUNTER(cnt) do {} while (0) +#define ADD_PERF_COUNTER(cnt, add) do {} while (0) +#define AVG_PERF_COUNTER(cnt, sample) do {} while (0) +#define GET_PERF_COUNTER(cnt) (0) +#define GET_AVG_PERF_COUNTER(cnt) (0) +#endif /* MLX4_EN_PERF_STAT */ + +/* Constants for TX flow */ +enum { + MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */ + MAX_BF = 256, + MIN_PKT_LEN = 17, +}; + +/* + * Configurables + */ + +enum cq_type { + /* keep tx types first */ + TX, + TX_XDP, +#define MLX4_EN_NUM_TX_TYPES (TX_XDP + 1) + RX, +}; + + +/* + * Useful macros + */ +#define ROUNDUP_LOG2(x) ilog2(roundup_pow_of_two(x)) +#define XNOR(x, y) (!(x) == !(y)) + + +struct mlx4_en_tx_info { + union { + struct sk_buff *skb; + struct page *page; + }; + dma_addr_t map0_dma; + u32 map0_byte_count; + u32 nr_txbb; + u32 nr_bytes; + u8 linear; + u8 data_offset; + u8 inl; + u8 ts_requested; + u8 nr_maps; +} ____cacheline_aligned_in_smp; + + +#define MLX4_EN_BIT_DESC_OWN 0x80000000 +#define CTRL_SIZE sizeof(struct mlx4_wqe_ctrl_seg) +#define MLX4_EN_MEMTYPE_PAD 0x100 +#define DS_SIZE sizeof(struct mlx4_wqe_data_seg) + + +struct mlx4_en_tx_desc { + struct mlx4_wqe_ctrl_seg ctrl; + union { + struct mlx4_wqe_data_seg data; /* at least one data segment */ + struct mlx4_wqe_lso_seg lso; + struct mlx4_wqe_inline_seg inl; + }; +}; + +#define MLX4_EN_USE_SRQ 0x01000000 + +#define MLX4_EN_CX3_LOW_ID 0x1000 +#define MLX4_EN_CX3_HIGH_ID 0x1005 + +struct mlx4_en_rx_alloc { + struct page *page; + dma_addr_t dma; + u32 page_offset; +}; + +#define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT) + +struct mlx4_en_page_cache { + u32 index; + struct { + struct page *page; + dma_addr_t dma; + } buf[MLX4_EN_CACHE_SIZE]; +}; + +struct mlx4_en_priv; + +struct mlx4_en_tx_ring { + /* cache line used and dirtied in tx completion + * (mlx4_en_free_tx_buf()) + */ + u32 last_nr_txbb; + u32 cons; + unsigned long wake_queue; + struct netdev_queue *tx_queue; + u32 (*free_tx_desc)(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, + u64 timestamp, int napi_mode); + struct mlx4_en_rx_ring *recycle_ring; + + /* cache line used and dirtied in mlx4_en_xmit() */ + u32 prod ____cacheline_aligned_in_smp; + unsigned int tx_dropped; + unsigned long bytes; + unsigned long packets; + unsigned long tx_csum; + unsigned long tso_packets; + unsigned long xmit_more; + struct mlx4_bf bf; + + /* Following part should be mostly read */ + __be32 doorbell_qpn; + __be32 mr_key; + u32 size; /* number of TXBBs */ + u32 size_mask; + u32 full_size; + u32 buf_size; + void *buf; + struct mlx4_en_tx_info *tx_info; + int qpn; + u8 queue_index; + bool bf_enabled; + bool bf_alloced; + u8 
hwtstamp_tx_type; + u8 *bounce_buf; + + /* Not used in fast path + * Only queue_stopped might be used if BQL is not properly working. + */ + unsigned long queue_stopped; + struct mlx4_hwq_resources sp_wqres; + struct mlx4_qp sp_qp; + struct mlx4_qp_context sp_context; + cpumask_t sp_affinity_mask; + enum mlx4_qp_state sp_qp_state; + u16 sp_stride; + u16 sp_cqn; /* index of port CQ associated with this ring */ +} ____cacheline_aligned_in_smp; + +struct mlx4_en_rx_desc { + /* actual number of entries depends on rx ring stride */ + struct mlx4_wqe_data_seg data[0]; +}; + +struct mlx4_en_rx_ring { + struct mlx4_hwq_resources wqres; + u32 size ; /* number of Rx descs*/ + u32 actual_size; + u32 size_mask; + u16 stride; + u16 log_stride; + u16 cqn; /* index of port CQ associated with this ring */ + u32 prod; + u32 cons; + u32 buf_size; + u8 fcs_del; + void *buf; + void *rx_info; + struct bpf_prog __rcu *xdp_prog; + struct mlx4_en_page_cache page_cache; + unsigned long bytes; + unsigned long packets; + unsigned long csum_ok; + unsigned long csum_none; + unsigned long csum_complete; + unsigned long rx_alloc_pages; + unsigned long xdp_drop; + unsigned long xdp_tx; + unsigned long xdp_tx_full; + unsigned long dropped; + int hwtstamp_rx_filter; + cpumask_var_t affinity_mask; + struct xdp_rxq_info xdp_rxq; +}; + +struct mlx4_en_cq { + struct mlx4_cq mcq; + struct mlx4_hwq_resources wqres; + int ring; + struct net_device *dev; + union { + struct napi_struct napi; + bool xdp_busy; + }; + int size; + int buf_size; + int vector; + enum cq_type type; + u16 moder_time; + u16 moder_cnt; + struct mlx4_cqe *buf; +#define MLX4_EN_OPCODE_ERROR 0x1e + + struct irq_desc *irq_desc; +}; + +struct mlx4_en_port_profile { + u32 flags; + u32 tx_ring_num[MLX4_EN_NUM_TX_TYPES]; + u32 rx_ring_num; + u32 tx_ring_size; + u32 rx_ring_size; + u8 num_tx_rings_p_up; + u8 rx_pause; + u8 rx_ppp; + u8 tx_pause; + u8 tx_ppp; + u8 num_up; + int rss_rings; + int inline_thold; + struct hwtstamp_config hwtstamp_config; +}; + +struct mlx4_en_profile { + int udp_rss; + u8 rss_mask; + u32 active_ports; + u32 small_pkt_int; + u8 no_reset; + u8 max_num_tx_rings_p_up; + struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1]; +}; + +struct mlx4_en_dev { + struct mlx4_dev *dev; + struct pci_dev *pdev; + struct mutex state_lock; + struct net_device *pndev[MLX4_MAX_PORTS + 1]; + struct net_device *upper[MLX4_MAX_PORTS + 1]; + u32 port_cnt; + bool device_up; + struct mlx4_en_profile profile; + u32 LSO_support; + struct workqueue_struct *workqueue; + struct device *dma_device; + void __iomem *uar_map; + struct mlx4_uar priv_uar; + struct mlx4_mr mr; + u32 priv_pdn; + spinlock_t uar_lock; + u8 mac_removed[MLX4_MAX_PORTS + 1]; + u32 nominal_c_mult; + struct cyclecounter cycles; + seqlock_t clock_lock; + struct timecounter clock; + unsigned long last_overflow_check; + struct ptp_clock *ptp_clock; + struct ptp_clock_info ptp_clock_info; + struct notifier_block nb; +}; + + +struct mlx4_en_rss_map { + int base_qpn; + struct mlx4_qp qps[MAX_RX_RINGS]; + enum mlx4_qp_state state[MAX_RX_RINGS]; + struct mlx4_qp *indir_qp; + enum mlx4_qp_state indir_state; +}; + +enum mlx4_en_port_flag { + MLX4_EN_PORT_ANC = 1<<0, /* Auto-negotiation complete */ + MLX4_EN_PORT_ANE = 1<<1, /* Auto-negotiation enabled */ +}; + +struct mlx4_en_port_state { + int link_state; + int link_speed; + int transceiver; + u32 flags; +}; + +enum mlx4_en_mclist_act { + MCLIST_NONE, + MCLIST_REM, + MCLIST_ADD, +}; + +struct mlx4_en_mc_list { + struct list_head list; + enum mlx4_en_mclist_act 
action; + u8 addr[ETH_ALEN]; + u64 reg_id; + u64 tunnel_reg_id; +}; + +struct mlx4_en_frag_info { + u16 frag_size; + u32 frag_stride; +}; + +#ifdef CONFIG_MLX4_EN_DCB +/* Minimal TC BW - setting to 0 will block traffic */ +#define MLX4_EN_BW_MIN 1 +#define MLX4_EN_BW_MAX 100 /* Utilize 100% of the line */ + +#define MLX4_EN_TC_VENDOR 0 +#define MLX4_EN_TC_ETS 7 + +enum dcb_pfc_type { + pfc_disabled = 0, + pfc_enabled_full, + pfc_enabled_tx, + pfc_enabled_rx +}; + +struct mlx4_en_cee_config { + bool pfc_state; + enum dcb_pfc_type dcb_pfc[MLX4_EN_NUM_UP_HIGH]; +}; +#endif + +struct ethtool_flow_id { + struct list_head list; + struct ethtool_rx_flow_spec flow_spec; + u64 id; +}; + +enum { + MLX4_EN_FLAG_PROMISC = (1 << 0), + MLX4_EN_FLAG_MC_PROMISC = (1 << 1), + /* whether we need to enable hardware loopback by putting dmac + * in Tx WQE + */ + MLX4_EN_FLAG_ENABLE_HW_LOOPBACK = (1 << 2), + /* whether we need to drop packets that hardware loopback-ed */ + MLX4_EN_FLAG_RX_FILTER_NEEDED = (1 << 3), + MLX4_EN_FLAG_FORCE_PROMISC = (1 << 4), + MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP = (1 << 5), +#ifdef CONFIG_MLX4_EN_DCB + MLX4_EN_FLAG_DCB_ENABLED = (1 << 6), +#endif +}; + +#define PORT_BEACON_MAX_LIMIT (65535) +#define MLX4_EN_MAC_HASH_SIZE (1 << BITS_PER_BYTE) +#define MLX4_EN_MAC_HASH_IDX 5 + +struct mlx4_en_stats_bitmap { + DECLARE_BITMAP(bitmap, NUM_ALL_STATS); + struct mutex mutex; /* for mutual access to stats bitmap */ +}; + +struct mlx4_en_priv { + struct mlx4_en_dev *mdev; + struct mlx4_en_port_profile *prof; + struct net_device *dev; + unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; + struct mlx4_en_port_state port_state; + spinlock_t stats_lock; + struct ethtool_flow_id ethtool_rules[MAX_NUM_OF_FS_RULES]; + /* To allow rules removal while port is going down */ + struct list_head ethtool_list; + + unsigned long last_moder_packets[MAX_RX_RINGS]; + unsigned long last_moder_tx_packets; + unsigned long last_moder_bytes[MAX_RX_RINGS]; + unsigned long last_moder_jiffies; + int last_moder_time[MAX_RX_RINGS]; + u16 rx_usecs; + u16 rx_frames; + u16 tx_usecs; + u16 tx_frames; + u32 pkt_rate_low; + u16 rx_usecs_low; + u32 pkt_rate_high; + u16 rx_usecs_high; + u32 sample_interval; + u32 adaptive_rx_coal; + u32 msg_enable; + u32 loopback_ok; + u32 validate_loopback; + + struct mlx4_hwq_resources res; + int link_state; + int last_link_state; + bool port_up; + int port; + int registered; + int allocated; + int stride; + unsigned char current_mac[ETH_ALEN + 2]; + int mac_index; + unsigned max_mtu; + int base_qpn; + int cqe_factor; + int cqe_size; + + struct mlx4_en_rss_map rss_map; + __be32 ctrl_flags; + u32 flags; + u8 num_tx_rings_p_up; + u32 tx_work_limit; + u32 tx_ring_num[MLX4_EN_NUM_TX_TYPES]; + u32 rx_ring_num; + u32 rx_skb_size; + struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS]; + u8 num_frags; + u8 log_rx_info; + u8 dma_dir; + u16 rx_headroom; + + struct mlx4_en_tx_ring **tx_ring[MLX4_EN_NUM_TX_TYPES]; + struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS]; + struct mlx4_en_cq **tx_cq[MLX4_EN_NUM_TX_TYPES]; + struct mlx4_en_cq *rx_cq[MAX_RX_RINGS]; + struct mlx4_qp drop_qp; + struct work_struct rx_mode_task; + struct work_struct watchdog_task; + struct work_struct linkstate_task; + struct delayed_work stats_task; + struct delayed_work service_task; + struct work_struct vxlan_add_task; + struct work_struct vxlan_del_task; + struct mlx4_en_perf_stats pstats; + struct mlx4_en_pkt_stats pkstats; + struct mlx4_en_counter_stats pf_stats; + struct mlx4_en_flow_stats_rx 
rx_priority_flowstats[MLX4_NUM_PRIORITIES]; + struct mlx4_en_flow_stats_tx tx_priority_flowstats[MLX4_NUM_PRIORITIES]; + struct mlx4_en_flow_stats_rx rx_flowstats; + struct mlx4_en_flow_stats_tx tx_flowstats; + struct mlx4_en_port_stats port_stats; + struct mlx4_en_xdp_stats xdp_stats; + struct mlx4_en_phy_stats phy_stats; + struct mlx4_en_stats_bitmap stats_bitmap; + struct list_head mc_list; + struct list_head curr_list; + u64 broadcast_id; + struct mlx4_en_stat_out_mbox hw_stats; + int vids[128]; + bool wol; + struct device *ddev; + struct hlist_head mac_hash[MLX4_EN_MAC_HASH_SIZE]; + struct hwtstamp_config hwtstamp_config; + u32 counter_index; + +#ifdef CONFIG_MLX4_EN_DCB +#define MLX4_EN_DCB_ENABLED 0x3 + struct ieee_ets ets; + u16 maxrate[IEEE_8021QAZ_MAX_TCS]; + enum dcbnl_cndd_states cndd_state[IEEE_8021QAZ_MAX_TCS]; + struct mlx4_en_cee_config cee_config; + u8 dcbx_cap; +#endif +#ifdef CONFIG_RFS_ACCEL + spinlock_t filters_lock; + int last_filter_id; + struct list_head filters; + struct hlist_head filter_hash[1 << MLX4_EN_FILTER_HASH_SHIFT]; +#endif + u64 tunnel_reg_id; + __be16 vxlan_port; + + u32 pflags; + u8 rss_key[MLX4_EN_RSS_KEY_SIZE]; + u8 rss_hash_fn; +}; + +enum mlx4_en_wol { + MLX4_EN_WOL_MAGIC = (1ULL << 61), + MLX4_EN_WOL_ENABLED = (1ULL << 62), +}; + +struct mlx4_mac_entry { + struct hlist_node hlist; + unsigned char mac[ETH_ALEN + 2]; + u64 reg_id; + struct rcu_head rcu; +}; + +static inline struct mlx4_cqe *mlx4_en_get_cqe(void *buf, int idx, int cqe_sz) +{ + return buf + idx * cqe_sz; +} + +#define MLX4_EN_WOL_DO_MODIFY (1ULL << 63) + +void mlx4_en_init_ptys2ethtool_map(void); +void mlx4_en_update_loopback_state(struct net_device *dev, + netdev_features_t features); + +void mlx4_en_destroy_netdev(struct net_device *dev); +int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, + struct mlx4_en_port_profile *prof); + +int mlx4_en_start_port(struct net_device *dev); +void mlx4_en_stop_port(struct net_device *dev, int detach); + +void mlx4_en_set_stats_bitmap(struct mlx4_dev *dev, + struct mlx4_en_stats_bitmap *stats_bitmap, + u8 rx_ppp, u8 rx_pause, + u8 tx_ppp, u8 tx_pause); + +int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv, + struct mlx4_en_priv *tmp, + struct mlx4_en_port_profile *prof, + bool carry_xdp_prog); +void mlx4_en_safe_replace_resources(struct mlx4_en_priv *priv, + struct mlx4_en_priv *tmp); + +int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq, + int entries, int ring, enum cq_type mode, int node); +void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq); +int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, + int cq_idx); +void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); +int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); +void mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); + +void mlx4_en_tx_irq(struct mlx4_cq *mcq); +u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback); +netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); +netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring, + struct mlx4_en_rx_alloc *frame, + struct mlx4_en_priv *priv, unsigned int length, + int tx_ind, bool *doorbell_pending); +void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring); +bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring, + struct mlx4_en_rx_alloc *frame); + +int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, + 
struct mlx4_en_tx_ring **pring, + u32 size, u16 stride, + int node, int queue_index); +void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring **pring); +void mlx4_en_init_tx_xdp_ring_descs(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring); +int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int cq, int user_prio); +void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring); +void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev); +void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv); +int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring **pring, + u32 size, u16 stride, int node, int queue_index); +void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring **pring, + u32 size, u16 stride); +int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv); +void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring); +int mlx4_en_process_rx_cq(struct net_device *dev, + struct mlx4_en_cq *cq, + int budget); +int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget); +int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget); +bool mlx4_en_process_tx_cq(struct net_device *dev, + struct mlx4_en_cq *cq, int napi_budget); +u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u64 timestamp, + int napi_mode); +u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring, + int index, u64 timestamp, + int napi_mode); +void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, + int is_tx, int rss, int qpn, int cqn, int user_prio, + struct mlx4_qp_context *context); +void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event); +int mlx4_en_change_mcast_lb(struct mlx4_en_priv *priv, struct mlx4_qp *qp, + int loopback); +void mlx4_en_calc_rx_buf(struct net_device *dev); +int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv); +void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv); +int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv); +void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv); +int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring); +void mlx4_en_rx_irq(struct mlx4_cq *mcq); + +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); +int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv); + +void mlx4_en_fold_software_stats(struct net_device *dev); +int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset); +int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port); + +#ifdef CONFIG_MLX4_EN_DCB +extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_ops; +extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_pfc_ops; +#endif + +int mlx4_en_setup_tc(struct net_device *dev, u8 up); +int mlx4_en_alloc_tx_queue_per_tc(struct net_device *dev, u8 tc); + +#ifdef CONFIG_RFS_ACCEL +void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv); +#endif + +#define MLX4_EN_NUM_SELF_TEST 5 +void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf); +void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev); + +#define DEV_FEATURE_CHANGED(dev, new_features, feature) \ + ((dev->features & feature) ^ (new_features & feature)) + +int mlx4_en_reset_config(struct net_device *dev, + struct hwtstamp_config ts_config, + netdev_features_t new_features); +void mlx4_en_update_pfc_stats_bitmap(struct mlx4_dev *dev, + struct mlx4_en_stats_bitmap 
*stats_bitmap, + u8 rx_ppp, u8 rx_pause, + u8 tx_ppp, u8 tx_pause); +int mlx4_en_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr); + +/* + * Functions for time stamping + */ +u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe); +void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev, + struct skb_shared_hwtstamps *hwts, + u64 timestamp); +void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev); +void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev); + +/* Globals + */ +extern const struct ethtool_ops mlx4_en_ethtool_ops; + + + +/* + * printk / logging functions + */ + +__printf(3, 4) +void en_print(const char *level, const struct mlx4_en_priv *priv, + const char *format, ...); + +#define en_dbg(mlevel, priv, format, ...) \ +do { \ + if (NETIF_MSG_##mlevel & (priv)->msg_enable) \ + en_print(KERN_DEBUG, priv, format, ##__VA_ARGS__); \ +} while (0) +#define en_warn(priv, format, ...) \ + en_print(KERN_WARNING, priv, format, ##__VA_ARGS__) +#define en_err(priv, format, ...) \ + en_print(KERN_ERR, priv, format, ##__VA_ARGS__) +#define en_info(priv, format, ...) \ + en_print(KERN_INFO, priv, format, ##__VA_ARGS__) + +#define mlx4_err(mdev, format, ...) \ + pr_err(DRV_NAME " %s: " format, \ + dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__) +#define mlx4_info(mdev, format, ...) \ + pr_info(DRV_NAME " %s: " format, \ + dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__) +#define mlx4_warn(mdev, format, ...) \ + pr_warn(DRV_NAME " %s: " format, \ + dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__) + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_stats.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_stats.h new file mode 100644 index 0000000..86b6051 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_stats.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MLX4_STATS_ +#define _MLX4_STATS_ + +#ifdef MLX4_EN_PERF_STAT +#define NUM_PERF_STATS NUM_PERF_COUNTERS +#else +#define NUM_PERF_STATS 0 +#endif + +#define NUM_PRIORITIES 9 +#define NUM_PRIORITY_STATS 2 + +struct mlx4_en_pkt_stats { + unsigned long rx_multicast_packets; + unsigned long rx_broadcast_packets; + unsigned long rx_jabbers; + unsigned long rx_in_range_length_error; + unsigned long rx_out_range_length_error; + unsigned long tx_multicast_packets; + unsigned long tx_broadcast_packets; + unsigned long rx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS]; + unsigned long tx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS]; +#define NUM_PKT_STATS 43 +}; + +struct mlx4_en_counter_stats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +#define NUM_PF_STATS 4 +}; + +struct mlx4_en_port_stats { + unsigned long tso_packets; + unsigned long xmit_more; + unsigned long queue_stopped; + unsigned long wake_queue; + unsigned long tx_timeout; + unsigned long rx_alloc_pages; + unsigned long rx_chksum_good; + unsigned long rx_chksum_none; + unsigned long rx_chksum_complete; + unsigned long tx_chksum_offload; +#define NUM_PORT_STATS 10 +}; + +struct mlx4_en_perf_stats { + u32 tx_poll; + u64 tx_pktsz_avg; + u32 inflight_avg; + u16 tx_coal_avg; + u16 rx_coal_avg; + u32 napi_quota; +#define NUM_PERF_COUNTERS 6 +}; + +struct mlx4_en_xdp_stats { + unsigned long rx_xdp_drop; + unsigned long rx_xdp_tx; + unsigned long rx_xdp_tx_full; +#define NUM_XDP_STATS 3 +}; + +struct mlx4_en_phy_stats { + unsigned long rx_packets_phy; + unsigned long rx_bytes_phy; + unsigned long tx_packets_phy; + unsigned long tx_bytes_phy; +#define NUM_PHY_STATS 4 +}; + +#define NUM_MAIN_STATS 21 + +#define 
MLX4_NUM_PRIORITIES 8 + +struct mlx4_en_flow_stats_rx { + u64 rx_pause; + u64 rx_pause_duration; + u64 rx_pause_transition; +#define NUM_FLOW_STATS_RX 3 +#define NUM_FLOW_PRIORITY_STATS_RX (NUM_FLOW_STATS_RX * \ + MLX4_NUM_PRIORITIES) +}; + +struct mlx4_en_flow_stats_tx { + u64 tx_pause; + u64 tx_pause_duration; + u64 tx_pause_transition; +#define NUM_FLOW_STATS_TX 3 +#define NUM_FLOW_PRIORITY_STATS_TX (NUM_FLOW_STATS_TX * \ + MLX4_NUM_PRIORITIES) +}; + +#define NUM_FLOW_STATS (NUM_FLOW_STATS_RX + NUM_FLOW_STATS_TX + \ + NUM_FLOW_PRIORITY_STATS_TX + \ + NUM_FLOW_PRIORITY_STATS_RX) + +struct mlx4_en_stat_out_flow_control_mbox { + /* Total number of PAUSE frames received from the far-end port */ + __be64 rx_pause; + /* Total number of microseconds that far-end port requested to pause + * transmission of packets + */ + __be64 rx_pause_duration; + /* Number of received transmission from XOFF state to XON state */ + __be64 rx_pause_transition; + /* Total number of PAUSE frames sent from the far-end port */ + __be64 tx_pause; + /* Total time in microseconds that transmission of packets has been + * paused + */ + __be64 tx_pause_duration; + /* Number of transmitter transitions from XOFF state to XON state */ + __be64 tx_pause_transition; + /* Reserverd */ + __be64 reserved[2]; +}; + +enum { + MLX4_DUMP_ETH_STATS_FLOW_CONTROL = 1 << 12 +}; + +#define NUM_ALL_STATS (NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PKT_STATS + \ + NUM_FLOW_STATS + NUM_PERF_STATS + NUM_PF_STATS + \ + NUM_XDP_STATS + NUM_PHY_STATS) + +#define MLX4_FIND_NETDEV_STAT(n) (offsetof(struct net_device_stats, n) / \ + sizeof(((struct net_device_stats *)0)->n)) + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c new file mode 100644 index 0000000..2e84f10 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -0,0 +1,1156 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
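MLX4_FIND_NETDEV_STAT() above turns a field name of struct net_device_stats into an array-style index by dividing the field's byte offset by the member size, which works because every member of that structure has the same type. A minimal standalone illustration of the same offsetof trick against a hypothetical stats structure (not the kernel's net_device_stats):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for a stats structure whose members all share one
 * type, so byte offset / member size yields a stable index. */
struct demo_stats {
        unsigned long rx_packets;
        unsigned long tx_packets;
        unsigned long rx_bytes;
        unsigned long tx_bytes;
};

#define DEMO_FIND_STAT(n) (offsetof(struct demo_stats, n) / \
                           sizeof(((struct demo_stats *)0)->n))

int main(void)
{
        /* Each member maps to its position as if the struct were an array. */
        assert(DEMO_FIND_STAT(rx_packets) == 0);
        assert(DEMO_FIND_STAT(tx_packets) == 1);
        assert(DEMO_FIND_STAT(rx_bytes)   == 2);
        assert(DEMO_FIND_STAT(tx_bytes)   == 3);

        printf("rx_bytes index = %zu\n", DEMO_FIND_STAT(rx_bytes));
        return 0;
}

This lets ethtool-style code index a stats structure as a flat array of unsigned long while still naming fields symbolically.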
+ */ + +#include +#include +#include +#include +#include + +#include + +#include "mlx4.h" +#include "icm.h" + +static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order) +{ + int o; + int m; + u32 seg; + + spin_lock(&buddy->lock); + + for (o = order; o <= buddy->max_order; ++o) + if (buddy->num_free[o]) { + m = 1 << (buddy->max_order - o); + seg = find_first_bit(buddy->bits[o], m); + if (seg < m) + goto found; + } + + spin_unlock(&buddy->lock); + return -1; + + found: + clear_bit(seg, buddy->bits[o]); + --buddy->num_free[o]; + + while (o > order) { + --o; + seg <<= 1; + set_bit(seg ^ 1, buddy->bits[o]); + ++buddy->num_free[o]; + } + + spin_unlock(&buddy->lock); + + seg <<= order; + + return seg; +} + +static void mlx4_buddy_free(struct mlx4_buddy *buddy, u32 seg, int order) +{ + seg >>= order; + + spin_lock(&buddy->lock); + + while (test_bit(seg ^ 1, buddy->bits[order])) { + clear_bit(seg ^ 1, buddy->bits[order]); + --buddy->num_free[order]; + seg >>= 1; + ++order; + } + + set_bit(seg, buddy->bits[order]); + ++buddy->num_free[order]; + + spin_unlock(&buddy->lock); +} + +static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) +{ + int i, s; + + buddy->max_order = max_order; + spin_lock_init(&buddy->lock); + + buddy->bits = kcalloc(buddy->max_order + 1, sizeof(long *), + GFP_KERNEL); + buddy->num_free = kcalloc(buddy->max_order + 1, sizeof(*buddy->num_free), + GFP_KERNEL); + if (!buddy->bits || !buddy->num_free) + goto err_out; + + for (i = 0; i <= buddy->max_order; ++i) { + s = BITS_TO_LONGS(1 << (buddy->max_order - i)); + buddy->bits[i] = kvmalloc_array(s, sizeof(long), GFP_KERNEL | __GFP_ZERO); + if (!buddy->bits[i]) + goto err_out_free; + } + + set_bit(0, buddy->bits[buddy->max_order]); + buddy->num_free[buddy->max_order] = 1; + + return 0; + +err_out_free: + for (i = 0; i <= buddy->max_order; ++i) + kvfree(buddy->bits[i]); + +err_out: + kfree(buddy->bits); + kfree(buddy->num_free); + + return -ENOMEM; +} + +static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy) +{ + int i; + + for (i = 0; i <= buddy->max_order; ++i) + kvfree(buddy->bits[i]); + + kfree(buddy->bits); + kfree(buddy->num_free); +} + +u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order) +{ + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + u32 seg; + int seg_order; + u32 offset; + + seg_order = max_t(int, order - log_mtts_per_seg, 0); + + seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, seg_order); + if (seg == -1) + return -1; + + offset = seg * (1 << log_mtts_per_seg); + + if (mlx4_table_get_range(dev, &mr_table->mtt_table, offset, + offset + (1 << order) - 1)) { + mlx4_buddy_free(&mr_table->mtt_buddy, seg, seg_order); + return -1; + } + + return offset; +} + +static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order) +{ + u64 in_param = 0; + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, order); + err = mlx4_cmd_imm(dev, in_param, &out_param, RES_MTT, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + if (err) + return -1; + return get_param_l(&out_param); + } + return __mlx4_alloc_mtt_range(dev, order); +} + +int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, + struct mlx4_mtt *mtt) +{ + int i; + + if (!npages) { + mtt->order = -1; + mtt->page_shift = MLX4_ICM_PAGE_SHIFT; + return 0; + } else + mtt->page_shift = page_shift; + + for (mtt->order = 0, i = 1; i < npages; i <<= 1) + ++mtt->order; + + mtt->offset = mlx4_alloc_mtt_range(dev, mtt->order); + if (mtt->offset == -1) + return 
-ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mtt_init); + +void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order) +{ + u32 first_seg; + int seg_order; + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + + seg_order = max_t(int, order - log_mtts_per_seg, 0); + first_seg = offset / (1 << log_mtts_per_seg); + + mlx4_buddy_free(&mr_table->mtt_buddy, first_seg, seg_order); + mlx4_table_put_range(dev, &mr_table->mtt_table, offset, + offset + (1 << order) - 1); +} + +static void mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order) +{ + u64 in_param = 0; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, offset); + set_param_h(&in_param, order); + err = mlx4_cmd(dev, in_param, RES_MTT, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + if (err) + mlx4_warn(dev, "Failed to free mtt range at:%d order:%d\n", + offset, order); + return; + } + __mlx4_free_mtt_range(dev, offset, order); +} + +void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt) +{ + if (mtt->order < 0) + return; + + mlx4_free_mtt_range(dev, mtt->offset, mtt->order); +} +EXPORT_SYMBOL_GPL(mlx4_mtt_cleanup); + +u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt) +{ + return (u64) mtt->offset * dev->caps.mtt_entry_sz; +} +EXPORT_SYMBOL_GPL(mlx4_mtt_addr); + +static u32 hw_index_to_key(u32 ind) +{ + return (ind >> 24) | (ind << 8); +} + +static u32 key_to_hw_index(u32 key) +{ + return (key << 24) | (key >> 8); +} + +static int mlx4_SW2HW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int mpt_index) +{ + return mlx4_cmd(dev, mailbox->dma, mpt_index, + 0, MLX4_CMD_SW2HW_MPT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); +} + +static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int mpt_index) +{ + return mlx4_cmd_box(dev, 0, mailbox ? 
mailbox->dma : 0, mpt_index, + !mailbox, MLX4_CMD_HW2SW_MPT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); +} + +/* Must protect against concurrent access */ +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry ***mpt_entry) +{ + int err; + int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1); + struct mlx4_cmd_mailbox *mailbox = NULL; + + if (mmr->enabled != MLX4_MPT_EN_HW) + return -EINVAL; + + err = mlx4_HW2SW_MPT(dev, NULL, key); + if (err) { + mlx4_warn(dev, "HW2SW_MPT failed (%d).", err); + mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n"); + return err; + } + + mmr->enabled = MLX4_MPT_EN_SW; + + if (!mlx4_is_mfunc(dev)) { + **mpt_entry = mlx4_table_find( + &mlx4_priv(dev)->mr_table.dmpt_table, + key, NULL); + } else { + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, key, + 0, MLX4_CMD_QUERY_MPT, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) + goto free_mailbox; + + *mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf; + } + + if (!(*mpt_entry) || !(**mpt_entry)) { + err = -ENOMEM; + goto free_mailbox; + } + + return 0; + +free_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt); + +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry **mpt_entry) +{ + int err; + + if (!mlx4_is_mfunc(dev)) { + /* Make sure any changes to this entry are flushed */ + wmb(); + + *(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW; + + /* Make sure the new status is written */ + wmb(); + + err = mlx4_SYNC_TPT(dev); + } else { + int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1); + + struct mlx4_cmd_mailbox *mailbox = + container_of((void *)mpt_entry, struct mlx4_cmd_mailbox, + buf); + + err = mlx4_SW2HW_MPT(dev, mailbox, key); + } + + if (!err) { + mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK; + mmr->enabled = MLX4_MPT_EN_HW; + } + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt); + +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev, + struct mlx4_mpt_entry **mpt_entry) +{ + if (mlx4_is_mfunc(dev)) { + struct mlx4_cmd_mailbox *mailbox = + container_of((void *)mpt_entry, struct mlx4_cmd_mailbox, + buf); + mlx4_free_cmd_mailbox(dev, mailbox); + } +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt); + +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry, + u32 pdn) +{ + u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags) & ~MLX4_MPT_PD_MASK; + /* The wrapper function will put the slave's id here */ + if (mlx4_is_mfunc(dev)) + pd_flags &= ~MLX4_MPT_PD_VF_MASK; + + mpt_entry->pd_flags = cpu_to_be32(pd_flags | + (pdn & MLX4_MPT_PD_MASK) + | MLX4_MPT_PD_FLAG_EN_INV); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd); + +int mlx4_mr_hw_change_access(struct mlx4_dev *dev, + struct mlx4_mpt_entry *mpt_entry, + u32 access) +{ + u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) | + (access & MLX4_PERM_MASK); + + mpt_entry->flags = cpu_to_be32(flags); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access); + +static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, + u64 iova, u64 size, u32 access, int npages, + int page_shift, struct mlx4_mr *mr) +{ + mr->iova = iova; + mr->size = size; + mr->pd = pd; + mr->access = access; + mr->enabled = MLX4_MPT_DISABLED; + mr->key = hw_index_to_key(mridx); + + return mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); +} + +static int mlx4_WRITE_MTT(struct 
mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox, + int num_entries) +{ + return mlx4_cmd(dev, mailbox->dma, num_entries, 0, MLX4_CMD_WRITE_MTT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +int __mlx4_mpt_reserve(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap); +} + +static int mlx4_mpt_reserve(struct mlx4_dev *dev) +{ + u64 out_param; + + if (mlx4_is_mfunc(dev)) { + if (mlx4_cmd_imm(dev, 0, &out_param, RES_MPT, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED)) + return -1; + return get_param_l(&out_param); + } + return __mlx4_mpt_reserve(dev); +} + +void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index, MLX4_NO_RR); +} + +static void mlx4_mpt_release(struct mlx4_dev *dev, u32 index) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, index); + if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed to release mr index:%d\n", + index); + return; + } + __mlx4_mpt_release(dev, index); +} + +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) +{ + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + + return mlx4_table_get(dev, &mr_table->dmpt_table, index); +} + +static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) +{ + u64 param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(¶m, index); + return mlx4_cmd_imm(dev, param, ¶m, RES_MPT, RES_OP_MAP_ICM, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + } + return __mlx4_mpt_alloc_icm(dev, index); +} + +void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index) +{ + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + + mlx4_table_put(dev, &mr_table->dmpt_table, index); +} + +static void mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, index); + if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_MAP_ICM, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed to free icm of mr index:%d\n", + index); + return; + } + return __mlx4_mpt_free_icm(dev, index); +} + +int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, + int npages, int page_shift, struct mlx4_mr *mr) +{ + u32 index; + int err; + + index = mlx4_mpt_reserve(dev); + if (index == -1) + return -ENOMEM; + + err = mlx4_mr_alloc_reserved(dev, index, pd, iova, size, + access, npages, page_shift, mr); + if (err) + mlx4_mpt_release(dev, index); + + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_alloc); + +static int mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr) +{ + int err; + + if (mr->enabled == MLX4_MPT_EN_HW) { + err = mlx4_HW2SW_MPT(dev, NULL, + key_to_hw_index(mr->key) & + (dev->caps.num_mpts - 1)); + if (err) { + mlx4_warn(dev, "HW2SW_MPT failed (%d), MR has MWs bound to it\n", + err); + return err; + } + + mr->enabled = MLX4_MPT_EN_SW; + } + mlx4_mtt_cleanup(dev, &mr->mtt); + + return 0; +} + +int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr) +{ + int ret; + + ret = mlx4_mr_free_reserved(dev, mr); + if (ret) + return ret; + if (mr->enabled) + mlx4_mpt_free_icm(dev, key_to_hw_index(mr->key)); + mlx4_mpt_release(dev, key_to_hw_index(mr->key)); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_free); + +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct 
mlx4_mr *mr) +{ + mlx4_mtt_cleanup(dev, &mr->mtt); + mr->mtt.order = -1; +} +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup); + +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, + u64 iova, u64 size, int npages, + int page_shift, struct mlx4_mpt_entry *mpt_entry) +{ + int err; + + err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); + if (err) + return err; + + mpt_entry->start = cpu_to_be64(iova); + mpt_entry->length = cpu_to_be64(size); + mpt_entry->entity_size = cpu_to_be32(page_shift); + mpt_entry->flags &= ~(cpu_to_be32(MLX4_MPT_FLAG_FREE | + MLX4_MPT_FLAG_SW_OWNS)); + if (mr->mtt.order < 0) { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); + mpt_entry->mtt_addr = 0; + } else { + mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev, + &mr->mtt)); + if (mr->mtt.page_shift == 0) + mpt_entry->mtt_sz = cpu_to_be32(1 << mr->mtt.order); + } + if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) { + /* fast register MR in free state */ + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE); + mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | + MLX4_MPT_PD_FLAG_RAE); + } else { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); + } + mr->enabled = MLX4_MPT_EN_SW; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write); + +int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mpt_entry *mpt_entry; + int err; + + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key)); + if (err) + return err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_table; + } + mpt_entry = mailbox->buf; + mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_MIO | + MLX4_MPT_FLAG_REGION | + mr->access); + + mpt_entry->key = cpu_to_be32(key_to_hw_index(mr->key)); + mpt_entry->pd_flags = cpu_to_be32(mr->pd | MLX4_MPT_PD_FLAG_EN_INV); + mpt_entry->start = cpu_to_be64(mr->iova); + mpt_entry->length = cpu_to_be64(mr->size); + mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift); + + if (mr->mtt.order < 0) { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); + mpt_entry->mtt_addr = 0; + } else { + mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev, + &mr->mtt)); + } + + if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) { + /* fast register MR in free state */ + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE); + mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | + MLX4_MPT_PD_FLAG_RAE); + mpt_entry->mtt_sz = cpu_to_be32(1 << mr->mtt.order); + } else { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); + } + + err = mlx4_SW2HW_MPT(dev, mailbox, + key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1)); + if (err) { + mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err); + goto err_cmd; + } + mr->enabled = MLX4_MPT_EN_HW; + + mlx4_free_cmd_mailbox(dev, mailbox); + + return 0; + +err_cmd: + mlx4_free_cmd_mailbox(dev, mailbox); + +err_table: + mlx4_mpt_free_icm(dev, key_to_hw_index(mr->key)); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_enable); + +static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + __be64 *mtts; + dma_addr_t dma_handle; + int i; + + mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->offset + + start_index, &dma_handle); + + if (!mtts) + return -ENOMEM; + + dma_sync_single_for_cpu(&dev->persist->pdev->dev, dma_handle, + npages * sizeof(u64), DMA_TO_DEVICE); + + for (i = 0; i < npages; ++i) + mtts[i] = 
cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); + + dma_sync_single_for_device(&dev->persist->pdev->dev, dma_handle, + npages * sizeof(u64), DMA_TO_DEVICE); + + return 0; +} + +int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list) +{ + int err = 0; + int chunk; + int mtts_per_page; + int max_mtts_first_page; + + /* compute how may mtts fit in the first page */ + mtts_per_page = PAGE_SIZE / sizeof(u64); + max_mtts_first_page = mtts_per_page - (mtt->offset + start_index) + % mtts_per_page; + + chunk = min_t(int, max_mtts_first_page, npages); + + while (npages > 0) { + err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list); + if (err) + return err; + npages -= chunk; + start_index += chunk; + page_list += chunk; + + chunk = min_t(int, mtts_per_page, npages); + } + return err; +} + +int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list) +{ + struct mlx4_cmd_mailbox *mailbox = NULL; + __be64 *inbox = NULL; + int chunk; + int err = 0; + int i; + + if (mtt->order < 0) + return -EINVAL; + + if (mlx4_is_mfunc(dev)) { + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + while (npages > 0) { + chunk = min_t(int, MLX4_MAILBOX_SIZE / sizeof(u64) - 2, + npages); + inbox[0] = cpu_to_be64(mtt->offset + start_index); + inbox[1] = 0; + for (i = 0; i < chunk; ++i) + inbox[i + 2] = cpu_to_be64(page_list[i] | + MLX4_MTT_FLAG_PRESENT); + err = mlx4_WRITE_MTT(dev, mailbox, chunk); + if (err) { + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + } + + npages -= chunk; + start_index += chunk; + page_list += chunk; + } + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + } + + return __mlx4_write_mtt(dev, mtt, start_index, npages, page_list); +} +EXPORT_SYMBOL_GPL(mlx4_write_mtt); + +int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + struct mlx4_buf *buf) +{ + u64 *page_list; + int err; + int i; + + page_list = kcalloc(buf->npages, sizeof(*page_list), GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + for (i = 0; i < buf->npages; ++i) + if (buf->nbufs == 1) + page_list[i] = buf->direct.map + (i << buf->page_shift); + else + page_list[i] = buf->page_list[i].map; + + err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list); + + kfree(page_list); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt); + +int mlx4_mw_alloc(struct mlx4_dev *dev, u32 pd, enum mlx4_mw_type type, + struct mlx4_mw *mw) +{ + u32 index; + + if ((type == MLX4_MW_TYPE_1 && + !(dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW)) || + (type == MLX4_MW_TYPE_2 && + !(dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN))) + return -EOPNOTSUPP; + + index = mlx4_mpt_reserve(dev); + if (index == -1) + return -ENOMEM; + + mw->key = hw_index_to_key(index); + mw->pd = pd; + mw->type = type; + mw->enabled = MLX4_MPT_DISABLED; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mw_alloc); + +int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mpt_entry *mpt_entry; + int err; + + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key)); + if (err) + return err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_table; + } + mpt_entry = mailbox->buf; + + /* Note that the MLX4_MPT_FLAG_REGION bit in mpt_entry->flags is turned + * off, thus creating a memory window and not a memory region. 
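__mlx4_write_mtt() above splits a page_list into chunks so that each call to mlx4_write_mtt_chunk() stays within one page's worth of 8-byte MTT entries: the first chunk is capped by how many entries remain in the page that (mtt->offset + start_index) lands in, and every later chunk is at most a full page. A standalone sketch of that chunking arithmetic (PAGE_SIZE hard-coded to 4096 here; the real code uses the kernel's PAGE_SIZE and issues one mlx4_write_mtt_chunk() per chunk):

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096 /* stand-in for the kernel's PAGE_SIZE */

/* Print the chunks that __mlx4_write_mtt()-style splitting would produce
 * for npages MTT entries starting at (offset + start_index). */
static void show_mtt_chunks(int offset, int start_index, int npages)
{
        int mtts_per_page = DEMO_PAGE_SIZE / sizeof(uint64_t);
        int max_mtts_first_page = mtts_per_page -
                                  (offset + start_index) % mtts_per_page;
        int chunk = npages < max_mtts_first_page ? npages : max_mtts_first_page;

        while (npages > 0) {
                printf("write %d entries at index %d\n", chunk, start_index);
                npages -= chunk;
                start_index += chunk;
                chunk = npages < mtts_per_page ? npages : mtts_per_page;
        }
}

int main(void)
{
        /* 1000 entries starting 500 entries into a 512-entry page: the first
         * write is limited to 512 - 500 = 12 entries, the next one covers a
         * full 512, and the remaining 476 come last. */
        show_mtt_chunks(0, 500, 1000);
        return 0;
}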
+ */ + mpt_entry->key = cpu_to_be32(key_to_hw_index(mw->key)); + mpt_entry->pd_flags = cpu_to_be32(mw->pd); + if (mw->type == MLX4_MW_TYPE_2) { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE); + mpt_entry->qpn = cpu_to_be32(MLX4_MPT_QP_FLAG_BOUND_QP); + mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_EN_INV); + } + + err = mlx4_SW2HW_MPT(dev, mailbox, + key_to_hw_index(mw->key) & + (dev->caps.num_mpts - 1)); + if (err) { + mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err); + goto err_cmd; + } + mw->enabled = MLX4_MPT_EN_HW; + + mlx4_free_cmd_mailbox(dev, mailbox); + + return 0; + +err_cmd: + mlx4_free_cmd_mailbox(dev, mailbox); + +err_table: + mlx4_mpt_free_icm(dev, key_to_hw_index(mw->key)); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mw_enable); + +void mlx4_mw_free(struct mlx4_dev *dev, struct mlx4_mw *mw) +{ + int err; + + if (mw->enabled == MLX4_MPT_EN_HW) { + err = mlx4_HW2SW_MPT(dev, NULL, + key_to_hw_index(mw->key) & + (dev->caps.num_mpts - 1)); + if (err) + mlx4_warn(dev, "xxx HW2SW_MPT failed (%d)\n", err); + + mw->enabled = MLX4_MPT_EN_SW; + } + if (mw->enabled) + mlx4_mpt_free_icm(dev, key_to_hw_index(mw->key)); + mlx4_mpt_release(dev, key_to_hw_index(mw->key)); +} +EXPORT_SYMBOL_GPL(mlx4_mw_free); + +int mlx4_init_mr_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mr_table *mr_table = &priv->mr_table; + int err; + + /* Nothing to do for slaves - all MR handling is forwarded + * to the master */ + if (mlx4_is_slave(dev)) + return 0; + + if (!is_power_of_2(dev->caps.num_mpts)) + return -EINVAL; + + err = mlx4_bitmap_init(&mr_table->mpt_bitmap, dev->caps.num_mpts, + ~0, dev->caps.reserved_mrws, 0); + if (err) + return err; + + err = mlx4_buddy_init(&mr_table->mtt_buddy, + ilog2((u32)dev->caps.num_mtts / + (1 << log_mtts_per_seg))); + if (err) + goto err_buddy; + + if (dev->caps.reserved_mtts) { + priv->reserved_mtts = + mlx4_alloc_mtt_range(dev, + fls(dev->caps.reserved_mtts - 1)); + if (priv->reserved_mtts < 0) { + mlx4_warn(dev, "MTT table of order %u is too small\n", + mr_table->mtt_buddy.max_order); + err = -ENOMEM; + goto err_reserve_mtts; + } + } + + return 0; + +err_reserve_mtts: + mlx4_buddy_cleanup(&mr_table->mtt_buddy); + +err_buddy: + mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); + + return err; +} + +void mlx4_cleanup_mr_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mr_table *mr_table = &priv->mr_table; + + if (mlx4_is_slave(dev)) + return; + if (priv->reserved_mtts >= 0) + mlx4_free_mtt_range(dev, priv->reserved_mtts, + fls(dev->caps.reserved_mtts - 1)); + mlx4_buddy_cleanup(&mr_table->mtt_buddy); + mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); +} + +static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova) +{ + int i, page_mask; + + if (npages > fmr->max_pages) + return -EINVAL; + + page_mask = (1 << fmr->page_shift) - 1; + + /* We are getting page lists, so va must be page aligned. 
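mlx4_check_fmr(), whose body continues just below, rejects an iova that is not aligned to the FMR's page size by testing it against page_mask = (1 << page_shift) - 1. A tiny standalone version of the same mask test (returning -1 as a stand-in for the kernel's -EINVAL):

#include <stdint.h>
#include <stdio.h>

/* Return 0 if iova is aligned to a (1 << page_shift)-byte boundary,
 * -1 otherwise -- the mask test mlx4_check_fmr() applies. */
static int check_page_aligned(uint64_t iova, int page_shift)
{
        uint64_t page_mask = (1ULL << page_shift) - 1;

        return (iova & page_mask) ? -1 : 0;
}

int main(void)
{
        /* With 4 KiB pages (page_shift = 12), 0x10000 is aligned
         * and 0x10010 is not. */
        printf("0x10000: %d\n", check_page_aligned(0x10000, 12)); /*  0 */
        printf("0x10010: %d\n", check_page_aligned(0x10010, 12)); /* -1 */
        return 0;
}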
*/ + if (iova & page_mask) + return -EINVAL; + + /* Trust the user not to pass misaligned data in page_list */ + if (0) + for (i = 0; i < npages; ++i) { + if (page_list[i] & ~page_mask) + return -EINVAL; + } + + if (fmr->maps >= fmr->max_maps) + return -EINVAL; + + return 0; +} + +int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova, u32 *lkey, u32 *rkey) +{ + u32 key; + int i, err; + + err = mlx4_check_fmr(fmr, page_list, npages, iova); + if (err) + return err; + + ++fmr->maps; + + key = key_to_hw_index(fmr->mr.key); + key += dev->caps.num_mpts; + *lkey = *rkey = fmr->mr.key = hw_index_to_key(key); + + *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW; + + /* Make sure MPT status is visible before writing MTT entries */ + wmb(); + + dma_sync_single_for_cpu(&dev->persist->pdev->dev, fmr->dma_handle, + npages * sizeof(u64), DMA_TO_DEVICE); + + for (i = 0; i < npages; ++i) + fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); + + dma_sync_single_for_device(&dev->persist->pdev->dev, fmr->dma_handle, + npages * sizeof(u64), DMA_TO_DEVICE); + + fmr->mpt->key = cpu_to_be32(key); + fmr->mpt->lkey = cpu_to_be32(key); + fmr->mpt->length = cpu_to_be64(npages * (1ull << fmr->page_shift)); + fmr->mpt->start = cpu_to_be64(iova); + + /* Make MTT entries are visible before setting MPT status */ + wmb(); + + *(u8 *) fmr->mpt = MLX4_MPT_STATUS_HW; + + /* Make sure MPT status is visible before consumer can use FMR */ + wmb(); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr); + +int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, + int max_maps, u8 page_shift, struct mlx4_fmr *fmr) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err = -ENOMEM; + + if (max_maps > dev->caps.max_fmr_maps) + return -EINVAL; + + if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32) + return -EINVAL; + + /* All MTTs must fit in the same page */ + if (max_pages * sizeof(*fmr->mtts) > PAGE_SIZE) + return -EINVAL; + + fmr->page_shift = page_shift; + fmr->max_pages = max_pages; + fmr->max_maps = max_maps; + fmr->maps = 0; + + err = mlx4_mr_alloc(dev, pd, 0, 0, access, max_pages, + page_shift, &fmr->mr); + if (err) + return err; + + fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table, + fmr->mr.mtt.offset, + &fmr->dma_handle); + + if (!fmr->mtts) { + err = -ENOMEM; + goto err_free; + } + + return 0; + +err_free: + (void) mlx4_mr_free(dev, &fmr->mr); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_alloc); + +int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + err = mlx4_mr_enable(dev, &fmr->mr); + if (err) + return err; + + fmr->mpt = mlx4_table_find(&priv->mr_table.dmpt_table, + key_to_hw_index(fmr->mr.key), NULL); + if (!fmr->mpt) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_enable); + +void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, + u32 *lkey, u32 *rkey) +{ + if (!fmr->maps) + return; + + /* To unmap: it is sufficient to take back ownership from HW */ + *(u8 *)fmr->mpt = MLX4_MPT_STATUS_SW; + + /* Make sure MPT status is visible */ + wmb(); + + fmr->maps = 0; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_unmap); + +int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr) +{ + int ret; + + if (fmr->maps) + return -EBUSY; + if (fmr->mr.enabled == MLX4_MPT_EN_HW) { + /* In case of FMR was enabled and unmapped + * make sure to give ownership of MPT back to HW + * so HW2SW_MPT command will success. 
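mlx4_map_phys_fmr() above remaps an FMR under a fresh key by converting the current key to its hardware index with key_to_hw_index(), adding dev->caps.num_mpts, and rotating the result back into key form with hw_index_to_key(). Because num_mpts is a power of two (enforced in mlx4_init_mr_table()), the addition leaves the index modulo num_mpts, and therefore the MPT entry selected, unchanged, while the key value handed to consumers changes. A standalone sketch of that index/key arithmetic, with a hypothetical num_mpts and index:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace copies of the key<->index byte rotations from mr.c above. */
static uint32_t hw_index_to_key(uint32_t ind)
{
        return (ind >> 24) | (ind << 8);
}

static uint32_t key_to_hw_index(uint32_t key)
{
        return (key << 24) | (key >> 8);
}

int main(void)
{
        uint32_t num_mpts = 1 << 17;      /* hypothetical dev->caps.num_mpts */
        uint32_t mpt_index = 0x00012345;  /* hypothetical MPT entry index    */
        uint32_t key = hw_index_to_key(mpt_index);

        /* The two rotations are inverses of each other. */
        assert(key_to_hw_index(key) == mpt_index);

        /* FMR remap step: bump the index by num_mpts, re-derive the key. */
        uint32_t new_index = key_to_hw_index(key) + num_mpts;
        uint32_t new_key = hw_index_to_key(new_index);

        /* The MPT entry selected (index mod num_mpts) is unchanged ... */
        assert((new_index & (num_mpts - 1)) == (mpt_index & (num_mpts - 1)));
        /* ... but the key handed out to consumers is different. */
        assert(new_key != key);

        printf("old key 0x%08x -> new key 0x%08x\n", key, new_key);
        return 0;
}

The effect is that stale remote references built on the previous key stop matching, even though the same MPT slot keeps backing the FMR.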
+ */ + *(u8 *)fmr->mpt = MLX4_MPT_STATUS_SW; + /* Make sure MPT status is visible before changing MPT fields */ + wmb(); + fmr->mpt->length = 0; + fmr->mpt->start = 0; + /* Make sure MPT data is visible after changing MPT status */ + wmb(); + *(u8 *)fmr->mpt = MLX4_MPT_STATUS_HW; + /* make sure MPT status is visible */ + wmb(); + } + + ret = mlx4_mr_free(dev, &fmr->mr); + if (ret) + return ret; + fmr->mr.enabled = MLX4_MPT_DISABLED; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_free); + +int mlx4_SYNC_TPT(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +} +EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT); diff --git a/drivers/net/ethernet/mellanox/mlx4/pd.c b/drivers/net/ethernet/mellanox/mlx4/pd.c new file mode 100644 index 0000000..6fc156a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/pd.c @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
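The FMR map, unmap and free paths above all follow the same ownership discipline around the MPT entry: take it back to software (MLX4_MPT_STATUS_SW), update its fields, then publish it back to hardware (MLX4_MPT_STATUS_HW), with wmb() between the steps so the device never observes half-written state. A hedged userspace analogue of that publish pattern, using C11 release/acquire atomics and a pthread consumer in place of the kernel's wmb() and the HCA (hypothetical names; this shows the general pattern, not the driver's code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define STATUS_SW 0 /* entry owned by software, fields may be in flux */
#define STATUS_HW 1 /* entry published, consumer may read the fields  */

/* Hypothetical stand-in for an MPT-like entry guarded by a status flag. */
struct demo_entry {
        uint64_t start;
        uint64_t length;
        _Atomic int status;
};

static struct demo_entry entry = { .status = STATUS_SW };

static void *consumer(void *arg)
{
        (void)arg;
        /* Wait until the entry is published; the acquire load pairs with
         * the producer's release store, the way the device only trusts an
         * entry once its status byte says it is hardware-owned. */
        while (atomic_load_explicit(&entry.status, memory_order_acquire)
               != STATUS_HW)
                ;
        printf("consumer sees start=0x%llx length=%llu\n",
               (unsigned long long)entry.start,
               (unsigned long long)entry.length);
        return NULL;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, consumer, NULL);

        /* Software owns the entry: fill in the fields first ... */
        entry.start = 0x100000;
        entry.length = 4096;
        /* ... then publish with release ordering, the role wmb() plus the
         * write of MLX4_MPT_STATUS_HW plays in the driver. */
        atomic_store_explicit(&entry.status, STATUS_HW, memory_order_release);

        pthread_join(tid, NULL);
        return 0;
}

Compile with -pthread; the point is only the ordering guarantee, not the busy-wait, which the driver obviously does not use.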
+ */
+
+#include
+#include
+#include
+
+#include
+
+#include "mlx4.h"
+#include "icm.h"
+
+enum {
+	MLX4_NUM_RESERVED_UARS = 8
+};
+
+int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	*pdn = mlx4_bitmap_alloc(&priv->pd_bitmap);
+	if (*pdn == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_alloc);
+
+void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->pd_bitmap, pdn, MLX4_USE_RR);
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_free);
+
+int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	*xrcdn = mlx4_bitmap_alloc(&priv->xrcd_bitmap);
+	if (*xrcdn == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn)
+{
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param,
+				   RES_XRCD, RES_OP_RESERVE,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			return err;
+
+		*xrcdn = get_param_l(&out_param);
+		return 0;
+	}
+	return __mlx4_xrcd_alloc(dev, xrcdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_alloc);
+
+void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->xrcd_bitmap, xrcdn, MLX4_USE_RR);
+}
+
+void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, xrcdn);
+		err = mlx4_cmd(dev, in_param, RES_XRCD,
+			       RES_OP_RESERVE, MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			mlx4_warn(dev, "Failed to release xrcdn %d\n", xrcdn);
+	} else
+		__mlx4_xrcd_free(dev, xrcdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_free);
+
+int mlx4_init_pd_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds,
+				(1 << NOT_MASKED_PD_BITS) - 1,
+				dev->caps.reserved_pds, 0);
+}
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->pd_bitmap);
+}
+
+int mlx4_init_xrcd_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return mlx4_bitmap_init(&priv->xrcd_bitmap, (1 << 16),
+				(1 << 16) - 1, dev->caps.reserved_xrcds + 1, 0);
+}
+
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->xrcd_bitmap);
+}
+
+int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+	int offset;
+
+	uar->index = mlx4_bitmap_alloc(&mlx4_priv(dev)->uar_table.bitmap);
+	if (uar->index == -1)
+		return -ENOMEM;
+
+	if (mlx4_is_slave(dev))
+		offset = uar->index % ((int)pci_resource_len(dev->persist->pdev,
+							     2) /
+				       dev->caps.uar_page_size);
+	else
+		offset = uar->index;
+	uar->pfn = (pci_resource_start(dev->persist->pdev, 2) >> PAGE_SHIFT)
+		    + offset;
+	uar->map = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_alloc);
+
+void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->uar_table.bitmap, uar->index, MLX4_USE_RR);
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_free);
+
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_uar *uar;
+	int err = 0;
+	int idx;
+
+	if (!priv->bf_mapping)
+		return -ENOMEM;
+
+	mutex_lock(&priv->bf_mutex);
+	if (!list_empty(&priv->bf_list))
+		uar = list_entry(priv->bf_list.next, struct mlx4_uar, bf_list);
+	else {
+		if (mlx4_bitmap_avail(&priv->uar_table.bitmap) < MLX4_NUM_RESERVED_UARS) {
+			err = -ENOMEM;
+			goto out;
+		}
+		uar = kmalloc_node(sizeof(*uar), GFP_KERNEL, node);
+		if (!uar) {
+			uar = kmalloc(sizeof(*uar), GFP_KERNEL);
+			if (!uar) {
+				err = -ENOMEM;
+				goto out;
+			}
+		}
+		err = mlx4_uar_alloc(dev, uar);
+		if (err)
+			goto free_kmalloc;
+
+		uar->map = ioremap(uar->pfn << PAGE_SHIFT, PAGE_SIZE);
+		if (!uar->map) {
+			err = -ENOMEM;
+			goto free_uar;
+		}
+
+		uar->bf_map = io_mapping_map_wc(priv->bf_mapping,
+						uar->index << PAGE_SHIFT,
+						PAGE_SIZE);
+		if (!uar->bf_map) {
+			err = -ENOMEM;
+			goto unamp_uar;
+		}
+		uar->free_bf_bmap = 0;
+		list_add(&uar->bf_list, &priv->bf_list);
+	}
+
+	idx = ffz(uar->free_bf_bmap);
+	uar->free_bf_bmap |= 1 << idx;
+	bf->uar = uar;
+	bf->offset = 0;
+	bf->buf_size = dev->caps.bf_reg_size / 2;
+	bf->reg = uar->bf_map + idx * dev->caps.bf_reg_size;
+	if (uar->free_bf_bmap == (1 << dev->caps.bf_regs_per_page) - 1)
+		list_del_init(&uar->bf_list);
+
+	goto out;
+
+unamp_uar:
+	bf->uar = NULL;
+	iounmap(uar->map);
+
+free_uar:
+	mlx4_uar_free(dev, uar);
+
+free_kmalloc:
+	kfree(uar);
+
+out:
+	mutex_unlock(&priv->bf_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_alloc);
+
+void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int idx;
+
+	if (!bf->uar || !bf->uar->bf_map)
+		return;
+
+	mutex_lock(&priv->bf_mutex);
+	idx = (bf->reg - bf->uar->bf_map) / dev->caps.bf_reg_size;
+	bf->uar->free_bf_bmap &= ~(1 << idx);
+	if (!bf->uar->free_bf_bmap) {
+		if (!list_empty(&bf->uar->bf_list))
+			list_del(&bf->uar->bf_list);
+
+		io_mapping_unmap(bf->uar->bf_map);
+		iounmap(bf->uar->map);
+		mlx4_uar_free(dev, bf->uar);
+		kfree(bf->uar);
+	} else if (list_empty(&bf->uar->bf_list))
+		list_add(&bf->uar->bf_list, &priv->bf_list);
+
+	mutex_unlock(&priv->bf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_free);
+
+int mlx4_init_uar_table(struct mlx4_dev *dev)
+{
+	int num_reserved_uar = mlx4_get_num_reserved_uar(dev);
+
+	mlx4_dbg(dev, "uar_page_shift = %d", dev->uar_page_shift);
+	mlx4_dbg(dev, "Effective reserved_uars=%d", dev->caps.reserved_uars);
+
+	if (dev->caps.num_uars <= num_reserved_uar) {
+		mlx4_err(
+			dev, "Only %d UAR pages (need more than %d)\n",
+			dev->caps.num_uars, num_reserved_uar);
+		mlx4_err(dev, "Increase firmware log2_uar_bar_megabytes?\n");
+		return -ENODEV;
+	}
+
+	return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap,
+				dev->caps.num_uars, dev->caps.num_uars - 1,
+				dev->caps.reserved_uars, 0);
+}
+
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->uar_table.bitmap);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
new file mode 100644
index 0000000..3ef3406
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -0,0 +1,2135 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +#include "mlx4.h" +#include "mlx4_stats.h" + +#define MLX4_MAC_VALID (1ull << 63) + +#define MLX4_VLAN_VALID (1u << 31) +#define MLX4_VLAN_MASK 0xfff + +#define MLX4_STATS_TRAFFIC_COUNTERS_MASK 0xfULL +#define MLX4_STATS_TRAFFIC_DROPS_MASK 0xc0ULL +#define MLX4_STATS_ERROR_COUNTERS_MASK 0x1ffc30ULL +#define MLX4_STATS_PORT_COUNTERS_MASK 0x1fe00000ULL + +#define MLX4_FLAG2_V_IGNORE_FCS_MASK BIT(1) +#define MLX4_FLAG2_V_USER_MTU_MASK BIT(5) +#define MLX4_FLAG2_V_USER_MAC_MASK BIT(6) +#define MLX4_FLAG_V_MTU_MASK BIT(0) +#define MLX4_FLAG_V_PPRX_MASK BIT(1) +#define MLX4_FLAG_V_PPTX_MASK BIT(2) +#define MLX4_IGNORE_FCS_MASK 0x1 +#define MLX4_TC_MAX_NUMBER 8 + +void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table) +{ + int i; + + mutex_init(&table->mutex); + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + table->entries[i] = 0; + table->refs[i] = 0; + table->is_dup[i] = false; + } + table->max = 1 << dev->caps.log_num_macs; + table->total = 0; +} + +void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table) +{ + int i; + + mutex_init(&table->mutex); + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + table->entries[i] = 0; + table->refs[i] = 0; + table->is_dup[i] = false; + } + table->max = (1 << dev->caps.log_num_vlans) - MLX4_VLAN_REGULAR; + table->total = 0; +} + +void mlx4_init_roce_gid_table(struct mlx4_dev *dev, + struct mlx4_roce_gid_table *table) +{ + int i; + + mutex_init(&table->mutex); + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) + memset(table->roce_gids[i].raw, 0, MLX4_ROCE_GID_ENTRY_SIZE); +} + +static int validate_index(struct mlx4_dev *dev, + struct mlx4_mac_table *table, int index) +{ + int err = 0; + + if (index < 0 || index >= table->max || !table->entries[index]) { + mlx4_warn(dev, "No valid Mac entry for the given index\n"); + err = -EINVAL; + } + return err; +} + +static int find_index(struct mlx4_dev *dev, + struct mlx4_mac_table *table, u64 mac) +{ + int i; + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (table->refs[i] && + (MLX4_MAC_MASK & mac) == + (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) + return i; + } + /* Mac not found */ + return -EINVAL; +} + +static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port, + __be64 *entries) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE); + + in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port; + + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx) 
+{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + int i; + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (!table->refs[i]) + continue; + + if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { + *idx = i; + return 0; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL_GPL(mlx4_find_cached_mac); + +static bool mlx4_need_mf_bond(struct mlx4_dev *dev) +{ + int i, num_eth_ports = 0; + + if (!mlx4_is_mfunc(dev)) + return false; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + ++num_eth_ports; + + return (num_eth_ports == 2) ? true : false; +} + +int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + int i, err = 0; + int free = -1; + int free_for_dup = -1; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 2 : 1; + struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table; + bool need_mf_bond = mlx4_need_mf_bond(dev); + bool can_mf_bond = true; + + mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d %s duplicate\n", + (unsigned long long)mac, port, + dup ? "with" : "without"); + + if (need_mf_bond) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); + } + } else { + mutex_lock(&table->mutex); + } + + if (need_mf_bond) { + int index_at_port = -1; + int index_at_dup_port = -1; + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i])))) + index_at_port = i; + if (((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(dup_table->entries[i])))) + index_at_dup_port = i; + } + + /* check that same mac is not in the tables at different indices */ + if ((index_at_port != index_at_dup_port) && + (index_at_port >= 0) && + (index_at_dup_port >= 0)) + can_mf_bond = false; + + /* If the mac is already in the primary table, the slot must be + * available in the duplicate table as well. + */ + if (index_at_port >= 0 && index_at_dup_port < 0 && + dup_table->refs[index_at_port]) { + can_mf_bond = false; + } + /* If the mac is already in the duplicate table, check that the + * corresponding index is not occupied in the primary table, or + * the primary table already contains the mac at the same index. + * Otherwise, you cannot bond (primary contains a different mac + * at that index). 
+ */ + if (index_at_dup_port >= 0) { + if (!table->refs[index_at_dup_port] || + ((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(table->entries[index_at_dup_port])))) + free_for_dup = index_at_dup_port; + else + can_mf_bond = false; + } + } + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (!table->refs[i]) { + if (free < 0) + free = i; + if (free_for_dup < 0 && need_mf_bond && can_mf_bond) { + if (!dup_table->refs[i]) + free_for_dup = i; + } + continue; + } + + if ((MLX4_MAC_MASK & mac) == + (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { + /* MAC already registered, increment ref count */ + err = i; + ++table->refs[i]; + if (dup) { + u64 dup_mac = MLX4_MAC_MASK & be64_to_cpu(dup_table->entries[i]); + + if (dup_mac != mac || !dup_table->is_dup[i]) { + mlx4_warn(dev, "register mac: expect duplicate mac 0x%llx on port %d index %d\n", + mac, dup_port, i); + } + } + goto out; + } + } + + if (need_mf_bond && (free_for_dup < 0)) { + if (dup) { + mlx4_warn(dev, "Fail to allocate duplicate MAC table entry\n"); + mlx4_warn(dev, "High Availability for virtual functions may not work as expected\n"); + dup = false; + } + can_mf_bond = false; + } + + if (need_mf_bond && can_mf_bond) + free = free_for_dup; + + mlx4_dbg(dev, "Free MAC index is %d\n", free); + + if (table->total == table->max) { + /* No free mac entries */ + err = -ENOSPC; + goto out; + } + + /* Register new MAC */ + table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, port, table->entries); + if (unlikely(err)) { + mlx4_err(dev, "Failed adding MAC: 0x%llx\n", + (unsigned long long) mac); + table->entries[free] = 0; + goto out; + } + table->refs[free] = 1; + table->is_dup[free] = false; + ++table->total; + if (dup) { + dup_table->refs[free] = 0; + dup_table->is_dup[free] = true; + dup_table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, dup_port, dup_table->entries); + if (unlikely(err)) { + mlx4_warn(dev, "Failed adding duplicate mac: 0x%llx\n", mac); + dup_table->is_dup[free] = false; + dup_table->entries[free] = 0; + goto out; + } + ++dup_table->total; + } + err = free; +out: + if (need_mf_bond) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } + return err; +} +EXPORT_SYMBOL_GPL(__mlx4_register_mac); + +int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) +{ + u64 out_param = 0; + int err = -EINVAL; + + if (mlx4_is_mfunc(dev)) { + if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) { + err = mlx4_cmd_imm(dev, mac, &out_param, + ((u32) port) << 8 | (u32) RES_MAC, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + } + if (err && err == -EINVAL && mlx4_is_slave(dev)) { + /* retry using old REG_MAC format */ + set_param_l(&out_param, port); + err = mlx4_cmd_imm(dev, mac, &out_param, RES_MAC, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + dev->flags |= MLX4_FLAG_OLD_REG_MAC; + } + if (err) + return err; + + return get_param_l(&out_param); + } + return __mlx4_register_mac(dev, port, mac); +} +EXPORT_SYMBOL_GPL(mlx4_register_mac); + +int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port) +{ + return dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] + + (port - 1) * (1 << dev->caps.log_num_macs); +} +EXPORT_SYMBOL_GPL(mlx4_get_base_qpn); + +void __mlx4_unregister_mac(struct mlx4_dev 
*dev, u8 port, u64 mac) +{ + struct mlx4_port_info *info; + struct mlx4_mac_table *table; + int index; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 2 : 1; + struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table; + + if (port < 1 || port > dev->caps.num_ports) { + mlx4_warn(dev, "invalid port number (%d), aborting...\n", port); + return; + } + info = &mlx4_priv(dev)->port[port]; + table = &info->mac_table; + + if (dup) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); + } + } else { + mutex_lock(&table->mutex); + } + + index = find_index(dev, table, mac); + + if (validate_index(dev, table, index)) + goto out; + + if (--table->refs[index] || table->is_dup[index]) { + mlx4_dbg(dev, "Have more references for index %d, no need to modify mac table\n", + index); + if (!table->refs[index]) + dup_table->is_dup[index] = false; + goto out; + } + + table->entries[index] = 0; + if (mlx4_set_port_mac_table(dev, port, table->entries)) + mlx4_warn(dev, "Fail to set mac in port %d during unregister\n", port); + --table->total; + + if (dup) { + dup_table->is_dup[index] = false; + if (dup_table->refs[index]) + goto out; + dup_table->entries[index] = 0; + if (mlx4_set_port_mac_table(dev, dup_port, dup_table->entries)) + mlx4_warn(dev, "Fail to set mac in duplicate port %d during unregister\n", dup_port); + + --table->total; + } +out: + if (dup) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } +} +EXPORT_SYMBOL_GPL(__mlx4_unregister_mac); + +void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) +{ + u64 out_param = 0; + + if (mlx4_is_mfunc(dev)) { + if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) { + (void) mlx4_cmd_imm(dev, mac, &out_param, + ((u32) port) << 8 | (u32) RES_MAC, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + } else { + /* use old unregister mac format */ + set_param_l(&out_param, port); + (void) mlx4_cmd_imm(dev, mac, &out_param, RES_MAC, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + } + return; + } + __mlx4_unregister_mac(dev, port, mac); + return; +} +EXPORT_SYMBOL_GPL(mlx4_unregister_mac); + +int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + int index = qpn - info->base_qpn; + int err = 0; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 
2 : 1; + struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table; + + /* CX1 doesn't support multi-functions */ + if (dup) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); + } + } else { + mutex_lock(&table->mutex); + } + + err = validate_index(dev, table, index); + if (err) + goto out; + + table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, port, table->entries); + if (unlikely(err)) { + mlx4_err(dev, "Failed adding MAC: 0x%llx\n", + (unsigned long long) new_mac); + table->entries[index] = 0; + } else { + if (dup) { + dup_table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, dup_port, dup_table->entries); + if (unlikely(err)) { + mlx4_err(dev, "Failed adding duplicate MAC: 0x%llx\n", + (unsigned long long)new_mac); + dup_table->entries[index] = 0; + } + } + } +out: + if (dup) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } + return err; +} +EXPORT_SYMBOL_GPL(__mlx4_replace_mac); + +static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port, + __be32 *entries) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE); + in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + +int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx) +{ + struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; + int i; + + for (i = 0; i < MLX4_MAX_VLAN_NUM; ++i) { + if (table->refs[i] && + (vid == (MLX4_VLAN_MASK & + be32_to_cpu(table->entries[i])))) { + /* VLAN already registered, increase reference count */ + *idx = i; + return 0; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL_GPL(mlx4_find_cached_vlan); + +int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, + int *index) +{ + struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; + int i, err = 0; + int free = -1; + int free_for_dup = -1; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 2 : 1; + struct mlx4_vlan_table *dup_table = &mlx4_priv(dev)->port[dup_port].vlan_table; + bool need_mf_bond = mlx4_need_mf_bond(dev); + bool can_mf_bond = true; + + mlx4_dbg(dev, "Registering VLAN: %d for port %d %s duplicate\n", + vlan, port, + dup ? 
"with" : "without"); + + if (need_mf_bond) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); + } + } else { + mutex_lock(&table->mutex); + } + + if (table->total == table->max) { + /* No free vlan entries */ + err = -ENOSPC; + goto out; + } + + if (need_mf_bond) { + int index_at_port = -1; + int index_at_dup_port = -1; + + for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) { + if ((vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))) + index_at_port = i; + if ((vlan == (MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[i])))) + index_at_dup_port = i; + } + /* check that same vlan is not in the tables at different indices */ + if ((index_at_port != index_at_dup_port) && + (index_at_port >= 0) && + (index_at_dup_port >= 0)) + can_mf_bond = false; + + /* If the vlan is already in the primary table, the slot must be + * available in the duplicate table as well. + */ + if (index_at_port >= 0 && index_at_dup_port < 0 && + dup_table->refs[index_at_port]) { + can_mf_bond = false; + } + /* If the vlan is already in the duplicate table, check that the + * corresponding index is not occupied in the primary table, or + * the primary table already contains the vlan at the same index. + * Otherwise, you cannot bond (primary contains a different vlan + * at that index). + */ + if (index_at_dup_port >= 0) { + if (!table->refs[index_at_dup_port] || + (vlan == (MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[index_at_dup_port])))) + free_for_dup = index_at_dup_port; + else + can_mf_bond = false; + } + } + + for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) { + if (!table->refs[i]) { + if (free < 0) + free = i; + if (free_for_dup < 0 && need_mf_bond && can_mf_bond) { + if (!dup_table->refs[i]) + free_for_dup = i; + } + } + + if ((table->refs[i] || table->is_dup[i]) && + (vlan == (MLX4_VLAN_MASK & + be32_to_cpu(table->entries[i])))) { + /* Vlan already registered, increase references count */ + mlx4_dbg(dev, "vlan %u is already registered.\n", vlan); + *index = i; + ++table->refs[i]; + if (dup) { + u16 dup_vlan = MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[i]); + + if (dup_vlan != vlan || !dup_table->is_dup[i]) { + mlx4_warn(dev, "register vlan: expected duplicate vlan %u on port %d index %d\n", + vlan, dup_port, i); + } + } + goto out; + } + } + + if (need_mf_bond && (free_for_dup < 0)) { + if (dup) { + mlx4_warn(dev, "Fail to allocate duplicate VLAN table entry\n"); + mlx4_warn(dev, "High Availability for virtual functions may not work as expected\n"); + dup = false; + } + can_mf_bond = false; + } + + if (need_mf_bond && can_mf_bond) + free = free_for_dup; + + if (free < 0) { + err = -ENOMEM; + goto out; + } + + /* Register new VLAN */ + table->refs[free] = 1; + table->is_dup[free] = false; + table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID); + + err = mlx4_set_port_vlan_table(dev, port, table->entries); + if (unlikely(err)) { + mlx4_warn(dev, "Failed adding vlan: %u\n", vlan); + table->refs[free] = 0; + table->entries[free] = 0; + goto out; + } + ++table->total; + if (dup) { + dup_table->refs[free] = 0; + dup_table->is_dup[free] = true; + dup_table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID); + + err = mlx4_set_port_vlan_table(dev, dup_port, dup_table->entries); + if (unlikely(err)) { + mlx4_warn(dev, "Failed adding duplicate vlan: %u\n", vlan); + dup_table->is_dup[free] = false; + dup_table->entries[free] = 
0; + goto out; + } + ++dup_table->total; + } + + *index = free; +out: + if (need_mf_bond) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } + return err; +} + +int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) +{ + u64 out_param = 0; + int err; + + if (vlan > 4095) + return -EINVAL; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, vlan, &out_param, + ((u32) port) << 8 | (u32) RES_VLAN, + RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + *index = get_param_l(&out_param); + + return err; + } + return __mlx4_register_vlan(dev, port, vlan, index); +} +EXPORT_SYMBOL_GPL(mlx4_register_vlan); + +void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) +{ + struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; + int index; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 2 : 1; + struct mlx4_vlan_table *dup_table = &mlx4_priv(dev)->port[dup_port].vlan_table; + + if (dup) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); + } + } else { + mutex_lock(&table->mutex); + } + + if (mlx4_find_cached_vlan(dev, port, vlan, &index)) { + mlx4_warn(dev, "vlan 0x%x is not in the vlan table\n", vlan); + goto out; + } + + if (index < MLX4_VLAN_REGULAR) { + mlx4_warn(dev, "Trying to free special vlan index %d\n", index); + goto out; + } + + if (--table->refs[index] || table->is_dup[index]) { + mlx4_dbg(dev, "Have %d more references for index %d, no need to modify vlan table\n", + table->refs[index], index); + if (!table->refs[index]) + dup_table->is_dup[index] = false; + goto out; + } + table->entries[index] = 0; + if (mlx4_set_port_vlan_table(dev, port, table->entries)) + mlx4_warn(dev, "Fail to set vlan in port %d during unregister\n", port); + --table->total; + if (dup) { + dup_table->is_dup[index] = false; + if (dup_table->refs[index]) + goto out; + dup_table->entries[index] = 0; + if (mlx4_set_port_vlan_table(dev, dup_port, dup_table->entries)) + mlx4_warn(dev, "Fail to set vlan in duplicate port %d during unregister\n", dup_port); + --dup_table->total; + } +out: + if (dup) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } +} + +void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) +{ + u64 out_param = 0; + + if (mlx4_is_mfunc(dev)) { + (void) mlx4_cmd_imm(dev, vlan, &out_param, + ((u32) port) << 8 | (u32) RES_VLAN, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + return; + } + __mlx4_unregister_vlan(dev, port, vlan); +} +EXPORT_SYMBOL_GPL(mlx4_unregister_vlan); + +int mlx4_bond_mac_table(struct mlx4_dev *dev) +{ + struct mlx4_mac_table *t1 = &mlx4_priv(dev)->port[1].mac_table; + struct mlx4_mac_table *t2 = &mlx4_priv(dev)->port[2].mac_table; + int ret = 0; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if ((t1->entries[i] != t2->entries[i]) && + t1->entries[i] && t2->entries[i]) { + mlx4_warn(dev, "can't duplicate entry %d in mac 
table\n", i); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (t1->entries[i] && !t2->entries[i]) { + t2->entries[i] = t1->entries[i]; + t2->is_dup[i] = true; + update2 = true; + } else if (!t1->entries[i] && t2->entries[i]) { + t1->entries[i] = t2->entries[i]; + t1->is_dup[i] = true; + update1 = true; + } else if (t1->entries[i] && t2->entries[i]) { + t1->is_dup[i] = true; + t2->is_dup[i] = true; + } + } + + if (update1) { + ret = mlx4_set_port_mac_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to set MAC table for port 1 (%d)\n", ret); + } + if (!ret && update2) { + ret = mlx4_set_port_mac_table(dev, 2, t2->entries); + if (ret) + mlx4_warn(dev, "failed to set MAC table for port 2 (%d)\n", ret); + } + + if (ret) + mlx4_warn(dev, "failed to create mirror MAC tables\n"); +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return ret; +} + +int mlx4_unbond_mac_table(struct mlx4_dev *dev) +{ + struct mlx4_mac_table *t1 = &mlx4_priv(dev)->port[1].mac_table; + struct mlx4_mac_table *t2 = &mlx4_priv(dev)->port[2].mac_table; + int ret = 0; + int ret1; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (t1->entries[i] != t2->entries[i]) { + mlx4_warn(dev, "mac table is in an unexpected state when trying to unbond\n"); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (!t1->entries[i]) + continue; + t1->is_dup[i] = false; + if (!t1->refs[i]) { + t1->entries[i] = 0; + update1 = true; + } + t2->is_dup[i] = false; + if (!t2->refs[i]) { + t2->entries[i] = 0; + update2 = true; + } + } + + if (update1) { + ret = mlx4_set_port_mac_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to unmirror MAC tables for port 1(%d)\n", ret); + } + if (update2) { + ret1 = mlx4_set_port_mac_table(dev, 2, t2->entries); + if (ret1) { + mlx4_warn(dev, "failed to unmirror MAC tables for port 2(%d)\n", ret1); + ret = ret1; + } + } +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return ret; +} + +int mlx4_bond_vlan_table(struct mlx4_dev *dev) +{ + struct mlx4_vlan_table *t1 = &mlx4_priv(dev)->port[1].vlan_table; + struct mlx4_vlan_table *t2 = &mlx4_priv(dev)->port[2].vlan_table; + int ret = 0; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if ((t1->entries[i] != t2->entries[i]) && + t1->entries[i] && t2->entries[i]) { + mlx4_warn(dev, "can't duplicate entry %d in vlan table\n", i); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if (t1->entries[i] && !t2->entries[i]) { + t2->entries[i] = t1->entries[i]; + t2->is_dup[i] = true; + update2 = true; + } else if (!t1->entries[i] && t2->entries[i]) { + t1->entries[i] = t2->entries[i]; + t1->is_dup[i] = true; + update1 = true; + } else if (t1->entries[i] && t2->entries[i]) { + t1->is_dup[i] = true; + t2->is_dup[i] = true; + } + } + + if (update1) { + ret = mlx4_set_port_vlan_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to set VLAN table for port 1 (%d)\n", ret); + } + if (!ret && update2) { + ret = mlx4_set_port_vlan_table(dev, 2, t2->entries); + if (ret) + mlx4_warn(dev, "failed to set VLAN table for port 2 (%d)\n", ret); + } + + if (ret) + mlx4_warn(dev, "failed to create mirror VLAN tables\n"); +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return 
ret; +} + +int mlx4_unbond_vlan_table(struct mlx4_dev *dev) +{ + struct mlx4_vlan_table *t1 = &mlx4_priv(dev)->port[1].vlan_table; + struct mlx4_vlan_table *t2 = &mlx4_priv(dev)->port[2].vlan_table; + int ret = 0; + int ret1; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if (t1->entries[i] != t2->entries[i]) { + mlx4_warn(dev, "vlan table is in an unexpected state when trying to unbond\n"); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if (!t1->entries[i]) + continue; + t1->is_dup[i] = false; + if (!t1->refs[i]) { + t1->entries[i] = 0; + update1 = true; + } + t2->is_dup[i] = false; + if (!t2->refs[i]) { + t2->entries[i] = 0; + update2 = true; + } + } + + if (update1) { + ret = mlx4_set_port_vlan_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to unmirror VLAN tables for port 1(%d)\n", ret); + } + if (update2) { + ret1 = mlx4_set_port_vlan_table(dev, 2, t2->entries); + if (ret1) { + mlx4_warn(dev, "failed to unmirror VLAN tables for port 2(%d)\n", ret1); + ret = ret1; + } + } +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return ret; +} + +int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) +{ + struct mlx4_cmd_mailbox *inmailbox, *outmailbox; + u8 *inbuf, *outbuf; + int err; + + inmailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + + outmailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outmailbox)) { + mlx4_free_cmd_mailbox(dev, inmailbox); + return PTR_ERR(outmailbox); + } + + inbuf = inmailbox->buf; + outbuf = outmailbox->buf; + inbuf[0] = 1; + inbuf[1] = 1; + inbuf[2] = 1; + inbuf[3] = 1; + *(__be16 *) (&inbuf[16]) = cpu_to_be16(0x0015); + *(__be32 *) (&inbuf[20]) = cpu_to_be32(port); + + err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (!err) + *caps = *(__be32 *) (outbuf + 84); + mlx4_free_cmd_mailbox(dev, inmailbox); + mlx4_free_cmd_mailbox(dev, outmailbox); + return err; +} +static struct mlx4_roce_gid_entry zgid_entry; + +int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port) +{ + int vfs; + int slave_gid = slave; + unsigned i; + struct mlx4_slaves_pport slaves_pport; + struct mlx4_active_ports actv_ports; + unsigned max_port_p_one; + + if (slave == 0) + return MLX4_ROCE_PF_GIDS; + + /* Slave is a VF */ + slaves_pport = mlx4_phys_to_slaves_pport(dev, port); + actv_ports = mlx4_get_active_ports(dev, slave); + max_port_p_one = find_first_bit(actv_ports.ports, dev->caps.num_ports) + + bitmap_weight(actv_ports.ports, dev->caps.num_ports) + 1; + + for (i = 1; i < max_port_p_one; i++) { + struct mlx4_active_ports exclusive_ports; + struct mlx4_slaves_pport slaves_pport_actv; + bitmap_zero(exclusive_ports.ports, dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + if (i == port) + continue; + slaves_pport_actv = mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + slave_gid -= bitmap_weight(slaves_pport_actv.slaves, + dev->persist->num_vfs + 1); + } + vfs = bitmap_weight(slaves_pport.slaves, dev->persist->num_vfs + 1) - 1; + if (slave_gid <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % vfs)) + return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs) + 1; + return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs; +} + +int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port) +{ + int gids; + unsigned i; + int 
slave_gid = slave; + int vfs; + + struct mlx4_slaves_pport slaves_pport; + struct mlx4_active_ports actv_ports; + unsigned max_port_p_one; + + if (slave == 0) + return 0; + + slaves_pport = mlx4_phys_to_slaves_pport(dev, port); + actv_ports = mlx4_get_active_ports(dev, slave); + max_port_p_one = find_first_bit(actv_ports.ports, dev->caps.num_ports) + + bitmap_weight(actv_ports.ports, dev->caps.num_ports) + 1; + + for (i = 1; i < max_port_p_one; i++) { + struct mlx4_active_ports exclusive_ports; + struct mlx4_slaves_pport slaves_pport_actv; + bitmap_zero(exclusive_ports.ports, dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + if (i == port) + continue; + slaves_pport_actv = mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + slave_gid -= bitmap_weight(slaves_pport_actv.slaves, + dev->persist->num_vfs + 1); + } + gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + vfs = bitmap_weight(slaves_pport.slaves, dev->persist->num_vfs + 1) - 1; + if (slave_gid <= gids % vfs) + return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave_gid - 1); + + return MLX4_ROCE_PF_GIDS + (gids % vfs) + + ((gids / vfs) * (slave_gid - 1)); +} +EXPORT_SYMBOL_GPL(mlx4_get_base_gid_ix); + +static int mlx4_reset_roce_port_gids(struct mlx4_dev *dev, int slave, + int port, struct mlx4_cmd_mailbox *mailbox) +{ + struct mlx4_roce_gid_entry *gid_entry_mbox; + struct mlx4_priv *priv = mlx4_priv(dev); + int num_gids, base, offset; + int i, err; + + num_gids = mlx4_get_slave_num_gids(dev, slave, port); + base = mlx4_get_base_gid_ix(dev, slave, port); + + memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE); + + mutex_lock(&(priv->port[port].gid_table.mutex)); + /* Zero-out gids belonging to that slave in the port GID table */ + for (i = 0, offset = base; i < num_gids; offset++, i++) + memcpy(priv->port[port].gid_table.roce_gids[offset].raw, + zgid_entry.raw, MLX4_ROCE_GID_ENTRY_SIZE); + + /* Now, copy roce port gids table to mailbox for passing to FW */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *)mailbox->buf; + for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++) + memcpy(gid_entry_mbox->raw, + priv->port[port].gid_table.roce_gids[i].raw, + MLX4_ROCE_GID_ENTRY_SIZE); + + err = mlx4_cmd(dev, mailbox->dma, + ((u32)port) | (MLX4_SET_PORT_GID_TABLE << 8), + MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + mutex_unlock(&(priv->port[port].gid_table.mutex)); + return err; +} + + +void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave) +{ + struct mlx4_active_ports actv_ports; + struct mlx4_cmd_mailbox *mailbox; + int num_eth_ports, err; + int i; + + if (slave < 0 || slave > dev->persist->num_vfs) + return; + + actv_ports = mlx4_get_active_ports(dev, slave); + + for (i = 0, num_eth_ports = 0; i < dev->caps.num_ports; i++) { + if (test_bit(i, actv_ports.ports)) { + if (dev->caps.port_type[i + 1] != MLX4_PORT_TYPE_ETH) + continue; + num_eth_ports++; + } + } + + if (!num_eth_ports) + return; + + /* have ETH ports. 
Alloc mailbox for SET_PORT command */ + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return; + + for (i = 0; i < dev->caps.num_ports; i++) { + if (test_bit(i, actv_ports.ports)) { + if (dev->caps.port_type[i + 1] != MLX4_PORT_TYPE_ETH) + continue; + err = mlx4_reset_roce_port_gids(dev, slave, i + 1, mailbox); + if (err) + mlx4_warn(dev, "Could not reset ETH port GID table for slave %d, port %d (%d)\n", + slave, i + 1, err); + } + } + + mlx4_free_cmd_mailbox(dev, mailbox); + return; +} + +static void +mlx4_en_set_port_mtu(struct mlx4_dev *dev, int slave, int port, + struct mlx4_set_port_general_context *gen_context) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master; + struct mlx4_slave_state *slave_st = &master->slave_state[slave]; + u16 mtu, prev_mtu; + + /* Mtu is configured as the max USER_MTU among all + * the functions on the port. + */ + mtu = be16_to_cpu(gen_context->mtu); + mtu = min_t(int, mtu, dev->caps.eth_mtu_cap[port] + + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN); + prev_mtu = slave_st->mtu[port]; + slave_st->mtu[port] = mtu; + if (mtu > master->max_mtu[port]) + master->max_mtu[port] = mtu; + if (mtu < prev_mtu && prev_mtu == master->max_mtu[port]) { + int i; + + slave_st->mtu[port] = mtu; + master->max_mtu[port] = mtu; + for (i = 0; i < dev->num_slaves; i++) + master->max_mtu[port] = + max_t(u16, master->max_mtu[port], + master->slave_state[i].mtu[port]); + } + gen_context->mtu = cpu_to_be16(master->max_mtu[port]); +} + +static void +mlx4_en_set_port_user_mtu(struct mlx4_dev *dev, int slave, int port, + struct mlx4_set_port_general_context *gen_context) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master; + struct mlx4_slave_state *slave_st = &master->slave_state[slave]; + u16 user_mtu, prev_user_mtu; + + /* User Mtu is configured as the max USER_MTU among all + * the functions on the port. 
+ */ + user_mtu = be16_to_cpu(gen_context->user_mtu); + user_mtu = min_t(int, user_mtu, dev->caps.eth_mtu_cap[port]); + prev_user_mtu = slave_st->user_mtu[port]; + slave_st->user_mtu[port] = user_mtu; + if (user_mtu > master->max_user_mtu[port]) + master->max_user_mtu[port] = user_mtu; + if (user_mtu < prev_user_mtu && + prev_user_mtu == master->max_user_mtu[port]) { + int i; + + slave_st->user_mtu[port] = user_mtu; + master->max_user_mtu[port] = user_mtu; + for (i = 0; i < dev->num_slaves; i++) + master->max_user_mtu[port] = + max_t(u16, master->max_user_mtu[port], + master->slave_state[i].user_mtu[port]); + } + gen_context->user_mtu = cpu_to_be16(master->max_user_mtu[port]); +} + +static void +mlx4_en_set_port_global_pause(struct mlx4_dev *dev, int slave, + struct mlx4_set_port_general_context *gen_context) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master; + + /* Slave cannot change Global Pause configuration */ + if (slave != mlx4_master_func_num(dev) && + (gen_context->pptx != master->pptx || + gen_context->pprx != master->pprx)) { + gen_context->pptx = master->pptx; + gen_context->pprx = master->pprx; + mlx4_warn(dev, "denying Global Pause change for slave:%d\n", + slave); + } else { + master->pptx = gen_context->pptx; + master->pprx = gen_context->pprx; + } +} + +static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, + u8 op_mod, struct mlx4_cmd_mailbox *inbox) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_port_info *port_info; + struct mlx4_set_port_rqp_calc_context *qpn_context; + struct mlx4_set_port_general_context *gen_context; + struct mlx4_roce_gid_entry *gid_entry_tbl, *gid_entry_mbox, *gid_entry_mb1; + int reset_qkey_viols; + int port; + int is_eth; + int num_gids; + int base; + u32 in_modifier; + u32 promisc; + int err; + int i, j; + int offset; + __be32 agg_cap_mask; + __be32 slave_cap_mask; + __be32 new_cap_mask; + + port = in_mod & 0xff; + in_modifier = in_mod >> 8; + is_eth = op_mod; + port_info = &priv->port[port]; + + /* Slaves cannot perform SET_PORT operations, + * except for changing MTU and USER_MTU. + */ + if (is_eth) { + if (slave != dev->caps.function && + in_modifier != MLX4_SET_PORT_GENERAL && + in_modifier != MLX4_SET_PORT_GID_TABLE) { + mlx4_warn(dev, "denying SET_PORT for slave:%d\n", + slave); + return -EINVAL; + } + switch (in_modifier) { + case MLX4_SET_PORT_RQP_CALC: + qpn_context = inbox->buf; + qpn_context->base_qpn = + cpu_to_be32(port_info->base_qpn); + qpn_context->n_mac = 0x7; + promisc = be32_to_cpu(qpn_context->promisc) >> + SET_PORT_PROMISC_SHIFT; + qpn_context->promisc = cpu_to_be32( + promisc << SET_PORT_PROMISC_SHIFT | + port_info->base_qpn); + promisc = be32_to_cpu(qpn_context->mcast) >> + SET_PORT_MC_PROMISC_SHIFT; + qpn_context->mcast = cpu_to_be32( + promisc << SET_PORT_MC_PROMISC_SHIFT | + port_info->base_qpn); + break; + case MLX4_SET_PORT_GENERAL: + gen_context = inbox->buf; + + if (gen_context->flags & MLX4_FLAG_V_MTU_MASK) + mlx4_en_set_port_mtu(dev, slave, port, + gen_context); + + if (gen_context->flags2 & MLX4_FLAG2_V_USER_MTU_MASK) + mlx4_en_set_port_user_mtu(dev, slave, port, + gen_context); + + if (gen_context->flags & + (MLX4_FLAG_V_PPRX_MASK | MLX4_FLAG_V_PPTX_MASK)) + mlx4_en_set_port_global_pause(dev, slave, + gen_context); + + break; + case MLX4_SET_PORT_GID_TABLE: + /* change to MULTIPLE entries: number of guest's gids + * need a FOR-loop here over number of gids the guest has. + * 1. 
Check no duplicates in gids passed by slave + */ + num_gids = mlx4_get_slave_num_gids(dev, slave, port); + base = mlx4_get_base_gid_ix(dev, slave, port); + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (i = 0; i < num_gids; gid_entry_mbox++, i++) { + if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, + sizeof(zgid_entry))) + continue; + gid_entry_mb1 = gid_entry_mbox + 1; + for (j = i + 1; j < num_gids; gid_entry_mb1++, j++) { + if (!memcmp(gid_entry_mb1->raw, + zgid_entry.raw, sizeof(zgid_entry))) + continue; + if (!memcmp(gid_entry_mb1->raw, gid_entry_mbox->raw, + sizeof(gid_entry_mbox->raw))) { + /* found duplicate */ + return -EINVAL; + } + } + } + + /* 2. Check that do not have duplicates in OTHER + * entries in the port GID table + */ + + mutex_lock(&(priv->port[port].gid_table.mutex)); + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { + if (i >= base && i < base + num_gids) + continue; /* don't compare to slave's current gids */ + gid_entry_tbl = &priv->port[port].gid_table.roce_gids[i]; + if (!memcmp(gid_entry_tbl->raw, zgid_entry.raw, sizeof(zgid_entry))) + continue; + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (j = 0; j < num_gids; gid_entry_mbox++, j++) { + if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, + sizeof(zgid_entry))) + continue; + if (!memcmp(gid_entry_mbox->raw, gid_entry_tbl->raw, + sizeof(gid_entry_tbl->raw))) { + /* found duplicate */ + mlx4_warn(dev, "requested gid entry for slave:%d is a duplicate of gid at index %d\n", + slave, i); + mutex_unlock(&(priv->port[port].gid_table.mutex)); + return -EINVAL; + } + } + } + + /* insert slave GIDs with memcpy, starting at slave's base index */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (i = 0, offset = base; i < num_gids; gid_entry_mbox++, offset++, i++) + memcpy(priv->port[port].gid_table.roce_gids[offset].raw, + gid_entry_mbox->raw, MLX4_ROCE_GID_ENTRY_SIZE); + + /* Now, copy roce port gids table to current mailbox for passing to FW */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++) + memcpy(gid_entry_mbox->raw, + priv->port[port].gid_table.roce_gids[i].raw, + MLX4_ROCE_GID_ENTRY_SIZE); + + err = mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + mutex_unlock(&(priv->port[port].gid_table.mutex)); + return err; + } + + return mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + } + + /* Slaves are not allowed to SET_PORT beacon (LED) blink */ + if (op_mod == MLX4_SET_PORT_BEACON_OPCODE) { + mlx4_warn(dev, "denying SET_PORT Beacon slave:%d\n", slave); + return -EPERM; + } + + /* For IB, we only consider: + * - The capability mask, which is set to the aggregate of all + * slave function capabilities + * - The QKey violatin counter - reset according to each request. 
+ */ + + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + reset_qkey_viols = (*(u8 *) inbox->buf) & 0x40; + new_cap_mask = ((__be32 *) inbox->buf)[2]; + } else { + reset_qkey_viols = ((u8 *) inbox->buf)[3] & 0x1; + new_cap_mask = ((__be32 *) inbox->buf)[1]; + } + + /* slave may not set the IS_SM capability for the port */ + if (slave != mlx4_master_func_num(dev) && + (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_IS_SM)) + return -EINVAL; + + /* No DEV_MGMT in multifunc mode */ + if (mlx4_is_mfunc(dev) && + (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_DEV_MGMT_SUP)) + return -EINVAL; + + agg_cap_mask = 0; + slave_cap_mask = + priv->mfunc.master.slave_state[slave].ib_cap_mask[port]; + priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = new_cap_mask; + for (i = 0; i < dev->num_slaves; i++) + agg_cap_mask |= + priv->mfunc.master.slave_state[i].ib_cap_mask[port]; + + /* only clear mailbox for guests. Master may be setting + * MTU or PKEY table size + */ + if (slave != dev->caps.function) + memset(inbox->buf, 0, 256); + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + *(u8 *) inbox->buf |= !!reset_qkey_viols << 6; + ((__be32 *) inbox->buf)[2] = agg_cap_mask; + } else { + ((u8 *) inbox->buf)[3] |= !!reset_qkey_viols; + ((__be32 *) inbox->buf)[1] = agg_cap_mask; + } + + err = mlx4_cmd(dev, inbox->dma, port, is_eth, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) + priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = + slave_cap_mask; + return err; +} + +int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int port = mlx4_slave_convert_port( + dev, slave, vhcr->in_modifier & 0xFF); + + if (port < 0) + return -EINVAL; + + vhcr->in_modifier = (vhcr->in_modifier & ~0xFF) | + (port & 0xFF); + + return mlx4_common_set_port(dev, slave, vhcr->in_modifier, + vhcr->op_modifier, inbox); +} + +/* bit locations for set port command with zero op modifier */ +enum { + MLX4_SET_PORT_VL_CAP = 4, /* bits 7:4 */ + MLX4_SET_PORT_MTU_CAP = 12, /* bits 15:12 */ + MLX4_CHANGE_PORT_PKEY_TBL_SZ = 20, + MLX4_CHANGE_PORT_VL_CAP = 21, + MLX4_CHANGE_PORT_MTU_CAP = 22, +}; + +int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz) +{ + struct mlx4_cmd_mailbox *mailbox; + int err, vl_cap, pkey_tbl_flag = 0; + + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port]; + + if (pkey_tbl_sz >= 0 && mlx4_is_master(dev)) { + pkey_tbl_flag = 1; + ((__be16 *) mailbox->buf)[20] = cpu_to_be16(pkey_tbl_sz); + } + + /* IB VL CAP enum isn't used by the firmware, just numerical values */ + for (vl_cap = 8; vl_cap >= 1; vl_cap >>= 1) { + ((__be32 *) mailbox->buf)[0] = cpu_to_be32( + (1 << MLX4_CHANGE_PORT_MTU_CAP) | + (1 << MLX4_CHANGE_PORT_VL_CAP) | + (pkey_tbl_flag << MLX4_CHANGE_PORT_PKEY_TBL_SZ) | + (dev->caps.port_ib_mtu[port] << MLX4_SET_PORT_MTU_CAP) | + (vl_cap << MLX4_SET_PORT_VL_CAP)); + err = mlx4_cmd(dev, mailbox->dma, port, + MLX4_SET_PORT_IB_OPCODE, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); + if (err != -ENOMEM) + break; + } + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +#define SET_PORT_ROCE_2_FLAGS 0x10 +#define MLX4_SET_PORT_ROCE_V1_V2 0x2 +int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, + u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx) +{ + struct 
mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + int err; + u32 in_mod; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + context->flags = SET_PORT_GEN_ALL_VALID; + context->mtu = cpu_to_be16(mtu); + context->pptx = (pptx * (!pfctx)) << 7; + context->pfctx = pfctx; + context->pprx = (pprx * (!pfcrx)) << 7; + context->pfcrx = pfcrx; + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + context->flags |= SET_PORT_ROCE_2_FLAGS; + context->roce_mode |= + MLX4_SET_PORT_ROCE_V1_V2 << 4; + } + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_general); + +int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, + u8 promisc) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_rqp_calc_context *context; + int err; + u32 in_mod; + u32 m_promisc = (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) ? + MCAST_DIRECT : MCAST_DEFAULT; + + if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + context->base_qpn = cpu_to_be32(base_qpn); + context->n_mac = dev->caps.log_num_macs; + context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT | + base_qpn); + context->mcast = cpu_to_be32(m_promisc << SET_PORT_MC_PROMISC_SHIFT | + base_qpn); + context->intra_no_vlan = 0; + context->no_vlan = MLX4_NO_VLAN_IDX; + context->intra_vlan_miss = 0; + context->vlan_miss = MLX4_VLAN_MISS_IDX; + + in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_qpn_calc); + +int mlx4_SET_PORT_user_mtu(struct mlx4_dev *dev, u8 port, u16 user_mtu) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + context->flags2 |= MLX4_FLAG2_V_USER_MTU_MASK; + context->user_mtu = cpu_to_be16(user_mtu); + + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_user_mtu); + +int mlx4_SET_PORT_user_mac(struct mlx4_dev *dev, u8 port, u8 *user_mac) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + context->flags2 |= MLX4_FLAG2_V_USER_MAC_MASK; + memcpy(context->user_mac, user_mac, sizeof(context->user_mac)); + + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_user_mac); + +int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, u8 ignore_fcs_value) +{ + struct mlx4_cmd_mailbox 
*mailbox; + struct mlx4_set_port_general_context *context; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + context->flags2 |= MLX4_FLAG2_V_IGNORE_FCS_MASK; + if (ignore_fcs_value) + context->ignore_fcs |= MLX4_IGNORE_FCS_MASK; + else + context->ignore_fcs &= ~MLX4_IGNORE_FCS_MASK; + + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_fcs_check); + +enum { + VXLAN_ENABLE_MODIFY = 1 << 7, + VXLAN_STEERING_MODIFY = 1 << 6, + + VXLAN_ENABLE = 1 << 7, +}; + +struct mlx4_set_port_vxlan_context { + u32 reserved1; + u8 modify_flags; + u8 reserved2; + u8 enable_flags; + u8 steering; +}; + +int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable) +{ + int err; + u32 in_mod; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_vxlan_context *context; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + memset(context, 0, sizeof(*context)); + + context->modify_flags = VXLAN_ENABLE_MODIFY | VXLAN_STEERING_MODIFY; + if (enable) + context->enable_flags = VXLAN_ENABLE; + context->steering = steering; + + in_mod = MLX4_SET_PORT_VXLAN << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_VXLAN); + +int mlx4_SET_PORT_BEACON(struct mlx4_dev *dev, u8 port, u16 time) +{ + int err; + struct mlx4_cmd_mailbox *mailbox; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + *((__be32 *)mailbox->buf) = cpu_to_be32(time); + + err = mlx4_cmd(dev, mailbox->dma, port, MLX4_SET_PORT_BEACON_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_BEACON); + +int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = 0; + + return err; +} + +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, + u64 mac, u64 clear, u8 mode) +{ + return mlx4_cmd(dev, (mac | (clear << 63)), port, mode, + MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); +} +EXPORT_SYMBOL(mlx4_SET_MCAST_FLTR); + +int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = 0; + + return err; +} + +int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return 0; +} + +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, + int *slave_id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, found_ix = -1; + int vf_gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + struct mlx4_slaves_pport slaves_pport; + unsigned num_vfs; + int slave_gid; + + if (!mlx4_is_mfunc(dev)) + return -EINVAL; + + slaves_pport = mlx4_phys_to_slaves_pport(dev, port); + num_vfs = bitmap_weight(slaves_pport.slaves, + 
dev->persist->num_vfs + 1) - 1; + + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { + if (!memcmp(priv->port[port].gid_table.roce_gids[i].raw, gid, + MLX4_ROCE_GID_ENTRY_SIZE)) { + found_ix = i; + break; + } + } + + if (found_ix >= 0) { + /* Calculate a slave_gid which is the slave number in the gid + * table and not a globally unique slave number. + */ + if (found_ix < MLX4_ROCE_PF_GIDS) + slave_gid = 0; + else if (found_ix < MLX4_ROCE_PF_GIDS + (vf_gids % num_vfs) * + (vf_gids / num_vfs + 1)) + slave_gid = ((found_ix - MLX4_ROCE_PF_GIDS) / + (vf_gids / num_vfs + 1)) + 1; + else + slave_gid = + ((found_ix - MLX4_ROCE_PF_GIDS - + ((vf_gids % num_vfs) * ((vf_gids / num_vfs + 1)))) / + (vf_gids / num_vfs)) + vf_gids % num_vfs + 1; + + /* Calculate the globally unique slave id */ + if (slave_gid) { + struct mlx4_active_ports exclusive_ports; + struct mlx4_active_ports actv_ports; + struct mlx4_slaves_pport slaves_pport_actv; + unsigned max_port_p_one; + int num_vfs_before = 0; + int candidate_slave_gid; + + /* Calculate how many VFs are on the previous port, if exists */ + for (i = 1; i < port; i++) { + bitmap_zero(exclusive_ports.ports, dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + slaves_pport_actv = + mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + num_vfs_before += bitmap_weight( + slaves_pport_actv.slaves, + dev->persist->num_vfs + 1); + } + + /* candidate_slave_gid isn't necessarily the correct slave, but + * it has the same number of ports and is assigned to the same + * ports as the real slave we're looking for. On dual port VF, + * slave_gid = [single port VFs on port ] + + * [offset of the current slave from the first dual port VF] + + * 1 (for the PF). + */ + candidate_slave_gid = slave_gid + num_vfs_before; + + actv_ports = mlx4_get_active_ports(dev, candidate_slave_gid); + max_port_p_one = find_first_bit( + actv_ports.ports, dev->caps.num_ports) + + bitmap_weight(actv_ports.ports, + dev->caps.num_ports) + 1; + + /* Calculate the real slave number */ + for (i = 1; i < max_port_p_one; i++) { + if (i == port) + continue; + bitmap_zero(exclusive_ports.ports, + dev->caps.num_ports); + set_bit(i - 1, exclusive_ports.ports); + slaves_pport_actv = + mlx4_phys_to_slaves_pport_actv( + dev, &exclusive_ports); + slave_gid += bitmap_weight( + slaves_pport_actv.slaves, + dev->persist->num_vfs + 1); + } + } + *slave_id = slave_gid; + } + + return (found_ix >= 0) ? 
0 : -EINVAL; +} +EXPORT_SYMBOL(mlx4_get_slave_from_roce_gid); + +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, + u8 *gid) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!mlx4_is_master(dev)) + return -EINVAL; + + memcpy(gid, priv->port[port].gid_table.roce_gids[slave_id].raw, + MLX4_ROCE_GID_ENTRY_SIZE); + return 0; +} +EXPORT_SYMBOL(mlx4_get_roce_gid_from_slave); + +/* Cable Module Info */ +#define MODULE_INFO_MAX_READ 48 + +#define I2C_ADDR_LOW 0x50 +#define I2C_ADDR_HIGH 0x51 +#define I2C_PAGE_SIZE 256 + +/* Module Info Data */ +struct mlx4_cable_info { + u8 i2c_addr; + u8 page_num; + __be16 dev_mem_address; + __be16 reserved1; + __be16 size; + __be32 reserved2[2]; + u8 data[MODULE_INFO_MAX_READ]; +}; + +enum cable_info_err { + CABLE_INF_INV_PORT = 0x1, + CABLE_INF_OP_NOSUP = 0x2, + CABLE_INF_NOT_CONN = 0x3, + CABLE_INF_NO_EEPRM = 0x4, + CABLE_INF_PAGE_ERR = 0x5, + CABLE_INF_INV_ADDR = 0x6, + CABLE_INF_I2C_ADDR = 0x7, + CABLE_INF_QSFP_VIO = 0x8, + CABLE_INF_I2C_BUSY = 0x9, +}; + +#define MAD_STATUS_2_CABLE_ERR(mad_status) ((mad_status >> 8) & 0xFF) + +static inline const char *cable_info_mad_err_str(u16 mad_status) +{ + u8 err = MAD_STATUS_2_CABLE_ERR(mad_status); + + switch (err) { + case CABLE_INF_INV_PORT: + return "invalid port selected"; + case CABLE_INF_OP_NOSUP: + return "operation not supported for this port (the port is of type CX4 or internal)"; + case CABLE_INF_NOT_CONN: + return "cable is not connected"; + case CABLE_INF_NO_EEPRM: + return "the connected cable has no EPROM (passive copper cable)"; + case CABLE_INF_PAGE_ERR: + return "page number is greater than 15"; + case CABLE_INF_INV_ADDR: + return "invalid device_address or size (that is, size equals 0 or address+size is greater than 256)"; + case CABLE_INF_I2C_ADDR: + return "invalid I2C slave address"; + case CABLE_INF_QSFP_VIO: + return "at least one cable violates the QSFP specification and ignores the modsel signal"; + case CABLE_INF_I2C_BUSY: + return "I2C bus is constantly busy"; + } + return "Unknown Error"; +} + +/** + * mlx4_get_module_info - Read cable module eeprom data + * @dev: mlx4_dev. + * @port: port number. + * @offset: byte offset in eeprom to start reading data from. + * @size: num of bytes to read. + * @data: output buffer to put the requested data into. + * + * Reads cable module eeprom data, puts the outcome data into + * data pointer paramer. + * Returns num of read bytes on success or a negative error + * code. 
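
To make the page handling described in the comment concrete, here is a minimal user-space sketch of the offset and size adjustment only; the constants mirror the defines above, while plan_eeprom_read() is a made-up helper for illustration and not part of the driver.

/* sketch: clamp a cable EEPROM read so it never crosses the 256-byte
 * low page, and switch to the high I2C slave address for offsets >= 256.
 */
#include <stdio.h>

#define I2C_ADDR_LOW   0x50
#define I2C_ADDR_HIGH  0x51
#define I2C_PAGE_SIZE  256

/* hypothetical helper, not a driver function */
static void plan_eeprom_read(unsigned offset, unsigned size,
                             unsigned *i2c_addr, unsigned *adj_offset,
                             unsigned *adj_size)
{
        /* a read starting in the low page must not spill into the high page */
        if (offset < I2C_PAGE_SIZE && offset + size > I2C_PAGE_SIZE)
                size -= offset + size - I2C_PAGE_SIZE;

        *i2c_addr = I2C_ADDR_LOW;
        if (offset >= I2C_PAGE_SIZE) {
                /* high page: different slave address, offset rebased to 0 */
                *i2c_addr = I2C_ADDR_HIGH;
                offset -= I2C_PAGE_SIZE;
        }
        *adj_offset = offset;
        *adj_size = size;
}

int main(void)
{
        unsigned addr, off, sz;

        plan_eeprom_read(250, 16, &addr, &off, &sz);
        printf("0x%x offset %u size %u\n", addr, off, sz);  /* 0x50 250 6 */

        plan_eeprom_read(300, 16, &addr, &off, &sz);
        printf("0x%x offset %u size %u\n", addr, off, sz);  /* 0x51 44 16 */
        return 0;
}
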
+ */ +int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, + u16 offset, u16 size, u8 *data) +{ + struct mlx4_cmd_mailbox *inbox, *outbox; + struct mlx4_mad_ifc *inmad, *outmad; + struct mlx4_cable_info *cable_info; + u16 i2c_addr; + int ret; + + if (size > MODULE_INFO_MAX_READ) + size = MODULE_INFO_MAX_READ; + + inbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inbox)) + return PTR_ERR(inbox); + + outbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outbox)) { + mlx4_free_cmd_mailbox(dev, inbox); + return PTR_ERR(outbox); + } + + inmad = (struct mlx4_mad_ifc *)(inbox->buf); + outmad = (struct mlx4_mad_ifc *)(outbox->buf); + + inmad->method = 0x1; /* Get */ + inmad->class_version = 0x1; + inmad->mgmt_class = 0x1; + inmad->base_version = 0x1; + inmad->attr_id = cpu_to_be16(0xFF60); /* Module Info */ + + if (offset < I2C_PAGE_SIZE && offset + size > I2C_PAGE_SIZE) + /* Cross pages reads are not allowed + * read until offset 256 in low page + */ + size -= offset + size - I2C_PAGE_SIZE; + + i2c_addr = I2C_ADDR_LOW; + if (offset >= I2C_PAGE_SIZE) { + /* Reset offset to high page */ + i2c_addr = I2C_ADDR_HIGH; + offset -= I2C_PAGE_SIZE; + } + + cable_info = (struct mlx4_cable_info *)inmad->data; + cable_info->dev_mem_address = cpu_to_be16(offset); + cable_info->page_num = 0; + cable_info->i2c_addr = i2c_addr; + cable_info->size = cpu_to_be16(size); + + ret = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (ret) + goto out; + + if (be16_to_cpu(outmad->status)) { + /* Mad returned with bad status */ + ret = be16_to_cpu(outmad->status); + mlx4_warn(dev, + "MLX4_CMD_MAD_IFC Get Module info attr(%x) port(%d) i2c_addr(%x) offset(%d) size(%d): Response Mad Status(%x) - %s\n", + 0xFF60, port, i2c_addr, offset, size, + ret, cable_info_mad_err_str(ret)); + + if (i2c_addr == I2C_ADDR_HIGH && + MAD_STATUS_2_CABLE_ERR(ret) == CABLE_INF_I2C_ADDR) + /* Some SFP cables do not support i2c slave + * address 0x51 (high page), abort silently. + */ + ret = 0; + else + ret = -ret; + goto out; + } + cable_info = (struct mlx4_cable_info *)outmad->data; + memcpy(data, cable_info->data, size); + ret = size; +out: + mlx4_free_cmd_mailbox(dev, inbox); + mlx4_free_cmd_mailbox(dev, outbox); + return ret; +} +EXPORT_SYMBOL(mlx4_get_module_info); + +int mlx4_max_tc(struct mlx4_dev *dev) +{ + u8 num_tc = dev->caps.max_tc_eth; + + if (!num_tc) + num_tc = MLX4_TC_MAX_NUMBER; + + return num_tc; +} +EXPORT_SYMBOL(mlx4_max_tc); diff --git a/drivers/net/ethernet/mellanox/mlx4/profile.c b/drivers/net/ethernet/mellanox/mlx4/profile.c new file mode 100644 index 0000000..bae8b22 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/profile.c @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mlx4.h" +#include "fw.h" + +enum { + MLX4_RES_QP, + MLX4_RES_RDMARC, + MLX4_RES_ALTC, + MLX4_RES_AUXC, + MLX4_RES_SRQ, + MLX4_RES_CQ, + MLX4_RES_EQ, + MLX4_RES_DMPT, + MLX4_RES_CMPT, + MLX4_RES_MTT, + MLX4_RES_MCG, + MLX4_RES_NUM +}; + +static const char *res_name[] = { + [MLX4_RES_QP] = "QP", + [MLX4_RES_RDMARC] = "RDMARC", + [MLX4_RES_ALTC] = "ALTC", + [MLX4_RES_AUXC] = "AUXC", + [MLX4_RES_SRQ] = "SRQ", + [MLX4_RES_CQ] = "CQ", + [MLX4_RES_EQ] = "EQ", + [MLX4_RES_DMPT] = "DMPT", + [MLX4_RES_CMPT] = "CMPT", + [MLX4_RES_MTT] = "MTT", + [MLX4_RES_MCG] = "MCG", +}; + +u64 mlx4_make_profile(struct mlx4_dev *dev, + struct mlx4_profile *request, + struct mlx4_dev_cap *dev_cap, + struct mlx4_init_hca_param *init_hca) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource { + u64 size; + u64 start; + int type; + u32 num; + int log_num; + }; + + u64 total_size = 0; + struct mlx4_resource *profile; + struct sysinfo si; + int i, j; + + profile = kcalloc(MLX4_RES_NUM, sizeof(*profile), GFP_KERNEL); + if (!profile) + return -ENOMEM; + + /* + * We want to scale the number of MTTs with the size of the + * system memory, since it makes sense to register a lot of + * memory on a system with a lot of memory. As a heuristic, + * make sure we have enough MTTs to cover twice the system + * memory (with PAGE_SIZE entries). + * + * This number has to be a power of two and fit into 32 bits + * due to device limitations, so cap this at 2^31 as well. + * That limits us to 8TB of memory registration per HCA with + * 4KB pages, which is probably OK for the next few months. 
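
The MTT sizing heuristic spelled out in the comment above can be followed with a small stand-alone sketch of the same arithmetic; the page count, the requested segment count and the log_mtts_per_seg value are example numbers only, and roundup_pow_of_two() is re-implemented in plain C rather than taken from the kernel.

/* sketch: scale the requested number of MTT segments with system memory,
 * aiming to cover twice the RAM, capped so the entry count fits in 2^31.
 */
#include <stdio.h>

static unsigned long roundup_pow_of_two_ul(unsigned long x)
{
        unsigned long r = 1;

        while (r < x)
                r <<= 1;
        return r;
}

int main(void)
{
        int log_mtts_per_seg = 3;              /* 8 MTT entries per segment (typical) */
        unsigned long totalram = 4UL << 20;    /* example: 4M pages, 16 GB with 4K pages */
        unsigned long requested = 1UL << 17;   /* example profile request, in segments */

        /* enough segments to map twice the RAM: totalram pages need
         * totalram >> log_mtts_per_seg segments, so twice that is >> (log - 1)
         */
        unsigned long by_ram = totalram >> (log_mtts_per_seg - 1);
        /* cap: entries (segments << log_mtts_per_seg) must stay within 2^31 */
        unsigned long cap = 1UL << (31 - log_mtts_per_seg);
        unsigned long scaled = by_ram < cap ? by_ram : cap;
        unsigned long num_mtt = roundup_pow_of_two_ul(
                        requested > scaled ? requested : scaled);

        printf("num_mtt segments = %lu (entries = %lu)\n",
               num_mtt, num_mtt << log_mtts_per_seg);
        return 0;
}
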
+ */ + si_meminfo(&si); + request->num_mtt = + roundup_pow_of_two(max_t(unsigned, request->num_mtt, + min(1UL << (31 - log_mtts_per_seg), + si.totalram >> (log_mtts_per_seg - 1)))); + + profile[MLX4_RES_QP].size = dev_cap->qpc_entry_sz; + profile[MLX4_RES_RDMARC].size = dev_cap->rdmarc_entry_sz; + profile[MLX4_RES_ALTC].size = dev_cap->altc_entry_sz; + profile[MLX4_RES_AUXC].size = dev_cap->aux_entry_sz; + profile[MLX4_RES_SRQ].size = dev_cap->srq_entry_sz; + profile[MLX4_RES_CQ].size = dev_cap->cqc_entry_sz; + profile[MLX4_RES_EQ].size = dev_cap->eqc_entry_sz; + profile[MLX4_RES_DMPT].size = dev_cap->dmpt_entry_sz; + profile[MLX4_RES_CMPT].size = dev_cap->cmpt_entry_sz; + profile[MLX4_RES_MTT].size = dev_cap->mtt_entry_sz; + profile[MLX4_RES_MCG].size = mlx4_get_mgm_entry_size(dev); + + profile[MLX4_RES_QP].num = request->num_qp; + profile[MLX4_RES_RDMARC].num = request->num_qp * request->rdmarc_per_qp; + profile[MLX4_RES_ALTC].num = request->num_qp; + profile[MLX4_RES_AUXC].num = request->num_qp; + profile[MLX4_RES_SRQ].num = request->num_srq; + profile[MLX4_RES_CQ].num = request->num_cq; + profile[MLX4_RES_EQ].num = mlx4_is_mfunc(dev) ? dev->phys_caps.num_phys_eqs : + min_t(unsigned, dev_cap->max_eqs, MAX_MSIX); + profile[MLX4_RES_DMPT].num = request->num_mpt; + profile[MLX4_RES_CMPT].num = MLX4_NUM_CMPTS; + profile[MLX4_RES_MTT].num = request->num_mtt * (1 << log_mtts_per_seg); + profile[MLX4_RES_MCG].num = request->num_mcg; + + for (i = 0; i < MLX4_RES_NUM; ++i) { + profile[i].type = i; + profile[i].num = roundup_pow_of_two(profile[i].num); + profile[i].log_num = ilog2(profile[i].num); + profile[i].size *= profile[i].num; + profile[i].size = max(profile[i].size, (u64) PAGE_SIZE); + } + + /* + * Sort the resources in decreasing order of size. Since they + * all have sizes that are powers of 2, we'll be able to keep + * resources aligned to their size and pack them without gaps + * using the sorted order. 
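
A short stand-alone sketch shows why the descending sort that follows gives gap-free, naturally aligned packing; the region sizes below are made-up powers of two, and the sort is the same simple bubble pass used by the profile code.

/* sketch: packing power-of-two sized regions in decreasing size order
 * keeps every region start aligned to its own size, with no padding.
 */
#include <stdio.h>

int main(void)
{
        /* example region sizes, already powers of two (bytes) */
        unsigned long long size[] = { 1 << 20, 1 << 20, 1 << 16, 4096, 4096 };
        int n = sizeof(size) / sizeof(size[0]);
        unsigned long long start = 0;
        int i, j;

        /* simple descending bubble sort, mirroring the loop below */
        for (i = n; i > 0; --i)
                for (j = 1; j < i; ++j)
                        if (size[j] > size[j - 1]) {
                                unsigned long long t = size[j];

                                size[j] = size[j - 1];
                                size[j - 1] = t;
                        }

        for (i = 0; i < n; ++i) {
                printf("region %d: start 0x%llx size 0x%llx aligned=%s\n",
                       i, start, size[i],
                       (start % size[i]) == 0 ? "yes" : "no");
                start += size[i];
        }
        return 0;
}
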
+ */ + for (i = MLX4_RES_NUM; i > 0; --i) + for (j = 1; j < i; ++j) { + if (profile[j].size > profile[j - 1].size) + swap(profile[j], profile[j - 1]); + } + + for (i = 0; i < MLX4_RES_NUM; ++i) { + if (profile[i].size) { + profile[i].start = total_size; + total_size += profile[i].size; + } + + if (total_size > dev_cap->max_icm_sz) { + mlx4_err(dev, "Profile requires 0x%llx bytes; won't fit in 0x%llx bytes of context memory\n", + (unsigned long long) total_size, + (unsigned long long) dev_cap->max_icm_sz); + kfree(profile); + return -ENOMEM; + } + + if (profile[i].size) + mlx4_dbg(dev, " profile[%2d] (%6s): 2^%02d entries @ 0x%10llx, size 0x%10llx\n", + i, res_name[profile[i].type], + profile[i].log_num, + (unsigned long long) profile[i].start, + (unsigned long long) profile[i].size); + } + + mlx4_dbg(dev, "HCA context memory: reserving %d KB\n", + (int) (total_size >> 10)); + + for (i = 0; i < MLX4_RES_NUM; ++i) { + switch (profile[i].type) { + case MLX4_RES_QP: + dev->caps.num_qps = profile[i].num; + init_hca->qpc_base = profile[i].start; + init_hca->log_num_qps = profile[i].log_num; + break; + case MLX4_RES_RDMARC: + for (priv->qp_table.rdmarc_shift = 0; + request->num_qp << priv->qp_table.rdmarc_shift < profile[i].num; + ++priv->qp_table.rdmarc_shift) + ; /* nothing */ + dev->caps.max_qp_dest_rdma = 1 << priv->qp_table.rdmarc_shift; + priv->qp_table.rdmarc_base = (u32) profile[i].start; + init_hca->rdmarc_base = profile[i].start; + init_hca->log_rd_per_qp = priv->qp_table.rdmarc_shift; + break; + case MLX4_RES_ALTC: + init_hca->altc_base = profile[i].start; + break; + case MLX4_RES_AUXC: + init_hca->auxc_base = profile[i].start; + break; + case MLX4_RES_SRQ: + dev->caps.num_srqs = profile[i].num; + init_hca->srqc_base = profile[i].start; + init_hca->log_num_srqs = profile[i].log_num; + break; + case MLX4_RES_CQ: + dev->caps.num_cqs = profile[i].num; + init_hca->cqc_base = profile[i].start; + init_hca->log_num_cqs = profile[i].log_num; + break; + case MLX4_RES_EQ: + if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) { + init_hca->log_num_eqs = 0x1f; + init_hca->eqc_base = profile[i].start; + init_hca->num_sys_eqs = dev_cap->num_sys_eqs; + } else { + dev->caps.num_eqs = roundup_pow_of_two( + min_t(unsigned, + dev_cap->max_eqs, + MAX_MSIX)); + init_hca->eqc_base = profile[i].start; + init_hca->log_num_eqs = ilog2(dev->caps.num_eqs); + } + break; + case MLX4_RES_DMPT: + dev->caps.num_mpts = profile[i].num; + priv->mr_table.mpt_base = profile[i].start; + init_hca->dmpt_base = profile[i].start; + init_hca->log_mpt_sz = profile[i].log_num; + break; + case MLX4_RES_CMPT: + init_hca->cmpt_base = profile[i].start; + break; + case MLX4_RES_MTT: + dev->caps.num_mtts = profile[i].num; + priv->mr_table.mtt_base = profile[i].start; + init_hca->mtt_base = profile[i].start; + break; + case MLX4_RES_MCG: + init_hca->mc_base = profile[i].start; + init_hca->log_mc_entry_sz = + ilog2(mlx4_get_mgm_entry_size(dev)); + init_hca->log_mc_table_sz = profile[i].log_num; + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + dev->caps.num_mgms = profile[i].num; + } else { + init_hca->log_mc_hash_sz = + profile[i].log_num - 1; + dev->caps.num_mgms = profile[i].num >> 1; + dev->caps.num_amgms = profile[i].num >> 1; + } + break; + default: + break; + } + } + + /* + * PDs don't take any HCA memory, but we assign them as part + * of the HCA profile anyway. 
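
For the RDMARC case handled above, a minimal sketch of how the per-QP responder depth is recovered from the region size may help; the QP and entry counts are example values only.

/* sketch: the RDMARC region holds num_qp << shift entries; recover the
 * per-QP depth (max_qp_dest_rdma) as the smallest shift that covers it.
 */
#include <stdio.h>

int main(void)
{
        unsigned long num_qp = 1UL << 17;          /* example requested QPs */
        unsigned long rdmarc_entries = 1UL << 19;  /* example region size from the profile */
        int shift = 0;

        while (num_qp << shift < rdmarc_entries)
                ++shift;

        printf("rdmarc_shift = %d, max_qp_dest_rdma = %d\n", shift, 1 << shift);
        return 0;
}
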
+ */ + dev->caps.num_pds = MLX4_NUM_PDS; + + kfree(profile); + return total_size; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c new file mode 100644 index 0000000..427e7a3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -0,0 +1,965 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include +#include + +#include "mlx4.h" +#include "icm.h" + +/* QP to support BF should have bits 6,7 cleared */ +#define MLX4_BF_QP_SKIP_MASK 0xc0 +#define MLX4_MAX_BF_QP_RANGE 0x40 + +void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + struct mlx4_qp *qp; + + spin_lock(&qp_table->lock); + + qp = __mlx4_qp_lookup(dev, qpn); + if (qp) + refcount_inc(&qp->refcount); + + spin_unlock(&qp_table->lock); + + if (!qp) { + mlx4_dbg(dev, "Async event for none existent QP %08x\n", qpn); + return; + } + + qp->event(qp, event_type); + + if (refcount_dec_and_test(&qp->refcount)) + complete(&qp->free); +} + +/* used for INIT/CLOSE port logic */ +static int is_master_qp0(struct mlx4_dev *dev, struct mlx4_qp *qp, int *real_qp0, int *proxy_qp0) +{ + /* this procedure is called after we already know we are on the master */ + /* qp0 is either the proxy qp0, or the real qp0 */ + u32 pf_proxy_offset = dev->phys_caps.base_proxy_sqpn + 8 * mlx4_master_func_num(dev); + *proxy_qp0 = qp->qpn >= pf_proxy_offset && qp->qpn <= pf_proxy_offset + 1; + + *real_qp0 = qp->qpn >= dev->phys_caps.base_sqpn && + qp->qpn <= dev->phys_caps.base_sqpn + 1; + + return *real_qp0 || *proxy_qp0; +} + +static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, + enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp, int native) +{ + static const u16 op[MLX4_QP_NUM_STATE][MLX4_QP_NUM_STATE] = { + [MLX4_QP_STATE_RST] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_INIT] = MLX4_CMD_RST2INIT_QP, + }, + [MLX4_QP_STATE_INIT] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_INIT] = MLX4_CMD_INIT2INIT_QP, + [MLX4_QP_STATE_RTR] = MLX4_CMD_INIT2RTR_QP, + }, + [MLX4_QP_STATE_RTR] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_RTS] = MLX4_CMD_RTR2RTS_QP, + }, + [MLX4_QP_STATE_RTS] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_RTS] = MLX4_CMD_RTS2RTS_QP, + [MLX4_QP_STATE_SQD] = MLX4_CMD_RTS2SQD_QP, + }, + [MLX4_QP_STATE_SQD] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_RTS] = MLX4_CMD_SQD2RTS_QP, + [MLX4_QP_STATE_SQD] = MLX4_CMD_SQD2SQD_QP, + }, + [MLX4_QP_STATE_SQER] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + [MLX4_QP_STATE_RTS] = MLX4_CMD_SQERR2RTS_QP, + }, + [MLX4_QP_STATE_ERR] = { + [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP, + [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP, + } + }; + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_mailbox *mailbox; + int ret = 0; + int real_qp0 = 0; + int proxy_qp0 = 0; + u8 port; + + if (cur_state >= MLX4_QP_NUM_STATE || new_state >= MLX4_QP_NUM_STATE || + !op[cur_state][new_state]) + return -EINVAL; + + if (op[cur_state][new_state] == MLX4_CMD_2RST_QP) { + ret = mlx4_cmd(dev, 0, qp->qpn, 2, + MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A, native); + if (mlx4_is_master(dev) && cur_state != MLX4_QP_STATE_ERR && + cur_state != MLX4_QP_STATE_RST && + is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) { + port = (qp->qpn & 1) + 1; + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0; + else + priv->mfunc.master.qp0_state[port].qp0_active = 0; 
+ } + return ret; + } + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (cur_state == MLX4_QP_STATE_RST && new_state == MLX4_QP_STATE_INIT) { + u64 mtt_addr = mlx4_mtt_addr(dev, mtt); + context->mtt_base_addr_h = mtt_addr >> 32; + context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff); + context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; + } + + if ((cur_state == MLX4_QP_STATE_RTR) && + (new_state == MLX4_QP_STATE_RTS) && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + context->roce_entropy = + cpu_to_be16(mlx4_qp_roce_entropy(dev, qp->qpn)); + + *(__be32 *) mailbox->buf = cpu_to_be32(optpar); + memcpy(mailbox->buf + 8, context, sizeof(*context)); + + ((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn = + cpu_to_be32(qp->qpn); + + ret = mlx4_cmd(dev, mailbox->dma, + qp->qpn | (!!sqd_event << 31), + new_state == MLX4_QP_STATE_RST ? 2 : 0, + op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C, native); + + if (mlx4_is_master(dev) && is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) { + port = (qp->qpn & 1) + 1; + if (cur_state != MLX4_QP_STATE_ERR && + cur_state != MLX4_QP_STATE_RST && + new_state == MLX4_QP_STATE_ERR) { + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0; + else + priv->mfunc.master.qp0_state[port].qp0_active = 0; + } else if (new_state == MLX4_QP_STATE_RTR) { + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 1; + else + priv->mfunc.master.qp0_state[port].qp0_active = 1; + } + } + + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} + +int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, + enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp) +{ + return __mlx4_qp_modify(dev, mtt, cur_state, new_state, context, + optpar, sqd_event, qp, 0); +} +EXPORT_SYMBOL_GPL(mlx4_qp_modify); + +int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, + int *base, u8 flags) +{ + u32 uid; + int bf_qp = !!(flags & (u8)MLX4_RESERVE_ETH_BF_QP); + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + + if (cnt > MLX4_MAX_BF_QP_RANGE && bf_qp) + return -ENOMEM; + + uid = MLX4_QP_TABLE_ZONE_GENERAL; + if (flags & (u8)MLX4_RESERVE_A0_QP) { + if (bf_qp) + uid = MLX4_QP_TABLE_ZONE_RAW_ETH; + else + uid = MLX4_QP_TABLE_ZONE_RSS; + } + + *base = mlx4_zone_alloc_entries(qp_table->zones, uid, cnt, align, + bf_qp ? 
MLX4_BF_QP_SKIP_MASK : 0, NULL); + if (*base == -1) + return -ENOMEM; + + return 0; +} + +int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, + int *base, u8 flags, u8 usage) +{ + u32 in_modifier = RES_QP | (((u32)usage & 3) << 30); + u64 in_param = 0; + u64 out_param; + int err; + + /* Turn off all unsupported QP allocation flags */ + flags &= dev->caps.alloc_res_qp_mask; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, (((u32)flags) << 24) | (u32)cnt); + set_param_h(&in_param, align); + err = mlx4_cmd_imm(dev, in_param, &out_param, + in_modifier, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) + return err; + + *base = get_param_l(&out_param); + return 0; + } + return __mlx4_qp_reserve_range(dev, cnt, align, base, flags); +} +EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range); + +void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + + if (mlx4_is_qp_reserved(dev, (u32) base_qpn)) + return; + mlx4_zone_free_entries_unique(qp_table->zones, base_qpn, cnt); +} + +void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) +{ + u64 in_param = 0; + int err; + + if (!cnt) + return; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, base_qpn); + set_param_h(&in_param, cnt); + err = mlx4_cmd(dev, in_param, RES_QP, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (err) { + mlx4_warn(dev, "Failed to release qp range base:%d cnt:%d\n", + base_qpn, cnt); + } + } else + __mlx4_qp_release_range(dev, base_qpn, cnt); +} +EXPORT_SYMBOL_GPL(mlx4_qp_release_range); + +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + int err; + + err = mlx4_table_get(dev, &qp_table->qp_table, qpn); + if (err) + goto err_out; + + err = mlx4_table_get(dev, &qp_table->auxc_table, qpn); + if (err) + goto err_put_qp; + + err = mlx4_table_get(dev, &qp_table->altc_table, qpn); + if (err) + goto err_put_auxc; + + err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn); + if (err) + goto err_put_altc; + + err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn); + if (err) + goto err_put_rdmarc; + + return 0; + +err_put_rdmarc: + mlx4_table_put(dev, &qp_table->rdmarc_table, qpn); + +err_put_altc: + mlx4_table_put(dev, &qp_table->altc_table, qpn); + +err_put_auxc: + mlx4_table_put(dev, &qp_table->auxc_table, qpn); + +err_put_qp: + mlx4_table_put(dev, &qp_table->qp_table, qpn); + +err_out: + return err; +} + +static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +{ + u64 param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(¶m, qpn); + return mlx4_cmd_imm(dev, param, ¶m, RES_QP, RES_OP_MAP_ICM, + MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + } + return __mlx4_qp_alloc_icm(dev, qpn); +} + +void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + + mlx4_table_put(dev, &qp_table->cmpt_table, qpn); + mlx4_table_put(dev, &qp_table->rdmarc_table, qpn); + mlx4_table_put(dev, &qp_table->altc_table, qpn); + mlx4_table_put(dev, &qp_table->auxc_table, qpn); + mlx4_table_put(dev, &qp_table->qp_table, qpn); +} + +static void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, qpn); + if (mlx4_cmd(dev, in_param, RES_QP, 
RES_OP_MAP_ICM, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed to free icm of qp:%d\n", qpn); + } else + __mlx4_qp_free_icm(dev, qpn); +} + +struct mlx4_qp *mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + struct mlx4_qp *qp; + + spin_lock_irq(&qp_table->lock); + + qp = __mlx4_qp_lookup(dev, qpn); + + spin_unlock_irq(&qp_table->lock); + return qp; +} + +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + int err; + + if (!qpn) + return -EINVAL; + + qp->qpn = qpn; + + err = mlx4_qp_alloc_icm(dev, qpn); + if (err) + return err; + + spin_lock_irq(&qp_table->lock); + err = radix_tree_insert(&dev->qp_table_tree, qp->qpn & + (dev->caps.num_qps - 1), qp); + spin_unlock_irq(&qp_table->lock); + if (err) + goto err_icm; + + refcount_set(&qp->refcount, 1); + init_completion(&qp->free); + + return 0; + +err_icm: + mlx4_qp_free_icm(dev, qpn); + return err; +} + +EXPORT_SYMBOL_GPL(mlx4_qp_alloc); + +int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, + enum mlx4_update_qp_attr attr, + struct mlx4_update_qp_params *params) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_update_qp_context *cmd; + u64 pri_addr_path_mask = 0; + u64 qp_mask = 0; + int err = 0; + + if (!attr || (attr & ~MLX4_UPDATE_QP_SUPPORTED_ATTRS)) + return -EINVAL; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cmd = (struct mlx4_update_qp_context *)mailbox->buf; + + if (attr & MLX4_UPDATE_QP_SMAC) { + pri_addr_path_mask |= 1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX; + cmd->qp_context.pri_path.grh_mylmc = params->smac_index; + } + + if (attr & MLX4_UPDATE_QP_ETH_SRC_CHECK_MC_LB) { + if (!(dev->caps.flags2 + & MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB)) { + mlx4_warn(dev, + "Trying to set src check LB, but it isn't supported\n"); + err = -EOPNOTSUPP; + goto out; + } + pri_addr_path_mask |= + 1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB; + if (params->flags & + MLX4_UPDATE_QP_PARAMS_FLAGS_ETH_CHECK_MC_LB) { + cmd->qp_context.pri_path.fl |= + MLX4_FL_ETH_SRC_CHECK_MC_LB; + } + } + + if (attr & MLX4_UPDATE_QP_VSD) { + qp_mask |= 1ULL << MLX4_UPD_QP_MASK_VSD; + if (params->flags & MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE) + cmd->qp_context.param3 |= cpu_to_be32(MLX4_STRIP_VLAN); + } + + if (attr & MLX4_UPDATE_QP_RATE_LIMIT) { + qp_mask |= 1ULL << MLX4_UPD_QP_MASK_RATE_LIMIT; + cmd->qp_context.rate_limit_params = cpu_to_be16((params->rate_unit << 14) | params->rate_val); + } + + if (attr & MLX4_UPDATE_QP_QOS_VPORT) { + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP)) { + mlx4_warn(dev, "Granular QoS per VF is not enabled\n"); + err = -EOPNOTSUPP; + goto out; + } + + qp_mask |= 1ULL << MLX4_UPD_QP_MASK_QOS_VPP; + cmd->qp_context.qos_vport = params->qos_vport; + } + + cmd->primary_addr_path_mask = cpu_to_be64(pri_addr_path_mask); + cmd->qp_mask = cpu_to_be64(qp_mask); + + err = mlx4_cmd(dev, mailbox->dma, qpn & 0xffffff, 0, + MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_update_qp); + +void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + unsigned long flags; + + spin_lock_irqsave(&qp_table->lock, flags); + radix_tree_delete(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1)); + 
spin_unlock_irqrestore(&qp_table->lock, flags); +} +EXPORT_SYMBOL_GPL(mlx4_qp_remove); + +void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp) +{ + if (refcount_dec_and_test(&qp->refcount)) + complete(&qp->free); + wait_for_completion(&qp->free); + + mlx4_qp_free_icm(dev, qp->qpn); +} +EXPORT_SYMBOL_GPL(mlx4_qp_free); + +static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn) +{ + return mlx4_cmd(dev, 0, base_qpn, 0, MLX4_CMD_CONF_SPECIAL_QP, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); +} + +#define MLX4_QP_TABLE_RSS_ETH_PRIORITY 2 +#define MLX4_QP_TABLE_RAW_ETH_PRIORITY 1 +#define MLX4_QP_TABLE_RAW_ETH_SIZE 256 + +static int mlx4_create_zones(struct mlx4_dev *dev, + u32 reserved_bottom_general, + u32 reserved_top_general, + u32 reserved_bottom_rss, + u32 start_offset_rss, + u32 max_table_offset) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + struct mlx4_bitmap (*bitmap)[MLX4_QP_TABLE_ZONE_NUM] = NULL; + int bitmap_initialized = 0; + u32 last_offset; + int k; + int err; + + qp_table->zones = mlx4_zone_allocator_create(MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP); + + if (NULL == qp_table->zones) + return -ENOMEM; + + bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL); + + if (NULL == bitmap) { + err = -ENOMEM; + goto free_zone; + } + + err = mlx4_bitmap_init(*bitmap + MLX4_QP_TABLE_ZONE_GENERAL, dev->caps.num_qps, + (1 << 23) - 1, reserved_bottom_general, + reserved_top_general); + + if (err) + goto free_bitmap; + + ++bitmap_initialized; + + err = mlx4_zone_add_one(qp_table->zones, *bitmap + MLX4_QP_TABLE_ZONE_GENERAL, + MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO | + MLX4_ZONE_USE_RR, 0, + 0, qp_table->zones_uids + MLX4_QP_TABLE_ZONE_GENERAL); + + if (err) + goto free_bitmap; + + err = mlx4_bitmap_init(*bitmap + MLX4_QP_TABLE_ZONE_RSS, + reserved_bottom_rss, + reserved_bottom_rss - 1, + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], + reserved_bottom_rss - start_offset_rss); + + if (err) + goto free_bitmap; + + ++bitmap_initialized; + + err = mlx4_zone_add_one(qp_table->zones, *bitmap + MLX4_QP_TABLE_ZONE_RSS, + MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO | + MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO | + MLX4_ZONE_USE_RR, MLX4_QP_TABLE_RSS_ETH_PRIORITY, + 0, qp_table->zones_uids + MLX4_QP_TABLE_ZONE_RSS); + + if (err) + goto free_bitmap; + + last_offset = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; + /* We have a single zone for the A0 steering QPs area of the FW. This area + * needs to be split into subareas. One set of subareas is for RSS QPs + * (in which qp number bits 6 and/or 7 are set); the other set of subareas + * is for RAW_ETH QPs, which require that both bits 6 and 7 are zero. + * Currently, the values returned by the FW (A0 steering area starting qp number + * and A0 steering area size) are such that there are only two subareas -- one + * for RSS and one for RAW_ETH. + */ + for (k = MLX4_QP_TABLE_ZONE_RSS + 1; k < sizeof(*bitmap)/sizeof((*bitmap)[0]); + k++) { + int size; + u32 offset = start_offset_rss; + u32 bf_mask; + u32 requested_size; + + /* Assuming MLX4_BF_QP_SKIP_MASK is consecutive ones, this calculates + * a mask of all LSB bits set until (and not including) the first + * set bit of MLX4_BF_QP_SKIP_MASK. For example, if MLX4_BF_QP_SKIP_MASK + * is 0xc0, bf_mask will be 0x3f. 
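
The bit trick described in that comment is easy to check in isolation; the stand-alone sketch below reproduces the computation that follows immediately afterwards, using the 0xc0 example from the comment.

/* sketch: isolate the lowest set bit of a mask and turn everything
 * below it into ones, e.g. 0xc0 -> lowest bit 0x40 -> bf_mask 0x3f.
 */
#include <stdio.h>

int main(void)
{
        unsigned int skip_mask = 0xc0;                       /* MLX4_BF_QP_SKIP_MASK */
        unsigned int lowest = skip_mask & ~(skip_mask - 1);  /* 0x40 */
        unsigned int bf_mask = lowest - 1;                   /* 0x3f */

        printf("skip_mask=0x%x lowest=0x%x bf_mask=0x%x\n",
               skip_mask, lowest, bf_mask);
        return 0;
}
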
+ */ + bf_mask = (MLX4_BF_QP_SKIP_MASK & ~(MLX4_BF_QP_SKIP_MASK - 1)) - 1; + requested_size = min((u32)MLX4_QP_TABLE_RAW_ETH_SIZE, bf_mask + 1); + + if (((last_offset & MLX4_BF_QP_SKIP_MASK) && + ((int)(max_table_offset - last_offset)) >= + roundup_pow_of_two(MLX4_BF_QP_SKIP_MASK)) || + (!(last_offset & MLX4_BF_QP_SKIP_MASK) && + !((last_offset + requested_size - 1) & + MLX4_BF_QP_SKIP_MASK))) + size = requested_size; + else { + u32 candidate_offset = + (last_offset | MLX4_BF_QP_SKIP_MASK | bf_mask) + 1; + + if (last_offset & MLX4_BF_QP_SKIP_MASK) + last_offset = candidate_offset; + + /* From this point, the BF bits are 0 */ + + if (last_offset > max_table_offset) { + /* need to skip */ + size = -1; + } else { + size = min3(max_table_offset - last_offset, + bf_mask - (last_offset & bf_mask), + requested_size); + if (size < requested_size) { + int candidate_size; + + candidate_size = min3( + max_table_offset - candidate_offset, + bf_mask - (last_offset & bf_mask), + requested_size); + + /* We will not take this path if last_offset was + * already set above to candidate_offset + */ + if (candidate_size > size) { + last_offset = candidate_offset; + size = candidate_size; + } + } + } + } + + if (size > 0) { + /* mlx4_bitmap_alloc_range will find a contiguous range of "size" + * QPs in which both bits 6 and 7 are zero, because we pass it the + * MLX4_BF_SKIP_MASK). + */ + offset = mlx4_bitmap_alloc_range( + *bitmap + MLX4_QP_TABLE_ZONE_RSS, + size, 1, + MLX4_BF_QP_SKIP_MASK); + + if (offset == (u32)-1) { + err = -ENOMEM; + break; + } + + last_offset = offset + size; + + err = mlx4_bitmap_init(*bitmap + k, roundup_pow_of_two(size), + roundup_pow_of_two(size) - 1, 0, + roundup_pow_of_two(size) - size); + } else { + /* Add an empty bitmap, we'll allocate from different zones (since + * at least one is reserved) + */ + err = mlx4_bitmap_init(*bitmap + k, 1, + MLX4_QP_TABLE_RAW_ETH_SIZE - 1, 0, + 0); + mlx4_bitmap_alloc_range(*bitmap + k, 1, 1, 0); + } + + if (err) + break; + + ++bitmap_initialized; + + err = mlx4_zone_add_one(qp_table->zones, *bitmap + k, + MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO | + MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO | + MLX4_ZONE_USE_RR, MLX4_QP_TABLE_RAW_ETH_PRIORITY, + offset, qp_table->zones_uids + k); + + if (err) + break; + } + + if (err) + goto free_bitmap; + + qp_table->bitmap_gen = *bitmap; + + return err; + +free_bitmap: + for (k = 0; k < bitmap_initialized; k++) + mlx4_bitmap_cleanup(*bitmap + k); + kfree(bitmap); +free_zone: + mlx4_zone_allocator_destroy(qp_table->zones); + return err; +} + +static void mlx4_cleanup_qp_zones(struct mlx4_dev *dev) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + + if (qp_table->zones) { + int i; + + for (i = 0; + i < sizeof(qp_table->zones_uids)/sizeof(qp_table->zones_uids[0]); + i++) { + struct mlx4_bitmap *bitmap = + mlx4_zone_get_bitmap(qp_table->zones, + qp_table->zones_uids[i]); + + mlx4_zone_remove_one(qp_table->zones, qp_table->zones_uids[i]); + if (NULL == bitmap) + continue; + + mlx4_bitmap_cleanup(bitmap); + } + mlx4_zone_allocator_destroy(qp_table->zones); + kfree(qp_table->bitmap_gen); + qp_table->bitmap_gen = NULL; + qp_table->zones = NULL; + } +} + +int mlx4_init_qp_table(struct mlx4_dev *dev) +{ + struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; + int err; + int reserved_from_top = 0; + int reserved_from_bot; + int k; + int fixed_reserved_from_bot_rv = 0; + int bottom_reserved_for_rss_bitmap; + u32 max_table_offset = dev->caps.dmfs_high_rate_qpn_base + + dev->caps.dmfs_high_rate_qpn_range; + + 
spin_lock_init(&qp_table->lock); + INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC); + if (mlx4_is_slave(dev)) + return 0; + + /* We reserve 2 extra QPs per port for the special QPs. The + * block of special QPs must be aligned to a multiple of 8, so + * round up. + * + * We also reserve the MSB of the 24-bit QP number to indicate + * that a QP is an XRC QP. + */ + for (k = 0; k <= MLX4_QP_REGION_BOTTOM; k++) + fixed_reserved_from_bot_rv += dev->caps.reserved_qps_cnt[k]; + + if (fixed_reserved_from_bot_rv < max_table_offset) + fixed_reserved_from_bot_rv = max_table_offset; + + /* We reserve at least 1 extra for bitmaps that we don't have enough space for*/ + bottom_reserved_for_rss_bitmap = + roundup_pow_of_two(fixed_reserved_from_bot_rv + 1); + dev->phys_caps.base_sqpn = ALIGN(bottom_reserved_for_rss_bitmap, 8); + + { + int sort[MLX4_NUM_QP_REGION]; + int i, j; + int last_base = dev->caps.num_qps; + + for (i = 1; i < MLX4_NUM_QP_REGION; ++i) + sort[i] = i; + + for (i = MLX4_NUM_QP_REGION; i > MLX4_QP_REGION_BOTTOM; --i) { + for (j = MLX4_QP_REGION_BOTTOM + 2; j < i; ++j) { + if (dev->caps.reserved_qps_cnt[sort[j]] > + dev->caps.reserved_qps_cnt[sort[j - 1]]) + swap(sort[j], sort[j - 1]); + } + } + + for (i = MLX4_QP_REGION_BOTTOM + 1; i < MLX4_NUM_QP_REGION; ++i) { + last_base -= dev->caps.reserved_qps_cnt[sort[i]]; + dev->caps.reserved_qps_base[sort[i]] = last_base; + reserved_from_top += + dev->caps.reserved_qps_cnt[sort[i]]; + } + } + + /* Reserve 8 real SQPs in both native and SRIOV modes. + * In addition, in SRIOV mode, reserve 8 proxy SQPs per function + * (for all PFs and VFs), and 8 corresponding tunnel QPs. + * Each proxy SQP works opposite its own tunnel QP. + * + * The QPs are arranged as follows: + * a. 8 real SQPs + * b. All the proxy SQPs (8 per function) + * c. 
All the tunnel QPs (8 per function) + */ + reserved_from_bot = mlx4_num_reserved_sqps(dev); + if (reserved_from_bot + reserved_from_top > dev->caps.num_qps) { + mlx4_err(dev, "Number of reserved QPs is higher than number of QPs\n"); + return -EINVAL; + } + + err = mlx4_create_zones(dev, reserved_from_bot, reserved_from_bot, + bottom_reserved_for_rss_bitmap, + fixed_reserved_from_bot_rv, + max_table_offset); + + if (err) + return err; + + if (mlx4_is_mfunc(dev)) { + /* for PPF use */ + dev->phys_caps.base_proxy_sqpn = dev->phys_caps.base_sqpn + 8; + dev->phys_caps.base_tunnel_sqpn = dev->phys_caps.base_sqpn + 8 + 8 * MLX4_MFUNC_MAX; + + /* In mfunc, calculate proxy and tunnel qp offsets for the PF here, + * since the PF does not call mlx4_slave_caps */ + dev->caps.spec_qps = kcalloc(dev->caps.num_ports, + sizeof(*dev->caps.spec_qps), + GFP_KERNEL); + if (!dev->caps.spec_qps) { + err = -ENOMEM; + goto err_mem; + } + + for (k = 0; k < dev->caps.num_ports; k++) { + dev->caps.spec_qps[k].qp0_proxy = dev->phys_caps.base_proxy_sqpn + + 8 * mlx4_master_func_num(dev) + k; + dev->caps.spec_qps[k].qp0_tunnel = dev->caps.spec_qps[k].qp0_proxy + 8 * MLX4_MFUNC_MAX; + dev->caps.spec_qps[k].qp1_proxy = dev->phys_caps.base_proxy_sqpn + + 8 * mlx4_master_func_num(dev) + MLX4_MAX_PORTS + k; + dev->caps.spec_qps[k].qp1_tunnel = dev->caps.spec_qps[k].qp1_proxy + 8 * MLX4_MFUNC_MAX; + } + } + + + err = mlx4_CONF_SPECIAL_QP(dev, dev->phys_caps.base_sqpn); + if (err) + goto err_mem; + + return err; + +err_mem: + kfree(dev->caps.spec_qps); + dev->caps.spec_qps = NULL; + mlx4_cleanup_qp_zones(dev); + return err; +} + +void mlx4_cleanup_qp_table(struct mlx4_dev *dev) +{ + if (mlx4_is_slave(dev)) + return; + + mlx4_CONF_SPECIAL_QP(dev, 0); + + mlx4_cleanup_qp_zones(dev); +} + +int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp, + struct mlx4_qp_context *context) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, qp->qpn, 0, + MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); + if (!err) + memcpy(context, mailbox->buf + 8, sizeof(*context)); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_qp_query); + +int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + struct mlx4_qp_context *context, + struct mlx4_qp *qp, enum mlx4_qp_state *qp_state) +{ + int err; + int i; + enum mlx4_qp_state states[] = { + MLX4_QP_STATE_RST, + MLX4_QP_STATE_INIT, + MLX4_QP_STATE_RTR, + MLX4_QP_STATE_RTS + }; + + for (i = 0; i < ARRAY_SIZE(states) - 1; i++) { + context->flags &= cpu_to_be32(~(0xf << 28)); + context->flags |= cpu_to_be32(states[i + 1] << 28); + if (states[i + 1] != MLX4_QP_STATE_RTR) + context->params2 &= ~cpu_to_be32(MLX4_QP_BIT_FPP); + err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1], + context, 0, 0, qp); + if (err) { + mlx4_err(dev, "Failed to bring QP to state: %d with error: %d\n", + states[i + 1], err); + return err; + } + + *qp_state = states[i + 1]; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_qp_to_ready); + +u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn) +{ + struct mlx4_qp_context context; + struct mlx4_qp qp; + int err; + + qp.qpn = qpn; + err = mlx4_qp_query(dev, &qp, &context); + if (!err) { + u32 dest_qpn = be32_to_cpu(context.remote_qpn) & 0xffffff; + u16 folded_dst = folded_qp(dest_qpn); + u16 folded_src = folded_qp(qpn); + + return (dest_qpn != qpn) ? 
+ ((folded_dst ^ folded_src) | 0xC000) : + folded_src | 0xC000; + } + return 0xdead; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/reset.c b/drivers/net/ethernet/mellanox/mlx4/reset.c new file mode 100644 index 0000000..0076d88 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/reset.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "mlx4.h" + +int mlx4_reset(struct mlx4_dev *dev) +{ + void __iomem *reset; + u32 *hca_header = NULL; + int pcie_cap; + u16 devctl; + u16 linkctl; + u16 vendor; + unsigned long end; + u32 sem; + int i; + int err = 0; + +#define MLX4_RESET_BASE 0xf0000 +#define MLX4_RESET_SIZE 0x400 +#define MLX4_SEM_OFFSET 0x3fc +#define MLX4_RESET_OFFSET 0x10 +#define MLX4_RESET_VALUE swab32(1) + +#define MLX4_SEM_TIMEOUT_JIFFIES (10 * HZ) +#define MLX4_RESET_TIMEOUT_JIFFIES (2 * HZ) + + /* + * Reset the chip. This is somewhat ugly because we have to + * save off the PCI header before reset and then restore it + * after the chip reboots. We skip config space offsets 22 + * and 23 since those have a special meaning. + */ + + /* Do we need to save off the full 4K PCI Express header?? 
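
To illustrate the save/skip/restore bookkeeping used around the reset, here is a toy user-space simulation over a fake 64-dword config image; it is not PCI code, only the indexing, and like the restore path further down it skips the two special dwords on save and writes the command register last.

/* sketch: save a 256-byte config image as 64 dwords, skipping the two
 * dwords treated as special above (indices 22 and 23), then restore the
 * standard header with PCI_COMMAND last so the device is re-enabled only
 * after everything else is back in place.
 */
#include <stdio.h>
#include <string.h>

#define PCI_COMMAND 0x04        /* byte offset of the command register */

int main(void)
{
        unsigned int config[64], saved[64] = { 0 };
        int i;

        for (i = 0; i < 64; ++i)
                config[i] = i;                  /* fake device config space */

        for (i = 0; i < 64; ++i) {
                if (i == 22 || i == 23)         /* special meaning, not saved */
                        continue;
                saved[i] = config[i];
        }

        memset(config, 0, sizeof(config));      /* "reset" wipes the header */

        for (i = 0; i < 16; ++i) {              /* standard header: first 64 bytes */
                if (i * 4 == PCI_COMMAND)
                        continue;               /* leave the command register for last */
                config[i] = saved[i];
        }
        config[PCI_COMMAND / 4] = saved[PCI_COMMAND / 4];

        printf("dword 1 (command) restored last: 0x%x\n", config[1]);
        return 0;
}
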
*/ + hca_header = kmalloc(256, GFP_KERNEL); + if (!hca_header) { + err = -ENOMEM; + mlx4_err(dev, "Couldn't allocate memory to save HCA PCI header, aborting\n"); + goto out; + } + + pcie_cap = pci_pcie_cap(dev->persist->pdev); + + for (i = 0; i < 64; ++i) { + if (i == 22 || i == 23) + continue; + if (pci_read_config_dword(dev->persist->pdev, i * 4, + hca_header + i)) { + err = -ENODEV; + mlx4_err(dev, "Couldn't save HCA PCI header, aborting\n"); + goto out; + } + } + + reset = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_RESET_BASE, + MLX4_RESET_SIZE); + if (!reset) { + err = -ENOMEM; + mlx4_err(dev, "Couldn't map HCA reset register, aborting\n"); + goto out; + } + + /* grab HW semaphore to lock out flash updates */ + end = jiffies + MLX4_SEM_TIMEOUT_JIFFIES; + do { + sem = readl(reset + MLX4_SEM_OFFSET); + if (!sem) + break; + + msleep(1); + } while (time_before(jiffies, end)); + + if (sem) { + mlx4_err(dev, "Failed to obtain HW semaphore, aborting\n"); + err = -EAGAIN; + iounmap(reset); + goto out; + } + + /* actually hit reset */ + writel(MLX4_RESET_VALUE, reset + MLX4_RESET_OFFSET); + iounmap(reset); + + /* Docs say to wait one second before accessing device */ + msleep(1000); + + end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES; + do { + if (!pci_read_config_word(dev->persist->pdev, PCI_VENDOR_ID, + &vendor) && vendor != 0xffff) + break; + + msleep(1); + } while (time_before(jiffies, end)); + + if (vendor == 0xffff) { + err = -ENODEV; + mlx4_err(dev, "PCI device did not come back after reset, aborting\n"); + goto out; + } + + /* Now restore the PCI headers */ + if (pcie_cap) { + devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4]; + if (pcie_capability_write_word(dev->persist->pdev, + PCI_EXP_DEVCTL, + devctl)) { + err = -ENODEV; + mlx4_err(dev, "Couldn't restore HCA PCI Express Device Control register, aborting\n"); + goto out; + } + linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4]; + if (pcie_capability_write_word(dev->persist->pdev, + PCI_EXP_LNKCTL, + linkctl)) { + err = -ENODEV; + mlx4_err(dev, "Couldn't restore HCA PCI Express Link control register, aborting\n"); + goto out; + } + } + + for (i = 0; i < 16; ++i) { + if (i * 4 == PCI_COMMAND) + continue; + + if (pci_write_config_dword(dev->persist->pdev, i * 4, + hca_header[i])) { + err = -ENODEV; + mlx4_err(dev, "Couldn't restore HCA reg %x, aborting\n", + i); + goto out; + } + } + + if (pci_write_config_dword(dev->persist->pdev, PCI_COMMAND, + hca_header[PCI_COMMAND / 4])) { + err = -ENODEV; + mlx4_err(dev, "Couldn't restore HCA COMMAND, aborting\n"); + goto out; + } + +out: + kfree(hca_header); + + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c new file mode 100644 index 0000000..29e50f7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -0,0 +1,5414 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. + * All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx4.h" +#include "fw.h" +#include "mlx4_stats.h" + +#define MLX4_MAC_VALID (1ull << 63) +#define MLX4_PF_COUNTERS_PER_PORT 2 +#define MLX4_VF_COUNTERS_PER_PORT 1 + +struct mac_res { + struct list_head list; + u64 mac; + int ref_count; + u8 smac_index; + u8 port; +}; + +struct vlan_res { + struct list_head list; + u16 vlan; + int ref_count; + int vlan_index; + u8 port; +}; + +struct res_common { + struct list_head list; + struct rb_node node; + u64 res_id; + int owner; + int state; + int from_state; + int to_state; + int removing; + const char *func_name; +}; + +enum { + RES_ANY_BUSY = 1 +}; + +struct res_gid { + struct list_head list; + u8 gid[16]; + enum mlx4_protocol prot; + enum mlx4_steer_type steer; + u64 reg_id; +}; + +enum res_qp_states { + RES_QP_BUSY = RES_ANY_BUSY, + + /* QP number was allocated */ + RES_QP_RESERVED, + + /* ICM memory for QP context was mapped */ + RES_QP_MAPPED, + + /* QP is in hw ownership */ + RES_QP_HW +}; + +struct res_qp { + struct res_common com; + struct res_mtt *mtt; + struct res_cq *rcq; + struct res_cq *scq; + struct res_srq *srq; + struct list_head mcg_list; + spinlock_t mcg_spl; + int local_qpn; + atomic_t ref_count; + u32 qpc_flags; + /* saved qp params before VST enforcement in order to restore on VGT */ + u8 sched_queue; + __be32 param3; + u8 vlan_control; + u8 fvl_rx; + u8 pri_path_fl; + u8 vlan_index; + u8 feup; +}; + +enum res_mtt_states { + RES_MTT_BUSY = RES_ANY_BUSY, + RES_MTT_ALLOCATED, +}; + +static inline const char *mtt_states_str(enum res_mtt_states state) +{ + switch (state) { + case RES_MTT_BUSY: return "RES_MTT_BUSY"; + case RES_MTT_ALLOCATED: return "RES_MTT_ALLOCATED"; + default: return "Unknown"; + } +} + +struct res_mtt { + struct res_common com; + int order; + atomic_t ref_count; +}; + +enum res_mpt_states { + RES_MPT_BUSY = RES_ANY_BUSY, + RES_MPT_RESERVED, + RES_MPT_MAPPED, + RES_MPT_HW, +}; + +struct res_mpt { + struct res_common com; + struct res_mtt *mtt; + int key; +}; + +enum res_eq_states { + RES_EQ_BUSY = RES_ANY_BUSY, + RES_EQ_RESERVED, + RES_EQ_HW, +}; + +struct res_eq { + struct res_common com; + struct res_mtt *mtt; +}; + +enum res_cq_states { + RES_CQ_BUSY = RES_ANY_BUSY, + 
RES_CQ_ALLOCATED, + RES_CQ_HW, +}; + +struct res_cq { + struct res_common com; + struct res_mtt *mtt; + atomic_t ref_count; +}; + +enum res_srq_states { + RES_SRQ_BUSY = RES_ANY_BUSY, + RES_SRQ_ALLOCATED, + RES_SRQ_HW, +}; + +struct res_srq { + struct res_common com; + struct res_mtt *mtt; + struct res_cq *cq; + atomic_t ref_count; +}; + +enum res_counter_states { + RES_COUNTER_BUSY = RES_ANY_BUSY, + RES_COUNTER_ALLOCATED, +}; + +struct res_counter { + struct res_common com; + int port; +}; + +enum res_xrcdn_states { + RES_XRCD_BUSY = RES_ANY_BUSY, + RES_XRCD_ALLOCATED, +}; + +struct res_xrcdn { + struct res_common com; + int port; +}; + +enum res_fs_rule_states { + RES_FS_RULE_BUSY = RES_ANY_BUSY, + RES_FS_RULE_ALLOCATED, +}; + +struct res_fs_rule { + struct res_common com; + int qpn; + /* VF DMFS mbox with port flipped */ + void *mirr_mbox; + /* > 0 --> apply mirror when getting into HA mode */ + /* = 0 --> un-apply mirror when getting out of HA mode */ + u32 mirr_mbox_size; + struct list_head mirr_list; + u64 mirr_rule_id; +}; + +static void *res_tracker_lookup(struct rb_root *root, u64 res_id) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct res_common *res = rb_entry(node, struct res_common, + node); + + if (res_id < res->res_id) + node = node->rb_left; + else if (res_id > res->res_id) + node = node->rb_right; + else + return res; + } + return NULL; +} + +static int res_tracker_insert(struct rb_root *root, struct res_common *res) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct res_common *this = rb_entry(*new, struct res_common, + node); + + parent = *new; + if (res->res_id < this->res_id) + new = &((*new)->rb_left); + else if (res->res_id > this->res_id) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&res->node, parent, new); + rb_insert_color(&res->node, root); + + return 0; +} + +enum qp_transition { + QP_TRANS_INIT2RTR, + QP_TRANS_RTR2RTS, + QP_TRANS_RTS2RTS, + QP_TRANS_SQERR2RTS, + QP_TRANS_SQD2SQD, + QP_TRANS_SQD2RTS +}; + +/* For Debug uses */ +static const char *resource_str(enum mlx4_resource rt) +{ + switch (rt) { + case RES_QP: return "RES_QP"; + case RES_CQ: return "RES_CQ"; + case RES_SRQ: return "RES_SRQ"; + case RES_MPT: return "RES_MPT"; + case RES_MTT: return "RES_MTT"; + case RES_MAC: return "RES_MAC"; + case RES_VLAN: return "RES_VLAN"; + case RES_EQ: return "RES_EQ"; + case RES_COUNTER: return "RES_COUNTER"; + case RES_FS_RULE: return "RES_FS_RULE"; + case RES_XRCD: return "RES_XRCD"; + default: return "Unknown resource type !!!"; + }; +} + +static void rem_slave_vlans(struct mlx4_dev *dev, int slave); +static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave, + enum mlx4_resource res_type, int count, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct resource_allocator *res_alloc = + &priv->mfunc.master.res_tracker.res_alloc[res_type]; + int err = -EDQUOT; + int allocated, free, reserved, guaranteed, from_free; + int from_rsvd; + + if (slave > dev->persist->num_vfs) + return -EINVAL; + + spin_lock(&res_alloc->alloc_lock); + allocated = (port > 0) ? + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] : + res_alloc->allocated[slave]; + free = (port > 0) ? res_alloc->res_port_free[port - 1] : + res_alloc->res_free; + reserved = (port > 0) ? 
res_alloc->res_port_rsvd[port - 1] : + res_alloc->res_reserved; + guaranteed = res_alloc->guaranteed[slave]; + + if (allocated + count > res_alloc->quota[slave]) { + mlx4_warn(dev, "VF %d port %d res %s: quota exceeded, count %d alloc %d quota %d\n", + slave, port, resource_str(res_type), count, + allocated, res_alloc->quota[slave]); + goto out; + } + + if (allocated + count <= guaranteed) { + err = 0; + from_rsvd = count; + } else { + /* portion may need to be obtained from free area */ + if (guaranteed - allocated > 0) + from_free = count - (guaranteed - allocated); + else + from_free = count; + + from_rsvd = count - from_free; + + if (free - from_free >= reserved) + err = 0; + else + mlx4_warn(dev, "VF %d port %d res %s: free pool empty, free %d from_free %d rsvd %d\n", + slave, port, resource_str(res_type), free, + from_free, reserved); + } + + if (!err) { + /* grant the request */ + if (port > 0) { + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] += count; + res_alloc->res_port_free[port - 1] -= count; + res_alloc->res_port_rsvd[port - 1] -= from_rsvd; + } else { + res_alloc->allocated[slave] += count; + res_alloc->res_free -= count; + res_alloc->res_reserved -= from_rsvd; + } + } + +out: + spin_unlock(&res_alloc->alloc_lock); + return err; +} + +static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave, + enum mlx4_resource res_type, int count, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct resource_allocator *res_alloc = + &priv->mfunc.master.res_tracker.res_alloc[res_type]; + int allocated, guaranteed, from_rsvd; + + if (slave > dev->persist->num_vfs) + return; + + spin_lock(&res_alloc->alloc_lock); + + allocated = (port > 0) ? + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] : + res_alloc->allocated[slave]; + guaranteed = res_alloc->guaranteed[slave]; + + if (allocated - count >= guaranteed) { + from_rsvd = 0; + } else { + /* portion may need to be returned to reserved area */ + if (allocated - guaranteed > 0) + from_rsvd = count - (allocated - guaranteed); + else + from_rsvd = count; + } + + if (port > 0) { + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] -= count; + res_alloc->res_port_free[port - 1] += count; + res_alloc->res_port_rsvd[port - 1] += from_rsvd; + } else { + res_alloc->allocated[slave] -= count; + res_alloc->res_free += count; + res_alloc->res_reserved += from_rsvd; + } + + spin_unlock(&res_alloc->alloc_lock); + return; +} + +static inline void initialize_res_quotas(struct mlx4_dev *dev, + struct resource_allocator *res_alloc, + enum mlx4_resource res_type, + int vf, int num_instances) +{ + res_alloc->guaranteed[vf] = num_instances / + (2 * (dev->persist->num_vfs + 1)); + res_alloc->quota[vf] = (num_instances / 2) + res_alloc->guaranteed[vf]; + if (vf == mlx4_master_func_num(dev)) { + res_alloc->res_free = num_instances; + if (res_type == RES_MTT) { + /* reserved mtts will be taken out of the PF allocation */ + res_alloc->res_free += dev->caps.reserved_mtts; + res_alloc->guaranteed[vf] += dev->caps.reserved_mtts; + res_alloc->quota[vf] += dev->caps.reserved_mtts; + } + } +} + +void mlx4_init_quotas(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int pf; + + /* quotas for VFs are initialized in mlx4_slave_cap */ + if (mlx4_is_slave(dev)) + return; + + if (!mlx4_is_mfunc(dev)) { + dev->quotas.qp = dev->caps.num_qps - dev->caps.reserved_qps - + mlx4_num_reserved_sqps(dev); + dev->quotas.cq = dev->caps.num_cqs - dev->caps.reserved_cqs; 
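
A quick sketch of the quota formula used by initialize_res_quotas() above, with example pool and VF counts; note that the real code additionally folds reserved MTTs into the PF's numbers for the RES_MTT case.

/* sketch: each function is guaranteed an equal slice of half the pool,
 * and its quota is half the pool on top of its own guarantee.
 */
#include <stdio.h>

int main(void)
{
        int num_instances = 64 * 1024;  /* example: usable QPs in the pool */
        int num_vfs = 7;                /* example: 7 VFs plus the PF */

        int guaranteed = num_instances / (2 * (num_vfs + 1));
        int quota = num_instances / 2 + guaranteed;

        printf("per function: guaranteed %d, quota %d\n", guaranteed, quota);
        return 0;
}
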
+ dev->quotas.srq = dev->caps.num_srqs - dev->caps.reserved_srqs; + dev->quotas.mtt = dev->caps.num_mtts - dev->caps.reserved_mtts; + dev->quotas.mpt = dev->caps.num_mpts - dev->caps.reserved_mrws; + return; + } + + pf = mlx4_master_func_num(dev); + dev->quotas.qp = + priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[pf]; + dev->quotas.cq = + priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[pf]; + dev->quotas.srq = + priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[pf]; + dev->quotas.mtt = + priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[pf]; + dev->quotas.mpt = + priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[pf]; +} + +static int get_max_gauranteed_vfs_counter(struct mlx4_dev *dev) +{ + /* reduce the sink counter */ + return (dev->caps.max_counters - 1 - + (MLX4_PF_COUNTERS_PER_PORT * MLX4_MAX_PORTS)) + / MLX4_MAX_PORTS; +} + +int mlx4_init_resource_tracker(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, j; + int t; + int max_vfs_guarantee_counter = get_max_gauranteed_vfs_counter(dev); + + priv->mfunc.master.res_tracker.slave_list = + kzalloc(dev->num_slaves * sizeof(struct slave_list), + GFP_KERNEL); + if (!priv->mfunc.master.res_tracker.slave_list) + return -ENOMEM; + + for (i = 0 ; i < dev->num_slaves; i++) { + for (t = 0; t < MLX4_NUM_OF_RESOURCE_TYPE; ++t) + INIT_LIST_HEAD(&priv->mfunc.master.res_tracker. + slave_list[i].res_list[t]); + mutex_init(&priv->mfunc.master.res_tracker.slave_list[i].mutex); + } + + mlx4_dbg(dev, "Started init_resource_tracker: %ld slaves\n", + dev->num_slaves); + for (i = 0 ; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) + priv->mfunc.master.res_tracker.res_tree[i] = RB_ROOT; + + for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { + struct resource_allocator *res_alloc = + &priv->mfunc.master.res_tracker.res_alloc[i]; + res_alloc->quota = kmalloc((dev->persist->num_vfs + 1) * + sizeof(int), GFP_KERNEL); + res_alloc->guaranteed = kmalloc((dev->persist->num_vfs + 1) * + sizeof(int), GFP_KERNEL); + if (i == RES_MAC || i == RES_VLAN) + res_alloc->allocated = kzalloc(MLX4_MAX_PORTS * + (dev->persist->num_vfs + + 1) * + sizeof(int), GFP_KERNEL); + else + res_alloc->allocated = kzalloc((dev->persist-> + num_vfs + 1) * + sizeof(int), GFP_KERNEL); + /* Reduce the sink counter */ + if (i == RES_COUNTER) + res_alloc->res_free = dev->caps.max_counters - 1; + + if (!res_alloc->quota || !res_alloc->guaranteed || + !res_alloc->allocated) + goto no_mem_err; + + spin_lock_init(&res_alloc->alloc_lock); + for (t = 0; t < dev->persist->num_vfs + 1; t++) { + struct mlx4_active_ports actv_ports = + mlx4_get_active_ports(dev, t); + switch (i) { + case RES_QP: + initialize_res_quotas(dev, res_alloc, RES_QP, + t, dev->caps.num_qps - + dev->caps.reserved_qps - + mlx4_num_reserved_sqps(dev)); + break; + case RES_CQ: + initialize_res_quotas(dev, res_alloc, RES_CQ, + t, dev->caps.num_cqs - + dev->caps.reserved_cqs); + break; + case RES_SRQ: + initialize_res_quotas(dev, res_alloc, RES_SRQ, + t, dev->caps.num_srqs - + dev->caps.reserved_srqs); + break; + case RES_MPT: + initialize_res_quotas(dev, res_alloc, RES_MPT, + t, dev->caps.num_mpts - + dev->caps.reserved_mrws); + break; + case RES_MTT: + initialize_res_quotas(dev, res_alloc, RES_MTT, + t, dev->caps.num_mtts - + dev->caps.reserved_mtts); + break; + case RES_MAC: + if (t == mlx4_master_func_num(dev)) { + int max_vfs_pport = 0; + /* Calculate the max vfs per port for */ + /* both ports. 
*/ + for (j = 0; j < dev->caps.num_ports; + j++) { + struct mlx4_slaves_pport slaves_pport = + mlx4_phys_to_slaves_pport(dev, j + 1); + unsigned current_slaves = + bitmap_weight(slaves_pport.slaves, + dev->caps.num_ports) - 1; + if (max_vfs_pport < current_slaves) + max_vfs_pport = + current_slaves; + } + res_alloc->quota[t] = + MLX4_MAX_MAC_NUM - + 2 * max_vfs_pport; + res_alloc->guaranteed[t] = 2; + for (j = 0; j < MLX4_MAX_PORTS; j++) + res_alloc->res_port_free[j] = + MLX4_MAX_MAC_NUM; + } else { + res_alloc->quota[t] = MLX4_MAX_MAC_NUM; + res_alloc->guaranteed[t] = 2; + } + break; + case RES_VLAN: + if (t == mlx4_master_func_num(dev)) { + res_alloc->quota[t] = MLX4_MAX_VLAN_NUM; + res_alloc->guaranteed[t] = MLX4_MAX_VLAN_NUM / 2; + for (j = 0; j < MLX4_MAX_PORTS; j++) + res_alloc->res_port_free[j] = + res_alloc->quota[t]; + } else { + res_alloc->quota[t] = MLX4_MAX_VLAN_NUM / 2; + res_alloc->guaranteed[t] = 0; + } + break; + case RES_COUNTER: + res_alloc->quota[t] = dev->caps.max_counters; + if (t == mlx4_master_func_num(dev)) + res_alloc->guaranteed[t] = + MLX4_PF_COUNTERS_PER_PORT * + MLX4_MAX_PORTS; + else if (t <= max_vfs_guarantee_counter) + res_alloc->guaranteed[t] = + MLX4_VF_COUNTERS_PER_PORT * + MLX4_MAX_PORTS; + else + res_alloc->guaranteed[t] = 0; + break; + default: + break; + } + if (i == RES_MAC || i == RES_VLAN) { + for (j = 0; j < dev->caps.num_ports; j++) + if (test_bit(j, actv_ports.ports)) + res_alloc->res_port_rsvd[j] += + res_alloc->guaranteed[t]; + } else { + res_alloc->res_reserved += res_alloc->guaranteed[t]; + } + } + } + spin_lock_init(&priv->mfunc.master.res_tracker.lock); + return 0; + +no_mem_err: + for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { + kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated); + priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed); + priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota); + priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL; + } + return -ENOMEM; +} + +void mlx4_free_resource_tracker(struct mlx4_dev *dev, + enum mlx4_res_tracker_free_type type) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + if (priv->mfunc.master.res_tracker.slave_list) { + if (type != RES_TR_FREE_STRUCTS_ONLY) { + for (i = 0; i < dev->num_slaves; i++) { + if (type == RES_TR_FREE_ALL || + dev->caps.function != i) + mlx4_delete_all_resources_for_slave(dev, i); + } + /* free master's vlans */ + i = dev->caps.function; + mlx4_reset_roce_gids(dev, i); + mutex_lock(&priv->mfunc.master.res_tracker.slave_list[i].mutex); + rem_slave_vlans(dev, i); + mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[i].mutex); + } + + if (type != RES_TR_FREE_SLAVES_ONLY) { + for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { + kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated); + priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed); + priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL; + kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota); + priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL; + } + kfree(priv->mfunc.master.res_tracker.slave_list); + priv->mfunc.master.res_tracker.slave_list = NULL; + } + } +} + +static void update_pkey_index(struct mlx4_dev *dev, int slave, + struct mlx4_cmd_mailbox *inbox) +{ + u8 sched = *(u8 *)(inbox->buf + 64); + u8 orig_index = *(u8 *)(inbox->buf + 35); + u8 
new_index; + struct mlx4_priv *priv = mlx4_priv(dev); + int port; + + port = (sched >> 6 & 1) + 1; + + new_index = priv->virt2phys_pkey[slave][port - 1][orig_index]; + *(u8 *)(inbox->buf + 35) = new_index; +} + +static void update_gid(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox, + u8 slave) +{ + struct mlx4_qp_context *qp_ctx = inbox->buf + 8; + enum mlx4_qp_optpar optpar = be32_to_cpu(*(__be32 *) inbox->buf); + u32 ts = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; + int port; + + if (MLX4_QP_ST_UD == ts) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) + qp_ctx->pri_path.mgid_index = + mlx4_get_base_gid_ix(dev, slave, port) | 0x80; + else + qp_ctx->pri_path.mgid_index = slave | 0x80; + + } else if (MLX4_QP_ST_RC == ts || MLX4_QP_ST_XRC == ts || MLX4_QP_ST_UC == ts) { + if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) { + qp_ctx->pri_path.mgid_index += + mlx4_get_base_gid_ix(dev, slave, port); + qp_ctx->pri_path.mgid_index &= 0x7f; + } else { + qp_ctx->pri_path.mgid_index = slave & 0x7F; + } + } + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) { + qp_ctx->alt_path.mgid_index += + mlx4_get_base_gid_ix(dev, slave, port); + qp_ctx->alt_path.mgid_index &= 0x7f; + } else { + qp_ctx->alt_path.mgid_index = slave & 0x7F; + } + } + } +} + +static int handle_counter(struct mlx4_dev *dev, struct mlx4_qp_context *qpc, + u8 slave, int port); + +static int update_vport_qp_param(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *inbox, + u8 slave, u32 qpn) +{ + struct mlx4_qp_context *qpc = inbox->buf + 8; + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_priv *priv; + u32 qp_type; + int port, err = 0; + + port = (qpc->pri_path.sched_queue & 0x40) ? 2 : 1; + priv = mlx4_priv(dev); + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + qp_type = (be32_to_cpu(qpc->flags) >> 16) & 0xff; + + err = handle_counter(dev, qpc, slave, port); + if (err) + goto out; + + if (MLX4_VGT != vp_oper->state.default_vlan) { + /* the reserved QPs (special, proxy, tunnel) + * do not operate over vlans + */ + if (mlx4_is_qp_reserved(dev, qpn)) + return 0; + + /* force strip vlan by clear vsd, MLX QP refers to Raw Ethernet */ + if (qp_type == MLX4_QP_ST_UD || + (qp_type == MLX4_QP_ST_MLX && mlx4_is_eth(dev, port))) { + if (dev->caps.bmme_flags & MLX4_BMME_FLAG_VSD_INIT2RTR) { + *(__be32 *)inbox->buf = + cpu_to_be32(be32_to_cpu(*(__be32 *)inbox->buf) | + MLX4_QP_OPTPAR_VLAN_STRIPPING); + qpc->param3 &= ~cpu_to_be32(MLX4_STRIP_VLAN); + } else { + struct mlx4_update_qp_params params = {.flags = 0}; + + err = mlx4_update_qp(dev, qpn, MLX4_UPDATE_QP_VSD, &params); + if (err) + goto out; + } + } + + /* preserve IF_COUNTER flag */ + qpc->pri_path.vlan_control &= + MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER; + if (vp_oper->state.link_state == IFLA_VF_LINK_STATE_DISABLE && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP) { + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; + } else if (0 != vp_oper->state.default_vlan) { + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) { + /* vst QinQ should block untagged on TX, + * but cvlan is in payload and phv is set so + * hw sees it as untagged. 
Block tagged instead. + */ + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + } else { /* vst 802.1Q */ + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + } + } else { /* priority tagged */ + qpc->pri_path.vlan_control |= + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; + } + + qpc->pri_path.fvl_rx |= MLX4_FVL_RX_FORCE_ETH_VLAN; + qpc->pri_path.vlan_index = vp_oper->vlan_idx; + qpc->pri_path.fl |= MLX4_FL_ETH_HIDE_CQE_VLAN; + if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) + qpc->pri_path.fl |= MLX4_FL_SV; + else + qpc->pri_path.fl |= MLX4_FL_CV; + qpc->pri_path.feup |= MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN; + qpc->pri_path.sched_queue &= 0xC7; + qpc->pri_path.sched_queue |= (vp_oper->state.default_qos) << 3; + qpc->qos_vport = vp_oper->state.qos_vport; + } + if (vp_oper->state.spoofchk) { + qpc->pri_path.feup |= MLX4_FSM_FORCE_ETH_SRC_MAC; + qpc->pri_path.grh_mylmc = (0x80 & qpc->pri_path.grh_mylmc) + vp_oper->mac_idx; + } +out: + return err; +} + +static int mpt_mask(struct mlx4_dev *dev) +{ + return dev->caps.num_mpts - 1; +} + +static const char *mlx4_resource_type_to_str(enum mlx4_resource t) +{ + switch (t) { + case RES_QP: + return "QP"; + case RES_CQ: + return "CQ"; + case RES_SRQ: + return "SRQ"; + case RES_XRCD: + return "XRCD"; + case RES_MPT: + return "MPT"; + case RES_MTT: + return "MTT"; + case RES_MAC: + return "MAC"; + case RES_VLAN: + return "VLAN"; + case RES_COUNTER: + return "COUNTER"; + case RES_FS_RULE: + return "FS_RULE"; + case RES_EQ: + return "EQ"; + default: + return "INVALID RESOURCE"; + } +} + +static void *find_res(struct mlx4_dev *dev, u64 res_id, + enum mlx4_resource type) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return res_tracker_lookup(&priv->mfunc.master.res_tracker.res_tree[type], + res_id); +} + +static int _get_res(struct mlx4_dev *dev, int slave, u64 res_id, + enum mlx4_resource type, + void *res, const char *func_name) +{ + struct res_common *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = find_res(dev, res_id, type); + if (!r) { + err = -ENONET; + goto exit; + } + + if (r->state == RES_ANY_BUSY) { + mlx4_warn(dev, + "%s(%d) trying to get resource %llx of type %s, but it's already taken by %s\n", + func_name, slave, res_id, mlx4_resource_type_to_str(type), + r->func_name); + err = -EBUSY; + goto exit; + } + + if (r->owner != slave) { + err = -EPERM; + goto exit; + } + + r->from_state = r->state; + r->state = RES_ANY_BUSY; + r->func_name = func_name; + + if (res) + *((struct res_common **)res) = r; + +exit: + spin_unlock_irq(mlx4_tlock(dev)); + return err; +} + +#define get_res(dev, slave, res_id, type, res) \ + _get_res((dev), (slave), (res_id), (type), (res), __func__) + +int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev, + enum mlx4_resource type, + u64 res_id, int *slave) +{ + + struct res_common *r; + int err = -ENOENT; + int id = res_id; + + if (type == RES_QP) + id &= 0x7fffff; + spin_lock(mlx4_tlock(dev)); + + r = find_res(dev, id, type); + if (r) { + *slave = r->owner; + err = 0; + } + spin_unlock(mlx4_tlock(dev)); + + return err; +} + +static void put_res(struct mlx4_dev *dev, int slave, u64 res_id, + enum mlx4_resource type) +{ + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = find_res(dev, res_id, 
type); + if (r) { + r->state = r->from_state; + r->func_name = ""; + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int port); + +static int handle_existing_counter(struct mlx4_dev *dev, u8 slave, int port, + int counter_index) +{ + struct res_common *r; + struct res_counter *counter; + int ret = 0; + + if (counter_index == MLX4_SINK_COUNTER_INDEX(dev)) + return ret; + + spin_lock_irq(mlx4_tlock(dev)); + r = find_res(dev, counter_index, RES_COUNTER); + if (!r || r->owner != slave) { + ret = -EINVAL; + } else { + counter = container_of(r, struct res_counter, com); + if (!counter->port) + counter->port = port; + } + + spin_unlock_irq(mlx4_tlock(dev)); + return ret; +} + +static int handle_unexisting_counter(struct mlx4_dev *dev, + struct mlx4_qp_context *qpc, u8 slave, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *tmp; + struct res_counter *counter; + u64 counter_idx = MLX4_SINK_COUNTER_INDEX(dev); + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry(tmp, + &tracker->slave_list[slave].res_list[RES_COUNTER], + list) { + counter = container_of(tmp, struct res_counter, com); + if (port == counter->port) { + qpc->pri_path.counter_index = counter->com.res_id; + spin_unlock_irq(mlx4_tlock(dev)); + return 0; + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + /* No existing counter, need to allocate a new counter */ + err = counter_alloc_res(dev, slave, RES_OP_RESERVE, 0, 0, &counter_idx, + port); + if (err == -ENOENT) { + err = 0; + } else if (err && err != -ENOSPC) { + mlx4_err(dev, "%s: failed to create new counter for slave %d err %d\n", + __func__, slave, err); + } else { + qpc->pri_path.counter_index = counter_idx; + mlx4_dbg(dev, "%s: alloc new counter for slave %d index %d\n", + __func__, slave, qpc->pri_path.counter_index); + err = 0; + } + + return err; +} + +static int handle_counter(struct mlx4_dev *dev, struct mlx4_qp_context *qpc, + u8 slave, int port) +{ + if (qpc->pri_path.counter_index != MLX4_SINK_COUNTER_INDEX(dev)) + return handle_existing_counter(dev, slave, port, + qpc->pri_path.counter_index); + + return handle_unexisting_counter(dev, qpc, slave, port); +} + +static struct res_common *alloc_qp_tr(int id) +{ + struct res_qp *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_QP_RESERVED; + ret->local_qpn = id; + INIT_LIST_HEAD(&ret->mcg_list); + spin_lock_init(&ret->mcg_spl); + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_mtt_tr(int id, int order) +{ + struct res_mtt *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->order = order; + ret->com.state = RES_MTT_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_mpt_tr(int id, int key) +{ + struct res_mpt *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_MPT_RESERVED; + ret->key = key; + + return &ret->com; +} + +static struct res_common *alloc_eq_tr(int id) +{ + struct res_eq *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_EQ_RESERVED; + + return &ret->com; +} + +static struct res_common *alloc_cq_tr(int id) +{ + 
struct res_cq *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_CQ_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_srq_tr(int id) +{ + struct res_srq *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_SRQ_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_counter_tr(int id, int port) +{ + struct res_counter *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_COUNTER_ALLOCATED; + ret->port = port; + + return &ret->com; +} + +static struct res_common *alloc_xrcdn_tr(int id) +{ + struct res_xrcdn *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_XRCD_ALLOCATED; + + return &ret->com; +} + +static struct res_common *alloc_fs_rule_tr(u64 id, int qpn) +{ + struct res_fs_rule *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_FS_RULE_ALLOCATED; + ret->qpn = qpn; + return &ret->com; +} + +static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave, + int extra) +{ + struct res_common *ret; + + switch (type) { + case RES_QP: + ret = alloc_qp_tr(id); + break; + case RES_MPT: + ret = alloc_mpt_tr(id, extra); + break; + case RES_MTT: + ret = alloc_mtt_tr(id, extra); + break; + case RES_EQ: + ret = alloc_eq_tr(id); + break; + case RES_CQ: + ret = alloc_cq_tr(id); + break; + case RES_SRQ: + ret = alloc_srq_tr(id); + break; + case RES_MAC: + pr_err("implementation missing\n"); + return NULL; + case RES_COUNTER: + ret = alloc_counter_tr(id, extra); + break; + case RES_XRCD: + ret = alloc_xrcdn_tr(id); + break; + case RES_FS_RULE: + ret = alloc_fs_rule_tr(id, extra); + break; + default: + return NULL; + } + if (ret) + ret->owner = slave; + + return ret; +} + +int mlx4_calc_vf_counters(struct mlx4_dev *dev, int slave, int port, + struct mlx4_counter *data) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *tmp; + struct res_counter *counter; + int *counters_arr; + int i = 0, err = 0; + + memset(data, 0, sizeof(*data)); + + counters_arr = kmalloc_array(dev->caps.max_counters, + sizeof(*counters_arr), GFP_KERNEL); + if (!counters_arr) + return -ENOMEM; + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry(tmp, + &tracker->slave_list[slave].res_list[RES_COUNTER], + list) { + counter = container_of(tmp, struct res_counter, com); + if (counter->port == port) { + counters_arr[i] = (int)tmp->res_id; + i++; + } + } + spin_unlock_irq(mlx4_tlock(dev)); + counters_arr[i] = -1; + + i = 0; + + while (counters_arr[i] != -1) { + err = mlx4_get_counter_stats(dev, counters_arr[i], data, + 0); + if (err) { + memset(data, 0, sizeof(*data)); + goto table_changed; + } + i++; + } + +table_changed: + kfree(counters_arr); + return 0; +} + +static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count, + enum mlx4_resource type, int extra) +{ + int i; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + struct res_common **res_arr; + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct rb_root *root = &tracker->res_tree[type]; + + res_arr = kcalloc(count, sizeof(*res_arr), GFP_KERNEL); + if 
(!res_arr) + return -ENOMEM; + + for (i = 0; i < count; ++i) { + res_arr[i] = alloc_tr(base + i, type, slave, extra); + if (!res_arr[i]) { + for (--i; i >= 0; --i) + kfree(res_arr[i]); + + kfree(res_arr); + return -ENOMEM; + } + } + + spin_lock_irq(mlx4_tlock(dev)); + for (i = 0; i < count; ++i) { + if (find_res(dev, base + i, type)) { + err = -EEXIST; + goto undo; + } + err = res_tracker_insert(root, res_arr[i]); + if (err) + goto undo; + list_add_tail(&res_arr[i]->list, + &tracker->slave_list[slave].res_list[type]); + } + spin_unlock_irq(mlx4_tlock(dev)); + kfree(res_arr); + + return 0; + +undo: + for (--i; i >= 0; --i) { + rb_erase(&res_arr[i]->node, root); + list_del_init(&res_arr[i]->list); + } + + spin_unlock_irq(mlx4_tlock(dev)); + + for (i = 0; i < count; ++i) + kfree(res_arr[i]); + + kfree(res_arr); + + return err; +} + +static int remove_qp_ok(struct res_qp *res) +{ + if (res->com.state == RES_QP_BUSY || atomic_read(&res->ref_count) || + !list_empty(&res->mcg_list)) { + pr_err("resource tracker: fail to remove qp, state %d, ref_count %d\n", + res->com.state, atomic_read(&res->ref_count)); + return -EBUSY; + } else if (res->com.state != RES_QP_RESERVED) { + return -EPERM; + } + + return 0; +} + +static int remove_mtt_ok(struct res_mtt *res, int order) +{ + if (res->com.state == RES_MTT_BUSY || + atomic_read(&res->ref_count)) { + pr_devel("%s-%d: state %s, ref_count %d\n", + __func__, __LINE__, + mtt_states_str(res->com.state), + atomic_read(&res->ref_count)); + return -EBUSY; + } else if (res->com.state != RES_MTT_ALLOCATED) + return -EPERM; + else if (res->order != order) + return -EINVAL; + + return 0; +} + +static int remove_mpt_ok(struct res_mpt *res) +{ + if (res->com.state == RES_MPT_BUSY) + return -EBUSY; + else if (res->com.state != RES_MPT_RESERVED) + return -EPERM; + + return 0; +} + +static int remove_eq_ok(struct res_eq *res) +{ + if (res->com.state == RES_MPT_BUSY) + return -EBUSY; + else if (res->com.state != RES_MPT_RESERVED) + return -EPERM; + + return 0; +} + +static int remove_counter_ok(struct res_counter *res) +{ + if (res->com.state == RES_COUNTER_BUSY) + return -EBUSY; + else if (res->com.state != RES_COUNTER_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_xrcdn_ok(struct res_xrcdn *res) +{ + if (res->com.state == RES_XRCD_BUSY) + return -EBUSY; + else if (res->com.state != RES_XRCD_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_fs_rule_ok(struct res_fs_rule *res) +{ + if (res->com.state == RES_FS_RULE_BUSY) + return -EBUSY; + else if (res->com.state != RES_FS_RULE_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_cq_ok(struct res_cq *res) +{ + if (res->com.state == RES_CQ_BUSY) + return -EBUSY; + else if (res->com.state != RES_CQ_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_srq_ok(struct res_srq *res) +{ + if (res->com.state == RES_SRQ_BUSY) + return -EBUSY; + else if (res->com.state != RES_SRQ_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_ok(struct res_common *res, enum mlx4_resource type, int extra) +{ + switch (type) { + case RES_QP: + return remove_qp_ok((struct res_qp *)res); + case RES_CQ: + return remove_cq_ok((struct res_cq *)res); + case RES_SRQ: + return remove_srq_ok((struct res_srq *)res); + case RES_MPT: + return remove_mpt_ok((struct res_mpt *)res); + case RES_MTT: + return remove_mtt_ok((struct res_mtt *)res, extra); + case RES_MAC: + return -EOPNOTSUPP; + case RES_EQ: + return remove_eq_ok((struct res_eq *)res); + case RES_COUNTER: + return 
remove_counter_ok((struct res_counter *)res); + case RES_XRCD: + return remove_xrcdn_ok((struct res_xrcdn *)res); + case RES_FS_RULE: + return remove_fs_rule_ok((struct res_fs_rule *)res); + default: + return -EINVAL; + } +} + +static int rem_res_range(struct mlx4_dev *dev, int slave, u64 base, int count, + enum mlx4_resource type, int extra) +{ + u64 i; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + for (i = base; i < base + count; ++i) { + r = res_tracker_lookup(&tracker->res_tree[type], i); + if (!r) { + err = -ENOENT; + goto out; + } + if (r->owner != slave) { + err = -EPERM; + goto out; + } + err = remove_ok(r, type, extra); + if (err) + goto out; + } + + for (i = base; i < base + count; ++i) { + r = res_tracker_lookup(&tracker->res_tree[type], i); + rb_erase(&r->node, &tracker->res_tree[type]); + list_del(&r->list); + kfree(r); + } + err = 0; + +out: + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn, + enum res_qp_states state, struct res_qp **qp, + int alloc) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_qp *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_QP], qpn); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_QP_BUSY: + mlx4_dbg(dev, "%s: failed RES_QP, 0x%llx\n", + __func__, r->com.res_id); + err = -EBUSY; + break; + + case RES_QP_RESERVED: + if (r->com.state == RES_QP_MAPPED && !alloc) + break; + + mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", r->com.res_id); + err = -EINVAL; + break; + + case RES_QP_MAPPED: + if ((r->com.state == RES_QP_RESERVED && alloc) || + r->com.state == RES_QP_HW) + break; + else { + mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", + r->com.res_id); + err = -EINVAL; + } + + break; + + case RES_QP_HW: + if (r->com.state != RES_QP_MAPPED) + err = -EINVAL; + break; + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_QP_BUSY; + if (qp) + *qp = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int mr_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_mpt_states state, struct res_mpt **mpt) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_mpt *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_MPT], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_MPT_BUSY: + err = -EINVAL; + break; + + case RES_MPT_RESERVED: + if (r->com.state != RES_MPT_MAPPED) + err = -EINVAL; + break; + + case RES_MPT_MAPPED: + if (r->com.state != RES_MPT_RESERVED && + r->com.state != RES_MPT_HW) + err = -EINVAL; + break; + + case RES_MPT_HW: + if (r->com.state != RES_MPT_MAPPED) + err = -EINVAL; + break; + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_MPT_BUSY; + if (mpt) + *mpt = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int eq_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_eq_states state, 
struct res_eq **eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_eq *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_EQ], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_EQ_BUSY: + err = -EINVAL; + break; + + case RES_EQ_RESERVED: + if (r->com.state != RES_EQ_HW) + err = -EINVAL; + break; + + case RES_EQ_HW: + if (r->com.state != RES_EQ_RESERVED) + err = -EINVAL; + break; + + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_EQ_BUSY; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + if (!err && eq) + *eq = r; + + return err; +} + +static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn, + enum res_cq_states state, struct res_cq **cq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_cq *r; + int err; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_CQ], cqn); + if (!r) { + err = -ENOENT; + } else if (r->com.owner != slave) { + err = -EPERM; + } else if (state == RES_CQ_ALLOCATED) { + if (r->com.state != RES_CQ_HW) + err = -EINVAL; + else if (atomic_read(&r->ref_count)) + err = -EBUSY; + else + err = 0; + } else if (state != RES_CQ_HW || r->com.state != RES_CQ_ALLOCATED) { + err = -EINVAL; + } else { + err = 0; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_CQ_BUSY; + if (cq) + *cq = r; + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_srq_states state, struct res_srq **srq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_srq *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[RES_SRQ], index); + if (!r) { + err = -ENOENT; + } else if (r->com.owner != slave) { + err = -EPERM; + } else if (state == RES_SRQ_ALLOCATED) { + if (r->com.state != RES_SRQ_HW) + err = -EINVAL; + else if (atomic_read(&r->ref_count)) + err = -EBUSY; + } else if (state != RES_SRQ_HW || r->com.state != RES_SRQ_ALLOCATED) { + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_SRQ_BUSY; + if (srq) + *srq = r; + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static void res_abort_move(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[type], id); + if (r && (r->owner == slave)) + r->state = r->from_state; + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void res_end_move(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = res_tracker_lookup(&tracker->res_tree[type], id); + if (r && (r->owner == slave)) + r->state = r->to_state; + spin_unlock_irq(mlx4_tlock(dev)); +} + +static int 
valid_reserved(struct mlx4_dev *dev, int slave, int qpn) +{ + return mlx4_is_qp_reserved(dev, qpn) && + (mlx4_is_master(dev) || mlx4_is_guest_proxy(dev, slave, qpn)); +} + +static int fw_reserved(struct mlx4_dev *dev, int qpn) +{ + return qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; +} + +static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err; + int count; + int align; + int base; + int qpn; + u8 flags; + + switch (op) { + case RES_OP_RESERVE: + count = get_param_l(&in_param) & 0xffffff; + /* Turn off all unsupported QP allocation flags that the + * slave tries to set. + */ + flags = (get_param_l(&in_param) >> 24) & dev->caps.alloc_res_qp_mask; + align = get_param_h(&in_param); + err = mlx4_grant_resource(dev, slave, RES_QP, count, 0); + if (err) + return err; + + err = __mlx4_qp_reserve_range(dev, count, align, &base, flags); + if (err) { + mlx4_release_resource(dev, slave, RES_QP, count, 0); + return err; + } + + err = add_res_range(dev, slave, base, count, RES_QP, 0); + if (err) { + mlx4_release_resource(dev, slave, RES_QP, count, 0); + __mlx4_qp_release_range(dev, base, count); + return err; + } + set_param_l(out_param, base); + break; + case RES_OP_MAP_ICM: + qpn = get_param_l(&in_param) & 0x7fffff; + if (valid_reserved(dev, slave, qpn)) { + err = add_res_range(dev, slave, qpn, 1, RES_QP, 0); + if (err) + return err; + } + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, + NULL, 1); + if (err) + return err; + + if (!fw_reserved(dev, qpn)) { + err = __mlx4_qp_alloc_icm(dev, qpn); + if (err) { + res_abort_move(dev, slave, RES_QP, qpn); + return err; + } + } + + res_end_move(dev, slave, RES_QP, qpn); + break; + + default: + err = -EINVAL; + break; + } + return err; +} + +static int mtt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int base; + int order; + + if (op != RES_OP_RESERVE_AND_MAP) + return err; + + order = get_param_l(&in_param); + + err = mlx4_grant_resource(dev, slave, RES_MTT, 1 << order, 0); + if (err) + return err; + + base = __mlx4_alloc_mtt_range(dev, order); + if (base == -1) { + mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0); + return -ENOMEM; + } + + err = add_res_range(dev, slave, base, 1, RES_MTT, order); + if (err) { + mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0); + __mlx4_free_mtt_range(dev, base, order); + } else { + set_param_l(out_param, base); + } + + return err; +} + +static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int index; + int id; + struct res_mpt *mpt; + + switch (op) { + case RES_OP_RESERVE: + err = mlx4_grant_resource(dev, slave, RES_MPT, 1, 0); + if (err) + break; + + index = __mlx4_mpt_reserve(dev); + if (index == -1) { + mlx4_release_resource(dev, slave, RES_MPT, 1, 0); + break; + } + id = index & mpt_mask(dev); + + err = add_res_range(dev, slave, id, 1, RES_MPT, index); + if (err) { + mlx4_release_resource(dev, slave, RES_MPT, 1, 0); + __mlx4_mpt_release(dev, index); + break; + } + set_param_l(out_param, index); + break; + case RES_OP_MAP_ICM: + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, + RES_MPT_MAPPED, &mpt); + if (err) + return err; + + err = __mlx4_mpt_alloc_icm(dev, mpt->key); + if (err) { + res_abort_move(dev, slave, RES_MPT, id); + return err; + } + + res_end_move(dev, slave, RES_MPT, id); + break; + } + return err; 
+} + +static int cq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int cqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + err = mlx4_grant_resource(dev, slave, RES_CQ, 1, 0); + if (err) + break; + + err = __mlx4_cq_alloc_icm(dev, &cqn); + if (err) { + mlx4_release_resource(dev, slave, RES_CQ, 1, 0); + break; + } + + err = add_res_range(dev, slave, cqn, 1, RES_CQ, 0); + if (err) { + mlx4_release_resource(dev, slave, RES_CQ, 1, 0); + __mlx4_cq_free_icm(dev, cqn); + break; + } + + set_param_l(out_param, cqn); + break; + + default: + err = -EINVAL; + } + + return err; +} + +static int srq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int srqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + err = mlx4_grant_resource(dev, slave, RES_SRQ, 1, 0); + if (err) + break; + + err = __mlx4_srq_alloc_icm(dev, &srqn); + if (err) { + mlx4_release_resource(dev, slave, RES_SRQ, 1, 0); + break; + } + + err = add_res_range(dev, slave, srqn, 1, RES_SRQ, 0); + if (err) { + mlx4_release_resource(dev, slave, RES_SRQ, 1, 0); + __mlx4_srq_free_icm(dev, srqn); + break; + } + + set_param_l(out_param, srqn); + break; + + default: + err = -EINVAL; + } + + return err; +} + +static int mac_find_smac_ix_in_slave(struct mlx4_dev *dev, int slave, int port, + u8 smac_index, u64 *mac) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->smac_index == smac_index && res->port == (u8) port) { + *mac = res->mac; + return 0; + } + } + return -ENOENT; +} + +static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port, u8 smac_index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->mac == mac && res->port == (u8) port) { + /* mac found. 
update ref count */ + ++res->ref_count; + return 0; + } + } + + if (mlx4_grant_resource(dev, slave, RES_MAC, 1, port)) + return -EINVAL; + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + mlx4_release_resource(dev, slave, RES_MAC, 1, port); + return -ENOMEM; + } + res->mac = mac; + res->port = (u8) port; + res->smac_index = smac_index; + res->ref_count = 1; + list_add_tail(&res->list, + &tracker->slave_list[slave].res_list[RES_MAC]); + return 0; +} + +static void mac_del_from_slave(struct mlx4_dev *dev, int slave, u64 mac, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->mac == mac && res->port == (u8) port) { + if (!--res->ref_count) { + list_del(&res->list); + mlx4_release_resource(dev, slave, RES_MAC, 1, port); + kfree(res); + } + break; + } + } +} + +static void rem_slave_macs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + int i; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + list_del(&res->list); + /* dereference the mac the num times the slave referenced it */ + for (i = 0; i < res->ref_count; i++) + __mlx4_unregister_mac(dev, res->port, res->mac); + mlx4_release_resource(dev, slave, RES_MAC, 1, res->port); + kfree(res); + } +} + +static int mac_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int in_port) +{ + int err = -EINVAL; + int port; + u64 mac; + u8 smac_index; + + if (op != RES_OP_RESERVE_AND_MAP) + return err; + + port = !in_port ? get_param_l(out_param) : in_port; + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; + mac = in_param; + + err = __mlx4_register_mac(dev, port, mac); + if (err >= 0) { + smac_index = err; + set_param_l(out_param, err); + err = 0; + } + + if (!err) { + err = mac_add_to_slave(dev, slave, mac, port, smac_index); + if (err) + __mlx4_unregister_mac(dev, port, mac); + } + return err; +} + +static int vlan_add_to_slave(struct mlx4_dev *dev, int slave, u16 vlan, + int port, int vlan_index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *vlan_list = + &tracker->slave_list[slave].res_list[RES_VLAN]; + struct vlan_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, vlan_list, list) { + if (res->vlan == vlan && res->port == (u8) port) { + /* vlan found. 
update ref count */ + ++res->ref_count; + return 0; + } + } + + if (mlx4_grant_resource(dev, slave, RES_VLAN, 1, port)) + return -EINVAL; + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + mlx4_release_resource(dev, slave, RES_VLAN, 1, port); + return -ENOMEM; + } + res->vlan = vlan; + res->port = (u8) port; + res->vlan_index = vlan_index; + res->ref_count = 1; + list_add_tail(&res->list, + &tracker->slave_list[slave].res_list[RES_VLAN]); + return 0; +} + + +static void vlan_del_from_slave(struct mlx4_dev *dev, int slave, u16 vlan, + int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *vlan_list = + &tracker->slave_list[slave].res_list[RES_VLAN]; + struct vlan_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, vlan_list, list) { + if (res->vlan == vlan && res->port == (u8) port) { + if (!--res->ref_count) { + list_del(&res->list); + mlx4_release_resource(dev, slave, RES_VLAN, + 1, port); + kfree(res); + } + break; + } + } +} + +static void rem_slave_vlans(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *vlan_list = + &tracker->slave_list[slave].res_list[RES_VLAN]; + struct vlan_res *res, *tmp; + int i; + + list_for_each_entry_safe(res, tmp, vlan_list, list) { + list_del(&res->list); + /* dereference the vlan the num times the slave referenced it */ + for (i = 0; i < res->ref_count; i++) + __mlx4_unregister_vlan(dev, res->port, res->vlan); + mlx4_release_resource(dev, slave, RES_VLAN, 1, res->port); + kfree(res); + } +} + +static int vlan_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int in_port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + int err; + u16 vlan; + int vlan_index; + int port; + + port = !in_port ? get_param_l(out_param) : in_port; + + if (!port || op != RES_OP_RESERVE_AND_MAP) + return -EINVAL; + + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; + /* upstream kernels had NOP for reg/unreg vlan. Continue this. 
*/ + if (!in_port && port > 0 && port <= dev->caps.num_ports) { + slave_state[slave].old_vlan_api = true; + return 0; + } + + vlan = (u16) in_param; + + err = __mlx4_register_vlan(dev, port, vlan, &vlan_index); + if (!err) { + set_param_l(out_param, (u32) vlan_index); + err = vlan_add_to_slave(dev, slave, vlan, port, vlan_index); + if (err) + __mlx4_unregister_vlan(dev, port, vlan); + } + return err; +} + +static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int port) +{ + u32 index; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + err = mlx4_grant_resource(dev, slave, RES_COUNTER, 1, 0); + if (err) + return err; + + err = __mlx4_counter_alloc(dev, &index); + if (err) { + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + return err; + } + + err = add_res_range(dev, slave, index, 1, RES_COUNTER, port); + if (err) { + __mlx4_counter_free(dev, index); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + } else { + set_param_l(out_param, index); + } + + return err; +} + +static int xrcdn_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + u32 xrcdn; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + err = __mlx4_xrcd_alloc(dev, &xrcdn); + if (err) + return err; + + err = add_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0); + if (err) + __mlx4_xrcd_free(dev, xrcdn); + else + set_param_l(out_param, xrcdn); + + return err; +} + +int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int alop = vhcr->op_modifier; + + switch (vhcr->in_modifier & 0xFF) { + case RES_QP: + err = qp_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MTT: + err = mtt_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MPT: + err = mpt_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_CQ: + err = cq_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_SRQ: + err = srq_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MAC: + err = mac_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_VLAN: + err = vlan_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_COUNTER: + err = counter_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, 0); + break; + + case RES_XRCD: + err = xrcdn_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int qp_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param) +{ + int err; + int count; + int base; + int qpn; + + switch (op) { + case RES_OP_RESERVE: + base = get_param_l(&in_param) & 0x7fffff; + count = get_param_h(&in_param); + err = rem_res_range(dev, slave, base, count, RES_QP, 0); + if (err) + break; + mlx4_release_resource(dev, slave, RES_QP, count, 0); + __mlx4_qp_release_range(dev, base, count); + break; + case RES_OP_MAP_ICM: + qpn = get_param_l(&in_param) & 0x7fffff; + err = qp_res_start_move_to(dev, 
slave, qpn, RES_QP_RESERVED, + NULL, 0); + if (err) + return err; + + if (!fw_reserved(dev, qpn)) + __mlx4_qp_free_icm(dev, qpn); + + res_end_move(dev, slave, RES_QP, qpn); + + if (valid_reserved(dev, slave, qpn)) + err = rem_res_range(dev, slave, qpn, 1, RES_QP, 0); + break; + default: + err = -EINVAL; + break; + } + return err; +} + +static int mtt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int base; + int order; + + if (op != RES_OP_RESERVE_AND_MAP) + return err; + + base = get_param_l(&in_param); + order = get_param_h(&in_param); + err = rem_res_range(dev, slave, base, 1, RES_MTT, order); + if (!err) { + mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0); + __mlx4_free_mtt_range(dev, base, order); + } + return err; +} + +static int mpt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param) +{ + int err = -EINVAL; + int index; + int id; + struct res_mpt *mpt; + + switch (op) { + case RES_OP_RESERVE: + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + err = get_res(dev, slave, id, RES_MPT, &mpt); + if (err) + break; + index = mpt->key; + put_res(dev, slave, id, RES_MPT); + + err = rem_res_range(dev, slave, id, 1, RES_MPT, 0); + if (err) + break; + mlx4_release_resource(dev, slave, RES_MPT, 1, 0); + __mlx4_mpt_release(dev, index); + break; + case RES_OP_MAP_ICM: + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, + RES_MPT_RESERVED, &mpt); + if (err) + return err; + + __mlx4_mpt_free_icm(dev, mpt->key); + res_end_move(dev, slave, RES_MPT, id); + break; + default: + err = -EINVAL; + break; + } + return err; +} + +static int cq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int cqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + cqn = get_param_l(&in_param); + err = rem_res_range(dev, slave, cqn, 1, RES_CQ, 0); + if (err) + break; + + mlx4_release_resource(dev, slave, RES_CQ, 1, 0); + __mlx4_cq_free_icm(dev, cqn); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int srq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int srqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + srqn = get_param_l(&in_param); + err = rem_res_range(dev, slave, srqn, 1, RES_SRQ, 0); + if (err) + break; + + mlx4_release_resource(dev, slave, RES_SRQ, 1, 0); + __mlx4_srq_free_icm(dev, srqn); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int mac_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int in_port) +{ + int port; + int err = 0; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + port = !in_port ? 
get_param_l(out_param) : in_port; + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; + mac_del_from_slave(dev, slave, in_param, port); + __mlx4_unregister_mac(dev, port, in_param); + break; + default: + err = -EINVAL; + break; + } + + return err; + +} + +static int vlan_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + int err = 0; + + port = mlx4_slave_convert_port( + dev, slave, port); + + if (port < 0) + return -EINVAL; + switch (op) { + case RES_OP_RESERVE_AND_MAP: + if (slave_state[slave].old_vlan_api) + return 0; + if (!port) + return -EINVAL; + vlan_del_from_slave(dev, slave, in_param, port); + __mlx4_unregister_vlan(dev, port, in_param); + break; + default: + err = -EINVAL; + break; + } + + return err; +} + +static int counter_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int index; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + index = get_param_l(&in_param); + if (index == MLX4_SINK_COUNTER_INDEX(dev)) + return 0; + + err = rem_res_range(dev, slave, index, 1, RES_COUNTER, 0); + if (err) + return err; + + __mlx4_counter_free(dev, index); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + + return err; +} + +static int xrcdn_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int xrcdn; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + xrcdn = get_param_l(&in_param); + err = rem_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0); + if (err) + return err; + + __mlx4_xrcd_free(dev, xrcdn); + + return err; +} + +int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = -EINVAL; + int alop = vhcr->op_modifier; + + switch (vhcr->in_modifier & 0xFF) { + case RES_QP: + err = qp_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param); + break; + + case RES_MTT: + err = mtt_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MPT: + err = mpt_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param); + break; + + case RES_CQ: + err = cq_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_SRQ: + err = srq_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MAC: + err = mac_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_VLAN: + err = vlan_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param, + (vhcr->in_modifier >> 8) & 0xFF); + break; + + case RES_COUNTER: + err = counter_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_XRCD: + err = xrcdn_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + + default: + break; + } + return err; +} + +/* ugly but other choices are uglier */ +static int mr_phys_mpt(struct mlx4_mpt_entry *mpt) +{ + return (be32_to_cpu(mpt->flags) >> 9) & 1; +} + +static int mr_get_mtt_addr(struct mlx4_mpt_entry *mpt) +{ + return (int)be64_to_cpu(mpt->mtt_addr) & 0xfffffff8; +} + +static int mr_get_mtt_size(struct mlx4_mpt_entry *mpt) +{ 
+ return be32_to_cpu(mpt->mtt_sz); +} + +static u32 mr_get_pd(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->pd_flags) & 0x00ffffff; +} + +static int mr_is_fmr(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->pd_flags) & MLX4_MPT_PD_FLAG_FAST_REG; +} + +static int mr_is_bind_enabled(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->flags) & MLX4_MPT_FLAG_BIND_ENABLE; +} + +static int mr_is_region(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->flags) & MLX4_MPT_FLAG_REGION; +} + +static int qp_get_mtt_addr(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->mtt_base_addr_l) & 0xfffffff8; +} + +static int srq_get_mtt_addr(struct mlx4_srq_context *srqc) +{ + return be32_to_cpu(srqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int qp_get_mtt_size(struct mlx4_qp_context *qpc) +{ + int page_shift = (qpc->log_page_size & 0x3f) + 12; + int log_sq_size = (qpc->sq_size_stride >> 3) & 0xf; + int log_sq_sride = qpc->sq_size_stride & 7; + int log_rq_size = (qpc->rq_size_stride >> 3) & 0xf; + int log_rq_stride = qpc->rq_size_stride & 7; + int srq = (be32_to_cpu(qpc->srqn) >> 24) & 1; + int rss = (be32_to_cpu(qpc->flags) >> 13) & 1; + u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff; + int xrc = (ts == MLX4_QP_ST_XRC) ? 1 : 0; + int sq_size; + int rq_size; + int total_pages; + int total_mem; + int page_offset = (be32_to_cpu(qpc->params2) >> 6) & 0x3f; + + sq_size = 1 << (log_sq_size + log_sq_sride + 4); + rq_size = (srq|rss|xrc) ? 0 : (1 << (log_rq_size + log_rq_stride + 4)); + total_mem = sq_size + rq_size; + total_pages = + roundup_pow_of_two((total_mem + (page_offset << 6)) >> + page_shift); + + return total_pages; +} + +static int check_mtt_range(struct mlx4_dev *dev, int slave, int start, + int size, struct res_mtt *mtt) +{ + int res_start = mtt->com.res_id; + int res_size = (1 << mtt->order); + + if (start < res_start || start + size > res_start + res_size) + return -EPERM; + return 0; +} + +int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mtt *mtt; + struct res_mpt *mpt = NULL; + int mtt_base = mr_get_mtt_addr(inbox->buf) / dev->caps.mtt_entry_sz; + int phys; + int id; + u32 pd; + int pd_slave; + + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, RES_MPT_HW, &mpt); + if (err) + return err; + + /* Disable memory windows for VFs. */ + if (!mr_is_region(inbox->buf)) { + err = -EPERM; + goto ex_abort; + } + + /* Make sure that the PD bits related to the slave id are zeros. */ + pd = mr_get_pd(inbox->buf); + pd_slave = (pd >> 17) & 0x7f; + if (pd_slave != 0 && --pd_slave != slave) { + err = -EPERM; + goto ex_abort; + } + + if (mr_is_fmr(inbox->buf)) { + /* FMR and Bind Enable are forbidden in slave devices. */ + if (mr_is_bind_enabled(inbox->buf)) { + err = -EPERM; + goto ex_abort; + } + /* FMR and Memory Windows are also forbidden. 
*/ + if (!mr_is_region(inbox->buf)) { + err = -EPERM; + goto ex_abort; + } + } + + phys = mr_phys_mpt(inbox->buf); + if (!phys) { + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_abort; + + err = check_mtt_range(dev, slave, mtt_base, + mr_get_mtt_size(inbox->buf), mtt); + if (err) + goto ex_put; + + mpt->mtt = mtt; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put; + + if (!phys) { + atomic_inc(&mtt->ref_count); + put_res(dev, slave, mtt->com.res_id, RES_MTT); + } + + res_end_move(dev, slave, RES_MPT, id); + return 0; + +ex_put: + if (!phys) + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_MPT, id); + + return err; +} + +int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mpt *mpt; + int id; + + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, RES_MPT_MAPPED, &mpt); + if (err) + return err; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_abort; + + if (mpt->mtt) + atomic_dec(&mpt->mtt->ref_count); + + res_end_move(dev, slave, RES_MPT, id); + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_MPT, id); + + return err; +} + +int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mpt *mpt; + int id; + + id = index & mpt_mask(dev); + err = get_res(dev, slave, id, RES_MPT, &mpt); + if (err) + return err; + + if (mpt->com.from_state == RES_MPT_MAPPED) { + /* In order to allow rereg in SRIOV, we need to alter the MPT entry. To do + * that, the VF must read the MPT. But since the MPT entry memory is not + * in the VF's virtual memory space, it must use QUERY_MPT to obtain the + * entry contents. To guarantee that the MPT cannot be changed, the driver + * must perform HW2SW_MPT before this query and return the MPT entry to HW + * ownership following the change. The change here allows the VF to + * perform QUERY_MPT also when the entry is in SW ownership. 
+ */ + struct mlx4_mpt_entry *mpt_entry = mlx4_table_find( + &mlx4_priv(dev)->mr_table.dmpt_table, + mpt->key, NULL); + + if (NULL == mpt_entry || NULL == outbox->buf) { + err = -EINVAL; + goto out; + } + + memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry)); + + err = 0; + } else if (mpt->com.from_state == RES_MPT_HW) { + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + } else { + err = -EBUSY; + goto out; + } + + +out: + put_res(dev, slave, id, RES_MPT); + return err; +} + +static int qp_get_rcqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->cqn_recv) & 0xffffff; +} + +static int qp_get_scqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->cqn_send) & 0xffffff; +} + +static u32 qp_get_srqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->srqn) & 0x1ffffff; +} + +static void adjust_proxy_tun_qkey(struct mlx4_dev *dev, struct mlx4_vhcr *vhcr, + struct mlx4_qp_context *context) +{ + u32 qpn = vhcr->in_modifier & 0xffffff; + u32 qkey = 0; + + if (mlx4_get_parav_qkey(dev, qpn, &qkey)) + return; + + /* adjust qkey in qp context */ + context->qkey = cpu_to_be32(qkey); +} + +static int adjust_qp_sched_queue(struct mlx4_dev *dev, int slave, + struct mlx4_qp_context *qpc, + struct mlx4_cmd_mailbox *inbox); + +int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_mtt *mtt; + struct res_qp *qp; + struct mlx4_qp_context *qpc = inbox->buf + 8; + int mtt_base = qp_get_mtt_addr(qpc) / dev->caps.mtt_entry_sz; + int mtt_size = qp_get_mtt_size(qpc); + struct res_cq *rcq; + struct res_cq *scq; + int rcqn = qp_get_rcqn(qpc); + int scqn = qp_get_scqn(qpc); + u32 srqn = qp_get_srqn(qpc) & 0xffffff; + int use_srq = (qp_get_srqn(qpc) >> 24) & 1; + struct res_srq *srq; + int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff; + + err = adjust_qp_sched_queue(dev, slave, qpc, inbox); + if (err) + return err; + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_HW, &qp, 0); + if (err) + return err; + qp->local_qpn = local_qpn; + qp->sched_queue = 0; + qp->param3 = 0; + qp->vlan_control = 0; + qp->fvl_rx = 0; + qp->pri_path_fl = 0; + qp->vlan_index = 0; + qp->feup = 0; + qp->qpc_flags = be32_to_cpu(qpc->flags); + + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_abort; + + err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt); + if (err) + goto ex_put_mtt; + + err = get_res(dev, slave, rcqn, RES_CQ, &rcq); + if (err) + goto ex_put_mtt; + + if (scqn != rcqn) { + err = get_res(dev, slave, scqn, RES_CQ, &scq); + if (err) + goto ex_put_rcq; + } else + scq = rcq; + + if (use_srq) { + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) + goto ex_put_scq; + } + + adjust_proxy_tun_qkey(dev, vhcr, qpc); + update_pkey_index(dev, slave, inbox); + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put_srq; + atomic_inc(&mtt->ref_count); + qp->mtt = mtt; + atomic_inc(&rcq->ref_count); + qp->rcq = rcq; + atomic_inc(&scq->ref_count); + qp->scq = scq; + + if (scqn != rcqn) + put_res(dev, slave, scqn, RES_CQ); + + if (use_srq) { + atomic_inc(&srq->ref_count); + put_res(dev, slave, srqn, RES_SRQ); + qp->srq = srq; + } + + /* Save param3 for dynamic changes from VST back to VGT */ + qp->param3 = qpc->param3; + put_res(dev, slave, rcqn, RES_CQ); + put_res(dev, slave, mtt_base, RES_MTT); + res_end_move(dev, slave, RES_QP, qpn); 
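+	/* The QP now holds references on its MTT, its receive and send
+	 * CQs and, when used, its SRQ; they are dropped again in the
+	 * 2RST wrapper or in rem_slave_qps() when the slave is cleaned up.
+	 */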
+ + return 0; + +ex_put_srq: + if (use_srq) + put_res(dev, slave, srqn, RES_SRQ); +ex_put_scq: + if (scqn != rcqn) + put_res(dev, slave, scqn, RES_CQ); +ex_put_rcq: + put_res(dev, slave, rcqn, RES_CQ); +ex_put_mtt: + put_res(dev, slave, mtt_base, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_QP, qpn); + + return err; +} + +static int eq_get_mtt_addr(struct mlx4_eq_context *eqc) +{ + return be32_to_cpu(eqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int eq_get_mtt_size(struct mlx4_eq_context *eqc) +{ + int log_eq_size = eqc->log_eq_size & 0x1f; + int page_shift = (eqc->log_page_size & 0x3f) + 12; + + if (log_eq_size + 5 < page_shift) + return 1; + + return 1 << (log_eq_size + 5 - page_shift); +} + +static int cq_get_mtt_addr(struct mlx4_cq_context *cqc) +{ + return be32_to_cpu(cqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int cq_get_mtt_size(struct mlx4_cq_context *cqc) +{ + int log_cq_size = (be32_to_cpu(cqc->logsize_usrpage) >> 24) & 0x1f; + int page_shift = (cqc->log_page_size & 0x3f) + 12; + + if (log_cq_size + 5 < page_shift) + return 1; + + return 1 << (log_cq_size + 5 - page_shift); +} + +int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int eqn = vhcr->in_modifier; + int res_id = (slave << 10) | eqn; + struct mlx4_eq_context *eqc = inbox->buf; + int mtt_base = eq_get_mtt_addr(eqc) / dev->caps.mtt_entry_sz; + int mtt_size = eq_get_mtt_size(eqc); + struct res_eq *eq; + struct res_mtt *mtt; + + err = add_res_range(dev, slave, res_id, 1, RES_EQ, 0); + if (err) + return err; + err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_HW, &eq); + if (err) + goto out_add; + + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto out_move; + + err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt); + if (err) + goto out_put; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto out_put; + + atomic_inc(&mtt->ref_count); + eq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_EQ, res_id); + return 0; + +out_put: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +out_move: + res_abort_move(dev, slave, RES_EQ, res_id); +out_add: + rem_res_range(dev, slave, res_id, 1, RES_EQ, 0); + return err; +} + +int mlx4_CONFIG_DEV_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + u8 get = vhcr->op_modifier; + + if (get != 1) + return -EPERM; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + + return err; +} + +static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start, + int len, struct res_mtt **res) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_mtt *mtt; + int err = -EINVAL; + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry(mtt, &tracker->slave_list[slave].res_list[RES_MTT], + com.list) { + if (!check_mtt_range(dev, slave, start, len, mtt)) { + *res = mtt; + mtt->com.from_state = mtt->com.state; + mtt->com.state = RES_MTT_BUSY; + err = 0; + break; + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int verify_qp_parameters(struct mlx4_dev *dev, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + enum qp_transition transition, u8 slave) +{ + u32 qp_type; + u32 qpn; + struct 
mlx4_qp_context *qp_ctx; + enum mlx4_qp_optpar optpar; + int port; + int num_gids; + + qp_ctx = inbox->buf + 8; + qp_type = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; + optpar = be32_to_cpu(*(__be32 *) inbox->buf); + + if (slave != mlx4_master_func_num(dev)) { + qp_ctx->params2 &= ~cpu_to_be32(MLX4_QP_BIT_FPP); + /* setting QP rate-limit is disallowed for VFs */ + if (qp_ctx->rate_limit_params) + return -EPERM; + } + + switch (qp_type) { + case MLX4_QP_ST_RC: + case MLX4_QP_ST_XRC: + case MLX4_QP_ST_UC: + switch (transition) { + case QP_TRANS_INIT2RTR: + case QP_TRANS_RTR2RTS: + case QP_TRANS_RTS2RTS: + case QP_TRANS_SQD2SQD: + case QP_TRANS_SQD2RTS: + if (slave != mlx4_master_func_num(dev)) { + if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) + num_gids = mlx4_get_slave_num_gids(dev, slave, port); + else + num_gids = 1; + if (qp_ctx->pri_path.mgid_index >= num_gids) + return -EINVAL; + } + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1; + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) + num_gids = mlx4_get_slave_num_gids(dev, slave, port); + else + num_gids = 1; + if (qp_ctx->alt_path.mgid_index >= num_gids) + return -EINVAL; + } + } + break; + default: + break; + } + break; + + case MLX4_QP_ST_MLX: + qpn = vhcr->in_modifier & 0x7fffff; + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (transition == QP_TRANS_INIT2RTR && + slave != mlx4_master_func_num(dev) && + mlx4_is_qp_reserved(dev, qpn) && + !mlx4_vf_smi_enabled(dev, slave, port)) { + /* only enabled VFs may create MLX proxy QPs */ + mlx4_err(dev, "%s: unprivileged slave %d attempting to create an MLX proxy special QP on port %d\n", + __func__, slave, port); + return -EPERM; + } + break; + + default: + break; + } + + return 0; +} + +int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_mtt mtt; + __be64 *page_list = inbox->buf; + u64 *pg_list = (u64 *)page_list; + int i; + struct res_mtt *rmtt = NULL; + int start = be64_to_cpu(page_list[0]); + int npages = vhcr->in_modifier; + int err; + + err = get_containing_mtt(dev, slave, start, npages, &rmtt); + if (err) + return err; + + /* Call the SW implementation of write_mtt: + * - Prepare a dummy mtt struct + * - Translate inbox contents to simple addresses in host endianness */ + mtt.offset = 0; /* TBD this is broken but I don't handle it since + we don't really use it */ + mtt.order = 0; + mtt.page_shift = 0; + for (i = 0; i < npages; ++i) + pg_list[i + 2] = (be64_to_cpu(page_list[i + 2]) & ~1ULL); + + err = __mlx4_write_mtt(dev, &mtt, be64_to_cpu(page_list[0]), npages, + ((u64 *)page_list + 2)); + + if (rmtt) + put_res(dev, slave, rmtt->com.res_id, RES_MTT); + + return err; +} + +int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int eqn = vhcr->in_modifier; + int res_id = eqn | (slave << 10); + struct res_eq *eq; + int err; + + err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_RESERVED, &eq); + if (err) + return err; + + err = get_res(dev, slave, eq->mtt->com.res_id, RES_MTT, NULL); + if (err) + goto ex_abort; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put; + + 
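+	/* HW2SW_EQ succeeded: drop the MTT reference taken in
+	 * mlx4_SW2HW_EQ_wrapper() and release the EQ range reserved
+	 * for this slave.
+	 */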
atomic_dec(&eq->mtt->ref_count); + put_res(dev, slave, eq->mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_EQ, res_id); + rem_res_range(dev, slave, res_id, 1, RES_EQ, 0); + + return 0; + +ex_put: + put_res(dev, slave, eq->mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_EQ, res_id); + + return err; +} + +int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq_info *event_eq; + struct mlx4_cmd_mailbox *mailbox; + u32 in_modifier = 0; + int err; + int res_id; + struct res_eq *req; + + if (!priv->mfunc.master.slave_state) + return -EINVAL; + + /* check for slave valid, slave not PF, and slave active */ + if (slave < 0 || slave > dev->persist->num_vfs || + slave == dev->caps.function || + !priv->mfunc.master.slave_state[slave].active) + return 0; + + event_eq = &priv->mfunc.master.slave_state[slave].event_eq[eqe->type]; + + /* Create the event only if the slave is registered */ + if (event_eq->eqn < 0) + return 0; + + mutex_lock(&priv->mfunc.master.gen_eqe_mutex[slave]); + res_id = (slave << 10) | event_eq->eqn; + err = get_res(dev, slave, res_id, RES_EQ, &req); + if (err) + goto unlock; + + if (req->com.from_state != RES_EQ_HW) { + err = -EINVAL; + goto put; + } + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto put; + } + + if (eqe->type == MLX4_EVENT_TYPE_CMD) { + ++event_eq->token; + eqe->event.cmd.token = cpu_to_be16(event_eq->token); + } + + memcpy(mailbox->buf, (u8 *) eqe, 28); + + in_modifier = (slave & 0xff) | ((event_eq->eqn & 0x3ff) << 16); + + err = mlx4_cmd(dev, mailbox->dma, in_modifier, 0, + MLX4_CMD_GEN_EQE, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + put_res(dev, slave, res_id, RES_EQ); + mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]); + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + +put: + put_res(dev, slave, res_id, RES_EQ); + +unlock: + mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]); + return err; +} + +int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int eqn = vhcr->in_modifier; + int res_id = eqn | (slave << 10); + struct res_eq *eq; + int err; + + err = get_res(dev, slave, res_id, RES_EQ, &eq); + if (err) + return err; + + if (eq->com.from_state != RES_EQ_HW) { + err = -EINVAL; + goto ex_put; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + +ex_put: + put_res(dev, slave, res_id, RES_EQ); + return err; +} + +int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int cqn = vhcr->in_modifier; + struct mlx4_cq_context *cqc = inbox->buf; + int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz; + struct res_cq *cq = NULL; + struct res_mtt *mtt; + + err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_HW, &cq); + if (err) + return err; + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto out_move; + err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt); + if (err) + goto out_put; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto out_put; + atomic_inc(&mtt->ref_count); + cq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_CQ, cqn); + return 0; + +out_put: + put_res(dev, 
slave, mtt->com.res_id, RES_MTT); +out_move: + res_abort_move(dev, slave, RES_CQ, cqn); + return err; +} + +int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int cqn = vhcr->in_modifier; + struct res_cq *cq = NULL; + + err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_ALLOCATED, &cq); + if (err) + return err; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto out_move; + atomic_dec(&cq->mtt->ref_count); + res_end_move(dev, slave, RES_CQ, cqn); + return 0; + +out_move: + res_abort_move(dev, slave, RES_CQ, cqn); + return err; +} + +int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int cqn = vhcr->in_modifier; + struct res_cq *cq; + int err; + + err = get_res(dev, slave, cqn, RES_CQ, &cq); + if (err) + return err; + + if (cq->com.from_state != RES_CQ_HW) + goto ex_put; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +ex_put: + put_res(dev, slave, cqn, RES_CQ); + + return err; +} + +static int handle_resize(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd, + struct res_cq *cq) +{ + int err; + struct res_mtt *orig_mtt; + struct res_mtt *mtt; + struct mlx4_cq_context *cqc = inbox->buf; + int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz; + + err = get_res(dev, slave, cq->mtt->com.res_id, RES_MTT, &orig_mtt); + if (err) + return err; + + if (orig_mtt != cq->mtt) { + err = -EINVAL; + goto ex_put; + } + + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_put; + + err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt); + if (err) + goto ex_put1; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put1; + atomic_dec(&orig_mtt->ref_count); + put_res(dev, slave, orig_mtt->com.res_id, RES_MTT); + atomic_inc(&mtt->ref_count); + cq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + return 0; + +ex_put1: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_put: + put_res(dev, slave, orig_mtt->com.res_id, RES_MTT); + + return err; + +} + +int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int cqn = vhcr->in_modifier; + struct res_cq *cq; + int err; + + err = get_res(dev, slave, cqn, RES_CQ, &cq); + if (err) + return err; + + if (cq->com.from_state != RES_CQ_HW) + goto ex_put; + + if (vhcr->op_modifier == 0) { + err = handle_resize(dev, slave, vhcr, inbox, outbox, cmd, cq); + goto ex_put; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +ex_put: + put_res(dev, slave, cqn, RES_CQ); + + return err; +} + +static int srq_get_mtt_size(struct mlx4_srq_context *srqc) +{ + int log_srq_size = (be32_to_cpu(srqc->state_logsize_srqn) >> 24) & 0xf; + int log_rq_stride = srqc->logstride & 7; + int page_shift = (srqc->log_page_size & 0x3f) + 12; + + if (log_srq_size + log_rq_stride + 4 < page_shift) + return 1; + + return 1 << (log_srq_size + log_rq_stride + 4 - page_shift); +} + +int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + 
struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_mtt *mtt; + struct res_srq *srq = NULL; + struct mlx4_srq_context *srqc = inbox->buf; + int mtt_base = srq_get_mtt_addr(srqc) / dev->caps.mtt_entry_sz; + + if (srqn != (be32_to_cpu(srqc->state_logsize_srqn) & 0xffffff)) + return -EINVAL; + + err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_HW, &srq); + if (err) + return err; + err = get_res(dev, slave, mtt_base, RES_MTT, &mtt); + if (err) + goto ex_abort; + err = check_mtt_range(dev, slave, mtt_base, srq_get_mtt_size(srqc), + mtt); + if (err) + goto ex_put_mtt; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put_mtt; + + atomic_inc(&mtt->ref_count); + srq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_SRQ, srqn); + return 0; + +ex_put_mtt: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_SRQ, srqn); + + return err; +} + +int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq = NULL; + + err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_ALLOCATED, &srq); + if (err) + return err; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_abort; + atomic_dec(&srq->mtt->ref_count); + if (srq->cq) + atomic_dec(&srq->cq->ref_count); + res_end_move(dev, slave, RES_SRQ, srqn); + + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_SRQ, srqn); + + return err; +} + +int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) + return err; + if (srq->com.from_state != RES_SRQ_HW) { + err = -EBUSY; + goto out; + } + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + put_res(dev, slave, srqn, RES_SRQ); + return err; +} + +int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) + return err; + + if (srq->com.from_state != RES_SRQ_HW) { + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + put_res(dev, slave, srqn, RES_SRQ); + return err; +} + +int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_qp *qp; + + err = get_res(dev, slave, qpn, RES_QP, &qp); + if (err) + return err; + if (qp->com.from_state != RES_QP_HW) { + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + adjust_proxy_tun_qkey(dev, vhcr, 
context); + update_pkey_index(dev, slave, inbox); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +static int adjust_qp_sched_queue(struct mlx4_dev *dev, int slave, + struct mlx4_qp_context *qpc, + struct mlx4_cmd_mailbox *inbox) +{ + enum mlx4_qp_optpar optpar = be32_to_cpu(*(__be32 *)inbox->buf); + u8 pri_sched_queue; + int port = mlx4_slave_convert_port( + dev, slave, (qpc->pri_path.sched_queue >> 6 & 1) + 1) - 1; + + if (port < 0) + return -EINVAL; + + pri_sched_queue = (qpc->pri_path.sched_queue & ~(1 << 6)) | + ((port & 1) << 6); + + if (optpar & (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX4_QP_OPTPAR_SCHED_QUEUE) || + qpc->pri_path.sched_queue || mlx4_is_eth(dev, port + 1)) { + qpc->pri_path.sched_queue = pri_sched_queue; + } + + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = mlx4_slave_convert_port( + dev, slave, (qpc->alt_path.sched_queue >> 6 & 1) + + 1) - 1; + if (port < 0) + return -EINVAL; + qpc->alt_path.sched_queue = + (qpc->alt_path.sched_queue & ~(1 << 6)) | + (port & 1) << 6; + } + return 0; +} + +static int roce_verify_mac(struct mlx4_dev *dev, int slave, + struct mlx4_qp_context *qpc, + struct mlx4_cmd_mailbox *inbox) +{ + u64 mac; + int port; + u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff; + u8 sched = *(u8 *)(inbox->buf + 64); + u8 smac_ix; + + port = (sched >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port) && (ts != MLX4_QP_ST_MLX)) { + smac_ix = qpc->pri_path.grh_mylmc & 0x7f; + if (mac_find_smac_ix_in_slave(dev, slave, port, smac_ix, &mac)) + return -ENOENT; + } + return 0; +} + +int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *qpc = inbox->buf + 8; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_qp *qp; + u8 orig_sched_queue; + u8 orig_vlan_control = qpc->pri_path.vlan_control; + u8 orig_fvl_rx = qpc->pri_path.fvl_rx; + u8 orig_pri_path_fl = qpc->pri_path.fl; + u8 orig_vlan_index = qpc->pri_path.vlan_index; + u8 orig_feup = qpc->pri_path.feup; + + err = adjust_qp_sched_queue(dev, slave, qpc, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_INIT2RTR, slave); + if (err) + return err; + + if (roce_verify_mac(dev, slave, qpc, inbox)) + return -EINVAL; + + update_pkey_index(dev, slave, inbox); + update_gid(dev, inbox, (u8)slave); + adjust_proxy_tun_qkey(dev, vhcr, qpc); + orig_sched_queue = qpc->pri_path.sched_queue; + + err = get_res(dev, slave, qpn, RES_QP, &qp); + if (err) + return err; + if (qp->com.from_state != RES_QP_HW) { + err = -EBUSY; + goto out; + } + + err = update_vport_qp_param(dev, inbox, slave, qpn); + if (err) + goto out; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +out: + /* if no error, save sched queue value passed in by VF. This is + * essentially the QOS value provided by the VF. 
This will be useful + * if we allow dynamic changes from VST back to VGT + */ + if (!err) { + qp->sched_queue = orig_sched_queue; + qp->vlan_control = orig_vlan_control; + qp->fvl_rx = orig_fvl_rx; + qp->pri_path_fl = orig_pri_path_fl; + qp->vlan_index = orig_vlan_index; + qp->feup = orig_feup; + } + put_res(dev, slave, qpn, RES_QP); + return err; +} + +int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTR2RTS, slave); + if (err) + return err; + + update_pkey_index(dev, slave, inbox); + update_gid(dev, inbox, (u8)slave); + adjust_proxy_tun_qkey(dev, vhcr, context); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTS2RTS, slave); + if (err) + return err; + + update_pkey_index(dev, slave, inbox); + update_gid(dev, inbox, (u8)slave); + adjust_proxy_tun_qkey(dev, vhcr, context); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + + +int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + int err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + adjust_proxy_tun_qkey(dev, vhcr, context); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2SQD, slave); + if (err) + return err; + + adjust_proxy_tun_qkey(dev, vhcr, context); + update_gid(dev, inbox, (u8)slave); + update_pkey_index(dev, slave, inbox); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct mlx4_qp_context *context = inbox->buf + 8; + + err = adjust_qp_sched_queue(dev, slave, context, inbox); + if (err) + return err; + err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2RTS, slave); + if (err) + return err; + + adjust_proxy_tun_qkey(dev, vhcr, context); + update_gid(dev, inbox, (u8)slave); + update_pkey_index(dev, slave, inbox); + return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = 
vhcr->in_modifier & 0x7fffff; + struct res_qp *qp; + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, &qp, 0); + if (err) + return err; + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_abort; + + atomic_dec(&qp->mtt->ref_count); + atomic_dec(&qp->rcq->ref_count); + atomic_dec(&qp->scq->ref_count); + if (qp->srq) + atomic_dec(&qp->srq->ref_count); + res_end_move(dev, slave, RES_QP, qpn); + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_QP, qpn); + + return err; +} + +static struct res_gid *find_gid(struct mlx4_dev *dev, int slave, + struct res_qp *rqp, u8 *gid) +{ + struct res_gid *res; + + list_for_each_entry(res, &rqp->mcg_list, list) { + if (!memcmp(res->gid, gid, 16)) + return res; + } + return NULL; +} + +static int add_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp, + u8 *gid, enum mlx4_protocol prot, + enum mlx4_steer_type steer, u64 reg_id) +{ + struct res_gid *res; + int err; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + spin_lock_irq(&rqp->mcg_spl); + if (find_gid(dev, slave, rqp, gid)) { + kfree(res); + err = -EEXIST; + } else { + memcpy(res->gid, gid, 16); + res->prot = prot; + res->steer = steer; + res->reg_id = reg_id; + list_add_tail(&res->list, &rqp->mcg_list); + err = 0; + } + spin_unlock_irq(&rqp->mcg_spl); + + return err; +} + +static int rem_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp, + u8 *gid, enum mlx4_protocol prot, + enum mlx4_steer_type steer, u64 *reg_id) +{ + struct res_gid *res; + int err; + + spin_lock_irq(&rqp->mcg_spl); + res = find_gid(dev, slave, rqp, gid); + if (!res || res->prot != prot || res->steer != steer) + err = -EINVAL; + else { + *reg_id = res->reg_id; + list_del(&res->list); + kfree(res); + err = 0; + } + spin_unlock_irq(&rqp->mcg_spl); + + return err; +} + +static int qp_attach(struct mlx4_dev *dev, int slave, struct mlx4_qp *qp, + u8 gid[16], int block_loopback, enum mlx4_protocol prot, + enum mlx4_steer_type type, u64 *reg_id) +{ + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: { + int port = mlx4_slave_convert_port(dev, slave, gid[5]); + if (port < 0) + return port; + return mlx4_trans_to_dmfs_attach(dev, qp, gid, port, + block_loopback, prot, + reg_id); + } + case MLX4_STEERING_MODE_B0: + if (prot == MLX4_PROT_ETH) { + int port = mlx4_slave_convert_port(dev, slave, gid[5]); + if (port < 0) + return port; + gid[5] = port; + } + return mlx4_qp_attach_common(dev, qp, gid, + block_loopback, prot, type); + default: + return -EINVAL; + } +} + +static int qp_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], enum mlx4_protocol prot, + enum mlx4_steer_type type, u64 reg_id) +{ + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + return mlx4_flow_detach(dev, reg_id); + case MLX4_STEERING_MODE_B0: + return mlx4_qp_detach_common(dev, qp, gid, prot, type); + default: + return -EINVAL; + } +} + +static int mlx4_adjust_port(struct mlx4_dev *dev, int slave, + u8 *gid, enum mlx4_protocol prot) +{ + int real_port; + + if (prot != MLX4_PROT_ETH) + return 0; + + if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0 || + dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { + real_port = mlx4_slave_convert_port(dev, slave, gid[5]); + if (real_port < 0) + return -EINVAL; + gid[5] = real_port; + } + + return 0; +} + +int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox 
*outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp qp; /* dummy for calling attach/detach */ + u8 *gid = inbox->buf; + enum mlx4_protocol prot = (vhcr->in_modifier >> 28) & 0x7; + int err; + int qpn; + struct res_qp *rqp; + u64 reg_id = 0; + int attach = vhcr->op_modifier; + int block_loopback = vhcr->in_modifier >> 31; + u8 steer_type_mask = 2; + enum mlx4_steer_type type = (gid[7] & steer_type_mask) >> 1; + + qpn = vhcr->in_modifier & 0xffffff; + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) + return err; + + qp.qpn = qpn; + if (attach) { + err = qp_attach(dev, slave, &qp, gid, block_loopback, prot, + type, ®_id); + if (err) { + pr_err("Fail to attach rule to qp 0x%x\n", qpn); + goto ex_put; + } + err = add_mcg_res(dev, slave, rqp, gid, prot, type, reg_id); + if (err) + goto ex_detach; + } else { + err = mlx4_adjust_port(dev, slave, gid, prot); + if (err) + goto ex_put; + + err = rem_mcg_res(dev, slave, rqp, gid, prot, type, ®_id); + if (err) + goto ex_put; + + err = qp_detach(dev, &qp, gid, prot, type, reg_id); + if (err) + pr_err("Fail to detach rule from qp 0x%x reg_id = 0x%llx\n", + qpn, reg_id); + } + put_res(dev, slave, qpn, RES_QP); + return err; + +ex_detach: + qp_detach(dev, &qp, gid, prot, type, reg_id); +ex_put: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +/* + * MAC validation for Flow Steering rules. + * VF can attach rules only with a mac address which is assigned to it. + */ +static int validate_eth_header_mac(int slave, struct _rule_hw *eth_header, + struct list_head *rlist) +{ + struct mac_res *res, *tmp; + __be64 be_mac; + + /* make sure it isn't multicast or broadcast mac*/ + if (!is_multicast_ether_addr(eth_header->eth.dst_mac) && + !is_broadcast_ether_addr(eth_header->eth.dst_mac)) { + list_for_each_entry_safe(res, tmp, rlist, list) { + be_mac = cpu_to_be64(res->mac << 16); + if (ether_addr_equal((u8 *)&be_mac, eth_header->eth.dst_mac)) + return 0; + } + pr_err("MAC %pM doesn't belong to VF %d, Steering rule rejected\n", + eth_header->eth.dst_mac, slave); + return -EINVAL; + } + return 0; +} + +/* + * In case of missing eth header, append eth header with a MAC address + * assigned to the VF. 
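+ * The existing L3/L4 specs are shifted down to make room for the
+ * new eth spec, which is then filled with the first MAC owned by
+ * the VF on that port.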
+ */ +static int add_eth_header(struct mlx4_dev *dev, int slave, + struct mlx4_cmd_mailbox *inbox, + struct list_head *rlist, int header_id) +{ + struct mac_res *res, *tmp; + u8 port; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + struct mlx4_net_trans_rule_hw_eth *eth_header; + struct mlx4_net_trans_rule_hw_ipv4 *ip_header; + struct mlx4_net_trans_rule_hw_tcp_udp *l4_header; + __be64 be_mac = 0; + __be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16); + + ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf; + port = ctrl->port; + eth_header = (struct mlx4_net_trans_rule_hw_eth *)(ctrl + 1); + + /* Clear a space in the inbox for eth header */ + switch (header_id) { + case MLX4_NET_TRANS_RULE_ID_IPV4: + ip_header = + (struct mlx4_net_trans_rule_hw_ipv4 *)(eth_header + 1); + memmove(ip_header, eth_header, + sizeof(*ip_header) + sizeof(*l4_header)); + break; + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + l4_header = (struct mlx4_net_trans_rule_hw_tcp_udp *) + (eth_header + 1); + memmove(l4_header, eth_header, sizeof(*l4_header)); + break; + default: + return -EINVAL; + } + list_for_each_entry_safe(res, tmp, rlist, list) { + if (port == res->port) { + be_mac = cpu_to_be64(res->mac << 16); + break; + } + } + if (!be_mac) { + pr_err("Failed adding eth header to FS rule, Can't find matching MAC for port %d\n", + port); + return -EINVAL; + } + + memset(eth_header, 0, sizeof(*eth_header)); + eth_header->size = sizeof(*eth_header) >> 2; + eth_header->id = cpu_to_be16(__sw_id_hw[MLX4_NET_TRANS_RULE_ID_ETH]); + memcpy(eth_header->dst_mac, &be_mac, ETH_ALEN); + memcpy(eth_header->dst_mac_msk, &mac_msk, ETH_ALEN); + + return 0; + +} + +#define MLX4_UPD_QP_PATH_MASK_SUPPORTED ( \ + 1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX |\ + 1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB) +int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd_info) +{ + int err; + u32 qpn = vhcr->in_modifier & 0xffffff; + struct res_qp *rqp; + u64 mac; + unsigned port; + u64 pri_addr_path_mask; + struct mlx4_update_qp_context *cmd; + int smac_index; + + cmd = (struct mlx4_update_qp_context *)inbox->buf; + + pri_addr_path_mask = be64_to_cpu(cmd->primary_addr_path_mask); + if (cmd->qp_mask || cmd->secondary_addr_path_mask || + (pri_addr_path_mask & ~MLX4_UPD_QP_PATH_MASK_SUPPORTED)) + return -EPERM; + + if ((pri_addr_path_mask & + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB)) && + !(dev->caps.flags2 & + MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB)) { + mlx4_warn(dev, "Src check LB for slave %d isn't supported\n", + slave); + return -EOPNOTSUPP; + } + + /* Just change the smac for the QP */ + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) { + mlx4_err(dev, "Updating qpn 0x%x for slave %d rejected\n", qpn, slave); + return err; + } + + port = (rqp->sched_queue >> 6 & 1) + 1; + + if (pri_addr_path_mask & (1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX)) { + smac_index = cmd->qp_context.pri_path.grh_mylmc; + err = mac_find_smac_ix_in_slave(dev, slave, port, + smac_index, &mac); + + if (err) { + mlx4_err(dev, "Failed to update qpn 0x%x, MAC is invalid. 
smac_ix: %d\n", + qpn, smac_index); + goto err_mac; + } + } + + err = mlx4_cmd(dev, inbox->dma, + vhcr->in_modifier, 0, + MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to update qpn on qpn 0x%x, command failed\n", qpn); + goto err_mac; + } + +err_mac: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +static u32 qp_attach_mbox_size(void *mbox) +{ + u32 size = sizeof(struct mlx4_net_trans_rule_hw_ctrl); + struct _rule_hw *rule_header; + + rule_header = (struct _rule_hw *)(mbox + size); + + while (rule_header->size) { + size += rule_header->size * sizeof(u32); + rule_header += 1; + } + return size; +} + +static int mlx4_do_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule); + +int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *rlist = &tracker->slave_list[slave].res_list[RES_MAC]; + int err; + int qpn; + struct res_qp *rqp; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + struct _rule_hw *rule_header; + int header_id; + struct res_fs_rule *rrule; + u32 mbox_size; + + if (dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED) + return -EOPNOTSUPP; + + ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf; + err = mlx4_slave_convert_port(dev, slave, ctrl->port); + if (err <= 0) + return -EINVAL; + ctrl->port = err; + qpn = be32_to_cpu(ctrl->qpn) & 0xffffff; + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) { + pr_err("Steering rule with qpn 0x%x rejected\n", qpn); + return err; + } + rule_header = (struct _rule_hw *)(ctrl + 1); + header_id = map_hw_to_sw_id(be16_to_cpu(rule_header->id)); + + if (header_id == MLX4_NET_TRANS_RULE_ID_ETH) + mlx4_handle_eth_header_mcast_prio(ctrl, rule_header); + + switch (header_id) { + case MLX4_NET_TRANS_RULE_ID_ETH: + if (validate_eth_header_mac(slave, rule_header, rlist)) { + err = -EINVAL; + goto err_put_qp; + } + break; + case MLX4_NET_TRANS_RULE_ID_IB: + break; + case MLX4_NET_TRANS_RULE_ID_IPV4: + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + pr_warn("Can't attach FS rule without L2 headers, adding L2 header\n"); + if (add_eth_header(dev, slave, inbox, rlist, header_id)) { + err = -EINVAL; + goto err_put_qp; + } + vhcr->in_modifier += + sizeof(struct mlx4_net_trans_rule_hw_eth) >> 2; + break; + default: + pr_err("Corrupted mailbox\n"); + err = -EINVAL; + goto err_put_qp; + } + + err = mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param, + vhcr->in_modifier, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + goto err_put_qp; + + + err = add_res_range(dev, slave, vhcr->out_param, 1, RES_FS_RULE, qpn); + if (err) { + mlx4_err(dev, "Fail to add flow steering resources\n"); + goto err_detach; + } + + err = get_res(dev, slave, vhcr->out_param, RES_FS_RULE, &rrule); + if (err) + goto err_detach; + + mbox_size = qp_attach_mbox_size(inbox->buf); + rrule->mirr_mbox = kmalloc(mbox_size, GFP_KERNEL); + if (!rrule->mirr_mbox) { + err = -ENOMEM; + goto err_put_rule; + } + rrule->mirr_mbox_size = mbox_size; + rrule->mirr_rule_id = 0; + memcpy(rrule->mirr_mbox, inbox->buf, mbox_size); + + /* set different port */ + ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)rrule->mirr_mbox; + if (ctrl->port == 1) + ctrl->port = 2; + else + ctrl->port = 1; 
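+	/* If the two ports are currently bonded, install the mirrored
+	 * copy (with the port swapped above) right away; otherwise it
+	 * stays in mirr_mbox and is applied by mlx4_bond_fs_rules().
+	 */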
+ + if (mlx4_is_bonded(dev)) + mlx4_do_mirror_rule(dev, rrule); + + atomic_inc(&rqp->ref_count); + +err_put_rule: + put_res(dev, slave, vhcr->out_param, RES_FS_RULE); +err_detach: + /* detach rule on error */ + if (err) + mlx4_cmd(dev, vhcr->out_param, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); +err_put_qp: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +static int mlx4_undo_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule) +{ + int err; + + err = rem_res_range(dev, fs_rule->com.owner, fs_rule->com.res_id, 1, RES_FS_RULE, 0); + if (err) { + mlx4_err(dev, "Fail to remove flow steering resources\n"); + return err; + } + + mlx4_cmd(dev, fs_rule->com.res_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + return 0; +} + +int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + struct res_qp *rqp; + struct res_fs_rule *rrule; + u64 mirr_reg_id; + int qpn; + + if (dev->caps.steering_mode != + MLX4_STEERING_MODE_DEVICE_MANAGED) + return -EOPNOTSUPP; + + err = get_res(dev, slave, vhcr->in_param, RES_FS_RULE, &rrule); + if (err) + return err; + + if (!rrule->mirr_mbox) { + mlx4_err(dev, "Mirror rules cannot be removed explicitly\n"); + put_res(dev, slave, vhcr->in_param, RES_FS_RULE); + return -EINVAL; + } + mirr_reg_id = rrule->mirr_rule_id; + kfree(rrule->mirr_mbox); + qpn = rrule->qpn; + + /* Release the rule form busy state before removal */ + put_res(dev, slave, vhcr->in_param, RES_FS_RULE); + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) + return err; + + if (mirr_reg_id && mlx4_is_bonded(dev)) { + err = get_res(dev, slave, mirr_reg_id, RES_FS_RULE, &rrule); + if (err) { + mlx4_err(dev, "Fail to get resource of mirror rule\n"); + } else { + put_res(dev, slave, mirr_reg_id, RES_FS_RULE); + mlx4_undo_mirror_rule(dev, rrule); + } + } + err = rem_res_range(dev, slave, vhcr->in_param, 1, RES_FS_RULE, 0); + if (err) { + mlx4_err(dev, "Fail to remove flow steering resources\n"); + goto out; + } + + err = mlx4_cmd(dev, vhcr->in_param, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (!err) + atomic_dec(&rqp->ref_count); +out: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +enum { + BUSY_MAX_RETRIES = 10 +}; + +int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier & 0xffff; + + err = get_res(dev, slave, index, RES_COUNTER, NULL); + if (err) + return err; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + put_res(dev, slave, index, RES_COUNTER); + return err; +} + +static void detach_qp(struct mlx4_dev *dev, int slave, struct res_qp *rqp) +{ + struct res_gid *rgid; + struct res_gid *tmp; + struct mlx4_qp qp; /* dummy for calling attach/detach */ + + list_for_each_entry_safe(rgid, tmp, &rqp->mcg_list, list) { + switch (dev->caps.steering_mode) { + case MLX4_STEERING_MODE_DEVICE_MANAGED: + mlx4_flow_detach(dev, rgid->reg_id); + break; + case MLX4_STEERING_MODE_B0: + qp.qpn = rqp->local_qpn; + (void) mlx4_qp_detach_common(dev, &qp, rgid->gid, + rgid->prot, rgid->steer); + break; + } + list_del(&rgid->list); + kfree(rgid); + } +} + +static int _move_all_busy(struct mlx4_dev *dev, int slave, + enum mlx4_resource 
type, int print) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct list_head *rlist = &tracker->slave_list[slave].res_list[type]; + struct res_common *r; + struct res_common *tmp; + int busy; + + busy = 0; + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(r, tmp, rlist, list) { + if (r->owner == slave) { + if (!r->removing) { + if (r->state == RES_ANY_BUSY) { + if (print) + mlx4_dbg(dev, + "%s id 0x%llx is busy\n", + resource_str(type), + r->res_id); + ++busy; + } else { + r->from_state = r->state; + r->state = RES_ANY_BUSY; + r->removing = 1; + } + } + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + return busy; +} + +static int move_all_busy(struct mlx4_dev *dev, int slave, + enum mlx4_resource type) +{ + unsigned long begin; + int busy; + + begin = jiffies; + do { + busy = _move_all_busy(dev, slave, type, 0); + if (time_after(jiffies, begin + 5 * HZ)) + break; + if (busy) + cond_resched(); + } while (busy); + + if (busy) + busy = _move_all_busy(dev, slave, type, 1); + + return busy; +} +static void rem_slave_qps(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *qp_list = + &tracker->slave_list[slave].res_list[RES_QP]; + struct res_qp *qp; + struct res_qp *tmp; + int state; + u64 in_param; + int qpn; + int err; + + err = move_all_busy(dev, slave, RES_QP); + if (err) + mlx4_warn(dev, "rem_slave_qps: Could not move all qps to busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(qp, tmp, qp_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (qp->com.owner == slave) { + qpn = qp->com.res_id; + detach_qp(dev, slave, qp); + state = qp->com.from_state; + while (state != 0) { + switch (state) { + case RES_QP_RESERVED: + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&qp->com.node, + &tracker->res_tree[RES_QP]); + list_del(&qp->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + if (!valid_reserved(dev, slave, qpn)) { + __mlx4_qp_release_range(dev, qpn, 1); + mlx4_release_resource(dev, slave, + RES_QP, 1, 0); + } + kfree(qp); + state = 0; + break; + case RES_QP_MAPPED: + if (!valid_reserved(dev, slave, qpn)) + __mlx4_qp_free_icm(dev, qpn); + state = RES_QP_RESERVED; + break; + case RES_QP_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, + qp->local_qpn, 2, + MLX4_CMD_2RST_QP, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_qps: failed to move slave %d qpn %d to reset\n", + slave, qp->local_qpn); + atomic_dec(&qp->rcq->ref_count); + atomic_dec(&qp->scq->ref_count); + atomic_dec(&qp->mtt->ref_count); + if (qp->srq) + atomic_dec(&qp->srq->ref_count); + state = RES_QP_MAPPED; + break; + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_srqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *srq_list = + &tracker->slave_list[slave].res_list[RES_SRQ]; + struct res_srq *srq; + struct res_srq *tmp; + int state; + u64 in_param; + LIST_HEAD(tlist); + int srqn; + int err; + + err = move_all_busy(dev, slave, RES_SRQ); + if (err) + mlx4_warn(dev, "rem_slave_srqs: Could not move all srqs - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(srq, tmp, srq_list, 
com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (srq->com.owner == slave) { + srqn = srq->com.res_id; + state = srq->com.from_state; + while (state != 0) { + switch (state) { + case RES_SRQ_ALLOCATED: + __mlx4_srq_free_icm(dev, srqn); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&srq->com.node, + &tracker->res_tree[RES_SRQ]); + list_del(&srq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + mlx4_release_resource(dev, slave, + RES_SRQ, 1, 0); + kfree(srq); + state = 0; + break; + + case RES_SRQ_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, srqn, 1, + MLX4_CMD_HW2SW_SRQ, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_srqs: failed to move slave %d srq %d to SW ownership\n", + slave, srqn); + + atomic_dec(&srq->mtt->ref_count); + if (srq->cq) + atomic_dec(&srq->cq->ref_count); + state = RES_SRQ_ALLOCATED; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_cqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *cq_list = + &tracker->slave_list[slave].res_list[RES_CQ]; + struct res_cq *cq; + struct res_cq *tmp; + int state; + u64 in_param; + LIST_HEAD(tlist); + int cqn; + int err; + + err = move_all_busy(dev, slave, RES_CQ); + if (err) + mlx4_warn(dev, "rem_slave_cqs: Could not move all cqs - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(cq, tmp, cq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (cq->com.owner == slave && !atomic_read(&cq->ref_count)) { + cqn = cq->com.res_id; + state = cq->com.from_state; + while (state != 0) { + switch (state) { + case RES_CQ_ALLOCATED: + __mlx4_cq_free_icm(dev, cqn); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&cq->com.node, + &tracker->res_tree[RES_CQ]); + list_del(&cq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + mlx4_release_resource(dev, slave, + RES_CQ, 1, 0); + kfree(cq); + state = 0; + break; + + case RES_CQ_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, cqn, 1, + MLX4_CMD_HW2SW_CQ, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_cqs: failed to move slave %d cq %d to SW ownership\n", + slave, cqn); + atomic_dec(&cq->mtt->ref_count); + state = RES_CQ_ALLOCATED; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_mrs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mpt_list = + &tracker->slave_list[slave].res_list[RES_MPT]; + struct res_mpt *mpt; + struct res_mpt *tmp; + int state; + u64 in_param; + LIST_HEAD(tlist); + int mptn; + int err; + + err = move_all_busy(dev, slave, RES_MPT); + if (err) + mlx4_warn(dev, "rem_slave_mrs: Could not move all mpts - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(mpt, tmp, mpt_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (mpt->com.owner == slave) { + mptn = mpt->com.res_id; + state = mpt->com.from_state; + while (state != 0) { + switch (state) { + case RES_MPT_RESERVED: + __mlx4_mpt_release(dev, mpt->key); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&mpt->com.node, + &tracker->res_tree[RES_MPT]); + list_del(&mpt->com.list); + 
spin_unlock_irq(mlx4_tlock(dev)); + mlx4_release_resource(dev, slave, + RES_MPT, 1, 0); + kfree(mpt); + state = 0; + break; + + case RES_MPT_MAPPED: + __mlx4_mpt_free_icm(dev, mpt->key); + state = RES_MPT_RESERVED; + break; + + case RES_MPT_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, mptn, 0, + MLX4_CMD_HW2SW_MPT, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_mrs: failed to move slave %d mpt %d to SW ownership\n", + slave, mptn); + if (mpt->mtt) + atomic_dec(&mpt->mtt->ref_count); + state = RES_MPT_MAPPED; + break; + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_mtts(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct list_head *mtt_list = + &tracker->slave_list[slave].res_list[RES_MTT]; + struct res_mtt *mtt; + struct res_mtt *tmp; + int state; + LIST_HEAD(tlist); + int base; + int err; + + err = move_all_busy(dev, slave, RES_MTT); + if (err) + mlx4_warn(dev, "rem_slave_mtts: Could not move all mtts - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(mtt, tmp, mtt_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (mtt->com.owner == slave) { + base = mtt->com.res_id; + state = mtt->com.from_state; + while (state != 0) { + switch (state) { + case RES_MTT_ALLOCATED: + __mlx4_free_mtt_range(dev, base, + mtt->order); + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&mtt->com.node, + &tracker->res_tree[RES_MTT]); + list_del(&mtt->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + mlx4_release_resource(dev, slave, RES_MTT, + 1 << mtt->order, 0); + kfree(mtt); + state = 0; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static int mlx4_do_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + struct res_fs_rule *mirr_rule; + u64 reg_id; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (!fs_rule->mirr_mbox) { + mlx4_err(dev, "rule mirroring mailbox is null\n"); + return -EINVAL; + } + memcpy(mailbox->buf, fs_rule->mirr_mbox, fs_rule->mirr_mbox_size); + err = mlx4_cmd_imm(dev, mailbox->dma, ®_id, fs_rule->mirr_mbox_size >> 2, 0, + MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + mlx4_free_cmd_mailbox(dev, mailbox); + + if (err) + goto err; + + err = add_res_range(dev, fs_rule->com.owner, reg_id, 1, RES_FS_RULE, fs_rule->qpn); + if (err) + goto err_detach; + + err = get_res(dev, fs_rule->com.owner, reg_id, RES_FS_RULE, &mirr_rule); + if (err) + goto err_rem; + + fs_rule->mirr_rule_id = reg_id; + mirr_rule->mirr_rule_id = 0; + mirr_rule->mirr_mbox_size = 0; + mirr_rule->mirr_mbox = NULL; + put_res(dev, fs_rule->com.owner, reg_id, RES_FS_RULE); + + return 0; +err_rem: + rem_res_range(dev, fs_rule->com.owner, reg_id, 1, RES_FS_RULE, 0); +err_detach: + mlx4_cmd(dev, reg_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); +err: + return err; +} + +static int mlx4_mirror_fs_rules(struct mlx4_dev *dev, bool bond) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct rb_root *root = &tracker->res_tree[RES_FS_RULE]; + struct rb_node *p; + struct res_fs_rule *fs_rule; + int err = 0; + 
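+	/* Collect the affected rules in a first pass over the tree and
+	 * only then mirror or un-mirror them, since doing so registers
+	 * or removes RES_FS_RULE entries in the same tracker.
+	 */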
LIST_HEAD(mirr_list); + + for (p = rb_first(root); p; p = rb_next(p)) { + fs_rule = rb_entry(p, struct res_fs_rule, com.node); + if ((bond && fs_rule->mirr_mbox_size) || + (!bond && !fs_rule->mirr_mbox_size)) + list_add_tail(&fs_rule->mirr_list, &mirr_list); + } + + list_for_each_entry(fs_rule, &mirr_list, mirr_list) { + if (bond) + err += mlx4_do_mirror_rule(dev, fs_rule); + else + err += mlx4_undo_mirror_rule(dev, fs_rule); + } + return err; +} + +int mlx4_bond_fs_rules(struct mlx4_dev *dev) +{ + return mlx4_mirror_fs_rules(dev, true); +} + +int mlx4_unbond_fs_rules(struct mlx4_dev *dev) +{ + return mlx4_mirror_fs_rules(dev, false); +} + +static void rem_slave_fs_rule(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct list_head *fs_rule_list = + &tracker->slave_list[slave].res_list[RES_FS_RULE]; + struct res_fs_rule *fs_rule; + struct res_fs_rule *tmp; + int state; + u64 base; + int err; + + err = move_all_busy(dev, slave, RES_FS_RULE); + if (err) + mlx4_warn(dev, "rem_slave_fs_rule: Could not move all mtts to busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(fs_rule, tmp, fs_rule_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (fs_rule->com.owner == slave) { + base = fs_rule->com.res_id; + state = fs_rule->com.from_state; + while (state != 0) { + switch (state) { + case RES_FS_RULE_ALLOCATED: + /* detach rule */ + err = mlx4_cmd(dev, base, 0, 0, + MLX4_QP_FLOW_STEERING_DETACH, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&fs_rule->com.node, + &tracker->res_tree[RES_FS_RULE]); + list_del(&fs_rule->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(fs_rule->mirr_mbox); + kfree(fs_rule); + state = 0; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_eqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *eq_list = + &tracker->slave_list[slave].res_list[RES_EQ]; + struct res_eq *eq; + struct res_eq *tmp; + int err; + int state; + LIST_HEAD(tlist); + int eqn; + + err = move_all_busy(dev, slave, RES_EQ); + if (err) + mlx4_warn(dev, "rem_slave_eqs: Could not move all eqs - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(eq, tmp, eq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (eq->com.owner == slave) { + eqn = eq->com.res_id; + state = eq->com.from_state; + while (state != 0) { + switch (state) { + case RES_EQ_RESERVED: + spin_lock_irq(mlx4_tlock(dev)); + rb_erase(&eq->com.node, + &tracker->res_tree[RES_EQ]); + list_del(&eq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(eq); + state = 0; + break; + + case RES_EQ_HW: + err = mlx4_cmd(dev, slave, eqn & 0x3ff, + 1, MLX4_CMD_HW2SW_EQ, + MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) + mlx4_dbg(dev, "rem_slave_eqs: failed to move slave %d eqs %d to SW ownership\n", + slave, eqn & 0x3ff); + atomic_dec(&eq->mtt->ref_count); + state = RES_EQ_RESERVED; + break; + + default: + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_counters(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = 
&priv->mfunc.master.res_tracker; + struct list_head *counter_list = + &tracker->slave_list[slave].res_list[RES_COUNTER]; + struct res_counter *counter; + struct res_counter *tmp; + int err; + int *counters_arr = NULL; + int i, j; + + err = move_all_busy(dev, slave, RES_COUNTER); + if (err) + mlx4_warn(dev, "rem_slave_counters: Could not move all counters - too busy for slave %d\n", + slave); + + counters_arr = kmalloc_array(dev->caps.max_counters, + sizeof(*counters_arr), GFP_KERNEL); + if (!counters_arr) + return; + + do { + i = 0; + j = 0; + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(counter, tmp, counter_list, com.list) { + if (counter->com.owner == slave) { + counters_arr[i++] = counter->com.res_id; + rb_erase(&counter->com.node, + &tracker->res_tree[RES_COUNTER]); + list_del(&counter->com.list); + kfree(counter); + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + while (j < i) { + __mlx4_counter_free(dev, counters_arr[j++]); + mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0); + } + } while (i); + + kfree(counters_arr); +} + +static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *xrcdn_list = + &tracker->slave_list[slave].res_list[RES_XRCD]; + struct res_xrcdn *xrcd; + struct res_xrcdn *tmp; + int err; + int xrcdn; + + err = move_all_busy(dev, slave, RES_XRCD); + if (err) + mlx4_warn(dev, "rem_slave_xrcdns: Could not move all xrcdns - too busy for slave %d\n", + slave); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(xrcd, tmp, xrcdn_list, com.list) { + if (xrcd->com.owner == slave) { + xrcdn = xrcd->com.res_id; + rb_erase(&xrcd->com.node, &tracker->res_tree[RES_XRCD]); + list_del(&xrcd->com.list); + kfree(xrcd); + __mlx4_xrcd_free(dev, xrcdn); + } + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + mlx4_reset_roce_gids(dev, slave); + mutex_lock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); + rem_slave_vlans(dev, slave); + rem_slave_macs(dev, slave); + rem_slave_fs_rule(dev, slave); + rem_slave_qps(dev, slave); + rem_slave_srqs(dev, slave); + rem_slave_cqs(dev, slave); + rem_slave_mrs(dev, slave); + rem_slave_eqs(dev, slave); + rem_slave_mtts(dev, slave); + rem_slave_counters(dev, slave); + rem_slave_xrcdns(dev, slave); + mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); +} + +static void update_qos_vpp(struct mlx4_update_qp_context *ctx, + struct mlx4_vf_immed_vlan_work *work) +{ + ctx->qp_mask |= cpu_to_be64(1ULL << MLX4_UPD_QP_MASK_QOS_VPP); + ctx->qp_context.qos_vport = work->qos_vport; +} + +void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work) +{ + struct mlx4_vf_immed_vlan_work *work = + container_of(_work, struct mlx4_vf_immed_vlan_work, work); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_update_qp_context *upd_context; + struct mlx4_dev *dev = &work->priv->dev; + struct mlx4_resource_tracker *tracker = + &work->priv->mfunc.master.res_tracker; + struct list_head *qp_list = + &tracker->slave_list[work->slave].res_list[RES_QP]; + struct res_qp *qp; + struct res_qp *tmp; + u64 qp_path_mask_vlan_ctrl = + ((1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_UNTAGGED) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_1P) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_TAGGED) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_UNTAGGED) | + (1ULL << 
MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_1P) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_TAGGED)); + + u64 qp_path_mask = ((1ULL << MLX4_UPD_QP_PATH_MASK_VLAN_INDEX) | + (1ULL << MLX4_UPD_QP_PATH_MASK_FVL) | + (1ULL << MLX4_UPD_QP_PATH_MASK_CV) | + (1ULL << MLX4_UPD_QP_PATH_MASK_SV) | + (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN) | + (1ULL << MLX4_UPD_QP_PATH_MASK_FEUP) | + (1ULL << MLX4_UPD_QP_PATH_MASK_FVL_RX) | + (1ULL << MLX4_UPD_QP_PATH_MASK_SCHED_QUEUE)); + + int err; + int port, errors = 0; + u8 vlan_control; + + if (mlx4_is_slave(dev)) { + mlx4_warn(dev, "Trying to update-qp in slave %d\n", + work->slave); + goto out; + } + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + goto out; + if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE) /* block all */ + vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; + else if (!work->vlan_id) + vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED; + else if (work->vlan_proto == htons(ETH_P_8021AD)) + vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + else /* vst 802.1Q */ + vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED | + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED; + + upd_context = mailbox->buf; + upd_context->qp_mask = cpu_to_be64(1ULL << MLX4_UPD_QP_MASK_VSD); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(qp, tmp, qp_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (qp->com.owner == work->slave) { + if (qp->com.from_state != RES_QP_HW || + !qp->sched_queue || /* no INIT2RTR trans yet */ + mlx4_is_qp_reserved(dev, qp->local_qpn) || + qp->qpc_flags & (1 << MLX4_RSS_QPC_FLAG_OFFSET)) { + spin_lock_irq(mlx4_tlock(dev)); + continue; + } + port = (qp->sched_queue >> 6 & 1) + 1; + if (port != work->port) { + spin_lock_irq(mlx4_tlock(dev)); + continue; + } + if (MLX4_QP_ST_RC == ((qp->qpc_flags >> 16) & 0xff)) + upd_context->primary_addr_path_mask = cpu_to_be64(qp_path_mask); + else + upd_context->primary_addr_path_mask = + cpu_to_be64(qp_path_mask | qp_path_mask_vlan_ctrl); + if (work->vlan_id == MLX4_VGT) { + upd_context->qp_context.param3 = qp->param3; + upd_context->qp_context.pri_path.vlan_control = qp->vlan_control; + upd_context->qp_context.pri_path.fvl_rx = qp->fvl_rx; + upd_context->qp_context.pri_path.vlan_index = qp->vlan_index; + upd_context->qp_context.pri_path.fl = qp->pri_path_fl; + upd_context->qp_context.pri_path.feup = qp->feup; + upd_context->qp_context.pri_path.sched_queue = + qp->sched_queue; + } else { + upd_context->qp_context.param3 = qp->param3 & ~cpu_to_be32(MLX4_STRIP_VLAN); + upd_context->qp_context.pri_path.vlan_control = vlan_control; + upd_context->qp_context.pri_path.vlan_index = work->vlan_ix; + upd_context->qp_context.pri_path.fvl_rx = + qp->fvl_rx | MLX4_FVL_RX_FORCE_ETH_VLAN; + upd_context->qp_context.pri_path.fl = + qp->pri_path_fl | MLX4_FL_ETH_HIDE_CQE_VLAN; + if (work->vlan_proto == htons(ETH_P_8021AD)) + upd_context->qp_context.pri_path.fl |= MLX4_FL_SV; + else + upd_context->qp_context.pri_path.fl |= MLX4_FL_CV; + upd_context->qp_context.pri_path.feup = + qp->feup | MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN; + 
upd_context->qp_context.pri_path.sched_queue = + qp->sched_queue & 0xC7; + upd_context->qp_context.pri_path.sched_queue |= + ((work->qos & 0x7) << 3); + + if (dev->caps.flags2 & + MLX4_DEV_CAP_FLAG2_QOS_VPP) + update_qos_vpp(upd_context, work); + } + + err = mlx4_cmd(dev, mailbox->dma, + qp->local_qpn & 0xffffff, + 0, MLX4_CMD_UPDATE_QP, + MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); + if (err) { + mlx4_info(dev, "UPDATE_QP failed for slave %d, port %d, qpn %d (%d)\n", + work->slave, port, qp->local_qpn, err); + errors++; + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); + mlx4_free_cmd_mailbox(dev, mailbox); + + if (errors) + mlx4_err(dev, "%d UPDATE_QP failures for slave %d, port %d\n", + errors, work->slave, work->port); + + /* unregister previous vlan_id if needed and we had no errors + * while updating the QPs + */ + if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_VLAN && !errors && + NO_INDX != work->orig_vlan_ix) + __mlx4_unregister_vlan(&work->priv->dev, work->port, + work->orig_vlan_id); +out: + kfree(work); + return; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/sense.c b/drivers/net/ethernet/mellanox/mlx4/sense.c new file mode 100644 index 0000000..094773d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/sense.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include + +#include + +#include "mlx4.h" + +int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, + enum mlx4_port_type *type) +{ + u64 out_param; + int err = 0; + + err = mlx4_cmd_imm(dev, 0, &out_param, port, 0, + MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (err) { + mlx4_err(dev, "Sense command failed for port: %d\n", port); + return err; + } + + if (out_param > 2) { + mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", out_param); + return -EINVAL; + } + + *type = out_param; + return 0; +} + +void mlx4_do_sense_ports(struct mlx4_dev *dev, + enum mlx4_port_type *stype, + enum mlx4_port_type *defaults) +{ + struct mlx4_sense *sense = &mlx4_priv(dev)->sense; + int err; + int i; + + for (i = 1; i <= dev->caps.num_ports; i++) { + stype[i - 1] = 0; + if (sense->do_sense_port[i] && sense->sense_allowed[i] && + dev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) { + err = mlx4_SENSE_PORT(dev, i, &stype[i - 1]); + if (err) + stype[i - 1] = defaults[i - 1]; + } else + stype[i - 1] = defaults[i - 1]; + } + + /* + * If sensed nothing, remain in current configuration. + */ + for (i = 0; i < dev->caps.num_ports; i++) + stype[i] = stype[i] ? stype[i] : defaults[i]; + +} + +static void mlx4_sense_port(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mlx4_sense *sense = container_of(delay, struct mlx4_sense, + sense_poll); + struct mlx4_dev *dev = sense->dev; + struct mlx4_priv *priv = mlx4_priv(dev); + enum mlx4_port_type stype[MLX4_MAX_PORTS]; + + mutex_lock(&priv->port_mutex); + mlx4_do_sense_ports(dev, stype, &dev->caps.port_type[1]); + + if (mlx4_check_port_params(dev, stype)) + goto sense_again; + + if (mlx4_change_port_types(dev, stype)) + mlx4_err(dev, "Failed to change port_types\n"); + +sense_again: + mutex_unlock(&priv->port_mutex); + queue_delayed_work(mlx4_wq , &sense->sense_poll, + round_jiffies_relative(MLX4_SENSE_RANGE)); +} + +void mlx4_start_sense(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_sense *sense = &priv->sense; + + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) + return; + + queue_delayed_work(mlx4_wq , &sense->sense_poll, + round_jiffies_relative(MLX4_SENSE_RANGE)); +} + +void mlx4_stop_sense(struct mlx4_dev *dev) +{ + cancel_delayed_work_sync(&mlx4_priv(dev)->sense.sense_poll); +} + +void mlx4_sense_init(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_sense *sense = &priv->sense; + int port; + + sense->dev = dev; + for (port = 1; port <= dev->caps.num_ports; port++) + sense->do_sense_port[port] = 1; + + INIT_DEFERRABLE_WORK(&sense->sense_poll, mlx4_sense_port); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c new file mode 100644 index 0000000..cbe4d97 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/srq.c @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include +#include + +#include "mlx4.h" +#include "icm.h" + +void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + struct mlx4_srq *srq; + + rcu_read_lock(); + srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1)); + rcu_read_unlock(); + if (srq) + refcount_inc(&srq->refcount); + else { + mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn); + return; + } + + srq->event(srq, event_type); + + if (refcount_dec_and_test(&srq->refcount)) + complete(&srq->free); +} + +static int mlx4_SW2HW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int srq_num) +{ + return mlx4_cmd(dev, mailbox->dma, srq_num, 0, + MLX4_CMD_SW2HW_SRQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_WRAPPED); +} + +static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int srq_num) +{ + return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, srq_num, + mailbox ? 
0 : 1, MLX4_CMD_HW2SW_SRQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark) +{ + return mlx4_cmd(dev, limit_watermark, srq_num, 0, MLX4_CMD_ARM_SRQ, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); +} + +static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int srq_num) +{ + return mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0, MLX4_CMD_QUERY_SRQ, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); +} + +int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + int err; + + + *srqn = mlx4_bitmap_alloc(&srq_table->bitmap); + if (*srqn == -1) + return -ENOMEM; + + err = mlx4_table_get(dev, &srq_table->table, *srqn); + if (err) + goto err_out; + + err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn); + if (err) + goto err_put; + return 0; + +err_put: + mlx4_table_put(dev, &srq_table->table, *srqn); + +err_out: + mlx4_bitmap_free(&srq_table->bitmap, *srqn, MLX4_NO_RR); + return err; +} + +static int mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, RES_SRQ, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); + if (!err) + *srqn = get_param_l(&out_param); + + return err; + } + return __mlx4_srq_alloc_icm(dev, srqn); +} + +void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + + mlx4_table_put(dev, &srq_table->cmpt_table, srqn); + mlx4_table_put(dev, &srq_table->table, srqn); + mlx4_bitmap_free(&srq_table->bitmap, srqn, MLX4_NO_RR); +} + +static void mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn) +{ + u64 in_param = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, srqn); + if (mlx4_cmd(dev, in_param, RES_SRQ, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED)) + mlx4_warn(dev, "Failed freeing cq:%d\n", srqn); + return; + } + __mlx4_srq_free_icm(dev, srqn); +} + +int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, + struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_srq_context *srq_context; + u64 mtt_addr; + int err; + + err = mlx4_srq_alloc_icm(dev, &srq->srqn); + if (err) + return err; + + spin_lock_irq(&srq_table->lock); + err = radix_tree_insert(&srq_table->tree, srq->srqn, srq); + spin_unlock_irq(&srq_table->lock); + if (err) + goto err_icm; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto err_radix; + } + + srq_context = mailbox->buf; + srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) | + srq->srqn); + srq_context->logstride = srq->wqe_shift - 4; + srq_context->xrcd = cpu_to_be16(xrcd); + srq_context->pg_offset_cqn = cpu_to_be32(cqn & 0xffffff); + srq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; + + mtt_addr = mlx4_mtt_addr(dev, mtt); + srq_context->mtt_base_addr_h = mtt_addr >> 32; + srq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff); + srq_context->pd = cpu_to_be32(pdn); + srq_context->db_rec_addr = cpu_to_be64(db_rec); + + err = mlx4_SW2HW_SRQ(dev, mailbox, srq->srqn); + mlx4_free_cmd_mailbox(dev, mailbox); + if (err) + goto err_radix; + + refcount_set(&srq->refcount, 1); + init_completion(&srq->free); + + return 0; 
+ +err_radix: + spin_lock_irq(&srq_table->lock); + radix_tree_delete(&srq_table->tree, srq->srqn); + spin_unlock_irq(&srq_table->lock); + +err_icm: + mlx4_srq_free_icm(dev, srq->srqn); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_srq_alloc); + +void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + int err; + + err = mlx4_HW2SW_SRQ(dev, NULL, srq->srqn); + if (err) + mlx4_warn(dev, "HW2SW_SRQ failed (%d) for SRQN %06x\n", err, srq->srqn); + + spin_lock_irq(&srq_table->lock); + radix_tree_delete(&srq_table->tree, srq->srqn); + spin_unlock_irq(&srq_table->lock); + + if (refcount_dec_and_test(&srq->refcount)) + complete(&srq->free); + wait_for_completion(&srq->free); + + mlx4_srq_free_icm(dev, srq->srqn); +} +EXPORT_SYMBOL_GPL(mlx4_srq_free); + +int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark) +{ + return mlx4_ARM_SRQ(dev, srq->srqn, limit_watermark); +} +EXPORT_SYMBOL_GPL(mlx4_srq_arm); + +int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_srq_context *srq_context; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + srq_context = mailbox->buf; + + err = mlx4_QUERY_SRQ(dev, mailbox, srq->srqn); + if (err) + goto err_out; + *limit_watermark = be16_to_cpu(srq_context->limit_watermark); + +err_out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_srq_query); + +int mlx4_init_srq_table(struct mlx4_dev *dev) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + int err; + + spin_lock_init(&srq_table->lock); + INIT_RADIX_TREE(&srq_table->tree, GFP_ATOMIC); + if (mlx4_is_slave(dev)) + return 0; + + err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs, + dev->caps.num_srqs - 1, dev->caps.reserved_srqs, 0); + if (err) + return err; + + return 0; +} + +void mlx4_cleanup_srq_table(struct mlx4_dev *dev) +{ + if (mlx4_is_slave(dev)) + return; + mlx4_bitmap_cleanup(&mlx4_priv(dev)->srq_table.bitmap); +} + +struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + struct mlx4_srq *srq; + + rcu_read_lock(); + srq = radix_tree_lookup(&srq_table->tree, + srqn & (dev->caps.num_srqs - 1)); + rcu_read_unlock(); + + return srq; +} +EXPORT_SYMBOL_GPL(mlx4_srq_lookup); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig new file mode 100644 index 0000000..c032319 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -0,0 +1,87 @@ +# +# Mellanox driver configuration +# + +config MLX5_CORE + tristate "Mellanox Technologies ConnectX-4 and Connect-IB core driver" + depends on MAY_USE_DEVLINK + depends on PCI + imply PTP_1588_CLOCK + default n + ---help--- + Core driver for low level functionality of the ConnectX-4 and + Connect-IB cards by Mellanox Technologies. + +config MLX5_ACCEL + bool + +config MLX5_FPGA + bool "Mellanox Technologies Innova support" + depends on MLX5_CORE + select MLX5_ACCEL + ---help--- + Build support for the Innova family of network cards by Mellanox + Technologies. Innova network cards are comprised of a ConnectX chip + and an FPGA chip on one board. If you select this option, the + mlx5_core driver will include the Innova FPGA core and allow building + sandbox-specific client drivers. 
+ +config MLX5_CORE_EN + bool "Mellanox Technologies ConnectX-4 Ethernet support" + depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE + depends on IPV6=y || IPV6=n || MLX5_CORE=m + default n + ---help--- + Ethernet support in Mellanox Technologies ConnectX-4 NIC. + +config MLX5_MPFS + bool "Mellanox Technologies MLX5 MPFS support" + depends on MLX5_CORE_EN + default y + ---help--- + Mellanox Technologies Ethernet Multi-Physical Function Switch (MPFS) + support in ConnectX NIC. MPFS is required when a multi-PF configuration + is enabled, to allow passing user-configured unicast MAC addresses to the + requesting PF. + +config MLX5_ESWITCH + bool "Mellanox Technologies MLX5 SRIOV E-Switch support" + depends on MLX5_CORE_EN && NET_SWITCHDEV + default y + ---help--- + Mellanox Technologies Ethernet SRIOV E-Switch support in ConnectX NIC. + E-Switch provides internal SRIOV packet steering and switching for the + enabled VFs and PF in two available modes: + Legacy SRIOV mode (L2 MAC VLAN steering based). + Switchdev mode (eswitch offloads). + +config MLX5_CORE_EN_DCB + bool "Data Center Bridging (DCB) Support" + default y + depends on MLX5_CORE_EN && DCB + ---help--- + Say Y here if you want to use Data Center Bridging (DCB) in the + driver. + If set to N, you will not be able to configure QoS and rate-limit attributes. + This option depends on the kernel's DCB support. + + If unsure, set to Y. + +config MLX5_CORE_IPOIB + bool "Mellanox Technologies ConnectX-4 IPoIB offloads support" + depends on MLX5_CORE_EN + default n + ---help--- + MLX5 IPoIB offloads & acceleration support. + +config MLX5_EN_IPSEC + bool "IPSec XFRM cryptography-offload acceleration" + depends on MLX5_ACCEL + depends on MLX5_CORE_EN + depends on XFRM_OFFLOAD + depends on INET_ESP_OFFLOAD || INET6_ESP_OFFLOAD + default n + ---help--- + Build support for IPsec cryptography-offload acceleration in the NIC. + Note: Support for hardware with this capability needs to be selected + for this option to become available.
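The Kconfig entries above only declare the build options and their dependencies; they do not enable anything by themselves. As a minimal, illustrative .config fragment (not part of this patch, and only one possible selection), enabling the Ethernet driver together with the E-Switch, DCB and IPsec-offload options declared above could look like this:

    CONFIG_MLX5_CORE=m
    CONFIG_MLX5_CORE_EN=y
    CONFIG_MLX5_ESWITCH=y
    CONFIG_MLX5_CORE_EN_DCB=y
    CONFIG_MLX5_FPGA=y
    CONFIG_MLX5_EN_IPSEC=y
    # MLX5_ACCEL is selected automatically by MLX5_FPGA; NET_SWITCHDEV, DCB,
    # XFRM_OFFLOAD and INET_ESP_OFFLOAD (or INET6_ESP_OFFLOAD) must also be
    # enabled in the kernel for the entries above to be selectable.
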
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile new file mode 100644 index 0000000..c805769 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MLX5_CORE) += mlx5_core.o +subdir-ccflags-y += -I$(src) + +mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ + health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \ + mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o \ + fs_counters.o rl.o lag.o dev.o wq.o lib/gid.o lib/clock.o \ + diag/fs_tracepoint.o + +mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o + +mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ + fpga/ipsec.o + +mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ + en_tx.o en_rx.o en_dim.o en_txrx.o en_stats.o vxlan.o \ + en_arfs.o en_fs_ethtool.o en_selftest.o + +mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o + +mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o en_rep.o en_tc.o + +mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o + +mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib_vlan.o + +mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \ + en_accel/ipsec_stats.o + +CFLAGS_tracepoint.o := -I$(src) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile new file mode 100644 index 0000000..d8e1711 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile @@ -0,0 +1 @@ +subdir-ccflags-y += -I$(src)/.. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c new file mode 100644 index 0000000..9f1b193 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include + +#include "accel/ipsec.h" +#include "mlx5_core.h" +#include "fpga/ipsec.h" + +u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev) +{ + return mlx5_fpga_ipsec_device_caps(mdev); +} +EXPORT_SYMBOL_GPL(mlx5_accel_ipsec_device_caps); + +unsigned int mlx5_accel_ipsec_counters_count(struct mlx5_core_dev *mdev) +{ + return mlx5_fpga_ipsec_counters_count(mdev); +} + +int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, + unsigned int count) +{ + return mlx5_fpga_ipsec_counters_read(mdev, counters, count); +} + +void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *xfrm, + const __be32 saddr[4], + const __be32 daddr[4], + const __be32 spi, bool is_ipv6) +{ + return mlx5_fpga_ipsec_create_sa_ctx(mdev, xfrm, saddr, daddr, + spi, is_ipv6); +} + +void mlx5_accel_esp_free_hw_context(void *context) +{ + mlx5_fpga_ipsec_delete_sa_ctx(context); +} + +int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev) +{ + return mlx5_fpga_ipsec_init(mdev); +} + +void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev) +{ + mlx5_fpga_ipsec_cleanup(mdev); +} + +struct mlx5_accel_esp_xfrm * +mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags) +{ + struct mlx5_accel_esp_xfrm *xfrm; + + xfrm = mlx5_fpga_esp_create_xfrm(mdev, attrs, flags); + if (IS_ERR(xfrm)) + return xfrm; + + xfrm->mdev = mdev; + return xfrm; +} +EXPORT_SYMBOL_GPL(mlx5_accel_esp_create_xfrm); + +void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) +{ + mlx5_fpga_esp_destroy_xfrm(xfrm); +} +EXPORT_SYMBOL_GPL(mlx5_accel_esp_destroy_xfrm); + +int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + return mlx5_fpga_esp_modify_xfrm(xfrm, attrs); +} +EXPORT_SYMBOL_GPL(mlx5_accel_esp_modify_xfrm); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h new file mode 100644 index 0000000..024dbd2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef __MLX5_ACCEL_IPSEC_H__ +#define __MLX5_ACCEL_IPSEC_H__ + +#include +#include + +#ifdef CONFIG_MLX5_ACCEL + +#define MLX5_IPSEC_DEV(mdev) (mlx5_accel_ipsec_device_caps(mdev) & \ + MLX5_ACCEL_IPSEC_CAP_DEVICE) + +unsigned int mlx5_accel_ipsec_counters_count(struct mlx5_core_dev *mdev); +int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, + unsigned int count); + +void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *xfrm, + const __be32 saddr[4], + const __be32 daddr[4], + const __be32 spi, bool is_ipv6); +void mlx5_accel_esp_free_hw_context(void *context); + +int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev); +void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev); + +#else + +#define MLX5_IPSEC_DEV(mdev) false + +static inline void * +mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *xfrm, + const __be32 saddr[4], + const __be32 daddr[4], + const __be32 spi, bool is_ipv6) +{ + return NULL; +} + +static inline void mlx5_accel_esp_free_hw_context(void *context) +{ +} + +static inline int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev) +{ + return 0; +} + +static inline void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev) +{ +} + +#endif + +#endif /* __MLX5_ACCEL_IPSEC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c new file mode 100644 index 0000000..323ffe8 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx5_core.h" + +struct mlx5_db_pgdir { + struct list_head list; + unsigned long *bitmap; + __be32 *db_page; + dma_addr_t db_dma; +}; + +/* Handling for queue buffers -- we allocate a bunch of memory and + * register it in a memory region at HCA virtual address 0. 
+ */ + +static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev, + size_t size, dma_addr_t *dma_handle, + int node) +{ + struct mlx5_priv *priv = &dev->priv; + int original_node; + void *cpu_handle; + + mutex_lock(&priv->alloc_mutex); + original_node = dev_to_node(&dev->pdev->dev); + set_dev_node(&dev->pdev->dev, node); + cpu_handle = dma_zalloc_coherent(&dev->pdev->dev, size, + dma_handle, GFP_KERNEL); + set_dev_node(&dev->pdev->dev, original_node); + mutex_unlock(&priv->alloc_mutex); + return cpu_handle; +} + +int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, + struct mlx5_frag_buf *buf, int node) +{ + dma_addr_t t; + + buf->size = size; + buf->npages = 1; + buf->page_shift = (u8)get_order(size) + PAGE_SHIFT; + + buf->frags = kzalloc(sizeof(*buf->frags), GFP_KERNEL); + if (!buf->frags) + return -ENOMEM; + + buf->frags->buf = mlx5_dma_zalloc_coherent_node(dev, size, + &t, node); + if (!buf->frags->buf) + goto err_out; + + buf->frags->map = t; + + while (t & ((1 << buf->page_shift) - 1)) { + --buf->page_shift; + buf->npages *= 2; + } + + return 0; +err_out: + kfree(buf->frags); + return -ENOMEM; +} + +int mlx5_buf_alloc(struct mlx5_core_dev *dev, + int size, struct mlx5_frag_buf *buf) +{ + return mlx5_buf_alloc_node(dev, size, buf, dev->priv.numa_node); +} +EXPORT_SYMBOL(mlx5_buf_alloc); + +void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf) +{ + dma_free_coherent(&dev->pdev->dev, buf->size, buf->frags->buf, + buf->frags->map); + + kfree(buf->frags); +} +EXPORT_SYMBOL_GPL(mlx5_buf_free); + +int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, + struct mlx5_frag_buf *buf, int node) +{ + int i; + + buf->size = size; + buf->npages = 1 << get_order(size); + buf->page_shift = PAGE_SHIFT; + buf->frags = kcalloc(buf->npages, sizeof(struct mlx5_buf_list), + GFP_KERNEL); + if (!buf->frags) + goto err_out; + + for (i = 0; i < buf->npages; i++) { + struct mlx5_buf_list *frag = &buf->frags[i]; + int frag_sz = min_t(int, size, PAGE_SIZE); + + frag->buf = mlx5_dma_zalloc_coherent_node(dev, frag_sz, + &frag->map, node); + if (!frag->buf) + goto err_free_buf; + if (frag->map & ((1 << buf->page_shift) - 1)) { + dma_free_coherent(&dev->pdev->dev, frag_sz, + buf->frags[i].buf, buf->frags[i].map); + mlx5_core_warn(dev, "unexpected map alignment: %pad, page_shift=%d\n", + &frag->map, buf->page_shift); + goto err_free_buf; + } + size -= frag_sz; + } + + return 0; + +err_free_buf: + while (i--) + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, buf->frags[i].buf, + buf->frags[i].map); + kfree(buf->frags); +err_out: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(mlx5_frag_buf_alloc_node); + +void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf) +{ + int size = buf->size; + int i; + + for (i = 0; i < buf->npages; i++) { + int frag_sz = min_t(int, size, PAGE_SIZE); + + dma_free_coherent(&dev->pdev->dev, frag_sz, buf->frags[i].buf, + buf->frags[i].map); + size -= frag_sz; + } + kfree(buf->frags); +} +EXPORT_SYMBOL_GPL(mlx5_frag_buf_free); + +static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev, + int node) +{ + u32 db_per_page = PAGE_SIZE / cache_line_size(); + struct mlx5_db_pgdir *pgdir; + + pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL); + if (!pgdir) + return NULL; + + pgdir->bitmap = kcalloc(BITS_TO_LONGS(db_per_page), + sizeof(unsigned long), + GFP_KERNEL); + + if (!pgdir->bitmap) { + kfree(pgdir); + return NULL; + } + + bitmap_fill(pgdir->bitmap, db_per_page); + + pgdir->db_page = mlx5_dma_zalloc_coherent_node(dev, 
PAGE_SIZE, + &pgdir->db_dma, node); + if (!pgdir->db_page) { + kfree(pgdir->bitmap); + kfree(pgdir); + return NULL; + } + + return pgdir; +} + +static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir, + struct mlx5_db *db) +{ + u32 db_per_page = PAGE_SIZE / cache_line_size(); + int offset; + int i; + + i = find_first_bit(pgdir->bitmap, db_per_page); + if (i >= db_per_page) + return -ENOMEM; + + __clear_bit(i, pgdir->bitmap); + + db->u.pgdir = pgdir; + db->index = i; + offset = db->index * cache_line_size(); + db->db = pgdir->db_page + offset / sizeof(*pgdir->db_page); + db->dma = pgdir->db_dma + offset; + + db->db[0] = 0; + db->db[1] = 0; + + return 0; +} + +int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, int node) +{ + struct mlx5_db_pgdir *pgdir; + int ret = 0; + + mutex_lock(&dev->priv.pgdir_mutex); + + list_for_each_entry(pgdir, &dev->priv.pgdir_list, list) + if (!mlx5_alloc_db_from_pgdir(pgdir, db)) + goto out; + + pgdir = mlx5_alloc_db_pgdir(dev, node); + if (!pgdir) { + ret = -ENOMEM; + goto out; + } + + list_add(&pgdir->list, &dev->priv.pgdir_list); + + /* This should never fail -- we just allocated an empty page: */ + WARN_ON(mlx5_alloc_db_from_pgdir(pgdir, db)); + +out: + mutex_unlock(&dev->priv.pgdir_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_db_alloc_node); + +int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db) +{ + return mlx5_db_alloc_node(dev, db, dev->priv.numa_node); +} +EXPORT_SYMBOL_GPL(mlx5_db_alloc); + +void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db) +{ + u32 db_per_page = PAGE_SIZE / cache_line_size(); + + mutex_lock(&dev->priv.pgdir_mutex); + + __set_bit(db->index, db->u.pgdir->bitmap); + + if (bitmap_full(db->u.pgdir->bitmap, db_per_page)) { + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + db->u.pgdir->db_page, db->u.pgdir->db_dma); + list_del(&db->u.pgdir->list); + kfree(db->u.pgdir->bitmap); + kfree(db->u.pgdir); + } + + mutex_unlock(&dev->priv.pgdir_mutex); +} +EXPORT_SYMBOL_GPL(mlx5_db_free); + +void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas) +{ + u64 addr; + int i; + + for (i = 0; i < buf->npages; i++) { + addr = buf->frags->map + (i << buf->page_shift); + + pas[i] = cpu_to_be64(addr); + } +} +EXPORT_SYMBOL_GPL(mlx5_fill_page_array); + +void mlx5_fill_page_frag_array(struct mlx5_frag_buf *buf, __be64 *pas) +{ + int i; + + for (i = 0; i < buf->npages; i++) + pas[i] = cpu_to_be64(buf->frags[i].map); +} +EXPORT_SYMBOL_GPL(mlx5_fill_page_frag_array); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c new file mode 100644 index 0000000..21cd170 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -0,0 +1,1887 @@ +/* + * Copyright (c) 2013-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx5_core.h" + +enum { + CMD_IF_REV = 5, +}; + +enum { + CMD_MODE_POLLING, + CMD_MODE_EVENTS +}; + +enum { + MLX5_CMD_DELIVERY_STAT_OK = 0x0, + MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR = 0x1, + MLX5_CMD_DELIVERY_STAT_TOK_ERR = 0x2, + MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR = 0x3, + MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR = 0x4, + MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR = 0x5, + MLX5_CMD_DELIVERY_STAT_FW_ERR = 0x6, + MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR = 0x7, + MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR = 0x8, + MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR = 0x9, + MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR = 0x10, +}; + +static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, + struct mlx5_cmd_msg *in, + struct mlx5_cmd_msg *out, + void *uout, int uout_size, + mlx5_cmd_cbk_t cbk, + void *context, int page_queue) +{ + gfp_t alloc_flags = cbk ? GFP_ATOMIC : GFP_KERNEL; + struct mlx5_cmd_work_ent *ent; + + ent = kzalloc(sizeof(*ent), alloc_flags); + if (!ent) + return ERR_PTR(-ENOMEM); + + ent->in = in; + ent->out = out; + ent->uout = uout; + ent->uout_size = uout_size; + ent->callback = cbk; + ent->context = context; + ent->cmd = cmd; + ent->page_queue = page_queue; + + return ent; +} + +static u8 alloc_token(struct mlx5_cmd *cmd) +{ + u8 token; + + spin_lock(&cmd->token_lock); + cmd->token++; + if (cmd->token == 0) + cmd->token++; + token = cmd->token; + spin_unlock(&cmd->token_lock); + + return token; +} + +static int alloc_ent(struct mlx5_cmd *cmd) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cmd->alloc_lock, flags); + ret = find_first_bit(&cmd->bitmask, cmd->max_reg_cmds); + if (ret < cmd->max_reg_cmds) + clear_bit(ret, &cmd->bitmask); + spin_unlock_irqrestore(&cmd->alloc_lock, flags); + + return ret < cmd->max_reg_cmds ? 
ret : -ENOMEM; +} + +static void free_ent(struct mlx5_cmd *cmd, int idx) +{ + unsigned long flags; + + spin_lock_irqsave(&cmd->alloc_lock, flags); + set_bit(idx, &cmd->bitmask); + spin_unlock_irqrestore(&cmd->alloc_lock, flags); +} + +static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx) +{ + return cmd->cmd_buf + (idx << cmd->log_stride); +} + +static u8 xor8_buf(void *buf, size_t offset, int len) +{ + u8 *ptr = buf; + u8 sum = 0; + int i; + int end = len + offset; + + for (i = offset; i < end; i++) + sum ^= ptr[i]; + + return sum; +} + +static int verify_block_sig(struct mlx5_cmd_prot_block *block) +{ + size_t rsvd0_off = offsetof(struct mlx5_cmd_prot_block, rsvd0); + int xor_len = sizeof(*block) - sizeof(block->data) - 1; + + if (xor8_buf(block, rsvd0_off, xor_len) != 0xff) + return -EINVAL; + + if (xor8_buf(block, 0, sizeof(*block)) != 0xff) + return -EINVAL; + + return 0; +} + +static void calc_block_sig(struct mlx5_cmd_prot_block *block) +{ + int ctrl_xor_len = sizeof(*block) - sizeof(block->data) - 2; + size_t rsvd0_off = offsetof(struct mlx5_cmd_prot_block, rsvd0); + + block->ctrl_sig = ~xor8_buf(block, rsvd0_off, ctrl_xor_len); + block->sig = ~xor8_buf(block, 0, sizeof(*block) - 1); +} + +static void calc_chain_sig(struct mlx5_cmd_msg *msg) +{ + struct mlx5_cmd_mailbox *next = msg->next; + int size = msg->len; + int blen = size - min_t(int, sizeof(msg->first.data), size); + int n = (blen + MLX5_CMD_DATA_BLOCK_SIZE - 1) + / MLX5_CMD_DATA_BLOCK_SIZE; + int i = 0; + + for (i = 0; i < n && next; i++) { + calc_block_sig(next->buf); + next = next->next; + } +} + +static void set_signature(struct mlx5_cmd_work_ent *ent, int csum) +{ + ent->lay->sig = ~xor8_buf(ent->lay, 0, sizeof(*ent->lay)); + if (csum) { + calc_chain_sig(ent->in); + calc_chain_sig(ent->out); + } +} + +static void poll_timeout(struct mlx5_cmd_work_ent *ent) +{ + unsigned long poll_end = jiffies + msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC + 1000); + u8 own; + + do { + own = ent->lay->status_own; + if (!(own & CMD_OWNER_HW)) { + ent->ret = 0; + return; + } + usleep_range(5000, 10000); + } while (time_before(jiffies, poll_end)); + + ent->ret = -ETIMEDOUT; +} + +static void free_cmd(struct mlx5_cmd_work_ent *ent) +{ + kfree(ent); +} + +static int verify_signature(struct mlx5_cmd_work_ent *ent) +{ + struct mlx5_cmd_mailbox *next = ent->out->next; + int err; + u8 sig; + int size = ent->out->len; + int blen = size - min_t(int, sizeof(ent->out->first.data), size); + int n = (blen + MLX5_CMD_DATA_BLOCK_SIZE - 1) + / MLX5_CMD_DATA_BLOCK_SIZE; + int i = 0; + + sig = xor8_buf(ent->lay, 0, sizeof(*ent->lay)); + if (sig != 0xff) + return -EINVAL; + + for (i = 0; i < n && next; i++) { + err = verify_block_sig(next->buf); + if (err) + return err; + + next = next->next; + } + + return 0; +} + +static void dump_buf(void *buf, int size, int data_only, int offset) +{ + __be32 *p = buf; + int i; + + for (i = 0; i < size; i += 16) { + pr_debug("%03x: %08x %08x %08x %08x\n", offset, be32_to_cpu(p[0]), + be32_to_cpu(p[1]), be32_to_cpu(p[2]), + be32_to_cpu(p[3])); + p += 4; + offset += 16; + } + if (!data_only) + pr_debug("\n"); +} + +static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, + u32 *synd, u8 *status) +{ + *synd = 0; + *status = 0; + + switch (op) { + case MLX5_CMD_OP_TEARDOWN_HCA: + case MLX5_CMD_OP_DISABLE_HCA: + case MLX5_CMD_OP_MANAGE_PAGES: + case MLX5_CMD_OP_DESTROY_MKEY: + case MLX5_CMD_OP_DESTROY_EQ: + case MLX5_CMD_OP_DESTROY_CQ: + case MLX5_CMD_OP_DESTROY_QP: + case MLX5_CMD_OP_DESTROY_PSV: 
+ case MLX5_CMD_OP_DESTROY_SRQ: + case MLX5_CMD_OP_DESTROY_XRC_SRQ: + case MLX5_CMD_OP_DESTROY_DCT: + case MLX5_CMD_OP_DEALLOC_Q_COUNTER: + case MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_DESTROY_QOS_PARA_VPORT: + case MLX5_CMD_OP_DEALLOC_PD: + case MLX5_CMD_OP_DEALLOC_UAR: + case MLX5_CMD_OP_DETACH_FROM_MCG: + case MLX5_CMD_OP_DEALLOC_XRCD: + case MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN: + case MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT: + case MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY: + case MLX5_CMD_OP_DESTROY_LAG: + case MLX5_CMD_OP_DESTROY_VPORT_LAG: + case MLX5_CMD_OP_DESTROY_TIR: + case MLX5_CMD_OP_DESTROY_SQ: + case MLX5_CMD_OP_DESTROY_RQ: + case MLX5_CMD_OP_DESTROY_RMP: + case MLX5_CMD_OP_DESTROY_TIS: + case MLX5_CMD_OP_DESTROY_RQT: + case MLX5_CMD_OP_DESTROY_FLOW_TABLE: + case MLX5_CMD_OP_DESTROY_FLOW_GROUP: + case MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY: + case MLX5_CMD_OP_DEALLOC_FLOW_COUNTER: + case MLX5_CMD_OP_2ERR_QP: + case MLX5_CMD_OP_2RST_QP: + case MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT: + case MLX5_CMD_OP_MODIFY_FLOW_TABLE: + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + case MLX5_CMD_OP_SET_FLOW_TABLE_ROOT: + case MLX5_CMD_OP_DEALLOC_ENCAP_HEADER: + case MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT: + case MLX5_CMD_OP_FPGA_DESTROY_QP: + return MLX5_CMD_STAT_OK; + + case MLX5_CMD_OP_QUERY_HCA_CAP: + case MLX5_CMD_OP_QUERY_ADAPTER: + case MLX5_CMD_OP_INIT_HCA: + case MLX5_CMD_OP_ENABLE_HCA: + case MLX5_CMD_OP_QUERY_PAGES: + case MLX5_CMD_OP_SET_HCA_CAP: + case MLX5_CMD_OP_QUERY_ISSI: + case MLX5_CMD_OP_SET_ISSI: + case MLX5_CMD_OP_CREATE_MKEY: + case MLX5_CMD_OP_QUERY_MKEY: + case MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS: + case MLX5_CMD_OP_PAGE_FAULT_RESUME: + case MLX5_CMD_OP_CREATE_EQ: + case MLX5_CMD_OP_QUERY_EQ: + case MLX5_CMD_OP_GEN_EQE: + case MLX5_CMD_OP_CREATE_CQ: + case MLX5_CMD_OP_QUERY_CQ: + case MLX5_CMD_OP_MODIFY_CQ: + case MLX5_CMD_OP_CREATE_QP: + case MLX5_CMD_OP_RST2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + case MLX5_CMD_OP_SQERR2RTS_QP: + case MLX5_CMD_OP_QUERY_QP: + case MLX5_CMD_OP_SQD_RTS_QP: + case MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_CREATE_PSV: + case MLX5_CMD_OP_CREATE_SRQ: + case MLX5_CMD_OP_QUERY_SRQ: + case MLX5_CMD_OP_ARM_RQ: + case MLX5_CMD_OP_CREATE_XRC_SRQ: + case MLX5_CMD_OP_QUERY_XRC_SRQ: + case MLX5_CMD_OP_ARM_XRC_SRQ: + case MLX5_CMD_OP_CREATE_DCT: + case MLX5_CMD_OP_DRAIN_DCT: + case MLX5_CMD_OP_QUERY_DCT: + case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: + case MLX5_CMD_OP_QUERY_VPORT_STATE: + case MLX5_CMD_OP_MODIFY_VPORT_STATE: + case MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT: + case MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_ROCE_ADDRESS: + case MLX5_CMD_OP_SET_ROCE_ADDRESS: + case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT: + case MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_HCA_VPORT_GID: + case MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY: + case MLX5_CMD_OP_QUERY_VNIC_ENV: + case MLX5_CMD_OP_QUERY_VPORT_COUNTER: + case MLX5_CMD_OP_ALLOC_Q_COUNTER: + case MLX5_CMD_OP_QUERY_Q_COUNTER: + case MLX5_CMD_OP_SET_PP_RATE_LIMIT: + case MLX5_CMD_OP_QUERY_RATE_LIMIT: + case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_CREATE_QOS_PARA_VPORT: + case MLX5_CMD_OP_ALLOC_PD: + case MLX5_CMD_OP_ALLOC_UAR: + case MLX5_CMD_OP_CONFIG_INT_MODERATION: + case MLX5_CMD_OP_ACCESS_REG: + case MLX5_CMD_OP_ATTACH_TO_MCG: + case 
MLX5_CMD_OP_GET_DROPPED_PACKET_LOG: + case MLX5_CMD_OP_MAD_IFC: + case MLX5_CMD_OP_QUERY_MAD_DEMUX: + case MLX5_CMD_OP_SET_MAD_DEMUX: + case MLX5_CMD_OP_NOP: + case MLX5_CMD_OP_ALLOC_XRCD: + case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: + case MLX5_CMD_OP_QUERY_CONG_STATUS: + case MLX5_CMD_OP_MODIFY_CONG_STATUS: + case MLX5_CMD_OP_QUERY_CONG_PARAMS: + case MLX5_CMD_OP_MODIFY_CONG_PARAMS: + case MLX5_CMD_OP_QUERY_CONG_STATISTICS: + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: + case MLX5_CMD_OP_CREATE_LAG: + case MLX5_CMD_OP_MODIFY_LAG: + case MLX5_CMD_OP_QUERY_LAG: + case MLX5_CMD_OP_CREATE_VPORT_LAG: + case MLX5_CMD_OP_CREATE_TIR: + case MLX5_CMD_OP_MODIFY_TIR: + case MLX5_CMD_OP_QUERY_TIR: + case MLX5_CMD_OP_CREATE_SQ: + case MLX5_CMD_OP_MODIFY_SQ: + case MLX5_CMD_OP_QUERY_SQ: + case MLX5_CMD_OP_CREATE_RQ: + case MLX5_CMD_OP_MODIFY_RQ: + case MLX5_CMD_OP_QUERY_RQ: + case MLX5_CMD_OP_CREATE_RMP: + case MLX5_CMD_OP_MODIFY_RMP: + case MLX5_CMD_OP_QUERY_RMP: + case MLX5_CMD_OP_CREATE_TIS: + case MLX5_CMD_OP_MODIFY_TIS: + case MLX5_CMD_OP_QUERY_TIS: + case MLX5_CMD_OP_CREATE_RQT: + case MLX5_CMD_OP_MODIFY_RQT: + case MLX5_CMD_OP_QUERY_RQT: + + case MLX5_CMD_OP_CREATE_FLOW_TABLE: + case MLX5_CMD_OP_QUERY_FLOW_TABLE: + case MLX5_CMD_OP_CREATE_FLOW_GROUP: + case MLX5_CMD_OP_QUERY_FLOW_GROUP: + case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY: + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + case MLX5_CMD_OP_QUERY_FLOW_COUNTER: + case MLX5_CMD_OP_ALLOC_ENCAP_HEADER: + case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: + case MLX5_CMD_OP_FPGA_CREATE_QP: + case MLX5_CMD_OP_FPGA_MODIFY_QP: + case MLX5_CMD_OP_FPGA_QUERY_QP: + case MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS: + *status = MLX5_DRIVER_STATUS_ABORTED; + *synd = MLX5_DRIVER_SYND; + return -EIO; + default: + mlx5_core_err(dev, "Unknown FW command (%d)\n", op); + return -EINVAL; + } +} + +const char *mlx5_command_str(int command) +{ +#define MLX5_COMMAND_STR_CASE(__cmd) case MLX5_CMD_OP_ ## __cmd: return #__cmd + + switch (command) { + MLX5_COMMAND_STR_CASE(QUERY_HCA_CAP); + MLX5_COMMAND_STR_CASE(QUERY_ADAPTER); + MLX5_COMMAND_STR_CASE(INIT_HCA); + MLX5_COMMAND_STR_CASE(TEARDOWN_HCA); + MLX5_COMMAND_STR_CASE(ENABLE_HCA); + MLX5_COMMAND_STR_CASE(DISABLE_HCA); + MLX5_COMMAND_STR_CASE(QUERY_PAGES); + MLX5_COMMAND_STR_CASE(MANAGE_PAGES); + MLX5_COMMAND_STR_CASE(SET_HCA_CAP); + MLX5_COMMAND_STR_CASE(QUERY_ISSI); + MLX5_COMMAND_STR_CASE(SET_ISSI); + MLX5_COMMAND_STR_CASE(CREATE_MKEY); + MLX5_COMMAND_STR_CASE(QUERY_MKEY); + MLX5_COMMAND_STR_CASE(DESTROY_MKEY); + MLX5_COMMAND_STR_CASE(QUERY_SPECIAL_CONTEXTS); + MLX5_COMMAND_STR_CASE(PAGE_FAULT_RESUME); + MLX5_COMMAND_STR_CASE(CREATE_EQ); + MLX5_COMMAND_STR_CASE(DESTROY_EQ); + MLX5_COMMAND_STR_CASE(QUERY_EQ); + MLX5_COMMAND_STR_CASE(GEN_EQE); + MLX5_COMMAND_STR_CASE(CREATE_CQ); + MLX5_COMMAND_STR_CASE(DESTROY_CQ); + MLX5_COMMAND_STR_CASE(QUERY_CQ); + MLX5_COMMAND_STR_CASE(MODIFY_CQ); + MLX5_COMMAND_STR_CASE(CREATE_QP); + MLX5_COMMAND_STR_CASE(DESTROY_QP); + MLX5_COMMAND_STR_CASE(RST2INIT_QP); + MLX5_COMMAND_STR_CASE(INIT2RTR_QP); + MLX5_COMMAND_STR_CASE(RTR2RTS_QP); + MLX5_COMMAND_STR_CASE(RTS2RTS_QP); + MLX5_COMMAND_STR_CASE(SQERR2RTS_QP); + MLX5_COMMAND_STR_CASE(2ERR_QP); + MLX5_COMMAND_STR_CASE(2RST_QP); + MLX5_COMMAND_STR_CASE(QUERY_QP); + MLX5_COMMAND_STR_CASE(SQD_RTS_QP); + MLX5_COMMAND_STR_CASE(INIT2INIT_QP); + MLX5_COMMAND_STR_CASE(CREATE_PSV); + MLX5_COMMAND_STR_CASE(DESTROY_PSV); + MLX5_COMMAND_STR_CASE(CREATE_SRQ); + MLX5_COMMAND_STR_CASE(DESTROY_SRQ); 
+ MLX5_COMMAND_STR_CASE(QUERY_SRQ); + MLX5_COMMAND_STR_CASE(ARM_RQ); + MLX5_COMMAND_STR_CASE(CREATE_XRC_SRQ); + MLX5_COMMAND_STR_CASE(DESTROY_XRC_SRQ); + MLX5_COMMAND_STR_CASE(QUERY_XRC_SRQ); + MLX5_COMMAND_STR_CASE(ARM_XRC_SRQ); + MLX5_COMMAND_STR_CASE(CREATE_DCT); + MLX5_COMMAND_STR_CASE(DESTROY_DCT); + MLX5_COMMAND_STR_CASE(DRAIN_DCT); + MLX5_COMMAND_STR_CASE(QUERY_DCT); + MLX5_COMMAND_STR_CASE(ARM_DCT_FOR_KEY_VIOLATION); + MLX5_COMMAND_STR_CASE(QUERY_VPORT_STATE); + MLX5_COMMAND_STR_CASE(MODIFY_VPORT_STATE); + MLX5_COMMAND_STR_CASE(QUERY_ESW_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(MODIFY_ESW_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(QUERY_NIC_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(MODIFY_NIC_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(QUERY_ROCE_ADDRESS); + MLX5_COMMAND_STR_CASE(SET_ROCE_ADDRESS); + MLX5_COMMAND_STR_CASE(QUERY_HCA_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(MODIFY_HCA_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(QUERY_HCA_VPORT_GID); + MLX5_COMMAND_STR_CASE(QUERY_HCA_VPORT_PKEY); + MLX5_COMMAND_STR_CASE(QUERY_VNIC_ENV); + MLX5_COMMAND_STR_CASE(QUERY_VPORT_COUNTER); + MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER); + MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER); + MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER); + MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT); + MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT); + MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT); + MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT); + MLX5_COMMAND_STR_CASE(QUERY_SCHEDULING_ELEMENT); + MLX5_COMMAND_STR_CASE(MODIFY_SCHEDULING_ELEMENT); + MLX5_COMMAND_STR_CASE(CREATE_QOS_PARA_VPORT); + MLX5_COMMAND_STR_CASE(DESTROY_QOS_PARA_VPORT); + MLX5_COMMAND_STR_CASE(ALLOC_PD); + MLX5_COMMAND_STR_CASE(DEALLOC_PD); + MLX5_COMMAND_STR_CASE(ALLOC_UAR); + MLX5_COMMAND_STR_CASE(DEALLOC_UAR); + MLX5_COMMAND_STR_CASE(CONFIG_INT_MODERATION); + MLX5_COMMAND_STR_CASE(ACCESS_REG); + MLX5_COMMAND_STR_CASE(ATTACH_TO_MCG); + MLX5_COMMAND_STR_CASE(DETACH_FROM_MCG); + MLX5_COMMAND_STR_CASE(GET_DROPPED_PACKET_LOG); + MLX5_COMMAND_STR_CASE(MAD_IFC); + MLX5_COMMAND_STR_CASE(QUERY_MAD_DEMUX); + MLX5_COMMAND_STR_CASE(SET_MAD_DEMUX); + MLX5_COMMAND_STR_CASE(NOP); + MLX5_COMMAND_STR_CASE(ALLOC_XRCD); + MLX5_COMMAND_STR_CASE(DEALLOC_XRCD); + MLX5_COMMAND_STR_CASE(ALLOC_TRANSPORT_DOMAIN); + MLX5_COMMAND_STR_CASE(DEALLOC_TRANSPORT_DOMAIN); + MLX5_COMMAND_STR_CASE(QUERY_CONG_STATUS); + MLX5_COMMAND_STR_CASE(MODIFY_CONG_STATUS); + MLX5_COMMAND_STR_CASE(QUERY_CONG_PARAMS); + MLX5_COMMAND_STR_CASE(MODIFY_CONG_PARAMS); + MLX5_COMMAND_STR_CASE(QUERY_CONG_STATISTICS); + MLX5_COMMAND_STR_CASE(ADD_VXLAN_UDP_DPORT); + MLX5_COMMAND_STR_CASE(DELETE_VXLAN_UDP_DPORT); + MLX5_COMMAND_STR_CASE(SET_L2_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(QUERY_L2_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(DELETE_L2_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(SET_WOL_ROL); + MLX5_COMMAND_STR_CASE(QUERY_WOL_ROL); + MLX5_COMMAND_STR_CASE(CREATE_LAG); + MLX5_COMMAND_STR_CASE(MODIFY_LAG); + MLX5_COMMAND_STR_CASE(QUERY_LAG); + MLX5_COMMAND_STR_CASE(DESTROY_LAG); + MLX5_COMMAND_STR_CASE(CREATE_VPORT_LAG); + MLX5_COMMAND_STR_CASE(DESTROY_VPORT_LAG); + MLX5_COMMAND_STR_CASE(CREATE_TIR); + MLX5_COMMAND_STR_CASE(MODIFY_TIR); + MLX5_COMMAND_STR_CASE(DESTROY_TIR); + MLX5_COMMAND_STR_CASE(QUERY_TIR); + MLX5_COMMAND_STR_CASE(CREATE_SQ); + MLX5_COMMAND_STR_CASE(MODIFY_SQ); + MLX5_COMMAND_STR_CASE(DESTROY_SQ); + MLX5_COMMAND_STR_CASE(QUERY_SQ); + MLX5_COMMAND_STR_CASE(CREATE_RQ); + MLX5_COMMAND_STR_CASE(MODIFY_RQ); + MLX5_COMMAND_STR_CASE(DESTROY_RQ); + MLX5_COMMAND_STR_CASE(QUERY_RQ); + MLX5_COMMAND_STR_CASE(CREATE_RMP); + 
MLX5_COMMAND_STR_CASE(MODIFY_RMP); + MLX5_COMMAND_STR_CASE(DESTROY_RMP); + MLX5_COMMAND_STR_CASE(QUERY_RMP); + MLX5_COMMAND_STR_CASE(CREATE_TIS); + MLX5_COMMAND_STR_CASE(MODIFY_TIS); + MLX5_COMMAND_STR_CASE(DESTROY_TIS); + MLX5_COMMAND_STR_CASE(QUERY_TIS); + MLX5_COMMAND_STR_CASE(CREATE_RQT); + MLX5_COMMAND_STR_CASE(MODIFY_RQT); + MLX5_COMMAND_STR_CASE(DESTROY_RQT); + MLX5_COMMAND_STR_CASE(QUERY_RQT); + MLX5_COMMAND_STR_CASE(SET_FLOW_TABLE_ROOT); + MLX5_COMMAND_STR_CASE(CREATE_FLOW_TABLE); + MLX5_COMMAND_STR_CASE(DESTROY_FLOW_TABLE); + MLX5_COMMAND_STR_CASE(QUERY_FLOW_TABLE); + MLX5_COMMAND_STR_CASE(CREATE_FLOW_GROUP); + MLX5_COMMAND_STR_CASE(DESTROY_FLOW_GROUP); + MLX5_COMMAND_STR_CASE(QUERY_FLOW_GROUP); + MLX5_COMMAND_STR_CASE(SET_FLOW_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(QUERY_FLOW_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(DELETE_FLOW_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(ALLOC_FLOW_COUNTER); + MLX5_COMMAND_STR_CASE(DEALLOC_FLOW_COUNTER); + MLX5_COMMAND_STR_CASE(QUERY_FLOW_COUNTER); + MLX5_COMMAND_STR_CASE(MODIFY_FLOW_TABLE); + MLX5_COMMAND_STR_CASE(ALLOC_ENCAP_HEADER); + MLX5_COMMAND_STR_CASE(DEALLOC_ENCAP_HEADER); + MLX5_COMMAND_STR_CASE(ALLOC_MODIFY_HEADER_CONTEXT); + MLX5_COMMAND_STR_CASE(DEALLOC_MODIFY_HEADER_CONTEXT); + MLX5_COMMAND_STR_CASE(FPGA_CREATE_QP); + MLX5_COMMAND_STR_CASE(FPGA_MODIFY_QP); + MLX5_COMMAND_STR_CASE(FPGA_QUERY_QP); + MLX5_COMMAND_STR_CASE(FPGA_QUERY_QP_COUNTERS); + MLX5_COMMAND_STR_CASE(FPGA_DESTROY_QP); + default: return "unknown command opcode"; + } +} + +static const char *cmd_status_str(u8 status) +{ + switch (status) { + case MLX5_CMD_STAT_OK: + return "OK"; + case MLX5_CMD_STAT_INT_ERR: + return "internal error"; + case MLX5_CMD_STAT_BAD_OP_ERR: + return "bad operation"; + case MLX5_CMD_STAT_BAD_PARAM_ERR: + return "bad parameter"; + case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: + return "bad system state"; + case MLX5_CMD_STAT_BAD_RES_ERR: + return "bad resource"; + case MLX5_CMD_STAT_RES_BUSY: + return "resource busy"; + case MLX5_CMD_STAT_LIM_ERR: + return "limits exceeded"; + case MLX5_CMD_STAT_BAD_RES_STATE_ERR: + return "bad resource state"; + case MLX5_CMD_STAT_IX_ERR: + return "bad index"; + case MLX5_CMD_STAT_NO_RES_ERR: + return "no resources"; + case MLX5_CMD_STAT_BAD_INP_LEN_ERR: + return "bad input length"; + case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: + return "bad output length"; + case MLX5_CMD_STAT_BAD_QP_STATE_ERR: + return "bad QP state"; + case MLX5_CMD_STAT_BAD_PKT_ERR: + return "bad packet (discarded)"; + case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: + return "bad size too many outstanding CQEs"; + default: + return "unknown status"; + } +} + +static int cmd_status_to_err(u8 status) +{ + switch (status) { + case MLX5_CMD_STAT_OK: return 0; + case MLX5_CMD_STAT_INT_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_OP_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_PARAM_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_RES_ERR: return -EINVAL; + case MLX5_CMD_STAT_RES_BUSY: return -EBUSY; + case MLX5_CMD_STAT_LIM_ERR: return -ENOMEM; + case MLX5_CMD_STAT_BAD_RES_STATE_ERR: return -EINVAL; + case MLX5_CMD_STAT_IX_ERR: return -EINVAL; + case MLX5_CMD_STAT_NO_RES_ERR: return -EAGAIN; + case MLX5_CMD_STAT_BAD_INP_LEN_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_QP_STATE_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_PKT_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: return -EINVAL; + default: return -EIO; + } +} + +struct mlx5_ifc_mbox_out_bits { + 
u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_mbox_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome) +{ + *status = MLX5_GET(mbox_out, out, status); + *syndrome = MLX5_GET(mbox_out, out, syndrome); +} + +static int mlx5_cmd_check(struct mlx5_core_dev *dev, void *in, void *out) +{ + u32 syndrome; + u8 status; + u16 opcode; + u16 op_mod; + + mlx5_cmd_mbox_status(out, &status, &syndrome); + if (!status) + return 0; + + opcode = MLX5_GET(mbox_in, in, opcode); + op_mod = MLX5_GET(mbox_in, in, op_mod); + + mlx5_core_err(dev, + "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n", + mlx5_command_str(opcode), + opcode, op_mod, + cmd_status_str(status), + status, + syndrome); + + return cmd_status_to_err(status); +} + +static void dump_command(struct mlx5_core_dev *dev, + struct mlx5_cmd_work_ent *ent, int input) +{ + struct mlx5_cmd_msg *msg = input ? ent->in : ent->out; + u16 op = MLX5_GET(mbox_in, ent->lay->in, opcode); + struct mlx5_cmd_mailbox *next = msg->next; + int data_only; + u32 offset = 0; + int dump_len; + + data_only = !!(mlx5_core_debug_mask & (1 << MLX5_CMD_DATA)); + + if (data_only) + mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_DATA, + "dump command data %s(0x%x) %s\n", + mlx5_command_str(op), op, + input ? "INPUT" : "OUTPUT"); + else + mlx5_core_dbg(dev, "dump command %s(0x%x) %s\n", + mlx5_command_str(op), op, + input ? "INPUT" : "OUTPUT"); + + if (data_only) { + if (input) { + dump_buf(ent->lay->in, sizeof(ent->lay->in), 1, offset); + offset += sizeof(ent->lay->in); + } else { + dump_buf(ent->lay->out, sizeof(ent->lay->out), 1, offset); + offset += sizeof(ent->lay->out); + } + } else { + dump_buf(ent->lay, sizeof(*ent->lay), 0, offset); + offset += sizeof(*ent->lay); + } + + while (next && offset < msg->len) { + if (data_only) { + dump_len = min_t(int, MLX5_CMD_DATA_BLOCK_SIZE, msg->len - offset); + dump_buf(next->buf, dump_len, 1, offset); + offset += MLX5_CMD_DATA_BLOCK_SIZE; + } else { + mlx5_core_dbg(dev, "command block:\n"); + dump_buf(next->buf, sizeof(struct mlx5_cmd_prot_block), 0, offset); + offset += sizeof(struct mlx5_cmd_prot_block); + } + next = next->next; + } + + if (data_only) + pr_debug("\n"); +} + +static u16 msg_to_opcode(struct mlx5_cmd_msg *in) +{ + return MLX5_GET(mbox_in, in->first.data, opcode); +} + +static void cb_timeout_handler(struct work_struct *work) +{ + struct delayed_work *dwork = container_of(work, struct delayed_work, + work); + struct mlx5_cmd_work_ent *ent = container_of(dwork, + struct mlx5_cmd_work_ent, + cb_timeout_work); + struct mlx5_core_dev *dev = container_of(ent->cmd, struct mlx5_core_dev, + cmd); + + ent->ret = -ETIMEDOUT; + mlx5_core_warn(dev, "%s(0x%x) timeout. 
Will cause a leak of a command resource\n", + mlx5_command_str(msg_to_opcode(ent->in)), + msg_to_opcode(ent->in)); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); +} + +static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); +static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev, + struct mlx5_cmd_msg *msg); + +static void cmd_work_handler(struct work_struct *work) +{ + struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work); + struct mlx5_cmd *cmd = ent->cmd; + struct mlx5_core_dev *dev = container_of(cmd, struct mlx5_core_dev, cmd); + unsigned long cb_timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC); + struct mlx5_cmd_layout *lay; + struct semaphore *sem; + unsigned long flags; + bool poll_cmd = ent->polling; + int alloc_ret; + + sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem; + down(sem); + if (!ent->page_queue) { + alloc_ret = alloc_ent(cmd); + if (alloc_ret < 0) { + mlx5_core_err(dev, "failed to allocate command entry\n"); + if (ent->callback) { + ent->callback(-EAGAIN, ent->context); + mlx5_free_cmd_msg(dev, ent->out); + free_msg(dev, ent->in); + free_cmd(ent); + } else { + ent->ret = -EAGAIN; + complete(&ent->done); + } + up(sem); + return; + } + ent->idx = alloc_ret; + } else { + ent->idx = cmd->max_reg_cmds; + spin_lock_irqsave(&cmd->alloc_lock, flags); + clear_bit(ent->idx, &cmd->bitmask); + spin_unlock_irqrestore(&cmd->alloc_lock, flags); + } + + cmd->ent_arr[ent->idx] = ent; + set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); + lay = get_inst(cmd, ent->idx); + ent->lay = lay; + memset(lay, 0, sizeof(*lay)); + memcpy(lay->in, ent->in->first.data, sizeof(lay->in)); + ent->op = be32_to_cpu(lay->in[0]) >> 16; + if (ent->in->next) + lay->in_ptr = cpu_to_be64(ent->in->next->dma); + lay->inlen = cpu_to_be32(ent->in->len); + if (ent->out->next) + lay->out_ptr = cpu_to_be64(ent->out->next->dma); + lay->outlen = cpu_to_be32(ent->out->len); + lay->type = MLX5_PCI_CMD_XPORT; + lay->token = ent->token; + lay->status_own = CMD_OWNER_HW; + set_signature(ent, !cmd->checksum_disabled); + dump_command(dev, ent, 1); + ent->ts1 = ktime_get_ns(); + + if (ent->callback) + schedule_delayed_work(&ent->cb_timeout_work, cb_timeout); + + /* Skip sending command to fw if internal error */ + if (pci_channel_offline(dev->pdev) || + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + u8 status = 0; + u32 drv_synd; + + ent->ret = mlx5_internal_err_ret_value(dev, msg_to_opcode(ent->in), &drv_synd, &status); + MLX5_SET(mbox_out, ent->out, status, status); + MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); + + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + return; + } + + /* ring doorbell after the descriptor is valid */ + mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); + wmb(); + iowrite32be(1 << ent->idx, &dev->iseg->cmd_dbell); + mmiowb(); + /* if not in polling don't use ent after this point */ + if (cmd->mode == CMD_MODE_POLLING || poll_cmd) { + poll_timeout(ent); + /* make sure we read the descriptor after ownership is SW */ + rmb(); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, (ent->ret == -ETIMEDOUT)); + } +} + +static const char *deliv_status_to_str(u8 status) +{ + switch (status) { + case MLX5_CMD_DELIVERY_STAT_OK: + return "no errors"; + case MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR: + return "signature error"; + case MLX5_CMD_DELIVERY_STAT_TOK_ERR: + return "token error"; + case MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR: + return "bad block number"; + case MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR: + return "output 
pointer not aligned to block size"; + case MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR: + return "input pointer not aligned to block size"; + case MLX5_CMD_DELIVERY_STAT_FW_ERR: + return "firmware internal error"; + case MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR: + return "command input length error"; + case MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR: + return "command output length error"; + case MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR: + return "reserved fields not cleared"; + case MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR: + return "bad command descriptor type"; + default: + return "unknown status code"; + } +} + +static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) +{ + unsigned long timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC); + struct mlx5_cmd *cmd = &dev->cmd; + int err; + + if (cmd->mode == CMD_MODE_POLLING || ent->polling) { + wait_for_completion(&ent->done); + } else if (!wait_for_completion_timeout(&ent->done, timeout)) { + ent->ret = -ETIMEDOUT; + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + } + + err = ent->ret; + + if (err == -ETIMEDOUT) { + mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n", + mlx5_command_str(msg_to_opcode(ent->in)), + msg_to_opcode(ent->in)); + } + mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n", + err, deliv_status_to_str(ent->status), ent->status); + + return err; +} + +/* Notes: + * 1. Callback functions may not sleep + * 2. page queue commands do not support asynchrous completion + */ +static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, + struct mlx5_cmd_msg *out, void *uout, int uout_size, + mlx5_cmd_cbk_t callback, + void *context, int page_queue, u8 *status, + u8 token, bool force_polling) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct mlx5_cmd_work_ent *ent; + struct mlx5_cmd_stats *stats; + int err = 0; + s64 ds; + u16 op; + + if (callback && page_queue) + return -EINVAL; + + ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context, + page_queue); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + ent->token = token; + ent->polling = force_polling; + + if (!callback) + init_completion(&ent->done); + + INIT_DELAYED_WORK(&ent->cb_timeout_work, cb_timeout_handler); + INIT_WORK(&ent->work, cmd_work_handler); + if (page_queue) { + cmd_work_handler(&ent->work); + } else if (!queue_work(cmd->wq, &ent->work)) { + mlx5_core_warn(dev, "failed to queue work\n"); + err = -ENOMEM; + goto out_free; + } + + if (callback) + goto out; + + err = wait_func(dev, ent); + if (err == -ETIMEDOUT) + goto out; + + ds = ent->ts2 - ent->ts1; + op = MLX5_GET(mbox_in, in->first.data, opcode); + if (op < ARRAY_SIZE(cmd->stats)) { + stats = &cmd->stats[op]; + spin_lock_irq(&stats->lock); + stats->sum += ds; + ++stats->n; + spin_unlock_irq(&stats->lock); + } + mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME, + "fw exec time for %s is %lld nsec\n", + mlx5_command_str(op), ds); + *status = ent->status; + +out_free: + free_cmd(ent); +out: + return err; +} + +static ssize_t dbg_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + char lbuf[3]; + int err; + + if (!dbg->in_msg || !dbg->out_msg) + return -ENOMEM; + + if (copy_from_user(lbuf, buf, sizeof(lbuf))) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = 0; + + if (strcmp(lbuf, "go")) + return -EINVAL; + + err = mlx5_cmd_exec(dev, dbg->in_msg, dbg->inlen, dbg->out_msg, dbg->outlen); + + return err ? 
err : count; +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = dbg_write, +}; + +static int mlx5_copy_to_msg(struct mlx5_cmd_msg *to, void *from, int size, + u8 token) +{ + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_mailbox *next; + int copy; + + if (!to || !from) + return -ENOMEM; + + copy = min_t(int, size, sizeof(to->first.data)); + memcpy(to->first.data, from, copy); + size -= copy; + from += copy; + + next = to->next; + while (size) { + if (!next) { + /* this is a BUG */ + return -ENOMEM; + } + + copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE); + block = next->buf; + memcpy(block->data, from, copy); + from += copy; + size -= copy; + block->token = token; + next = next->next; + } + + return 0; +} + +static int mlx5_copy_from_msg(void *to, struct mlx5_cmd_msg *from, int size) +{ + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_mailbox *next; + int copy; + + if (!to || !from) + return -ENOMEM; + + copy = min_t(int, size, sizeof(from->first.data)); + memcpy(to, from->first.data, copy); + size -= copy; + to += copy; + + next = from->next; + while (size) { + if (!next) { + /* this is a BUG */ + return -ENOMEM; + } + + copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE); + block = next->buf; + + memcpy(to, block->data, copy); + to += copy; + size -= copy; + next = next->next; + } + + return 0; +} + +static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev, + gfp_t flags) +{ + struct mlx5_cmd_mailbox *mailbox; + + mailbox = kmalloc(sizeof(*mailbox), flags); + if (!mailbox) + return ERR_PTR(-ENOMEM); + + mailbox->buf = dma_pool_zalloc(dev->cmd.pool, flags, + &mailbox->dma); + if (!mailbox->buf) { + mlx5_core_dbg(dev, "failed allocation\n"); + kfree(mailbox); + return ERR_PTR(-ENOMEM); + } + mailbox->next = NULL; + + return mailbox; +} + +static void free_cmd_box(struct mlx5_core_dev *dev, + struct mlx5_cmd_mailbox *mailbox) +{ + dma_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); + kfree(mailbox); +} + +static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev, + gfp_t flags, int size, + u8 token) +{ + struct mlx5_cmd_mailbox *tmp, *head = NULL; + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_msg *msg; + int blen; + int err; + int n; + int i; + + msg = kzalloc(sizeof(*msg), flags); + if (!msg) + return ERR_PTR(-ENOMEM); + + blen = size - min_t(int, sizeof(msg->first.data), size); + n = (blen + MLX5_CMD_DATA_BLOCK_SIZE - 1) / MLX5_CMD_DATA_BLOCK_SIZE; + + for (i = 0; i < n; i++) { + tmp = alloc_cmd_box(dev, flags); + if (IS_ERR(tmp)) { + mlx5_core_warn(dev, "failed allocating block\n"); + err = PTR_ERR(tmp); + goto err_alloc; + } + + block = tmp->buf; + tmp->next = head; + block->next = cpu_to_be64(tmp->next ? 
tmp->next->dma : 0); + block->block_num = cpu_to_be32(n - i - 1); + block->token = token; + head = tmp; + } + msg->next = head; + msg->len = size; + return msg; + +err_alloc: + while (head) { + tmp = head->next; + free_cmd_box(dev, head); + head = tmp; + } + kfree(msg); + + return ERR_PTR(err); +} + +static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev, + struct mlx5_cmd_msg *msg) +{ + struct mlx5_cmd_mailbox *head = msg->next; + struct mlx5_cmd_mailbox *next; + + while (head) { + next = head->next; + free_cmd_box(dev, head); + head = next; + } + kfree(msg); +} + +static ssize_t data_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + void *ptr; + + if (*pos != 0) + return -EINVAL; + + kfree(dbg->in_msg); + dbg->in_msg = NULL; + dbg->inlen = 0; + ptr = memdup_user(buf, count); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + dbg->in_msg = ptr; + dbg->inlen = count; + + *pos = count; + + return count; +} + +static ssize_t data_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + int copy; + + if (*pos) + return 0; + + if (!dbg->out_msg) + return -ENOMEM; + + copy = min_t(int, count, dbg->outlen); + if (copy_to_user(buf, dbg->out_msg, copy)) + return -EFAULT; + + *pos += copy; + + return copy; +} + +static const struct file_operations dfops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = data_write, + .read = data_read, +}; + +static ssize_t outlen_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + char outlen[8]; + int err; + + if (*pos) + return 0; + + err = snprintf(outlen, sizeof(outlen), "%d", dbg->outlen); + if (err < 0) + return err; + + if (copy_to_user(buf, &outlen, err)) + return -EFAULT; + + *pos += err; + + return err; +} + +static ssize_t outlen_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + char outlen_str[8]; + int outlen; + void *ptr; + int err; + + if (*pos != 0 || count > 6) + return -EINVAL; + + kfree(dbg->out_msg); + dbg->out_msg = NULL; + dbg->outlen = 0; + + if (copy_from_user(outlen_str, buf, count)) + return -EFAULT; + + outlen_str[7] = 0; + + err = sscanf(outlen_str, "%d", &outlen); + if (err < 0) + return err; + + ptr = kzalloc(outlen, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + dbg->out_msg = ptr; + dbg->outlen = outlen; + + *pos = count; + + return count; +} + +static const struct file_operations olfops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = outlen_write, + .read = outlen_read, +}; + +static void set_wqname(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + + snprintf(cmd->wq_name, sizeof(cmd->wq_name), "mlx5_cmd_%s", + dev_name(&dev->pdev->dev)); +} + +static void clean_debug_files(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + + if (!mlx5_debugfs_root) + return; + + mlx5_cmdif_debugfs_cleanup(dev); + debugfs_remove_recursive(dbg->dbg_root); +} + +static int create_debugfs_files(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + int err = -ENOMEM; + + if (!mlx5_debugfs_root) + return 0; + + dbg->dbg_root = debugfs_create_dir("cmd", dev->priv.dbg_root); + if 
(!dbg->dbg_root) + return err; + + dbg->dbg_in = debugfs_create_file("in", 0400, dbg->dbg_root, + dev, &dfops); + if (!dbg->dbg_in) + goto err_dbg; + + dbg->dbg_out = debugfs_create_file("out", 0200, dbg->dbg_root, + dev, &dfops); + if (!dbg->dbg_out) + goto err_dbg; + + dbg->dbg_outlen = debugfs_create_file("out_len", 0600, dbg->dbg_root, + dev, &olfops); + if (!dbg->dbg_outlen) + goto err_dbg; + + dbg->dbg_status = debugfs_create_u8("status", 0600, dbg->dbg_root, + &dbg->status); + if (!dbg->dbg_status) + goto err_dbg; + + dbg->dbg_run = debugfs_create_file("run", 0200, dbg->dbg_root, dev, &fops); + if (!dbg->dbg_run) + goto err_dbg; + + mlx5_cmdif_debugfs_init(dev); + + return 0; + +err_dbg: + clean_debug_files(dev); + return err; +} + +static void mlx5_cmd_change_mod(struct mlx5_core_dev *dev, int mode) +{ + struct mlx5_cmd *cmd = &dev->cmd; + int i; + + for (i = 0; i < cmd->max_reg_cmds; i++) + down(&cmd->sem); + down(&cmd->pages_sem); + + cmd->mode = mode; + + up(&cmd->pages_sem); + for (i = 0; i < cmd->max_reg_cmds; i++) + up(&cmd->sem); +} + +void mlx5_cmd_use_events(struct mlx5_core_dev *dev) +{ + mlx5_cmd_change_mod(dev, CMD_MODE_EVENTS); +} + +void mlx5_cmd_use_polling(struct mlx5_core_dev *dev) +{ + mlx5_cmd_change_mod(dev, CMD_MODE_POLLING); +} + +static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg) +{ + unsigned long flags; + + if (msg->parent) { + spin_lock_irqsave(&msg->parent->lock, flags); + list_add_tail(&msg->list, &msg->parent->head); + spin_unlock_irqrestore(&msg->parent->lock, flags); + } else { + mlx5_free_cmd_msg(dev, msg); + } +} + +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct mlx5_cmd_work_ent *ent; + mlx5_cmd_cbk_t callback; + void *context; + int err; + int i; + s64 ds; + struct mlx5_cmd_stats *stats; + unsigned long flags; + unsigned long vector; + + /* there can be at most 32 command queues */ + vector = vec & 0xffffffff; + for (i = 0; i < (1 << cmd->log_sz); i++) { + if (test_bit(i, &vector)) { + struct semaphore *sem; + + ent = cmd->ent_arr[i]; + + /* if we already completed the command, ignore it */ + if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, + &ent->state)) { + /* only real completion can free the cmd slot */ + if (!forced) { + mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", + ent->idx); + free_ent(cmd, ent->idx); + free_cmd(ent); + } + continue; + } + + if (ent->callback) + cancel_delayed_work(&ent->cb_timeout_work); + if (ent->page_queue) + sem = &cmd->pages_sem; + else + sem = &cmd->sem; + ent->ts2 = ktime_get_ns(); + memcpy(ent->out->first.data, ent->lay->out, sizeof(ent->lay->out)); + dump_command(dev, ent, 0); + if (!ent->ret) { + if (!cmd->checksum_disabled) + ent->ret = verify_signature(ent); + else + ent->ret = 0; + if (vec & MLX5_TRIGGERED_CMD_COMP) + ent->status = MLX5_DRIVER_STATUS_ABORTED; + else + ent->status = ent->lay->status_own >> 1; + + mlx5_core_dbg(dev, "command completed. 
ret 0x%x, delivery status %s(0x%x)\n", + ent->ret, deliv_status_to_str(ent->status), ent->status); + } + + /* only real completion will free the entry slot */ + if (!forced) + free_ent(cmd, ent->idx); + + if (ent->callback) { + ds = ent->ts2 - ent->ts1; + if (ent->op < ARRAY_SIZE(cmd->stats)) { + stats = &cmd->stats[ent->op]; + spin_lock_irqsave(&stats->lock, flags); + stats->sum += ds; + ++stats->n; + spin_unlock_irqrestore(&stats->lock, flags); + } + + callback = ent->callback; + context = ent->context; + err = ent->ret; + if (!err) { + err = mlx5_copy_from_msg(ent->uout, + ent->out, + ent->uout_size); + + err = err ? err : mlx5_cmd_check(dev, + ent->in->first.data, + ent->uout); + } + + mlx5_free_cmd_msg(dev, ent->out); + free_msg(dev, ent->in); + + err = err ? err : ent->status; + if (!forced) + free_cmd(ent); + callback(err, context); + } else { + complete(&ent->done); + } + up(sem); + } + } +} +EXPORT_SYMBOL(mlx5_cmd_comp_handler); + +static int status_to_err(u8 status) +{ + return status ? -1 : 0; /* TBD more meaningful codes */ +} + +static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size, + gfp_t gfp) +{ + struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM); + struct cmd_msg_cache *ch = NULL; + struct mlx5_cmd *cmd = &dev->cmd; + int i; + + if (in_size <= 16) + goto cache_miss; + + for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) { + ch = &cmd->cache[i]; + if (in_size > ch->max_inbox_size) + continue; + spin_lock_irq(&ch->lock); + if (list_empty(&ch->head)) { + spin_unlock_irq(&ch->lock); + continue; + } + msg = list_entry(ch->head.next, typeof(*msg), list); + /* For cached lists, we must explicitly state what is + * the real size + */ + msg->len = in_size; + list_del(&msg->list); + spin_unlock_irq(&ch->lock); + break; + } + + if (!IS_ERR(msg)) + return msg; + +cache_miss: + msg = mlx5_alloc_cmd_msg(dev, gfp, in_size, 0); + return msg; +} + +static int is_manage_pages(void *in) +{ + return MLX5_GET(mbox_in, in, opcode) == MLX5_CMD_OP_MANAGE_PAGES; +} + +static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, + int out_size, mlx5_cmd_cbk_t callback, void *context, + bool force_polling) +{ + struct mlx5_cmd_msg *inb; + struct mlx5_cmd_msg *outb; + int pages_queue; + gfp_t gfp; + int err; + u8 status = 0; + u32 drv_synd; + u8 token; + + if (pci_channel_offline(dev->pdev) || + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + u16 opcode = MLX5_GET(mbox_in, in, opcode); + + err = mlx5_internal_err_ret_value(dev, opcode, &drv_synd, &status); + MLX5_SET(mbox_out, out, status, status); + MLX5_SET(mbox_out, out, syndrome, drv_synd); + return err; + } + + pages_queue = is_manage_pages(in); + gfp = callback ? 
GFP_ATOMIC : GFP_KERNEL; + + inb = alloc_msg(dev, in_size, gfp); + if (IS_ERR(inb)) { + err = PTR_ERR(inb); + return err; + } + + token = alloc_token(&dev->cmd); + + err = mlx5_copy_to_msg(inb, in, in_size, token); + if (err) { + mlx5_core_warn(dev, "err %d\n", err); + goto out_in; + } + + outb = mlx5_alloc_cmd_msg(dev, gfp, out_size, token); + if (IS_ERR(outb)) { + err = PTR_ERR(outb); + goto out_in; + } + + err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context, + pages_queue, &status, token, force_polling); + if (err) + goto out_out; + + mlx5_core_dbg(dev, "err %d, status %d\n", err, status); + if (status) { + err = status_to_err(status); + goto out_out; + } + + if (!callback) + err = mlx5_copy_from_msg(out, outb, out_size); + +out_out: + if (!callback) + mlx5_free_cmd_msg(dev, outb); + +out_in: + if (!callback) + free_msg(dev, inb); + return err; +} + +int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, + int out_size) +{ + int err; + + err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false); + return err ? : mlx5_cmd_check(dev, in, out); +} +EXPORT_SYMBOL(mlx5_cmd_exec); + +int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size, mlx5_cmd_cbk_t callback, + void *context) +{ + return cmd_exec(dev, in, in_size, out, out_size, callback, context, + false); +} +EXPORT_SYMBOL(mlx5_cmd_exec_cb); + +int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size) +{ + int err; + + err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, true); + + return err ? : mlx5_cmd_check(dev, in, out); +} +EXPORT_SYMBOL(mlx5_cmd_exec_polling); + +static void destroy_msg_cache(struct mlx5_core_dev *dev) +{ + struct cmd_msg_cache *ch; + struct mlx5_cmd_msg *msg; + struct mlx5_cmd_msg *n; + int i; + + for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) { + ch = &dev->cmd.cache[i]; + list_for_each_entry_safe(msg, n, &ch->head, list) { + list_del(&msg->list); + mlx5_free_cmd_msg(dev, msg); + } + } +} + +static unsigned cmd_cache_num_ent[MLX5_NUM_COMMAND_CACHES] = { + 512, 32, 16, 8, 2 +}; + +static unsigned cmd_cache_ent_size[MLX5_NUM_COMMAND_CACHES] = { + 16 + MLX5_CMD_DATA_BLOCK_SIZE, + 16 + MLX5_CMD_DATA_BLOCK_SIZE * 2, + 16 + MLX5_CMD_DATA_BLOCK_SIZE * 16, + 16 + MLX5_CMD_DATA_BLOCK_SIZE * 256, + 16 + MLX5_CMD_DATA_BLOCK_SIZE * 512, +}; + +static void create_msg_cache(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct cmd_msg_cache *ch; + struct mlx5_cmd_msg *msg; + int i; + int k; + + /* Initialize and fill the caches with initial entries */ + for (k = 0; k < MLX5_NUM_COMMAND_CACHES; k++) { + ch = &cmd->cache[k]; + spin_lock_init(&ch->lock); + INIT_LIST_HEAD(&ch->head); + ch->num_ent = cmd_cache_num_ent[k]; + ch->max_inbox_size = cmd_cache_ent_size[k]; + for (i = 0; i < ch->num_ent; i++) { + msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL | __GFP_NOWARN, + ch->max_inbox_size, 0); + if (IS_ERR(msg)) + break; + msg->parent = ch; + list_add_tail(&msg->list, &ch->head); + } + } +} + +static int alloc_cmd_page(struct mlx5_core_dev *dev, struct mlx5_cmd *cmd) +{ + struct device *ddev = &dev->pdev->dev; + + cmd->cmd_alloc_buf = dma_zalloc_coherent(ddev, MLX5_ADAPTER_PAGE_SIZE, + &cmd->alloc_dma, GFP_KERNEL); + if (!cmd->cmd_alloc_buf) + return -ENOMEM; + + /* make sure it is aligned to 4K */ + if (!((uintptr_t)cmd->cmd_alloc_buf & (MLX5_ADAPTER_PAGE_SIZE - 1))) { + cmd->cmd_buf = cmd->cmd_alloc_buf; + cmd->dma = cmd->alloc_dma; + cmd->alloc_size = 
MLX5_ADAPTER_PAGE_SIZE; + return 0; + } + + dma_free_coherent(ddev, MLX5_ADAPTER_PAGE_SIZE, cmd->cmd_alloc_buf, + cmd->alloc_dma); + cmd->cmd_alloc_buf = dma_zalloc_coherent(ddev, + 2 * MLX5_ADAPTER_PAGE_SIZE - 1, + &cmd->alloc_dma, GFP_KERNEL); + if (!cmd->cmd_alloc_buf) + return -ENOMEM; + + cmd->cmd_buf = PTR_ALIGN(cmd->cmd_alloc_buf, MLX5_ADAPTER_PAGE_SIZE); + cmd->dma = ALIGN(cmd->alloc_dma, MLX5_ADAPTER_PAGE_SIZE); + cmd->alloc_size = 2 * MLX5_ADAPTER_PAGE_SIZE - 1; + return 0; +} + +static void free_cmd_page(struct mlx5_core_dev *dev, struct mlx5_cmd *cmd) +{ + struct device *ddev = &dev->pdev->dev; + + dma_free_coherent(ddev, cmd->alloc_size, cmd->cmd_alloc_buf, + cmd->alloc_dma); +} + +int mlx5_cmd_init(struct mlx5_core_dev *dev) +{ + int size = sizeof(struct mlx5_cmd_prot_block); + int align = roundup_pow_of_two(size); + struct mlx5_cmd *cmd = &dev->cmd; + u32 cmd_h, cmd_l; + u16 cmd_if_rev; + int err; + int i; + + memset(cmd, 0, sizeof(*cmd)); + cmd_if_rev = cmdif_rev(dev); + if (cmd_if_rev != CMD_IF_REV) { + dev_err(&dev->pdev->dev, + "Driver cmdif rev(%d) differs from firmware's(%d)\n", + CMD_IF_REV, cmd_if_rev); + return -EINVAL; + } + + cmd->pool = dma_pool_create("mlx5_cmd", &dev->pdev->dev, size, align, + 0); + if (!cmd->pool) + return -ENOMEM; + + err = alloc_cmd_page(dev, cmd); + if (err) + goto err_free_pool; + + cmd_l = ioread32be(&dev->iseg->cmdq_addr_l_sz) & 0xff; + cmd->log_sz = cmd_l >> 4 & 0xf; + cmd->log_stride = cmd_l & 0xf; + if (1 << cmd->log_sz > MLX5_MAX_COMMANDS) { + dev_err(&dev->pdev->dev, "firmware reports too many outstanding commands %d\n", + 1 << cmd->log_sz); + err = -EINVAL; + goto err_free_page; + } + + if (cmd->log_sz + cmd->log_stride > MLX5_ADAPTER_PAGE_SHIFT) { + dev_err(&dev->pdev->dev, "command queue size overflow\n"); + err = -EINVAL; + goto err_free_page; + } + + cmd->checksum_disabled = 1; + cmd->max_reg_cmds = (1 << cmd->log_sz) - 1; + cmd->bitmask = (1UL << cmd->max_reg_cmds) - 1; + + cmd->cmdif_rev = ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16; + if (cmd->cmdif_rev > CMD_IF_REV) { + dev_err(&dev->pdev->dev, "driver does not support command interface version. 
driver %d, firmware %d\n", + CMD_IF_REV, cmd->cmdif_rev); + err = -EOPNOTSUPP; + goto err_free_page; + } + + spin_lock_init(&cmd->alloc_lock); + spin_lock_init(&cmd->token_lock); + for (i = 0; i < ARRAY_SIZE(cmd->stats); i++) + spin_lock_init(&cmd->stats[i].lock); + + sema_init(&cmd->sem, cmd->max_reg_cmds); + sema_init(&cmd->pages_sem, 1); + + cmd_h = (u32)((u64)(cmd->dma) >> 32); + cmd_l = (u32)(cmd->dma); + if (cmd_l & 0xfff) { + dev_err(&dev->pdev->dev, "invalid command queue address\n"); + err = -ENOMEM; + goto err_free_page; + } + + iowrite32be(cmd_h, &dev->iseg->cmdq_addr_h); + iowrite32be(cmd_l, &dev->iseg->cmdq_addr_l_sz); + + /* Make sure firmware sees the complete address before we proceed */ + wmb(); + + mlx5_core_dbg(dev, "descriptor at dma 0x%llx\n", (unsigned long long)(cmd->dma)); + + cmd->mode = CMD_MODE_POLLING; + + create_msg_cache(dev); + + set_wqname(dev); + cmd->wq = create_singlethread_workqueue(cmd->wq_name); + if (!cmd->wq) { + dev_err(&dev->pdev->dev, "failed to create command workqueue\n"); + err = -ENOMEM; + goto err_cache; + } + + err = create_debugfs_files(dev); + if (err) { + err = -ENOMEM; + goto err_wq; + } + + return 0; + +err_wq: + destroy_workqueue(cmd->wq); + +err_cache: + destroy_msg_cache(dev); + +err_free_page: + free_cmd_page(dev, cmd); + +err_free_pool: + dma_pool_destroy(cmd->pool); + + return err; +} +EXPORT_SYMBOL(mlx5_cmd_init); + +void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + + clean_debug_files(dev); + destroy_workqueue(cmd->wq); + destroy_msg_cache(dev); + free_cmd_page(dev, cmd); + dma_pool_destroy(cmd->pool); +} +EXPORT_SYMBOL(mlx5_cmd_cleanup); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c new file mode 100644 index 0000000..a417912 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +#define TASKLET_MAX_TIME 2 +#define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME) + +void mlx5_cq_tasklet_cb(unsigned long data) +{ + unsigned long flags; + unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES; + struct mlx5_eq_tasklet *ctx = (struct mlx5_eq_tasklet *)data; + struct mlx5_core_cq *mcq; + struct mlx5_core_cq *temp; + + spin_lock_irqsave(&ctx->lock, flags); + list_splice_tail_init(&ctx->list, &ctx->process_list); + spin_unlock_irqrestore(&ctx->lock, flags); + + list_for_each_entry_safe(mcq, temp, &ctx->process_list, + tasklet_ctx.list) { + list_del_init(&mcq->tasklet_ctx.list); + mcq->tasklet_ctx.comp(mcq); + mlx5_cq_put(mcq); + if (time_after(jiffies, end)) + break; + } + + if (!list_empty(&ctx->process_list)) + tasklet_schedule(&ctx->task); +} + +static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq) +{ + unsigned long flags; + struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv; + + spin_lock_irqsave(&tasklet_ctx->lock, flags); + /* When migrating CQs between EQs will be implemented, please note + * that you need to sync this point. It is possible that + * while migrating a CQ, completions on the old EQs could + * still arrive. + */ + if (list_empty_careful(&cq->tasklet_ctx.list)) { + mlx5_cq_hold(cq); + list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list); + } + spin_unlock_irqrestore(&tasklet_ctx->lock, flags); +} + +int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *in, int inlen) +{ + int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), c_eqn); + u32 dout[MLX5_ST_SZ_DW(destroy_cq_out)]; + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + u32 din[MLX5_ST_SZ_DW(destroy_cq_in)]; + struct mlx5_eq *eq; + int err; + + eq = mlx5_eqn2eq(dev, eqn); + if (IS_ERR(eq)) + return PTR_ERR(eq); + + memset(out, 0, sizeof(out)); + MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (err) + return err; + + cq->cqn = MLX5_GET(create_cq_out, out, cqn); + cq->cons_index = 0; + cq->arm_sn = 0; + cq->eq = eq; + refcount_set(&cq->refcount, 1); + init_completion(&cq->free); + if (!cq->comp) + cq->comp = mlx5_add_cq_to_tasklet; + /* assuming CQ will be deleted before the EQ */ + cq->tasklet_ctx.priv = &eq->tasklet_ctx; + INIT_LIST_HEAD(&cq->tasklet_ctx.list); + + /* Add to comp EQ CQ tree to recv comp events */ + err = mlx5_eq_add_cq(eq, cq); + if (err) + goto err_cmd; + + /* Add to async EQ CQ tree to recv async events */ + err = mlx5_eq_add_cq(&dev->priv.eq_table.async_eq, cq); + if (err) + goto err_cq_add; + + cq->pid = current->pid; + err = mlx5_debug_cq_add(dev, cq); + if (err) + mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n", + cq->cqn); + + cq->uar = dev->priv.uar; + + return 0; + +err_cq_add: + mlx5_eq_del_cq(eq, cq); +err_cmd: + memset(din, 0, sizeof(din)); + memset(dout, 0, sizeof(dout)); + MLX5_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ); + MLX5_SET(destroy_cq_in, din, cqn, cq->cqn); + mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); + return err; +} +EXPORT_SYMBOL(mlx5_core_create_cq); + +int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) +{ + u32 out[MLX5_ST_SZ_DW(destroy_cq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {0}; + int err; + + err = mlx5_eq_del_cq(&dev->priv.eq_table.async_eq, cq); + if (err) + return err; + + err = mlx5_eq_del_cq(cq->eq, cq); + if (err) + 
return err; + + MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ); + MLX5_SET(destroy_cq_in, in, cqn, cq->cqn); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + synchronize_irq(cq->irqn); + + mlx5_debug_cq_remove(dev, cq); + mlx5_cq_put(cq); + wait_for_completion(&cq->free); + + return 0; +} +EXPORT_SYMBOL(mlx5_core_destroy_cq); + +int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_cq_in)] = {0}; + + MLX5_SET(query_cq_in, in, opcode, MLX5_CMD_OP_QUERY_CQ); + MLX5_SET(query_cq_in, in, cqn, cq->cqn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} +EXPORT_SYMBOL(mlx5_core_query_cq); + +int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_cq_out)] = {0}; + + MLX5_SET(modify_cq_in, in, opcode, MLX5_CMD_OP_MODIFY_CQ); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_modify_cq); + +int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, + struct mlx5_core_cq *cq, + u16 cq_period, + u16 cq_max_count) +{ + u32 in[MLX5_ST_SZ_DW(modify_cq_in)] = {0}; + void *cqc; + + MLX5_SET(modify_cq_in, in, cqn, cq->cqn); + cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, cq_period, cq_period); + MLX5_SET(cqc, cqc, cq_max_count, cq_max_count); + MLX5_SET(modify_cq_in, in, + modify_field_select_resize_field_select.modify_field_select.modify_field_select, + MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); + + return mlx5_core_modify_cq(dev, cq, in, sizeof(in)); +} +EXPORT_SYMBOL(mlx5_core_modify_cq_moderation); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c new file mode 100644 index 0000000..7ecadb5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -0,0 +1,611 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +enum { + QP_PID, + QP_STATE, + QP_XPORT, + QP_MTU, + QP_N_RECV, + QP_RECV_SZ, + QP_N_SEND, + QP_LOG_PG_SZ, + QP_RQPN, +}; + +static char *qp_fields[] = { + [QP_PID] = "pid", + [QP_STATE] = "state", + [QP_XPORT] = "transport", + [QP_MTU] = "mtu", + [QP_N_RECV] = "num_recv", + [QP_RECV_SZ] = "rcv_wqe_sz", + [QP_N_SEND] = "num_send", + [QP_LOG_PG_SZ] = "log2_page_sz", + [QP_RQPN] = "remote_qpn", +}; + +enum { + EQ_NUM_EQES, + EQ_INTR, + EQ_LOG_PG_SZ, +}; + +static char *eq_fields[] = { + [EQ_NUM_EQES] = "num_eqes", + [EQ_INTR] = "intr", + [EQ_LOG_PG_SZ] = "log_page_size", +}; + +enum { + CQ_PID, + CQ_NUM_CQES, + CQ_LOG_PG_SZ, +}; + +static char *cq_fields[] = { + [CQ_PID] = "pid", + [CQ_NUM_CQES] = "num_cqes", + [CQ_LOG_PG_SZ] = "log_page_size", +}; + +struct dentry *mlx5_debugfs_root; +EXPORT_SYMBOL(mlx5_debugfs_root); + +void mlx5_register_debugfs(void) +{ + mlx5_debugfs_root = debugfs_create_dir("mlx5", NULL); + if (IS_ERR_OR_NULL(mlx5_debugfs_root)) + mlx5_debugfs_root = NULL; +} + +void mlx5_unregister_debugfs(void) +{ + debugfs_remove(mlx5_debugfs_root); +} + +int mlx5_qp_debugfs_init(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return 0; + + atomic_set(&dev->num_qps, 0); + + dev->priv.qp_debugfs = debugfs_create_dir("QPs", dev->priv.dbg_root); + if (!dev->priv.qp_debugfs) + return -ENOMEM; + + return 0; +} + +void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->priv.qp_debugfs); +} + +int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return 0; + + dev->priv.eq_debugfs = debugfs_create_dir("EQs", dev->priv.dbg_root); + if (!dev->priv.eq_debugfs) + return -ENOMEM; + + return 0; +} + +void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->priv.eq_debugfs); +} + +static ssize_t average_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cmd_stats *stats; + u64 field = 0; + int ret; + char tbuf[22]; + + if (*pos) + return 0; + + stats = filp->private_data; + spin_lock_irq(&stats->lock); + if (stats->n) + field = div64_u64(stats->sum, stats->n); + spin_unlock_irq(&stats->lock); + ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field); + if (ret > 0) { + if (copy_to_user(buf, tbuf, ret)) + return -EFAULT; + } + + *pos += ret; + return ret; +} + +static ssize_t average_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cmd_stats *stats; + + stats = filp->private_data; + spin_lock_irq(&stats->lock); + stats->sum = 0; + stats->n = 0; + spin_unlock_irq(&stats->lock); + + *pos += count; + + return count; +} + +static const struct file_operations stats_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = average_read, + .write = average_write, +}; + +int mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_stats *stats; + struct dentry **cmd; + const char *namep; + int err; + int i; + + if (!mlx5_debugfs_root) + return 0; + + cmd = &dev->priv.cmdif_debugfs; + *cmd = debugfs_create_dir("commands", dev->priv.dbg_root); + if (!*cmd) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(dev->cmd.stats); i++) { + stats = &dev->cmd.stats[i]; + namep = mlx5_command_str(i); + if (strcmp(namep, "unknown command opcode")) { + stats->root = debugfs_create_dir(namep, *cmd); + if (!stats->root) { + mlx5_core_warn(dev, "failed adding 
command %d\n", + i); + err = -ENOMEM; + goto out; + } + + stats->avg = debugfs_create_file("average", 0400, + stats->root, stats, + &stats_fops); + if (!stats->avg) { + mlx5_core_warn(dev, "failed creating debugfs file\n"); + err = -ENOMEM; + goto out; + } + + stats->count = debugfs_create_u64("n", 0400, + stats->root, + &stats->n); + if (!stats->count) { + mlx5_core_warn(dev, "failed creating debugfs file\n"); + err = -ENOMEM; + goto out; + } + } + } + + return 0; +out: + debugfs_remove_recursive(dev->priv.cmdif_debugfs); + return err; +} + +void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->priv.cmdif_debugfs); +} + +int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return 0; + + dev->priv.cq_debugfs = debugfs_create_dir("CQs", dev->priv.dbg_root); + if (!dev->priv.cq_debugfs) + return -ENOMEM; + + return 0; +} + +void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + if (!mlx5_debugfs_root) + return; + + debugfs_remove_recursive(dev->priv.cq_debugfs); +} + +static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, + int index, int *is_str) +{ + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); + struct mlx5_qp_context *ctx; + u64 param = 0; + u32 *out; + int err; + int no_sq; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return param; + + err = mlx5_core_qp_query(dev, qp, out, outlen); + if (err) { + mlx5_core_warn(dev, "failed to query qp err=%d\n", err); + goto out; + } + + *is_str = 0; + + /* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */ + ctx = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, out, qpc); + + switch (index) { + case QP_PID: + param = qp->pid; + break; + case QP_STATE: + param = (unsigned long)mlx5_qp_state_str(be32_to_cpu(ctx->flags) >> 28); + *is_str = 1; + break; + case QP_XPORT: + param = (unsigned long)mlx5_qp_type_str((be32_to_cpu(ctx->flags) >> 16) & 0xff); + *is_str = 1; + break; + case QP_MTU: + switch (ctx->mtu_msgmax >> 5) { + case IB_MTU_256: + param = 256; + break; + case IB_MTU_512: + param = 512; + break; + case IB_MTU_1024: + param = 1024; + break; + case IB_MTU_2048: + param = 2048; + break; + case IB_MTU_4096: + param = 4096; + break; + default: + param = 0; + } + break; + case QP_N_RECV: + param = 1 << ((ctx->rq_size_stride >> 3) & 0xf); + break; + case QP_RECV_SZ: + param = 1 << ((ctx->rq_size_stride & 7) + 4); + break; + case QP_N_SEND: + no_sq = be16_to_cpu(ctx->sq_crq_size) >> 15; + if (!no_sq) + param = 1 << (be16_to_cpu(ctx->sq_crq_size) >> 11); + else + param = 0; + break; + case QP_LOG_PG_SZ: + param = (be32_to_cpu(ctx->log_pg_sz_remote_qpn) >> 24) & 0x1f; + param += 12; + break; + case QP_RQPN: + param = be32_to_cpu(ctx->log_pg_sz_remote_qpn) & 0xffffff; + break; + } + +out: + kfree(out); + return param; +} + +static u64 eq_read_field(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + int index) +{ + int outlen = MLX5_ST_SZ_BYTES(query_eq_out); + u64 param = 0; + void *ctx; + u32 *out; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return param; + + err = mlx5_core_eq_query(dev, eq, out, outlen); + if (err) { + mlx5_core_warn(dev, "failed to query eq\n"); + goto out; + } + ctx = MLX5_ADDR_OF(query_eq_out, out, eq_context_entry); + + switch (index) { + case EQ_NUM_EQES: + param = 1 << MLX5_GET(eqc, ctx, log_eq_size); + break; + case EQ_INTR: + param = MLX5_GET(eqc, ctx, intr); + break; + case EQ_LOG_PG_SZ: + param = MLX5_GET(eqc, ctx, log_page_size) + 12; + break; + 
} + +out: + kfree(out); + return param; +} + +static u64 cq_read_field(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + int index) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cq_out); + u64 param = 0; + void *ctx; + u32 *out; + int err; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return param; + + err = mlx5_core_query_cq(dev, cq, out, outlen); + if (err) { + mlx5_core_warn(dev, "failed to query cq\n"); + goto out; + } + ctx = MLX5_ADDR_OF(query_cq_out, out, cq_context); + + switch (index) { + case CQ_PID: + param = cq->pid; + break; + case CQ_NUM_CQES: + param = 1 << MLX5_GET(cqc, ctx, log_cq_size); + break; + case CQ_LOG_PG_SZ: + param = MLX5_GET(cqc, ctx, log_page_size); + break; + } + +out: + kvfree(out); + return param; +} + +static ssize_t dbg_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_field_desc *desc; + struct mlx5_rsc_debug *d; + char tbuf[18]; + int is_str = 0; + u64 field; + int ret; + + if (*pos) + return 0; + + desc = filp->private_data; + d = (void *)(desc - desc->i) - sizeof(*d); + switch (d->type) { + case MLX5_DBG_RSC_QP: + field = qp_read_field(d->dev, d->object, desc->i, &is_str); + break; + + case MLX5_DBG_RSC_EQ: + field = eq_read_field(d->dev, d->object, desc->i); + break; + + case MLX5_DBG_RSC_CQ: + field = cq_read_field(d->dev, d->object, desc->i); + break; + + default: + mlx5_core_warn(d->dev, "invalid resource type %d\n", d->type); + return -EINVAL; + } + + if (is_str) + ret = snprintf(tbuf, sizeof(tbuf), "%s\n", (const char *)(unsigned long)field); + else + ret = snprintf(tbuf, sizeof(tbuf), "0x%llx\n", field); + + if (ret > 0) { + if (copy_to_user(buf, tbuf, ret)) + return -EFAULT; + } + + *pos += ret; + return ret; +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = dbg_read, +}; + +static int add_res_tree(struct mlx5_core_dev *dev, enum dbg_rsc_type type, + struct dentry *root, struct mlx5_rsc_debug **dbg, + int rsn, char **field, int nfile, void *data) +{ + struct mlx5_rsc_debug *d; + char resn[32]; + int err; + int i; + + d = kzalloc(sizeof(*d) + nfile * sizeof(d->fields[0]), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->dev = dev; + d->object = data; + d->type = type; + sprintf(resn, "0x%x", rsn); + d->root = debugfs_create_dir(resn, root); + if (!d->root) { + err = -ENOMEM; + goto out_free; + } + + for (i = 0; i < nfile; i++) { + d->fields[i].i = i; + d->fields[i].dent = debugfs_create_file(field[i], 0400, + d->root, &d->fields[i], + &fops); + if (!d->fields[i].dent) { + err = -ENOMEM; + goto out_rem; + } + } + *dbg = d; + + return 0; +out_rem: + debugfs_remove_recursive(d->root); + +out_free: + kfree(d); + return err; +} + +static void rem_res_tree(struct mlx5_rsc_debug *d) +{ + debugfs_remove_recursive(d->root); + kfree(d); +} + +int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) +{ + int err; + + if (!mlx5_debugfs_root) + return 0; + + err = add_res_tree(dev, MLX5_DBG_RSC_QP, dev->priv.qp_debugfs, + &qp->dbg, qp->qpn, qp_fields, + ARRAY_SIZE(qp_fields), qp); + if (err) + qp->dbg = NULL; + + return err; +} + +void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) +{ + if (!mlx5_debugfs_root) + return; + + if (qp->dbg) + rem_res_tree(qp->dbg); +} + +int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + int err; + + if (!mlx5_debugfs_root) + return 0; + + err = add_res_tree(dev, MLX5_DBG_RSC_EQ, dev->priv.eq_debugfs, + &eq->dbg, eq->eqn, eq_fields, + ARRAY_SIZE(eq_fields), eq); 
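+	/* on error, clear eq->dbg so mlx5_debug_eq_remove() will skip it */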
+ if (err) + eq->dbg = NULL; + + return err; +} + +void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + if (!mlx5_debugfs_root) + return; + + if (eq->dbg) + rem_res_tree(eq->dbg); +} + +int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) +{ + int err; + + if (!mlx5_debugfs_root) + return 0; + + err = add_res_tree(dev, MLX5_DBG_RSC_CQ, dev->priv.cq_debugfs, + &cq->dbg, cq->cqn, cq_fields, + ARRAY_SIZE(cq_fields), cq); + if (err) + cq->dbg = NULL; + + return err; +} + +void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) +{ + if (!mlx5_debugfs_root) + return; + + if (cq->dbg) + rem_res_tree(cq->dbg); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c new file mode 100644 index 0000000..b994b80 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "mlx5_core.h" + +static LIST_HEAD(intf_list); +static LIST_HEAD(mlx5_dev_list); +/* intf dev list mutex */ +static DEFINE_MUTEX(mlx5_intf_mutex); + +struct mlx5_device_context { + struct list_head list; + struct mlx5_interface *intf; + void *context; + unsigned long state; +}; + +struct mlx5_delayed_event { + struct list_head list; + struct mlx5_core_dev *dev; + enum mlx5_dev_event event; + unsigned long param; +}; + +enum { + MLX5_INTERFACE_ADDED, + MLX5_INTERFACE_ATTACHED, +}; + +static void add_delayed_event(struct mlx5_priv *priv, + struct mlx5_core_dev *dev, + enum mlx5_dev_event event, + unsigned long param) +{ + struct mlx5_delayed_event *delayed_event; + + delayed_event = kzalloc(sizeof(*delayed_event), GFP_ATOMIC); + if (!delayed_event) { + mlx5_core_err(dev, "event %d is missed\n", event); + return; + } + + mlx5_core_dbg(dev, "Accumulating event %d\n", event); + delayed_event->dev = dev; + delayed_event->event = event; + delayed_event->param = param; + list_add_tail(&delayed_event->list, &priv->waiting_events_list); +} + +static void delayed_event_release(struct mlx5_device_context *dev_ctx, + struct mlx5_priv *priv) +{ + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + struct mlx5_delayed_event *de; + struct mlx5_delayed_event *n; + struct list_head temp; + + INIT_LIST_HEAD(&temp); + + spin_lock_irq(&priv->ctx_lock); + + priv->is_accum_events = false; + list_splice_init(&priv->waiting_events_list, &temp); + if (!dev_ctx->context) + goto out; + list_for_each_entry_safe(de, n, &temp, list) + dev_ctx->intf->event(dev, dev_ctx->context, de->event, de->param); + +out: + spin_unlock_irq(&priv->ctx_lock); + + list_for_each_entry_safe(de, n, &temp, list) { + list_del(&de->list); + kfree(de); + } +} + +/* accumulating events that can come after mlx5_ib calls to + * ib_register_device, till adding that interface to the events list. 
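+ * Accumulated events are replayed to the new interface by
+ * delayed_event_release() once its context has been added to priv->ctx_list.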
+ */ +static void delayed_event_start(struct mlx5_priv *priv) +{ + spin_lock_irq(&priv->ctx_lock); + priv->is_accum_events = true; + spin_unlock_irq(&priv->ctx_lock); +} + +void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + + if (!mlx5_lag_intf_add(intf, priv)) + return; + + dev_ctx = kzalloc(sizeof(*dev_ctx), GFP_KERNEL); + if (!dev_ctx) + return; + + dev_ctx->intf = intf; + + delayed_event_start(priv); + + dev_ctx->context = intf->add(dev); + set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); + if (intf->attach) + set_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); + + if (dev_ctx->context) { + spin_lock_irq(&priv->ctx_lock); + list_add_tail(&dev_ctx->list, &priv->ctx_list); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (dev_ctx->intf->pfault) { + if (priv->pfault) { + mlx5_core_err(dev, "multiple page fault handlers not supported"); + } else { + priv->pfault_ctx = dev_ctx->context; + priv->pfault = dev_ctx->intf->pfault; + } + } +#endif + spin_unlock_irq(&priv->ctx_lock); + } + + delayed_event_release(dev_ctx, priv); + + if (!dev_ctx->context) + kfree(dev_ctx); +} + +static struct mlx5_device_context *mlx5_get_device(struct mlx5_interface *intf, + struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf == intf) + return dev_ctx; + return NULL; +} + +void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + + dev_ctx = mlx5_get_device(intf, priv); + if (!dev_ctx) + return; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + spin_lock_irq(&priv->ctx_lock); + if (priv->pfault == dev_ctx->intf->pfault) + priv->pfault = NULL; + spin_unlock_irq(&priv->ctx_lock); + + synchronize_srcu(&priv->pfault_srcu); +#endif + + spin_lock_irq(&priv->ctx_lock); + list_del(&dev_ctx->list); + spin_unlock_irq(&priv->ctx_lock); + + if (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) + intf->remove(dev, dev_ctx->context); + + kfree(dev_ctx); +} + +static void mlx5_attach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + + dev_ctx = mlx5_get_device(intf, priv); + if (!dev_ctx) + return; + + delayed_event_start(priv); + if (intf->attach) { + if (test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state)) + goto out; + intf->attach(dev, dev_ctx->context); + set_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); + } else { + if (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) + goto out; + dev_ctx->context = intf->add(dev); + set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); + } + +out: + delayed_event_release(dev_ctx, priv); +} + +void mlx5_attach_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_interface *intf; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(intf, &intf_list, list) + mlx5_attach_interface(intf, priv); + mutex_unlock(&mlx5_intf_mutex); +} + +static void mlx5_detach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_device_context *dev_ctx; + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv); + + dev_ctx = mlx5_get_device(intf, priv); + if (!dev_ctx) + return; + + if (intf->detach) { + if 
(!test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state)) + return; + intf->detach(dev, dev_ctx->context); + clear_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state); + } else { + if (!test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state)) + return; + intf->remove(dev, dev_ctx->context); + clear_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); + } +} + +void mlx5_detach_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_interface *intf; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(intf, &intf_list, list) + mlx5_detach_interface(intf, priv); + mutex_unlock(&mlx5_intf_mutex); +} + +bool mlx5_device_registered(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv; + bool found = false; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(priv, &mlx5_dev_list, dev_list) + if (priv == &dev->priv) + found = true; + mutex_unlock(&mlx5_intf_mutex); + + return found; +} + +int mlx5_register_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_interface *intf; + + mutex_lock(&mlx5_intf_mutex); + list_add_tail(&priv->dev_list, &mlx5_dev_list); + list_for_each_entry(intf, &intf_list, list) + mlx5_add_device(intf, priv); + mutex_unlock(&mlx5_intf_mutex); + + return 0; +} + +void mlx5_unregister_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_interface *intf; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(intf, &intf_list, list) + mlx5_remove_device(intf, priv); + list_del(&priv->dev_list); + mutex_unlock(&mlx5_intf_mutex); +} + +int mlx5_register_interface(struct mlx5_interface *intf) +{ + struct mlx5_priv *priv; + + if (!intf->add || !intf->remove) + return -EINVAL; + + mutex_lock(&mlx5_intf_mutex); + list_add_tail(&intf->list, &intf_list); + list_for_each_entry(priv, &mlx5_dev_list, dev_list) + mlx5_add_device(intf, priv); + mutex_unlock(&mlx5_intf_mutex); + + return 0; +} +EXPORT_SYMBOL(mlx5_register_interface); + +void mlx5_unregister_interface(struct mlx5_interface *intf) +{ + struct mlx5_priv *priv; + + mutex_lock(&mlx5_intf_mutex); + list_for_each_entry(priv, &mlx5_dev_list, dev_list) + mlx5_remove_device(intf, priv); + list_del(&intf->list); + mutex_unlock(&mlx5_intf_mutex); +} +EXPORT_SYMBOL(mlx5_unregister_interface); + +void mlx5_reload_interface(struct mlx5_core_dev *mdev, int protocol) +{ + mutex_lock(&mlx5_intf_mutex); + mlx5_remove_dev_by_protocol(mdev, protocol); + mlx5_add_dev_by_protocol(mdev, protocol); + mutex_unlock(&mlx5_intf_mutex); +} + +void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol) +{ + struct mlx5_priv *priv = &mdev->priv; + struct mlx5_device_context *dev_ctx; + unsigned long flags; + void *result = NULL; + + spin_lock_irqsave(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &mdev->priv.ctx_list, list) + if ((dev_ctx->intf->protocol == protocol) && + dev_ctx->intf->get_dev) { + result = dev_ctx->intf->get_dev(dev_ctx->context); + break; + } + + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + return result; +} +EXPORT_SYMBOL(mlx5_get_protocol_dev); + +/* Must be called with intf_mutex held */ +void mlx5_add_dev_by_protocol(struct mlx5_core_dev *dev, int protocol) +{ + struct mlx5_interface *intf; + + list_for_each_entry(intf, &intf_list, list) + if (intf->protocol == protocol) { + mlx5_add_device(intf, &dev->priv); + break; + } +} + +/* Must be called with intf_mutex held */ +void mlx5_remove_dev_by_protocol(struct mlx5_core_dev *dev, int protocol) +{ + struct mlx5_interface *intf; + + list_for_each_entry(intf, 
&intf_list, list) + if (intf->protocol == protocol) { + mlx5_remove_device(intf, &dev->priv); + break; + } +} + +static u16 mlx5_gen_pci_id(struct mlx5_core_dev *dev) +{ + return (u16)((dev->pdev->bus->number << 8) | + PCI_SLOT(dev->pdev->devfn)); +} + +/* Must be called with intf_mutex held */ +struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev) +{ + u16 pci_id = mlx5_gen_pci_id(dev); + struct mlx5_core_dev *res = NULL; + struct mlx5_core_dev *tmp_dev; + struct mlx5_priv *priv; + + list_for_each_entry(priv, &mlx5_dev_list, dev_list) { + tmp_dev = container_of(priv, struct mlx5_core_dev, priv); + if ((dev != tmp_dev) && (mlx5_gen_pci_id(tmp_dev) == pci_id)) { + res = tmp_dev; + break; + } + } + + return res; +} + +void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, + unsigned long param) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_device_context *dev_ctx; + unsigned long flags; + + spin_lock_irqsave(&priv->ctx_lock, flags); + + if (priv->is_accum_events) + add_delayed_event(priv, dev, event, param); + + /* After mlx5_detach_device, the dev_ctx->intf is still set and dev_ctx is + * still in priv->ctx_list. In this case, only notify the dev_ctx if its + * ADDED or ATTACHED bit are set. + */ + list_for_each_entry(dev_ctx, &priv->ctx_list, list) + if (dev_ctx->intf->event && + (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state) || + test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state))) + dev_ctx->intf->event(dev, dev_ctx->context, event, param); + + spin_unlock_irqrestore(&priv->ctx_lock, flags); +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +void mlx5_core_page_fault(struct mlx5_core_dev *dev, + struct mlx5_pagefault *pfault) +{ + struct mlx5_priv *priv = &dev->priv; + int srcu_idx; + + srcu_idx = srcu_read_lock(&priv->pfault_srcu); + if (priv->pfault) + priv->pfault(dev, priv->pfault_ctx, pfault); + srcu_read_unlock(&priv->pfault_srcu, srcu_idx); +} +#endif + +void mlx5_dev_list_lock(void) +{ + mutex_lock(&mlx5_intf_mutex); +} + +void mlx5_dev_list_unlock(void) +{ + mutex_unlock(&mlx5_intf_mutex); +} + +int mlx5_dev_list_trylock(void) +{ + return mutex_trylock(&mlx5_intf_mutex); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile new file mode 100644 index 0000000..d8e1711 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile @@ -0,0 +1 @@ +subdir-ccflags-y += -I$(src)/.. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c new file mode 100644 index 0000000..d93ff56 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define CREATE_TRACE_POINTS + +#include "fs_tracepoint.h" +#include + +#define DECLARE_MASK_VAL(type, name) struct {type m; type v; } name +#define MASK_VAL(type, spec, name, mask, val, fld) \ + DECLARE_MASK_VAL(type, name) = \ + {.m = MLX5_GET(spec, mask, fld),\ + .v = MLX5_GET(spec, val, fld)} +#define MASK_VAL_BE(type, spec, name, mask, val, fld) \ + DECLARE_MASK_VAL(type, name) = \ + {.m = MLX5_GET_BE(type, spec, mask, fld),\ + .v = MLX5_GET_BE(type, spec, val, fld)} +#define GET_MASKED_VAL(name) (name.m & name.v) + +#define GET_MASK_VAL(name, type, mask, val, fld) \ + (name.m = MLX5_GET(type, mask, fld), \ + name.v = MLX5_GET(type, val, fld), \ + name.m & name.v) +#define PRINT_MASKED_VAL(name, p, format) { \ + if (name.m) \ + trace_seq_printf(p, __stringify(name) "=" format " ", name.v); \ + } +#define PRINT_MASKED_VALP(name, cast, p, format) { \ + if (name.m) \ + trace_seq_printf(p, __stringify(name) "=" format " ", \ + (cast)&name.v);\ + } + +static void print_lyr_2_4_hdrs(struct trace_seq *p, + const u32 *mask, const u32 *value) +{ +#define MASK_VAL_L2(type, name, fld) \ + MASK_VAL(type, fte_match_set_lyr_2_4, name, mask, value, fld) + DECLARE_MASK_VAL(u64, smac) = { + .m = MLX5_GET(fte_match_set_lyr_2_4, mask, smac_47_16) << 16 | + MLX5_GET(fte_match_set_lyr_2_4, mask, smac_15_0), + .v = MLX5_GET(fte_match_set_lyr_2_4, value, smac_47_16) << 16 | + MLX5_GET(fte_match_set_lyr_2_4, value, smac_15_0)}; + DECLARE_MASK_VAL(u64, dmac) = { + .m = MLX5_GET(fte_match_set_lyr_2_4, mask, dmac_47_16) << 16 | + MLX5_GET(fte_match_set_lyr_2_4, mask, dmac_15_0), + .v = MLX5_GET(fte_match_set_lyr_2_4, value, dmac_47_16) << 16 | + MLX5_GET(fte_match_set_lyr_2_4, value, dmac_15_0)}; + MASK_VAL_L2(u16, ethertype, ethertype); + + PRINT_MASKED_VALP(smac, u8 *, p, "%pM"); + PRINT_MASKED_VALP(dmac, u8 *, p, "%pM"); + PRINT_MASKED_VAL(ethertype, p, "%04x"); + + if (ethertype.m == 0xffff) { + if (ethertype.v == ETH_P_IP) { +#define MASK_VAL_L2_BE(type, name, fld) \ + MASK_VAL_BE(type, fte_match_set_lyr_2_4, name, mask, value, fld) + MASK_VAL_L2_BE(u32, src_ipv4, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + MASK_VAL_L2_BE(u32, dst_ipv4, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + + PRINT_MASKED_VALP(src_ipv4, typeof(&src_ipv4.v), p, + "%pI4"); + PRINT_MASKED_VALP(dst_ipv4, typeof(&dst_ipv4.v), p, + "%pI4"); + } else if (ethertype.v == ETH_P_IPV6) { + static const struct in6_addr full_ones = { + .in6_u.u6_addr32 = {__constant_htonl(0xffffffff), + __constant_htonl(0xffffffff), + __constant_htonl(0xffffffff), + __constant_htonl(0xffffffff)}, + }; + DECLARE_MASK_VAL(struct in6_addr, src_ipv6); + DECLARE_MASK_VAL(struct in6_addr, dst_ipv6); + + memcpy(src_ipv6.m.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + sizeof(src_ipv6.m)); + 
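+			/* the dst mask and both value fields are copied from the
+			 * same packed layout below; an address is printed only
+			 * when its mask is all-ones (exact match)
+			 */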
memcpy(dst_ipv6.m.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(dst_ipv6.m)); + memcpy(src_ipv6.v.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + sizeof(src_ipv6.v)); + memcpy(dst_ipv6.v.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(dst_ipv6.v)); + + if (!memcmp(&src_ipv6.m, &full_ones, sizeof(full_ones))) + trace_seq_printf(p, "src_ipv6=%pI6 ", + &src_ipv6.v); + if (!memcmp(&dst_ipv6.m, &full_ones, sizeof(full_ones))) + trace_seq_printf(p, "dst_ipv6=%pI6 ", + &dst_ipv6.v); + } + } + +#define PRINT_MASKED_VAL_L2(type, name, fld, p, format) {\ + MASK_VAL_L2(type, name, fld); \ + PRINT_MASKED_VAL(name, p, format); \ +} + + PRINT_MASKED_VAL_L2(u8, ip_protocol, ip_protocol, p, "%02x"); + PRINT_MASKED_VAL_L2(u16, tcp_flags, tcp_flags, p, "%x"); + PRINT_MASKED_VAL_L2(u16, tcp_sport, tcp_sport, p, "%u"); + PRINT_MASKED_VAL_L2(u16, tcp_dport, tcp_dport, p, "%u"); + PRINT_MASKED_VAL_L2(u16, udp_sport, udp_sport, p, "%u"); + PRINT_MASKED_VAL_L2(u16, udp_dport, udp_dport, p, "%u"); + PRINT_MASKED_VAL_L2(u16, first_vid, first_vid, p, "%04x"); + PRINT_MASKED_VAL_L2(u8, first_prio, first_prio, p, "%x"); + PRINT_MASKED_VAL_L2(u8, first_cfi, first_cfi, p, "%d"); + PRINT_MASKED_VAL_L2(u8, ip_dscp, ip_dscp, p, "%02x"); + PRINT_MASKED_VAL_L2(u8, ip_ecn, ip_ecn, p, "%x"); + PRINT_MASKED_VAL_L2(u8, cvlan_tag, cvlan_tag, p, "%d"); + PRINT_MASKED_VAL_L2(u8, svlan_tag, svlan_tag, p, "%d"); + PRINT_MASKED_VAL_L2(u8, frag, frag, p, "%d"); +} + +static void print_misc_parameters_hdrs(struct trace_seq *p, + const u32 *mask, const u32 *value) +{ +#define MASK_VAL_MISC(type, name, fld) \ + MASK_VAL(type, fte_match_set_misc, name, mask, value, fld) +#define PRINT_MASKED_VAL_MISC(type, name, fld, p, format) {\ + MASK_VAL_MISC(type, name, fld); \ + PRINT_MASKED_VAL(name, p, format); \ +} + DECLARE_MASK_VAL(u64, gre_key) = { + .m = MLX5_GET(fte_match_set_misc, mask, gre_key_h) << 8 | + MLX5_GET(fte_match_set_misc, mask, gre_key_l), + .v = MLX5_GET(fte_match_set_misc, value, gre_key_h) << 8 | + MLX5_GET(fte_match_set_misc, value, gre_key_l)}; + + PRINT_MASKED_VAL(gre_key, p, "%llu"); + PRINT_MASKED_VAL_MISC(u32, source_sqn, source_sqn, p, "%u"); + PRINT_MASKED_VAL_MISC(u16, source_port, source_port, p, "%u"); + PRINT_MASKED_VAL_MISC(u8, outer_second_prio, outer_second_prio, + p, "%u"); + PRINT_MASKED_VAL_MISC(u8, outer_second_cfi, outer_second_cfi, p, "%u"); + PRINT_MASKED_VAL_MISC(u16, outer_second_vid, outer_second_vid, p, "%u"); + PRINT_MASKED_VAL_MISC(u8, inner_second_prio, inner_second_prio, + p, "%u"); + PRINT_MASKED_VAL_MISC(u8, inner_second_cfi, inner_second_cfi, p, "%u"); + PRINT_MASKED_VAL_MISC(u16, inner_second_vid, inner_second_vid, p, "%u"); + + PRINT_MASKED_VAL_MISC(u8, outer_second_cvlan_tag, + outer_second_cvlan_tag, p, "%u"); + PRINT_MASKED_VAL_MISC(u8, inner_second_cvlan_tag, + inner_second_cvlan_tag, p, "%u"); + PRINT_MASKED_VAL_MISC(u8, outer_second_svlan_tag, + outer_second_svlan_tag, p, "%u"); + PRINT_MASKED_VAL_MISC(u8, inner_second_svlan_tag, + inner_second_svlan_tag, p, "%u"); + + PRINT_MASKED_VAL_MISC(u8, gre_protocol, gre_protocol, p, "%u"); + + PRINT_MASKED_VAL_MISC(u32, vxlan_vni, vxlan_vni, p, "%u"); + PRINT_MASKED_VAL_MISC(u32, outer_ipv6_flow_label, outer_ipv6_flow_label, + p, "%x"); + PRINT_MASKED_VAL_MISC(u32, inner_ipv6_flow_label, inner_ipv6_flow_label, + p, "%x"); +} + +const char *parse_fs_hdrs(struct trace_seq 
*p, + u8 match_criteria_enable, + const u32 *mask_outer, + const u32 *mask_misc, + const u32 *mask_inner, + const u32 *value_outer, + const u32 *value_misc, + const u32 *value_inner) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (match_criteria_enable & + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS) { + trace_seq_printf(p, "[outer] "); + print_lyr_2_4_hdrs(p, mask_outer, value_outer); + } + + if (match_criteria_enable & + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS) { + trace_seq_printf(p, "[misc] "); + print_misc_parameters_hdrs(p, mask_misc, value_misc); + } + if (match_criteria_enable & + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS) { + trace_seq_printf(p, "[inner] "); + print_lyr_2_4_hdrs(p, mask_inner, value_inner); + } + trace_seq_putc(p, 0); + return ret; +} + +const char *parse_fs_dst(struct trace_seq *p, + const struct mlx5_flow_destination *dst, + u32 counter_id) +{ + const char *ret = trace_seq_buffer_ptr(p); + + switch (dst->type) { + case MLX5_FLOW_DESTINATION_TYPE_VPORT: + trace_seq_printf(p, "vport=%u\n", dst->vport_num); + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE: + trace_seq_printf(p, "ft=%p\n", dst->ft); + break; + case MLX5_FLOW_DESTINATION_TYPE_TIR: + trace_seq_printf(p, "tir=%u\n", dst->tir_num); + break; + case MLX5_FLOW_DESTINATION_TYPE_COUNTER: + trace_seq_printf(p, "counter_id=%u\n", counter_id); + break; + case MLX5_FLOW_DESTINATION_TYPE_PORT: + trace_seq_printf(p, "port\n"); + break; + } + + trace_seq_putc(p, 0); + return ret; +} + +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_add_fg); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_fg); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_set_fte); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_fte); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_add_rule); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_rule); + diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h new file mode 100644 index 0000000..09f178a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(_MLX5_FS_TP_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_FS_TP_ + +#include +#include +#include "../fs_core.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#define __parse_fs_hdrs(match_criteria_enable, mouter, mmisc, minner, vouter, \ + vinner, vmisc) \ + parse_fs_hdrs(p, match_criteria_enable, mouter, mmisc, minner, vouter,\ + vinner, vmisc) + +const char *parse_fs_hdrs(struct trace_seq *p, + u8 match_criteria_enable, + const u32 *mask_outer, + const u32 *mask_misc, + const u32 *mask_inner, + const u32 *value_outer, + const u32 *value_misc, + const u32 *value_inner); + +#define __parse_fs_dst(dst, counter_id) \ + parse_fs_dst(p, (const struct mlx5_flow_destination *)dst, counter_id) + +const char *parse_fs_dst(struct trace_seq *p, + const struct mlx5_flow_destination *dst, + u32 counter_id); + +TRACE_EVENT(mlx5_fs_add_fg, + TP_PROTO(const struct mlx5_flow_group *fg), + TP_ARGS(fg), + TP_STRUCT__entry( + __field(const struct mlx5_flow_group *, fg) + __field(const struct mlx5_flow_table *, ft) + __field(u32, start_index) + __field(u32, end_index) + __field(u32, id) + __field(u8, mask_enable) + __array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) + ), + TP_fast_assign( + __entry->fg = fg; + fs_get_obj(__entry->ft, fg->node.parent); + __entry->start_index = fg->start_index; + __entry->end_index = fg->start_index + fg->max_ftes; + __entry->id = fg->id; + __entry->mask_enable = fg->mask.match_criteria_enable; + memcpy(__entry->mask_outer, + MLX5_ADDR_OF(fte_match_param, + &fg->mask.match_criteria, + outer_headers), + sizeof(__entry->mask_outer)); + memcpy(__entry->mask_inner, + MLX5_ADDR_OF(fte_match_param, + &fg->mask.match_criteria, + inner_headers), + sizeof(__entry->mask_inner)); + memcpy(__entry->mask_misc, + MLX5_ADDR_OF(fte_match_param, + &fg->mask.match_criteria, + misc_parameters), + sizeof(__entry->mask_misc)); + + ), + TP_printk("fg=%p ft=%p id=%u start=%u end=%u bit_mask=%02x %s\n", + __entry->fg, __entry->ft, __entry->id, + __entry->start_index, __entry->end_index, + __entry->mask_enable, + __parse_fs_hdrs(__entry->mask_enable, + __entry->mask_outer, + __entry->mask_misc, + __entry->mask_inner, + __entry->mask_outer, + __entry->mask_misc, + __entry->mask_inner)) + ); + +TRACE_EVENT(mlx5_fs_del_fg, + TP_PROTO(const struct mlx5_flow_group *fg), + TP_ARGS(fg), + TP_STRUCT__entry( + __field(const struct mlx5_flow_group *, fg) + __field(u32, id) + ), + TP_fast_assign( + __entry->fg = fg; + __entry->id = fg->id; + + ), + TP_printk("fg=%p id=%u\n", + __entry->fg, __entry->id) + ); + +#define ACTION_FLAGS \ + {MLX5_FLOW_CONTEXT_ACTION_ALLOW, "ALLOW"},\ + {MLX5_FLOW_CONTEXT_ACTION_DROP, "DROP"},\ + {MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, "FWD"},\ + {MLX5_FLOW_CONTEXT_ACTION_COUNT, "CNT"},\ + {MLX5_FLOW_CONTEXT_ACTION_ENCAP, "ENCAP"},\ + {MLX5_FLOW_CONTEXT_ACTION_DECAP, "DECAP"},\ + {MLX5_FLOW_CONTEXT_ACTION_MOD_HDR, "MOD_HDR"},\ + {MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH, "VLAN_PUSH"},\ + {MLX5_FLOW_CONTEXT_ACTION_VLAN_POP, "VLAN_POP"},\ + {MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO, "NEXT_PRIO"} + +TRACE_EVENT(mlx5_fs_set_fte, + TP_PROTO(const struct fs_fte *fte, int new_fte), + TP_ARGS(fte, new_fte), + TP_STRUCT__entry( + __field(const struct fs_fte *, fte) + __field(const struct mlx5_flow_group *, fg) + __field(u32, group_index) + __field(u32, index) + __field(u32, action) + __field(u32, flow_tag) + __field(u8, mask_enable) + 
__field(int, new_fte) + __array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) + __array(u32, value_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, value_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, value_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) + ), + TP_fast_assign( + __entry->fte = fte; + __entry->new_fte = new_fte; + fs_get_obj(__entry->fg, fte->node.parent); + __entry->group_index = __entry->fg->id; + __entry->index = fte->index; + __entry->action = fte->action.action; + __entry->mask_enable = __entry->fg->mask.match_criteria_enable; + __entry->flow_tag = fte->action.flow_tag; + memcpy(__entry->mask_outer, + MLX5_ADDR_OF(fte_match_param, + &__entry->fg->mask.match_criteria, + outer_headers), + sizeof(__entry->mask_outer)); + memcpy(__entry->mask_inner, + MLX5_ADDR_OF(fte_match_param, + &__entry->fg->mask.match_criteria, + inner_headers), + sizeof(__entry->mask_inner)); + memcpy(__entry->mask_misc, + MLX5_ADDR_OF(fte_match_param, + &__entry->fg->mask.match_criteria, + misc_parameters), + sizeof(__entry->mask_misc)); + memcpy(__entry->value_outer, + MLX5_ADDR_OF(fte_match_param, + &fte->val, + outer_headers), + sizeof(__entry->value_outer)); + memcpy(__entry->value_inner, + MLX5_ADDR_OF(fte_match_param, + &fte->val, + inner_headers), + sizeof(__entry->value_inner)); + memcpy(__entry->value_misc, + MLX5_ADDR_OF(fte_match_param, + &fte->val, + misc_parameters), + sizeof(__entry->value_misc)); + + ), + TP_printk("op=%s fte=%p fg=%p index=%u group_index=%u action=<%s> flow_tag=%x %s\n", + __entry->new_fte ? "add" : "set", + __entry->fte, __entry->fg, __entry->index, + __entry->group_index, __print_flags(__entry->action, "|", + ACTION_FLAGS), + __entry->flow_tag, + __parse_fs_hdrs(__entry->mask_enable, + __entry->mask_outer, + __entry->mask_misc, + __entry->mask_inner, + __entry->value_outer, + __entry->value_misc, + __entry->value_inner)) + ); + +TRACE_EVENT(mlx5_fs_del_fte, + TP_PROTO(const struct fs_fte *fte), + TP_ARGS(fte), + TP_STRUCT__entry( + __field(const struct fs_fte *, fte) + __field(u32, index) + ), + TP_fast_assign( + __entry->fte = fte; + __entry->index = fte->index; + + ), + TP_printk("fte=%p index=%u\n", + __entry->fte, __entry->index) + ); + +TRACE_EVENT(mlx5_fs_add_rule, + TP_PROTO(const struct mlx5_flow_rule *rule), + TP_ARGS(rule), + TP_STRUCT__entry( + __field(const struct mlx5_flow_rule *, rule) + __field(const struct fs_fte *, fte) + __field(u32, sw_action) + __field(u32, index) + __field(u32, counter_id) + __array(u8, destination, sizeof(struct mlx5_flow_destination)) + ), + TP_fast_assign( + __entry->rule = rule; + fs_get_obj(__entry->fte, rule->node.parent); + __entry->index = __entry->fte->dests_size - 1; + __entry->sw_action = rule->sw_action; + memcpy(__entry->destination, + &rule->dest_attr, + sizeof(__entry->destination)); + if (rule->dest_attr.type & MLX5_FLOW_DESTINATION_TYPE_COUNTER && + rule->dest_attr.counter) + __entry->counter_id = + rule->dest_attr.counter->id; + ), + TP_printk("rule=%p fte=%p index=%u sw_action=<%s> [dst] %s\n", + __entry->rule, __entry->fte, __entry->index, + __print_flags(__entry->sw_action, "|", ACTION_FLAGS), + __parse_fs_dst(__entry->destination, __entry->counter_id)) + ); + +TRACE_EVENT(mlx5_fs_del_rule, + TP_PROTO(const struct mlx5_flow_rule *rule), + TP_ARGS(rule), + TP_STRUCT__entry( + __field(const struct mlx5_flow_rule *, rule) + __field(const struct fs_fte *, fte) 
+ ), + TP_fast_assign( + __entry->rule = rule; + fs_get_obj(__entry->fte, rule->node.parent); + ), + TP_printk("rule=%p fte=%p\n", + __entry->rule, __entry->fte) + ); +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ./diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE fs_tracepoint +#include diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h new file mode 100644 index 0000000..30cad07 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -0,0 +1,1110 @@ +/* + * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __MLX5_EN_H__ +#define __MLX5_EN_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "wq.h" +#include "mlx5_core.h" +#include "en_stats.h" + +#define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) + +#define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN) + +#define MLX5E_HW2SW_MTU(params, hwmtu) ((hwmtu) - ((params)->hard_mtu)) +#define MLX5E_SW2HW_MTU(params, swmtu) ((swmtu) + ((params)->hard_mtu)) + +#define MLX5E_MAX_DSCP 64 +#define MLX5E_MAX_NUM_TC 8 + +#define MLX5_RX_HEADROOM NET_SKB_PAD +#define MLX5_SKB_FRAG_SZ(len) (SKB_DATA_ALIGN(len) + \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + +#define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \ + (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */ +#define MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, req) \ + max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req) +#define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev) MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 6) +#define MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 8) +#define MLX5E_MPWQE_STRIDE_SZ(mdev, cqe_cmprs) \ + (cqe_cmprs ? MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) : \ + MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev)) + +#define MLX5_MPWRQ_LOG_WQE_SZ 18 +#define MLX5_MPWRQ_WQE_PAGE_ORDER (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? 
\ + MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0) +#define MLX5_MPWRQ_PAGES_PER_WQE BIT(MLX5_MPWRQ_WQE_PAGE_ORDER) + +#define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2) +#define MLX5E_REQUIRED_WQE_MTTS (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8)) +#define MLX5E_LOG_ALIGNED_MPWQE_PPW (ilog2(MLX5E_REQUIRED_WQE_MTTS)) +#define MLX5E_REQUIRED_MTTS(wqes) (wqes * MLX5E_REQUIRED_WQE_MTTS) +#define MLX5E_MAX_RQ_NUM_MTTS \ + ((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */ +#define MLX5E_ORDER2_MAX_PACKET_MTU (order_base_2(10 * 1024)) +#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW \ + (ilog2(MLX5E_MAX_RQ_NUM_MTTS / MLX5E_REQUIRED_WQE_MTTS)) +#define MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW \ + (MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW + \ + (MLX5_MPWRQ_LOG_WQE_SZ - MLX5E_ORDER2_MAX_PACKET_MTU)) + +#define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE 0x6 +#define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE 0xa +#define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE 0xd + +#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE 0x1 +#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE 0xa +#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE min_t(u8, 0xd, \ + MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW) + +#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW 0x2 + +#define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD (256) + +#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ (64 * 1024) +#define MLX5E_DEFAULT_LRO_TIMEOUT 32 +#define MLX5E_LRO_TIMEOUT_ARR_SIZE 4 + +#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC 0x10 +#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3 +#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS 0x20 +#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC 0x10 +#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE 0x10 +#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS 0x20 +#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES 0x80 +#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW 0x2 + +#define MLX5E_LOG_INDIR_RQT_SIZE 0x7 +#define MLX5E_INDIR_RQT_SIZE BIT(MLX5E_LOG_INDIR_RQT_SIZE) +#define MLX5E_MIN_NUM_CHANNELS 0x1 +#define MLX5E_MAX_NUM_CHANNELS (MLX5E_INDIR_RQT_SIZE >> 1) +#define MLX5E_MAX_NUM_SQS (MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC) +#define MLX5E_TX_CQ_POLL_BUDGET 128 +#define MLX5E_UPDATE_STATS_INTERVAL 200 /* msecs */ +#define MLX5E_SQ_RECOVER_MIN_INTERVAL 500 /* msecs */ + +#define MLX5E_UMR_WQE_INLINE_SZ \ + (sizeof(struct mlx5e_umr_wqe) + \ + ALIGN(MLX5_MPWRQ_PAGES_PER_WQE * sizeof(struct mlx5_mtt), \ + MLX5_UMR_MTT_ALIGNMENT)) +#define MLX5E_UMR_WQEBBS \ + (DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_BB)) +#define MLX5E_ICOSQ_MAX_WQEBBS MLX5E_UMR_WQEBBS + +#define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN) +#define MLX5E_XDP_TX_DS_COUNT \ + ((sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + 1 /* SG DS */) + +#define MLX5E_NUM_MAIN_GROUPS 9 + +#define MLX5E_MSG_LEVEL NETIF_MSG_LINK + +#define mlx5e_dbg(mlevel, priv, format, ...) \ +do { \ + if (NETIF_MSG_##mlevel & (priv)->msglevel) \ + netdev_warn(priv->netdev, format, \ + ##__VA_ARGS__); \ +} while (0) + + +static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size) +{ + switch (wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW, + wq_size / 2); + default: + return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES, + wq_size / 2); + } +} + +static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) +{ + return is_kdump_kernel() ? 
+ MLX5E_MIN_NUM_CHANNELS : + min_t(int, mdev->priv.eq_table.num_comp_vectors, + MLX5E_MAX_NUM_CHANNELS); +} + +struct mlx5e_tx_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_eth_seg eth; +}; + +struct mlx5e_rx_wqe { + struct mlx5_wqe_srq_next_seg next; + struct mlx5_wqe_data_seg data; +}; + +struct mlx5e_umr_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_umr_ctrl_seg uctrl; + struct mlx5_mkey_seg mkc; + struct mlx5_mtt inline_mtts[0]; +}; + +extern const char mlx5e_self_tests[][ETH_GSTRING_LEN]; + +static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = { + "rx_cqe_moder", + "tx_cqe_moder", + "rx_cqe_compress", + "rx_striding_rq", +}; + +enum mlx5e_priv_flag { + MLX5E_PFLAG_RX_CQE_BASED_MODER = (1 << 0), + MLX5E_PFLAG_TX_CQE_BASED_MODER = (1 << 1), + MLX5E_PFLAG_RX_CQE_COMPRESS = (1 << 2), + MLX5E_PFLAG_RX_STRIDING_RQ = (1 << 3), +}; + +#define MLX5E_SET_PFLAG(params, pflag, enable) \ + do { \ + if (enable) \ + (params)->pflags |= (pflag); \ + else \ + (params)->pflags &= ~(pflag); \ + } while (0) + +#define MLX5E_GET_PFLAG(params, pflag) (!!((params)->pflags & (pflag))) + +#ifdef CONFIG_MLX5_CORE_EN_DCB +#define MLX5E_MAX_BW_ALLOC 100 /* Max percentage of BW allocation */ +#endif + +struct mlx5e_params { + u8 log_sq_size; + u8 rq_wq_type; + u8 log_rq_mtu_frames; + u16 num_channels; + u8 num_tc; + bool rx_cqe_compress_def; + struct net_dim_cq_moder rx_cq_moderation; + struct net_dim_cq_moder tx_cq_moderation; + bool lro_en; + u32 lro_wqe_sz; + u8 tx_min_inline_mode; + u8 rss_hfunc; + u8 toeplitz_hash_key[40]; + u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE]; + bool vlan_strip_disable; + bool scatter_fcs_en; + bool rx_dim_enabled; + u32 lro_timeout; + u32 pflags; + struct bpf_prog *xdp_prog; + unsigned int sw_mtu; + int hard_mtu; +}; + +#ifdef CONFIG_MLX5_CORE_EN_DCB +struct mlx5e_cee_config { + /* bw pct for priority group */ + u8 pg_bw_pct[CEE_DCBX_MAX_PGS]; + u8 prio_to_pg_map[CEE_DCBX_MAX_PRIO]; + bool pfc_setting[CEE_DCBX_MAX_PRIO]; + bool pfc_enable; +}; + +enum { + MLX5_DCB_CHG_RESET, + MLX5_DCB_NO_CHG, + MLX5_DCB_CHG_NO_RESET, +}; + +struct mlx5e_dcbx { + enum mlx5_dcbx_oper_mode mode; + struct mlx5e_cee_config cee_cfg; /* pending configuration */ + u8 dscp_app_cnt; + + /* The only setting that cannot be read from FW */ + u8 tc_tsa[IEEE_8021QAZ_MAX_TCS]; + u8 cap; +}; + +struct mlx5e_dcbx_dp { + u8 dscp2prio[MLX5E_MAX_DSCP]; + u8 trust_state; +}; +#endif + +enum { + MLX5E_RQ_STATE_ENABLED, + MLX5E_RQ_STATE_AM, +}; + +#define MLX5E_TEST_BIT(state, nr) (state & BIT(nr)) + +struct mlx5e_cq { + /* data path - accessed per cqe */ + struct mlx5_cqwq wq; + + /* data path - accessed per napi poll */ + u16 event_ctr; + struct napi_struct *napi; + struct mlx5_core_cq mcq; + struct mlx5e_channel *channel; + + /* cqe decompression */ + struct mlx5_cqe64 title; + struct mlx5_mini_cqe8 mini_arr[MLX5_MINI_CQE_ARRAY_SIZE]; + u8 mini_arr_idx; + u16 decmprs_left; + u16 decmprs_wqe_counter; + + /* control */ + struct mlx5_core_dev *mdev; + struct mlx5_frag_wq_ctrl wq_ctrl; +} ____cacheline_aligned_in_smp; + +struct mlx5e_tx_wqe_info { + struct sk_buff *skb; + u32 num_bytes; + u8 num_wqebbs; + u8 num_dma; +}; + +enum mlx5e_dma_map_type { + MLX5E_DMA_MAP_SINGLE, + MLX5E_DMA_MAP_PAGE +}; + +struct mlx5e_sq_dma { + dma_addr_t addr; + u32 size; + enum mlx5e_dma_map_type type; +}; + +enum { + MLX5E_SQ_STATE_ENABLED, + MLX5E_SQ_STATE_RECOVERING, + MLX5E_SQ_STATE_IPSEC, +}; + +struct mlx5e_sq_wqe_info { + u8 opcode; +}; + +struct mlx5e_txqsq { + /* data path */ + + /* dirtied @completion */ + 
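+	/* cc and dma_fifo_cc are consumer counters advanced by the CQ
+	 * completion path; the producer counters (pc, dma_fifo_pc) below
+	 * start on their own cacheline and are advanced from the xmit path.
+	 */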
u16 cc; + u32 dma_fifo_cc; + + /* dirtied @xmit */ + u16 pc ____cacheline_aligned_in_smp; + u32 dma_fifo_pc; + struct mlx5e_sq_stats stats; + + struct mlx5e_cq cq; + + /* write@xmit, read@completion */ + struct { + struct mlx5e_sq_dma *dma_fifo; + struct mlx5e_tx_wqe_info *wqe_info; + } db; + + /* read only */ + struct mlx5_wq_cyc wq; + u32 dma_fifo_mask; + void __iomem *uar_map; + struct netdev_queue *txq; + u32 sqn; + u8 min_inline_mode; + u16 edge; + struct device *pdev; + __be32 mkey_be; + unsigned long state; + struct hwtstamp_config *tstamp; + struct mlx5_clock *clock; + + /* control path */ + struct mlx5_wq_ctrl wq_ctrl; + struct mlx5e_channel *channel; + int txq_ix; + u32 rate_limit; + struct mlx5e_txqsq_recover { + struct work_struct recover_work; + u64 last_recover; + } recover; +} ____cacheline_aligned_in_smp; + +struct mlx5e_xdpsq { + /* data path */ + + /* dirtied @rx completion */ + u16 cc; + u16 pc; + + struct mlx5e_cq cq; + + /* write@xmit, read@completion */ + struct { + struct mlx5e_dma_info *di; + bool doorbell; + } db; + + /* read only */ + struct mlx5_wq_cyc wq; + void __iomem *uar_map; + u32 sqn; + struct device *pdev; + __be32 mkey_be; + u8 min_inline_mode; + unsigned long state; + + /* control path */ + struct mlx5_wq_ctrl wq_ctrl; + struct mlx5e_channel *channel; +} ____cacheline_aligned_in_smp; + +struct mlx5e_icosq { + /* data path */ + + /* dirtied @xmit */ + u16 pc ____cacheline_aligned_in_smp; + + struct mlx5e_cq cq; + + /* write@xmit, read@completion */ + struct { + struct mlx5e_sq_wqe_info *ico_wqe; + } db; + + /* read only */ + struct mlx5_wq_cyc wq; + void __iomem *uar_map; + u32 sqn; + u16 edge; + unsigned long state; + + /* control path */ + struct mlx5_wq_ctrl wq_ctrl; + struct mlx5e_channel *channel; +} ____cacheline_aligned_in_smp; + +static inline bool +mlx5e_wqc_has_room_for(struct mlx5_wq_cyc *wq, u16 cc, u16 pc, u16 n) +{ + return (((wq->sz_m1 & (cc - pc)) >= n) || (cc == pc)); +} + +struct mlx5e_dma_info { + struct page *page; + dma_addr_t addr; +}; + +struct mlx5e_wqe_frag_info { + struct mlx5e_dma_info di; + u32 offset; +}; + +struct mlx5e_umr_dma_info { + struct mlx5e_dma_info dma_info[MLX5_MPWRQ_PAGES_PER_WQE]; +}; + +struct mlx5e_mpw_info { + struct mlx5e_umr_dma_info umr; + u16 consumed_strides; + DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); +}; + +/* a single cache unit is capable to serve one napi call (for non-striding rq) + * or a MPWQE (for striding rq). + */ +#define MLX5E_CACHE_UNIT (MLX5_MPWRQ_PAGES_PER_WQE > NAPI_POLL_WEIGHT ? 
\ + MLX5_MPWRQ_PAGES_PER_WQE : NAPI_POLL_WEIGHT) +#define MLX5E_CACHE_SIZE (4 * roundup_pow_of_two(MLX5E_CACHE_UNIT)) +struct mlx5e_page_cache { + u32 head; + u32 tail; + struct mlx5e_dma_info page_cache[MLX5E_CACHE_SIZE]; +}; + +struct mlx5e_rq; +typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*); +typedef struct sk_buff * +(*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx); +typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq); +typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16); + +enum mlx5e_rq_flag { + MLX5E_RQ_FLAG_XDP_XMIT = BIT(0), +}; + +struct mlx5e_rq { + /* data path */ + struct mlx5_wq_ll wq; + + union { + struct { + struct mlx5e_wqe_frag_info *frag_info; + u32 frag_sz; /* max possible skb frag_sz */ + union { + bool page_reuse; + }; + } wqe; + struct { + struct mlx5e_umr_wqe umr_wqe; + struct mlx5e_mpw_info *info; + mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq; + u16 num_strides; + u8 log_stride_sz; + bool umr_in_progress; + } mpwqe; + }; + struct { + u16 headroom; + u8 page_order; + u8 map_dir; /* dma map direction */ + } buff; + + struct mlx5e_channel *channel; + struct device *pdev; + struct net_device *netdev; + struct mlx5e_rq_stats stats; + struct mlx5e_cq cq; + struct mlx5e_page_cache page_cache; + struct hwtstamp_config *tstamp; + struct mlx5_clock *clock; + + mlx5e_fp_handle_rx_cqe handle_rx_cqe; + mlx5e_fp_post_rx_wqes post_wqes; + mlx5e_fp_dealloc_wqe dealloc_wqe; + + unsigned long state; + int ix; + + struct net_dim dim; /* Dynamic Interrupt Moderation */ + + /* XDP */ + struct bpf_prog *xdp_prog; + unsigned int hw_mtu; + struct mlx5e_xdpsq xdpsq; + DECLARE_BITMAP(flags, 8); + + /* control */ + struct mlx5_wq_ctrl wq_ctrl; + __be32 mkey_be; + u8 wq_type; + u32 rqn; + struct mlx5_core_dev *mdev; + struct mlx5_core_mkey umr_mkey; + + /* XDP read-mostly */ + struct xdp_rxq_info xdp_rxq; +} ____cacheline_aligned_in_smp; + +struct mlx5e_channel { + /* data path */ + struct mlx5e_rq rq; + struct mlx5e_txqsq sq[MLX5E_MAX_NUM_TC]; + struct mlx5e_icosq icosq; /* internal control operations */ + bool xdp; + struct napi_struct napi; + struct device *pdev; + struct net_device *netdev; + __be32 mkey_be; + u8 num_tc; + + /* data path - accessed per napi poll */ + struct irq_desc *irq_desc; + struct mlx5e_ch_stats stats; + + /* control */ + struct mlx5e_priv *priv; + struct mlx5_core_dev *mdev; + struct hwtstamp_config *tstamp; + int ix; + int cpu; +}; + +struct mlx5e_channels { + struct mlx5e_channel **c; + unsigned int num; + struct mlx5e_params params; +}; + +enum mlx5e_traffic_types { + MLX5E_TT_IPV4_TCP, + MLX5E_TT_IPV6_TCP, + MLX5E_TT_IPV4_UDP, + MLX5E_TT_IPV6_UDP, + MLX5E_TT_IPV4_IPSEC_AH, + MLX5E_TT_IPV6_IPSEC_AH, + MLX5E_TT_IPV4_IPSEC_ESP, + MLX5E_TT_IPV6_IPSEC_ESP, + MLX5E_TT_IPV4, + MLX5E_TT_IPV6, + MLX5E_TT_ANY, + MLX5E_NUM_TT, + MLX5E_NUM_INDIR_TIRS = MLX5E_TT_ANY, +}; + +enum mlx5e_tunnel_types { + MLX5E_TT_IPV4_GRE, + MLX5E_TT_IPV6_GRE, + MLX5E_NUM_TUNNEL_TT, +}; + +enum { + MLX5E_STATE_ASYNC_EVENTS_ENABLED, + MLX5E_STATE_OPENED, + MLX5E_STATE_DESTROYING, +}; + +struct mlx5e_vxlan_db { + spinlock_t lock; /* protect vxlan table */ + struct radix_tree_root tree; +}; + +struct mlx5e_l2_rule { + u8 addr[ETH_ALEN + 2]; + struct mlx5_flow_handle *rule; +}; + +struct mlx5e_flow_table { + int num_groups; + struct mlx5_flow_table *t; + struct mlx5_flow_group **g; +}; + +#define MLX5E_L2_ADDR_HASH_SIZE BIT(BITS_PER_BYTE) + +struct mlx5e_tc_table { + struct 
mlx5_flow_table *t; + + struct rhashtable_params ht_params; + struct rhashtable ht; + + DECLARE_HASHTABLE(mod_hdr_tbl, 8); + DECLARE_HASHTABLE(hairpin_tbl, 8); +}; + +struct mlx5e_vlan_table { + struct mlx5e_flow_table ft; + DECLARE_BITMAP(active_cvlans, VLAN_N_VID); + DECLARE_BITMAP(active_svlans, VLAN_N_VID); + struct mlx5_flow_handle *active_cvlans_rule[VLAN_N_VID]; + struct mlx5_flow_handle *active_svlans_rule[VLAN_N_VID]; + struct mlx5_flow_handle *untagged_rule; + struct mlx5_flow_handle *any_cvlan_rule; + struct mlx5_flow_handle *any_svlan_rule; + bool cvlan_filter_disabled; +}; + +struct mlx5e_l2_table { + struct mlx5e_flow_table ft; + struct hlist_head netdev_uc[MLX5E_L2_ADDR_HASH_SIZE]; + struct hlist_head netdev_mc[MLX5E_L2_ADDR_HASH_SIZE]; + struct mlx5e_l2_rule broadcast; + struct mlx5e_l2_rule allmulti; + struct mlx5e_l2_rule promisc; + bool broadcast_enabled; + bool allmulti_enabled; + bool promisc_enabled; +}; + +/* L3/L4 traffic type classifier */ +struct mlx5e_ttc_table { + struct mlx5e_flow_table ft; + struct mlx5_flow_handle *rules[MLX5E_NUM_TT]; + struct mlx5_flow_handle *tunnel_rules[MLX5E_NUM_TUNNEL_TT]; +}; + +#define ARFS_HASH_SHIFT BITS_PER_BYTE +#define ARFS_HASH_SIZE BIT(BITS_PER_BYTE) +struct arfs_table { + struct mlx5e_flow_table ft; + struct mlx5_flow_handle *default_rule; + struct hlist_head rules_hash[ARFS_HASH_SIZE]; +}; + +enum arfs_type { + ARFS_IPV4_TCP, + ARFS_IPV6_TCP, + ARFS_IPV4_UDP, + ARFS_IPV6_UDP, + ARFS_NUM_TYPES, +}; + +struct mlx5e_arfs_tables { + struct arfs_table arfs_tables[ARFS_NUM_TYPES]; + /* Protect aRFS rules list */ + spinlock_t arfs_lock; + struct list_head rules; + int last_filter_id; + struct workqueue_struct *wq; +}; + +/* NIC prio FTS */ +enum { + MLX5E_VLAN_FT_LEVEL = 0, + MLX5E_L2_FT_LEVEL, + MLX5E_TTC_FT_LEVEL, + MLX5E_INNER_TTC_FT_LEVEL, + MLX5E_ARFS_FT_LEVEL +}; + +enum { + MLX5E_TC_FT_LEVEL = 0, + MLX5E_TC_TTC_FT_LEVEL, +}; + +struct mlx5e_ethtool_table { + struct mlx5_flow_table *ft; + int num_rules; +}; + +#define ETHTOOL_NUM_L3_L4_FTS 7 +#define ETHTOOL_NUM_L2_FTS 4 + +struct mlx5e_ethtool_steering { + struct mlx5e_ethtool_table l3_l4_ft[ETHTOOL_NUM_L3_L4_FTS]; + struct mlx5e_ethtool_table l2_ft[ETHTOOL_NUM_L2_FTS]; + struct list_head rules; + int tot_num_rules; +}; + +struct mlx5e_flow_steering { + struct mlx5_flow_namespace *ns; + struct mlx5e_ethtool_steering ethtool; + struct mlx5e_tc_table tc; + struct mlx5e_vlan_table vlan; + struct mlx5e_l2_table l2; + struct mlx5e_ttc_table ttc; + struct mlx5e_ttc_table inner_ttc; + struct mlx5e_arfs_tables arfs; +}; + +struct mlx5e_rqt { + u32 rqtn; + bool enabled; +}; + +struct mlx5e_tir { + u32 tirn; + struct mlx5e_rqt rqt; + struct list_head list; +}; + +enum { + MLX5E_TC_PRIO = 0, + MLX5E_NIC_PRIO +}; + +struct mlx5e_priv { + /* priv data path fields - start */ + struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC]; + int channel_tc2txq[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC]; +#ifdef CONFIG_MLX5_CORE_EN_DCB + struct mlx5e_dcbx_dp dcbx_dp; +#endif + /* priv data path fields - end */ + + u32 msglevel; + unsigned long state; + struct mutex state_lock; /* Protects Interface state */ + struct mlx5e_rq drop_rq; + + struct mlx5e_channels channels; + u32 tisn[MLX5E_MAX_NUM_TC]; + struct mlx5e_rqt indir_rqt; + struct mlx5e_tir indir_tir[MLX5E_NUM_INDIR_TIRS]; + struct mlx5e_tir inner_indir_tir[MLX5E_NUM_INDIR_TIRS]; + struct mlx5e_tir direct_tir[MLX5E_MAX_NUM_CHANNELS]; + u32 tx_rates[MLX5E_MAX_NUM_SQS]; + + struct mlx5e_flow_steering fs; + struct mlx5e_vxlan_db 
vxlan; + + struct workqueue_struct *wq; + struct work_struct update_carrier_work; + struct work_struct set_rx_mode_work; + struct work_struct tx_timeout_work; + struct delayed_work update_stats_work; + + struct mlx5_core_dev *mdev; + struct net_device *netdev; + struct mlx5e_stats stats; + struct hwtstamp_config tstamp; + u16 q_counter; + u16 drop_rq_q_counter; +#ifdef CONFIG_MLX5_CORE_EN_DCB + struct mlx5e_dcbx dcbx; +#endif + + const struct mlx5e_profile *profile; + void *ppriv; +#ifdef CONFIG_MLX5_EN_IPSEC + struct mlx5e_ipsec *ipsec; +#endif +}; + +struct mlx5e_profile { + void (*init)(struct mlx5_core_dev *mdev, + struct net_device *netdev, + const struct mlx5e_profile *profile, void *ppriv); + void (*cleanup)(struct mlx5e_priv *priv); + int (*init_rx)(struct mlx5e_priv *priv); + void (*cleanup_rx)(struct mlx5e_priv *priv); + int (*init_tx)(struct mlx5e_priv *priv); + void (*cleanup_tx)(struct mlx5e_priv *priv); + void (*enable)(struct mlx5e_priv *priv); + void (*disable)(struct mlx5e_priv *priv); + void (*update_stats)(struct mlx5e_priv *priv); + void (*update_carrier)(struct mlx5e_priv *priv); + int (*max_nch)(struct mlx5_core_dev *mdev); + struct { + mlx5e_fp_handle_rx_cqe handle_rx_cqe; + mlx5e_fp_handle_rx_cqe handle_rx_cqe_mpwqe; + } rx_handlers; + void (*netdev_registered_init)(struct mlx5e_priv *priv); + void (*netdev_registered_remove)(struct mlx5e_priv *priv); + int max_tc; +}; + +void mlx5e_build_ptys2ethtool_map(void); + +u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback); +netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev); + +void mlx5e_completion_event(struct mlx5_core_cq *mcq); +void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event); +int mlx5e_napi_poll(struct napi_struct *napi, int budget); +bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget); +int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget); +bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq); +void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq); +void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq); + +bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev); +bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev, + struct mlx5e_params *params); + +void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info, + bool recycle); +void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); +void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); +bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq); +bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq); +void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix); +void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix); +void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi); +struct sk_buff * +mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx); +struct sk_buff * +mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx); + +void mlx5e_update_stats(struct mlx5e_priv *priv); + +int mlx5e_create_flow_steering(struct mlx5e_priv *priv); +void mlx5e_destroy_flow_steering(struct mlx5e_priv *priv); +void mlx5e_init_l2_addr(struct mlx5e_priv *priv); +void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft); +int mlx5e_self_test_num(struct mlx5e_priv *priv); +void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest, + u64 *buf); +int 
mlx5e_ethtool_get_flow(struct mlx5e_priv *priv, struct ethtool_rxnfc *info, + int location); +int mlx5e_ethtool_get_all_flows(struct mlx5e_priv *priv, + struct ethtool_rxnfc *info, u32 *rule_locs); +int mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv, + struct ethtool_rx_flow_spec *fs); +int mlx5e_ethtool_flow_remove(struct mlx5e_priv *priv, + int location); +void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv); +void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv); +void mlx5e_set_rx_mode_work(struct work_struct *work); + +int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr); +int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr); +int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val); + +int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto, + u16 vid); +int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto, + u16 vid); +void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv); +void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv); +void mlx5e_timestamp_init(struct mlx5e_priv *priv); + +struct mlx5e_redirect_rqt_param { + bool is_rss; + union { + u32 rqn; /* Direct RQN (Non-RSS) */ + struct { + u8 hfunc; + struct mlx5e_channels *channels; + } rss; /* RSS data */ + }; +}; + +int mlx5e_redirect_rqt(struct mlx5e_priv *priv, u32 rqtn, int sz, + struct mlx5e_redirect_rqt_param rrp); +void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_params *params, + enum mlx5e_traffic_types tt, + void *tirc, bool inner); + +int mlx5e_open_locked(struct net_device *netdev); +int mlx5e_close_locked(struct net_device *netdev); + +int mlx5e_open_channels(struct mlx5e_priv *priv, + struct mlx5e_channels *chs); +void mlx5e_close_channels(struct mlx5e_channels *chs); + +/* Function pointer to be used to modify WH settings while + * switching channels + */ +typedef int (*mlx5e_fp_hw_modify)(struct mlx5e_priv *priv); +void mlx5e_switch_priv_channels(struct mlx5e_priv *priv, + struct mlx5e_channels *new_chs, + mlx5e_fp_hw_modify hw_modify); +void mlx5e_activate_priv_channels(struct mlx5e_priv *priv); +void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv); + +void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len, + int num_channels); +int mlx5e_get_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); + +void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, + u8 cq_period_mode); +void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, + u8 cq_period_mode); +void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params); +void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params); + +static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev) +{ + return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) && + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ft_field_support.inner_ip_version)); +} + +static inline +struct mlx5e_tx_wqe *mlx5e_post_nop(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc) +{ + u16 pi = *pc & wq->sz_m1; + struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + + memset(cseg, 0, sizeof(*cseg)); + + cseg->opmod_idx_opcode = cpu_to_be32((*pc << 8) | MLX5_OPCODE_NOP); + cseg->qpn_ds = cpu_to_be32((sqn << 8) | 0x01); + + (*pc)++; + + return wqe; +} + +static inline +void mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, + void __iomem *uar_map, + struct mlx5_wqe_ctrl_seg *ctrl) +{ + ctrl->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + /* ensure wqe is visible to device before 
updating doorbell record */ + dma_wmb(); + + *wq->db = cpu_to_be32(pc); + + /* ensure doorbell record is visible to device before ringing the + * doorbell + */ + wmb(); + + mlx5_write64((__be32 *)ctrl, uar_map, NULL); +} + +static inline void mlx5e_cq_arm(struct mlx5e_cq *cq) +{ + struct mlx5_core_cq *mcq; + + mcq = &cq->mcq; + mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc); +} + +extern const struct ethtool_ops mlx5e_ethtool_ops; +#ifdef CONFIG_MLX5_CORE_EN_DCB +extern const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops; +int mlx5e_dcbnl_ieee_setets_core(struct mlx5e_priv *priv, struct ieee_ets *ets); +void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv); +void mlx5e_dcbnl_init_app(struct mlx5e_priv *priv); +void mlx5e_dcbnl_delete_app(struct mlx5e_priv *priv); +#endif + +#ifndef CONFIG_RFS_ACCEL +static inline int mlx5e_arfs_create_tables(struct mlx5e_priv *priv) +{ + return 0; +} + +static inline void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv) {} + +static inline int mlx5e_arfs_enable(struct mlx5e_priv *priv) +{ + return -EOPNOTSUPP; +} + +static inline int mlx5e_arfs_disable(struct mlx5e_priv *priv) +{ + return -EOPNOTSUPP; +} +#else +int mlx5e_arfs_create_tables(struct mlx5e_priv *priv); +void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv); +int mlx5e_arfs_enable(struct mlx5e_priv *priv); +int mlx5e_arfs_disable(struct mlx5e_priv *priv); +int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, + u16 rxq_index, u32 flow_id); +#endif + +int mlx5e_create_tir(struct mlx5_core_dev *mdev, + struct mlx5e_tir *tir, u32 *in, int inlen); +void mlx5e_destroy_tir(struct mlx5_core_dev *mdev, + struct mlx5e_tir *tir); +int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev); +void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev); +int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb); + +/* common netdev helpers */ +int mlx5e_create_indirect_rqt(struct mlx5e_priv *priv); + +int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv); +void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv); + +int mlx5e_create_direct_rqts(struct mlx5e_priv *priv); +void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv); +int mlx5e_create_direct_tirs(struct mlx5e_priv *priv); +void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv); +void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt); + +struct ttc_params { + struct mlx5_flow_table_attr ft_attr; + u32 any_tt_tirn; + u32 indir_tirn[MLX5E_NUM_INDIR_TIRS]; + struct mlx5e_ttc_table *inner_ttc; +}; + +void mlx5e_set_ttc_basic_params(struct mlx5e_priv *priv, struct ttc_params *ttc_params); +void mlx5e_set_ttc_ft_params(struct ttc_params *ttc_params); +void mlx5e_set_inner_ttc_ft_params(struct ttc_params *ttc_params); + +int mlx5e_create_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params, + struct mlx5e_ttc_table *ttc); +void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv, + struct mlx5e_ttc_table *ttc); + +int mlx5e_create_inner_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params, + struct mlx5e_ttc_table *ttc); +void mlx5e_destroy_inner_ttc_table(struct mlx5e_priv *priv, + struct mlx5e_ttc_table *ttc); + +int mlx5e_create_tis(struct mlx5_core_dev *mdev, int tc, + u32 underlay_qpn, u32 *tisn); +void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn); + +int mlx5e_create_tises(struct mlx5e_priv *priv); +void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv); +int mlx5e_close(struct net_device *netdev); +int mlx5e_open(struct net_device *netdev); +void 
mlx5e_update_stats_work(struct work_struct *work); + +int mlx5e_bits_invert(unsigned long a, int size); + +/* ethtool helpers */ +void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, + struct ethtool_drvinfo *drvinfo); +void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv, + uint32_t stringset, uint8_t *data); +int mlx5e_ethtool_get_sset_count(struct mlx5e_priv *priv, int sset); +void mlx5e_ethtool_get_ethtool_stats(struct mlx5e_priv *priv, + struct ethtool_stats *stats, u64 *data); +void mlx5e_ethtool_get_ringparam(struct mlx5e_priv *priv, + struct ethtool_ringparam *param); +int mlx5e_ethtool_set_ringparam(struct mlx5e_priv *priv, + struct ethtool_ringparam *param); +void mlx5e_ethtool_get_channels(struct mlx5e_priv *priv, + struct ethtool_channels *ch); +int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, + struct ethtool_channels *ch); +int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv, + struct ethtool_coalesce *coal); +int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, + struct ethtool_coalesce *coal); +int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, + struct ethtool_ts_info *info); +int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, + struct ethtool_flash *flash); + +int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv); + +/* mlx5e generic netdev management API */ +struct net_device* +mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile, + void *ppriv); +int mlx5e_attach_netdev(struct mlx5e_priv *priv); +void mlx5e_detach_netdev(struct mlx5e_priv *priv); +void mlx5e_destroy_netdev(struct mlx5e_priv *priv); +void mlx5e_build_nic_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + u16 max_channels, u16 mtu); +u8 mlx5e_params_calculate_tx_min_inline(struct mlx5_core_dev *mdev); +void mlx5e_rx_dim_work(struct work_struct *work); +#endif /* __MLX5_EN_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile new file mode 100644 index 0000000..d8e1711 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile @@ -0,0 +1 @@ +subdir-ccflags-y += -I$(src)/.. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c new file mode 100644 index 0000000..cf58c96 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -0,0 +1,550 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include + +#include "en.h" +#include "en_accel/ipsec.h" +#include "en_accel/ipsec_rxtx.h" + + +static struct mlx5e_ipsec_sa_entry *to_ipsec_sa_entry(struct xfrm_state *x) +{ + struct mlx5e_ipsec_sa_entry *sa; + + if (!x) + return NULL; + + sa = (struct mlx5e_ipsec_sa_entry *)x->xso.offload_handle; + if (!sa) + return NULL; + + WARN_ON(sa->x != x); + return sa; +} + +struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup(struct mlx5e_ipsec *ipsec, + unsigned int handle) +{ + struct mlx5e_ipsec_sa_entry *sa_entry; + struct xfrm_state *ret = NULL; + + rcu_read_lock(); + hash_for_each_possible_rcu(ipsec->sadb_rx, sa_entry, hlist, handle) + if (sa_entry->handle == handle) { + ret = sa_entry->x; + xfrm_state_hold(ret); + break; + } + rcu_read_unlock(); + + return ret; +} + +static int mlx5e_ipsec_sadb_rx_add(struct mlx5e_ipsec_sa_entry *sa_entry) +{ + struct mlx5e_ipsec *ipsec = sa_entry->ipsec; + unsigned long flags; + int ret; + + ret = ida_simple_get(&ipsec->halloc, 1, 0, GFP_KERNEL); + if (ret < 0) + return ret; + + spin_lock_irqsave(&ipsec->sadb_rx_lock, flags); + sa_entry->handle = ret; + hash_add_rcu(ipsec->sadb_rx, &sa_entry->hlist, sa_entry->handle); + spin_unlock_irqrestore(&ipsec->sadb_rx_lock, flags); + + return 0; +} + +static void mlx5e_ipsec_sadb_rx_del(struct mlx5e_ipsec_sa_entry *sa_entry) +{ + struct mlx5e_ipsec *ipsec = sa_entry->ipsec; + unsigned long flags; + + spin_lock_irqsave(&ipsec->sadb_rx_lock, flags); + hash_del_rcu(&sa_entry->hlist); + spin_unlock_irqrestore(&ipsec->sadb_rx_lock, flags); +} + +static void mlx5e_ipsec_sadb_rx_free(struct mlx5e_ipsec_sa_entry *sa_entry) +{ + struct mlx5e_ipsec *ipsec = sa_entry->ipsec; + + /* xfrm already doing sync rcu between del and free callbacks */ + + ida_simple_remove(&ipsec->halloc, sa_entry->handle); +} + +static bool mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry) +{ + struct xfrm_replay_state_esn *replay_esn; + u32 seq_bottom; + u8 overlap; + u32 *esn; + + if (!(sa_entry->x->props.flags & XFRM_STATE_ESN)) { + sa_entry->esn_state.trigger = 0; + return false; + } + + replay_esn = sa_entry->x->replay_esn; + seq_bottom = replay_esn->seq - replay_esn->replay_window + 1; + overlap = sa_entry->esn_state.overlap; + + sa_entry->esn_state.esn = xfrm_replay_seqhi(sa_entry->x, + htonl(seq_bottom)); + esn = &sa_entry->esn_state.esn; + + sa_entry->esn_state.trigger = 1; + if (unlikely(overlap && seq_bottom < MLX5E_IPSEC_ESN_SCOPE_MID)) { + ++(*esn); + sa_entry->esn_state.overlap = 0; + return true; + } else if (unlikely(!overlap && + (seq_bottom >= MLX5E_IPSEC_ESN_SCOPE_MID))) { + sa_entry->esn_state.overlap = 1; + return true; + } + + return false; +} + +static void +mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, + struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + struct xfrm_state *x = sa_entry->x; + struct aes_gcm_keymat *aes_gcm = &attrs->keymat.aes_gcm; + struct aead_geniv_ctx *geniv_ctx; + struct crypto_aead *aead; + unsigned int crypto_data_len, key_len; + 
int ivsize; + + memset(attrs, 0, sizeof(*attrs)); + + /* key */ + crypto_data_len = (x->aead->alg_key_len + 7) / 8; + key_len = crypto_data_len - 4; /* 4 bytes salt at end */ + + memcpy(aes_gcm->aes_key, x->aead->alg_key, key_len); + aes_gcm->key_len = key_len * 8; + + /* salt and seq_iv */ + aead = x->data; + geniv_ctx = crypto_aead_ctx(aead); + ivsize = crypto_aead_ivsize(aead); + memcpy(&aes_gcm->seq_iv, &geniv_ctx->salt, ivsize); + memcpy(&aes_gcm->salt, x->aead->alg_key + key_len, + sizeof(aes_gcm->salt)); + + /* iv len */ + aes_gcm->icv_len = x->aead->alg_icv_len; + + /* esn */ + if (sa_entry->esn_state.trigger) { + attrs->flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED; + attrs->esn = sa_entry->esn_state.esn; + if (sa_entry->esn_state.overlap) + attrs->flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + } + + /* rx handle */ + attrs->sa_handle = sa_entry->handle; + + /* algo type */ + attrs->keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM; + + /* action */ + attrs->action = (!(x->xso.flags & XFRM_OFFLOAD_INBOUND)) ? + MLX5_ACCEL_ESP_ACTION_ENCRYPT : + MLX5_ACCEL_ESP_ACTION_DECRYPT; + /* flags */ + attrs->flags |= (x->props.mode == XFRM_MODE_TRANSPORT) ? + MLX5_ACCEL_ESP_FLAGS_TRANSPORT : + MLX5_ACCEL_ESP_FLAGS_TUNNEL; +} + +static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x) +{ + struct net_device *netdev = x->xso.dev; + struct mlx5e_priv *priv; + + priv = netdev_priv(netdev); + + if (x->props.aalgo != SADB_AALG_NONE) { + netdev_info(netdev, "Cannot offload authenticated xfrm states\n"); + return -EINVAL; + } + if (x->props.ealgo != SADB_X_EALG_AES_GCM_ICV16) { + netdev_info(netdev, "Only AES-GCM-ICV16 xfrm state may be offloaded\n"); + return -EINVAL; + } + if (x->props.calgo != SADB_X_CALG_NONE) { + netdev_info(netdev, "Cannot offload compressed xfrm states\n"); + return -EINVAL; + } + if (x->props.flags & XFRM_STATE_ESN && + !(mlx5_accel_ipsec_device_caps(priv->mdev) & + MLX5_ACCEL_IPSEC_CAP_ESN)) { + netdev_info(netdev, "Cannot offload ESN xfrm states\n"); + return -EINVAL; + } + if (x->props.family != AF_INET && + x->props.family != AF_INET6) { + netdev_info(netdev, "Only IPv4/6 xfrm states may be offloaded\n"); + return -EINVAL; + } + if (x->props.mode != XFRM_MODE_TRANSPORT && + x->props.mode != XFRM_MODE_TUNNEL) { + dev_info(&netdev->dev, "Only transport and tunnel xfrm states may be offloaded\n"); + return -EINVAL; + } + if (x->id.proto != IPPROTO_ESP) { + netdev_info(netdev, "Only ESP xfrm state may be offloaded\n"); + return -EINVAL; + } + if (x->encap) { + netdev_info(netdev, "Encapsulated xfrm state may not be offloaded\n"); + return -EINVAL; + } + if (!x->aead) { + netdev_info(netdev, "Cannot offload xfrm states without aead\n"); + return -EINVAL; + } + if (x->aead->alg_icv_len != 128) { + netdev_info(netdev, "Cannot offload xfrm states with AEAD ICV length other than 128bit\n"); + return -EINVAL; + } + if ((x->aead->alg_key_len != 128 + 32) && + (x->aead->alg_key_len != 256 + 32)) { + netdev_info(netdev, "Cannot offload xfrm states with AEAD key length other than 128/256 bit\n"); + return -EINVAL; + } + if (x->tfcpad) { + netdev_info(netdev, "Cannot offload xfrm states with tfc padding\n"); + return -EINVAL; + } + if (!x->geniv) { + netdev_info(netdev, "Cannot offload xfrm states without geniv\n"); + return -EINVAL; + } + if (strcmp(x->geniv, "seqiv")) { + netdev_info(netdev, "Cannot offload xfrm states with geniv other than seqiv\n"); + return -EINVAL; + } + if (x->props.family == AF_INET6 && + !(mlx5_accel_ipsec_device_caps(priv->mdev) & + 
MLX5_ACCEL_IPSEC_CAP_IPV6)) { + netdev_info(netdev, "IPv6 xfrm state offload is not supported by this device\n"); + return -EINVAL; + } + return 0; +} + +static int mlx5e_xfrm_add_state(struct xfrm_state *x) +{ + struct mlx5e_ipsec_sa_entry *sa_entry = NULL; + struct net_device *netdev = x->xso.dev; + struct mlx5_accel_esp_xfrm_attrs attrs; + struct mlx5e_priv *priv; + __be32 saddr[4] = {0}, daddr[4] = {0}, spi; + bool is_ipv6 = false; + int err; + + priv = netdev_priv(netdev); + + err = mlx5e_xfrm_validate_state(x); + if (err) + return err; + + sa_entry = kzalloc(sizeof(*sa_entry), GFP_KERNEL); + if (!sa_entry) { + err = -ENOMEM; + goto out; + } + + sa_entry->x = x; + sa_entry->ipsec = priv->ipsec; + + /* Add the SA to handle processed incoming packets before the add SA + * completion was received + */ + if (x->xso.flags & XFRM_OFFLOAD_INBOUND) { + err = mlx5e_ipsec_sadb_rx_add(sa_entry); + if (err) { + netdev_info(netdev, "Failed adding to SADB_RX: %d\n", err); + goto err_entry; + } + } else { + sa_entry->set_iv_op = (x->props.flags & XFRM_STATE_ESN) ? + mlx5e_ipsec_set_iv_esn : mlx5e_ipsec_set_iv; + } + + /* check esn */ + mlx5e_ipsec_update_esn_state(sa_entry); + + /* create xfrm */ + mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &attrs); + sa_entry->xfrm = + mlx5_accel_esp_create_xfrm(priv->mdev, &attrs, + MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA); + if (IS_ERR(sa_entry->xfrm)) { + err = PTR_ERR(sa_entry->xfrm); + goto err_sadb_rx; + } + + /* create hw context */ + if (x->props.family == AF_INET) { + saddr[3] = x->props.saddr.a4; + daddr[3] = x->id.daddr.a4; + } else { + memcpy(saddr, x->props.saddr.a6, sizeof(saddr)); + memcpy(daddr, x->id.daddr.a6, sizeof(daddr)); + is_ipv6 = true; + } + spi = x->id.spi; + sa_entry->hw_context = + mlx5_accel_esp_create_hw_context(priv->mdev, + sa_entry->xfrm, + saddr, daddr, spi, + is_ipv6); + if (IS_ERR(sa_entry->hw_context)) { + err = PTR_ERR(sa_entry->hw_context); + goto err_xfrm; + } + + x->xso.offload_handle = (unsigned long)sa_entry; + goto out; + +err_xfrm: + mlx5_accel_esp_destroy_xfrm(sa_entry->xfrm); +err_sadb_rx: + if (x->xso.flags & XFRM_OFFLOAD_INBOUND) { + mlx5e_ipsec_sadb_rx_del(sa_entry); + mlx5e_ipsec_sadb_rx_free(sa_entry); + } +err_entry: + kfree(sa_entry); +out: + return err; +} + +static void mlx5e_xfrm_del_state(struct xfrm_state *x) +{ + struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x); + + if (!sa_entry) + return; + + if (x->xso.flags & XFRM_OFFLOAD_INBOUND) + mlx5e_ipsec_sadb_rx_del(sa_entry); +} + +static void mlx5e_xfrm_free_state(struct xfrm_state *x) +{ + struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x); + + if (!sa_entry) + return; + + if (sa_entry->hw_context) { + flush_workqueue(sa_entry->ipsec->wq); + mlx5_accel_esp_free_hw_context(sa_entry->hw_context); + mlx5_accel_esp_destroy_xfrm(sa_entry->xfrm); + } + + if (x->xso.flags & XFRM_OFFLOAD_INBOUND) + mlx5e_ipsec_sadb_rx_free(sa_entry); + + kfree(sa_entry); +} + +int mlx5e_ipsec_init(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec *ipsec = NULL; + + if (!MLX5_IPSEC_DEV(priv->mdev)) { + netdev_dbg(priv->netdev, "Not an IPSec offload device\n"); + return 0; + } + + ipsec = kzalloc(sizeof(*ipsec), GFP_KERNEL); + if (!ipsec) + return -ENOMEM; + + hash_init(ipsec->sadb_rx); + spin_lock_init(&ipsec->sadb_rx_lock); + ida_init(&ipsec->halloc); + ipsec->en_priv = priv; + ipsec->en_priv->ipsec = ipsec; + ipsec->no_trailer = !!(mlx5_accel_ipsec_device_caps(priv->mdev) & + MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER); + ipsec->wq = 
alloc_ordered_workqueue("mlx5e_ipsec: %s", 0, + priv->netdev->name); + if (!ipsec->wq) { + kfree(ipsec); + return -ENOMEM; + } + netdev_dbg(priv->netdev, "IPSec attached to netdevice\n"); + return 0; +} + +void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec *ipsec = priv->ipsec; + + if (!ipsec) + return; + + drain_workqueue(ipsec->wq); + destroy_workqueue(ipsec->wq); + + ida_destroy(&ipsec->halloc); + kfree(ipsec); + priv->ipsec = NULL; +} + +static bool mlx5e_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + if (x->props.family == AF_INET) { + /* Offload with IPv4 options is not supported yet */ + if (ip_hdr(skb)->ihl > 5) + return false; + } else { + /* Offload with IPv6 extension headers is not support yet */ + if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) + return false; + } + + return true; +} + +struct mlx5e_ipsec_modify_state_work { + struct work_struct work; + struct mlx5_accel_esp_xfrm_attrs attrs; + struct mlx5e_ipsec_sa_entry *sa_entry; +}; + +static void _update_xfrm_state(struct work_struct *work) +{ + int ret; + struct mlx5e_ipsec_modify_state_work *modify_work = + container_of(work, struct mlx5e_ipsec_modify_state_work, work); + struct mlx5e_ipsec_sa_entry *sa_entry = modify_work->sa_entry; + + ret = mlx5_accel_esp_modify_xfrm(sa_entry->xfrm, + &modify_work->attrs); + if (ret) + netdev_warn(sa_entry->ipsec->en_priv->netdev, + "Not an IPSec offload device\n"); + + kfree(modify_work); +} + +static void mlx5e_xfrm_advance_esn_state(struct xfrm_state *x) +{ + struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x); + struct mlx5e_ipsec_modify_state_work *modify_work; + bool need_update; + + if (!sa_entry) + return; + + need_update = mlx5e_ipsec_update_esn_state(sa_entry); + if (!need_update) + return; + + modify_work = kzalloc(sizeof(*modify_work), GFP_ATOMIC); + if (!modify_work) + return; + + mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &modify_work->attrs); + modify_work->sa_entry = sa_entry; + + INIT_WORK(&modify_work->work, _update_xfrm_state); + WARN_ON(!queue_work(sa_entry->ipsec->wq, &modify_work->work)); +} + +static const struct xfrmdev_ops mlx5e_ipsec_xfrmdev_ops = { + .xdo_dev_state_add = mlx5e_xfrm_add_state, + .xdo_dev_state_delete = mlx5e_xfrm_del_state, + .xdo_dev_state_free = mlx5e_xfrm_free_state, + .xdo_dev_offload_ok = mlx5e_ipsec_offload_ok, + .xdo_dev_state_advance_esn = mlx5e_xfrm_advance_esn_state, +}; + +void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct net_device *netdev = priv->netdev; + + if (!priv->ipsec) + return; + + if (!(mlx5_accel_ipsec_device_caps(mdev) & MLX5_ACCEL_IPSEC_CAP_ESP) || + !MLX5_CAP_ETH(mdev, swp)) { + mlx5_core_dbg(mdev, "mlx5e: ESP and SWP offload not supported\n"); + return; + } + + mlx5_core_info(mdev, "mlx5e: IPSec ESP acceleration enabled\n"); + netdev->xfrmdev_ops = &mlx5e_ipsec_xfrmdev_ops; + netdev->features |= NETIF_F_HW_ESP; + netdev->hw_enc_features |= NETIF_F_HW_ESP; + + if (!MLX5_CAP_ETH(mdev, swp_csum)) { + mlx5_core_dbg(mdev, "mlx5e: SWP checksum not supported\n"); + return; + } + + netdev->features |= NETIF_F_HW_ESP_TX_CSUM; + netdev->hw_enc_features |= NETIF_F_HW_ESP_TX_CSUM; + + if (!(mlx5_accel_ipsec_device_caps(mdev) & MLX5_ACCEL_IPSEC_CAP_LSO) || + !MLX5_CAP_ETH(mdev, swp_lso)) { + mlx5_core_dbg(mdev, "mlx5e: ESP LSO not supported\n"); + return; + } + + mlx5_core_dbg(mdev, "mlx5e: ESP GSO capability turned on\n"); + netdev->features |= NETIF_F_GSO_ESP; + netdev->hw_features |= NETIF_F_GSO_ESP; + 
netdev->hw_enc_features |= NETIF_F_GSO_ESP; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h new file mode 100644 index 0000000..1198fc1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __MLX5E_IPSEC_H__ +#define __MLX5E_IPSEC_H__ + +#ifdef CONFIG_MLX5_EN_IPSEC + +#include +#include +#include + +#include "accel/ipsec.h" + +#define MLX5E_IPSEC_SADB_RX_BITS 10 +#define MLX5E_IPSEC_ESN_SCOPE_MID 0x80000000L + +#define MLX5E_METADATA_ETHER_TYPE (0x8CE4) +#define MLX5E_METADATA_ETHER_LEN 8 + +struct mlx5e_priv; + +struct mlx5e_ipsec_sw_stats { + atomic64_t ipsec_rx_drop_sp_alloc; + atomic64_t ipsec_rx_drop_sadb_miss; + atomic64_t ipsec_rx_drop_syndrome; + atomic64_t ipsec_tx_drop_bundle; + atomic64_t ipsec_tx_drop_no_state; + atomic64_t ipsec_tx_drop_not_ip; + atomic64_t ipsec_tx_drop_trailer; + atomic64_t ipsec_tx_drop_metadata; +}; + +struct mlx5e_ipsec_stats { + u64 ipsec_dec_in_packets; + u64 ipsec_dec_out_packets; + u64 ipsec_dec_bypass_packets; + u64 ipsec_enc_in_packets; + u64 ipsec_enc_out_packets; + u64 ipsec_enc_bypass_packets; + u64 ipsec_dec_drop_packets; + u64 ipsec_dec_auth_fail_packets; + u64 ipsec_enc_drop_packets; + u64 ipsec_add_sa_success; + u64 ipsec_add_sa_fail; + u64 ipsec_del_sa_success; + u64 ipsec_del_sa_fail; + u64 ipsec_cmd_drop; +}; + +struct mlx5e_ipsec { + struct mlx5e_priv *en_priv; + DECLARE_HASHTABLE(sadb_rx, MLX5E_IPSEC_SADB_RX_BITS); + bool no_trailer; + spinlock_t sadb_rx_lock; /* Protects sadb_rx and halloc */ + struct ida halloc; + struct mlx5e_ipsec_sw_stats sw_stats; + struct mlx5e_ipsec_stats stats; + struct workqueue_struct *wq; +}; + +struct mlx5e_ipsec_esn_state { + u32 esn; + u8 trigger: 1; + u8 overlap: 1; +}; + +struct mlx5e_ipsec_sa_entry { + struct hlist_node hlist; /* Item in SADB_RX hashtable */ + struct mlx5e_ipsec_esn_state esn_state; + unsigned int handle; /* Handle in SADB_RX */ + struct xfrm_state *x; + struct mlx5e_ipsec *ipsec; + struct mlx5_accel_esp_xfrm *xfrm; + void *hw_context; + void (*set_iv_op)(struct sk_buff 
*skb, struct xfrm_state *x, + struct xfrm_offload *xo); +}; + +void mlx5e_ipsec_build_inverse_table(void); +int mlx5e_ipsec_init(struct mlx5e_priv *priv); +void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv); +void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv); + +int mlx5e_ipsec_get_count(struct mlx5e_priv *priv); +int mlx5e_ipsec_get_strings(struct mlx5e_priv *priv, uint8_t *data); +void mlx5e_ipsec_update_stats(struct mlx5e_priv *priv); +int mlx5e_ipsec_get_stats(struct mlx5e_priv *priv, u64 *data); + +struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup(struct mlx5e_ipsec *dev, + unsigned int handle); + +#else + +static inline void mlx5e_ipsec_build_inverse_table(void) +{ +} + +static inline int mlx5e_ipsec_init(struct mlx5e_priv *priv) +{ + return 0; +} + +static inline void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv) +{ +} + +static inline void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv) +{ +} + +static inline int mlx5e_ipsec_get_count(struct mlx5e_priv *priv) +{ + return 0; +} + +static inline int mlx5e_ipsec_get_strings(struct mlx5e_priv *priv, + uint8_t *data) +{ + return 0; +} + +static inline void mlx5e_ipsec_update_stats(struct mlx5e_priv *priv) +{ +} + +static inline int mlx5e_ipsec_get_stats(struct mlx5e_priv *priv, u64 *data) +{ + return 0; +} + +#endif + +#endif /* __MLX5E_IPSEC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c new file mode 100644 index 0000000..c245d8e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include + +#include "en_accel/ipsec_rxtx.h" +#include "en_accel/ipsec.h" +#include "en.h" + +enum { + MLX5E_IPSEC_RX_SYNDROME_DECRYPTED = 0x11, + MLX5E_IPSEC_RX_SYNDROME_AUTH_FAILED = 0x12, + MLX5E_IPSEC_RX_SYNDROME_BAD_PROTO = 0x17, +}; + +struct mlx5e_ipsec_rx_metadata { + unsigned char nexthdr; + __be32 sa_handle; +} __packed; + +enum { + MLX5E_IPSEC_TX_SYNDROME_OFFLOAD = 0x8, + MLX5E_IPSEC_TX_SYNDROME_OFFLOAD_WITH_LSO_TCP = 0x9, +}; + +struct mlx5e_ipsec_tx_metadata { + __be16 mss_inv; /* 1/MSS in 16bit fixed point, only for LSO */ + __be16 seq; /* LSBs of the first TCP seq, only for LSO */ + u8 esp_next_proto; /* Next protocol of ESP */ +} __packed; + +struct mlx5e_ipsec_metadata { + unsigned char syndrome; + union { + unsigned char raw[5]; + /* from FPGA to host, on successful decrypt */ + struct mlx5e_ipsec_rx_metadata rx; + /* from host to FPGA */ + struct mlx5e_ipsec_tx_metadata tx; + } __packed content; + /* packet type ID field */ + __be16 ethertype; +} __packed; + +#define MAX_LSO_MSS 2048 + +/* Pre-calculated (Q0.16) fixed-point inverse 1/x function */ +static __be16 mlx5e_ipsec_inverse_table[MAX_LSO_MSS]; + +static inline __be16 mlx5e_ipsec_mss_inv(struct sk_buff *skb) +{ + return mlx5e_ipsec_inverse_table[skb_shinfo(skb)->gso_size]; +} + +static struct mlx5e_ipsec_metadata *mlx5e_ipsec_add_metadata(struct sk_buff *skb) +{ + struct mlx5e_ipsec_metadata *mdata; + struct ethhdr *eth; + + if (unlikely(skb_cow_head(skb, sizeof(*mdata)))) + return ERR_PTR(-ENOMEM); + + eth = (struct ethhdr *)skb_push(skb, sizeof(*mdata)); + skb->mac_header -= sizeof(*mdata); + mdata = (struct mlx5e_ipsec_metadata *)(eth + 1); + + memmove(skb->data, skb->data + sizeof(*mdata), + 2 * ETH_ALEN); + + eth->h_proto = cpu_to_be16(MLX5E_METADATA_ETHER_TYPE); + + memset(mdata->content.raw, 0, sizeof(mdata->content.raw)); + return mdata; +} + +static int mlx5e_ipsec_remove_trailer(struct sk_buff *skb, struct xfrm_state *x) +{ + unsigned int alen = crypto_aead_authsize(x->data); + struct ipv6hdr *ipv6hdr = ipv6_hdr(skb); + struct iphdr *ipv4hdr = ip_hdr(skb); + unsigned int trailer_len; + u8 plen; + int ret; + + ret = skb_copy_bits(skb, skb->len - alen - 2, &plen, 1); + if (unlikely(ret)) + return ret; + + trailer_len = alen + plen + 2; + + pskb_trim(skb, skb->len - trailer_len); + if (skb->protocol == htons(ETH_P_IP)) { + ipv4hdr->tot_len = htons(ntohs(ipv4hdr->tot_len) - trailer_len); + ip_send_check(ipv4hdr); + } else { + ipv6hdr->payload_len = htons(ntohs(ipv6hdr->payload_len) - + trailer_len); + } + return 0; +} + +static void mlx5e_ipsec_set_swp(struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg, u8 mode, + struct xfrm_offload *xo) +{ + u8 proto; + + /* Tunnel Mode: + * SWP: OutL3 InL3 InL4 + * Pkt: MAC IP ESP IP L4 + * + * Transport Mode: + * SWP: OutL3 InL4 + * InL3 + * Pkt: MAC IP ESP L4 + * + * Offsets are in 2-byte words, counting from start of frame + */ + eseg->swp_outer_l3_offset = skb_network_offset(skb) / 2; + if (skb->protocol == htons(ETH_P_IPV6)) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L3_IPV6; + + if (mode == XFRM_MODE_TUNNEL) { + eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2; + if (xo->proto == IPPROTO_IPV6) { + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; + proto = inner_ipv6_hdr(skb)->nexthdr; + } else { + proto = inner_ip_hdr(skb)->protocol; + } + } else { + eseg->swp_inner_l3_offset = skb_network_offset(skb) / 2; + if (skb->protocol == htons(ETH_P_IPV6)) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; + proto = 
xo->proto; + } + switch (proto) { + case IPPROTO_UDP: + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; + /* Fall through */ + case IPPROTO_TCP: + eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2; + break; + } +} + +void mlx5e_ipsec_set_iv_esn(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_offload *xo) +{ + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + __u32 oseq = replay_esn->oseq; + int iv_offset; + __be64 seqno; + u32 seq_hi; + + if (unlikely(skb_is_gso(skb) && oseq < MLX5E_IPSEC_ESN_SCOPE_MID && + MLX5E_IPSEC_ESN_SCOPE_MID < (oseq - skb_shinfo(skb)->gso_segs))) { + seq_hi = xo->seq.hi - 1; + } else { + seq_hi = xo->seq.hi; + } + + /* Place the SN in the IV field */ + seqno = cpu_to_be64(xo->seq.low + ((u64)seq_hi << 32)); + iv_offset = skb_transport_offset(skb) + sizeof(struct ip_esp_hdr); + skb_store_bits(skb, iv_offset, &seqno, 8); +} + +void mlx5e_ipsec_set_iv(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_offload *xo) +{ + int iv_offset; + __be64 seqno; + + /* Place the SN in the IV field */ + seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + iv_offset = skb_transport_offset(skb) + sizeof(struct ip_esp_hdr); + skb_store_bits(skb, iv_offset, &seqno, 8); +} + +static void mlx5e_ipsec_set_metadata(struct sk_buff *skb, + struct mlx5e_ipsec_metadata *mdata, + struct xfrm_offload *xo) +{ + struct ip_esp_hdr *esph; + struct tcphdr *tcph; + + if (skb_is_gso(skb)) { + /* Add LSO metadata indication */ + esph = ip_esp_hdr(skb); + tcph = inner_tcp_hdr(skb); + netdev_dbg(skb->dev, " Offloading GSO packet outer L3 %u; L4 %u; Inner L3 %u; L4 %u\n", + skb->network_header, + skb->transport_header, + skb->inner_network_header, + skb->inner_transport_header); + netdev_dbg(skb->dev, " Offloading GSO packet of len %u; mss %u; TCP sp %u dp %u seq 0x%x ESP seq 0x%x\n", + skb->len, skb_shinfo(skb)->gso_size, + ntohs(tcph->source), ntohs(tcph->dest), + ntohl(tcph->seq), ntohl(esph->seq_no)); + mdata->syndrome = MLX5E_IPSEC_TX_SYNDROME_OFFLOAD_WITH_LSO_TCP; + mdata->content.tx.mss_inv = mlx5e_ipsec_mss_inv(skb); + mdata->content.tx.seq = htons(ntohl(tcph->seq) & 0xFFFF); + } else { + mdata->syndrome = MLX5E_IPSEC_TX_SYNDROME_OFFLOAD; + } + mdata->content.tx.esp_next_proto = xo->proto; + + netdev_dbg(skb->dev, " TX metadata syndrome %u proto %u mss_inv %04x seq %04x\n", + mdata->syndrome, mdata->content.tx.esp_next_proto, + ntohs(mdata->content.tx.mss_inv), + ntohs(mdata->content.tx.seq)); +} + +struct sk_buff *mlx5e_ipsec_handle_tx_skb(struct net_device *netdev, + struct mlx5e_tx_wqe *wqe, + struct sk_buff *skb) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct xfrm_offload *xo = xfrm_offload(skb); + struct mlx5e_ipsec_metadata *mdata; + struct mlx5e_ipsec_sa_entry *sa_entry; + struct xfrm_state *x; + + if (!xo) + return skb; + + if (unlikely(skb->sp->len != 1)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_bundle); + goto drop; + } + + x = xfrm_input_state(skb); + if (unlikely(!x)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_no_state); + goto drop; + } + + if (unlikely(!x->xso.offload_handle || + (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)))) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_not_ip); + goto drop; + } + + if (!skb_is_gso(skb)) + if (unlikely(mlx5e_ipsec_remove_trailer(skb, x))) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_trailer); + goto drop; + } + mdata = mlx5e_ipsec_add_metadata(skb); + if (IS_ERR(mdata)) { + 
atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_metadata); + goto drop; + } + mlx5e_ipsec_set_swp(skb, &wqe->eth, x->props.mode, xo); + sa_entry = (struct mlx5e_ipsec_sa_entry *)x->xso.offload_handle; + sa_entry->set_iv_op(skb, x, xo); + mlx5e_ipsec_set_metadata(skb, mdata, xo); + + return skb; + +drop: + kfree_skb(skb); + return NULL; +} + +static inline struct xfrm_state * +mlx5e_ipsec_build_sp(struct net_device *netdev, struct sk_buff *skb, + struct mlx5e_ipsec_metadata *mdata) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct xfrm_offload *xo; + struct xfrm_state *xs; + u32 sa_handle; + + skb->sp = secpath_dup(skb->sp); + if (unlikely(!skb->sp)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sp_alloc); + return NULL; + } + + sa_handle = be32_to_cpu(mdata->content.rx.sa_handle); + xs = mlx5e_ipsec_sadb_rx_lookup(priv->ipsec, sa_handle); + if (unlikely(!xs)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sadb_miss); + return NULL; + } + + skb->sp->xvec[skb->sp->len++] = xs; + skb->sp->olen++; + + xo = xfrm_offload(skb); + xo->flags = CRYPTO_DONE; + switch (mdata->syndrome) { + case MLX5E_IPSEC_RX_SYNDROME_DECRYPTED: + xo->status = CRYPTO_SUCCESS; + if (likely(priv->ipsec->no_trailer)) { + xo->flags |= XFRM_ESP_NO_TRAILER; + xo->proto = mdata->content.rx.nexthdr; + } + break; + case MLX5E_IPSEC_RX_SYNDROME_AUTH_FAILED: + xo->status = CRYPTO_TUNNEL_ESP_AUTH_FAILED; + break; + case MLX5E_IPSEC_RX_SYNDROME_BAD_PROTO: + xo->status = CRYPTO_INVALID_PROTOCOL; + break; + default: + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_syndrome); + return NULL; + } + return xs; +} + +struct sk_buff *mlx5e_ipsec_handle_rx_skb(struct net_device *netdev, + struct sk_buff *skb) +{ + struct mlx5e_ipsec_metadata *mdata; + struct ethhdr *old_eth; + struct ethhdr *new_eth; + struct xfrm_state *xs; + __be16 *ethtype; + + /* Detect inline metadata */ + if (skb->len < ETH_HLEN + MLX5E_METADATA_ETHER_LEN) + return skb; + ethtype = (__be16 *)(skb->data + ETH_ALEN * 2); + if (*ethtype != cpu_to_be16(MLX5E_METADATA_ETHER_TYPE)) + return skb; + + /* Use the metadata */ + mdata = (struct mlx5e_ipsec_metadata *)(skb->data + ETH_HLEN); + xs = mlx5e_ipsec_build_sp(netdev, skb, mdata); + if (unlikely(!xs)) { + kfree_skb(skb); + return NULL; + } + + /* Remove the metadata from the buffer */ + old_eth = (struct ethhdr *)skb->data; + new_eth = (struct ethhdr *)(skb->data + MLX5E_METADATA_ETHER_LEN); + memmove(new_eth, old_eth, 2 * ETH_ALEN); + /* Ethertype is already in its new place */ + skb_pull_inline(skb, MLX5E_METADATA_ETHER_LEN); + + return skb; +} + +bool mlx5e_ipsec_feature_check(struct sk_buff *skb, struct net_device *netdev, + netdev_features_t features) +{ + struct xfrm_state *x; + + if (skb->sp && skb->sp->len) { + x = skb->sp->xvec[0]; + if (x && x->xso.offload_handle) + return true; + } + return false; +} + +void mlx5e_ipsec_build_inverse_table(void) +{ + u16 mss_inv; + u32 mss; + + /* Calculate 1/x inverse table for use in GSO data path. + * Using this table, we provide the IPSec accelerator with the value of + * 1/gso_size so that it can infer the position of each segment inside + * the GSO, and increment the ESP sequence number, and generate the IV. 
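 + * For example, for gso_size 1024 the table entry is (1ULL << 32) / 1024 >> 16 = 64 = 0x0040, i.e. 64/65536 == 1/1024.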
+ * The HW needs this value in Q0.16 fixed-point number format + */ + mlx5e_ipsec_inverse_table[1] = htons(0xFFFF); + for (mss = 2; mss < MAX_LSO_MSS; mss++) { + mss_inv = div_u64(1ULL << 32, mss) >> 16; + mlx5e_ipsec_inverse_table[mss] = htons(mss_inv); + } +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h new file mode 100644 index 0000000..2bfbbef --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __MLX5E_IPSEC_RXTX_H__ +#define __MLX5E_IPSEC_RXTX_H__ + +#ifdef CONFIG_MLX5_EN_IPSEC + +#include +#include +#include "en.h" + +struct sk_buff *mlx5e_ipsec_handle_rx_skb(struct net_device *netdev, + struct sk_buff *skb); +void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); + +void mlx5e_ipsec_inverse_table_init(void); +bool mlx5e_ipsec_feature_check(struct sk_buff *skb, struct net_device *netdev, + netdev_features_t features); +void mlx5e_ipsec_set_iv_esn(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_offload *xo); +void mlx5e_ipsec_set_iv(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_offload *xo); +struct sk_buff *mlx5e_ipsec_handle_tx_skb(struct net_device *netdev, + struct mlx5e_tx_wqe *wqe, + struct sk_buff *skb); + +#endif /* CONFIG_MLX5_EN_IPSEC */ + +#endif /* __MLX5E_IPSEC_RXTX_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c new file mode 100644 index 0000000..6fea592 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "en.h" +#include "accel/ipsec.h" +#include "fpga/sdk.h" +#include "en_accel/ipsec.h" + +static const struct counter_desc mlx5e_ipsec_hw_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_in_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_out_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_bypass_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_in_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_out_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_bypass_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_drop_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_auth_fail_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_drop_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_add_sa_success) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_add_sa_fail) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_del_sa_success) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_del_sa_fail) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_cmd_drop) }, +}; + +static const struct counter_desc mlx5e_ipsec_sw_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_sp_alloc) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_sadb_miss) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_syndrome) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_bundle) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_no_state) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_not_ip) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_trailer) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_metadata) }, +}; + +#define MLX5E_READ_CTR_ATOMIC64(ptr, dsc, i) \ + atomic64_read((atomic64_t *)((char *)(ptr) + (dsc)[i].offset)) + +#define NUM_IPSEC_HW_COUNTERS ARRAY_SIZE(mlx5e_ipsec_hw_stats_desc) +#define NUM_IPSEC_SW_COUNTERS ARRAY_SIZE(mlx5e_ipsec_sw_stats_desc) + +#define NUM_IPSEC_COUNTERS (NUM_IPSEC_HW_COUNTERS + NUM_IPSEC_SW_COUNTERS) + +int 
mlx5e_ipsec_get_count(struct mlx5e_priv *priv) +{ + if (!priv->ipsec) + return 0; + + return NUM_IPSEC_COUNTERS; +} + +int mlx5e_ipsec_get_strings(struct mlx5e_priv *priv, uint8_t *data) +{ + unsigned int i, idx = 0; + + if (!priv->ipsec) + return 0; + + for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + mlx5e_ipsec_hw_stats_desc[i].format); + + for (i = 0; i < NUM_IPSEC_SW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + mlx5e_ipsec_sw_stats_desc[i].format); + + return NUM_IPSEC_COUNTERS; +} + +void mlx5e_ipsec_update_stats(struct mlx5e_priv *priv) +{ + int ret; + + if (!priv->ipsec) + return; + + ret = mlx5_accel_ipsec_counters_read(priv->mdev, (u64 *)&priv->ipsec->stats, + NUM_IPSEC_HW_COUNTERS); + if (ret) + memset(&priv->ipsec->stats, 0, sizeof(priv->ipsec->stats)); +} + +int mlx5e_ipsec_get_stats(struct mlx5e_priv *priv, u64 *data) +{ + int i, idx = 0; + + if (!priv->ipsec) + return 0; + + for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(&priv->ipsec->stats, + mlx5e_ipsec_hw_stats_desc, i); + + for (i = 0; i < NUM_IPSEC_SW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR_ATOMIC64(&priv->ipsec->sw_stats, + mlx5e_ipsec_sw_stats_desc, i); + + return NUM_IPSEC_COUNTERS; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c new file mode 100644 index 0000000..610d485 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -0,0 +1,738 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifdef CONFIG_RFS_ACCEL + +#include +#include +#include +#include +#include "en.h" + +struct arfs_tuple { + __be16 etype; + u8 ip_proto; + union { + __be32 src_ipv4; + struct in6_addr src_ipv6; + }; + union { + __be32 dst_ipv4; + struct in6_addr dst_ipv6; + }; + __be16 src_port; + __be16 dst_port; +}; + +struct arfs_rule { + struct mlx5e_priv *priv; + struct work_struct arfs_work; + struct mlx5_flow_handle *rule; + struct hlist_node hlist; + int rxq; + /* Flow ID passed to ndo_rx_flow_steer */ + int flow_id; + /* Filter ID returned by ndo_rx_flow_steer */ + int filter_id; + struct arfs_tuple tuple; +}; + +#define mlx5e_for_each_arfs_rule(hn, tmp, arfs_tables, i, j) \ + for (i = 0; i < ARFS_NUM_TYPES; i++) \ + mlx5e_for_each_hash_arfs_rule(hn, tmp, arfs_tables[i].rules_hash, j) + +#define mlx5e_for_each_hash_arfs_rule(hn, tmp, hash, j) \ + for (j = 0; j < ARFS_HASH_SIZE; j++) \ + hlist_for_each_entry_safe(hn, tmp, &hash[j], hlist) + +static enum mlx5e_traffic_types arfs_get_tt(enum arfs_type type) +{ + switch (type) { + case ARFS_IPV4_TCP: + return MLX5E_TT_IPV4_TCP; + case ARFS_IPV4_UDP: + return MLX5E_TT_IPV4_UDP; + case ARFS_IPV6_TCP: + return MLX5E_TT_IPV6_TCP; + case ARFS_IPV6_UDP: + return MLX5E_TT_IPV6_UDP; + default: + return -EINVAL; + } +} + +static int arfs_disable(struct mlx5e_priv *priv) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5e_tir *tir = priv->indir_tir; + int err = 0; + int tt; + int i; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + for (i = 0; i < ARFS_NUM_TYPES; i++) { + dest.tir_num = tir[i].tirn; + tt = arfs_get_tt(i); + /* Modify ttc rules destination to bypass the aRFS tables*/ + err = mlx5_modify_rule_destination(priv->fs.ttc.rules[tt], + &dest, NULL); + if (err) { + netdev_err(priv->netdev, + "%s: modify ttc destination failed\n", + __func__); + return err; + } + } + return 0; +} + +static void arfs_del_rules(struct mlx5e_priv *priv); + +int mlx5e_arfs_disable(struct mlx5e_priv *priv) +{ + arfs_del_rules(priv); + + return arfs_disable(priv); +} + +int mlx5e_arfs_enable(struct mlx5e_priv *priv) +{ + struct mlx5_flow_destination dest = {}; + int err = 0; + int tt; + int i; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + for (i = 0; i < ARFS_NUM_TYPES; i++) { + dest.ft = priv->fs.arfs.arfs_tables[i].ft.t; + tt = arfs_get_tt(i); + /* Modify ttc rules destination to point on the aRFS FTs */ + err = mlx5_modify_rule_destination(priv->fs.ttc.rules[tt], + &dest, NULL); + if (err) { + netdev_err(priv->netdev, + "%s: modify ttc destination failed err=%d\n", + __func__, err); + arfs_disable(priv); + return err; + } + } + return 0; +} + +static void arfs_destroy_table(struct arfs_table *arfs_t) +{ + mlx5_del_flow_rules(arfs_t->default_rule); + mlx5e_destroy_flow_table(&arfs_t->ft); +} + +void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv) +{ + int i; + + if (!(priv->netdev->hw_features & NETIF_F_NTUPLE)) + return; + + arfs_del_rules(priv); + destroy_workqueue(priv->fs.arfs.wq); + for (i = 0; i < ARFS_NUM_TYPES; i++) { + if (!IS_ERR_OR_NULL(priv->fs.arfs.arfs_tables[i].ft.t)) + arfs_destroy_table(&priv->fs.arfs.arfs_tables[i]); + } +} + +static int arfs_add_default_rule(struct mlx5e_priv *priv, + enum arfs_type type) +{ + struct arfs_table *arfs_t = &priv->fs.arfs.arfs_tables[type]; + struct mlx5e_tir *tir = priv->indir_tir; + struct mlx5_flow_destination dest = {}; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_spec *spec; + enum mlx5e_traffic_types tt; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { 
+ err = -ENOMEM; + goto out; + } + + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + tt = arfs_get_tt(type); + if (tt == -EINVAL) { + netdev_err(priv->netdev, "%s: bad arfs_type: %d\n", + __func__, type); + err = -EINVAL; + goto out; + } + + dest.tir_num = tir[tt].tirn; + + arfs_t->default_rule = mlx5_add_flow_rules(arfs_t->ft.t, spec, + &flow_act, + &dest, 1); + if (IS_ERR(arfs_t->default_rule)) { + err = PTR_ERR(arfs_t->default_rule); + arfs_t->default_rule = NULL; + netdev_err(priv->netdev, "%s: add rule failed, arfs type=%d\n", + __func__, type); + } +out: + kvfree(spec); + return err; +} + +#define MLX5E_ARFS_NUM_GROUPS 2 +#define MLX5E_ARFS_GROUP1_SIZE BIT(12) +#define MLX5E_ARFS_GROUP2_SIZE BIT(0) +#define MLX5E_ARFS_TABLE_SIZE (MLX5E_ARFS_GROUP1_SIZE +\ + MLX5E_ARFS_GROUP2_SIZE) +static int arfs_create_groups(struct mlx5e_flow_table *ft, + enum arfs_type type) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + void *outer_headers_c; + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(MLX5E_ARFS_NUM_GROUPS, + sizeof(*ft->g), GFP_KERNEL); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in || !ft->g) { + kvfree(ft->g); + kvfree(in); + return -ENOMEM; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + outer_headers_c = MLX5_ADDR_OF(fte_match_param, mc, + outer_headers); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ethertype); + switch (type) { + case ARFS_IPV4_TCP: + case ARFS_IPV6_TCP: + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport); + break; + case ARFS_IPV4_UDP: + case ARFS_IPV6_UDP: + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, udp_dport); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, udp_sport); + break; + default: + err = -EINVAL; + goto out; + } + + switch (type) { + case ARFS_IPV4_TCP: + case ARFS_IPV4_UDP: + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + break; + case ARFS_IPV6_TCP: + case ARFS_IPV6_UDP: + memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + 0xff, 16); + memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, 16); + break; + default: + err = -EINVAL; + goto out; + } + + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_ARFS_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_ARFS_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; +out: + kvfree(in); + + return err; +} + +static int arfs_create_table(struct mlx5e_priv *priv, + enum arfs_type type) +{ + struct mlx5e_arfs_tables *arfs = &priv->fs.arfs; + struct mlx5e_flow_table *ft = &arfs->arfs_tables[type].ft; + struct mlx5_flow_table_attr ft_attr = {}; + int err; + + ft->num_groups = 0; + + ft_attr.max_fte = MLX5E_ARFS_TABLE_SIZE; + ft_attr.level = MLX5E_ARFS_FT_LEVEL; + 
ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + + err = arfs_create_groups(ft, type); + if (err) + goto err; + + err = arfs_add_default_rule(priv, type); + if (err) + goto err; + + return 0; +err: + mlx5e_destroy_flow_table(ft); + return err; +} + +int mlx5e_arfs_create_tables(struct mlx5e_priv *priv) +{ + int err = 0; + int i; + + if (!(priv->netdev->hw_features & NETIF_F_NTUPLE)) + return 0; + + spin_lock_init(&priv->fs.arfs.arfs_lock); + INIT_LIST_HEAD(&priv->fs.arfs.rules); + priv->fs.arfs.wq = create_singlethread_workqueue("mlx5e_arfs"); + if (!priv->fs.arfs.wq) + return -ENOMEM; + + for (i = 0; i < ARFS_NUM_TYPES; i++) { + err = arfs_create_table(priv, i); + if (err) + goto err; + } + return 0; +err: + mlx5e_arfs_destroy_tables(priv); + return err; +} + +#define MLX5E_ARFS_EXPIRY_QUOTA 60 + +static void arfs_may_expire_flow(struct mlx5e_priv *priv) +{ + struct arfs_rule *arfs_rule; + struct hlist_node *htmp; + int quota = 0; + int i; + int j; + + HLIST_HEAD(del_list); + spin_lock_bh(&priv->fs.arfs.arfs_lock); + mlx5e_for_each_arfs_rule(arfs_rule, htmp, priv->fs.arfs.arfs_tables, i, j) { + if (quota++ > MLX5E_ARFS_EXPIRY_QUOTA) + break; + if (!work_pending(&arfs_rule->arfs_work) && + rps_may_expire_flow(priv->netdev, + arfs_rule->rxq, arfs_rule->flow_id, + arfs_rule->filter_id)) { + hlist_del_init(&arfs_rule->hlist); + hlist_add_head(&arfs_rule->hlist, &del_list); + } + } + spin_unlock_bh(&priv->fs.arfs.arfs_lock); + hlist_for_each_entry_safe(arfs_rule, htmp, &del_list, hlist) { + if (arfs_rule->rule) + mlx5_del_flow_rules(arfs_rule->rule); + hlist_del(&arfs_rule->hlist); + kfree(arfs_rule); + } +} + +static void arfs_del_rules(struct mlx5e_priv *priv) +{ + struct hlist_node *htmp; + struct arfs_rule *rule; + int i; + int j; + + HLIST_HEAD(del_list); + spin_lock_bh(&priv->fs.arfs.arfs_lock); + mlx5e_for_each_arfs_rule(rule, htmp, priv->fs.arfs.arfs_tables, i, j) { + hlist_del_init(&rule->hlist); + hlist_add_head(&rule->hlist, &del_list); + } + spin_unlock_bh(&priv->fs.arfs.arfs_lock); + + hlist_for_each_entry_safe(rule, htmp, &del_list, hlist) { + cancel_work_sync(&rule->arfs_work); + if (rule->rule) + mlx5_del_flow_rules(rule->rule); + hlist_del(&rule->hlist); + kfree(rule); + } +} + +static struct hlist_head * +arfs_hash_bucket(struct arfs_table *arfs_t, __be16 src_port, + __be16 dst_port) +{ + unsigned long l; + int bucket_idx; + + l = (__force unsigned long)src_port | + ((__force unsigned long)dst_port << 2); + + bucket_idx = hash_long(l, ARFS_HASH_SHIFT); + + return &arfs_t->rules_hash[bucket_idx]; +} + +static u8 arfs_get_ip_proto(const struct sk_buff *skb) +{ + return (skb->protocol == htons(ETH_P_IP)) ? 
+ ip_hdr(skb)->protocol : ipv6_hdr(skb)->nexthdr; +} + +static struct arfs_table *arfs_get_table(struct mlx5e_arfs_tables *arfs, + u8 ip_proto, __be16 etype) +{ + if (etype == htons(ETH_P_IP) && ip_proto == IPPROTO_TCP) + return &arfs->arfs_tables[ARFS_IPV4_TCP]; + if (etype == htons(ETH_P_IP) && ip_proto == IPPROTO_UDP) + return &arfs->arfs_tables[ARFS_IPV4_UDP]; + if (etype == htons(ETH_P_IPV6) && ip_proto == IPPROTO_TCP) + return &arfs->arfs_tables[ARFS_IPV6_TCP]; + if (etype == htons(ETH_P_IPV6) && ip_proto == IPPROTO_UDP) + return &arfs->arfs_tables[ARFS_IPV6_UDP]; + + return NULL; +} + +static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv, + struct arfs_rule *arfs_rule) +{ + struct mlx5e_arfs_tables *arfs = &priv->fs.arfs; + struct arfs_tuple *tuple = &arfs_rule->tuple; + struct mlx5_flow_handle *rule = NULL; + struct mlx5_flow_destination dest = {}; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct arfs_table *arfs_table; + struct mlx5_flow_spec *spec; + struct mlx5_flow_table *ft; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.ethertype); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, + ntohs(tuple->etype)); + arfs_table = arfs_get_table(arfs, tuple->ip_proto, tuple->etype); + if (!arfs_table) { + err = -EINVAL; + goto out; + } + + ft = arfs_table->ft.t; + if (tuple->ip_proto == IPPROTO_TCP) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.tcp_dport); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.tcp_sport); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.tcp_dport, + ntohs(tuple->dst_port)); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.tcp_sport, + ntohs(tuple->src_port)); + } else { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.udp_dport); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.udp_sport); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.udp_dport, + ntohs(tuple->dst_port)); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.udp_sport, + ntohs(tuple->src_port)); + } + if (tuple->etype == htons(ETH_P_IP)) { + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4), + &tuple->src_ipv4, + 4); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &tuple->dst_ipv4, + 4); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + } else { + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + &tuple->src_ipv6, + 16); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &tuple->dst_ipv6, + 16); + memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + 0xff, + 16); + memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, + 16); + } + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest.tir_num = priv->direct_tir[arfs_rule->rxq].tirn; + rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 
1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "%s: add rule(filter id=%d, rq idx=%d) failed, err=%d\n", + __func__, arfs_rule->filter_id, arfs_rule->rxq, err); + } + +out: + kvfree(spec); + return err ? ERR_PTR(err) : rule; +} + +static void arfs_modify_rule_rq(struct mlx5e_priv *priv, + struct mlx5_flow_handle *rule, u16 rxq) +{ + struct mlx5_flow_destination dst = {}; + int err = 0; + + dst.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dst.tir_num = priv->direct_tir[rxq].tirn; + err = mlx5_modify_rule_destination(rule, &dst, NULL); + if (err) + netdev_warn(priv->netdev, + "Failed to modify aRFS rule destination to rq=%d\n", rxq); +} + +static void arfs_handle_work(struct work_struct *work) +{ + struct arfs_rule *arfs_rule = container_of(work, + struct arfs_rule, + arfs_work); + struct mlx5e_priv *priv = arfs_rule->priv; + struct mlx5_flow_handle *rule; + + mutex_lock(&priv->state_lock); + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + spin_lock_bh(&priv->fs.arfs.arfs_lock); + hlist_del(&arfs_rule->hlist); + spin_unlock_bh(&priv->fs.arfs.arfs_lock); + + mutex_unlock(&priv->state_lock); + kfree(arfs_rule); + goto out; + } + mutex_unlock(&priv->state_lock); + + if (!arfs_rule->rule) { + rule = arfs_add_rule(priv, arfs_rule); + if (IS_ERR(rule)) + goto out; + arfs_rule->rule = rule; + } else { + arfs_modify_rule_rq(priv, arfs_rule->rule, + arfs_rule->rxq); + } +out: + arfs_may_expire_flow(priv); +} + +/* return L4 destination port from ip4/6 packets */ +static __be16 arfs_get_dst_port(const struct sk_buff *skb) +{ + char *transport_header; + + transport_header = skb_transport_header(skb); + if (arfs_get_ip_proto(skb) == IPPROTO_TCP) + return ((struct tcphdr *)transport_header)->dest; + return ((struct udphdr *)transport_header)->dest; +} + +/* return L4 source port from ip4/6 packets */ +static __be16 arfs_get_src_port(const struct sk_buff *skb) +{ + char *transport_header; + + transport_header = skb_transport_header(skb); + if (arfs_get_ip_proto(skb) == IPPROTO_TCP) + return ((struct tcphdr *)transport_header)->source; + return ((struct udphdr *)transport_header)->source; +} + +static struct arfs_rule *arfs_alloc_rule(struct mlx5e_priv *priv, + struct arfs_table *arfs_t, + const struct sk_buff *skb, + u16 rxq, u32 flow_id) +{ + struct arfs_rule *rule; + struct arfs_tuple *tuple; + + rule = kzalloc(sizeof(*rule), GFP_ATOMIC); + if (!rule) + return NULL; + + rule->priv = priv; + rule->rxq = rxq; + INIT_WORK(&rule->arfs_work, arfs_handle_work); + + tuple = &rule->tuple; + tuple->etype = skb->protocol; + if (tuple->etype == htons(ETH_P_IP)) { + tuple->src_ipv4 = ip_hdr(skb)->saddr; + tuple->dst_ipv4 = ip_hdr(skb)->daddr; + } else { + memcpy(&tuple->src_ipv6, &ipv6_hdr(skb)->saddr, + sizeof(struct in6_addr)); + memcpy(&tuple->dst_ipv6, &ipv6_hdr(skb)->daddr, + sizeof(struct in6_addr)); + } + tuple->ip_proto = arfs_get_ip_proto(skb); + tuple->src_port = arfs_get_src_port(skb); + tuple->dst_port = arfs_get_dst_port(skb); + + rule->flow_id = flow_id; + rule->filter_id = priv->fs.arfs.last_filter_id++ % RPS_NO_FILTER; + + hlist_add_head(&rule->hlist, + arfs_hash_bucket(arfs_t, tuple->src_port, + tuple->dst_port)); + return rule; +} + +static bool arfs_cmp_ips(struct arfs_tuple *tuple, + const struct sk_buff *skb) +{ + if (tuple->etype == htons(ETH_P_IP) && + tuple->src_ipv4 == ip_hdr(skb)->saddr && + tuple->dst_ipv4 == ip_hdr(skb)->daddr) + return true; + if (tuple->etype == htons(ETH_P_IPV6) && + (!memcmp(&tuple->src_ipv6, &ipv6_hdr(skb)->saddr, + sizeof(struct 
in6_addr))) && + (!memcmp(&tuple->dst_ipv6, &ipv6_hdr(skb)->daddr, + sizeof(struct in6_addr)))) + return true; + return false; +} + +static struct arfs_rule *arfs_find_rule(struct arfs_table *arfs_t, + const struct sk_buff *skb) +{ + struct arfs_rule *arfs_rule; + struct hlist_head *head; + __be16 src_port = arfs_get_src_port(skb); + __be16 dst_port = arfs_get_dst_port(skb); + + head = arfs_hash_bucket(arfs_t, src_port, dst_port); + hlist_for_each_entry(arfs_rule, head, hlist) { + if (arfs_rule->tuple.src_port == src_port && + arfs_rule->tuple.dst_port == dst_port && + arfs_cmp_ips(&arfs_rule->tuple, skb)) { + return arfs_rule; + } + } + + return NULL; +} + +int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, + u16 rxq_index, u32 flow_id) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_arfs_tables *arfs = &priv->fs.arfs; + struct arfs_table *arfs_t; + struct arfs_rule *arfs_rule; + + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -EPROTONOSUPPORT; + + arfs_t = arfs_get_table(arfs, arfs_get_ip_proto(skb), skb->protocol); + if (!arfs_t) + return -EPROTONOSUPPORT; + + spin_lock_bh(&arfs->arfs_lock); + arfs_rule = arfs_find_rule(arfs_t, skb); + if (arfs_rule) { + if (arfs_rule->rxq == rxq_index) { + spin_unlock_bh(&arfs->arfs_lock); + return arfs_rule->filter_id; + } + arfs_rule->rxq = rxq_index; + } else { + arfs_rule = arfs_alloc_rule(priv, arfs_t, skb, + rxq_index, flow_id); + if (!arfs_rule) { + spin_unlock_bh(&arfs->arfs_lock); + return -ENOMEM; + } + } + queue_work(priv->fs.arfs.wq, &arfs_rule->arfs_work); + spin_unlock_bh(&arfs->arfs_lock); + return arfs_rule->filter_id; +} +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c new file mode 100644 index 0000000..db3278c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "en.h" + +/* mlx5e global resources should be placed in this file. + * Global resources are common to all the netdevices crated on the same nic. 
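+ * (e.g. the PD, transport domain, mkey and bfreg allocated in mlx5e_create_mdev_resources() below, and the TIR list kept in mdev->mlx5e_res.td.tirs_list).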
+ */ + +int mlx5e_create_tir(struct mlx5_core_dev *mdev, + struct mlx5e_tir *tir, u32 *in, int inlen) +{ + int err; + + err = mlx5_core_create_tir(mdev, in, inlen, &tir->tirn); + if (err) + return err; + + list_add(&tir->list, &mdev->mlx5e_res.td.tirs_list); + + return 0; +} + +void mlx5e_destroy_tir(struct mlx5_core_dev *mdev, + struct mlx5e_tir *tir) +{ + mlx5_core_destroy_tir(mdev, tir->tirn); + list_del(&tir->list); +} + +static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, + struct mlx5_core_mkey *mkey) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(mdev, mkey, in, inlen); + + kvfree(in); + return err; +} + +int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) +{ + struct mlx5e_resources *res = &mdev->mlx5e_res; + int err; + + err = mlx5_core_alloc_pd(mdev, &res->pdn); + if (err) { + mlx5_core_err(mdev, "alloc pd failed, %d\n", err); + return err; + } + + err = mlx5_core_alloc_transport_domain(mdev, &res->td.tdn); + if (err) { + mlx5_core_err(mdev, "alloc td failed, %d\n", err); + goto err_dealloc_pd; + } + + err = mlx5e_create_mkey(mdev, res->pdn, &res->mkey); + if (err) { + mlx5_core_err(mdev, "create mkey failed, %d\n", err); + goto err_dealloc_transport_domain; + } + + err = mlx5_alloc_bfreg(mdev, &res->bfreg, false, false); + if (err) { + mlx5_core_err(mdev, "alloc bfreg failed, %d\n", err); + goto err_destroy_mkey; + } + + INIT_LIST_HEAD(&mdev->mlx5e_res.td.tirs_list); + + return 0; + +err_destroy_mkey: + mlx5_core_destroy_mkey(mdev, &res->mkey); +err_dealloc_transport_domain: + mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); +err_dealloc_pd: + mlx5_core_dealloc_pd(mdev, res->pdn); + return err; +} + +void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) +{ + struct mlx5e_resources *res = &mdev->mlx5e_res; + + mlx5_free_bfreg(mdev, &res->bfreg); + mlx5_core_destroy_mkey(mdev, &res->mkey); + mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); + mlx5_core_dealloc_pd(mdev, res->pdn); + memset(res, 0, sizeof(*res)); +} + +int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_tir *tir; + int err = -ENOMEM; + u32 tirn = 0; + int inlen; + void *in; + + inlen = MLX5_ST_SZ_BYTES(modify_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + goto out; + + if (enable_uc_lb) + MLX5_SET(modify_tir_in, in, ctx.self_lb_block, + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_); + + MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1); + + list_for_each_entry(tir, &mdev->mlx5e_res.td.tirs_list, list) { + tirn = tir->tirn; + err = mlx5_core_modify_tir(mdev, tirn, in, inlen); + if (err) + goto out; + } + +out: + kvfree(in); + if (err) + netdev_err(priv->netdev, "refresh tir(0x%x) failed, %d\n", tirn, err); + + return err; +} + +u8 mlx5e_params_calculate_tx_min_inline(struct mlx5_core_dev *mdev) +{ + u8 min_inline_mode; + + mlx5_query_min_inline(mdev, &min_inline_mode); + if (min_inline_mode == MLX5_INLINE_MODE_NONE && + !MLX5_CAP_ETH(mdev, wqe_vlan_insert)) + min_inline_mode = MLX5_INLINE_MODE_L2; + + return min_inline_mode; +} diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c new file mode 100644 index 0000000..c641d56 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -0,0 +1,1095 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include "en.h" + +#define MLX5E_MAX_PRIORITY 8 + +#define MLX5E_100MB (100000) +#define MLX5E_1GB (1000000) + +#define MLX5E_CEE_STATE_UP 1 +#define MLX5E_CEE_STATE_DOWN 0 + +enum { + MLX5E_VENDOR_TC_GROUP_NUM = 7, + MLX5E_LOWEST_PRIO_GROUP = 0, +}; + +#define MLX5_DSCP_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, qcam_reg) && \ + MLX5_CAP_QCAM_REG(mdev, qpts) && \ + MLX5_CAP_QCAM_REG(mdev, qpdpm)) + +static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state); +static int mlx5e_set_dscp2prio(struct mlx5e_priv *priv, u8 dscp, u8 prio); + +/* If dcbx mode is non-host set the dcbx mode to host. 
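+ * In host mode the DCBX configuration is owned by the driver; in non-host (AUTO) mode it is firmware controlled, see mlx5e_dcbnl_query_dcbx_mode() and mlx5e_dcbnl_setdcbx() below.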
+ */ +static int mlx5e_dcbnl_set_dcbx_mode(struct mlx5e_priv *priv, + enum mlx5_dcbx_oper_mode mode) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 param[MLX5_ST_SZ_DW(dcbx_param)]; + int err; + + err = mlx5_query_port_dcbx_param(mdev, param); + if (err) + return err; + + MLX5_SET(dcbx_param, param, version_admin, mode); + if (mode != MLX5E_DCBX_PARAM_VER_OPER_HOST) + MLX5_SET(dcbx_param, param, willing_admin, 1); + + return mlx5_set_port_dcbx_param(mdev, param); +} + +static int mlx5e_dcbnl_switch_to_host_mode(struct mlx5e_priv *priv) +{ + struct mlx5e_dcbx *dcbx = &priv->dcbx; + int err; + + if (!MLX5_CAP_GEN(priv->mdev, dcbx)) + return 0; + + if (dcbx->mode == MLX5E_DCBX_PARAM_VER_OPER_HOST) + return 0; + + err = mlx5e_dcbnl_set_dcbx_mode(priv, MLX5E_DCBX_PARAM_VER_OPER_HOST); + if (err) + return err; + + dcbx->mode = MLX5E_DCBX_PARAM_VER_OPER_HOST; + return 0; +} + +static int mlx5e_dcbnl_ieee_getets(struct net_device *netdev, + struct ieee_ets *ets) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 tc_group[IEEE_8021QAZ_MAX_TCS]; + bool is_tc_group_6_exist = false; + bool is_zero_bw_ets_tc = false; + int err = 0; + int i; + + if (!MLX5_CAP_GEN(priv->mdev, ets)) + return -EOPNOTSUPP; + + ets->ets_cap = mlx5_max_tc(priv->mdev) + 1; + for (i = 0; i < ets->ets_cap; i++) { + err = mlx5_query_port_prio_tc(mdev, i, &ets->prio_tc[i]); + if (err) + return err; + + err = mlx5_query_port_tc_group(mdev, i, &tc_group[i]); + if (err) + return err; + + err = mlx5_query_port_tc_bw_alloc(mdev, i, &ets->tc_tx_bw[i]); + if (err) + return err; + + if (ets->tc_tx_bw[i] < MLX5E_MAX_BW_ALLOC && + tc_group[i] == (MLX5E_LOWEST_PRIO_GROUP + 1)) + is_zero_bw_ets_tc = true; + + if (tc_group[i] == (MLX5E_VENDOR_TC_GROUP_NUM - 1)) + is_tc_group_6_exist = true; + } + + /* Report 0% ets tc if exits*/ + if (is_zero_bw_ets_tc) { + for (i = 0; i < ets->ets_cap; i++) + if (tc_group[i] == MLX5E_LOWEST_PRIO_GROUP) + ets->tc_tx_bw[i] = 0; + } + + /* Update tc_tsa based on fw setting*/ + for (i = 0; i < ets->ets_cap; i++) { + if (ets->tc_tx_bw[i] < MLX5E_MAX_BW_ALLOC) + priv->dcbx.tc_tsa[i] = IEEE_8021QAZ_TSA_ETS; + else if (tc_group[i] == MLX5E_VENDOR_TC_GROUP_NUM && + !is_tc_group_6_exist) + priv->dcbx.tc_tsa[i] = IEEE_8021QAZ_TSA_VENDOR; + } + memcpy(ets->tc_tsa, priv->dcbx.tc_tsa, sizeof(ets->tc_tsa)); + + return err; +} + +static void mlx5e_build_tc_group(struct ieee_ets *ets, u8 *tc_group, int max_tc) +{ + bool any_tc_mapped_to_ets = false; + bool ets_zero_bw = false; + int strict_group; + int i; + + for (i = 0; i <= max_tc; i++) { + if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) { + any_tc_mapped_to_ets = true; + if (!ets->tc_tx_bw[i]) + ets_zero_bw = true; + } + } + + /* strict group has higher priority than ets group */ + strict_group = MLX5E_LOWEST_PRIO_GROUP; + if (any_tc_mapped_to_ets) + strict_group++; + if (ets_zero_bw) + strict_group++; + + for (i = 0; i <= max_tc; i++) { + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_VENDOR: + tc_group[i] = MLX5E_VENDOR_TC_GROUP_NUM; + break; + case IEEE_8021QAZ_TSA_STRICT: + tc_group[i] = strict_group++; + break; + case IEEE_8021QAZ_TSA_ETS: + tc_group[i] = MLX5E_LOWEST_PRIO_GROUP; + if (ets->tc_tx_bw[i] && ets_zero_bw) + tc_group[i] = MLX5E_LOWEST_PRIO_GROUP + 1; + break; + } + } +} + +static void mlx5e_build_tc_tx_bw(struct ieee_ets *ets, u8 *tc_tx_bw, + u8 *tc_group, int max_tc) +{ + int bw_for_ets_zero_bw_tc = 0; + int last_ets_zero_bw_tc = -1; + int num_ets_zero_bw = 0; + int i; + + for (i = 0; i <= max_tc; i++) 
{ + if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS && + !ets->tc_tx_bw[i]) { + num_ets_zero_bw++; + last_ets_zero_bw_tc = i; + } + } + + if (num_ets_zero_bw) + bw_for_ets_zero_bw_tc = MLX5E_MAX_BW_ALLOC / num_ets_zero_bw; + + for (i = 0; i <= max_tc; i++) { + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_VENDOR: + tc_tx_bw[i] = MLX5E_MAX_BW_ALLOC; + break; + case IEEE_8021QAZ_TSA_STRICT: + tc_tx_bw[i] = MLX5E_MAX_BW_ALLOC; + break; + case IEEE_8021QAZ_TSA_ETS: + tc_tx_bw[i] = ets->tc_tx_bw[i] ? + ets->tc_tx_bw[i] : + bw_for_ets_zero_bw_tc; + break; + } + } + + /* Make sure the total bw for ets zero bw group is 100% */ + if (last_ets_zero_bw_tc != -1) + tc_tx_bw[last_ets_zero_bw_tc] += + MLX5E_MAX_BW_ALLOC % num_ets_zero_bw; +} + +/* If there are ETS BW 0, + * Set ETS group # to 1 for all ETS non zero BW tcs. Their sum must be 100%. + * Set group #0 to all the ETS BW 0 tcs and + * equally splits the 100% BW between them + * Report both group #0 and #1 as ETS type. + * All the tcs in group #0 will be reported with 0% BW. + */ +int mlx5e_dcbnl_ieee_setets_core(struct mlx5e_priv *priv, struct ieee_ets *ets) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u8 tc_tx_bw[IEEE_8021QAZ_MAX_TCS]; + u8 tc_group[IEEE_8021QAZ_MAX_TCS]; + int max_tc = mlx5_max_tc(mdev); + int err, i; + + mlx5e_build_tc_group(ets, tc_group, max_tc); + mlx5e_build_tc_tx_bw(ets, tc_tx_bw, tc_group, max_tc); + + err = mlx5_set_port_prio_tc(mdev, ets->prio_tc); + if (err) + return err; + + err = mlx5_set_port_tc_group(mdev, tc_group); + if (err) + return err; + + err = mlx5_set_port_tc_bw_alloc(mdev, tc_tx_bw); + + if (err) + return err; + + memcpy(priv->dcbx.tc_tsa, ets->tc_tsa, sizeof(ets->tc_tsa)); + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + mlx5e_dbg(HW, priv, "%s: prio_%d <=> tc_%d\n", + __func__, i, ets->prio_tc[i]); + mlx5e_dbg(HW, priv, "%s: tc_%d <=> tx_bw_%d%%, group_%d\n", + __func__, i, tc_tx_bw[i], tc_group[i]); + } + + return err; +} + +static int mlx5e_dbcnl_validate_ets(struct net_device *netdev, + struct ieee_ets *ets) +{ + bool have_ets_tc = false; + int bw_sum = 0; + int i; + + /* Validate Priority */ + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + if (ets->prio_tc[i] >= MLX5E_MAX_PRIORITY) { + netdev_err(netdev, + "Failed to validate ETS: priority value greater than max(%d)\n", + MLX5E_MAX_PRIORITY); + return -EINVAL; + } + } + + /* Validate Bandwidth Sum */ + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) { + have_ets_tc = true; + bw_sum += ets->tc_tx_bw[i]; + } + } + + if (have_ets_tc && bw_sum != 100) { + netdev_err(netdev, + "Failed to validate ETS: BW sum is illegal\n"); + return -EINVAL; + } + return 0; +} + +static int mlx5e_dcbnl_ieee_setets(struct net_device *netdev, + struct ieee_ets *ets) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + if (!MLX5_CAP_GEN(priv->mdev, ets)) + return -EOPNOTSUPP; + + err = mlx5e_dbcnl_validate_ets(netdev, ets); + if (err) + return err; + + err = mlx5e_dcbnl_ieee_setets_core(priv, ets); + if (err) + return err; + + return 0; +} + +static int mlx5e_dcbnl_ieee_getpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + int i; + + pfc->pfc_cap = mlx5_max_tc(mdev) + 1; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + pfc->requests[i] = PPORT_PER_PRIO_GET(pstats, i, tx_pause); + pfc->indications[i] = PPORT_PER_PRIO_GET(pstats, i, rx_pause); + } + + return 
mlx5_query_port_pfc(mdev, &pfc->pfc_en, NULL); +} + +static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 curr_pfc_en; + int ret; + + mlx5_query_port_pfc(mdev, &curr_pfc_en, NULL); + + if (pfc->pfc_en == curr_pfc_en) + return 0; + + ret = mlx5_set_port_pfc(mdev, pfc->pfc_en, pfc->pfc_en); + mlx5_toggle_port_link(mdev); + + if (!ret) { + mlx5e_dbg(HW, priv, + "%s: PFC per priority bit mask: 0x%x\n", + __func__, pfc->pfc_en); + } + return ret; +} + +static u8 mlx5e_dcbnl_getdcbx(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return priv->dcbx.cap; +} + +static u8 mlx5e_dcbnl_setdcbx(struct net_device *dev, u8 mode) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_dcbx *dcbx = &priv->dcbx; + + if (mode & DCB_CAP_DCBX_LLD_MANAGED) + return 1; + + if ((!mode) && MLX5_CAP_GEN(priv->mdev, dcbx)) { + if (dcbx->mode == MLX5E_DCBX_PARAM_VER_OPER_AUTO) + return 0; + + /* set dcbx to fw controlled */ + if (!mlx5e_dcbnl_set_dcbx_mode(priv, MLX5E_DCBX_PARAM_VER_OPER_AUTO)) { + dcbx->mode = MLX5E_DCBX_PARAM_VER_OPER_AUTO; + dcbx->cap &= ~DCB_CAP_DCBX_HOST; + return 0; + } + + return 1; + } + + if (!(mode & DCB_CAP_DCBX_HOST)) + return 1; + + if (mlx5e_dcbnl_switch_to_host_mode(netdev_priv(dev))) + return 1; + + dcbx->cap = mode; + + return 0; +} + +static int mlx5e_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct dcb_app temp; + bool is_new; + int err; + + if (app->selector != IEEE_8021QAZ_APP_SEL_DSCP) + return -EINVAL; + + if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager)) + return -EINVAL; + + if (!MLX5_DSCP_SUPPORTED(priv->mdev)) + return -EINVAL; + + if (app->protocol >= MLX5E_MAX_DSCP) + return -EINVAL; + + /* Save the old entry info */ + temp.selector = IEEE_8021QAZ_APP_SEL_DSCP; + temp.protocol = app->protocol; + temp.priority = priv->dcbx_dp.dscp2prio[app->protocol]; + + /* Check if need to switch to dscp trust state */ + if (!priv->dcbx.dscp_app_cnt) { + err = mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_DSCP); + if (err) + return err; + } + + /* Skip the fw command if new and old mapping are the same */ + if (app->priority != priv->dcbx_dp.dscp2prio[app->protocol]) { + err = mlx5e_set_dscp2prio(priv, app->protocol, app->priority); + if (err) + goto fw_err; + } + + /* Delete the old entry if exists */ + is_new = false; + err = dcb_ieee_delapp(dev, &temp); + if (err) + is_new = true; + + /* Add new entry and update counter */ + err = dcb_ieee_setapp(dev, app); + if (err) + return err; + + if (is_new) + priv->dcbx.dscp_app_cnt++; + + return err; + +fw_err: + mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_PCP); + return err; +} + +static int mlx5e_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int err; + + if (app->selector != IEEE_8021QAZ_APP_SEL_DSCP) + return -EINVAL; + + if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager)) + return -EINVAL; + + if (!MLX5_DSCP_SUPPORTED(priv->mdev)) + return -EINVAL; + + if (app->protocol >= MLX5E_MAX_DSCP) + return -EINVAL; + + /* Skip if no dscp app entry */ + if (!priv->dcbx.dscp_app_cnt) + return -ENOENT; + + /* Check if the entry matches fw setting */ + if (app->priority != priv->dcbx_dp.dscp2prio[app->protocol]) + return -ENOENT; + + /* Delete the app entry */ + err = dcb_ieee_delapp(dev, app); + if (err) + return err; + + /* Reset the priority mapping 
back to zero */ + err = mlx5e_set_dscp2prio(priv, app->protocol, 0); + if (err) + goto fw_err; + + priv->dcbx.dscp_app_cnt--; + + /* Check if need to switch to pcp trust state */ + if (!priv->dcbx.dscp_app_cnt) + err = mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_PCP); + + return err; + +fw_err: + mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_PCP); + return err; +} + +static int mlx5e_dcbnl_ieee_getmaxrate(struct net_device *netdev, + struct ieee_maxrate *maxrate) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; + u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; + int err; + int i; + + err = mlx5_query_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit); + if (err) + return err; + + memset(maxrate->tc_maxrate, 0, sizeof(maxrate->tc_maxrate)); + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + switch (max_bw_unit[i]) { + case MLX5_100_MBPS_UNIT: + maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_100MB; + break; + case MLX5_GBPS_UNIT: + maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_1GB; + break; + case MLX5_BW_NO_LIMIT: + break; + default: + WARN(true, "non-supported BW unit"); + break; + } + } + + return 0; +} + +static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev, + struct ieee_maxrate *maxrate) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; + u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; + __u64 upper_limit_mbps = roundup(255 * MLX5E_100MB, MLX5E_1GB); + int i; + + memset(max_bw_value, 0, sizeof(max_bw_value)); + memset(max_bw_unit, 0, sizeof(max_bw_unit)); + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + if (!maxrate->tc_maxrate[i]) { + max_bw_unit[i] = MLX5_BW_NO_LIMIT; + continue; + } + if (maxrate->tc_maxrate[i] < upper_limit_mbps) { + max_bw_value[i] = div_u64(maxrate->tc_maxrate[i], + MLX5E_100MB); + max_bw_value[i] = max_bw_value[i] ? 
max_bw_value[i] : 1; + max_bw_unit[i] = MLX5_100_MBPS_UNIT; + } else { + max_bw_value[i] = div_u64(maxrate->tc_maxrate[i], + MLX5E_1GB); + max_bw_unit[i] = MLX5_GBPS_UNIT; + } + } + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + mlx5e_dbg(HW, priv, "%s: tc_%d <=> max_bw %d Gbps\n", + __func__, i, max_bw_value[i]); + } + + return mlx5_modify_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit); +} + +static u8 mlx5e_dcbnl_setall(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg; + struct mlx5_core_dev *mdev = priv->mdev; + struct ieee_ets ets; + struct ieee_pfc pfc; + int err = -EOPNOTSUPP; + int i; + + if (!MLX5_CAP_GEN(mdev, ets)) + goto out; + + memset(&ets, 0, sizeof(ets)); + memset(&pfc, 0, sizeof(pfc)); + + ets.ets_cap = IEEE_8021QAZ_MAX_TCS; + for (i = 0; i < CEE_DCBX_MAX_PGS; i++) { + ets.tc_tx_bw[i] = cee_cfg->pg_bw_pct[i]; + ets.tc_rx_bw[i] = cee_cfg->pg_bw_pct[i]; + ets.tc_tsa[i] = IEEE_8021QAZ_TSA_ETS; + ets.prio_tc[i] = cee_cfg->prio_to_pg_map[i]; + mlx5e_dbg(HW, priv, + "%s: Priority group %d: tx_bw %d, rx_bw %d, prio_tc %d\n", + __func__, i, ets.tc_tx_bw[i], ets.tc_rx_bw[i], + ets.prio_tc[i]); + } + + err = mlx5e_dbcnl_validate_ets(netdev, &ets); + if (err) { + netdev_err(netdev, + "%s, Failed to validate ETS: %d\n", __func__, err); + goto out; + } + + err = mlx5e_dcbnl_ieee_setets_core(priv, &ets); + if (err) { + netdev_err(netdev, + "%s, Failed to set ETS: %d\n", __func__, err); + goto out; + } + + /* Set PFC */ + pfc.pfc_cap = mlx5_max_tc(mdev) + 1; + if (!cee_cfg->pfc_enable) + pfc.pfc_en = 0; + else + for (i = 0; i < CEE_DCBX_MAX_PRIO; i++) + pfc.pfc_en |= cee_cfg->pfc_setting[i] << i; + + err = mlx5e_dcbnl_ieee_setpfc(netdev, &pfc); + if (err) { + netdev_err(netdev, + "%s, Failed to set PFC: %d\n", __func__, err); + goto out; + } +out: + return err ? 
MLX5_DCB_NO_CHG : MLX5_DCB_CHG_RESET; +} + +static u8 mlx5e_dcbnl_getstate(struct net_device *netdev) +{ + return MLX5E_CEE_STATE_UP; +} + +static void mlx5e_dcbnl_getpermhwaddr(struct net_device *netdev, + u8 *perm_addr) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + if (!perm_addr) + return; + + memset(perm_addr, 0xff, MAX_ADDR_LEN); + + mlx5_query_nic_vport_mac_address(priv->mdev, 0, perm_addr); +} + +static void mlx5e_dcbnl_setpgtccfgtx(struct net_device *netdev, + int priority, u8 prio_type, + u8 pgid, u8 bw_pct, u8 up_map) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg; + + if (priority >= CEE_DCBX_MAX_PRIO) { + netdev_err(netdev, + "%s, priority is out of range\n", __func__); + return; + } + + if (pgid >= CEE_DCBX_MAX_PGS) { + netdev_err(netdev, + "%s, priority group is out of range\n", __func__); + return; + } + + cee_cfg->prio_to_pg_map[priority] = pgid; +} + +static void mlx5e_dcbnl_setpgbwgcfgtx(struct net_device *netdev, + int pgid, u8 bw_pct) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg; + + if (pgid >= CEE_DCBX_MAX_PGS) { + netdev_err(netdev, + "%s, priority group is out of range\n", __func__); + return; + } + + cee_cfg->pg_bw_pct[pgid] = bw_pct; +} + +static void mlx5e_dcbnl_getpgtccfgtx(struct net_device *netdev, + int priority, u8 *prio_type, + u8 *pgid, u8 *bw_pct, u8 *up_map) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (!MLX5_CAP_GEN(priv->mdev, ets)) { + netdev_err(netdev, "%s, ets is not supported\n", __func__); + return; + } + + if (priority >= CEE_DCBX_MAX_PRIO) { + netdev_err(netdev, + "%s, priority is out of range\n", __func__); + return; + } + + *prio_type = 0; + *bw_pct = 0; + *up_map = 0; + + if (mlx5_query_port_prio_tc(mdev, priority, pgid)) + *pgid = 0; +} + +static void mlx5e_dcbnl_getpgbwgcfgtx(struct net_device *netdev, + int pgid, u8 *bw_pct) +{ + struct ieee_ets ets; + + if (pgid >= CEE_DCBX_MAX_PGS) { + netdev_err(netdev, + "%s, priority group is out of range\n", __func__); + return; + } + + mlx5e_dcbnl_ieee_getets(netdev, &ets); + *bw_pct = ets.tc_tx_bw[pgid]; +} + +static void mlx5e_dcbnl_setpfccfg(struct net_device *netdev, + int priority, u8 setting) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg; + + if (priority >= CEE_DCBX_MAX_PRIO) { + netdev_err(netdev, + "%s, priority is out of range\n", __func__); + return; + } + + if (setting > 1) + return; + + cee_cfg->pfc_setting[priority] = setting; +} + +static int +mlx5e_dcbnl_get_priority_pfc(struct net_device *netdev, + int priority, u8 *setting) +{ + struct ieee_pfc pfc; + int err; + + err = mlx5e_dcbnl_ieee_getpfc(netdev, &pfc); + + if (err) + *setting = 0; + else + *setting = (pfc.pfc_en >> priority) & 0x01; + + return err; +} + +static void mlx5e_dcbnl_getpfccfg(struct net_device *netdev, + int priority, u8 *setting) +{ + if (priority >= CEE_DCBX_MAX_PRIO) { + netdev_err(netdev, + "%s, priority is out of range\n", __func__); + return; + } + + if (!setting) + return; + + mlx5e_dcbnl_get_priority_pfc(netdev, priority, setting); +} + +static u8 mlx5e_dcbnl_getcap(struct net_device *netdev, + int capid, u8 *cap) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 rval = 0; + + switch (capid) { + case DCB_CAP_ATTR_PG: + *cap = true; + break; + case DCB_CAP_ATTR_PFC: + *cap = true; + break; + case 
DCB_CAP_ATTR_UP2TC: + *cap = false; + break; + case DCB_CAP_ATTR_PG_TCS: + *cap = 1 << mlx5_max_tc(mdev); + break; + case DCB_CAP_ATTR_PFC_TCS: + *cap = 1 << mlx5_max_tc(mdev); + break; + case DCB_CAP_ATTR_GSP: + *cap = false; + break; + case DCB_CAP_ATTR_BCN: + *cap = false; + break; + case DCB_CAP_ATTR_DCBX: + *cap = priv->dcbx.cap | + DCB_CAP_DCBX_VER_CEE | + DCB_CAP_DCBX_VER_IEEE; + break; + default: + *cap = 0; + rval = 1; + break; + } + + return rval; +} + +static int mlx5e_dcbnl_getnumtcs(struct net_device *netdev, + int tcs_id, u8 *num) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + switch (tcs_id) { + case DCB_NUMTCS_ATTR_PG: + case DCB_NUMTCS_ATTR_PFC: + *num = mlx5_max_tc(mdev) + 1; + break; + default: + return -EINVAL; + } + + return 0; +} + +static u8 mlx5e_dcbnl_getpfcstate(struct net_device *netdev) +{ + struct ieee_pfc pfc; + + if (mlx5e_dcbnl_ieee_getpfc(netdev, &pfc)) + return MLX5E_CEE_STATE_DOWN; + + return pfc.pfc_en ? MLX5E_CEE_STATE_UP : MLX5E_CEE_STATE_DOWN; +} + +static void mlx5e_dcbnl_setpfcstate(struct net_device *netdev, u8 state) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg; + + if ((state != MLX5E_CEE_STATE_UP) && (state != MLX5E_CEE_STATE_DOWN)) + return; + + cee_cfg->pfc_enable = state; +} + +const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = { + .ieee_getets = mlx5e_dcbnl_ieee_getets, + .ieee_setets = mlx5e_dcbnl_ieee_setets, + .ieee_getmaxrate = mlx5e_dcbnl_ieee_getmaxrate, + .ieee_setmaxrate = mlx5e_dcbnl_ieee_setmaxrate, + .ieee_getpfc = mlx5e_dcbnl_ieee_getpfc, + .ieee_setpfc = mlx5e_dcbnl_ieee_setpfc, + .ieee_setapp = mlx5e_dcbnl_ieee_setapp, + .ieee_delapp = mlx5e_dcbnl_ieee_delapp, + .getdcbx = mlx5e_dcbnl_getdcbx, + .setdcbx = mlx5e_dcbnl_setdcbx, + +/* CEE interfaces */ + .setall = mlx5e_dcbnl_setall, + .getstate = mlx5e_dcbnl_getstate, + .getpermhwaddr = mlx5e_dcbnl_getpermhwaddr, + + .setpgtccfgtx = mlx5e_dcbnl_setpgtccfgtx, + .setpgbwgcfgtx = mlx5e_dcbnl_setpgbwgcfgtx, + .getpgtccfgtx = mlx5e_dcbnl_getpgtccfgtx, + .getpgbwgcfgtx = mlx5e_dcbnl_getpgbwgcfgtx, + + .setpfccfg = mlx5e_dcbnl_setpfccfg, + .getpfccfg = mlx5e_dcbnl_getpfccfg, + .getcap = mlx5e_dcbnl_getcap, + .getnumtcs = mlx5e_dcbnl_getnumtcs, + .getpfcstate = mlx5e_dcbnl_getpfcstate, + .setpfcstate = mlx5e_dcbnl_setpfcstate, +}; + +static void mlx5e_dcbnl_query_dcbx_mode(struct mlx5e_priv *priv, + enum mlx5_dcbx_oper_mode *mode) +{ + u32 out[MLX5_ST_SZ_DW(dcbx_param)]; + + *mode = MLX5E_DCBX_PARAM_VER_OPER_HOST; + + if (!mlx5_query_port_dcbx_param(priv->mdev, out)) + *mode = MLX5_GET(dcbx_param, out, version_oper); + + /* From driver's point of view, we only care if the mode + * is host (HOST) or non-host (AUTO) + */ + if (*mode != MLX5E_DCBX_PARAM_VER_OPER_HOST) + *mode = MLX5E_DCBX_PARAM_VER_OPER_AUTO; +} + +static void mlx5e_ets_init(struct mlx5e_priv *priv) +{ + struct ieee_ets ets; + int err; + int i; + + if (!MLX5_CAP_GEN(priv->mdev, ets)) + return; + + memset(&ets, 0, sizeof(ets)); + ets.ets_cap = mlx5_max_tc(priv->mdev) + 1; + for (i = 0; i < ets.ets_cap; i++) { + ets.tc_tx_bw[i] = MLX5E_MAX_BW_ALLOC; + ets.tc_tsa[i] = IEEE_8021QAZ_TSA_VENDOR; + ets.prio_tc[i] = i; + } + + if (ets.ets_cap > 1) { + /* tclass[prio=0]=1, tclass[prio=1]=0, tclass[prio=i]=i (for i>1) */ + ets.prio_tc[0] = 1; + ets.prio_tc[1] = 0; + } + + err = mlx5e_dcbnl_ieee_setets_core(priv, &ets); + if (err) + netdev_err(priv->netdev, + "%s, Failed to init ETS: %d\n", __func__, err); +} + +enum { + 
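+ /* actions for mlx5e_dcbnl_dscp_app(): INIT installs the dscp2prio app entries, DELETE removes them */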
INIT, + DELETE, +}; + +static void mlx5e_dcbnl_dscp_app(struct mlx5e_priv *priv, int action) +{ + struct dcb_app temp; + int i; + + if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager)) + return; + + if (!MLX5_DSCP_SUPPORTED(priv->mdev)) + return; + + /* No SEL_DSCP entry in non DSCP state */ + if (priv->dcbx_dp.trust_state != MLX5_QPTS_TRUST_DSCP) + return; + + temp.selector = IEEE_8021QAZ_APP_SEL_DSCP; + for (i = 0; i < MLX5E_MAX_DSCP; i++) { + temp.protocol = i; + temp.priority = priv->dcbx_dp.dscp2prio[i]; + if (action == INIT) + dcb_ieee_setapp(priv->netdev, &temp); + else + dcb_ieee_delapp(priv->netdev, &temp); + } + + priv->dcbx.dscp_app_cnt = (action == INIT) ? MLX5E_MAX_DSCP : 0; +} + +void mlx5e_dcbnl_init_app(struct mlx5e_priv *priv) +{ + mlx5e_dcbnl_dscp_app(priv, INIT); +} + +void mlx5e_dcbnl_delete_app(struct mlx5e_priv *priv) +{ + mlx5e_dcbnl_dscp_app(priv, DELETE); +} + +static void mlx5e_trust_update_tx_min_inline_mode(struct mlx5e_priv *priv, + struct mlx5e_params *params) +{ + params->tx_min_inline_mode = mlx5e_params_calculate_tx_min_inline(priv->mdev); + if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_DSCP && + params->tx_min_inline_mode == MLX5_INLINE_MODE_L2) + params->tx_min_inline_mode = MLX5_INLINE_MODE_IP; +} + +static void mlx5e_trust_update_sq_inline_mode(struct mlx5e_priv *priv) +{ + struct mlx5e_channels new_channels = {}; + + mutex_lock(&priv->state_lock); + + new_channels.params = priv->channels.params; + mlx5e_trust_update_tx_min_inline_mode(priv, &new_channels.params); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + goto out; + } + + /* Skip if tx_min_inline is the same */ + if (new_channels.params.tx_min_inline_mode == + priv->channels.params.tx_min_inline_mode) + goto out; + + if (mlx5e_open_channels(priv, &new_channels)) + goto out; + mlx5e_switch_priv_channels(priv, &new_channels, NULL); + +out: + mutex_unlock(&priv->state_lock); +} + +static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state) +{ + int err; + + err = mlx5_set_trust_state(priv->mdev, trust_state); + if (err) + return err; + priv->dcbx_dp.trust_state = trust_state; + mlx5e_trust_update_sq_inline_mode(priv); + + return err; +} + +static int mlx5e_set_dscp2prio(struct mlx5e_priv *priv, u8 dscp, u8 prio) +{ + int err; + + err = mlx5_set_dscp2prio(priv->mdev, dscp, prio); + if (err) + return err; + + priv->dcbx_dp.dscp2prio[dscp] = prio; + return err; +} + +static int mlx5e_trust_initialize(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + if (!MLX5_DSCP_SUPPORTED(mdev)) + return 0; + + err = mlx5_query_trust_state(priv->mdev, &priv->dcbx_dp.trust_state); + if (err) + return err; + + mlx5e_trust_update_tx_min_inline_mode(priv, &priv->channels.params); + + err = mlx5_query_dscp2prio(priv->mdev, priv->dcbx_dp.dscp2prio); + if (err) + return err; + + return 0; +} + +void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) +{ + struct mlx5e_dcbx *dcbx = &priv->dcbx; + + mlx5e_trust_initialize(priv); + + if (!MLX5_CAP_GEN(priv->mdev, qos)) + return; + + if (MLX5_CAP_GEN(priv->mdev, dcbx)) + mlx5e_dcbnl_query_dcbx_mode(priv, &dcbx->mode); + + priv->dcbx.cap = DCB_CAP_DCBX_VER_CEE | + DCB_CAP_DCBX_VER_IEEE; + if (priv->dcbx.mode == MLX5E_DCBX_PARAM_VER_OPER_HOST) + priv->dcbx.cap |= DCB_CAP_DCBX_HOST; + + mlx5e_ets_init(priv); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c new file mode 100644 index 0000000..602851a --- /dev/null 
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "en.h" + +void mlx5e_rx_dim_work(struct work_struct *work) +{ + struct net_dim *dim = container_of(work, struct net_dim, + work); + struct mlx5e_rq *rq = container_of(dim, struct mlx5e_rq, dim); + struct net_dim_cq_moder cur_profile = net_dim_get_profile(dim->mode, + dim->profile_ix); + + mlx5_core_modify_cq_moderation(rq->mdev, &rq->cq.mcq, + cur_profile.usec, cur_profile.pkts); + + dim->state = NET_DIM_START_MEASURE; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c new file mode 100644 index 0000000..37fd024 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -0,0 +1,1712 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "en.h" + +void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, + struct ethtool_drvinfo *drvinfo) +{ + struct mlx5_core_dev *mdev = priv->mdev; + + strlcpy(drvinfo->driver, DRIVER_NAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, DRIVER_VERSION, + sizeof(drvinfo->version)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d (%.16s)", + fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev), + mdev->board_id); + strlcpy(drvinfo->bus_info, pci_name(mdev->pdev), + sizeof(drvinfo->bus_info)); +} + +static void mlx5e_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_drvinfo(priv, drvinfo); +} + +struct ptys2ethtool_config { + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + __ETHTOOL_DECLARE_LINK_MODE_MASK(advertised); + u32 speed; +}; + +static struct ptys2ethtool_config ptys2ethtool_table[MLX5E_LINK_MODES_NUMBER]; + +#define MLX5_BUILD_PTYS2ETHTOOL_CONFIG(reg_, speed_, ...) \ + ({ \ + struct ptys2ethtool_config *cfg; \ + const unsigned int modes[] = { __VA_ARGS__ }; \ + unsigned int i; \ + cfg = &ptys2ethtool_table[reg_]; \ + cfg->speed = speed_; \ + bitmap_zero(cfg->supported, \ + __ETHTOOL_LINK_MODE_MASK_NBITS); \ + bitmap_zero(cfg->advertised, \ + __ETHTOOL_LINK_MODE_MASK_NBITS); \ + for (i = 0 ; i < ARRAY_SIZE(modes) ; ++i) { \ + __set_bit(modes[i], cfg->supported); \ + __set_bit(modes[i], cfg->advertised); \ + } \ + }) + +void mlx5e_build_ptys2ethtool_map(void) +{ + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_CX_SGMII, SPEED_1000, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_KX, SPEED_1000, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_CX4, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_KX4, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_KR, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_20GBASE_KR2, SPEED_20000, + ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_CR4, SPEED_40000, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_KR4, SPEED_40000, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_56GBASE_R4, SPEED_56000, + ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_CR, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_SR, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_ER, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_SR4, SPEED_40000, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_LR4, SPEED_40000, + ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_SR2, SPEED_50000, + ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_CR4, SPEED_100000, + ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT); + 
MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_SR4, SPEED_100000, + ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_KR4, SPEED_100000, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_LR4, SPEED_100000, + ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_T, SPEED_10000, + ETHTOOL_LINK_MODE_10000baseT_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_CR, SPEED_25000, + ETHTOOL_LINK_MODE_25000baseCR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_KR, SPEED_25000, + ETHTOOL_LINK_MODE_25000baseKR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_SR, SPEED_25000, + ETHTOOL_LINK_MODE_25000baseSR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_CR2, SPEED_50000, + ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_KR2, SPEED_50000, + ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT); +} + +int mlx5e_ethtool_get_sset_count(struct mlx5e_priv *priv, int sset) +{ + int i, num_stats = 0; + + switch (sset) { + case ETH_SS_STATS: + for (i = 0; i < mlx5e_num_stats_grps; i++) + num_stats += mlx5e_stats_grps[i].get_num_stats(priv); + return num_stats; + case ETH_SS_PRIV_FLAGS: + return ARRAY_SIZE(mlx5e_priv_flags); + case ETH_SS_TEST: + return mlx5e_self_test_num(priv); + /* fallthrough */ + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_get_sset_count(struct net_device *dev, int sset) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_get_sset_count(priv, sset); +} + +static void mlx5e_fill_stats_strings(struct mlx5e_priv *priv, u8 *data) +{ + int i, idx = 0; + + for (i = 0; i < mlx5e_num_stats_grps; i++) + idx = mlx5e_stats_grps[i].fill_strings(priv, data, idx); +} + +void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv, u32 stringset, u8 *data) +{ + int i; + + switch (stringset) { + case ETH_SS_PRIV_FLAGS: + for (i = 0; i < ARRAY_SIZE(mlx5e_priv_flags); i++) + strcpy(data + i * ETH_GSTRING_LEN, mlx5e_priv_flags[i]); + break; + + case ETH_SS_TEST: + for (i = 0; i < mlx5e_self_test_num(priv); i++) + strcpy(data + i * ETH_GSTRING_LEN, + mlx5e_self_tests[i]); + break; + + case ETH_SS_STATS: + mlx5e_fill_stats_strings(priv, data); + break; + } +} + +static void mlx5e_get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_strings(priv, stringset, data); +} + +void mlx5e_ethtool_get_ethtool_stats(struct mlx5e_priv *priv, + struct ethtool_stats *stats, u64 *data) +{ + int i, idx = 0; + + mutex_lock(&priv->state_lock); + mlx5e_update_stats(priv); + mutex_unlock(&priv->state_lock); + + for (i = 0; i < mlx5e_num_stats_grps; i++) + idx = mlx5e_stats_grps[i].fill_stats(priv, data, idx); +} + +static void mlx5e_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, + u64 *data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_ethtool_stats(priv, stats, data); +} + +void mlx5e_ethtool_get_ringparam(struct mlx5e_priv *priv, + struct ethtool_ringparam *param) +{ + param->rx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE; + param->tx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE; + param->rx_pending = 1 << priv->channels.params.log_rq_mtu_frames; + param->tx_pending = 1 << priv->channels.params.log_sq_size; +} + +static void mlx5e_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *param) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + 
mlx5e_ethtool_get_ringparam(priv, param); +} + +int mlx5e_ethtool_set_ringparam(struct mlx5e_priv *priv, + struct ethtool_ringparam *param) +{ + struct mlx5e_channels new_channels = {}; + u8 log_rq_size; + u8 log_sq_size; + int err = 0; + + if (param->rx_jumbo_pending) { + netdev_info(priv->netdev, "%s: rx_jumbo_pending not supported\n", + __func__); + return -EINVAL; + } + if (param->rx_mini_pending) { + netdev_info(priv->netdev, "%s: rx_mini_pending not supported\n", + __func__); + return -EINVAL; + } + + if (param->rx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)) { + netdev_info(priv->netdev, "%s: rx_pending (%d) < min (%d)\n", + __func__, param->rx_pending, + 1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE); + return -EINVAL; + } + + if (param->tx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) { + netdev_info(priv->netdev, "%s: tx_pending (%d) < min (%d)\n", + __func__, param->tx_pending, + 1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); + return -EINVAL; + } + + log_rq_size = order_base_2(param->rx_pending); + log_sq_size = order_base_2(param->tx_pending); + + if (log_rq_size == priv->channels.params.log_rq_mtu_frames && + log_sq_size == priv->channels.params.log_sq_size) + return 0; + + mutex_lock(&priv->state_lock); + + new_channels.params = priv->channels.params; + new_channels.params.log_rq_mtu_frames = log_rq_size; + new_channels.params.log_sq_size = log_sq_size; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + goto unlock; + } + + err = mlx5e_open_channels(priv, &new_channels); + if (err) + goto unlock; + + mlx5e_switch_priv_channels(priv, &new_channels, NULL); + +unlock: + mutex_unlock(&priv->state_lock); + + return err; +} + +static int mlx5e_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *param) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_set_ringparam(priv, param); +} + +void mlx5e_ethtool_get_channels(struct mlx5e_priv *priv, + struct ethtool_channels *ch) +{ + ch->max_combined = priv->profile->max_nch(priv->mdev); + ch->combined_count = priv->channels.params.num_channels; +} + +static void mlx5e_get_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_channels(priv, ch); +} + +int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, + struct ethtool_channels *ch) +{ + unsigned int count = ch->combined_count; + struct mlx5e_channels new_channels = {}; + bool arfs_enabled; + int err = 0; + + if (!count) { + netdev_info(priv->netdev, "%s: combined_count=0 not supported\n", + __func__); + return -EINVAL; + } + + if (priv->channels.params.num_channels == count) + return 0; + + mutex_lock(&priv->state_lock); + + new_channels.params = priv->channels.params; + new_channels.params.num_channels = count; + if (!netif_is_rxfh_configured(priv->netdev)) + mlx5e_build_default_indir_rqt(new_channels.params.indirection_rqt, + MLX5E_INDIR_RQT_SIZE, count); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + goto out; + } + + /* Create fresh channels with new parameters */ + err = mlx5e_open_channels(priv, &new_channels); + if (err) + goto out; + + arfs_enabled = priv->netdev->features & NETIF_F_NTUPLE; + if (arfs_enabled) + mlx5e_arfs_disable(priv); + + /* Switch to new channels, set new parameters and close old ones */ + mlx5e_switch_priv_channels(priv, &new_channels, NULL); + + if (arfs_enabled) { + err = mlx5e_arfs_enable(priv); + if (err) + 
netdev_err(priv->netdev, "%s: mlx5e_arfs_enable failed: %d\n", + __func__, err); + } + +out: + mutex_unlock(&priv->state_lock); + + return err; +} + +static int mlx5e_set_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_set_channels(priv, ch); +} + +int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv, + struct ethtool_coalesce *coal) +{ + if (!MLX5_CAP_GEN(priv->mdev, cq_moderation)) + return -EOPNOTSUPP; + + coal->rx_coalesce_usecs = priv->channels.params.rx_cq_moderation.usec; + coal->rx_max_coalesced_frames = priv->channels.params.rx_cq_moderation.pkts; + coal->tx_coalesce_usecs = priv->channels.params.tx_cq_moderation.usec; + coal->tx_max_coalesced_frames = priv->channels.params.tx_cq_moderation.pkts; + coal->use_adaptive_rx_coalesce = priv->channels.params.rx_dim_enabled; + + return 0; +} + +static int mlx5e_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *coal) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_get_coalesce(priv, coal); +} + +#define MLX5E_MAX_COAL_TIME MLX5_MAX_CQ_PERIOD +#define MLX5E_MAX_COAL_FRAMES MLX5_MAX_CQ_COUNT + +static void +mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int tc; + int i; + + for (i = 0; i < priv->channels.num; ++i) { + struct mlx5e_channel *c = priv->channels.c[i]; + + for (tc = 0; tc < c->num_tc; tc++) { + mlx5_core_modify_cq_moderation(mdev, + &c->sq[tc].cq.mcq, + coal->tx_coalesce_usecs, + coal->tx_max_coalesced_frames); + } + + mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq, + coal->rx_coalesce_usecs, + coal->rx_max_coalesced_frames); + } +} + +int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, + struct ethtool_coalesce *coal) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_channels new_channels = {}; + int err = 0; + bool reset; + + if (!MLX5_CAP_GEN(mdev, cq_moderation)) + return -EOPNOTSUPP; + + if (coal->tx_coalesce_usecs > MLX5E_MAX_COAL_TIME || + coal->rx_coalesce_usecs > MLX5E_MAX_COAL_TIME) { + netdev_info(priv->netdev, "%s: maximum coalesce time supported is %lu usecs\n", + __func__, MLX5E_MAX_COAL_TIME); + return -ERANGE; + } + + if (coal->tx_max_coalesced_frames > MLX5E_MAX_COAL_FRAMES || + coal->rx_max_coalesced_frames > MLX5E_MAX_COAL_FRAMES) { + netdev_info(priv->netdev, "%s: maximum coalesced frames supported is %lu\n", + __func__, MLX5E_MAX_COAL_FRAMES); + return -ERANGE; + } + + mutex_lock(&priv->state_lock); + new_channels.params = priv->channels.params; + + new_channels.params.tx_cq_moderation.usec = coal->tx_coalesce_usecs; + new_channels.params.tx_cq_moderation.pkts = coal->tx_max_coalesced_frames; + new_channels.params.rx_cq_moderation.usec = coal->rx_coalesce_usecs; + new_channels.params.rx_cq_moderation.pkts = coal->rx_max_coalesced_frames; + new_channels.params.rx_dim_enabled = !!coal->use_adaptive_rx_coalesce; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + goto out; + } + /* we are opened */ + + reset = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled; + if (!reset) { + mlx5e_set_priv_channels_coalesce(priv, coal); + priv->channels.params = new_channels.params; + goto out; + } + + /* open fresh channels with new coal parameters */ + err = mlx5e_open_channels(priv, &new_channels); + if (err) + goto out; + + mlx5e_switch_priv_channels(priv, &new_channels, NULL); + +out: + 
mutex_unlock(&priv->state_lock); + return err; +} + +static int mlx5e_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *coal) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_set_coalesce(priv, coal); +} + +static void ptys2ethtool_supported_link(unsigned long *supported_modes, + u32 eth_proto_cap) +{ + unsigned long proto_cap = eth_proto_cap; + int proto; + + for_each_set_bit(proto, &proto_cap, MLX5E_LINK_MODES_NUMBER) + bitmap_or(supported_modes, supported_modes, + ptys2ethtool_table[proto].supported, + __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static void ptys2ethtool_adver_link(unsigned long *advertising_modes, + u32 eth_proto_cap) +{ + unsigned long proto_cap = eth_proto_cap; + int proto; + + for_each_set_bit(proto, &proto_cap, MLX5E_LINK_MODES_NUMBER) + bitmap_or(advertising_modes, advertising_modes, + ptys2ethtool_table[proto].advertised, + __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static void ptys2ethtool_supported_advertised_port(struct ethtool_link_ksettings *link_ksettings, + u32 eth_proto_cap, + u8 connector_type) +{ + if (!connector_type || connector_type >= MLX5E_CONNECTOR_TYPE_NUMBER) { + if (eth_proto_cap & (MLX5E_PROT_MASK(MLX5E_10GBASE_CR) + | MLX5E_PROT_MASK(MLX5E_10GBASE_SR) + | MLX5E_PROT_MASK(MLX5E_40GBASE_CR4) + | MLX5E_PROT_MASK(MLX5E_40GBASE_SR4) + | MLX5E_PROT_MASK(MLX5E_100GBASE_SR4) + | MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) { + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, + FIBRE); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, + FIBRE); + } + + if (eth_proto_cap & (MLX5E_PROT_MASK(MLX5E_100GBASE_KR4) + | MLX5E_PROT_MASK(MLX5E_40GBASE_KR4) + | MLX5E_PROT_MASK(MLX5E_10GBASE_KR) + | MLX5E_PROT_MASK(MLX5E_10GBASE_KX4) + | MLX5E_PROT_MASK(MLX5E_1000BASE_KX))) { + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, + Backplane); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, + Backplane); + } + return; + } + + switch (connector_type) { + case MLX5E_PORT_TP: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, TP); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, TP); + break; + case MLX5E_PORT_AUI: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, AUI); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, AUI); + break; + case MLX5E_PORT_BNC: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, BNC); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, BNC); + break; + case MLX5E_PORT_MII: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, MII); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, MII); + break; + case MLX5E_PORT_FIBRE: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, FIBRE); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, FIBRE); + break; + case MLX5E_PORT_DA: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, Backplane); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, Backplane); + break; + case MLX5E_PORT_NONE: + case MLX5E_PORT_OTHER: + default: + break; + } +} + +int mlx5e_get_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) +{ + u32 max_speed = 0; + u32 proto_cap; + int err; + int i; + + err = mlx5_query_port_proto_cap(mdev, &proto_cap, MLX5_PTYS_EN); + if (err) + return err; + + for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) + if (proto_cap & MLX5E_PROT_MASK(i)) + max_speed = max(max_speed, 
ptys2ethtool_table[i].speed); + + *speed = max_speed; + return 0; +} + +static void get_speed_duplex(struct net_device *netdev, + u32 eth_proto_oper, + struct ethtool_link_ksettings *link_ksettings) +{ + int i; + u32 speed = SPEED_UNKNOWN; + u8 duplex = DUPLEX_UNKNOWN; + + if (!netif_carrier_ok(netdev)) + goto out; + + for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) { + if (eth_proto_oper & MLX5E_PROT_MASK(i)) { + speed = ptys2ethtool_table[i].speed; + duplex = DUPLEX_FULL; + break; + } + } +out: + link_ksettings->base.speed = speed; + link_ksettings->base.duplex = duplex; +} + +static void get_supported(u32 eth_proto_cap, + struct ethtool_link_ksettings *link_ksettings) +{ + unsigned long *supported = link_ksettings->link_modes.supported; + + ptys2ethtool_supported_link(supported, eth_proto_cap); + ethtool_link_ksettings_add_link_mode(link_ksettings, supported, Pause); +} + +static void get_advertising(u32 eth_proto_cap, u8 tx_pause, + u8 rx_pause, + struct ethtool_link_ksettings *link_ksettings) +{ + unsigned long *advertising = link_ksettings->link_modes.advertising; + + ptys2ethtool_adver_link(advertising, eth_proto_cap); + if (rx_pause) + ethtool_link_ksettings_add_link_mode(link_ksettings, advertising, Pause); + if (tx_pause ^ rx_pause) + ethtool_link_ksettings_add_link_mode(link_ksettings, advertising, Asym_Pause); +} + +static int ptys2connector_type[MLX5E_CONNECTOR_TYPE_NUMBER] = { + [MLX5E_PORT_UNKNOWN] = PORT_OTHER, + [MLX5E_PORT_NONE] = PORT_NONE, + [MLX5E_PORT_TP] = PORT_TP, + [MLX5E_PORT_AUI] = PORT_AUI, + [MLX5E_PORT_BNC] = PORT_BNC, + [MLX5E_PORT_MII] = PORT_MII, + [MLX5E_PORT_FIBRE] = PORT_FIBRE, + [MLX5E_PORT_DA] = PORT_DA, + [MLX5E_PORT_OTHER] = PORT_OTHER, + }; + +static u8 get_connector_port(u32 eth_proto, u8 connector_type) +{ + if (connector_type && connector_type < MLX5E_CONNECTOR_TYPE_NUMBER) + return ptys2connector_type[connector_type]; + + if (eth_proto & + (MLX5E_PROT_MASK(MLX5E_10GBASE_SR) | + MLX5E_PROT_MASK(MLX5E_40GBASE_SR4) | + MLX5E_PROT_MASK(MLX5E_100GBASE_SR4) | + MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) { + return PORT_FIBRE; + } + + if (eth_proto & + (MLX5E_PROT_MASK(MLX5E_40GBASE_CR4) | + MLX5E_PROT_MASK(MLX5E_10GBASE_CR) | + MLX5E_PROT_MASK(MLX5E_100GBASE_CR4))) { + return PORT_DA; + } + + if (eth_proto & + (MLX5E_PROT_MASK(MLX5E_10GBASE_KX4) | + MLX5E_PROT_MASK(MLX5E_10GBASE_KR) | + MLX5E_PROT_MASK(MLX5E_40GBASE_KR4) | + MLX5E_PROT_MASK(MLX5E_100GBASE_KR4))) { + return PORT_NONE; + } + + return PORT_OTHER; +} + +static void get_lp_advertising(u32 eth_proto_lp, + struct ethtool_link_ksettings *link_ksettings) +{ + unsigned long *lp_advertising = link_ksettings->link_modes.lp_advertising; + + ptys2ethtool_adver_link(lp_advertising, eth_proto_lp); +} + +static int mlx5e_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0}; + u32 rx_pause = 0; + u32 tx_pause = 0; + u32 eth_proto_cap; + u32 eth_proto_admin; + u32 eth_proto_lp; + u32 eth_proto_oper; + u8 an_disable_admin; + u8 an_status; + u8 connector_type; + int err; + + err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); + if (err) { + netdev_err(netdev, "%s: query port ptys failed: %d\n", + __func__, err); + goto err_query_ptys; + } + + eth_proto_cap = MLX5_GET(ptys_reg, out, eth_proto_capability); + eth_proto_admin = MLX5_GET(ptys_reg, out, eth_proto_admin); + eth_proto_oper = MLX5_GET(ptys_reg, out, 
eth_proto_oper); + eth_proto_lp = MLX5_GET(ptys_reg, out, eth_proto_lp_advertise); + an_disable_admin = MLX5_GET(ptys_reg, out, an_disable_admin); + an_status = MLX5_GET(ptys_reg, out, an_status); + connector_type = MLX5_GET(ptys_reg, out, connector_type); + + mlx5_query_port_pause(mdev, &rx_pause, &tx_pause); + + ethtool_link_ksettings_zero_link_mode(link_ksettings, supported); + ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising); + + get_supported(eth_proto_cap, link_ksettings); + get_advertising(eth_proto_admin, tx_pause, rx_pause, link_ksettings); + get_speed_duplex(netdev, eth_proto_oper, link_ksettings); + + eth_proto_oper = eth_proto_oper ? eth_proto_oper : eth_proto_cap; + + link_ksettings->base.port = get_connector_port(eth_proto_oper, + connector_type); + ptys2ethtool_supported_advertised_port(link_ksettings, eth_proto_admin, + connector_type); + get_lp_advertising(eth_proto_lp, link_ksettings); + + if (an_status == MLX5_AN_COMPLETE) + ethtool_link_ksettings_add_link_mode(link_ksettings, + lp_advertising, Autoneg); + + link_ksettings->base.autoneg = an_disable_admin ? AUTONEG_DISABLE : + AUTONEG_ENABLE; + ethtool_link_ksettings_add_link_mode(link_ksettings, supported, + Autoneg); + if (!an_disable_admin) + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, Autoneg); + +err_query_ptys: + return err; +} + +static u32 mlx5e_ethtool2ptys_adver_link(const unsigned long *link_modes) +{ + u32 i, ptys_modes = 0; + + for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) { + if (bitmap_intersects(ptys2ethtool_table[i].advertised, + link_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS)) + ptys_modes |= MLX5E_PROT_MASK(i); + } + + return ptys_modes; +} + +static u32 mlx5e_ethtool2ptys_speed_link(u32 speed) +{ + u32 i, speed_links = 0; + + for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) { + if (ptys2ethtool_table[i].speed == speed) + speed_links |= MLX5E_PROT_MASK(i); + } + + return speed_links; +} + +static int mlx5e_set_link_ksettings(struct net_device *netdev, + const struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u32 eth_proto_cap, eth_proto_admin; + bool an_changes = false; + u8 an_disable_admin; + u8 an_disable_cap; + bool an_disable; + u32 link_modes; + u8 an_status; + u32 speed; + int err; + + speed = link_ksettings->base.speed; + + link_modes = link_ksettings->base.autoneg == AUTONEG_ENABLE ? 
+ mlx5e_ethtool2ptys_adver_link(link_ksettings->link_modes.advertising) : + mlx5e_ethtool2ptys_speed_link(speed); + + err = mlx5_query_port_proto_cap(mdev, &eth_proto_cap, MLX5_PTYS_EN); + if (err) { + netdev_err(netdev, "%s: query port eth proto cap failed: %d\n", + __func__, err); + goto out; + } + + link_modes = link_modes & eth_proto_cap; + if (!link_modes) { + netdev_err(netdev, "%s: Not supported link mode(s) requested", + __func__); + err = -EINVAL; + goto out; + } + + err = mlx5_query_port_proto_admin(mdev, &eth_proto_admin, MLX5_PTYS_EN); + if (err) { + netdev_err(netdev, "%s: query port eth proto admin failed: %d\n", + __func__, err); + goto out; + } + + mlx5_query_port_autoneg(mdev, MLX5_PTYS_EN, &an_status, + &an_disable_cap, &an_disable_admin); + + an_disable = link_ksettings->base.autoneg == AUTONEG_DISABLE; + an_changes = ((!an_disable && an_disable_admin) || + (an_disable && !an_disable_admin)); + + if (!an_changes && link_modes == eth_proto_admin) + goto out; + + mlx5_set_port_ptys(mdev, an_disable, link_modes, MLX5_PTYS_EN); + mlx5_toggle_port_link(mdev); + +out: + return err; +} + +static u32 mlx5e_get_rxfh_key_size(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return sizeof(priv->channels.params.toeplitz_hash_key); +} + +static u32 mlx5e_get_rxfh_indir_size(struct net_device *netdev) +{ + return MLX5E_INDIR_RQT_SIZE; +} + +static int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + if (indir) + memcpy(indir, priv->channels.params.indirection_rqt, + sizeof(priv->channels.params.indirection_rqt)); + + if (key) + memcpy(key, priv->channels.params.toeplitz_hash_key, + sizeof(priv->channels.params.toeplitz_hash_key)); + + if (hfunc) + *hfunc = priv->channels.params.rss_hfunc; + + return 0; +} + +static void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen) +{ + void *tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx); + struct mlx5_core_dev *mdev = priv->mdev; + int ctxlen = MLX5_ST_SZ_BYTES(tirc); + int tt; + + MLX5_SET(modify_tir_in, in, bitmask.hash, 1); + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { + memset(tirc, 0, ctxlen); + mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, false); + mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in, inlen); + } + + if (!mlx5e_tunnel_inner_ft_supported(priv->mdev)) + return; + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { + memset(tirc, 0, ctxlen); + mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, true); + mlx5_core_modify_tir(mdev, priv->inner_indir_tir[tt].tirn, in, inlen); + } +} + +static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, + const u8 *key, const u8 hfunc) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int inlen = MLX5_ST_SZ_BYTES(modify_tir_in); + bool hash_changed = false; + void *in; + + if ((hfunc != ETH_RSS_HASH_NO_CHANGE) && + (hfunc != ETH_RSS_HASH_XOR) && + (hfunc != ETH_RSS_HASH_TOP)) + return -EINVAL; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mutex_lock(&priv->state_lock); + + if (hfunc != ETH_RSS_HASH_NO_CHANGE && + hfunc != priv->channels.params.rss_hfunc) { + priv->channels.params.rss_hfunc = hfunc; + hash_changed = true; + } + + if (indir) { + memcpy(priv->channels.params.indirection_rqt, indir, + sizeof(priv->channels.params.indirection_rqt)); + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + u32 rqtn = priv->indir_rqt.rqtn; + struct mlx5e_redirect_rqt_param rrp = { + .is_rss = true,
+ { + .rss = { + .hfunc = priv->channels.params.rss_hfunc, + .channels = &priv->channels, + }, + }, + }; + + mlx5e_redirect_rqt(priv, rqtn, MLX5E_INDIR_RQT_SIZE, rrp); + } + } + + if (key) { + memcpy(priv->channels.params.toeplitz_hash_key, key, + sizeof(priv->channels.params.toeplitz_hash_key)); + hash_changed = hash_changed || + priv->channels.params.rss_hfunc == ETH_RSS_HASH_TOP; + } + + if (hash_changed) + mlx5e_modify_tirs_hash(priv, in, inlen); + + mutex_unlock(&priv->state_lock); + + kvfree(in); + + return 0; +} + +static int mlx5e_get_rxnfc(struct net_device *netdev, + struct ethtool_rxnfc *info, u32 *rule_locs) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err = 0; + + switch (info->cmd) { + case ETHTOOL_GRXRINGS: + info->data = priv->channels.params.num_channels; + break; + case ETHTOOL_GRXCLSRLCNT: + info->rule_cnt = priv->fs.ethtool.tot_num_rules; + break; + case ETHTOOL_GRXCLSRULE: + err = mlx5e_ethtool_get_flow(priv, info, info->fs.location); + break; + case ETHTOOL_GRXCLSRLALL: + err = mlx5e_ethtool_get_all_flows(priv, info, rule_locs); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +#define MLX5E_PFC_PREVEN_AUTO_TOUT_MSEC 100 +#define MLX5E_PFC_PREVEN_TOUT_MAX_MSEC 8000 +#define MLX5E_PFC_PREVEN_MINOR_PRECENT 85 +#define MLX5E_PFC_PREVEN_TOUT_MIN_MSEC 80 +#define MLX5E_DEVICE_STALL_MINOR_WATERMARK(critical_tout) \ + max_t(u16, MLX5E_PFC_PREVEN_TOUT_MIN_MSEC, \ + (critical_tout * MLX5E_PFC_PREVEN_MINOR_PRECENT) / 100) + +static int mlx5e_get_pfc_prevention_tout(struct net_device *netdev, + u16 *pfc_prevention_tout) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, pfcc_mask) || + !MLX5_CAP_DEBUG((priv)->mdev, stall_detect)) + return -EOPNOTSUPP; + + return mlx5_query_port_stall_watermark(mdev, pfc_prevention_tout, NULL); +} + +static int mlx5e_set_pfc_prevention_tout(struct net_device *netdev, + u16 pfc_preven) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u16 critical_tout; + u16 minor; + + if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, pfcc_mask) || + !MLX5_CAP_DEBUG((priv)->mdev, stall_detect)) + return -EOPNOTSUPP; + + critical_tout = (pfc_preven == PFC_STORM_PREVENTION_AUTO) ? 
+ MLX5E_PFC_PREVEN_AUTO_TOUT_MSEC : + pfc_preven; + + if (critical_tout != PFC_STORM_PREVENTION_DISABLE && + (critical_tout > MLX5E_PFC_PREVEN_TOUT_MAX_MSEC || + critical_tout < MLX5E_PFC_PREVEN_TOUT_MIN_MSEC)) { + netdev_info(netdev, "%s: pfc prevention tout not in range (%d-%d)\n", + __func__, MLX5E_PFC_PREVEN_TOUT_MIN_MSEC, + MLX5E_PFC_PREVEN_TOUT_MAX_MSEC); + return -EINVAL; + } + + minor = MLX5E_DEVICE_STALL_MINOR_WATERMARK(critical_tout); + return mlx5_set_port_stall_watermark(mdev, critical_tout, + minor); +} + +static int mlx5e_get_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + void *data) +{ + int err; + + switch (tuna->id) { + case ETHTOOL_PFC_PREVENTION_TOUT: + err = mlx5e_get_pfc_prevention_tout(dev, data); + break; + default: + err = -EINVAL; + break; + } + + return err; +} + +static int mlx5e_set_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int err; + + mutex_lock(&priv->state_lock); + + switch (tuna->id) { + case ETHTOOL_PFC_PREVENTION_TOUT: + err = mlx5e_set_pfc_prevention_tout(dev, *(u16 *)data); + break; + default: + err = -EINVAL; + break; + } + + mutex_unlock(&priv->state_lock); + return err; +} + +static void mlx5e_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pauseparam) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + err = mlx5_query_port_pause(mdev, &pauseparam->rx_pause, + &pauseparam->tx_pause); + if (err) { + netdev_err(netdev, "%s: mlx5_query_port_pause failed:0x%x\n", + __func__, err); + } +} + +static int mlx5e_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pauseparam) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + if (pauseparam->autoneg) + return -EINVAL; + + err = mlx5_set_port_pause(mdev, + pauseparam->rx_pause ? 1 : 0, + pauseparam->tx_pause ? 1 : 0); + if (err) { + netdev_err(netdev, "%s: mlx5_set_port_pause failed:0x%x\n", + __func__, err); + } + + return err; +} + +int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, + struct ethtool_ts_info *info) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int ret; + + ret = ethtool_op_get_ts_info(priv->netdev, info); + if (ret) + return ret; + + info->phc_index = mdev->clock.ptp ? 
+ ptp_clock_index(mdev->clock.ptp) : -1; + + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + return 0; + + info->so_timestamping |= SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + info->tx_types = BIT(HWTSTAMP_TX_OFF) | + BIT(HWTSTAMP_TX_ON); + + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | + BIT(HWTSTAMP_FILTER_ALL); + + return 0; +} + +static int mlx5e_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *info) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_get_ts_info(priv, info); +} + +static __u32 mlx5e_get_wol_supported(struct mlx5_core_dev *mdev) +{ + __u32 ret = 0; + + if (MLX5_CAP_GEN(mdev, wol_g)) + ret |= WAKE_MAGIC; + + if (MLX5_CAP_GEN(mdev, wol_s)) + ret |= WAKE_MAGICSECURE; + + if (MLX5_CAP_GEN(mdev, wol_a)) + ret |= WAKE_ARP; + + if (MLX5_CAP_GEN(mdev, wol_b)) + ret |= WAKE_BCAST; + + if (MLX5_CAP_GEN(mdev, wol_m)) + ret |= WAKE_MCAST; + + if (MLX5_CAP_GEN(mdev, wol_u)) + ret |= WAKE_UCAST; + + if (MLX5_CAP_GEN(mdev, wol_p)) + ret |= WAKE_PHY; + + return ret; +} + +static __u32 mlx5e_refomrat_wol_mode_mlx5_to_linux(u8 mode) +{ + __u32 ret = 0; + + if (mode & MLX5_WOL_MAGIC) + ret |= WAKE_MAGIC; + + if (mode & MLX5_WOL_SECURED_MAGIC) + ret |= WAKE_MAGICSECURE; + + if (mode & MLX5_WOL_ARP) + ret |= WAKE_ARP; + + if (mode & MLX5_WOL_BROADCAST) + ret |= WAKE_BCAST; + + if (mode & MLX5_WOL_MULTICAST) + ret |= WAKE_MCAST; + + if (mode & MLX5_WOL_UNICAST) + ret |= WAKE_UCAST; + + if (mode & MLX5_WOL_PHY_ACTIVITY) + ret |= WAKE_PHY; + + return ret; +} + +static u8 mlx5e_refomrat_wol_mode_linux_to_mlx5(__u32 mode) +{ + u8 ret = 0; + + if (mode & WAKE_MAGIC) + ret |= MLX5_WOL_MAGIC; + + if (mode & WAKE_MAGICSECURE) + ret |= MLX5_WOL_SECURED_MAGIC; + + if (mode & WAKE_ARP) + ret |= MLX5_WOL_ARP; + + if (mode & WAKE_BCAST) + ret |= MLX5_WOL_BROADCAST; + + if (mode & WAKE_MCAST) + ret |= MLX5_WOL_MULTICAST; + + if (mode & WAKE_UCAST) + ret |= MLX5_WOL_UNICAST; + + if (mode & WAKE_PHY) + ret |= MLX5_WOL_PHY_ACTIVITY; + + return ret; +} + +static void mlx5e_get_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 mlx5_wol_mode; + int err; + + memset(wol, 0, sizeof(*wol)); + + wol->supported = mlx5e_get_wol_supported(mdev); + if (!wol->supported) + return; + + err = mlx5_query_port_wol(mdev, &mlx5_wol_mode); + if (err) + return; + + wol->wolopts = mlx5e_refomrat_wol_mode_mlx5_to_linux(mlx5_wol_mode); +} + +static int mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + __u32 wol_supported = mlx5e_get_wol_supported(mdev); + u32 mlx5_wol_mode; + + if (!wol_supported) + return -EOPNOTSUPP; + + if (wol->wolopts & ~wol_supported) + return -EINVAL; + + mlx5_wol_mode = mlx5e_refomrat_wol_mode_linux_to_mlx5(wol->wolopts); + + return mlx5_set_port_wol(mdev, mlx5_wol_mode); +} + +static u32 mlx5e_get_msglevel(struct net_device *dev) +{ + return ((struct mlx5e_priv *)netdev_priv(dev))->msglevel; +} + +static void mlx5e_set_msglevel(struct net_device *dev, u32 val) +{ + ((struct mlx5e_priv *)netdev_priv(dev))->msglevel = val; +} + +static int mlx5e_set_phys_id(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + u16 beacon_duration; + + if (!MLX5_CAP_GEN(mdev, beacon_led)) + return 
-EOPNOTSUPP; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + beacon_duration = MLX5_BEACON_DURATION_INF; + break; + case ETHTOOL_ID_INACTIVE: + beacon_duration = MLX5_BEACON_DURATION_OFF; + break; + default: + return -EOPNOTSUPP; + } + + return mlx5_set_port_beacon(mdev, beacon_duration); +} + +static int mlx5e_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *dev = priv->mdev; + int size_read = 0; + u8 data[4]; + + size_read = mlx5_query_module_eeprom(dev, 0, 2, data); + if (size_read < 2) + return -EIO; + + /* data[0] = identifier byte */ + switch (data[0]) { + case MLX5_MODULE_ID_QSFP: + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; + break; + case MLX5_MODULE_ID_QSFP_PLUS: + case MLX5_MODULE_ID_QSFP28: + /* data[1] = revision id */ + if (data[0] == MLX5_MODULE_ID_QSFP28 || data[1] >= 0x3) { + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; + } + break; + case MLX5_MODULE_ID_SFP: + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + break; + default: + netdev_err(priv->netdev, "%s: cable type not recognized:0x%x\n", + __func__, data[0]); + return -EINVAL; + } + + return 0; +} + +static int mlx5e_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, + u8 *data) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + int offset = ee->offset; + int size_read; + int i = 0; + + if (!ee->len) + return -EINVAL; + + memset(data, 0, ee->len); + + while (i < ee->len) { + size_read = mlx5_query_module_eeprom(mdev, offset, ee->len - i, + data + i); + + if (!size_read) + /* Done reading */ + return 0; + + if (size_read < 0) { + netdev_err(priv->netdev, "%s: mlx5_query_eeprom failed:0x%x\n", + __func__, size_read); + return 0; + } + + i += size_read; + offset += size_read; + } + + return 0; +} + +typedef int (*mlx5e_pflag_handler)(struct net_device *netdev, bool enable); + +static int set_pflag_cqe_based_moder(struct net_device *netdev, bool enable, + bool is_rx_cq) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_channels new_channels = {}; + bool mode_changed; + u8 cq_period_mode, current_cq_period_mode; + int err = 0; + + cq_period_mode = enable ? + MLX5_CQ_PERIOD_MODE_START_FROM_CQE : + MLX5_CQ_PERIOD_MODE_START_FROM_EQE; + current_cq_period_mode = is_rx_cq ? 
+ priv->channels.params.rx_cq_moderation.cq_period_mode : + priv->channels.params.tx_cq_moderation.cq_period_mode; + mode_changed = cq_period_mode != current_cq_period_mode; + + if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE && + !MLX5_CAP_GEN(mdev, cq_period_start_from_cqe)) + return -EOPNOTSUPP; + + if (!mode_changed) + return 0; + + new_channels.params = priv->channels.params; + if (is_rx_cq) + mlx5e_set_rx_cq_mode_params(&new_channels.params, cq_period_mode); + else + mlx5e_set_tx_cq_mode_params(&new_channels.params, cq_period_mode); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + return 0; + } + + err = mlx5e_open_channels(priv, &new_channels); + if (err) + return err; + + mlx5e_switch_priv_channels(priv, &new_channels, NULL); + return 0; +} + +static int set_pflag_tx_cqe_based_moder(struct net_device *netdev, bool enable) +{ + return set_pflag_cqe_based_moder(netdev, enable, false); +} + +static int set_pflag_rx_cqe_based_moder(struct net_device *netdev, bool enable) +{ + return set_pflag_cqe_based_moder(netdev, enable, true); +} + +int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val) +{ + bool curr_val = MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS); + struct mlx5e_channels new_channels = {}; + int err = 0; + + if (!MLX5_CAP_GEN(priv->mdev, cqe_compression)) + return new_val ? -EOPNOTSUPP : 0; + + if (curr_val == new_val) + return 0; + + new_channels.params = priv->channels.params; + MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS, new_val); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + return 0; + } + + err = mlx5e_open_channels(priv, &new_channels); + if (err) + return err; + + mlx5e_switch_priv_channels(priv, &new_channels, NULL); + mlx5e_dbg(DRV, priv, "MLX5E: RxCqeCmprss was turned %s\n", + MLX5E_GET_PFLAG(&priv->channels.params, + MLX5E_PFLAG_RX_CQE_COMPRESS) ? 
"ON" : "OFF"); + + return 0; +} + +static int set_pflag_rx_cqe_compress(struct net_device *netdev, + bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (!MLX5_CAP_GEN(mdev, cqe_compression)) + return -EOPNOTSUPP; + + if (enable && priv->tstamp.rx_filter != HWTSTAMP_FILTER_NONE) { + netdev_err(netdev, "Can't enable cqe compression while timestamping is enabled.\n"); + return -EINVAL; + } + + mlx5e_modify_rx_cqe_compression_locked(priv, enable); + priv->channels.params.rx_cqe_compress_def = enable; + + return 0; +} + +static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_channels new_channels = {}; + int err; + + if (enable) { + if (!mlx5e_check_fragmented_striding_rq_cap(mdev)) + return -EOPNOTSUPP; + if (!mlx5e_striding_rq_possible(mdev, &priv->channels.params)) + return -EINVAL; + } + + new_channels.params = priv->channels.params; + + MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_STRIDING_RQ, enable); + mlx5e_set_rq_type(mdev, &new_channels.params); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + return 0; + } + + err = mlx5e_open_channels(priv, &new_channels); + if (err) + return err; + + mlx5e_switch_priv_channels(priv, &new_channels, NULL); + return 0; +} + +static int mlx5e_handle_pflag(struct net_device *netdev, + u32 wanted_flags, + enum mlx5e_priv_flag flag, + mlx5e_pflag_handler pflag_handler) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + bool enable = !!(wanted_flags & flag); + u32 changes = wanted_flags ^ priv->channels.params.pflags; + int err; + + if (!(changes & flag)) + return 0; + + err = pflag_handler(netdev, enable); + if (err) { + netdev_err(netdev, "%s private flag 0x%x failed err %d\n", + enable ? 
"Enable" : "Disable", flag, err); + return err; + } + + MLX5E_SET_PFLAG(&priv->channels.params, flag, enable); + return 0; +} + +static int mlx5e_set_priv_flags(struct net_device *netdev, u32 pflags) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + mutex_lock(&priv->state_lock); + err = mlx5e_handle_pflag(netdev, pflags, + MLX5E_PFLAG_RX_CQE_BASED_MODER, + set_pflag_rx_cqe_based_moder); + if (err) + goto out; + + err = mlx5e_handle_pflag(netdev, pflags, + MLX5E_PFLAG_TX_CQE_BASED_MODER, + set_pflag_tx_cqe_based_moder); + if (err) + goto out; + + err = mlx5e_handle_pflag(netdev, pflags, + MLX5E_PFLAG_RX_CQE_COMPRESS, + set_pflag_rx_cqe_compress); + if (err) + goto out; + + err = mlx5e_handle_pflag(netdev, pflags, + MLX5E_PFLAG_RX_STRIDING_RQ, + set_pflag_rx_striding_rq); + +out: + mutex_unlock(&priv->state_lock); + return err; +} + +static u32 mlx5e_get_priv_flags(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return priv->channels.params.pflags; +} + +static int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) +{ + int err = 0; + struct mlx5e_priv *priv = netdev_priv(dev); + + switch (cmd->cmd) { + case ETHTOOL_SRXCLSRLINS: + err = mlx5e_ethtool_flow_replace(priv, &cmd->fs); + break; + case ETHTOOL_SRXCLSRLDEL: + err = mlx5e_ethtool_flow_remove(priv, cmd->fs.location); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, + struct ethtool_flash *flash) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct net_device *dev = priv->netdev; + const struct firmware *fw; + int err; + + if (flash->region != ETHTOOL_FLASH_ALL_REGIONS) + return -EOPNOTSUPP; + + err = request_firmware_direct(&fw, flash->data, &dev->dev); + if (err) + return err; + + dev_hold(dev); + rtnl_unlock(); + + err = mlx5_firmware_flash(mdev, fw); + release_firmware(fw); + + rtnl_lock(); + dev_put(dev); + return err; +} + +static int mlx5e_flash_device(struct net_device *dev, + struct ethtool_flash *flash) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_flash_device(priv, flash); +} + +const struct ethtool_ops mlx5e_ethtool_ops = { + .get_drvinfo = mlx5e_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_strings = mlx5e_get_strings, + .get_sset_count = mlx5e_get_sset_count, + .get_ethtool_stats = mlx5e_get_ethtool_stats, + .get_ringparam = mlx5e_get_ringparam, + .set_ringparam = mlx5e_set_ringparam, + .get_channels = mlx5e_get_channels, + .set_channels = mlx5e_set_channels, + .get_coalesce = mlx5e_get_coalesce, + .set_coalesce = mlx5e_set_coalesce, + .get_link_ksettings = mlx5e_get_link_ksettings, + .set_link_ksettings = mlx5e_set_link_ksettings, + .get_rxfh_key_size = mlx5e_get_rxfh_key_size, + .get_rxfh_indir_size = mlx5e_get_rxfh_indir_size, + .get_rxfh = mlx5e_get_rxfh, + .set_rxfh = mlx5e_set_rxfh, + .get_rxnfc = mlx5e_get_rxnfc, + .set_rxnfc = mlx5e_set_rxnfc, + .flash_device = mlx5e_flash_device, + .get_tunable = mlx5e_get_tunable, + .set_tunable = mlx5e_set_tunable, + .get_pauseparam = mlx5e_get_pauseparam, + .set_pauseparam = mlx5e_set_pauseparam, + .get_ts_info = mlx5e_get_ts_info, + .set_phys_id = mlx5e_set_phys_id, + .get_wol = mlx5e_get_wol, + .set_wol = mlx5e_set_wol, + .get_module_info = mlx5e_get_module_info, + .get_module_eeprom = mlx5e_get_module_eeprom, + .get_priv_flags = mlx5e_get_priv_flags, + .set_priv_flags = mlx5e_set_priv_flags, + .self_test = mlx5e_self_test, + .get_msglevel = mlx5e_get_msglevel, + .set_msglevel = 
mlx5e_set_msglevel, + +}; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c new file mode 100644 index 0000000..f64dda2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -0,0 +1,1568 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "en.h" +#include "lib/mpfs.h" + +static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv, + struct mlx5e_l2_rule *ai, int type); +static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv, + struct mlx5e_l2_rule *ai); + +enum { + MLX5E_FULLMATCH = 0, + MLX5E_ALLMULTI = 1, + MLX5E_PROMISC = 2, +}; + +enum { + MLX5E_UC = 0, + MLX5E_MC_IPV4 = 1, + MLX5E_MC_IPV6 = 2, + MLX5E_MC_OTHER = 3, +}; + +enum { + MLX5E_ACTION_NONE = 0, + MLX5E_ACTION_ADD = 1, + MLX5E_ACTION_DEL = 2, +}; + +struct mlx5e_l2_hash_node { + struct hlist_node hlist; + u8 action; + struct mlx5e_l2_rule ai; + bool mpfs; +}; + +static inline int mlx5e_hash_l2(u8 *addr) +{ + return addr[5]; +} + +static void mlx5e_add_l2_to_hash(struct hlist_head *hash, u8 *addr) +{ + struct mlx5e_l2_hash_node *hn; + int ix = mlx5e_hash_l2(addr); + int found = 0; + + hlist_for_each_entry(hn, &hash[ix], hlist) + if (ether_addr_equal_64bits(hn->ai.addr, addr)) { + found = 1; + break; + } + + if (found) { + hn->action = MLX5E_ACTION_NONE; + return; + } + + hn = kzalloc(sizeof(*hn), GFP_ATOMIC); + if (!hn) + return; + + ether_addr_copy(hn->ai.addr, addr); + hn->action = MLX5E_ACTION_ADD; + + hlist_add_head(&hn->hlist, &hash[ix]); +} + +static void mlx5e_del_l2_from_hash(struct mlx5e_l2_hash_node *hn) +{ + hlist_del(&hn->hlist); + kfree(hn); +} + +static int mlx5e_vport_context_update_vlans(struct mlx5e_priv *priv) +{ + struct net_device *ndev = priv->netdev; + int max_list_size; + int list_size; + u16 *vlans; + int vlan; + int err; + int i; + + list_size = 0; + for_each_set_bit(vlan, priv->fs.vlan.active_cvlans, VLAN_N_VID) + list_size++; + + max_list_size = 1 << MLX5_CAP_GEN(priv->mdev, log_max_vlan_list); + + if (list_size > max_list_size) { + netdev_warn(ndev, + "netdev vlans list size (%d) > (%d) max vport list size, some vlans will be 
dropped\n", + list_size, max_list_size); + list_size = max_list_size; + } + + vlans = kcalloc(list_size, sizeof(*vlans), GFP_KERNEL); + if (!vlans) + return -ENOMEM; + + i = 0; + for_each_set_bit(vlan, priv->fs.vlan.active_cvlans, VLAN_N_VID) { + if (i >= list_size) + break; + vlans[i++] = vlan; + } + + err = mlx5_modify_nic_vport_vlans(priv->mdev, vlans, list_size); + if (err) + netdev_err(ndev, "Failed to modify vport vlans list err(%d)\n", + err); + + kfree(vlans); + return err; +} + +enum mlx5e_vlan_rule_type { + MLX5E_VLAN_RULE_TYPE_UNTAGGED, + MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, + MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, + MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, + MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, +}; + +static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, + u16 vid, struct mlx5_flow_spec *spec) +{ + struct mlx5_flow_table *ft = priv->fs.vlan.ft.t; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_handle **rule_p; + MLX5_DECLARE_FLOW_ACT(flow_act); + int err = 0; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = priv->fs.l2.ft.t; + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + switch (rule_type) { + case MLX5E_VLAN_RULE_TYPE_UNTAGGED: + /* cvlan_tag enabled in match criteria and + * disabled in match value means both S & C tags + * don't exist (untagged of both) + */ + rule_p = &priv->fs.vlan.untagged_rule; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag); + break; + case MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID: + rule_p = &priv->fs.vlan.any_cvlan_rule; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 1); + break; + case MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID: + rule_p = &priv->fs.vlan.any_svlan_rule; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.svlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.svlan_tag, 1); + break; + case MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID: + rule_p = &priv->fs.vlan.active_svlans_rule[vid]; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.svlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.svlan_tag, 1); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.first_vid); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, + vid); + break; + default: /* MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID */ + rule_p = &priv->fs.vlan.active_cvlans_rule[vid]; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 1); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.first_vid); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, + vid); + break; + } + + *rule_p = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + + if (IS_ERR(*rule_p)) { + err = PTR_ERR(*rule_p); + *rule_p = NULL; + netdev_err(priv->netdev, "%s: add rule failed\n", __func__); + } + + return err; +} + +static int mlx5e_add_vlan_rule(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, u16 vid) +{ + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + if (rule_type == MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID) + mlx5e_vport_context_update_vlans(priv); + + err = __mlx5e_add_vlan_rule(priv, rule_type, vid, spec); + + kvfree(spec); + + return err; +} 
+ +static void mlx5e_del_vlan_rule(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, u16 vid) +{ + switch (rule_type) { + case MLX5E_VLAN_RULE_TYPE_UNTAGGED: + if (priv->fs.vlan.untagged_rule) { + mlx5_del_flow_rules(priv->fs.vlan.untagged_rule); + priv->fs.vlan.untagged_rule = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID: + if (priv->fs.vlan.any_cvlan_rule) { + mlx5_del_flow_rules(priv->fs.vlan.any_cvlan_rule); + priv->fs.vlan.any_cvlan_rule = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID: + if (priv->fs.vlan.any_svlan_rule) { + mlx5_del_flow_rules(priv->fs.vlan.any_svlan_rule); + priv->fs.vlan.any_svlan_rule = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID: + if (priv->fs.vlan.active_svlans_rule[vid]) { + mlx5_del_flow_rules(priv->fs.vlan.active_svlans_rule[vid]); + priv->fs.vlan.active_svlans_rule[vid] = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID: + mlx5e_vport_context_update_vlans(priv); + if (priv->fs.vlan.active_cvlans_rule[vid]) { + mlx5_del_flow_rules(priv->fs.vlan.active_cvlans_rule[vid]); + priv->fs.vlan.active_cvlans_rule[vid] = NULL; + } + mlx5e_vport_context_update_vlans(priv); + break; + } +} + +static void mlx5e_del_any_vid_rules(struct mlx5e_priv *priv) +{ + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0); +} + +static int mlx5e_add_any_vid_rules(struct mlx5e_priv *priv) +{ + int err; + + err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); + if (err) + return err; + + return mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0); +} + +void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv) +{ + if (!priv->fs.vlan.cvlan_filter_disabled) + return; + + priv->fs.vlan.cvlan_filter_disabled = false; + if (priv->netdev->flags & IFF_PROMISC) + return; + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); +} + +void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv) +{ + if (priv->fs.vlan.cvlan_filter_disabled) + return; + + priv->fs.vlan.cvlan_filter_disabled = true; + if (priv->netdev->flags & IFF_PROMISC) + return; + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); +} + +static int mlx5e_vlan_rx_add_cvid(struct mlx5e_priv *priv, u16 vid) +{ + int err; + + set_bit(vid, priv->fs.vlan.active_cvlans); + + err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, vid); + if (err) + clear_bit(vid, priv->fs.vlan.active_cvlans); + + return err; +} + +static int mlx5e_vlan_rx_add_svid(struct mlx5e_priv *priv, u16 vid) +{ + struct net_device *netdev = priv->netdev; + int err; + + set_bit(vid, priv->fs.vlan.active_svlans); + + err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, vid); + if (err) { + clear_bit(vid, priv->fs.vlan.active_svlans); + return err; + } + + /* Need to fix some features.. 
*/ + netdev_update_features(netdev); + return err; +} + +int mlx5e_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + if (be16_to_cpu(proto) == ETH_P_8021Q) + return mlx5e_vlan_rx_add_cvid(priv, vid); + else if (be16_to_cpu(proto) == ETH_P_8021AD) + return mlx5e_vlan_rx_add_svid(priv, vid); + + return -EOPNOTSUPP; +} + +int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + if (be16_to_cpu(proto) == ETH_P_8021Q) { + clear_bit(vid, priv->fs.vlan.active_cvlans); + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, vid); + } else if (be16_to_cpu(proto) == ETH_P_8021AD) { + clear_bit(vid, priv->fs.vlan.active_svlans); + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, vid); + netdev_update_features(dev); + } + + return 0; +} + +static void mlx5e_add_vlan_rules(struct mlx5e_priv *priv) +{ + int i; + + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0); + + for_each_set_bit(i, priv->fs.vlan.active_cvlans, VLAN_N_VID) { + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, i); + } + + for_each_set_bit(i, priv->fs.vlan.active_svlans, VLAN_N_VID) + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, i); + + if (priv->fs.vlan.cvlan_filter_disabled && + !(priv->netdev->flags & IFF_PROMISC)) + mlx5e_add_any_vid_rules(priv); +} + +static void mlx5e_del_vlan_rules(struct mlx5e_priv *priv) +{ + int i; + + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0); + + for_each_set_bit(i, priv->fs.vlan.active_cvlans, VLAN_N_VID) { + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, i); + } + + for_each_set_bit(i, priv->fs.vlan.active_svlans, VLAN_N_VID) + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, i); + + if (priv->fs.vlan.cvlan_filter_disabled && + !(priv->netdev->flags & IFF_PROMISC)) + mlx5e_del_any_vid_rules(priv); +} + +#define mlx5e_for_each_hash_node(hn, tmp, hash, i) \ + for (i = 0; i < MLX5E_L2_ADDR_HASH_SIZE; i++) \ + hlist_for_each_entry_safe(hn, tmp, &hash[i], hlist) + +static void mlx5e_execute_l2_action(struct mlx5e_priv *priv, + struct mlx5e_l2_hash_node *hn) +{ + u8 action = hn->action; + u8 mac_addr[ETH_ALEN]; + int l2_err = 0; + + ether_addr_copy(mac_addr, hn->ai.addr); + + switch (action) { + case MLX5E_ACTION_ADD: + mlx5e_add_l2_flow_rule(priv, &hn->ai, MLX5E_FULLMATCH); + if (!is_multicast_ether_addr(mac_addr)) { + l2_err = mlx5_mpfs_add_mac(priv->mdev, mac_addr); + hn->mpfs = !l2_err; + } + hn->action = MLX5E_ACTION_NONE; + break; + + case MLX5E_ACTION_DEL: + if (!is_multicast_ether_addr(mac_addr) && hn->mpfs) + l2_err = mlx5_mpfs_del_mac(priv->mdev, mac_addr); + mlx5e_del_l2_flow_rule(priv, &hn->ai); + mlx5e_del_l2_from_hash(hn); + break; + } + + if (l2_err) + netdev_warn(priv->netdev, "MPFS, failed to %s mac %pM, err(%d)\n", + action == MLX5E_ACTION_ADD ? 
"add" : "del", mac_addr, l2_err); +} + +static void mlx5e_sync_netdev_addr(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; + struct netdev_hw_addr *ha; + + netif_addr_lock_bh(netdev); + + mlx5e_add_l2_to_hash(priv->fs.l2.netdev_uc, + priv->netdev->dev_addr); + + netdev_for_each_uc_addr(ha, netdev) + mlx5e_add_l2_to_hash(priv->fs.l2.netdev_uc, ha->addr); + + netdev_for_each_mc_addr(ha, netdev) + mlx5e_add_l2_to_hash(priv->fs.l2.netdev_mc, ha->addr); + + netif_addr_unlock_bh(netdev); +} + +static void mlx5e_fill_addr_array(struct mlx5e_priv *priv, int list_type, + u8 addr_array[][ETH_ALEN], int size) +{ + bool is_uc = (list_type == MLX5_NVPRT_LIST_TYPE_UC); + struct net_device *ndev = priv->netdev; + struct mlx5e_l2_hash_node *hn; + struct hlist_head *addr_list; + struct hlist_node *tmp; + int i = 0; + int hi; + + addr_list = is_uc ? priv->fs.l2.netdev_uc : priv->fs.l2.netdev_mc; + + if (is_uc) /* Make sure our own address is pushed first */ + ether_addr_copy(addr_array[i++], ndev->dev_addr); + else if (priv->fs.l2.broadcast_enabled) + ether_addr_copy(addr_array[i++], ndev->broadcast); + + mlx5e_for_each_hash_node(hn, tmp, addr_list, hi) { + if (ether_addr_equal(ndev->dev_addr, hn->ai.addr)) + continue; + if (i >= size) + break; + ether_addr_copy(addr_array[i++], hn->ai.addr); + } +} + +static void mlx5e_vport_context_update_addr_list(struct mlx5e_priv *priv, + int list_type) +{ + bool is_uc = (list_type == MLX5_NVPRT_LIST_TYPE_UC); + struct mlx5e_l2_hash_node *hn; + u8 (*addr_array)[ETH_ALEN] = NULL; + struct hlist_head *addr_list; + struct hlist_node *tmp; + int max_size; + int size; + int err; + int hi; + + size = is_uc ? 0 : (priv->fs.l2.broadcast_enabled ? 1 : 0); + max_size = is_uc ? + 1 << MLX5_CAP_GEN(priv->mdev, log_max_current_uc_list) : + 1 << MLX5_CAP_GEN(priv->mdev, log_max_current_mc_list); + + addr_list = is_uc ? priv->fs.l2.netdev_uc : priv->fs.l2.netdev_mc; + mlx5e_for_each_hash_node(hn, tmp, addr_list, hi) + size++; + + if (size > max_size) { + netdev_warn(priv->netdev, + "netdev %s list size (%d) > (%d) max vport list size, some addresses will be dropped\n", + is_uc ? "UC" : "MC", size, max_size); + size = max_size; + } + + if (size) { + addr_array = kcalloc(size, ETH_ALEN, GFP_KERNEL); + if (!addr_array) { + err = -ENOMEM; + goto out; + } + mlx5e_fill_addr_array(priv, list_type, addr_array, size); + } + + err = mlx5_modify_nic_vport_mac_list(priv->mdev, list_type, addr_array, size); +out: + if (err) + netdev_err(priv->netdev, + "Failed to modify vport %s list err(%d)\n", + is_uc ? 
"UC" : "MC", err); + kfree(addr_array); +} + +static void mlx5e_vport_context_update(struct mlx5e_priv *priv) +{ + struct mlx5e_l2_table *ea = &priv->fs.l2; + + mlx5e_vport_context_update_addr_list(priv, MLX5_NVPRT_LIST_TYPE_UC); + mlx5e_vport_context_update_addr_list(priv, MLX5_NVPRT_LIST_TYPE_MC); + mlx5_modify_nic_vport_promisc(priv->mdev, 0, + ea->allmulti_enabled, + ea->promisc_enabled); +} + +static void mlx5e_apply_netdev_addr(struct mlx5e_priv *priv) +{ + struct mlx5e_l2_hash_node *hn; + struct hlist_node *tmp; + int i; + + mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_uc, i) + mlx5e_execute_l2_action(priv, hn); + + mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_mc, i) + mlx5e_execute_l2_action(priv, hn); +} + +static void mlx5e_handle_netdev_addr(struct mlx5e_priv *priv) +{ + struct mlx5e_l2_hash_node *hn; + struct hlist_node *tmp; + int i; + + mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_uc, i) + hn->action = MLX5E_ACTION_DEL; + mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_mc, i) + hn->action = MLX5E_ACTION_DEL; + + if (!test_bit(MLX5E_STATE_DESTROYING, &priv->state)) + mlx5e_sync_netdev_addr(priv); + + mlx5e_apply_netdev_addr(priv); +} + +void mlx5e_set_rx_mode_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, + set_rx_mode_work); + + struct mlx5e_l2_table *ea = &priv->fs.l2; + struct net_device *ndev = priv->netdev; + + bool rx_mode_enable = !test_bit(MLX5E_STATE_DESTROYING, &priv->state); + bool promisc_enabled = rx_mode_enable && (ndev->flags & IFF_PROMISC); + bool allmulti_enabled = rx_mode_enable && (ndev->flags & IFF_ALLMULTI); + bool broadcast_enabled = rx_mode_enable; + + bool enable_promisc = !ea->promisc_enabled && promisc_enabled; + bool disable_promisc = ea->promisc_enabled && !promisc_enabled; + bool enable_allmulti = !ea->allmulti_enabled && allmulti_enabled; + bool disable_allmulti = ea->allmulti_enabled && !allmulti_enabled; + bool enable_broadcast = !ea->broadcast_enabled && broadcast_enabled; + bool disable_broadcast = ea->broadcast_enabled && !broadcast_enabled; + + if (enable_promisc) { + if (!priv->channels.params.vlan_strip_disable) + netdev_warn_once(ndev, + "S-tagged traffic will be dropped while C-tag vlan stripping is enabled\n"); + mlx5e_add_l2_flow_rule(priv, &ea->promisc, MLX5E_PROMISC); + if (!priv->fs.vlan.cvlan_filter_disabled) + mlx5e_add_any_vid_rules(priv); + } + if (enable_allmulti) + mlx5e_add_l2_flow_rule(priv, &ea->allmulti, MLX5E_ALLMULTI); + if (enable_broadcast) + mlx5e_add_l2_flow_rule(priv, &ea->broadcast, MLX5E_FULLMATCH); + + mlx5e_handle_netdev_addr(priv); + + if (disable_broadcast) + mlx5e_del_l2_flow_rule(priv, &ea->broadcast); + if (disable_allmulti) + mlx5e_del_l2_flow_rule(priv, &ea->allmulti); + if (disable_promisc) { + if (!priv->fs.vlan.cvlan_filter_disabled) + mlx5e_del_any_vid_rules(priv); + mlx5e_del_l2_flow_rule(priv, &ea->promisc); + } + + ea->promisc_enabled = promisc_enabled; + ea->allmulti_enabled = allmulti_enabled; + ea->broadcast_enabled = broadcast_enabled; + + mlx5e_vport_context_update(priv); +} + +static void mlx5e_destroy_groups(struct mlx5e_flow_table *ft) +{ + int i; + + for (i = ft->num_groups - 1; i >= 0; i--) { + if (!IS_ERR_OR_NULL(ft->g[i])) + mlx5_destroy_flow_group(ft->g[i]); + ft->g[i] = NULL; + } + ft->num_groups = 0; +} + +void mlx5e_init_l2_addr(struct mlx5e_priv *priv) +{ + ether_addr_copy(priv->fs.l2.broadcast.addr, priv->netdev->broadcast); +} + +void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft) +{ + 
mlx5e_destroy_groups(ft); + kfree(ft->g); + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; +} + +static void mlx5e_cleanup_ttc_rules(struct mlx5e_ttc_table *ttc) +{ + int i; + + for (i = 0; i < MLX5E_NUM_TT; i++) { + if (!IS_ERR_OR_NULL(ttc->rules[i])) { + mlx5_del_flow_rules(ttc->rules[i]); + ttc->rules[i] = NULL; + } + } + + for (i = 0; i < MLX5E_NUM_TUNNEL_TT; i++) { + if (!IS_ERR_OR_NULL(ttc->tunnel_rules[i])) { + mlx5_del_flow_rules(ttc->tunnel_rules[i]); + ttc->tunnel_rules[i] = NULL; + } + } +} + +struct mlx5e_etype_proto { + u16 etype; + u8 proto; +}; + +static struct mlx5e_etype_proto ttc_rules[] = { + [MLX5E_TT_IPV4_TCP] = { + .etype = ETH_P_IP, + .proto = IPPROTO_TCP, + }, + [MLX5E_TT_IPV6_TCP] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_TCP, + }, + [MLX5E_TT_IPV4_UDP] = { + .etype = ETH_P_IP, + .proto = IPPROTO_UDP, + }, + [MLX5E_TT_IPV6_UDP] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_UDP, + }, + [MLX5E_TT_IPV4_IPSEC_AH] = { + .etype = ETH_P_IP, + .proto = IPPROTO_AH, + }, + [MLX5E_TT_IPV6_IPSEC_AH] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_AH, + }, + [MLX5E_TT_IPV4_IPSEC_ESP] = { + .etype = ETH_P_IP, + .proto = IPPROTO_ESP, + }, + [MLX5E_TT_IPV6_IPSEC_ESP] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_ESP, + }, + [MLX5E_TT_IPV4] = { + .etype = ETH_P_IP, + .proto = 0, + }, + [MLX5E_TT_IPV6] = { + .etype = ETH_P_IPV6, + .proto = 0, + }, + [MLX5E_TT_ANY] = { + .etype = 0, + .proto = 0, + }, +}; + +static struct mlx5e_etype_proto ttc_tunnel_rules[] = { + [MLX5E_TT_IPV4_GRE] = { + .etype = ETH_P_IP, + .proto = IPPROTO_GRE, + }, + [MLX5E_TT_IPV6_GRE] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_GRE, + }, +}; + +static u8 mlx5e_etype_to_ipv(u16 ethertype) +{ + if (ethertype == ETH_P_IP) + return 4; + + if (ethertype == ETH_P_IPV6) + return 6; + + return 0; +} + +static struct mlx5_flow_handle * +mlx5e_generate_ttc_rule(struct mlx5e_priv *priv, + struct mlx5_flow_table *ft, + struct mlx5_flow_destination *dest, + u16 etype, + u8 proto) +{ + int match_ipv_outer = MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ft_field_support.outer_ip_version); + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err = 0; + u8 ipv; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + if (proto) { + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, proto); + } + + ipv = mlx5e_etype_to_ipv(etype); + if (match_ipv_outer && ipv) { + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, ipv); + } else if (etype) { + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ethertype); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, etype); + } + + rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "%s: add rule failed\n", __func__); + } + + kvfree(spec); + return err ? 
ERR_PTR(err) : rule; +} + +static int mlx5e_generate_ttc_table_rules(struct mlx5e_priv *priv, + struct ttc_params *params, + struct mlx5e_ttc_table *ttc) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_handle **rules; + struct mlx5_flow_table *ft; + int tt; + int err; + + ft = ttc->ft.t; + rules = ttc->rules; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + for (tt = 0; tt < MLX5E_NUM_TT; tt++) { + if (tt == MLX5E_TT_ANY) + dest.tir_num = params->any_tt_tirn; + else + dest.tir_num = params->indir_tirn[tt]; + rules[tt] = mlx5e_generate_ttc_rule(priv, ft, &dest, + ttc_rules[tt].etype, + ttc_rules[tt].proto); + if (IS_ERR(rules[tt])) + goto del_rules; + } + + if (!params->inner_ttc || !mlx5e_tunnel_inner_ft_supported(priv->mdev)) + return 0; + + rules = ttc->tunnel_rules; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = params->inner_ttc->ft.t; + for (tt = 0; tt < MLX5E_NUM_TUNNEL_TT; tt++) { + rules[tt] = mlx5e_generate_ttc_rule(priv, ft, &dest, + ttc_tunnel_rules[tt].etype, + ttc_tunnel_rules[tt].proto); + if (IS_ERR(rules[tt])) + goto del_rules; + } + + return 0; + +del_rules: + err = PTR_ERR(rules[tt]); + rules[tt] = NULL; + mlx5e_cleanup_ttc_rules(ttc); + return err; +} + +#define MLX5E_TTC_NUM_GROUPS 3 +#define MLX5E_TTC_GROUP1_SIZE (BIT(3) + MLX5E_NUM_TUNNEL_TT) +#define MLX5E_TTC_GROUP2_SIZE BIT(1) +#define MLX5E_TTC_GROUP3_SIZE BIT(0) +#define MLX5E_TTC_TABLE_SIZE (MLX5E_TTC_GROUP1_SIZE +\ + MLX5E_TTC_GROUP2_SIZE +\ + MLX5E_TTC_GROUP3_SIZE) + +#define MLX5E_INNER_TTC_NUM_GROUPS 3 +#define MLX5E_INNER_TTC_GROUP1_SIZE BIT(3) +#define MLX5E_INNER_TTC_GROUP2_SIZE BIT(1) +#define MLX5E_INNER_TTC_GROUP3_SIZE BIT(0) +#define MLX5E_INNER_TTC_TABLE_SIZE (MLX5E_INNER_TTC_GROUP1_SIZE +\ + MLX5E_INNER_TTC_GROUP2_SIZE +\ + MLX5E_INNER_TTC_GROUP3_SIZE) + +static int mlx5e_create_ttc_table_groups(struct mlx5e_ttc_table *ttc, + bool use_ipv) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5e_flow_table *ft = &ttc->ft; + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(MLX5E_TTC_NUM_GROUPS, + sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) + return -ENOMEM; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + kfree(ft->g); + return -ENOMEM; + } + + /* L4 Group */ + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + if (use_ipv) + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_version); + else + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_TTC_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* L3 Group */ + MLX5_SET(fte_match_param, mc, outer_headers.ip_protocol, 0); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_TTC_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* Any Group */ + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_TTC_GROUP3_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ft->g[ft->num_groups]); + 
ft->g[ft->num_groups] = NULL; + kvfree(in); + + return err; +} + +static struct mlx5_flow_handle * +mlx5e_generate_inner_ttc_rule(struct mlx5e_priv *priv, + struct mlx5_flow_table *ft, + struct mlx5_flow_destination *dest, + u16 etype, u8 proto) +{ + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err = 0; + u8 ipv; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + ipv = mlx5e_etype_to_ipv(etype); + if (etype && ipv) { + spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_version, ipv); + } + + if (proto) { + spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_protocol); + MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_protocol, proto); + } + + rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "%s: add rule failed\n", __func__); + } + + kvfree(spec); + return err ? ERR_PTR(err) : rule; +} + +static int mlx5e_generate_inner_ttc_table_rules(struct mlx5e_priv *priv, + struct ttc_params *params, + struct mlx5e_ttc_table *ttc) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_handle **rules; + struct mlx5_flow_table *ft; + int err; + int tt; + + ft = ttc->ft.t; + rules = ttc->rules; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + for (tt = 0; tt < MLX5E_NUM_TT; tt++) { + if (tt == MLX5E_TT_ANY) + dest.tir_num = params->any_tt_tirn; + else + dest.tir_num = params->indir_tirn[tt]; + + rules[tt] = mlx5e_generate_inner_ttc_rule(priv, ft, &dest, + ttc_rules[tt].etype, + ttc_rules[tt].proto); + if (IS_ERR(rules[tt])) + goto del_rules; + } + + return 0; + +del_rules: + err = PTR_ERR(rules[tt]); + rules[tt] = NULL; + mlx5e_cleanup_ttc_rules(ttc); + return err; +} + +static int mlx5e_create_inner_ttc_table_groups(struct mlx5e_ttc_table *ttc) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5e_flow_table *ft = &ttc->ft; + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(MLX5E_INNER_TTC_NUM_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) + return -ENOMEM; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + kfree(ft->g); + return -ENOMEM; + } + + /* L4 Group */ + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol); + MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_version); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_INNER_TTC_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* L3 Group */ + MLX5_SET(fte_match_param, mc, inner_headers.ip_protocol, 0); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_INNER_TTC_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* Any Group */ + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_INNER_TTC_GROUP3_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if 
(IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + kvfree(in); + + return err; +} + +void mlx5e_set_ttc_basic_params(struct mlx5e_priv *priv, + struct ttc_params *ttc_params) +{ + ttc_params->any_tt_tirn = priv->direct_tir[0].tirn; + ttc_params->inner_ttc = &priv->fs.inner_ttc; +} + +void mlx5e_set_inner_ttc_ft_params(struct ttc_params *ttc_params) +{ + struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr; + + ft_attr->max_fte = MLX5E_INNER_TTC_TABLE_SIZE; + ft_attr->level = MLX5E_INNER_TTC_FT_LEVEL; + ft_attr->prio = MLX5E_NIC_PRIO; +} + +void mlx5e_set_ttc_ft_params(struct ttc_params *ttc_params) + +{ + struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr; + + ft_attr->max_fte = MLX5E_TTC_TABLE_SIZE; + ft_attr->level = MLX5E_TTC_FT_LEVEL; + ft_attr->prio = MLX5E_NIC_PRIO; +} + +int mlx5e_create_inner_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params, + struct mlx5e_ttc_table *ttc) +{ + struct mlx5e_flow_table *ft = &ttc->ft; + int err; + + if (!mlx5e_tunnel_inner_ft_supported(priv->mdev)) + return 0; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &params->ft_attr); + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + + err = mlx5e_create_inner_ttc_table_groups(ttc); + if (err) + goto err; + + err = mlx5e_generate_inner_ttc_table_rules(priv, params, ttc); + if (err) + goto err; + + return 0; + +err: + mlx5e_destroy_flow_table(ft); + return err; +} + +void mlx5e_destroy_inner_ttc_table(struct mlx5e_priv *priv, + struct mlx5e_ttc_table *ttc) +{ + if (!mlx5e_tunnel_inner_ft_supported(priv->mdev)) + return; + + mlx5e_cleanup_ttc_rules(ttc); + mlx5e_destroy_flow_table(&ttc->ft); +} + +void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv, + struct mlx5e_ttc_table *ttc) +{ + mlx5e_cleanup_ttc_rules(ttc); + mlx5e_destroy_flow_table(&ttc->ft); +} + +int mlx5e_create_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params, + struct mlx5e_ttc_table *ttc) +{ + bool match_ipv_outer = MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ft_field_support.outer_ip_version); + struct mlx5e_flow_table *ft = &ttc->ft; + int err; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &params->ft_attr); + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + + err = mlx5e_create_ttc_table_groups(ttc, match_ipv_outer); + if (err) + goto err; + + err = mlx5e_generate_ttc_table_rules(priv, params, ttc); + if (err) + goto err; + + return 0; +err: + mlx5e_destroy_flow_table(ft); + return err; +} + +static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv, + struct mlx5e_l2_rule *ai) +{ + if (!IS_ERR_OR_NULL(ai->rule)) { + mlx5_del_flow_rules(ai->rule); + ai->rule = NULL; + } +} + +static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv, + struct mlx5e_l2_rule *ai, int type) +{ + struct mlx5_flow_table *ft = priv->fs.l2.ft.t; + struct mlx5_flow_destination dest = {}; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_spec *spec; + int err = 0; + u8 *mc_dmac; + u8 *mv_dmac; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + mc_dmac = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.dmac_47_16); + mv_dmac = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dmac_47_16); + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = priv->fs.ttc.ft.t; + + switch (type) { + case MLX5E_FULLMATCH: + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; +
eth_broadcast_addr(mc_dmac); + ether_addr_copy(mv_dmac, ai->addr); + break; + + case MLX5E_ALLMULTI: + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + mc_dmac[0] = 0x01; + mv_dmac[0] = 0x01; + break; + + case MLX5E_PROMISC: + break; + } + + ai->rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + if (IS_ERR(ai->rule)) { + netdev_err(priv->netdev, "%s: add l2 rule(mac:%pM) failed\n", + __func__, mv_dmac); + err = PTR_ERR(ai->rule); + ai->rule = NULL; + } + + kvfree(spec); + + return err; +} + +#define MLX5E_NUM_L2_GROUPS 3 +#define MLX5E_L2_GROUP1_SIZE BIT(0) +#define MLX5E_L2_GROUP2_SIZE BIT(15) +#define MLX5E_L2_GROUP3_SIZE BIT(0) +#define MLX5E_L2_TABLE_SIZE (MLX5E_L2_GROUP1_SIZE +\ + MLX5E_L2_GROUP2_SIZE +\ + MLX5E_L2_GROUP3_SIZE) +static int mlx5e_create_l2_table_groups(struct mlx5e_l2_table *l2_table) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5e_flow_table *ft = &l2_table->ft; + int ix = 0; + u8 *mc_dmac; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(MLX5E_NUM_L2_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) + return -ENOMEM; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + kfree(ft->g); + return -ENOMEM; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + mc_dmac = MLX5_ADDR_OF(fte_match_param, mc, + outer_headers.dmac_47_16); + /* Flow Group for promiscuous */ + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_L2_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + /* Flow Group for full match */ + eth_broadcast_addr(mc_dmac); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_L2_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + /* Flow Group for allmulti */ + eth_zero_addr(mc_dmac); + mc_dmac[0] = 0x01; + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_L2_GROUP3_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + kvfree(in); + return 0; + +err_destroy_groups: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + mlx5e_destroy_groups(ft); + kvfree(in); + + return err; +} + +static void mlx5e_destroy_l2_table(struct mlx5e_priv *priv) +{ + mlx5e_destroy_flow_table(&priv->fs.l2.ft); +} + +static int mlx5e_create_l2_table(struct mlx5e_priv *priv) +{ + struct mlx5e_l2_table *l2_table = &priv->fs.l2; + struct mlx5e_flow_table *ft = &l2_table->ft; + struct mlx5_flow_table_attr ft_attr = {}; + int err; + + ft->num_groups = 0; + + ft_attr.max_fte = MLX5E_L2_TABLE_SIZE; + ft_attr.level = MLX5E_L2_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + + err = mlx5e_create_l2_table_groups(l2_table); + if (err) + goto err_destroy_flow_table; + + return 0; + +err_destroy_flow_table: + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; + + return err; +} + +#define MLX5E_NUM_VLAN_GROUPS 4 +#define MLX5E_VLAN_GROUP0_SIZE BIT(12) +#define MLX5E_VLAN_GROUP1_SIZE BIT(12) +#define MLX5E_VLAN_GROUP2_SIZE BIT(1) +#define MLX5E_VLAN_GROUP3_SIZE BIT(0) 
+#define MLX5E_VLAN_TABLE_SIZE (MLX5E_VLAN_GROUP0_SIZE +\ + MLX5E_VLAN_GROUP1_SIZE +\ + MLX5E_VLAN_GROUP2_SIZE +\ + MLX5E_VLAN_GROUP3_SIZE) + +static int __mlx5e_create_vlan_table_groups(struct mlx5e_flow_table *ft, u32 *in, + int inlen) +{ + int err; + int ix = 0; + u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP0_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.svlan_tag); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.svlan_tag); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP3_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + return 0; + +err_destroy_groups: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + mlx5e_destroy_groups(ft); + + return err; +} + +static int mlx5e_create_vlan_table_groups(struct mlx5e_flow_table *ft) +{ + u32 *in; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + err = __mlx5e_create_vlan_table_groups(ft, in, inlen); + + kvfree(in); + return err; +} + +static int mlx5e_create_vlan_table(struct mlx5e_priv *priv) +{ + struct mlx5e_flow_table *ft = &priv->fs.vlan.ft; + struct mlx5_flow_table_attr ft_attr = {}; + int err; + + ft->num_groups = 0; + + ft_attr.max_fte = MLX5E_VLAN_TABLE_SIZE; + ft_attr.level = MLX5E_VLAN_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); + + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + ft->g = kcalloc(MLX5E_NUM_VLAN_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) { + err = -ENOMEM; + goto err_destroy_vlan_table; + } + + err = mlx5e_create_vlan_table_groups(ft); + if (err) + goto err_free_g; + + mlx5e_add_vlan_rules(priv); + + return 0; + +err_free_g: + kfree(ft->g); +err_destroy_vlan_table: + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; + + return err; +} + +static void mlx5e_destroy_vlan_table(struct mlx5e_priv *priv) +{ + 
mlx5e_del_vlan_rules(priv); + mlx5e_destroy_flow_table(&priv->fs.vlan.ft); +} + +int mlx5e_create_flow_steering(struct mlx5e_priv *priv) +{ + struct ttc_params ttc_params = {}; + int tt, err; + + priv->fs.ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_KERNEL); + + if (!priv->fs.ns) + return -EOPNOTSUPP; + + err = mlx5e_arfs_create_tables(priv); + if (err) { + netdev_err(priv->netdev, "Failed to create arfs tables, err=%d\n", + err); + priv->netdev->hw_features &= ~NETIF_F_NTUPLE; + } + + mlx5e_set_ttc_basic_params(priv, &ttc_params); + mlx5e_set_inner_ttc_ft_params(&ttc_params); + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) + ttc_params.indir_tirn[tt] = priv->inner_indir_tir[tt].tirn; + + err = mlx5e_create_inner_ttc_table(priv, &ttc_params, &priv->fs.inner_ttc); + if (err) { + netdev_err(priv->netdev, "Failed to create inner ttc table, err=%d\n", + err); + goto err_destroy_arfs_tables; + } + + mlx5e_set_ttc_ft_params(&ttc_params); + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) + ttc_params.indir_tirn[tt] = priv->indir_tir[tt].tirn; + + err = mlx5e_create_ttc_table(priv, &ttc_params, &priv->fs.ttc); + if (err) { + netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n", + err); + goto err_destroy_inner_ttc_table; + } + + err = mlx5e_create_l2_table(priv); + if (err) { + netdev_err(priv->netdev, "Failed to create l2 table, err=%d\n", + err); + goto err_destroy_ttc_table; + } + + err = mlx5e_create_vlan_table(priv); + if (err) { + netdev_err(priv->netdev, "Failed to create vlan table, err=%d\n", + err); + goto err_destroy_l2_table; + } + + mlx5e_ethtool_init_steering(priv); + + return 0; + +err_destroy_l2_table: + mlx5e_destroy_l2_table(priv); +err_destroy_ttc_table: + mlx5e_destroy_ttc_table(priv, &priv->fs.ttc); +err_destroy_inner_ttc_table: + mlx5e_destroy_inner_ttc_table(priv, &priv->fs.inner_ttc); +err_destroy_arfs_tables: + mlx5e_arfs_destroy_tables(priv); + + return err; +} + +void mlx5e_destroy_flow_steering(struct mlx5e_priv *priv) +{ + mlx5e_destroy_vlan_table(priv); + mlx5e_destroy_l2_table(priv); + mlx5e_destroy_ttc_table(priv, &priv->fs.ttc); + mlx5e_destroy_inner_ttc_table(priv, &priv->fs.inner_ttc); + mlx5e_arfs_destroy_tables(priv); + mlx5e_ethtool_cleanup_steering(priv); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c new file mode 100644 index 0000000..eafc592 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -0,0 +1,589 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "en.h" + +struct mlx5e_ethtool_rule { + struct list_head list; + struct ethtool_rx_flow_spec flow_spec; + struct mlx5_flow_handle *rule; + struct mlx5e_ethtool_table *eth_ft; +}; + +static void put_flow_table(struct mlx5e_ethtool_table *eth_ft) +{ + if (!--eth_ft->num_rules) { + mlx5_destroy_flow_table(eth_ft->ft); + eth_ft->ft = NULL; + } +} + +#define MLX5E_ETHTOOL_L3_L4_PRIO 0 +#define MLX5E_ETHTOOL_L2_PRIO (MLX5E_ETHTOOL_L3_L4_PRIO + ETHTOOL_NUM_L3_L4_FTS) +#define MLX5E_ETHTOOL_NUM_ENTRIES 64000 +#define MLX5E_ETHTOOL_NUM_GROUPS 10 +static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv, + struct ethtool_rx_flow_spec *fs, + int num_tuples) +{ + struct mlx5e_ethtool_table *eth_ft; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + int max_tuples; + int table_size; + int prio; + + switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + max_tuples = ETHTOOL_NUM_L3_L4_FTS; + prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples); + eth_ft = &priv->fs.ethtool.l3_l4_ft[prio]; + break; + case IP_USER_FLOW: + max_tuples = ETHTOOL_NUM_L3_L4_FTS; + prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples); + eth_ft = &priv->fs.ethtool.l3_l4_ft[prio]; + break; + case ETHER_FLOW: + max_tuples = ETHTOOL_NUM_L2_FTS; + prio = max_tuples - num_tuples; + eth_ft = &priv->fs.ethtool.l2_ft[prio]; + prio += MLX5E_ETHTOOL_L2_PRIO; + break; + default: + return ERR_PTR(-EINVAL); + } + + eth_ft->num_rules++; + if (eth_ft->ft) + return eth_ft; + + ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_ETHTOOL); + if (!ns) + return ERR_PTR(-EOPNOTSUPP); + + table_size = min_t(u32, BIT(MLX5_CAP_FLOWTABLE(priv->mdev, + flow_table_properties_nic_receive.log_max_ft_size)), + MLX5E_ETHTOOL_NUM_ENTRIES); + ft = mlx5_create_auto_grouped_flow_table(ns, prio, + table_size, + MLX5E_ETHTOOL_NUM_GROUPS, 0, 0); + if (IS_ERR(ft)) + return (void *)ft; + + eth_ft->ft = ft; + return eth_ft; +} + +static void mask_spec(u8 *mask, u8 *val, size_t size) +{ + unsigned int i; + + for (i = 0; i < size; i++, mask++, val++) + *((u8 *)val) = *((u8 *)mask) & *((u8 *)val); +} + +static void set_ips(void *outer_headers_v, void *outer_headers_c, __be32 ip4src_m, + __be32 ip4src_v, __be32 ip4dst_m, __be32 ip4dst_v) +{ + if (ip4src_m) { + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ip4src_v, sizeof(ip4src_v)); + memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + 0xff, sizeof(ip4src_m)); + } + if (ip4dst_m) { + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ip4dst_v, sizeof(ip4dst_v)); + memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + 0xff, sizeof(ip4dst_m)); + } + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + ethertype, ETH_P_IP); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + ethertype, 0xffff); +} + +static int 
set_flow_attrs(u32 *match_c, u32 *match_v, + struct ethtool_rx_flow_spec *fs) +{ + void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + u32 flow_type = fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT); + struct ethtool_tcpip4_spec *l4_mask; + struct ethtool_tcpip4_spec *l4_val; + struct ethtool_usrip4_spec *l3_mask; + struct ethtool_usrip4_spec *l3_val; + struct ethhdr *eth_val; + struct ethhdr *eth_mask; + + switch (flow_type) { + case TCP_V4_FLOW: + l4_mask = &fs->m_u.tcp_ip4_spec; + l4_val = &fs->h_u.tcp_ip4_spec; + set_ips(outer_headers_v, outer_headers_c, l4_mask->ip4src, + l4_val->ip4src, l4_mask->ip4dst, l4_val->ip4dst); + + if (l4_mask->psrc) { + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport, + 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport, + ntohs(l4_val->psrc)); + } + if (l4_mask->pdst) { + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport, + 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport, + ntohs(l4_val->pdst)); + } + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, + 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, + IPPROTO_TCP); + break; + case UDP_V4_FLOW: + l4_mask = &fs->m_u.tcp_ip4_spec; + l4_val = &fs->h_u.tcp_ip4_spec; + set_ips(outer_headers_v, outer_headers_c, l4_mask->ip4src, + l4_val->ip4src, l4_mask->ip4dst, l4_val->ip4dst); + + if (l4_mask->psrc) { + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport, + 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport, + ntohs(l4_val->psrc)); + } + if (l4_mask->pdst) { + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport, + 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport, + ntohs(l4_val->pdst)); + } + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, + 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, + IPPROTO_UDP); + break; + case IP_USER_FLOW: + l3_mask = &fs->m_u.usr_ip4_spec; + l3_val = &fs->h_u.usr_ip4_spec; + set_ips(outer_headers_v, outer_headers_c, l3_mask->ip4src, + l3_val->ip4src, l3_mask->ip4dst, l3_val->ip4dst); + break; + case ETHER_FLOW: + eth_mask = &fs->m_u.ether_spec; + eth_val = &fs->h_u.ether_spec; + + mask_spec((u8 *)eth_mask, (u8 *)eth_val, sizeof(*eth_mask)); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, + outer_headers_c, smac_47_16), + eth_mask->h_source); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, + outer_headers_v, smac_47_16), + eth_val->h_source); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, + outer_headers_c, dmac_47_16), + eth_mask->h_dest); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, + outer_headers_v, dmac_47_16), + eth_val->h_dest); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ethertype, + ntohs(eth_mask->h_proto)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ethertype, + ntohs(eth_val->h_proto)); + break; + default: + return -EINVAL; + } + + if ((fs->flow_type & FLOW_EXT) && + (fs->m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK))) { + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + first_vid, 0xfff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + first_vid, ntohs(fs->h_ext.vlan_tci)); + } + if (fs->flow_type & FLOW_MAC_EXT && + !is_zero_ether_addr(fs->m_ext.h_dest)) { + mask_spec(fs->m_ext.h_dest, 
fs->h_ext.h_dest, ETH_ALEN); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, + outer_headers_c, dmac_47_16), + fs->m_ext.h_dest); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, + outer_headers_v, dmac_47_16), + fs->h_ext.h_dest); + } + + return 0; +} + +static void add_rule_to_list(struct mlx5e_priv *priv, + struct mlx5e_ethtool_rule *rule) +{ + struct mlx5e_ethtool_rule *iter; + struct list_head *head = &priv->fs.ethtool.rules; + + list_for_each_entry(iter, &priv->fs.ethtool.rules, list) { + if (iter->flow_spec.location > rule->flow_spec.location) + break; + head = &iter->list; + } + priv->fs.ethtool.tot_num_rules++; + list_add(&rule->list, head); +} + +static bool outer_header_zero(u32 *match_criteria) +{ + int size = MLX5_FLD_SZ_BYTES(fte_match_param, outer_headers); + char *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_criteria, + outer_headers); + + return outer_headers_c[0] == 0 && !memcmp(outer_headers_c, + outer_headers_c + 1, + size - 1); +} + +static struct mlx5_flow_handle * +add_ethtool_flow_rule(struct mlx5e_priv *priv, + struct mlx5_flow_table *ft, + struct ethtool_rx_flow_spec *fs) +{ + struct mlx5_flow_destination *dst = NULL; + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_spec *spec; + struct mlx5_flow_handle *rule; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + err = set_flow_attrs(spec->match_criteria, spec->match_value, + fs); + if (err) + goto free; + + if (fs->ring_cookie == RX_CLS_FLOW_DISC) { + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + } else { + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) { + err = -ENOMEM; + goto free; + } + + dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dst->tir_num = priv->direct_tir[fs->ring_cookie].tirn; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } + + spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria)); + flow_act.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; + rule = mlx5_add_flow_rules(ft, spec, &flow_act, dst, dst ? 1 : 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "%s: failed to add ethtool steering rule: %d\n", + __func__, err); + goto free; + } +free: + kvfree(spec); + kfree(dst); + return err ? 
ERR_PTR(err) : rule; +} + +static void del_ethtool_rule(struct mlx5e_priv *priv, + struct mlx5e_ethtool_rule *eth_rule) +{ + if (eth_rule->rule) + mlx5_del_flow_rules(eth_rule->rule); + list_del(&eth_rule->list); + priv->fs.ethtool.tot_num_rules--; + put_flow_table(eth_rule->eth_ft); + kfree(eth_rule); +} + +static struct mlx5e_ethtool_rule *find_ethtool_rule(struct mlx5e_priv *priv, + int location) +{ + struct mlx5e_ethtool_rule *iter; + + list_for_each_entry(iter, &priv->fs.ethtool.rules, list) { + if (iter->flow_spec.location == location) + return iter; + } + return NULL; +} + +static struct mlx5e_ethtool_rule *get_ethtool_rule(struct mlx5e_priv *priv, + int location) +{ + struct mlx5e_ethtool_rule *eth_rule; + + eth_rule = find_ethtool_rule(priv, location); + if (eth_rule) + del_ethtool_rule(priv, eth_rule); + + eth_rule = kzalloc(sizeof(*eth_rule), GFP_KERNEL); + if (!eth_rule) + return ERR_PTR(-ENOMEM); + + add_rule_to_list(priv, eth_rule); + return eth_rule; +} + +#define MAX_NUM_OF_ETHTOOL_RULES BIT(10) + +#define all_ones(field) (field == (__force typeof(field))-1) +#define all_zeros_or_all_ones(field) \ + ((field) == 0 || (field) == (__force typeof(field))-1) + +static int validate_flow(struct mlx5e_priv *priv, + struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip4_spec *l4_mask; + struct ethtool_usrip4_spec *l3_mask; + struct ethhdr *eth_mask; + int num_tuples = 0; + + if (fs->location >= MAX_NUM_OF_ETHTOOL_RULES) + return -EINVAL; + + if (fs->ring_cookie >= priv->channels.params.num_channels && + fs->ring_cookie != RX_CLS_FLOW_DISC) + return -EINVAL; + + switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) { + case ETHER_FLOW: + eth_mask = &fs->m_u.ether_spec; + if (!is_zero_ether_addr(eth_mask->h_dest)) + num_tuples++; + if (!is_zero_ether_addr(eth_mask->h_source)) + num_tuples++; + if (eth_mask->h_proto) + num_tuples++; + break; + case TCP_V4_FLOW: + case UDP_V4_FLOW: + if (fs->m_u.tcp_ip4_spec.tos) + return -EINVAL; + l4_mask = &fs->m_u.tcp_ip4_spec; + if (l4_mask->ip4src) { + if (!all_ones(l4_mask->ip4src)) + return -EINVAL; + num_tuples++; + } + if (l4_mask->ip4dst) { + if (!all_ones(l4_mask->ip4dst)) + return -EINVAL; + num_tuples++; + } + if (l4_mask->psrc) { + if (!all_ones(l4_mask->psrc)) + return -EINVAL; + num_tuples++; + } + if (l4_mask->pdst) { + if (!all_ones(l4_mask->pdst)) + return -EINVAL; + num_tuples++; + } + /* Flow is TCP/UDP */ + num_tuples++; + break; + case IP_USER_FLOW: + l3_mask = &fs->m_u.usr_ip4_spec; + if (l3_mask->l4_4_bytes || l3_mask->tos || l3_mask->proto || + fs->h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4) + return -EINVAL; + if (l3_mask->ip4src) { + if (!all_ones(l3_mask->ip4src)) + return -EINVAL; + num_tuples++; + } + if (l3_mask->ip4dst) { + if (!all_ones(l3_mask->ip4dst)) + return -EINVAL; + num_tuples++; + } + /* Flow is IPv4 */ + num_tuples++; + break; + default: + return -EINVAL; + } + if ((fs->flow_type & FLOW_EXT)) { + if (fs->m_ext.vlan_etype || + (fs->m_ext.vlan_tci != cpu_to_be16(VLAN_VID_MASK))) + return -EINVAL; + + if (fs->m_ext.vlan_tci) { + if (be16_to_cpu(fs->h_ext.vlan_tci) >= VLAN_N_VID) + return -EINVAL; + } + num_tuples++; + } + + if (fs->flow_type & FLOW_MAC_EXT && + !is_zero_ether_addr(fs->m_ext.h_dest)) + num_tuples++; + + return num_tuples; +} + +int mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv, + struct ethtool_rx_flow_spec *fs) +{ + struct mlx5e_ethtool_table *eth_ft; + struct mlx5e_ethtool_rule *eth_rule; + struct mlx5_flow_handle *rule; + int num_tuples; + int err; + + num_tuples = validate_flow(priv, fs);
+ if (num_tuples <= 0) { + netdev_warn(priv->netdev, "%s: flow is not valid\n", __func__); + return -EINVAL; + } + + eth_ft = get_flow_table(priv, fs, num_tuples); + if (IS_ERR(eth_ft)) + return PTR_ERR(eth_ft); + + eth_rule = get_ethtool_rule(priv, fs->location); + if (IS_ERR(eth_rule)) { + put_flow_table(eth_ft); + return PTR_ERR(eth_rule); + } + + eth_rule->flow_spec = *fs; + eth_rule->eth_ft = eth_ft; + if (!eth_ft->ft) { + err = -EINVAL; + goto del_ethtool_rule; + } + rule = add_ethtool_flow_rule(priv, eth_ft->ft, fs); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto del_ethtool_rule; + } + + eth_rule->rule = rule; + + return 0; + +del_ethtool_rule: + del_ethtool_rule(priv, eth_rule); + + return err; +} + +int mlx5e_ethtool_flow_remove(struct mlx5e_priv *priv, + int location) +{ + struct mlx5e_ethtool_rule *eth_rule; + int err = 0; + + if (location >= MAX_NUM_OF_ETHTOOL_RULES) + return -ENOSPC; + + eth_rule = find_ethtool_rule(priv, location); + if (!eth_rule) { + err = -ENOENT; + goto out; + } + + del_ethtool_rule(priv, eth_rule); +out: + return err; +} + +int mlx5e_ethtool_get_flow(struct mlx5e_priv *priv, struct ethtool_rxnfc *info, + int location) +{ + struct mlx5e_ethtool_rule *eth_rule; + + if (location < 0 || location >= MAX_NUM_OF_ETHTOOL_RULES) + return -EINVAL; + + list_for_each_entry(eth_rule, &priv->fs.ethtool.rules, list) { + if (eth_rule->flow_spec.location == location) { + info->fs = eth_rule->flow_spec; + return 0; + } + } + + return -ENOENT; +} + +int mlx5e_ethtool_get_all_flows(struct mlx5e_priv *priv, struct ethtool_rxnfc *info, + u32 *rule_locs) +{ + int location = 0; + int idx = 0; + int err = 0; + + info->data = MAX_NUM_OF_ETHTOOL_RULES; + while ((!err || err == -ENOENT) && idx < info->rule_cnt) { + err = mlx5e_ethtool_get_flow(priv, info, location); + if (!err) + rule_locs[idx++] = location; + location++; + } + return err; +} + +void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv) +{ + struct mlx5e_ethtool_rule *iter; + struct mlx5e_ethtool_rule *temp; + + list_for_each_entry_safe(iter, temp, &priv->fs.ethtool.rules, list) + del_ethtool_rule(priv, iter); +} + +void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv) +{ + INIT_LIST_HEAD(&priv->fs.ethtool.rules); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c new file mode 100644 index 0000000..b29c1d9 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -0,0 +1,4771 @@ +/* + * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "eswitch.h" +#include "en.h" +#include "en_tc.h" +#include "en_rep.h" +#include "en_accel/ipsec.h" +#include "en_accel/ipsec_rxtx.h" +#include "accel/ipsec.h" +#include "vxlan.h" + +struct mlx5e_rq_param { + u32 rqc[MLX5_ST_SZ_DW(rqc)]; + struct mlx5_wq_param wq; +}; + +struct mlx5e_sq_param { + u32 sqc[MLX5_ST_SZ_DW(sqc)]; + struct mlx5_wq_param wq; +}; + +struct mlx5e_cq_param { + u32 cqc[MLX5_ST_SZ_DW(cqc)]; + struct mlx5_wq_param wq; + u16 eq_ix; + u8 cq_period_mode; +}; + +struct mlx5e_channel_param { + struct mlx5e_rq_param rq; + struct mlx5e_sq_param sq; + struct mlx5e_sq_param xdp_sq; + struct mlx5e_sq_param icosq; + struct mlx5e_cq_param rx_cq; + struct mlx5e_cq_param tx_cq; + struct mlx5e_cq_param icosq_cq; +}; + +bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) +{ + bool striding_rq_umr = MLX5_CAP_GEN(mdev, striding_rq) && + MLX5_CAP_GEN(mdev, umr_ptr_rlky) && + MLX5_CAP_ETH(mdev, reg_umr_sq); + u16 max_wqe_sz_cap = MLX5_CAP_GEN(mdev, max_wqe_sz_sq); + bool inline_umr = MLX5E_UMR_WQE_INLINE_SZ <= max_wqe_sz_cap; + + if (!striding_rq_umr) + return false; + if (!inline_umr) { + mlx5_core_warn(mdev, "Cannot support Striding RQ: UMR WQE size (%d) exceeds maximum supported (%d).\n", + (int)MLX5E_UMR_WQE_INLINE_SZ, max_wqe_sz_cap); + return false; + } + return true; +} + +static u32 mlx5e_mpwqe_get_linear_frag_sz(struct mlx5e_params *params) +{ + if (!params->xdp_prog) { + u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + u16 rq_headroom = MLX5_RX_HEADROOM + NET_IP_ALIGN; + + return MLX5_SKB_FRAG_SZ(rq_headroom + hw_mtu); + } + + return PAGE_SIZE; +} + +static u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params) +{ + u32 linear_frag_sz = mlx5e_mpwqe_get_linear_frag_sz(params); + + return MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz); +} + +static bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + u32 frag_sz = mlx5e_mpwqe_get_linear_frag_sz(params); + s8 signed_log_num_strides_param; + u8 log_num_strides; + + if (params->lro_en || frag_sz > PAGE_SIZE) + return false; + + if (MLX5_CAP_GEN(mdev, ext_stride_num_range)) + return true; + + log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(frag_sz); + signed_log_num_strides_param = + (s8)log_num_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE; + + return signed_log_num_strides_param >= 0; +} + +static u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params) +{ + if (params->log_rq_mtu_frames < + mlx5e_mpwqe_log_pkts_per_wqe(params) + MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW) + return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW; + + return params->log_rq_mtu_frames - mlx5e_mpwqe_log_pkts_per_wqe(params); +} + +static u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params)) + return order_base_2(mlx5e_mpwqe_get_linear_frag_sz(params)); + + return MLX5E_MPWQE_STRIDE_SZ(mdev, + MLX5E_GET_PFLAG(params, 
MLX5E_PFLAG_RX_CQE_COMPRESS)); +} + +static u8 mlx5e_mpwqe_get_log_num_strides(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + return MLX5_MPWRQ_LOG_WQE_SZ - + mlx5e_mpwqe_get_log_stride_size(mdev, params); +} + +static u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + u16 linear_rq_headroom = params->xdp_prog ? + XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM; + + linear_rq_headroom += NET_IP_ALIGN; + + if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST) + return linear_rq_headroom; + + if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params)) + return linear_rq_headroom; + + return 0; +} + +void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + params->lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ; + params->log_rq_mtu_frames = is_kdump_kernel() ? + MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE : + MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE; + switch (params->rq_wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + break; + default: /* MLX5_WQ_TYPE_LINKED_LIST */ + /* Extra room needed for build_skb */ + params->lro_wqe_sz -= mlx5e_get_rq_headroom(mdev, params) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + } + + mlx5_core_info(mdev, "MLX5E: StrdRq(%d) RqSz(%ld) StrdSz(%ld) RxCqeCmprss(%d)\n", + params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ, + params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ? + BIT(mlx5e_mpwqe_get_log_rq_size(params)) : + BIT(params->log_rq_mtu_frames), + BIT(mlx5e_mpwqe_get_log_stride_size(mdev, params)), + MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)); +} + +bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + return mlx5e_check_fragmented_striding_rq_cap(mdev) && + !MLX5_IPSEC_DEV(mdev) && + !(params->xdp_prog && !mlx5e_rx_mpwqe_is_linear_skb(mdev, params)); +} + +void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params) +{ + params->rq_wq_type = mlx5e_striding_rq_possible(mdev, params) && + MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ) ? 
+ MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ : + MLX5_WQ_TYPE_LINKED_LIST; +} + +static void mlx5e_update_carrier(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u8 port_state; + + port_state = mlx5_query_vport_state(mdev, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, + 0); + + if (port_state == VPORT_STATE_UP) { + netdev_info(priv->netdev, "Link up\n"); + netif_carrier_on(priv->netdev); + } else { + netdev_info(priv->netdev, "Link down\n"); + netif_carrier_off(priv->netdev); + } +} + +static void mlx5e_update_carrier_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, + update_carrier_work); + + mutex_lock(&priv->state_lock); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + if (priv->profile->update_carrier) + priv->profile->update_carrier(priv); + mutex_unlock(&priv->state_lock); +} + +void mlx5e_update_stats(struct mlx5e_priv *priv) +{ + int i; + + for (i = mlx5e_num_stats_grps - 1; i >= 0; i--) + if (mlx5e_stats_grps[i].update_stats) + mlx5e_stats_grps[i].update_stats(priv); +} + +static void mlx5e_update_ndo_stats(struct mlx5e_priv *priv) +{ + int i; + + for (i = mlx5e_num_stats_grps - 1; i >= 0; i--) + if (mlx5e_stats_grps[i].update_stats_mask & + MLX5E_NDO_UPDATE_STATS) + mlx5e_stats_grps[i].update_stats(priv); +} + +void mlx5e_update_stats_work(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct mlx5e_priv *priv = container_of(dwork, struct mlx5e_priv, + update_stats_work); + mutex_lock(&priv->state_lock); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->profile->update_stats(priv); + queue_delayed_work(priv->wq, dwork, + msecs_to_jiffies(MLX5E_UPDATE_STATS_INTERVAL)); + } + mutex_unlock(&priv->state_lock); +} + +static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5e_priv *priv = vpriv; + + if (!test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state)) + return; + + switch (event) { + case MLX5_DEV_EVENT_PORT_UP: + case MLX5_DEV_EVENT_PORT_DOWN: + queue_work(priv->wq, &priv->update_carrier_work); + break; + default: + break; + } +} + +static void mlx5e_enable_async_events(struct mlx5e_priv *priv) +{ + set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state); +} + +static void mlx5e_disable_async_events(struct mlx5e_priv *priv) +{ + clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state); + synchronize_irq(pci_irq_vector(priv->mdev->pdev, MLX5_EQ_VEC_ASYNC)); +} + +static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, + struct mlx5e_icosq *sq, + struct mlx5e_umr_wqe *wqe) +{ + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl; + u8 ds_cnt = DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_DS); + + cseg->qpn_ds = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) | + ds_cnt); + cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + cseg->imm = rq->mkey_be; + + ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE; + ucseg->xlt_octowords = + cpu_to_be16(MLX5_MTT_OCTW(MLX5_MPWRQ_PAGES_PER_WQE)); + ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); +} + +static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, + struct mlx5e_channel *c) +{ + int wq_sz = mlx5_wq_ll_get_size(&rq->wq); + + rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info), + GFP_KERNEL, cpu_to_node(c->cpu)); + if (!rq->mpwqe.info) + return -ENOMEM; + + mlx5e_build_umr_wqe(rq, &c->icosq, &rq->mpwqe.umr_wqe); + + return 0; +} + +static int 
mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev, + u64 npages, u8 page_shift, + struct mlx5_core_mkey *umr_mkey) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn); + MLX5_SET64(mkc, mkc, len, npages << page_shift); + MLX5_SET(mkc, mkc, translations_octword_size, + MLX5_MTT_OCTW(npages)); + MLX5_SET(mkc, mkc, log_page_size, page_shift); + + err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen); + + kvfree(in); + return err; +} + +static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq) +{ + u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->wq)); + + return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey); +} + +static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix) +{ + return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT; +} + +static int mlx5e_alloc_rq(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_rq_param *rqp, + struct mlx5e_rq *rq) +{ + struct mlx5_core_dev *mdev = c->mdev; + void *rqc = rqp->rqc; + void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); + u32 byte_count; + int npages; + int wq_sz; + int err; + int i; + + rqp->wq.db_numa_node = cpu_to_node(c->cpu); + + err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq, + &rq->wq_ctrl); + if (err) + return err; + + rq->wq.db = &rq->wq.db[MLX5_RCV_DBR]; + + wq_sz = mlx5_wq_ll_get_size(&rq->wq); + + rq->wq_type = params->rq_wq_type; + rq->pdev = c->pdev; + rq->netdev = c->netdev; + rq->tstamp = c->tstamp; + rq->clock = &mdev->clock; + rq->channel = c; + rq->ix = c->ix; + rq->mdev = mdev; + rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + + rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL; + if (IS_ERR(rq->xdp_prog)) { + err = PTR_ERR(rq->xdp_prog); + rq->xdp_prog = NULL; + goto err_rq_wq_destroy; + } + + err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix); + if (err < 0) + goto err_rq_wq_destroy; + + rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; + rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params); + + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + rq->post_wqes = mlx5e_post_rx_mpwqes; + rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe; + + rq->handle_rx_cqe = c->priv->profile->rx_handlers.handle_rx_cqe_mpwqe; +#ifdef CONFIG_MLX5_EN_IPSEC + if (MLX5_IPSEC_DEV(mdev)) { + err = -EINVAL; + netdev_err(c->netdev, "MPWQE RQ with IPSec offload not supported\n"); + goto err_rq_wq_destroy; + } +#endif + if (!rq->handle_rx_cqe) { + err = -EINVAL; + netdev_err(c->netdev, "RX handler of MPWQE RQ is not set, err %d\n", err); + goto err_rq_wq_destroy; + } + + rq->mpwqe.skb_from_cqe_mpwrq = + mlx5e_rx_mpwqe_is_linear_skb(mdev, params) ? 
+ mlx5e_skb_from_cqe_mpwrq_linear : + mlx5e_skb_from_cqe_mpwrq_nonlinear; + rq->mpwqe.log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params); + rq->mpwqe.num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params)); + + byte_count = rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz; + + err = mlx5e_create_rq_umr_mkey(mdev, rq); + if (err) + goto err_rq_wq_destroy; + rq->mkey_be = cpu_to_be32(rq->umr_mkey.key); + + err = mlx5e_rq_alloc_mpwqe_info(rq, c); + if (err) + goto err_destroy_umr_mkey; + break; + default: /* MLX5_WQ_TYPE_LINKED_LIST */ + rq->wqe.frag_info = + kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info), + GFP_KERNEL, cpu_to_node(c->cpu)); + if (!rq->wqe.frag_info) { + err = -ENOMEM; + goto err_rq_wq_destroy; + } + rq->post_wqes = mlx5e_post_rx_wqes; + rq->dealloc_wqe = mlx5e_dealloc_rx_wqe; + +#ifdef CONFIG_MLX5_EN_IPSEC + if (c->priv->ipsec) + rq->handle_rx_cqe = mlx5e_ipsec_handle_rx_cqe; + else +#endif + rq->handle_rx_cqe = c->priv->profile->rx_handlers.handle_rx_cqe; + if (!rq->handle_rx_cqe) { + kfree(rq->wqe.frag_info); + err = -EINVAL; + netdev_err(c->netdev, "RX handler of RQ is not set, err %d\n", err); + goto err_rq_wq_destroy; + } + + byte_count = params->lro_en ? + params->lro_wqe_sz : + MLX5E_SW2HW_MTU(params, params->sw_mtu); +#ifdef CONFIG_MLX5_EN_IPSEC + if (MLX5_IPSEC_DEV(mdev)) + byte_count += MLX5E_METADATA_ETHER_LEN; +#endif + rq->wqe.page_reuse = !params->xdp_prog && !params->lro_en; + + /* calc the required page order */ + rq->wqe.frag_sz = MLX5_SKB_FRAG_SZ(rq->buff.headroom + byte_count); + npages = DIV_ROUND_UP(rq->wqe.frag_sz, PAGE_SIZE); + rq->buff.page_order = order_base_2(npages); + + byte_count |= MLX5_HW_START_PADDING; + rq->mkey_be = c->mkey_be; + } + + for (i = 0; i < wq_sz; i++) { + struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i); + + if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { + u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i); + + wqe->data.addr = cpu_to_be64(dma_offset + rq->buff.headroom); + } + + wqe->data.byte_count = cpu_to_be32(byte_count); + wqe->data.lkey = rq->mkey_be; + } + + INIT_WORK(&rq->dim.work, mlx5e_rx_dim_work); + + switch (params->rx_cq_moderation.cq_period_mode) { + case MLX5_CQ_PERIOD_MODE_START_FROM_CQE: + rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE; + break; + case MLX5_CQ_PERIOD_MODE_START_FROM_EQE: + default: + rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE; + } + + rq->page_cache.head = 0; + rq->page_cache.tail = 0; + + return 0; + +err_destroy_umr_mkey: + mlx5_core_destroy_mkey(mdev, &rq->umr_mkey); + +err_rq_wq_destroy: + if (rq->xdp_prog) + bpf_prog_put(rq->xdp_prog); + xdp_rxq_info_unreg(&rq->xdp_rxq); + mlx5_wq_destroy(&rq->wq_ctrl); + + return err; +} + +static void mlx5e_free_rq(struct mlx5e_rq *rq) +{ + int i; + + if (rq->xdp_prog) + bpf_prog_put(rq->xdp_prog); + + xdp_rxq_info_unreg(&rq->xdp_rxq); + + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + kfree(rq->mpwqe.info); + mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey); + break; + default: /* MLX5_WQ_TYPE_LINKED_LIST */ + kfree(rq->wqe.frag_info); + } + + for (i = rq->page_cache.head; i != rq->page_cache.tail; + i = (i + 1) & (MLX5E_CACHE_SIZE - 1)) { + struct mlx5e_dma_info *dma_info = &rq->page_cache.page_cache[i]; + + mlx5e_page_release(rq, dma_info, false); + } + mlx5_wq_destroy(&rq->wq_ctrl); +} + +static int mlx5e_create_rq(struct mlx5e_rq *rq, + struct mlx5e_rq_param *param) +{ + struct mlx5_core_dev *mdev = rq->mdev; + + void *in; + void *rqc; + void *wq; + int inlen; + int err; + + 
inlen = MLX5_ST_SZ_BYTES(create_rq_in) + + sizeof(u64) * rq->wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + + memcpy(rqc, param->rqc, sizeof(param->rqc)); + + MLX5_SET(rqc, rqc, cqn, rq->cq.mcq.cqn); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma); + + mlx5_fill_page_array(&rq->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(wq, wq, pas)); + + err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn); + + kvfree(in); + + return err; +} + +static int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, + int next_state) +{ + struct mlx5_core_dev *mdev = rq->mdev; + + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + MLX5_SET(modify_rq_in, in, rq_state, curr_state); + MLX5_SET(rqc, rqc, state, next_state); + + err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen); + + kvfree(in); + + return err; +} + +static int mlx5e_modify_rq_scatter_fcs(struct mlx5e_rq *rq, bool enable) +{ + struct mlx5e_channel *c = rq->channel; + struct mlx5e_priv *priv = c->priv; + struct mlx5_core_dev *mdev = priv->mdev; + + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY); + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_SCATTER_FCS); + MLX5_SET(rqc, rqc, scatter_fcs, enable); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); + + err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen); + + kvfree(in); + + return err; +} + +static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd) +{ + struct mlx5e_channel *c = rq->channel; + struct mlx5_core_dev *mdev = c->mdev; + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY); + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD); + MLX5_SET(rqc, rqc, vsd, vsd); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); + + err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen); + + kvfree(in); + + return err; +} + +static void mlx5e_destroy_rq(struct mlx5e_rq *rq) +{ + mlx5_core_destroy_rq(rq->mdev, rq->rqn); +} + +static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq) +{ + unsigned long exp_time = jiffies + msecs_to_jiffies(20000); + struct mlx5e_channel *c = rq->channel; + + struct mlx5_wq_ll *wq = &rq->wq; + u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5_wq_ll_get_size(wq)); + + while (time_before(jiffies, exp_time)) { + if (wq->cur_sz >= min_wqes) + return 0; + + msleep(20); + } + + netdev_warn(c->netdev, "Failed to get min RX wqes on RQN[0x%x] wq cur_sz(%d) min_rx_wqes(%d)\n", + rq->rqn, wq->cur_sz, min_wqes); + return -ETIMEDOUT; +} + +static void mlx5e_free_rx_descs(struct mlx5e_rq *rq) +{ + struct mlx5_wq_ll *wq = &rq->wq; + struct mlx5e_rx_wqe *wqe; + __be16 wqe_ix_be; + u16 wqe_ix; + + /* UMR WQE (if in progress) is always at wq->head */ + if (rq->wq_type == 
MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ && + rq->mpwqe.umr_in_progress) + mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]); + + while (!mlx5_wq_ll_is_empty(wq)) { + wqe_ix_be = *wq->tail_next; + wqe_ix = be16_to_cpu(wqe_ix_be); + wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_ix); + rq->dealloc_wqe(rq, wqe_ix); + mlx5_wq_ll_pop(&rq->wq, wqe_ix_be, + &wqe->next.next_wqe_index); + } + + if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST && rq->wqe.page_reuse) { + /* Clean outstanding pages on handled WQEs that decided to do page-reuse, + * but yet to be re-posted. + */ + int wq_sz = mlx5_wq_ll_get_size(&rq->wq); + + for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++) + rq->dealloc_wqe(rq, wqe_ix); + } +} + +static int mlx5e_open_rq(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_rq_param *param, + struct mlx5e_rq *rq) +{ + int err; + + err = mlx5e_alloc_rq(c, params, param, rq); + if (err) + return err; + + err = mlx5e_create_rq(rq, param); + if (err) + goto err_free_rq; + + err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); + if (err) + goto err_destroy_rq; + + if (params->rx_dim_enabled) + c->rq.state |= BIT(MLX5E_RQ_STATE_AM); + + return 0; + +err_destroy_rq: + mlx5e_destroy_rq(rq); +err_free_rq: + mlx5e_free_rq(rq); + + return err; +} + +static void mlx5e_activate_rq(struct mlx5e_rq *rq) +{ + struct mlx5e_icosq *sq = &rq->channel->icosq; + u16 pi = sq->pc & sq->wq.sz_m1; + struct mlx5e_tx_wqe *nopwqe; + + set_bit(MLX5E_RQ_STATE_ENABLED, &rq->state); + sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_NOP; + nopwqe = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc); + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &nopwqe->ctrl); +} + +static void mlx5e_deactivate_rq(struct mlx5e_rq *rq) +{ + clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state); + napi_synchronize(&rq->channel->napi); /* prevent mlx5e_post_rx_wqes */ +} + +static void mlx5e_close_rq(struct mlx5e_rq *rq) +{ + cancel_work_sync(&rq->dim.work); + mlx5e_destroy_rq(rq); + mlx5e_free_rx_descs(rq); + mlx5e_free_rq(rq); +} + +static void mlx5e_free_xdpsq_db(struct mlx5e_xdpsq *sq) +{ + kfree(sq->db.di); +} + +static int mlx5e_alloc_xdpsq_db(struct mlx5e_xdpsq *sq, int numa) +{ + int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + + sq->db.di = kzalloc_node(sizeof(*sq->db.di) * wq_sz, + GFP_KERNEL, numa); + if (!sq->db.di) { + mlx5e_free_xdpsq_db(sq); + return -ENOMEM; + } + + return 0; +} + +static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_sq_param *param, + struct mlx5e_xdpsq *sq) +{ + void *sqc_wq = MLX5_ADDR_OF(sqc, param->sqc, wq); + struct mlx5_core_dev *mdev = c->mdev; + int err; + + sq->pdev = c->pdev; + sq->mkey_be = c->mkey_be; + sq->channel = c; + sq->uar_map = mdev->mlx5e_res.bfreg.map; + sq->min_inline_mode = params->tx_min_inline_mode; + + param->wq.db_numa_node = cpu_to_node(c->cpu); + err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); + if (err) + return err; + sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; + + err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu)); + if (err) + goto err_sq_wq_destroy; + + return 0; + +err_sq_wq_destroy: + mlx5_wq_destroy(&sq->wq_ctrl); + + return err; +} + +static void mlx5e_free_xdpsq(struct mlx5e_xdpsq *sq) +{ + mlx5e_free_xdpsq_db(sq); + mlx5_wq_destroy(&sq->wq_ctrl); +} + +static void mlx5e_free_icosq_db(struct mlx5e_icosq *sq) +{ + kfree(sq->db.ico_wqe); +} + +static int mlx5e_alloc_icosq_db(struct mlx5e_icosq *sq, int numa) +{ + u8 wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + + sq->db.ico_wqe = kzalloc_node(sizeof(*sq->db.ico_wqe) *
wq_sz, + GFP_KERNEL, numa); + if (!sq->db.ico_wqe) + return -ENOMEM; + + return 0; +} + +static int mlx5e_alloc_icosq(struct mlx5e_channel *c, + struct mlx5e_sq_param *param, + struct mlx5e_icosq *sq) +{ + void *sqc_wq = MLX5_ADDR_OF(sqc, param->sqc, wq); + struct mlx5_core_dev *mdev = c->mdev; + int err; + + sq->channel = c; + sq->uar_map = mdev->mlx5e_res.bfreg.map; + + param->wq.db_numa_node = cpu_to_node(c->cpu); + err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); + if (err) + return err; + sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; + + err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu)); + if (err) + goto err_sq_wq_destroy; + + sq->edge = (sq->wq.sz_m1 + 1) - MLX5E_ICOSQ_MAX_WQEBBS; + + return 0; + +err_sq_wq_destroy: + mlx5_wq_destroy(&sq->wq_ctrl); + + return err; +} + +static void mlx5e_free_icosq(struct mlx5e_icosq *sq) +{ + mlx5e_free_icosq_db(sq); + mlx5_wq_destroy(&sq->wq_ctrl); +} + +static void mlx5e_free_txqsq_db(struct mlx5e_txqsq *sq) +{ + kfree(sq->db.wqe_info); + kfree(sq->db.dma_fifo); +} + +static int mlx5e_alloc_txqsq_db(struct mlx5e_txqsq *sq, int numa) +{ + int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + int df_sz = wq_sz * MLX5_SEND_WQEBB_NUM_DS; + + sq->db.dma_fifo = kzalloc_node(df_sz * sizeof(*sq->db.dma_fifo), + GFP_KERNEL, numa); + sq->db.wqe_info = kzalloc_node(wq_sz * sizeof(*sq->db.wqe_info), + GFP_KERNEL, numa); + if (!sq->db.dma_fifo || !sq->db.wqe_info) { + mlx5e_free_txqsq_db(sq); + return -ENOMEM; + } + + sq->dma_fifo_mask = df_sz - 1; + + return 0; +} + +static void mlx5e_sq_recover(struct work_struct *work); +static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, + int txq_ix, + struct mlx5e_params *params, + struct mlx5e_sq_param *param, + struct mlx5e_txqsq *sq) +{ + void *sqc_wq = MLX5_ADDR_OF(sqc, param->sqc, wq); + struct mlx5_core_dev *mdev = c->mdev; + int err; + + sq->pdev = c->pdev; + sq->tstamp = c->tstamp; + sq->clock = &mdev->clock; + sq->mkey_be = c->mkey_be; + sq->channel = c; + sq->txq_ix = txq_ix; + sq->uar_map = mdev->mlx5e_res.bfreg.map; + sq->min_inline_mode = params->tx_min_inline_mode; + INIT_WORK(&sq->recover.recover_work, mlx5e_sq_recover); + if (MLX5_IPSEC_DEV(c->priv->mdev)) + set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); + + param->wq.db_numa_node = cpu_to_node(c->cpu); + err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); + if (err) + return err; + sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; + + err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu)); + if (err) + goto err_sq_wq_destroy; + + sq->edge = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS; + + return 0; + +err_sq_wq_destroy: + mlx5_wq_destroy(&sq->wq_ctrl); + + return err; +} + +static void mlx5e_free_txqsq(struct mlx5e_txqsq *sq) +{ + mlx5e_free_txqsq_db(sq); + mlx5_wq_destroy(&sq->wq_ctrl); +} + +struct mlx5e_create_sq_param { + struct mlx5_wq_ctrl *wq_ctrl; + u32 cqn; + u32 tisn; + u8 tis_lst_sz; + u8 min_inline_mode; +}; + +static int mlx5e_create_sq(struct mlx5_core_dev *mdev, + struct mlx5e_sq_param *param, + struct mlx5e_create_sq_param *csp, + u32 *sqn) +{ + void *in; + void *sqc; + void *wq; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + + sizeof(u64) * csp->wq_ctrl->buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + + memcpy(sqc, param->sqc, sizeof(param->sqc)); + MLX5_SET(sqc, sqc, tis_lst_sz, csp->tis_lst_sz); + MLX5_SET(sqc, sqc, tis_num_0, csp->tisn); + MLX5_SET(sqc, sqc, cqn, csp->cqn); + + if
(MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5_CAP_INLINE_MODE_VPORT_CONTEXT) + MLX5_SET(sqc, sqc, min_wqe_inline_mode, csp->min_inline_mode); + + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.bfreg.index); + MLX5_SET(wq, wq, log_wq_pg_sz, csp->wq_ctrl->buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(wq, wq, dbr_addr, csp->wq_ctrl->db.dma); + + mlx5_fill_page_array(&csp->wq_ctrl->buf, (__be64 *)MLX5_ADDR_OF(wq, wq, pas)); + + err = mlx5_core_create_sq(mdev, in, inlen, sqn); + + kvfree(in); + + return err; +} + +struct mlx5e_modify_sq_param { + int curr_state; + int next_state; + bool rl_update; + int rl_index; +}; + +static int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, + struct mlx5e_modify_sq_param *p) +{ + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + + MLX5_SET(modify_sq_in, in, sq_state, p->curr_state); + MLX5_SET(sqc, sqc, state, p->next_state); + if (p->rl_update && p->next_state == MLX5_SQC_STATE_RDY) { + MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); + MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index); + } + + err = mlx5_core_modify_sq(mdev, sqn, in, inlen); + + kvfree(in); + + return err; +} + +static void mlx5e_destroy_sq(struct mlx5_core_dev *mdev, u32 sqn) +{ + mlx5_core_destroy_sq(mdev, sqn); +} + +static int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev, + struct mlx5e_sq_param *param, + struct mlx5e_create_sq_param *csp, + u32 *sqn) +{ + struct mlx5e_modify_sq_param msp = {0}; + int err; + + err = mlx5e_create_sq(mdev, param, csp, sqn); + if (err) + return err; + + msp.curr_state = MLX5_SQC_STATE_RST; + msp.next_state = MLX5_SQC_STATE_RDY; + err = mlx5e_modify_sq(mdev, *sqn, &msp); + if (err) + mlx5e_destroy_sq(mdev, *sqn); + + return err; +} + +static int mlx5e_set_sq_maxrate(struct net_device *dev, + struct mlx5e_txqsq *sq, u32 rate); + +static int mlx5e_open_txqsq(struct mlx5e_channel *c, + u32 tisn, + int txq_ix, + struct mlx5e_params *params, + struct mlx5e_sq_param *param, + struct mlx5e_txqsq *sq) +{ + struct mlx5e_create_sq_param csp = {}; + u32 tx_rate; + int err; + + err = mlx5e_alloc_txqsq(c, txq_ix, params, param, sq); + if (err) + return err; + + csp.tisn = tisn; + csp.tis_lst_sz = 1; + csp.cqn = sq->cq.mcq.cqn; + csp.wq_ctrl = &sq->wq_ctrl; + csp.min_inline_mode = sq->min_inline_mode; + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + if (err) + goto err_free_txqsq; + + tx_rate = c->priv->tx_rates[sq->txq_ix]; + if (tx_rate) + mlx5e_set_sq_maxrate(c->netdev, sq, tx_rate); + + return 0; + +err_free_txqsq: + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + mlx5e_free_txqsq(sq); + + return err; +} + +static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) +{ + WARN_ONCE(sq->cc != sq->pc, + "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", + sq->sqn, sq->cc, sq->pc); + sq->cc = 0; + sq->dma_fifo_cc = 0; + sq->pc = 0; +} + +static void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq) +{ + sq->txq = netdev_get_tx_queue(sq->channel->netdev, sq->txq_ix); + clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); + set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + netdev_tx_reset_queue(sq->txq); + netif_tx_start_queue(sq->txq); +} + +static inline void netif_tx_disable_queue(struct netdev_queue *txq) +{ + __netif_tx_lock_bh(txq); + 
netif_tx_stop_queue(txq); + __netif_tx_unlock_bh(txq); +} + +static void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq) +{ + struct mlx5e_channel *c = sq->channel; + + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + /* prevent netif_tx_wake_queue */ + napi_synchronize(&c->napi); + + netif_tx_disable_queue(sq->txq); + + /* last doorbell out, godspeed .. */ + if (mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, 1)) { + struct mlx5e_tx_wqe *nop; + + sq->db.wqe_info[(sq->pc & sq->wq.sz_m1)].skb = NULL; + nop = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc); + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &nop->ctrl); + } +} + +static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) +{ + struct mlx5e_channel *c = sq->channel; + struct mlx5_core_dev *mdev = c->mdev; + struct mlx5_rate_limit rl = {0}; + + mlx5e_destroy_sq(mdev, sq->sqn); + if (sq->rate_limit) { + rl.rate = sq->rate_limit; + mlx5_rl_remove_rate(mdev, &rl); + } + mlx5e_free_txqsq_descs(sq); + mlx5e_free_txqsq(sq); +} + +static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) +{ + unsigned long exp_time = jiffies + msecs_to_jiffies(2000); + + while (time_before(jiffies, exp_time)) { + if (sq->cc == sq->pc) + return 0; + + msleep(20); + } + + netdev_err(sq->channel->netdev, + "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", + sq->sqn, sq->cc, sq->pc); + + return -ETIMEDOUT; +} + +static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state) +{ + struct mlx5_core_dev *mdev = sq->channel->mdev; + struct net_device *dev = sq->channel->netdev; + struct mlx5e_modify_sq_param msp = {0}; + int err; + + msp.curr_state = curr_state; + msp.next_state = MLX5_SQC_STATE_RST; + + err = mlx5e_modify_sq(mdev, sq->sqn, &msp); + if (err) { + netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn); + return err; + } + + memset(&msp, 0, sizeof(msp)); + msp.curr_state = MLX5_SQC_STATE_RST; + msp.next_state = MLX5_SQC_STATE_RDY; + + err = mlx5e_modify_sq(mdev, sq->sqn, &msp); + if (err) { + netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn); + return err; + } + + return 0; +} + +static void mlx5e_sq_recover(struct work_struct *work) +{ + struct mlx5e_txqsq_recover *recover = + container_of(work, struct mlx5e_txqsq_recover, + recover_work); + struct mlx5e_txqsq *sq = container_of(recover, struct mlx5e_txqsq, + recover); + struct mlx5_core_dev *mdev = sq->channel->mdev; + struct net_device *dev = sq->channel->netdev; + u8 state; + int err; + + err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); + if (err) { + netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", + sq->sqn, err); + return; + } + + if (state != MLX5_RQC_STATE_ERR) { + netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn); + return; + } + + netif_tx_disable_queue(sq->txq); + + if (mlx5e_wait_for_sq_flush(sq)) + return; + + /* If the interval between two consecutive recovers per SQ is too + * short, don't recover to avoid infinite loop of ERR_CQE -> recover. + * If we reached this state, there is probably a bug that needs to be + * fixed. let's keep the queue close and let tx timeout cleanup. + */ + if (jiffies_to_msecs(jiffies - recover->last_recover) < + MLX5E_SQ_RECOVER_MIN_INTERVAL) { + netdev_err(dev, "Recover SQ 0x%x canceled, too many error CQEs\n", + sq->sqn); + return; + } + + /* At this point, no new packets will arrive from the stack as TXQ is + * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all + * pending WQEs. SQ can safely reset the SQ. 
+ */ + if (mlx5e_sq_to_ready(sq, state)) + return; + + mlx5e_reset_txqsq_cc_pc(sq); + sq->stats.recover++; + recover->last_recover = jiffies; + mlx5e_activate_txqsq(sq); +} + +static int mlx5e_open_icosq(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_sq_param *param, + struct mlx5e_icosq *sq) +{ + struct mlx5e_create_sq_param csp = {}; + int err; + + err = mlx5e_alloc_icosq(c, param, sq); + if (err) + return err; + + csp.cqn = sq->cq.mcq.cqn; + csp.wq_ctrl = &sq->wq_ctrl; + csp.min_inline_mode = params->tx_min_inline_mode; + set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + if (err) + goto err_free_icosq; + + return 0; + +err_free_icosq: + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + mlx5e_free_icosq(sq); + + return err; +} + +static void mlx5e_close_icosq(struct mlx5e_icosq *sq) +{ + struct mlx5e_channel *c = sq->channel; + + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + napi_synchronize(&c->napi); + + mlx5e_destroy_sq(c->mdev, sq->sqn); + mlx5e_free_icosq(sq); +} + +static int mlx5e_open_xdpsq(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_sq_param *param, + struct mlx5e_xdpsq *sq) +{ + unsigned int ds_cnt = MLX5E_XDP_TX_DS_COUNT; + struct mlx5e_create_sq_param csp = {}; + unsigned int inline_hdr_sz = 0; + int err; + int i; + + err = mlx5e_alloc_xdpsq(c, params, param, sq); + if (err) + return err; + + csp.tis_lst_sz = 1; + csp.tisn = c->priv->tisn[0]; /* tc = 0 */ + csp.cqn = sq->cq.mcq.cqn; + csp.wq_ctrl = &sq->wq_ctrl; + csp.min_inline_mode = sq->min_inline_mode; + set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn); + if (err) + goto err_free_xdpsq; + + if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) { + inline_hdr_sz = MLX5E_XDP_MIN_INLINE; + ds_cnt++; + } + + /* Pre initialize fixed WQE fields */ + for (i = 0; i < mlx5_wq_cyc_get_size(&sq->wq); i++) { + struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(&sq->wq, i); + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + struct mlx5_wqe_eth_seg *eseg = &wqe->eth; + struct mlx5_wqe_data_seg *dseg; + + cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); + eseg->inline_hdr.sz = cpu_to_be16(inline_hdr_sz); + + dseg = (struct mlx5_wqe_data_seg *)cseg + (ds_cnt - 1); + dseg->lkey = sq->mkey_be; + } + + return 0; + +err_free_xdpsq: + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + mlx5e_free_xdpsq(sq); + + return err; +} + +static void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq) +{ + struct mlx5e_channel *c = sq->channel; + + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + napi_synchronize(&c->napi); + + mlx5e_destroy_sq(c->mdev, sq->sqn); + mlx5e_free_xdpsq_descs(sq); + mlx5e_free_xdpsq(sq); +} + +static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, + struct mlx5e_cq_param *param, + struct mlx5e_cq *cq) +{ + struct mlx5_core_cq *mcq = &cq->mcq; + int eqn_not_used; + unsigned int irqn; + int err; + u32 i; + + err = mlx5_cqwq_create(mdev, ¶m->wq, param->cqc, &cq->wq, + &cq->wq_ctrl); + if (err) + return err; + + mlx5_vector2eqn(mdev, param->eq_ix, &eqn_not_used, &irqn); + + mcq->cqe_sz = 64; + mcq->set_ci_db = cq->wq_ctrl.db.db; + mcq->arm_db = cq->wq_ctrl.db.db + 1; + *mcq->set_ci_db = 0; + *mcq->arm_db = 0; + mcq->vector = param->eq_ix; + mcq->comp = mlx5e_completion_event; + mcq->event = mlx5e_cq_error_event; + mcq->irqn = irqn; + + for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); + + cqe->op_own = 0xf1; + } + + 
cq->mdev = mdev; + + return 0; +} + +static int mlx5e_alloc_cq(struct mlx5e_channel *c, + struct mlx5e_cq_param *param, + struct mlx5e_cq *cq) +{ + struct mlx5_core_dev *mdev = c->priv->mdev; + int err; + + param->wq.buf_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = cpu_to_node(c->cpu); + param->eq_ix = c->ix; + + err = mlx5e_alloc_cq_common(mdev, param, cq); + + cq->napi = &c->napi; + cq->channel = c; + + return err; +} + +static void mlx5e_free_cq(struct mlx5e_cq *cq) +{ + mlx5_cqwq_destroy(&cq->wq_ctrl); +} + +static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) +{ + struct mlx5_core_dev *mdev = cq->mdev; + struct mlx5_core_cq *mcq = &cq->mcq; + + void *in; + void *cqc; + int inlen; + unsigned int irqn_not_used; + int eqn; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + sizeof(u64) * cq->wq_ctrl.frag_buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + + memcpy(cqc, param->cqc, sizeof(param->cqc)); + + mlx5_fill_page_frag_array(&cq->wq_ctrl.frag_buf, + (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas)); + + mlx5_vector2eqn(mdev, param->eq_ix, &eqn, &irqn_not_used); + + MLX5_SET(cqc, cqc, cq_period_mode, param->cq_period_mode); + MLX5_SET(cqc, cqc, c_eqn, eqn); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.frag_buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); + + err = mlx5_core_create_cq(mdev, mcq, in, inlen); + + kvfree(in); + + if (err) + return err; + + mlx5e_cq_arm(cq); + + return 0; +} + +static void mlx5e_destroy_cq(struct mlx5e_cq *cq) +{ + mlx5_core_destroy_cq(cq->mdev, &cq->mcq); +} + +static int mlx5e_open_cq(struct mlx5e_channel *c, + struct net_dim_cq_moder moder, + struct mlx5e_cq_param *param, + struct mlx5e_cq *cq) +{ + struct mlx5_core_dev *mdev = c->mdev; + int err; + + err = mlx5e_alloc_cq(c, param, cq); + if (err) + return err; + + err = mlx5e_create_cq(cq, param); + if (err) + goto err_free_cq; + + if (MLX5_CAP_GEN(mdev, cq_moderation)) + mlx5_core_modify_cq_moderation(mdev, &cq->mcq, moder.usec, moder.pkts); + return 0; + +err_free_cq: + mlx5e_free_cq(cq); + + return err; +} + +static void mlx5e_close_cq(struct mlx5e_cq *cq) +{ + mlx5e_destroy_cq(cq); + mlx5e_free_cq(cq); +} + +static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix) +{ + return cpumask_first(priv->mdev->priv.irq_info[ix].mask); +} + +static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_channel_param *cparam) +{ + int err; + int tc; + + for (tc = 0; tc < c->num_tc; tc++) { + err = mlx5e_open_cq(c, params->tx_cq_moderation, + &cparam->tx_cq, &c->sq[tc].cq); + if (err) + goto err_close_tx_cqs; + } + + return 0; + +err_close_tx_cqs: + for (tc--; tc >= 0; tc--) + mlx5e_close_cq(&c->sq[tc].cq); + + return err; +} + +static void mlx5e_close_tx_cqs(struct mlx5e_channel *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_close_cq(&c->sq[tc].cq); +} + +static int mlx5e_open_sqs(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_channel_param *cparam) +{ + int err; + int tc; + + for (tc = 0; tc < params->num_tc; tc++) { + int txq_ix = c->ix + tc * params->num_channels; + + err = mlx5e_open_txqsq(c, c->priv->tisn[tc], txq_ix, + params, &cparam->sq, &c->sq[tc]); + if (err) + goto err_close_sqs; + } + + return 0; + +err_close_sqs: + for (tc--; tc >= 0; tc--) + mlx5e_close_txqsq(&c->sq[tc]); + + return err; 
+} + +static void mlx5e_close_sqs(struct mlx5e_channel *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_close_txqsq(&c->sq[tc]); +} + +static int mlx5e_set_sq_maxrate(struct net_device *dev, + struct mlx5e_txqsq *sq, u32 rate) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_modify_sq_param msp = {0}; + struct mlx5_rate_limit rl = {0}; + u16 rl_index = 0; + int err; + + if (rate == sq->rate_limit) + /* nothing to do */ + return 0; + + if (sq->rate_limit) { + rl.rate = sq->rate_limit; + /* remove current rl index to free space to next ones */ + mlx5_rl_remove_rate(mdev, &rl); + } + + sq->rate_limit = 0; + + if (rate) { + rl.rate = rate; + err = mlx5_rl_add_rate(mdev, &rl_index, &rl); + if (err) { + netdev_err(dev, "Failed configuring rate %u: %d\n", + rate, err); + return err; + } + } + + msp.curr_state = MLX5_SQC_STATE_RDY; + msp.next_state = MLX5_SQC_STATE_RDY; + msp.rl_index = rl_index; + msp.rl_update = true; + err = mlx5e_modify_sq(mdev, sq->sqn, &msp); + if (err) { + netdev_err(dev, "Failed configuring rate %u: %d\n", + rate, err); + /* remove the rate from the table */ + if (rate) + mlx5_rl_remove_rate(mdev, &rl); + return err; + } + + sq->rate_limit = rate; + return 0; +} + +static int mlx5e_set_tx_maxrate(struct net_device *dev, int index, u32 rate) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_txqsq *sq = priv->txq2sq[index]; + int err = 0; + + if (!mlx5_rl_is_supported(mdev)) { + netdev_err(dev, "Rate limiting is not supported on this device\n"); + return -EINVAL; + } + + /* rate is given in Mb/sec, HW config is in Kb/sec */ + rate = rate << 10; + + /* Check whether rate in valid range, 0 is always valid */ + if (rate && !mlx5_rl_is_in_range(mdev, rate)) { + netdev_err(dev, "TX rate %u, is not in range\n", rate); + return -ERANGE; + } + + mutex_lock(&priv->state_lock); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + err = mlx5e_set_sq_maxrate(dev, sq, rate); + if (!err) + priv->tx_rates[index] = rate; + mutex_unlock(&priv->state_lock); + + return err; +} + +static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, + struct mlx5e_params *params, + struct mlx5e_channel_param *cparam, + struct mlx5e_channel **cp) +{ + struct net_dim_cq_moder icocq_moder = {0, 0}; + struct net_device *netdev = priv->netdev; + int cpu = mlx5e_get_cpu(priv, ix); + struct mlx5e_channel *c; + unsigned int irq; + int err; + int eqn; + + c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu)); + if (!c) + return -ENOMEM; + + c->priv = priv; + c->mdev = priv->mdev; + c->tstamp = &priv->tstamp; + c->ix = ix; + c->cpu = cpu; + c->pdev = &priv->mdev->pdev->dev; + c->netdev = priv->netdev; + c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); + c->num_tc = params->num_tc; + c->xdp = !!params->xdp_prog; + + mlx5_vector2eqn(priv->mdev, ix, &eqn, &irq); + c->irq_desc = irq_to_desc(irq); + + netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64); + + err = mlx5e_open_cq(c, icocq_moder, &cparam->icosq_cq, &c->icosq.cq); + if (err) + goto err_napi_del; + + err = mlx5e_open_tx_cqs(c, params, cparam); + if (err) + goto err_close_icosq_cq; + + err = mlx5e_open_cq(c, params->rx_cq_moderation, &cparam->rx_cq, &c->rq.cq); + if (err) + goto err_close_tx_cqs; + + /* XDP SQ CQ params are same as normal TXQ sq CQ params */ + err = c->xdp ? 
mlx5e_open_cq(c, params->tx_cq_moderation, + &cparam->tx_cq, &c->rq.xdpsq.cq) : 0; + if (err) + goto err_close_rx_cq; + + napi_enable(&c->napi); + + err = mlx5e_open_icosq(c, params, &cparam->icosq, &c->icosq); + if (err) + goto err_disable_napi; + + err = mlx5e_open_sqs(c, params, cparam); + if (err) + goto err_close_icosq; + + err = c->xdp ? mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, &c->rq.xdpsq) : 0; + if (err) + goto err_close_sqs; + + err = mlx5e_open_rq(c, params, &cparam->rq, &c->rq); + if (err) + goto err_close_xdp_sq; + + *cp = c; + + return 0; +err_close_xdp_sq: + if (c->xdp) + mlx5e_close_xdpsq(&c->rq.xdpsq); + +err_close_sqs: + mlx5e_close_sqs(c); + +err_close_icosq: + mlx5e_close_icosq(&c->icosq); + +err_disable_napi: + napi_disable(&c->napi); + if (c->xdp) + mlx5e_close_cq(&c->rq.xdpsq.cq); + +err_close_rx_cq: + mlx5e_close_cq(&c->rq.cq); + +err_close_tx_cqs: + mlx5e_close_tx_cqs(c); + +err_close_icosq_cq: + mlx5e_close_cq(&c->icosq.cq); + +err_napi_del: + netif_napi_del(&c->napi); + kfree(c); + + return err; +} + +static void mlx5e_activate_channel(struct mlx5e_channel *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_activate_txqsq(&c->sq[tc]); + mlx5e_activate_rq(&c->rq); + netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix); +} + +static void mlx5e_deactivate_channel(struct mlx5e_channel *c) +{ + int tc; + + mlx5e_deactivate_rq(&c->rq); + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_deactivate_txqsq(&c->sq[tc]); +} + +static void mlx5e_close_channel(struct mlx5e_channel *c) +{ + mlx5e_close_rq(&c->rq); + if (c->xdp) + mlx5e_close_xdpsq(&c->rq.xdpsq); + mlx5e_close_sqs(c); + mlx5e_close_icosq(&c->icosq); + napi_disable(&c->napi); + if (c->xdp) + mlx5e_close_cq(&c->rq.xdpsq.cq); + mlx5e_close_cq(&c->rq.cq); + mlx5e_close_tx_cqs(c); + mlx5e_close_cq(&c->icosq.cq); + netif_napi_del(&c->napi); + + kfree(c); +} + +static void mlx5e_build_rq_param(struct mlx5e_priv *priv, + struct mlx5e_params *params, + struct mlx5e_rq_param *param) +{ + struct mlx5_core_dev *mdev = priv->mdev; + void *rqc = param->rqc; + void *wq = MLX5_ADDR_OF(rqc, rqc, wq); + + switch (params->rq_wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + MLX5_SET(wq, wq, log_wqe_num_of_strides, + mlx5e_mpwqe_get_log_num_strides(mdev, params) - + MLX5_MPWQE_LOG_NUM_STRIDES_BASE); + MLX5_SET(wq, wq, log_wqe_stride_size, + mlx5e_mpwqe_get_log_stride_size(mdev, params) - + MLX5_MPWQE_LOG_STRIDE_SZ_BASE); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ); + MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params)); + break; + default: /* MLX5_WQ_TYPE_LINKED_LIST */ + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST); + MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames); + } + + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe))); + MLX5_SET(wq, wq, pd, mdev->mlx5e_res.pdn); + MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter); + MLX5_SET(rqc, rqc, vsd, params->vlan_strip_disable); + MLX5_SET(rqc, rqc, scatter_fcs, params->scatter_fcs_en); + + param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev); + param->wq.linear = 1; +} + +static void mlx5e_build_drop_rq_param(struct mlx5e_priv *priv, + struct mlx5e_rq_param *param) +{ + struct mlx5_core_dev *mdev = priv->mdev; + void *rqc = param->rqc; + void *wq = MLX5_ADDR_OF(rqc, rqc, wq); + + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST); + MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe))); + MLX5_SET(rqc, rqc, 
counter_set_id, priv->drop_rq_q_counter); + + param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev); +} + +static void mlx5e_build_sq_param_common(struct mlx5e_priv *priv, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, pd, priv->mdev->mlx5e_res.pdn); + + param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev); +} + +static void mlx5e_build_sq_param(struct mlx5e_priv *priv, + struct mlx5e_params *params, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + mlx5e_build_sq_param_common(priv, param); + MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size); + MLX5_SET(sqc, sqc, allow_swp, !!MLX5_IPSEC_DEV(priv->mdev)); +} + +static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param) +{ + void *cqc = param->cqc; + + MLX5_SET(cqc, cqc, uar_page, priv->mdev->priv.uar->index); +} + +static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv, + struct mlx5e_params *params, + struct mlx5e_cq_param *param) +{ + struct mlx5_core_dev *mdev = priv->mdev; + void *cqc = param->cqc; + u8 log_cq_size; + + switch (params->rq_wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + log_cq_size = mlx5e_mpwqe_get_log_rq_size(params) + + mlx5e_mpwqe_get_log_num_strides(mdev, params); + break; + default: /* MLX5_WQ_TYPE_LINKED_LIST */ + log_cq_size = params->log_rq_mtu_frames; + } + + MLX5_SET(cqc, cqc, log_cq_size, log_cq_size); + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) { + MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_CSUM); + MLX5_SET(cqc, cqc, cqe_comp_en, 1); + } + + mlx5e_build_common_cq_param(priv, param); + param->cq_period_mode = params->rx_cq_moderation.cq_period_mode; +} + +static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv, + struct mlx5e_params *params, + struct mlx5e_cq_param *param) +{ + void *cqc = param->cqc; + + MLX5_SET(cqc, cqc, log_cq_size, params->log_sq_size); + + mlx5e_build_common_cq_param(priv, param); + param->cq_period_mode = params->tx_cq_moderation.cq_period_mode; +} + +static void mlx5e_build_ico_cq_param(struct mlx5e_priv *priv, + u8 log_wq_size, + struct mlx5e_cq_param *param) +{ + void *cqc = param->cqc; + + MLX5_SET(cqc, cqc, log_cq_size, log_wq_size); + + mlx5e_build_common_cq_param(priv, param); + + param->cq_period_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE; +} + +static void mlx5e_build_icosq_param(struct mlx5e_priv *priv, + u8 log_wq_size, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + mlx5e_build_sq_param_common(priv, param); + + MLX5_SET(wq, wq, log_wq_sz, log_wq_size); + MLX5_SET(sqc, sqc, reg_umr, MLX5_CAP_ETH(priv->mdev, reg_umr_sq)); +} + +static void mlx5e_build_xdpsq_param(struct mlx5e_priv *priv, + struct mlx5e_params *params, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + mlx5e_build_sq_param_common(priv, param); + MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size); +} + +static void mlx5e_build_channel_param(struct mlx5e_priv *priv, + struct mlx5e_params *params, + struct mlx5e_channel_param *cparam) +{ + u8 icosq_log_wq_sz = MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE; + + mlx5e_build_rq_param(priv, params, &cparam->rq); + mlx5e_build_sq_param(priv, params, &cparam->sq); + mlx5e_build_xdpsq_param(priv, params, &cparam->xdp_sq); + mlx5e_build_icosq_param(priv, icosq_log_wq_sz, &cparam->icosq); + 
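/* completion queue parameters for the RX, TX and ICO queues */ +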
mlx5e_build_rx_cq_param(priv, params, &cparam->rx_cq); + mlx5e_build_tx_cq_param(priv, params, &cparam->tx_cq); + mlx5e_build_ico_cq_param(priv, icosq_log_wq_sz, &cparam->icosq_cq); +} + +int mlx5e_open_channels(struct mlx5e_priv *priv, + struct mlx5e_channels *chs) +{ + struct mlx5e_channel_param *cparam; + int err = -ENOMEM; + int i; + + chs->num = chs->params.num_channels; + + chs->c = kcalloc(chs->num, sizeof(struct mlx5e_channel *), GFP_KERNEL); + cparam = kzalloc(sizeof(struct mlx5e_channel_param), GFP_KERNEL); + if (!chs->c || !cparam) + goto err_free; + + mlx5e_build_channel_param(priv, &chs->params, cparam); + for (i = 0; i < chs->num; i++) { + err = mlx5e_open_channel(priv, i, &chs->params, cparam, &chs->c[i]); + if (err) + goto err_close_channels; + } + + kfree(cparam); + return 0; + +err_close_channels: + for (i--; i >= 0; i--) + mlx5e_close_channel(chs->c[i]); + +err_free: + kfree(chs->c); + kfree(cparam); + chs->num = 0; + return err; +} + +static void mlx5e_activate_channels(struct mlx5e_channels *chs) +{ + int i; + + for (i = 0; i < chs->num; i++) + mlx5e_activate_channel(chs->c[i]); +} + +static int mlx5e_wait_channels_min_rx_wqes(struct mlx5e_channels *chs) +{ + int err = 0; + int i; + + for (i = 0; i < chs->num; i++) { + err = mlx5e_wait_for_min_rx_wqes(&chs->c[i]->rq); + if (err) + break; + } + + return err; +} + +static void mlx5e_deactivate_channels(struct mlx5e_channels *chs) +{ + int i; + + for (i = 0; i < chs->num; i++) + mlx5e_deactivate_channel(chs->c[i]); +} + +void mlx5e_close_channels(struct mlx5e_channels *chs) +{ + int i; + + for (i = 0; i < chs->num; i++) + mlx5e_close_channel(chs->c[i]); + + kfree(chs->c); + chs->num = 0; +} + +static int +mlx5e_create_rqt(struct mlx5e_priv *priv, int sz, struct mlx5e_rqt *rqt) +{ + struct mlx5_core_dev *mdev = priv->mdev; + void *rqtc; + int inlen; + int err; + u32 *in; + int i; + + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + MLX5_SET(rqtc, rqtc, rqt_max_size, sz); + + for (i = 0; i < sz; i++) + MLX5_SET(rqtc, rqtc, rq_num[i], priv->drop_rq.rqn); + + err = mlx5_core_create_rqt(mdev, in, inlen, &rqt->rqtn); + if (!err) + rqt->enabled = true; + + kvfree(in); + return err; +} + +void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt) +{ + rqt->enabled = false; + mlx5_core_destroy_rqt(priv->mdev, rqt->rqtn); +} + +int mlx5e_create_indirect_rqt(struct mlx5e_priv *priv) +{ + struct mlx5e_rqt *rqt = &priv->indir_rqt; + int err; + + err = mlx5e_create_rqt(priv, MLX5E_INDIR_RQT_SIZE, rqt); + if (err) + mlx5_core_warn(priv->mdev, "create indirect rqts failed, %d\n", err); + return err; +} + +int mlx5e_create_direct_rqts(struct mlx5e_priv *priv) +{ + struct mlx5e_rqt *rqt; + int err; + int ix; + + for (ix = 0; ix < priv->profile->max_nch(priv->mdev); ix++) { + rqt = &priv->direct_tir[ix].rqt; + err = mlx5e_create_rqt(priv, 1 /*size */, rqt); + if (err) + goto err_destroy_rqts; + } + + return 0; + +err_destroy_rqts: + mlx5_core_warn(priv->mdev, "create direct rqts failed, %d\n", err); + for (ix--; ix >= 0; ix--) + mlx5e_destroy_rqt(priv, &priv->direct_tir[ix].rqt); + + return err; +} + +void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv) +{ + int i; + + for (i = 0; i < priv->profile->max_nch(priv->mdev); i++) + mlx5e_destroy_rqt(priv, &priv->direct_tir[i].rqt); +} + +static int mlx5e_rx_hash_fn(int hfunc) +{ + return (hfunc == 
ETH_RSS_HASH_TOP) ? + MLX5_RX_HASH_FN_TOEPLITZ : + MLX5_RX_HASH_FN_INVERTED_XOR8; +} + +int mlx5e_bits_invert(unsigned long a, int size) +{ + int inv = 0; + int i; + + for (i = 0; i < size; i++) + inv |= (test_bit(size - i - 1, &a) ? 1 : 0) << i; + + return inv; +} + +static void mlx5e_fill_rqt_rqns(struct mlx5e_priv *priv, int sz, + struct mlx5e_redirect_rqt_param rrp, void *rqtc) +{ + int i; + + for (i = 0; i < sz; i++) { + u32 rqn; + + if (rrp.is_rss) { + int ix = i; + + if (rrp.rss.hfunc == ETH_RSS_HASH_XOR) + ix = mlx5e_bits_invert(i, ilog2(sz)); + + ix = priv->channels.params.indirection_rqt[ix]; + rqn = rrp.rss.channels->c[ix]->rq.rqn; + } else { + rqn = rrp.rqn; + } + MLX5_SET(rqtc, rqtc, rq_num[i], rqn); + } +} + +int mlx5e_redirect_rqt(struct mlx5e_priv *priv, u32 rqtn, int sz, + struct mlx5e_redirect_rqt_param rrp) +{ + struct mlx5_core_dev *mdev = priv->mdev; + void *rqtc; + int inlen; + u32 *in; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * sz; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1); + mlx5e_fill_rqt_rqns(priv, sz, rrp, rqtc); + err = mlx5_core_modify_rqt(mdev, rqtn, in, inlen); + + kvfree(in); + return err; +} + +static u32 mlx5e_get_direct_rqn(struct mlx5e_priv *priv, int ix, + struct mlx5e_redirect_rqt_param rrp) +{ + if (!rrp.is_rss) + return rrp.rqn; + + if (ix >= rrp.rss.channels->num) + return priv->drop_rq.rqn; + + return rrp.rss.channels->c[ix]->rq.rqn; +} + +static void mlx5e_redirect_rqts(struct mlx5e_priv *priv, + struct mlx5e_redirect_rqt_param rrp) +{ + u32 rqtn; + int ix; + + if (priv->indir_rqt.enabled) { + /* RSS RQ table */ + rqtn = priv->indir_rqt.rqtn; + mlx5e_redirect_rqt(priv, rqtn, MLX5E_INDIR_RQT_SIZE, rrp); + } + + for (ix = 0; ix < priv->profile->max_nch(priv->mdev); ix++) { + struct mlx5e_redirect_rqt_param direct_rrp = { + .is_rss = false, + { + .rqn = mlx5e_get_direct_rqn(priv, ix, rrp) + }, + }; + + /* Direct RQ Tables */ + if (!priv->direct_tir[ix].rqt.enabled) + continue; + + rqtn = priv->direct_tir[ix].rqt.rqtn; + mlx5e_redirect_rqt(priv, rqtn, 1, direct_rrp); + } +} + +static void mlx5e_redirect_rqts_to_channels(struct mlx5e_priv *priv, + struct mlx5e_channels *chs) +{ + struct mlx5e_redirect_rqt_param rrp = { + .is_rss = true, + { + .rss = { + .channels = chs, + .hfunc = chs->params.rss_hfunc, + } + }, + }; + + mlx5e_redirect_rqts(priv, rrp); +} + +static void mlx5e_redirect_rqts_to_drop(struct mlx5e_priv *priv) +{ + struct mlx5e_redirect_rqt_param drop_rrp = { + .is_rss = false, + { + .rqn = priv->drop_rq.rqn, + }, + }; + + mlx5e_redirect_rqts(priv, drop_rrp); +} + +static void mlx5e_build_tir_ctx_lro(struct mlx5e_params *params, void *tirc) +{ + if (!params->lro_en) + return; + +#define ROUGH_MAX_L2_L3_HDR_SZ 256 + + MLX5_SET(tirc, tirc, lro_enable_mask, + MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO | + MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO); + MLX5_SET(tirc, tirc, lro_max_ip_payload_size, + (params->lro_wqe_sz - ROUGH_MAX_L2_L3_HDR_SZ) >> 8); + MLX5_SET(tirc, tirc, lro_timeout_period_usecs, params->lro_timeout); +} + +void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_params *params, + enum mlx5e_traffic_types tt, + void *tirc, bool inner) +{ + void *hfso = inner ? 
MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner) : + MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + +#define MLX5_HASH_IP (MLX5_HASH_FIELD_SEL_SRC_IP |\ + MLX5_HASH_FIELD_SEL_DST_IP) + +#define MLX5_HASH_IP_L4PORTS (MLX5_HASH_FIELD_SEL_SRC_IP |\ + MLX5_HASH_FIELD_SEL_DST_IP |\ + MLX5_HASH_FIELD_SEL_L4_SPORT |\ + MLX5_HASH_FIELD_SEL_L4_DPORT) + +#define MLX5_HASH_IP_IPSEC_SPI (MLX5_HASH_FIELD_SEL_SRC_IP |\ + MLX5_HASH_FIELD_SEL_DST_IP |\ + MLX5_HASH_FIELD_SEL_IPSEC_SPI) + + MLX5_SET(tirc, tirc, rx_hash_fn, mlx5e_rx_hash_fn(params->rss_hfunc)); + if (params->rss_hfunc == ETH_RSS_HASH_TOP) { + void *rss_key = MLX5_ADDR_OF(tirc, tirc, + rx_hash_toeplitz_key); + size_t len = MLX5_FLD_SZ_BYTES(tirc, + rx_hash_toeplitz_key); + + MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + memcpy(rss_key, params->toeplitz_hash_key, len); + } + + switch (tt) { + case MLX5E_TT_IPV4_TCP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_TCP); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_L4PORTS); + break; + + case MLX5E_TT_IPV6_TCP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_TCP); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_L4PORTS); + break; + + case MLX5E_TT_IPV4_UDP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_UDP); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_L4PORTS); + break; + + case MLX5E_TT_IPV6_UDP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_UDP); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_L4PORTS); + break; + + case MLX5E_TT_IPV4_IPSEC_AH: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_IPSEC_SPI); + break; + + case MLX5E_TT_IPV6_IPSEC_AH: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_IPSEC_SPI); + break; + + case MLX5E_TT_IPV4_IPSEC_ESP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_IPSEC_SPI); + break; + + case MLX5E_TT_IPV6_IPSEC_ESP: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP_IPSEC_SPI); + break; + + case MLX5E_TT_IPV4: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP); + break; + + case MLX5E_TT_IPV6: + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, + MLX5_HASH_IP); + break; + default: + WARN_ONCE(true, "%s: bad traffic type!\n", __func__); + } +} + +static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + + void *in; + void *tirc; + int inlen; + int err; + int tt; + int ix; + + inlen = MLX5_ST_SZ_BYTES(modify_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + 
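/* build one LRO modify context and apply it to every indirect and direct TIR */ +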
MLX5_SET(modify_tir_in, in, bitmask.lro, 1); + tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx); + + mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc); + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { + err = mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in, + inlen); + if (err) + goto free_in; + } + + for (ix = 0; ix < priv->profile->max_nch(priv->mdev); ix++) { + err = mlx5_core_modify_tir(mdev, priv->direct_tir[ix].tirn, + in, inlen); + if (err) + goto free_in; + } + +free_in: + kvfree(in); + + return err; +} + +static void mlx5e_build_inner_indir_tir_ctx(struct mlx5e_priv *priv, + enum mlx5e_traffic_types tt, + u32 *tirc) +{ + MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn); + + mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc); + + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, priv->indir_rqt.rqtn); + MLX5_SET(tirc, tirc, tunneled_offload_en, 0x1); + + mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, true); +} + +static int mlx5e_set_mtu(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, u16 mtu) +{ + u16 hw_mtu = MLX5E_SW2HW_MTU(params, mtu); + int err; + + err = mlx5_set_port_mtu(mdev, hw_mtu, 1); + if (err) + return err; + + /* Update vport context MTU */ + mlx5_modify_nic_vport_mtu(mdev, hw_mtu); + return 0; +} + +static void mlx5e_query_mtu(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, u16 *mtu) +{ + u16 hw_mtu = 0; + int err; + + err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu); + if (err || !hw_mtu) /* fallback to port oper mtu */ + mlx5_query_port_oper_mtu(mdev, &hw_mtu, 1); + + *mtu = MLX5E_HW2SW_MTU(params, hw_mtu); +} + +static int mlx5e_set_dev_port_mtu(struct mlx5e_priv *priv) +{ + struct mlx5e_params *params = &priv->channels.params; + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + u16 mtu; + int err; + + err = mlx5e_set_mtu(mdev, params, params->sw_mtu); + if (err) + return err; + + mlx5e_query_mtu(mdev, params, &mtu); + if (mtu != params->sw_mtu) + netdev_warn(netdev, "%s: VPort MTU %d is different than netdev mtu %d\n", + __func__, mtu, params->sw_mtu); + + params->sw_mtu = mtu; + return 0; +} + +static void mlx5e_netdev_set_tcs(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int nch = priv->channels.params.num_channels; + int ntc = priv->channels.params.num_tc; + int tc; + + netdev_reset_tc(netdev); + + if (ntc == 1) + return; + + netdev_set_num_tc(netdev, ntc); + + /* Map netdev TCs to offset 0 + * We have our own UP to TXQ mapping for QoS + */ + for (tc = 0; tc < ntc; tc++) + netdev_set_tc_queue(netdev, tc, nch, 0); +} + +static void mlx5e_build_channels_tx_maps(struct mlx5e_priv *priv) +{ + struct mlx5e_channel *c; + struct mlx5e_txqsq *sq; + int i, tc; + + for (i = 0; i < priv->channels.num; i++) + for (tc = 0; tc < priv->profile->max_tc; tc++) + priv->channel_tc2txq[i][tc] = i + tc * priv->channels.num; + + for (i = 0; i < priv->channels.num; i++) { + c = priv->channels.c[i]; + for (tc = 0; tc < c->num_tc; tc++) { + sq = &c->sq[tc]; + priv->txq2sq[sq->txq_ix] = sq; + } + } +} + +void mlx5e_activate_priv_channels(struct mlx5e_priv *priv) +{ + int num_txqs = priv->channels.num * priv->channels.params.num_tc; + struct net_device *netdev = priv->netdev; + + mlx5e_netdev_set_tcs(netdev); + netif_set_real_num_tx_queues(netdev, num_txqs); + netif_set_real_num_rx_queues(netdev, priv->channels.num); + + mlx5e_build_channels_tx_maps(priv); + 
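/* activate the channels and start TX before redirecting the RQ tables to them */ +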
mlx5e_activate_channels(&priv->channels); + netif_tx_start_all_queues(priv->netdev); + + if (MLX5_VPORT_MANAGER(priv->mdev)) + mlx5e_add_sqs_fwd_rules(priv); + + mlx5e_wait_channels_min_rx_wqes(&priv->channels); + mlx5e_redirect_rqts_to_channels(priv, &priv->channels); +} + +void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv) +{ + mlx5e_redirect_rqts_to_drop(priv); + + if (MLX5_VPORT_MANAGER(priv->mdev)) + mlx5e_remove_sqs_fwd_rules(priv); + + /* FIXME: This is a W/A only for tx timeout watch dog false alarm when + * polling for inactive tx queues. + */ + netif_tx_stop_all_queues(priv->netdev); + netif_tx_disable(priv->netdev); + mlx5e_deactivate_channels(&priv->channels); +} + +void mlx5e_switch_priv_channels(struct mlx5e_priv *priv, + struct mlx5e_channels *new_chs, + mlx5e_fp_hw_modify hw_modify) +{ + struct net_device *netdev = priv->netdev; + int new_num_txqs; + int carrier_ok; + new_num_txqs = new_chs->num * new_chs->params.num_tc; + + carrier_ok = netif_carrier_ok(netdev); + netif_carrier_off(netdev); + + if (new_num_txqs < netdev->real_num_tx_queues) + netif_set_real_num_tx_queues(netdev, new_num_txqs); + + mlx5e_deactivate_priv_channels(priv); + mlx5e_close_channels(&priv->channels); + + priv->channels = *new_chs; + + /* New channels are ready to roll, modify HW settings if needed */ + if (hw_modify) + hw_modify(priv); + + mlx5e_refresh_tirs(priv, false); + mlx5e_activate_priv_channels(priv); + + /* return carrier back if needed */ + if (carrier_ok) + netif_carrier_on(netdev); +} + +void mlx5e_timestamp_init(struct mlx5e_priv *priv) +{ + priv->tstamp.tx_type = HWTSTAMP_TX_OFF; + priv->tstamp.rx_filter = HWTSTAMP_FILTER_NONE; +} + +int mlx5e_open_locked(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + set_bit(MLX5E_STATE_OPENED, &priv->state); + + err = mlx5e_open_channels(priv, &priv->channels); + if (err) + goto err_clear_state_opened_flag; + + mlx5e_refresh_tirs(priv, false); + mlx5e_activate_priv_channels(priv); + if (priv->profile->update_carrier) + priv->profile->update_carrier(priv); + + if (priv->profile->update_stats) + queue_delayed_work(priv->wq, &priv->update_stats_work, 0); + + return 0; + +err_clear_state_opened_flag: + clear_bit(MLX5E_STATE_OPENED, &priv->state); + return err; +} + +int mlx5e_open(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + mutex_lock(&priv->state_lock); + err = mlx5e_open_locked(netdev); + if (!err) + mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP); + mutex_unlock(&priv->state_lock); + + if (mlx5e_vxlan_allowed(priv->mdev)) + udp_tunnel_get_rx_info(netdev); + + return err; +} + +int mlx5e_close_locked(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + /* May already be CLOSED in case a previous configuration operation + * (e.g RX/TX queue size change) that involves close&open failed. 
+ */ + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + return 0; + + clear_bit(MLX5E_STATE_OPENED, &priv->state); + + netif_carrier_off(priv->netdev); + mlx5e_deactivate_priv_channels(priv); + mlx5e_close_channels(&priv->channels); + + return 0; +} + +int mlx5e_close(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + if (!netif_device_present(netdev)) + return -ENODEV; + + mutex_lock(&priv->state_lock); + mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_DOWN); + err = mlx5e_close_locked(netdev); + mutex_unlock(&priv->state_lock); + + return err; +} + +static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev, + struct mlx5e_rq *rq, + struct mlx5e_rq_param *param) +{ + void *rqc = param->rqc; + void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); + int err; + + param->wq.db_numa_node = param->wq.buf_numa_node; + + err = mlx5_wq_ll_create(mdev, ¶m->wq, rqc_wq, &rq->wq, + &rq->wq_ctrl); + if (err) + return err; + + /* Mark as unused given "Drop-RQ" packets never reach XDP */ + xdp_rxq_info_unused(&rq->xdp_rxq); + + rq->mdev = mdev; + + return 0; +} + +static int mlx5e_alloc_drop_cq(struct mlx5_core_dev *mdev, + struct mlx5e_cq *cq, + struct mlx5e_cq_param *param) +{ + param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev); + param->wq.db_numa_node = dev_to_node(&mdev->pdev->dev); + + return mlx5e_alloc_cq_common(mdev, param, cq); +} + +static int mlx5e_open_drop_rq(struct mlx5e_priv *priv, + struct mlx5e_rq *drop_rq) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_cq_param cq_param = {}; + struct mlx5e_rq_param rq_param = {}; + struct mlx5e_cq *cq = &drop_rq->cq; + int err; + + mlx5e_build_drop_rq_param(priv, &rq_param); + + err = mlx5e_alloc_drop_cq(mdev, cq, &cq_param); + if (err) + return err; + + err = mlx5e_create_cq(cq, &cq_param); + if (err) + goto err_free_cq; + + err = mlx5e_alloc_drop_rq(mdev, drop_rq, &rq_param); + if (err) + goto err_destroy_cq; + + err = mlx5e_create_rq(drop_rq, &rq_param); + if (err) + goto err_free_rq; + + err = mlx5e_modify_rq_state(drop_rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); + if (err) + mlx5_core_warn(priv->mdev, "modify_rq_state failed, rx_if_down_packets won't be counted %d\n", err); + + return 0; + +err_free_rq: + mlx5e_free_rq(drop_rq); + +err_destroy_cq: + mlx5e_destroy_cq(cq); + +err_free_cq: + mlx5e_free_cq(cq); + + return err; +} + +static void mlx5e_close_drop_rq(struct mlx5e_rq *drop_rq) +{ + mlx5e_destroy_rq(drop_rq); + mlx5e_free_rq(drop_rq); + mlx5e_destroy_cq(&drop_rq->cq); + mlx5e_free_cq(&drop_rq->cq); +} + +int mlx5e_create_tis(struct mlx5_core_dev *mdev, int tc, + u32 underlay_qpn, u32 *tisn) +{ + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + MLX5_SET(tisc, tisc, prio, tc << 1); + MLX5_SET(tisc, tisc, underlay_qpn, underlay_qpn); + MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.td.tdn); + + if (mlx5_lag_is_lacp_owner(mdev)) + MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1); + + return mlx5_core_create_tis(mdev, in, sizeof(in), tisn); +} + +void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn) +{ + mlx5_core_destroy_tis(mdev, tisn); +} + +int mlx5e_create_tises(struct mlx5e_priv *priv) +{ + int err; + int tc; + + for (tc = 0; tc < priv->profile->max_tc; tc++) { + err = mlx5e_create_tis(priv->mdev, tc, 0, &priv->tisn[tc]); + if (err) + goto err_close_tises; + } + + return 0; + +err_close_tises: + for (tc--; tc >= 0; tc--) + mlx5e_destroy_tis(priv->mdev, priv->tisn[tc]); + + return err; +} + +void 
mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv) +{ + int tc; + + for (tc = 0; tc < priv->profile->max_tc; tc++) + mlx5e_destroy_tis(priv->mdev, priv->tisn[tc]); +} + +static void mlx5e_build_indir_tir_ctx(struct mlx5e_priv *priv, + enum mlx5e_traffic_types tt, + u32 *tirc) +{ + MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn); + + mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc); + + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, priv->indir_rqt.rqtn); + mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, false); +} + +static void mlx5e_build_direct_tir_ctx(struct mlx5e_priv *priv, u32 rqtn, u32 *tirc) +{ + MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn); + + mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc); + + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, rqtn); + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_INVERTED_XOR8); +} + +int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv) +{ + struct mlx5e_tir *tir; + void *tirc; + int inlen; + int i = 0; + int err; + u32 *in; + int tt; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { + memset(in, 0, inlen); + tir = &priv->indir_tir[tt]; + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + mlx5e_build_indir_tir_ctx(priv, tt, tirc); + err = mlx5e_create_tir(priv->mdev, tir, in, inlen); + if (err) { + mlx5_core_warn(priv->mdev, "create indirect tirs failed, %d\n", err); + goto err_destroy_inner_tirs; + } + } + + if (!mlx5e_tunnel_inner_ft_supported(priv->mdev)) + goto out; + + for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) { + memset(in, 0, inlen); + tir = &priv->inner_indir_tir[i]; + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + mlx5e_build_inner_indir_tir_ctx(priv, i, tirc); + err = mlx5e_create_tir(priv->mdev, tir, in, inlen); + if (err) { + mlx5_core_warn(priv->mdev, "create inner indirect tirs failed, %d\n", err); + goto err_destroy_inner_tirs; + } + } + +out: + kvfree(in); + + return 0; + +err_destroy_inner_tirs: + for (i--; i >= 0; i--) + mlx5e_destroy_tir(priv->mdev, &priv->inner_indir_tir[i]); + + for (tt--; tt >= 0; tt--) + mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[tt]); + + kvfree(in); + + return err; +} + +int mlx5e_create_direct_tirs(struct mlx5e_priv *priv) +{ + int nch = priv->profile->max_nch(priv->mdev); + struct mlx5e_tir *tir; + void *tirc; + int inlen; + int err; + u32 *in; + int ix; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + for (ix = 0; ix < nch; ix++) { + memset(in, 0, inlen); + tir = &priv->direct_tir[ix]; + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + mlx5e_build_direct_tir_ctx(priv, priv->direct_tir[ix].rqt.rqtn, tirc); + err = mlx5e_create_tir(priv->mdev, tir, in, inlen); + if (err) + goto err_destroy_ch_tirs; + } + + kvfree(in); + + return 0; + +err_destroy_ch_tirs: + mlx5_core_warn(priv->mdev, "create direct tirs failed, %d\n", err); + for (ix--; ix >= 0; ix--) + mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[ix]); + + kvfree(in); + + return err; +} + +void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv) +{ + int i; + + for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) + mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[i]); + + if (!mlx5e_tunnel_inner_ft_supported(priv->mdev)) + return; + + for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) + mlx5e_destroy_tir(priv->mdev, 
&priv->inner_indir_tir[i]); +} + +void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv) +{ + int nch = priv->profile->max_nch(priv->mdev); + int i; + + for (i = 0; i < nch; i++) + mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[i]); +} + +static int mlx5e_modify_channels_scatter_fcs(struct mlx5e_channels *chs, bool enable) +{ + int err = 0; + int i; + + for (i = 0; i < chs->num; i++) { + err = mlx5e_modify_rq_scatter_fcs(&chs->c[i]->rq, enable); + if (err) + return err; + } + + return 0; +} + +static int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd) +{ + int err = 0; + int i; + + for (i = 0; i < chs->num; i++) { + err = mlx5e_modify_rq_vsd(&chs->c[i]->rq, vsd); + if (err) + return err; + } + + return 0; +} + +static int mlx5e_setup_tc_mqprio(struct net_device *netdev, + struct tc_mqprio_qopt *mqprio) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_channels new_channels = {}; + u8 tc = mqprio->num_tc; + int err = 0; + + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + + if (tc && tc != MLX5E_MAX_NUM_TC) + return -EINVAL; + + mutex_lock(&priv->state_lock); + + new_channels.params = priv->channels.params; + new_channels.params.num_tc = tc ? tc : 1; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + goto out; + } + + err = mlx5e_open_channels(priv, &new_channels); + if (err) + goto out; + + mlx5e_switch_priv_channels(priv, &new_channels, NULL); +out: + mutex_unlock(&priv->state_lock); + return err; +} + +#ifdef CONFIG_MLX5_ESWITCH +static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv, + struct tc_cls_flower_offload *cls_flower) +{ + switch (cls_flower->command) { + case TC_CLSFLOWER_REPLACE: + return mlx5e_configure_flower(priv, cls_flower); + case TC_CLSFLOWER_DESTROY: + return mlx5e_delete_flower(priv, cls_flower); + case TC_CLSFLOWER_STATS: + return mlx5e_stats_flower(priv, cls_flower); + default: + return -EOPNOTSUPP; + } +} + +int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct mlx5e_priv *priv = cb_priv; + + if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return mlx5e_setup_tc_cls_flower(priv, type_data); + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_setup_tc_block(struct net_device *dev, + struct tc_block_offload *f) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + switch (f->command) { + case TC_BLOCK_BIND: + return tcf_block_cb_register(f->block, mlx5e_setup_tc_block_cb, + priv, priv); + case TC_BLOCK_UNBIND: + tcf_block_cb_unregister(f->block, mlx5e_setup_tc_block_cb, + priv); + return 0; + default: + return -EOPNOTSUPP; + } +} +#endif + +static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + switch (type) { +#ifdef CONFIG_MLX5_ESWITCH + case TC_SETUP_BLOCK: + return mlx5e_setup_tc_block(dev, type_data); +#endif + case TC_SETUP_QDISC_MQPRIO: + return mlx5e_setup_tc_mqprio(dev, type_data); + default: + return -EOPNOTSUPP; + } +} + +static void +mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_sw_stats *sstats = &priv->stats.sw; + struct mlx5e_vport_stats *vstats = &priv->stats.vport; + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + + if (mlx5e_is_uplink_rep(priv)) { + stats->rx_packets = PPORT_802_3_GET(pstats, 
a_frames_received_ok); + stats->rx_bytes = PPORT_802_3_GET(pstats, a_octets_received_ok); + stats->tx_packets = PPORT_802_3_GET(pstats, a_frames_transmitted_ok); + stats->tx_bytes = PPORT_802_3_GET(pstats, a_octets_transmitted_ok); + } else { + stats->rx_packets = sstats->rx_packets; + stats->rx_bytes = sstats->rx_bytes; + stats->tx_packets = sstats->tx_packets; + stats->tx_bytes = sstats->tx_bytes; + stats->tx_dropped = sstats->tx_queue_dropped; + } + + stats->rx_dropped = priv->stats.qcnt.rx_out_of_buffer; + + stats->rx_length_errors = + PPORT_802_3_GET(pstats, a_in_range_length_errors) + + PPORT_802_3_GET(pstats, a_out_of_range_length_field) + + PPORT_802_3_GET(pstats, a_frame_too_long_errors); + stats->rx_crc_errors = + PPORT_802_3_GET(pstats, a_frame_check_sequence_errors); + stats->rx_frame_errors = PPORT_802_3_GET(pstats, a_alignment_errors); + stats->tx_aborted_errors = PPORT_2863_GET(pstats, if_out_discards); + stats->rx_errors = stats->rx_length_errors + stats->rx_crc_errors + + stats->rx_frame_errors; + stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors; + + /* vport multicast also counts packets that are dropped due to steering + * or rx out of buffer + */ + stats->multicast = + VPORT_COUNTER_GET(vstats, received_eth_multicast.packets); +} + +static void mlx5e_set_rx_mode(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + queue_work(priv->wq, &priv->set_rx_mode_work); +} + +static int mlx5e_set_mac(struct net_device *netdev, void *addr) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct sockaddr *saddr = addr; + + if (!is_valid_ether_addr(saddr->sa_data)) + return -EADDRNOTAVAIL; + + netif_addr_lock_bh(netdev); + ether_addr_copy(netdev->dev_addr, saddr->sa_data); + netif_addr_unlock_bh(netdev); + + queue_work(priv->wq, &priv->set_rx_mode_work); + + return 0; +} + +#define MLX5E_SET_FEATURE(features, feature, enable) \ + do { \ + if (enable) \ + *features |= feature; \ + else \ + *features &= ~feature; \ + } while (0) + +typedef int (*mlx5e_feature_handler)(struct net_device *netdev, bool enable); + +static int set_feature_lro(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_channels new_channels = {}; + struct mlx5e_params *old_params; + int err = 0; + bool reset; + + mutex_lock(&priv->state_lock); + + old_params = &priv->channels.params; + reset = test_bit(MLX5E_STATE_OPENED, &priv->state); + + new_channels.params = *old_params; + new_channels.params.lro_en = enable; + + if (old_params->rq_wq_type != MLX5_WQ_TYPE_LINKED_LIST) { + if (mlx5e_rx_mpwqe_is_linear_skb(mdev, old_params) == + mlx5e_rx_mpwqe_is_linear_skb(mdev, &new_channels.params)) + reset = false; + } + + if (!reset) { + *old_params = new_channels.params; + err = mlx5e_modify_tirs_lro(priv); + goto out; + } + + err = mlx5e_open_channels(priv, &new_channels); + if (err) + goto out; + + mlx5e_switch_priv_channels(priv, &new_channels, mlx5e_modify_tirs_lro); +out: + mutex_unlock(&priv->state_lock); + return err; +} + +static int set_feature_cvlan_filter(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + if (enable) + mlx5e_enable_cvlan_filter(priv); + else + mlx5e_disable_cvlan_filter(priv); + + return 0; +} + +static int set_feature_tc_num_filters(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + if (!enable && mlx5e_tc_num_filters(priv)) { + netdev_err(netdev, + "Active 
offloaded tc filters, can't turn hw_tc_offload off\n"); + return -EINVAL; + } + + return 0; +} + +static int set_feature_rx_all(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_set_port_fcs(mdev, !enable); +} + +static int set_feature_rx_fcs(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + mutex_lock(&priv->state_lock); + + priv->channels.params.scatter_fcs_en = enable; + err = mlx5e_modify_channels_scatter_fcs(&priv->channels, enable); + if (err) + priv->channels.params.scatter_fcs_en = !enable; + + mutex_unlock(&priv->state_lock); + + return err; +} + +static int set_feature_rx_vlan(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err = 0; + + mutex_lock(&priv->state_lock); + + priv->channels.params.vlan_strip_disable = !enable; + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto unlock; + + err = mlx5e_modify_channels_vsd(&priv->channels, !enable); + if (err) + priv->channels.params.vlan_strip_disable = enable; + +unlock: + mutex_unlock(&priv->state_lock); + + return err; +} + +#ifdef CONFIG_RFS_ACCEL +static int set_feature_arfs(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + if (enable) + err = mlx5e_arfs_enable(priv); + else + err = mlx5e_arfs_disable(priv); + + return err; +} +#endif + +static int mlx5e_handle_feature(struct net_device *netdev, + netdev_features_t *features, + netdev_features_t wanted_features, + netdev_features_t feature, + mlx5e_feature_handler feature_handler) +{ + netdev_features_t changes = wanted_features ^ netdev->features; + bool enable = !!(wanted_features & feature); + int err; + + if (!(changes & feature)) + return 0; + + err = feature_handler(netdev, enable); + if (err) { + netdev_err(netdev, "%s feature %pNF failed, err %d\n", + enable ? "Enable" : "Disable", &feature, err); + return err; + } + + MLX5E_SET_FEATURE(features, feature, enable); + return 0; +} + +static int mlx5e_set_features(struct net_device *netdev, + netdev_features_t features) +{ + netdev_features_t oper_features = netdev->features; + int err = 0; + +#define MLX5E_HANDLE_FEATURE(feature, handler) \ + mlx5e_handle_feature(netdev, &oper_features, features, feature, handler) + + err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER, + set_feature_cvlan_filter); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_tc_num_filters); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan); +#ifdef CONFIG_RFS_ACCEL + err |= MLX5E_HANDLE_FEATURE(NETIF_F_NTUPLE, set_feature_arfs); +#endif + + if (err) { + netdev->features = oper_features; + return -EINVAL; + } + + return 0; +} + +static netdev_features_t mlx5e_fix_features(struct net_device *netdev, + netdev_features_t features) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + mutex_lock(&priv->state_lock); + if (!bitmap_empty(priv->fs.vlan.active_svlans, VLAN_N_VID)) { + /* HW strips the outer C-tag header, this is a problem + * for S-tag traffic. 
+ */ + features &= ~NETIF_F_HW_VLAN_CTAG_RX; + if (!priv->channels.params.vlan_strip_disable) + netdev_warn(netdev, "Dropping C-tag vlan stripping offload due to S-tag vlan\n"); + } + mutex_unlock(&priv->state_lock); + + return features; +} + +static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_channels new_channels = {}; + struct mlx5e_params *params; + int err = 0; + bool reset; + + mutex_lock(&priv->state_lock); + + params = &priv->channels.params; + + reset = !params->lro_en; + reset = reset && test_bit(MLX5E_STATE_OPENED, &priv->state); + + new_channels.params = *params; + new_channels.params.sw_mtu = new_mtu; + + if (params->rq_wq_type != MLX5_WQ_TYPE_LINKED_LIST) { + u8 ppw_old = mlx5e_mpwqe_log_pkts_per_wqe(params); + u8 ppw_new = mlx5e_mpwqe_log_pkts_per_wqe(&new_channels.params); + + reset = reset && (ppw_old != ppw_new); + } + + if (!reset) { + params->sw_mtu = new_mtu; + mlx5e_set_dev_port_mtu(priv); + netdev->mtu = params->sw_mtu; + goto out; + } + + err = mlx5e_open_channels(priv, &new_channels); + if (err) + goto out; + + mlx5e_switch_priv_channels(priv, &new_channels, mlx5e_set_dev_port_mtu); + netdev->mtu = new_channels.params.sw_mtu; + +out: + mutex_unlock(&priv->state_lock); + return err; +} + +int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) +{ + struct hwtstamp_config config; + int err; + + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + return -EOPNOTSUPP; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + /* TX HW timestamp */ + switch (config.tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + break; + default: + return -ERANGE; + } + + mutex_lock(&priv->state_lock); + /* RX HW timestamp */ + switch (config.rx_filter) { + case HWTSTAMP_FILTER_NONE: + /* Reset CQE compression to Admin default */ + mlx5e_modify_rx_cqe_compression_locked(priv, priv->channels.params.rx_cqe_compress_def); + break; + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: + /* Disable CQE compression */ + netdev_warn(priv->netdev, "Disabling cqe compression"); + err = mlx5e_modify_rx_cqe_compression_locked(priv, false); + if (err) { + netdev_err(priv->netdev, "Failed disabling cqe compression err=%d\n", err); + mutex_unlock(&priv->state_lock); + return err; + } + config.rx_filter = HWTSTAMP_FILTER_ALL; + break; + default: + mutex_unlock(&priv->state_lock); + return -ERANGE; + } + + memcpy(&priv->tstamp, &config, sizeof(config)); + mutex_unlock(&priv->state_lock); + + return copy_to_user(ifr->ifr_data, &config, + sizeof(config)) ? -EFAULT : 0; +} + +int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr) +{ + struct hwtstamp_config *cfg = &priv->tstamp; + + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + return -EOPNOTSUPP; + + return copy_to_user(ifr->ifr_data, cfg, sizeof(*cfg)) ? 
-EFAULT : 0; +} + +static int mlx5e_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + switch (cmd) { + case SIOCSHWTSTAMP: + return mlx5e_hwstamp_set(priv, ifr); + case SIOCGHWTSTAMP: + return mlx5e_hwstamp_get(priv, ifr); + default: + return -EOPNOTSUPP; + } +} + +#ifdef CONFIG_MLX5_ESWITCH +static int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_set_vport_mac(mdev->priv.eswitch, vf + 1, mac); +} + +static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + + return mlx5_eswitch_set_vport_vlan(mdev->priv.eswitch, vf + 1, + vlan, qos); +} + +static int mlx5e_set_vf_spoofchk(struct net_device *dev, int vf, bool setting) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_set_vport_spoofchk(mdev->priv.eswitch, vf + 1, setting); +} + +static int mlx5e_set_vf_trust(struct net_device *dev, int vf, bool setting) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_set_vport_trust(mdev->priv.eswitch, vf + 1, setting); +} + +static int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, + int max_tx_rate) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_set_vport_rate(mdev->priv.eswitch, vf + 1, + max_tx_rate, min_tx_rate); +} + +static int mlx5_vport_link2ifla(u8 esw_link) +{ + switch (esw_link) { + case MLX5_ESW_VPORT_ADMIN_STATE_DOWN: + return IFLA_VF_LINK_STATE_DISABLE; + case MLX5_ESW_VPORT_ADMIN_STATE_UP: + return IFLA_VF_LINK_STATE_ENABLE; + } + return IFLA_VF_LINK_STATE_AUTO; +} + +static int mlx5_ifla_link2vport(u8 ifla_link) +{ + switch (ifla_link) { + case IFLA_VF_LINK_STATE_DISABLE: + return MLX5_ESW_VPORT_ADMIN_STATE_DOWN; + case IFLA_VF_LINK_STATE_ENABLE: + return MLX5_ESW_VPORT_ADMIN_STATE_UP; + } + return MLX5_ESW_VPORT_ADMIN_STATE_AUTO; +} + +static int mlx5e_set_vf_link_state(struct net_device *dev, int vf, + int link_state) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_set_vport_state(mdev->priv.eswitch, vf + 1, + mlx5_ifla_link2vport(link_state)); +} + +static int mlx5e_get_vf_config(struct net_device *dev, + int vf, struct ifla_vf_info *ivi) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + err = mlx5_eswitch_get_vport_config(mdev->priv.eswitch, vf + 1, ivi); + if (err) + return err; + ivi->linkstate = mlx5_vport_link2ifla(ivi->linkstate); + return 0; +} + +static int mlx5e_get_vf_stats(struct net_device *dev, + int vf, struct ifla_vf_stats *vf_stats) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_get_vport_stats(mdev->priv.eswitch, vf + 1, + vf_stats); +} +#endif + +static void mlx5e_add_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + if (ti->type != UDP_TUNNEL_TYPE_VXLAN) + return; + + if (!mlx5e_vxlan_allowed(priv->mdev)) + return; + + mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 1); +} + 
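+/* Counterpart of mlx5e_add_vxlan_port: queue work to remove the UDP port from VXLAN offload. */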
+static void mlx5e_del_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + if (ti->type != UDP_TUNNEL_TYPE_VXLAN) + return; + + if (!mlx5e_vxlan_allowed(priv->mdev)) + return; + + mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 0); +} + +static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, + struct sk_buff *skb, + netdev_features_t features) +{ + unsigned int offset = 0; + struct udphdr *udph; + u8 proto; + u16 port; + + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IP): + proto = ip_hdr(skb)->protocol; + break; + case htons(ETH_P_IPV6): + proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL); + break; + default: + goto out; + } + + switch (proto) { + case IPPROTO_GRE: + return features; + case IPPROTO_UDP: + udph = udp_hdr(skb); + port = be16_to_cpu(udph->dest); + + /* Verify if UDP port is being offloaded by HW */ + if (mlx5e_vxlan_lookup_port(priv, port)) + return features; + } + +out: + /* Disable CSUM and GSO if the udp dport is not offloaded by HW */ + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); +} + +static netdev_features_t mlx5e_features_check(struct sk_buff *skb, + struct net_device *netdev, + netdev_features_t features) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + features = vlan_features_check(skb, features); + features = vxlan_features_check(skb, features); + +#ifdef CONFIG_MLX5_EN_IPSEC + if (mlx5e_ipsec_feature_check(skb, netdev, features)) + return features; +#endif + + /* Validate if the tunneled packet is being offloaded by HW */ + if (skb->encapsulation && + (features & NETIF_F_CSUM_MASK || features & NETIF_F_GSO_MASK)) + return mlx5e_tunnel_features_check(priv, skb, features); + + return features; +} + +static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev, + struct mlx5e_txqsq *sq) +{ + struct mlx5_eq *eq = sq->cq.mcq.eq; + u32 eqe_count; + + netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n", + eq->eqn, eq->cons_index, eq->irqn); + + eqe_count = mlx5_eq_poll_irq_disabled(eq); + if (!eqe_count) + return false; + + netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->eqn); + sq->channel->stats.eq_rearm++; + return true; +} + +static void mlx5e_tx_timeout_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, + tx_timeout_work); + struct net_device *dev = priv->netdev; + bool reopen_channels = false; + int i, err; + + rtnl_lock(); + mutex_lock(&priv->state_lock); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto unlock; + + for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) { + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, i); + struct mlx5e_txqsq *sq = priv->txq2sq[i]; + + if (!netif_xmit_stopped(dev_queue)) + continue; + + netdev_err(dev, + "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", + i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, + jiffies_to_usecs(jiffies - dev_queue->trans_start)); + + /* If we recover a lost interrupt, most likely TX timeout will + * be resolved, skip reopening channels + */ + if (!mlx5e_tx_timeout_eq_recover(dev, sq)) { + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + reopen_channels = true; + } + } + + if (!reopen_channels) + goto unlock; + + mlx5e_close_locked(dev); + err = mlx5e_open_locked(dev); + if (err) + netdev_err(priv->netdev, + "mlx5e_open_locked failed recovering from a tx_timeout, err(%d).\n", + err); + +unlock: + 
mutex_unlock(&priv->state_lock); + rtnl_unlock(); +} + +static void mlx5e_tx_timeout(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + netdev_err(dev, "TX timeout detected\n"); + queue_work(priv->wq, &priv->tx_timeout_work); +} + +static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct bpf_prog *old_prog; + int err = 0; + bool reset, was_opened; + int i; + + mutex_lock(&priv->state_lock); + + if ((netdev->features & NETIF_F_LRO) && prog) { + netdev_warn(netdev, "can't set XDP while LRO is on, disable LRO first\n"); + err = -EINVAL; + goto unlock; + } + + if ((netdev->features & NETIF_F_HW_ESP) && prog) { + netdev_warn(netdev, "can't set XDP with IPSec offload\n"); + err = -EINVAL; + goto unlock; + } + + was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); + /* no need for full reset when exchanging programs */ + reset = (!priv->channels.params.xdp_prog || !prog); + + if (was_opened && reset) + mlx5e_close_locked(netdev); + if (was_opened && !reset) { + /* num_channels is invariant here, so we can take the + * batched reference right upfront. + */ + prog = bpf_prog_add(prog, priv->channels.num); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto unlock; + } + } + + /* exchange programs, extra prog reference we got from caller + * as long as we don't fail from this point onwards. + */ + old_prog = xchg(&priv->channels.params.xdp_prog, prog); + if (old_prog) + bpf_prog_put(old_prog); + + if (reset) /* change RQ type according to priv->xdp_prog */ + mlx5e_set_rq_type(priv->mdev, &priv->channels.params); + + if (was_opened && reset) + mlx5e_open_locked(netdev); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state) || reset) + goto unlock; + + /* exchanging programs w/o reset, we update ref counts on behalf + * of the channels RQs here. + */ + for (i = 0; i < priv->channels.num; i++) { + struct mlx5e_channel *c = priv->channels.c[i]; + + clear_bit(MLX5E_RQ_STATE_ENABLED, &c->rq.state); + napi_synchronize(&c->napi); + /* prevent mlx5e_poll_rx_cq from accessing rq->xdp_prog */ + + old_prog = xchg(&c->rq.xdp_prog, prog); + + set_bit(MLX5E_RQ_STATE_ENABLED, &c->rq.state); + /* napi_schedule in case we have missed anything */ + napi_schedule(&c->napi); + + if (old_prog) + bpf_prog_put(old_prog); + } + +unlock: + mutex_unlock(&priv->state_lock); + return err; +} + +static u32 mlx5e_xdp_query(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + const struct bpf_prog *xdp_prog; + u32 prog_id = 0; + + mutex_lock(&priv->state_lock); + xdp_prog = priv->channels.params.xdp_prog; + if (xdp_prog) + prog_id = xdp_prog->aux->id; + mutex_unlock(&priv->state_lock); + + return prog_id; +} + +static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) +{ + switch (xdp->command) { + case XDP_SETUP_PROG: + return mlx5e_xdp_set(dev, xdp->prog); + case XDP_QUERY_PROG: + xdp->prog_id = mlx5e_xdp_query(dev); + xdp->prog_attached = !!xdp->prog_id; + return 0; + default: + return -EINVAL; + } +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +/* Fake "interrupt" called by netpoll (eg netconsole) to send skbs without + * reenabling interrupts. 
+ */ +static void mlx5e_netpoll(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_channels *chs = &priv->channels; + + int i; + + for (i = 0; i < chs->num; i++) + napi_schedule(&chs->c[i]->napi); +} +#endif + +static const struct net_device_ops mlx5e_netdev_ops = { + .ndo_open = mlx5e_open, + .ndo_stop = mlx5e_close, + .ndo_start_xmit = mlx5e_xmit, + .ndo_setup_tc = mlx5e_setup_tc, + .ndo_select_queue = mlx5e_select_queue, + .ndo_get_stats64 = mlx5e_get_stats, + .ndo_set_rx_mode = mlx5e_set_rx_mode, + .ndo_set_mac_address = mlx5e_set_mac, + .ndo_vlan_rx_add_vid = mlx5e_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = mlx5e_vlan_rx_kill_vid, + .ndo_set_features = mlx5e_set_features, + .ndo_fix_features = mlx5e_fix_features, + .ndo_change_mtu = mlx5e_change_mtu, + .ndo_do_ioctl = mlx5e_ioctl, + .ndo_set_tx_maxrate = mlx5e_set_tx_maxrate, + .ndo_udp_tunnel_add = mlx5e_add_vxlan_port, + .ndo_udp_tunnel_del = mlx5e_del_vxlan_port, + .ndo_features_check = mlx5e_features_check, +#ifdef CONFIG_RFS_ACCEL + .ndo_rx_flow_steer = mlx5e_rx_flow_steer, +#endif + .ndo_tx_timeout = mlx5e_tx_timeout, + .ndo_bpf = mlx5e_xdp, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = mlx5e_netpoll, +#endif +#ifdef CONFIG_MLX5_ESWITCH + /* SRIOV E-Switch NDOs */ + .ndo_set_vf_mac = mlx5e_set_vf_mac, + .ndo_set_vf_vlan = mlx5e_set_vf_vlan, + .ndo_set_vf_spoofchk = mlx5e_set_vf_spoofchk, + .ndo_set_vf_trust = mlx5e_set_vf_trust, + .ndo_set_vf_rate = mlx5e_set_vf_rate, + .ndo_get_vf_config = mlx5e_get_vf_config, + .ndo_set_vf_link_state = mlx5e_set_vf_link_state, + .ndo_get_vf_stats = mlx5e_get_vf_stats, + .ndo_has_offload_stats = mlx5e_has_offload_stats, + .ndo_get_offload_stats = mlx5e_get_offload_stats, +#endif +}; + +static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev) +{ + if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return -EOPNOTSUPP; + if (!MLX5_CAP_GEN(mdev, eth_net_offloads) || + !MLX5_CAP_GEN(mdev, nic_flow_table) || + !MLX5_CAP_ETH(mdev, csum_cap) || + !MLX5_CAP_ETH(mdev, max_lso_cap) || + !MLX5_CAP_ETH(mdev, vlan_cap) || + !MLX5_CAP_ETH(mdev, rss_ind_tbl_cap) || + MLX5_CAP_FLOWTABLE(mdev, + flow_table_properties_nic_receive.max_ft_level) + < 3) { + mlx5_core_warn(mdev, + "Not creating net device, some required device capabilities are missing\n"); + return -EOPNOTSUPP; + } + if (!MLX5_CAP_ETH(mdev, self_lb_en_modifiable)) + mlx5_core_warn(mdev, "Self loop back prevention is not supported\n"); + if (!MLX5_CAP_GEN(mdev, cq_moderation)) + mlx5_core_warn(mdev, "CQ moderation is not supported\n"); + + return 0; +} + +void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len, + int num_channels) +{ + int i; + + for (i = 0; i < len; i++) + indirection_rqt[i] = i % num_channels; +} + +static bool slow_pci_heuristic(struct mlx5_core_dev *mdev) +{ + u32 link_speed = 0; + u32 pci_bw = 0; + + mlx5e_get_max_linkspeed(mdev, &link_speed); + pci_bw = pcie_bandwidth_available(mdev->pdev, NULL, NULL, NULL); + mlx5_core_dbg_once(mdev, "Max link speed = %d, PCI BW = %d\n", + link_speed, pci_bw); + +#define MLX5E_SLOW_PCI_RATIO (2) + + return link_speed && pci_bw && + link_speed > MLX5E_SLOW_PCI_RATIO * pci_bw; +} + +void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) +{ + params->tx_cq_moderation.cq_period_mode = cq_period_mode; + + params->tx_cq_moderation.pkts = + MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS; + params->tx_cq_moderation.usec = + MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC; + + if (cq_period_mode == 
MLX5_CQ_PERIOD_MODE_START_FROM_CQE) + params->tx_cq_moderation.usec = + MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE; + + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_CQE_BASED_MODER, + params->tx_cq_moderation.cq_period_mode == + MLX5_CQ_PERIOD_MODE_START_FROM_CQE); +} + +void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) +{ + params->rx_cq_moderation.cq_period_mode = cq_period_mode; + + params->rx_cq_moderation.pkts = + MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS; + params->rx_cq_moderation.usec = + MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC; + + if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE) + params->rx_cq_moderation.usec = + MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE; + + if (params->rx_dim_enabled) { + switch (cq_period_mode) { + case MLX5_CQ_PERIOD_MODE_START_FROM_CQE: + params->rx_cq_moderation = + net_dim_get_def_profile(NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE); + break; + case MLX5_CQ_PERIOD_MODE_START_FROM_EQE: + default: + params->rx_cq_moderation = + net_dim_get_def_profile(NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE); + } + } + + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_BASED_MODER, + params->rx_cq_moderation.cq_period_mode == + MLX5_CQ_PERIOD_MODE_START_FROM_CQE); +} + +static u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout) +{ + int i; + + /* The supported periods are organized in ascending order */ + for (i = 0; i < MLX5E_LRO_TIMEOUT_ARR_SIZE - 1; i++) + if (MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]) >= wanted_timeout) + break; + + return MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]); +} + +void mlx5e_build_nic_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + u16 max_channels, u16 mtu) +{ + u8 rx_cq_period_mode; + + params->sw_mtu = mtu; + params->hard_mtu = MLX5E_ETH_HARD_MTU; + params->num_channels = max_channels; + params->num_tc = 1; + + /* SQ */ + params->log_sq_size = is_kdump_kernel() ? + MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE : + MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; + + /* set CQE compression */ + params->rx_cqe_compress_def = false; + if (MLX5_CAP_GEN(mdev, cqe_compression) && + MLX5_CAP_GEN(mdev, vport_group_manager)) + params->rx_cqe_compress_def = slow_pci_heuristic(mdev); + + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS, params->rx_cqe_compress_def); + + /* RQ */ + if (mlx5e_striding_rq_possible(mdev, params)) + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, + !slow_pci_heuristic(mdev)); + mlx5e_set_rq_type(mdev, params); + mlx5e_init_rq_type_params(mdev, params); + + /* HW LRO */ + + /* TODO: && MLX5_CAP_ETH(mdev, lro_cap) */ + if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) + if (!mlx5e_rx_mpwqe_is_linear_skb(mdev, params)) + params->lro_en = !slow_pci_heuristic(mdev); + params->lro_timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT); + + /* CQ moderation params */ + rx_cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? 
+ MLX5_CQ_PERIOD_MODE_START_FROM_CQE : + MLX5_CQ_PERIOD_MODE_START_FROM_EQE; + params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation); + mlx5e_set_rx_cq_mode_params(params, rx_cq_period_mode); + mlx5e_set_tx_cq_mode_params(params, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); + + /* TX inline */ + params->tx_min_inline_mode = mlx5e_params_calculate_tx_min_inline(mdev); + + /* RSS */ + params->rss_hfunc = ETH_RSS_HASH_XOR; + netdev_rss_key_fill(params->toeplitz_hash_key, sizeof(params->toeplitz_hash_key)); + mlx5e_build_default_indir_rqt(params->indirection_rqt, + MLX5E_INDIR_RQT_SIZE, max_channels); +} + +static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev, + struct net_device *netdev, + const struct mlx5e_profile *profile, + void *ppriv) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + priv->mdev = mdev; + priv->netdev = netdev; + priv->profile = profile; + priv->ppriv = ppriv; + priv->msglevel = MLX5E_MSG_LEVEL; + + mlx5e_build_nic_params(mdev, &priv->channels.params, + profile->max_nch(mdev), netdev->mtu); + + mutex_init(&priv->state_lock); + + INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work); + INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work); + INIT_WORK(&priv->tx_timeout_work, mlx5e_tx_timeout_work); + INIT_DELAYED_WORK(&priv->update_stats_work, mlx5e_update_stats_work); + + mlx5e_timestamp_init(priv); +} + +static void mlx5e_set_netdev_dev_addr(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + mlx5_query_nic_vport_mac_address(priv->mdev, 0, netdev->dev_addr); + if (is_zero_ether_addr(netdev->dev_addr) && + !MLX5_CAP_GEN(priv->mdev, vport_group_manager)) { + eth_hw_addr_random(netdev); + mlx5_core_info(priv->mdev, "Assigned random MAC address %pM\n", netdev->dev_addr); + } +} + +#if IS_ENABLED(CONFIG_MLX5_ESWITCH) +static const struct switchdev_ops mlx5e_switchdev_ops = { + .switchdev_port_attr_get = mlx5e_attr_get, +}; +#endif + +static void mlx5e_build_nic_netdev(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + bool fcs_supported; + bool fcs_enabled; + + SET_NETDEV_DEV(netdev, &mdev->pdev->dev); + + netdev->netdev_ops = &mlx5e_netdev_ops; + +#ifdef CONFIG_MLX5_CORE_EN_DCB + if (MLX5_CAP_GEN(mdev, vport_group_manager) && MLX5_CAP_GEN(mdev, qos)) + netdev->dcbnl_ops = &mlx5e_dcbnl_ops; +#endif + + netdev->watchdog_timeo = 15 * HZ; + + netdev->ethtool_ops = &mlx5e_ethtool_ops; + + netdev->vlan_features |= NETIF_F_SG; + netdev->vlan_features |= NETIF_F_IP_CSUM; + netdev->vlan_features |= NETIF_F_IPV6_CSUM; + netdev->vlan_features |= NETIF_F_GRO; + netdev->vlan_features |= NETIF_F_TSO; + netdev->vlan_features |= NETIF_F_TSO6; + netdev->vlan_features |= NETIF_F_RXCSUM; + netdev->vlan_features |= NETIF_F_RXHASH; + + netdev->hw_enc_features |= NETIF_F_HW_VLAN_CTAG_TX; + netdev->hw_enc_features |= NETIF_F_HW_VLAN_CTAG_RX; + + if (!!MLX5_CAP_ETH(mdev, lro_cap)) + netdev->vlan_features |= NETIF_F_LRO; + + netdev->hw_features = netdev->vlan_features; + netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX; + netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; + netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; + netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX; + + if (mlx5e_vxlan_allowed(mdev) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) { + netdev->hw_features |= NETIF_F_GSO_PARTIAL; + netdev->hw_enc_features |= NETIF_F_IP_CSUM; + netdev->hw_enc_features |= NETIF_F_IPV6_CSUM; + netdev->hw_enc_features |= NETIF_F_TSO; + netdev->hw_enc_features |= 
NETIF_F_TSO6; + netdev->hw_enc_features |= NETIF_F_GSO_PARTIAL; + } + + if (mlx5e_vxlan_allowed(mdev)) { + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM; + } + + if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) { + netdev->hw_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + netdev->hw_enc_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + netdev->gso_partial_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + } + + mlx5_query_port_fcs(mdev, &fcs_supported, &fcs_enabled); + + if (fcs_supported) + netdev->hw_features |= NETIF_F_RXALL; + + if (MLX5_CAP_ETH(mdev, scatter_fcs)) + netdev->hw_features |= NETIF_F_RXFCS; + + netdev->features = netdev->hw_features; + if (!priv->channels.params.lro_en) + netdev->features &= ~NETIF_F_LRO; + + if (fcs_enabled) + netdev->features &= ~NETIF_F_RXALL; + + if (!priv->channels.params.scatter_fcs_en) + netdev->features &= ~NETIF_F_RXFCS; + +#define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f) + if (FT_CAP(flow_modify_en) && + FT_CAP(modify_root) && + FT_CAP(identified_miss_table_mode) && + FT_CAP(flow_table_modify)) { + netdev->hw_features |= NETIF_F_HW_TC; +#ifdef CONFIG_RFS_ACCEL + netdev->hw_features |= NETIF_F_NTUPLE; +#endif + } + + netdev->features |= NETIF_F_HIGHDMA; + netdev->features |= NETIF_F_HW_VLAN_STAG_FILTER; + + netdev->priv_flags |= IFF_UNICAST_FLT; + + mlx5e_set_netdev_dev_addr(netdev); + +#if IS_ENABLED(CONFIG_MLX5_ESWITCH) + if (MLX5_VPORT_MANAGER(mdev)) + netdev->switchdev_ops = &mlx5e_switchdev_ops; +#endif + + mlx5e_ipsec_build_netdev(priv); +} + +static void mlx5e_create_q_counters(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + err = mlx5_core_alloc_q_counter(mdev, &priv->q_counter); + if (err) { + mlx5_core_warn(mdev, "alloc queue counter failed, %d\n", err); + priv->q_counter = 0; + } + + err = mlx5_core_alloc_q_counter(mdev, &priv->drop_rq_q_counter); + if (err) { + mlx5_core_warn(mdev, "alloc drop RQ counter failed, %d\n", err); + priv->drop_rq_q_counter = 0; + } +} + +static void mlx5e_destroy_q_counters(struct mlx5e_priv *priv) +{ + if (priv->q_counter) + mlx5_core_dealloc_q_counter(priv->mdev, priv->q_counter); + + if (priv->drop_rq_q_counter) + mlx5_core_dealloc_q_counter(priv->mdev, priv->drop_rq_q_counter); +} + +static void mlx5e_nic_init(struct mlx5_core_dev *mdev, + struct net_device *netdev, + const struct mlx5e_profile *profile, + void *ppriv) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + mlx5e_build_nic_netdev_priv(mdev, netdev, profile, ppriv); + err = mlx5e_ipsec_init(priv); + if (err) + mlx5_core_err(mdev, "IPSec initialization failed, %d\n", err); + mlx5e_build_nic_netdev(netdev); + mlx5e_vxlan_init(priv); +} + +static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) +{ + mlx5e_ipsec_cleanup(priv); + mlx5e_vxlan_cleanup(priv); +} + +static int mlx5e_init_nic_rx(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + err = mlx5e_create_indirect_rqt(priv); + if (err) + return err; + + err = mlx5e_create_direct_rqts(priv); + if (err) + goto err_destroy_indirect_rqts; + + err = mlx5e_create_indirect_tirs(priv); + if (err) + goto err_destroy_direct_rqts; + + err = mlx5e_create_direct_tirs(priv); + if (err) + goto err_destroy_indirect_tirs; + + err = mlx5e_create_flow_steering(priv); + if (err) { + mlx5_core_warn(mdev, 
"create flow steering failed, %d\n", err); + goto err_destroy_direct_tirs; + } + + err = mlx5e_tc_init(priv); + if (err) + goto err_destroy_flow_steering; + + return 0; + +err_destroy_flow_steering: + mlx5e_destroy_flow_steering(priv); +err_destroy_direct_tirs: + mlx5e_destroy_direct_tirs(priv); +err_destroy_indirect_tirs: + mlx5e_destroy_indirect_tirs(priv); +err_destroy_direct_rqts: + mlx5e_destroy_direct_rqts(priv); +err_destroy_indirect_rqts: + mlx5e_destroy_rqt(priv, &priv->indir_rqt); + return err; +} + +static void mlx5e_cleanup_nic_rx(struct mlx5e_priv *priv) +{ + mlx5e_tc_cleanup(priv); + mlx5e_destroy_flow_steering(priv); + mlx5e_destroy_direct_tirs(priv); + mlx5e_destroy_indirect_tirs(priv); + mlx5e_destroy_direct_rqts(priv); + mlx5e_destroy_rqt(priv, &priv->indir_rqt); +} + +static int mlx5e_init_nic_tx(struct mlx5e_priv *priv) +{ + int err; + + err = mlx5e_create_tises(priv); + if (err) { + mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err); + return err; + } + +#ifdef CONFIG_MLX5_CORE_EN_DCB + mlx5e_dcbnl_initialize(priv); +#endif + return 0; +} + +static void mlx5e_nic_enable(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + u16 max_mtu; + + mlx5e_init_l2_addr(priv); + + /* Marking the link as currently not needed by the Driver */ + if (!netif_running(netdev)) + mlx5_set_port_admin_status(mdev, MLX5_PORT_DOWN); + + /* MTU range: 68 - hw-specific max */ + netdev->min_mtu = ETH_MIN_MTU; + mlx5_query_port_max_mtu(priv->mdev, &max_mtu, 1); + netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu); + mlx5e_set_dev_port_mtu(priv); + + mlx5_lag_add(mdev, netdev); + + mlx5e_enable_async_events(priv); + + if (MLX5_VPORT_MANAGER(priv->mdev)) + mlx5e_register_vport_reps(priv); + + if (netdev->reg_state != NETREG_REGISTERED) + return; +#ifdef CONFIG_MLX5_CORE_EN_DCB + mlx5e_dcbnl_init_app(priv); +#endif + + queue_work(priv->wq, &priv->set_rx_mode_work); + + rtnl_lock(); + if (netif_running(netdev)) + mlx5e_open(netdev); + netif_device_attach(netdev); + rtnl_unlock(); +} + +static void mlx5e_nic_disable(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + +#ifdef CONFIG_MLX5_CORE_EN_DCB + if (priv->netdev->reg_state == NETREG_REGISTERED) + mlx5e_dcbnl_delete_app(priv); +#endif + + rtnl_lock(); + if (netif_running(priv->netdev)) + mlx5e_close(priv->netdev); + netif_device_detach(priv->netdev); + rtnl_unlock(); + + queue_work(priv->wq, &priv->set_rx_mode_work); + + if (MLX5_VPORT_MANAGER(priv->mdev)) + mlx5e_unregister_vport_reps(priv); + + mlx5e_disable_async_events(priv); + mlx5_lag_remove(mdev); +} + +static const struct mlx5e_profile mlx5e_nic_profile = { + .init = mlx5e_nic_init, + .cleanup = mlx5e_nic_cleanup, + .init_rx = mlx5e_init_nic_rx, + .cleanup_rx = mlx5e_cleanup_nic_rx, + .init_tx = mlx5e_init_nic_tx, + .cleanup_tx = mlx5e_cleanup_nic_tx, + .enable = mlx5e_nic_enable, + .disable = mlx5e_nic_disable, + .update_stats = mlx5e_update_ndo_stats, + .max_nch = mlx5e_get_max_num_channels, + .update_carrier = mlx5e_update_carrier, + .rx_handlers.handle_rx_cqe = mlx5e_handle_rx_cqe, + .rx_handlers.handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq, + .max_tc = MLX5E_MAX_NUM_TC, +}; + +/* mlx5e generic netdev management API (move to en_common.c) */ + +struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev, + const struct mlx5e_profile *profile, + void *ppriv) +{ + int nch = profile->max_nch(mdev); + struct net_device *netdev; + struct mlx5e_priv *priv; + + netdev = 
alloc_etherdev_mqs(sizeof(struct mlx5e_priv), + nch * profile->max_tc, + nch); + if (!netdev) { + mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n"); + return NULL; + } + +#ifdef CONFIG_RFS_ACCEL + netdev->rx_cpu_rmap = mdev->rmap; +#endif + + profile->init(mdev, netdev, profile, ppriv); + + netif_carrier_off(netdev); + + priv = netdev_priv(netdev); + + priv->wq = create_singlethread_workqueue("mlx5e"); + if (!priv->wq) + goto err_cleanup_nic; + + return netdev; + +err_cleanup_nic: + if (profile->cleanup) + profile->cleanup(priv); + free_netdev(netdev); + + return NULL; +} + +int mlx5e_attach_netdev(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + const struct mlx5e_profile *profile; + int err; + + profile = priv->profile; + clear_bit(MLX5E_STATE_DESTROYING, &priv->state); + + err = profile->init_tx(priv); + if (err) + goto out; + + mlx5e_create_q_counters(priv); + + err = mlx5e_open_drop_rq(priv, &priv->drop_rq); + if (err) { + mlx5_core_err(mdev, "open drop rq failed, %d\n", err); + goto err_destroy_q_counters; + } + + err = profile->init_rx(priv); + if (err) + goto err_close_drop_rq; + + if (profile->enable) + profile->enable(priv); + + return 0; + +err_close_drop_rq: + mlx5e_close_drop_rq(&priv->drop_rq); + +err_destroy_q_counters: + mlx5e_destroy_q_counters(priv); + profile->cleanup_tx(priv); + +out: + return err; +} + +void mlx5e_detach_netdev(struct mlx5e_priv *priv) +{ + const struct mlx5e_profile *profile = priv->profile; + + set_bit(MLX5E_STATE_DESTROYING, &priv->state); + + if (profile->disable) + profile->disable(priv); + flush_workqueue(priv->wq); + + profile->cleanup_rx(priv); + mlx5e_close_drop_rq(&priv->drop_rq); + mlx5e_destroy_q_counters(priv); + profile->cleanup_tx(priv); + cancel_delayed_work_sync(&priv->update_stats_work); +} + +void mlx5e_destroy_netdev(struct mlx5e_priv *priv) +{ + const struct mlx5e_profile *profile = priv->profile; + struct net_device *netdev = priv->netdev; + + destroy_workqueue(priv->wq); + if (profile->cleanup) + profile->cleanup(priv); + free_netdev(netdev); +} + +/* mlx5e_attach and mlx5e_detach scope should be only creating/destroying + * hardware contexts and to connect it to the current netdev. 
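+ * The netif_device_present() checks below make both entry points safe to
+ * call repeatedly: attaching an already-attached netdev and detaching an
+ * already-detached one are no-ops.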
+ */ +static int mlx5e_attach(struct mlx5_core_dev *mdev, void *vpriv) +{ + struct mlx5e_priv *priv = vpriv; + struct net_device *netdev = priv->netdev; + int err; + + if (netif_device_present(netdev)) + return 0; + + err = mlx5e_create_mdev_resources(mdev); + if (err) + return err; + + err = mlx5e_attach_netdev(priv); + if (err) { + mlx5e_destroy_mdev_resources(mdev); + return err; + } + + return 0; +} + +static void mlx5e_detach(struct mlx5_core_dev *mdev, void *vpriv) +{ + struct mlx5e_priv *priv = vpriv; + struct net_device *netdev = priv->netdev; + + if (!netif_device_present(netdev)) + return; + + mlx5e_detach_netdev(priv); + mlx5e_destroy_mdev_resources(mdev); +} + +static void *mlx5e_add(struct mlx5_core_dev *mdev) +{ + struct net_device *netdev; + void *rpriv = NULL; + void *priv; + int err; + + err = mlx5e_check_required_hca_cap(mdev); + if (err) + return NULL; + +#ifdef CONFIG_MLX5_ESWITCH + if (MLX5_VPORT_MANAGER(mdev)) { + rpriv = mlx5e_alloc_nic_rep_priv(mdev); + if (!rpriv) { + mlx5_core_warn(mdev, "Failed to alloc NIC rep priv data\n"); + return NULL; + } + } +#endif + + netdev = mlx5e_create_netdev(mdev, &mlx5e_nic_profile, rpriv); + if (!netdev) { + mlx5_core_err(mdev, "mlx5e_create_netdev failed\n"); + goto err_free_rpriv; + } + + priv = netdev_priv(netdev); + + err = mlx5e_attach(mdev, priv); + if (err) { + mlx5_core_err(mdev, "mlx5e_attach failed, %d\n", err); + goto err_destroy_netdev; + } + + err = register_netdev(netdev); + if (err) { + mlx5_core_err(mdev, "register_netdev failed, %d\n", err); + goto err_detach; + } + +#ifdef CONFIG_MLX5_CORE_EN_DCB + mlx5e_dcbnl_init_app(priv); +#endif + return priv; + +err_detach: + mlx5e_detach(mdev, priv); +err_destroy_netdev: + mlx5e_destroy_netdev(priv); +err_free_rpriv: + kfree(rpriv); + return NULL; +} + +static void mlx5e_remove(struct mlx5_core_dev *mdev, void *vpriv) +{ + struct mlx5e_priv *priv = vpriv; + void *ppriv = priv->ppriv; + +#ifdef CONFIG_MLX5_CORE_EN_DCB + mlx5e_dcbnl_delete_app(priv); +#endif + unregister_netdev(priv->netdev); + mlx5e_detach(mdev, vpriv); + mlx5e_destroy_netdev(priv); + kfree(ppriv); +} + +static void *mlx5e_get_netdev(void *vpriv) +{ + struct mlx5e_priv *priv = vpriv; + + return priv->netdev; +} + +static struct mlx5_interface mlx5e_interface = { + .add = mlx5e_add, + .remove = mlx5e_remove, + .attach = mlx5e_attach, + .detach = mlx5e_detach, + .event = mlx5e_async_event, + .protocol = MLX5_INTERFACE_PROTOCOL_ETH, + .get_dev = mlx5e_get_netdev, +}; + +void mlx5e_init(void) +{ + mlx5e_ipsec_build_inverse_table(); + mlx5e_build_ptys2ethtool_map(); + mlx5_register_interface(&mlx5e_interface); +} + +void mlx5e_cleanup(void) +{ + mlx5_unregister_interface(&mlx5e_interface); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c new file mode 100644 index 0000000..876c3e4 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -0,0 +1,1239 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "eswitch.h" +#include "en.h" +#include "en_rep.h" +#include "en_tc.h" +#include "fs_core.h" + +#define MLX5E_REP_PARAMS_LOG_SQ_SIZE \ + max(0x6, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE) +#define MLX5E_REP_PARAMS_LOG_RQ_SIZE \ + max(0x6, MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE) + +static const char mlx5e_rep_driver_name[] = "mlx5e_rep"; + +static void mlx5e_rep_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + strlcpy(drvinfo->driver, mlx5e_rep_driver_name, + sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, UTS_RELEASE, sizeof(drvinfo->version)); +} + +static const struct counter_desc sw_rep_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_bytes) }, +}; + +#define NUM_VPORT_REP_COUNTERS ARRAY_SIZE(sw_rep_stats_desc) + +static void mlx5e_rep_get_strings(struct net_device *dev, + u32 stringset, uint8_t *data) +{ + int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < NUM_VPORT_REP_COUNTERS; i++) + strcpy(data + (i * ETH_GSTRING_LEN), + sw_rep_stats_desc[i].format); + break; + } +} + +static void mlx5e_rep_update_hw_counters(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct rtnl_link_stats64 *vport_stats; + struct ifla_vf_stats vf_stats; + int err; + + err = mlx5_eswitch_get_vport_stats(esw, rep->vport, &vf_stats); + if (err) { + pr_warn("vport %d error %d reading stats\n", rep->vport, err); + return; + } + + vport_stats = &priv->stats.vf_vport; + /* flip tx/rx as we are reporting the counters for the switch vport */ + vport_stats->rx_packets = vf_stats.tx_packets; + vport_stats->rx_bytes = vf_stats.tx_bytes; + vport_stats->tx_packets = vf_stats.rx_packets; + vport_stats->tx_bytes = vf_stats.rx_bytes; +} + +static void mlx5e_rep_update_sw_counters(struct mlx5e_priv *priv) +{ + struct mlx5e_sw_stats *s = &priv->stats.sw; + struct mlx5e_rq_stats *rq_stats; + struct mlx5e_sq_stats *sq_stats; + int i, j; + + memset(s, 0, 
sizeof(*s)); + for (i = 0; i < priv->channels.num; i++) { + struct mlx5e_channel *c = priv->channels.c[i]; + + rq_stats = &c->rq.stats; + + s->rx_packets += rq_stats->packets; + s->rx_bytes += rq_stats->bytes; + + for (j = 0; j < priv->channels.params.num_tc; j++) { + sq_stats = &c->sq[j].stats; + + s->tx_packets += sq_stats->packets; + s->tx_bytes += sq_stats->bytes; + } + } +} + +static void mlx5e_rep_update_stats(struct mlx5e_priv *priv) +{ + mlx5e_rep_update_sw_counters(priv); + mlx5e_rep_update_hw_counters(priv); +} + +static void mlx5e_rep_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int i; + + if (!data) + return; + + mutex_lock(&priv->state_lock); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + mlx5e_rep_update_sw_counters(priv); + mutex_unlock(&priv->state_lock); + + for (i = 0; i < NUM_VPORT_REP_COUNTERS; i++) + data[i] = MLX5E_READ_CTR64_CPU(&priv->stats.sw, + sw_rep_stats_desc, i); +} + +static int mlx5e_rep_get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return NUM_VPORT_REP_COUNTERS; + default: + return -EOPNOTSUPP; + } +} + +static const struct ethtool_ops mlx5e_rep_ethtool_ops = { + .get_drvinfo = mlx5e_rep_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_strings = mlx5e_rep_get_strings, + .get_sset_count = mlx5e_rep_get_sset_count, + .get_ethtool_stats = mlx5e_rep_get_ethtool_stats, +}; + +int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + if (esw->mode == SRIOV_NONE) + return -EOPNOTSUPP; + + switch (attr->id) { + case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: + attr->u.ppid.id_len = ETH_ALEN; + ether_addr_copy(attr->u.ppid.id, rep->hw_id); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_sq *rep_sq, *tmp; + struct mlx5e_rep_priv *rpriv; + + if (esw->mode != SRIOV_OFFLOADS) + return; + + rpriv = mlx5e_rep_to_rep_priv(rep); + list_for_each_entry_safe(rep_sq, tmp, &rpriv->vport_sqs_list, list) { + mlx5_eswitch_del_send_to_vport_rule(rep_sq->send_to_vport_rule); + list_del(&rep_sq->list); + kfree(rep_sq); + } +} + +static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep, + u32 *sqns_array, int sqns_num) +{ + struct mlx5_flow_handle *flow_rule; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_rep_sq *rep_sq; + int err; + int i; + + if (esw->mode != SRIOV_OFFLOADS) + return 0; + + rpriv = mlx5e_rep_to_rep_priv(rep); + for (i = 0; i < sqns_num; i++) { + rep_sq = kzalloc(sizeof(*rep_sq), GFP_KERNEL); + if (!rep_sq) { + err = -ENOMEM; + goto out_err; + } + + /* Add re-inject rule to the PF/representor sqs */ + flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw, + rep->vport, + sqns_array[i]); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + kfree(rep_sq); + goto out_err; + } + rep_sq->send_to_vport_rule = flow_rule; + list_add(&rep_sq->list, &rpriv->vport_sqs_list); + } + return 0; + +out_err: + mlx5e_sqs2vport_stop(esw, rep); + return err; +} + +int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct 
mlx5e_channel *c; + int n, tc, num_sqs = 0; + int err = -ENOMEM; + u32 *sqs; + + sqs = kcalloc(priv->channels.num * priv->channels.params.num_tc, sizeof(*sqs), GFP_KERNEL); + if (!sqs) + goto out; + + for (n = 0; n < priv->channels.num; n++) { + c = priv->channels.c[n]; + for (tc = 0; tc < c->num_tc; tc++) + sqs[num_sqs++] = c->sq[tc].sqn; + } + + err = mlx5e_sqs2vport_start(esw, rep, sqs, num_sqs); + kfree(sqs); + +out: + if (err) + netdev_warn(priv->netdev, "Failed to add SQs FWD rules %d\n", err); + return err; +} + +void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + + mlx5e_sqs2vport_stop(esw, rep); +} + +static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv) +{ +#if IS_ENABLED(CONFIG_IPV6) + unsigned long ipv6_interval = NEIGH_VAR(&nd_tbl.parms, + DELAY_PROBE_TIME); +#else + unsigned long ipv6_interval = ~0UL; +#endif + unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms, + DELAY_PROBE_TIME); + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + + rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval); + mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval); +} + +void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + + mlx5_fc_queue_stats_work(priv->mdev, + &neigh_update->neigh_stats_work, + neigh_update->min_interval); +} + +static void mlx5e_rep_neigh_stats_work(struct work_struct *work) +{ + struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv, + neigh_update.neigh_stats_work.work); + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_neigh_hash_entry *nhe; + + rtnl_lock(); + if (!list_empty(&rpriv->neigh_update.neigh_list)) + mlx5e_rep_queue_neigh_stats_work(priv); + + list_for_each_entry(nhe, &rpriv->neigh_update.neigh_list, neigh_list) + mlx5e_tc_update_neigh_used_value(nhe); + + rtnl_unlock(); +} + +static void mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe) +{ + refcount_inc(&nhe->refcnt); +} + +static void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe) +{ + if (refcount_dec_and_test(&nhe->refcnt)) + kfree(nhe); +} + +static void mlx5e_rep_update_flows(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + bool neigh_connected, + unsigned char ha[ETH_ALEN]) +{ + struct ethhdr *eth = (struct ethhdr *)e->encap_header; + + ASSERT_RTNL(); + + if ((!neigh_connected && (e->flags & MLX5_ENCAP_ENTRY_VALID)) || + !ether_addr_equal(e->h_dest, ha)) + mlx5e_tc_encap_flows_del(priv, e); + + if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) { + ether_addr_copy(e->h_dest, ha); + ether_addr_copy(eth->h_dest, ha); + + mlx5e_tc_encap_flows_add(priv, e); + } +} + +static void mlx5e_rep_neigh_update(struct work_struct *work) +{ + struct mlx5e_neigh_hash_entry *nhe = + container_of(work, struct mlx5e_neigh_hash_entry, neigh_update_work); + struct neighbour *n = nhe->n; + struct mlx5e_encap_entry *e; + unsigned char ha[ETH_ALEN]; + struct mlx5e_priv *priv; + bool neigh_connected; + bool encap_connected; + u8 nud_state, dead; + + rtnl_lock(); + + /* If these parameters are changed after we release the lock, + * we'll receive another event letting us know about it. 
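+ * The ha/nud_state/dead snapshot taken under n->lock below may therefore
+ * go stale as soon as the lock is dropped, which is acceptable.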
+ * We use this lock to avoid inconsistency between the neigh validity + * and it's hw address. + */ + read_lock_bh(&n->lock); + memcpy(ha, n->ha, ETH_ALEN); + nud_state = n->nud_state; + dead = n->dead; + read_unlock_bh(&n->lock); + + neigh_connected = (nud_state & NUD_VALID) && !dead; + + list_for_each_entry(e, &nhe->encap_list, encap_list) { + encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID); + priv = netdev_priv(e->out_dev); + + if (encap_connected != neigh_connected || + !ether_addr_equal(e->h_dest, ha)) + mlx5e_rep_update_flows(priv, e, neigh_connected, ha); + } + mlx5e_rep_neigh_entry_release(nhe); + rtnl_unlock(); + neigh_release(n); +} + +static struct mlx5e_neigh_hash_entry * +mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, + struct mlx5e_neigh *m_neigh); + +static int mlx5e_rep_netevent_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv, + neigh_update.netevent_nb); + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_neigh_hash_entry *nhe = NULL; + struct mlx5e_neigh m_neigh = {}; + struct neigh_parms *p; + struct neighbour *n; + bool found = false; + + switch (event) { + case NETEVENT_NEIGH_UPDATE: + n = ptr; +#if IS_ENABLED(CONFIG_IPV6) + if (n->tbl != &nd_tbl && n->tbl != &arp_tbl) +#else + if (n->tbl != &arp_tbl) +#endif + return NOTIFY_DONE; + + m_neigh.dev = n->dev; + m_neigh.family = n->ops->family; + memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len); + + /* We are in atomic context and can't take RTNL mutex, so use + * spin_lock_bh to lookup the neigh table. bh is used since + * netevent can be called from a softirq context. + */ + spin_lock_bh(&neigh_update->encap_lock); + nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh); + if (!nhe) { + spin_unlock_bh(&neigh_update->encap_lock); + return NOTIFY_DONE; + } + + /* This assignment is valid as long as the the neigh reference + * is taken + */ + nhe->n = n; + + /* Take a reference to ensure the neighbour and mlx5 encap + * entry won't be destructed until we drop the reference in + * delayed work. + */ + neigh_hold(n); + mlx5e_rep_neigh_entry_hold(nhe); + + if (!queue_work(priv->wq, &nhe->neigh_update_work)) { + mlx5e_rep_neigh_entry_release(nhe); + neigh_release(n); + } + spin_unlock_bh(&neigh_update->encap_lock); + break; + + case NETEVENT_DELAY_PROBE_TIME_UPDATE: + p = ptr; + + /* We check the device is present since we don't care about + * changes in the default table, we only care about changes + * done per device delay prob time parameter. + */ +#if IS_ENABLED(CONFIG_IPV6) + if (!p->dev || (p->tbl != &nd_tbl && p->tbl != &arp_tbl)) +#else + if (!p->dev || p->tbl != &arp_tbl) +#endif + return NOTIFY_DONE; + + /* We are in atomic context and can't take RTNL mutex, + * so use spin_lock_bh to walk the neigh list and look for + * the relevant device. bh is used since netevent can be + * called from a softirq context. 
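+ * Only the cached sampling interval is refreshed from this context; the
+ * heavier per-entry stats updates stay in the delayed neigh_stats_work.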
+ */ + spin_lock_bh(&neigh_update->encap_lock); + list_for_each_entry(nhe, &neigh_update->neigh_list, neigh_list) { + if (p->dev == nhe->m_neigh.dev) { + found = true; + break; + } + } + spin_unlock_bh(&neigh_update->encap_lock); + if (!found) + return NOTIFY_DONE; + + neigh_update->min_interval = min_t(unsigned long, + NEIGH_VAR(p, DELAY_PROBE_TIME), + neigh_update->min_interval); + mlx5_fc_update_sampling_interval(priv->mdev, + neigh_update->min_interval); + break; + } + return NOTIFY_DONE; +} + +static const struct rhashtable_params mlx5e_neigh_ht_params = { + .head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node), + .key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh), + .key_len = sizeof(struct mlx5e_neigh), + .automatic_shrinking = true, +}; + +static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + int err; + + err = rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params); + if (err) + return err; + + INIT_LIST_HEAD(&neigh_update->neigh_list); + spin_lock_init(&neigh_update->encap_lock); + INIT_DELAYED_WORK(&neigh_update->neigh_stats_work, + mlx5e_rep_neigh_stats_work); + mlx5e_rep_neigh_update_init_interval(rpriv); + + rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event; + err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb); + if (err) + goto out_err; + return 0; + +out_err: + rhashtable_destroy(&neigh_update->neigh_ht); + return err; +} + +static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + + unregister_netevent_notifier(&neigh_update->netevent_nb); + + flush_workqueue(priv->wq); /* flush neigh update works */ + + cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work); + + rhashtable_destroy(&neigh_update->neigh_ht); +} + +static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv, + struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + int err; + + err = rhashtable_insert_fast(&rpriv->neigh_update.neigh_ht, + &nhe->rhash_node, + mlx5e_neigh_ht_params); + if (err) + return err; + + list_add(&nhe->neigh_list, &rpriv->neigh_update.neigh_list); + + return err; +} + +static void mlx5e_rep_neigh_entry_remove(struct mlx5e_priv *priv, + struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + spin_lock_bh(&rpriv->neigh_update.encap_lock); + + list_del(&nhe->neigh_list); + + rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht, + &nhe->rhash_node, + mlx5e_neigh_ht_params); + spin_unlock_bh(&rpriv->neigh_update.encap_lock); +} + +/* This function must only be called under RTNL lock or under the + * representor's encap_lock in case RTNL mutex can't be held. 
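+ * (mlx5e_rep_netevent_event() above is the encap_lock caller of this
+ * lookup.)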
+ */ +static struct mlx5e_neigh_hash_entry * +mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, + struct mlx5e_neigh *m_neigh) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + + return rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh, + mlx5e_neigh_ht_params); +} + +static int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct mlx5e_neigh_hash_entry **nhe) +{ + int err; + + *nhe = kzalloc(sizeof(**nhe), GFP_KERNEL); + if (!*nhe) + return -ENOMEM; + + memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh)); + INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update); + INIT_LIST_HEAD(&(*nhe)->encap_list); + refcount_set(&(*nhe)->refcnt, 1); + + err = mlx5e_rep_neigh_entry_insert(priv, *nhe); + if (err) + goto out_free; + return 0; + +out_free: + kfree(*nhe); + return err; +} + +static void mlx5e_rep_neigh_entry_destroy(struct mlx5e_priv *priv, + struct mlx5e_neigh_hash_entry *nhe) +{ + /* The neigh hash entry must be removed from the hash table regardless + * of the reference count value, so it won't be found by the next + * neigh notification call. The neigh hash entry reference count is + * incremented only during creation and neigh notification calls and + * protects from freeing the nhe struct. + */ + mlx5e_rep_neigh_entry_remove(priv, nhe); + mlx5e_rep_neigh_entry_release(nhe); +} + +int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e) +{ + struct mlx5e_neigh_hash_entry *nhe; + int err; + + nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh); + if (!nhe) { + err = mlx5e_rep_neigh_entry_create(priv, e, &nhe); + if (err) + return err; + } + list_add(&e->encap_list, &nhe->encap_list); + return 0; +} + +void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e) +{ + struct mlx5e_neigh_hash_entry *nhe; + + list_del(&e->encap_list); + nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh); + + if (list_empty(&nhe->encap_list)) + mlx5e_rep_neigh_entry_destroy(priv, nhe); +} + +static int mlx5e_rep_open(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + int err; + + mutex_lock(&priv->state_lock); + err = mlx5e_open_locked(dev); + if (err) + goto unlock; + + if (!mlx5_modify_vport_admin_state(priv->mdev, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, + rep->vport, MLX5_ESW_VPORT_ADMIN_STATE_UP)) + netif_carrier_on(dev); + +unlock: + mutex_unlock(&priv->state_lock); + return err; +} + +static int mlx5e_rep_close(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + int ret; + + mutex_lock(&priv->state_lock); + mlx5_modify_vport_admin_state(priv->mdev, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, + rep->vport, MLX5_ESW_VPORT_ADMIN_STATE_DOWN); + ret = mlx5e_close_locked(dev); + mutex_unlock(&priv->state_lock); + return ret; +} + +static int mlx5e_rep_get_phys_port_name(struct net_device *dev, + char *buf, size_t len) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + int ret; + + ret = snprintf(buf, len, "%d", rep->vport - 1); + if (ret >= len) + return -EOPNOTSUPP; + + return 0; +} + +static int +mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv, + struct tc_cls_flower_offload 
*cls_flower) +{ + switch (cls_flower->command) { + case TC_CLSFLOWER_REPLACE: + return mlx5e_configure_flower(priv, cls_flower); + case TC_CLSFLOWER_DESTROY: + return mlx5e_delete_flower(priv, cls_flower); + case TC_CLSFLOWER_STATS: + return mlx5e_stats_flower(priv, cls_flower); + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct mlx5e_priv *priv = cb_priv; + + if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return mlx5e_rep_setup_tc_cls_flower(priv, type_data); + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_rep_setup_tc_block(struct net_device *dev, + struct tc_block_offload *f) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + switch (f->command) { + case TC_BLOCK_BIND: + return tcf_block_cb_register(f->block, mlx5e_rep_setup_tc_cb, + priv, priv); + case TC_BLOCK_UNBIND: + tcf_block_cb_unregister(f->block, mlx5e_rep_setup_tc_cb, priv); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + switch (type) { + case TC_SETUP_BLOCK: + return mlx5e_rep_setup_tc_block(dev, type_data); + default: + return -EOPNOTSUPP; + } +} + +bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep; + + if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager)) + return false; + + rep = rpriv->rep; + if (esw->mode == SRIOV_OFFLOADS && + rep && rep->vport == FDB_UPLINK_VPORT) + return true; + + return false; +} + +static bool mlx5e_is_vf_vport_rep(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + + if (rep && rep->vport != FDB_UPLINK_VPORT) + return true; + + return false; +} + +bool mlx5e_has_offload_stats(const struct net_device *dev, int attr_id) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + if (mlx5e_is_vf_vport_rep(priv) || mlx5e_is_uplink_rep(priv)) + return true; + } + + return false; +} + +static int +mlx5e_get_sw_stats64(const struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_sw_stats *sstats = &priv->stats.sw; + + stats->rx_packets = sstats->rx_packets; + stats->rx_bytes = sstats->rx_bytes; + stats->tx_packets = sstats->tx_packets; + stats->tx_bytes = sstats->tx_bytes; + + stats->tx_dropped = sstats->tx_queue_dropped; + + return 0; +} + +int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, + void *sp) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return mlx5e_get_sw_stats64(dev, sp); + } + + return -EINVAL; +} + +static void +mlx5e_rep_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + memcpy(stats, &priv->stats.vf_vport, sizeof(*stats)); +} + +static const struct switchdev_ops mlx5e_rep_switchdev_ops = { + .switchdev_port_attr_get = mlx5e_attr_get, +}; + +static const struct net_device_ops mlx5e_netdev_ops_rep = { + .ndo_open = mlx5e_rep_open, + .ndo_stop = mlx5e_rep_close, + .ndo_start_xmit = mlx5e_xmit, + .ndo_get_phys_port_name = mlx5e_rep_get_phys_port_name, + .ndo_setup_tc = mlx5e_rep_setup_tc, 
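+	/* stats report the e-switch side of the vport, see
+	 * mlx5e_rep_update_hw_counters()
+	 */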
+ .ndo_get_stats64 = mlx5e_rep_get_stats, + .ndo_has_offload_stats = mlx5e_has_offload_stats, + .ndo_get_offload_stats = mlx5e_get_offload_stats, +}; + +static void mlx5e_build_rep_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, u16 mtu) +{ + u8 cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? + MLX5_CQ_PERIOD_MODE_START_FROM_CQE : + MLX5_CQ_PERIOD_MODE_START_FROM_EQE; + + params->hard_mtu = MLX5E_ETH_HARD_MTU; + params->sw_mtu = mtu; + params->log_sq_size = MLX5E_REP_PARAMS_LOG_SQ_SIZE; + params->rq_wq_type = MLX5_WQ_TYPE_LINKED_LIST; + params->log_rq_mtu_frames = MLX5E_REP_PARAMS_LOG_RQ_SIZE; + + params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation); + mlx5e_set_rx_cq_mode_params(params, cq_period_mode); + + params->num_tc = 1; + params->lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ; + + mlx5_query_min_inline(mdev, ¶ms->tx_min_inline_mode); +} + +static void mlx5e_build_rep_netdev(struct net_device *netdev) +{ + netdev->netdev_ops = &mlx5e_netdev_ops_rep; + + netdev->watchdog_timeo = 15 * HZ; + + netdev->ethtool_ops = &mlx5e_rep_ethtool_ops; + + netdev->switchdev_ops = &mlx5e_rep_switchdev_ops; + + netdev->features |= NETIF_F_VLAN_CHALLENGED | NETIF_F_HW_TC | NETIF_F_NETNS_LOCAL; + netdev->hw_features |= NETIF_F_HW_TC; + + eth_hw_addr_random(netdev); +} + +static void mlx5e_init_rep(struct mlx5_core_dev *mdev, + struct net_device *netdev, + const struct mlx5e_profile *profile, + void *ppriv) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + priv->mdev = mdev; + priv->netdev = netdev; + priv->profile = profile; + priv->ppriv = ppriv; + + mutex_init(&priv->state_lock); + + INIT_DELAYED_WORK(&priv->update_stats_work, mlx5e_update_stats_work); + + priv->channels.params.num_channels = profile->max_nch(mdev); + + mlx5e_build_rep_params(mdev, &priv->channels.params, netdev->mtu); + mlx5e_build_rep_netdev(netdev); + + mlx5e_timestamp_init(priv); +} + +static int mlx5e_init_rep_rx(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5_flow_handle *flow_rule; + int err; + + mlx5e_init_l2_addr(priv); + + err = mlx5e_create_direct_rqts(priv); + if (err) + return err; + + err = mlx5e_create_direct_tirs(priv); + if (err) + goto err_destroy_direct_rqts; + + flow_rule = mlx5_eswitch_create_vport_rx_rule(esw, + rep->vport, + priv->direct_tir[0].tirn); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + goto err_destroy_direct_tirs; + } + rpriv->vport_rx_rule = flow_rule; + + err = mlx5e_tc_init(priv); + if (err) + goto err_del_flow_rule; + + return 0; + +err_del_flow_rule: + mlx5_del_flow_rules(rpriv->vport_rx_rule); +err_destroy_direct_tirs: + mlx5e_destroy_direct_tirs(priv); +err_destroy_direct_rqts: + mlx5e_destroy_direct_rqts(priv); + return err; +} + +static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + mlx5e_tc_cleanup(priv); + mlx5_del_flow_rules(rpriv->vport_rx_rule); + mlx5e_destroy_direct_tirs(priv); + mlx5e_destroy_direct_rqts(priv); +} + +static int mlx5e_init_rep_tx(struct mlx5e_priv *priv) +{ + int err; + + err = mlx5e_create_tises(priv); + if (err) { + mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err); + return err; + } + return 0; +} + +static int mlx5e_get_rep_max_num_channels(struct mlx5_core_dev *mdev) +{ +#define MLX5E_PORT_REPRESENTOR_NCH 1 + return MLX5E_PORT_REPRESENTOR_NCH; +} + +static const struct mlx5e_profile mlx5e_rep_profile = { 
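+	/* Minimal profile for e-switch vport representors: one channel, one TC,
+	 * the representor RX completion handler and no striding RQ or carrier
+	 * updates.
+	 */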
+ .init = mlx5e_init_rep, + .init_rx = mlx5e_init_rep_rx, + .cleanup_rx = mlx5e_cleanup_rep_rx, + .init_tx = mlx5e_init_rep_tx, + .cleanup_tx = mlx5e_cleanup_nic_tx, + .update_stats = mlx5e_rep_update_stats, + .max_nch = mlx5e_get_rep_max_num_channels, + .update_carrier = NULL, + .rx_handlers.handle_rx_cqe = mlx5e_handle_rx_cqe_rep, + .rx_handlers.handle_rx_cqe_mpwqe = NULL /* Not supported */, + .max_tc = 1, +}; + +/* e-Switch vport representors */ + +static int +mlx5e_nic_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + + int err; + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = mlx5e_add_sqs_fwd_rules(priv); + if (err) + return err; + } + + err = mlx5e_rep_neigh_init(rpriv); + if (err) + goto err_remove_sqs; + + return 0; + +err_remove_sqs: + mlx5e_remove_sqs_fwd_rules(priv); + return err; +} + +static void +mlx5e_nic_rep_unload(struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + mlx5e_remove_sqs_fwd_rules(priv); + + /* clean (and re-init) existing uplink offloaded TC rules */ + mlx5e_tc_cleanup(priv); + mlx5e_tc_init(priv); + + mlx5e_rep_neigh_cleanup(rpriv); +} + +static int +mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *uplink_rpriv; + struct mlx5e_rep_priv *rpriv; + struct net_device *netdev; + struct mlx5e_priv *upriv; + int err; + + rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL); + if (!rpriv) + return -ENOMEM; + + netdev = mlx5e_create_netdev(dev, &mlx5e_rep_profile, rpriv); + if (!netdev) { + pr_warn("Failed to create representor netdev for vport %d\n", + rep->vport); + kfree(rpriv); + return -EINVAL; + } + + rpriv->netdev = netdev; + rpriv->rep = rep; + rep->rep_if[REP_ETH].priv = rpriv; + INIT_LIST_HEAD(&rpriv->vport_sqs_list); + + err = mlx5e_attach_netdev(netdev_priv(netdev)); + if (err) { + pr_warn("Failed to attach representor netdev for vport %d\n", + rep->vport); + goto err_destroy_netdev; + } + + err = mlx5e_rep_neigh_init(rpriv); + if (err) { + pr_warn("Failed to initialized neighbours handling for vport %d\n", + rep->vport); + goto err_detach_netdev; + } + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(dev->priv.eswitch, REP_ETH); + upriv = netdev_priv(uplink_rpriv->netdev); + err = tc_setup_cb_egdev_register(netdev, mlx5e_setup_tc_block_cb, + upriv); + if (err) + goto err_neigh_cleanup; + + err = register_netdev(netdev); + if (err) { + pr_warn("Failed to register representor netdev for vport %d\n", + rep->vport); + goto err_egdev_cleanup; + } + + return 0; + +err_egdev_cleanup: + tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb, + upriv); + +err_neigh_cleanup: + mlx5e_rep_neigh_cleanup(rpriv); + +err_detach_netdev: + mlx5e_detach_netdev(netdev_priv(netdev)); + +err_destroy_netdev: + mlx5e_destroy_netdev(netdev_priv(netdev)); + kfree(rpriv); + return err; +} + +static void +mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_rep_priv *uplink_rpriv; + void *ppriv = priv->ppriv; + struct mlx5e_priv *upriv; + + unregister_netdev(netdev); + uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch, + REP_ETH); + upriv = 
netdev_priv(uplink_rpriv->netdev); + tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb, + upriv); + mlx5e_rep_neigh_cleanup(rpriv); + mlx5e_detach_netdev(priv); + mlx5e_destroy_netdev(priv); + kfree(ppriv); /* mlx5e_rep_priv */ +} + +static void *mlx5e_vport_rep_get_proto_dev(struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv; + + rpriv = mlx5e_rep_to_rep_priv(rep); + + return rpriv->netdev; +} + +static void mlx5e_rep_register_vf_vports(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + int total_vfs = MLX5_TOTAL_VPORTS(mdev); + int vport; + + for (vport = 1; vport < total_vfs; vport++) { + struct mlx5_eswitch_rep_if rep_if = {}; + + rep_if.load = mlx5e_vport_rep_load; + rep_if.unload = mlx5e_vport_rep_unload; + rep_if.get_proto_dev = mlx5e_vport_rep_get_proto_dev; + mlx5_eswitch_register_vport_rep(esw, vport, &rep_if, REP_ETH); + } +} + +static void mlx5e_rep_unregister_vf_vports(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + int total_vfs = MLX5_TOTAL_VPORTS(mdev); + int vport; + + for (vport = 1; vport < total_vfs; vport++) + mlx5_eswitch_unregister_vport_rep(esw, vport, REP_ETH); +} + +void mlx5e_register_vport_reps(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + struct mlx5_eswitch_rep_if rep_if; + struct mlx5e_rep_priv *rpriv; + + rpriv = priv->ppriv; + rpriv->netdev = priv->netdev; + + rep_if.load = mlx5e_nic_rep_load; + rep_if.unload = mlx5e_nic_rep_unload; + rep_if.get_proto_dev = mlx5e_vport_rep_get_proto_dev; + rep_if.priv = rpriv; + INIT_LIST_HEAD(&rpriv->vport_sqs_list); + mlx5_eswitch_register_vport_rep(esw, 0, &rep_if, REP_ETH); /* UPLINK PF vport*/ + + mlx5e_rep_register_vf_vports(priv); /* VFs vports */ +} + +void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + + mlx5e_rep_unregister_vf_vports(priv); /* VFs vports */ + mlx5_eswitch_unregister_vport_rep(esw, 0, REP_ETH); /* UPLINK PF*/ +} + +void *mlx5e_alloc_nic_rep_priv(struct mlx5_core_dev *mdev) +{ + struct mlx5_eswitch *esw = mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv; + + rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL); + if (!rpriv) + return NULL; + + rpriv->rep = &esw->offloads.vport_reps[0]; + return rpriv; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h new file mode 100644 index 0000000..b9b481f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5E_REP_H__ +#define __MLX5E_REP_H__ + +#include <net/ip_tunnels.h> +#include <linux/rhashtable.h> +#include "eswitch.h" +#include "en.h" + +#ifdef CONFIG_MLX5_ESWITCH +struct mlx5e_neigh_update_table { + struct rhashtable neigh_ht; + /* Save the neigh hash entries in a list in addition to the hash table + * (neigh_ht), in order to iterate easily over the neigh entries. + * Used for stats query. + */ + struct list_head neigh_list; + /* protect lookup/remove operations */ + spinlock_t encap_lock; + struct notifier_block netevent_nb; + struct delayed_work neigh_stats_work; + unsigned long min_interval; /* jiffies */ +}; + +struct mlx5e_rep_priv { + struct mlx5_eswitch_rep *rep; + struct mlx5e_neigh_update_table neigh_update; + struct net_device *netdev; + struct mlx5_flow_handle *vport_rx_rule; + struct list_head vport_sqs_list; +}; + +static inline +struct mlx5e_rep_priv *mlx5e_rep_to_rep_priv(struct mlx5_eswitch_rep *rep) +{ + return (struct mlx5e_rep_priv *)rep->rep_if[REP_ETH].priv; +} + +struct mlx5e_neigh { + struct net_device *dev; + union { + __be32 v4; + struct in6_addr v6; + } dst_ip; + int family; +}; + +struct mlx5e_neigh_hash_entry { + struct rhash_head rhash_node; + struct mlx5e_neigh m_neigh; + + /* Save the neigh hash entry in a list on the representor in + * addition to the hash table, in order to iterate easily over the + * neighbour entries. Used for stats query. + */ + struct list_head neigh_list; + + /* encap list sharing the same neigh */ + struct list_head encap_list; + + /* valid only when the neigh reference is taken during + * neigh_update_work workqueue callback. + */ + struct neighbour *n; + struct work_struct neigh_update_work; + + /* neigh hash entry can be deleted only when the refcount is zero. + * refcount is needed to avoid neigh hash entry removal by TC, while + * it's used by the neigh notification call. + */ + refcount_t refcnt; + + /* Save the last reported time offloaded traffic passed over one of the + * neigh hash entry flows. Use it to periodically update the neigh + * 'used' value and avoid neigh deleting by the kernel. 
+ */ + unsigned long reported_lastuse; +}; + +enum { + /* set when the encap entry is successfully offloaded into HW */ + MLX5_ENCAP_ENTRY_VALID = BIT(0), +}; + +struct mlx5e_encap_entry { + /* neigh hash entry list of encaps sharing the same neigh */ + struct list_head encap_list; + struct mlx5e_neigh m_neigh; + /* a node of the eswitch encap hash table which keeping all the encap + * entries + */ + struct hlist_node encap_hlist; + struct list_head flows; + u32 encap_id; + struct ip_tunnel_info tun_info; + unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ + + struct net_device *out_dev; + int tunnel_type; + u8 flags; + char *encap_header; + int encap_size; +}; + +struct mlx5e_rep_sq { + struct mlx5_flow_handle *send_to_vport_rule; + struct list_head list; +}; + +void *mlx5e_alloc_nic_rep_priv(struct mlx5_core_dev *mdev); +void mlx5e_register_vport_reps(struct mlx5e_priv *priv); +void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv); +bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv); +int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv); +void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv); + +int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, void *sp); +bool mlx5e_has_offload_stats(const struct net_device *dev, int attr_id); + +int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr); +void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); + +int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e); +void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e); + +void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv); +#else /* CONFIG_MLX5_ESWITCH */ +static inline void mlx5e_register_vport_reps(struct mlx5e_priv *priv) {} +static inline void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv) {} +static inline bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv) { return false; } +static inline int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv) { return 0; } +static inline void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv) {} +#endif + +#endif /* __MLX5E_REP_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c new file mode 100644 index 0000000..1ff0b0e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -0,0 +1,1413 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "en.h" +#include "en_tc.h" +#include "eswitch.h" +#include "en_rep.h" +#include "ipoib/ipoib.h" +#include "en_accel/ipsec_rxtx.h" +#include "lib/clock.h" + +static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config) +{ + return config->rx_filter == HWTSTAMP_FILTER_ALL; +} + +static inline void mlx5e_read_cqe_slot(struct mlx5e_cq *cq, u32 cqcc, + void *data) +{ + u32 ci = cqcc & cq->wq.fbc.sz_m1; + + memcpy(data, mlx5_cqwq_get_wqe(&cq->wq, ci), sizeof(struct mlx5_cqe64)); +} + +static inline void mlx5e_read_title_slot(struct mlx5e_rq *rq, + struct mlx5e_cq *cq, u32 cqcc) +{ + mlx5e_read_cqe_slot(cq, cqcc, &cq->title); + cq->decmprs_left = be32_to_cpu(cq->title.byte_cnt); + cq->decmprs_wqe_counter = be16_to_cpu(cq->title.wqe_counter); + rq->stats.cqe_compress_blks++; +} + +static inline void mlx5e_read_mini_arr_slot(struct mlx5e_cq *cq, u32 cqcc) +{ + mlx5e_read_cqe_slot(cq, cqcc, cq->mini_arr); + cq->mini_arr_idx = 0; +} + +static inline void mlx5e_cqes_update_owner(struct mlx5e_cq *cq, u32 cqcc, int n) +{ + struct mlx5_frag_buf_ctrl *fbc = &cq->wq.fbc; + u8 op_own = (cqcc >> fbc->log_sz) & 1; + u32 wq_sz = 1 << fbc->log_sz; + u32 ci = cqcc & fbc->sz_m1; + u32 ci_top = min_t(u32, wq_sz, ci + n); + + for (; ci < ci_top; ci++, n--) { + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, ci); + + cqe->op_own = op_own; + } + + if (unlikely(ci == wq_sz)) { + op_own = !op_own; + for (ci = 0; ci < n; ci++) { + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, ci); + + cqe->op_own = op_own; + } + } +} + +static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq, + struct mlx5e_cq *cq, u32 cqcc) +{ + cq->title.byte_cnt = cq->mini_arr[cq->mini_arr_idx].byte_cnt; + cq->title.check_sum = cq->mini_arr[cq->mini_arr_idx].checksum; + cq->title.op_own &= 0xf0; + cq->title.op_own |= 0x01 & (cqcc >> cq->wq.fbc.log_sz); + cq->title.wqe_counter = cpu_to_be16(cq->decmprs_wqe_counter); + + if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) + cq->decmprs_wqe_counter += + mpwrq_get_cqe_consumed_strides(&cq->title); + else + cq->decmprs_wqe_counter = + (cq->decmprs_wqe_counter + 1) & rq->wq.sz_m1; +} + +static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq, + struct mlx5e_cq *cq, u32 cqcc) +{ + mlx5e_decompress_cqe(rq, cq, cqcc); + cq->title.rss_hash_type = 0; + cq->title.rss_hash_result = 0; +} + +static inline u32 mlx5e_decompress_cqes_cont(struct mlx5e_rq *rq, + struct mlx5e_cq *cq, + int update_owner_only, + int budget_rem) +{ + u32 cqcc = cq->wq.cc + update_owner_only; + u32 cqe_count; + u32 i; + + cqe_count = min_t(u32, cq->decmprs_left, budget_rem); + + for (i = update_owner_only; i < cqe_count; + i++, cq->mini_arr_idx++, cqcc++) { + if (cq->mini_arr_idx == MLX5_MINI_CQE_ARRAY_SIZE) + mlx5e_read_mini_arr_slot(cq, cqcc); + + mlx5e_decompress_cqe_no_hash(rq, cq, cqcc); + rq->handle_rx_cqe(rq, &cq->title); + } + mlx5e_cqes_update_owner(cq, cq->wq.cc, cqcc - cq->wq.cc); + cq->wq.cc = cqcc; + cq->decmprs_left -= cqe_count; + rq->stats.cqe_compress_pkts += cqe_count; + + return cqe_count; +} + +static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq, + struct mlx5e_cq *cq, + int budget_rem) +{ + 
mlx5e_read_title_slot(rq, cq, cq->wq.cc); + mlx5e_read_mini_arr_slot(cq, cq->wq.cc + 1); + mlx5e_decompress_cqe(rq, cq, cq->wq.cc); + rq->handle_rx_cqe(rq, &cq->title); + cq->mini_arr_idx++; + + return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1; +} + +#define RQ_PAGE_SIZE(rq) ((1 << rq->buff.page_order) << PAGE_SHIFT) + +static inline bool mlx5e_page_is_reserved(struct page *page) +{ + return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id(); +} + +static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + struct mlx5e_page_cache *cache = &rq->page_cache; + u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1); + + if (tail_next == cache->head) { + rq->stats.cache_full++; + return false; + } + + if (unlikely(mlx5e_page_is_reserved(dma_info->page))) { + rq->stats.cache_waive++; + return false; + } + + cache->page_cache[cache->tail] = *dma_info; + cache->tail = tail_next; + return true; +} + +static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + struct mlx5e_page_cache *cache = &rq->page_cache; + + if (unlikely(cache->head == cache->tail)) { + rq->stats.cache_empty++; + return false; + } + + if (page_ref_count(cache->page_cache[cache->head].page) != 1) { + rq->stats.cache_busy++; + return false; + } + + *dma_info = cache->page_cache[cache->head]; + cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1); + rq->stats.cache_reuse++; + + dma_sync_single_for_device(rq->pdev, dma_info->addr, + RQ_PAGE_SIZE(rq), + DMA_FROM_DEVICE); + return true; +} + +static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + if (mlx5e_rx_cache_get(rq, dma_info)) + return 0; + + dma_info->page = dev_alloc_pages(rq->buff.page_order); + if (unlikely(!dma_info->page)) + return -ENOMEM; + + dma_info->addr = dma_map_page(rq->pdev, dma_info->page, 0, + RQ_PAGE_SIZE(rq), rq->buff.map_dir); + if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) { + put_page(dma_info->page); + dma_info->page = NULL; + return -ENOMEM; + } + + return 0; +} + +void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info, + bool recycle) +{ + if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info)) + return; + + dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq), + rq->buff.map_dir); + put_page(dma_info->page); +} + +static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq, + struct mlx5e_wqe_frag_info *wi) +{ + return rq->wqe.page_reuse && wi->di.page && + (wi->offset + rq->wqe.frag_sz <= RQ_PAGE_SIZE(rq)) && + !mlx5e_page_is_reserved(wi->di.page); +} + +static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix) +{ + struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix]; + + /* check if page exists, hence can be reused */ + if (!wi->di.page) { + if (unlikely(mlx5e_page_alloc_mapped(rq, &wi->di))) + return -ENOMEM; + wi->offset = 0; + } + + wqe->data.addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom); + return 0; +} + +static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq, + struct mlx5e_wqe_frag_info *wi) +{ + mlx5e_page_release(rq, &wi->di, true); + wi->di.page = NULL; +} + +static inline void mlx5e_free_rx_wqe_reuse(struct mlx5e_rq *rq, + struct mlx5e_wqe_frag_info *wi) +{ + if (mlx5e_page_reuse(rq, wi)) { + rq->stats.page_reuse++; + return; + } + + mlx5e_free_rx_wqe(rq, wi); +} + +void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix) +{ + struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix]; + + if 
(wi->di.page) + mlx5e_free_rx_wqe(rq, wi); +} + +static inline void mlx5e_add_skb_frag_mpwqe(struct mlx5e_rq *rq, + struct sk_buff *skb, + struct mlx5e_dma_info *di, + u32 frag_offset, u32 len) +{ + unsigned int truesize = ALIGN(len, BIT(rq->mpwqe.log_stride_sz)); + + dma_sync_single_for_cpu(rq->pdev, + di->addr + frag_offset, + len, DMA_FROM_DEVICE); + page_ref_inc(di->page); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + di->page, frag_offset, len, truesize); +} + +static inline void +mlx5e_copy_skb_header_mpwqe(struct device *pdev, + struct sk_buff *skb, + struct mlx5e_dma_info *dma_info, + u32 offset, u32 headlen) +{ + u16 headlen_pg = min_t(u32, headlen, PAGE_SIZE - offset); + unsigned int len; + + /* Aligning len to sizeof(long) optimizes memcpy performance */ + len = ALIGN(headlen_pg, sizeof(long)); + dma_sync_single_for_cpu(pdev, dma_info->addr + offset, len, + DMA_FROM_DEVICE); + skb_copy_to_linear_data(skb, page_address(dma_info->page) + offset, len); + + if (unlikely(offset + headlen > PAGE_SIZE)) { + dma_info++; + headlen_pg = len; + len = ALIGN(headlen - headlen_pg, sizeof(long)); + dma_sync_single_for_cpu(pdev, dma_info->addr, len, + DMA_FROM_DEVICE); + skb_copy_to_linear_data_offset(skb, headlen_pg, + page_address(dma_info->page), + len); + } +} + +void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi) +{ + const bool no_xdp_xmit = + bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); + struct mlx5e_dma_info *dma_info = wi->umr.dma_info; + int i; + + for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) + if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap)) + mlx5e_page_release(rq, &dma_info[i], true); +} + +static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq) +{ + struct mlx5_wq_ll *wq = &rq->wq; + struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head); + + rq->mpwqe.umr_in_progress = false; + + mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index)); + + /* ensure wqes are visible to device before updating doorbell record */ + dma_wmb(); + + mlx5_wq_ll_update_db_record(wq); +} + +static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq *sq) +{ + return sq->pc >> MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE; +} + +static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) +{ + struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix]; + struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0]; + struct mlx5e_icosq *sq = &rq->channel->icosq; + struct mlx5_wq_cyc *wq = &sq->wq; + struct mlx5e_umr_wqe *umr_wqe; + u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1); + int err; + u16 pi; + int i; + + /* fill sq edge with nops to avoid wqe wrap around */ + while ((pi = (sq->pc & wq->sz_m1)) > sq->edge) { + sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_NOP; + mlx5e_post_nop(wq, sq->sqn, &sq->pc); + } + + umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi); + if (unlikely(mlx5e_icosq_wrap_cnt(sq) < 2)) + memcpy(umr_wqe, &rq->mpwqe.umr_wqe, + offsetof(struct mlx5e_umr_wqe, inline_mtts)); + + for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) { + err = mlx5e_page_alloc_mapped(rq, dma_info); + if (unlikely(err)) + goto err_unmap; + umr_wqe->inline_mtts[i].ptag = cpu_to_be64(dma_info->addr | MLX5_EN_WR); + } + + bitmap_zero(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); + wi->consumed_strides = 0; + + rq->mpwqe.umr_in_progress = true; + + umr_wqe->ctrl.opmod_idx_opcode = + cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | + MLX5_OPCODE_UMR); + umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset); + + sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR; + sq->pc += MLX5E_UMR_WQEBBS; 
+ mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &umr_wqe->ctrl); + + return 0; + +err_unmap: + while (--i >= 0) { + dma_info--; + mlx5e_page_release(rq, dma_info, true); + } + rq->stats.buff_alloc_err++; + + return err; +} + +void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) +{ + struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix]; + + mlx5e_free_rx_mpwqe(rq, wi); +} + +bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq) +{ + struct mlx5_wq_ll *wq = &rq->wq; + int err; + + if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_ENABLED))) + return false; + + if (mlx5_wq_ll_is_full(wq)) + return false; + + do { + struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head); + + err = mlx5e_alloc_rx_wqe(rq, wqe, wq->head); + if (unlikely(err)) { + rq->stats.buff_alloc_err++; + break; + } + + mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index)); + } while (!mlx5_wq_ll_is_full(wq)); + + /* ensure wqes are visible to device before updating doorbell record */ + dma_wmb(); + + mlx5_wq_ll_update_db_record(wq); + + return !!err; +} + +static inline void mlx5e_poll_ico_single_cqe(struct mlx5e_cq *cq, + struct mlx5e_icosq *sq, + struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 ci = be16_to_cpu(cqe->wqe_counter) & wq->sz_m1; + struct mlx5e_sq_wqe_info *icowi = &sq->db.ico_wqe[ci]; + + mlx5_cqwq_pop(&cq->wq); + + if (unlikely((cqe->op_own >> 4) != MLX5_CQE_REQ)) { + netdev_WARN_ONCE(cq->channel->netdev, + "Bad OP in ICOSQ CQE: 0x%x\n", cqe->op_own); + return; + } + + if (likely(icowi->opcode == MLX5_OPCODE_UMR)) { + mlx5e_post_rx_mpwqe(rq); + return; + } + + if (unlikely(icowi->opcode != MLX5_OPCODE_NOP)) + netdev_WARN_ONCE(cq->channel->netdev, + "Bad OPCODE in ICOSQ WQE info: 0x%x\n", icowi->opcode); +} + +static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq) +{ + struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq); + struct mlx5_cqe64 *cqe; + + if (unlikely(!MLX5E_TEST_BIT(sq->state, MLX5E_SQ_STATE_ENABLED))) + return; + + cqe = mlx5_cqwq_get_cqe(&cq->wq); + if (likely(!cqe)) + return; + + /* by design, there's only a single cqe */ + mlx5e_poll_ico_single_cqe(cq, sq, rq, cqe); + + mlx5_cqwq_update_db_record(&cq->wq); +} + +bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq) +{ + struct mlx5_wq_ll *wq = &rq->wq; + + if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_ENABLED))) + return false; + + mlx5e_poll_ico_cq(&rq->channel->icosq.cq, rq); + + if (mlx5_wq_ll_is_full(wq)) + return false; + + if (!rq->mpwqe.umr_in_progress) + mlx5e_alloc_rx_mpwqe(rq, wq->head); + + return false; +} + +static void mlx5e_lro_update_tcp_hdr(struct mlx5_cqe64 *cqe, struct tcphdr *tcp) +{ + u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe); + u8 tcp_ack = (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) || + (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA); + + tcp->check = 0; + tcp->psh = get_cqe_lro_tcppsh(cqe); + + if (tcp_ack) { + tcp->ack = 1; + tcp->ack_seq = cqe->lro_ack_seq_num; + tcp->window = cqe->lro_tcp_win; + } +} + +static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe, + u32 cqe_bcnt) +{ + struct ethhdr *eth = (struct ethhdr *)(skb->data); + struct tcphdr *tcp; + int network_depth = 0; + __wsum check; + __be16 proto; + u16 tot_len; + void *ip_p; + + proto = __vlan_get_protocol(skb, eth->h_proto, &network_depth); + + tot_len = cqe_bcnt - network_depth; + ip_p = skb->data + network_depth; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *ipv4 = ip_p; + + tcp = ip_p + sizeof(struct iphdr); + skb_shinfo(skb)->gso_type = 
SKB_GSO_TCPV4; + + ipv4->ttl = cqe->lro_min_ttl; + ipv4->tot_len = cpu_to_be16(tot_len); + ipv4->check = 0; + ipv4->check = ip_fast_csum((unsigned char *)ipv4, + ipv4->ihl); + + mlx5e_lro_update_tcp_hdr(cqe, tcp); + check = csum_partial(tcp, tcp->doff * 4, + csum_unfold((__force __sum16)cqe->check_sum)); + /* Almost done, don't forget the pseudo header */ + tcp->check = csum_tcpudp_magic(ipv4->saddr, ipv4->daddr, + tot_len - sizeof(struct iphdr), + IPPROTO_TCP, check); + } else { + u16 payload_len = tot_len - sizeof(struct ipv6hdr); + struct ipv6hdr *ipv6 = ip_p; + + tcp = ip_p + sizeof(struct ipv6hdr); + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + + ipv6->hop_limit = cqe->lro_min_ttl; + ipv6->payload_len = cpu_to_be16(payload_len); + + mlx5e_lro_update_tcp_hdr(cqe, tcp); + check = csum_partial(tcp, tcp->doff * 4, + csum_unfold((__force __sum16)cqe->check_sum)); + /* Almost done, don't forget the pseudo header */ + tcp->check = csum_ipv6_magic(&ipv6->saddr, &ipv6->daddr, payload_len, + IPPROTO_TCP, check); + } +} + +static inline void mlx5e_skb_set_hash(struct mlx5_cqe64 *cqe, + struct sk_buff *skb) +{ + u8 cht = cqe->rss_hash_type; + int ht = (cht & CQE_RSS_HTYPE_L4) ? PKT_HASH_TYPE_L4 : + (cht & CQE_RSS_HTYPE_IP) ? PKT_HASH_TYPE_L3 : + PKT_HASH_TYPE_NONE; + skb_set_hash(skb, be32_to_cpu(cqe->rss_hash_result), ht); +} + +static inline bool is_last_ethertype_ip(struct sk_buff *skb, int *network_depth) +{ + __be16 ethertype = ((struct ethhdr *)skb->data)->h_proto; + + ethertype = __vlan_get_protocol(skb, ethertype, network_depth); + return (ethertype == htons(ETH_P_IP) || ethertype == htons(ETH_P_IPV6)); +} + +static __be32 mlx5e_get_fcs(struct sk_buff *skb) +{ + int last_frag_sz, bytes_in_prev, nr_frags; + u8 *fcs_p1, *fcs_p2; + skb_frag_t *last_frag; + __be32 fcs_bytes; + + if (!skb_is_nonlinear(skb)) + return *(__be32 *)(skb->data + skb->len - ETH_FCS_LEN); + + nr_frags = skb_shinfo(skb)->nr_frags; + last_frag = &skb_shinfo(skb)->frags[nr_frags - 1]; + last_frag_sz = skb_frag_size(last_frag); + + /* If all FCS data is in last frag */ + if (last_frag_sz >= ETH_FCS_LEN) + return *(__be32 *)(skb_frag_address(last_frag) + + last_frag_sz - ETH_FCS_LEN); + + fcs_p2 = (u8 *)skb_frag_address(last_frag); + bytes_in_prev = ETH_FCS_LEN - last_frag_sz; + + /* Find where the other part of the FCS is - Linear or another frag */ + if (nr_frags == 1) { + fcs_p1 = skb_tail_pointer(skb); + } else { + skb_frag_t *prev_frag = &skb_shinfo(skb)->frags[nr_frags - 2]; + + fcs_p1 = skb_frag_address(prev_frag) + + skb_frag_size(prev_frag); + } + fcs_p1 -= bytes_in_prev; + + memcpy(&fcs_bytes, fcs_p1, bytes_in_prev); + memcpy(((u8 *)&fcs_bytes) + bytes_in_prev, fcs_p2, last_frag_sz); + + return fcs_bytes; +} + +static inline void mlx5e_handle_csum(struct net_device *netdev, + struct mlx5_cqe64 *cqe, + struct mlx5e_rq *rq, + struct sk_buff *skb, + bool lro) +{ + int network_depth = 0; + + if (unlikely(!(netdev->features & NETIF_F_RXCSUM))) + goto csum_none; + + if (lro) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + rq->stats.csum_unnecessary++; + return; + } + + if (likely(is_last_ethertype_ip(skb, &network_depth))) { + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum = csum_unfold((__force __sum16)cqe->check_sum); + if (network_depth > ETH_HLEN) + /* CQE csum is calculated from the IP header and does + * not cover VLAN headers (if present). This will add + * the checksum manually. 
+ */ + skb->csum = csum_partial(skb->data + ETH_HLEN, + network_depth - ETH_HLEN, + skb->csum); + if (unlikely(netdev->features & NETIF_F_RXFCS)) + skb->csum = csum_add(skb->csum, + (__force __wsum)mlx5e_get_fcs(skb)); + rq->stats.csum_complete++; + return; + } + + if (likely((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (cqe_is_tunneled(cqe)) { + skb->csum_level = 1; + skb->encapsulation = 1; + rq->stats.csum_unnecessary_inner++; + return; + } + rq->stats.csum_unnecessary++; + return; + } +csum_none: + skb->ip_summed = CHECKSUM_NONE; + rq->stats.csum_none++; +} + +static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, + u32 cqe_bcnt, + struct mlx5e_rq *rq, + struct sk_buff *skb) +{ + struct net_device *netdev = rq->netdev; + int lro_num_seg; + + skb->mac_len = ETH_HLEN; + lro_num_seg = be32_to_cpu(cqe->srqn) >> 24; + if (lro_num_seg > 1) { + mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt); + skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg); + /* Subtract one since we already counted this as one + * "regular" packet in mlx5e_complete_rx_cqe() + */ + rq->stats.packets += lro_num_seg - 1; + rq->stats.lro_packets++; + rq->stats.lro_bytes += cqe_bcnt; + } + + if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) + skb_hwtstamps(skb)->hwtstamp = + mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); + + skb_record_rx_queue(skb, rq->ix); + + if (likely(netdev->features & NETIF_F_RXHASH)) + mlx5e_skb_set_hash(cqe, skb); + + if (cqe_has_vlan(cqe)) { + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), + be16_to_cpu(cqe->vlan_info)); + rq->stats.removed_vlan_packets++; + } + + skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK; + + mlx5e_handle_csum(netdev, cqe, rq, skb, !!lro_num_seg); + skb->protocol = eth_type_trans(skb, netdev); +} + +static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe, + u32 cqe_bcnt, + struct sk_buff *skb) +{ + rq->stats.packets++; + rq->stats.bytes += cqe_bcnt; + mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb); +} + +static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_xdpsq *sq) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + struct mlx5e_tx_wqe *wqe; + u16 pi = (sq->pc - 1) & wq->sz_m1; /* last pi */ + + wqe = mlx5_wq_cyc_get_wqe(wq, pi); + + mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &wqe->ctrl); +} + +static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq, + struct mlx5e_dma_info *di, + const struct xdp_buff *xdp) +{ + struct mlx5e_xdpsq *sq = &rq->xdpsq; + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi = sq->pc & wq->sz_m1; + struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); + + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + struct mlx5_wqe_eth_seg *eseg = &wqe->eth; + struct mlx5_wqe_data_seg *dseg; + + ptrdiff_t data_offset = xdp->data - xdp->data_hard_start; + dma_addr_t dma_addr = di->addr + data_offset; + unsigned int dma_len = xdp->data_end - xdp->data; + + prefetchw(wqe); + + if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE || rq->hw_mtu < dma_len)) { + rq->stats.xdp_drop++; + return false; + } + + if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1))) { + if (sq->db.doorbell) { + /* SQ is full, ring doorbell */ + mlx5e_xmit_xdp_doorbell(sq); + sq->db.doorbell = false; + } + rq->stats.xdp_tx_full++; + return false; + } + + dma_sync_single_for_device(sq->pdev, dma_addr, dma_len, PCI_DMA_TODEVICE); + + cseg->fm_ce_se = 0; + + dseg = (struct mlx5_wqe_data_seg *)eseg + 1; + + /* copy the inline part if required */ + if (sq->min_inline_mode != 
MLX5_INLINE_MODE_NONE) { + memcpy(eseg->inline_hdr.start, xdp->data, MLX5E_XDP_MIN_INLINE); + eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE); + dma_len -= MLX5E_XDP_MIN_INLINE; + dma_addr += MLX5E_XDP_MIN_INLINE; + dseg++; + } + + /* write the dma part */ + dseg->addr = cpu_to_be64(dma_addr); + dseg->byte_count = cpu_to_be32(dma_len); + + cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SEND); + + /* move page to reference to sq responsibility, + * and mark so it's not put back in page-cache. + */ + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */ + sq->db.di[pi] = *di; + sq->pc++; + + sq->db.doorbell = true; + + rq->stats.xdp_tx++; + return true; +} + +/* returns true if packet was consumed by xdp */ +static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, + struct mlx5e_dma_info *di, + void *va, u16 *rx_headroom, u32 *len) +{ + const struct bpf_prog *prog = READ_ONCE(rq->xdp_prog); + struct xdp_buff xdp; + u32 act; + + if (!prog) + return false; + + xdp.data = va + *rx_headroom; + xdp_set_data_meta_invalid(&xdp); + xdp.data_end = xdp.data + *len; + xdp.data_hard_start = va; + xdp.rxq = &rq->xdp_rxq; + + act = bpf_prog_run_xdp(prog, &xdp); + switch (act) { + case XDP_PASS: + *rx_headroom = xdp.data - xdp.data_hard_start; + *len = xdp.data_end - xdp.data; + return false; + case XDP_TX: + if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp))) + trace_xdp_exception(rq->netdev, prog, act); + return true; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(rq->netdev, prog, act); + case XDP_DROP: + rq->stats.xdp_drop++; + return true; + } +} + +static inline +struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va, + u32 frag_size, u16 headroom, + u32 cqe_bcnt) +{ + struct sk_buff *skb = build_skb(va, frag_size); + + if (unlikely(!skb)) { + rq->stats.buff_alloc_err++; + return NULL; + } + + skb_reserve(skb, headroom); + skb_put(skb, cqe_bcnt); + + return skb; +} + +static inline +struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, + struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt) +{ + struct mlx5e_dma_info *di = &wi->di; + u16 rx_headroom = rq->buff.headroom; + struct sk_buff *skb; + void *va, *data; + bool consumed; + u32 frag_size; + + va = page_address(di->page) + wi->offset; + data = va + rx_headroom; + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); + + dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset, + frag_size, DMA_FROM_DEVICE); + prefetch(data); + wi->offset += frag_size; + + if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) { + rq->stats.wqe_err++; + return NULL; + } + + rcu_read_lock(); + consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt); + rcu_read_unlock(); + if (consumed) + return NULL; /* page/packet was consumed by XDP */ + + skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt); + if (unlikely(!skb)) + return NULL; + + /* queue up for recycling/reuse */ + page_ref_inc(di->page); + + return skb; +} + +void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5e_wqe_frag_info *wi; + struct mlx5e_rx_wqe *wqe; + __be16 wqe_counter_be; + struct sk_buff *skb; + u16 wqe_counter; + u32 cqe_bcnt; + + wqe_counter_be = cqe->wqe_counter; + wqe_counter = be16_to_cpu(wqe_counter_be); + wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter); + wi = &rq->wqe.frag_info[wqe_counter]; + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt); + if (!skb) { + /* probably for XDP */ + if 
(__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { + wi->di.page = NULL; + /* do not return page to cache, it will be returned on XDP_TX completion */ + goto wq_ll_pop; + } + /* probably an XDP_DROP, save the page-reuse checks */ + mlx5e_free_rx_wqe(rq, wi); + goto wq_ll_pop; + } + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + napi_gro_receive(rq->cq.napi, skb); + + mlx5e_free_rx_wqe_reuse(rq, wi); +wq_ll_pop: + mlx5_wq_ll_pop(&rq->wq, wqe_counter_be, + &wqe->next.next_wqe_index); +} + +#ifdef CONFIG_MLX5_ESWITCH +void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct net_device *netdev = rq->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5e_wqe_frag_info *wi; + struct mlx5e_rx_wqe *wqe; + struct sk_buff *skb; + __be16 wqe_counter_be; + u16 wqe_counter; + u32 cqe_bcnt; + + wqe_counter_be = cqe->wqe_counter; + wqe_counter = be16_to_cpu(wqe_counter_be); + wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter); + wi = &rq->wqe.frag_info[wqe_counter]; + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt); + if (!skb) { + if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { + wi->di.page = NULL; + /* do not return page to cache, it will be returned on XDP_TX completion */ + goto wq_ll_pop; + } + /* probably an XDP_DROP, save the page-reuse checks */ + mlx5e_free_rx_wqe(rq, wi); + goto wq_ll_pop; + } + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + + if (rep->vlan && skb_vlan_tag_present(skb)) + skb_vlan_pop(skb); + + napi_gro_receive(rq->cq.napi, skb); + + mlx5e_free_rx_wqe_reuse(rq, wi); +wq_ll_pop: + mlx5_wq_ll_pop(&rq->wq, wqe_counter_be, + &wqe->next.next_wqe_index); +} +#endif + +struct sk_buff * +mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx) +{ + u16 headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt); + struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; + u32 frag_offset = head_offset + headlen; + u32 byte_cnt = cqe_bcnt - headlen; + struct mlx5e_dma_info *head_di = di; + struct sk_buff *skb; + + skb = napi_alloc_skb(rq->cq.napi, + ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, sizeof(long))); + if (unlikely(!skb)) { + rq->stats.buff_alloc_err++; + return NULL; + } + + prefetchw(skb->data); + + if (unlikely(frag_offset >= PAGE_SIZE)) { + di++; + frag_offset -= PAGE_SIZE; + } + + while (byte_cnt) { + u32 pg_consumed_bytes = + min_t(u32, PAGE_SIZE - frag_offset, byte_cnt); + + mlx5e_add_skb_frag_mpwqe(rq, skb, di, frag_offset, + pg_consumed_bytes); + byte_cnt -= pg_consumed_bytes; + frag_offset = 0; + di++; + } + /* copy header */ + mlx5e_copy_skb_header_mpwqe(rq->pdev, skb, head_di, + head_offset, headlen); + /* skb linear part was allocated with headlen and aligned to long */ + skb->tail += headlen; + skb->len += headlen; + + return skb; +} + +struct sk_buff * +mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx) +{ + struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; + u16 rx_headroom = rq->buff.headroom; + u32 cqe_bcnt32 = cqe_bcnt; + struct sk_buff *skb; + void *va, *data; + u32 frag_size; + bool consumed; + + va = page_address(di->page) + head_offset; + data = va + rx_headroom; + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32); + + dma_sync_single_range_for_cpu(rq->pdev, di->addr, head_offset, + frag_size, 
DMA_FROM_DEVICE); + prefetch(data); + + rcu_read_lock(); + consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32); + rcu_read_unlock(); + if (consumed) { + if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) + __set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */ + return NULL; /* page/packet was consumed by XDP */ + } + + skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32); + if (unlikely(!skb)) + return NULL; + + /* queue up for recycling/reuse */ + page_ref_inc(di->page); + + return skb; +} + +void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + u16 cstrides = mpwrq_get_cqe_consumed_strides(cqe); + u16 wqe_id = be16_to_cpu(cqe->wqe_id); + struct mlx5e_mpw_info *wi = &rq->mpwqe.info[wqe_id]; + u16 stride_ix = mpwrq_get_cqe_stride_index(cqe); + u32 wqe_offset = stride_ix << rq->mpwqe.log_stride_sz; + u32 head_offset = wqe_offset & (PAGE_SIZE - 1); + u32 page_idx = wqe_offset >> PAGE_SHIFT; + struct mlx5e_rx_wqe *wqe; + struct sk_buff *skb; + u16 cqe_bcnt; + + wi->consumed_strides += cstrides; + + if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) { + rq->stats.wqe_err++; + goto mpwrq_cqe_out; + } + + if (unlikely(mpwrq_is_filler_cqe(cqe))) { + rq->stats.mpwqe_filler++; + goto mpwrq_cqe_out; + } + + cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe); + + skb = rq->mpwqe.skb_from_cqe_mpwrq(rq, wi, cqe_bcnt, head_offset, + page_idx); + if (!skb) + goto mpwrq_cqe_out; + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + napi_gro_receive(rq->cq.napi, skb); + +mpwrq_cqe_out: + if (likely(wi->consumed_strides < rq->mpwqe.num_strides)) + return; + + wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id); + mlx5e_free_rx_mpwqe(rq, wi); + mlx5_wq_ll_pop(&rq->wq, cqe->wqe_id, &wqe->next.next_wqe_index); +} + +int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) +{ + struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq); + struct mlx5e_xdpsq *xdpsq; + struct mlx5_cqe64 *cqe; + int work_done = 0; + + if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_ENABLED))) + return 0; + + if (cq->decmprs_left) + work_done += mlx5e_decompress_cqes_cont(rq, cq, 0, budget); + + cqe = mlx5_cqwq_get_cqe(&cq->wq); + if (!cqe) + return 0; + + xdpsq = &rq->xdpsq; + + do { + if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) { + work_done += + mlx5e_decompress_cqes_start(rq, cq, + budget - work_done); + continue; + } + + mlx5_cqwq_pop(&cq->wq); + + rq->handle_rx_cqe(rq, cqe); + } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + + if (xdpsq->db.doorbell) { + mlx5e_xmit_xdp_doorbell(xdpsq); + xdpsq->db.doorbell = false; + } + + mlx5_cqwq_update_db_record(&cq->wq); + + /* ensure cq space is freed before enabling more cqes */ + wmb(); + + return work_done; +} + +bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) +{ + struct mlx5e_xdpsq *sq; + struct mlx5_cqe64 *cqe; + struct mlx5e_rq *rq; + u16 sqcc; + int i; + + sq = container_of(cq, struct mlx5e_xdpsq, cq); + + if (unlikely(!MLX5E_TEST_BIT(sq->state, MLX5E_SQ_STATE_ENABLED))) + return false; + + cqe = mlx5_cqwq_get_cqe(&cq->wq); + if (!cqe) + return false; + + rq = container_of(sq, struct mlx5e_rq, xdpsq); + + /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), + * otherwise a cq overrun may occur + */ + sqcc = sq->cc; + + i = 0; + do { + u16 wqe_counter; + bool last_wqe; + + mlx5_cqwq_pop(&cq->wq); + + wqe_counter = be16_to_cpu(cqe->wqe_counter); + + do { + struct mlx5e_dma_info *di; + u16 ci; + + last_wqe = (sqcc == wqe_counter); + + ci = sqcc & sq->wq.sz_m1; + di = 
&sq->db.di[ci]; + + sqcc++; + /* Recycle RX page */ + mlx5e_page_release(rq, di, true); + } while (!last_wqe); + } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + + mlx5_cqwq_update_db_record(&cq->wq); + + /* ensure cq space is freed before enabling more cqes */ + wmb(); + + sq->cc = sqcc; + return (i == MLX5E_TX_CQ_POLL_BUDGET); +} + +void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) +{ + struct mlx5e_rq *rq = container_of(sq, struct mlx5e_rq, xdpsq); + struct mlx5e_dma_info *di; + u16 ci; + + while (sq->cc != sq->pc) { + ci = sq->cc & sq->wq.sz_m1; + di = &sq->db.di[ci]; + sq->cc++; + + mlx5e_page_release(rq, di, false); + } +} + +#ifdef CONFIG_MLX5_CORE_IPOIB + +#define MLX5_IB_GRH_DGID_OFFSET 24 +#define MLX5_GID_SIZE 16 + +static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe, + u32 cqe_bcnt, + struct sk_buff *skb) +{ + struct hwtstamp_config *tstamp; + struct net_device *netdev; + struct mlx5e_priv *priv; + char *pseudo_header; + u32 qpn; + u8 *dgid; + u8 g; + + qpn = be32_to_cpu(cqe->sop_drop_qpn) & 0xffffff; + netdev = mlx5i_pkey_get_netdev(rq->netdev, qpn); + + /* No mapping present, cannot process SKB. This might happen if a child + * interface is going down while having unprocessed CQEs on parent RQ + */ + if (unlikely(!netdev)) { + /* TODO: add drop counters support */ + skb->dev = NULL; + pr_warn_once("Unable to map QPN %u to dev - dropping skb\n", qpn); + return; + } + + priv = mlx5i_epriv(netdev); + tstamp = &priv->tstamp; + + g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; + dgid = skb->data + MLX5_IB_GRH_DGID_OFFSET; + if ((!g) || dgid[0] != 0xff) + skb->pkt_type = PACKET_HOST; + else if (memcmp(dgid, netdev->broadcast + 4, MLX5_GID_SIZE) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + + /* TODO: IB/ipoib: Allow mcast packets from other VFs + * 68996a6e760e5c74654723eeb57bf65628ae87f4 + */ + + skb_pull(skb, MLX5_IB_GRH_BYTES); + + skb->protocol = *((__be16 *)(skb->data)); + + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum = csum_unfold((__force __sum16)cqe->check_sum); + + if (unlikely(mlx5e_rx_hw_stamp(tstamp))) + skb_hwtstamps(skb)->hwtstamp = + mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe)); + + skb_record_rx_queue(skb, rq->ix); + + if (likely(netdev->features & NETIF_F_RXHASH)) + mlx5e_skb_set_hash(cqe, skb); + + /* 20 bytes of ipoib header and 4 for encap existing */ + pseudo_header = skb_push(skb, MLX5_IPOIB_PSEUDO_LEN); + memset(pseudo_header, 0, MLX5_IPOIB_PSEUDO_LEN); + skb_reset_mac_header(skb); + skb_pull(skb, MLX5_IPOIB_HARD_LEN); + + skb->dev = netdev; + + rq->stats.csum_complete++; + rq->stats.packets++; + rq->stats.bytes += cqe_bcnt; +} + +void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5e_wqe_frag_info *wi; + struct mlx5e_rx_wqe *wqe; + __be16 wqe_counter_be; + struct sk_buff *skb; + u16 wqe_counter; + u32 cqe_bcnt; + + wqe_counter_be = cqe->wqe_counter; + wqe_counter = be16_to_cpu(wqe_counter_be); + wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter); + wi = &rq->wqe.frag_info[wqe_counter]; + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt); + if (!skb) + goto wq_free_wqe; + + mlx5i_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + if (unlikely(!skb->dev)) { + dev_kfree_skb_any(skb); + goto wq_free_wqe; + } + napi_gro_receive(rq->cq.napi, skb); + +wq_free_wqe: + mlx5e_free_rx_wqe_reuse(rq, wi); + mlx5_wq_ll_pop(&rq->wq, wqe_counter_be, + &wqe->next.next_wqe_index); +} + +#endif /* 
CONFIG_MLX5_CORE_IPOIB */ + +#ifdef CONFIG_MLX5_EN_IPSEC + +void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5e_wqe_frag_info *wi; + struct mlx5e_rx_wqe *wqe; + __be16 wqe_counter_be; + struct sk_buff *skb; + u16 wqe_counter; + u32 cqe_bcnt; + + wqe_counter_be = cqe->wqe_counter; + wqe_counter = be16_to_cpu(wqe_counter_be); + wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter); + wi = &rq->wqe.frag_info[wqe_counter]; + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt); + if (unlikely(!skb)) { + /* a DROP, save the page-reuse checks */ + mlx5e_free_rx_wqe(rq, wi); + goto wq_ll_pop; + } + skb = mlx5e_ipsec_handle_rx_skb(rq->netdev, skb); + if (unlikely(!skb)) { + mlx5e_free_rx_wqe(rq, wi); + goto wq_ll_pop; + } + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + napi_gro_receive(rq->cq.napi, skb); + + mlx5e_free_rx_wqe_reuse(rq, wi); +wq_ll_pop: + mlx5_wq_ll_pop(&rq->wq, wqe_counter_be, + &wqe->next.next_wqe_index); +} + +#endif /* CONFIG_MLX5_EN_IPSEC */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c new file mode 100644 index 0000000..027f54a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "en.h" + +enum { + MLX5E_ST_LINK_STATE, + MLX5E_ST_LINK_SPEED, + MLX5E_ST_HEALTH_INFO, +#ifdef CONFIG_INET + MLX5E_ST_LOOPBACK, +#endif + MLX5E_ST_NUM, +}; + +const char mlx5e_self_tests[MLX5E_ST_NUM][ETH_GSTRING_LEN] = { + "Link Test", + "Speed Test", + "Health Test", +#ifdef CONFIG_INET + "Loopback Test", +#endif +}; + +int mlx5e_self_test_num(struct mlx5e_priv *priv) +{ + return ARRAY_SIZE(mlx5e_self_tests); +} + +static int mlx5e_test_health_info(struct mlx5e_priv *priv) +{ + struct mlx5_core_health *health = &priv->mdev->priv.health; + + return health->sick ? 
1 : 0; +} + +static int mlx5e_test_link_state(struct mlx5e_priv *priv) +{ + u8 port_state; + + if (!netif_carrier_ok(priv->netdev)) + return 1; + + port_state = mlx5_query_vport_state(priv->mdev, MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0); + return port_state == VPORT_STATE_UP ? 0 : 1; +} + +static int mlx5e_test_link_speed(struct mlx5e_priv *priv) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + u32 eth_proto_oper; + int i; + + if (!netif_carrier_ok(priv->netdev)) + return 1; + + if (mlx5_query_port_ptys(priv->mdev, out, sizeof(out), MLX5_PTYS_EN, 1)) + return 1; + + eth_proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper); + for (i = 0; i < MLX5E_LINK_MODES_NUMBER; i++) { + if (eth_proto_oper & MLX5E_PROT_MASK(i)) + return 0; + } + return 1; +} + +#ifdef CONFIG_INET +/* loopback test */ +#define MLX5E_TEST_PKT_SIZE (MLX5_MPWRQ_SMALL_PACKET_THRESHOLD - NET_IP_ALIGN) +static const char mlx5e_test_text[ETH_GSTRING_LEN] = "MLX5E SELF TEST"; +#define MLX5E_TEST_MAGIC 0x5AEED15C001ULL + +struct mlx5ehdr { + __be32 version; + __be64 magic; + char text[ETH_GSTRING_LEN]; +}; + +static struct sk_buff *mlx5e_test_get_udp_skb(struct mlx5e_priv *priv) +{ + struct sk_buff *skb = NULL; + struct mlx5ehdr *mlxh; + struct ethhdr *ethh; + struct udphdr *udph; + struct iphdr *iph; + int datalen, iplen; + + datalen = MLX5E_TEST_PKT_SIZE - + (sizeof(*ethh) + sizeof(*iph) + sizeof(*udph)); + + skb = netdev_alloc_skb(priv->netdev, MLX5E_TEST_PKT_SIZE); + if (!skb) { + netdev_err(priv->netdev, "\tFailed to alloc loopback skb\n"); + return NULL; + } + + prefetchw(skb->data); + skb_reserve(skb, NET_IP_ALIGN); + + /* Reserve for ethernet and IP header */ + ethh = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + + skb_set_network_header(skb, skb->len); + iph = skb_put(skb, sizeof(struct iphdr)); + + skb_set_transport_header(skb, skb->len); + udph = skb_put(skb, sizeof(struct udphdr)); + + /* Fill ETH header */ + ether_addr_copy(ethh->h_dest, priv->netdev->dev_addr); + eth_zero_addr(ethh->h_source); + ethh->h_proto = htons(ETH_P_IP); + + /* Fill UDP header */ + udph->source = htons(9); + udph->dest = htons(9); /* Discard Protocol */ + udph->len = htons(datalen + sizeof(struct udphdr)); + udph->check = 0; + + /* Fill IP header */ + iph->ihl = 5; + iph->ttl = 32; + iph->version = 4; + iph->protocol = IPPROTO_UDP; + iplen = sizeof(struct iphdr) + sizeof(struct udphdr) + datalen; + iph->tot_len = htons(iplen); + iph->frag_off = 0; + iph->saddr = 0; + iph->daddr = 0; + iph->tos = 0; + iph->id = 0; + ip_send_check(iph); + + /* Fill test header and data */ + mlxh = skb_put(skb, sizeof(*mlxh)); + mlxh->version = 0; + mlxh->magic = cpu_to_be64(MLX5E_TEST_MAGIC); + strlcpy(mlxh->text, mlx5e_test_text, sizeof(mlxh->text)); + datalen -= sizeof(*mlxh); + skb_put_zero(skb, datalen); + + skb->csum = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + udp4_hwcsum(skb, iph->saddr, iph->daddr); + + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + skb->dev = priv->netdev; + + return skb; +} + +struct mlx5e_lbt_priv { + struct packet_type pt; + struct completion comp; + bool loopback_ok; + bool local_lb; +}; + +static int +mlx5e_test_loopback_validate(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt, + struct net_device *orig_ndev) +{ + struct mlx5e_lbt_priv *lbtp = pt->af_packet_priv; + struct mlx5ehdr *mlxh; + struct ethhdr *ethh; + struct udphdr *udph; + struct iphdr *iph; + + /* We are only going to peek, no need to clone the SKB */ + if (MLX5E_TEST_PKT_SIZE - ETH_HLEN > skb_headlen(skb)) + goto 
out; + + ethh = (struct ethhdr *)skb_mac_header(skb); + if (!ether_addr_equal(ethh->h_dest, orig_ndev->dev_addr)) + goto out; + + iph = ip_hdr(skb); + if (iph->protocol != IPPROTO_UDP) + goto out; + + /* Don't assume skb_transport_header() was set */ + udph = (struct udphdr *)((u8 *)iph + 4 * iph->ihl); + if (udph->dest != htons(9)) + goto out; + + mlxh = (struct mlx5ehdr *)((char *)udph + sizeof(*udph)); + if (mlxh->magic != cpu_to_be64(MLX5E_TEST_MAGIC)) + goto out; /* so close ! */ + + /* bingo */ + lbtp->loopback_ok = true; + complete(&lbtp->comp); +out: + kfree_skb(skb); + return 0; +} + +static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv, + struct mlx5e_lbt_priv *lbtp) +{ + int err = 0; + + /* Temporarily enable local_lb */ + err = mlx5_nic_vport_query_local_lb(priv->mdev, &lbtp->local_lb); + if (err) + return err; + + if (!lbtp->local_lb) { + err = mlx5_nic_vport_update_local_lb(priv->mdev, true); + if (err) + return err; + } + + err = mlx5e_refresh_tirs(priv, true); + if (err) + goto out; + + lbtp->loopback_ok = false; + init_completion(&lbtp->comp); + + lbtp->pt.type = htons(ETH_P_IP); + lbtp->pt.func = mlx5e_test_loopback_validate; + lbtp->pt.dev = priv->netdev; + lbtp->pt.af_packet_priv = lbtp; + dev_add_pack(&lbtp->pt); + + return 0; + +out: + if (!lbtp->local_lb) + mlx5_nic_vport_update_local_lb(priv->mdev, false); + + return err; +} + +static void mlx5e_test_loopback_cleanup(struct mlx5e_priv *priv, + struct mlx5e_lbt_priv *lbtp) +{ + if (!lbtp->local_lb) + mlx5_nic_vport_update_local_lb(priv->mdev, false); + + dev_remove_pack(&lbtp->pt); + mlx5e_refresh_tirs(priv, false); +} + +#define MLX5E_LB_VERIFY_TIMEOUT (msecs_to_jiffies(200)) +static int mlx5e_test_loopback(struct mlx5e_priv *priv) +{ + struct mlx5e_lbt_priv *lbtp; + struct sk_buff *skb = NULL; + int err; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + netdev_err(priv->netdev, + "\tCan't perform loopback test while device is down\n"); + return -ENODEV; + } + + lbtp = kzalloc(sizeof(*lbtp), GFP_KERNEL); + if (!lbtp) + return -ENOMEM; + lbtp->loopback_ok = false; + + err = mlx5e_test_loopback_setup(priv, lbtp); + if (err) + goto out; + + skb = mlx5e_test_get_udp_skb(priv); + if (!skb) { + err = -ENOMEM; + goto cleanup; + } + + skb_set_queue_mapping(skb, 0); + err = dev_queue_xmit(skb); + if (err) { + netdev_err(priv->netdev, + "\tFailed to xmit loopback packet err(%d)\n", + err); + goto cleanup; + } + + wait_for_completion_timeout(&lbtp->comp, MLX5E_LB_VERIFY_TIMEOUT); + err = !lbtp->loopback_ok; + +cleanup: + mlx5e_test_loopback_cleanup(priv, lbtp); +out: + kfree(lbtp); + return err; +} +#endif + +static int (*mlx5e_st_func[MLX5E_ST_NUM])(struct mlx5e_priv *) = { + mlx5e_test_link_state, + mlx5e_test_link_speed, + mlx5e_test_health_info, +#ifdef CONFIG_INET + mlx5e_test_loopback, +#endif +}; + +void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest, + u64 *buf) +{ + struct mlx5e_priv *priv = netdev_priv(ndev); + int i; + + memset(buf, 0, sizeof(u64) * MLX5E_ST_NUM); + + mutex_lock(&priv->state_lock); + netdev_info(ndev, "Self test begin..\n"); + + for (i = 0; i < MLX5E_ST_NUM; i++) { + netdev_info(ndev, "\t[%d] %s start..\n", + i, mlx5e_self_tests[i]); + buf[i] = mlx5e_st_func[i](priv); + netdev_info(ndev, "\t[%d] %s end: result(%lld)\n", + i, mlx5e_self_tests[i], buf[i]); + } + + mutex_unlock(&priv->state_lock); + + for (i = 0; i < MLX5E_ST_NUM; i++) { + if (buf[i]) { + etest->flags |= ETH_TEST_FL_FAILED; + break; + } + } + netdev_info(ndev, "Self test out: status flags(0x%x)\n", + 
etest->flags); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c new file mode 100644 index 0000000..b08c944 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -0,0 +1,1277 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "en.h" +#include "en_accel/ipsec.h" + +static const struct counter_desc sw_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_inner_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_inner_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_added_vlan_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_lro_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_lro_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_removed_vlan_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_none) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary_inner) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_full) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_none) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial_inner) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_dropped) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xmit_more) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) }, + 
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_recover) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_wqe_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_blks) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_pkts) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_page_reuse) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_reuse) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_full) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_empty) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_busy) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_waive) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_eq_rearm) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, link_down_events_phy) }, +}; + +#define NUM_SW_COUNTERS ARRAY_SIZE(sw_stats_desc) + +static int mlx5e_grp_sw_get_num_stats(struct mlx5e_priv *priv) +{ + return NUM_SW_COUNTERS; +} + +static int mlx5e_grp_sw_fill_strings(struct mlx5e_priv *priv, u8 *data, int idx) +{ + int i; + + for (i = 0; i < NUM_SW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, sw_stats_desc[i].format); + return idx; +} + +static int mlx5e_grp_sw_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx) +{ + int i; + + for (i = 0; i < NUM_SW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(&priv->stats.sw, sw_stats_desc, i); + return idx; +} + +static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv) +{ + struct mlx5e_sw_stats temp, *s = &temp; + struct mlx5e_rq_stats *rq_stats; + struct mlx5e_sq_stats *sq_stats; + struct mlx5e_ch_stats *ch_stats; + int i, j; + + memset(s, 0, sizeof(*s)); + for (i = 0; i < priv->channels.num; i++) { + struct mlx5e_channel *c = priv->channels.c[i]; + + rq_stats = &c->rq.stats; + ch_stats = &c->stats; + + s->rx_packets += rq_stats->packets; + s->rx_bytes += rq_stats->bytes; + s->rx_lro_packets += rq_stats->lro_packets; + s->rx_lro_bytes += rq_stats->lro_bytes; + s->rx_removed_vlan_packets += rq_stats->removed_vlan_packets; + s->rx_csum_none += rq_stats->csum_none; + s->rx_csum_complete += rq_stats->csum_complete; + s->rx_csum_unnecessary += rq_stats->csum_unnecessary; + s->rx_csum_unnecessary_inner += rq_stats->csum_unnecessary_inner; + s->rx_xdp_drop += rq_stats->xdp_drop; + s->rx_xdp_tx += rq_stats->xdp_tx; + s->rx_xdp_tx_full += rq_stats->xdp_tx_full; + s->rx_wqe_err += rq_stats->wqe_err; + s->rx_mpwqe_filler += rq_stats->mpwqe_filler; + s->rx_buff_alloc_err += rq_stats->buff_alloc_err; + s->rx_cqe_compress_blks += rq_stats->cqe_compress_blks; + s->rx_cqe_compress_pkts += rq_stats->cqe_compress_pkts; + s->rx_page_reuse += rq_stats->page_reuse; + s->rx_cache_reuse += rq_stats->cache_reuse; + s->rx_cache_full += rq_stats->cache_full; + s->rx_cache_empty += rq_stats->cache_empty; + s->rx_cache_busy += rq_stats->cache_busy; + s->rx_cache_waive += rq_stats->cache_waive; + s->ch_eq_rearm += ch_stats->eq_rearm; + + for (j = 0; j < priv->channels.params.num_tc; j++) { + sq_stats = &c->sq[j].stats; + + s->tx_packets += sq_stats->packets; + s->tx_bytes += sq_stats->bytes; + s->tx_tso_packets += sq_stats->tso_packets; + s->tx_tso_bytes += sq_stats->tso_bytes; + s->tx_tso_inner_packets += sq_stats->tso_inner_packets; + s->tx_tso_inner_bytes += sq_stats->tso_inner_bytes; + s->tx_added_vlan_packets += sq_stats->added_vlan_packets; + s->tx_queue_stopped += sq_stats->stopped; + s->tx_queue_wake += 
sq_stats->wake; + s->tx_queue_dropped += sq_stats->dropped; + s->tx_cqe_err += sq_stats->cqe_err; + s->tx_recover += sq_stats->recover; + s->tx_xmit_more += sq_stats->xmit_more; + s->tx_csum_partial_inner += sq_stats->csum_partial_inner; + s->tx_csum_none += sq_stats->csum_none; + s->tx_csum_partial += sq_stats->csum_partial; + } + } + + s->link_down_events_phy = MLX5_GET(ppcnt_reg, + priv->stats.pport.phy_counters, + counter_set.phys_layer_cntrs.link_down_events); + memcpy(&priv->stats.sw, s, sizeof(*s)); +} + +static const struct counter_desc q_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_qcounter_stats, rx_out_of_buffer) }, +}; + +static const struct counter_desc drop_rq_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_qcounter_stats, rx_if_down_packets) }, +}; + +#define NUM_Q_COUNTERS ARRAY_SIZE(q_stats_desc) +#define NUM_DROP_RQ_COUNTERS ARRAY_SIZE(drop_rq_stats_desc) + +static int mlx5e_grp_q_get_num_stats(struct mlx5e_priv *priv) +{ + int num_stats = 0; + + if (priv->q_counter) + num_stats += NUM_Q_COUNTERS; + + if (priv->drop_rq_q_counter) + num_stats += NUM_DROP_RQ_COUNTERS; + + return num_stats; +} + +static int mlx5e_grp_q_fill_strings(struct mlx5e_priv *priv, u8 *data, int idx) +{ + int i; + + for (i = 0; i < NUM_Q_COUNTERS && priv->q_counter; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + q_stats_desc[i].format); + + for (i = 0; i < NUM_DROP_RQ_COUNTERS && priv->drop_rq_q_counter; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + drop_rq_stats_desc[i].format); + + return idx; +} + +static int mlx5e_grp_q_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx) +{ + int i; + + for (i = 0; i < NUM_Q_COUNTERS && priv->q_counter; i++) + data[idx++] = MLX5E_READ_CTR32_CPU(&priv->stats.qcnt, + q_stats_desc, i); + for (i = 0; i < NUM_DROP_RQ_COUNTERS && priv->drop_rq_q_counter; i++) + data[idx++] = MLX5E_READ_CTR32_CPU(&priv->stats.qcnt, + drop_rq_stats_desc, i); + return idx; +} + +static void mlx5e_grp_q_update_stats(struct mlx5e_priv *priv) +{ + struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt; + u32 out[MLX5_ST_SZ_DW(query_q_counter_out)]; + + if (priv->q_counter && + !mlx5_core_query_q_counter(priv->mdev, priv->q_counter, 0, out, + sizeof(out))) + qcnt->rx_out_of_buffer = MLX5_GET(query_q_counter_out, + out, out_of_buffer); + if (priv->drop_rq_q_counter && + !mlx5_core_query_q_counter(priv->mdev, priv->drop_rq_q_counter, 0, + out, sizeof(out))) + qcnt->rx_if_down_packets = MLX5_GET(query_q_counter_out, out, + out_of_buffer); +} + +#define VNIC_ENV_OFF(c) MLX5_BYTE_OFF(query_vnic_env_out, c) +static const struct counter_desc vnic_env_stats_desc[] = { + { "rx_steer_missed_packets", + VNIC_ENV_OFF(vport_env.nic_receive_steering_discard) }, +}; + +#define NUM_VNIC_ENV_COUNTERS ARRAY_SIZE(vnic_env_stats_desc) + +static int mlx5e_grp_vnic_env_get_num_stats(struct mlx5e_priv *priv) +{ + return MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard) ? 
+ NUM_VNIC_ENV_COUNTERS : 0; +} + +static int mlx5e_grp_vnic_env_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + int i; + + if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard)) + return idx; + + for (i = 0; i < NUM_VNIC_ENV_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + vnic_env_stats_desc[i].format); + return idx; +} + +static int mlx5e_grp_vnic_env_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + int i; + + if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard)) + return idx; + + for (i = 0; i < NUM_VNIC_ENV_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_BE(priv->stats.vnic.query_vnic_env_out, + vnic_env_stats_desc, i); + return idx; +} + +static void mlx5e_grp_vnic_env_update_stats(struct mlx5e_priv *priv) +{ + u32 *out = (u32 *)priv->stats.vnic.query_vnic_env_out; + int outlen = MLX5_ST_SZ_BYTES(query_vnic_env_out); + u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {0}; + struct mlx5_core_dev *mdev = priv->mdev; + + if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard)) + return; + + MLX5_SET(query_vnic_env_in, in, opcode, + MLX5_CMD_OP_QUERY_VNIC_ENV); + MLX5_SET(query_vnic_env_in, in, op_mod, 0); + MLX5_SET(query_vnic_env_in, in, other_vport, 0); + mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); +} + +#define VPORT_COUNTER_OFF(c) MLX5_BYTE_OFF(query_vport_counter_out, c) +static const struct counter_desc vport_stats_desc[] = { + { "rx_vport_unicast_packets", + VPORT_COUNTER_OFF(received_eth_unicast.packets) }, + { "rx_vport_unicast_bytes", + VPORT_COUNTER_OFF(received_eth_unicast.octets) }, + { "tx_vport_unicast_packets", + VPORT_COUNTER_OFF(transmitted_eth_unicast.packets) }, + { "tx_vport_unicast_bytes", + VPORT_COUNTER_OFF(transmitted_eth_unicast.octets) }, + { "rx_vport_multicast_packets", + VPORT_COUNTER_OFF(received_eth_multicast.packets) }, + { "rx_vport_multicast_bytes", + VPORT_COUNTER_OFF(received_eth_multicast.octets) }, + { "tx_vport_multicast_packets", + VPORT_COUNTER_OFF(transmitted_eth_multicast.packets) }, + { "tx_vport_multicast_bytes", + VPORT_COUNTER_OFF(transmitted_eth_multicast.octets) }, + { "rx_vport_broadcast_packets", + VPORT_COUNTER_OFF(received_eth_broadcast.packets) }, + { "rx_vport_broadcast_bytes", + VPORT_COUNTER_OFF(received_eth_broadcast.octets) }, + { "tx_vport_broadcast_packets", + VPORT_COUNTER_OFF(transmitted_eth_broadcast.packets) }, + { "tx_vport_broadcast_bytes", + VPORT_COUNTER_OFF(transmitted_eth_broadcast.octets) }, + { "rx_vport_rdma_unicast_packets", + VPORT_COUNTER_OFF(received_ib_unicast.packets) }, + { "rx_vport_rdma_unicast_bytes", + VPORT_COUNTER_OFF(received_ib_unicast.octets) }, + { "tx_vport_rdma_unicast_packets", + VPORT_COUNTER_OFF(transmitted_ib_unicast.packets) }, + { "tx_vport_rdma_unicast_bytes", + VPORT_COUNTER_OFF(transmitted_ib_unicast.octets) }, + { "rx_vport_rdma_multicast_packets", + VPORT_COUNTER_OFF(received_ib_multicast.packets) }, + { "rx_vport_rdma_multicast_bytes", + VPORT_COUNTER_OFF(received_ib_multicast.octets) }, + { "tx_vport_rdma_multicast_packets", + VPORT_COUNTER_OFF(transmitted_ib_multicast.packets) }, + { "tx_vport_rdma_multicast_bytes", + VPORT_COUNTER_OFF(transmitted_ib_multicast.octets) }, +}; + +#define NUM_VPORT_COUNTERS ARRAY_SIZE(vport_stats_desc) + +static int mlx5e_grp_vport_get_num_stats(struct mlx5e_priv *priv) +{ + return NUM_VPORT_COUNTERS; +} + +static int mlx5e_grp_vport_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + int i; + + for (i = 0; i < NUM_VPORT_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, 
vport_stats_desc[i].format); + return idx; +} + +static int mlx5e_grp_vport_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + int i; + + for (i = 0; i < NUM_VPORT_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_BE(priv->stats.vport.query_vport_out, + vport_stats_desc, i); + return idx; +} + +static void mlx5e_grp_vport_update_stats(struct mlx5e_priv *priv) +{ + int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); + u32 *out = (u32 *)priv->stats.vport.query_vport_out; + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {0}; + struct mlx5_core_dev *mdev = priv->mdev; + + MLX5_SET(query_vport_counter_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_COUNTER); + MLX5_SET(query_vport_counter_in, in, op_mod, 0); + MLX5_SET(query_vport_counter_in, in, other_vport, 0); + mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); +} + +#define PPORT_802_3_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_802_3_cntrs_grp_data_layout.c##_high) +static const struct counter_desc pport_802_3_stats_desc[] = { + { "tx_packets_phy", PPORT_802_3_OFF(a_frames_transmitted_ok) }, + { "rx_packets_phy", PPORT_802_3_OFF(a_frames_received_ok) }, + { "rx_crc_errors_phy", PPORT_802_3_OFF(a_frame_check_sequence_errors) }, + { "tx_bytes_phy", PPORT_802_3_OFF(a_octets_transmitted_ok) }, + { "rx_bytes_phy", PPORT_802_3_OFF(a_octets_received_ok) }, + { "tx_multicast_phy", PPORT_802_3_OFF(a_multicast_frames_xmitted_ok) }, + { "tx_broadcast_phy", PPORT_802_3_OFF(a_broadcast_frames_xmitted_ok) }, + { "rx_multicast_phy", PPORT_802_3_OFF(a_multicast_frames_received_ok) }, + { "rx_broadcast_phy", PPORT_802_3_OFF(a_broadcast_frames_received_ok) }, + { "rx_in_range_len_errors_phy", PPORT_802_3_OFF(a_in_range_length_errors) }, + { "rx_out_of_range_len_phy", PPORT_802_3_OFF(a_out_of_range_length_field) }, + { "rx_oversize_pkts_phy", PPORT_802_3_OFF(a_frame_too_long_errors) }, + { "rx_symbol_err_phy", PPORT_802_3_OFF(a_symbol_error_during_carrier) }, + { "tx_mac_control_phy", PPORT_802_3_OFF(a_mac_control_frames_transmitted) }, + { "rx_mac_control_phy", PPORT_802_3_OFF(a_mac_control_frames_received) }, + { "rx_unsupported_op_phy", PPORT_802_3_OFF(a_unsupported_opcodes_received) }, + { "rx_pause_ctrl_phy", PPORT_802_3_OFF(a_pause_mac_ctrl_frames_received) }, + { "tx_pause_ctrl_phy", PPORT_802_3_OFF(a_pause_mac_ctrl_frames_transmitted) }, +}; + +#define NUM_PPORT_802_3_COUNTERS ARRAY_SIZE(pport_802_3_stats_desc) + +static int mlx5e_grp_802_3_get_num_stats(struct mlx5e_priv *priv) +{ + return NUM_PPORT_802_3_COUNTERS; +} + +static int mlx5e_grp_802_3_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + int i; + + for (i = 0; i < NUM_PPORT_802_3_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, pport_802_3_stats_desc[i].format); + return idx; +} + +static int mlx5e_grp_802_3_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + int i; + + for (i = 0; i < NUM_PPORT_802_3_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.IEEE_802_3_counters, + pport_802_3_stats_desc, i); + return idx; +} + +static void mlx5e_grp_802_3_update_stats(struct mlx5e_priv *priv) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + out = pstats->IEEE_802_3_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); +} + +#define 
PPORT_2863_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_2863_cntrs_grp_data_layout.c##_high) +static const struct counter_desc pport_2863_stats_desc[] = { + { "rx_discards_phy", PPORT_2863_OFF(if_in_discards) }, + { "tx_discards_phy", PPORT_2863_OFF(if_out_discards) }, + { "tx_errors_phy", PPORT_2863_OFF(if_out_errors) }, +}; + +#define NUM_PPORT_2863_COUNTERS ARRAY_SIZE(pport_2863_stats_desc) + +static int mlx5e_grp_2863_get_num_stats(struct mlx5e_priv *priv) +{ + return NUM_PPORT_2863_COUNTERS; +} + +static int mlx5e_grp_2863_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + int i; + + for (i = 0; i < NUM_PPORT_2863_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, pport_2863_stats_desc[i].format); + return idx; +} + +static int mlx5e_grp_2863_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + int i; + + for (i = 0; i < NUM_PPORT_2863_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.RFC_2863_counters, + pport_2863_stats_desc, i); + return idx; +} + +static void mlx5e_grp_2863_update_stats(struct mlx5e_priv *priv) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + out = pstats->RFC_2863_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); +} + +#define PPORT_2819_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_2819_cntrs_grp_data_layout.c##_high) +static const struct counter_desc pport_2819_stats_desc[] = { + { "rx_undersize_pkts_phy", PPORT_2819_OFF(ether_stats_undersize_pkts) }, + { "rx_fragments_phy", PPORT_2819_OFF(ether_stats_fragments) }, + { "rx_jabbers_phy", PPORT_2819_OFF(ether_stats_jabbers) }, + { "rx_64_bytes_phy", PPORT_2819_OFF(ether_stats_pkts64octets) }, + { "rx_65_to_127_bytes_phy", PPORT_2819_OFF(ether_stats_pkts65to127octets) }, + { "rx_128_to_255_bytes_phy", PPORT_2819_OFF(ether_stats_pkts128to255octets) }, + { "rx_256_to_511_bytes_phy", PPORT_2819_OFF(ether_stats_pkts256to511octets) }, + { "rx_512_to_1023_bytes_phy", PPORT_2819_OFF(ether_stats_pkts512to1023octets) }, + { "rx_1024_to_1518_bytes_phy", PPORT_2819_OFF(ether_stats_pkts1024to1518octets) }, + { "rx_1519_to_2047_bytes_phy", PPORT_2819_OFF(ether_stats_pkts1519to2047octets) }, + { "rx_2048_to_4095_bytes_phy", PPORT_2819_OFF(ether_stats_pkts2048to4095octets) }, + { "rx_4096_to_8191_bytes_phy", PPORT_2819_OFF(ether_stats_pkts4096to8191octets) }, + { "rx_8192_to_10239_bytes_phy", PPORT_2819_OFF(ether_stats_pkts8192to10239octets) }, +}; + +#define NUM_PPORT_2819_COUNTERS ARRAY_SIZE(pport_2819_stats_desc) + +static int mlx5e_grp_2819_get_num_stats(struct mlx5e_priv *priv) +{ + return NUM_PPORT_2819_COUNTERS; +} + +static int mlx5e_grp_2819_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + int i; + + for (i = 0; i < NUM_PPORT_2819_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, pport_2819_stats_desc[i].format); + return idx; +} + +static int mlx5e_grp_2819_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + int i; + + for (i = 0; i < NUM_PPORT_2819_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.RFC_2819_counters, + pport_2819_stats_desc, i); + return idx; +} + +static void mlx5e_grp_2819_update_stats(struct mlx5e_priv *priv) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev 
= priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + out = pstats->RFC_2819_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); +} + +#define PPORT_PHY_STATISTICAL_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.phys_layer_statistical_cntrs.c##_high) +static const struct counter_desc pport_phy_statistical_stats_desc[] = { + { "rx_pcs_symbol_err_phy", PPORT_PHY_STATISTICAL_OFF(phy_symbol_errors) }, + { "rx_corrected_bits_phy", PPORT_PHY_STATISTICAL_OFF(phy_corrected_bits) }, +}; + +#define NUM_PPORT_PHY_COUNTERS ARRAY_SIZE(pport_phy_statistical_stats_desc) + +static int mlx5e_grp_phy_get_num_stats(struct mlx5e_priv *priv) +{ + return MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group) ? + NUM_PPORT_PHY_COUNTERS : 0; +} + +static int mlx5e_grp_phy_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + int i; + + if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group)) + for (i = 0; i < NUM_PPORT_PHY_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pport_phy_statistical_stats_desc[i].format); + return idx; +} + +static int mlx5e_grp_phy_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx) +{ + int i; + + if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group)) + for (i = 0; i < NUM_PPORT_PHY_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.phy_statistical_counters, + pport_phy_statistical_stats_desc, i); + return idx; +} + +static void mlx5e_grp_phy_update_stats(struct mlx5e_priv *priv) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + out = pstats->phy_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + + if (!MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group)) + return; + + out = pstats->phy_statistical_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); +} + +#define PPORT_ETH_EXT_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_extended_cntrs_grp_data_layout.c##_high) +static const struct counter_desc pport_eth_ext_stats_desc[] = { + { "rx_buffer_passed_thres_phy", PPORT_ETH_EXT_OFF(rx_buffer_almost_full) }, +}; + +#define NUM_PPORT_ETH_EXT_COUNTERS ARRAY_SIZE(pport_eth_ext_stats_desc) + +static int mlx5e_grp_eth_ext_get_num_stats(struct mlx5e_priv *priv) +{ + if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, rx_buffer_fullness_counters)) + return NUM_PPORT_ETH_EXT_COUNTERS; + + return 0; +} + +static int mlx5e_grp_eth_ext_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + int i; + + if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, rx_buffer_fullness_counters)) + for (i = 0; i < NUM_PPORT_ETH_EXT_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pport_eth_ext_stats_desc[i].format); + return idx; +} + +static int mlx5e_grp_eth_ext_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + int i; + + if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, rx_buffer_fullness_counters)) + for (i = 0; i < NUM_PPORT_ETH_EXT_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.eth_ext_counters, + pport_eth_ext_stats_desc, i); + 
return idx; +} + +static void mlx5e_grp_eth_ext_update_stats(struct mlx5e_priv *priv) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + + if (!MLX5_CAP_PCAM_FEATURE(mdev, rx_buffer_fullness_counters)) + return; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + out = pstats->eth_ext_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); +} + +#define PCIE_PERF_OFF(c) \ + MLX5_BYTE_OFF(mpcnt_reg, counter_set.pcie_perf_cntrs_grp_data_layout.c) +static const struct counter_desc pcie_perf_stats_desc[] = { + { "rx_pci_signal_integrity", PCIE_PERF_OFF(rx_errors) }, + { "tx_pci_signal_integrity", PCIE_PERF_OFF(tx_errors) }, +}; + +#define PCIE_PERF_OFF64(c) \ + MLX5_BYTE_OFF(mpcnt_reg, counter_set.pcie_perf_cntrs_grp_data_layout.c##_high) +static const struct counter_desc pcie_perf_stats_desc64[] = { + { "outbound_pci_buffer_overflow", PCIE_PERF_OFF64(tx_overflow_buffer_pkt) }, +}; + +static const struct counter_desc pcie_perf_stall_stats_desc[] = { + { "outbound_pci_stalled_rd", PCIE_PERF_OFF(outbound_stalled_reads) }, + { "outbound_pci_stalled_wr", PCIE_PERF_OFF(outbound_stalled_writes) }, + { "outbound_pci_stalled_rd_events", PCIE_PERF_OFF(outbound_stalled_reads_events) }, + { "outbound_pci_stalled_wr_events", PCIE_PERF_OFF(outbound_stalled_writes_events) }, +}; + +#define NUM_PCIE_PERF_COUNTERS ARRAY_SIZE(pcie_perf_stats_desc) +#define NUM_PCIE_PERF_COUNTERS64 ARRAY_SIZE(pcie_perf_stats_desc64) +#define NUM_PCIE_PERF_STALL_COUNTERS ARRAY_SIZE(pcie_perf_stall_stats_desc) + +static int mlx5e_grp_pcie_get_num_stats(struct mlx5e_priv *priv) +{ + int num_stats = 0; + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_performance_group)) + num_stats += NUM_PCIE_PERF_COUNTERS; + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, tx_overflow_buffer_pkt)) + num_stats += NUM_PCIE_PERF_COUNTERS64; + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_outbound_stalled)) + num_stats += NUM_PCIE_PERF_STALL_COUNTERS; + + return num_stats; +} + +static int mlx5e_grp_pcie_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + int i; + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_performance_group)) + for (i = 0; i < NUM_PCIE_PERF_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pcie_perf_stats_desc[i].format); + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, tx_overflow_buffer_pkt)) + for (i = 0; i < NUM_PCIE_PERF_COUNTERS64; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pcie_perf_stats_desc64[i].format); + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_outbound_stalled)) + for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pcie_perf_stall_stats_desc[i].format); + return idx; +} + +static int mlx5e_grp_pcie_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + int i; + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_performance_group)) + for (i = 0; i < NUM_PCIE_PERF_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters, + pcie_perf_stats_desc, i); + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, tx_overflow_buffer_pkt)) + for (i = 0; i < NUM_PCIE_PERF_COUNTERS64; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pcie.pcie_perf_counters, + pcie_perf_stats_desc64, i); + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_outbound_stalled)) + for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS; 
i++) + data[idx++] = + MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters, + pcie_perf_stall_stats_desc, i); + return idx; +} + +static void mlx5e_grp_pcie_update_stats(struct mlx5e_priv *priv) +{ + struct mlx5e_pcie_stats *pcie_stats = &priv->stats.pcie; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(mpcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(mpcnt_reg); + void *out; + + if (!MLX5_CAP_MCAM_FEATURE(mdev, pcie_performance_group)) + return; + + out = pcie_stats->pcie_perf_counters; + MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); +} + +#define PPORT_PER_PRIO_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_per_prio_grp_data_layout.c##_high) +static const struct counter_desc pport_per_prio_traffic_stats_desc[] = { + { "rx_prio%d_bytes", PPORT_PER_PRIO_OFF(rx_octets) }, + { "rx_prio%d_packets", PPORT_PER_PRIO_OFF(rx_frames) }, + { "tx_prio%d_bytes", PPORT_PER_PRIO_OFF(tx_octets) }, + { "tx_prio%d_packets", PPORT_PER_PRIO_OFF(tx_frames) }, +}; + +#define NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS ARRAY_SIZE(pport_per_prio_traffic_stats_desc) + +static int mlx5e_grp_per_prio_traffic_get_num_stats(struct mlx5e_priv *priv) +{ + return NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS * NUM_PPORT_PRIO; +} + +static int mlx5e_grp_per_prio_traffic_fill_strings(struct mlx5e_priv *priv, + u8 *data, + int idx) +{ + int i, prio; + + for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { + for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + pport_per_prio_traffic_stats_desc[i].format, prio); + } + + return idx; +} + +static int mlx5e_grp_per_prio_traffic_fill_stats(struct mlx5e_priv *priv, + u64 *data, + int idx) +{ + int i, prio; + + for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { + for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[prio], + pport_per_prio_traffic_stats_desc, i); + } + + return idx; +} + +static const struct counter_desc pport_per_prio_pfc_stats_desc[] = { + /* %s is "global" or "prio{i}" */ + { "rx_%s_pause", PPORT_PER_PRIO_OFF(rx_pause) }, + { "rx_%s_pause_duration", PPORT_PER_PRIO_OFF(rx_pause_duration) }, + { "tx_%s_pause", PPORT_PER_PRIO_OFF(tx_pause) }, + { "tx_%s_pause_duration", PPORT_PER_PRIO_OFF(tx_pause_duration) }, + { "rx_%s_pause_transition", PPORT_PER_PRIO_OFF(rx_pause_transition) }, +}; + +static const struct counter_desc pport_pfc_stall_stats_desc[] = { + { "tx_pause_storm_warning_events ", PPORT_PER_PRIO_OFF(device_stall_minor_watermark_cnt) }, + { "tx_pause_storm_error_events", PPORT_PER_PRIO_OFF(device_stall_critical_watermark_cnt) }, +}; + +#define NUM_PPORT_PER_PRIO_PFC_COUNTERS ARRAY_SIZE(pport_per_prio_pfc_stats_desc) +#define NUM_PPORT_PFC_STALL_COUNTERS(priv) (ARRAY_SIZE(pport_pfc_stall_stats_desc) * \ + MLX5_CAP_PCAM_FEATURE((priv)->mdev, pfcc_mask) * \ + MLX5_CAP_DEBUG((priv)->mdev, stall_detect)) + +static unsigned long mlx5e_query_pfc_combined(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u8 pfc_en_tx; + u8 pfc_en_rx; + int err; + + if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return 0; + + err = mlx5_query_port_pfc(mdev, &pfc_en_tx, &pfc_en_rx); + + return err ? 
0 : pfc_en_tx | pfc_en_rx; +} + +static bool mlx5e_query_global_pause_combined(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 rx_pause; + u32 tx_pause; + int err; + + if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return false; + + err = mlx5_query_port_pause(mdev, &rx_pause, &tx_pause); + + return err ? false : rx_pause | tx_pause; +} + +static int mlx5e_grp_per_prio_pfc_get_num_stats(struct mlx5e_priv *priv) +{ + return (mlx5e_query_global_pause_combined(priv) + + hweight8(mlx5e_query_pfc_combined(priv))) * + NUM_PPORT_PER_PRIO_PFC_COUNTERS + + NUM_PPORT_PFC_STALL_COUNTERS(priv); +} + +static int mlx5e_grp_per_prio_pfc_fill_strings(struct mlx5e_priv *priv, + u8 *data, + int idx) +{ + unsigned long pfc_combined; + int i, prio; + + pfc_combined = mlx5e_query_pfc_combined(priv); + for_each_set_bit(prio, &pfc_combined, NUM_PPORT_PRIO) { + for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) { + char pfc_string[ETH_GSTRING_LEN]; + + snprintf(pfc_string, sizeof(pfc_string), "prio%d", prio); + sprintf(data + (idx++) * ETH_GSTRING_LEN, + pport_per_prio_pfc_stats_desc[i].format, pfc_string); + } + } + + if (mlx5e_query_global_pause_combined(priv)) { + for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) { + sprintf(data + (idx++) * ETH_GSTRING_LEN, + pport_per_prio_pfc_stats_desc[i].format, "global"); + } + } + + for (i = 0; i < NUM_PPORT_PFC_STALL_COUNTERS(priv); i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pport_pfc_stall_stats_desc[i].format); + + return idx; +} + +static int mlx5e_grp_per_prio_pfc_fill_stats(struct mlx5e_priv *priv, + u64 *data, + int idx) +{ + unsigned long pfc_combined; + int i, prio; + + pfc_combined = mlx5e_query_pfc_combined(priv); + for_each_set_bit(prio, &pfc_combined, NUM_PPORT_PRIO) { + for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) { + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[prio], + pport_per_prio_pfc_stats_desc, i); + } + } + + if (mlx5e_query_global_pause_combined(priv)) { + for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) { + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[0], + pport_per_prio_pfc_stats_desc, i); + } + } + + for (i = 0; i < NUM_PPORT_PFC_STALL_COUNTERS(priv); i++) + data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[0], + pport_pfc_stall_stats_desc, i); + + return idx; +} + +static int mlx5e_grp_per_prio_get_num_stats(struct mlx5e_priv *priv) +{ + return mlx5e_grp_per_prio_traffic_get_num_stats(priv) + + mlx5e_grp_per_prio_pfc_get_num_stats(priv); +} + +static int mlx5e_grp_per_prio_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + idx = mlx5e_grp_per_prio_traffic_fill_strings(priv, data, idx); + idx = mlx5e_grp_per_prio_pfc_fill_strings(priv, data, idx); + return idx; +} + +static int mlx5e_grp_per_prio_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + idx = mlx5e_grp_per_prio_traffic_fill_stats(priv, data, idx); + idx = mlx5e_grp_per_prio_pfc_fill_stats(priv, data, idx); + return idx; +} + +static void mlx5e_grp_per_prio_update_stats(struct mlx5e_priv *priv) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + int prio; + void *out; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP); + for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { + out = pstats->per_prio_counters[prio]; + 
MLX5_SET(ppcnt_reg, in, prio_tc, prio); + mlx5_core_access_reg(mdev, in, sz, out, sz, + MLX5_REG_PPCNT, 0, 0); + } +} + +static const struct counter_desc mlx5e_pme_status_desc[] = { + { "module_unplug", 8 }, +}; + +static const struct counter_desc mlx5e_pme_error_desc[] = { + { "module_bus_stuck", 16 }, /* bus stuck (I2C or data shorted) */ + { "module_high_temp", 48 }, /* high temperature */ + { "module_bad_shorted", 56 }, /* bad or shorted cable/module */ +}; + +#define NUM_PME_STATUS_STATS ARRAY_SIZE(mlx5e_pme_status_desc) +#define NUM_PME_ERR_STATS ARRAY_SIZE(mlx5e_pme_error_desc) + +static int mlx5e_grp_pme_get_num_stats(struct mlx5e_priv *priv) +{ + return NUM_PME_STATUS_STATS + NUM_PME_ERR_STATS; +} + +static int mlx5e_grp_pme_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + int i; + + for (i = 0; i < NUM_PME_STATUS_STATS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, mlx5e_pme_status_desc[i].format); + + for (i = 0; i < NUM_PME_ERR_STATS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, mlx5e_pme_error_desc[i].format); + + return idx; +} + +static int mlx5e_grp_pme_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + struct mlx5_priv *mlx5_priv = &priv->mdev->priv; + int i; + + for (i = 0; i < NUM_PME_STATUS_STATS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(mlx5_priv->pme_stats.status_counters, + mlx5e_pme_status_desc, i); + + for (i = 0; i < NUM_PME_ERR_STATS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(mlx5_priv->pme_stats.error_counters, + mlx5e_pme_error_desc, i); + + return idx; +} + +static int mlx5e_grp_ipsec_get_num_stats(struct mlx5e_priv *priv) +{ + return mlx5e_ipsec_get_count(priv); +} + +static int mlx5e_grp_ipsec_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + return idx + mlx5e_ipsec_get_strings(priv, + data + idx * ETH_GSTRING_LEN); +} + +static int mlx5e_grp_ipsec_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + return idx + mlx5e_ipsec_get_stats(priv, data + idx); +} + +static void mlx5e_grp_ipsec_update_stats(struct mlx5e_priv *priv) +{ + mlx5e_ipsec_update_stats(priv); +} + +static const struct counter_desc rq_stats_desc[] = { + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, packets) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, bytes) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_complete) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_none) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_drop) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_tx) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_tx_full) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, lro_packets) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, lro_bytes) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, removed_vlan_packets) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, wqe_err) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, mpwqe_filler) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, buff_alloc_err) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_blks) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, page_reuse) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_reuse) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_full) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_empty) }, + { 
MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_busy) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_waive) }, +}; + +static const struct counter_desc sq_stats_desc[] = { + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, packets) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_packets) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_inner_packets) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_inner_bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial_inner) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, added_vlan_packets) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, nop) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_none) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, stopped) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, dropped) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, xmit_more) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, recover) }, +}; + +static const struct counter_desc ch_stats_desc[] = { + { MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, eq_rearm) }, +}; + +#define NUM_RQ_STATS ARRAY_SIZE(rq_stats_desc) +#define NUM_SQ_STATS ARRAY_SIZE(sq_stats_desc) +#define NUM_CH_STATS ARRAY_SIZE(ch_stats_desc) + +static int mlx5e_grp_channels_get_num_stats(struct mlx5e_priv *priv) +{ + return (NUM_RQ_STATS * priv->channels.num) + + (NUM_CH_STATS * priv->channels.num) + + (NUM_SQ_STATS * priv->channels.num * priv->channels.params.num_tc); +} + +static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data, + int idx) +{ + int i, j, tc; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + return idx; + + for (i = 0; i < priv->channels.num; i++) + for (j = 0; j < NUM_CH_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + ch_stats_desc[j].format, i); + + for (i = 0; i < priv->channels.num; i++) + for (j = 0; j < NUM_RQ_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, rq_stats_desc[j].format, i); + + for (tc = 0; tc < priv->channels.params.num_tc; tc++) + for (i = 0; i < priv->channels.num; i++) + for (j = 0; j < NUM_SQ_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + sq_stats_desc[j].format, + priv->channel_tc2txq[i][tc]); + + return idx; +} + +static int mlx5e_grp_channels_fill_stats(struct mlx5e_priv *priv, u64 *data, + int idx) +{ + struct mlx5e_channels *channels = &priv->channels; + int i, j, tc; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + return idx; + + for (i = 0; i < channels->num; i++) + for (j = 0; j < NUM_CH_STATS; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&channels->c[i]->stats, + ch_stats_desc, j); + + for (i = 0; i < channels->num; i++) + for (j = 0; j < NUM_RQ_STATS; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&channels->c[i]->rq.stats, + rq_stats_desc, j); + + for (tc = 0; tc < priv->channels.params.num_tc; tc++) + for (i = 0; i < channels->num; i++) + for (j = 0; j < NUM_SQ_STATS; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&channels->c[i]->sq[tc].stats, + sq_stats_desc, j); + + return idx; +} + +/* The stats groups order is opposite to the update_stats() order calls */ +const struct mlx5e_stats_grp mlx5e_stats_grps[] = { + { + .get_num_stats = mlx5e_grp_sw_get_num_stats, + .fill_strings = mlx5e_grp_sw_fill_strings, + 
.fill_stats = mlx5e_grp_sw_fill_stats, + .update_stats_mask = MLX5E_NDO_UPDATE_STATS, + .update_stats = mlx5e_grp_sw_update_stats, + }, + { + .get_num_stats = mlx5e_grp_q_get_num_stats, + .fill_strings = mlx5e_grp_q_fill_strings, + .fill_stats = mlx5e_grp_q_fill_stats, + .update_stats_mask = MLX5E_NDO_UPDATE_STATS, + .update_stats = mlx5e_grp_q_update_stats, + }, + { + .get_num_stats = mlx5e_grp_vnic_env_get_num_stats, + .fill_strings = mlx5e_grp_vnic_env_fill_strings, + .fill_stats = mlx5e_grp_vnic_env_fill_stats, + .update_stats = mlx5e_grp_vnic_env_update_stats, + }, + { + .get_num_stats = mlx5e_grp_vport_get_num_stats, + .fill_strings = mlx5e_grp_vport_fill_strings, + .fill_stats = mlx5e_grp_vport_fill_stats, + .update_stats_mask = MLX5E_NDO_UPDATE_STATS, + .update_stats = mlx5e_grp_vport_update_stats, + }, + { + .get_num_stats = mlx5e_grp_802_3_get_num_stats, + .fill_strings = mlx5e_grp_802_3_fill_strings, + .fill_stats = mlx5e_grp_802_3_fill_stats, + .update_stats_mask = MLX5E_NDO_UPDATE_STATS, + .update_stats = mlx5e_grp_802_3_update_stats, + }, + { + .get_num_stats = mlx5e_grp_2863_get_num_stats, + .fill_strings = mlx5e_grp_2863_fill_strings, + .fill_stats = mlx5e_grp_2863_fill_stats, + .update_stats = mlx5e_grp_2863_update_stats, + }, + { + .get_num_stats = mlx5e_grp_2819_get_num_stats, + .fill_strings = mlx5e_grp_2819_fill_strings, + .fill_stats = mlx5e_grp_2819_fill_stats, + .update_stats = mlx5e_grp_2819_update_stats, + }, + { + .get_num_stats = mlx5e_grp_phy_get_num_stats, + .fill_strings = mlx5e_grp_phy_fill_strings, + .fill_stats = mlx5e_grp_phy_fill_stats, + .update_stats = mlx5e_grp_phy_update_stats, + }, + { + .get_num_stats = mlx5e_grp_eth_ext_get_num_stats, + .fill_strings = mlx5e_grp_eth_ext_fill_strings, + .fill_stats = mlx5e_grp_eth_ext_fill_stats, + .update_stats = mlx5e_grp_eth_ext_update_stats, + }, + { + .get_num_stats = mlx5e_grp_pcie_get_num_stats, + .fill_strings = mlx5e_grp_pcie_fill_strings, + .fill_stats = mlx5e_grp_pcie_fill_stats, + .update_stats = mlx5e_grp_pcie_update_stats, + }, + { + .get_num_stats = mlx5e_grp_per_prio_get_num_stats, + .fill_strings = mlx5e_grp_per_prio_fill_strings, + .fill_stats = mlx5e_grp_per_prio_fill_stats, + .update_stats = mlx5e_grp_per_prio_update_stats, + }, + { + .get_num_stats = mlx5e_grp_pme_get_num_stats, + .fill_strings = mlx5e_grp_pme_fill_strings, + .fill_stats = mlx5e_grp_pme_fill_stats, + }, + { + .get_num_stats = mlx5e_grp_ipsec_get_num_stats, + .fill_strings = mlx5e_grp_ipsec_fill_strings, + .fill_stats = mlx5e_grp_ipsec_fill_stats, + .update_stats = mlx5e_grp_ipsec_update_stats, + }, + { + .get_num_stats = mlx5e_grp_channels_get_num_stats, + .fill_strings = mlx5e_grp_channels_fill_strings, + .fill_stats = mlx5e_grp_channels_fill_stats, + } +}; + +const int mlx5e_num_stats_grps = ARRAY_SIZE(mlx5e_stats_grps); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h new file mode 100644 index 0000000..53111a2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __MLX5_EN_STATS_H__ +#define __MLX5_EN_STATS_H__ + +#define MLX5E_READ_CTR64_CPU(ptr, dsc, i) \ + (*(u64 *)((char *)ptr + dsc[i].offset)) +#define MLX5E_READ_CTR64_BE(ptr, dsc, i) \ + be64_to_cpu(*(__be64 *)((char *)ptr + dsc[i].offset)) +#define MLX5E_READ_CTR32_CPU(ptr, dsc, i) \ + (*(u32 *)((char *)ptr + dsc[i].offset)) +#define MLX5E_READ_CTR32_BE(ptr, dsc, i) \ + be32_to_cpu(*(__be32 *)((char *)ptr + dsc[i].offset)) + +#define MLX5E_DECLARE_STAT(type, fld) #fld, offsetof(type, fld) +#define MLX5E_DECLARE_RX_STAT(type, fld) "rx%d_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_TX_STAT(type, fld) "tx%d_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_CH_STAT(type, fld) "ch%d_"#fld, offsetof(type, fld) + +struct counter_desc { + char format[ETH_GSTRING_LEN]; + size_t offset; /* Byte offset */ +}; + +struct mlx5e_sw_stats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + u64 tx_tso_packets; + u64 tx_tso_bytes; + u64 tx_tso_inner_packets; + u64 tx_tso_inner_bytes; + u64 tx_added_vlan_packets; + u64 rx_lro_packets; + u64 rx_lro_bytes; + u64 rx_removed_vlan_packets; + u64 rx_csum_unnecessary; + u64 rx_csum_none; + u64 rx_csum_complete; + u64 rx_csum_unnecessary_inner; + u64 rx_xdp_drop; + u64 rx_xdp_tx; + u64 rx_xdp_tx_full; + u64 tx_csum_none; + u64 tx_csum_partial; + u64 tx_csum_partial_inner; + u64 tx_queue_stopped; + u64 tx_queue_wake; + u64 tx_queue_dropped; + u64 tx_xmit_more; + u64 tx_cqe_err; + u64 tx_recover; + u64 rx_wqe_err; + u64 rx_mpwqe_filler; + u64 rx_buff_alloc_err; + u64 rx_cqe_compress_blks; + u64 rx_cqe_compress_pkts; + u64 rx_page_reuse; + u64 rx_cache_reuse; + u64 rx_cache_full; + u64 rx_cache_empty; + u64 rx_cache_busy; + u64 rx_cache_waive; + u64 ch_eq_rearm; + + /* Special handling counters */ + u64 link_down_events_phy; +}; + +struct mlx5e_qcounter_stats { + u32 rx_out_of_buffer; + u32 rx_if_down_packets; +}; + +struct mlx5e_vnic_env_stats { + __be64 query_vnic_env_out[MLX5_ST_SZ_QW(query_vnic_env_out)]; +}; + +#define VPORT_COUNTER_GET(vstats, c) MLX5_GET64(query_vport_counter_out, \ + vstats->query_vport_out, c) + +struct mlx5e_vport_stats { + __be64 query_vport_out[MLX5_ST_SZ_QW(query_vport_counter_out)]; +}; + +#define PPORT_802_3_GET(pstats, c) \ + MLX5_GET64(ppcnt_reg, pstats->IEEE_802_3_counters, \ + 
counter_set.eth_802_3_cntrs_grp_data_layout.c##_high) +#define PPORT_2863_GET(pstats, c) \ + MLX5_GET64(ppcnt_reg, pstats->RFC_2863_counters, \ + counter_set.eth_2863_cntrs_grp_data_layout.c##_high) +#define PPORT_2819_GET(pstats, c) \ + MLX5_GET64(ppcnt_reg, pstats->RFC_2819_counters, \ + counter_set.eth_2819_cntrs_grp_data_layout.c##_high) +#define PPORT_PHY_STATISTICAL_GET(pstats, c) \ + MLX5_GET64(ppcnt_reg, (pstats)->phy_statistical_counters, \ + counter_set.phys_layer_statistical_cntrs.c##_high) +#define PPORT_PER_PRIO_GET(pstats, prio, c) \ + MLX5_GET64(ppcnt_reg, pstats->per_prio_counters[prio], \ + counter_set.eth_per_prio_grp_data_layout.c##_high) +#define NUM_PPORT_PRIO 8 +#define PPORT_ETH_EXT_GET(pstats, c) \ + MLX5_GET64(ppcnt_reg, (pstats)->eth_ext_counters, \ + counter_set.eth_extended_cntrs_grp_data_layout.c##_high) + +struct mlx5e_pport_stats { + __be64 IEEE_802_3_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 RFC_2863_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 RFC_2819_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 per_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 phy_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 phy_statistical_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 eth_ext_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; +}; + +#define PCIE_PERF_GET(pcie_stats, c) \ + MLX5_GET(mpcnt_reg, (pcie_stats)->pcie_perf_counters, \ + counter_set.pcie_perf_cntrs_grp_data_layout.c) + +#define PCIE_PERF_GET64(pcie_stats, c) \ + MLX5_GET64(mpcnt_reg, (pcie_stats)->pcie_perf_counters, \ + counter_set.pcie_perf_cntrs_grp_data_layout.c##_high) + +struct mlx5e_pcie_stats { + __be64 pcie_perf_counters[MLX5_ST_SZ_QW(mpcnt_reg)]; +}; + +struct mlx5e_rq_stats { + u64 packets; + u64 bytes; + u64 csum_complete; + u64 csum_unnecessary; + u64 csum_unnecessary_inner; + u64 csum_none; + u64 lro_packets; + u64 lro_bytes; + u64 removed_vlan_packets; + u64 xdp_drop; + u64 xdp_tx; + u64 xdp_tx_full; + u64 wqe_err; + u64 mpwqe_filler; + u64 buff_alloc_err; + u64 cqe_compress_blks; + u64 cqe_compress_pkts; + u64 page_reuse; + u64 cache_reuse; + u64 cache_full; + u64 cache_empty; + u64 cache_busy; + u64 cache_waive; +}; + +struct mlx5e_sq_stats { + /* commonly accessed in data path */ + u64 packets; + u64 bytes; + u64 xmit_more; + u64 tso_packets; + u64 tso_bytes; + u64 tso_inner_packets; + u64 tso_inner_bytes; + u64 csum_partial; + u64 csum_partial_inner; + u64 added_vlan_packets; + u64 nop; + /* less likely accessed in data path */ + u64 csum_none; + u64 stopped; + u64 wake; + u64 dropped; + u64 cqe_err; + u64 recover; +}; + +struct mlx5e_ch_stats { + u64 eq_rearm; +}; + +struct mlx5e_stats { + struct mlx5e_sw_stats sw; + struct mlx5e_qcounter_stats qcnt; + struct mlx5e_vnic_env_stats vnic; + struct mlx5e_vport_stats vport; + struct mlx5e_pport_stats pport; + struct rtnl_link_stats64 vf_vport; + struct mlx5e_pcie_stats pcie; +}; + +enum { + MLX5E_NDO_UPDATE_STATS = BIT(0x1), +}; + +struct mlx5e_priv; +struct mlx5e_stats_grp { + u16 update_stats_mask; + int (*get_num_stats)(struct mlx5e_priv *priv); + int (*fill_strings)(struct mlx5e_priv *priv, u8 *data, int idx); + int (*fill_stats)(struct mlx5e_priv *priv, u64 *data, int idx); + void (*update_stats)(struct mlx5e_priv *priv); +}; + +extern const struct mlx5e_stats_grp mlx5e_stats_grps[]; +extern const int mlx5e_num_stats_grps; + +#endif /* __MLX5_EN_STATS_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c new file mode 100644 index 0000000..b94276d --- /dev/null +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -0,0 +1,2727 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <net/flow_dissector.h> +#include <net/sch_generic.h> +#include <net/pkt_cls.h> +#include <net/tc_act/tc_gact.h> +#include <net/tc_act/tc_skbedit.h> +#include <linux/mlx5/fs.h> +#include <linux/mlx5/device.h> +#include <linux/rhashtable.h> +#include <net/switchdev.h> +#include <net/tc_act/tc_mirred.h> +#include <net/tc_act/tc_vlan.h> +#include <net/tc_act/tc_tunnel_key.h> +#include <net/tc_act/tc_pedit.h> +#include <net/tc_act/tc_csum.h> +#include <net/vxlan.h> +#include <net/arp.h> +#include "en.h" +#include "en_rep.h" +#include "en_tc.h" +#include "eswitch.h" +#include "vxlan.h" +#include "fs_core.h" + +struct mlx5_nic_flow_attr { + u32 action; + u32 flow_tag; + u32 mod_hdr_id; + u32 hairpin_tirn; + struct mlx5_flow_table *hairpin_ft; +}; + +enum { + MLX5E_TC_FLOW_ESWITCH = BIT(0), + MLX5E_TC_FLOW_NIC = BIT(1), + MLX5E_TC_FLOW_OFFLOADED = BIT(2), + MLX5E_TC_FLOW_HAIRPIN = BIT(3), + MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(4), +}; + +struct mlx5e_tc_flow { + struct rhash_head node; + u64 cookie; + u8 flags; + struct mlx5_flow_handle *rule; + struct list_head encap; /* flows sharing the same encap ID */ + struct list_head mod_hdr; /* flows sharing the same mod hdr ID */ + struct list_head hairpin; /* flows sharing the same hairpin */ + union { + struct mlx5_esw_flow_attr esw_attr[0]; + struct mlx5_nic_flow_attr nic_attr[0]; + }; +}; + +struct mlx5e_tc_flow_parse_attr { + struct ip_tunnel_info tun_info; + struct mlx5_flow_spec spec; + int num_mod_hdr_actions; + void *mod_hdr_actions; + int mirred_ifindex; +}; + +enum { + MLX5_HEADER_TYPE_VXLAN = 0x0, + MLX5_HEADER_TYPE_NVGRE = 0x1, +}; + +#define MLX5E_TC_TABLE_NUM_GROUPS 4 +#define MLX5E_TC_TABLE_MAX_GROUP_SIZE (1 << 16) + +struct mlx5e_hairpin { + struct mlx5_hairpin *pair; + + struct mlx5_core_dev *func_mdev; + struct mlx5e_priv *func_priv; + u32 tdn; + u32 tirn; + + int num_channels; + struct mlx5e_rqt indir_rqt; + u32 indir_tirn[MLX5E_NUM_INDIR_TIRS]; + struct mlx5e_ttc_table ttc; +}; + +struct mlx5e_hairpin_entry { + /* a node of a hash table which keeps all the hairpin entries */ + struct hlist_node hairpin_hlist; + + /* flows sharing the same hairpin */ + struct list_head flows; + + u16 peer_vhca_id; + u8 prio; + struct mlx5e_hairpin *hp; +}; + +struct mod_hdr_key { + int num_actions; + void *actions; +}; + +struct mlx5e_mod_hdr_entry { + /* a node of a hash table which keeps all the mod_hdr entries */ + struct
hlist_node mod_hdr_hlist; + + /* flows sharing the same mod_hdr entry */ + struct list_head flows; + + struct mod_hdr_key key; + + u32 mod_hdr_id; +}; + +#define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto) + +static inline u32 hash_mod_hdr_info(struct mod_hdr_key *key) +{ + return jhash(key->actions, + key->num_actions * MLX5_MH_ACT_SZ, 0); +} + +static inline int cmp_mod_hdr_info(struct mod_hdr_key *a, + struct mod_hdr_key *b) +{ + if (a->num_actions != b->num_actions) + return 1; + + return memcmp(a->actions, b->actions, a->num_actions * MLX5_MH_ACT_SZ); +} + +static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5e_tc_flow_parse_attr *parse_attr) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + int num_actions, actions_size, namespace, err; + struct mlx5e_mod_hdr_entry *mh; + struct mod_hdr_key key; + bool found = false; + u32 hash_key; + + num_actions = parse_attr->num_mod_hdr_actions; + actions_size = MLX5_MH_ACT_SZ * num_actions; + + key.actions = parse_attr->mod_hdr_actions; + key.num_actions = num_actions; + + hash_key = hash_mod_hdr_info(&key); + + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) { + namespace = MLX5_FLOW_NAMESPACE_FDB; + hash_for_each_possible(esw->offloads.mod_hdr_tbl, mh, + mod_hdr_hlist, hash_key) { + if (!cmp_mod_hdr_info(&mh->key, &key)) { + found = true; + break; + } + } + } else { + namespace = MLX5_FLOW_NAMESPACE_KERNEL; + hash_for_each_possible(priv->fs.tc.mod_hdr_tbl, mh, + mod_hdr_hlist, hash_key) { + if (!cmp_mod_hdr_info(&mh->key, &key)) { + found = true; + break; + } + } + } + + if (found) + goto attach_flow; + + mh = kzalloc(sizeof(*mh) + actions_size, GFP_KERNEL); + if (!mh) + return -ENOMEM; + + mh->key.actions = (void *)mh + sizeof(*mh); + memcpy(mh->key.actions, key.actions, actions_size); + mh->key.num_actions = num_actions; + INIT_LIST_HEAD(&mh->flows); + + err = mlx5_modify_header_alloc(priv->mdev, namespace, + mh->key.num_actions, + mh->key.actions, + &mh->mod_hdr_id); + if (err) + goto out_err; + + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) + hash_add(esw->offloads.mod_hdr_tbl, &mh->mod_hdr_hlist, hash_key); + else + hash_add(priv->fs.tc.mod_hdr_tbl, &mh->mod_hdr_hlist, hash_key); + +attach_flow: + list_add(&flow->mod_hdr, &mh->flows); + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) + flow->esw_attr->mod_hdr_id = mh->mod_hdr_id; + else + flow->nic_attr->mod_hdr_id = mh->mod_hdr_id; + + return 0; + +out_err: + kfree(mh); + return err; +} + +static void mlx5e_detach_mod_hdr(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct list_head *next = flow->mod_hdr.next; + + list_del(&flow->mod_hdr); + + if (list_empty(next)) { + struct mlx5e_mod_hdr_entry *mh; + + mh = list_entry(next, struct mlx5e_mod_hdr_entry, flows); + + mlx5_modify_header_dealloc(priv->mdev, mh->mod_hdr_id); + hash_del(&mh->mod_hdr_hlist); + kfree(mh); + } +} + +static +struct mlx5_core_dev *mlx5e_hairpin_get_mdev(struct net *net, int ifindex) +{ + struct net_device *netdev; + struct mlx5e_priv *priv; + + netdev = __dev_get_by_index(net, ifindex); + priv = netdev_priv(netdev); + return priv->mdev; +} + +static int mlx5e_hairpin_create_transport(struct mlx5e_hairpin *hp) +{ + u32 in[MLX5_ST_SZ_DW(create_tir_in)] = {0}; + void *tirc; + int err; + + err = mlx5_core_alloc_transport_domain(hp->func_mdev, &hp->tdn); + if (err) + goto alloc_tdn_err; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, 
hp->pair->rqn[0]); + MLX5_SET(tirc, tirc, transport_domain, hp->tdn); + + err = mlx5_core_create_tir(hp->func_mdev, in, MLX5_ST_SZ_BYTES(create_tir_in), &hp->tirn); + if (err) + goto create_tir_err; + + return 0; + +create_tir_err: + mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn); +alloc_tdn_err: + return err; +} + +static void mlx5e_hairpin_destroy_transport(struct mlx5e_hairpin *hp) +{ + mlx5_core_destroy_tir(hp->func_mdev, hp->tirn); + mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn); +} + +static void mlx5e_hairpin_fill_rqt_rqns(struct mlx5e_hairpin *hp, void *rqtc) +{ + u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE], rqn; + struct mlx5e_priv *priv = hp->func_priv; + int i, ix, sz = MLX5E_INDIR_RQT_SIZE; + + mlx5e_build_default_indir_rqt(indirection_rqt, sz, + hp->num_channels); + + for (i = 0; i < sz; i++) { + ix = i; + if (priv->channels.params.rss_hfunc == ETH_RSS_HASH_XOR) + ix = mlx5e_bits_invert(i, ilog2(sz)); + ix = indirection_rqt[ix]; + rqn = hp->pair->rqn[ix]; + MLX5_SET(rqtc, rqtc, rq_num[i], rqn); + } +} + +static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp) +{ + int inlen, err, sz = MLX5E_INDIR_RQT_SIZE; + struct mlx5e_priv *priv = hp->func_priv; + struct mlx5_core_dev *mdev = priv->mdev; + void *rqtc; + u32 *in; + + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + MLX5_SET(rqtc, rqtc, rqt_max_size, sz); + + mlx5e_hairpin_fill_rqt_rqns(hp, rqtc); + + err = mlx5_core_create_rqt(mdev, in, inlen, &hp->indir_rqt.rqtn); + if (!err) + hp->indir_rqt.enabled = true; + + kvfree(in); + return err; +} + +static int mlx5e_hairpin_create_indirect_tirs(struct mlx5e_hairpin *hp) +{ + struct mlx5e_priv *priv = hp->func_priv; + u32 in[MLX5_ST_SZ_DW(create_tir_in)]; + int tt, i, err; + void *tirc; + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { + memset(in, 0, MLX5_ST_SZ_BYTES(create_tir_in)); + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + + MLX5_SET(tirc, tirc, transport_domain, hp->tdn); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, hp->indir_rqt.rqtn); + mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, false); + + err = mlx5_core_create_tir(hp->func_mdev, in, + MLX5_ST_SZ_BYTES(create_tir_in), &hp->indir_tirn[tt]); + if (err) { + mlx5_core_warn(hp->func_mdev, "create indirect tirs failed, %d\n", err); + goto err_destroy_tirs; + } + } + return 0; + +err_destroy_tirs: + for (i = 0; i < tt; i++) + mlx5_core_destroy_tir(hp->func_mdev, hp->indir_tirn[i]); + return err; +} + +static void mlx5e_hairpin_destroy_indirect_tirs(struct mlx5e_hairpin *hp) +{ + int tt; + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) + mlx5_core_destroy_tir(hp->func_mdev, hp->indir_tirn[tt]); +} + +static void mlx5e_hairpin_set_ttc_params(struct mlx5e_hairpin *hp, + struct ttc_params *ttc_params) +{ + struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr; + int tt; + + memset(ttc_params, 0, sizeof(*ttc_params)); + + ttc_params->any_tt_tirn = hp->tirn; + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) + ttc_params->indir_tirn[tt] = hp->indir_tirn[tt]; + + ft_attr->max_fte = MLX5E_NUM_TT; + ft_attr->level = MLX5E_TC_TTC_FT_LEVEL; + ft_attr->prio = MLX5E_TC_PRIO; +} + +static int mlx5e_hairpin_rss_init(struct mlx5e_hairpin *hp) +{ + struct mlx5e_priv *priv = hp->func_priv; + struct ttc_params ttc_params; + int err; + + 
err = mlx5e_hairpin_create_indirect_rqt(hp); + if (err) + return err; + + err = mlx5e_hairpin_create_indirect_tirs(hp); + if (err) + goto err_create_indirect_tirs; + + mlx5e_hairpin_set_ttc_params(hp, &ttc_params); + err = mlx5e_create_ttc_table(priv, &ttc_params, &hp->ttc); + if (err) + goto err_create_ttc_table; + + netdev_dbg(priv->netdev, "add hairpin: using %d channels rss ttc table id %x\n", + hp->num_channels, hp->ttc.ft.t->id); + + return 0; + +err_create_ttc_table: + mlx5e_hairpin_destroy_indirect_tirs(hp); +err_create_indirect_tirs: + mlx5e_destroy_rqt(priv, &hp->indir_rqt); + + return err; +} + +static void mlx5e_hairpin_rss_cleanup(struct mlx5e_hairpin *hp) +{ + struct mlx5e_priv *priv = hp->func_priv; + + mlx5e_destroy_ttc_table(priv, &hp->ttc); + mlx5e_hairpin_destroy_indirect_tirs(hp); + mlx5e_destroy_rqt(priv, &hp->indir_rqt); +} + +static struct mlx5e_hairpin * +mlx5e_hairpin_create(struct mlx5e_priv *priv, struct mlx5_hairpin_params *params, + int peer_ifindex) +{ + struct mlx5_core_dev *func_mdev, *peer_mdev; + struct mlx5e_hairpin *hp; + struct mlx5_hairpin *pair; + int err; + + hp = kzalloc(sizeof(*hp), GFP_KERNEL); + if (!hp) + return ERR_PTR(-ENOMEM); + + func_mdev = priv->mdev; + peer_mdev = mlx5e_hairpin_get_mdev(dev_net(priv->netdev), peer_ifindex); + + pair = mlx5_core_hairpin_create(func_mdev, peer_mdev, params); + if (IS_ERR(pair)) { + err = PTR_ERR(pair); + goto create_pair_err; + } + hp->pair = pair; + hp->func_mdev = func_mdev; + hp->func_priv = priv; + hp->num_channels = params->num_channels; + + err = mlx5e_hairpin_create_transport(hp); + if (err) + goto create_transport_err; + + if (hp->num_channels > 1) { + err = mlx5e_hairpin_rss_init(hp); + if (err) + goto rss_init_err; + } + + return hp; + +rss_init_err: + mlx5e_hairpin_destroy_transport(hp); +create_transport_err: + mlx5_core_hairpin_destroy(hp->pair); +create_pair_err: + kfree(hp); + return ERR_PTR(err); +} + +static void mlx5e_hairpin_destroy(struct mlx5e_hairpin *hp) +{ + if (hp->num_channels > 1) + mlx5e_hairpin_rss_cleanup(hp); + mlx5e_hairpin_destroy_transport(hp); + mlx5_core_hairpin_destroy(hp->pair); + kvfree(hp); +} + +static inline u32 hash_hairpin_info(u16 peer_vhca_id, u8 prio) +{ + return (peer_vhca_id << 16 | prio); +} + +static struct mlx5e_hairpin_entry *mlx5e_hairpin_get(struct mlx5e_priv *priv, + u16 peer_vhca_id, u8 prio) +{ + struct mlx5e_hairpin_entry *hpe; + u32 hash_key = hash_hairpin_info(peer_vhca_id, prio); + + hash_for_each_possible(priv->fs.tc.hairpin_tbl, hpe, + hairpin_hlist, hash_key) { + if (hpe->peer_vhca_id == peer_vhca_id && hpe->prio == prio) + return hpe; + } + + return NULL; +} + +#define UNKNOWN_MATCH_PRIO 8 + +static int mlx5e_hairpin_get_prio(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, u8 *match_prio) +{ + void *headers_c, *headers_v; + u8 prio_val, prio_mask = 0; + bool vlan_present; + +#ifdef CONFIG_MLX5_CORE_EN_DCB + if (priv->dcbx_dp.trust_state != MLX5_QPTS_TRUST_PCP) { + netdev_warn(priv->netdev, + "only PCP trust state supported for hairpin\n"); + return -EOPNOTSUPP; + } +#endif + headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + + vlan_present = MLX5_GET(fte_match_set_lyr_2_4, headers_v, cvlan_tag); + if (vlan_present) { + prio_mask = MLX5_GET(fte_match_set_lyr_2_4, headers_c, first_prio); + prio_val = MLX5_GET(fte_match_set_lyr_2_4, headers_v, first_prio); + } + + if (!vlan_present || !prio_mask) { + prio_val = 
UNKNOWN_MATCH_PRIO; + } else if (prio_mask != 0x7) { + netdev_warn(priv->netdev, + "masked priority match not supported for hairpin\n"); + return -EOPNOTSUPP; + } + + *match_prio = prio_val; + return 0; +} + +static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5e_tc_flow_parse_attr *parse_attr) +{ + int peer_ifindex = parse_attr->mirred_ifindex; + struct mlx5_hairpin_params params; + struct mlx5_core_dev *peer_mdev; + struct mlx5e_hairpin_entry *hpe; + struct mlx5e_hairpin *hp; + u64 link_speed64; + u32 link_speed; + u8 match_prio; + u16 peer_id; + int err; + + peer_mdev = mlx5e_hairpin_get_mdev(dev_net(priv->netdev), peer_ifindex); + if (!MLX5_CAP_GEN(priv->mdev, hairpin) || !MLX5_CAP_GEN(peer_mdev, hairpin)) { + netdev_warn(priv->netdev, "hairpin is not supported\n"); + return -EOPNOTSUPP; + } + + peer_id = MLX5_CAP_GEN(peer_mdev, vhca_id); + err = mlx5e_hairpin_get_prio(priv, &parse_attr->spec, &match_prio); + if (err) + return err; + hpe = mlx5e_hairpin_get(priv, peer_id, match_prio); + if (hpe) + goto attach_flow; + + hpe = kzalloc(sizeof(*hpe), GFP_KERNEL); + if (!hpe) + return -ENOMEM; + + INIT_LIST_HEAD(&hpe->flows); + hpe->peer_vhca_id = peer_id; + hpe->prio = match_prio; + + params.log_data_size = 15; + params.log_data_size = min_t(u8, params.log_data_size, + MLX5_CAP_GEN(priv->mdev, log_max_hairpin_wq_data_sz)); + params.log_data_size = max_t(u8, params.log_data_size, + MLX5_CAP_GEN(priv->mdev, log_min_hairpin_wq_data_sz)); + + params.log_num_packets = params.log_data_size - + MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(priv->mdev); + params.log_num_packets = min_t(u8, params.log_num_packets, + MLX5_CAP_GEN(priv->mdev, log_max_hairpin_num_packets)); + + params.q_counter = priv->q_counter; + /* set hairpin pair per each 50Gbs share of the link */ + mlx5e_get_max_linkspeed(priv->mdev, &link_speed); + link_speed = max_t(u32, link_speed, 50000); + link_speed64 = link_speed; + do_div(link_speed64, 50000); + params.num_channels = link_speed64; + + hp = mlx5e_hairpin_create(priv, &params, peer_ifindex); + if (IS_ERR(hp)) { + err = PTR_ERR(hp); + goto create_hairpin_err; + } + + netdev_dbg(priv->netdev, "add hairpin: tirn %x rqn %x peer %s sqn %x prio %d (log) data %d packets %d\n", + hp->tirn, hp->pair->rqn[0], hp->pair->peer_mdev->priv.name, + hp->pair->sqn[0], match_prio, params.log_data_size, params.log_num_packets); + + hpe->hp = hp; + hash_add(priv->fs.tc.hairpin_tbl, &hpe->hairpin_hlist, + hash_hairpin_info(peer_id, match_prio)); + +attach_flow: + if (hpe->hp->num_channels > 1) { + flow->flags |= MLX5E_TC_FLOW_HAIRPIN_RSS; + flow->nic_attr->hairpin_ft = hpe->hp->ttc.ft.t; + } else { + flow->nic_attr->hairpin_tirn = hpe->hp->tirn; + } + list_add(&flow->hairpin, &hpe->flows); + + return 0; + +create_hairpin_err: + kfree(hpe); + return err; +} + +static void mlx5e_hairpin_flow_del(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct list_head *next = flow->hairpin.next; + + list_del(&flow->hairpin); + + /* no more hairpin flows for us, release the hairpin pair */ + if (list_empty(next)) { + struct mlx5e_hairpin_entry *hpe; + + hpe = list_entry(next, struct mlx5e_hairpin_entry, flows); + + netdev_dbg(priv->netdev, "del hairpin: peer %s\n", + hpe->hp->pair->peer_mdev->priv.name); + + mlx5e_hairpin_destroy(hpe->hp); + hash_del(&hpe->hairpin_hlist); + kfree(hpe); + } +} + +static struct mlx5_flow_handle * +mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow) +{ + struct 
mlx5_nic_flow_attr *attr = flow->nic_attr; + struct mlx5_core_dev *dev = priv->mdev; + struct mlx5_flow_destination dest[2] = {}; + struct mlx5_flow_act flow_act = { + .action = attr->action, + .has_flow_tag = true, + .flow_tag = attr->flow_tag, + .encap_id = 0, + }; + struct mlx5_fc *counter = NULL; + struct mlx5_flow_handle *rule; + bool table_created = false; + int err, dest_ix = 0; + + if (flow->flags & MLX5E_TC_FLOW_HAIRPIN) { + err = mlx5e_hairpin_flow_add(priv, flow, parse_attr); + if (err) { + rule = ERR_PTR(err); + goto err_add_hairpin_flow; + } + if (flow->flags & MLX5E_TC_FLOW_HAIRPIN_RSS) { + dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[dest_ix].ft = attr->hairpin_ft; + } else { + dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest[dest_ix].tir_num = attr->hairpin_tirn; + } + dest_ix++; + } else if (attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[dest_ix].ft = priv->fs.vlan.ft.t; + dest_ix++; + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + counter = mlx5_fc_create(dev, true); + if (IS_ERR(counter)) { + rule = ERR_CAST(counter); + goto err_fc_create; + } + dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest[dest_ix].counter = counter; + dest_ix++; + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + err = mlx5e_attach_mod_hdr(priv, flow, parse_attr); + flow_act.modify_id = attr->mod_hdr_id; + kfree(parse_attr->mod_hdr_actions); + if (err) { + rule = ERR_PTR(err); + goto err_create_mod_hdr_id; + } + } + + if (IS_ERR_OR_NULL(priv->fs.tc.t)) { + int tc_grp_size, tc_tbl_size; + u32 max_flow_counter; + + max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) | + MLX5_CAP_GEN(dev, max_flow_counter_15_0); + + tc_grp_size = min_t(int, max_flow_counter, MLX5E_TC_TABLE_MAX_GROUP_SIZE); + + tc_tbl_size = min_t(int, tc_grp_size * MLX5E_TC_TABLE_NUM_GROUPS, + BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev, log_max_ft_size))); + + priv->fs.tc.t = + mlx5_create_auto_grouped_flow_table(priv->fs.ns, + MLX5E_TC_PRIO, + tc_tbl_size, + MLX5E_TC_TABLE_NUM_GROUPS, + MLX5E_TC_FT_LEVEL, 0); + if (IS_ERR(priv->fs.tc.t)) { + netdev_err(priv->netdev, + "Failed to create tc offload table\n"); + rule = ERR_CAST(priv->fs.tc.t); + goto err_create_ft; + } + + table_created = true; + } + + parse_attr->spec.match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + rule = mlx5_add_flow_rules(priv->fs.tc.t, &parse_attr->spec, + &flow_act, dest, dest_ix); + + if (IS_ERR(rule)) + goto err_add_rule; + + return rule; + +err_add_rule: + if (table_created) { + mlx5_destroy_flow_table(priv->fs.tc.t); + priv->fs.tc.t = NULL; + } +err_create_ft: + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + mlx5e_detach_mod_hdr(priv, flow); +err_create_mod_hdr_id: + mlx5_fc_destroy(dev, counter); +err_fc_create: + if (flow->flags & MLX5E_TC_FLOW_HAIRPIN) + mlx5e_hairpin_flow_del(priv, flow); +err_add_hairpin_flow: + return rule; +} + +static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_nic_flow_attr *attr = flow->nic_attr; + struct mlx5_fc *counter = NULL; + + counter = mlx5_flow_rule_counter(flow->rule); + mlx5_del_flow_rules(flow->rule); + mlx5_fc_destroy(priv->mdev, counter); + + if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) { + mlx5_destroy_flow_table(priv->fs.tc.t); + priv->fs.tc.t = NULL; + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + mlx5e_detach_mod_hdr(priv, flow); + + if (flow->flags & MLX5E_TC_FLOW_HAIRPIN) + 
mlx5e_hairpin_flow_del(priv, flow); +} + +static void mlx5e_detach_encap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow); + +static int mlx5e_attach_encap(struct mlx5e_priv *priv, + struct ip_tunnel_info *tun_info, + struct net_device *mirred_dev, + struct net_device **encap_dev, + struct mlx5e_tc_flow *flow); + +static struct mlx5_flow_handle * +mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_esw_flow_attr *attr = flow->esw_attr; + struct net_device *out_dev, *encap_dev = NULL; + struct mlx5_flow_handle *rule = NULL; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *out_priv; + int err; + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) { + out_dev = __dev_get_by_index(dev_net(priv->netdev), + attr->parse_attr->mirred_ifindex); + err = mlx5e_attach_encap(priv, &parse_attr->tun_info, + out_dev, &encap_dev, flow); + if (err) { + rule = ERR_PTR(err); + if (err != -EAGAIN) + goto err_attach_encap; + } + out_priv = netdev_priv(encap_dev); + rpriv = out_priv->ppriv; + attr->out_rep = rpriv->rep; + } + + err = mlx5_eswitch_add_vlan_action(esw, attr); + if (err) { + rule = ERR_PTR(err); + goto err_add_vlan; + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + err = mlx5e_attach_mod_hdr(priv, flow, parse_attr); + kfree(parse_attr->mod_hdr_actions); + if (err) { + rule = ERR_PTR(err); + goto err_mod_hdr; + } + } + + /* we get here if (1) there's no error (rule being null) or when + * (2) there's an encap action and we're on -EAGAIN (no valid neigh) + */ + if (rule != ERR_PTR(-EAGAIN)) { + rule = mlx5_eswitch_add_offloaded_rule(esw, &parse_attr->spec, attr); + if (IS_ERR(rule)) + goto err_add_rule; + } + return rule; + +err_add_rule: + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + mlx5e_detach_mod_hdr(priv, flow); +err_mod_hdr: + mlx5_eswitch_del_vlan_action(esw, attr); +err_add_vlan: + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) + mlx5e_detach_encap(priv, flow); +err_attach_encap: + return rule; +} + +static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_esw_flow_attr *attr = flow->esw_attr; + + if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) { + flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED; + mlx5_eswitch_del_offloaded_rule(esw, flow->rule, attr); + } + + mlx5_eswitch_del_vlan_action(esw, attr); + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) { + mlx5e_detach_encap(priv, flow); + kvfree(attr->parse_attr); + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + mlx5e_detach_mod_hdr(priv, flow); +} + +void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_esw_flow_attr *esw_attr; + struct mlx5e_tc_flow *flow; + int err; + + err = mlx5_encap_alloc(priv->mdev, e->tunnel_type, + e->encap_size, e->encap_header, + &e->encap_id); + if (err) { + mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %d\n", + err); + return; + } + e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(priv); + + list_for_each_entry(flow, &e->flows, encap) { + esw_attr = flow->esw_attr; + esw_attr->encap_id = e->encap_id; + flow->rule = mlx5_eswitch_add_offloaded_rule(esw, &esw_attr->parse_attr->spec, esw_attr); + if (IS_ERR(flow->rule)) { + err = PTR_ERR(flow->rule); + 
mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n", + err); + continue; + } + flow->flags |= MLX5E_TC_FLOW_OFFLOADED; + } +} + +void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_flow *flow; + + list_for_each_entry(flow, &e->flows, encap) { + if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) { + flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED; + mlx5_eswitch_del_offloaded_rule(esw, flow->rule, flow->esw_attr); + } + } + + if (e->flags & MLX5_ENCAP_ENTRY_VALID) { + e->flags &= ~MLX5_ENCAP_ENTRY_VALID; + mlx5_encap_dealloc(priv->mdev, e->encap_id); + } +} + +void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_neigh *m_neigh = &nhe->m_neigh; + u64 bytes, packets, lastuse = 0; + struct mlx5e_tc_flow *flow; + struct mlx5e_encap_entry *e; + struct mlx5_fc *counter; + struct neigh_table *tbl; + bool neigh_used = false; + struct neighbour *n; + + if (m_neigh->family == AF_INET) + tbl = &arp_tbl; +#if IS_ENABLED(CONFIG_IPV6) + else if (m_neigh->family == AF_INET6) + tbl = &nd_tbl; +#endif + else + return; + + list_for_each_entry(e, &nhe->encap_list, encap_list) { + if (!(e->flags & MLX5_ENCAP_ENTRY_VALID)) + continue; + list_for_each_entry(flow, &e->flows, encap) { + if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) { + counter = mlx5_flow_rule_counter(flow->rule); + mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); + if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) { + neigh_used = true; + break; + } + } + } + } + + if (neigh_used) { + nhe->reported_lastuse = jiffies; + + /* find the relevant neigh according to the cached device and + * dst ip pair + */ + n = neigh_lookup(tbl, &m_neigh->dst_ip, m_neigh->dev); + if (!n) { + WARN(1, "The neighbour already freed\n"); + return; + } + + neigh_event_send(n, NULL); + neigh_release(n); + } +} + +static void mlx5e_detach_encap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct list_head *next = flow->encap.next; + + list_del(&flow->encap); + if (list_empty(next)) { + struct mlx5e_encap_entry *e; + + e = list_entry(next, struct mlx5e_encap_entry, flows); + mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); + + if (e->flags & MLX5_ENCAP_ENTRY_VALID) + mlx5_encap_dealloc(priv->mdev, e->encap_id); + + hash_del_rcu(&e->encap_hlist); + kfree(e->encap_header); + kfree(e); + } +} + +static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) + mlx5e_tc_del_fdb_flow(priv, flow); + else + mlx5e_tc_del_nic_flow(priv, flow); +} + +static void parse_vxlan_attr(struct mlx5_flow_spec *spec, + struct tc_cls_flower_offload *f) +{ + void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers); + void *misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_protocol); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_UDP); + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_dissector_key_keyid *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + f->key); + struct flow_dissector_key_keyid *mask = + skb_flow_dissector_target(f->dissector, + 
FLOW_DISSECTOR_KEY_ENC_KEYID, + f->mask); + MLX5_SET(fte_match_set_misc, misc_c, vxlan_vni, + be32_to_cpu(mask->keyid)); + MLX5_SET(fte_match_set_misc, misc_v, vxlan_vni, + be32_to_cpu(key->keyid)); + } +} + +static int parse_tunnel_attr(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct tc_cls_flower_offload *f) +{ + void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers); + + struct flow_dissector_key_control *enc_control = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + f->key); + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_dissector_key_ports *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + f->key); + struct flow_dissector_key_ports *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + f->mask); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + struct net_device *up_dev = uplink_rpriv->netdev; + struct mlx5e_priv *up_priv = netdev_priv(up_dev); + + /* Full udp dst port must be given */ + if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst))) + goto vxlan_match_offload_err; + + if (mlx5e_vxlan_lookup_port(up_priv, be16_to_cpu(key->dst)) && + MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) + parse_vxlan_attr(spec, f); + else { + netdev_warn(priv->netdev, + "%d isn't an offloaded vxlan udp dport\n", be16_to_cpu(key->dst)); + return -EOPNOTSUPP; + } + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + udp_dport, ntohs(mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + udp_dport, ntohs(key->dst)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + udp_sport, ntohs(mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + udp_sport, ntohs(key->src)); + } else { /* udp dst port must be given */ +vxlan_match_offload_err: + netdev_warn(priv->netdev, + "IP tunnel decap offload supported only for vxlan, must set UDP dport\n"); + return -EOPNOTSUPP; + } + + if (enc_control->addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_dissector_key_ipv4_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + f->key); + struct flow_dissector_key_ipv4_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + f->mask); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4, + ntohl(mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4, + ntohl(key->src)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4, + ntohl(mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4, + ntohl(key->dst)); + + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP); + } else if (enc_control->addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_dissector_key_ipv6_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + f->key); + struct flow_dissector_key_ipv6_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + f->mask); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + 
memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &key->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IPV6); + } + + /* Enforce DMAC when offloading incoming tunneled flows. + * Flow counters require a match on the DMAC. + */ + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, dmac_47_16); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, dmac_15_0); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dmac_47_16), priv->netdev->dev_addr); + + /* let software handle IP fragments */ + MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0); + + return 0; +} + +static int __parse_cls_flower(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct tc_cls_flower_offload *f, + u8 *min_inline) +{ + void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers); + u16 addr_type = 0; + u8 ip_proto = 0; + + *min_inline = MLX5_INLINE_MODE_L2; + + if (f->dissector->used_keys & + ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_VLAN) | + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_PORTS) | + BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_TCP) | + BIT(FLOW_DISSECTOR_KEY_IP))) { + netdev_warn(priv->netdev, "Unsupported key used: 0x%x\n", + f->dissector->used_keys); + return -EOPNOTSUPP; + } + + if ((dissector_uses_key(f->dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) || + dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_KEYID) || + dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) && + dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) { + struct flow_dissector_key_control *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + f->key); + switch (key->addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + if (parse_tunnel_attr(priv, spec, f)) + return -EOPNOTSUPP; + break; + default: + return -EOPNOTSUPP; + } + + /* In decap flow, header pointers should point to the inner + * headers, outer header were already set by parse_tunnel_attr + */ + headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + inner_headers); + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CONTROL)) { + struct flow_dissector_key_control *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_CONTROL, + f->key); + + struct flow_dissector_key_control *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_CONTROL, + f->mask); + addr_type = key->addr_type; + + /* the HW doesn't support frag first/later */ + if (mask->flags & FLOW_DIS_FIRST_FRAG) 
+ return -EOPNOTSUPP; + + if (mask->flags & FLOW_DIS_IS_FRAGMENT) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, + key->flags & FLOW_DIS_IS_FRAGMENT); + + /* the HW doesn't need L3 inline to match on frag=no */ + if (key->flags & FLOW_DIS_IS_FRAGMENT) + *min_inline = MLX5_INLINE_MODE_IP; + } + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_dissector_key_basic *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_BASIC, + f->key); + struct flow_dissector_key_basic *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_BASIC, + f->mask); + ip_proto = key->ip_proto; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, + ntohs(mask->n_proto)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + ntohs(key->n_proto)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, + mask->ip_proto); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + key->ip_proto); + + if (mask->ip_proto) + *min_inline = MLX5_INLINE_MODE_IP; + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct flow_dissector_key_eth_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + f->key); + struct flow_dissector_key_eth_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + f->mask); + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dmac_47_16), + mask->dst); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dmac_47_16), + key->dst); + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + smac_47_16), + mask->src); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + smac_47_16), + key->src); + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_VLAN)) { + struct flow_dissector_key_vlan *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_VLAN, + f->key); + struct flow_dissector_key_vlan *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_VLAN, + f->mask); + if (mask->vlan_id || mask->vlan_priority) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, mask->vlan_id); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, key->vlan_id); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_prio, mask->vlan_priority); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, key->vlan_priority); + } + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_dissector_key_ipv4_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + f->key); + struct flow_dissector_key_ipv4_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + f->mask); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &mask->src, sizeof(mask->src)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &key->src, sizeof(key->src)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &mask->dst, sizeof(mask->dst)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &key->dst, sizeof(key->dst)); + + if (mask->src || mask->dst) + *min_inline = MLX5_INLINE_MODE_IP; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + 
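+ /* IPv6 address match; any non-zero address mask requires at least L3 (IP) inline mode */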
struct flow_dissector_key_ipv6_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + f->key); + struct flow_dissector_key_ipv6_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + f->mask); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &mask->src, sizeof(mask->src)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &key->src, sizeof(key->src)); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &mask->dst, sizeof(mask->dst)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &key->dst, sizeof(key->dst)); + + if (ipv6_addr_type(&mask->src) != IPV6_ADDR_ANY || + ipv6_addr_type(&mask->dst) != IPV6_ADDR_ANY) + *min_inline = MLX5_INLINE_MODE_IP; + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_IP)) { + struct flow_dissector_key_ip *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IP, + f->key); + struct flow_dissector_key_ip *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IP, + f->mask); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_ecn, mask->tos & 0x3); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, key->tos & 0x3); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_dscp, mask->tos >> 2); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, key->tos >> 2); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ttl_hoplimit, mask->ttl); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ttl_hoplimit, key->ttl); + + if (mask->ttl && + !MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.outer_ipv4_ttl)) + return -EOPNOTSUPP; + + if (mask->tos || mask->ttl) + *min_inline = MLX5_INLINE_MODE_IP; + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_PORTS)) { + struct flow_dissector_key_ports *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_PORTS, + f->key); + struct flow_dissector_key_ports *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_PORTS, + f->mask); + switch (ip_proto) { + case IPPROTO_TCP: + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + tcp_sport, ntohs(mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + tcp_sport, ntohs(key->src)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + tcp_dport, ntohs(mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + tcp_dport, ntohs(key->dst)); + break; + + case IPPROTO_UDP: + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + udp_sport, ntohs(mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + udp_sport, ntohs(key->src)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + udp_dport, ntohs(mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + udp_dport, ntohs(key->dst)); + break; + default: + netdev_err(priv->netdev, + "Only UDP and TCP transport are supported\n"); + return -EINVAL; + } + + if (mask->src || mask->dst) + *min_inline = MLX5_INLINE_MODE_TCP_UDP; + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_TCP)) { + struct flow_dissector_key_tcp *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_TCP, + f->key); + struct flow_dissector_key_tcp *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_TCP, + f->mask); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_flags, + ntohs(mask->flags)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_flags, + ntohs(key->flags)); + + if (mask->flags) + *min_inline = 
MLX5_INLINE_MODE_TCP_UDP; + } + + return 0; +} + +static int parse_cls_flower(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec, + struct tc_cls_flower_offload *f) +{ + struct mlx5_core_dev *dev = priv->mdev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep; + u8 min_inline; + int err; + + err = __parse_cls_flower(priv, spec, f, &min_inline); + + if (!err && (flow->flags & MLX5E_TC_FLOW_ESWITCH)) { + rep = rpriv->rep; + if (rep->vport != FDB_UPLINK_VPORT && + (esw->offloads.inline_mode != MLX5_INLINE_MODE_NONE && + esw->offloads.inline_mode < min_inline)) { + netdev_warn(priv->netdev, + "Flow is not offloaded due to min inline setting, required %d actual %d\n", + min_inline, esw->offloads.inline_mode); + return -EOPNOTSUPP; + } + } + + return err; +} + +struct pedit_headers { + struct ethhdr eth; + struct iphdr ip4; + struct ipv6hdr ip6; + struct tcphdr tcp; + struct udphdr udp; +}; + +static int pedit_header_offsets[] = { + [TCA_PEDIT_KEY_EX_HDR_TYPE_ETH] = offsetof(struct pedit_headers, eth), + [TCA_PEDIT_KEY_EX_HDR_TYPE_IP4] = offsetof(struct pedit_headers, ip4), + [TCA_PEDIT_KEY_EX_HDR_TYPE_IP6] = offsetof(struct pedit_headers, ip6), + [TCA_PEDIT_KEY_EX_HDR_TYPE_TCP] = offsetof(struct pedit_headers, tcp), + [TCA_PEDIT_KEY_EX_HDR_TYPE_UDP] = offsetof(struct pedit_headers, udp), +}; + +#define pedit_header(_ph, _htype) ((void *)(_ph) + pedit_header_offsets[_htype]) + +static int set_pedit_val(u8 hdr_type, u32 mask, u32 val, u32 offset, + struct pedit_headers *masks, + struct pedit_headers *vals) +{ + u32 *curr_pmask, *curr_pval; + + if (hdr_type >= __PEDIT_HDR_TYPE_MAX) + goto out_err; + + curr_pmask = (u32 *)(pedit_header(masks, hdr_type) + offset); + curr_pval = (u32 *)(pedit_header(vals, hdr_type) + offset); + + if (*curr_pmask & mask) /* disallow acting twice on the same location */ + goto out_err; + + *curr_pmask |= mask; + *curr_pval |= (val & mask); + + return 0; + +out_err: + return -EOPNOTSUPP; +} + +struct mlx5_fields { + u8 field; + u8 size; + u32 offset; +}; + +#define OFFLOAD(fw_field, size, field, off) \ + {MLX5_ACTION_IN_FIELD_OUT_ ## fw_field, size, offsetof(struct pedit_headers, field) + (off)} + +static struct mlx5_fields fields[] = { + OFFLOAD(DMAC_47_16, 4, eth.h_dest[0], 0), + OFFLOAD(DMAC_47_16, 4, eth.h_dest[0], 0), + OFFLOAD(DMAC_15_0, 2, eth.h_dest[4], 0), + OFFLOAD(SMAC_47_16, 4, eth.h_source[0], 0), + OFFLOAD(SMAC_15_0, 2, eth.h_source[4], 0), + OFFLOAD(ETHERTYPE, 2, eth.h_proto, 0), + + OFFLOAD(IP_TTL, 1, ip4.ttl, 0), + OFFLOAD(SIPV4, 4, ip4.saddr, 0), + OFFLOAD(DIPV4, 4, ip4.daddr, 0), + + OFFLOAD(SIPV6_127_96, 4, ip6.saddr.s6_addr32[0], 0), + OFFLOAD(SIPV6_95_64, 4, ip6.saddr.s6_addr32[1], 0), + OFFLOAD(SIPV6_63_32, 4, ip6.saddr.s6_addr32[2], 0), + OFFLOAD(SIPV6_31_0, 4, ip6.saddr.s6_addr32[3], 0), + OFFLOAD(DIPV6_127_96, 4, ip6.daddr.s6_addr32[0], 0), + OFFLOAD(DIPV6_95_64, 4, ip6.daddr.s6_addr32[1], 0), + OFFLOAD(DIPV6_63_32, 4, ip6.daddr.s6_addr32[2], 0), + OFFLOAD(DIPV6_31_0, 4, ip6.daddr.s6_addr32[3], 0), + OFFLOAD(IPV6_HOPLIMIT, 1, ip6.hop_limit, 0), + + OFFLOAD(TCP_SPORT, 2, tcp.source, 0), + OFFLOAD(TCP_DPORT, 2, tcp.dest, 0), + OFFLOAD(TCP_FLAGS, 1, tcp.ack_seq, 5), + + OFFLOAD(UDP_SPORT, 2, udp.source, 0), + OFFLOAD(UDP_DPORT, 2, udp.dest, 0), +}; + +/* On input attr->num_mod_hdr_actions tells how many HW actions can be parsed at + * max from the SW pedit action. On success, it says how many HW actions were + * actually parsed. 
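+ * The buffer itself is sized by alloc_mod_hdr_actions() from the number of pedit keys and the device's max_modify_header_actions capability.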
+ */ +static int offload_pedit_fields(struct pedit_headers *masks, + struct pedit_headers *vals, + struct mlx5e_tc_flow_parse_attr *parse_attr) +{ + struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals; + int i, action_size, nactions, max_actions, first, last, next_z; + void *s_masks_p, *a_masks_p, *vals_p; + struct mlx5_fields *f; + u8 cmd, field_bsize; + u32 s_mask, a_mask; + unsigned long mask; + __be32 mask_be32; + __be16 mask_be16; + void *action; + + set_masks = &masks[TCA_PEDIT_KEY_EX_CMD_SET]; + add_masks = &masks[TCA_PEDIT_KEY_EX_CMD_ADD]; + set_vals = &vals[TCA_PEDIT_KEY_EX_CMD_SET]; + add_vals = &vals[TCA_PEDIT_KEY_EX_CMD_ADD]; + + action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto); + action = parse_attr->mod_hdr_actions; + max_actions = parse_attr->num_mod_hdr_actions; + nactions = 0; + + for (i = 0; i < ARRAY_SIZE(fields); i++) { + f = &fields[i]; + /* avoid seeing bits set from previous iterations */ + s_mask = 0; + a_mask = 0; + + s_masks_p = (void *)set_masks + f->offset; + a_masks_p = (void *)add_masks + f->offset; + + memcpy(&s_mask, s_masks_p, f->size); + memcpy(&a_mask, a_masks_p, f->size); + + if (!s_mask && !a_mask) /* nothing to offload here */ + continue; + + if (s_mask && a_mask) { + printk(KERN_WARNING "mlx5: can't set and add to the same HW field (%x)\n", f->field); + return -EOPNOTSUPP; + } + + if (nactions == max_actions) { + printk(KERN_WARNING "mlx5: parsed %d pedit actions, can't do more\n", nactions); + return -EOPNOTSUPP; + } + + if (s_mask) { + cmd = MLX5_ACTION_TYPE_SET; + mask = s_mask; + vals_p = (void *)set_vals + f->offset; + /* clear to denote we consumed this field */ + memset(s_masks_p, 0, f->size); + } else { + cmd = MLX5_ACTION_TYPE_ADD; + mask = a_mask; + vals_p = (void *)add_vals + f->offset; + /* clear to denote we consumed this field */ + memset(a_masks_p, 0, f->size); + } + + field_bsize = f->size * BITS_PER_BYTE; + + if (field_bsize == 32) { + mask_be32 = *(__be32 *)&mask; + mask = (__force unsigned long)cpu_to_le32(be32_to_cpu(mask_be32)); + } else if (field_bsize == 16) { + mask_be16 = *(__be16 *)&mask; + mask = (__force unsigned long)cpu_to_le16(be16_to_cpu(mask_be16)); + } + + first = find_first_bit(&mask, field_bsize); + next_z = find_next_zero_bit(&mask, field_bsize, first); + last = find_last_bit(&mask, field_bsize); + if (first < next_z && next_z < last) { + printk(KERN_WARNING "mlx5: rewrite of few sub-fields (mask %lx) isn't offloaded\n", + mask); + return -EOPNOTSUPP; + } + + MLX5_SET(set_action_in, action, action_type, cmd); + MLX5_SET(set_action_in, action, field, f->field); + + if (cmd == MLX5_ACTION_TYPE_SET) { + MLX5_SET(set_action_in, action, offset, first); + /* length is num of bits to be written, zero means length of 32 */ + MLX5_SET(set_action_in, action, length, (last - first + 1)); + } + + if (field_bsize == 32) + MLX5_SET(set_action_in, action, data, ntohl(*(__be32 *)vals_p) >> first); + else if (field_bsize == 16) + MLX5_SET(set_action_in, action, data, ntohs(*(__be16 *)vals_p) >> first); + else if (field_bsize == 8) + MLX5_SET(set_action_in, action, data, *(u8 *)vals_p >> first); + + action += action_size; + nactions++; + } + + parse_attr->num_mod_hdr_actions = nactions; + return 0; +} + +static int alloc_mod_hdr_actions(struct mlx5e_priv *priv, + const struct tc_action *a, int namespace, + struct mlx5e_tc_flow_parse_attr *parse_attr) +{ + int nkeys, action_size, max_actions; + + nkeys = tcf_pedit_nkeys(a); + action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto); + + if 
(namespace == MLX5_FLOW_NAMESPACE_FDB) /* FDB offloading */ + max_actions = MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, max_modify_header_actions); + else /* namespace is MLX5_FLOW_NAMESPACE_KERNEL - NIC offloading */ + max_actions = MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, max_modify_header_actions); + + /* can get up to crazingly 16 HW actions in 32 bits pedit SW key */ + max_actions = min(max_actions, nkeys * 16); + + parse_attr->mod_hdr_actions = kcalloc(max_actions, action_size, GFP_KERNEL); + if (!parse_attr->mod_hdr_actions) + return -ENOMEM; + + parse_attr->num_mod_hdr_actions = max_actions; + return 0; +} + +static const struct pedit_headers zero_masks = {}; + +static int parse_tc_pedit_action(struct mlx5e_priv *priv, + const struct tc_action *a, int namespace, + struct mlx5e_tc_flow_parse_attr *parse_attr) +{ + struct pedit_headers masks[__PEDIT_CMD_MAX], vals[__PEDIT_CMD_MAX], *cmd_masks; + int nkeys, i, err = -EOPNOTSUPP; + u32 mask, val, offset; + u8 cmd, htype; + + nkeys = tcf_pedit_nkeys(a); + + memset(masks, 0, sizeof(struct pedit_headers) * __PEDIT_CMD_MAX); + memset(vals, 0, sizeof(struct pedit_headers) * __PEDIT_CMD_MAX); + + for (i = 0; i < nkeys; i++) { + htype = tcf_pedit_htype(a, i); + cmd = tcf_pedit_cmd(a, i); + err = -EOPNOTSUPP; /* can't be all optimistic */ + + if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK) { + printk(KERN_WARNING "mlx5: legacy pedit isn't offloaded\n"); + goto out_err; + } + + if (cmd != TCA_PEDIT_KEY_EX_CMD_SET && cmd != TCA_PEDIT_KEY_EX_CMD_ADD) { + printk(KERN_WARNING "mlx5: pedit cmd %d isn't offloaded\n", cmd); + goto out_err; + } + + mask = tcf_pedit_mask(a, i); + val = tcf_pedit_val(a, i); + offset = tcf_pedit_offset(a, i); + + err = set_pedit_val(htype, ~mask, val, offset, &masks[cmd], &vals[cmd]); + if (err) + goto out_err; + } + + err = alloc_mod_hdr_actions(priv, a, namespace, parse_attr); + if (err) + goto out_err; + + err = offload_pedit_fields(masks, vals, parse_attr); + if (err < 0) + goto out_dealloc_parsed_actions; + + for (cmd = 0; cmd < __PEDIT_CMD_MAX; cmd++) { + cmd_masks = &masks[cmd]; + if (memcmp(cmd_masks, &zero_masks, sizeof(zero_masks))) { + printk(KERN_WARNING "mlx5: attempt to offload an unsupported field (cmd %d)\n", + cmd); + print_hex_dump(KERN_WARNING, "mask: ", DUMP_PREFIX_ADDRESS, + 16, 1, cmd_masks, sizeof(zero_masks), true); + err = -EOPNOTSUPP; + goto out_dealloc_parsed_actions; + } + } + + return 0; + +out_dealloc_parsed_actions: + kfree(parse_attr->mod_hdr_actions); +out_err: + return err; +} + +static bool csum_offload_supported(struct mlx5e_priv *priv, u32 action, u32 update_flags) +{ + u32 prot_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR | TCA_CSUM_UPDATE_FLAG_TCP | + TCA_CSUM_UPDATE_FLAG_UDP; + + /* The HW recalcs checksums only if re-writing headers */ + if (!(action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)) { + netdev_warn(priv->netdev, + "TC csum action is only offloaded with pedit\n"); + return false; + } + + if (update_flags & ~prot_flags) { + netdev_warn(priv->netdev, + "can't offload TC csum action for some header/s - flags %#x\n", + update_flags); + return false; + } + + return true; +} + +static bool modify_header_match_supported(struct mlx5_flow_spec *spec, + struct tcf_exts *exts) +{ + const struct tc_action *a; + bool modify_ip_header; + LIST_HEAD(actions); + u8 htype, ip_proto; + void *headers_v; + u16 ethertype; + int nkeys, i; + + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype); + + /* for non-IP we only 
re-write MACs, so we're okay */ + if (ethertype != ETH_P_IP && ethertype != ETH_P_IPV6) + goto out_ok; + + modify_ip_header = false; + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + if (!is_tcf_pedit(a)) + continue; + + nkeys = tcf_pedit_nkeys(a); + for (i = 0; i < nkeys; i++) { + htype = tcf_pedit_htype(a, i); + if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 || + htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6) { + modify_ip_header = true; + break; + } + } + } + + ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol); + if (modify_ip_header && ip_proto != IPPROTO_TCP && + ip_proto != IPPROTO_UDP && ip_proto != IPPROTO_ICMP) { + pr_info("can't offload re-write of ip proto %d\n", ip_proto); + return false; + } + +out_ok: + return true; +} + +static bool actions_match_supported(struct mlx5e_priv *priv, + struct tcf_exts *exts, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow) +{ + u32 actions; + + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) + actions = flow->esw_attr->action; + else + actions = flow->nic_attr->action; + + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + return modify_header_match_supported(&parse_attr->spec, exts); + + return true; +} + +static bool same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv) +{ + struct mlx5_core_dev *fmdev, *pmdev; + u16 func_id, peer_id; + + fmdev = priv->mdev; + pmdev = peer_priv->mdev; + + func_id = (u16)((fmdev->pdev->bus->number << 8) | PCI_SLOT(fmdev->pdev->devfn)); + peer_id = (u16)((pmdev->pdev->bus->number << 8) | PCI_SLOT(pmdev->pdev->devfn)); + + return (func_id == peer_id); +} + +static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_nic_flow_attr *attr = flow->nic_attr; + const struct tc_action *a; + LIST_HEAD(actions); + int err; + + if (!tcf_exts_has_actions(exts)) + return -EINVAL; + + attr->flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; + attr->action = 0; + + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + if (is_tcf_gact_shot(a)) { + attr->action |= MLX5_FLOW_CONTEXT_ACTION_DROP; + if (MLX5_CAP_FLOWTABLE(priv->mdev, + flow_table_properties_nic_receive.flow_counter)) + attr->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + continue; + } + + if (is_tcf_pedit(a)) { + err = parse_tc_pedit_action(priv, a, MLX5_FLOW_NAMESPACE_KERNEL, + parse_attr); + if (err) + return err; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + continue; + } + + if (is_tcf_csum(a)) { + if (csum_offload_supported(priv, attr->action, + tcf_csum_update_flags(a))) + continue; + + return -EOPNOTSUPP; + } + + if (is_tcf_mirred_egress_redirect(a)) { + struct net_device *peer_dev = tcf_mirred_dev(a); + + if (priv->netdev->netdev_ops == peer_dev->netdev_ops && + same_hw_devs(priv, netdev_priv(peer_dev))) { + parse_attr->mirred_ifindex = peer_dev->ifindex; + flow->flags |= MLX5E_TC_FLOW_HAIRPIN; + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + } else { + netdev_warn(priv->netdev, "device %s not on same HW, can't offload\n", + peer_dev->name); + return -EINVAL; + } + continue; + } + + if (is_tcf_skbedit_mark(a)) { + u32 mark = tcf_skbedit_mark(a); + + if (mark & ~MLX5E_TC_FLOW_ID_MASK) { + netdev_warn(priv->netdev, "Bad flow mark - only 16 bit is supported: 0x%x\n", + mark); + return -EINVAL; + } + + attr->flow_tag = mark; + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + 
continue; + } + + return -EINVAL; + } + + if (!actions_match_supported(priv, exts, parse_attr, flow)) + return -EOPNOTSUPP; + + return 0; +} + +static inline int cmp_encap_info(struct ip_tunnel_key *a, + struct ip_tunnel_key *b) +{ + return memcmp(a, b, sizeof(*a)); +} + +static inline int hash_encap_info(struct ip_tunnel_key *key) +{ + return jhash(key, sizeof(*key), 0); +} + +static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct net_device **out_dev, + struct flowi4 *fl4, + struct neighbour **out_n, + int *out_ttl) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *uplink_rpriv; + struct rtable *rt; + struct neighbour *n = NULL; + +#if IS_ENABLED(CONFIG_INET) + int ret; + + rt = ip_route_output_key(dev_net(mirred_dev), fl4); + ret = PTR_ERR_OR_ZERO(rt); + if (ret) + return ret; +#else + return -EOPNOTSUPP; +#endif + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + /* if the egress device isn't on the same HW e-switch, we use the uplink */ + if (!switchdev_port_same_parent_id(priv->netdev, rt->dst.dev)) + *out_dev = uplink_rpriv->netdev; + else + *out_dev = rt->dst.dev; + + *out_ttl = ip4_dst_hoplimit(&rt->dst); + n = dst_neigh_lookup(&rt->dst, &fl4->daddr); + ip_rt_put(rt); + if (!n) + return -ENOMEM; + + *out_n = n; + return 0; +} + +static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct net_device **out_dev, + struct flowi6 *fl6, + struct neighbour **out_n, + int *out_ttl) +{ + struct neighbour *n = NULL; + struct dst_entry *dst; + +#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) + struct mlx5e_rep_priv *uplink_rpriv; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + int ret; + + ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst, + fl6); + if (ret < 0) + return ret; + + *out_ttl = ip6_dst_hoplimit(dst); + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + /* if the egress device isn't on the same HW e-switch, we use the uplink */ + if (!switchdev_port_same_parent_id(priv->netdev, dst->dev)) + *out_dev = uplink_rpriv->netdev; + else + *out_dev = dst->dev; +#else + return -EOPNOTSUPP; +#endif + + n = dst_neigh_lookup(dst, &fl6->daddr); + dst_release(dst); + if (!n) + return -ENOMEM; + + *out_n = n; + return 0; +} + +static void gen_vxlan_header_ipv4(struct net_device *out_dev, + char buf[], int encap_size, + unsigned char h_dest[ETH_ALEN], + int ttl, + __be32 daddr, + __be32 saddr, + __be16 udp_dst_port, + __be32 vx_vni) +{ + struct ethhdr *eth = (struct ethhdr *)buf; + struct iphdr *ip = (struct iphdr *)((char *)eth + sizeof(struct ethhdr)); + struct udphdr *udp = (struct udphdr *)((char *)ip + sizeof(struct iphdr)); + struct vxlanhdr *vxh = (struct vxlanhdr *)((char *)udp + sizeof(struct udphdr)); + + memset(buf, 0, encap_size); + + ether_addr_copy(eth->h_dest, h_dest); + ether_addr_copy(eth->h_source, out_dev->dev_addr); + eth->h_proto = htons(ETH_P_IP); + + ip->daddr = daddr; + ip->saddr = saddr; + + ip->ttl = ttl; + ip->protocol = IPPROTO_UDP; + ip->version = 0x4; + ip->ihl = 0x5; + + udp->dest = udp_dst_port; + vxh->vx_flags = VXLAN_HF_VNI; + vxh->vx_vni = vxlan_vni_field(vx_vni); +} + +static void gen_vxlan_header_ipv6(struct net_device *out_dev, + char buf[], int encap_size, + unsigned char h_dest[ETH_ALEN], + int ttl, + struct in6_addr *daddr, + struct in6_addr *saddr, + __be16 udp_dst_port, + __be32 vx_vni) +{ + struct ethhdr *eth = (struct ethhdr *)buf; + struct ipv6hdr *ip6h = (struct ipv6hdr 
*)((char *)eth + sizeof(struct ethhdr)); + struct udphdr *udp = (struct udphdr *)((char *)ip6h + sizeof(struct ipv6hdr)); + struct vxlanhdr *vxh = (struct vxlanhdr *)((char *)udp + sizeof(struct udphdr)); + + memset(buf, 0, encap_size); + + ether_addr_copy(eth->h_dest, h_dest); + ether_addr_copy(eth->h_source, out_dev->dev_addr); + eth->h_proto = htons(ETH_P_IPV6); + + ip6_flow_hdr(ip6h, 0, 0); + /* the HW fills up ipv6 payload len */ + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + + udp->dest = udp_dst_port; + vxh->vx_flags = VXLAN_HF_VNI; + vxh->vx_vni = vxlan_vni_field(vx_vni); +} + +static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e) +{ + int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); + int ipv4_encap_size = ETH_HLEN + sizeof(struct iphdr) + VXLAN_HLEN; + struct ip_tunnel_key *tun_key = &e->tun_info.key; + struct net_device *out_dev; + struct neighbour *n = NULL; + struct flowi4 fl4 = {}; + char *encap_header; + int ttl, err; + u8 nud_state; + + if (max_encap_size < ipv4_encap_size) { + mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n", + ipv4_encap_size, max_encap_size); + return -EOPNOTSUPP; + } + + encap_header = kzalloc(ipv4_encap_size, GFP_KERNEL); + if (!encap_header) + return -ENOMEM; + + switch (e->tunnel_type) { + case MLX5_HEADER_TYPE_VXLAN: + fl4.flowi4_proto = IPPROTO_UDP; + fl4.fl4_dport = tun_key->tp_dst; + break; + default: + err = -EOPNOTSUPP; + goto free_encap; + } + fl4.flowi4_tos = tun_key->tos; + fl4.daddr = tun_key->u.ipv4.dst; + fl4.saddr = tun_key->u.ipv4.src; + + err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev, + &fl4, &n, &ttl); + if (err) + goto free_encap; + + /* used by mlx5e_detach_encap to lookup a neigh hash table + * entry in the neigh hash table when a user deletes a rule + */ + e->m_neigh.dev = n->dev; + e->m_neigh.family = n->ops->family; + memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len); + e->out_dev = out_dev; + + /* It's importent to add the neigh to the hash table before checking + * the neigh validity state. So if we'll get a notification, in case the + * neigh changes it's validity state, we would find the relevant neigh + * in the hash. 
+ */ + err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e); + if (err) + goto free_encap; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(e->h_dest, n->ha); + read_unlock_bh(&n->lock); + + switch (e->tunnel_type) { + case MLX5_HEADER_TYPE_VXLAN: + gen_vxlan_header_ipv4(out_dev, encap_header, + ipv4_encap_size, e->h_dest, ttl, + fl4.daddr, + fl4.saddr, tun_key->tp_dst, + tunnel_id_to_key32(tun_key->tun_id)); + break; + default: + err = -EOPNOTSUPP; + goto destroy_neigh_entry; + } + e->encap_size = ipv4_encap_size; + e->encap_header = encap_header; + + if (!(nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); + err = -EAGAIN; + goto out; + } + + err = mlx5_encap_alloc(priv->mdev, e->tunnel_type, + ipv4_encap_size, encap_header, &e->encap_id); + if (err) + goto destroy_neigh_entry; + + e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev)); + neigh_release(n); + return err; + +destroy_neigh_entry: + mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); +free_encap: + kfree(encap_header); +out: + if (n) + neigh_release(n); + return err; +} + +static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e) +{ + int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); + int ipv6_encap_size = ETH_HLEN + sizeof(struct ipv6hdr) + VXLAN_HLEN; + struct ip_tunnel_key *tun_key = &e->tun_info.key; + struct net_device *out_dev; + struct neighbour *n = NULL; + struct flowi6 fl6 = {}; + char *encap_header; + int err, ttl = 0; + u8 nud_state; + + if (max_encap_size < ipv6_encap_size) { + mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n", + ipv6_encap_size, max_encap_size); + return -EOPNOTSUPP; + } + + encap_header = kzalloc(ipv6_encap_size, GFP_KERNEL); + if (!encap_header) + return -ENOMEM; + + switch (e->tunnel_type) { + case MLX5_HEADER_TYPE_VXLAN: + fl6.flowi6_proto = IPPROTO_UDP; + fl6.fl6_dport = tun_key->tp_dst; + break; + default: + err = -EOPNOTSUPP; + goto free_encap; + } + + fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tun_key->tos), tun_key->label); + fl6.daddr = tun_key->u.ipv6.dst; + fl6.saddr = tun_key->u.ipv6.src; + + err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev, + &fl6, &n, &ttl); + if (err) + goto free_encap; + + /* used by mlx5e_detach_encap to lookup a neigh hash table + * entry in the neigh hash table when a user deletes a rule + */ + e->m_neigh.dev = n->dev; + e->m_neigh.family = n->ops->family; + memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len); + e->out_dev = out_dev; + + /* It's importent to add the neigh to the hash table before checking + * the neigh validity state. So if we'll get a notification, in case the + * neigh changes it's validity state, we would find the relevant neigh + * in the hash. 
+ */ + err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e); + if (err) + goto free_encap; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(e->h_dest, n->ha); + read_unlock_bh(&n->lock); + + switch (e->tunnel_type) { + case MLX5_HEADER_TYPE_VXLAN: + gen_vxlan_header_ipv6(out_dev, encap_header, + ipv6_encap_size, e->h_dest, ttl, + &fl6.daddr, + &fl6.saddr, tun_key->tp_dst, + tunnel_id_to_key32(tun_key->tun_id)); + break; + default: + err = -EOPNOTSUPP; + goto destroy_neigh_entry; + } + + e->encap_size = ipv6_encap_size; + e->encap_header = encap_header; + + if (!(nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); + err = -EAGAIN; + goto out; + } + + err = mlx5_encap_alloc(priv->mdev, e->tunnel_type, + ipv6_encap_size, encap_header, &e->encap_id); + if (err) + goto destroy_neigh_entry; + + e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev)); + neigh_release(n); + return err; + +destroy_neigh_entry: + mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); +free_encap: + kfree(encap_header); +out: + if (n) + neigh_release(n); + return err; +} + +static int mlx5e_attach_encap(struct mlx5e_priv *priv, + struct ip_tunnel_info *tun_info, + struct net_device *mirred_dev, + struct net_device **encap_dev, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, + REP_ETH); + struct net_device *up_dev = uplink_rpriv->netdev; + unsigned short family = ip_tunnel_info_af(tun_info); + struct mlx5e_priv *up_priv = netdev_priv(up_dev); + struct mlx5_esw_flow_attr *attr = flow->esw_attr; + struct ip_tunnel_key *key = &tun_info->key; + struct mlx5e_encap_entry *e; + int tunnel_type, err = 0; + uintptr_t hash_key; + bool found = false; + + /* udp dst port must be set */ + if (!memchr_inv(&key->tp_dst, 0, sizeof(key->tp_dst))) + goto vxlan_encap_offload_err; + + /* setting udp src port isn't supported */ + if (memchr_inv(&key->tp_src, 0, sizeof(key->tp_src))) { +vxlan_encap_offload_err: + netdev_warn(priv->netdev, + "must set udp dst port and not set udp src port\n"); + return -EOPNOTSUPP; + } + + if (mlx5e_vxlan_lookup_port(up_priv, be16_to_cpu(key->tp_dst)) && + MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) { + tunnel_type = MLX5_HEADER_TYPE_VXLAN; + } else { + netdev_warn(priv->netdev, + "%d isn't an offloaded vxlan udp dport\n", be16_to_cpu(key->tp_dst)); + return -EOPNOTSUPP; + } + + hash_key = hash_encap_info(key); + + hash_for_each_possible_rcu(esw->offloads.encap_tbl, e, + encap_hlist, hash_key) { + if (!cmp_encap_info(&e->tun_info.key, key)) { + found = true; + break; + } + } + + /* must verify if encap is valid or not */ + if (found) + goto attach_flow; + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) + return -ENOMEM; + + e->tun_info = *tun_info; + e->tunnel_type = tunnel_type; + INIT_LIST_HEAD(&e->flows); + + if (family == AF_INET) + err = mlx5e_create_encap_header_ipv4(priv, mirred_dev, e); + else if (family == AF_INET6) + err = mlx5e_create_encap_header_ipv6(priv, mirred_dev, e); + + if (err && err != -EAGAIN) + goto out_err; + + hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key); + +attach_flow: + list_add(&flow->encap, &e->flows); + *encap_dev = e->out_dev; + if (e->flags & MLX5_ENCAP_ENTRY_VALID) + attr->encap_id = e->encap_id; + else + err = -EAGAIN; + + return err; + +out_err: + kfree(e); + return err; +} + +static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, + struct 
mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_esw_flow_attr *attr = flow->esw_attr; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct ip_tunnel_info *info = NULL; + const struct tc_action *a; + LIST_HEAD(actions); + bool encap = false; + int err = 0; + + if (!tcf_exts_has_actions(exts)) + return -EINVAL; + + memset(attr, 0, sizeof(*attr)); + attr->in_rep = rpriv->rep; + + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + if (is_tcf_gact_shot(a)) { + attr->action |= MLX5_FLOW_CONTEXT_ACTION_DROP | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + continue; + } + + if (is_tcf_pedit(a)) { + err = parse_tc_pedit_action(priv, a, MLX5_FLOW_NAMESPACE_FDB, + parse_attr); + if (err) + return err; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + continue; + } + + if (is_tcf_csum(a)) { + if (csum_offload_supported(priv, attr->action, + tcf_csum_update_flags(a))) + continue; + + return -EOPNOTSUPP; + } + + if (is_tcf_mirred_egress_redirect(a)) { + struct net_device *out_dev; + struct mlx5e_priv *out_priv; + + out_dev = tcf_mirred_dev(a); + + if (switchdev_port_same_parent_id(priv->netdev, + out_dev)) { + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + out_priv = netdev_priv(out_dev); + rpriv = out_priv->ppriv; + attr->out_rep = rpriv->rep; + } else if (encap) { + parse_attr->mirred_ifindex = out_dev->ifindex; + parse_attr->tun_info = *info; + attr->parse_attr = parse_attr; + attr->action |= MLX5_FLOW_CONTEXT_ACTION_ENCAP | + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + /* attr->out_rep is resolved when we handle encap */ + } else { + pr_err("devices %s %s not on same switch HW, can't offload forwarding\n", + priv->netdev->name, out_dev->name); + return -EINVAL; + } + continue; + } + + if (is_tcf_tunnel_set(a)) { + info = tcf_tunnel_info(a); + if (info) + encap = true; + else + return -EOPNOTSUPP; + continue; + } + + if (is_tcf_vlan(a)) { + if (tcf_vlan_action(a) == TCA_VLAN_ACT_POP) { + attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; + } else if (tcf_vlan_action(a) == TCA_VLAN_ACT_PUSH) { + attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH; + attr->vlan_vid = tcf_vlan_push_vid(a); + if (mlx5_eswitch_vlan_actions_supported(priv->mdev)) { + attr->vlan_prio = tcf_vlan_push_prio(a); + attr->vlan_proto = tcf_vlan_push_proto(a); + if (!attr->vlan_proto) + attr->vlan_proto = htons(ETH_P_8021Q); + } else if (tcf_vlan_push_proto(a) != htons(ETH_P_8021Q) || + tcf_vlan_push_prio(a)) { + return -EOPNOTSUPP; + } + } else { /* action is TCA_VLAN_ACT_MODIFY */ + return -EOPNOTSUPP; + } + continue; + } + + if (is_tcf_tunnel_release(a)) { + attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; + continue; + } + + return -EINVAL; + } + + if (!actions_match_supported(priv, exts, parse_attr, flow)) + return -EOPNOTSUPP; + + return err; +} + +int mlx5e_configure_flower(struct mlx5e_priv *priv, + struct tc_cls_flower_offload *f) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5e_tc_flow *flow; + int attr_size, err = 0; + u8 flow_flags = 0; + + if (esw && esw->mode == SRIOV_OFFLOADS) { + flow_flags = MLX5E_TC_FLOW_ESWITCH; + attr_size = sizeof(struct mlx5_esw_flow_attr); + } else { + flow_flags = MLX5E_TC_FLOW_NIC; + attr_size = sizeof(struct mlx5_nic_flow_attr); + } + + flow = kzalloc(sizeof(*flow) + attr_size, GFP_KERNEL); + parse_attr = kvzalloc(sizeof(*parse_attr), GFP_KERNEL); + 
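A side note on parse_tc_fdb_actions() above: it walks the TC actions and ORs the matching MLX5_FLOW_CONTEXT_ACTION_* bits into attr->action (drop and redirect also set the COUNT bit, tunnel set/release map to ENCAP/DECAP). The standalone sketch below mirrors that accumulation pattern only; every DEMO_* name and flag value is an illustrative stand-in, not the driver's definitions from mlx5_ifc.h.

#include <stdio.h>

/* Illustrative stand-in flags; the real MLX5_FLOW_CONTEXT_ACTION_* values differ. */
enum {
	DEMO_ACTION_DROP     = 1 << 0,
	DEMO_ACTION_FWD_DEST = 1 << 1,
	DEMO_ACTION_COUNT    = 1 << 2,
	DEMO_ACTION_ENCAP    = 1 << 3,
	DEMO_ACTION_DECAP    = 1 << 4,
};

enum demo_tc_act {
	DEMO_TC_GACT_SHOT,
	DEMO_TC_MIRRED_REDIRECT,
	DEMO_TC_TUNNEL_SET,
	DEMO_TC_TUNNEL_RELEASE,
};

/* Fold a list of TC actions into one action bitmask (simplified mirror of the parser). */
static unsigned int demo_parse_actions(const enum demo_tc_act *acts, int n)
{
	unsigned int action = 0;
	int i;

	for (i = 0; i < n; i++) {
		switch (acts[i]) {
		case DEMO_TC_GACT_SHOT:
			action |= DEMO_ACTION_DROP | DEMO_ACTION_COUNT;
			break;
		case DEMO_TC_MIRRED_REDIRECT:
			action |= DEMO_ACTION_FWD_DEST | DEMO_ACTION_COUNT;
			break;
		case DEMO_TC_TUNNEL_SET:
			action |= DEMO_ACTION_ENCAP;
			break;
		case DEMO_TC_TUNNEL_RELEASE:
			action |= DEMO_ACTION_DECAP;
			break;
		}
	}
	return action;
}

int main(void)
{
	enum demo_tc_act acts[] = { DEMO_TC_TUNNEL_SET, DEMO_TC_MIRRED_REDIRECT };

	printf("action mask: 0x%x\n",
	       demo_parse_actions(acts, (int)(sizeof(acts) / sizeof(acts[0]))));
	return 0;
}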
if (!parse_attr || !flow) { + err = -ENOMEM; + goto err_free; + } + + flow->cookie = f->cookie; + flow->flags = flow_flags; + + err = parse_cls_flower(priv, flow, &parse_attr->spec, f); + if (err < 0) + goto err_free; + + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) { + err = parse_tc_fdb_actions(priv, f->exts, parse_attr, flow); + if (err < 0) + goto err_free; + flow->rule = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow); + } else { + err = parse_tc_nic_actions(priv, f->exts, parse_attr, flow); + if (err < 0) + goto err_free; + flow->rule = mlx5e_tc_add_nic_flow(priv, parse_attr, flow); + } + + if (IS_ERR(flow->rule)) { + err = PTR_ERR(flow->rule); + if (err != -EAGAIN) + goto err_free; + } + + if (err != -EAGAIN) + flow->flags |= MLX5E_TC_FLOW_OFFLOADED; + + if (!(flow->flags & MLX5E_TC_FLOW_ESWITCH) || + !(flow->esw_attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)) + kvfree(parse_attr); + + err = rhashtable_insert_fast(&tc->ht, &flow->node, + tc->ht_params); + if (err) { + mlx5e_tc_del_flow(priv, flow); + kfree(flow); + } + + return err; + +err_free: + kvfree(parse_attr); + kfree(flow); + return err; +} + +int mlx5e_delete_flower(struct mlx5e_priv *priv, + struct tc_cls_flower_offload *f) +{ + struct mlx5e_tc_flow *flow; + struct mlx5e_tc_table *tc = &priv->fs.tc; + + flow = rhashtable_lookup_fast(&tc->ht, &f->cookie, + tc->ht_params); + if (!flow) + return -EINVAL; + + rhashtable_remove_fast(&tc->ht, &flow->node, tc->ht_params); + + mlx5e_tc_del_flow(priv, flow); + + kfree(flow); + + return 0; +} + +int mlx5e_stats_flower(struct mlx5e_priv *priv, + struct tc_cls_flower_offload *f) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5e_tc_flow *flow; + struct mlx5_fc *counter; + u64 bytes; + u64 packets; + u64 lastuse; + + flow = rhashtable_lookup_fast(&tc->ht, &f->cookie, + tc->ht_params); + if (!flow) + return -EINVAL; + + if (!(flow->flags & MLX5E_TC_FLOW_OFFLOADED)) + return 0; + + counter = mlx5_flow_rule_counter(flow->rule); + if (!counter) + return 0; + + mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); + + tcf_exts_stats_update(f->exts, bytes, packets, lastuse); + + return 0; +} + +static const struct rhashtable_params mlx5e_tc_flow_ht_params = { + .head_offset = offsetof(struct mlx5e_tc_flow, node), + .key_offset = offsetof(struct mlx5e_tc_flow, cookie), + .key_len = sizeof(((struct mlx5e_tc_flow *)0)->cookie), + .automatic_shrinking = true, +}; + +int mlx5e_tc_init(struct mlx5e_priv *priv) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + + hash_init(tc->mod_hdr_tbl); + hash_init(tc->hairpin_tbl); + + tc->ht_params = mlx5e_tc_flow_ht_params; + return rhashtable_init(&tc->ht, &tc->ht_params); +} + +static void _mlx5e_tc_del_flow(void *ptr, void *arg) +{ + struct mlx5e_tc_flow *flow = ptr; + struct mlx5e_priv *priv = arg; + + mlx5e_tc_del_flow(priv, flow); + kfree(flow); +} + +void mlx5e_tc_cleanup(struct mlx5e_priv *priv) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + + rhashtable_free_and_destroy(&tc->ht, _mlx5e_tc_del_flow, priv); + + if (!IS_ERR_OR_NULL(tc->t)) { + mlx5_destroy_flow_table(tc->t); + tc->t = NULL; + } +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h new file mode 100644 index 0000000..c14c263 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
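For context on the flow table set up in mlx5e_tc_init() above: mlx5e_tc_flow_ht_params keys the rhashtable on the flow's TC cookie (key_offset/key_len describe the cookie field), which is why mlx5e_delete_flower() and mlx5e_stats_flower() can find a flow from f->cookie alone. The small userspace sketch below illustrates that cookie-keyed lookup; struct demo_flow and demo_lookup() are made-up stand-ins, with a linear scan in place of rhashtable_lookup_fast().

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Simplified stand-in for struct mlx5e_tc_flow: only the lookup key matters here. */
struct demo_flow {
	uint64_t cookie;	/* plays the role of tc_cls_flower_offload->cookie */
	int offloaded;
};

/* Linear scan standing in for rhashtable_lookup_fast(&tc->ht, &cookie, params). */
static struct demo_flow *demo_lookup(struct demo_flow *tbl, int n, uint64_t cookie)
{
	int i;

	for (i = 0; i < n; i++)
		if (tbl[i].cookie == cookie)
			return &tbl[i];
	return NULL;
}

int main(void)
{
	struct demo_flow tbl[] = {
		{ .cookie = 0x1234, .offloaded = 1 },
		{ .cookie = 0xbeef, .offloaded = 0 },
	};

	/* key_offset/key_len in the real ht_params describe exactly this field. */
	printf("key_offset=%zu key_len=%zu\n",
	       offsetof(struct demo_flow, cookie),
	       sizeof(((struct demo_flow *)0)->cookie));
	printf("lookup(0xbeef) found=%d\n", demo_lookup(tbl, 2, 0xbeef) != NULL);
	return 0;
}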
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_EN_TC_H__ +#define __MLX5_EN_TC_H__ + +#include + +#define MLX5E_TC_FLOW_ID_MASK 0x0000ffff + +#ifdef CONFIG_MLX5_ESWITCH +int mlx5e_tc_init(struct mlx5e_priv *priv); +void mlx5e_tc_cleanup(struct mlx5e_priv *priv); + +int mlx5e_configure_flower(struct mlx5e_priv *priv, + struct tc_cls_flower_offload *f); +int mlx5e_delete_flower(struct mlx5e_priv *priv, + struct tc_cls_flower_offload *f); + +int mlx5e_stats_flower(struct mlx5e_priv *priv, + struct tc_cls_flower_offload *f); + +struct mlx5e_encap_entry; +void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e); +void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e); + +struct mlx5e_neigh_hash_entry; +void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe); + +static inline int mlx5e_tc_num_filters(struct mlx5e_priv *priv) +{ + return atomic_read(&priv->fs.tc.ht.nelems); +} + +#else /* CONFIG_MLX5_ESWITCH */ +static inline int mlx5e_tc_init(struct mlx5e_priv *priv) { return 0; } +static inline void mlx5e_tc_cleanup(struct mlx5e_priv *priv) {} +static inline int mlx5e_tc_num_filters(struct mlx5e_priv *priv) { return 0; } +#endif + +#endif /* __MLX5_EN_TC_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c new file mode 100644 index 0000000..5532aa3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -0,0 +1,664 @@ +/* + * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "en.h" +#include "ipoib/ipoib.h" +#include "en_accel/ipsec_rxtx.h" +#include "lib/clock.h" + +#define MLX5E_SQ_NOPS_ROOM MLX5_SEND_WQE_MAX_WQEBBS +#define MLX5E_SQ_STOP_ROOM (MLX5_SEND_WQE_MAX_WQEBBS +\ + MLX5E_SQ_NOPS_ROOM) + +static inline void mlx5e_tx_dma_unmap(struct device *pdev, + struct mlx5e_sq_dma *dma) +{ + switch (dma->type) { + case MLX5E_DMA_MAP_SINGLE: + dma_unmap_single(pdev, dma->addr, dma->size, DMA_TO_DEVICE); + break; + case MLX5E_DMA_MAP_PAGE: + dma_unmap_page(pdev, dma->addr, dma->size, DMA_TO_DEVICE); + break; + default: + WARN_ONCE(true, "mlx5e_tx_dma_unmap unknown DMA type!\n"); + } +} + +static inline void mlx5e_dma_push(struct mlx5e_txqsq *sq, + dma_addr_t addr, + u32 size, + enum mlx5e_dma_map_type map_type) +{ + u32 i = sq->dma_fifo_pc & sq->dma_fifo_mask; + + sq->db.dma_fifo[i].addr = addr; + sq->db.dma_fifo[i].size = size; + sq->db.dma_fifo[i].type = map_type; + sq->dma_fifo_pc++; +} + +static inline struct mlx5e_sq_dma *mlx5e_dma_get(struct mlx5e_txqsq *sq, u32 i) +{ + return &sq->db.dma_fifo[i & sq->dma_fifo_mask]; +} + +static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma) +{ + int i; + + for (i = 0; i < num_dma; i++) { + struct mlx5e_sq_dma *last_pushed_dma = + mlx5e_dma_get(sq, --sq->dma_fifo_pc); + + mlx5e_tx_dma_unmap(sq->pdev, last_pushed_dma); + } +} + +#ifdef CONFIG_MLX5_CORE_EN_DCB +static inline int mlx5e_get_dscp_up(struct mlx5e_priv *priv, struct sk_buff *skb) +{ + int dscp_cp = 0; + + if (skb->protocol == htons(ETH_P_IP)) + dscp_cp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + else if (skb->protocol == htons(ETH_P_IPV6)) + dscp_cp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + + return priv->dcbx_dp.dscp2prio[dscp_cp]; +} +#endif + +u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int channel_ix = fallback(dev, skb); + u16 num_channels; + int up = 0; + + if (!netdev_get_num_tc(dev)) + return channel_ix; + +#ifdef CONFIG_MLX5_CORE_EN_DCB + if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_DSCP) + up = mlx5e_get_dscp_up(priv, skb); + else +#endif + if (skb_vlan_tag_present(skb)) + up = skb->vlan_tci >> VLAN_PRIO_SHIFT; + + /* channel_ix can be larger than num_channels since + * dev->num_real_tx_queues = num_channels * num_tc + */ + num_channels = priv->channels.params.num_channels; + if (channel_ix >= num_channels) + channel_ix = reciprocal_scale(channel_ix, num_channels); + + return priv->channel_tc2txq[channel_ix][up]; +} + +static inline int mlx5e_skb_l2_header_offset(struct sk_buff *skb) +{ +#define MLX5E_MIN_INLINE (ETH_HLEN + VLAN_HLEN) + + return max(skb_network_offset(skb), MLX5E_MIN_INLINE); +} + +static inline int mlx5e_skb_l3_header_offset(struct sk_buff 
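To make the queue-selection comment in mlx5e_select_queue() above concrete: dev->num_real_tx_queues is num_channels * num_tc, so the fallback index can exceed num_channels, and reciprocal_scale() (a helper from include/linux/kernel.h) folds it back into [0, num_channels) without a division. The standalone restatement below is for illustration only; demo_reciprocal_scale() is a local copy of that arithmetic, not the driver's code.

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as the kernel's reciprocal_scale(): maps any u32 into [0, ep_ro). */
static uint32_t demo_reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	const uint32_t num_channels = 8;
	const uint32_t vals[] = { 9, 0x40000000u, 0x80000000u, 0xfffffffeu };
	unsigned int i;

	/* The result is proportional to val / 2^32, so it always lands on a
	 * valid channel index; small inputs simply map to channel 0. */
	for (i = 0; i < sizeof(vals) / sizeof(vals[0]); i++)
		printf("%10u -> channel %u\n", vals[i],
		       demo_reciprocal_scale(vals[i], num_channels));
	return 0;
}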
*skb) +{ + struct flow_keys keys; + + if (skb_transport_header_was_set(skb)) + return skb_transport_offset(skb); + else if (skb_flow_dissect_flow_keys(skb, &keys, 0)) + return keys.control.thoff; + else + return mlx5e_skb_l2_header_offset(skb); +} + +static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode, + struct sk_buff *skb) +{ + u16 hlen; + + switch (mode) { + case MLX5_INLINE_MODE_NONE: + return 0; + case MLX5_INLINE_MODE_TCP_UDP: + hlen = eth_get_headlen(skb->data, skb_headlen(skb)); + if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb)) + hlen += VLAN_HLEN; + break; + case MLX5_INLINE_MODE_IP: + /* When transport header is set to zero, it means no transport + * header. When transport header is set to 0xff's, it means + * transport header wasn't set. + */ + if (skb_transport_offset(skb)) { + hlen = mlx5e_skb_l3_header_offset(skb); + break; + } + /* fall through */ + case MLX5_INLINE_MODE_L2: + default: + hlen = mlx5e_skb_l2_header_offset(skb); + } + return min_t(u16, hlen, skb_headlen(skb)); +} + +static inline void mlx5e_tx_skb_pull_inline(unsigned char **skb_data, + unsigned int *skb_len, + unsigned int len) +{ + *skb_len -= len; + *skb_data += len; +} + +static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs, + unsigned char **skb_data, + unsigned int *skb_len) +{ + struct vlan_ethhdr *vhdr = (struct vlan_ethhdr *)start; + int cpy1_sz = 2 * ETH_ALEN; + int cpy2_sz = ihs - cpy1_sz; + + memcpy(vhdr, *skb_data, cpy1_sz); + mlx5e_tx_skb_pull_inline(skb_data, skb_len, cpy1_sz); + vhdr->h_vlan_proto = skb->vlan_proto; + vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb)); + memcpy(&vhdr->h_vlan_encapsulated_proto, *skb_data, cpy2_sz); + mlx5e_tx_skb_pull_inline(skb_data, skb_len, cpy2_sz); +} + +static inline void +mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) +{ + if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; + if (skb->encapsulation) { + eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM | + MLX5_ETH_WQE_L4_INNER_CSUM; + sq->stats.csum_partial_inner++; + } else { + eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM; + sq->stats.csum_partial++; + } + } else + sq->stats.csum_none++; +} + +static inline u16 +mlx5e_txwqe_build_eseg_gso(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg, unsigned int *num_bytes) +{ + u16 ihs; + + eseg->mss = cpu_to_be16(skb_shinfo(skb)->gso_size); + + if (skb->encapsulation) { + ihs = skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb); + sq->stats.tso_inner_packets++; + sq->stats.tso_inner_bytes += skb->len - ihs; + } else { + ihs = skb_transport_offset(skb) + tcp_hdrlen(skb); + sq->stats.tso_packets++; + sq->stats.tso_bytes += skb->len - ihs; + } + + *num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs; + return ihs; +} + +static inline int +mlx5e_txwqe_build_dsegs(struct mlx5e_txqsq *sq, struct sk_buff *skb, + unsigned char *skb_data, u16 headlen, + struct mlx5_wqe_data_seg *dseg) +{ + dma_addr_t dma_addr = 0; + u8 num_dma = 0; + int i; + + if (headlen) { + dma_addr = dma_map_single(sq->pdev, skb_data, headlen, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(sq->pdev, dma_addr))) + goto dma_unmap_wqe_err; + + dseg->addr = cpu_to_be64(dma_addr); + dseg->lkey = sq->mkey_be; + dseg->byte_count = cpu_to_be32(headlen); + + mlx5e_dma_push(sq, dma_addr, headlen, MLX5E_DMA_MAP_SINGLE); + num_dma++; + dseg++; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + struct skb_frag_struct *frag = 
&skb_shinfo(skb)->frags[i]; + int fsz = skb_frag_size(frag); + + dma_addr = skb_frag_dma_map(sq->pdev, frag, 0, fsz, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(sq->pdev, dma_addr))) + goto dma_unmap_wqe_err; + + dseg->addr = cpu_to_be64(dma_addr); + dseg->lkey = sq->mkey_be; + dseg->byte_count = cpu_to_be32(fsz); + + mlx5e_dma_push(sq, dma_addr, fsz, MLX5E_DMA_MAP_PAGE); + num_dma++; + dseg++; + } + + return num_dma; + +dma_unmap_wqe_err: + mlx5e_dma_unmap_wqe_err(sq, num_dma); + return -ENOMEM; +} + +static inline void +mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, + u8 opcode, u16 ds_cnt, u32 num_bytes, u8 num_dma, + struct mlx5e_tx_wqe_info *wi, struct mlx5_wqe_ctrl_seg *cseg) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi; + + wi->num_bytes = num_bytes; + wi->num_dma = num_dma; + wi->num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); + wi->skb = skb; + + cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode); + cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); + + netdev_tx_sent_queue(sq->txq, num_bytes); + + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + + sq->pc += wi->num_wqebbs; + if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, MLX5E_SQ_STOP_ROOM))) { + netif_tx_stop_queue(sq->txq); + sq->stats.stopped++; + } + + if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) + mlx5e_notify_hw(wq, sq->pc, sq->uar_map, cseg); + + /* fill sq edge with nops to avoid wqe wrap around */ + while ((pi = (sq->pc & wq->sz_m1)) > sq->edge) { + sq->db.wqe_info[pi].skb = NULL; + mlx5e_post_nop(wq, sq->sqn, &sq->pc); + sq->stats.nop++; + } +} + +static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5e_tx_wqe *wqe, u16 pi) +{ + struct mlx5e_tx_wqe_info *wi = &sq->db.wqe_info[pi]; + + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + struct mlx5_wqe_eth_seg *eseg = &wqe->eth; + + unsigned char *skb_data = skb->data; + unsigned int skb_len = skb->len; + u8 opcode = MLX5_OPCODE_SEND; + unsigned int num_bytes; + int num_dma; + u16 headlen; + u16 ds_cnt; + u16 ihs; + + mlx5e_txwqe_build_eseg_csum(sq, skb, eseg); + + if (skb_is_gso(skb)) { + opcode = MLX5_OPCODE_LSO; + ihs = mlx5e_txwqe_build_eseg_gso(sq, skb, eseg, &num_bytes); + sq->stats.packets += skb_shinfo(skb)->gso_segs; + } else { + ihs = mlx5e_calc_min_inline(sq->min_inline_mode, skb); + num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + sq->stats.packets++; + } + sq->stats.bytes += num_bytes; + sq->stats.xmit_more += skb->xmit_more; + + ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; + if (ihs) { + if (skb_vlan_tag_present(skb)) { + mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, &skb_data, &skb_len); + ihs += VLAN_HLEN; + sq->stats.added_vlan_packets++; + } else { + memcpy(eseg->inline_hdr.start, skb_data, ihs); + mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs); + } + eseg->inline_hdr.sz = cpu_to_be16(ihs); + ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start), MLX5_SEND_WQE_DS); + } else if (skb_vlan_tag_present(skb)) { + eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN); + if (skb->vlan_proto == cpu_to_be16(ETH_P_8021AD)) + eseg->insert.type |= cpu_to_be16(MLX5_ETH_WQE_SVLAN); + eseg->insert.vlan_tci = cpu_to_be16(skb_vlan_tag_get(skb)); + sq->stats.added_vlan_packets++; + } + + headlen = skb_len - skb->data_len; + num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen, + (struct mlx5_wqe_data_seg *)cseg + ds_cnt); + if (unlikely(num_dma < 0)) + goto err_drop; + + 
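A note on the stop/wake logic in mlx5e_txwqe_complete() above, which runs right after this point: sq->pc and sq->cc are free-running 16-bit counters, so occupancy is computed with wraparound-safe subtraction, and the queue is stopped once fewer than MLX5E_SQ_STOP_ROOM WQEBBs remain. The driver's actual helper is mlx5e_wqc_has_room_for() in en.h and may differ in form; the sketch below only models the underlying counter arithmetic, with DEMO_* values chosen arbitrarily.

#include <stdio.h>
#include <stdint.h>

#define DEMO_WQ_SIZE   256u	/* WQEBBs in the ring; a power of two in practice */
#define DEMO_STOP_ROOM 32u	/* arbitrary demo headroom; the real value is MLX5E_SQ_STOP_ROOM */

/* Wraparound-safe occupancy with free-running 16-bit producer/consumer counters. */
static unsigned int demo_in_flight(uint16_t pc, uint16_t cc)
{
	return (uint16_t)(pc - cc);
}

static int demo_has_room_for(uint16_t pc, uint16_t cc, unsigned int n)
{
	return demo_in_flight(pc, cc) + n <= DEMO_WQ_SIZE;
}

int main(void)
{
	uint16_t cc = 0xfff0, pc = 0x0010;	/* both counters already wrapped past 0xffff */

	printf("in flight: %u WQEBBs, keep queue running: %s\n",
	       demo_in_flight(pc, cc),
	       demo_has_room_for(pc, cc, DEMO_STOP_ROOM) ? "yes" : "no (stop the txq)");
	return 0;
}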
mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt + num_dma, + num_bytes, num_dma, wi, cseg); + + return NETDEV_TX_OK; + +err_drop: + sq->stats.dropped++; + dev_kfree_skb_any(skb); + + return NETDEV_TX_OK; +} + +netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_txqsq *sq = priv->txq2sq[skb_get_queue_mapping(skb)]; + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi = sq->pc & wq->sz_m1; + struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); + + memset(wqe, 0, sizeof(*wqe)); + +#ifdef CONFIG_MLX5_EN_IPSEC + if (sq->state & BIT(MLX5E_SQ_STATE_IPSEC)) { + skb = mlx5e_ipsec_handle_tx_skb(dev, wqe, skb); + if (unlikely(!skb)) + return NETDEV_TX_OK; + } +#endif + + return mlx5e_sq_xmit(sq, skb, wqe, pi); +} + +static void mlx5e_dump_error_cqe(struct mlx5e_txqsq *sq, + struct mlx5_err_cqe *err_cqe) +{ + u32 ci = mlx5_cqwq_get_ci(&sq->cq.wq); + + netdev_err(sq->channel->netdev, + "Error cqe on cqn 0x%x, ci 0x%x, sqn 0x%x, syndrome 0x%x, vendor syndrome 0x%x\n", + sq->cq.mcq.cqn, ci, sq->sqn, err_cqe->syndrome, + err_cqe->vendor_err_synd); + mlx5_dump_err_cqe(sq->cq.mdev, err_cqe); +} + +bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) +{ + struct mlx5e_txqsq *sq; + struct mlx5_cqe64 *cqe; + u32 dma_fifo_cc; + u32 nbytes; + u16 npkts; + u16 sqcc; + int i; + + sq = container_of(cq, struct mlx5e_txqsq, cq); + + if (unlikely(!MLX5E_TEST_BIT(sq->state, MLX5E_SQ_STATE_ENABLED))) + return false; + + cqe = mlx5_cqwq_get_cqe(&cq->wq); + if (!cqe) + return false; + + npkts = 0; + nbytes = 0; + + /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), + * otherwise a cq overrun may occur + */ + sqcc = sq->cc; + + /* avoid dirtying sq cache line every cqe */ + dma_fifo_cc = sq->dma_fifo_cc; + + i = 0; + do { + u16 wqe_counter; + bool last_wqe; + + mlx5_cqwq_pop(&cq->wq); + + wqe_counter = be16_to_cpu(cqe->wqe_counter); + + if (unlikely(cqe->op_own >> 4 == MLX5_CQE_REQ_ERR)) { + if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, + &sq->state)) { + mlx5e_dump_error_cqe(sq, + (struct mlx5_err_cqe *)cqe); + queue_work(cq->channel->priv->wq, + &sq->recover.recover_work); + } + sq->stats.cqe_err++; + } + + do { + struct mlx5e_tx_wqe_info *wi; + struct sk_buff *skb; + u16 ci; + int j; + + last_wqe = (sqcc == wqe_counter); + + ci = sqcc & sq->wq.sz_m1; + wi = &sq->db.wqe_info[ci]; + skb = wi->skb; + + if (unlikely(!skb)) { /* nop */ + sqcc++; + continue; + } + + if (unlikely(skb_shinfo(skb)->tx_flags & + SKBTX_HW_TSTAMP)) { + struct skb_shared_hwtstamps hwts = {}; + + hwts.hwtstamp = + mlx5_timecounter_cyc2time(sq->clock, + get_cqe_ts(cqe)); + skb_tstamp_tx(skb, &hwts); + } + + for (j = 0; j < wi->num_dma; j++) { + struct mlx5e_sq_dma *dma = + mlx5e_dma_get(sq, dma_fifo_cc++); + + mlx5e_tx_dma_unmap(sq->pdev, dma); + } + + npkts++; + nbytes += wi->num_bytes; + sqcc += wi->num_wqebbs; + napi_consume_skb(skb, napi_budget); + } while (!last_wqe); + + } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + + mlx5_cqwq_update_db_record(&cq->wq); + + /* ensure cq space is freed before enabling more cqes */ + wmb(); + + sq->dma_fifo_cc = dma_fifo_cc; + sq->cc = sqcc; + + netdev_tx_completed_queue(sq->txq, npkts, nbytes); + + if (netif_tx_queue_stopped(sq->txq) && + mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, + MLX5E_SQ_STOP_ROOM) && + !test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) { + netif_tx_wake_queue(sq->txq); + sq->stats.wake++; + } + + return (i == MLX5E_TX_CQ_POLL_BUDGET); +} + +void 
mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq) +{ + struct mlx5e_tx_wqe_info *wi; + struct sk_buff *skb; + u16 ci; + int i; + + while (sq->cc != sq->pc) { + ci = sq->cc & sq->wq.sz_m1; + wi = &sq->db.wqe_info[ci]; + skb = wi->skb; + + if (!skb) { /* nop */ + sq->cc++; + continue; + } + + for (i = 0; i < wi->num_dma; i++) { + struct mlx5e_sq_dma *dma = + mlx5e_dma_get(sq, sq->dma_fifo_cc++); + + mlx5e_tx_dma_unmap(sq->pdev, dma); + } + + dev_kfree_skb_any(skb); + sq->cc += wi->num_wqebbs; + } +} + +#ifdef CONFIG_MLX5_CORE_IPOIB + +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; +}; + +struct mlx5i_tx_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_datagram_seg datagram; + struct mlx5_wqe_eth_pad pad; + struct mlx5_wqe_eth_seg eth; +}; + +static inline void +mlx5i_txwqe_build_datagram(struct mlx5_av *av, u32 dqpn, u32 dqkey, + struct mlx5_wqe_datagram_seg *dseg) +{ + memcpy(&dseg->av, av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(dqpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(dqkey); +} + +netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_av *av, u32 dqpn, u32 dqkey) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi = sq->pc & wq->sz_m1; + struct mlx5i_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); + struct mlx5e_tx_wqe_info *wi = &sq->db.wqe_info[pi]; + + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + struct mlx5_wqe_datagram_seg *datagram = &wqe->datagram; + struct mlx5_wqe_eth_seg *eseg = &wqe->eth; + + unsigned char *skb_data = skb->data; + unsigned int skb_len = skb->len; + u8 opcode = MLX5_OPCODE_SEND; + unsigned int num_bytes; + int num_dma; + u16 headlen; + u16 ds_cnt; + u16 ihs; + + memset(wqe, 0, sizeof(*wqe)); + + mlx5i_txwqe_build_datagram(av, dqpn, dqkey, datagram); + + mlx5e_txwqe_build_eseg_csum(sq, skb, eseg); + + if (skb_is_gso(skb)) { + opcode = MLX5_OPCODE_LSO; + ihs = mlx5e_txwqe_build_eseg_gso(sq, skb, eseg, &num_bytes); + sq->stats.packets += skb_shinfo(skb)->gso_segs; + } else { + ihs = mlx5e_calc_min_inline(sq->min_inline_mode, skb); + num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + sq->stats.packets++; + } + + sq->stats.bytes += num_bytes; + sq->stats.xmit_more += skb->xmit_more; + + ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; + if (ihs) { + memcpy(eseg->inline_hdr.start, skb_data, ihs); + mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs); + eseg->inline_hdr.sz = cpu_to_be16(ihs); + ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start), MLX5_SEND_WQE_DS); + } + + headlen = skb_len - skb->data_len; + num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen, + (struct mlx5_wqe_data_seg *)cseg + ds_cnt); + if (unlikely(num_dma < 0)) + goto err_drop; + + mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt + num_dma, + num_bytes, num_dma, wi, cseg); + + return NETDEV_TX_OK; + +err_drop: + sq->stats.dropped++; + dev_kfree_skb_any(skb); + + return NETDEV_TX_OK; +} + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c new file mode 100644 index 0000000..f292bb3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "en.h" + +static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c) +{ + int current_cpu = smp_processor_id(); + const struct cpumask *aff; + struct irq_data *idata; + + idata = irq_desc_get_irq_data(c->irq_desc); + aff = irq_data_get_affinity_mask(idata); + return cpumask_test_cpu(current_cpu, aff); +} + +int mlx5e_napi_poll(struct napi_struct *napi, int budget) +{ + struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel, + napi); + bool busy = false; + int work_done = 0; + int i; + + for (i = 0; i < c->num_tc; i++) + busy |= mlx5e_poll_tx_cq(&c->sq[i].cq, budget); + + if (c->xdp) + busy |= mlx5e_poll_xdpsq_cq(&c->rq.xdpsq.cq); + + if (likely(budget)) { /* budget=0 means: don't poll rx rings */ + work_done = mlx5e_poll_rx_cq(&c->rq.cq, budget); + busy |= work_done == budget; + } + + busy |= c->rq.post_wqes(&c->rq); + + if (busy) { + if (likely(mlx5e_channel_no_affinity_change(c))) + return budget; + if (budget && work_done == budget) + work_done--; + } + + if (unlikely(!napi_complete_done(napi, work_done))) + return work_done; + + for (i = 0; i < c->num_tc; i++) + mlx5e_cq_arm(&c->sq[i].cq); + + if (MLX5E_TEST_BIT(c->rq.state, MLX5E_RQ_STATE_AM)) { + struct net_dim_sample dim_sample; + net_dim_sample(c->rq.cq.event_ctr, + c->rq.stats.packets, + c->rq.stats.bytes, + &dim_sample); + net_dim(&c->rq.dim, dim_sample); + } + + mlx5e_cq_arm(&c->rq.cq); + mlx5e_cq_arm(&c->icosq.cq); + + return work_done; +} + +void mlx5e_completion_event(struct mlx5_core_cq *mcq) +{ + struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq); + + cq->event_ctr++; + napi_schedule(cq->napi); +} + +void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event) +{ + struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq); + struct mlx5e_channel *c = cq->channel; + struct net_device *netdev = c->netdev; + + netdev_err(netdev, "%s: cqn=0x%.6x event=0x%.2x\n", + __func__, mcq->cqn, event); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c new file mode 100644 index 0000000..1814f80 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -0,0 +1,953 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. 
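For context on mlx5e_napi_poll() in en_txrx.c above: it follows the usual NAPI contract, returning the full budget while any ring is still busy so the core keeps polling, and otherwise calling napi_complete_done() and re-arming the CQs. The toy standalone model below shows only that contract; demo_poll_rx() and its "pending" counter are fabricated stand-ins for the completion queues.

#include <stdio.h>
#include <stdbool.h>

static int pending = 300;	/* pretend packets waiting on the fake completion source */

static int demo_poll_rx(int budget)
{
	int done = pending < budget ? pending : budget;

	pending -= done;
	return done;
}

/* Mirrors a NAPI poll callback: consume up to 'budget', return 'budget' while
 * there is more work (stay in polling mode), otherwise complete and re-arm. */
static int demo_napi_poll(int budget)
{
	int work_done = demo_poll_rx(budget);
	bool busy = (work_done == budget);

	if (busy)
		return budget;

	printf("napi_complete_done(%d); re-arm CQs\n", work_done);
	return work_done;
}

int main(void)
{
	int budget = 64, ret;

	do {
		ret = demo_napi_poll(budget);
		printf("poll returned %d, pending now %d\n", ret, pending);
	} while (ret == budget);
	return 0;
}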
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#ifdef CONFIG_RFS_ACCEL +#include +#endif +#include "mlx5_core.h" +#include "fpga/core.h" +#include "eswitch.h" + +enum { + MLX5_EQE_SIZE = sizeof(struct mlx5_eqe), + MLX5_EQE_OWNER_INIT_VAL = 0x1, +}; + +enum { + MLX5_EQ_STATE_ARMED = 0x9, + MLX5_EQ_STATE_FIRED = 0xa, + MLX5_EQ_STATE_ALWAYS_ARMED = 0xb, +}; + +enum { + MLX5_NUM_SPARE_EQE = 0x80, + MLX5_NUM_ASYNC_EQE = 0x1000, + MLX5_NUM_CMD_EQE = 32, + MLX5_NUM_PF_DRAIN = 64, +}; + +enum { + MLX5_EQ_DOORBEL_OFFSET = 0x40, +}; + +#define MLX5_ASYNC_EVENT_MASK ((1ull << MLX5_EVENT_TYPE_PATH_MIG) | \ + (1ull << MLX5_EVENT_TYPE_COMM_EST) | \ + (1ull << MLX5_EVENT_TYPE_SQ_DRAINED) | \ + (1ull << MLX5_EVENT_TYPE_CQ_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_PATH_MIG_FAILED) | \ + (1ull << MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_WQ_ACCESS_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_PORT_CHANGE) | \ + (1ull << MLX5_EVENT_TYPE_SRQ_CATAS_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_SRQ_LAST_WQE) | \ + (1ull << MLX5_EVENT_TYPE_SRQ_RQ_LIMIT)) + +struct map_eq_in { + u64 mask; + u32 reserved; + u32 unmap_eqn; +}; + +struct cre_des_eq { + u8 reserved[15]; + u8 eqn; +}; + +static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn) +{ + u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_eq_in)] = {0}; + + MLX5_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ); + MLX5_SET(destroy_eq_in, in, eq_number, eqn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry) +{ + return mlx5_buf_offset(&eq->buf, entry * MLX5_EQE_SIZE); +} + +static struct mlx5_eqe *next_eqe_sw(struct mlx5_eq *eq) +{ + struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & (eq->nent - 1)); + + return ((eqe->owner & 1) ^ !!(eq->cons_index & eq->nent)) ? 
NULL : eqe; +} + +static const char *eqe_type_str(u8 type) +{ + switch (type) { + case MLX5_EVENT_TYPE_COMP: + return "MLX5_EVENT_TYPE_COMP"; + case MLX5_EVENT_TYPE_PATH_MIG: + return "MLX5_EVENT_TYPE_PATH_MIG"; + case MLX5_EVENT_TYPE_COMM_EST: + return "MLX5_EVENT_TYPE_COMM_EST"; + case MLX5_EVENT_TYPE_SQ_DRAINED: + return "MLX5_EVENT_TYPE_SQ_DRAINED"; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + return "MLX5_EVENT_TYPE_SRQ_LAST_WQE"; + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + return "MLX5_EVENT_TYPE_SRQ_RQ_LIMIT"; + case MLX5_EVENT_TYPE_CQ_ERROR: + return "MLX5_EVENT_TYPE_CQ_ERROR"; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + return "MLX5_EVENT_TYPE_WQ_CATAS_ERROR"; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + return "MLX5_EVENT_TYPE_PATH_MIG_FAILED"; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + return "MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR"; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + return "MLX5_EVENT_TYPE_WQ_ACCESS_ERROR"; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + return "MLX5_EVENT_TYPE_SRQ_CATAS_ERROR"; + case MLX5_EVENT_TYPE_INTERNAL_ERROR: + return "MLX5_EVENT_TYPE_INTERNAL_ERROR"; + case MLX5_EVENT_TYPE_PORT_CHANGE: + return "MLX5_EVENT_TYPE_PORT_CHANGE"; + case MLX5_EVENT_TYPE_GPIO_EVENT: + return "MLX5_EVENT_TYPE_GPIO_EVENT"; + case MLX5_EVENT_TYPE_PORT_MODULE_EVENT: + return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT"; + case MLX5_EVENT_TYPE_REMOTE_CONFIG: + return "MLX5_EVENT_TYPE_REMOTE_CONFIG"; + case MLX5_EVENT_TYPE_DB_BF_CONGESTION: + return "MLX5_EVENT_TYPE_DB_BF_CONGESTION"; + case MLX5_EVENT_TYPE_STALL_EVENT: + return "MLX5_EVENT_TYPE_STALL_EVENT"; + case MLX5_EVENT_TYPE_CMD: + return "MLX5_EVENT_TYPE_CMD"; + case MLX5_EVENT_TYPE_PAGE_REQUEST: + return "MLX5_EVENT_TYPE_PAGE_REQUEST"; + case MLX5_EVENT_TYPE_PAGE_FAULT: + return "MLX5_EVENT_TYPE_PAGE_FAULT"; + case MLX5_EVENT_TYPE_PPS_EVENT: + return "MLX5_EVENT_TYPE_PPS_EVENT"; + case MLX5_EVENT_TYPE_NIC_VPORT_CHANGE: + return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE"; + case MLX5_EVENT_TYPE_FPGA_ERROR: + return "MLX5_EVENT_TYPE_FPGA_ERROR"; + case MLX5_EVENT_TYPE_GENERAL_EVENT: + return "MLX5_EVENT_TYPE_GENERAL_EVENT"; + default: + return "Unrecognized event"; + } +} + +static enum mlx5_dev_event port_subtype_event(u8 subtype) +{ + switch (subtype) { + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + return MLX5_DEV_EVENT_PORT_DOWN; + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + return MLX5_DEV_EVENT_PORT_UP; + case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED: + return MLX5_DEV_EVENT_PORT_INITIALIZED; + case MLX5_PORT_CHANGE_SUBTYPE_LID: + return MLX5_DEV_EVENT_LID_CHANGE; + case MLX5_PORT_CHANGE_SUBTYPE_PKEY: + return MLX5_DEV_EVENT_PKEY_CHANGE; + case MLX5_PORT_CHANGE_SUBTYPE_GUID: + return MLX5_DEV_EVENT_GUID_CHANGE; + case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG: + return MLX5_DEV_EVENT_CLIENT_REREG; + } + return -1; +} + +static void eq_update_ci(struct mlx5_eq *eq, int arm) +{ + __be32 __iomem *addr = eq->doorbell + (arm ? 
0 : 2); + u32 val = (eq->cons_index & 0xffffff) | (eq->eqn << 24); + + __raw_writel((__force u32)cpu_to_be32(val), addr); + /* We still want ordering, just not swabbing, so add a barrier */ + mb(); +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static void eqe_pf_action(struct work_struct *work) +{ + struct mlx5_pagefault *pfault = container_of(work, + struct mlx5_pagefault, + work); + struct mlx5_eq *eq = pfault->eq; + + mlx5_core_page_fault(eq->dev, pfault); + mempool_free(pfault, eq->pf_ctx.pool); +} + +static void eq_pf_process(struct mlx5_eq *eq) +{ + struct mlx5_core_dev *dev = eq->dev; + struct mlx5_eqe_page_fault *pf_eqe; + struct mlx5_pagefault *pfault; + struct mlx5_eqe *eqe; + int set_ci = 0; + + while ((eqe = next_eqe_sw(eq))) { + pfault = mempool_alloc(eq->pf_ctx.pool, GFP_ATOMIC); + if (!pfault) { + schedule_work(&eq->pf_ctx.work); + break; + } + + dma_rmb(); + pf_eqe = &eqe->data.page_fault; + pfault->event_subtype = eqe->sub_type; + pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed); + + mlx5_core_dbg(dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n", + eqe->sub_type, pfault->bytes_committed); + + switch (eqe->sub_type) { + case MLX5_PFAULT_SUBTYPE_RDMA: + /* RDMA based event */ + pfault->type = + be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24; + pfault->token = + be32_to_cpu(pf_eqe->rdma.pftype_token) & + MLX5_24BIT_MASK; + pfault->rdma.r_key = + be32_to_cpu(pf_eqe->rdma.r_key); + pfault->rdma.packet_size = + be16_to_cpu(pf_eqe->rdma.packet_length); + pfault->rdma.rdma_op_len = + be32_to_cpu(pf_eqe->rdma.rdma_op_len); + pfault->rdma.rdma_va = + be64_to_cpu(pf_eqe->rdma.rdma_va); + mlx5_core_dbg(dev, + "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n", + pfault->type, pfault->token, + pfault->rdma.r_key); + mlx5_core_dbg(dev, + "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n", + pfault->rdma.rdma_op_len, + pfault->rdma.rdma_va); + break; + + case MLX5_PFAULT_SUBTYPE_WQE: + /* WQE based event */ + pfault->type = + be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24; + pfault->token = + be32_to_cpu(pf_eqe->wqe.token); + pfault->wqe.wq_num = + be32_to_cpu(pf_eqe->wqe.pftype_wq) & + MLX5_24BIT_MASK; + pfault->wqe.wqe_index = + be16_to_cpu(pf_eqe->wqe.wqe_index); + pfault->wqe.packet_size = + be16_to_cpu(pf_eqe->wqe.packet_length); + mlx5_core_dbg(dev, + "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n", + pfault->type, pfault->token, + pfault->wqe.wq_num, + pfault->wqe.wqe_index); + break; + + default: + mlx5_core_warn(dev, + "Unsupported page fault event sub-type: 0x%02hhx\n", + eqe->sub_type); + /* Unsupported page faults should still be + * resolved by the page fault handler + */ + } + + pfault->eq = eq; + INIT_WORK(&pfault->work, eqe_pf_action); + queue_work(eq->pf_ctx.wq, &pfault->work); + + ++eq->cons_index; + ++set_ci; + + if (unlikely(set_ci >= MLX5_NUM_SPARE_EQE)) { + eq_update_ci(eq, 0); + set_ci = 0; + } + } + + eq_update_ci(eq, 1); +} + +static irqreturn_t mlx5_eq_pf_int(int irq, void *eq_ptr) +{ + struct mlx5_eq *eq = eq_ptr; + unsigned long flags; + + if (spin_trylock_irqsave(&eq->pf_ctx.lock, flags)) { + eq_pf_process(eq); + spin_unlock_irqrestore(&eq->pf_ctx.lock, flags); + } else { + schedule_work(&eq->pf_ctx.work); + } + + return IRQ_HANDLED; +} + +/* mempool_refill() was proposed but unfortunately wasn't accepted + * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html + * Chip workaround. 
+ */ +static void mempool_refill(mempool_t *pool) +{ + while (pool->curr_nr < pool->min_nr) + mempool_free(mempool_alloc(pool, GFP_KERNEL), pool); +} + +static void eq_pf_action(struct work_struct *work) +{ + struct mlx5_eq *eq = container_of(work, struct mlx5_eq, pf_ctx.work); + + mempool_refill(eq->pf_ctx.pool); + + spin_lock_irq(&eq->pf_ctx.lock); + eq_pf_process(eq); + spin_unlock_irq(&eq->pf_ctx.lock); +} + +static int init_pf_ctx(struct mlx5_eq_pagefault *pf_ctx, const char *name) +{ + spin_lock_init(&pf_ctx->lock); + INIT_WORK(&pf_ctx->work, eq_pf_action); + + pf_ctx->wq = alloc_ordered_workqueue(name, + WQ_MEM_RECLAIM); + if (!pf_ctx->wq) + return -ENOMEM; + + pf_ctx->pool = mempool_create_kmalloc_pool + (MLX5_NUM_PF_DRAIN, sizeof(struct mlx5_pagefault)); + if (!pf_ctx->pool) + goto err_wq; + + return 0; +err_wq: + destroy_workqueue(pf_ctx->wq); + return -ENOMEM; +} + +int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token, + u32 wq_num, u8 type, int error) +{ + u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {0}; + + MLX5_SET(page_fault_resume_in, in, opcode, + MLX5_CMD_OP_PAGE_FAULT_RESUME); + MLX5_SET(page_fault_resume_in, in, error, !!error); + MLX5_SET(page_fault_resume_in, in, page_fault_type, type); + MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); + MLX5_SET(page_fault_resume_in, in, token, token); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); +#endif + +static void general_event_handler(struct mlx5_core_dev *dev, + struct mlx5_eqe *eqe) +{ + switch (eqe->sub_type) { + case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT: + if (dev->event) + dev->event(dev, MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT, 0); + break; + default: + mlx5_core_dbg(dev, "General event with unrecognized subtype: sub_type %d\n", + eqe->sub_type); + } +} + +/* caller must eventually call mlx5_cq_put on the returned cq */ +static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn) +{ + struct mlx5_cq_table *table = &eq->cq_table; + struct mlx5_core_cq *cq = NULL; + + spin_lock(&table->lock); + cq = radix_tree_lookup(&table->tree, cqn); + if (likely(cq)) + mlx5_cq_hold(cq); + spin_unlock(&table->lock); + + return cq; +} + +static void mlx5_eq_cq_completion(struct mlx5_eq *eq, u32 cqn) +{ + struct mlx5_core_cq *cq = mlx5_eq_cq_get(eq, cqn); + + if (unlikely(!cq)) { + mlx5_core_warn(eq->dev, "Completion event for bogus CQ 0x%x\n", cqn); + return; + } + + ++cq->arm_sn; + + cq->comp(cq); + + mlx5_cq_put(cq); +} + +static void mlx5_eq_cq_event(struct mlx5_eq *eq, u32 cqn, int event_type) +{ + struct mlx5_core_cq *cq = mlx5_eq_cq_get(eq, cqn); + + if (unlikely(!cq)) { + mlx5_core_warn(eq->dev, "Async event for bogus CQ 0x%x\n", cqn); + return; + } + + cq->event(cq, event_type); + + mlx5_cq_put(cq); +} + +static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) +{ + struct mlx5_eq *eq = eq_ptr; + struct mlx5_core_dev *dev = eq->dev; + struct mlx5_eqe *eqe; + int set_ci = 0; + u32 cqn = -1; + u32 rsn; + u8 port; + + while ((eqe = next_eqe_sw(eq))) { + /* + * Make sure we read EQ entry contents after we've + * checked the ownership bit. 
+ */ + dma_rmb(); + + mlx5_core_dbg(eq->dev, "eqn %d, eqe type %s\n", + eq->eqn, eqe_type_str(eqe->type)); + switch (eqe->type) { + case MLX5_EVENT_TYPE_COMP: + cqn = be32_to_cpu(eqe->data.comp.cqn) & 0xffffff; + mlx5_eq_cq_completion(eq, cqn); + break; + case MLX5_EVENT_TYPE_DCT_DRAINED: + rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; + rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN); + mlx5_rsc_event(dev, rsn, eqe->type); + break; + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN); + mlx5_core_dbg(dev, "event %s(%d) arrived on resource 0x%x\n", + eqe_type_str(eqe->type), eqe->type, rsn); + mlx5_rsc_event(dev, rsn, eqe->type); + break; + + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + mlx5_core_dbg(dev, "SRQ event %s(%d): srqn 0x%x\n", + eqe_type_str(eqe->type), eqe->type, rsn); + mlx5_srq_event(dev, rsn, eqe->type); + break; + + case MLX5_EVENT_TYPE_CMD: + mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector), false); + break; + + case MLX5_EVENT_TYPE_PORT_CHANGE: + port = (eqe->data.port.port >> 4) & 0xf; + switch (eqe->sub_type) { + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + case MLX5_PORT_CHANGE_SUBTYPE_LID: + case MLX5_PORT_CHANGE_SUBTYPE_PKEY: + case MLX5_PORT_CHANGE_SUBTYPE_GUID: + case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG: + case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED: + if (dev->event) + dev->event(dev, port_subtype_event(eqe->sub_type), + (unsigned long)port); + break; + default: + mlx5_core_warn(dev, "Port event with unrecognized subtype: port %d, sub_type %d\n", + port, eqe->sub_type); + } + break; + case MLX5_EVENT_TYPE_CQ_ERROR: + cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff; + mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrome 0x%x\n", + cqn, eqe->data.cq_err.syndrome); + mlx5_eq_cq_event(eq, cqn, eqe->type); + break; + + case MLX5_EVENT_TYPE_PAGE_REQUEST: + { + u16 func_id = be16_to_cpu(eqe->data.req_pages.func_id); + s32 npages = be32_to_cpu(eqe->data.req_pages.num_pages); + + mlx5_core_dbg(dev, "page request for func 0x%x, npages %d\n", + func_id, npages); + mlx5_core_req_pages_handler(dev, func_id, npages); + } + break; + + case MLX5_EVENT_TYPE_NIC_VPORT_CHANGE: + mlx5_eswitch_vport_event(dev->priv.eswitch, eqe); + break; + + case MLX5_EVENT_TYPE_PORT_MODULE_EVENT: + mlx5_port_module_event(dev, eqe); + break; + + case MLX5_EVENT_TYPE_PPS_EVENT: + mlx5_pps_event(dev, eqe); + break; + + case MLX5_EVENT_TYPE_FPGA_ERROR: + mlx5_fpga_event(dev, eqe->type, &eqe->data.raw); + break; + + case MLX5_EVENT_TYPE_GENERAL_EVENT: + general_event_handler(dev, eqe); + break; + default: + mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", + eqe->type, eq->eqn); + break; + } + + ++eq->cons_index; + ++set_ci; + + /* The HCA will think the queue has overflowed if we + * don't tell it we've been processing events. We + * create our EQs with MLX5_NUM_SPARE_EQE extra + * entries, so we must update our consumer index at + * least that often. 
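To illustrate the consumer-index bookkeeping described in the comment above: next_eqe_sw() treats an EQE as new only when its owner bit matches the consumer's lap parity (!!(cons_index & nent)), and eq_update_ci() is rung at least every MLX5_NUM_SPARE_EQE events so the HCA never sees the EQ as full. The standalone sketch below models just that ownership test on a toy 8-entry ring; the DEMO_* names and helpers are illustrative, not the driver's.

#include <stdio.h>
#include <stdint.h>

#define DEMO_NENT 8u	/* EQ entries; a power of two, like eq->nent */

struct demo_eqe {
	uint8_t owner;	/* low bit = producer's lap parity; 1 matches MLX5_EQE_OWNER_INIT_VAL */
};

/* Same test as next_eqe_sw(): an entry is new only when its owner-bit parity
 * matches the consumer's lap parity, !!(cons_index & nent). */
static int demo_eqe_is_new(const struct demo_eqe *eqe, uint32_t ci)
{
	return !((eqe->owner & 1) ^ !!(ci & DEMO_NENT));
}

/* "Hardware" writes n entries, stamping each with its own lap parity. */
static void demo_produce(struct demo_eqe *ring, uint32_t *pi, unsigned int n)
{
	while (n--) {
		ring[*pi & (DEMO_NENT - 1)].owner = (*pi / DEMO_NENT) & 1;
		(*pi)++;
	}
}

/* "Software" consumes while entries test as new; a real handler also updates
 * the CI doorbell at least every MLX5_NUM_SPARE_EQE events. */
static uint32_t demo_consume(const struct demo_eqe *ring, uint32_t ci)
{
	while (demo_eqe_is_new(&ring[ci & (DEMO_NENT - 1)], ci))
		ci++;
	return ci;
}

int main(void)
{
	struct demo_eqe ring[DEMO_NENT];
	uint32_t pi = 0, ci = 0;
	unsigned int i;

	for (i = 0; i < DEMO_NENT; i++)
		ring[i].owner = 1;	/* never written by "hardware" yet */

	demo_produce(ring, &pi, 5);	/* first lap: owner bit 0 */
	ci = demo_consume(ring, ci);
	demo_produce(ring, &pi, 7);	/* wraps into the second lap: owner bit 1 */
	ci = demo_consume(ring, ci);

	printf("produced %u, consumed %u\n", pi, ci);
	return 0;
}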
+ */ + if (unlikely(set_ci >= MLX5_NUM_SPARE_EQE)) { + eq_update_ci(eq, 0); + set_ci = 0; + } + } + + eq_update_ci(eq, 1); + + if (cqn != -1) + tasklet_schedule(&eq->tasklet_ctx.task); + + return IRQ_HANDLED; +} + +/* Some architectures don't latch interrupts when they are disabled, so using + * mlx5_eq_poll_irq_disabled could end up losing interrupts while trying to + * avoid losing them. It is not recommended to use it, unless this is the last + * resort. + */ +u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq *eq) +{ + u32 count_eqe; + + disable_irq(eq->irqn); + count_eqe = eq->cons_index; + mlx5_eq_int(eq->irqn, eq); + count_eqe = eq->cons_index - count_eqe; + enable_irq(eq->irqn); + + return count_eqe; +} + +static void init_eq_buf(struct mlx5_eq *eq) +{ + struct mlx5_eqe *eqe; + int i; + + for (i = 0; i < eq->nent; i++) { + eqe = get_eqe(eq, i); + eqe->owner = MLX5_EQE_OWNER_INIT_VAL; + } +} + +int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, + int nent, u64 mask, const char *name, + enum mlx5_eq_type type) +{ + struct mlx5_cq_table *cq_table = &eq->cq_table; + u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0}; + struct mlx5_priv *priv = &dev->priv; + irq_handler_t handler; + __be64 *pas; + void *eqc; + int inlen; + u32 *in; + int err; + + /* Init CQ table */ + memset(cq_table, 0, sizeof(*cq_table)); + spin_lock_init(&cq_table->lock); + INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC); + + eq->type = type; + eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE); + eq->cons_index = 0; + err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf); + if (err) + return err; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (type == MLX5_EQ_TYPE_PF) + handler = mlx5_eq_pf_int; + else +#endif + handler = mlx5_eq_int; + + init_eq_buf(eq); + + inlen = MLX5_ST_SZ_BYTES(create_eq_in) + + MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->buf.npages; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_buf; + } + + pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas); + mlx5_fill_page_array(&eq->buf, pas); + + MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ); + MLX5_SET64(create_eq_in, in, event_bitmask, mask); + + eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry); + MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent)); + MLX5_SET(eqc, eqc, uar_page, priv->uar->index); + MLX5_SET(eqc, eqc, intr, vecidx); + MLX5_SET(eqc, eqc, log_page_size, + eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (err) + goto err_in; + + snprintf(priv->irq_info[vecidx].name, MLX5_MAX_IRQ_NAME, "%s@pci:%s", + name, pci_name(dev->pdev)); + + eq->eqn = MLX5_GET(create_eq_out, out, eq_number); + eq->irqn = pci_irq_vector(dev->pdev, vecidx); + eq->dev = dev; + eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET; + err = request_irq(eq->irqn, handler, 0, + priv->irq_info[vecidx].name, eq); + if (err) + goto err_eq; + + err = mlx5_debug_eq_add(dev, eq); + if (err) + goto err_irq; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (type == MLX5_EQ_TYPE_PF) { + err = init_pf_ctx(&eq->pf_ctx, name); + if (err) + goto err_irq; + } else +#endif + { + INIT_LIST_HEAD(&eq->tasklet_ctx.list); + INIT_LIST_HEAD(&eq->tasklet_ctx.process_list); + spin_lock_init(&eq->tasklet_ctx.lock); + tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb, + (unsigned long)&eq->tasklet_ctx); + } + + /* EQs are created in ARMED state + */ + eq_update_ci(eq, 1); + + kvfree(in); + return 0; + +err_irq: + free_irq(eq->irqn, eq); + +err_eq: + 
mlx5_cmd_destroy_eq(dev, eq->eqn); + +err_in: + kvfree(in); + +err_buf: + mlx5_buf_free(dev, &eq->buf); + return err; +} + +int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + int err; + + mlx5_debug_eq_remove(dev, eq); + free_irq(eq->irqn, eq); + err = mlx5_cmd_destroy_eq(dev, eq->eqn); + if (err) + mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n", + eq->eqn); + synchronize_irq(eq->irqn); + + if (eq->type == MLX5_EQ_TYPE_COMP) { + tasklet_disable(&eq->tasklet_ctx.task); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + } else if (eq->type == MLX5_EQ_TYPE_PF) { + cancel_work_sync(&eq->pf_ctx.work); + destroy_workqueue(eq->pf_ctx.wq); + mempool_destroy(eq->pf_ctx.pool); +#endif + } + mlx5_buf_free(dev, &eq->buf); + + return err; +} + +int mlx5_eq_add_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq) +{ + struct mlx5_cq_table *table = &eq->cq_table; + int err; + + spin_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, cq->cqn, cq); + spin_unlock_irq(&table->lock); + + return err; +} + +int mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq) +{ + struct mlx5_cq_table *table = &eq->cq_table; + struct mlx5_core_cq *tmp; + + spin_lock_irq(&table->lock); + tmp = radix_tree_delete(&table->tree, cq->cqn); + spin_unlock_irq(&table->lock); + + if (!tmp) { + mlx5_core_warn(eq->dev, "cq 0x%x not found in eq 0x%x tree\n", eq->eqn, cq->cqn); + return -ENOENT; + } + + if (tmp != cq) { + mlx5_core_warn(eq->dev, "corruption on cqn 0x%x in eq 0x%x\n", eq->eqn, cq->cqn); + return -EINVAL; + } + + return 0; +} + +int mlx5_eq_init(struct mlx5_core_dev *dev) +{ + int err; + + spin_lock_init(&dev->priv.eq_table.lock); + + err = mlx5_eq_debugfs_init(dev); + + return err; +} + +void mlx5_eq_cleanup(struct mlx5_core_dev *dev) +{ + mlx5_eq_debugfs_cleanup(dev); +} + +int mlx5_start_eqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + u64 async_event_mask = MLX5_ASYNC_EVENT_MASK; + int err; + + if (MLX5_VPORT_MANAGER(dev)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_NIC_VPORT_CHANGE); + + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH && + MLX5_CAP_GEN(dev, general_notification_event)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_GENERAL_EVENT); + + if (MLX5_CAP_GEN(dev, port_module_event)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_PORT_MODULE_EVENT); + else + mlx5_core_dbg(dev, "port_module_event is not set\n"); + + if (MLX5_PPS_CAP(dev)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_PPS_EVENT); + + if (MLX5_CAP_GEN(dev, fpga)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR); + if (MLX5_CAP_GEN_MAX(dev, dct)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED); + + + err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, + MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, + "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC); + if (err) { + mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err); + return err; + } + + mlx5_cmd_use_events(dev); + + err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC, + MLX5_NUM_ASYNC_EQE, async_event_mask, + "mlx5_async_eq", MLX5_EQ_TYPE_ASYNC); + if (err) { + mlx5_core_warn(dev, "failed to create async EQ %d\n", err); + goto err1; + } + + err = mlx5_create_map_eq(dev, &table->pages_eq, + MLX5_EQ_VEC_PAGES, + /* TODO: sriov max_vf + */ 1, + 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq", + MLX5_EQ_TYPE_ASYNC); + if (err) { + mlx5_core_warn(dev, "failed to create pages EQ %d\n", err); + goto err2; + } + +#ifdef 
CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (MLX5_CAP_GEN(dev, pg)) { + err = mlx5_create_map_eq(dev, &table->pfault_eq, + MLX5_EQ_VEC_PFAULT, + MLX5_NUM_ASYNC_EQE, + 1 << MLX5_EVENT_TYPE_PAGE_FAULT, + "mlx5_page_fault_eq", + MLX5_EQ_TYPE_PF); + if (err) { + mlx5_core_warn(dev, "failed to create page fault EQ %d\n", + err); + goto err3; + } + } + + return err; +err3: + mlx5_destroy_unmap_eq(dev, &table->pages_eq); +#else + return err; +#endif + +err2: + mlx5_destroy_unmap_eq(dev, &table->async_eq); + +err1: + mlx5_cmd_use_polling(dev); + mlx5_destroy_unmap_eq(dev, &table->cmd_eq); + return err; +} + +void mlx5_stop_eqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + int err; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (MLX5_CAP_GEN(dev, pg)) { + err = mlx5_destroy_unmap_eq(dev, &table->pfault_eq); + if (err) + mlx5_core_err(dev, "failed to destroy page fault eq, err(%d)\n", + err); + } +#endif + + err = mlx5_destroy_unmap_eq(dev, &table->pages_eq); + if (err) + mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n", + err); + + err = mlx5_destroy_unmap_eq(dev, &table->async_eq); + if (err) + mlx5_core_err(dev, "failed to destroy async eq, err(%d)\n", + err); + mlx5_cmd_use_polling(dev); + + err = mlx5_destroy_unmap_eq(dev, &table->cmd_eq); + if (err) + mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n", + err); +} + +int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_eq_in)] = {0}; + + MLX5_SET(query_eq_in, in, opcode, MLX5_CMD_OP_QUERY_EQ); + MLX5_SET(query_eq_in, in, eq_number, eq->eqn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} + +/* This function should only be called after mlx5_cmd_force_teardown_hca */ +void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + struct mlx5_eq *eq; + +#ifdef CONFIG_RFS_ACCEL + if (dev->rmap) { + free_irq_cpu_rmap(dev->rmap); + dev->rmap = NULL; + } +#endif + list_for_each_entry(eq, &table->comp_eqs_list, list) + free_irq(eq->irqn, eq); + + free_irq(table->pages_eq.irqn, &table->pages_eq); + free_irq(table->async_eq.irqn, &table->async_eq); + free_irq(table->cmd_eq.irqn, &table->cmd_eq); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (MLX5_CAP_GEN(dev, pg)) + free_irq(table->pfault_eq.irqn, &table->pfault_eq); +#endif + pci_free_irq_vectors(dev->pdev); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c new file mode 100644 index 0000000..1352d13 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -0,0 +1,2226 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "eswitch.h" +#include "fs_core.h" + +#define UPLINK_VPORT 0xFFFF + +enum { + MLX5_ACTION_NONE = 0, + MLX5_ACTION_ADD = 1, + MLX5_ACTION_DEL = 2, +}; + +/* Vport UC/MC hash node */ +struct vport_addr { + struct l2addr_node node; + u8 action; + u32 vport; + struct mlx5_flow_handle *flow_rule; + bool mpfs; /* UC MAC was added to MPFs */ + /* A flag indicating that mac was added due to mc promiscuous vport */ + bool mc_promisc; +}; + +enum { + UC_ADDR_CHANGE = BIT(0), + MC_ADDR_CHANGE = BIT(1), + PROMISC_CHANGE = BIT(3), +}; + +/* Vport context events */ +#define SRIOV_VPORT_EVENTS (UC_ADDR_CHANGE | \ + MC_ADDR_CHANGE | \ + PROMISC_CHANGE) + +static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport, + u32 events_mask) +{ + int in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {0}; + int out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {0}; + void *nic_vport_ctx; + + MLX5_SET(modify_nic_vport_context_in, in, + opcode, MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + MLX5_SET(modify_nic_vport_context_in, in, field_select.change_event, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + if (vport) + MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1); + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + + MLX5_SET(nic_vport_context, nic_vport_ctx, arm_change_event, 1); + + if (events_mask & UC_ADDR_CHANGE) + MLX5_SET(nic_vport_context, nic_vport_ctx, + event_on_uc_address_change, 1); + if (events_mask & MC_ADDR_CHANGE) + MLX5_SET(nic_vport_context, nic_vport_ctx, + event_on_mc_address_change, 1); + if (events_mask & PROMISC_CHANGE) + MLX5_SET(nic_vport_context, nic_vport_ctx, + event_on_promisc_change, 1); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +/* E-Switch vport context HW commands */ +static int modify_esw_vport_context_cmd(struct mlx5_core_dev *dev, u16 vport, + void *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_esw_vport_context_out)] = {0}; + + MLX5_SET(modify_esw_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT); + MLX5_SET(modify_esw_vport_context_in, in, vport_number, vport); + if (vport) + MLX5_SET(modify_esw_vport_context_in, in, other_vport, 1); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} + +static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u32 vport, + u16 vlan, u8 qos, u8 set_flags) +{ + u32 in[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {0}; + + if (!MLX5_CAP_ESW(dev, vport_cvlan_strip) || + !MLX5_CAP_ESW(dev, vport_cvlan_insert_if_not_exist)) + return -EOPNOTSUPP; + + esw_debug(dev, "Set Vport[%d] VLAN %d qos %d set=%x\n", + vport, vlan, qos, set_flags); + + if (set_flags & SET_VLAN_STRIP) + MLX5_SET(modify_esw_vport_context_in, 
in, + esw_vport_context.vport_cvlan_strip, 1); + + if (set_flags & SET_VLAN_INSERT) { + /* insert only if no vlan in packet */ + MLX5_SET(modify_esw_vport_context_in, in, + esw_vport_context.vport_cvlan_insert, 1); + + MLX5_SET(modify_esw_vport_context_in, in, + esw_vport_context.cvlan_pcp, qos); + MLX5_SET(modify_esw_vport_context_in, in, + esw_vport_context.cvlan_id, vlan); + } + + MLX5_SET(modify_esw_vport_context_in, in, + field_select.vport_cvlan_strip, 1); + MLX5_SET(modify_esw_vport_context_in, in, + field_select.vport_cvlan_insert, 1); + + return modify_esw_vport_context_cmd(dev, vport, in, sizeof(in)); +} + +/* E-Switch FDB */ +static struct mlx5_flow_handle * +__esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule, + u8 mac_c[ETH_ALEN], u8 mac_v[ETH_ALEN]) +{ + int match_header = (is_zero_ether_addr(mac_c) ? 0 : + MLX5_MATCH_OUTER_HEADERS); + struct mlx5_flow_handle *flow_rule = NULL; + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_spec *spec; + void *mv_misc = NULL; + void *mc_misc = NULL; + u8 *dmac_v = NULL; + u8 *dmac_c = NULL; + + if (rx_rule) + match_header |= MLX5_MATCH_MISC_PARAMETERS; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return NULL; + + dmac_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dmac_47_16); + dmac_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.dmac_47_16); + + if (match_header & MLX5_MATCH_OUTER_HEADERS) { + ether_addr_copy(dmac_v, mac_v); + ether_addr_copy(dmac_c, mac_c); + } + + if (match_header & MLX5_MATCH_MISC_PARAMETERS) { + mv_misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + mc_misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + MLX5_SET(fte_match_set_misc, mv_misc, source_port, UPLINK_VPORT); + MLX5_SET_TO_ONES(fte_match_set_misc, mc_misc, source_port); + } + + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest.vport_num = vport; + + esw_debug(esw->dev, + "\tFDB add rule dmac_v(%pM) dmac_c(%pM) -> vport(%d)\n", + dmac_v, dmac_c, vport); + spec->match_criteria_enable = match_header; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_rule = + mlx5_add_flow_rules(esw->fdb_table.fdb, spec, + &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + esw_warn(esw->dev, + "FDB: Failed to add flow rule: dmac_v(%pM) dmac_c(%pM) -> vport(%d), err(%ld)\n", + dmac_v, dmac_c, vport, PTR_ERR(flow_rule)); + flow_rule = NULL; + } + + kvfree(spec); + return flow_rule; +} + +static struct mlx5_flow_handle * +esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u8 mac[ETH_ALEN], u32 vport) +{ + u8 mac_c[ETH_ALEN]; + + eth_broadcast_addr(mac_c); + return __esw_fdb_set_vport_rule(esw, vport, false, mac_c, mac); +} + +static struct mlx5_flow_handle * +esw_fdb_set_vport_allmulti_rule(struct mlx5_eswitch *esw, u32 vport) +{ + u8 mac_c[ETH_ALEN]; + u8 mac_v[ETH_ALEN]; + + eth_zero_addr(mac_c); + eth_zero_addr(mac_v); + mac_c[0] = 0x01; + mac_v[0] = 0x01; + return __esw_fdb_set_vport_rule(esw, vport, false, mac_c, mac_v); +} + +static struct mlx5_flow_handle * +esw_fdb_set_vport_promisc_rule(struct mlx5_eswitch *esw, u32 vport) +{ + u8 mac_c[ETH_ALEN]; + u8 mac_v[ETH_ALEN]; + + eth_zero_addr(mac_c); + eth_zero_addr(mac_v); + return __esw_fdb_set_vport_rule(esw, vport, true, mac_c, mac_v); +} + +static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw, int nvports) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; + 
struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *fdb; + struct mlx5_flow_group *g; + void *match_criteria; + int table_size; + u32 *flow_group_in; + u8 *dmac; + int err = 0; + + esw_debug(dev, "Create FDB log_max_size(%d)\n", + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); + + root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!root_ns) { + esw_warn(dev, "Failed to get FDB flow namespace\n"); + return -EOPNOTSUPP; + } + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + table_size = BIT(MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); + + ft_attr.max_fte = table_size; + fdb = mlx5_create_flow_table(root_ns, &ft_attr); + if (IS_ERR(fdb)) { + err = PTR_ERR(fdb); + esw_warn(dev, "Failed to create FDB Table err %d\n", err); + goto out; + } + esw->fdb_table.fdb = fdb; + + /* Addresses group : Full match unicast/multicast addresses */ + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + dmac = MLX5_ADDR_OF(fte_match_param, match_criteria, outer_headers.dmac_47_16); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + /* Preserve 2 entries for allmulti and promisc rules*/ + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 3); + eth_broadcast_addr(dmac); + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create flow group err(%d)\n", err); + goto out; + } + esw->fdb_table.legacy.addr_grp = g; + + /* Allmulti group : One rule that forwards any mcast traffic */ + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, table_size - 2); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 2); + eth_zero_addr(dmac); + dmac[0] = 0x01; + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create allmulti flow group err(%d)\n", err); + goto out; + } + esw->fdb_table.legacy.allmulti_grp = g; + + /* Promiscuous group : + * One rule that forward all unmatched traffic from previous groups + */ + eth_zero_addr(dmac); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_port); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, table_size - 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 1); + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create promisc flow group err(%d)\n", err); + goto out; + } + esw->fdb_table.legacy.promisc_grp = g; + +out: + if (err) { + if (!IS_ERR_OR_NULL(esw->fdb_table.legacy.allmulti_grp)) { + mlx5_destroy_flow_group(esw->fdb_table.legacy.allmulti_grp); + esw->fdb_table.legacy.allmulti_grp = NULL; + } + if (!IS_ERR_OR_NULL(esw->fdb_table.legacy.addr_grp)) { + mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp); + esw->fdb_table.legacy.addr_grp = NULL; + } + if (!IS_ERR_OR_NULL(esw->fdb_table.fdb)) { + mlx5_destroy_flow_table(esw->fdb_table.fdb); + esw->fdb_table.fdb = NULL; + } + } + + kvfree(flow_group_in); + return err; +} + +static void esw_destroy_legacy_fdb_table(struct mlx5_eswitch *esw) +{ 
+ if (!esw->fdb_table.fdb) + return; + + esw_debug(esw->dev, "Destroy FDB Table\n"); + mlx5_destroy_flow_group(esw->fdb_table.legacy.promisc_grp); + mlx5_destroy_flow_group(esw->fdb_table.legacy.allmulti_grp); + mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp); + mlx5_destroy_flow_table(esw->fdb_table.fdb); + esw->fdb_table.fdb = NULL; + esw->fdb_table.legacy.addr_grp = NULL; + esw->fdb_table.legacy.allmulti_grp = NULL; + esw->fdb_table.legacy.promisc_grp = NULL; +} + +/* E-Switch vport UC/MC lists management */ +typedef int (*vport_addr_action)(struct mlx5_eswitch *esw, + struct vport_addr *vaddr); + +static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + u8 *mac = vaddr->node.addr; + u32 vport = vaddr->vport; + int err; + + /* Skip mlx5_mpfs_add_mac for PFs, + * it is already done by the PF netdev in mlx5e_execute_l2_action + */ + if (!vport) + goto fdb_add; + + err = mlx5_mpfs_add_mac(esw->dev, mac); + if (err) { + esw_warn(esw->dev, + "Failed to add L2 table mac(%pM) for vport(%d), err(%d)\n", + mac, vport, err); + return err; + } + vaddr->mpfs = true; + +fdb_add: + /* SRIOV is enabled: Forward UC MAC to vport */ + if (esw->fdb_table.fdb && esw->mode == SRIOV_LEGACY) + vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport); + + esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM fr(%p)\n", + vport, mac, vaddr->flow_rule); + + return 0; +} + +static int esw_del_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + u8 *mac = vaddr->node.addr; + u32 vport = vaddr->vport; + int err = 0; + + /* Skip mlx5_mpfs_del_mac for PFs, + * it is already done by the PF netdev in mlx5e_execute_l2_action + */ + if (!vport || !vaddr->mpfs) + goto fdb_del; + + err = mlx5_mpfs_del_mac(esw->dev, mac); + if (err) + esw_warn(esw->dev, + "Failed to del L2 table mac(%pM) for vport(%d), err(%d)\n", + mac, vport, err); + vaddr->mpfs = false; + +fdb_del: + if (vaddr->flow_rule) + mlx5_del_flow_rules(vaddr->flow_rule); + vaddr->flow_rule = NULL; + + return 0; +} + +static void update_allmulti_vports(struct mlx5_eswitch *esw, + struct vport_addr *vaddr, + struct esw_mc_addr *esw_mc) +{ + u8 *mac = vaddr->node.addr; + u32 vport_idx = 0; + + for (vport_idx = 0; vport_idx < esw->total_vports; vport_idx++) { + struct mlx5_vport *vport = &esw->vports[vport_idx]; + struct hlist_head *vport_hash = vport->mc_list; + struct vport_addr *iter_vaddr = + l2addr_hash_find(vport_hash, + mac, + struct vport_addr); + if (IS_ERR_OR_NULL(vport->allmulti_rule) || + vaddr->vport == vport_idx) + continue; + switch (vaddr->action) { + case MLX5_ACTION_ADD: + if (iter_vaddr) + continue; + iter_vaddr = l2addr_hash_add(vport_hash, mac, + struct vport_addr, + GFP_KERNEL); + if (!iter_vaddr) { + esw_warn(esw->dev, + "ALL-MULTI: Failed to add MAC(%pM) to vport[%d] DB\n", + mac, vport_idx); + continue; + } + iter_vaddr->vport = vport_idx; + iter_vaddr->flow_rule = + esw_fdb_set_vport_rule(esw, + mac, + vport_idx); + iter_vaddr->mc_promisc = true; + break; + case MLX5_ACTION_DEL: + if (!iter_vaddr) + continue; + mlx5_del_flow_rules(iter_vaddr->flow_rule); + l2addr_hash_del(iter_vaddr); + break; + } + } +} + +static int esw_add_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + struct hlist_head *hash = esw->mc_table; + struct esw_mc_addr *esw_mc; + u8 *mac = vaddr->node.addr; + u32 vport = vaddr->vport; + + if (!esw->fdb_table.fdb) + return 0; + + esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr); + if (esw_mc) + goto add; + + esw_mc = l2addr_hash_add(hash, mac, struct 
esw_mc_addr, GFP_KERNEL); + if (!esw_mc) + return -ENOMEM; + + esw_mc->uplink_rule = /* Forward MC MAC to Uplink */ + esw_fdb_set_vport_rule(esw, mac, UPLINK_VPORT); + + /* Add this multicast mac to all the mc promiscuous vports */ + update_allmulti_vports(esw, vaddr, esw_mc); + +add: + /* If the multicast mac is added as a result of mc promiscuous vport, + * don't increment the multicast ref count + */ + if (!vaddr->mc_promisc) + esw_mc->refcnt++; + + /* Forward MC MAC to vport */ + vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport); + esw_debug(esw->dev, + "\tADDED MC MAC: vport[%d] %pM fr(%p) refcnt(%d) uplinkfr(%p)\n", + vport, mac, vaddr->flow_rule, + esw_mc->refcnt, esw_mc->uplink_rule); + return 0; +} + +static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + struct hlist_head *hash = esw->mc_table; + struct esw_mc_addr *esw_mc; + u8 *mac = vaddr->node.addr; + u32 vport = vaddr->vport; + + if (!esw->fdb_table.fdb) + return 0; + + esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr); + if (!esw_mc) { + esw_warn(esw->dev, + "Failed to find eswitch MC addr for MAC(%pM) vport(%d)", + mac, vport); + return -EINVAL; + } + esw_debug(esw->dev, + "\tDELETE MC MAC: vport[%d] %pM fr(%p) refcnt(%d) uplinkfr(%p)\n", + vport, mac, vaddr->flow_rule, esw_mc->refcnt, + esw_mc->uplink_rule); + + if (vaddr->flow_rule) + mlx5_del_flow_rules(vaddr->flow_rule); + vaddr->flow_rule = NULL; + + /* If the multicast mac is added as a result of mc promiscuous vport, + * don't decrement the multicast ref count. + */ + if (vaddr->mc_promisc || (--esw_mc->refcnt > 0)) + return 0; + + /* Remove this multicast mac from all the mc promiscuous vports */ + update_allmulti_vports(esw, vaddr, esw_mc); + + if (esw_mc->uplink_rule) + mlx5_del_flow_rules(esw_mc->uplink_rule); + + l2addr_hash_del(esw_mc); + return 0; +} + +/* Apply vport UC/MC list to HW l2 table and FDB table */ +static void esw_apply_vport_addr_list(struct mlx5_eswitch *esw, + u32 vport_num, int list_type) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + bool is_uc = list_type == MLX5_NVPRT_LIST_TYPE_UC; + vport_addr_action vport_addr_add; + vport_addr_action vport_addr_del; + struct vport_addr *addr; + struct l2addr_node *node; + struct hlist_head *hash; + struct hlist_node *tmp; + int hi; + + vport_addr_add = is_uc ? esw_add_uc_addr : + esw_add_mc_addr; + vport_addr_del = is_uc ? esw_del_uc_addr : + esw_del_mc_addr; + + hash = is_uc ? vport->uc_list : vport->mc_list; + for_each_l2hash_node(node, tmp, hash, hi) { + addr = container_of(node, struct vport_addr, node); + switch (addr->action) { + case MLX5_ACTION_ADD: + vport_addr_add(esw, addr); + addr->action = MLX5_ACTION_NONE; + break; + case MLX5_ACTION_DEL: + vport_addr_del(esw, addr); + l2addr_hash_del(addr); + break; + } + } +} + +/* Sync vport UC/MC list from vport context */ +static void esw_update_vport_addr_list(struct mlx5_eswitch *esw, + u32 vport_num, int list_type) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + bool is_uc = list_type == MLX5_NVPRT_LIST_TYPE_UC; + u8 (*mac_list)[ETH_ALEN]; + struct l2addr_node *node; + struct vport_addr *addr; + struct hlist_head *hash; + struct hlist_node *tmp; + int size; + int err; + int hi; + int i; + + size = is_uc ? MLX5_MAX_UC_PER_VPORT(esw->dev) : + MLX5_MAX_MC_PER_VPORT(esw->dev); + + mac_list = kcalloc(size, ETH_ALEN, GFP_KERNEL); + if (!mac_list) + return; + + hash = is_uc ? 
vport->uc_list : vport->mc_list; + + for_each_l2hash_node(node, tmp, hash, hi) { + addr = container_of(node, struct vport_addr, node); + addr->action = MLX5_ACTION_DEL; + } + + if (!vport->enabled) + goto out; + + err = mlx5_query_nic_vport_mac_list(esw->dev, vport_num, list_type, + mac_list, &size); + if (err) + goto out; + esw_debug(esw->dev, "vport[%d] context update %s list size (%d)\n", + vport_num, is_uc ? "UC" : "MC", size); + + for (i = 0; i < size; i++) { + if (is_uc && !is_valid_ether_addr(mac_list[i])) + continue; + + if (!is_uc && !is_multicast_ether_addr(mac_list[i])) + continue; + + addr = l2addr_hash_find(hash, mac_list[i], struct vport_addr); + if (addr) { + addr->action = MLX5_ACTION_NONE; + /* If this mac was previously added because of allmulti + * promiscuous rx mode, its now converted to be original + * vport mac. + */ + if (addr->mc_promisc) { + struct esw_mc_addr *esw_mc = + l2addr_hash_find(esw->mc_table, + mac_list[i], + struct esw_mc_addr); + if (!esw_mc) { + esw_warn(esw->dev, + "Failed to MAC(%pM) in mcast DB\n", + mac_list[i]); + continue; + } + esw_mc->refcnt++; + addr->mc_promisc = false; + } + continue; + } + + addr = l2addr_hash_add(hash, mac_list[i], struct vport_addr, + GFP_KERNEL); + if (!addr) { + esw_warn(esw->dev, + "Failed to add MAC(%pM) to vport[%d] DB\n", + mac_list[i], vport_num); + continue; + } + addr->vport = vport_num; + addr->action = MLX5_ACTION_ADD; + } +out: + kfree(mac_list); +} + +/* Sync vport UC/MC list from vport context + * Must be called after esw_update_vport_addr_list + */ +static void esw_update_vport_mc_promisc(struct mlx5_eswitch *esw, u32 vport_num) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + struct l2addr_node *node; + struct vport_addr *addr; + struct hlist_head *hash; + struct hlist_node *tmp; + int hi; + + hash = vport->mc_list; + + for_each_l2hash_node(node, tmp, esw->mc_table, hi) { + u8 *mac = node->addr; + + addr = l2addr_hash_find(hash, mac, struct vport_addr); + if (addr) { + if (addr->action == MLX5_ACTION_DEL) + addr->action = MLX5_ACTION_NONE; + continue; + } + addr = l2addr_hash_add(hash, mac, struct vport_addr, + GFP_KERNEL); + if (!addr) { + esw_warn(esw->dev, + "Failed to add allmulti MAC(%pM) to vport[%d] DB\n", + mac, vport_num); + continue; + } + addr->vport = vport_num; + addr->action = MLX5_ACTION_ADD; + addr->mc_promisc = true; + } +} + +/* Apply vport rx mode to HW FDB table */ +static void esw_apply_vport_rx_mode(struct mlx5_eswitch *esw, u32 vport_num, + bool promisc, bool mc_promisc) +{ + struct esw_mc_addr *allmulti_addr = &esw->mc_promisc; + struct mlx5_vport *vport = &esw->vports[vport_num]; + + if (IS_ERR_OR_NULL(vport->allmulti_rule) != mc_promisc) + goto promisc; + + if (mc_promisc) { + vport->allmulti_rule = + esw_fdb_set_vport_allmulti_rule(esw, vport_num); + if (!allmulti_addr->uplink_rule) + allmulti_addr->uplink_rule = + esw_fdb_set_vport_allmulti_rule(esw, + UPLINK_VPORT); + allmulti_addr->refcnt++; + } else if (vport->allmulti_rule) { + mlx5_del_flow_rules(vport->allmulti_rule); + vport->allmulti_rule = NULL; + + if (--allmulti_addr->refcnt > 0) + goto promisc; + + if (allmulti_addr->uplink_rule) + mlx5_del_flow_rules(allmulti_addr->uplink_rule); + allmulti_addr->uplink_rule = NULL; + } + +promisc: + if (IS_ERR_OR_NULL(vport->promisc_rule) != promisc) + return; + + if (promisc) { + vport->promisc_rule = esw_fdb_set_vport_promisc_rule(esw, + vport_num); + } else if (vport->promisc_rule) { + mlx5_del_flow_rules(vport->promisc_rule); + vport->promisc_rule = NULL; + } +} + 
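The vport address handling above (esw_update_vport_addr_list() followed by esw_apply_vport_addr_list()) is a mark-and-sweep reconciliation: every cached entry is first marked MLX5_ACTION_DEL, entries still reported by the vport context are flipped back to MLX5_ACTION_NONE (or created as MLX5_ACTION_ADD), and the apply pass then installs or removes FDB rules accordingly. The standalone C sketch below, which is not part of the patch, illustrates the same pattern with plain arrays and printf stand-ins instead of the driver's l2addr hash helpers and flow rules; all names and data structures in it are illustrative only.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

enum action { ACT_NONE, ACT_ADD, ACT_DEL };

struct cache_entry {
        char mac[18];           /* MAC as text, e.g. "aa:bb:cc:dd:ee:ff" */
        enum action action;
        bool in_use;
};

#define CACHE_SIZE 8

/* Reconcile the cached list against the list currently reported by "firmware". */
static void reconcile(struct cache_entry cache[CACHE_SIZE],
                      const char *fw_list[], int fw_count)
{
        int i, j;

        /* Mark phase: assume every cached address disappeared. */
        for (i = 0; i < CACHE_SIZE; i++)
                if (cache[i].in_use)
                        cache[i].action = ACT_DEL;

        /* Addresses still present keep their rules; new ones are scheduled for ADD. */
        for (j = 0; j < fw_count; j++) {
                bool found = false;

                for (i = 0; i < CACHE_SIZE; i++) {
                        if (cache[i].in_use && !strcmp(cache[i].mac, fw_list[j])) {
                                cache[i].action = ACT_NONE;
                                found = true;
                                break;
                        }
                }
                if (found)
                        continue;
                for (i = 0; i < CACHE_SIZE; i++) {
                        if (!cache[i].in_use) {
                                snprintf(cache[i].mac, sizeof(cache[i].mac), "%s", fw_list[j]);
                                cache[i].action = ACT_ADD;
                                cache[i].in_use = true;
                                break;
                        }
                }
        }

        /* Apply/sweep phase: install new rules, remove stale ones. */
        for (i = 0; i < CACHE_SIZE; i++) {
                if (!cache[i].in_use)
                        continue;
                if (cache[i].action == ACT_ADD) {
                        printf("install FDB rule for %s\n", cache[i].mac);
                } else if (cache[i].action == ACT_DEL) {
                        printf("remove FDB rule for %s\n", cache[i].mac);
                        cache[i].in_use = false;
                }
                cache[i].action = ACT_NONE;
        }
}

int main(void)
{
        struct cache_entry cache[CACHE_SIZE] = { 0 };
        const char *round1[] = { "aa:aa:aa:aa:aa:01", "aa:aa:aa:aa:aa:02" };
        const char *round2[] = { "aa:aa:aa:aa:aa:02" };

        reconcile(cache, round1, 2);    /* installs two rules */
        reconcile(cache, round2, 1);    /* removes the rule for ...:01 */
        return 0;
}

The same two-pass structure is what lets the driver handle an arbitrary mix of added and removed addresses in a single vport context change event without tracking explicit diffs.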
+/* Sync vport rx mode from vport context */ +static void esw_update_vport_rx_mode(struct mlx5_eswitch *esw, u32 vport_num) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + int promisc_all = 0; + int promisc_uc = 0; + int promisc_mc = 0; + int err; + + err = mlx5_query_nic_vport_promisc(esw->dev, + vport_num, + &promisc_uc, + &promisc_mc, + &promisc_all); + if (err) + return; + esw_debug(esw->dev, "vport[%d] context update rx mode promisc_all=%d, all_multi=%d\n", + vport_num, promisc_all, promisc_mc); + + if (!vport->info.trusted || !vport->enabled) { + promisc_uc = 0; + promisc_mc = 0; + promisc_all = 0; + } + + esw_apply_vport_rx_mode(esw, vport_num, promisc_all, + (promisc_all || promisc_mc)); +} + +static void esw_vport_change_handle_locked(struct mlx5_vport *vport) +{ + struct mlx5_core_dev *dev = vport->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + u8 mac[ETH_ALEN]; + + mlx5_query_nic_vport_mac_address(dev, vport->vport, mac); + esw_debug(dev, "vport[%d] Context Changed: perm mac: %pM\n", + vport->vport, mac); + + if (vport->enabled_events & UC_ADDR_CHANGE) { + esw_update_vport_addr_list(esw, vport->vport, + MLX5_NVPRT_LIST_TYPE_UC); + esw_apply_vport_addr_list(esw, vport->vport, + MLX5_NVPRT_LIST_TYPE_UC); + } + + if (vport->enabled_events & MC_ADDR_CHANGE) { + esw_update_vport_addr_list(esw, vport->vport, + MLX5_NVPRT_LIST_TYPE_MC); + } + + if (vport->enabled_events & PROMISC_CHANGE) { + esw_update_vport_rx_mode(esw, vport->vport); + if (!IS_ERR_OR_NULL(vport->allmulti_rule)) + esw_update_vport_mc_promisc(esw, vport->vport); + } + + if (vport->enabled_events & (PROMISC_CHANGE | MC_ADDR_CHANGE)) { + esw_apply_vport_addr_list(esw, vport->vport, + MLX5_NVPRT_LIST_TYPE_MC); + } + + esw_debug(esw->dev, "vport[%d] Context Changed: Done\n", vport->vport); + if (vport->enabled) + arm_vport_context_events_cmd(dev, vport->vport, + vport->enabled_events); +} + +static void esw_vport_change_handler(struct work_struct *work) +{ + struct mlx5_vport *vport = + container_of(work, struct mlx5_vport, vport_change_handler); + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + mutex_lock(&esw->state_lock); + esw_vport_change_handle_locked(vport); + mutex_unlock(&esw->state_lock); +} + +static int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *vlan_grp = NULL; + struct mlx5_flow_group *drop_grp = NULL; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *acl; + void *match_criteria; + u32 *flow_group_in; + /* The egress acl table contains 2 rules: + * 1)Allow traffic with vlan_tag=vst_vlan_id + * 2)Drop all other traffic. 
+ */ + int table_size = 2; + int err = 0; + + if (!MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) + return -EOPNOTSUPP; + + if (!IS_ERR_OR_NULL(vport->egress.acl)) + return 0; + + esw_debug(dev, "Create vport[%d] egress ACL log_max_size(%d)\n", + vport->vport, MLX5_CAP_ESW_EGRESS_ACL(dev, log_max_ft_size)); + + root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_EGRESS, + vport->vport); + if (!root_ns) { + esw_warn(dev, "Failed to get E-Switch egress flow namespace for vport (%d)\n", vport->vport); + return -EOPNOTSUPP; + } + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + acl = mlx5_create_vport_flow_table(root_ns, 0, table_size, 0, vport->vport); + if (IS_ERR(acl)) { + err = PTR_ERR(acl); + esw_warn(dev, "Failed to create E-Switch vport[%d] egress flow Table, err(%d)\n", + vport->vport, err); + goto out; + } + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + + vlan_grp = mlx5_create_flow_group(acl, flow_group_in); + if (IS_ERR(vlan_grp)) { + err = PTR_ERR(vlan_grp); + esw_warn(dev, "Failed to create E-Switch vport[%d] egress allowed vlans flow group, err(%d)\n", + vport->vport, err); + goto out; + } + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + drop_grp = mlx5_create_flow_group(acl, flow_group_in); + if (IS_ERR(drop_grp)) { + err = PTR_ERR(drop_grp); + esw_warn(dev, "Failed to create E-Switch vport[%d] egress drop flow group, err(%d)\n", + vport->vport, err); + goto out; + } + + vport->egress.acl = acl; + vport->egress.drop_grp = drop_grp; + vport->egress.allowed_vlans_grp = vlan_grp; +out: + kvfree(flow_group_in); + if (err && !IS_ERR_OR_NULL(vlan_grp)) + mlx5_destroy_flow_group(vlan_grp); + if (err && !IS_ERR_OR_NULL(acl)) + mlx5_destroy_flow_table(acl); + return err; +} + +static void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.allowed_vlan)) + mlx5_del_flow_rules(vport->egress.allowed_vlan); + + if (!IS_ERR_OR_NULL(vport->egress.drop_rule)) + mlx5_del_flow_rules(vport->egress.drop_rule); + + vport->egress.allowed_vlan = NULL; + vport->egress.drop_rule = NULL; +} + +static void esw_vport_disable_egress_acl(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->egress.acl)) + return; + + esw_debug(esw->dev, "Destroy vport[%d] E-Switch egress ACL\n", vport->vport); + + esw_vport_cleanup_egress_rules(esw, vport); + mlx5_destroy_flow_group(vport->egress.allowed_vlans_grp); + mlx5_destroy_flow_group(vport->egress.drop_grp); + mlx5_destroy_flow_table(vport->egress.acl); + vport->egress.allowed_vlans_grp = NULL; + vport->egress.drop_grp = NULL; + vport->egress.acl = NULL; +} + +static int esw_vport_enable_ingress_acl(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *acl; + struct mlx5_flow_group *g; + void 
*match_criteria; + u32 *flow_group_in; + /* The ingress acl table contains 4 groups + * (2 active rules at the same time - + * 1 allow rule from one of the first 3 groups. + * 1 drop rule from the last group): + * 1)Allow untagged traffic with smac=original mac. + * 2)Allow untagged traffic. + * 3)Allow traffic with smac=original mac. + * 4)Drop all other traffic. + */ + int table_size = 4; + int err = 0; + + if (!MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) + return -EOPNOTSUPP; + + if (!IS_ERR_OR_NULL(vport->ingress.acl)) + return 0; + + esw_debug(dev, "Create vport[%d] ingress ACL log_max_size(%d)\n", + vport->vport, MLX5_CAP_ESW_INGRESS_ACL(dev, log_max_ft_size)); + + root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS, + vport->vport); + if (!root_ns) { + esw_warn(dev, "Failed to get E-Switch ingress flow namespace for vport (%d)\n", vport->vport); + return -EOPNOTSUPP; + } + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + acl = mlx5_create_vport_flow_table(root_ns, 0, table_size, 0, vport->vport); + if (IS_ERR(acl)) { + err = PTR_ERR(acl); + esw_warn(dev, "Failed to create E-Switch vport[%d] ingress flow Table, err(%d)\n", + vport->vport, err); + goto out; + } + vport->ingress.acl = acl; + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + + g = mlx5_create_flow_group(acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create E-Switch vport[%d] ingress untagged spoofchk flow group, err(%d)\n", + vport->vport, err); + goto out; + } + vport->ingress.allow_untagged_spoofchk_grp = g; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + + g = mlx5_create_flow_group(acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create E-Switch vport[%d] ingress untagged flow group, err(%d)\n", + vport->vport, err); + goto out; + } + vport->ingress.allow_untagged_only_grp = g; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 2); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 2); + + g = mlx5_create_flow_group(acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create E-Switch vport[%d] ingress spoofchk flow group, err(%d)\n", + vport->vport, err); + goto out; + } + vport->ingress.allow_spoofchk_only_grp = g; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 3); + 
MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 3); + + g = mlx5_create_flow_group(acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create E-Switch vport[%d] ingress drop flow group, err(%d)\n", + vport->vport, err); + goto out; + } + vport->ingress.drop_grp = g; + +out: + if (err) { + if (!IS_ERR_OR_NULL(vport->ingress.allow_spoofchk_only_grp)) + mlx5_destroy_flow_group( + vport->ingress.allow_spoofchk_only_grp); + if (!IS_ERR_OR_NULL(vport->ingress.allow_untagged_only_grp)) + mlx5_destroy_flow_group( + vport->ingress.allow_untagged_only_grp); + if (!IS_ERR_OR_NULL(vport->ingress.allow_untagged_spoofchk_grp)) + mlx5_destroy_flow_group( + vport->ingress.allow_untagged_spoofchk_grp); + if (!IS_ERR_OR_NULL(vport->ingress.acl)) + mlx5_destroy_flow_table(vport->ingress.acl); + } + + kvfree(flow_group_in); + return err; +} + +static void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->ingress.drop_rule)) + mlx5_del_flow_rules(vport->ingress.drop_rule); + + if (!IS_ERR_OR_NULL(vport->ingress.allow_rule)) + mlx5_del_flow_rules(vport->ingress.allow_rule); + + vport->ingress.drop_rule = NULL; + vport->ingress.allow_rule = NULL; +} + +static void esw_vport_disable_ingress_acl(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->ingress.acl)) + return; + + esw_debug(esw->dev, "Destroy vport[%d] E-Switch ingress ACL\n", vport->vport); + + esw_vport_cleanup_ingress_rules(esw, vport); + mlx5_destroy_flow_group(vport->ingress.allow_spoofchk_only_grp); + mlx5_destroy_flow_group(vport->ingress.allow_untagged_only_grp); + mlx5_destroy_flow_group(vport->ingress.allow_untagged_spoofchk_grp); + mlx5_destroy_flow_group(vport->ingress.drop_grp); + mlx5_destroy_flow_table(vport->ingress.acl); + vport->ingress.acl = NULL; + vport->ingress.drop_grp = NULL; + vport->ingress.allow_spoofchk_only_grp = NULL; + vport->ingress.allow_untagged_only_grp = NULL; + vport->ingress.allow_untagged_spoofchk_grp = NULL; +} + +static int esw_vport_ingress_config(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct mlx5_fc *counter = vport->ingress.drop_counter; + struct mlx5_flow_destination drop_ctr_dst = {0}; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_spec *spec; + int dest_num = 0; + int err = 0; + u8 *smac_v; + + if (vport->info.spoofchk && !is_valid_ether_addr(vport->info.mac)) { + mlx5_core_warn(esw->dev, + "vport[%d] configure ingress rules failed, illegal mac with spoofchk\n", + vport->vport); + return -EPERM; + } + + esw_vport_cleanup_ingress_rules(esw, vport); + + if (!vport->info.vlan && !vport->info.qos && !vport->info.spoofchk) { + esw_vport_disable_ingress_acl(esw, vport); + return 0; + } + + err = esw_vport_enable_ingress_acl(esw, vport); + if (err) { + mlx5_core_warn(esw->dev, + "failed to enable ingress acl (%d) on vport[%d]\n", + err, vport->vport); + return err; + } + + esw_debug(esw->dev, + "vport[%d] configure ingress rules, vlan(%d) qos(%d)\n", + vport->vport, vport->info.vlan, vport->info.qos); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + + if (vport->info.vlan || vport->info.qos) + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + + if (vport->info.spoofchk) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, 
outer_headers.smac_15_0); + smac_v = MLX5_ADDR_OF(fte_match_param, + spec->match_value, + outer_headers.smac_47_16); + ether_addr_copy(smac_v, vport->info.mac); + } + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + vport->ingress.allow_rule = + mlx5_add_flow_rules(vport->ingress.acl, spec, + &flow_act, NULL, 0); + if (IS_ERR(vport->ingress.allow_rule)) { + err = PTR_ERR(vport->ingress.allow_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress allow rule, err(%d)\n", + vport->vport, err); + vport->ingress.allow_rule = NULL; + goto out; + } + + memset(spec, 0, sizeof(*spec)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* Attach drop flow counter */ + if (counter) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter = counter; + dst = &drop_ctr_dst; + dest_num++; + } + vport->ingress.drop_rule = + mlx5_add_flow_rules(vport->ingress.acl, spec, + &flow_act, dst, dest_num); + if (IS_ERR(vport->ingress.drop_rule)) { + err = PTR_ERR(vport->ingress.drop_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress drop rule, err(%d)\n", + vport->vport, err); + vport->ingress.drop_rule = NULL; + goto out; + } + +out: + if (err) + esw_vport_cleanup_ingress_rules(esw, vport); + kvfree(spec); + return err; +} + +static int esw_vport_egress_config(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct mlx5_fc *counter = vport->egress.drop_counter; + struct mlx5_flow_destination drop_ctr_dst = {0}; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_spec *spec; + int dest_num = 0; + int err = 0; + + esw_vport_cleanup_egress_rules(esw, vport); + + if (!vport->info.vlan && !vport->info.qos) { + esw_vport_disable_egress_acl(esw, vport); + return 0; + } + + err = esw_vport_enable_egress_acl(esw, vport); + if (err) { + mlx5_core_warn(esw->dev, + "failed to enable egress acl (%d) on vport[%d]\n", + err, vport->vport); + return err; + } + + esw_debug(esw->dev, + "vport[%d] configure egress rules, vlan(%d) qos(%d)\n", + vport->vport, vport->info.vlan, vport->info.qos); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + + /* Allowed vlan rule */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vport->info.vlan); + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + vport->egress.allowed_vlan = + mlx5_add_flow_rules(vport->egress.acl, spec, + &flow_act, NULL, 0); + if (IS_ERR(vport->egress.allowed_vlan)) { + err = PTR_ERR(vport->egress.allowed_vlan); + esw_warn(esw->dev, + "vport[%d] configure egress allowed vlan rule failed, err(%d)\n", + vport->vport, err); + vport->egress.allowed_vlan = NULL; + goto out; + } + + /* Drop others rule (star rule) */ + memset(spec, 0, sizeof(*spec)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* Attach egress drop flow counter */ + if (counter) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter = counter; + dst = &drop_ctr_dst; + dest_num++; + } + vport->egress.drop_rule = + 
mlx5_add_flow_rules(vport->egress.acl, spec, + &flow_act, dst, dest_num); + if (IS_ERR(vport->egress.drop_rule)) { + err = PTR_ERR(vport->egress.drop_rule); + esw_warn(esw->dev, + "vport[%d] configure egress drop rule failed, err(%d)\n", + vport->vport, err); + vport->egress.drop_rule = NULL; + } +out: + kvfree(spec); + return err; +} + +/* Vport QoS management */ +static int esw_create_tsar(struct mlx5_eswitch *esw) +{ + u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; + struct mlx5_core_dev *dev = esw->dev; + int err; + + if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling)) + return 0; + + if (esw->qos.enabled) + return -EEXIST; + + err = mlx5_create_scheduling_element_cmd(dev, + SCHEDULING_HIERARCHY_E_SWITCH, + tsar_ctx, + &esw->qos.root_tsar_id); + if (err) { + esw_warn(esw->dev, "E-Switch create TSAR failed (%d)\n", err); + return err; + } + + esw->qos.enabled = true; + return 0; +} + +static void esw_destroy_tsar(struct mlx5_eswitch *esw) +{ + int err; + + if (!esw->qos.enabled) + return; + + err = mlx5_destroy_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + esw->qos.root_tsar_id); + if (err) + esw_warn(esw->dev, "E-Switch destroy TSAR failed (%d)\n", err); + + esw->qos.enabled = false; +} + +static int esw_vport_enable_qos(struct mlx5_eswitch *esw, int vport_num, + u32 initial_max_rate, u32 initial_bw_share) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; + struct mlx5_vport *vport = &esw->vports[vport_num]; + struct mlx5_core_dev *dev = esw->dev; + void *vport_elem; + int err = 0; + + if (!esw->qos.enabled || !MLX5_CAP_GEN(dev, qos) || + !MLX5_CAP_QOS(dev, esw_scheduling)) + return 0; + + if (vport->qos.enabled) + return -EEXIST; + + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT); + vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx, + element_attributes); + MLX5_SET(vport_element, vport_elem, vport_number, vport_num); + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, + esw->qos.root_tsar_id); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, + initial_max_rate); + MLX5_SET(scheduling_context, sched_ctx, bw_share, initial_bw_share); + + err = mlx5_create_scheduling_element_cmd(dev, + SCHEDULING_HIERARCHY_E_SWITCH, + sched_ctx, + &vport->qos.esw_tsar_ix); + if (err) { + esw_warn(esw->dev, "E-Switch create TSAR vport element failed (vport=%d,err=%d)\n", + vport_num, err); + return err; + } + + vport->qos.enabled = true; + return 0; +} + +static void esw_vport_disable_qos(struct mlx5_eswitch *esw, int vport_num) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + int err = 0; + + if (!vport->qos.enabled) + return; + + err = mlx5_destroy_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + vport->qos.esw_tsar_ix); + if (err) + esw_warn(esw->dev, "E-Switch destroy TSAR vport element failed (vport=%d,err=%d)\n", + vport_num, err); + + vport->qos.enabled = false; +} + +static int esw_vport_qos_config(struct mlx5_eswitch *esw, int vport_num, + u32 max_rate, u32 bw_share) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; + struct mlx5_vport *vport = &esw->vports[vport_num]; + struct mlx5_core_dev *dev = esw->dev; + void *vport_elem; + u32 bitmask = 0; + int err = 0; + + if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling)) + return -EOPNOTSUPP; + + if (!vport->qos.enabled) + return -EIO; + + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT); + vport_elem = 
MLX5_ADDR_OF(scheduling_context, sched_ctx, + element_attributes); + MLX5_SET(vport_element, vport_elem, vport_number, vport_num); + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, + esw->qos.root_tsar_id); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, + max_rate); + MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share); + bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW; + bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE; + + err = mlx5_modify_scheduling_element_cmd(dev, + SCHEDULING_HIERARCHY_E_SWITCH, + sched_ctx, + vport->qos.esw_tsar_ix, + bitmask); + if (err) { + esw_warn(esw->dev, "E-Switch modify TSAR vport element failed (vport=%d,err=%d)\n", + vport_num, err); + return err; + } + + return 0; +} + +static void node_guid_gen_from_mac(u64 *node_guid, u8 mac[ETH_ALEN]) +{ + ((u8 *)node_guid)[7] = mac[0]; + ((u8 *)node_guid)[6] = mac[1]; + ((u8 *)node_guid)[5] = mac[2]; + ((u8 *)node_guid)[4] = 0xff; + ((u8 *)node_guid)[3] = 0xfe; + ((u8 *)node_guid)[2] = mac[3]; + ((u8 *)node_guid)[1] = mac[4]; + ((u8 *)node_guid)[0] = mac[5]; +} + +static void esw_apply_vport_conf(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int vport_num = vport->vport; + + if (!vport_num) + return; + + mlx5_modify_vport_admin_state(esw->dev, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, + vport_num, + vport->info.link_state); + mlx5_modify_nic_vport_mac_address(esw->dev, vport_num, vport->info.mac); + mlx5_modify_nic_vport_node_guid(esw->dev, vport_num, vport->info.node_guid); + modify_esw_vport_cvlan(esw->dev, vport_num, vport->info.vlan, vport->info.qos, + (vport->info.vlan || vport->info.qos)); + + /* Only legacy mode needs ACLs */ + if (esw->mode == SRIOV_LEGACY) { + esw_vport_ingress_config(esw, vport); + esw_vport_egress_config(esw, vport); + } +} + +static void esw_vport_create_drop_counters(struct mlx5_vport *vport) +{ + struct mlx5_core_dev *dev = vport->dev; + + if (MLX5_CAP_ESW_INGRESS_ACL(dev, flow_counter)) { + vport->ingress.drop_counter = mlx5_fc_create(dev, false); + if (IS_ERR(vport->ingress.drop_counter)) { + esw_warn(dev, + "vport[%d] configure ingress drop rule counter failed\n", + vport->vport); + vport->ingress.drop_counter = NULL; + } + } + + if (MLX5_CAP_ESW_EGRESS_ACL(dev, flow_counter)) { + vport->egress.drop_counter = mlx5_fc_create(dev, false); + if (IS_ERR(vport->egress.drop_counter)) { + esw_warn(dev, + "vport[%d] configure egress drop rule counter failed\n", + vport->vport); + vport->egress.drop_counter = NULL; + } + } +} + +static void esw_vport_destroy_drop_counters(struct mlx5_vport *vport) +{ + struct mlx5_core_dev *dev = vport->dev; + + if (vport->ingress.drop_counter) + mlx5_fc_destroy(dev, vport->ingress.drop_counter); + if (vport->egress.drop_counter) + mlx5_fc_destroy(dev, vport->egress.drop_counter); +} + +static void esw_enable_vport(struct mlx5_eswitch *esw, int vport_num, + int enable_events) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + + mutex_lock(&esw->state_lock); + WARN_ON(vport->enabled); + + esw_debug(esw->dev, "Enabling VPORT(%d)\n", vport_num); + + /* Create steering drop counters for ingress and egress ACLs */ + if (vport_num && esw->mode == SRIOV_LEGACY) + esw_vport_create_drop_counters(vport); + + /* Restore old vport configuration */ + esw_apply_vport_conf(esw, vport); + + /* Attach vport to the eswitch rate limiter */ + if (esw_vport_enable_qos(esw, vport_num, vport->info.max_rate, + vport->qos.bw_share)) + esw_warn(esw->dev, "Failed to attach vport %d to eswitch 
rate limiter", vport_num); + + /* Sync with current vport context */ + vport->enabled_events = enable_events; + vport->enabled = true; + + /* only PF is trusted by default */ + if (!vport_num) + vport->info.trusted = true; + + esw_vport_change_handle_locked(vport); + + esw->enabled_vports++; + esw_debug(esw->dev, "Enabled VPORT(%d)\n", vport_num); + mutex_unlock(&esw->state_lock); +} + +static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + + if (!vport->enabled) + return; + + esw_debug(esw->dev, "Disabling vport(%d)\n", vport_num); + /* Mark this vport as disabled to discard new events */ + vport->enabled = false; + + synchronize_irq(pci_irq_vector(esw->dev->pdev, MLX5_EQ_VEC_ASYNC)); + /* Wait for current already scheduled events to complete */ + flush_workqueue(esw->work_queue); + /* Disable events from this vport */ + arm_vport_context_events_cmd(esw->dev, vport->vport, 0); + mutex_lock(&esw->state_lock); + /* We don't assume VFs will cleanup after themselves. + * Calling vport change handler while vport is disabled will cleanup + * the vport resources. + */ + esw_vport_change_handle_locked(vport); + vport->enabled_events = 0; + esw_vport_disable_qos(esw, vport_num); + if (vport_num && esw->mode == SRIOV_LEGACY) { + mlx5_modify_vport_admin_state(esw->dev, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, + vport_num, + MLX5_ESW_VPORT_ADMIN_STATE_DOWN); + esw_vport_disable_egress_acl(esw, vport); + esw_vport_disable_ingress_acl(esw, vport); + esw_vport_destroy_drop_counters(vport); + } + esw->enabled_vports--; + mutex_unlock(&esw->state_lock); +} + +/* Public E-Switch API */ +#define ESW_ALLOWED(esw) ((esw) && MLX5_VPORT_MANAGER((esw)->dev)) + +int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) +{ + int err; + int i, enabled_events; + + if (!ESW_ALLOWED(esw)) + return 0; + + if (!MLX5_CAP_GEN(esw->dev, eswitch_flow_table) || + !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) { + esw_warn(esw->dev, "E-Switch FDB is not supported, aborting ...\n"); + return -EOPNOTSUPP; + } + + if (!MLX5_CAP_ESW_INGRESS_ACL(esw->dev, ft_support)) + esw_warn(esw->dev, "E-Switch ingress ACL is not supported by FW\n"); + + if (!MLX5_CAP_ESW_EGRESS_ACL(esw->dev, ft_support)) + esw_warn(esw->dev, "E-Switch engress ACL is not supported by FW\n"); + + esw_info(esw->dev, "E-Switch enable SRIOV: nvfs(%d) mode (%d)\n", nvfs, mode); + esw->mode = mode; + + if (mode == SRIOV_LEGACY) { + err = esw_create_legacy_fdb_table(esw, nvfs + 1); + } else { + mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB); + + err = esw_offloads_init(esw, nvfs + 1); + } + + if (err) + goto abort; + + err = esw_create_tsar(esw); + if (err) + esw_warn(esw->dev, "Failed to create eswitch TSAR"); + + /* Don't enable vport events when in SRIOV_OFFLOADS mode, since: + * 1. L2 table (MPFS) is programmed by PF/VF representors netdevs set_rx_mode + * 2. FDB/Eswitch is programmed by user space tools + */ + enabled_events = (mode == SRIOV_LEGACY) ? 
SRIOV_VPORT_EVENTS : 0; + for (i = 0; i <= nvfs; i++) + esw_enable_vport(esw, i, enabled_events); + + esw_info(esw->dev, "SRIOV enabled: active vports(%d)\n", + esw->enabled_vports); + return 0; + +abort: + esw->mode = SRIOV_NONE; + + if (mode == SRIOV_OFFLOADS) + mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB); + + return err; +} + +void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) +{ + struct esw_mc_addr *mc_promisc; + int old_mode; + int nvports; + int i; + + if (!ESW_ALLOWED(esw) || esw->mode == SRIOV_NONE) + return; + + esw_info(esw->dev, "disable SRIOV: active vports(%d) mode(%d)\n", + esw->enabled_vports, esw->mode); + + mc_promisc = &esw->mc_promisc; + nvports = esw->enabled_vports; + + for (i = 0; i < esw->total_vports; i++) + esw_disable_vport(esw, i); + + if (mc_promisc && mc_promisc->uplink_rule) + mlx5_del_flow_rules(mc_promisc->uplink_rule); + + esw_destroy_tsar(esw); + + if (esw->mode == SRIOV_LEGACY) + esw_destroy_legacy_fdb_table(esw); + else if (esw->mode == SRIOV_OFFLOADS) + esw_offloads_cleanup(esw, nvports); + + old_mode = esw->mode; + esw->mode = SRIOV_NONE; + + if (old_mode == SRIOV_OFFLOADS) + mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB); +} + +int mlx5_eswitch_init(struct mlx5_core_dev *dev) +{ + int total_vports = MLX5_TOTAL_VPORTS(dev); + struct mlx5_eswitch *esw; + int vport_num; + int err; + + if (!MLX5_VPORT_MANAGER(dev)) + return 0; + + esw_info(dev, + "Total vports %d, per vport: max uc(%d) max mc(%d)\n", + total_vports, + MLX5_MAX_UC_PER_VPORT(dev), + MLX5_MAX_MC_PER_VPORT(dev)); + + esw = kzalloc(sizeof(*esw), GFP_KERNEL); + if (!esw) + return -ENOMEM; + + esw->dev = dev; + + esw->work_queue = create_singlethread_workqueue("mlx5_esw_wq"); + if (!esw->work_queue) { + err = -ENOMEM; + goto abort; + } + + esw->vports = kcalloc(total_vports, sizeof(struct mlx5_vport), + GFP_KERNEL); + if (!esw->vports) { + err = -ENOMEM; + goto abort; + } + + err = esw_offloads_init_reps(esw); + if (err) + goto abort; + + hash_init(esw->offloads.encap_tbl); + hash_init(esw->offloads.mod_hdr_tbl); + mutex_init(&esw->state_lock); + + for (vport_num = 0; vport_num < total_vports; vport_num++) { + struct mlx5_vport *vport = &esw->vports[vport_num]; + + vport->vport = vport_num; + vport->info.link_state = MLX5_ESW_VPORT_ADMIN_STATE_AUTO; + vport->dev = dev; + INIT_WORK(&vport->vport_change_handler, + esw_vport_change_handler); + } + + esw->total_vports = total_vports; + esw->enabled_vports = 0; + esw->mode = SRIOV_NONE; + esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE; + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, encap) && + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap)) + esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC; + else + esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE; + + dev->priv.eswitch = esw; + return 0; +abort: + if (esw->work_queue) + destroy_workqueue(esw->work_queue); + esw_offloads_cleanup_reps(esw); + kfree(esw->vports); + kfree(esw); + return err; +} + +void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) +{ + if (!esw || !MLX5_VPORT_MANAGER(esw->dev)) + return; + + esw_info(esw->dev, "cleanup\n"); + + esw->dev->priv.eswitch = NULL; + destroy_workqueue(esw->work_queue); + esw_offloads_cleanup_reps(esw); + kfree(esw->vports); + kfree(esw); +} + +void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe) +{ + struct mlx5_eqe_vport_change *vc_eqe = &eqe->data.vport_change; + u16 vport_num = be16_to_cpu(vc_eqe->vport_num); + struct mlx5_vport *vport; + + if (!esw) { + pr_warn("MLX5 E-Switch: vport %d got an 
event while eswitch is not initialized\n", + vport_num); + return; + } + + vport = &esw->vports[vport_num]; + if (vport->enabled) + queue_work(esw->work_queue, &vport->vport_change_handler); +} + +/* Vport Administration */ +#define LEGAL_VPORT(esw, vport) (vport >= 0 && vport < esw->total_vports) + +int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, + int vport, u8 mac[ETH_ALEN]) +{ + struct mlx5_vport *evport; + u64 node_guid; + int err = 0; + + if (!ESW_ALLOWED(esw)) + return -EPERM; + if (!LEGAL_VPORT(esw, vport) || is_multicast_ether_addr(mac)) + return -EINVAL; + + mutex_lock(&esw->state_lock); + evport = &esw->vports[vport]; + + if (evport->info.spoofchk && !is_valid_ether_addr(mac)) { + mlx5_core_warn(esw->dev, + "MAC invalidation is not allowed when spoofchk is on, vport(%d)\n", + vport); + err = -EPERM; + goto unlock; + } + + err = mlx5_modify_nic_vport_mac_address(esw->dev, vport, mac); + if (err) { + mlx5_core_warn(esw->dev, + "Failed to mlx5_modify_nic_vport_mac vport(%d) err=(%d)\n", + vport, err); + goto unlock; + } + + node_guid_gen_from_mac(&node_guid, mac); + err = mlx5_modify_nic_vport_node_guid(esw->dev, vport, node_guid); + if (err) + mlx5_core_warn(esw->dev, + "Failed to set vport %d node guid, err = %d. RDMA_CM will not function properly for this VF.\n", + vport, err); + + ether_addr_copy(evport->info.mac, mac); + evport->info.node_guid = node_guid; + if (evport->enabled && esw->mode == SRIOV_LEGACY) + err = esw_vport_ingress_config(esw, evport); + +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, + int vport, int link_state) +{ + struct mlx5_vport *evport; + int err = 0; + + if (!ESW_ALLOWED(esw)) + return -EPERM; + if (!LEGAL_VPORT(esw, vport)) + return -EINVAL; + + mutex_lock(&esw->state_lock); + evport = &esw->vports[vport]; + + err = mlx5_modify_vport_admin_state(esw->dev, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, + vport, link_state); + if (err) { + mlx5_core_warn(esw->dev, + "Failed to set vport %d link state, err = %d", + vport, err); + goto unlock; + } + + evport->info.link_state = link_state; + +unlock: + mutex_unlock(&esw->state_lock); + return 0; +} + +int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw, + int vport, struct ifla_vf_info *ivi) +{ + struct mlx5_vport *evport; + + if (!ESW_ALLOWED(esw)) + return -EPERM; + if (!LEGAL_VPORT(esw, vport)) + return -EINVAL; + + evport = &esw->vports[vport]; + + memset(ivi, 0, sizeof(*ivi)); + ivi->vf = vport - 1; + + mutex_lock(&esw->state_lock); + ether_addr_copy(ivi->mac, evport->info.mac); + ivi->linkstate = evport->info.link_state; + ivi->vlan = evport->info.vlan; + ivi->qos = evport->info.qos; + ivi->spoofchk = evport->info.spoofchk; + ivi->trusted = evport->info.trusted; + ivi->min_tx_rate = evport->info.min_rate; + ivi->max_tx_rate = evport->info.max_rate; + mutex_unlock(&esw->state_lock); + + return 0; +} + +int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, + int vport, u16 vlan, u8 qos, u8 set_flags) +{ + struct mlx5_vport *evport; + int err = 0; + + if (!ESW_ALLOWED(esw)) + return -EPERM; + if (!LEGAL_VPORT(esw, vport) || (vlan > 4095) || (qos > 7)) + return -EINVAL; + + mutex_lock(&esw->state_lock); + evport = &esw->vports[vport]; + + err = modify_esw_vport_cvlan(esw->dev, vport, vlan, qos, set_flags); + if (err) + goto unlock; + + evport->info.vlan = vlan; + evport->info.qos = qos; + if (evport->enabled && esw->mode == SRIOV_LEGACY) { + err = esw_vport_ingress_config(esw, evport); + if (err) + goto 
unlock; + err = esw_vport_egress_config(esw, evport); + } + +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, + int vport, u16 vlan, u8 qos) +{ + u8 set_flags = 0; + + if (vlan || qos) + set_flags = SET_VLAN_STRIP | SET_VLAN_INSERT; + + return __mlx5_eswitch_set_vport_vlan(esw, vport, vlan, qos, set_flags); +} + +int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw, + int vport, bool spoofchk) +{ + struct mlx5_vport *evport; + bool pschk; + int err = 0; + + if (!ESW_ALLOWED(esw)) + return -EPERM; + if (!LEGAL_VPORT(esw, vport)) + return -EINVAL; + + mutex_lock(&esw->state_lock); + evport = &esw->vports[vport]; + pschk = evport->info.spoofchk; + evport->info.spoofchk = spoofchk; + if (evport->enabled && esw->mode == SRIOV_LEGACY) + err = esw_vport_ingress_config(esw, evport); + if (err) + evport->info.spoofchk = pschk; + mutex_unlock(&esw->state_lock); + + return err; +} + +int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw, + int vport, bool setting) +{ + struct mlx5_vport *evport; + + if (!ESW_ALLOWED(esw)) + return -EPERM; + if (!LEGAL_VPORT(esw, vport)) + return -EINVAL; + + mutex_lock(&esw->state_lock); + evport = &esw->vports[vport]; + evport->info.trusted = setting; + if (evport->enabled) + esw_vport_change_handle_locked(evport); + mutex_unlock(&esw->state_lock); + + return 0; +} + +static u32 calculate_vports_min_rate_divider(struct mlx5_eswitch *esw) +{ + u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + struct mlx5_vport *evport; + u32 max_guarantee = 0; + int i; + + for (i = 0; i <= esw->total_vports; i++) { + evport = &esw->vports[i]; + if (!evport->enabled || evport->info.min_rate < max_guarantee) + continue; + max_guarantee = evport->info.min_rate; + } + + return max_t(u32, max_guarantee / fw_max_bw_share, 1); +} + +static int normalize_vports_min_rate(struct mlx5_eswitch *esw, u32 divider) +{ + u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + struct mlx5_vport *evport; + u32 vport_max_rate; + u32 vport_min_rate; + u32 bw_share; + int err; + int i; + + for (i = 0; i <= esw->total_vports; i++) { + evport = &esw->vports[i]; + if (!evport->enabled) + continue; + vport_min_rate = evport->info.min_rate; + vport_max_rate = evport->info.max_rate; + bw_share = MLX5_MIN_BW_SHARE; + + if (vport_min_rate) + bw_share = MLX5_RATE_TO_BW_SHARE(vport_min_rate, + divider, + fw_max_bw_share); + + if (bw_share == evport->qos.bw_share) + continue; + + err = esw_vport_qos_config(esw, i, vport_max_rate, + bw_share); + if (!err) + evport->qos.bw_share = bw_share; + else + return err; + } + + return 0; +} + +int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport, + u32 max_rate, u32 min_rate) +{ + u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + bool min_rate_supported = MLX5_CAP_QOS(esw->dev, esw_bw_share) && + fw_max_bw_share >= MLX5_MIN_BW_SHARE; + bool max_rate_supported = MLX5_CAP_QOS(esw->dev, esw_rate_limit); + struct mlx5_vport *evport; + u32 previous_min_rate; + u32 divider; + int err = 0; + + if (!ESW_ALLOWED(esw)) + return -EPERM; + if (!LEGAL_VPORT(esw, vport)) + return -EINVAL; + if ((min_rate && !min_rate_supported) || (max_rate && !max_rate_supported)) + return -EOPNOTSUPP; + + mutex_lock(&esw->state_lock); + evport = &esw->vports[vport]; + + if (min_rate == evport->info.min_rate) + goto set_max_rate; + + previous_min_rate = evport->info.min_rate; + evport->info.min_rate = min_rate; + divider = calculate_vports_min_rate_divider(esw); + 
err = normalize_vports_min_rate(esw, divider); + if (err) { + evport->info.min_rate = previous_min_rate; + goto unlock; + } + +set_max_rate: + if (max_rate == evport->info.max_rate) + goto unlock; + + err = esw_vport_qos_config(esw, vport, max_rate, evport->qos.bw_share); + if (!err) + evport->info.max_rate = max_rate; + +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +static int mlx5_eswitch_query_vport_drop_stats(struct mlx5_core_dev *dev, + int vport_idx, + struct mlx5_vport_drop_stats *stats) +{ + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_vport *vport = &esw->vports[vport_idx]; + u64 rx_discard_vport_down, tx_discard_vport_down; + u64 bytes = 0; + u16 idx = 0; + int err = 0; + + if (!vport->enabled || esw->mode != SRIOV_LEGACY) + return 0; + + if (vport->egress.drop_counter) { + idx = vport->egress.drop_counter->id; + mlx5_fc_query(dev, idx, &stats->rx_dropped, &bytes); + } + + if (vport->ingress.drop_counter) { + idx = vport->ingress.drop_counter->id; + mlx5_fc_query(dev, idx, &stats->tx_dropped, &bytes); + } + + if (!MLX5_CAP_GEN(dev, receive_discard_vport_down) && + !MLX5_CAP_GEN(dev, transmit_discard_vport_down)) + return 0; + + err = mlx5_query_vport_down_stats(dev, vport_idx, + &rx_discard_vport_down, + &tx_discard_vport_down); + if (err) + return err; + + if (MLX5_CAP_GEN(dev, receive_discard_vport_down)) + stats->rx_dropped += rx_discard_vport_down; + if (MLX5_CAP_GEN(dev, transmit_discard_vport_down)) + stats->tx_dropped += tx_discard_vport_down; + + return 0; +} + +int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw, + int vport, + struct ifla_vf_stats *vf_stats) +{ + int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {0}; + struct mlx5_vport_drop_stats stats = {0}; + int err = 0; + u32 *out; + + if (!ESW_ALLOWED(esw)) + return -EPERM; + if (!LEGAL_VPORT(esw, vport)) + return -EINVAL; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_vport_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_COUNTER); + MLX5_SET(query_vport_counter_in, in, op_mod, 0); + MLX5_SET(query_vport_counter_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_vport_counter_in, in, other_vport, 1); + + memset(out, 0, outlen); + err = mlx5_cmd_exec(esw->dev, in, sizeof(in), out, outlen); + if (err) + goto free_out; + + #define MLX5_GET_CTR(p, x) \ + MLX5_GET64(query_vport_counter_out, p, x) + + memset(vf_stats, 0, sizeof(*vf_stats)); + vf_stats->rx_packets = + MLX5_GET_CTR(out, received_eth_unicast.packets) + + MLX5_GET_CTR(out, received_ib_unicast.packets) + + MLX5_GET_CTR(out, received_eth_multicast.packets) + + MLX5_GET_CTR(out, received_ib_multicast.packets) + + MLX5_GET_CTR(out, received_eth_broadcast.packets); + + vf_stats->rx_bytes = + MLX5_GET_CTR(out, received_eth_unicast.octets) + + MLX5_GET_CTR(out, received_ib_unicast.octets) + + MLX5_GET_CTR(out, received_eth_multicast.octets) + + MLX5_GET_CTR(out, received_ib_multicast.octets) + + MLX5_GET_CTR(out, received_eth_broadcast.octets); + + vf_stats->tx_packets = + MLX5_GET_CTR(out, transmitted_eth_unicast.packets) + + MLX5_GET_CTR(out, transmitted_ib_unicast.packets) + + MLX5_GET_CTR(out, transmitted_eth_multicast.packets) + + MLX5_GET_CTR(out, transmitted_ib_multicast.packets) + + MLX5_GET_CTR(out, transmitted_eth_broadcast.packets); + + vf_stats->tx_bytes = + MLX5_GET_CTR(out, transmitted_eth_unicast.octets) + + MLX5_GET_CTR(out, transmitted_ib_unicast.octets) + + MLX5_GET_CTR(out, 
transmitted_eth_multicast.octets) + + MLX5_GET_CTR(out, transmitted_ib_multicast.octets) + + MLX5_GET_CTR(out, transmitted_eth_broadcast.octets); + + vf_stats->multicast = + MLX5_GET_CTR(out, received_eth_multicast.packets) + + MLX5_GET_CTR(out, received_ib_multicast.packets); + + vf_stats->broadcast = + MLX5_GET_CTR(out, received_eth_broadcast.packets); + + err = mlx5_eswitch_query_vport_drop_stats(esw->dev, vport, &stats); + if (err) + goto free_out; + vf_stats->rx_dropped = stats.rx_dropped; + vf_stats->tx_dropped = stats.tx_dropped; + +free_out: + kvfree(out); + return err; +} + +u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw) +{ + return esw->mode; +} +EXPORT_SYMBOL_GPL(mlx5_eswitch_mode); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h new file mode 100644 index 0000000..4cd773f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MLX5_ESWITCH_H__ +#define __MLX5_ESWITCH_H__ + +#include +#include +#include +#include +#include +#include "lib/mpfs.h" + +#ifdef CONFIG_MLX5_ESWITCH + +#define MLX5_MAX_UC_PER_VPORT(dev) \ + (1 << MLX5_CAP_GEN(dev, log_max_current_uc_list)) + +#define MLX5_MAX_MC_PER_VPORT(dev) \ + (1 << MLX5_CAP_GEN(dev, log_max_current_mc_list)) + +#define FDB_UPLINK_VPORT 0xffff + +#define MLX5_MIN_BW_SHARE 1 + +#define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \ + min_t(u32, max_t(u32, (rate) / (divider), MLX5_MIN_BW_SHARE), limit) + +struct vport_ingress { + struct mlx5_flow_table *acl; + struct mlx5_flow_group *allow_untagged_spoofchk_grp; + struct mlx5_flow_group *allow_spoofchk_only_grp; + struct mlx5_flow_group *allow_untagged_only_grp; + struct mlx5_flow_group *drop_grp; + struct mlx5_flow_handle *allow_rule; + struct mlx5_flow_handle *drop_rule; + struct mlx5_fc *drop_counter; +}; + +struct vport_egress { + struct mlx5_flow_table *acl; + struct mlx5_flow_group *allowed_vlans_grp; + struct mlx5_flow_group *drop_grp; + struct mlx5_flow_handle *allowed_vlan; + struct mlx5_flow_handle *drop_rule; + struct mlx5_fc *drop_counter; +}; + +struct mlx5_vport_drop_stats { + u64 rx_dropped; + u64 tx_dropped; +}; + +struct mlx5_vport_info { + u8 mac[ETH_ALEN]; + u16 vlan; + u8 qos; + u64 node_guid; + int link_state; + u32 min_rate; + u32 max_rate; + bool spoofchk; + bool trusted; +}; + +struct mlx5_vport { + struct mlx5_core_dev *dev; + int vport; + struct hlist_head uc_list[MLX5_L2_ADDR_HASH_SIZE]; + struct hlist_head mc_list[MLX5_L2_ADDR_HASH_SIZE]; + struct mlx5_flow_handle *promisc_rule; + struct mlx5_flow_handle *allmulti_rule; + struct work_struct vport_change_handler; + + struct vport_ingress ingress; + struct vport_egress egress; + + struct mlx5_vport_info info; + + struct { + bool enabled; + u32 esw_tsar_ix; + u32 bw_share; + } qos; + + bool enabled; + u16 enabled_events; +}; + +struct mlx5_eswitch_fdb { + void *fdb; + union { + struct legacy_fdb { + struct mlx5_flow_group *addr_grp; + struct mlx5_flow_group *allmulti_grp; + struct mlx5_flow_group *promisc_grp; + } legacy; + + struct offloads_fdb { + struct mlx5_flow_table *fdb; + struct mlx5_flow_group *send_to_vport_grp; + struct mlx5_flow_group *miss_grp; + struct mlx5_flow_handle *miss_rule_uni; + struct mlx5_flow_handle *miss_rule_multi; + int vlan_push_pop_refcount; + } offloads; + }; +}; + +struct mlx5_esw_offload { + struct mlx5_flow_table *ft_offloads; + struct mlx5_flow_group *vport_rx_group; + struct mlx5_eswitch_rep *vport_reps; + DECLARE_HASHTABLE(encap_tbl, 8); + DECLARE_HASHTABLE(mod_hdr_tbl, 8); + u8 inline_mode; + u64 num_flows; + u8 encap; +}; + +/* E-Switch MC FDB table hash node */ +struct esw_mc_addr { /* SRIOV only */ + struct l2addr_node node; + struct mlx5_flow_handle *uplink_rule; /* Forward to uplink rule */ + u32 refcnt; +}; + +struct mlx5_eswitch { + struct mlx5_core_dev *dev; + struct mlx5_eswitch_fdb fdb_table; + struct hlist_head mc_table[MLX5_L2_ADDR_HASH_SIZE]; + struct workqueue_struct *work_queue; + struct mlx5_vport *vports; + int total_vports; + int enabled_vports; + /* Synchronize between vport change events + * and async SRIOV admin state changes + */ + struct mutex state_lock; + struct esw_mc_addr mc_promisc; + + struct { + bool enabled; + u32 root_tsar_id; + } qos; + + struct mlx5_esw_offload offloads; + int mode; +}; + +void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports); +int esw_offloads_init(struct mlx5_eswitch *esw, int nvports); +void esw_offloads_cleanup_reps(struct 
mlx5_eswitch *esw); +int esw_offloads_init_reps(struct mlx5_eswitch *esw); + +/* E-Switch API */ +int mlx5_eswitch_init(struct mlx5_core_dev *dev); +void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw); +void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe); +int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode); +void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw); +int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, + int vport, u8 mac[ETH_ALEN]); +int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, + int vport, int link_state); +int mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, + int vport, u16 vlan, u8 qos); +int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw, + int vport, bool spoofchk); +int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw, + int vport_num, bool setting); +int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport, + u32 max_rate, u32 min_rate); +int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw, + int vport, struct ifla_vf_info *ivi); +int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw, + int vport, + struct ifla_vf_stats *vf_stats); +void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule); + +struct mlx5_flow_spec; +struct mlx5_esw_flow_attr; + +struct mlx5_flow_handle * +mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_spec *spec, + struct mlx5_esw_flow_attr *attr); +void +mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_esw_flow_attr *attr); + +struct mlx5_flow_handle * +mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn); + +enum { + SET_VLAN_STRIP = BIT(0), + SET_VLAN_INSERT = BIT(1) +}; + +struct mlx5_esw_flow_attr { + struct mlx5_eswitch_rep *in_rep; + struct mlx5_eswitch_rep *out_rep; + + int action; + __be16 vlan_proto; + u16 vlan_vid; + u8 vlan_prio; + bool vlan_handled; + u32 encap_id; + u32 mod_hdr_id; + struct mlx5e_tc_flow_parse_attr *parse_attr; +}; + +int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode); +int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode); +int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode); +int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode); +int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode); +int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap); +int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap); +void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type); + +int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw, + struct mlx5_esw_flow_attr *attr); +int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw, + struct mlx5_esw_flow_attr *attr); +int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, + int vport, u16 vlan, u8 qos, u8 set_flags); + +static inline bool mlx5_eswitch_vlan_actions_supported(struct mlx5_core_dev *dev) +{ + return MLX5_CAP_ESW_FLOWTABLE_FDB(dev, pop_vlan) && + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, push_vlan); +} + +#define MLX5_DEBUG_ESWITCH_MASK BIT(3) + +#define esw_info(dev, format, ...) \ + pr_info("(%s): E-Switch: " format, (dev)->priv.name, ##__VA_ARGS__) + +#define esw_warn(dev, format, ...) \ + pr_warn("(%s): E-Switch: " format, (dev)->priv.name, ##__VA_ARGS__) + +#define esw_debug(dev, format, ...) 
\ + mlx5_core_dbg_mask(dev, MLX5_DEBUG_ESWITCH_MASK, format, ##__VA_ARGS__) +#else /* CONFIG_MLX5_ESWITCH */ +/* eswitch API stubs */ +static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; } +static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {} +static inline void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe) {} +static inline int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; } +static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {} +#endif /* CONFIG_MLX5_ESWITCH */ + +#endif /* __MLX5_ESWITCH_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c new file mode 100644 index 0000000..35e256e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -0,0 +1,1266 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "eswitch.h" + +enum { + FDB_FAST_PATH = 0, + FDB_SLOW_PATH +}; + +struct mlx5_flow_handle * +mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_spec *spec, + struct mlx5_esw_flow_attr *attr) +{ + struct mlx5_flow_destination dest[2] = {}; + struct mlx5_flow_act flow_act = {0}; + struct mlx5_fc *counter = NULL; + struct mlx5_flow_handle *rule; + void *misc; + int i = 0; + + if (esw->mode != SRIOV_OFFLOADS) + return ERR_PTR(-EOPNOTSUPP); + + flow_act.action = attr->action; + /* if per flow vlan pop/push is emulated, don't set that into the firmware */ + if (!mlx5_eswitch_vlan_actions_supported(esw->dev)) + flow_act.action &= ~(MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP); + else if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH) { + flow_act.vlan.ethtype = ntohs(attr->vlan_proto); + flow_act.vlan.vid = attr->vlan_vid; + flow_act.vlan.prio = attr->vlan_prio; + } + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest[i].vport_num = attr->out_rep->vport; + i++; + } + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + counter = mlx5_fc_create(esw->dev, true); + if (IS_ERR(counter)) { + rule = ERR_CAST(counter); + goto err_counter_alloc; + } + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest[i].counter = counter; + i++; + } + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_port, attr->in_rep->vport); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS | + MLX5_MATCH_MISC_PARAMETERS; + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DECAP) + spec->match_criteria_enable |= MLX5_MATCH_INNER_HEADERS; + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + flow_act.modify_id = attr->mod_hdr_id; + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) + flow_act.encap_id = attr->encap_id; + + rule = mlx5_add_flow_rules((struct mlx5_flow_table *)esw->fdb_table.fdb, + spec, &flow_act, dest, i); + if (IS_ERR(rule)) + goto err_add_rule; + else + esw->offloads.num_flows++; + + return rule; + +err_add_rule: + mlx5_fc_destroy(esw->dev, counter); +err_counter_alloc: + return rule; +} + +void +mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_esw_flow_attr *attr) +{ + struct mlx5_fc *counter = NULL; + + counter = mlx5_flow_rule_counter(rule); + mlx5_del_flow_rules(rule); + mlx5_fc_destroy(esw->dev, counter); + esw->offloads.num_flows--; +} + +static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val) +{ + struct mlx5_eswitch_rep *rep; + int vf_vport, err = 0; + + esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? 
"pop" : "none"); + for (vf_vport = 1; vf_vport < esw->enabled_vports; vf_vport++) { + rep = &esw->offloads.vport_reps[vf_vport]; + if (!rep->rep_if[REP_ETH].valid) + continue; + + err = __mlx5_eswitch_set_vport_vlan(esw, rep->vport, 0, 0, val); + if (err) + goto out; + } + +out: + return err; +} + +static struct mlx5_eswitch_rep * +esw_vlan_action_get_vport(struct mlx5_esw_flow_attr *attr, bool push, bool pop) +{ + struct mlx5_eswitch_rep *in_rep, *out_rep, *vport = NULL; + + in_rep = attr->in_rep; + out_rep = attr->out_rep; + + if (push) + vport = in_rep; + else if (pop) + vport = out_rep; + else + vport = in_rep; + + return vport; +} + +static int esw_add_vlan_action_check(struct mlx5_esw_flow_attr *attr, + bool push, bool pop, bool fwd) +{ + struct mlx5_eswitch_rep *in_rep, *out_rep; + + if ((push || pop) && !fwd) + goto out_notsupp; + + in_rep = attr->in_rep; + out_rep = attr->out_rep; + + if (push && in_rep->vport == FDB_UPLINK_VPORT) + goto out_notsupp; + + if (pop && out_rep->vport == FDB_UPLINK_VPORT) + goto out_notsupp; + + /* vport has vlan push configured, can't offload VF --> wire rules w.o it */ + if (!push && !pop && fwd) + if (in_rep->vlan && out_rep->vport == FDB_UPLINK_VPORT) + goto out_notsupp; + + /* protects against (1) setting rules with different vlans to push and + * (2) setting rules w.o vlans (attr->vlan = 0) && w. vlans to push (!= 0) + */ + if (push && in_rep->vlan_refcount && (in_rep->vlan != attr->vlan_vid)) + goto out_notsupp; + + return 0; + +out_notsupp: + return -EOPNOTSUPP; +} + +int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw, + struct mlx5_esw_flow_attr *attr) +{ + struct offloads_fdb *offloads = &esw->fdb_table.offloads; + struct mlx5_eswitch_rep *vport = NULL; + bool push, pop, fwd; + int err = 0; + + /* nop if we're on the vlan push/pop non emulation mode */ + if (mlx5_eswitch_vlan_actions_supported(esw->dev)) + return 0; + + push = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH); + pop = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP); + fwd = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST); + + err = esw_add_vlan_action_check(attr, push, pop, fwd); + if (err) + return err; + + attr->vlan_handled = false; + + vport = esw_vlan_action_get_vport(attr, push, pop); + + if (!push && !pop && fwd) { + /* tracks VF --> wire rules without vlan push action */ + if (attr->out_rep->vport == FDB_UPLINK_VPORT) { + vport->vlan_refcount++; + attr->vlan_handled = true; + } + + return 0; + } + + if (!push && !pop) + return 0; + + if (!(offloads->vlan_push_pop_refcount)) { + /* it's the 1st vlan rule, apply global vlan pop policy */ + err = esw_set_global_vlan_pop(esw, SET_VLAN_STRIP); + if (err) + goto out; + } + offloads->vlan_push_pop_refcount++; + + if (push) { + if (vport->vlan_refcount) + goto skip_set_push; + + err = __mlx5_eswitch_set_vport_vlan(esw, vport->vport, attr->vlan_vid, 0, + SET_VLAN_INSERT | SET_VLAN_STRIP); + if (err) + goto out; + vport->vlan = attr->vlan_vid; +skip_set_push: + vport->vlan_refcount++; + } +out: + if (!err) + attr->vlan_handled = true; + return err; +} + +int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw, + struct mlx5_esw_flow_attr *attr) +{ + struct offloads_fdb *offloads = &esw->fdb_table.offloads; + struct mlx5_eswitch_rep *vport = NULL; + bool push, pop, fwd; + int err = 0; + + /* nop if we're on the vlan push/pop non emulation mode */ + if (mlx5_eswitch_vlan_actions_supported(esw->dev)) + return 0; + + if (!attr->vlan_handled) + return 0; + + push = !!(attr->action & 
MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH); + pop = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP); + fwd = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST); + + vport = esw_vlan_action_get_vport(attr, push, pop); + + if (!push && !pop && fwd) { + /* tracks VF --> wire rules without vlan push action */ + if (attr->out_rep->vport == FDB_UPLINK_VPORT) + vport->vlan_refcount--; + + return 0; + } + + if (push) { + vport->vlan_refcount--; + if (vport->vlan_refcount) + goto skip_unset_push; + + vport->vlan = 0; + err = __mlx5_eswitch_set_vport_vlan(esw, vport->vport, + 0, 0, SET_VLAN_STRIP); + if (err) + goto out; + } + +skip_unset_push: + offloads->vlan_push_pop_refcount--; + if (offloads->vlan_push_pop_refcount) + return 0; + + /* no more vlan rules, stop global vlan pop policy */ + err = esw_set_global_vlan_pop(esw, 0); + +out: + return err; +} + +struct mlx5_flow_handle * +mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn) +{ + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_spec *spec; + void *misc; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + flow_rule = ERR_PTR(-ENOMEM); + goto out; + } + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_sqn, sqn); + MLX5_SET(fte_match_set_misc, misc, source_port, 0x0); /* source vport is 0 */ + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_sqn); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS; + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest.vport_num = vport; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec, + &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) + esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule)); +out: + kvfree(spec); + return flow_rule; +} +EXPORT_SYMBOL(mlx5_eswitch_add_send_to_vport_rule); + +void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule) +{ + mlx5_del_flow_rules(rule); +} + +static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_handle *flow_rule = NULL; + struct mlx5_flow_spec *spec; + void *headers_c; + void *headers_v; + int err = 0; + u8 *dmac_c; + u8 *dmac_v; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers); + dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, + outer_headers.dmac_47_16); + dmac_c[0] = 0x01; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest.vport_num = 0; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec, + &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + esw_warn(esw->dev, "FDB: Failed to add unicast miss flow rule err %d\n", err); + goto out; + } + + esw->fdb_table.offloads.miss_rule_uni = flow_rule; + + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers); + dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, + outer_headers.dmac_47_16); + dmac_v[0] = 0x01; + flow_rule = 
mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec, + &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + esw_warn(esw->dev, "FDB: Failed to add multicast miss flow rule err %d\n", err); + mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_uni); + goto out; + } + + esw->fdb_table.offloads.miss_rule_multi = flow_rule; + +out: + kvfree(spec); + return err; +} + +#define ESW_OFFLOADS_NUM_GROUPS 4 + +static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *fdb = NULL; + int esw_size, err = 0; + u32 flags = 0; + u32 max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) | + MLX5_CAP_GEN(dev, max_flow_counter_15_0); + + root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!root_ns) { + esw_warn(dev, "Failed to get FDB flow namespace\n"); + err = -EOPNOTSUPP; + goto out; + } + + esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n", + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size), + max_flow_counter, ESW_OFFLOADS_NUM_GROUPS); + + esw_size = min_t(int, max_flow_counter * ESW_OFFLOADS_NUM_GROUPS, + 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); + + if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN; + + fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH, + esw_size, + ESW_OFFLOADS_NUM_GROUPS, 0, + flags); + if (IS_ERR(fdb)) { + err = PTR_ERR(fdb); + esw_warn(dev, "Failed to create Fast path FDB Table err %d\n", err); + goto out; + } + esw->fdb_table.fdb = fdb; + +out: + return err; +} + +static void esw_destroy_offloads_fast_fdb_table(struct mlx5_eswitch *esw) +{ + mlx5_destroy_flow_table(esw->fdb_table.fdb); +} + +#define MAX_PF_SQ 256 +#define MAX_SQ_NVPORTS 32 + +static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *fdb = NULL; + int table_size, ix, err = 0; + struct mlx5_flow_group *g; + void *match_criteria; + u32 *flow_group_in; + u8 *dmac; + + esw_debug(esw->dev, "Create offloads FDB Tables\n"); + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!root_ns) { + esw_warn(dev, "Failed to get FDB flow namespace\n"); + err = -EOPNOTSUPP; + goto ns_err; + } + + err = esw_create_offloads_fast_fdb_table(esw); + if (err) + goto fast_fdb_err; + + table_size = nvports * MAX_SQ_NVPORTS + MAX_PF_SQ + 2; + + ft_attr.max_fte = table_size; + ft_attr.prio = FDB_SLOW_PATH; + + fdb = mlx5_create_flow_table(root_ns, &ft_attr); + if (IS_ERR(fdb)) { + err = PTR_ERR(fdb); + esw_warn(dev, "Failed to create slow path FDB Table err %d\n", err); + goto slow_fdb_err; + } + esw->fdb_table.offloads.fdb = fdb; + + /* create send-to-vport group */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS); + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + + MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_sqn); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_port); + + ix = nvports * MAX_SQ_NVPORTS + MAX_PF_SQ; + 
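	/*
	 * Illustrative note (added for clarity, not in the upstream driver):
	 * the slow-path FDB table is laid out as two flow groups.  Entries
	 * [0, ix - 1] form the send-to-vport group, sized for up to
	 * MAX_SQ_NVPORTS rules per vport plus MAX_PF_SQ rules for the PF and
	 * matching on source SQN + source port.  The remaining entries at
	 * the end of the table hold the unicast and multicast miss rules
	 * installed by esw_add_fdb_miss_rule().
	 */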
MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ix - 1); + + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create send-to-vport flow group err(%d)\n", err); + goto send_vport_err; + } + esw->fdb_table.offloads.send_to_vport_grp = g; + + /* create miss group */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria); + dmac = MLX5_ADDR_OF(fte_match_param, match_criteria, + outer_headers.dmac_47_16); + dmac[0] = 0x01; + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, ix); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ix + 2); + + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create miss flow group err(%d)\n", err); + goto miss_err; + } + esw->fdb_table.offloads.miss_grp = g; + + err = esw_add_fdb_miss_rule(esw); + if (err) + goto miss_rule_err; + + return 0; + +miss_rule_err: + mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp); +miss_err: + mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp); +send_vport_err: + mlx5_destroy_flow_table(esw->fdb_table.offloads.fdb); +slow_fdb_err: + mlx5_destroy_flow_table(esw->fdb_table.fdb); +fast_fdb_err: +ns_err: + kvfree(flow_group_in); + return err; +} + +static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw) +{ + if (!esw->fdb_table.fdb) + return; + + esw_debug(esw->dev, "Destroy offloads FDB Tables\n"); + mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_multi); + mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_uni); + mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp); + mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp); + + mlx5_destroy_flow_table(esw->fdb_table.offloads.fdb); + esw_destroy_offloads_fast_fdb_table(esw); +} + +static int esw_create_offloads_table(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_table *ft_offloads; + struct mlx5_flow_namespace *ns; + int err = 0; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_OFFLOADS); + if (!ns) { + esw_warn(esw->dev, "Failed to get offloads flow namespace\n"); + return -EOPNOTSUPP; + } + + ft_attr.max_fte = dev->priv.sriov.num_vfs + 2; + + ft_offloads = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft_offloads)) { + err = PTR_ERR(ft_offloads); + esw_warn(esw->dev, "Failed to create offloads table, err %d\n", err); + return err; + } + + esw->offloads.ft_offloads = ft_offloads; + return 0; +} + +static void esw_destroy_offloads_table(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_offload *offloads = &esw->offloads; + + mlx5_destroy_flow_table(offloads->ft_offloads); +} + +static int esw_create_vport_rx_group(struct mlx5_eswitch *esw) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *g; + struct mlx5_priv *priv = &esw->dev->priv; + u32 *flow_group_in; + void *match_criteria, *misc; + int err = 0; + int nvports = priv->sriov.num_vfs + 2; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + /* create vport rx group */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + 
MLX5_MATCH_MISC_PARAMETERS); + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + misc = MLX5_ADDR_OF(fte_match_param, match_criteria, misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, nvports - 1); + + g = mlx5_create_flow_group(esw->offloads.ft_offloads, flow_group_in); + + if (IS_ERR(g)) { + err = PTR_ERR(g); + mlx5_core_warn(esw->dev, "Failed to create vport rx group err %d\n", err); + goto out; + } + + esw->offloads.vport_rx_group = g; +out: + kfree(flow_group_in); + return err; +} + +static void esw_destroy_vport_rx_group(struct mlx5_eswitch *esw) +{ + mlx5_destroy_flow_group(esw->offloads.vport_rx_group); +} + +struct mlx5_flow_handle * +mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn) +{ + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_spec *spec; + void *misc; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + flow_rule = ERR_PTR(-ENOMEM); + goto out; + } + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_port, vport); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS; + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest.tir_num = tirn; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, spec, + &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + esw_warn(esw->dev, "fs offloads: Failed to add vport rx rule err %ld\n", PTR_ERR(flow_rule)); + goto out; + } + +out: + kvfree(spec); + return flow_rule; +} + +static int esw_offloads_start(struct mlx5_eswitch *esw) +{ + int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs; + + if (esw->mode != SRIOV_LEGACY) { + esw_warn(esw->dev, "Can't set offloads mode, SRIOV legacy not enabled\n"); + return -EINVAL; + } + + mlx5_eswitch_disable_sriov(esw); + err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS); + if (err) { + esw_warn(esw->dev, "Failed setting eswitch to offloads, err %d\n", err); + err1 = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY); + if (err1) + esw_warn(esw->dev, "Failed setting eswitch back to legacy, err %d\n", err1); + } + if (esw->offloads.inline_mode == MLX5_INLINE_MODE_NONE) { + if (mlx5_eswitch_inline_mode_get(esw, + num_vfs, + &esw->offloads.inline_mode)) { + esw->offloads.inline_mode = MLX5_INLINE_MODE_L2; + esw_warn(esw->dev, "Inline mode is different between vports\n"); + } + } + return err; +} + +void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw) +{ + kfree(esw->offloads.vport_reps); +} + +int esw_offloads_init_reps(struct mlx5_eswitch *esw) +{ + int total_vfs = MLX5_TOTAL_VPORTS(esw->dev); + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_esw_offload *offloads; + struct mlx5_eswitch_rep *rep; + u8 hw_id[ETH_ALEN]; + int vport; + + esw->offloads.vport_reps = kcalloc(total_vfs, + sizeof(struct mlx5_eswitch_rep), + GFP_KERNEL); + if (!esw->offloads.vport_reps) + return -ENOMEM; + + offloads = &esw->offloads; + mlx5_query_nic_vport_mac_address(dev, 0, hw_id); + + for (vport = 0; vport < total_vfs; vport++) { + rep = &offloads->vport_reps[vport]; + + rep->vport = vport; + 
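	/*
	 * Illustrative note (added for clarity, not in the upstream driver):
	 * slot 0 is initialized here like any other representor but is
	 * reused for the uplink; its vport number is overwritten to
	 * FDB_UPLINK_VPORT right after this loop, and
	 * mlx5_eswitch_get_uplink_priv() later reaches it through
	 * UPLINK_REP_INDEX (0).
	 */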
ether_addr_copy(rep->hw_id, hw_id); + } + + offloads->vport_reps[0].vport = FDB_UPLINK_VPORT; + + return 0; +} + +static void esw_offloads_unload_reps_type(struct mlx5_eswitch *esw, int nvports, + u8 rep_type) +{ + struct mlx5_eswitch_rep *rep; + int vport; + + for (vport = nvports - 1; vport >= 0; vport--) { + rep = &esw->offloads.vport_reps[vport]; + if (!rep->rep_if[rep_type].valid) + continue; + + rep->rep_if[rep_type].unload(rep); + } +} + +static void esw_offloads_unload_reps(struct mlx5_eswitch *esw, int nvports) +{ + u8 rep_type = NUM_REP_TYPES; + + while (rep_type-- > 0) + esw_offloads_unload_reps_type(esw, nvports, rep_type); +} + +static int esw_offloads_load_reps_type(struct mlx5_eswitch *esw, int nvports, + u8 rep_type) +{ + struct mlx5_eswitch_rep *rep; + int vport; + int err; + + for (vport = 0; vport < nvports; vport++) { + rep = &esw->offloads.vport_reps[vport]; + if (!rep->rep_if[rep_type].valid) + continue; + + err = rep->rep_if[rep_type].load(esw->dev, rep); + if (err) + goto err_reps; + } + + return 0; + +err_reps: + esw_offloads_unload_reps_type(esw, vport, rep_type); + return err; +} + +static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports) +{ + u8 rep_type = 0; + int err; + + for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) { + err = esw_offloads_load_reps_type(esw, nvports, rep_type); + if (err) + goto err_reps; + } + + return err; + +err_reps: + while (rep_type-- > 0) + esw_offloads_unload_reps_type(esw, nvports, rep_type); + return err; +} + +int esw_offloads_init(struct mlx5_eswitch *esw, int nvports) +{ + int err; + + err = esw_create_offloads_fdb_tables(esw, nvports); + if (err) + return err; + + err = esw_create_offloads_table(esw); + if (err) + goto create_ft_err; + + err = esw_create_vport_rx_group(esw); + if (err) + goto create_fg_err; + + err = esw_offloads_load_reps(esw, nvports); + if (err) + goto err_reps; + + return 0; + +err_reps: + esw_destroy_vport_rx_group(esw); + +create_fg_err: + esw_destroy_offloads_table(esw); + +create_ft_err: + esw_destroy_offloads_fdb_tables(esw); + + return err; +} + +static int esw_offloads_stop(struct mlx5_eswitch *esw) +{ + int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs; + + mlx5_eswitch_disable_sriov(esw); + err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY); + if (err) { + esw_warn(esw->dev, "Failed setting eswitch to legacy, err %d\n", err); + err1 = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS); + if (err1) + esw_warn(esw->dev, "Failed setting eswitch back to offloads, err %d\n", err); + } + + /* enable back PF RoCE */ + mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB); + + return err; +} + +void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports) +{ + esw_offloads_unload_reps(esw, nvports); + esw_destroy_vport_rx_group(esw); + esw_destroy_offloads_table(esw); + esw_destroy_offloads_fdb_tables(esw); +} + +static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode) +{ + switch (mode) { + case DEVLINK_ESWITCH_MODE_LEGACY: + *mlx5_mode = SRIOV_LEGACY; + break; + case DEVLINK_ESWITCH_MODE_SWITCHDEV: + *mlx5_mode = SRIOV_OFFLOADS; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int esw_mode_to_devlink(u16 mlx5_mode, u16 *mode) +{ + switch (mlx5_mode) { + case SRIOV_LEGACY: + *mode = DEVLINK_ESWITCH_MODE_LEGACY; + break; + case SRIOV_OFFLOADS: + *mode = DEVLINK_ESWITCH_MODE_SWITCHDEV; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int esw_inline_mode_from_devlink(u8 mode, u8 *mlx5_mode) +{ + switch (mode) { + 
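	/*
	 * Illustrative note (added for clarity, not in the upstream driver):
	 * the devlink inline modes map one-to-one onto the mlx5 WQE inline
	 * modes, i.e. how much of the packet headers the driver must copy
	 * inline into the send WQE so the HW can parse and steer it: none,
	 * up to L2, up to the IP header, or up to the TCP/UDP header.
	 */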
case DEVLINK_ESWITCH_INLINE_MODE_NONE: + *mlx5_mode = MLX5_INLINE_MODE_NONE; + break; + case DEVLINK_ESWITCH_INLINE_MODE_LINK: + *mlx5_mode = MLX5_INLINE_MODE_L2; + break; + case DEVLINK_ESWITCH_INLINE_MODE_NETWORK: + *mlx5_mode = MLX5_INLINE_MODE_IP; + break; + case DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT: + *mlx5_mode = MLX5_INLINE_MODE_TCP_UDP; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int esw_inline_mode_to_devlink(u8 mlx5_mode, u8 *mode) +{ + switch (mlx5_mode) { + case MLX5_INLINE_MODE_NONE: + *mode = DEVLINK_ESWITCH_INLINE_MODE_NONE; + break; + case MLX5_INLINE_MODE_L2: + *mode = DEVLINK_ESWITCH_INLINE_MODE_LINK; + break; + case MLX5_INLINE_MODE_IP: + *mode = DEVLINK_ESWITCH_INLINE_MODE_NETWORK; + break; + case MLX5_INLINE_MODE_TCP_UDP: + *mode = DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_devlink_eswitch_check(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return -EOPNOTSUPP; + + if (!MLX5_CAP_GEN(dev, vport_group_manager)) + return -EOPNOTSUPP; + + if (dev->priv.eswitch->mode == SRIOV_NONE) + return -EOPNOTSUPP; + + return 0; +} + +int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u16 cur_mlx5_mode, mlx5_mode = 0; + int err; + + err = mlx5_devlink_eswitch_check(devlink); + if (err) + return err; + + cur_mlx5_mode = dev->priv.eswitch->mode; + + if (esw_mode_from_devlink(mode, &mlx5_mode)) + return -EINVAL; + + if (cur_mlx5_mode == mlx5_mode) + return 0; + + if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) + return esw_offloads_start(dev->priv.eswitch); + else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) + return esw_offloads_stop(dev->priv.eswitch); + else + return -EINVAL; +} + +int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + int err; + + err = mlx5_devlink_eswitch_check(devlink); + if (err) + return err; + + return esw_mode_to_devlink(dev->priv.eswitch->mode, mode); +} + +int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw = dev->priv.eswitch; + int err, vport; + u8 mlx5_mode; + + err = mlx5_devlink_eswitch_check(devlink); + if (err) + return err; + + switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) { + case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: + if (mode == DEVLINK_ESWITCH_INLINE_MODE_NONE) + return 0; + /* fall through */ + case MLX5_CAP_INLINE_MODE_L2: + esw_warn(dev, "Inline mode can't be set\n"); + return -EOPNOTSUPP; + case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: + break; + } + + if (esw->offloads.num_flows > 0) { + esw_warn(dev, "Can't set inline mode when flows are configured\n"); + return -EOPNOTSUPP; + } + + err = esw_inline_mode_from_devlink(mode, &mlx5_mode); + if (err) + goto out; + + for (vport = 1; vport < esw->enabled_vports; vport++) { + err = mlx5_modify_nic_vport_min_inline(dev, vport, mlx5_mode); + if (err) { + esw_warn(dev, "Failed to set min inline on vport %d\n", + vport); + goto revert_inline_mode; + } + } + + esw->offloads.inline_mode = mlx5_mode; + return 0; + +revert_inline_mode: + while (--vport > 0) + mlx5_modify_nic_vport_min_inline(dev, + vport, + esw->offloads.inline_mode); +out: + return err; +} + +int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode) +{ + struct mlx5_core_dev *dev = 
devlink_priv(devlink); + struct mlx5_eswitch *esw = dev->priv.eswitch; + int err; + + err = mlx5_devlink_eswitch_check(devlink); + if (err) + return err; + + return esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode); +} + +int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode) +{ + u8 prev_mlx5_mode, mlx5_mode = MLX5_INLINE_MODE_L2; + struct mlx5_core_dev *dev = esw->dev; + int vport; + + if (!MLX5_CAP_GEN(dev, vport_group_manager)) + return -EOPNOTSUPP; + + if (esw->mode == SRIOV_NONE) + return -EOPNOTSUPP; + + switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) { + case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: + mlx5_mode = MLX5_INLINE_MODE_NONE; + goto out; + case MLX5_CAP_INLINE_MODE_L2: + mlx5_mode = MLX5_INLINE_MODE_L2; + goto out; + case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: + goto query_vports; + } + +query_vports: + for (vport = 1; vport <= nvfs; vport++) { + mlx5_query_nic_vport_min_inline(dev, vport, &mlx5_mode); + if (vport > 1 && prev_mlx5_mode != mlx5_mode) + return -EINVAL; + prev_mlx5_mode = mlx5_mode; + } + +out: + *mode = mlx5_mode; + return 0; +} + +int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw = dev->priv.eswitch; + int err; + + err = mlx5_devlink_eswitch_check(devlink); + if (err) + return err; + + if (encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE && + (!MLX5_CAP_ESW_FLOWTABLE_FDB(dev, encap) || + !MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap))) + return -EOPNOTSUPP; + + if (encap && encap != DEVLINK_ESWITCH_ENCAP_MODE_BASIC) + return -EOPNOTSUPP; + + if (esw->mode == SRIOV_LEGACY) { + esw->offloads.encap = encap; + return 0; + } + + if (esw->offloads.encap == encap) + return 0; + + if (esw->offloads.num_flows > 0) { + esw_warn(dev, "Can't set encapsulation when flows are configured\n"); + return -EOPNOTSUPP; + } + + esw_destroy_offloads_fast_fdb_table(esw); + + esw->offloads.encap = encap; + err = esw_create_offloads_fast_fdb_table(esw); + if (err) { + esw_warn(esw->dev, "Failed re-creating fast FDB table, err %d\n", err); + esw->offloads.encap = !encap; + (void)esw_create_offloads_fast_fdb_table(esw); + } + return err; +} + +int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw = dev->priv.eswitch; + int err; + + err = mlx5_devlink_eswitch_check(devlink); + if (err) + return err; + + *encap = esw->offloads.encap; + return 0; +} + +void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw, + int vport_index, + struct mlx5_eswitch_rep_if *__rep_if, + u8 rep_type) +{ + struct mlx5_esw_offload *offloads = &esw->offloads; + struct mlx5_eswitch_rep_if *rep_if; + + rep_if = &offloads->vport_reps[vport_index].rep_if[rep_type]; + + rep_if->load = __rep_if->load; + rep_if->unload = __rep_if->unload; + rep_if->get_proto_dev = __rep_if->get_proto_dev; + rep_if->priv = __rep_if->priv; + + rep_if->valid = true; +} +EXPORT_SYMBOL(mlx5_eswitch_register_vport_rep); + +void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw, + int vport_index, u8 rep_type) +{ + struct mlx5_esw_offload *offloads = &esw->offloads; + struct mlx5_eswitch_rep *rep; + + rep = &offloads->vport_reps[vport_index]; + + if (esw->mode == SRIOV_OFFLOADS && esw->vports[vport_index].enabled) + rep->rep_if[rep_type].unload(rep); + + rep->rep_if[rep_type].valid = false; +} +EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_rep); + +void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 
rep_type) +{ +#define UPLINK_REP_INDEX 0 + struct mlx5_esw_offload *offloads = &esw->offloads; + struct mlx5_eswitch_rep *rep; + + rep = &offloads->vport_reps[UPLINK_REP_INDEX]; + return rep->rep_if[rep_type].priv; +} + +void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw, + int vport, + u8 rep_type) +{ + struct mlx5_esw_offload *offloads = &esw->offloads; + struct mlx5_eswitch_rep *rep; + + if (vport == FDB_UPLINK_VPORT) + vport = UPLINK_REP_INDEX; + + rep = &offloads->vport_reps[vport]; + + if (rep->rep_if[rep_type].valid && + rep->rep_if[rep_type].get_proto_dev) + return rep->rep_if[rep_type].get_proto_dev(rep); + return NULL; +} +EXPORT_SYMBOL(mlx5_eswitch_get_proto_dev); + +void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type) +{ + return mlx5_eswitch_get_proto_dev(esw, UPLINK_REP_INDEX, rep_type); +} +EXPORT_SYMBOL(mlx5_eswitch_uplink_get_proto_dev); + +struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw, + int vport) +{ + return &esw->offloads.vport_reps[vport]; +} +EXPORT_SYMBOL(mlx5_eswitch_vport_rep); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile new file mode 100644 index 0000000..d8e1711 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile @@ -0,0 +1 @@ +subdir-ccflags-y += -I$(src)/.. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c new file mode 100644 index 0000000..c0fd221 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include + +#include "mlx5_core.h" +#include "fpga/cmd.h" + +#define MLX5_FPGA_ACCESS_REG_SZ (MLX5_ST_SZ_DW(fpga_access_reg) + \ + MLX5_FPGA_ACCESS_REG_SIZE_MAX) + +int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr, + void *buf, bool write) +{ + u32 in[MLX5_FPGA_ACCESS_REG_SZ] = {0}; + u32 out[MLX5_FPGA_ACCESS_REG_SZ]; + int err; + + if (size & 3) + return -EINVAL; + if (addr & 3) + return -EINVAL; + if (size > MLX5_FPGA_ACCESS_REG_SIZE_MAX) + return -EINVAL; + + MLX5_SET(fpga_access_reg, in, size, size); + MLX5_SET64(fpga_access_reg, in, address, addr); + if (write) + memcpy(MLX5_ADDR_OF(fpga_access_reg, in, data), buf, size); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_FPGA_ACCESS_REG, 0, write); + if (err) + return err; + + if (!write) + memcpy(buf, MLX5_ADDR_OF(fpga_access_reg, out, data), size); + + return 0; +} + +int mlx5_fpga_caps(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(fpga_cap)] = {0}; + + return mlx5_core_access_reg(dev, in, sizeof(in), dev->caps.fpga, + MLX5_ST_SZ_BYTES(fpga_cap), + MLX5_REG_FPGA_CAP, 0, 0); +} + +int mlx5_fpga_ctrl_op(struct mlx5_core_dev *dev, u8 op) +{ + u32 in[MLX5_ST_SZ_DW(fpga_ctrl)] = {0}; + u32 out[MLX5_ST_SZ_DW(fpga_ctrl)]; + + MLX5_SET(fpga_ctrl, in, operation, op); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_FPGA_CTRL, 0, true); +} + +int mlx5_fpga_sbu_caps(struct mlx5_core_dev *dev, void *caps, int size) +{ + unsigned int cap_size = MLX5_CAP_FPGA(dev, sandbox_extended_caps_len); + u64 addr = MLX5_CAP64_FPGA(dev, sandbox_extended_caps_addr); + unsigned int read; + int ret = 0; + + if (cap_size > size) { + mlx5_core_warn(dev, "Not enough buffer %u for FPGA SBU caps %u", + size, cap_size); + return -EINVAL; + } + + while (cap_size > 0) { + read = min_t(unsigned int, cap_size, + MLX5_FPGA_ACCESS_REG_SIZE_MAX); + + ret = mlx5_fpga_access_reg(dev, read, addr, caps, false); + if (ret) { + mlx5_core_warn(dev, "Error reading FPGA SBU caps %u bytes at address 0x%llx: %d", + read, addr, ret); + return ret; + } + + cap_size -= read; + addr += read; + caps += read; + } + + return ret; +} + +int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query) +{ + u32 in[MLX5_ST_SZ_DW(fpga_ctrl)] = {0}; + u32 out[MLX5_ST_SZ_DW(fpga_ctrl)]; + int err; + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_FPGA_CTRL, 0, false); + if (err) + return err; + + query->status = MLX5_GET(fpga_ctrl, out, status); + query->admin_image = MLX5_GET(fpga_ctrl, out, flash_select_admin); + query->oper_image = MLX5_GET(fpga_ctrl, out, flash_select_oper); + return 0; +} + +int mlx5_fpga_create_qp(struct mlx5_core_dev *dev, void *fpga_qpc, + u32 *fpga_qpn) +{ + u32 in[MLX5_ST_SZ_DW(fpga_create_qp_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(fpga_create_qp_out)]; + int ret; + + MLX5_SET(fpga_create_qp_in, in, opcode, MLX5_CMD_OP_FPGA_CREATE_QP); + memcpy(MLX5_ADDR_OF(fpga_create_qp_in, in, fpga_qpc), fpga_qpc, + MLX5_FLD_SZ_BYTES(fpga_create_qp_in, fpga_qpc)); + + ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (ret) + return ret; + + memcpy(fpga_qpc, MLX5_ADDR_OF(fpga_create_qp_out, out, fpga_qpc), + MLX5_FLD_SZ_BYTES(fpga_create_qp_out, fpga_qpc)); + *fpga_qpn = MLX5_GET(fpga_create_qp_out, out, fpga_qpn); + return ret; +} + +int mlx5_fpga_modify_qp(struct mlx5_core_dev *dev, u32 fpga_qpn, + enum mlx5_fpga_qpc_field_select fields, + void *fpga_qpc) +{ + u32 in[MLX5_ST_SZ_DW(fpga_modify_qp_in)] = 
{0}; + u32 out[MLX5_ST_SZ_DW(fpga_modify_qp_out)]; + + MLX5_SET(fpga_modify_qp_in, in, opcode, MLX5_CMD_OP_FPGA_MODIFY_QP); + MLX5_SET(fpga_modify_qp_in, in, field_select, fields); + MLX5_SET(fpga_modify_qp_in, in, fpga_qpn, fpga_qpn); + memcpy(MLX5_ADDR_OF(fpga_modify_qp_in, in, fpga_qpc), fpga_qpc, + MLX5_FLD_SZ_BYTES(fpga_modify_qp_in, fpga_qpc)); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_fpga_query_qp(struct mlx5_core_dev *dev, + u32 fpga_qpn, void *fpga_qpc) +{ + u32 in[MLX5_ST_SZ_DW(fpga_query_qp_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(fpga_query_qp_out)]; + int ret; + + MLX5_SET(fpga_query_qp_in, in, opcode, MLX5_CMD_OP_FPGA_QUERY_QP); + MLX5_SET(fpga_query_qp_in, in, fpga_qpn, fpga_qpn); + + ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (ret) + return ret; + + memcpy(fpga_qpc, MLX5_ADDR_OF(fpga_query_qp_out, out, fpga_qpc), + MLX5_FLD_SZ_BYTES(fpga_query_qp_out, fpga_qpc)); + return ret; +} + +int mlx5_fpga_destroy_qp(struct mlx5_core_dev *dev, u32 fpga_qpn) +{ + u32 in[MLX5_ST_SZ_DW(fpga_destroy_qp_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(fpga_destroy_qp_out)]; + + MLX5_SET(fpga_destroy_qp_in, in, opcode, MLX5_CMD_OP_FPGA_DESTROY_QP); + MLX5_SET(fpga_destroy_qp_in, in, fpga_qpn, fpga_qpn); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_fpga_query_qp_counters(struct mlx5_core_dev *dev, u32 fpga_qpn, + bool clear, struct mlx5_fpga_qp_counters *data) +{ + u32 in[MLX5_ST_SZ_DW(fpga_query_qp_counters_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(fpga_query_qp_counters_out)]; + int ret; + + MLX5_SET(fpga_query_qp_counters_in, in, opcode, + MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS); + MLX5_SET(fpga_query_qp_counters_in, in, clear, clear); + MLX5_SET(fpga_query_qp_counters_in, in, fpga_qpn, fpga_qpn); + + ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (ret) + return ret; + + data->rx_ack_packets = MLX5_GET64(fpga_query_qp_counters_out, out, + rx_ack_packets); + data->rx_send_packets = MLX5_GET64(fpga_query_qp_counters_out, out, + rx_send_packets); + data->tx_ack_packets = MLX5_GET64(fpga_query_qp_counters_out, out, + tx_ack_packets); + data->tx_send_packets = MLX5_GET64(fpga_query_qp_counters_out, out, + tx_send_packets); + data->rx_total_drop = MLX5_GET64(fpga_query_qp_counters_out, out, + rx_total_drop); + + return ret; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h new file mode 100644 index 0000000..d05233c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_FPGA_H__ +#define __MLX5_FPGA_H__ + +#include + +enum mlx5_fpga_image { + MLX5_FPGA_IMAGE_USER = 0, + MLX5_FPGA_IMAGE_FACTORY, +}; + +enum mlx5_fpga_status { + MLX5_FPGA_STATUS_SUCCESS = 0, + MLX5_FPGA_STATUS_FAILURE = 1, + MLX5_FPGA_STATUS_IN_PROGRESS = 2, + MLX5_FPGA_STATUS_NONE = 0xFFFF, +}; + +struct mlx5_fpga_query { + enum mlx5_fpga_image admin_image; + enum mlx5_fpga_image oper_image; + enum mlx5_fpga_status status; +}; + +enum mlx5_fpga_qpc_field_select { + MLX5_FPGA_QPC_STATE = BIT(0), +}; + +struct mlx5_fpga_qp_counters { + u64 rx_ack_packets; + u64 rx_send_packets; + u64 tx_ack_packets; + u64 tx_send_packets; + u64 rx_total_drop; +}; + +int mlx5_fpga_caps(struct mlx5_core_dev *dev); +int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query); +int mlx5_fpga_ctrl_op(struct mlx5_core_dev *dev, u8 op); +int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr, + void *buf, bool write); +int mlx5_fpga_sbu_caps(struct mlx5_core_dev *dev, void *caps, int size); + +int mlx5_fpga_create_qp(struct mlx5_core_dev *dev, void *fpga_qpc, + u32 *fpga_qpn); +int mlx5_fpga_modify_qp(struct mlx5_core_dev *dev, u32 fpga_qpn, + enum mlx5_fpga_qpc_field_select fields, void *fpga_qpc); +int mlx5_fpga_query_qp(struct mlx5_core_dev *dev, u32 fpga_qpn, void *fpga_qpc); +int mlx5_fpga_query_qp_counters(struct mlx5_core_dev *dev, u32 fpga_qpn, + bool clear, struct mlx5_fpga_qp_counters *data); +int mlx5_fpga_destroy_qp(struct mlx5_core_dev *dev, u32 fpga_qpn); + +#endif /* __MLX5_FPGA_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c new file mode 100644 index 0000000..de7fe08 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -0,0 +1,1043 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include + +#include "mlx5_core.h" +#include "lib/mlx5.h" +#include "fpga/conn.h" + +#define MLX5_FPGA_PKEY 0xFFFF +#define MLX5_FPGA_PKEY_INDEX 0 /* RoCE PKEY 0xFFFF is always at index 0 */ +#define MLX5_FPGA_RECV_SIZE 2048 +#define MLX5_FPGA_PORT_NUM 1 +#define MLX5_FPGA_CQ_BUDGET 64 + +static int mlx5_fpga_conn_map_buf(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + struct device *dma_device; + int err = 0; + + if (unlikely(!buf->sg[0].data)) + goto out; + + dma_device = &conn->fdev->mdev->pdev->dev; + buf->sg[0].dma_addr = dma_map_single(dma_device, buf->sg[0].data, + buf->sg[0].size, buf->dma_dir); + err = dma_mapping_error(dma_device, buf->sg[0].dma_addr); + if (unlikely(err)) { + mlx5_fpga_warn(conn->fdev, "DMA error on sg 0: %d\n", err); + err = -ENOMEM; + goto out; + } + + if (!buf->sg[1].data) + goto out; + + buf->sg[1].dma_addr = dma_map_single(dma_device, buf->sg[1].data, + buf->sg[1].size, buf->dma_dir); + err = dma_mapping_error(dma_device, buf->sg[1].dma_addr); + if (unlikely(err)) { + mlx5_fpga_warn(conn->fdev, "DMA error on sg 1: %d\n", err); + dma_unmap_single(dma_device, buf->sg[0].dma_addr, + buf->sg[0].size, buf->dma_dir); + err = -ENOMEM; + } + +out: + return err; +} + +static void mlx5_fpga_conn_unmap_buf(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + struct device *dma_device; + + dma_device = &conn->fdev->mdev->pdev->dev; + if (buf->sg[1].data) + dma_unmap_single(dma_device, buf->sg[1].dma_addr, + buf->sg[1].size, buf->dma_dir); + + if (likely(buf->sg[0].data)) + dma_unmap_single(dma_device, buf->sg[0].dma_addr, + buf->sg[0].size, buf->dma_dir); +} + +static int mlx5_fpga_conn_post_recv(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + struct mlx5_wqe_data_seg *data; + unsigned int ix; + int err = 0; + + err = mlx5_fpga_conn_map_buf(conn, buf); + if (unlikely(err)) + goto out; + + if (unlikely(conn->qp.rq.pc - conn->qp.rq.cc >= conn->qp.rq.size)) { + mlx5_fpga_conn_unmap_buf(conn, buf); + return -EBUSY; + } + + ix = conn->qp.rq.pc & (conn->qp.rq.size - 1); + data = mlx5_wq_cyc_get_wqe(&conn->qp.wq.rq, ix); + data->byte_count = cpu_to_be32(buf->sg[0].size); + data->lkey = cpu_to_be32(conn->fdev->conn_res.mkey.key); + data->addr = cpu_to_be64(buf->sg[0].dma_addr); + + conn->qp.rq.pc++; + conn->qp.rq.bufs[ix] = buf; + + /* Make sure that descriptors are written before doorbell record. 
*/ + dma_wmb(); + *conn->qp.wq.rq.db = cpu_to_be32(conn->qp.rq.pc & 0xffff); +out: + return err; +} + +static void mlx5_fpga_conn_notify_hw(struct mlx5_fpga_conn *conn, void *wqe) +{ + /* ensure wqe is visible to device before updating doorbell record */ + dma_wmb(); + *conn->qp.wq.sq.db = cpu_to_be32(conn->qp.sq.pc); + /* Make sure that doorbell record is visible before ringing */ + wmb(); + mlx5_write64(wqe, conn->fdev->conn_res.uar->map + MLX5_BF_OFFSET, NULL); +} + +static void mlx5_fpga_conn_post_send(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + struct mlx5_wqe_ctrl_seg *ctrl; + struct mlx5_wqe_data_seg *data; + unsigned int ix, sgi; + int size = 1; + + ix = conn->qp.sq.pc & (conn->qp.sq.size - 1); + + ctrl = mlx5_wq_cyc_get_wqe(&conn->qp.wq.sq, ix); + data = (void *)(ctrl + 1); + + for (sgi = 0; sgi < ARRAY_SIZE(buf->sg); sgi++) { + if (!buf->sg[sgi].data) + break; + data->byte_count = cpu_to_be32(buf->sg[sgi].size); + data->lkey = cpu_to_be32(conn->fdev->conn_res.mkey.key); + data->addr = cpu_to_be64(buf->sg[sgi].dma_addr); + data++; + size++; + } + + ctrl->imm = 0; + ctrl->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + ctrl->opmod_idx_opcode = cpu_to_be32(((conn->qp.sq.pc & 0xffff) << 8) | + MLX5_OPCODE_SEND); + ctrl->qpn_ds = cpu_to_be32(size | (conn->qp.mqp.qpn << 8)); + + conn->qp.sq.pc++; + conn->qp.sq.bufs[ix] = buf; + mlx5_fpga_conn_notify_hw(conn, ctrl); +} + +int mlx5_fpga_conn_send(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + unsigned long flags; + int err; + + if (!conn->qp.active) + return -ENOTCONN; + + err = mlx5_fpga_conn_map_buf(conn, buf); + if (err) + return err; + + spin_lock_irqsave(&conn->qp.sq.lock, flags); + + if (conn->qp.sq.pc - conn->qp.sq.cc >= conn->qp.sq.size) { + list_add_tail(&buf->list, &conn->qp.sq.backlog); + goto out_unlock; + } + + mlx5_fpga_conn_post_send(conn, buf); + +out_unlock: + spin_unlock_irqrestore(&conn->qp.sq.lock, flags); + return err; +} + +static int mlx5_fpga_conn_post_recv_buf(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_dma_buf *buf; + int err; + + buf = kzalloc(sizeof(*buf) + MLX5_FPGA_RECV_SIZE, 0); + if (!buf) + return -ENOMEM; + + buf->sg[0].data = (void *)(buf + 1); + buf->sg[0].size = MLX5_FPGA_RECV_SIZE; + buf->dma_dir = DMA_FROM_DEVICE; + + err = mlx5_fpga_conn_post_recv(conn, buf); + if (err) + kfree(buf); + + return err; +} + +static int mlx5_fpga_conn_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, + struct mlx5_core_mkey *mkey) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(mdev, mkey, in, inlen); + + kvfree(in); + return err; +} + +static void mlx5_fpga_conn_rq_cqe(struct mlx5_fpga_conn *conn, + struct mlx5_cqe64 *cqe, u8 status) +{ + struct mlx5_fpga_dma_buf *buf; + int ix, err; + + ix = be16_to_cpu(cqe->wqe_counter) & (conn->qp.rq.size - 1); + buf = conn->qp.rq.bufs[ix]; + conn->qp.rq.bufs[ix] = NULL; + if (!status) + buf->sg[0].size = be32_to_cpu(cqe->byte_cnt); + conn->qp.rq.cc++; + + if (unlikely(status && (status != MLX5_CQE_SYNDROME_WR_FLUSH_ERR))) + mlx5_fpga_warn(conn->fdev, "RQ buf %p on FPGA QP %u completion status %d\n", + buf, 
conn->fpga_qpn, status); + else + mlx5_fpga_dbg(conn->fdev, "RQ buf %p on FPGA QP %u completion status %d\n", + buf, conn->fpga_qpn, status); + + mlx5_fpga_conn_unmap_buf(conn, buf); + + if (unlikely(status || !conn->qp.active)) { + conn->qp.active = false; + kfree(buf); + return; + } + + mlx5_fpga_dbg(conn->fdev, "Message with %u bytes received successfully\n", + buf->sg[0].size); + conn->recv_cb(conn->cb_arg, buf); + + buf->sg[0].size = MLX5_FPGA_RECV_SIZE; + err = mlx5_fpga_conn_post_recv(conn, buf); + if (unlikely(err)) { + mlx5_fpga_warn(conn->fdev, + "Failed to re-post recv buf: %d\n", err); + kfree(buf); + } +} + +static void mlx5_fpga_conn_sq_cqe(struct mlx5_fpga_conn *conn, + struct mlx5_cqe64 *cqe, u8 status) +{ + struct mlx5_fpga_dma_buf *buf, *nextbuf; + unsigned long flags; + int ix; + + spin_lock_irqsave(&conn->qp.sq.lock, flags); + + ix = be16_to_cpu(cqe->wqe_counter) & (conn->qp.sq.size - 1); + buf = conn->qp.sq.bufs[ix]; + conn->qp.sq.bufs[ix] = NULL; + conn->qp.sq.cc++; + + /* Handle backlog still under the spinlock to ensure message post order */ + if (unlikely(!list_empty(&conn->qp.sq.backlog))) { + if (likely(conn->qp.active)) { + nextbuf = list_first_entry(&conn->qp.sq.backlog, + struct mlx5_fpga_dma_buf, list); + list_del(&nextbuf->list); + mlx5_fpga_conn_post_send(conn, nextbuf); + } + } + + spin_unlock_irqrestore(&conn->qp.sq.lock, flags); + + if (unlikely(status && (status != MLX5_CQE_SYNDROME_WR_FLUSH_ERR))) + mlx5_fpga_warn(conn->fdev, "SQ buf %p on FPGA QP %u completion status %d\n", + buf, conn->fpga_qpn, status); + else + mlx5_fpga_dbg(conn->fdev, "SQ buf %p on FPGA QP %u completion status %d\n", + buf, conn->fpga_qpn, status); + + mlx5_fpga_conn_unmap_buf(conn, buf); + + if (likely(buf->complete)) + buf->complete(conn, conn->fdev, buf, status); + + if (unlikely(status)) + conn->qp.active = false; +} + +static void mlx5_fpga_conn_handle_cqe(struct mlx5_fpga_conn *conn, + struct mlx5_cqe64 *cqe) +{ + u8 opcode, status = 0; + + opcode = cqe->op_own >> 4; + + switch (opcode) { + case MLX5_CQE_REQ_ERR: + status = ((struct mlx5_err_cqe *)cqe)->syndrome; + /* Fall through */ + case MLX5_CQE_REQ: + mlx5_fpga_conn_sq_cqe(conn, cqe, status); + break; + + case MLX5_CQE_RESP_ERR: + status = ((struct mlx5_err_cqe *)cqe)->syndrome; + /* Fall through */ + case MLX5_CQE_RESP_SEND: + mlx5_fpga_conn_rq_cqe(conn, cqe, status); + break; + default: + mlx5_fpga_warn(conn->fdev, "Unexpected cqe opcode %u\n", + opcode); + } +} + +static void mlx5_fpga_conn_arm_cq(struct mlx5_fpga_conn *conn) +{ + mlx5_cq_arm(&conn->cq.mcq, MLX5_CQ_DB_REQ_NOT, + conn->fdev->conn_res.uar->map, conn->cq.wq.cc); +} + +static void mlx5_fpga_conn_cq_event(struct mlx5_core_cq *mcq, + enum mlx5_event event) +{ + struct mlx5_fpga_conn *conn; + + conn = container_of(mcq, struct mlx5_fpga_conn, cq.mcq); + mlx5_fpga_warn(conn->fdev, "CQ event %u on CQ #%u\n", event, mcq->cqn); +} + +static void mlx5_fpga_conn_event(struct mlx5_core_qp *mqp, int event) +{ + struct mlx5_fpga_conn *conn; + + conn = container_of(mqp, struct mlx5_fpga_conn, qp.mqp); + mlx5_fpga_warn(conn->fdev, "QP event %u on QP #%u\n", event, mqp->qpn); +} + +static inline void mlx5_fpga_conn_cqes(struct mlx5_fpga_conn *conn, + unsigned int budget) +{ + struct mlx5_cqe64 *cqe; + + while (budget) { + cqe = mlx5_cqwq_get_cqe(&conn->cq.wq); + if (!cqe) + break; + + budget--; + mlx5_cqwq_pop(&conn->cq.wq); + mlx5_fpga_conn_handle_cqe(conn, cqe); + mlx5_cqwq_update_db_record(&conn->cq.wq); + } + if (!budget) { + tasklet_schedule(&conn->cq.tasklet); + 
return; + } + + mlx5_fpga_dbg(conn->fdev, "Re-arming CQ with cc# %u\n", conn->cq.wq.cc); + /* ensure cq space is freed before enabling more cqes */ + wmb(); + mlx5_fpga_conn_arm_cq(conn); +} + +static void mlx5_fpga_conn_cq_tasklet(unsigned long data) +{ + struct mlx5_fpga_conn *conn = (void *)data; + + if (unlikely(!conn->qp.active)) + return; + mlx5_fpga_conn_cqes(conn, MLX5_FPGA_CQ_BUDGET); +} + +static void mlx5_fpga_conn_cq_complete(struct mlx5_core_cq *mcq) +{ + struct mlx5_fpga_conn *conn; + + conn = container_of(mcq, struct mlx5_fpga_conn, cq.mcq); + if (unlikely(!conn->qp.active)) + return; + mlx5_fpga_conn_cqes(conn, MLX5_FPGA_CQ_BUDGET); +} + +static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {0}; + struct mlx5_wq_param wqp; + struct mlx5_cqe64 *cqe; + int inlen, err, eqn; + unsigned int irqn; + void *cqc, *in; + __be64 *pas; + u32 i; + + cq_size = roundup_pow_of_two(cq_size); + MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(cq_size)); + + wqp.buf_numa_node = mdev->priv.numa_node; + wqp.db_numa_node = mdev->priv.numa_node; + + err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &conn->cq.wq, + &conn->cq.wq_ctrl); + if (err) + return err; + + for (i = 0; i < mlx5_cqwq_get_size(&conn->cq.wq); i++) { + cqe = mlx5_cqwq_get_wqe(&conn->cq.wq, i); + cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK; + } + + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + sizeof(u64) * conn->cq.wq_ctrl.frag_buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_cqwq; + } + + err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn, &irqn); + if (err) + goto err_cqwq; + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(cq_size)); + MLX5_SET(cqc, cqc, c_eqn, eqn); + MLX5_SET(cqc, cqc, uar_page, fdev->conn_res.uar->index); + MLX5_SET(cqc, cqc, log_page_size, conn->cq.wq_ctrl.frag_buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(cqc, cqc, dbr_addr, conn->cq.wq_ctrl.db.dma); + + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); + mlx5_fill_page_frag_array(&conn->cq.wq_ctrl.frag_buf, pas); + + err = mlx5_core_create_cq(mdev, &conn->cq.mcq, in, inlen); + kvfree(in); + + if (err) + goto err_cqwq; + + conn->cq.mcq.cqe_sz = 64; + conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db; + conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1; + *conn->cq.mcq.set_ci_db = 0; + *conn->cq.mcq.arm_db = 0; + conn->cq.mcq.vector = 0; + conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; + conn->cq.mcq.event = mlx5_fpga_conn_cq_event; + conn->cq.mcq.irqn = irqn; + conn->cq.mcq.uar = fdev->conn_res.uar; + tasklet_init(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet, + (unsigned long)conn); + + mlx5_fpga_dbg(fdev, "Created CQ #0x%x\n", conn->cq.mcq.cqn); + + goto out; + +err_cqwq: + mlx5_cqwq_destroy(&conn->cq.wq_ctrl); +out: + return err; +} + +static void mlx5_fpga_conn_destroy_cq(struct mlx5_fpga_conn *conn) +{ + tasklet_disable(&conn->cq.tasklet); + tasklet_kill(&conn->cq.tasklet); + mlx5_core_destroy_cq(conn->fdev->mdev, &conn->cq.mcq); + mlx5_cqwq_destroy(&conn->cq.wq_ctrl); +} + +static int mlx5_fpga_conn_create_wq(struct mlx5_fpga_conn *conn, void *qpc) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + struct mlx5_wq_param wqp; + + wqp.buf_numa_node = mdev->priv.numa_node; + wqp.db_numa_node = mdev->priv.numa_node; + + return mlx5_wq_qp_create(mdev, &wqp, qpc, 
&conn->qp.wq, + &conn->qp.wq_ctrl); +} + +static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn, + unsigned int tx_size, unsigned int rx_size) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {0}; + void *in = NULL, *qpc; + int err, inlen; + + conn->qp.rq.pc = 0; + conn->qp.rq.cc = 0; + conn->qp.rq.size = roundup_pow_of_two(rx_size); + conn->qp.sq.pc = 0; + conn->qp.sq.cc = 0; + conn->qp.sq.size = roundup_pow_of_two(tx_size); + + MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4); + MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(conn->qp.rq.size)); + MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(conn->qp.sq.size)); + err = mlx5_fpga_conn_create_wq(conn, temp_qpc); + if (err) + goto out; + + conn->qp.rq.bufs = kvzalloc(sizeof(conn->qp.rq.bufs[0]) * + conn->qp.rq.size, GFP_KERNEL); + if (!conn->qp.rq.bufs) { + err = -ENOMEM; + goto err_wq; + } + + conn->qp.sq.bufs = kvzalloc(sizeof(conn->qp.sq.bufs[0]) * + conn->qp.sq.size, GFP_KERNEL); + if (!conn->qp.sq.bufs) { + err = -ENOMEM; + goto err_rq_bufs; + } + + inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * + conn->qp.wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_sq_bufs; + } + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, uar_page, fdev->conn_res.uar->index); + MLX5_SET(qpc, qpc, log_page_size, + conn->qp.wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, fre, 1); + MLX5_SET(qpc, qpc, rlky, 1); + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, pd, fdev->conn_res.pdn); + MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(conn->qp.rq.size)); + MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); + MLX5_SET(qpc, qpc, log_sq_size, ilog2(conn->qp.sq.size)); + MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn); + MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn); + MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma); + if (MLX5_CAP_GEN(mdev, cqe_version) == 1) + MLX5_SET(qpc, qpc, user_index, 0xFFFFFF); + + mlx5_fill_page_array(&conn->qp.wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas)); + + err = mlx5_core_create_qp(mdev, &conn->qp.mqp, in, inlen); + if (err) + goto err_sq_bufs; + + conn->qp.mqp.event = mlx5_fpga_conn_event; + mlx5_fpga_dbg(fdev, "Created QP #0x%x\n", conn->qp.mqp.qpn); + + goto out; + +err_sq_bufs: + kvfree(conn->qp.sq.bufs); +err_rq_bufs: + kvfree(conn->qp.rq.bufs); +err_wq: + mlx5_wq_destroy(&conn->qp.wq_ctrl); +out: + kvfree(in); + return err; +} + +static void mlx5_fpga_conn_free_recv_bufs(struct mlx5_fpga_conn *conn) +{ + int ix; + + for (ix = 0; ix < conn->qp.rq.size; ix++) { + if (!conn->qp.rq.bufs[ix]) + continue; + mlx5_fpga_conn_unmap_buf(conn, conn->qp.rq.bufs[ix]); + kfree(conn->qp.rq.bufs[ix]); + conn->qp.rq.bufs[ix] = NULL; + } +} + +static void mlx5_fpga_conn_flush_send_bufs(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_dma_buf *buf, *temp; + int ix; + + for (ix = 0; ix < conn->qp.sq.size; ix++) { + buf = conn->qp.sq.bufs[ix]; + if (!buf) + continue; + conn->qp.sq.bufs[ix] = NULL; + mlx5_fpga_conn_unmap_buf(conn, buf); + if (!buf->complete) + continue; + buf->complete(conn, conn->fdev, buf, MLX5_CQE_SYNDROME_WR_FLUSH_ERR); + } + list_for_each_entry_safe(buf, temp, &conn->qp.sq.backlog, list) { + mlx5_fpga_conn_unmap_buf(conn, buf); + if (!buf->complete) + continue; + 
buf->complete(conn, conn->fdev, buf, MLX5_CQE_SYNDROME_WR_FLUSH_ERR); + } +} + +static void mlx5_fpga_conn_destroy_qp(struct mlx5_fpga_conn *conn) +{ + mlx5_core_destroy_qp(conn->fdev->mdev, &conn->qp.mqp); + mlx5_fpga_conn_free_recv_bufs(conn); + mlx5_fpga_conn_flush_send_bufs(conn); + kvfree(conn->qp.sq.bufs); + kvfree(conn->qp.rq.bufs); + mlx5_wq_destroy(&conn->qp.wq_ctrl); +} + +static inline int mlx5_fpga_conn_reset_qp(struct mlx5_fpga_conn *conn) +{ + struct mlx5_core_dev *mdev = conn->fdev->mdev; + + mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to RST\n", conn->qp.mqp.qpn); + + return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2RST_QP, 0, NULL, + &conn->qp.mqp); +} + +static inline int mlx5_fpga_conn_init_qp(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + u32 *qpc = NULL; + int err; + + mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to INIT\n", conn->qp.mqp.qpn); + + qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL); + if (!qpc) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, primary_address_path.pkey_index, MLX5_FPGA_PKEY_INDEX); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, MLX5_FPGA_PORT_NUM); + MLX5_SET(qpc, qpc, pd, conn->fdev->conn_res.pdn); + MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn); + MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn); + MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma); + + err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RST2INIT_QP, 0, qpc, + &conn->qp.mqp); + if (err) { + mlx5_fpga_warn(fdev, "qp_modify RST2INIT failed: %d\n", err); + goto out; + } + +out: + kfree(qpc); + return err; +} + +static inline int mlx5_fpga_conn_rtr_qp(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + u32 *qpc = NULL; + int err; + + mlx5_fpga_dbg(conn->fdev, "QP RTR\n"); + + qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL); + if (!qpc) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_1K_BYTES); + MLX5_SET(qpc, qpc, log_msg_max, (u8)MLX5_CAP_GEN(mdev, log_max_msg)); + MLX5_SET(qpc, qpc, remote_qpn, conn->fpga_qpn); + MLX5_SET(qpc, qpc, next_rcv_psn, + MLX5_GET(fpga_qpc, conn->fpga_qpc, next_send_psn)); + MLX5_SET(qpc, qpc, primary_address_path.pkey_index, MLX5_FPGA_PKEY_INDEX); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, MLX5_FPGA_PORT_NUM); + ether_addr_copy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32), + MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, fpga_mac_47_32)); + MLX5_SET(qpc, qpc, primary_address_path.udp_sport, + MLX5_CAP_ROCE(mdev, r_roce_min_src_udp_port)); + MLX5_SET(qpc, qpc, primary_address_path.src_addr_index, + conn->qp.sgid_index); + MLX5_SET(qpc, qpc, primary_address_path.hop_limit, 0); + memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip), + MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, fpga_ip), + MLX5_FLD_SZ_BYTES(qpc, primary_address_path.rgid_rip)); + + err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_INIT2RTR_QP, 0, qpc, + &conn->qp.mqp); + if (err) { + mlx5_fpga_warn(fdev, "qp_modify RST2INIT failed: %d\n", err); + goto out; + } + +out: + kfree(qpc); + return err; +} + +static inline int mlx5_fpga_conn_rts_qp(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + u32 *qpc = NULL; + u32 opt_mask; + int err; + + mlx5_fpga_dbg(conn->fdev, "QP RTS\n"); + + qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), 
GFP_KERNEL); + if (!qpc) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpc, qpc, log_ack_req_freq, 8); + MLX5_SET(qpc, qpc, min_rnr_nak, 0x12); + MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x12); /* ~1.07s */ + MLX5_SET(qpc, qpc, next_send_psn, + MLX5_GET(fpga_qpc, conn->fpga_qpc, next_rcv_psn)); + MLX5_SET(qpc, qpc, retry_count, 7); + MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */ + + opt_mask = MLX5_QP_OPTPAR_RNR_TIMEOUT; + err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RTR2RTS_QP, opt_mask, qpc, + &conn->qp.mqp); + if (err) { + mlx5_fpga_warn(fdev, "qp_modify RST2INIT failed: %d\n", err); + goto out; + } + +out: + kfree(qpc); + return err; +} + +static int mlx5_fpga_conn_connect(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + int err; + + MLX5_SET(fpga_qpc, conn->fpga_qpc, state, MLX5_FPGA_QPC_STATE_ACTIVE); + err = mlx5_fpga_modify_qp(conn->fdev->mdev, conn->fpga_qpn, + MLX5_FPGA_QPC_STATE, &conn->fpga_qpc); + if (err) { + mlx5_fpga_err(fdev, "Failed to activate FPGA RC QP: %d\n", err); + goto out; + } + + err = mlx5_fpga_conn_reset_qp(conn); + if (err) { + mlx5_fpga_err(fdev, "Failed to change QP state to reset\n"); + goto err_fpga_qp; + } + + err = mlx5_fpga_conn_init_qp(conn); + if (err) { + mlx5_fpga_err(fdev, "Failed to modify QP from RESET to INIT\n"); + goto err_fpga_qp; + } + conn->qp.active = true; + + while (!mlx5_fpga_conn_post_recv_buf(conn)) + ; + + err = mlx5_fpga_conn_rtr_qp(conn); + if (err) { + mlx5_fpga_err(fdev, "Failed to change QP state from INIT to RTR\n"); + goto err_recv_bufs; + } + + err = mlx5_fpga_conn_rts_qp(conn); + if (err) { + mlx5_fpga_err(fdev, "Failed to change QP state from RTR to RTS\n"); + goto err_recv_bufs; + } + goto out; + +err_recv_bufs: + mlx5_fpga_conn_free_recv_bufs(conn); +err_fpga_qp: + MLX5_SET(fpga_qpc, conn->fpga_qpc, state, MLX5_FPGA_QPC_STATE_INIT); + if (mlx5_fpga_modify_qp(conn->fdev->mdev, conn->fpga_qpn, + MLX5_FPGA_QPC_STATE, &conn->fpga_qpc)) + mlx5_fpga_err(fdev, "Failed to revert FPGA QP to INIT\n"); +out: + return err; +} + +struct mlx5_fpga_conn *mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev, + struct mlx5_fpga_conn_attr *attr, + enum mlx5_ifc_fpga_qp_type qp_type) +{ + struct mlx5_fpga_conn *ret, *conn; + u8 *remote_mac, *remote_ip; + int err; + + if (!attr->recv_cb) + return ERR_PTR(-EINVAL); + + conn = kzalloc(sizeof(*conn), GFP_KERNEL); + if (!conn) + return ERR_PTR(-ENOMEM); + + conn->fdev = fdev; + INIT_LIST_HEAD(&conn->qp.sq.backlog); + + spin_lock_init(&conn->qp.sq.lock); + + conn->recv_cb = attr->recv_cb; + conn->cb_arg = attr->cb_arg; + + remote_mac = MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, remote_mac_47_32); + err = mlx5_query_nic_vport_mac_address(fdev->mdev, 0, remote_mac); + if (err) { + mlx5_fpga_err(fdev, "Failed to query local MAC: %d\n", err); + ret = ERR_PTR(err); + goto err; + } + + /* Build Modified EUI-64 IPv6 address from the MAC address */ + remote_ip = MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, remote_ip); + remote_ip[0] = 0xfe; + remote_ip[1] = 0x80; + addrconf_addr_eui48(&remote_ip[8], remote_mac); + + err = mlx5_core_reserved_gid_alloc(fdev->mdev, &conn->qp.sgid_index); + if (err) { + mlx5_fpga_err(fdev, "Failed to allocate SGID: %d\n", err); + ret = ERR_PTR(err); + goto err; + } + + err = mlx5_core_roce_gid_set(fdev->mdev, conn->qp.sgid_index, + MLX5_ROCE_VERSION_2, + MLX5_ROCE_L3_TYPE_IPV6, + remote_ip, remote_mac, true, 0, + MLX5_FPGA_PORT_NUM); + if (err) { + mlx5_fpga_err(fdev, "Failed to set SGID: %d\n", err); + ret = 
ERR_PTR(err); + goto err_rsvd_gid; + } + mlx5_fpga_dbg(fdev, "Reserved SGID index %u\n", conn->qp.sgid_index); + + /* Allow for one cqe per rx/tx wqe, plus one cqe for the next wqe, + * created during processing of the cqe + */ + err = mlx5_fpga_conn_create_cq(conn, + (attr->tx_size + attr->rx_size) * 2); + if (err) { + mlx5_fpga_err(fdev, "Failed to create CQ: %d\n", err); + ret = ERR_PTR(err); + goto err_gid; + } + + mlx5_fpga_conn_arm_cq(conn); + + err = mlx5_fpga_conn_create_qp(conn, attr->tx_size, attr->rx_size); + if (err) { + mlx5_fpga_err(fdev, "Failed to create QP: %d\n", err); + ret = ERR_PTR(err); + goto err_cq; + } + + MLX5_SET(fpga_qpc, conn->fpga_qpc, state, MLX5_FPGA_QPC_STATE_INIT); + MLX5_SET(fpga_qpc, conn->fpga_qpc, qp_type, qp_type); + MLX5_SET(fpga_qpc, conn->fpga_qpc, st, MLX5_FPGA_QPC_ST_RC); + MLX5_SET(fpga_qpc, conn->fpga_qpc, ether_type, ETH_P_8021Q); + MLX5_SET(fpga_qpc, conn->fpga_qpc, vid, 0); + MLX5_SET(fpga_qpc, conn->fpga_qpc, next_rcv_psn, 1); + MLX5_SET(fpga_qpc, conn->fpga_qpc, next_send_psn, 0); + MLX5_SET(fpga_qpc, conn->fpga_qpc, pkey, MLX5_FPGA_PKEY); + MLX5_SET(fpga_qpc, conn->fpga_qpc, remote_qpn, conn->qp.mqp.qpn); + MLX5_SET(fpga_qpc, conn->fpga_qpc, rnr_retry, 7); + MLX5_SET(fpga_qpc, conn->fpga_qpc, retry_count, 7); + + err = mlx5_fpga_create_qp(fdev->mdev, &conn->fpga_qpc, + &conn->fpga_qpn); + if (err) { + mlx5_fpga_err(fdev, "Failed to create FPGA RC QP: %d\n", err); + ret = ERR_PTR(err); + goto err_qp; + } + + err = mlx5_fpga_conn_connect(conn); + if (err) { + ret = ERR_PTR(err); + goto err_conn; + } + + mlx5_fpga_dbg(fdev, "FPGA QPN is %u\n", conn->fpga_qpn); + ret = conn; + goto out; + +err_conn: + mlx5_fpga_destroy_qp(conn->fdev->mdev, conn->fpga_qpn); +err_qp: + mlx5_fpga_conn_destroy_qp(conn); +err_cq: + mlx5_fpga_conn_destroy_cq(conn); +err_gid: + mlx5_core_roce_gid_set(fdev->mdev, conn->qp.sgid_index, 0, 0, NULL, + NULL, false, 0, MLX5_FPGA_PORT_NUM); +err_rsvd_gid: + mlx5_core_reserved_gid_free(fdev->mdev, conn->qp.sgid_index); +err: + kfree(conn); +out: + return ret; +} + +void mlx5_fpga_conn_destroy(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + int err = 0; + + conn->qp.active = false; + tasklet_disable(&conn->cq.tasklet); + synchronize_irq(conn->cq.mcq.irqn); + + mlx5_fpga_destroy_qp(conn->fdev->mdev, conn->fpga_qpn); + err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2ERR_QP, 0, NULL, + &conn->qp.mqp); + if (err) + mlx5_fpga_warn(fdev, "qp_modify 2ERR failed: %d\n", err); + mlx5_fpga_conn_destroy_qp(conn); + mlx5_fpga_conn_destroy_cq(conn); + + mlx5_core_roce_gid_set(conn->fdev->mdev, conn->qp.sgid_index, 0, 0, + NULL, NULL, false, 0, MLX5_FPGA_PORT_NUM); + mlx5_core_reserved_gid_free(conn->fdev->mdev, conn->qp.sgid_index); + kfree(conn); +} + +int mlx5_fpga_conn_device_init(struct mlx5_fpga_device *fdev) +{ + int err; + + err = mlx5_nic_vport_enable_roce(fdev->mdev); + if (err) { + mlx5_fpga_err(fdev, "Failed to enable RoCE: %d\n", err); + goto out; + } + + fdev->conn_res.uar = mlx5_get_uars_page(fdev->mdev); + if (IS_ERR(fdev->conn_res.uar)) { + err = PTR_ERR(fdev->conn_res.uar); + mlx5_fpga_err(fdev, "get_uars_page failed, %d\n", err); + goto err_roce; + } + mlx5_fpga_dbg(fdev, "Allocated UAR index %u\n", + fdev->conn_res.uar->index); + + err = mlx5_core_alloc_pd(fdev->mdev, &fdev->conn_res.pdn); + if (err) { + mlx5_fpga_err(fdev, "alloc pd failed, %d\n", err); + goto err_uar; + } + mlx5_fpga_dbg(fdev, "Allocated PD %u\n", fdev->conn_res.pdn); + + err = 
mlx5_fpga_conn_create_mkey(fdev->mdev, fdev->conn_res.pdn, + &fdev->conn_res.mkey); + if (err) { + mlx5_fpga_err(fdev, "create mkey failed, %d\n", err); + goto err_dealloc_pd; + } + mlx5_fpga_dbg(fdev, "Created mkey 0x%x\n", fdev->conn_res.mkey.key); + + return 0; + +err_dealloc_pd: + mlx5_core_dealloc_pd(fdev->mdev, fdev->conn_res.pdn); +err_uar: + mlx5_put_uars_page(fdev->mdev, fdev->conn_res.uar); +err_roce: + mlx5_nic_vport_disable_roce(fdev->mdev); +out: + return err; +} + +void mlx5_fpga_conn_device_cleanup(struct mlx5_fpga_device *fdev) +{ + mlx5_core_destroy_mkey(fdev->mdev, &fdev->conn_res.mkey); + mlx5_core_dealloc_pd(fdev->mdev, fdev->conn_res.pdn); + mlx5_put_uars_page(fdev->mdev, fdev->conn_res.uar); + mlx5_nic_vport_disable_roce(fdev->mdev); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h new file mode 100644 index 0000000..44bd9ec --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
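mlx5_fpga_conn_device_init() above acquires RoCE, the UAR page, a PD and an mkey in that order and unwinds through its err_* labels in exact reverse order when a later step fails; mlx5_fpga_conn_device_cleanup() releases the same resources in reverse. A self-contained sketch of that goto-unwind idiom, with invented acquire/release stand-ins:

#include <stdio.h>

static int acquire(const char *name) { printf("acquire %s\n", name); return 0; }
static void release(const char *name) { printf("release %s\n", name); }

static int demo_init(void)
{
        int err;

        err = acquire("roce");
        if (err)
                goto out;
        err = acquire("uar");
        if (err)
                goto err_roce;
        err = acquire("pd");
        if (err)
                goto err_uar;
        err = acquire("mkey");
        if (err)
                goto err_pd;
        return 0;

        /* each label releases everything acquired before the failing step */
err_pd:
        release("pd");
err_uar:
        release("uar");
err_roce:
        release("roce");
out:
        return err;
}

int main(void) { return demo_init(); }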
+ * + */ + +#ifndef __MLX5_FPGA_CONN_H__ +#define __MLX5_FPGA_CONN_H__ + +#include +#include + +#include "fpga/core.h" +#include "fpga/sdk.h" +#include "wq.h" + +struct mlx5_fpga_conn { + struct mlx5_fpga_device *fdev; + + void (*recv_cb)(void *cb_arg, struct mlx5_fpga_dma_buf *buf); + void *cb_arg; + + /* FPGA QP */ + u32 fpga_qpc[MLX5_ST_SZ_DW(fpga_qpc)]; + u32 fpga_qpn; + + /* CQ */ + struct { + struct mlx5_cqwq wq; + struct mlx5_frag_wq_ctrl wq_ctrl; + struct mlx5_core_cq mcq; + struct tasklet_struct tasklet; + } cq; + + /* QP */ + struct { + bool active; + int sgid_index; + struct mlx5_wq_qp wq; + struct mlx5_wq_ctrl wq_ctrl; + struct mlx5_core_qp mqp; + struct { + spinlock_t lock; /* Protects all SQ state */ + unsigned int pc; + unsigned int cc; + unsigned int size; + struct mlx5_fpga_dma_buf **bufs; + struct list_head backlog; + } sq; + struct { + unsigned int pc; + unsigned int cc; + unsigned int size; + struct mlx5_fpga_dma_buf **bufs; + } rq; + } qp; +}; + +int mlx5_fpga_conn_device_init(struct mlx5_fpga_device *fdev); +void mlx5_fpga_conn_device_cleanup(struct mlx5_fpga_device *fdev); +struct mlx5_fpga_conn * +mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev, + struct mlx5_fpga_conn_attr *attr, + enum mlx5_ifc_fpga_qp_type qp_type); +void mlx5_fpga_conn_destroy(struct mlx5_fpga_conn *conn); +int mlx5_fpga_conn_send(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf); + +#endif /* __MLX5_FPGA_CONN_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c new file mode 100644 index 0000000..dc89703 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
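The sq and rq bookkeeping declared above (pc, cc, size, bufs) relies on free-running producer/consumer counters over a power-of-two ring: the slot index is pc & (size - 1), and the ring counts as full once pc - cc reaches size, exactly as checked in mlx5_fpga_conn_post_recv() and mlx5_fpga_conn_send(). A small standalone sketch of that arithmetic, with invented names and sizes:

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 8u    /* must be a power of two */

struct ring { unsigned int pc, cc; };   /* producer and consumer counters */

static int ring_post(struct ring *r)
{
        if (r->pc - r->cc >= RING_SIZE)
                return -1;              /* full */
        printf("post at slot %u\n", r->pc & (RING_SIZE - 1));
        r->pc++;
        return 0;
}

static void ring_complete(struct ring *r)
{
        assert(r->pc != r->cc);         /* not empty */
        printf("complete slot %u\n", r->cc & (RING_SIZE - 1));
        r->cc++;
}

int main(void)
{
        /* start near the 32-bit wrap point: unsigned wrap keeps pc - cc correct */
        struct ring r = { .pc = 0xfffffffc, .cc = 0xfffffffc };
        int i;

        for (i = 0; i < 10; i++)
                if (ring_post(&r))
                        ring_complete(&r);
        return 0;
}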
+ */ + +#include +#include +#include + +#include "mlx5_core.h" +#include "lib/mlx5.h" +#include "fpga/core.h" +#include "fpga/conn.h" + +static const char *const mlx5_fpga_error_strings[] = { + "Null Syndrome", + "Corrupted DDR", + "Flash Timeout", + "Internal Link Error", + "Watchdog HW Failure", + "I2C Failure", + "Image Changed", + "Temperature Critical", +}; + +static struct mlx5_fpga_device *mlx5_fpga_device_alloc(void) +{ + struct mlx5_fpga_device *fdev = NULL; + + fdev = kzalloc(sizeof(*fdev), GFP_KERNEL); + if (!fdev) + return NULL; + + spin_lock_init(&fdev->state_lock); + fdev->state = MLX5_FPGA_STATUS_NONE; + return fdev; +} + +static const char *mlx5_fpga_image_name(enum mlx5_fpga_image image) +{ + switch (image) { + case MLX5_FPGA_IMAGE_USER: + return "user"; + case MLX5_FPGA_IMAGE_FACTORY: + return "factory"; + default: + return "unknown"; + } +} + +static int mlx5_fpga_device_load_check(struct mlx5_fpga_device *fdev) +{ + struct mlx5_fpga_query query; + int err; + + err = mlx5_fpga_query(fdev->mdev, &query); + if (err) { + mlx5_fpga_err(fdev, "Failed to query status: %d\n", err); + return err; + } + + fdev->last_admin_image = query.admin_image; + fdev->last_oper_image = query.oper_image; + + mlx5_fpga_dbg(fdev, "Status %u; Admin image %u; Oper image %u\n", + query.status, query.admin_image, query.oper_image); + + if (query.status != MLX5_FPGA_STATUS_SUCCESS) { + mlx5_fpga_err(fdev, "%s image failed to load; status %u\n", + mlx5_fpga_image_name(fdev->last_oper_image), + query.status); + return -EIO; + } + + return 0; +} + +static int mlx5_fpga_device_brb(struct mlx5_fpga_device *fdev) +{ + int err; + struct mlx5_core_dev *mdev = fdev->mdev; + + err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON); + if (err) { + mlx5_fpga_err(fdev, "Failed to set bypass on: %d\n", err); + return err; + } + err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_RESET_SANDBOX); + if (err) { + mlx5_fpga_err(fdev, "Failed to reset SBU: %d\n", err); + return err; + } + err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_OFF); + if (err) { + mlx5_fpga_err(fdev, "Failed to set bypass off: %d\n", err); + return err; + } + return 0; +} + +int mlx5_fpga_device_start(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + unsigned long flags; + unsigned int max_num_qps; + int err; + + if (!fdev) + return 0; + + err = mlx5_fpga_device_load_check(fdev); + if (err) + goto out; + + err = mlx5_fpga_caps(fdev->mdev); + if (err) + goto out; + + mlx5_fpga_info(fdev, "device %u; %s image, version %u\n", + MLX5_CAP_FPGA(fdev->mdev, fpga_device), + mlx5_fpga_image_name(fdev->last_oper_image), + MLX5_CAP_FPGA(fdev->mdev, image_version)); + + max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps); + err = mlx5_core_reserve_gids(mdev, max_num_qps); + if (err) + goto out; + + err = mlx5_fpga_conn_device_init(fdev); + if (err) + goto err_rsvd_gid; + + if (fdev->last_oper_image == MLX5_FPGA_IMAGE_USER) { + err = mlx5_fpga_device_brb(fdev); + if (err) + goto err_conn_init; + } + + goto out; + +err_conn_init: + mlx5_fpga_conn_device_cleanup(fdev); + +err_rsvd_gid: + mlx5_core_unreserve_gids(mdev, max_num_qps); +out: + spin_lock_irqsave(&fdev->state_lock, flags); + fdev->state = err ? 
MLX5_FPGA_STATUS_FAILURE : MLX5_FPGA_STATUS_SUCCESS; + spin_unlock_irqrestore(&fdev->state_lock, flags); + return err; +} + +int mlx5_fpga_init(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = NULL; + + if (!MLX5_CAP_GEN(mdev, fpga)) { + mlx5_core_dbg(mdev, "FPGA capability not present\n"); + return 0; + } + + mlx5_core_dbg(mdev, "Initializing FPGA\n"); + + fdev = mlx5_fpga_device_alloc(); + if (!fdev) + return -ENOMEM; + + fdev->mdev = mdev; + mdev->fpga = fdev; + + return 0; +} + +void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + unsigned int max_num_qps; + unsigned long flags; + int err; + + if (!fdev) + return; + + spin_lock_irqsave(&fdev->state_lock, flags); + if (fdev->state != MLX5_FPGA_STATUS_SUCCESS) { + spin_unlock_irqrestore(&fdev->state_lock, flags); + return; + } + fdev->state = MLX5_FPGA_STATUS_NONE; + spin_unlock_irqrestore(&fdev->state_lock, flags); + + if (fdev->last_oper_image == MLX5_FPGA_IMAGE_USER) { + err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON); + if (err) + mlx5_fpga_err(fdev, "Failed to re-set SBU bypass on: %d\n", + err); + } + + mlx5_fpga_conn_device_cleanup(fdev); + max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps); + mlx5_core_unreserve_gids(mdev, max_num_qps); +} + +void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + + mlx5_fpga_device_stop(mdev); + kfree(fdev); + mdev->fpga = NULL; +} + +static const char *mlx5_fpga_syndrome_to_string(u8 syndrome) +{ + if (syndrome < ARRAY_SIZE(mlx5_fpga_error_strings)) + return mlx5_fpga_error_strings[syndrome]; + return "Unknown"; +} + +void mlx5_fpga_event(struct mlx5_core_dev *mdev, u8 event, void *data) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + const char *event_name; + bool teardown = false; + unsigned long flags; + u8 syndrome; + + if (event != MLX5_EVENT_TYPE_FPGA_ERROR) { + mlx5_fpga_warn_ratelimited(fdev, "Unexpected event %u\n", + event); + return; + } + + syndrome = MLX5_GET(fpga_error_event, data, syndrome); + event_name = mlx5_fpga_syndrome_to_string(syndrome); + + spin_lock_irqsave(&fdev->state_lock, flags); + switch (fdev->state) { + case MLX5_FPGA_STATUS_SUCCESS: + mlx5_fpga_warn(fdev, "Error %u: %s\n", syndrome, event_name); + teardown = true; + break; + default: + mlx5_fpga_warn_ratelimited(fdev, "Unexpected error event %u: %s\n", + syndrome, event_name); + } + spin_unlock_irqrestore(&fdev->state_lock, flags); + /* We tear-down the card's interfaces and functionality because + * the FPGA bump-on-the-wire is misbehaving and we lose ability + * to communicate with the network. User may still be able to + * recover by re-programming or debugging the FPGA + */ + if (teardown) + mlx5_trigger_health_work(fdev->mdev); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h new file mode 100644 index 0000000..82405ed --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
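mlx5_fpga_event() above only inspects fdev->state and records a teardown decision while holding state_lock; the heavyweight action, mlx5_trigger_health_work(), runs after the lock is dropped. A minimal userspace sketch of that decide-under-lock, act-outside-lock pattern; pthreads stand in for the spinlock, and the failure latch below is a simplification for the demo rather than the driver's exact state handling:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static enum { DEMO_OK, DEMO_FAILED } state = DEMO_OK;

static void trigger_recovery(void)
{
        /* stands in for the health-work trigger; must not run under the lock */
        printf("recovery triggered\n");
}

static void handle_error_event(void)
{
        bool teardown = false;

        pthread_mutex_lock(&state_lock);
        if (state == DEMO_OK) {
                state = DEMO_FAILED;    /* latch: duplicate events are ignored */
                teardown = true;
        }
        pthread_mutex_unlock(&state_lock);

        if (teardown)
                trigger_recovery();     /* decision made under the lock, action outside */
}

int main(void)
{
        handle_error_event();
        handle_error_event();           /* second event: no second recovery */
        return 0;
}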
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_FPGA_CORE_H__ +#define __MLX5_FPGA_CORE_H__ + +#ifdef CONFIG_MLX5_FPGA + +#include "fpga/cmd.h" + +/* Represents an Innova device */ +struct mlx5_fpga_device { + struct mlx5_core_dev *mdev; + spinlock_t state_lock; /* Protects state transitions */ + enum mlx5_fpga_status state; + enum mlx5_fpga_image last_admin_image; + enum mlx5_fpga_image last_oper_image; + + /* QP Connection resources */ + struct { + u32 pdn; + struct mlx5_core_mkey mkey; + struct mlx5_uars_page *uar; + } conn_res; + + struct mlx5_fpga_ipsec *ipsec; +}; + +#define mlx5_fpga_dbg(__adev, format, ...) \ + dev_dbg(&(__adev)->mdev->pdev->dev, "FPGA: %s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, ##__VA_ARGS__) + +#define mlx5_fpga_err(__adev, format, ...) \ + dev_err(&(__adev)->mdev->pdev->dev, "FPGA: %s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, ##__VA_ARGS__) + +#define mlx5_fpga_warn(__adev, format, ...) \ + dev_warn(&(__adev)->mdev->pdev->dev, "FPGA: %s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, ##__VA_ARGS__) + +#define mlx5_fpga_warn_ratelimited(__adev, format, ...) \ + dev_warn_ratelimited(&(__adev)->mdev->pdev->dev, "FPGA: %s:%d: " \ + format, __func__, __LINE__, ##__VA_ARGS__) + +#define mlx5_fpga_notice(__adev, format, ...) \ + dev_notice(&(__adev)->mdev->pdev->dev, "FPGA: " format, ##__VA_ARGS__) + +#define mlx5_fpga_info(__adev, format, ...) 
\ + dev_info(&(__adev)->mdev->pdev->dev, "FPGA: " format, ##__VA_ARGS__) + +int mlx5_fpga_init(struct mlx5_core_dev *mdev); +void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev); +int mlx5_fpga_device_start(struct mlx5_core_dev *mdev); +void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev); +void mlx5_fpga_event(struct mlx5_core_dev *mdev, u8 event, void *data); + +#else + +static inline int mlx5_fpga_init(struct mlx5_core_dev *mdev) +{ + return 0; +} + +static inline void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev) +{ +} + +static inline int mlx5_fpga_device_start(struct mlx5_core_dev *mdev) +{ + return 0; +} + +static inline void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev) +{ +} + +static inline void mlx5_fpga_event(struct mlx5_core_dev *mdev, u8 event, + void *data) +{ +} + +#endif + +#endif /* __MLX5_FPGA_CORE_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c new file mode 100644 index 0000000..fad8c2e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c @@ -0,0 +1,1530 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
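When CONFIG_MLX5_FPGA is disabled, fpga/core.h above replaces every entry point with a static inline no-op so that callers compile unchanged and carry no #ifdefs. A standalone illustration of that compile-out stub idiom; CONFIG_DEMO_FEATURE and the demo_* names are invented for the example:

#include <stdio.h>

#ifdef CONFIG_DEMO_FEATURE
int demo_feature_init(void)
{
        printf("feature initialized\n");
        return 0;
}
void demo_feature_cleanup(void)
{
        printf("feature cleaned up\n");
}
#else
/* feature compiled out: callers still link, the calls just do nothing */
static inline int demo_feature_init(void) { return 0; }
static inline void demo_feature_cleanup(void) { }
#endif

int main(void)
{
        /* identical call site whether or not the feature is built in */
        if (demo_feature_init())
                return 1;
        demo_feature_cleanup();
        return 0;
}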
+ * + */ + +#include +#include +#include +#include +#include + +#include "mlx5_core.h" +#include "fs_cmd.h" +#include "fpga/ipsec.h" +#include "fpga/sdk.h" +#include "fpga/core.h" + +#define SBU_QP_QUEUE_SIZE 8 +#define MLX5_FPGA_IPSEC_CMD_TIMEOUT_MSEC (60 * 1000) + +enum mlx5_fpga_ipsec_cmd_status { + MLX5_FPGA_IPSEC_CMD_PENDING, + MLX5_FPGA_IPSEC_CMD_SEND_FAIL, + MLX5_FPGA_IPSEC_CMD_COMPLETE, +}; + +struct mlx5_fpga_ipsec_cmd_context { + struct mlx5_fpga_dma_buf buf; + enum mlx5_fpga_ipsec_cmd_status status; + struct mlx5_ifc_fpga_ipsec_cmd_resp resp; + int status_code; + struct completion complete; + struct mlx5_fpga_device *dev; + struct list_head list; /* Item in pending_cmds */ + u8 command[0]; +}; + +struct mlx5_fpga_esp_xfrm; + +struct mlx5_fpga_ipsec_sa_ctx { + struct rhash_head hash; + struct mlx5_ifc_fpga_ipsec_sa hw_sa; + struct mlx5_core_dev *dev; + struct mlx5_fpga_esp_xfrm *fpga_xfrm; +}; + +struct mlx5_fpga_esp_xfrm { + unsigned int num_rules; + struct mlx5_fpga_ipsec_sa_ctx *sa_ctx; + struct mutex lock; /* xfrm lock */ + struct mlx5_accel_esp_xfrm accel_xfrm; +}; + +struct mlx5_fpga_ipsec_rule { + struct rb_node node; + struct fs_fte *fte; + struct mlx5_fpga_ipsec_sa_ctx *ctx; +}; + +static const struct rhashtable_params rhash_sa = { + .key_len = FIELD_SIZEOF(struct mlx5_fpga_ipsec_sa_ctx, hw_sa), + .key_offset = offsetof(struct mlx5_fpga_ipsec_sa_ctx, hw_sa), + .head_offset = offsetof(struct mlx5_fpga_ipsec_sa_ctx, hash), + .automatic_shrinking = true, + .min_size = 1, +}; + +struct mlx5_fpga_ipsec { + struct mlx5_fpga_device *fdev; + struct list_head pending_cmds; + spinlock_t pending_cmds_lock; /* Protects pending_cmds */ + u32 caps[MLX5_ST_SZ_DW(ipsec_extended_cap)]; + struct mlx5_fpga_conn *conn; + + struct notifier_block fs_notifier_ingress_bypass; + struct notifier_block fs_notifier_egress; + + /* Map hardware SA --> SA context + * (mlx5_fpga_ipsec_sa) (mlx5_fpga_ipsec_sa_ctx) + * We will use this hash to avoid SAs duplication in fpga which + * aren't allowed + */ + struct rhashtable sa_hash; /* hw_sa -> mlx5_fpga_ipsec_sa_ctx */ + struct mutex sa_hash_lock; + + /* Tree holding all rules for this fpga device + * Key for searching a rule (mlx5_fpga_ipsec_rule) is (ft, id) + */ + struct rb_root rules_rb; + struct mutex rules_rb_lock; /* rules lock */ +}; + +static bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev) +{ + if (!mdev->fpga || !MLX5_CAP_GEN(mdev, fpga)) + return false; + + if (MLX5_CAP_FPGA(mdev, ieee_vendor_id) != + MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX) + return false; + + if (MLX5_CAP_FPGA(mdev, sandbox_product_id) != + MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC) + return false; + + return true; +} + +static void mlx5_fpga_ipsec_send_complete(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_device *fdev, + struct mlx5_fpga_dma_buf *buf, + u8 status) +{ + struct mlx5_fpga_ipsec_cmd_context *context; + + if (status) { + context = container_of(buf, struct mlx5_fpga_ipsec_cmd_context, + buf); + mlx5_fpga_warn(fdev, "IPSec command send failed with status %u\n", + status); + context->status = MLX5_FPGA_IPSEC_CMD_SEND_FAIL; + complete(&context->complete); + } +} + +static inline +int syndrome_to_errno(enum mlx5_ifc_fpga_ipsec_response_syndrome syndrome) +{ + switch (syndrome) { + case MLX5_FPGA_IPSEC_RESPONSE_SUCCESS: + return 0; + case MLX5_FPGA_IPSEC_RESPONSE_SADB_ISSUE: + return -EEXIST; + case MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST: + return -EINVAL; + case MLX5_FPGA_IPSEC_RESPONSE_WRITE_RESPONSE_ISSUE: + return -EIO; + } + return -EIO; +} + +static 
void mlx5_fpga_ipsec_recv(void *cb_arg, struct mlx5_fpga_dma_buf *buf) +{ + struct mlx5_ifc_fpga_ipsec_cmd_resp *resp = buf->sg[0].data; + struct mlx5_fpga_ipsec_cmd_context *context; + enum mlx5_ifc_fpga_ipsec_response_syndrome syndrome; + struct mlx5_fpga_device *fdev = cb_arg; + unsigned long flags; + + if (buf->sg[0].size < sizeof(*resp)) { + mlx5_fpga_warn(fdev, "Short receive from FPGA IPSec: %u < %zu bytes\n", + buf->sg[0].size, sizeof(*resp)); + return; + } + + mlx5_fpga_dbg(fdev, "mlx5_ipsec recv_cb syndrome %08x\n", + ntohl(resp->syndrome)); + + spin_lock_irqsave(&fdev->ipsec->pending_cmds_lock, flags); + context = list_first_entry_or_null(&fdev->ipsec->pending_cmds, + struct mlx5_fpga_ipsec_cmd_context, + list); + if (context) + list_del(&context->list); + spin_unlock_irqrestore(&fdev->ipsec->pending_cmds_lock, flags); + + if (!context) { + mlx5_fpga_warn(fdev, "Received IPSec offload response without pending command request\n"); + return; + } + mlx5_fpga_dbg(fdev, "Handling response for %p\n", context); + + syndrome = ntohl(resp->syndrome); + context->status_code = syndrome_to_errno(syndrome); + context->status = MLX5_FPGA_IPSEC_CMD_COMPLETE; + memcpy(&context->resp, resp, sizeof(*resp)); + + if (context->status_code) + mlx5_fpga_warn(fdev, "IPSec command failed with syndrome %08x\n", + syndrome); + + complete(&context->complete); +} + +static void *mlx5_fpga_ipsec_cmd_exec(struct mlx5_core_dev *mdev, + const void *cmd, int cmd_size) +{ + struct mlx5_fpga_ipsec_cmd_context *context; + struct mlx5_fpga_device *fdev = mdev->fpga; + unsigned long flags; + int res; + + if (!fdev || !fdev->ipsec) + return ERR_PTR(-EOPNOTSUPP); + + if (cmd_size & 3) + return ERR_PTR(-EINVAL); + + context = kzalloc(sizeof(*context) + cmd_size, GFP_ATOMIC); + if (!context) + return ERR_PTR(-ENOMEM); + + context->status = MLX5_FPGA_IPSEC_CMD_PENDING; + context->dev = fdev; + context->buf.complete = mlx5_fpga_ipsec_send_complete; + init_completion(&context->complete); + memcpy(&context->command, cmd, cmd_size); + context->buf.sg[0].size = cmd_size; + context->buf.sg[0].data = &context->command; + + spin_lock_irqsave(&fdev->ipsec->pending_cmds_lock, flags); + res = mlx5_fpga_sbu_conn_sendmsg(fdev->ipsec->conn, &context->buf); + if (!res) + list_add_tail(&context->list, &fdev->ipsec->pending_cmds); + spin_unlock_irqrestore(&fdev->ipsec->pending_cmds_lock, flags); + + if (res) { + mlx5_fpga_warn(fdev, "Failed to send IPSec command: %d\n", res); + kfree(context); + return ERR_PTR(res); + } + + /* Context will be freed by wait func after completion */ + return context; +} + +static int mlx5_fpga_ipsec_cmd_wait(void *ctx) +{ + struct mlx5_fpga_ipsec_cmd_context *context = ctx; + unsigned long timeout = + msecs_to_jiffies(MLX5_FPGA_IPSEC_CMD_TIMEOUT_MSEC); + int res; + + res = wait_for_completion_timeout(&context->complete, timeout); + if (!res) { + mlx5_fpga_warn(context->dev, "Failure waiting for IPSec command response\n"); + return -ETIMEDOUT; + } + + if (context->status == MLX5_FPGA_IPSEC_CMD_COMPLETE) + res = context->status_code; + else + res = -EIO; + + return res; +} + +static inline bool is_v2_sadb_supported(struct mlx5_fpga_ipsec *fipsec) +{ + if (MLX5_GET(ipsec_extended_cap, fipsec->caps, v2_command)) + return true; + return false; +} + +static int mlx5_fpga_ipsec_update_hw_sa(struct mlx5_fpga_device *fdev, + struct mlx5_ifc_fpga_ipsec_sa *hw_sa, + int opcode) +{ + struct mlx5_core_dev *dev = fdev->mdev; + struct mlx5_ifc_fpga_ipsec_sa *sa; + struct mlx5_fpga_ipsec_cmd_context *cmd_context; + size_t 
sa_cmd_size; + int err; + + hw_sa->ipsec_sa_v1.cmd = htonl(opcode); + if (is_v2_sadb_supported(fdev->ipsec)) + sa_cmd_size = sizeof(*hw_sa); + else + sa_cmd_size = sizeof(hw_sa->ipsec_sa_v1); + + cmd_context = (struct mlx5_fpga_ipsec_cmd_context *) + mlx5_fpga_ipsec_cmd_exec(dev, hw_sa, sa_cmd_size); + if (IS_ERR(cmd_context)) + return PTR_ERR(cmd_context); + + err = mlx5_fpga_ipsec_cmd_wait(cmd_context); + if (err) + goto out; + + sa = (struct mlx5_ifc_fpga_ipsec_sa *)&cmd_context->command; + if (sa->ipsec_sa_v1.sw_sa_handle != cmd_context->resp.sw_sa_handle) { + mlx5_fpga_err(fdev, "mismatch SA handle. cmd 0x%08x vs resp 0x%08x\n", + ntohl(sa->ipsec_sa_v1.sw_sa_handle), + ntohl(cmd_context->resp.sw_sa_handle)); + err = -EIO; + } + +out: + kfree(cmd_context); + return err; +} + +u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + u32 ret = 0; + + if (mlx5_fpga_is_ipsec_device(mdev)) { + ret |= MLX5_ACCEL_IPSEC_CAP_DEVICE; + ret |= MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA; + } else { + return ret; + } + + if (!fdev->ipsec) + return ret; + + if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, esp)) + ret |= MLX5_ACCEL_IPSEC_CAP_ESP; + + if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, ipv6)) + ret |= MLX5_ACCEL_IPSEC_CAP_IPV6; + + if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, lso)) + ret |= MLX5_ACCEL_IPSEC_CAP_LSO; + + if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, rx_no_trailer)) + ret |= MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER; + + if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, esn)) { + ret |= MLX5_ACCEL_IPSEC_CAP_ESN; + ret |= MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN; + } + + return ret; +} + +unsigned int mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + + if (!fdev || !fdev->ipsec) + return 0; + + return MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, + number_of_ipsec_counters); +} + +int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, + unsigned int counters_count) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + unsigned int i; + __be32 *data; + u32 count; + u64 addr; + int ret; + + if (!fdev || !fdev->ipsec) + return 0; + + addr = (u64)MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, + ipsec_counters_addr_low) + + ((u64)MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, + ipsec_counters_addr_high) << 32); + + count = mlx5_fpga_ipsec_counters_count(mdev); + + data = kzalloc(sizeof(*data) * count * 2, GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto out; + } + + ret = mlx5_fpga_mem_read(fdev, count * sizeof(u64), addr, data, + MLX5_FPGA_ACCESS_TYPE_DONTCARE); + if (ret < 0) { + mlx5_fpga_err(fdev, "Failed to read IPSec counters from HW: %d\n", + ret); + goto out; + } + ret = 0; + + if (count > counters_count) + count = counters_count; + + /* Each counter is low word, then high. 
But each word is big-endian */ + for (i = 0; i < count; i++) + counters[i] = (u64)ntohl(data[i * 2]) | + ((u64)ntohl(data[i * 2 + 1]) << 32); + +out: + kfree(data); + return ret; +} + +static int mlx5_fpga_ipsec_set_caps(struct mlx5_core_dev *mdev, u32 flags) +{ + struct mlx5_fpga_ipsec_cmd_context *context; + struct mlx5_ifc_fpga_ipsec_cmd_cap cmd = {0}; + int err; + + cmd.cmd = htonl(MLX5_FPGA_IPSEC_CMD_OP_SET_CAP); + cmd.flags = htonl(flags); + context = mlx5_fpga_ipsec_cmd_exec(mdev, &cmd, sizeof(cmd)); + if (IS_ERR(context)) { + err = PTR_ERR(context); + goto out; + } + + err = mlx5_fpga_ipsec_cmd_wait(context); + if (err) + goto out; + + if ((context->resp.flags & cmd.flags) != cmd.flags) { + mlx5_fpga_err(context->dev, "Failed to set capabilities. cmd 0x%08x vs resp 0x%08x\n", + cmd.flags, + context->resp.flags); + err = -EIO; + } + +out: + return err; +} + +static int mlx5_fpga_ipsec_enable_supported_caps(struct mlx5_core_dev *mdev) +{ + u32 dev_caps = mlx5_fpga_ipsec_device_caps(mdev); + u32 flags = 0; + + if (dev_caps & MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER) + flags |= MLX5_FPGA_IPSEC_CAP_NO_TRAILER; + + return mlx5_fpga_ipsec_set_caps(mdev, flags); +} + +static void +mlx5_fpga_ipsec_build_hw_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *xfrm_attrs, + struct mlx5_ifc_fpga_ipsec_sa *hw_sa) +{ + const struct aes_gcm_keymat *aes_gcm = &xfrm_attrs->keymat.aes_gcm; + + /* key */ + memcpy(&hw_sa->ipsec_sa_v1.key_enc, aes_gcm->aes_key, + aes_gcm->key_len / 8); + /* Duplicate 128 bit key twice according to HW layout */ + if (aes_gcm->key_len == 128) + memcpy(&hw_sa->ipsec_sa_v1.key_enc[16], + aes_gcm->aes_key, aes_gcm->key_len / 8); + + /* salt and seq_iv */ + memcpy(&hw_sa->ipsec_sa_v1.gcm.salt_iv, &aes_gcm->seq_iv, + sizeof(aes_gcm->seq_iv)); + memcpy(&hw_sa->ipsec_sa_v1.gcm.salt, &aes_gcm->salt, + sizeof(aes_gcm->salt)); + + /* esn */ + if (xfrm_attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED) { + hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_ESN_EN; + hw_sa->ipsec_sa_v1.flags |= + (xfrm_attrs->flags & + MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP) ? + MLX5_FPGA_IPSEC_SA_ESN_OVERLAP : 0; + hw_sa->esn = htonl(xfrm_attrs->esn); + } else { + hw_sa->ipsec_sa_v1.flags &= ~MLX5_FPGA_IPSEC_SA_ESN_EN; + hw_sa->ipsec_sa_v1.flags &= + ~(xfrm_attrs->flags & + MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP) ? 
+ MLX5_FPGA_IPSEC_SA_ESN_OVERLAP : 0; + hw_sa->esn = 0; + } + + /* rx handle */ + hw_sa->ipsec_sa_v1.sw_sa_handle = htonl(xfrm_attrs->sa_handle); + + /* enc mode */ + switch (aes_gcm->key_len) { + case 128: + hw_sa->ipsec_sa_v1.enc_mode = + MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_128_AUTH_128; + break; + case 256: + hw_sa->ipsec_sa_v1.enc_mode = + MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_256_AUTH_128; + break; + } + + /* flags */ + hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_SA_VALID | + MLX5_FPGA_IPSEC_SA_SPI_EN | + MLX5_FPGA_IPSEC_SA_IP_ESP; + + if (xfrm_attrs->action & MLX5_ACCEL_ESP_ACTION_ENCRYPT) + hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_DIR_SX; + else + hw_sa->ipsec_sa_v1.flags &= ~MLX5_FPGA_IPSEC_SA_DIR_SX; +} + +static void +mlx5_fpga_ipsec_build_hw_sa(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm_attrs *xfrm_attrs, + const __be32 saddr[4], + const __be32 daddr[4], + const __be32 spi, bool is_ipv6, + struct mlx5_ifc_fpga_ipsec_sa *hw_sa) +{ + mlx5_fpga_ipsec_build_hw_xfrm(mdev, xfrm_attrs, hw_sa); + + /* IPs */ + memcpy(hw_sa->ipsec_sa_v1.sip, saddr, sizeof(hw_sa->ipsec_sa_v1.sip)); + memcpy(hw_sa->ipsec_sa_v1.dip, daddr, sizeof(hw_sa->ipsec_sa_v1.dip)); + + /* SPI */ + hw_sa->ipsec_sa_v1.spi = spi; + + /* flags */ + if (is_ipv6) + hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_IPV6; +} + +static bool is_full_mask(const void *p, size_t len) +{ + WARN_ON(len % 4); + + return !memchr_inv(p, 0xff, len); +} + +static bool validate_fpga_full_mask(struct mlx5_core_dev *dev, + const u32 *match_c, + const u32 *match_v) +{ + const void *misc_params_c = MLX5_ADDR_OF(fte_match_param, + match_c, + misc_parameters); + const void *headers_c = MLX5_ADDR_OF(fte_match_param, + match_c, + outer_headers); + const void *headers_v = MLX5_ADDR_OF(fte_match_param, + match_v, + outer_headers); + + if (mlx5_fs_is_outer_ipv4_flow(dev, headers_c, headers_v)) { + const void *s_ipv4_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4, + headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + const void *d_ipv4_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4, + headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + + if (!is_full_mask(s_ipv4_c, MLX5_FLD_SZ_BYTES(ipv4_layout, + ipv4)) || + !is_full_mask(d_ipv4_c, MLX5_FLD_SZ_BYTES(ipv4_layout, + ipv4))) + return false; + } else { + const void *s_ipv6_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4, + headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6); + const void *d_ipv6_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4, + headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6); + + if (!is_full_mask(s_ipv6_c, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)) || + !is_full_mask(d_ipv6_c, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6))) + return false; + } + + if (!is_full_mask(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c, + outer_esp_spi), + MLX5_FLD_SZ_BYTES(fte_match_set_misc, outer_esp_spi))) + return false; + + return true; +} + +static bool mlx5_is_fpga_ipsec_rule(struct mlx5_core_dev *dev, + u8 match_criteria_enable, + const u32 *match_c, + const u32 *match_v) +{ + u32 ipsec_dev_caps = mlx5_accel_ipsec_device_caps(dev); + bool ipv6_flow; + + ipv6_flow = mlx5_fs_is_outer_ipv6_flow(dev, match_c, match_v); + + if (!(match_criteria_enable & MLX5_MATCH_OUTER_HEADERS) || + mlx5_fs_is_outer_udp_flow(match_c, match_v) || + mlx5_fs_is_outer_tcp_flow(match_c, match_v) || + mlx5_fs_is_vxlan_flow(match_c) || + !(mlx5_fs_is_outer_ipv4_flow(dev, match_c, match_v) || + ipv6_flow)) + return false; + + if (!(ipsec_dev_caps & MLX5_ACCEL_IPSEC_CAP_DEVICE)) + return false; + + if (!(ipsec_dev_caps & MLX5_ACCEL_IPSEC_CAP_ESP) && + 
mlx5_fs_is_outer_ipsec_flow(match_c)) + return false; + + if (!(ipsec_dev_caps & MLX5_ACCEL_IPSEC_CAP_IPV6) && + ipv6_flow) + return false; + + if (!validate_fpga_full_mask(dev, match_c, match_v)) + return false; + + return true; +} + +static bool mlx5_is_fpga_egress_ipsec_rule(struct mlx5_core_dev *dev, + u8 match_criteria_enable, + const u32 *match_c, + const u32 *match_v, + struct mlx5_flow_act *flow_act) +{ + const void *outer_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + bool is_dmac = MLX5_GET(fte_match_set_lyr_2_4, outer_c, dmac_47_16) || + MLX5_GET(fte_match_set_lyr_2_4, outer_c, dmac_15_0); + bool is_smac = MLX5_GET(fte_match_set_lyr_2_4, outer_c, smac_47_16) || + MLX5_GET(fte_match_set_lyr_2_4, outer_c, smac_15_0); + int ret; + + ret = mlx5_is_fpga_ipsec_rule(dev, match_criteria_enable, match_c, + match_v); + if (!ret) + return ret; + + if (is_dmac || is_smac || + (match_criteria_enable & + ~(MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS)) || + (flow_act->action & ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | MLX5_FLOW_CONTEXT_ACTION_ALLOW)) || + flow_act->has_flow_tag) + return false; + + return true; +} + +void *mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *accel_xfrm, + const __be32 saddr[4], + const __be32 daddr[4], + const __be32 spi, bool is_ipv6) +{ + struct mlx5_fpga_ipsec_sa_ctx *sa_ctx; + struct mlx5_fpga_esp_xfrm *fpga_xfrm = + container_of(accel_xfrm, typeof(*fpga_xfrm), + accel_xfrm); + struct mlx5_fpga_device *fdev = mdev->fpga; + struct mlx5_fpga_ipsec *fipsec = fdev->ipsec; + int opcode, err; + void *context; + + /* alloc SA */ + sa_ctx = kzalloc(sizeof(*sa_ctx), GFP_KERNEL); + if (!sa_ctx) + return ERR_PTR(-ENOMEM); + + sa_ctx->dev = mdev; + + /* build candidate SA */ + mlx5_fpga_ipsec_build_hw_sa(mdev, &accel_xfrm->attrs, + saddr, daddr, spi, is_ipv6, + &sa_ctx->hw_sa); + + mutex_lock(&fpga_xfrm->lock); + + if (fpga_xfrm->sa_ctx) { /* multiple rules for same accel_xfrm */ + /* all rules must be with same IPs and SPI */ + if (memcmp(&sa_ctx->hw_sa, &fpga_xfrm->sa_ctx->hw_sa, + sizeof(sa_ctx->hw_sa))) { + context = ERR_PTR(-EINVAL); + goto exists; + } + + ++fpga_xfrm->num_rules; + context = fpga_xfrm->sa_ctx; + goto exists; + } + + /* This is unbounded fpga_xfrm, try to add to hash */ + mutex_lock(&fipsec->sa_hash_lock); + + err = rhashtable_lookup_insert_fast(&fipsec->sa_hash, &sa_ctx->hash, + rhash_sa); + if (err) { + /* Can't bound different accel_xfrm to already existing sa_ctx. + * This is because we can't support multiple ketmats for + * same IPs and SPI + */ + context = ERR_PTR(-EEXIST); + goto unlock_hash; + } + + /* Bound accel_xfrm to sa_ctx */ + opcode = is_v2_sadb_supported(fdev->ipsec) ? 
+ MLX5_FPGA_IPSEC_CMD_OP_ADD_SA_V2 : + MLX5_FPGA_IPSEC_CMD_OP_ADD_SA; + err = mlx5_fpga_ipsec_update_hw_sa(fdev, &sa_ctx->hw_sa, opcode); + sa_ctx->hw_sa.ipsec_sa_v1.cmd = 0; + if (err) { + context = ERR_PTR(err); + goto delete_hash; + } + + mutex_unlock(&fipsec->sa_hash_lock); + + ++fpga_xfrm->num_rules; + fpga_xfrm->sa_ctx = sa_ctx; + sa_ctx->fpga_xfrm = fpga_xfrm; + + mutex_unlock(&fpga_xfrm->lock); + + return sa_ctx; + +delete_hash: + WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, &sa_ctx->hash, + rhash_sa)); +unlock_hash: + mutex_unlock(&fipsec->sa_hash_lock); + +exists: + mutex_unlock(&fpga_xfrm->lock); + kfree(sa_ctx); + return context; +} + +static void * +mlx5_fpga_ipsec_fs_create_sa_ctx(struct mlx5_core_dev *mdev, + struct fs_fte *fte, + bool is_egress) +{ + struct mlx5_accel_esp_xfrm *accel_xfrm; + __be32 saddr[4], daddr[4], spi; + struct mlx5_flow_group *fg; + bool is_ipv6 = false; + + fs_get_obj(fg, fte->node.parent); + /* validate */ + if (is_egress && + !mlx5_is_fpga_egress_ipsec_rule(mdev, + fg->mask.match_criteria_enable, + fg->mask.match_criteria, + fte->val, + &fte->action)) + return ERR_PTR(-EINVAL); + else if (!mlx5_is_fpga_ipsec_rule(mdev, + fg->mask.match_criteria_enable, + fg->mask.match_criteria, + fte->val)) + return ERR_PTR(-EINVAL); + + /* get xfrm context */ + accel_xfrm = + (struct mlx5_accel_esp_xfrm *)fte->action.esp_id; + + /* IPs */ + if (mlx5_fs_is_outer_ipv4_flow(mdev, fg->mask.match_criteria, + fte->val)) { + memcpy(&saddr[3], + MLX5_ADDR_OF(fte_match_set_lyr_2_4, + fte->val, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + sizeof(saddr[3])); + memcpy(&daddr[3], + MLX5_ADDR_OF(fte_match_set_lyr_2_4, + fte->val, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + sizeof(daddr[3])); + } else { + memcpy(saddr, + MLX5_ADDR_OF(fte_match_param, + fte->val, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + sizeof(saddr)); + memcpy(daddr, + MLX5_ADDR_OF(fte_match_param, + fte->val, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(daddr)); + is_ipv6 = true; + } + + /* SPI */ + spi = MLX5_GET_BE(typeof(spi), + fte_match_param, fte->val, + misc_parameters.outer_esp_spi); + + /* create */ + return mlx5_fpga_ipsec_create_sa_ctx(mdev, accel_xfrm, + saddr, daddr, + spi, is_ipv6); +} + +static void +mlx5_fpga_ipsec_release_sa_ctx(struct mlx5_fpga_ipsec_sa_ctx *sa_ctx) +{ + struct mlx5_fpga_device *fdev = sa_ctx->dev->fpga; + struct mlx5_fpga_ipsec *fipsec = fdev->ipsec; + int opcode = is_v2_sadb_supported(fdev->ipsec) ? 
+ MLX5_FPGA_IPSEC_CMD_OP_DEL_SA_V2 : + MLX5_FPGA_IPSEC_CMD_OP_DEL_SA; + int err; + + err = mlx5_fpga_ipsec_update_hw_sa(fdev, &sa_ctx->hw_sa, opcode); + sa_ctx->hw_sa.ipsec_sa_v1.cmd = 0; + if (err) { + WARN_ON(err); + return; + } + + mutex_lock(&fipsec->sa_hash_lock); + WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, &sa_ctx->hash, + rhash_sa)); + mutex_unlock(&fipsec->sa_hash_lock); +} + +void mlx5_fpga_ipsec_delete_sa_ctx(void *context) +{ + struct mlx5_fpga_esp_xfrm *fpga_xfrm = + ((struct mlx5_fpga_ipsec_sa_ctx *)context)->fpga_xfrm; + + mutex_lock(&fpga_xfrm->lock); + if (!--fpga_xfrm->num_rules) { + mlx5_fpga_ipsec_release_sa_ctx(fpga_xfrm->sa_ctx); + fpga_xfrm->sa_ctx = NULL; + } + mutex_unlock(&fpga_xfrm->lock); +} + +static inline struct mlx5_fpga_ipsec_rule * +_rule_search(struct rb_root *root, struct fs_fte *fte) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct mlx5_fpga_ipsec_rule *rule = + container_of(node, struct mlx5_fpga_ipsec_rule, + node); + + if (rule->fte < fte) + node = node->rb_left; + else if (rule->fte > fte) + node = node->rb_right; + else + return rule; + } + return NULL; +} + +static struct mlx5_fpga_ipsec_rule * +rule_search(struct mlx5_fpga_ipsec *ipsec_dev, struct fs_fte *fte) +{ + struct mlx5_fpga_ipsec_rule *rule; + + mutex_lock(&ipsec_dev->rules_rb_lock); + rule = _rule_search(&ipsec_dev->rules_rb, fte); + mutex_unlock(&ipsec_dev->rules_rb_lock); + + return rule; +} + +static inline int _rule_insert(struct rb_root *root, + struct mlx5_fpga_ipsec_rule *rule) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct mlx5_fpga_ipsec_rule *this = + container_of(*new, struct mlx5_fpga_ipsec_rule, + node); + + parent = *new; + if (rule->fte < this->fte) + new = &((*new)->rb_left); + else if (rule->fte > this->fte) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. 
*/ + rb_link_node(&rule->node, parent, new); + rb_insert_color(&rule->node, root); + + return 0; +} + +static int rule_insert(struct mlx5_fpga_ipsec *ipsec_dev, + struct mlx5_fpga_ipsec_rule *rule) +{ + int ret; + + mutex_lock(&ipsec_dev->rules_rb_lock); + ret = _rule_insert(&ipsec_dev->rules_rb, rule); + mutex_unlock(&ipsec_dev->rules_rb_lock); + + return ret; +} + +static inline void _rule_delete(struct mlx5_fpga_ipsec *ipsec_dev, + struct mlx5_fpga_ipsec_rule *rule) +{ + struct rb_root *root = &ipsec_dev->rules_rb; + + mutex_lock(&ipsec_dev->rules_rb_lock); + rb_erase(&rule->node, root); + mutex_unlock(&ipsec_dev->rules_rb_lock); +} + +static void rule_delete(struct mlx5_fpga_ipsec *ipsec_dev, + struct mlx5_fpga_ipsec_rule *rule) +{ + _rule_delete(ipsec_dev, rule); + kfree(rule); +} + +struct mailbox_mod { + uintptr_t saved_esp_id; + u32 saved_action; + u32 saved_outer_esp_spi_value; +}; + +static void restore_spec_mailbox(struct fs_fte *fte, + struct mailbox_mod *mbox_mod) +{ + char *misc_params_v = MLX5_ADDR_OF(fte_match_param, + fte->val, + misc_parameters); + + MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi, + mbox_mod->saved_outer_esp_spi_value); + fte->action.action |= mbox_mod->saved_action; + fte->action.esp_id = (uintptr_t)mbox_mod->saved_esp_id; +} + +static void modify_spec_mailbox(struct mlx5_core_dev *mdev, + struct fs_fte *fte, + struct mailbox_mod *mbox_mod) +{ + char *misc_params_v = MLX5_ADDR_OF(fte_match_param, + fte->val, + misc_parameters); + + mbox_mod->saved_esp_id = fte->action.esp_id; + mbox_mod->saved_action = fte->action.action & + (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT); + mbox_mod->saved_outer_esp_spi_value = + MLX5_GET(fte_match_set_misc, misc_params_v, + outer_esp_spi); + + fte->action.esp_id = 0; + fte->action.action &= ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT); + if (!MLX5_CAP_FLOWTABLE(mdev, + flow_table_properties_nic_receive.ft_field_support.outer_esp_spi)) + MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi, 0); +} + +static enum fs_flow_table_type egress_to_fs_ft(bool egress) +{ + return egress ? 
FS_FT_NIC_TX : FS_FT_NIC_RX; +} + +static int fpga_ipsec_fs_create_flow_group(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + u32 *in, + unsigned int *group_id, + bool is_egress) +{ + int (*create_flow_group)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, u32 *in, + unsigned int *group_id) = + mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->create_flow_group; + char *misc_params_c = MLX5_ADDR_OF(create_flow_group_in, in, + match_criteria.misc_parameters); + u32 saved_outer_esp_spi_mask; + u8 match_criteria_enable; + int ret; + + if (MLX5_CAP_FLOWTABLE(dev, + flow_table_properties_nic_receive.ft_field_support.outer_esp_spi)) + return create_flow_group(dev, ft, in, group_id); + + match_criteria_enable = + MLX5_GET(create_flow_group_in, in, match_criteria_enable); + saved_outer_esp_spi_mask = + MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); + if (!match_criteria_enable || !saved_outer_esp_spi_mask) + return create_flow_group(dev, ft, in, group_id); + + MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, 0); + + if (!(*misc_params_c) && + !memcmp(misc_params_c, misc_params_c + 1, MLX5_ST_SZ_BYTES(fte_match_set_misc) - 1)) + MLX5_SET(create_flow_group_in, in, match_criteria_enable, + match_criteria_enable & ~MLX5_MATCH_MISC_PARAMETERS); + + ret = create_flow_group(dev, ft, in, group_id); + + MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, saved_outer_esp_spi_mask); + MLX5_SET(create_flow_group_in, in, match_criteria_enable, match_criteria_enable); + + return ret; +} + +static int fpga_ipsec_fs_create_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + struct fs_fte *fte, + bool is_egress) +{ + int (*create_fte)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + struct fs_fte *fte) = + mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->create_fte; + struct mlx5_fpga_device *fdev = dev->fpga; + struct mlx5_fpga_ipsec *fipsec = fdev->ipsec; + struct mlx5_fpga_ipsec_rule *rule; + bool is_esp = fte->action.esp_id; + struct mailbox_mod mbox_mod; + int ret; + + if (!is_esp || + !(fte->action.action & + (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT))) + return create_fte(dev, ft, fg, fte); + + rule = kzalloc(sizeof(*rule), GFP_KERNEL); + if (!rule) + return -ENOMEM; + + rule->ctx = mlx5_fpga_ipsec_fs_create_sa_ctx(dev, fte, is_egress); + if (IS_ERR(rule->ctx)) { + int err = PTR_ERR(rule->ctx); + kfree(rule); + return err; + } + + rule->fte = fte; + WARN_ON(rule_insert(fipsec, rule)); + + modify_spec_mailbox(dev, fte, &mbox_mod); + ret = create_fte(dev, ft, fg, fte); + restore_spec_mailbox(fte, &mbox_mod); + if (ret) { + _rule_delete(fipsec, rule); + mlx5_fpga_ipsec_delete_sa_ctx(rule->ctx); + kfree(rule); + } + + return ret; +} + +static int fpga_ipsec_fs_update_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id, + int modify_mask, + struct fs_fte *fte, + bool is_egress) +{ + int (*update_fte)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id, + int modify_mask, + struct fs_fte *fte) = + mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->update_fte; + bool is_esp = fte->action.esp_id; + struct mailbox_mod mbox_mod; + int ret; + + if (!is_esp || + !(fte->action.action & + (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT))) + return update_fte(dev, ft, group_id, modify_mask, fte); + + modify_spec_mailbox(dev, fte, &mbox_mod); + ret = update_fte(dev, ft, 
group_id, modify_mask, fte); + restore_spec_mailbox(fte, &mbox_mod); + + return ret; +} + +static int fpga_ipsec_fs_delete_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct fs_fte *fte, + bool is_egress) +{ + int (*delete_fte)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct fs_fte *fte) = + mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->delete_fte; + struct mlx5_fpga_device *fdev = dev->fpga; + struct mlx5_fpga_ipsec *fipsec = fdev->ipsec; + struct mlx5_fpga_ipsec_rule *rule; + bool is_esp = fte->action.esp_id; + struct mailbox_mod mbox_mod; + int ret; + + if (!is_esp || + !(fte->action.action & + (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT))) + return delete_fte(dev, ft, fte); + + rule = rule_search(fipsec, fte); + if (!rule) + return -ENOENT; + + mlx5_fpga_ipsec_delete_sa_ctx(rule->ctx); + rule_delete(fipsec, rule); + + modify_spec_mailbox(dev, fte, &mbox_mod); + ret = delete_fte(dev, ft, fte); + restore_spec_mailbox(fte, &mbox_mod); + + return ret; +} + +static int +mlx5_fpga_ipsec_fs_create_flow_group_egress(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + u32 *in, + unsigned int *group_id) +{ + return fpga_ipsec_fs_create_flow_group(dev, ft, in, group_id, true); +} + +static int +mlx5_fpga_ipsec_fs_create_fte_egress(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_create_fte(dev, ft, fg, fte, true); +} + +static int +mlx5_fpga_ipsec_fs_update_fte_egress(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id, + int modify_mask, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_update_fte(dev, ft, group_id, modify_mask, fte, + true); +} + +static int +mlx5_fpga_ipsec_fs_delete_fte_egress(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_delete_fte(dev, ft, fte, true); +} + +static int +mlx5_fpga_ipsec_fs_create_flow_group_ingress(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + u32 *in, + unsigned int *group_id) +{ + return fpga_ipsec_fs_create_flow_group(dev, ft, in, group_id, false); +} + +static int +mlx5_fpga_ipsec_fs_create_fte_ingress(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_create_fte(dev, ft, fg, fte, false); +} + +static int +mlx5_fpga_ipsec_fs_update_fte_ingress(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id, + int modify_mask, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_update_fte(dev, ft, group_id, modify_mask, fte, + false); +} + +static int +mlx5_fpga_ipsec_fs_delete_fte_ingress(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_delete_fte(dev, ft, fte, false); +} + +static struct mlx5_flow_cmds fpga_ipsec_ingress; +static struct mlx5_flow_cmds fpga_ipsec_egress; + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type) +{ + switch (type) { + case FS_FT_NIC_RX: + return &fpga_ipsec_ingress; + case FS_FT_NIC_TX: + return &fpga_ipsec_egress; + default: + WARN_ON(true); + return NULL; + } +} + +int mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_conn_attr init_attr = {0}; + struct mlx5_fpga_device *fdev = mdev->fpga; + struct mlx5_fpga_conn *conn; + int err; + + if (!mlx5_fpga_is_ipsec_device(mdev)) + return 0; + + fdev->ipsec = kzalloc(sizeof(*fdev->ipsec), GFP_KERNEL); + if 
(!fdev->ipsec) + return -ENOMEM; + + fdev->ipsec->fdev = fdev; + + err = mlx5_fpga_get_sbu_caps(fdev, sizeof(fdev->ipsec->caps), + fdev->ipsec->caps); + if (err) { + mlx5_fpga_err(fdev, "Failed to retrieve IPSec extended capabilities: %d\n", + err); + goto error; + } + + INIT_LIST_HEAD(&fdev->ipsec->pending_cmds); + spin_lock_init(&fdev->ipsec->pending_cmds_lock); + + init_attr.rx_size = SBU_QP_QUEUE_SIZE; + init_attr.tx_size = SBU_QP_QUEUE_SIZE; + init_attr.recv_cb = mlx5_fpga_ipsec_recv; + init_attr.cb_arg = fdev; + conn = mlx5_fpga_sbu_conn_create(fdev, &init_attr); + if (IS_ERR(conn)) { + err = PTR_ERR(conn); + mlx5_fpga_err(fdev, "Error creating IPSec command connection %d\n", + err); + goto error; + } + fdev->ipsec->conn = conn; + + err = rhashtable_init(&fdev->ipsec->sa_hash, &rhash_sa); + if (err) + goto err_destroy_conn; + mutex_init(&fdev->ipsec->sa_hash_lock); + + fdev->ipsec->rules_rb = RB_ROOT; + mutex_init(&fdev->ipsec->rules_rb_lock); + + err = mlx5_fpga_ipsec_enable_supported_caps(mdev); + if (err) { + mlx5_fpga_err(fdev, "Failed to enable IPSec extended capabilities: %d\n", + err); + goto err_destroy_hash; + } + + return 0; + +err_destroy_hash: + rhashtable_destroy(&fdev->ipsec->sa_hash); + +err_destroy_conn: + mlx5_fpga_sbu_conn_destroy(conn); + +error: + kfree(fdev->ipsec); + fdev->ipsec = NULL; + return err; +} + +static void destroy_rules_rb(struct rb_root *root) +{ + struct mlx5_fpga_ipsec_rule *r, *tmp; + + rbtree_postorder_for_each_entry_safe(r, tmp, root, node) { + rb_erase(&r->node, root); + mlx5_fpga_ipsec_delete_sa_ctx(r->ctx); + kfree(r); + } +} + +void mlx5_fpga_ipsec_cleanup(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + + if (!mlx5_fpga_is_ipsec_device(mdev)) + return; + + destroy_rules_rb(&fdev->ipsec->rules_rb); + rhashtable_destroy(&fdev->ipsec->sa_hash); + + mlx5_fpga_sbu_conn_destroy(fdev->ipsec->conn); + kfree(fdev->ipsec); + fdev->ipsec = NULL; +} + +void mlx5_fpga_ipsec_build_fs_cmds(void) +{ + /* ingress */ + fpga_ipsec_ingress.create_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->create_flow_table; + fpga_ipsec_ingress.destroy_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->destroy_flow_table; + fpga_ipsec_ingress.modify_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->modify_flow_table; + fpga_ipsec_ingress.create_flow_group = + mlx5_fpga_ipsec_fs_create_flow_group_ingress; + fpga_ipsec_ingress.destroy_flow_group = + mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->destroy_flow_group; + fpga_ipsec_ingress.create_fte = + mlx5_fpga_ipsec_fs_create_fte_ingress; + fpga_ipsec_ingress.update_fte = + mlx5_fpga_ipsec_fs_update_fte_ingress; + fpga_ipsec_ingress.delete_fte = + mlx5_fpga_ipsec_fs_delete_fte_ingress; + fpga_ipsec_ingress.update_root_ft = + mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->update_root_ft; + + /* egress */ + fpga_ipsec_egress.create_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->create_flow_table; + fpga_ipsec_egress.destroy_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->destroy_flow_table; + fpga_ipsec_egress.modify_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->modify_flow_table; + fpga_ipsec_egress.create_flow_group = + mlx5_fpga_ipsec_fs_create_flow_group_egress; + fpga_ipsec_egress.destroy_flow_group = + mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->destroy_flow_group; + fpga_ipsec_egress.create_fte = + mlx5_fpga_ipsec_fs_create_fte_egress; + fpga_ipsec_egress.update_fte = + 
mlx5_fpga_ipsec_fs_update_fte_egress; + fpga_ipsec_egress.delete_fte = + mlx5_fpga_ipsec_fs_delete_fte_egress; + fpga_ipsec_egress.update_root_ft = + mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->update_root_ft; +} + +static int +mlx5_fpga_esp_validate_xfrm_attrs(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + if (attrs->tfc_pad) { + mlx5_core_err(mdev, "Cannot offload xfrm states with tfc padding\n"); + return -EOPNOTSUPP; + } + + if (attrs->replay_type != MLX5_ACCEL_ESP_REPLAY_NONE) { + mlx5_core_err(mdev, "Cannot offload xfrm states with anti replay\n"); + return -EOPNOTSUPP; + } + + if (attrs->keymat_type != MLX5_ACCEL_ESP_KEYMAT_AES_GCM) { + mlx5_core_err(mdev, "Only aes gcm keymat is supported\n"); + return -EOPNOTSUPP; + } + + if (attrs->keymat.aes_gcm.iv_algo != + MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ) { + mlx5_core_err(mdev, "Only iv sequence algo is supported\n"); + return -EOPNOTSUPP; + } + + if (attrs->keymat.aes_gcm.icv_len != 128) { + mlx5_core_err(mdev, "Cannot offload xfrm states with AEAD ICV length other than 128bit\n"); + return -EOPNOTSUPP; + } + + if (attrs->keymat.aes_gcm.key_len != 128 && + attrs->keymat.aes_gcm.key_len != 256) { + mlx5_core_err(mdev, "Cannot offload xfrm states with AEAD key length other than 128/256 bit\n"); + return -EOPNOTSUPP; + } + + if ((attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED) && + (!MLX5_GET(ipsec_extended_cap, mdev->fpga->ipsec->caps, + v2_command))) { + mlx5_core_err(mdev, "Cannot offload xfrm states with AEAD key length other than 128/256 bit\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +struct mlx5_accel_esp_xfrm * +mlx5_fpga_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags) +{ + struct mlx5_fpga_esp_xfrm *fpga_xfrm; + + if (!(flags & MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA)) { + mlx5_core_warn(mdev, "Tried to create an esp action without metadata\n"); + return ERR_PTR(-EINVAL); + } + + if (mlx5_fpga_esp_validate_xfrm_attrs(mdev, attrs)) { + mlx5_core_warn(mdev, "Tried to create an esp with unsupported attrs\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + fpga_xfrm = kzalloc(sizeof(*fpga_xfrm), GFP_KERNEL); + if (!fpga_xfrm) + return ERR_PTR(-ENOMEM); + + mutex_init(&fpga_xfrm->lock); + memcpy(&fpga_xfrm->accel_xfrm.attrs, attrs, + sizeof(fpga_xfrm->accel_xfrm.attrs)); + + return &fpga_xfrm->accel_xfrm; +} + +void mlx5_fpga_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) +{ + struct mlx5_fpga_esp_xfrm *fpga_xfrm = + container_of(xfrm, struct mlx5_fpga_esp_xfrm, + accel_xfrm); + /* assuming no sa_ctx are connected to this xfrm_ctx */ + kfree(fpga_xfrm); +} + +int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + struct mlx5_core_dev *mdev = xfrm->mdev; + struct mlx5_fpga_device *fdev = mdev->fpga; + struct mlx5_fpga_ipsec *fipsec = fdev->ipsec; + struct mlx5_fpga_esp_xfrm *fpga_xfrm; + struct mlx5_ifc_fpga_ipsec_sa org_hw_sa; + + int err = 0; + + if (!memcmp(&xfrm->attrs, attrs, sizeof(xfrm->attrs))) + return 0; + + if (!mlx5_fpga_esp_validate_xfrm_attrs(mdev, attrs)) { + mlx5_core_warn(mdev, "Tried to create an esp with unsupported attrs\n"); + return -EOPNOTSUPP; + } + + if (is_v2_sadb_supported(fipsec)) { + mlx5_core_warn(mdev, "Modify esp is not supported\n"); + return -EOPNOTSUPP; + } + + fpga_xfrm = container_of(xfrm, struct mlx5_fpga_esp_xfrm, accel_xfrm); + + mutex_lock(&fpga_xfrm->lock); + + if (!fpga_xfrm->sa_ctx) + /* Unbounded xfrm, chane only sw attrs */ + goto 
change_sw_xfrm_attrs; + + /* copy original hw sa */ + memcpy(&org_hw_sa, &fpga_xfrm->sa_ctx->hw_sa, sizeof(org_hw_sa)); + mutex_lock(&fipsec->sa_hash_lock); + /* remove original hw sa from hash */ + WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, + &fpga_xfrm->sa_ctx->hash, rhash_sa)); + /* update hw_sa with new xfrm attrs*/ + mlx5_fpga_ipsec_build_hw_xfrm(xfrm->mdev, attrs, + &fpga_xfrm->sa_ctx->hw_sa); + /* try to insert new hw_sa to hash */ + err = rhashtable_insert_fast(&fipsec->sa_hash, + &fpga_xfrm->sa_ctx->hash, rhash_sa); + if (err) + goto rollback_sa; + + /* modify device with new hw_sa */ + err = mlx5_fpga_ipsec_update_hw_sa(fdev, &fpga_xfrm->sa_ctx->hw_sa, + MLX5_FPGA_IPSEC_CMD_OP_MOD_SA_V2); + fpga_xfrm->sa_ctx->hw_sa.ipsec_sa_v1.cmd = 0; + if (err) + WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, + &fpga_xfrm->sa_ctx->hash, + rhash_sa)); +rollback_sa: + if (err) { + /* return original hw_sa to hash */ + memcpy(&fpga_xfrm->sa_ctx->hw_sa, &org_hw_sa, + sizeof(org_hw_sa)); + WARN_ON(rhashtable_insert_fast(&fipsec->sa_hash, + &fpga_xfrm->sa_ctx->hash, + rhash_sa)); + } + mutex_unlock(&fipsec->sa_hash_lock); + +change_sw_xfrm_attrs: + if (!err) + memcpy(&xfrm->attrs, attrs, sizeof(xfrm->attrs)); + mutex_unlock(&fpga_xfrm->lock); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h new file mode 100644 index 0000000..2b5e63b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef __MLX5_FPGA_IPSEC_H__ +#define __MLX5_FPGA_IPSEC_H__ + +#include "accel/ipsec.h" +#include "fs_cmd.h" + +#ifdef CONFIG_MLX5_FPGA + +u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev); +unsigned int mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev); +int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, + unsigned int counters_count); + +void *mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *accel_xfrm, + const __be32 saddr[4], + const __be32 daddr[4], + const __be32 spi, bool is_ipv6); +void mlx5_fpga_ipsec_delete_sa_ctx(void *context); + +int mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev); +void mlx5_fpga_ipsec_cleanup(struct mlx5_core_dev *mdev); +void mlx5_fpga_ipsec_build_fs_cmds(void); + +struct mlx5_accel_esp_xfrm * +mlx5_fpga_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags); +void mlx5_fpga_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm); +int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs); + +const struct mlx5_flow_cmds * +mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type); + +#else + +static inline u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev) +{ + return 0; +} + +static inline unsigned int +mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev) +{ + return 0; +} + +static inline int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev, + u64 *counters) +{ + return 0; +} + +static inline void * +mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *accel_xfrm, + const __be32 saddr[4], + const __be32 daddr[4], + const __be32 spi, bool is_ipv6) +{ + return NULL; +} + +static inline void mlx5_fpga_ipsec_delete_sa_ctx(void *context) +{ +} + +static inline int mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev) +{ + return 0; +} + +static inline void mlx5_fpga_ipsec_cleanup(struct mlx5_core_dev *mdev) +{ +} + +static inline void mlx5_fpga_ipsec_build_fs_cmds(void) +{ +} + +static inline struct mlx5_accel_esp_xfrm * +mlx5_fpga_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void mlx5_fpga_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) +{ +} + +static inline int +mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + return -EOPNOTSUPP; +} + +static inline const struct mlx5_flow_cmds * +mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type) +{ + return mlx5_fs_cmd_get_default(type); +} + +#endif /* CONFIG_MLX5_FPGA */ + +#endif /* __MLX5_FPGA_SADB_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c new file mode 100644 index 0000000..1496296 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include + +#include "fpga/core.h" +#include "fpga/conn.h" +#include "fpga/sdk.h" + +struct mlx5_fpga_conn * +mlx5_fpga_sbu_conn_create(struct mlx5_fpga_device *fdev, + struct mlx5_fpga_conn_attr *attr) +{ + return mlx5_fpga_conn_create(fdev, attr, MLX5_FPGA_QPC_QP_TYPE_SANDBOX_QP); +} +EXPORT_SYMBOL(mlx5_fpga_sbu_conn_create); + +void mlx5_fpga_sbu_conn_destroy(struct mlx5_fpga_conn *conn) +{ + mlx5_fpga_conn_destroy(conn); +} +EXPORT_SYMBOL(mlx5_fpga_sbu_conn_destroy); + +int mlx5_fpga_sbu_conn_sendmsg(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + return mlx5_fpga_conn_send(conn, buf); +} +EXPORT_SYMBOL(mlx5_fpga_sbu_conn_sendmsg); + +static int mlx5_fpga_mem_read_i2c(struct mlx5_fpga_device *fdev, size_t size, + u64 addr, u8 *buf) +{ + size_t max_size = MLX5_FPGA_ACCESS_REG_SIZE_MAX; + size_t bytes_done = 0; + u8 actual_size; + int err; + + if (!size) + return -EINVAL; + + if (!fdev->mdev) + return -ENOTCONN; + + while (bytes_done < size) { + actual_size = min(max_size, (size - bytes_done)); + + err = mlx5_fpga_access_reg(fdev->mdev, actual_size, + addr + bytes_done, + buf + bytes_done, false); + if (err) { + mlx5_fpga_err(fdev, "Failed to read over I2C: %d\n", + err); + break; + } + + bytes_done += actual_size; + } + + return err; +} + +static int mlx5_fpga_mem_write_i2c(struct mlx5_fpga_device *fdev, size_t size, + u64 addr, u8 *buf) +{ + size_t max_size = MLX5_FPGA_ACCESS_REG_SIZE_MAX; + size_t bytes_done = 0; + u8 actual_size; + int err; + + if (!size) + return -EINVAL; + + if (!fdev->mdev) + return -ENOTCONN; + + while (bytes_done < size) { + actual_size = min(max_size, (size - bytes_done)); + + err = mlx5_fpga_access_reg(fdev->mdev, actual_size, + addr + bytes_done, + buf + bytes_done, true); + if (err) { + mlx5_fpga_err(fdev, "Failed to write FPGA crspace\n"); + break; + } + + bytes_done += actual_size; + } + + return err; +} + +int mlx5_fpga_mem_read(struct mlx5_fpga_device *fdev, size_t size, u64 addr, + void *buf, enum mlx5_fpga_access_type access_type) +{ + int ret; + + switch (access_type) { + case MLX5_FPGA_ACCESS_TYPE_I2C: + ret = mlx5_fpga_mem_read_i2c(fdev, size, addr, buf); + if (ret) + return ret; + break; + default: + mlx5_fpga_warn(fdev, "Unexpected read access_type %u\n", + access_type); + return -EACCES; + } + + 
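+ /* on success, report the number of bytes read rather than 0 */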
return size; +} +EXPORT_SYMBOL(mlx5_fpga_mem_read); + +int mlx5_fpga_mem_write(struct mlx5_fpga_device *fdev, size_t size, u64 addr, + void *buf, enum mlx5_fpga_access_type access_type) +{ + int ret; + + switch (access_type) { + case MLX5_FPGA_ACCESS_TYPE_I2C: + ret = mlx5_fpga_mem_write_i2c(fdev, size, addr, buf); + if (ret) + return ret; + break; + default: + mlx5_fpga_warn(fdev, "Unexpected write access_type %u\n", + access_type); + return -EACCES; + } + + return size; +} +EXPORT_SYMBOL(mlx5_fpga_mem_write); + +int mlx5_fpga_get_sbu_caps(struct mlx5_fpga_device *fdev, int size, void *buf) +{ + return mlx5_fpga_sbu_caps(fdev->mdev, buf, size); +} +EXPORT_SYMBOL(mlx5_fpga_get_sbu_caps); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h new file mode 100644 index 0000000..baa537e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef MLX5_FPGA_SDK_H +#define MLX5_FPGA_SDK_H + +#include +#include + +/** + * DOC: Innova SDK + * This header defines the in-kernel API for Innova FPGA client drivers. + */ + +enum mlx5_fpga_access_type { + MLX5_FPGA_ACCESS_TYPE_I2C = 0x0, + MLX5_FPGA_ACCESS_TYPE_DONTCARE = 0x0, +}; + +struct mlx5_fpga_conn; +struct mlx5_fpga_device; + +/** + * struct mlx5_fpga_dma_entry - A scatter-gather DMA entry + */ +struct mlx5_fpga_dma_entry { + /** @data: Virtual address pointer to the data */ + void *data; + /** @size: Size in bytes of the data */ + unsigned int size; + /** @dma_addr: Private member. 
Physical DMA-mapped address of the data */ + dma_addr_t dma_addr; +}; + +/** + * struct mlx5_fpga_dma_buf - A packet buffer + * May contain up to 2 scatter-gather data entries + */ +struct mlx5_fpga_dma_buf { + /** @dma_dir: DMA direction */ + enum dma_data_direction dma_dir; + /** @sg: Scatter-gather entries pointing to the data in memory */ + struct mlx5_fpga_dma_entry sg[2]; + /** @list: Item in SQ backlog, for TX packets */ + struct list_head list; + /** + * @complete: Completion routine, for TX packets + * @conn: FPGA Connection this packet was sent to + * @fdev: FPGA device this packet was sent to + * @buf: The packet buffer + * @status: 0 if successful, or an error code otherwise + */ + void (*complete)(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_device *fdev, + struct mlx5_fpga_dma_buf *buf, u8 status); +}; + +/** + * struct mlx5_fpga_conn_attr - FPGA connection attributes + * Describes the attributes of a connection + */ +struct mlx5_fpga_conn_attr { + /** @tx_size: Size of connection TX queue, in packets */ + unsigned int tx_size; + /** @rx_size: Size of connection RX queue, in packets */ + unsigned int rx_size; + /** + * @recv_cb: Callback function which is called for received packets + * @cb_arg: The value provided in mlx5_fpga_conn_attr.cb_arg + * @buf: A buffer containing a received packet + * + * buf is guaranteed to only contain a single scatter-gather entry. + * The size of the actual packet received is specified in buf.sg[0].size + * When this callback returns, the packet buffer may be re-used for + * subsequent receives. + */ + void (*recv_cb)(void *cb_arg, struct mlx5_fpga_dma_buf *buf); + void *cb_arg; +}; + +/** + * mlx5_fpga_sbu_conn_create() - Initialize a new FPGA SBU connection + * @fdev: The FPGA device + * @attr: Attributes of the new connection + * + * Sets up a new FPGA SBU connection with the specified attributes. + * The receive callback function may be called for incoming messages even + * before this function returns. + * + * The caller must eventually destroy the connection by calling + * mlx5_fpga_sbu_conn_destroy. + * + * Return: A new connection, or ERR_PTR() error value otherwise. + */ +struct mlx5_fpga_conn * +mlx5_fpga_sbu_conn_create(struct mlx5_fpga_device *fdev, + struct mlx5_fpga_conn_attr *attr); + +/** + * mlx5_fpga_sbu_conn_destroy() - Destroy an FPGA SBU connection + * @conn: The FPGA SBU connection to destroy + * + * Cleans up an FPGA SBU connection which was previously created with + * mlx5_fpga_sbu_conn_create. + */ +void mlx5_fpga_sbu_conn_destroy(struct mlx5_fpga_conn *conn); + +/** + * mlx5_fpga_sbu_conn_sendmsg() - Queue the transmission of a packet + * @fdev: An FPGA SBU connection + * @buf: The packet buffer + * + * Queues a packet for transmission over an FPGA SBU connection. + * The buffer should not be modified or freed until completion. + * Upon completion, the buf's complete() callback is invoked, indicating the + * success or error status of the transmission. + * + * Return: 0 if successful, or an error value otherwise. + */ +int mlx5_fpga_sbu_conn_sendmsg(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf); + +/** + * mlx5_fpga_mem_read() - Read from FPGA memory address space + * @fdev: The FPGA device + * @size: Size of chunk to read, in bytes + * @addr: Starting address to read from, in FPGA address space + * @buf: Buffer to read into + * @access_type: Method for reading + * + * Reads from the specified address into the specified buffer. + * The address may point to configuration space or to DDR. 
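+ * With access_type MLX5_FPGA_ACCESS_TYPE_DONTCARE, this implementation currently performs the read over I2C.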
+ * Large reads may be performed internally as several non-atomic operations. + * This function may sleep, so should not be called from atomic contexts. + * + * Return: 0 if successful, or an error value otherwise. + */ +int mlx5_fpga_mem_read(struct mlx5_fpga_device *fdev, size_t size, u64 addr, + void *buf, enum mlx5_fpga_access_type access_type); + +/** + * mlx5_fpga_mem_write() - Write to FPGA memory address space + * @fdev: The FPGA device + * @size: Size of chunk to write, in bytes + * @addr: Starting address to write to, in FPGA address space + * @buf: Buffer which contains data to write + * @access_type: Method for writing + * + * Writes the specified buffer data to FPGA memory at the specified address. + * The address may point to configuration space or to DDR. + * Large writes may be performed internally as several non-atomic operations. + * This function may sleep, so should not be called from atomic contexts. + * + * Return: 0 if successful, or an error value otherwise. + */ +int mlx5_fpga_mem_write(struct mlx5_fpga_device *fdev, size_t size, u64 addr, + void *buf, enum mlx5_fpga_access_type access_type); + +/** + * mlx5_fpga_get_sbu_caps() - Read the SBU capabilities + * @fdev: The FPGA device + * @size: Size of the buffer to read into + * @buf: Buffer to read the capabilities into + * + * Reads the FPGA SBU capabilities into the specified buffer. + * The format of the capabilities buffer is SBU-dependent. + * + * Return: 0 if successful + * -EINVAL if the buffer is not large enough to contain SBU caps + * or any other error value otherwise. + */ +int mlx5_fpga_get_sbu_caps(struct mlx5_fpga_device *fdev, int size, void *buf); + +#endif /* MLX5_FPGA_SDK_H */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c new file mode 100644 index 0000000..ef5afd7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -0,0 +1,747 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include "fs_core.h" +#include "fs_cmd.h" +#include "mlx5_core.h" +#include "eswitch.h" + +static int mlx5_cmd_stub_update_root_ft(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + u32 underlay_qpn, + bool disconnect) +{ + return 0; +} + +static int mlx5_cmd_stub_create_flow_table(struct mlx5_core_dev *dev, + u16 vport, + enum fs_flow_table_op_mod op_mod, + enum fs_flow_table_type type, + unsigned int level, + unsigned int log_size, + struct mlx5_flow_table *next_ft, + unsigned int *table_id, u32 flags) +{ + return 0; +} + +static int mlx5_cmd_stub_destroy_flow_table(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft) +{ + return 0; +} + +static int mlx5_cmd_stub_modify_flow_table(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft) +{ + return 0; +} + +static int mlx5_cmd_stub_create_flow_group(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + u32 *in, + unsigned int *group_id) +{ + return 0; +} + +static int mlx5_cmd_stub_destroy_flow_group(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id) +{ + return 0; +} + +static int mlx5_cmd_stub_create_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *group, + struct fs_fte *fte) +{ + return 0; +} + +static int mlx5_cmd_stub_update_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id, + int modify_mask, + struct fs_fte *fte) +{ + return -EOPNOTSUPP; +} + +static int mlx5_cmd_stub_delete_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct fs_fte *fte) +{ + return 0; +} + +static int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, u32 underlay_qpn, + bool disconnect) +{ + u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)] = {0}; + + if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && + underlay_qpn == 0) + return 0; + + MLX5_SET(set_flow_table_root_in, in, opcode, + MLX5_CMD_OP_SET_FLOW_TABLE_ROOT); + MLX5_SET(set_flow_table_root_in, in, table_type, ft->type); + + if (disconnect) { + MLX5_SET(set_flow_table_root_in, in, op_mod, 1); + MLX5_SET(set_flow_table_root_in, in, table_id, 0); + } else { + MLX5_SET(set_flow_table_root_in, in, op_mod, 0); + MLX5_SET(set_flow_table_root_in, in, table_id, ft->id); + } + + MLX5_SET(set_flow_table_root_in, in, underlay_qpn, underlay_qpn); + if (ft->vport) { + MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport); + MLX5_SET(set_flow_table_root_in, in, other_vport, 1); + } + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, + u16 vport, + enum fs_flow_table_op_mod op_mod, + enum fs_flow_table_type type, + unsigned int level, + unsigned int log_size, + struct mlx5_flow_table *next_ft, + unsigned int *table_id, u32 flags) +{ + int en_encap_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN); + u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {0}; + int err; + + MLX5_SET(create_flow_table_in, in, opcode, + MLX5_CMD_OP_CREATE_FLOW_TABLE); + + MLX5_SET(create_flow_table_in, in, table_type, type); + MLX5_SET(create_flow_table_in, in, flow_table_context.level, level); + MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, log_size); + if (vport) { + MLX5_SET(create_flow_table_in, in, vport_number, vport); + MLX5_SET(create_flow_table_in, in, other_vport, 1); + } + + 
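+ /* MLX5_FLOW_TABLE_TUNNEL_EN enables both decap and encap on this flow table */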
MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en, + en_encap_decap); + MLX5_SET(create_flow_table_in, in, flow_table_context.encap_en, + en_encap_decap); + + switch (op_mod) { + case FS_FT_OP_MOD_NORMAL: + if (next_ft) { + MLX5_SET(create_flow_table_in, in, + flow_table_context.table_miss_action, 1); + MLX5_SET(create_flow_table_in, in, + flow_table_context.table_miss_id, next_ft->id); + } + break; + + case FS_FT_OP_MOD_LAG_DEMUX: + MLX5_SET(create_flow_table_in, in, op_mod, 0x1); + if (next_ft) + MLX5_SET(create_flow_table_in, in, + flow_table_context.lag_master_next_table_id, + next_ft->id); + break; + } + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *table_id = MLX5_GET(create_flow_table_out, out, + table_id); + return err; +} + +static int mlx5_cmd_destroy_flow_table(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft) +{ + u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_flow_table_out)] = {0}; + + MLX5_SET(destroy_flow_table_in, in, opcode, + MLX5_CMD_OP_DESTROY_FLOW_TABLE); + MLX5_SET(destroy_flow_table_in, in, table_type, ft->type); + MLX5_SET(destroy_flow_table_in, in, table_id, ft->id); + if (ft->vport) { + MLX5_SET(destroy_flow_table_in, in, vport_number, ft->vport); + MLX5_SET(destroy_flow_table_in, in, other_vport, 1); + } + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_cmd_modify_flow_table(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft) +{ + u32 in[MLX5_ST_SZ_DW(modify_flow_table_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_flow_table_out)] = {0}; + + MLX5_SET(modify_flow_table_in, in, opcode, + MLX5_CMD_OP_MODIFY_FLOW_TABLE); + MLX5_SET(modify_flow_table_in, in, table_type, ft->type); + MLX5_SET(modify_flow_table_in, in, table_id, ft->id); + + if (ft->op_mod == FS_FT_OP_MOD_LAG_DEMUX) { + MLX5_SET(modify_flow_table_in, in, modify_field_select, + MLX5_MODIFY_FLOW_TABLE_LAG_NEXT_TABLE_ID); + if (next_ft) { + MLX5_SET(modify_flow_table_in, in, + flow_table_context.lag_master_next_table_id, next_ft->id); + } else { + MLX5_SET(modify_flow_table_in, in, + flow_table_context.lag_master_next_table_id, 0); + } + } else { + if (ft->vport) { + MLX5_SET(modify_flow_table_in, in, vport_number, + ft->vport); + MLX5_SET(modify_flow_table_in, in, other_vport, 1); + } + MLX5_SET(modify_flow_table_in, in, modify_field_select, + MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID); + if (next_ft) { + MLX5_SET(modify_flow_table_in, in, + flow_table_context.table_miss_action, 1); + MLX5_SET(modify_flow_table_in, in, + flow_table_context.table_miss_id, + next_ft->id); + } else { + MLX5_SET(modify_flow_table_in, in, + flow_table_context.table_miss_action, 0); + } + } + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_cmd_create_flow_group(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + u32 *in, + unsigned int *group_id) +{ + u32 out[MLX5_ST_SZ_DW(create_flow_group_out)] = {0}; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int err; + + MLX5_SET(create_flow_group_in, in, opcode, + MLX5_CMD_OP_CREATE_FLOW_GROUP); + MLX5_SET(create_flow_group_in, in, table_type, ft->type); + MLX5_SET(create_flow_group_in, in, table_id, ft->id); + if (ft->vport) { + MLX5_SET(create_flow_group_in, in, vport_number, ft->vport); + MLX5_SET(create_flow_group_in, in, other_vport, 1); + } + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *group_id = MLX5_GET(create_flow_group_out, out, + group_id); 
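+ /* group_id is valid only when the firmware command succeeded */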
+ return err; +} + +static int mlx5_cmd_destroy_flow_group(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id) +{ + u32 out[MLX5_ST_SZ_DW(destroy_flow_group_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)] = {0}; + + MLX5_SET(destroy_flow_group_in, in, opcode, + MLX5_CMD_OP_DESTROY_FLOW_GROUP); + MLX5_SET(destroy_flow_group_in, in, table_type, ft->type); + MLX5_SET(destroy_flow_group_in, in, table_id, ft->id); + MLX5_SET(destroy_flow_group_in, in, group_id, group_id); + if (ft->vport) { + MLX5_SET(destroy_flow_group_in, in, vport_number, ft->vport); + MLX5_SET(destroy_flow_group_in, in, other_vport, 1); + } + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, + int opmod, int modify_mask, + struct mlx5_flow_table *ft, + unsigned group_id, + struct fs_fte *fte) +{ + unsigned int inlen = MLX5_ST_SZ_BYTES(set_fte_in) + + fte->dests_size * MLX5_ST_SZ_BYTES(dest_format_struct); + u32 out[MLX5_ST_SZ_DW(set_fte_out)] = {0}; + struct mlx5_flow_rule *dst; + void *in_flow_context, *vlan; + void *in_match_value; + void *in_dests; + u32 *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(set_fte_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY); + MLX5_SET(set_fte_in, in, op_mod, opmod); + MLX5_SET(set_fte_in, in, modify_enable_mask, modify_mask); + MLX5_SET(set_fte_in, in, table_type, ft->type); + MLX5_SET(set_fte_in, in, table_id, ft->id); + MLX5_SET(set_fte_in, in, flow_index, fte->index); + if (ft->vport) { + MLX5_SET(set_fte_in, in, vport_number, ft->vport); + MLX5_SET(set_fte_in, in, other_vport, 1); + } + + in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context); + MLX5_SET(flow_context, in_flow_context, group_id, group_id); + + MLX5_SET(flow_context, in_flow_context, flow_tag, fte->action.flow_tag); + MLX5_SET(flow_context, in_flow_context, action, fte->action.action); + MLX5_SET(flow_context, in_flow_context, encap_id, fte->action.encap_id); + MLX5_SET(flow_context, in_flow_context, modify_header_id, + fte->action.modify_id); + + vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan); + + MLX5_SET(vlan, vlan, ethtype, fte->action.vlan.ethtype); + MLX5_SET(vlan, vlan, vid, fte->action.vlan.vid); + MLX5_SET(vlan, vlan, prio, fte->action.vlan.prio); + + in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context, + match_value); + memcpy(in_match_value, &fte->val, sizeof(fte->val)); + + in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination); + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + int list_size = 0; + + list_for_each_entry(dst, &fte->node.children, node.list) { + unsigned int id; + + if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + MLX5_SET(dest_format_struct, in_dests, destination_type, + dst->dest_attr.type); + if (dst->dest_attr.type == + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) { + id = dst->dest_attr.ft->id; + } else { + id = dst->dest_attr.tir_num; + } + MLX5_SET(dest_format_struct, in_dests, destination_id, id); + in_dests += MLX5_ST_SZ_BYTES(dest_format_struct); + list_size++; + } + + MLX5_SET(flow_context, in_flow_context, destination_list_size, + list_size); + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + int max_list_size = BIT(MLX5_CAP_FLOWTABLE_TYPE(dev, + log_max_flow_counter, + ft->type)); + int list_size = 0; + + list_for_each_entry(dst, &fte->node.children, node.list) { + if (dst->dest_attr.type != + 
MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + MLX5_SET(flow_counter_list, in_dests, flow_counter_id, + dst->dest_attr.counter->id); + in_dests += MLX5_ST_SZ_BYTES(dest_format_struct); + list_size++; + } + if (list_size > max_list_size) { + err = -EINVAL; + goto err_out; + } + + MLX5_SET(flow_context, in_flow_context, flow_counter_list_size, + list_size); + } + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +err_out: + kvfree(in); + return err; +} + +static int mlx5_cmd_create_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *group, + struct fs_fte *fte) +{ + unsigned int group_id = group->id; + + return mlx5_cmd_set_fte(dev, 0, 0, ft, group_id, fte); +} + +static int mlx5_cmd_update_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id, + int modify_mask, + struct fs_fte *fte) +{ + int opmod; + int atomic_mod_cap = MLX5_CAP_FLOWTABLE(dev, + flow_table_properties_nic_receive. + flow_modify_en); + if (!atomic_mod_cap) + return -EOPNOTSUPP; + opmod = 1; + + return mlx5_cmd_set_fte(dev, opmod, modify_mask, ft, group_id, fte); +} + +static int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct fs_fte *fte) +{ + u32 out[MLX5_ST_SZ_DW(delete_fte_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(delete_fte_in)] = {0}; + + MLX5_SET(delete_fte_in, in, opcode, MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); + MLX5_SET(delete_fte_in, in, table_type, ft->type); + MLX5_SET(delete_fte_in, in, table_id, ft->id); + MLX5_SET(delete_fte_in, in, flow_index, fte->index); + if (ft->vport) { + MLX5_SET(delete_fte_in, in, vport_number, ft->vport); + MLX5_SET(delete_fte_in, in, other_vport, 1); + } + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id) +{ + u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)] = {0}; + int err; + + MLX5_SET(alloc_flow_counter_in, in, opcode, + MLX5_CMD_OP_ALLOC_FLOW_COUNTER); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id); + return err; +} + +int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(dealloc_flow_counter_out)] = {0}; + + MLX5_SET(dealloc_flow_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_FLOW_COUNTER); + MLX5_SET(dealloc_flow_counter_in, in, flow_counter_id, id); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id, + u64 *packets, u64 *bytes) +{ + u32 out[MLX5_ST_SZ_BYTES(query_flow_counter_out) + + MLX5_ST_SZ_BYTES(traffic_counter)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {0}; + void *stats; + int err = 0; + + MLX5_SET(query_flow_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_FLOW_COUNTER); + MLX5_SET(query_flow_counter_in, in, op_mod, 0); + MLX5_SET(query_flow_counter_in, in, flow_counter_id, id); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + stats = MLX5_ADDR_OF(query_flow_counter_out, out, flow_statistics); + *packets = MLX5_GET64(traffic_counter, stats, packets); + *bytes = MLX5_GET64(traffic_counter, stats, octets); + return 0; +} + +struct mlx5_cmd_fc_bulk { + u32 id; + int num; + int outlen; + u32 out[0]; +}; + +struct mlx5_cmd_fc_bulk * +mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, u32 id, int num) +{ + struct mlx5_cmd_fc_bulk *b; + int 
outlen = + MLX5_ST_SZ_BYTES(query_flow_counter_out) + + MLX5_ST_SZ_BYTES(traffic_counter) * num; + + b = kzalloc(sizeof(*b) + outlen, GFP_KERNEL); + if (!b) + return NULL; + + b->id = id; + b->num = num; + b->outlen = outlen; + + return b; +} + +void mlx5_cmd_fc_bulk_free(struct mlx5_cmd_fc_bulk *b) +{ + kfree(b); +} + +int +mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, struct mlx5_cmd_fc_bulk *b) +{ + u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {0}; + + MLX5_SET(query_flow_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_FLOW_COUNTER); + MLX5_SET(query_flow_counter_in, in, op_mod, 0); + MLX5_SET(query_flow_counter_in, in, flow_counter_id, b->id); + MLX5_SET(query_flow_counter_in, in, num_of_counters, b->num); + return mlx5_cmd_exec(dev, in, sizeof(in), b->out, b->outlen); +} + +void mlx5_cmd_fc_bulk_get(struct mlx5_core_dev *dev, + struct mlx5_cmd_fc_bulk *b, u32 id, + u64 *packets, u64 *bytes) +{ + int index = id - b->id; + void *stats; + + if (index < 0 || index >= b->num) { + mlx5_core_warn(dev, "Flow counter id (0x%x) out of range (0x%x..0x%x). Counter ignored.\n", + id, b->id, b->id + b->num - 1); + return; + } + + stats = MLX5_ADDR_OF(query_flow_counter_out, b->out, + flow_statistics[index]); + *packets = MLX5_GET64(traffic_counter, stats, packets); + *bytes = MLX5_GET64(traffic_counter, stats, octets); +} + +int mlx5_encap_alloc(struct mlx5_core_dev *dev, + int header_type, + size_t size, + void *encap_header, + u32 *encap_id) +{ + int max_encap_size = MLX5_CAP_ESW(dev, max_encap_header_size); + u32 out[MLX5_ST_SZ_DW(alloc_encap_header_out)]; + void *encap_header_in; + void *header; + int inlen; + int err; + u32 *in; + + if (size > max_encap_size) { + mlx5_core_warn(dev, "encap size %zd too big, max supported is %d\n", + size, max_encap_size); + return -EINVAL; + } + + in = kzalloc(MLX5_ST_SZ_BYTES(alloc_encap_header_in) + size, + GFP_KERNEL); + if (!in) + return -ENOMEM; + + encap_header_in = MLX5_ADDR_OF(alloc_encap_header_in, in, encap_header); + header = MLX5_ADDR_OF(encap_header_in, encap_header_in, encap_header); + inlen = header - (void *)in + size; + + memset(in, 0, inlen); + MLX5_SET(alloc_encap_header_in, in, opcode, + MLX5_CMD_OP_ALLOC_ENCAP_HEADER); + MLX5_SET(encap_header_in, encap_header_in, encap_header_size, size); + MLX5_SET(encap_header_in, encap_header_in, header_type, header_type); + memcpy(header, encap_header, size); + + memset(out, 0, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + + *encap_id = MLX5_GET(alloc_encap_header_out, out, encap_id); + kfree(in); + return err; +} + +void mlx5_encap_dealloc(struct mlx5_core_dev *dev, u32 encap_id) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_encap_header_in)]; + u32 out[MLX5_ST_SZ_DW(dealloc_encap_header_out)]; + + memset(in, 0, sizeof(in)); + MLX5_SET(dealloc_encap_header_in, in, opcode, + MLX5_CMD_OP_DEALLOC_ENCAP_HEADER); + MLX5_SET(dealloc_encap_header_in, in, encap_id, encap_id); + + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_modify_header_alloc(struct mlx5_core_dev *dev, + u8 namespace, u8 num_actions, + void *modify_actions, u32 *modify_header_id) +{ + u32 out[MLX5_ST_SZ_DW(alloc_modify_header_context_out)]; + int max_actions, actions_size, inlen, err; + void *actions_in; + u8 table_type; + u32 *in; + + switch (namespace) { + case MLX5_FLOW_NAMESPACE_FDB: + max_actions = MLX5_CAP_ESW_FLOWTABLE_FDB(dev, max_modify_header_actions); + table_type = FS_FT_FDB; + break; + case MLX5_FLOW_NAMESPACE_KERNEL: + max_actions = MLX5_CAP_FLOWTABLE_NIC_RX(dev, 
max_modify_header_actions); + table_type = FS_FT_NIC_RX; + break; + default: + return -EOPNOTSUPP; + } + + if (num_actions > max_actions) { + mlx5_core_warn(dev, "too many modify header actions %d, max supported %d\n", + num_actions, max_actions); + return -EOPNOTSUPP; + } + + actions_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto) * num_actions; + inlen = MLX5_ST_SZ_BYTES(alloc_modify_header_context_in) + actions_size; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(alloc_modify_header_context_in, in, opcode, + MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT); + MLX5_SET(alloc_modify_header_context_in, in, table_type, table_type); + MLX5_SET(alloc_modify_header_context_in, in, num_of_actions, num_actions); + + actions_in = MLX5_ADDR_OF(alloc_modify_header_context_in, in, actions); + memcpy(actions_in, modify_actions, actions_size); + + memset(out, 0, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + + *modify_header_id = MLX5_GET(alloc_modify_header_context_out, out, modify_header_id); + kfree(in); + return err; +} + +void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev, u32 modify_header_id) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_modify_header_context_in)]; + u32 out[MLX5_ST_SZ_DW(dealloc_modify_header_context_out)]; + + memset(in, 0, sizeof(in)); + MLX5_SET(dealloc_modify_header_context_in, in, opcode, + MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT); + MLX5_SET(dealloc_modify_header_context_in, in, modify_header_id, + modify_header_id); + + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static const struct mlx5_flow_cmds mlx5_flow_cmds = { + .create_flow_table = mlx5_cmd_create_flow_table, + .destroy_flow_table = mlx5_cmd_destroy_flow_table, + .modify_flow_table = mlx5_cmd_modify_flow_table, + .create_flow_group = mlx5_cmd_create_flow_group, + .destroy_flow_group = mlx5_cmd_destroy_flow_group, + .create_fte = mlx5_cmd_create_fte, + .update_fte = mlx5_cmd_update_fte, + .delete_fte = mlx5_cmd_delete_fte, + .update_root_ft = mlx5_cmd_update_root_ft, +}; + +static const struct mlx5_flow_cmds mlx5_flow_cmd_stubs = { + .create_flow_table = mlx5_cmd_stub_create_flow_table, + .destroy_flow_table = mlx5_cmd_stub_destroy_flow_table, + .modify_flow_table = mlx5_cmd_stub_modify_flow_table, + .create_flow_group = mlx5_cmd_stub_create_flow_group, + .destroy_flow_group = mlx5_cmd_stub_destroy_flow_group, + .create_fte = mlx5_cmd_stub_create_fte, + .update_fte = mlx5_cmd_stub_update_fte, + .delete_fte = mlx5_cmd_stub_delete_fte, + .update_root_ft = mlx5_cmd_stub_update_root_ft, +}; + +static const struct mlx5_flow_cmds *mlx5_fs_cmd_get_fw_cmds(void) +{ + return &mlx5_flow_cmds; +} + +static const struct mlx5_flow_cmds *mlx5_fs_cmd_get_stub_cmds(void) +{ + return &mlx5_flow_cmd_stubs; +} + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default(enum fs_flow_table_type type) +{ + switch (type) { + case FS_FT_NIC_RX: + case FS_FT_ESW_EGRESS_ACL: + case FS_FT_ESW_INGRESS_ACL: + case FS_FT_FDB: + case FS_FT_SNIFFER_RX: + case FS_FT_SNIFFER_TX: + return mlx5_fs_cmd_get_fw_cmds(); + case FS_FT_NIC_TX: + default: + return mlx5_fs_cmd_get_stub_cmds(); + } +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h new file mode 100644 index 0000000..6228ba7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_FS_CMD_ +#define _MLX5_FS_CMD_ + +#include "fs_core.h" + +struct mlx5_flow_cmds { + int (*create_flow_table)(struct mlx5_core_dev *dev, + u16 vport, + enum fs_flow_table_op_mod op_mod, + enum fs_flow_table_type type, + unsigned int level, unsigned int log_size, + struct mlx5_flow_table *next_ft, + unsigned int *table_id, u32 flags); + int (*destroy_flow_table)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft); + + int (*modify_flow_table)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft); + + int (*create_flow_group)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + u32 *in, + unsigned int *group_id); + + int (*destroy_flow_group)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id); + + int (*create_fte)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + struct fs_fte *fte); + + int (*update_fte)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id, + int modify_mask, + struct fs_fte *fte); + + int (*delete_fte)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct fs_fte *fte); + + int (*update_root_ft)(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + u32 underlay_qpn, + bool disconnect); +}; + +int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id); +int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id); +int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id, + u64 *packets, u64 *bytes); + +struct mlx5_cmd_fc_bulk; + +struct mlx5_cmd_fc_bulk * +mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, u32 id, int num); +void mlx5_cmd_fc_bulk_free(struct mlx5_cmd_fc_bulk *b); +int +mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, struct mlx5_cmd_fc_bulk *b); +void mlx5_cmd_fc_bulk_get(struct mlx5_core_dev *dev, + struct mlx5_cmd_fc_bulk *b, u32 id, + u64 *packets, u64 *bytes); + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default(enum fs_flow_table_type type); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c new file mode 100644 index 0000000..c39c169 --- /dev/null +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -0,0 +1,2753 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/mutex.h> +#include <linux/mlx5/driver.h> + +#include "mlx5_core.h" +#include "fs_core.h" +#include "fs_cmd.h" +#include "diag/fs_tracepoint.h" +#include "accel/ipsec.h" +#include "fpga/ipsec.h" + +#define INIT_TREE_NODE_ARRAY_SIZE(...) (sizeof((struct init_tree_node[]){__VA_ARGS__}) /\ + sizeof(struct init_tree_node)) + +#define ADD_PRIO(num_prios_val, min_level_val, num_levels_val, caps_val,\ + ...) {.type = FS_TYPE_PRIO,\ + .min_ft_level = min_level_val,\ + .num_levels = num_levels_val,\ + .num_leaf_prios = num_prios_val,\ + .caps = caps_val,\ + .children = (struct init_tree_node[]) {__VA_ARGS__},\ + .ar_size = INIT_TREE_NODE_ARRAY_SIZE(__VA_ARGS__) \ +} + +#define ADD_MULTIPLE_PRIO(num_prios_val, num_levels_val, ...)\ + ADD_PRIO(num_prios_val, 0, num_levels_val, {},\ + __VA_ARGS__)\ + +#define ADD_NS(...) {.type = FS_TYPE_NAMESPACE,\ + .children = (struct init_tree_node[]) {__VA_ARGS__},\ + .ar_size = INIT_TREE_NODE_ARRAY_SIZE(__VA_ARGS__) \ +} + +#define INIT_CAPS_ARRAY_SIZE(...) (sizeof((long[]){__VA_ARGS__}) /\ + sizeof(long)) + +#define FS_CAP(cap) (__mlx5_bit_off(flow_table_nic_cap, cap)) + +#define FS_REQUIRED_CAPS(...)
{.arr_sz = INIT_CAPS_ARRAY_SIZE(__VA_ARGS__), \ + .caps = (long[]) {__VA_ARGS__} } + +#define FS_CHAINING_CAPS FS_REQUIRED_CAPS(FS_CAP(flow_table_properties_nic_receive.flow_modify_en), \ + FS_CAP(flow_table_properties_nic_receive.modify_root), \ + FS_CAP(flow_table_properties_nic_receive.identified_miss_table_mode), \ + FS_CAP(flow_table_properties_nic_receive.flow_table_modify)) + +#define LEFTOVERS_NUM_LEVELS 1 +#define LEFTOVERS_NUM_PRIOS 1 + +#define BY_PASS_PRIO_NUM_LEVELS 1 +#define BY_PASS_MIN_LEVEL (ETHTOOL_MIN_LEVEL + MLX5_BY_PASS_NUM_PRIOS +\ + LEFTOVERS_NUM_PRIOS) + +#define ETHTOOL_PRIO_NUM_LEVELS 1 +#define ETHTOOL_NUM_PRIOS 11 +#define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS) +/* Vlan, mac, ttc, inner ttc, aRFS */ +#define KERNEL_NIC_PRIO_NUM_LEVELS 5 +#define KERNEL_NIC_NUM_PRIOS 1 +/* One more level for tc */ +#define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 1) + +#define KERNEL_NIC_TC_NUM_PRIOS 1 +#define KERNEL_NIC_TC_NUM_LEVELS 2 + +#define ANCHOR_NUM_LEVELS 1 +#define ANCHOR_NUM_PRIOS 1 +#define ANCHOR_MIN_LEVEL (BY_PASS_MIN_LEVEL + 1) + +#define OFFLOADS_MAX_FT 1 +#define OFFLOADS_NUM_PRIOS 1 +#define OFFLOADS_MIN_LEVEL (ANCHOR_MIN_LEVEL + 1) + +#define LAG_PRIO_NUM_LEVELS 1 +#define LAG_NUM_PRIOS 1 +#define LAG_MIN_LEVEL (OFFLOADS_MIN_LEVEL + 1) + +struct node_caps { + size_t arr_sz; + long *caps; +}; + +static struct init_tree_node { + enum fs_node_type type; + struct init_tree_node *children; + int ar_size; + struct node_caps caps; + int min_ft_level; + int num_leaf_prios; + int prio; + int num_levels; +} root_fs = { + .type = FS_TYPE_NAMESPACE, + .ar_size = 7, + .children = (struct init_tree_node[]) { + ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_PRIOS, + BY_PASS_PRIO_NUM_LEVELS))), + ADD_PRIO(0, LAG_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(ADD_MULTIPLE_PRIO(LAG_NUM_PRIOS, + LAG_PRIO_NUM_LEVELS))), + ADD_PRIO(0, OFFLOADS_MIN_LEVEL, 0, {}, + ADD_NS(ADD_MULTIPLE_PRIO(OFFLOADS_NUM_PRIOS, OFFLOADS_MAX_FT))), + ADD_PRIO(0, ETHTOOL_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(ADD_MULTIPLE_PRIO(ETHTOOL_NUM_PRIOS, + ETHTOOL_PRIO_NUM_LEVELS))), + ADD_PRIO(0, KERNEL_MIN_LEVEL, 0, {}, + ADD_NS(ADD_MULTIPLE_PRIO(KERNEL_NIC_TC_NUM_PRIOS, KERNEL_NIC_TC_NUM_LEVELS), + ADD_MULTIPLE_PRIO(KERNEL_NIC_NUM_PRIOS, + KERNEL_NIC_PRIO_NUM_LEVELS))), + ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(ADD_MULTIPLE_PRIO(LEFTOVERS_NUM_PRIOS, LEFTOVERS_NUM_LEVELS))), + ADD_PRIO(0, ANCHOR_MIN_LEVEL, 0, {}, + ADD_NS(ADD_MULTIPLE_PRIO(ANCHOR_NUM_PRIOS, ANCHOR_NUM_LEVELS))), + } +}; + +enum fs_i_lock_class { + FS_LOCK_GRANDPARENT, + FS_LOCK_PARENT, + FS_LOCK_CHILD +}; + +static const struct rhashtable_params rhash_fte = { + .key_len = FIELD_SIZEOF(struct fs_fte, val), + .key_offset = offsetof(struct fs_fte, val), + .head_offset = offsetof(struct fs_fte, hash), + .automatic_shrinking = true, + .min_size = 1, +}; + +static const struct rhashtable_params rhash_fg = { + .key_len = FIELD_SIZEOF(struct mlx5_flow_group, mask), + .key_offset = offsetof(struct mlx5_flow_group, mask), + .head_offset = offsetof(struct mlx5_flow_group, hash), + .automatic_shrinking = true, + .min_size = 1, + +}; + +static void del_hw_flow_table(struct fs_node *node); +static void del_hw_flow_group(struct fs_node *node); +static void del_hw_fte(struct fs_node *node); +static void del_sw_flow_table(struct fs_node *node); +static void del_sw_flow_group(struct fs_node *node); +static void del_sw_fte(struct fs_node *node); +static void 
del_sw_prio(struct fs_node *node); +static void del_sw_ns(struct fs_node *node); +/* Delete rule (destination) is special case that + * requires to lock the FTE for all the deletion process. + */ +static void del_sw_hw_rule(struct fs_node *node); +static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, + struct mlx5_flow_destination *d2); +static void cleanup_root_ns(struct mlx5_flow_root_namespace *root_ns); +static struct mlx5_flow_rule * +find_flow_rule(struct fs_fte *fte, + struct mlx5_flow_destination *dest); + +static void tree_init_node(struct fs_node *node, + void (*del_hw_func)(struct fs_node *), + void (*del_sw_func)(struct fs_node *)) +{ + refcount_set(&node->refcount, 1); + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->children); + init_rwsem(&node->lock); + node->del_hw_func = del_hw_func; + node->del_sw_func = del_sw_func; + node->active = false; +} + +static void tree_add_node(struct fs_node *node, struct fs_node *parent) +{ + if (parent) + refcount_inc(&parent->refcount); + node->parent = parent; + + /* Parent is the root */ + if (!parent) + node->root = node; + else + node->root = parent->root; +} + +static int tree_get_node(struct fs_node *node) +{ + return refcount_inc_not_zero(&node->refcount); +} + +static void nested_down_read_ref_node(struct fs_node *node, + enum fs_i_lock_class class) +{ + if (node) { + down_read_nested(&node->lock, class); + refcount_inc(&node->refcount); + } +} + +static void nested_down_write_ref_node(struct fs_node *node, + enum fs_i_lock_class class) +{ + if (node) { + down_write_nested(&node->lock, class); + refcount_inc(&node->refcount); + } +} + +static void down_write_ref_node(struct fs_node *node) +{ + if (node) { + down_write(&node->lock); + refcount_inc(&node->refcount); + } +} + +static void up_read_ref_node(struct fs_node *node) +{ + refcount_dec(&node->refcount); + up_read(&node->lock); +} + +static void up_write_ref_node(struct fs_node *node) +{ + refcount_dec(&node->refcount); + up_write(&node->lock); +} + +static void tree_put_node(struct fs_node *node) +{ + struct fs_node *parent_node = node->parent; + + if (refcount_dec_and_test(&node->refcount)) { + if (node->del_hw_func) + node->del_hw_func(node); + if (parent_node) { + /* Only root namespace doesn't have parent and we just + * need to free its node. 
+ */ + down_write_ref_node(parent_node); + list_del_init(&node->list); + if (node->del_sw_func) + node->del_sw_func(node); + up_write_ref_node(parent_node); + } else { + kfree(node); + } + node = NULL; + } + if (!node && parent_node) + tree_put_node(parent_node); +} + +static int tree_remove_node(struct fs_node *node) +{ + if (refcount_read(&node->refcount) > 1) { + refcount_dec(&node->refcount); + return -EEXIST; + } + tree_put_node(node); + return 0; +} + +static struct fs_prio *find_prio(struct mlx5_flow_namespace *ns, + unsigned int prio) +{ + struct fs_prio *iter_prio; + + fs_for_each_prio(iter_prio, ns) { + if (iter_prio->prio == prio) + return iter_prio; + } + + return NULL; +} + +static bool check_last_reserved(const u32 *match_criteria) +{ + char *match_criteria_reserved = + MLX5_ADDR_OF(fte_match_param, match_criteria, MLX5_FTE_MATCH_PARAM_RESERVED); + + return !match_criteria_reserved[0] && + !memcmp(match_criteria_reserved, match_criteria_reserved + 1, + MLX5_FLD_SZ_BYTES(fte_match_param, + MLX5_FTE_MATCH_PARAM_RESERVED) - 1); +} + +static bool check_valid_mask(u8 match_criteria_enable, const u32 *match_criteria) +{ + if (match_criteria_enable & ~( + (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS) | + (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS) | + (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS))) + return false; + + if (!(match_criteria_enable & + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS)) { + char *fg_type_mask = MLX5_ADDR_OF(fte_match_param, + match_criteria, outer_headers); + + if (fg_type_mask[0] || + memcmp(fg_type_mask, fg_type_mask + 1, + MLX5_ST_SZ_BYTES(fte_match_set_lyr_2_4) - 1)) + return false; + } + + if (!(match_criteria_enable & + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS)) { + char *fg_type_mask = MLX5_ADDR_OF(fte_match_param, + match_criteria, misc_parameters); + + if (fg_type_mask[0] || + memcmp(fg_type_mask, fg_type_mask + 1, + MLX5_ST_SZ_BYTES(fte_match_set_misc) - 1)) + return false; + } + + if (!(match_criteria_enable & + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS)) { + char *fg_type_mask = MLX5_ADDR_OF(fte_match_param, + match_criteria, inner_headers); + + if (fg_type_mask[0] || + memcmp(fg_type_mask, fg_type_mask + 1, + MLX5_ST_SZ_BYTES(fte_match_set_lyr_2_4) - 1)) + return false; + } + + return check_last_reserved(match_criteria); +} + +static bool check_valid_spec(const struct mlx5_flow_spec *spec) +{ + int i; + + if (!check_valid_mask(spec->match_criteria_enable, spec->match_criteria)) { + pr_warn("mlx5_core: Match criteria given mismatches match_criteria_enable\n"); + return false; + } + + for (i = 0; i < MLX5_ST_SZ_DW_MATCH_PARAM; i++) + if (spec->match_value[i] & ~spec->match_criteria[i]) { + pr_warn("mlx5_core: match_value differs from match_criteria\n"); + return false; + } + + return check_last_reserved(spec->match_value); +} + +static struct mlx5_flow_root_namespace *find_root(struct fs_node *node) +{ + struct fs_node *root; + struct mlx5_flow_namespace *ns; + + root = node->root; + + if (WARN_ON(root->type != FS_TYPE_NAMESPACE)) { + pr_warn("mlx5: flow steering node is not in tree or garbaged\n"); + return NULL; + } + + ns = container_of(root, struct mlx5_flow_namespace, node); + return container_of(ns, struct mlx5_flow_root_namespace, ns); +} + +static inline struct mlx5_flow_steering *get_steering(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root = find_root(node); + + if (root) + 
return root->dev->priv.steering; + return NULL; +} + +static inline struct mlx5_core_dev *get_dev(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root = find_root(node); + + if (root) + return root->dev; + return NULL; +} + +static void del_sw_ns(struct fs_node *node) +{ + kfree(node); +} + +static void del_sw_prio(struct fs_node *node) +{ + kfree(node); +} + +static void del_hw_flow_table(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_table *ft; + struct mlx5_core_dev *dev; + int err; + + fs_get_obj(ft, node); + dev = get_dev(&ft->node); + root = find_root(&ft->node); + + if (node->active) { + err = root->cmds->destroy_flow_table(dev, ft); + if (err) + mlx5_core_warn(dev, "flow steering can't destroy ft\n"); + } +} + +static void del_sw_flow_table(struct fs_node *node) +{ + struct mlx5_flow_table *ft; + struct fs_prio *prio; + + fs_get_obj(ft, node); + + rhltable_destroy(&ft->fgs_hash); + fs_get_obj(prio, ft->node.parent); + prio->num_ft--; + kfree(ft); +} + +static void del_sw_hw_rule(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_rule *rule; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct fs_fte *fte; + int modify_mask; + struct mlx5_core_dev *dev = get_dev(node); + int err; + bool update_fte = false; + + fs_get_obj(rule, node); + fs_get_obj(fte, rule->node.parent); + fs_get_obj(fg, fte->node.parent); + fs_get_obj(ft, fg->node.parent); + trace_mlx5_fs_del_rule(rule); + if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + mutex_lock(&rule->dest_attr.ft->lock); + list_del(&rule->next_ft); + mutex_unlock(&rule->dest_attr.ft->lock); + } + + if (rule->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER && + --fte->dests_size) { + modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) | + BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS); + fte->action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT; + update_fte = true; + goto out; + } + + if ((fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) && + --fte->dests_size) { + modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST), + update_fte = true; + } +out: + root = find_root(&ft->node); + if (update_fte && fte->dests_size) { + err = root->cmds->update_fte(dev, ft, fg->id, modify_mask, fte); + if (err) + mlx5_core_warn(dev, + "%s can't del rule fg id=%d fte_index=%d\n", + __func__, fg->id, fte->index); + } + kfree(rule); +} + +static void del_hw_fte(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct mlx5_core_dev *dev; + struct fs_fte *fte; + int err; + + fs_get_obj(fte, node); + fs_get_obj(fg, fte->node.parent); + fs_get_obj(ft, fg->node.parent); + + trace_mlx5_fs_del_fte(fte); + dev = get_dev(&ft->node); + root = find_root(&ft->node); + if (node->active) { + err = root->cmds->delete_fte(dev, ft, fte); + if (err) + mlx5_core_warn(dev, + "flow steering can't delete fte in index %d of flow group id %d\n", + fte->index, fg->id); + } +} + +static void del_sw_fte(struct fs_node *node) +{ + struct mlx5_flow_steering *steering = get_steering(node); + struct mlx5_flow_group *fg; + struct fs_fte *fte; + int err; + + fs_get_obj(fte, node); + fs_get_obj(fg, fte->node.parent); + + err = rhashtable_remove_fast(&fg->ftes_hash, + &fte->hash, + rhash_fte); + WARN_ON(err); + ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index); + kmem_cache_free(steering->ftes_cache, fte); +} + +static void del_hw_flow_group(struct fs_node 
*node) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_group *fg; + struct mlx5_flow_table *ft; + struct mlx5_core_dev *dev; + + fs_get_obj(fg, node); + fs_get_obj(ft, fg->node.parent); + dev = get_dev(&ft->node); + trace_mlx5_fs_del_fg(fg); + + root = find_root(&ft->node); + if (fg->node.active && root->cmds->destroy_flow_group(dev, ft, fg->id)) + mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n", + fg->id, ft->id); +} + +static void del_sw_flow_group(struct fs_node *node) +{ + struct mlx5_flow_steering *steering = get_steering(node); + struct mlx5_flow_group *fg; + struct mlx5_flow_table *ft; + int err; + + fs_get_obj(fg, node); + fs_get_obj(ft, fg->node.parent); + + rhashtable_destroy(&fg->ftes_hash); + ida_destroy(&fg->fte_allocator); + if (ft->autogroup.active) + ft->autogroup.num_groups--; + err = rhltable_remove(&ft->fgs_hash, + &fg->hash, + rhash_fg); + WARN_ON(err); + kmem_cache_free(steering->fgs_cache, fg); +} + +static int insert_fte(struct mlx5_flow_group *fg, struct fs_fte *fte) +{ + int index; + int ret; + + index = ida_simple_get(&fg->fte_allocator, 0, fg->max_ftes, GFP_KERNEL); + if (index < 0) + return index; + + fte->index = index + fg->start_index; + ret = rhashtable_insert_fast(&fg->ftes_hash, + &fte->hash, + rhash_fte); + if (ret) + goto err_ida_remove; + + tree_add_node(&fte->node, &fg->node); + list_add_tail(&fte->node.list, &fg->node.children); + return 0; + +err_ida_remove: + ida_simple_remove(&fg->fte_allocator, index); + return ret; +} + +static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft, + u32 *match_value, + struct mlx5_flow_act *flow_act) +{ + struct mlx5_flow_steering *steering = get_steering(&ft->node); + struct fs_fte *fte; + + fte = kmem_cache_zalloc(steering->ftes_cache, GFP_KERNEL); + if (!fte) + return ERR_PTR(-ENOMEM); + + memcpy(fte->val, match_value, sizeof(fte->val)); + fte->node.type = FS_TYPE_FLOW_ENTRY; + fte->action = *flow_act; + + tree_init_node(&fte->node, del_hw_fte, del_sw_fte); + + return fte; +} + +static void dealloc_flow_group(struct mlx5_flow_steering *steering, + struct mlx5_flow_group *fg) +{ + rhashtable_destroy(&fg->ftes_hash); + kmem_cache_free(steering->fgs_cache, fg); +} + +static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steering, + u8 match_criteria_enable, + void *match_criteria, + int start_index, + int end_index) +{ + struct mlx5_flow_group *fg; + int ret; + + fg = kmem_cache_zalloc(steering->fgs_cache, GFP_KERNEL); + if (!fg) + return ERR_PTR(-ENOMEM); + + ret = rhashtable_init(&fg->ftes_hash, &rhash_fte); + if (ret) { + kmem_cache_free(steering->fgs_cache, fg); + return ERR_PTR(ret); +} + ida_init(&fg->fte_allocator); + fg->mask.match_criteria_enable = match_criteria_enable; + memcpy(&fg->mask.match_criteria, match_criteria, + sizeof(fg->mask.match_criteria)); + fg->node.type = FS_TYPE_FLOW_GROUP; + fg->start_index = start_index; + fg->max_ftes = end_index - start_index + 1; + + return fg; +} + +static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + void *match_criteria, + int start_index, + int end_index, + struct list_head *prev) +{ + struct mlx5_flow_steering *steering = get_steering(&ft->node); + struct mlx5_flow_group *fg; + int ret; + + fg = alloc_flow_group(steering, match_criteria_enable, match_criteria, + start_index, end_index); + if (IS_ERR(fg)) + return fg; + + /* initialize refcnt, add to parent list */ + ret = rhltable_insert(&ft->fgs_hash, + &fg->hash, + rhash_fg); + if (ret) { + 
dealloc_flow_group(steering, fg); + return ERR_PTR(ret); + } + + tree_init_node(&fg->node, del_hw_flow_group, del_sw_flow_group); + tree_add_node(&fg->node, &ft->node); + /* Add node to group list */ + list_add(&fg->node.list, prev); + atomic_inc(&ft->node.version); + + return fg; +} + +static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_fte, + enum fs_flow_table_type table_type, + enum fs_flow_table_op_mod op_mod, + u32 flags) +{ + struct mlx5_flow_table *ft; + int ret; + + ft = kzalloc(sizeof(*ft), GFP_KERNEL); + if (!ft) + return ERR_PTR(-ENOMEM); + + ret = rhltable_init(&ft->fgs_hash, &rhash_fg); + if (ret) { + kfree(ft); + return ERR_PTR(ret); + } + + ft->level = level; + ft->node.type = FS_TYPE_FLOW_TABLE; + ft->op_mod = op_mod; + ft->type = table_type; + ft->vport = vport; + ft->max_fte = max_fte; + ft->flags = flags; + INIT_LIST_HEAD(&ft->fwd_rules); + mutex_init(&ft->lock); + + return ft; +} + +/* If reverse is false, then we search for the first flow table in the + * root sub-tree from start(closest from right), else we search for the + * last flow table in the root sub-tree till start(closest from left). + */ +static struct mlx5_flow_table *find_closest_ft_recursive(struct fs_node *root, + struct list_head *start, + bool reverse) +{ +#define list_advance_entry(pos, reverse) \ + ((reverse) ? list_prev_entry(pos, list) : list_next_entry(pos, list)) + +#define list_for_each_advance_continue(pos, head, reverse) \ + for (pos = list_advance_entry(pos, reverse); \ + &pos->list != (head); \ + pos = list_advance_entry(pos, reverse)) + + struct fs_node *iter = list_entry(start, struct fs_node, list); + struct mlx5_flow_table *ft = NULL; + + if (!root) + return NULL; + + list_for_each_advance_continue(iter, &root->children, reverse) { + if (iter->type == FS_TYPE_FLOW_TABLE) { + fs_get_obj(ft, iter); + return ft; + } + ft = find_closest_ft_recursive(iter, &iter->children, reverse); + if (ft) + return ft; + } + + return ft; +} + +/* If reverse if false then return the first flow table in next priority of + * prio in the tree, else return the last flow table in the previous priority + * of prio in the tree. 
+ */ +static struct mlx5_flow_table *find_closest_ft(struct fs_prio *prio, bool reverse) +{ + struct mlx5_flow_table *ft = NULL; + struct fs_node *curr_node; + struct fs_node *parent; + + parent = prio->node.parent; + curr_node = &prio->node; + while (!ft && parent) { + ft = find_closest_ft_recursive(parent, &curr_node->list, reverse); + curr_node = parent; + parent = curr_node->parent; + } + return ft; +} + +/* Assuming all the tree is locked by mutex chain lock */ +static struct mlx5_flow_table *find_next_chained_ft(struct fs_prio *prio) +{ + return find_closest_ft(prio, false); +} + +/* Assuming all the tree is locked by mutex chain lock */ +static struct mlx5_flow_table *find_prev_chained_ft(struct fs_prio *prio) +{ + return find_closest_ft(prio, true); +} + +static int connect_fts_in_prio(struct mlx5_core_dev *dev, + struct fs_prio *prio, + struct mlx5_flow_table *ft) +{ + struct mlx5_flow_root_namespace *root = find_root(&prio->node); + struct mlx5_flow_table *iter; + int i = 0; + int err; + + fs_for_each_ft(iter, prio) { + i++; + err = root->cmds->modify_flow_table(dev, iter, ft); + if (err) { + mlx5_core_warn(dev, "Failed to modify flow table %d\n", + iter->id); + /* The driver is out of sync with the FW */ + if (i > 1) + WARN_ON(true); + return err; + } + } + return 0; +} + +/* Connect flow tables from previous priority of prio to ft */ +static int connect_prev_fts(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct fs_prio *prio) +{ + struct mlx5_flow_table *prev_ft; + + prev_ft = find_prev_chained_ft(prio); + if (prev_ft) { + struct fs_prio *prev_prio; + + fs_get_obj(prev_prio, prev_ft->node.parent); + return connect_fts_in_prio(dev, prev_prio, ft); + } + return 0; +} + +static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio + *prio) +{ + struct mlx5_flow_root_namespace *root = find_root(&prio->node); + struct mlx5_ft_underlay_qp *uqp; + int min_level = INT_MAX; + int err; + u32 qpn; + + if (root->root_ft) + min_level = root->root_ft->level; + + if (ft->level >= min_level) + return 0; + + if (list_empty(&root->underlay_qpns)) { + /* Don't set any QPN (zero) in case QPN list is empty */ + qpn = 0; + err = root->cmds->update_root_ft(root->dev, ft, qpn, false); + } else { + list_for_each_entry(uqp, &root->underlay_qpns, list) { + qpn = uqp->qpn; + err = root->cmds->update_root_ft(root->dev, ft, + qpn, false); + if (err) + break; + } + } + + if (err) + mlx5_core_warn(root->dev, + "Update root flow table of id(%u) qpn(%d) failed\n", + ft->id, qpn); + else + root->root_ft = ft; + + return err; +} + +static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct fs_fte *fte; + int modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST); + int err = 0; + + fs_get_obj(fte, rule->node.parent); + if (!(fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) + return -EINVAL; + down_write_ref_node(&fte->node); + fs_get_obj(fg, fte->node.parent); + fs_get_obj(ft, fg->node.parent); + + memcpy(&rule->dest_attr, dest, sizeof(*dest)); + root = find_root(&ft->node); + err = root->cmds->update_fte(get_dev(&ft->node), ft, fg->id, + modify_mask, fte); + up_write_ref_node(&fte->node); + + return err; +} + +int mlx5_modify_rule_destination(struct mlx5_flow_handle *handle, + struct mlx5_flow_destination *new_dest, + struct mlx5_flow_destination *old_dest) +{ + int i; + + if (!old_dest) { + if 
(handle->num_rules != 1) + return -EINVAL; + return _mlx5_modify_rule_destination(handle->rule[0], + new_dest); + } + + for (i = 0; i < handle->num_rules; i++) { + if (mlx5_flow_dests_cmp(new_dest, &handle->rule[i]->dest_attr)) + return _mlx5_modify_rule_destination(handle->rule[i], + new_dest); + } + + return -EINVAL; +} + +/* Modify/set FWD rules that point on old_next_ft to point on new_next_ft */ +static int connect_fwd_rules(struct mlx5_core_dev *dev, + struct mlx5_flow_table *new_next_ft, + struct mlx5_flow_table *old_next_ft) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_rule *iter; + int err = 0; + + /* new_next_ft and old_next_ft could be NULL only + * when we create/destroy the anchor flow table. + */ + if (!new_next_ft || !old_next_ft) + return 0; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = new_next_ft; + + mutex_lock(&old_next_ft->lock); + list_splice_init(&old_next_ft->fwd_rules, &new_next_ft->fwd_rules); + mutex_unlock(&old_next_ft->lock); + list_for_each_entry(iter, &new_next_ft->fwd_rules, next_ft) { + err = _mlx5_modify_rule_destination(iter, &dest); + if (err) + pr_err("mlx5_core: failed to modify rule to point on flow table %d\n", + new_next_ft->id); + } + return 0; +} + +static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, + struct fs_prio *prio) +{ + struct mlx5_flow_table *next_ft; + int err = 0; + + /* Connect_prev_fts and update_root_ft_create are mutually exclusive */ + + if (list_empty(&prio->node.children)) { + err = connect_prev_fts(dev, ft, prio); + if (err) + return err; + + next_ft = find_next_chained_ft(prio); + err = connect_fwd_rules(dev, ft, next_ft); + if (err) + return err; + } + + if (MLX5_CAP_FLOWTABLE(dev, + flow_table_properties_nic_receive.modify_root)) + err = update_root_ft_create(ft, prio); + return err; +} + +static void list_add_flow_table(struct mlx5_flow_table *ft, + struct fs_prio *prio) +{ + struct list_head *prev = &prio->node.children; + struct mlx5_flow_table *iter; + + fs_for_each_ft(iter, prio) { + if (iter->level > ft->level) + break; + prev = &iter->node.list; + } + list_add(&ft->node.list, prev); +} + +static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr, + enum fs_flow_table_op_mod op_mod, + u16 vport) +{ + struct mlx5_flow_root_namespace *root = find_root(&ns->node); + struct mlx5_flow_table *next_ft = NULL; + struct fs_prio *fs_prio = NULL; + struct mlx5_flow_table *ft; + int log_table_sz; + int err; + + if (!root) { + pr_err("mlx5: flow steering failed to find root of namespace\n"); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&root->chain_lock); + fs_prio = find_prio(ns, ft_attr->prio); + if (!fs_prio) { + err = -EINVAL; + goto unlock_root; + } + if (ft_attr->level >= fs_prio->num_levels) { + err = -ENOSPC; + goto unlock_root; + } + /* The level is related to the + * priority level range. + */ + ft_attr->level += fs_prio->start_level; + ft = alloc_flow_table(ft_attr->level, + vport, + ft_attr->max_fte ? roundup_pow_of_two(ft_attr->max_fte) : 0, + root->table_type, + op_mod, ft_attr->flags); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto unlock_root; + } + + tree_init_node(&ft->node, del_hw_flow_table, del_sw_flow_table); + log_table_sz = ft->max_fte ? 
ilog2(ft->max_fte) : 0; + next_ft = find_next_chained_ft(fs_prio); + err = root->cmds->create_flow_table(root->dev, ft->vport, ft->op_mod, + ft->type, ft->level, log_table_sz, + next_ft, &ft->id, ft->flags); + if (err) + goto free_ft; + + err = connect_flow_table(root->dev, ft, fs_prio); + if (err) + goto destroy_ft; + ft->node.active = true; + down_write_ref_node(&fs_prio->node); + tree_add_node(&ft->node, &fs_prio->node); + list_add_flow_table(ft, fs_prio); + fs_prio->num_ft++; + up_write_ref_node(&fs_prio->node); + mutex_unlock(&root->chain_lock); + return ft; +destroy_ft: + root->cmds->destroy_flow_table(root->dev, ft); +free_ft: + kfree(ft); +unlock_root: + mutex_unlock(&root->chain_lock); + return ERR_PTR(err); +} + +struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr) +{ + return __mlx5_create_flow_table(ns, ft_attr, FS_FT_OP_MOD_NORMAL, 0); +} + +struct mlx5_flow_table *mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, + int prio, int max_fte, + u32 level, u16 vport) +{ + struct mlx5_flow_table_attr ft_attr = {}; + + ft_attr.max_fte = max_fte; + ft_attr.level = level; + ft_attr.prio = prio; + + return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_NORMAL, vport); +} + +struct mlx5_flow_table* +mlx5_create_lag_demux_flow_table(struct mlx5_flow_namespace *ns, + int prio, u32 level) +{ + struct mlx5_flow_table_attr ft_attr = {}; + + ft_attr.level = level; + ft_attr.prio = prio; + return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_LAG_DEMUX, 0); +} +EXPORT_SYMBOL(mlx5_create_lag_demux_flow_table); + +struct mlx5_flow_table* +mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int num_flow_table_entries, + int max_num_groups, + u32 level, + u32 flags) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_table *ft; + + if (max_num_groups > num_flow_table_entries) + return ERR_PTR(-EINVAL); + + ft_attr.max_fte = num_flow_table_entries; + ft_attr.prio = prio; + ft_attr.level = level; + ft_attr.flags = flags; + + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) + return ft; + + ft->autogroup.active = true; + ft->autogroup.required_groups = max_num_groups; + + return ft; +} +EXPORT_SYMBOL(mlx5_create_auto_grouped_flow_table); + +struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft, + u32 *fg_in) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + void *match_criteria = MLX5_ADDR_OF(create_flow_group_in, + fg_in, match_criteria); + u8 match_criteria_enable = MLX5_GET(create_flow_group_in, + fg_in, + match_criteria_enable); + int start_index = MLX5_GET(create_flow_group_in, fg_in, + start_flow_index); + int end_index = MLX5_GET(create_flow_group_in, fg_in, + end_flow_index); + struct mlx5_core_dev *dev = get_dev(&ft->node); + struct mlx5_flow_group *fg; + int err; + + if (!check_valid_mask(match_criteria_enable, match_criteria)) + return ERR_PTR(-EINVAL); + + if (ft->autogroup.active) + return ERR_PTR(-EPERM); + + down_write_ref_node(&ft->node); + fg = alloc_insert_flow_group(ft, match_criteria_enable, match_criteria, + start_index, end_index, + ft->node.children.prev); + up_write_ref_node(&ft->node); + if (IS_ERR(fg)) + return fg; + + err = root->cmds->create_flow_group(dev, ft, fg_in, &fg->id); + if (err) { + tree_put_node(&fg->node); + return ERR_PTR(err); + } + trace_mlx5_fs_add_fg(fg); + fg->node.active = true; + + return fg; +} + +static struct mlx5_flow_rule *alloc_rule(struct mlx5_flow_destination *dest) 
+{ + struct mlx5_flow_rule *rule; + + rule = kzalloc(sizeof(*rule), GFP_KERNEL); + if (!rule) + return NULL; + + INIT_LIST_HEAD(&rule->next_ft); + rule->node.type = FS_TYPE_FLOW_DEST; + if (dest) + memcpy(&rule->dest_attr, dest, sizeof(*dest)); + + return rule; +} + +static struct mlx5_flow_handle *alloc_handle(int num_rules) +{ + struct mlx5_flow_handle *handle; + + handle = kzalloc(sizeof(*handle) + sizeof(handle->rule[0]) * + num_rules, GFP_KERNEL); + if (!handle) + return NULL; + + handle->num_rules = num_rules; + + return handle; +} + +static void destroy_flow_handle(struct fs_fte *fte, + struct mlx5_flow_handle *handle, + struct mlx5_flow_destination *dest, + int i) +{ + for (; --i >= 0;) { + if (refcount_dec_and_test(&handle->rule[i]->node.refcount)) { + fte->dests_size--; + list_del(&handle->rule[i]->node.list); + kfree(handle->rule[i]); + } + } + kfree(handle); +} + +static struct mlx5_flow_handle * +create_flow_handle(struct fs_fte *fte, + struct mlx5_flow_destination *dest, + int dest_num, + int *modify_mask, + bool *new_rule) +{ + struct mlx5_flow_handle *handle; + struct mlx5_flow_rule *rule = NULL; + static int count = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS); + static int dst = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST); + int type; + int i = 0; + + handle = alloc_handle((dest_num) ? dest_num : 1); + if (!handle) + return ERR_PTR(-ENOMEM); + + do { + if (dest) { + rule = find_flow_rule(fte, dest + i); + if (rule) { + refcount_inc(&rule->node.refcount); + goto rule_found; + } + } + + *new_rule = true; + rule = alloc_rule(dest + i); + if (!rule) + goto free_rules; + + /* Add dest to dests list- we need flow tables to be in the + * end of the list for forward to next prio rules. + */ + tree_init_node(&rule->node, NULL, del_sw_hw_rule); + if (dest && + dest[i].type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) + list_add(&rule->node.list, &fte->node.children); + else + list_add_tail(&rule->node.list, &fte->node.children); + if (dest) { + fte->dests_size++; + + type = dest[i].type == + MLX5_FLOW_DESTINATION_TYPE_COUNTER; + *modify_mask |= type ? 
count : dst; + } +rule_found: + handle->rule[i] = rule; + } while (++i < dest_num); + + return handle; + +free_rules: + destroy_flow_handle(fte, handle, dest, i); + return ERR_PTR(-ENOMEM); +} + +/* fte should not be deleted while calling this function */ +static struct mlx5_flow_handle * +add_rule_fte(struct fs_fte *fte, + struct mlx5_flow_group *fg, + struct mlx5_flow_destination *dest, + int dest_num, + bool update_action) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_handle *handle; + struct mlx5_flow_table *ft; + int modify_mask = 0; + int err; + bool new_rule = false; + + handle = create_flow_handle(fte, dest, dest_num, &modify_mask, + &new_rule); + if (IS_ERR(handle) || !new_rule) + goto out; + + if (update_action) + modify_mask |= BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION); + + fs_get_obj(ft, fg->node.parent); + root = find_root(&fg->node); + if (!(fte->status & FS_FTE_STATUS_EXISTING)) + err = root->cmds->create_fte(get_dev(&ft->node), + ft, fg, fte); + else + err = root->cmds->update_fte(get_dev(&ft->node), ft, fg->id, + modify_mask, fte); + if (err) + goto free_handle; + + fte->node.active = true; + fte->status |= FS_FTE_STATUS_EXISTING; + atomic_inc(&fte->node.version); + +out: + return handle; + +free_handle: + destroy_flow_handle(fte, handle, dest, handle->num_rules); + return ERR_PTR(err); +} + +static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table *ft, + struct mlx5_flow_spec *spec) +{ + struct list_head *prev = &ft->node.children; + struct mlx5_flow_group *fg; + unsigned int candidate_index = 0; + unsigned int group_size = 0; + + if (!ft->autogroup.active) + return ERR_PTR(-ENOENT); + + if (ft->autogroup.num_groups < ft->autogroup.required_groups) + /* We save place for flow groups in addition to max types */ + group_size = ft->max_fte / (ft->autogroup.required_groups + 1); + + /* ft->max_fte == ft->autogroup.max_types */ + if (group_size == 0) + group_size = 1; + + /* sorted by start_index */ + fs_for_each_fg(fg, ft) { + if (candidate_index + group_size > fg->start_index) + candidate_index = fg->start_index + fg->max_ftes; + else + break; + prev = &fg->node.list; + } + + if (candidate_index + group_size > ft->max_fte) + return ERR_PTR(-ENOSPC); + + fg = alloc_insert_flow_group(ft, + spec->match_criteria_enable, + spec->match_criteria, + candidate_index, + candidate_index + group_size - 1, + prev); + if (IS_ERR(fg)) + goto out; + + ft->autogroup.num_groups++; + +out: + return fg; +} + +static int create_auto_flow_group(struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + struct mlx5_core_dev *dev = get_dev(&ft->node); + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + void *match_criteria_addr; + int err; + u32 *in; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, in, match_criteria_enable, + fg->mask.match_criteria_enable); + MLX5_SET(create_flow_group_in, in, start_flow_index, fg->start_index); + MLX5_SET(create_flow_group_in, in, end_flow_index, fg->start_index + + fg->max_ftes - 1); + match_criteria_addr = MLX5_ADDR_OF(create_flow_group_in, + in, match_criteria); + memcpy(match_criteria_addr, fg->mask.match_criteria, + sizeof(fg->mask.match_criteria)); + + err = root->cmds->create_flow_group(dev, ft, in, &fg->id); + if (!err) { + fg->node.active = true; + trace_mlx5_fs_add_fg(fg); + } + + kvfree(in); + return err; +} + +static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, + struct 
mlx5_flow_destination *d2) +{ + if (d1->type == d2->type) { + if ((d1->type == MLX5_FLOW_DESTINATION_TYPE_VPORT && + d1->vport_num == d2->vport_num) || + (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && + d1->ft == d2->ft) || + (d1->type == MLX5_FLOW_DESTINATION_TYPE_TIR && + d1->tir_num == d2->tir_num)) + return true; + } + + return false; +} + +static struct mlx5_flow_rule *find_flow_rule(struct fs_fte *fte, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_rule *rule; + + list_for_each_entry(rule, &fte->node.children, node.list) { + if (mlx5_flow_dests_cmp(&rule->dest_attr, dest)) + return rule; + } + return NULL; +} + +static bool check_conflicting_actions(u32 action1, u32 action2) +{ + u32 xored_actions = action1 ^ action2; + + /* if one rule only wants to count, it's ok */ + if (action1 == MLX5_FLOW_CONTEXT_ACTION_COUNT || + action2 == MLX5_FLOW_CONTEXT_ACTION_COUNT) + return false; + + if (xored_actions & (MLX5_FLOW_CONTEXT_ACTION_DROP | + MLX5_FLOW_CONTEXT_ACTION_ENCAP | + MLX5_FLOW_CONTEXT_ACTION_DECAP | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP | + MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH)) + return true; + + return false; +} + +static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act *flow_act) +{ + if (check_conflicting_actions(flow_act->action, fte->action.action)) { + mlx5_core_warn(get_dev(&fte->node), + "Found two FTEs with conflicting actions\n"); + return -EEXIST; + } + + if (flow_act->has_flow_tag && + fte->action.flow_tag != flow_act->flow_tag) { + mlx5_core_warn(get_dev(&fte->node), + "FTE flow tag %u already exists with different flow tag %u\n", + fte->action.flow_tag, + flow_act->flow_tag); + return -EEXIST; + } + + return 0; +} + +static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg, + u32 *match_value, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int dest_num, + struct fs_fte *fte) +{ + struct mlx5_flow_handle *handle; + int old_action; + int i; + int ret; + + ret = check_conflicting_ftes(fte, flow_act); + if (ret) + return ERR_PTR(ret); + + old_action = fte->action.action; + fte->action.action |= flow_act->action; + handle = add_rule_fte(fte, fg, dest, dest_num, + old_action != flow_act->action); + if (IS_ERR(handle)) { + fte->action.action = old_action; + return handle; + } + trace_mlx5_fs_set_fte(fte, false); + + for (i = 0; i < handle->num_rules; i++) { + if (refcount_read(&handle->rule[i]->node.refcount) == 1) { + tree_add_node(&handle->rule[i]->node, &fte->node); + trace_mlx5_fs_add_rule(handle->rule[i]); + } + } + return handle; +} + +struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handle) +{ + struct mlx5_flow_rule *dst; + struct fs_fte *fte; + + fs_get_obj(fte, handle->rule[0]->node.parent); + + fs_for_each_dst(dst, fte) { + if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + return dst->dest_attr.counter; + } + + return NULL; +} + +static bool counter_is_valid(struct mlx5_fc *counter, u32 action) +{ + if (!(action & MLX5_FLOW_CONTEXT_ACTION_COUNT)) + return !counter; + + if (!counter) + return false; + + return (action & (MLX5_FLOW_CONTEXT_ACTION_DROP | + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)); +} + +static bool dest_is_valid(struct mlx5_flow_destination *dest, + u32 action, + struct mlx5_flow_table *ft) +{ + if (dest && (dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)) + return counter_is_valid(dest->counter, action); + + if (!(action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) + return true; + + if (!dest || ((dest->type 
== + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) && + (dest->ft->level <= ft->level))) + return false; + return true; +} + +struct match_list { + struct list_head list; + struct mlx5_flow_group *g; +}; + +struct match_list_head { + struct list_head list; + struct match_list first; +}; + +static void free_match_list(struct match_list_head *head) +{ + if (!list_empty(&head->list)) { + struct match_list *iter, *match_tmp; + + list_del(&head->first.list); + tree_put_node(&head->first.g->node); + list_for_each_entry_safe(iter, match_tmp, &head->list, + list) { + tree_put_node(&iter->g->node); + list_del(&iter->list); + kfree(iter); + } + } +} + +static int build_match_list(struct match_list_head *match_head, + struct mlx5_flow_table *ft, + struct mlx5_flow_spec *spec) +{ + struct rhlist_head *tmp, *list; + struct mlx5_flow_group *g; + int err = 0; + + rcu_read_lock(); + INIT_LIST_HEAD(&match_head->list); + /* Collect all fgs which has a matching match_criteria */ + list = rhltable_lookup(&ft->fgs_hash, spec, rhash_fg); + /* RCU is atomic, we can't execute FW commands here */ + rhl_for_each_entry_rcu(g, tmp, list, hash) { + struct match_list *curr_match; + + if (likely(list_empty(&match_head->list))) { + if (!tree_get_node(&g->node)) + continue; + match_head->first.g = g; + list_add_tail(&match_head->first.list, + &match_head->list); + continue; + } + + curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC); + if (!curr_match) { + free_match_list(match_head); + err = -ENOMEM; + goto out; + } + if (!tree_get_node(&g->node)) { + kfree(curr_match); + continue; + } + curr_match->g = g; + list_add_tail(&curr_match->list, &match_head->list); + } +out: + rcu_read_unlock(); + return err; +} + +static u64 matched_fgs_get_version(struct list_head *match_head) +{ + struct match_list *iter; + u64 version = 0; + + list_for_each_entry(iter, match_head, list) + version += (u64)atomic_read(&iter->g->node.version); + return version; +} + +static struct mlx5_flow_handle * +try_add_to_existing_fg(struct mlx5_flow_table *ft, + struct list_head *match_head, + struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int dest_num, + int ft_version) +{ + struct mlx5_flow_steering *steering = get_steering(&ft->node); + struct mlx5_flow_group *g; + struct mlx5_flow_handle *rule; + struct match_list *iter; + bool take_write = false; + struct fs_fte *fte; + u64 version; + int err; + + fte = alloc_fte(ft, spec->match_value, flow_act); + if (IS_ERR(fte)) + return ERR_PTR(-ENOMEM); + + list_for_each_entry(iter, match_head, list) { + nested_down_read_ref_node(&iter->g->node, FS_LOCK_PARENT); + } + +search_again_locked: + version = matched_fgs_get_version(match_head); + /* Try to find a fg that already contains a matching fte */ + list_for_each_entry(iter, match_head, list) { + struct fs_fte *fte_tmp; + + g = iter->g; + fte_tmp = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value, + rhash_fte); + if (!fte_tmp || !tree_get_node(&fte_tmp->node)) + continue; + + nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD); + if (!take_write) { + list_for_each_entry(iter, match_head, list) + up_read_ref_node(&iter->g->node); + } else { + list_for_each_entry(iter, match_head, list) + up_write_ref_node(&iter->g->node); + } + + rule = add_rule_fg(g, spec->match_value, + flow_act, dest, dest_num, fte_tmp); + up_write_ref_node(&fte_tmp->node); + tree_put_node(&fte_tmp->node); + kmem_cache_free(steering->ftes_cache, fte); + return rule; + } + + /* No group with matching fte found. 
Try to add a new fte to any + * matching fg. + */ + + if (!take_write) { + list_for_each_entry(iter, match_head, list) + up_read_ref_node(&iter->g->node); + list_for_each_entry(iter, match_head, list) + nested_down_write_ref_node(&iter->g->node, + FS_LOCK_PARENT); + take_write = true; + } + + /* Check the ft version, for case that new flow group + * was added while the fgs weren't locked + */ + if (atomic_read(&ft->node.version) != ft_version) { + rule = ERR_PTR(-EAGAIN); + goto out; + } + + /* Check the fgs version, for case the new FTE with the + * same values was added while the fgs weren't locked + */ + if (version != matched_fgs_get_version(match_head)) + goto search_again_locked; + + list_for_each_entry(iter, match_head, list) { + g = iter->g; + + if (!g->node.active) + continue; + err = insert_fte(g, fte); + if (err) { + if (err == -ENOSPC) + continue; + list_for_each_entry(iter, match_head, list) + up_write_ref_node(&iter->g->node); + kmem_cache_free(steering->ftes_cache, fte); + return ERR_PTR(err); + } + + nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD); + list_for_each_entry(iter, match_head, list) + up_write_ref_node(&iter->g->node); + rule = add_rule_fg(g, spec->match_value, + flow_act, dest, dest_num, fte); + up_write_ref_node(&fte->node); + tree_put_node(&fte->node); + return rule; + } + rule = ERR_PTR(-ENOENT); +out: + list_for_each_entry(iter, match_head, list) + up_write_ref_node(&iter->g->node); + kmem_cache_free(steering->ftes_cache, fte); + return rule; +} + +static struct mlx5_flow_handle * +_mlx5_add_flow_rules(struct mlx5_flow_table *ft, + struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int dest_num) + +{ + struct mlx5_flow_steering *steering = get_steering(&ft->node); + struct mlx5_flow_group *g; + struct mlx5_flow_handle *rule; + struct match_list_head match_head; + bool take_write = false; + struct fs_fte *fte; + int version; + int err; + int i; + + if (!check_valid_spec(spec)) + return ERR_PTR(-EINVAL); + + for (i = 0; i < dest_num; i++) { + if (!dest_is_valid(&dest[i], flow_act->action, ft)) + return ERR_PTR(-EINVAL); + } + nested_down_read_ref_node(&ft->node, FS_LOCK_GRANDPARENT); +search_again_locked: + version = atomic_read(&ft->node.version); + + /* Collect all fgs which has a matching match_criteria */ + err = build_match_list(&match_head, ft, spec); + if (err) { + if (take_write) + up_write_ref_node(&ft->node); + return ERR_PTR(err); + } + + if (!take_write) + up_read_ref_node(&ft->node); + + rule = try_add_to_existing_fg(ft, &match_head.list, spec, flow_act, dest, + dest_num, version); + free_match_list(&match_head); + if (!IS_ERR(rule) || + (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN)) { + if (take_write) + up_write_ref_node(&ft->node); + return rule; + } + + if (!take_write) { + nested_down_write_ref_node(&ft->node, FS_LOCK_GRANDPARENT); + take_write = true; + } + + if (PTR_ERR(rule) == -EAGAIN || + version != atomic_read(&ft->node.version)) + goto search_again_locked; + + g = alloc_auto_flow_group(ft, spec); + if (IS_ERR(g)) { + rule = (void *)g; + up_write_ref_node(&ft->node); + return rule; + } + + nested_down_write_ref_node(&g->node, FS_LOCK_PARENT); + up_write_ref_node(&ft->node); + + err = create_auto_flow_group(ft, g); + if (err) + goto err_release_fg; + + fte = alloc_fte(ft, spec->match_value, flow_act); + if (IS_ERR(fte)) { + err = PTR_ERR(fte); + goto err_release_fg; + } + + err = insert_fte(g, fte); + if (err) { + kmem_cache_free(steering->ftes_cache, fte); + goto 
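/*
 * The retry scheme implemented by try_add_to_existing_fg() and
 * _mlx5_add_flow_rules() above, condensed (pseudo-code, not driver calls):
 *
 *	take read locks on the matching fgs and snapshot their versions
 *	if a matching fte already exists in some fg -> reuse it, done
 *	upgrade to write locks on the fgs
 *	if ft->node.version changed                 -> -EAGAIN: the caller
 *	                                               rebuilds the fg list
 *	                                               under the table lock
 *	if the fg versions changed                  -> search again, a racing
 *	                                               thread may have added
 *	                                               the same fte
 *	otherwise                                   -> insert the new fte into
 *	                                               the first active fg
 *	                                               with a free index
 */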
err_release_fg; + } + + nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD); + up_write_ref_node(&g->node); + rule = add_rule_fg(g, spec->match_value, flow_act, dest, + dest_num, fte); + up_write_ref_node(&fte->node); + tree_put_node(&fte->node); + tree_put_node(&g->node); + return rule; + +err_release_fg: + up_write_ref_node(&g->node); + tree_put_node(&g->node); + return ERR_PTR(err); +} + +static bool fwd_next_prio_supported(struct mlx5_flow_table *ft) +{ + return ((ft->type == FS_FT_NIC_RX) && + (MLX5_CAP_FLOWTABLE(get_dev(&ft->node), nic_rx_multi_path_tirs))); +} + +struct mlx5_flow_handle * +mlx5_add_flow_rules(struct mlx5_flow_table *ft, + struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int dest_num) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + struct mlx5_flow_destination gen_dest = {}; + struct mlx5_flow_table *next_ft = NULL; + struct mlx5_flow_handle *handle = NULL; + u32 sw_action = flow_act->action; + struct fs_prio *prio; + + fs_get_obj(prio, ft->node.parent); + if (flow_act->action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + if (!fwd_next_prio_supported(ft)) + return ERR_PTR(-EOPNOTSUPP); + if (dest) + return ERR_PTR(-EINVAL); + mutex_lock(&root->chain_lock); + next_ft = find_next_chained_ft(prio); + if (next_ft) { + gen_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + gen_dest.ft = next_ft; + dest = &gen_dest; + dest_num = 1; + flow_act->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } else { + mutex_unlock(&root->chain_lock); + return ERR_PTR(-EOPNOTSUPP); + } + } + + handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, dest_num); + + if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + if (!IS_ERR_OR_NULL(handle) && + (list_empty(&handle->rule[0]->next_ft))) { + mutex_lock(&next_ft->lock); + list_add(&handle->rule[0]->next_ft, + &next_ft->fwd_rules); + mutex_unlock(&next_ft->lock); + handle->rule[0]->sw_action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + } + mutex_unlock(&root->chain_lock); + } + return handle; +} +EXPORT_SYMBOL(mlx5_add_flow_rules); + +void mlx5_del_flow_rules(struct mlx5_flow_handle *handle) +{ + int i; + + for (i = handle->num_rules - 1; i >= 0; i--) + tree_remove_node(&handle->rule[i]->node); + kfree(handle); +} +EXPORT_SYMBOL(mlx5_del_flow_rules); + +/* Assuming prio->node.children(flow tables) is sorted by level */ +static struct mlx5_flow_table *find_next_ft(struct mlx5_flow_table *ft) +{ + struct fs_prio *prio; + + fs_get_obj(prio, ft->node.parent); + + if (!list_is_last(&ft->node.list, &prio->node.children)) + return list_next_entry(ft, node.list); + return find_next_chained_ft(prio); +} + +static int update_root_ft_destroy(struct mlx5_flow_table *ft) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + struct mlx5_ft_underlay_qp *uqp; + struct mlx5_flow_table *new_root_ft = NULL; + int err = 0; + u32 qpn; + + if (root->root_ft != ft) + return 0; + + new_root_ft = find_next_ft(ft); + if (!new_root_ft) { + root->root_ft = NULL; + return 0; + } + + if (list_empty(&root->underlay_qpns)) { + /* Don't set any QPN (zero) in case QPN list is empty */ + qpn = 0; + err = root->cmds->update_root_ft(root->dev, new_root_ft, + qpn, false); + } else { + list_for_each_entry(uqp, &root->underlay_qpns, list) { + qpn = uqp->qpn; + err = root->cmds->update_root_ft(root->dev, + new_root_ft, qpn, + false); + if (err) + break; + } + } + + if (err) + mlx5_core_warn(root->dev, + "Update root flow table of id(%u) qpn(%d) failed\n", + ft->id, qpn); + else + 
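/*
 * A minimal caller-side sketch for mlx5_add_flow_rules() and
 * mlx5_del_flow_rules() above; assumes <linux/mlx5/fs.h>, "my_ft" and
 * "my_tirn" are placeholders owned by the caller, and the empty spec
 * matches all packets reaching the table.
 */
static struct mlx5_flow_handle *
example_fwd_all_to_tir(struct mlx5_flow_table *my_ft, u32 my_tirn)
{
	struct mlx5_flow_destination dest = {};
	struct mlx5_flow_act flow_act = {};
	struct mlx5_flow_handle *handle;
	struct mlx5_flow_spec *spec;

	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
	if (!spec)
		return ERR_PTR(-ENOMEM);

	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
	dest.tir_num = my_tirn;

	handle = mlx5_add_flow_rules(my_ft, spec, &flow_act, &dest, 1);
	kvfree(spec);

	/* released later with mlx5_del_flow_rules(handle) */
	return handle;
}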
root->root_ft = new_root_ft; + + return 0; +} + +/* Connect flow table from previous priority to + * the next flow table. + */ +static int disconnect_flow_table(struct mlx5_flow_table *ft) +{ + struct mlx5_core_dev *dev = get_dev(&ft->node); + struct mlx5_flow_table *next_ft; + struct fs_prio *prio; + int err = 0; + + err = update_root_ft_destroy(ft); + if (err) + return err; + + fs_get_obj(prio, ft->node.parent); + if (!(list_first_entry(&prio->node.children, + struct mlx5_flow_table, + node.list) == ft)) + return 0; + + next_ft = find_next_chained_ft(prio); + err = connect_fwd_rules(dev, next_ft, ft); + if (err) + return err; + + err = connect_prev_fts(dev, next_ft, prio); + if (err) + mlx5_core_warn(dev, "Failed to disconnect flow table %d\n", + ft->id); + return err; +} + +int mlx5_destroy_flow_table(struct mlx5_flow_table *ft) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + int err = 0; + + mutex_lock(&root->chain_lock); + err = disconnect_flow_table(ft); + if (err) { + mutex_unlock(&root->chain_lock); + return err; + } + if (tree_remove_node(&ft->node)) + mlx5_core_warn(get_dev(&ft->node), "Flow table %d wasn't destroyed, refcount > 1\n", + ft->id); + mutex_unlock(&root->chain_lock); + + return err; +} +EXPORT_SYMBOL(mlx5_destroy_flow_table); + +void mlx5_destroy_flow_group(struct mlx5_flow_group *fg) +{ + if (tree_remove_node(&fg->node)) + mlx5_core_warn(get_dev(&fg->node), "Flow group %d wasn't destroyed, refcount > 1\n", + fg->id); +} + +struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + struct mlx5_flow_root_namespace *root_ns; + int prio; + struct fs_prio *fs_prio; + struct mlx5_flow_namespace *ns; + + if (!steering) + return NULL; + + switch (type) { + case MLX5_FLOW_NAMESPACE_BYPASS: + case MLX5_FLOW_NAMESPACE_LAG: + case MLX5_FLOW_NAMESPACE_OFFLOADS: + case MLX5_FLOW_NAMESPACE_ETHTOOL: + case MLX5_FLOW_NAMESPACE_KERNEL: + case MLX5_FLOW_NAMESPACE_LEFTOVERS: + case MLX5_FLOW_NAMESPACE_ANCHOR: + prio = type; + break; + case MLX5_FLOW_NAMESPACE_FDB: + if (steering->fdb_root_ns) + return &steering->fdb_root_ns->ns; + else + return NULL; + case MLX5_FLOW_NAMESPACE_SNIFFER_RX: + if (steering->sniffer_rx_root_ns) + return &steering->sniffer_rx_root_ns->ns; + else + return NULL; + case MLX5_FLOW_NAMESPACE_SNIFFER_TX: + if (steering->sniffer_tx_root_ns) + return &steering->sniffer_tx_root_ns->ns; + else + return NULL; + case MLX5_FLOW_NAMESPACE_EGRESS: + if (steering->egress_root_ns) + return &steering->egress_root_ns->ns; + else + return NULL; + default: + return NULL; + } + + root_ns = steering->root_ns; + if (!root_ns) + return NULL; + + fs_prio = find_prio(&root_ns->ns, prio); + if (!fs_prio) + return NULL; + + ns = list_first_entry(&fs_prio->node.children, + typeof(*ns), + node.list); + + return ns; +} +EXPORT_SYMBOL(mlx5_get_flow_namespace); + +struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type, + int vport) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + + if (!steering || vport >= MLX5_TOTAL_VPORTS(dev)) + return NULL; + + switch (type) { + case MLX5_FLOW_NAMESPACE_ESW_EGRESS: + if (steering->esw_egress_root_ns && + steering->esw_egress_root_ns[vport]) + return &steering->esw_egress_root_ns[vport]->ns; + else + return NULL; + case MLX5_FLOW_NAMESPACE_ESW_INGRESS: + if (steering->esw_ingress_root_ns && + 
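/*
 * A caller-side sketch for mlx5_get_flow_namespace() above combined with
 * mlx5_create_flow_table()/mlx5_destroy_flow_table(); the namespace type
 * and the attr values are arbitrary example choices.
 */
static struct mlx5_flow_table *example_create_table(struct mlx5_core_dev *dev)
{
	struct mlx5_flow_table_attr ft_attr = {};
	struct mlx5_flow_namespace *ns;

	ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_KERNEL);
	if (!ns)
		return ERR_PTR(-EOPNOTSUPP);

	ft_attr.prio = 0;
	ft_attr.max_fte = 16;
	ft_attr.level = 0;

	/* torn down later with mlx5_destroy_flow_table() */
	return mlx5_create_flow_table(ns, &ft_attr);
}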
steering->esw_ingress_root_ns[vport]) + return &steering->esw_ingress_root_ns[vport]->ns; + else + return NULL; + default: + return NULL; + } +} + +static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns, + unsigned int prio, int num_levels) +{ + struct fs_prio *fs_prio; + + fs_prio = kzalloc(sizeof(*fs_prio), GFP_KERNEL); + if (!fs_prio) + return ERR_PTR(-ENOMEM); + + fs_prio->node.type = FS_TYPE_PRIO; + tree_init_node(&fs_prio->node, NULL, del_sw_prio); + tree_add_node(&fs_prio->node, &ns->node); + fs_prio->num_levels = num_levels; + fs_prio->prio = prio; + list_add_tail(&fs_prio->node.list, &ns->node.children); + + return fs_prio; +} + +static struct mlx5_flow_namespace *fs_init_namespace(struct mlx5_flow_namespace + *ns) +{ + ns->node.type = FS_TYPE_NAMESPACE; + + return ns; +} + +static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio) +{ + struct mlx5_flow_namespace *ns; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return ERR_PTR(-ENOMEM); + + fs_init_namespace(ns); + tree_init_node(&ns->node, NULL, del_sw_ns); + tree_add_node(&ns->node, &prio->node); + list_add_tail(&ns->node.list, &prio->node.children); + + return ns; +} + +static int create_leaf_prios(struct mlx5_flow_namespace *ns, int prio, + struct init_tree_node *prio_metadata) +{ + struct fs_prio *fs_prio; + int i; + + for (i = 0; i < prio_metadata->num_leaf_prios; i++) { + fs_prio = fs_create_prio(ns, prio++, prio_metadata->num_levels); + if (IS_ERR(fs_prio)) + return PTR_ERR(fs_prio); + } + return 0; +} + +#define FLOW_TABLE_BIT_SZ 1 +#define GET_FLOW_TABLE_CAP(dev, offset) \ + ((be32_to_cpu(*((__be32 *)(dev->caps.hca_cur[MLX5_CAP_FLOW_TABLE]) + \ + offset / 32)) >> \ + (32 - FLOW_TABLE_BIT_SZ - (offset & 0x1f))) & FLOW_TABLE_BIT_SZ) +static bool has_required_caps(struct mlx5_core_dev *dev, struct node_caps *caps) +{ + int i; + + for (i = 0; i < caps->arr_sz; i++) { + if (!GET_FLOW_TABLE_CAP(dev, caps->caps[i])) + return false; + } + return true; +} + +static int init_root_tree_recursive(struct mlx5_flow_steering *steering, + struct init_tree_node *init_node, + struct fs_node *fs_parent_node, + struct init_tree_node *init_parent_node, + int prio) +{ + int max_ft_level = MLX5_CAP_FLOWTABLE(steering->dev, + flow_table_properties_nic_receive. 
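/*
 * Worked example of GET_FLOW_TABLE_CAP() above, for offset = 35:
 *	dword index  = 35 / 32    = 1
 *	bit in dword = 35 & 0x1f  = 3
 *	shift        = 32 - 1 - 3 = 28
 * i.e. bit 3 counted from the MSB of the second big-endian dword, which
 * is capability bit 35 of the FLOW_TABLE capability block.
 */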
+ max_ft_level); + struct mlx5_flow_namespace *fs_ns; + struct fs_prio *fs_prio; + struct fs_node *base; + int i; + int err; + + if (init_node->type == FS_TYPE_PRIO) { + if ((init_node->min_ft_level > max_ft_level) || + !has_required_caps(steering->dev, &init_node->caps)) + return 0; + + fs_get_obj(fs_ns, fs_parent_node); + if (init_node->num_leaf_prios) + return create_leaf_prios(fs_ns, prio, init_node); + fs_prio = fs_create_prio(fs_ns, prio, init_node->num_levels); + if (IS_ERR(fs_prio)) + return PTR_ERR(fs_prio); + base = &fs_prio->node; + } else if (init_node->type == FS_TYPE_NAMESPACE) { + fs_get_obj(fs_prio, fs_parent_node); + fs_ns = fs_create_namespace(fs_prio); + if (IS_ERR(fs_ns)) + return PTR_ERR(fs_ns); + base = &fs_ns->node; + } else { + return -EINVAL; + } + prio = 0; + for (i = 0; i < init_node->ar_size; i++) { + err = init_root_tree_recursive(steering, &init_node->children[i], + base, init_node, prio); + if (err) + return err; + if (init_node->children[i].type == FS_TYPE_PRIO && + init_node->children[i].num_leaf_prios) { + prio += init_node->children[i].num_leaf_prios; + } + } + + return 0; +} + +static int init_root_tree(struct mlx5_flow_steering *steering, + struct init_tree_node *init_node, + struct fs_node *fs_parent_node) +{ + int i; + struct mlx5_flow_namespace *fs_ns; + int err; + + fs_get_obj(fs_ns, fs_parent_node); + for (i = 0; i < init_node->ar_size; i++) { + err = init_root_tree_recursive(steering, &init_node->children[i], + &fs_ns->node, + init_node, i); + if (err) + return err; + } + return 0; +} + +static struct mlx5_flow_root_namespace +*create_root_ns(struct mlx5_flow_steering *steering, + enum fs_flow_table_type table_type) +{ + const struct mlx5_flow_cmds *cmds = mlx5_fs_cmd_get_default(table_type); + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_namespace *ns; + + if (mlx5_accel_ipsec_device_caps(steering->dev) & MLX5_ACCEL_IPSEC_CAP_DEVICE && + (table_type == FS_FT_NIC_RX || table_type == FS_FT_NIC_TX)) + cmds = mlx5_fs_cmd_get_default_ipsec_fpga_cmds(table_type); + + /* Create the root namespace */ + root_ns = kvzalloc(sizeof(*root_ns), GFP_KERNEL); + if (!root_ns) + return NULL; + + root_ns->dev = steering->dev; + root_ns->table_type = table_type; + root_ns->cmds = cmds; + + INIT_LIST_HEAD(&root_ns->underlay_qpns); + + ns = &root_ns->ns; + fs_init_namespace(ns); + mutex_init(&root_ns->chain_lock); + tree_init_node(&ns->node, NULL, NULL); + tree_add_node(&ns->node, NULL); + + return root_ns; +} + +static void set_prio_attrs_in_prio(struct fs_prio *prio, int acc_level); + +static int set_prio_attrs_in_ns(struct mlx5_flow_namespace *ns, int acc_level) +{ + struct fs_prio *prio; + + fs_for_each_prio(prio, ns) { + /* This updates prio start_level and num_levels */ + set_prio_attrs_in_prio(prio, acc_level); + acc_level += prio->num_levels; + } + return acc_level; +} + +static void set_prio_attrs_in_prio(struct fs_prio *prio, int acc_level) +{ + struct mlx5_flow_namespace *ns; + int acc_level_ns = acc_level; + + prio->start_level = acc_level; + fs_for_each_ns(ns, prio) + /* This updates start_level and num_levels of ns's priority descendants */ + acc_level_ns = set_prio_attrs_in_ns(ns, acc_level); + if (!prio->num_levels) + prio->num_levels = acc_level_ns - prio->start_level; + WARN_ON(prio->num_levels < acc_level_ns - prio->start_level); +} + +static void set_prio_attrs(struct mlx5_flow_root_namespace *root_ns) +{ + struct mlx5_flow_namespace *ns = &root_ns->ns; + struct fs_prio *prio; + int start_level = 0; + + fs_for_each_prio(prio, ns) { 
+ set_prio_attrs_in_prio(prio, start_level); + start_level += prio->num_levels; + } +} + +#define ANCHOR_PRIO 0 +#define ANCHOR_SIZE 1 +#define ANCHOR_LEVEL 0 +static int create_anchor_flow_table(struct mlx5_flow_steering *steering) +{ + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_table *ft; + + ns = mlx5_get_flow_namespace(steering->dev, MLX5_FLOW_NAMESPACE_ANCHOR); + if (WARN_ON(!ns)) + return -EINVAL; + + ft_attr.max_fte = ANCHOR_SIZE; + ft_attr.level = ANCHOR_LEVEL; + ft_attr.prio = ANCHOR_PRIO; + + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + mlx5_core_err(steering->dev, "Failed to create last anchor flow table"); + return PTR_ERR(ft); + } + return 0; +} + +static int init_root_ns(struct mlx5_flow_steering *steering) +{ + int err; + + steering->root_ns = create_root_ns(steering, FS_FT_NIC_RX); + if (!steering->root_ns) + return -ENOMEM; + + err = init_root_tree(steering, &root_fs, &steering->root_ns->ns.node); + if (err) + goto out_err; + + set_prio_attrs(steering->root_ns); + err = create_anchor_flow_table(steering); + if (err) + goto out_err; + + return 0; + +out_err: + cleanup_root_ns(steering->root_ns); + steering->root_ns = NULL; + return err; +} + +static void clean_tree(struct fs_node *node) +{ + if (node) { + struct fs_node *iter; + struct fs_node *temp; + + tree_get_node(node); + list_for_each_entry_safe(iter, temp, &node->children, list) + clean_tree(iter); + tree_put_node(node); + tree_remove_node(node); + } +} + +static void cleanup_root_ns(struct mlx5_flow_root_namespace *root_ns) +{ + if (!root_ns) + return; + + clean_tree(&root_ns->ns.node); +} + +static void cleanup_egress_acls_root_ns(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int i; + + if (!steering->esw_egress_root_ns) + return; + + for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) + cleanup_root_ns(steering->esw_egress_root_ns[i]); + + kfree(steering->esw_egress_root_ns); +} + +static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int i; + + if (!steering->esw_ingress_root_ns) + return; + + for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) + cleanup_root_ns(steering->esw_ingress_root_ns[i]); + + kfree(steering->esw_ingress_root_ns); +} + +void mlx5_cleanup_fs(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + + cleanup_root_ns(steering->root_ns); + cleanup_egress_acls_root_ns(dev); + cleanup_ingress_acls_root_ns(dev); + cleanup_root_ns(steering->fdb_root_ns); + cleanup_root_ns(steering->sniffer_rx_root_ns); + cleanup_root_ns(steering->sniffer_tx_root_ns); + cleanup_root_ns(steering->egress_root_ns); + mlx5_cleanup_fc_stats(dev); + kmem_cache_destroy(steering->ftes_cache); + kmem_cache_destroy(steering->fgs_cache); + kfree(steering); +} + +static int init_sniffer_tx_root_ns(struct mlx5_flow_steering *steering) +{ + struct fs_prio *prio; + + steering->sniffer_tx_root_ns = create_root_ns(steering, FS_FT_SNIFFER_TX); + if (!steering->sniffer_tx_root_ns) + return -ENOMEM; + + /* Create single prio */ + prio = fs_create_prio(&steering->sniffer_tx_root_ns->ns, 0, 1); + if (IS_ERR(prio)) { + cleanup_root_ns(steering->sniffer_tx_root_ns); + return PTR_ERR(prio); + } + return 0; +} + +static int init_sniffer_rx_root_ns(struct mlx5_flow_steering *steering) +{ + struct fs_prio *prio; + + steering->sniffer_rx_root_ns = create_root_ns(steering, FS_FT_SNIFFER_RX); + if 
(!steering->sniffer_rx_root_ns) + return -ENOMEM; + + /* Create single prio */ + prio = fs_create_prio(&steering->sniffer_rx_root_ns->ns, 0, 1); + if (IS_ERR(prio)) { + cleanup_root_ns(steering->sniffer_rx_root_ns); + return PTR_ERR(prio); + } + return 0; +} + +static int init_fdb_root_ns(struct mlx5_flow_steering *steering) +{ + struct fs_prio *prio; + + steering->fdb_root_ns = create_root_ns(steering, FS_FT_FDB); + if (!steering->fdb_root_ns) + return -ENOMEM; + + prio = fs_create_prio(&steering->fdb_root_ns->ns, 0, 1); + if (IS_ERR(prio)) + goto out_err; + + prio = fs_create_prio(&steering->fdb_root_ns->ns, 1, 1); + if (IS_ERR(prio)) + goto out_err; + + set_prio_attrs(steering->fdb_root_ns); + return 0; + +out_err: + cleanup_root_ns(steering->fdb_root_ns); + steering->fdb_root_ns = NULL; + return PTR_ERR(prio); +} + +static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering, int vport) +{ + struct fs_prio *prio; + + steering->esw_egress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_EGRESS_ACL); + if (!steering->esw_egress_root_ns[vport]) + return -ENOMEM; + + /* create 1 prio*/ + prio = fs_create_prio(&steering->esw_egress_root_ns[vport]->ns, 0, 1); + return PTR_ERR_OR_ZERO(prio); +} + +static int init_ingress_acl_root_ns(struct mlx5_flow_steering *steering, int vport) +{ + struct fs_prio *prio; + + steering->esw_ingress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_INGRESS_ACL); + if (!steering->esw_ingress_root_ns[vport]) + return -ENOMEM; + + /* create 1 prio*/ + prio = fs_create_prio(&steering->esw_ingress_root_ns[vport]->ns, 0, 1); + return PTR_ERR_OR_ZERO(prio); +} + +static int init_egress_acls_root_ns(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int err; + int i; + + steering->esw_egress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev), + sizeof(*steering->esw_egress_root_ns), + GFP_KERNEL); + if (!steering->esw_egress_root_ns) + return -ENOMEM; + + for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) { + err = init_egress_acl_root_ns(steering, i); + if (err) + goto cleanup_root_ns; + } + + return 0; + +cleanup_root_ns: + for (i--; i >= 0; i--) + cleanup_root_ns(steering->esw_egress_root_ns[i]); + kfree(steering->esw_egress_root_ns); + return err; +} + +static int init_ingress_acls_root_ns(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int err; + int i; + + steering->esw_ingress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev), + sizeof(*steering->esw_ingress_root_ns), + GFP_KERNEL); + if (!steering->esw_ingress_root_ns) + return -ENOMEM; + + for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) { + err = init_ingress_acl_root_ns(steering, i); + if (err) + goto cleanup_root_ns; + } + + return 0; + +cleanup_root_ns: + for (i--; i >= 0; i--) + cleanup_root_ns(steering->esw_ingress_root_ns[i]); + kfree(steering->esw_ingress_root_ns); + return err; +} + +static int init_egress_root_ns(struct mlx5_flow_steering *steering) +{ + struct fs_prio *prio; + + steering->egress_root_ns = create_root_ns(steering, + FS_FT_NIC_TX); + if (!steering->egress_root_ns) + return -ENOMEM; + + /* create 1 prio*/ + prio = fs_create_prio(&steering->egress_root_ns->ns, 0, 1); + return PTR_ERR_OR_ZERO(prio); +} + +int mlx5_init_fs(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering; + int err = 0; + + err = mlx5_init_fc_stats(dev); + if (err) + return err; + + steering = kzalloc(sizeof(*steering), GFP_KERNEL); + if (!steering) + return -ENOMEM; + steering->dev = dev; + dev->priv.steering = steering; + + 
steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs", + sizeof(struct mlx5_flow_group), 0, + 0, NULL); + steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0, + 0, NULL); + if (!steering->ftes_cache || !steering->fgs_cache) { + err = -ENOMEM; + goto err; + } + + if ((((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && + (MLX5_CAP_GEN(dev, nic_flow_table))) || + ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && + MLX5_CAP_GEN(dev, ipoib_enhanced_offloads))) && + MLX5_CAP_FLOWTABLE_NIC_RX(dev, ft_support)) { + err = init_root_ns(steering); + if (err) + goto err; + } + + if (MLX5_CAP_GEN(dev, eswitch_flow_table)) { + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, ft_support)) { + err = init_fdb_root_ns(steering); + if (err) + goto err; + } + if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) { + err = init_egress_acls_root_ns(dev); + if (err) + goto err; + } + if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) { + err = init_ingress_acls_root_ns(dev); + if (err) + goto err; + } + } + + if (MLX5_CAP_FLOWTABLE_SNIFFER_RX(dev, ft_support)) { + err = init_sniffer_rx_root_ns(steering); + if (err) + goto err; + } + + if (MLX5_CAP_FLOWTABLE_SNIFFER_TX(dev, ft_support)) { + err = init_sniffer_tx_root_ns(steering); + if (err) + goto err; + } + + if (MLX5_IPSEC_DEV(dev)) { + err = init_egress_root_ns(steering); + if (err) + goto err; + } + + return 0; +err: + mlx5_cleanup_fs(dev); + return err; +} + +int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn) +{ + struct mlx5_flow_root_namespace *root = dev->priv.steering->root_ns; + struct mlx5_ft_underlay_qp *new_uqp; + int err = 0; + + new_uqp = kzalloc(sizeof(*new_uqp), GFP_KERNEL); + if (!new_uqp) + return -ENOMEM; + + mutex_lock(&root->chain_lock); + + if (!root->root_ft) { + err = -EINVAL; + goto update_ft_fail; + } + + err = root->cmds->update_root_ft(dev, root->root_ft, underlay_qpn, + false); + if (err) { + mlx5_core_warn(dev, "Failed adding underlay QPN (%u) to root FT err(%d)\n", + underlay_qpn, err); + goto update_ft_fail; + } + + new_uqp->qpn = underlay_qpn; + list_add_tail(&new_uqp->list, &root->underlay_qpns); + + mutex_unlock(&root->chain_lock); + + return 0; + +update_ft_fail: + mutex_unlock(&root->chain_lock); + kfree(new_uqp); + return err; +} +EXPORT_SYMBOL(mlx5_fs_add_rx_underlay_qpn); + +int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn) +{ + struct mlx5_flow_root_namespace *root = dev->priv.steering->root_ns; + struct mlx5_ft_underlay_qp *uqp; + bool found = false; + int err = 0; + + mutex_lock(&root->chain_lock); + list_for_each_entry(uqp, &root->underlay_qpns, list) { + if (uqp->qpn == underlay_qpn) { + found = true; + break; + } + } + + if (!found) { + mlx5_core_warn(dev, "Failed finding underlay qp (%u) in qpn list\n", + underlay_qpn); + err = -EINVAL; + goto out; + } + + err = root->cmds->update_root_ft(dev, root->root_ft, underlay_qpn, + true); + if (err) + mlx5_core_warn(dev, "Failed removing underlay QPN (%u) from root FT err(%d)\n", + underlay_qpn, err); + + list_del(&uqp->list); + mutex_unlock(&root->chain_lock); + kfree(uqp); + + return 0; + +out: + mutex_unlock(&root->chain_lock); + return err; +} +EXPORT_SYMBOL(mlx5_fs_remove_rx_underlay_qpn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h new file mode 100644 index 0000000..e26d3e9 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_FS_CORE_ +#define _MLX5_FS_CORE_ + +#include +#include +#include + +enum fs_node_type { + FS_TYPE_NAMESPACE, + FS_TYPE_PRIO, + FS_TYPE_FLOW_TABLE, + FS_TYPE_FLOW_GROUP, + FS_TYPE_FLOW_ENTRY, + FS_TYPE_FLOW_DEST +}; + +enum fs_flow_table_type { + FS_FT_NIC_RX = 0x0, + FS_FT_NIC_TX = 0x1, + FS_FT_ESW_EGRESS_ACL = 0x2, + FS_FT_ESW_INGRESS_ACL = 0x3, + FS_FT_FDB = 0X4, + FS_FT_SNIFFER_RX = 0X5, + FS_FT_SNIFFER_TX = 0X6, + FS_FT_MAX_TYPE = FS_FT_SNIFFER_TX, +}; + +enum fs_flow_table_op_mod { + FS_FT_OP_MOD_NORMAL, + FS_FT_OP_MOD_LAG_DEMUX, +}; + +enum fs_fte_status { + FS_FTE_STATUS_EXISTING = 1UL << 0, +}; + +struct mlx5_flow_steering { + struct mlx5_core_dev *dev; + struct kmem_cache *fgs_cache; + struct kmem_cache *ftes_cache; + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_root_namespace *fdb_root_ns; + struct mlx5_flow_root_namespace **esw_egress_root_ns; + struct mlx5_flow_root_namespace **esw_ingress_root_ns; + struct mlx5_flow_root_namespace *sniffer_tx_root_ns; + struct mlx5_flow_root_namespace *sniffer_rx_root_ns; + struct mlx5_flow_root_namespace *egress_root_ns; +}; + +struct fs_node { + struct list_head list; + struct list_head children; + enum fs_node_type type; + struct fs_node *parent; + struct fs_node *root; + /* lock the node for writing and traversing */ + struct rw_semaphore lock; + refcount_t refcount; + bool active; + void (*del_hw_func)(struct fs_node *); + void (*del_sw_func)(struct fs_node *); + atomic_t version; +}; + +struct mlx5_flow_rule { + struct fs_node node; + struct mlx5_flow_destination dest_attr; + /* next_ft should be accessed under chain_lock and only of + * destination type is FWD_NEXT_fT. 
+ */ + struct list_head next_ft; + u32 sw_action; +}; + +struct mlx5_flow_handle { + int num_rules; + struct mlx5_flow_rule *rule[]; +}; + +/* Type of children is mlx5_flow_group */ +struct mlx5_flow_table { + struct fs_node node; + u32 id; + u16 vport; + unsigned int max_fte; + unsigned int level; + enum fs_flow_table_type type; + enum fs_flow_table_op_mod op_mod; + struct { + bool active; + unsigned int required_groups; + unsigned int num_groups; + } autogroup; + /* Protect fwd_rules */ + struct mutex lock; + /* FWD rules that point on this flow table */ + struct list_head fwd_rules; + u32 flags; + struct rhltable fgs_hash; +}; + +struct mlx5_fc_cache { + u64 packets; + u64 bytes; + u64 lastuse; +}; + +struct mlx5_fc { + struct rb_node node; + struct list_head list; + + /* last{packets,bytes} members are used when calculating the delta since + * last reading + */ + u64 lastpackets; + u64 lastbytes; + + u32 id; + bool deleted; + bool aging; + + struct mlx5_fc_cache cache ____cacheline_aligned_in_smp; +}; + +struct mlx5_ft_underlay_qp { + struct list_head list; + u32 qpn; +}; + +#define MLX5_FTE_MATCH_PARAM_RESERVED reserved_at_600 +/* Calculate the fte_match_param length and without the reserved length. + * Make sure the reserved field is the last. + */ +#define MLX5_ST_SZ_DW_MATCH_PARAM \ + ((MLX5_BYTE_OFF(fte_match_param, MLX5_FTE_MATCH_PARAM_RESERVED) / sizeof(u32)) + \ + BUILD_BUG_ON_ZERO(MLX5_ST_SZ_BYTES(fte_match_param) != \ + MLX5_FLD_SZ_BYTES(fte_match_param, \ + MLX5_FTE_MATCH_PARAM_RESERVED) +\ + MLX5_BYTE_OFF(fte_match_param, \ + MLX5_FTE_MATCH_PARAM_RESERVED))) + +/* Type of children is mlx5_flow_rule */ +struct fs_fte { + struct fs_node node; + u32 val[MLX5_ST_SZ_DW_MATCH_PARAM]; + u32 dests_size; + u32 index; + struct mlx5_flow_act action; + enum fs_fte_status status; + struct mlx5_fc *counter; + struct rhash_head hash; +}; + +/* Type of children is mlx5_flow_table/namespace */ +struct fs_prio { + struct fs_node node; + unsigned int num_levels; + unsigned int start_level; + unsigned int prio; + unsigned int num_ft; +}; + +/* Type of children is fs_prio */ +struct mlx5_flow_namespace { + /* parent == NULL => root ns */ + struct fs_node node; +}; + +struct mlx5_flow_group_mask { + u8 match_criteria_enable; + u32 match_criteria[MLX5_ST_SZ_DW_MATCH_PARAM]; +}; + +/* Type of children is fs_fte */ +struct mlx5_flow_group { + struct fs_node node; + struct mlx5_flow_group_mask mask; + u32 start_index; + u32 max_ftes; + struct ida fte_allocator; + u32 id; + struct rhashtable ftes_hash; + struct rhlist_head hash; +}; + +struct mlx5_flow_root_namespace { + struct mlx5_flow_namespace ns; + enum fs_flow_table_type table_type; + struct mlx5_core_dev *dev; + struct mlx5_flow_table *root_ft; + /* Should be held when chaining flow tables */ + struct mutex chain_lock; + struct list_head underlay_qpns; + const struct mlx5_flow_cmds *cmds; +}; + +int mlx5_init_fc_stats(struct mlx5_core_dev *dev); +void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev); +void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, + struct delayed_work *dwork, + unsigned long delay); +void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, + unsigned long interval); +int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id, + u64 *packets, u64 *bytes); + +int mlx5_init_fs(struct mlx5_core_dev *dev); +void mlx5_cleanup_fs(struct mlx5_core_dev *dev); + +#define fs_get_obj(v, _node) {v = container_of((_node), typeof(*v), node); } + +#define fs_list_for_each_entry(pos, root) \ + list_for_each_entry(pos, root, 
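/*
 * Worked example for MLX5_ST_SZ_DW_MATCH_PARAM above: the mlx5_ifc field
 * name "reserved_at_600" encodes its bit offset, so the reserved area
 * starts at bit 0x600 = byte 0xc0 and the macro evaluates to
 * 0xc0 / sizeof(u32) = 48 dwords - the size used for fs_fte::val and
 * mlx5_flow_group_mask::match_criteria.  The BUILD_BUG_ON_ZERO() term
 * contributes 0; it only enforces that the reserved field is the last
 * one in fte_match_param.
 */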
node.list) + +#define fs_list_for_each_entry_safe(pos, tmp, root) \ + list_for_each_entry_safe(pos, tmp, root, node.list) + +#define fs_for_each_ns_or_ft_reverse(pos, prio) \ + list_for_each_entry_reverse(pos, &(prio)->node.children, list) + +#define fs_for_each_ns_or_ft(pos, prio) \ + list_for_each_entry(pos, (&(prio)->node.children), list) + +#define fs_for_each_prio(pos, ns) \ + fs_list_for_each_entry(pos, &(ns)->node.children) + +#define fs_for_each_ns(pos, prio) \ + fs_list_for_each_entry(pos, &(prio)->node.children) + +#define fs_for_each_ft(pos, prio) \ + fs_list_for_each_entry(pos, &(prio)->node.children) + +#define fs_for_each_ft_safe(pos, tmp, prio) \ + fs_list_for_each_entry_safe(pos, tmp, &(prio)->node.children) + +#define fs_for_each_fg(pos, ft) \ + fs_list_for_each_entry(pos, &(ft)->node.children) + +#define fs_for_each_fte(pos, fg) \ + fs_list_for_each_entry(pos, &(fg)->node.children) + +#define fs_for_each_dst(pos, fte) \ + fs_list_for_each_entry(pos, &(fte)->node.children) + +#define MLX5_CAP_FLOWTABLE_TYPE(mdev, cap, type) ( \ + (type == FS_FT_NIC_RX) ? MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) : \ + (type == FS_FT_ESW_EGRESS_ACL) ? MLX5_CAP_ESW_EGRESS_ACL(mdev, cap) : \ + (type == FS_FT_ESW_INGRESS_ACL) ? MLX5_CAP_ESW_INGRESS_ACL(mdev, cap) : \ + (type == FS_FT_FDB) ? MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) : \ + (type == FS_FT_SNIFFER_RX) ? MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) : \ + (type == FS_FT_SNIFFER_TX) ? MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) : \ + (BUILD_BUG_ON_ZERO(FS_FT_SNIFFER_TX != FS_FT_MAX_TYPE))\ + ) + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c new file mode 100644 index 0000000..b7ab929 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx5_core.h" +#include "fs_core.h" +#include "fs_cmd.h" + +#define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000) +/* Max number of counters to query in bulk read is 32K */ +#define MLX5_SW_MAX_COUNTERS_BULK BIT(15) + +/* locking scheme: + * + * It is the responsibility of the user to prevent concurrent calls or bad + * ordering to mlx5_fc_create(), mlx5_fc_destroy() and accessing a reference + * to struct mlx5_fc. + * e.g en_tc.c is protected by RTNL lock of its caller, and will never call a + * dump (access to struct mlx5_fc) after a counter is destroyed. + * + * access to counter list: + * - create (user context) + * - mlx5_fc_create() only adds to an addlist to be used by + * mlx5_fc_stats_query_work(). addlist is protected by a spinlock. + * - spawn thread to do the actual destroy + * + * - destroy (user context) + * - mark a counter as deleted + * - spawn thread to do the actual del + * + * - dump (user context) + * user should not call dump after destroy + * + * - query (single thread workqueue context) + * destroy/dump - no conflict (see destroy) + * query/dump - packets and bytes might be inconsistent (since update is not + * atomic) + * query/create - no conflict (see create) + * since every create/destroy spawn the work, only after necessary time has + * elapsed, the thread will actually query the hardware. + */ + +static void mlx5_fc_stats_insert(struct rb_root *root, struct mlx5_fc *counter) +{ + struct rb_node **new = &root->rb_node; + struct rb_node *parent = NULL; + + while (*new) { + struct mlx5_fc *this = rb_entry(*new, struct mlx5_fc, node); + int result = counter->id - this->id; + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + /* Add new node and rebalance tree. */ + rb_link_node(&counter->node, parent, new); + rb_insert_color(&counter->node, root); +} + +/* The function returns the last node that was queried so the caller + * function can continue calling it till all counters are queried. + */ +static struct rb_node *mlx5_fc_stats_query(struct mlx5_core_dev *dev, + struct mlx5_fc *first, + u32 last_id) +{ + struct mlx5_cmd_fc_bulk *b; + struct rb_node *node = NULL; + u32 afirst_id; + int num; + int err; + + int max_bulk = min_t(int, MLX5_SW_MAX_COUNTERS_BULK, + (1 << MLX5_CAP_GEN(dev, log_max_flow_counter_bulk))); + + /* first id must be aligned to 4 when using bulk query */ + afirst_id = first->id & ~0x3; + + /* number of counters to query inc. 
the last counter */ + num = ALIGN(last_id - afirst_id + 1, 4); + if (num > max_bulk) { + num = max_bulk; + last_id = afirst_id + num - 1; + } + + b = mlx5_cmd_fc_bulk_alloc(dev, afirst_id, num); + if (!b) { + mlx5_core_err(dev, "Error allocating resources for bulk query\n"); + return NULL; + } + + err = mlx5_cmd_fc_bulk_query(dev, b); + if (err) { + mlx5_core_err(dev, "Error doing bulk query: %d\n", err); + goto out; + } + + for (node = &first->node; node; node = rb_next(node)) { + struct mlx5_fc *counter = rb_entry(node, struct mlx5_fc, node); + struct mlx5_fc_cache *c = &counter->cache; + u64 packets; + u64 bytes; + + if (counter->id > last_id) + break; + + mlx5_cmd_fc_bulk_get(dev, b, + counter->id, &packets, &bytes); + + if (c->packets == packets) + continue; + + c->packets = packets; + c->bytes = bytes; + c->lastuse = jiffies; + } + +out: + mlx5_cmd_fc_bulk_free(b); + + return node; +} + +static void mlx5_fc_stats_work(struct work_struct *work) +{ + struct mlx5_core_dev *dev = container_of(work, struct mlx5_core_dev, + priv.fc_stats.work.work); + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + unsigned long now = jiffies; + struct mlx5_fc *counter = NULL; + struct mlx5_fc *last = NULL; + struct rb_node *node; + LIST_HEAD(tmplist); + + spin_lock(&fc_stats->addlist_lock); + + list_splice_tail_init(&fc_stats->addlist, &tmplist); + + if (!list_empty(&tmplist) || !RB_EMPTY_ROOT(&fc_stats->counters)) + queue_delayed_work(fc_stats->wq, &fc_stats->work, + fc_stats->sampling_interval); + + spin_unlock(&fc_stats->addlist_lock); + + list_for_each_entry(counter, &tmplist, list) + mlx5_fc_stats_insert(&fc_stats->counters, counter); + + node = rb_first(&fc_stats->counters); + while (node) { + counter = rb_entry(node, struct mlx5_fc, node); + + node = rb_next(node); + + if (counter->deleted) { + rb_erase(&counter->node, &fc_stats->counters); + + mlx5_cmd_fc_free(dev, counter->id); + + kfree(counter); + continue; + } + + last = counter; + } + + if (time_before(now, fc_stats->next_query) || !last) + return; + + node = rb_first(&fc_stats->counters); + while (node) { + counter = rb_entry(node, struct mlx5_fc, node); + + node = mlx5_fc_stats_query(dev, counter, last->id); + } + + fc_stats->next_query = now + fc_stats->sampling_interval; +} + +struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc *counter; + int err; + + counter = kzalloc(sizeof(*counter), GFP_KERNEL); + if (!counter) + return ERR_PTR(-ENOMEM); + + err = mlx5_cmd_fc_alloc(dev, &counter->id); + if (err) + goto err_out; + + if (aging) { + counter->cache.lastuse = jiffies; + counter->aging = true; + + spin_lock(&fc_stats->addlist_lock); + list_add(&counter->list, &fc_stats->addlist); + spin_unlock(&fc_stats->addlist_lock); + + mod_delayed_work(fc_stats->wq, &fc_stats->work, 0); + } + + return counter; + +err_out: + kfree(counter); + + return ERR_PTR(err); +} + +void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + if (!counter) + return; + + if (counter->aging) { + counter->deleted = true; + mod_delayed_work(fc_stats->wq, &fc_stats->work, 0); + return; + } + + mlx5_cmd_fc_free(dev, counter->id); + kfree(counter); +} + +int mlx5_init_fc_stats(struct mlx5_core_dev *dev) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + fc_stats->counters = RB_ROOT; + INIT_LIST_HEAD(&fc_stats->addlist); + spin_lock_init(&fc_stats->addlist_lock); + + fc_stats->wq = 
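/*
 * A minimal lifecycle sketch for mlx5_fc_create()/mlx5_fc_destroy() above
 * and mlx5_fc_query_cached() (declared in <linux/mlx5/fs.h>): create an
 * aging counter, attach it to a drop rule as a COUNT destination, read the
 * cached statistics, then tear everything down.  "ft" and "spec" are
 * placeholders owned by the caller.
 */
static void example_drop_counter(struct mlx5_core_dev *dev,
				 struct mlx5_flow_table *ft,
				 struct mlx5_flow_spec *spec)
{
	struct mlx5_flow_destination dest = {};
	struct mlx5_flow_act flow_act = {};
	struct mlx5_flow_handle *handle;
	u64 bytes, packets, lastuse;
	struct mlx5_fc *counter;

	counter = mlx5_fc_create(dev, true);	/* aging: sampled by mlx5_fc_stats_work() */
	if (IS_ERR(counter))
		return;

	/* counter_is_valid() wants COUNT paired with DROP or FWD_DEST */
	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP |
			  MLX5_FLOW_CONTEXT_ACTION_COUNT;
	dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
	dest.counter = counter;

	handle = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
	if (!IS_ERR(handle)) {
		mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
		mlx5_del_flow_rules(handle);
	}
	mlx5_fc_destroy(dev, counter);
}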
create_singlethread_workqueue("mlx5_fc"); + if (!fc_stats->wq) + return -ENOMEM; + + fc_stats->sampling_interval = MLX5_FC_STATS_PERIOD; + INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work); + + return 0; +} + +void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc *counter; + struct mlx5_fc *tmp; + struct rb_node *node; + + cancel_delayed_work_sync(&dev->priv.fc_stats.work); + destroy_workqueue(dev->priv.fc_stats.wq); + dev->priv.fc_stats.wq = NULL; + + list_for_each_entry_safe(counter, tmp, &fc_stats->addlist, list) { + list_del(&counter->list); + + mlx5_cmd_fc_free(dev, counter->id); + + kfree(counter); + } + + node = rb_first(&fc_stats->counters); + while (node) { + counter = rb_entry(node, struct mlx5_fc, node); + + node = rb_next(node); + + rb_erase(&counter->node, &fc_stats->counters); + + mlx5_cmd_fc_free(dev, counter->id); + + kfree(counter); + } +} + +int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id, + u64 *packets, u64 *bytes) +{ + return mlx5_cmd_fc_query(dev, id, packets, bytes); +} + +void mlx5_fc_query_cached(struct mlx5_fc *counter, + u64 *bytes, u64 *packets, u64 *lastuse) +{ + struct mlx5_fc_cache c; + + c = counter->cache; + + *bytes = c.bytes - counter->lastbytes; + *packets = c.packets - counter->lastpackets; + *lastuse = c.lastuse; + + counter->lastbytes = c.bytes; + counter->lastpackets = c.packets; +} + +void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, + struct delayed_work *dwork, + unsigned long delay) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + queue_delayed_work(fc_stats->wq, dwork, delay); +} + +void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, + unsigned long interval) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + fc_stats->sampling_interval = min_t(unsigned long, interval, + fc_stats->sampling_interval); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c new file mode 100644 index 0000000..afd9f4f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -0,0 +1,526 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx5_core.h" +#include "../../mlxfw/mlxfw.h" + +static int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev, u32 *out, + int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_adapter_in)] = {0}; + + MLX5_SET(query_adapter_in, in, opcode, MLX5_CMD_OP_QUERY_ADAPTER); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} + +int mlx5_query_board_id(struct mlx5_core_dev *dev) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_adapter_out); + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_cmd_query_adapter(dev, out, outlen); + if (err) + goto out; + + memcpy(dev->board_id, + MLX5_ADDR_OF(query_adapter_out, out, + query_adapter_struct.vsd_contd_psid), + MLX5_FLD_SZ_BYTES(query_adapter_out, + query_adapter_struct.vsd_contd_psid)); + +out: + kfree(out); + return err; +} + +int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_adapter_out); + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_cmd_query_adapter(mdev, out, outlen); + if (err) + goto out; + + *vendor_id = MLX5_GET(query_adapter_out, out, + query_adapter_struct.ieee_vendor_id); +out: + kfree(out); + return err; +} +EXPORT_SYMBOL(mlx5_core_query_vendor_id); + +static int mlx5_get_pcam_reg(struct mlx5_core_dev *dev) +{ + return mlx5_query_pcam_reg(dev, dev->caps.pcam, + MLX5_PCAM_FEATURE_ENHANCED_FEATURES, + MLX5_PCAM_REGS_5000_TO_507F); +} + +static int mlx5_get_mcam_reg(struct mlx5_core_dev *dev) +{ + return mlx5_query_mcam_reg(dev, dev->caps.mcam, + MLX5_MCAM_FEATURE_ENHANCED_FEATURES, + MLX5_MCAM_REGS_FIRST_128); +} + +static int mlx5_get_qcam_reg(struct mlx5_core_dev *dev) +{ + return mlx5_query_qcam_reg(dev, dev->caps.qcam, + MLX5_QCAM_FEATURE_ENHANCED_FEATURES, + MLX5_QCAM_REGS_FIRST_128); +} + +int mlx5_query_hca_caps(struct mlx5_core_dev *dev) +{ + int err; + + err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL); + if (err) + return err; + + if (MLX5_CAP_GEN(dev, eth_net_offloads)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ETHERNET_OFFLOADS); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, ipoib_enhanced_offloads)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_IPOIB_ENHANCED_OFFLOADS); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, pg)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ODP); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, atomic)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, roce)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, nic_flow_table) || + MLX5_CAP_GEN(dev, ipoib_enhanced_offloads)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_FLOW_TABLE); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, vport_group_manager) && + MLX5_CAP_GEN(dev, eswitch_flow_table)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH_FLOW_TABLE); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, eswitch_flow_table)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, vector_calc)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_VECTOR_CALC); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, qos)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_QOS); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, debug)) + mlx5_core_get_caps(dev, MLX5_CAP_DEBUG); + + if (MLX5_CAP_GEN(dev, pcam_reg)) + mlx5_get_pcam_reg(dev); + + if (MLX5_CAP_GEN(dev, 
mcam_reg)) + mlx5_get_mcam_reg(dev); + + if (MLX5_CAP_GEN(dev, qcam_reg)) + mlx5_get_qcam_reg(dev); + + if (MLX5_CAP_GEN(dev, device_memory)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_DEV_MEM); + if (err) + return err; + } + + return 0; +} + +int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id) +{ + u32 out[MLX5_ST_SZ_DW(init_hca_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(init_hca_in)] = {0}; + int i; + + MLX5_SET(init_hca_in, in, opcode, MLX5_CMD_OP_INIT_HCA); + + if (MLX5_CAP_GEN(dev, sw_owner_id)) { + for (i = 0; i < 4; i++) + MLX5_ARRAY_SET(init_hca_in, in, sw_owner_id, i, + sw_owner_id[i]); + } + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0}; + + MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0}; + int force_state; + int ret; + + if (!MLX5_CAP_GEN(dev, force_teardown)) { + mlx5_core_dbg(dev, "force teardown is not supported in the firmware\n"); + return -EOPNOTSUPP; + } + + MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA); + MLX5_SET(teardown_hca_in, in, profile, MLX5_TEARDOWN_HCA_IN_PROFILE_FORCE_CLOSE); + + ret = mlx5_cmd_exec_polling(dev, in, sizeof(in), out, sizeof(out)); + if (ret) + return ret; + + force_state = MLX5_GET(teardown_hca_out, out, force_state); + if (force_state == MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL) { + mlx5_core_warn(dev, "teardown with force mode failed, doing normal teardown\n"); + return -EIO; + } + + return 0; +} + +enum mlxsw_reg_mcc_instruction { + MLX5_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE = 0x01, + MLX5_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE = 0x02, + MLX5_REG_MCC_INSTRUCTION_UPDATE_COMPONENT = 0x03, + MLX5_REG_MCC_INSTRUCTION_VERIFY_COMPONENT = 0x04, + MLX5_REG_MCC_INSTRUCTION_ACTIVATE = 0x06, + MLX5_REG_MCC_INSTRUCTION_CANCEL = 0x08, +}; + +static int mlx5_reg_mcc_set(struct mlx5_core_dev *dev, + enum mlxsw_reg_mcc_instruction instr, + u16 component_index, u32 update_handle, + u32 component_size) +{ + u32 out[MLX5_ST_SZ_DW(mcc_reg)]; + u32 in[MLX5_ST_SZ_DW(mcc_reg)]; + + memset(in, 0, sizeof(in)); + + MLX5_SET(mcc_reg, in, instruction, instr); + MLX5_SET(mcc_reg, in, component_index, component_index); + MLX5_SET(mcc_reg, in, update_handle, update_handle); + MLX5_SET(mcc_reg, in, component_size, component_size); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MCC, 0, 1); +} + +static int mlx5_reg_mcc_query(struct mlx5_core_dev *dev, + u32 *update_handle, u8 *error_code, + u8 *control_state) +{ + u32 out[MLX5_ST_SZ_DW(mcc_reg)]; + u32 in[MLX5_ST_SZ_DW(mcc_reg)]; + int err; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + MLX5_SET(mcc_reg, in, update_handle, *update_handle); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MCC, 0, 0); + if (err) + goto out; + + *update_handle = MLX5_GET(mcc_reg, out, update_handle); + *error_code = MLX5_GET(mcc_reg, out, error_code); + *control_state = MLX5_GET(mcc_reg, out, control_state); + +out: + return err; +} + +static int mlx5_reg_mcda_set(struct mlx5_core_dev *dev, + u32 update_handle, + u32 offset, u16 size, + u8 *data) +{ + int err, in_size = MLX5_ST_SZ_BYTES(mcda_reg) + size; + u32 
out[MLX5_ST_SZ_DW(mcda_reg)]; + int i, j, dw_size = size >> 2; + __be32 data_element; + u32 *in; + + in = kzalloc(in_size, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(mcda_reg, in, update_handle, update_handle); + MLX5_SET(mcda_reg, in, offset, offset); + MLX5_SET(mcda_reg, in, size, size); + + for (i = 0; i < dw_size; i++) { + j = i * 4; + data_element = htonl(*(u32 *)&data[j]); + memcpy(MLX5_ADDR_OF(mcda_reg, in, data) + j, &data_element, 4); + } + + err = mlx5_core_access_reg(dev, in, in_size, out, + sizeof(out), MLX5_REG_MCDA, 0, 1); + kfree(in); + return err; +} + +static int mlx5_reg_mcqi_query(struct mlx5_core_dev *dev, + u16 component_index, + u32 *max_component_size, + u8 *log_mcda_word_size, + u16 *mcda_max_write_size) +{ + u32 out[MLX5_ST_SZ_DW(mcqi_reg) + MLX5_ST_SZ_DW(mcqi_cap)]; + int offset = MLX5_ST_SZ_DW(mcqi_reg); + u32 in[MLX5_ST_SZ_DW(mcqi_reg)]; + int err; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(mcqi_reg, in, component_index, component_index); + MLX5_SET(mcqi_reg, in, data_size, MLX5_ST_SZ_BYTES(mcqi_cap)); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MCQI, 0, 0); + if (err) + goto out; + + *max_component_size = MLX5_GET(mcqi_cap, out + offset, max_component_size); + *log_mcda_word_size = MLX5_GET(mcqi_cap, out + offset, log_mcda_word_size); + *mcda_max_write_size = MLX5_GET(mcqi_cap, out + offset, mcda_max_write_size); + +out: + return err; +} + +struct mlx5_mlxfw_dev { + struct mlxfw_dev mlxfw_dev; + struct mlx5_core_dev *mlx5_core_dev; +}; + +static int mlx5_component_query(struct mlxfw_dev *mlxfw_dev, + u16 component_index, u32 *p_max_size, + u8 *p_align_bits, u16 *p_max_write_size) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + return mlx5_reg_mcqi_query(dev, component_index, p_max_size, + p_align_bits, p_max_write_size); +} + +static int mlx5_fsm_lock(struct mlxfw_dev *mlxfw_dev, u32 *fwhandle) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + u8 control_state, error_code; + int err; + + *fwhandle = 0; + err = mlx5_reg_mcc_query(dev, fwhandle, &error_code, &control_state); + if (err) + return err; + + if (control_state != MLXFW_FSM_STATE_IDLE) + return -EBUSY; + + return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE, + 0, *fwhandle, 0); +} + +static int mlx5_fsm_component_update(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u16 component_index, u32 component_size) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_UPDATE_COMPONENT, + component_index, fwhandle, component_size); +} + +static int mlx5_fsm_block_download(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u8 *data, u16 size, u32 offset) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + return mlx5_reg_mcda_set(dev, fwhandle, offset, size, data); +} + +static int mlx5_fsm_component_verify(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u16 component_index) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev 
*dev = mlx5_mlxfw_dev->mlx5_core_dev; + + return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_VERIFY_COMPONENT, + component_index, fwhandle, 0); +} + +static int mlx5_fsm_activate(struct mlxfw_dev *mlxfw_dev, u32 fwhandle) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_ACTIVATE, 0, + fwhandle, 0); +} + +static int mlx5_fsm_query_state(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + enum mlxfw_fsm_state *fsm_state, + enum mlxfw_fsm_state_err *fsm_state_err) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + u8 control_state, error_code; + int err; + + err = mlx5_reg_mcc_query(dev, &fwhandle, &error_code, &control_state); + if (err) + return err; + + *fsm_state = control_state; + *fsm_state_err = min_t(enum mlxfw_fsm_state_err, error_code, + MLXFW_FSM_STATE_ERR_MAX); + return 0; +} + +static void mlx5_fsm_cancel(struct mlxfw_dev *mlxfw_dev, u32 fwhandle) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_CANCEL, 0, fwhandle, 0); +} + +static void mlx5_fsm_release(struct mlxfw_dev *mlxfw_dev, u32 fwhandle) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE, 0, + fwhandle, 0); +} + +static const struct mlxfw_dev_ops mlx5_mlxfw_dev_ops = { + .component_query = mlx5_component_query, + .fsm_lock = mlx5_fsm_lock, + .fsm_component_update = mlx5_fsm_component_update, + .fsm_block_download = mlx5_fsm_block_download, + .fsm_component_verify = mlx5_fsm_component_verify, + .fsm_activate = mlx5_fsm_activate, + .fsm_query_state = mlx5_fsm_query_state, + .fsm_cancel = mlx5_fsm_cancel, + .fsm_release = mlx5_fsm_release +}; + +int mlx5_firmware_flash(struct mlx5_core_dev *dev, + const struct firmware *firmware) +{ + struct mlx5_mlxfw_dev mlx5_mlxfw_dev = { + .mlxfw_dev = { + .ops = &mlx5_mlxfw_dev_ops, + .psid = dev->board_id, + .psid_size = strlen(dev->board_id), + }, + .mlx5_core_dev = dev + }; + + if (!MLX5_CAP_GEN(dev, mcam_reg) || + !MLX5_CAP_MCAM_REG(dev, mcqi) || + !MLX5_CAP_MCAM_REG(dev, mcc) || + !MLX5_CAP_MCAM_REG(dev, mcda)) { + pr_info("%s flashing isn't supported by the running FW\n", __func__); + return -EOPNOTSUPP; + } + + return mlxfw_firmware_flash(&mlx5_mlxfw_dev.mlxfw_dev, firmware); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c new file mode 100644 index 0000000..d39b0b7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +enum { + MLX5_HEALTH_POLL_INTERVAL = 2 * HZ, + MAX_MISSES = 3, +}; + +enum { + MLX5_HEALTH_SYNDR_FW_ERR = 0x1, + MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7, + MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR = 0x8, + MLX5_HEALTH_SYNDR_CRC_ERR = 0x9, + MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa, + MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb, + MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc, + MLX5_HEALTH_SYNDR_EQ_ERR = 0xd, + MLX5_HEALTH_SYNDR_EQ_INV = 0xe, + MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf, + MLX5_HEALTH_SYNDR_HIGH_TEMP = 0x10 +}; + +enum { + MLX5_NIC_IFC_FULL = 0, + MLX5_NIC_IFC_DISABLED = 1, + MLX5_NIC_IFC_NO_DRAM_NIC = 2, + MLX5_NIC_IFC_INVALID = 3 +}; + +enum { + MLX5_DROP_NEW_HEALTH_WORK, + MLX5_DROP_NEW_RECOVERY_WORK, +}; + +static u8 get_nic_state(struct mlx5_core_dev *dev) +{ + return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; +} + +static void trigger_cmd_completions(struct mlx5_core_dev *dev) +{ + unsigned long flags; + u64 vector; + + /* wait for pending handlers to complete */ + synchronize_irq(pci_irq_vector(dev->pdev, MLX5_EQ_VEC_CMD)); + spin_lock_irqsave(&dev->cmd.alloc_lock, flags); + vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); + if (!vector) + goto no_trig; + + vector |= MLX5_TRIGGERED_CMD_COMP; + spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); + + mlx5_core_dbg(dev, "vector 0x%llx\n", vector); + mlx5_cmd_comp_handler(dev, vector, true); + return; + +no_trig: + spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); +} + +static int in_fatal(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + + if (get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) + return 1; + + if (ioread32be(&h->fw_ver) == 0xffffffff) + return 1; + + return 0; +} + +void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) +{ + mutex_lock(&dev->intf_state_mutex); + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto unlock; + + mlx5_core_err(dev, "start\n"); + if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) { + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + trigger_cmd_completions(dev); + } + + mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1); + mlx5_core_err(dev, 
"end\n"); + +unlock: + mutex_unlock(&dev->intf_state_mutex); +} + +static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) +{ + u8 nic_interface = get_nic_state(dev); + + switch (nic_interface) { + case MLX5_NIC_IFC_FULL: + mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n"); + break; + + case MLX5_NIC_IFC_DISABLED: + mlx5_core_warn(dev, "starting teardown\n"); + break; + + case MLX5_NIC_IFC_NO_DRAM_NIC: + mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n"); + break; + default: + mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n", + nic_interface); + } + + mlx5_disable_device(dev); +} + +static void health_recover(struct work_struct *work) +{ + struct mlx5_core_health *health; + struct delayed_work *dwork; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + u8 nic_state; + + dwork = container_of(work, struct delayed_work, work); + health = container_of(dwork, struct mlx5_core_health, recover_work); + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); + + nic_state = get_nic_state(dev); + if (nic_state == MLX5_NIC_IFC_INVALID) { + dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n"); + return; + } + + dev_err(&dev->pdev->dev, "starting health recovery flow\n"); + mlx5_recover_device(dev); +} + +/* How much time to wait until health resetting the driver (in msecs) */ +#define MLX5_RECOVERY_DELAY_MSECS 60000 +static void health_care(struct work_struct *work) +{ + unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS); + struct mlx5_core_health *health; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + unsigned long flags; + + health = container_of(work, struct mlx5_core_health, work); + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); + mlx5_core_warn(dev, "handling bad device here\n"); + mlx5_handle_bad_state(dev); + + spin_lock_irqsave(&health->wq_lock, flags); + if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) + schedule_delayed_work(&health->recover_work, recover_delay); + else + dev_err(&dev->pdev->dev, + "new health works are not permitted at this stage\n"); + spin_unlock_irqrestore(&health->wq_lock, flags); +} + +static const char *hsynd_str(u8 synd) +{ + switch (synd) { + case MLX5_HEALTH_SYNDR_FW_ERR: + return "firmware internal error"; + case MLX5_HEALTH_SYNDR_IRISC_ERR: + return "irisc not responding"; + case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR: + return "unrecoverable hardware error"; + case MLX5_HEALTH_SYNDR_CRC_ERR: + return "firmware CRC error"; + case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: + return "ICM fetch PCI error"; + case MLX5_HEALTH_SYNDR_HW_FTL_ERR: + return "HW fatal error\n"; + case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: + return "async EQ buffer overrun"; + case MLX5_HEALTH_SYNDR_EQ_ERR: + return "EQ error"; + case MLX5_HEALTH_SYNDR_EQ_INV: + return "Invalid EQ referenced"; + case MLX5_HEALTH_SYNDR_FFSER_ERR: + return "FFSER error"; + case MLX5_HEALTH_SYNDR_HIGH_TEMP: + return "High temperature"; + default: + return "unrecognized error"; + } +} + +static void print_health_info(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + char fw_str[18]; + u32 fw; + int i; + + /* If the syndrome is 0, the device is OK and no need to print buffer */ + if (!ioread8(&h->synd)) + return; + + for (i = 0; i < 
ARRAY_SIZE(h->assert_var); i++) + dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i)); + + dev_err(&dev->pdev->dev, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr)); + dev_err(&dev->pdev->dev, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); + sprintf(fw_str, "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); + dev_err(&dev->pdev->dev, "fw_ver %s\n", fw_str); + dev_err(&dev->pdev->dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id)); + dev_err(&dev->pdev->dev, "irisc_index %d\n", ioread8(&h->irisc_index)); + dev_err(&dev->pdev->dev, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd))); + dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); + fw = ioread32be(&h->fw_ver); + dev_err(&dev->pdev->dev, "raw fw_ver 0x%08x\n", fw); +} + +static unsigned long get_next_poll_jiffies(void) +{ + unsigned long next; + + get_random_bytes(&next, sizeof(next)); + next %= HZ; + next += jiffies + MLX5_HEALTH_POLL_INTERVAL; + + return next; +} + +void mlx5_trigger_health_work(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + unsigned long flags; + + spin_lock_irqsave(&health->wq_lock, flags); + if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) + queue_work(health->wq, &health->work); + else + dev_err(&dev->pdev->dev, + "new health works are not permitted at this stage\n"); + spin_unlock_irqrestore(&health->wq_lock, flags); +} + +static void poll_health(struct timer_list *t) +{ + struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); + struct mlx5_core_health *health = &dev->priv.health; + u32 count; + + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto out; + + count = ioread32be(health->health_counter); + if (count == health->prev) + ++health->miss_counter; + else + health->miss_counter = 0; + + health->prev = count; + if (health->miss_counter == MAX_MISSES) { + dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n"); + print_health_info(dev); + } + + if (in_fatal(dev) && !health->sick) { + health->sick = true; + print_health_info(dev); + mlx5_trigger_health_work(dev); + } + +out: + mod_timer(&health->timer, get_next_poll_jiffies()); +} + +void mlx5_start_health_poll(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + timer_setup(&health->timer, poll_health, 0); + health->sick = 0; + clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); + health->health = &dev->iseg->health; + health->health_counter = &dev->iseg->health_counter; + + health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL); + add_timer(&health->timer); +} + +void mlx5_stop_health_poll(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + del_timer_sync(&health->timer); +} + +void mlx5_drain_health_wq(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + unsigned long flags; + + spin_lock_irqsave(&health->wq_lock, flags); + set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); + spin_unlock_irqrestore(&health->wq_lock, flags); + cancel_delayed_work_sync(&health->recover_work); + cancel_work_sync(&health->work); +} + +void mlx5_drain_health_recovery(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + unsigned long flags; + + spin_lock_irqsave(&health->wq_lock, flags); + 
set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); + spin_unlock_irqrestore(&health->wq_lock, flags); + cancel_delayed_work_sync(&dev->priv.health.recover_work); +} + +void mlx5_health_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + destroy_workqueue(health->wq); +} + +int mlx5_health_init(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health; + char *name; + + health = &dev->priv.health; + name = kmalloc(64, GFP_KERNEL); + if (!name) + return -ENOMEM; + + strcpy(name, "mlx5_health"); + strcat(name, dev_name(&dev->pdev->dev)); + health->wq = create_singlethread_workqueue(name); + kfree(name); + if (!health->wq) + return -ENOMEM; + spin_lock_init(&health->wq_lock); + INIT_WORK(&health->work, health_care); + INIT_DELAYED_WORK(&health->recover_work, health_recover); + + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile new file mode 100644 index 0000000..d8e1711 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile @@ -0,0 +1 @@ +subdir-ccflags-y += -I$(src)/.. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c new file mode 100644 index 0000000..90cb50f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "en.h" +#include "ipoib.h" + +static void mlx5i_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + mlx5e_ethtool_get_drvinfo(priv, drvinfo); + strlcpy(drvinfo->driver, DRIVER_NAME "[ib_ipoib]", + sizeof(drvinfo->driver)); +} + +static void mlx5i_get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + mlx5e_ethtool_get_strings(priv, stringset, data); +} + +static int mlx5i_get_sset_count(struct net_device *dev, int sset) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + return mlx5e_ethtool_get_sset_count(priv, sset); +} + +static void mlx5i_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, + u64 *data) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + mlx5e_ethtool_get_ethtool_stats(priv, stats, data); +} + +static int mlx5i_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *param) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + return mlx5e_ethtool_set_ringparam(priv, param); +} + +static void mlx5i_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *param) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + mlx5e_ethtool_get_ringparam(priv, param); +} + +static int mlx5i_set_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + return mlx5e_ethtool_set_channels(priv, ch); +} + +static void mlx5i_get_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + mlx5e_ethtool_get_channels(priv, ch); +} + +static int mlx5i_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *coal) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + return mlx5e_ethtool_set_coalesce(priv, coal); +} + +static int mlx5i_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *coal) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + return mlx5e_ethtool_get_coalesce(priv, coal); +} + +static int mlx5i_get_ts_info(struct net_device *netdev, + struct ethtool_ts_info *info) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + return mlx5e_ethtool_get_ts_info(priv, info); +} + +static int mlx5i_flash_device(struct net_device *netdev, + struct ethtool_flash *flash) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + return mlx5e_ethtool_flash_device(priv, flash); +} + +enum mlx5_ptys_width { + MLX5_PTYS_WIDTH_1X = 1 << 0, + MLX5_PTYS_WIDTH_2X = 1 << 1, + MLX5_PTYS_WIDTH_4X = 1 << 2, + MLX5_PTYS_WIDTH_8X = 1 << 3, + MLX5_PTYS_WIDTH_12X = 1 << 4, +}; + +static inline int mlx5_ptys_width_enum_to_int(enum mlx5_ptys_width width) +{ + switch (width) { + case MLX5_PTYS_WIDTH_1X: return 1; + case MLX5_PTYS_WIDTH_2X: return 2; + case MLX5_PTYS_WIDTH_4X: return 4; + case MLX5_PTYS_WIDTH_8X: return 8; + case MLX5_PTYS_WIDTH_12X: return 12; + default: return -1; + } +} + +enum mlx5_ptys_rate { + MLX5_PTYS_RATE_SDR = 1 << 0, + MLX5_PTYS_RATE_DDR = 1 << 1, + MLX5_PTYS_RATE_QDR = 1 << 2, + MLX5_PTYS_RATE_FDR10 = 1 << 3, + MLX5_PTYS_RATE_FDR = 1 << 4, + MLX5_PTYS_RATE_EDR = 1 << 5, + MLX5_PTYS_RATE_HDR = 1 << 6, +}; + +static inline int mlx5_ptys_rate_enum_to_int(enum mlx5_ptys_rate rate) +{ + switch (rate) { + case MLX5_PTYS_RATE_SDR: return 2500; + case MLX5_PTYS_RATE_DDR: return 5000; + case MLX5_PTYS_RATE_QDR: + case MLX5_PTYS_RATE_FDR10: return 10000; + case MLX5_PTYS_RATE_FDR: return 14000; + case MLX5_PTYS_RATE_EDR: return 25000; + case MLX5_PTYS_RATE_HDR: 
return 50000; + default: return -1; + } +} + +static int mlx5i_get_port_settings(struct net_device *netdev, + u16 *ib_link_width_oper, u16 *ib_proto_oper) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0}; + int ret; + + ret = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_IB, 1); + if (ret) + return ret; + + *ib_link_width_oper = MLX5_GET(ptys_reg, out, ib_link_width_oper); + *ib_proto_oper = MLX5_GET(ptys_reg, out, ib_proto_oper); + + return 0; +} + +static int mlx5i_get_speed_settings(u16 ib_link_width_oper, u16 ib_proto_oper) +{ + int rate, width; + + rate = mlx5_ptys_rate_enum_to_int(ib_proto_oper); + if (rate < 0) + return -EINVAL; + width = mlx5_ptys_width_enum_to_int(ib_link_width_oper); + if (width < 0) + return -EINVAL; + + return rate * width; +} + +static int mlx5i_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *link_ksettings) +{ + u16 ib_link_width_oper; + u16 ib_proto_oper; + int speed, ret; + + ret = mlx5i_get_port_settings(netdev, &ib_link_width_oper, &ib_proto_oper); + if (ret) + return ret; + + ethtool_link_ksettings_zero_link_mode(link_ksettings, supported); + ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising); + + speed = mlx5i_get_speed_settings(ib_link_width_oper, ib_proto_oper); + if (speed < 0) + return -EINVAL; + + link_ksettings->base.duplex = DUPLEX_FULL; + link_ksettings->base.port = PORT_OTHER; + + link_ksettings->base.autoneg = AUTONEG_DISABLE; + + link_ksettings->base.speed = speed; + + return 0; +} + +const struct ethtool_ops mlx5i_ethtool_ops = { + .get_drvinfo = mlx5i_get_drvinfo, + .get_strings = mlx5i_get_strings, + .get_sset_count = mlx5i_get_sset_count, + .get_ethtool_stats = mlx5i_get_ethtool_stats, + .get_ringparam = mlx5i_get_ringparam, + .set_ringparam = mlx5i_set_ringparam, + .flash_device = mlx5i_flash_device, + .get_channels = mlx5i_get_channels, + .set_channels = mlx5i_set_channels, + .get_coalesce = mlx5i_get_coalesce, + .set_coalesce = mlx5i_set_coalesce, + .get_ts_info = mlx5i_get_ts_info, + .get_link_ksettings = mlx5i_get_link_ksettings, + .get_link = ethtool_op_get_link, +}; + +const struct ethtool_ops mlx5i_pkey_ethtool_ops = { + .get_drvinfo = mlx5i_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_ts_info = mlx5i_get_ts_info, +}; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c new file mode 100644 index 0000000..af3bb2f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "en.h" +#include "ipoib.h" + +#define IB_DEFAULT_Q_KEY 0xb1b +#define MLX5I_PARAMS_DEFAULT_LOG_RQ_SIZE 9 + +static int mlx5i_open(struct net_device *netdev); +static int mlx5i_close(struct net_device *netdev); +static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu); + +static const struct net_device_ops mlx5i_netdev_ops = { + .ndo_open = mlx5i_open, + .ndo_stop = mlx5i_close, + .ndo_init = mlx5i_dev_init, + .ndo_uninit = mlx5i_dev_cleanup, + .ndo_change_mtu = mlx5i_change_mtu, + .ndo_do_ioctl = mlx5i_ioctl, +}; + +/* IPoIB mlx5 netdev profile */ +static void mlx5i_build_nic_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + /* Override RQ params as IPoIB supports only LINKED LIST RQ for now */ + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, false); + mlx5e_set_rq_type(mdev, params); + mlx5e_init_rq_type_params(mdev, params); + + /* RQ size in ipoib by default is 512 */ + params->log_rq_mtu_frames = is_kdump_kernel() ? + MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE : + MLX5I_PARAMS_DEFAULT_LOG_RQ_SIZE; + + params->lro_en = false; + params->hard_mtu = MLX5_IB_GRH_BYTES + MLX5_IPOIB_HARD_LEN; +} + +/* Called directly after IPoIB netdevice was created to initialize SW structs */ +void mlx5i_init(struct mlx5_core_dev *mdev, + struct net_device *netdev, + const struct mlx5e_profile *profile, + void *ppriv) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + /* priv init */ + priv->mdev = mdev; + priv->netdev = netdev; + priv->profile = profile; + priv->ppriv = ppriv; + mutex_init(&priv->state_lock); + + mlx5e_build_nic_params(mdev, &priv->channels.params, + profile->max_nch(mdev), netdev->mtu); + mlx5i_build_nic_params(mdev, &priv->channels.params); + + mlx5e_timestamp_init(priv); + + /* netdev init */ + netdev->hw_features |= NETIF_F_SG; + netdev->hw_features |= NETIF_F_IP_CSUM; + netdev->hw_features |= NETIF_F_IPV6_CSUM; + netdev->hw_features |= NETIF_F_GRO; + netdev->hw_features |= NETIF_F_TSO; + netdev->hw_features |= NETIF_F_TSO6; + netdev->hw_features |= NETIF_F_RXCSUM; + netdev->hw_features |= NETIF_F_RXHASH; + + netdev->netdev_ops = &mlx5i_netdev_ops; + netdev->ethtool_ops = &mlx5i_ethtool_ops; +} + +/* Called directly before IPoIB netdevice is destroyed to cleanup SW structs */ +static void mlx5i_cleanup(struct mlx5e_priv *priv) +{ + /* Do nothing .. 
*/ +} + +int mlx5i_init_underlay_qp(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5i_priv *ipriv = priv->ppriv; + struct mlx5_core_qp *qp = &ipriv->qp; + struct mlx5_qp_context *context; + int ret; + + /* QP states */ + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + context->flags = cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + context->pri_path.port = 1; + context->pri_path.pkey_index = cpu_to_be16(ipriv->pkey_index); + context->qkey = cpu_to_be32(IB_DEFAULT_Q_KEY); + + ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RST2INIT_QP, 0, context, qp); + if (ret) { + mlx5_core_err(mdev, "Failed to modify qp RST2INIT, err: %d\n", ret); + goto err_qp_modify_to_err; + } + memset(context, 0, sizeof(*context)); + + ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_INIT2RTR_QP, 0, context, qp); + if (ret) { + mlx5_core_err(mdev, "Failed to modify qp INIT2RTR, err: %d\n", ret); + goto err_qp_modify_to_err; + } + + ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RTR2RTS_QP, 0, context, qp); + if (ret) { + mlx5_core_err(mdev, "Failed to modify qp RTR2RTS, err: %d\n", ret); + goto err_qp_modify_to_err; + } + + kfree(context); + return 0; + +err_qp_modify_to_err: + mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2ERR_QP, 0, &context, qp); + kfree(context); + return ret; +} + +void mlx5i_uninit_underlay_qp(struct mlx5e_priv *priv) +{ + struct mlx5i_priv *ipriv = priv->ppriv; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_qp_context context; + int err; + + err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2RST_QP, 0, &context, + &ipriv->qp); + if (err) + mlx5_core_err(mdev, "Failed to modify qp 2RST, err: %d\n", err); +} + +#define MLX5_QP_ENHANCED_ULP_STATELESS_MODE 2 + +int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp) +{ + u32 *in = NULL; + void *addr_path; + int ret = 0; + int inlen; + void *qpc; + + inlen = MLX5_ST_SZ_BYTES(create_qp_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, + MLX5_QP_ENHANCED_ULP_STATELESS_MODE); + + addr_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path); + MLX5_SET(ads, addr_path, vhca_port_num, 1); + MLX5_SET(ads, addr_path, grh, 1); + + ret = mlx5_core_create_qp(mdev, qp, in, inlen); + if (ret) { + mlx5_core_err(mdev, "Failed creating IPoIB QP err : %d\n", ret); + goto out; + } + +out: + kvfree(in); + return ret; +} + +void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp) +{ + mlx5_core_destroy_qp(mdev, qp); +} + +static int mlx5i_init_tx(struct mlx5e_priv *priv) +{ + struct mlx5i_priv *ipriv = priv->ppriv; + int err; + + err = mlx5i_create_underlay_qp(priv->mdev, &ipriv->qp); + if (err) { + mlx5_core_warn(priv->mdev, "create underlay QP failed, %d\n", err); + return err; + } + + err = mlx5e_create_tis(priv->mdev, 0 /* tc */, ipriv->qp.qpn, &priv->tisn[0]); + if (err) { + mlx5_core_warn(priv->mdev, "create tis failed, %d\n", err); + goto err_destroy_underlay_qp; + } + + return 0; + +err_destroy_underlay_qp: + mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp); + return err; +} + +static void mlx5i_cleanup_tx(struct mlx5e_priv *priv) +{ + struct mlx5i_priv *ipriv = priv->ppriv; + + mlx5e_destroy_tis(priv->mdev, priv->tisn[0]); + mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp); +} + +static int mlx5i_create_flow_steering(struct mlx5e_priv *priv) +{ + 
struct ttc_params ttc_params = {}; + int tt, err; + + priv->fs.ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_KERNEL); + + if (!priv->fs.ns) + return -EINVAL; + + err = mlx5e_arfs_create_tables(priv); + if (err) { + netdev_err(priv->netdev, "Failed to create arfs tables, err=%d\n", + err); + priv->netdev->hw_features &= ~NETIF_F_NTUPLE; + } + + mlx5e_set_ttc_basic_params(priv, &ttc_params); + mlx5e_set_inner_ttc_ft_params(&ttc_params); + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) + ttc_params.indir_tirn[tt] = priv->inner_indir_tir[tt].tirn; + + err = mlx5e_create_inner_ttc_table(priv, &ttc_params, &priv->fs.inner_ttc); + if (err) { + netdev_err(priv->netdev, "Failed to create inner ttc table, err=%d\n", + err); + goto err_destroy_arfs_tables; + } + + mlx5e_set_ttc_ft_params(&ttc_params); + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) + ttc_params.indir_tirn[tt] = priv->indir_tir[tt].tirn; + + err = mlx5e_create_ttc_table(priv, &ttc_params, &priv->fs.ttc); + if (err) { + netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n", + err); + goto err_destroy_inner_ttc_table; + } + + return 0; + +err_destroy_inner_ttc_table: + mlx5e_destroy_inner_ttc_table(priv, &priv->fs.inner_ttc); +err_destroy_arfs_tables: + mlx5e_arfs_destroy_tables(priv); + + return err; +} + +static void mlx5i_destroy_flow_steering(struct mlx5e_priv *priv) +{ + mlx5e_destroy_ttc_table(priv, &priv->fs.ttc); + mlx5e_destroy_inner_ttc_table(priv, &priv->fs.inner_ttc); + mlx5e_arfs_destroy_tables(priv); +} + +static int mlx5i_init_rx(struct mlx5e_priv *priv) +{ + int err; + + err = mlx5e_create_indirect_rqt(priv); + if (err) + return err; + + err = mlx5e_create_direct_rqts(priv); + if (err) + goto err_destroy_indirect_rqts; + + err = mlx5e_create_indirect_tirs(priv); + if (err) + goto err_destroy_direct_rqts; + + err = mlx5e_create_direct_tirs(priv); + if (err) + goto err_destroy_indirect_tirs; + + err = mlx5i_create_flow_steering(priv); + if (err) + goto err_destroy_direct_tirs; + + return 0; + +err_destroy_direct_tirs: + mlx5e_destroy_direct_tirs(priv); +err_destroy_indirect_tirs: + mlx5e_destroy_indirect_tirs(priv); +err_destroy_direct_rqts: + mlx5e_destroy_direct_rqts(priv); +err_destroy_indirect_rqts: + mlx5e_destroy_rqt(priv, &priv->indir_rqt); + return err; +} + +static void mlx5i_cleanup_rx(struct mlx5e_priv *priv) +{ + mlx5i_destroy_flow_steering(priv); + mlx5e_destroy_direct_tirs(priv); + mlx5e_destroy_indirect_tirs(priv); + mlx5e_destroy_direct_rqts(priv); + mlx5e_destroy_rqt(priv, &priv->indir_rqt); +} + +static const struct mlx5e_profile mlx5i_nic_profile = { + .init = mlx5i_init, + .cleanup = mlx5i_cleanup, + .init_tx = mlx5i_init_tx, + .cleanup_tx = mlx5i_cleanup_tx, + .init_rx = mlx5i_init_rx, + .cleanup_rx = mlx5i_cleanup_rx, + .enable = NULL, /* mlx5i_enable */ + .disable = NULL, /* mlx5i_disable */ + .update_stats = NULL, /* mlx5i_update_stats */ + .max_nch = mlx5e_get_max_num_channels, + .update_carrier = NULL, /* no HW update in IB link */ + .rx_handlers.handle_rx_cqe = mlx5i_handle_rx_cqe, + .rx_handlers.handle_rx_cqe_mpwqe = NULL, /* Not supported */ + .max_tc = MLX5I_MAX_NUM_TC, +}; + +/* mlx5i netdev NDos */ + +static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + struct mlx5e_channels new_channels = {}; + struct mlx5e_params *params; + int err = 0; + + mutex_lock(&priv->state_lock); + + params = &priv->channels.params; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + params->sw_mtu = new_mtu; + 
netdev->mtu = params->sw_mtu; + goto out; + } + + new_channels.params = *params; + new_channels.params.sw_mtu = new_mtu; + err = mlx5e_open_channels(priv, &new_channels); + if (err) + goto out; + + mlx5e_switch_priv_channels(priv, &new_channels, NULL); + netdev->mtu = new_channels.params.sw_mtu; + +out: + mutex_unlock(&priv->state_lock); + return err; +} + +int mlx5i_dev_init(struct net_device *dev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + struct mlx5i_priv *ipriv = priv->ppriv; + + /* Set dev address using underlay QP */ + dev->dev_addr[1] = (ipriv->qp.qpn >> 16) & 0xff; + dev->dev_addr[2] = (ipriv->qp.qpn >> 8) & 0xff; + dev->dev_addr[3] = (ipriv->qp.qpn) & 0xff; + + /* Add QPN to net-device mapping to HT */ + mlx5i_pkey_add_qpn(dev ,ipriv->qp.qpn); + + return 0; +} + +int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + switch (cmd) { + case SIOCSHWTSTAMP: + return mlx5e_hwstamp_set(priv, ifr); + case SIOCGHWTSTAMP: + return mlx5e_hwstamp_get(priv, ifr); + default: + return -EOPNOTSUPP; + } +} + +void mlx5i_dev_cleanup(struct net_device *dev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + struct mlx5i_priv *ipriv = priv->ppriv; + + mlx5i_uninit_underlay_qp(priv); + + /* Delete QPN to net-device mapping from HT */ + mlx5i_pkey_del_qpn(dev, ipriv->qp.qpn); +} + +static int mlx5i_open(struct net_device *netdev) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = epriv->ppriv; + struct mlx5_core_dev *mdev = epriv->mdev; + int err; + + mutex_lock(&epriv->state_lock); + + set_bit(MLX5E_STATE_OPENED, &epriv->state); + + err = mlx5i_init_underlay_qp(epriv); + if (err) { + mlx5_core_warn(mdev, "prepare underlay qp state failed, %d\n", err); + goto err_clear_state_opened_flag; + } + + err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qp.qpn); + if (err) { + mlx5_core_warn(mdev, "attach underlay qp to ft failed, %d\n", err); + goto err_reset_qp; + } + + err = mlx5e_open_channels(epriv, &epriv->channels); + if (err) + goto err_remove_fs_underlay_qp; + + mlx5e_refresh_tirs(epriv, false); + mlx5e_activate_priv_channels(epriv); + + mutex_unlock(&epriv->state_lock); + return 0; + +err_remove_fs_underlay_qp: + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn); +err_reset_qp: + mlx5i_uninit_underlay_qp(epriv); +err_clear_state_opened_flag: + clear_bit(MLX5E_STATE_OPENED, &epriv->state); + mutex_unlock(&epriv->state_lock); + return err; +} + +static int mlx5i_close(struct net_device *netdev) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = epriv->ppriv; + struct mlx5_core_dev *mdev = epriv->mdev; + + /* May already be CLOSED in case a previous configuration operation + * (e.g RX/TX queue size change) that involves close&open failed. 
+ */ + mutex_lock(&epriv->state_lock); + + if (!test_bit(MLX5E_STATE_OPENED, &epriv->state)) + goto unlock; + + clear_bit(MLX5E_STATE_OPENED, &epriv->state); + + netif_carrier_off(epriv->netdev); + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn); + mlx5i_uninit_underlay_qp(epriv); + mlx5e_deactivate_priv_channels(epriv); + mlx5e_close_channels(&epriv->channels); +unlock: + mutex_unlock(&epriv->state_lock); + return 0; +} + +/* IPoIB RDMA netdev callbacks */ +static int mlx5i_attach_mcast(struct net_device *netdev, struct ib_device *hca, + union ib_gid *gid, u16 lid, int set_qkey, + u32 qkey) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5_core_dev *mdev = epriv->mdev; + struct mlx5i_priv *ipriv = epriv->ppriv; + int err; + + mlx5_core_dbg(mdev, "attaching QPN 0x%x, MGID %pI6\n", ipriv->qp.qpn, gid->raw); + err = mlx5_core_attach_mcg(mdev, gid, ipriv->qp.qpn); + if (err) + mlx5_core_warn(mdev, "failed attaching QPN 0x%x, MGID %pI6\n", + ipriv->qp.qpn, gid->raw); + + if (set_qkey) { + mlx5_core_dbg(mdev, "%s setting qkey 0x%x\n", + netdev->name, qkey); + ipriv->qkey = qkey; + } + + return err; +} + +static int mlx5i_detach_mcast(struct net_device *netdev, struct ib_device *hca, + union ib_gid *gid, u16 lid) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5_core_dev *mdev = epriv->mdev; + struct mlx5i_priv *ipriv = epriv->ppriv; + int err; + + mlx5_core_dbg(mdev, "detaching QPN 0x%x, MGID %pI6\n", ipriv->qp.qpn, gid->raw); + + err = mlx5_core_detach_mcg(mdev, gid, ipriv->qp.qpn); + if (err) + mlx5_core_dbg(mdev, "failed detaching QPN 0x%x, MGID %pI6\n", + ipriv->qp.qpn, gid->raw); + + return err; +} + +static int mlx5i_xmit(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(dev); + struct mlx5e_txqsq *sq = epriv->txq2sq[skb_get_queue_mapping(skb)]; + struct mlx5_ib_ah *mah = to_mah(address); + struct mlx5i_priv *ipriv = epriv->ppriv; + + return mlx5i_sq_xmit(sq, skb, &mah->av, dqpn, ipriv->qkey); +} + +static void mlx5i_set_pkey_index(struct net_device *netdev, int id) +{ + struct mlx5i_priv *ipriv = netdev_priv(netdev); + + ipriv->pkey_index = (u16)id; +} + +static int mlx5i_check_required_hca_cap(struct mlx5_core_dev *mdev) +{ + if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_IB) + return -EOPNOTSUPP; + + if (!MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) { + mlx5_core_warn(mdev, "IPoIB enhanced offloads are not supported\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, + struct ib_device *ibdev, + const char *name, + void (*setup)(struct net_device *)) +{ + const struct mlx5e_profile *profile; + struct net_device *netdev; + struct mlx5i_priv *ipriv; + struct mlx5e_priv *epriv; + struct rdma_netdev *rn; + bool sub_interface; + int nch; + int err; + + if (mlx5i_check_required_hca_cap(mdev)) { + mlx5_core_warn(mdev, "Accelerated mode is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + /* TODO: Need to find a better way to check if child device*/ + sub_interface = (mdev->mlx5e_res.pdn != 0); + + if (sub_interface) + profile = mlx5i_pkey_get_profile(); + else + profile = &mlx5i_nic_profile; + + nch = profile->max_nch(mdev); + + netdev = alloc_netdev_mqs(sizeof(struct mlx5i_priv) + sizeof(struct mlx5e_priv), + name, NET_NAME_UNKNOWN, + setup, + nch * MLX5E_MAX_NUM_TC, + nch); + if (!netdev) { + mlx5_core_warn(mdev, "alloc_netdev_mqs failed\n"); + return NULL; + } + + ipriv = netdev_priv(netdev); + 
epriv = mlx5i_epriv(netdev); + + epriv->wq = create_singlethread_workqueue("mlx5i"); + if (!epriv->wq) + goto err_free_netdev; + + ipriv->sub_interface = sub_interface; + if (!ipriv->sub_interface) { + err = mlx5i_pkey_qpn_ht_init(netdev); + if (err) { + mlx5_core_warn(mdev, "allocate qpn_to_netdev ht failed\n"); + goto destroy_wq; + } + + /* This should only be called once per mdev */ + err = mlx5e_create_mdev_resources(mdev); + if (err) + goto destroy_ht; + } + + profile->init(mdev, netdev, profile, ipriv); + + mlx5e_attach_netdev(epriv); + netif_carrier_off(netdev); + + /* set rdma_netdev func pointers */ + rn = &ipriv->rn; + rn->hca = ibdev; + rn->send = mlx5i_xmit; + rn->attach_mcast = mlx5i_attach_mcast; + rn->detach_mcast = mlx5i_detach_mcast; + rn->set_id = mlx5i_set_pkey_index; + + return netdev; + +destroy_ht: + mlx5i_pkey_qpn_ht_cleanup(netdev); +destroy_wq: + destroy_workqueue(epriv->wq); +err_free_netdev: + free_netdev(netdev); + + return NULL; +} +EXPORT_SYMBOL(mlx5_rdma_netdev_alloc); + +void mlx5_rdma_netdev_free(struct net_device *netdev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = priv->ppriv; + const struct mlx5e_profile *profile = priv->profile; + + mlx5e_detach_netdev(priv); + profile->cleanup(priv); + destroy_workqueue(priv->wq); + + if (!ipriv->sub_interface) { + mlx5i_pkey_qpn_ht_cleanup(netdev); + mlx5e_destroy_mdev_resources(priv->mdev); + } + free_netdev(netdev); +} +EXPORT_SYMBOL(mlx5_rdma_netdev_free); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h new file mode 100644 index 0000000..6d9053b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MLX5E_IPOB_H__ +#define __MLX5E_IPOB_H__ + +#include +#include "en.h" + +#define MLX5I_MAX_NUM_TC 1 + +extern const struct ethtool_ops mlx5i_ethtool_ops; +extern const struct ethtool_ops mlx5i_pkey_ethtool_ops; + +#define MLX5_IB_GRH_BYTES 40 +#define MLX5_IPOIB_ENCAP_LEN 4 +#define MLX5_IPOIB_PSEUDO_LEN 20 +#define MLX5_IPOIB_HARD_LEN (MLX5_IPOIB_PSEUDO_LEN + MLX5_IPOIB_ENCAP_LEN) + +/* ipoib rdma netdev's private data structure */ +struct mlx5i_priv { + struct rdma_netdev rn; /* keep this first */ + struct mlx5_core_qp qp; + bool sub_interface; + u32 qkey; + u16 pkey_index; + struct mlx5i_pkey_qpn_ht *qpn_htbl; + char *mlx5e_priv[0]; +}; + +/* Underlay QP create/destroy functions */ +int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp); +void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp); + +/* Underlay QP state modification init/uninit functions */ +int mlx5i_init_underlay_qp(struct mlx5e_priv *priv); +void mlx5i_uninit_underlay_qp(struct mlx5e_priv *priv); + +/* Allocate/Free underlay QPN to net-device hash table */ +int mlx5i_pkey_qpn_ht_init(struct net_device *netdev); +void mlx5i_pkey_qpn_ht_cleanup(struct net_device *netdev); + +/* Add/Remove an underlay QPN to net-device mapping to/from the hash table */ +int mlx5i_pkey_add_qpn(struct net_device *netdev, u32 qpn); +int mlx5i_pkey_del_qpn(struct net_device *netdev, u32 qpn); + +/* Get the net-device corresponding to the given underlay QPN */ +struct net_device *mlx5i_pkey_get_netdev(struct net_device *netdev, u32 qpn); + +/* Shared ndo functions */ +int mlx5i_dev_init(struct net_device *dev); +void mlx5i_dev_cleanup(struct net_device *dev); +int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); + +/* Parent profile functions */ +void mlx5i_init(struct mlx5_core_dev *mdev, + struct net_device *netdev, + const struct mlx5e_profile *profile, + void *ppriv); + +/* Get child interface nic profile */ +const struct mlx5e_profile *mlx5i_pkey_get_profile(void); + +/* Extract mlx5e_priv from IPoIB netdev */ +#define mlx5i_epriv(netdev) ((void *)(((struct mlx5i_priv *)netdev_priv(netdev))->mlx5e_priv)) + +netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_av *av, u32 dqpn, u32 dqkey); +void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); + +#endif /* __MLX5E_IPOB_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c new file mode 100644 index 0000000..54a188f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "ipoib.h" + +#define MLX5I_MAX_LOG_PKEY_SUP 7 + +struct qpn_to_netdev { + struct net_device *netdev; + struct hlist_node hlist; + u32 underlay_qpn; +}; + +struct mlx5i_pkey_qpn_ht { + struct hlist_head buckets[1 << MLX5I_MAX_LOG_PKEY_SUP]; + spinlock_t ht_lock; /* Synchronise with NAPI */ +}; + +int mlx5i_pkey_qpn_ht_init(struct net_device *netdev) +{ + struct mlx5i_priv *ipriv = netdev_priv(netdev); + struct mlx5i_pkey_qpn_ht *qpn_htbl; + + qpn_htbl = kzalloc(sizeof(*qpn_htbl), GFP_KERNEL); + if (!qpn_htbl) + return -ENOMEM; + + ipriv->qpn_htbl = qpn_htbl; + spin_lock_init(&qpn_htbl->ht_lock); + + return 0; +} + +void mlx5i_pkey_qpn_ht_cleanup(struct net_device *netdev) +{ + struct mlx5i_priv *ipriv = netdev_priv(netdev); + + kfree(ipriv->qpn_htbl); +} + +static struct qpn_to_netdev *mlx5i_find_qpn_to_netdev_node(struct hlist_head *buckets, + u32 qpn) +{ + struct hlist_head *h = &buckets[hash_32(qpn, MLX5I_MAX_LOG_PKEY_SUP)]; + struct qpn_to_netdev *node; + + hlist_for_each_entry(node, h, hlist) { + if (node->underlay_qpn == qpn) + return node; + } + + return NULL; +} + +int mlx5i_pkey_add_qpn(struct net_device *netdev, u32 qpn) +{ + struct mlx5i_priv *ipriv = netdev_priv(netdev); + struct mlx5i_pkey_qpn_ht *ht = ipriv->qpn_htbl; + u8 key = hash_32(qpn, MLX5I_MAX_LOG_PKEY_SUP); + struct qpn_to_netdev *new_node; + + new_node = kzalloc(sizeof(*new_node), GFP_KERNEL); + if (!new_node) + return -ENOMEM; + + new_node->netdev = netdev; + new_node->underlay_qpn = qpn; + spin_lock_bh(&ht->ht_lock); + hlist_add_head(&new_node->hlist, &ht->buckets[key]); + spin_unlock_bh(&ht->ht_lock); + + return 0; +} + +int mlx5i_pkey_del_qpn(struct net_device *netdev, u32 qpn) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = epriv->ppriv; + struct mlx5i_pkey_qpn_ht *ht = ipriv->qpn_htbl; + struct qpn_to_netdev *node; + + node = mlx5i_find_qpn_to_netdev_node(ht->buckets, qpn); + if (!node) { + mlx5_core_warn(epriv->mdev, "QPN to netdev delete from HT failed\n"); + return -EINVAL; + } + + spin_lock_bh(&ht->ht_lock); + hlist_del_init(&node->hlist); + spin_unlock_bh(&ht->ht_lock); + kfree(node); + + return 0; +} + +struct net_device *mlx5i_pkey_get_netdev(struct net_device *netdev, u32 qpn) +{ + struct mlx5i_priv *ipriv = netdev_priv(netdev); + struct qpn_to_netdev *node; + + node = mlx5i_find_qpn_to_netdev_node(ipriv->qpn_htbl->buckets, qpn); + if (!node) + return NULL; + + return node->netdev; +} + +static int mlx5i_pkey_open(struct net_device *netdev); +static int mlx5i_pkey_close(struct net_device *netdev); +static int mlx5i_pkey_dev_init(struct net_device *dev); +static void mlx5i_pkey_dev_cleanup(struct net_device *netdev); +static int mlx5i_pkey_change_mtu(struct net_device *netdev, int new_mtu); +static int mlx5i_pkey_ioctl(struct net_device 
*dev, struct ifreq *ifr, int cmd); + +static const struct net_device_ops mlx5i_pkey_netdev_ops = { + .ndo_open = mlx5i_pkey_open, + .ndo_stop = mlx5i_pkey_close, + .ndo_init = mlx5i_pkey_dev_init, + .ndo_uninit = mlx5i_pkey_dev_cleanup, + .ndo_change_mtu = mlx5i_pkey_change_mtu, + .ndo_do_ioctl = mlx5i_pkey_ioctl, +}; + +/* Child NDOs */ +static int mlx5i_pkey_dev_init(struct net_device *dev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + struct mlx5i_priv *ipriv, *parent_ipriv; + struct net_device *parent_dev; + int parent_ifindex; + + ipriv = priv->ppriv; + + /* Get QPN to netdevice hash table from parent */ + parent_ifindex = dev->netdev_ops->ndo_get_iflink(dev); + parent_dev = dev_get_by_index(dev_net(dev), parent_ifindex); + if (!parent_dev) { + mlx5_core_warn(priv->mdev, "failed to get parent device\n"); + return -EINVAL; + } + + parent_ipriv = netdev_priv(parent_dev); + ipriv->qpn_htbl = parent_ipriv->qpn_htbl; + dev_put(parent_dev); + + return mlx5i_dev_init(dev); +} + +static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + return mlx5i_ioctl(dev, ifr, cmd); +} + +static void mlx5i_pkey_dev_cleanup(struct net_device *netdev) +{ + return mlx5i_dev_cleanup(netdev); +} + +static int mlx5i_pkey_open(struct net_device *netdev) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = epriv->ppriv; + struct mlx5_core_dev *mdev = epriv->mdev; + int err; + + mutex_lock(&epriv->state_lock); + + set_bit(MLX5E_STATE_OPENED, &epriv->state); + + err = mlx5i_init_underlay_qp(epriv); + if (err) { + mlx5_core_warn(mdev, "prepare child underlay qp state failed, %d\n", err); + goto err_release_lock; + } + + err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qp.qpn); + if (err) { + mlx5_core_warn(mdev, "attach child underlay qp to ft failed, %d\n", err); + goto err_uninit_underlay_qp; + } + + err = mlx5e_create_tis(mdev, 0 /* tc */, ipriv->qp.qpn, &epriv->tisn[0]); + if (err) { + mlx5_core_warn(mdev, "create child tis failed, %d\n", err); + goto err_remove_rx_underlay_qp; + } + + err = mlx5e_open_channels(epriv, &epriv->channels); + if (err) { + mlx5_core_warn(mdev, "opening child channels failed, %d\n", err); + goto err_clear_state_opened_flag; + } + mlx5e_refresh_tirs(epriv, false); + mlx5e_activate_priv_channels(epriv); + mutex_unlock(&epriv->state_lock); + + return 0; + +err_clear_state_opened_flag: + mlx5e_destroy_tis(mdev, epriv->tisn[0]); +err_remove_rx_underlay_qp: + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn); +err_uninit_underlay_qp: + mlx5i_uninit_underlay_qp(epriv); +err_release_lock: + clear_bit(MLX5E_STATE_OPENED, &epriv->state); + mutex_unlock(&epriv->state_lock); + return err; +} + +static int mlx5i_pkey_close(struct net_device *netdev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = priv->ppriv; + struct mlx5_core_dev *mdev = priv->mdev; + + mutex_lock(&priv->state_lock); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto unlock; + + clear_bit(MLX5E_STATE_OPENED, &priv->state); + + netif_carrier_off(priv->netdev); + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn); + mlx5i_uninit_underlay_qp(priv); + mlx5e_deactivate_priv_channels(priv); + mlx5e_close_channels(&priv->channels); + mlx5e_destroy_tis(mdev, priv->tisn[0]); +unlock: + mutex_unlock(&priv->state_lock); + return 0; +} + +static int mlx5i_pkey_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + mutex_lock(&priv->state_lock); + netdev->mtu = new_mtu; + 
mutex_unlock(&priv->state_lock); + + return 0; +} + +/* Called directly after IPoIB netdevice was created to initialize SW structs */ +static void mlx5i_pkey_init(struct mlx5_core_dev *mdev, + struct net_device *netdev, + const struct mlx5e_profile *profile, + void *ppriv) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + mlx5i_init(mdev, netdev, profile, ppriv); + + /* Override parent ndo */ + netdev->netdev_ops = &mlx5i_pkey_netdev_ops; + + /* Set child limited ethtool support */ + netdev->ethtool_ops = &mlx5i_pkey_ethtool_ops; + + /* Use dummy rqs */ + priv->channels.params.log_rq_mtu_frames = MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE; +} + +/* Called directly before IPoIB netdevice is destroyed to cleanup SW structs */ +static void mlx5i_pkey_cleanup(struct mlx5e_priv *priv) +{ + /* Do nothing .. */ +} + +static int mlx5i_pkey_init_tx(struct mlx5e_priv *priv) +{ + struct mlx5i_priv *ipriv = priv->ppriv; + int err; + + err = mlx5i_create_underlay_qp(priv->mdev, &ipriv->qp); + if (err) { + mlx5_core_warn(priv->mdev, "create child underlay QP failed, %d\n", err); + return err; + } + + return 0; +} + +static void mlx5i_pkey_cleanup_tx(struct mlx5e_priv *priv) +{ + struct mlx5i_priv *ipriv = priv->ppriv; + + mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp); +} + +static int mlx5i_pkey_init_rx(struct mlx5e_priv *priv) +{ + /* Since the rx resources are shared between child and parent, the + * parent interface is taking care of rx resource allocation and init + */ + return 0; +} + +static void mlx5i_pkey_cleanup_rx(struct mlx5e_priv *priv) +{ + /* Since the rx resources are shared between child and parent, the + * parent interface is taking care of rx resource free and de-init + */ +} + +static const struct mlx5e_profile mlx5i_pkey_nic_profile = { + .init = mlx5i_pkey_init, + .cleanup = mlx5i_pkey_cleanup, + .init_tx = mlx5i_pkey_init_tx, + .cleanup_tx = mlx5i_pkey_cleanup_tx, + .init_rx = mlx5i_pkey_init_rx, + .cleanup_rx = mlx5i_pkey_cleanup_rx, + .enable = NULL, + .disable = NULL, + .update_stats = NULL, + .max_nch = mlx5e_get_max_num_channels, + .rx_handlers.handle_rx_cqe = mlx5i_handle_rx_cqe, + .rx_handlers.handle_rx_cqe_mpwqe = NULL, /* Not supported */ + .max_tc = MLX5I_MAX_NUM_TC, +}; + +const struct mlx5e_profile *mlx5i_pkey_get_profile(void) +{ + return &mlx5i_pkey_nic_profile; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c new file mode 100644 index 0000000..582b2f1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -0,0 +1,691 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" + +enum { + MLX5_LAG_FLAG_BONDED = 1 << 0, +}; + +struct lag_func { + struct mlx5_core_dev *dev; + struct net_device *netdev; +}; + +/* Used for collection of netdev event info. */ +struct lag_tracker { + enum netdev_lag_tx_type tx_type; + struct netdev_lag_lower_state_info netdev_state[MLX5_MAX_PORTS]; + bool is_bonded; +}; + +/* LAG data of a ConnectX card. + * It serves both its phys functions. + */ +struct mlx5_lag { + u8 flags; + u8 v2p_map[MLX5_MAX_PORTS]; + struct lag_func pf[MLX5_MAX_PORTS]; + struct lag_tracker tracker; + struct delayed_work bond_work; + struct notifier_block nb; + + /* Admin state. Allow lag only if allowed is true + * even if network conditions for lag were met + */ + bool allowed; +}; + +/* General purpose, use for short periods of time. + * Beware of lock dependencies (preferably, no locks should be acquired + * under it). + */ +static DEFINE_MUTEX(lag_mutex); + +static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1, + u8 remap_port2) +{ + u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_lag_out)] = {0}; + void *lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx); + + MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG); + + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 remap_port1, + u8 remap_port2) +{ + u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_lag_out)] = {0}; + void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx); + + MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG); + MLX5_SET(modify_lag_in, in, field_select, 0x1); + + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_cmd_destroy_lag(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_lag_out)] = {0}; + + MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_vport_lag_out)] = {0}; + + MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_cmd_create_vport_lag); + +int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_vport_lag_out)] = {0}; + + MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag); + +static int 
mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, + bool reset, void *out, int out_size) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { }; + + MLX5_SET(query_cong_statistics_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_STATISTICS); + MLX5_SET(query_cong_statistics_in, in, clear, reset); + return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); +} + +static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev) +{ + return dev->priv.lag; +} + +static int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, + struct net_device *ndev) +{ + int i; + + for (i = 0; i < MLX5_MAX_PORTS; i++) + if (ldev->pf[i].netdev == ndev) + return i; + + return -1; +} + +static bool mlx5_lag_is_bonded(struct mlx5_lag *ldev) +{ + return !!(ldev->flags & MLX5_LAG_FLAG_BONDED); +} + +static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker, + u8 *port1, u8 *port2) +{ + *port1 = 1; + *port2 = 2; + if (!tracker->netdev_state[0].tx_enabled || + !tracker->netdev_state[0].link_up) { + *port1 = 2; + return; + } + + if (!tracker->netdev_state[1].tx_enabled || + !tracker->netdev_state[1].link_up) + *port2 = 1; +} + +static void mlx5_activate_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker) +{ + struct mlx5_core_dev *dev0 = ldev->pf[0].dev; + int err; + + ldev->flags |= MLX5_LAG_FLAG_BONDED; + + mlx5_infer_tx_affinity_mapping(tracker, &ldev->v2p_map[0], + &ldev->v2p_map[1]); + + err = mlx5_cmd_create_lag(dev0, ldev->v2p_map[0], ldev->v2p_map[1]); + if (err) + mlx5_core_err(dev0, + "Failed to create LAG (%d)\n", + err); +} + +static void mlx5_deactivate_lag(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev0 = ldev->pf[0].dev; + int err; + + ldev->flags &= ~MLX5_LAG_FLAG_BONDED; + + err = mlx5_cmd_destroy_lag(dev0); + if (err) + mlx5_core_err(dev0, + "Failed to destroy LAG (%d)\n", + err); +} + +static void mlx5_do_bond(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev0 = ldev->pf[0].dev; + struct mlx5_core_dev *dev1 = ldev->pf[1].dev; + struct lag_tracker tracker; + u8 v2p_port1, v2p_port2; + int i, err; + bool do_bond; + + if (!dev0 || !dev1) + return; + + mutex_lock(&lag_mutex); + tracker = ldev->tracker; + mutex_unlock(&lag_mutex); + + do_bond = tracker.is_bonded && ldev->allowed; + + if (do_bond && !mlx5_lag_is_bonded(ldev)) { + for (i = 0; i < MLX5_MAX_PORTS; i++) + mlx5_remove_dev_by_protocol(ldev->pf[i].dev, + MLX5_INTERFACE_PROTOCOL_IB); + + mlx5_activate_lag(ldev, &tracker); + + mlx5_add_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB); + mlx5_nic_vport_enable_roce(dev1); + } else if (do_bond && mlx5_lag_is_bonded(ldev)) { + mlx5_infer_tx_affinity_mapping(&tracker, &v2p_port1, + &v2p_port2); + + if ((v2p_port1 != ldev->v2p_map[0]) || + (v2p_port2 != ldev->v2p_map[1])) { + ldev->v2p_map[0] = v2p_port1; + ldev->v2p_map[1] = v2p_port2; + + err = mlx5_cmd_modify_lag(dev0, v2p_port1, v2p_port2); + if (err) + mlx5_core_err(dev0, + "Failed to modify LAG (%d)\n", + err); + } + } else if (!do_bond && mlx5_lag_is_bonded(ldev)) { + mlx5_remove_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB); + mlx5_nic_vport_disable_roce(dev1); + + mlx5_deactivate_lag(ldev); + + for (i = 0; i < MLX5_MAX_PORTS; i++) + if (ldev->pf[i].dev) + mlx5_add_dev_by_protocol(ldev->pf[i].dev, + MLX5_INTERFACE_PROTOCOL_IB); + } +} + +static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay) +{ + schedule_delayed_work(&ldev->bond_work, delay); +} + +static void mlx5_do_bond_work(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct mlx5_lag 
*ldev = container_of(delayed_work, struct mlx5_lag, + bond_work); + int status; + + status = mlx5_dev_list_trylock(); + if (!status) { + /* 1 sec delay. */ + mlx5_queue_bond_work(ldev, HZ); + return; + } + + mlx5_do_bond(ldev); + mlx5_dev_list_unlock(); +} + +static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + struct net_device *ndev, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *upper = info->upper_dev, *ndev_tmp; + struct netdev_lag_upper_info *lag_upper_info = NULL; + bool is_bonded; + int bond_status = 0; + int num_slaves = 0; + int idx; + + if (!netif_is_lag_master(upper)) + return 0; + + if (info->linking) + lag_upper_info = info->upper_info; + + /* The event may still be of interest if the slave does not belong to + * us, but is enslaved to a master which has one or more of our netdevs + * as slaves (e.g., if a new slave is added to a master that bonds two + * of our netdevs, we should unbond). + */ + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper, ndev_tmp) { + idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp); + if (idx > -1) + bond_status |= (1 << idx); + + num_slaves++; + } + rcu_read_unlock(); + + /* None of this lagdev's netdevs are slaves of this master. */ + if (!(bond_status & 0x3)) + return 0; + + if (lag_upper_info) + tracker->tx_type = lag_upper_info->tx_type; + + /* Determine bonding status: + * A device is considered bonded if both its physical ports are slaves + * of the same lag master, and only them. + * Lag mode must be activebackup or hash. + */ + is_bonded = (num_slaves == MLX5_MAX_PORTS) && + (bond_status == 0x3) && + ((tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) || + (tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH)); + + if (tracker->is_bonded != is_bonded) { + tracker->is_bonded = is_bonded; + return 1; + } + + return 0; +} + +static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + struct net_device *ndev, + struct netdev_notifier_changelowerstate_info *info) +{ + struct netdev_lag_lower_state_info *lag_lower_info; + int idx; + + if (!netif_is_lag_port(ndev)) + return 0; + + idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev); + if (idx == -1) + return 0; + + /* This information is used to determine virtual to physical + * port mapping. 
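+	 * The per-port link_up/tx_enabled values saved in
+	 * tracker->netdev_state[] are what mlx5_infer_tx_affinity_mapping()
+	 * later uses to pick the TX remap: if a port reports !tx_enabled or
+	 * !link_up, both virtual ports are steered to the other physical
+	 * port.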
+ */ + lag_lower_info = info->lower_state_info; + if (!lag_lower_info) + return 0; + + tracker->netdev_state[idx] = *lag_lower_info; + + return 1; +} + +static int mlx5_lag_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct lag_tracker tracker; + struct mlx5_lag *ldev; + int changed = 0; + + if (!net_eq(dev_net(ndev), &init_net)) + return NOTIFY_DONE; + + if ((event != NETDEV_CHANGEUPPER) && (event != NETDEV_CHANGELOWERSTATE)) + return NOTIFY_DONE; + + ldev = container_of(this, struct mlx5_lag, nb); + tracker = ldev->tracker; + + switch (event) { + case NETDEV_CHANGEUPPER: + changed = mlx5_handle_changeupper_event(ldev, &tracker, ndev, + ptr); + break; + case NETDEV_CHANGELOWERSTATE: + changed = mlx5_handle_changelowerstate_event(ldev, &tracker, + ndev, ptr); + break; + } + + mutex_lock(&lag_mutex); + ldev->tracker = tracker; + mutex_unlock(&lag_mutex); + + if (changed) + mlx5_queue_bond_work(ldev, 0); + + return NOTIFY_DONE; +} + +static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) +{ + if ((ldev->pf[0].dev && mlx5_sriov_is_enabled(ldev->pf[0].dev)) || + (ldev->pf[1].dev && mlx5_sriov_is_enabled(ldev->pf[1].dev))) + return false; + else + return true; +} + +static struct mlx5_lag *mlx5_lag_dev_alloc(void) +{ + struct mlx5_lag *ldev; + + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); + if (!ldev) + return NULL; + + INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); + ldev->allowed = mlx5_lag_check_prereq(ldev); + + return ldev; +} + +static void mlx5_lag_dev_free(struct mlx5_lag *ldev) +{ + kfree(ldev); +} + +static void mlx5_lag_dev_add_pf(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev, + struct net_device *netdev) +{ + unsigned int fn = PCI_FUNC(dev->pdev->devfn); + + if (fn >= MLX5_MAX_PORTS) + return; + + mutex_lock(&lag_mutex); + ldev->pf[fn].dev = dev; + ldev->pf[fn].netdev = netdev; + ldev->tracker.netdev_state[fn].link_up = 0; + ldev->tracker.netdev_state[fn].tx_enabled = 0; + + ldev->allowed = mlx5_lag_check_prereq(ldev); + dev->priv.lag = ldev; + + mutex_unlock(&lag_mutex); +} + +static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev) +{ + int i; + + for (i = 0; i < MLX5_MAX_PORTS; i++) + if (ldev->pf[i].dev == dev) + break; + + if (i == MLX5_MAX_PORTS) + return; + + mutex_lock(&lag_mutex); + memset(&ldev->pf[i], 0, sizeof(*ldev->pf)); + + dev->priv.lag = NULL; + ldev->allowed = mlx5_lag_check_prereq(ldev); + mutex_unlock(&lag_mutex); +} + +/* Must be called with intf_mutex held */ +void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev) +{ + struct mlx5_lag *ldev = NULL; + struct mlx5_core_dev *tmp_dev; + + if (!MLX5_CAP_GEN(dev, vport_group_manager) || + !MLX5_CAP_GEN(dev, lag_master) || + (MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS)) + return; + + tmp_dev = mlx5_get_next_phys_dev(dev); + if (tmp_dev) + ldev = tmp_dev->priv.lag; + + if (!ldev) { + ldev = mlx5_lag_dev_alloc(); + if (!ldev) { + mlx5_core_err(dev, "Failed to alloc lag dev\n"); + return; + } + } + + mlx5_lag_dev_add_pf(ldev, dev, netdev); + + if (!ldev->nb.notifier_call) { + ldev->nb.notifier_call = mlx5_lag_netdev_event; + if (register_netdevice_notifier(&ldev->nb)) { + ldev->nb.notifier_call = NULL; + mlx5_core_err(dev, "Failed to register LAG netdev notifier\n"); + } + } +} + +/* Must be called with intf_mutex held */ +void mlx5_lag_remove(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + int i; + + ldev = mlx5_lag_dev_get(dev); + if (!ldev) + 
return; + + if (mlx5_lag_is_bonded(ldev)) + mlx5_deactivate_lag(ldev); + + mlx5_lag_dev_remove_pf(ldev, dev); + + for (i = 0; i < MLX5_MAX_PORTS; i++) + if (ldev->pf[i].dev) + break; + + if (i == MLX5_MAX_PORTS) { + if (ldev->nb.notifier_call) + unregister_netdevice_notifier(&ldev->nb); + cancel_delayed_work_sync(&ldev->bond_work); + mlx5_lag_dev_free(ldev); + } +} + +bool mlx5_lag_is_active(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + bool res; + + mutex_lock(&lag_mutex); + ldev = mlx5_lag_dev_get(dev); + res = ldev && mlx5_lag_is_bonded(ldev); + mutex_unlock(&lag_mutex); + + return res; +} +EXPORT_SYMBOL(mlx5_lag_is_active); + +static int mlx5_lag_set_state(struct mlx5_core_dev *dev, bool allow) +{ + struct mlx5_lag *ldev; + int ret = 0; + bool lag_active; + + mlx5_dev_list_lock(); + + ldev = mlx5_lag_dev_get(dev); + if (!ldev) { + ret = -ENODEV; + goto unlock; + } + lag_active = mlx5_lag_is_bonded(ldev); + if (!mlx5_lag_check_prereq(ldev) && allow) { + ret = -EINVAL; + goto unlock; + } + if (ldev->allowed == allow) + goto unlock; + ldev->allowed = allow; + if ((lag_active && !allow) || allow) + mlx5_do_bond(ldev); +unlock: + mlx5_dev_list_unlock(); + return ret; +} + +int mlx5_lag_forbid(struct mlx5_core_dev *dev) +{ + return mlx5_lag_set_state(dev, false); +} + +int mlx5_lag_allow(struct mlx5_core_dev *dev) +{ + return mlx5_lag_set_state(dev, true); +} + +struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev) +{ + struct net_device *ndev = NULL; + struct mlx5_lag *ldev; + + mutex_lock(&lag_mutex); + ldev = mlx5_lag_dev_get(dev); + + if (!(ldev && mlx5_lag_is_bonded(ldev))) + goto unlock; + + if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + ndev = ldev->tracker.netdev_state[0].tx_enabled ? + ldev->pf[0].netdev : ldev->pf[1].netdev; + } else { + ndev = ldev->pf[0].netdev; + } + if (ndev) + dev_hold(ndev); + +unlock: + mutex_unlock(&lag_mutex); + + return ndev; +} +EXPORT_SYMBOL(mlx5_lag_get_roce_netdev); + +bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv) +{ + struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, + priv); + struct mlx5_lag *ldev; + + if (intf->protocol != MLX5_INTERFACE_PROTOCOL_IB) + return true; + + ldev = mlx5_lag_dev_get(dev); + if (!ldev || !mlx5_lag_is_bonded(ldev) || ldev->pf[0].dev == dev) + return true; + + /* If bonded, we do not add an IB device for PF1. 
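+	 * While the LAG is active only PF0's core device is registered with
+	 * the IB protocol (see mlx5_do_bond() above), so the bond is exposed
+	 * as a single RoCE device; returning false here keeps PF1 from
+	 * getting an IB device of its own.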
*/ + return false; +} + +int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, + u64 *values, + int num_counters, + size_t *offsets) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out); + struct mlx5_core_dev *mdev[MLX5_MAX_PORTS]; + struct mlx5_lag *ldev; + int num_ports; + int ret, i, j; + void *out; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + memset(values, 0, sizeof(*values) * num_counters); + + mutex_lock(&lag_mutex); + ldev = mlx5_lag_dev_get(dev); + if (ldev && mlx5_lag_is_bonded(ldev)) { + num_ports = MLX5_MAX_PORTS; + mdev[0] = ldev->pf[0].dev; + mdev[1] = ldev->pf[1].dev; + } else { + num_ports = 1; + mdev[0] = dev; + } + + for (i = 0; i < num_ports; ++i) { + ret = mlx5_cmd_query_cong_counter(mdev[i], false, out, outlen); + if (ret) + goto unlock; + + for (j = 0; j < num_counters; ++j) + values[j] += be64_to_cpup((__be64 *)(out + offsets[j])); + } + +unlock: + mutex_unlock(&lag_mutex); + kvfree(out); + return ret; +} +EXPORT_SYMBOL(mlx5_lag_query_cong_counters); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile new file mode 100644 index 0000000..d8e1711 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile @@ -0,0 +1 @@ +subdir-ccflags-y += -I$(src)/.. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c new file mode 100644 index 0000000..8570355 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -0,0 +1,585 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "en.h" +#include "clock.h" + +enum { + MLX5_CYCLES_SHIFT = 23 +}; + +enum { + MLX5_PIN_MODE_IN = 0x0, + MLX5_PIN_MODE_OUT = 0x1, +}; + +enum { + MLX5_OUT_PATTERN_PULSE = 0x0, + MLX5_OUT_PATTERN_PERIODIC = 0x1, +}; + +enum { + MLX5_EVENT_MODE_DISABLE = 0x0, + MLX5_EVENT_MODE_REPETETIVE = 0x1, + MLX5_EVENT_MODE_ONCE_TILL_ARM = 0x2, +}; + +enum { + MLX5_MTPPS_FS_ENABLE = BIT(0x0), + MLX5_MTPPS_FS_PATTERN = BIT(0x2), + MLX5_MTPPS_FS_PIN_MODE = BIT(0x3), + MLX5_MTPPS_FS_TIME_STAMP = BIT(0x4), + MLX5_MTPPS_FS_OUT_PULSE_DURATION = BIT(0x5), + MLX5_MTPPS_FS_ENH_OUT_PER_ADJ = BIT(0x7), +}; + +static u64 read_internal_timer(const struct cyclecounter *cc) +{ + struct mlx5_clock *clock = container_of(cc, struct mlx5_clock, cycles); + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, + clock); + + return mlx5_read_internal_timer(mdev) & cc->mask; +} + +static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) +{ + struct mlx5_ib_clock_info *clock_info = mdev->clock_info; + struct mlx5_clock *clock = &mdev->clock; + u32 sign; + + if (!clock_info) + return; + + sign = smp_load_acquire(&clock_info->sign); + smp_store_mb(clock_info->sign, + sign | MLX5_IB_CLOCK_INFO_KERNEL_UPDATING); + + clock_info->cycles = clock->tc.cycle_last; + clock_info->mult = clock->cycles.mult; + clock_info->nsec = clock->tc.nsec; + clock_info->frac = clock->tc.frac; + + smp_store_release(&clock_info->sign, + sign + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING * 2); +} + +static void mlx5_pps_out(struct work_struct *work) +{ + struct mlx5_pps *pps_info = container_of(work, struct mlx5_pps, + out_work); + struct mlx5_clock *clock = container_of(pps_info, struct mlx5_clock, + pps_info); + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, + clock); + u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + unsigned long flags; + int i; + + for (i = 0; i < clock->ptp_info.n_pins; i++) { + u64 tstart; + + write_lock_irqsave(&clock->lock, flags); + tstart = clock->pps_info.start[i]; + clock->pps_info.start[i] = 0; + write_unlock_irqrestore(&clock->lock, flags); + if (!tstart) + continue; + + MLX5_SET(mtpps_reg, in, pin, i); + MLX5_SET64(mtpps_reg, in, time_stamp, tstart); + MLX5_SET(mtpps_reg, in, field_select, MLX5_MTPPS_FS_TIME_STAMP); + mlx5_set_mtpps(mdev, in, sizeof(in)); + } +} + +static void mlx5_timestamp_overflow(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct mlx5_clock *clock = container_of(dwork, struct mlx5_clock, + overflow_work); + unsigned long flags; + + write_lock_irqsave(&clock->lock, flags); + timecounter_read(&clock->tc); + mlx5_update_clock_info_page(clock->mdev); + write_unlock_irqrestore(&clock->lock, flags); + schedule_delayed_work(&clock->overflow_work, clock->overflow_period); +} + +static int mlx5_ptp_settime(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); + u64 ns = timespec64_to_ns(ts); + unsigned long flags; + + write_lock_irqsave(&clock->lock, flags); + timecounter_init(&clock->tc, &clock->cycles, ns); + mlx5_update_clock_info_page(clock->mdev); + write_unlock_irqrestore(&clock->lock, flags); + + return 0; +} + +static int mlx5_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); + u64 ns; + unsigned long flags; + + write_lock_irqsave(&clock->lock, flags); + ns = timecounter_read(&clock->tc); + 
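+	/* timecounter_read() samples the free-running HW counter via
+	 * read_internal_timer() and extends it to nanoseconds with the
+	 * mult/shift programmed in mlx5_init_clock(), roughly
+	 * ns = tc->nsec + (((cycles - cycle_last) * mult) >> shift).
+	 */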
write_unlock_irqrestore(&clock->lock, flags); + + *ts = ns_to_timespec64(ns); + + return 0; +} + +static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); + unsigned long flags; + + write_lock_irqsave(&clock->lock, flags); + timecounter_adjtime(&clock->tc, delta); + mlx5_update_clock_info_page(clock->mdev); + write_unlock_irqrestore(&clock->lock, flags); + + return 0; +} + +static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) +{ + u64 adj; + u32 diff; + unsigned long flags; + int neg_adj = 0; + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); + + if (delta < 0) { + neg_adj = 1; + delta = -delta; + } + + adj = clock->nominal_c_mult; + adj *= delta; + diff = div_u64(adj, 1000000000ULL); + + write_lock_irqsave(&clock->lock, flags); + timecounter_read(&clock->tc); + clock->cycles.mult = neg_adj ? clock->nominal_c_mult - diff : + clock->nominal_c_mult + diff; + mlx5_update_clock_info_page(clock->mdev); + write_unlock_irqrestore(&clock->lock, flags); + + return 0; +} + +static int mlx5_extts_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) +{ + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev = + container_of(clock, struct mlx5_core_dev, clock); + u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + u32 field_select = 0; + u8 pin_mode = 0; + u8 pattern = 0; + int pin = -1; + int err = 0; + + if (!MLX5_PPS_CAP(mdev)) + return -EOPNOTSUPP; + + if (rq->extts.index >= clock->ptp_info.n_pins) + return -EINVAL; + + if (on) { + pin = ptp_find_pin(clock->ptp, PTP_PF_EXTTS, rq->extts.index); + if (pin < 0) + return -EBUSY; + pin_mode = MLX5_PIN_MODE_IN; + pattern = !!(rq->extts.flags & PTP_FALLING_EDGE); + field_select = MLX5_MTPPS_FS_PIN_MODE | + MLX5_MTPPS_FS_PATTERN | + MLX5_MTPPS_FS_ENABLE; + } else { + pin = rq->extts.index; + field_select = MLX5_MTPPS_FS_ENABLE; + } + + MLX5_SET(mtpps_reg, in, pin, pin); + MLX5_SET(mtpps_reg, in, pin_mode, pin_mode); + MLX5_SET(mtpps_reg, in, pattern, pattern); + MLX5_SET(mtpps_reg, in, enable, on); + MLX5_SET(mtpps_reg, in, field_select, field_select); + + err = mlx5_set_mtpps(mdev, in, sizeof(in)); + if (err) + return err; + + return mlx5_set_mtppse(mdev, pin, 0, + MLX5_EVENT_MODE_REPETETIVE & on); +} + +static int mlx5_perout_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) +{ + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev = + container_of(clock, struct mlx5_core_dev, clock); + u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + u64 nsec_now, nsec_delta, time_stamp = 0; + u64 cycles_now, cycles_delta; + struct timespec64 ts; + unsigned long flags; + u32 field_select = 0; + u8 pin_mode = 0; + u8 pattern = 0; + int pin = -1; + int err = 0; + s64 ns; + + if (!MLX5_PPS_CAP(mdev)) + return -EOPNOTSUPP; + + if (rq->perout.index >= clock->ptp_info.n_pins) + return -EINVAL; + + if (on) { + pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, + rq->perout.index); + if (pin < 0) + return -EBUSY; + + pin_mode = MLX5_PIN_MODE_OUT; + pattern = MLX5_OUT_PATTERN_PERIODIC; + ts.tv_sec = rq->perout.period.sec; + ts.tv_nsec = rq->perout.period.nsec; + ns = timespec64_to_ns(&ts); + + if ((ns >> 1) != 500000000LL) + return -EINVAL; + + ts.tv_sec = rq->perout.start.sec; + ts.tv_nsec = rq->perout.start.nsec; + ns = timespec64_to_ns(&ts); + cycles_now = mlx5_read_internal_timer(mdev); + 
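+		/* The locked block below converts the requested start time
+		 * back into device clock cycles - the inverse of cyc2ns:
+		 * cycles_delta = (nsec_delta << shift) / mult, and the
+		 * absolute value programmed into MTPPS is
+		 * cycles_now + cycles_delta.
+		 */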
write_lock_irqsave(&clock->lock, flags); + nsec_now = timecounter_cyc2time(&clock->tc, cycles_now); + nsec_delta = ns - nsec_now; + cycles_delta = div64_u64(nsec_delta << clock->cycles.shift, + clock->cycles.mult); + write_unlock_irqrestore(&clock->lock, flags); + time_stamp = cycles_now + cycles_delta; + field_select = MLX5_MTPPS_FS_PIN_MODE | + MLX5_MTPPS_FS_PATTERN | + MLX5_MTPPS_FS_ENABLE | + MLX5_MTPPS_FS_TIME_STAMP; + } else { + pin = rq->perout.index; + field_select = MLX5_MTPPS_FS_ENABLE; + } + + MLX5_SET(mtpps_reg, in, pin, pin); + MLX5_SET(mtpps_reg, in, pin_mode, pin_mode); + MLX5_SET(mtpps_reg, in, pattern, pattern); + MLX5_SET(mtpps_reg, in, enable, on); + MLX5_SET64(mtpps_reg, in, time_stamp, time_stamp); + MLX5_SET(mtpps_reg, in, field_select, field_select); + + err = mlx5_set_mtpps(mdev, in, sizeof(in)); + if (err) + return err; + + return mlx5_set_mtppse(mdev, pin, 0, + MLX5_EVENT_MODE_REPETETIVE & on); +} + +static int mlx5_pps_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) +{ + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + + clock->pps_info.enabled = !!on; + return 0; +} + +static int mlx5_ptp_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) +{ + switch (rq->type) { + case PTP_CLK_REQ_EXTTS: + return mlx5_extts_configure(ptp, rq, on); + case PTP_CLK_REQ_PEROUT: + return mlx5_perout_configure(ptp, rq, on); + case PTP_CLK_REQ_PPS: + return mlx5_pps_configure(ptp, rq, on); + default: + return -EOPNOTSUPP; + } + return 0; +} + +static int mlx5_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin, + enum ptp_pin_function func, unsigned int chan) +{ + return (func == PTP_PF_PHYSYNC) ? -EOPNOTSUPP : 0; +} + +static const struct ptp_clock_info mlx5_ptp_clock_info = { + .owner = THIS_MODULE, + .name = "mlx5_p2p", + .max_adj = 100000000, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .n_pins = 0, + .pps = 0, + .adjfreq = mlx5_ptp_adjfreq, + .adjtime = mlx5_ptp_adjtime, + .gettime64 = mlx5_ptp_gettime, + .settime64 = mlx5_ptp_settime, + .enable = NULL, + .verify = NULL, +}; + +static int mlx5_init_pin_config(struct mlx5_clock *clock) +{ + int i; + + clock->ptp_info.pin_config = + kzalloc(sizeof(*clock->ptp_info.pin_config) * + clock->ptp_info.n_pins, GFP_KERNEL); + if (!clock->ptp_info.pin_config) + return -ENOMEM; + clock->ptp_info.enable = mlx5_ptp_enable; + clock->ptp_info.verify = mlx5_ptp_verify; + clock->ptp_info.pps = 1; + + for (i = 0; i < clock->ptp_info.n_pins; i++) { + snprintf(clock->ptp_info.pin_config[i].name, + sizeof(clock->ptp_info.pin_config[i].name), + "mlx5_pps%d", i); + clock->ptp_info.pin_config[i].index = i; + clock->ptp_info.pin_config[i].func = PTP_PF_NONE; + clock->ptp_info.pin_config[i].chan = i; + } + + return 0; +} + +static void mlx5_get_pps_caps(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + + mlx5_query_mtpps(mdev, out, sizeof(out)); + + clock->ptp_info.n_pins = MLX5_GET(mtpps_reg, out, + cap_number_of_pps_pins); + clock->ptp_info.n_ext_ts = MLX5_GET(mtpps_reg, out, + cap_max_num_of_pps_in_pins); + clock->ptp_info.n_per_out = MLX5_GET(mtpps_reg, out, + cap_max_num_of_pps_out_pins); + + clock->pps_info.pin_caps[0] = MLX5_GET(mtpps_reg, out, cap_pin_0_mode); + clock->pps_info.pin_caps[1] = MLX5_GET(mtpps_reg, out, cap_pin_1_mode); + clock->pps_info.pin_caps[2] = MLX5_GET(mtpps_reg, out, cap_pin_2_mode); + clock->pps_info.pin_caps[3] = MLX5_GET(mtpps_reg, out, cap_pin_3_mode); + 
clock->pps_info.pin_caps[4] = MLX5_GET(mtpps_reg, out, cap_pin_4_mode); + clock->pps_info.pin_caps[5] = MLX5_GET(mtpps_reg, out, cap_pin_5_mode); + clock->pps_info.pin_caps[6] = MLX5_GET(mtpps_reg, out, cap_pin_6_mode); + clock->pps_info.pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode); +} + +void mlx5_pps_event(struct mlx5_core_dev *mdev, + struct mlx5_eqe *eqe) +{ + struct mlx5_clock *clock = &mdev->clock; + struct ptp_clock_event ptp_event; + struct timespec64 ts; + u64 nsec_now, nsec_delta; + u64 cycles_now, cycles_delta; + int pin = eqe->data.pps.pin; + s64 ns; + unsigned long flags; + + switch (clock->ptp_info.pin_config[pin].func) { + case PTP_PF_EXTTS: + ptp_event.index = pin; + ptp_event.timestamp = timecounter_cyc2time(&clock->tc, + be64_to_cpu(eqe->data.pps.time_stamp)); + if (clock->pps_info.enabled) { + ptp_event.type = PTP_CLOCK_PPSUSR; + ptp_event.pps_times.ts_real = + ns_to_timespec64(ptp_event.timestamp); + } else { + ptp_event.type = PTP_CLOCK_EXTTS; + } + ptp_clock_event(clock->ptp, &ptp_event); + break; + case PTP_PF_PEROUT: + mlx5_ptp_gettime(&clock->ptp_info, &ts); + cycles_now = mlx5_read_internal_timer(mdev); + ts.tv_sec += 1; + ts.tv_nsec = 0; + ns = timespec64_to_ns(&ts); + write_lock_irqsave(&clock->lock, flags); + nsec_now = timecounter_cyc2time(&clock->tc, cycles_now); + nsec_delta = ns - nsec_now; + cycles_delta = div64_u64(nsec_delta << clock->cycles.shift, + clock->cycles.mult); + clock->pps_info.start[pin] = cycles_now + cycles_delta; + schedule_work(&clock->pps_info.out_work); + write_unlock_irqrestore(&clock->lock, flags); + break; + default: + mlx5_core_err(mdev, " Unhandled event\n"); + } +} + +void mlx5_init_clock(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + u64 ns; + u64 frac = 0; + u32 dev_freq; + + dev_freq = MLX5_CAP_GEN(mdev, device_frequency_khz); + if (!dev_freq) { + mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n"); + return; + } + rwlock_init(&clock->lock); + clock->cycles.read = read_internal_timer; + clock->cycles.shift = MLX5_CYCLES_SHIFT; + clock->cycles.mult = clocksource_khz2mult(dev_freq, + clock->cycles.shift); + clock->nominal_c_mult = clock->cycles.mult; + clock->cycles.mask = CLOCKSOURCE_MASK(41); + clock->mdev = mdev; + + timecounter_init(&clock->tc, &clock->cycles, + ktime_to_ns(ktime_get_real())); + + /* Calculate period in seconds to call the overflow watchdog - to make + * sure counter is checked at least once every wrap around. 
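+	 * timecounter_read() computes deltas as (now - cycle_last) & mask,
+	 * so it has to run before the masked 41-bit counter wraps or time
+	 * would appear to jump backwards; the division below scales the
+	 * wrap time in nanoseconds down to the delay used when
+	 * rescheduling overflow_work.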
+ */ + ns = cyclecounter_cyc2ns(&clock->cycles, clock->cycles.mask, + frac, &frac); + do_div(ns, NSEC_PER_SEC / 2 / HZ); + clock->overflow_period = ns; + + mdev->clock_info_page = alloc_page(GFP_KERNEL); + if (mdev->clock_info_page) { + mdev->clock_info = kmap(mdev->clock_info_page); + if (!mdev->clock_info) { + __free_page(mdev->clock_info_page); + mlx5_core_warn(mdev, "failed to map clock page\n"); + } else { + mdev->clock_info->sign = 0; + mdev->clock_info->nsec = clock->tc.nsec; + mdev->clock_info->cycles = clock->tc.cycle_last; + mdev->clock_info->mask = clock->cycles.mask; + mdev->clock_info->mult = clock->nominal_c_mult; + mdev->clock_info->shift = clock->cycles.shift; + mdev->clock_info->frac = clock->tc.frac; + mdev->clock_info->overflow_period = + clock->overflow_period; + } + } + + INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); + INIT_DELAYED_WORK(&clock->overflow_work, mlx5_timestamp_overflow); + if (clock->overflow_period) + schedule_delayed_work(&clock->overflow_work, 0); + else + mlx5_core_warn(mdev, "invalid overflow period, overflow_work is not scheduled\n"); + + /* Configure the PHC */ + clock->ptp_info = mlx5_ptp_clock_info; + + /* Initialize 1PPS data structures */ + if (MLX5_PPS_CAP(mdev)) + mlx5_get_pps_caps(mdev); + if (clock->ptp_info.n_pins) + mlx5_init_pin_config(clock); + + clock->ptp = ptp_clock_register(&clock->ptp_info, + &mdev->pdev->dev); + if (IS_ERR(clock->ptp)) { + mlx5_core_warn(mdev, "ptp_clock_register failed %ld\n", + PTR_ERR(clock->ptp)); + clock->ptp = NULL; + } +} + +void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) + return; + + if (clock->ptp) { + ptp_clock_unregister(clock->ptp); + clock->ptp = NULL; + } + + cancel_work_sync(&clock->pps_info.out_work); + cancel_delayed_work_sync(&clock->overflow_work); + + if (mdev->clock_info) { + kunmap(mdev->clock_info_page); + __free_page(mdev->clock_info_page); + mdev->clock_info = NULL; + } + + kfree(clock->ptp_info.pin_config); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h new file mode 100644 index 0000000..a8eeced --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __LIB_CLOCK_H__ +#define __LIB_CLOCK_H__ + +void mlx5_init_clock(struct mlx5_core_dev *mdev); +void mlx5_cleanup_clock(struct mlx5_core_dev *mdev); + +static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, + u64 timestamp) +{ + u64 nsec; + + read_lock(&clock->lock); + nsec = timecounter_cyc2time(&clock->tc, timestamp); + read_unlock(&clock->lock); + + return ns_to_ktime(nsec); +} + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c new file mode 100644 index 0000000..7722a3f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx5_core.h" +#include "lib/mlx5.h" + +void mlx5_init_reserved_gids(struct mlx5_core_dev *dev) +{ + unsigned int tblsz = MLX5_CAP_ROCE(dev, roce_address_table_size); + + ida_init(&dev->roce.reserved_gids.ida); + dev->roce.reserved_gids.start = tblsz; + dev->roce.reserved_gids.count = 0; +} + +void mlx5_cleanup_reserved_gids(struct mlx5_core_dev *dev) +{ + WARN_ON(!ida_is_empty(&dev->roce.reserved_gids.ida)); + dev->roce.reserved_gids.start = 0; + dev->roce.reserved_gids.count = 0; + ida_destroy(&dev->roce.reserved_gids.ida); +} + +int mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count) +{ + if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { + mlx5_core_err(dev, "Cannot reserve GIDs when interfaces are up\n"); + return -EPERM; + } + if (dev->roce.reserved_gids.start < count) { + mlx5_core_warn(dev, "GID table exhausted attempting to reserve %d more GIDs\n", + count); + return -ENOMEM; + } + if (dev->roce.reserved_gids.count + count > MLX5_MAX_RESERVED_GIDS) { + mlx5_core_warn(dev, "Unable to reserve %d more GIDs\n", count); + return -ENOMEM; + } + + dev->roce.reserved_gids.start -= count; + dev->roce.reserved_gids.count += count; + mlx5_core_dbg(dev, "Reserved %u GIDs starting at %u\n", + dev->roce.reserved_gids.count, + dev->roce.reserved_gids.start); + return 0; +} + +void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count) +{ + WARN(test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state), "Unreserving GIDs when interfaces are up"); + WARN(count > dev->roce.reserved_gids.count, "Unreserving %u GIDs when only %u reserved", + count, dev->roce.reserved_gids.count); + + dev->roce.reserved_gids.start += count; + dev->roce.reserved_gids.count -= count; + mlx5_core_dbg(dev, "%u GIDs starting at %u left reserved\n", + dev->roce.reserved_gids.count, + dev->roce.reserved_gids.start); +} + +int mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index) +{ + int end = dev->roce.reserved_gids.start + + dev->roce.reserved_gids.count; + int index = 0; + + index = ida_simple_get(&dev->roce.reserved_gids.ida, + dev->roce.reserved_gids.start, end, + GFP_KERNEL); + if (index < 0) + return index; + + mlx5_core_dbg(dev, "Allocating reserved GID %u\n", index); + *gid_index = index; + return 0; +} + +void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index) +{ + mlx5_core_dbg(dev, "Freeing reserved GID %u\n", gid_index); + ida_simple_remove(&dev->roce.reserved_gids.ida, gid_index); +} + +unsigned int mlx5_core_reserved_gids_count(struct mlx5_core_dev *dev) +{ + return dev->roce.reserved_gids.count; +} +EXPORT_SYMBOL_GPL(mlx5_core_reserved_gids_count); + +int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, + u8 roce_version, u8 roce_l3_type, const u8 *gid, + const u8 *mac, bool vlan, u16 vlan_id, u8 port_num) +{ +#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) + u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0}; + void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); + char *addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, in_addr, + source_l3_address); + void *addr_mac = MLX5_ADDR_OF(roce_addr_layout, in_addr, + source_mac_47_32); + int gidsz = MLX5_FLD_SZ_BYTES(roce_addr_layout, source_l3_address); + + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return -EINVAL; + + if (gid) { + if (vlan) { + MLX5_SET_RA(in_addr, vlan_valid, 1); + MLX5_SET_RA(in_addr, vlan_id, vlan_id); + } 
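+		/* With a non-NULL gid the MAC lands in source_mac_47_32 and
+		 * the 16-byte GID in source_l3_address; passing gid == NULL
+		 * leaves the address fields zeroed, which clears the entry.
+		 * Hedged caller sketch (argument names illustrative only):
+		 *
+		 *	err = mlx5_core_roce_gid_set(dev, index, roce_version,
+		 *				     roce_l3_type, gid_raw, mac,
+		 *				     vlan_tagged, vid, port_num);
+		 */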
+ + ether_addr_copy(addr_mac, mac); + MLX5_SET_RA(in_addr, roce_version, roce_version); + MLX5_SET_RA(in_addr, roce_l3_type, roce_l3_type); + memcpy(addr_l3_addr, gid, gidsz); + } + + if (MLX5_CAP_GEN(dev, num_vhca_ports) > 0) + MLX5_SET(set_roce_address_in, in, vhca_port_num, port_num); + + MLX5_SET(set_roce_address_in, in, roce_address_index, index); + MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_roce_gid_set); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h new file mode 100644 index 0000000..7550b1c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __LIB_MLX5_H__ +#define __LIB_MLX5_H__ + +void mlx5_init_reserved_gids(struct mlx5_core_dev *dev); +void mlx5_cleanup_reserved_gids(struct mlx5_core_dev *dev); +int mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count); +void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count); +int mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index); +void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c new file mode 100644 index 0000000..7cb6712 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" +#include "lib/mpfs.h" + +/* HW L2 Table (MPFS) management */ +static int set_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index, u8 *mac) +{ + u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_l2_table_entry_out)] = {0}; + u8 *in_mac_addr; + + MLX5_SET(set_l2_table_entry_in, in, opcode, MLX5_CMD_OP_SET_L2_TABLE_ENTRY); + MLX5_SET(set_l2_table_entry_in, in, table_index, index); + + in_mac_addr = MLX5_ADDR_OF(set_l2_table_entry_in, in, mac_address); + ether_addr_copy(&in_mac_addr[2], mac); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int del_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index) +{ + u32 in[MLX5_ST_SZ_DW(delete_l2_table_entry_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(delete_l2_table_entry_out)] = {0}; + + MLX5_SET(delete_l2_table_entry_in, in, opcode, MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); + MLX5_SET(delete_l2_table_entry_in, in, table_index, index); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +/* UC L2 table hash node */ +struct l2table_node { + struct l2addr_node node; + u32 index; /* index in HW l2 table */ +}; + +struct mlx5_mpfs { + struct hlist_head hash[MLX5_L2_ADDR_HASH_SIZE]; + struct mutex lock; /* Synchronize l2 table access */ + u32 size; + unsigned long *bitmap; +}; + +static int alloc_l2table_index(struct mlx5_mpfs *l2table, u32 *ix) +{ + int err = 0; + + *ix = find_first_zero_bit(l2table->bitmap, l2table->size); + if (*ix >= l2table->size) + err = -ENOSPC; + else + __set_bit(*ix, l2table->bitmap); + + return err; +} + +static void free_l2table_index(struct mlx5_mpfs *l2table, u32 ix) +{ + __clear_bit(ix, l2table->bitmap); +} + +int mlx5_mpfs_init(struct mlx5_core_dev *dev) +{ + int l2table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table); + struct mlx5_mpfs *mpfs; + + if (!MLX5_VPORT_MANAGER(dev)) + return 0; + + mpfs = kzalloc(sizeof(*mpfs), GFP_KERNEL); + if (!mpfs) + return -ENOMEM; + + mutex_init(&mpfs->lock); + mpfs->size = l2table_size; + mpfs->bitmap = kcalloc(BITS_TO_LONGS(l2table_size), + sizeof(uintptr_t), GFP_KERNEL); + if (!mpfs->bitmap) { + kfree(mpfs); + return -ENOMEM; + } + + dev->priv.mpfs = mpfs; + return 0; +} + +void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev) +{ + struct 
mlx5_mpfs *mpfs = dev->priv.mpfs; + + if (!MLX5_VPORT_MANAGER(dev)) + return; + + WARN_ON(!hlist_empty(mpfs->hash)); + kfree(mpfs->bitmap); + kfree(mpfs); +} + +int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) +{ + struct mlx5_mpfs *mpfs = dev->priv.mpfs; + struct l2table_node *l2addr; + u32 index; + int err; + + if (!MLX5_VPORT_MANAGER(dev)) + return 0; + + mutex_lock(&mpfs->lock); + + l2addr = l2addr_hash_find(mpfs->hash, mac, struct l2table_node); + if (l2addr) { + err = -EEXIST; + goto abort; + } + + err = alloc_l2table_index(mpfs, &index); + if (err) + goto abort; + + l2addr = l2addr_hash_add(mpfs->hash, mac, struct l2table_node, GFP_KERNEL); + if (!l2addr) { + free_l2table_index(mpfs, index); + err = -ENOMEM; + goto abort; + } + + l2addr->index = index; + err = set_l2table_entry_cmd(dev, index, mac); + if (err) { + l2addr_hash_del(l2addr); + free_l2table_index(mpfs, index); + } + + mlx5_core_dbg(dev, "MPFS mac added %pM, index (%d)\n", mac, index); +abort: + mutex_unlock(&mpfs->lock); + return err; +} + +int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) +{ + struct mlx5_mpfs *mpfs = dev->priv.mpfs; + struct l2table_node *l2addr; + int err = 0; + u32 index; + + if (!MLX5_VPORT_MANAGER(dev)) + return 0; + + mutex_lock(&mpfs->lock); + + l2addr = l2addr_hash_find(mpfs->hash, mac, struct l2table_node); + if (!l2addr) { + err = -ENOENT; + goto unlock; + } + + index = l2addr->index; + del_l2table_entry_cmd(dev, index); + l2addr_hash_del(l2addr); + free_l2table_index(mpfs, index); + mlx5_core_dbg(dev, "MPFS mac deleted %pM, index (%d)\n", mac, index); +unlock: + mutex_unlock(&mpfs->lock); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h new file mode 100644 index 0000000..4a7b2c3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MLX5_MPFS_H__ +#define __MLX5_MPFS_H__ + +#include +#include + +/* L2 -mac address based- hash helpers */ +#define MLX5_L2_ADDR_HASH_SIZE (BIT(BITS_PER_BYTE)) +#define MLX5_L2_ADDR_HASH(addr) (addr[5]) + +struct l2addr_node { + struct hlist_node hlist; + u8 addr[ETH_ALEN]; +}; + +#define for_each_l2hash_node(hn, tmp, hash, i) \ + for (i = 0; i < MLX5_L2_ADDR_HASH_SIZE; i++) \ + hlist_for_each_entry_safe(hn, tmp, &(hash)[i], hlist) + +#define l2addr_hash_find(hash, mac, type) ({ \ + int ix = MLX5_L2_ADDR_HASH(mac); \ + bool found = false; \ + type *ptr = NULL; \ + \ + hlist_for_each_entry(ptr, &(hash)[ix], node.hlist) \ + if (ether_addr_equal(ptr->node.addr, mac)) {\ + found = true; \ + break; \ + } \ + if (!found) \ + ptr = NULL; \ + ptr; \ +}) + +#define l2addr_hash_add(hash, mac, type, gfp) ({ \ + int ix = MLX5_L2_ADDR_HASH(mac); \ + type *ptr = NULL; \ + \ + ptr = kzalloc(sizeof(type), gfp); \ + if (ptr) { \ + ether_addr_copy(ptr->node.addr, mac); \ + hlist_add_head(&ptr->node.hlist, &(hash)[ix]);\ + } \ + ptr; \ +}) + +#define l2addr_hash_del(ptr) ({ \ + hlist_del(&(ptr)->node.hlist); \ + kfree(ptr); \ +}) + +#ifdef CONFIG_MLX5_MPFS +int mlx5_mpfs_init(struct mlx5_core_dev *dev); +void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac); +int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac); +#else /* #ifndef CONFIG_MLX5_MPFS */ +static inline int mlx5_mpfs_init(struct mlx5_core_dev *dev) { return 0; } +static inline void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev) {} +static inline int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; } +static inline int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; } +#endif +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mad.c b/drivers/net/ethernet/mellanox/mlx5/core/mad.c new file mode 100644 index 0000000..3a3b000 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/mad.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, + u16 opmod, u8 port) +{ + int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out); + int inlen = MLX5_ST_SZ_BYTES(mad_ifc_in); + int err = -ENOMEM; + void *data; + void *resp; + u32 *out; + u32 *in; + + in = kzalloc(inlen, GFP_KERNEL); + out = kzalloc(outlen, GFP_KERNEL); + if (!in || !out) + goto out; + + MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC); + MLX5_SET(mad_ifc_in, in, op_mod, opmod); + MLX5_SET(mad_ifc_in, in, port, port); + + data = MLX5_ADDR_OF(mad_ifc_in, in, mad); + memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad)); + + err = mlx5_cmd_exec(dev, in, inlen, out, outlen); + if (err) + goto out; + + resp = MLX5_ADDR_OF(mad_ifc_out, out, response_mad_packet); + memcpy(outb, resp, + MLX5_FLD_SZ_BYTES(mad_ifc_out, response_mad_packet)); + +out: + kfree(out); + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_mad_ifc); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c new file mode 100644 index 0000000..e2c465b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -0,0 +1,1702 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_RFS_ACCEL +#include +#endif +#include +#include "mlx5_core.h" +#include "fs_core.h" +#include "lib/mpfs.h" +#include "eswitch.h" +#include "lib/mlx5.h" +#include "fpga/core.h" +#include "fpga/ipsec.h" +#include "accel/ipsec.h" +#include "lib/clock.h" + +MODULE_AUTHOR("Eli Cohen "); +MODULE_DESCRIPTION("Mellanox Connect-IB, ConnectX-4 core driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRIVER_VERSION); + +unsigned int mlx5_core_debug_mask; +module_param_named(debug_mask, mlx5_core_debug_mask, uint, 0644); +MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec time, 3 = both. 
Default=0"); + +#define MLX5_DEFAULT_PROF 2 +static unsigned int prof_sel = MLX5_DEFAULT_PROF; +module_param_named(prof_sel, prof_sel, uint, 0444); +MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2"); + +static u32 sw_owner_id[4]; + +enum { + MLX5_ATOMIC_REQ_MODE_BE = 0x0, + MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1, +}; + +static struct mlx5_profile profile[] = { + [0] = { + .mask = 0, + }, + [1] = { + .mask = MLX5_PROF_MASK_QP_SIZE, + .log_max_qp = 12, + }, + [2] = { + .mask = MLX5_PROF_MASK_QP_SIZE | + MLX5_PROF_MASK_MR_CACHE, + .log_max_qp = 18, + .mr_cache[0] = { + .size = 500, + .limit = 250 + }, + .mr_cache[1] = { + .size = 500, + .limit = 250 + }, + .mr_cache[2] = { + .size = 500, + .limit = 250 + }, + .mr_cache[3] = { + .size = 500, + .limit = 250 + }, + .mr_cache[4] = { + .size = 500, + .limit = 250 + }, + .mr_cache[5] = { + .size = 500, + .limit = 250 + }, + .mr_cache[6] = { + .size = 500, + .limit = 250 + }, + .mr_cache[7] = { + .size = 500, + .limit = 250 + }, + .mr_cache[8] = { + .size = 500, + .limit = 250 + }, + .mr_cache[9] = { + .size = 500, + .limit = 250 + }, + .mr_cache[10] = { + .size = 500, + .limit = 250 + }, + .mr_cache[11] = { + .size = 500, + .limit = 250 + }, + .mr_cache[12] = { + .size = 64, + .limit = 32 + }, + .mr_cache[13] = { + .size = 32, + .limit = 16 + }, + .mr_cache[14] = { + .size = 16, + .limit = 8 + }, + .mr_cache[15] = { + .size = 8, + .limit = 4 + }, + .mr_cache[16] = { + .size = 8, + .limit = 4 + }, + .mr_cache[17] = { + .size = 8, + .limit = 4 + }, + .mr_cache[18] = { + .size = 8, + .limit = 4 + }, + .mr_cache[19] = { + .size = 4, + .limit = 2 + }, + .mr_cache[20] = { + .size = 4, + .limit = 2 + }, + }, +}; + +#define FW_INIT_TIMEOUT_MILI 2000 +#define FW_INIT_WAIT_MS 2 +#define FW_PRE_INIT_TIMEOUT_MILI 10000 + +static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili) +{ + unsigned long end = jiffies + msecs_to_jiffies(max_wait_mili); + int err = 0; + + while (fw_initializing(dev)) { + if (time_after(jiffies, end)) { + err = -EBUSY; + break; + } + msleep(FW_INIT_WAIT_MS); + } + + return err; +} + +static void mlx5_set_driver_version(struct mlx5_core_dev *dev) +{ + int driver_ver_sz = MLX5_FLD_SZ_BYTES(set_driver_version_in, + driver_version); + u8 in[MLX5_ST_SZ_BYTES(set_driver_version_in)] = {0}; + u8 out[MLX5_ST_SZ_BYTES(set_driver_version_out)] = {0}; + int remaining_size = driver_ver_sz; + char *string; + + if (!MLX5_CAP_GEN(dev, driver_version)) + return; + + string = MLX5_ADDR_OF(set_driver_version_in, in, driver_version); + + strncpy(string, "Linux", remaining_size); + + remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); + strncat(string, ",", remaining_size); + + remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); + strncat(string, DRIVER_NAME, remaining_size); + + remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); + strncat(string, ",", remaining_size); + + remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); + strncat(string, DRIVER_VERSION, remaining_size); + + /*Send the command*/ + MLX5_SET(set_driver_version_in, in, opcode, + MLX5_CMD_OP_SET_DRIVER_VERSION); + + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int set_dma_caps(struct pci_dev *pdev) +{ + int err; + + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); + return err; + } + } 
+ + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, + "Warning: couldn't set 64-bit consistent PCI DMA mask\n"); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, + "Can't set consistent PCI DMA mask, aborting\n"); + return err; + } + } + + dma_set_max_seg_size(&pdev->dev, 2u * 1024 * 1024 * 1024); + return err; +} + +static int mlx5_pci_enable_device(struct mlx5_core_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + int err = 0; + + mutex_lock(&dev->pci_status_mutex); + if (dev->pci_status == MLX5_PCI_STATUS_DISABLED) { + err = pci_enable_device(pdev); + if (!err) + dev->pci_status = MLX5_PCI_STATUS_ENABLED; + } + mutex_unlock(&dev->pci_status_mutex); + + return err; +} + +static void mlx5_pci_disable_device(struct mlx5_core_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + + mutex_lock(&dev->pci_status_mutex); + if (dev->pci_status == MLX5_PCI_STATUS_ENABLED) { + pci_disable_device(pdev); + dev->pci_status = MLX5_PCI_STATUS_DISABLED; + } + mutex_unlock(&dev->pci_status_mutex); +} + +static int request_bar(struct pci_dev *pdev) +{ + int err = 0; + + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing registers BAR, aborting\n"); + return -ENODEV; + } + + err = pci_request_regions(pdev, DRIVER_NAME); + if (err) + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); + + return err; +} + +static void release_bar(struct pci_dev *pdev) +{ + pci_release_regions(pdev); +} + +static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_eq_table *table = &priv->eq_table; + int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); + int nvec; + int err; + + nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + + MLX5_EQ_VEC_COMP_BASE; + nvec = min_t(int, nvec, num_eqs); + if (nvec <= MLX5_EQ_VEC_COMP_BASE) + return -ENOMEM; + + priv->irq_info = kcalloc(nvec, sizeof(*priv->irq_info), GFP_KERNEL); + if (!priv->irq_info) + return -ENOMEM; + + nvec = pci_alloc_irq_vectors(dev->pdev, + MLX5_EQ_VEC_COMP_BASE + 1, nvec, + PCI_IRQ_MSIX); + if (nvec < 0) { + err = nvec; + goto err_free_irq_info; + } + + table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE; + + return 0; + +err_free_irq_info: + kfree(priv->irq_info); + return err; +} + +static void mlx5_free_irq_vectors(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + pci_free_irq_vectors(dev->pdev); + kfree(priv->irq_info); +} + +struct mlx5_reg_host_endianness { + u8 he; + u8 rsvd[15]; +}; + +#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos)) + +enum { + MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) | + MLX5_DEV_CAP_FLAG_DCT, +}; + +static u16 to_fw_pkey_sz(struct mlx5_core_dev *dev, u32 size) +{ + switch (size) { + case 128: + return 0; + case 256: + return 1; + case 512: + return 2; + case 1024: + return 3; + case 2048: + return 4; + case 4096: + return 5; + default: + mlx5_core_warn(dev, "invalid pkey table size %d\n", size); + return 0; + } +} + +static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev, + enum mlx5_cap_type cap_type, + enum mlx5_cap_mode cap_mode) +{ + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)]; + int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *out, *hca_caps; + u16 opmod = (cap_type << 1) | (cap_mode & 0x01); + int err; + + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_hca_cap_in, in, opcode, 
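	/*
	 * Descriptive note added for clarity, not in the original patch:
	 * op_mod (set just below) packs the capability selector as
	 * (cap_type << 1) | cap_mode, so the low bit picks between
	 * HCA_CAP_OPMOD_GET_CUR and HCA_CAP_OPMOD_GET_MAX while the remaining
	 * bits name the capability group being queried.
	 */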
MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) { + mlx5_core_warn(dev, + "QUERY_HCA_CAP : type(%x) opmode(%x) Failed(%d)\n", + cap_type, cap_mode, err); + goto query_ex; + } + + hca_caps = MLX5_ADDR_OF(query_hca_cap_out, out, capability); + + switch (cap_mode) { + case HCA_CAP_OPMOD_GET_MAX: + memcpy(dev->caps.hca_max[cap_type], hca_caps, + MLX5_UN_SZ_BYTES(hca_cap_union)); + break; + case HCA_CAP_OPMOD_GET_CUR: + memcpy(dev->caps.hca_cur[cap_type], hca_caps, + MLX5_UN_SZ_BYTES(hca_cap_union)); + break; + default: + mlx5_core_warn(dev, + "Tried to query dev cap type(%x) with wrong opmode(%x)\n", + cap_type, cap_mode); + err = -EINVAL; + break; + } +query_ex: + kfree(out); + return err; +} + +int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type) +{ + int ret; + + ret = mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_CUR); + if (ret) + return ret; + return mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_MAX); +} + +static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz, int opmod) +{ + u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)] = {0}; + + MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP); + MLX5_SET(set_hca_cap_in, in, op_mod, opmod << 1); + return mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); +} + +static int handle_hca_cap_atomic(struct mlx5_core_dev *dev) +{ + void *set_ctx; + void *set_hca_cap; + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); + int req_endianness; + int err; + + if (MLX5_CAP_GEN(dev, atomic)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC); + if (err) + return err; + } else { + return 0; + } + + req_endianness = + MLX5_CAP_ATOMIC(dev, + supported_atomic_req_8B_endianness_mode_1); + + if (req_endianness != MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS) + return 0; + + set_ctx = kzalloc(set_sz, GFP_KERNEL); + if (!set_ctx) + return -ENOMEM; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); + + /* Set requestor to host endianness */ + MLX5_SET(atomic_caps, set_hca_cap, atomic_req_8B_endianness_mode, + MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS); + + err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC); + + kfree(set_ctx); + return err; +} + +static int handle_hca_cap(struct mlx5_core_dev *dev) +{ + void *set_ctx = NULL; + struct mlx5_profile *prof = dev->profile; + int err = -ENOMEM; + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); + void *set_hca_cap; + + set_ctx = kzalloc(set_sz, GFP_KERNEL); + if (!set_ctx) + goto query_ex; + + err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL); + if (err) + goto query_ex; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, + capability); + memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_GENERAL], + MLX5_ST_SZ_BYTES(cmd_hca_cap)); + + mlx5_core_dbg(dev, "Current Pkey table size %d Setting new size %d\n", + mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size)), + 128); + /* we limit the size of the pkey table to 128 entries for now */ + MLX5_SET(cmd_hca_cap, set_hca_cap, pkey_table_size, + to_fw_pkey_sz(dev, 128)); + + /* Check log_max_qp from HCA caps to set in current profile */ + if (MLX5_CAP_GEN_MAX(dev, log_max_qp) < profile[prof_sel].log_max_qp) { + mlx5_core_warn(dev, "log_max_qp value in current profile is %d, changing it to HCA capability limit (%d)\n", + profile[prof_sel].log_max_qp, + MLX5_CAP_GEN_MAX(dev, log_max_qp)); + profile[prof_sel].log_max_qp = MLX5_CAP_GEN_MAX(dev, log_max_qp); + } + if (prof->mask & MLX5_PROF_MASK_QP_SIZE) + 
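	/*
	 * Worked example, illustrative only: the default profile
	 * (prof_sel = 2) requests log_max_qp = 18, i.e. up to 256K QPs.
	 * If the HCA reports a smaller log_max_qp (say 17) in its maximum
	 * capabilities, the warning above fires, the profile is clamped to
	 * 17, and that clamped value is what gets written back to the
	 * firmware here.
	 */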
MLX5_SET(cmd_hca_cap, set_hca_cap, log_max_qp, + prof->log_max_qp); + + /* disable cmdif checksum */ + MLX5_SET(cmd_hca_cap, set_hca_cap, cmdif_checksum, 0); + + /* Enable 4K UAR only when HCA supports it and page size is bigger + * than 4K. + */ + if (MLX5_CAP_GEN_MAX(dev, uar_4k) && PAGE_SIZE > 4096) + MLX5_SET(cmd_hca_cap, set_hca_cap, uar_4k, 1); + + MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12); + + if (MLX5_CAP_GEN_MAX(dev, cache_line_128byte)) + MLX5_SET(cmd_hca_cap, + set_hca_cap, + cache_line_128byte, + cache_line_size() >= 128 ? 1 : 0); + + if (MLX5_CAP_GEN_MAX(dev, dct)) + MLX5_SET(cmd_hca_cap, set_hca_cap, dct, 1); + + if (MLX5_CAP_GEN_MAX(dev, num_vhca_ports)) + MLX5_SET(cmd_hca_cap, + set_hca_cap, + num_vhca_ports, + MLX5_CAP_GEN_MAX(dev, num_vhca_ports)); + + err = set_caps(dev, set_ctx, set_sz, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); + +query_ex: + kfree(set_ctx); + return err; +} + +static int set_hca_ctrl(struct mlx5_core_dev *dev) +{ + struct mlx5_reg_host_endianness he_in; + struct mlx5_reg_host_endianness he_out; + int err; + + if (!mlx5_core_is_pf(dev)) + return 0; + + memset(&he_in, 0, sizeof(he_in)); + he_in.he = MLX5_SET_HOST_ENDIANNESS; + err = mlx5_core_access_reg(dev, &he_in, sizeof(he_in), + &he_out, sizeof(he_out), + MLX5_REG_HOST_ENDIANNESS, 0, 1); + return err; +} + +static int mlx5_core_set_hca_defaults(struct mlx5_core_dev *dev) +{ + int ret = 0; + + /* Disable local_lb by default */ + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) + ret = mlx5_nic_vport_update_local_lb(dev, false); + + return ret; +} + +int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id) +{ + u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {0}; + + MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA); + MLX5_SET(enable_hca_in, in, function_id, func_id); + return mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); +} + +int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id) +{ + u32 out[MLX5_ST_SZ_DW(disable_hca_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {0}; + + MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA); + MLX5_SET(disable_hca_in, in, function_id, func_id); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev) +{ + u32 timer_h, timer_h1, timer_l; + + timer_h = ioread32be(&dev->iseg->internal_timer_h); + timer_l = ioread32be(&dev->iseg->internal_timer_l); + timer_h1 = ioread32be(&dev->iseg->internal_timer_h); + if (timer_h != timer_h1) /* wrap around */ + timer_l = ioread32be(&dev->iseg->internal_timer_l); + + return (u64)timer_l | (u64)timer_h1 << 32; +} + +static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) +{ + struct mlx5_priv *priv = &mdev->priv; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); + + if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { + mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); + return -ENOMEM; + } + + cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node), + priv->irq_info[i].mask); + + if (IS_ENABLED(CONFIG_SMP) && + irq_set_affinity_hint(irq, priv->irq_info[i].mask)) + mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq); + + return 0; +} + +static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) +{ + struct mlx5_priv *priv = &mdev->priv; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); + + irq_set_affinity_hint(irq, NULL); + 
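	/*
	 * Descriptive note added for clarity, not in the original patch: the
	 * hint being cleared here was set by mlx5_irq_set_affinity_hint()
	 * above to cpumask_local_spread(i, priv->numa_node), i.e. completion
	 * vector i is steered to the i-th CPU counted NUMA-locally first,
	 * spilling over to remote CPUs once the local node is exhausted.
	 */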
free_cpumask_var(priv->irq_info[i].mask); +} + +static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev) +{ + int err; + int i; + + for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) { + err = mlx5_irq_set_affinity_hint(mdev, i); + if (err) + goto err_out; + } + + return 0; + +err_out: + for (i--; i >= 0; i--) + mlx5_irq_clear_affinity_hint(mdev, i); + + return err; +} + +static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev) +{ + int i; + + for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) + mlx5_irq_clear_affinity_hint(mdev, i); +} + +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, + unsigned int *irqn) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + struct mlx5_eq *eq, *n; + int err = -ENOENT; + + spin_lock(&table->lock); + list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) { + if (eq->index == vector) { + *eqn = eq->eqn; + *irqn = eq->irqn; + err = 0; + break; + } + } + spin_unlock(&table->lock); + + return err; +} +EXPORT_SYMBOL(mlx5_vector2eqn); + +struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + struct mlx5_eq *eq; + + spin_lock(&table->lock); + list_for_each_entry(eq, &table->comp_eqs_list, list) + if (eq->eqn == eqn) { + spin_unlock(&table->lock); + return eq; + } + + spin_unlock(&table->lock); + + return ERR_PTR(-ENOENT); +} + +static void free_comp_eqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + struct mlx5_eq *eq, *n; + +#ifdef CONFIG_RFS_ACCEL + if (dev->rmap) { + free_irq_cpu_rmap(dev->rmap); + dev->rmap = NULL; + } +#endif + spin_lock(&table->lock); + list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) { + list_del(&eq->list); + spin_unlock(&table->lock); + if (mlx5_destroy_unmap_eq(dev, eq)) + mlx5_core_warn(dev, "failed to destroy EQ 0x%x\n", + eq->eqn); + kfree(eq); + spin_lock(&table->lock); + } + spin_unlock(&table->lock); +} + +static int alloc_comp_eqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = &dev->priv.eq_table; + char name[MLX5_MAX_IRQ_NAME]; + struct mlx5_eq *eq; + int ncomp_vec; + int nent; + int err; + int i; + + INIT_LIST_HEAD(&table->comp_eqs_list); + ncomp_vec = table->num_comp_vectors; + nent = MLX5_COMP_EQ_SIZE; +#ifdef CONFIG_RFS_ACCEL + dev->rmap = alloc_irq_cpu_rmap(ncomp_vec); + if (!dev->rmap) + return -ENOMEM; +#endif + for (i = 0; i < ncomp_vec; i++) { + eq = kzalloc(sizeof(*eq), GFP_KERNEL); + if (!eq) { + err = -ENOMEM; + goto clean; + } + +#ifdef CONFIG_RFS_ACCEL + irq_cpu_rmap_add(dev->rmap, pci_irq_vector(dev->pdev, + MLX5_EQ_VEC_COMP_BASE + i)); +#endif + snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i); + err = mlx5_create_map_eq(dev, eq, + i + MLX5_EQ_VEC_COMP_BASE, nent, 0, + name, MLX5_EQ_TYPE_COMP); + if (err) { + kfree(eq); + goto clean; + } + mlx5_core_dbg(dev, "allocated completion EQN %d\n", eq->eqn); + eq->index = i; + spin_lock(&table->lock); + list_add_tail(&eq->list, &table->comp_eqs_list); + spin_unlock(&table->lock); + } + + return 0; + +clean: + free_comp_eqs(dev); + return err; +} + +static int mlx5_core_set_issi(struct mlx5_core_dev *dev) +{ + u32 query_in[MLX5_ST_SZ_DW(query_issi_in)] = {0}; + u32 query_out[MLX5_ST_SZ_DW(query_issi_out)] = {0}; + u32 sup_issi; + int err; + + MLX5_SET(query_issi_in, query_in, opcode, MLX5_CMD_OP_QUERY_ISSI); + err = mlx5_cmd_exec(dev, query_in, sizeof(query_in), + query_out, sizeof(query_out)); + if (err) { + u32 syndrome; + u8 status; + + 
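		/*
		 * Descriptive note added for clarity, not in the original
		 * patch: a zero status below means the command failed outright
		 * rather than being refused by the firmware, and
		 * MLX5_DRIVER_SYND marks a syndrome synthesized by the driver
		 * itself; both are treated as hard errors.  Any other non-zero
		 * status is taken to mean older firmware that simply does not
		 * implement QUERY_ISSI, so the driver falls back to ISSI 0.
		 */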
mlx5_cmd_mbox_status(query_out, &status, &syndrome); + if (!status || syndrome == MLX5_DRIVER_SYND) { + mlx5_core_err(dev, "Failed to query ISSI err(%d) status(%d) synd(%d)\n", + err, status, syndrome); + return err; + } + + mlx5_core_warn(dev, "Query ISSI is not supported by FW, ISSI is 0\n"); + dev->issi = 0; + return 0; + } + + sup_issi = MLX5_GET(query_issi_out, query_out, supported_issi_dw0); + + if (sup_issi & (1 << 1)) { + u32 set_in[MLX5_ST_SZ_DW(set_issi_in)] = {0}; + u32 set_out[MLX5_ST_SZ_DW(set_issi_out)] = {0}; + + MLX5_SET(set_issi_in, set_in, opcode, MLX5_CMD_OP_SET_ISSI); + MLX5_SET(set_issi_in, set_in, current_issi, 1); + err = mlx5_cmd_exec(dev, set_in, sizeof(set_in), + set_out, sizeof(set_out)); + if (err) { + mlx5_core_err(dev, "Failed to set ISSI to 1 err(%d)\n", + err); + return err; + } + + dev->issi = 1; + + return 0; + } else if (sup_issi & (1 << 0) || !sup_issi) { + return 0; + } + + return -EOPNOTSUPP; +} + +static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv) +{ + struct pci_dev *pdev = dev->pdev; + int err = 0; + + pci_set_drvdata(dev->pdev, dev); + strncpy(priv->name, dev_name(&pdev->dev), MLX5_MAX_NAME_LEN); + priv->name[MLX5_MAX_NAME_LEN - 1] = 0; + + mutex_init(&priv->pgdir_mutex); + INIT_LIST_HEAD(&priv->pgdir_list); + spin_lock_init(&priv->mkey_lock); + + mutex_init(&priv->alloc_mutex); + + priv->numa_node = dev_to_node(&dev->pdev->dev); + + priv->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), mlx5_debugfs_root); + if (!priv->dbg_root) + return -ENOMEM; + + err = mlx5_pci_enable_device(dev); + if (err) { + dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); + goto err_dbg; + } + + err = request_bar(pdev); + if (err) { + dev_err(&pdev->dev, "error requesting BARs, aborting\n"); + goto err_disable; + } + + pci_set_master(pdev); + + err = set_dma_caps(pdev); + if (err) { + dev_err(&pdev->dev, "Failed setting DMA capabilities mask, aborting\n"); + goto err_clr_master; + } + + dev->iseg_base = pci_resource_start(dev->pdev, 0); + dev->iseg = ioremap(dev->iseg_base, sizeof(*dev->iseg)); + if (!dev->iseg) { + err = -ENOMEM; + dev_err(&pdev->dev, "Failed mapping initialization segment, aborting\n"); + goto err_clr_master; + } + + return 0; + +err_clr_master: + pci_clear_master(dev->pdev); + release_bar(dev->pdev); +err_disable: + mlx5_pci_disable_device(dev); + +err_dbg: + debugfs_remove(priv->dbg_root); + return err; +} + +static void mlx5_pci_close(struct mlx5_core_dev *dev, struct mlx5_priv *priv) +{ + iounmap(dev->iseg); + pci_clear_master(dev->pdev); + release_bar(dev->pdev); + mlx5_pci_disable_device(dev); + debugfs_remove(priv->dbg_root); +} + +static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv) +{ + struct pci_dev *pdev = dev->pdev; + int err; + + err = mlx5_query_board_id(dev); + if (err) { + dev_err(&pdev->dev, "query board id failed\n"); + goto out; + } + + err = mlx5_eq_init(dev); + if (err) { + dev_err(&pdev->dev, "failed to initialize eq\n"); + goto out; + } + + err = mlx5_cq_debugfs_init(dev); + if (err) { + dev_err(&pdev->dev, "failed to initialize cq debugfs\n"); + goto err_eq_cleanup; + } + + mlx5_init_qp_table(dev); + + mlx5_init_srq_table(dev); + + mlx5_init_mkey_table(dev); + + mlx5_init_reserved_gids(dev); + + mlx5_init_clock(dev); + + err = mlx5_init_rl_table(dev); + if (err) { + dev_err(&pdev->dev, "Failed to init rate limiting\n"); + goto err_tables_cleanup; + } + + err = mlx5_mpfs_init(dev); + if (err) { + dev_err(&pdev->dev, "Failed to init l2 table %d\n", err); + goto 
err_rl_cleanup; + } + + err = mlx5_eswitch_init(dev); + if (err) { + dev_err(&pdev->dev, "Failed to init eswitch %d\n", err); + goto err_mpfs_cleanup; + } + + err = mlx5_sriov_init(dev); + if (err) { + dev_err(&pdev->dev, "Failed to init sriov %d\n", err); + goto err_eswitch_cleanup; + } + + err = mlx5_fpga_init(dev); + if (err) { + dev_err(&pdev->dev, "Failed to init fpga device %d\n", err); + goto err_sriov_cleanup; + } + + return 0; + +err_sriov_cleanup: + mlx5_sriov_cleanup(dev); +err_eswitch_cleanup: + mlx5_eswitch_cleanup(dev->priv.eswitch); +err_mpfs_cleanup: + mlx5_mpfs_cleanup(dev); +err_rl_cleanup: + mlx5_cleanup_rl_table(dev); +err_tables_cleanup: + mlx5_cleanup_mkey_table(dev); + mlx5_cleanup_srq_table(dev); + mlx5_cleanup_qp_table(dev); + mlx5_cq_debugfs_cleanup(dev); + +err_eq_cleanup: + mlx5_eq_cleanup(dev); + +out: + return err; +} + +static void mlx5_cleanup_once(struct mlx5_core_dev *dev) +{ + mlx5_fpga_cleanup(dev); + mlx5_sriov_cleanup(dev); + mlx5_eswitch_cleanup(dev->priv.eswitch); + mlx5_mpfs_cleanup(dev); + mlx5_cleanup_rl_table(dev); + mlx5_cleanup_clock(dev); + mlx5_cleanup_reserved_gids(dev); + mlx5_cleanup_mkey_table(dev); + mlx5_cleanup_srq_table(dev); + mlx5_cleanup_qp_table(dev); + mlx5_cq_debugfs_cleanup(dev); + mlx5_eq_cleanup(dev); +} + +static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, + bool boot) +{ + struct pci_dev *pdev = dev->pdev; + int err; + + mutex_lock(&dev->intf_state_mutex); + if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { + dev_warn(&dev->pdev->dev, "%s: interface is up, NOP\n", + __func__); + goto out; + } + + dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev), + fw_rev_min(dev), fw_rev_sub(dev)); + + /* Only PFs hold the relevant PCIe information for this query */ + if (mlx5_core_is_pf(dev)) + pcie_print_link_status(dev->pdev); + + /* on load removing any previous indication of internal error, device is + * up + */ + dev->state = MLX5_DEVICE_STATE_UP; + + /* wait for firmware to accept initialization segments configurations + */ + err = wait_fw_init(dev, FW_PRE_INIT_TIMEOUT_MILI); + if (err) { + dev_err(&dev->pdev->dev, "Firmware over %d MS in pre-initializing state, aborting\n", + FW_PRE_INIT_TIMEOUT_MILI); + goto out_err; + } + + err = mlx5_cmd_init(dev); + if (err) { + dev_err(&pdev->dev, "Failed initializing command interface, aborting\n"); + goto out_err; + } + + err = wait_fw_init(dev, FW_INIT_TIMEOUT_MILI); + if (err) { + dev_err(&dev->pdev->dev, "Firmware over %d MS in initializing state, aborting\n", + FW_INIT_TIMEOUT_MILI); + goto err_cmd_cleanup; + } + + err = mlx5_core_enable_hca(dev, 0); + if (err) { + dev_err(&pdev->dev, "enable hca failed\n"); + goto err_cmd_cleanup; + } + + err = mlx5_core_set_issi(dev); + if (err) { + dev_err(&pdev->dev, "failed to set issi\n"); + goto err_disable_hca; + } + + err = mlx5_satisfy_startup_pages(dev, 1); + if (err) { + dev_err(&pdev->dev, "failed to allocate boot pages\n"); + goto err_disable_hca; + } + + err = set_hca_ctrl(dev); + if (err) { + dev_err(&pdev->dev, "set_hca_ctrl failed\n"); + goto reclaim_boot_pages; + } + + err = handle_hca_cap(dev); + if (err) { + dev_err(&pdev->dev, "handle_hca_cap failed\n"); + goto reclaim_boot_pages; + } + + err = handle_hca_cap_atomic(dev); + if (err) { + dev_err(&pdev->dev, "handle_hca_cap_atomic failed\n"); + goto reclaim_boot_pages; + } + + err = mlx5_satisfy_startup_pages(dev, 0); + if (err) { + dev_err(&pdev->dev, "failed to allocate init pages\n"); + goto reclaim_boot_pages; + } + + err = 
mlx5_pagealloc_start(dev); + if (err) { + dev_err(&pdev->dev, "mlx5_pagealloc_start failed\n"); + goto reclaim_boot_pages; + } + + err = mlx5_cmd_init_hca(dev, sw_owner_id); + if (err) { + dev_err(&pdev->dev, "init hca failed\n"); + goto err_pagealloc_stop; + } + + mlx5_set_driver_version(dev); + + mlx5_start_health_poll(dev); + + err = mlx5_query_hca_caps(dev); + if (err) { + dev_err(&pdev->dev, "query hca failed\n"); + goto err_stop_poll; + } + + if (boot) { + err = mlx5_init_once(dev, priv); + if (err) { + dev_err(&pdev->dev, "sw objs init failed\n"); + goto err_stop_poll; + } + } + + err = mlx5_alloc_irq_vectors(dev); + if (err) { + dev_err(&pdev->dev, "alloc irq vectors failed\n"); + goto err_cleanup_once; + } + + dev->priv.uar = mlx5_get_uars_page(dev); + if (IS_ERR(dev->priv.uar)) { + dev_err(&pdev->dev, "Failed allocating uar, aborting\n"); + err = PTR_ERR(dev->priv.uar); + goto err_disable_msix; + } + + err = mlx5_start_eqs(dev); + if (err) { + dev_err(&pdev->dev, "Failed to start pages and async EQs\n"); + goto err_put_uars; + } + + err = alloc_comp_eqs(dev); + if (err) { + dev_err(&pdev->dev, "Failed to alloc completion EQs\n"); + goto err_stop_eqs; + } + + err = mlx5_irq_set_affinity_hints(dev); + if (err) { + dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n"); + goto err_affinity_hints; + } + + err = mlx5_fpga_device_start(dev); + if (err) { + dev_err(&pdev->dev, "fpga device start failed %d\n", err); + goto err_fpga_start; + } + + err = mlx5_accel_ipsec_init(dev); + if (err) { + dev_err(&pdev->dev, "IPSec device start failed %d\n", err); + goto err_ipsec_start; + } + + err = mlx5_init_fs(dev); + if (err) { + dev_err(&pdev->dev, "Failed to init flow steering\n"); + goto err_fs; + } + + err = mlx5_core_set_hca_defaults(dev); + if (err) { + dev_err(&pdev->dev, "Failed to set hca defaults\n"); + goto err_fs; + } + + err = mlx5_sriov_attach(dev); + if (err) { + dev_err(&pdev->dev, "sriov init failed %d\n", err); + goto err_sriov; + } + + if (mlx5_device_registered(dev)) { + mlx5_attach_device(dev); + } else { + err = mlx5_register_device(dev); + if (err) { + dev_err(&pdev->dev, "mlx5_register_device failed %d\n", err); + goto err_reg_dev; + } + } + + set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); +out: + mutex_unlock(&dev->intf_state_mutex); + + return 0; + +err_reg_dev: + mlx5_sriov_detach(dev); + +err_sriov: + mlx5_cleanup_fs(dev); + +err_fs: + mlx5_accel_ipsec_cleanup(dev); + +err_ipsec_start: + mlx5_fpga_device_stop(dev); + +err_fpga_start: + mlx5_irq_clear_affinity_hints(dev); + +err_affinity_hints: + free_comp_eqs(dev); + +err_stop_eqs: + mlx5_stop_eqs(dev); + +err_put_uars: + mlx5_put_uars_page(dev, priv->uar); + +err_disable_msix: + mlx5_free_irq_vectors(dev); + +err_cleanup_once: + if (boot) + mlx5_cleanup_once(dev); + +err_stop_poll: + mlx5_stop_health_poll(dev); + if (mlx5_cmd_teardown_hca(dev)) { + dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n"); + goto out_err; + } + +err_pagealloc_stop: + mlx5_pagealloc_stop(dev); + +reclaim_boot_pages: + mlx5_reclaim_startup_pages(dev); + +err_disable_hca: + mlx5_core_disable_hca(dev, 0); + +err_cmd_cleanup: + mlx5_cmd_cleanup(dev); + +out_err: + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + mutex_unlock(&dev->intf_state_mutex); + + return err; +} + +static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, + bool cleanup) +{ + int err = 0; + + if (cleanup) + mlx5_drain_health_recovery(dev); + + mutex_lock(&dev->intf_state_mutex); + if (!test_bit(MLX5_INTERFACE_STATE_UP, 
&dev->intf_state)) { + dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n", + __func__); + if (cleanup) + mlx5_cleanup_once(dev); + goto out; + } + + clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + + if (mlx5_device_registered(dev)) + mlx5_detach_device(dev); + + mlx5_sriov_detach(dev); + mlx5_cleanup_fs(dev); + mlx5_accel_ipsec_cleanup(dev); + mlx5_fpga_device_stop(dev); + mlx5_irq_clear_affinity_hints(dev); + free_comp_eqs(dev); + mlx5_stop_eqs(dev); + mlx5_put_uars_page(dev, priv->uar); + mlx5_free_irq_vectors(dev); + if (cleanup) + mlx5_cleanup_once(dev); + mlx5_stop_health_poll(dev); + err = mlx5_cmd_teardown_hca(dev); + if (err) { + dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n"); + goto out; + } + mlx5_pagealloc_stop(dev); + mlx5_reclaim_startup_pages(dev); + mlx5_core_disable_hca(dev, 0); + mlx5_cmd_cleanup(dev); + +out: + mutex_unlock(&dev->intf_state_mutex); + return err; +} + +struct mlx5_core_event_handler { + void (*event)(struct mlx5_core_dev *dev, + enum mlx5_dev_event event, + void *data); +}; + +static const struct devlink_ops mlx5_devlink_ops = { +#ifdef CONFIG_MLX5_ESWITCH + .eswitch_mode_set = mlx5_devlink_eswitch_mode_set, + .eswitch_mode_get = mlx5_devlink_eswitch_mode_get, + .eswitch_inline_mode_set = mlx5_devlink_eswitch_inline_mode_set, + .eswitch_inline_mode_get = mlx5_devlink_eswitch_inline_mode_get, + .eswitch_encap_mode_set = mlx5_devlink_eswitch_encap_mode_set, + .eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get, +#endif +}; + +#define MLX5_IB_MOD "mlx5_ib" +static int init_one(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct mlx5_core_dev *dev; + struct devlink *devlink; + struct mlx5_priv *priv; + int err; + + devlink = devlink_alloc(&mlx5_devlink_ops, sizeof(*dev)); + if (!devlink) { + dev_err(&pdev->dev, "kzalloc failed\n"); + return -ENOMEM; + } + + dev = devlink_priv(devlink); + priv = &dev->priv; + priv->pci_dev_data = id->driver_data; + + pci_set_drvdata(pdev, dev); + + dev->pdev = pdev; + dev->event = mlx5_core_event; + dev->profile = &profile[prof_sel]; + + INIT_LIST_HEAD(&priv->ctx_list); + spin_lock_init(&priv->ctx_lock); + mutex_init(&dev->pci_status_mutex); + mutex_init(&dev->intf_state_mutex); + + INIT_LIST_HEAD(&priv->waiting_events_list); + priv->is_accum_events = false; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + err = init_srcu_struct(&priv->pfault_srcu); + if (err) { + dev_err(&pdev->dev, "init_srcu_struct failed with error code %d\n", + err); + goto clean_dev; + } +#endif + mutex_init(&priv->bfregs.reg_head.lock); + mutex_init(&priv->bfregs.wc_head.lock); + INIT_LIST_HEAD(&priv->bfregs.reg_head.list); + INIT_LIST_HEAD(&priv->bfregs.wc_head.list); + + err = mlx5_pci_init(dev, priv); + if (err) { + dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err); + goto clean_srcu; + } + + err = mlx5_health_init(dev); + if (err) { + dev_err(&pdev->dev, "mlx5_health_init failed with error code %d\n", err); + goto close_pci; + } + + mlx5_pagealloc_init(dev); + + err = mlx5_load_one(dev, priv, true); + if (err) { + dev_err(&pdev->dev, "mlx5_load_one failed with error code %d\n", err); + goto clean_health; + } + + request_module_nowait(MLX5_IB_MOD); + + err = devlink_register(devlink, &pdev->dev); + if (err) + goto clean_load; + + pci_save_state(pdev); + return 0; + +clean_load: + mlx5_unload_one(dev, priv, true); +clean_health: + mlx5_pagealloc_cleanup(dev); + mlx5_health_cleanup(dev); +close_pci: + mlx5_pci_close(dev, priv); +clean_srcu: +#ifdef 
CONFIG_INFINIBAND_ON_DEMAND_PAGING + cleanup_srcu_struct(&priv->pfault_srcu); +clean_dev: +#endif + devlink_free(devlink); + + return err; +} + +static void remove_one(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct devlink *devlink = priv_to_devlink(dev); + struct mlx5_priv *priv = &dev->priv; + + devlink_unregister(devlink); + mlx5_unregister_device(dev); + + if (mlx5_unload_one(dev, priv, true)) { + dev_err(&dev->pdev->dev, "mlx5_unload_one failed\n"); + mlx5_health_cleanup(dev); + return; + } + + mlx5_pagealloc_cleanup(dev); + mlx5_health_cleanup(dev); + mlx5_pci_close(dev, priv); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + cleanup_srcu_struct(&priv->pfault_srcu); +#endif + devlink_free(devlink); +} + +static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_priv *priv = &dev->priv; + + dev_info(&pdev->dev, "%s was called\n", __func__); + + mlx5_enter_error_state(dev, false); + mlx5_unload_one(dev, priv, false); + /* In case of kernel call drain the health wq */ + if (state) { + mlx5_drain_health_wq(dev); + mlx5_pci_disable_device(dev); + } + + return state == pci_channel_io_perm_failure ? + PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; +} + +/* wait for the device to show vital signs by waiting + * for the health counter to start counting. + */ +static int wait_vital(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_core_health *health = &dev->priv.health; + const int niter = 100; + u32 last_count = 0; + u32 count; + int i; + + for (i = 0; i < niter; i++) { + count = ioread32be(health->health_counter); + if (count && count != 0xffffffff) { + if (last_count && last_count != count) { + dev_info(&pdev->dev, "Counter value 0x%x after %d iterations\n", count, i); + return 0; + } + last_count = count; + } + msleep(50); + } + + return -ETIMEDOUT; +} + +static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int err; + + dev_info(&pdev->dev, "%s was called\n", __func__); + + err = mlx5_pci_enable_device(dev); + if (err) { + dev_err(&pdev->dev, "%s: mlx5_pci_enable_device failed with error code: %d\n" + , __func__, err); + return PCI_ERS_RESULT_DISCONNECT; + } + + pci_set_master(pdev); + pci_restore_state(pdev); + pci_save_state(pdev); + + if (wait_vital(pdev)) { + dev_err(&pdev->dev, "%s: wait_vital timed out\n", __func__); + return PCI_ERS_RESULT_DISCONNECT; + } + + return PCI_ERS_RESULT_RECOVERED; +} + +static void mlx5_pci_resume(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_priv *priv = &dev->priv; + int err; + + dev_info(&pdev->dev, "%s was called\n", __func__); + + err = mlx5_load_one(dev, priv, false); + if (err) + dev_err(&pdev->dev, "%s: mlx5_load_one failed with error code: %d\n" + , __func__, err); + else + dev_info(&pdev->dev, "%s: device recovered\n", __func__); +} + +static const struct pci_error_handlers mlx5_err_handler = { + .error_detected = mlx5_pci_err_detected, + .slot_reset = mlx5_pci_slot_reset, + .resume = mlx5_pci_resume +}; + +static int mlx5_try_fast_unload(struct mlx5_core_dev *dev) +{ + int ret; + + if (!MLX5_CAP_GEN(dev, force_teardown)) { + mlx5_core_dbg(dev, "force teardown is not supported in the firmware\n"); + return -EOPNOTSUPP; + } + + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5_core_dbg(dev, "Device in internal error state, 
giving up\n"); + return -EAGAIN; + } + + /* Panic tear down fw command will stop the PCI bus communication + * with the HCA, so the health polll is no longer needed. + */ + mlx5_drain_health_wq(dev); + mlx5_stop_health_poll(dev); + + ret = mlx5_cmd_force_teardown_hca(dev); + if (ret) { + mlx5_core_dbg(dev, "Firmware couldn't do fast unload error: %d\n", ret); + mlx5_start_health_poll(dev); + return ret; + } + + mlx5_enter_error_state(dev, true); + + /* Some platforms requiring freeing the IRQ's in the shutdown + * flow. If they aren't freed they can't be allocated after + * kexec. There is no need to cleanup the mlx5_core software + * contexts. + */ + mlx5_irq_clear_affinity_hints(dev); + mlx5_core_eq_free_irqs(dev); + + return 0; +} + +static void shutdown(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_priv *priv = &dev->priv; + int err; + + dev_info(&pdev->dev, "Shutdown was called\n"); + err = mlx5_try_fast_unload(dev); + if (err) + mlx5_unload_one(dev, priv, false); + mlx5_pci_disable_device(dev); +} + +static const struct pci_device_id mlx5_core_pci_table[] = { + { PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_CONNECTIB) }, + { PCI_VDEVICE(MELLANOX, 0x1012), MLX5_PCI_DEV_IS_VF}, /* Connect-IB VF */ + { PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_CONNECTX4) }, + { PCI_VDEVICE(MELLANOX, 0x1014), MLX5_PCI_DEV_IS_VF}, /* ConnectX-4 VF */ + { PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_CONNECTX4_LX) }, + { PCI_VDEVICE(MELLANOX, 0x1016), MLX5_PCI_DEV_IS_VF}, /* ConnectX-4LX VF */ + { PCI_VDEVICE(MELLANOX, 0x1017) }, /* ConnectX-5, PCIe 3.0 */ + { PCI_VDEVICE(MELLANOX, 0x1018), MLX5_PCI_DEV_IS_VF}, /* ConnectX-5 VF */ + { PCI_VDEVICE(MELLANOX, 0x1019) }, /* ConnectX-5 Ex */ + { PCI_VDEVICE(MELLANOX, 0x101a), MLX5_PCI_DEV_IS_VF}, /* ConnectX-5 Ex VF */ + { PCI_VDEVICE(MELLANOX, 0x101b) }, /* ConnectX-6 */ + { PCI_VDEVICE(MELLANOX, 0x101c), MLX5_PCI_DEV_IS_VF}, /* ConnectX-6 VF */ + { PCI_VDEVICE(MELLANOX, 0xa2d2) }, /* BlueField integrated ConnectX-5 network controller */ + { PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF}, /* BlueField integrated ConnectX-5 network controller VF */ + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table); + +void mlx5_disable_device(struct mlx5_core_dev *dev) +{ + mlx5_pci_err_detected(dev->pdev, 0); +} + +void mlx5_recover_device(struct mlx5_core_dev *dev) +{ + mlx5_pci_disable_device(dev); + if (mlx5_pci_slot_reset(dev->pdev) == PCI_ERS_RESULT_RECOVERED) + mlx5_pci_resume(dev->pdev); +} + +static struct pci_driver mlx5_core_driver = { + .name = DRIVER_NAME, + .id_table = mlx5_core_pci_table, + .probe = init_one, + .remove = remove_one, + .shutdown = shutdown, + .err_handler = &mlx5_err_handler, + .sriov_configure = mlx5_core_sriov_configure, +}; + +static void mlx5_core_verify_params(void) +{ + if (prof_sel >= ARRAY_SIZE(profile)) { + pr_warn("mlx5_core: WARNING: Invalid module parameter prof_sel %d, valid range 0-%zu, changing back to default(%d)\n", + prof_sel, + ARRAY_SIZE(profile) - 1, + MLX5_DEFAULT_PROF); + prof_sel = MLX5_DEFAULT_PROF; + } +} + +static int __init init(void) +{ + int err; + + get_random_bytes(&sw_owner_id, sizeof(sw_owner_id)); + + mlx5_core_verify_params(); + mlx5_fpga_ipsec_build_fs_cmds(); + mlx5_register_debugfs(); + + err = pci_register_driver(&mlx5_core_driver); + if (err) + goto err_debug; + +#ifdef CONFIG_MLX5_CORE_EN + mlx5e_init(); +#endif + + return 0; + +err_debug: + mlx5_unregister_debugfs(); + return err; +} + +static void __exit cleanup(void) +{ +#ifdef 
CONFIG_MLX5_CORE_EN + mlx5e_cleanup(); +#endif + pci_unregister_driver(&mlx5_core_driver); + mlx5_unregister_debugfs(); +} + +module_init(init); +module_exit(cleanup); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c new file mode 100644 index 0000000..ba2b09c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) +{ + u32 out[MLX5_ST_SZ_DW(attach_to_mcg_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {0}; + void *gid; + + MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG); + MLX5_SET(attach_to_mcg_in, in, qpn, qpn); + gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_attach_mcg); + +int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) +{ + u32 out[MLX5_ST_SZ_DW(detach_from_mcg_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {0}; + void *gid; + + MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); + MLX5_SET(detach_from_mcg_in, in, qpn, qpn); + gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_detach_mcg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h new file mode 100644 index 0000000..023882d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
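A minimal caller sketch for the multicast helpers in mcg.c above, added for illustration and not part of this patch; the helper name, GID source, and error handling are hypothetical.

static int example_join_mcast(struct mlx5_core_dev *dev,	// hypothetical helper
			      union ib_gid *mgid, u32 qpn)
{
	int err;

	err = mlx5_core_attach_mcg(dev, mgid, qpn);	// QP starts receiving the group
	if (err)
		return err;

	// ... multicast traffic flows to the QP here ...

	return mlx5_core_detach_mcg(dev, mgid, qpn);	// leave the group again
}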
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_CORE_H__ +#define __MLX5_CORE_H__ + +#include +#include +#include +#include +#include +#include + +#define DRIVER_NAME "mlx5_core" +#define DRIVER_VERSION "5.0-0" + +extern uint mlx5_core_debug_mask; + +#define mlx5_core_dbg(__dev, format, ...) \ + dev_dbg(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_dbg_once(__dev, format, ...) \ + dev_dbg_once(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_dbg_mask(__dev, mask, format, ...) \ +do { \ + if ((mask) & mlx5_core_debug_mask) \ + mlx5_core_dbg(__dev, format, ##__VA_ARGS__); \ +} while (0) + +#define mlx5_core_err(__dev, format, ...) \ + dev_err(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_warn(__dev, format, ...) \ + dev_warn(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_info(__dev, format, ...) 
\ + dev_info(&(__dev)->pdev->dev, format, ##__VA_ARGS__) + +enum { + MLX5_CMD_DATA, /* print command payload only */ + MLX5_CMD_TIME, /* print command execution time */ +}; + +enum { + MLX5_DRIVER_STATUS_ABORTED = 0xfe, + MLX5_DRIVER_SYND = 0xbadd00de, +}; + +int mlx5_query_hca_caps(struct mlx5_core_dev *dev); +int mlx5_query_board_id(struct mlx5_core_dev *dev); +int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id); +int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev); +int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev); +void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, + unsigned long param); +void mlx5_core_page_fault(struct mlx5_core_dev *dev, + struct mlx5_pagefault *pfault); +void mlx5_pps_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); +void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); +void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force); +void mlx5_disable_device(struct mlx5_core_dev *dev); +void mlx5_recover_device(struct mlx5_core_dev *dev); +int mlx5_sriov_init(struct mlx5_core_dev *dev); +void mlx5_sriov_cleanup(struct mlx5_core_dev *dev); +int mlx5_sriov_attach(struct mlx5_core_dev *dev); +void mlx5_sriov_detach(struct mlx5_core_dev *dev); +int mlx5_core_sriov_configure(struct pci_dev *dev, int num_vfs); +bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev); +int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id); +int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id); +int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + void *context, u32 *element_id); +int mlx5_modify_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + void *context, u32 element_id, + u32 modify_bitmask); +int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + u32 element_id); +int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev); +u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev); + +int mlx5_eq_init(struct mlx5_core_dev *dev); +void mlx5_eq_cleanup(struct mlx5_core_dev *dev); +int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, + int nent, u64 mask, const char *name, + enum mlx5_eq_type type); +int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq); +int mlx5_eq_add_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq); +int mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq); +int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + u32 *out, int outlen); +int mlx5_start_eqs(struct mlx5_core_dev *dev); +void mlx5_stop_eqs(struct mlx5_core_dev *dev); +/* This function should only be called after mlx5_cmd_force_teardown_hca */ +void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev); +struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn); +u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq *eq); +void mlx5_cq_tasklet_cb(unsigned long data); +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced); +int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); +void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); +int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev); + +int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group, + u8 access_reg_group); +int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 
*mcap, u8 feature_group, + u8 access_reg_group); +int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam, + u8 feature_group, u8 access_reg_group); + +void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev); +void mlx5_lag_remove(struct mlx5_core_dev *dev); + +void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv); +void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv); +void mlx5_attach_device(struct mlx5_core_dev *dev); +void mlx5_detach_device(struct mlx5_core_dev *dev); +bool mlx5_device_registered(struct mlx5_core_dev *dev); +int mlx5_register_device(struct mlx5_core_dev *dev); +void mlx5_unregister_device(struct mlx5_core_dev *dev); +void mlx5_add_dev_by_protocol(struct mlx5_core_dev *dev, int protocol); +void mlx5_remove_dev_by_protocol(struct mlx5_core_dev *dev, int protocol); +struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev); +void mlx5_dev_list_lock(void); +void mlx5_dev_list_unlock(void); +int mlx5_dev_list_trylock(void); +int mlx5_encap_alloc(struct mlx5_core_dev *dev, + int header_type, + size_t size, + void *encap_header, + u32 *encap_id); +void mlx5_encap_dealloc(struct mlx5_core_dev *dev, u32 encap_id); + +int mlx5_modify_header_alloc(struct mlx5_core_dev *dev, + u8 namespace, u8 num_actions, + void *modify_actions, u32 *modify_header_id); +void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev, u32 modify_header_id); + +bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv); + +int mlx5_query_mtpps(struct mlx5_core_dev *dev, u32 *mtpps, u32 mtpps_size); +int mlx5_set_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size); +int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode); +int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode); + +#define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \ + MLX5_CAP_GEN((mdev), pps_modify) && \ + MLX5_CAP_MCAM_FEATURE((mdev), mtpps_fs) && \ + MLX5_CAP_MCAM_FEATURE((mdev), mtpps_enh_out_per_adj)) + +int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw); + +void mlx5e_init(void); +void mlx5e_cleanup(void); + +static inline int mlx5_lag_is_lacp_owner(struct mlx5_core_dev *dev) +{ + /* LACP owner conditions: + * 1) Function is physical. + * 2) LAG is supported by FW. + * 3) LAG is managed by driver (currently the only option). + */ + return MLX5_CAP_GEN(dev, vport_group_manager) && + (MLX5_CAP_GEN(dev, num_lag_ports) > 1) && + MLX5_CAP_GEN(dev, lag_master); +} + +int mlx5_lag_allow(struct mlx5_core_dev *dev); +int mlx5_lag_forbid(struct mlx5_core_dev *dev); + +void mlx5_reload_interface(struct mlx5_core_dev *mdev, int protocol); +#endif /* __MLX5_CORE_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c new file mode 100644 index 0000000..b9736f5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
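A small usage sketch for the debug helpers declared above, added for illustration and not part of this patch; dev, opcode, and delta are hypothetical variables. The debug_mask module parameter gates mlx5_core_dbg_mask() by bit: bit 0 (value 1) corresponds to MLX5_CMD_DATA and bit 1 (value 2) to MLX5_CMD_TIME, matching the parameter description in main.c.

// Printed only when debug_mask has bit 0 set (e.g. modprobe mlx5_core debug_mask=1):
mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_DATA,
		   "dumping payload of command 0x%x\n", opcode);

// Printed only when bit 1 is set (debug_mask=2 or debug_mask=3); delta is a u64:
mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME,
		   "command 0x%x took %llu usecs\n", opcode, delta);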
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +void mlx5_init_mkey_table(struct mlx5_core_dev *dev) +{ + struct mlx5_mkey_table *table = &dev->priv.mkey_table; + + memset(table, 0, sizeof(*table)); + rwlock_init(&table->lock); + INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); +} + +void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev) +{ +} + +int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, + u32 *in, int inlen, + u32 *out, int outlen, + mlx5_cmd_cbk_t callback, void *context) +{ + struct mlx5_mkey_table *table = &dev->priv.mkey_table; + u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {0}; + u32 mkey_index; + void *mkc; + int err; + u8 key; + + spin_lock_irq(&dev->priv.mkey_lock); + key = dev->priv.mkey_key++; + spin_unlock_irq(&dev->priv.mkey_lock); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); + MLX5_SET(mkc, mkc, mkey_7_0, key); + + if (callback) + return mlx5_cmd_exec_cb(dev, in, inlen, out, outlen, + callback, context); + + err = mlx5_cmd_exec(dev, in, inlen, lout, sizeof(lout)); + if (err) + return err; + + mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index); + mkey->iova = MLX5_GET64(mkc, mkc, start_addr); + mkey->size = MLX5_GET64(mkc, mkc, len); + mkey->key = mlx5_idx_to_mkey(mkey_index) | key; + mkey->pd = MLX5_GET(mkc, mkc, pd); + + mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", + mkey_index, key, mkey->key); + + /* connect to mkey tree */ + write_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key), mkey); + write_unlock_irq(&table->lock); + if (err) { + mlx5_core_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n", + mlx5_base_mkey(mkey->key), err); + mlx5_core_destroy_mkey(dev, mkey); + } + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_mkey_cb); + +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, + u32 *in, int inlen) +{ + return mlx5_core_create_mkey_cb(dev, mkey, in, inlen, + NULL, 0, NULL, NULL); +} +EXPORT_SYMBOL(mlx5_core_create_mkey); + +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey) +{ + struct mlx5_mkey_table *table = &dev->priv.mkey_table; + u32 
out[MLX5_ST_SZ_DW(destroy_mkey_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {0}; + struct mlx5_core_mkey *deleted_mkey; + unsigned long flags; + + write_lock_irqsave(&table->lock, flags); + deleted_mkey = radix_tree_delete(&table->tree, mlx5_base_mkey(mkey->key)); + write_unlock_irqrestore(&table->lock, flags); + if (!deleted_mkey) { + mlx5_core_warn(dev, "failed radix tree delete of mkey 0x%x\n", + mlx5_base_mkey(mkey->key)); + return -ENOENT; + } + + MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); + MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_destroy_mkey); + +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_mkey_in)] = {0}; + + memset(out, 0, outlen); + MLX5_SET(query_mkey_in, in, opcode, MLX5_CMD_OP_QUERY_MKEY); + MLX5_SET(query_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} +EXPORT_SYMBOL(mlx5_core_query_mkey); + +int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, + u32 *mkey) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {0}; + int err; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *mkey = MLX5_GET(query_special_contexts_out, out, + dump_fill_mkey); + return err; +} +EXPORT_SYMBOL(mlx5_core_dump_fill_mkey); + +static inline u32 mlx5_get_psv(u32 *out, int psv_index) +{ + switch (psv_index) { + case 1: return MLX5_GET(create_psv_out, out, psv1_index); + case 2: return MLX5_GET(create_psv_out, out, psv2_index); + case 3: return MLX5_GET(create_psv_out, out, psv3_index); + default: return MLX5_GET(create_psv_out, out, psv0_index); + } +} + +int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, + int npsvs, u32 *sig_index) +{ + u32 out[MLX5_ST_SZ_DW(create_psv_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(create_psv_in)] = {0}; + int i, err; + + if (npsvs > MLX5_MAX_PSVS) + return -EINVAL; + + MLX5_SET(create_psv_in, in, opcode, MLX5_CMD_OP_CREATE_PSV); + MLX5_SET(create_psv_in, in, pd, pdn); + MLX5_SET(create_psv_in, in, num_psv, npsvs); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + for (i = 0; i < npsvs; i++) + sig_index[i] = mlx5_get_psv(out, i); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_psv); + +int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num) +{ + u32 out[MLX5_ST_SZ_DW(destroy_psv_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_psv_in)] = {0}; + + MLX5_SET(destroy_psv_in, in, opcode, MLX5_CMD_OP_DESTROY_PSV); + MLX5_SET(destroy_psv_in, in, psvn, psv_num); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_destroy_psv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c new file mode 100644 index 0000000..e36d3e3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -0,0 +1,579 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
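A minimal caller sketch for the PSV helpers above, added for illustration and not part of this patch; the helper name and the pd number are hypothetical. Signature offload typically allocates a pair of PSVs and releases them one by one.

static int example_use_sig_psvs(struct mlx5_core_dev *dev, u32 pdn)	// hypothetical
{
	u32 psv_index[2];
	int err;

	err = mlx5_core_create_psv(dev, pdn, 2, psv_index);	// npsvs must not exceed MLX5_MAX_PSVS
	if (err)
		return err;

	// ... psv_index[0] / psv_index[1] are handed to the signature MKey here ...

	mlx5_core_destroy_psv(dev, psv_index[0]);
	mlx5_core_destroy_psv(dev, psv_index[1]);
	return 0;
}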
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +enum { + MLX5_PAGES_CANT_GIVE = 0, + MLX5_PAGES_GIVE = 1, + MLX5_PAGES_TAKE = 2 +}; + +struct mlx5_pages_req { + struct mlx5_core_dev *dev; + u16 func_id; + s32 npages; + struct work_struct work; +}; + +struct fw_page { + struct rb_node rb_node; + u64 addr; + struct page *page; + u16 func_id; + unsigned long bitmask; + struct list_head list; + unsigned free_count; +}; + +enum { + MAX_RECLAIM_TIME_MSECS = 5000, + MAX_RECLAIM_VFS_PAGES_TIME_MSECS = 2 * 1000 * 60, +}; + +enum { + MLX5_MAX_RECLAIM_TIME_MILI = 5000, + MLX5_NUM_4K_IN_PAGE = PAGE_SIZE / MLX5_ADAPTER_PAGE_SIZE, +}; + +static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u16 func_id) +{ + struct rb_root *root = &dev->priv.page_root; + struct rb_node **new = &root->rb_node; + struct rb_node *parent = NULL; + struct fw_page *nfp; + struct fw_page *tfp; + int i; + + while (*new) { + parent = *new; + tfp = rb_entry(parent, struct fw_page, rb_node); + if (tfp->addr < addr) + new = &parent->rb_left; + else if (tfp->addr > addr) + new = &parent->rb_right; + else + return -EEXIST; + } + + nfp = kzalloc(sizeof(*nfp), GFP_KERNEL); + if (!nfp) + return -ENOMEM; + + nfp->addr = addr; + nfp->page = page; + nfp->func_id = func_id; + nfp->free_count = MLX5_NUM_4K_IN_PAGE; + for (i = 0; i < MLX5_NUM_4K_IN_PAGE; i++) + set_bit(i, &nfp->bitmask); + + rb_link_node(&nfp->rb_node, parent, new); + rb_insert_color(&nfp->rb_node, root); + list_add(&nfp->list, &dev->priv.free_list); + + return 0; +} + +static struct fw_page *find_fw_page(struct mlx5_core_dev *dev, u64 addr) +{ + struct rb_root *root = &dev->priv.page_root; + struct rb_node *tmp = root->rb_node; + struct fw_page *result = NULL; + struct fw_page *tfp; + + while (tmp) { + tfp = rb_entry(tmp, struct fw_page, rb_node); + if (tfp->addr < addr) { + tmp = tmp->rb_left; + } else if (tfp->addr > addr) { + tmp = tmp->rb_right; + } else { + result = tfp; + break; + } + } + + return result; +} + +static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, + s32 *npages, int boot) +{ + u32 out[MLX5_ST_SZ_DW(query_pages_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_pages_in)] = {0}; + int err; + + MLX5_SET(query_pages_in, in, opcode, 
MLX5_CMD_OP_QUERY_PAGES); + MLX5_SET(query_pages_in, in, op_mod, boot ? + MLX5_QUERY_PAGES_IN_OP_MOD_BOOT_PAGES : + MLX5_QUERY_PAGES_IN_OP_MOD_INIT_PAGES); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + *npages = MLX5_GET(query_pages_out, out, num_pages); + *func_id = MLX5_GET(query_pages_out, out, function_id); + + return err; +} + +static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr) +{ + struct fw_page *fp; + unsigned n; + + if (list_empty(&dev->priv.free_list)) + return -ENOMEM; + + fp = list_entry(dev->priv.free_list.next, struct fw_page, list); + n = find_first_bit(&fp->bitmask, 8 * sizeof(fp->bitmask)); + if (n >= MLX5_NUM_4K_IN_PAGE) { + mlx5_core_warn(dev, "alloc 4k bug\n"); + return -ENOENT; + } + clear_bit(n, &fp->bitmask); + fp->free_count--; + if (!fp->free_count) + list_del(&fp->list); + + *addr = fp->addr + n * MLX5_ADAPTER_PAGE_SIZE; + + return 0; +} + +#define MLX5_U64_4K_PAGE_MASK ((~(u64)0U) << PAGE_SHIFT) + +static void free_4k(struct mlx5_core_dev *dev, u64 addr) +{ + struct fw_page *fwp; + int n; + + fwp = find_fw_page(dev, addr & MLX5_U64_4K_PAGE_MASK); + if (!fwp) { + mlx5_core_warn(dev, "page not found\n"); + return; + } + + n = (addr & ~MLX5_U64_4K_PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT; + fwp->free_count++; + set_bit(n, &fwp->bitmask); + if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) { + rb_erase(&fwp->rb_node, &dev->priv.page_root); + if (fwp->free_count != 1) + list_del(&fwp->list); + dma_unmap_page(&dev->pdev->dev, addr & MLX5_U64_4K_PAGE_MASK, + PAGE_SIZE, DMA_BIDIRECTIONAL); + __free_page(fwp->page); + kfree(fwp); + } else if (fwp->free_count == 1) { + list_add(&fwp->list, &dev->priv.free_list); + } +} + +static int alloc_system_page(struct mlx5_core_dev *dev, u16 func_id) +{ + struct page *page; + u64 zero_addr = 1; + u64 addr; + int err; + int nid = dev_to_node(&dev->pdev->dev); + + page = alloc_pages_node(nid, GFP_HIGHUSER, 0); + if (!page) { + mlx5_core_warn(dev, "failed to allocate page\n"); + return -ENOMEM; + } +map: + addr = dma_map_page(&dev->pdev->dev, page, 0, + PAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(&dev->pdev->dev, addr)) { + mlx5_core_warn(dev, "failed dma mapping page\n"); + err = -ENOMEM; + goto err_mapping; + } + + /* Firmware doesn't support page with physical address 0 */ + if (addr == 0) { + zero_addr = addr; + goto map; + } + + err = insert_page(dev, addr, page, func_id); + if (err) { + mlx5_core_err(dev, "failed to track allocated page\n"); + dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, + DMA_BIDIRECTIONAL); + } + +err_mapping: + if (err) + __free_page(page); + + if (zero_addr == 0) + dma_unmap_page(&dev->pdev->dev, zero_addr, PAGE_SIZE, + DMA_BIDIRECTIONAL); + + return err; +} + +static void page_notify_fail(struct mlx5_core_dev *dev, u16 func_id) +{ + u32 out[MLX5_ST_SZ_DW(manage_pages_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {0}; + int err; + + MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES); + MLX5_SET(manage_pages_in, in, op_mod, MLX5_PAGES_CANT_GIVE); + MLX5_SET(manage_pages_in, in, function_id, func_id); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + mlx5_core_warn(dev, "page notify failed func_id(%d) err(%d)\n", + func_id, err); +} + +static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, + int notify_fail) +{ + u32 out[MLX5_ST_SZ_DW(manage_pages_out)] = {0}; + int inlen = MLX5_ST_SZ_BYTES(manage_pages_in); + u64 addr; + int err; + u32 *in; + int i; + + inlen += npages * 
MLX5_FLD_SZ_BYTES(manage_pages_in, pas[0]); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + mlx5_core_warn(dev, "vzalloc failed %d\n", inlen); + goto out_free; + } + + for (i = 0; i < npages; i++) { +retry: + err = alloc_4k(dev, &addr); + if (err) { + if (err == -ENOMEM) + err = alloc_system_page(dev, func_id); + if (err) + goto out_4k; + + goto retry; + } + MLX5_ARRAY_SET64(manage_pages_in, in, pas, i, addr); + } + + MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES); + MLX5_SET(manage_pages_in, in, op_mod, MLX5_PAGES_GIVE); + MLX5_SET(manage_pages_in, in, function_id, func_id); + MLX5_SET(manage_pages_in, in, input_num_entries, npages); + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (err) { + mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n", + func_id, npages, err); + goto out_4k; + } + + dev->priv.fw_pages += npages; + if (func_id) + dev->priv.vfs_pages += npages; + + mlx5_core_dbg(dev, "err %d\n", err); + + kvfree(in); + return 0; + +out_4k: + for (i--; i >= 0; i--) + free_4k(dev, MLX5_GET64(manage_pages_in, in, pas[i])); +out_free: + kvfree(in); + if (notify_fail) + page_notify_fail(dev, func_id); + return err; +} + +static int reclaim_pages_cmd(struct mlx5_core_dev *dev, + u32 *in, int in_size, u32 *out, int out_size) +{ + struct fw_page *fwp; + struct rb_node *p; + u32 func_id; + u32 npages; + u32 i = 0; + + if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) + return mlx5_cmd_exec(dev, in, in_size, out, out_size); + + /* No hard feelings, we want our pages back! */ + npages = MLX5_GET(manage_pages_in, in, input_num_entries); + func_id = MLX5_GET(manage_pages_in, in, function_id); + + p = rb_first(&dev->priv.page_root); + while (p && i < npages) { + fwp = rb_entry(p, struct fw_page, rb_node); + p = rb_next(p); + if (fwp->func_id != func_id) + continue; + + MLX5_ARRAY_SET64(manage_pages_out, out, pas, i, fwp->addr); + i++; + } + + MLX5_SET(manage_pages_out, out, output_num_entries, i); + return 0; +} + +static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, + int *nclaimed) +{ + int outlen = MLX5_ST_SZ_BYTES(manage_pages_out); + u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {0}; + int num_claimed; + u32 *out; + int err; + int i; + + if (nclaimed) + *nclaimed = 0; + + outlen += npages * MLX5_FLD_SZ_BYTES(manage_pages_out, pas[0]); + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES); + MLX5_SET(manage_pages_in, in, op_mod, MLX5_PAGES_TAKE); + MLX5_SET(manage_pages_in, in, function_id, func_id); + MLX5_SET(manage_pages_in, in, input_num_entries, npages); + + mlx5_core_dbg(dev, "npages %d, outlen %d\n", npages, outlen); + err = reclaim_pages_cmd(dev, in, sizeof(in), out, outlen); + if (err) { + mlx5_core_err(dev, "failed reclaiming pages: err %d\n", err); + goto out_free; + } + + num_claimed = MLX5_GET(manage_pages_out, out, output_num_entries); + if (num_claimed > npages) { + mlx5_core_warn(dev, "fw returned %d, driver asked %d => corruption\n", + num_claimed, npages); + err = -EINVAL; + goto out_free; + } + + for (i = 0; i < num_claimed; i++) + free_4k(dev, MLX5_GET64(manage_pages_out, out, pas[i])); + + if (nclaimed) + *nclaimed = num_claimed; + + dev->priv.fw_pages -= num_claimed; + if (func_id) + dev->priv.vfs_pages -= num_claimed; + +out_free: + kvfree(out); + return err; +} + +static void pages_work_handler(struct work_struct *work) +{ + struct mlx5_pages_req *req = container_of(work, struct mlx5_pages_req, work); + struct 
mlx5_core_dev *dev = req->dev; + int err = 0; + + if (req->npages < 0) + err = reclaim_pages(dev, req->func_id, -1 * req->npages, NULL); + else if (req->npages > 0) + err = give_pages(dev, req->func_id, req->npages, 1); + + if (err) + mlx5_core_warn(dev, "%s fail %d\n", + req->npages < 0 ? "reclaim" : "give", err); + + kfree(req); +} + +void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, + s32 npages) +{ + struct mlx5_pages_req *req; + + req = kzalloc(sizeof(*req), GFP_ATOMIC); + if (!req) { + mlx5_core_warn(dev, "failed to allocate pages request\n"); + return; + } + + req->dev = dev; + req->func_id = func_id; + req->npages = npages; + INIT_WORK(&req->work, pages_work_handler); + queue_work(dev->priv.pg_wq, &req->work); +} + +int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot) +{ + u16 uninitialized_var(func_id); + s32 uninitialized_var(npages); + int err; + + err = mlx5_cmd_query_pages(dev, &func_id, &npages, boot); + if (err) + return err; + + mlx5_core_dbg(dev, "requested %d %s pages for func_id 0x%x\n", + npages, boot ? "boot" : "init", func_id); + + return give_pages(dev, func_id, npages, 0); +} + +enum { + MLX5_BLKS_FOR_RECLAIM_PAGES = 12 +}; + +static int optimal_reclaimed_pages(void) +{ + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_layout *lay; + int ret; + + ret = (sizeof(lay->out) + MLX5_BLKS_FOR_RECLAIM_PAGES * sizeof(block->data) - + MLX5_ST_SZ_BYTES(manage_pages_out)) / + MLX5_FLD_SZ_BYTES(manage_pages_out, pas[0]); + + return ret; +} + +int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev) +{ + unsigned long end = jiffies + msecs_to_jiffies(MAX_RECLAIM_TIME_MSECS); + struct fw_page *fwp; + struct rb_node *p; + int nclaimed = 0; + int err = 0; + + do { + p = rb_first(&dev->priv.page_root); + if (p) { + fwp = rb_entry(p, struct fw_page, rb_node); + err = reclaim_pages(dev, fwp->func_id, + optimal_reclaimed_pages(), + &nclaimed); + + if (err) { + mlx5_core_warn(dev, "failed reclaiming pages (%d)\n", + err); + return err; + } + if (nclaimed) + end = jiffies + msecs_to_jiffies(MAX_RECLAIM_TIME_MSECS); + } + if (time_after(jiffies, end)) { + mlx5_core_warn(dev, "FW did not return all pages. 
giving up...\n"); + break; + } + } while (p); + + WARN(dev->priv.fw_pages, + "FW pages counter is %d after reclaiming all pages\n", + dev->priv.fw_pages); + WARN(dev->priv.vfs_pages, + "VFs FW pages counter is %d after reclaiming all pages\n", + dev->priv.vfs_pages); + + return 0; +} + +void mlx5_pagealloc_init(struct mlx5_core_dev *dev) +{ + dev->priv.page_root = RB_ROOT; + INIT_LIST_HEAD(&dev->priv.free_list); +} + +void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev) +{ + /* nothing */ +} + +int mlx5_pagealloc_start(struct mlx5_core_dev *dev) +{ + dev->priv.pg_wq = create_singlethread_workqueue("mlx5_page_allocator"); + if (!dev->priv.pg_wq) + return -ENOMEM; + + return 0; +} + +void mlx5_pagealloc_stop(struct mlx5_core_dev *dev) +{ + destroy_workqueue(dev->priv.pg_wq); +} + +int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev) +{ + unsigned long end = jiffies + msecs_to_jiffies(MAX_RECLAIM_VFS_PAGES_TIME_MSECS); + int prev_vfs_pages = dev->priv.vfs_pages; + + /* In case of internal error we will free the pages manually later */ + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5_core_warn(dev, "Skipping wait for vf pages stage"); + return 0; + } + + mlx5_core_dbg(dev, "Waiting for %d pages from %s\n", prev_vfs_pages, + dev->priv.name); + while (dev->priv.vfs_pages) { + if (time_after(jiffies, end)) { + mlx5_core_warn(dev, "aborting while there are %d pending pages\n", dev->priv.vfs_pages); + return -ETIMEDOUT; + } + if (dev->priv.vfs_pages < prev_vfs_pages) { + end = jiffies + msecs_to_jiffies(MAX_RECLAIM_VFS_PAGES_TIME_MSECS); + prev_vfs_pages = dev->priv.vfs_pages; + } + msleep(50); + } + + mlx5_core_dbg(dev, "All pages received from %s\n", dev->priv.name); + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pd.c b/drivers/net/ethernet/mellanox/mlx5/core/pd.c new file mode 100644 index 0000000..bd830d8 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/pd.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
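A rough, hedged sketch of how the page-allocator entry points defined in pagealloc.c above fit together during bring-up and teardown; the real call sites live in main.c and interleave with other initialization steps, so the ordering below is illustrative only.

/*
 * Hedged sketch, assuming the same headers as pagealloc.c: initialize the
 * page tracking structures, hand the firmware its boot and init pages, and
 * start the workqueue that services later MANAGE_PAGES events.
 */
static int example_pagealloc_bringup(struct mlx5_core_dev *dev)
{
	int err;

	mlx5_pagealloc_init(dev);		/* rb-tree + free list */

	err = mlx5_satisfy_startup_pages(dev, 1);	/* boot pages */
	if (err)
		goto err_pages;

	err = mlx5_satisfy_startup_pages(dev, 0);	/* init pages */
	if (err)
		goto err_pages;

	err = mlx5_pagealloc_start(dev);	/* worker for FW page requests */
	if (err)
		goto err_pages;

	return 0;

err_pages:
	mlx5_reclaim_startup_pages(dev);
	mlx5_pagealloc_cleanup(dev);
	return err;
}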
+ */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn) +{ + u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {0}; + int err; + + MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *pdn = MLX5_GET(alloc_pd_out, out, pd); + return err; +} +EXPORT_SYMBOL(mlx5_core_alloc_pd); + +int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn) +{ + u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {0}; + + MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD); + MLX5_SET(dealloc_pd_in, in, pd, pdn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_dealloc_pd); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c new file mode 100644 index 0000000..fa9d076 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -0,0 +1,1120 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
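A minimal sketch of a protection-domain lifetime built on the two helpers above; the work done while the PD is held is elided.

/*
 * Hedged sketch: allocate a PD, use its pdn for mkeys/QPs, then release it.
 */
static int example_with_pd(struct mlx5_core_dev *dev)
{
	u32 pdn;
	int err;

	err = mlx5_core_alloc_pd(dev, &pdn);
	if (err)
		return err;

	/* ... create mkeys/QPs that reference pdn ... */

	return mlx5_core_dealloc_pd(dev, pdn);
}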
+ */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, + int size_in, void *data_out, int size_out, + u16 reg_id, int arg, int write) +{ + int outlen = MLX5_ST_SZ_BYTES(access_register_out) + size_out; + int inlen = MLX5_ST_SZ_BYTES(access_register_in) + size_in; + int err = -ENOMEM; + u32 *out = NULL; + u32 *in = NULL; + void *data; + + in = kvzalloc(inlen, GFP_KERNEL); + out = kvzalloc(outlen, GFP_KERNEL); + if (!in || !out) + goto out; + + data = MLX5_ADDR_OF(access_register_in, in, register_data); + memcpy(data, data_in, size_in); + + MLX5_SET(access_register_in, in, opcode, MLX5_CMD_OP_ACCESS_REG); + MLX5_SET(access_register_in, in, op_mod, !write); + MLX5_SET(access_register_in, in, argument, arg); + MLX5_SET(access_register_in, in, register_id, reg_id); + + err = mlx5_cmd_exec(dev, in, inlen, out, outlen); + if (err) + goto out; + + data = MLX5_ADDR_OF(access_register_out, out, register_data); + memcpy(data_out, data, size_out); + +out: + kvfree(out); + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_access_reg); + +int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group, + u8 access_reg_group) +{ + u32 in[MLX5_ST_SZ_DW(pcam_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(pcam_reg); + + MLX5_SET(pcam_reg, in, feature_group, feature_group); + MLX5_SET(pcam_reg, in, access_reg_group, access_reg_group); + + return mlx5_core_access_reg(dev, in, sz, pcam, sz, MLX5_REG_PCAM, 0, 0); +} + +int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcam, u8 feature_group, + u8 access_reg_group) +{ + u32 in[MLX5_ST_SZ_DW(mcam_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(mcam_reg); + + MLX5_SET(mcam_reg, in, feature_group, feature_group); + MLX5_SET(mcam_reg, in, access_reg_group, access_reg_group); + + return mlx5_core_access_reg(dev, in, sz, mcam, sz, MLX5_REG_MCAM, 0, 0); +} + +int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam, + u8 feature_group, u8 access_reg_group) +{ + u32 in[MLX5_ST_SZ_DW(qcam_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(qcam_reg); + + MLX5_SET(qcam_reg, in, feature_group, feature_group); + MLX5_SET(qcam_reg, in, access_reg_group, access_reg_group); + + return mlx5_core_access_reg(mdev, in, sz, qcam, sz, MLX5_REG_QCAM, 0, 0); +} + +struct mlx5_reg_pcap { + u8 rsvd0; + u8 port_num; + u8 rsvd1[2]; + __be32 caps_127_96; + __be32 caps_95_64; + __be32 caps_63_32; + __be32 caps_31_0; +}; + +int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps) +{ + struct mlx5_reg_pcap in; + struct mlx5_reg_pcap out; + + memset(&in, 0, sizeof(in)); + in.caps_127_96 = cpu_to_be32(caps); + in.port_num = port_num; + + return mlx5_core_access_reg(dev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_PCAP, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_caps); + +int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, + int ptys_size, int proto_mask, u8 local_port) +{ + u32 in[MLX5_ST_SZ_DW(ptys_reg)] = {0}; + + MLX5_SET(ptys_reg, in, local_port, local_port); + MLX5_SET(ptys_reg, in, proto_mask, proto_mask); + return mlx5_core_access_reg(dev, in, sizeof(in), ptys, + ptys_size, MLX5_REG_PTYS, 0, 0); +} +EXPORT_SYMBOL_GPL(mlx5_query_port_ptys); + +int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration) +{ + u32 in[MLX5_ST_SZ_DW(mlcr_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(mlcr_reg)]; + + MLX5_SET(mlcr_reg, in, local_port, 1); + MLX5_SET(mlcr_reg, in, beacon_duration, beacon_duration); + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), 
MLX5_REG_MLCR, 0, 1); +} + +int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev, + u32 *proto_cap, int proto_mask) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + int err; + + err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask, 1); + if (err) + return err; + + if (proto_mask == MLX5_PTYS_EN) + *proto_cap = MLX5_GET(ptys_reg, out, eth_proto_capability); + else + *proto_cap = MLX5_GET(ptys_reg, out, ib_proto_capability); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_proto_cap); + +int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev, + u32 *proto_admin, int proto_mask) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + int err; + + err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask, 1); + if (err) + return err; + + if (proto_mask == MLX5_PTYS_EN) + *proto_admin = MLX5_GET(ptys_reg, out, eth_proto_admin); + else + *proto_admin = MLX5_GET(ptys_reg, out, ib_proto_admin); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_proto_admin); + +int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev, + u8 *link_width_oper, u8 local_port) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + int err; + + err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_IB, local_port); + if (err) + return err; + + *link_width_oper = MLX5_GET(ptys_reg, out, ib_link_width_oper); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_link_width_oper); + +int mlx5_query_port_eth_proto_oper(struct mlx5_core_dev *dev, + u32 *proto_oper, u8 local_port) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + int err; + + err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, + local_port); + if (err) + return err; + + *proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper); + + return 0; +} +EXPORT_SYMBOL(mlx5_query_port_eth_proto_oper); + +int mlx5_query_port_ib_proto_oper(struct mlx5_core_dev *dev, + u8 *proto_oper, u8 local_port) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + int err; + + err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_IB, + local_port); + if (err) + return err; + + *proto_oper = MLX5_GET(ptys_reg, out, ib_proto_oper); + + return 0; +} +EXPORT_SYMBOL(mlx5_query_port_ib_proto_oper); + +int mlx5_set_port_ptys(struct mlx5_core_dev *dev, bool an_disable, + u32 proto_admin, int proto_mask) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + u32 in[MLX5_ST_SZ_DW(ptys_reg)]; + u8 an_disable_admin; + u8 an_disable_cap; + u8 an_status; + + mlx5_query_port_autoneg(dev, proto_mask, &an_status, + &an_disable_cap, &an_disable_admin); + if (!an_disable_cap && an_disable) + return -EPERM; + + memset(in, 0, sizeof(in)); + + MLX5_SET(ptys_reg, in, local_port, 1); + MLX5_SET(ptys_reg, in, an_disable_admin, an_disable); + MLX5_SET(ptys_reg, in, proto_mask, proto_mask); + if (proto_mask == MLX5_PTYS_EN) + MLX5_SET(ptys_reg, in, eth_proto_admin, proto_admin); + else + MLX5_SET(ptys_reg, in, ib_proto_admin, proto_admin); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PTYS, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_ptys); + +/* This function should be used after setting a port register only */ +void mlx5_toggle_port_link(struct mlx5_core_dev *dev) +{ + enum mlx5_port_status ps; + + mlx5_query_port_admin_status(dev, &ps); + mlx5_set_port_admin_status(dev, MLX5_PORT_DOWN); + if (ps == MLX5_PORT_UP) + mlx5_set_port_admin_status(dev, MLX5_PORT_UP); +} +EXPORT_SYMBOL_GPL(mlx5_toggle_port_link); + +int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status status) +{ + u32 in[MLX5_ST_SZ_DW(paos_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(paos_reg)]; + + MLX5_SET(paos_reg, in, 
local_port, 1); + MLX5_SET(paos_reg, in, admin_status, status); + MLX5_SET(paos_reg, in, ase, 1); + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PAOS, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_admin_status); + +int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status *status) +{ + u32 in[MLX5_ST_SZ_DW(paos_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(paos_reg)]; + int err; + + MLX5_SET(paos_reg, in, local_port, 1); + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PAOS, 0, 0); + if (err) + return err; + *status = MLX5_GET(paos_reg, out, admin_status); + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_admin_status); + +static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, u16 *admin_mtu, + u16 *max_mtu, u16 *oper_mtu, u8 port) +{ + u32 in[MLX5_ST_SZ_DW(pmtu_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pmtu_reg)]; + + MLX5_SET(pmtu_reg, in, local_port, port); + mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PMTU, 0, 0); + + if (max_mtu) + *max_mtu = MLX5_GET(pmtu_reg, out, max_mtu); + if (oper_mtu) + *oper_mtu = MLX5_GET(pmtu_reg, out, oper_mtu); + if (admin_mtu) + *admin_mtu = MLX5_GET(pmtu_reg, out, admin_mtu); +} + +int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port) +{ + u32 in[MLX5_ST_SZ_DW(pmtu_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pmtu_reg)]; + + MLX5_SET(pmtu_reg, in, admin_mtu, mtu); + MLX5_SET(pmtu_reg, in, local_port, port); + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PMTU, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_mtu); + +void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, u16 *max_mtu, + u8 port) +{ + mlx5_query_port_mtu(dev, NULL, max_mtu, NULL, port); +} +EXPORT_SYMBOL_GPL(mlx5_query_port_max_mtu); + +void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, u16 *oper_mtu, + u8 port) +{ + mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu, port); +} +EXPORT_SYMBOL_GPL(mlx5_query_port_oper_mtu); + +static int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num) +{ + u32 in[MLX5_ST_SZ_DW(pmlp_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pmlp_reg)]; + int module_mapping; + int err; + + MLX5_SET(pmlp_reg, in, local_port, 1); + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_PMLP, 0, 0); + if (err) + return err; + + module_mapping = MLX5_GET(pmlp_reg, out, lane0_module_mapping); + *module_num = module_mapping & MLX5_EEPROM_IDENTIFIER_BYTE_MASK; + + return 0; +} + +int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, + u16 offset, u16 size, u8 *data) +{ + u32 out[MLX5_ST_SZ_DW(mcia_reg)]; + u32 in[MLX5_ST_SZ_DW(mcia_reg)]; + int module_num; + u16 i2c_addr; + int status; + int err; + void *ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0); + + err = mlx5_query_module_num(dev, &module_num); + if (err) + return err; + + memset(in, 0, sizeof(in)); + size = min_t(int, size, MLX5_EEPROM_MAX_BYTES); + + if (offset < MLX5_EEPROM_PAGE_LENGTH && + offset + size > MLX5_EEPROM_PAGE_LENGTH) + /* Cross pages read, read until offset 256 in low page */ + size -= offset + size - MLX5_EEPROM_PAGE_LENGTH; + + i2c_addr = MLX5_I2C_ADDR_LOW; + if (offset >= MLX5_EEPROM_PAGE_LENGTH) { + i2c_addr = MLX5_I2C_ADDR_HIGH; + offset -= MLX5_EEPROM_PAGE_LENGTH; + } + + MLX5_SET(mcia_reg, in, l, 0); + MLX5_SET(mcia_reg, in, module, module_num); + MLX5_SET(mcia_reg, in, i2c_device_address, i2c_addr); + MLX5_SET(mcia_reg, in, page_number, 0); + MLX5_SET(mcia_reg, in, device_address, offset); + MLX5_SET(mcia_reg, in, size, size); + + err 
= mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MCIA, 0, 0); + if (err) + return err; + + status = MLX5_GET(mcia_reg, out, status); + if (status) { + mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n", + status); + return -EIO; + } + + memcpy(data, ptr, size); + + return size; +} +EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom); + +static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc, + int pvlc_size, u8 local_port) +{ + u32 in[MLX5_ST_SZ_DW(pvlc_reg)] = {0}; + + MLX5_SET(pvlc_reg, in, local_port, local_port); + return mlx5_core_access_reg(dev, in, sizeof(in), pvlc, + pvlc_size, MLX5_REG_PVLC, 0, 0); +} + +int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, + u8 *vl_hw_cap, u8 local_port) +{ + u32 out[MLX5_ST_SZ_DW(pvlc_reg)]; + int err; + + err = mlx5_query_port_pvlc(dev, out, sizeof(out), local_port); + if (err) + return err; + + *vl_hw_cap = MLX5_GET(pvlc_reg, out, vl_hw_cap); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_vl_hw_cap); + +int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, + u8 port_num, void *out, size_t sz) +{ + u32 *in; + int err; + + in = kvzalloc(sz, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + return err; + } + + MLX5_SET(ppcnt_reg, in, local_port, port_num); + + MLX5_SET(ppcnt_reg, in, grp, MLX5_INFINIBAND_PORT_COUNTERS_GROUP); + err = mlx5_core_access_reg(dev, in, sz, out, + sz, MLX5_REG_PPCNT, 0, 0); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_ib_ppcnt); + +static int mlx5_query_pfcc_reg(struct mlx5_core_dev *dev, u32 *out, + u32 out_size) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; + + MLX5_SET(pfcc_reg, in, local_port, 1); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + out_size, MLX5_REG_PFCC, 0, 0); +} + +int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + + MLX5_SET(pfcc_reg, in, local_port, 1); + MLX5_SET(pfcc_reg, in, pptx, tx_pause); + MLX5_SET(pfcc_reg, in, pprx, rx_pause); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PFCC, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_pause); + +int mlx5_query_port_pause(struct mlx5_core_dev *dev, + u32 *rx_pause, u32 *tx_pause) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + int err; + + err = mlx5_query_pfcc_reg(dev, out, sizeof(out)); + if (err) + return err; + + if (rx_pause) + *rx_pause = MLX5_GET(pfcc_reg, out, pprx); + + if (tx_pause) + *tx_pause = MLX5_GET(pfcc_reg, out, pptx); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_pause); + +int mlx5_set_port_stall_watermark(struct mlx5_core_dev *dev, + u16 stall_critical_watermark, + u16 stall_minor_watermark) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + + MLX5_SET(pfcc_reg, in, local_port, 1); + MLX5_SET(pfcc_reg, in, pptx_mask_n, 1); + MLX5_SET(pfcc_reg, in, pprx_mask_n, 1); + MLX5_SET(pfcc_reg, in, ppan_mask_n, 1); + MLX5_SET(pfcc_reg, in, critical_stall_mask, 1); + MLX5_SET(pfcc_reg, in, minor_stall_mask, 1); + MLX5_SET(pfcc_reg, in, device_stall_critical_watermark, + stall_critical_watermark); + MLX5_SET(pfcc_reg, in, device_stall_minor_watermark, stall_minor_watermark); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PFCC, 0, 1); +} + +int mlx5_query_port_stall_watermark(struct mlx5_core_dev *dev, + u16 *stall_critical_watermark, + u16 *stall_minor_watermark) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + int err; + + err = mlx5_query_pfcc_reg(dev, out, 
sizeof(out)); + if (err) + return err; + + if (stall_critical_watermark) + *stall_critical_watermark = MLX5_GET(pfcc_reg, out, + device_stall_critical_watermark); + + if (stall_minor_watermark) + *stall_minor_watermark = MLX5_GET(pfcc_reg, out, + device_stall_minor_watermark); + + return 0; +} + +int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + + MLX5_SET(pfcc_reg, in, local_port, 1); + MLX5_SET(pfcc_reg, in, pfctx, pfc_en_tx); + MLX5_SET(pfcc_reg, in, pfcrx, pfc_en_rx); + MLX5_SET_TO_ONES(pfcc_reg, in, prio_mask_tx); + MLX5_SET_TO_ONES(pfcc_reg, in, prio_mask_rx); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PFCC, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_pfc); + +int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, u8 *pfc_en_rx) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + int err; + + err = mlx5_query_pfcc_reg(dev, out, sizeof(out)); + if (err) + return err; + + if (pfc_en_tx) + *pfc_en_tx = MLX5_GET(pfcc_reg, out, pfctx); + + if (pfc_en_rx) + *pfc_en_rx = MLX5_GET(pfcc_reg, out, pfcrx); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_pfc); + +void mlx5_query_port_autoneg(struct mlx5_core_dev *dev, int proto_mask, + u8 *an_status, + u8 *an_disable_cap, u8 *an_disable_admin) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + + *an_status = 0; + *an_disable_cap = 0; + *an_disable_admin = 0; + + if (mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask, 1)) + return; + + *an_status = MLX5_GET(ptys_reg, out, an_status); + *an_disable_cap = MLX5_GET(ptys_reg, out, an_disable_cap); + *an_disable_admin = MLX5_GET(ptys_reg, out, an_disable_admin); +} +EXPORT_SYMBOL_GPL(mlx5_query_port_autoneg); + +int mlx5_max_tc(struct mlx5_core_dev *mdev) +{ + u8 num_tc = MLX5_CAP_GEN(mdev, max_tc) ? 
: 8; + + return num_tc - 1; +} + +int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(dcbx_param)] = {0}; + + MLX5_SET(dcbx_param, in, port_number, 1); + + return mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(in), MLX5_REG_DCBX_PARAM, 0, 0); +} + +int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in) +{ + u32 out[MLX5_ST_SZ_DW(dcbx_param)]; + + MLX5_SET(dcbx_param, in, port_number, 1); + + return mlx5_core_access_reg(mdev, in, sizeof(out), out, + sizeof(out), MLX5_REG_DCBX_PARAM, 0, 1); +} + +int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc) +{ + u32 in[MLX5_ST_SZ_DW(qtct_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(qtct_reg)]; + int err; + int i; + + for (i = 0; i < 8; i++) { + if (prio_tc[i] > mlx5_max_tc(mdev)) + return -EINVAL; + + MLX5_SET(qtct_reg, in, prio, i); + MLX5_SET(qtct_reg, in, tclass, prio_tc[i]); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QTCT, 0, 1); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_set_port_prio_tc); + +int mlx5_query_port_prio_tc(struct mlx5_core_dev *mdev, + u8 prio, u8 *tc) +{ + u32 in[MLX5_ST_SZ_DW(qtct_reg)]; + u32 out[MLX5_ST_SZ_DW(qtct_reg)]; + int err; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(qtct_reg, in, port_number, 1); + MLX5_SET(qtct_reg, in, prio, prio); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QTCT, 0, 0); + if (!err) + *tc = MLX5_GET(qtct_reg, out, tclass); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_prio_tc); + +static int mlx5_set_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(qtct_reg)]; + + if (!MLX5_CAP_GEN(mdev, ets)) + return -EOPNOTSUPP; + + return mlx5_core_access_reg(mdev, in, inlen, out, sizeof(out), + MLX5_REG_QETCR, 0, 1); +} + +static int mlx5_query_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *out, + int outlen) +{ + u32 in[MLX5_ST_SZ_DW(qtct_reg)]; + + if (!MLX5_CAP_GEN(mdev, ets)) + return -EOPNOTSUPP; + + memset(in, 0, sizeof(in)); + return mlx5_core_access_reg(mdev, in, sizeof(in), out, outlen, + MLX5_REG_QETCR, 0, 0); +} + +int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group) +{ + u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0}; + int i; + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + MLX5_SET(qetc_reg, in, tc_configuration[i].g, 1); + MLX5_SET(qetc_reg, in, tc_configuration[i].group, tc_group[i]); + } + + return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in)); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_tc_group); + +int mlx5_query_port_tc_group(struct mlx5_core_dev *mdev, + u8 tc, u8 *tc_group) +{ + u32 out[MLX5_ST_SZ_DW(qetc_reg)]; + void *ets_tcn_conf; + int err; + + err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out)); + if (err) + return err; + + ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out, + tc_configuration[tc]); + + *tc_group = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf, + group); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_tc_group); + +int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw) +{ + u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0}; + int i; + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + MLX5_SET(qetc_reg, in, tc_configuration[i].b, 1); + MLX5_SET(qetc_reg, in, tc_configuration[i].bw_allocation, tc_bw[i]); + } + + return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in)); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_tc_bw_alloc); + +int mlx5_query_port_tc_bw_alloc(struct mlx5_core_dev *mdev, + u8 tc, u8 *bw_pct) +{ + u32 
out[MLX5_ST_SZ_DW(qetc_reg)]; + void *ets_tcn_conf; + int err; + + err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out)); + if (err) + return err; + + ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out, + tc_configuration[tc]); + + *bw_pct = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf, + bw_allocation); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_tc_bw_alloc); + +int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev, + u8 *max_bw_value, + u8 *max_bw_units) +{ + u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0}; + void *ets_tcn_conf; + int i; + + MLX5_SET(qetc_reg, in, port_number, 1); + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, in, tc_configuration[i]); + + MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, r, 1); + MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, max_bw_units, + max_bw_units[i]); + MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, max_bw_value, + max_bw_value[i]); + } + + return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in)); +} +EXPORT_SYMBOL_GPL(mlx5_modify_port_ets_rate_limit); + +int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev, + u8 *max_bw_value, + u8 *max_bw_units) +{ + u32 out[MLX5_ST_SZ_DW(qetc_reg)]; + void *ets_tcn_conf; + int err; + int i; + + err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out)); + if (err) + return err; + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out, tc_configuration[i]); + + max_bw_value[i] = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf, + max_bw_value); + max_bw_units[i] = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf, + max_bw_units); + } + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_ets_rate_limit); + +int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode) +{ + u32 in[MLX5_ST_SZ_DW(set_wol_rol_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_wol_rol_out)] = {0}; + + MLX5_SET(set_wol_rol_in, in, opcode, MLX5_CMD_OP_SET_WOL_ROL); + MLX5_SET(set_wol_rol_in, in, wol_mode_valid, 1); + MLX5_SET(set_wol_rol_in, in, wol_mode, wol_mode); + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_wol); + +int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode) +{ + u32 in[MLX5_ST_SZ_DW(query_wol_rol_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_wol_rol_out)] = {0}; + int err; + + MLX5_SET(query_wol_rol_in, in, opcode, MLX5_CMD_OP_QUERY_WOL_ROL); + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (!err) + *wol_mode = MLX5_GET(query_wol_rol_out, out, wol_mode); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_wol); + +static int mlx5_query_ports_check(struct mlx5_core_dev *mdev, u32 *out, + int outlen) +{ + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {0}; + + MLX5_SET(pcmr_reg, in, local_port, 1); + return mlx5_core_access_reg(mdev, in, sizeof(in), out, + outlen, MLX5_REG_PCMR, 0, 0); +} + +static int mlx5_set_ports_check(struct mlx5_core_dev *mdev, u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(pcmr_reg)]; + + return mlx5_core_access_reg(mdev, in, inlen, out, + sizeof(out), MLX5_REG_PCMR, 0, 1); +} + +int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable) +{ + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {0}; + + MLX5_SET(pcmr_reg, in, local_port, 1); + MLX5_SET(pcmr_reg, in, fcs_chk, enable); + return mlx5_set_ports_check(mdev, in, sizeof(in)); +} + +void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported, + bool *enabled) +{ + u32 out[MLX5_ST_SZ_DW(pcmr_reg)]; + /* Default values for FW which do not support MLX5_REG_PCMR */ + *supported = false; + *enabled = true; + + if (!MLX5_CAP_GEN(mdev, ports_check)) + 
return; + + if (mlx5_query_ports_check(mdev, out, sizeof(out))) + return; + + *supported = !!(MLX5_GET(pcmr_reg, out, fcs_cap)); + *enabled = !!(MLX5_GET(pcmr_reg, out, fcs_chk)); +} + +static const char *mlx5_pme_status[MLX5_MODULE_STATUS_NUM] = { + "Cable plugged", /* MLX5_MODULE_STATUS_PLUGGED = 0x1 */ + "Cable unplugged", /* MLX5_MODULE_STATUS_UNPLUGGED = 0x2 */ + "Cable error", /* MLX5_MODULE_STATUS_ERROR = 0x3 */ +}; + +static const char *mlx5_pme_error[MLX5_MODULE_EVENT_ERROR_NUM] = { + "Power budget exceeded", + "Long Range for non MLNX cable", + "Bus stuck(I2C or data shorted)", + "No EEPROM/retry timeout", + "Enforce part number list", + "Unknown identifier", + "High Temperature", + "Bad or shorted cable/module", + "Unknown status", +}; + +void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) +{ + enum port_module_event_status_type module_status; + enum port_module_event_error_type error_type; + struct mlx5_eqe_port_module *module_event_eqe; + struct mlx5_priv *priv = &dev->priv; + u8 module_num; + + module_event_eqe = &eqe->data.port_module; + module_num = module_event_eqe->module; + module_status = module_event_eqe->module_status & + PORT_MODULE_EVENT_MODULE_STATUS_MASK; + error_type = module_event_eqe->error_type & + PORT_MODULE_EVENT_ERROR_TYPE_MASK; + + if (module_status < MLX5_MODULE_STATUS_ERROR) { + priv->pme_stats.status_counters[module_status - 1]++; + } else if (module_status == MLX5_MODULE_STATUS_ERROR) { + if (error_type >= MLX5_MODULE_EVENT_ERROR_UNKNOWN) + /* Unknown error type */ + error_type = MLX5_MODULE_EVENT_ERROR_UNKNOWN; + priv->pme_stats.error_counters[error_type]++; + } + + if (!printk_ratelimit()) + return; + + if (module_status < MLX5_MODULE_STATUS_ERROR) + mlx5_core_info(dev, + "Port module event: module %u, %s\n", + module_num, mlx5_pme_status[module_status - 1]); + + else if (module_status == MLX5_MODULE_STATUS_ERROR) + mlx5_core_info(dev, + "Port module event[error]: module %u, %s, %s\n", + module_num, mlx5_pme_status[module_status - 1], + mlx5_pme_error[error_type]); +} + +int mlx5_query_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size) +{ + u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + + return mlx5_core_access_reg(mdev, in, sizeof(in), mtpps, + mtpps_size, MLX5_REG_MTPPS, 0, 0); +} + +int mlx5_set_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size) +{ + u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + + return mlx5_core_access_reg(mdev, mtpps, mtpps_size, out, + sizeof(out), MLX5_REG_MTPPS, 0, 1); +} + +int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode) +{ + u32 out[MLX5_ST_SZ_DW(mtppse_reg)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtppse_reg)] = {0}; + int err = 0; + + MLX5_SET(mtppse_reg, in, pin, pin); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MTPPSE, 0, 0); + if (err) + return err; + + *arm = MLX5_GET(mtppse_reg, in, event_arm); + *mode = MLX5_GET(mtppse_reg, in, event_generation_mode); + + return err; +} + +int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode) +{ + u32 out[MLX5_ST_SZ_DW(mtppse_reg)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtppse_reg)] = {0}; + + MLX5_SET(mtppse_reg, in, pin, pin); + MLX5_SET(mtppse_reg, in, event_arm, arm); + MLX5_SET(mtppse_reg, in, event_generation_mode, mode); + + return mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MTPPSE, 0, 1); +} + +int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state) +{ + u32 out[MLX5_ST_SZ_DW(qpts_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(qpts_reg)] = 
{}; + int err; + + MLX5_SET(qpts_reg, in, local_port, 1); + MLX5_SET(qpts_reg, in, trust_state, trust_state); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QPTS, 0, 1); + return err; +} + +int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state) +{ + u32 out[MLX5_ST_SZ_DW(qpts_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(qpts_reg)] = {}; + int err; + + MLX5_SET(qpts_reg, in, local_port, 1); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QPTS, 0, 0); + if (!err) + *trust_state = MLX5_GET(qpts_reg, out, trust_state); + + return err; +} + +int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio) +{ + int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); + void *qpdpm_dscp; + void *out; + void *in; + int err; + + in = kzalloc(sz, GFP_KERNEL); + out = kzalloc(sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpdpm_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 0); + if (err) + goto out; + + memcpy(in, out, sz); + MLX5_SET(qpdpm_reg, in, local_port, 1); + + /* Update the corresponding dscp entry */ + qpdpm_dscp = MLX5_ADDR_OF(qpdpm_reg, in, dscp[dscp]); + MLX5_SET16(qpdpm_dscp_reg, qpdpm_dscp, prio, prio); + MLX5_SET16(qpdpm_dscp_reg, qpdpm_dscp, e, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 1); + +out: + kfree(in); + kfree(out); + return err; +} + +/* dscp2prio[i]: priority that dscp i mapped to */ +#define MLX5E_SUPPORTED_DSCP 64 +int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio) +{ + int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); + void *qpdpm_dscp; + void *out; + void *in; + int err; + int i; + + in = kzalloc(sz, GFP_KERNEL); + out = kzalloc(sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpdpm_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 0); + if (err) + goto out; + + for (i = 0; i < (MLX5E_SUPPORTED_DSCP); i++) { + qpdpm_dscp = MLX5_ADDR_OF(qpdpm_reg, out, dscp[i]); + dscp2prio[i] = MLX5_GET16(qpdpm_dscp_reg, qpdpm_dscp, prio); + } + +out: + kfree(in); + kfree(out); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c new file mode 100644 index 0000000..02d6c5b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -0,0 +1,635 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
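An illustrative sketch combining the PMTU helpers above with mlx5_toggle_port_link(), which per its own comment is meant to be called right after a port register write; local port 1 is assumed, matching the rest of port.c.

/*
 * Hedged sketch: validate the requested MTU against the PMTU max, program
 * the admin MTU, then bounce the link so the new value is applied.
 */
static int example_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu)
{
	u16 max_mtu;
	int err;

	mlx5_query_port_max_mtu(dev, &max_mtu, 1);
	if (mtu > max_mtu)
		return -EINVAL;

	err = mlx5_set_port_mtu(dev, mtu, 1);
	if (err)
		return err;

	mlx5_toggle_port_link(dev);
	return 0;
}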
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "mlx5_core.h" + +static struct mlx5_core_rsc_common *mlx5_get_rsc(struct mlx5_core_dev *dev, + u32 rsn) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + struct mlx5_core_rsc_common *common; + + spin_lock(&table->lock); + + common = radix_tree_lookup(&table->tree, rsn); + if (common) + atomic_inc(&common->refcount); + + spin_unlock(&table->lock); + + if (!common) { + mlx5_core_warn(dev, "Async event for bogus resource 0x%x\n", + rsn); + return NULL; + } + return common; +} + +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) +{ + if (atomic_dec_and_test(&common->refcount)) + complete(&common->free); +} + +static u64 qp_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) | + BIT(MLX5_EVENT_TYPE_COMM_EST) | + BIT(MLX5_EVENT_TYPE_SQ_DRAINED) | + BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | + BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) | + BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | + BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR); + + return mask; +} + +static u64 rq_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); + + return mask; +} + +static u64 sq_allowed_event_types(void) +{ + return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); +} + +static u64 dct_allowed_event_types(void) +{ + return BIT(MLX5_EVENT_TYPE_DCT_DRAINED); +} + +static bool is_event_type_allowed(int rsc_type, int event_type) +{ + switch (rsc_type) { + case MLX5_EVENT_QUEUE_TYPE_QP: + return BIT(event_type) & qp_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_RQ: + return BIT(event_type) & rq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_SQ: + return BIT(event_type) & sq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_DCT: + return BIT(event_type) & dct_allowed_event_types(); + default: + WARN(1, "Event arrived for unknown resource type"); + return false; + } +} + +void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) +{ + struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn); + struct mlx5_core_dct *dct; + struct mlx5_core_qp *qp; + + if (!common) + return; + + if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) { + mlx5_core_warn(dev, "event 0x%.2x is not allowed on resource 0x%.8x\n", + event_type, rsn); + return; + } + + switch (common->res) { + case MLX5_RES_QP: + case MLX5_RES_RQ: + case MLX5_RES_SQ: + qp = (struct mlx5_core_qp *)common; + qp->event(qp, event_type); + break; + case MLX5_RES_DCT: + dct = (struct mlx5_core_dct *)common; + if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) + complete(&dct->drained); + break; + default: + mlx5_core_warn(dev, "invalid resource type for 0x%x\n", rsn); + } + + mlx5_core_put_rsc(common); +} + +static int create_resource_common(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp, + int rsc_type) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + int err; + + qp->common.res = rsc_type; + spin_lock_irq(&table->lock); + err = 
radix_tree_insert(&table->tree, + qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN), + qp); + spin_unlock_irq(&table->lock); + if (err) + return err; + + atomic_set(&qp->common.refcount, 1); + init_completion(&qp->common.free); + qp->pid = current->pid; + + return 0; +} + +static void destroy_resource_common(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + radix_tree_delete(&table->tree, + qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN)); + spin_unlock_irqrestore(&table->lock, flags); + mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); + wait_for_completion(&qp->common.free); +} + +int mlx5_core_create_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct, + u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; + u32 din[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; + u32 dout[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + init_completion(&dct->drained); + MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); + + err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + if (err) { + mlx5_core_warn(dev, "create DCT failed, ret %d\n", err); + return err; + } + + qp->qpn = MLX5_GET(create_dct_out, out, dctn); + err = create_resource_common(dev, qp, MLX5_RES_DCT); + if (err) + goto err_cmd; + + return 0; +err_cmd: + MLX5_SET(destroy_dct_in, din, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, din, dctn, qp->qpn); + mlx5_cmd_exec(dev, (void *)&in, sizeof(din), + (void *)&out, sizeof(dout)); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_create_dct); + +int mlx5_core_create_qp(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp, + u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {0}; + u32 dout[MLX5_ST_SZ_DW(destroy_qp_out)]; + u32 din[MLX5_ST_SZ_DW(destroy_qp_in)]; + int err; + + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (err) + return err; + + qp->qpn = MLX5_GET(create_qp_out, out, qpn); + mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn); + + err = create_resource_common(dev, qp, MLX5_RES_QP); + if (err) + goto err_cmd; + + err = mlx5_debug_qp_add(dev, qp); + if (err) + mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n", + qp->qpn); + + atomic_inc(&dev->num_qps); + + return 0; + +err_cmd: + memset(din, 0, sizeof(din)); + memset(dout, 0, sizeof(dout)); + MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, din, qpn, qp->qpn); + mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_create_qp); + +static int mlx5_core_drain_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct) +{ + u32 out[MLX5_ST_SZ_DW(drain_dct_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(drain_dct_in)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT); + MLX5_SET(drain_dct_in, in, dctn, qp->qpn); + return mlx5_cmd_exec(dev, (void *)&in, sizeof(in), + (void *)&out, sizeof(out)); +} + +int mlx5_core_destroy_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct) +{ + u32 out[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + err = mlx5_core_drain_dct(dev, dct); + if (err) { + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + goto destroy; + } else { + mlx5_core_warn(dev, "failed 
drain DCT 0x%x with error 0x%x\n", qp->qpn, err); + return err; + } + } + wait_for_completion(&dct->drained); +destroy: + destroy_resource_common(dev, &dct->mqp); + MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); + err = mlx5_cmd_exec(dev, (void *)&in, sizeof(in), + (void *)&out, sizeof(out)); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_destroy_dct); + +int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp) +{ + u32 out[MLX5_ST_SZ_DW(destroy_qp_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {0}; + int err; + + mlx5_debug_qp_remove(dev, qp); + + destroy_resource_common(dev, qp); + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + atomic_dec(&dev->num_qps); + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp); + +int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, + u32 timeout_usec) +{ + u32 out[MLX5_ST_SZ_DW(set_delay_drop_params_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {0}; + + MLX5_SET(set_delay_drop_params_in, in, opcode, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS); + MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout, + timeout_usec / 100); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_core_set_delay_drop); + +struct mbox_info { + u32 *in; + u32 *out; + int inlen; + int outlen; +}; + +static int mbox_alloc(struct mbox_info *mbox, int inlen, int outlen) +{ + mbox->inlen = inlen; + mbox->outlen = outlen; + mbox->in = kzalloc(mbox->inlen, GFP_KERNEL); + mbox->out = kzalloc(mbox->outlen, GFP_KERNEL); + if (!mbox->in || !mbox->out) { + kfree(mbox->in); + kfree(mbox->out); + return -ENOMEM; + } + + return 0; +} + +static void mbox_free(struct mbox_info *mbox) +{ + kfree(mbox->in); + kfree(mbox->out); +} + +static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, + u32 opt_param_mask, void *qpc, + struct mbox_info *mbox) +{ + mbox->out = NULL; + mbox->in = NULL; + +#define MBOX_ALLOC(mbox, typ) \ + mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), MLX5_ST_SZ_BYTES(typ##_out)) + +#define MOD_QP_IN_SET(typ, in, _opcode, _qpn) \ + MLX5_SET(typ##_in, in, opcode, _opcode); \ + MLX5_SET(typ##_in, in, qpn, _qpn) + +#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc) \ + MOD_QP_IN_SET(typ, in, _opcode, _qpn); \ + MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \ + memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc, MLX5_ST_SZ_BYTES(qpc)) + + switch (opcode) { + /* 2RST & 2ERR */ + case MLX5_CMD_OP_2RST_QP: + if (MBOX_ALLOC(mbox, qp_2rst)) + return -ENOMEM; + MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn); + break; + case MLX5_CMD_OP_2ERR_QP: + if (MBOX_ALLOC(mbox, qp_2err)) + return -ENOMEM; + MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn); + break; + + /* MODIFY with QPC */ + case MLX5_CMD_OP_RST2INIT_QP: + if (MBOX_ALLOC(mbox, rst2init_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + if (MBOX_ALLOC(mbox, init2rtr_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + if (MBOX_ALLOC(mbox, rtr2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + if (MBOX_ALLOC(mbox, rts2rts_qp)) + return 
-ENOMEM; + MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + case MLX5_CMD_OP_SQERR2RTS_QP: + if (MBOX_ALLOC(mbox, sqerr2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(sqerr2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + case MLX5_CMD_OP_INIT2INIT_QP: + if (MBOX_ALLOC(mbox, init2init_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc); + break; + default: + mlx5_core_err(dev, "Unknown transition for modify QP: OP(0x%x) QPN(0x%x)\n", + opcode, qpn); + return -EINVAL; + } + return 0; +} + +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, + u32 opt_param_mask, void *qpc, + struct mlx5_core_qp *qp) +{ + struct mbox_info mbox; + int err; + + err = modify_qp_mbox_alloc(dev, opcode, qp->qpn, + opt_param_mask, qpc, &mbox); + if (err) + return err; + + err = mlx5_cmd_exec(dev, mbox.in, mbox.inlen, mbox.out, mbox.outlen); + mbox_free(&mbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_qp_modify); + +void mlx5_init_qp_table(struct mlx5_core_dev *dev) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + + memset(table, 0, sizeof(*table)); + spin_lock_init(&table->lock); + INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); + mlx5_qp_debugfs_init(dev); +} + +void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev) +{ + mlx5_qp_debugfs_cleanup(dev); +} + +int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {0}; + + MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); + MLX5_SET(query_qp_in, in, qpn, qp->qpn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} +EXPORT_SYMBOL_GPL(mlx5_core_qp_query); + +int mlx5_core_dct_query(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_dct_in)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(query_dct_in, in, opcode, MLX5_CMD_OP_QUERY_DCT); + MLX5_SET(query_dct_in, in, dctn, qp->qpn); + + return mlx5_cmd_exec(dev, (void *)&in, sizeof(in), + (void *)out, outlen); +} +EXPORT_SYMBOL_GPL(mlx5_core_dct_query); + +int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn) +{ + u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {0}; + int err; + + MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_xrcd_alloc); + +int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn) +{ + u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {0}; + + MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); + MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); + +int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq) +{ + int err; + u32 rqn; + + err = mlx5_core_create_rq(dev, in, inlen, &rqn); + if (err) + return err; + + rq->qpn = rqn; + err = create_resource_common(dev, rq, MLX5_RES_RQ); + if (err) + goto err_destroy_rq; + + return 0; + +err_destroy_rq: + mlx5_core_destroy_rq(dev, rq->qpn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_rq_tracked); + +void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *rq) +{ + destroy_resource_common(dev, rq); + 
mlx5_core_destroy_rq(dev, rq->qpn); +} +EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked); + +int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq) +{ + int err; + u32 sqn; + + err = mlx5_core_create_sq(dev, in, inlen, &sqn); + if (err) + return err; + + sq->qpn = sqn; + err = create_resource_common(dev, sq, MLX5_RES_SQ); + if (err) + goto err_destroy_sq; + + return 0; + +err_destroy_sq: + mlx5_core_destroy_sq(dev, sq->qpn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_sq_tracked); + +void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *sq) +{ + destroy_resource_common(dev, sq); + mlx5_core_destroy_sq(dev, sq->qpn); +} +EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked); + +int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id) +{ + u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {0}; + int err; + + MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *counter_id = MLX5_GET(alloc_q_counter_out, out, + counter_set_id); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_alloc_q_counter); + +int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(dealloc_q_counter_out)] = {0}; + + MLX5_SET(dealloc_q_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter_id); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_core_dealloc_q_counter); + +int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id, + int reset, void *out, int out_size) +{ + u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {0}; + + MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); + MLX5_SET(query_q_counter_in, in, clear, reset); + MLX5_SET(query_q_counter_in, in, counter_set_id, counter_id); + return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); +} +EXPORT_SYMBOL_GPL(mlx5_core_query_q_counter); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c new file mode 100644 index 0000000..bc86dff --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2013-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +/* Scheduling element fw management */ +int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + void *ctx, u32 *element_id) +{ + u32 in[MLX5_ST_SZ_DW(create_scheduling_element_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_scheduling_element_in)] = {0}; + void *schedc; + int err; + + schedc = MLX5_ADDR_OF(create_scheduling_element_in, in, + scheduling_context); + MLX5_SET(create_scheduling_element_in, in, opcode, + MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT); + MLX5_SET(create_scheduling_element_in, in, scheduling_hierarchy, + hierarchy); + memcpy(schedc, ctx, MLX5_ST_SZ_BYTES(scheduling_context)); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + *element_id = MLX5_GET(create_scheduling_element_out, out, + scheduling_element_id); + return 0; +} + +int mlx5_modify_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + void *ctx, u32 element_id, + u32 modify_bitmask) +{ + u32 in[MLX5_ST_SZ_DW(modify_scheduling_element_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_scheduling_element_in)] = {0}; + void *schedc; + + schedc = MLX5_ADDR_OF(modify_scheduling_element_in, in, + scheduling_context); + MLX5_SET(modify_scheduling_element_in, in, opcode, + MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT); + MLX5_SET(modify_scheduling_element_in, in, scheduling_element_id, + element_id); + MLX5_SET(modify_scheduling_element_in, in, modify_bitmask, + modify_bitmask); + MLX5_SET(modify_scheduling_element_in, in, scheduling_hierarchy, + hierarchy); + memcpy(schedc, ctx, MLX5_ST_SZ_BYTES(scheduling_context)); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + u32 element_id) +{ + u32 in[MLX5_ST_SZ_DW(destroy_scheduling_element_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_scheduling_element_in)] = {0}; + + MLX5_SET(destroy_scheduling_element_in, in, opcode, + MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT); + MLX5_SET(destroy_scheduling_element_in, in, scheduling_element_id, + element_id); + MLX5_SET(destroy_scheduling_element_in, in, scheduling_hierarchy, + hierarchy); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +/* Finds an entry where we can register the given rate + * If the rate already exists, return the entry where it is registered, + * otherwise return the first available entry. 
+ * If the table is full, return NULL + */ +static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, + struct mlx5_rate_limit *rl) +{ + struct mlx5_rl_entry *ret_entry = NULL; + bool empty_found = false; + int i; + + for (i = 0; i < table->max_size; i++) { + if (mlx5_rl_are_equal(&table->rl_entry[i].rl, rl)) + return &table->rl_entry[i]; + if (!empty_found && !table->rl_entry[i].rl.rate) { + empty_found = true; + ret_entry = &table->rl_entry[i]; + } + } + + return ret_entry; +} + +static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev, + u16 index, + struct mlx5_rate_limit *rl) +{ + u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0}; + + MLX5_SET(set_pp_rate_limit_in, in, opcode, + MLX5_CMD_OP_SET_PP_RATE_LIMIT); + MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index); + MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rl->rate); + MLX5_SET(set_pp_rate_limit_in, in, burst_upper_bound, rl->max_burst_sz); + MLX5_SET(set_pp_rate_limit_in, in, typical_packet_size, rl->typical_pkt_sz); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + + return (rate <= table->max_rate && rate >= table->min_rate); +} +EXPORT_SYMBOL(mlx5_rl_is_in_range); + +bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0, + struct mlx5_rate_limit *rl_1) +{ + return ((rl_0->rate == rl_1->rate) && + (rl_0->max_burst_sz == rl_1->max_burst_sz) && + (rl_0->typical_pkt_sz == rl_1->typical_pkt_sz)); +} +EXPORT_SYMBOL(mlx5_rl_are_equal); + +int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index, + struct mlx5_rate_limit *rl) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + struct mlx5_rl_entry *entry; + int err = 0; + + mutex_lock(&table->rl_lock); + + if (!rl->rate || !mlx5_rl_is_in_range(dev, rl->rate)) { + mlx5_core_err(dev, "Invalid rate: %u, should be %u to %u\n", + rl->rate, table->min_rate, table->max_rate); + err = -EINVAL; + goto out; + } + + entry = find_rl_entry(table, rl); + if (!entry) { + mlx5_core_err(dev, "Max number of %u rates reached\n", + table->max_size); + err = -ENOSPC; + goto out; + } + if (entry->refcount) { + /* rate already configured */ + entry->refcount++; + } else { + /* new rate limit */ + err = mlx5_set_pp_rate_limit_cmd(dev, entry->index, rl); + if (err) { + mlx5_core_err(dev, "Failed configuring rate limit(err %d): \ + rate %u, max_burst_sz %u, typical_pkt_sz %u\n", + err, rl->rate, rl->max_burst_sz, + rl->typical_pkt_sz); + goto out; + } + entry->rl = *rl; + entry->refcount = 1; + } + *index = entry->index; + +out: + mutex_unlock(&table->rl_lock); + return err; +} +EXPORT_SYMBOL(mlx5_rl_add_rate); + +void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + struct mlx5_rl_entry *entry = NULL; + struct mlx5_rate_limit reset_rl = {0}; + + /* 0 is a reserved value for unlimited rate */ + if (rl->rate == 0) + return; + + mutex_lock(&table->rl_lock); + entry = find_rl_entry(table, rl); + if (!entry || !entry->refcount) { + mlx5_core_warn(dev, "Rate %u, max_burst_sz %u typical_pkt_sz %u \ + are not configured\n", + rl->rate, rl->max_burst_sz, rl->typical_pkt_sz); + goto out; + } + + entry->refcount--; + if (!entry->refcount) { + /* need to remove rate */ + mlx5_set_pp_rate_limit_cmd(dev, entry->index, &reset_rl); + entry->rl = reset_rl; + } + +out: + mutex_unlock(&table->rl_lock); +} 
+EXPORT_SYMBOL(mlx5_rl_remove_rate); + +int mlx5_init_rl_table(struct mlx5_core_dev *dev) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + int i; + + mutex_init(&table->rl_lock); + if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing)) { + table->max_size = 0; + return 0; + } + + /* First entry is reserved for unlimited rate */ + table->max_size = MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) - 1; + table->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate); + table->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate); + + table->rl_entry = kcalloc(table->max_size, sizeof(struct mlx5_rl_entry), + GFP_KERNEL); + if (!table->rl_entry) + return -ENOMEM; + + /* The index represents the index in HW rate limit table + * Index 0 is reserved for unlimited rate + */ + for (i = 0; i < table->max_size; i++) + table->rl_entry[i].index = i + 1; + + /* Index 0 is reserved */ + mlx5_core_info(dev, "Rate limit: %u rates are supported, range: %uMbps to %uMbps\n", + table->max_size, + table->min_rate >> 10, + table->max_rate >> 10); + + return 0; +} + +void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + struct mlx5_rate_limit rl = {0}; + int i; + + /* Clear all configured rates */ + for (i = 0; i < table->max_size; i++) + if (table->rl_entry[i].rl.rate) + mlx5_set_pp_rate_limit_cmd(dev, table->rl_entry[i].index, + &rl); + + kfree(dev->priv.rl_table.rl_entry); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c new file mode 100644 index 0000000..2a8b529 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2014, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx5_core.h" +#include "eswitch.h" + +bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + + return !!sriov->num_vfs; +} + +static int sriov_restore_guids(struct mlx5_core_dev *dev, int vf) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct mlx5_hca_vport_context *in; + int err = 0; + + /* Restore sriov guid and policy settings */ + if (sriov->vfs_ctx[vf].node_guid || + sriov->vfs_ctx[vf].port_guid || + sriov->vfs_ctx[vf].policy != MLX5_POLICY_INVALID) { + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->node_guid = sriov->vfs_ctx[vf].node_guid; + in->port_guid = sriov->vfs_ctx[vf].port_guid; + in->policy = sriov->vfs_ctx[vf].policy; + in->field_select = + !!(in->port_guid) * MLX5_HCA_VPORT_SEL_PORT_GUID | + !!(in->node_guid) * MLX5_HCA_VPORT_SEL_NODE_GUID | + !!(in->policy) * MLX5_HCA_VPORT_SEL_STATE_POLICY; + + err = mlx5_core_modify_hca_vport_context(dev, 1, 1, vf + 1, in); + if (err) + mlx5_core_warn(dev, "modify vport context failed, unable to restore VF %d settings\n", vf); + + kfree(in); + } + + return err; +} + +static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err; + int vf; + + if (sriov->enabled_vfs) { + mlx5_core_warn(dev, + "failed to enable SRIOV on device, already enabled with %d vfs\n", + sriov->enabled_vfs); + return -EBUSY; + } + + err = mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY); + if (err) { + mlx5_core_warn(dev, + "failed to enable eswitch SRIOV (%d)\n", err); + return err; + } + + for (vf = 0; vf < num_vfs; vf++) { + err = mlx5_core_enable_hca(dev, vf + 1); + if (err) { + mlx5_core_warn(dev, "failed to enable VF %d (%d)\n", vf, err); + continue; + } + sriov->vfs_ctx[vf].enabled = 1; + sriov->enabled_vfs++; + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) { + err = sriov_restore_guids(dev, vf); + if (err) { + mlx5_core_warn(dev, + "failed to restore VF %d settings, err %d\n", + vf, err); + continue; + } + } + mlx5_core_dbg(dev, "successfully enabled VF* %d\n", vf); + } + + return 0; +} + +static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err; + int vf; + + if (!sriov->enabled_vfs) + goto out; + + for (vf = 0; vf < sriov->num_vfs; vf++) { + if (!sriov->vfs_ctx[vf].enabled) + continue; + err = mlx5_core_disable_hca(dev, vf + 1); + if (err) { + mlx5_core_warn(dev, "failed to disable VF %d\n", vf); + continue; + } + sriov->vfs_ctx[vf].enabled = 0; + sriov->enabled_vfs--; + } + +out: + mlx5_eswitch_disable_sriov(dev->priv.eswitch); + + if (mlx5_wait_for_vf_pages(dev)) + mlx5_core_warn(dev, "timeout reclaiming VFs pages\n"); +} + +static int mlx5_pci_enable_sriov(struct pci_dev *pdev, int num_vfs) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int err = 0; + + if (pci_num_vf(pdev)) { + mlx5_core_warn(dev, "Unable to enable pci sriov, already enabled\n"); + return -EBUSY; + } + + err = pci_enable_sriov(pdev, num_vfs); + if (err) + mlx5_core_warn(dev, "pci_enable_sriov failed : %d\n", err); + + return err; +} + +static void mlx5_pci_disable_sriov(struct pci_dev *pdev) +{ + pci_disable_sriov(pdev); +} + +static int mlx5_sriov_enable(struct pci_dev *pdev, int num_vfs) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err = 0; + + err = mlx5_device_enable_sriov(dev, 
num_vfs); + if (err) { + mlx5_core_warn(dev, "mlx5_device_enable_sriov failed : %d\n", err); + return err; + } + + err = mlx5_pci_enable_sriov(pdev, num_vfs); + if (err) { + mlx5_core_warn(dev, "mlx5_pci_enable_sriov failed : %d\n", err); + mlx5_device_disable_sriov(dev); + return err; + } + + sriov->num_vfs = num_vfs; + + return 0; +} + +static void mlx5_sriov_disable(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + + mlx5_pci_disable_sriov(pdev); + mlx5_device_disable_sriov(dev); + sriov->num_vfs = 0; +} + +int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int err = 0; + + mlx5_core_dbg(dev, "requested num_vfs %d\n", num_vfs); + if (!mlx5_core_is_pf(dev)) + return -EPERM; + + if (num_vfs) { + int ret; + + ret = mlx5_lag_forbid(dev); + if (ret && (ret != -ENODEV)) + return ret; + } + + if (num_vfs) { + err = mlx5_sriov_enable(pdev, num_vfs); + } else { + mlx5_sriov_disable(pdev); + mlx5_lag_allow(dev); + } + + return err ? err : num_vfs; +} + +int mlx5_sriov_attach(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + + if (!mlx5_core_is_pf(dev) || !sriov->num_vfs) + return 0; + + /* If sriov VFs exist in PCI level, enable them in device level */ + return mlx5_device_enable_sriov(dev, sriov->num_vfs); +} + +void mlx5_sriov_detach(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_is_pf(dev)) + return; + + mlx5_device_disable_sriov(dev); +} + +int mlx5_sriov_init(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct pci_dev *pdev = dev->pdev; + int total_vfs; + + if (!mlx5_core_is_pf(dev)) + return 0; + + total_vfs = pci_sriov_get_totalvfs(pdev); + sriov->num_vfs = pci_num_vf(pdev); + sriov->vfs_ctx = kcalloc(total_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL); + if (!sriov->vfs_ctx) + return -ENOMEM; + + return 0; +} + +void mlx5_sriov_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + + if (!mlx5_core_is_pf(dev)) + return; + + kfree(sriov->vfs_ctx); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c new file mode 100644 index 0000000..23cc337 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c @@ -0,0 +1,692 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include + +void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type) +{ + struct mlx5_srq_table *table = &dev->priv.srq_table; + struct mlx5_core_srq *srq; + + spin_lock(&table->lock); + + srq = radix_tree_lookup(&table->tree, srqn); + if (srq) + atomic_inc(&srq->refcount); + + spin_unlock(&table->lock); + + if (!srq) { + mlx5_core_warn(dev, "Async event for bogus SRQ 0x%08x\n", srqn); + return; + } + + srq->event(srq, event_type); + + if (atomic_dec_and_test(&srq->refcount)) + complete(&srq->free); +} + +static int get_pas_size(struct mlx5_srq_attr *in) +{ + u32 log_page_size = in->log_page_size + 12; + u32 log_srq_size = in->log_size; + u32 log_rq_stride = in->wqe_shift; + u32 page_offset = in->page_offset; + u32 po_quanta = 1 << (log_page_size - 6); + u32 rq_sz = 1 << (log_srq_size + 4 + log_rq_stride); + u32 page_size = 1 << log_page_size; + u32 rq_sz_po = rq_sz + (page_offset * po_quanta); + u32 rq_num_pas = (rq_sz_po + page_size - 1) / page_size; + + return rq_num_pas * sizeof(u64); +} + +static void set_wq(void *wq, struct mlx5_srq_attr *in) +{ + MLX5_SET(wq, wq, wq_signature, !!(in->flags + & MLX5_SRQ_FLAG_WQ_SIG)); + MLX5_SET(wq, wq, log_wq_pg_sz, in->log_page_size); + MLX5_SET(wq, wq, log_wq_stride, in->wqe_shift + 4); + MLX5_SET(wq, wq, log_wq_sz, in->log_size); + MLX5_SET(wq, wq, page_offset, in->page_offset); + MLX5_SET(wq, wq, lwm, in->lwm); + MLX5_SET(wq, wq, pd, in->pd); + MLX5_SET64(wq, wq, dbr_addr, in->db_record); +} + +static void set_srqc(void *srqc, struct mlx5_srq_attr *in) +{ + MLX5_SET(srqc, srqc, wq_signature, !!(in->flags + & MLX5_SRQ_FLAG_WQ_SIG)); + MLX5_SET(srqc, srqc, log_page_size, in->log_page_size); + MLX5_SET(srqc, srqc, log_rq_stride, in->wqe_shift); + MLX5_SET(srqc, srqc, log_srq_size, in->log_size); + MLX5_SET(srqc, srqc, page_offset, in->page_offset); + MLX5_SET(srqc, srqc, lwm, in->lwm); + MLX5_SET(srqc, srqc, pd, in->pd); + MLX5_SET64(srqc, srqc, dbr_addr, in->db_record); + MLX5_SET(srqc, srqc, xrcd, in->xrcd); + MLX5_SET(srqc, srqc, cqn, in->cqn); +} + +static void get_wq(void *wq, struct mlx5_srq_attr *in) +{ + if (MLX5_GET(wq, wq, wq_signature)) + in->flags &= MLX5_SRQ_FLAG_WQ_SIG; + in->log_page_size = MLX5_GET(wq, wq, log_wq_pg_sz); + in->wqe_shift = MLX5_GET(wq, wq, log_wq_stride) - 4; + in->log_size = MLX5_GET(wq, wq, log_wq_sz); + in->page_offset = MLX5_GET(wq, wq, page_offset); + in->lwm = MLX5_GET(wq, wq, lwm); + in->pd = MLX5_GET(wq, wq, pd); + in->db_record = MLX5_GET64(wq, wq, dbr_addr); +} + +static void get_srqc(void *srqc, struct mlx5_srq_attr *in) +{ + if (MLX5_GET(srqc, srqc, wq_signature)) + in->flags &= MLX5_SRQ_FLAG_WQ_SIG; + in->log_page_size = MLX5_GET(srqc, srqc, log_page_size); + in->wqe_shift = MLX5_GET(srqc, srqc, log_rq_stride); + in->log_size = MLX5_GET(srqc, srqc, log_srq_size); + in->page_offset = MLX5_GET(srqc, srqc, page_offset); + in->lwm = MLX5_GET(srqc, srqc, lwm); + in->pd = MLX5_GET(srqc, srqc, pd); + in->db_record = MLX5_GET64(srqc, srqc, dbr_addr); +} + +struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn) +{ + struct mlx5_srq_table *table = &dev->priv.srq_table; + struct mlx5_core_srq *srq; + + 
spin_lock(&table->lock); + + srq = radix_tree_lookup(&table->tree, srqn); + if (srq) + atomic_inc(&srq->refcount); + + spin_unlock(&table->lock); + + return srq; +} +EXPORT_SYMBOL(mlx5_core_get_srq); + +static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + u32 create_out[MLX5_ST_SZ_DW(create_srq_out)] = {0}; + void *create_in; + void *srqc; + void *pas; + int pas_size; + int inlen; + int err; + + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_srq_in) + pas_size; + create_in = kvzalloc(inlen, GFP_KERNEL); + if (!create_in) + return -ENOMEM; + + srqc = MLX5_ADDR_OF(create_srq_in, create_in, srq_context_entry); + pas = MLX5_ADDR_OF(create_srq_in, create_in, pas); + + set_srqc(srqc, in); + memcpy(pas, in->pas, pas_size); + + MLX5_SET(create_srq_in, create_in, opcode, + MLX5_CMD_OP_CREATE_SRQ); + + err = mlx5_cmd_exec(dev, create_in, inlen, create_out, + sizeof(create_out)); + kvfree(create_in); + if (!err) + srq->srqn = MLX5_GET(create_srq_out, create_out, srqn); + + return err; +} + +static int destroy_srq_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq) +{ + u32 srq_in[MLX5_ST_SZ_DW(destroy_srq_in)] = {0}; + u32 srq_out[MLX5_ST_SZ_DW(destroy_srq_out)] = {0}; + + MLX5_SET(destroy_srq_in, srq_in, opcode, + MLX5_CMD_OP_DESTROY_SRQ); + MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn); + + return mlx5_cmd_exec(dev, srq_in, sizeof(srq_in), + srq_out, sizeof(srq_out)); +} + +static int arm_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + u16 lwm, int is_srq) +{ + u32 srq_in[MLX5_ST_SZ_DW(arm_rq_in)] = {0}; + u32 srq_out[MLX5_ST_SZ_DW(arm_rq_out)] = {0}; + + MLX5_SET(arm_rq_in, srq_in, opcode, MLX5_CMD_OP_ARM_RQ); + MLX5_SET(arm_rq_in, srq_in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ); + MLX5_SET(arm_rq_in, srq_in, srq_number, srq->srqn); + MLX5_SET(arm_rq_in, srq_in, lwm, lwm); + + return mlx5_cmd_exec(dev, srq_in, sizeof(srq_in), + srq_out, sizeof(srq_out)); +} + +static int query_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 srq_in[MLX5_ST_SZ_DW(query_srq_in)] = {0}; + u32 *srq_out; + void *srqc; + int err; + + srq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_srq_out), GFP_KERNEL); + if (!srq_out) + return -ENOMEM; + + MLX5_SET(query_srq_in, srq_in, opcode, + MLX5_CMD_OP_QUERY_SRQ); + MLX5_SET(query_srq_in, srq_in, srqn, srq->srqn); + err = mlx5_cmd_exec(dev, srq_in, sizeof(srq_in), + srq_out, MLX5_ST_SZ_BYTES(query_srq_out)); + if (err) + goto out; + + srqc = MLX5_ADDR_OF(query_srq_out, srq_out, srq_context_entry); + get_srqc(srqc, out); + if (MLX5_GET(srqc, srqc, state) != MLX5_SRQC_STATE_GOOD) + out->flags |= MLX5_SRQ_FLAG_ERR; +out: + kvfree(srq_out); + return err; +} + +static int create_xrc_srq_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + u32 create_out[MLX5_ST_SZ_DW(create_xrc_srq_out)]; + void *create_in; + void *xrc_srqc; + void *pas; + int pas_size; + int inlen; + int err; + + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_xrc_srq_in) + pas_size; + create_in = kvzalloc(inlen, GFP_KERNEL); + if (!create_in) + return -ENOMEM; + + xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, create_in, + xrc_srq_context_entry); + pas = MLX5_ADDR_OF(create_xrc_srq_in, create_in, pas); + + set_srqc(xrc_srqc, in); + MLX5_SET(xrc_srqc, xrc_srqc, user_index, in->user_index); + memcpy(pas, in->pas, pas_size); + MLX5_SET(create_xrc_srq_in, create_in, opcode, + MLX5_CMD_OP_CREATE_XRC_SRQ); + + memset(create_out, 
0, sizeof(create_out)); + err = mlx5_cmd_exec(dev, create_in, inlen, create_out, + sizeof(create_out)); + if (err) + goto out; + + srq->srqn = MLX5_GET(create_xrc_srq_out, create_out, xrc_srqn); +out: + kvfree(create_in); + return err; +} + +static int destroy_xrc_srq_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq) +{ + u32 xrcsrq_in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)] = {0}; + u32 xrcsrq_out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)] = {0}; + + MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, opcode, + MLX5_CMD_OP_DESTROY_XRC_SRQ); + MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn); + + return mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in), + xrcsrq_out, sizeof(xrcsrq_out)); +} + +static int arm_xrc_srq_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, u16 lwm) +{ + u32 xrcsrq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {0}; + u32 xrcsrq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0}; + + MLX5_SET(arm_xrc_srq_in, xrcsrq_in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ); + MLX5_SET(arm_xrc_srq_in, xrcsrq_in, op_mod, MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ); + MLX5_SET(arm_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn); + MLX5_SET(arm_xrc_srq_in, xrcsrq_in, lwm, lwm); + + return mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in), + xrcsrq_out, sizeof(xrcsrq_out)); +} + +static int query_xrc_srq_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 xrcsrq_in[MLX5_ST_SZ_DW(query_xrc_srq_in)]; + u32 *xrcsrq_out; + void *xrc_srqc; + int err; + + xrcsrq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_xrc_srq_out), GFP_KERNEL); + if (!xrcsrq_out) + return -ENOMEM; + memset(xrcsrq_in, 0, sizeof(xrcsrq_in)); + + MLX5_SET(query_xrc_srq_in, xrcsrq_in, opcode, + MLX5_CMD_OP_QUERY_XRC_SRQ); + MLX5_SET(query_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn); + + err = mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in), xrcsrq_out, + MLX5_ST_SZ_BYTES(query_xrc_srq_out)); + if (err) + goto out; + + xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, xrcsrq_out, + xrc_srq_context_entry); + get_srqc(xrc_srqc, out); + if (MLX5_GET(xrc_srqc, xrc_srqc, state) != MLX5_XRC_SRQC_STATE_GOOD) + out->flags |= MLX5_SRQ_FLAG_ERR; + +out: + kvfree(xrcsrq_out); + return err; +} + +static int create_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + void *create_in; + void *rmpc; + void *wq; + int pas_size; + int inlen; + int err; + + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_rmp_in) + pas_size; + create_in = kvzalloc(inlen, GFP_KERNEL); + if (!create_in) + return -ENOMEM; + + rmpc = MLX5_ADDR_OF(create_rmp_in, create_in, ctx); + wq = MLX5_ADDR_OF(rmpc, rmpc, wq); + + MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY); + set_wq(wq, in); + memcpy(MLX5_ADDR_OF(rmpc, rmpc, wq.pas), in->pas, pas_size); + + err = mlx5_core_create_rmp(dev, create_in, inlen, &srq->srqn); + + kvfree(create_in); + return err; +} + +static int destroy_rmp_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq) +{ + return mlx5_core_destroy_rmp(dev, srq->srqn); +} + +static int arm_rmp_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + u16 lwm) +{ + void *in; + void *rmpc; + void *wq; + void *bitmask; + int err; + + in = kvzalloc(MLX5_ST_SZ_BYTES(modify_rmp_in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + rmpc = MLX5_ADDR_OF(modify_rmp_in, in, ctx); + bitmask = MLX5_ADDR_OF(modify_rmp_in, in, bitmask); + wq = MLX5_ADDR_OF(rmpc, rmpc, wq); + + MLX5_SET(modify_rmp_in, in, rmp_state, MLX5_RMPC_STATE_RDY); + MLX5_SET(modify_rmp_in, in, rmpn, srq->srqn); + MLX5_SET(wq, wq, lwm, 
lwm); + MLX5_SET(rmp_bitmask, bitmask, lwm, 1); + MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY); + + err = mlx5_core_modify_rmp(dev, in, MLX5_ST_SZ_BYTES(modify_rmp_in)); + + kvfree(in); + return err; +} + +static int query_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 *rmp_out; + void *rmpc; + int err; + + rmp_out = kvzalloc(MLX5_ST_SZ_BYTES(query_rmp_out), GFP_KERNEL); + if (!rmp_out) + return -ENOMEM; + + err = mlx5_core_query_rmp(dev, srq->srqn, rmp_out); + if (err) + goto out; + + rmpc = MLX5_ADDR_OF(query_rmp_out, rmp_out, rmp_context); + get_wq(MLX5_ADDR_OF(rmpc, rmpc, wq), out); + if (MLX5_GET(rmpc, rmpc, state) != MLX5_RMPC_STATE_RDY) + out->flags |= MLX5_SRQ_FLAG_ERR; + +out: + kvfree(rmp_out); + return err; +} + +static int create_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + u32 create_out[MLX5_ST_SZ_DW(create_xrq_out)] = {0}; + void *create_in; + void *xrqc; + void *wq; + int pas_size; + int inlen; + int err; + + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_xrq_in) + pas_size; + create_in = kvzalloc(inlen, GFP_KERNEL); + if (!create_in) + return -ENOMEM; + + xrqc = MLX5_ADDR_OF(create_xrq_in, create_in, xrq_context); + wq = MLX5_ADDR_OF(xrqc, xrqc, wq); + + set_wq(wq, in); + memcpy(MLX5_ADDR_OF(xrqc, xrqc, wq.pas), in->pas, pas_size); + + if (in->type == IB_SRQT_TM) { + MLX5_SET(xrqc, xrqc, topology, MLX5_XRQC_TOPOLOGY_TAG_MATCHING); + if (in->flags & MLX5_SRQ_FLAG_RNDV) + MLX5_SET(xrqc, xrqc, offload, MLX5_XRQC_OFFLOAD_RNDV); + MLX5_SET(xrqc, xrqc, + tag_matching_topology_context.log_matching_list_sz, + in->tm_log_list_size); + } + MLX5_SET(xrqc, xrqc, user_index, in->user_index); + MLX5_SET(xrqc, xrqc, cqn, in->cqn); + MLX5_SET(create_xrq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRQ); + err = mlx5_cmd_exec(dev, create_in, inlen, create_out, + sizeof(create_out)); + kvfree(create_in); + if (!err) + srq->srqn = MLX5_GET(create_xrq_out, create_out, xrqn); + + return err; +} + +static int destroy_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_xrq_out)] = {0}; + + MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ); + MLX5_SET(destroy_xrq_in, in, xrqn, srq->srqn); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int arm_xrq_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + u16 lwm) +{ + u32 out[MLX5_ST_SZ_DW(arm_rq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {0}; + + MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ); + MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_XRQ); + MLX5_SET(arm_rq_in, in, srq_number, srq->srqn); + MLX5_SET(arm_rq_in, in, lwm, lwm); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int query_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {0}; + u32 *xrq_out; + int outlen = MLX5_ST_SZ_BYTES(query_xrq_out); + void *xrqc; + int err; + + xrq_out = kvzalloc(outlen, GFP_KERNEL); + if (!xrq_out) + return -ENOMEM; + + MLX5_SET(query_xrq_in, in, opcode, MLX5_CMD_OP_QUERY_XRQ); + MLX5_SET(query_xrq_in, in, xrqn, srq->srqn); + + err = mlx5_cmd_exec(dev, in, sizeof(in), xrq_out, outlen); + if (err) + goto out; + + xrqc = MLX5_ADDR_OF(query_xrq_out, xrq_out, xrq_context); + get_wq(MLX5_ADDR_OF(xrqc, xrqc, wq), out); + if (MLX5_GET(xrqc, xrqc, state) != MLX5_XRQC_STATE_GOOD) 
+ out->flags |= MLX5_SRQ_FLAG_ERR; + out->tm_next_tag = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.append_next_index); + out->tm_hw_phase_cnt = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.hw_phase_cnt); + out->tm_sw_phase_cnt = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.sw_phase_cnt); + +out: + kvfree(xrq_out); + return err; +} + +static int create_srq_split(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + if (!dev->issi) + return create_srq_cmd(dev, srq, in); + switch (srq->common.res) { + case MLX5_RES_XSRQ: + return create_xrc_srq_cmd(dev, srq, in); + case MLX5_RES_XRQ: + return create_xrq_cmd(dev, srq, in); + default: + return create_rmp_cmd(dev, srq, in); + } +} + +static int destroy_srq_split(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq) +{ + if (!dev->issi) + return destroy_srq_cmd(dev, srq); + switch (srq->common.res) { + case MLX5_RES_XSRQ: + return destroy_xrc_srq_cmd(dev, srq); + case MLX5_RES_XRQ: + return destroy_xrq_cmd(dev, srq); + default: + return destroy_rmp_cmd(dev, srq); + } +} + +int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + int err; + struct mlx5_srq_table *table = &dev->priv.srq_table; + + switch (in->type) { + case IB_SRQT_XRC: + srq->common.res = MLX5_RES_XSRQ; + break; + case IB_SRQT_TM: + srq->common.res = MLX5_RES_XRQ; + break; + default: + srq->common.res = MLX5_RES_SRQ; + } + + err = create_srq_split(dev, srq, in); + if (err) + return err; + + atomic_set(&srq->refcount, 1); + init_completion(&srq->free); + + spin_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, srq->srqn, srq); + spin_unlock_irq(&table->lock); + if (err) { + mlx5_core_warn(dev, "err %d, srqn 0x%x\n", err, srq->srqn); + goto err_destroy_srq_split; + } + + return 0; + +err_destroy_srq_split: + destroy_srq_split(dev, srq); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_srq); + +int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq) +{ + struct mlx5_srq_table *table = &dev->priv.srq_table; + struct mlx5_core_srq *tmp; + int err; + + spin_lock_irq(&table->lock); + tmp = radix_tree_delete(&table->tree, srq->srqn); + spin_unlock_irq(&table->lock); + if (!tmp) { + mlx5_core_warn(dev, "srq 0x%x not found in tree\n", srq->srqn); + return -EINVAL; + } + if (tmp != srq) { + mlx5_core_warn(dev, "corruption on srqn 0x%x\n", srq->srqn); + return -EINVAL; + } + + err = destroy_srq_split(dev, srq); + if (err) + return err; + + if (atomic_dec_and_test(&srq->refcount)) + complete(&srq->free); + wait_for_completion(&srq->free); + + return 0; +} +EXPORT_SYMBOL(mlx5_core_destroy_srq); + +int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + if (!dev->issi) + return query_srq_cmd(dev, srq, out); + switch (srq->common.res) { + case MLX5_RES_XSRQ: + return query_xrc_srq_cmd(dev, srq, out); + case MLX5_RES_XRQ: + return query_xrq_cmd(dev, srq, out); + default: + return query_rmp_cmd(dev, srq, out); + } +} +EXPORT_SYMBOL(mlx5_core_query_srq); + +int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + u16 lwm, int is_srq) +{ + if (!dev->issi) + return arm_srq_cmd(dev, srq, lwm, is_srq); + switch (srq->common.res) { + case MLX5_RES_XSRQ: + return arm_xrc_srq_cmd(dev, srq, lwm); + case MLX5_RES_XRQ: + return arm_xrq_cmd(dev, srq, lwm); + default: + return arm_rmp_cmd(dev, srq, lwm); + } +} +EXPORT_SYMBOL(mlx5_core_arm_srq); + +void 
mlx5_init_srq_table(struct mlx5_core_dev *dev) +{ + struct mlx5_srq_table *table = &dev->priv.srq_table; + + memset(table, 0, sizeof(*table)); + spin_lock_init(&table->lock); + INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); +} + +void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev) +{ + /* nothing */ +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c new file mode 100644 index 0000000..dae1c5c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -0,0 +1,618 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "mlx5_core.h" +#include + +int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) +{ + u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {0}; + int err; + + MLX5_SET(alloc_transport_domain_in, in, opcode, + MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *tdn = MLX5_GET(alloc_transport_domain_out, out, + transport_domain); + + return err; +} +EXPORT_SYMBOL(mlx5_core_alloc_transport_domain); + +void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)] = {0}; + + MLX5_SET(dealloc_transport_domain_in, in, opcode, + MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); + MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_dealloc_transport_domain); + +int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn) +{ + u32 out[MLX5_ST_SZ_DW(create_rq_out)] = {0}; + int err; + + MLX5_SET(create_rq_in, in, opcode, MLX5_CMD_OP_CREATE_RQ); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *rqn = MLX5_GET(create_rq_out, out, rqn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_rq); + +int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_rq_out)]; + + MLX5_SET(modify_rq_in, in, rqn, rqn); + MLX5_SET(modify_rq_in, in, opcode, MLX5_CMD_OP_MODIFY_RQ); + + memset(out, 0, sizeof(out)); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_modify_rq); + +void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {0}; + + MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); + MLX5_SET(destroy_rq_in, in, rqn, rqn); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_destroy_rq); + +int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_rq_in)] = {0}; + int outlen = MLX5_ST_SZ_BYTES(query_rq_out); + + MLX5_SET(query_rq_in, in, opcode, MLX5_CMD_OP_QUERY_RQ); + MLX5_SET(query_rq_in, in, rqn, rqn); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} +EXPORT_SYMBOL(mlx5_core_query_rq); + +int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn) +{ + u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {0}; + int err; + + MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *sqn = MLX5_GET(create_sq_out, out, sqn); + + return err; +} + +int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_sq_out)] = {0}; + + MLX5_SET(modify_sq_in, in, sqn, sqn); + MLX5_SET(modify_sq_in, in, opcode, MLX5_CMD_OP_MODIFY_SQ); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_modify_sq); + +void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_sq_out)] = {0}; + + MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ); + MLX5_SET(destroy_sq_in, in, sqn, sqn); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_core_query_sq(struct 
mlx5_core_dev *dev, u32 sqn, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {0}; + int outlen = MLX5_ST_SZ_BYTES(query_sq_out); + + MLX5_SET(query_sq_in, in, opcode, MLX5_CMD_OP_QUERY_SQ); + MLX5_SET(query_sq_in, in, sqn, sqn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} +EXPORT_SYMBOL(mlx5_core_query_sq); + +int mlx5_core_query_sq_state(struct mlx5_core_dev *dev, u32 sqn, u8 *state) +{ + void *out; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_sq_out); + out = kvzalloc(inlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_sq(dev, sqn, out); + if (err) + goto out; + + sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context); + *state = MLX5_GET(sqc, sqc, state); + +out: + kvfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_sq_state); + +int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *tirn) +{ + u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {0}; + int err; + + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + + memset(out, 0, sizeof(out)); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *tirn = MLX5_GET(create_tir_out, out, tirn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_tir); + +int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_tir_out)] = {0}; + + MLX5_SET(modify_tir_in, in, tirn, tirn); + MLX5_SET(modify_tir_in, in, opcode, MLX5_CMD_OP_MODIFY_TIR); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} + +void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_tir_out)] = {0}; + + MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR); + MLX5_SET(destroy_tir_in, in, tirn, tirn); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_destroy_tir); + +int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *tisn) +{ + u32 out[MLX5_ST_SZ_DW(create_tis_out)] = {0}; + int err; + + MLX5_SET(create_tis_in, in, opcode, MLX5_CMD_OP_CREATE_TIS); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *tisn = MLX5_GET(create_tis_out, out, tisn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_tis); + +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_tis_out)] = {0}; + + MLX5_SET(modify_tis_in, in, tisn, tisn); + MLX5_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS); + + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_modify_tis); + +void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_tis_out)] = {0}; + + MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS); + MLX5_SET(destroy_tis_in, in, tisn, tisn); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_destroy_tis); + +int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rmpn) +{ + u32 out[MLX5_ST_SZ_DW(create_rmp_out)] = {0}; + int err; + + MLX5_SET(create_rmp_in, in, opcode, MLX5_CMD_OP_CREATE_RMP); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *rmpn = MLX5_GET(create_rmp_out, out, rmpn); + + return err; +} + +int mlx5_core_modify_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_rmp_out)] = {0}; + + MLX5_SET(modify_rmp_in, in, opcode, 
MLX5_CMD_OP_MODIFY_RMP); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} + +int mlx5_core_destroy_rmp(struct mlx5_core_dev *dev, u32 rmpn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_rmp_out)] = {0}; + + MLX5_SET(destroy_rmp_in, in, opcode, MLX5_CMD_OP_DESTROY_RMP); + MLX5_SET(destroy_rmp_in, in, rmpn, rmpn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, + sizeof(out)); +} + +int mlx5_core_query_rmp(struct mlx5_core_dev *dev, u32 rmpn, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_rmp_in)] = {0}; + int outlen = MLX5_ST_SZ_BYTES(query_rmp_out); + + MLX5_SET(query_rmp_in, in, opcode, MLX5_CMD_OP_QUERY_RMP); + MLX5_SET(query_rmp_in, in, rmpn, rmpn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} + +int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm) +{ + void *in; + void *rmpc; + void *wq; + void *bitmask; + int err; + + in = kvzalloc(MLX5_ST_SZ_BYTES(modify_rmp_in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + rmpc = MLX5_ADDR_OF(modify_rmp_in, in, ctx); + bitmask = MLX5_ADDR_OF(modify_rmp_in, in, bitmask); + wq = MLX5_ADDR_OF(rmpc, rmpc, wq); + + MLX5_SET(modify_rmp_in, in, rmp_state, MLX5_RMPC_STATE_RDY); + MLX5_SET(modify_rmp_in, in, rmpn, rmpn); + MLX5_SET(wq, wq, lwm, lwm); + MLX5_SET(rmp_bitmask, bitmask, lwm, 1); + MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY); + + err = mlx5_core_modify_rmp(dev, in, MLX5_ST_SZ_BYTES(modify_rmp_in)); + + kvfree(in); + + return err; +} + +int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *xsrqn) +{ + u32 out[MLX5_ST_SZ_DW(create_xrc_srq_out)] = {0}; + int err; + + MLX5_SET(create_xrc_srq_in, in, opcode, MLX5_CMD_OP_CREATE_XRC_SRQ); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *xsrqn = MLX5_GET(create_xrc_srq_out, out, xrc_srqn); + + return err; +} + +int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 xsrqn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)] = {0}; + + MLX5_SET(destroy_xrc_srq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRC_SRQ); + MLX5_SET(destroy_xrc_srq_in, in, xrc_srqn, xsrqn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 xsrqn, u16 lwm) +{ + u32 in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0}; + + MLX5_SET(arm_xrc_srq_in, in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ); + MLX5_SET(arm_xrc_srq_in, in, xrc_srqn, xsrqn); + MLX5_SET(arm_xrc_srq_in, in, lwm, lwm); + MLX5_SET(arm_xrc_srq_in, in, op_mod, + MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rqtn) +{ + u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0}; + int err; + + MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *rqtn = MLX5_GET(create_rqt_out, out, rqtn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_rqt); + +int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_rqt_out)] = {0}; + + MLX5_SET(modify_rqt_in, in, rqtn, rqtn); + MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} + +void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = 
{0}; + + MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); + MLX5_SET(destroy_rqt_in, in, rqtn, rqtn); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_destroy_rqt); + +static int mlx5_hairpin_create_rq(struct mlx5_core_dev *mdev, + struct mlx5_hairpin_params *params, u32 *rqn) +{ + u32 in[MLX5_ST_SZ_DW(create_rq_in)] = {0}; + void *rqc, *wq; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + + MLX5_SET(rqc, rqc, hairpin, 1); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, counter_set_id, params->q_counter); + + MLX5_SET(wq, wq, log_hairpin_data_sz, params->log_data_size); + MLX5_SET(wq, wq, log_hairpin_num_packets, params->log_num_packets); + + return mlx5_core_create_rq(mdev, in, MLX5_ST_SZ_BYTES(create_rq_in), rqn); +} + +static int mlx5_hairpin_create_sq(struct mlx5_core_dev *mdev, + struct mlx5_hairpin_params *params, u32 *sqn) +{ + u32 in[MLX5_ST_SZ_DW(create_sq_in)] = {0}; + void *sqc, *wq; + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + + MLX5_SET(sqc, sqc, hairpin, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + + MLX5_SET(wq, wq, log_hairpin_data_sz, params->log_data_size); + MLX5_SET(wq, wq, log_hairpin_num_packets, params->log_num_packets); + + return mlx5_core_create_sq(mdev, in, MLX5_ST_SZ_BYTES(create_sq_in), sqn); +} + +static int mlx5_hairpin_create_queues(struct mlx5_hairpin *hp, + struct mlx5_hairpin_params *params) +{ + int i, j, err; + + for (i = 0; i < hp->num_channels; i++) { + err = mlx5_hairpin_create_rq(hp->func_mdev, params, &hp->rqn[i]); + if (err) + goto out_err_rq; + } + + for (i = 0; i < hp->num_channels; i++) { + err = mlx5_hairpin_create_sq(hp->peer_mdev, params, &hp->sqn[i]); + if (err) + goto out_err_sq; + } + + return 0; + +out_err_sq: + for (j = 0; j < i; j++) + mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[j]); + i = hp->num_channels; +out_err_rq: + for (j = 0; j < i; j++) + mlx5_core_destroy_rq(hp->func_mdev, hp->rqn[j]); + return err; +} + +static void mlx5_hairpin_destroy_queues(struct mlx5_hairpin *hp) +{ + int i; + + for (i = 0; i < hp->num_channels; i++) { + mlx5_core_destroy_rq(hp->func_mdev, hp->rqn[i]); + mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[i]); + } +} + +static int mlx5_hairpin_modify_rq(struct mlx5_core_dev *func_mdev, u32 rqn, + int curr_state, int next_state, + u16 peer_vhca, u32 peer_sq) +{ + u32 in[MLX5_ST_SZ_DW(modify_rq_in)] = {0}; + void *rqc; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + if (next_state == MLX5_RQC_STATE_RDY) { + MLX5_SET(rqc, rqc, hairpin_peer_sq, peer_sq); + MLX5_SET(rqc, rqc, hairpin_peer_vhca, peer_vhca); + } + + MLX5_SET(modify_rq_in, in, rq_state, curr_state); + MLX5_SET(rqc, rqc, state, next_state); + + return mlx5_core_modify_rq(func_mdev, rqn, + in, MLX5_ST_SZ_BYTES(modify_rq_in)); +} + +static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn, + int curr_state, int next_state, + u16 peer_vhca, u32 peer_rq) +{ + u32 in[MLX5_ST_SZ_DW(modify_sq_in)] = {0}; + void *sqc; + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + + if (next_state == MLX5_RQC_STATE_RDY) { + MLX5_SET(sqc, sqc, hairpin_peer_rq, peer_rq); + MLX5_SET(sqc, sqc, hairpin_peer_vhca, peer_vhca); + } + + MLX5_SET(modify_sq_in, in, sq_state, curr_state); + MLX5_SET(sqc, sqc, state, next_state); + + return mlx5_core_modify_sq(peer_mdev, sqn, + in, MLX5_ST_SZ_BYTES(modify_sq_in)); +} + +static int mlx5_hairpin_pair_queues(struct mlx5_hairpin *hp) +{ + int i, j, err; + + /* set peer 
SQs */ + for (i = 0; i < hp->num_channels; i++) { + err = mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i], + MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY, + MLX5_CAP_GEN(hp->func_mdev, vhca_id), hp->rqn[i]); + if (err) + goto err_modify_sq; + } + + /* set func RQs */ + for (i = 0; i < hp->num_channels; i++) { + err = mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[i], + MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY, + MLX5_CAP_GEN(hp->peer_mdev, vhca_id), hp->sqn[i]); + if (err) + goto err_modify_rq; + } + + return 0; + +err_modify_rq: + for (j = 0; j < i; j++) + mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[j], MLX5_RQC_STATE_RDY, + MLX5_RQC_STATE_RST, 0, 0); + i = hp->num_channels; +err_modify_sq: + for (j = 0; j < i; j++) + mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[j], MLX5_SQC_STATE_RDY, + MLX5_SQC_STATE_RST, 0, 0); + return err; +} + +static void mlx5_hairpin_unpair_queues(struct mlx5_hairpin *hp) +{ + int i; + + /* unset func RQs */ + for (i = 0; i < hp->num_channels; i++) + mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[i], MLX5_RQC_STATE_RDY, + MLX5_RQC_STATE_RST, 0, 0); + + /* unset peer SQs */ + for (i = 0; i < hp->num_channels; i++) + mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i], MLX5_SQC_STATE_RDY, + MLX5_SQC_STATE_RST, 0, 0); +} + +struct mlx5_hairpin * +mlx5_core_hairpin_create(struct mlx5_core_dev *func_mdev, + struct mlx5_core_dev *peer_mdev, + struct mlx5_hairpin_params *params) +{ + struct mlx5_hairpin *hp; + int size, err; + + size = sizeof(*hp) + params->num_channels * 2 * sizeof(u32); + hp = kzalloc(size, GFP_KERNEL); + if (!hp) + return ERR_PTR(-ENOMEM); + + hp->func_mdev = func_mdev; + hp->peer_mdev = peer_mdev; + hp->num_channels = params->num_channels; + + hp->rqn = (void *)hp + sizeof(*hp); + hp->sqn = hp->rqn + params->num_channels; + + /* alloc and pair func --> peer hairpin */ + err = mlx5_hairpin_create_queues(hp, params); + if (err) + goto err_create_queues; + + err = mlx5_hairpin_pair_queues(hp); + if (err) + goto err_pair_queues; + + return hp; + +err_pair_queues: + mlx5_hairpin_destroy_queues(hp); +err_create_queues: + kfree(hp); + return ERR_PTR(err); +} + +void mlx5_core_hairpin_destroy(struct mlx5_hairpin *hp) +{ + mlx5_hairpin_unpair_queues(hp); + mlx5_hairpin_destroy_queues(hp); + kfree(hp); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c new file mode 100644 index 0000000..8b97066 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn) +{ + u32 out[MLX5_ST_SZ_DW(alloc_uar_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(alloc_uar_in)] = {0}; + int err; + + MLX5_SET(alloc_uar_in, in, opcode, MLX5_CMD_OP_ALLOC_UAR); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *uarn = MLX5_GET(alloc_uar_out, out, uar); + return err; +} +EXPORT_SYMBOL(mlx5_cmd_alloc_uar); + +int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn) +{ + u32 out[MLX5_ST_SZ_DW(dealloc_uar_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_uar_in)] = {0}; + + MLX5_SET(dealloc_uar_in, in, opcode, MLX5_CMD_OP_DEALLOC_UAR); + MLX5_SET(dealloc_uar_in, in, uar, uarn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_cmd_free_uar); + +static int uars_per_sys_page(struct mlx5_core_dev *mdev) +{ + if (MLX5_CAP_GEN(mdev, uar_4k)) + return MLX5_CAP_GEN(mdev, num_of_uars_per_page); + + return 1; +} + +static u64 uar2pfn(struct mlx5_core_dev *mdev, u32 index) +{ + u32 system_page_index; + + if (MLX5_CAP_GEN(mdev, uar_4k)) + system_page_index = index >> (PAGE_SHIFT - MLX5_ADAPTER_PAGE_SHIFT); + else + system_page_index = index; + + return (pci_resource_start(mdev->pdev, 0) >> PAGE_SHIFT) + system_page_index; +} + +static void up_rel_func(struct kref *kref) +{ + struct mlx5_uars_page *up = container_of(kref, struct mlx5_uars_page, ref_count); + + list_del(&up->list); + iounmap(up->map); + if (mlx5_cmd_free_uar(up->mdev, up->index)) + mlx5_core_warn(up->mdev, "failed to free uar index %d\n", up->index); + kfree(up->reg_bitmap); + kfree(up->fp_bitmap); + kfree(up); +} + +static struct mlx5_uars_page *alloc_uars_page(struct mlx5_core_dev *mdev, + bool map_wc) +{ + struct mlx5_uars_page *up; + int err = -ENOMEM; + phys_addr_t pfn; + int bfregs; + int i; + + bfregs = uars_per_sys_page(mdev) * MLX5_BFREGS_PER_UAR; + up = kzalloc(sizeof(*up), GFP_KERNEL); + if (!up) + return ERR_PTR(err); + + up->mdev = mdev; + up->reg_bitmap = kcalloc(BITS_TO_LONGS(bfregs), sizeof(unsigned long), GFP_KERNEL); + if (!up->reg_bitmap) + goto error1; + + up->fp_bitmap = kcalloc(BITS_TO_LONGS(bfregs), sizeof(unsigned long), GFP_KERNEL); + if (!up->fp_bitmap) + goto error1; + + for (i = 0; i < bfregs; i++) + if ((i % MLX5_BFREGS_PER_UAR) < MLX5_NON_FP_BFREGS_PER_UAR) + set_bit(i, up->reg_bitmap); + else + set_bit(i, up->fp_bitmap); + + up->bfregs = bfregs; + up->fp_avail = bfregs * MLX5_FP_BFREGS_PER_UAR / MLX5_BFREGS_PER_UAR; + up->reg_avail = bfregs * MLX5_NON_FP_BFREGS_PER_UAR / MLX5_BFREGS_PER_UAR; + + err = mlx5_cmd_alloc_uar(mdev, &up->index); + if (err) { + mlx5_core_warn(mdev, "mlx5_cmd_alloc_uar() failed, %d\n", err); + goto error1; + } + + pfn = uar2pfn(mdev, up->index); + if (map_wc) { + up->map = ioremap_wc(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!up->map) { + err = -EAGAIN; + goto error2; + } + } else { + up->map = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!up->map) { + err = -ENOMEM; + goto error2; + } + } + 
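/* the UAR page is mapped; start it with a single reference owned by the caller */ +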
kref_init(&up->ref_count); + mlx5_core_dbg(mdev, "allocated UAR page: index %d, total bfregs %d\n", + up->index, up->bfregs); + return up; + +error2: + if (mlx5_cmd_free_uar(mdev, up->index)) + mlx5_core_warn(mdev, "failed to free uar index %d\n", up->index); +error1: + kfree(up->fp_bitmap); + kfree(up->reg_bitmap); + kfree(up); + return ERR_PTR(err); +} + +struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev) +{ + struct mlx5_uars_page *ret; + + mutex_lock(&mdev->priv.bfregs.reg_head.lock); + if (!list_empty(&mdev->priv.bfregs.reg_head.list)) { + ret = list_first_entry(&mdev->priv.bfregs.reg_head.list, + struct mlx5_uars_page, list); + kref_get(&ret->ref_count); + goto out; + } + ret = alloc_uars_page(mdev, false); + if (IS_ERR(ret)) + goto out; + list_add(&ret->list, &mdev->priv.bfregs.reg_head.list); +out: + mutex_unlock(&mdev->priv.bfregs.reg_head.lock); + + return ret; +} +EXPORT_SYMBOL(mlx5_get_uars_page); + +void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up) +{ + mutex_lock(&mdev->priv.bfregs.reg_head.lock); + kref_put(&up->ref_count, up_rel_func); + mutex_unlock(&mdev->priv.bfregs.reg_head.lock); +} +EXPORT_SYMBOL(mlx5_put_uars_page); + +static unsigned long map_offset(struct mlx5_core_dev *mdev, int dbi) +{ + /* return the offset in bytes from the start of the page to the + * blue flame area of the UAR + */ + return dbi / MLX5_BFREGS_PER_UAR * MLX5_ADAPTER_PAGE_SIZE + + (dbi % MLX5_BFREGS_PER_UAR) * + (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) + MLX5_BF_OFFSET; +} + +static int alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg, + bool map_wc, bool fast_path) +{ + struct mlx5_bfreg_data *bfregs; + struct mlx5_uars_page *up; + struct list_head *head; + unsigned long *bitmap; + unsigned int *avail; + struct mutex *lock; /* pointer to right mutex */ + int dbi; + + bfregs = &mdev->priv.bfregs; + if (map_wc) { + head = &bfregs->wc_head.list; + lock = &bfregs->wc_head.lock; + } else { + head = &bfregs->reg_head.list; + lock = &bfregs->reg_head.lock; + } + mutex_lock(lock); + if (list_empty(head)) { + up = alloc_uars_page(mdev, map_wc); + if (IS_ERR(up)) { + mutex_unlock(lock); + return PTR_ERR(up); + } + list_add(&up->list, head); + } else { + up = list_entry(head->next, struct mlx5_uars_page, list); + kref_get(&up->ref_count); + } + if (fast_path) { + bitmap = up->fp_bitmap; + avail = &up->fp_avail; + } else { + bitmap = up->reg_bitmap; + avail = &up->reg_avail; + } + dbi = find_first_bit(bitmap, up->bfregs); + clear_bit(dbi, bitmap); + (*avail)--; + if (!(*avail)) + list_del(&up->list); + + bfreg->map = up->map + map_offset(mdev, dbi); + bfreg->up = up; + bfreg->wc = map_wc; + bfreg->index = up->index + dbi / MLX5_BFREGS_PER_UAR; + mutex_unlock(lock); + + return 0; +} + +int mlx5_alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg, + bool map_wc, bool fast_path) +{ + int err; + + err = alloc_bfreg(mdev, bfreg, map_wc, fast_path); + if (!err) + return 0; + + if (err == -EAGAIN && map_wc) + return alloc_bfreg(mdev, bfreg, false, fast_path); + + return err; +} +EXPORT_SYMBOL(mlx5_alloc_bfreg); + +static unsigned int addr_to_dbi_in_syspage(struct mlx5_core_dev *dev, + struct mlx5_uars_page *up, + struct mlx5_sq_bfreg *bfreg) +{ + unsigned int uar_idx; + unsigned int bfreg_idx; + unsigned int bf_reg_size; + + bf_reg_size = 1 << MLX5_CAP_GEN(dev, log_bf_reg_size); + + uar_idx = (bfreg->map - up->map) >> MLX5_ADAPTER_PAGE_SHIFT; + bfreg_idx = (((uintptr_t)bfreg->map % MLX5_ADAPTER_PAGE_SIZE) - MLX5_BF_OFFSET) / 
bf_reg_size; + + return uar_idx * MLX5_BFREGS_PER_UAR + bfreg_idx; +} + +void mlx5_free_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg) +{ + struct mlx5_bfreg_data *bfregs; + struct mlx5_uars_page *up; + struct mutex *lock; /* pointer to right mutex */ + unsigned int dbi; + bool fp; + unsigned int *avail; + unsigned long *bitmap; + struct list_head *head; + + bfregs = &mdev->priv.bfregs; + if (bfreg->wc) { + head = &bfregs->wc_head.list; + lock = &bfregs->wc_head.lock; + } else { + head = &bfregs->reg_head.list; + lock = &bfregs->reg_head.lock; + } + up = bfreg->up; + dbi = addr_to_dbi_in_syspage(mdev, up, bfreg); + fp = (dbi % MLX5_BFREGS_PER_UAR) >= MLX5_NON_FP_BFREGS_PER_UAR; + if (fp) { + avail = &up->fp_avail; + bitmap = up->fp_bitmap; + } else { + avail = &up->reg_avail; + bitmap = up->reg_bitmap; + } + mutex_lock(lock); + (*avail)++; + set_bit(dbi, bitmap); + if (*avail == 1) + list_add_tail(&up->list, head); + + kref_put(&up->ref_count, up_rel_func); + mutex_unlock(lock); +} +EXPORT_SYMBOL(mlx5_free_bfreg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c new file mode 100644 index 0000000..177e076 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -0,0 +1,1217 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +/* Mutex to hold while enabling or disabling RoCE */ +static DEFINE_MUTEX(mlx5_roce_en_lock); + +static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {0}; + + MLX5_SET(query_vport_state_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_vport_state_in, in, other_vport, 1); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); +} + +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {0}; + + _mlx5_query_vport_state(mdev, opmod, vport, out, sizeof(out)); + + return MLX5_GET(query_vport_state_out, out, state); +} +EXPORT_SYMBOL_GPL(mlx5_query_vport_state); + +u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {0}; + + _mlx5_query_vport_state(mdev, opmod, vport, out, sizeof(out)); + + return MLX5_GET(query_vport_state_out, out, admin_state); +} +EXPORT_SYMBOL_GPL(mlx5_query_vport_admin_state); + +int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 state) +{ + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(modify_vport_state_out)] = {0}; + + MLX5_SET(modify_vport_state_in, in, opcode, + MLX5_CMD_OP_MODIFY_VPORT_STATE); + MLX5_SET(modify_vport_state_in, in, op_mod, opmod); + MLX5_SET(modify_vport_state_in, in, vport_number, vport); + if (vport) + MLX5_SET(modify_vport_state_in, in, other_vport, 1); + MLX5_SET(modify_vport_state_in, in, admin_state, state); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_modify_vport_admin_state); + +static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0}; + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen); +} + +static int mlx5_modify_nic_vport_context(struct mlx5_core_dev *mdev, void *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {0}; + + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + return mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); +} + +int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, + u16 vport, u8 *min_inline) +{ + u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {0}; + int err; + + err = mlx5_query_nic_vport_context(mdev, vport, out, sizeof(out)); + if (!err) + *min_inline = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.min_wqe_inline_mode); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_min_inline); + +void mlx5_query_min_inline(struct mlx5_core_dev *mdev, + u8 *min_inline_mode) +{ + switch (MLX5_CAP_ETH(mdev, wqe_inline_mode)) { + case MLX5_CAP_INLINE_MODE_L2: + *min_inline_mode = MLX5_INLINE_MODE_L2; + break; + case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: + mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode); + break; + case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: + *min_inline_mode = 
MLX5_INLINE_MODE_NONE; + break; + } +} +EXPORT_SYMBOL_GPL(mlx5_query_min_inline); + +int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, + u16 vport, u8 min_inline) +{ + u32 in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {0}; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *nic_vport_ctx; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.min_inline, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + MLX5_SET(nic_vport_context, nic_vport_ctx, + min_wqe_inline_mode, min_inline); + + return mlx5_modify_nic_vport_context(mdev, in, inlen); +} + +int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, u8 *addr) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u8 *out_addr; + int err; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + out_addr = MLX5_ADDR_OF(query_nic_vport_context_out, out, + nic_vport_context.permanent_address); + + err = mlx5_query_nic_vport_context(mdev, vport, out, outlen); + if (!err) + ether_addr_copy(addr, &out_addr[2]); + + kvfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_address); + +int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, u8 *addr) +{ + void *in; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + int err; + void *nic_vport_ctx; + u8 *perm_mac; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.permanent_address, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + + if (vport) + MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + perm_mac = MLX5_ADDR_OF(nic_vport_context, nic_vport_ctx, + permanent_address); + + ether_addr_copy(&perm_mac[2], addr); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_address); + +int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu) +{ + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 *out; + int err; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out, outlen); + if (!err) + *mtu = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.mtu); + + kvfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mtu); + +int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1); + MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu, mtu); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mtu); + +int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, + u32 vport, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int *list_size) +{ + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0}; + void *nic_vport_ctx; + int max_list_size; + int req_list_size; + int out_sz; + void *out; + int err; + int i; + + req_list_size = *list_size; + + max_list_size = 
list_type == MLX5_NVPRT_LIST_TYPE_UC ? + 1 << MLX5_CAP_GEN(dev, log_max_current_uc_list) : + 1 << MLX5_CAP_GEN(dev, log_max_current_mc_list); + + if (req_list_size > max_list_size) { + mlx5_core_warn(dev, "Requested list size (%d) > (%d) max_list_size\n", + req_list_size, max_list_size); + req_list_size = max_list_size; + } + + out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + req_list_size * MLX5_ST_SZ_BYTES(mac_address_layout); + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + MLX5_SET(query_nic_vport_context_in, in, allowed_list_type, list_type); + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + + if (vport) + MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + nic_vport_ctx = MLX5_ADDR_OF(query_nic_vport_context_out, out, + nic_vport_context); + req_list_size = MLX5_GET(nic_vport_context, nic_vport_ctx, + allowed_list_size); + + *list_size = req_list_size; + for (i = 0; i < req_list_size; i++) { + u8 *mac_addr = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]) + 2; + ether_addr_copy(addr_list[i], mac_addr); + } +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_list); + +int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int list_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + void *nic_vport_ctx; + int max_list_size; + int in_sz; + void *in; + int err; + int i; + + max_list_size = list_type == MLX5_NVPRT_LIST_TYPE_UC ? + 1 << MLX5_CAP_GEN(dev, log_max_current_uc_list) : + 1 << MLX5_CAP_GEN(dev, log_max_current_mc_list); + + if (list_size > max_list_size) + return -ENOSPC; + + in_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + list_size * MLX5_ST_SZ_BYTES(mac_address_layout); + + memset(out, 0, sizeof(out)); + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + MLX5_SET(modify_nic_vport_context_in, in, + field_select.addresses_list, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, in, + nic_vport_context); + + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_type, list_type); + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_size, list_size); + + for (i = 0; i < list_size; i++) { + u8 *curr_mac = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]) + 2; + ether_addr_copy(curr_mac, addr_list[i]); + } + + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_list); + +int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, + u32 vport, + u16 vlans[], + int *size) +{ + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)]; + void *nic_vport_ctx; + int req_list_size; + int max_list_size; + int out_sz; + void *out; + int err; + int i; + + req_list_size = *size; + max_list_size = 1 << MLX5_CAP_GEN(dev, log_max_vlan_list); + if (req_list_size > max_list_size) { + mlx5_core_warn(dev, "Requested list size (%d) > (%d) max list size\n", + req_list_size, max_list_size); + req_list_size = max_list_size; + } + + out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + req_list_size * MLX5_ST_SZ_BYTES(vlan_layout); + + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, 
GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + MLX5_SET(query_nic_vport_context_in, in, allowed_list_type, + MLX5_NVPRT_LIST_TYPE_VLAN); + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + + if (vport) + MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + nic_vport_ctx = MLX5_ADDR_OF(query_nic_vport_context_out, out, + nic_vport_context); + req_list_size = MLX5_GET(nic_vport_context, nic_vport_ctx, + allowed_list_size); + + *size = req_list_size; + for (i = 0; i < req_list_size; i++) { + void *vlan_addr = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]); + vlans[i] = MLX5_GET(vlan_layout, vlan_addr, vlan); + } +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_vlans); + +int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, + u16 vlans[], + int list_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + void *nic_vport_ctx; + int max_list_size; + int in_sz; + void *in; + int err; + int i; + + max_list_size = 1 << MLX5_CAP_GEN(dev, log_max_vlan_list); + + if (list_size > max_list_size) + return -ENOSPC; + + in_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + list_size * MLX5_ST_SZ_BYTES(vlan_layout); + + memset(out, 0, sizeof(out)); + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + MLX5_SET(modify_nic_vport_context_in, in, + field_select.addresses_list, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, in, + nic_vport_context); + + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_type, MLX5_NVPRT_LIST_TYPE_VLAN); + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_size, list_size); + + for (i = 0; i < list_size; i++) { + void *vlan_addr = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]); + MLX5_SET(vlan_layout, vlan_addr, vlan, vlans[i]); + } + + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_vlans); + +int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, + u64 *system_image_guid) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, 0, out, outlen); + + *system_image_guid = MLX5_GET64(query_nic_vport_context_out, out, + nic_vport_context.system_image_guid); + + kfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid); + +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, 0, out, outlen); + + *node_guid = MLX5_GET64(query_nic_vport_context_out, out, + nic_vport_context.node_guid); + + kfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid); + +int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, + u32 vport, u64 node_guid) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *nic_vport_context; + void *in; + int err; + + if (!vport) + return -EINVAL; + if (!MLX5_CAP_GEN(mdev, vport_group_manager)) + return -EACCES; + if 
(!MLX5_CAP_ESW(mdev, nic_vport_node_guid_modify)) + return -EOPNOTSUPP; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.node_guid, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(modify_nic_vport_context_in, in, other_vport, !!vport); + + nic_vport_context = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + MLX5_SET64(nic_vport_context, nic_vport_context, node_guid, node_guid); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + + return err; +} + +int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, + u16 *qkey_viol_cntr) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, 0, out, outlen); + + *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.qkey_violation_counter); + + kfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr); + +int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, u16 vf_num, u16 gid_index, + union ib_gid *gid) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_hca_vport_gid_in); + int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_gid_out); + int is_group_manager; + void *out = NULL; + void *in = NULL; + union ib_gid *tmp; + int tbsz; + int nout; + int err; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + tbsz = mlx5_get_gid_table_len(MLX5_CAP_GEN(dev, gid_table_size)); + mlx5_core_dbg(dev, "vf_num %d, index %d, gid_table_size %d\n", + vf_num, gid_index, tbsz); + + if (gid_index > tbsz && gid_index != 0xffff) + return -EINVAL; + + if (gid_index == 0xffff) + nout = tbsz; + else + nout = 1; + + out_sz += nout * sizeof(*gid); + + in = kzalloc(in_sz, GFP_KERNEL); + out = kzalloc(out_sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(query_hca_vport_gid_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_GID); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_hca_vport_gid_in, in, vport_number, vf_num); + MLX5_SET(query_hca_vport_gid_in, in, other_vport, 1); + } else { + err = -EPERM; + goto out; + } + } + MLX5_SET(query_hca_vport_gid_in, in, gid_index, gid_index); + + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_hca_vport_gid_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + if (err) + goto out; + + tmp = out + MLX5_ST_SZ_BYTES(query_hca_vport_gid_out); + gid->global.subnet_prefix = tmp->global.subnet_prefix; + gid->global.interface_id = tmp->global.interface_id; + +out: + kfree(in); + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_gid); + +int mlx5_query_hca_vport_pkey(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, u16 vf_num, u16 pkey_index, + u16 *pkey) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_hca_vport_pkey_in); + int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_pkey_out); + int is_group_manager; + void *out = NULL; + void *in = NULL; + void *pkarr; + int nout; + int tbsz; + int err; + int i; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + + tbsz = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size)); + if (pkey_index > tbsz && pkey_index != 0xffff) + return -EINVAL; + + if (pkey_index == 0xffff) + nout = tbsz; + else + nout = 1; + + out_sz += nout * MLX5_ST_SZ_BYTES(pkey); + + in = kzalloc(in_sz, GFP_KERNEL); + out = kzalloc(out_sz, GFP_KERNEL); 
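+ /* kfree(NULL) is a no-op, so the common exit label can free both mailboxes even if only one allocation succeeded */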
+ if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(query_hca_vport_pkey_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_hca_vport_pkey_in, in, vport_number, vf_num); + MLX5_SET(query_hca_vport_pkey_in, in, other_vport, 1); + } else { + err = -EPERM; + goto out; + } + } + MLX5_SET(query_hca_vport_pkey_in, in, pkey_index, pkey_index); + + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_hca_vport_pkey_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + if (err) + goto out; + + pkarr = MLX5_ADDR_OF(query_hca_vport_pkey_out, out, pkey); + for (i = 0; i < nout; i++, pkey++, pkarr += MLX5_ST_SZ_BYTES(pkey)) + *pkey = MLX5_GET_PR(pkey, pkarr, pkey); + +out: + kfree(in); + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_pkey); + +int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, + u8 other_vport, u8 port_num, + u16 vf_num, + struct mlx5_hca_vport_context *rep) +{ + int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); + int in[MLX5_ST_SZ_DW(query_hca_vport_context_in)] = {0}; + int is_group_manager; + void *out; + void *ctx; + int err; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_hca_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT); + + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_hca_vport_context_in, in, other_vport, 1); + MLX5_SET(query_hca_vport_context_in, in, vport_number, vf_num); + } else { + err = -EPERM; + goto ex; + } + } + + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_hca_vport_context_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto ex; + + ctx = MLX5_ADDR_OF(query_hca_vport_context_out, out, hca_vport_context); + rep->field_select = MLX5_GET_PR(hca_vport_context, ctx, field_select); + rep->sm_virt_aware = MLX5_GET_PR(hca_vport_context, ctx, sm_virt_aware); + rep->has_smi = MLX5_GET_PR(hca_vport_context, ctx, has_smi); + rep->has_raw = MLX5_GET_PR(hca_vport_context, ctx, has_raw); + rep->policy = MLX5_GET_PR(hca_vport_context, ctx, vport_state_policy); + rep->phys_state = MLX5_GET_PR(hca_vport_context, ctx, + port_physical_state); + rep->vport_state = MLX5_GET_PR(hca_vport_context, ctx, vport_state); + rep->port_physical_state = MLX5_GET_PR(hca_vport_context, ctx, + port_physical_state); + rep->port_guid = MLX5_GET64_PR(hca_vport_context, ctx, port_guid); + rep->node_guid = MLX5_GET64_PR(hca_vport_context, ctx, node_guid); + rep->cap_mask1 = MLX5_GET_PR(hca_vport_context, ctx, cap_mask1); + rep->cap_mask1_perm = MLX5_GET_PR(hca_vport_context, ctx, + cap_mask1_field_select); + rep->cap_mask2 = MLX5_GET_PR(hca_vport_context, ctx, cap_mask2); + rep->cap_mask2_perm = MLX5_GET_PR(hca_vport_context, ctx, + cap_mask2_field_select); + rep->lid = MLX5_GET_PR(hca_vport_context, ctx, lid); + rep->init_type_reply = MLX5_GET_PR(hca_vport_context, ctx, + init_type_reply); + rep->lmc = MLX5_GET_PR(hca_vport_context, ctx, lmc); + rep->subnet_timeout = MLX5_GET_PR(hca_vport_context, ctx, + subnet_timeout); + rep->sm_lid = MLX5_GET_PR(hca_vport_context, ctx, sm_lid); + rep->sm_sl = MLX5_GET_PR(hca_vport_context, ctx, sm_sl); + rep->qkey_violation_counter = MLX5_GET_PR(hca_vport_context, ctx, + qkey_violation_counter); + rep->pkey_violation_counter = MLX5_GET_PR(hca_vport_context, ctx, + pkey_violation_counter); + rep->grh_required = 
MLX5_GET_PR(hca_vport_context, ctx, grh_required); + rep->sys_image_guid = MLX5_GET64_PR(hca_vport_context, ctx, + system_image_guid); + +ex: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_context); + +int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev, + u64 *sys_image_guid) +{ + struct mlx5_hca_vport_context *rep; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + err = mlx5_query_hca_vport_context(dev, 0, 1, 0, rep); + if (!err) + *sys_image_guid = rep->sys_image_guid; + + kfree(rep); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_system_image_guid); + +int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, + u64 *node_guid) +{ + struct mlx5_hca_vport_context *rep; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + err = mlx5_query_hca_vport_context(dev, 0, 1, 0, rep); + if (!err) + *node_guid = rep->node_guid; + + kfree(rep); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_node_guid); + +int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, + u32 vport, + int *promisc_uc, + int *promisc_mc, + int *promisc_all) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, vport, out, outlen); + if (err) + goto out; + + *promisc_uc = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.promisc_uc); + *promisc_mc = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.promisc_mc); + *promisc_all = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.promisc_all); + +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_promisc); + +int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, + int promisc_uc, + int promisc_mc, + int promisc_all) +{ + void *in; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.promisc, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.promisc_uc, promisc_uc); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.promisc_mc, promisc_mc); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.promisc_all, promisc_all); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc); + +enum { + UC_LOCAL_LB, + MC_LOCAL_LB +}; + +int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + if (!MLX5_CAP_GEN(mdev, disable_local_lb_mc) && + !MLX5_CAP_GEN(mdev, disable_local_lb_uc)) + return 0; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.disable_mc_local_lb, !enable); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.disable_uc_local_lb, !enable); + + if (MLX5_CAP_GEN(mdev, disable_local_lb_mc)) + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_mc_local_lb, 1); + + if (MLX5_CAP_GEN(mdev, disable_local_lb_uc)) + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_uc_local_lb, 1); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + if (!err) + mlx5_core_dbg(mdev, "%s local_lb\n", + enable ? 
"enable" : "disable"); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_update_local_lb); + +int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status) +{ + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 *out; + int value; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out, outlen); + if (err) + goto out; + + value = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.disable_mc_local_lb) << MC_LOCAL_LB; + + value |= MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.disable_uc_local_lb) << UC_LOCAL_LB; + + *status = !value; + +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_query_local_lb); + +enum mlx5_vport_roce_state { + MLX5_VPORT_ROCE_DISABLED = 0, + MLX5_VPORT_ROCE_ENABLED = 1, +}; + +static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev, + enum mlx5_vport_roce_state state) +{ + void *in; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1); + MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en, + state); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + + return err; +} + +int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev) +{ + int err = 0; + + mutex_lock(&mlx5_roce_en_lock); + if (!mdev->roce.roce_en) + err = mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED); + + if (!err) + mdev->roce.roce_en++; + mutex_unlock(&mlx5_roce_en_lock); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_enable_roce); + +int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) +{ + int err = 0; + + mutex_lock(&mlx5_roce_en_lock); + if (mdev->roce.roce_en) { + mdev->roce.roce_en--; + if (mdev->roce.roce_en == 0) + err = mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); + + if (err) + mdev->roce.roce_en++; + } + mutex_unlock(&mlx5_roce_en_lock); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce); + +int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, + int vf, u8 port_num, void *out, + size_t out_sz) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_vport_counter_in); + int is_group_manager; + void *in; + int err; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + in = kvzalloc(in_sz, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + return err; + } + + MLX5_SET(query_vport_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_COUNTER); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_vport_counter_in, in, other_vport, 1); + MLX5_SET(query_vport_counter_in, in, vport_number, vf + 1); + } else { + err = -EPERM; + goto free; + } + } + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_vport_counter_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); +free: + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_vport_counter); + +int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport, + u64 *rx_discard_vport_down, + u64 *tx_discard_vport_down) +{ + u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {0}; + int err; + + MLX5_SET(query_vnic_env_in, in, opcode, + MLX5_CMD_OP_QUERY_VNIC_ENV); + MLX5_SET(query_vnic_env_in, in, op_mod, 0); + MLX5_SET(query_vnic_env_in, in, vport_number, vport); + if (vport) + 
MLX5_SET(query_vnic_env_in, in, other_vport, 1); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + *rx_discard_vport_down = MLX5_GET64(query_vnic_env_out, out, + vport_env.receive_discard_vport_down); + *tx_discard_vport_down = MLX5_GET64(query_vnic_env_out, out, + vport_env.transmit_discard_vport_down); + return 0; +} + +int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, + u8 other_vport, u8 port_num, + int vf, + struct mlx5_hca_vport_context *req) +{ + int in_sz = MLX5_ST_SZ_BYTES(modify_hca_vport_context_in); + u8 out[MLX5_ST_SZ_BYTES(modify_hca_vport_context_out)]; + int is_group_manager; + void *in; + int err; + void *ctx; + + mlx5_core_dbg(dev, "vf %d\n", vf); + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + memset(out, 0, sizeof(out)); + MLX5_SET(modify_hca_vport_context_in, in, opcode, MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(modify_hca_vport_context_in, in, other_vport, 1); + MLX5_SET(modify_hca_vport_context_in, in, vport_number, vf); + } else { + err = -EPERM; + goto ex; + } + } + + if (MLX5_CAP_GEN(dev, num_ports) > 1) + MLX5_SET(modify_hca_vport_context_in, in, port_num, port_num); + + ctx = MLX5_ADDR_OF(modify_hca_vport_context_in, in, hca_vport_context); + MLX5_SET(hca_vport_context, ctx, field_select, req->field_select); + MLX5_SET(hca_vport_context, ctx, sm_virt_aware, req->sm_virt_aware); + MLX5_SET(hca_vport_context, ctx, has_smi, req->has_smi); + MLX5_SET(hca_vport_context, ctx, has_raw, req->has_raw); + MLX5_SET(hca_vport_context, ctx, vport_state_policy, req->policy); + MLX5_SET(hca_vport_context, ctx, port_physical_state, req->phys_state); + MLX5_SET(hca_vport_context, ctx, vport_state, req->vport_state); + MLX5_SET64(hca_vport_context, ctx, port_guid, req->port_guid); + MLX5_SET64(hca_vport_context, ctx, node_guid, req->node_guid); + MLX5_SET(hca_vport_context, ctx, cap_mask1, req->cap_mask1); + MLX5_SET(hca_vport_context, ctx, cap_mask1_field_select, req->cap_mask1_perm); + MLX5_SET(hca_vport_context, ctx, cap_mask2, req->cap_mask2); + MLX5_SET(hca_vport_context, ctx, cap_mask2_field_select, req->cap_mask2_perm); + MLX5_SET(hca_vport_context, ctx, lid, req->lid); + MLX5_SET(hca_vport_context, ctx, init_type_reply, req->init_type_reply); + MLX5_SET(hca_vport_context, ctx, lmc, req->lmc); + MLX5_SET(hca_vport_context, ctx, subnet_timeout, req->subnet_timeout); + MLX5_SET(hca_vport_context, ctx, sm_lid, req->sm_lid); + MLX5_SET(hca_vport_context, ctx, sm_sl, req->sm_sl); + MLX5_SET(hca_vport_context, ctx, qkey_violation_counter, req->qkey_violation_counter); + MLX5_SET(hca_vport_context, ctx, pkey_violation_counter, req->pkey_violation_counter); + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); +ex: + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_modify_hca_vport_context); + +int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, + struct mlx5_core_dev *port_mdev) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + err = mlx5_nic_vport_enable_roce(port_mdev); + if (err) + goto free; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliated_vhca_id, + MLX5_CAP_GEN(master_mdev, vhca_id)); + MLX5_SET(modify_nic_vport_context_in, in, + 
nic_vport_context.affiliation_criteria, + MLX5_CAP_GEN(port_mdev, affiliate_nic_vport_criteria)); + + err = mlx5_modify_nic_vport_context(port_mdev, in, inlen); + if (err) + mlx5_nic_vport_disable_roce(port_mdev); + +free: + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_affiliate_multiport); + +int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliated_vhca_id, 0); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliation_criteria, 0); + + err = mlx5_modify_nic_vport_context(port_mdev, in, inlen); + if (!err) + mlx5_nic_vport_disable_roce(port_mdev); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_unaffiliate_multiport); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c new file mode 100644 index 0000000..2f74953 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx5_core.h" +#include "vxlan.h" + +void mlx5e_vxlan_init(struct mlx5e_priv *priv) +{ + struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan; + + spin_lock_init(&vxlan_db->lock); + INIT_RADIX_TREE(&vxlan_db->tree, GFP_ATOMIC); +} + +static int mlx5e_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port) +{ + u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(add_vxlan_udp_dport_out)] = {0}; + + MLX5_SET(add_vxlan_udp_dport_in, in, opcode, + MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT); + MLX5_SET(add_vxlan_udp_dport_in, in, vxlan_udp_port, port); + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5e_vxlan_core_del_port_cmd(struct mlx5_core_dev *mdev, u16 port) +{ + u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_out)] = {0}; + + MLX5_SET(delete_vxlan_udp_dport_in, in, opcode, + MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); + MLX5_SET(delete_vxlan_udp_dport_in, in, vxlan_udp_port, port); + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port) +{ + struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan; + struct mlx5e_vxlan *vxlan; + + spin_lock_bh(&vxlan_db->lock); + vxlan = radix_tree_lookup(&vxlan_db->tree, port); + spin_unlock_bh(&vxlan_db->lock); + + return vxlan; +} + +static void mlx5e_vxlan_add_port(struct work_struct *work) +{ + struct mlx5e_vxlan_work *vxlan_work = + container_of(work, struct mlx5e_vxlan_work, work); + struct mlx5e_priv *priv = vxlan_work->priv; + struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan; + u16 port = vxlan_work->port; + struct mlx5e_vxlan *vxlan; + int err; + + mutex_lock(&priv->state_lock); + vxlan = mlx5e_vxlan_lookup_port(priv, port); + if (vxlan) { + atomic_inc(&vxlan->refcount); + goto free_work; + } + + if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port)) + goto free_work; + + vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL); + if (!vxlan) + goto err_delete_port; + + vxlan->udp_port = port; + atomic_set(&vxlan->refcount, 1); + + spin_lock_bh(&vxlan_db->lock); + err = radix_tree_insert(&vxlan_db->tree, vxlan->udp_port, vxlan); + spin_unlock_bh(&vxlan_db->lock); + if (err) + goto err_free; + + goto free_work; + +err_free: + kfree(vxlan); +err_delete_port: + mlx5e_vxlan_core_del_port_cmd(priv->mdev, port); +free_work: + mutex_unlock(&priv->state_lock); + kfree(vxlan_work); +} + +static void mlx5e_vxlan_del_port(struct work_struct *work) +{ + struct mlx5e_vxlan_work *vxlan_work = + container_of(work, struct mlx5e_vxlan_work, work); + struct mlx5e_priv *priv = vxlan_work->priv; + struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan; + u16 port = vxlan_work->port; + struct mlx5e_vxlan *vxlan; + bool remove = false; + + mutex_lock(&priv->state_lock); + spin_lock_bh(&vxlan_db->lock); + vxlan = radix_tree_lookup(&vxlan_db->tree, port); + if (!vxlan) + goto out_unlock; + + if (atomic_dec_and_test(&vxlan->refcount)) { + radix_tree_delete(&vxlan_db->tree, port); + remove = true; + } + +out_unlock: + spin_unlock_bh(&vxlan_db->lock); + + if (remove) { + mlx5e_vxlan_core_del_port_cmd(priv->mdev, port); + kfree(vxlan); + } + mutex_unlock(&priv->state_lock); + kfree(vxlan_work); +} + +void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, sa_family_t sa_family, + u16 port, int add) +{ + struct mlx5e_vxlan_work *vxlan_work; + + vxlan_work = kmalloc(sizeof(*vxlan_work), GFP_ATOMIC); + if (!vxlan_work) + return; + + if (add) + INIT_WORK(&vxlan_work->work, 
mlx5e_vxlan_add_port); + else + INIT_WORK(&vxlan_work->work, mlx5e_vxlan_del_port); + + vxlan_work->priv = priv; + vxlan_work->port = port; + vxlan_work->sa_family = sa_family; + queue_work(priv->wq, &vxlan_work->work); +} + +void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv) +{ + struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan; + struct mlx5e_vxlan *vxlan; + unsigned int port = 0; + + /* Lockless since we are the only radix-tree consumers, wq is disabled */ + while (radix_tree_gang_lookup(&vxlan_db->tree, (void **)&vxlan, port, 1)) { + port = vxlan->udp_port; + radix_tree_delete(&vxlan_db->tree, port); + mlx5e_vxlan_core_del_port_cmd(priv->mdev, port); + kfree(vxlan); + } +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h new file mode 100644 index 0000000..5ef6ae7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __MLX5_VXLAN_H__ +#define __MLX5_VXLAN_H__ + +#include +#include "en.h" + +struct mlx5e_vxlan { + atomic_t refcount; + u16 udp_port; +}; + +struct mlx5e_vxlan_work { + struct work_struct work; + struct mlx5e_priv *priv; + sa_family_t sa_family; + u16 port; +}; + +static inline bool mlx5e_vxlan_allowed(struct mlx5_core_dev *mdev) +{ + return (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) && + mlx5_core_is_pf(mdev)); +} + +void mlx5e_vxlan_init(struct mlx5e_priv *priv); +void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv); + +void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, sa_family_t sa_family, + u16 port, int add); +struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port); + +#endif /* __MLX5_VXLAN_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c new file mode 100644 index 0000000..ea66448 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "wq.h" +#include "mlx5_core.h" + +u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq) +{ + return (u32)wq->sz_m1 + 1; +} + +u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq) +{ + return wq->fbc.sz_m1 + 1; +} + +u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq) +{ + return (u32)wq->sz_m1 + 1; +} + +static u32 mlx5_wq_cyc_get_byte_size(struct mlx5_wq_cyc *wq) +{ + return mlx5_wq_cyc_get_size(wq) << wq->log_stride; +} + +static u32 mlx5_wq_qp_get_byte_size(struct mlx5_wq_qp *wq) +{ + return mlx5_wq_cyc_get_byte_size(&wq->rq) + + mlx5_wq_cyc_get_byte_size(&wq->sq); +} + +static u32 mlx5_cqwq_get_byte_size(struct mlx5_cqwq *wq) +{ + return mlx5_cqwq_get_size(wq) << wq->fbc.log_stride; +} + +static u32 mlx5_wq_ll_get_byte_size(struct mlx5_wq_ll *wq) +{ + return mlx5_wq_ll_get_size(wq) << wq->log_stride; +} + +int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *wqc, struct mlx5_wq_cyc *wq, + struct mlx5_wq_ctrl *wq_ctrl) +{ + int err; + + wq->log_stride = MLX5_GET(wq, wqc, log_wq_stride); + wq->sz_m1 = (1 << MLX5_GET(wq, wqc, log_wq_sz)) - 1; + + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err); + return err; + } + + err = mlx5_buf_alloc_node(mdev, mlx5_wq_cyc_get_byte_size(wq), + &wq_ctrl->buf, param->buf_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_buf_alloc_node() failed, %d\n", err); + goto err_db_free; + } + + wq->buf = wq_ctrl->buf.frags->buf; + wq->db = wq_ctrl->db.db; + + wq_ctrl->mdev = mdev; + + return 0; + +err_db_free: + mlx5_db_free(mdev, &wq_ctrl->db); + + return err; +} + +int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *qpc, struct mlx5_wq_qp *wq, + struct mlx5_wq_ctrl *wq_ctrl) +{ + int err; + + wq->rq.log_stride = MLX5_GET(qpc, qpc, log_rq_stride) + 4; + wq->rq.sz_m1 = (1 << MLX5_GET(qpc, qpc, log_rq_size)) - 1; + + wq->sq.log_stride = ilog2(MLX5_SEND_WQE_BB); + wq->sq.sz_m1 = (1 << MLX5_GET(qpc, qpc, log_sq_size)) - 1; + + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err); + return err; + } + + err = mlx5_buf_alloc_node(mdev, mlx5_wq_qp_get_byte_size(wq), + &wq_ctrl->buf, 
param->buf_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_buf_alloc_node() failed, %d\n", err); + goto err_db_free; + } + + wq->rq.buf = wq_ctrl->buf.frags->buf; + wq->sq.buf = wq->rq.buf + mlx5_wq_cyc_get_byte_size(&wq->rq); + wq->rq.db = &wq_ctrl->db.db[MLX5_RCV_DBR]; + wq->sq.db = &wq_ctrl->db.db[MLX5_SND_DBR]; + + wq_ctrl->mdev = mdev; + + return 0; + +err_db_free: + mlx5_db_free(mdev, &wq_ctrl->db); + + return err; +} + +int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *cqc, struct mlx5_cqwq *wq, + struct mlx5_frag_wq_ctrl *wq_ctrl) +{ + int err; + + mlx5_core_init_cq_frag_buf(&wq->fbc, cqc); + + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err); + return err; + } + + err = mlx5_frag_buf_alloc_node(mdev, mlx5_cqwq_get_byte_size(wq), + &wq_ctrl->frag_buf, + param->buf_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", + err); + goto err_db_free; + } + + wq->fbc.frag_buf = wq_ctrl->frag_buf; + wq->db = wq_ctrl->db.db; + + wq_ctrl->mdev = mdev; + + return 0; + +err_db_free: + mlx5_db_free(mdev, &wq_ctrl->db); + + return err; +} + +int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *wqc, struct mlx5_wq_ll *wq, + struct mlx5_wq_ctrl *wq_ctrl) +{ + struct mlx5_wqe_srq_next_seg *next_seg; + int err; + int i; + + wq->log_stride = MLX5_GET(wq, wqc, log_wq_stride); + wq->sz_m1 = (1 << MLX5_GET(wq, wqc, log_wq_sz)) - 1; + + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err); + return err; + } + + err = mlx5_buf_alloc_node(mdev, mlx5_wq_ll_get_byte_size(wq), + &wq_ctrl->buf, param->buf_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_buf_alloc_node() failed, %d\n", err); + goto err_db_free; + } + + wq->buf = wq_ctrl->buf.frags->buf; + wq->db = wq_ctrl->db.db; + + for (i = 0; i < wq->sz_m1; i++) { + next_seg = mlx5_wq_ll_get_wqe(wq, i); + next_seg->next_wqe_index = cpu_to_be16(i + 1); + } + next_seg = mlx5_wq_ll_get_wqe(wq, i); + wq->tail_next = &next_seg->next_wqe_index; + + wq_ctrl->mdev = mdev; + + return 0; + +err_db_free: + mlx5_db_free(mdev, &wq_ctrl->db); + + return err; +} + +void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl) +{ + mlx5_buf_free(wq_ctrl->mdev, &wq_ctrl->buf); + mlx5_db_free(wq_ctrl->mdev, &wq_ctrl->db); +} + +void mlx5_cqwq_destroy(struct mlx5_frag_wq_ctrl *wq_ctrl) +{ + mlx5_frag_buf_free(wq_ctrl->mdev, &wq_ctrl->frag_buf); + mlx5_db_free(wq_ctrl->mdev, &wq_ctrl->db); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h new file mode 100644 index 0000000..fca90b9 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_WQ_H__ +#define __MLX5_WQ_H__ + +#include +#include +#include + +struct mlx5_wq_param { + int linear; + int buf_numa_node; + int db_numa_node; +}; + +struct mlx5_wq_ctrl { + struct mlx5_core_dev *mdev; + struct mlx5_frag_buf buf; + struct mlx5_db db; +}; + +struct mlx5_frag_wq_ctrl { + struct mlx5_core_dev *mdev; + struct mlx5_frag_buf frag_buf; + struct mlx5_db db; +}; + +struct mlx5_wq_cyc { + void *buf; + __be32 *db; + u16 sz_m1; + u8 log_stride; +}; + +struct mlx5_wq_qp { + struct mlx5_wq_cyc rq; + struct mlx5_wq_cyc sq; +}; + +struct mlx5_cqwq { + struct mlx5_frag_buf_ctrl fbc; + __be32 *db; + u32 cc; /* consumer counter */ +}; + +struct mlx5_wq_ll { + void *buf; + __be32 *db; + __be16 *tail_next; + u16 sz_m1; + u16 head; + u16 wqe_ctr; + u16 cur_sz; + u8 log_stride; +}; + +int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *wqc, struct mlx5_wq_cyc *wq, + struct mlx5_wq_ctrl *wq_ctrl); +u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq); + +int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *qpc, struct mlx5_wq_qp *wq, + struct mlx5_wq_ctrl *wq_ctrl); + +int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *cqc, struct mlx5_cqwq *wq, + struct mlx5_frag_wq_ctrl *wq_ctrl); +u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq); + +int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *wqc, struct mlx5_wq_ll *wq, + struct mlx5_wq_ctrl *wq_ctrl); +u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq); + +void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl); +void mlx5_cqwq_destroy(struct mlx5_frag_wq_ctrl *wq_ctrl); + +static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr) +{ + return ctr & wq->sz_m1; +} + +static inline void *mlx5_wq_cyc_get_wqe(struct mlx5_wq_cyc *wq, u16 ix) +{ + return wq->buf + (ix << wq->log_stride); +} + +static inline int mlx5_wq_cyc_cc_bigger(u16 cc1, u16 cc2) +{ + int equal = (cc1 == cc2); + int smaller = 0x8000 & (cc1 - cc2); + + return !equal && !smaller; +} + +static inline u32 mlx5_cqwq_get_ci(struct mlx5_cqwq *wq) +{ + return wq->cc & wq->fbc.sz_m1; +} + +static inline void *mlx5_cqwq_get_wqe(struct mlx5_cqwq *wq, u32 ix) +{ + return mlx5_frag_buf_get_wqe(&wq->fbc, ix); +} + +static 
inline u32 mlx5_cqwq_get_wrap_cnt(struct mlx5_cqwq *wq) +{ + return wq->cc >> wq->fbc.log_sz; +} + +static inline void mlx5_cqwq_pop(struct mlx5_cqwq *wq) +{ + wq->cc++; +} + +static inline void mlx5_cqwq_update_db_record(struct mlx5_cqwq *wq) +{ + *wq->db = cpu_to_be32(wq->cc & 0xffffff); +} + +static inline struct mlx5_cqe64 *mlx5_cqwq_get_cqe(struct mlx5_cqwq *wq) +{ + u32 ci = mlx5_cqwq_get_ci(wq); + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(wq, ci); + u8 cqe_ownership_bit = cqe->op_own & MLX5_CQE_OWNER_MASK; + u8 sw_ownership_val = mlx5_cqwq_get_wrap_cnt(wq) & 1; + + if (cqe_ownership_bit != sw_ownership_val) + return NULL; + + /* ensure cqe content is read after cqe ownership bit */ + dma_rmb(); + + return cqe; +} + +static inline int mlx5_wq_ll_is_full(struct mlx5_wq_ll *wq) +{ + return wq->cur_sz == wq->sz_m1; +} + +static inline int mlx5_wq_ll_is_empty(struct mlx5_wq_ll *wq) +{ + return !wq->cur_sz; +} + +static inline void *mlx5_wq_ll_get_wqe(struct mlx5_wq_ll *wq, u16 ix) +{ + return wq->buf + (ix << wq->log_stride); +} + +static inline void mlx5_wq_ll_push(struct mlx5_wq_ll *wq, u16 head_next) +{ + wq->head = head_next; + wq->wqe_ctr++; + wq->cur_sz++; +} + +static inline void mlx5_wq_ll_pop(struct mlx5_wq_ll *wq, __be16 ix, + __be16 *next_tail_next) +{ + *wq->tail_next = ix; + wq->tail_next = next_tail_next; + wq->cur_sz--; +} + +static inline void mlx5_wq_ll_update_db_record(struct mlx5_wq_ll *wq) +{ + *wq->db = cpu_to_be32(wq->wqe_ctr); +} + +#endif /* __MLX5_WQ_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlxfw/Kconfig b/drivers/net/ethernet/mellanox/mlxfw/Kconfig new file mode 100644 index 0000000..186ebe7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/Kconfig @@ -0,0 +1,13 @@ +# +# Mellanox firmware flash library configuration +# + +config MLXFW + tristate "Mellanox Technologies firmware flash module" + ---help--- + This driver supports Mellanox Technologies Firmware + flashing common logic. + + To compile this driver as a module, choose M here: the + module will be called mlxfw. + select XZ_DEC diff --git a/drivers/net/ethernet/mellanox/mlxfw/Makefile b/drivers/net/ethernet/mellanox/mlxfw/Makefile new file mode 100644 index 0000000..7448b30 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_MLXFW) += mlxfw.o +mlxfw-objs := mlxfw_fsm.o mlxfw_mfa2_tlv_multi.o mlxfw_mfa2.o diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h new file mode 100644 index 0000000..7a712b6 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h @@ -0,0 +1,111 @@ +/* + * drivers/net/ethernet/mellanox/mlxfw/mlxfw.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXFW_H +#define _MLXFW_H + +#include + +enum mlxfw_fsm_state { + MLXFW_FSM_STATE_IDLE, + MLXFW_FSM_STATE_LOCKED, + MLXFW_FSM_STATE_INITIALIZE, + MLXFW_FSM_STATE_DOWNLOAD, + MLXFW_FSM_STATE_VERIFY, + MLXFW_FSM_STATE_APPLY, + MLXFW_FSM_STATE_ACTIVATE, +}; + +enum mlxfw_fsm_state_err { + MLXFW_FSM_STATE_ERR_OK, + MLXFW_FSM_STATE_ERR_ERROR, + MLXFW_FSM_STATE_ERR_REJECTED_DIGEST_ERR, + MLXFW_FSM_STATE_ERR_REJECTED_NOT_APPLICABLE, + MLXFW_FSM_STATE_ERR_REJECTED_UNKNOWN_KEY, + MLXFW_FSM_STATE_ERR_REJECTED_AUTH_FAILED, + MLXFW_FSM_STATE_ERR_REJECTED_UNSIGNED, + MLXFW_FSM_STATE_ERR_REJECTED_KEY_NOT_APPLICABLE, + MLXFW_FSM_STATE_ERR_REJECTED_BAD_FORMAT, + MLXFW_FSM_STATE_ERR_BLOCKED_PENDING_RESET, + MLXFW_FSM_STATE_ERR_MAX, +}; + +struct mlxfw_dev; + +struct mlxfw_dev_ops { + int (*component_query)(struct mlxfw_dev *mlxfw_dev, u16 component_index, + u32 *p_max_size, u8 *p_align_bits, + u16 *p_max_write_size); + + int (*fsm_lock)(struct mlxfw_dev *mlxfw_dev, u32 *fwhandle); + + int (*fsm_component_update)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u16 component_index, u32 component_size); + + int (*fsm_block_download)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u8 *data, u16 size, u32 offset); + + int (*fsm_component_verify)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u16 component_index); + + int (*fsm_activate)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle); + + int (*fsm_query_state)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + enum mlxfw_fsm_state *fsm_state, + enum mlxfw_fsm_state_err *fsm_state_err); + + void (*fsm_cancel)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle); + + void (*fsm_release)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle); +}; + +struct mlxfw_dev { + const struct mlxfw_dev_ops *ops; + const char *psid; + u16 psid_size; +}; + +#if IS_REACHABLE(CONFIG_MLXFW) +int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, + const struct firmware *firmware); +#else +static inline +int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, + const struct firmware *firmware) +{ + return -EOPNOTSUPP; +} +#endif + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c new file mode 100644 index 0000000..2cf8912 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c @@ -0,0 +1,273 @@ +/* + * drivers/net/ethernet/mellanox/mlxfw/mlxfw.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#define pr_fmt(fmt) "mlxfw: " fmt + +#include +#include +#include + +#include "mlxfw.h" +#include "mlxfw_mfa2.h" + +#define MLXFW_FSM_STATE_WAIT_CYCLE_MS 200 +#define MLXFW_FSM_STATE_WAIT_TIMEOUT_MS 30000 +#define MLXFW_FSM_STATE_WAIT_ROUNDS \ + (MLXFW_FSM_STATE_WAIT_TIMEOUT_MS / MLXFW_FSM_STATE_WAIT_CYCLE_MS) +#define MLXFW_FSM_MAX_COMPONENT_SIZE (10 * (1 << 20)) + +static const char * const mlxfw_fsm_state_err_str[] = { + [MLXFW_FSM_STATE_ERR_ERROR] = + "general error", + [MLXFW_FSM_STATE_ERR_REJECTED_DIGEST_ERR] = + "component hash mismatch", + [MLXFW_FSM_STATE_ERR_REJECTED_NOT_APPLICABLE] = + "component not applicable", + [MLXFW_FSM_STATE_ERR_REJECTED_UNKNOWN_KEY] = + "unknown key", + [MLXFW_FSM_STATE_ERR_REJECTED_AUTH_FAILED] = + "authentication failed", + [MLXFW_FSM_STATE_ERR_REJECTED_UNSIGNED] = + "component was not signed", + [MLXFW_FSM_STATE_ERR_REJECTED_KEY_NOT_APPLICABLE] = + "key not applicable", + [MLXFW_FSM_STATE_ERR_REJECTED_BAD_FORMAT] = + "bad format", + [MLXFW_FSM_STATE_ERR_BLOCKED_PENDING_RESET] = + "pending reset", + [MLXFW_FSM_STATE_ERR_MAX] = + "unknown error" +}; + +static int mlxfw_fsm_state_wait(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + enum mlxfw_fsm_state fsm_state) +{ + enum mlxfw_fsm_state_err fsm_state_err; + enum mlxfw_fsm_state curr_fsm_state; + int times; + int err; + + times = MLXFW_FSM_STATE_WAIT_ROUNDS; +retry: + err = mlxfw_dev->ops->fsm_query_state(mlxfw_dev, fwhandle, + &curr_fsm_state, &fsm_state_err); + if (err) + return err; + + if (fsm_state_err != MLXFW_FSM_STATE_ERR_OK) { + pr_err("Firmware flash failed: %s\n", + mlxfw_fsm_state_err_str[fsm_state_err]); + return -EINVAL; + } + if (curr_fsm_state != fsm_state) { + if (--times == 0) { + pr_err("Timeout reached on FSM state change"); + return -ETIMEDOUT; + } + 
msleep(MLXFW_FSM_STATE_WAIT_CYCLE_MS); + goto retry; + } + return 0; +} + +#define MLXFW_ALIGN_DOWN(x, align_bits) ((x) & ~((1 << (align_bits)) - 1)) +#define MLXFW_ALIGN_UP(x, align_bits) \ + MLXFW_ALIGN_DOWN((x) + ((1 << (align_bits)) - 1), (align_bits)) + +static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev, + u32 fwhandle, + struct mlxfw_mfa2_component *comp) +{ + u16 comp_max_write_size; + u8 comp_align_bits; + u32 comp_max_size; + u16 block_size; + u8 *block_ptr; + u32 offset; + int err; + + err = mlxfw_dev->ops->component_query(mlxfw_dev, comp->index, + &comp_max_size, &comp_align_bits, + &comp_max_write_size); + if (err) + return err; + + comp_max_size = min_t(u32, comp_max_size, MLXFW_FSM_MAX_COMPONENT_SIZE); + if (comp->data_size > comp_max_size) { + pr_err("Component %d is of size %d which is bigger than limit %d\n", + comp->index, comp->data_size, comp_max_size); + return -EINVAL; + } + + comp_max_write_size = MLXFW_ALIGN_DOWN(comp_max_write_size, + comp_align_bits); + + pr_debug("Component update\n"); + err = mlxfw_dev->ops->fsm_component_update(mlxfw_dev, fwhandle, + comp->index, + comp->data_size); + if (err) + return err; + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, + MLXFW_FSM_STATE_DOWNLOAD); + if (err) + goto err_out; + + pr_debug("Component download\n"); + for (offset = 0; + offset < MLXFW_ALIGN_UP(comp->data_size, comp_align_bits); + offset += comp_max_write_size) { + block_ptr = comp->data + offset; + block_size = (u16) min_t(u32, comp->data_size - offset, + comp_max_write_size); + err = mlxfw_dev->ops->fsm_block_download(mlxfw_dev, fwhandle, + block_ptr, block_size, + offset); + if (err) + goto err_out; + } + + pr_debug("Component verify\n"); + err = mlxfw_dev->ops->fsm_component_verify(mlxfw_dev, fwhandle, + comp->index); + if (err) + goto err_out; + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, MLXFW_FSM_STATE_LOCKED); + if (err) + goto err_out; + return 0; + +err_out: + mlxfw_dev->ops->fsm_cancel(mlxfw_dev, fwhandle); + return err; +} + +static int mlxfw_flash_components(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + struct mlxfw_mfa2_file *mfa2_file) +{ + u32 component_count; + int err; + int i; + + err = mlxfw_mfa2_file_component_count(mfa2_file, mlxfw_dev->psid, + mlxfw_dev->psid_size, + &component_count); + if (err) { + pr_err("Could not find device PSID in MFA2 file\n"); + return err; + } + + for (i = 0; i < component_count; i++) { + struct mlxfw_mfa2_component *comp; + + comp = mlxfw_mfa2_file_component_get(mfa2_file, mlxfw_dev->psid, + mlxfw_dev->psid_size, i); + if (IS_ERR(comp)) + return PTR_ERR(comp); + + pr_info("Flashing component type %d\n", comp->index); + err = mlxfw_flash_component(mlxfw_dev, fwhandle, comp); + mlxfw_mfa2_file_component_put(comp); + if (err) + return err; + } + return 0; +} + +int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, + const struct firmware *firmware) +{ + struct mlxfw_mfa2_file *mfa2_file; + u32 fwhandle; + int err; + + if (!mlxfw_mfa2_check(firmware)) { + pr_err("Firmware file is not MFA2\n"); + return -EINVAL; + } + + mfa2_file = mlxfw_mfa2_file_init(firmware); + if (IS_ERR(mfa2_file)) + return PTR_ERR(mfa2_file); + + pr_info("Initialize firmware flash process\n"); + err = mlxfw_dev->ops->fsm_lock(mlxfw_dev, &fwhandle); + if (err) { + pr_err("Could not lock the firmware FSM\n"); + goto err_fsm_lock; + } + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, + MLXFW_FSM_STATE_LOCKED); + if (err) + goto err_state_wait_idle_to_locked; + + err = mlxfw_flash_components(mlxfw_dev, fwhandle, mfa2_file); + if 
(err) + goto err_flash_components; + + pr_debug("Activate image\n"); + err = mlxfw_dev->ops->fsm_activate(mlxfw_dev, fwhandle); + if (err) { + pr_err("Could not activate the downloaded image\n"); + goto err_fsm_activate; + } + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, MLXFW_FSM_STATE_LOCKED); + if (err) + goto err_state_wait_activate_to_locked; + + pr_debug("Handle release\n"); + mlxfw_dev->ops->fsm_release(mlxfw_dev, fwhandle); + + pr_info("Firmware flash done.\n"); + mlxfw_mfa2_file_fini(mfa2_file); + return 0; + +err_state_wait_activate_to_locked: +err_fsm_activate: +err_flash_components: +err_state_wait_idle_to_locked: + mlxfw_dev->ops->fsm_release(mlxfw_dev, fwhandle); +err_fsm_lock: + mlxfw_mfa2_file_fini(mfa2_file); + return err; +} +EXPORT_SYMBOL(mlxfw_firmware_flash); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Yotam Gigi "); +MODULE_DESCRIPTION("Mellanox firmware flash lib"); diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c new file mode 100644 index 0000000..993cb5b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c @@ -0,0 +1,619 @@ +/* + * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#define pr_fmt(fmt) "mlxfw_mfa2: " fmt + +#include +#include +#include +#include +#include "mlxfw_mfa2.h" +#include "mlxfw_mfa2_file.h" +#include "mlxfw_mfa2_tlv.h" +#include "mlxfw_mfa2_format.h" +#include "mlxfw_mfa2_tlv_multi.h" + +/* MFA2 FILE + * +----------------------------------+ + * | MFA2 finger print | + * +----------------------------------+ + * | package descriptor multi_tlv | + * | +------------------------------+ | +-----------------+ + * | | package descriptor tlv +-----> |num_devices=n | + * | +------------------------------+ | |num_components=m | + * +----------------------------------+ |CB offset | + * | device descriptor multi_tlv | |... | + * | +------------------------------+ | | | + * | | PSID tlv | | +-----------------+ + * | +------------------------------+ | + * | | component index tlv | | + * | +------------------------------+ | + * +----------------------------------+ + * | component descriptor multi_tlv | + * | +------------------------------+ | +-----------------+ + * | | component descriptor tlv +-----> |Among others: | + * | +------------------------------+ | |CB offset=o | + * +----------------------------------+ |comp index=i | + * | | |... | + * | | | | + * | | +-----------------+ + * | COMPONENT BLOCK (CB) | + * | | + * | | + * | | + * +----------------------------------+ + * + * On the top level, an MFA2 file contains: + * - Fingerprint + * - Several multi_tlvs (TLVs of type MLXFW_MFA2_TLV_MULTI, as defined in + * mlxfw_mfa2_format.h) + * - Compresses content block + * + * The first multi_tlv + * ------------------- + * The first multi TLV is treated as package descriptor, and expected to have a + * first TLV child of type MLXFW_MFA2_TLV_PACKAGE_DESCRIPTOR which contains all + * the global information needed to parse the file. Among others, it contains + * the number of device descriptors and component descriptor following this + * multi TLV. + * + * The device descriptor multi_tlv + * ------------------------------- + * The multi TLVs following the package descriptor are treated as device + * descriptor, and are expected to have the following children: + * - PSID TLV child of type MLXFW_MFA2_TLV_PSID containing that device PSID. + * - Component index of type MLXFW_MFA2_TLV_COMPONENT_PTR that contains that + * device component index. + * + * The component descriptor multi_tlv + * ---------------------------------- + * The multi TLVs following the device descriptor multi TLVs are treated as + * component descriptor, and are expected to have a first child of type + * MLXFW_MFA2_TLV_COMPONENT_DESCRIPTOR that contains mostly the component index, + * needed for the flash process and the offset to the binary within the + * component block. 
+ */ + +static const u8 mlxfw_mfa2_fingerprint[] = "MLNX.MFA2.XZ.00!"; +static const int mlxfw_mfa2_fingerprint_len = + sizeof(mlxfw_mfa2_fingerprint) - 1; + +static const u8 mlxfw_mfa2_comp_magic[] = "#BIN.COMPONENT!#"; +static const int mlxfw_mfa2_comp_magic_len = sizeof(mlxfw_mfa2_comp_magic) - 1; + +bool mlxfw_mfa2_check(const struct firmware *fw) +{ + if (fw->size < sizeof(mlxfw_mfa2_fingerprint)) + return false; + + return memcmp(fw->data, mlxfw_mfa2_fingerprint, + mlxfw_mfa2_fingerprint_len) == 0; +} + +static bool +mlxfw_mfa2_tlv_multi_validate(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi) +{ + const struct mlxfw_mfa2_tlv *tlv; + u16 idx; + + /* Check that all children are valid */ + mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) { + if (!tlv) { + pr_err("Multi has invalid child"); + return false; + } + } + return true; +} + +static bool +mlxfw_mfa2_file_dev_validate(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *dev_tlv, + u16 dev_idx) +{ + const struct mlxfw_mfa2_tlv_component_ptr *cptr; + const struct mlxfw_mfa2_tlv_multi *multi; + const struct mlxfw_mfa2_tlv_psid *psid; + const struct mlxfw_mfa2_tlv *tlv; + u16 cptr_count; + u16 cptr_idx; + int err; + + pr_debug("Device %d\n", dev_idx); + + multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, dev_tlv); + if (!multi) { + pr_err("Device %d is not a valid TLV error\n", dev_idx); + return false; + } + + if (!mlxfw_mfa2_tlv_multi_validate(mfa2_file, multi)) + return false; + + /* Validate the device has PSID tlv */ + tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, multi, + MLXFW_MFA2_TLV_PSID, 0); + if (!tlv) { + pr_err("Device %d does not have PSID\n", dev_idx); + return false; + } + + psid = mlxfw_mfa2_tlv_psid_get(mfa2_file, tlv); + if (!psid) { + pr_err("Device %d PSID TLV is not valid\n", dev_idx); + return false; + } + + print_hex_dump_debug(" -- Device PSID ", DUMP_PREFIX_NONE, 16, 16, + psid->psid, be16_to_cpu(tlv->len), true); + + /* Validate the device has COMPONENT_PTR */ + err = mlxfw_mfa2_tlv_multi_child_count(mfa2_file, multi, + MLXFW_MFA2_TLV_COMPONENT_PTR, + &cptr_count); + if (err) + return false; + + if (cptr_count == 0) { + pr_err("Device %d has no components\n", dev_idx); + return false; + } + + for (cptr_idx = 0; cptr_idx < cptr_count; cptr_idx++) { + tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, multi, + MLXFW_MFA2_TLV_COMPONENT_PTR, + cptr_idx); + if (!tlv) + return false; + + cptr = mlxfw_mfa2_tlv_component_ptr_get(mfa2_file, tlv); + if (!cptr) { + pr_err("Device %d COMPONENT_PTR TLV is not valid\n", + dev_idx); + return false; + } + + pr_debug(" -- Component index %d\n", + be16_to_cpu(cptr->component_index)); + } + return true; +} + +static bool +mlxfw_mfa2_file_comp_validate(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *comp_tlv, + u16 comp_idx) +{ + const struct mlxfw_mfa2_tlv_component_descriptor *cdesc; + const struct mlxfw_mfa2_tlv_multi *multi; + const struct mlxfw_mfa2_tlv *tlv; + + pr_debug("Component %d\n", comp_idx); + + multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, comp_tlv); + if (!multi) { + pr_err("Component %d is not a valid TLV error\n", comp_idx); + return false; + } + + if (!mlxfw_mfa2_tlv_multi_validate(mfa2_file, multi)) + return false; + + /* Check that component have COMPONENT_DESCRIPTOR as first child */ + tlv = mlxfw_mfa2_tlv_multi_child(mfa2_file, multi); + if (!tlv) { + pr_err("Component descriptor %d multi TLV error\n", comp_idx); + return false; + } + + cdesc = 
mlxfw_mfa2_tlv_component_descriptor_get(mfa2_file, tlv); + if (!cdesc) { + pr_err("Component %d does not have a valid descriptor\n", + comp_idx); + return false; + } + pr_debug(" -- Component type %d\n", be16_to_cpu(cdesc->identifier)); + pr_debug(" -- Offset 0x%llx and size %d\n", + ((u64) be32_to_cpu(cdesc->cb_offset_h) << 32) + | be32_to_cpu(cdesc->cb_offset_l), be32_to_cpu(cdesc->size)); + + return true; +} + +static bool mlxfw_mfa2_file_validate(const struct mlxfw_mfa2_file *mfa2_file) +{ + const struct mlxfw_mfa2_tlv *tlv; + u16 idx; + + pr_debug("Validating file\n"); + + /* check that all the devices exist */ + mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, mfa2_file->first_dev, + mfa2_file->dev_count) { + if (!tlv) { + pr_err("Device TLV error\n"); + return false; + } + + /* Check each device */ + if (!mlxfw_mfa2_file_dev_validate(mfa2_file, tlv, idx)) + return false; + } + + /* check that all the components exist */ + mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, mfa2_file->first_component, + mfa2_file->component_count) { + if (!tlv) { + pr_err("Device TLV error\n"); + return false; + } + + /* Check each component */ + if (!mlxfw_mfa2_file_comp_validate(mfa2_file, tlv, idx)) + return false; + } + return true; +} + +struct mlxfw_mfa2_file *mlxfw_mfa2_file_init(const struct firmware *fw) +{ + const struct mlxfw_mfa2_tlv_package_descriptor *pd; + const struct mlxfw_mfa2_tlv_multi *multi; + const struct mlxfw_mfa2_tlv *multi_child; + const struct mlxfw_mfa2_tlv *first_tlv; + struct mlxfw_mfa2_file *mfa2_file; + const void *first_tlv_ptr; + const void *cb_top_ptr; + + mfa2_file = kcalloc(1, sizeof(*mfa2_file), GFP_KERNEL); + if (!mfa2_file) + return ERR_PTR(-ENOMEM); + + mfa2_file->fw = fw; + first_tlv_ptr = fw->data + NLA_ALIGN(mlxfw_mfa2_fingerprint_len); + first_tlv = mlxfw_mfa2_tlv_get(mfa2_file, first_tlv_ptr); + if (!first_tlv) { + pr_err("Could not parse package descriptor TLV\n"); + goto err_out; + } + + multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, first_tlv); + if (!multi) { + pr_err("First TLV is not of valid multi type\n"); + goto err_out; + } + + multi_child = mlxfw_mfa2_tlv_multi_child(mfa2_file, multi); + if (!multi_child) + goto err_out; + + pd = mlxfw_mfa2_tlv_package_descriptor_get(mfa2_file, multi_child); + if (!pd) { + pr_err("Could not parse package descriptor TLV\n"); + goto err_out; + } + + mfa2_file->first_dev = mlxfw_mfa2_tlv_next(mfa2_file, first_tlv); + if (!mfa2_file->first_dev) { + pr_err("First device TLV is not valid\n"); + goto err_out; + } + + mfa2_file->dev_count = be16_to_cpu(pd->num_devices); + mfa2_file->first_component = mlxfw_mfa2_tlv_advance(mfa2_file, + mfa2_file->first_dev, + mfa2_file->dev_count); + mfa2_file->component_count = be16_to_cpu(pd->num_components); + mfa2_file->cb = fw->data + NLA_ALIGN(be32_to_cpu(pd->cb_offset)); + if (!mlxfw_mfa2_valid_ptr(mfa2_file, mfa2_file->cb)) { + pr_err("Component block is out side the file\n"); + goto err_out; + } + mfa2_file->cb_archive_size = be32_to_cpu(pd->cb_archive_size); + cb_top_ptr = mfa2_file->cb + mfa2_file->cb_archive_size - 1; + if (!mlxfw_mfa2_valid_ptr(mfa2_file, cb_top_ptr)) { + pr_err("Component block size is too big\n"); + goto err_out; + } + + if (!mlxfw_mfa2_file_validate(mfa2_file)) + goto err_out; + return mfa2_file; +err_out: + kfree(mfa2_file); + return ERR_PTR(-EINVAL); +} + +static const struct mlxfw_mfa2_tlv_multi * +mlxfw_mfa2_tlv_dev_get(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, u16 psid_size) +{ + const struct mlxfw_mfa2_tlv_psid *tlv_psid; + const struct 
mlxfw_mfa2_tlv_multi *dev_multi; + const struct mlxfw_mfa2_tlv *dev_tlv; + const struct mlxfw_mfa2_tlv *tlv; + u32 idx; + + /* for each device tlv */ + mlxfw_mfa2_tlv_foreach(mfa2_file, dev_tlv, idx, mfa2_file->first_dev, + mfa2_file->dev_count) { + if (!dev_tlv) + return NULL; + + dev_multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, dev_tlv); + if (!dev_multi) + return NULL; + + /* find psid child and compare */ + tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, dev_multi, + MLXFW_MFA2_TLV_PSID, 0); + if (!tlv) + return NULL; + if (be16_to_cpu(tlv->len) != psid_size) + continue; + + tlv_psid = mlxfw_mfa2_tlv_psid_get(mfa2_file, tlv); + if (!tlv_psid) + return NULL; + + if (memcmp(psid, tlv_psid->psid, psid_size) == 0) + return dev_multi; + } + + return NULL; +} + +int mlxfw_mfa2_file_component_count(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, u32 psid_size, + u32 *p_count) +{ + const struct mlxfw_mfa2_tlv_multi *dev_multi; + u16 count; + int err; + + dev_multi = mlxfw_mfa2_tlv_dev_get(mfa2_file, psid, psid_size); + if (!dev_multi) + return -EINVAL; + + err = mlxfw_mfa2_tlv_multi_child_count(mfa2_file, dev_multi, + MLXFW_MFA2_TLV_COMPONENT_PTR, + &count); + if (err) + return err; + + *p_count = count; + return 0; +} + +static int mlxfw_mfa2_xz_dec_run(struct xz_dec *xz_dec, struct xz_buf *xz_buf, + bool *finished) +{ + enum xz_ret xz_ret; + + xz_ret = xz_dec_run(xz_dec, xz_buf); + + switch (xz_ret) { + case XZ_STREAM_END: + *finished = true; + return 0; + case XZ_OK: + *finished = false; + return 0; + case XZ_MEM_ERROR: + pr_err("xz no memory\n"); + return -ENOMEM; + case XZ_DATA_ERROR: + pr_err("xz file corrupted\n"); + return -EINVAL; + case XZ_FORMAT_ERROR: + pr_err("xz format not found\n"); + return -EINVAL; + case XZ_OPTIONS_ERROR: + pr_err("unsupported xz option\n"); + return -EINVAL; + case XZ_MEMLIMIT_ERROR: + pr_err("xz dictionary too small\n"); + return -EINVAL; + default: + pr_err("xz error %d\n", xz_ret); + return -EINVAL; + } +} + +static int mlxfw_mfa2_file_cb_offset_xz(const struct mlxfw_mfa2_file *mfa2_file, + off_t off, size_t size, u8 *buf) +{ + struct xz_dec *xz_dec; + struct xz_buf dec_buf; + off_t curr_off = 0; + bool finished; + int err; + + xz_dec = xz_dec_init(XZ_DYNALLOC, (u32) -1); + if (!xz_dec) + return -EINVAL; + + dec_buf.in_size = mfa2_file->cb_archive_size; + dec_buf.in = mfa2_file->cb; + dec_buf.in_pos = 0; + dec_buf.out = buf; + + /* decode up to the offset */ + do { + dec_buf.out_pos = 0; + dec_buf.out_size = min_t(size_t, size, off - curr_off); + if (dec_buf.out_size == 0) + break; + + err = mlxfw_mfa2_xz_dec_run(xz_dec, &dec_buf, &finished); + if (err) + goto out; + if (finished) { + pr_err("xz section too short\n"); + err = -EINVAL; + goto out; + } + curr_off += dec_buf.out_pos; + } while (curr_off != off); + + /* decode the needed section */ + dec_buf.out_pos = 0; + dec_buf.out_size = size; + err = mlxfw_mfa2_xz_dec_run(xz_dec, &dec_buf, &finished); +out: + xz_dec_end(xz_dec); + return err; +} + +static const struct mlxfw_mfa2_tlv_component_descriptor * +mlxfw_mfa2_file_component_tlv_get(const struct mlxfw_mfa2_file *mfa2_file, + u16 comp_index) +{ + const struct mlxfw_mfa2_tlv_multi *multi; + const struct mlxfw_mfa2_tlv *multi_child; + const struct mlxfw_mfa2_tlv *comp_tlv; + + if (comp_index > mfa2_file->component_count) + return NULL; + + comp_tlv = mlxfw_mfa2_tlv_advance(mfa2_file, mfa2_file->first_component, + comp_index); + if (!comp_tlv) + return NULL; + + multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, comp_tlv); + if (!multi) + return 
NULL; + + multi_child = mlxfw_mfa2_tlv_multi_child(mfa2_file, multi); + if (!multi_child) + return NULL; + + return mlxfw_mfa2_tlv_component_descriptor_get(mfa2_file, multi_child); +} + +struct mlxfw_mfa2_comp_data { + struct mlxfw_mfa2_component comp; + u8 buff[0]; +}; + +static const struct mlxfw_mfa2_tlv_component_descriptor * +mlxfw_mfa2_file_component_find(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, int psid_size, + int component_index) +{ + const struct mlxfw_mfa2_tlv_component_ptr *cptr; + const struct mlxfw_mfa2_tlv_multi *dev_multi; + const struct mlxfw_mfa2_tlv *cptr_tlv; + u16 comp_idx; + + dev_multi = mlxfw_mfa2_tlv_dev_get(mfa2_file, psid, psid_size); + if (!dev_multi) + return NULL; + + cptr_tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, dev_multi, + MLXFW_MFA2_TLV_COMPONENT_PTR, + component_index); + if (!cptr_tlv) + return NULL; + + cptr = mlxfw_mfa2_tlv_component_ptr_get(mfa2_file, cptr_tlv); + if (!cptr) + return NULL; + + comp_idx = be16_to_cpu(cptr->component_index); + return mlxfw_mfa2_file_component_tlv_get(mfa2_file, comp_idx); +} + +struct mlxfw_mfa2_component * +mlxfw_mfa2_file_component_get(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, int psid_size, + int component_index) +{ + const struct mlxfw_mfa2_tlv_component_descriptor *comp; + struct mlxfw_mfa2_comp_data *comp_data; + u32 comp_buf_size; + off_t cb_offset; + u32 comp_size; + int err; + + comp = mlxfw_mfa2_file_component_find(mfa2_file, psid, psid_size, + component_index); + if (!comp) + return ERR_PTR(-EINVAL); + + cb_offset = (u64) be32_to_cpu(comp->cb_offset_h) << 32 | + be32_to_cpu(comp->cb_offset_l); + comp_size = be32_to_cpu(comp->size); + comp_buf_size = comp_size + mlxfw_mfa2_comp_magic_len; + + comp_data = kmalloc(sizeof(*comp_data) + comp_buf_size, GFP_KERNEL); + if (!comp_data) + return ERR_PTR(-ENOMEM); + comp_data->comp.data_size = comp_size; + comp_data->comp.index = be16_to_cpu(comp->identifier); + err = mlxfw_mfa2_file_cb_offset_xz(mfa2_file, cb_offset, comp_buf_size, + comp_data->buff); + if (err) { + pr_err("Component could not be reached in CB\n"); + goto err_out; + } + + if (memcmp(comp_data->buff, mlxfw_mfa2_comp_magic, + mlxfw_mfa2_comp_magic_len) != 0) { + pr_err("Component has wrong magic\n"); + err = -EINVAL; + goto err_out; + } + + comp_data->comp.data = comp_data->buff + mlxfw_mfa2_comp_magic_len; + return &comp_data->comp; +err_out: + kfree(comp_data); + return ERR_PTR(err); +} + +void mlxfw_mfa2_file_component_put(struct mlxfw_mfa2_component *comp) +{ + const struct mlxfw_mfa2_comp_data *comp_data; + + comp_data = container_of(comp, struct mlxfw_mfa2_comp_data, comp); + kfree(comp_data); +} + +void mlxfw_mfa2_file_fini(struct mlxfw_mfa2_file *mfa2_file) +{ + kfree(mfa2_file); +} diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h new file mode 100644 index 0000000..20472aa --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h @@ -0,0 +1,66 @@ +/* + * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXFW_MFA2_H +#define _MLXFW_MFA2_H + +#include +#include "mlxfw.h" + +struct mlxfw_mfa2_component { + u16 index; + u32 data_size; + u8 *data; +}; + +struct mlxfw_mfa2_file; + +bool mlxfw_mfa2_check(const struct firmware *fw); + +struct mlxfw_mfa2_file *mlxfw_mfa2_file_init(const struct firmware *fw); + +int mlxfw_mfa2_file_component_count(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, u32 psid_size, + u32 *p_count); + +struct mlxfw_mfa2_component * +mlxfw_mfa2_file_component_get(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, int psid_size, + int component_index); + +void mlxfw_mfa2_file_component_put(struct mlxfw_mfa2_component *component); + +void mlxfw_mfa2_file_fini(struct mlxfw_mfa2_file *mfa2_file); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h new file mode 100644 index 0000000..f667942 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h @@ -0,0 +1,60 @@ +/* + * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXFW_MFA2_FILE_H +#define _MLXFW_MFA2_FILE_H + +#include +#include + +struct mlxfw_mfa2_file { + const struct firmware *fw; + const struct mlxfw_mfa2_tlv *first_dev; + u16 dev_count; + const struct mlxfw_mfa2_tlv *first_component; + u16 component_count; + const void *cb; /* components block */ + u32 cb_archive_size; /* size of compressed components block */ +}; + +static inline bool mlxfw_mfa2_valid_ptr(const struct mlxfw_mfa2_file *mfa2_file, + const void *ptr) +{ + const void *valid_to = mfa2_file->fw->data + mfa2_file->fw->size; + const void *valid_from = mfa2_file->fw->data; + + return ptr > valid_from && ptr < valid_to; +} + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h new file mode 100644 index 0000000..dd66737 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h @@ -0,0 +1,103 @@ +/* + * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _MLXFW_MFA2_FORMAT_H +#define _MLXFW_MFA2_FORMAT_H + +#include "mlxfw_mfa2_file.h" +#include "mlxfw_mfa2_tlv.h" + +enum mlxfw_mfa2_tlv_type { + MLXFW_MFA2_TLV_MULTI_PART = 0x01, + MLXFW_MFA2_TLV_PACKAGE_DESCRIPTOR = 0x02, + MLXFW_MFA2_TLV_COMPONENT_DESCRIPTOR = 0x04, + MLXFW_MFA2_TLV_COMPONENT_PTR = 0x22, + MLXFW_MFA2_TLV_PSID = 0x2A, +}; + +enum mlxfw_mfa2_compression_type { + MLXFW_MFA2_COMPRESSION_TYPE_NONE, + MLXFW_MFA2_COMPRESSION_TYPE_XZ, +}; + +struct mlxfw_mfa2_tlv_package_descriptor { + __be16 num_components; + __be16 num_devices; + __be32 cb_offset; + __be32 cb_archive_size; + __be32 cb_size_h; + __be32 cb_size_l; + u8 padding[3]; + u8 cv_compression; + __be32 user_data_offset; +} __packed; + +MLXFW_MFA2_TLV(package_descriptor, struct mlxfw_mfa2_tlv_package_descriptor, + MLXFW_MFA2_TLV_PACKAGE_DESCRIPTOR); + +struct mlxfw_mfa2_tlv_multi { + __be16 num_extensions; + __be16 total_len; +} __packed; + +MLXFW_MFA2_TLV(multi, struct mlxfw_mfa2_tlv_multi, + MLXFW_MFA2_TLV_MULTI_PART); + +struct mlxfw_mfa2_tlv_psid { + u8 psid[0]; +} __packed; + +MLXFW_MFA2_TLV_VARSIZE(psid, struct mlxfw_mfa2_tlv_psid, + MLXFW_MFA2_TLV_PSID); + +struct mlxfw_mfa2_tlv_component_ptr { + __be16 storage_id; + __be16 component_index; + __be32 storage_address; +} __packed; + +MLXFW_MFA2_TLV(component_ptr, struct mlxfw_mfa2_tlv_component_ptr, + MLXFW_MFA2_TLV_COMPONENT_PTR); + +struct mlxfw_mfa2_tlv_component_descriptor { + __be16 pldm_classification; + __be16 identifier; + __be32 cb_offset_h; + __be32 cb_offset_l; + __be32 size; +} __packed; + +MLXFW_MFA2_TLV(component_descriptor, struct mlxfw_mfa2_tlv_component_descriptor, + MLXFW_MFA2_TLV_COMPONENT_DESCRIPTOR); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h new file mode 100644 index 0000000..cc013e7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h @@ -0,0 +1,98 @@ +/* + * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXFW_MFA2_TLV_H +#define _MLXFW_MFA2_TLV_H + +#include +#include "mlxfw_mfa2_file.h" + +struct mlxfw_mfa2_tlv { + u8 version; + u8 type; + __be16 len; + u8 data[0]; +} __packed; + +static inline const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_get(const struct mlxfw_mfa2_file *mfa2_file, const void *ptr) +{ + if (!mlxfw_mfa2_valid_ptr(mfa2_file, ptr) || + !mlxfw_mfa2_valid_ptr(mfa2_file, ptr + sizeof(struct mlxfw_mfa2_tlv))) + return NULL; + return ptr; +} + +static inline const void * +mlxfw_mfa2_tlv_payload_get(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *tlv, u8 payload_type, + size_t payload_size, bool varsize) +{ + void *tlv_top; + + tlv_top = (void *) tlv + be16_to_cpu(tlv->len) - 1; + if (!mlxfw_mfa2_valid_ptr(mfa2_file, tlv) || + !mlxfw_mfa2_valid_ptr(mfa2_file, tlv_top)) + return NULL; + if (tlv->type != payload_type) + return NULL; + if (varsize && (be16_to_cpu(tlv->len) < payload_size)) + return NULL; + if (!varsize && (be16_to_cpu(tlv->len) != payload_size)) + return NULL; + + return tlv->data; +} + +#define MLXFW_MFA2_TLV(name, payload_type, tlv_type) \ +static inline const payload_type * \ +mlxfw_mfa2_tlv_ ## name ## _get(const struct mlxfw_mfa2_file *mfa2_file, \ + const struct mlxfw_mfa2_tlv *tlv) \ +{ \ + return mlxfw_mfa2_tlv_payload_get(mfa2_file, tlv, \ + tlv_type, sizeof(payload_type), \ + false); \ +} + +#define MLXFW_MFA2_TLV_VARSIZE(name, payload_type, tlv_type) \ +static inline const payload_type * \ +mlxfw_mfa2_tlv_ ## name ## _get(const struct mlxfw_mfa2_file *mfa2_file, \ + const struct mlxfw_mfa2_tlv *tlv) \ +{ \ + return mlxfw_mfa2_tlv_payload_get(mfa2_file, tlv, \ + tlv_type, sizeof(payload_type), \ + true); \ +} + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c new file mode 100644 index 0000000..0094b92 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c @@ -0,0 +1,126 @@ +/* + * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#define pr_fmt(fmt) "MFA2: " fmt + +#include "mlxfw_mfa2_tlv_multi.h" +#include + +#define MLXFW_MFA2_TLV_TOTAL_SIZE(tlv) \ + NLA_ALIGN(sizeof(*(tlv)) + be16_to_cpu((tlv)->len)) + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_multi_child(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi) +{ + size_t multi_len; + + multi_len = NLA_ALIGN(sizeof(struct mlxfw_mfa2_tlv_multi)); + return mlxfw_mfa2_tlv_get(mfa2_file, (void *) multi + multi_len); +} + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_next(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *tlv) +{ + const struct mlxfw_mfa2_tlv_multi *multi; + u16 tlv_len; + void *next; + + tlv_len = MLXFW_MFA2_TLV_TOTAL_SIZE(tlv); + + if (tlv->type == MLXFW_MFA2_TLV_MULTI_PART) { + multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, tlv); + tlv_len = NLA_ALIGN(tlv_len + be16_to_cpu(multi->total_len)); + } + + next = (void *) tlv + tlv_len; + return mlxfw_mfa2_tlv_get(mfa2_file, next); +} + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_advance(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *from_tlv, u16 count) +{ + const struct mlxfw_mfa2_tlv *tlv; + u16 idx; + + mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, from_tlv, count) + if (!tlv) + return NULL; + return tlv; +} + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_multi_child_find(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi, + enum mlxfw_mfa2_tlv_type type, u16 index) +{ + const struct mlxfw_mfa2_tlv *tlv; + u16 skip = 0; + u16 idx; + + mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) { + if (!tlv) { + pr_err("TLV parsing error\n"); + return NULL; + } + if (tlv->type == type) + if (skip++ == index) + return tlv; + } + return NULL; +} + +int mlxfw_mfa2_tlv_multi_child_count(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi, + enum mlxfw_mfa2_tlv_type type, + u16 *p_count) +{ + const struct mlxfw_mfa2_tlv *tlv; + u16 count = 0; + u16 idx; + + mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) { + if (!tlv) { + pr_err("TLV parsing error\n"); + return -EINVAL; + } + + if (tlv->type == type) + count++; + } + *p_count = count; + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h new file 
mode 100644 index 0000000..2c66789 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h @@ -0,0 +1,71 @@ +/* + * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _MLXFW_MFA2_TLV_MULTI_H +#define _MLXFW_MFA2_TLV_MULTI_H + +#include "mlxfw_mfa2_tlv.h" +#include "mlxfw_mfa2_format.h" +#include "mlxfw_mfa2_file.h" + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_multi_child(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi); + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_next(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *tlv); + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_advance(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *from_tlv, u16 count); + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_multi_child_find(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi, + enum mlxfw_mfa2_tlv_type type, u16 index); + +int mlxfw_mfa2_tlv_multi_child_count(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi, + enum mlxfw_mfa2_tlv_type type, + u16 *p_count); + +#define mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, from_tlv, count) \ + for (idx = 0, tlv = from_tlv; idx < (count); \ + idx++, tlv = mlxfw_mfa2_tlv_next(mfa2_file, tlv)) + +#define mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) \ + mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, \ + mlxfw_mfa2_tlv_multi_child(mfa2_file, multi), \ + be16_to_cpu(multi->num_extensions) + 1) +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/Kconfig b/drivers/net/ethernet/mellanox/mlxsw/Kconfig new file mode 100644 index 0000000..f4d9c99 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/Kconfig @@ -0,0 +1,108 @@ +# +# Mellanox switch drivers configuration +# + +config MLXSW_CORE + tristate "Mellanox Technologies Switch ASICs support" + depends on MAY_USE_DEVLINK + ---help--- + This driver supports Mellanox Technologies Switch ASICs family. + + To compile this driver as a module, choose M here: the + module will be called mlxsw_core. + +config MLXSW_CORE_HWMON + bool "HWMON support for Mellanox Technologies Switch ASICs" + depends on MLXSW_CORE && HWMON + depends on !(MLXSW_CORE=y && HWMON=m) + default y + ---help--- + Say Y here if you want to expose HWMON interface on mlxsw devices. + +config MLXSW_CORE_THERMAL + bool "Thermal zone support for Mellanox Technologies Switch ASICs" + depends on MLXSW_CORE && THERMAL + depends on !(MLXSW_CORE=y && THERMAL=m) + default y + ---help--- + Say Y here if you want to automatically control fans speed according + ambient temperature reported by ASIC. + +config MLXSW_PCI + tristate "PCI bus implementation for Mellanox Technologies Switch ASICs" + depends on PCI && HAS_DMA && HAS_IOMEM && MLXSW_CORE + default m + ---help--- + This is PCI bus implementation for Mellanox Technologies Switch ASICs. + + To compile this driver as a module, choose M here: the + module will be called mlxsw_pci. + +config MLXSW_I2C + tristate "I2C bus implementation for Mellanox Technologies Switch ASICs" + depends on I2C && MLXSW_CORE + default m + ---help--- + This is I2C bus implementation for Mellanox Technologies Switch ASICs. + + To compile this driver as a module, choose M here: the + module will be called mlxsw_i2c. + +config MLXSW_SWITCHIB + tristate "Mellanox Technologies SwitchIB and SwitchIB-2 support" + depends on MLXSW_CORE && MLXSW_PCI && NET_SWITCHDEV + default m + ---help--- + This driver supports Mellanox Technologies SwitchIB and SwitchIB-2 + Infiniband Switch ASICs. + + To compile this driver as a module, choose M here: the + module will be called mlxsw_switchib. 
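Stepping back to the MFA2 TLV helpers declared in mlxfw_mfa2_tlv_multi.h just above, the following minimal sketch shows how a caller might combine the child-count and child-find accessors; example_first_child_of_type() is a hypothetical helper written for illustration only, while everything it calls is declared in that header.

#include "mlxfw_mfa2_tlv_multi.h"

/* Hypothetical usage sketch: locate the first child TLV of a given type under
 * a multi-TLV and report how many children of that type exist, using only the
 * helpers declared in mlxfw_mfa2_tlv_multi.h.
 */
static const struct mlxfw_mfa2_tlv *
example_first_child_of_type(const struct mlxfw_mfa2_file *mfa2_file,
                            const struct mlxfw_mfa2_tlv_multi *multi,
                            enum mlxfw_mfa2_tlv_type type, u16 *p_count)
{
        /* child_count() walks the children itself and returns -EINVAL on a
         * parsing error, which saves the caller from open-coding the
         * mlxfw_mfa2_tlv_multi_foreach() loop.
         */
        if (mlxfw_mfa2_tlv_multi_child_count(mfa2_file, multi, type, p_count))
                return NULL;
        if (!*p_count)
                return NULL;
        /* The index is zero based, so 0 selects the first matching child. */
        return mlxfw_mfa2_tlv_multi_child_find(mfa2_file, multi, type, 0);
}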
+ +config MLXSW_SWITCHX2 + tristate "Mellanox Technologies SwitchX-2 support" + depends on MLXSW_CORE && MLXSW_PCI && NET_SWITCHDEV + default m + ---help--- + This driver supports Mellanox Technologies SwitchX-2 Ethernet + Switch ASICs. + + To compile this driver as a module, choose M here: the + module will be called mlxsw_switchx2. + +config MLXSW_SPECTRUM + tristate "Mellanox Technologies Spectrum support" + depends on MLXSW_CORE && MLXSW_PCI && NET_SWITCHDEV && VLAN_8021Q + depends on PSAMPLE || PSAMPLE=n + depends on BRIDGE || BRIDGE=n + depends on IPV6 || IPV6=n + depends on NET_IPGRE || NET_IPGRE=n + depends on IPV6_GRE || IPV6_GRE=n + select PARMAN + select MLXFW + default m + ---help--- + This driver supports Mellanox Technologies Spectrum Ethernet + Switch ASICs. + + To compile this driver as a module, choose M here: the + module will be called mlxsw_spectrum. + +config MLXSW_SPECTRUM_DCB + bool "Data Center Bridging (DCB) support" + depends on MLXSW_SPECTRUM && DCB + default y + ---help--- + Say Y here if you want to use Data Center Bridging (DCB) in the + driver. + +config MLXSW_MINIMAL + tristate "Mellanox Technologies minimal I2C support" + depends on MLXSW_CORE && MLXSW_I2C + default m + ---help--- + This driver supports I2C access for Mellanox Technologies Switch + ASICs. + + To compile this driver as a module, choose M here: the + module will be called mlxsw_minimal. diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile new file mode 100644 index 0000000..0cadcab --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MLXSW_CORE) += mlxsw_core.o +mlxsw_core-objs := core.o core_acl_flex_keys.o \ + core_acl_flex_actions.o +mlxsw_core-$(CONFIG_MLXSW_CORE_HWMON) += core_hwmon.o +mlxsw_core-$(CONFIG_MLXSW_CORE_THERMAL) += core_thermal.o +obj-$(CONFIG_MLXSW_PCI) += mlxsw_pci.o +mlxsw_pci-objs := pci.o +obj-$(CONFIG_MLXSW_I2C) += mlxsw_i2c.o +mlxsw_i2c-objs := i2c.o +obj-$(CONFIG_MLXSW_SWITCHIB) += mlxsw_switchib.o +mlxsw_switchib-objs := switchib.o +obj-$(CONFIG_MLXSW_SWITCHX2) += mlxsw_switchx2.o +mlxsw_switchx2-objs := switchx2.o +obj-$(CONFIG_MLXSW_SPECTRUM) += mlxsw_spectrum.o +mlxsw_spectrum-objs := spectrum.o spectrum_buffers.o \ + spectrum_switchdev.o spectrum_router.o \ + spectrum_kvdl.o spectrum_acl_tcam.o \ + spectrum_acl.o spectrum_flower.o \ + spectrum_cnt.o spectrum_fid.o \ + spectrum_ipip.o spectrum_acl_flex_actions.o \ + spectrum_mr.o spectrum_mr_tcam.o \ + spectrum_qdisc.o spectrum_span.o +mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB) += spectrum_dcb.o +mlxsw_spectrum-$(CONFIG_NET_DEVLINK) += spectrum_dpipe.o +obj-$(CONFIG_MLXSW_MINIMAL) += mlxsw_minimal.o +mlxsw_minimal-objs := minimal.o diff --git a/drivers/net/ethernet/mellanox/mlxsw/cmd.h b/drivers/net/ethernet/mellanox/mlxsw/cmd.h new file mode 100644 index 0000000..479511c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/cmd.h @@ -0,0 +1,1183 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/cmd.h + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015 Jiri Pirko + * Copyright (c) 2015 Ido Schimmel + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_CMD_H +#define _MLXSW_CMD_H + +#include "item.h" + +#define MLXSW_CMD_MBOX_SIZE 4096 + +static inline char *mlxsw_cmd_mbox_alloc(void) +{ + return kzalloc(MLXSW_CMD_MBOX_SIZE, GFP_KERNEL); +} + +static inline void mlxsw_cmd_mbox_free(char *mbox) +{ + kfree(mbox); +} + +static inline void mlxsw_cmd_mbox_zero(char *mbox) +{ + memset(mbox, 0, MLXSW_CMD_MBOX_SIZE); +} + +struct mlxsw_core; + +int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod, + u32 in_mod, bool out_mbox_direct, + char *in_mbox, size_t in_mbox_size, + char *out_mbox, size_t out_mbox_size); + +static inline int mlxsw_cmd_exec_in(struct mlxsw_core *mlxsw_core, u16 opcode, + u8 opcode_mod, u32 in_mod, char *in_mbox, + size_t in_mbox_size) +{ + return mlxsw_cmd_exec(mlxsw_core, opcode, opcode_mod, in_mod, false, + in_mbox, in_mbox_size, NULL, 0); +} + +static inline int mlxsw_cmd_exec_out(struct mlxsw_core *mlxsw_core, u16 opcode, + u8 opcode_mod, u32 in_mod, + bool out_mbox_direct, + char *out_mbox, size_t out_mbox_size) +{ + return mlxsw_cmd_exec(mlxsw_core, opcode, opcode_mod, in_mod, + out_mbox_direct, NULL, 0, + out_mbox, out_mbox_size); +} + +static inline int mlxsw_cmd_exec_none(struct mlxsw_core *mlxsw_core, u16 opcode, + u8 opcode_mod, u32 in_mod) +{ + return mlxsw_cmd_exec(mlxsw_core, opcode, opcode_mod, in_mod, false, + NULL, 0, NULL, 0); +} + +enum mlxsw_cmd_opcode { + MLXSW_CMD_OPCODE_QUERY_FW = 0x004, + MLXSW_CMD_OPCODE_QUERY_BOARDINFO = 0x006, + MLXSW_CMD_OPCODE_QUERY_AQ_CAP = 0x003, + MLXSW_CMD_OPCODE_MAP_FA = 0xFFF, + MLXSW_CMD_OPCODE_UNMAP_FA = 0xFFE, + MLXSW_CMD_OPCODE_CONFIG_PROFILE = 0x100, + MLXSW_CMD_OPCODE_ACCESS_REG = 0x040, + MLXSW_CMD_OPCODE_SW2HW_DQ = 0x201, + MLXSW_CMD_OPCODE_HW2SW_DQ = 0x202, + MLXSW_CMD_OPCODE_2ERR_DQ = 0x01E, + MLXSW_CMD_OPCODE_QUERY_DQ = 0x022, + MLXSW_CMD_OPCODE_SW2HW_CQ = 0x016, + MLXSW_CMD_OPCODE_HW2SW_CQ = 0x017, + MLXSW_CMD_OPCODE_QUERY_CQ = 0x018, + MLXSW_CMD_OPCODE_SW2HW_EQ = 0x013, + MLXSW_CMD_OPCODE_HW2SW_EQ = 0x014, + MLXSW_CMD_OPCODE_QUERY_EQ = 0x015, + MLXSW_CMD_OPCODE_QUERY_RESOURCES = 0x101, +}; + +static inline const char *mlxsw_cmd_opcode_str(u16 
opcode) +{ + switch (opcode) { + case MLXSW_CMD_OPCODE_QUERY_FW: + return "QUERY_FW"; + case MLXSW_CMD_OPCODE_QUERY_BOARDINFO: + return "QUERY_BOARDINFO"; + case MLXSW_CMD_OPCODE_QUERY_AQ_CAP: + return "QUERY_AQ_CAP"; + case MLXSW_CMD_OPCODE_MAP_FA: + return "MAP_FA"; + case MLXSW_CMD_OPCODE_UNMAP_FA: + return "UNMAP_FA"; + case MLXSW_CMD_OPCODE_CONFIG_PROFILE: + return "CONFIG_PROFILE"; + case MLXSW_CMD_OPCODE_ACCESS_REG: + return "ACCESS_REG"; + case MLXSW_CMD_OPCODE_SW2HW_DQ: + return "SW2HW_DQ"; + case MLXSW_CMD_OPCODE_HW2SW_DQ: + return "HW2SW_DQ"; + case MLXSW_CMD_OPCODE_2ERR_DQ: + return "2ERR_DQ"; + case MLXSW_CMD_OPCODE_QUERY_DQ: + return "QUERY_DQ"; + case MLXSW_CMD_OPCODE_SW2HW_CQ: + return "SW2HW_CQ"; + case MLXSW_CMD_OPCODE_HW2SW_CQ: + return "HW2SW_CQ"; + case MLXSW_CMD_OPCODE_QUERY_CQ: + return "QUERY_CQ"; + case MLXSW_CMD_OPCODE_SW2HW_EQ: + return "SW2HW_EQ"; + case MLXSW_CMD_OPCODE_HW2SW_EQ: + return "HW2SW_EQ"; + case MLXSW_CMD_OPCODE_QUERY_EQ: + return "QUERY_EQ"; + case MLXSW_CMD_OPCODE_QUERY_RESOURCES: + return "QUERY_RESOURCES"; + default: + return "*UNKNOWN*"; + } +} + +enum mlxsw_cmd_status { + /* Command execution succeeded. */ + MLXSW_CMD_STATUS_OK = 0x00, + /* Internal error (e.g. bus error) occurred while processing command. */ + MLXSW_CMD_STATUS_INTERNAL_ERR = 0x01, + /* Operation/command not supported or opcode modifier not supported. */ + MLXSW_CMD_STATUS_BAD_OP = 0x02, + /* Parameter not supported, parameter out of range. */ + MLXSW_CMD_STATUS_BAD_PARAM = 0x03, + /* System was not enabled or bad system state. */ + MLXSW_CMD_STATUS_BAD_SYS_STATE = 0x04, + /* Attempt to access reserved or unallocated resource, or resource in + * inappropriate ownership. + */ + MLXSW_CMD_STATUS_BAD_RESOURCE = 0x05, + /* Requested resource is currently executing a command. */ + MLXSW_CMD_STATUS_RESOURCE_BUSY = 0x06, + /* Required capability exceeds device limits. */ + MLXSW_CMD_STATUS_EXCEED_LIM = 0x08, + /* Resource is not in the appropriate state or ownership. */ + MLXSW_CMD_STATUS_BAD_RES_STATE = 0x09, + /* Index out of range (might be beyond table size or attempt to + * access a reserved resource). + */ + MLXSW_CMD_STATUS_BAD_INDEX = 0x0A, + /* NVMEM checksum/CRC failed. */ + MLXSW_CMD_STATUS_BAD_NVMEM = 0x0B, + /* Bad management packet (silently discarded). */ + MLXSW_CMD_STATUS_BAD_PKT = 0x30, +}; + +static inline const char *mlxsw_cmd_status_str(u8 status) +{ + switch (status) { + case MLXSW_CMD_STATUS_OK: + return "OK"; + case MLXSW_CMD_STATUS_INTERNAL_ERR: + return "INTERNAL_ERR"; + case MLXSW_CMD_STATUS_BAD_OP: + return "BAD_OP"; + case MLXSW_CMD_STATUS_BAD_PARAM: + return "BAD_PARAM"; + case MLXSW_CMD_STATUS_BAD_SYS_STATE: + return "BAD_SYS_STATE"; + case MLXSW_CMD_STATUS_BAD_RESOURCE: + return "BAD_RESOURCE"; + case MLXSW_CMD_STATUS_RESOURCE_BUSY: + return "RESOURCE_BUSY"; + case MLXSW_CMD_STATUS_EXCEED_LIM: + return "EXCEED_LIM"; + case MLXSW_CMD_STATUS_BAD_RES_STATE: + return "BAD_RES_STATE"; + case MLXSW_CMD_STATUS_BAD_INDEX: + return "BAD_INDEX"; + case MLXSW_CMD_STATUS_BAD_NVMEM: + return "BAD_NVMEM"; + case MLXSW_CMD_STATUS_BAD_PKT: + return "BAD_PKT"; + default: + return "*UNKNOWN*"; + } +} + +/* QUERY_FW - Query Firmware + * ------------------------- + * OpMod == 0, INMmod == 0 + * ----------------------- + * The QUERY_FW command retrieves information related to firmware, command + * interface version and the amount of resources that should be allocated to + * the firmware. 
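+ *
+ * For illustration: a typical caller such as the PCI bus driver allocates a
+ * mailbox with mlxsw_cmd_mbox_alloc(), issues mlxsw_cmd_query_fw() and then
+ * reads individual fields through the mlxsw_cmd_mbox_query_fw_*_get()
+ * helpers generated by the MLXSW_ITEM32()/MLXSW_ITEM64() definitions below,
+ * for example fw_pages when sizing the firmware area later passed to MAP_FA.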
+ */
+
+static inline int mlxsw_cmd_query_fw(struct mlxsw_core *mlxsw_core,
+                                     char *out_mbox)
+{
+        return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_FW,
+                                  0, 0, false, out_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* cmd_mbox_query_fw_fw_pages
+ * Amount of physical memory to be allocated for firmware usage in 4KB pages.
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_pages, 0x00, 16, 16);
+
+/* cmd_mbox_query_fw_fw_rev_major
+ * Firmware Revision - Major
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_rev_major, 0x00, 0, 16);
+
+/* cmd_mbox_query_fw_fw_rev_subminor
+ * Firmware Sub-minor version (Patch level)
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_rev_subminor, 0x04, 16, 16);
+
+/* cmd_mbox_query_fw_fw_rev_minor
+ * Firmware Revision - Minor
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_rev_minor, 0x04, 0, 16);
+
+/* cmd_mbox_query_fw_core_clk
+ * Internal Clock Frequency (in MHz)
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, core_clk, 0x08, 16, 16);
+
+/* cmd_mbox_query_fw_cmd_interface_rev
+ * Command Interface Interpreter Revision ID. This number is bumped up
+ * every time a non-backward-compatible change is done for the command
+ * interface. The current cmd_interface_rev is 1.
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, cmd_interface_rev, 0x08, 0, 16);
+
+/* cmd_mbox_query_fw_dt
+ * If set, Debug Trace is supported
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, dt, 0x0C, 31, 1);
+
+/* cmd_mbox_query_fw_api_version
+ * Indicates the version of the API, to enable software querying
+ * for compatibility. The current api_version is 1.
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, api_version, 0x0C, 0, 16);
+
+/* cmd_mbox_query_fw_fw_hour
+ * Firmware timestamp - hour
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_hour, 0x10, 24, 8);
+
+/* cmd_mbox_query_fw_fw_minutes
+ * Firmware timestamp - minutes
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_minutes, 0x10, 16, 8);
+
+/* cmd_mbox_query_fw_fw_seconds
+ * Firmware timestamp - seconds
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_seconds, 0x10, 8, 8);
+
+/* cmd_mbox_query_fw_fw_year
+ * Firmware timestamp - year
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_year, 0x14, 16, 16);
+
+/* cmd_mbox_query_fw_fw_month
+ * Firmware timestamp - month
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_month, 0x14, 8, 8);
+
+/* cmd_mbox_query_fw_fw_day
+ * Firmware timestamp - day
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_day, 0x14, 0, 8);
+
+/* cmd_mbox_query_fw_clr_int_base_offset
+ * Clear Interrupt register's offset from clr_int_bar register
+ * in PCI address space.
+ */
+MLXSW_ITEM64(cmd_mbox, query_fw, clr_int_base_offset, 0x20, 0, 64);
+
+/* cmd_mbox_query_fw_clr_int_bar
+ * PCI base address register (BAR) where clr_int register is located.
+ * 00 - BAR 0-1 (64 bit BAR)
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, clr_int_bar, 0x28, 30, 2);
+
+/* cmd_mbox_query_fw_error_buf_offset
+ * Read-only buffer for internal error reports. Offset
+ * from the error_buf_bar register in PCI address space.
+ */
+MLXSW_ITEM64(cmd_mbox, query_fw, error_buf_offset, 0x30, 0, 64);
+
+/* cmd_mbox_query_fw_error_buf_size
+ * Internal error buffer size in DWORDs
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, error_buf_size, 0x38, 0, 32);
+
+/* cmd_mbox_query_fw_error_int_bar
+ * PCI base address register (BAR) where error buffer
+ * register is located.
+ * 00 - BAR 0-1 (64 bit BAR) + */ +MLXSW_ITEM32(cmd_mbox, query_fw, error_int_bar, 0x3C, 30, 2); + +/* cmd_mbox_query_fw_doorbell_page_offset + * Offset of the doorbell page + */ +MLXSW_ITEM64(cmd_mbox, query_fw, doorbell_page_offset, 0x40, 0, 64); + +/* cmd_mbox_query_fw_doorbell_page_bar + * PCI base address register (BAR) of the doorbell page + * 00 - BAR 0-1 (64 bit BAR) + */ +MLXSW_ITEM32(cmd_mbox, query_fw, doorbell_page_bar, 0x48, 30, 2); + +/* QUERY_BOARDINFO - Query Board Information + * ----------------------------------------- + * OpMod == 0 (N/A), INMmod == 0 (N/A) + * ----------------------------------- + * The QUERY_BOARDINFO command retrieves adapter specific parameters. + */ + +static inline int mlxsw_cmd_boardinfo(struct mlxsw_core *mlxsw_core, + char *out_mbox) +{ + return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_BOARDINFO, + 0, 0, false, out_mbox, MLXSW_CMD_MBOX_SIZE); +} + +/* cmd_mbox_boardinfo_intapin + * When PCIe interrupt messages are being used, this value is used for clearing + * an interrupt. When using MSI-X, this register is not used. + */ +MLXSW_ITEM32(cmd_mbox, boardinfo, intapin, 0x10, 24, 8); + +/* cmd_mbox_boardinfo_vsd_vendor_id + * PCISIG Vendor ID (www.pcisig.com/membership/vid_search) of the vendor + * specifying/formatting the VSD. The vsd_vendor_id identifies the management + * domain of the VSD/PSID data. Different vendors may choose different VSD/PSID + * format and encoding as long as they use their assigned vsd_vendor_id. + */ +MLXSW_ITEM32(cmd_mbox, boardinfo, vsd_vendor_id, 0x1C, 0, 16); + +/* cmd_mbox_boardinfo_vsd + * Vendor Specific Data. The VSD string that is burnt to the Flash + * with the firmware. + */ +#define MLXSW_CMD_BOARDINFO_VSD_LEN 208 +MLXSW_ITEM_BUF(cmd_mbox, boardinfo, vsd, 0x20, MLXSW_CMD_BOARDINFO_VSD_LEN); + +/* cmd_mbox_boardinfo_psid + * The PSID field is a 16-ascii (byte) character string which acts as + * the board ID. The PSID format is used in conjunction with + * Mellanox vsd_vendor_id (15B3h). + */ +#define MLXSW_CMD_BOARDINFO_PSID_LEN 16 +MLXSW_ITEM_BUF(cmd_mbox, boardinfo, psid, 0xF0, MLXSW_CMD_BOARDINFO_PSID_LEN); + +/* QUERY_AQ_CAP - Query Asynchronous Queues Capabilities + * ----------------------------------------------------- + * OpMod == 0 (N/A), INMmod == 0 (N/A) + * ----------------------------------- + * The QUERY_AQ_CAP command returns the device asynchronous queues + * capabilities supported. + */ + +static inline int mlxsw_cmd_query_aq_cap(struct mlxsw_core *mlxsw_core, + char *out_mbox) +{ + return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_AQ_CAP, + 0, 0, false, out_mbox, MLXSW_CMD_MBOX_SIZE); +} + +/* cmd_mbox_query_aq_cap_log_max_sdq_sz + * Log (base 2) of max WQEs allowed on SDQ. + */ +MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_sdq_sz, 0x00, 24, 8); + +/* cmd_mbox_query_aq_cap_max_num_sdqs + * Maximum number of SDQs. + */ +MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_num_sdqs, 0x00, 0, 8); + +/* cmd_mbox_query_aq_cap_log_max_rdq_sz + * Log (base 2) of max WQEs allowed on RDQ. + */ +MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_rdq_sz, 0x04, 24, 8); + +/* cmd_mbox_query_aq_cap_max_num_rdqs + * Maximum number of RDQs. + */ +MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_num_rdqs, 0x04, 0, 8); + +/* cmd_mbox_query_aq_cap_log_max_cq_sz + * Log (base 2) of max CQEs allowed on CQ. + */ +MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_cq_sz, 0x08, 24, 8); + +/* cmd_mbox_query_aq_cap_max_num_cqs + * Maximum number of CQs. 
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_num_cqs, 0x08, 0, 8);
+
+/* cmd_mbox_query_aq_cap_log_max_eq_sz
+ * Log (base 2) of max EQEs allowed on EQ.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_eq_sz, 0x0C, 24, 8);
+
+/* cmd_mbox_query_aq_cap_max_num_eqs
+ * Maximum number of EQs.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_num_eqs, 0x0C, 0, 8);
+
+/* cmd_mbox_query_aq_cap_max_sg_sq
+ * The maximum number of S/G list elements in an SDQ. An SDQ must not
+ * contain more S/G entries than indicated here.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_sg_sq, 0x10, 8, 8);
+
+/* cmd_mbox_query_aq_cap_max_sg_rq
+ * The maximum number of S/G list elements in an RDQ. An RDQ must not
+ * contain more S/G entries than indicated here.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_sg_rq, 0x10, 0, 8);
+
+/* MAP_FA - Map Firmware Area
+ * --------------------------
+ * OpMod == 0 (N/A), INMmod == Number of VPM entries
+ * -------------------------------------------------
+ * The MAP_FA command passes physical pages to the switch. These pages
+ * are used to store the device firmware. MAP_FA can be executed multiple
+ * times until all the firmware area is mapped (the size that should be
+ * mapped is retrieved through the QUERY_FW command). All required pages
+ * must be mapped to finish the initialization phase. Physical memory
+ * passed in this command must be pinned.
+ */
+
+#define MLXSW_CMD_MAP_FA_VPM_ENTRIES_MAX 32
+
+static inline int mlxsw_cmd_map_fa(struct mlxsw_core *mlxsw_core,
+                                   char *in_mbox, u32 vpm_entries_count)
+{
+        return mlxsw_cmd_exec_in(mlxsw_core, MLXSW_CMD_OPCODE_MAP_FA,
+                                 0, vpm_entries_count,
+                                 in_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* cmd_mbox_map_fa_pa
+ * Physical Address.
+ */
+MLXSW_ITEM64_INDEXED(cmd_mbox, map_fa, pa, 0x00, 12, 52, 0x08, 0x00, true);
+
+/* cmd_mbox_map_fa_log2size
+ * Log (base 2) of the size in 4KB pages of the physical and contiguous memory
+ * that starts at PA_L/H.
+ */
+MLXSW_ITEM32_INDEXED(cmd_mbox, map_fa, log2size, 0x00, 0, 5, 0x08, 0x04, false);
+
+/* UNMAP_FA - Unmap Firmware Area
+ * ------------------------------
+ * OpMod == 0 (N/A), INMmod == 0 (N/A)
+ * -----------------------------------
+ * The UNMAP_FA command unloads the firmware and unmaps all of the
+ * firmware area. After this command completes, the device will not access
+ * the pages that were mapped to the firmware area. After executing the
+ * UNMAP_FA command, a software reset must be performed before the MAP_FA
+ * command can be executed again.
+ */
+
+static inline int mlxsw_cmd_unmap_fa(struct mlxsw_core *mlxsw_core)
+{
+        return mlxsw_cmd_exec_none(mlxsw_core, MLXSW_CMD_OPCODE_UNMAP_FA, 0, 0);
+}
+
+/* QUERY_RESOURCES - Query chip resources
+ * --------------------------------------
+ * OpMod == 0 (N/A), INMmod is index
+ * ---------------------------------
+ * The QUERY_RESOURCES command retrieves information related to chip resources
+ * by resource ID. Each invocation returns 32 entries. INMmod is used as the
+ * base index: for example, index 1 returns entries 32-63. When the table ends
+ * and there are no more resources to report, the device returns resource ID
+ * 0xFFFF to indicate it.
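+ *
+ * For example, the core driver typically walks this table by querying index
+ * 0, 1, 2, ... in a loop, decoding up to 32 (id, data) pairs per mailbox
+ * through the cmd_mbox_query_resource_id/data getters below, and stopping
+ * once the end-of-table ID is seen or the maximum number of queries is
+ * reached.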
+ */ + +#define MLXSW_CMD_QUERY_RESOURCES_TABLE_END_ID 0xffff +#define MLXSW_CMD_QUERY_RESOURCES_MAX_QUERIES 100 +#define MLXSW_CMD_QUERY_RESOURCES_PER_QUERY 32 + +static inline int mlxsw_cmd_query_resources(struct mlxsw_core *mlxsw_core, + char *out_mbox, int index) +{ + return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_RESOURCES, + 0, index, false, out_mbox, + MLXSW_CMD_MBOX_SIZE); +} + +/* cmd_mbox_query_resource_id + * The resource id. 0xFFFF indicates table's end. + */ +MLXSW_ITEM32_INDEXED(cmd_mbox, query_resource, id, 0x00, 16, 16, 0x8, 0, false); + +/* cmd_mbox_query_resource_data + * The resource + */ +MLXSW_ITEM64_INDEXED(cmd_mbox, query_resource, data, + 0x00, 0, 40, 0x8, 0, false); + +/* CONFIG_PROFILE (Set) - Configure Switch Profile + * ------------------------------ + * OpMod == 1 (Set), INMmod == 0 (N/A) + * ----------------------------------- + * The CONFIG_PROFILE command sets the switch profile. The command can be + * executed on the device only once at startup in order to allocate and + * configure all switch resources and prepare it for operational mode. + * It is not possible to change the device profile after the chip is + * in operational mode. + * Failure of the CONFIG_PROFILE command leaves the hardware in an indeterminate + * state therefore it is required to perform software reset to the device + * following an unsuccessful completion of the command. It is required + * to perform software reset to the device to change an existing profile. + */ + +static inline int mlxsw_cmd_config_profile_set(struct mlxsw_core *mlxsw_core, + char *in_mbox) +{ + return mlxsw_cmd_exec_in(mlxsw_core, MLXSW_CMD_OPCODE_CONFIG_PROFILE, + 1, 0, in_mbox, MLXSW_CMD_MBOX_SIZE); +} + +/* cmd_mbox_config_profile_set_max_vepa_channels + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_max_vepa_channels, 0x0C, 0, 1); + +/* cmd_mbox_config_profile_set_max_lag + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_max_lag, 0x0C, 1, 1); + +/* cmd_mbox_config_profile_set_max_port_per_lag + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_max_port_per_lag, 0x0C, 2, 1); + +/* cmd_mbox_config_profile_set_max_mid + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_max_mid, 0x0C, 3, 1); + +/* cmd_mbox_config_profile_set_max_pgt + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_max_pgt, 0x0C, 4, 1); + +/* cmd_mbox_config_profile_set_max_system_port + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_max_system_port, 0x0C, 5, 1); + +/* cmd_mbox_config_profile_set_max_vlan_groups + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_max_vlan_groups, 0x0C, 6, 1); + +/* cmd_mbox_config_profile_set_max_regions + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. 
+ */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_max_regions, 0x0C, 7, 1); + +/* cmd_mbox_config_profile_set_flood_mode + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_flood_mode, 0x0C, 8, 1); + +/* cmd_mbox_config_profile_set_max_flood_tables + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_flood_tables, 0x0C, 9, 1); + +/* cmd_mbox_config_profile_set_max_ib_mc + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_max_ib_mc, 0x0C, 12, 1); + +/* cmd_mbox_config_profile_set_max_pkey + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_max_pkey, 0x0C, 13, 1); + +/* cmd_mbox_config_profile_set_adaptive_routing_group_cap + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, + set_adaptive_routing_group_cap, 0x0C, 14, 1); + +/* cmd_mbox_config_profile_set_ar_sec + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_ar_sec, 0x0C, 15, 1); + +/* cmd_mbox_config_set_kvd_linear_size + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_linear_size, 0x0C, 24, 1); + +/* cmd_mbox_config_set_kvd_hash_single_size + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_hash_single_size, 0x0C, 25, 1); + +/* cmd_mbox_config_set_kvd_hash_double_size + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_hash_double_size, 0x0C, 26, 1); + +/* cmd_mbox_config_profile_max_vepa_channels + * Maximum number of VEPA channels per port (0 through 16) + * 0 - multi-channel VEPA is disabled + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_vepa_channels, 0x10, 0, 8); + +/* cmd_mbox_config_profile_max_lag + * Maximum number of LAG IDs requested. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_lag, 0x14, 0, 16); + +/* cmd_mbox_config_profile_max_port_per_lag + * Maximum number of ports per LAG requested. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_port_per_lag, 0x18, 0, 16); + +/* cmd_mbox_config_profile_max_mid + * Maximum Multicast IDs. + * Multicast IDs are allocated from 0 to max_mid-1 + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_mid, 0x1C, 0, 16); + +/* cmd_mbox_config_profile_max_pgt + * Maximum records in the Port Group Table per Switch Partition. + * Port Group Table indexes are from 0 to max_pgt-1 + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_pgt, 0x20, 0, 16); + +/* cmd_mbox_config_profile_max_system_port + * The maximum number of system ports that can be allocated. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_system_port, 0x24, 0, 16); + +/* cmd_mbox_config_profile_max_vlan_groups + * Maximum number VLAN Groups for VLAN binding. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_vlan_groups, 0x28, 0, 12); + +/* cmd_mbox_config_profile_max_regions + * Maximum number of TCAM Regions. 
+ */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_regions, 0x2C, 0, 16); + +/* cmd_mbox_config_profile_max_flood_tables + * Maximum number of single-entry flooding tables. Different flooding tables + * can be associated with different packet types. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_flood_tables, 0x30, 16, 4); + +/* cmd_mbox_config_profile_max_vid_flood_tables + * Maximum number of per-vid flooding tables. Flooding tables are associated + * to the different packet types for the different switch partitions. + * Table size is 4K entries covering all VID space. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_vid_flood_tables, 0x30, 8, 4); + +/* cmd_mbox_config_profile_flood_mode + * Flooding mode to use. + * 0-2 - Backward compatible modes for SwitchX devices. + * 3 - Mixed mode, where: + * max_flood_tables indicates the number of single-entry tables. + * max_vid_flood_tables indicates the number of per-VID tables. + * max_fid_offset_flood_tables indicates the number of FID-offset tables. + * max_fid_flood_tables indicates the number of per-FID tables. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, flood_mode, 0x30, 0, 2); + +/* cmd_mbox_config_profile_max_fid_offset_flood_tables + * Maximum number of FID-offset flooding tables. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, + max_fid_offset_flood_tables, 0x34, 24, 4); + +/* cmd_mbox_config_profile_fid_offset_flood_table_size + * The size (number of entries) of each FID-offset flood table. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, + fid_offset_flood_table_size, 0x34, 0, 16); + +/* cmd_mbox_config_profile_max_fid_flood_tables + * Maximum number of per-FID flooding tables. + * + * Note: This flooding tables cover special FIDs only (vFIDs), starting at + * FID value 4K and higher. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_fid_flood_tables, 0x38, 24, 4); + +/* cmd_mbox_config_profile_fid_flood_table_size + * The size (number of entries) of each per-FID table. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, fid_flood_table_size, 0x38, 0, 16); + +/* cmd_mbox_config_profile_max_ib_mc + * Maximum number of multicast FDB records for InfiniBand + * FDB (in 512 chunks) per InfiniBand switch partition. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_ib_mc, 0x40, 0, 15); + +/* cmd_mbox_config_profile_max_pkey + * Maximum per port PKEY table size (for PKEY enforcement) + */ +MLXSW_ITEM32(cmd_mbox, config_profile, max_pkey, 0x44, 0, 15); + +/* cmd_mbox_config_profile_ar_sec + * Primary/secondary capability + * Describes the number of adaptive routing sub-groups + * 0 - disable primary/secondary (single group) + * 1 - enable primary/secondary (2 sub-groups) + * 2 - 3 sub-groups: Not supported in SwitchX, SwitchX-2 + * 3 - 4 sub-groups: Not supported in SwitchX, SwitchX-2 + */ +MLXSW_ITEM32(cmd_mbox, config_profile, ar_sec, 0x4C, 24, 2); + +/* cmd_mbox_config_profile_adaptive_routing_group_cap + * Adaptive Routing Group Capability. Indicates the number of AR groups + * supported. Note that when Primary/secondary is enabled, each + * primary/secondary couple consumes 2 adaptive routing entries. 
+ */ +MLXSW_ITEM32(cmd_mbox, config_profile, adaptive_routing_group_cap, 0x4C, 0, 16); + +/* cmd_mbox_config_profile_arn + * Adaptive Routing Notification Enable + * Not supported in SwitchX, SwitchX-2 + */ +MLXSW_ITEM32(cmd_mbox, config_profile, arn, 0x50, 31, 1); + +/* cmd_mbox_config_kvd_linear_size + * KVD Linear Size + * Valid for Spectrum only + * Allowed values are 128*N where N=0 or higher + */ +MLXSW_ITEM32(cmd_mbox, config_profile, kvd_linear_size, 0x54, 0, 24); + +/* cmd_mbox_config_kvd_hash_single_size + * KVD Hash single-entries size + * Valid for Spectrum only + * Allowed values are 128*N where N=0 or higher + * Must be greater or equal to cap_min_kvd_hash_single_size + * Must be smaller or equal to cap_kvd_size - kvd_linear_size + */ +MLXSW_ITEM32(cmd_mbox, config_profile, kvd_hash_single_size, 0x58, 0, 24); + +/* cmd_mbox_config_kvd_hash_double_size + * KVD Hash double-entries size (units of single-size entries) + * Valid for Spectrum only + * Allowed values are 128*N where N=0 or higher + * Must be either 0 or greater or equal to cap_min_kvd_hash_double_size + * Must be smaller or equal to cap_kvd_size - kvd_linear_size + */ +MLXSW_ITEM32(cmd_mbox, config_profile, kvd_hash_double_size, 0x5C, 0, 24); + +/* cmd_mbox_config_profile_swid_config_mask + * Modify Switch Partition Configuration mask. When set, the configu- + * ration value for the Switch Partition are taken from the mailbox. + * When clear, the current configuration values are used. + * Bit 0 - set type + * Bit 1 - properties + * Other - reserved + */ +MLXSW_ITEM32_INDEXED(cmd_mbox, config_profile, swid_config_mask, + 0x60, 24, 8, 0x08, 0x00, false); + +/* cmd_mbox_config_profile_swid_config_type + * Switch Partition type. + * 0000 - disabled (Switch Partition does not exist) + * 0001 - InfiniBand + * 0010 - Ethernet + * 1000 - router port (SwitchX-2 only) + * Other - reserved + */ +MLXSW_ITEM32_INDEXED(cmd_mbox, config_profile, swid_config_type, + 0x60, 20, 4, 0x08, 0x00, false); + +/* cmd_mbox_config_profile_swid_config_properties + * Switch Partition properties. + */ +MLXSW_ITEM32_INDEXED(cmd_mbox, config_profile, swid_config_properties, + 0x60, 0, 8, 0x08, 0x00, false); + +/* ACCESS_REG - Access EMAD Supported Register + * ---------------------------------- + * OpMod == 0 (N/A), INMmod == 0 (N/A) + * ------------------------------------- + * The ACCESS_REG command supports accessing device registers. This access + * is mainly used for bootstrapping. + */ + +static inline int mlxsw_cmd_access_reg(struct mlxsw_core *mlxsw_core, + char *in_mbox, char *out_mbox) +{ + return mlxsw_cmd_exec(mlxsw_core, MLXSW_CMD_OPCODE_ACCESS_REG, + 0, 0, false, in_mbox, MLXSW_CMD_MBOX_SIZE, + out_mbox, MLXSW_CMD_MBOX_SIZE); +} + +/* SW2HW_DQ - Software to Hardware DQ + * ---------------------------------- + * OpMod == 0 (send DQ) / OpMod == 1 (receive DQ) + * INMmod == DQ number + * ---------------------------------------------- + * The SW2HW_DQ command transitions a descriptor queue from software to + * hardware ownership. The command enables posting WQEs and ringing DoorBells + * on the descriptor queue. 
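+ *
+ * For illustration, the PCI bus driver typically zeroes a mailbox, programs
+ * the reporting CQ, log2_dq_sz and the page physical addresses through the
+ * cmd_mbox_sw2hw_dq_*_set() helpers below, and then calls
+ * mlxsw_cmd_sw2hw_sdq() or mlxsw_cmd_sw2hw_rdq() with the DQ number.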
+ */ + +static inline int __mlxsw_cmd_sw2hw_dq(struct mlxsw_core *mlxsw_core, + char *in_mbox, u32 dq_number, + u8 opcode_mod) +{ + return mlxsw_cmd_exec_in(mlxsw_core, MLXSW_CMD_OPCODE_SW2HW_DQ, + opcode_mod, dq_number, + in_mbox, MLXSW_CMD_MBOX_SIZE); +} + +enum { + MLXSW_CMD_OPCODE_MOD_SDQ = 0, + MLXSW_CMD_OPCODE_MOD_RDQ = 1, +}; + +static inline int mlxsw_cmd_sw2hw_sdq(struct mlxsw_core *mlxsw_core, + char *in_mbox, u32 dq_number) +{ + return __mlxsw_cmd_sw2hw_dq(mlxsw_core, in_mbox, dq_number, + MLXSW_CMD_OPCODE_MOD_SDQ); +} + +static inline int mlxsw_cmd_sw2hw_rdq(struct mlxsw_core *mlxsw_core, + char *in_mbox, u32 dq_number) +{ + return __mlxsw_cmd_sw2hw_dq(mlxsw_core, in_mbox, dq_number, + MLXSW_CMD_OPCODE_MOD_RDQ); +} + +/* cmd_mbox_sw2hw_dq_cq + * Number of the CQ that this Descriptor Queue reports completions to. + */ +MLXSW_ITEM32(cmd_mbox, sw2hw_dq, cq, 0x00, 24, 8); + +/* cmd_mbox_sw2hw_dq_sdq_tclass + * SDQ: CPU Egress TClass + * RDQ: Reserved + */ +MLXSW_ITEM32(cmd_mbox, sw2hw_dq, sdq_tclass, 0x00, 16, 6); + +/* cmd_mbox_sw2hw_dq_log2_dq_sz + * Log (base 2) of the Descriptor Queue size in 4KB pages. + */ +MLXSW_ITEM32(cmd_mbox, sw2hw_dq, log2_dq_sz, 0x00, 0, 6); + +/* cmd_mbox_sw2hw_dq_pa + * Physical Address. + */ +MLXSW_ITEM64_INDEXED(cmd_mbox, sw2hw_dq, pa, 0x10, 12, 52, 0x08, 0x00, true); + +/* HW2SW_DQ - Hardware to Software DQ + * ---------------------------------- + * OpMod == 0 (send DQ) / OpMod == 1 (receive DQ) + * INMmod == DQ number + * ---------------------------------------------- + * The HW2SW_DQ command transitions a descriptor queue from hardware to + * software ownership. Incoming packets on the DQ are silently discarded, + * SW should not post descriptors on nonoperational DQs. + */ + +static inline int __mlxsw_cmd_hw2sw_dq(struct mlxsw_core *mlxsw_core, + u32 dq_number, u8 opcode_mod) +{ + return mlxsw_cmd_exec_none(mlxsw_core, MLXSW_CMD_OPCODE_HW2SW_DQ, + opcode_mod, dq_number); +} + +static inline int mlxsw_cmd_hw2sw_sdq(struct mlxsw_core *mlxsw_core, + u32 dq_number) +{ + return __mlxsw_cmd_hw2sw_dq(mlxsw_core, dq_number, + MLXSW_CMD_OPCODE_MOD_SDQ); +} + +static inline int mlxsw_cmd_hw2sw_rdq(struct mlxsw_core *mlxsw_core, + u32 dq_number) +{ + return __mlxsw_cmd_hw2sw_dq(mlxsw_core, dq_number, + MLXSW_CMD_OPCODE_MOD_RDQ); +} + +/* 2ERR_DQ - To Error DQ + * --------------------- + * OpMod == 0 (send DQ) / OpMod == 1 (receive DQ) + * INMmod == DQ number + * ---------------------------------------------- + * The 2ERR_DQ command transitions the DQ into the error state from the state + * in which it has been. While the command is executed, some in-process + * descriptors may complete. Once the DQ transitions into the error state, + * if there are posted descriptors on the RDQ/SDQ, the hardware writes + * a completion with error (flushed) for all descriptors posted in the RDQ/SDQ. + * When the command is completed successfully, the DQ is already in + * the error state. 
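+ *
+ * For illustration, a caller tearing down a queue that may still have posted
+ * descriptors would typically move it to the error state with
+ * mlxsw_cmd_2err_sdq()/mlxsw_cmd_2err_rdq(), consume the resulting flushed
+ * completions, and only then reclaim the queue with HW2SW_DQ.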
+ */
+
+static inline int __mlxsw_cmd_2err_dq(struct mlxsw_core *mlxsw_core,
+                                      u32 dq_number, u8 opcode_mod)
+{
+        return mlxsw_cmd_exec_none(mlxsw_core, MLXSW_CMD_OPCODE_2ERR_DQ,
+                                   opcode_mod, dq_number);
+}
+
+static inline int mlxsw_cmd_2err_sdq(struct mlxsw_core *mlxsw_core,
+                                     u32 dq_number)
+{
+        return __mlxsw_cmd_2err_dq(mlxsw_core, dq_number,
+                                   MLXSW_CMD_OPCODE_MOD_SDQ);
+}
+
+static inline int mlxsw_cmd_2err_rdq(struct mlxsw_core *mlxsw_core,
+                                     u32 dq_number)
+{
+        return __mlxsw_cmd_2err_dq(mlxsw_core, dq_number,
+                                   MLXSW_CMD_OPCODE_MOD_RDQ);
+}
+
+/* QUERY_DQ - Query DQ
+ * ---------------------
+ * OpMod == 0 (send DQ) / OpMod == 1 (receive DQ)
+ * INMmod == DQ number
+ * ----------------------------------------------
+ * The QUERY_DQ command retrieves a snapshot of DQ parameters from the hardware.
+ *
+ * Note: Output mailbox has the same format as SW2HW_DQ.
+ */
+
+static inline int __mlxsw_cmd_query_dq(struct mlxsw_core *mlxsw_core,
+                                       char *out_mbox, u32 dq_number,
+                                       u8 opcode_mod)
+{
+        return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_DQ,
+                                  opcode_mod, dq_number, false,
+                                  out_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+static inline int mlxsw_cmd_query_sdq(struct mlxsw_core *mlxsw_core,
+                                      char *out_mbox, u32 dq_number)
+{
+        return __mlxsw_cmd_query_dq(mlxsw_core, out_mbox, dq_number,
+                                    MLXSW_CMD_OPCODE_MOD_SDQ);
+}
+
+static inline int mlxsw_cmd_query_rdq(struct mlxsw_core *mlxsw_core,
+                                      char *out_mbox, u32 dq_number)
+{
+        return __mlxsw_cmd_query_dq(mlxsw_core, out_mbox, dq_number,
+                                    MLXSW_CMD_OPCODE_MOD_RDQ);
+}
+
+/* SW2HW_CQ - Software to Hardware CQ
+ * ----------------------------------
+ * OpMod == 0 (N/A), INMmod == CQ number
+ * -------------------------------------
+ * The SW2HW_CQ command transfers ownership of a CQ context entry from software
+ * to hardware. The command takes the CQ context entry from the input mailbox
+ * and stores it in the CQC in the ownership of the hardware. The command fails
+ * if the requested CQC entry is already in the ownership of the hardware.
+ */
+
+static inline int mlxsw_cmd_sw2hw_cq(struct mlxsw_core *mlxsw_core,
+                                     char *in_mbox, u32 cq_number)
+{
+        return mlxsw_cmd_exec_in(mlxsw_core, MLXSW_CMD_OPCODE_SW2HW_CQ,
+                                 0, cq_number, in_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* cmd_mbox_sw2hw_cq_cv
+ * CQE Version.
+ * 0 - CQE Version 0, 1 - CQE Version 1
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, cv, 0x00, 28, 4);
+
+/* cmd_mbox_sw2hw_cq_c_eqn
+ * Event Queue this CQ reports completion events to.
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, c_eqn, 0x00, 24, 1);
+
+/* cmd_mbox_sw2hw_cq_st
+ * Event delivery state machine
+ * 0x0 - FIRED
+ * 0x1 - ARMED (Request for Notification)
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, st, 0x00, 8, 1);
+
+/* cmd_mbox_sw2hw_cq_log_cq_size
+ * Log (base 2) of the CQ size (in entries).
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, log_cq_size, 0x00, 0, 4);
+
+/* cmd_mbox_sw2hw_cq_producer_counter
+ * Producer Counter. The counter is incremented for each CQE that is
+ * written by the HW to the CQ.
+ * Maintained by HW (valid for the QUERY_CQ command only)
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, producer_counter, 0x04, 0, 16);
+
+/* cmd_mbox_sw2hw_cq_pa
+ * Physical Address.
+ */
+MLXSW_ITEM64_INDEXED(cmd_mbox, sw2hw_cq, pa, 0x10, 11, 53, 0x08, 0x00, true);
+
+/* HW2SW_CQ - Hardware to Software CQ
+ * ----------------------------------
+ * OpMod == 0 (N/A), INMmod == CQ number
+ * -------------------------------------
+ * The HW2SW_CQ command transfers ownership of a CQ context entry from hardware
+ * to software.
The CQC entry is invalidated as a result of this command. + */ + +static inline int mlxsw_cmd_hw2sw_cq(struct mlxsw_core *mlxsw_core, + u32 cq_number) +{ + return mlxsw_cmd_exec_none(mlxsw_core, MLXSW_CMD_OPCODE_HW2SW_CQ, + 0, cq_number); +} + +/* QUERY_CQ - Query CQ + * ---------------------------------- + * OpMod == 0 (N/A), INMmod == CQ number + * ------------------------------------- + * The QUERY_CQ command retrieves a snapshot of the current CQ context entry. + * The command stores the snapshot in the output mailbox in the software format. + * Note that the CQ context state and values are not affected by the QUERY_CQ + * command. The QUERY_CQ command is for debug purposes only. + * + * Note: Output mailbox has the same format as SW2HW_CQ. + */ + +static inline int mlxsw_cmd_query_cq(struct mlxsw_core *mlxsw_core, + char *out_mbox, u32 cq_number) +{ + return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_CQ, + 0, cq_number, false, + out_mbox, MLXSW_CMD_MBOX_SIZE); +} + +/* SW2HW_EQ - Software to Hardware EQ + * ---------------------------------- + * OpMod == 0 (N/A), INMmod == EQ number + * ------------------------------------- + * The SW2HW_EQ command transfers ownership of an EQ context entry from software + * to hardware. The command takes the EQ context entry from the input mailbox + * and stores it in the EQC in the ownership of the hardware. The command fails + * if the requested EQC entry is already in the ownership of the hardware. + */ + +static inline int mlxsw_cmd_sw2hw_eq(struct mlxsw_core *mlxsw_core, + char *in_mbox, u32 eq_number) +{ + return mlxsw_cmd_exec_in(mlxsw_core, MLXSW_CMD_OPCODE_SW2HW_EQ, + 0, eq_number, in_mbox, MLXSW_CMD_MBOX_SIZE); +} + +/* cmd_mbox_sw2hw_eq_int_msix + * When set, MSI-X cycles will be generated by this EQ. + * When cleared, an interrupt will be generated by this EQ. + */ +MLXSW_ITEM32(cmd_mbox, sw2hw_eq, int_msix, 0x00, 24, 1); + +/* cmd_mbox_sw2hw_eq_st + * Event delivery state machine + * 0x0 - FIRED + * 0x1 - ARMED (Request for Notification) + * 0x11 - Always ARMED + * other - reserved + */ +MLXSW_ITEM32(cmd_mbox, sw2hw_eq, st, 0x00, 8, 2); + +/* cmd_mbox_sw2hw_eq_log_eq_size + * Log (base 2) of the EQ size (in entries). + */ +MLXSW_ITEM32(cmd_mbox, sw2hw_eq, log_eq_size, 0x00, 0, 4); + +/* cmd_mbox_sw2hw_eq_producer_counter + * Producer Counter. The counter is incremented for each EQE that is written + * by the HW to the EQ. + * Maintained by HW (valid for the QUERY_EQ command only) + */ +MLXSW_ITEM32(cmd_mbox, sw2hw_eq, producer_counter, 0x04, 0, 16); + +/* cmd_mbox_sw2hw_eq_pa + * Physical Address. + */ +MLXSW_ITEM64_INDEXED(cmd_mbox, sw2hw_eq, pa, 0x10, 11, 53, 0x08, 0x00, true); + +/* HW2SW_EQ - Hardware to Software EQ + * ---------------------------------- + * OpMod == 0 (N/A), INMmod == EQ number + * ------------------------------------- + */ + +static inline int mlxsw_cmd_hw2sw_eq(struct mlxsw_core *mlxsw_core, + u32 eq_number) +{ + return mlxsw_cmd_exec_none(mlxsw_core, MLXSW_CMD_OPCODE_HW2SW_EQ, + 0, eq_number); +} + +/* QUERY_EQ - Query EQ + * ---------------------------------- + * OpMod == 0 (N/A), INMmod == EQ number + * ------------------------------------- + * + * Note: Output mailbox has the same format as SW2HW_EQ. 
+ */ + +static inline int mlxsw_cmd_query_eq(struct mlxsw_core *mlxsw_core, + char *out_mbox, u32 eq_number) +{ + return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_EQ, + 0, eq_number, false, + out_mbox, MLXSW_CMD_MBOX_SIZE); +} + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c new file mode 100644 index 0000000..e13ac3b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -0,0 +1,1890 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/core.c + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015 Jiri Pirko + * Copyright (c) 2015 Ido Schimmel + * Copyright (c) 2015 Elad Raz + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core.h" +#include "item.h" +#include "cmd.h" +#include "port.h" +#include "trap.h" +#include "emad.h" +#include "reg.h" +#include "resources.h" + +static LIST_HEAD(mlxsw_core_driver_list); +static DEFINE_SPINLOCK(mlxsw_core_driver_list_lock); + +static const char mlxsw_core_driver_name[] = "mlxsw_core"; + +static struct workqueue_struct *mlxsw_wq; +static struct workqueue_struct *mlxsw_owq; + +struct mlxsw_core_port { + struct devlink_port devlink_port; + void *port_driver_priv; + u8 local_port; +}; + +void *mlxsw_core_port_driver_priv(struct mlxsw_core_port *mlxsw_core_port) +{ + return mlxsw_core_port->port_driver_priv; +} +EXPORT_SYMBOL(mlxsw_core_port_driver_priv); + +static bool mlxsw_core_port_check(struct mlxsw_core_port *mlxsw_core_port) +{ + return mlxsw_core_port->port_driver_priv != NULL; +} + +struct mlxsw_core { + struct mlxsw_driver *driver; + const struct mlxsw_bus *bus; + void *bus_priv; + const struct mlxsw_bus_info *bus_info; + struct workqueue_struct *emad_wq; + struct list_head rx_listener_list; + struct list_head event_listener_list; + struct { + atomic64_t tid; + struct list_head trans_list; + spinlock_t trans_list_lock; /* protects trans_list writes */ + bool use_emad; + } emad; + struct { + u8 *mapping; /* lag_id+port_index to local_port mapping */ + } lag; + struct mlxsw_res res; + struct mlxsw_hwmon *hwmon; + struct mlxsw_thermal *thermal; + struct mlxsw_core_port *ports; + unsigned int max_ports; + bool reload_fail; + unsigned long driver_priv[0]; + /* driver_priv has to be always the last item */ +}; + +#define MLXSW_PORT_MAX_PORTS_DEFAULT 0x40 + +static int mlxsw_ports_init(struct mlxsw_core *mlxsw_core) +{ + /* Switch ports are numbered from 1 to queried value */ + if (MLXSW_CORE_RES_VALID(mlxsw_core, MAX_SYSTEM_PORT)) + mlxsw_core->max_ports = MLXSW_CORE_RES_GET(mlxsw_core, + MAX_SYSTEM_PORT) + 1; + else + mlxsw_core->max_ports = MLXSW_PORT_MAX_PORTS_DEFAULT + 1; + + mlxsw_core->ports = kcalloc(mlxsw_core->max_ports, + sizeof(struct mlxsw_core_port), GFP_KERNEL); + if (!mlxsw_core->ports) + return -ENOMEM; + + return 0; +} + +static void mlxsw_ports_fini(struct mlxsw_core *mlxsw_core) +{ + kfree(mlxsw_core->ports); +} + +unsigned int mlxsw_core_max_ports(const struct mlxsw_core *mlxsw_core) +{ + return mlxsw_core->max_ports; +} +EXPORT_SYMBOL(mlxsw_core_max_ports); + +void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core) +{ + return mlxsw_core->driver_priv; +} +EXPORT_SYMBOL(mlxsw_core_driver_priv); + +struct mlxsw_rx_listener_item { + struct list_head list; + struct mlxsw_rx_listener rxl; + void *priv; +}; + +struct mlxsw_event_listener_item { + struct list_head list; + struct mlxsw_event_listener el; + void *priv; +}; + +/****************** + * EMAD processing + ******************/ + +/* emad_eth_hdr_dmac + * Destination MAC in EMAD's Ethernet header. + * Must be set to 01:02:c9:00:00:01 + */ +MLXSW_ITEM_BUF(emad, eth_hdr, dmac, 0x00, 6); + +/* emad_eth_hdr_smac + * Source MAC in EMAD's Ethernet header. + * Must be set to 00:02:c9:01:02:03 + */ +MLXSW_ITEM_BUF(emad, eth_hdr, smac, 0x06, 6); + +/* emad_eth_hdr_ethertype + * Ethertype in EMAD's Ethernet header. + * Must be set to 0x8932 + */ +MLXSW_ITEM32(emad, eth_hdr, ethertype, 0x0C, 16, 16); + +/* emad_eth_hdr_mlx_proto + * Mellanox protocol. 
+ * Must be set to 0x0. + */ +MLXSW_ITEM32(emad, eth_hdr, mlx_proto, 0x0C, 8, 8); + +/* emad_eth_hdr_ver + * Mellanox protocol version. + * Must be set to 0x0. + */ +MLXSW_ITEM32(emad, eth_hdr, ver, 0x0C, 4, 4); + +/* emad_op_tlv_type + * Type of the TLV. + * Must be set to 0x1 (operation TLV). + */ +MLXSW_ITEM32(emad, op_tlv, type, 0x00, 27, 5); + +/* emad_op_tlv_len + * Length of the operation TLV in u32. + * Must be set to 0x4. + */ +MLXSW_ITEM32(emad, op_tlv, len, 0x00, 16, 11); + +/* emad_op_tlv_dr + * Direct route bit. Setting to 1 indicates the EMAD is a direct route + * EMAD. DR TLV must follow. + * + * Note: Currently not supported and must not be set. + */ +MLXSW_ITEM32(emad, op_tlv, dr, 0x00, 15, 1); + +/* emad_op_tlv_status + * Returned status in case of EMAD response. Must be set to 0 in case + * of EMAD request. + * 0x0 - success + * 0x1 - device is busy. Requester should retry + * 0x2 - Mellanox protocol version not supported + * 0x3 - unknown TLV + * 0x4 - register not supported + * 0x5 - operation class not supported + * 0x6 - EMAD method not supported + * 0x7 - bad parameter (e.g. port out of range) + * 0x8 - resource not available + * 0x9 - message receipt acknowledgment. Requester should retry + * 0x70 - internal error + */ +MLXSW_ITEM32(emad, op_tlv, status, 0x00, 8, 7); + +/* emad_op_tlv_register_id + * Register ID of register within register TLV. + */ +MLXSW_ITEM32(emad, op_tlv, register_id, 0x04, 16, 16); + +/* emad_op_tlv_r + * Response bit. Setting to 1 indicates Response, otherwise request. + */ +MLXSW_ITEM32(emad, op_tlv, r, 0x04, 15, 1); + +/* emad_op_tlv_method + * EMAD method type. + * 0x1 - query + * 0x2 - write + * 0x3 - send (currently not supported) + * 0x4 - event + */ +MLXSW_ITEM32(emad, op_tlv, method, 0x04, 8, 7); + +/* emad_op_tlv_class + * EMAD operation class. Must be set to 0x1 (REG_ACCESS). + */ +MLXSW_ITEM32(emad, op_tlv, class, 0x04, 0, 8); + +/* emad_op_tlv_tid + * EMAD transaction ID. Used for pairing request and response EMADs. + */ +MLXSW_ITEM64(emad, op_tlv, tid, 0x08, 0, 64); + +/* emad_reg_tlv_type + * Type of the TLV. + * Must be set to 0x3 (register TLV). + */ +MLXSW_ITEM32(emad, reg_tlv, type, 0x00, 27, 5); + +/* emad_reg_tlv_len + * Length of the operation TLV in u32. + */ +MLXSW_ITEM32(emad, reg_tlv, len, 0x00, 16, 11); + +/* emad_end_tlv_type + * Type of the TLV. + * Must be set to 0x0 (end TLV). + */ +MLXSW_ITEM32(emad, end_tlv, type, 0x00, 27, 5); + +/* emad_end_tlv_len + * Length of the end TLV in u32. + * Must be set to 1. 
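+ *
+ * Taken together, a request EMAD as built by mlxsw_emad_construct() below is
+ * laid out as: Ethernet header, operation TLV, register TLV carrying the
+ * register payload, and a terminating end TLV.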
+ */ +MLXSW_ITEM32(emad, end_tlv, len, 0x00, 16, 11); + +enum mlxsw_core_reg_access_type { + MLXSW_CORE_REG_ACCESS_TYPE_QUERY, + MLXSW_CORE_REG_ACCESS_TYPE_WRITE, +}; + +static inline const char * +mlxsw_core_reg_access_type_str(enum mlxsw_core_reg_access_type type) +{ + switch (type) { + case MLXSW_CORE_REG_ACCESS_TYPE_QUERY: + return "query"; + case MLXSW_CORE_REG_ACCESS_TYPE_WRITE: + return "write"; + } + BUG(); +} + +static void mlxsw_emad_pack_end_tlv(char *end_tlv) +{ + mlxsw_emad_end_tlv_type_set(end_tlv, MLXSW_EMAD_TLV_TYPE_END); + mlxsw_emad_end_tlv_len_set(end_tlv, MLXSW_EMAD_END_TLV_LEN); +} + +static void mlxsw_emad_pack_reg_tlv(char *reg_tlv, + const struct mlxsw_reg_info *reg, + char *payload) +{ + mlxsw_emad_reg_tlv_type_set(reg_tlv, MLXSW_EMAD_TLV_TYPE_REG); + mlxsw_emad_reg_tlv_len_set(reg_tlv, reg->len / sizeof(u32) + 1); + memcpy(reg_tlv + sizeof(u32), payload, reg->len); +} + +static void mlxsw_emad_pack_op_tlv(char *op_tlv, + const struct mlxsw_reg_info *reg, + enum mlxsw_core_reg_access_type type, + u64 tid) +{ + mlxsw_emad_op_tlv_type_set(op_tlv, MLXSW_EMAD_TLV_TYPE_OP); + mlxsw_emad_op_tlv_len_set(op_tlv, MLXSW_EMAD_OP_TLV_LEN); + mlxsw_emad_op_tlv_dr_set(op_tlv, 0); + mlxsw_emad_op_tlv_status_set(op_tlv, 0); + mlxsw_emad_op_tlv_register_id_set(op_tlv, reg->id); + mlxsw_emad_op_tlv_r_set(op_tlv, MLXSW_EMAD_OP_TLV_REQUEST); + if (type == MLXSW_CORE_REG_ACCESS_TYPE_QUERY) + mlxsw_emad_op_tlv_method_set(op_tlv, + MLXSW_EMAD_OP_TLV_METHOD_QUERY); + else + mlxsw_emad_op_tlv_method_set(op_tlv, + MLXSW_EMAD_OP_TLV_METHOD_WRITE); + mlxsw_emad_op_tlv_class_set(op_tlv, + MLXSW_EMAD_OP_TLV_CLASS_REG_ACCESS); + mlxsw_emad_op_tlv_tid_set(op_tlv, tid); +} + +static int mlxsw_emad_construct_eth_hdr(struct sk_buff *skb) +{ + char *eth_hdr = skb_push(skb, MLXSW_EMAD_ETH_HDR_LEN); + + mlxsw_emad_eth_hdr_dmac_memcpy_to(eth_hdr, MLXSW_EMAD_EH_DMAC); + mlxsw_emad_eth_hdr_smac_memcpy_to(eth_hdr, MLXSW_EMAD_EH_SMAC); + mlxsw_emad_eth_hdr_ethertype_set(eth_hdr, MLXSW_EMAD_EH_ETHERTYPE); + mlxsw_emad_eth_hdr_mlx_proto_set(eth_hdr, MLXSW_EMAD_EH_MLX_PROTO); + mlxsw_emad_eth_hdr_ver_set(eth_hdr, MLXSW_EMAD_EH_PROTO_VERSION); + + skb_reset_mac_header(skb); + + return 0; +} + +static void mlxsw_emad_construct(struct sk_buff *skb, + const struct mlxsw_reg_info *reg, + char *payload, + enum mlxsw_core_reg_access_type type, + u64 tid) +{ + char *buf; + + buf = skb_push(skb, MLXSW_EMAD_END_TLV_LEN * sizeof(u32)); + mlxsw_emad_pack_end_tlv(buf); + + buf = skb_push(skb, reg->len + sizeof(u32)); + mlxsw_emad_pack_reg_tlv(buf, reg, payload); + + buf = skb_push(skb, MLXSW_EMAD_OP_TLV_LEN * sizeof(u32)); + mlxsw_emad_pack_op_tlv(buf, reg, type, tid); + + mlxsw_emad_construct_eth_hdr(skb); +} + +static char *mlxsw_emad_op_tlv(const struct sk_buff *skb) +{ + return ((char *) (skb->data + MLXSW_EMAD_ETH_HDR_LEN)); +} + +static char *mlxsw_emad_reg_tlv(const struct sk_buff *skb) +{ + return ((char *) (skb->data + MLXSW_EMAD_ETH_HDR_LEN + + MLXSW_EMAD_OP_TLV_LEN * sizeof(u32))); +} + +static char *mlxsw_emad_reg_payload(const char *op_tlv) +{ + return ((char *) (op_tlv + (MLXSW_EMAD_OP_TLV_LEN + 1) * sizeof(u32))); +} + +static u64 mlxsw_emad_get_tid(const struct sk_buff *skb) +{ + char *op_tlv; + + op_tlv = mlxsw_emad_op_tlv(skb); + return mlxsw_emad_op_tlv_tid_get(op_tlv); +} + +static bool mlxsw_emad_is_resp(const struct sk_buff *skb) +{ + char *op_tlv; + + op_tlv = mlxsw_emad_op_tlv(skb); + return (mlxsw_emad_op_tlv_r_get(op_tlv) == MLXSW_EMAD_OP_TLV_RESPONSE); +} + +static int 
mlxsw_emad_process_status(char *op_tlv, + enum mlxsw_emad_op_tlv_status *p_status) +{ + *p_status = mlxsw_emad_op_tlv_status_get(op_tlv); + + switch (*p_status) { + case MLXSW_EMAD_OP_TLV_STATUS_SUCCESS: + return 0; + case MLXSW_EMAD_OP_TLV_STATUS_BUSY: + case MLXSW_EMAD_OP_TLV_STATUS_MESSAGE_RECEIPT_ACK: + return -EAGAIN; + case MLXSW_EMAD_OP_TLV_STATUS_VERSION_NOT_SUPPORTED: + case MLXSW_EMAD_OP_TLV_STATUS_UNKNOWN_TLV: + case MLXSW_EMAD_OP_TLV_STATUS_REGISTER_NOT_SUPPORTED: + case MLXSW_EMAD_OP_TLV_STATUS_CLASS_NOT_SUPPORTED: + case MLXSW_EMAD_OP_TLV_STATUS_METHOD_NOT_SUPPORTED: + case MLXSW_EMAD_OP_TLV_STATUS_BAD_PARAMETER: + case MLXSW_EMAD_OP_TLV_STATUS_RESOURCE_NOT_AVAILABLE: + case MLXSW_EMAD_OP_TLV_STATUS_INTERNAL_ERROR: + default: + return -EIO; + } +} + +static int +mlxsw_emad_process_status_skb(struct sk_buff *skb, + enum mlxsw_emad_op_tlv_status *p_status) +{ + return mlxsw_emad_process_status(mlxsw_emad_op_tlv(skb), p_status); +} + +struct mlxsw_reg_trans { + struct list_head list; + struct list_head bulk_list; + struct mlxsw_core *core; + struct sk_buff *tx_skb; + struct mlxsw_tx_info tx_info; + struct delayed_work timeout_dw; + unsigned int retries; + u64 tid; + struct completion completion; + atomic_t active; + mlxsw_reg_trans_cb_t *cb; + unsigned long cb_priv; + const struct mlxsw_reg_info *reg; + enum mlxsw_core_reg_access_type type; + int err; + enum mlxsw_emad_op_tlv_status emad_status; + struct rcu_head rcu; +}; + +#define MLXSW_EMAD_TIMEOUT_MS 200 + +static void mlxsw_emad_trans_timeout_schedule(struct mlxsw_reg_trans *trans) +{ + unsigned long timeout = msecs_to_jiffies(MLXSW_EMAD_TIMEOUT_MS); + + queue_delayed_work(trans->core->emad_wq, &trans->timeout_dw, timeout); +} + +static int mlxsw_emad_transmit(struct mlxsw_core *mlxsw_core, + struct mlxsw_reg_trans *trans) +{ + struct sk_buff *skb; + int err; + + skb = skb_copy(trans->tx_skb, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + trace_devlink_hwmsg(priv_to_devlink(mlxsw_core), false, 0, + skb->data + mlxsw_core->driver->txhdr_len, + skb->len - mlxsw_core->driver->txhdr_len); + + atomic_set(&trans->active, 1); + err = mlxsw_core_skb_transmit(mlxsw_core, skb, &trans->tx_info); + if (err) { + dev_kfree_skb(skb); + return err; + } + mlxsw_emad_trans_timeout_schedule(trans); + return 0; +} + +static void mlxsw_emad_trans_finish(struct mlxsw_reg_trans *trans, int err) +{ + struct mlxsw_core *mlxsw_core = trans->core; + + dev_kfree_skb(trans->tx_skb); + spin_lock_bh(&mlxsw_core->emad.trans_list_lock); + list_del_rcu(&trans->list); + spin_unlock_bh(&mlxsw_core->emad.trans_list_lock); + trans->err = err; + complete(&trans->completion); +} + +static void mlxsw_emad_transmit_retry(struct mlxsw_core *mlxsw_core, + struct mlxsw_reg_trans *trans) +{ + int err; + + if (trans->retries < MLXSW_EMAD_MAX_RETRY) { + trans->retries++; + err = mlxsw_emad_transmit(trans->core, trans); + if (err == 0) + return; + } else { + err = -EIO; + } + mlxsw_emad_trans_finish(trans, err); +} + +static void mlxsw_emad_trans_timeout_work(struct work_struct *work) +{ + struct mlxsw_reg_trans *trans = container_of(work, + struct mlxsw_reg_trans, + timeout_dw.work); + + if (!atomic_dec_and_test(&trans->active)) + return; + + mlxsw_emad_transmit_retry(trans->core, trans); +} + +static void mlxsw_emad_process_response(struct mlxsw_core *mlxsw_core, + struct mlxsw_reg_trans *trans, + struct sk_buff *skb) +{ + int err; + + if (!atomic_dec_and_test(&trans->active)) + return; + + err = mlxsw_emad_process_status_skb(skb, &trans->emad_status); + if (err == 
-EAGAIN) { + mlxsw_emad_transmit_retry(mlxsw_core, trans); + } else { + if (err == 0) { + char *op_tlv = mlxsw_emad_op_tlv(skb); + + if (trans->cb) + trans->cb(mlxsw_core, + mlxsw_emad_reg_payload(op_tlv), + trans->reg->len, trans->cb_priv); + } + mlxsw_emad_trans_finish(trans, err); + } +} + +/* called with rcu read lock held */ +static void mlxsw_emad_rx_listener_func(struct sk_buff *skb, u8 local_port, + void *priv) +{ + struct mlxsw_core *mlxsw_core = priv; + struct mlxsw_reg_trans *trans; + + trace_devlink_hwmsg(priv_to_devlink(mlxsw_core), true, 0, + skb->data, skb->len); + + if (!mlxsw_emad_is_resp(skb)) + goto free_skb; + + list_for_each_entry_rcu(trans, &mlxsw_core->emad.trans_list, list) { + if (mlxsw_emad_get_tid(skb) == trans->tid) { + mlxsw_emad_process_response(mlxsw_core, trans, skb); + break; + } + } + +free_skb: + dev_kfree_skb(skb); +} + +static const struct mlxsw_listener mlxsw_emad_rx_listener = + MLXSW_RXL(mlxsw_emad_rx_listener_func, ETHEMAD, TRAP_TO_CPU, false, + EMAD, DISCARD); + +static int mlxsw_emad_init(struct mlxsw_core *mlxsw_core) +{ + struct workqueue_struct *emad_wq; + u64 tid; + int err; + + if (!(mlxsw_core->bus->features & MLXSW_BUS_F_TXRX)) + return 0; + + emad_wq = alloc_workqueue("mlxsw_core_emad", WQ_MEM_RECLAIM, 0); + if (!emad_wq) + return -ENOMEM; + mlxsw_core->emad_wq = emad_wq; + + /* Set the upper 32 bits of the transaction ID field to a random + * number. This allows us to discard EMADs addressed to other + * devices. + */ + get_random_bytes(&tid, 4); + tid <<= 32; + atomic64_set(&mlxsw_core->emad.tid, tid); + + INIT_LIST_HEAD(&mlxsw_core->emad.trans_list); + spin_lock_init(&mlxsw_core->emad.trans_list_lock); + + err = mlxsw_core_trap_register(mlxsw_core, &mlxsw_emad_rx_listener, + mlxsw_core); + if (err) + return err; + + err = mlxsw_core->driver->basic_trap_groups_set(mlxsw_core); + if (err) + goto err_emad_trap_set; + mlxsw_core->emad.use_emad = true; + + return 0; + +err_emad_trap_set: + mlxsw_core_trap_unregister(mlxsw_core, &mlxsw_emad_rx_listener, + mlxsw_core); + destroy_workqueue(mlxsw_core->emad_wq); + return err; +} + +static void mlxsw_emad_fini(struct mlxsw_core *mlxsw_core) +{ + + if (!(mlxsw_core->bus->features & MLXSW_BUS_F_TXRX)) + return; + + mlxsw_core->emad.use_emad = false; + mlxsw_core_trap_unregister(mlxsw_core, &mlxsw_emad_rx_listener, + mlxsw_core); + destroy_workqueue(mlxsw_core->emad_wq); +} + +static struct sk_buff *mlxsw_emad_alloc(const struct mlxsw_core *mlxsw_core, + u16 reg_len) +{ + struct sk_buff *skb; + u16 emad_len; + + emad_len = (reg_len + sizeof(u32) + MLXSW_EMAD_ETH_HDR_LEN + + (MLXSW_EMAD_OP_TLV_LEN + MLXSW_EMAD_END_TLV_LEN) * + sizeof(u32) + mlxsw_core->driver->txhdr_len); + if (emad_len > MLXSW_EMAD_MAX_FRAME_LEN) + return NULL; + + skb = netdev_alloc_skb(NULL, emad_len); + if (!skb) + return NULL; + memset(skb->data, 0, emad_len); + skb_reserve(skb, emad_len); + + return skb; +} + +static int mlxsw_emad_reg_access(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, + char *payload, + enum mlxsw_core_reg_access_type type, + struct mlxsw_reg_trans *trans, + struct list_head *bulk_list, + mlxsw_reg_trans_cb_t *cb, + unsigned long cb_priv, u64 tid) +{ + struct sk_buff *skb; + int err; + + dev_dbg(mlxsw_core->bus_info->dev, "EMAD reg access (tid=%llx,reg_id=%x(%s),type=%s)\n", + tid, reg->id, mlxsw_reg_id_str(reg->id), + mlxsw_core_reg_access_type_str(type)); + + skb = mlxsw_emad_alloc(mlxsw_core, reg->len); + if (!skb) + return -ENOMEM; + + list_add_tail(&trans->bulk_list, bulk_list); + 
trans->core = mlxsw_core; + trans->tx_skb = skb; + trans->tx_info.local_port = MLXSW_PORT_CPU_PORT; + trans->tx_info.is_emad = true; + INIT_DELAYED_WORK(&trans->timeout_dw, mlxsw_emad_trans_timeout_work); + trans->tid = tid; + init_completion(&trans->completion); + trans->cb = cb; + trans->cb_priv = cb_priv; + trans->reg = reg; + trans->type = type; + + mlxsw_emad_construct(skb, reg, payload, type, trans->tid); + mlxsw_core->driver->txhdr_construct(skb, &trans->tx_info); + + spin_lock_bh(&mlxsw_core->emad.trans_list_lock); + list_add_tail_rcu(&trans->list, &mlxsw_core->emad.trans_list); + spin_unlock_bh(&mlxsw_core->emad.trans_list_lock); + err = mlxsw_emad_transmit(mlxsw_core, trans); + if (err) + goto err_out; + return 0; + +err_out: + spin_lock_bh(&mlxsw_core->emad.trans_list_lock); + list_del_rcu(&trans->list); + spin_unlock_bh(&mlxsw_core->emad.trans_list_lock); + list_del(&trans->bulk_list); + dev_kfree_skb(trans->tx_skb); + return err; +} + +/***************** + * Core functions + *****************/ + +int mlxsw_core_driver_register(struct mlxsw_driver *mlxsw_driver) +{ + spin_lock(&mlxsw_core_driver_list_lock); + list_add_tail(&mlxsw_driver->list, &mlxsw_core_driver_list); + spin_unlock(&mlxsw_core_driver_list_lock); + return 0; +} +EXPORT_SYMBOL(mlxsw_core_driver_register); + +void mlxsw_core_driver_unregister(struct mlxsw_driver *mlxsw_driver) +{ + spin_lock(&mlxsw_core_driver_list_lock); + list_del(&mlxsw_driver->list); + spin_unlock(&mlxsw_core_driver_list_lock); +} +EXPORT_SYMBOL(mlxsw_core_driver_unregister); + +static struct mlxsw_driver *__driver_find(const char *kind) +{ + struct mlxsw_driver *mlxsw_driver; + + list_for_each_entry(mlxsw_driver, &mlxsw_core_driver_list, list) { + if (strcmp(mlxsw_driver->kind, kind) == 0) + return mlxsw_driver; + } + return NULL; +} + +static struct mlxsw_driver *mlxsw_core_driver_get(const char *kind) +{ + struct mlxsw_driver *mlxsw_driver; + + spin_lock(&mlxsw_core_driver_list_lock); + mlxsw_driver = __driver_find(kind); + spin_unlock(&mlxsw_core_driver_list_lock); + return mlxsw_driver; +} + +static void mlxsw_core_driver_put(const char *kind) +{ + struct mlxsw_driver *mlxsw_driver; + + spin_lock(&mlxsw_core_driver_list_lock); + mlxsw_driver = __driver_find(kind); + spin_unlock(&mlxsw_core_driver_list_lock); +} + +static int mlxsw_devlink_port_split(struct devlink *devlink, + unsigned int port_index, + unsigned int count) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + + if (port_index >= mlxsw_core->max_ports) + return -EINVAL; + if (!mlxsw_core->driver->port_split) + return -EOPNOTSUPP; + return mlxsw_core->driver->port_split(mlxsw_core, port_index, count); +} + +static int mlxsw_devlink_port_unsplit(struct devlink *devlink, + unsigned int port_index) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + + if (port_index >= mlxsw_core->max_ports) + return -EINVAL; + if (!mlxsw_core->driver->port_unsplit) + return -EOPNOTSUPP; + return mlxsw_core->driver->port_unsplit(mlxsw_core, port_index); +} + +static int +mlxsw_devlink_sb_pool_get(struct devlink *devlink, + unsigned int sb_index, u16 pool_index, + struct devlink_sb_pool_info *pool_info) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + + if (!mlxsw_driver->sb_pool_get) + return -EOPNOTSUPP; + return mlxsw_driver->sb_pool_get(mlxsw_core, sb_index, + pool_index, pool_info); +} + +static int +mlxsw_devlink_sb_pool_set(struct devlink *devlink, + unsigned int sb_index, u16 pool_index, u32 size, + 
enum devlink_sb_threshold_type threshold_type) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + + if (!mlxsw_driver->sb_pool_set) + return -EOPNOTSUPP; + return mlxsw_driver->sb_pool_set(mlxsw_core, sb_index, + pool_index, size, threshold_type); +} + +static void *__dl_port(struct devlink_port *devlink_port) +{ + return container_of(devlink_port, struct mlxsw_core_port, devlink_port); +} + +static int mlxsw_devlink_port_type_set(struct devlink_port *devlink_port, + enum devlink_port_type port_type) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port); + + if (!mlxsw_driver->port_type_set) + return -EOPNOTSUPP; + + return mlxsw_driver->port_type_set(mlxsw_core, + mlxsw_core_port->local_port, + port_type); +} + +static int mlxsw_devlink_sb_port_pool_get(struct devlink_port *devlink_port, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port); + + if (!mlxsw_driver->sb_port_pool_get || + !mlxsw_core_port_check(mlxsw_core_port)) + return -EOPNOTSUPP; + return mlxsw_driver->sb_port_pool_get(mlxsw_core_port, sb_index, + pool_index, p_threshold); +} + +static int mlxsw_devlink_sb_port_pool_set(struct devlink_port *devlink_port, + unsigned int sb_index, u16 pool_index, + u32 threshold) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port); + + if (!mlxsw_driver->sb_port_pool_set || + !mlxsw_core_port_check(mlxsw_core_port)) + return -EOPNOTSUPP; + return mlxsw_driver->sb_port_pool_set(mlxsw_core_port, sb_index, + pool_index, threshold); +} + +static int +mlxsw_devlink_sb_tc_pool_bind_get(struct devlink_port *devlink_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port); + + if (!mlxsw_driver->sb_tc_pool_bind_get || + !mlxsw_core_port_check(mlxsw_core_port)) + return -EOPNOTSUPP; + return mlxsw_driver->sb_tc_pool_bind_get(mlxsw_core_port, sb_index, + tc_index, pool_type, + p_pool_index, p_threshold); +} + +static int +mlxsw_devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port); + + if (!mlxsw_driver->sb_tc_pool_bind_set || + !mlxsw_core_port_check(mlxsw_core_port)) + return -EOPNOTSUPP; + return mlxsw_driver->sb_tc_pool_bind_set(mlxsw_core_port, sb_index, + tc_index, pool_type, + pool_index, threshold); +} + +static int mlxsw_devlink_sb_occ_snapshot(struct devlink *devlink, + unsigned int sb_index) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + + if (!mlxsw_driver->sb_occ_snapshot) + return 
-EOPNOTSUPP; + return mlxsw_driver->sb_occ_snapshot(mlxsw_core, sb_index); +} + +static int mlxsw_devlink_sb_occ_max_clear(struct devlink *devlink, + unsigned int sb_index) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + + if (!mlxsw_driver->sb_occ_max_clear) + return -EOPNOTSUPP; + return mlxsw_driver->sb_occ_max_clear(mlxsw_core, sb_index); +} + +static int +mlxsw_devlink_sb_occ_port_pool_get(struct devlink_port *devlink_port, + unsigned int sb_index, u16 pool_index, + u32 *p_cur, u32 *p_max) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port); + + if (!mlxsw_driver->sb_occ_port_pool_get || + !mlxsw_core_port_check(mlxsw_core_port)) + return -EOPNOTSUPP; + return mlxsw_driver->sb_occ_port_pool_get(mlxsw_core_port, sb_index, + pool_index, p_cur, p_max); +} + +static int +mlxsw_devlink_sb_occ_tc_port_bind_get(struct devlink_port *devlink_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port); + + if (!mlxsw_driver->sb_occ_tc_port_bind_get || + !mlxsw_core_port_check(mlxsw_core_port)) + return -EOPNOTSUPP; + return mlxsw_driver->sb_occ_tc_port_bind_get(mlxsw_core_port, + sb_index, tc_index, + pool_type, p_cur, p_max); +} + +static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + const struct mlxsw_bus *mlxsw_bus = mlxsw_core->bus; + int err; + + if (!mlxsw_bus->reset) + return -EOPNOTSUPP; + + mlxsw_core_bus_device_unregister(mlxsw_core, true); + mlxsw_bus->reset(mlxsw_core->bus_priv); + err = mlxsw_core_bus_device_register(mlxsw_core->bus_info, + mlxsw_core->bus, + mlxsw_core->bus_priv, true, + devlink); + if (err) + mlxsw_core->reload_fail = true; + return err; +} + +static const struct devlink_ops mlxsw_devlink_ops = { + .reload = mlxsw_devlink_core_bus_device_reload, + .port_type_set = mlxsw_devlink_port_type_set, + .port_split = mlxsw_devlink_port_split, + .port_unsplit = mlxsw_devlink_port_unsplit, + .sb_pool_get = mlxsw_devlink_sb_pool_get, + .sb_pool_set = mlxsw_devlink_sb_pool_set, + .sb_port_pool_get = mlxsw_devlink_sb_port_pool_get, + .sb_port_pool_set = mlxsw_devlink_sb_port_pool_set, + .sb_tc_pool_bind_get = mlxsw_devlink_sb_tc_pool_bind_get, + .sb_tc_pool_bind_set = mlxsw_devlink_sb_tc_pool_bind_set, + .sb_occ_snapshot = mlxsw_devlink_sb_occ_snapshot, + .sb_occ_max_clear = mlxsw_devlink_sb_occ_max_clear, + .sb_occ_port_pool_get = mlxsw_devlink_sb_occ_port_pool_get, + .sb_occ_tc_port_bind_get = mlxsw_devlink_sb_occ_tc_port_bind_get, +}; + +int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info, + const struct mlxsw_bus *mlxsw_bus, + void *bus_priv, bool reload, + struct devlink *devlink) +{ + const char *device_kind = mlxsw_bus_info->device_kind; + struct mlxsw_core *mlxsw_core; + struct mlxsw_driver *mlxsw_driver; + struct mlxsw_res *res; + size_t alloc_size; + int err; + + mlxsw_driver = mlxsw_core_driver_get(device_kind); + if (!mlxsw_driver) + return -EINVAL; + + if (!reload) { + alloc_size = sizeof(*mlxsw_core) + mlxsw_driver->priv_size; + devlink = devlink_alloc(&mlxsw_devlink_ops, alloc_size); + if 
(!devlink) { + err = -ENOMEM; + goto err_devlink_alloc; + } + } + + mlxsw_core = devlink_priv(devlink); + INIT_LIST_HEAD(&mlxsw_core->rx_listener_list); + INIT_LIST_HEAD(&mlxsw_core->event_listener_list); + mlxsw_core->driver = mlxsw_driver; + mlxsw_core->bus = mlxsw_bus; + mlxsw_core->bus_priv = bus_priv; + mlxsw_core->bus_info = mlxsw_bus_info; + + res = mlxsw_driver->res_query_enabled ? &mlxsw_core->res : NULL; + err = mlxsw_bus->init(bus_priv, mlxsw_core, mlxsw_driver->profile, res); + if (err) + goto err_bus_init; + + if (mlxsw_driver->resources_register && !reload) { + err = mlxsw_driver->resources_register(mlxsw_core); + if (err) + goto err_register_resources; + } + + err = mlxsw_ports_init(mlxsw_core); + if (err) + goto err_ports_init; + + if (MLXSW_CORE_RES_VALID(mlxsw_core, MAX_LAG) && + MLXSW_CORE_RES_VALID(mlxsw_core, MAX_LAG_MEMBERS)) { + alloc_size = sizeof(u8) * + MLXSW_CORE_RES_GET(mlxsw_core, MAX_LAG) * + MLXSW_CORE_RES_GET(mlxsw_core, MAX_LAG_MEMBERS); + mlxsw_core->lag.mapping = kzalloc(alloc_size, GFP_KERNEL); + if (!mlxsw_core->lag.mapping) { + err = -ENOMEM; + goto err_alloc_lag_mapping; + } + } + + err = mlxsw_emad_init(mlxsw_core); + if (err) + goto err_emad_init; + + if (!reload) { + err = devlink_register(devlink, mlxsw_bus_info->dev); + if (err) + goto err_devlink_register; + } + + err = mlxsw_hwmon_init(mlxsw_core, mlxsw_bus_info, &mlxsw_core->hwmon); + if (err) + goto err_hwmon_init; + + err = mlxsw_thermal_init(mlxsw_core, mlxsw_bus_info, + &mlxsw_core->thermal); + if (err) + goto err_thermal_init; + + if (mlxsw_driver->init) { + err = mlxsw_driver->init(mlxsw_core, mlxsw_bus_info); + if (err) + goto err_driver_init; + } + + return 0; + +err_driver_init: + mlxsw_thermal_fini(mlxsw_core->thermal); +err_thermal_init: +err_hwmon_init: + if (!reload) + devlink_unregister(devlink); +err_devlink_register: + mlxsw_emad_fini(mlxsw_core); +err_emad_init: + kfree(mlxsw_core->lag.mapping); +err_alloc_lag_mapping: + mlxsw_ports_fini(mlxsw_core); +err_ports_init: + if (!reload) + devlink_resources_unregister(devlink, NULL); +err_register_resources: + mlxsw_bus->fini(bus_priv); +err_bus_init: + if (!reload) + devlink_free(devlink); +err_devlink_alloc: + mlxsw_core_driver_put(device_kind); + return err; +} +EXPORT_SYMBOL(mlxsw_core_bus_device_register); + +void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, + bool reload) +{ + const char *device_kind = mlxsw_core->bus_info->device_kind; + struct devlink *devlink = priv_to_devlink(mlxsw_core); + + if (mlxsw_core->reload_fail) + goto reload_fail; + + if (mlxsw_core->driver->fini) + mlxsw_core->driver->fini(mlxsw_core); + mlxsw_thermal_fini(mlxsw_core->thermal); + if (!reload) + devlink_unregister(devlink); + mlxsw_emad_fini(mlxsw_core); + kfree(mlxsw_core->lag.mapping); + mlxsw_ports_fini(mlxsw_core); + if (!reload) + devlink_resources_unregister(devlink, NULL); + mlxsw_core->bus->fini(mlxsw_core->bus_priv); + if (reload) + return; +reload_fail: + devlink_free(devlink); + mlxsw_core_driver_put(device_kind); +} +EXPORT_SYMBOL(mlxsw_core_bus_device_unregister); + +bool mlxsw_core_skb_transmit_busy(struct mlxsw_core *mlxsw_core, + const struct mlxsw_tx_info *tx_info) +{ + return mlxsw_core->bus->skb_transmit_busy(mlxsw_core->bus_priv, + tx_info); +} +EXPORT_SYMBOL(mlxsw_core_skb_transmit_busy); + +int mlxsw_core_skb_transmit(struct mlxsw_core *mlxsw_core, struct sk_buff *skb, + const struct mlxsw_tx_info *tx_info) +{ + return mlxsw_core->bus->skb_transmit(mlxsw_core->bus_priv, skb, + tx_info); +} 
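Editorial note, not part of the original patch: mlxsw_core splits responsibility between a bus backend, which hands a probed device to the core via mlxsw_core_bus_device_register(), and a switch driver, which registers a struct mlxsw_driver whose ->kind string is matched against bus_info->device_kind. The skeleton below is a minimal sketch of the driver side of that contract; every my_sp_* name is a hypothetical placeholder, and a real driver on a TXRX-capable bus must also supply the callbacks the EMAD path uses (basic_trap_groups_set, txhdr_construct, txhdr_len) plus its port handling.

#include <linux/kernel.h>
#include <linux/module.h>

#include "core.h"

struct my_sp {
	struct mlxsw_core *core;
	const struct mlxsw_bus_info *bus_info;
};

static const struct mlxsw_config_profile my_sp_config_profile = {
	.used_max_mid	= 1,
	.max_mid	= 7000,
};

static int my_sp_init(struct mlxsw_core *mlxsw_core,
		      const struct mlxsw_bus_info *mlxsw_bus_info)
{
	/* points into the driver_priv[] tail of struct mlxsw_core,
	 * sized by ->priv_size below
	 */
	struct my_sp *my_sp = mlxsw_core_driver_priv(mlxsw_core);

	my_sp->core = mlxsw_core;
	my_sp->bus_info = mlxsw_bus_info;
	/* a real driver would register traps and create ports here */
	return 0;
}

static void my_sp_fini(struct mlxsw_core *mlxsw_core)
{
	/* tear down whatever my_sp_init() created */
}

static struct mlxsw_driver my_sp_driver = {
	.kind		= "my_spectrum", /* matched against bus_info->device_kind */
	.priv_size	= sizeof(struct my_sp),
	.init		= my_sp_init,
	.fini		= my_sp_fini,
	.profile	= &my_sp_config_profile,
};

static int __init my_sp_module_init(void)
{
	return mlxsw_core_driver_register(&my_sp_driver);
}

static void __exit my_sp_module_exit(void)
{
	mlxsw_core_driver_unregister(&my_sp_driver);
}

module_init(my_sp_module_init);
module_exit(my_sp_module_exit);
MODULE_LICENSE("Dual BSD/GPL");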
+EXPORT_SYMBOL(mlxsw_core_skb_transmit); + +static bool __is_rx_listener_equal(const struct mlxsw_rx_listener *rxl_a, + const struct mlxsw_rx_listener *rxl_b) +{ + return (rxl_a->func == rxl_b->func && + rxl_a->local_port == rxl_b->local_port && + rxl_a->trap_id == rxl_b->trap_id); +} + +static struct mlxsw_rx_listener_item * +__find_rx_listener_item(struct mlxsw_core *mlxsw_core, + const struct mlxsw_rx_listener *rxl, + void *priv) +{ + struct mlxsw_rx_listener_item *rxl_item; + + list_for_each_entry(rxl_item, &mlxsw_core->rx_listener_list, list) { + if (__is_rx_listener_equal(&rxl_item->rxl, rxl) && + rxl_item->priv == priv) + return rxl_item; + } + return NULL; +} + +int mlxsw_core_rx_listener_register(struct mlxsw_core *mlxsw_core, + const struct mlxsw_rx_listener *rxl, + void *priv) +{ + struct mlxsw_rx_listener_item *rxl_item; + + rxl_item = __find_rx_listener_item(mlxsw_core, rxl, priv); + if (rxl_item) + return -EEXIST; + rxl_item = kmalloc(sizeof(*rxl_item), GFP_KERNEL); + if (!rxl_item) + return -ENOMEM; + rxl_item->rxl = *rxl; + rxl_item->priv = priv; + + list_add_rcu(&rxl_item->list, &mlxsw_core->rx_listener_list); + return 0; +} +EXPORT_SYMBOL(mlxsw_core_rx_listener_register); + +void mlxsw_core_rx_listener_unregister(struct mlxsw_core *mlxsw_core, + const struct mlxsw_rx_listener *rxl, + void *priv) +{ + struct mlxsw_rx_listener_item *rxl_item; + + rxl_item = __find_rx_listener_item(mlxsw_core, rxl, priv); + if (!rxl_item) + return; + list_del_rcu(&rxl_item->list); + synchronize_rcu(); + kfree(rxl_item); +} +EXPORT_SYMBOL(mlxsw_core_rx_listener_unregister); + +static void mlxsw_core_event_listener_func(struct sk_buff *skb, u8 local_port, + void *priv) +{ + struct mlxsw_event_listener_item *event_listener_item = priv; + struct mlxsw_reg_info reg; + char *payload; + char *op_tlv = mlxsw_emad_op_tlv(skb); + char *reg_tlv = mlxsw_emad_reg_tlv(skb); + + reg.id = mlxsw_emad_op_tlv_register_id_get(op_tlv); + reg.len = (mlxsw_emad_reg_tlv_len_get(reg_tlv) - 1) * sizeof(u32); + payload = mlxsw_emad_reg_payload(op_tlv); + event_listener_item->el.func(®, payload, event_listener_item->priv); + dev_kfree_skb(skb); +} + +static bool __is_event_listener_equal(const struct mlxsw_event_listener *el_a, + const struct mlxsw_event_listener *el_b) +{ + return (el_a->func == el_b->func && + el_a->trap_id == el_b->trap_id); +} + +static struct mlxsw_event_listener_item * +__find_event_listener_item(struct mlxsw_core *mlxsw_core, + const struct mlxsw_event_listener *el, + void *priv) +{ + struct mlxsw_event_listener_item *el_item; + + list_for_each_entry(el_item, &mlxsw_core->event_listener_list, list) { + if (__is_event_listener_equal(&el_item->el, el) && + el_item->priv == priv) + return el_item; + } + return NULL; +} + +int mlxsw_core_event_listener_register(struct mlxsw_core *mlxsw_core, + const struct mlxsw_event_listener *el, + void *priv) +{ + int err; + struct mlxsw_event_listener_item *el_item; + const struct mlxsw_rx_listener rxl = { + .func = mlxsw_core_event_listener_func, + .local_port = MLXSW_PORT_DONT_CARE, + .trap_id = el->trap_id, + }; + + el_item = __find_event_listener_item(mlxsw_core, el, priv); + if (el_item) + return -EEXIST; + el_item = kmalloc(sizeof(*el_item), GFP_KERNEL); + if (!el_item) + return -ENOMEM; + el_item->el = *el; + el_item->priv = priv; + + err = mlxsw_core_rx_listener_register(mlxsw_core, &rxl, el_item); + if (err) + goto err_rx_listener_register; + + /* No reason to save item if we did not manage to register an RX + * listener for it. 
+ */ + list_add_rcu(&el_item->list, &mlxsw_core->event_listener_list); + + return 0; + +err_rx_listener_register: + kfree(el_item); + return err; +} +EXPORT_SYMBOL(mlxsw_core_event_listener_register); + +void mlxsw_core_event_listener_unregister(struct mlxsw_core *mlxsw_core, + const struct mlxsw_event_listener *el, + void *priv) +{ + struct mlxsw_event_listener_item *el_item; + const struct mlxsw_rx_listener rxl = { + .func = mlxsw_core_event_listener_func, + .local_port = MLXSW_PORT_DONT_CARE, + .trap_id = el->trap_id, + }; + + el_item = __find_event_listener_item(mlxsw_core, el, priv); + if (!el_item) + return; + mlxsw_core_rx_listener_unregister(mlxsw_core, &rxl, el_item); + list_del(&el_item->list); + kfree(el_item); +} +EXPORT_SYMBOL(mlxsw_core_event_listener_unregister); + +static int mlxsw_core_listener_register(struct mlxsw_core *mlxsw_core, + const struct mlxsw_listener *listener, + void *priv) +{ + if (listener->is_event) + return mlxsw_core_event_listener_register(mlxsw_core, + &listener->u.event_listener, + priv); + else + return mlxsw_core_rx_listener_register(mlxsw_core, + &listener->u.rx_listener, + priv); +} + +static void mlxsw_core_listener_unregister(struct mlxsw_core *mlxsw_core, + const struct mlxsw_listener *listener, + void *priv) +{ + if (listener->is_event) + mlxsw_core_event_listener_unregister(mlxsw_core, + &listener->u.event_listener, + priv); + else + mlxsw_core_rx_listener_unregister(mlxsw_core, + &listener->u.rx_listener, + priv); +} + +int mlxsw_core_trap_register(struct mlxsw_core *mlxsw_core, + const struct mlxsw_listener *listener, void *priv) +{ + char hpkt_pl[MLXSW_REG_HPKT_LEN]; + int err; + + err = mlxsw_core_listener_register(mlxsw_core, listener, priv); + if (err) + return err; + + mlxsw_reg_hpkt_pack(hpkt_pl, listener->action, listener->trap_id, + listener->trap_group, listener->is_ctrl); + err = mlxsw_reg_write(mlxsw_core, MLXSW_REG(hpkt), hpkt_pl); + if (err) + goto err_trap_set; + + return 0; + +err_trap_set: + mlxsw_core_listener_unregister(mlxsw_core, listener, priv); + return err; +} +EXPORT_SYMBOL(mlxsw_core_trap_register); + +void mlxsw_core_trap_unregister(struct mlxsw_core *mlxsw_core, + const struct mlxsw_listener *listener, + void *priv) +{ + char hpkt_pl[MLXSW_REG_HPKT_LEN]; + + if (!listener->is_event) { + mlxsw_reg_hpkt_pack(hpkt_pl, listener->unreg_action, + listener->trap_id, listener->trap_group, + listener->is_ctrl); + mlxsw_reg_write(mlxsw_core, MLXSW_REG(hpkt), hpkt_pl); + } + + mlxsw_core_listener_unregister(mlxsw_core, listener, priv); +} +EXPORT_SYMBOL(mlxsw_core_trap_unregister); + +static u64 mlxsw_core_tid_get(struct mlxsw_core *mlxsw_core) +{ + return atomic64_inc_return(&mlxsw_core->emad.tid); +} + +static int mlxsw_core_reg_access_emad(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, + char *payload, + enum mlxsw_core_reg_access_type type, + struct list_head *bulk_list, + mlxsw_reg_trans_cb_t *cb, + unsigned long cb_priv) +{ + u64 tid = mlxsw_core_tid_get(mlxsw_core); + struct mlxsw_reg_trans *trans; + int err; + + trans = kzalloc(sizeof(*trans), GFP_KERNEL); + if (!trans) + return -ENOMEM; + + err = mlxsw_emad_reg_access(mlxsw_core, reg, payload, type, trans, + bulk_list, cb, cb_priv, tid); + if (err) { + kfree(trans); + return err; + } + return 0; +} + +int mlxsw_reg_trans_query(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, char *payload, + struct list_head *bulk_list, + mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv) +{ + return mlxsw_core_reg_access_emad(mlxsw_core, reg, 
payload, + MLXSW_CORE_REG_ACCESS_TYPE_QUERY, + bulk_list, cb, cb_priv); +} +EXPORT_SYMBOL(mlxsw_reg_trans_query); + +int mlxsw_reg_trans_write(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, char *payload, + struct list_head *bulk_list, + mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv) +{ + return mlxsw_core_reg_access_emad(mlxsw_core, reg, payload, + MLXSW_CORE_REG_ACCESS_TYPE_WRITE, + bulk_list, cb, cb_priv); +} +EXPORT_SYMBOL(mlxsw_reg_trans_write); + +static int mlxsw_reg_trans_wait(struct mlxsw_reg_trans *trans) +{ + struct mlxsw_core *mlxsw_core = trans->core; + int err; + + wait_for_completion(&trans->completion); + cancel_delayed_work_sync(&trans->timeout_dw); + err = trans->err; + + if (trans->retries) + dev_warn(mlxsw_core->bus_info->dev, "EMAD retries (%d/%d) (tid=%llx)\n", + trans->retries, MLXSW_EMAD_MAX_RETRY, trans->tid); + if (err) + dev_err(mlxsw_core->bus_info->dev, "EMAD reg access failed (tid=%llx,reg_id=%x(%s),type=%s,status=%x(%s))\n", + trans->tid, trans->reg->id, + mlxsw_reg_id_str(trans->reg->id), + mlxsw_core_reg_access_type_str(trans->type), + trans->emad_status, + mlxsw_emad_op_tlv_status_str(trans->emad_status)); + + list_del(&trans->bulk_list); + kfree_rcu(trans, rcu); + return err; +} + +int mlxsw_reg_trans_bulk_wait(struct list_head *bulk_list) +{ + struct mlxsw_reg_trans *trans; + struct mlxsw_reg_trans *tmp; + int sum_err = 0; + int err; + + list_for_each_entry_safe(trans, tmp, bulk_list, bulk_list) { + err = mlxsw_reg_trans_wait(trans); + if (err && sum_err == 0) + sum_err = err; /* first error to be returned */ + } + return sum_err; +} +EXPORT_SYMBOL(mlxsw_reg_trans_bulk_wait); + +static int mlxsw_core_reg_access_cmd(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, + char *payload, + enum mlxsw_core_reg_access_type type) +{ + enum mlxsw_emad_op_tlv_status status; + int err, n_retry; + char *in_mbox, *out_mbox, *tmp; + + dev_dbg(mlxsw_core->bus_info->dev, "Reg cmd access (reg_id=%x(%s),type=%s)\n", + reg->id, mlxsw_reg_id_str(reg->id), + mlxsw_core_reg_access_type_str(type)); + + in_mbox = mlxsw_cmd_mbox_alloc(); + if (!in_mbox) + return -ENOMEM; + + out_mbox = mlxsw_cmd_mbox_alloc(); + if (!out_mbox) { + err = -ENOMEM; + goto free_in_mbox; + } + + mlxsw_emad_pack_op_tlv(in_mbox, reg, type, + mlxsw_core_tid_get(mlxsw_core)); + tmp = in_mbox + MLXSW_EMAD_OP_TLV_LEN * sizeof(u32); + mlxsw_emad_pack_reg_tlv(tmp, reg, payload); + + n_retry = 0; +retry: + err = mlxsw_cmd_access_reg(mlxsw_core, in_mbox, out_mbox); + if (!err) { + err = mlxsw_emad_process_status(out_mbox, &status); + if (err) { + if (err == -EAGAIN && n_retry++ < MLXSW_EMAD_MAX_RETRY) + goto retry; + dev_err(mlxsw_core->bus_info->dev, "Reg cmd access status failed (status=%x(%s))\n", + status, mlxsw_emad_op_tlv_status_str(status)); + } + } + + if (!err) + memcpy(payload, mlxsw_emad_reg_payload(out_mbox), + reg->len); + + mlxsw_cmd_mbox_free(out_mbox); +free_in_mbox: + mlxsw_cmd_mbox_free(in_mbox); + if (err) + dev_err(mlxsw_core->bus_info->dev, "Reg cmd access failed (reg_id=%x(%s),type=%s)\n", + reg->id, mlxsw_reg_id_str(reg->id), + mlxsw_core_reg_access_type_str(type)); + return err; +} + +static void mlxsw_core_reg_access_cb(struct mlxsw_core *mlxsw_core, + char *payload, size_t payload_len, + unsigned long cb_priv) +{ + char *orig_payload = (char *) cb_priv; + + memcpy(orig_payload, payload, payload_len); +} + +static int mlxsw_core_reg_access(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, + char *payload, + enum 
mlxsw_core_reg_access_type type) +{ + LIST_HEAD(bulk_list); + int err; + + /* During initialization EMAD interface is not available to us, + * so we default to command interface. We switch to EMAD interface + * after setting the appropriate traps. + */ + if (!mlxsw_core->emad.use_emad) + return mlxsw_core_reg_access_cmd(mlxsw_core, reg, + payload, type); + + err = mlxsw_core_reg_access_emad(mlxsw_core, reg, + payload, type, &bulk_list, + mlxsw_core_reg_access_cb, + (unsigned long) payload); + if (err) + return err; + return mlxsw_reg_trans_bulk_wait(&bulk_list); +} + +int mlxsw_reg_query(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, char *payload) +{ + return mlxsw_core_reg_access(mlxsw_core, reg, payload, + MLXSW_CORE_REG_ACCESS_TYPE_QUERY); +} +EXPORT_SYMBOL(mlxsw_reg_query); + +int mlxsw_reg_write(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, char *payload) +{ + return mlxsw_core_reg_access(mlxsw_core, reg, payload, + MLXSW_CORE_REG_ACCESS_TYPE_WRITE); +} +EXPORT_SYMBOL(mlxsw_reg_write); + +void mlxsw_core_skb_receive(struct mlxsw_core *mlxsw_core, struct sk_buff *skb, + struct mlxsw_rx_info *rx_info) +{ + struct mlxsw_rx_listener_item *rxl_item; + const struct mlxsw_rx_listener *rxl; + u8 local_port; + bool found = false; + + if (rx_info->is_lag) { + dev_dbg_ratelimited(mlxsw_core->bus_info->dev, "%s: lag_id = %d, lag_port_index = 0x%x\n", + __func__, rx_info->u.lag_id, + rx_info->trap_id); + /* Upper layer does not care if the skb came from LAG or not, + * so just get the local_port for the lag port and push it up. + */ + local_port = mlxsw_core_lag_mapping_get(mlxsw_core, + rx_info->u.lag_id, + rx_info->lag_port_index); + } else { + local_port = rx_info->u.sys_port; + } + + dev_dbg_ratelimited(mlxsw_core->bus_info->dev, "%s: local_port = %d, trap_id = 0x%x\n", + __func__, local_port, rx_info->trap_id); + + if ((rx_info->trap_id >= MLXSW_TRAP_ID_MAX) || + (local_port >= mlxsw_core->max_ports)) + goto drop; + + rcu_read_lock(); + list_for_each_entry_rcu(rxl_item, &mlxsw_core->rx_listener_list, list) { + rxl = &rxl_item->rxl; + if ((rxl->local_port == MLXSW_PORT_DONT_CARE || + rxl->local_port == local_port) && + rxl->trap_id == rx_info->trap_id) { + found = true; + break; + } + } + rcu_read_unlock(); + if (!found) + goto drop; + + rxl->func(skb, local_port, rxl_item->priv); + return; + +drop: + dev_kfree_skb(skb); +} +EXPORT_SYMBOL(mlxsw_core_skb_receive); + +static int mlxsw_core_lag_mapping_index(struct mlxsw_core *mlxsw_core, + u16 lag_id, u8 port_index) +{ + return MLXSW_CORE_RES_GET(mlxsw_core, MAX_LAG_MEMBERS) * lag_id + + port_index; +} + +void mlxsw_core_lag_mapping_set(struct mlxsw_core *mlxsw_core, + u16 lag_id, u8 port_index, u8 local_port) +{ + int index = mlxsw_core_lag_mapping_index(mlxsw_core, + lag_id, port_index); + + mlxsw_core->lag.mapping[index] = local_port; +} +EXPORT_SYMBOL(mlxsw_core_lag_mapping_set); + +u8 mlxsw_core_lag_mapping_get(struct mlxsw_core *mlxsw_core, + u16 lag_id, u8 port_index) +{ + int index = mlxsw_core_lag_mapping_index(mlxsw_core, + lag_id, port_index); + + return mlxsw_core->lag.mapping[index]; +} +EXPORT_SYMBOL(mlxsw_core_lag_mapping_get); + +void mlxsw_core_lag_mapping_clear(struct mlxsw_core *mlxsw_core, + u16 lag_id, u8 local_port) +{ + int i; + + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_core, MAX_LAG_MEMBERS); i++) { + int index = mlxsw_core_lag_mapping_index(mlxsw_core, + lag_id, i); + + if (mlxsw_core->lag.mapping[index] == local_port) + mlxsw_core->lag.mapping[index] = 0; + } +} 
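Editorial sketch, not part of the original patch: the register access entry points above can be used either synchronously or batched. The helper below, with the made-up name my_hpkt_rearm(), packs the HPKT register with the same action/trap/group constants the EMAD listener in this file already uses, first through the blocking mlxsw_reg_write() (which internally queues a one-entry bulk list and waits, falling back to the command interface before EMAD is enabled) and then through the bulk mlxsw_reg_trans_write()/mlxsw_reg_trans_bulk_wait() pair.

#include <linux/list.h>

#include "core.h"

static int my_hpkt_rearm(struct mlxsw_core *mlxsw_core)
{
	char hpkt_pl[MLXSW_REG_HPKT_LEN];
	LIST_HEAD(bulk_list);
	int err;

	/* same packing as mlxsw_core_trap_register() does for the EMAD trap */
	mlxsw_reg_hpkt_pack(hpkt_pl, MLXSW_REG_HPKT_ACTION_TRAP_TO_CPU,
			    MLXSW_TRAP_ID_ETHEMAD,
			    MLXSW_REG_HTGT_TRAP_GROUP_EMAD, false);

	/* synchronous: returns once the EMAD response (or the command
	 * interface fallback) has completed
	 */
	err = mlxsw_reg_write(mlxsw_core, MLXSW_REG(hpkt), hpkt_pl);
	if (err)
		return err;

	/* asynchronous: queue the transaction on a caller-owned bulk list
	 * (more transactions could be added here), then wait for the whole
	 * batch at once; a NULL callback is allowed for writes
	 */
	err = mlxsw_reg_trans_write(mlxsw_core, MLXSW_REG(hpkt), hpkt_pl,
				    &bulk_list, NULL, 0);
	if (err)
		return err;
	return mlxsw_reg_trans_bulk_wait(&bulk_list);
}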
+EXPORT_SYMBOL(mlxsw_core_lag_mapping_clear); + +bool mlxsw_core_res_valid(struct mlxsw_core *mlxsw_core, + enum mlxsw_res_id res_id) +{ + return mlxsw_res_valid(&mlxsw_core->res, res_id); +} +EXPORT_SYMBOL(mlxsw_core_res_valid); + +u64 mlxsw_core_res_get(struct mlxsw_core *mlxsw_core, + enum mlxsw_res_id res_id) +{ + return mlxsw_res_get(&mlxsw_core->res, res_id); +} +EXPORT_SYMBOL(mlxsw_core_res_get); + +int mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_core); + struct mlxsw_core_port *mlxsw_core_port = + &mlxsw_core->ports[local_port]; + struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port; + int err; + + mlxsw_core_port->local_port = local_port; + err = devlink_port_register(devlink, devlink_port, local_port); + if (err) + memset(mlxsw_core_port, 0, sizeof(*mlxsw_core_port)); + return err; +} +EXPORT_SYMBOL(mlxsw_core_port_init); + +void mlxsw_core_port_fini(struct mlxsw_core *mlxsw_core, u8 local_port) +{ + struct mlxsw_core_port *mlxsw_core_port = + &mlxsw_core->ports[local_port]; + struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port; + + devlink_port_unregister(devlink_port); + memset(mlxsw_core_port, 0, sizeof(*mlxsw_core_port)); +} +EXPORT_SYMBOL(mlxsw_core_port_fini); + +void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port, + void *port_driver_priv, struct net_device *dev, + bool split, u32 split_group) +{ + struct mlxsw_core_port *mlxsw_core_port = + &mlxsw_core->ports[local_port]; + struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port; + + mlxsw_core_port->port_driver_priv = port_driver_priv; + if (split) + devlink_port_split_set(devlink_port, split_group); + devlink_port_type_eth_set(devlink_port, dev); +} +EXPORT_SYMBOL(mlxsw_core_port_eth_set); + +void mlxsw_core_port_ib_set(struct mlxsw_core *mlxsw_core, u8 local_port, + void *port_driver_priv) +{ + struct mlxsw_core_port *mlxsw_core_port = + &mlxsw_core->ports[local_port]; + struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port; + + mlxsw_core_port->port_driver_priv = port_driver_priv; + devlink_port_type_ib_set(devlink_port, NULL); +} +EXPORT_SYMBOL(mlxsw_core_port_ib_set); + +void mlxsw_core_port_clear(struct mlxsw_core *mlxsw_core, u8 local_port, + void *port_driver_priv) +{ + struct mlxsw_core_port *mlxsw_core_port = + &mlxsw_core->ports[local_port]; + struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port; + + mlxsw_core_port->port_driver_priv = port_driver_priv; + devlink_port_type_clear(devlink_port); +} +EXPORT_SYMBOL(mlxsw_core_port_clear); + +enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core, + u8 local_port) +{ + struct mlxsw_core_port *mlxsw_core_port = + &mlxsw_core->ports[local_port]; + struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port; + + return devlink_port->type; +} +EXPORT_SYMBOL(mlxsw_core_port_type_get); + +static void mlxsw_core_buf_dump_dbg(struct mlxsw_core *mlxsw_core, + const char *buf, size_t size) +{ + __be32 *m = (__be32 *) buf; + int i; + int count = size / sizeof(__be32); + + for (i = count - 1; i >= 0; i--) + if (m[i]) + break; + i++; + count = i ? 
i : 1; + for (i = 0; i < count; i += 4) + dev_dbg(mlxsw_core->bus_info->dev, "%04x - %08x %08x %08x %08x\n", + i * 4, be32_to_cpu(m[i]), be32_to_cpu(m[i + 1]), + be32_to_cpu(m[i + 2]), be32_to_cpu(m[i + 3])); +} + +int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod, + u32 in_mod, bool out_mbox_direct, + char *in_mbox, size_t in_mbox_size, + char *out_mbox, size_t out_mbox_size) +{ + u8 status; + int err; + + BUG_ON(in_mbox_size % sizeof(u32) || out_mbox_size % sizeof(u32)); + if (!mlxsw_core->bus->cmd_exec) + return -EOPNOTSUPP; + + dev_dbg(mlxsw_core->bus_info->dev, "Cmd exec (opcode=%x(%s),opcode_mod=%x,in_mod=%x)\n", + opcode, mlxsw_cmd_opcode_str(opcode), opcode_mod, in_mod); + if (in_mbox) { + dev_dbg(mlxsw_core->bus_info->dev, "Input mailbox:\n"); + mlxsw_core_buf_dump_dbg(mlxsw_core, in_mbox, in_mbox_size); + } + + err = mlxsw_core->bus->cmd_exec(mlxsw_core->bus_priv, opcode, + opcode_mod, in_mod, out_mbox_direct, + in_mbox, in_mbox_size, + out_mbox, out_mbox_size, &status); + + if (err == -EIO && status != MLXSW_CMD_STATUS_OK) { + dev_err(mlxsw_core->bus_info->dev, "Cmd exec failed (opcode=%x(%s),opcode_mod=%x,in_mod=%x,status=%x(%s))\n", + opcode, mlxsw_cmd_opcode_str(opcode), opcode_mod, + in_mod, status, mlxsw_cmd_status_str(status)); + } else if (err == -ETIMEDOUT) { + dev_err(mlxsw_core->bus_info->dev, "Cmd exec timed-out (opcode=%x(%s),opcode_mod=%x,in_mod=%x)\n", + opcode, mlxsw_cmd_opcode_str(opcode), opcode_mod, + in_mod); + } + + if (!err && out_mbox) { + dev_dbg(mlxsw_core->bus_info->dev, "Output mailbox:\n"); + mlxsw_core_buf_dump_dbg(mlxsw_core, out_mbox, out_mbox_size); + } + return err; +} +EXPORT_SYMBOL(mlxsw_cmd_exec); + +int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(mlxsw_wq, dwork, delay); +} +EXPORT_SYMBOL(mlxsw_core_schedule_dw); + +bool mlxsw_core_schedule_work(struct work_struct *work) +{ + return queue_work(mlxsw_owq, work); +} +EXPORT_SYMBOL(mlxsw_core_schedule_work); + +void mlxsw_core_flush_owq(void) +{ + flush_workqueue(mlxsw_owq); +} +EXPORT_SYMBOL(mlxsw_core_flush_owq); + +int mlxsw_core_kvd_sizes_get(struct mlxsw_core *mlxsw_core, + const struct mlxsw_config_profile *profile, + u64 *p_single_size, u64 *p_double_size, + u64 *p_linear_size) +{ + struct mlxsw_driver *driver = mlxsw_core->driver; + + if (!driver->kvd_sizes_get) + return -EINVAL; + + return driver->kvd_sizes_get(mlxsw_core, profile, + p_single_size, p_double_size, + p_linear_size); +} +EXPORT_SYMBOL(mlxsw_core_kvd_sizes_get); + +static int __init mlxsw_core_module_init(void) +{ + int err; + + mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, WQ_MEM_RECLAIM, 0); + if (!mlxsw_wq) + return -ENOMEM; + mlxsw_owq = alloc_ordered_workqueue("%s_ordered", WQ_MEM_RECLAIM, + mlxsw_core_driver_name); + if (!mlxsw_owq) { + err = -ENOMEM; + goto err_alloc_ordered_workqueue; + } + return 0; + +err_alloc_ordered_workqueue: + destroy_workqueue(mlxsw_wq); + return err; +} + +static void __exit mlxsw_core_module_exit(void) +{ + destroy_workqueue(mlxsw_owq); + destroy_workqueue(mlxsw_wq); +} + +module_init(mlxsw_core_module_init); +module_exit(mlxsw_core_module_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("Mellanox switch device core driver"); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h new file mode 100644 index 0000000..092d393 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -0,0 +1,416 @@ +/* + * 
drivers/net/ethernet/mellanox/mlxsw/core.h + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015 Jiri Pirko + * Copyright (c) 2015 Ido Schimmel + * Copyright (c) 2015 Elad Raz + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _MLXSW_CORE_H +#define _MLXSW_CORE_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trap.h" +#include "reg.h" +#include "cmd.h" +#include "resources.h" + +struct mlxsw_core; +struct mlxsw_core_port; +struct mlxsw_driver; +struct mlxsw_bus; +struct mlxsw_bus_info; + +unsigned int mlxsw_core_max_ports(const struct mlxsw_core *mlxsw_core); + +void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core); + +int mlxsw_core_driver_register(struct mlxsw_driver *mlxsw_driver); +void mlxsw_core_driver_unregister(struct mlxsw_driver *mlxsw_driver); + +int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info, + const struct mlxsw_bus *mlxsw_bus, + void *bus_priv, bool reload, + struct devlink *devlink); +void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, bool reload); + +struct mlxsw_tx_info { + u8 local_port; + bool is_emad; +}; + +bool mlxsw_core_skb_transmit_busy(struct mlxsw_core *mlxsw_core, + const struct mlxsw_tx_info *tx_info); +int mlxsw_core_skb_transmit(struct mlxsw_core *mlxsw_core, struct sk_buff *skb, + const struct mlxsw_tx_info *tx_info); + +struct mlxsw_rx_listener { + void (*func)(struct sk_buff *skb, u8 local_port, void *priv); + u8 local_port; + u16 trap_id; + enum mlxsw_reg_hpkt_action action; +}; + +struct mlxsw_event_listener { + void (*func)(const struct mlxsw_reg_info *reg, + char *payload, void *priv); + enum mlxsw_event_trap_id trap_id; +}; + +struct mlxsw_listener { + u16 trap_id; + union { + struct mlxsw_rx_listener rx_listener; + struct mlxsw_event_listener event_listener; + } u; + enum mlxsw_reg_hpkt_action action; + enum mlxsw_reg_hpkt_action unreg_action; + u8 trap_group; + bool is_ctrl; /* should go via control buffer or not */ + bool is_event; +}; + +#define MLXSW_RXL(_func, _trap_id, _action, _is_ctrl, _trap_group, \ + _unreg_action) \ + { \ + .trap_id = MLXSW_TRAP_ID_##_trap_id, \ + .u.rx_listener = \ + { \ + .func = _func, \ + .local_port = MLXSW_PORT_DONT_CARE, \ + .trap_id = MLXSW_TRAP_ID_##_trap_id, \ + }, \ + .action = MLXSW_REG_HPKT_ACTION_##_action, \ + .unreg_action = MLXSW_REG_HPKT_ACTION_##_unreg_action, \ + .trap_group = MLXSW_REG_HTGT_TRAP_GROUP_##_trap_group, \ + .is_ctrl = _is_ctrl, \ + .is_event = false, \ + } + +#define MLXSW_EVENTL(_func, _trap_id, _trap_group) \ + { \ + .trap_id = MLXSW_TRAP_ID_##_trap_id, \ + .u.event_listener = \ + { \ + .func = _func, \ + .trap_id = MLXSW_TRAP_ID_##_trap_id, \ + }, \ + .action = MLXSW_REG_HPKT_ACTION_TRAP_TO_CPU, \ + .trap_group = MLXSW_REG_HTGT_TRAP_GROUP_##_trap_group, \ + .is_ctrl = false, \ + .is_event = true, \ + } + +int mlxsw_core_rx_listener_register(struct mlxsw_core *mlxsw_core, + const struct mlxsw_rx_listener *rxl, + void *priv); +void mlxsw_core_rx_listener_unregister(struct mlxsw_core *mlxsw_core, + const struct mlxsw_rx_listener *rxl, + void *priv); + +int mlxsw_core_event_listener_register(struct mlxsw_core *mlxsw_core, + const struct mlxsw_event_listener *el, + void *priv); +void mlxsw_core_event_listener_unregister(struct mlxsw_core *mlxsw_core, + const struct mlxsw_event_listener *el, + void *priv); + +int mlxsw_core_trap_register(struct mlxsw_core *mlxsw_core, + const struct mlxsw_listener *listener, + void *priv); +void mlxsw_core_trap_unregister(struct mlxsw_core *mlxsw_core, + const struct mlxsw_listener *listener, + void *priv); + +typedef void mlxsw_reg_trans_cb_t(struct mlxsw_core *mlxsw_core, char *payload, + size_t payload_len, unsigned long cb_priv); + +int 
mlxsw_reg_trans_query(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, char *payload, + struct list_head *bulk_list, + mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv); +int mlxsw_reg_trans_write(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, char *payload, + struct list_head *bulk_list, + mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv); +int mlxsw_reg_trans_bulk_wait(struct list_head *bulk_list); + +int mlxsw_reg_query(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, char *payload); +int mlxsw_reg_write(struct mlxsw_core *mlxsw_core, + const struct mlxsw_reg_info *reg, char *payload); + +struct mlxsw_rx_info { + bool is_lag; + union { + u16 sys_port; + u16 lag_id; + } u; + u8 lag_port_index; + int trap_id; +}; + +void mlxsw_core_skb_receive(struct mlxsw_core *mlxsw_core, struct sk_buff *skb, + struct mlxsw_rx_info *rx_info); + +void mlxsw_core_lag_mapping_set(struct mlxsw_core *mlxsw_core, + u16 lag_id, u8 port_index, u8 local_port); +u8 mlxsw_core_lag_mapping_get(struct mlxsw_core *mlxsw_core, + u16 lag_id, u8 port_index); +void mlxsw_core_lag_mapping_clear(struct mlxsw_core *mlxsw_core, + u16 lag_id, u8 local_port); + +void *mlxsw_core_port_driver_priv(struct mlxsw_core_port *mlxsw_core_port); +int mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port); +void mlxsw_core_port_fini(struct mlxsw_core *mlxsw_core, u8 local_port); +void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port, + void *port_driver_priv, struct net_device *dev, + bool split, u32 split_group); +void mlxsw_core_port_ib_set(struct mlxsw_core *mlxsw_core, u8 local_port, + void *port_driver_priv); +void mlxsw_core_port_clear(struct mlxsw_core *mlxsw_core, u8 local_port, + void *port_driver_priv); +enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core, + u8 local_port); + +int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay); +bool mlxsw_core_schedule_work(struct work_struct *work); +void mlxsw_core_flush_owq(void); + +#define MLXSW_CONFIG_PROFILE_SWID_COUNT 8 + +struct mlxsw_swid_config { + u8 used_type:1, + used_properties:1; + u8 type; + u8 properties; +}; + +struct mlxsw_config_profile { + u16 used_max_vepa_channels:1, + used_max_mid:1, + used_max_pgt:1, + used_max_system_port:1, + used_max_vlan_groups:1, + used_max_regions:1, + used_flood_tables:1, + used_flood_mode:1, + used_max_ib_mc:1, + used_max_pkey:1, + used_ar_sec:1, + used_adaptive_routing_group_cap:1, + used_kvd_sizes:1; + u8 max_vepa_channels; + u16 max_mid; + u16 max_pgt; + u16 max_system_port; + u16 max_vlan_groups; + u16 max_regions; + u8 max_flood_tables; + u8 max_vid_flood_tables; + u8 flood_mode; + u8 max_fid_offset_flood_tables; + u16 fid_offset_flood_table_size; + u8 max_fid_flood_tables; + u16 fid_flood_table_size; + u16 max_ib_mc; + u16 max_pkey; + u8 ar_sec; + u16 adaptive_routing_group_cap; + u8 arn; + u32 kvd_linear_size; + u8 kvd_hash_single_parts; + u8 kvd_hash_double_parts; + struct mlxsw_swid_config swid_config[MLXSW_CONFIG_PROFILE_SWID_COUNT]; +}; + +struct mlxsw_driver { + struct list_head list; + const char *kind; + size_t priv_size; + int (*init)(struct mlxsw_core *mlxsw_core, + const struct mlxsw_bus_info *mlxsw_bus_info); + void (*fini)(struct mlxsw_core *mlxsw_core); + int (*basic_trap_groups_set)(struct mlxsw_core *mlxsw_core); + int (*port_type_set)(struct mlxsw_core *mlxsw_core, u8 local_port, + enum devlink_port_type new_type); + int (*port_split)(struct mlxsw_core *mlxsw_core, u8 local_port, + 
unsigned int count); + int (*port_unsplit)(struct mlxsw_core *mlxsw_core, u8 local_port); + int (*sb_pool_get)(struct mlxsw_core *mlxsw_core, + unsigned int sb_index, u16 pool_index, + struct devlink_sb_pool_info *pool_info); + int (*sb_pool_set)(struct mlxsw_core *mlxsw_core, + unsigned int sb_index, u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type); + int (*sb_port_pool_get)(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold); + int (*sb_port_pool_set)(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 pool_index, + u32 threshold); + int (*sb_tc_pool_bind_get)(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold); + int (*sb_tc_pool_bind_set)(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold); + int (*sb_occ_snapshot)(struct mlxsw_core *mlxsw_core, + unsigned int sb_index); + int (*sb_occ_max_clear)(struct mlxsw_core *mlxsw_core, + unsigned int sb_index); + int (*sb_occ_port_pool_get)(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 pool_index, + u32 *p_cur, u32 *p_max); + int (*sb_occ_tc_port_bind_get)(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max); + void (*txhdr_construct)(struct sk_buff *skb, + const struct mlxsw_tx_info *tx_info); + int (*resources_register)(struct mlxsw_core *mlxsw_core); + int (*kvd_sizes_get)(struct mlxsw_core *mlxsw_core, + const struct mlxsw_config_profile *profile, + u64 *p_single_size, u64 *p_double_size, + u64 *p_linear_size); + u8 txhdr_len; + const struct mlxsw_config_profile *profile; + bool res_query_enabled; +}; + +int mlxsw_core_kvd_sizes_get(struct mlxsw_core *mlxsw_core, + const struct mlxsw_config_profile *profile, + u64 *p_single_size, u64 *p_double_size, + u64 *p_linear_size); + +bool mlxsw_core_res_valid(struct mlxsw_core *mlxsw_core, + enum mlxsw_res_id res_id); + +#define MLXSW_CORE_RES_VALID(mlxsw_core, short_res_id) \ + mlxsw_core_res_valid(mlxsw_core, MLXSW_RES_ID_##short_res_id) + +u64 mlxsw_core_res_get(struct mlxsw_core *mlxsw_core, + enum mlxsw_res_id res_id); + +#define MLXSW_CORE_RES_GET(mlxsw_core, short_res_id) \ + mlxsw_core_res_get(mlxsw_core, MLXSW_RES_ID_##short_res_id) + +#define MLXSW_BUS_F_TXRX BIT(0) + +struct mlxsw_bus { + const char *kind; + int (*init)(void *bus_priv, struct mlxsw_core *mlxsw_core, + const struct mlxsw_config_profile *profile, + struct mlxsw_res *res); + void (*fini)(void *bus_priv); + void (*reset)(void *bus_priv); + bool (*skb_transmit_busy)(void *bus_priv, + const struct mlxsw_tx_info *tx_info); + int (*skb_transmit)(void *bus_priv, struct sk_buff *skb, + const struct mlxsw_tx_info *tx_info); + int (*cmd_exec)(void *bus_priv, u16 opcode, u8 opcode_mod, + u32 in_mod, bool out_mbox_direct, + char *in_mbox, size_t in_mbox_size, + char *out_mbox, size_t out_mbox_size, + u8 *p_status); + u8 features; +}; + +struct mlxsw_fw_rev { + u16 major; + u16 minor; + u16 subminor; +}; + +struct mlxsw_bus_info { + const char *device_kind; + const char *device_name; + struct device *dev; + struct mlxsw_fw_rev fw_rev; + u8 vsd[MLXSW_CMD_BOARDINFO_VSD_LEN]; + u8 psid[MLXSW_CMD_BOARDINFO_PSID_LEN]; +}; + +struct mlxsw_hwmon; + +#ifdef CONFIG_MLXSW_CORE_HWMON + +int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core, + 
const struct mlxsw_bus_info *mlxsw_bus_info, + struct mlxsw_hwmon **p_hwmon); +void mlxsw_hwmon_fini(struct mlxsw_hwmon *mlxsw_hwmon); + +#else + +static inline int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core, + const struct mlxsw_bus_info *mlxsw_bus_info, + struct mlxsw_hwmon **p_hwmon) +{ + return 0; +} + +#endif + +struct mlxsw_thermal; + +#ifdef CONFIG_MLXSW_CORE_THERMAL + +int mlxsw_thermal_init(struct mlxsw_core *mlxsw_core, + const struct mlxsw_bus_info *mlxsw_bus_info, + struct mlxsw_thermal **p_thermal); +void mlxsw_thermal_fini(struct mlxsw_thermal *thermal); + +#else + +static inline int mlxsw_thermal_init(struct mlxsw_core *mlxsw_core, + const struct mlxsw_bus_info *mlxsw_bus_info, + struct mlxsw_thermal **p_thermal) +{ + return 0; +} + +static inline void mlxsw_thermal_fini(struct mlxsw_thermal *thermal) +{ +} + +#endif + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c new file mode 100644 index 0000000..3c0d882 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c @@ -0,0 +1,1202 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c + * Copyright (c) 2017, 2018 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "item.h" +#include "trap.h" +#include "core_acl_flex_actions.h" + +enum mlxsw_afa_set_type { + MLXSW_AFA_SET_TYPE_NEXT, + MLXSW_AFA_SET_TYPE_GOTO, +}; + +/* afa_set_type + * Type of the record at the end of the action set. + */ +MLXSW_ITEM32(afa, set, type, 0xA0, 28, 4); + +/* afa_set_next_action_set_ptr + * A pointer to the next action set in the KVD Centralized database. 
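+ * Roughly, the sets of one finished block end up chained as
+ *
+ *	rule (inline first set) -> set in KVD linear -> ... -> last set
+ *
+ * with each hop encoded through this pointer, and the last set carrying
+ * a goto record (continue, jump or terminate) instead of a next-set
+ * record.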
+ */
+MLXSW_ITEM32(afa, set, next_action_set_ptr, 0xA4, 0, 24);
+
+/* afa_set_goto_g
+ * group - When set, the binding is of an ACL group. When cleared,
+ * the binding is of an ACL.
+ * Must be set to 1 for Spectrum.
+ */
+MLXSW_ITEM32(afa, set, goto_g, 0xA4, 29, 1);
+
+enum mlxsw_afa_set_goto_binding_cmd {
+	/* continue to the next binding point */
+	MLXSW_AFA_SET_GOTO_BINDING_CMD_NONE,
+	/* jump to the next binding point, no return */
+	MLXSW_AFA_SET_GOTO_BINDING_CMD_JUMP,
+	/* terminate the ACL binding */
+	MLXSW_AFA_SET_GOTO_BINDING_CMD_TERM = 4,
+};
+
+/* afa_set_goto_binding_cmd */
+MLXSW_ITEM32(afa, set, goto_binding_cmd, 0xA4, 24, 3);
+
+/* afa_set_goto_next_binding
+ * ACL/ACL group identifier. If the g bit is set, this field should hold
+ * the acl_group_id, else it should hold the acl_id.
+ */
+MLXSW_ITEM32(afa, set, goto_next_binding, 0xA4, 0, 16);
+
+/* afa_all_action_type
+ * Action Type.
+ */
+MLXSW_ITEM32(afa, all, action_type, 0x00, 24, 6);
+
+struct mlxsw_afa {
+	unsigned int max_acts_per_set;
+	const struct mlxsw_afa_ops *ops;
+	void *ops_priv;
+	struct rhashtable set_ht;
+	struct rhashtable fwd_entry_ht;
+};
+
+#define MLXSW_AFA_SET_LEN 0xA8
+
+struct mlxsw_afa_set_ht_key {
+	char enc_actions[MLXSW_AFA_SET_LEN]; /* Encoded set */
+	bool is_first;
+};
+
+/* Set structure holds one action set record. It contains up to three
+ * actions (depending on the size of the particular actions). The set is
+ * either put directly into a rule, or it is stored in the KVD linear area.
+ * To prevent duplicate entries in the KVD linear area, a hashtable is
+ * used to track sets that were previously inserted and may be shared.
+ */
+
+struct mlxsw_afa_set {
+	struct rhash_head ht_node;
+	struct mlxsw_afa_set_ht_key ht_key;
+	u32 kvdl_index;
+	bool shared; /* Inserted in hashtable (doesn't mean that
+		      * kvdl_index is valid).
+		      */
+	unsigned int ref_count;
+	struct mlxsw_afa_set *next; /* Pointer to the next set. */
+	struct mlxsw_afa_set *prev; /* Pointer to the previous set,
+				     * note that a set may have multiple
+				     * sets from multiple blocks
+				     * pointing at it. This is only
+				     * usable until commit.
+ */ +}; + +static const struct rhashtable_params mlxsw_afa_set_ht_params = { + .key_len = sizeof(struct mlxsw_afa_set_ht_key), + .key_offset = offsetof(struct mlxsw_afa_set, ht_key), + .head_offset = offsetof(struct mlxsw_afa_set, ht_node), + .automatic_shrinking = true, +}; + +struct mlxsw_afa_fwd_entry_ht_key { + u8 local_port; +}; + +struct mlxsw_afa_fwd_entry { + struct rhash_head ht_node; + struct mlxsw_afa_fwd_entry_ht_key ht_key; + u32 kvdl_index; + unsigned int ref_count; +}; + +static const struct rhashtable_params mlxsw_afa_fwd_entry_ht_params = { + .key_len = sizeof(struct mlxsw_afa_fwd_entry_ht_key), + .key_offset = offsetof(struct mlxsw_afa_fwd_entry, ht_key), + .head_offset = offsetof(struct mlxsw_afa_fwd_entry, ht_node), + .automatic_shrinking = true, +}; + +struct mlxsw_afa *mlxsw_afa_create(unsigned int max_acts_per_set, + const struct mlxsw_afa_ops *ops, + void *ops_priv) +{ + struct mlxsw_afa *mlxsw_afa; + int err; + + mlxsw_afa = kzalloc(sizeof(*mlxsw_afa), GFP_KERNEL); + if (!mlxsw_afa) + return ERR_PTR(-ENOMEM); + err = rhashtable_init(&mlxsw_afa->set_ht, &mlxsw_afa_set_ht_params); + if (err) + goto err_set_rhashtable_init; + err = rhashtable_init(&mlxsw_afa->fwd_entry_ht, + &mlxsw_afa_fwd_entry_ht_params); + if (err) + goto err_fwd_entry_rhashtable_init; + mlxsw_afa->max_acts_per_set = max_acts_per_set; + mlxsw_afa->ops = ops; + mlxsw_afa->ops_priv = ops_priv; + return mlxsw_afa; + +err_fwd_entry_rhashtable_init: + rhashtable_destroy(&mlxsw_afa->set_ht); +err_set_rhashtable_init: + kfree(mlxsw_afa); + return ERR_PTR(err); +} +EXPORT_SYMBOL(mlxsw_afa_create); + +void mlxsw_afa_destroy(struct mlxsw_afa *mlxsw_afa) +{ + rhashtable_destroy(&mlxsw_afa->fwd_entry_ht); + rhashtable_destroy(&mlxsw_afa->set_ht); + kfree(mlxsw_afa); +} +EXPORT_SYMBOL(mlxsw_afa_destroy); + +static void mlxsw_afa_set_goto_set(struct mlxsw_afa_set *set, + enum mlxsw_afa_set_goto_binding_cmd cmd, + u16 group_id) +{ + char *actions = set->ht_key.enc_actions; + + mlxsw_afa_set_type_set(actions, MLXSW_AFA_SET_TYPE_GOTO); + mlxsw_afa_set_goto_g_set(actions, true); + mlxsw_afa_set_goto_binding_cmd_set(actions, cmd); + mlxsw_afa_set_goto_next_binding_set(actions, group_id); +} + +static void mlxsw_afa_set_next_set(struct mlxsw_afa_set *set, + u32 next_set_kvdl_index) +{ + char *actions = set->ht_key.enc_actions; + + mlxsw_afa_set_type_set(actions, MLXSW_AFA_SET_TYPE_NEXT); + mlxsw_afa_set_next_action_set_ptr_set(actions, next_set_kvdl_index); +} + +static struct mlxsw_afa_set *mlxsw_afa_set_create(bool is_first) +{ + struct mlxsw_afa_set *set; + + set = kzalloc(sizeof(*set), GFP_KERNEL); + if (!set) + return NULL; + /* Need to initialize the set to pass by default */ + mlxsw_afa_set_goto_set(set, MLXSW_AFA_SET_GOTO_BINDING_CMD_TERM, 0); + set->ht_key.is_first = is_first; + set->ref_count = 1; + return set; +} + +static void mlxsw_afa_set_destroy(struct mlxsw_afa_set *set) +{ + kfree(set); +} + +static int mlxsw_afa_set_share(struct mlxsw_afa *mlxsw_afa, + struct mlxsw_afa_set *set) +{ + int err; + + err = rhashtable_insert_fast(&mlxsw_afa->set_ht, &set->ht_node, + mlxsw_afa_set_ht_params); + if (err) + return err; + err = mlxsw_afa->ops->kvdl_set_add(mlxsw_afa->ops_priv, + &set->kvdl_index, + set->ht_key.enc_actions, + set->ht_key.is_first); + if (err) + goto err_kvdl_set_add; + set->shared = true; + set->prev = NULL; + return 0; + +err_kvdl_set_add: + rhashtable_remove_fast(&mlxsw_afa->set_ht, &set->ht_node, + mlxsw_afa_set_ht_params); + return err; +} + +static void mlxsw_afa_set_unshare(struct 
mlxsw_afa *mlxsw_afa, + struct mlxsw_afa_set *set) +{ + mlxsw_afa->ops->kvdl_set_del(mlxsw_afa->ops_priv, + set->kvdl_index, + set->ht_key.is_first); + rhashtable_remove_fast(&mlxsw_afa->set_ht, &set->ht_node, + mlxsw_afa_set_ht_params); + set->shared = false; +} + +static void mlxsw_afa_set_put(struct mlxsw_afa *mlxsw_afa, + struct mlxsw_afa_set *set) +{ + if (--set->ref_count) + return; + if (set->shared) + mlxsw_afa_set_unshare(mlxsw_afa, set); + mlxsw_afa_set_destroy(set); +} + +static struct mlxsw_afa_set *mlxsw_afa_set_get(struct mlxsw_afa *mlxsw_afa, + struct mlxsw_afa_set *orig_set) +{ + struct mlxsw_afa_set *set; + int err; + + /* There is a hashtable of sets maintained. If a set with the exact + * same encoding exists, we reuse it. Otherwise, the current set + * is shared by making it available to others using the hash table. + */ + set = rhashtable_lookup_fast(&mlxsw_afa->set_ht, &orig_set->ht_key, + mlxsw_afa_set_ht_params); + if (set) { + set->ref_count++; + mlxsw_afa_set_put(mlxsw_afa, orig_set); + } else { + set = orig_set; + err = mlxsw_afa_set_share(mlxsw_afa, set); + if (err) + return ERR_PTR(err); + } + return set; +} + +/* Block structure holds a list of action sets. One action block + * represents one chain of actions executed upon match of a rule. + */ + +struct mlxsw_afa_block { + struct mlxsw_afa *afa; + bool finished; + struct mlxsw_afa_set *first_set; + struct mlxsw_afa_set *cur_set; + unsigned int cur_act_index; /* In current set. */ + struct list_head resource_list; /* List of resources held by actions + * in this block. + */ +}; + +struct mlxsw_afa_resource { + struct list_head list; + void (*destructor)(struct mlxsw_afa_block *block, + struct mlxsw_afa_resource *resource); +}; + +static void mlxsw_afa_resource_add(struct mlxsw_afa_block *block, + struct mlxsw_afa_resource *resource) +{ + list_add(&resource->list, &block->resource_list); +} + +static void mlxsw_afa_resources_destroy(struct mlxsw_afa_block *block) +{ + struct mlxsw_afa_resource *resource, *tmp; + + list_for_each_entry_safe(resource, tmp, &block->resource_list, list) { + list_del(&resource->list); + resource->destructor(block, resource); + } +} + +struct mlxsw_afa_block *mlxsw_afa_block_create(struct mlxsw_afa *mlxsw_afa) +{ + struct mlxsw_afa_block *block; + + block = kzalloc(sizeof(*block), GFP_KERNEL); + if (!block) + return NULL; + INIT_LIST_HEAD(&block->resource_list); + block->afa = mlxsw_afa; + + /* At least one action set is always present, so just create it here */ + block->first_set = mlxsw_afa_set_create(true); + if (!block->first_set) + goto err_first_set_create; + block->cur_set = block->first_set; + return block; + +err_first_set_create: + kfree(block); + return NULL; +} +EXPORT_SYMBOL(mlxsw_afa_block_create); + +void mlxsw_afa_block_destroy(struct mlxsw_afa_block *block) +{ + struct mlxsw_afa_set *set = block->first_set; + struct mlxsw_afa_set *next_set; + + do { + next_set = set->next; + mlxsw_afa_set_put(block->afa, set); + set = next_set; + } while (set); + mlxsw_afa_resources_destroy(block); + kfree(block); +} +EXPORT_SYMBOL(mlxsw_afa_block_destroy); + +int mlxsw_afa_block_commit(struct mlxsw_afa_block *block) +{ + struct mlxsw_afa_set *set = block->cur_set; + struct mlxsw_afa_set *prev_set; + + block->cur_set = NULL; + block->finished = true; + + /* Go over all linked sets starting from last + * and try to find existing set in the hash table. + * In case it is not there, assign a KVD linear index + * and insert it. 
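+ * For example, a block built as first_set -> A -> B (cur_set == B) is
+ * walked in the order B, A, first_set: B is deduplicated or inserted
+ * into the KVD linear area first, so that A's next_action_set_ptr can
+ * be filled with B's kvdl_index, and so on back to the first set, which
+ * stays inline in the rule.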
+ */ + do { + prev_set = set->prev; + set = mlxsw_afa_set_get(block->afa, set); + if (IS_ERR(set)) + /* No rollback is needed since the chain is + * in consistent state and mlxsw_afa_block_destroy + * will take care of putting it away. + */ + return PTR_ERR(set); + if (prev_set) { + prev_set->next = set; + mlxsw_afa_set_next_set(prev_set, set->kvdl_index); + set = prev_set; + } + } while (prev_set); + + block->first_set = set; + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_commit); + +char *mlxsw_afa_block_first_set(struct mlxsw_afa_block *block) +{ + return block->first_set->ht_key.enc_actions; +} +EXPORT_SYMBOL(mlxsw_afa_block_first_set); + +u32 mlxsw_afa_block_first_set_kvdl_index(struct mlxsw_afa_block *block) +{ + return block->first_set->kvdl_index; +} +EXPORT_SYMBOL(mlxsw_afa_block_first_set_kvdl_index); + +int mlxsw_afa_block_continue(struct mlxsw_afa_block *block) +{ + if (block->finished) + return -EINVAL; + mlxsw_afa_set_goto_set(block->cur_set, + MLXSW_AFA_SET_GOTO_BINDING_CMD_NONE, 0); + block->finished = true; + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_continue); + +int mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id) +{ + if (block->finished) + return -EINVAL; + mlxsw_afa_set_goto_set(block->cur_set, + MLXSW_AFA_SET_GOTO_BINDING_CMD_JUMP, group_id); + block->finished = true; + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_jump); + +int mlxsw_afa_block_terminate(struct mlxsw_afa_block *block) +{ + if (block->finished) + return -EINVAL; + mlxsw_afa_set_goto_set(block->cur_set, + MLXSW_AFA_SET_GOTO_BINDING_CMD_TERM, 0); + block->finished = true; + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_terminate); + +static struct mlxsw_afa_fwd_entry * +mlxsw_afa_fwd_entry_create(struct mlxsw_afa *mlxsw_afa, u8 local_port) +{ + struct mlxsw_afa_fwd_entry *fwd_entry; + int err; + + fwd_entry = kzalloc(sizeof(*fwd_entry), GFP_KERNEL); + if (!fwd_entry) + return ERR_PTR(-ENOMEM); + fwd_entry->ht_key.local_port = local_port; + fwd_entry->ref_count = 1; + + err = rhashtable_insert_fast(&mlxsw_afa->fwd_entry_ht, + &fwd_entry->ht_node, + mlxsw_afa_fwd_entry_ht_params); + if (err) + goto err_rhashtable_insert; + + err = mlxsw_afa->ops->kvdl_fwd_entry_add(mlxsw_afa->ops_priv, + &fwd_entry->kvdl_index, + local_port); + if (err) + goto err_kvdl_fwd_entry_add; + return fwd_entry; + +err_kvdl_fwd_entry_add: + rhashtable_remove_fast(&mlxsw_afa->fwd_entry_ht, &fwd_entry->ht_node, + mlxsw_afa_fwd_entry_ht_params); +err_rhashtable_insert: + kfree(fwd_entry); + return ERR_PTR(err); +} + +static void mlxsw_afa_fwd_entry_destroy(struct mlxsw_afa *mlxsw_afa, + struct mlxsw_afa_fwd_entry *fwd_entry) +{ + mlxsw_afa->ops->kvdl_fwd_entry_del(mlxsw_afa->ops_priv, + fwd_entry->kvdl_index); + rhashtable_remove_fast(&mlxsw_afa->fwd_entry_ht, &fwd_entry->ht_node, + mlxsw_afa_fwd_entry_ht_params); + kfree(fwd_entry); +} + +static struct mlxsw_afa_fwd_entry * +mlxsw_afa_fwd_entry_get(struct mlxsw_afa *mlxsw_afa, u8 local_port) +{ + struct mlxsw_afa_fwd_entry_ht_key ht_key = {0}; + struct mlxsw_afa_fwd_entry *fwd_entry; + + ht_key.local_port = local_port; + fwd_entry = rhashtable_lookup_fast(&mlxsw_afa->fwd_entry_ht, &ht_key, + mlxsw_afa_fwd_entry_ht_params); + if (fwd_entry) { + fwd_entry->ref_count++; + return fwd_entry; + } + return mlxsw_afa_fwd_entry_create(mlxsw_afa, local_port); +} + +static void mlxsw_afa_fwd_entry_put(struct mlxsw_afa *mlxsw_afa, + struct mlxsw_afa_fwd_entry *fwd_entry) +{ + if (--fwd_entry->ref_count) + return; + mlxsw_afa_fwd_entry_destroy(mlxsw_afa, fwd_entry); +} + +struct 
mlxsw_afa_fwd_entry_ref { + struct mlxsw_afa_resource resource; + struct mlxsw_afa_fwd_entry *fwd_entry; +}; + +static void +mlxsw_afa_fwd_entry_ref_destroy(struct mlxsw_afa_block *block, + struct mlxsw_afa_fwd_entry_ref *fwd_entry_ref) +{ + mlxsw_afa_fwd_entry_put(block->afa, fwd_entry_ref->fwd_entry); + kfree(fwd_entry_ref); +} + +static void +mlxsw_afa_fwd_entry_ref_destructor(struct mlxsw_afa_block *block, + struct mlxsw_afa_resource *resource) +{ + struct mlxsw_afa_fwd_entry_ref *fwd_entry_ref; + + fwd_entry_ref = container_of(resource, struct mlxsw_afa_fwd_entry_ref, + resource); + mlxsw_afa_fwd_entry_ref_destroy(block, fwd_entry_ref); +} + +static struct mlxsw_afa_fwd_entry_ref * +mlxsw_afa_fwd_entry_ref_create(struct mlxsw_afa_block *block, u8 local_port) +{ + struct mlxsw_afa_fwd_entry_ref *fwd_entry_ref; + struct mlxsw_afa_fwd_entry *fwd_entry; + int err; + + fwd_entry_ref = kzalloc(sizeof(*fwd_entry_ref), GFP_KERNEL); + if (!fwd_entry_ref) + return ERR_PTR(-ENOMEM); + fwd_entry = mlxsw_afa_fwd_entry_get(block->afa, local_port); + if (IS_ERR(fwd_entry)) { + err = PTR_ERR(fwd_entry); + goto err_fwd_entry_get; + } + fwd_entry_ref->fwd_entry = fwd_entry; + fwd_entry_ref->resource.destructor = mlxsw_afa_fwd_entry_ref_destructor; + mlxsw_afa_resource_add(block, &fwd_entry_ref->resource); + return fwd_entry_ref; + +err_fwd_entry_get: + kfree(fwd_entry_ref); + return ERR_PTR(err); +} + +struct mlxsw_afa_counter { + struct mlxsw_afa_resource resource; + u32 counter_index; +}; + +static void +mlxsw_afa_counter_destroy(struct mlxsw_afa_block *block, + struct mlxsw_afa_counter *counter) +{ + block->afa->ops->counter_index_put(block->afa->ops_priv, + counter->counter_index); + kfree(counter); +} + +static void +mlxsw_afa_counter_destructor(struct mlxsw_afa_block *block, + struct mlxsw_afa_resource *resource) +{ + struct mlxsw_afa_counter *counter; + + counter = container_of(resource, struct mlxsw_afa_counter, resource); + mlxsw_afa_counter_destroy(block, counter); +} + +static struct mlxsw_afa_counter * +mlxsw_afa_counter_create(struct mlxsw_afa_block *block) +{ + struct mlxsw_afa_counter *counter; + int err; + + counter = kzalloc(sizeof(*counter), GFP_KERNEL); + if (!counter) + return ERR_PTR(-ENOMEM); + + err = block->afa->ops->counter_index_get(block->afa->ops_priv, + &counter->counter_index); + if (err) + goto err_counter_index_get; + counter->resource.destructor = mlxsw_afa_counter_destructor; + mlxsw_afa_resource_add(block, &counter->resource); + return counter; + +err_counter_index_get: + kfree(counter); + return ERR_PTR(err); +} + +#define MLXSW_AFA_ONE_ACTION_LEN 32 +#define MLXSW_AFA_PAYLOAD_OFFSET 4 + +static char *mlxsw_afa_block_append_action(struct mlxsw_afa_block *block, + u8 action_code, u8 action_size) +{ + char *oneact; + char *actions; + + if (WARN_ON(block->finished)) + return NULL; + if (block->cur_act_index + action_size > + block->afa->max_acts_per_set) { + struct mlxsw_afa_set *set; + + /* The appended action won't fit into the current action set, + * so create a new set. 
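+ * For instance, with max_acts_per_set of three and single-record
+ * actions, the fourth append no longer fits: a fresh (non-first) set is
+ * created, linked to the current one via the prev/next pointers, and
+ * becomes the new cur_set with cur_act_index reset to zero.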
+ */ + set = mlxsw_afa_set_create(false); + if (!set) + return NULL; + set->prev = block->cur_set; + block->cur_act_index = 0; + block->cur_set->next = set; + block->cur_set = set; + } + + actions = block->cur_set->ht_key.enc_actions; + oneact = actions + block->cur_act_index * MLXSW_AFA_ONE_ACTION_LEN; + block->cur_act_index += action_size; + mlxsw_afa_all_action_type_set(oneact, action_code); + return oneact + MLXSW_AFA_PAYLOAD_OFFSET; +} + +/* VLAN Action + * ----------- + * VLAN action is used for manipulating VLANs. It can be used to implement QinQ, + * VLAN translation, change of PCP bits of the VLAN tag, push, pop as swap VLANs + * and more. + */ + +#define MLXSW_AFA_VLAN_CODE 0x02 +#define MLXSW_AFA_VLAN_SIZE 1 + +enum mlxsw_afa_vlan_vlan_tag_cmd { + MLXSW_AFA_VLAN_VLAN_TAG_CMD_NOP, + MLXSW_AFA_VLAN_VLAN_TAG_CMD_PUSH_TAG, + MLXSW_AFA_VLAN_VLAN_TAG_CMD_POP_TAG, +}; + +enum mlxsw_afa_vlan_cmd { + MLXSW_AFA_VLAN_CMD_NOP, + MLXSW_AFA_VLAN_CMD_SET_OUTER, + MLXSW_AFA_VLAN_CMD_SET_INNER, + MLXSW_AFA_VLAN_CMD_COPY_OUTER_TO_INNER, + MLXSW_AFA_VLAN_CMD_COPY_INNER_TO_OUTER, + MLXSW_AFA_VLAN_CMD_SWAP, +}; + +/* afa_vlan_vlan_tag_cmd + * Tag command: push, pop, nop VLAN header. + */ +MLXSW_ITEM32(afa, vlan, vlan_tag_cmd, 0x00, 29, 3); + +/* afa_vlan_vid_cmd */ +MLXSW_ITEM32(afa, vlan, vid_cmd, 0x04, 29, 3); + +/* afa_vlan_vid */ +MLXSW_ITEM32(afa, vlan, vid, 0x04, 0, 12); + +/* afa_vlan_ethertype_cmd */ +MLXSW_ITEM32(afa, vlan, ethertype_cmd, 0x08, 29, 3); + +/* afa_vlan_ethertype + * Index to EtherTypes in Switch VLAN EtherType Register (SVER). + */ +MLXSW_ITEM32(afa, vlan, ethertype, 0x08, 24, 3); + +/* afa_vlan_pcp_cmd */ +MLXSW_ITEM32(afa, vlan, pcp_cmd, 0x08, 13, 3); + +/* afa_vlan_pcp */ +MLXSW_ITEM32(afa, vlan, pcp, 0x08, 8, 3); + +static inline void +mlxsw_afa_vlan_pack(char *payload, + enum mlxsw_afa_vlan_vlan_tag_cmd vlan_tag_cmd, + enum mlxsw_afa_vlan_cmd vid_cmd, u16 vid, + enum mlxsw_afa_vlan_cmd pcp_cmd, u8 pcp, + enum mlxsw_afa_vlan_cmd ethertype_cmd, u8 ethertype) +{ + mlxsw_afa_vlan_vlan_tag_cmd_set(payload, vlan_tag_cmd); + mlxsw_afa_vlan_vid_cmd_set(payload, vid_cmd); + mlxsw_afa_vlan_vid_set(payload, vid); + mlxsw_afa_vlan_pcp_cmd_set(payload, pcp_cmd); + mlxsw_afa_vlan_pcp_set(payload, pcp); + mlxsw_afa_vlan_ethertype_cmd_set(payload, ethertype_cmd); + mlxsw_afa_vlan_ethertype_set(payload, ethertype); +} + +int mlxsw_afa_block_append_vlan_modify(struct mlxsw_afa_block *block, + u16 vid, u8 pcp, u8 et) +{ + char *act = mlxsw_afa_block_append_action(block, + MLXSW_AFA_VLAN_CODE, + MLXSW_AFA_VLAN_SIZE); + + if (!act) + return -ENOBUFS; + mlxsw_afa_vlan_pack(act, MLXSW_AFA_VLAN_VLAN_TAG_CMD_NOP, + MLXSW_AFA_VLAN_CMD_SET_OUTER, vid, + MLXSW_AFA_VLAN_CMD_SET_OUTER, pcp, + MLXSW_AFA_VLAN_CMD_SET_OUTER, et); + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_vlan_modify); + +/* Trap / Discard Action + * --------------------- + * The Trap / Discard action enables trapping / mirroring packets to the CPU + * as well as discarding packets. + * The ACL Trap / Discard separates the forward/discard control from CPU + * trap control. In addition, the Trap / Discard action enables activating + * SPAN (port mirroring). + */ + +#define MLXSW_AFA_TRAPDISC_CODE 0x03 +#define MLXSW_AFA_TRAPDISC_SIZE 1 + +enum mlxsw_afa_trapdisc_trap_action { + MLXSW_AFA_TRAPDISC_TRAP_ACTION_NOP = 0, + MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP = 2, +}; + +/* afa_trapdisc_trap_action + * Trap Action. 
+ */ +MLXSW_ITEM32(afa, trapdisc, trap_action, 0x00, 24, 4); + +enum mlxsw_afa_trapdisc_forward_action { + MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD = 1, + MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD = 3, +}; + +/* afa_trapdisc_forward_action + * Forward Action. + */ +MLXSW_ITEM32(afa, trapdisc, forward_action, 0x00, 0, 4); + +/* afa_trapdisc_trap_id + * Trap ID to configure. + */ +MLXSW_ITEM32(afa, trapdisc, trap_id, 0x04, 0, 9); + +/* afa_trapdisc_mirror_agent + * Mirror agent. + */ +MLXSW_ITEM32(afa, trapdisc, mirror_agent, 0x08, 29, 3); + +/* afa_trapdisc_mirror_enable + * Mirror enable. + */ +MLXSW_ITEM32(afa, trapdisc, mirror_enable, 0x08, 24, 1); + +static inline void +mlxsw_afa_trapdisc_pack(char *payload, + enum mlxsw_afa_trapdisc_trap_action trap_action, + enum mlxsw_afa_trapdisc_forward_action forward_action, + u16 trap_id) +{ + mlxsw_afa_trapdisc_trap_action_set(payload, trap_action); + mlxsw_afa_trapdisc_forward_action_set(payload, forward_action); + mlxsw_afa_trapdisc_trap_id_set(payload, trap_id); +} + +static inline void +mlxsw_afa_trapdisc_mirror_pack(char *payload, bool mirror_enable, + u8 mirror_agent) +{ + mlxsw_afa_trapdisc_mirror_enable_set(payload, mirror_enable); + mlxsw_afa_trapdisc_mirror_agent_set(payload, mirror_agent); +} + +int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block) +{ + char *act = mlxsw_afa_block_append_action(block, + MLXSW_AFA_TRAPDISC_CODE, + MLXSW_AFA_TRAPDISC_SIZE); + + if (!act) + return -ENOBUFS; + mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_NOP, + MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD, 0); + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_drop); + +int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id) +{ + char *act = mlxsw_afa_block_append_action(block, + MLXSW_AFA_TRAPDISC_CODE, + MLXSW_AFA_TRAPDISC_SIZE); + + if (!act) + return -ENOBUFS; + mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP, + MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD, + trap_id); + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_trap); + +int mlxsw_afa_block_append_trap_and_forward(struct mlxsw_afa_block *block, + u16 trap_id) +{ + char *act = mlxsw_afa_block_append_action(block, + MLXSW_AFA_TRAPDISC_CODE, + MLXSW_AFA_TRAPDISC_SIZE); + + if (!act) + return -ENOBUFS; + mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP, + MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD, + trap_id); + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_trap_and_forward); + +struct mlxsw_afa_mirror { + struct mlxsw_afa_resource resource; + int span_id; + u8 local_in_port; + bool ingress; +}; + +static void +mlxsw_afa_mirror_destroy(struct mlxsw_afa_block *block, + struct mlxsw_afa_mirror *mirror) +{ + block->afa->ops->mirror_del(block->afa->ops_priv, + mirror->local_in_port, + mirror->span_id, + mirror->ingress); + kfree(mirror); +} + +static void +mlxsw_afa_mirror_destructor(struct mlxsw_afa_block *block, + struct mlxsw_afa_resource *resource) +{ + struct mlxsw_afa_mirror *mirror; + + mirror = container_of(resource, struct mlxsw_afa_mirror, resource); + mlxsw_afa_mirror_destroy(block, mirror); +} + +static struct mlxsw_afa_mirror * +mlxsw_afa_mirror_create(struct mlxsw_afa_block *block, u8 local_in_port, + const struct net_device *out_dev, bool ingress) +{ + struct mlxsw_afa_mirror *mirror; + int err; + + mirror = kzalloc(sizeof(*mirror), GFP_KERNEL); + if (!mirror) + return ERR_PTR(-ENOMEM); + + err = block->afa->ops->mirror_add(block->afa->ops_priv, + local_in_port, out_dev, + ingress, &mirror->span_id); + if 
(err) + goto err_mirror_add; + + mirror->ingress = ingress; + mirror->local_in_port = local_in_port; + mirror->resource.destructor = mlxsw_afa_mirror_destructor; + mlxsw_afa_resource_add(block, &mirror->resource); + return mirror; + +err_mirror_add: + kfree(mirror); + return ERR_PTR(err); +} + +static int +mlxsw_afa_block_append_allocated_mirror(struct mlxsw_afa_block *block, + u8 mirror_agent) +{ + char *act = mlxsw_afa_block_append_action(block, + MLXSW_AFA_TRAPDISC_CODE, + MLXSW_AFA_TRAPDISC_SIZE); + if (!act) + return -ENOBUFS; + mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_NOP, + MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD, 0); + mlxsw_afa_trapdisc_mirror_pack(act, true, mirror_agent); + return 0; +} + +int +mlxsw_afa_block_append_mirror(struct mlxsw_afa_block *block, u8 local_in_port, + const struct net_device *out_dev, bool ingress) +{ + struct mlxsw_afa_mirror *mirror; + int err; + + mirror = mlxsw_afa_mirror_create(block, local_in_port, out_dev, + ingress); + if (IS_ERR(mirror)) + return PTR_ERR(mirror); + + err = mlxsw_afa_block_append_allocated_mirror(block, mirror->span_id); + if (err) + goto err_append_allocated_mirror; + + return 0; + +err_append_allocated_mirror: + mlxsw_afa_mirror_destroy(block, mirror); + return err; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_mirror); + +/* Forwarding Action + * ----------------- + * Forwarding Action can be used to implement Policy Based Switching (PBS) + * as well as OpenFlow related "Output" action. + */ + +#define MLXSW_AFA_FORWARD_CODE 0x07 +#define MLXSW_AFA_FORWARD_SIZE 1 + +enum mlxsw_afa_forward_type { + /* PBS, Policy Based Switching */ + MLXSW_AFA_FORWARD_TYPE_PBS, + /* Output, OpenFlow output type */ + MLXSW_AFA_FORWARD_TYPE_OUTPUT, +}; + +/* afa_forward_type */ +MLXSW_ITEM32(afa, forward, type, 0x00, 24, 2); + +/* afa_forward_pbs_ptr + * A pointer to the PBS entry configured by PPBS register. + * Reserved when in_port is set. + */ +MLXSW_ITEM32(afa, forward, pbs_ptr, 0x08, 0, 24); + +/* afa_forward_in_port + * Packet is forwarded back to the ingress port. + */ +MLXSW_ITEM32(afa, forward, in_port, 0x0C, 0, 1); + +static inline void +mlxsw_afa_forward_pack(char *payload, enum mlxsw_afa_forward_type type, + u32 pbs_ptr, bool in_port) +{ + mlxsw_afa_forward_type_set(payload, type); + mlxsw_afa_forward_pbs_ptr_set(payload, pbs_ptr); + mlxsw_afa_forward_in_port_set(payload, in_port); +} + +int mlxsw_afa_block_append_fwd(struct mlxsw_afa_block *block, + u8 local_port, bool in_port) +{ + struct mlxsw_afa_fwd_entry_ref *fwd_entry_ref; + u32 kvdl_index; + char *act; + int err; + + if (in_port) + return -EOPNOTSUPP; + fwd_entry_ref = mlxsw_afa_fwd_entry_ref_create(block, local_port); + if (IS_ERR(fwd_entry_ref)) + return PTR_ERR(fwd_entry_ref); + kvdl_index = fwd_entry_ref->fwd_entry->kvdl_index; + + act = mlxsw_afa_block_append_action(block, MLXSW_AFA_FORWARD_CODE, + MLXSW_AFA_FORWARD_SIZE); + if (!act) { + err = -ENOBUFS; + goto err_append_action; + } + mlxsw_afa_forward_pack(act, MLXSW_AFA_FORWARD_TYPE_PBS, + kvdl_index, in_port); + return 0; + +err_append_action: + mlxsw_afa_fwd_entry_ref_destroy(block, fwd_entry_ref); + return err; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_fwd); + +/* Policing and Counting Action + * ---------------------------- + * Policing and Counting action is used for binding policer and counter + * to ACL rules. 
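+ * As a rough usage sketch, a caller that owns an action block might do:
+ *
+ *	u32 counter_index;
+ *	int err = mlxsw_afa_block_append_counter(block, &counter_index);
+ *
+ * which allocates a flow counter through ops->counter_index_get() and
+ * emits a single polcnt record; the returned index lets the caller
+ * query the hardware counter bound to the rule later on.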
+ */ + +#define MLXSW_AFA_POLCNT_CODE 0x08 +#define MLXSW_AFA_POLCNT_SIZE 1 + +enum mlxsw_afa_polcnt_counter_set_type { + /* No count */ + MLXSW_AFA_POLCNT_COUNTER_SET_TYPE_NO_COUNT = 0x00, + /* Count packets and bytes */ + MLXSW_AFA_POLCNT_COUNTER_SET_TYPE_PACKETS_BYTES = 0x03, + /* Count only packets */ + MLXSW_AFA_POLCNT_COUNTER_SET_TYPE_PACKETS = 0x05, +}; + +/* afa_polcnt_counter_set_type + * Counter set type for flow counters. + */ +MLXSW_ITEM32(afa, polcnt, counter_set_type, 0x04, 24, 8); + +/* afa_polcnt_counter_index + * Counter index for flow counters. + */ +MLXSW_ITEM32(afa, polcnt, counter_index, 0x04, 0, 24); + +static inline void +mlxsw_afa_polcnt_pack(char *payload, + enum mlxsw_afa_polcnt_counter_set_type set_type, + u32 counter_index) +{ + mlxsw_afa_polcnt_counter_set_type_set(payload, set_type); + mlxsw_afa_polcnt_counter_index_set(payload, counter_index); +} + +int mlxsw_afa_block_append_allocated_counter(struct mlxsw_afa_block *block, + u32 counter_index) +{ + char *act = mlxsw_afa_block_append_action(block, MLXSW_AFA_POLCNT_CODE, + MLXSW_AFA_POLCNT_SIZE); + if (!act) + return -ENOBUFS; + mlxsw_afa_polcnt_pack(act, MLXSW_AFA_POLCNT_COUNTER_SET_TYPE_PACKETS_BYTES, + counter_index); + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_allocated_counter); + +int mlxsw_afa_block_append_counter(struct mlxsw_afa_block *block, + u32 *p_counter_index) +{ + struct mlxsw_afa_counter *counter; + u32 counter_index; + int err; + + counter = mlxsw_afa_counter_create(block); + if (IS_ERR(counter)) + return PTR_ERR(counter); + counter_index = counter->counter_index; + + err = mlxsw_afa_block_append_allocated_counter(block, counter_index); + if (err) + goto err_append_allocated_counter; + + if (p_counter_index) + *p_counter_index = counter_index; + return 0; + +err_append_allocated_counter: + mlxsw_afa_counter_destroy(block, counter); + return err; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_counter); + +/* Virtual Router and Forwarding Domain Action + * ------------------------------------------- + * Virtual Switch action is used for manipulate the Virtual Router (VR), + * MPLS label space and the Forwarding Identifier (FID). + */ + +#define MLXSW_AFA_VIRFWD_CODE 0x0E +#define MLXSW_AFA_VIRFWD_SIZE 1 + +enum mlxsw_afa_virfwd_fid_cmd { + /* Do nothing */ + MLXSW_AFA_VIRFWD_FID_CMD_NOOP, + /* Set the Forwarding Identifier (FID) to fid */ + MLXSW_AFA_VIRFWD_FID_CMD_SET, +}; + +/* afa_virfwd_fid_cmd */ +MLXSW_ITEM32(afa, virfwd, fid_cmd, 0x08, 29, 3); + +/* afa_virfwd_fid + * The FID value. + */ +MLXSW_ITEM32(afa, virfwd, fid, 0x08, 0, 16); + +static inline void mlxsw_afa_virfwd_pack(char *payload, + enum mlxsw_afa_virfwd_fid_cmd fid_cmd, + u16 fid) +{ + mlxsw_afa_virfwd_fid_cmd_set(payload, fid_cmd); + mlxsw_afa_virfwd_fid_set(payload, fid); +} + +int mlxsw_afa_block_append_fid_set(struct mlxsw_afa_block *block, u16 fid) +{ + char *act = mlxsw_afa_block_append_action(block, + MLXSW_AFA_VIRFWD_CODE, + MLXSW_AFA_VIRFWD_SIZE); + if (!act) + return -ENOBUFS; + mlxsw_afa_virfwd_pack(act, MLXSW_AFA_VIRFWD_FID_CMD_SET, fid); + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_fid_set); + +/* MC Routing Action + * ----------------- + * The Multicast router action. Can be used by RMFT_V2 - Router Multicast + * Forwarding Table Version 2 Register. 
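+ * A minimal sketch of appending such an action (argument values are
+ * whatever the caller resolved for the route):
+ *
+ *	err = mlxsw_afa_block_append_mcrouter(block, expected_irif,
+ *					      min_mtu, rmid_valid,
+ *					      kvdl_index);
+ *
+ * which emits a two-record MC router action with the RPF action set to
+ * trap.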
+ */ + +#define MLXSW_AFA_MCROUTER_CODE 0x10 +#define MLXSW_AFA_MCROUTER_SIZE 2 + +enum mlxsw_afa_mcrouter_rpf_action { + MLXSW_AFA_MCROUTER_RPF_ACTION_NOP, + MLXSW_AFA_MCROUTER_RPF_ACTION_TRAP, + MLXSW_AFA_MCROUTER_RPF_ACTION_DISCARD_ERROR, +}; + +/* afa_mcrouter_rpf_action */ +MLXSW_ITEM32(afa, mcrouter, rpf_action, 0x00, 28, 3); + +/* afa_mcrouter_expected_irif */ +MLXSW_ITEM32(afa, mcrouter, expected_irif, 0x00, 0, 16); + +/* afa_mcrouter_min_mtu */ +MLXSW_ITEM32(afa, mcrouter, min_mtu, 0x08, 0, 16); + +enum mlxsw_afa_mrouter_vrmid { + MLXSW_AFA_MCROUTER_VRMID_INVALID, + MLXSW_AFA_MCROUTER_VRMID_VALID +}; + +/* afa_mcrouter_vrmid + * Valid RMID: rigr_rmid_index is used as RMID + */ +MLXSW_ITEM32(afa, mcrouter, vrmid, 0x0C, 31, 1); + +/* afa_mcrouter_rigr_rmid_index + * When the vrmid field is set to invalid, the field is used as pointer to + * Router Interface Group (RIGR) Table in the KVD linear. + * When the vrmid is set to valid, the field is used as RMID index, ranged + * from 0 to max_mid - 1. The index is to the Port Group Table. + */ +MLXSW_ITEM32(afa, mcrouter, rigr_rmid_index, 0x0C, 0, 24); + +static inline void +mlxsw_afa_mcrouter_pack(char *payload, + enum mlxsw_afa_mcrouter_rpf_action rpf_action, + u16 expected_irif, u16 min_mtu, + enum mlxsw_afa_mrouter_vrmid vrmid, u32 rigr_rmid_index) + +{ + mlxsw_afa_mcrouter_rpf_action_set(payload, rpf_action); + mlxsw_afa_mcrouter_expected_irif_set(payload, expected_irif); + mlxsw_afa_mcrouter_min_mtu_set(payload, min_mtu); + mlxsw_afa_mcrouter_vrmid_set(payload, vrmid); + mlxsw_afa_mcrouter_rigr_rmid_index_set(payload, rigr_rmid_index); +} + +int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block, + u16 expected_irif, u16 min_mtu, + bool rmid_valid, u32 kvdl_index) +{ + char *act = mlxsw_afa_block_append_action(block, + MLXSW_AFA_MCROUTER_CODE, + MLXSW_AFA_MCROUTER_SIZE); + if (!act) + return -ENOBUFS; + mlxsw_afa_mcrouter_pack(act, MLXSW_AFA_MCROUTER_RPF_ACTION_TRAP, + expected_irif, min_mtu, rmid_valid, kvdl_index); + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_mcrouter); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h new file mode 100644 index 0000000..3a155d1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h @@ -0,0 +1,92 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_CORE_ACL_FLEX_ACTIONS_H +#define _MLXSW_CORE_ACL_FLEX_ACTIONS_H + +#include +#include + +struct mlxsw_afa; +struct mlxsw_afa_block; + +struct mlxsw_afa_ops { + int (*kvdl_set_add)(void *priv, u32 *p_kvdl_index, + char *enc_actions, bool is_first); + void (*kvdl_set_del)(void *priv, u32 kvdl_index, bool is_first); + int (*kvdl_fwd_entry_add)(void *priv, u32 *p_kvdl_index, u8 local_port); + void (*kvdl_fwd_entry_del)(void *priv, u32 kvdl_index); + int (*counter_index_get)(void *priv, unsigned int *p_counter_index); + void (*counter_index_put)(void *priv, unsigned int counter_index); + int (*mirror_add)(void *priv, u8 local_in_port, + const struct net_device *out_dev, + bool ingress, int *p_span_id); + void (*mirror_del)(void *priv, u8 local_in_port, int span_id, + bool ingress); +}; + +struct mlxsw_afa *mlxsw_afa_create(unsigned int max_acts_per_set, + const struct mlxsw_afa_ops *ops, + void *ops_priv); +void mlxsw_afa_destroy(struct mlxsw_afa *mlxsw_afa); +struct mlxsw_afa_block *mlxsw_afa_block_create(struct mlxsw_afa *mlxsw_afa); +void mlxsw_afa_block_destroy(struct mlxsw_afa_block *block); +int mlxsw_afa_block_commit(struct mlxsw_afa_block *block); +char *mlxsw_afa_block_first_set(struct mlxsw_afa_block *block); +u32 mlxsw_afa_block_first_set_kvdl_index(struct mlxsw_afa_block *block); +int mlxsw_afa_block_continue(struct mlxsw_afa_block *block); +int mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id); +int mlxsw_afa_block_terminate(struct mlxsw_afa_block *block); +int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block); +int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id); +int mlxsw_afa_block_append_trap_and_forward(struct mlxsw_afa_block *block, + u16 trap_id); +int mlxsw_afa_block_append_mirror(struct mlxsw_afa_block *block, + u8 local_in_port, + const struct net_device *out_dev, + bool ingress); +int mlxsw_afa_block_append_fwd(struct mlxsw_afa_block *block, + u8 local_port, bool in_port); +int mlxsw_afa_block_append_vlan_modify(struct mlxsw_afa_block *block, + u16 vid, u8 pcp, u8 et); +int mlxsw_afa_block_append_allocated_counter(struct mlxsw_afa_block *block, + u32 counter_index); +int mlxsw_afa_block_append_counter(struct mlxsw_afa_block *block, + u32 *p_counter_index); +int mlxsw_afa_block_append_fid_set(struct mlxsw_afa_block *block, u16 fid); +int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block, + u16 expected_irif, u16 min_mtu, + bool rmid_valid, u32 kvdl_index); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c new file mode 100644 index 0000000..b32a009 --- /dev/null +++ 
b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c @@ -0,0 +1,475 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include + +#include "item.h" +#include "core_acl_flex_keys.h" + +struct mlxsw_afk { + struct list_head key_info_list; + unsigned int max_blocks; + const struct mlxsw_afk_block *blocks; + unsigned int blocks_count; +}; + +static bool mlxsw_afk_blocks_check(struct mlxsw_afk *mlxsw_afk) +{ + int i; + int j; + + for (i = 0; i < mlxsw_afk->blocks_count; i++) { + const struct mlxsw_afk_block *block = &mlxsw_afk->blocks[i]; + + for (j = 0; j < block->instances_count; j++) { + struct mlxsw_afk_element_inst *elinst; + + elinst = &block->instances[j]; + if (elinst->type != elinst->info->type || + elinst->item.size.bits != + elinst->info->item.size.bits) + return false; + } + } + return true; +} + +struct mlxsw_afk *mlxsw_afk_create(unsigned int max_blocks, + const struct mlxsw_afk_block *blocks, + unsigned int blocks_count) +{ + struct mlxsw_afk *mlxsw_afk; + + mlxsw_afk = kzalloc(sizeof(*mlxsw_afk), GFP_KERNEL); + if (!mlxsw_afk) + return NULL; + INIT_LIST_HEAD(&mlxsw_afk->key_info_list); + mlxsw_afk->max_blocks = max_blocks; + mlxsw_afk->blocks = blocks; + mlxsw_afk->blocks_count = blocks_count; + WARN_ON(!mlxsw_afk_blocks_check(mlxsw_afk)); + return mlxsw_afk; +} +EXPORT_SYMBOL(mlxsw_afk_create); + +void mlxsw_afk_destroy(struct mlxsw_afk *mlxsw_afk) +{ + WARN_ON(!list_empty(&mlxsw_afk->key_info_list)); + kfree(mlxsw_afk); +} +EXPORT_SYMBOL(mlxsw_afk_destroy); + +struct mlxsw_afk_key_info { + struct list_head list; + unsigned int ref_count; + unsigned int blocks_count; + int element_to_block[MLXSW_AFK_ELEMENT_MAX]; /* index is element, value + * is index inside "blocks" + */ + struct mlxsw_afk_element_usage elusage; + const struct mlxsw_afk_block *blocks[0]; +}; + +static bool +mlxsw_afk_key_info_elements_eq(struct mlxsw_afk_key_info *key_info, + struct mlxsw_afk_element_usage *elusage) +{ + return memcmp(&key_info->elusage, elusage, sizeof(*elusage)) == 0; +} + +static struct mlxsw_afk_key_info * +mlxsw_afk_key_info_find(struct mlxsw_afk *mlxsw_afk, + struct mlxsw_afk_element_usage *elusage) +{ + struct mlxsw_afk_key_info *key_info; + + list_for_each_entry(key_info, &mlxsw_afk->key_info_list, list) { + if (mlxsw_afk_key_info_elements_eq(key_info, elusage)) + return key_info; + } + return NULL; +} + +struct mlxsw_afk_picker { + struct { + DECLARE_BITMAP(element, MLXSW_AFK_ELEMENT_MAX); + unsigned int total; + } hits[0]; +}; + +static void mlxsw_afk_picker_count_hits(struct mlxsw_afk *mlxsw_afk, + struct mlxsw_afk_picker *picker, + enum mlxsw_afk_element element) +{ + int i; + int j; + + for (i = 0; i < mlxsw_afk->blocks_count; i++) { + const struct mlxsw_afk_block *block = &mlxsw_afk->blocks[i]; + + for (j = 0; j < block->instances_count; j++) { + struct mlxsw_afk_element_inst *elinst; + + elinst = &block->instances[j]; + if (elinst->info->element == element) { + __set_bit(element, picker->hits[i].element); + picker->hits[i].total++; + } + } + } +} + +static void mlxsw_afk_picker_subtract_hits(struct mlxsw_afk *mlxsw_afk, + struct mlxsw_afk_picker *picker, + int block_index) +{ + DECLARE_BITMAP(hits_element, MLXSW_AFK_ELEMENT_MAX); + int i; + int j; + + memcpy(&hits_element, &picker->hits[block_index].element, + sizeof(hits_element)); + + for (i = 0; i < mlxsw_afk->blocks_count; i++) { + for_each_set_bit(j, hits_element, MLXSW_AFK_ELEMENT_MAX) { + if (__test_and_clear_bit(j, picker->hits[i].element)) + picker->hits[i].total--; + } + } +} + +static int mlxsw_afk_picker_most_hits_get(struct mlxsw_afk *mlxsw_afk, + struct mlxsw_afk_picker *picker) +{ + int 
most_index = -EINVAL; /* Should never happen to return this */ + int most_hits = 0; + int i; + + for (i = 0; i < mlxsw_afk->blocks_count; i++) { + if (picker->hits[i].total > most_hits) { + most_hits = picker->hits[i].total; + most_index = i; + } + } + return most_index; +} + +static int mlxsw_afk_picker_key_info_add(struct mlxsw_afk *mlxsw_afk, + struct mlxsw_afk_picker *picker, + int block_index, + struct mlxsw_afk_key_info *key_info) +{ + enum mlxsw_afk_element element; + + if (key_info->blocks_count == mlxsw_afk->max_blocks) + return -EINVAL; + + for_each_set_bit(element, picker->hits[block_index].element, + MLXSW_AFK_ELEMENT_MAX) { + key_info->element_to_block[element] = key_info->blocks_count; + mlxsw_afk_element_usage_add(&key_info->elusage, element); + } + + key_info->blocks[key_info->blocks_count] = + &mlxsw_afk->blocks[block_index]; + key_info->blocks_count++; + return 0; +} + +static int mlxsw_afk_picker(struct mlxsw_afk *mlxsw_afk, + struct mlxsw_afk_key_info *key_info, + struct mlxsw_afk_element_usage *elusage) +{ + struct mlxsw_afk_picker *picker; + enum mlxsw_afk_element element; + size_t alloc_size; + int err; + + alloc_size = sizeof(picker->hits[0]) * mlxsw_afk->blocks_count; + picker = kzalloc(alloc_size, GFP_KERNEL); + if (!picker) + return -ENOMEM; + + /* Since the same elements could be present in multiple blocks, + * we must find out optimal block list in order to make the + * block count as low as possible. + * + * First, we count hits. We go over all available blocks and count + * how many of requested elements are covered by each. + * + * Then in loop, we find block with most hits and add it to + * output key_info. Then we have to subtract this block hits so + * the next iteration will find most suitable block for + * the rest of requested elements. 
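+ * For example, if the requested elements are {SMAC, DMAC, ETHERTYPE}
+ * and (illustratively) one block covers {SMAC, DMAC} while another
+ * covers {DMAC, ETHERTYPE}, the first one is picked with two hits;
+ * after its hits are subtracted, the second block is left covering only
+ * ETHERTYPE and is picked next, yielding a two-block key.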
+ */ + + mlxsw_afk_element_usage_for_each(element, elusage) + mlxsw_afk_picker_count_hits(mlxsw_afk, picker, element); + + do { + int block_index; + + block_index = mlxsw_afk_picker_most_hits_get(mlxsw_afk, picker); + if (block_index < 0) { + err = block_index; + goto out; + } + err = mlxsw_afk_picker_key_info_add(mlxsw_afk, picker, + block_index, key_info); + if (err) + goto out; + mlxsw_afk_picker_subtract_hits(mlxsw_afk, picker, block_index); + } while (!mlxsw_afk_key_info_elements_eq(key_info, elusage)); + + err = 0; +out: + kfree(picker); + return err; +} + +static struct mlxsw_afk_key_info * +mlxsw_afk_key_info_create(struct mlxsw_afk *mlxsw_afk, + struct mlxsw_afk_element_usage *elusage) +{ + struct mlxsw_afk_key_info *key_info; + size_t alloc_size; + int err; + + alloc_size = sizeof(*key_info) + + sizeof(key_info->blocks[0]) * mlxsw_afk->max_blocks; + key_info = kzalloc(alloc_size, GFP_KERNEL); + if (!key_info) + return ERR_PTR(-ENOMEM); + err = mlxsw_afk_picker(mlxsw_afk, key_info, elusage); + if (err) + goto err_picker; + list_add(&key_info->list, &mlxsw_afk->key_info_list); + key_info->ref_count = 1; + return key_info; + +err_picker: + kfree(key_info); + return ERR_PTR(err); +} + +static void mlxsw_afk_key_info_destroy(struct mlxsw_afk_key_info *key_info) +{ + list_del(&key_info->list); + kfree(key_info); +} + +struct mlxsw_afk_key_info * +mlxsw_afk_key_info_get(struct mlxsw_afk *mlxsw_afk, + struct mlxsw_afk_element_usage *elusage) +{ + struct mlxsw_afk_key_info *key_info; + + key_info = mlxsw_afk_key_info_find(mlxsw_afk, elusage); + if (key_info) { + key_info->ref_count++; + return key_info; + } + return mlxsw_afk_key_info_create(mlxsw_afk, elusage); +} +EXPORT_SYMBOL(mlxsw_afk_key_info_get); + +void mlxsw_afk_key_info_put(struct mlxsw_afk_key_info *key_info) +{ + if (--key_info->ref_count) + return; + mlxsw_afk_key_info_destroy(key_info); +} +EXPORT_SYMBOL(mlxsw_afk_key_info_put); + +bool mlxsw_afk_key_info_subset(struct mlxsw_afk_key_info *key_info, + struct mlxsw_afk_element_usage *elusage) +{ + return mlxsw_afk_element_usage_subset(elusage, &key_info->elusage); +} +EXPORT_SYMBOL(mlxsw_afk_key_info_subset); + +static const struct mlxsw_afk_element_inst * +mlxsw_afk_block_elinst_get(const struct mlxsw_afk_block *block, + enum mlxsw_afk_element element) +{ + int i; + + for (i = 0; i < block->instances_count; i++) { + struct mlxsw_afk_element_inst *elinst; + + elinst = &block->instances[i]; + if (elinst->info->element == element) + return elinst; + } + return NULL; +} + +static const struct mlxsw_afk_element_inst * +mlxsw_afk_key_info_elinst_get(struct mlxsw_afk_key_info *key_info, + enum mlxsw_afk_element element, + int *p_block_index) +{ + const struct mlxsw_afk_element_inst *elinst; + const struct mlxsw_afk_block *block; + int block_index; + + if (WARN_ON(!test_bit(element, key_info->elusage.usage))) + return NULL; + block_index = key_info->element_to_block[element]; + block = key_info->blocks[block_index]; + + elinst = mlxsw_afk_block_elinst_get(block, element); + if (WARN_ON(!elinst)) + return NULL; + + *p_block_index = block_index; + return elinst; +} + +u16 +mlxsw_afk_key_info_block_encoding_get(const struct mlxsw_afk_key_info *key_info, + int block_index) +{ + return key_info->blocks[block_index]->encoding; +} +EXPORT_SYMBOL(mlxsw_afk_key_info_block_encoding_get); + +unsigned int +mlxsw_afk_key_info_blocks_count_get(const struct mlxsw_afk_key_info *key_info) +{ + return key_info->blocks_count; +} +EXPORT_SYMBOL(mlxsw_afk_key_info_blocks_count_get); + +void 
mlxsw_afk_values_add_u32(struct mlxsw_afk_element_values *values, + enum mlxsw_afk_element element, + u32 key_value, u32 mask_value) +{ + const struct mlxsw_afk_element_info *elinfo = + &mlxsw_afk_element_infos[element]; + const struct mlxsw_item *storage_item = &elinfo->item; + + if (!mask_value) + return; + if (WARN_ON(elinfo->type != MLXSW_AFK_ELEMENT_TYPE_U32)) + return; + __mlxsw_item_set32(values->storage.key, storage_item, 0, key_value); + __mlxsw_item_set32(values->storage.mask, storage_item, 0, mask_value); + mlxsw_afk_element_usage_add(&values->elusage, element); +} +EXPORT_SYMBOL(mlxsw_afk_values_add_u32); + +void mlxsw_afk_values_add_buf(struct mlxsw_afk_element_values *values, + enum mlxsw_afk_element element, + const char *key_value, const char *mask_value, + unsigned int len) +{ + const struct mlxsw_afk_element_info *elinfo = + &mlxsw_afk_element_infos[element]; + const struct mlxsw_item *storage_item = &elinfo->item; + + if (!memchr_inv(mask_value, 0, len)) /* If mask is zero */ + return; + if (WARN_ON(elinfo->type != MLXSW_AFK_ELEMENT_TYPE_BUF) || + WARN_ON(elinfo->item.size.bytes != len)) + return; + __mlxsw_item_memcpy_to(values->storage.key, key_value, + storage_item, 0); + __mlxsw_item_memcpy_to(values->storage.mask, mask_value, + storage_item, 0); + mlxsw_afk_element_usage_add(&values->elusage, element); +} +EXPORT_SYMBOL(mlxsw_afk_values_add_buf); + +static void mlxsw_afk_encode_u32(const struct mlxsw_item *storage_item, + const struct mlxsw_item *output_item, + char *storage, char *output_indexed) +{ + u32 value; + + value = __mlxsw_item_get32(storage, storage_item, 0); + __mlxsw_item_set32(output_indexed, output_item, 0, value); +} + +static void mlxsw_afk_encode_buf(const struct mlxsw_item *storage_item, + const struct mlxsw_item *output_item, + char *storage, char *output_indexed) +{ + char *storage_data = __mlxsw_item_data(storage, storage_item, 0); + char *output_data = __mlxsw_item_data(output_indexed, output_item, 0); + size_t len = output_item->size.bytes; + + memcpy(output_data, storage_data, len); +} + +#define MLXSW_AFK_KEY_BLOCK_SIZE 16 + +static void mlxsw_afk_encode_one(const struct mlxsw_afk_element_inst *elinst, + int block_index, char *storage, char *output) +{ + char *output_indexed = output + block_index * MLXSW_AFK_KEY_BLOCK_SIZE; + const struct mlxsw_item *storage_item = &elinst->info->item; + const struct mlxsw_item *output_item = &elinst->item; + + if (elinst->type == MLXSW_AFK_ELEMENT_TYPE_U32) + mlxsw_afk_encode_u32(storage_item, output_item, + storage, output_indexed); + else if (elinst->type == MLXSW_AFK_ELEMENT_TYPE_BUF) + mlxsw_afk_encode_buf(storage_item, output_item, + storage, output_indexed); +} + +void mlxsw_afk_encode(struct mlxsw_afk_key_info *key_info, + struct mlxsw_afk_element_values *values, + char *key, char *mask) +{ + const struct mlxsw_afk_element_inst *elinst; + enum mlxsw_afk_element element; + int block_index; + + mlxsw_afk_element_usage_for_each(element, &values->elusage) { + elinst = mlxsw_afk_key_info_elinst_get(key_info, element, + &block_index); + if (!elinst) + continue; + mlxsw_afk_encode_one(elinst, block_index, + values->storage.key, key); + mlxsw_afk_encode_one(elinst, block_index, + values->storage.mask, mask); + } +} +EXPORT_SYMBOL(mlxsw_afk_encode); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.h b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.h new file mode 100644 index 0000000..122506d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.h @@ -0,0 
+1,250 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _MLXSW_CORE_ACL_FLEX_KEYS_H +#define _MLXSW_CORE_ACL_FLEX_KEYS_H + +#include +#include + +#include "item.h" + +enum mlxsw_afk_element { + MLXSW_AFK_ELEMENT_SRC_SYS_PORT, + MLXSW_AFK_ELEMENT_DMAC, + MLXSW_AFK_ELEMENT_SMAC, + MLXSW_AFK_ELEMENT_ETHERTYPE, + MLXSW_AFK_ELEMENT_IP_PROTO, + MLXSW_AFK_ELEMENT_SRC_IP4, + MLXSW_AFK_ELEMENT_DST_IP4, + MLXSW_AFK_ELEMENT_SRC_IP6_HI, + MLXSW_AFK_ELEMENT_SRC_IP6_LO, + MLXSW_AFK_ELEMENT_DST_IP6_HI, + MLXSW_AFK_ELEMENT_DST_IP6_LO, + MLXSW_AFK_ELEMENT_DST_L4_PORT, + MLXSW_AFK_ELEMENT_SRC_L4_PORT, + MLXSW_AFK_ELEMENT_VID, + MLXSW_AFK_ELEMENT_PCP, + MLXSW_AFK_ELEMENT_TCP_FLAGS, + MLXSW_AFK_ELEMENT_IP_TTL_, + MLXSW_AFK_ELEMENT_IP_ECN, + MLXSW_AFK_ELEMENT_IP_DSCP, + MLXSW_AFK_ELEMENT_MAX, +}; + +enum mlxsw_afk_element_type { + MLXSW_AFK_ELEMENT_TYPE_U32, + MLXSW_AFK_ELEMENT_TYPE_BUF, +}; + +struct mlxsw_afk_element_info { + enum mlxsw_afk_element element; /* element ID */ + enum mlxsw_afk_element_type type; + struct mlxsw_item item; /* element geometry in internal storage */ +}; + +#define MLXSW_AFK_ELEMENT_INFO(_type, _element, _offset, _shift, _size) \ + [MLXSW_AFK_ELEMENT_##_element] = { \ + .element = MLXSW_AFK_ELEMENT_##_element, \ + .type = _type, \ + .item = { \ + .offset = _offset, \ + .shift = _shift, \ + .size = {.bits = _size}, \ + .name = #_element, \ + }, \ + } + +#define MLXSW_AFK_ELEMENT_INFO_U32(_element, _offset, _shift, _size) \ + MLXSW_AFK_ELEMENT_INFO(MLXSW_AFK_ELEMENT_TYPE_U32, \ + _element, _offset, _shift, _size) + +#define MLXSW_AFK_ELEMENT_INFO_BUF(_element, _offset, _size) \ + MLXSW_AFK_ELEMENT_INFO(MLXSW_AFK_ELEMENT_TYPE_BUF, \ + _element, _offset, 0, _size) + +/* For the purpose of the driver, define an internal storage scratchpad + * that will be used to store key/mask values. For each defined element type + * define an internal storage geometry. 
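+ * For example, the VID entry below, MLXSW_AFK_ELEMENT_INFO_U32(VID,
+ * 0x10, 8, 12), places the 12-bit VID at bit offset 8 of the 32-bit
+ * word at byte 0x10 of the scratchpad; mlxsw_afk_values_add_u32()
+ * stores key and mask values according to this geometry, and
+ * mlxsw_afk_encode() later copies them into the per-block key layout.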
+ */ +static const struct mlxsw_afk_element_info mlxsw_afk_element_infos[] = { + MLXSW_AFK_ELEMENT_INFO_U32(SRC_SYS_PORT, 0x00, 16, 16), + MLXSW_AFK_ELEMENT_INFO_BUF(DMAC, 0x04, 6), + MLXSW_AFK_ELEMENT_INFO_BUF(SMAC, 0x0A, 6), + MLXSW_AFK_ELEMENT_INFO_U32(ETHERTYPE, 0x00, 0, 16), + MLXSW_AFK_ELEMENT_INFO_U32(IP_PROTO, 0x10, 0, 8), + MLXSW_AFK_ELEMENT_INFO_U32(VID, 0x10, 8, 12), + MLXSW_AFK_ELEMENT_INFO_U32(PCP, 0x10, 20, 3), + MLXSW_AFK_ELEMENT_INFO_U32(TCP_FLAGS, 0x10, 23, 9), + MLXSW_AFK_ELEMENT_INFO_U32(DST_L4_PORT, 0x14, 0, 16), + MLXSW_AFK_ELEMENT_INFO_U32(SRC_L4_PORT, 0x14, 16, 16), + MLXSW_AFK_ELEMENT_INFO_U32(IP_TTL_, 0x18, 0, 8), + MLXSW_AFK_ELEMENT_INFO_U32(IP_ECN, 0x18, 9, 2), + MLXSW_AFK_ELEMENT_INFO_U32(IP_DSCP, 0x18, 11, 6), + MLXSW_AFK_ELEMENT_INFO_U32(SRC_IP4, 0x20, 0, 32), + MLXSW_AFK_ELEMENT_INFO_U32(DST_IP4, 0x24, 0, 32), + MLXSW_AFK_ELEMENT_INFO_BUF(SRC_IP6_HI, 0x20, 8), + MLXSW_AFK_ELEMENT_INFO_BUF(SRC_IP6_LO, 0x28, 8), + MLXSW_AFK_ELEMENT_INFO_BUF(DST_IP6_HI, 0x30, 8), + MLXSW_AFK_ELEMENT_INFO_BUF(DST_IP6_LO, 0x38, 8), +}; + +#define MLXSW_AFK_ELEMENT_STORAGE_SIZE 0x40 + +struct mlxsw_afk_element_inst { /* element instance in actual block */ + const struct mlxsw_afk_element_info *info; + enum mlxsw_afk_element_type type; + struct mlxsw_item item; /* element geometry in block */ +}; + +#define MLXSW_AFK_ELEMENT_INST(_type, _element, _offset, _shift, _size) \ + { \ + .info = &mlxsw_afk_element_infos[MLXSW_AFK_ELEMENT_##_element], \ + .type = _type, \ + .item = { \ + .offset = _offset, \ + .shift = _shift, \ + .size = {.bits = _size}, \ + .name = #_element, \ + }, \ + } + +#define MLXSW_AFK_ELEMENT_INST_U32(_element, _offset, _shift, _size) \ + MLXSW_AFK_ELEMENT_INST(MLXSW_AFK_ELEMENT_TYPE_U32, \ + _element, _offset, _shift, _size) + +#define MLXSW_AFK_ELEMENT_INST_BUF(_element, _offset, _size) \ + MLXSW_AFK_ELEMENT_INST(MLXSW_AFK_ELEMENT_TYPE_BUF, \ + _element, _offset, 0, _size) + +struct mlxsw_afk_block { + u16 encoding; /* block ID */ + struct mlxsw_afk_element_inst *instances; + unsigned int instances_count; +}; + +#define MLXSW_AFK_BLOCK(_encoding, _instances) \ + { \ + .encoding = _encoding, \ + .instances = _instances, \ + .instances_count = ARRAY_SIZE(_instances), \ + } + +struct mlxsw_afk_element_usage { + DECLARE_BITMAP(usage, MLXSW_AFK_ELEMENT_MAX); +}; + +#define mlxsw_afk_element_usage_for_each(element, elusage) \ + for_each_set_bit(element, (elusage)->usage, MLXSW_AFK_ELEMENT_MAX) + +static inline void +mlxsw_afk_element_usage_add(struct mlxsw_afk_element_usage *elusage, + enum mlxsw_afk_element element) +{ + __set_bit(element, elusage->usage); +} + +static inline void +mlxsw_afk_element_usage_zero(struct mlxsw_afk_element_usage *elusage) +{ + bitmap_zero(elusage->usage, MLXSW_AFK_ELEMENT_MAX); +} + +static inline void +mlxsw_afk_element_usage_fill(struct mlxsw_afk_element_usage *elusage, + const enum mlxsw_afk_element *elements, + unsigned int elements_count) +{ + int i; + + mlxsw_afk_element_usage_zero(elusage); + for (i = 0; i < elements_count; i++) + mlxsw_afk_element_usage_add(elusage, elements[i]); +} + +static inline bool +mlxsw_afk_element_usage_subset(struct mlxsw_afk_element_usage *elusage_small, + struct mlxsw_afk_element_usage *elusage_big) +{ + int i; + + for (i = 0; i < MLXSW_AFK_ELEMENT_MAX; i++) + if (test_bit(i, elusage_small->usage) && + !test_bit(i, elusage_big->usage)) + return false; + return true; +} + +struct mlxsw_afk; + +struct mlxsw_afk *mlxsw_afk_create(unsigned int max_blocks, + const struct mlxsw_afk_block *blocks, + unsigned 
int blocks_count); +void mlxsw_afk_destroy(struct mlxsw_afk *mlxsw_afk); + +struct mlxsw_afk_key_info; + +struct mlxsw_afk_key_info * +mlxsw_afk_key_info_get(struct mlxsw_afk *mlxsw_afk, + struct mlxsw_afk_element_usage *elusage); +void mlxsw_afk_key_info_put(struct mlxsw_afk_key_info *key_info); +bool mlxsw_afk_key_info_subset(struct mlxsw_afk_key_info *key_info, + struct mlxsw_afk_element_usage *elusage); + +u16 +mlxsw_afk_key_info_block_encoding_get(const struct mlxsw_afk_key_info *key_info, + int block_index); +unsigned int +mlxsw_afk_key_info_blocks_count_get(const struct mlxsw_afk_key_info *key_info); + +struct mlxsw_afk_element_values { + struct mlxsw_afk_element_usage elusage; + struct { + char key[MLXSW_AFK_ELEMENT_STORAGE_SIZE]; + char mask[MLXSW_AFK_ELEMENT_STORAGE_SIZE]; + } storage; +}; + +void mlxsw_afk_values_add_u32(struct mlxsw_afk_element_values *values, + enum mlxsw_afk_element element, + u32 key_value, u32 mask_value); +void mlxsw_afk_values_add_buf(struct mlxsw_afk_element_values *values, + enum mlxsw_afk_element element, + const char *key_value, const char *mask_value, + unsigned int len); +void mlxsw_afk_encode(struct mlxsw_afk_key_info *key_info, + struct mlxsw_afk_element_values *values, + char *key, char *mask); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c new file mode 100644 index 0000000..84185f8 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c @@ -0,0 +1,372 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
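The core_acl_flex_keys.h interface above tracks which elements a filter actually uses as a bitmap over MLXSW_AFK_ELEMENT_MAX, and mlxsw_afk_element_usage_subset() is what lets a new rule reuse an already-built key layout. A minimal userspace sketch of that bitmap logic, not part of the patch, with a plain 32-bit word standing in for the kernel bitmap helpers and element IDs taken from the enum above:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* All 19 element IDs defined above fit in one 32-bit word here. */
struct usage { uint32_t bits; };

static void usage_zero(struct usage *u) { u->bits = 0; }
static void usage_add(struct usage *u, int element) { u->bits |= 1u << element; }

/* Mirrors mlxsw_afk_element_usage_fill(): start empty, set each listed element. */
static void usage_fill(struct usage *u, const int *elements, unsigned int count)
{
        unsigned int i;

        usage_zero(u);
        for (i = 0; i < count; i++)
                usage_add(u, elements[i]);
}

/* Mirrors mlxsw_afk_element_usage_subset(): every bit of "small" must be in "big". */
static bool usage_subset(const struct usage *small, const struct usage *big)
{
        return (small->bits & ~big->bits) == 0;
}

int main(void)
{
        /* Element IDs from the enum above: IP_PROTO=4, SRC_IP4=5, DST_IP4=6,
         * DST_L4_PORT=11, SRC_L4_PORT=12.
         */
        const int rule_elements[] = { 4, 5, 6 };
        const int key_elements[] = { 4, 5, 6, 11, 12 };
        struct usage rule, key;

        usage_fill(&rule, rule_elements, 3);
        usage_fill(&key, key_elements, 5);
        printf("rule fits existing key: %s\n",
               usage_subset(&rule, &key) ? "yes" : "no");
        return 0;
}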
+ */ + +#include +#include +#include +#include +#include +#include + +#include "core.h" + +#define MLXSW_HWMON_TEMP_SENSOR_MAX_COUNT 127 +#define MLXSW_HWMON_ATTR_COUNT (MLXSW_HWMON_TEMP_SENSOR_MAX_COUNT * 4 + \ + MLXSW_MFCR_TACHOS_MAX + MLXSW_MFCR_PWMS_MAX) + +struct mlxsw_hwmon_attr { + struct device_attribute dev_attr; + struct mlxsw_hwmon *hwmon; + unsigned int type_index; + char name[32]; +}; + +struct mlxsw_hwmon { + struct mlxsw_core *core; + const struct mlxsw_bus_info *bus_info; + struct device *hwmon_dev; + struct attribute_group group; + const struct attribute_group *groups[2]; + struct attribute *attrs[MLXSW_HWMON_ATTR_COUNT + 1]; + struct mlxsw_hwmon_attr hwmon_attrs[MLXSW_HWMON_ATTR_COUNT]; + unsigned int attrs_count; +}; + +static ssize_t mlxsw_hwmon_temp_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlxsw_hwmon_attr *mlwsw_hwmon_attr = + container_of(attr, struct mlxsw_hwmon_attr, dev_attr); + struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon; + char mtmp_pl[MLXSW_REG_MTMP_LEN]; + unsigned int temp; + int err; + + mlxsw_reg_mtmp_pack(mtmp_pl, mlwsw_hwmon_attr->type_index, + false, false); + err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mtmp), mtmp_pl); + if (err) { + dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query temp sensor\n"); + return err; + } + mlxsw_reg_mtmp_unpack(mtmp_pl, &temp, NULL, NULL); + return sprintf(buf, "%u\n", temp); +} + +static ssize_t mlxsw_hwmon_temp_max_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlxsw_hwmon_attr *mlwsw_hwmon_attr = + container_of(attr, struct mlxsw_hwmon_attr, dev_attr); + struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon; + char mtmp_pl[MLXSW_REG_MTMP_LEN]; + unsigned int temp_max; + int err; + + mlxsw_reg_mtmp_pack(mtmp_pl, mlwsw_hwmon_attr->type_index, + false, false); + err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mtmp), mtmp_pl); + if (err) { + dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query temp sensor\n"); + return err; + } + mlxsw_reg_mtmp_unpack(mtmp_pl, NULL, &temp_max, NULL); + return sprintf(buf, "%u\n", temp_max); +} + +static ssize_t mlxsw_hwmon_temp_rst_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct mlxsw_hwmon_attr *mlwsw_hwmon_attr = + container_of(attr, struct mlxsw_hwmon_attr, dev_attr); + struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon; + char mtmp_pl[MLXSW_REG_MTMP_LEN]; + unsigned long val; + int err; + + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val != 1) + return -EINVAL; + + mlxsw_reg_mtmp_pack(mtmp_pl, mlwsw_hwmon_attr->type_index, true, true); + err = mlxsw_reg_write(mlxsw_hwmon->core, MLXSW_REG(mtmp), mtmp_pl); + if (err) { + dev_err(mlxsw_hwmon->bus_info->dev, "Failed to reset temp sensor history\n"); + return err; + } + return len; +} + +static ssize_t mlxsw_hwmon_fan_rpm_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlxsw_hwmon_attr *mlwsw_hwmon_attr = + container_of(attr, struct mlxsw_hwmon_attr, dev_attr); + struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon; + char mfsm_pl[MLXSW_REG_MFSM_LEN]; + int err; + + mlxsw_reg_mfsm_pack(mfsm_pl, mlwsw_hwmon_attr->type_index); + err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mfsm), mfsm_pl); + if (err) { + dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query fan\n"); + return err; + } + return sprintf(buf, "%u\n", mlxsw_reg_mfsm_rpm_get(mfsm_pl)); +} + +static ssize_t mlxsw_hwmon_pwm_show(struct device *dev, + struct 
device_attribute *attr, + char *buf) +{ + struct mlxsw_hwmon_attr *mlwsw_hwmon_attr = + container_of(attr, struct mlxsw_hwmon_attr, dev_attr); + struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon; + char mfsc_pl[MLXSW_REG_MFSC_LEN]; + int err; + + mlxsw_reg_mfsc_pack(mfsc_pl, mlwsw_hwmon_attr->type_index, 0); + err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mfsc), mfsc_pl); + if (err) { + dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query PWM\n"); + return err; + } + return sprintf(buf, "%u\n", + mlxsw_reg_mfsc_pwm_duty_cycle_get(mfsc_pl)); +} + +static ssize_t mlxsw_hwmon_pwm_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct mlxsw_hwmon_attr *mlwsw_hwmon_attr = + container_of(attr, struct mlxsw_hwmon_attr, dev_attr); + struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon; + char mfsc_pl[MLXSW_REG_MFSC_LEN]; + unsigned long val; + int err; + + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val > 255) + return -EINVAL; + + mlxsw_reg_mfsc_pack(mfsc_pl, mlwsw_hwmon_attr->type_index, val); + err = mlxsw_reg_write(mlxsw_hwmon->core, MLXSW_REG(mfsc), mfsc_pl); + if (err) { + dev_err(mlxsw_hwmon->bus_info->dev, "Failed to write PWM\n"); + return err; + } + return len; +} + +enum mlxsw_hwmon_attr_type { + MLXSW_HWMON_ATTR_TYPE_TEMP, + MLXSW_HWMON_ATTR_TYPE_TEMP_MAX, + MLXSW_HWMON_ATTR_TYPE_TEMP_RST, + MLXSW_HWMON_ATTR_TYPE_FAN_RPM, + MLXSW_HWMON_ATTR_TYPE_PWM, +}; + +static void mlxsw_hwmon_attr_add(struct mlxsw_hwmon *mlxsw_hwmon, + enum mlxsw_hwmon_attr_type attr_type, + unsigned int type_index, unsigned int num) { + struct mlxsw_hwmon_attr *mlxsw_hwmon_attr; + unsigned int attr_index; + + attr_index = mlxsw_hwmon->attrs_count; + mlxsw_hwmon_attr = &mlxsw_hwmon->hwmon_attrs[attr_index]; + + switch (attr_type) { + case MLXSW_HWMON_ATTR_TYPE_TEMP: + mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_temp_show; + mlxsw_hwmon_attr->dev_attr.attr.mode = 0444; + snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name), + "temp%u_input", num + 1); + break; + case MLXSW_HWMON_ATTR_TYPE_TEMP_MAX: + mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_temp_max_show; + mlxsw_hwmon_attr->dev_attr.attr.mode = 0444; + snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name), + "temp%u_highest", num + 1); + break; + case MLXSW_HWMON_ATTR_TYPE_TEMP_RST: + mlxsw_hwmon_attr->dev_attr.store = mlxsw_hwmon_temp_rst_store; + mlxsw_hwmon_attr->dev_attr.attr.mode = 0200; + snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name), + "temp%u_reset_history", num + 1); + break; + case MLXSW_HWMON_ATTR_TYPE_FAN_RPM: + mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_fan_rpm_show; + mlxsw_hwmon_attr->dev_attr.attr.mode = 0444; + snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name), + "fan%u_input", num + 1); + break; + case MLXSW_HWMON_ATTR_TYPE_PWM: + mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_pwm_show; + mlxsw_hwmon_attr->dev_attr.store = mlxsw_hwmon_pwm_store; + mlxsw_hwmon_attr->dev_attr.attr.mode = 0644; + snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name), + "pwm%u", num + 1); + break; + default: + WARN_ON(1); + } + + mlxsw_hwmon_attr->type_index = type_index; + mlxsw_hwmon_attr->hwmon = mlxsw_hwmon; + mlxsw_hwmon_attr->dev_attr.attr.name = mlxsw_hwmon_attr->name; + sysfs_attr_init(&mlxsw_hwmon_attr->dev_attr.attr); + + mlxsw_hwmon->attrs[attr_index] = &mlxsw_hwmon_attr->dev_attr.attr; + mlxsw_hwmon->attrs_count++; +} + +static int mlxsw_hwmon_temp_init(struct mlxsw_hwmon *mlxsw_hwmon) +{ + char 
mtcap_pl[MLXSW_REG_MTCAP_LEN] = {0}; + char mtmp_pl[MLXSW_REG_MTMP_LEN]; + u8 sensor_count; + int i; + int err; + + err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mtcap), mtcap_pl); + if (err) { + dev_err(mlxsw_hwmon->bus_info->dev, "Failed to get number of temp sensors\n"); + return err; + } + sensor_count = mlxsw_reg_mtcap_sensor_count_get(mtcap_pl); + for (i = 0; i < sensor_count; i++) { + mlxsw_reg_mtmp_pack(mtmp_pl, i, true, true); + err = mlxsw_reg_write(mlxsw_hwmon->core, + MLXSW_REG(mtmp), mtmp_pl); + if (err) { + dev_err(mlxsw_hwmon->bus_info->dev, "Failed to setup temp sensor number %d\n", + i); + return err; + } + mlxsw_hwmon_attr_add(mlxsw_hwmon, + MLXSW_HWMON_ATTR_TYPE_TEMP, i, i); + mlxsw_hwmon_attr_add(mlxsw_hwmon, + MLXSW_HWMON_ATTR_TYPE_TEMP_MAX, i, i); + mlxsw_hwmon_attr_add(mlxsw_hwmon, + MLXSW_HWMON_ATTR_TYPE_TEMP_RST, i, i); + } + return 0; +} + +static int mlxsw_hwmon_fans_init(struct mlxsw_hwmon *mlxsw_hwmon) +{ + char mfcr_pl[MLXSW_REG_MFCR_LEN] = {0}; + enum mlxsw_reg_mfcr_pwm_frequency freq; + unsigned int type_index; + unsigned int num; + u16 tacho_active; + u8 pwm_active; + int err; + + err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mfcr), mfcr_pl); + if (err) { + dev_err(mlxsw_hwmon->bus_info->dev, "Failed to get to probe PWMs and Tachometers\n"); + return err; + } + mlxsw_reg_mfcr_unpack(mfcr_pl, &freq, &tacho_active, &pwm_active); + num = 0; + for (type_index = 0; type_index < MLXSW_MFCR_TACHOS_MAX; type_index++) { + if (tacho_active & BIT(type_index)) + mlxsw_hwmon_attr_add(mlxsw_hwmon, + MLXSW_HWMON_ATTR_TYPE_FAN_RPM, + type_index, num++); + } + num = 0; + for (type_index = 0; type_index < MLXSW_MFCR_PWMS_MAX; type_index++) { + if (pwm_active & BIT(type_index)) + mlxsw_hwmon_attr_add(mlxsw_hwmon, + MLXSW_HWMON_ATTR_TYPE_PWM, + type_index, num++); + } + return 0; +} + +int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core, + const struct mlxsw_bus_info *mlxsw_bus_info, + struct mlxsw_hwmon **p_hwmon) +{ + struct mlxsw_hwmon *mlxsw_hwmon; + struct device *hwmon_dev; + int err; + + mlxsw_hwmon = devm_kzalloc(mlxsw_bus_info->dev, sizeof(*mlxsw_hwmon), + GFP_KERNEL); + if (!mlxsw_hwmon) + return -ENOMEM; + mlxsw_hwmon->core = mlxsw_core; + mlxsw_hwmon->bus_info = mlxsw_bus_info; + + err = mlxsw_hwmon_temp_init(mlxsw_hwmon); + if (err) + goto err_temp_init; + + err = mlxsw_hwmon_fans_init(mlxsw_hwmon); + if (err) + goto err_fans_init; + + mlxsw_hwmon->groups[0] = &mlxsw_hwmon->group; + mlxsw_hwmon->group.attrs = mlxsw_hwmon->attrs; + + hwmon_dev = devm_hwmon_device_register_with_groups(mlxsw_bus_info->dev, + "mlxsw", + mlxsw_hwmon, + mlxsw_hwmon->groups); + if (IS_ERR(hwmon_dev)) { + err = PTR_ERR(hwmon_dev); + goto err_hwmon_register; + } + + mlxsw_hwmon->hwmon_dev = hwmon_dev; + *p_hwmon = mlxsw_hwmon; + return 0; + +err_hwmon_register: +err_fans_init: +err_temp_init: + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c new file mode 100644 index 0000000..d866c98 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c @@ -0,0 +1,442 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/core_thermal.c + * Copyright (c) 2016 Ivan Vecera + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "core.h" + +#define MLXSW_THERMAL_POLL_INT 1000 /* ms */ +#define MLXSW_THERMAL_MAX_TEMP 110000 /* 110C */ +#define MLXSW_THERMAL_MAX_STATE 10 +#define MLXSW_THERMAL_MAX_DUTY 255 + +struct mlxsw_thermal_trip { + int type; + int temp; + int min_state; + int max_state; +}; + +static const struct mlxsw_thermal_trip default_thermal_trips[] = { + { /* In range - 0-40% PWM */ + .type = THERMAL_TRIP_ACTIVE, + .temp = 75000, + .min_state = 0, + .max_state = (4 * MLXSW_THERMAL_MAX_STATE) / 10, + }, + { /* High - 40-100% PWM */ + .type = THERMAL_TRIP_ACTIVE, + .temp = 80000, + .min_state = (4 * MLXSW_THERMAL_MAX_STATE) / 10, + .max_state = MLXSW_THERMAL_MAX_STATE, + }, + { + /* Very high - 100% PWM */ + .type = THERMAL_TRIP_ACTIVE, + .temp = 85000, + .min_state = MLXSW_THERMAL_MAX_STATE, + .max_state = MLXSW_THERMAL_MAX_STATE, + }, + { /* Warning */ + .type = THERMAL_TRIP_HOT, + .temp = 105000, + .min_state = MLXSW_THERMAL_MAX_STATE, + .max_state = MLXSW_THERMAL_MAX_STATE, + }, + { /* Critical - soft poweroff */ + .type = THERMAL_TRIP_CRITICAL, + .temp = MLXSW_THERMAL_MAX_TEMP, + .min_state = MLXSW_THERMAL_MAX_STATE, + .max_state = MLXSW_THERMAL_MAX_STATE, + } +}; + +#define MLXSW_THERMAL_NUM_TRIPS ARRAY_SIZE(default_thermal_trips) + +/* Make sure all trips are writable */ +#define MLXSW_THERMAL_TRIP_MASK (BIT(MLXSW_THERMAL_NUM_TRIPS) - 1) + +struct mlxsw_thermal { + struct mlxsw_core *core; + const struct mlxsw_bus_info *bus_info; + struct thermal_zone_device *tzdev; + struct thermal_cooling_device *cdevs[MLXSW_MFCR_PWMS_MAX]; + struct mlxsw_thermal_trip trips[MLXSW_THERMAL_NUM_TRIPS]; + enum thermal_device_mode mode; +}; + +static inline u8 mlxsw_state_to_duty(int state) +{ + return DIV_ROUND_CLOSEST(state * MLXSW_THERMAL_MAX_DUTY, + MLXSW_THERMAL_MAX_STATE); +} + +static inline int mlxsw_duty_to_state(u8 duty) +{ + return DIV_ROUND_CLOSEST(duty * MLXSW_THERMAL_MAX_STATE, + MLXSW_THERMAL_MAX_DUTY); +} + +static int mlxsw_get_cooling_device_idx(struct mlxsw_thermal *thermal, + struct thermal_cooling_device *cdev) +{ + int i; + + for (i = 0; i < 
MLXSW_MFCR_PWMS_MAX; i++) + if (thermal->cdevs[i] == cdev) + return i; + + return -ENODEV; +} + +static int mlxsw_thermal_bind(struct thermal_zone_device *tzdev, + struct thermal_cooling_device *cdev) +{ + struct mlxsw_thermal *thermal = tzdev->devdata; + struct device *dev = thermal->bus_info->dev; + int i, err; + + /* If the cooling device is one of ours bind it */ + if (mlxsw_get_cooling_device_idx(thermal, cdev) < 0) + return 0; + + for (i = 0; i < MLXSW_THERMAL_NUM_TRIPS; i++) { + const struct mlxsw_thermal_trip *trip = &thermal->trips[i]; + + err = thermal_zone_bind_cooling_device(tzdev, i, cdev, + trip->max_state, + trip->min_state, + THERMAL_WEIGHT_DEFAULT); + if (err < 0) { + dev_err(dev, "Failed to bind cooling device to trip %d\n", i); + return err; + } + } + return 0; +} + +static int mlxsw_thermal_unbind(struct thermal_zone_device *tzdev, + struct thermal_cooling_device *cdev) +{ + struct mlxsw_thermal *thermal = tzdev->devdata; + struct device *dev = thermal->bus_info->dev; + int i; + int err; + + /* If the cooling device is our one unbind it */ + if (mlxsw_get_cooling_device_idx(thermal, cdev) < 0) + return 0; + + for (i = 0; i < MLXSW_THERMAL_NUM_TRIPS; i++) { + err = thermal_zone_unbind_cooling_device(tzdev, i, cdev); + if (err < 0) { + dev_err(dev, "Failed to unbind cooling device\n"); + return err; + } + } + return 0; +} + +static int mlxsw_thermal_get_mode(struct thermal_zone_device *tzdev, + enum thermal_device_mode *mode) +{ + struct mlxsw_thermal *thermal = tzdev->devdata; + + *mode = thermal->mode; + + return 0; +} + +static int mlxsw_thermal_set_mode(struct thermal_zone_device *tzdev, + enum thermal_device_mode mode) +{ + struct mlxsw_thermal *thermal = tzdev->devdata; + + mutex_lock(&tzdev->lock); + + if (mode == THERMAL_DEVICE_ENABLED) + tzdev->polling_delay = MLXSW_THERMAL_POLL_INT; + else + tzdev->polling_delay = 0; + + mutex_unlock(&tzdev->lock); + + thermal->mode = mode; + thermal_zone_device_update(tzdev, THERMAL_EVENT_UNSPECIFIED); + + return 0; +} + +static int mlxsw_thermal_get_temp(struct thermal_zone_device *tzdev, + int *p_temp) +{ + struct mlxsw_thermal *thermal = tzdev->devdata; + struct device *dev = thermal->bus_info->dev; + char mtmp_pl[MLXSW_REG_MTMP_LEN]; + unsigned int temp; + int err; + + mlxsw_reg_mtmp_pack(mtmp_pl, 0, false, false); + + err = mlxsw_reg_query(thermal->core, MLXSW_REG(mtmp), mtmp_pl); + if (err) { + dev_err(dev, "Failed to query temp sensor\n"); + return err; + } + mlxsw_reg_mtmp_unpack(mtmp_pl, &temp, NULL, NULL); + + *p_temp = (int) temp; + return 0; +} + +static int mlxsw_thermal_get_trip_type(struct thermal_zone_device *tzdev, + int trip, + enum thermal_trip_type *p_type) +{ + struct mlxsw_thermal *thermal = tzdev->devdata; + + if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS) + return -EINVAL; + + *p_type = thermal->trips[trip].type; + return 0; +} + +static int mlxsw_thermal_get_trip_temp(struct thermal_zone_device *tzdev, + int trip, int *p_temp) +{ + struct mlxsw_thermal *thermal = tzdev->devdata; + + if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS) + return -EINVAL; + + *p_temp = thermal->trips[trip].temp; + return 0; +} + +static int mlxsw_thermal_set_trip_temp(struct thermal_zone_device *tzdev, + int trip, int temp) +{ + struct mlxsw_thermal *thermal = tzdev->devdata; + + if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS || + temp > MLXSW_THERMAL_MAX_TEMP) + return -EINVAL; + + thermal->trips[trip].temp = temp; + return 0; +} + +static struct thermal_zone_device_ops mlxsw_thermal_ops = { + .bind = mlxsw_thermal_bind, + 
.unbind = mlxsw_thermal_unbind, + .get_mode = mlxsw_thermal_get_mode, + .set_mode = mlxsw_thermal_set_mode, + .get_temp = mlxsw_thermal_get_temp, + .get_trip_type = mlxsw_thermal_get_trip_type, + .get_trip_temp = mlxsw_thermal_get_trip_temp, + .set_trip_temp = mlxsw_thermal_set_trip_temp, +}; + +static int mlxsw_thermal_get_max_state(struct thermal_cooling_device *cdev, + unsigned long *p_state) +{ + *p_state = MLXSW_THERMAL_MAX_STATE; + return 0; +} + +static int mlxsw_thermal_get_cur_state(struct thermal_cooling_device *cdev, + unsigned long *p_state) + +{ + struct mlxsw_thermal *thermal = cdev->devdata; + struct device *dev = thermal->bus_info->dev; + char mfsc_pl[MLXSW_REG_MFSC_LEN]; + int err, idx; + u8 duty; + + idx = mlxsw_get_cooling_device_idx(thermal, cdev); + if (idx < 0) + return idx; + + mlxsw_reg_mfsc_pack(mfsc_pl, idx, 0); + err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfsc), mfsc_pl); + if (err) { + dev_err(dev, "Failed to query PWM duty\n"); + return err; + } + + duty = mlxsw_reg_mfsc_pwm_duty_cycle_get(mfsc_pl); + *p_state = mlxsw_duty_to_state(duty); + return 0; +} + +static int mlxsw_thermal_set_cur_state(struct thermal_cooling_device *cdev, + unsigned long state) + +{ + struct mlxsw_thermal *thermal = cdev->devdata; + struct device *dev = thermal->bus_info->dev; + char mfsc_pl[MLXSW_REG_MFSC_LEN]; + int err, idx; + + idx = mlxsw_get_cooling_device_idx(thermal, cdev); + if (idx < 0) + return idx; + + mlxsw_reg_mfsc_pack(mfsc_pl, idx, mlxsw_state_to_duty(state)); + err = mlxsw_reg_write(thermal->core, MLXSW_REG(mfsc), mfsc_pl); + if (err) { + dev_err(dev, "Failed to write PWM duty\n"); + return err; + } + return 0; +} + +static const struct thermal_cooling_device_ops mlxsw_cooling_ops = { + .get_max_state = mlxsw_thermal_get_max_state, + .get_cur_state = mlxsw_thermal_get_cur_state, + .set_cur_state = mlxsw_thermal_set_cur_state, +}; + +int mlxsw_thermal_init(struct mlxsw_core *core, + const struct mlxsw_bus_info *bus_info, + struct mlxsw_thermal **p_thermal) +{ + char mfcr_pl[MLXSW_REG_MFCR_LEN] = { 0 }; + enum mlxsw_reg_mfcr_pwm_frequency freq; + struct device *dev = bus_info->dev; + struct mlxsw_thermal *thermal; + u16 tacho_active; + u8 pwm_active; + int err, i; + + thermal = devm_kzalloc(dev, sizeof(*thermal), + GFP_KERNEL); + if (!thermal) + return -ENOMEM; + + thermal->core = core; + thermal->bus_info = bus_info; + memcpy(thermal->trips, default_thermal_trips, sizeof(thermal->trips)); + + err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfcr), mfcr_pl); + if (err) { + dev_err(dev, "Failed to probe PWMs\n"); + goto err_free_thermal; + } + mlxsw_reg_mfcr_unpack(mfcr_pl, &freq, &tacho_active, &pwm_active); + + for (i = 0; i < MLXSW_MFCR_TACHOS_MAX; i++) { + if (tacho_active & BIT(i)) { + char mfsl_pl[MLXSW_REG_MFSL_LEN]; + + mlxsw_reg_mfsl_pack(mfsl_pl, i, 0, 0); + + /* We need to query the register to preserve maximum */ + err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfsl), + mfsl_pl); + if (err) + goto err_free_thermal; + + /* set the minimal RPMs to 0 */ + mlxsw_reg_mfsl_tach_min_set(mfsl_pl, 0); + err = mlxsw_reg_write(thermal->core, MLXSW_REG(mfsl), + mfsl_pl); + if (err) + goto err_free_thermal; + } + } + for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++) { + if (pwm_active & BIT(i)) { + struct thermal_cooling_device *cdev; + + cdev = thermal_cooling_device_register("Fan", thermal, + &mlxsw_cooling_ops); + if (IS_ERR(cdev)) { + err = PTR_ERR(cdev); + dev_err(dev, "Failed to register cooling device\n"); + goto err_unreg_cdevs; + } + thermal->cdevs[i] = cdev; + } + } + 
+ thermal->tzdev = thermal_zone_device_register("mlxsw", + MLXSW_THERMAL_NUM_TRIPS, + MLXSW_THERMAL_TRIP_MASK, + thermal, + &mlxsw_thermal_ops, + NULL, 0, + MLXSW_THERMAL_POLL_INT); + if (IS_ERR(thermal->tzdev)) { + err = PTR_ERR(thermal->tzdev); + dev_err(dev, "Failed to register thermal zone\n"); + goto err_unreg_cdevs; + } + + thermal->mode = THERMAL_DEVICE_ENABLED; + *p_thermal = thermal; + return 0; +err_unreg_cdevs: + for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++) + if (thermal->cdevs[i]) + thermal_cooling_device_unregister(thermal->cdevs[i]); +err_free_thermal: + devm_kfree(dev, thermal); + return err; +} + +void mlxsw_thermal_fini(struct mlxsw_thermal *thermal) +{ + int i; + + if (thermal->tzdev) { + thermal_zone_device_unregister(thermal->tzdev); + thermal->tzdev = NULL; + } + + for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++) { + if (thermal->cdevs[i]) { + thermal_cooling_device_unregister(thermal->cdevs[i]); + thermal->cdevs[i] = NULL; + } + } + + devm_kfree(thermal->bus_info->dev, thermal); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/emad.h b/drivers/net/ethernet/mellanox/mlxsw/emad.h new file mode 100644 index 0000000..97b6bb5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/emad.h @@ -0,0 +1,127 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/emad.h + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015 Ido Schimmel + * Copyright (c) 2015 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
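The cooling-device ops in core_thermal.c above expose fan speed as a cooling state in 0..MLXSW_THERMAL_MAX_STATE (10), while the MFSC register takes a PWM duty cycle in 0..MLXSW_THERMAL_MAX_DUTY (255); mlxsw_state_to_duty() and mlxsw_duty_to_state() convert between the two with round-to-closest division. A standalone check of that mapping, for illustration only:

#include <stdio.h>

#define MAX_STATE 10        /* MLXSW_THERMAL_MAX_STATE */
#define MAX_DUTY  255       /* MLXSW_THERMAL_MAX_DUTY */

/* Same arithmetic as the kernel's DIV_ROUND_CLOSEST() for non-negative values. */
static unsigned int div_round_closest(unsigned int x, unsigned int d)
{
        return (x + d / 2) / d;
}

static unsigned int state_to_duty(unsigned int state)
{
        return div_round_closest(state * MAX_DUTY, MAX_STATE);
}

static unsigned int duty_to_state(unsigned int duty)
{
        return div_round_closest(duty * MAX_STATE, MAX_DUTY);
}

int main(void)
{
        unsigned int state;

        /* Every cooling state survives a state -> duty -> state round trip. */
        for (state = 0; state <= MAX_STATE; state++) {
                unsigned int duty = state_to_duty(state);

                printf("state %2u -> duty %3u -> state %2u\n",
                       state, duty, duty_to_state(duty));
        }
        return 0;
}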
+ */ + +#ifndef _MLXSW_EMAD_H +#define _MLXSW_EMAD_H + +#define MLXSW_EMAD_MAX_FRAME_LEN 1518 /* Length in u8 */ +#define MLXSW_EMAD_MAX_RETRY 5 + +/* EMAD Ethernet header */ +#define MLXSW_EMAD_ETH_HDR_LEN 0x10 /* Length in u8 */ +#define MLXSW_EMAD_EH_DMAC "\x01\x02\xc9\x00\x00\x01" +#define MLXSW_EMAD_EH_SMAC "\x00\x02\xc9\x01\x02\x03" +#define MLXSW_EMAD_EH_ETHERTYPE 0x8932 +#define MLXSW_EMAD_EH_MLX_PROTO 0 +#define MLXSW_EMAD_EH_PROTO_VERSION 0 + +/* EMAD TLV Types */ +enum { + MLXSW_EMAD_TLV_TYPE_END, + MLXSW_EMAD_TLV_TYPE_OP, + MLXSW_EMAD_TLV_TYPE_DR, + MLXSW_EMAD_TLV_TYPE_REG, + MLXSW_EMAD_TLV_TYPE_USERDATA, + MLXSW_EMAD_TLV_TYPE_OOBETH, +}; + +/* OP TLV */ +#define MLXSW_EMAD_OP_TLV_LEN 4 /* Length in u32 */ + +enum { + MLXSW_EMAD_OP_TLV_CLASS_REG_ACCESS = 1, + MLXSW_EMAD_OP_TLV_CLASS_IPC = 2, +}; + +enum mlxsw_emad_op_tlv_status { + MLXSW_EMAD_OP_TLV_STATUS_SUCCESS, + MLXSW_EMAD_OP_TLV_STATUS_BUSY, + MLXSW_EMAD_OP_TLV_STATUS_VERSION_NOT_SUPPORTED, + MLXSW_EMAD_OP_TLV_STATUS_UNKNOWN_TLV, + MLXSW_EMAD_OP_TLV_STATUS_REGISTER_NOT_SUPPORTED, + MLXSW_EMAD_OP_TLV_STATUS_CLASS_NOT_SUPPORTED, + MLXSW_EMAD_OP_TLV_STATUS_METHOD_NOT_SUPPORTED, + MLXSW_EMAD_OP_TLV_STATUS_BAD_PARAMETER, + MLXSW_EMAD_OP_TLV_STATUS_RESOURCE_NOT_AVAILABLE, + MLXSW_EMAD_OP_TLV_STATUS_MESSAGE_RECEIPT_ACK, + MLXSW_EMAD_OP_TLV_STATUS_INTERNAL_ERROR = 0x70, +}; + +static inline char *mlxsw_emad_op_tlv_status_str(u8 status) +{ + switch (status) { + case MLXSW_EMAD_OP_TLV_STATUS_SUCCESS: + return "operation performed"; + case MLXSW_EMAD_OP_TLV_STATUS_BUSY: + return "device is busy"; + case MLXSW_EMAD_OP_TLV_STATUS_VERSION_NOT_SUPPORTED: + return "version not supported"; + case MLXSW_EMAD_OP_TLV_STATUS_UNKNOWN_TLV: + return "unknown TLV"; + case MLXSW_EMAD_OP_TLV_STATUS_REGISTER_NOT_SUPPORTED: + return "register not supported"; + case MLXSW_EMAD_OP_TLV_STATUS_CLASS_NOT_SUPPORTED: + return "class not supported"; + case MLXSW_EMAD_OP_TLV_STATUS_METHOD_NOT_SUPPORTED: + return "method not supported"; + case MLXSW_EMAD_OP_TLV_STATUS_BAD_PARAMETER: + return "bad parameter"; + case MLXSW_EMAD_OP_TLV_STATUS_RESOURCE_NOT_AVAILABLE: + return "resource not available"; + case MLXSW_EMAD_OP_TLV_STATUS_MESSAGE_RECEIPT_ACK: + return "acknowledged. retransmit"; + case MLXSW_EMAD_OP_TLV_STATUS_INTERNAL_ERROR: + return "internal error"; + default: + return "*UNKNOWN*"; + } +} + +enum { + MLXSW_EMAD_OP_TLV_REQUEST, + MLXSW_EMAD_OP_TLV_RESPONSE +}; + +enum { + MLXSW_EMAD_OP_TLV_METHOD_QUERY = 1, + MLXSW_EMAD_OP_TLV_METHOD_WRITE = 2, + MLXSW_EMAD_OP_TLV_METHOD_SEND = 3, + MLXSW_EMAD_OP_TLV_METHOD_EVENT = 5, +}; + +/* END TLV */ +#define MLXSW_EMAD_END_TLV_LEN 1 /* Length in u32 */ + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/i2c.c b/drivers/net/ethernet/mellanox/mlxsw/i2c.c new file mode 100644 index 0000000..25f9915 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/i2c.c @@ -0,0 +1,583 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/i2c.c + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Vadim Pasternak + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cmd.h" +#include "core.h" +#include "i2c.h" + +static const char mlxsw_i2c_driver_name[] = "mlxsw_i2c"; + +#define MLXSW_I2C_CIR2_BASE 0x72000 +#define MLXSW_I2C_CIR_STATUS_OFF 0x18 +#define MLXSW_I2C_CIR2_OFF_STATUS (MLXSW_I2C_CIR2_BASE + \ + MLXSW_I2C_CIR_STATUS_OFF) +#define MLXSW_I2C_OPMOD_SHIFT 12 +#define MLXSW_I2C_GO_BIT_SHIFT 23 +#define MLXSW_I2C_CIR_CTRL_STATUS_SHIFT 24 +#define MLXSW_I2C_GO_BIT BIT(MLXSW_I2C_GO_BIT_SHIFT) +#define MLXSW_I2C_GO_OPMODE BIT(MLXSW_I2C_OPMOD_SHIFT) +#define MLXSW_I2C_SET_IMM_CMD (MLXSW_I2C_GO_OPMODE | \ + MLXSW_CMD_OPCODE_QUERY_FW) +#define MLXSW_I2C_PUSH_IMM_CMD (MLXSW_I2C_GO_BIT | \ + MLXSW_I2C_SET_IMM_CMD) +#define MLXSW_I2C_SET_CMD (MLXSW_CMD_OPCODE_ACCESS_REG) +#define MLXSW_I2C_PUSH_CMD (MLXSW_I2C_GO_BIT | MLXSW_I2C_SET_CMD) +#define MLXSW_I2C_TLV_HDR_SIZE 0x10 +#define MLXSW_I2C_ADDR_WIDTH 4 +#define MLXSW_I2C_PUSH_CMD_SIZE (MLXSW_I2C_ADDR_WIDTH + 4) +#define MLXSW_I2C_READ_SEMA_SIZE 4 +#define MLXSW_I2C_PREP_SIZE (MLXSW_I2C_ADDR_WIDTH + 28) +#define MLXSW_I2C_MBOX_SIZE 20 +#define MLXSW_I2C_MBOX_OUT_PARAM_OFF 12 +#define MLXSW_I2C_MAX_BUFF_SIZE 32 +#define MLXSW_I2C_MBOX_OFFSET_BITS 20 +#define MLXSW_I2C_MBOX_SIZE_BITS 12 +#define MLXSW_I2C_ADDR_BUF_SIZE 4 +#define MLXSW_I2C_BLK_MAX 32 +#define MLXSW_I2C_RETRY 5 +#define MLXSW_I2C_TIMEOUT_MSECS 5000 + +/** + * struct mlxsw_i2c - device private data: + * @cmd.mb_size_in: input mailbox size; + * @cmd.mb_off_in: input mailbox offset in register space; + * @cmd.mb_size_out: output mailbox size; + * @cmd.mb_off_out: output mailbox offset in register space; + * @cmd.lock: command execution lock; + * @dev: I2C device; + * @core: switch core pointer; + * @bus_info: bus info block; + */ +struct mlxsw_i2c { + struct { + u32 mb_size_in; + u32 mb_off_in; + u32 mb_size_out; + u32 mb_off_out; + struct mutex lock; + } cmd; + struct device *dev; + struct mlxsw_core *core; + struct mlxsw_bus_info bus_info; +}; + +#define MLXSW_I2C_READ_MSG(_client, _addr_buf, _buf, _len) { \ + { .addr = (_client)->addr, \ + .buf = (_addr_buf), \ + 
.len = MLXSW_I2C_ADDR_BUF_SIZE, \ + .flags = 0 }, \ + { .addr = (_client)->addr, \ + .buf = (_buf), \ + .len = (_len), \ + .flags = I2C_M_RD } } + +#define MLXSW_I2C_WRITE_MSG(_client, _buf, _len) \ + { .addr = (_client)->addr, \ + .buf = (u8 *)(_buf), \ + .len = (_len), \ + .flags = 0 } + +/* Routine converts in and out mail boxes offset and size. */ +static inline void +mlxsw_i2c_convert_mbox(struct mlxsw_i2c *mlxsw_i2c, u8 *buf) +{ + u32 tmp; + + /* Local in/out mailboxes: 20 bits for offset, 12 for size */ + tmp = be32_to_cpup((__be32 *) buf); + mlxsw_i2c->cmd.mb_off_in = tmp & + GENMASK(MLXSW_I2C_MBOX_OFFSET_BITS - 1, 0); + mlxsw_i2c->cmd.mb_size_in = (tmp & GENMASK(31, + MLXSW_I2C_MBOX_OFFSET_BITS)) >> + MLXSW_I2C_MBOX_OFFSET_BITS; + + tmp = be32_to_cpup((__be32 *) (buf + MLXSW_I2C_ADDR_WIDTH)); + mlxsw_i2c->cmd.mb_off_out = tmp & + GENMASK(MLXSW_I2C_MBOX_OFFSET_BITS - 1, 0); + mlxsw_i2c->cmd.mb_size_out = (tmp & GENMASK(31, + MLXSW_I2C_MBOX_OFFSET_BITS)) >> + MLXSW_I2C_MBOX_OFFSET_BITS; +} + +/* Routine obtains register size from mail box buffer. */ +static inline int mlxsw_i2c_get_reg_size(u8 *in_mbox) +{ + u16 tmp = be16_to_cpup((__be16 *) (in_mbox + MLXSW_I2C_TLV_HDR_SIZE)); + + return (tmp & 0x7ff) * 4 + MLXSW_I2C_TLV_HDR_SIZE; +} + +/* Routine sets I2C device internal offset in the transaction buffer. */ +static inline void mlxsw_i2c_set_slave_addr(u8 *buf, u32 off) +{ + __be32 *val = (__be32 *) buf; + + *val = htonl(off); +} + +/* Routine waits until go bit is cleared. */ +static int mlxsw_i2c_wait_go_bit(struct i2c_client *client, + struct mlxsw_i2c *mlxsw_i2c, u8 *p_status) +{ + u8 addr_buf[MLXSW_I2C_ADDR_BUF_SIZE]; + u8 buf[MLXSW_I2C_READ_SEMA_SIZE]; + int len = MLXSW_I2C_READ_SEMA_SIZE; + struct i2c_msg read_sema[] = + MLXSW_I2C_READ_MSG(client, addr_buf, buf, len); + bool wait_done = false; + unsigned long end; + int i = 0, err; + + mlxsw_i2c_set_slave_addr(addr_buf, MLXSW_I2C_CIR2_OFF_STATUS); + + end = jiffies + msecs_to_jiffies(MLXSW_I2C_TIMEOUT_MSECS); + do { + u32 ctrl; + + err = i2c_transfer(client->adapter, read_sema, + ARRAY_SIZE(read_sema)); + + ctrl = be32_to_cpu(*(__be32 *) buf); + if (err == ARRAY_SIZE(read_sema)) { + if (!(ctrl & MLXSW_I2C_GO_BIT)) { + wait_done = true; + *p_status = ctrl >> + MLXSW_I2C_CIR_CTRL_STATUS_SHIFT; + break; + } + } + cond_resched(); + } while ((time_before(jiffies, end)) || (i++ < MLXSW_I2C_RETRY)); + + if (wait_done) { + if (*p_status) + err = -EIO; + } else { + return -ETIMEDOUT; + } + + return err > 0 ? 0 : err; +} + +/* Routine posts a command to ASIC though mail box. 
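mlxsw_i2c_convert_mbox() above splits each local-mailbox descriptor word read from the device: the low MLXSW_I2C_MBOX_OFFSET_BITS (20) bits carry the mailbox offset in the device address space, and the upper 12 bits carry its size. A standalone sketch of that decode; the sample descriptor value is made up for the example:

#include <stdint.h>
#include <stdio.h>

#define MBOX_OFFSET_BITS 20     /* MLXSW_I2C_MBOX_OFFSET_BITS */

/* Assemble a big-endian 32-bit word from a raw byte buffer. */
static uint32_t be32_get(const uint8_t *buf)
{
        return ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) |
               ((uint32_t)buf[2] << 8) | buf[3];
}

int main(void)
{
        /* Hypothetical descriptor: size 0x100 in bits 31:20, offset 0x72180 in bits 19:0. */
        const uint8_t raw[4] = { 0x10, 0x07, 0x21, 0x80 };
        uint32_t word = be32_get(raw);
        uint32_t offset = word & ((1u << MBOX_OFFSET_BITS) - 1);
        uint32_t size = word >> MBOX_OFFSET_BITS;

        printf("mailbox offset=0x%05x size=0x%03x\n", offset, size);
        return 0;
}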
*/ +static int mlxsw_i2c_write_cmd(struct i2c_client *client, + struct mlxsw_i2c *mlxsw_i2c, + int immediate) +{ + __be32 push_cmd_buf[MLXSW_I2C_PUSH_CMD_SIZE / 4] = { + 0, cpu_to_be32(MLXSW_I2C_PUSH_IMM_CMD) + }; + __be32 prep_cmd_buf[MLXSW_I2C_PREP_SIZE / 4] = { + 0, 0, 0, 0, 0, 0, + cpu_to_be32(client->adapter->nr & 0xffff), + cpu_to_be32(MLXSW_I2C_SET_IMM_CMD) + }; + struct i2c_msg push_cmd = + MLXSW_I2C_WRITE_MSG(client, push_cmd_buf, + MLXSW_I2C_PUSH_CMD_SIZE); + struct i2c_msg prep_cmd = + MLXSW_I2C_WRITE_MSG(client, prep_cmd_buf, MLXSW_I2C_PREP_SIZE); + int err; + + if (!immediate) { + push_cmd_buf[1] = cpu_to_be32(MLXSW_I2C_PUSH_CMD); + prep_cmd_buf[7] = cpu_to_be32(MLXSW_I2C_SET_CMD); + } + mlxsw_i2c_set_slave_addr((u8 *)prep_cmd_buf, + MLXSW_I2C_CIR2_BASE); + mlxsw_i2c_set_slave_addr((u8 *)push_cmd_buf, + MLXSW_I2C_CIR2_OFF_STATUS); + + /* Prepare Command Interface Register for transaction */ + err = i2c_transfer(client->adapter, &prep_cmd, 1); + if (err < 0) + return err; + else if (err != 1) + return -EIO; + + /* Write out Command Interface Register GO bit to push transaction */ + err = i2c_transfer(client->adapter, &push_cmd, 1); + if (err < 0) + return err; + else if (err != 1) + return -EIO; + + return 0; +} + +/* Routine obtains mail box offsets from ASIC register space. */ +static int mlxsw_i2c_get_mbox(struct i2c_client *client, + struct mlxsw_i2c *mlxsw_i2c) +{ + u8 addr_buf[MLXSW_I2C_ADDR_BUF_SIZE]; + u8 buf[MLXSW_I2C_MBOX_SIZE]; + struct i2c_msg mbox_cmd[] = + MLXSW_I2C_READ_MSG(client, addr_buf, buf, MLXSW_I2C_MBOX_SIZE); + int err; + + /* Read mail boxes offsets. */ + mlxsw_i2c_set_slave_addr(addr_buf, MLXSW_I2C_CIR2_BASE); + err = i2c_transfer(client->adapter, mbox_cmd, 2); + if (err != 2) { + dev_err(&client->dev, "Could not obtain mail boxes\n"); + if (!err) + return -EIO; + else + return err; + } + + /* Convert mail boxes. */ + mlxsw_i2c_convert_mbox(mlxsw_i2c, &buf[MLXSW_I2C_MBOX_OUT_PARAM_OFF]); + + return err; +} + +/* Routine sends I2C write transaction to ASIC device. */ +static int +mlxsw_i2c_write(struct device *dev, size_t in_mbox_size, u8 *in_mbox, int num, + u8 *p_status) +{ + struct i2c_client *client = to_i2c_client(dev); + struct mlxsw_i2c *mlxsw_i2c = i2c_get_clientdata(client); + unsigned long timeout = msecs_to_jiffies(MLXSW_I2C_TIMEOUT_MSECS); + u8 tran_buf[MLXSW_I2C_MAX_BUFF_SIZE + MLXSW_I2C_ADDR_BUF_SIZE]; + int off = mlxsw_i2c->cmd.mb_off_in, chunk_size, i, j; + unsigned long end; + struct i2c_msg write_tran = + MLXSW_I2C_WRITE_MSG(client, tran_buf, MLXSW_I2C_PUSH_CMD_SIZE); + int err; + + for (i = 0; i < num; i++) { + chunk_size = (in_mbox_size > MLXSW_I2C_BLK_MAX) ? + MLXSW_I2C_BLK_MAX : in_mbox_size; + write_tran.len = MLXSW_I2C_ADDR_WIDTH + chunk_size; + mlxsw_i2c_set_slave_addr(tran_buf, off); + memcpy(&tran_buf[MLXSW_I2C_ADDR_BUF_SIZE], in_mbox + + MLXSW_I2C_BLK_MAX * i, chunk_size); + + j = 0; + end = jiffies + timeout; + do { + err = i2c_transfer(client->adapter, &write_tran, 1); + if (err == 1) + break; + + cond_resched(); + } while ((time_before(jiffies, end)) || + (j++ < MLXSW_I2C_RETRY)); + + if (err != 1) { + if (!err) + err = -EIO; + return err; + } + + off += chunk_size; + in_mbox_size -= chunk_size; + } + + /* Prepare and write out Command Interface Register for transaction. */ + err = mlxsw_i2c_write_cmd(client, mlxsw_i2c, 0); + if (err) { + dev_err(&client->dev, "Could not start transaction"); + return -EIO; + } + + /* Wait until go bit is cleared. 
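Because a single transfer carries at most MLXSW_I2C_BLK_MAX (32) payload bytes, mlxsw_i2c_write() above walks the input mailbox in 32-byte chunks and advances the device offset by the chunk size each time, while mlxsw_i2c_cmd() further down derives the number of transfers by rounding the register size up to whole blocks. A small sketch of that split, with a made-up payload size and mailbox offset:

#include <stdio.h>

#define BLK_MAX 32      /* MLXSW_I2C_BLK_MAX: payload bytes per I2C transfer */

int main(void)
{
        /* Hypothetical register payload size in bytes and input mailbox offset. */
        unsigned int reg_size = 84;
        unsigned int off = 0x72100;
        unsigned int num = reg_size / BLK_MAX + (reg_size % BLK_MAX ? 1 : 0);
        unsigned int left = reg_size;
        unsigned int i;

        printf("%u bytes -> %u transfers\n", reg_size, num);
        for (i = 0; i < num; i++) {
                unsigned int chunk = left > BLK_MAX ? BLK_MAX : left;

                printf("  transfer %u: device offset 0x%x, %u bytes\n",
                       i, off, chunk);
                off += chunk;
                left -= chunk;
        }
        return 0;
}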
*/ + err = mlxsw_i2c_wait_go_bit(client, mlxsw_i2c, p_status); + if (err) { + dev_err(&client->dev, "HW semaphore is not released"); + return err; + } + + /* Validate transaction completion status. */ + if (*p_status) { + dev_err(&client->dev, "Bad transaction completion status %x\n", + *p_status); + return -EIO; + } + + return 0; +} + +/* Routine executes I2C command. */ +static int +mlxsw_i2c_cmd(struct device *dev, size_t in_mbox_size, u8 *in_mbox, + size_t out_mbox_size, u8 *out_mbox, u8 *status) +{ + struct i2c_client *client = to_i2c_client(dev); + struct mlxsw_i2c *mlxsw_i2c = i2c_get_clientdata(client); + unsigned long timeout = msecs_to_jiffies(MLXSW_I2C_TIMEOUT_MSECS); + u8 tran_buf[MLXSW_I2C_ADDR_BUF_SIZE]; + int num, chunk_size, reg_size, i, j; + int off = mlxsw_i2c->cmd.mb_off_out; + unsigned long end; + struct i2c_msg read_tran[] = + MLXSW_I2C_READ_MSG(client, tran_buf, NULL, 0); + int err; + + WARN_ON(in_mbox_size % sizeof(u32) || out_mbox_size % sizeof(u32)); + + reg_size = mlxsw_i2c_get_reg_size(in_mbox); + num = reg_size / MLXSW_I2C_BLK_MAX; + if (reg_size % MLXSW_I2C_BLK_MAX) + num++; + + if (mutex_lock_interruptible(&mlxsw_i2c->cmd.lock) < 0) { + dev_err(&client->dev, "Could not acquire lock"); + return -EINVAL; + } + + err = mlxsw_i2c_write(dev, reg_size, in_mbox, num, status); + if (err) + goto cmd_fail; + + /* No out mailbox is case of write transaction. */ + if (!out_mbox) { + mutex_unlock(&mlxsw_i2c->cmd.lock); + return 0; + } + + /* Send read transaction to get output mailbox content. */ + read_tran[1].buf = out_mbox; + for (i = 0; i < num; i++) { + chunk_size = (reg_size > MLXSW_I2C_BLK_MAX) ? + MLXSW_I2C_BLK_MAX : reg_size; + read_tran[1].len = chunk_size; + mlxsw_i2c_set_slave_addr(tran_buf, off); + + j = 0; + end = jiffies + timeout; + do { + err = i2c_transfer(client->adapter, read_tran, + ARRAY_SIZE(read_tran)); + if (err == ARRAY_SIZE(read_tran)) + break; + + cond_resched(); + } while ((time_before(jiffies, end)) || + (j++ < MLXSW_I2C_RETRY)); + + if (err != ARRAY_SIZE(read_tran)) { + if (!err) + err = -EIO; + + goto cmd_fail; + } + + off += chunk_size; + reg_size -= chunk_size; + read_tran[1].buf += chunk_size; + } + + mutex_unlock(&mlxsw_i2c->cmd.lock); + + return 0; + +cmd_fail: + mutex_unlock(&mlxsw_i2c->cmd.lock); + return err; +} + +static int mlxsw_i2c_cmd_exec(void *bus_priv, u16 opcode, u8 opcode_mod, + u32 in_mod, bool out_mbox_direct, + char *in_mbox, size_t in_mbox_size, + char *out_mbox, size_t out_mbox_size, + u8 *status) +{ + struct mlxsw_i2c *mlxsw_i2c = bus_priv; + + return mlxsw_i2c_cmd(mlxsw_i2c->dev, in_mbox_size, in_mbox, + out_mbox_size, out_mbox, status); +} + +static bool mlxsw_i2c_skb_transmit_busy(void *bus_priv, + const struct mlxsw_tx_info *tx_info) +{ + return false; +} + +static int mlxsw_i2c_skb_transmit(void *bus_priv, struct sk_buff *skb, + const struct mlxsw_tx_info *tx_info) +{ + return 0; +} + +static int +mlxsw_i2c_init(void *bus_priv, struct mlxsw_core *mlxsw_core, + const struct mlxsw_config_profile *profile, + struct mlxsw_res *resources) +{ + struct mlxsw_i2c *mlxsw_i2c = bus_priv; + + mlxsw_i2c->core = mlxsw_core; + + return 0; +} + +static void mlxsw_i2c_fini(void *bus_priv) +{ + struct mlxsw_i2c *mlxsw_i2c = bus_priv; + + mlxsw_i2c->core = NULL; +} + +static const struct mlxsw_bus mlxsw_i2c_bus = { + .kind = "i2c", + .init = mlxsw_i2c_init, + .fini = mlxsw_i2c_fini, + .skb_transmit_busy = mlxsw_i2c_skb_transmit_busy, + .skb_transmit = mlxsw_i2c_skb_transmit, + .cmd_exec = mlxsw_i2c_cmd_exec, +}; + +static int 
mlxsw_i2c_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct mlxsw_i2c *mlxsw_i2c; + u8 status; + int err; + + mlxsw_i2c = devm_kzalloc(&client->dev, sizeof(*mlxsw_i2c), GFP_KERNEL); + if (!mlxsw_i2c) + return -ENOMEM; + + i2c_set_clientdata(client, mlxsw_i2c); + mutex_init(&mlxsw_i2c->cmd.lock); + + /* In order to use mailboxes through the i2c, special area is reserved + * on the i2c address space that can be used for input and output + * mailboxes. Such mailboxes are called local mailboxes. When using a + * local mailbox, software should specify 0 as the Input/Output + * parameters. The location of the Local Mailbox addresses on the i2c + * space can be retrieved through the QUERY_FW command. + * For this purpose QUERY_FW is to be issued with opcode modifier equal + * 0x01. For such command the output parameter is an immediate value. + * Here QUERY_FW command is invoked for ASIC probing and for getting + * local mailboxes addresses from immedate output parameters. + */ + + /* Prepare and write out Command Interface Register for transaction */ + err = mlxsw_i2c_write_cmd(client, mlxsw_i2c, 1); + if (err) { + dev_err(&client->dev, "Could not start transaction"); + goto errout; + } + + /* Wait until go bit is cleared. */ + err = mlxsw_i2c_wait_go_bit(client, mlxsw_i2c, &status); + if (err) { + dev_err(&client->dev, "HW semaphore is not released"); + goto errout; + } + + /* Validate transaction completion status. */ + if (status) { + dev_err(&client->dev, "Bad transaction completion status %x\n", + status); + err = -EIO; + goto errout; + } + + /* Get mailbox offsets. */ + err = mlxsw_i2c_get_mbox(client, mlxsw_i2c); + if (err < 0) { + dev_err(&client->dev, "Fail to get mailboxes\n"); + goto errout; + } + + dev_info(&client->dev, "%s mb size=%x off=0x%08x out mb size=%x off=0x%08x\n", + id->name, mlxsw_i2c->cmd.mb_size_in, + mlxsw_i2c->cmd.mb_off_in, mlxsw_i2c->cmd.mb_size_out, + mlxsw_i2c->cmd.mb_off_out); + + /* Register device bus. */ + mlxsw_i2c->bus_info.device_kind = id->name; + mlxsw_i2c->bus_info.device_name = client->name; + mlxsw_i2c->bus_info.dev = &client->dev; + mlxsw_i2c->dev = &client->dev; + + err = mlxsw_core_bus_device_register(&mlxsw_i2c->bus_info, + &mlxsw_i2c_bus, mlxsw_i2c, false, + NULL); + if (err) { + dev_err(&client->dev, "Fail to register core bus\n"); + return err; + } + + return 0; + +errout: + i2c_set_clientdata(client, NULL); + + return err; +} + +static int mlxsw_i2c_remove(struct i2c_client *client) +{ + struct mlxsw_i2c *mlxsw_i2c = i2c_get_clientdata(client); + + mlxsw_core_bus_device_unregister(mlxsw_i2c->core, false); + mutex_destroy(&mlxsw_i2c->cmd.lock); + + return 0; +} + +int mlxsw_i2c_driver_register(struct i2c_driver *i2c_driver) +{ + i2c_driver->probe = mlxsw_i2c_probe; + i2c_driver->remove = mlxsw_i2c_remove; + return i2c_add_driver(i2c_driver); +} +EXPORT_SYMBOL(mlxsw_i2c_driver_register); + +void mlxsw_i2c_driver_unregister(struct i2c_driver *i2c_driver) +{ + i2c_del_driver(i2c_driver); +} +EXPORT_SYMBOL(mlxsw_i2c_driver_unregister); + +MODULE_AUTHOR("Vadim Pasternak "); +MODULE_DESCRIPTION("Mellanox switch I2C interface driver"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/ethernet/mellanox/mlxsw/i2c.h b/drivers/net/ethernet/mellanox/mlxsw/i2c.h new file mode 100644 index 0000000..daa24b2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/i2c.h @@ -0,0 +1,60 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/i2c.h + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
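Both the command path and the probe path above gate on mlxsw_i2c_wait_go_bit(), which keeps re-reading the CIR control word until firmware clears the GO bit (or the timeout and retry budget run out) and then reports the status field from bits 31:24. The sketch below models only that polling shape; read_ctrl_word() is a stand-in for the real I2C status read and the retry budget is simplified:

#include <stdbool.h>
#include <stdio.h>

#define GO_BIT    (1u << 23)    /* MLXSW_I2C_GO_BIT_SHIFT */
#define MAX_POLLS 5             /* stands in for the timeout/retry budget */

/* Hypothetical status read: pretend the device finishes after two polls. */
static unsigned int read_ctrl_word(void)
{
        static int polls;

        return ++polls < 3 ? GO_BIT : 0x00000000;   /* status field 0 == success */
}

int main(void)
{
        bool done = false;
        unsigned int ctrl = 0;
        int i;

        for (i = 0; i < MAX_POLLS; i++) {
                ctrl = read_ctrl_word();
                if (!(ctrl & GO_BIT)) {     /* firmware cleared GO: command consumed */
                        done = true;
                        break;
                }
        }
        if (!done) {
                puts("timed out waiting for GO bit");
                return 1;
        }
        printf("command done, status=%u\n", ctrl >> 24);
        return 0;
}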
+ * Copyright (c) 2016 Vadim Pasternak + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_I2C_H +#define _MLXSW_I2C_H + +#include + +#if IS_ENABLED(CONFIG_MLXSW_I2C) + +int mlxsw_i2c_driver_register(struct i2c_driver *i2c_driver); +void mlxsw_i2c_driver_unregister(struct i2c_driver *i2c_driver); + +#else + +static inline int +mlxsw_i2c_driver_register(struct i2c_driver *i2c_driver) +{ + return -ENODEV; +} + +static inline void +mlxsw_i2c_driver_unregister(struct i2c_driver *i2c_driver) +{ +} + +#endif + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/ib.h b/drivers/net/ethernet/mellanox/mlxsw/ib.h new file mode 100644 index 0000000..ce313aa --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/ib.h @@ -0,0 +1,39 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/ib.h + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Elad Raz + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _MLXSW_IB_H +#define _MLXSW_IB_H + +#define MLXSW_IB_DEFAULT_MTU 4096 + +#endif /* _MLXSW_IB_H */ diff --git a/drivers/net/ethernet/mellanox/mlxsw/item.h b/drivers/net/ethernet/mellanox/mlxsw/item.h new file mode 100644 index 0000000..31c886e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/item.h @@ -0,0 +1,541 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/item.h + * Copyright (c) 2015-2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015-2017 Jiri Pirko + * Copyright (c) 2015 Ido Schimmel + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
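item.h, whose body follows, is the generic accessor layer used by the flex-key tables earlier in this patch (core_acl_flex_keys.h includes it): each struct mlxsw_item records a byte offset, a bit shift and a bit width, and helpers such as __mlxsw_item_get32()/__mlxsw_item_set32() mask that field in and out of big-endian 32-bit words. A standalone round-trip sketch of the same masking; the VID geometry (offset 0x10, shift 8, 12 bits) is taken from the flex-key element table above:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Field geometry in the style of struct mlxsw_item: byte offset, shift, width. */
struct field { unsigned int offset, shift, bits; };

static uint32_t be32_get(const uint8_t *p)
{
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8) | p[3];
}

static void be32_put(uint8_t *p, uint32_t v)
{
        p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
}

/* Same idea as __mlxsw_item_set32(): clear the field's mask, then OR the value in. */
static void field_set(uint8_t *buf, const struct field *f, uint32_t val)
{
        uint32_t mask = (((uint32_t)1 << f->bits) - 1) << f->shift;
        uint32_t word = be32_get(buf + f->offset);

        word = (word & ~mask) | ((val << f->shift) & mask);
        be32_put(buf + f->offset, word);
}

/* Same idea as __mlxsw_item_get32(): shift down, then mask to the field width. */
static uint32_t field_get(const uint8_t *buf, const struct field *f)
{
        return (be32_get(buf + f->offset) >> f->shift) &
               (((uint32_t)1 << f->bits) - 1);
}

int main(void)
{
        uint8_t key[0x40];      /* MLXSW_AFK_ELEMENT_STORAGE_SIZE */
        const struct field vid = { .offset = 0x10, .shift = 8, .bits = 12 };

        memset(key, 0, sizeof(key));
        field_set(key, &vid, 100);
        printf("VID read back: %u\n", field_get(key, &vid));
        return 0;
}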
+ */ + +#ifndef _MLXSW_ITEM_H +#define _MLXSW_ITEM_H + +#include +#include +#include + +struct mlxsw_item { + unsigned short offset; /* bytes in container */ + short step; /* step in bytes for indexed items */ + unsigned short in_step_offset; /* offset within one step */ + unsigned char shift; /* shift in bits */ + unsigned char element_size; /* size of element in bit array */ + bool no_real_shift; + union { + unsigned char bits; + unsigned short bytes; + } size; + const char *name; +}; + +static inline unsigned int +__mlxsw_item_offset(const struct mlxsw_item *item, unsigned short index, + size_t typesize) +{ + BUG_ON(index && !item->step); + if (item->offset % typesize != 0 || + item->step % typesize != 0 || + item->in_step_offset % typesize != 0) { + pr_err("mlxsw: item bug (name=%s,offset=%x,step=%x,in_step_offset=%x,typesize=%zx)\n", + item->name, item->offset, item->step, + item->in_step_offset, typesize); + BUG(); + } + + return ((item->offset + item->step * index + item->in_step_offset) / + typesize); +} + +static inline u8 __mlxsw_item_get8(const char *buf, + const struct mlxsw_item *item, + unsigned short index) +{ + unsigned int offset = __mlxsw_item_offset(item, index, sizeof(u8)); + u8 *b = (u8 *) buf; + u8 tmp; + + tmp = b[offset]; + tmp >>= item->shift; + tmp &= GENMASK(item->size.bits - 1, 0); + if (item->no_real_shift) + tmp <<= item->shift; + return tmp; +} + +static inline void __mlxsw_item_set8(char *buf, const struct mlxsw_item *item, + unsigned short index, u8 val) +{ + unsigned int offset = __mlxsw_item_offset(item, index, + sizeof(u8)); + u8 *b = (u8 *) buf; + u8 mask = GENMASK(item->size.bits - 1, 0) << item->shift; + u8 tmp; + + if (!item->no_real_shift) + val <<= item->shift; + val &= mask; + tmp = b[offset]; + tmp &= ~mask; + tmp |= val; + b[offset] = tmp; +} + +static inline u16 __mlxsw_item_get16(const char *buf, + const struct mlxsw_item *item, + unsigned short index) +{ + unsigned int offset = __mlxsw_item_offset(item, index, sizeof(u16)); + __be16 *b = (__be16 *) buf; + u16 tmp; + + tmp = be16_to_cpu(b[offset]); + tmp >>= item->shift; + tmp &= GENMASK(item->size.bits - 1, 0); + if (item->no_real_shift) + tmp <<= item->shift; + return tmp; +} + +static inline void __mlxsw_item_set16(char *buf, const struct mlxsw_item *item, + unsigned short index, u16 val) +{ + unsigned int offset = __mlxsw_item_offset(item, index, + sizeof(u16)); + __be16 *b = (__be16 *) buf; + u16 mask = GENMASK(item->size.bits - 1, 0) << item->shift; + u16 tmp; + + if (!item->no_real_shift) + val <<= item->shift; + val &= mask; + tmp = be16_to_cpu(b[offset]); + tmp &= ~mask; + tmp |= val; + b[offset] = cpu_to_be16(tmp); +} + +static inline u32 __mlxsw_item_get32(const char *buf, + const struct mlxsw_item *item, + unsigned short index) +{ + unsigned int offset = __mlxsw_item_offset(item, index, sizeof(u32)); + __be32 *b = (__be32 *) buf; + u32 tmp; + + tmp = be32_to_cpu(b[offset]); + tmp >>= item->shift; + tmp &= GENMASK(item->size.bits - 1, 0); + if (item->no_real_shift) + tmp <<= item->shift; + return tmp; +} + +static inline void __mlxsw_item_set32(char *buf, const struct mlxsw_item *item, + unsigned short index, u32 val) +{ + unsigned int offset = __mlxsw_item_offset(item, index, + sizeof(u32)); + __be32 *b = (__be32 *) buf; + u32 mask = GENMASK(item->size.bits - 1, 0) << item->shift; + u32 tmp; + + if (!item->no_real_shift) + val <<= item->shift; + val &= mask; + tmp = be32_to_cpu(b[offset]); + tmp &= ~mask; + tmp |= val; + b[offset] = cpu_to_be32(tmp); +} + +static inline u64 
__mlxsw_item_get64(const char *buf, + const struct mlxsw_item *item, + unsigned short index) +{ + unsigned int offset = __mlxsw_item_offset(item, index, sizeof(u64)); + __be64 *b = (__be64 *) buf; + u64 tmp; + + tmp = be64_to_cpu(b[offset]); + tmp >>= item->shift; + tmp &= GENMASK_ULL(item->size.bits - 1, 0); + if (item->no_real_shift) + tmp <<= item->shift; + return tmp; +} + +static inline void __mlxsw_item_set64(char *buf, const struct mlxsw_item *item, + unsigned short index, u64 val) +{ + unsigned int offset = __mlxsw_item_offset(item, index, sizeof(u64)); + __be64 *b = (__be64 *) buf; + u64 mask = GENMASK_ULL(item->size.bits - 1, 0) << item->shift; + u64 tmp; + + if (!item->no_real_shift) + val <<= item->shift; + val &= mask; + tmp = be64_to_cpu(b[offset]); + tmp &= ~mask; + tmp |= val; + b[offset] = cpu_to_be64(tmp); +} + +static inline void __mlxsw_item_memcpy_from(const char *buf, char *dst, + const struct mlxsw_item *item, + unsigned short index) +{ + unsigned int offset = __mlxsw_item_offset(item, index, sizeof(char)); + + memcpy(dst, &buf[offset], item->size.bytes); +} + +static inline void __mlxsw_item_memcpy_to(char *buf, const char *src, + const struct mlxsw_item *item, + unsigned short index) +{ + unsigned int offset = __mlxsw_item_offset(item, index, sizeof(char)); + + memcpy(&buf[offset], src, item->size.bytes); +} + +static inline char *__mlxsw_item_data(char *buf, const struct mlxsw_item *item, + unsigned short index) +{ + unsigned int offset = __mlxsw_item_offset(item, index, sizeof(char)); + + return &buf[offset]; +} + +static inline u16 +__mlxsw_item_bit_array_offset(const struct mlxsw_item *item, + u16 index, u8 *shift) +{ + u16 max_index, be_index; + u16 offset; /* byte offset inside the array */ + u8 in_byte_index; + + BUG_ON(index && !item->element_size); + if (item->offset % sizeof(u32) != 0 || + BITS_PER_BYTE % item->element_size != 0) { + pr_err("mlxsw: item bug (name=%s,offset=%x,element_size=%x)\n", + item->name, item->offset, item->element_size); + BUG(); + } + + max_index = (item->size.bytes << 3) / item->element_size - 1; + be_index = max_index - index; + offset = be_index * item->element_size >> 3; + in_byte_index = index % (BITS_PER_BYTE / item->element_size); + *shift = in_byte_index * item->element_size; + + return item->offset + offset; +} + +static inline u8 __mlxsw_item_bit_array_get(const char *buf, + const struct mlxsw_item *item, + u16 index) +{ + u8 shift, tmp; + u16 offset = __mlxsw_item_bit_array_offset(item, index, &shift); + + tmp = buf[offset]; + tmp >>= shift; + tmp &= GENMASK(item->element_size - 1, 0); + return tmp; +} + +static inline void __mlxsw_item_bit_array_set(char *buf, + const struct mlxsw_item *item, + u16 index, u8 val) +{ + u8 shift, tmp; + u16 offset = __mlxsw_item_bit_array_offset(item, index, &shift); + u8 mask = GENMASK(item->element_size - 1, 0) << shift; + + val <<= shift; + val &= mask; + tmp = buf[offset]; + tmp &= ~mask; + tmp |= val; + buf[offset] = tmp; +} + +#define __ITEM_NAME(_type, _cname, _iname) \ + mlxsw_##_type##_##_cname##_##_iname##_item + +/* _type: cmd_mbox, reg, etc. + * _cname: containter name (e.g. 
command name, register name) + * _iname: item name within the container + */ + +#define MLXSW_ITEM8(_type, _cname, _iname, _offset, _shift, _sizebits) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .shift = _shift, \ + .size = {.bits = _sizebits,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline u8 mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf) \ +{ \ + return __mlxsw_item_get8(buf, &__ITEM_NAME(_type, _cname, _iname), 0); \ +} \ +static inline void mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, u8 val)\ +{ \ + __mlxsw_item_set8(buf, &__ITEM_NAME(_type, _cname, _iname), 0, val); \ +} + +#define MLXSW_ITEM8_INDEXED(_type, _cname, _iname, _offset, _shift, _sizebits, \ + _step, _instepoffset, _norealshift) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .step = _step, \ + .in_step_offset = _instepoffset, \ + .shift = _shift, \ + .no_real_shift = _norealshift, \ + .size = {.bits = _sizebits,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline u8 \ +mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf, unsigned short index)\ +{ \ + return __mlxsw_item_get8(buf, &__ITEM_NAME(_type, _cname, _iname), \ + index); \ +} \ +static inline void \ +mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, unsigned short index, \ + u8 val) \ +{ \ + __mlxsw_item_set8(buf, &__ITEM_NAME(_type, _cname, _iname), \ + index, val); \ +} + +#define MLXSW_ITEM16(_type, _cname, _iname, _offset, _shift, _sizebits) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .shift = _shift, \ + .size = {.bits = _sizebits,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline u16 mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf) \ +{ \ + return __mlxsw_item_get16(buf, &__ITEM_NAME(_type, _cname, _iname), 0); \ +} \ +static inline void mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, u16 val)\ +{ \ + __mlxsw_item_set16(buf, &__ITEM_NAME(_type, _cname, _iname), 0, val); \ +} + +#define MLXSW_ITEM16_INDEXED(_type, _cname, _iname, _offset, _shift, _sizebits, \ + _step, _instepoffset, _norealshift) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .step = _step, \ + .in_step_offset = _instepoffset, \ + .shift = _shift, \ + .no_real_shift = _norealshift, \ + .size = {.bits = _sizebits,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline u16 \ +mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf, unsigned short index)\ +{ \ + return __mlxsw_item_get16(buf, &__ITEM_NAME(_type, _cname, _iname), \ + index); \ +} \ +static inline void \ +mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, unsigned short index, \ + u16 val) \ +{ \ + __mlxsw_item_set16(buf, &__ITEM_NAME(_type, _cname, _iname), \ + index, val); \ +} + +#define MLXSW_ITEM32(_type, _cname, _iname, _offset, _shift, _sizebits) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .shift = _shift, \ + .size = {.bits = _sizebits,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline u32 mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf) \ +{ \ + return __mlxsw_item_get32(buf, &__ITEM_NAME(_type, _cname, _iname), 0); \ +} \ +static inline void mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, u32 val)\ +{ \ + __mlxsw_item_set32(buf, &__ITEM_NAME(_type, _cname, _iname), 0, val); \ +} + +#define MLXSW_ITEM32_INDEXED(_type, _cname, _iname, 
_offset, _shift, _sizebits, \ + _step, _instepoffset, _norealshift) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .step = _step, \ + .in_step_offset = _instepoffset, \ + .shift = _shift, \ + .no_real_shift = _norealshift, \ + .size = {.bits = _sizebits,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline u32 \ +mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf, unsigned short index)\ +{ \ + return __mlxsw_item_get32(buf, &__ITEM_NAME(_type, _cname, _iname), \ + index); \ +} \ +static inline void \ +mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, unsigned short index, \ + u32 val) \ +{ \ + __mlxsw_item_set32(buf, &__ITEM_NAME(_type, _cname, _iname), \ + index, val); \ +} + +#define MLXSW_ITEM64(_type, _cname, _iname, _offset, _shift, _sizebits) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .shift = _shift, \ + .size = {.bits = _sizebits,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline u64 mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf) \ +{ \ + return __mlxsw_item_get64(buf, &__ITEM_NAME(_type, _cname, _iname), 0); \ +} \ +static inline void mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, u64 val)\ +{ \ + __mlxsw_item_set64(buf, &__ITEM_NAME(_type, _cname, _iname), 0, val); \ +} + +#define MLXSW_ITEM64_INDEXED(_type, _cname, _iname, _offset, _shift, \ + _sizebits, _step, _instepoffset, _norealshift) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .step = _step, \ + .in_step_offset = _instepoffset, \ + .shift = _shift, \ + .no_real_shift = _norealshift, \ + .size = {.bits = _sizebits,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline u64 \ +mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf, unsigned short index)\ +{ \ + return __mlxsw_item_get64(buf, &__ITEM_NAME(_type, _cname, _iname), \ + index); \ +} \ +static inline void \ +mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, unsigned short index, \ + u64 val) \ +{ \ + __mlxsw_item_set64(buf, &__ITEM_NAME(_type, _cname, _iname), \ + index, val); \ +} + +#define MLXSW_ITEM_BUF(_type, _cname, _iname, _offset, _sizebytes) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .size = {.bytes = _sizebytes,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline void \ +mlxsw_##_type##_##_cname##_##_iname##_memcpy_from(const char *buf, char *dst) \ +{ \ + __mlxsw_item_memcpy_from(buf, dst, \ + &__ITEM_NAME(_type, _cname, _iname), 0); \ +} \ +static inline void \ +mlxsw_##_type##_##_cname##_##_iname##_memcpy_to(char *buf, const char *src) \ +{ \ + __mlxsw_item_memcpy_to(buf, src, \ + &__ITEM_NAME(_type, _cname, _iname), 0); \ +} \ +static inline char * \ +mlxsw_##_type##_##_cname##_##_iname##_data(char *buf) \ +{ \ + return __mlxsw_item_data(buf, &__ITEM_NAME(_type, _cname, _iname), 0); \ +} + +#define MLXSW_ITEM_BUF_INDEXED(_type, _cname, _iname, _offset, _sizebytes, \ + _step, _instepoffset) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .step = _step, \ + .in_step_offset = _instepoffset, \ + .size = {.bytes = _sizebytes,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline void \ +mlxsw_##_type##_##_cname##_##_iname##_memcpy_from(const char *buf, \ + unsigned short index, \ + char *dst) \ +{ \ + __mlxsw_item_memcpy_from(buf, dst, \ + &__ITEM_NAME(_type, _cname, _iname), index); \ +} \ +static 
inline void \ +mlxsw_##_type##_##_cname##_##_iname##_memcpy_to(char *buf, \ + unsigned short index, \ + const char *src) \ +{ \ + __mlxsw_item_memcpy_to(buf, src, \ + &__ITEM_NAME(_type, _cname, _iname), index); \ +} \ +static inline char * \ +mlxsw_##_type##_##_cname##_##_iname##_data(char *buf, unsigned short index) \ +{ \ + return __mlxsw_item_data(buf, \ + &__ITEM_NAME(_type, _cname, _iname), index); \ +} + +#define MLXSW_ITEM_BIT_ARRAY(_type, _cname, _iname, _offset, _sizebytes, \ + _element_size) \ +static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = { \ + .offset = _offset, \ + .element_size = _element_size, \ + .size = {.bytes = _sizebytes,}, \ + .name = #_type "_" #_cname "_" #_iname, \ +}; \ +static inline u8 \ +mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf, u16 index) \ +{ \ + return __mlxsw_item_bit_array_get(buf, \ + &__ITEM_NAME(_type, _cname, _iname), \ + index); \ +} \ +static inline void \ +mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, u16 index, u8 val) \ +{ \ + return __mlxsw_item_bit_array_set(buf, \ + &__ITEM_NAME(_type, _cname, _iname), \ + index, val); \ +} \ + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/minimal.c b/drivers/net/ethernet/mellanox/mlxsw/minimal.c new file mode 100644 index 0000000..3dd1626 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/minimal.c @@ -0,0 +1,97 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/minimal.c + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Vadim Pasternak + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/i2c.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mod_devicetable.h> +#include <linux/types.h> + +#include "core.h" +#include "i2c.h" + +static const char mlxsw_minimal_driver_name[] = "mlxsw_minimal"; + +static const struct mlxsw_config_profile mlxsw_minimal_config_profile; + +static struct mlxsw_driver mlxsw_minimal_driver = { + .kind = mlxsw_minimal_driver_name, + .priv_size = 1, + .profile = &mlxsw_minimal_config_profile, +}; + +static const struct i2c_device_id mlxsw_minimal_i2c_id[] = { + { "mlxsw_minimal", 0}, + { }, +}; + +static struct i2c_driver mlxsw_minimal_i2c_driver = { + .driver.name = "mlxsw_minimal", + .class = I2C_CLASS_HWMON, + .id_table = mlxsw_minimal_i2c_id, +}; + +static int __init mlxsw_minimal_module_init(void) +{ + int err; + + err = mlxsw_core_driver_register(&mlxsw_minimal_driver); + if (err) + return err; + + err = mlxsw_i2c_driver_register(&mlxsw_minimal_i2c_driver); + if (err) + goto err_i2c_driver_register; + + return 0; + +err_i2c_driver_register: + mlxsw_core_driver_unregister(&mlxsw_minimal_driver); + + return err; +} + +static void __exit mlxsw_minimal_module_exit(void) +{ + mlxsw_i2c_driver_unregister(&mlxsw_minimal_i2c_driver); + mlxsw_core_driver_unregister(&mlxsw_minimal_driver); +} + +module_init(mlxsw_minimal_module_init); +module_exit(mlxsw_minimal_module_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Vadim Pasternak "); +MODULE_DESCRIPTION("Mellanox minimal driver"); +MODULE_DEVICE_TABLE(i2c, mlxsw_minimal_i2c_id);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c new file mode 100644 index 0000000..3a93819 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -0,0 +1,1807 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/pci.c + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pci_hw.h" +#include "pci.h" +#include "core.h" +#include "cmd.h" +#include "port.h" +#include "resources.h" + +static const char mlxsw_pci_driver_name[] = "mlxsw_pci"; + +#define mlxsw_pci_write32(mlxsw_pci, reg, val) \ + iowrite32be(val, (mlxsw_pci)->hw_addr + (MLXSW_PCI_ ## reg)) +#define mlxsw_pci_read32(mlxsw_pci, reg) \ + ioread32be((mlxsw_pci)->hw_addr + (MLXSW_PCI_ ## reg)) + +enum mlxsw_pci_queue_type { + MLXSW_PCI_QUEUE_TYPE_SDQ, + MLXSW_PCI_QUEUE_TYPE_RDQ, + MLXSW_PCI_QUEUE_TYPE_CQ, + MLXSW_PCI_QUEUE_TYPE_EQ, +}; + +#define MLXSW_PCI_QUEUE_TYPE_COUNT 4 + +static const u16 mlxsw_pci_doorbell_type_offset[] = { + MLXSW_PCI_DOORBELL_SDQ_OFFSET, /* for type MLXSW_PCI_QUEUE_TYPE_SDQ */ + MLXSW_PCI_DOORBELL_RDQ_OFFSET, /* for type MLXSW_PCI_QUEUE_TYPE_RDQ */ + MLXSW_PCI_DOORBELL_CQ_OFFSET, /* for type MLXSW_PCI_QUEUE_TYPE_CQ */ + MLXSW_PCI_DOORBELL_EQ_OFFSET, /* for type MLXSW_PCI_QUEUE_TYPE_EQ */ +}; + +static const u16 mlxsw_pci_doorbell_arm_type_offset[] = { + 0, /* unused */ + 0, /* unused */ + MLXSW_PCI_DOORBELL_ARM_CQ_OFFSET, /* for type MLXSW_PCI_QUEUE_TYPE_CQ */ + MLXSW_PCI_DOORBELL_ARM_EQ_OFFSET, /* for type MLXSW_PCI_QUEUE_TYPE_EQ */ +}; + +struct mlxsw_pci_mem_item { + char *buf; + dma_addr_t mapaddr; + size_t size; +}; + +struct mlxsw_pci_queue_elem_info { + char *elem; /* pointer to actual dma mapped element mem chunk */ + union { + struct { + struct sk_buff *skb; + } sdq; + struct { + struct sk_buff *skb; + } rdq; + } u; +}; + +struct mlxsw_pci_queue { + spinlock_t lock; /* for queue accesses */ + struct mlxsw_pci_mem_item mem_item; + struct mlxsw_pci_queue_elem_info *elem_info; + u16 producer_counter; + u16 consumer_counter; + u16 count; /* number of elements in queue */ + u8 num; /* queue number */ + u8 elem_size; /* size of one element */ + enum mlxsw_pci_queue_type type; + struct tasklet_struct tasklet; /* queue processing tasklet */ + struct mlxsw_pci *pci; + union { + struct { + u32 comp_sdq_count; + u32 comp_rdq_count; + } cq; + struct { + u32 ev_cmd_count; + u32 ev_comp_count; + u32 ev_other_count; + } eq; + } u; +}; + +struct mlxsw_pci_queue_type_group { + struct mlxsw_pci_queue *q; + u8 count; /* number of queues in group */ +}; + +struct mlxsw_pci { + struct pci_dev *pdev; + u8 __iomem *hw_addr; + struct mlxsw_pci_queue_type_group queues[MLXSW_PCI_QUEUE_TYPE_COUNT]; + u32 doorbell_offset; + struct mlxsw_core *core; + struct { + struct mlxsw_pci_mem_item *items; + unsigned int count; + } fw_area; + struct { + struct mlxsw_pci_mem_item out_mbox; + struct mlxsw_pci_mem_item in_mbox; + struct mutex lock; /* Lock access to command registers */ + bool nopoll; + wait_queue_head_t wait; + bool wait_done; + struct { + u8 status; + u64 out_param; + } comp; + } cmd; + struct mlxsw_bus_info bus_info; + const struct pci_device_id *id; +}; + +static void mlxsw_pci_queue_tasklet_schedule(struct mlxsw_pci_queue *q) +{ + tasklet_schedule(&q->tasklet); +} + +static char *__mlxsw_pci_queue_elem_get(struct mlxsw_pci_queue *q, + size_t elem_size, int elem_index) +{ + return q->mem_item.buf + (elem_size * elem_index); +} + +static struct mlxsw_pci_queue_elem_info * +mlxsw_pci_queue_elem_info_get(struct mlxsw_pci_queue *q, int elem_index) +{ + return &q->elem_info[elem_index]; +} + +static struct mlxsw_pci_queue_elem_info * +mlxsw_pci_queue_elem_info_producer_get(struct mlxsw_pci_queue *q) +{ + int index = q->producer_counter & (q->count 
- 1); + + if ((u16) (q->producer_counter - q->consumer_counter) == q->count) + return NULL; + return mlxsw_pci_queue_elem_info_get(q, index); +} + +static struct mlxsw_pci_queue_elem_info * +mlxsw_pci_queue_elem_info_consumer_get(struct mlxsw_pci_queue *q) +{ + int index = q->consumer_counter & (q->count - 1); + + return mlxsw_pci_queue_elem_info_get(q, index); +} + +static char *mlxsw_pci_queue_elem_get(struct mlxsw_pci_queue *q, int elem_index) +{ + return mlxsw_pci_queue_elem_info_get(q, elem_index)->elem; +} + +static bool mlxsw_pci_elem_hw_owned(struct mlxsw_pci_queue *q, bool owner_bit) +{ + return owner_bit != !!(q->consumer_counter & q->count); +} + +static char * +mlxsw_pci_queue_sw_elem_get(struct mlxsw_pci_queue *q, + u32 (*get_elem_owner_func)(const char *)) +{ + struct mlxsw_pci_queue_elem_info *elem_info; + char *elem; + bool owner_bit; + + elem_info = mlxsw_pci_queue_elem_info_consumer_get(q); + elem = elem_info->elem; + owner_bit = get_elem_owner_func(elem); + if (mlxsw_pci_elem_hw_owned(q, owner_bit)) + return NULL; + q->consumer_counter++; + rmb(); /* make sure we read owned bit before the rest of elem */ + return elem; +} + +static struct mlxsw_pci_queue_type_group * +mlxsw_pci_queue_type_group_get(struct mlxsw_pci *mlxsw_pci, + enum mlxsw_pci_queue_type q_type) +{ + return &mlxsw_pci->queues[q_type]; +} + +static u8 __mlxsw_pci_queue_count(struct mlxsw_pci *mlxsw_pci, + enum mlxsw_pci_queue_type q_type) +{ + struct mlxsw_pci_queue_type_group *queue_group; + + queue_group = mlxsw_pci_queue_type_group_get(mlxsw_pci, q_type); + return queue_group->count; +} + +static u8 mlxsw_pci_sdq_count(struct mlxsw_pci *mlxsw_pci) +{ + return __mlxsw_pci_queue_count(mlxsw_pci, MLXSW_PCI_QUEUE_TYPE_SDQ); +} + +static u8 mlxsw_pci_cq_count(struct mlxsw_pci *mlxsw_pci) +{ + return __mlxsw_pci_queue_count(mlxsw_pci, MLXSW_PCI_QUEUE_TYPE_CQ); +} + +static struct mlxsw_pci_queue * +__mlxsw_pci_queue_get(struct mlxsw_pci *mlxsw_pci, + enum mlxsw_pci_queue_type q_type, u8 q_num) +{ + return &mlxsw_pci->queues[q_type].q[q_num]; +} + +static struct mlxsw_pci_queue *mlxsw_pci_sdq_get(struct mlxsw_pci *mlxsw_pci, + u8 q_num) +{ + return __mlxsw_pci_queue_get(mlxsw_pci, + MLXSW_PCI_QUEUE_TYPE_SDQ, q_num); +} + +static struct mlxsw_pci_queue *mlxsw_pci_rdq_get(struct mlxsw_pci *mlxsw_pci, + u8 q_num) +{ + return __mlxsw_pci_queue_get(mlxsw_pci, + MLXSW_PCI_QUEUE_TYPE_RDQ, q_num); +} + +static struct mlxsw_pci_queue *mlxsw_pci_cq_get(struct mlxsw_pci *mlxsw_pci, + u8 q_num) +{ + return __mlxsw_pci_queue_get(mlxsw_pci, MLXSW_PCI_QUEUE_TYPE_CQ, q_num); +} + +static struct mlxsw_pci_queue *mlxsw_pci_eq_get(struct mlxsw_pci *mlxsw_pci, + u8 q_num) +{ + return __mlxsw_pci_queue_get(mlxsw_pci, MLXSW_PCI_QUEUE_TYPE_EQ, q_num); +} + +static void __mlxsw_pci_queue_doorbell_set(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q, + u16 val) +{ + mlxsw_pci_write32(mlxsw_pci, + DOORBELL(mlxsw_pci->doorbell_offset, + mlxsw_pci_doorbell_type_offset[q->type], + q->num), val); +} + +static void __mlxsw_pci_queue_doorbell_arm_set(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q, + u16 val) +{ + mlxsw_pci_write32(mlxsw_pci, + DOORBELL(mlxsw_pci->doorbell_offset, + mlxsw_pci_doorbell_arm_type_offset[q->type], + q->num), val); +} + +static void mlxsw_pci_queue_doorbell_producer_ring(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q) +{ + wmb(); /* ensure all writes are done before we ring a bell */ + __mlxsw_pci_queue_doorbell_set(mlxsw_pci, q, q->producer_counter); +} + +static void 
mlxsw_pci_queue_doorbell_consumer_ring(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q) +{ + wmb(); /* ensure all writes are done before we ring a bell */ + __mlxsw_pci_queue_doorbell_set(mlxsw_pci, q, + q->consumer_counter + q->count); +} + +static void +mlxsw_pci_queue_doorbell_arm_consumer_ring(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q) +{ + wmb(); /* ensure all writes are done before we ring a bell */ + __mlxsw_pci_queue_doorbell_arm_set(mlxsw_pci, q, q->consumer_counter); +} + +static dma_addr_t __mlxsw_pci_queue_page_get(struct mlxsw_pci_queue *q, + int page_index) +{ + return q->mem_item.mapaddr + MLXSW_PCI_PAGE_SIZE * page_index; +} + +static int mlxsw_pci_sdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox, + struct mlxsw_pci_queue *q) +{ + int i; + int err; + + q->producer_counter = 0; + q->consumer_counter = 0; + + /* Set CQ of same number of this SDQ. */ + mlxsw_cmd_mbox_sw2hw_dq_cq_set(mbox, q->num); + mlxsw_cmd_mbox_sw2hw_dq_sdq_tclass_set(mbox, 3); + mlxsw_cmd_mbox_sw2hw_dq_log2_dq_sz_set(mbox, 3); /* 8 pages */ + for (i = 0; i < MLXSW_PCI_AQ_PAGES; i++) { + dma_addr_t mapaddr = __mlxsw_pci_queue_page_get(q, i); + + mlxsw_cmd_mbox_sw2hw_dq_pa_set(mbox, i, mapaddr); + } + + err = mlxsw_cmd_sw2hw_sdq(mlxsw_pci->core, mbox, q->num); + if (err) + return err; + mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q); + return 0; +} + +static void mlxsw_pci_sdq_fini(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q) +{ + mlxsw_cmd_hw2sw_sdq(mlxsw_pci->core, q->num); +} + +static int mlxsw_pci_wqe_frag_map(struct mlxsw_pci *mlxsw_pci, char *wqe, + int index, char *frag_data, size_t frag_len, + int direction) +{ + struct pci_dev *pdev = mlxsw_pci->pdev; + dma_addr_t mapaddr; + + mapaddr = pci_map_single(pdev, frag_data, frag_len, direction); + if (unlikely(pci_dma_mapping_error(pdev, mapaddr))) { + dev_err_ratelimited(&pdev->dev, "failed to dma map tx frag\n"); + return -EIO; + } + mlxsw_pci_wqe_address_set(wqe, index, mapaddr); + mlxsw_pci_wqe_byte_count_set(wqe, index, frag_len); + return 0; +} + +static void mlxsw_pci_wqe_frag_unmap(struct mlxsw_pci *mlxsw_pci, char *wqe, + int index, int direction) +{ + struct pci_dev *pdev = mlxsw_pci->pdev; + size_t frag_len = mlxsw_pci_wqe_byte_count_get(wqe, index); + dma_addr_t mapaddr = mlxsw_pci_wqe_address_get(wqe, index); + + if (!frag_len) + return; + pci_unmap_single(pdev, mapaddr, frag_len, direction); +} + +static int mlxsw_pci_rdq_skb_alloc(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue_elem_info *elem_info) +{ + size_t buf_len = MLXSW_PORT_MAX_MTU; + char *wqe = elem_info->elem; + struct sk_buff *skb; + int err; + + elem_info->u.rdq.skb = NULL; + skb = netdev_alloc_skb_ip_align(NULL, buf_len); + if (!skb) + return -ENOMEM; + + /* Assume that wqe was previously zeroed. 
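Receive WQEs use only scatter/gather entry 0, so mapping the new skb into entry 0 below leaves the remaining entries with a zero byte count.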
*/ + + err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data, + buf_len, DMA_FROM_DEVICE); + if (err) + goto err_frag_map; + + elem_info->u.rdq.skb = skb; + return 0; + +err_frag_map: + dev_kfree_skb_any(skb); + return err; +} + +static void mlxsw_pci_rdq_skb_free(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue_elem_info *elem_info) +{ + struct sk_buff *skb; + char *wqe; + + skb = elem_info->u.rdq.skb; + wqe = elem_info->elem; + + mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE); + dev_kfree_skb_any(skb); +} + +static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox, + struct mlxsw_pci_queue *q) +{ + struct mlxsw_pci_queue_elem_info *elem_info; + u8 sdq_count = mlxsw_pci_sdq_count(mlxsw_pci); + int i; + int err; + + q->producer_counter = 0; + q->consumer_counter = 0; + + /* Set CQ of same number of this RDQ with base + * above SDQ count as the lower ones are assigned to SDQs. + */ + mlxsw_cmd_mbox_sw2hw_dq_cq_set(mbox, sdq_count + q->num); + mlxsw_cmd_mbox_sw2hw_dq_log2_dq_sz_set(mbox, 3); /* 8 pages */ + for (i = 0; i < MLXSW_PCI_AQ_PAGES; i++) { + dma_addr_t mapaddr = __mlxsw_pci_queue_page_get(q, i); + + mlxsw_cmd_mbox_sw2hw_dq_pa_set(mbox, i, mapaddr); + } + + err = mlxsw_cmd_sw2hw_rdq(mlxsw_pci->core, mbox, q->num); + if (err) + return err; + + mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q); + + for (i = 0; i < q->count; i++) { + elem_info = mlxsw_pci_queue_elem_info_producer_get(q); + BUG_ON(!elem_info); + err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info); + if (err) + goto rollback; + /* Everything is set up, ring doorbell to pass elem to HW */ + q->producer_counter++; + mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q); + } + + return 0; + +rollback: + for (i--; i >= 0; i--) { + elem_info = mlxsw_pci_queue_elem_info_get(q, i); + mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info); + } + mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num); + + return err; +} + +static void mlxsw_pci_rdq_fini(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q) +{ + struct mlxsw_pci_queue_elem_info *elem_info; + int i; + + mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num); + for (i = 0; i < q->count; i++) { + elem_info = mlxsw_pci_queue_elem_info_get(q, i); + mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info); + } +} + +static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox, + struct mlxsw_pci_queue *q) +{ + int i; + int err; + + q->consumer_counter = 0; + + for (i = 0; i < q->count; i++) { + char *elem = mlxsw_pci_queue_elem_get(q, i); + + mlxsw_pci_cqe_owner_set(elem, 1); + } + + mlxsw_cmd_mbox_sw2hw_cq_cv_set(mbox, 0); /* CQE ver 0 */ + mlxsw_cmd_mbox_sw2hw_cq_c_eqn_set(mbox, MLXSW_PCI_EQ_COMP_NUM); + mlxsw_cmd_mbox_sw2hw_cq_st_set(mbox, 0); + mlxsw_cmd_mbox_sw2hw_cq_log_cq_size_set(mbox, ilog2(q->count)); + for (i = 0; i < MLXSW_PCI_AQ_PAGES; i++) { + dma_addr_t mapaddr = __mlxsw_pci_queue_page_get(q, i); + + mlxsw_cmd_mbox_sw2hw_cq_pa_set(mbox, i, mapaddr); + } + err = mlxsw_cmd_sw2hw_cq(mlxsw_pci->core, mbox, q->num); + if (err) + return err; + mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); + mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q); + return 0; +} + +static void mlxsw_pci_cq_fini(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q) +{ + mlxsw_cmd_hw2sw_cq(mlxsw_pci->core, q->num); +} + +static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q, + u16 consumer_counter_limit, + char *cqe) +{ + struct pci_dev *pdev = mlxsw_pci->pdev; + struct mlxsw_pci_queue_elem_info *elem_info; + char *wqe; 
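+ /* A completion on an SDQ means the HW is done with the WQE at the consumer index: unmap all fragments, free the skb and advance the consumer counter under the queue lock. */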
+ struct sk_buff *skb; + int i; + + spin_lock(&q->lock); + elem_info = mlxsw_pci_queue_elem_info_consumer_get(q); + skb = elem_info->u.sdq.skb; + wqe = elem_info->elem; + for (i = 0; i < MLXSW_PCI_WQE_SG_ENTRIES; i++) + mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, i, DMA_TO_DEVICE); + dev_kfree_skb_any(skb); + elem_info->u.sdq.skb = NULL; + + if (q->consumer_counter++ != consumer_counter_limit) + dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in SDQ\n"); + spin_unlock(&q->lock); +} + +static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q, + u16 consumer_counter_limit, + char *cqe) +{ + struct pci_dev *pdev = mlxsw_pci->pdev; + struct mlxsw_pci_queue_elem_info *elem_info; + char *wqe; + struct sk_buff *skb; + struct mlxsw_rx_info rx_info; + u16 byte_count; + int err; + + elem_info = mlxsw_pci_queue_elem_info_consumer_get(q); + skb = elem_info->u.sdq.skb; + if (!skb) + return; + wqe = elem_info->elem; + mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE); + + if (q->consumer_counter++ != consumer_counter_limit) + dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n"); + + if (mlxsw_pci_cqe_lag_get(cqe)) { + rx_info.is_lag = true; + rx_info.u.lag_id = mlxsw_pci_cqe_lag_id_get(cqe); + rx_info.lag_port_index = mlxsw_pci_cqe_lag_port_index_get(cqe); + } else { + rx_info.is_lag = false; + rx_info.u.sys_port = mlxsw_pci_cqe_system_port_get(cqe); + } + + rx_info.trap_id = mlxsw_pci_cqe_trap_id_get(cqe); + + byte_count = mlxsw_pci_cqe_byte_count_get(cqe); + if (mlxsw_pci_cqe_crc_get(cqe)) + byte_count -= ETH_FCS_LEN; + skb_put(skb, byte_count); + mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info); + + memset(wqe, 0, q->elem_size); + err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info); + if (err) + dev_dbg_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n"); + /* Everything is set up, ring doorbell to pass elem to HW */ + q->producer_counter++; + mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q); + return; +} + +static char *mlxsw_pci_cq_sw_cqe_get(struct mlxsw_pci_queue *q) +{ + return mlxsw_pci_queue_sw_elem_get(q, mlxsw_pci_cqe_owner_get); +} + +static void mlxsw_pci_cq_tasklet(unsigned long data) +{ + struct mlxsw_pci_queue *q = (struct mlxsw_pci_queue *) data; + struct mlxsw_pci *mlxsw_pci = q->pci; + char *cqe; + int items = 0; + int credits = q->count >> 1; + + while ((cqe = mlxsw_pci_cq_sw_cqe_get(q))) { + u16 wqe_counter = mlxsw_pci_cqe_wqe_counter_get(cqe); + u8 sendq = mlxsw_pci_cqe_sr_get(cqe); + u8 dqn = mlxsw_pci_cqe_dqn_get(cqe); + + if (sendq) { + struct mlxsw_pci_queue *sdq; + + sdq = mlxsw_pci_sdq_get(mlxsw_pci, dqn); + mlxsw_pci_cqe_sdq_handle(mlxsw_pci, sdq, + wqe_counter, cqe); + q->u.cq.comp_sdq_count++; + } else { + struct mlxsw_pci_queue *rdq; + + rdq = mlxsw_pci_rdq_get(mlxsw_pci, dqn); + mlxsw_pci_cqe_rdq_handle(mlxsw_pci, rdq, + wqe_counter, cqe); + q->u.cq.comp_rdq_count++; + } + if (++items == credits) + break; + } + if (items) { + mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); + mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q); + } +} + +static int mlxsw_pci_eq_init(struct mlxsw_pci *mlxsw_pci, char *mbox, + struct mlxsw_pci_queue *q) +{ + int i; + int err; + + q->consumer_counter = 0; + + for (i = 0; i < q->count; i++) { + char *elem = mlxsw_pci_queue_elem_get(q, i); + + mlxsw_pci_eqe_owner_set(elem, 1); + } + + mlxsw_cmd_mbox_sw2hw_eq_int_msix_set(mbox, 1); /* MSI-X used */ + mlxsw_cmd_mbox_sw2hw_eq_st_set(mbox, 1); /* armed */ + 
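/* The EQ size is passed to the device as log2 of the element count, so the queue length must be a power of two. */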
mlxsw_cmd_mbox_sw2hw_eq_log_eq_size_set(mbox, ilog2(q->count)); + for (i = 0; i < MLXSW_PCI_AQ_PAGES; i++) { + dma_addr_t mapaddr = __mlxsw_pci_queue_page_get(q, i); + + mlxsw_cmd_mbox_sw2hw_eq_pa_set(mbox, i, mapaddr); + } + err = mlxsw_cmd_sw2hw_eq(mlxsw_pci->core, mbox, q->num); + if (err) + return err; + mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); + mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q); + return 0; +} + +static void mlxsw_pci_eq_fini(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q) +{ + mlxsw_cmd_hw2sw_eq(mlxsw_pci->core, q->num); +} + +static void mlxsw_pci_eq_cmd_event(struct mlxsw_pci *mlxsw_pci, char *eqe) +{ + mlxsw_pci->cmd.comp.status = mlxsw_pci_eqe_cmd_status_get(eqe); + mlxsw_pci->cmd.comp.out_param = + ((u64) mlxsw_pci_eqe_cmd_out_param_h_get(eqe)) << 32 | + mlxsw_pci_eqe_cmd_out_param_l_get(eqe); + mlxsw_pci->cmd.wait_done = true; + wake_up(&mlxsw_pci->cmd.wait); +} + +static char *mlxsw_pci_eq_sw_eqe_get(struct mlxsw_pci_queue *q) +{ + return mlxsw_pci_queue_sw_elem_get(q, mlxsw_pci_eqe_owner_get); +} + +static void mlxsw_pci_eq_tasklet(unsigned long data) +{ + struct mlxsw_pci_queue *q = (struct mlxsw_pci_queue *) data; + struct mlxsw_pci *mlxsw_pci = q->pci; + u8 cq_count = mlxsw_pci_cq_count(mlxsw_pci); + unsigned long active_cqns[BITS_TO_LONGS(MLXSW_PCI_CQS_MAX)]; + char *eqe; + u8 cqn; + bool cq_handle = false; + int items = 0; + int credits = q->count >> 1; + + memset(&active_cqns, 0, sizeof(active_cqns)); + + while ((eqe = mlxsw_pci_eq_sw_eqe_get(q))) { + u8 event_type = mlxsw_pci_eqe_event_type_get(eqe); + + switch (event_type) { + case MLXSW_PCI_EQE_EVENT_TYPE_CMD: + mlxsw_pci_eq_cmd_event(mlxsw_pci, eqe); + q->u.eq.ev_cmd_count++; + break; + case MLXSW_PCI_EQE_EVENT_TYPE_COMP: + cqn = mlxsw_pci_eqe_cqn_get(eqe); + set_bit(cqn, active_cqns); + cq_handle = true; + q->u.eq.ev_comp_count++; + break; + default: + q->u.eq.ev_other_count++; + } + if (++items == credits) + break; + } + if (items) { + mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); + mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q); + } + + if (!cq_handle) + return; + for_each_set_bit(cqn, active_cqns, cq_count) { + q = mlxsw_pci_cq_get(mlxsw_pci, cqn); + mlxsw_pci_queue_tasklet_schedule(q); + } +} + +struct mlxsw_pci_queue_ops { + const char *name; + enum mlxsw_pci_queue_type type; + int (*init)(struct mlxsw_pci *mlxsw_pci, char *mbox, + struct mlxsw_pci_queue *q); + void (*fini)(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_queue *q); + void (*tasklet)(unsigned long data); + u16 elem_count; + u8 elem_size; +}; + +static const struct mlxsw_pci_queue_ops mlxsw_pci_sdq_ops = { + .type = MLXSW_PCI_QUEUE_TYPE_SDQ, + .init = mlxsw_pci_sdq_init, + .fini = mlxsw_pci_sdq_fini, + .elem_count = MLXSW_PCI_WQE_COUNT, + .elem_size = MLXSW_PCI_WQE_SIZE, +}; + +static const struct mlxsw_pci_queue_ops mlxsw_pci_rdq_ops = { + .type = MLXSW_PCI_QUEUE_TYPE_RDQ, + .init = mlxsw_pci_rdq_init, + .fini = mlxsw_pci_rdq_fini, + .elem_count = MLXSW_PCI_WQE_COUNT, + .elem_size = MLXSW_PCI_WQE_SIZE +}; + +static const struct mlxsw_pci_queue_ops mlxsw_pci_cq_ops = { + .type = MLXSW_PCI_QUEUE_TYPE_CQ, + .init = mlxsw_pci_cq_init, + .fini = mlxsw_pci_cq_fini, + .tasklet = mlxsw_pci_cq_tasklet, + .elem_count = MLXSW_PCI_CQE_COUNT, + .elem_size = MLXSW_PCI_CQE_SIZE +}; + +static const struct mlxsw_pci_queue_ops mlxsw_pci_eq_ops = { + .type = MLXSW_PCI_QUEUE_TYPE_EQ, + .init = mlxsw_pci_eq_init, + .fini = mlxsw_pci_eq_fini, + .tasklet = mlxsw_pci_eq_tasklet, + .elem_count = 
MLXSW_PCI_EQE_COUNT, + .elem_size = MLXSW_PCI_EQE_SIZE +}; + +static int mlxsw_pci_queue_init(struct mlxsw_pci *mlxsw_pci, char *mbox, + const struct mlxsw_pci_queue_ops *q_ops, + struct mlxsw_pci_queue *q, u8 q_num) +{ + struct mlxsw_pci_mem_item *mem_item = &q->mem_item; + int i; + int err; + + spin_lock_init(&q->lock); + q->num = q_num; + q->count = q_ops->elem_count; + q->elem_size = q_ops->elem_size; + q->type = q_ops->type; + q->pci = mlxsw_pci; + + if (q_ops->tasklet) + tasklet_init(&q->tasklet, q_ops->tasklet, (unsigned long) q); + + mem_item->size = MLXSW_PCI_AQ_SIZE; + mem_item->buf = pci_alloc_consistent(mlxsw_pci->pdev, + mem_item->size, + &mem_item->mapaddr); + if (!mem_item->buf) + return -ENOMEM; + memset(mem_item->buf, 0, mem_item->size); + + q->elem_info = kcalloc(q->count, sizeof(*q->elem_info), GFP_KERNEL); + if (!q->elem_info) { + err = -ENOMEM; + goto err_elem_info_alloc; + } + + /* Initialize dma mapped elements info elem_info for + * future easy access. + */ + for (i = 0; i < q->count; i++) { + struct mlxsw_pci_queue_elem_info *elem_info; + + elem_info = mlxsw_pci_queue_elem_info_get(q, i); + elem_info->elem = + __mlxsw_pci_queue_elem_get(q, q_ops->elem_size, i); + } + + mlxsw_cmd_mbox_zero(mbox); + err = q_ops->init(mlxsw_pci, mbox, q); + if (err) + goto err_q_ops_init; + return 0; + +err_q_ops_init: + kfree(q->elem_info); +err_elem_info_alloc: + pci_free_consistent(mlxsw_pci->pdev, mem_item->size, + mem_item->buf, mem_item->mapaddr); + return err; +} + +static void mlxsw_pci_queue_fini(struct mlxsw_pci *mlxsw_pci, + const struct mlxsw_pci_queue_ops *q_ops, + struct mlxsw_pci_queue *q) +{ + struct mlxsw_pci_mem_item *mem_item = &q->mem_item; + + q_ops->fini(mlxsw_pci, q); + kfree(q->elem_info); + pci_free_consistent(mlxsw_pci->pdev, mem_item->size, + mem_item->buf, mem_item->mapaddr); +} + +static int mlxsw_pci_queue_group_init(struct mlxsw_pci *mlxsw_pci, char *mbox, + const struct mlxsw_pci_queue_ops *q_ops, + u8 num_qs) +{ + struct mlxsw_pci_queue_type_group *queue_group; + int i; + int err; + + queue_group = mlxsw_pci_queue_type_group_get(mlxsw_pci, q_ops->type); + queue_group->q = kcalloc(num_qs, sizeof(*queue_group->q), GFP_KERNEL); + if (!queue_group->q) + return -ENOMEM; + + for (i = 0; i < num_qs; i++) { + err = mlxsw_pci_queue_init(mlxsw_pci, mbox, q_ops, + &queue_group->q[i], i); + if (err) + goto err_queue_init; + } + queue_group->count = num_qs; + + return 0; + +err_queue_init: + for (i--; i >= 0; i--) + mlxsw_pci_queue_fini(mlxsw_pci, q_ops, &queue_group->q[i]); + kfree(queue_group->q); + return err; +} + +static void mlxsw_pci_queue_group_fini(struct mlxsw_pci *mlxsw_pci, + const struct mlxsw_pci_queue_ops *q_ops) +{ + struct mlxsw_pci_queue_type_group *queue_group; + int i; + + queue_group = mlxsw_pci_queue_type_group_get(mlxsw_pci, q_ops->type); + for (i = 0; i < queue_group->count; i++) + mlxsw_pci_queue_fini(mlxsw_pci, q_ops, &queue_group->q[i]); + kfree(queue_group->q); +} + +static int mlxsw_pci_aqs_init(struct mlxsw_pci *mlxsw_pci, char *mbox) +{ + struct pci_dev *pdev = mlxsw_pci->pdev; + u8 num_sdqs; + u8 sdq_log2sz; + u8 num_rdqs; + u8 rdq_log2sz; + u8 num_cqs; + u8 cq_log2sz; + u8 num_eqs; + u8 eq_log2sz; + int err; + + mlxsw_cmd_mbox_zero(mbox); + err = mlxsw_cmd_query_aq_cap(mlxsw_pci->core, mbox); + if (err) + return err; + + num_sdqs = mlxsw_cmd_mbox_query_aq_cap_max_num_sdqs_get(mbox); + sdq_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_sdq_sz_get(mbox); + num_rdqs = mlxsw_cmd_mbox_query_aq_cap_max_num_rdqs_get(mbox); + rdq_log2sz = 
mlxsw_cmd_mbox_query_aq_cap_log_max_rdq_sz_get(mbox); + num_cqs = mlxsw_cmd_mbox_query_aq_cap_max_num_cqs_get(mbox); + cq_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_cq_sz_get(mbox); + num_eqs = mlxsw_cmd_mbox_query_aq_cap_max_num_eqs_get(mbox); + eq_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_eq_sz_get(mbox); + + if (num_sdqs + num_rdqs > num_cqs || + num_cqs > MLXSW_PCI_CQS_MAX || num_eqs != MLXSW_PCI_EQS_COUNT) { + dev_err(&pdev->dev, "Unsupported number of queues\n"); + return -EINVAL; + } + + if ((1 << sdq_log2sz != MLXSW_PCI_WQE_COUNT) || + (1 << rdq_log2sz != MLXSW_PCI_WQE_COUNT) || + (1 << cq_log2sz != MLXSW_PCI_CQE_COUNT) || + (1 << eq_log2sz != MLXSW_PCI_EQE_COUNT)) { + dev_err(&pdev->dev, "Unsupported number of async queue descriptors\n"); + return -EINVAL; + } + + err = mlxsw_pci_queue_group_init(mlxsw_pci, mbox, &mlxsw_pci_eq_ops, + num_eqs); + if (err) { + dev_err(&pdev->dev, "Failed to initialize event queues\n"); + return err; + } + + err = mlxsw_pci_queue_group_init(mlxsw_pci, mbox, &mlxsw_pci_cq_ops, + num_cqs); + if (err) { + dev_err(&pdev->dev, "Failed to initialize completion queues\n"); + goto err_cqs_init; + } + + err = mlxsw_pci_queue_group_init(mlxsw_pci, mbox, &mlxsw_pci_sdq_ops, + num_sdqs); + if (err) { + dev_err(&pdev->dev, "Failed to initialize send descriptor queues\n"); + goto err_sdqs_init; + } + + err = mlxsw_pci_queue_group_init(mlxsw_pci, mbox, &mlxsw_pci_rdq_ops, + num_rdqs); + if (err) { + dev_err(&pdev->dev, "Failed to initialize receive descriptor queues\n"); + goto err_rdqs_init; + } + + /* We have to poll in command interface until queues are initialized */ + mlxsw_pci->cmd.nopoll = true; + return 0; + +err_rdqs_init: + mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_sdq_ops); +err_sdqs_init: + mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_cq_ops); +err_cqs_init: + mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_eq_ops); + return err; +} + +static void mlxsw_pci_aqs_fini(struct mlxsw_pci *mlxsw_pci) +{ + mlxsw_pci->cmd.nopoll = false; + mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_rdq_ops); + mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_sdq_ops); + mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_cq_ops); + mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_eq_ops); +} + +static void +mlxsw_pci_config_profile_swid_config(struct mlxsw_pci *mlxsw_pci, + char *mbox, int index, + const struct mlxsw_swid_config *swid) +{ + u8 mask = 0; + + if (swid->used_type) { + mlxsw_cmd_mbox_config_profile_swid_config_type_set( + mbox, index, swid->type); + mask |= 1; + } + if (swid->used_properties) { + mlxsw_cmd_mbox_config_profile_swid_config_properties_set( + mbox, index, swid->properties); + mask |= 2; + } + mlxsw_cmd_mbox_config_profile_swid_config_mask_set(mbox, index, mask); +} + +static int mlxsw_pci_resources_query(struct mlxsw_pci *mlxsw_pci, char *mbox, + struct mlxsw_res *res) +{ + int index, i; + u64 data; + u16 id; + int err; + + if (!res) + return 0; + + mlxsw_cmd_mbox_zero(mbox); + + for (index = 0; index < MLXSW_CMD_QUERY_RESOURCES_MAX_QUERIES; + index++) { + err = mlxsw_cmd_query_resources(mlxsw_pci->core, mbox, index); + if (err) + return err; + + for (i = 0; i < MLXSW_CMD_QUERY_RESOURCES_PER_QUERY; i++) { + id = mlxsw_cmd_mbox_query_resource_id_get(mbox, i); + data = mlxsw_cmd_mbox_query_resource_data_get(mbox, i); + + if (id == MLXSW_CMD_QUERY_RESOURCES_TABLE_END_ID) + return 0; + + mlxsw_res_parse(res, id, data); + } + } + + /* If after MLXSW_RESOURCES_QUERY_MAX_QUERIES we still didn't get + * MLXSW_RESOURCES_TABLE_END_ID, 
something went bad in the FW. + */ + return -EIO; +} + +static int +mlxsw_pci_profile_get_kvd_sizes(const struct mlxsw_pci *mlxsw_pci, + const struct mlxsw_config_profile *profile, + struct mlxsw_res *res) +{ + u64 single_size, double_size, linear_size; + int err; + + err = mlxsw_core_kvd_sizes_get(mlxsw_pci->core, profile, + &single_size, &double_size, + &linear_size); + if (err) + return err; + + MLXSW_RES_SET(res, KVD_SINGLE_SIZE, single_size); + MLXSW_RES_SET(res, KVD_DOUBLE_SIZE, double_size); + MLXSW_RES_SET(res, KVD_LINEAR_SIZE, linear_size); + + return 0; +} + +static int mlxsw_pci_config_profile(struct mlxsw_pci *mlxsw_pci, char *mbox, + const struct mlxsw_config_profile *profile, + struct mlxsw_res *res) +{ + int i; + int err; + + mlxsw_cmd_mbox_zero(mbox); + + if (profile->used_max_vepa_channels) { + mlxsw_cmd_mbox_config_profile_set_max_vepa_channels_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_max_vepa_channels_set( + mbox, profile->max_vepa_channels); + } + if (profile->used_max_mid) { + mlxsw_cmd_mbox_config_profile_set_max_mid_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_max_mid_set( + mbox, profile->max_mid); + } + if (profile->used_max_pgt) { + mlxsw_cmd_mbox_config_profile_set_max_pgt_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_max_pgt_set( + mbox, profile->max_pgt); + } + if (profile->used_max_system_port) { + mlxsw_cmd_mbox_config_profile_set_max_system_port_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_max_system_port_set( + mbox, profile->max_system_port); + } + if (profile->used_max_vlan_groups) { + mlxsw_cmd_mbox_config_profile_set_max_vlan_groups_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_max_vlan_groups_set( + mbox, profile->max_vlan_groups); + } + if (profile->used_max_regions) { + mlxsw_cmd_mbox_config_profile_set_max_regions_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_max_regions_set( + mbox, profile->max_regions); + } + if (profile->used_flood_tables) { + mlxsw_cmd_mbox_config_profile_set_flood_tables_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_max_flood_tables_set( + mbox, profile->max_flood_tables); + mlxsw_cmd_mbox_config_profile_max_vid_flood_tables_set( + mbox, profile->max_vid_flood_tables); + mlxsw_cmd_mbox_config_profile_max_fid_offset_flood_tables_set( + mbox, profile->max_fid_offset_flood_tables); + mlxsw_cmd_mbox_config_profile_fid_offset_flood_table_size_set( + mbox, profile->fid_offset_flood_table_size); + mlxsw_cmd_mbox_config_profile_max_fid_flood_tables_set( + mbox, profile->max_fid_flood_tables); + mlxsw_cmd_mbox_config_profile_fid_flood_table_size_set( + mbox, profile->fid_flood_table_size); + } + if (profile->used_flood_mode) { + mlxsw_cmd_mbox_config_profile_set_flood_mode_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_flood_mode_set( + mbox, profile->flood_mode); + } + if (profile->used_max_ib_mc) { + mlxsw_cmd_mbox_config_profile_set_max_ib_mc_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_max_ib_mc_set( + mbox, profile->max_ib_mc); + } + if (profile->used_max_pkey) { + mlxsw_cmd_mbox_config_profile_set_max_pkey_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_max_pkey_set( + mbox, profile->max_pkey); + } + if (profile->used_ar_sec) { + mlxsw_cmd_mbox_config_profile_set_ar_sec_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_ar_sec_set( + mbox, profile->ar_sec); + } + if (profile->used_adaptive_routing_group_cap) { + mlxsw_cmd_mbox_config_profile_set_adaptive_routing_group_cap_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_adaptive_routing_group_cap_set( + mbox, profile->adaptive_routing_group_cap); 
+ } + if (profile->used_kvd_sizes && MLXSW_RES_VALID(res, KVD_SIZE)) { + err = mlxsw_pci_profile_get_kvd_sizes(mlxsw_pci, profile, res); + if (err) + return err; + + mlxsw_cmd_mbox_config_profile_set_kvd_linear_size_set(mbox, 1); + mlxsw_cmd_mbox_config_profile_kvd_linear_size_set(mbox, + MLXSW_RES_GET(res, KVD_LINEAR_SIZE)); + mlxsw_cmd_mbox_config_profile_set_kvd_hash_single_size_set(mbox, + 1); + mlxsw_cmd_mbox_config_profile_kvd_hash_single_size_set(mbox, + MLXSW_RES_GET(res, KVD_SINGLE_SIZE)); + mlxsw_cmd_mbox_config_profile_set_kvd_hash_double_size_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_kvd_hash_double_size_set(mbox, + MLXSW_RES_GET(res, KVD_DOUBLE_SIZE)); + } + + for (i = 0; i < MLXSW_CONFIG_PROFILE_SWID_COUNT; i++) + mlxsw_pci_config_profile_swid_config(mlxsw_pci, mbox, i, + &profile->swid_config[i]); + + return mlxsw_cmd_config_profile_set(mlxsw_pci->core, mbox); +} + +static int mlxsw_pci_boardinfo(struct mlxsw_pci *mlxsw_pci, char *mbox) +{ + struct mlxsw_bus_info *bus_info = &mlxsw_pci->bus_info; + int err; + + mlxsw_cmd_mbox_zero(mbox); + err = mlxsw_cmd_boardinfo(mlxsw_pci->core, mbox); + if (err) + return err; + mlxsw_cmd_mbox_boardinfo_vsd_memcpy_from(mbox, bus_info->vsd); + mlxsw_cmd_mbox_boardinfo_psid_memcpy_from(mbox, bus_info->psid); + return 0; +} + +static int mlxsw_pci_fw_area_init(struct mlxsw_pci *mlxsw_pci, char *mbox, + u16 num_pages) +{ + struct mlxsw_pci_mem_item *mem_item; + int nent = 0; + int i; + int err; + + mlxsw_pci->fw_area.items = kcalloc(num_pages, sizeof(*mem_item), + GFP_KERNEL); + if (!mlxsw_pci->fw_area.items) + return -ENOMEM; + mlxsw_pci->fw_area.count = num_pages; + + mlxsw_cmd_mbox_zero(mbox); + for (i = 0; i < num_pages; i++) { + mem_item = &mlxsw_pci->fw_area.items[i]; + + mem_item->size = MLXSW_PCI_PAGE_SIZE; + mem_item->buf = pci_alloc_consistent(mlxsw_pci->pdev, + mem_item->size, + &mem_item->mapaddr); + if (!mem_item->buf) { + err = -ENOMEM; + goto err_alloc; + } + mlxsw_cmd_mbox_map_fa_pa_set(mbox, nent, mem_item->mapaddr); + mlxsw_cmd_mbox_map_fa_log2size_set(mbox, nent, 0); /* 1 page */ + if (++nent == MLXSW_CMD_MAP_FA_VPM_ENTRIES_MAX) { + err = mlxsw_cmd_map_fa(mlxsw_pci->core, mbox, nent); + if (err) + goto err_cmd_map_fa; + nent = 0; + mlxsw_cmd_mbox_zero(mbox); + } + } + + if (nent) { + err = mlxsw_cmd_map_fa(mlxsw_pci->core, mbox, nent); + if (err) + goto err_cmd_map_fa; + } + + return 0; + +err_cmd_map_fa: +err_alloc: + for (i--; i >= 0; i--) { + mem_item = &mlxsw_pci->fw_area.items[i]; + + pci_free_consistent(mlxsw_pci->pdev, mem_item->size, + mem_item->buf, mem_item->mapaddr); + } + kfree(mlxsw_pci->fw_area.items); + return err; +} + +static void mlxsw_pci_fw_area_fini(struct mlxsw_pci *mlxsw_pci) +{ + struct mlxsw_pci_mem_item *mem_item; + int i; + + mlxsw_cmd_unmap_fa(mlxsw_pci->core); + + for (i = 0; i < mlxsw_pci->fw_area.count; i++) { + mem_item = &mlxsw_pci->fw_area.items[i]; + + pci_free_consistent(mlxsw_pci->pdev, mem_item->size, + mem_item->buf, mem_item->mapaddr); + } + kfree(mlxsw_pci->fw_area.items); +} + +static irqreturn_t mlxsw_pci_eq_irq_handler(int irq, void *dev_id) +{ + struct mlxsw_pci *mlxsw_pci = dev_id; + struct mlxsw_pci_queue *q; + int i; + + for (i = 0; i < MLXSW_PCI_EQS_COUNT; i++) { + q = mlxsw_pci_eq_get(mlxsw_pci, i); + mlxsw_pci_queue_tasklet_schedule(q); + } + return IRQ_HANDLED; +} + +static int mlxsw_pci_mbox_alloc(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_mem_item *mbox) +{ + struct pci_dev *pdev = mlxsw_pci->pdev; + int err = 0; + + mbox->size = MLXSW_CMD_MBOX_SIZE; + 
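/* Command mailboxes are DMA-coherent; their bus addresses are later programmed into the CIR_IN_PARAM/CIR_OUT_PARAM registers so the device can read and write them directly. */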
mbox->buf = pci_alloc_consistent(pdev, MLXSW_CMD_MBOX_SIZE, + &mbox->mapaddr); + if (!mbox->buf) { + dev_err(&pdev->dev, "Failed allocating memory for mailbox\n"); + err = -ENOMEM; + } + + return err; +} + +static void mlxsw_pci_mbox_free(struct mlxsw_pci *mlxsw_pci, + struct mlxsw_pci_mem_item *mbox) +{ + struct pci_dev *pdev = mlxsw_pci->pdev; + + pci_free_consistent(pdev, MLXSW_CMD_MBOX_SIZE, mbox->buf, + mbox->mapaddr); +} + +static int mlxsw_pci_init(void *bus_priv, struct mlxsw_core *mlxsw_core, + const struct mlxsw_config_profile *profile, + struct mlxsw_res *res) +{ + struct mlxsw_pci *mlxsw_pci = bus_priv; + struct pci_dev *pdev = mlxsw_pci->pdev; + char *mbox; + u16 num_pages; + int err; + + mutex_init(&mlxsw_pci->cmd.lock); + init_waitqueue_head(&mlxsw_pci->cmd.wait); + + mlxsw_pci->core = mlxsw_core; + + mbox = mlxsw_cmd_mbox_alloc(); + if (!mbox) + return -ENOMEM; + + err = mlxsw_pci_mbox_alloc(mlxsw_pci, &mlxsw_pci->cmd.in_mbox); + if (err) + goto mbox_put; + + err = mlxsw_pci_mbox_alloc(mlxsw_pci, &mlxsw_pci->cmd.out_mbox); + if (err) + goto err_out_mbox_alloc; + + err = mlxsw_cmd_query_fw(mlxsw_core, mbox); + if (err) + goto err_query_fw; + + mlxsw_pci->bus_info.fw_rev.major = + mlxsw_cmd_mbox_query_fw_fw_rev_major_get(mbox); + mlxsw_pci->bus_info.fw_rev.minor = + mlxsw_cmd_mbox_query_fw_fw_rev_minor_get(mbox); + mlxsw_pci->bus_info.fw_rev.subminor = + mlxsw_cmd_mbox_query_fw_fw_rev_subminor_get(mbox); + + if (mlxsw_cmd_mbox_query_fw_cmd_interface_rev_get(mbox) != 1) { + dev_err(&pdev->dev, "Unsupported cmd interface revision ID queried from hw\n"); + err = -EINVAL; + goto err_iface_rev; + } + if (mlxsw_cmd_mbox_query_fw_doorbell_page_bar_get(mbox) != 0) { + dev_err(&pdev->dev, "Unsupported doorbell page bar queried from hw\n"); + err = -EINVAL; + goto err_doorbell_page_bar; + } + + mlxsw_pci->doorbell_offset = + mlxsw_cmd_mbox_query_fw_doorbell_page_offset_get(mbox); + + num_pages = mlxsw_cmd_mbox_query_fw_fw_pages_get(mbox); + err = mlxsw_pci_fw_area_init(mlxsw_pci, mbox, num_pages); + if (err) + goto err_fw_area_init; + + err = mlxsw_pci_boardinfo(mlxsw_pci, mbox); + if (err) + goto err_boardinfo; + + err = mlxsw_pci_resources_query(mlxsw_pci, mbox, res); + if (err) + goto err_query_resources; + + err = mlxsw_pci_config_profile(mlxsw_pci, mbox, profile, res); + if (err) + goto err_config_profile; + + err = mlxsw_pci_aqs_init(mlxsw_pci, mbox); + if (err) + goto err_aqs_init; + + err = request_irq(pci_irq_vector(pdev, 0), + mlxsw_pci_eq_irq_handler, 0, + mlxsw_pci->bus_info.device_kind, mlxsw_pci); + if (err) { + dev_err(&pdev->dev, "IRQ request failed\n"); + goto err_request_eq_irq; + } + + goto mbox_put; + +err_request_eq_irq: + mlxsw_pci_aqs_fini(mlxsw_pci); +err_aqs_init: +err_config_profile: +err_query_resources: +err_boardinfo: + mlxsw_pci_fw_area_fini(mlxsw_pci); +err_fw_area_init: +err_doorbell_page_bar: +err_iface_rev: +err_query_fw: + mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.out_mbox); +err_out_mbox_alloc: + mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.in_mbox); +mbox_put: + mlxsw_cmd_mbox_free(mbox); + return err; +} + +static void mlxsw_pci_fini(void *bus_priv) +{ + struct mlxsw_pci *mlxsw_pci = bus_priv; + + free_irq(pci_irq_vector(mlxsw_pci->pdev, 0), mlxsw_pci); + mlxsw_pci_aqs_fini(mlxsw_pci); + mlxsw_pci_fw_area_fini(mlxsw_pci); + mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.out_mbox); + mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.in_mbox); +} + +static struct mlxsw_pci_queue * +mlxsw_pci_sdq_pick(struct mlxsw_pci *mlxsw_pci, + const struct 
mlxsw_tx_info *tx_info) +{ + u8 sdqn = tx_info->local_port % mlxsw_pci_sdq_count(mlxsw_pci); + + return mlxsw_pci_sdq_get(mlxsw_pci, sdqn); +} + +static bool mlxsw_pci_skb_transmit_busy(void *bus_priv, + const struct mlxsw_tx_info *tx_info) +{ + struct mlxsw_pci *mlxsw_pci = bus_priv; + struct mlxsw_pci_queue *q = mlxsw_pci_sdq_pick(mlxsw_pci, tx_info); + + return !mlxsw_pci_queue_elem_info_producer_get(q); +} + +static int mlxsw_pci_skb_transmit(void *bus_priv, struct sk_buff *skb, + const struct mlxsw_tx_info *tx_info) +{ + struct mlxsw_pci *mlxsw_pci = bus_priv; + struct mlxsw_pci_queue *q; + struct mlxsw_pci_queue_elem_info *elem_info; + char *wqe; + int i; + int err; + + if (skb_shinfo(skb)->nr_frags > MLXSW_PCI_WQE_SG_ENTRIES - 1) { + err = skb_linearize(skb); + if (err) + return err; + } + + q = mlxsw_pci_sdq_pick(mlxsw_pci, tx_info); + spin_lock_bh(&q->lock); + elem_info = mlxsw_pci_queue_elem_info_producer_get(q); + if (!elem_info) { + /* queue is full */ + err = -EAGAIN; + goto unlock; + } + elem_info->u.sdq.skb = skb; + + wqe = elem_info->elem; + mlxsw_pci_wqe_c_set(wqe, 1); /* always report completion */ + mlxsw_pci_wqe_lp_set(wqe, !!tx_info->is_emad); + mlxsw_pci_wqe_type_set(wqe, MLXSW_PCI_WQE_TYPE_ETHERNET); + + err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data, + skb_headlen(skb), DMA_TO_DEVICE); + if (err) + goto unlock; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, i + 1, + skb_frag_address(frag), + skb_frag_size(frag), + DMA_TO_DEVICE); + if (err) + goto unmap_frags; + } + + /* Set unused sq entries byte count to zero. */ + for (i++; i < MLXSW_PCI_WQE_SG_ENTRIES; i++) + mlxsw_pci_wqe_byte_count_set(wqe, i, 0); + + /* Everything is set up, ring producer doorbell to get HW going */ + q->producer_counter++; + mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q); + + goto unlock; + +unmap_frags: + for (; i >= 0; i--) + mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, i, DMA_TO_DEVICE); +unlock: + spin_unlock_bh(&q->lock); + return err; +} + +static int mlxsw_pci_cmd_exec(void *bus_priv, u16 opcode, u8 opcode_mod, + u32 in_mod, bool out_mbox_direct, + char *in_mbox, size_t in_mbox_size, + char *out_mbox, size_t out_mbox_size, + u8 *p_status) +{ + struct mlxsw_pci *mlxsw_pci = bus_priv; + dma_addr_t in_mapaddr = 0, out_mapaddr = 0; + bool evreq = mlxsw_pci->cmd.nopoll; + unsigned long timeout = msecs_to_jiffies(MLXSW_PCI_CIR_TIMEOUT_MSECS); + bool *p_wait_done = &mlxsw_pci->cmd.wait_done; + int err; + + *p_status = MLXSW_CMD_STATUS_OK; + + err = mutex_lock_interruptible(&mlxsw_pci->cmd.lock); + if (err) + return err; + + if (in_mbox) { + memcpy(mlxsw_pci->cmd.in_mbox.buf, in_mbox, in_mbox_size); + in_mapaddr = mlxsw_pci->cmd.in_mbox.mapaddr; + } + mlxsw_pci_write32(mlxsw_pci, CIR_IN_PARAM_HI, upper_32_bits(in_mapaddr)); + mlxsw_pci_write32(mlxsw_pci, CIR_IN_PARAM_LO, lower_32_bits(in_mapaddr)); + + if (out_mbox) + out_mapaddr = mlxsw_pci->cmd.out_mbox.mapaddr; + mlxsw_pci_write32(mlxsw_pci, CIR_OUT_PARAM_HI, upper_32_bits(out_mapaddr)); + mlxsw_pci_write32(mlxsw_pci, CIR_OUT_PARAM_LO, lower_32_bits(out_mapaddr)); + + mlxsw_pci_write32(mlxsw_pci, CIR_IN_MODIFIER, in_mod); + mlxsw_pci_write32(mlxsw_pci, CIR_TOKEN, 0); + + *p_wait_done = false; + + wmb(); /* all needs to be written before we write control register */ + mlxsw_pci_write32(mlxsw_pci, CIR_CTRL, + MLXSW_PCI_CIR_CTRL_GO_BIT | + (evreq ? 
MLXSW_PCI_CIR_CTRL_EVREQ_BIT : 0) | + (opcode_mod << MLXSW_PCI_CIR_CTRL_OPCODE_MOD_SHIFT) | + opcode); + + if (!evreq) { + unsigned long end; + + end = jiffies + timeout; + do { + u32 ctrl = mlxsw_pci_read32(mlxsw_pci, CIR_CTRL); + + if (!(ctrl & MLXSW_PCI_CIR_CTRL_GO_BIT)) { + *p_wait_done = true; + *p_status = ctrl >> MLXSW_PCI_CIR_CTRL_STATUS_SHIFT; + break; + } + cond_resched(); + } while (time_before(jiffies, end)); + } else { + wait_event_timeout(mlxsw_pci->cmd.wait, *p_wait_done, timeout); + *p_status = mlxsw_pci->cmd.comp.status; + } + + err = 0; + if (*p_wait_done) { + if (*p_status) + err = -EIO; + } else { + err = -ETIMEDOUT; + } + + if (!err && out_mbox && out_mbox_direct) { + /* Some commands don't use output param as address to mailbox + * but they store output directly into registers. In that case, + * copy registers into mbox buffer. + */ + __be32 tmp; + + if (!evreq) { + tmp = cpu_to_be32(mlxsw_pci_read32(mlxsw_pci, + CIR_OUT_PARAM_HI)); + memcpy(out_mbox, &tmp, sizeof(tmp)); + tmp = cpu_to_be32(mlxsw_pci_read32(mlxsw_pci, + CIR_OUT_PARAM_LO)); + memcpy(out_mbox + sizeof(tmp), &tmp, sizeof(tmp)); + } + } else if (!err && out_mbox) { + memcpy(out_mbox, mlxsw_pci->cmd.out_mbox.buf, out_mbox_size); + } + + mutex_unlock(&mlxsw_pci->cmd.lock); + + return err; +} + +static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci, + const struct pci_device_id *id) +{ + unsigned long end; + + mlxsw_pci_write32(mlxsw_pci, SW_RESET, MLXSW_PCI_SW_RESET_RST_BIT); + if (id->device == PCI_DEVICE_ID_MELLANOX_SWITCHX2) { + msleep(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS); + return 0; + } + + /* Reset needs to be written before we read control register, and + * we must wait for the HW to become responsive once again + */ + wmb(); + msleep(MLXSW_PCI_SW_RESET_WAIT_MSECS); + + end = jiffies + msecs_to_jiffies(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS); + do { + u32 val = mlxsw_pci_read32(mlxsw_pci, FW_READY); + + if ((val & MLXSW_PCI_FW_READY_MASK) == MLXSW_PCI_FW_READY_MAGIC) + break; + cond_resched(); + } while (time_before(jiffies, end)); + return 0; +} + +static void mlxsw_pci_free_irq_vectors(struct mlxsw_pci *mlxsw_pci) +{ + pci_free_irq_vectors(mlxsw_pci->pdev); +} + +static int mlxsw_pci_alloc_irq_vectors(struct mlxsw_pci *mlxsw_pci) +{ + int err; + + err = pci_alloc_irq_vectors(mlxsw_pci->pdev, 1, 1, PCI_IRQ_MSIX); + if (err < 0) + dev_err(&mlxsw_pci->pdev->dev, "MSI-X init failed\n"); + return err; +} + +static void mlxsw_pci_reset(void *bus_priv) +{ + struct mlxsw_pci *mlxsw_pci = bus_priv; + + mlxsw_pci_free_irq_vectors(mlxsw_pci); + mlxsw_pci_sw_reset(mlxsw_pci, mlxsw_pci->id); + mlxsw_pci_alloc_irq_vectors(mlxsw_pci); +} + +static const struct mlxsw_bus mlxsw_pci_bus = { + .kind = "pci", + .init = mlxsw_pci_init, + .fini = mlxsw_pci_fini, + .skb_transmit_busy = mlxsw_pci_skb_transmit_busy, + .skb_transmit = mlxsw_pci_skb_transmit, + .cmd_exec = mlxsw_pci_cmd_exec, + .features = MLXSW_BUS_F_TXRX, + .reset = mlxsw_pci_reset, +}; + +static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + const char *driver_name = pdev->driver->name; + struct mlxsw_pci *mlxsw_pci; + int err; + + mlxsw_pci = kzalloc(sizeof(*mlxsw_pci), GFP_KERNEL); + if (!mlxsw_pci) + return -ENOMEM; + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "pci_enable_device failed\n"); + goto err_pci_enable_device; + } + + err = pci_request_regions(pdev, driver_name); + if (err) { + dev_err(&pdev->dev, "pci_request_regions failed\n"); + goto err_pci_request_regions; + } + + err = 
pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (!err) { + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (err) { + dev_err(&pdev->dev, "pci_set_consistent_dma_mask failed\n"); + goto err_pci_set_dma_mask; + } + } else { + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "pci_set_dma_mask failed\n"); + goto err_pci_set_dma_mask; + } + } + + if (pci_resource_len(pdev, 0) < MLXSW_PCI_BAR0_SIZE) { + dev_err(&pdev->dev, "invalid PCI region size\n"); + err = -EINVAL; + goto err_pci_resource_len_check; + } + + mlxsw_pci->hw_addr = ioremap(pci_resource_start(pdev, 0), + pci_resource_len(pdev, 0)); + if (!mlxsw_pci->hw_addr) { + dev_err(&pdev->dev, "ioremap failed\n"); + err = -EIO; + goto err_ioremap; + } + pci_set_master(pdev); + + mlxsw_pci->pdev = pdev; + pci_set_drvdata(pdev, mlxsw_pci); + + err = mlxsw_pci_sw_reset(mlxsw_pci, id); + if (err) { + dev_err(&pdev->dev, "Software reset failed\n"); + goto err_sw_reset; + } + + err = mlxsw_pci_alloc_irq_vectors(mlxsw_pci); + if (err < 0) { + dev_err(&pdev->dev, "MSI-X init failed\n"); + goto err_msix_init; + } + + mlxsw_pci->bus_info.device_kind = driver_name; + mlxsw_pci->bus_info.device_name = pci_name(mlxsw_pci->pdev); + mlxsw_pci->bus_info.dev = &pdev->dev; + mlxsw_pci->id = id; + + err = mlxsw_core_bus_device_register(&mlxsw_pci->bus_info, + &mlxsw_pci_bus, mlxsw_pci, false, + NULL); + if (err) { + dev_err(&pdev->dev, "cannot register bus device\n"); + goto err_bus_device_register; + } + + return 0; + +err_bus_device_register: + mlxsw_pci_free_irq_vectors(mlxsw_pci); +err_msix_init: +err_sw_reset: + iounmap(mlxsw_pci->hw_addr); +err_ioremap: +err_pci_resource_len_check: +err_pci_set_dma_mask: + pci_release_regions(pdev); +err_pci_request_regions: + pci_disable_device(pdev); +err_pci_enable_device: + kfree(mlxsw_pci); + return err; +} + +static void mlxsw_pci_remove(struct pci_dev *pdev) +{ + struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev); + + mlxsw_core_bus_device_unregister(mlxsw_pci->core, false); + mlxsw_pci_free_irq_vectors(mlxsw_pci); + iounmap(mlxsw_pci->hw_addr); + pci_release_regions(mlxsw_pci->pdev); + pci_disable_device(mlxsw_pci->pdev); + kfree(mlxsw_pci); +} + +int mlxsw_pci_driver_register(struct pci_driver *pci_driver) +{ + pci_driver->probe = mlxsw_pci_probe; + pci_driver->remove = mlxsw_pci_remove; + return pci_register_driver(pci_driver); +} +EXPORT_SYMBOL(mlxsw_pci_driver_register); + +void mlxsw_pci_driver_unregister(struct pci_driver *pci_driver) +{ + pci_unregister_driver(pci_driver); +} +EXPORT_SYMBOL(mlxsw_pci_driver_unregister); + +static int __init mlxsw_pci_module_init(void) +{ + return 0; +} + +static void __exit mlxsw_pci_module_exit(void) +{ +} + +module_init(mlxsw_pci_module_init); +module_exit(mlxsw_pci_module_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("Mellanox switch PCI interface driver"); diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.h b/drivers/net/ethernet/mellanox/mlxsw/pci.h new file mode 100644 index 0000000..d655823 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.h @@ -0,0 +1,65 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/pci.h + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_PCI_H +#define _MLXSW_PCI_H + +#include + +#define PCI_DEVICE_ID_MELLANOX_SWITCHX2 0xc738 +#define PCI_DEVICE_ID_MELLANOX_SPECTRUM 0xcb84 +#define PCI_DEVICE_ID_MELLANOX_SWITCHIB 0xcb20 +#define PCI_DEVICE_ID_MELLANOX_SWITCHIB2 0xcf08 + +#if IS_ENABLED(CONFIG_MLXSW_PCI) + +int mlxsw_pci_driver_register(struct pci_driver *pci_driver); +void mlxsw_pci_driver_unregister(struct pci_driver *pci_driver); + +#else + +static inline int +mlxsw_pci_driver_register(struct pci_driver *pci_driver) +{ + return 0; +} + +static inline void +mlxsw_pci_driver_unregister(struct pci_driver *pci_driver) +{ +} + +#endif + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h new file mode 100644 index 0000000..fb082ad --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h @@ -0,0 +1,230 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/pci_hw.h + * Copyright (c) 2015-2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015-2016 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
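[Editor's note] The pci.h block above exports the PCI device IDs plus mlxsw_pci_driver_register()/mlxsw_pci_driver_unregister(), which fill in the probe/remove callbacks shown earlier before calling pci_register_driver(). Below is a minimal sketch of a consumer, assuming the standard PCI_VDEVICE()/PCI_VENDOR_ID_MELLANOX kernel definitions; the "example_spectrum" names are invented for illustration and a real switch driver would also register an mlxsw_driver with the core (not shown here).

#include <linux/module.h>
#include <linux/pci.h>

#include "pci.h"

static const struct pci_device_id example_spectrum_pci_id_table[] = {
        { PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_SPECTRUM), 0 },
        { 0, }
};

static struct pci_driver example_spectrum_pci_driver = {
        .name           = "example_spectrum",
        .id_table       = example_spectrum_pci_id_table,
        /* .probe/.remove are filled in by mlxsw_pci_driver_register() */
};

static int __init example_spectrum_init(void)
{
        return mlxsw_pci_driver_register(&example_spectrum_pci_driver);
}
module_init(example_spectrum_init);

static void __exit example_spectrum_exit(void)
{
        mlxsw_pci_driver_unregister(&example_spectrum_pci_driver);
}
module_exit(example_spectrum_exit);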
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_PCI_HW_H +#define _MLXSW_PCI_HW_H + +#include + +#include "item.h" + +#define MLXSW_PCI_BAR0_SIZE (1024 * 1024) /* 1MB */ +#define MLXSW_PCI_PAGE_SIZE 4096 + +#define MLXSW_PCI_CIR_BASE 0x71000 +#define MLXSW_PCI_CIR_IN_PARAM_HI MLXSW_PCI_CIR_BASE +#define MLXSW_PCI_CIR_IN_PARAM_LO (MLXSW_PCI_CIR_BASE + 0x04) +#define MLXSW_PCI_CIR_IN_MODIFIER (MLXSW_PCI_CIR_BASE + 0x08) +#define MLXSW_PCI_CIR_OUT_PARAM_HI (MLXSW_PCI_CIR_BASE + 0x0C) +#define MLXSW_PCI_CIR_OUT_PARAM_LO (MLXSW_PCI_CIR_BASE + 0x10) +#define MLXSW_PCI_CIR_TOKEN (MLXSW_PCI_CIR_BASE + 0x14) +#define MLXSW_PCI_CIR_CTRL (MLXSW_PCI_CIR_BASE + 0x18) +#define MLXSW_PCI_CIR_CTRL_GO_BIT BIT(23) +#define MLXSW_PCI_CIR_CTRL_EVREQ_BIT BIT(22) +#define MLXSW_PCI_CIR_CTRL_OPCODE_MOD_SHIFT 12 +#define MLXSW_PCI_CIR_CTRL_STATUS_SHIFT 24 +#define MLXSW_PCI_CIR_TIMEOUT_MSECS 1000 + +#define MLXSW_PCI_SW_RESET 0xF0010 +#define MLXSW_PCI_SW_RESET_RST_BIT BIT(0) +#define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS 5000 +#define MLXSW_PCI_SW_RESET_WAIT_MSECS 100 +#define MLXSW_PCI_FW_READY 0xA1844 +#define MLXSW_PCI_FW_READY_MASK 0xFFFF +#define MLXSW_PCI_FW_READY_MAGIC 0x5E + +#define MLXSW_PCI_DOORBELL_SDQ_OFFSET 0x000 +#define MLXSW_PCI_DOORBELL_RDQ_OFFSET 0x200 +#define MLXSW_PCI_DOORBELL_CQ_OFFSET 0x400 +#define MLXSW_PCI_DOORBELL_EQ_OFFSET 0x600 +#define MLXSW_PCI_DOORBELL_ARM_CQ_OFFSET 0x800 +#define MLXSW_PCI_DOORBELL_ARM_EQ_OFFSET 0xA00 + +#define MLXSW_PCI_DOORBELL(offset, type_offset, num) \ + ((offset) + (type_offset) + (num) * 4) + +#define MLXSW_PCI_CQS_MAX 96 +#define MLXSW_PCI_EQS_COUNT 2 +#define MLXSW_PCI_EQ_ASYNC_NUM 0 +#define MLXSW_PCI_EQ_COMP_NUM 1 + +#define MLXSW_PCI_AQ_PAGES 8 +#define MLXSW_PCI_AQ_SIZE (MLXSW_PCI_PAGE_SIZE * MLXSW_PCI_AQ_PAGES) +#define MLXSW_PCI_WQE_SIZE 32 /* 32 bytes per element */ +#define MLXSW_PCI_CQE_SIZE 16 /* 16 bytes per element */ +#define MLXSW_PCI_EQE_SIZE 16 /* 16 bytes per element */ +#define MLXSW_PCI_WQE_COUNT (MLXSW_PCI_AQ_SIZE / MLXSW_PCI_WQE_SIZE) +#define MLXSW_PCI_CQE_COUNT (MLXSW_PCI_AQ_SIZE / MLXSW_PCI_CQE_SIZE) +#define MLXSW_PCI_EQE_COUNT (MLXSW_PCI_AQ_SIZE / MLXSW_PCI_EQE_SIZE) +#define MLXSW_PCI_EQE_UPDATE_COUNT 0x80 + +#define MLXSW_PCI_WQE_SG_ENTRIES 3 +#define MLXSW_PCI_WQE_TYPE_ETHERNET 0xA + +/* pci_wqe_c + * If set it indicates that a completion should be reported upon + * execution of this descriptor. 
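[Editor's note] The constants above pin down the queue geometry: each asynchronous queue spans MLXSW_PCI_AQ_PAGES pages (32 KB), so an SDQ/RDQ holds 1024 32-byte WQEs while CQs and EQs hold 2048 16-byte elements, and a queue's doorbell sits at the QUERY_FW-reported doorbell page offset plus the per-type offset plus 4 bytes per queue number. A small standalone check of that arithmetic (a doorbell_offset of 0 is used purely for illustration):

#include <stdint.h>
#include <stdio.h>

#define MLXSW_PCI_PAGE_SIZE             4096
#define MLXSW_PCI_AQ_PAGES              8
#define MLXSW_PCI_AQ_SIZE               (MLXSW_PCI_PAGE_SIZE * MLXSW_PCI_AQ_PAGES)
#define MLXSW_PCI_WQE_SIZE              32
#define MLXSW_PCI_CQE_SIZE              16
#define MLXSW_PCI_DOORBELL_SDQ_OFFSET   0x000
#define MLXSW_PCI_DOORBELL_CQ_OFFSET    0x400
#define MLXSW_PCI_DOORBELL(offset, type_offset, num) \
        ((offset) + (type_offset) + (num) * 4)

int main(void)
{
        uint32_t doorbell_offset = 0;   /* really taken from QUERY_FW */

        printf("WQEs per SDQ/RDQ: %d\n", MLXSW_PCI_AQ_SIZE / MLXSW_PCI_WQE_SIZE);
        printf("CQEs per CQ:      %d\n", MLXSW_PCI_AQ_SIZE / MLXSW_PCI_CQE_SIZE);
        printf("SDQ 3 doorbell:   0x%x\n",
               MLXSW_PCI_DOORBELL(doorbell_offset,
                                  MLXSW_PCI_DOORBELL_SDQ_OFFSET, 3));
        printf("CQ 10 doorbell:   0x%x\n",
               MLXSW_PCI_DOORBELL(doorbell_offset,
                                  MLXSW_PCI_DOORBELL_CQ_OFFSET, 10));
        return 0;
}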
+ */
+MLXSW_ITEM32(pci, wqe, c, 0x00, 31, 1);
+
+/* pci_wqe_lp
+ * Local Processing, set if packet should be processed by the local
+ * switch hardware:
+ * For Ethernet EMAD (Direct Route and non Direct Route) -
+ * must be set if packet destination is local device
+ * For InfiniBand CTL - must be set if packet destination is local device
+ * Otherwise it must be clear
+ * Local Process packets must not exceed the size of 2K (including payload
+ * and headers).
+ */
+MLXSW_ITEM32(pci, wqe, lp, 0x00, 30, 1);
+
+/* pci_wqe_type
+ * Packet type.
+ */
+MLXSW_ITEM32(pci, wqe, type, 0x00, 23, 4);
+
+/* pci_wqe_byte_count
+ * Size of i-th scatter/gather entry, 0 if entry is unused.
+ */
+MLXSW_ITEM16_INDEXED(pci, wqe, byte_count, 0x02, 0, 14, 0x02, 0x00, false);
+
+/* pci_wqe_address
+ * Physical address of i-th scatter/gather entry.
+ * Gather Entries must be 2Byte aligned.
+ */
+MLXSW_ITEM64_INDEXED(pci, wqe, address, 0x08, 0, 64, 0x8, 0x0, false);
+
+/* pci_cqe_lag
+ * Packet arrives from a port which is a LAG
+ */
+MLXSW_ITEM32(pci, cqe, lag, 0x00, 23, 1);
+
+/* pci_cqe_system_port/lag_id
+ * When lag=0: System port on which the packet was received
+ * When lag=1:
+ * bits [15:4] LAG ID on which the packet was received
+ * bits [3:0] sub_port on which the packet was received
+ */
+MLXSW_ITEM32(pci, cqe, system_port, 0x00, 0, 16);
+MLXSW_ITEM32(pci, cqe, lag_id, 0x00, 4, 12);
+MLXSW_ITEM32(pci, cqe, lag_port_index, 0x00, 0, 4);
+
+/* pci_cqe_wqe_counter
+ * WQE count of the WQEs completed on the associated dqn
+ */
+MLXSW_ITEM32(pci, cqe, wqe_counter, 0x04, 16, 16);
+
+/* pci_cqe_byte_count
+ * Byte count of received packets including additional two
+ * Reserved Bytes that are appended to the end of the frame.
+ * Reserved for Send CQE.
+ */
+MLXSW_ITEM32(pci, cqe, byte_count, 0x04, 0, 14);
+
+/* pci_cqe_trap_id
+ * Trap ID that captured the packet.
+ */
+MLXSW_ITEM32(pci, cqe, trap_id, 0x08, 0, 9);
+
+/* pci_cqe_crc
+ * Length includes CRC. Indicates the length field includes
+ * the packet's CRC.
+ */
+MLXSW_ITEM32(pci, cqe, crc, 0x0C, 8, 1);
+
+/* pci_cqe_e
+ * CQE with Error.
+ */
+MLXSW_ITEM32(pci, cqe, e, 0x0C, 7, 1);
+
+/* pci_cqe_sr
+ * 1 - Send Queue
+ * 0 - Receive Queue
+ */
+MLXSW_ITEM32(pci, cqe, sr, 0x0C, 6, 1);
+
+/* pci_cqe_dqn
+ * Descriptor Queue (DQ) Number.
+ */
+MLXSW_ITEM32(pci, cqe, dqn, 0x0C, 1, 5);
+
+/* pci_cqe_owner
+ * Ownership bit.
+ */
+MLXSW_ITEM32(pci, cqe, owner, 0x0C, 0, 1);
+
+/* pci_eqe_event_type
+ * Event type.
+ */
+MLXSW_ITEM32(pci, eqe, event_type, 0x0C, 24, 8);
+#define MLXSW_PCI_EQE_EVENT_TYPE_COMP 0x00
+#define MLXSW_PCI_EQE_EVENT_TYPE_CMD 0x0A
+
+/* pci_eqe_event_sub_type
+ * Event sub-type.
+ */
+MLXSW_ITEM32(pci, eqe, event_sub_type, 0x0C, 16, 8);
+
+/* pci_eqe_cqn
+ * Completion Queue that triggered this EQE.
+ */
+MLXSW_ITEM32(pci, eqe, cqn, 0x0C, 8, 7);
+
+/* pci_eqe_owner
+ * Ownership bit.
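[Editor's note] Each MLXSW_ITEM32(pci, cqe, <field>, <offset>, <lsb>, <width>) line above expands, via item.h, into mlxsw_pci_cqe_<field>_get()/_set() helpers over the raw 16-byte CQE, the same naming convention already used for the WQE fields in pci.c (mlxsw_pci_wqe_c_set() and friends). As a rough standalone approximation of what such a generated getter does, here is trap_id (offset 0x08, bits [8:0]), assuming the usual big-endian layout of these descriptors; the real accessor comes from item.h, not from this sketch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>  /* ntohl() */

/* Approximation of MLXSW_ITEM32(pci, cqe, trap_id, 0x08, 0, 9). */
static uint32_t example_pci_cqe_trap_id_get(const char *cqe)
{
        uint32_t word;

        memcpy(&word, cqe + 0x08, sizeof(word));
        word = ntohl(word);                     /* descriptor words are big-endian */
        return (word >> 0) & ((1U << 9) - 1);   /* extract bits [8:0] */
}

int main(void)
{
        char cqe[16] = {0};

        cqe[0x0B] = 0x51;       /* big-endian word at 0x08 becomes 0x00000051 */
        printf("trap_id = 0x%x\n", (unsigned)example_pci_cqe_trap_id_get(cqe));
        return 0;
}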
+ */ +MLXSW_ITEM32(pci, eqe, owner, 0x0C, 0, 1); + +/* pci_eqe_cmd_token + * Command completion event - token + */ +MLXSW_ITEM32(pci, eqe, cmd_token, 0x00, 16, 16); + +/* pci_eqe_cmd_status + * Command completion event - status + */ +MLXSW_ITEM32(pci, eqe, cmd_status, 0x00, 0, 8); + +/* pci_eqe_cmd_out_param_h + * Command completion event - output parameter - higher part + */ +MLXSW_ITEM32(pci, eqe, cmd_out_param_h, 0x04, 0, 32); + +/* pci_eqe_cmd_out_param_l + * Command completion event - output parameter - lower part + */ +MLXSW_ITEM32(pci, eqe, cmd_out_param_l, 0x08, 0, 32); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/port.h b/drivers/net/ethernet/mellanox/mlxsw/port.h new file mode 100644 index 0000000..c580abb --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/port.h @@ -0,0 +1,74 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/port.h + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015 Elad Raz + * Copyright (c) 2015 Jiri Pirko + * Copyright (c) 2015 Ido Schimmel + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _MLXSW_PORT_H +#define _MLXSW_PORT_H + +#include + +#define MLXSW_PORT_MAX_MTU 10000 + +#define MLXSW_PORT_DEFAULT_VID 1 + +#define MLXSW_PORT_SWID_DISABLED_PORT 255 +#define MLXSW_PORT_SWID_ALL_SWIDS 254 +#define MLXSW_PORT_SWID_TYPE_IB 1 +#define MLXSW_PORT_SWID_TYPE_ETH 2 + +#define MLXSW_PORT_MID 0xd000 + +#define MLXSW_PORT_MAX_IB_PHY_PORTS 36 +#define MLXSW_PORT_MAX_IB_PORTS (MLXSW_PORT_MAX_IB_PHY_PORTS + 1) + +#define MLXSW_PORT_CPU_PORT 0x0 + +#define MLXSW_PORT_DONT_CARE 0xFF + +#define MLXSW_PORT_MODULE_MAX_WIDTH 4 + +enum mlxsw_port_admin_status { + MLXSW_PORT_ADMIN_STATUS_UP = 1, + MLXSW_PORT_ADMIN_STATUS_DOWN = 2, + MLXSW_PORT_ADMIN_STATUS_UP_ONCE = 3, + MLXSW_PORT_ADMIN_STATUS_DISABLED = 4, +}; + +enum mlxsw_reg_pude_oper_status { + MLXSW_PORT_OPER_STATUS_UP = 1, + MLXSW_PORT_OPER_STATUS_DOWN = 2, + MLXSW_PORT_OPER_STATUS_FAILURE = 4, /* Can be set to up again. */ +}; + +#endif /* _MLXSW_PORT_H */ diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h new file mode 100644 index 0000000..6218231 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -0,0 +1,7962 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/reg.h + * Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015-2016 Ido Schimmel + * Copyright (c) 2015 Elad Raz + * Copyright (c) 2015-2017 Jiri Pirko + * Copyright (c) 2016 Yotam Gigi + * Copyright (c) 2017-2018 Petr Machata + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _MLXSW_REG_H +#define _MLXSW_REG_H + +#include +#include +#include + +#include "item.h" +#include "port.h" + +struct mlxsw_reg_info { + u16 id; + u16 len; /* In u8 */ + const char *name; +}; + +#define MLXSW_REG_DEFINE(_name, _id, _len) \ +static const struct mlxsw_reg_info mlxsw_reg_##_name = { \ + .id = _id, \ + .len = _len, \ + .name = #_name, \ +} + +#define MLXSW_REG(type) (&mlxsw_reg_##type) +#define MLXSW_REG_LEN(type) MLXSW_REG(type)->len +#define MLXSW_REG_ZERO(type, payload) memset(payload, 0, MLXSW_REG(type)->len) + +/* SGCR - Switch General Configuration Register + * -------------------------------------------- + * This register is used for configuration of the switch capabilities. + */ +#define MLXSW_REG_SGCR_ID 0x2000 +#define MLXSW_REG_SGCR_LEN 0x10 + +MLXSW_REG_DEFINE(sgcr, MLXSW_REG_SGCR_ID, MLXSW_REG_SGCR_LEN); + +/* reg_sgcr_llb + * Link Local Broadcast (Default=0) + * When set, all Link Local packets (224.0.0.X) will be treated as broadcast + * packets and ignore the IGMP snooping entries. + * Access: RW + */ +MLXSW_ITEM32(reg, sgcr, llb, 0x04, 0, 1); + +static inline void mlxsw_reg_sgcr_pack(char *payload, bool llb) +{ + MLXSW_REG_ZERO(sgcr, payload); + mlxsw_reg_sgcr_llb_set(payload, !!llb); +} + +/* SPAD - Switch Physical Address Register + * --------------------------------------- + * The SPAD register configures the switch physical MAC address. + */ +#define MLXSW_REG_SPAD_ID 0x2002 +#define MLXSW_REG_SPAD_LEN 0x10 + +MLXSW_REG_DEFINE(spad, MLXSW_REG_SPAD_ID, MLXSW_REG_SPAD_LEN); + +/* reg_spad_base_mac + * Base MAC address for the switch partitions. + * Per switch partition MAC address is equal to: + * base_mac + swid + * Access: RW + */ +MLXSW_ITEM_BUF(reg, spad, base_mac, 0x02, 6); + +/* SMID - Switch Multicast ID + * -------------------------- + * The MID record maps from a MID (Multicast ID), which is a unique identifier + * of the multicast group within the stacking domain, into a list of local + * ports into which the packet is replicated. + */ +#define MLXSW_REG_SMID_ID 0x2007 +#define MLXSW_REG_SMID_LEN 0x240 + +MLXSW_REG_DEFINE(smid, MLXSW_REG_SMID_ID, MLXSW_REG_SMID_LEN); + +/* reg_smid_swid + * Switch partition ID. + * Access: Index + */ +MLXSW_ITEM32(reg, smid, swid, 0x00, 24, 8); + +/* reg_smid_mid + * Multicast identifier - global identifier that represents the multicast group + * across all devices. + * Access: Index + */ +MLXSW_ITEM32(reg, smid, mid, 0x00, 0, 16); + +/* reg_smid_port + * Local port memebership (1 bit per port). + * Access: RW + */ +MLXSW_ITEM_BIT_ARRAY(reg, smid, port, 0x20, 0x20, 1); + +/* reg_smid_port_mask + * Local port mask (1 bit per port). + * Access: W + */ +MLXSW_ITEM_BIT_ARRAY(reg, smid, port_mask, 0x220, 0x20, 1); + +static inline void mlxsw_reg_smid_pack(char *payload, u16 mid, + u8 port, bool set) +{ + MLXSW_REG_ZERO(smid, payload); + mlxsw_reg_smid_swid_set(payload, 0); + mlxsw_reg_smid_mid_set(payload, mid); + mlxsw_reg_smid_port_set(payload, port, set); + mlxsw_reg_smid_port_mask_set(payload, port, 1); +} + +/* SSPR - Switch System Port Record Register + * ----------------------------------------- + * Configures the system port to local port mapping. + */ +#define MLXSW_REG_SSPR_ID 0x2008 +#define MLXSW_REG_SSPR_LEN 0x8 + +MLXSW_REG_DEFINE(sspr, MLXSW_REG_SSPR_ID, MLXSW_REG_SSPR_LEN); + +/* reg_sspr_m + * Master - if set, then the record describes the master system port. + * This is needed in case a local port is mapped into several system ports + * (for multipathing). 
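[Editor's note] Every register in this file follows the same pattern: MLXSW_REG_DEFINE() records the register ID and payload length, the MLXSW_ITEM*() lines generate field accessors, and a small inline *_pack() helper zeroes the payload and fills in the fields a caller normally cares about. A usage sketch for SGCR, where example_emit_reg() is a stand-in for whatever core routine actually issues the access-register command (it is not part of this excerpt):

/* Sketch only: enable link-local broadcast handling via SGCR. */
static int example_enable_llb(struct mlxsw_core *core)
{
        char sgcr_pl[MLXSW_REG_SGCR_LEN];

        mlxsw_reg_sgcr_pack(sgcr_pl, true);
        /* example_emit_reg() is a hypothetical placeholder for the core's
         * register write path.
         */
        return example_emit_reg(core, MLXSW_REG(sgcr), sgcr_pl);
}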
That number will be reported as the source system + * port when packets are forwarded to the CPU. Only one master port is allowed + * per local port. + * + * Note: Must be set for Spectrum. + * Access: RW + */ +MLXSW_ITEM32(reg, sspr, m, 0x00, 31, 1); + +/* reg_sspr_local_port + * Local port number. + * + * Access: RW + */ +MLXSW_ITEM32(reg, sspr, local_port, 0x00, 16, 8); + +/* reg_sspr_sub_port + * Virtual port within the physical port. + * Should be set to 0 when virtual ports are not enabled on the port. + * + * Access: RW + */ +MLXSW_ITEM32(reg, sspr, sub_port, 0x00, 8, 8); + +/* reg_sspr_system_port + * Unique identifier within the stacking domain that represents all the ports + * that are available in the system (external ports). + * + * Currently, only single-ASIC configurations are supported, so we default to + * 1:1 mapping between system ports and local ports. + * Access: Index + */ +MLXSW_ITEM32(reg, sspr, system_port, 0x04, 0, 16); + +static inline void mlxsw_reg_sspr_pack(char *payload, u8 local_port) +{ + MLXSW_REG_ZERO(sspr, payload); + mlxsw_reg_sspr_m_set(payload, 1); + mlxsw_reg_sspr_local_port_set(payload, local_port); + mlxsw_reg_sspr_sub_port_set(payload, 0); + mlxsw_reg_sspr_system_port_set(payload, local_port); +} + +/* SFDAT - Switch Filtering Database Aging Time + * -------------------------------------------- + * Controls the Switch aging time. Aging time is able to be set per Switch + * Partition. + */ +#define MLXSW_REG_SFDAT_ID 0x2009 +#define MLXSW_REG_SFDAT_LEN 0x8 + +MLXSW_REG_DEFINE(sfdat, MLXSW_REG_SFDAT_ID, MLXSW_REG_SFDAT_LEN); + +/* reg_sfdat_swid + * Switch partition ID. + * Access: Index + */ +MLXSW_ITEM32(reg, sfdat, swid, 0x00, 24, 8); + +/* reg_sfdat_age_time + * Aging time in seconds + * Min - 10 seconds + * Max - 1,000,000 seconds + * Default is 300 seconds. + * Access: RW + */ +MLXSW_ITEM32(reg, sfdat, age_time, 0x04, 0, 20); + +static inline void mlxsw_reg_sfdat_pack(char *payload, u32 age_time) +{ + MLXSW_REG_ZERO(sfdat, payload); + mlxsw_reg_sfdat_swid_set(payload, 0); + mlxsw_reg_sfdat_age_time_set(payload, age_time); +} + +/* SFD - Switch Filtering Database + * ------------------------------- + * The following register defines the access to the filtering database. + * The register supports querying, adding, removing and modifying the database. + * The access is optimized for bulk updates in which case more than one + * FDB record is present in the same command. + */ +#define MLXSW_REG_SFD_ID 0x200A +#define MLXSW_REG_SFD_BASE_LEN 0x10 /* base length, without records */ +#define MLXSW_REG_SFD_REC_LEN 0x10 /* record length */ +#define MLXSW_REG_SFD_REC_MAX_COUNT 64 +#define MLXSW_REG_SFD_LEN (MLXSW_REG_SFD_BASE_LEN + \ + MLXSW_REG_SFD_REC_LEN * MLXSW_REG_SFD_REC_MAX_COUNT) + +MLXSW_REG_DEFINE(sfd, MLXSW_REG_SFD_ID, MLXSW_REG_SFD_LEN); + +/* reg_sfd_swid + * Switch partition ID for queries. Reserved on Write. + * Access: Index + */ +MLXSW_ITEM32(reg, sfd, swid, 0x00, 24, 8); + +enum mlxsw_reg_sfd_op { + /* Dump entire FDB a (process according to record_locator) */ + MLXSW_REG_SFD_OP_QUERY_DUMP = 0, + /* Query records by {MAC, VID/FID} value */ + MLXSW_REG_SFD_OP_QUERY_QUERY = 1, + /* Query and clear activity. Query records by {MAC, VID/FID} value */ + MLXSW_REG_SFD_OP_QUERY_QUERY_AND_CLEAR_ACTIVITY = 2, + /* Test. Response indicates if each of the records could be + * added to the FDB. + */ + MLXSW_REG_SFD_OP_WRITE_TEST = 0, + /* Add/modify. Aged-out records cannot be added. 
This command removes + * the learning notification of the {MAC, VID/FID}. Response includes + * the entries that were added to the FDB. + */ + MLXSW_REG_SFD_OP_WRITE_EDIT = 1, + /* Remove record by {MAC, VID/FID}. This command also removes + * the learning notification and aged-out notifications + * of the {MAC, VID/FID}. The response provides current (pre-removal) + * entries as non-aged-out. + */ + MLXSW_REG_SFD_OP_WRITE_REMOVE = 2, + /* Remove learned notification by {MAC, VID/FID}. The response provides + * the removed learning notification. + */ + MLXSW_REG_SFD_OP_WRITE_REMOVE_NOTIFICATION = 2, +}; + +/* reg_sfd_op + * Operation. + * Access: OP + */ +MLXSW_ITEM32(reg, sfd, op, 0x04, 30, 2); + +/* reg_sfd_record_locator + * Used for querying the FDB. Use record_locator=0 to initiate the + * query. When a record is returned, a new record_locator is + * returned to be used in the subsequent query. + * Reserved for database update. + * Access: Index + */ +MLXSW_ITEM32(reg, sfd, record_locator, 0x04, 0, 30); + +/* reg_sfd_num_rec + * Request: Number of records to read/add/modify/remove + * Response: Number of records read/added/replaced/removed + * See above description for more details. + * Ranges 0..64 + * Access: RW + */ +MLXSW_ITEM32(reg, sfd, num_rec, 0x08, 0, 8); + +static inline void mlxsw_reg_sfd_pack(char *payload, enum mlxsw_reg_sfd_op op, + u32 record_locator) +{ + MLXSW_REG_ZERO(sfd, payload); + mlxsw_reg_sfd_op_set(payload, op); + mlxsw_reg_sfd_record_locator_set(payload, record_locator); +} + +/* reg_sfd_rec_swid + * Switch partition ID. + * Access: Index + */ +MLXSW_ITEM32_INDEXED(reg, sfd, rec_swid, MLXSW_REG_SFD_BASE_LEN, 24, 8, + MLXSW_REG_SFD_REC_LEN, 0x00, false); + +enum mlxsw_reg_sfd_rec_type { + MLXSW_REG_SFD_REC_TYPE_UNICAST = 0x0, + MLXSW_REG_SFD_REC_TYPE_UNICAST_LAG = 0x1, + MLXSW_REG_SFD_REC_TYPE_MULTICAST = 0x2, +}; + +/* reg_sfd_rec_type + * FDB record type. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, sfd, rec_type, MLXSW_REG_SFD_BASE_LEN, 20, 4, + MLXSW_REG_SFD_REC_LEN, 0x00, false); + +enum mlxsw_reg_sfd_rec_policy { + /* Replacement disabled, aging disabled. */ + MLXSW_REG_SFD_REC_POLICY_STATIC_ENTRY = 0, + /* (mlag remote): Replacement enabled, aging disabled, + * learning notification enabled on this port. + */ + MLXSW_REG_SFD_REC_POLICY_DYNAMIC_ENTRY_MLAG = 1, + /* (ingress device): Replacement enabled, aging enabled. */ + MLXSW_REG_SFD_REC_POLICY_DYNAMIC_ENTRY_INGRESS = 3, +}; + +/* reg_sfd_rec_policy + * Policy. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, sfd, rec_policy, MLXSW_REG_SFD_BASE_LEN, 18, 2, + MLXSW_REG_SFD_REC_LEN, 0x00, false); + +/* reg_sfd_rec_a + * Activity. Set for new static entries. Set for static entries if a frame SMAC + * lookup hits on the entry. + * To clear the a bit, use "query and clear activity" op. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, sfd, rec_a, MLXSW_REG_SFD_BASE_LEN, 16, 1, + MLXSW_REG_SFD_REC_LEN, 0x00, false); + +/* reg_sfd_rec_mac + * MAC address. 
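[Editor's note] The SFD payload is a 0x10-byte header followed by up to 64 records of 0x10 bytes each, so every MLXSW_ITEM32_INDEXED() above resolves record i's field at the base length plus i times the record length plus the field's offset inside the record. A standalone check of that arithmetic, using the lengths defined above:

#include <stdio.h>

#define MLXSW_REG_SFD_BASE_LEN  0x10
#define MLXSW_REG_SFD_REC_LEN   0x10

/* Byte offset of a per-record field, as MLXSW_ITEM32_INDEXED() computes it. */
static unsigned int sfd_rec_field_offset(int rec_index, unsigned int field_off)
{
        return MLXSW_REG_SFD_BASE_LEN + rec_index * MLXSW_REG_SFD_REC_LEN +
               field_off;
}

int main(void)
{
        /* rec_type lives in the first word of each record (offset 0x00),
         * rec_action in the last one (offset 0x0C), per the items above.
         */
        printf("record 0 type word at 0x%x\n", sfd_rec_field_offset(0, 0x00));
        printf("record 2 action word at 0x%x\n", sfd_rec_field_offset(2, 0x0C));
        return 0;
}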
+ * Access: Index + */ +MLXSW_ITEM_BUF_INDEXED(reg, sfd, rec_mac, MLXSW_REG_SFD_BASE_LEN, 6, + MLXSW_REG_SFD_REC_LEN, 0x02); + +enum mlxsw_reg_sfd_rec_action { + /* forward */ + MLXSW_REG_SFD_REC_ACTION_NOP = 0, + /* forward and trap, trap_id is FDB_TRAP */ + MLXSW_REG_SFD_REC_ACTION_MIRROR_TO_CPU = 1, + /* trap and do not forward, trap_id is FDB_TRAP */ + MLXSW_REG_SFD_REC_ACTION_TRAP = 2, + /* forward to IP router */ + MLXSW_REG_SFD_REC_ACTION_FORWARD_IP_ROUTER = 3, + MLXSW_REG_SFD_REC_ACTION_DISCARD_ERROR = 15, +}; + +/* reg_sfd_rec_action + * Action to apply on the packet. + * Note: Dynamic entries can only be configured with NOP action. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, sfd, rec_action, MLXSW_REG_SFD_BASE_LEN, 28, 4, + MLXSW_REG_SFD_REC_LEN, 0x0C, false); + +/* reg_sfd_uc_sub_port + * VEPA channel on local port. + * Valid only if local port is a non-stacking port. Must be 0 if multichannel + * VEPA is not enabled. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, sfd, uc_sub_port, MLXSW_REG_SFD_BASE_LEN, 16, 8, + MLXSW_REG_SFD_REC_LEN, 0x08, false); + +/* reg_sfd_uc_fid_vid + * Filtering ID or VLAN ID + * For SwitchX and SwitchX-2: + * - Dynamic entries (policy 2,3) use FID + * - Static entries (policy 0) use VID + * - When independent learning is configured, VID=FID + * For Spectrum: use FID for both Dynamic and Static entries. + * VID should not be used. + * Access: Index + */ +MLXSW_ITEM32_INDEXED(reg, sfd, uc_fid_vid, MLXSW_REG_SFD_BASE_LEN, 0, 16, + MLXSW_REG_SFD_REC_LEN, 0x08, false); + +/* reg_sfd_uc_system_port + * Unique port identifier for the final destination of the packet. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, sfd, uc_system_port, MLXSW_REG_SFD_BASE_LEN, 0, 16, + MLXSW_REG_SFD_REC_LEN, 0x0C, false); + +static inline void mlxsw_reg_sfd_rec_pack(char *payload, int rec_index, + enum mlxsw_reg_sfd_rec_type rec_type, + const char *mac, + enum mlxsw_reg_sfd_rec_action action) +{ + u8 num_rec = mlxsw_reg_sfd_num_rec_get(payload); + + if (rec_index >= num_rec) + mlxsw_reg_sfd_num_rec_set(payload, rec_index + 1); + mlxsw_reg_sfd_rec_swid_set(payload, rec_index, 0); + mlxsw_reg_sfd_rec_type_set(payload, rec_index, rec_type); + mlxsw_reg_sfd_rec_mac_memcpy_to(payload, rec_index, mac); + mlxsw_reg_sfd_rec_action_set(payload, rec_index, action); +} + +static inline void mlxsw_reg_sfd_uc_pack(char *payload, int rec_index, + enum mlxsw_reg_sfd_rec_policy policy, + const char *mac, u16 fid_vid, + enum mlxsw_reg_sfd_rec_action action, + u8 local_port) +{ + mlxsw_reg_sfd_rec_pack(payload, rec_index, + MLXSW_REG_SFD_REC_TYPE_UNICAST, mac, action); + mlxsw_reg_sfd_rec_policy_set(payload, rec_index, policy); + mlxsw_reg_sfd_uc_sub_port_set(payload, rec_index, 0); + mlxsw_reg_sfd_uc_fid_vid_set(payload, rec_index, fid_vid); + mlxsw_reg_sfd_uc_system_port_set(payload, rec_index, local_port); +} + +static inline void mlxsw_reg_sfd_uc_unpack(char *payload, int rec_index, + char *mac, u16 *p_fid_vid, + u8 *p_local_port) +{ + mlxsw_reg_sfd_rec_mac_memcpy_from(payload, rec_index, mac); + *p_fid_vid = mlxsw_reg_sfd_uc_fid_vid_get(payload, rec_index); + *p_local_port = mlxsw_reg_sfd_uc_system_port_get(payload, rec_index); +} + +/* reg_sfd_uc_lag_sub_port + * LAG sub port. + * Must be 0 if multichannel VEPA is not enabled. 
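[Editor's note] Putting the helpers above together, installing one static unicast FDB entry means packing the register header with the edit op and then a single unicast record. A sketch follows; example_emit_reg() is again a hypothetical placeholder for the core's register write path, and the payload is allocated because MLXSW_REG_SFD_LEN is just over 1 KB.

/* Sketch only: install MAC "mac" on FID "fid" pointing at "local_port". */
static int example_fdb_add_static(struct mlxsw_core *core, const char *mac,
                                  u16 fid, u8 local_port)
{
        char *sfd_pl;
        int err;

        sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL);
        if (!sfd_pl)
                return -ENOMEM;

        mlxsw_reg_sfd_pack(sfd_pl, MLXSW_REG_SFD_OP_WRITE_EDIT, 0);
        mlxsw_reg_sfd_uc_pack(sfd_pl, 0, MLXSW_REG_SFD_REC_POLICY_STATIC_ENTRY,
                              mac, fid, MLXSW_REG_SFD_REC_ACTION_NOP,
                              local_port);
        err = example_emit_reg(core, MLXSW_REG(sfd), sfd_pl); /* placeholder */
        kfree(sfd_pl);
        return err;
}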
+ * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, sfd, uc_lag_sub_port, MLXSW_REG_SFD_BASE_LEN, 16, 8, + MLXSW_REG_SFD_REC_LEN, 0x08, false); + +/* reg_sfd_uc_lag_fid_vid + * Filtering ID or VLAN ID + * For SwitchX and SwitchX-2: + * - Dynamic entries (policy 2,3) use FID + * - Static entries (policy 0) use VID + * - When independent learning is configured, VID=FID + * For Spectrum: use FID for both Dynamic and Static entries. + * VID should not be used. + * Access: Index + */ +MLXSW_ITEM32_INDEXED(reg, sfd, uc_lag_fid_vid, MLXSW_REG_SFD_BASE_LEN, 0, 16, + MLXSW_REG_SFD_REC_LEN, 0x08, false); + +/* reg_sfd_uc_lag_lag_vid + * Indicates VID in case of vFIDs. Reserved for FIDs. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, sfd, uc_lag_lag_vid, MLXSW_REG_SFD_BASE_LEN, 16, 12, + MLXSW_REG_SFD_REC_LEN, 0x0C, false); + +/* reg_sfd_uc_lag_lag_id + * LAG Identifier - pointer into the LAG descriptor table. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, sfd, uc_lag_lag_id, MLXSW_REG_SFD_BASE_LEN, 0, 10, + MLXSW_REG_SFD_REC_LEN, 0x0C, false); + +static inline void +mlxsw_reg_sfd_uc_lag_pack(char *payload, int rec_index, + enum mlxsw_reg_sfd_rec_policy policy, + const char *mac, u16 fid_vid, + enum mlxsw_reg_sfd_rec_action action, u16 lag_vid, + u16 lag_id) +{ + mlxsw_reg_sfd_rec_pack(payload, rec_index, + MLXSW_REG_SFD_REC_TYPE_UNICAST_LAG, + mac, action); + mlxsw_reg_sfd_rec_policy_set(payload, rec_index, policy); + mlxsw_reg_sfd_uc_lag_sub_port_set(payload, rec_index, 0); + mlxsw_reg_sfd_uc_lag_fid_vid_set(payload, rec_index, fid_vid); + mlxsw_reg_sfd_uc_lag_lag_vid_set(payload, rec_index, lag_vid); + mlxsw_reg_sfd_uc_lag_lag_id_set(payload, rec_index, lag_id); +} + +static inline void mlxsw_reg_sfd_uc_lag_unpack(char *payload, int rec_index, + char *mac, u16 *p_vid, + u16 *p_lag_id) +{ + mlxsw_reg_sfd_rec_mac_memcpy_from(payload, rec_index, mac); + *p_vid = mlxsw_reg_sfd_uc_lag_fid_vid_get(payload, rec_index); + *p_lag_id = mlxsw_reg_sfd_uc_lag_lag_id_get(payload, rec_index); +} + +/* reg_sfd_mc_pgi + * + * Multicast port group index - index into the port group table. + * Value 0x1FFF indicates the pgi should point to the MID entry. + * For Spectrum this value must be set to 0x1FFF + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, sfd, mc_pgi, MLXSW_REG_SFD_BASE_LEN, 16, 13, + MLXSW_REG_SFD_REC_LEN, 0x08, false); + +/* reg_sfd_mc_fid_vid + * + * Filtering ID or VLAN ID + * Access: Index + */ +MLXSW_ITEM32_INDEXED(reg, sfd, mc_fid_vid, MLXSW_REG_SFD_BASE_LEN, 0, 16, + MLXSW_REG_SFD_REC_LEN, 0x08, false); + +/* reg_sfd_mc_mid + * + * Multicast identifier - global identifier that represents the multicast + * group across all devices. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, sfd, mc_mid, MLXSW_REG_SFD_BASE_LEN, 0, 16, + MLXSW_REG_SFD_REC_LEN, 0x0C, false); + +static inline void +mlxsw_reg_sfd_mc_pack(char *payload, int rec_index, + const char *mac, u16 fid_vid, + enum mlxsw_reg_sfd_rec_action action, u16 mid) +{ + mlxsw_reg_sfd_rec_pack(payload, rec_index, + MLXSW_REG_SFD_REC_TYPE_MULTICAST, mac, action); + mlxsw_reg_sfd_mc_pgi_set(payload, rec_index, 0x1FFF); + mlxsw_reg_sfd_mc_fid_vid_set(payload, rec_index, fid_vid); + mlxsw_reg_sfd_mc_mid_set(payload, rec_index, mid); +} + +/* SFN - Switch FDB Notification Register + * ------------------------------------------- + * The switch provides notifications on newly learned FDB entries and + * aged out entries. The notifications can be polled by software. 
+ */ +#define MLXSW_REG_SFN_ID 0x200B +#define MLXSW_REG_SFN_BASE_LEN 0x10 /* base length, without records */ +#define MLXSW_REG_SFN_REC_LEN 0x10 /* record length */ +#define MLXSW_REG_SFN_REC_MAX_COUNT 64 +#define MLXSW_REG_SFN_LEN (MLXSW_REG_SFN_BASE_LEN + \ + MLXSW_REG_SFN_REC_LEN * MLXSW_REG_SFN_REC_MAX_COUNT) + +MLXSW_REG_DEFINE(sfn, MLXSW_REG_SFN_ID, MLXSW_REG_SFN_LEN); + +/* reg_sfn_swid + * Switch partition ID. + * Access: Index + */ +MLXSW_ITEM32(reg, sfn, swid, 0x00, 24, 8); + +/* reg_sfn_end + * Forces the current session to end. + * Access: OP + */ +MLXSW_ITEM32(reg, sfn, end, 0x04, 20, 1); + +/* reg_sfn_num_rec + * Request: Number of learned notifications and aged-out notification + * records requested. + * Response: Number of notification records returned (must be smaller + * than or equal to the value requested) + * Ranges 0..64 + * Access: OP + */ +MLXSW_ITEM32(reg, sfn, num_rec, 0x04, 0, 8); + +static inline void mlxsw_reg_sfn_pack(char *payload) +{ + MLXSW_REG_ZERO(sfn, payload); + mlxsw_reg_sfn_swid_set(payload, 0); + mlxsw_reg_sfn_end_set(payload, 1); + mlxsw_reg_sfn_num_rec_set(payload, MLXSW_REG_SFN_REC_MAX_COUNT); +} + +/* reg_sfn_rec_swid + * Switch partition ID. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, sfn, rec_swid, MLXSW_REG_SFN_BASE_LEN, 24, 8, + MLXSW_REG_SFN_REC_LEN, 0x00, false); + +enum mlxsw_reg_sfn_rec_type { + /* MAC addresses learned on a regular port. */ + MLXSW_REG_SFN_REC_TYPE_LEARNED_MAC = 0x5, + /* MAC addresses learned on a LAG port. */ + MLXSW_REG_SFN_REC_TYPE_LEARNED_MAC_LAG = 0x6, + /* Aged-out MAC address on a regular port. */ + MLXSW_REG_SFN_REC_TYPE_AGED_OUT_MAC = 0x7, + /* Aged-out MAC address on a LAG port. */ + MLXSW_REG_SFN_REC_TYPE_AGED_OUT_MAC_LAG = 0x8, +}; + +/* reg_sfn_rec_type + * Notification record type. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, sfn, rec_type, MLXSW_REG_SFN_BASE_LEN, 20, 4, + MLXSW_REG_SFN_REC_LEN, 0x00, false); + +/* reg_sfn_rec_mac + * MAC address. + * Access: RO + */ +MLXSW_ITEM_BUF_INDEXED(reg, sfn, rec_mac, MLXSW_REG_SFN_BASE_LEN, 6, + MLXSW_REG_SFN_REC_LEN, 0x02); + +/* reg_sfn_mac_sub_port + * VEPA channel on the local port. + * 0 if multichannel VEPA is not enabled. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, sfn, mac_sub_port, MLXSW_REG_SFN_BASE_LEN, 16, 8, + MLXSW_REG_SFN_REC_LEN, 0x08, false); + +/* reg_sfn_mac_fid + * Filtering identifier. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, sfn, mac_fid, MLXSW_REG_SFN_BASE_LEN, 0, 16, + MLXSW_REG_SFN_REC_LEN, 0x08, false); + +/* reg_sfn_mac_system_port + * Unique port identifier for the final destination of the packet. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, sfn, mac_system_port, MLXSW_REG_SFN_BASE_LEN, 0, 16, + MLXSW_REG_SFN_REC_LEN, 0x0C, false); + +static inline void mlxsw_reg_sfn_mac_unpack(char *payload, int rec_index, + char *mac, u16 *p_vid, + u8 *p_local_port) +{ + mlxsw_reg_sfn_rec_mac_memcpy_from(payload, rec_index, mac); + *p_vid = mlxsw_reg_sfn_mac_fid_get(payload, rec_index); + *p_local_port = mlxsw_reg_sfn_mac_system_port_get(payload, rec_index); +} + +/* reg_sfn_mac_lag_lag_id + * LAG ID (pointer into the LAG descriptor table). 
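[Editor's note] SFN is the polling side of FDB learning: software packs a request for up to 64 records, queries the register, then walks the returned records, switching on rec_type and unpacking the MAC/FID/port of each learned or aged-out entry. A sketch of that loop, assuming the mlxsw_reg_sfn_num_rec_get()/mlxsw_reg_sfn_rec_type_get() accessors generated by the items above and a hypothetical example_query_reg() for the core's register read path:

/* Sketch only: drain one batch of FDB notifications. */
static void example_fdb_notify(struct mlxsw_core *core)
{
        char mac[ETH_ALEN];
        char *sfn_pl;
        u16 fid;
        u8 local_port;
        u8 num_rec;
        int i;

        sfn_pl = kmalloc(MLXSW_REG_SFN_LEN, GFP_KERNEL);
        if (!sfn_pl)
                return;

        mlxsw_reg_sfn_pack(sfn_pl);
        if (example_query_reg(core, MLXSW_REG(sfn), sfn_pl)) /* placeholder */
                goto out;

        num_rec = mlxsw_reg_sfn_num_rec_get(sfn_pl);
        for (i = 0; i < num_rec; i++) {
                switch (mlxsw_reg_sfn_rec_type_get(sfn_pl, i)) {
                case MLXSW_REG_SFN_REC_TYPE_LEARNED_MAC:
                case MLXSW_REG_SFN_REC_TYPE_AGED_OUT_MAC:
                        mlxsw_reg_sfn_mac_unpack(sfn_pl, i, mac, &fid,
                                                 &local_port);
                        /* hand mac/fid/local_port to the bridge layer here */
                        break;
                default:
                        break;
                }
        }
out:
        kfree(sfn_pl);
}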
+ * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, sfn, mac_lag_lag_id, MLXSW_REG_SFN_BASE_LEN, 0, 10, + MLXSW_REG_SFN_REC_LEN, 0x0C, false); + +static inline void mlxsw_reg_sfn_mac_lag_unpack(char *payload, int rec_index, + char *mac, u16 *p_vid, + u16 *p_lag_id) +{ + mlxsw_reg_sfn_rec_mac_memcpy_from(payload, rec_index, mac); + *p_vid = mlxsw_reg_sfn_mac_fid_get(payload, rec_index); + *p_lag_id = mlxsw_reg_sfn_mac_lag_lag_id_get(payload, rec_index); +} + +/* SPMS - Switch Port MSTP/RSTP State Register + * ------------------------------------------- + * Configures the spanning tree state of a physical port. + */ +#define MLXSW_REG_SPMS_ID 0x200D +#define MLXSW_REG_SPMS_LEN 0x404 + +MLXSW_REG_DEFINE(spms, MLXSW_REG_SPMS_ID, MLXSW_REG_SPMS_LEN); + +/* reg_spms_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, spms, local_port, 0x00, 16, 8); + +enum mlxsw_reg_spms_state { + MLXSW_REG_SPMS_STATE_NO_CHANGE, + MLXSW_REG_SPMS_STATE_DISCARDING, + MLXSW_REG_SPMS_STATE_LEARNING, + MLXSW_REG_SPMS_STATE_FORWARDING, +}; + +/* reg_spms_state + * Spanning tree state of each VLAN ID (VID) of the local port. + * 0 - Do not change spanning tree state (used only when writing). + * 1 - Discarding. No learning or forwarding to/from this port (default). + * 2 - Learning. Port is learning, but not forwarding. + * 3 - Forwarding. Port is learning and forwarding. + * Access: RW + */ +MLXSW_ITEM_BIT_ARRAY(reg, spms, state, 0x04, 0x400, 2); + +static inline void mlxsw_reg_spms_pack(char *payload, u8 local_port) +{ + MLXSW_REG_ZERO(spms, payload); + mlxsw_reg_spms_local_port_set(payload, local_port); +} + +static inline void mlxsw_reg_spms_vid_pack(char *payload, u16 vid, + enum mlxsw_reg_spms_state state) +{ + mlxsw_reg_spms_state_set(payload, vid, state); +} + +/* SPVID - Switch Port VID + * ----------------------- + * The switch port VID configures the default VID for a port. + */ +#define MLXSW_REG_SPVID_ID 0x200E +#define MLXSW_REG_SPVID_LEN 0x08 + +MLXSW_REG_DEFINE(spvid, MLXSW_REG_SPVID_ID, MLXSW_REG_SPVID_LEN); + +/* reg_spvid_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, spvid, local_port, 0x00, 16, 8); + +/* reg_spvid_sub_port + * Virtual port within the physical port. + * Should be set to 0 when virtual ports are not enabled on the port. + * Access: Index + */ +MLXSW_ITEM32(reg, spvid, sub_port, 0x00, 8, 8); + +/* reg_spvid_pvid + * Port default VID + * Access: RW + */ +MLXSW_ITEM32(reg, spvid, pvid, 0x04, 0, 12); + +static inline void mlxsw_reg_spvid_pack(char *payload, u8 local_port, u16 pvid) +{ + MLXSW_REG_ZERO(spvid, payload); + mlxsw_reg_spvid_local_port_set(payload, local_port); + mlxsw_reg_spvid_pvid_set(payload, pvid); +} + +/* SPVM - Switch Port VLAN Membership + * ---------------------------------- + * The Switch Port VLAN Membership register configures the VLAN membership + * of a port in a VLAN denoted by VID. VLAN membership is managed per + * virtual port. The register can be used to add and remove VID(s) from a port. + */ +#define MLXSW_REG_SPVM_ID 0x200F +#define MLXSW_REG_SPVM_BASE_LEN 0x04 /* base length, without records */ +#define MLXSW_REG_SPVM_REC_LEN 0x04 /* record length */ +#define MLXSW_REG_SPVM_REC_MAX_COUNT 255 +#define MLXSW_REG_SPVM_LEN (MLXSW_REG_SPVM_BASE_LEN + \ + MLXSW_REG_SPVM_REC_LEN * MLXSW_REG_SPVM_REC_MAX_COUNT) + +MLXSW_REG_DEFINE(spvm, MLXSW_REG_SPVM_ID, MLXSW_REG_SPVM_LEN); + +/* reg_spvm_pt + * Priority tagged. 
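[Editor's note] SPMS keeps a 2-bit spanning-tree state per VID for one port, so a caller packs the port once and then sets the state for each VID of interest before emitting the register. A sketch with an arbitrary port number and VID range; example_emit_reg() is a hypothetical placeholder as before:

/* Sketch only: set VIDs 1..10 on local port 3 to forwarding. */
static int example_port_stp_forward(struct mlxsw_core *core)
{
        char *spms_pl;
        u16 vid;
        int err;

        spms_pl = kmalloc(MLXSW_REG_SPMS_LEN, GFP_KERNEL);
        if (!spms_pl)
                return -ENOMEM;

        mlxsw_reg_spms_pack(spms_pl, 3);
        for (vid = 1; vid <= 10; vid++)
                mlxsw_reg_spms_vid_pack(spms_pl, vid,
                                        MLXSW_REG_SPMS_STATE_FORWARDING);
        err = example_emit_reg(core, MLXSW_REG(spms), spms_pl); /* placeholder */
        kfree(spms_pl);
        return err;
}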
If this bit is set, packets forwarded to the port with + * untagged VLAN membership (u bit is set) will be tagged with priority tag + * (VID=0) + * Access: RW + */ +MLXSW_ITEM32(reg, spvm, pt, 0x00, 31, 1); + +/* reg_spvm_pte + * Priority Tagged Update Enable. On Write operations, if this bit is cleared, + * the pt bit will NOT be updated. To update the pt bit, pte must be set. + * Access: WO + */ +MLXSW_ITEM32(reg, spvm, pte, 0x00, 30, 1); + +/* reg_spvm_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, spvm, local_port, 0x00, 16, 8); + +/* reg_spvm_sub_port + * Virtual port within the physical port. + * Should be set to 0 when virtual ports are not enabled on the port. + * Access: Index + */ +MLXSW_ITEM32(reg, spvm, sub_port, 0x00, 8, 8); + +/* reg_spvm_num_rec + * Number of records to update. Each record contains: i, e, u, vid. + * Access: OP + */ +MLXSW_ITEM32(reg, spvm, num_rec, 0x00, 0, 8); + +/* reg_spvm_rec_i + * Ingress membership in VLAN ID. + * Access: Index + */ +MLXSW_ITEM32_INDEXED(reg, spvm, rec_i, + MLXSW_REG_SPVM_BASE_LEN, 14, 1, + MLXSW_REG_SPVM_REC_LEN, 0, false); + +/* reg_spvm_rec_e + * Egress membership in VLAN ID. + * Access: Index + */ +MLXSW_ITEM32_INDEXED(reg, spvm, rec_e, + MLXSW_REG_SPVM_BASE_LEN, 13, 1, + MLXSW_REG_SPVM_REC_LEN, 0, false); + +/* reg_spvm_rec_u + * Untagged - port is an untagged member - egress transmission uses untagged + * frames on VID + * Access: Index + */ +MLXSW_ITEM32_INDEXED(reg, spvm, rec_u, + MLXSW_REG_SPVM_BASE_LEN, 12, 1, + MLXSW_REG_SPVM_REC_LEN, 0, false); + +/* reg_spvm_rec_vid + * Egress membership in VLAN ID. + * Access: Index + */ +MLXSW_ITEM32_INDEXED(reg, spvm, rec_vid, + MLXSW_REG_SPVM_BASE_LEN, 0, 12, + MLXSW_REG_SPVM_REC_LEN, 0, false); + +static inline void mlxsw_reg_spvm_pack(char *payload, u8 local_port, + u16 vid_begin, u16 vid_end, + bool is_member, bool untagged) +{ + int size = vid_end - vid_begin + 1; + int i; + + MLXSW_REG_ZERO(spvm, payload); + mlxsw_reg_spvm_local_port_set(payload, local_port); + mlxsw_reg_spvm_num_rec_set(payload, size); + + for (i = 0; i < size; i++) { + mlxsw_reg_spvm_rec_i_set(payload, i, is_member); + mlxsw_reg_spvm_rec_e_set(payload, i, is_member); + mlxsw_reg_spvm_rec_u_set(payload, i, untagged); + mlxsw_reg_spvm_rec_vid_set(payload, i, vid_begin + i); + } +} + +/* SPAFT - Switch Port Acceptable Frame Types + * ------------------------------------------ + * The Switch Port Acceptable Frame Types register configures the frame + * admittance of the port. + */ +#define MLXSW_REG_SPAFT_ID 0x2010 +#define MLXSW_REG_SPAFT_LEN 0x08 + +MLXSW_REG_DEFINE(spaft, MLXSW_REG_SPAFT_ID, MLXSW_REG_SPAFT_LEN); + +/* reg_spaft_local_port + * Local port number. + * Access: Index + * + * Note: CPU port is not supported (all tag types are allowed). + */ +MLXSW_ITEM32(reg, spaft, local_port, 0x00, 16, 8); + +/* reg_spaft_sub_port + * Virtual port within the physical port. + * Should be set to 0 when virtual ports are not enabled on the port. + * Access: RW + */ +MLXSW_ITEM32(reg, spaft, sub_port, 0x00, 8, 8); + +/* reg_spaft_allow_untagged + * When set, untagged frames on the ingress are allowed (default). + * Access: RW + */ +MLXSW_ITEM32(reg, spaft, allow_untagged, 0x04, 31, 1); + +/* reg_spaft_allow_prio_tagged + * When set, priority tagged frames on the ingress are allowed (default). + * Access: RW + */ +MLXSW_ITEM32(reg, spaft, allow_prio_tagged, 0x04, 30, 1); + +/* reg_spaft_allow_tagged + * When set, tagged frames on the ingress are allowed (default). 
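[Editor's note] mlxsw_reg_spvm_pack() expands a contiguous VID range into one record per VID, applying the same ingress/egress membership and untagged flag to all of them, so joining a port to several VLANs is a single register write. For instance, the sketch below makes local port 3 a tagged member of VIDs 100..103 (example_emit_reg() remains a hypothetical placeholder):

/* Sketch only: port 3 joins VIDs 100..103 as a tagged member. */
static int example_port_vlan_join(struct mlxsw_core *core)
{
        char *spvm_pl;
        int err;

        spvm_pl = kmalloc(MLXSW_REG_SPVM_LEN, GFP_KERNEL);
        if (!spvm_pl)
                return -ENOMEM;

        mlxsw_reg_spvm_pack(spvm_pl, 3, 100, 103, true, false);
        err = example_emit_reg(core, MLXSW_REG(spvm), spvm_pl); /* placeholder */
        kfree(spvm_pl);
        return err;
}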
+ * Access: RW + */ +MLXSW_ITEM32(reg, spaft, allow_tagged, 0x04, 29, 1); + +static inline void mlxsw_reg_spaft_pack(char *payload, u8 local_port, + bool allow_untagged) +{ + MLXSW_REG_ZERO(spaft, payload); + mlxsw_reg_spaft_local_port_set(payload, local_port); + mlxsw_reg_spaft_allow_untagged_set(payload, allow_untagged); + mlxsw_reg_spaft_allow_prio_tagged_set(payload, true); + mlxsw_reg_spaft_allow_tagged_set(payload, true); +} + +/* SFGC - Switch Flooding Group Configuration + * ------------------------------------------ + * The following register controls the association of flooding tables and MIDs + * to packet types used for flooding. + */ +#define MLXSW_REG_SFGC_ID 0x2011 +#define MLXSW_REG_SFGC_LEN 0x10 + +MLXSW_REG_DEFINE(sfgc, MLXSW_REG_SFGC_ID, MLXSW_REG_SFGC_LEN); + +enum mlxsw_reg_sfgc_type { + MLXSW_REG_SFGC_TYPE_BROADCAST, + MLXSW_REG_SFGC_TYPE_UNKNOWN_UNICAST, + MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV4, + MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV6, + MLXSW_REG_SFGC_TYPE_RESERVED, + MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_NON_IP, + MLXSW_REG_SFGC_TYPE_IPV4_LINK_LOCAL, + MLXSW_REG_SFGC_TYPE_IPV6_ALL_HOST, + MLXSW_REG_SFGC_TYPE_MAX, +}; + +/* reg_sfgc_type + * The traffic type to reach the flooding table. + * Access: Index + */ +MLXSW_ITEM32(reg, sfgc, type, 0x00, 0, 4); + +enum mlxsw_reg_sfgc_bridge_type { + MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID = 0, + MLXSW_REG_SFGC_BRIDGE_TYPE_VFID = 1, +}; + +/* reg_sfgc_bridge_type + * Access: Index + * + * Note: SwitchX-2 only supports 802.1Q mode. + */ +MLXSW_ITEM32(reg, sfgc, bridge_type, 0x04, 24, 3); + +enum mlxsw_flood_table_type { + MLXSW_REG_SFGC_TABLE_TYPE_VID = 1, + MLXSW_REG_SFGC_TABLE_TYPE_SINGLE = 2, + MLXSW_REG_SFGC_TABLE_TYPE_ANY = 0, + MLXSW_REG_SFGC_TABLE_TYPE_FID_OFFSET = 3, + MLXSW_REG_SFGC_TABLE_TYPE_FID = 4, +}; + +/* reg_sfgc_table_type + * See mlxsw_flood_table_type + * Access: RW + * + * Note: FID offset and FID types are not supported in SwitchX-2. + */ +MLXSW_ITEM32(reg, sfgc, table_type, 0x04, 16, 3); + +/* reg_sfgc_flood_table + * Flooding table index to associate with the specific type on the specific + * switch partition. + * Access: RW + */ +MLXSW_ITEM32(reg, sfgc, flood_table, 0x04, 0, 6); + +/* reg_sfgc_mid + * The multicast ID for the swid. Not supported for Spectrum + * Access: RW + */ +MLXSW_ITEM32(reg, sfgc, mid, 0x08, 0, 16); + +/* reg_sfgc_counter_set_type + * Counter Set Type for flow counters. + * Access: RW + */ +MLXSW_ITEM32(reg, sfgc, counter_set_type, 0x0C, 24, 8); + +/* reg_sfgc_counter_index + * Counter Index for flow counters. + * Access: RW + */ +MLXSW_ITEM32(reg, sfgc, counter_index, 0x0C, 0, 24); + +static inline void +mlxsw_reg_sfgc_pack(char *payload, enum mlxsw_reg_sfgc_type type, + enum mlxsw_reg_sfgc_bridge_type bridge_type, + enum mlxsw_flood_table_type table_type, + unsigned int flood_table) +{ + MLXSW_REG_ZERO(sfgc, payload); + mlxsw_reg_sfgc_type_set(payload, type); + mlxsw_reg_sfgc_bridge_type_set(payload, bridge_type); + mlxsw_reg_sfgc_table_type_set(payload, table_type); + mlxsw_reg_sfgc_flood_table_set(payload, flood_table); + mlxsw_reg_sfgc_mid_set(payload, MLXSW_PORT_MID); +} + +/* SFTR - Switch Flooding Table Register + * ------------------------------------- + * The switch flooding table is used for flooding packet replication. The table + * defines a bit mask of ports for packet replication. 
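[Editor's note] SFGC binds a flooded packet type to a flooding table for a given bridge type; the pack helper takes the traffic type, bridge type, table type and flood table index and always programs the fixed MID. A sketch that floods broadcast traffic of 802.1Q bridges through flood table 0 using FID-offset addressing (example_emit_reg() is a hypothetical placeholder):

/* Sketch only: flood broadcast traffic of .1Q bridges via table 0. */
static int example_flood_group_init(struct mlxsw_core *core)
{
        char sfgc_pl[MLXSW_REG_SFGC_LEN];

        mlxsw_reg_sfgc_pack(sfgc_pl, MLXSW_REG_SFGC_TYPE_BROADCAST,
                            MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID,
                            MLXSW_REG_SFGC_TABLE_TYPE_FID_OFFSET, 0);
        return example_emit_reg(core, MLXSW_REG(sfgc), sfgc_pl); /* placeholder */
}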
+ */ +#define MLXSW_REG_SFTR_ID 0x2012 +#define MLXSW_REG_SFTR_LEN 0x420 + +MLXSW_REG_DEFINE(sftr, MLXSW_REG_SFTR_ID, MLXSW_REG_SFTR_LEN); + +/* reg_sftr_swid + * Switch partition ID with which to associate the port. + * Access: Index + */ +MLXSW_ITEM32(reg, sftr, swid, 0x00, 24, 8); + +/* reg_sftr_flood_table + * Flooding table index to associate with the specific type on the specific + * switch partition. + * Access: Index + */ +MLXSW_ITEM32(reg, sftr, flood_table, 0x00, 16, 6); + +/* reg_sftr_index + * Index. Used as an index into the Flooding Table in case the table is + * configured to use VID / FID or FID Offset. + * Access: Index + */ +MLXSW_ITEM32(reg, sftr, index, 0x00, 0, 16); + +/* reg_sftr_table_type + * See mlxsw_flood_table_type + * Access: RW + */ +MLXSW_ITEM32(reg, sftr, table_type, 0x04, 16, 3); + +/* reg_sftr_range + * Range of entries to update + * Access: Index + */ +MLXSW_ITEM32(reg, sftr, range, 0x04, 0, 16); + +/* reg_sftr_port + * Local port membership (1 bit per port). + * Access: RW + */ +MLXSW_ITEM_BIT_ARRAY(reg, sftr, port, 0x20, 0x20, 1); + +/* reg_sftr_cpu_port_mask + * CPU port mask (1 bit per port). + * Access: W + */ +MLXSW_ITEM_BIT_ARRAY(reg, sftr, port_mask, 0x220, 0x20, 1); + +static inline void mlxsw_reg_sftr_pack(char *payload, + unsigned int flood_table, + unsigned int index, + enum mlxsw_flood_table_type table_type, + unsigned int range, u8 port, bool set) +{ + MLXSW_REG_ZERO(sftr, payload); + mlxsw_reg_sftr_swid_set(payload, 0); + mlxsw_reg_sftr_flood_table_set(payload, flood_table); + mlxsw_reg_sftr_index_set(payload, index); + mlxsw_reg_sftr_table_type_set(payload, table_type); + mlxsw_reg_sftr_range_set(payload, range); + mlxsw_reg_sftr_port_set(payload, port, set); + mlxsw_reg_sftr_port_mask_set(payload, port, 1); +} + +/* SFDF - Switch Filtering DB Flush + * -------------------------------- + * The switch filtering DB flush register is used to flush the FDB. + * Note that FDB notifications are flushed as well. + */ +#define MLXSW_REG_SFDF_ID 0x2013 +#define MLXSW_REG_SFDF_LEN 0x14 + +MLXSW_REG_DEFINE(sfdf, MLXSW_REG_SFDF_ID, MLXSW_REG_SFDF_LEN); + +/* reg_sfdf_swid + * Switch partition ID. + * Access: Index + */ +MLXSW_ITEM32(reg, sfdf, swid, 0x00, 24, 8); + +enum mlxsw_reg_sfdf_flush_type { + MLXSW_REG_SFDF_FLUSH_PER_SWID, + MLXSW_REG_SFDF_FLUSH_PER_FID, + MLXSW_REG_SFDF_FLUSH_PER_PORT, + MLXSW_REG_SFDF_FLUSH_PER_PORT_AND_FID, + MLXSW_REG_SFDF_FLUSH_PER_LAG, + MLXSW_REG_SFDF_FLUSH_PER_LAG_AND_FID, +}; + +/* reg_sfdf_flush_type + * Flush type. + * 0 - All SWID dynamic entries are flushed. + * 1 - All FID dynamic entries are flushed. + * 2 - All dynamic entries pointing to port are flushed. + * 3 - All FID dynamic entries pointing to port are flushed. + * 4 - All dynamic entries pointing to LAG are flushed. + * 5 - All FID dynamic entries pointing to LAG are flushed. + * Access: RW + */ +MLXSW_ITEM32(reg, sfdf, flush_type, 0x04, 28, 4); + +/* reg_sfdf_flush_static + * Static. + * 0 - Flush only dynamic entries. + * 1 - Flush both dynamic and static entries. + * Access: RW + */ +MLXSW_ITEM32(reg, sfdf, flush_static, 0x04, 24, 1); + +static inline void mlxsw_reg_sfdf_pack(char *payload, + enum mlxsw_reg_sfdf_flush_type type) +{ + MLXSW_REG_ZERO(sfdf, payload); + mlxsw_reg_sfdf_flush_type_set(payload, type); + mlxsw_reg_sfdf_flush_static_set(payload, true); +} + +/* reg_sfdf_fid + * FID to flush. + * Access: RW + */ +MLXSW_ITEM32(reg, sfdf, fid, 0x0C, 0, 16); + +/* reg_sfdf_system_port + * Port to flush. 
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, system_port, 0x0C, 0, 16);
+
+/* reg_sfdf_port_fid_system_port
+ * Port to flush, pointed to by FID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, port_fid_system_port, 0x08, 0, 16);
+
+/* reg_sfdf_lag_id
+ * LAG ID to flush.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, lag_id, 0x0C, 0, 10);
+
+/* reg_sfdf_lag_fid_lag_id
+ * LAG ID to flush, pointed to by FID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, lag_fid_lag_id, 0x08, 0, 10);
+
+/* SLDR - Switch LAG Descriptor Register
+ * -----------------------------------------
+ * The switch LAG descriptor register is populated by LAG descriptors.
+ * Each LAG descriptor is indexed by lag_id. The LAG ID runs from 0 to
+ * max_lag-1.
+ */
+#define MLXSW_REG_SLDR_ID 0x2014
+#define MLXSW_REG_SLDR_LEN 0x0C /* counting in only one port in list */
+
+MLXSW_REG_DEFINE(sldr, MLXSW_REG_SLDR_ID, MLXSW_REG_SLDR_LEN);
+
+enum mlxsw_reg_sldr_op {
+	/* Indicates a creation of a new LAG-ID, lag_id must be valid */
+	MLXSW_REG_SLDR_OP_LAG_CREATE,
+	MLXSW_REG_SLDR_OP_LAG_DESTROY,
+	/* Ports that appear in the list have the Distributor enabled */
+	MLXSW_REG_SLDR_OP_LAG_ADD_PORT_LIST,
+	/* Removes ports from the distributor list */
+	MLXSW_REG_SLDR_OP_LAG_REMOVE_PORT_LIST,
+};
+
+/* reg_sldr_op
+ * Operation.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sldr, op, 0x00, 29, 3);
+
+/* reg_sldr_lag_id
+ * LAG identifier. The lag_id is the index into the LAG descriptor table.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sldr, lag_id, 0x00, 0, 10);
+
+static inline void mlxsw_reg_sldr_lag_create_pack(char *payload, u8 lag_id)
+{
+	MLXSW_REG_ZERO(sldr, payload);
+	mlxsw_reg_sldr_op_set(payload, MLXSW_REG_SLDR_OP_LAG_CREATE);
+	mlxsw_reg_sldr_lag_id_set(payload, lag_id);
+}
+
+static inline void mlxsw_reg_sldr_lag_destroy_pack(char *payload, u8 lag_id)
+{
+	MLXSW_REG_ZERO(sldr, payload);
+	mlxsw_reg_sldr_op_set(payload, MLXSW_REG_SLDR_OP_LAG_DESTROY);
+	mlxsw_reg_sldr_lag_id_set(payload, lag_id);
+}
+
+/* reg_sldr_num_ports
+ * The number of member ports of the LAG.
+ * Reserved for Create / Destroy operations.
+ * For Add / Remove operations - indicates the number of ports in the list.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sldr, num_ports, 0x04, 24, 8);
+
+/* reg_sldr_system_port
+ * System port.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sldr, system_port, 0x08, 0, 16, 4, 0, false);
+
+static inline void mlxsw_reg_sldr_lag_add_port_pack(char *payload, u8 lag_id,
+						    u8 local_port)
+{
+	MLXSW_REG_ZERO(sldr, payload);
+	mlxsw_reg_sldr_op_set(payload, MLXSW_REG_SLDR_OP_LAG_ADD_PORT_LIST);
+	mlxsw_reg_sldr_lag_id_set(payload, lag_id);
+	mlxsw_reg_sldr_num_ports_set(payload, 1);
+	mlxsw_reg_sldr_system_port_set(payload, 0, local_port);
+}
+
+static inline void mlxsw_reg_sldr_lag_remove_port_pack(char *payload, u8 lag_id,
+						       u8 local_port)
+{
+	MLXSW_REG_ZERO(sldr, payload);
+	mlxsw_reg_sldr_op_set(payload, MLXSW_REG_SLDR_OP_LAG_REMOVE_PORT_LIST);
+	mlxsw_reg_sldr_lag_id_set(payload, lag_id);
+	mlxsw_reg_sldr_num_ports_set(payload, 1);
+	mlxsw_reg_sldr_system_port_set(payload, 0, local_port);
+}
+
+/* SLCR - Switch LAG Configuration 2 Register
+ * -------------------------------------------
+ * The Switch LAG Configuration register is used for configuring the
+ * LAG properties of the switch.
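+ *
+ * Illustrative usage (an editorial sketch, not part of the original register
+ * description; slcr_pl is a hypothetical local buffer and the chosen hash
+ * fields are only an example):
+ *
+ *	char slcr_pl[MLXSW_REG_SLCR_LEN];
+ *
+ *	mlxsw_reg_slcr_pack(slcr_pl, MLXSW_REG_SLCR_LAG_HASH_SMAC |
+ *				     MLXSW_REG_SLCR_LAG_HASH_DMAC |
+ *				     MLXSW_REG_SLCR_LAG_HASH_VLANID);
+ *	mlxsw_reg_write(mlxsw_core, MLXSW_REG(slcr), slcr_pl);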
+ */ +#define MLXSW_REG_SLCR_ID 0x2015 +#define MLXSW_REG_SLCR_LEN 0x10 + +MLXSW_REG_DEFINE(slcr, MLXSW_REG_SLCR_ID, MLXSW_REG_SLCR_LEN); + +enum mlxsw_reg_slcr_pp { + /* Global Configuration (for all ports) */ + MLXSW_REG_SLCR_PP_GLOBAL, + /* Per port configuration, based on local_port field */ + MLXSW_REG_SLCR_PP_PER_PORT, +}; + +/* reg_slcr_pp + * Per Port Configuration + * Note: Reading at Global mode results in reading port 1 configuration. + * Access: Index + */ +MLXSW_ITEM32(reg, slcr, pp, 0x00, 24, 1); + +/* reg_slcr_local_port + * Local port number + * Supported from CPU port + * Not supported from router port + * Reserved when pp = Global Configuration + * Access: Index + */ +MLXSW_ITEM32(reg, slcr, local_port, 0x00, 16, 8); + +enum mlxsw_reg_slcr_type { + MLXSW_REG_SLCR_TYPE_CRC, /* default */ + MLXSW_REG_SLCR_TYPE_XOR, + MLXSW_REG_SLCR_TYPE_RANDOM, +}; + +/* reg_slcr_type + * Hash type + * Access: RW + */ +MLXSW_ITEM32(reg, slcr, type, 0x00, 0, 4); + +/* Ingress port */ +#define MLXSW_REG_SLCR_LAG_HASH_IN_PORT BIT(0) +/* SMAC - for IPv4 and IPv6 packets */ +#define MLXSW_REG_SLCR_LAG_HASH_SMAC_IP BIT(1) +/* SMAC - for non-IP packets */ +#define MLXSW_REG_SLCR_LAG_HASH_SMAC_NONIP BIT(2) +#define MLXSW_REG_SLCR_LAG_HASH_SMAC \ + (MLXSW_REG_SLCR_LAG_HASH_SMAC_IP | \ + MLXSW_REG_SLCR_LAG_HASH_SMAC_NONIP) +/* DMAC - for IPv4 and IPv6 packets */ +#define MLXSW_REG_SLCR_LAG_HASH_DMAC_IP BIT(3) +/* DMAC - for non-IP packets */ +#define MLXSW_REG_SLCR_LAG_HASH_DMAC_NONIP BIT(4) +#define MLXSW_REG_SLCR_LAG_HASH_DMAC \ + (MLXSW_REG_SLCR_LAG_HASH_DMAC_IP | \ + MLXSW_REG_SLCR_LAG_HASH_DMAC_NONIP) +/* Ethertype - for IPv4 and IPv6 packets */ +#define MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE_IP BIT(5) +/* Ethertype - for non-IP packets */ +#define MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE_NONIP BIT(6) +#define MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE \ + (MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE_IP | \ + MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE_NONIP) +/* VLAN ID - for IPv4 and IPv6 packets */ +#define MLXSW_REG_SLCR_LAG_HASH_VLANID_IP BIT(7) +/* VLAN ID - for non-IP packets */ +#define MLXSW_REG_SLCR_LAG_HASH_VLANID_NONIP BIT(8) +#define MLXSW_REG_SLCR_LAG_HASH_VLANID \ + (MLXSW_REG_SLCR_LAG_HASH_VLANID_IP | \ + MLXSW_REG_SLCR_LAG_HASH_VLANID_NONIP) +/* Source IP address (can be IPv4 or IPv6) */ +#define MLXSW_REG_SLCR_LAG_HASH_SIP BIT(9) +/* Destination IP address (can be IPv4 or IPv6) */ +#define MLXSW_REG_SLCR_LAG_HASH_DIP BIT(10) +/* TCP/UDP source port */ +#define MLXSW_REG_SLCR_LAG_HASH_SPORT BIT(11) +/* TCP/UDP destination port*/ +#define MLXSW_REG_SLCR_LAG_HASH_DPORT BIT(12) +/* IPv4 Protocol/IPv6 Next Header */ +#define MLXSW_REG_SLCR_LAG_HASH_IPPROTO BIT(13) +/* IPv6 Flow label */ +#define MLXSW_REG_SLCR_LAG_HASH_FLOWLABEL BIT(14) +/* SID - FCoE source ID */ +#define MLXSW_REG_SLCR_LAG_HASH_FCOE_SID BIT(15) +/* DID - FCoE destination ID */ +#define MLXSW_REG_SLCR_LAG_HASH_FCOE_DID BIT(16) +/* OXID - FCoE originator exchange ID */ +#define MLXSW_REG_SLCR_LAG_HASH_FCOE_OXID BIT(17) +/* Destination QP number - for RoCE packets */ +#define MLXSW_REG_SLCR_LAG_HASH_ROCE_DQP BIT(19) + +/* reg_slcr_lag_hash + * LAG hashing configuration. This is a bitmask, in which each set + * bit includes the corresponding item in the LAG hash calculation. + * The default lag_hash contains SMAC, DMAC, VLANID and + * Ethertype (for all packet types). 
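+ *
+ * For example (editorial note, not from the original description), a
+ * flow-oriented policy could additionally enable:
+ * MLXSW_REG_SLCR_LAG_HASH_SIP | MLXSW_REG_SLCR_LAG_HASH_DIP |
+ * MLXSW_REG_SLCR_LAG_HASH_SPORT | MLXSW_REG_SLCR_LAG_HASH_DPORT |
+ * MLXSW_REG_SLCR_LAG_HASH_IPPROTO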
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, slcr, lag_hash, 0x04, 0, 20);
+
+static inline void mlxsw_reg_slcr_pack(char *payload, u16 lag_hash)
+{
+	MLXSW_REG_ZERO(slcr, payload);
+	mlxsw_reg_slcr_pp_set(payload, MLXSW_REG_SLCR_PP_GLOBAL);
+	mlxsw_reg_slcr_type_set(payload, MLXSW_REG_SLCR_TYPE_CRC);
+	mlxsw_reg_slcr_lag_hash_set(payload, lag_hash);
+}
+
+/* SLCOR - Switch LAG Collector Register
+ * -------------------------------------
+ * The Switch LAG Collector register controls the Local Port membership
+ * in a LAG and enablement of the collector.
+ */
+#define MLXSW_REG_SLCOR_ID 0x2016
+#define MLXSW_REG_SLCOR_LEN 0x10
+
+MLXSW_REG_DEFINE(slcor, MLXSW_REG_SLCOR_ID, MLXSW_REG_SLCOR_LEN);
+
+enum mlxsw_reg_slcor_col {
+	/* Port is added with collector disabled */
+	MLXSW_REG_SLCOR_COL_LAG_ADD_PORT,
+	MLXSW_REG_SLCOR_COL_LAG_COLLECTOR_ENABLED,
+	MLXSW_REG_SLCOR_COL_LAG_COLLECTOR_DISABLED,
+	MLXSW_REG_SLCOR_COL_LAG_REMOVE_PORT,
+};
+
+/* reg_slcor_col
+ * Collector configuration
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, slcor, col, 0x00, 30, 2);
+
+/* reg_slcor_local_port
+ * Local port number
+ * Not supported for CPU port
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, slcor, local_port, 0x00, 16, 8);
+
+/* reg_slcor_lag_id
+ * LAG Identifier. Index into the LAG descriptor table.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, slcor, lag_id, 0x00, 0, 10);
+
+/* reg_slcor_port_index
+ * Port index in the LAG list. Only valid on Add Port to LAG col.
+ * Valid range is from 0 to cap_max_lag_members-1
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, slcor, port_index, 0x04, 0, 10);
+
+static inline void mlxsw_reg_slcor_pack(char *payload,
+					u8 local_port, u16 lag_id,
+					enum mlxsw_reg_slcor_col col)
+{
+	MLXSW_REG_ZERO(slcor, payload);
+	mlxsw_reg_slcor_col_set(payload, col);
+	mlxsw_reg_slcor_local_port_set(payload, local_port);
+	mlxsw_reg_slcor_lag_id_set(payload, lag_id);
+}
+
+static inline void mlxsw_reg_slcor_port_add_pack(char *payload,
+						 u8 local_port, u16 lag_id,
+						 u8 port_index)
+{
+	mlxsw_reg_slcor_pack(payload, local_port, lag_id,
+			     MLXSW_REG_SLCOR_COL_LAG_ADD_PORT);
+	mlxsw_reg_slcor_port_index_set(payload, port_index);
+}
+
+static inline void mlxsw_reg_slcor_port_remove_pack(char *payload,
+						    u8 local_port, u16 lag_id)
+{
+	mlxsw_reg_slcor_pack(payload, local_port, lag_id,
+			     MLXSW_REG_SLCOR_COL_LAG_REMOVE_PORT);
+}
+
+static inline void mlxsw_reg_slcor_col_enable_pack(char *payload,
+						   u8 local_port, u16 lag_id)
+{
+	mlxsw_reg_slcor_pack(payload, local_port, lag_id,
+			     MLXSW_REG_SLCOR_COL_LAG_COLLECTOR_ENABLED);
+}
+
+static inline void mlxsw_reg_slcor_col_disable_pack(char *payload,
+						    u8 local_port, u16 lag_id)
+{
+	mlxsw_reg_slcor_pack(payload, local_port, lag_id,
+			     MLXSW_REG_SLCOR_COL_LAG_COLLECTOR_DISABLED);
+}
+
+/* SPMLR - Switch Port MAC Learning Register
+ * -----------------------------------------
+ * Controls the Switch MAC learning policy per port.
+ */
+#define MLXSW_REG_SPMLR_ID 0x2018
+#define MLXSW_REG_SPMLR_LEN 0x8
+
+MLXSW_REG_DEFINE(spmlr, MLXSW_REG_SPMLR_ID, MLXSW_REG_SPMLR_LEN);
+
+/* reg_spmlr_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, spmlr, local_port, 0x00, 16, 8);
+
+/* reg_spmlr_sub_port
+ * Virtual port within the physical port.
+ * Should be set to 0 when virtual ports are not enabled on the port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, spmlr, sub_port, 0x00, 8, 8);
+
+enum mlxsw_reg_spmlr_learn_mode {
+	MLXSW_REG_SPMLR_LEARN_MODE_DISABLE = 0,
+	MLXSW_REG_SPMLR_LEARN_MODE_ENABLE = 2,
+	MLXSW_REG_SPMLR_LEARN_MODE_SEC = 3,
+};
+
+/* reg_spmlr_learn_mode
+ * Learning mode on the port.
+ * 0 - Learning disabled.
+ * 2 - Learning enabled.
+ * 3 - Security mode.
+ *
+ * In security mode the switch does not learn MACs on the port, but uses the
+ * SMAC to see if it exists on another ingress port. If so, the packet is
+ * classified as a bad packet and is discarded unless the software registers
+ * to receive port security error packets using HPKT.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spmlr, learn_mode, 0x04, 30, 2);
+
+static inline void mlxsw_reg_spmlr_pack(char *payload, u8 local_port,
+					enum mlxsw_reg_spmlr_learn_mode mode)
+{
+	MLXSW_REG_ZERO(spmlr, payload);
+	mlxsw_reg_spmlr_local_port_set(payload, local_port);
+	mlxsw_reg_spmlr_sub_port_set(payload, 0);
+	mlxsw_reg_spmlr_learn_mode_set(payload, mode);
+}
+
+/* SVFA - Switch VID to FID Allocation Register
+ * --------------------------------------------
+ * Controls the VID to FID mapping and {Port, VID} to FID mapping for
+ * virtualized ports.
+ */
+#define MLXSW_REG_SVFA_ID 0x201C
+#define MLXSW_REG_SVFA_LEN 0x10
+
+MLXSW_REG_DEFINE(svfa, MLXSW_REG_SVFA_ID, MLXSW_REG_SVFA_LEN);
+
+/* reg_svfa_swid
+ * Switch partition ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, svfa, swid, 0x00, 24, 8);
+
+/* reg_svfa_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: Reserved for 802.1Q FIDs.
+ */
+MLXSW_ITEM32(reg, svfa, local_port, 0x00, 16, 8);
+
+enum mlxsw_reg_svfa_mt {
+	MLXSW_REG_SVFA_MT_VID_TO_FID,
+	MLXSW_REG_SVFA_MT_PORT_VID_TO_FID,
+};
+
+/* reg_svfa_mapping_table
+ * Mapping table:
+ * 0 - VID to FID
+ * 1 - {Port, VID} to FID
+ * Access: Index
+ *
+ * Note: Reserved for SwitchX-2.
+ */
+MLXSW_ITEM32(reg, svfa, mapping_table, 0x00, 8, 3);
+
+/* reg_svfa_v
+ * Valid.
+ * Valid if set.
+ * Access: RW
+ *
+ * Note: Reserved for SwitchX-2.
+ */
+MLXSW_ITEM32(reg, svfa, v, 0x00, 0, 1);
+
+/* reg_svfa_fid
+ * Filtering ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, svfa, fid, 0x04, 16, 16);
+
+/* reg_svfa_vid
+ * VLAN ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, svfa, vid, 0x04, 0, 12);
+
+/* reg_svfa_counter_set_type
+ * Counter set type for flow counters.
+ * Access: RW
+ *
+ * Note: Reserved for SwitchX-2.
+ */
+MLXSW_ITEM32(reg, svfa, counter_set_type, 0x08, 24, 8);
+
+/* reg_svfa_counter_index
+ * Counter index for flow counters.
+ * Access: RW
+ *
+ * Note: Reserved for SwitchX-2.
+ */
+MLXSW_ITEM32(reg, svfa, counter_index, 0x08, 0, 24);
+
+static inline void mlxsw_reg_svfa_pack(char *payload, u8 local_port,
+				       enum mlxsw_reg_svfa_mt mt, bool valid,
+				       u16 fid, u16 vid)
+{
+	MLXSW_REG_ZERO(svfa, payload);
+	local_port = mt == MLXSW_REG_SVFA_MT_VID_TO_FID ? 0 : local_port;
+	mlxsw_reg_svfa_swid_set(payload, 0);
+	mlxsw_reg_svfa_local_port_set(payload, local_port);
+	mlxsw_reg_svfa_mapping_table_set(payload, mt);
+	mlxsw_reg_svfa_v_set(payload, valid);
+	mlxsw_reg_svfa_fid_set(payload, fid);
+	mlxsw_reg_svfa_vid_set(payload, vid);
+}
+
+/* SVPE - Switch Virtual-Port Enabling Register
+ * --------------------------------------------
+ * Enables port virtualization.
+ */
+#define MLXSW_REG_SVPE_ID 0x201E
+#define MLXSW_REG_SVPE_LEN 0x4
+
+MLXSW_REG_DEFINE(svpe, MLXSW_REG_SVPE_ID, MLXSW_REG_SVPE_LEN);
+
+/* reg_svpe_local_port
+ * Local port number
+ * Access: Index
+ *
+ * Note: CPU port is not supported (uses VLAN mode only).
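+ *
+ * Illustrative flow (editorial sketch, not part of the original description;
+ * svpe_pl and svfa_pl are hypothetical payload buffers): enabling virtual
+ * port mode is typically followed by {Port, VID} to FID mappings via SVFA:
+ *
+ *	mlxsw_reg_svpe_pack(svpe_pl, local_port, true);
+ *	mlxsw_reg_svfa_pack(svfa_pl, local_port,
+ *			    MLXSW_REG_SVFA_MT_PORT_VID_TO_FID, true,
+ *			    fid, vid);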
+ */ +MLXSW_ITEM32(reg, svpe, local_port, 0x00, 16, 8); + +/* reg_svpe_vp_en + * Virtual port enable. + * 0 - Disable, VLAN mode (VID to FID). + * 1 - Enable, Virtual port mode ({Port, VID} to FID). + * Access: RW + */ +MLXSW_ITEM32(reg, svpe, vp_en, 0x00, 8, 1); + +static inline void mlxsw_reg_svpe_pack(char *payload, u8 local_port, + bool enable) +{ + MLXSW_REG_ZERO(svpe, payload); + mlxsw_reg_svpe_local_port_set(payload, local_port); + mlxsw_reg_svpe_vp_en_set(payload, enable); +} + +/* SFMR - Switch FID Management Register + * ------------------------------------- + * Creates and configures FIDs. + */ +#define MLXSW_REG_SFMR_ID 0x201F +#define MLXSW_REG_SFMR_LEN 0x18 + +MLXSW_REG_DEFINE(sfmr, MLXSW_REG_SFMR_ID, MLXSW_REG_SFMR_LEN); + +enum mlxsw_reg_sfmr_op { + MLXSW_REG_SFMR_OP_CREATE_FID, + MLXSW_REG_SFMR_OP_DESTROY_FID, +}; + +/* reg_sfmr_op + * Operation. + * 0 - Create or edit FID. + * 1 - Destroy FID. + * Access: WO + */ +MLXSW_ITEM32(reg, sfmr, op, 0x00, 24, 4); + +/* reg_sfmr_fid + * Filtering ID. + * Access: Index + */ +MLXSW_ITEM32(reg, sfmr, fid, 0x00, 0, 16); + +/* reg_sfmr_fid_offset + * FID offset. + * Used to point into the flooding table selected by SFGC register if + * the table is of type FID-Offset. Otherwise, this field is reserved. + * Access: RW + */ +MLXSW_ITEM32(reg, sfmr, fid_offset, 0x08, 0, 16); + +/* reg_sfmr_vtfp + * Valid Tunnel Flood Pointer. + * If not set, then nve_tunnel_flood_ptr is reserved and considered NULL. + * Access: RW + * + * Note: Reserved for 802.1Q FIDs. + */ +MLXSW_ITEM32(reg, sfmr, vtfp, 0x0C, 31, 1); + +/* reg_sfmr_nve_tunnel_flood_ptr + * Underlay Flooding and BC Pointer. + * Used as a pointer to the first entry of the group based link lists of + * flooding or BC entries (for NVE tunnels). + * Access: RW + */ +MLXSW_ITEM32(reg, sfmr, nve_tunnel_flood_ptr, 0x0C, 0, 24); + +/* reg_sfmr_vv + * VNI Valid. + * If not set, then vni is reserved. + * Access: RW + * + * Note: Reserved for 802.1Q FIDs. + */ +MLXSW_ITEM32(reg, sfmr, vv, 0x10, 31, 1); + +/* reg_sfmr_vni + * Virtual Network Identifier. + * Access: RW + * + * Note: A given VNI can only be assigned to one FID. + */ +MLXSW_ITEM32(reg, sfmr, vni, 0x10, 0, 24); + +static inline void mlxsw_reg_sfmr_pack(char *payload, + enum mlxsw_reg_sfmr_op op, u16 fid, + u16 fid_offset) +{ + MLXSW_REG_ZERO(sfmr, payload); + mlxsw_reg_sfmr_op_set(payload, op); + mlxsw_reg_sfmr_fid_set(payload, fid); + mlxsw_reg_sfmr_fid_offset_set(payload, fid_offset); + mlxsw_reg_sfmr_vtfp_set(payload, false); + mlxsw_reg_sfmr_vv_set(payload, false); +} + +/* SPVMLR - Switch Port VLAN MAC Learning Register + * ----------------------------------------------- + * Controls the switch MAC learning policy per {Port, VID}. + */ +#define MLXSW_REG_SPVMLR_ID 0x2020 +#define MLXSW_REG_SPVMLR_BASE_LEN 0x04 /* base length, without records */ +#define MLXSW_REG_SPVMLR_REC_LEN 0x04 /* record length */ +#define MLXSW_REG_SPVMLR_REC_MAX_COUNT 255 +#define MLXSW_REG_SPVMLR_LEN (MLXSW_REG_SPVMLR_BASE_LEN + \ + MLXSW_REG_SPVMLR_REC_LEN * \ + MLXSW_REG_SPVMLR_REC_MAX_COUNT) + +MLXSW_REG_DEFINE(spvmlr, MLXSW_REG_SPVMLR_ID, MLXSW_REG_SPVMLR_LEN); + +/* reg_spvmlr_local_port + * Local ingress port. + * Access: Index + * + * Note: CPU port is not supported. + */ +MLXSW_ITEM32(reg, spvmlr, local_port, 0x00, 16, 8); + +/* reg_spvmlr_num_rec + * Number of records to update. + * Access: OP + */ +MLXSW_ITEM32(reg, spvmlr, num_rec, 0x00, 0, 8); + +/* reg_spvmlr_rec_learn_enable + * 0 - Disable learning for {Port, VID}. 
+ * 1 - Enable learning for {Port, VID}.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, spvmlr, rec_learn_enable, MLXSW_REG_SPVMLR_BASE_LEN,
+		     31, 1, MLXSW_REG_SPVMLR_REC_LEN, 0x00, false);
+
+/* reg_spvmlr_rec_vid
+ * VLAN ID to be added/removed from port or for querying.
+ * Access: Index
+ */
+MLXSW_ITEM32_INDEXED(reg, spvmlr, rec_vid, MLXSW_REG_SPVMLR_BASE_LEN, 0, 12,
+		     MLXSW_REG_SPVMLR_REC_LEN, 0x00, false);
+
+static inline void mlxsw_reg_spvmlr_pack(char *payload, u8 local_port,
+					 u16 vid_begin, u16 vid_end,
+					 bool learn_enable)
+{
+	int num_rec = vid_end - vid_begin + 1;
+	int i;
+
+	WARN_ON(num_rec < 1 || num_rec > MLXSW_REG_SPVMLR_REC_MAX_COUNT);
+
+	MLXSW_REG_ZERO(spvmlr, payload);
+	mlxsw_reg_spvmlr_local_port_set(payload, local_port);
+	mlxsw_reg_spvmlr_num_rec_set(payload, num_rec);
+
+	for (i = 0; i < num_rec; i++) {
+		mlxsw_reg_spvmlr_rec_learn_enable_set(payload, i, learn_enable);
+		mlxsw_reg_spvmlr_rec_vid_set(payload, i, vid_begin + i);
+	}
+}
+
+/* CWTP - Congestion WRED ECN TClass Profile
+ * -----------------------------------------
+ * Configures the profiles for queues of egress port and traffic class
+ */
+#define MLXSW_REG_CWTP_ID 0x2802
+#define MLXSW_REG_CWTP_BASE_LEN 0x28
+#define MLXSW_REG_CWTP_PROFILE_DATA_REC_LEN 0x08
+#define MLXSW_REG_CWTP_LEN 0x40
+
+MLXSW_REG_DEFINE(cwtp, MLXSW_REG_CWTP_ID, MLXSW_REG_CWTP_LEN);
+
+/* reg_cwtp_local_port
+ * Local port number
+ * Not supported for CPU port
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, cwtp, local_port, 0, 16, 8);
+
+/* reg_cwtp_traffic_class
+ * Traffic Class to configure
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, cwtp, traffic_class, 32, 0, 8);
+
+/* reg_cwtp_profile_min
+ * Minimum Average Queue Size of the profile in cells.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, cwtp, profile_min, MLXSW_REG_CWTP_BASE_LEN,
+		     0, 20, MLXSW_REG_CWTP_PROFILE_DATA_REC_LEN, 0, false);
+
+/* reg_cwtp_profile_percent
+ * Percentage of WRED and ECN marking for maximum Average Queue size
+ * Range is 0 to 100, units of integer percentage
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, cwtp, profile_percent, MLXSW_REG_CWTP_BASE_LEN,
+		     24, 7, MLXSW_REG_CWTP_PROFILE_DATA_REC_LEN, 4, false);
+
+/* reg_cwtp_profile_max
+ * Maximum Average Queue size of the profile in cells
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, cwtp, profile_max, MLXSW_REG_CWTP_BASE_LEN,
+		     0, 20, MLXSW_REG_CWTP_PROFILE_DATA_REC_LEN, 4, false);
+
+#define MLXSW_REG_CWTP_MIN_VALUE 64
+#define MLXSW_REG_CWTP_MAX_PROFILE 2
+#define MLXSW_REG_CWTP_DEFAULT_PROFILE 1
+
+static inline void mlxsw_reg_cwtp_pack(char *payload, u8 local_port,
+				       u8 traffic_class)
+{
+	int i;
+
+	MLXSW_REG_ZERO(cwtp, payload);
+	mlxsw_reg_cwtp_local_port_set(payload, local_port);
+	mlxsw_reg_cwtp_traffic_class_set(payload, traffic_class);
+
+	for (i = 0; i <= MLXSW_REG_CWTP_MAX_PROFILE; i++) {
+		mlxsw_reg_cwtp_profile_min_set(payload, i,
+					       MLXSW_REG_CWTP_MIN_VALUE);
+		mlxsw_reg_cwtp_profile_max_set(payload, i,
+					       MLXSW_REG_CWTP_MIN_VALUE);
+	}
+}
+
+#define MLXSW_REG_CWTP_PROFILE_TO_INDEX(profile) (profile - 1)
+
+static inline void
+mlxsw_reg_cwtp_profile_pack(char *payload, u8 profile, u32 min, u32 max,
+			    u32 probability)
+{
+	u8 index = MLXSW_REG_CWTP_PROFILE_TO_INDEX(profile);
+
+	mlxsw_reg_cwtp_profile_min_set(payload, index, min);
+	mlxsw_reg_cwtp_profile_max_set(payload, index, max);
+	mlxsw_reg_cwtp_profile_percent_set(payload, index, probability);
+}
+
+/* CWTPM - Congestion WRED ECN TClass and Pool Mapping
+ * ---------------------------------------------------
+ * The CWTPM 
register maps each egress port and traffic class to profile num. + */ +#define MLXSW_REG_CWTPM_ID 0x2803 +#define MLXSW_REG_CWTPM_LEN 0x44 + +MLXSW_REG_DEFINE(cwtpm, MLXSW_REG_CWTPM_ID, MLXSW_REG_CWTPM_LEN); + +/* reg_cwtpm_local_port + * Local port number + * Not supported for CPU port + * Access: Index + */ +MLXSW_ITEM32(reg, cwtpm, local_port, 0, 16, 8); + +/* reg_cwtpm_traffic_class + * Traffic Class to configure + * Access: Index + */ +MLXSW_ITEM32(reg, cwtpm, traffic_class, 32, 0, 8); + +/* reg_cwtpm_ew + * Control enablement of WRED for traffic class: + * 0 - Disable + * 1 - Enable + * Access: RW + */ +MLXSW_ITEM32(reg, cwtpm, ew, 36, 1, 1); + +/* reg_cwtpm_ee + * Control enablement of ECN for traffic class: + * 0 - Disable + * 1 - Enable + * Access: RW + */ +MLXSW_ITEM32(reg, cwtpm, ee, 36, 0, 1); + +/* reg_cwtpm_tcp_g + * TCP Green Profile. + * Index of the profile within {port, traffic class} to use. + * 0 for disabling both WRED and ECN for this type of traffic. + * Access: RW + */ +MLXSW_ITEM32(reg, cwtpm, tcp_g, 52, 0, 2); + +/* reg_cwtpm_tcp_y + * TCP Yellow Profile. + * Index of the profile within {port, traffic class} to use. + * 0 for disabling both WRED and ECN for this type of traffic. + * Access: RW + */ +MLXSW_ITEM32(reg, cwtpm, tcp_y, 56, 16, 2); + +/* reg_cwtpm_tcp_r + * TCP Red Profile. + * Index of the profile within {port, traffic class} to use. + * 0 for disabling both WRED and ECN for this type of traffic. + * Access: RW + */ +MLXSW_ITEM32(reg, cwtpm, tcp_r, 56, 0, 2); + +/* reg_cwtpm_ntcp_g + * Non-TCP Green Profile. + * Index of the profile within {port, traffic class} to use. + * 0 for disabling both WRED and ECN for this type of traffic. + * Access: RW + */ +MLXSW_ITEM32(reg, cwtpm, ntcp_g, 60, 0, 2); + +/* reg_cwtpm_ntcp_y + * Non-TCP Yellow Profile. + * Index of the profile within {port, traffic class} to use. + * 0 for disabling both WRED and ECN for this type of traffic. + * Access: RW + */ +MLXSW_ITEM32(reg, cwtpm, ntcp_y, 64, 16, 2); + +/* reg_cwtpm_ntcp_r + * Non-TCP Red Profile. + * Index of the profile within {port, traffic class} to use. + * 0 for disabling both WRED and ECN for this type of traffic. + * Access: RW + */ +MLXSW_ITEM32(reg, cwtpm, ntcp_r, 64, 0, 2); + +#define MLXSW_REG_CWTPM_RESET_PROFILE 0 + +static inline void mlxsw_reg_cwtpm_pack(char *payload, u8 local_port, + u8 traffic_class, u8 profile, + bool wred, bool ecn) +{ + MLXSW_REG_ZERO(cwtpm, payload); + mlxsw_reg_cwtpm_local_port_set(payload, local_port); + mlxsw_reg_cwtpm_traffic_class_set(payload, traffic_class); + mlxsw_reg_cwtpm_ew_set(payload, wred); + mlxsw_reg_cwtpm_ee_set(payload, ecn); + mlxsw_reg_cwtpm_tcp_g_set(payload, profile); + mlxsw_reg_cwtpm_tcp_y_set(payload, profile); + mlxsw_reg_cwtpm_tcp_r_set(payload, profile); + mlxsw_reg_cwtpm_ntcp_g_set(payload, profile); + mlxsw_reg_cwtpm_ntcp_y_set(payload, profile); + mlxsw_reg_cwtpm_ntcp_r_set(payload, profile); +} + +/* PPBT - Policy-Engine Port Binding Table + * --------------------------------------- + * This register is used for configuration of the Port Binding Table. 
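+ *
+ * Illustrative usage (editorial sketch, not part of the original
+ * description; ppbt_pl and acl_group_id are hypothetical): binding an
+ * ingress ACL group to a port could look like:
+ *
+ *	char ppbt_pl[MLXSW_REG_PPBT_LEN];
+ *
+ *	mlxsw_reg_ppbt_pack(ppbt_pl, MLXSW_REG_PXBT_E_IACL,
+ *			    MLXSW_REG_PXBT_OP_BIND, local_port,
+ *			    acl_group_id);
+ *	mlxsw_reg_write(mlxsw_core, MLXSW_REG(ppbt), ppbt_pl);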
+ */ +#define MLXSW_REG_PPBT_ID 0x3002 +#define MLXSW_REG_PPBT_LEN 0x14 + +MLXSW_REG_DEFINE(ppbt, MLXSW_REG_PPBT_ID, MLXSW_REG_PPBT_LEN); + +enum mlxsw_reg_pxbt_e { + MLXSW_REG_PXBT_E_IACL, + MLXSW_REG_PXBT_E_EACL, +}; + +/* reg_ppbt_e + * Access: Index + */ +MLXSW_ITEM32(reg, ppbt, e, 0x00, 31, 1); + +enum mlxsw_reg_pxbt_op { + MLXSW_REG_PXBT_OP_BIND, + MLXSW_REG_PXBT_OP_UNBIND, +}; + +/* reg_ppbt_op + * Access: RW + */ +MLXSW_ITEM32(reg, ppbt, op, 0x00, 28, 3); + +/* reg_ppbt_local_port + * Local port. Not including CPU port. + * Access: Index + */ +MLXSW_ITEM32(reg, ppbt, local_port, 0x00, 16, 8); + +/* reg_ppbt_g + * group - When set, the binding is of an ACL group. When cleared, + * the binding is of an ACL. + * Must be set to 1 for Spectrum. + * Access: RW + */ +MLXSW_ITEM32(reg, ppbt, g, 0x10, 31, 1); + +/* reg_ppbt_acl_info + * ACL/ACL group identifier. If the g bit is set, this field should hold + * the acl_group_id, else it should hold the acl_id. + * Access: RW + */ +MLXSW_ITEM32(reg, ppbt, acl_info, 0x10, 0, 16); + +static inline void mlxsw_reg_ppbt_pack(char *payload, enum mlxsw_reg_pxbt_e e, + enum mlxsw_reg_pxbt_op op, + u8 local_port, u16 acl_info) +{ + MLXSW_REG_ZERO(ppbt, payload); + mlxsw_reg_ppbt_e_set(payload, e); + mlxsw_reg_ppbt_op_set(payload, op); + mlxsw_reg_ppbt_local_port_set(payload, local_port); + mlxsw_reg_ppbt_g_set(payload, true); + mlxsw_reg_ppbt_acl_info_set(payload, acl_info); +} + +/* PACL - Policy-Engine ACL Register + * --------------------------------- + * This register is used for configuration of the ACL. + */ +#define MLXSW_REG_PACL_ID 0x3004 +#define MLXSW_REG_PACL_LEN 0x70 + +MLXSW_REG_DEFINE(pacl, MLXSW_REG_PACL_ID, MLXSW_REG_PACL_LEN); + +/* reg_pacl_v + * Valid. Setting the v bit makes the ACL valid. It should not be cleared + * while the ACL is bounded to either a port, VLAN or ACL rule. + * Access: RW + */ +MLXSW_ITEM32(reg, pacl, v, 0x00, 24, 1); + +/* reg_pacl_acl_id + * An identifier representing the ACL (managed by software) + * Range 0 .. cap_max_acl_regions - 1 + * Access: Index + */ +MLXSW_ITEM32(reg, pacl, acl_id, 0x08, 0, 16); + +#define MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN 16 + +/* reg_pacl_tcam_region_info + * Opaque object that represents a TCAM region. + * Obtained through PTAR register. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, pacl, tcam_region_info, 0x30, + MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN); + +static inline void mlxsw_reg_pacl_pack(char *payload, u16 acl_id, + bool valid, const char *tcam_region_info) +{ + MLXSW_REG_ZERO(pacl, payload); + mlxsw_reg_pacl_acl_id_set(payload, acl_id); + mlxsw_reg_pacl_v_set(payload, valid); + mlxsw_reg_pacl_tcam_region_info_memcpy_to(payload, tcam_region_info); +} + +/* PAGT - Policy-Engine ACL Group Table + * ------------------------------------ + * This register is used for configuration of the ACL Group Table. + */ +#define MLXSW_REG_PAGT_ID 0x3005 +#define MLXSW_REG_PAGT_BASE_LEN 0x30 +#define MLXSW_REG_PAGT_ACL_LEN 4 +#define MLXSW_REG_PAGT_ACL_MAX_NUM 16 +#define MLXSW_REG_PAGT_LEN (MLXSW_REG_PAGT_BASE_LEN + \ + MLXSW_REG_PAGT_ACL_MAX_NUM * MLXSW_REG_PAGT_ACL_LEN) + +MLXSW_REG_DEFINE(pagt, MLXSW_REG_PAGT_ID, MLXSW_REG_PAGT_LEN); + +/* reg_pagt_size + * Number of ACLs in the group. + * Size 0 invalidates a group. + * Range 0 .. 
cap_max_acl_group_size (hard coded to 16 for now) + * Total number of ACLs in all groups must be lower or equal + * to cap_max_acl_tot_groups + * Note: a group which is binded must not be invalidated + * Access: Index + */ +MLXSW_ITEM32(reg, pagt, size, 0x00, 0, 8); + +/* reg_pagt_acl_group_id + * An identifier (numbered from 0..cap_max_acl_groups-1) representing + * the ACL Group identifier (managed by software). + * Access: Index + */ +MLXSW_ITEM32(reg, pagt, acl_group_id, 0x08, 0, 16); + +/* reg_pagt_acl_id + * ACL identifier + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, pagt, acl_id, 0x30, 0, 16, 0x04, 0x00, false); + +static inline void mlxsw_reg_pagt_pack(char *payload, u16 acl_group_id) +{ + MLXSW_REG_ZERO(pagt, payload); + mlxsw_reg_pagt_acl_group_id_set(payload, acl_group_id); +} + +static inline void mlxsw_reg_pagt_acl_id_pack(char *payload, int index, + u16 acl_id) +{ + u8 size = mlxsw_reg_pagt_size_get(payload); + + if (index >= size) + mlxsw_reg_pagt_size_set(payload, index + 1); + mlxsw_reg_pagt_acl_id_set(payload, index, acl_id); +} + +/* PTAR - Policy-Engine TCAM Allocation Register + * --------------------------------------------- + * This register is used for allocation of regions in the TCAM. + * Note: Query method is not supported on this register. + */ +#define MLXSW_REG_PTAR_ID 0x3006 +#define MLXSW_REG_PTAR_BASE_LEN 0x20 +#define MLXSW_REG_PTAR_KEY_ID_LEN 1 +#define MLXSW_REG_PTAR_KEY_ID_MAX_NUM 16 +#define MLXSW_REG_PTAR_LEN (MLXSW_REG_PTAR_BASE_LEN + \ + MLXSW_REG_PTAR_KEY_ID_MAX_NUM * MLXSW_REG_PTAR_KEY_ID_LEN) + +MLXSW_REG_DEFINE(ptar, MLXSW_REG_PTAR_ID, MLXSW_REG_PTAR_LEN); + +enum mlxsw_reg_ptar_op { + /* allocate a TCAM region */ + MLXSW_REG_PTAR_OP_ALLOC, + /* resize a TCAM region */ + MLXSW_REG_PTAR_OP_RESIZE, + /* deallocate TCAM region */ + MLXSW_REG_PTAR_OP_FREE, + /* test allocation */ + MLXSW_REG_PTAR_OP_TEST, +}; + +/* reg_ptar_op + * Access: OP + */ +MLXSW_ITEM32(reg, ptar, op, 0x00, 28, 4); + +/* reg_ptar_action_set_type + * Type of action set to be used on this region. + * For Spectrum, this is always type 2 - "flexible" + * Access: WO + */ +MLXSW_ITEM32(reg, ptar, action_set_type, 0x00, 16, 8); + +/* reg_ptar_key_type + * TCAM key type for the region. + * For Spectrum, this is always type 0x50 - "FLEX_KEY" + * Access: WO + */ +MLXSW_ITEM32(reg, ptar, key_type, 0x00, 0, 8); + +/* reg_ptar_region_size + * TCAM region size. When allocating/resizing this is the requested size, + * the response is the actual size. Note that actual size may be + * larger than requested. + * Allowed range 1 .. cap_max_rules-1 + * Reserved during op deallocate. + * Access: WO + */ +MLXSW_ITEM32(reg, ptar, region_size, 0x04, 0, 16); + +/* reg_ptar_region_id + * Region identifier + * Range 0 .. cap_max_regions-1 + * Access: Index + */ +MLXSW_ITEM32(reg, ptar, region_id, 0x08, 0, 16); + +/* reg_ptar_tcam_region_info + * Opaque object that represents the TCAM region. + * Returned when allocating a region. + * Provided by software for ACL generation and region deallocation and resize. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, ptar, tcam_region_info, 0x10, + MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN); + +/* reg_ptar_flexible_key_id + * Identifier of the Flexible Key. + * Only valid if key_type == "FLEX_KEY" + * The key size will be rounded up to one of the following values: + * 9B, 18B, 36B, 54B. + * This field is reserved for in resize operation. 
+ * Access: WO + */ +MLXSW_ITEM8_INDEXED(reg, ptar, flexible_key_id, 0x20, 0, 8, + MLXSW_REG_PTAR_KEY_ID_LEN, 0x00, false); + +static inline void mlxsw_reg_ptar_pack(char *payload, enum mlxsw_reg_ptar_op op, + u16 region_size, u16 region_id, + const char *tcam_region_info) +{ + MLXSW_REG_ZERO(ptar, payload); + mlxsw_reg_ptar_op_set(payload, op); + mlxsw_reg_ptar_action_set_type_set(payload, 2); /* "flexible" */ + mlxsw_reg_ptar_key_type_set(payload, 0x50); /* "FLEX_KEY" */ + mlxsw_reg_ptar_region_size_set(payload, region_size); + mlxsw_reg_ptar_region_id_set(payload, region_id); + mlxsw_reg_ptar_tcam_region_info_memcpy_to(payload, tcam_region_info); +} + +static inline void mlxsw_reg_ptar_key_id_pack(char *payload, int index, + u16 key_id) +{ + mlxsw_reg_ptar_flexible_key_id_set(payload, index, key_id); +} + +static inline void mlxsw_reg_ptar_unpack(char *payload, char *tcam_region_info) +{ + mlxsw_reg_ptar_tcam_region_info_memcpy_from(payload, tcam_region_info); +} + +/* PPBS - Policy-Engine Policy Based Switching Register + * ---------------------------------------------------- + * This register retrieves and sets Policy Based Switching Table entries. + */ +#define MLXSW_REG_PPBS_ID 0x300C +#define MLXSW_REG_PPBS_LEN 0x14 + +MLXSW_REG_DEFINE(ppbs, MLXSW_REG_PPBS_ID, MLXSW_REG_PPBS_LEN); + +/* reg_ppbs_pbs_ptr + * Index into the PBS table. + * For Spectrum, the index points to the KVD Linear. + * Access: Index + */ +MLXSW_ITEM32(reg, ppbs, pbs_ptr, 0x08, 0, 24); + +/* reg_ppbs_system_port + * Unique port identifier for the final destination of the packet. + * Access: RW + */ +MLXSW_ITEM32(reg, ppbs, system_port, 0x10, 0, 16); + +static inline void mlxsw_reg_ppbs_pack(char *payload, u32 pbs_ptr, + u16 system_port) +{ + MLXSW_REG_ZERO(ppbs, payload); + mlxsw_reg_ppbs_pbs_ptr_set(payload, pbs_ptr); + mlxsw_reg_ppbs_system_port_set(payload, system_port); +} + +/* PRCR - Policy-Engine Rules Copy Register + * ---------------------------------------- + * This register is used for accessing rules within a TCAM region. + */ +#define MLXSW_REG_PRCR_ID 0x300D +#define MLXSW_REG_PRCR_LEN 0x40 + +MLXSW_REG_DEFINE(prcr, MLXSW_REG_PRCR_ID, MLXSW_REG_PRCR_LEN); + +enum mlxsw_reg_prcr_op { + /* Move rules. Moves the rules from "tcam_region_info" starting + * at offset "offset" to "dest_tcam_region_info" + * at offset "dest_offset." + */ + MLXSW_REG_PRCR_OP_MOVE, + /* Copy rules. Copies the rules from "tcam_region_info" starting + * at offset "offset" to "dest_tcam_region_info" + * at offset "dest_offset." + */ + MLXSW_REG_PRCR_OP_COPY, +}; + +/* reg_prcr_op + * Access: OP + */ +MLXSW_ITEM32(reg, prcr, op, 0x00, 28, 4); + +/* reg_prcr_offset + * Offset within the source region to copy/move from. + * Access: Index + */ +MLXSW_ITEM32(reg, prcr, offset, 0x00, 0, 16); + +/* reg_prcr_size + * The number of rules to copy/move. + * Access: WO + */ +MLXSW_ITEM32(reg, prcr, size, 0x04, 0, 16); + +/* reg_prcr_tcam_region_info + * Opaque object that represents the source TCAM region. + * Access: Index + */ +MLXSW_ITEM_BUF(reg, prcr, tcam_region_info, 0x10, + MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN); + +/* reg_prcr_dest_offset + * Offset within the source region to copy/move to. + * Access: Index + */ +MLXSW_ITEM32(reg, prcr, dest_offset, 0x20, 0, 16); + +/* reg_prcr_dest_tcam_region_info + * Opaque object that represents the destination TCAM region. 
+ * Access: Index + */ +MLXSW_ITEM_BUF(reg, prcr, dest_tcam_region_info, 0x30, + MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN); + +static inline void mlxsw_reg_prcr_pack(char *payload, enum mlxsw_reg_prcr_op op, + const char *src_tcam_region_info, + u16 src_offset, + const char *dest_tcam_region_info, + u16 dest_offset, u16 size) +{ + MLXSW_REG_ZERO(prcr, payload); + mlxsw_reg_prcr_op_set(payload, op); + mlxsw_reg_prcr_offset_set(payload, src_offset); + mlxsw_reg_prcr_size_set(payload, size); + mlxsw_reg_prcr_tcam_region_info_memcpy_to(payload, + src_tcam_region_info); + mlxsw_reg_prcr_dest_offset_set(payload, dest_offset); + mlxsw_reg_prcr_dest_tcam_region_info_memcpy_to(payload, + dest_tcam_region_info); +} + +/* PEFA - Policy-Engine Extended Flexible Action Register + * ------------------------------------------------------ + * This register is used for accessing an extended flexible action entry + * in the central KVD Linear Database. + */ +#define MLXSW_REG_PEFA_ID 0x300F +#define MLXSW_REG_PEFA_LEN 0xB0 + +MLXSW_REG_DEFINE(pefa, MLXSW_REG_PEFA_ID, MLXSW_REG_PEFA_LEN); + +/* reg_pefa_index + * Index in the KVD Linear Centralized Database. + * Access: Index + */ +MLXSW_ITEM32(reg, pefa, index, 0x00, 0, 24); + +#define MLXSW_REG_FLEX_ACTION_SET_LEN 0xA8 + +/* reg_pefa_flex_action_set + * Action-set to perform when rule is matched. + * Must be zero padded if action set is shorter. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, pefa, flex_action_set, 0x08, MLXSW_REG_FLEX_ACTION_SET_LEN); + +static inline void mlxsw_reg_pefa_pack(char *payload, u32 index, + const char *flex_action_set) +{ + MLXSW_REG_ZERO(pefa, payload); + mlxsw_reg_pefa_index_set(payload, index); + mlxsw_reg_pefa_flex_action_set_memcpy_to(payload, flex_action_set); +} + +/* PTCE-V2 - Policy-Engine TCAM Entry Register Version 2 + * ----------------------------------------------------- + * This register is used for accessing rules within a TCAM region. + * It is a new version of PTCE in order to support wider key, + * mask and action within a TCAM region. This register is not supported + * by SwitchX and SwitchX-2. + */ +#define MLXSW_REG_PTCE2_ID 0x3017 +#define MLXSW_REG_PTCE2_LEN 0x1D8 + +MLXSW_REG_DEFINE(ptce2, MLXSW_REG_PTCE2_ID, MLXSW_REG_PTCE2_LEN); + +/* reg_ptce2_v + * Valid. + * Access: RW + */ +MLXSW_ITEM32(reg, ptce2, v, 0x00, 31, 1); + +/* reg_ptce2_a + * Activity. Set if a packet lookup has hit on the specific entry. + * To clear the "a" bit, use "clear activity" op or "clear on read" op. + * Access: RO + */ +MLXSW_ITEM32(reg, ptce2, a, 0x00, 30, 1); + +enum mlxsw_reg_ptce2_op { + /* Read operation. */ + MLXSW_REG_PTCE2_OP_QUERY_READ = 0, + /* clear on read operation. Used to read entry + * and clear Activity bit. + */ + MLXSW_REG_PTCE2_OP_QUERY_CLEAR_ON_READ = 1, + /* Write operation. Used to write a new entry to the table. + * All R/W fields are relevant for new entry. Activity bit is set + * for new entries - Note write with v = 0 will delete the entry. + */ + MLXSW_REG_PTCE2_OP_WRITE_WRITE = 0, + /* Update action. Only action set will be updated. */ + MLXSW_REG_PTCE2_OP_WRITE_UPDATE = 1, + /* Clear activity. A bit is cleared for the entry. */ + MLXSW_REG_PTCE2_OP_WRITE_CLEAR_ACTIVITY = 2, +}; + +/* reg_ptce2_op + * Access: OP + */ +MLXSW_ITEM32(reg, ptce2, op, 0x00, 20, 3); + +/* reg_ptce2_offset + * Access: Index + */ +MLXSW_ITEM32(reg, ptce2, offset, 0x00, 0, 16); + +/* reg_ptce2_tcam_region_info + * Opaque object that represents the TCAM region. 
+ * Access: Index
+ */
+MLXSW_ITEM_BUF(reg, ptce2, tcam_region_info, 0x10,
+	       MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN);
+
+#define MLXSW_REG_PTCE2_FLEX_KEY_BLOCKS_LEN 96
+
+/* reg_ptce2_flex_key_blocks
+ * ACL Key.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ptce2, flex_key_blocks, 0x20,
+	       MLXSW_REG_PTCE2_FLEX_KEY_BLOCKS_LEN);
+
+/* reg_ptce2_mask
+ * Mask - in the same size as key. A bit that is set directs the TCAM
+ * to compare the corresponding bit in key. A bit that is clear directs
+ * the TCAM to ignore the corresponding bit in key.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ptce2, mask, 0x80,
+	       MLXSW_REG_PTCE2_FLEX_KEY_BLOCKS_LEN);
+
+/* reg_ptce2_flex_action_set
+ * ACL action set.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ptce2, flex_action_set, 0xE0,
+	       MLXSW_REG_FLEX_ACTION_SET_LEN);
+
+static inline void mlxsw_reg_ptce2_pack(char *payload, bool valid,
+					enum mlxsw_reg_ptce2_op op,
+					const char *tcam_region_info,
+					u16 offset)
+{
+	MLXSW_REG_ZERO(ptce2, payload);
+	mlxsw_reg_ptce2_v_set(payload, valid);
+	mlxsw_reg_ptce2_op_set(payload, op);
+	mlxsw_reg_ptce2_offset_set(payload, offset);
+	mlxsw_reg_ptce2_tcam_region_info_memcpy_to(payload, tcam_region_info);
+}
+
+/* QPCR - QoS Policer Configuration Register
+ * -----------------------------------------
+ * The QPCR register is used to create policers that limit
+ * the rate of bytes or packets via some trap group.
+ */
+#define MLXSW_REG_QPCR_ID 0x4004
+#define MLXSW_REG_QPCR_LEN 0x28
+
+MLXSW_REG_DEFINE(qpcr, MLXSW_REG_QPCR_ID, MLXSW_REG_QPCR_LEN);
+
+enum mlxsw_reg_qpcr_g {
+	MLXSW_REG_QPCR_G_GLOBAL = 2,
+	MLXSW_REG_QPCR_G_STORM_CONTROL = 3,
+};
+
+/* reg_qpcr_g
+ * The policer type.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qpcr, g, 0x00, 14, 2);
+
+/* reg_qpcr_pid
+ * Policer ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qpcr, pid, 0x00, 0, 14);
+
+/* reg_qpcr_color_aware
+ * Is the policer aware of colors.
+ * Must be 0 (unaware) for cpu port.
+ * Access: RW for unbounded policer. RO for bounded policer.
+ */
+MLXSW_ITEM32(reg, qpcr, color_aware, 0x04, 15, 1);
+
+/* reg_qpcr_bytes
+ * Whether the policer limit is for bytes per sec or packets per sec.
+ * 0 - packets
+ * 1 - bytes
+ * Access: RW for unbounded policer. RO for bounded policer.
+ */
+MLXSW_ITEM32(reg, qpcr, bytes, 0x04, 14, 1);
+
+enum mlxsw_reg_qpcr_ir_units {
+	MLXSW_REG_QPCR_IR_UNITS_M,
+	MLXSW_REG_QPCR_IR_UNITS_K,
+};
+
+/* reg_qpcr_ir_units
+ * Policer's units for cir and eir fields (for bytes limits only)
+ * 1 - 10^3
+ * 0 - 10^6
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, qpcr, ir_units, 0x04, 12, 1);
+
+enum mlxsw_reg_qpcr_rate_type {
+	MLXSW_REG_QPCR_RATE_TYPE_SINGLE = 1,
+	MLXSW_REG_QPCR_RATE_TYPE_DOUBLE = 2,
+};
+
+/* reg_qpcr_rate_type
+ * Policer can have one limit (single rate) or 2 limits with specific operation
+ * for packets that exceed the lower rate but not the upper one.
+ * (For cpu port must be single rate)
+ * Access: RW for unbounded policer. RO for bounded policer.
+ */
+MLXSW_ITEM32(reg, qpcr, rate_type, 0x04, 8, 2);
+
+/* reg_qpcr_cbs
+ * Policer's committed burst size.
+ * The policer works with time slices of 50 nano sec. By default every
+ * slice is granted the proportionate share of the committed rate. If we want to
+ * allow a slice to exceed that share (while still keeping the rate per sec) we
+ * can allow burst. The burst size is between the default proportionate share
+ * (and no lower than 8) to 32Gb. (Even though giving a number higher than the
+ * committed rate will result in exceeding the rate). 
The burst size must be a
+ * power of 2; this field holds its base-2 logarithm (burst size = 2^cbs).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qpcr, cbs, 0x08, 24, 6);
+
+/* reg_qpcr_cir
+ * Policer's committed rate.
+ * The rate used for single rate, the lower rate for double rate.
+ * For bytes limits, the rate will be this value * the unit from ir_units.
+ * (Resolution error is up to 1%).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qpcr, cir, 0x0C, 0, 32);
+
+/* reg_qpcr_eir
+ * Policer's exceed rate.
+ * The higher rate for a double rate policer; reserved for single rate.
+ * For bytes limits, the rate will be this value * the unit from ir_units.
+ * (Resolution error is up to 1%).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qpcr, eir, 0x10, 0, 32);
+
+#define MLXSW_REG_QPCR_DOUBLE_RATE_ACTION 2
+
+/* reg_qpcr_exceed_action
+ * What to do with packets between the 2 limits for double rate.
+ * Access: RW for unbounded policer. RO for bounded policer.
+ */
+MLXSW_ITEM32(reg, qpcr, exceed_action, 0x14, 0, 4);
+
+enum mlxsw_reg_qpcr_action {
+	/* Discard */
+	MLXSW_REG_QPCR_ACTION_DISCARD = 1,
+	/* Forward and set color to red.
+	 * If the packet is intended to the CPU port, it will be dropped.
+	 */
+	MLXSW_REG_QPCR_ACTION_FORWARD = 2,
+};
+
+/* reg_qpcr_violate_action
+ * What to do with packets that cross the cir limit (for single rate) or the eir
+ * limit (for double rate).
+ * Access: RW for unbounded policer. RO for bounded policer.
+ */
+MLXSW_ITEM32(reg, qpcr, violate_action, 0x18, 0, 4);
+
+static inline void mlxsw_reg_qpcr_pack(char *payload, u16 pid,
+				       enum mlxsw_reg_qpcr_ir_units ir_units,
+				       bool bytes, u32 cir, u16 cbs)
+{
+	MLXSW_REG_ZERO(qpcr, payload);
+	mlxsw_reg_qpcr_pid_set(payload, pid);
+	mlxsw_reg_qpcr_g_set(payload, MLXSW_REG_QPCR_G_GLOBAL);
+	mlxsw_reg_qpcr_rate_type_set(payload, MLXSW_REG_QPCR_RATE_TYPE_SINGLE);
+	mlxsw_reg_qpcr_violate_action_set(payload,
+					  MLXSW_REG_QPCR_ACTION_DISCARD);
+	mlxsw_reg_qpcr_cir_set(payload, cir);
+	mlxsw_reg_qpcr_ir_units_set(payload, ir_units);
+	mlxsw_reg_qpcr_bytes_set(payload, bytes);
+	mlxsw_reg_qpcr_cbs_set(payload, cbs);
+}
+
+/* QTCT - QoS Switch Traffic Class Table
+ * -------------------------------------
+ * Configures the mapping between the packet switch priority and the
+ * traffic class on the transmit port.
+ */
+#define MLXSW_REG_QTCT_ID 0x400A
+#define MLXSW_REG_QTCT_LEN 0x08
+
+MLXSW_REG_DEFINE(qtct, MLXSW_REG_QTCT_ID, MLXSW_REG_QTCT_LEN);
+
+/* reg_qtct_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: CPU port is not supported.
+ */
+MLXSW_ITEM32(reg, qtct, local_port, 0x00, 16, 8);
+
+/* reg_qtct_sub_port
+ * Virtual port within the physical port.
+ * Should be set to 0 when virtual ports are not enabled on the port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qtct, sub_port, 0x00, 8, 8);
+
+/* reg_qtct_switch_prio
+ * Switch priority.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qtct, switch_prio, 0x00, 0, 4);
+
+/* reg_qtct_tclass
+ * Traffic class.
+ * Default values: + * switch_prio 0 : tclass 1 + * switch_prio 1 : tclass 0 + * switch_prio i : tclass i, for i > 1 + * Access: RW + */ +MLXSW_ITEM32(reg, qtct, tclass, 0x04, 0, 4); + +static inline void mlxsw_reg_qtct_pack(char *payload, u8 local_port, + u8 switch_prio, u8 tclass) +{ + MLXSW_REG_ZERO(qtct, payload); + mlxsw_reg_qtct_local_port_set(payload, local_port); + mlxsw_reg_qtct_switch_prio_set(payload, switch_prio); + mlxsw_reg_qtct_tclass_set(payload, tclass); +} + +/* QEEC - QoS ETS Element Configuration Register + * --------------------------------------------- + * Configures the ETS elements. + */ +#define MLXSW_REG_QEEC_ID 0x400D +#define MLXSW_REG_QEEC_LEN 0x1C + +MLXSW_REG_DEFINE(qeec, MLXSW_REG_QEEC_ID, MLXSW_REG_QEEC_LEN); + +/* reg_qeec_local_port + * Local port number. + * Access: Index + * + * Note: CPU port is supported. + */ +MLXSW_ITEM32(reg, qeec, local_port, 0x00, 16, 8); + +enum mlxsw_reg_qeec_hr { + MLXSW_REG_QEEC_HIERARCY_PORT, + MLXSW_REG_QEEC_HIERARCY_GROUP, + MLXSW_REG_QEEC_HIERARCY_SUBGROUP, + MLXSW_REG_QEEC_HIERARCY_TC, +}; + +/* reg_qeec_element_hierarchy + * 0 - Port + * 1 - Group + * 2 - Subgroup + * 3 - Traffic Class + * Access: Index + */ +MLXSW_ITEM32(reg, qeec, element_hierarchy, 0x04, 16, 4); + +/* reg_qeec_element_index + * The index of the element in the hierarchy. + * Access: Index + */ +MLXSW_ITEM32(reg, qeec, element_index, 0x04, 0, 8); + +/* reg_qeec_next_element_index + * The index of the next (lower) element in the hierarchy. + * Access: RW + * + * Note: Reserved for element_hierarchy 0. + */ +MLXSW_ITEM32(reg, qeec, next_element_index, 0x08, 0, 8); + +enum { + MLXSW_REG_QEEC_BYTES_MODE, + MLXSW_REG_QEEC_PACKETS_MODE, +}; + +/* reg_qeec_pb + * Packets or bytes mode. + * 0 - Bytes mode + * 1 - Packets mode + * Access: RW + * + * Note: Used for max shaper configuration. For Spectrum, packets mode + * is supported only for traffic classes of CPU port. + */ +MLXSW_ITEM32(reg, qeec, pb, 0x0C, 28, 1); + +/* reg_qeec_mase + * Max shaper configuration enable. Enables configuration of the max + * shaper on this ETS element. + * 0 - Disable + * 1 - Enable + * Access: RW + */ +MLXSW_ITEM32(reg, qeec, mase, 0x10, 31, 1); + +/* A large max rate will disable the max shaper. */ +#define MLXSW_REG_QEEC_MAS_DIS 200000000 /* Kbps */ + +/* reg_qeec_max_shaper_rate + * Max shaper information rate. + * For CPU port, can only be configured for port hierarchy. + * When in bytes mode, value is specified in units of 1000bps. + * Access: RW + */ +MLXSW_ITEM32(reg, qeec, max_shaper_rate, 0x10, 0, 28); + +/* reg_qeec_de + * DWRR configuration enable. Enables configuration of the dwrr and + * dwrr_weight. + * 0 - Disable + * 1 - Enable + * Access: RW + */ +MLXSW_ITEM32(reg, qeec, de, 0x18, 31, 1); + +/* reg_qeec_dwrr + * Transmission selection algorithm to use on the link going down from + * the ETS element. + * 0 - Strict priority + * 1 - DWRR + * Access: RW + */ +MLXSW_ITEM32(reg, qeec, dwrr, 0x18, 15, 1); + +/* reg_qeec_dwrr_weight + * DWRR weight on the link going down from the ETS element. The + * percentage of bandwidth guaranteed to an ETS element within + * its hierarchy. The sum of all weights across all ETS elements + * within one hierarchy should be equal to 100. Reserved when + * transmission selection algorithm is strict priority. 
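+ *
+ * For example (editorial note, not from the original description), four
+ * DWRR subgroups weighted 50/25/15/10 sum to 100 and split the link
+ * bandwidth in roughly those proportions under congestion.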
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qeec, dwrr_weight, 0x18, 0, 8);
+
+static inline void mlxsw_reg_qeec_pack(char *payload, u8 local_port,
+				       enum mlxsw_reg_qeec_hr hr, u8 index,
+				       u8 next_index)
+{
+	MLXSW_REG_ZERO(qeec, payload);
+	mlxsw_reg_qeec_local_port_set(payload, local_port);
+	mlxsw_reg_qeec_element_hierarchy_set(payload, hr);
+	mlxsw_reg_qeec_element_index_set(payload, index);
+	mlxsw_reg_qeec_next_element_index_set(payload, next_index);
+}
+
+/* PMLP - Ports Module to Local Port Register
+ * ------------------------------------------
+ * Configures the assignment of modules to local ports.
+ */
+#define MLXSW_REG_PMLP_ID 0x5002
+#define MLXSW_REG_PMLP_LEN 0x40
+
+MLXSW_REG_DEFINE(pmlp, MLXSW_REG_PMLP_ID, MLXSW_REG_PMLP_LEN);
+
+/* reg_pmlp_rxtx
+ * 0 - Tx value is used for both Tx and Rx.
+ * 1 - Rx value is taken from a separate field.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pmlp, rxtx, 0x00, 31, 1);
+
+/* reg_pmlp_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pmlp, local_port, 0x00, 16, 8);
+
+/* reg_pmlp_width
+ * 0 - Unmap local port.
+ * 1 - Lane 0 is used.
+ * 2 - Lanes 0 and 1 are used.
+ * 4 - Lanes 0, 1, 2 and 3 are used.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pmlp, width, 0x00, 0, 8);
+
+/* reg_pmlp_module
+ * Module number.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, pmlp, module, 0x04, 0, 8, 0x04, 0x00, false);
+
+/* reg_pmlp_tx_lane
+ * Tx Lane. When rxtx field is cleared, this field is used for Rx as well.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, pmlp, tx_lane, 0x04, 16, 2, 0x04, 0x00, false);
+
+/* reg_pmlp_rx_lane
+ * Rx Lane. When rxtx field is cleared, this field is ignored and Rx lane is
+ * equal to Tx lane.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, pmlp, rx_lane, 0x04, 24, 2, 0x04, 0x00, false);
+
+static inline void mlxsw_reg_pmlp_pack(char *payload, u8 local_port)
+{
+	MLXSW_REG_ZERO(pmlp, payload);
+	mlxsw_reg_pmlp_local_port_set(payload, local_port);
+}
+
+/* PMTU - Port MTU Register
+ * ------------------------
+ * Configures and reports the port MTU.
+ */
+#define MLXSW_REG_PMTU_ID 0x5003
+#define MLXSW_REG_PMTU_LEN 0x10
+
+MLXSW_REG_DEFINE(pmtu, MLXSW_REG_PMTU_ID, MLXSW_REG_PMTU_LEN);
+
+/* reg_pmtu_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pmtu, local_port, 0x00, 16, 8);
+
+/* reg_pmtu_max_mtu
+ * Maximum MTU.
+ * When port type (e.g. Ethernet) is configured, the relevant MTU is
+ * reported, otherwise the minimum between the max_mtu of the different
+ * types is reported.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, pmtu, max_mtu, 0x04, 16, 16);
+
+/* reg_pmtu_admin_mtu
+ * MTU value to set port to. Must be smaller than or equal to max_mtu.
+ * Note: If port type is InfiniBand, the port must be disabled when its
+ * MTU is set.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pmtu, admin_mtu, 0x08, 16, 16);
+
+/* reg_pmtu_oper_mtu
+ * The actual MTU configured on the port. Packets exceeding this size
+ * will be dropped.
+ * Note: In Ethernet and FC oper_mtu == admin_mtu, however, in InfiniBand
+ * oper_mtu might be smaller than admin_mtu.
+ * Access: RO + */ +MLXSW_ITEM32(reg, pmtu, oper_mtu, 0x0C, 16, 16); + +static inline void mlxsw_reg_pmtu_pack(char *payload, u8 local_port, + u16 new_mtu) +{ + MLXSW_REG_ZERO(pmtu, payload); + mlxsw_reg_pmtu_local_port_set(payload, local_port); + mlxsw_reg_pmtu_max_mtu_set(payload, 0); + mlxsw_reg_pmtu_admin_mtu_set(payload, new_mtu); + mlxsw_reg_pmtu_oper_mtu_set(payload, 0); +} + +/* PTYS - Port Type and Speed Register + * ----------------------------------- + * Configures and reports the port speed type. + * + * Note: When set while the link is up, the changes will not take effect + * until the port transitions from down to up state. + */ +#define MLXSW_REG_PTYS_ID 0x5004 +#define MLXSW_REG_PTYS_LEN 0x40 + +MLXSW_REG_DEFINE(ptys, MLXSW_REG_PTYS_ID, MLXSW_REG_PTYS_LEN); + +/* an_disable_admin + * Auto negotiation disable administrative configuration + * 0 - Device doesn't support AN disable. + * 1 - Device supports AN disable. + * Access: RW + */ +MLXSW_ITEM32(reg, ptys, an_disable_admin, 0x00, 30, 1); + +/* reg_ptys_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, ptys, local_port, 0x00, 16, 8); + +#define MLXSW_REG_PTYS_PROTO_MASK_IB BIT(0) +#define MLXSW_REG_PTYS_PROTO_MASK_ETH BIT(2) + +/* reg_ptys_proto_mask + * Protocol mask. Indicates which protocol is used. + * 0 - Infiniband. + * 1 - Fibre Channel. + * 2 - Ethernet. + * Access: Index + */ +MLXSW_ITEM32(reg, ptys, proto_mask, 0x00, 0, 3); + +enum { + MLXSW_REG_PTYS_AN_STATUS_NA, + MLXSW_REG_PTYS_AN_STATUS_OK, + MLXSW_REG_PTYS_AN_STATUS_FAIL, +}; + +/* reg_ptys_an_status + * Autonegotiation status. + * Access: RO + */ +MLXSW_ITEM32(reg, ptys, an_status, 0x04, 28, 4); + +#define MLXSW_REG_PTYS_ETH_SPEED_SGMII BIT(0) +#define MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX BIT(1) +#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CX4 BIT(2) +#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4 BIT(3) +#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR BIT(4) +#define MLXSW_REG_PTYS_ETH_SPEED_20GBASE_KR2 BIT(5) +#define MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4 BIT(6) +#define MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4 BIT(7) +#define MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4 BIT(8) +#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR BIT(12) +#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR BIT(13) +#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_ER_LR BIT(14) +#define MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4 BIT(15) +#define MLXSW_REG_PTYS_ETH_SPEED_40GBASE_LR4_ER4 BIT(16) +#define MLXSW_REG_PTYS_ETH_SPEED_50GBASE_SR2 BIT(18) +#define MLXSW_REG_PTYS_ETH_SPEED_50GBASE_KR4 BIT(19) +#define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4 BIT(20) +#define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 BIT(21) +#define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4 BIT(22) +#define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_LR4_ER4 BIT(23) +#define MLXSW_REG_PTYS_ETH_SPEED_100BASE_TX BIT(24) +#define MLXSW_REG_PTYS_ETH_SPEED_100BASE_T BIT(25) +#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_T BIT(26) +#define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_CR BIT(27) +#define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_KR BIT(28) +#define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_SR BIT(29) +#define MLXSW_REG_PTYS_ETH_SPEED_50GBASE_CR2 BIT(30) +#define MLXSW_REG_PTYS_ETH_SPEED_50GBASE_KR2 BIT(31) + +/* reg_ptys_eth_proto_cap + * Ethernet port supported speeds and protocols. + * Access: RO + */ +MLXSW_ITEM32(reg, ptys, eth_proto_cap, 0x0C, 0, 32); + +/* reg_ptys_ib_link_width_cap + * IB port supported widths. 
+ * Access: RO + */ +MLXSW_ITEM32(reg, ptys, ib_link_width_cap, 0x10, 16, 16); + +#define MLXSW_REG_PTYS_IB_SPEED_SDR BIT(0) +#define MLXSW_REG_PTYS_IB_SPEED_DDR BIT(1) +#define MLXSW_REG_PTYS_IB_SPEED_QDR BIT(2) +#define MLXSW_REG_PTYS_IB_SPEED_FDR10 BIT(3) +#define MLXSW_REG_PTYS_IB_SPEED_FDR BIT(4) +#define MLXSW_REG_PTYS_IB_SPEED_EDR BIT(5) + +/* reg_ptys_ib_proto_cap + * IB port supported speeds and protocols. + * Access: RO + */ +MLXSW_ITEM32(reg, ptys, ib_proto_cap, 0x10, 0, 16); + +/* reg_ptys_eth_proto_admin + * Speed and protocol to set port to. + * Access: RW + */ +MLXSW_ITEM32(reg, ptys, eth_proto_admin, 0x18, 0, 32); + +/* reg_ptys_ib_link_width_admin + * IB width to set port to. + * Access: RW + */ +MLXSW_ITEM32(reg, ptys, ib_link_width_admin, 0x1C, 16, 16); + +/* reg_ptys_ib_proto_admin + * IB speeds and protocols to set port to. + * Access: RW + */ +MLXSW_ITEM32(reg, ptys, ib_proto_admin, 0x1C, 0, 16); + +/* reg_ptys_eth_proto_oper + * The current speed and protocol configured for the port. + * Access: RO + */ +MLXSW_ITEM32(reg, ptys, eth_proto_oper, 0x24, 0, 32); + +/* reg_ptys_ib_link_width_oper + * The current IB width to set port to. + * Access: RO + */ +MLXSW_ITEM32(reg, ptys, ib_link_width_oper, 0x28, 16, 16); + +/* reg_ptys_ib_proto_oper + * The current IB speed and protocol. + * Access: RO + */ +MLXSW_ITEM32(reg, ptys, ib_proto_oper, 0x28, 0, 16); + +/* reg_ptys_eth_proto_lp_advertise + * The protocols that were advertised by the link partner during + * autonegotiation. + * Access: RO + */ +MLXSW_ITEM32(reg, ptys, eth_proto_lp_advertise, 0x30, 0, 32); + +static inline void mlxsw_reg_ptys_eth_pack(char *payload, u8 local_port, + u32 proto_admin, bool autoneg) +{ + MLXSW_REG_ZERO(ptys, payload); + mlxsw_reg_ptys_local_port_set(payload, local_port); + mlxsw_reg_ptys_proto_mask_set(payload, MLXSW_REG_PTYS_PROTO_MASK_ETH); + mlxsw_reg_ptys_eth_proto_admin_set(payload, proto_admin); + mlxsw_reg_ptys_an_disable_admin_set(payload, !autoneg); +} + +static inline void mlxsw_reg_ptys_eth_unpack(char *payload, + u32 *p_eth_proto_cap, + u32 *p_eth_proto_adm, + u32 *p_eth_proto_oper) +{ + if (p_eth_proto_cap) + *p_eth_proto_cap = mlxsw_reg_ptys_eth_proto_cap_get(payload); + if (p_eth_proto_adm) + *p_eth_proto_adm = mlxsw_reg_ptys_eth_proto_admin_get(payload); + if (p_eth_proto_oper) + *p_eth_proto_oper = mlxsw_reg_ptys_eth_proto_oper_get(payload); +} + +static inline void mlxsw_reg_ptys_ib_pack(char *payload, u8 local_port, + u16 proto_admin, u16 link_width) +{ + MLXSW_REG_ZERO(ptys, payload); + mlxsw_reg_ptys_local_port_set(payload, local_port); + mlxsw_reg_ptys_proto_mask_set(payload, MLXSW_REG_PTYS_PROTO_MASK_IB); + mlxsw_reg_ptys_ib_proto_admin_set(payload, proto_admin); + mlxsw_reg_ptys_ib_link_width_admin_set(payload, link_width); +} + +static inline void mlxsw_reg_ptys_ib_unpack(char *payload, u16 *p_ib_proto_cap, + u16 *p_ib_link_width_cap, + u16 *p_ib_proto_oper, + u16 *p_ib_link_width_oper) +{ + if (p_ib_proto_cap) + *p_ib_proto_cap = mlxsw_reg_ptys_ib_proto_cap_get(payload); + if (p_ib_link_width_cap) + *p_ib_link_width_cap = + mlxsw_reg_ptys_ib_link_width_cap_get(payload); + if (p_ib_proto_oper) + *p_ib_proto_oper = mlxsw_reg_ptys_ib_proto_oper_get(payload); + if (p_ib_link_width_oper) + *p_ib_link_width_oper = + mlxsw_reg_ptys_ib_link_width_oper_get(payload); +} + +/* PPAD - Port Physical Address Register + * ------------------------------------- + * The PPAD register configures the per port physical MAC address. 
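+ *
+ * Illustrative usage (editorial sketch, not part of the original
+ * description; ppad_pl and base_mac are hypothetical): programming a
+ * single base MAC for the switch could look like:
+ *
+ *	char ppad_pl[MLXSW_REG_PPAD_LEN];
+ *
+ *	mlxsw_reg_ppad_pack(ppad_pl, false, 0);
+ *	mlxsw_reg_ppad_mac_memcpy_to(ppad_pl, base_mac);
+ *	mlxsw_reg_write(mlxsw_core, MLXSW_REG(ppad), ppad_pl);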
+ */ +#define MLXSW_REG_PPAD_ID 0x5005 +#define MLXSW_REG_PPAD_LEN 0x10 + +MLXSW_REG_DEFINE(ppad, MLXSW_REG_PPAD_ID, MLXSW_REG_PPAD_LEN); + +/* reg_ppad_single_base_mac + * 0: base_mac, local port should be 0 and mac[7:0] is + * reserved. HW will set incremental + * 1: single_mac - mac of the local_port + * Access: RW + */ +MLXSW_ITEM32(reg, ppad, single_base_mac, 0x00, 28, 1); + +/* reg_ppad_local_port + * port number, if single_base_mac = 0 then local_port is reserved + * Access: RW + */ +MLXSW_ITEM32(reg, ppad, local_port, 0x00, 16, 8); + +/* reg_ppad_mac + * If single_base_mac = 0 - base MAC address, mac[7:0] is reserved. + * If single_base_mac = 1 - the per port MAC address + * Access: RW + */ +MLXSW_ITEM_BUF(reg, ppad, mac, 0x02, 6); + +static inline void mlxsw_reg_ppad_pack(char *payload, bool single_base_mac, + u8 local_port) +{ + MLXSW_REG_ZERO(ppad, payload); + mlxsw_reg_ppad_single_base_mac_set(payload, !!single_base_mac); + mlxsw_reg_ppad_local_port_set(payload, local_port); +} + +/* PAOS - Ports Administrative and Operational Status Register + * ----------------------------------------------------------- + * Configures and retrieves per port administrative and operational status. + */ +#define MLXSW_REG_PAOS_ID 0x5006 +#define MLXSW_REG_PAOS_LEN 0x10 + +MLXSW_REG_DEFINE(paos, MLXSW_REG_PAOS_ID, MLXSW_REG_PAOS_LEN); + +/* reg_paos_swid + * Switch partition ID with which to associate the port. + * Note: while external ports uses unique local port numbers (and thus swid is + * redundant), router ports use the same local port number where swid is the + * only indication for the relevant port. + * Access: Index + */ +MLXSW_ITEM32(reg, paos, swid, 0x00, 24, 8); + +/* reg_paos_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, paos, local_port, 0x00, 16, 8); + +/* reg_paos_admin_status + * Port administrative state (the desired state of the port): + * 1 - Up. + * 2 - Down. + * 3 - Up once. This means that in case of link failure, the port won't go + * into polling mode, but will wait to be re-enabled by software. + * 4 - Disabled by system. Can only be set by hardware. + * Access: RW + */ +MLXSW_ITEM32(reg, paos, admin_status, 0x00, 8, 4); + +/* reg_paos_oper_status + * Port operational state (the current state): + * 1 - Up. + * 2 - Down. + * 3 - Down by port failure. This means that the device will not let the + * port up again until explicitly specified by software. + * Access: RO + */ +MLXSW_ITEM32(reg, paos, oper_status, 0x00, 0, 4); + +/* reg_paos_ase + * Admin state update enabled. + * Access: WO + */ +MLXSW_ITEM32(reg, paos, ase, 0x04, 31, 1); + +/* reg_paos_ee + * Event update enable. If this bit is set, event generation will be + * updated based on the e field. + * Access: WO + */ +MLXSW_ITEM32(reg, paos, ee, 0x04, 30, 1); + +/* reg_paos_e + * Event generation on operational state change: + * 0 - Do not generate event. + * 1 - Generate Event. + * 2 - Generate Single Event. 
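/* Illustrative sketch (editor's addition, not part of this patch): writing
 * the switch base MAC address uses PPAD with single_base_mac = 0, where the
 * low byte of the MAC is reserved and assigned per port by the hardware.
 * mlxsw_reg_write() and MLXSW_REG() are assumed to come from the mlxsw core;
 * the function name is hypothetical.
 */
static int example_base_mac_set(struct mlxsw_core *core, const char *base_mac)
{
	char ppad_pl[MLXSW_REG_PPAD_LEN];

	mlxsw_reg_ppad_pack(ppad_pl, false, 0);
	mlxsw_reg_ppad_mac_memcpy_to(ppad_pl, base_mac);
	return mlxsw_reg_write(core, MLXSW_REG(ppad), ppad_pl);
}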
+ * Access: RW + */ +MLXSW_ITEM32(reg, paos, e, 0x04, 0, 2); + +static inline void mlxsw_reg_paos_pack(char *payload, u8 local_port, + enum mlxsw_port_admin_status status) +{ + MLXSW_REG_ZERO(paos, payload); + mlxsw_reg_paos_swid_set(payload, 0); + mlxsw_reg_paos_local_port_set(payload, local_port); + mlxsw_reg_paos_admin_status_set(payload, status); + mlxsw_reg_paos_oper_status_set(payload, 0); + mlxsw_reg_paos_ase_set(payload, 1); + mlxsw_reg_paos_ee_set(payload, 1); + mlxsw_reg_paos_e_set(payload, 1); +} + +/* PFCC - Ports Flow Control Configuration Register + * ------------------------------------------------ + * Configures and retrieves the per port flow control configuration. + */ +#define MLXSW_REG_PFCC_ID 0x5007 +#define MLXSW_REG_PFCC_LEN 0x20 + +MLXSW_REG_DEFINE(pfcc, MLXSW_REG_PFCC_ID, MLXSW_REG_PFCC_LEN); + +/* reg_pfcc_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, pfcc, local_port, 0x00, 16, 8); + +/* reg_pfcc_pnat + * Port number access type. Determines the way local_port is interpreted: + * 0 - Local port number. + * 1 - IB / label port number. + * Access: Index + */ +MLXSW_ITEM32(reg, pfcc, pnat, 0x00, 14, 2); + +/* reg_pfcc_shl_cap + * Send to higher layers capabilities: + * 0 - No capability of sending Pause and PFC frames to higher layers. + * 1 - Device has capability of sending Pause and PFC frames to higher + * layers. + * Access: RO + */ +MLXSW_ITEM32(reg, pfcc, shl_cap, 0x00, 1, 1); + +/* reg_pfcc_shl_opr + * Send to higher layers operation: + * 0 - Pause and PFC frames are handled by the port (default). + * 1 - Pause and PFC frames are handled by the port and also sent to + * higher layers. Only valid if shl_cap = 1. + * Access: RW + */ +MLXSW_ITEM32(reg, pfcc, shl_opr, 0x00, 0, 1); + +/* reg_pfcc_ppan + * Pause policy auto negotiation. + * 0 - Disabled. Generate / ignore Pause frames based on pptx / pprtx. + * 1 - Enabled. When auto-negotiation is performed, set the Pause policy + * based on the auto-negotiation resolution. + * Access: RW + * + * Note: The auto-negotiation advertisement is set according to pptx and + * pprtx. When PFC is set on Tx / Rx, ppan must be set to 0. + */ +MLXSW_ITEM32(reg, pfcc, ppan, 0x04, 28, 4); + +/* reg_pfcc_prio_mask_tx + * Bit per priority indicating if Tx flow control policy should be + * updated based on bit pfctx. + * Access: WO + */ +MLXSW_ITEM32(reg, pfcc, prio_mask_tx, 0x04, 16, 8); + +/* reg_pfcc_prio_mask_rx + * Bit per priority indicating if Rx flow control policy should be + * updated based on bit pfcrx. + * Access: WO + */ +MLXSW_ITEM32(reg, pfcc, prio_mask_rx, 0x04, 0, 8); + +/* reg_pfcc_pptx + * Admin Pause policy on Tx. + * 0 - Never generate Pause frames (default). + * 1 - Generate Pause frames according to Rx buffer threshold. + * Access: RW + */ +MLXSW_ITEM32(reg, pfcc, pptx, 0x08, 31, 1); + +/* reg_pfcc_aptx + * Active (operational) Pause policy on Tx. + * 0 - Never generate Pause frames. + * 1 - Generate Pause frames according to Rx buffer threshold. + * Access: RO + */ +MLXSW_ITEM32(reg, pfcc, aptx, 0x08, 30, 1); + +/* reg_pfcc_pfctx + * Priority based flow control policy on Tx[7:0]. Per-priority bit mask: + * 0 - Never generate priority Pause frames on the specified priority + * (default). + * 1 - Generate priority Pause frames according to Rx buffer threshold on + * the specified priority. + * Access: RW + * + * Note: pfctx and pptx must be mutually exclusive. + */ +MLXSW_ITEM32(reg, pfcc, pfctx, 0x08, 16, 8); + +/* reg_pfcc_pprx + * Admin Pause policy on Rx. 
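/* Illustrative sketch (editor's addition, not part of this patch): bringing
 * a port administratively up or down is a single PAOS write.  The
 * MLXSW_PORT_ADMIN_STATUS_* values are assumed to be the port.h encoding of
 * the admin_status field above (1 = up, 2 = down); mlxsw_reg_write() and
 * MLXSW_REG() come from the mlxsw core and the function name is
 * hypothetical.
 */
static int example_port_admin_set(struct mlxsw_core *core, u8 local_port,
				  bool up)
{
	char paos_pl[MLXSW_REG_PAOS_LEN];

	mlxsw_reg_paos_pack(paos_pl, local_port,
			    up ? MLXSW_PORT_ADMIN_STATUS_UP :
				 MLXSW_PORT_ADMIN_STATUS_DOWN);
	return mlxsw_reg_write(core, MLXSW_REG(paos), paos_pl);
}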
+ * 0 - Ignore received Pause frames (default). + * 1 - Respect received Pause frames. + * Access: RW + */ +MLXSW_ITEM32(reg, pfcc, pprx, 0x0C, 31, 1); + +/* reg_pfcc_aprx + * Active (operational) Pause policy on Rx. + * 0 - Ignore received Pause frames. + * 1 - Respect received Pause frames. + * Access: RO + */ +MLXSW_ITEM32(reg, pfcc, aprx, 0x0C, 30, 1); + +/* reg_pfcc_pfcrx + * Priority based flow control policy on Rx[7:0]. Per-priority bit mask: + * 0 - Ignore incoming priority Pause frames on the specified priority + * (default). + * 1 - Respect incoming priority Pause frames on the specified priority. + * Access: RW + */ +MLXSW_ITEM32(reg, pfcc, pfcrx, 0x0C, 16, 8); + +#define MLXSW_REG_PFCC_ALL_PRIO 0xFF + +static inline void mlxsw_reg_pfcc_prio_pack(char *payload, u8 pfc_en) +{ + mlxsw_reg_pfcc_prio_mask_tx_set(payload, MLXSW_REG_PFCC_ALL_PRIO); + mlxsw_reg_pfcc_prio_mask_rx_set(payload, MLXSW_REG_PFCC_ALL_PRIO); + mlxsw_reg_pfcc_pfctx_set(payload, pfc_en); + mlxsw_reg_pfcc_pfcrx_set(payload, pfc_en); +} + +static inline void mlxsw_reg_pfcc_pack(char *payload, u8 local_port) +{ + MLXSW_REG_ZERO(pfcc, payload); + mlxsw_reg_pfcc_local_port_set(payload, local_port); +} + +/* PPCNT - Ports Performance Counters Register + * ------------------------------------------- + * The PPCNT register retrieves per port performance counters. + */ +#define MLXSW_REG_PPCNT_ID 0x5008 +#define MLXSW_REG_PPCNT_LEN 0x100 +#define MLXSW_REG_PPCNT_COUNTERS_OFFSET 0x08 + +MLXSW_REG_DEFINE(ppcnt, MLXSW_REG_PPCNT_ID, MLXSW_REG_PPCNT_LEN); + +/* reg_ppcnt_swid + * For HCA: must be always 0. + * Switch partition ID to associate port with. + * Switch partitions are numbered from 0 to 7 inclusively. + * Switch partition 254 indicates stacking ports. + * Switch partition 255 indicates all switch partitions. + * Only valid on Set() operation with local_port=255. + * Access: Index + */ +MLXSW_ITEM32(reg, ppcnt, swid, 0x00, 24, 8); + +/* reg_ppcnt_local_port + * Local port number. + * 255 indicates all ports on the device, and is only allowed + * for Set() operation. + * Access: Index + */ +MLXSW_ITEM32(reg, ppcnt, local_port, 0x00, 16, 8); + +/* reg_ppcnt_pnat + * Port number access type: + * 0 - Local port number + * 1 - IB port number + * Access: Index + */ +MLXSW_ITEM32(reg, ppcnt, pnat, 0x00, 14, 2); + +enum mlxsw_reg_ppcnt_grp { + MLXSW_REG_PPCNT_IEEE_8023_CNT = 0x0, + MLXSW_REG_PPCNT_EXT_CNT = 0x5, + MLXSW_REG_PPCNT_PRIO_CNT = 0x10, + MLXSW_REG_PPCNT_TC_CNT = 0x11, + MLXSW_REG_PPCNT_TC_CONG_TC = 0x13, +}; + +/* reg_ppcnt_grp + * Performance counter group. + * Group 63 indicates all groups. Only valid on Set() operation with + * clr bit set. + * 0x0: IEEE 802.3 Counters + * 0x1: RFC 2863 Counters + * 0x2: RFC 2819 Counters + * 0x3: RFC 3635 Counters + * 0x5: Ethernet Extended Counters + * 0x8: Link Level Retransmission Counters + * 0x10: Per Priority Counters + * 0x11: Per Traffic Class Counters + * 0x12: Physical Layer Counters + * 0x13: Per Traffic Class Congestion Counters + * Access: Index + */ +MLXSW_ITEM32(reg, ppcnt, grp, 0x00, 0, 6); + +/* reg_ppcnt_clr + * Clear counters. Setting the clr bit will reset the counter value + * for all counters in the counter group. This bit can be set + * for both Set() and Get() operation. + * Access: OP + */ +MLXSW_ITEM32(reg, ppcnt, clr, 0x04, 31, 1); + +/* reg_ppcnt_prio_tc + * Priority for counter set that support per priority, valid values: 0-7. + * Traffic class for counter set that support per traffic class, + * valid values: 0- cap_max_tclass-1 . 
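/* Illustrative sketch (editor's addition, not part of this patch): PFC is
 * enabled per priority by packing a PFCC payload and writing the same
 * per-priority bitmap to both directions through the helper above.  Note the
 * constraint documented for ppan/pptx: when PFC is enabled on a priority,
 * global Pause must not be.  mlxsw_reg_write() and MLXSW_REG() are assumed
 * from the mlxsw core; the function name is hypothetical.
 */
static int example_port_pfc_set(struct mlxsw_core *core, u8 local_port,
				u8 pfc_en_bitmap)
{
	char pfcc_pl[MLXSW_REG_PFCC_LEN];

	mlxsw_reg_pfcc_pack(pfcc_pl, local_port);
	mlxsw_reg_pfcc_prio_pack(pfcc_pl, pfc_en_bitmap);
	return mlxsw_reg_write(core, MLXSW_REG(pfcc), pfcc_pl);
}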
+ * For HCA: cap_max_tclass is always 8. + * Otherwise must be 0. + * Access: Index + */ +MLXSW_ITEM32(reg, ppcnt, prio_tc, 0x04, 0, 5); + +/* Ethernet IEEE 802.3 Counter Group */ + +/* reg_ppcnt_a_frames_transmitted_ok + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_frames_transmitted_ok, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x00, 0, 64); + +/* reg_ppcnt_a_frames_received_ok + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_frames_received_ok, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x08, 0, 64); + +/* reg_ppcnt_a_frame_check_sequence_errors + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_frame_check_sequence_errors, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x10, 0, 64); + +/* reg_ppcnt_a_alignment_errors + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_alignment_errors, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x18, 0, 64); + +/* reg_ppcnt_a_octets_transmitted_ok + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_octets_transmitted_ok, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x20, 0, 64); + +/* reg_ppcnt_a_octets_received_ok + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_octets_received_ok, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x28, 0, 64); + +/* reg_ppcnt_a_multicast_frames_xmitted_ok + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_multicast_frames_xmitted_ok, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x30, 0, 64); + +/* reg_ppcnt_a_broadcast_frames_xmitted_ok + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_broadcast_frames_xmitted_ok, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x38, 0, 64); + +/* reg_ppcnt_a_multicast_frames_received_ok + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_multicast_frames_received_ok, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x40, 0, 64); + +/* reg_ppcnt_a_broadcast_frames_received_ok + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_broadcast_frames_received_ok, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x48, 0, 64); + +/* reg_ppcnt_a_in_range_length_errors + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_in_range_length_errors, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x50, 0, 64); + +/* reg_ppcnt_a_out_of_range_length_field + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_out_of_range_length_field, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x58, 0, 64); + +/* reg_ppcnt_a_frame_too_long_errors + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_frame_too_long_errors, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x60, 0, 64); + +/* reg_ppcnt_a_symbol_error_during_carrier + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_symbol_error_during_carrier, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x68, 0, 64); + +/* reg_ppcnt_a_mac_control_frames_transmitted + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_mac_control_frames_transmitted, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x70, 0, 64); + +/* reg_ppcnt_a_mac_control_frames_received + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_mac_control_frames_received, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x78, 0, 64); + +/* reg_ppcnt_a_unsupported_opcodes_received + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_unsupported_opcodes_received, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x80, 0, 64); + +/* reg_ppcnt_a_pause_mac_ctrl_frames_received + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_pause_mac_ctrl_frames_received, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x88, 0, 64); + +/* reg_ppcnt_a_pause_mac_ctrl_frames_transmitted + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, a_pause_mac_ctrl_frames_transmitted, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x90, 0, 64); + +/* Ethernet Extended Counter Group Counters */ + +/* reg_ppcnt_ecn_marked + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, ecn_marked, + 
MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x08, 0, 64); + +/* Ethernet Per Priority Group Counters */ + +/* reg_ppcnt_rx_octets + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, rx_octets, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x00, 0, 64); + +/* reg_ppcnt_rx_frames + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, rx_frames, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x20, 0, 64); + +/* reg_ppcnt_tx_octets + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, tx_octets, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x28, 0, 64); + +/* reg_ppcnt_tx_frames + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, tx_frames, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x48, 0, 64); + +/* reg_ppcnt_rx_pause + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, rx_pause, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x50, 0, 64); + +/* reg_ppcnt_rx_pause_duration + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, rx_pause_duration, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x58, 0, 64); + +/* reg_ppcnt_tx_pause + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, tx_pause, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x60, 0, 64); + +/* reg_ppcnt_tx_pause_duration + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, tx_pause_duration, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x68, 0, 64); + +/* reg_ppcnt_rx_pause_transition + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, tx_pause_transition, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x70, 0, 64); + +/* Ethernet Per Traffic Group Counters */ + +/* reg_ppcnt_tc_transmit_queue + * Contains the transmit queue depth in cells of traffic class + * selected by prio_tc and the port selected by local_port. + * The field cannot be cleared. + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, tc_transmit_queue, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x00, 0, 64); + +/* reg_ppcnt_tc_no_buffer_discard_uc + * The number of unicast packets dropped due to lack of shared + * buffer resources. + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, tc_no_buffer_discard_uc, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x08, 0, 64); + +/* Ethernet Per Traffic Class Congestion Group Counters */ + +/* reg_ppcnt_wred_discard + * Access: RO + */ +MLXSW_ITEM64(reg, ppcnt, wred_discard, + MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x00, 0, 64); + +static inline void mlxsw_reg_ppcnt_pack(char *payload, u8 local_port, + enum mlxsw_reg_ppcnt_grp grp, + u8 prio_tc) +{ + MLXSW_REG_ZERO(ppcnt, payload); + mlxsw_reg_ppcnt_swid_set(payload, 0); + mlxsw_reg_ppcnt_local_port_set(payload, local_port); + mlxsw_reg_ppcnt_pnat_set(payload, 0); + mlxsw_reg_ppcnt_grp_set(payload, grp); + mlxsw_reg_ppcnt_clr_set(payload, 0); + mlxsw_reg_ppcnt_prio_tc_set(payload, prio_tc); +} + +/* PLIB - Port Local to InfiniBand Port + * ------------------------------------ + * The PLIB register performs mapping from Local Port into InfiniBand Port. + */ +#define MLXSW_REG_PLIB_ID 0x500A +#define MLXSW_REG_PLIB_LEN 0x10 + +MLXSW_REG_DEFINE(plib, MLXSW_REG_PLIB_ID, MLXSW_REG_PLIB_LEN); + +/* reg_plib_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, plib, local_port, 0x00, 16, 8); + +/* reg_plib_ib_port + * InfiniBand port remapping for local_port. + * Access: RW + */ +MLXSW_ITEM32(reg, plib, ib_port, 0x00, 0, 8); + +/* PPTB - Port Prio To Buffer Register + * ----------------------------------- + * Configures the switch priority to buffer table. + */ +#define MLXSW_REG_PPTB_ID 0x500B +#define MLXSW_REG_PPTB_LEN 0x10 + +MLXSW_REG_DEFINE(pptb, MLXSW_REG_PPTB_ID, MLXSW_REG_PPTB_LEN); + +enum { + MLXSW_REG_PPTB_MM_UM, + MLXSW_REG_PPTB_MM_UNICAST, + MLXSW_REG_PPTB_MM_MULTICAST, +}; + +/* reg_pptb_mm + * Mapping mode. 
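/* Illustrative sketch (editor's addition, not part of this patch): reading a
 * counter means packing PPCNT for the wanted group, querying it and then
 * using the per-field getters.  The groups share the same payload area,
 * which is why e.g. ecn_marked above reuses an offset of the IEEE 802.3
 * layout.  mlxsw_reg_query() and MLXSW_REG() are assumed from the mlxsw
 * core; the function name is hypothetical.
 */
static int example_port_tx_frames_get(struct mlxsw_core *core, u8 local_port,
				      u64 *p_frames)
{
	char ppcnt_pl[MLXSW_REG_PPCNT_LEN];
	int err;

	mlxsw_reg_ppcnt_pack(ppcnt_pl, local_port,
			     MLXSW_REG_PPCNT_IEEE_8023_CNT, 0);
	err = mlxsw_reg_query(core, MLXSW_REG(ppcnt), ppcnt_pl);
	if (err)
		return err;
	*p_frames = mlxsw_reg_ppcnt_a_frames_transmitted_ok_get(ppcnt_pl);
	return 0;
}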
+ * 0 - Map both unicast and multicast packets to the same buffer. + * 1 - Map only unicast packets. + * 2 - Map only multicast packets. + * Access: Index + * + * Note: SwitchX-2 only supports the first option. + */ +MLXSW_ITEM32(reg, pptb, mm, 0x00, 28, 2); + +/* reg_pptb_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, pptb, local_port, 0x00, 16, 8); + +/* reg_pptb_um + * Enables the update of the untagged_buf field. + * Access: RW + */ +MLXSW_ITEM32(reg, pptb, um, 0x00, 8, 1); + +/* reg_pptb_pm + * Enables the update of the prio_to_buff field. + * Bit is a flag for updating the mapping for switch priority . + * Access: RW + */ +MLXSW_ITEM32(reg, pptb, pm, 0x00, 0, 8); + +/* reg_pptb_prio_to_buff + * Mapping of switch priority to one of the allocated receive port + * buffers. + * Access: RW + */ +MLXSW_ITEM_BIT_ARRAY(reg, pptb, prio_to_buff, 0x04, 0x04, 4); + +/* reg_pptb_pm_msb + * Enables the update of the prio_to_buff field. + * Bit is a flag for updating the mapping for switch priority . + * Access: RW + */ +MLXSW_ITEM32(reg, pptb, pm_msb, 0x08, 24, 8); + +/* reg_pptb_untagged_buff + * Mapping of untagged frames to one of the allocated receive port buffers. + * Access: RW + * + * Note: In SwitchX-2 this field must be mapped to buffer 8. Reserved for + * Spectrum, as it maps untagged packets based on the default switch priority. + */ +MLXSW_ITEM32(reg, pptb, untagged_buff, 0x08, 0, 4); + +/* reg_pptb_prio_to_buff_msb + * Mapping of switch priority to one of the allocated receive port + * buffers. + * Access: RW + */ +MLXSW_ITEM_BIT_ARRAY(reg, pptb, prio_to_buff_msb, 0x0C, 0x04, 4); + +#define MLXSW_REG_PPTB_ALL_PRIO 0xFF + +static inline void mlxsw_reg_pptb_pack(char *payload, u8 local_port) +{ + MLXSW_REG_ZERO(pptb, payload); + mlxsw_reg_pptb_mm_set(payload, MLXSW_REG_PPTB_MM_UM); + mlxsw_reg_pptb_local_port_set(payload, local_port); + mlxsw_reg_pptb_pm_set(payload, MLXSW_REG_PPTB_ALL_PRIO); + mlxsw_reg_pptb_pm_msb_set(payload, MLXSW_REG_PPTB_ALL_PRIO); +} + +static inline void mlxsw_reg_pptb_prio_to_buff_pack(char *payload, u8 prio, + u8 buff) +{ + mlxsw_reg_pptb_prio_to_buff_set(payload, prio, buff); + mlxsw_reg_pptb_prio_to_buff_msb_set(payload, prio, buff); +} + +/* PBMC - Port Buffer Management Control Register + * ---------------------------------------------- + * The PBMC register configures and retrieves the port packet buffer + * allocation for different Prios, and the Pause threshold management. + */ +#define MLXSW_REG_PBMC_ID 0x500C +#define MLXSW_REG_PBMC_LEN 0x6C + +MLXSW_REG_DEFINE(pbmc, MLXSW_REG_PBMC_ID, MLXSW_REG_PBMC_LEN); + +/* reg_pbmc_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, pbmc, local_port, 0x00, 16, 8); + +/* reg_pbmc_xoff_timer_value + * When device generates a pause frame, it uses this value as the pause + * timer (time for the peer port to pause in quota-512 bit time). + * Access: RW + */ +MLXSW_ITEM32(reg, pbmc, xoff_timer_value, 0x04, 16, 16); + +/* reg_pbmc_xoff_refresh + * The time before a new pause frame should be sent to refresh the pause RW + * state. Using the same units as xoff_timer_value above (in quota-512 bit + * time). + * Access: RW + */ +MLXSW_ITEM32(reg, pbmc, xoff_refresh, 0x04, 0, 16); + +#define MLXSW_REG_PBMC_PORT_SHARED_BUF_IDX 11 + +/* reg_pbmc_buf_lossy + * The field indicates if the buffer is lossy. 
+ * 0 - Lossless + * 1 - Lossy + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, pbmc, buf_lossy, 0x0C, 25, 1, 0x08, 0x00, false); + +/* reg_pbmc_buf_epsb + * Eligible for Port Shared buffer. + * If epsb is set, packets assigned to buffer are allowed to insert the port + * shared buffer. + * When buf_lossy is MLXSW_REG_PBMC_LOSSY_LOSSY this field is reserved. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, pbmc, buf_epsb, 0x0C, 24, 1, 0x08, 0x00, false); + +/* reg_pbmc_buf_size + * The part of the packet buffer array is allocated for the specific buffer. + * Units are represented in cells. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, pbmc, buf_size, 0x0C, 0, 16, 0x08, 0x00, false); + +/* reg_pbmc_buf_xoff_threshold + * Once the amount of data in the buffer goes above this value, device + * starts sending PFC frames for all priorities associated with the + * buffer. Units are represented in cells. Reserved in case of lossy + * buffer. + * Access: RW + * + * Note: In Spectrum, reserved for buffer[9]. + */ +MLXSW_ITEM32_INDEXED(reg, pbmc, buf_xoff_threshold, 0x0C, 16, 16, + 0x08, 0x04, false); + +/* reg_pbmc_buf_xon_threshold + * When the amount of data in the buffer goes below this value, device + * stops sending PFC frames for the priorities associated with the + * buffer. Units are represented in cells. Reserved in case of lossy + * buffer. + * Access: RW + * + * Note: In Spectrum, reserved for buffer[9]. + */ +MLXSW_ITEM32_INDEXED(reg, pbmc, buf_xon_threshold, 0x0C, 0, 16, + 0x08, 0x04, false); + +static inline void mlxsw_reg_pbmc_pack(char *payload, u8 local_port, + u16 xoff_timer_value, u16 xoff_refresh) +{ + MLXSW_REG_ZERO(pbmc, payload); + mlxsw_reg_pbmc_local_port_set(payload, local_port); + mlxsw_reg_pbmc_xoff_timer_value_set(payload, xoff_timer_value); + mlxsw_reg_pbmc_xoff_refresh_set(payload, xoff_refresh); +} + +static inline void mlxsw_reg_pbmc_lossy_buffer_pack(char *payload, + int buf_index, + u16 size) +{ + mlxsw_reg_pbmc_buf_lossy_set(payload, buf_index, 1); + mlxsw_reg_pbmc_buf_epsb_set(payload, buf_index, 0); + mlxsw_reg_pbmc_buf_size_set(payload, buf_index, size); +} + +static inline void mlxsw_reg_pbmc_lossless_buffer_pack(char *payload, + int buf_index, u16 size, + u16 threshold) +{ + mlxsw_reg_pbmc_buf_lossy_set(payload, buf_index, 0); + mlxsw_reg_pbmc_buf_epsb_set(payload, buf_index, 0); + mlxsw_reg_pbmc_buf_size_set(payload, buf_index, size); + mlxsw_reg_pbmc_buf_xoff_threshold_set(payload, buf_index, threshold); + mlxsw_reg_pbmc_buf_xon_threshold_set(payload, buf_index, threshold); +} + +/* PSPA - Port Switch Partition Allocation + * --------------------------------------- + * Controls the association of a port with a switch partition and enables + * configuring ports as stacking ports. + */ +#define MLXSW_REG_PSPA_ID 0x500D +#define MLXSW_REG_PSPA_LEN 0x8 + +MLXSW_REG_DEFINE(pspa, MLXSW_REG_PSPA_ID, MLXSW_REG_PSPA_LEN); + +/* reg_pspa_swid + * Switch partition ID. + * Access: RW + */ +MLXSW_ITEM32(reg, pspa, swid, 0x00, 24, 8); + +/* reg_pspa_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, pspa, local_port, 0x00, 16, 8); + +/* reg_pspa_sub_port + * Virtual port within the local port. Set to 0 when virtual ports are + * disabled on the local port. 
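/* Illustrative sketch (editor's addition, not part of this patch): headroom
 * buffers are usually reconfigured read-modify-write style, so that buffers
 * not touched by the caller keep their current size.  Sizes and thresholds
 * are in cells.  mlxsw_reg_query(), mlxsw_reg_write() and MLXSW_REG() are
 * assumed from the mlxsw core; the function name is hypothetical.
 */
static int example_port_lossless_buf_set(struct mlxsw_core *core,
					 u8 local_port, int buf_index,
					 u16 size_cells, u16 thres_cells)
{
	char pbmc_pl[MLXSW_REG_PBMC_LEN];
	int err;

	mlxsw_reg_pbmc_pack(pbmc_pl, local_port, 0, 0);
	err = mlxsw_reg_query(core, MLXSW_REG(pbmc), pbmc_pl);
	if (err)
		return err;
	/* Same value for xoff and xon, as the helper above does. */
	mlxsw_reg_pbmc_lossless_buffer_pack(pbmc_pl, buf_index, size_cells,
					    thres_cells);
	return mlxsw_reg_write(core, MLXSW_REG(pbmc), pbmc_pl);
}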
+ * Access: Index + */ +MLXSW_ITEM32(reg, pspa, sub_port, 0x00, 8, 8); + +static inline void mlxsw_reg_pspa_pack(char *payload, u8 swid, u8 local_port) +{ + MLXSW_REG_ZERO(pspa, payload); + mlxsw_reg_pspa_swid_set(payload, swid); + mlxsw_reg_pspa_local_port_set(payload, local_port); + mlxsw_reg_pspa_sub_port_set(payload, 0); +} + +/* HTGT - Host Trap Group Table + * ---------------------------- + * Configures the properties for forwarding to CPU. + */ +#define MLXSW_REG_HTGT_ID 0x7002 +#define MLXSW_REG_HTGT_LEN 0x20 + +MLXSW_REG_DEFINE(htgt, MLXSW_REG_HTGT_ID, MLXSW_REG_HTGT_LEN); + +/* reg_htgt_swid + * Switch partition ID. + * Access: Index + */ +MLXSW_ITEM32(reg, htgt, swid, 0x00, 24, 8); + +#define MLXSW_REG_HTGT_PATH_TYPE_LOCAL 0x0 /* For locally attached CPU */ + +/* reg_htgt_type + * CPU path type. + * Access: RW + */ +MLXSW_ITEM32(reg, htgt, type, 0x00, 8, 4); + +enum mlxsw_reg_htgt_trap_group { + MLXSW_REG_HTGT_TRAP_GROUP_EMAD, + MLXSW_REG_HTGT_TRAP_GROUP_SX2_RX, + MLXSW_REG_HTGT_TRAP_GROUP_SX2_CTRL, + MLXSW_REG_HTGT_TRAP_GROUP_SP_STP, + MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP, + MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP, + MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP, + MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP, + MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF, + MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM, + MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST, + MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP, + MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS, + MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP, + MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE, + MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME, + MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP, + MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF, + MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT, + MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD, + MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND, +}; + +/* reg_htgt_trap_group + * Trap group number. User defined number specifying which trap groups + * should be forwarded to the CPU. The mapping between trap IDs and trap + * groups is configured using HPKT register. + * Access: Index + */ +MLXSW_ITEM32(reg, htgt, trap_group, 0x00, 0, 8); + +enum { + MLXSW_REG_HTGT_POLICER_DISABLE, + MLXSW_REG_HTGT_POLICER_ENABLE, +}; + +/* reg_htgt_pide + * Enable policer ID specified using 'pid' field. + * Access: RW + */ +MLXSW_ITEM32(reg, htgt, pide, 0x04, 15, 1); + +#define MLXSW_REG_HTGT_INVALID_POLICER 0xff + +/* reg_htgt_pid + * Policer ID for the trap group. + * Access: RW + */ +MLXSW_ITEM32(reg, htgt, pid, 0x04, 0, 8); + +#define MLXSW_REG_HTGT_TRAP_TO_CPU 0x0 + +/* reg_htgt_mirror_action + * Mirror action to use. + * 0 - Trap to CPU. + * 1 - Trap to CPU and mirror to a mirroring agent. + * 2 - Mirror to a mirroring agent and do not trap to CPU. + * Access: RW + * + * Note: Mirroring to a mirroring agent is only supported in Spectrum. + */ +MLXSW_ITEM32(reg, htgt, mirror_action, 0x08, 8, 2); + +/* reg_htgt_mirroring_agent + * Mirroring agent. + * Access: RW + */ +MLXSW_ITEM32(reg, htgt, mirroring_agent, 0x08, 0, 3); + +#define MLXSW_REG_HTGT_DEFAULT_PRIORITY 0 + +/* reg_htgt_priority + * Trap group priority. + * In case a packet matches multiple classification rules, the packet will + * only be trapped once, based on the trap ID associated with the group (via + * register HPKT) with the highest priority. + * Supported values are 0-7, with 7 represnting the highest priority. + * Access: RW + * + * Note: In SwitchX-2 this field is ignored and the priority value is replaced + * by the 'trap_group' field. 
+ */ +MLXSW_ITEM32(reg, htgt, priority, 0x0C, 0, 4); + +#define MLXSW_REG_HTGT_DEFAULT_TC 7 + +/* reg_htgt_local_path_cpu_tclass + * CPU ingress traffic class for the trap group. + * Access: RW + */ +MLXSW_ITEM32(reg, htgt, local_path_cpu_tclass, 0x10, 16, 6); + +enum mlxsw_reg_htgt_local_path_rdq { + MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_CTRL = 0x13, + MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_RX = 0x14, + MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_EMAD = 0x15, + MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SIB_EMAD = 0x15, +}; +/* reg_htgt_local_path_rdq + * Receive descriptor queue (RDQ) to use for the trap group. + * Access: RW + */ +MLXSW_ITEM32(reg, htgt, local_path_rdq, 0x10, 0, 6); + +static inline void mlxsw_reg_htgt_pack(char *payload, u8 group, u8 policer_id, + u8 priority, u8 tc) +{ + MLXSW_REG_ZERO(htgt, payload); + + if (policer_id == MLXSW_REG_HTGT_INVALID_POLICER) { + mlxsw_reg_htgt_pide_set(payload, + MLXSW_REG_HTGT_POLICER_DISABLE); + } else { + mlxsw_reg_htgt_pide_set(payload, + MLXSW_REG_HTGT_POLICER_ENABLE); + mlxsw_reg_htgt_pid_set(payload, policer_id); + } + + mlxsw_reg_htgt_type_set(payload, MLXSW_REG_HTGT_PATH_TYPE_LOCAL); + mlxsw_reg_htgt_trap_group_set(payload, group); + mlxsw_reg_htgt_mirror_action_set(payload, MLXSW_REG_HTGT_TRAP_TO_CPU); + mlxsw_reg_htgt_mirroring_agent_set(payload, 0); + mlxsw_reg_htgt_priority_set(payload, priority); + mlxsw_reg_htgt_local_path_cpu_tclass_set(payload, tc); + mlxsw_reg_htgt_local_path_rdq_set(payload, group); +} + +/* HPKT - Host Packet Trap + * ----------------------- + * Configures trap IDs inside trap groups. + */ +#define MLXSW_REG_HPKT_ID 0x7003 +#define MLXSW_REG_HPKT_LEN 0x10 + +MLXSW_REG_DEFINE(hpkt, MLXSW_REG_HPKT_ID, MLXSW_REG_HPKT_LEN); + +enum { + MLXSW_REG_HPKT_ACK_NOT_REQUIRED, + MLXSW_REG_HPKT_ACK_REQUIRED, +}; + +/* reg_hpkt_ack + * Require acknowledgements from the host for events. + * If set, then the device will wait for the event it sent to be acknowledged + * by the host. This option is only relevant for event trap IDs. + * Access: RW + * + * Note: Currently not supported by firmware. + */ +MLXSW_ITEM32(reg, hpkt, ack, 0x00, 24, 1); + +enum mlxsw_reg_hpkt_action { + MLXSW_REG_HPKT_ACTION_FORWARD, + MLXSW_REG_HPKT_ACTION_TRAP_TO_CPU, + MLXSW_REG_HPKT_ACTION_MIRROR_TO_CPU, + MLXSW_REG_HPKT_ACTION_DISCARD, + MLXSW_REG_HPKT_ACTION_SOFT_DISCARD, + MLXSW_REG_HPKT_ACTION_TRAP_AND_SOFT_DISCARD, +}; + +/* reg_hpkt_action + * Action to perform on packet when trapped. + * 0 - No action. Forward to CPU based on switching rules. + * 1 - Trap to CPU (CPU receives sole copy). + * 2 - Mirror to CPU (CPU receives a replica of the packet). + * 3 - Discard. + * 4 - Soft discard (allow other traps to act on the packet). + * 5 - Trap and soft discard (allow other traps to overwrite this trap). + * Access: RW + * + * Note: Must be set to 0 (forward) for event trap IDs, as they are already + * addressed to the CPU. + */ +MLXSW_ITEM32(reg, hpkt, action, 0x00, 20, 3); + +/* reg_hpkt_trap_group + * Trap group to associate the trap with. + * Access: RW + */ +MLXSW_ITEM32(reg, hpkt, trap_group, 0x00, 12, 6); + +/* reg_hpkt_trap_id + * Trap ID. + * Access: Index + * + * Note: A trap ID can only be associated with a single trap group. The device + * will associate the trap ID with the last trap group configured. 
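/* Illustrative sketch (editor's addition, not part of this patch): a trap
 * group is programmed with one HTGT write.  Passing
 * MLXSW_REG_HTGT_INVALID_POLICER makes the pack helper above clear the pide
 * bit instead of binding a policer.  mlxsw_reg_write() and MLXSW_REG() are
 * assumed from the mlxsw core; the function name is hypothetical.
 */
static int example_trap_group_set(struct mlxsw_core *core)
{
	char htgt_pl[MLXSW_REG_HTGT_LEN];

	mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP,
			    MLXSW_REG_HTGT_INVALID_POLICER,
			    MLXSW_REG_HTGT_DEFAULT_PRIORITY,
			    MLXSW_REG_HTGT_DEFAULT_TC);
	return mlxsw_reg_write(core, MLXSW_REG(htgt), htgt_pl);
}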
+ */ +MLXSW_ITEM32(reg, hpkt, trap_id, 0x00, 0, 9); + +enum { + MLXSW_REG_HPKT_CTRL_PACKET_DEFAULT, + MLXSW_REG_HPKT_CTRL_PACKET_NO_BUFFER, + MLXSW_REG_HPKT_CTRL_PACKET_USE_BUFFER, +}; + +/* reg_hpkt_ctrl + * Configure dedicated buffer resources for control packets. + * Ignored by SwitchX-2. + * 0 - Keep factory defaults. + * 1 - Do not use control buffer for this trap ID. + * 2 - Use control buffer for this trap ID. + * Access: RW + */ +MLXSW_ITEM32(reg, hpkt, ctrl, 0x04, 16, 2); + +static inline void mlxsw_reg_hpkt_pack(char *payload, u8 action, u16 trap_id, + enum mlxsw_reg_htgt_trap_group trap_group, + bool is_ctrl) +{ + MLXSW_REG_ZERO(hpkt, payload); + mlxsw_reg_hpkt_ack_set(payload, MLXSW_REG_HPKT_ACK_NOT_REQUIRED); + mlxsw_reg_hpkt_action_set(payload, action); + mlxsw_reg_hpkt_trap_group_set(payload, trap_group); + mlxsw_reg_hpkt_trap_id_set(payload, trap_id); + mlxsw_reg_hpkt_ctrl_set(payload, is_ctrl ? + MLXSW_REG_HPKT_CTRL_PACKET_USE_BUFFER : + MLXSW_REG_HPKT_CTRL_PACKET_NO_BUFFER); +} + +/* RGCR - Router General Configuration Register + * -------------------------------------------- + * The register is used for setting up the router configuration. + */ +#define MLXSW_REG_RGCR_ID 0x8001 +#define MLXSW_REG_RGCR_LEN 0x28 + +MLXSW_REG_DEFINE(rgcr, MLXSW_REG_RGCR_ID, MLXSW_REG_RGCR_LEN); + +/* reg_rgcr_ipv4_en + * IPv4 router enable. + * Access: RW + */ +MLXSW_ITEM32(reg, rgcr, ipv4_en, 0x00, 31, 1); + +/* reg_rgcr_ipv6_en + * IPv6 router enable. + * Access: RW + */ +MLXSW_ITEM32(reg, rgcr, ipv6_en, 0x00, 30, 1); + +/* reg_rgcr_max_router_interfaces + * Defines the maximum number of active router interfaces for all virtual + * routers. + * Access: RW + */ +MLXSW_ITEM32(reg, rgcr, max_router_interfaces, 0x10, 0, 16); + +/* reg_rgcr_usp + * Update switch priority and packet color. + * 0 - Preserve the value of Switch Priority and packet color. + * 1 - Recalculate the value of Switch Priority and packet color. + * Access: RW + * + * Note: Not supported by SwitchX and SwitchX-2. + */ +MLXSW_ITEM32(reg, rgcr, usp, 0x18, 20, 1); + +/* reg_rgcr_pcp_rw + * Indicates how to handle the pcp_rewrite_en value: + * 0 - Preserve the value of pcp_rewrite_en. + * 2 - Disable PCP rewrite. + * 3 - Enable PCP rewrite. + * Access: RW + * + * Note: Not supported by SwitchX and SwitchX-2. + */ +MLXSW_ITEM32(reg, rgcr, pcp_rw, 0x18, 16, 2); + +/* reg_rgcr_activity_dis + * Activity disable: + * 0 - Activity will be set when an entry is hit (default). + * 1 - Activity will not be set when an entry is hit. + * + * Bit 0 - Disable activity bit in Router Algorithmic LPM Unicast Entry + * (RALUE). + * Bit 1 - Disable activity bit in Router Algorithmic LPM Unicast Host + * Entry (RAUHT). + * Bits 2:7 are reserved. + * Access: RW + * + * Note: Not supported by SwitchX, SwitchX-2 and Switch-IB. + */ +MLXSW_ITEM32(reg, rgcr, activity_dis, 0x20, 0, 8); + +static inline void mlxsw_reg_rgcr_pack(char *payload, bool ipv4_en, + bool ipv6_en) +{ + MLXSW_REG_ZERO(rgcr, payload); + mlxsw_reg_rgcr_ipv4_en_set(payload, ipv4_en); + mlxsw_reg_rgcr_ipv6_en_set(payload, ipv6_en); +} + +/* RITR - Router Interface Table Register + * -------------------------------------- + * The register is used to configure the router interface table. + */ +#define MLXSW_REG_RITR_ID 0x8002 +#define MLXSW_REG_RITR_LEN 0x40 + +MLXSW_REG_DEFINE(ritr, MLXSW_REG_RITR_ID, MLXSW_REG_RITR_LEN); + +/* reg_ritr_enable + * Enables routing on the router interface. 
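/* Illustrative sketch (editor's addition, not part of this patch): HPKT then
 * binds an individual trap ID to such a group and picks the action and
 * control-buffer usage.  The trap ID value would come from the driver's
 * trap.h (not shown here); mlxsw_reg_write() and MLXSW_REG() are assumed
 * from the mlxsw core and the function name is hypothetical.
 */
static int example_trap_bind(struct mlxsw_core *core, u16 trap_id)
{
	char hpkt_pl[MLXSW_REG_HPKT_LEN];

	mlxsw_reg_hpkt_pack(hpkt_pl, MLXSW_REG_HPKT_ACTION_TRAP_TO_CPU,
			    trap_id, MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP, true);
	return mlxsw_reg_write(core, MLXSW_REG(hpkt), hpkt_pl);
}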
+ * Access: RW + */ +MLXSW_ITEM32(reg, ritr, enable, 0x00, 31, 1); + +/* reg_ritr_ipv4 + * IPv4 routing enable. Enables routing of IPv4 traffic on the router + * interface. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, ipv4, 0x00, 29, 1); + +/* reg_ritr_ipv6 + * IPv6 routing enable. Enables routing of IPv6 traffic on the router + * interface. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, ipv6, 0x00, 28, 1); + +/* reg_ritr_ipv4_mc + * IPv4 multicast routing enable. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, ipv4_mc, 0x00, 27, 1); + +/* reg_ritr_ipv6_mc + * IPv6 multicast routing enable. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, ipv6_mc, 0x00, 26, 1); + +enum mlxsw_reg_ritr_if_type { + /* VLAN interface. */ + MLXSW_REG_RITR_VLAN_IF, + /* FID interface. */ + MLXSW_REG_RITR_FID_IF, + /* Sub-port interface. */ + MLXSW_REG_RITR_SP_IF, + /* Loopback Interface. */ + MLXSW_REG_RITR_LOOPBACK_IF, +}; + +/* reg_ritr_type + * Router interface type as per enum mlxsw_reg_ritr_if_type. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, type, 0x00, 23, 3); + +enum { + MLXSW_REG_RITR_RIF_CREATE, + MLXSW_REG_RITR_RIF_DEL, +}; + +/* reg_ritr_op + * Opcode: + * 0 - Create or edit RIF. + * 1 - Delete RIF. + * Reserved for SwitchX-2. For Spectrum, editing of interface properties + * is not supported. An interface must be deleted and re-created in order + * to update properties. + * Access: WO + */ +MLXSW_ITEM32(reg, ritr, op, 0x00, 20, 2); + +/* reg_ritr_rif + * Router interface index. A pointer to the Router Interface Table. + * Access: Index + */ +MLXSW_ITEM32(reg, ritr, rif, 0x00, 0, 16); + +/* reg_ritr_ipv4_fe + * IPv4 Forwarding Enable. + * Enables routing of IPv4 traffic on the router interface. When disabled, + * forwarding is blocked but local traffic (traps and IP2ME) will be enabled. + * Not supported in SwitchX-2. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, ipv4_fe, 0x04, 29, 1); + +/* reg_ritr_ipv6_fe + * IPv6 Forwarding Enable. + * Enables routing of IPv6 traffic on the router interface. When disabled, + * forwarding is blocked but local traffic (traps and IP2ME) will be enabled. + * Not supported in SwitchX-2. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, ipv6_fe, 0x04, 28, 1); + +/* reg_ritr_ipv4_mc_fe + * IPv4 Multicast Forwarding Enable. + * When disabled, forwarding is blocked but local traffic (traps and IP to me) + * will be enabled. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, ipv4_mc_fe, 0x04, 27, 1); + +/* reg_ritr_ipv6_mc_fe + * IPv6 Multicast Forwarding Enable. + * When disabled, forwarding is blocked but local traffic (traps and IP to me) + * will be enabled. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, ipv6_mc_fe, 0x04, 26, 1); + +/* reg_ritr_lb_en + * Loop-back filter enable for unicast packets. + * If the flag is set then loop-back filter for unicast packets is + * implemented on the RIF. Multicast packets are always subject to + * loop-back filtering. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, lb_en, 0x04, 24, 1); + +/* reg_ritr_virtual_router + * Virtual router ID associated with the router interface. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, virtual_router, 0x04, 0, 16); + +/* reg_ritr_mtu + * Router interface MTU. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, mtu, 0x34, 0, 16); + +/* reg_ritr_if_swid + * Switch partition ID. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, if_swid, 0x08, 24, 8); + +/* reg_ritr_if_mac + * Router interface MAC address. + * In Spectrum, all MAC addresses must have the same 38 MSBits. 
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ritr, if_mac, 0x12, 6);
+
+/* VLAN Interface */
+
+/* reg_ritr_vlan_if_vid
+ * VLAN ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, vlan_if_vid, 0x08, 0, 12);
+
+/* FID Interface */
+
+/* reg_ritr_fid_if_fid
+ * Filtering ID. Used to connect a bridge to the router. Only FIDs from
+ * the vFID range are supported.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, fid_if_fid, 0x08, 0, 16);
+
+static inline void mlxsw_reg_ritr_fid_set(char *payload,
+					   enum mlxsw_reg_ritr_if_type rif_type,
+					   u16 fid)
+{
+	if (rif_type == MLXSW_REG_RITR_FID_IF)
+		mlxsw_reg_ritr_fid_if_fid_set(payload, fid);
+	else
+		mlxsw_reg_ritr_vlan_if_vid_set(payload, fid);
+}
+
+/* Sub-port Interface */
+
+/* reg_ritr_sp_if_lag
+ * LAG indication. When this bit is set the system_port field holds the
+ * LAG identifier.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, sp_if_lag, 0x08, 24, 1);
+
+/* reg_ritr_sp_if_system_port
+ * Port unique identifier. When the lag bit is set, this field holds the
+ * lag_id in bits 0:9.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, sp_if_system_port, 0x08, 0, 16);
+
+/* reg_ritr_sp_if_vid
+ * VLAN ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, sp_if_vid, 0x18, 0, 12);
+
+/* Loopback Interface */
+
+enum mlxsw_reg_ritr_loopback_protocol {
+	/* IPinIP IPv4 underlay Unicast */
+	MLXSW_REG_RITR_LOOPBACK_PROTOCOL_IPIP_IPV4,
+	/* IPinIP IPv6 underlay Unicast */
+	MLXSW_REG_RITR_LOOPBACK_PROTOCOL_IPIP_IPV6,
+};
+
+/* reg_ritr_loopback_protocol
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, loopback_protocol, 0x08, 28, 4);
+
+enum mlxsw_reg_ritr_loopback_ipip_type {
+	/* Tunnel is IPinIP. */
+	MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_IP,
+	/* Tunnel is GRE, no key. */
+	MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_IN_IP,
+	/* Tunnel is GRE, with a key. */
+	MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_KEY_IN_IP,
+};
+
+/* reg_ritr_loopback_ipip_type
+ * Encapsulation type.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, loopback_ipip_type, 0x10, 24, 4);
+
+enum mlxsw_reg_ritr_loopback_ipip_options {
+	/* The key is defined by gre_key. */
+	MLXSW_REG_RITR_LOOPBACK_IPIP_OPTIONS_GRE_KEY_PRESET,
+};
+
+/* reg_ritr_loopback_ipip_options
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, loopback_ipip_options, 0x10, 20, 4);
+
+/* reg_ritr_loopback_ipip_uvr
+ * Underlay Virtual Router ID.
+ * Range is 0..cap_max_virtual_routers-1.
+ * Reserved for Spectrum-2.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, loopback_ipip_uvr, 0x10, 0, 16);
+
+/* reg_ritr_loopback_ipip_usip*
+ * Encapsulation Underlay source IP.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ritr, loopback_ipip_usip6, 0x18, 16);
+MLXSW_ITEM32(reg, ritr, loopback_ipip_usip4, 0x24, 0, 32);
+
+/* reg_ritr_loopback_ipip_gre_key
+ * GRE Key.
+ * Reserved when ipip_type is not IP_IN_GRE_KEY_IN_IP.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, loopback_ipip_gre_key, 0x28, 0, 32);
+
+/* Shared between ingress/egress */
+enum mlxsw_reg_ritr_counter_set_type {
+	/* No Count. */
+	MLXSW_REG_RITR_COUNTER_SET_TYPE_NO_COUNT = 0x0,
+	/* Basic. Used for router interfaces, counting the following:
+	 * - Error and Discard counters.
+	 * - Unicast, Multicast and Broadcast counters. Sharing the
+	 *   same set of counters for the different types of traffic
+	 *   (IPv4, IPv6 and mpls).
+	 */
+	MLXSW_REG_RITR_COUNTER_SET_TYPE_BASIC = 0x9,
+};
+
+/* reg_ritr_ingress_counter_index
+ * Counter Index for flow counter.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ingress_counter_index, 0x38, 0, 24);
+
+/* reg_ritr_ingress_counter_set_type
+ * Ingress Counter Set Type for router interface counter.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ingress_counter_set_type, 0x38, 24, 8);
+
+/* reg_ritr_egress_counter_index
+ * Counter Index for flow counter.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, egress_counter_index, 0x3C, 0, 24);
+
+/* reg_ritr_egress_counter_set_type
+ * Egress Counter Set Type for router interface counter.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, egress_counter_set_type, 0x3C, 24, 8);
+
+static inline void mlxsw_reg_ritr_counter_pack(char *payload, u32 index,
+					       bool enable, bool egress)
+{
+	enum mlxsw_reg_ritr_counter_set_type set_type;
+
+	if (enable)
+		set_type = MLXSW_REG_RITR_COUNTER_SET_TYPE_BASIC;
+	else
+		set_type = MLXSW_REG_RITR_COUNTER_SET_TYPE_NO_COUNT;
+
+	/* Program the counter index together with the set type of the
+	 * requested direction.
+	 */
+	if (egress) {
+		mlxsw_reg_ritr_egress_counter_set_type_set(payload, set_type);
+		mlxsw_reg_ritr_egress_counter_index_set(payload, index);
+	} else {
+		mlxsw_reg_ritr_ingress_counter_set_type_set(payload, set_type);
+		mlxsw_reg_ritr_ingress_counter_index_set(payload, index);
+	}
+}
+
+static inline void mlxsw_reg_ritr_rif_pack(char *payload, u16 rif)
+{
+	MLXSW_REG_ZERO(ritr, payload);
+	mlxsw_reg_ritr_rif_set(payload, rif);
+}
+
+static inline void mlxsw_reg_ritr_sp_if_pack(char *payload, bool lag,
+					     u16 system_port, u16 vid)
+{
+	mlxsw_reg_ritr_sp_if_lag_set(payload, lag);
+	mlxsw_reg_ritr_sp_if_system_port_set(payload, system_port);
+	mlxsw_reg_ritr_sp_if_vid_set(payload, vid);
+}
+
+static inline void mlxsw_reg_ritr_pack(char *payload, bool enable,
+				       enum mlxsw_reg_ritr_if_type type,
+				       u16 rif, u16 vr_id, u16 mtu)
+{
+	bool op = enable ? MLXSW_REG_RITR_RIF_CREATE : MLXSW_REG_RITR_RIF_DEL;
+
+	MLXSW_REG_ZERO(ritr, payload);
+	mlxsw_reg_ritr_enable_set(payload, enable);
+	mlxsw_reg_ritr_ipv4_set(payload, 1);
+	mlxsw_reg_ritr_ipv6_set(payload, 1);
+	mlxsw_reg_ritr_ipv4_mc_set(payload, 1);
+	mlxsw_reg_ritr_ipv6_mc_set(payload, 1);
+	mlxsw_reg_ritr_type_set(payload, type);
+	mlxsw_reg_ritr_op_set(payload, op);
+	mlxsw_reg_ritr_rif_set(payload, rif);
+	mlxsw_reg_ritr_ipv4_fe_set(payload, 1);
+	mlxsw_reg_ritr_ipv6_fe_set(payload, 1);
+	mlxsw_reg_ritr_ipv4_mc_fe_set(payload, 1);
+	mlxsw_reg_ritr_ipv6_mc_fe_set(payload, 1);
+	mlxsw_reg_ritr_lb_en_set(payload, 1);
+	mlxsw_reg_ritr_virtual_router_set(payload, vr_id);
+	mlxsw_reg_ritr_mtu_set(payload, mtu);
+}
+
+static inline void mlxsw_reg_ritr_mac_pack(char *payload, const char *mac)
+{
+	mlxsw_reg_ritr_if_mac_memcpy_to(payload, mac);
+}
+
+static inline void
+mlxsw_reg_ritr_loopback_ipip_common_pack(char *payload,
+			    enum mlxsw_reg_ritr_loopback_ipip_type ipip_type,
+			    enum mlxsw_reg_ritr_loopback_ipip_options options,
+			    u16 uvr_id, u32 gre_key)
+{
+	mlxsw_reg_ritr_loopback_ipip_type_set(payload, ipip_type);
+	mlxsw_reg_ritr_loopback_ipip_options_set(payload, options);
+	mlxsw_reg_ritr_loopback_ipip_uvr_set(payload, uvr_id);
+	mlxsw_reg_ritr_loopback_ipip_gre_key_set(payload, gre_key);
+}
+
+static inline void
+mlxsw_reg_ritr_loopback_ipip4_pack(char *payload,
+			    enum mlxsw_reg_ritr_loopback_ipip_type ipip_type,
+			    enum mlxsw_reg_ritr_loopback_ipip_options options,
+			    u16 uvr_id, u32 usip, u32 gre_key)
+{
+	mlxsw_reg_ritr_loopback_protocol_set(payload,
+				    MLXSW_REG_RITR_LOOPBACK_PROTOCOL_IPIP_IPV4);
+	mlxsw_reg_ritr_loopback_ipip_common_pack(payload, ipip_type, options,
+						 uvr_id, gre_key);
+	mlxsw_reg_ritr_loopback_ipip_usip4_set(payload, usip);
+}
+
+/* RTAR - Router TCAM Allocation Register
+ * --------------------------------------
+ * This register
is used for allocation of regions in the TCAM table. + */ +#define MLXSW_REG_RTAR_ID 0x8004 +#define MLXSW_REG_RTAR_LEN 0x20 + +MLXSW_REG_DEFINE(rtar, MLXSW_REG_RTAR_ID, MLXSW_REG_RTAR_LEN); + +enum mlxsw_reg_rtar_op { + MLXSW_REG_RTAR_OP_ALLOCATE, + MLXSW_REG_RTAR_OP_RESIZE, + MLXSW_REG_RTAR_OP_DEALLOCATE, +}; + +/* reg_rtar_op + * Access: WO + */ +MLXSW_ITEM32(reg, rtar, op, 0x00, 28, 4); + +enum mlxsw_reg_rtar_key_type { + MLXSW_REG_RTAR_KEY_TYPE_IPV4_MULTICAST = 1, + MLXSW_REG_RTAR_KEY_TYPE_IPV6_MULTICAST = 3 +}; + +/* reg_rtar_key_type + * TCAM key type for the region. + * Access: WO + */ +MLXSW_ITEM32(reg, rtar, key_type, 0x00, 0, 8); + +/* reg_rtar_region_size + * TCAM region size. When allocating/resizing this is the requested + * size, the response is the actual size. + * Note: Actual size may be larger than requested. + * Reserved for op = Deallocate + * Access: WO + */ +MLXSW_ITEM32(reg, rtar, region_size, 0x04, 0, 16); + +static inline void mlxsw_reg_rtar_pack(char *payload, + enum mlxsw_reg_rtar_op op, + enum mlxsw_reg_rtar_key_type key_type, + u16 region_size) +{ + MLXSW_REG_ZERO(rtar, payload); + mlxsw_reg_rtar_op_set(payload, op); + mlxsw_reg_rtar_key_type_set(payload, key_type); + mlxsw_reg_rtar_region_size_set(payload, region_size); +} + +/* RATR - Router Adjacency Table Register + * -------------------------------------- + * The RATR register is used to configure the Router Adjacency (next-hop) + * Table. + */ +#define MLXSW_REG_RATR_ID 0x8008 +#define MLXSW_REG_RATR_LEN 0x2C + +MLXSW_REG_DEFINE(ratr, MLXSW_REG_RATR_ID, MLXSW_REG_RATR_LEN); + +enum mlxsw_reg_ratr_op { + /* Read */ + MLXSW_REG_RATR_OP_QUERY_READ = 0, + /* Read and clear activity */ + MLXSW_REG_RATR_OP_QUERY_READ_CLEAR = 2, + /* Write Adjacency entry */ + MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY = 1, + /* Write Adjacency entry only if the activity is cleared. + * The write may not succeed if the activity is set. There is not + * direct feedback if the write has succeeded or not, however + * the get will reveal the actual entry (SW can compare the get + * response to the set command). + */ + MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY_ON_ACTIVITY = 3, +}; + +/* reg_ratr_op + * Note that Write operation may also be used for updating + * counter_set_type and counter_index. In this case all other + * fields must not be updated. + * Access: OP + */ +MLXSW_ITEM32(reg, ratr, op, 0x00, 28, 4); + +/* reg_ratr_v + * Valid bit. Indicates if the adjacency entry is valid. + * Note: the device may need some time before reusing an invalidated + * entry. During this time the entry can not be reused. It is + * recommended to use another entry before reusing an invalidated + * entry (e.g. software can put it at the end of the list for + * reusing). Trying to access an invalidated entry not yet cleared + * by the device results with failure indicating "Try Again" status. + * When valid is '0' then egress_router_interface,trap_action, + * adjacency_parameters and counters are reserved + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, v, 0x00, 24, 1); + +/* reg_ratr_a + * Activity. Set for new entries. Set if a packet lookup has hit on + * the specific entry. To clear the a bit, use "clear activity". + * Access: RO + */ +MLXSW_ITEM32(reg, ratr, a, 0x00, 16, 1); + +enum mlxsw_reg_ratr_type { + /* Ethernet */ + MLXSW_REG_RATR_TYPE_ETHERNET, + /* IPoIB Unicast without GRH. + * Reserved for Spectrum. + */ + MLXSW_REG_RATR_TYPE_IPOIB_UC, + /* IPoIB Unicast with GRH. Supported only in table 0 (Ethernet unicast + * adjacency). 
+ * Reserved for Spectrum. + */ + MLXSW_REG_RATR_TYPE_IPOIB_UC_W_GRH, + /* IPoIB Multicast. + * Reserved for Spectrum. + */ + MLXSW_REG_RATR_TYPE_IPOIB_MC, + /* MPLS. + * Reserved for SwitchX/-2. + */ + MLXSW_REG_RATR_TYPE_MPLS, + /* IPinIP Encap. + * Reserved for SwitchX/-2. + */ + MLXSW_REG_RATR_TYPE_IPIP, +}; + +/* reg_ratr_type + * Adjacency entry type. + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, type, 0x04, 28, 4); + +/* reg_ratr_adjacency_index_low + * Bits 15:0 of index into the adjacency table. + * For SwitchX and SwitchX-2, the adjacency table is linear and + * used for adjacency entries only. + * For Spectrum, the index is to the KVD linear. + * Access: Index + */ +MLXSW_ITEM32(reg, ratr, adjacency_index_low, 0x04, 0, 16); + +/* reg_ratr_egress_router_interface + * Range is 0 .. cap_max_router_interfaces - 1 + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, egress_router_interface, 0x08, 0, 16); + +enum mlxsw_reg_ratr_trap_action { + MLXSW_REG_RATR_TRAP_ACTION_NOP, + MLXSW_REG_RATR_TRAP_ACTION_TRAP, + MLXSW_REG_RATR_TRAP_ACTION_MIRROR_TO_CPU, + MLXSW_REG_RATR_TRAP_ACTION_MIRROR, + MLXSW_REG_RATR_TRAP_ACTION_DISCARD_ERRORS, +}; + +/* reg_ratr_trap_action + * see mlxsw_reg_ratr_trap_action + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, trap_action, 0x0C, 28, 4); + +/* reg_ratr_adjacency_index_high + * Bits 23:16 of the adjacency_index. + * Access: Index + */ +MLXSW_ITEM32(reg, ratr, adjacency_index_high, 0x0C, 16, 8); + +enum mlxsw_reg_ratr_trap_id { + MLXSW_REG_RATR_TRAP_ID_RTR_EGRESS0, + MLXSW_REG_RATR_TRAP_ID_RTR_EGRESS1, +}; + +/* reg_ratr_trap_id + * Trap ID to be reported to CPU. + * Trap-ID is RTR_EGRESS0 or RTR_EGRESS1. + * For trap_action of NOP, MIRROR and DISCARD_ERROR + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, trap_id, 0x0C, 0, 8); + +/* reg_ratr_eth_destination_mac + * MAC address of the destination next-hop. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, ratr, eth_destination_mac, 0x12, 6); + +enum mlxsw_reg_ratr_ipip_type { + /* IPv4, address set by mlxsw_reg_ratr_ipip_ipv4_udip. */ + MLXSW_REG_RATR_IPIP_TYPE_IPV4, + /* IPv6, address set by mlxsw_reg_ratr_ipip_ipv6_ptr. */ + MLXSW_REG_RATR_IPIP_TYPE_IPV6, +}; + +/* reg_ratr_ipip_type + * Underlay destination ip type. + * Note: the type field must match the protocol of the router interface. + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, ipip_type, 0x10, 16, 4); + +/* reg_ratr_ipip_ipv4_udip + * Underlay ipv4 dip. + * Reserved when ipip_type is IPv6. + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, ipip_ipv4_udip, 0x18, 0, 32); + +/* reg_ratr_ipip_ipv6_ptr + * Pointer to IPv6 underlay destination ip address. + * For Spectrum: Pointer to KVD linear space. 
+ * Access: RW + */ +MLXSW_ITEM32(reg, ratr, ipip_ipv6_ptr, 0x1C, 0, 24); + +enum mlxsw_reg_flow_counter_set_type { + /* No count */ + MLXSW_REG_FLOW_COUNTER_SET_TYPE_NO_COUNT = 0x00, + /* Count packets and bytes */ + MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES = 0x03, + /* Count only packets */ + MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS = 0x05, +}; + +/* reg_ratr_counter_set_type + * Counter set type for flow counters + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, counter_set_type, 0x28, 24, 8); + +/* reg_ratr_counter_index + * Counter index for flow counters + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, counter_index, 0x28, 0, 24); + +static inline void +mlxsw_reg_ratr_pack(char *payload, + enum mlxsw_reg_ratr_op op, bool valid, + enum mlxsw_reg_ratr_type type, + u32 adjacency_index, u16 egress_rif) +{ + MLXSW_REG_ZERO(ratr, payload); + mlxsw_reg_ratr_op_set(payload, op); + mlxsw_reg_ratr_v_set(payload, valid); + mlxsw_reg_ratr_type_set(payload, type); + mlxsw_reg_ratr_adjacency_index_low_set(payload, adjacency_index); + mlxsw_reg_ratr_adjacency_index_high_set(payload, adjacency_index >> 16); + mlxsw_reg_ratr_egress_router_interface_set(payload, egress_rif); +} + +static inline void mlxsw_reg_ratr_eth_entry_pack(char *payload, + const char *dest_mac) +{ + mlxsw_reg_ratr_eth_destination_mac_memcpy_to(payload, dest_mac); +} + +static inline void mlxsw_reg_ratr_ipip4_entry_pack(char *payload, u32 ipv4_udip) +{ + mlxsw_reg_ratr_ipip_type_set(payload, MLXSW_REG_RATR_IPIP_TYPE_IPV4); + mlxsw_reg_ratr_ipip_ipv4_udip_set(payload, ipv4_udip); +} + +static inline void mlxsw_reg_ratr_counter_pack(char *payload, u64 counter_index, + bool counter_enable) +{ + enum mlxsw_reg_flow_counter_set_type set_type; + + if (counter_enable) + set_type = MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES; + else + set_type = MLXSW_REG_FLOW_COUNTER_SET_TYPE_NO_COUNT; + + mlxsw_reg_ratr_counter_index_set(payload, counter_index); + mlxsw_reg_ratr_counter_set_type_set(payload, set_type); +} + +/* RDPM - Router DSCP to Priority Mapping + * -------------------------------------- + * Controls the mapping from DSCP field to switch priority on routed packets + */ +#define MLXSW_REG_RDPM_ID 0x8009 +#define MLXSW_REG_RDPM_BASE_LEN 0x00 +#define MLXSW_REG_RDPM_DSCP_ENTRY_REC_LEN 0x01 +#define MLXSW_REG_RDPM_DSCP_ENTRY_REC_MAX_COUNT 64 +#define MLXSW_REG_RDPM_LEN 0x40 +#define MLXSW_REG_RDPM_LAST_ENTRY (MLXSW_REG_RDPM_BASE_LEN + \ + MLXSW_REG_RDPM_LEN - \ + MLXSW_REG_RDPM_DSCP_ENTRY_REC_LEN) + +MLXSW_REG_DEFINE(rdpm, MLXSW_REG_RDPM_ID, MLXSW_REG_RDPM_LEN); + +/* reg_dscp_entry_e + * Enable update of the specific entry + * Access: Index + */ +MLXSW_ITEM8_INDEXED(reg, rdpm, dscp_entry_e, MLXSW_REG_RDPM_LAST_ENTRY, 7, 1, + -MLXSW_REG_RDPM_DSCP_ENTRY_REC_LEN, 0x00, false); + +/* reg_dscp_entry_prio + * Switch Priority + * Access: RW + */ +MLXSW_ITEM8_INDEXED(reg, rdpm, dscp_entry_prio, MLXSW_REG_RDPM_LAST_ENTRY, 0, 4, + -MLXSW_REG_RDPM_DSCP_ENTRY_REC_LEN, 0x00, false); + +static inline void mlxsw_reg_rdpm_pack(char *payload, unsigned short index, + u8 prio) +{ + mlxsw_reg_rdpm_dscp_entry_e_set(payload, index, 1); + mlxsw_reg_rdpm_dscp_entry_prio_set(payload, index, prio); +} + +/* RICNT - Router Interface Counter Register + * ----------------------------------------- + * The RICNT register retrieves per port performance counters + */ +#define MLXSW_REG_RICNT_ID 0x800B +#define MLXSW_REG_RICNT_LEN 0x100 + +MLXSW_REG_DEFINE(ricnt, MLXSW_REG_RICNT_ID, MLXSW_REG_RICNT_LEN); + +/* reg_ricnt_counter_index + * Counter index + * Access: RW + */ 
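/* Illustrative sketch (editor's addition, not part of this patch): writing a
 * RATR next-hop entry packs the common part (note how the 24-bit adjacency
 * index is split across the low/high fields by the pack helper above) and
 * then the Ethernet-specific destination MAC.  mlxsw_reg_write() and
 * MLXSW_REG() are assumed from the mlxsw core; the function name is
 * hypothetical.
 */
static int example_nexthop_write(struct mlxsw_core *core, u32 adj_index,
				 u16 rif, const char *neigh_mac)
{
	char ratr_pl[MLXSW_REG_RATR_LEN];

	mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY,
			    true, MLXSW_REG_RATR_TYPE_ETHERNET,
			    adj_index, rif);
	mlxsw_reg_ratr_eth_entry_pack(ratr_pl, neigh_mac);
	return mlxsw_reg_write(core, MLXSW_REG(ratr), ratr_pl);
}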
+MLXSW_ITEM32(reg, ricnt, counter_index, 0x04, 0, 24);
+
+enum mlxsw_reg_ricnt_counter_set_type {
+	/* No Count. */
+	MLXSW_REG_RICNT_COUNTER_SET_TYPE_NO_COUNT = 0x00,
+	/* Basic. Used for router interfaces, counting the following:
+	 * - Error and Discard counters.
+	 * - Unicast, Multicast and Broadcast counters. Sharing the
+	 *   same set of counters for the different types of traffic
+	 *   (IPv4, IPv6 and mpls).
+	 */
+	MLXSW_REG_RICNT_COUNTER_SET_TYPE_BASIC = 0x09,
+};
+
+/* reg_ricnt_counter_set_type
+ * Counter Set Type for router interface counter.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ricnt, counter_set_type, 0x04, 24, 8);
+
+enum mlxsw_reg_ricnt_opcode {
+	/* Nop. Supported only for read access. */
+	MLXSW_REG_RICNT_OPCODE_NOP = 0x00,
+	/* Clear. Setting the clr bit will reset the counter value for
+	 * all counters of the specified Router Interface.
+	 */
+	MLXSW_REG_RICNT_OPCODE_CLEAR = 0x08,
+};
+
+/* reg_ricnt_opcode
+ * Opcode.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ricnt, op, 0x00, 28, 4);
+
+/* reg_ricnt_good_unicast_packets
+ * good unicast packets.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_unicast_packets, 0x08, 0, 64);
+
+/* reg_ricnt_good_multicast_packets
+ * good multicast packets.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_multicast_packets, 0x10, 0, 64);
+
+/* reg_ricnt_good_broadcast_packets
+ * good broadcast packets.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_broadcast_packets, 0x18, 0, 64);
+
+/* reg_ricnt_good_unicast_bytes
+ * A count of L3 data and padding octets not including L2 headers
+ * for good unicast frames.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_unicast_bytes, 0x20, 0, 64);
+
+/* reg_ricnt_good_multicast_bytes
+ * A count of L3 data and padding octets not including L2 headers
+ * for good multicast frames.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_multicast_bytes, 0x28, 0, 64);
+
+/* reg_ricnt_good_broadcast_bytes
+ * A count of L3 data and padding octets not including L2 headers
+ * for good broadcast frames.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_broadcast_bytes, 0x30, 0, 64);
+
+/* reg_ricnt_error_packets
+ * A count of errored frames that do not pass the router checks.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, error_packets, 0x38, 0, 64);
+
+/* reg_ricnt_discard_packets
+ * A count of non-errored frames that do not pass the router checks.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, discard_packets, 0x40, 0, 64);
+
+/* reg_ricnt_error_bytes
+ * A count of L3 data and padding octets not including L2 headers
+ * for errored frames.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, error_bytes, 0x48, 0, 64);
+
+/* reg_ricnt_discard_bytes
+ * A count of L3 data and padding octets not including L2 headers
+ * for non-errored frames that do not pass the router checks.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, discard_bytes, 0x50, 0, 64);
+
+static inline void mlxsw_reg_ricnt_pack(char *payload, u32 index,
+					enum mlxsw_reg_ricnt_opcode op)
+{
+	MLXSW_REG_ZERO(ricnt, payload);
+	mlxsw_reg_ricnt_op_set(payload, op);
+	mlxsw_reg_ricnt_counter_index_set(payload, index);
+	mlxsw_reg_ricnt_counter_set_type_set(payload,
+					     MLXSW_REG_RICNT_COUNTER_SET_TYPE_BASIC);
+}
+
+/* RRCR - Router Rules Copy Register Layout
+ * ----------------------------------------
+ * This register is used for moving and copying route entry rules.
+ */ +#define MLXSW_REG_RRCR_ID 0x800F +#define MLXSW_REG_RRCR_LEN 0x24 + +MLXSW_REG_DEFINE(rrcr, MLXSW_REG_RRCR_ID, MLXSW_REG_RRCR_LEN); + +enum mlxsw_reg_rrcr_op { + /* Move rules */ + MLXSW_REG_RRCR_OP_MOVE, + /* Copy rules */ + MLXSW_REG_RRCR_OP_COPY, +}; + +/* reg_rrcr_op + * Access: WO + */ +MLXSW_ITEM32(reg, rrcr, op, 0x00, 28, 4); + +/* reg_rrcr_offset + * Offset within the region from which to copy/move. + * Access: Index + */ +MLXSW_ITEM32(reg, rrcr, offset, 0x00, 0, 16); + +/* reg_rrcr_size + * The number of rules to copy/move. + * Access: WO + */ +MLXSW_ITEM32(reg, rrcr, size, 0x04, 0, 16); + +/* reg_rrcr_table_id + * Identifier of the table on which to perform the operation. Encoding is the + * same as in RTAR.key_type + * Access: Index + */ +MLXSW_ITEM32(reg, rrcr, table_id, 0x10, 0, 4); + +/* reg_rrcr_dest_offset + * Offset within the region to which to copy/move + * Access: Index + */ +MLXSW_ITEM32(reg, rrcr, dest_offset, 0x20, 0, 16); + +static inline void mlxsw_reg_rrcr_pack(char *payload, enum mlxsw_reg_rrcr_op op, + u16 offset, u16 size, + enum mlxsw_reg_rtar_key_type table_id, + u16 dest_offset) +{ + MLXSW_REG_ZERO(rrcr, payload); + mlxsw_reg_rrcr_op_set(payload, op); + mlxsw_reg_rrcr_offset_set(payload, offset); + mlxsw_reg_rrcr_size_set(payload, size); + mlxsw_reg_rrcr_table_id_set(payload, table_id); + mlxsw_reg_rrcr_dest_offset_set(payload, dest_offset); +} + +/* RALTA - Router Algorithmic LPM Tree Allocation Register + * ------------------------------------------------------- + * RALTA is used to allocate the LPM trees of the SHSPM method. + */ +#define MLXSW_REG_RALTA_ID 0x8010 +#define MLXSW_REG_RALTA_LEN 0x04 + +MLXSW_REG_DEFINE(ralta, MLXSW_REG_RALTA_ID, MLXSW_REG_RALTA_LEN); + +/* reg_ralta_op + * opcode (valid for Write, must be 0 on Read) + * 0 - allocate a tree + * 1 - deallocate a tree + * Access: OP + */ +MLXSW_ITEM32(reg, ralta, op, 0x00, 28, 2); + +enum mlxsw_reg_ralxx_protocol { + MLXSW_REG_RALXX_PROTOCOL_IPV4, + MLXSW_REG_RALXX_PROTOCOL_IPV6, +}; + +/* reg_ralta_protocol + * Protocol. + * Deallocation opcode: Reserved. + * Access: RW + */ +MLXSW_ITEM32(reg, ralta, protocol, 0x00, 24, 4); + +/* reg_ralta_tree_id + * An identifier (numbered from 1..cap_shspm_max_trees-1) representing + * the tree identifier (managed by software). + * Note that tree_id 0 is allocated for a default-route tree. + * Access: Index + */ +MLXSW_ITEM32(reg, ralta, tree_id, 0x00, 0, 8); + +static inline void mlxsw_reg_ralta_pack(char *payload, bool alloc, + enum mlxsw_reg_ralxx_protocol protocol, + u8 tree_id) +{ + MLXSW_REG_ZERO(ralta, payload); + mlxsw_reg_ralta_op_set(payload, !alloc); + mlxsw_reg_ralta_protocol_set(payload, protocol); + mlxsw_reg_ralta_tree_id_set(payload, tree_id); +} + +/* RALST - Router Algorithmic LPM Structure Tree Register + * ------------------------------------------------------ + * RALST is used to set and query the structure of an LPM tree. + * The structure of the tree must be sorted as a sorted binary tree, while + * each node is a bin that is tagged as the length of the prefixes the lookup + * will refer to. Therefore, bin X refers to a set of entries with prefixes + * of X bits to match with the destination address. The bin 0 indicates + * the default action, when there is no match of any prefix. + */ +#define MLXSW_REG_RALST_ID 0x8011 +#define MLXSW_REG_RALST_LEN 0x104 + +MLXSW_REG_DEFINE(ralst, MLXSW_REG_RALST_ID, MLXSW_REG_RALST_LEN); + +/* reg_ralst_root_bin + * The bin number of the root bin. 
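
Worth spelling out about mlxsw_reg_ralta_pack(): the alloc boolean is inverted into the opcode, so alloc=true yields op 0 (allocate) and alloc=false yields op 1 (deallocate). A hedged sketch of the allocate/release pair, with tree_id standing in for a caller-chosen u8 and the register writes omitted:

char ralta_pl[MLXSW_REG_RALTA_LEN];

/* Allocate an IPv4 LPM tree; op becomes !true == 0 */
mlxsw_reg_ralta_pack(ralta_pl, true, MLXSW_REG_RALXX_PROTOCOL_IPV4, tree_id);
/* ... write RALTA ... */

/* Release the same tree later; op becomes !false == 1 */
mlxsw_reg_ralta_pack(ralta_pl, false, MLXSW_REG_RALXX_PROTOCOL_IPV4, tree_id);
/* ... write RALTA ... */
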
+ * 064 the entry consumes + * two entries in the physical HW table. + * Access: Index + */ +MLXSW_ITEM32(reg, ralue, prefix_len, 0x08, 0, 8); + +/* reg_ralue_dip* + * The prefix of the route or of the marker that the object of the LPM + * is compared with. The most significant bits of the dip are the prefix. + * The least significant bits must be '0' if the prefix_len is smaller + * than 128 for IPv6 or smaller than 32 for IPv4. + * IPv4 address uses bits dip[31:0] and bits dip[127:32] are reserved. + * Access: Index + */ +MLXSW_ITEM32(reg, ralue, dip4, 0x18, 0, 32); +MLXSW_ITEM_BUF(reg, ralue, dip6, 0x0C, 16); + +enum mlxsw_reg_ralue_entry_type { + MLXSW_REG_RALUE_ENTRY_TYPE_MARKER_ENTRY = 1, + MLXSW_REG_RALUE_ENTRY_TYPE_ROUTE_ENTRY = 2, + MLXSW_REG_RALUE_ENTRY_TYPE_MARKER_AND_ROUTE_ENTRY = 3, +}; + +/* reg_ralue_entry_type + * Entry type. + * Note - for Marker entries, the action_type and action fields are reserved. + * Access: RW + */ +MLXSW_ITEM32(reg, ralue, entry_type, 0x1C, 30, 2); + +/* reg_ralue_bmp_len + * The best match prefix length in the case that there is no match for + * longer prefixes. + * If (entry_type != MARKER_ENTRY), bmp_len must be equal to prefix_len + * Note for any update operation with entry_type modification this + * field must be set. + * Access: RW + */ +MLXSW_ITEM32(reg, ralue, bmp_len, 0x1C, 16, 8); + +enum mlxsw_reg_ralue_action_type { + MLXSW_REG_RALUE_ACTION_TYPE_REMOTE, + MLXSW_REG_RALUE_ACTION_TYPE_LOCAL, + MLXSW_REG_RALUE_ACTION_TYPE_IP2ME, +}; + +/* reg_ralue_action_type + * Action Type + * Indicates how the IP address is connected. + * It can be connected to a local subnet through local_erif or can be + * on a remote subnet connected through a next-hop router, + * or transmitted to the CPU. + * Reserved when entry_type = MARKER_ENTRY + * Access: RW + */ +MLXSW_ITEM32(reg, ralue, action_type, 0x1C, 0, 2); + +enum mlxsw_reg_ralue_trap_action { + MLXSW_REG_RALUE_TRAP_ACTION_NOP, + MLXSW_REG_RALUE_TRAP_ACTION_TRAP, + MLXSW_REG_RALUE_TRAP_ACTION_MIRROR_TO_CPU, + MLXSW_REG_RALUE_TRAP_ACTION_MIRROR, + MLXSW_REG_RALUE_TRAP_ACTION_DISCARD_ERROR, +}; + +/* reg_ralue_trap_action + * Trap action. + * For IP2ME action, only NOP and MIRROR are possible. + * Access: RW + */ +MLXSW_ITEM32(reg, ralue, trap_action, 0x20, 28, 4); + +/* reg_ralue_trap_id + * Trap ID to be reported to CPU. + * Trap ID is RTR_INGRESS0 or RTR_INGRESS1. + * For trap_action of NOP, MIRROR and DISCARD_ERROR, trap_id is reserved. + * Access: RW + */ +MLXSW_ITEM32(reg, ralue, trap_id, 0x20, 0, 9); + +/* reg_ralue_adjacency_index + * Points to the first entry of the group-based ECMP. + * Only relevant in case of REMOTE action. + * Access: RW + */ +MLXSW_ITEM32(reg, ralue, adjacency_index, 0x24, 0, 24); + +/* reg_ralue_ecmp_size + * Amount of sequential entries starting + * from the adjacency_index (the number of ECMPs). + * The valid range is 1-64, 512, 1024, 2048 and 4096. + * Reserved when trap_action is TRAP or DISCARD_ERROR. + * Only relevant in case of REMOTE action. + * Access: RW + */ +MLXSW_ITEM32(reg, ralue, ecmp_size, 0x28, 0, 13); + +/* reg_ralue_local_erif + * Egress Router Interface. + * Only relevant in case of LOCAL action. + * Access: RW + */ +MLXSW_ITEM32(reg, ralue, local_erif, 0x24, 0, 16); + +/* reg_ralue_ip2me_v + * Valid bit for the tunnel_ptr field. + * If valid = 0 then trap to CPU as IP2ME trap ID. + * If valid = 1 and the packet format allows NVE or IPinIP tunnel + * decapsulation then tunnel decapsulation is done. 
+ * If valid = 1 and packet format does not allow NVE or IPinIP tunnel + * decapsulation then trap as IP2ME trap ID. + * Only relevant in case of IP2ME action. + * Access: RW + */ +MLXSW_ITEM32(reg, ralue, ip2me_v, 0x24, 31, 1); + +/* reg_ralue_ip2me_tunnel_ptr + * Tunnel Pointer for NVE or IPinIP tunnel decapsulation. + * For Spectrum, pointer to KVD Linear. + * Only relevant in case of IP2ME action. + * Access: RW + */ +MLXSW_ITEM32(reg, ralue, ip2me_tunnel_ptr, 0x24, 0, 24); + +static inline void mlxsw_reg_ralue_pack(char *payload, + enum mlxsw_reg_ralxx_protocol protocol, + enum mlxsw_reg_ralue_op op, + u16 virtual_router, u8 prefix_len) +{ + MLXSW_REG_ZERO(ralue, payload); + mlxsw_reg_ralue_protocol_set(payload, protocol); + mlxsw_reg_ralue_op_set(payload, op); + mlxsw_reg_ralue_virtual_router_set(payload, virtual_router); + mlxsw_reg_ralue_prefix_len_set(payload, prefix_len); + mlxsw_reg_ralue_entry_type_set(payload, + MLXSW_REG_RALUE_ENTRY_TYPE_ROUTE_ENTRY); + mlxsw_reg_ralue_bmp_len_set(payload, prefix_len); +} + +static inline void mlxsw_reg_ralue_pack4(char *payload, + enum mlxsw_reg_ralxx_protocol protocol, + enum mlxsw_reg_ralue_op op, + u16 virtual_router, u8 prefix_len, + u32 dip) +{ + mlxsw_reg_ralue_pack(payload, protocol, op, virtual_router, prefix_len); + mlxsw_reg_ralue_dip4_set(payload, dip); +} + +static inline void mlxsw_reg_ralue_pack6(char *payload, + enum mlxsw_reg_ralxx_protocol protocol, + enum mlxsw_reg_ralue_op op, + u16 virtual_router, u8 prefix_len, + const void *dip) +{ + mlxsw_reg_ralue_pack(payload, protocol, op, virtual_router, prefix_len); + mlxsw_reg_ralue_dip6_memcpy_to(payload, dip); +} + +static inline void +mlxsw_reg_ralue_act_remote_pack(char *payload, + enum mlxsw_reg_ralue_trap_action trap_action, + u16 trap_id, u32 adjacency_index, u16 ecmp_size) +{ + mlxsw_reg_ralue_action_type_set(payload, + MLXSW_REG_RALUE_ACTION_TYPE_REMOTE); + mlxsw_reg_ralue_trap_action_set(payload, trap_action); + mlxsw_reg_ralue_trap_id_set(payload, trap_id); + mlxsw_reg_ralue_adjacency_index_set(payload, adjacency_index); + mlxsw_reg_ralue_ecmp_size_set(payload, ecmp_size); +} + +static inline void +mlxsw_reg_ralue_act_local_pack(char *payload, + enum mlxsw_reg_ralue_trap_action trap_action, + u16 trap_id, u16 local_erif) +{ + mlxsw_reg_ralue_action_type_set(payload, + MLXSW_REG_RALUE_ACTION_TYPE_LOCAL); + mlxsw_reg_ralue_trap_action_set(payload, trap_action); + mlxsw_reg_ralue_trap_id_set(payload, trap_id); + mlxsw_reg_ralue_local_erif_set(payload, local_erif); +} + +static inline void +mlxsw_reg_ralue_act_ip2me_pack(char *payload) +{ + mlxsw_reg_ralue_action_type_set(payload, + MLXSW_REG_RALUE_ACTION_TYPE_IP2ME); +} + +static inline void +mlxsw_reg_ralue_act_ip2me_tun_pack(char *payload, u32 tunnel_ptr) +{ + mlxsw_reg_ralue_action_type_set(payload, + MLXSW_REG_RALUE_ACTION_TYPE_IP2ME); + mlxsw_reg_ralue_ip2me_v_set(payload, 1); + mlxsw_reg_ralue_ip2me_tunnel_ptr_set(payload, tunnel_ptr); +} + +/* RAUHT - Router Algorithmic LPM Unicast Host Table Register + * ---------------------------------------------------------- + * The RAUHT register is used to configure and query the Unicast Host table in + * devices that implement the Algorithmic LPM. 
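
The RALUE helpers are also layered: mlxsw_reg_ralue_pack4()/pack6() fill the lookup key, after which exactly one act_*_pack() helper fills the action. Below is an illustrative sketch (not part of the patch) for an IPv4 route resolved through an ECMP adjacency group; the write opcode enum is declared earlier in the header and is passed through unchanged, and trap_id 0 is acceptable because the field is reserved for the NOP trap action.

static void example_ralue_ipv4_remote(char *payload,
                                      enum mlxsw_reg_ralue_op op,
                                      u16 virtual_router, u8 prefix_len,
                                      u32 dip, u32 adj_index, u16 ecmp_size)
{
        /* Key: {protocol, virtual router, prefix, prefix_len} */
        mlxsw_reg_ralue_pack4(payload, MLXSW_REG_RALXX_PROTOCOL_IPV4, op,
                              virtual_router, prefix_len, dip);
        /* Action: forward via the nexthop group starting at adj_index */
        mlxsw_reg_ralue_act_remote_pack(payload,
                                        MLXSW_REG_RALUE_TRAP_ACTION_NOP, 0,
                                        adj_index, ecmp_size);
}
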
+ */ +#define MLXSW_REG_RAUHT_ID 0x8014 +#define MLXSW_REG_RAUHT_LEN 0x74 + +MLXSW_REG_DEFINE(rauht, MLXSW_REG_RAUHT_ID, MLXSW_REG_RAUHT_LEN); + +enum mlxsw_reg_rauht_type { + MLXSW_REG_RAUHT_TYPE_IPV4, + MLXSW_REG_RAUHT_TYPE_IPV6, +}; + +/* reg_rauht_type + * Access: Index + */ +MLXSW_ITEM32(reg, rauht, type, 0x00, 24, 2); + +enum mlxsw_reg_rauht_op { + MLXSW_REG_RAUHT_OP_QUERY_READ = 0, + /* Read operation */ + MLXSW_REG_RAUHT_OP_QUERY_CLEAR_ON_READ = 1, + /* Clear on read operation. Used to read entry and clear + * activity bit. + */ + MLXSW_REG_RAUHT_OP_WRITE_ADD = 0, + /* Add. Used to write a new entry to the table. All R/W fields are + * relevant for new entry. Activity bit is set for new entries. + */ + MLXSW_REG_RAUHT_OP_WRITE_UPDATE = 1, + /* Update action. Used to update an existing route entry and + * only update the following fields: + * trap_action, trap_id, mac, counter_set_type, counter_index + */ + MLXSW_REG_RAUHT_OP_WRITE_CLEAR_ACTIVITY = 2, + /* Clear activity. A bit is cleared for the entry. */ + MLXSW_REG_RAUHT_OP_WRITE_DELETE = 3, + /* Delete entry */ + MLXSW_REG_RAUHT_OP_WRITE_DELETE_ALL = 4, + /* Delete all host entries on a RIF. In this command, dip + * field is reserved. + */ +}; + +/* reg_rauht_op + * Access: OP + */ +MLXSW_ITEM32(reg, rauht, op, 0x00, 20, 3); + +/* reg_rauht_a + * Activity. Set for new entries. Set if a packet lookup has hit on + * the specific entry. + * To clear the a bit, use "clear activity" op. + * Enabled by activity_dis in RGCR + * Access: RO + */ +MLXSW_ITEM32(reg, rauht, a, 0x00, 16, 1); + +/* reg_rauht_rif + * Router Interface + * Access: Index + */ +MLXSW_ITEM32(reg, rauht, rif, 0x00, 0, 16); + +/* reg_rauht_dip* + * Destination address. + * Access: Index + */ +MLXSW_ITEM32(reg, rauht, dip4, 0x1C, 0x0, 32); +MLXSW_ITEM_BUF(reg, rauht, dip6, 0x10, 16); + +enum mlxsw_reg_rauht_trap_action { + MLXSW_REG_RAUHT_TRAP_ACTION_NOP, + MLXSW_REG_RAUHT_TRAP_ACTION_TRAP, + MLXSW_REG_RAUHT_TRAP_ACTION_MIRROR_TO_CPU, + MLXSW_REG_RAUHT_TRAP_ACTION_MIRROR, + MLXSW_REG_RAUHT_TRAP_ACTION_DISCARD_ERRORS, +}; + +/* reg_rauht_trap_action + * Access: RW + */ +MLXSW_ITEM32(reg, rauht, trap_action, 0x60, 28, 4); + +enum mlxsw_reg_rauht_trap_id { + MLXSW_REG_RAUHT_TRAP_ID_RTR_EGRESS0, + MLXSW_REG_RAUHT_TRAP_ID_RTR_EGRESS1, +}; + +/* reg_rauht_trap_id + * Trap ID to be reported to CPU. + * Trap-ID is RTR_EGRESS0 or RTR_EGRESS1. + * For trap_action of NOP, MIRROR and DISCARD_ERROR, + * trap_id is reserved. + * Access: RW + */ +MLXSW_ITEM32(reg, rauht, trap_id, 0x60, 0, 9); + +/* reg_rauht_counter_set_type + * Counter set type for flow counters + * Access: RW + */ +MLXSW_ITEM32(reg, rauht, counter_set_type, 0x68, 24, 8); + +/* reg_rauht_counter_index + * Counter index for flow counters + * Access: RW + */ +MLXSW_ITEM32(reg, rauht, counter_index, 0x68, 0, 24); + +/* reg_rauht_mac + * MAC address. 
+ * Access: RW + */ +MLXSW_ITEM_BUF(reg, rauht, mac, 0x6E, 6); + +static inline void mlxsw_reg_rauht_pack(char *payload, + enum mlxsw_reg_rauht_op op, u16 rif, + const char *mac) +{ + MLXSW_REG_ZERO(rauht, payload); + mlxsw_reg_rauht_op_set(payload, op); + mlxsw_reg_rauht_rif_set(payload, rif); + mlxsw_reg_rauht_mac_memcpy_to(payload, mac); +} + +static inline void mlxsw_reg_rauht_pack4(char *payload, + enum mlxsw_reg_rauht_op op, u16 rif, + const char *mac, u32 dip) +{ + mlxsw_reg_rauht_pack(payload, op, rif, mac); + mlxsw_reg_rauht_dip4_set(payload, dip); +} + +static inline void mlxsw_reg_rauht_pack6(char *payload, + enum mlxsw_reg_rauht_op op, u16 rif, + const char *mac, const char *dip) +{ + mlxsw_reg_rauht_pack(payload, op, rif, mac); + mlxsw_reg_rauht_type_set(payload, MLXSW_REG_RAUHT_TYPE_IPV6); + mlxsw_reg_rauht_dip6_memcpy_to(payload, dip); +} + +static inline void mlxsw_reg_rauht_pack_counter(char *payload, + u64 counter_index) +{ + mlxsw_reg_rauht_counter_index_set(payload, counter_index); + mlxsw_reg_rauht_counter_set_type_set(payload, + MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES); +} + +/* RALEU - Router Algorithmic LPM ECMP Update Register + * --------------------------------------------------- + * The register enables updating the ECMP section in the action for multiple + * LPM Unicast entries in a single operation. The update is executed to + * all entries of a {virtual router, protocol} tuple using the same ECMP group. + */ +#define MLXSW_REG_RALEU_ID 0x8015 +#define MLXSW_REG_RALEU_LEN 0x28 + +MLXSW_REG_DEFINE(raleu, MLXSW_REG_RALEU_ID, MLXSW_REG_RALEU_LEN); + +/* reg_raleu_protocol + * Protocol. + * Access: Index + */ +MLXSW_ITEM32(reg, raleu, protocol, 0x00, 24, 4); + +/* reg_raleu_virtual_router + * Virtual Router ID + * Range is 0..cap_max_virtual_routers-1 + * Access: Index + */ +MLXSW_ITEM32(reg, raleu, virtual_router, 0x00, 0, 16); + +/* reg_raleu_adjacency_index + * Adjacency Index used for matching on the existing entries. + * Access: Index + */ +MLXSW_ITEM32(reg, raleu, adjacency_index, 0x10, 0, 24); + +/* reg_raleu_ecmp_size + * ECMP Size used for matching on the existing entries. + * Access: Index + */ +MLXSW_ITEM32(reg, raleu, ecmp_size, 0x14, 0, 13); + +/* reg_raleu_new_adjacency_index + * New Adjacency Index. + * Access: WO + */ +MLXSW_ITEM32(reg, raleu, new_adjacency_index, 0x20, 0, 24); + +/* reg_raleu_new_ecmp_size + * New ECMP Size. + * Access: WO + */ +MLXSW_ITEM32(reg, raleu, new_ecmp_size, 0x24, 0, 13); + +static inline void mlxsw_reg_raleu_pack(char *payload, + enum mlxsw_reg_ralxx_protocol protocol, + u16 virtual_router, + u32 adjacency_index, u16 ecmp_size, + u32 new_adjacency_index, + u16 new_ecmp_size) +{ + MLXSW_REG_ZERO(raleu, payload); + mlxsw_reg_raleu_protocol_set(payload, protocol); + mlxsw_reg_raleu_virtual_router_set(payload, virtual_router); + mlxsw_reg_raleu_adjacency_index_set(payload, adjacency_index); + mlxsw_reg_raleu_ecmp_size_set(payload, ecmp_size); + mlxsw_reg_raleu_new_adjacency_index_set(payload, new_adjacency_index); + mlxsw_reg_raleu_new_ecmp_size_set(payload, new_ecmp_size); +} + +/* RAUHTD - Router Algorithmic LPM Unicast Host Table Dump Register + * ---------------------------------------------------------------- + * The RAUHTD register allows dumping entries from the Router Unicast Host + * Table. For a given session an entry is dumped no more than one time. The + * first RAUHTD access after reset is a new session. 
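
For the host table described above, an IPv4 neighbour add is a key-plus-MAC pack followed by an optional counter binding. A minimal illustrative sketch (the wrapper name and the omitted register write are not from this patch):

static void example_rauht_add_ipv4_neigh(char *payload, u16 rif,
                                         const char *mac, u32 dip,
                                         bool counted, u64 counter_index)
{
        mlxsw_reg_rauht_pack4(payload, MLXSW_REG_RAUHT_OP_WRITE_ADD, rif,
                              mac, dip);
        if (counted)
                /* Same packets+bytes counter binding as for RATR */
                mlxsw_reg_rauht_pack_counter(payload, counter_index);
}
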
A session ends when the + * num_rec response is smaller than num_rec request or for IPv4 when the + * num_entries is smaller than 4. The clear activity affect the current session + * or the last session if a new session has not started. + */ +#define MLXSW_REG_RAUHTD_ID 0x8018 +#define MLXSW_REG_RAUHTD_BASE_LEN 0x20 +#define MLXSW_REG_RAUHTD_REC_LEN 0x20 +#define MLXSW_REG_RAUHTD_REC_MAX_NUM 32 +#define MLXSW_REG_RAUHTD_LEN (MLXSW_REG_RAUHTD_BASE_LEN + \ + MLXSW_REG_RAUHTD_REC_MAX_NUM * MLXSW_REG_RAUHTD_REC_LEN) +#define MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC 4 + +MLXSW_REG_DEFINE(rauhtd, MLXSW_REG_RAUHTD_ID, MLXSW_REG_RAUHTD_LEN); + +#define MLXSW_REG_RAUHTD_FILTER_A BIT(0) +#define MLXSW_REG_RAUHTD_FILTER_RIF BIT(3) + +/* reg_rauhtd_filter_fields + * if a bit is '0' then the relevant field is ignored and dump is done + * regardless of the field value + * Bit0 - filter by activity: entry_a + * Bit3 - filter by entry rip: entry_rif + * Access: Index + */ +MLXSW_ITEM32(reg, rauhtd, filter_fields, 0x00, 0, 8); + +enum mlxsw_reg_rauhtd_op { + MLXSW_REG_RAUHTD_OP_DUMP, + MLXSW_REG_RAUHTD_OP_DUMP_AND_CLEAR, +}; + +/* reg_rauhtd_op + * Access: OP + */ +MLXSW_ITEM32(reg, rauhtd, op, 0x04, 24, 2); + +/* reg_rauhtd_num_rec + * At request: number of records requested + * At response: number of records dumped + * For IPv4, each record has 4 entries at request and up to 4 entries + * at response + * Range is 0..MLXSW_REG_RAUHTD_REC_MAX_NUM + * Access: Index + */ +MLXSW_ITEM32(reg, rauhtd, num_rec, 0x04, 0, 8); + +/* reg_rauhtd_entry_a + * Dump only if activity has value of entry_a + * Reserved if filter_fields bit0 is '0' + * Access: Index + */ +MLXSW_ITEM32(reg, rauhtd, entry_a, 0x08, 16, 1); + +enum mlxsw_reg_rauhtd_type { + MLXSW_REG_RAUHTD_TYPE_IPV4, + MLXSW_REG_RAUHTD_TYPE_IPV6, +}; + +/* reg_rauhtd_type + * Dump only if record type is: + * 0 - IPv4 + * 1 - IPv6 + * Access: Index + */ +MLXSW_ITEM32(reg, rauhtd, type, 0x08, 0, 4); + +/* reg_rauhtd_entry_rif + * Dump only if RIF has value of entry_rif + * Reserved if filter_fields bit3 is '0' + * Access: Index + */ +MLXSW_ITEM32(reg, rauhtd, entry_rif, 0x0C, 0, 16); + +static inline void mlxsw_reg_rauhtd_pack(char *payload, + enum mlxsw_reg_rauhtd_type type) +{ + MLXSW_REG_ZERO(rauhtd, payload); + mlxsw_reg_rauhtd_filter_fields_set(payload, MLXSW_REG_RAUHTD_FILTER_A); + mlxsw_reg_rauhtd_op_set(payload, MLXSW_REG_RAUHTD_OP_DUMP_AND_CLEAR); + mlxsw_reg_rauhtd_num_rec_set(payload, MLXSW_REG_RAUHTD_REC_MAX_NUM); + mlxsw_reg_rauhtd_entry_a_set(payload, 1); + mlxsw_reg_rauhtd_type_set(payload, type); +} + +/* reg_rauhtd_ipv4_rec_num_entries + * Number of valid entries in this record: + * 0 - 1 valid entry + * 1 - 2 valid entries + * 2 - 3 valid entries + * 3 - 4 valid entries + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_rec_num_entries, + MLXSW_REG_RAUHTD_BASE_LEN, 28, 2, + MLXSW_REG_RAUHTD_REC_LEN, 0x00, false); + +/* reg_rauhtd_rec_type + * Record type. + * 0 - IPv4 + * 1 - IPv6 + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, rec_type, MLXSW_REG_RAUHTD_BASE_LEN, 24, 2, + MLXSW_REG_RAUHTD_REC_LEN, 0x00, false); + +#define MLXSW_REG_RAUHTD_IPV4_ENT_LEN 0x8 + +/* reg_rauhtd_ipv4_ent_a + * Activity. Set for new entries. Set if a packet lookup has hit on the + * specific entry. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_ent_a, MLXSW_REG_RAUHTD_BASE_LEN, 16, 1, + MLXSW_REG_RAUHTD_IPV4_ENT_LEN, 0x00, false); + +/* reg_rauhtd_ipv4_ent_rif + * Router interface. 
+ * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_ent_rif, MLXSW_REG_RAUHTD_BASE_LEN, 0, + 16, MLXSW_REG_RAUHTD_IPV4_ENT_LEN, 0x00, false); + +/* reg_rauhtd_ipv4_ent_dip + * Destination IPv4 address. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_ent_dip, MLXSW_REG_RAUHTD_BASE_LEN, 0, + 32, MLXSW_REG_RAUHTD_IPV4_ENT_LEN, 0x04, false); + +#define MLXSW_REG_RAUHTD_IPV6_ENT_LEN 0x20 + +/* reg_rauhtd_ipv6_ent_a + * Activity. Set for new entries. Set if a packet lookup has hit on the + * specific entry. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv6_ent_a, MLXSW_REG_RAUHTD_BASE_LEN, 16, 1, + MLXSW_REG_RAUHTD_IPV6_ENT_LEN, 0x00, false); + +/* reg_rauhtd_ipv6_ent_rif + * Router interface. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv6_ent_rif, MLXSW_REG_RAUHTD_BASE_LEN, 0, + 16, MLXSW_REG_RAUHTD_IPV6_ENT_LEN, 0x00, false); + +/* reg_rauhtd_ipv6_ent_dip + * Destination IPv6 address. + * Access: RO + */ +MLXSW_ITEM_BUF_INDEXED(reg, rauhtd, ipv6_ent_dip, MLXSW_REG_RAUHTD_BASE_LEN, + 16, MLXSW_REG_RAUHTD_IPV6_ENT_LEN, 0x10); + +static inline void mlxsw_reg_rauhtd_ent_ipv4_unpack(char *payload, + int ent_index, u16 *p_rif, + u32 *p_dip) +{ + *p_rif = mlxsw_reg_rauhtd_ipv4_ent_rif_get(payload, ent_index); + *p_dip = mlxsw_reg_rauhtd_ipv4_ent_dip_get(payload, ent_index); +} + +static inline void mlxsw_reg_rauhtd_ent_ipv6_unpack(char *payload, + int rec_index, u16 *p_rif, + char *p_dip) +{ + *p_rif = mlxsw_reg_rauhtd_ipv6_ent_rif_get(payload, rec_index); + mlxsw_reg_rauhtd_ipv6_ent_dip_memcpy_from(payload, rec_index, p_dip); +} + +/* RTDP - Routing Tunnel Decap Properties Register + * ----------------------------------------------- + * The RTDP register is used for configuring the tunnel decap properties of NVE + * and IPinIP. + */ +#define MLXSW_REG_RTDP_ID 0x8020 +#define MLXSW_REG_RTDP_LEN 0x44 + +MLXSW_REG_DEFINE(rtdp, MLXSW_REG_RTDP_ID, MLXSW_REG_RTDP_LEN); + +enum mlxsw_reg_rtdp_type { + MLXSW_REG_RTDP_TYPE_NVE, + MLXSW_REG_RTDP_TYPE_IPIP, +}; + +/* reg_rtdp_type + * Type of the RTDP entry as per enum mlxsw_reg_rtdp_type. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, type, 0x00, 28, 4); + +/* reg_rtdp_tunnel_index + * Index to the Decap entry. + * For Spectrum, Index to KVD Linear. + * Access: Index + */ +MLXSW_ITEM32(reg, rtdp, tunnel_index, 0x00, 0, 24); + +/* IPinIP */ + +/* reg_rtdp_ipip_irif + * Ingress Router Interface for the overlay router + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_irif, 0x04, 16, 16); + +enum mlxsw_reg_rtdp_ipip_sip_check { + /* No sip checks. */ + MLXSW_REG_RTDP_IPIP_SIP_CHECK_NO, + /* Filter packet if underlay is not IPv4 or if underlay SIP does not + * equal ipv4_usip. + */ + MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV4, + /* Filter packet if underlay is not IPv6 or if underlay SIP does not + * equal ipv6_usip. + */ + MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV6 = 3, +}; + +/* reg_rtdp_ipip_sip_check + * SIP check to perform. If decapsulation failed due to these configurations + * then trap_id is IPIP_DECAP_ERROR. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_sip_check, 0x04, 0, 3); + +/* If set, allow decapsulation of IPinIP (without GRE). */ +#define MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_IPIP BIT(0) +/* If set, allow decapsulation of IPinGREinIP without a key. */ +#define MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE BIT(1) +/* If set, allow decapsulation of IPinGREinIP with a key. 
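
Putting the RAUHTD pieces together (an editorial sketch, not driver code): each IPv4 record carries up to four entries laid out back to back, so a dump can be walked with a flat entry index of rec * MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC + ent. The loop below follows the num_entries encoding documented above (0 means one valid entry); the query itself and the consumer of (rif, dip) are elided.

static void example_rauhtd_walk_ipv4(char *payload)
{
        int num_rec, rec, ent;
        u16 rif;
        u32 dip;

        mlxsw_reg_rauhtd_pack(payload, MLXSW_REG_RAUHTD_TYPE_IPV4);
        /* ... issue the RAUHTD query here ... */
        num_rec = mlxsw_reg_rauhtd_num_rec_get(payload);
        for (rec = 0; rec < num_rec; rec++) {
                int num_ent =
                        mlxsw_reg_rauhtd_ipv4_rec_num_entries_get(payload,
                                                                  rec) + 1;

                for (ent = 0; ent < num_ent; ent++) {
                        int i = rec * MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC + ent;

                        mlxsw_reg_rauhtd_ent_ipv4_unpack(payload, i,
                                                         &rif, &dip);
                        /* ... consume (rif, dip) ... */
                }
        }
}
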
*/ +#define MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE_KEY BIT(2) + +/* reg_rtdp_ipip_type_check + * Flags as per MLXSW_REG_RTDP_IPIP_TYPE_CHECK_*. If decapsulation failed due to + * these configurations then trap_id is IPIP_DECAP_ERROR. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_type_check, 0x08, 24, 3); + +/* reg_rtdp_ipip_gre_key_check + * Whether GRE key should be checked. When check is enabled: + * - A packet received as IPinIP (without GRE) will always pass. + * - A packet received as IPinGREinIP without a key will not pass the check. + * - A packet received as IPinGREinIP with a key will pass the check only if the + * key in the packet is equal to expected_gre_key. + * If decapsulation failed due to GRE key then trap_id is IPIP_DECAP_ERROR. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_gre_key_check, 0x08, 23, 1); + +/* reg_rtdp_ipip_ipv4_usip + * Underlay IPv4 address for ipv4 source address check. + * Reserved when sip_check is not '1'. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_ipv4_usip, 0x0C, 0, 32); + +/* reg_rtdp_ipip_ipv6_usip_ptr + * This field is valid when sip_check is "sipv6 check explicitly". This is a + * pointer to the IPv6 DIP which is configured by RIPS. For Spectrum, the index + * is to the KVD linear. + * Reserved when sip_check is not MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV6. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_ipv6_usip_ptr, 0x10, 0, 24); + +/* reg_rtdp_ipip_expected_gre_key + * GRE key for checking. + * Reserved when gre_key_check is '0'. + * Access: RW + */ +MLXSW_ITEM32(reg, rtdp, ipip_expected_gre_key, 0x14, 0, 32); + +static inline void mlxsw_reg_rtdp_pack(char *payload, + enum mlxsw_reg_rtdp_type type, + u32 tunnel_index) +{ + MLXSW_REG_ZERO(rtdp, payload); + mlxsw_reg_rtdp_type_set(payload, type); + mlxsw_reg_rtdp_tunnel_index_set(payload, tunnel_index); +} + +static inline void +mlxsw_reg_rtdp_ipip4_pack(char *payload, u16 irif, + enum mlxsw_reg_rtdp_ipip_sip_check sip_check, + unsigned int type_check, bool gre_key_check, + u32 ipv4_usip, u32 expected_gre_key) +{ + mlxsw_reg_rtdp_ipip_irif_set(payload, irif); + mlxsw_reg_rtdp_ipip_sip_check_set(payload, sip_check); + mlxsw_reg_rtdp_ipip_type_check_set(payload, type_check); + mlxsw_reg_rtdp_ipip_gre_key_check_set(payload, gre_key_check); + mlxsw_reg_rtdp_ipip_ipv4_usip_set(payload, ipv4_usip); + mlxsw_reg_rtdp_ipip_expected_gre_key_set(payload, expected_gre_key); +} + +/* RIGR-V2 - Router Interface Group Register Version 2 + * --------------------------------------------------- + * The RIGR_V2 register is used to add, remove and query egress interface list + * of a multicast forwarding entry. + */ +#define MLXSW_REG_RIGR2_ID 0x8023 +#define MLXSW_REG_RIGR2_LEN 0xB0 + +#define MLXSW_REG_RIGR2_MAX_ERIFS 32 + +MLXSW_REG_DEFINE(rigr2, MLXSW_REG_RIGR2_ID, MLXSW_REG_RIGR2_LEN); + +/* reg_rigr2_rigr_index + * KVD Linear index. + * Access: Index + */ +MLXSW_ITEM32(reg, rigr2, rigr_index, 0x04, 0, 24); + +/* reg_rigr2_vnext + * Next RIGR Index is valid. + * Access: RW + */ +MLXSW_ITEM32(reg, rigr2, vnext, 0x08, 31, 1); + +/* reg_rigr2_next_rigr_index + * Next RIGR Index. The index is to the KVD linear. + * Reserved when vnxet = '0'. + * Access: RW + */ +MLXSW_ITEM32(reg, rigr2, next_rigr_index, 0x08, 0, 24); + +/* reg_rigr2_vrmid + * RMID Index is valid. + * Access: RW + */ +MLXSW_ITEM32(reg, rigr2, vrmid, 0x20, 31, 1); + +/* reg_rigr2_rmid_index + * RMID Index. + * Range 0 .. max_mid - 1 + * Reserved when vrmid = '0'. 
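
As an illustration of the RTDP helpers (a sketch, not the driver's code): configuring an IPinIP decap entry means packing the common part with the KVD linear index and then the IPinIP part with the SIP and tunnel-type checks. Here keyless GRE is accepted and the underlay IPv4 source is verified:

static void example_rtdp_ipip_gre4(char *payload, u32 kvdl_index, u16 irif,
                                   u32 ipv4_usip)
{
        mlxsw_reg_rtdp_pack(payload, MLXSW_REG_RTDP_TYPE_IPIP, kvdl_index);
        /* Accept IPinGREinIP without a key; filter on the underlay SIP */
        mlxsw_reg_rtdp_ipip4_pack(payload, irif,
                                  MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV4,
                                  MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE,
                                  false, ipv4_usip, 0);
}
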
+ * The index is to the Port Group Table (PGT) + * Access: RW + */ +MLXSW_ITEM32(reg, rigr2, rmid_index, 0x20, 0, 16); + +/* reg_rigr2_erif_entry_v + * Egress Router Interface is valid. + * Note that low-entries must be set if high-entries are set. For + * example: if erif_entry[2].v is set then erif_entry[1].v and + * erif_entry[0].v must be set. + * Index can be from 0 to cap_mc_erif_list_entries-1 + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, rigr2, erif_entry_v, 0x24, 31, 1, 4, 0, false); + +/* reg_rigr2_erif_entry_erif + * Egress Router Interface. + * Valid range is from 0 to cap_max_router_interfaces - 1 + * Index can be from 0 to MLXSW_REG_RIGR2_MAX_ERIFS - 1 + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, rigr2, erif_entry_erif, 0x24, 0, 16, 4, 0, false); + +static inline void mlxsw_reg_rigr2_pack(char *payload, u32 rigr_index, + bool vnext, u32 next_rigr_index) +{ + MLXSW_REG_ZERO(rigr2, payload); + mlxsw_reg_rigr2_rigr_index_set(payload, rigr_index); + mlxsw_reg_rigr2_vnext_set(payload, vnext); + mlxsw_reg_rigr2_next_rigr_index_set(payload, next_rigr_index); + mlxsw_reg_rigr2_vrmid_set(payload, 0); + mlxsw_reg_rigr2_rmid_index_set(payload, 0); +} + +static inline void mlxsw_reg_rigr2_erif_entry_pack(char *payload, int index, + bool v, u16 erif) +{ + mlxsw_reg_rigr2_erif_entry_v_set(payload, index, v); + mlxsw_reg_rigr2_erif_entry_erif_set(payload, index, erif); +} + +/* RECR-V2 - Router ECMP Configuration Version 2 Register + * ------------------------------------------------------ + */ +#define MLXSW_REG_RECR2_ID 0x8025 +#define MLXSW_REG_RECR2_LEN 0x38 + +MLXSW_REG_DEFINE(recr2, MLXSW_REG_RECR2_ID, MLXSW_REG_RECR2_LEN); + +/* reg_recr2_pp + * Per-port configuration + * Access: Index + */ +MLXSW_ITEM32(reg, recr2, pp, 0x00, 24, 1); + +/* reg_recr2_sh + * Symmetric hash + * Access: RW + */ +MLXSW_ITEM32(reg, recr2, sh, 0x00, 8, 1); + +/* reg_recr2_seed + * Seed + * Access: RW + */ +MLXSW_ITEM32(reg, recr2, seed, 0x08, 0, 32); + +enum { + /* Enable IPv4 fields if packet is not TCP and not UDP */ + MLXSW_REG_RECR2_IPV4_EN_NOT_TCP_NOT_UDP = 3, + /* Enable IPv4 fields if packet is TCP or UDP */ + MLXSW_REG_RECR2_IPV4_EN_TCP_UDP = 4, + /* Enable IPv6 fields if packet is not TCP and not UDP */ + MLXSW_REG_RECR2_IPV6_EN_NOT_TCP_NOT_UDP = 5, + /* Enable IPv6 fields if packet is TCP or UDP */ + MLXSW_REG_RECR2_IPV6_EN_TCP_UDP = 6, + /* Enable TCP/UDP header fields if packet is IPv4 */ + MLXSW_REG_RECR2_TCP_UDP_EN_IPV4 = 7, + /* Enable TCP/UDP header fields if packet is IPv6 */ + MLXSW_REG_RECR2_TCP_UDP_EN_IPV6 = 8, +}; + +/* reg_recr2_outer_header_enables + * Bit mask where each bit enables a specific layer to be included in + * the hash calculation. 
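
A usage sketch for RIGR2 (illustrative, with the register write elided): because the valid bits must be populated bottom-up, an egress interface list is simply packed in order. A single, unchained block looks like this:

static void example_rigr2_erif_list(char *payload, u32 rigr_index,
                                    const u16 *erifs, int count)
{
        int i;

        /* Single block: no chaining to a next RIGR entry */
        mlxsw_reg_rigr2_pack(payload, rigr_index, false, 0);
        for (i = 0; i < count && i < MLXSW_REG_RIGR2_MAX_ERIFS; i++)
                mlxsw_reg_rigr2_erif_entry_pack(payload, i, true, erifs[i]);
}
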
+ * Access: RW + */ +MLXSW_ITEM_BIT_ARRAY(reg, recr2, outer_header_enables, 0x10, 0x04, 1); + +enum { + /* IPv4 Source IP */ + MLXSW_REG_RECR2_IPV4_SIP0 = 9, + MLXSW_REG_RECR2_IPV4_SIP3 = 12, + /* IPv4 Destination IP */ + MLXSW_REG_RECR2_IPV4_DIP0 = 13, + MLXSW_REG_RECR2_IPV4_DIP3 = 16, + /* IP Protocol */ + MLXSW_REG_RECR2_IPV4_PROTOCOL = 17, + /* IPv6 Source IP */ + MLXSW_REG_RECR2_IPV6_SIP0_7 = 21, + MLXSW_REG_RECR2_IPV6_SIP8 = 29, + MLXSW_REG_RECR2_IPV6_SIP15 = 36, + /* IPv6 Destination IP */ + MLXSW_REG_RECR2_IPV6_DIP0_7 = 37, + MLXSW_REG_RECR2_IPV6_DIP8 = 45, + MLXSW_REG_RECR2_IPV6_DIP15 = 52, + /* IPv6 Next Header */ + MLXSW_REG_RECR2_IPV6_NEXT_HEADER = 53, + /* IPv6 Flow Label */ + MLXSW_REG_RECR2_IPV6_FLOW_LABEL = 57, + /* TCP/UDP Source Port */ + MLXSW_REG_RECR2_TCP_UDP_SPORT = 74, + /* TCP/UDP Destination Port */ + MLXSW_REG_RECR2_TCP_UDP_DPORT = 75, +}; + +/* reg_recr2_outer_header_fields_enable + * Packet fields to enable for ECMP hash subject to outer_header_enable. + * Access: RW + */ +MLXSW_ITEM_BIT_ARRAY(reg, recr2, outer_header_fields_enable, 0x14, 0x14, 1); + +static inline void mlxsw_reg_recr2_ipv4_sip_enable(char *payload) +{ + int i; + + for (i = MLXSW_REG_RECR2_IPV4_SIP0; i <= MLXSW_REG_RECR2_IPV4_SIP3; i++) + mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i, + true); +} + +static inline void mlxsw_reg_recr2_ipv4_dip_enable(char *payload) +{ + int i; + + for (i = MLXSW_REG_RECR2_IPV4_DIP0; i <= MLXSW_REG_RECR2_IPV4_DIP3; i++) + mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i, + true); +} + +static inline void mlxsw_reg_recr2_ipv6_sip_enable(char *payload) +{ + int i = MLXSW_REG_RECR2_IPV6_SIP0_7; + + mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i, true); + + i = MLXSW_REG_RECR2_IPV6_SIP8; + for (; i <= MLXSW_REG_RECR2_IPV6_SIP15; i++) + mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i, + true); +} + +static inline void mlxsw_reg_recr2_ipv6_dip_enable(char *payload) +{ + int i = MLXSW_REG_RECR2_IPV6_DIP0_7; + + mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i, true); + + i = MLXSW_REG_RECR2_IPV6_DIP8; + for (; i <= MLXSW_REG_RECR2_IPV6_DIP15; i++) + mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i, + true); +} + +static inline void mlxsw_reg_recr2_pack(char *payload, u32 seed) +{ + MLXSW_REG_ZERO(recr2, payload); + mlxsw_reg_recr2_pp_set(payload, false); + mlxsw_reg_recr2_sh_set(payload, true); + mlxsw_reg_recr2_seed_set(payload, seed); +} + +/* RMFT-V2 - Router Multicast Forwarding Table Version 2 Register + * -------------------------------------------------------------- + * The RMFT_V2 register is used to configure and query the multicast table. + */ +#define MLXSW_REG_RMFT2_ID 0x8027 +#define MLXSW_REG_RMFT2_LEN 0x174 + +MLXSW_REG_DEFINE(rmft2, MLXSW_REG_RMFT2_ID, MLXSW_REG_RMFT2_LEN); + +/* reg_rmft2_v + * Valid + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, v, 0x00, 31, 1); + +enum mlxsw_reg_rmft2_type { + MLXSW_REG_RMFT2_TYPE_IPV4, + MLXSW_REG_RMFT2_TYPE_IPV6 +}; + +/* reg_rmft2_type + * Access: Index + */ +MLXSW_ITEM32(reg, rmft2, type, 0x00, 28, 2); + +enum mlxsw_sp_reg_rmft2_op { + /* For Write: + * Write operation. Used to write a new entry to the table. All RW + * fields are relevant for new entry. Activity bit is set for new + * entries - Note write with v (Valid) 0 will delete the entry. + * For Query: + * Read operation + */ + MLXSW_REG_RMFT2_OP_READ_WRITE, +}; + +/* reg_rmft2_op + * Operation. + * Access: OP + */ +MLXSW_ITEM32(reg, rmft2, op, 0x00, 20, 2); + +/* reg_rmft2_a + * Activity. 
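
To make the RECR2 two-level enable scheme concrete: outer_header_enables selects which header types feed the ECMP hash, while outer_header_fields_enable (via the helpers above) selects the individual fields within them. An illustrative IPv4 source/destination setup, not taken from the patch:

static void example_recr2_ipv4_hash(char *payload, u32 seed)
{
        mlxsw_reg_recr2_pack(payload, seed);
        /* Hash IPv4 headers whether or not the packet is TCP/UDP */
        mlxsw_reg_recr2_outer_header_enables_set(payload,
                                MLXSW_REG_RECR2_IPV4_EN_NOT_TCP_NOT_UDP, true);
        mlxsw_reg_recr2_outer_header_enables_set(payload,
                                MLXSW_REG_RECR2_IPV4_EN_TCP_UDP, true);
        /* Then pick the SIP and DIP fields within those headers */
        mlxsw_reg_recr2_ipv4_sip_enable(payload);
        mlxsw_reg_recr2_ipv4_dip_enable(payload);
}
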
Set for new entries. Set if a packet lookup has hit on the specific + * entry. + * Access: RO + */ +MLXSW_ITEM32(reg, rmft2, a, 0x00, 16, 1); + +/* reg_rmft2_offset + * Offset within the multicast forwarding table to write to. + * Access: Index + */ +MLXSW_ITEM32(reg, rmft2, offset, 0x00, 0, 16); + +/* reg_rmft2_virtual_router + * Virtual Router ID. Range from 0..cap_max_virtual_routers-1 + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, virtual_router, 0x04, 0, 16); + +enum mlxsw_reg_rmft2_irif_mask { + MLXSW_REG_RMFT2_IRIF_MASK_IGNORE, + MLXSW_REG_RMFT2_IRIF_MASK_COMPARE +}; + +/* reg_rmft2_irif_mask + * Ingress RIF mask. + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, irif_mask, 0x08, 24, 1); + +/* reg_rmft2_irif + * Ingress RIF index. + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, irif, 0x08, 0, 16); + +/* reg_rmft2_dip{4,6} + * Destination IPv4/6 address + * Access: RW + */ +MLXSW_ITEM_BUF(reg, rmft2, dip6, 0x10, 16); +MLXSW_ITEM32(reg, rmft2, dip4, 0x1C, 0, 32); + +/* reg_rmft2_dip{4,6}_mask + * A bit that is set directs the TCAM to compare the corresponding bit in key. A + * bit that is clear directs the TCAM to ignore the corresponding bit in key. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, rmft2, dip6_mask, 0x20, 16); +MLXSW_ITEM32(reg, rmft2, dip4_mask, 0x2C, 0, 32); + +/* reg_rmft2_sip{4,6} + * Source IPv4/6 address + * Access: RW + */ +MLXSW_ITEM_BUF(reg, rmft2, sip6, 0x30, 16); +MLXSW_ITEM32(reg, rmft2, sip4, 0x3C, 0, 32); + +/* reg_rmft2_sip{4,6}_mask + * A bit that is set directs the TCAM to compare the corresponding bit in key. A + * bit that is clear directs the TCAM to ignore the corresponding bit in key. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, rmft2, sip6_mask, 0x40, 16); +MLXSW_ITEM32(reg, rmft2, sip4_mask, 0x4C, 0, 32); + +/* reg_rmft2_flexible_action_set + * ACL action set. The only supported action types in this field and in any + * action-set pointed from here are as follows: + * 00h: ACTION_NULL + * 01h: ACTION_MAC_TTL, only TTL configuration is supported. 
+ * 03h: ACTION_TRAP + * 06h: ACTION_QOS + * 08h: ACTION_POLICING_MONITORING + * 10h: ACTION_ROUTER_MC + * Access: RW + */ +MLXSW_ITEM_BUF(reg, rmft2, flexible_action_set, 0x80, + MLXSW_REG_FLEX_ACTION_SET_LEN); + +static inline void +mlxsw_reg_rmft2_common_pack(char *payload, bool v, u16 offset, + u16 virtual_router, + enum mlxsw_reg_rmft2_irif_mask irif_mask, u16 irif, + const char *flex_action_set) +{ + MLXSW_REG_ZERO(rmft2, payload); + mlxsw_reg_rmft2_v_set(payload, v); + mlxsw_reg_rmft2_op_set(payload, MLXSW_REG_RMFT2_OP_READ_WRITE); + mlxsw_reg_rmft2_offset_set(payload, offset); + mlxsw_reg_rmft2_virtual_router_set(payload, virtual_router); + mlxsw_reg_rmft2_irif_mask_set(payload, irif_mask); + mlxsw_reg_rmft2_irif_set(payload, irif); + if (flex_action_set) + mlxsw_reg_rmft2_flexible_action_set_memcpy_to(payload, + flex_action_set); +} + +static inline void +mlxsw_reg_rmft2_ipv4_pack(char *payload, bool v, u16 offset, u16 virtual_router, + enum mlxsw_reg_rmft2_irif_mask irif_mask, u16 irif, + u32 dip4, u32 dip4_mask, u32 sip4, u32 sip4_mask, + const char *flexible_action_set) +{ + mlxsw_reg_rmft2_common_pack(payload, v, offset, virtual_router, + irif_mask, irif, flexible_action_set); + mlxsw_reg_rmft2_type_set(payload, MLXSW_REG_RMFT2_TYPE_IPV4); + mlxsw_reg_rmft2_dip4_set(payload, dip4); + mlxsw_reg_rmft2_dip4_mask_set(payload, dip4_mask); + mlxsw_reg_rmft2_sip4_set(payload, sip4); + mlxsw_reg_rmft2_sip4_mask_set(payload, sip4_mask); +} + +static inline void +mlxsw_reg_rmft2_ipv6_pack(char *payload, bool v, u16 offset, u16 virtual_router, + enum mlxsw_reg_rmft2_irif_mask irif_mask, u16 irif, + struct in6_addr dip6, struct in6_addr dip6_mask, + struct in6_addr sip6, struct in6_addr sip6_mask, + const char *flexible_action_set) +{ + mlxsw_reg_rmft2_common_pack(payload, v, offset, virtual_router, + irif_mask, irif, flexible_action_set); + mlxsw_reg_rmft2_type_set(payload, MLXSW_REG_RMFT2_TYPE_IPV6); + mlxsw_reg_rmft2_dip6_memcpy_to(payload, (void *)&dip6); + mlxsw_reg_rmft2_dip6_mask_memcpy_to(payload, (void *)&dip6_mask); + mlxsw_reg_rmft2_sip6_memcpy_to(payload, (void *)&sip6); + mlxsw_reg_rmft2_sip6_mask_memcpy_to(payload, (void *)&sip6_mask); +} + +/* MFCR - Management Fan Control Register + * -------------------------------------- + * This register controls the settings of the Fan Speed PWM mechanism. + */ +#define MLXSW_REG_MFCR_ID 0x9001 +#define MLXSW_REG_MFCR_LEN 0x08 + +MLXSW_REG_DEFINE(mfcr, MLXSW_REG_MFCR_ID, MLXSW_REG_MFCR_LEN); + +enum mlxsw_reg_mfcr_pwm_frequency { + MLXSW_REG_MFCR_PWM_FEQ_11HZ = 0x00, + MLXSW_REG_MFCR_PWM_FEQ_14_7HZ = 0x01, + MLXSW_REG_MFCR_PWM_FEQ_22_1HZ = 0x02, + MLXSW_REG_MFCR_PWM_FEQ_1_4KHZ = 0x40, + MLXSW_REG_MFCR_PWM_FEQ_5KHZ = 0x41, + MLXSW_REG_MFCR_PWM_FEQ_20KHZ = 0x42, + MLXSW_REG_MFCR_PWM_FEQ_22_5KHZ = 0x43, + MLXSW_REG_MFCR_PWM_FEQ_25KHZ = 0x44, +}; + +/* reg_mfcr_pwm_frequency + * Controls the frequency of the PWM signal. + * Access: RW + */ +MLXSW_ITEM32(reg, mfcr, pwm_frequency, 0x00, 0, 7); + +#define MLXSW_MFCR_TACHOS_MAX 10 + +/* reg_mfcr_tacho_active + * Indicates which of the tachometer is active (bit per tachometer). + * Access: RO + */ +MLXSW_ITEM32(reg, mfcr, tacho_active, 0x04, 16, MLXSW_MFCR_TACHOS_MAX); + +#define MLXSW_MFCR_PWMS_MAX 5 + +/* reg_mfcr_pwm_active + * Indicates which of the PWM control is active (bit per PWM). 
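
An illustrative RMFT2 pack for an IPv4 (S,G) entry (sketch only; afa_block stands in for a flexible action set built elsewhere): all-ones masks request an exact match on both group and source, and the ingress RIF is checked.

static void example_rmft2_ipv4_sg(char *payload, u16 offset, u16 vr, u16 irif,
                                  u32 group, u32 source,
                                  const char *afa_block)
{
        mlxsw_reg_rmft2_ipv4_pack(payload, true, offset, vr,
                                  MLXSW_REG_RMFT2_IRIF_MASK_COMPARE, irif,
                                  group, 0xffffffff, source, 0xffffffff,
                                  afa_block);
}
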
+ * Access: RO + */ +MLXSW_ITEM32(reg, mfcr, pwm_active, 0x04, 0, MLXSW_MFCR_PWMS_MAX); + +static inline void +mlxsw_reg_mfcr_pack(char *payload, + enum mlxsw_reg_mfcr_pwm_frequency pwm_frequency) +{ + MLXSW_REG_ZERO(mfcr, payload); + mlxsw_reg_mfcr_pwm_frequency_set(payload, pwm_frequency); +} + +static inline void +mlxsw_reg_mfcr_unpack(char *payload, + enum mlxsw_reg_mfcr_pwm_frequency *p_pwm_frequency, + u16 *p_tacho_active, u8 *p_pwm_active) +{ + *p_pwm_frequency = mlxsw_reg_mfcr_pwm_frequency_get(payload); + *p_tacho_active = mlxsw_reg_mfcr_tacho_active_get(payload); + *p_pwm_active = mlxsw_reg_mfcr_pwm_active_get(payload); +} + +/* MFSC - Management Fan Speed Control Register + * -------------------------------------------- + * This register controls the settings of the Fan Speed PWM mechanism. + */ +#define MLXSW_REG_MFSC_ID 0x9002 +#define MLXSW_REG_MFSC_LEN 0x08 + +MLXSW_REG_DEFINE(mfsc, MLXSW_REG_MFSC_ID, MLXSW_REG_MFSC_LEN); + +/* reg_mfsc_pwm + * Fan pwm to control / monitor. + * Access: Index + */ +MLXSW_ITEM32(reg, mfsc, pwm, 0x00, 24, 3); + +/* reg_mfsc_pwm_duty_cycle + * Controls the duty cycle of the PWM. Value range from 0..255 to + * represent duty cycle of 0%...100%. + * Access: RW + */ +MLXSW_ITEM32(reg, mfsc, pwm_duty_cycle, 0x04, 0, 8); + +static inline void mlxsw_reg_mfsc_pack(char *payload, u8 pwm, + u8 pwm_duty_cycle) +{ + MLXSW_REG_ZERO(mfsc, payload); + mlxsw_reg_mfsc_pwm_set(payload, pwm); + mlxsw_reg_mfsc_pwm_duty_cycle_set(payload, pwm_duty_cycle); +} + +/* MFSM - Management Fan Speed Measurement + * --------------------------------------- + * This register controls the settings of the Tacho measurements and + * enables reading the Tachometer measurements. + */ +#define MLXSW_REG_MFSM_ID 0x9003 +#define MLXSW_REG_MFSM_LEN 0x08 + +MLXSW_REG_DEFINE(mfsm, MLXSW_REG_MFSM_ID, MLXSW_REG_MFSM_LEN); + +/* reg_mfsm_tacho + * Fan tachometer index. + * Access: Index + */ +MLXSW_ITEM32(reg, mfsm, tacho, 0x00, 24, 4); + +/* reg_mfsm_rpm + * Fan speed (round per minute). + * Access: RO + */ +MLXSW_ITEM32(reg, mfsm, rpm, 0x04, 0, 16); + +static inline void mlxsw_reg_mfsm_pack(char *payload, u8 tacho) +{ + MLXSW_REG_ZERO(mfsm, payload); + mlxsw_reg_mfsm_tacho_set(payload, tacho); +} + +/* MFSL - Management Fan Speed Limit Register + * ------------------------------------------ + * The Fan Speed Limit register is used to configure the fan speed + * event / interrupt notification mechanism. Fan speed threshold are + * defined for both under-speed and over-speed. + */ +#define MLXSW_REG_MFSL_ID 0x9004 +#define MLXSW_REG_MFSL_LEN 0x0C + +MLXSW_REG_DEFINE(mfsl, MLXSW_REG_MFSL_ID, MLXSW_REG_MFSL_LEN); + +/* reg_mfsl_tacho + * Fan tachometer index. + * Access: Index + */ +MLXSW_ITEM32(reg, mfsl, tacho, 0x00, 24, 4); + +/* reg_mfsl_tach_min + * Tachometer minimum value (minimum RPM). + * Access: RW + */ +MLXSW_ITEM32(reg, mfsl, tach_min, 0x04, 0, 16); + +/* reg_mfsl_tach_max + * Tachometer maximum value (maximum RPM). 
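
Fan control with the registers above takes two writes: MFCR picks the PWM frequency, MFSC sets a per-PWM duty cycle on a 0..255 scale. A brief sketch (the writes themselves are elided and the chosen values are only examples):

char mfcr_pl[MLXSW_REG_MFCR_LEN];
char mfsc_pl[MLXSW_REG_MFSC_LEN];

mlxsw_reg_mfcr_pack(mfcr_pl, MLXSW_REG_MFCR_PWM_FEQ_22_1HZ);
/* ... write MFCR ... */
mlxsw_reg_mfsc_pack(mfsc_pl, 0, 128);   /* PWM 0 at roughly 50% duty */
/* ... write MFSC ... */
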
+ * Access: RW + */ +MLXSW_ITEM32(reg, mfsl, tach_max, 0x08, 0, 16); + +static inline void mlxsw_reg_mfsl_pack(char *payload, u8 tacho, + u16 tach_min, u16 tach_max) +{ + MLXSW_REG_ZERO(mfsl, payload); + mlxsw_reg_mfsl_tacho_set(payload, tacho); + mlxsw_reg_mfsl_tach_min_set(payload, tach_min); + mlxsw_reg_mfsl_tach_max_set(payload, tach_max); +} + +static inline void mlxsw_reg_mfsl_unpack(char *payload, u8 tacho, + u16 *p_tach_min, u16 *p_tach_max) +{ + if (p_tach_min) + *p_tach_min = mlxsw_reg_mfsl_tach_min_get(payload); + + if (p_tach_max) + *p_tach_max = mlxsw_reg_mfsl_tach_max_get(payload); +} + +/* MTCAP - Management Temperature Capabilities + * ------------------------------------------- + * This register exposes the capabilities of the device and + * system temperature sensing. + */ +#define MLXSW_REG_MTCAP_ID 0x9009 +#define MLXSW_REG_MTCAP_LEN 0x08 + +MLXSW_REG_DEFINE(mtcap, MLXSW_REG_MTCAP_ID, MLXSW_REG_MTCAP_LEN); + +/* reg_mtcap_sensor_count + * Number of sensors supported by the device. + * This includes the QSFP module sensors (if exists in the QSFP module). + * Access: RO + */ +MLXSW_ITEM32(reg, mtcap, sensor_count, 0x00, 0, 7); + +/* MTMP - Management Temperature + * ----------------------------- + * This register controls the settings of the temperature measurements + * and enables reading the temperature measurements. Note that temperature + * is in 0.125 degrees Celsius. + */ +#define MLXSW_REG_MTMP_ID 0x900A +#define MLXSW_REG_MTMP_LEN 0x20 + +MLXSW_REG_DEFINE(mtmp, MLXSW_REG_MTMP_ID, MLXSW_REG_MTMP_LEN); + +/* reg_mtmp_sensor_index + * Sensors index to access. + * 64-127 of sensor_index are mapped to the SFP+/QSFP modules sequentially + * (module 0 is mapped to sensor_index 64). + * Access: Index + */ +MLXSW_ITEM32(reg, mtmp, sensor_index, 0x00, 0, 7); + +/* Convert to milli degrees Celsius */ +#define MLXSW_REG_MTMP_TEMP_TO_MC(val) (val * 125) + +/* reg_mtmp_temperature + * Temperature reading from the sensor. Reading is in 0.125 Celsius + * degrees units. + * Access: RO + */ +MLXSW_ITEM32(reg, mtmp, temperature, 0x04, 0, 16); + +/* reg_mtmp_mte + * Max Temperature Enable - enables measuring the max temperature on a sensor. + * Access: RW + */ +MLXSW_ITEM32(reg, mtmp, mte, 0x08, 31, 1); + +/* reg_mtmp_mtr + * Max Temperature Reset - clears the value of the max temperature register. + * Access: WO + */ +MLXSW_ITEM32(reg, mtmp, mtr, 0x08, 30, 1); + +/* reg_mtmp_max_temperature + * The highest measured temperature from the sensor. + * When the bit mte is cleared, the field max_temperature is reserved. + * Access: RO + */ +MLXSW_ITEM32(reg, mtmp, max_temperature, 0x08, 0, 16); + +/* reg_mtmp_tee + * Temperature Event Enable. + * 0 - Do not generate event + * 1 - Generate event + * 2 - Generate single event + * Access: RW + */ +MLXSW_ITEM32(reg, mtmp, tee, 0x0C, 30, 2); + +#define MLXSW_REG_MTMP_THRESH_HI 0x348 /* 105 Celsius */ + +/* reg_mtmp_temperature_threshold_hi + * High threshold for Temperature Warning Event. In 0.125 Celsius. + * Access: RW + */ +MLXSW_ITEM32(reg, mtmp, temperature_threshold_hi, 0x0C, 0, 16); + +/* reg_mtmp_temperature_threshold_lo + * Low threshold for Temperature Warning Event. In 0.125 Celsius. 
+ * Access: RW + */ +MLXSW_ITEM32(reg, mtmp, temperature_threshold_lo, 0x10, 0, 16); + +#define MLXSW_REG_MTMP_SENSOR_NAME_SIZE 8 + +/* reg_mtmp_sensor_name + * Sensor Name + * Access: RO + */ +MLXSW_ITEM_BUF(reg, mtmp, sensor_name, 0x18, MLXSW_REG_MTMP_SENSOR_NAME_SIZE); + +static inline void mlxsw_reg_mtmp_pack(char *payload, u8 sensor_index, + bool max_temp_enable, + bool max_temp_reset) +{ + MLXSW_REG_ZERO(mtmp, payload); + mlxsw_reg_mtmp_sensor_index_set(payload, sensor_index); + mlxsw_reg_mtmp_mte_set(payload, max_temp_enable); + mlxsw_reg_mtmp_mtr_set(payload, max_temp_reset); + mlxsw_reg_mtmp_temperature_threshold_hi_set(payload, + MLXSW_REG_MTMP_THRESH_HI); +} + +static inline void mlxsw_reg_mtmp_unpack(char *payload, unsigned int *p_temp, + unsigned int *p_max_temp, + char *sensor_name) +{ + u16 temp; + + if (p_temp) { + temp = mlxsw_reg_mtmp_temperature_get(payload); + *p_temp = MLXSW_REG_MTMP_TEMP_TO_MC(temp); + } + if (p_max_temp) { + temp = mlxsw_reg_mtmp_max_temperature_get(payload); + *p_max_temp = MLXSW_REG_MTMP_TEMP_TO_MC(temp); + } + if (sensor_name) + mlxsw_reg_mtmp_sensor_name_memcpy_from(payload, sensor_name); +} + +/* MCIA - Management Cable Info Access + * ----------------------------------- + * MCIA register is used to access the SFP+ and QSFP connector's EPROM. + */ + +#define MLXSW_REG_MCIA_ID 0x9014 +#define MLXSW_REG_MCIA_LEN 0x40 + +MLXSW_REG_DEFINE(mcia, MLXSW_REG_MCIA_ID, MLXSW_REG_MCIA_LEN); + +/* reg_mcia_l + * Lock bit. Setting this bit will lock the access to the specific + * cable. Used for updating a full page in a cable EPROM. Any access + * other then subsequence writes will fail while the port is locked. + * Access: RW + */ +MLXSW_ITEM32(reg, mcia, l, 0x00, 31, 1); + +/* reg_mcia_module + * Module number. + * Access: Index + */ +MLXSW_ITEM32(reg, mcia, module, 0x00, 16, 8); + +/* reg_mcia_status + * Module status. + * Access: RO + */ +MLXSW_ITEM32(reg, mcia, status, 0x00, 0, 8); + +/* reg_mcia_i2c_device_address + * I2C device address. + * Access: RW + */ +MLXSW_ITEM32(reg, mcia, i2c_device_address, 0x04, 24, 8); + +/* reg_mcia_page_number + * Page number. + * Access: RW + */ +MLXSW_ITEM32(reg, mcia, page_number, 0x04, 16, 8); + +/* reg_mcia_device_address + * Device address. + * Access: RW + */ +MLXSW_ITEM32(reg, mcia, device_address, 0x04, 0, 16); + +/* reg_mcia_size + * Number of bytes to read/write (up to 48 bytes). + * Access: RW + */ +MLXSW_ITEM32(reg, mcia, size, 0x08, 0, 16); + +#define MLXSW_SP_REG_MCIA_EEPROM_SIZE 48 + +/* reg_mcia_eeprom + * Bytes to read/write. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, mcia, eeprom, 0x10, MLXSW_SP_REG_MCIA_EEPROM_SIZE); + +static inline void mlxsw_reg_mcia_pack(char *payload, u8 module, u8 lock, + u8 page_number, u16 device_addr, + u8 size, u8 i2c_device_addr) +{ + MLXSW_REG_ZERO(mcia, payload); + mlxsw_reg_mcia_module_set(payload, module); + mlxsw_reg_mcia_l_set(payload, lock); + mlxsw_reg_mcia_page_number_set(payload, page_number); + mlxsw_reg_mcia_device_address_set(payload, device_addr); + mlxsw_reg_mcia_size_set(payload, size); + mlxsw_reg_mcia_i2c_device_address_set(payload, i2c_device_addr); +} + +/* MPAT - Monitoring Port Analyzer Table + * ------------------------------------- + * MPAT Register is used to query and configure the Switch PortAnalyzer Table. + * For an enabled analyzer, all fields except e (enable) cannot be modified. 
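
Reading a temperature sensor with MTMP is a pack/query/unpack round trip; the unpack helper already converts the raw 0.125 degree units into millidegrees Celsius via MLXSW_REG_MTMP_TEMP_TO_MC. A minimal sketch with the query elided and only the current reading requested:

static void example_mtmp_read_mc(char *payload, u8 sensor_index,
                                 unsigned int *p_temp_mc)
{
        /* Track the max temperature too, but do not reset it */
        mlxsw_reg_mtmp_pack(payload, sensor_index, true, false);
        /* ... issue the MTMP query here ... */
        mlxsw_reg_mtmp_unpack(payload, p_temp_mc, NULL, NULL);
}
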
+ */ +#define MLXSW_REG_MPAT_ID 0x901A +#define MLXSW_REG_MPAT_LEN 0x78 + +MLXSW_REG_DEFINE(mpat, MLXSW_REG_MPAT_ID, MLXSW_REG_MPAT_LEN); + +/* reg_mpat_pa_id + * Port Analyzer ID. + * Access: Index + */ +MLXSW_ITEM32(reg, mpat, pa_id, 0x00, 28, 4); + +/* reg_mpat_system_port + * A unique port identifier for the final destination of the packet. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, system_port, 0x00, 0, 16); + +/* reg_mpat_e + * Enable. Indicating the Port Analyzer is enabled. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, e, 0x04, 31, 1); + +/* reg_mpat_qos + * Quality Of Service Mode. + * 0: CONFIGURED - QoS parameters (Switch Priority, and encapsulation + * PCP, DEI, DSCP or VL) are configured. + * 1: MAINTAIN - QoS parameters (Switch Priority, Color) are the + * same as in the original packet that has triggered the mirroring. For + * SPAN also the pcp,dei are maintained. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, qos, 0x04, 26, 1); + +/* reg_mpat_be + * Best effort mode. Indicates mirroring traffic should not cause packet + * drop or back pressure, but will discard the mirrored packets. Mirrored + * packets will be forwarded on a best effort manner. + * 0: Do not discard mirrored packets + * 1: Discard mirrored packets if causing congestion + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, be, 0x04, 25, 1); + +enum mlxsw_reg_mpat_span_type { + /* Local SPAN Ethernet. + * The original packet is not encapsulated. + */ + MLXSW_REG_MPAT_SPAN_TYPE_LOCAL_ETH = 0x0, + + /* Encapsulated Remote SPAN Ethernet L3 GRE. + * The packet is encapsulated with GRE header. + */ + MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3 = 0x3, +}; + +/* reg_mpat_span_type + * SPAN type. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, span_type, 0x04, 0, 4); + +/* Remote SPAN - Ethernet VLAN + * - - - - - - - - - - - - - - + */ + +/* reg_mpat_eth_rspan_vid + * Encapsulation header VLAN ID. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, eth_rspan_vid, 0x18, 0, 12); + +/* Encapsulated Remote SPAN - Ethernet L2 + * - - - - - - - - - - - - - - - - - - - + */ + +enum mlxsw_reg_mpat_eth_rspan_version { + MLXSW_REG_MPAT_ETH_RSPAN_VERSION_NO_HEADER = 15, +}; + +/* reg_mpat_eth_rspan_version + * RSPAN mirror header version. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, eth_rspan_version, 0x10, 18, 4); + +/* reg_mpat_eth_rspan_mac + * Destination MAC address. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, mpat, eth_rspan_mac, 0x12, 6); + +/* reg_mpat_eth_rspan_tp + * Tag Packet. Indicates whether the mirroring header should be VLAN tagged. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, eth_rspan_tp, 0x18, 16, 1); + +/* Encapsulated Remote SPAN - Ethernet L3 + * - - - - - - - - - - - - - - - - - - - + */ + +enum mlxsw_reg_mpat_eth_rspan_protocol { + MLXSW_REG_MPAT_ETH_RSPAN_PROTOCOL_IPV4, + MLXSW_REG_MPAT_ETH_RSPAN_PROTOCOL_IPV6, +}; + +/* reg_mpat_eth_rspan_protocol + * SPAN encapsulation protocol. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, eth_rspan_protocol, 0x18, 24, 4); + +/* reg_mpat_eth_rspan_ttl + * Encapsulation header Time-to-Live/HopLimit. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, eth_rspan_ttl, 0x1C, 4, 8); + +/* reg_mpat_eth_rspan_smac + * Source MAC address + * Access: RW + */ +MLXSW_ITEM_BUF(reg, mpat, eth_rspan_smac, 0x22, 6); + +/* reg_mpat_eth_rspan_dip* + * Destination IP address. The IP version is configured by protocol. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, eth_rspan_dip4, 0x4C, 0, 32); +MLXSW_ITEM_BUF(reg, mpat, eth_rspan_dip6, 0x40, 16); + +/* reg_mpat_eth_rspan_sip* + * Source IP address. 
The IP version is configured by protocol. + * Access: RW + */ +MLXSW_ITEM32(reg, mpat, eth_rspan_sip4, 0x5C, 0, 32); +MLXSW_ITEM_BUF(reg, mpat, eth_rspan_sip6, 0x50, 16); + +static inline void mlxsw_reg_mpat_pack(char *payload, u8 pa_id, + u16 system_port, bool e, + enum mlxsw_reg_mpat_span_type span_type) +{ + MLXSW_REG_ZERO(mpat, payload); + mlxsw_reg_mpat_pa_id_set(payload, pa_id); + mlxsw_reg_mpat_system_port_set(payload, system_port); + mlxsw_reg_mpat_e_set(payload, e); + mlxsw_reg_mpat_qos_set(payload, 1); + mlxsw_reg_mpat_be_set(payload, 1); + mlxsw_reg_mpat_span_type_set(payload, span_type); +} + +static inline void mlxsw_reg_mpat_eth_rspan_pack(char *payload, u16 vid) +{ + mlxsw_reg_mpat_eth_rspan_vid_set(payload, vid); +} + +static inline void +mlxsw_reg_mpat_eth_rspan_l2_pack(char *payload, + enum mlxsw_reg_mpat_eth_rspan_version version, + const char *mac, + bool tp) +{ + mlxsw_reg_mpat_eth_rspan_version_set(payload, version); + mlxsw_reg_mpat_eth_rspan_mac_memcpy_to(payload, mac); + mlxsw_reg_mpat_eth_rspan_tp_set(payload, tp); +} + +static inline void +mlxsw_reg_mpat_eth_rspan_l3_ipv4_pack(char *payload, u8 ttl, + const char *smac, + u32 sip, u32 dip) +{ + mlxsw_reg_mpat_eth_rspan_ttl_set(payload, ttl); + mlxsw_reg_mpat_eth_rspan_smac_memcpy_to(payload, smac); + mlxsw_reg_mpat_eth_rspan_protocol_set(payload, + MLXSW_REG_MPAT_ETH_RSPAN_PROTOCOL_IPV4); + mlxsw_reg_mpat_eth_rspan_sip4_set(payload, sip); + mlxsw_reg_mpat_eth_rspan_dip4_set(payload, dip); +} + +static inline void +mlxsw_reg_mpat_eth_rspan_l3_ipv6_pack(char *payload, u8 ttl, + const char *smac, + struct in6_addr sip, struct in6_addr dip) +{ + mlxsw_reg_mpat_eth_rspan_ttl_set(payload, ttl); + mlxsw_reg_mpat_eth_rspan_smac_memcpy_to(payload, smac); + mlxsw_reg_mpat_eth_rspan_protocol_set(payload, + MLXSW_REG_MPAT_ETH_RSPAN_PROTOCOL_IPV6); + mlxsw_reg_mpat_eth_rspan_sip6_memcpy_to(payload, (void *)&sip); + mlxsw_reg_mpat_eth_rspan_dip6_memcpy_to(payload, (void *)&dip); +} + +/* MPAR - Monitoring Port Analyzer Register + * ---------------------------------------- + * MPAR register is used to query and configure the port analyzer port mirroring + * properties. + */ +#define MLXSW_REG_MPAR_ID 0x901B +#define MLXSW_REG_MPAR_LEN 0x08 + +MLXSW_REG_DEFINE(mpar, MLXSW_REG_MPAR_ID, MLXSW_REG_MPAR_LEN); + +/* reg_mpar_local_port + * The local port to mirror the packets from. + * Access: Index + */ +MLXSW_ITEM32(reg, mpar, local_port, 0x00, 16, 8); + +enum mlxsw_reg_mpar_i_e { + MLXSW_REG_MPAR_TYPE_EGRESS, + MLXSW_REG_MPAR_TYPE_INGRESS, +}; + +/* reg_mpar_i_e + * Ingress/Egress + * Access: Index + */ +MLXSW_ITEM32(reg, mpar, i_e, 0x00, 0, 4); + +/* reg_mpar_enable + * Enable mirroring + * By default, port mirroring is disabled for all ports. + * Access: RW + */ +MLXSW_ITEM32(reg, mpar, enable, 0x04, 31, 1); + +/* reg_mpar_pa_id + * Port Analyzer ID. + * Access: RW + */ +MLXSW_ITEM32(reg, mpar, pa_id, 0x04, 0, 4); + +static inline void mlxsw_reg_mpar_pack(char *payload, u8 local_port, + enum mlxsw_reg_mpar_i_e i_e, + bool enable, u8 pa_id) +{ + MLXSW_REG_ZERO(mpar, payload); + mlxsw_reg_mpar_local_port_set(payload, local_port); + mlxsw_reg_mpar_enable_set(payload, enable); + mlxsw_reg_mpar_i_e_set(payload, i_e); + mlxsw_reg_mpar_pa_id_set(payload, pa_id); +} + +/* MLCR - Management LED Control Register + * -------------------------------------- + * Controls the system LEDs. 
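
A mirroring session built from the two registers above, as an illustrative sketch: MPAT defines the analyzer (where mirrored packets go and how they are encapsulated), and MPAR then binds a mirrored port and direction to that analyzer. Plain local SPAN is assumed here, with the register writes omitted.

static void example_span_local_eth(char *mpat_pl, char *mpar_pl, u8 pa_id,
                                   u16 to_port, u8 from_port)
{
        /* Analyzer pa_id sends mirrored traffic to to_port, unencapsulated */
        mlxsw_reg_mpat_pack(mpat_pl, pa_id, to_port, true,
                            MLXSW_REG_MPAT_SPAN_TYPE_LOCAL_ETH);
        /* ... write MPAT, then bind the mirrored port ... */
        mlxsw_reg_mpar_pack(mpar_pl, from_port, MLXSW_REG_MPAR_TYPE_INGRESS,
                            true, pa_id);
}
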
+ */ +#define MLXSW_REG_MLCR_ID 0x902B +#define MLXSW_REG_MLCR_LEN 0x0C + +MLXSW_REG_DEFINE(mlcr, MLXSW_REG_MLCR_ID, MLXSW_REG_MLCR_LEN); + +/* reg_mlcr_local_port + * Local port number. + * Access: RW + */ +MLXSW_ITEM32(reg, mlcr, local_port, 0x00, 16, 8); + +#define MLXSW_REG_MLCR_DURATION_MAX 0xFFFF + +/* reg_mlcr_beacon_duration + * Duration of the beacon to be active, in seconds. + * 0x0 - Will turn off the beacon. + * 0xFFFF - Will turn on the beacon until explicitly turned off. + * Access: RW + */ +MLXSW_ITEM32(reg, mlcr, beacon_duration, 0x04, 0, 16); + +/* reg_mlcr_beacon_remain + * Remaining duration of the beacon, in seconds. + * 0xFFFF indicates an infinite amount of time. + * Access: RO + */ +MLXSW_ITEM32(reg, mlcr, beacon_remain, 0x08, 0, 16); + +static inline void mlxsw_reg_mlcr_pack(char *payload, u8 local_port, + bool active) +{ + MLXSW_REG_ZERO(mlcr, payload); + mlxsw_reg_mlcr_local_port_set(payload, local_port); + mlxsw_reg_mlcr_beacon_duration_set(payload, active ? + MLXSW_REG_MLCR_DURATION_MAX : 0); +} + +/* MCQI - Management Component Query Information + * --------------------------------------------- + * This register allows querying information about firmware components. + */ +#define MLXSW_REG_MCQI_ID 0x9061 +#define MLXSW_REG_MCQI_BASE_LEN 0x18 +#define MLXSW_REG_MCQI_CAP_LEN 0x14 +#define MLXSW_REG_MCQI_LEN (MLXSW_REG_MCQI_BASE_LEN + MLXSW_REG_MCQI_CAP_LEN) + +MLXSW_REG_DEFINE(mcqi, MLXSW_REG_MCQI_ID, MLXSW_REG_MCQI_LEN); + +/* reg_mcqi_component_index + * Index of the accessed component. + * Access: Index + */ +MLXSW_ITEM32(reg, mcqi, component_index, 0x00, 0, 16); + +enum mlxfw_reg_mcqi_info_type { + MLXSW_REG_MCQI_INFO_TYPE_CAPABILITIES, +}; + +/* reg_mcqi_info_type + * Component properties set. + * Access: RW + */ +MLXSW_ITEM32(reg, mcqi, info_type, 0x08, 0, 5); + +/* reg_mcqi_offset + * The requested/returned data offset from the section start, given in bytes. + * Must be DWORD aligned. + * Access: RW + */ +MLXSW_ITEM32(reg, mcqi, offset, 0x10, 0, 32); + +/* reg_mcqi_data_size + * The requested/returned data size, given in bytes. If data_size is not DWORD + * aligned, the last bytes are zero padded. + * Access: RW + */ +MLXSW_ITEM32(reg, mcqi, data_size, 0x14, 0, 16); + +/* reg_mcqi_cap_max_component_size + * Maximum size for this component, given in bytes. + * Access: RO + */ +MLXSW_ITEM32(reg, mcqi, cap_max_component_size, 0x20, 0, 32); + +/* reg_mcqi_cap_log_mcda_word_size + * Log 2 of the access word size in bytes. Read and write access must be aligned + * to the word size. Write access must be done for an integer number of words. 
+ * Access: RO + */ +MLXSW_ITEM32(reg, mcqi, cap_log_mcda_word_size, 0x24, 28, 4); + +/* reg_mcqi_cap_mcda_max_write_size + * Maximal write size for MCDA register + * Access: RO + */ +MLXSW_ITEM32(reg, mcqi, cap_mcda_max_write_size, 0x24, 0, 16); + +static inline void mlxsw_reg_mcqi_pack(char *payload, u16 component_index) +{ + MLXSW_REG_ZERO(mcqi, payload); + mlxsw_reg_mcqi_component_index_set(payload, component_index); + mlxsw_reg_mcqi_info_type_set(payload, + MLXSW_REG_MCQI_INFO_TYPE_CAPABILITIES); + mlxsw_reg_mcqi_offset_set(payload, 0); + mlxsw_reg_mcqi_data_size_set(payload, MLXSW_REG_MCQI_CAP_LEN); +} + +static inline void mlxsw_reg_mcqi_unpack(char *payload, + u32 *p_cap_max_component_size, + u8 *p_cap_log_mcda_word_size, + u16 *p_cap_mcda_max_write_size) +{ + *p_cap_max_component_size = + mlxsw_reg_mcqi_cap_max_component_size_get(payload); + *p_cap_log_mcda_word_size = + mlxsw_reg_mcqi_cap_log_mcda_word_size_get(payload); + *p_cap_mcda_max_write_size = + mlxsw_reg_mcqi_cap_mcda_max_write_size_get(payload); +} + +/* MCC - Management Component Control + * ---------------------------------- + * Controls the firmware component and updates the FSM. + */ +#define MLXSW_REG_MCC_ID 0x9062 +#define MLXSW_REG_MCC_LEN 0x1C + +MLXSW_REG_DEFINE(mcc, MLXSW_REG_MCC_ID, MLXSW_REG_MCC_LEN); + +enum mlxsw_reg_mcc_instruction { + MLXSW_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE = 0x01, + MLXSW_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE = 0x02, + MLXSW_REG_MCC_INSTRUCTION_UPDATE_COMPONENT = 0x03, + MLXSW_REG_MCC_INSTRUCTION_VERIFY_COMPONENT = 0x04, + MLXSW_REG_MCC_INSTRUCTION_ACTIVATE = 0x06, + MLXSW_REG_MCC_INSTRUCTION_CANCEL = 0x08, +}; + +/* reg_mcc_instruction + * Command to be executed by the FSM. + * Applicable for write operation only. + * Access: RW + */ +MLXSW_ITEM32(reg, mcc, instruction, 0x00, 0, 8); + +/* reg_mcc_component_index + * Index of the accessed component. Applicable only for commands that + * refer to components. Otherwise, this field is reserved. + * Access: Index + */ +MLXSW_ITEM32(reg, mcc, component_index, 0x04, 0, 16); + +/* reg_mcc_update_handle + * Token representing the current flow executed by the FSM. + * Access: WO + */ +MLXSW_ITEM32(reg, mcc, update_handle, 0x08, 0, 24); + +/* reg_mcc_error_code + * Indicates the successful completion of the instruction, or the reason it + * failed + * Access: RO + */ +MLXSW_ITEM32(reg, mcc, error_code, 0x0C, 8, 8); + +/* reg_mcc_control_state + * Current FSM state + * Access: RO + */ +MLXSW_ITEM32(reg, mcc, control_state, 0x0C, 0, 4); + +/* reg_mcc_component_size + * Component size in bytes. Valid for UPDATE_COMPONENT instruction. Specifying + * the size may shorten the update time. Value 0x0 means that size is + * unspecified. 
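+ * For example, the driver may pass here the byte length of the component it
+ * is about to stream through MCDA.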
+ * Access: WO + */ +MLXSW_ITEM32(reg, mcc, component_size, 0x10, 0, 32); + +static inline void mlxsw_reg_mcc_pack(char *payload, + enum mlxsw_reg_mcc_instruction instr, + u16 component_index, u32 update_handle, + u32 component_size) +{ + MLXSW_REG_ZERO(mcc, payload); + mlxsw_reg_mcc_instruction_set(payload, instr); + mlxsw_reg_mcc_component_index_set(payload, component_index); + mlxsw_reg_mcc_update_handle_set(payload, update_handle); + mlxsw_reg_mcc_component_size_set(payload, component_size); +} + +static inline void mlxsw_reg_mcc_unpack(char *payload, u32 *p_update_handle, + u8 *p_error_code, u8 *p_control_state) +{ + if (p_update_handle) + *p_update_handle = mlxsw_reg_mcc_update_handle_get(payload); + if (p_error_code) + *p_error_code = mlxsw_reg_mcc_error_code_get(payload); + if (p_control_state) + *p_control_state = mlxsw_reg_mcc_control_state_get(payload); +} + +/* MCDA - Management Component Data Access + * --------------------------------------- + * This register allows reading and writing a firmware component. + */ +#define MLXSW_REG_MCDA_ID 0x9063 +#define MLXSW_REG_MCDA_BASE_LEN 0x10 +#define MLXSW_REG_MCDA_MAX_DATA_LEN 0x80 +#define MLXSW_REG_MCDA_LEN \ + (MLXSW_REG_MCDA_BASE_LEN + MLXSW_REG_MCDA_MAX_DATA_LEN) + +MLXSW_REG_DEFINE(mcda, MLXSW_REG_MCDA_ID, MLXSW_REG_MCDA_LEN); + +/* reg_mcda_update_handle + * Token representing the current flow executed by the FSM. + * Access: RW + */ +MLXSW_ITEM32(reg, mcda, update_handle, 0x00, 0, 24); + +/* reg_mcda_offset + * Offset of accessed address relative to component start. Accesses must be in + * accordance to log_mcda_word_size in MCQI reg. + * Access: RW + */ +MLXSW_ITEM32(reg, mcda, offset, 0x04, 0, 32); + +/* reg_mcda_size + * Size of the data accessed, given in bytes. + * Access: RW + */ +MLXSW_ITEM32(reg, mcda, size, 0x08, 0, 16); + +/* reg_mcda_data + * Data block accessed. + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, mcda, data, 0x10, 0, 32, 4, 0, false); + +static inline void mlxsw_reg_mcda_pack(char *payload, u32 update_handle, + u32 offset, u16 size, u8 *data) +{ + int i; + + MLXSW_REG_ZERO(mcda, payload); + mlxsw_reg_mcda_update_handle_set(payload, update_handle); + mlxsw_reg_mcda_offset_set(payload, offset); + mlxsw_reg_mcda_size_set(payload, size); + + for (i = 0; i < size / 4; i++) + mlxsw_reg_mcda_data_set(payload, i, *(u32 *) &data[i * 4]); +} + +/* MPSC - Monitoring Packet Sampling Configuration Register + * -------------------------------------------------------- + * MPSC Register is used to configure the Packet Sampling mechanism. + */ +#define MLXSW_REG_MPSC_ID 0x9080 +#define MLXSW_REG_MPSC_LEN 0x1C + +MLXSW_REG_DEFINE(mpsc, MLXSW_REG_MPSC_ID, MLXSW_REG_MPSC_LEN); + +/* reg_mpsc_local_port + * Local port number + * Not supported for CPU port + * Access: Index + */ +MLXSW_ITEM32(reg, mpsc, local_port, 0x00, 16, 8); + +/* reg_mpsc_e + * Enable sampling on port local_port + * Access: RW + */ +MLXSW_ITEM32(reg, mpsc, e, 0x04, 30, 1); + +#define MLXSW_REG_MPSC_RATE_MAX 3500000000UL + +/* reg_mpsc_rate + * Sampling rate = 1 out of rate packets (with randomization around + * the point). 
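+ * For example, a rate of 100 means that roughly one of every 100 packets on
+ * the port is sampled.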
Valid values are: 1 to MLXSW_REG_MPSC_RATE_MAX + * Access: RW + */ +MLXSW_ITEM32(reg, mpsc, rate, 0x08, 0, 32); + +static inline void mlxsw_reg_mpsc_pack(char *payload, u8 local_port, bool e, + u32 rate) +{ + MLXSW_REG_ZERO(mpsc, payload); + mlxsw_reg_mpsc_local_port_set(payload, local_port); + mlxsw_reg_mpsc_e_set(payload, e); + mlxsw_reg_mpsc_rate_set(payload, rate); +} + +/* MGPC - Monitoring General Purpose Counter Set Register + * The MGPC register retrieves and sets the General Purpose Counter Set. + */ +#define MLXSW_REG_MGPC_ID 0x9081 +#define MLXSW_REG_MGPC_LEN 0x18 + +MLXSW_REG_DEFINE(mgpc, MLXSW_REG_MGPC_ID, MLXSW_REG_MGPC_LEN); + +/* reg_mgpc_counter_set_type + * Counter set type. + * Access: OP + */ +MLXSW_ITEM32(reg, mgpc, counter_set_type, 0x00, 24, 8); + +/* reg_mgpc_counter_index + * Counter index. + * Access: Index + */ +MLXSW_ITEM32(reg, mgpc, counter_index, 0x00, 0, 24); + +enum mlxsw_reg_mgpc_opcode { + /* Nop */ + MLXSW_REG_MGPC_OPCODE_NOP = 0x00, + /* Clear counters */ + MLXSW_REG_MGPC_OPCODE_CLEAR = 0x08, +}; + +/* reg_mgpc_opcode + * Opcode. + * Access: OP + */ +MLXSW_ITEM32(reg, mgpc, opcode, 0x04, 28, 4); + +/* reg_mgpc_byte_counter + * Byte counter value. + * Access: RW + */ +MLXSW_ITEM64(reg, mgpc, byte_counter, 0x08, 0, 64); + +/* reg_mgpc_packet_counter + * Packet counter value. + * Access: RW + */ +MLXSW_ITEM64(reg, mgpc, packet_counter, 0x10, 0, 64); + +static inline void mlxsw_reg_mgpc_pack(char *payload, u32 counter_index, + enum mlxsw_reg_mgpc_opcode opcode, + enum mlxsw_reg_flow_counter_set_type set_type) +{ + MLXSW_REG_ZERO(mgpc, payload); + mlxsw_reg_mgpc_counter_index_set(payload, counter_index); + mlxsw_reg_mgpc_counter_set_type_set(payload, set_type); + mlxsw_reg_mgpc_opcode_set(payload, opcode); +} + +/* TIGCR - Tunneling IPinIP General Configuration Register + * ------------------------------------------------------- + * The TIGCR register is used for setting up the IPinIP Tunnel configuration. + */ +#define MLXSW_REG_TIGCR_ID 0xA801 +#define MLXSW_REG_TIGCR_LEN 0x10 + +MLXSW_REG_DEFINE(tigcr, MLXSW_REG_TIGCR_ID, MLXSW_REG_TIGCR_LEN); + +/* reg_tigcr_ipip_ttlc + * For IPinIP Tunnel encapsulation: whether to copy the ttl from the packet + * header. + * Access: RW + */ +MLXSW_ITEM32(reg, tigcr, ttlc, 0x04, 8, 1); + +/* reg_tigcr_ipip_ttl_uc + * The TTL for IPinIP Tunnel encapsulation of unicast packets if + * reg_tigcr_ipip_ttlc is unset. + * Access: RW + */ +MLXSW_ITEM32(reg, tigcr, ttl_uc, 0x04, 0, 8); + +static inline void mlxsw_reg_tigcr_pack(char *payload, bool ttlc, u8 ttl_uc) +{ + MLXSW_REG_ZERO(tigcr, payload); + mlxsw_reg_tigcr_ttlc_set(payload, ttlc); + mlxsw_reg_tigcr_ttl_uc_set(payload, ttl_uc); +} + +/* SBPR - Shared Buffer Pools Register + * ----------------------------------- + * The SBPR configures and retrieves the shared buffer pools and configuration. + */ +#define MLXSW_REG_SBPR_ID 0xB001 +#define MLXSW_REG_SBPR_LEN 0x14 + +MLXSW_REG_DEFINE(sbpr, MLXSW_REG_SBPR_ID, MLXSW_REG_SBPR_LEN); + +/* shared direstion enum for SBPR, SBCM, SBPM */ +enum mlxsw_reg_sbxx_dir { + MLXSW_REG_SBXX_DIR_INGRESS, + MLXSW_REG_SBXX_DIR_EGRESS, +}; + +/* reg_sbpr_dir + * Direction. + * Access: Index + */ +MLXSW_ITEM32(reg, sbpr, dir, 0x00, 24, 2); + +/* reg_sbpr_pool + * Pool index. + * Access: Index + */ +MLXSW_ITEM32(reg, sbpr, pool, 0x00, 0, 4); + +/* reg_sbpr_size + * Pool size in buffer cells. 
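+ * The unit is cells rather than bytes; the cell size in bytes is reported by
+ * the CELL_SIZE resource, so byte quotas must be converted to cells before
+ * packing.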
+ * Access: RW + */ +MLXSW_ITEM32(reg, sbpr, size, 0x04, 0, 24); + +enum mlxsw_reg_sbpr_mode { + MLXSW_REG_SBPR_MODE_STATIC, + MLXSW_REG_SBPR_MODE_DYNAMIC, +}; + +/* reg_sbpr_mode + * Pool quota calculation mode. + * Access: RW + */ +MLXSW_ITEM32(reg, sbpr, mode, 0x08, 0, 4); + +static inline void mlxsw_reg_sbpr_pack(char *payload, u8 pool, + enum mlxsw_reg_sbxx_dir dir, + enum mlxsw_reg_sbpr_mode mode, u32 size) +{ + MLXSW_REG_ZERO(sbpr, payload); + mlxsw_reg_sbpr_pool_set(payload, pool); + mlxsw_reg_sbpr_dir_set(payload, dir); + mlxsw_reg_sbpr_mode_set(payload, mode); + mlxsw_reg_sbpr_size_set(payload, size); +} + +/* SBCM - Shared Buffer Class Management Register + * ---------------------------------------------- + * The SBCM register configures and retrieves the shared buffer allocation + * and configuration according to Port-PG, including the binding to pool + * and definition of the associated quota. + */ +#define MLXSW_REG_SBCM_ID 0xB002 +#define MLXSW_REG_SBCM_LEN 0x28 + +MLXSW_REG_DEFINE(sbcm, MLXSW_REG_SBCM_ID, MLXSW_REG_SBCM_LEN); + +/* reg_sbcm_local_port + * Local port number. + * For Ingress: excludes CPU port and Router port + * For Egress: excludes IP Router + * Access: Index + */ +MLXSW_ITEM32(reg, sbcm, local_port, 0x00, 16, 8); + +/* reg_sbcm_pg_buff + * PG buffer - Port PG (dir=ingress) / traffic class (dir=egress) + * For PG buffer: range is 0..cap_max_pg_buffers - 1 + * For traffic class: range is 0..cap_max_tclass - 1 + * Note that when traffic class is in MC aware mode then the traffic + * classes which are MC aware cannot be configured. + * Access: Index + */ +MLXSW_ITEM32(reg, sbcm, pg_buff, 0x00, 8, 6); + +/* reg_sbcm_dir + * Direction. + * Access: Index + */ +MLXSW_ITEM32(reg, sbcm, dir, 0x00, 0, 2); + +/* reg_sbcm_min_buff + * Minimum buffer size for the limiter, in cells. + * Access: RW + */ +MLXSW_ITEM32(reg, sbcm, min_buff, 0x18, 0, 24); + +/* shared max_buff limits for dynamic threshold for SBCM, SBPM */ +#define MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN 1 +#define MLXSW_REG_SBXX_DYN_MAX_BUFF_MAX 14 + +/* reg_sbcm_max_buff + * When the pool associated to the port-pg/tclass is configured to + * static, Maximum buffer size for the limiter configured in cells. + * When the pool associated to the port-pg/tclass is configured to + * dynamic, the max_buff holds the "alpha" parameter, supporting + * the following values: + * 0: 0 + * i: (1/128)*2^(i-1), for i=1..14 + * 0xFF: Infinity + * Access: RW + */ +MLXSW_ITEM32(reg, sbcm, max_buff, 0x1C, 0, 24); + +/* reg_sbcm_pool + * Association of the port-priority to a pool. + * Access: RW + */ +MLXSW_ITEM32(reg, sbcm, pool, 0x24, 0, 4); + +static inline void mlxsw_reg_sbcm_pack(char *payload, u8 local_port, u8 pg_buff, + enum mlxsw_reg_sbxx_dir dir, + u32 min_buff, u32 max_buff, u8 pool) +{ + MLXSW_REG_ZERO(sbcm, payload); + mlxsw_reg_sbcm_local_port_set(payload, local_port); + mlxsw_reg_sbcm_pg_buff_set(payload, pg_buff); + mlxsw_reg_sbcm_dir_set(payload, dir); + mlxsw_reg_sbcm_min_buff_set(payload, min_buff); + mlxsw_reg_sbcm_max_buff_set(payload, max_buff); + mlxsw_reg_sbcm_pool_set(payload, pool); +} + +/* SBPM - Shared Buffer Port Management Register + * --------------------------------------------- + * The SBPM register configures and retrieves the shared buffer allocation + * and configuration according to Port-Pool, including the definition + * of the associated quota. 
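+ * The register also reports the current and maximum buffer occupancy of the
+ * queried {local_port, pool, dir}, extracted by mlxsw_reg_sbpm_unpack() below.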
+ */ +#define MLXSW_REG_SBPM_ID 0xB003 +#define MLXSW_REG_SBPM_LEN 0x28 + +MLXSW_REG_DEFINE(sbpm, MLXSW_REG_SBPM_ID, MLXSW_REG_SBPM_LEN); + +/* reg_sbpm_local_port + * Local port number. + * For Ingress: excludes CPU port and Router port + * For Egress: excludes IP Router + * Access: Index + */ +MLXSW_ITEM32(reg, sbpm, local_port, 0x00, 16, 8); + +/* reg_sbpm_pool + * The pool associated to quota counting on the local_port. + * Access: Index + */ +MLXSW_ITEM32(reg, sbpm, pool, 0x00, 8, 4); + +/* reg_sbpm_dir + * Direction. + * Access: Index + */ +MLXSW_ITEM32(reg, sbpm, dir, 0x00, 0, 2); + +/* reg_sbpm_buff_occupancy + * Current buffer occupancy in cells. + * Access: RO + */ +MLXSW_ITEM32(reg, sbpm, buff_occupancy, 0x10, 0, 24); + +/* reg_sbpm_clr + * Clear Max Buffer Occupancy + * When this bit is set, max_buff_occupancy field is cleared (and a + * new max value is tracked from the time the clear was performed). + * Access: OP + */ +MLXSW_ITEM32(reg, sbpm, clr, 0x14, 31, 1); + +/* reg_sbpm_max_buff_occupancy + * Maximum value of buffer occupancy in cells monitored. Cleared by + * writing to the clr field. + * Access: RO + */ +MLXSW_ITEM32(reg, sbpm, max_buff_occupancy, 0x14, 0, 24); + +/* reg_sbpm_min_buff + * Minimum buffer size for the limiter, in cells. + * Access: RW + */ +MLXSW_ITEM32(reg, sbpm, min_buff, 0x18, 0, 24); + +/* reg_sbpm_max_buff + * When the pool associated to the port-pg/tclass is configured to + * static, Maximum buffer size for the limiter configured in cells. + * When the pool associated to the port-pg/tclass is configured to + * dynamic, the max_buff holds the "alpha" parameter, supporting + * the following values: + * 0: 0 + * i: (1/128)*2^(i-1), for i=1..14 + * 0xFF: Infinity + * Access: RW + */ +MLXSW_ITEM32(reg, sbpm, max_buff, 0x1C, 0, 24); + +static inline void mlxsw_reg_sbpm_pack(char *payload, u8 local_port, u8 pool, + enum mlxsw_reg_sbxx_dir dir, bool clr, + u32 min_buff, u32 max_buff) +{ + MLXSW_REG_ZERO(sbpm, payload); + mlxsw_reg_sbpm_local_port_set(payload, local_port); + mlxsw_reg_sbpm_pool_set(payload, pool); + mlxsw_reg_sbpm_dir_set(payload, dir); + mlxsw_reg_sbpm_clr_set(payload, clr); + mlxsw_reg_sbpm_min_buff_set(payload, min_buff); + mlxsw_reg_sbpm_max_buff_set(payload, max_buff); +} + +static inline void mlxsw_reg_sbpm_unpack(char *payload, u32 *p_buff_occupancy, + u32 *p_max_buff_occupancy) +{ + *p_buff_occupancy = mlxsw_reg_sbpm_buff_occupancy_get(payload); + *p_max_buff_occupancy = mlxsw_reg_sbpm_max_buff_occupancy_get(payload); +} + +/* SBMM - Shared Buffer Multicast Management Register + * -------------------------------------------------- + * The SBMM register configures and retrieves the shared buffer allocation + * and configuration for MC packets according to Switch-Priority, including + * the binding to pool and definition of the associated quota. + */ +#define MLXSW_REG_SBMM_ID 0xB004 +#define MLXSW_REG_SBMM_LEN 0x28 + +MLXSW_REG_DEFINE(sbmm, MLXSW_REG_SBMM_ID, MLXSW_REG_SBMM_LEN); + +/* reg_sbmm_prio + * Switch Priority. + * Access: Index + */ +MLXSW_ITEM32(reg, sbmm, prio, 0x00, 8, 4); + +/* reg_sbmm_min_buff + * Minimum buffer size for the limiter, in cells. + * Access: RW + */ +MLXSW_ITEM32(reg, sbmm, min_buff, 0x18, 0, 24); + +/* reg_sbmm_max_buff + * When the pool associated to the port-pg/tclass is configured to + * static, Maximum buffer size for the limiter configured in cells. 
+ * When the pool associated to the port-pg/tclass is configured to + * dynamic, the max_buff holds the "alpha" parameter, supporting + * the following values: + * 0: 0 + * i: (1/128)*2^(i-1), for i=1..14 + * 0xFF: Infinity + * Access: RW + */ +MLXSW_ITEM32(reg, sbmm, max_buff, 0x1C, 0, 24); + +/* reg_sbmm_pool + * Association of the port-priority to a pool. + * Access: RW + */ +MLXSW_ITEM32(reg, sbmm, pool, 0x24, 0, 4); + +static inline void mlxsw_reg_sbmm_pack(char *payload, u8 prio, u32 min_buff, + u32 max_buff, u8 pool) +{ + MLXSW_REG_ZERO(sbmm, payload); + mlxsw_reg_sbmm_prio_set(payload, prio); + mlxsw_reg_sbmm_min_buff_set(payload, min_buff); + mlxsw_reg_sbmm_max_buff_set(payload, max_buff); + mlxsw_reg_sbmm_pool_set(payload, pool); +} + +/* SBSR - Shared Buffer Status Register + * ------------------------------------ + * The SBSR register retrieves the shared buffer occupancy according to + * Port-Pool. Note that this register enables reading a large amount of data. + * It is the user's responsibility to limit the amount of data to ensure the + * response can match the maximum transfer unit. In case the response exceeds + * the maximum transport unit, it will be truncated with no special notice. + */ +#define MLXSW_REG_SBSR_ID 0xB005 +#define MLXSW_REG_SBSR_BASE_LEN 0x5C /* base length, without records */ +#define MLXSW_REG_SBSR_REC_LEN 0x8 /* record length */ +#define MLXSW_REG_SBSR_REC_MAX_COUNT 120 +#define MLXSW_REG_SBSR_LEN (MLXSW_REG_SBSR_BASE_LEN + \ + MLXSW_REG_SBSR_REC_LEN * \ + MLXSW_REG_SBSR_REC_MAX_COUNT) + +MLXSW_REG_DEFINE(sbsr, MLXSW_REG_SBSR_ID, MLXSW_REG_SBSR_LEN); + +/* reg_sbsr_clr + * Clear Max Buffer Occupancy. When this bit is set, the max_buff_occupancy + * field is cleared (and a new max value is tracked from the time the clear + * was performed). + * Access: OP + */ +MLXSW_ITEM32(reg, sbsr, clr, 0x00, 31, 1); + +/* reg_sbsr_ingress_port_mask + * Bit vector for all ingress network ports. + * Indicates which of the ports (for which the relevant bit is set) + * are affected by the set operation. Configuration of any other port + * does not change. + * Access: Index + */ +MLXSW_ITEM_BIT_ARRAY(reg, sbsr, ingress_port_mask, 0x10, 0x20, 1); + +/* reg_sbsr_pg_buff_mask + * Bit vector for all switch priority groups. + * Indicates which of the priorities (for which the relevant bit is set) + * are affected by the set operation. Configuration of any other priority + * does not change. + * Range is 0..cap_max_pg_buffers - 1 + * Access: Index + */ +MLXSW_ITEM_BIT_ARRAY(reg, sbsr, pg_buff_mask, 0x30, 0x4, 1); + +/* reg_sbsr_egress_port_mask + * Bit vector for all egress network ports. + * Indicates which of the ports (for which the relevant bit is set) + * are affected by the set operation. Configuration of any other port + * does not change. + * Access: Index + */ +MLXSW_ITEM_BIT_ARRAY(reg, sbsr, egress_port_mask, 0x34, 0x20, 1); + +/* reg_sbsr_tclass_mask + * Bit vector for all traffic classes. + * Indicates which of the traffic classes (for which the relevant bit is + * set) are affected by the set operation. Configuration of any other + * traffic class does not change. + * Range is 0..cap_max_tclass - 1 + * Access: Index + */ +MLXSW_ITEM_BIT_ARRAY(reg, sbsr, tclass_mask, 0x54, 0x8, 1); + +static inline void mlxsw_reg_sbsr_pack(char *payload, bool clr) +{ + MLXSW_REG_ZERO(sbsr, payload); + mlxsw_reg_sbsr_clr_set(payload, clr); +} + +/* reg_sbsr_rec_buff_occupancy + * Current buffer occupancy in cells. 
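+ * Up to MLXSW_REG_SBSR_REC_MAX_COUNT records may be returned; individual
+ * records are read with mlxsw_reg_sbsr_rec_unpack() below.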
+ * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, sbsr, rec_buff_occupancy, MLXSW_REG_SBSR_BASE_LEN, + 0, 24, MLXSW_REG_SBSR_REC_LEN, 0x00, false); + +/* reg_sbsr_rec_max_buff_occupancy + * Maximum value of buffer occupancy in cells monitored. Cleared by + * writing to the clr field. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, sbsr, rec_max_buff_occupancy, MLXSW_REG_SBSR_BASE_LEN, + 0, 24, MLXSW_REG_SBSR_REC_LEN, 0x04, false); + +static inline void mlxsw_reg_sbsr_rec_unpack(char *payload, int rec_index, + u32 *p_buff_occupancy, + u32 *p_max_buff_occupancy) +{ + *p_buff_occupancy = + mlxsw_reg_sbsr_rec_buff_occupancy_get(payload, rec_index); + *p_max_buff_occupancy = + mlxsw_reg_sbsr_rec_max_buff_occupancy_get(payload, rec_index); +} + +/* SBIB - Shared Buffer Internal Buffer Register + * --------------------------------------------- + * The SBIB register configures per port buffers for internal use. The internal + * buffers consume memory on the port buffers (note that the port buffers are + * used also by PBMC). + * + * For Spectrum this is used for egress mirroring. + */ +#define MLXSW_REG_SBIB_ID 0xB006 +#define MLXSW_REG_SBIB_LEN 0x10 + +MLXSW_REG_DEFINE(sbib, MLXSW_REG_SBIB_ID, MLXSW_REG_SBIB_LEN); + +/* reg_sbib_local_port + * Local port number + * Not supported for CPU port and router port + * Access: Index + */ +MLXSW_ITEM32(reg, sbib, local_port, 0x00, 16, 8); + +/* reg_sbib_buff_size + * Units represented in cells + * Allowed range is 0 to (cap_max_headroom_size - 1) + * Default is 0 + * Access: RW + */ +MLXSW_ITEM32(reg, sbib, buff_size, 0x08, 0, 24); + +static inline void mlxsw_reg_sbib_pack(char *payload, u8 local_port, + u32 buff_size) +{ + MLXSW_REG_ZERO(sbib, payload); + mlxsw_reg_sbib_local_port_set(payload, local_port); + mlxsw_reg_sbib_buff_size_set(payload, buff_size); +} + +static const struct mlxsw_reg_info *mlxsw_reg_infos[] = { + MLXSW_REG(sgcr), + MLXSW_REG(spad), + MLXSW_REG(smid), + MLXSW_REG(sspr), + MLXSW_REG(sfdat), + MLXSW_REG(sfd), + MLXSW_REG(sfn), + MLXSW_REG(spms), + MLXSW_REG(spvid), + MLXSW_REG(spvm), + MLXSW_REG(spaft), + MLXSW_REG(sfgc), + MLXSW_REG(sftr), + MLXSW_REG(sfdf), + MLXSW_REG(sldr), + MLXSW_REG(slcr), + MLXSW_REG(slcor), + MLXSW_REG(spmlr), + MLXSW_REG(svfa), + MLXSW_REG(svpe), + MLXSW_REG(sfmr), + MLXSW_REG(spvmlr), + MLXSW_REG(cwtp), + MLXSW_REG(cwtpm), + MLXSW_REG(ppbt), + MLXSW_REG(pacl), + MLXSW_REG(pagt), + MLXSW_REG(ptar), + MLXSW_REG(ppbs), + MLXSW_REG(prcr), + MLXSW_REG(pefa), + MLXSW_REG(ptce2), + MLXSW_REG(qpcr), + MLXSW_REG(qtct), + MLXSW_REG(qeec), + MLXSW_REG(pmlp), + MLXSW_REG(pmtu), + MLXSW_REG(ptys), + MLXSW_REG(ppad), + MLXSW_REG(paos), + MLXSW_REG(pfcc), + MLXSW_REG(ppcnt), + MLXSW_REG(plib), + MLXSW_REG(pptb), + MLXSW_REG(pbmc), + MLXSW_REG(pspa), + MLXSW_REG(htgt), + MLXSW_REG(hpkt), + MLXSW_REG(rgcr), + MLXSW_REG(ritr), + MLXSW_REG(rtar), + MLXSW_REG(ratr), + MLXSW_REG(rtdp), + MLXSW_REG(rdpm), + MLXSW_REG(ricnt), + MLXSW_REG(rrcr), + MLXSW_REG(ralta), + MLXSW_REG(ralst), + MLXSW_REG(raltb), + MLXSW_REG(ralue), + MLXSW_REG(rauht), + MLXSW_REG(raleu), + MLXSW_REG(rauhtd), + MLXSW_REG(rigr2), + MLXSW_REG(recr2), + MLXSW_REG(rmft2), + MLXSW_REG(mfcr), + MLXSW_REG(mfsc), + MLXSW_REG(mfsm), + MLXSW_REG(mfsl), + MLXSW_REG(mtcap), + MLXSW_REG(mtmp), + MLXSW_REG(mcia), + MLXSW_REG(mpat), + MLXSW_REG(mpar), + MLXSW_REG(mlcr), + MLXSW_REG(mpsc), + MLXSW_REG(mcqi), + MLXSW_REG(mcc), + MLXSW_REG(mcda), + MLXSW_REG(mgpc), + MLXSW_REG(tigcr), + MLXSW_REG(sbpr), + MLXSW_REG(sbcm), + MLXSW_REG(sbpm), + MLXSW_REG(sbmm), + 
MLXSW_REG(sbsr), + MLXSW_REG(sbib), +}; + +static inline const char *mlxsw_reg_id_str(u16 reg_id) +{ + const struct mlxsw_reg_info *reg_info; + int i; + + for (i = 0; i < ARRAY_SIZE(mlxsw_reg_infos); i++) { + reg_info = mlxsw_reg_infos[i]; + if (reg_info->id == reg_id) + return reg_info->name; + } + return "*UNKNOWN*"; +} + +/* PUDE - Port Up / Down Event + * --------------------------- + * Reports the operational state change of a port. + */ +#define MLXSW_REG_PUDE_LEN 0x10 + +/* reg_pude_swid + * Switch partition ID with which to associate the port. + * Access: Index + */ +MLXSW_ITEM32(reg, pude, swid, 0x00, 24, 8); + +/* reg_pude_local_port + * Local port number. + * Access: Index + */ +MLXSW_ITEM32(reg, pude, local_port, 0x00, 16, 8); + +/* reg_pude_admin_status + * Port administrative state (the desired state). + * 1 - Up. + * 2 - Down. + * 3 - Up once. This means that in case of link failure, the port won't go + * into polling mode, but will wait to be re-enabled by software. + * 4 - Disabled by system. Can only be set by hardware. + * Access: RO + */ +MLXSW_ITEM32(reg, pude, admin_status, 0x00, 8, 4); + +/* reg_pude_oper_status + * Port operatioanl state. + * 1 - Up. + * 2 - Down. + * 3 - Down by port failure. This means that the device will not let the + * port up again until explicitly specified by software. + * Access: RO + */ +MLXSW_ITEM32(reg, pude, oper_status, 0x00, 0, 4); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/resources.h b/drivers/net/ethernet/mellanox/mlxsw/resources.h new file mode 100644 index 0000000..087aad5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/resources.h @@ -0,0 +1,155 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/resources.h + * Copyright (c) 2016-2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _MLXSW_RESOURCES_H +#define _MLXSW_RESOURCES_H + +#include +#include + +enum mlxsw_res_id { + MLXSW_RES_ID_KVD_SIZE, + MLXSW_RES_ID_KVD_SINGLE_MIN_SIZE, + MLXSW_RES_ID_KVD_DOUBLE_MIN_SIZE, + MLXSW_RES_ID_MAX_TRAP_GROUPS, + MLXSW_RES_ID_COUNTER_POOL_SIZE, + MLXSW_RES_ID_MAX_SPAN, + MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES, + MLXSW_RES_ID_COUNTER_SIZE_ROUTER_BASIC, + MLXSW_RES_ID_MAX_SYSTEM_PORT, + MLXSW_RES_ID_MAX_LAG, + MLXSW_RES_ID_MAX_LAG_MEMBERS, + MLXSW_RES_ID_MAX_BUFFER_SIZE, + MLXSW_RES_ID_CELL_SIZE, + MLXSW_RES_ID_ACL_MAX_TCAM_REGIONS, + MLXSW_RES_ID_ACL_MAX_TCAM_RULES, + MLXSW_RES_ID_ACL_MAX_REGIONS, + MLXSW_RES_ID_ACL_MAX_GROUPS, + MLXSW_RES_ID_ACL_MAX_GROUP_SIZE, + MLXSW_RES_ID_ACL_FLEX_KEYS, + MLXSW_RES_ID_ACL_MAX_ACTION_PER_RULE, + MLXSW_RES_ID_ACL_ACTIONS_PER_SET, + MLXSW_RES_ID_MAX_CPU_POLICERS, + MLXSW_RES_ID_MAX_VRS, + MLXSW_RES_ID_MAX_RIFS, + MLXSW_RES_ID_MC_ERIF_LIST_ENTRIES, + MLXSW_RES_ID_MAX_LPM_TREES, + + /* Internal resources. + * Determined by the SW, not queried from the HW. + */ + MLXSW_RES_ID_KVD_SINGLE_SIZE, + MLXSW_RES_ID_KVD_DOUBLE_SIZE, + MLXSW_RES_ID_KVD_LINEAR_SIZE, + + __MLXSW_RES_ID_MAX, +}; + +static u16 mlxsw_res_ids[] = { + [MLXSW_RES_ID_KVD_SIZE] = 0x1001, + [MLXSW_RES_ID_KVD_SINGLE_MIN_SIZE] = 0x1002, + [MLXSW_RES_ID_KVD_DOUBLE_MIN_SIZE] = 0x1003, + [MLXSW_RES_ID_MAX_TRAP_GROUPS] = 0x2201, + [MLXSW_RES_ID_COUNTER_POOL_SIZE] = 0x2410, + [MLXSW_RES_ID_MAX_SPAN] = 0x2420, + [MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES] = 0x2443, + [MLXSW_RES_ID_COUNTER_SIZE_ROUTER_BASIC] = 0x2449, + [MLXSW_RES_ID_MAX_SYSTEM_PORT] = 0x2502, + [MLXSW_RES_ID_MAX_LAG] = 0x2520, + [MLXSW_RES_ID_MAX_LAG_MEMBERS] = 0x2521, + [MLXSW_RES_ID_MAX_BUFFER_SIZE] = 0x2802, /* Bytes */ + [MLXSW_RES_ID_CELL_SIZE] = 0x2803, /* Bytes */ + [MLXSW_RES_ID_ACL_MAX_TCAM_REGIONS] = 0x2901, + [MLXSW_RES_ID_ACL_MAX_TCAM_RULES] = 0x2902, + [MLXSW_RES_ID_ACL_MAX_REGIONS] = 0x2903, + [MLXSW_RES_ID_ACL_MAX_GROUPS] = 0x2904, + [MLXSW_RES_ID_ACL_MAX_GROUP_SIZE] = 0x2905, + [MLXSW_RES_ID_ACL_FLEX_KEYS] = 0x2910, + [MLXSW_RES_ID_ACL_MAX_ACTION_PER_RULE] = 0x2911, + [MLXSW_RES_ID_ACL_ACTIONS_PER_SET] = 0x2912, + [MLXSW_RES_ID_MAX_CPU_POLICERS] = 0x2A13, + [MLXSW_RES_ID_MAX_VRS] = 0x2C01, + [MLXSW_RES_ID_MAX_RIFS] = 0x2C02, + [MLXSW_RES_ID_MC_ERIF_LIST_ENTRIES] = 0x2C10, + [MLXSW_RES_ID_MAX_LPM_TREES] = 0x2C30, +}; + +struct mlxsw_res { + bool valid[__MLXSW_RES_ID_MAX]; + u64 values[__MLXSW_RES_ID_MAX]; +}; + +static inline bool mlxsw_res_valid(struct mlxsw_res *res, + enum mlxsw_res_id res_id) +{ + return res->valid[res_id]; +} + +#define MLXSW_RES_VALID(res, short_res_id) \ + mlxsw_res_valid(res, MLXSW_RES_ID_##short_res_id) + +static inline u64 mlxsw_res_get(struct mlxsw_res *res, + enum mlxsw_res_id res_id) +{ + if (WARN_ON(!res->valid[res_id])) + return 0; + return res->values[res_id]; +} + +#define MLXSW_RES_GET(res, short_res_id) \ + mlxsw_res_get(res, MLXSW_RES_ID_##short_res_id) + +static inline void mlxsw_res_set(struct mlxsw_res *res, + enum mlxsw_res_id res_id, u64 value) +{ + res->valid[res_id] = true; + res->values[res_id] = value; +} + +#define MLXSW_RES_SET(res, short_res_id, value) \ + mlxsw_res_set(res, MLXSW_RES_ID_##short_res_id, value) + +static inline void mlxsw_res_parse(struct mlxsw_res *res, u16 id, u64 value) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mlxsw_res_ids); i++) { + if (mlxsw_res_ids[i] == id) { + mlxsw_res_set(res, i, value); + return; + } + } +} + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c new file mode 100644 index 0000000..adc6ab2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -0,0 +1,4742 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum.c + * Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015-2017 Jiri Pirko + * Copyright (c) 2015 Ido Schimmel + * Copyright (c) 2015 Elad Raz + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "spectrum.h" +#include "pci.h" +#include "core.h" +#include "reg.h" +#include "port.h" +#include "trap.h" +#include "txheader.h" +#include "spectrum_cnt.h" +#include "spectrum_dpipe.h" +#include "spectrum_acl_flex_actions.h" +#include "spectrum_span.h" +#include "../mlxfw/mlxfw.h" + +#define MLXSW_FWREV_MAJOR 13 +#define MLXSW_FWREV_MINOR 1620 +#define MLXSW_FWREV_SUBMINOR 192 +#define MLXSW_FWREV_MINOR_TO_BRANCH(minor) ((minor) / 100) + +#define MLXSW_SP_FW_FILENAME \ + "mellanox/mlxsw_spectrum-" __stringify(MLXSW_FWREV_MAJOR) \ + "." __stringify(MLXSW_FWREV_MINOR) \ + "." __stringify(MLXSW_FWREV_SUBMINOR) ".mfa2" + +static const char mlxsw_sp_driver_name[] = "mlxsw_spectrum"; +static const char mlxsw_sp_driver_version[] = "1.0"; + +/* tx_hdr_version + * Tx header version. + * Must be set to 1. + */ +MLXSW_ITEM32(tx, hdr, version, 0x00, 28, 4); + +/* tx_hdr_ctl + * Packet control type. + * 0 - Ethernet control (e.g. EMADs, LACP) + * 1 - Ethernet data + */ +MLXSW_ITEM32(tx, hdr, ctl, 0x00, 26, 2); + +/* tx_hdr_proto + * Packet protocol type. Must be set to 1 (Ethernet). 
+ */ +MLXSW_ITEM32(tx, hdr, proto, 0x00, 21, 3); + +/* tx_hdr_rx_is_router + * Packet is sent from the router. Valid for data packets only. + */ +MLXSW_ITEM32(tx, hdr, rx_is_router, 0x00, 19, 1); + +/* tx_hdr_fid_valid + * Indicates if the 'fid' field is valid and should be used for + * forwarding lookup. Valid for data packets only. + */ +MLXSW_ITEM32(tx, hdr, fid_valid, 0x00, 16, 1); + +/* tx_hdr_swid + * Switch partition ID. Must be set to 0. + */ +MLXSW_ITEM32(tx, hdr, swid, 0x00, 12, 3); + +/* tx_hdr_control_tclass + * Indicates if the packet should use the control TClass and not one + * of the data TClasses. + */ +MLXSW_ITEM32(tx, hdr, control_tclass, 0x00, 6, 1); + +/* tx_hdr_etclass + * Egress TClass to be used on the egress device on the egress port. + */ +MLXSW_ITEM32(tx, hdr, etclass, 0x00, 0, 4); + +/* tx_hdr_port_mid + * Destination local port for unicast packets. + * Destination multicast ID for multicast packets. + * + * Control packets are directed to a specific egress port, while data + * packets are transmitted through the CPU port (0) into the switch partition, + * where forwarding rules are applied. + */ +MLXSW_ITEM32(tx, hdr, port_mid, 0x04, 16, 16); + +/* tx_hdr_fid + * Forwarding ID used for L2 forwarding lookup. Valid only if 'fid_valid' is + * set, otherwise calculated based on the packet's VID using VID to FID mapping. + * Valid for data packets only. + */ +MLXSW_ITEM32(tx, hdr, fid, 0x08, 0, 16); + +/* tx_hdr_type + * 0 - Data packets + * 6 - Control packets + */ +MLXSW_ITEM32(tx, hdr, type, 0x0C, 0, 4); + +struct mlxsw_sp_mlxfw_dev { + struct mlxfw_dev mlxfw_dev; + struct mlxsw_sp *mlxsw_sp; +}; + +static int mlxsw_sp_component_query(struct mlxfw_dev *mlxfw_dev, + u16 component_index, u32 *p_max_size, + u8 *p_align_bits, u16 *p_max_write_size) +{ + struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev = + container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp; + char mcqi_pl[MLXSW_REG_MCQI_LEN]; + int err; + + mlxsw_reg_mcqi_pack(mcqi_pl, component_index); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mcqi), mcqi_pl); + if (err) + return err; + mlxsw_reg_mcqi_unpack(mcqi_pl, p_max_size, p_align_bits, + p_max_write_size); + + *p_align_bits = max_t(u8, *p_align_bits, 2); + *p_max_write_size = min_t(u16, *p_max_write_size, + MLXSW_REG_MCDA_MAX_DATA_LEN); + return 0; +} + +static int mlxsw_sp_fsm_lock(struct mlxfw_dev *mlxfw_dev, u32 *fwhandle) +{ + struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev = + container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp; + char mcc_pl[MLXSW_REG_MCC_LEN]; + u8 control_state; + int err; + + mlxsw_reg_mcc_pack(mcc_pl, 0, 0, 0, 0); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl); + if (err) + return err; + + mlxsw_reg_mcc_unpack(mcc_pl, fwhandle, NULL, &control_state); + if (control_state != MLXFW_FSM_STATE_IDLE) + return -EBUSY; + + mlxsw_reg_mcc_pack(mcc_pl, + MLXSW_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE, + 0, *fwhandle, 0); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl); +} + +static int mlxsw_sp_fsm_component_update(struct mlxfw_dev *mlxfw_dev, + u32 fwhandle, u16 component_index, + u32 component_size) +{ + struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev = + container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp; + char mcc_pl[MLXSW_REG_MCC_LEN]; + + mlxsw_reg_mcc_pack(mcc_pl, 
MLXSW_REG_MCC_INSTRUCTION_UPDATE_COMPONENT, + component_index, fwhandle, component_size); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl); +} + +static int mlxsw_sp_fsm_block_download(struct mlxfw_dev *mlxfw_dev, + u32 fwhandle, u8 *data, u16 size, + u32 offset) +{ + struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev = + container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp; + char mcda_pl[MLXSW_REG_MCDA_LEN]; + + mlxsw_reg_mcda_pack(mcda_pl, fwhandle, offset, size, data); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcda), mcda_pl); +} + +static int mlxsw_sp_fsm_component_verify(struct mlxfw_dev *mlxfw_dev, + u32 fwhandle, u16 component_index) +{ + struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev = + container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp; + char mcc_pl[MLXSW_REG_MCC_LEN]; + + mlxsw_reg_mcc_pack(mcc_pl, MLXSW_REG_MCC_INSTRUCTION_VERIFY_COMPONENT, + component_index, fwhandle, 0); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl); +} + +static int mlxsw_sp_fsm_activate(struct mlxfw_dev *mlxfw_dev, u32 fwhandle) +{ + struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev = + container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp; + char mcc_pl[MLXSW_REG_MCC_LEN]; + + mlxsw_reg_mcc_pack(mcc_pl, MLXSW_REG_MCC_INSTRUCTION_ACTIVATE, 0, + fwhandle, 0); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl); +} + +static int mlxsw_sp_fsm_query_state(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + enum mlxfw_fsm_state *fsm_state, + enum mlxfw_fsm_state_err *fsm_state_err) +{ + struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev = + container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp; + char mcc_pl[MLXSW_REG_MCC_LEN]; + u8 control_state; + u8 error_code; + int err; + + mlxsw_reg_mcc_pack(mcc_pl, 0, 0, fwhandle, 0); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl); + if (err) + return err; + + mlxsw_reg_mcc_unpack(mcc_pl, NULL, &error_code, &control_state); + *fsm_state = control_state; + *fsm_state_err = min_t(enum mlxfw_fsm_state_err, error_code, + MLXFW_FSM_STATE_ERR_MAX); + return 0; +} + +static void mlxsw_sp_fsm_cancel(struct mlxfw_dev *mlxfw_dev, u32 fwhandle) +{ + struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev = + container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp; + char mcc_pl[MLXSW_REG_MCC_LEN]; + + mlxsw_reg_mcc_pack(mcc_pl, MLXSW_REG_MCC_INSTRUCTION_CANCEL, 0, + fwhandle, 0); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl); +} + +static void mlxsw_sp_fsm_release(struct mlxfw_dev *mlxfw_dev, u32 fwhandle) +{ + struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev = + container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp; + char mcc_pl[MLXSW_REG_MCC_LEN]; + + mlxsw_reg_mcc_pack(mcc_pl, + MLXSW_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE, 0, + fwhandle, 0); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl); +} + +static const struct mlxfw_dev_ops mlxsw_sp_mlxfw_dev_ops = { + .component_query = mlxsw_sp_component_query, + .fsm_lock = mlxsw_sp_fsm_lock, + .fsm_component_update = mlxsw_sp_fsm_component_update, + .fsm_block_download = mlxsw_sp_fsm_block_download, + .fsm_component_verify = mlxsw_sp_fsm_component_verify, + 
.fsm_activate = mlxsw_sp_fsm_activate, + .fsm_query_state = mlxsw_sp_fsm_query_state, + .fsm_cancel = mlxsw_sp_fsm_cancel, + .fsm_release = mlxsw_sp_fsm_release +}; + +static int mlxsw_sp_firmware_flash(struct mlxsw_sp *mlxsw_sp, + const struct firmware *firmware) +{ + struct mlxsw_sp_mlxfw_dev mlxsw_sp_mlxfw_dev = { + .mlxfw_dev = { + .ops = &mlxsw_sp_mlxfw_dev_ops, + .psid = mlxsw_sp->bus_info->psid, + .psid_size = strlen(mlxsw_sp->bus_info->psid), + }, + .mlxsw_sp = mlxsw_sp + }; + + return mlxfw_firmware_flash(&mlxsw_sp_mlxfw_dev.mlxfw_dev, firmware); +} + +static int mlxsw_sp_fw_rev_validate(struct mlxsw_sp *mlxsw_sp) +{ + const struct mlxsw_fw_rev *rev = &mlxsw_sp->bus_info->fw_rev; + const struct firmware *firmware; + int err; + + /* Validate driver & FW are compatible */ + if (rev->major != MLXSW_FWREV_MAJOR) { + WARN(1, "Mismatch in major FW version [%d:%d] is never expected; Please contact support\n", + rev->major, MLXSW_FWREV_MAJOR); + return -EINVAL; + } + if (MLXSW_FWREV_MINOR_TO_BRANCH(rev->minor) == + MLXSW_FWREV_MINOR_TO_BRANCH(MLXSW_FWREV_MINOR)) + return 0; + + dev_info(mlxsw_sp->bus_info->dev, "The firmware version %d.%d.%d is incompatible with the driver\n", + rev->major, rev->minor, rev->subminor); + dev_info(mlxsw_sp->bus_info->dev, "Flashing firmware using file %s\n", + MLXSW_SP_FW_FILENAME); + + err = request_firmware_direct(&firmware, MLXSW_SP_FW_FILENAME, + mlxsw_sp->bus_info->dev); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Could not request firmware file %s\n", + MLXSW_SP_FW_FILENAME); + return err; + } + + err = mlxsw_sp_firmware_flash(mlxsw_sp, firmware); + release_firmware(firmware); + return err; +} + +int mlxsw_sp_flow_counter_get(struct mlxsw_sp *mlxsw_sp, + unsigned int counter_index, u64 *packets, + u64 *bytes) +{ + char mgpc_pl[MLXSW_REG_MGPC_LEN]; + int err; + + mlxsw_reg_mgpc_pack(mgpc_pl, counter_index, MLXSW_REG_MGPC_OPCODE_NOP, + MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mgpc), mgpc_pl); + if (err) + return err; + if (packets) + *packets = mlxsw_reg_mgpc_packet_counter_get(mgpc_pl); + if (bytes) + *bytes = mlxsw_reg_mgpc_byte_counter_get(mgpc_pl); + return 0; +} + +static int mlxsw_sp_flow_counter_clear(struct mlxsw_sp *mlxsw_sp, + unsigned int counter_index) +{ + char mgpc_pl[MLXSW_REG_MGPC_LEN]; + + mlxsw_reg_mgpc_pack(mgpc_pl, counter_index, MLXSW_REG_MGPC_OPCODE_CLEAR, + MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mgpc), mgpc_pl); +} + +int mlxsw_sp_flow_counter_alloc(struct mlxsw_sp *mlxsw_sp, + unsigned int *p_counter_index) +{ + int err; + + err = mlxsw_sp_counter_alloc(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_FLOW, + p_counter_index); + if (err) + return err; + err = mlxsw_sp_flow_counter_clear(mlxsw_sp, *p_counter_index); + if (err) + goto err_counter_clear; + return 0; + +err_counter_clear: + mlxsw_sp_counter_free(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_FLOW, + *p_counter_index); + return err; +} + +void mlxsw_sp_flow_counter_free(struct mlxsw_sp *mlxsw_sp, + unsigned int counter_index) +{ + mlxsw_sp_counter_free(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_FLOW, + counter_index); +} + +static void mlxsw_sp_txhdr_construct(struct sk_buff *skb, + const struct mlxsw_tx_info *tx_info) +{ + char *txhdr = skb_push(skb, MLXSW_TXHDR_LEN); + + memset(txhdr, 0, MLXSW_TXHDR_LEN); + + mlxsw_tx_hdr_version_set(txhdr, MLXSW_TXHDR_VERSION_1); + mlxsw_tx_hdr_ctl_set(txhdr, MLXSW_TXHDR_ETH_CTL); + mlxsw_tx_hdr_proto_set(txhdr, MLXSW_TXHDR_PROTO_ETH); + 
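+ /* Everything sent through this header is an Ethernet control packet:
+ * swid 0, the control TClass and a direct egress port in port_mid, so it
+ * bypasses the switch forwarding pipeline (see the tx_hdr_* field
+ * descriptions above).
+ */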
mlxsw_tx_hdr_swid_set(txhdr, 0); + mlxsw_tx_hdr_control_tclass_set(txhdr, 1); + mlxsw_tx_hdr_port_mid_set(txhdr, tx_info->local_port); + mlxsw_tx_hdr_type_set(txhdr, MLXSW_TXHDR_TYPE_CONTROL); +} + +int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid, + u8 state) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + enum mlxsw_reg_spms_state spms_state; + char *spms_pl; + int err; + + switch (state) { + case BR_STATE_FORWARDING: + spms_state = MLXSW_REG_SPMS_STATE_FORWARDING; + break; + case BR_STATE_LEARNING: + spms_state = MLXSW_REG_SPMS_STATE_LEARNING; + break; + case BR_STATE_LISTENING: /* fall-through */ + case BR_STATE_DISABLED: /* fall-through */ + case BR_STATE_BLOCKING: + spms_state = MLXSW_REG_SPMS_STATE_DISCARDING; + break; + default: + BUG(); + } + + spms_pl = kmalloc(MLXSW_REG_SPMS_LEN, GFP_KERNEL); + if (!spms_pl) + return -ENOMEM; + mlxsw_reg_spms_pack(spms_pl, mlxsw_sp_port->local_port); + mlxsw_reg_spms_vid_pack(spms_pl, vid, spms_state); + + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spms), spms_pl); + kfree(spms_pl); + return err; +} + +static int mlxsw_sp_base_mac_get(struct mlxsw_sp *mlxsw_sp) +{ + char spad_pl[MLXSW_REG_SPAD_LEN] = {0}; + int err; + + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(spad), spad_pl); + if (err) + return err; + mlxsw_reg_spad_base_mac_memcpy_from(spad_pl, mlxsw_sp->base_mac); + return 0; +} + +static int mlxsw_sp_port_sample_set(struct mlxsw_sp_port *mlxsw_sp_port, + bool enable, u32 rate) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char mpsc_pl[MLXSW_REG_MPSC_LEN]; + + mlxsw_reg_mpsc_pack(mpsc_pl, mlxsw_sp_port->local_port, enable, rate); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpsc), mpsc_pl); +} + +static int mlxsw_sp_port_admin_status_set(struct mlxsw_sp_port *mlxsw_sp_port, + bool is_up) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char paos_pl[MLXSW_REG_PAOS_LEN]; + + mlxsw_reg_paos_pack(paos_pl, mlxsw_sp_port->local_port, + is_up ? 
MLXSW_PORT_ADMIN_STATUS_UP : + MLXSW_PORT_ADMIN_STATUS_DOWN); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(paos), paos_pl); +} + +static int mlxsw_sp_port_dev_addr_set(struct mlxsw_sp_port *mlxsw_sp_port, + unsigned char *addr) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char ppad_pl[MLXSW_REG_PPAD_LEN]; + + mlxsw_reg_ppad_pack(ppad_pl, true, mlxsw_sp_port->local_port); + mlxsw_reg_ppad_mac_memcpy_to(ppad_pl, addr); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppad), ppad_pl); +} + +static int mlxsw_sp_port_dev_addr_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + unsigned char *addr = mlxsw_sp_port->dev->dev_addr; + + ether_addr_copy(addr, mlxsw_sp->base_mac); + addr[ETH_ALEN - 1] += mlxsw_sp_port->local_port; + return mlxsw_sp_port_dev_addr_set(mlxsw_sp_port, addr); +} + +static int mlxsw_sp_port_mtu_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 mtu) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char pmtu_pl[MLXSW_REG_PMTU_LEN]; + int max_mtu; + int err; + + mtu += MLXSW_TXHDR_LEN + ETH_HLEN; + mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sp_port->local_port, 0); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(pmtu), pmtu_pl); + if (err) + return err; + max_mtu = mlxsw_reg_pmtu_max_mtu_get(pmtu_pl); + + if (mtu > max_mtu) + return -EINVAL; + + mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sp_port->local_port, mtu); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pmtu), pmtu_pl); +} + +static int mlxsw_sp_port_swid_set(struct mlxsw_sp_port *mlxsw_sp_port, u8 swid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char pspa_pl[MLXSW_REG_PSPA_LEN]; + + mlxsw_reg_pspa_pack(pspa_pl, swid, mlxsw_sp_port->local_port); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pspa), pspa_pl); +} + +int mlxsw_sp_port_vp_mode_set(struct mlxsw_sp_port *mlxsw_sp_port, bool enable) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char svpe_pl[MLXSW_REG_SVPE_LEN]; + + mlxsw_reg_svpe_pack(svpe_pl, mlxsw_sp_port->local_port, enable); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(svpe), svpe_pl); +} + +int mlxsw_sp_port_vid_learning_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid, + bool learn_enable) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char *spvmlr_pl; + int err; + + spvmlr_pl = kmalloc(MLXSW_REG_SPVMLR_LEN, GFP_KERNEL); + if (!spvmlr_pl) + return -ENOMEM; + mlxsw_reg_spvmlr_pack(spvmlr_pl, mlxsw_sp_port->local_port, vid, vid, + learn_enable); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spvmlr), spvmlr_pl); + kfree(spvmlr_pl); + return err; +} + +static int __mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port, + u16 vid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char spvid_pl[MLXSW_REG_SPVID_LEN]; + + mlxsw_reg_spvid_pack(spvid_pl, mlxsw_sp_port->local_port, vid); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spvid), spvid_pl); +} + +static int mlxsw_sp_port_allow_untagged_set(struct mlxsw_sp_port *mlxsw_sp_port, + bool allow) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char spaft_pl[MLXSW_REG_SPAFT_LEN]; + + mlxsw_reg_spaft_pack(spaft_pl, mlxsw_sp_port->local_port, allow); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spaft), spaft_pl); +} + +int mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid) +{ + int err; + + if (!vid) { + err = mlxsw_sp_port_allow_untagged_set(mlxsw_sp_port, false); + if (err) + return err; + } else { + err = __mlxsw_sp_port_pvid_set(mlxsw_sp_port, vid); + if (err) + return err; + err 
= mlxsw_sp_port_allow_untagged_set(mlxsw_sp_port, true); + if (err) + goto err_port_allow_untagged_set; + } + + mlxsw_sp_port->pvid = vid; + return 0; + +err_port_allow_untagged_set: + __mlxsw_sp_port_pvid_set(mlxsw_sp_port, mlxsw_sp_port->pvid); + return err; +} + +static int +mlxsw_sp_port_system_port_mapping_set(struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char sspr_pl[MLXSW_REG_SSPR_LEN]; + + mlxsw_reg_sspr_pack(sspr_pl, mlxsw_sp_port->local_port); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sspr), sspr_pl); +} + +static int mlxsw_sp_port_module_info_get(struct mlxsw_sp *mlxsw_sp, + u8 local_port, u8 *p_module, + u8 *p_width, u8 *p_lane) +{ + char pmlp_pl[MLXSW_REG_PMLP_LEN]; + int err; + + mlxsw_reg_pmlp_pack(pmlp_pl, local_port); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(pmlp), pmlp_pl); + if (err) + return err; + *p_module = mlxsw_reg_pmlp_module_get(pmlp_pl, 0); + *p_width = mlxsw_reg_pmlp_width_get(pmlp_pl); + *p_lane = mlxsw_reg_pmlp_tx_lane_get(pmlp_pl, 0); + return 0; +} + +static int mlxsw_sp_port_module_map(struct mlxsw_sp_port *mlxsw_sp_port, + u8 module, u8 width, u8 lane) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char pmlp_pl[MLXSW_REG_PMLP_LEN]; + int i; + + mlxsw_reg_pmlp_pack(pmlp_pl, mlxsw_sp_port->local_port); + mlxsw_reg_pmlp_width_set(pmlp_pl, width); + for (i = 0; i < width; i++) { + mlxsw_reg_pmlp_module_set(pmlp_pl, i, module); + mlxsw_reg_pmlp_tx_lane_set(pmlp_pl, i, lane + i); /* Rx & Tx */ + } + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pmlp), pmlp_pl); +} + +static int mlxsw_sp_port_module_unmap(struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char pmlp_pl[MLXSW_REG_PMLP_LEN]; + + mlxsw_reg_pmlp_pack(pmlp_pl, mlxsw_sp_port->local_port); + mlxsw_reg_pmlp_width_set(pmlp_pl, 0); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pmlp), pmlp_pl); +} + +static int mlxsw_sp_port_open(struct net_device *dev) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + int err; + + err = mlxsw_sp_port_admin_status_set(mlxsw_sp_port, true); + if (err) + return err; + netif_start_queue(dev); + return 0; +} + +static int mlxsw_sp_port_stop(struct net_device *dev) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + + netif_stop_queue(dev); + return mlxsw_sp_port_admin_status_set(mlxsw_sp_port, false); +} + +static netdev_tx_t mlxsw_sp_port_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_port_pcpu_stats *pcpu_stats; + const struct mlxsw_tx_info tx_info = { + .local_port = mlxsw_sp_port->local_port, + .is_emad = false, + }; + u64 len; + int err; + + if (mlxsw_core_skb_transmit_busy(mlxsw_sp->core, &tx_info)) + return NETDEV_TX_BUSY; + + if (unlikely(skb_headroom(skb) < MLXSW_TXHDR_LEN)) { + struct sk_buff *skb_orig = skb; + + skb = skb_realloc_headroom(skb, MLXSW_TXHDR_LEN); + if (!skb) { + this_cpu_inc(mlxsw_sp_port->pcpu_stats->tx_dropped); + dev_kfree_skb_any(skb_orig); + return NETDEV_TX_OK; + } + dev_consume_skb_any(skb_orig); + } + + if (eth_skb_pad(skb)) { + this_cpu_inc(mlxsw_sp_port->pcpu_stats->tx_dropped); + return NETDEV_TX_OK; + } + + mlxsw_sp_txhdr_construct(skb, &tx_info); + /* TX header is consumed by HW on the way so we shouldn't count its + * bytes as being sent. + */ + len = skb->len - MLXSW_TXHDR_LEN; + + /* Due to a race we might fail here because of a full queue. 
In that + * unlikely case we simply drop the packet. + */ + err = mlxsw_core_skb_transmit(mlxsw_sp->core, skb, &tx_info); + + if (!err) { + pcpu_stats = this_cpu_ptr(mlxsw_sp_port->pcpu_stats); + u64_stats_update_begin(&pcpu_stats->syncp); + pcpu_stats->tx_packets++; + pcpu_stats->tx_bytes += len; + u64_stats_update_end(&pcpu_stats->syncp); + } else { + this_cpu_inc(mlxsw_sp_port->pcpu_stats->tx_dropped); + dev_kfree_skb_any(skb); + } + return NETDEV_TX_OK; +} + +static void mlxsw_sp_set_rx_mode(struct net_device *dev) +{ +} + +static int mlxsw_sp_port_set_mac_address(struct net_device *dev, void *p) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct sockaddr *addr = p; + int err; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + err = mlxsw_sp_port_dev_addr_set(mlxsw_sp_port, addr->sa_data); + if (err) + return err; + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + return 0; +} + +static u16 mlxsw_sp_pg_buf_threshold_get(const struct mlxsw_sp *mlxsw_sp, + int mtu) +{ + return 2 * mlxsw_sp_bytes_cells(mlxsw_sp, mtu); +} + +#define MLXSW_SP_CELL_FACTOR 2 /* 2 * cell_size / (IPG + cell_size + 1) */ + +static u16 mlxsw_sp_pfc_delay_get(const struct mlxsw_sp *mlxsw_sp, int mtu, + u16 delay) +{ + delay = mlxsw_sp_bytes_cells(mlxsw_sp, DIV_ROUND_UP(delay, + BITS_PER_BYTE)); + return MLXSW_SP_CELL_FACTOR * delay + mlxsw_sp_bytes_cells(mlxsw_sp, + mtu); +} + +/* Maximum delay buffer needed in case of PAUSE frames, in bytes. + * Assumes 100m cable and maximum MTU. + */ +#define MLXSW_SP_PAUSE_DELAY 58752 + +static u16 mlxsw_sp_pg_buf_delay_get(const struct mlxsw_sp *mlxsw_sp, int mtu, + u16 delay, bool pfc, bool pause) +{ + if (pfc) + return mlxsw_sp_pfc_delay_get(mlxsw_sp, mtu, delay); + else if (pause) + return mlxsw_sp_bytes_cells(mlxsw_sp, MLXSW_SP_PAUSE_DELAY); + else + return 0; +} + +static void mlxsw_sp_pg_buf_pack(char *pbmc_pl, int index, u16 size, u16 thres, + bool lossy) +{ + if (lossy) + mlxsw_reg_pbmc_lossy_buffer_pack(pbmc_pl, index, size); + else + mlxsw_reg_pbmc_lossless_buffer_pack(pbmc_pl, index, size, + thres); +} + +int __mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, int mtu, + u8 *prio_tc, bool pause_en, + struct ieee_pfc *my_pfc) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 pfc_en = !!my_pfc ? my_pfc->pfc_en : 0; + u16 delay = !!my_pfc ? my_pfc->delay : 0; + char pbmc_pl[MLXSW_REG_PBMC_LEN]; + int i, j, err; + + mlxsw_reg_pbmc_pack(pbmc_pl, mlxsw_sp_port->local_port, 0, 0); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl); + if (err) + return err; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + bool configure = false; + bool pfc = false; + bool lossy; + u16 thres; + + for (j = 0; j < IEEE_8021QAZ_MAX_TCS; j++) { + if (prio_tc[j] == i) { + pfc = pfc_en & BIT(j); + configure = true; + break; + } + } + + if (!configure) + continue; + + lossy = !(pfc || pause_en); + thres = mlxsw_sp_pg_buf_threshold_get(mlxsw_sp, mtu); + delay = mlxsw_sp_pg_buf_delay_get(mlxsw_sp, mtu, delay, pfc, + pause_en); + mlxsw_sp_pg_buf_pack(pbmc_pl, i, thres + delay, thres, lossy); + } + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl); +} + +static int mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, + int mtu, bool pause_en) +{ + u8 def_prio_tc[IEEE_8021QAZ_MAX_TCS] = {0}; + bool dcb_en = !!mlxsw_sp_port->dcb.ets; + struct ieee_pfc *my_pfc; + u8 *prio_tc; + + prio_tc = dcb_en ? mlxsw_sp_port->dcb.ets->prio_tc : def_prio_tc; + my_pfc = dcb_en ? 
mlxsw_sp_port->dcb.pfc : NULL; + + return __mlxsw_sp_port_headroom_set(mlxsw_sp_port, mtu, prio_tc, + pause_en, my_pfc); +} + +static int mlxsw_sp_port_change_mtu(struct net_device *dev, int mtu) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + bool pause_en = mlxsw_sp_port_is_pause_en(mlxsw_sp_port); + int err; + + err = mlxsw_sp_port_headroom_set(mlxsw_sp_port, mtu, pause_en); + if (err) + return err; + err = mlxsw_sp_span_port_mtu_update(mlxsw_sp_port, mtu); + if (err) + goto err_span_port_mtu_update; + err = mlxsw_sp_port_mtu_set(mlxsw_sp_port, mtu); + if (err) + goto err_port_mtu_set; + dev->mtu = mtu; + return 0; + +err_port_mtu_set: + mlxsw_sp_span_port_mtu_update(mlxsw_sp_port, dev->mtu); +err_span_port_mtu_update: + mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu, pause_en); + return err; +} + +static int +mlxsw_sp_port_get_sw_stats64(const struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp_port_pcpu_stats *p; + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + u32 tx_dropped = 0; + unsigned int start; + int i; + + for_each_possible_cpu(i) { + p = per_cpu_ptr(mlxsw_sp_port->pcpu_stats, i); + do { + start = u64_stats_fetch_begin_irq(&p->syncp); + rx_packets = p->rx_packets; + rx_bytes = p->rx_bytes; + tx_packets = p->tx_packets; + tx_bytes = p->tx_bytes; + } while (u64_stats_fetch_retry_irq(&p->syncp, start)); + + stats->rx_packets += rx_packets; + stats->rx_bytes += rx_bytes; + stats->tx_packets += tx_packets; + stats->tx_bytes += tx_bytes; + /* tx_dropped is u32, updated without syncp protection. */ + tx_dropped += p->tx_dropped; + } + stats->tx_dropped = tx_dropped; + return 0; +} + +static bool mlxsw_sp_port_has_offload_stats(const struct net_device *dev, int attr_id) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return true; + } + + return false; +} + +static int mlxsw_sp_port_get_offload_stats(int attr_id, const struct net_device *dev, + void *sp) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return mlxsw_sp_port_get_sw_stats64(dev, sp); + } + + return -EINVAL; +} + +static int mlxsw_sp_port_get_stats_raw(struct net_device *dev, int grp, + int prio, char *ppcnt_pl) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + + mlxsw_reg_ppcnt_pack(ppcnt_pl, mlxsw_sp_port->local_port, grp, prio); + return mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ppcnt), ppcnt_pl); +} + +static int mlxsw_sp_port_get_hw_stats(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + char ppcnt_pl[MLXSW_REG_PPCNT_LEN]; + int err; + + err = mlxsw_sp_port_get_stats_raw(dev, MLXSW_REG_PPCNT_IEEE_8023_CNT, + 0, ppcnt_pl); + if (err) + goto out; + + stats->tx_packets = + mlxsw_reg_ppcnt_a_frames_transmitted_ok_get(ppcnt_pl); + stats->rx_packets = + mlxsw_reg_ppcnt_a_frames_received_ok_get(ppcnt_pl); + stats->tx_bytes = + mlxsw_reg_ppcnt_a_octets_transmitted_ok_get(ppcnt_pl); + stats->rx_bytes = + mlxsw_reg_ppcnt_a_octets_received_ok_get(ppcnt_pl); + stats->multicast = + mlxsw_reg_ppcnt_a_multicast_frames_received_ok_get(ppcnt_pl); + + stats->rx_crc_errors = + mlxsw_reg_ppcnt_a_frame_check_sequence_errors_get(ppcnt_pl); + stats->rx_frame_errors = + mlxsw_reg_ppcnt_a_alignment_errors_get(ppcnt_pl); + + stats->rx_length_errors = ( + mlxsw_reg_ppcnt_a_in_range_length_errors_get(ppcnt_pl) + + mlxsw_reg_ppcnt_a_out_of_range_length_field_get(ppcnt_pl) + + mlxsw_reg_ppcnt_a_frame_too_long_errors_get(ppcnt_pl)); + 
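+ /* Aggregate the CRC, alignment and length error counters gathered above
+ * into rx_errors.
+ */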
+ stats->rx_errors = (stats->rx_crc_errors + + stats->rx_frame_errors + stats->rx_length_errors); + +out: + return err; +} + +static void +mlxsw_sp_port_get_hw_xstats(struct net_device *dev, + struct mlxsw_sp_port_xstats *xstats) +{ + char ppcnt_pl[MLXSW_REG_PPCNT_LEN]; + int err, i; + + err = mlxsw_sp_port_get_stats_raw(dev, MLXSW_REG_PPCNT_EXT_CNT, 0, + ppcnt_pl); + if (!err) + xstats->ecn = mlxsw_reg_ppcnt_ecn_marked_get(ppcnt_pl); + + for (i = 0; i < TC_MAX_QUEUE; i++) { + err = mlxsw_sp_port_get_stats_raw(dev, + MLXSW_REG_PPCNT_TC_CONG_TC, + i, ppcnt_pl); + if (!err) + xstats->wred_drop[i] = + mlxsw_reg_ppcnt_wred_discard_get(ppcnt_pl); + + err = mlxsw_sp_port_get_stats_raw(dev, MLXSW_REG_PPCNT_TC_CNT, + i, ppcnt_pl); + if (err) + continue; + + xstats->backlog[i] = + mlxsw_reg_ppcnt_tc_transmit_queue_get(ppcnt_pl); + xstats->tail_drop[i] = + mlxsw_reg_ppcnt_tc_no_buffer_discard_uc_get(ppcnt_pl); + } + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + err = mlxsw_sp_port_get_stats_raw(dev, MLXSW_REG_PPCNT_PRIO_CNT, + i, ppcnt_pl); + if (err) + continue; + + xstats->tx_packets[i] = mlxsw_reg_ppcnt_tx_frames_get(ppcnt_pl); + xstats->tx_bytes[i] = mlxsw_reg_ppcnt_tx_octets_get(ppcnt_pl); + } +} + +static void update_stats_cache(struct work_struct *work) +{ + struct mlxsw_sp_port *mlxsw_sp_port = + container_of(work, struct mlxsw_sp_port, + periodic_hw_stats.update_dw.work); + + if (!netif_carrier_ok(mlxsw_sp_port->dev)) + goto out; + + mlxsw_sp_port_get_hw_stats(mlxsw_sp_port->dev, + &mlxsw_sp_port->periodic_hw_stats.stats); + mlxsw_sp_port_get_hw_xstats(mlxsw_sp_port->dev, + &mlxsw_sp_port->periodic_hw_stats.xstats); + +out: + mlxsw_core_schedule_dw(&mlxsw_sp_port->periodic_hw_stats.update_dw, + MLXSW_HW_STATS_UPDATE_TIME); +} + +/* Return the stats from a cache that is updated periodically, + * as this function might get called in an atomic context. 
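+ * The cache is refreshed every MLXSW_HW_STATS_UPDATE_TIME by update_stats_cache() above.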
+ */ +static void +mlxsw_sp_port_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + + memcpy(stats, &mlxsw_sp_port->periodic_hw_stats.stats, sizeof(*stats)); +} + +static int __mlxsw_sp_port_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, + u16 vid_begin, u16 vid_end, + bool is_member, bool untagged) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char *spvm_pl; + int err; + + spvm_pl = kmalloc(MLXSW_REG_SPVM_LEN, GFP_KERNEL); + if (!spvm_pl) + return -ENOMEM; + + mlxsw_reg_spvm_pack(spvm_pl, mlxsw_sp_port->local_port, vid_begin, + vid_end, is_member, untagged); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spvm), spvm_pl); + kfree(spvm_pl); + return err; +} + +int mlxsw_sp_port_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid_begin, + u16 vid_end, bool is_member, bool untagged) +{ + u16 vid, vid_e; + int err; + + for (vid = vid_begin; vid <= vid_end; + vid += MLXSW_REG_SPVM_REC_MAX_COUNT) { + vid_e = min((u16) (vid + MLXSW_REG_SPVM_REC_MAX_COUNT - 1), + vid_end); + + err = __mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid_e, + is_member, untagged); + if (err) + return err; + } + + return 0; +} + +static void mlxsw_sp_port_vlan_flush(struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan, *tmp; + + list_for_each_entry_safe(mlxsw_sp_port_vlan, tmp, + &mlxsw_sp_port->vlans_list, list) + mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan); +} + +static struct mlxsw_sp_port_vlan * +mlxsw_sp_port_vlan_create(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + bool untagged = vid == 1; + int err; + + err = mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, true, untagged); + if (err) + return ERR_PTR(err); + + mlxsw_sp_port_vlan = kzalloc(sizeof(*mlxsw_sp_port_vlan), GFP_KERNEL); + if (!mlxsw_sp_port_vlan) { + err = -ENOMEM; + goto err_port_vlan_alloc; + } + + mlxsw_sp_port_vlan->mlxsw_sp_port = mlxsw_sp_port; + mlxsw_sp_port_vlan->ref_count = 1; + mlxsw_sp_port_vlan->vid = vid; + list_add(&mlxsw_sp_port_vlan->list, &mlxsw_sp_port->vlans_list); + + return mlxsw_sp_port_vlan; + +err_port_vlan_alloc: + mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, false, false); + return ERR_PTR(err); +} + +static void +mlxsw_sp_port_vlan_destroy(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan) +{ + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port; + u16 vid = mlxsw_sp_port_vlan->vid; + + list_del(&mlxsw_sp_port_vlan->list); + kfree(mlxsw_sp_port_vlan); + mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, false, false); +} + +struct mlxsw_sp_port_vlan * +mlxsw_sp_port_vlan_get(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid); + if (mlxsw_sp_port_vlan) { + mlxsw_sp_port_vlan->ref_count++; + return mlxsw_sp_port_vlan; + } + + return mlxsw_sp_port_vlan_create(mlxsw_sp_port, vid); +} + +void mlxsw_sp_port_vlan_put(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan) +{ + struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid; + + if (--mlxsw_sp_port_vlan->ref_count != 0) + return; + + if (mlxsw_sp_port_vlan->bridge_port) + mlxsw_sp_port_vlan_bridge_leave(mlxsw_sp_port_vlan); + else if (fid) + mlxsw_sp_port_vlan_router_leave(mlxsw_sp_port_vlan); + + mlxsw_sp_port_vlan_destroy(mlxsw_sp_port_vlan); +} + +static int mlxsw_sp_port_add_vid(struct net_device *dev, + __be16 __always_unused proto, u16 vid) +{ + struct 
mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + + /* VLAN 0 is added to HW filter when device goes up, but it is + * reserved in our case, so simply return. + */ + if (!vid) + return 0; + + return PTR_ERR_OR_ZERO(mlxsw_sp_port_vlan_get(mlxsw_sp_port, vid)); +} + +static int mlxsw_sp_port_kill_vid(struct net_device *dev, + __be16 __always_unused proto, u16 vid) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + /* VLAN 0 is removed from HW filter when device goes down, but + * it is reserved in our case, so simply return. + */ + if (!vid) + return 0; + + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid); + if (!mlxsw_sp_port_vlan) + return 0; + mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan); + + return 0; +} + +static int mlxsw_sp_port_get_phys_port_name(struct net_device *dev, char *name, + size_t len) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + u8 module = mlxsw_sp_port->mapping.module; + u8 width = mlxsw_sp_port->mapping.width; + u8 lane = mlxsw_sp_port->mapping.lane; + int err; + + if (!mlxsw_sp_port->split) + err = snprintf(name, len, "p%d", module + 1); + else + err = snprintf(name, len, "p%ds%d", module + 1, + lane / width); + + if (err >= len) + return -EINVAL; + + return 0; +} + +static struct mlxsw_sp_port_mall_tc_entry * +mlxsw_sp_port_mall_tc_entry_find(struct mlxsw_sp_port *port, + unsigned long cookie) { + struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; + + list_for_each_entry(mall_tc_entry, &port->mall_tc_list, list) + if (mall_tc_entry->cookie == cookie) + return mall_tc_entry; + + return NULL; +} + +static int +mlxsw_sp_port_add_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_port_mall_mirror_tc_entry *mirror, + const struct tc_action *a, + bool ingress) +{ + enum mlxsw_sp_span_type span_type; + struct net_device *to_dev; + + to_dev = tcf_mirred_dev(a); + if (!to_dev) { + netdev_err(mlxsw_sp_port->dev, "Could not find requested device\n"); + return -EINVAL; + } + + mirror->ingress = ingress; + span_type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; + return mlxsw_sp_span_mirror_add(mlxsw_sp_port, to_dev, span_type, + true, &mirror->span_id); +} + +static void +mlxsw_sp_port_del_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_port_mall_mirror_tc_entry *mirror) +{ + enum mlxsw_sp_span_type span_type; + + span_type = mirror->ingress ? 
+ MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; + mlxsw_sp_span_mirror_del(mlxsw_sp_port, mirror->span_id, + span_type, true); +} + +static int +mlxsw_sp_port_add_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_cls_matchall_offload *cls, + const struct tc_action *a, + bool ingress) +{ + int err; + + if (!mlxsw_sp_port->sample) + return -EOPNOTSUPP; + if (rtnl_dereference(mlxsw_sp_port->sample->psample_group)) { + netdev_err(mlxsw_sp_port->dev, "sample already active\n"); + return -EEXIST; + } + if (tcf_sample_rate(a) > MLXSW_REG_MPSC_RATE_MAX) { + netdev_err(mlxsw_sp_port->dev, "sample rate not supported\n"); + return -EOPNOTSUPP; + } + + rcu_assign_pointer(mlxsw_sp_port->sample->psample_group, + tcf_sample_psample_group(a)); + mlxsw_sp_port->sample->truncate = tcf_sample_truncate(a); + mlxsw_sp_port->sample->trunc_size = tcf_sample_trunc_size(a); + mlxsw_sp_port->sample->rate = tcf_sample_rate(a); + + err = mlxsw_sp_port_sample_set(mlxsw_sp_port, true, tcf_sample_rate(a)); + if (err) + goto err_port_sample_set; + return 0; + +err_port_sample_set: + RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); + return err; +} + +static void +mlxsw_sp_port_del_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port) +{ + if (!mlxsw_sp_port->sample) + return; + + mlxsw_sp_port_sample_set(mlxsw_sp_port, false, 1); + RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); +} + +static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_cls_matchall_offload *f, + bool ingress) +{ + struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; + __be16 protocol = f->common.protocol; + const struct tc_action *a; + LIST_HEAD(actions); + int err; + + if (!tcf_exts_has_one_action(f->exts)) { + netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n"); + return -EOPNOTSUPP; + } + + mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); + if (!mall_tc_entry) + return -ENOMEM; + mall_tc_entry->cookie = f->cookie; + + tcf_exts_to_list(f->exts, &actions); + a = list_first_entry(&actions, struct tc_action, list); + + if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) { + struct mlxsw_sp_port_mall_mirror_tc_entry *mirror; + + mall_tc_entry->type = MLXSW_SP_PORT_MALL_MIRROR; + mirror = &mall_tc_entry->mirror; + err = mlxsw_sp_port_add_cls_matchall_mirror(mlxsw_sp_port, + mirror, a, ingress); + } else if (is_tcf_sample(a) && protocol == htons(ETH_P_ALL)) { + mall_tc_entry->type = MLXSW_SP_PORT_MALL_SAMPLE; + err = mlxsw_sp_port_add_cls_matchall_sample(mlxsw_sp_port, f, + a, ingress); + } else { + err = -EOPNOTSUPP; + } + + if (err) + goto err_add_action; + + list_add_tail(&mall_tc_entry->list, &mlxsw_sp_port->mall_tc_list); + return 0; + +err_add_action: + kfree(mall_tc_entry); + return err; +} + +static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_cls_matchall_offload *f) +{ + struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; + + mall_tc_entry = mlxsw_sp_port_mall_tc_entry_find(mlxsw_sp_port, + f->cookie); + if (!mall_tc_entry) { + netdev_dbg(mlxsw_sp_port->dev, "tc entry not found on port\n"); + return; + } + list_del(&mall_tc_entry->list); + + switch (mall_tc_entry->type) { + case MLXSW_SP_PORT_MALL_MIRROR: + mlxsw_sp_port_del_cls_matchall_mirror(mlxsw_sp_port, + &mall_tc_entry->mirror); + break; + case MLXSW_SP_PORT_MALL_SAMPLE: + mlxsw_sp_port_del_cls_matchall_sample(mlxsw_sp_port); + break; + default: + WARN_ON(1); + } + + kfree(mall_tc_entry); +} + +static int 
mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_cls_matchall_offload *f, + bool ingress) +{ + switch (f->command) { + case TC_CLSMATCHALL_REPLACE: + return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port, f, + ingress); + case TC_CLSMATCHALL_DESTROY: + mlxsw_sp_port_del_cls_matchall(mlxsw_sp_port, f); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int +mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_acl_block *acl_block, + struct tc_cls_flower_offload *f) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_acl_block_mlxsw_sp(acl_block); + + switch (f->command) { + case TC_CLSFLOWER_REPLACE: + return mlxsw_sp_flower_replace(mlxsw_sp, acl_block, f); + case TC_CLSFLOWER_DESTROY: + mlxsw_sp_flower_destroy(mlxsw_sp, acl_block, f); + return 0; + case TC_CLSFLOWER_STATS: + return mlxsw_sp_flower_stats(mlxsw_sp, acl_block, f); + default: + return -EOPNOTSUPP; + } +} + +static int mlxsw_sp_setup_tc_block_cb_matchall(enum tc_setup_type type, + void *type_data, + void *cb_priv, bool ingress) +{ + struct mlxsw_sp_port *mlxsw_sp_port = cb_priv; + + switch (type) { + case TC_SETUP_CLSMATCHALL: + if (!tc_cls_can_offload_and_chain0(mlxsw_sp_port->dev, + type_data)) + return -EOPNOTSUPP; + + return mlxsw_sp_setup_tc_cls_matchall(mlxsw_sp_port, type_data, + ingress); + case TC_SETUP_CLSFLOWER: + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int mlxsw_sp_setup_tc_block_cb_matchall_ig(enum tc_setup_type type, + void *type_data, + void *cb_priv) +{ + return mlxsw_sp_setup_tc_block_cb_matchall(type, type_data, + cb_priv, true); +} + +static int mlxsw_sp_setup_tc_block_cb_matchall_eg(enum tc_setup_type type, + void *type_data, + void *cb_priv) +{ + return mlxsw_sp_setup_tc_block_cb_matchall(type, type_data, + cb_priv, false); +} + +static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type, + void *type_data, void *cb_priv) +{ + struct mlxsw_sp_acl_block *acl_block = cb_priv; + + switch (type) { + case TC_SETUP_CLSMATCHALL: + return 0; + case TC_SETUP_CLSFLOWER: + if (mlxsw_sp_acl_block_disabled(acl_block)) + return -EOPNOTSUPP; + + return mlxsw_sp_setup_tc_cls_flower(acl_block, type_data); + default: + return -EOPNOTSUPP; + } +} + +static int +mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, + struct tcf_block *block, bool ingress) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_acl_block *acl_block; + struct tcf_block_cb *block_cb; + int err; + + block_cb = tcf_block_cb_lookup(block, mlxsw_sp_setup_tc_block_cb_flower, + mlxsw_sp); + if (!block_cb) { + acl_block = mlxsw_sp_acl_block_create(mlxsw_sp, block->net); + if (!acl_block) + return -ENOMEM; + block_cb = __tcf_block_cb_register(block, + mlxsw_sp_setup_tc_block_cb_flower, + mlxsw_sp, acl_block); + if (IS_ERR(block_cb)) { + err = PTR_ERR(block_cb); + goto err_cb_register; + } + } else { + acl_block = tcf_block_cb_priv(block_cb); + } + tcf_block_cb_incref(block_cb); + err = mlxsw_sp_acl_block_bind(mlxsw_sp, acl_block, + mlxsw_sp_port, ingress); + if (err) + goto err_block_bind; + + if (ingress) + mlxsw_sp_port->ing_acl_block = acl_block; + else + mlxsw_sp_port->eg_acl_block = acl_block; + + return 0; + +err_block_bind: + if (!tcf_block_cb_decref(block_cb)) { + __tcf_block_cb_unregister(block_cb); +err_cb_register: + mlxsw_sp_acl_block_destroy(acl_block); + } + return err; +} + +static void +mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port, + struct tcf_block *block, bool ingress) +{ + struct mlxsw_sp *mlxsw_sp = 
mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_acl_block *acl_block; + struct tcf_block_cb *block_cb; + int err; + + block_cb = tcf_block_cb_lookup(block, mlxsw_sp_setup_tc_block_cb_flower, + mlxsw_sp); + if (!block_cb) + return; + + if (ingress) + mlxsw_sp_port->ing_acl_block = NULL; + else + mlxsw_sp_port->eg_acl_block = NULL; + + acl_block = tcf_block_cb_priv(block_cb); + err = mlxsw_sp_acl_block_unbind(mlxsw_sp, acl_block, + mlxsw_sp_port, ingress); + if (!err && !tcf_block_cb_decref(block_cb)) { + __tcf_block_cb_unregister(block_cb); + mlxsw_sp_acl_block_destroy(acl_block); + } +} + +static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_block_offload *f) +{ + tc_setup_cb_t *cb; + bool ingress; + int err; + + if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) { + cb = mlxsw_sp_setup_tc_block_cb_matchall_ig; + ingress = true; + } else if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS) { + cb = mlxsw_sp_setup_tc_block_cb_matchall_eg; + ingress = false; + } else { + return -EOPNOTSUPP; + } + + switch (f->command) { + case TC_BLOCK_BIND: + err = tcf_block_cb_register(f->block, cb, mlxsw_sp_port, + mlxsw_sp_port); + if (err) + return err; + err = mlxsw_sp_setup_tc_block_flower_bind(mlxsw_sp_port, + f->block, ingress); + if (err) { + tcf_block_cb_unregister(f->block, cb, mlxsw_sp_port); + return err; + } + return 0; + case TC_BLOCK_UNBIND: + mlxsw_sp_setup_tc_block_flower_unbind(mlxsw_sp_port, + f->block, ingress); + tcf_block_cb_unregister(f->block, cb, mlxsw_sp_port); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + + switch (type) { + case TC_SETUP_BLOCK: + return mlxsw_sp_setup_tc_block(mlxsw_sp_port, type_data); + case TC_SETUP_QDISC_RED: + return mlxsw_sp_setup_tc_red(mlxsw_sp_port, type_data); + case TC_SETUP_QDISC_PRIO: + return mlxsw_sp_setup_tc_prio(mlxsw_sp_port, type_data); + default: + return -EOPNOTSUPP; + } +} + + +static int mlxsw_sp_feature_hw_tc(struct net_device *dev, bool enable) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + + if (!enable) { + if (mlxsw_sp_acl_block_rule_count(mlxsw_sp_port->ing_acl_block) || + mlxsw_sp_acl_block_rule_count(mlxsw_sp_port->eg_acl_block) || + !list_empty(&mlxsw_sp_port->mall_tc_list)) { + netdev_err(dev, "Active offloaded tc filters, can't turn hw_tc_offload off\n"); + return -EINVAL; + } + mlxsw_sp_acl_block_disable_inc(mlxsw_sp_port->ing_acl_block); + mlxsw_sp_acl_block_disable_inc(mlxsw_sp_port->eg_acl_block); + } else { + mlxsw_sp_acl_block_disable_dec(mlxsw_sp_port->ing_acl_block); + mlxsw_sp_acl_block_disable_dec(mlxsw_sp_port->eg_acl_block); + } + return 0; +} + +typedef int (*mlxsw_sp_feature_handler)(struct net_device *dev, bool enable); + +static int mlxsw_sp_handle_feature(struct net_device *dev, + netdev_features_t wanted_features, + netdev_features_t feature, + mlxsw_sp_feature_handler feature_handler) +{ + netdev_features_t changes = wanted_features ^ dev->features; + bool enable = !!(wanted_features & feature); + int err; + + if (!(changes & feature)) + return 0; + + err = feature_handler(dev, enable); + if (err) { + netdev_err(dev, "%s feature %pNF failed, err %d\n", + enable ? 
"Enable" : "Disable", &feature, err); + return err; + } + + if (enable) + dev->features |= feature; + else + dev->features &= ~feature; + + return 0; +} +static int mlxsw_sp_set_features(struct net_device *dev, + netdev_features_t features) +{ + return mlxsw_sp_handle_feature(dev, features, NETIF_F_HW_TC, + mlxsw_sp_feature_hw_tc); +} + +static const struct net_device_ops mlxsw_sp_port_netdev_ops = { + .ndo_open = mlxsw_sp_port_open, + .ndo_stop = mlxsw_sp_port_stop, + .ndo_start_xmit = mlxsw_sp_port_xmit, + .ndo_setup_tc = mlxsw_sp_setup_tc, + .ndo_set_rx_mode = mlxsw_sp_set_rx_mode, + .ndo_set_mac_address = mlxsw_sp_port_set_mac_address, + .ndo_change_mtu = mlxsw_sp_port_change_mtu, + .ndo_get_stats64 = mlxsw_sp_port_get_stats64, + .ndo_has_offload_stats = mlxsw_sp_port_has_offload_stats, + .ndo_get_offload_stats = mlxsw_sp_port_get_offload_stats, + .ndo_vlan_rx_add_vid = mlxsw_sp_port_add_vid, + .ndo_vlan_rx_kill_vid = mlxsw_sp_port_kill_vid, + .ndo_get_phys_port_name = mlxsw_sp_port_get_phys_port_name, + .ndo_set_features = mlxsw_sp_set_features, +}; + +static void mlxsw_sp_port_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + + strlcpy(drvinfo->driver, mlxsw_sp_driver_name, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, mlxsw_sp_driver_version, + sizeof(drvinfo->version)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", + mlxsw_sp->bus_info->fw_rev.major, + mlxsw_sp->bus_info->fw_rev.minor, + mlxsw_sp->bus_info->fw_rev.subminor); + strlcpy(drvinfo->bus_info, mlxsw_sp->bus_info->device_name, + sizeof(drvinfo->bus_info)); +} + +static void mlxsw_sp_port_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + + pause->rx_pause = mlxsw_sp_port->link.rx_pause; + pause->tx_pause = mlxsw_sp_port->link.tx_pause; +} + +static int mlxsw_sp_port_pause_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct ethtool_pauseparam *pause) +{ + char pfcc_pl[MLXSW_REG_PFCC_LEN]; + + mlxsw_reg_pfcc_pack(pfcc_pl, mlxsw_sp_port->local_port); + mlxsw_reg_pfcc_pprx_set(pfcc_pl, pause->rx_pause); + mlxsw_reg_pfcc_pptx_set(pfcc_pl, pause->tx_pause); + + return mlxsw_reg_write(mlxsw_sp_port->mlxsw_sp->core, MLXSW_REG(pfcc), + pfcc_pl); +} + +static int mlxsw_sp_port_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + bool pause_en = pause->tx_pause || pause->rx_pause; + int err; + + if (mlxsw_sp_port->dcb.pfc && mlxsw_sp_port->dcb.pfc->pfc_en) { + netdev_err(dev, "PFC already enabled on port\n"); + return -EINVAL; + } + + if (pause->autoneg) { + netdev_err(dev, "PAUSE frames autonegotiation isn't supported\n"); + return -EINVAL; + } + + err = mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu, pause_en); + if (err) { + netdev_err(dev, "Failed to configure port's headroom\n"); + return err; + } + + err = mlxsw_sp_port_pause_set(mlxsw_sp_port, pause); + if (err) { + netdev_err(dev, "Failed to set PAUSE parameters\n"); + goto err_port_pause_configure; + } + + mlxsw_sp_port->link.rx_pause = pause->rx_pause; + mlxsw_sp_port->link.tx_pause = pause->tx_pause; + + return 0; + +err_port_pause_configure: + pause_en = mlxsw_sp_port_is_pause_en(mlxsw_sp_port); + mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu, pause_en); + return err; +} + +struct mlxsw_sp_port_hw_stats { + char 
str[ETH_GSTRING_LEN]; + u64 (*getter)(const char *payload); + bool cells_bytes; +}; + +static struct mlxsw_sp_port_hw_stats mlxsw_sp_port_hw_stats[] = { + { + .str = "a_frames_transmitted_ok", + .getter = mlxsw_reg_ppcnt_a_frames_transmitted_ok_get, + }, + { + .str = "a_frames_received_ok", + .getter = mlxsw_reg_ppcnt_a_frames_received_ok_get, + }, + { + .str = "a_frame_check_sequence_errors", + .getter = mlxsw_reg_ppcnt_a_frame_check_sequence_errors_get, + }, + { + .str = "a_alignment_errors", + .getter = mlxsw_reg_ppcnt_a_alignment_errors_get, + }, + { + .str = "a_octets_transmitted_ok", + .getter = mlxsw_reg_ppcnt_a_octets_transmitted_ok_get, + }, + { + .str = "a_octets_received_ok", + .getter = mlxsw_reg_ppcnt_a_octets_received_ok_get, + }, + { + .str = "a_multicast_frames_xmitted_ok", + .getter = mlxsw_reg_ppcnt_a_multicast_frames_xmitted_ok_get, + }, + { + .str = "a_broadcast_frames_xmitted_ok", + .getter = mlxsw_reg_ppcnt_a_broadcast_frames_xmitted_ok_get, + }, + { + .str = "a_multicast_frames_received_ok", + .getter = mlxsw_reg_ppcnt_a_multicast_frames_received_ok_get, + }, + { + .str = "a_broadcast_frames_received_ok", + .getter = mlxsw_reg_ppcnt_a_broadcast_frames_received_ok_get, + }, + { + .str = "a_in_range_length_errors", + .getter = mlxsw_reg_ppcnt_a_in_range_length_errors_get, + }, + { + .str = "a_out_of_range_length_field", + .getter = mlxsw_reg_ppcnt_a_out_of_range_length_field_get, + }, + { + .str = "a_frame_too_long_errors", + .getter = mlxsw_reg_ppcnt_a_frame_too_long_errors_get, + }, + { + .str = "a_symbol_error_during_carrier", + .getter = mlxsw_reg_ppcnt_a_symbol_error_during_carrier_get, + }, + { + .str = "a_mac_control_frames_transmitted", + .getter = mlxsw_reg_ppcnt_a_mac_control_frames_transmitted_get, + }, + { + .str = "a_mac_control_frames_received", + .getter = mlxsw_reg_ppcnt_a_mac_control_frames_received_get, + }, + { + .str = "a_unsupported_opcodes_received", + .getter = mlxsw_reg_ppcnt_a_unsupported_opcodes_received_get, + }, + { + .str = "a_pause_mac_ctrl_frames_received", + .getter = mlxsw_reg_ppcnt_a_pause_mac_ctrl_frames_received_get, + }, + { + .str = "a_pause_mac_ctrl_frames_xmitted", + .getter = mlxsw_reg_ppcnt_a_pause_mac_ctrl_frames_transmitted_get, + }, +}; + +#define MLXSW_SP_PORT_HW_STATS_LEN ARRAY_SIZE(mlxsw_sp_port_hw_stats) + +static struct mlxsw_sp_port_hw_stats mlxsw_sp_port_hw_prio_stats[] = { + { + .str = "rx_octets_prio", + .getter = mlxsw_reg_ppcnt_rx_octets_get, + }, + { + .str = "rx_frames_prio", + .getter = mlxsw_reg_ppcnt_rx_frames_get, + }, + { + .str = "tx_octets_prio", + .getter = mlxsw_reg_ppcnt_tx_octets_get, + }, + { + .str = "tx_frames_prio", + .getter = mlxsw_reg_ppcnt_tx_frames_get, + }, + { + .str = "rx_pause_prio", + .getter = mlxsw_reg_ppcnt_rx_pause_get, + }, + { + .str = "rx_pause_duration_prio", + .getter = mlxsw_reg_ppcnt_rx_pause_duration_get, + }, + { + .str = "tx_pause_prio", + .getter = mlxsw_reg_ppcnt_tx_pause_get, + }, + { + .str = "tx_pause_duration_prio", + .getter = mlxsw_reg_ppcnt_tx_pause_duration_get, + }, +}; + +#define MLXSW_SP_PORT_HW_PRIO_STATS_LEN ARRAY_SIZE(mlxsw_sp_port_hw_prio_stats) + +static struct mlxsw_sp_port_hw_stats mlxsw_sp_port_hw_tc_stats[] = { + { + .str = "tc_transmit_queue_tc", + .getter = mlxsw_reg_ppcnt_tc_transmit_queue_get, + .cells_bytes = true, + }, + { + .str = "tc_no_buffer_discard_uc_tc", + .getter = mlxsw_reg_ppcnt_tc_no_buffer_discard_uc_get, + }, +}; + +#define MLXSW_SP_PORT_HW_TC_STATS_LEN ARRAY_SIZE(mlxsw_sp_port_hw_tc_stats) + +#define MLXSW_SP_PORT_ETHTOOL_STATS_LEN 
(MLXSW_SP_PORT_HW_STATS_LEN + \ + (MLXSW_SP_PORT_HW_PRIO_STATS_LEN + \ + MLXSW_SP_PORT_HW_TC_STATS_LEN) * \ + IEEE_8021QAZ_MAX_TCS) + +static void mlxsw_sp_port_get_prio_strings(u8 **p, int prio) +{ + int i; + + for (i = 0; i < MLXSW_SP_PORT_HW_PRIO_STATS_LEN; i++) { + snprintf(*p, ETH_GSTRING_LEN, "%s_%d", + mlxsw_sp_port_hw_prio_stats[i].str, prio); + *p += ETH_GSTRING_LEN; + } +} + +static void mlxsw_sp_port_get_tc_strings(u8 **p, int tc) +{ + int i; + + for (i = 0; i < MLXSW_SP_PORT_HW_TC_STATS_LEN; i++) { + snprintf(*p, ETH_GSTRING_LEN, "%s_%d", + mlxsw_sp_port_hw_tc_stats[i].str, tc); + *p += ETH_GSTRING_LEN; + } +} + +static void mlxsw_sp_port_get_strings(struct net_device *dev, + u32 stringset, u8 *data) +{ + u8 *p = data; + int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < MLXSW_SP_PORT_HW_STATS_LEN; i++) { + memcpy(p, mlxsw_sp_port_hw_stats[i].str, + ETH_GSTRING_LEN); + p += ETH_GSTRING_LEN; + } + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + mlxsw_sp_port_get_prio_strings(&p, i); + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + mlxsw_sp_port_get_tc_strings(&p, i); + + break; + } +} + +static int mlxsw_sp_port_set_phys_id(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char mlcr_pl[MLXSW_REG_MLCR_LEN]; + bool active; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + active = true; + break; + case ETHTOOL_ID_INACTIVE: + active = false; + break; + default: + return -EOPNOTSUPP; + } + + mlxsw_reg_mlcr_pack(mlcr_pl, mlxsw_sp_port->local_port, active); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mlcr), mlcr_pl); +} + +static int +mlxsw_sp_get_hw_stats_by_group(struct mlxsw_sp_port_hw_stats **p_hw_stats, + int *p_len, enum mlxsw_reg_ppcnt_grp grp) +{ + switch (grp) { + case MLXSW_REG_PPCNT_IEEE_8023_CNT: + *p_hw_stats = mlxsw_sp_port_hw_stats; + *p_len = MLXSW_SP_PORT_HW_STATS_LEN; + break; + case MLXSW_REG_PPCNT_PRIO_CNT: + *p_hw_stats = mlxsw_sp_port_hw_prio_stats; + *p_len = MLXSW_SP_PORT_HW_PRIO_STATS_LEN; + break; + case MLXSW_REG_PPCNT_TC_CNT: + *p_hw_stats = mlxsw_sp_port_hw_tc_stats; + *p_len = MLXSW_SP_PORT_HW_TC_STATS_LEN; + break; + default: + WARN_ON(1); + return -EOPNOTSUPP; + } + return 0; +} + +static void __mlxsw_sp_port_get_stats(struct net_device *dev, + enum mlxsw_reg_ppcnt_grp grp, int prio, + u64 *data, int data_index) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_port_hw_stats *hw_stats; + char ppcnt_pl[MLXSW_REG_PPCNT_LEN]; + int i, len; + int err; + + err = mlxsw_sp_get_hw_stats_by_group(&hw_stats, &len, grp); + if (err) + return; + mlxsw_sp_port_get_stats_raw(dev, grp, prio, ppcnt_pl); + for (i = 0; i < len; i++) { + data[data_index + i] = hw_stats[i].getter(ppcnt_pl); + if (!hw_stats[i].cells_bytes) + continue; + data[data_index + i] = mlxsw_sp_cells_bytes(mlxsw_sp, + data[data_index + i]); + } +} + +static void mlxsw_sp_port_get_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + int i, data_index = 0; + + /* IEEE 802.3 Counters */ + __mlxsw_sp_port_get_stats(dev, MLXSW_REG_PPCNT_IEEE_8023_CNT, 0, + data, data_index); + data_index = MLXSW_SP_PORT_HW_STATS_LEN; + + /* Per-Priority Counters */ + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + __mlxsw_sp_port_get_stats(dev, MLXSW_REG_PPCNT_PRIO_CNT, i, + data, data_index); + data_index += MLXSW_SP_PORT_HW_PRIO_STATS_LEN; + } + + /* Per-TC Counters 
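+ * (per traffic class: transmit queue depth, converted from cells to bytes, and no-buffer/tail-drop discards)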
*/ + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + __mlxsw_sp_port_get_stats(dev, MLXSW_REG_PPCNT_TC_CNT, i, + data, data_index); + data_index += MLXSW_SP_PORT_HW_TC_STATS_LEN; + } +} + +static int mlxsw_sp_port_get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return MLXSW_SP_PORT_ETHTOOL_STATS_LEN; + default: + return -EOPNOTSUPP; + } +} + +struct mlxsw_sp_port_link_mode { + enum ethtool_link_mode_bit_indices mask_ethtool; + u32 mask; + u32 speed; +}; + +static const struct mlxsw_sp_port_link_mode mlxsw_sp_port_link_mode[] = { + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_100BASE_T, + .mask_ethtool = ETHTOOL_LINK_MODE_100baseT_Full_BIT, + .speed = SPEED_100, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_SGMII | + MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX, + .mask_ethtool = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + .speed = SPEED_1000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_10GBASE_T, + .mask_ethtool = ETHTOOL_LINK_MODE_10000baseT_Full_BIT, + .speed = SPEED_10000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CX4 | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4, + .mask_ethtool = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, + .speed = SPEED_10000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_ER_LR, + .mask_ethtool = ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + .speed = SPEED_10000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_20GBASE_KR2, + .mask_ethtool = ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT, + .speed = SPEED_20000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4, + .mask_ethtool = ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT, + .speed = SPEED_40000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4, + .mask_ethtool = ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT, + .speed = SPEED_40000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4, + .mask_ethtool = ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT, + .speed = SPEED_40000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_40GBASE_LR4_ER4, + .mask_ethtool = ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT, + .speed = SPEED_40000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_25GBASE_CR, + .mask_ethtool = ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, + .speed = SPEED_25000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_25GBASE_KR, + .mask_ethtool = ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, + .speed = SPEED_25000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_25GBASE_SR, + .mask_ethtool = ETHTOOL_LINK_MODE_25000baseSR_Full_BIT, + .speed = SPEED_25000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_25GBASE_SR, + .mask_ethtool = ETHTOOL_LINK_MODE_25000baseSR_Full_BIT, + .speed = SPEED_25000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_50GBASE_CR2, + .mask_ethtool = ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT, + .speed = SPEED_50000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_50GBASE_KR2, + .mask_ethtool = ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT, + .speed = SPEED_50000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_50GBASE_SR2, + .mask_ethtool = ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT, + .speed = SPEED_50000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4, + .mask_ethtool = ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT, + .speed = SPEED_56000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4, + .mask_ethtool = ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT, + .speed = SPEED_56000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4, + .mask_ethtool = ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT, + .speed = SPEED_56000, + }, 
+ { + .mask = MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4, + .mask_ethtool = ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT, + .speed = SPEED_56000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4, + .mask_ethtool = ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, + .speed = SPEED_100000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4, + .mask_ethtool = ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT, + .speed = SPEED_100000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4, + .mask_ethtool = ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, + .speed = SPEED_100000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_100GBASE_LR4_ER4, + .mask_ethtool = ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT, + .speed = SPEED_100000, + }, +}; + +#define MLXSW_SP_PORT_LINK_MODE_LEN ARRAY_SIZE(mlxsw_sp_port_link_mode) + +static void +mlxsw_sp_from_ptys_supported_port(u32 ptys_eth_proto, + struct ethtool_link_ksettings *cmd) +{ + if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4 | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 | + MLXSW_REG_PTYS_ETH_SPEED_SGMII)) + ethtool_link_ksettings_add_link_mode(cmd, supported, FIBRE); + + if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4 | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4 | + MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX)) + ethtool_link_ksettings_add_link_mode(cmd, supported, Backplane); +} + +static void mlxsw_sp_from_ptys_link(u32 ptys_eth_proto, unsigned long *mode) +{ + int i; + + for (i = 0; i < MLXSW_SP_PORT_LINK_MODE_LEN; i++) { + if (ptys_eth_proto & mlxsw_sp_port_link_mode[i].mask) + __set_bit(mlxsw_sp_port_link_mode[i].mask_ethtool, + mode); + } +} + +static void mlxsw_sp_from_ptys_speed_duplex(bool carrier_ok, u32 ptys_eth_proto, + struct ethtool_link_ksettings *cmd) +{ + u32 speed = SPEED_UNKNOWN; + u8 duplex = DUPLEX_UNKNOWN; + int i; + + if (!carrier_ok) + goto out; + + for (i = 0; i < MLXSW_SP_PORT_LINK_MODE_LEN; i++) { + if (ptys_eth_proto & mlxsw_sp_port_link_mode[i].mask) { + speed = mlxsw_sp_port_link_mode[i].speed; + duplex = DUPLEX_FULL; + break; + } + } +out: + cmd->base.speed = speed; + cmd->base.duplex = duplex; +} + +static u8 mlxsw_sp_port_connector_port(u32 ptys_eth_proto) +{ + if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 | + MLXSW_REG_PTYS_ETH_SPEED_SGMII)) + return PORT_FIBRE; + + if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4)) + return PORT_DA; + + if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4 | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4)) + return PORT_NONE; + + return PORT_OTHER; +} + +static u32 +mlxsw_sp_to_ptys_advert_link(const struct ethtool_link_ksettings *cmd) +{ + u32 ptys_proto = 0; + int i; + + for (i = 0; i < MLXSW_SP_PORT_LINK_MODE_LEN; i++) { + if (test_bit(mlxsw_sp_port_link_mode[i].mask_ethtool, + cmd->link_modes.advertising)) + ptys_proto |= mlxsw_sp_port_link_mode[i].mask; + } + return ptys_proto; +} + +static u32 mlxsw_sp_to_ptys_speed(u32 speed) +{ + u32 ptys_proto = 0; + int i; + + for (i = 0; i < MLXSW_SP_PORT_LINK_MODE_LEN; i++) { + if (speed == mlxsw_sp_port_link_mode[i].speed) + ptys_proto |= mlxsw_sp_port_link_mode[i].mask; + } + return 
ptys_proto; +} + +static u32 mlxsw_sp_to_ptys_upper_speed(u32 upper_speed) +{ + u32 ptys_proto = 0; + int i; + + for (i = 0; i < MLXSW_SP_PORT_LINK_MODE_LEN; i++) { + if (mlxsw_sp_port_link_mode[i].speed <= upper_speed) + ptys_proto |= mlxsw_sp_port_link_mode[i].mask; + } + return ptys_proto; +} + +static void mlxsw_sp_port_get_link_supported(u32 eth_proto_cap, + struct ethtool_link_ksettings *cmd) +{ + ethtool_link_ksettings_add_link_mode(cmd, supported, Asym_Pause); + ethtool_link_ksettings_add_link_mode(cmd, supported, Autoneg); + ethtool_link_ksettings_add_link_mode(cmd, supported, Pause); + + mlxsw_sp_from_ptys_supported_port(eth_proto_cap, cmd); + mlxsw_sp_from_ptys_link(eth_proto_cap, cmd->link_modes.supported); +} + +static void mlxsw_sp_port_get_link_advertise(u32 eth_proto_admin, bool autoneg, + struct ethtool_link_ksettings *cmd) +{ + if (!autoneg) + return; + + ethtool_link_ksettings_add_link_mode(cmd, advertising, Autoneg); + mlxsw_sp_from_ptys_link(eth_proto_admin, cmd->link_modes.advertising); +} + +static void +mlxsw_sp_port_get_link_lp_advertise(u32 eth_proto_lp, u8 autoneg_status, + struct ethtool_link_ksettings *cmd) +{ + if (autoneg_status != MLXSW_REG_PTYS_AN_STATUS_OK || !eth_proto_lp) + return; + + ethtool_link_ksettings_add_link_mode(cmd, lp_advertising, Autoneg); + mlxsw_sp_from_ptys_link(eth_proto_lp, cmd->link_modes.lp_advertising); +} + +static int mlxsw_sp_port_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + u32 eth_proto_cap, eth_proto_admin, eth_proto_oper, eth_proto_lp; + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char ptys_pl[MLXSW_REG_PTYS_LEN]; + u8 autoneg_status; + bool autoneg; + int err; + + autoneg = mlxsw_sp_port->link.autoneg; + mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sp_port->local_port, 0, false); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl); + if (err) + return err; + mlxsw_reg_ptys_eth_unpack(ptys_pl, &eth_proto_cap, &eth_proto_admin, + &eth_proto_oper); + + mlxsw_sp_port_get_link_supported(eth_proto_cap, cmd); + + mlxsw_sp_port_get_link_advertise(eth_proto_admin, autoneg, cmd); + + eth_proto_lp = mlxsw_reg_ptys_eth_proto_lp_advertise_get(ptys_pl); + autoneg_status = mlxsw_reg_ptys_an_status_get(ptys_pl); + mlxsw_sp_port_get_link_lp_advertise(eth_proto_lp, autoneg_status, cmd); + + cmd->base.autoneg = autoneg ? AUTONEG_ENABLE : AUTONEG_DISABLE; + cmd->base.port = mlxsw_sp_port_connector_port(eth_proto_oper); + mlxsw_sp_from_ptys_speed_duplex(netif_carrier_ok(dev), eth_proto_oper, + cmd); + + return 0; +} + +static int +mlxsw_sp_port_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char ptys_pl[MLXSW_REG_PTYS_LEN]; + u32 eth_proto_cap, eth_proto_new; + bool autoneg; + int err; + + mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sp_port->local_port, 0, false); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl); + if (err) + return err; + mlxsw_reg_ptys_eth_unpack(ptys_pl, &eth_proto_cap, NULL, NULL); + + autoneg = cmd->base.autoneg == AUTONEG_ENABLE; + eth_proto_new = autoneg ?
+ mlxsw_sp_to_ptys_advert_link(cmd) : + mlxsw_sp_to_ptys_speed(cmd->base.speed); + + eth_proto_new = eth_proto_new & eth_proto_cap; + if (!eth_proto_new) { + netdev_err(dev, "No supported speed requested\n"); + return -EINVAL; + } + + mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sp_port->local_port, + eth_proto_new, autoneg); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl); + if (err) + return err; + + if (!netif_running(dev)) + return 0; + + mlxsw_sp_port->link.autoneg = autoneg; + + mlxsw_sp_port_admin_status_set(mlxsw_sp_port, false); + mlxsw_sp_port_admin_status_set(mlxsw_sp_port, true); + + return 0; +} + +static int mlxsw_sp_flash_device(struct net_device *dev, + struct ethtool_flash *flash) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + const struct firmware *firmware; + int err; + + if (flash->region != ETHTOOL_FLASH_ALL_REGIONS) + return -EOPNOTSUPP; + + dev_hold(dev); + rtnl_unlock(); + + err = request_firmware_direct(&firmware, flash->data, &dev->dev); + if (err) + goto out; + err = mlxsw_sp_firmware_flash(mlxsw_sp, firmware); + release_firmware(firmware); +out: + rtnl_lock(); + dev_put(dev); + return err; +} + +#define MLXSW_SP_I2C_ADDR_LOW 0x50 +#define MLXSW_SP_I2C_ADDR_HIGH 0x51 +#define MLXSW_SP_EEPROM_PAGE_LENGTH 256 + +static int mlxsw_sp_query_module_eeprom(struct mlxsw_sp_port *mlxsw_sp_port, + u16 offset, u16 size, void *data, + unsigned int *p_read_size) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char eeprom_tmp[MLXSW_SP_REG_MCIA_EEPROM_SIZE]; + char mcia_pl[MLXSW_REG_MCIA_LEN]; + u16 i2c_addr; + int status; + int err; + + size = min_t(u16, size, MLXSW_SP_REG_MCIA_EEPROM_SIZE); + + if (offset < MLXSW_SP_EEPROM_PAGE_LENGTH && + offset + size > MLXSW_SP_EEPROM_PAGE_LENGTH) + /* Cross pages read, read until offset 256 in low page */ + size = MLXSW_SP_EEPROM_PAGE_LENGTH - offset; + + i2c_addr = MLXSW_SP_I2C_ADDR_LOW; + if (offset >= MLXSW_SP_EEPROM_PAGE_LENGTH) { + i2c_addr = MLXSW_SP_I2C_ADDR_HIGH; + offset -= MLXSW_SP_EEPROM_PAGE_LENGTH; + } + + mlxsw_reg_mcia_pack(mcia_pl, mlxsw_sp_port->mapping.module, + 0, 0, offset, size, i2c_addr); + + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mcia), mcia_pl); + if (err) + return err; + + status = mlxsw_reg_mcia_status_get(mcia_pl); + if (status) + return -EIO; + + mlxsw_reg_mcia_eeprom_memcpy_from(mcia_pl, eeprom_tmp); + memcpy(data, eeprom_tmp, size); + *p_read_size = size; + + return 0; +} + +enum mlxsw_sp_eeprom_module_info_rev_id { + MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_UNSPC = 0x00, + MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_8436 = 0x01, + MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_8636 = 0x03, +}; + +enum mlxsw_sp_eeprom_module_info_id { + MLXSW_SP_EEPROM_MODULE_INFO_ID_SFP = 0x03, + MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP = 0x0C, + MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP_PLUS = 0x0D, + MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28 = 0x11, +}; + +enum mlxsw_sp_eeprom_module_info { + MLXSW_SP_EEPROM_MODULE_INFO_ID, + MLXSW_SP_EEPROM_MODULE_INFO_REV_ID, + MLXSW_SP_EEPROM_MODULE_INFO_SIZE, +}; + +static int mlxsw_sp_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(netdev); + u8 module_info[MLXSW_SP_EEPROM_MODULE_INFO_SIZE]; + u8 module_rev_id, module_id; + unsigned int read_size; + int err; + + err = mlxsw_sp_query_module_eeprom(mlxsw_sp_port, 0, + MLXSW_SP_EEPROM_MODULE_INFO_SIZE, + module_info, &read_size); + if (err) + return err; + + if (read_size < 
MLXSW_SP_EEPROM_MODULE_INFO_SIZE) + return -EIO; + + module_rev_id = module_info[MLXSW_SP_EEPROM_MODULE_INFO_REV_ID]; + module_id = module_info[MLXSW_SP_EEPROM_MODULE_INFO_ID]; + + switch (module_id) { + case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP: + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; + break; + case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP_PLUS: + case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28: + if (module_id == MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28 || + module_rev_id >= MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_8636) { + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; + } + break; + case MLXSW_SP_EEPROM_MODULE_INFO_ID_SFP: + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlxsw_sp_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, + u8 *data) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(netdev); + int offset = ee->offset; + unsigned int read_size; + int i = 0; + int err; + + if (!ee->len) + return -EINVAL; + + memset(data, 0, ee->len); + + while (i < ee->len) { + err = mlxsw_sp_query_module_eeprom(mlxsw_sp_port, offset, + ee->len - i, data + i, + &read_size); + if (err) { + netdev_err(mlxsw_sp_port->dev, "Eeprom query failed\n"); + return err; + } + + i += read_size; + offset += read_size; + } + + return 0; +} + +static const struct ethtool_ops mlxsw_sp_port_ethtool_ops = { + .get_drvinfo = mlxsw_sp_port_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_pauseparam = mlxsw_sp_port_get_pauseparam, + .set_pauseparam = mlxsw_sp_port_set_pauseparam, + .get_strings = mlxsw_sp_port_get_strings, + .set_phys_id = mlxsw_sp_port_set_phys_id, + .get_ethtool_stats = mlxsw_sp_port_get_stats, + .get_sset_count = mlxsw_sp_port_get_sset_count, + .get_link_ksettings = mlxsw_sp_port_get_link_ksettings, + .set_link_ksettings = mlxsw_sp_port_set_link_ksettings, + .flash_device = mlxsw_sp_flash_device, + .get_module_info = mlxsw_sp_get_module_info, + .get_module_eeprom = mlxsw_sp_get_module_eeprom, +}; + +static int +mlxsw_sp_port_speed_by_width_set(struct mlxsw_sp_port *mlxsw_sp_port, u8 width) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u32 upper_speed = MLXSW_SP_PORT_BASE_SPEED * width; + char ptys_pl[MLXSW_REG_PTYS_LEN]; + u32 eth_proto_admin; + + eth_proto_admin = mlxsw_sp_to_ptys_upper_speed(upper_speed); + mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sp_port->local_port, + eth_proto_admin, mlxsw_sp_port->link.autoneg); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl); +} + +int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port, + enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index, + bool dwrr, u8 dwrr_weight) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char qeec_pl[MLXSW_REG_QEEC_LEN]; + + mlxsw_reg_qeec_pack(qeec_pl, mlxsw_sp_port->local_port, hr, index, + next_index); + mlxsw_reg_qeec_de_set(qeec_pl, true); + mlxsw_reg_qeec_dwrr_set(qeec_pl, dwrr); + mlxsw_reg_qeec_dwrr_weight_set(qeec_pl, dwrr_weight); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qeec), qeec_pl); +} + +int mlxsw_sp_port_ets_maxrate_set(struct mlxsw_sp_port *mlxsw_sp_port, + enum mlxsw_reg_qeec_hr hr, u8 index, + u8 next_index, u32 maxrate) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char qeec_pl[MLXSW_REG_QEEC_LEN]; + + 
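+ /* Enable max-shaper configuration (mase) for this scheduling element and program the requested rate; passing MLXSW_REG_QEEC_MAS_DIS disables shaping. */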
mlxsw_reg_qeec_pack(qeec_pl, mlxsw_sp_port->local_port, hr, index, + next_index); + mlxsw_reg_qeec_mase_set(qeec_pl, true); + mlxsw_reg_qeec_max_shaper_rate_set(qeec_pl, maxrate); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qeec), qeec_pl); +} + +int mlxsw_sp_port_prio_tc_set(struct mlxsw_sp_port *mlxsw_sp_port, + u8 switch_prio, u8 tclass) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char qtct_pl[MLXSW_REG_QTCT_LEN]; + + mlxsw_reg_qtct_pack(qtct_pl, mlxsw_sp_port->local_port, switch_prio, + tclass); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qtct), qtct_pl); +} + +static int mlxsw_sp_port_ets_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + int err, i; + + /* Setup the elements hierarcy, so that each TC is linked to + * one subgroup, which are all member in the same group. + */ + err = mlxsw_sp_port_ets_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HIERARCY_GROUP, 0, 0, false, + 0); + if (err) + return err; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + err = mlxsw_sp_port_ets_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HIERARCY_SUBGROUP, i, + 0, false, 0); + if (err) + return err; + } + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + err = mlxsw_sp_port_ets_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HIERARCY_TC, i, i, + false, 0); + if (err) + return err; + } + + /* Make sure the max shaper is disabled in all hierarcies that + * support it. + */ + err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HIERARCY_PORT, 0, 0, + MLXSW_REG_QEEC_MAS_DIS); + if (err) + return err; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HIERARCY_SUBGROUP, + i, 0, + MLXSW_REG_QEEC_MAS_DIS); + if (err) + return err; + } + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HIERARCY_TC, + i, i, + MLXSW_REG_QEEC_MAS_DIS); + if (err) + return err; + } + + /* Map all priorities to traffic class 0. 
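+ * Priorities can be remapped to other TCs later (e.g. by the DCB/ETS code) via mlxsw_sp_port_prio_tc_set().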
*/ + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + err = mlxsw_sp_port_prio_tc_set(mlxsw_sp_port, i, 0); + if (err) + return err; + } + + return 0; +} + +static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, + bool split, u8 module, u8 width, u8 lane) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + struct mlxsw_sp_port *mlxsw_sp_port; + struct net_device *dev; + int err; + + err = mlxsw_core_port_init(mlxsw_sp->core, local_port); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to init core port\n", + local_port); + return err; + } + + dev = alloc_etherdev(sizeof(struct mlxsw_sp_port)); + if (!dev) { + err = -ENOMEM; + goto err_alloc_etherdev; + } + SET_NETDEV_DEV(dev, mlxsw_sp->bus_info->dev); + mlxsw_sp_port = netdev_priv(dev); + mlxsw_sp_port->dev = dev; + mlxsw_sp_port->mlxsw_sp = mlxsw_sp; + mlxsw_sp_port->local_port = local_port; + mlxsw_sp_port->pvid = 1; + mlxsw_sp_port->split = split; + mlxsw_sp_port->mapping.module = module; + mlxsw_sp_port->mapping.width = width; + mlxsw_sp_port->mapping.lane = lane; + mlxsw_sp_port->link.autoneg = 1; + INIT_LIST_HEAD(&mlxsw_sp_port->vlans_list); + INIT_LIST_HEAD(&mlxsw_sp_port->mall_tc_list); + + mlxsw_sp_port->pcpu_stats = + netdev_alloc_pcpu_stats(struct mlxsw_sp_port_pcpu_stats); + if (!mlxsw_sp_port->pcpu_stats) { + err = -ENOMEM; + goto err_alloc_stats; + } + + mlxsw_sp_port->sample = kzalloc(sizeof(*mlxsw_sp_port->sample), + GFP_KERNEL); + if (!mlxsw_sp_port->sample) { + err = -ENOMEM; + goto err_alloc_sample; + } + + INIT_DELAYED_WORK(&mlxsw_sp_port->periodic_hw_stats.update_dw, + &update_stats_cache); + + dev->netdev_ops = &mlxsw_sp_port_netdev_ops; + dev->ethtool_ops = &mlxsw_sp_port_ethtool_ops; + + err = mlxsw_sp_port_module_map(mlxsw_sp_port, module, width, lane); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to map module\n", + mlxsw_sp_port->local_port); + goto err_port_module_map; + } + + err = mlxsw_sp_port_swid_set(mlxsw_sp_port, 0); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to set SWID\n", + mlxsw_sp_port->local_port); + goto err_port_swid_set; + } + + err = mlxsw_sp_port_dev_addr_init(mlxsw_sp_port); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Unable to init port mac address\n", + mlxsw_sp_port->local_port); + goto err_dev_addr_init; + } + + netif_carrier_off(dev); + + dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_LLTX | NETIF_F_SG | + NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_TC; + dev->hw_features |= NETIF_F_HW_TC; + + dev->min_mtu = 0; + dev->max_mtu = ETH_MAX_MTU; + + /* Each packet needs to have a Tx header (metadata) on top all other + * headers. 
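+ * Reserving MLXSW_TXHDR_LEN as needed_headroom lets the stack allocate that space up front when building skbs.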
+ */ + dev->needed_headroom = MLXSW_TXHDR_LEN; + + err = mlxsw_sp_port_system_port_mapping_set(mlxsw_sp_port); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to set system port mapping\n", + mlxsw_sp_port->local_port); + goto err_port_system_port_mapping_set; + } + + err = mlxsw_sp_port_speed_by_width_set(mlxsw_sp_port, width); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to enable speeds\n", + mlxsw_sp_port->local_port); + goto err_port_speed_by_width_set; + } + + err = mlxsw_sp_port_mtu_set(mlxsw_sp_port, ETH_DATA_LEN); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to set MTU\n", + mlxsw_sp_port->local_port); + goto err_port_mtu_set; + } + + err = mlxsw_sp_port_admin_status_set(mlxsw_sp_port, false); + if (err) + goto err_port_admin_status_set; + + err = mlxsw_sp_port_buffers_init(mlxsw_sp_port); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize buffers\n", + mlxsw_sp_port->local_port); + goto err_port_buffers_init; + } + + err = mlxsw_sp_port_ets_init(mlxsw_sp_port); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize ETS\n", + mlxsw_sp_port->local_port); + goto err_port_ets_init; + } + + /* ETS and buffers must be initialized before DCB. */ + err = mlxsw_sp_port_dcb_init(mlxsw_sp_port); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize DCB\n", + mlxsw_sp_port->local_port); + goto err_port_dcb_init; + } + + err = mlxsw_sp_port_fids_init(mlxsw_sp_port); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize FIDs\n", + mlxsw_sp_port->local_port); + goto err_port_fids_init; + } + + err = mlxsw_sp_tc_qdisc_init(mlxsw_sp_port); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize TC qdiscs\n", + mlxsw_sp_port->local_port); + goto err_port_qdiscs_init; + } + + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_get(mlxsw_sp_port, 1); + if (IS_ERR(mlxsw_sp_port_vlan)) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to create VID 1\n", + mlxsw_sp_port->local_port); + err = PTR_ERR(mlxsw_sp_port_vlan); + goto err_port_vlan_get; + } + + mlxsw_sp_port_switchdev_init(mlxsw_sp_port); + mlxsw_sp->ports[local_port] = mlxsw_sp_port; + err = register_netdev(dev); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to register netdev\n", + mlxsw_sp_port->local_port); + goto err_register_netdev; + } + + mlxsw_core_port_eth_set(mlxsw_sp->core, mlxsw_sp_port->local_port, + mlxsw_sp_port, dev, mlxsw_sp_port->split, + module); + mlxsw_core_schedule_dw(&mlxsw_sp_port->periodic_hw_stats.update_dw, 0); + return 0; + +err_register_netdev: + mlxsw_sp->ports[local_port] = NULL; + mlxsw_sp_port_switchdev_fini(mlxsw_sp_port); + mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan); +err_port_vlan_get: + mlxsw_sp_tc_qdisc_fini(mlxsw_sp_port); +err_port_qdiscs_init: + mlxsw_sp_port_fids_fini(mlxsw_sp_port); +err_port_fids_init: + mlxsw_sp_port_dcb_fini(mlxsw_sp_port); +err_port_dcb_init: +err_port_ets_init: +err_port_buffers_init: +err_port_admin_status_set: +err_port_mtu_set: +err_port_speed_by_width_set: +err_port_system_port_mapping_set: +err_dev_addr_init: + mlxsw_sp_port_swid_set(mlxsw_sp_port, MLXSW_PORT_SWID_DISABLED_PORT); +err_port_swid_set: + mlxsw_sp_port_module_unmap(mlxsw_sp_port); +err_port_module_map: + kfree(mlxsw_sp_port->sample); +err_alloc_sample: + free_percpu(mlxsw_sp_port->pcpu_stats); +err_alloc_stats: + free_netdev(dev); +err_alloc_etherdev: + mlxsw_core_port_fini(mlxsw_sp->core, local_port); + return err; +} + 
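The create/remove pair above and below follows the usual kernel error-unwind idiom: each setup step gets an error label, a failure jumps to the label that undoes everything already done (in reverse order), and the remove path repeats the same teardown unconditionally for a fully created port. A minimal standalone sketch of that idiom, with hypothetical step names that merely stand in for the SWID/buffers/netdev steps of mlxsw_sp_port_create(), not part of the driver itself:

    #include <stdio.h>

    /* Hypothetical resources standing in for SWID, buffers, netdev, ... */
    static int setup_swid(void)        { printf("swid set\n");      return 0; }
    static void teardown_swid(void)    { printf("swid cleared\n");  }
    static int setup_buffers(void)     { printf("buffers ready\n"); return 0; }
    static void teardown_buffers(void) { printf("buffers freed\n"); }
    static int setup_netdev(void)      { printf("netdev registration fails\n"); return -1; /* simulated failure */ }
    static void teardown_netdev(void)  { printf("netdev unregistered\n"); }

    static int port_create(void)
    {
            int err;

            err = setup_swid();
            if (err)
                    return err;
            err = setup_buffers();
            if (err)
                    goto err_buffers;
            err = setup_netdev();
            if (err)
                    goto err_netdev;
            return 0;

            /* Labels undo, in reverse order, everything that succeeded
             * before the failing step.
             */
    err_netdev:
            teardown_buffers();
    err_buffers:
            teardown_swid();
            return err;
    }

    static void port_remove(void)
    {
            /* Full teardown mirrors the unwind path, again in reverse order. */
            teardown_netdev();
            teardown_buffers();
            teardown_swid();
    }

    int main(void)
    {
            if (port_create())
                    printf("create failed, partial state unwound\n");
            else
                    port_remove();
            return 0;
    }

With setup_netdev() simulated to fail, only the buffer and SWID teardown run, mirroring the err_register_netdev ladder above; mlxsw_sp_port_remove() below is the full-teardown counterpart.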
+static void mlxsw_sp_port_remove(struct mlxsw_sp *mlxsw_sp, u8 local_port) +{ + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port]; + + cancel_delayed_work_sync(&mlxsw_sp_port->periodic_hw_stats.update_dw); + mlxsw_core_port_clear(mlxsw_sp->core, local_port, mlxsw_sp); + unregister_netdev(mlxsw_sp_port->dev); /* This calls ndo_stop */ + mlxsw_sp->ports[local_port] = NULL; + mlxsw_sp_port_switchdev_fini(mlxsw_sp_port); + mlxsw_sp_port_vlan_flush(mlxsw_sp_port); + mlxsw_sp_tc_qdisc_fini(mlxsw_sp_port); + mlxsw_sp_port_fids_fini(mlxsw_sp_port); + mlxsw_sp_port_dcb_fini(mlxsw_sp_port); + mlxsw_sp_port_swid_set(mlxsw_sp_port, MLXSW_PORT_SWID_DISABLED_PORT); + mlxsw_sp_port_module_unmap(mlxsw_sp_port); + kfree(mlxsw_sp_port->sample); + free_percpu(mlxsw_sp_port->pcpu_stats); + WARN_ON_ONCE(!list_empty(&mlxsw_sp_port->vlans_list)); + free_netdev(mlxsw_sp_port->dev); + mlxsw_core_port_fini(mlxsw_sp->core, local_port); +} + +static bool mlxsw_sp_port_created(struct mlxsw_sp *mlxsw_sp, u8 local_port) +{ + return mlxsw_sp->ports[local_port] != NULL; +} + +static void mlxsw_sp_ports_remove(struct mlxsw_sp *mlxsw_sp) +{ + int i; + + for (i = 1; i < mlxsw_core_max_ports(mlxsw_sp->core); i++) + if (mlxsw_sp_port_created(mlxsw_sp, i)) + mlxsw_sp_port_remove(mlxsw_sp, i); + kfree(mlxsw_sp->port_to_module); + kfree(mlxsw_sp->ports); +} + +static int mlxsw_sp_ports_create(struct mlxsw_sp *mlxsw_sp) +{ + unsigned int max_ports = mlxsw_core_max_ports(mlxsw_sp->core); + u8 module, width, lane; + size_t alloc_size; + int i; + int err; + + alloc_size = sizeof(struct mlxsw_sp_port *) * max_ports; + mlxsw_sp->ports = kzalloc(alloc_size, GFP_KERNEL); + if (!mlxsw_sp->ports) + return -ENOMEM; + + mlxsw_sp->port_to_module = kmalloc_array(max_ports, sizeof(int), + GFP_KERNEL); + if (!mlxsw_sp->port_to_module) { + err = -ENOMEM; + goto err_port_to_module_alloc; + } + + for (i = 1; i < max_ports; i++) { + /* Mark as invalid */ + mlxsw_sp->port_to_module[i] = -1; + + err = mlxsw_sp_port_module_info_get(mlxsw_sp, i, &module, + &width, &lane); + if (err) + goto err_port_module_info_get; + if (!width) + continue; + mlxsw_sp->port_to_module[i] = module; + err = mlxsw_sp_port_create(mlxsw_sp, i, false, + module, width, lane); + if (err) + goto err_port_create; + } + return 0; + +err_port_create: +err_port_module_info_get: + for (i--; i >= 1; i--) + if (mlxsw_sp_port_created(mlxsw_sp, i)) + mlxsw_sp_port_remove(mlxsw_sp, i); + kfree(mlxsw_sp->port_to_module); +err_port_to_module_alloc: + kfree(mlxsw_sp->ports); + return err; +} + +static u8 mlxsw_sp_cluster_base_port_get(u8 local_port) +{ + u8 offset = (local_port - 1) % MLXSW_SP_PORTS_PER_CLUSTER_MAX; + + return local_port - offset; +} + +static int mlxsw_sp_port_split_create(struct mlxsw_sp *mlxsw_sp, u8 base_port, + u8 module, unsigned int count) +{ + u8 width = MLXSW_PORT_MODULE_MAX_WIDTH / count; + int err, i; + + for (i = 0; i < count; i++) { + err = mlxsw_sp_port_create(mlxsw_sp, base_port + i, true, + module, width, i * width); + if (err) + goto err_port_create; + } + + return 0; + +err_port_create: + for (i--; i >= 0; i--) + if (mlxsw_sp_port_created(mlxsw_sp, base_port + i)) + mlxsw_sp_port_remove(mlxsw_sp, base_port + i); + return err; +} + +static void mlxsw_sp_port_unsplit_create(struct mlxsw_sp *mlxsw_sp, + u8 base_port, unsigned int count) +{ + u8 local_port, module, width = MLXSW_PORT_MODULE_MAX_WIDTH; + int i; + + /* Split by four means we need to re-create two ports, otherwise + * only one. 
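+ * The loop below therefore re-creates count / 2 unsplit ports, spaced two local ports apart.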
+ */ + count = count / 2; + + for (i = 0; i < count; i++) { + local_port = base_port + i * 2; + if (mlxsw_sp->port_to_module[local_port] < 0) + continue; + module = mlxsw_sp->port_to_module[local_port]; + + mlxsw_sp_port_create(mlxsw_sp, local_port, false, module, + width, 0); + } +} + +static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port, + unsigned int count) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + struct mlxsw_sp_port *mlxsw_sp_port; + u8 module, cur_width, base_port; + int i; + int err; + + mlxsw_sp_port = mlxsw_sp->ports[local_port]; + if (!mlxsw_sp_port) { + dev_err(mlxsw_sp->bus_info->dev, "Port number \"%d\" does not exist\n", + local_port); + return -EINVAL; + } + + module = mlxsw_sp_port->mapping.module; + cur_width = mlxsw_sp_port->mapping.width; + + if (count != 2 && count != 4) { + netdev_err(mlxsw_sp_port->dev, "Port can only be split into 2 or 4 ports\n"); + return -EINVAL; + } + + if (cur_width != MLXSW_PORT_MODULE_MAX_WIDTH) { + netdev_err(mlxsw_sp_port->dev, "Port cannot be split further\n"); + return -EINVAL; + } + + /* Make sure we have enough slave (even) ports for the split. */ + if (count == 2) { + base_port = local_port; + if (mlxsw_sp->ports[base_port + 1]) { + netdev_err(mlxsw_sp_port->dev, "Invalid split configuration\n"); + return -EINVAL; + } + } else { + base_port = mlxsw_sp_cluster_base_port_get(local_port); + if (mlxsw_sp->ports[base_port + 1] || + mlxsw_sp->ports[base_port + 3]) { + netdev_err(mlxsw_sp_port->dev, "Invalid split configuration\n"); + return -EINVAL; + } + } + + for (i = 0; i < count; i++) + if (mlxsw_sp_port_created(mlxsw_sp, base_port + i)) + mlxsw_sp_port_remove(mlxsw_sp, base_port + i); + + err = mlxsw_sp_port_split_create(mlxsw_sp, base_port, module, count); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to create split ports\n"); + goto err_port_split_create; + } + + return 0; + +err_port_split_create: + mlxsw_sp_port_unsplit_create(mlxsw_sp, base_port, count); + return err; +} + +static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + struct mlxsw_sp_port *mlxsw_sp_port; + u8 cur_width, base_port; + unsigned int count; + int i; + + mlxsw_sp_port = mlxsw_sp->ports[local_port]; + if (!mlxsw_sp_port) { + dev_err(mlxsw_sp->bus_info->dev, "Port number \"%d\" does not exist\n", + local_port); + return -EINVAL; + } + + if (!mlxsw_sp_port->split) { + netdev_err(mlxsw_sp_port->dev, "Port wasn't split\n"); + return -EINVAL; + } + + cur_width = mlxsw_sp_port->mapping.width; + count = cur_width == 1 ? 4 : 2; + + base_port = mlxsw_sp_cluster_base_port_get(local_port); + + /* Determine which ports to remove. 
*/ + if (count == 2 && local_port >= base_port + 2) + base_port = base_port + 2; + + for (i = 0; i < count; i++) + if (mlxsw_sp_port_created(mlxsw_sp, base_port + i)) + mlxsw_sp_port_remove(mlxsw_sp, base_port + i); + + mlxsw_sp_port_unsplit_create(mlxsw_sp, base_port, count); + + return 0; +} + +static void mlxsw_sp_pude_event_func(const struct mlxsw_reg_info *reg, + char *pude_pl, void *priv) +{ + struct mlxsw_sp *mlxsw_sp = priv; + struct mlxsw_sp_port *mlxsw_sp_port; + enum mlxsw_reg_pude_oper_status status; + u8 local_port; + + local_port = mlxsw_reg_pude_local_port_get(pude_pl); + mlxsw_sp_port = mlxsw_sp->ports[local_port]; + if (!mlxsw_sp_port) + return; + + status = mlxsw_reg_pude_oper_status_get(pude_pl); + if (status == MLXSW_PORT_OPER_STATUS_UP) { + netdev_info(mlxsw_sp_port->dev, "link up\n"); + netif_carrier_on(mlxsw_sp_port->dev); + } else { + netdev_info(mlxsw_sp_port->dev, "link down\n"); + netif_carrier_off(mlxsw_sp_port->dev); + } +} + +static void mlxsw_sp_rx_listener_no_mark_func(struct sk_buff *skb, + u8 local_port, void *priv) +{ + struct mlxsw_sp *mlxsw_sp = priv; + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port]; + struct mlxsw_sp_port_pcpu_stats *pcpu_stats; + + if (unlikely(!mlxsw_sp_port)) { + dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: skb received for non-existent port\n", + local_port); + return; + } + + skb->dev = mlxsw_sp_port->dev; + + pcpu_stats = this_cpu_ptr(mlxsw_sp_port->pcpu_stats); + u64_stats_update_begin(&pcpu_stats->syncp); + pcpu_stats->rx_packets++; + pcpu_stats->rx_bytes += skb->len; + u64_stats_update_end(&pcpu_stats->syncp); + + skb->protocol = eth_type_trans(skb, skb->dev); + netif_receive_skb(skb); +} + +static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port, + void *priv) +{ + skb->offload_fwd_mark = 1; + return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv); +} + +static void mlxsw_sp_rx_listener_mr_mark_func(struct sk_buff *skb, + u8 local_port, void *priv) +{ + skb->offload_mr_fwd_mark = 1; + skb->offload_fwd_mark = 1; + return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv); +} + +static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port, + void *priv) +{ + struct mlxsw_sp *mlxsw_sp = priv; + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port]; + struct psample_group *psample_group; + u32 size; + + if (unlikely(!mlxsw_sp_port)) { + dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received for non-existent port\n", + local_port); + goto out; + } + if (unlikely(!mlxsw_sp_port->sample)) { + dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received on unsupported port\n", + local_port); + goto out; + } + + size = mlxsw_sp_port->sample->truncate ? 
+ mlxsw_sp_port->sample->trunc_size : skb->len; + + rcu_read_lock(); + psample_group = rcu_dereference(mlxsw_sp_port->sample->psample_group); + if (!psample_group) + goto out_unlock; + psample_sample_packet(psample_group, skb, size, + mlxsw_sp_port->dev->ifindex, 0, + mlxsw_sp_port->sample->rate); +out_unlock: + rcu_read_unlock(); +out: + consume_skb(skb); +} + +#define MLXSW_SP_RXL_NO_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ + MLXSW_RXL(mlxsw_sp_rx_listener_no_mark_func, _trap_id, _action, \ + _is_ctrl, SP_##_trap_group, DISCARD) + +#define MLXSW_SP_RXL_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ + MLXSW_RXL(mlxsw_sp_rx_listener_mark_func, _trap_id, _action, \ + _is_ctrl, SP_##_trap_group, DISCARD) + +#define MLXSW_SP_RXL_MR_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ + MLXSW_RXL(mlxsw_sp_rx_listener_mr_mark_func, _trap_id, _action, \ + _is_ctrl, SP_##_trap_group, DISCARD) + +#define MLXSW_SP_EVENTL(_func, _trap_id) \ + MLXSW_EVENTL(_func, _trap_id, SP_EVENT) + +static const struct mlxsw_listener mlxsw_sp_listener[] = { + /* Events */ + MLXSW_SP_EVENTL(mlxsw_sp_pude_event_func, PUDE), + /* L2 traps */ + MLXSW_SP_RXL_NO_MARK(STP, TRAP_TO_CPU, STP, true), + MLXSW_SP_RXL_NO_MARK(LACP, TRAP_TO_CPU, LACP, true), + MLXSW_SP_RXL_NO_MARK(LLDP, TRAP_TO_CPU, LLDP, true), + MLXSW_SP_RXL_MARK(DHCP, MIRROR_TO_CPU, DHCP, false), + MLXSW_SP_RXL_MARK(IGMP_QUERY, MIRROR_TO_CPU, IGMP, false), + MLXSW_SP_RXL_NO_MARK(IGMP_V1_REPORT, TRAP_TO_CPU, IGMP, false), + MLXSW_SP_RXL_NO_MARK(IGMP_V2_REPORT, TRAP_TO_CPU, IGMP, false), + MLXSW_SP_RXL_NO_MARK(IGMP_V2_LEAVE, TRAP_TO_CPU, IGMP, false), + MLXSW_SP_RXL_NO_MARK(IGMP_V3_REPORT, TRAP_TO_CPU, IGMP, false), + MLXSW_SP_RXL_MARK(ARPBC, MIRROR_TO_CPU, ARP, false), + MLXSW_SP_RXL_MARK(ARPUC, MIRROR_TO_CPU, ARP, false), + MLXSW_SP_RXL_NO_MARK(FID_MISS, TRAP_TO_CPU, IP2ME, false), + MLXSW_SP_RXL_MARK(IPV6_MLDV12_LISTENER_QUERY, MIRROR_TO_CPU, IPV6_MLD, + false), + MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_REPORT, TRAP_TO_CPU, IPV6_MLD, + false), + MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_DONE, TRAP_TO_CPU, IPV6_MLD, + false), + MLXSW_SP_RXL_NO_MARK(IPV6_MLDV2_LISTENER_REPORT, TRAP_TO_CPU, IPV6_MLD, + false), + /* L3 traps */ + MLXSW_SP_RXL_MARK(MTUERROR, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(TTLERROR, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(LBERROR, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false), + MLXSW_SP_RXL_MARK(IPV6_UNSPECIFIED_ADDRESS, TRAP_TO_CPU, ROUTER_EXP, + false), + MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_DEST, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_SRC, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(IPV6_ALL_NODES_LINK, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(IPV6_ALL_ROUTERS_LINK, TRAP_TO_CPU, ROUTER_EXP, + false), + MLXSW_SP_RXL_MARK(IPV4_OSPF, TRAP_TO_CPU, OSPF, false), + MLXSW_SP_RXL_MARK(IPV6_OSPF, TRAP_TO_CPU, OSPF, false), + MLXSW_SP_RXL_MARK(IPV6_DHCP, TRAP_TO_CPU, DHCP, false), + MLXSW_SP_RXL_MARK(RTR_INGRESS0, TRAP_TO_CPU, REMOTE_ROUTE, false), + MLXSW_SP_RXL_MARK(IPV4_BGP, TRAP_TO_CPU, BGP, false), + MLXSW_SP_RXL_MARK(IPV6_BGP, TRAP_TO_CPU, BGP, false), + MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_SOLICITATION, TRAP_TO_CPU, IPV6_ND, + false), + MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_ADVERTISMENT, TRAP_TO_CPU, IPV6_ND, + false), + MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_SOLICITATION, TRAP_TO_CPU, IPV6_ND, + false), + MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_ADVERTISMENT, TRAP_TO_CPU, IPV6_ND, + false), + MLXSW_SP_RXL_MARK(L3_IPV6_REDIRECTION, 
TRAP_TO_CPU, IPV6_ND, false), + MLXSW_SP_RXL_MARK(IPV6_MC_LINK_LOCAL_DEST, TRAP_TO_CPU, ROUTER_EXP, + false), + MLXSW_SP_RXL_MARK(HOST_MISS_IPV4, TRAP_TO_CPU, HOST_MISS, false), + MLXSW_SP_RXL_MARK(HOST_MISS_IPV6, TRAP_TO_CPU, HOST_MISS, false), + MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV4, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV6, TRAP_TO_CPU, ROUTER_EXP, false), + MLXSW_SP_RXL_MARK(IPIP_DECAP_ERROR, TRAP_TO_CPU, ROUTER_EXP, false), + /* PKT Sample trap */ + MLXSW_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE, MIRROR_TO_CPU, + false, SP_IP2ME, DISCARD), + /* ACL trap */ + MLXSW_SP_RXL_NO_MARK(ACL0, TRAP_TO_CPU, IP2ME, false), + /* Multicast Router Traps */ + MLXSW_SP_RXL_MARK(IPV4_PIM, TRAP_TO_CPU, PIM, false), + MLXSW_SP_RXL_MARK(IPV6_PIM, TRAP_TO_CPU, PIM, false), + MLXSW_SP_RXL_MARK(RPF, TRAP_TO_CPU, RPF, false), + MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), + MLXSW_SP_RXL_MR_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), +}; + +static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) +{ + char qpcr_pl[MLXSW_REG_QPCR_LEN]; + enum mlxsw_reg_qpcr_ir_units ir_units; + int max_cpu_policers; + bool is_bytes; + u8 burst_size; + u32 rate; + int i, err; + + if (!MLXSW_CORE_RES_VALID(mlxsw_core, MAX_CPU_POLICERS)) + return -EIO; + + max_cpu_policers = MLXSW_CORE_RES_GET(mlxsw_core, MAX_CPU_POLICERS); + + ir_units = MLXSW_REG_QPCR_IR_UNITS_M; + for (i = 0; i < max_cpu_policers; i++) { + is_bytes = false; + switch (i) { + case MLXSW_REG_HTGT_TRAP_GROUP_SP_STP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF: + rate = 128; + burst_size = 7; + break; + case MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD: + rate = 16 * 1024; + burst_size = 10; + break; + case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST: + rate = 1024; + burst_size = 7; + break; + case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME: + is_bytes = true; + rate = 4 * 1024; + burst_size = 4; + break; + default: + continue; + } + + mlxsw_reg_qpcr_pack(qpcr_pl, i, ir_units, is_bytes, rate, + burst_size); + err = mlxsw_reg_write(mlxsw_core, MLXSW_REG(qpcr), qpcr_pl); + if (err) + return err; + } + + return 0; +} + +static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) +{ + char htgt_pl[MLXSW_REG_HTGT_LEN]; + enum mlxsw_reg_htgt_trap_group i; + int max_cpu_policers; + int max_trap_groups; + u8 priority, tc; + u16 policer_id; + int err; + + if (!MLXSW_CORE_RES_VALID(mlxsw_core, MAX_TRAP_GROUPS)) + return -EIO; + + max_trap_groups = MLXSW_CORE_RES_GET(mlxsw_core, MAX_TRAP_GROUPS); + max_cpu_policers = MLXSW_CORE_RES_GET(mlxsw_core, MAX_CPU_POLICERS); + + for (i = 0; i < max_trap_groups; i++) { + policer_id = i; + switch (i) { + case MLXSW_REG_HTGT_TRAP_GROUP_SP_STP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: + priority = 5; + tc = 5; + break; + case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP: + priority = 4; + tc = 4; + break; + case 
MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD: + priority = 3; + tc = 3; + break; + case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF: + priority = 2; + tc = 2; + break; + case MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST: + priority = 1; + tc = 1; + break; + case MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT: + priority = MLXSW_REG_HTGT_DEFAULT_PRIORITY; + tc = MLXSW_REG_HTGT_DEFAULT_TC; + policer_id = MLXSW_REG_HTGT_INVALID_POLICER; + break; + default: + continue; + } + + if (max_cpu_policers <= policer_id && + policer_id != MLXSW_REG_HTGT_INVALID_POLICER) + return -EIO; + + mlxsw_reg_htgt_pack(htgt_pl, i, policer_id, priority, tc); + err = mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl); + if (err) + return err; + } + + return 0; +} + +static int mlxsw_sp_traps_init(struct mlxsw_sp *mlxsw_sp) +{ + int i; + int err; + + err = mlxsw_sp_cpu_policers_set(mlxsw_sp->core); + if (err) + return err; + + err = mlxsw_sp_trap_groups_set(mlxsw_sp->core); + if (err) + return err; + + for (i = 0; i < ARRAY_SIZE(mlxsw_sp_listener); i++) { + err = mlxsw_core_trap_register(mlxsw_sp->core, + &mlxsw_sp_listener[i], + mlxsw_sp); + if (err) + goto err_listener_register; + + } + return 0; + +err_listener_register: + for (i--; i >= 0; i--) { + mlxsw_core_trap_unregister(mlxsw_sp->core, + &mlxsw_sp_listener[i], + mlxsw_sp); + } + return err; +} + +static void mlxsw_sp_traps_fini(struct mlxsw_sp *mlxsw_sp) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mlxsw_sp_listener); i++) { + mlxsw_core_trap_unregister(mlxsw_sp->core, + &mlxsw_sp_listener[i], + mlxsw_sp); + } +} + +static int mlxsw_sp_lag_init(struct mlxsw_sp *mlxsw_sp) +{ + char slcr_pl[MLXSW_REG_SLCR_LEN]; + int err; + + mlxsw_reg_slcr_pack(slcr_pl, MLXSW_REG_SLCR_LAG_HASH_SMAC | + MLXSW_REG_SLCR_LAG_HASH_DMAC | + MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE | + MLXSW_REG_SLCR_LAG_HASH_VLANID | + MLXSW_REG_SLCR_LAG_HASH_SIP | + MLXSW_REG_SLCR_LAG_HASH_DIP | + MLXSW_REG_SLCR_LAG_HASH_SPORT | + MLXSW_REG_SLCR_LAG_HASH_DPORT | + MLXSW_REG_SLCR_LAG_HASH_IPPROTO); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcr), slcr_pl); + if (err) + return err; + + if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_LAG) || + !MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_LAG_MEMBERS)) + return -EIO; + + mlxsw_sp->lags = kcalloc(MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_LAG), + sizeof(struct mlxsw_sp_upper), + GFP_KERNEL); + if (!mlxsw_sp->lags) + return -ENOMEM; + + return 0; +} + +static void mlxsw_sp_lag_fini(struct mlxsw_sp *mlxsw_sp) +{ + kfree(mlxsw_sp->lags); +} + +static int mlxsw_sp_basic_trap_groups_set(struct mlxsw_core *mlxsw_core) +{ + char htgt_pl[MLXSW_REG_HTGT_LEN]; + + mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_EMAD, + MLXSW_REG_HTGT_INVALID_POLICER, + MLXSW_REG_HTGT_DEFAULT_PRIORITY, + MLXSW_REG_HTGT_DEFAULT_TC); + return mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl); +} + +static int mlxsw_sp_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr); + +static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core, + const struct mlxsw_bus_info *mlxsw_bus_info) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + int err; + + mlxsw_sp->core = mlxsw_core; + mlxsw_sp->bus_info = mlxsw_bus_info; + + err = mlxsw_sp_fw_rev_validate(mlxsw_sp); + if 
(err) { + dev_err(mlxsw_sp->bus_info->dev, "Could not upgrade firmware\n"); + return err; + } + + err = mlxsw_sp_base_mac_get(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to get base mac\n"); + return err; + } + + err = mlxsw_sp_kvdl_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize KVDL\n"); + return err; + } + + err = mlxsw_sp_fids_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize FIDs\n"); + goto err_fids_init; + } + + err = mlxsw_sp_traps_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to set traps\n"); + goto err_traps_init; + } + + err = mlxsw_sp_buffers_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize buffers\n"); + goto err_buffers_init; + } + + err = mlxsw_sp_lag_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize LAG\n"); + goto err_lag_init; + } + + err = mlxsw_sp_switchdev_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize switchdev\n"); + goto err_switchdev_init; + } + + err = mlxsw_sp_counter_pool_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to init counter pool\n"); + goto err_counter_pool_init; + } + + err = mlxsw_sp_afa_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize ACL actions\n"); + goto err_afa_init; + } + + err = mlxsw_sp_span_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to init span system\n"); + goto err_span_init; + } + + /* Initialize router after SPAN is initialized, so that the FIB and + * neighbor event handlers can issue SPAN respin. + */ + err = mlxsw_sp_router_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize router\n"); + goto err_router_init; + } + + /* Initialize netdevice notifier after router and SPAN is initialized, + * so that the event handler can use router structures and call SPAN + * respin. 
+ */ + mlxsw_sp->netdevice_nb.notifier_call = mlxsw_sp_netdevice_event; + err = register_netdevice_notifier(&mlxsw_sp->netdevice_nb); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to register netdev notifier\n"); + goto err_netdev_notifier; + } + + err = mlxsw_sp_acl_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize ACL\n"); + goto err_acl_init; + } + + err = mlxsw_sp_dpipe_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to init pipeline debug\n"); + goto err_dpipe_init; + } + + err = mlxsw_sp_ports_create(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to create ports\n"); + goto err_ports_create; + } + + return 0; + +err_ports_create: + mlxsw_sp_dpipe_fini(mlxsw_sp); +err_dpipe_init: + mlxsw_sp_acl_fini(mlxsw_sp); +err_acl_init: + unregister_netdevice_notifier(&mlxsw_sp->netdevice_nb); +err_netdev_notifier: + mlxsw_sp_router_fini(mlxsw_sp); +err_router_init: + mlxsw_sp_span_fini(mlxsw_sp); +err_span_init: + mlxsw_sp_afa_fini(mlxsw_sp); +err_afa_init: + mlxsw_sp_counter_pool_fini(mlxsw_sp); +err_counter_pool_init: + mlxsw_sp_switchdev_fini(mlxsw_sp); +err_switchdev_init: + mlxsw_sp_lag_fini(mlxsw_sp); +err_lag_init: + mlxsw_sp_buffers_fini(mlxsw_sp); +err_buffers_init: + mlxsw_sp_traps_fini(mlxsw_sp); +err_traps_init: + mlxsw_sp_fids_fini(mlxsw_sp); +err_fids_init: + mlxsw_sp_kvdl_fini(mlxsw_sp); + return err; +} + +static void mlxsw_sp_fini(struct mlxsw_core *mlxsw_core) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + + mlxsw_sp_ports_remove(mlxsw_sp); + mlxsw_sp_dpipe_fini(mlxsw_sp); + mlxsw_sp_acl_fini(mlxsw_sp); + unregister_netdevice_notifier(&mlxsw_sp->netdevice_nb); + mlxsw_sp_router_fini(mlxsw_sp); + mlxsw_sp_span_fini(mlxsw_sp); + mlxsw_sp_afa_fini(mlxsw_sp); + mlxsw_sp_counter_pool_fini(mlxsw_sp); + mlxsw_sp_switchdev_fini(mlxsw_sp); + mlxsw_sp_lag_fini(mlxsw_sp); + mlxsw_sp_buffers_fini(mlxsw_sp); + mlxsw_sp_traps_fini(mlxsw_sp); + mlxsw_sp_fids_fini(mlxsw_sp); + mlxsw_sp_kvdl_fini(mlxsw_sp); +} + +static const struct mlxsw_config_profile mlxsw_sp_config_profile = { + .used_max_mid = 1, + .max_mid = MLXSW_SP_MID_MAX, + .used_flood_tables = 1, + .used_flood_mode = 1, + .flood_mode = 3, + .max_fid_offset_flood_tables = 3, + .fid_offset_flood_table_size = VLAN_N_VID - 1, + .max_fid_flood_tables = 3, + .fid_flood_table_size = MLXSW_SP_FID_8021D_MAX, + .used_max_ib_mc = 1, + .max_ib_mc = 0, + .used_max_pkey = 1, + .max_pkey = 0, + .used_kvd_sizes = 1, + .kvd_hash_single_parts = 59, + .kvd_hash_double_parts = 41, + .kvd_linear_size = MLXSW_SP_KVD_LINEAR_SIZE, + .swid_config = { + { + .used_type = 1, + .type = MLXSW_PORT_SWID_TYPE_ETH, + } + }, +}; + +static void +mlxsw_sp_resource_size_params_prepare(struct mlxsw_core *mlxsw_core, + struct devlink_resource_size_params *kvd_size_params, + struct devlink_resource_size_params *linear_size_params, + struct devlink_resource_size_params *hash_double_size_params, + struct devlink_resource_size_params *hash_single_size_params) +{ + u32 single_size_min = MLXSW_CORE_RES_GET(mlxsw_core, + KVD_SINGLE_MIN_SIZE); + u32 double_size_min = MLXSW_CORE_RES_GET(mlxsw_core, + KVD_DOUBLE_MIN_SIZE); + u32 kvd_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE); + u32 linear_size_min = 0; + + devlink_resource_size_params_init(kvd_size_params, kvd_size, kvd_size, + MLXSW_SP_KVD_GRANULARITY, + DEVLINK_RESOURCE_UNIT_ENTRY); + devlink_resource_size_params_init(linear_size_params, linear_size_min, + kvd_size - single_size_min - + double_size_min, + 
MLXSW_SP_KVD_GRANULARITY, + DEVLINK_RESOURCE_UNIT_ENTRY); + devlink_resource_size_params_init(hash_double_size_params, + double_size_min, + kvd_size - single_size_min - + linear_size_min, + MLXSW_SP_KVD_GRANULARITY, + DEVLINK_RESOURCE_UNIT_ENTRY); + devlink_resource_size_params_init(hash_single_size_params, + single_size_min, + kvd_size - double_size_min - + linear_size_min, + MLXSW_SP_KVD_GRANULARITY, + DEVLINK_RESOURCE_UNIT_ENTRY); +} + +static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_core); + struct devlink_resource_size_params hash_single_size_params; + struct devlink_resource_size_params hash_double_size_params; + struct devlink_resource_size_params linear_size_params; + struct devlink_resource_size_params kvd_size_params; + u32 kvd_size, single_size, double_size, linear_size; + const struct mlxsw_config_profile *profile; + int err; + + profile = &mlxsw_sp_config_profile; + if (!MLXSW_CORE_RES_VALID(mlxsw_core, KVD_SIZE)) + return -EIO; + + mlxsw_sp_resource_size_params_prepare(mlxsw_core, &kvd_size_params, + &linear_size_params, + &hash_double_size_params, + &hash_single_size_params); + + kvd_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE); + err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD, + kvd_size, MLXSW_SP_RESOURCE_KVD, + DEVLINK_RESOURCE_ID_PARENT_TOP, + &kvd_size_params); + if (err) + return err; + + linear_size = profile->kvd_linear_size; + err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR, + linear_size, + MLXSW_SP_RESOURCE_KVD_LINEAR, + MLXSW_SP_RESOURCE_KVD, + &linear_size_params); + if (err) + return err; + + err = mlxsw_sp_kvdl_resources_register(mlxsw_core); + if (err) + return err; + + double_size = kvd_size - linear_size; + double_size *= profile->kvd_hash_double_parts; + double_size /= profile->kvd_hash_double_parts + + profile->kvd_hash_single_parts; + double_size = rounddown(double_size, MLXSW_SP_KVD_GRANULARITY); + err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_HASH_DOUBLE, + double_size, + MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE, + MLXSW_SP_RESOURCE_KVD, + &hash_double_size_params); + if (err) + return err; + + single_size = kvd_size - double_size - linear_size; + err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_HASH_SINGLE, + single_size, + MLXSW_SP_RESOURCE_KVD_HASH_SINGLE, + MLXSW_SP_RESOURCE_KVD, + &hash_single_size_params); + if (err) + return err; + + return 0; +} + +static int mlxsw_sp_kvd_sizes_get(struct mlxsw_core *mlxsw_core, + const struct mlxsw_config_profile *profile, + u64 *p_single_size, u64 *p_double_size, + u64 *p_linear_size) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_core); + u32 double_size; + int err; + + if (!MLXSW_CORE_RES_VALID(mlxsw_core, KVD_SINGLE_MIN_SIZE) || + !MLXSW_CORE_RES_VALID(mlxsw_core, KVD_DOUBLE_MIN_SIZE)) + return -EIO; + + /* The hash part is what left of the kvd without the + * linear part. It is split to the single size and + * double size by the parts ratio from the profile. + * Both sizes must be a multiplications of the + * granularity from the profile. In case the user + * provided the sizes they are obtained via devlink. 
+ */ + err = devlink_resource_size_get(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR, + p_linear_size); + if (err) + *p_linear_size = profile->kvd_linear_size; + + err = devlink_resource_size_get(devlink, + MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE, + p_double_size); + if (err) { + double_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) - + *p_linear_size; + double_size *= profile->kvd_hash_double_parts; + double_size /= profile->kvd_hash_double_parts + + profile->kvd_hash_single_parts; + *p_double_size = rounddown(double_size, + MLXSW_SP_KVD_GRANULARITY); + } + + err = devlink_resource_size_get(devlink, + MLXSW_SP_RESOURCE_KVD_HASH_SINGLE, + p_single_size); + if (err) + *p_single_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) - + *p_double_size - *p_linear_size; + + /* Check results are legal. */ + if (*p_single_size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE) || + *p_double_size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE) || + MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) < *p_linear_size) + return -EIO; + + return 0; +} + +static struct mlxsw_driver mlxsw_sp_driver = { + .kind = mlxsw_sp_driver_name, + .priv_size = sizeof(struct mlxsw_sp), + .init = mlxsw_sp_init, + .fini = mlxsw_sp_fini, + .basic_trap_groups_set = mlxsw_sp_basic_trap_groups_set, + .port_split = mlxsw_sp_port_split, + .port_unsplit = mlxsw_sp_port_unsplit, + .sb_pool_get = mlxsw_sp_sb_pool_get, + .sb_pool_set = mlxsw_sp_sb_pool_set, + .sb_port_pool_get = mlxsw_sp_sb_port_pool_get, + .sb_port_pool_set = mlxsw_sp_sb_port_pool_set, + .sb_tc_pool_bind_get = mlxsw_sp_sb_tc_pool_bind_get, + .sb_tc_pool_bind_set = mlxsw_sp_sb_tc_pool_bind_set, + .sb_occ_snapshot = mlxsw_sp_sb_occ_snapshot, + .sb_occ_max_clear = mlxsw_sp_sb_occ_max_clear, + .sb_occ_port_pool_get = mlxsw_sp_sb_occ_port_pool_get, + .sb_occ_tc_port_bind_get = mlxsw_sp_sb_occ_tc_port_bind_get, + .txhdr_construct = mlxsw_sp_txhdr_construct, + .resources_register = mlxsw_sp_resources_register, + .kvd_sizes_get = mlxsw_sp_kvd_sizes_get, + .txhdr_len = MLXSW_TXHDR_LEN, + .profile = &mlxsw_sp_config_profile, + .res_query_enabled = true, +}; + +bool mlxsw_sp_port_dev_check(const struct net_device *dev) +{ + return dev->netdev_ops == &mlxsw_sp_port_netdev_ops; +} + +static int mlxsw_sp_lower_dev_walk(struct net_device *lower_dev, void *data) +{ + struct mlxsw_sp_port **p_mlxsw_sp_port = data; + int ret = 0; + + if (mlxsw_sp_port_dev_check(lower_dev)) { + *p_mlxsw_sp_port = netdev_priv(lower_dev); + ret = 1; + } + + return ret; +} + +struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find(struct net_device *dev) +{ + struct mlxsw_sp_port *mlxsw_sp_port; + + if (mlxsw_sp_port_dev_check(dev)) + return netdev_priv(dev); + + mlxsw_sp_port = NULL; + netdev_walk_all_lower_dev(dev, mlxsw_sp_lower_dev_walk, &mlxsw_sp_port); + + return mlxsw_sp_port; +} + +struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev) +{ + struct mlxsw_sp_port *mlxsw_sp_port; + + mlxsw_sp_port = mlxsw_sp_port_dev_lower_find(dev); + return mlxsw_sp_port ? 
mlxsw_sp_port->mlxsw_sp : NULL; +} + +struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find_rcu(struct net_device *dev) +{ + struct mlxsw_sp_port *mlxsw_sp_port; + + if (mlxsw_sp_port_dev_check(dev)) + return netdev_priv(dev); + + mlxsw_sp_port = NULL; + netdev_walk_all_lower_dev_rcu(dev, mlxsw_sp_lower_dev_walk, + &mlxsw_sp_port); + + return mlxsw_sp_port; +} + +struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev) +{ + struct mlxsw_sp_port *mlxsw_sp_port; + + rcu_read_lock(); + mlxsw_sp_port = mlxsw_sp_port_dev_lower_find_rcu(dev); + if (mlxsw_sp_port) + dev_hold(mlxsw_sp_port->dev); + rcu_read_unlock(); + return mlxsw_sp_port; +} + +void mlxsw_sp_port_dev_put(struct mlxsw_sp_port *mlxsw_sp_port) +{ + dev_put(mlxsw_sp_port->dev); +} + +static int mlxsw_sp_lag_create(struct mlxsw_sp *mlxsw_sp, u16 lag_id) +{ + char sldr_pl[MLXSW_REG_SLDR_LEN]; + + mlxsw_reg_sldr_lag_create_pack(sldr_pl, lag_id); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sldr), sldr_pl); +} + +static int mlxsw_sp_lag_destroy(struct mlxsw_sp *mlxsw_sp, u16 lag_id) +{ + char sldr_pl[MLXSW_REG_SLDR_LEN]; + + mlxsw_reg_sldr_lag_destroy_pack(sldr_pl, lag_id); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sldr), sldr_pl); +} + +static int mlxsw_sp_lag_col_port_add(struct mlxsw_sp_port *mlxsw_sp_port, + u16 lag_id, u8 port_index) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char slcor_pl[MLXSW_REG_SLCOR_LEN]; + + mlxsw_reg_slcor_port_add_pack(slcor_pl, mlxsw_sp_port->local_port, + lag_id, port_index); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcor), slcor_pl); +} + +static int mlxsw_sp_lag_col_port_remove(struct mlxsw_sp_port *mlxsw_sp_port, + u16 lag_id) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char slcor_pl[MLXSW_REG_SLCOR_LEN]; + + mlxsw_reg_slcor_port_remove_pack(slcor_pl, mlxsw_sp_port->local_port, + lag_id); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcor), slcor_pl); +} + +static int mlxsw_sp_lag_col_port_enable(struct mlxsw_sp_port *mlxsw_sp_port, + u16 lag_id) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char slcor_pl[MLXSW_REG_SLCOR_LEN]; + + mlxsw_reg_slcor_col_enable_pack(slcor_pl, mlxsw_sp_port->local_port, + lag_id); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcor), slcor_pl); +} + +static int mlxsw_sp_lag_col_port_disable(struct mlxsw_sp_port *mlxsw_sp_port, + u16 lag_id) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char slcor_pl[MLXSW_REG_SLCOR_LEN]; + + mlxsw_reg_slcor_col_disable_pack(slcor_pl, mlxsw_sp_port->local_port, + lag_id); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcor), slcor_pl); +} + +static int mlxsw_sp_lag_index_get(struct mlxsw_sp *mlxsw_sp, + struct net_device *lag_dev, + u16 *p_lag_id) +{ + struct mlxsw_sp_upper *lag; + int free_lag_id = -1; + u64 max_lag; + int i; + + max_lag = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_LAG); + for (i = 0; i < max_lag; i++) { + lag = mlxsw_sp_lag_get(mlxsw_sp, i); + if (lag->ref_count) { + if (lag->dev == lag_dev) { + *p_lag_id = i; + return 0; + } + } else if (free_lag_id < 0) { + free_lag_id = i; + } + } + if (free_lag_id < 0) + return -EBUSY; + *p_lag_id = free_lag_id; + return 0; +} + +static bool +mlxsw_sp_master_lag_check(struct mlxsw_sp *mlxsw_sp, + struct net_device *lag_dev, + struct netdev_lag_upper_info *lag_upper_info, + struct netlink_ext_ack *extack) +{ + u16 lag_id; + + if (mlxsw_sp_lag_index_get(mlxsw_sp, lag_dev, &lag_id) != 0) { + NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported LAG devices"); + return 
false; + } + if (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) { + NL_SET_ERR_MSG_MOD(extack, "LAG device using unsupported Tx type"); + return false; + } + return true; +} + +static int mlxsw_sp_port_lag_index_get(struct mlxsw_sp *mlxsw_sp, + u16 lag_id, u8 *p_port_index) +{ + u64 max_lag_members; + int i; + + max_lag_members = MLXSW_CORE_RES_GET(mlxsw_sp->core, + MAX_LAG_MEMBERS); + for (i = 0; i < max_lag_members; i++) { + if (!mlxsw_sp_port_lagged_get(mlxsw_sp, lag_id, i)) { + *p_port_index = i; + return 0; + } + } + return -EBUSY; +} + +static int mlxsw_sp_port_lag_join(struct mlxsw_sp_port *mlxsw_sp_port, + struct net_device *lag_dev) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + struct mlxsw_sp_upper *lag; + u16 lag_id; + u8 port_index; + int err; + + err = mlxsw_sp_lag_index_get(mlxsw_sp, lag_dev, &lag_id); + if (err) + return err; + lag = mlxsw_sp_lag_get(mlxsw_sp, lag_id); + if (!lag->ref_count) { + err = mlxsw_sp_lag_create(mlxsw_sp, lag_id); + if (err) + return err; + lag->dev = lag_dev; + } + + err = mlxsw_sp_port_lag_index_get(mlxsw_sp, lag_id, &port_index); + if (err) + return err; + err = mlxsw_sp_lag_col_port_add(mlxsw_sp_port, lag_id, port_index); + if (err) + goto err_col_port_add; + err = mlxsw_sp_lag_col_port_enable(mlxsw_sp_port, lag_id); + if (err) + goto err_col_port_enable; + + mlxsw_core_lag_mapping_set(mlxsw_sp->core, lag_id, port_index, + mlxsw_sp_port->local_port); + mlxsw_sp_port->lag_id = lag_id; + mlxsw_sp_port->lagged = 1; + lag->ref_count++; + + /* Port is no longer usable as a router interface */ + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, 1); + if (mlxsw_sp_port_vlan->fid) + mlxsw_sp_port_vlan_router_leave(mlxsw_sp_port_vlan); + + return 0; + +err_col_port_enable: + mlxsw_sp_lag_col_port_remove(mlxsw_sp_port, lag_id); +err_col_port_add: + if (!lag->ref_count) + mlxsw_sp_lag_destroy(mlxsw_sp, lag_id); + return err; +} + +static void mlxsw_sp_port_lag_leave(struct mlxsw_sp_port *mlxsw_sp_port, + struct net_device *lag_dev) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u16 lag_id = mlxsw_sp_port->lag_id; + struct mlxsw_sp_upper *lag; + + if (!mlxsw_sp_port->lagged) + return; + lag = mlxsw_sp_lag_get(mlxsw_sp, lag_id); + WARN_ON(lag->ref_count == 0); + + mlxsw_sp_lag_col_port_disable(mlxsw_sp_port, lag_id); + mlxsw_sp_lag_col_port_remove(mlxsw_sp_port, lag_id); + + /* Any VLANs configured on the port are no longer valid */ + mlxsw_sp_port_vlan_flush(mlxsw_sp_port); + + if (lag->ref_count == 1) + mlxsw_sp_lag_destroy(mlxsw_sp, lag_id); + + mlxsw_core_lag_mapping_clear(mlxsw_sp->core, lag_id, + mlxsw_sp_port->local_port); + mlxsw_sp_port->lagged = 0; + lag->ref_count--; + + mlxsw_sp_port_vlan_get(mlxsw_sp_port, 1); + /* Make sure untagged frames are allowed to ingress */ + mlxsw_sp_port_pvid_set(mlxsw_sp_port, 1); +} + +static int mlxsw_sp_lag_dist_port_add(struct mlxsw_sp_port *mlxsw_sp_port, + u16 lag_id) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char sldr_pl[MLXSW_REG_SLDR_LEN]; + + mlxsw_reg_sldr_lag_add_port_pack(sldr_pl, lag_id, + mlxsw_sp_port->local_port); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sldr), sldr_pl); +} + +static int mlxsw_sp_lag_dist_port_remove(struct mlxsw_sp_port *mlxsw_sp_port, + u16 lag_id) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char sldr_pl[MLXSW_REG_SLDR_LEN]; + + mlxsw_reg_sldr_lag_remove_port_pack(sldr_pl, lag_id, + mlxsw_sp_port->local_port); + return 
mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sldr), sldr_pl); +} + +static int mlxsw_sp_port_lag_tx_en_set(struct mlxsw_sp_port *mlxsw_sp_port, + bool lag_tx_enabled) +{ + if (lag_tx_enabled) + return mlxsw_sp_lag_dist_port_add(mlxsw_sp_port, + mlxsw_sp_port->lag_id); + else + return mlxsw_sp_lag_dist_port_remove(mlxsw_sp_port, + mlxsw_sp_port->lag_id); +} + +static int mlxsw_sp_port_lag_changed(struct mlxsw_sp_port *mlxsw_sp_port, + struct netdev_lag_lower_state_info *info) +{ + return mlxsw_sp_port_lag_tx_en_set(mlxsw_sp_port, info->tx_enabled); +} + +static int mlxsw_sp_port_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, + bool enable) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + enum mlxsw_reg_spms_state spms_state; + char *spms_pl; + u16 vid; + int err; + + spms_state = enable ? MLXSW_REG_SPMS_STATE_FORWARDING : + MLXSW_REG_SPMS_STATE_DISCARDING; + + spms_pl = kmalloc(MLXSW_REG_SPMS_LEN, GFP_KERNEL); + if (!spms_pl) + return -ENOMEM; + mlxsw_reg_spms_pack(spms_pl, mlxsw_sp_port->local_port); + + for (vid = 0; vid < VLAN_N_VID; vid++) + mlxsw_reg_spms_vid_pack(spms_pl, vid, spms_state); + + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spms), spms_pl); + kfree(spms_pl); + return err; +} + +static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port) +{ + u16 vid = 1; + int err; + + err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true); + if (err) + return err; + err = mlxsw_sp_port_stp_set(mlxsw_sp_port, true); + if (err) + goto err_port_stp_set; + err = mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1, + true, false); + if (err) + goto err_port_vlan_set; + + for (; vid <= VLAN_N_VID - 1; vid++) { + err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, + vid, false); + if (err) + goto err_vid_learning_set; + } + + return 0; + +err_vid_learning_set: + for (vid--; vid >= 1; vid--) + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true); +err_port_vlan_set: + mlxsw_sp_port_stp_set(mlxsw_sp_port, false); +err_port_stp_set: + mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, false); + return err; +} + +static void mlxsw_sp_port_ovs_leave(struct mlxsw_sp_port *mlxsw_sp_port) +{ + u16 vid; + + for (vid = VLAN_N_VID - 1; vid >= 1; vid--) + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, + vid, true); + + mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1, + false, false); + mlxsw_sp_port_stp_set(mlxsw_sp_port, false); + mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, false); +} + +static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, + struct net_device *dev, + unsigned long event, void *ptr) +{ + struct netdev_notifier_changeupper_info *info; + struct mlxsw_sp_port *mlxsw_sp_port; + struct netlink_ext_ack *extack; + struct net_device *upper_dev; + struct mlxsw_sp *mlxsw_sp; + int err = 0; + + mlxsw_sp_port = netdev_priv(dev); + mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + info = ptr; + extack = netdev_notifier_info_to_extack(&info->info); + + switch (event) { + case NETDEV_PRECHANGEUPPER: + upper_dev = info->upper_dev; + if (!is_vlan_dev(upper_dev) && + !netif_is_lag_master(upper_dev) && + !netif_is_bridge_master(upper_dev) && + !netif_is_ovs_master(upper_dev)) { + NL_SET_ERR_MSG_MOD(extack, "Unknown upper device type"); + return -EINVAL; + } + if (!info->linking) + break; + if (netdev_has_any_upper_dev(upper_dev) && + (!netif_is_bridge_master(upper_dev) || + !mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp, + upper_dev))) { + NL_SET_ERR_MSG_MOD(extack, "Enslaving a port to a device that already has an upper device is not supported"); + return -EINVAL; + } + if 
(netif_is_lag_master(upper_dev) && + !mlxsw_sp_master_lag_check(mlxsw_sp, upper_dev, + info->upper_info, extack)) + return -EINVAL; + if (netif_is_lag_master(upper_dev) && vlan_uses_dev(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Master device is a LAG master and this device has a VLAN"); + return -EINVAL; + } + if (netif_is_lag_port(dev) && is_vlan_dev(upper_dev) && + !netif_is_lag_master(vlan_dev_real_dev(upper_dev))) { + NL_SET_ERR_MSG_MOD(extack, "Can not put a VLAN on a LAG port"); + return -EINVAL; + } + if (netif_is_ovs_master(upper_dev) && vlan_uses_dev(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Master device is an OVS master and this device has a VLAN"); + return -EINVAL; + } + if (netif_is_ovs_port(dev) && is_vlan_dev(upper_dev)) { + NL_SET_ERR_MSG_MOD(extack, "Can not put a VLAN on an OVS port"); + return -EINVAL; + } + if (is_vlan_dev(upper_dev) && + vlan_dev_vlan_id(upper_dev) == 1) { + NL_SET_ERR_MSG_MOD(extack, "Creating a VLAN device with VID 1 is unsupported: VLAN 1 carries untagged traffic"); + return -EINVAL; + } + break; + case NETDEV_CHANGEUPPER: + upper_dev = info->upper_dev; + if (netif_is_bridge_master(upper_dev)) { + if (info->linking) + err = mlxsw_sp_port_bridge_join(mlxsw_sp_port, + lower_dev, + upper_dev, + extack); + else + mlxsw_sp_port_bridge_leave(mlxsw_sp_port, + lower_dev, + upper_dev); + } else if (netif_is_lag_master(upper_dev)) { + if (info->linking) + err = mlxsw_sp_port_lag_join(mlxsw_sp_port, + upper_dev); + else + mlxsw_sp_port_lag_leave(mlxsw_sp_port, + upper_dev); + } else if (netif_is_ovs_master(upper_dev)) { + if (info->linking) + err = mlxsw_sp_port_ovs_join(mlxsw_sp_port); + else + mlxsw_sp_port_ovs_leave(mlxsw_sp_port); + } + break; + } + + return err; +} + +static int mlxsw_sp_netdevice_port_lower_event(struct net_device *dev, + unsigned long event, void *ptr) +{ + struct netdev_notifier_changelowerstate_info *info; + struct mlxsw_sp_port *mlxsw_sp_port; + int err; + + mlxsw_sp_port = netdev_priv(dev); + info = ptr; + + switch (event) { + case NETDEV_CHANGELOWERSTATE: + if (netif_is_lag_port(dev) && mlxsw_sp_port->lagged) { + err = mlxsw_sp_port_lag_changed(mlxsw_sp_port, + info->lower_state_info); + if (err) + netdev_err(dev, "Failed to reflect link aggregation lower state change\n"); + } + break; + } + + return 0; +} + +static int mlxsw_sp_netdevice_port_event(struct net_device *lower_dev, + struct net_device *port_dev, + unsigned long event, void *ptr) +{ + switch (event) { + case NETDEV_PRECHANGEUPPER: + case NETDEV_CHANGEUPPER: + return mlxsw_sp_netdevice_port_upper_event(lower_dev, port_dev, + event, ptr); + case NETDEV_CHANGELOWERSTATE: + return mlxsw_sp_netdevice_port_lower_event(port_dev, event, + ptr); + } + + return 0; +} + +static int mlxsw_sp_netdevice_lag_event(struct net_device *lag_dev, + unsigned long event, void *ptr) +{ + struct net_device *dev; + struct list_head *iter; + int ret; + + netdev_for_each_lower_dev(lag_dev, dev, iter) { + if (mlxsw_sp_port_dev_check(dev)) { + ret = mlxsw_sp_netdevice_port_event(lag_dev, dev, event, + ptr); + if (ret) + return ret; + } + } + + return 0; +} + +static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, + struct net_device *dev, + unsigned long event, void *ptr, + u16 vid) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct netdev_notifier_changeupper_info *info = ptr; + struct netlink_ext_ack *extack; + struct net_device *upper_dev; + int err = 0; + + extack = 
netdev_notifier_info_to_extack(&info->info); + + switch (event) { + case NETDEV_PRECHANGEUPPER: + upper_dev = info->upper_dev; + if (!netif_is_bridge_master(upper_dev)) { + NL_SET_ERR_MSG_MOD(extack, "VLAN devices only support bridge and VRF uppers"); + return -EINVAL; + } + if (!info->linking) + break; + if (netdev_has_any_upper_dev(upper_dev) && + (!netif_is_bridge_master(upper_dev) || + !mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp, + upper_dev))) { + NL_SET_ERR_MSG_MOD(extack, "Enslaving a port to a device that already has an upper device is not supported"); + return -EINVAL; + } + break; + case NETDEV_CHANGEUPPER: + upper_dev = info->upper_dev; + if (netif_is_bridge_master(upper_dev)) { + if (info->linking) + err = mlxsw_sp_port_bridge_join(mlxsw_sp_port, + vlan_dev, + upper_dev, + extack); + else + mlxsw_sp_port_bridge_leave(mlxsw_sp_port, + vlan_dev, + upper_dev); + } else { + err = -EINVAL; + WARN_ON(1); + } + break; + } + + return err; +} + +static int mlxsw_sp_netdevice_lag_port_vlan_event(struct net_device *vlan_dev, + struct net_device *lag_dev, + unsigned long event, + void *ptr, u16 vid) +{ + struct net_device *dev; + struct list_head *iter; + int ret; + + netdev_for_each_lower_dev(lag_dev, dev, iter) { + if (mlxsw_sp_port_dev_check(dev)) { + ret = mlxsw_sp_netdevice_port_vlan_event(vlan_dev, dev, + event, ptr, + vid); + if (ret) + return ret; + } + } + + return 0; +} + +static int mlxsw_sp_netdevice_vlan_event(struct net_device *vlan_dev, + unsigned long event, void *ptr) +{ + struct net_device *real_dev = vlan_dev_real_dev(vlan_dev); + u16 vid = vlan_dev_vlan_id(vlan_dev); + + if (mlxsw_sp_port_dev_check(real_dev)) + return mlxsw_sp_netdevice_port_vlan_event(vlan_dev, real_dev, + event, ptr, vid); + else if (netif_is_lag_master(real_dev)) + return mlxsw_sp_netdevice_lag_port_vlan_event(vlan_dev, + real_dev, event, + ptr, vid); + + return 0; +} + +static bool mlxsw_sp_is_vrf_event(unsigned long event, void *ptr) +{ + struct netdev_notifier_changeupper_info *info = ptr; + + if (event != NETDEV_PRECHANGEUPPER && event != NETDEV_CHANGEUPPER) + return false; + return netif_is_l3_master(info->upper_dev); +} + +static int mlxsw_sp_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mlxsw_sp_span_entry *span_entry; + struct mlxsw_sp *mlxsw_sp; + int err = 0; + + mlxsw_sp = container_of(nb, struct mlxsw_sp, netdevice_nb); + if (event == NETDEV_UNREGISTER) { + span_entry = mlxsw_sp_span_entry_find_by_port(mlxsw_sp, dev); + if (span_entry) + mlxsw_sp_span_entry_invalidate(mlxsw_sp, span_entry); + } + mlxsw_sp_span_respin(mlxsw_sp); + + if (mlxsw_sp_netdev_is_ipip_ol(mlxsw_sp, dev)) + err = mlxsw_sp_netdevice_ipip_ol_event(mlxsw_sp, dev, + event, ptr); + else if (mlxsw_sp_netdev_is_ipip_ul(mlxsw_sp, dev)) + err = mlxsw_sp_netdevice_ipip_ul_event(mlxsw_sp, dev, + event, ptr); + else if (event == NETDEV_CHANGEADDR || event == NETDEV_CHANGEMTU) + err = mlxsw_sp_netdevice_router_port_event(dev); + else if (mlxsw_sp_is_vrf_event(event, ptr)) + err = mlxsw_sp_netdevice_vrf_event(dev, event, ptr); + else if (mlxsw_sp_port_dev_check(dev)) + err = mlxsw_sp_netdevice_port_event(dev, dev, event, ptr); + else if (netif_is_lag_master(dev)) + err = mlxsw_sp_netdevice_lag_event(dev, event, ptr); + else if (is_vlan_dev(dev)) + err = mlxsw_sp_netdevice_vlan_event(dev, event, ptr); + + return notifier_from_errno(err); +} + +static struct notifier_block mlxsw_sp_inetaddr_valid_nb __read_mostly = { + 
.notifier_call = mlxsw_sp_inetaddr_valid_event, +}; + +static struct notifier_block mlxsw_sp_inetaddr_nb __read_mostly = { + .notifier_call = mlxsw_sp_inetaddr_event, +}; + +static struct notifier_block mlxsw_sp_inet6addr_valid_nb __read_mostly = { + .notifier_call = mlxsw_sp_inet6addr_valid_event, +}; + +static struct notifier_block mlxsw_sp_inet6addr_nb __read_mostly = { + .notifier_call = mlxsw_sp_inet6addr_event, +}; + +static const struct pci_device_id mlxsw_sp_pci_id_table[] = { + {PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_SPECTRUM), 0}, + {0, }, +}; + +static struct pci_driver mlxsw_sp_pci_driver = { + .name = mlxsw_sp_driver_name, + .id_table = mlxsw_sp_pci_id_table, +}; + +static int __init mlxsw_sp_module_init(void) +{ + int err; + + register_inetaddr_validator_notifier(&mlxsw_sp_inetaddr_valid_nb); + register_inetaddr_notifier(&mlxsw_sp_inetaddr_nb); + register_inet6addr_validator_notifier(&mlxsw_sp_inet6addr_valid_nb); + register_inet6addr_notifier(&mlxsw_sp_inet6addr_nb); + + err = mlxsw_core_driver_register(&mlxsw_sp_driver); + if (err) + goto err_core_driver_register; + + err = mlxsw_pci_driver_register(&mlxsw_sp_pci_driver); + if (err) + goto err_pci_driver_register; + + return 0; + +err_pci_driver_register: + mlxsw_core_driver_unregister(&mlxsw_sp_driver); +err_core_driver_register: + unregister_inet6addr_notifier(&mlxsw_sp_inet6addr_nb); + unregister_inet6addr_validator_notifier(&mlxsw_sp_inet6addr_valid_nb); + unregister_inetaddr_notifier(&mlxsw_sp_inetaddr_nb); + unregister_inetaddr_validator_notifier(&mlxsw_sp_inetaddr_valid_nb); + return err; +} + +static void __exit mlxsw_sp_module_exit(void) +{ + mlxsw_pci_driver_unregister(&mlxsw_sp_pci_driver); + mlxsw_core_driver_unregister(&mlxsw_sp_driver); + unregister_inet6addr_notifier(&mlxsw_sp_inet6addr_nb); + unregister_inet6addr_validator_notifier(&mlxsw_sp_inet6addr_valid_nb); + unregister_inetaddr_notifier(&mlxsw_sp_inetaddr_nb); + unregister_inetaddr_validator_notifier(&mlxsw_sp_inetaddr_valid_nb); +} + +module_init(mlxsw_sp_module_init); +module_exit(mlxsw_sp_module_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("Mellanox Spectrum driver"); +MODULE_DEVICE_TABLE(pci, mlxsw_sp_pci_id_table); +MODULE_FIRMWARE(MLXSW_SP_FW_FILENAME); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h new file mode 100644 index 0000000..804d4d2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -0,0 +1,633 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum.h + * Copyright (c) 2015-2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015-2017 Jiri Pirko + * Copyright (c) 2015 Ido Schimmel + * Copyright (c) 2015 Elad Raz + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_SPECTRUM_H +#define _MLXSW_SPECTRUM_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "port.h" +#include "core.h" +#include "core_acl_flex_keys.h" +#include "core_acl_flex_actions.h" + +#define MLXSW_SP_FID_8021D_MAX 1024 + +#define MLXSW_SP_MID_MAX 7000 + +#define MLXSW_SP_PORTS_PER_CLUSTER_MAX 4 + +#define MLXSW_SP_PORT_BASE_SPEED 25000 /* Mb/s */ + +#define MLXSW_SP_KVD_LINEAR_SIZE 98304 /* entries */ +#define MLXSW_SP_KVD_GRANULARITY 128 + +#define MLXSW_SP_RESOURCE_NAME_KVD "kvd" +#define MLXSW_SP_RESOURCE_NAME_KVD_LINEAR "linear" +#define MLXSW_SP_RESOURCE_NAME_KVD_HASH_SINGLE "hash_single" +#define MLXSW_SP_RESOURCE_NAME_KVD_HASH_DOUBLE "hash_double" +#define MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_SINGLES "singles" +#define MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_CHUNKS "chunks" +#define MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_LARGE_CHUNKS "large_chunks" + +enum mlxsw_sp_resource_id { + MLXSW_SP_RESOURCE_KVD = 1, + MLXSW_SP_RESOURCE_KVD_LINEAR, + MLXSW_SP_RESOURCE_KVD_HASH_SINGLE, + MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE, + MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE, + MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS, + MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS, +}; + +struct mlxsw_sp_port; +struct mlxsw_sp_rif; +struct mlxsw_sp_span_entry; + +struct mlxsw_sp_upper { + struct net_device *dev; + unsigned int ref_count; +}; + +enum mlxsw_sp_rif_type { + MLXSW_SP_RIF_TYPE_SUBPORT, + MLXSW_SP_RIF_TYPE_VLAN, + MLXSW_SP_RIF_TYPE_FID, + MLXSW_SP_RIF_TYPE_IPIP_LB, /* IP-in-IP loopback. 
*/ + MLXSW_SP_RIF_TYPE_MAX, +}; + +enum mlxsw_sp_fid_type { + MLXSW_SP_FID_TYPE_8021Q, + MLXSW_SP_FID_TYPE_8021D, + MLXSW_SP_FID_TYPE_RFID, + MLXSW_SP_FID_TYPE_DUMMY, + MLXSW_SP_FID_TYPE_MAX, +}; + +struct mlxsw_sp_mid { + struct list_head list; + unsigned char addr[ETH_ALEN]; + u16 fid; + u16 mid; + bool in_hw; + unsigned long *ports_in_mid; /* bits array */ +}; + +enum mlxsw_sp_port_mall_action_type { + MLXSW_SP_PORT_MALL_MIRROR, + MLXSW_SP_PORT_MALL_SAMPLE, +}; + +struct mlxsw_sp_port_mall_mirror_tc_entry { + int span_id; + bool ingress; +}; + +struct mlxsw_sp_port_mall_tc_entry { + struct list_head list; + unsigned long cookie; + enum mlxsw_sp_port_mall_action_type type; + union { + struct mlxsw_sp_port_mall_mirror_tc_entry mirror; + }; +}; + +struct mlxsw_sp_sb; +struct mlxsw_sp_bridge; +struct mlxsw_sp_router; +struct mlxsw_sp_mr; +struct mlxsw_sp_acl; +struct mlxsw_sp_counter_pool; +struct mlxsw_sp_fid_core; +struct mlxsw_sp_kvdl; + +struct mlxsw_sp { + struct mlxsw_sp_port **ports; + struct mlxsw_core *core; + const struct mlxsw_bus_info *bus_info; + unsigned char base_mac[ETH_ALEN]; + struct mlxsw_sp_upper *lags; + int *port_to_module; + struct mlxsw_sp_sb *sb; + struct mlxsw_sp_bridge *bridge; + struct mlxsw_sp_router *router; + struct mlxsw_sp_mr *mr; + struct mlxsw_afa *afa; + struct mlxsw_sp_acl *acl; + struct mlxsw_sp_fid_core *fid_core; + struct mlxsw_sp_kvdl *kvdl; + struct notifier_block netdevice_nb; + + struct mlxsw_sp_counter_pool *counter_pool; + struct { + struct mlxsw_sp_span_entry *entries; + int entries_count; + } span; +}; + +static inline struct mlxsw_sp_upper * +mlxsw_sp_lag_get(struct mlxsw_sp *mlxsw_sp, u16 lag_id) +{ + return &mlxsw_sp->lags[lag_id]; +} + +struct mlxsw_sp_port_pcpu_stats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; + u32 tx_dropped; +}; + +struct mlxsw_sp_port_sample { + struct psample_group __rcu *psample_group; + u32 trunc_size; + u32 rate; + bool truncate; +}; + +struct mlxsw_sp_bridge_port; +struct mlxsw_sp_fid; + +struct mlxsw_sp_port_vlan { + struct list_head list; + struct mlxsw_sp_port *mlxsw_sp_port; + struct mlxsw_sp_fid *fid; + unsigned int ref_count; + u16 vid; + struct mlxsw_sp_bridge_port *bridge_port; + struct list_head bridge_vlan_node; +}; + +/* No need an internal lock; At worse - miss a single periodic iteration */ +struct mlxsw_sp_port_xstats { + u64 ecn; + u64 wred_drop[TC_MAX_QUEUE]; + u64 tail_drop[TC_MAX_QUEUE]; + u64 backlog[TC_MAX_QUEUE]; + u64 tx_bytes[IEEE_8021QAZ_MAX_TCS]; + u64 tx_packets[IEEE_8021QAZ_MAX_TCS]; +}; + +struct mlxsw_sp_port { + struct net_device *dev; + struct mlxsw_sp_port_pcpu_stats __percpu *pcpu_stats; + struct mlxsw_sp *mlxsw_sp; + u8 local_port; + u8 lagged:1, + split:1; + u16 pvid; + u16 lag_id; + struct { + u8 tx_pause:1, + rx_pause:1, + autoneg:1; + } link; + struct { + struct ieee_ets *ets; + struct ieee_maxrate *maxrate; + struct ieee_pfc *pfc; + } dcb; + struct { + u8 module; + u8 width; + u8 lane; + } mapping; + /* TC handles */ + struct list_head mall_tc_list; + struct { + #define MLXSW_HW_STATS_UPDATE_TIME HZ + struct rtnl_link_stats64 stats; + struct mlxsw_sp_port_xstats xstats; + struct delayed_work update_dw; + } periodic_hw_stats; + struct mlxsw_sp_port_sample *sample; + struct list_head vlans_list; + struct mlxsw_sp_qdisc *root_qdisc; + struct mlxsw_sp_qdisc *tclass_qdiscs; + unsigned acl_rule_count; + struct mlxsw_sp_acl_block *ing_acl_block; + struct mlxsw_sp_acl_block *eg_acl_block; +}; + +static inline bool 
+mlxsw_sp_port_is_pause_en(const struct mlxsw_sp_port *mlxsw_sp_port) +{ + return mlxsw_sp_port->link.tx_pause || mlxsw_sp_port->link.rx_pause; +} + +static inline struct mlxsw_sp_port * +mlxsw_sp_port_lagged_get(struct mlxsw_sp *mlxsw_sp, u16 lag_id, u8 port_index) +{ + struct mlxsw_sp_port *mlxsw_sp_port; + u8 local_port; + + local_port = mlxsw_core_lag_mapping_get(mlxsw_sp->core, + lag_id, port_index); + mlxsw_sp_port = mlxsw_sp->ports[local_port]; + return mlxsw_sp_port && mlxsw_sp_port->lagged ? mlxsw_sp_port : NULL; +} + +static inline struct mlxsw_sp_port_vlan * +mlxsw_sp_port_vlan_find_by_vid(const struct mlxsw_sp_port *mlxsw_sp_port, + u16 vid) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + list_for_each_entry(mlxsw_sp_port_vlan, &mlxsw_sp_port->vlans_list, + list) { + if (mlxsw_sp_port_vlan->vid == vid) + return mlxsw_sp_port_vlan; + } + + return NULL; +} + +enum mlxsw_sp_flood_type { + MLXSW_SP_FLOOD_TYPE_UC, + MLXSW_SP_FLOOD_TYPE_BC, + MLXSW_SP_FLOOD_TYPE_MC, +}; + +/* spectrum_buffers.c */ +int mlxsw_sp_buffers_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_buffers_fini(struct mlxsw_sp *mlxsw_sp); +int mlxsw_sp_port_buffers_init(struct mlxsw_sp_port *mlxsw_sp_port); +int mlxsw_sp_sb_pool_get(struct mlxsw_core *mlxsw_core, + unsigned int sb_index, u16 pool_index, + struct devlink_sb_pool_info *pool_info); +int mlxsw_sp_sb_pool_set(struct mlxsw_core *mlxsw_core, + unsigned int sb_index, u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type); +int mlxsw_sp_sb_port_pool_get(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold); +int mlxsw_sp_sb_port_pool_set(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 pool_index, + u32 threshold); +int mlxsw_sp_sb_tc_pool_bind_get(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold); +int mlxsw_sp_sb_tc_pool_bind_set(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold); +int mlxsw_sp_sb_occ_snapshot(struct mlxsw_core *mlxsw_core, + unsigned int sb_index); +int mlxsw_sp_sb_occ_max_clear(struct mlxsw_core *mlxsw_core, + unsigned int sb_index); +int mlxsw_sp_sb_occ_port_pool_get(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 pool_index, + u32 *p_cur, u32 *p_max); +int mlxsw_sp_sb_occ_tc_port_bind_get(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max); +u32 mlxsw_sp_cells_bytes(const struct mlxsw_sp *mlxsw_sp, u32 cells); +u32 mlxsw_sp_bytes_cells(const struct mlxsw_sp *mlxsw_sp, u32 bytes); + +/* spectrum_switchdev.c */ +int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_switchdev_fini(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_port_switchdev_init(struct mlxsw_sp_port *mlxsw_sp_port); +void mlxsw_sp_port_switchdev_fini(struct mlxsw_sp_port *mlxsw_sp_port); +int mlxsw_sp_rif_fdb_op(struct mlxsw_sp *mlxsw_sp, const char *mac, u16 fid, + bool adding); +void +mlxsw_sp_port_vlan_bridge_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan); +int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port, + struct net_device *brport_dev, + struct net_device *br_dev, + struct netlink_ext_ack *extack); +void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port, + struct net_device *brport_dev, + struct 
net_device *br_dev); +bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *br_dev); + +/* spectrum.c */ +int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port, + enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index, + bool dwrr, u8 dwrr_weight); +int mlxsw_sp_port_prio_tc_set(struct mlxsw_sp_port *mlxsw_sp_port, + u8 switch_prio, u8 tclass); +int __mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, int mtu, + u8 *prio_tc, bool pause_en, + struct ieee_pfc *my_pfc); +int mlxsw_sp_port_ets_maxrate_set(struct mlxsw_sp_port *mlxsw_sp_port, + enum mlxsw_reg_qeec_hr hr, u8 index, + u8 next_index, u32 maxrate); +int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid, + u8 state); +int mlxsw_sp_port_vp_mode_set(struct mlxsw_sp_port *mlxsw_sp_port, bool enable); +int mlxsw_sp_port_vid_learning_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid, + bool learn_enable); +int mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid); +struct mlxsw_sp_port_vlan * +mlxsw_sp_port_vlan_get(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid); +void mlxsw_sp_port_vlan_put(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan); +int mlxsw_sp_port_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid_begin, + u16 vid_end, bool is_member, bool untagged); +int mlxsw_sp_flow_counter_get(struct mlxsw_sp *mlxsw_sp, + unsigned int counter_index, u64 *packets, + u64 *bytes); +int mlxsw_sp_flow_counter_alloc(struct mlxsw_sp *mlxsw_sp, + unsigned int *p_counter_index); +void mlxsw_sp_flow_counter_free(struct mlxsw_sp *mlxsw_sp, + unsigned int counter_index); +bool mlxsw_sp_port_dev_check(const struct net_device *dev); +struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev); +struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find(struct net_device *dev); +struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev); +void mlxsw_sp_port_dev_put(struct mlxsw_sp_port *mlxsw_sp_port); +struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find_rcu(struct net_device *dev); + +/* spectrum_dcb.c */ +#ifdef CONFIG_MLXSW_SPECTRUM_DCB +int mlxsw_sp_port_dcb_init(struct mlxsw_sp_port *mlxsw_sp_port); +void mlxsw_sp_port_dcb_fini(struct mlxsw_sp_port *mlxsw_sp_port); +#else +static inline int mlxsw_sp_port_dcb_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + return 0; +} +static inline void mlxsw_sp_port_dcb_fini(struct mlxsw_sp_port *mlxsw_sp_port) +{} +#endif + +/* spectrum_router.c */ +int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp); +int mlxsw_sp_netdevice_router_port_event(struct net_device *dev); +int mlxsw_sp_inetaddr_event(struct notifier_block *unused, + unsigned long event, void *ptr); +int mlxsw_sp_inetaddr_valid_event(struct notifier_block *unused, + unsigned long event, void *ptr); +int mlxsw_sp_inet6addr_event(struct notifier_block *unused, + unsigned long event, void *ptr); +int mlxsw_sp_inet6addr_valid_event(struct notifier_block *unused, + unsigned long event, void *ptr); +int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event, + struct netdev_notifier_changeupper_info *info); +bool mlxsw_sp_netdev_is_ipip_ol(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *dev); +bool mlxsw_sp_netdev_is_ipip_ul(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *dev); +int mlxsw_sp_netdevice_ipip_ol_event(struct mlxsw_sp *mlxsw_sp, + struct net_device *l3_dev, + unsigned long event, + struct netdev_notifier_info *info); +int 
+mlxsw_sp_netdevice_ipip_ul_event(struct mlxsw_sp *mlxsw_sp, + struct net_device *l3_dev, + unsigned long event, + struct netdev_notifier_info *info); +void +mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan); +void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif); + +/* spectrum_kvdl.c */ +int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_kvdl_fini(struct mlxsw_sp *mlxsw_sp); +int mlxsw_sp_kvdl_alloc(struct mlxsw_sp *mlxsw_sp, unsigned int entry_count, + u32 *p_entry_index); +void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index); +int mlxsw_sp_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp, + unsigned int entry_count, + unsigned int *p_alloc_size); +int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core); + +struct mlxsw_sp_acl_rule_info { + unsigned int priority; + struct mlxsw_afk_element_values values; + struct mlxsw_afa_block *act_block; + unsigned int counter_index; +}; + +enum mlxsw_sp_acl_profile { + MLXSW_SP_ACL_PROFILE_FLOWER, +}; + +struct mlxsw_sp_acl_profile_ops { + size_t ruleset_priv_size; + int (*ruleset_add)(struct mlxsw_sp *mlxsw_sp, + void *priv, void *ruleset_priv); + void (*ruleset_del)(struct mlxsw_sp *mlxsw_sp, void *ruleset_priv); + int (*ruleset_bind)(struct mlxsw_sp *mlxsw_sp, void *ruleset_priv, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress); + void (*ruleset_unbind)(struct mlxsw_sp *mlxsw_sp, void *ruleset_priv, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress); + u16 (*ruleset_group_id)(void *ruleset_priv); + size_t rule_priv_size; + int (*rule_add)(struct mlxsw_sp *mlxsw_sp, + void *ruleset_priv, void *rule_priv, + struct mlxsw_sp_acl_rule_info *rulei); + void (*rule_del)(struct mlxsw_sp *mlxsw_sp, void *rule_priv); + int (*rule_activity_get)(struct mlxsw_sp *mlxsw_sp, void *rule_priv, + bool *activity); +}; + +struct mlxsw_sp_acl_ops { + size_t priv_size; + int (*init)(struct mlxsw_sp *mlxsw_sp, void *priv); + void (*fini)(struct mlxsw_sp *mlxsw_sp, void *priv); + const struct mlxsw_sp_acl_profile_ops * + (*profile_ops)(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_acl_profile profile); +}; + +struct mlxsw_sp_acl_block; +struct mlxsw_sp_acl_ruleset; + +/* spectrum_acl.c */ +struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl); +struct mlxsw_sp *mlxsw_sp_acl_block_mlxsw_sp(struct mlxsw_sp_acl_block *block); +unsigned int mlxsw_sp_acl_block_rule_count(struct mlxsw_sp_acl_block *block); +void mlxsw_sp_acl_block_disable_inc(struct mlxsw_sp_acl_block *block); +void mlxsw_sp_acl_block_disable_dec(struct mlxsw_sp_acl_block *block); +bool mlxsw_sp_acl_block_disabled(struct mlxsw_sp_acl_block *block); +struct mlxsw_sp_acl_block *mlxsw_sp_acl_block_create(struct mlxsw_sp *mlxsw_sp, + struct net *net); +void mlxsw_sp_acl_block_destroy(struct mlxsw_sp_acl_block *block); +int mlxsw_sp_acl_block_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress); +int mlxsw_sp_acl_block_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress); +struct mlxsw_sp_acl_ruleset * +mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, u32 chain_index, + enum mlxsw_sp_acl_profile profile); +struct mlxsw_sp_acl_ruleset * +mlxsw_sp_acl_ruleset_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, u32 chain_index, + enum mlxsw_sp_acl_profile profile); +void mlxsw_sp_acl_ruleset_put(struct mlxsw_sp *mlxsw_sp, + struct 
mlxsw_sp_acl_ruleset *ruleset); +u16 mlxsw_sp_acl_ruleset_group_id(struct mlxsw_sp_acl_ruleset *ruleset); + +struct mlxsw_sp_acl_rule_info * +mlxsw_sp_acl_rulei_create(struct mlxsw_sp_acl *acl); +void mlxsw_sp_acl_rulei_destroy(struct mlxsw_sp_acl_rule_info *rulei); +int mlxsw_sp_acl_rulei_commit(struct mlxsw_sp_acl_rule_info *rulei); +void mlxsw_sp_acl_rulei_priority(struct mlxsw_sp_acl_rule_info *rulei, + unsigned int priority); +void mlxsw_sp_acl_rulei_keymask_u32(struct mlxsw_sp_acl_rule_info *rulei, + enum mlxsw_afk_element element, + u32 key_value, u32 mask_value); +void mlxsw_sp_acl_rulei_keymask_buf(struct mlxsw_sp_acl_rule_info *rulei, + enum mlxsw_afk_element element, + const char *key_value, + const char *mask_value, unsigned int len); +int mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei); +int mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei, + u16 group_id); +int mlxsw_sp_acl_rulei_act_terminate(struct mlxsw_sp_acl_rule_info *rulei); +int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei); +int mlxsw_sp_acl_rulei_act_trap(struct mlxsw_sp_acl_rule_info *rulei); +int mlxsw_sp_acl_rulei_act_mirror(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + struct mlxsw_sp_acl_block *block, + struct net_device *out_dev); +int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + struct net_device *out_dev); +int mlxsw_sp_acl_rulei_act_vlan(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + u32 action, u16 vid, u16 proto, u8 prio); +int mlxsw_sp_acl_rulei_act_count(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei); +int mlxsw_sp_acl_rulei_act_fid_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + u16 fid); + +struct mlxsw_sp_acl_rule; + +struct mlxsw_sp_acl_rule * +mlxsw_sp_acl_rule_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_ruleset *ruleset, + unsigned long cookie); +void mlxsw_sp_acl_rule_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule *rule); +int mlxsw_sp_acl_rule_add(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule *rule); +void mlxsw_sp_acl_rule_del(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule *rule); +struct mlxsw_sp_acl_rule * +mlxsw_sp_acl_rule_lookup(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_ruleset *ruleset, + unsigned long cookie); +struct mlxsw_sp_acl_rule_info * +mlxsw_sp_acl_rule_rulei(struct mlxsw_sp_acl_rule *rule); +int mlxsw_sp_acl_rule_get_stats(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule *rule, + u64 *packets, u64 *bytes, u64 *last_use); + +struct mlxsw_sp_fid *mlxsw_sp_acl_dummy_fid(struct mlxsw_sp *mlxsw_sp); + +int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_acl_fini(struct mlxsw_sp *mlxsw_sp); + +/* spectrum_acl_tcam.c */ +extern const struct mlxsw_sp_acl_ops mlxsw_sp_acl_tcam_ops; + +/* spectrum_flower.c */ +int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct tc_cls_flower_offload *f); +void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct tc_cls_flower_offload *f); +int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct tc_cls_flower_offload *f); + +/* spectrum_qdisc.c */ +int mlxsw_sp_tc_qdisc_init(struct mlxsw_sp_port *mlxsw_sp_port); +void mlxsw_sp_tc_qdisc_fini(struct mlxsw_sp_port *mlxsw_sp_port); +int mlxsw_sp_setup_tc_red(struct mlxsw_sp_port *mlxsw_sp_port, + struct 
tc_red_qopt_offload *p); +int mlxsw_sp_setup_tc_prio(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_prio_qopt_offload *p); + +/* spectrum_fid.c */ +int mlxsw_sp_fid_flood_set(struct mlxsw_sp_fid *fid, + enum mlxsw_sp_flood_type packet_type, u8 local_port, + bool member); +int mlxsw_sp_fid_port_vid_map(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *mlxsw_sp_port, u16 vid); +void mlxsw_sp_fid_port_vid_unmap(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *mlxsw_sp_port, u16 vid); +enum mlxsw_sp_rif_type mlxsw_sp_fid_rif_type(const struct mlxsw_sp_fid *fid); +u16 mlxsw_sp_fid_index(const struct mlxsw_sp_fid *fid); +enum mlxsw_sp_fid_type mlxsw_sp_fid_type(const struct mlxsw_sp_fid *fid); +void mlxsw_sp_fid_rif_set(struct mlxsw_sp_fid *fid, struct mlxsw_sp_rif *rif); +enum mlxsw_sp_rif_type +mlxsw_sp_fid_type_rif_type(const struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_fid_type type); +u16 mlxsw_sp_fid_8021q_vid(const struct mlxsw_sp_fid *fid); +struct mlxsw_sp_fid *mlxsw_sp_fid_8021q_get(struct mlxsw_sp *mlxsw_sp, u16 vid); +struct mlxsw_sp_fid *mlxsw_sp_fid_8021d_get(struct mlxsw_sp *mlxsw_sp, + int br_ifindex); +struct mlxsw_sp_fid *mlxsw_sp_fid_rfid_get(struct mlxsw_sp *mlxsw_sp, + u16 rif_index); +struct mlxsw_sp_fid *mlxsw_sp_fid_dummy_get(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_fid_put(struct mlxsw_sp_fid *fid); +int mlxsw_sp_port_fids_init(struct mlxsw_sp_port *mlxsw_sp_port); +void mlxsw_sp_port_fids_fini(struct mlxsw_sp_port *mlxsw_sp_port); +int mlxsw_sp_fids_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_fids_fini(struct mlxsw_sp *mlxsw_sp); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c new file mode 100644 index 0000000..79b1fa2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -0,0 +1,896 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "reg.h" +#include "core.h" +#include "resources.h" +#include "spectrum.h" +#include "core_acl_flex_keys.h" +#include "core_acl_flex_actions.h" +#include "spectrum_acl_flex_keys.h" + +struct mlxsw_sp_acl { + struct mlxsw_sp *mlxsw_sp; + struct mlxsw_afk *afk; + struct mlxsw_sp_fid *dummy_fid; + const struct mlxsw_sp_acl_ops *ops; + struct rhashtable ruleset_ht; + struct list_head rules; + struct { + struct delayed_work dw; + unsigned long interval; /* ms */ +#define MLXSW_SP_ACL_RULE_ACTIVITY_UPDATE_PERIOD_MS 1000 + } rule_activity_update; + unsigned long priv[0]; + /* priv has to be always the last item */ +}; + +struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl) +{ + return acl->afk; +} + +struct mlxsw_sp_acl_block_binding { + struct list_head list; + struct net_device *dev; + struct mlxsw_sp_port *mlxsw_sp_port; + bool ingress; +}; + +struct mlxsw_sp_acl_block { + struct list_head binding_list; + struct mlxsw_sp_acl_ruleset *ruleset_zero; + struct mlxsw_sp *mlxsw_sp; + unsigned int rule_count; + unsigned int disable_count; +}; + +struct mlxsw_sp_acl_ruleset_ht_key { + struct mlxsw_sp_acl_block *block; + u32 chain_index; + const struct mlxsw_sp_acl_profile_ops *ops; +}; + +struct mlxsw_sp_acl_ruleset { + struct rhash_head ht_node; /* Member of acl HT */ + struct mlxsw_sp_acl_ruleset_ht_key ht_key; + struct rhashtable rule_ht; + unsigned int ref_count; + unsigned long priv[0]; + /* priv has to be always the last item */ +}; + +struct mlxsw_sp_acl_rule { + struct rhash_head ht_node; /* Member of rule HT */ + struct list_head list; + unsigned long cookie; /* HT key */ + struct mlxsw_sp_acl_ruleset *ruleset; + struct mlxsw_sp_acl_rule_info *rulei; + u64 last_used; + u64 last_packets; + u64 last_bytes; + unsigned long priv[0]; + /* priv has to be always the last item */ +}; + +static const struct rhashtable_params mlxsw_sp_acl_ruleset_ht_params = { + .key_len = sizeof(struct mlxsw_sp_acl_ruleset_ht_key), + .key_offset = offsetof(struct mlxsw_sp_acl_ruleset, ht_key), + .head_offset = offsetof(struct mlxsw_sp_acl_ruleset, ht_node), + .automatic_shrinking = true, +}; + +static const struct rhashtable_params mlxsw_sp_acl_rule_ht_params = { + .key_len = sizeof(unsigned long), + .key_offset = offsetof(struct mlxsw_sp_acl_rule, cookie), + .head_offset = offsetof(struct mlxsw_sp_acl_rule, ht_node), + .automatic_shrinking = true, +}; + +struct mlxsw_sp_fid *mlxsw_sp_acl_dummy_fid(struct mlxsw_sp *mlxsw_sp) +{ + return mlxsw_sp->acl->dummy_fid; +} + +struct mlxsw_sp *mlxsw_sp_acl_block_mlxsw_sp(struct mlxsw_sp_acl_block *block) +{ + return block->mlxsw_sp; +} + +unsigned int mlxsw_sp_acl_block_rule_count(struct mlxsw_sp_acl_block *block) +{ + return block ? 
block->rule_count : 0; +} + +void mlxsw_sp_acl_block_disable_inc(struct mlxsw_sp_acl_block *block) +{ + if (block) + block->disable_count++; +} + +void mlxsw_sp_acl_block_disable_dec(struct mlxsw_sp_acl_block *block) +{ + if (block) + block->disable_count--; +} + +bool mlxsw_sp_acl_block_disabled(struct mlxsw_sp_acl_block *block) +{ + return block->disable_count; +} + +static bool +mlxsw_sp_acl_ruleset_is_singular(const struct mlxsw_sp_acl_ruleset *ruleset) +{ + /* We hold a reference on ruleset ourselves */ + return ruleset->ref_count == 2; +} + +static int +mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_acl_block_binding *binding) +{ + struct mlxsw_sp_acl_ruleset *ruleset = block->ruleset_zero; + const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; + + return ops->ruleset_bind(mlxsw_sp, ruleset->priv, + binding->mlxsw_sp_port, binding->ingress); +} + +static void +mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_acl_block_binding *binding) +{ + struct mlxsw_sp_acl_ruleset *ruleset = block->ruleset_zero; + const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; + + ops->ruleset_unbind(mlxsw_sp, ruleset->priv, + binding->mlxsw_sp_port, binding->ingress); +} + +static bool mlxsw_sp_acl_ruleset_block_bound(struct mlxsw_sp_acl_block *block) +{ + return block->ruleset_zero; +} + +static int +mlxsw_sp_acl_ruleset_block_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_ruleset *ruleset, + struct mlxsw_sp_acl_block *block) +{ + struct mlxsw_sp_acl_block_binding *binding; + int err; + + block->ruleset_zero = ruleset; + list_for_each_entry(binding, &block->binding_list, list) { + err = mlxsw_sp_acl_ruleset_bind(mlxsw_sp, block, binding); + if (err) + goto rollback; + } + return 0; + +rollback: + list_for_each_entry_continue_reverse(binding, &block->binding_list, + list) + mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding); + block->ruleset_zero = NULL; + + return err; +} + +static void +mlxsw_sp_acl_ruleset_block_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_ruleset *ruleset, + struct mlxsw_sp_acl_block *block) +{ + struct mlxsw_sp_acl_block_binding *binding; + + list_for_each_entry(binding, &block->binding_list, list) + mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding); + block->ruleset_zero = NULL; +} + +struct mlxsw_sp_acl_block *mlxsw_sp_acl_block_create(struct mlxsw_sp *mlxsw_sp, + struct net *net) +{ + struct mlxsw_sp_acl_block *block; + + block = kzalloc(sizeof(*block), GFP_KERNEL); + if (!block) + return NULL; + INIT_LIST_HEAD(&block->binding_list); + block->mlxsw_sp = mlxsw_sp; + return block; +} + +void mlxsw_sp_acl_block_destroy(struct mlxsw_sp_acl_block *block) +{ + WARN_ON(!list_empty(&block->binding_list)); + kfree(block); +} + +static struct mlxsw_sp_acl_block_binding * +mlxsw_sp_acl_block_lookup(struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, bool ingress) +{ + struct mlxsw_sp_acl_block_binding *binding; + + list_for_each_entry(binding, &block->binding_list, list) + if (binding->mlxsw_sp_port == mlxsw_sp_port && + binding->ingress == ingress) + return binding; + return NULL; +} + +int mlxsw_sp_acl_block_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + struct mlxsw_sp_acl_block_binding *binding; + int err; + + if (WARN_ON(mlxsw_sp_acl_block_lookup(block, mlxsw_sp_port, ingress))) + return -EEXIST; + + binding = 
kzalloc(sizeof(*binding), GFP_KERNEL); + if (!binding) + return -ENOMEM; + binding->mlxsw_sp_port = mlxsw_sp_port; + binding->ingress = ingress; + + if (mlxsw_sp_acl_ruleset_block_bound(block)) { + err = mlxsw_sp_acl_ruleset_bind(mlxsw_sp, block, binding); + if (err) + goto err_ruleset_bind; + } + + list_add(&binding->list, &block->binding_list); + return 0; + +err_ruleset_bind: + kfree(binding); + return err; +} + +int mlxsw_sp_acl_block_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + struct mlxsw_sp_acl_block_binding *binding; + + binding = mlxsw_sp_acl_block_lookup(block, mlxsw_sp_port, ingress); + if (!binding) + return -ENOENT; + + list_del(&binding->list); + + if (mlxsw_sp_acl_ruleset_block_bound(block)) + mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding); + + kfree(binding); + return 0; +} + +static struct mlxsw_sp_acl_ruleset * +mlxsw_sp_acl_ruleset_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, u32 chain_index, + const struct mlxsw_sp_acl_profile_ops *ops) +{ + struct mlxsw_sp_acl *acl = mlxsw_sp->acl; + struct mlxsw_sp_acl_ruleset *ruleset; + size_t alloc_size; + int err; + + alloc_size = sizeof(*ruleset) + ops->ruleset_priv_size; + ruleset = kzalloc(alloc_size, GFP_KERNEL); + if (!ruleset) + return ERR_PTR(-ENOMEM); + ruleset->ref_count = 1; + ruleset->ht_key.block = block; + ruleset->ht_key.chain_index = chain_index; + ruleset->ht_key.ops = ops; + + err = rhashtable_init(&ruleset->rule_ht, &mlxsw_sp_acl_rule_ht_params); + if (err) + goto err_rhashtable_init; + + err = ops->ruleset_add(mlxsw_sp, acl->priv, ruleset->priv); + if (err) + goto err_ops_ruleset_add; + + err = rhashtable_insert_fast(&acl->ruleset_ht, &ruleset->ht_node, + mlxsw_sp_acl_ruleset_ht_params); + if (err) + goto err_ht_insert; + + return ruleset; + +err_ht_insert: + ops->ruleset_del(mlxsw_sp, ruleset->priv); +err_ops_ruleset_add: + rhashtable_destroy(&ruleset->rule_ht); +err_rhashtable_init: + kfree(ruleset); + return ERR_PTR(err); +} + +static void mlxsw_sp_acl_ruleset_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_ruleset *ruleset) +{ + const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; + struct mlxsw_sp_acl *acl = mlxsw_sp->acl; + + rhashtable_remove_fast(&acl->ruleset_ht, &ruleset->ht_node, + mlxsw_sp_acl_ruleset_ht_params); + ops->ruleset_del(mlxsw_sp, ruleset->priv); + rhashtable_destroy(&ruleset->rule_ht); + kfree(ruleset); +} + +static void mlxsw_sp_acl_ruleset_ref_inc(struct mlxsw_sp_acl_ruleset *ruleset) +{ + ruleset->ref_count++; +} + +static void mlxsw_sp_acl_ruleset_ref_dec(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_ruleset *ruleset) +{ + if (--ruleset->ref_count) + return; + mlxsw_sp_acl_ruleset_destroy(mlxsw_sp, ruleset); +} + +static struct mlxsw_sp_acl_ruleset * +__mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp_acl *acl, + struct mlxsw_sp_acl_block *block, u32 chain_index, + const struct mlxsw_sp_acl_profile_ops *ops) +{ + struct mlxsw_sp_acl_ruleset_ht_key ht_key; + + memset(&ht_key, 0, sizeof(ht_key)); + ht_key.block = block; + ht_key.chain_index = chain_index; + ht_key.ops = ops; + return rhashtable_lookup_fast(&acl->ruleset_ht, &ht_key, + mlxsw_sp_acl_ruleset_ht_params); +} + +struct mlxsw_sp_acl_ruleset * +mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, u32 chain_index, + enum mlxsw_sp_acl_profile profile) +{ + const struct mlxsw_sp_acl_profile_ops *ops; + struct mlxsw_sp_acl *acl = mlxsw_sp->acl; + struct 
mlxsw_sp_acl_ruleset *ruleset; + + ops = acl->ops->profile_ops(mlxsw_sp, profile); + if (!ops) + return ERR_PTR(-EINVAL); + ruleset = __mlxsw_sp_acl_ruleset_lookup(acl, block, chain_index, ops); + if (!ruleset) + return ERR_PTR(-ENOENT); + return ruleset; +} + +struct mlxsw_sp_acl_ruleset * +mlxsw_sp_acl_ruleset_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, u32 chain_index, + enum mlxsw_sp_acl_profile profile) +{ + const struct mlxsw_sp_acl_profile_ops *ops; + struct mlxsw_sp_acl *acl = mlxsw_sp->acl; + struct mlxsw_sp_acl_ruleset *ruleset; + + ops = acl->ops->profile_ops(mlxsw_sp, profile); + if (!ops) + return ERR_PTR(-EINVAL); + + ruleset = __mlxsw_sp_acl_ruleset_lookup(acl, block, chain_index, ops); + if (ruleset) { + mlxsw_sp_acl_ruleset_ref_inc(ruleset); + return ruleset; + } + return mlxsw_sp_acl_ruleset_create(mlxsw_sp, block, chain_index, ops); +} + +void mlxsw_sp_acl_ruleset_put(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_ruleset *ruleset) +{ + mlxsw_sp_acl_ruleset_ref_dec(mlxsw_sp, ruleset); +} + +u16 mlxsw_sp_acl_ruleset_group_id(struct mlxsw_sp_acl_ruleset *ruleset) +{ + const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; + + return ops->ruleset_group_id(ruleset->priv); +} + +struct mlxsw_sp_acl_rule_info * +mlxsw_sp_acl_rulei_create(struct mlxsw_sp_acl *acl) +{ + struct mlxsw_sp_acl_rule_info *rulei; + int err; + + rulei = kzalloc(sizeof(*rulei), GFP_KERNEL); + if (!rulei) + return NULL; + rulei->act_block = mlxsw_afa_block_create(acl->mlxsw_sp->afa); + if (IS_ERR(rulei->act_block)) { + err = PTR_ERR(rulei->act_block); + goto err_afa_block_create; + } + return rulei; + +err_afa_block_create: + kfree(rulei); + return ERR_PTR(err); +} + +void mlxsw_sp_acl_rulei_destroy(struct mlxsw_sp_acl_rule_info *rulei) +{ + mlxsw_afa_block_destroy(rulei->act_block); + kfree(rulei); +} + +int mlxsw_sp_acl_rulei_commit(struct mlxsw_sp_acl_rule_info *rulei) +{ + return mlxsw_afa_block_commit(rulei->act_block); +} + +void mlxsw_sp_acl_rulei_priority(struct mlxsw_sp_acl_rule_info *rulei, + unsigned int priority) +{ + rulei->priority = priority; +} + +void mlxsw_sp_acl_rulei_keymask_u32(struct mlxsw_sp_acl_rule_info *rulei, + enum mlxsw_afk_element element, + u32 key_value, u32 mask_value) +{ + mlxsw_afk_values_add_u32(&rulei->values, element, + key_value, mask_value); +} + +void mlxsw_sp_acl_rulei_keymask_buf(struct mlxsw_sp_acl_rule_info *rulei, + enum mlxsw_afk_element element, + const char *key_value, + const char *mask_value, unsigned int len) +{ + mlxsw_afk_values_add_buf(&rulei->values, element, + key_value, mask_value, len); +} + +int mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei) +{ + return mlxsw_afa_block_continue(rulei->act_block); +} + +int mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei, + u16 group_id) +{ + return mlxsw_afa_block_jump(rulei->act_block, group_id); +} + +int mlxsw_sp_acl_rulei_act_terminate(struct mlxsw_sp_acl_rule_info *rulei) +{ + return mlxsw_afa_block_terminate(rulei->act_block); +} + +int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei) +{ + return mlxsw_afa_block_append_drop(rulei->act_block); +} + +int mlxsw_sp_acl_rulei_act_trap(struct mlxsw_sp_acl_rule_info *rulei) +{ + return mlxsw_afa_block_append_trap(rulei->act_block, + MLXSW_TRAP_ID_ACL0); +} + +int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + struct net_device *out_dev) +{ + struct mlxsw_sp_port *mlxsw_sp_port; + u8 local_port; + bool in_port; + + if 
(out_dev) { + if (!mlxsw_sp_port_dev_check(out_dev)) + return -EINVAL; + mlxsw_sp_port = netdev_priv(out_dev); + if (mlxsw_sp_port->mlxsw_sp != mlxsw_sp) + return -EINVAL; + local_port = mlxsw_sp_port->local_port; + in_port = false; + } else { + /* If out_dev is NULL, the caller wants to + * set forward to ingress port. + */ + local_port = 0; + in_port = true; + } + return mlxsw_afa_block_append_fwd(rulei->act_block, + local_port, in_port); +} + +int mlxsw_sp_acl_rulei_act_mirror(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + struct mlxsw_sp_acl_block *block, + struct net_device *out_dev) +{ + struct mlxsw_sp_acl_block_binding *binding; + struct mlxsw_sp_port *in_port; + + if (!list_is_singular(&block->binding_list)) + return -EOPNOTSUPP; + + binding = list_first_entry(&block->binding_list, + struct mlxsw_sp_acl_block_binding, list); + in_port = binding->mlxsw_sp_port; + + return mlxsw_afa_block_append_mirror(rulei->act_block, + in_port->local_port, + out_dev, + binding->ingress); +} + +int mlxsw_sp_acl_rulei_act_vlan(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + u32 action, u16 vid, u16 proto, u8 prio) +{ + u8 ethertype; + + if (action == TCA_VLAN_ACT_MODIFY) { + switch (proto) { + case ETH_P_8021Q: + ethertype = 0; + break; + case ETH_P_8021AD: + ethertype = 1; + break; + default: + dev_err(mlxsw_sp->bus_info->dev, "Unsupported VLAN protocol %#04x\n", + proto); + return -EINVAL; + } + + return mlxsw_afa_block_append_vlan_modify(rulei->act_block, + vid, prio, ethertype); + } else { + dev_err(mlxsw_sp->bus_info->dev, "Unsupported VLAN action\n"); + return -EINVAL; + } +} + +int mlxsw_sp_acl_rulei_act_count(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei) +{ + return mlxsw_afa_block_append_counter(rulei->act_block, + &rulei->counter_index); +} + +int mlxsw_sp_acl_rulei_act_fid_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + u16 fid) +{ + return mlxsw_afa_block_append_fid_set(rulei->act_block, fid); +} + +struct mlxsw_sp_acl_rule * +mlxsw_sp_acl_rule_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_ruleset *ruleset, + unsigned long cookie) +{ + const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; + struct mlxsw_sp_acl_rule *rule; + int err; + + mlxsw_sp_acl_ruleset_ref_inc(ruleset); + rule = kzalloc(sizeof(*rule) + ops->rule_priv_size, GFP_KERNEL); + if (!rule) { + err = -ENOMEM; + goto err_alloc; + } + rule->cookie = cookie; + rule->ruleset = ruleset; + + rule->rulei = mlxsw_sp_acl_rulei_create(mlxsw_sp->acl); + if (IS_ERR(rule->rulei)) { + err = PTR_ERR(rule->rulei); + goto err_rulei_create; + } + + return rule; + +err_rulei_create: + kfree(rule); +err_alloc: + mlxsw_sp_acl_ruleset_ref_dec(mlxsw_sp, ruleset); + return ERR_PTR(err); +} + +void mlxsw_sp_acl_rule_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule *rule) +{ + struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset; + + mlxsw_sp_acl_rulei_destroy(rule->rulei); + kfree(rule); + mlxsw_sp_acl_ruleset_ref_dec(mlxsw_sp, ruleset); +} + +int mlxsw_sp_acl_rule_add(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule *rule) +{ + struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset; + const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; + int err; + + err = ops->rule_add(mlxsw_sp, ruleset->priv, rule->priv, rule->rulei); + if (err) + return err; + + err = rhashtable_insert_fast(&ruleset->rule_ht, &rule->ht_node, + mlxsw_sp_acl_rule_ht_params); + if (err) + goto err_rhashtable_insert; + + if 
(!ruleset->ht_key.chain_index && + mlxsw_sp_acl_ruleset_is_singular(ruleset)) { + /* We only need the ruleset with chain index 0, the implicit + * one, to be directly bound to the device. The rest of the + * rulesets are bound by "Goto action set". + */ + err = mlxsw_sp_acl_ruleset_block_bind(mlxsw_sp, ruleset, + ruleset->ht_key.block); + if (err) + goto err_ruleset_block_bind; + } + + list_add_tail(&rule->list, &mlxsw_sp->acl->rules); + ruleset->ht_key.block->rule_count++; + return 0; + +err_ruleset_block_bind: + rhashtable_remove_fast(&ruleset->rule_ht, &rule->ht_node, + mlxsw_sp_acl_rule_ht_params); +err_rhashtable_insert: + ops->rule_del(mlxsw_sp, rule->priv); + return err; +} + +void mlxsw_sp_acl_rule_del(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule *rule) +{ + struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset; + const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; + + ruleset->ht_key.block->rule_count--; + list_del(&rule->list); + if (!ruleset->ht_key.chain_index && + mlxsw_sp_acl_ruleset_is_singular(ruleset)) + mlxsw_sp_acl_ruleset_block_unbind(mlxsw_sp, ruleset, + ruleset->ht_key.block); + rhashtable_remove_fast(&ruleset->rule_ht, &rule->ht_node, + mlxsw_sp_acl_rule_ht_params); + ops->rule_del(mlxsw_sp, rule->priv); +} + +struct mlxsw_sp_acl_rule * +mlxsw_sp_acl_rule_lookup(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_ruleset *ruleset, + unsigned long cookie) +{ + return rhashtable_lookup_fast(&ruleset->rule_ht, &cookie, + mlxsw_sp_acl_rule_ht_params); +} + +struct mlxsw_sp_acl_rule_info * +mlxsw_sp_acl_rule_rulei(struct mlxsw_sp_acl_rule *rule) +{ + return rule->rulei; +} + +static int mlxsw_sp_acl_rule_activity_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule *rule) +{ + struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset; + const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; + bool active; + int err; + + err = ops->rule_activity_get(mlxsw_sp, rule->priv, &active); + if (err) + return err; + if (active) + rule->last_used = jiffies; + return 0; +} + +static int mlxsw_sp_acl_rules_activity_update(struct mlxsw_sp_acl *acl) +{ + struct mlxsw_sp_acl_rule *rule; + int err; + + /* Protect internal structures from changes */ + rtnl_lock(); + list_for_each_entry(rule, &acl->rules, list) { + err = mlxsw_sp_acl_rule_activity_update(acl->mlxsw_sp, + rule); + if (err) + goto err_rule_update; + } + rtnl_unlock(); + return 0; + +err_rule_update: + rtnl_unlock(); + return err; +} + +static void mlxsw_sp_acl_rule_activity_work_schedule(struct mlxsw_sp_acl *acl) +{ + unsigned long interval = acl->rule_activity_update.interval; + + mlxsw_core_schedule_dw(&acl->rule_activity_update.dw, + msecs_to_jiffies(interval)); +} + +static void mlxsw_sp_acl_rul_activity_update_work(struct work_struct *work) +{ + struct mlxsw_sp_acl *acl = container_of(work, struct mlxsw_sp_acl, + rule_activity_update.dw.work); + int err; + + err = mlxsw_sp_acl_rules_activity_update(acl); + if (err) + dev_err(acl->mlxsw_sp->bus_info->dev, "Could not update acl activity\n"); + + mlxsw_sp_acl_rule_activity_work_schedule(acl); +} + +int mlxsw_sp_acl_rule_get_stats(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule *rule, + u64 *packets, u64 *bytes, u64 *last_use) +{ + struct mlxsw_sp_acl_rule_info *rulei; + u64 current_packets; + u64 current_bytes; + int err; + + rulei = mlxsw_sp_acl_rule_rulei(rule); + err = mlxsw_sp_flow_counter_get(mlxsw_sp, rulei->counter_index, + &current_packets, &current_bytes); + if (err) + return err; + + *packets = current_packets - rule->last_packets; + *bytes = 
current_bytes - rule->last_bytes; + *last_use = rule->last_used; + + rule->last_bytes = current_bytes; + rule->last_packets = current_packets; + + return 0; +} + +int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp) +{ + const struct mlxsw_sp_acl_ops *acl_ops = &mlxsw_sp_acl_tcam_ops; + struct mlxsw_sp_fid *fid; + struct mlxsw_sp_acl *acl; + int err; + + acl = kzalloc(sizeof(*acl) + acl_ops->priv_size, GFP_KERNEL); + if (!acl) + return -ENOMEM; + mlxsw_sp->acl = acl; + acl->mlxsw_sp = mlxsw_sp; + acl->afk = mlxsw_afk_create(MLXSW_CORE_RES_GET(mlxsw_sp->core, + ACL_FLEX_KEYS), + mlxsw_sp_afk_blocks, + MLXSW_SP_AFK_BLOCKS_COUNT); + if (!acl->afk) { + err = -ENOMEM; + goto err_afk_create; + } + + err = rhashtable_init(&acl->ruleset_ht, + &mlxsw_sp_acl_ruleset_ht_params); + if (err) + goto err_rhashtable_init; + + fid = mlxsw_sp_fid_dummy_get(mlxsw_sp); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + goto err_fid_get; + } + acl->dummy_fid = fid; + + INIT_LIST_HEAD(&acl->rules); + err = acl_ops->init(mlxsw_sp, acl->priv); + if (err) + goto err_acl_ops_init; + + acl->ops = acl_ops; + + /* Create the delayed work for the rule activity_update */ + INIT_DELAYED_WORK(&acl->rule_activity_update.dw, + mlxsw_sp_acl_rul_activity_update_work); + acl->rule_activity_update.interval = MLXSW_SP_ACL_RULE_ACTIVITY_UPDATE_PERIOD_MS; + mlxsw_core_schedule_dw(&acl->rule_activity_update.dw, 0); + return 0; + +err_acl_ops_init: + mlxsw_sp_fid_put(fid); +err_fid_get: + rhashtable_destroy(&acl->ruleset_ht); +err_rhashtable_init: + mlxsw_afk_destroy(acl->afk); +err_afk_create: + kfree(acl); + return err; +} + +void mlxsw_sp_acl_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_acl *acl = mlxsw_sp->acl; + const struct mlxsw_sp_acl_ops *acl_ops = acl->ops; + + cancel_delayed_work_sync(&mlxsw_sp->acl->rule_activity_update.dw); + acl_ops->fini(mlxsw_sp, acl->priv); + WARN_ON(!list_empty(&acl->rules)); + mlxsw_sp_fid_put(acl->dummy_fid); + rhashtable_destroy(&acl->ruleset_ht); + mlxsw_afk_destroy(acl->afk); + kfree(acl); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c new file mode 100644 index 0000000..510ce48 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c @@ -0,0 +1,179 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c + * Copyright (c) 2017, 2018 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spectrum_acl_flex_actions.h" +#include "core_acl_flex_actions.h" +#include "spectrum_span.h" + +#define MLXSW_SP_KVDL_ACT_EXT_SIZE 1 + +static int mlxsw_sp_act_kvdl_set_add(void *priv, u32 *p_kvdl_index, + char *enc_actions, bool is_first) +{ + struct mlxsw_sp *mlxsw_sp = priv; + char pefa_pl[MLXSW_REG_PEFA_LEN]; + u32 kvdl_index; + int err; + + /* The first action set of a TCAM entry is stored directly in TCAM, + * not KVD linear area. + */ + if (is_first) + return 0; + + err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_ACT_EXT_SIZE, + &kvdl_index); + if (err) + return err; + mlxsw_reg_pefa_pack(pefa_pl, kvdl_index, enc_actions); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pefa), pefa_pl); + if (err) + goto err_pefa_write; + *p_kvdl_index = kvdl_index; + return 0; + +err_pefa_write: + mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); + return err; +} + +static void mlxsw_sp_act_kvdl_set_del(void *priv, u32 kvdl_index, + bool is_first) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + if (is_first) + return; + mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); +} + +static int mlxsw_sp_act_kvdl_fwd_entry_add(void *priv, u32 *p_kvdl_index, + u8 local_port) +{ + struct mlxsw_sp *mlxsw_sp = priv; + char ppbs_pl[MLXSW_REG_PPBS_LEN]; + u32 kvdl_index; + int err; + + err = mlxsw_sp_kvdl_alloc(mlxsw_sp, 1, &kvdl_index); + if (err) + return err; + mlxsw_reg_ppbs_pack(ppbs_pl, kvdl_index, local_port); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppbs), ppbs_pl); + if (err) + goto err_ppbs_write; + *p_kvdl_index = kvdl_index; + return 0; + +err_ppbs_write: + mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); + return err; +} + +static void mlxsw_sp_act_kvdl_fwd_entry_del(void *priv, u32 kvdl_index) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); +} + +static int +mlxsw_sp_act_counter_index_get(void *priv, unsigned int *p_counter_index) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + return mlxsw_sp_flow_counter_alloc(mlxsw_sp, p_counter_index); +} + +static void +mlxsw_sp_act_counter_index_put(void *priv, unsigned int counter_index) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + mlxsw_sp_flow_counter_free(mlxsw_sp, counter_index); +} + +static int +mlxsw_sp_act_mirror_add(void *priv, u8 local_in_port, + const struct net_device *out_dev, + bool ingress, int *p_span_id) +{ + struct mlxsw_sp_port *in_port; + struct mlxsw_sp *mlxsw_sp = priv; + enum mlxsw_sp_span_type type; + + type = ingress ? 
MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; + in_port = mlxsw_sp->ports[local_in_port]; + + return mlxsw_sp_span_mirror_add(in_port, out_dev, type, + false, p_span_id); +} + +static void +mlxsw_sp_act_mirror_del(void *priv, u8 local_in_port, int span_id, bool ingress) +{ + struct mlxsw_sp *mlxsw_sp = priv; + struct mlxsw_sp_port *in_port; + enum mlxsw_sp_span_type type; + + type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; + in_port = mlxsw_sp->ports[local_in_port]; + + mlxsw_sp_span_mirror_del(in_port, span_id, type, false); +} + +static const struct mlxsw_afa_ops mlxsw_sp_act_afa_ops = { + .kvdl_set_add = mlxsw_sp_act_kvdl_set_add, + .kvdl_set_del = mlxsw_sp_act_kvdl_set_del, + .kvdl_fwd_entry_add = mlxsw_sp_act_kvdl_fwd_entry_add, + .kvdl_fwd_entry_del = mlxsw_sp_act_kvdl_fwd_entry_del, + .counter_index_get = mlxsw_sp_act_counter_index_get, + .counter_index_put = mlxsw_sp_act_counter_index_put, + .mirror_add = mlxsw_sp_act_mirror_add, + .mirror_del = mlxsw_sp_act_mirror_del, +}; + +int mlxsw_sp_afa_init(struct mlxsw_sp *mlxsw_sp) +{ + mlxsw_sp->afa = mlxsw_afa_create(MLXSW_CORE_RES_GET(mlxsw_sp->core, + ACL_ACTIONS_PER_SET), + &mlxsw_sp_act_afa_ops, mlxsw_sp); + return PTR_ERR_OR_ZERO(mlxsw_sp->afa); +} + +void mlxsw_sp_afa_fini(struct mlxsw_sp *mlxsw_sp) +{ + mlxsw_afa_destroy(mlxsw_sp->afa); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h new file mode 100644 index 0000000..bd6d552 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h @@ -0,0 +1,44 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _MLXSW_SPECTRUM_ACL_FLEX_ACTIONS_H +#define _MLXSW_SPECTRUM_ACL_FLEX_ACTIONS_H + +#include "spectrum.h" + +int mlxsw_sp_afa_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_afa_fini(struct mlxsw_sp *mlxsw_sp); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.h new file mode 100644 index 0000000..fb80318 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.h @@ -0,0 +1,124 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _MLXSW_SPECTRUM_ACL_FLEX_KEYS_H +#define _MLXSW_SPECTRUM_ACL_FLEX_KEYS_H + +#include "core_acl_flex_keys.h" + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_l2_dmac[] = { + MLXSW_AFK_ELEMENT_INST_BUF(DMAC, 0x00, 6), + MLXSW_AFK_ELEMENT_INST_U32(PCP, 0x08, 13, 3), + MLXSW_AFK_ELEMENT_INST_U32(VID, 0x08, 0, 12), + MLXSW_AFK_ELEMENT_INST_U32(SRC_SYS_PORT, 0x0C, 0, 16), +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_l2_smac[] = { + MLXSW_AFK_ELEMENT_INST_BUF(SMAC, 0x00, 6), + MLXSW_AFK_ELEMENT_INST_U32(PCP, 0x08, 13, 3), + MLXSW_AFK_ELEMENT_INST_U32(VID, 0x08, 0, 12), + MLXSW_AFK_ELEMENT_INST_U32(SRC_SYS_PORT, 0x0C, 0, 16), +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_l2_smac_ex[] = { + MLXSW_AFK_ELEMENT_INST_BUF(SMAC, 0x02, 6), + MLXSW_AFK_ELEMENT_INST_U32(ETHERTYPE, 0x0C, 0, 16), +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_sip[] = { + MLXSW_AFK_ELEMENT_INST_U32(SRC_IP4, 0x00, 0, 32), + MLXSW_AFK_ELEMENT_INST_U32(IP_PROTO, 0x08, 0, 8), + MLXSW_AFK_ELEMENT_INST_U32(SRC_SYS_PORT, 0x0C, 0, 16), +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_dip[] = { + MLXSW_AFK_ELEMENT_INST_U32(DST_IP4, 0x00, 0, 32), + MLXSW_AFK_ELEMENT_INST_U32(IP_PROTO, 0x08, 0, 8), + MLXSW_AFK_ELEMENT_INST_U32(SRC_SYS_PORT, 0x0C, 0, 16), +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4[] = { + MLXSW_AFK_ELEMENT_INST_U32(SRC_IP4, 0x00, 0, 32), + MLXSW_AFK_ELEMENT_INST_U32(IP_ECN, 0x04, 4, 2), + MLXSW_AFK_ELEMENT_INST_U32(IP_TTL_, 0x04, 24, 8), + MLXSW_AFK_ELEMENT_INST_U32(IP_DSCP, 0x08, 0, 6), + MLXSW_AFK_ELEMENT_INST_U32(TCP_FLAGS, 0x08, 8, 9), /* TCP_CONTROL+TCP_ECN */ +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_ex[] = { + MLXSW_AFK_ELEMENT_INST_U32(VID, 0x00, 0, 12), + MLXSW_AFK_ELEMENT_INST_U32(PCP, 0x08, 29, 3), + MLXSW_AFK_ELEMENT_INST_U32(SRC_L4_PORT, 0x08, 0, 16), + MLXSW_AFK_ELEMENT_INST_U32(DST_L4_PORT, 0x0C, 0, 16), +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_dip[] = { + MLXSW_AFK_ELEMENT_INST_BUF(DST_IP6_LO, 0x00, 8), +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_ex1[] = { + MLXSW_AFK_ELEMENT_INST_BUF(DST_IP6_HI, 0x00, 8), + MLXSW_AFK_ELEMENT_INST_U32(IP_PROTO, 0x08, 0, 8), +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_sip[] = { + MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP6_LO, 0x00, 8), +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_sip_ex[] = { + MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP6_HI, 0x00, 8), +}; + +static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_packet_type[] = { + MLXSW_AFK_ELEMENT_INST_U32(ETHERTYPE, 0x00, 0, 16), +}; + +static const struct mlxsw_afk_block mlxsw_sp_afk_blocks[] = { + MLXSW_AFK_BLOCK(0x10, mlxsw_sp_afk_element_info_l2_dmac), + MLXSW_AFK_BLOCK(0x11, mlxsw_sp_afk_element_info_l2_smac), + MLXSW_AFK_BLOCK(0x12, mlxsw_sp_afk_element_info_l2_smac_ex), + MLXSW_AFK_BLOCK(0x30, mlxsw_sp_afk_element_info_ipv4_sip), + MLXSW_AFK_BLOCK(0x31, mlxsw_sp_afk_element_info_ipv4_dip), + MLXSW_AFK_BLOCK(0x32, mlxsw_sp_afk_element_info_ipv4), + MLXSW_AFK_BLOCK(0x33, mlxsw_sp_afk_element_info_ipv4_ex), + MLXSW_AFK_BLOCK(0x60, mlxsw_sp_afk_element_info_ipv6_dip), + MLXSW_AFK_BLOCK(0x65, mlxsw_sp_afk_element_info_ipv6_ex1), + MLXSW_AFK_BLOCK(0x62, mlxsw_sp_afk_element_info_ipv6_sip), + MLXSW_AFK_BLOCK(0x63, mlxsw_sp_afk_element_info_ipv6_sip_ex), + MLXSW_AFK_BLOCK(0xB0, 
mlxsw_sp_afk_element_info_packet_type), +}; + +#define MLXSW_SP_AFK_BLOCKS_COUNT ARRAY_SIZE(mlxsw_sp_afk_blocks) + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c new file mode 100644 index 0000000..ad1b548 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -0,0 +1,1140 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "reg.h" +#include "core.h" +#include "resources.h" +#include "spectrum.h" +#include "core_acl_flex_keys.h" + +struct mlxsw_sp_acl_tcam { + unsigned long *used_regions; /* bit array */ + unsigned int max_regions; + unsigned long *used_groups; /* bit array */ + unsigned int max_groups; + unsigned int max_group_size; +}; + +static int mlxsw_sp_acl_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv) +{ + struct mlxsw_sp_acl_tcam *tcam = priv; + u64 max_tcam_regions; + u64 max_regions; + u64 max_groups; + size_t alloc_size; + int err; + + max_tcam_regions = MLXSW_CORE_RES_GET(mlxsw_sp->core, + ACL_MAX_TCAM_REGIONS); + max_regions = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_REGIONS); + + /* Use 1:1 mapping between ACL region and TCAM region */ + if (max_tcam_regions < max_regions) + max_regions = max_tcam_regions; + + alloc_size = sizeof(tcam->used_regions[0]) * BITS_TO_LONGS(max_regions); + tcam->used_regions = kzalloc(alloc_size, GFP_KERNEL); + if (!tcam->used_regions) + return -ENOMEM; + tcam->max_regions = max_regions; + + max_groups = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_GROUPS); + alloc_size = sizeof(tcam->used_groups[0]) * BITS_TO_LONGS(max_groups); + tcam->used_groups = kzalloc(alloc_size, GFP_KERNEL); + if (!tcam->used_groups) { + err = -ENOMEM; + goto err_alloc_used_groups; + } + tcam->max_groups = max_groups; + tcam->max_group_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, + ACL_MAX_GROUP_SIZE); + return 0; + +err_alloc_used_groups: + kfree(tcam->used_regions); + return err; +} + +static void mlxsw_sp_acl_tcam_fini(struct mlxsw_sp *mlxsw_sp, void *priv) +{ + struct mlxsw_sp_acl_tcam *tcam = priv; + + kfree(tcam->used_groups); + kfree(tcam->used_regions); +} + +static int mlxsw_sp_acl_tcam_region_id_get(struct mlxsw_sp_acl_tcam *tcam, + u16 *p_id) +{ + u16 id; + + id = find_first_zero_bit(tcam->used_regions, tcam->max_regions); + if (id < tcam->max_regions) { + __set_bit(id, tcam->used_regions); + *p_id = id; + return 0; + } + return -ENOBUFS; +} + +static void mlxsw_sp_acl_tcam_region_id_put(struct mlxsw_sp_acl_tcam *tcam, + u16 id) +{ + __clear_bit(id, tcam->used_regions); +} + +static int mlxsw_sp_acl_tcam_group_id_get(struct mlxsw_sp_acl_tcam *tcam, + u16 *p_id) +{ + u16 id; + + id = find_first_zero_bit(tcam->used_groups, tcam->max_groups); + if (id < tcam->max_groups) { + __set_bit(id, tcam->used_groups); + *p_id = id; + return 0; + } + return -ENOBUFS; +} + +static void mlxsw_sp_acl_tcam_group_id_put(struct mlxsw_sp_acl_tcam *tcam, + u16 id) +{ + __clear_bit(id, tcam->used_groups); +} + +struct mlxsw_sp_acl_tcam_pattern { + const enum mlxsw_afk_element *elements; + unsigned int elements_count; +}; + +struct mlxsw_sp_acl_tcam_group { + struct mlxsw_sp_acl_tcam *tcam; + u16 id; + struct list_head region_list; + unsigned int region_count; + struct rhashtable chunk_ht; + struct mlxsw_sp_acl_tcam_group_ops *ops; + const struct mlxsw_sp_acl_tcam_pattern *patterns; + unsigned int patterns_count; +}; + +struct mlxsw_sp_acl_tcam_region { + struct list_head list; /* Member of a TCAM group */ + struct list_head chunk_list; /* List of chunks under this region */ + struct parman *parman; + struct mlxsw_sp *mlxsw_sp; + struct mlxsw_sp_acl_tcam_group *group; + u16 id; /* ACL ID and region ID - they are same */ + char tcam_region_info[MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN]; + struct mlxsw_afk_key_info *key_info; + struct { + struct parman_prio parman_prio; + struct parman_item parman_item; + struct 
mlxsw_sp_acl_rule_info *rulei; + } catchall; +}; + +struct mlxsw_sp_acl_tcam_chunk { + struct list_head list; /* Member of a TCAM region */ + struct rhash_head ht_node; /* Member of a chunk HT */ + unsigned int priority; /* Priority within the region and group */ + struct parman_prio parman_prio; + struct mlxsw_sp_acl_tcam_group *group; + struct mlxsw_sp_acl_tcam_region *region; + unsigned int ref_count; +}; + +struct mlxsw_sp_acl_tcam_entry { + struct parman_item parman_item; + struct mlxsw_sp_acl_tcam_chunk *chunk; +}; + +static const struct rhashtable_params mlxsw_sp_acl_tcam_chunk_ht_params = { + .key_len = sizeof(unsigned int), + .key_offset = offsetof(struct mlxsw_sp_acl_tcam_chunk, priority), + .head_offset = offsetof(struct mlxsw_sp_acl_tcam_chunk, ht_node), + .automatic_shrinking = true, +}; + +static int mlxsw_sp_acl_tcam_group_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_group *group) +{ + struct mlxsw_sp_acl_tcam_region *region; + char pagt_pl[MLXSW_REG_PAGT_LEN]; + int acl_index = 0; + + mlxsw_reg_pagt_pack(pagt_pl, group->id); + list_for_each_entry(region, &group->region_list, list) + mlxsw_reg_pagt_acl_id_pack(pagt_pl, acl_index++, region->id); + mlxsw_reg_pagt_size_set(pagt_pl, acl_index); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pagt), pagt_pl); +} + +static int +mlxsw_sp_acl_tcam_group_add(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam *tcam, + struct mlxsw_sp_acl_tcam_group *group, + const struct mlxsw_sp_acl_tcam_pattern *patterns, + unsigned int patterns_count) +{ + int err; + + group->tcam = tcam; + group->patterns = patterns; + group->patterns_count = patterns_count; + INIT_LIST_HEAD(&group->region_list); + err = mlxsw_sp_acl_tcam_group_id_get(tcam, &group->id); + if (err) + return err; + + err = rhashtable_init(&group->chunk_ht, + &mlxsw_sp_acl_tcam_chunk_ht_params); + if (err) + goto err_rhashtable_init; + + return 0; + +err_rhashtable_init: + mlxsw_sp_acl_tcam_group_id_put(tcam, group->id); + return err; +} + +static void mlxsw_sp_acl_tcam_group_del(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_group *group) +{ + struct mlxsw_sp_acl_tcam *tcam = group->tcam; + + rhashtable_destroy(&group->chunk_ht); + mlxsw_sp_acl_tcam_group_id_put(tcam, group->id); + WARN_ON(!list_empty(&group->region_list)); +} + +static int +mlxsw_sp_acl_tcam_group_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_group *group, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + char ppbt_pl[MLXSW_REG_PPBT_LEN]; + + mlxsw_reg_ppbt_pack(ppbt_pl, ingress ? MLXSW_REG_PXBT_E_IACL : + MLXSW_REG_PXBT_E_EACL, + MLXSW_REG_PXBT_OP_BIND, mlxsw_sp_port->local_port, + group->id); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppbt), ppbt_pl); +} + +static void +mlxsw_sp_acl_tcam_group_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_group *group, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + char ppbt_pl[MLXSW_REG_PPBT_LEN]; + + mlxsw_reg_ppbt_pack(ppbt_pl, ingress ? 
MLXSW_REG_PXBT_E_IACL :
+			    MLXSW_REG_PXBT_E_EACL,
+			    MLXSW_REG_PXBT_OP_UNBIND, mlxsw_sp_port->local_port,
+			    group->id);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppbt), ppbt_pl);
+}
+
+static u16
+mlxsw_sp_acl_tcam_group_id(struct mlxsw_sp_acl_tcam_group *group)
+{
+	return group->id;
+}
+
+static unsigned int
+mlxsw_sp_acl_tcam_region_prio(struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct mlxsw_sp_acl_tcam_chunk *chunk;
+
+	if (list_empty(&region->chunk_list))
+		return 0;
+	/* As a priority of a region, return priority of the first chunk */
+	chunk = list_first_entry(&region->chunk_list, typeof(*chunk), list);
+	return chunk->priority;
+}
+
+static unsigned int
+mlxsw_sp_acl_tcam_region_max_prio(struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct mlxsw_sp_acl_tcam_chunk *chunk;
+
+	if (list_empty(&region->chunk_list))
+		return 0;
+	chunk = list_last_entry(&region->chunk_list, typeof(*chunk), list);
+	return chunk->priority;
+}
+
+static void
+mlxsw_sp_acl_tcam_group_list_add(struct mlxsw_sp_acl_tcam_group *group,
+				 struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct mlxsw_sp_acl_tcam_region *region2;
+	struct list_head *pos;
+
+	/* Position the region inside the list according to priority */
+	list_for_each(pos, &group->region_list) {
+		region2 = list_entry(pos, typeof(*region2), list);
+		if (mlxsw_sp_acl_tcam_region_prio(region2) >
+		    mlxsw_sp_acl_tcam_region_prio(region))
+			break;
+	}
+	list_add_tail(&region->list, pos);
+	group->region_count++;
+}
+
+static void
+mlxsw_sp_acl_tcam_group_list_del(struct mlxsw_sp_acl_tcam_group *group,
+				 struct mlxsw_sp_acl_tcam_region *region)
+{
+	group->region_count--;
+	list_del(&region->list);
+}
+
+static int
+mlxsw_sp_acl_tcam_group_region_attach(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_acl_tcam_group *group,
+				      struct mlxsw_sp_acl_tcam_region *region)
+{
+	int err;
+
+	if (group->region_count == group->tcam->max_group_size)
+		return -ENOBUFS;
+
+	mlxsw_sp_acl_tcam_group_list_add(group, region);
+
+	err = mlxsw_sp_acl_tcam_group_update(mlxsw_sp, group);
+	if (err)
+		goto err_group_update;
+	region->group = group;
+
+	return 0;
+
+err_group_update:
+	mlxsw_sp_acl_tcam_group_list_del(group, region);
+	mlxsw_sp_acl_tcam_group_update(mlxsw_sp, group);
+	return err;
+}
+
+static void
+mlxsw_sp_acl_tcam_group_region_detach(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct mlxsw_sp_acl_tcam_group *group = region->group;
+
+	mlxsw_sp_acl_tcam_group_list_del(group, region);
+	mlxsw_sp_acl_tcam_group_update(mlxsw_sp, group);
+}
+
+static struct mlxsw_sp_acl_tcam_region *
+mlxsw_sp_acl_tcam_group_region_find(struct mlxsw_sp_acl_tcam_group *group,
+				    unsigned int priority,
+				    struct mlxsw_afk_element_usage *elusage,
+				    bool *p_need_split)
+{
+	struct mlxsw_sp_acl_tcam_region *region, *region2;
+	struct list_head *pos;
+	bool issubset;
+
+	list_for_each(pos, &group->region_list) {
+		region = list_entry(pos, typeof(*region), list);
+
+		/* First, check if the requested priority does not rather belong
+		 * under some of the next regions.
+		 */
+		if (pos->next != &group->region_list) { /* not last */
+			region2 = list_entry(pos->next, typeof(*region2), list);
+			if (priority >= mlxsw_sp_acl_tcam_region_prio(region2))
+				continue;
+		}
+
+		issubset = mlxsw_afk_key_info_subset(region->key_info, elusage);
+
+		/* If requested element usage would not fit and the priority
+		 * is lower than the currently inspected region we cannot
+		 * use this region, so return NULL to indicate new region has
+		 * to be created.
+ */ + if (!issubset && + priority < mlxsw_sp_acl_tcam_region_prio(region)) + return NULL; + + /* If requested element usage would not fit and the priority + * is higher than the currently inspected region we cannot + * use this region. There is still some hope that the next + * region would be the fit. So let it be processed and + * eventually break at the check right above this. + */ + if (!issubset && + priority > mlxsw_sp_acl_tcam_region_max_prio(region)) + continue; + + /* Indicate if the region needs to be split in order to add + * the requested priority. Split is needed when requested + * element usage won't fit into the found region. + */ + *p_need_split = !issubset; + return region; + } + return NULL; /* New region has to be created. */ +} + +static void +mlxsw_sp_acl_tcam_group_use_patterns(struct mlxsw_sp_acl_tcam_group *group, + struct mlxsw_afk_element_usage *elusage, + struct mlxsw_afk_element_usage *out) +{ + const struct mlxsw_sp_acl_tcam_pattern *pattern; + int i; + + for (i = 0; i < group->patterns_count; i++) { + pattern = &group->patterns[i]; + mlxsw_afk_element_usage_fill(out, pattern->elements, + pattern->elements_count); + if (mlxsw_afk_element_usage_subset(elusage, out)) + return; + } + memcpy(out, elusage, sizeof(*out)); +} + +#define MLXSW_SP_ACL_TCAM_REGION_BASE_COUNT 16 +#define MLXSW_SP_ACL_TCAM_REGION_RESIZE_STEP 16 + +static int +mlxsw_sp_acl_tcam_region_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_region *region) +{ + struct mlxsw_afk_key_info *key_info = region->key_info; + char ptar_pl[MLXSW_REG_PTAR_LEN]; + unsigned int encodings_count; + int i; + int err; + + mlxsw_reg_ptar_pack(ptar_pl, MLXSW_REG_PTAR_OP_ALLOC, + MLXSW_SP_ACL_TCAM_REGION_BASE_COUNT, + region->id, region->tcam_region_info); + encodings_count = mlxsw_afk_key_info_blocks_count_get(key_info); + for (i = 0; i < encodings_count; i++) { + u16 encoding; + + encoding = mlxsw_afk_key_info_block_encoding_get(key_info, i); + mlxsw_reg_ptar_key_id_pack(ptar_pl, i, encoding); + } + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptar), ptar_pl); + if (err) + return err; + mlxsw_reg_ptar_unpack(ptar_pl, region->tcam_region_info); + return 0; +} + +static void +mlxsw_sp_acl_tcam_region_free(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_region *region) +{ + char ptar_pl[MLXSW_REG_PTAR_LEN]; + + mlxsw_reg_ptar_pack(ptar_pl, MLXSW_REG_PTAR_OP_FREE, 0, region->id, + region->tcam_region_info); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptar), ptar_pl); +} + +static int +mlxsw_sp_acl_tcam_region_resize(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_region *region, + u16 new_size) +{ + char ptar_pl[MLXSW_REG_PTAR_LEN]; + + mlxsw_reg_ptar_pack(ptar_pl, MLXSW_REG_PTAR_OP_RESIZE, + new_size, region->id, region->tcam_region_info); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptar), ptar_pl); +} + +static int +mlxsw_sp_acl_tcam_region_enable(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_region *region) +{ + char pacl_pl[MLXSW_REG_PACL_LEN]; + + mlxsw_reg_pacl_pack(pacl_pl, region->id, true, + region->tcam_region_info); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pacl), pacl_pl); +} + +static void +mlxsw_sp_acl_tcam_region_disable(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_region *region) +{ + char pacl_pl[MLXSW_REG_PACL_LEN]; + + mlxsw_reg_pacl_pack(pacl_pl, region->id, false, + region->tcam_region_info); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pacl), pacl_pl); +} + +static int +mlxsw_sp_acl_tcam_region_entry_insert(struct mlxsw_sp *mlxsw_sp, + 
struct mlxsw_sp_acl_tcam_region *region,
+				      unsigned int offset,
+				      struct mlxsw_sp_acl_rule_info *rulei)
+{
+	char ptce2_pl[MLXSW_REG_PTCE2_LEN];
+	char *act_set;
+	char *mask;
+	char *key;
+
+	mlxsw_reg_ptce2_pack(ptce2_pl, true, MLXSW_REG_PTCE2_OP_WRITE_WRITE,
+			     region->tcam_region_info, offset);
+	key = mlxsw_reg_ptce2_flex_key_blocks_data(ptce2_pl);
+	mask = mlxsw_reg_ptce2_mask_data(ptce2_pl);
+	mlxsw_afk_encode(region->key_info, &rulei->values, key, mask);
+
+	/* Only the first action set belongs here, the rest is in KVD */
+	act_set = mlxsw_afa_block_first_set(rulei->act_block);
+	mlxsw_reg_ptce2_flex_action_set_memcpy_to(ptce2_pl, act_set);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptce2), ptce2_pl);
+}
+
+static void
+mlxsw_sp_acl_tcam_region_entry_remove(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_acl_tcam_region *region,
+				      unsigned int offset)
+{
+	char ptce2_pl[MLXSW_REG_PTCE2_LEN];
+
+	mlxsw_reg_ptce2_pack(ptce2_pl, false, MLXSW_REG_PTCE2_OP_WRITE_WRITE,
+			     region->tcam_region_info, offset);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptce2), ptce2_pl);
+}
+
+static int
+mlxsw_sp_acl_tcam_region_entry_activity_get(struct mlxsw_sp *mlxsw_sp,
+					    struct mlxsw_sp_acl_tcam_region *region,
+					    unsigned int offset,
+					    bool *activity)
+{
+	char ptce2_pl[MLXSW_REG_PTCE2_LEN];
+	int err;
+
+	mlxsw_reg_ptce2_pack(ptce2_pl, true, MLXSW_REG_PTCE2_OP_QUERY_CLEAR_ON_READ,
+			     region->tcam_region_info, offset);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ptce2), ptce2_pl);
+	if (err)
+		return err;
+	*activity = mlxsw_reg_ptce2_a_get(ptce2_pl);
+	return 0;
+}
+
+#define MLXSW_SP_ACL_TCAM_CATCHALL_PRIO (~0U)
+
+static int
+mlxsw_sp_acl_tcam_region_catchall_add(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct parman_prio *parman_prio = &region->catchall.parman_prio;
+	struct parman_item *parman_item = &region->catchall.parman_item;
+	struct mlxsw_sp_acl_rule_info *rulei;
+	int err;
+
+	parman_prio_init(region->parman, parman_prio,
+			 MLXSW_SP_ACL_TCAM_CATCHALL_PRIO);
+	err = parman_item_add(region->parman, parman_prio, parman_item);
+	if (err)
+		goto err_parman_item_add;
+
+	rulei = mlxsw_sp_acl_rulei_create(mlxsw_sp->acl);
+	if (IS_ERR(rulei)) {
+		err = PTR_ERR(rulei);
+		goto err_rulei_create;
+	}
+
+	err = mlxsw_sp_acl_rulei_act_continue(rulei);
+	if (WARN_ON(err))
+		goto err_rulei_act_continue;
+
+	err = mlxsw_sp_acl_rulei_commit(rulei);
+	if (err)
+		goto err_rulei_commit;
+
+	err = mlxsw_sp_acl_tcam_region_entry_insert(mlxsw_sp, region,
+						    parman_item->index, rulei);
+	region->catchall.rulei = rulei;
+	if (err)
+		goto err_rule_insert;
+
+	return 0;
+
+err_rule_insert:
+err_rulei_commit:
+err_rulei_act_continue:
+	mlxsw_sp_acl_rulei_destroy(rulei);
+err_rulei_create:
+	parman_item_remove(region->parman, parman_prio, parman_item);
+err_parman_item_add:
+	parman_prio_fini(parman_prio);
+	return err;
+}
+
+static void
+mlxsw_sp_acl_tcam_region_catchall_del(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct parman_prio *parman_prio = &region->catchall.parman_prio;
+	struct parman_item *parman_item = &region->catchall.parman_item;
+	struct mlxsw_sp_acl_rule_info *rulei = region->catchall.rulei;
+
+	mlxsw_sp_acl_tcam_region_entry_remove(mlxsw_sp, region,
+					      parman_item->index);
+	mlxsw_sp_acl_rulei_destroy(rulei);
+	parman_item_remove(region->parman, parman_prio, parman_item);
+	parman_prio_fini(parman_prio);
+}
+
+static void
+mlxsw_sp_acl_tcam_region_move(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_acl_tcam_region *region,
+			      u16
src_offset, u16 dst_offset, u16 size) +{ + char prcr_pl[MLXSW_REG_PRCR_LEN]; + + mlxsw_reg_prcr_pack(prcr_pl, MLXSW_REG_PRCR_OP_MOVE, + region->tcam_region_info, src_offset, + region->tcam_region_info, dst_offset, size); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(prcr), prcr_pl); +} + +static int mlxsw_sp_acl_tcam_region_parman_resize(void *priv, + unsigned long new_count) +{ + struct mlxsw_sp_acl_tcam_region *region = priv; + struct mlxsw_sp *mlxsw_sp = region->mlxsw_sp; + u64 max_tcam_rules; + + max_tcam_rules = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_TCAM_RULES); + if (new_count > max_tcam_rules) + return -EINVAL; + return mlxsw_sp_acl_tcam_region_resize(mlxsw_sp, region, new_count); +} + +static void mlxsw_sp_acl_tcam_region_parman_move(void *priv, + unsigned long from_index, + unsigned long to_index, + unsigned long count) +{ + struct mlxsw_sp_acl_tcam_region *region = priv; + struct mlxsw_sp *mlxsw_sp = region->mlxsw_sp; + + mlxsw_sp_acl_tcam_region_move(mlxsw_sp, region, + from_index, to_index, count); +} + +static const struct parman_ops mlxsw_sp_acl_tcam_region_parman_ops = { + .base_count = MLXSW_SP_ACL_TCAM_REGION_BASE_COUNT, + .resize_step = MLXSW_SP_ACL_TCAM_REGION_RESIZE_STEP, + .resize = mlxsw_sp_acl_tcam_region_parman_resize, + .move = mlxsw_sp_acl_tcam_region_parman_move, + .algo = PARMAN_ALGO_TYPE_LSORT, +}; + +static struct mlxsw_sp_acl_tcam_region * +mlxsw_sp_acl_tcam_region_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam *tcam, + struct mlxsw_afk_element_usage *elusage) +{ + struct mlxsw_afk *afk = mlxsw_sp_acl_afk(mlxsw_sp->acl); + struct mlxsw_sp_acl_tcam_region *region; + int err; + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(®ion->chunk_list); + region->mlxsw_sp = mlxsw_sp; + + region->parman = parman_create(&mlxsw_sp_acl_tcam_region_parman_ops, + region); + if (!region->parman) { + err = -ENOMEM; + goto err_parman_create; + } + + region->key_info = mlxsw_afk_key_info_get(afk, elusage); + if (IS_ERR(region->key_info)) { + err = PTR_ERR(region->key_info); + goto err_key_info_get; + } + + err = mlxsw_sp_acl_tcam_region_id_get(tcam, ®ion->id); + if (err) + goto err_region_id_get; + + err = mlxsw_sp_acl_tcam_region_alloc(mlxsw_sp, region); + if (err) + goto err_tcam_region_alloc; + + err = mlxsw_sp_acl_tcam_region_enable(mlxsw_sp, region); + if (err) + goto err_tcam_region_enable; + + err = mlxsw_sp_acl_tcam_region_catchall_add(mlxsw_sp, region); + if (err) + goto err_tcam_region_catchall_add; + + return region; + +err_tcam_region_catchall_add: + mlxsw_sp_acl_tcam_region_disable(mlxsw_sp, region); +err_tcam_region_enable: + mlxsw_sp_acl_tcam_region_free(mlxsw_sp, region); +err_tcam_region_alloc: + mlxsw_sp_acl_tcam_region_id_put(tcam, region->id); +err_region_id_get: + mlxsw_afk_key_info_put(region->key_info); +err_key_info_get: + parman_destroy(region->parman); +err_parman_create: + kfree(region); + return ERR_PTR(err); +} + +static void +mlxsw_sp_acl_tcam_region_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_region *region) +{ + mlxsw_sp_acl_tcam_region_catchall_del(mlxsw_sp, region); + mlxsw_sp_acl_tcam_region_disable(mlxsw_sp, region); + mlxsw_sp_acl_tcam_region_free(mlxsw_sp, region); + mlxsw_sp_acl_tcam_region_id_put(region->group->tcam, region->id); + mlxsw_afk_key_info_put(region->key_info); + parman_destroy(region->parman); + kfree(region); +} + +static int +mlxsw_sp_acl_tcam_chunk_assoc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_group *group, + unsigned 
int priority, + struct mlxsw_afk_element_usage *elusage, + struct mlxsw_sp_acl_tcam_chunk *chunk) +{ + struct mlxsw_sp_acl_tcam_region *region; + bool region_created = false; + bool need_split; + int err; + + region = mlxsw_sp_acl_tcam_group_region_find(group, priority, elusage, + &need_split); + if (region && need_split) { + /* According to priority, the chunk should belong to an + * existing region. However, this chunk needs elements + * that region does not contain. We need to split the existing + * region into two and create a new region for this chunk + * in between. This is not supported now. + */ + return -EOPNOTSUPP; + } + if (!region) { + struct mlxsw_afk_element_usage region_elusage; + + mlxsw_sp_acl_tcam_group_use_patterns(group, elusage, + ®ion_elusage); + region = mlxsw_sp_acl_tcam_region_create(mlxsw_sp, group->tcam, + ®ion_elusage); + if (IS_ERR(region)) + return PTR_ERR(region); + region_created = true; + } + + chunk->region = region; + list_add_tail(&chunk->list, ®ion->chunk_list); + + if (!region_created) + return 0; + + err = mlxsw_sp_acl_tcam_group_region_attach(mlxsw_sp, group, region); + if (err) + goto err_group_region_attach; + + return 0; + +err_group_region_attach: + mlxsw_sp_acl_tcam_region_destroy(mlxsw_sp, region); + return err; +} + +static void +mlxsw_sp_acl_tcam_chunk_deassoc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_chunk *chunk) +{ + struct mlxsw_sp_acl_tcam_region *region = chunk->region; + + list_del(&chunk->list); + if (list_empty(®ion->chunk_list)) { + mlxsw_sp_acl_tcam_group_region_detach(mlxsw_sp, region); + mlxsw_sp_acl_tcam_region_destroy(mlxsw_sp, region); + } +} + +static struct mlxsw_sp_acl_tcam_chunk * +mlxsw_sp_acl_tcam_chunk_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_group *group, + unsigned int priority, + struct mlxsw_afk_element_usage *elusage) +{ + struct mlxsw_sp_acl_tcam_chunk *chunk; + int err; + + if (priority == MLXSW_SP_ACL_TCAM_CATCHALL_PRIO) + return ERR_PTR(-EINVAL); + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + return ERR_PTR(-ENOMEM); + chunk->priority = priority; + chunk->group = group; + chunk->ref_count = 1; + + err = mlxsw_sp_acl_tcam_chunk_assoc(mlxsw_sp, group, priority, + elusage, chunk); + if (err) + goto err_chunk_assoc; + + parman_prio_init(chunk->region->parman, &chunk->parman_prio, priority); + + err = rhashtable_insert_fast(&group->chunk_ht, &chunk->ht_node, + mlxsw_sp_acl_tcam_chunk_ht_params); + if (err) + goto err_rhashtable_insert; + + return chunk; + +err_rhashtable_insert: + parman_prio_fini(&chunk->parman_prio); + mlxsw_sp_acl_tcam_chunk_deassoc(mlxsw_sp, chunk); +err_chunk_assoc: + kfree(chunk); + return ERR_PTR(err); +} + +static void +mlxsw_sp_acl_tcam_chunk_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_chunk *chunk) +{ + struct mlxsw_sp_acl_tcam_group *group = chunk->group; + + rhashtable_remove_fast(&group->chunk_ht, &chunk->ht_node, + mlxsw_sp_acl_tcam_chunk_ht_params); + parman_prio_fini(&chunk->parman_prio); + mlxsw_sp_acl_tcam_chunk_deassoc(mlxsw_sp, chunk); + kfree(chunk); +} + +static struct mlxsw_sp_acl_tcam_chunk * +mlxsw_sp_acl_tcam_chunk_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_group *group, + unsigned int priority, + struct mlxsw_afk_element_usage *elusage) +{ + struct mlxsw_sp_acl_tcam_chunk *chunk; + + chunk = rhashtable_lookup_fast(&group->chunk_ht, &priority, + mlxsw_sp_acl_tcam_chunk_ht_params); + if (chunk) { + if (WARN_ON(!mlxsw_afk_key_info_subset(chunk->region->key_info, + elusage))) + return 
ERR_PTR(-EINVAL); + chunk->ref_count++; + return chunk; + } + return mlxsw_sp_acl_tcam_chunk_create(mlxsw_sp, group, + priority, elusage); +} + +static void mlxsw_sp_acl_tcam_chunk_put(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_chunk *chunk) +{ + if (--chunk->ref_count) + return; + mlxsw_sp_acl_tcam_chunk_destroy(mlxsw_sp, chunk); +} + +static int mlxsw_sp_acl_tcam_entry_add(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_group *group, + struct mlxsw_sp_acl_tcam_entry *entry, + struct mlxsw_sp_acl_rule_info *rulei) +{ + struct mlxsw_sp_acl_tcam_chunk *chunk; + struct mlxsw_sp_acl_tcam_region *region; + int err; + + chunk = mlxsw_sp_acl_tcam_chunk_get(mlxsw_sp, group, rulei->priority, + &rulei->values.elusage); + if (IS_ERR(chunk)) + return PTR_ERR(chunk); + + region = chunk->region; + err = parman_item_add(region->parman, &chunk->parman_prio, + &entry->parman_item); + if (err) + goto err_parman_item_add; + + err = mlxsw_sp_acl_tcam_region_entry_insert(mlxsw_sp, region, + entry->parman_item.index, + rulei); + if (err) + goto err_rule_insert; + entry->chunk = chunk; + + return 0; + +err_rule_insert: + parman_item_remove(region->parman, &chunk->parman_prio, + &entry->parman_item); +err_parman_item_add: + mlxsw_sp_acl_tcam_chunk_put(mlxsw_sp, chunk); + return err; +} + +static void mlxsw_sp_acl_tcam_entry_del(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_entry *entry) +{ + struct mlxsw_sp_acl_tcam_chunk *chunk = entry->chunk; + struct mlxsw_sp_acl_tcam_region *region = chunk->region; + + mlxsw_sp_acl_tcam_region_entry_remove(mlxsw_sp, region, + entry->parman_item.index); + parman_item_remove(region->parman, &chunk->parman_prio, + &entry->parman_item); + mlxsw_sp_acl_tcam_chunk_put(mlxsw_sp, chunk); +} + +static int +mlxsw_sp_acl_tcam_entry_activity_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam_entry *entry, + bool *activity) +{ + struct mlxsw_sp_acl_tcam_chunk *chunk = entry->chunk; + struct mlxsw_sp_acl_tcam_region *region = chunk->region; + + return mlxsw_sp_acl_tcam_region_entry_activity_get(mlxsw_sp, region, + entry->parman_item.index, + activity); +} + +static const enum mlxsw_afk_element mlxsw_sp_acl_tcam_pattern_ipv4[] = { + MLXSW_AFK_ELEMENT_SRC_SYS_PORT, + MLXSW_AFK_ELEMENT_DMAC, + MLXSW_AFK_ELEMENT_SMAC, + MLXSW_AFK_ELEMENT_ETHERTYPE, + MLXSW_AFK_ELEMENT_IP_PROTO, + MLXSW_AFK_ELEMENT_SRC_IP4, + MLXSW_AFK_ELEMENT_DST_IP4, + MLXSW_AFK_ELEMENT_DST_L4_PORT, + MLXSW_AFK_ELEMENT_SRC_L4_PORT, + MLXSW_AFK_ELEMENT_VID, + MLXSW_AFK_ELEMENT_PCP, + MLXSW_AFK_ELEMENT_TCP_FLAGS, + MLXSW_AFK_ELEMENT_IP_TTL_, + MLXSW_AFK_ELEMENT_IP_ECN, + MLXSW_AFK_ELEMENT_IP_DSCP, +}; + +static const enum mlxsw_afk_element mlxsw_sp_acl_tcam_pattern_ipv6[] = { + MLXSW_AFK_ELEMENT_ETHERTYPE, + MLXSW_AFK_ELEMENT_IP_PROTO, + MLXSW_AFK_ELEMENT_SRC_IP6_HI, + MLXSW_AFK_ELEMENT_SRC_IP6_LO, + MLXSW_AFK_ELEMENT_DST_IP6_HI, + MLXSW_AFK_ELEMENT_DST_IP6_LO, + MLXSW_AFK_ELEMENT_DST_L4_PORT, + MLXSW_AFK_ELEMENT_SRC_L4_PORT, +}; + +static const struct mlxsw_sp_acl_tcam_pattern mlxsw_sp_acl_tcam_patterns[] = { + { + .elements = mlxsw_sp_acl_tcam_pattern_ipv4, + .elements_count = ARRAY_SIZE(mlxsw_sp_acl_tcam_pattern_ipv4), + }, + { + .elements = mlxsw_sp_acl_tcam_pattern_ipv6, + .elements_count = ARRAY_SIZE(mlxsw_sp_acl_tcam_pattern_ipv6), + }, +}; + +#define MLXSW_SP_ACL_TCAM_PATTERNS_COUNT \ + ARRAY_SIZE(mlxsw_sp_acl_tcam_patterns) + +struct mlxsw_sp_acl_tcam_flower_ruleset { + struct mlxsw_sp_acl_tcam_group group; +}; + +struct mlxsw_sp_acl_tcam_flower_rule { + struct mlxsw_sp_acl_tcam_entry 
entry; +}; + +static int +mlxsw_sp_acl_tcam_flower_ruleset_add(struct mlxsw_sp *mlxsw_sp, + void *priv, void *ruleset_priv) +{ + struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv; + struct mlxsw_sp_acl_tcam *tcam = priv; + + return mlxsw_sp_acl_tcam_group_add(mlxsw_sp, tcam, &ruleset->group, + mlxsw_sp_acl_tcam_patterns, + MLXSW_SP_ACL_TCAM_PATTERNS_COUNT); +} + +static void +mlxsw_sp_acl_tcam_flower_ruleset_del(struct mlxsw_sp *mlxsw_sp, + void *ruleset_priv) +{ + struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv; + + mlxsw_sp_acl_tcam_group_del(mlxsw_sp, &ruleset->group); +} + +static int +mlxsw_sp_acl_tcam_flower_ruleset_bind(struct mlxsw_sp *mlxsw_sp, + void *ruleset_priv, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv; + + return mlxsw_sp_acl_tcam_group_bind(mlxsw_sp, &ruleset->group, + mlxsw_sp_port, ingress); +} + +static void +mlxsw_sp_acl_tcam_flower_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, + void *ruleset_priv, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv; + + mlxsw_sp_acl_tcam_group_unbind(mlxsw_sp, &ruleset->group, + mlxsw_sp_port, ingress); +} + +static u16 +mlxsw_sp_acl_tcam_flower_ruleset_group_id(void *ruleset_priv) +{ + struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv; + + return mlxsw_sp_acl_tcam_group_id(&ruleset->group); +} + +static int +mlxsw_sp_acl_tcam_flower_rule_add(struct mlxsw_sp *mlxsw_sp, + void *ruleset_priv, void *rule_priv, + struct mlxsw_sp_acl_rule_info *rulei) +{ + struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv; + struct mlxsw_sp_acl_tcam_flower_rule *rule = rule_priv; + + return mlxsw_sp_acl_tcam_entry_add(mlxsw_sp, &ruleset->group, + &rule->entry, rulei); +} + +static void +mlxsw_sp_acl_tcam_flower_rule_del(struct mlxsw_sp *mlxsw_sp, void *rule_priv) +{ + struct mlxsw_sp_acl_tcam_flower_rule *rule = rule_priv; + + mlxsw_sp_acl_tcam_entry_del(mlxsw_sp, &rule->entry); +} + +static int +mlxsw_sp_acl_tcam_flower_rule_activity_get(struct mlxsw_sp *mlxsw_sp, + void *rule_priv, bool *activity) +{ + struct mlxsw_sp_acl_tcam_flower_rule *rule = rule_priv; + + return mlxsw_sp_acl_tcam_entry_activity_get(mlxsw_sp, &rule->entry, + activity); +} + +static const struct mlxsw_sp_acl_profile_ops mlxsw_sp_acl_tcam_flower_ops = { + .ruleset_priv_size = sizeof(struct mlxsw_sp_acl_tcam_flower_ruleset), + .ruleset_add = mlxsw_sp_acl_tcam_flower_ruleset_add, + .ruleset_del = mlxsw_sp_acl_tcam_flower_ruleset_del, + .ruleset_bind = mlxsw_sp_acl_tcam_flower_ruleset_bind, + .ruleset_unbind = mlxsw_sp_acl_tcam_flower_ruleset_unbind, + .ruleset_group_id = mlxsw_sp_acl_tcam_flower_ruleset_group_id, + .rule_priv_size = sizeof(struct mlxsw_sp_acl_tcam_flower_rule), + .rule_add = mlxsw_sp_acl_tcam_flower_rule_add, + .rule_del = mlxsw_sp_acl_tcam_flower_rule_del, + .rule_activity_get = mlxsw_sp_acl_tcam_flower_rule_activity_get, +}; + +static const struct mlxsw_sp_acl_profile_ops * +mlxsw_sp_acl_tcam_profile_ops_arr[] = { + [MLXSW_SP_ACL_PROFILE_FLOWER] = &mlxsw_sp_acl_tcam_flower_ops, +}; + +static const struct mlxsw_sp_acl_profile_ops * +mlxsw_sp_acl_tcam_profile_ops(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_acl_profile profile) +{ + const struct mlxsw_sp_acl_profile_ops *ops; + + if (WARN_ON(profile >= ARRAY_SIZE(mlxsw_sp_acl_tcam_profile_ops_arr))) + return NULL; + ops = mlxsw_sp_acl_tcam_profile_ops_arr[profile]; + if (WARN_ON(!ops)) + return NULL; + return ops; +} 
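
The profile ops returned by mlxsw_sp_acl_tcam_profile_ops() above are meant to be driven by a caller (presumably the driver's ACL layer) that allocates a buffer of ops->ruleset_priv_size bytes and then invokes the add/bind callbacks. The sketch below only illustrates that calling convention; the helper name example_bind_flower_ruleset and its error-handling flow are assumptions for illustration and are not part of this patch.

	/* Illustrative sketch (not from the patch): drive the flower profile
	 * ops to create a ruleset backed by the TCAM and bind it to a port.
	 */
	static int example_bind_flower_ruleset(struct mlxsw_sp *mlxsw_sp,
					       struct mlxsw_sp_acl_tcam *tcam,
					       struct mlxsw_sp_port *mlxsw_sp_port,
					       bool ingress)
	{
		const struct mlxsw_sp_acl_profile_ops *ops;
		void *ruleset_priv;
		int err;

		ops = mlxsw_sp_acl_tcam_profile_ops(mlxsw_sp,
						    MLXSW_SP_ACL_PROFILE_FLOWER);
		if (!ops)
			return -EINVAL;

		/* The caller owns the per-ruleset private area. */
		ruleset_priv = kzalloc(ops->ruleset_priv_size, GFP_KERNEL);
		if (!ruleset_priv)
			return -ENOMEM;

		/* priv is the TCAM private data, as passed to the init/fini ops. */
		err = ops->ruleset_add(mlxsw_sp, tcam, ruleset_priv);
		if (err)
			goto err_ruleset_add;

		err = ops->ruleset_bind(mlxsw_sp, ruleset_priv, mlxsw_sp_port,
					ingress);
		if (err)
			goto err_ruleset_bind;

		return 0;

	err_ruleset_bind:
		ops->ruleset_del(mlxsw_sp, ruleset_priv);
	err_ruleset_add:
		kfree(ruleset_priv);
		return err;
	}

The indirection through the ops table is what keeps this file profile-agnostic: only the flower profile is wired up here, but additional profiles would plug in by adding an entry to mlxsw_sp_acl_tcam_profile_ops_arr without touching the region/chunk/entry machinery.
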
+ +const struct mlxsw_sp_acl_ops mlxsw_sp_acl_tcam_ops = { + .priv_size = sizeof(struct mlxsw_sp_acl_tcam), + .init = mlxsw_sp_acl_tcam_init, + .fini = mlxsw_sp_acl_tcam_fini, + .profile_ops = mlxsw_sp_acl_tcam_profile_ops, +}; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c new file mode 100644 index 0000000..0a9adc5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c @@ -0,0 +1,1055 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include + +#include "spectrum.h" +#include "core.h" +#include "port.h" +#include "reg.h" + +struct mlxsw_sp_sb_pr { + enum mlxsw_reg_sbpr_mode mode; + u32 size; +}; + +struct mlxsw_cp_sb_occ { + u32 cur; + u32 max; +}; + +struct mlxsw_sp_sb_cm { + u32 min_buff; + u32 max_buff; + u8 pool; + struct mlxsw_cp_sb_occ occ; +}; + +struct mlxsw_sp_sb_pm { + u32 min_buff; + u32 max_buff; + struct mlxsw_cp_sb_occ occ; +}; + +#define MLXSW_SP_SB_POOL_COUNT 4 +#define MLXSW_SP_SB_TC_COUNT 8 + +struct mlxsw_sp_sb_port { + struct mlxsw_sp_sb_cm cms[2][MLXSW_SP_SB_TC_COUNT]; + struct mlxsw_sp_sb_pm pms[2][MLXSW_SP_SB_POOL_COUNT]; +}; + +struct mlxsw_sp_sb { + struct mlxsw_sp_sb_pr prs[2][MLXSW_SP_SB_POOL_COUNT]; + struct mlxsw_sp_sb_port *ports; + u32 cell_size; +}; + +u32 mlxsw_sp_cells_bytes(const struct mlxsw_sp *mlxsw_sp, u32 cells) +{ + return mlxsw_sp->sb->cell_size * cells; +} + +u32 mlxsw_sp_bytes_cells(const struct mlxsw_sp *mlxsw_sp, u32 bytes) +{ + return DIV_ROUND_UP(bytes, mlxsw_sp->sb->cell_size); +} + +static struct mlxsw_sp_sb_pr *mlxsw_sp_sb_pr_get(struct mlxsw_sp *mlxsw_sp, + u8 pool, + enum mlxsw_reg_sbxx_dir dir) +{ + return &mlxsw_sp->sb->prs[dir][pool]; +} + +static struct mlxsw_sp_sb_cm *mlxsw_sp_sb_cm_get(struct mlxsw_sp *mlxsw_sp, + u8 local_port, u8 pg_buff, + enum mlxsw_reg_sbxx_dir dir) +{ + return &mlxsw_sp->sb->ports[local_port].cms[dir][pg_buff]; +} + +static struct mlxsw_sp_sb_pm *mlxsw_sp_sb_pm_get(struct mlxsw_sp *mlxsw_sp, + u8 local_port, u8 pool, + enum mlxsw_reg_sbxx_dir dir) +{ + return &mlxsw_sp->sb->ports[local_port].pms[dir][pool]; +} + +static int mlxsw_sp_sb_pr_write(struct mlxsw_sp *mlxsw_sp, u8 pool, + enum mlxsw_reg_sbxx_dir dir, + enum mlxsw_reg_sbpr_mode mode, u32 size) +{ + char sbpr_pl[MLXSW_REG_SBPR_LEN]; + struct mlxsw_sp_sb_pr *pr; + int err; + + mlxsw_reg_sbpr_pack(sbpr_pl, pool, dir, mode, size); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbpr), sbpr_pl); + if (err) + return err; + + pr = mlxsw_sp_sb_pr_get(mlxsw_sp, pool, dir); + pr->mode = mode; + pr->size = size; + return 0; +} + +static int mlxsw_sp_sb_cm_write(struct mlxsw_sp *mlxsw_sp, u8 local_port, + u8 pg_buff, enum mlxsw_reg_sbxx_dir dir, + u32 min_buff, u32 max_buff, u8 pool) +{ + char sbcm_pl[MLXSW_REG_SBCM_LEN]; + int err; + + mlxsw_reg_sbcm_pack(sbcm_pl, local_port, pg_buff, dir, + min_buff, max_buff, pool); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbcm), sbcm_pl); + if (err) + return err; + if (pg_buff < MLXSW_SP_SB_TC_COUNT) { + struct mlxsw_sp_sb_cm *cm; + + cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port, pg_buff, dir); + cm->min_buff = min_buff; + cm->max_buff = max_buff; + cm->pool = pool; + } + return 0; +} + +static int mlxsw_sp_sb_pm_write(struct mlxsw_sp *mlxsw_sp, u8 local_port, + u8 pool, enum mlxsw_reg_sbxx_dir dir, + u32 min_buff, u32 max_buff) +{ + char sbpm_pl[MLXSW_REG_SBPM_LEN]; + struct mlxsw_sp_sb_pm *pm; + int err; + + mlxsw_reg_sbpm_pack(sbpm_pl, local_port, pool, dir, false, + min_buff, max_buff); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbpm), sbpm_pl); + if (err) + return err; + + pm = mlxsw_sp_sb_pm_get(mlxsw_sp, local_port, pool, dir); + pm->min_buff = min_buff; + pm->max_buff = max_buff; + return 0; +} + +static int mlxsw_sp_sb_pm_occ_clear(struct mlxsw_sp *mlxsw_sp, u8 local_port, + u8 pool, enum mlxsw_reg_sbxx_dir dir, + struct list_head *bulk_list) +{ + char sbpm_pl[MLXSW_REG_SBPM_LEN]; + + mlxsw_reg_sbpm_pack(sbpm_pl, local_port, pool, dir, true, 0, 0); + return 
mlxsw_reg_trans_query(mlxsw_sp->core, MLXSW_REG(sbpm), sbpm_pl, + bulk_list, NULL, 0); +} + +static void mlxsw_sp_sb_pm_occ_query_cb(struct mlxsw_core *mlxsw_core, + char *sbpm_pl, size_t sbpm_pl_len, + unsigned long cb_priv) +{ + struct mlxsw_sp_sb_pm *pm = (struct mlxsw_sp_sb_pm *) cb_priv; + + mlxsw_reg_sbpm_unpack(sbpm_pl, &pm->occ.cur, &pm->occ.max); +} + +static int mlxsw_sp_sb_pm_occ_query(struct mlxsw_sp *mlxsw_sp, u8 local_port, + u8 pool, enum mlxsw_reg_sbxx_dir dir, + struct list_head *bulk_list) +{ + char sbpm_pl[MLXSW_REG_SBPM_LEN]; + struct mlxsw_sp_sb_pm *pm; + + pm = mlxsw_sp_sb_pm_get(mlxsw_sp, local_port, pool, dir); + mlxsw_reg_sbpm_pack(sbpm_pl, local_port, pool, dir, false, 0, 0); + return mlxsw_reg_trans_query(mlxsw_sp->core, MLXSW_REG(sbpm), sbpm_pl, + bulk_list, + mlxsw_sp_sb_pm_occ_query_cb, + (unsigned long) pm); +} + +static const u16 mlxsw_sp_pbs[] = { + [0] = 2 * ETH_FRAME_LEN, + [9] = 2 * MLXSW_PORT_MAX_MTU, +}; + +#define MLXSW_SP_PBS_LEN ARRAY_SIZE(mlxsw_sp_pbs) +#define MLXSW_SP_PB_UNUSED 8 + +static int mlxsw_sp_port_pb_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char pbmc_pl[MLXSW_REG_PBMC_LEN]; + int i; + + mlxsw_reg_pbmc_pack(pbmc_pl, mlxsw_sp_port->local_port, + 0xffff, 0xffff / 2); + for (i = 0; i < MLXSW_SP_PBS_LEN; i++) { + u16 size = mlxsw_sp_bytes_cells(mlxsw_sp, mlxsw_sp_pbs[i]); + + if (i == MLXSW_SP_PB_UNUSED) + continue; + mlxsw_reg_pbmc_lossy_buffer_pack(pbmc_pl, i, size); + } + mlxsw_reg_pbmc_lossy_buffer_pack(pbmc_pl, + MLXSW_REG_PBMC_PORT_SHARED_BUF_IDX, 0); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl); +} + +static int mlxsw_sp_port_pb_prio_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + char pptb_pl[MLXSW_REG_PPTB_LEN]; + int i; + + mlxsw_reg_pptb_pack(pptb_pl, mlxsw_sp_port->local_port); + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + mlxsw_reg_pptb_prio_to_buff_pack(pptb_pl, i, 0); + return mlxsw_reg_write(mlxsw_sp_port->mlxsw_sp->core, MLXSW_REG(pptb), + pptb_pl); +} + +static int mlxsw_sp_port_headroom_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + int err; + + err = mlxsw_sp_port_pb_init(mlxsw_sp_port); + if (err) + return err; + return mlxsw_sp_port_pb_prio_init(mlxsw_sp_port); +} + +static int mlxsw_sp_sb_ports_init(struct mlxsw_sp *mlxsw_sp) +{ + unsigned int max_ports = mlxsw_core_max_ports(mlxsw_sp->core); + + mlxsw_sp->sb->ports = kcalloc(max_ports, + sizeof(struct mlxsw_sp_sb_port), + GFP_KERNEL); + if (!mlxsw_sp->sb->ports) + return -ENOMEM; + return 0; +} + +static void mlxsw_sp_sb_ports_fini(struct mlxsw_sp *mlxsw_sp) +{ + kfree(mlxsw_sp->sb->ports); +} + +#define MLXSW_SP_SB_PR_INGRESS_SIZE 12440000 +#define MLXSW_SP_SB_PR_INGRESS_MNG_SIZE (200 * 1000) +#define MLXSW_SP_SB_PR_EGRESS_SIZE 13232000 + +#define MLXSW_SP_SB_PR(_mode, _size) \ + { \ + .mode = _mode, \ + .size = _size, \ + } + +static const struct mlxsw_sp_sb_pr mlxsw_sp_sb_prs_ingress[] = { + MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, + MLXSW_SP_SB_PR_INGRESS_SIZE), + MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, 0), + MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, 0), + MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, + MLXSW_SP_SB_PR_INGRESS_MNG_SIZE), +}; + +#define MLXSW_SP_SB_PRS_INGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_prs_ingress) + +static const struct mlxsw_sp_sb_pr mlxsw_sp_sb_prs_egress[] = { + MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, MLXSW_SP_SB_PR_EGRESS_SIZE), + MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, 0), + MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, 0), + 
MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, 0), +}; + +#define MLXSW_SP_SB_PRS_EGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_prs_egress) + +static int __mlxsw_sp_sb_prs_init(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_reg_sbxx_dir dir, + const struct mlxsw_sp_sb_pr *prs, + size_t prs_len) +{ + int i; + int err; + + for (i = 0; i < prs_len; i++) { + u32 size = mlxsw_sp_bytes_cells(mlxsw_sp, prs[i].size); + + err = mlxsw_sp_sb_pr_write(mlxsw_sp, i, dir, prs[i].mode, size); + if (err) + return err; + } + return 0; +} + +static int mlxsw_sp_sb_prs_init(struct mlxsw_sp *mlxsw_sp) +{ + int err; + + err = __mlxsw_sp_sb_prs_init(mlxsw_sp, MLXSW_REG_SBXX_DIR_INGRESS, + mlxsw_sp_sb_prs_ingress, + MLXSW_SP_SB_PRS_INGRESS_LEN); + if (err) + return err; + return __mlxsw_sp_sb_prs_init(mlxsw_sp, MLXSW_REG_SBXX_DIR_EGRESS, + mlxsw_sp_sb_prs_egress, + MLXSW_SP_SB_PRS_EGRESS_LEN); +} + +#define MLXSW_SP_SB_CM(_min_buff, _max_buff, _pool) \ + { \ + .min_buff = _min_buff, \ + .max_buff = _max_buff, \ + .pool = _pool, \ + } + +static const struct mlxsw_sp_sb_cm mlxsw_sp_sb_cms_ingress[] = { + MLXSW_SP_SB_CM(10000, 8, 0), + MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0), + MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0), + MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0), + MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0), + MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0), + MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0), + MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0), + MLXSW_SP_SB_CM(0, 0, 0), /* dummy, this PG does not exist */ + MLXSW_SP_SB_CM(20000, 1, 3), +}; + +#define MLXSW_SP_SB_CMS_INGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_cms_ingress) + +static const struct mlxsw_sp_sb_cm mlxsw_sp_sb_cms_egress[] = { + MLXSW_SP_SB_CM(1500, 9, 0), + MLXSW_SP_SB_CM(1500, 9, 0), + MLXSW_SP_SB_CM(1500, 9, 0), + MLXSW_SP_SB_CM(1500, 9, 0), + MLXSW_SP_SB_CM(1500, 9, 0), + MLXSW_SP_SB_CM(1500, 9, 0), + MLXSW_SP_SB_CM(1500, 9, 0), + MLXSW_SP_SB_CM(1500, 9, 0), + MLXSW_SP_SB_CM(0, 0, 0), + MLXSW_SP_SB_CM(0, 0, 0), + MLXSW_SP_SB_CM(0, 0, 0), + MLXSW_SP_SB_CM(0, 0, 0), + MLXSW_SP_SB_CM(0, 0, 0), + MLXSW_SP_SB_CM(0, 0, 0), + MLXSW_SP_SB_CM(0, 0, 0), + MLXSW_SP_SB_CM(0, 0, 0), + MLXSW_SP_SB_CM(1, 0xff, 0), +}; + +#define MLXSW_SP_SB_CMS_EGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_cms_egress) + +#define MLXSW_SP_CPU_PORT_SB_CM MLXSW_SP_SB_CM(0, 0, 0) + +static const struct mlxsw_sp_sb_cm mlxsw_sp_cpu_port_sb_cms[] = { + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0), + MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0), + MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0), + MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0), + MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0), + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0), + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, + MLXSW_SP_CPU_PORT_SB_CM, +}; + +#define MLXSW_SP_CPU_PORT_SB_MCS_LEN \ + ARRAY_SIZE(mlxsw_sp_cpu_port_sb_cms) + +static int __mlxsw_sp_sb_cms_init(struct mlxsw_sp *mlxsw_sp, u8 
local_port, + enum mlxsw_reg_sbxx_dir dir, + const struct mlxsw_sp_sb_cm *cms, + size_t cms_len) +{ + int i; + int err; + + for (i = 0; i < cms_len; i++) { + const struct mlxsw_sp_sb_cm *cm; + u32 min_buff; + + if (i == 8 && dir == MLXSW_REG_SBXX_DIR_INGRESS) + continue; /* PG number 8 does not exist, skip it */ + cm = &cms[i]; + /* All pools are initialized using dynamic thresholds, + * therefore 'max_buff' isn't specified in cells. + */ + min_buff = mlxsw_sp_bytes_cells(mlxsw_sp, cm->min_buff); + err = mlxsw_sp_sb_cm_write(mlxsw_sp, local_port, i, dir, + min_buff, cm->max_buff, cm->pool); + if (err) + return err; + } + return 0; +} + +static int mlxsw_sp_port_sb_cms_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + int err; + + err = __mlxsw_sp_sb_cms_init(mlxsw_sp_port->mlxsw_sp, + mlxsw_sp_port->local_port, + MLXSW_REG_SBXX_DIR_INGRESS, + mlxsw_sp_sb_cms_ingress, + MLXSW_SP_SB_CMS_INGRESS_LEN); + if (err) + return err; + return __mlxsw_sp_sb_cms_init(mlxsw_sp_port->mlxsw_sp, + mlxsw_sp_port->local_port, + MLXSW_REG_SBXX_DIR_EGRESS, + mlxsw_sp_sb_cms_egress, + MLXSW_SP_SB_CMS_EGRESS_LEN); +} + +static int mlxsw_sp_cpu_port_sb_cms_init(struct mlxsw_sp *mlxsw_sp) +{ + return __mlxsw_sp_sb_cms_init(mlxsw_sp, 0, MLXSW_REG_SBXX_DIR_EGRESS, + mlxsw_sp_cpu_port_sb_cms, + MLXSW_SP_CPU_PORT_SB_MCS_LEN); +} + +#define MLXSW_SP_SB_PM(_min_buff, _max_buff) \ + { \ + .min_buff = _min_buff, \ + .max_buff = _max_buff, \ + } + +static const struct mlxsw_sp_sb_pm mlxsw_sp_sb_pms_ingress[] = { + MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MAX), + MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN), + MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN), + MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MAX), +}; + +#define MLXSW_SP_SB_PMS_INGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_pms_ingress) + +static const struct mlxsw_sp_sb_pm mlxsw_sp_sb_pms_egress[] = { + MLXSW_SP_SB_PM(0, 7), + MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN), + MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN), + MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN), +}; + +#define MLXSW_SP_SB_PMS_EGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_pms_egress) + +static int __mlxsw_sp_port_sb_pms_init(struct mlxsw_sp *mlxsw_sp, u8 local_port, + enum mlxsw_reg_sbxx_dir dir, + const struct mlxsw_sp_sb_pm *pms, + size_t pms_len) +{ + int i; + int err; + + for (i = 0; i < pms_len; i++) { + const struct mlxsw_sp_sb_pm *pm; + + pm = &pms[i]; + err = mlxsw_sp_sb_pm_write(mlxsw_sp, local_port, i, dir, + pm->min_buff, pm->max_buff); + if (err) + return err; + } + return 0; +} + +static int mlxsw_sp_port_sb_pms_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + int err; + + err = __mlxsw_sp_port_sb_pms_init(mlxsw_sp_port->mlxsw_sp, + mlxsw_sp_port->local_port, + MLXSW_REG_SBXX_DIR_INGRESS, + mlxsw_sp_sb_pms_ingress, + MLXSW_SP_SB_PMS_INGRESS_LEN); + if (err) + return err; + return __mlxsw_sp_port_sb_pms_init(mlxsw_sp_port->mlxsw_sp, + mlxsw_sp_port->local_port, + MLXSW_REG_SBXX_DIR_EGRESS, + mlxsw_sp_sb_pms_egress, + MLXSW_SP_SB_PMS_EGRESS_LEN); +} + +struct mlxsw_sp_sb_mm { + u32 min_buff; + u32 max_buff; + u8 pool; +}; + +#define MLXSW_SP_SB_MM(_min_buff, _max_buff, _pool) \ + { \ + .min_buff = _min_buff, \ + .max_buff = _max_buff, \ + .pool = _pool, \ + } + +static const struct mlxsw_sp_sb_mm mlxsw_sp_sb_mms[] = { + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 
0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), + MLXSW_SP_SB_MM(20000, 0xff, 0), +}; + +#define MLXSW_SP_SB_MMS_LEN ARRAY_SIZE(mlxsw_sp_sb_mms) + +static int mlxsw_sp_sb_mms_init(struct mlxsw_sp *mlxsw_sp) +{ + char sbmm_pl[MLXSW_REG_SBMM_LEN]; + int i; + int err; + + for (i = 0; i < MLXSW_SP_SB_MMS_LEN; i++) { + const struct mlxsw_sp_sb_mm *mc; + u32 min_buff; + + mc = &mlxsw_sp_sb_mms[i]; + /* All pools are initialized using dynamic thresholds, + * therefore 'max_buff' isn't specified in cells. + */ + min_buff = mlxsw_sp_bytes_cells(mlxsw_sp, mc->min_buff); + mlxsw_reg_sbmm_pack(sbmm_pl, i, min_buff, mc->max_buff, + mc->pool); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbmm), sbmm_pl); + if (err) + return err; + } + return 0; +} + +int mlxsw_sp_buffers_init(struct mlxsw_sp *mlxsw_sp) +{ + u64 sb_size; + int err; + + if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, CELL_SIZE)) + return -EIO; + + if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_BUFFER_SIZE)) + return -EIO; + sb_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_BUFFER_SIZE); + + mlxsw_sp->sb = kzalloc(sizeof(*mlxsw_sp->sb), GFP_KERNEL); + if (!mlxsw_sp->sb) + return -ENOMEM; + mlxsw_sp->sb->cell_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, CELL_SIZE); + + err = mlxsw_sp_sb_ports_init(mlxsw_sp); + if (err) + goto err_sb_ports_init; + err = mlxsw_sp_sb_prs_init(mlxsw_sp); + if (err) + goto err_sb_prs_init; + err = mlxsw_sp_cpu_port_sb_cms_init(mlxsw_sp); + if (err) + goto err_sb_cpu_port_sb_cms_init; + err = mlxsw_sp_sb_mms_init(mlxsw_sp); + if (err) + goto err_sb_mms_init; + err = devlink_sb_register(priv_to_devlink(mlxsw_sp->core), 0, sb_size, + MLXSW_SP_SB_POOL_COUNT, + MLXSW_SP_SB_POOL_COUNT, + MLXSW_SP_SB_TC_COUNT, + MLXSW_SP_SB_TC_COUNT); + if (err) + goto err_devlink_sb_register; + + return 0; + +err_devlink_sb_register: +err_sb_mms_init: +err_sb_cpu_port_sb_cms_init: +err_sb_prs_init: + mlxsw_sp_sb_ports_fini(mlxsw_sp); +err_sb_ports_init: + kfree(mlxsw_sp->sb); + return err; +} + +void mlxsw_sp_buffers_fini(struct mlxsw_sp *mlxsw_sp) +{ + devlink_sb_unregister(priv_to_devlink(mlxsw_sp->core), 0); + mlxsw_sp_sb_ports_fini(mlxsw_sp); + kfree(mlxsw_sp->sb); +} + +int mlxsw_sp_port_buffers_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + int err; + + err = mlxsw_sp_port_headroom_init(mlxsw_sp_port); + if (err) + return err; + err = mlxsw_sp_port_sb_cms_init(mlxsw_sp_port); + if (err) + return err; + err = mlxsw_sp_port_sb_pms_init(mlxsw_sp_port); + + return err; +} + +static u8 pool_get(u16 pool_index) +{ + return pool_index % MLXSW_SP_SB_POOL_COUNT; +} + +static u16 pool_index_get(u8 pool, enum mlxsw_reg_sbxx_dir dir) +{ + u16 pool_index; + + pool_index = pool; + if (dir == MLXSW_REG_SBXX_DIR_EGRESS) + pool_index += MLXSW_SP_SB_POOL_COUNT; + return pool_index; +} + +static enum mlxsw_reg_sbxx_dir dir_get(u16 pool_index) +{ + return pool_index < MLXSW_SP_SB_POOL_COUNT ? 
+ MLXSW_REG_SBXX_DIR_INGRESS : MLXSW_REG_SBXX_DIR_EGRESS; +} + +int mlxsw_sp_sb_pool_get(struct mlxsw_core *mlxsw_core, + unsigned int sb_index, u16 pool_index, + struct devlink_sb_pool_info *pool_info) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + u8 pool = pool_get(pool_index); + enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index); + struct mlxsw_sp_sb_pr *pr = mlxsw_sp_sb_pr_get(mlxsw_sp, pool, dir); + + pool_info->pool_type = (enum devlink_sb_pool_type) dir; + pool_info->size = mlxsw_sp_cells_bytes(mlxsw_sp, pr->size); + pool_info->threshold_type = (enum devlink_sb_threshold_type) pr->mode; + return 0; +} + +int mlxsw_sp_sb_pool_set(struct mlxsw_core *mlxsw_core, + unsigned int sb_index, u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + u32 pool_size = mlxsw_sp_bytes_cells(mlxsw_sp, size); + u8 pool = pool_get(pool_index); + enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index); + enum mlxsw_reg_sbpr_mode mode; + + if (size > MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_BUFFER_SIZE)) + return -EINVAL; + + mode = (enum mlxsw_reg_sbpr_mode) threshold_type; + return mlxsw_sp_sb_pr_write(mlxsw_sp, pool, dir, mode, pool_size); +} + +#define MLXSW_SP_SB_THRESHOLD_TO_ALPHA_OFFSET (-2) /* 3->1, 16->14 */ + +static u32 mlxsw_sp_sb_threshold_out(struct mlxsw_sp *mlxsw_sp, u8 pool, + enum mlxsw_reg_sbxx_dir dir, u32 max_buff) +{ + struct mlxsw_sp_sb_pr *pr = mlxsw_sp_sb_pr_get(mlxsw_sp, pool, dir); + + if (pr->mode == MLXSW_REG_SBPR_MODE_DYNAMIC) + return max_buff - MLXSW_SP_SB_THRESHOLD_TO_ALPHA_OFFSET; + return mlxsw_sp_cells_bytes(mlxsw_sp, max_buff); +} + +static int mlxsw_sp_sb_threshold_in(struct mlxsw_sp *mlxsw_sp, u8 pool, + enum mlxsw_reg_sbxx_dir dir, u32 threshold, + u32 *p_max_buff) +{ + struct mlxsw_sp_sb_pr *pr = mlxsw_sp_sb_pr_get(mlxsw_sp, pool, dir); + + if (pr->mode == MLXSW_REG_SBPR_MODE_DYNAMIC) { + int val; + + val = threshold + MLXSW_SP_SB_THRESHOLD_TO_ALPHA_OFFSET; + if (val < MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN || + val > MLXSW_REG_SBXX_DYN_MAX_BUFF_MAX) + return -EINVAL; + *p_max_buff = val; + } else { + *p_max_buff = mlxsw_sp_bytes_cells(mlxsw_sp, threshold); + } + return 0; +} + +int mlxsw_sp_sb_port_pool_get(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold) +{ + struct mlxsw_sp_port *mlxsw_sp_port = + mlxsw_core_port_driver_priv(mlxsw_core_port); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + u8 pool = pool_get(pool_index); + enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index); + struct mlxsw_sp_sb_pm *pm = mlxsw_sp_sb_pm_get(mlxsw_sp, local_port, + pool, dir); + + *p_threshold = mlxsw_sp_sb_threshold_out(mlxsw_sp, pool, dir, + pm->max_buff); + return 0; +} + +int mlxsw_sp_sb_port_pool_set(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 pool_index, + u32 threshold) +{ + struct mlxsw_sp_port *mlxsw_sp_port = + mlxsw_core_port_driver_priv(mlxsw_core_port); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + u8 pool = pool_get(pool_index); + enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index); + u32 max_buff; + int err; + + err = mlxsw_sp_sb_threshold_in(mlxsw_sp, pool, dir, + threshold, &max_buff); + if (err) + return err; + + return mlxsw_sp_sb_pm_write(mlxsw_sp, local_port, pool, dir, + 0, max_buff); +} + +int mlxsw_sp_sb_tc_pool_bind_get(struct mlxsw_core_port *mlxsw_core_port, + unsigned int 
sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold) +{ + struct mlxsw_sp_port *mlxsw_sp_port = + mlxsw_core_port_driver_priv(mlxsw_core_port); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + u8 pg_buff = tc_index; + enum mlxsw_reg_sbxx_dir dir = (enum mlxsw_reg_sbxx_dir) pool_type; + struct mlxsw_sp_sb_cm *cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port, + pg_buff, dir); + + *p_threshold = mlxsw_sp_sb_threshold_out(mlxsw_sp, cm->pool, dir, + cm->max_buff); + *p_pool_index = pool_index_get(cm->pool, dir); + return 0; +} + +int mlxsw_sp_sb_tc_pool_bind_set(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold) +{ + struct mlxsw_sp_port *mlxsw_sp_port = + mlxsw_core_port_driver_priv(mlxsw_core_port); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + u8 pg_buff = tc_index; + enum mlxsw_reg_sbxx_dir dir = (enum mlxsw_reg_sbxx_dir) pool_type; + u8 pool = pool_get(pool_index); + u32 max_buff; + int err; + + if (dir != dir_get(pool_index)) + return -EINVAL; + + err = mlxsw_sp_sb_threshold_in(mlxsw_sp, pool, dir, + threshold, &max_buff); + if (err) + return err; + + return mlxsw_sp_sb_cm_write(mlxsw_sp, local_port, pg_buff, dir, + 0, max_buff, pool); +} + +#define MASKED_COUNT_MAX \ + (MLXSW_REG_SBSR_REC_MAX_COUNT / (MLXSW_SP_SB_TC_COUNT * 2)) + +struct mlxsw_sp_sb_sr_occ_query_cb_ctx { + u8 masked_count; + u8 local_port_1; +}; + +static void mlxsw_sp_sb_sr_occ_query_cb(struct mlxsw_core *mlxsw_core, + char *sbsr_pl, size_t sbsr_pl_len, + unsigned long cb_priv) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + struct mlxsw_sp_sb_sr_occ_query_cb_ctx cb_ctx; + u8 masked_count; + u8 local_port; + int rec_index = 0; + struct mlxsw_sp_sb_cm *cm; + int i; + + memcpy(&cb_ctx, &cb_priv, sizeof(cb_ctx)); + + masked_count = 0; + for (local_port = cb_ctx.local_port_1; + local_port < mlxsw_core_max_ports(mlxsw_core); local_port++) { + if (!mlxsw_sp->ports[local_port]) + continue; + for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) { + cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port, i, + MLXSW_REG_SBXX_DIR_INGRESS); + mlxsw_reg_sbsr_rec_unpack(sbsr_pl, rec_index++, + &cm->occ.cur, &cm->occ.max); + } + if (++masked_count == cb_ctx.masked_count) + break; + } + masked_count = 0; + for (local_port = cb_ctx.local_port_1; + local_port < mlxsw_core_max_ports(mlxsw_core); local_port++) { + if (!mlxsw_sp->ports[local_port]) + continue; + for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) { + cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port, i, + MLXSW_REG_SBXX_DIR_EGRESS); + mlxsw_reg_sbsr_rec_unpack(sbsr_pl, rec_index++, + &cm->occ.cur, &cm->occ.max); + } + if (++masked_count == cb_ctx.masked_count) + break; + } +} + +int mlxsw_sp_sb_occ_snapshot(struct mlxsw_core *mlxsw_core, + unsigned int sb_index) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + struct mlxsw_sp_sb_sr_occ_query_cb_ctx cb_ctx; + unsigned long cb_priv; + LIST_HEAD(bulk_list); + char *sbsr_pl; + u8 masked_count; + u8 local_port_1; + u8 local_port = 0; + int i; + int err; + int err2; + + sbsr_pl = kmalloc(MLXSW_REG_SBSR_LEN, GFP_KERNEL); + if (!sbsr_pl) + return -ENOMEM; + +next_batch: + local_port++; + local_port_1 = local_port; + masked_count = 0; + mlxsw_reg_sbsr_pack(sbsr_pl, false); + for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) { + mlxsw_reg_sbsr_pg_buff_mask_set(sbsr_pl, i, 1); + 
mlxsw_reg_sbsr_tclass_mask_set(sbsr_pl, i, 1); + } + for (; local_port < mlxsw_core_max_ports(mlxsw_core); local_port++) { + if (!mlxsw_sp->ports[local_port]) + continue; + mlxsw_reg_sbsr_ingress_port_mask_set(sbsr_pl, local_port, 1); + mlxsw_reg_sbsr_egress_port_mask_set(sbsr_pl, local_port, 1); + for (i = 0; i < MLXSW_SP_SB_POOL_COUNT; i++) { + err = mlxsw_sp_sb_pm_occ_query(mlxsw_sp, local_port, i, + MLXSW_REG_SBXX_DIR_INGRESS, + &bulk_list); + if (err) + goto out; + err = mlxsw_sp_sb_pm_occ_query(mlxsw_sp, local_port, i, + MLXSW_REG_SBXX_DIR_EGRESS, + &bulk_list); + if (err) + goto out; + } + if (++masked_count == MASKED_COUNT_MAX) + goto do_query; + } + +do_query: + cb_ctx.masked_count = masked_count; + cb_ctx.local_port_1 = local_port_1; + memcpy(&cb_priv, &cb_ctx, sizeof(cb_ctx)); + err = mlxsw_reg_trans_query(mlxsw_core, MLXSW_REG(sbsr), sbsr_pl, + &bulk_list, mlxsw_sp_sb_sr_occ_query_cb, + cb_priv); + if (err) + goto out; + if (local_port < mlxsw_core_max_ports(mlxsw_core)) + goto next_batch; + +out: + err2 = mlxsw_reg_trans_bulk_wait(&bulk_list); + if (!err) + err = err2; + kfree(sbsr_pl); + return err; +} + +int mlxsw_sp_sb_occ_max_clear(struct mlxsw_core *mlxsw_core, + unsigned int sb_index) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + LIST_HEAD(bulk_list); + char *sbsr_pl; + unsigned int masked_count; + u8 local_port = 0; + int i; + int err; + int err2; + + sbsr_pl = kmalloc(MLXSW_REG_SBSR_LEN, GFP_KERNEL); + if (!sbsr_pl) + return -ENOMEM; + +next_batch: + local_port++; + masked_count = 0; + mlxsw_reg_sbsr_pack(sbsr_pl, true); + for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) { + mlxsw_reg_sbsr_pg_buff_mask_set(sbsr_pl, i, 1); + mlxsw_reg_sbsr_tclass_mask_set(sbsr_pl, i, 1); + } + for (; local_port < mlxsw_core_max_ports(mlxsw_core); local_port++) { + if (!mlxsw_sp->ports[local_port]) + continue; + mlxsw_reg_sbsr_ingress_port_mask_set(sbsr_pl, local_port, 1); + mlxsw_reg_sbsr_egress_port_mask_set(sbsr_pl, local_port, 1); + for (i = 0; i < MLXSW_SP_SB_POOL_COUNT; i++) { + err = mlxsw_sp_sb_pm_occ_clear(mlxsw_sp, local_port, i, + MLXSW_REG_SBXX_DIR_INGRESS, + &bulk_list); + if (err) + goto out; + err = mlxsw_sp_sb_pm_occ_clear(mlxsw_sp, local_port, i, + MLXSW_REG_SBXX_DIR_EGRESS, + &bulk_list); + if (err) + goto out; + } + if (++masked_count == MASKED_COUNT_MAX) + goto do_query; + } + +do_query: + err = mlxsw_reg_trans_query(mlxsw_core, MLXSW_REG(sbsr), sbsr_pl, + &bulk_list, NULL, 0); + if (err) + goto out; + if (local_port < mlxsw_core_max_ports(mlxsw_core)) + goto next_batch; + +out: + err2 = mlxsw_reg_trans_bulk_wait(&bulk_list); + if (!err) + err = err2; + kfree(sbsr_pl); + return err; +} + +int mlxsw_sp_sb_occ_port_pool_get(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 pool_index, + u32 *p_cur, u32 *p_max) +{ + struct mlxsw_sp_port *mlxsw_sp_port = + mlxsw_core_port_driver_priv(mlxsw_core_port); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + u8 pool = pool_get(pool_index); + enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index); + struct mlxsw_sp_sb_pm *pm = mlxsw_sp_sb_pm_get(mlxsw_sp, local_port, + pool, dir); + + *p_cur = mlxsw_sp_cells_bytes(mlxsw_sp, pm->occ.cur); + *p_max = mlxsw_sp_cells_bytes(mlxsw_sp, pm->occ.max); + return 0; +} + +int mlxsw_sp_sb_occ_tc_port_bind_get(struct mlxsw_core_port *mlxsw_core_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max) +{ + struct mlxsw_sp_port *mlxsw_sp_port = + 
mlxsw_core_port_driver_priv(mlxsw_core_port); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + u8 pg_buff = tc_index; + enum mlxsw_reg_sbxx_dir dir = (enum mlxsw_reg_sbxx_dir) pool_type; + struct mlxsw_sp_sb_cm *cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port, + pg_buff, dir); + + *p_cur = mlxsw_sp_cells_bytes(mlxsw_sp, cm->occ.cur); + *p_max = mlxsw_sp_cells_bytes(mlxsw_sp, cm->occ.max); + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c new file mode 100644 index 0000000..0f46775 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c @@ -0,0 +1,207 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Arkadi Sharshevsky + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+
+#include "spectrum_cnt.h"
+
+#define MLXSW_SP_COUNTER_POOL_BANK_SIZE 4096
+
+struct mlxsw_sp_counter_sub_pool {
+	unsigned int base_index;
+	unsigned int size;
+	unsigned int entry_size;
+	unsigned int bank_count;
+};
+
+struct mlxsw_sp_counter_pool {
+	unsigned int pool_size;
+	unsigned long *usage; /* Usage bitmap */
+	struct mlxsw_sp_counter_sub_pool *sub_pools;
+};
+
+static struct mlxsw_sp_counter_sub_pool mlxsw_sp_counter_sub_pools[] = {
+	[MLXSW_SP_COUNTER_SUB_POOL_FLOW] = {
+		.bank_count = 6,
+	},
+	[MLXSW_SP_COUNTER_SUB_POOL_RIF] = {
+		.bank_count = 2,
+	}
+};
+
+static int mlxsw_sp_counter_pool_validate(struct mlxsw_sp *mlxsw_sp)
+{
+	unsigned int total_bank_config = 0;
+	unsigned int pool_size;
+	int i;
+
+	pool_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, COUNTER_POOL_SIZE);
+	/* Check config is valid, no bank over subscription */
+	for (i = 0; i < ARRAY_SIZE(mlxsw_sp_counter_sub_pools); i++)
+		total_bank_config += mlxsw_sp_counter_sub_pools[i].bank_count;
+	if (total_bank_config > pool_size / MLXSW_SP_COUNTER_POOL_BANK_SIZE + 1)
+		return -EINVAL;
+	return 0;
+}
+
+static int mlxsw_sp_counter_sub_pools_prepare(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_counter_sub_pool *sub_pool;
+
+	/* Prepare generic flow pool */
+	sub_pool = &mlxsw_sp_counter_sub_pools[MLXSW_SP_COUNTER_SUB_POOL_FLOW];
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_SIZE_PACKETS_BYTES))
+		return -EIO;
+	sub_pool->entry_size = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						  COUNTER_SIZE_PACKETS_BYTES);
+	/* Prepare erif pool */
+	sub_pool = &mlxsw_sp_counter_sub_pools[MLXSW_SP_COUNTER_SUB_POOL_RIF];
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_SIZE_ROUTER_BASIC))
+		return -EIO;
+	sub_pool->entry_size = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						  COUNTER_SIZE_ROUTER_BASIC);
+	return 0;
+}
+
+int mlxsw_sp_counter_pool_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_counter_sub_pool *sub_pool;
+	struct mlxsw_sp_counter_pool *pool;
+	unsigned int base_index;
+	unsigned int map_size;
+	int i;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_POOL_SIZE))
+		return -EIO;
+
+	err = mlxsw_sp_counter_pool_validate(mlxsw_sp);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_counter_sub_pools_prepare(mlxsw_sp);
+	if (err)
+		return err;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return -ENOMEM;
+
+	pool->pool_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, COUNTER_POOL_SIZE);
+	map_size = BITS_TO_LONGS(pool->pool_size) * sizeof(unsigned long);
+
+	pool->usage = kzalloc(map_size, GFP_KERNEL);
+	if (!pool->usage) {
+		err = -ENOMEM;
+		goto err_usage_alloc;
+	}
+
+	pool->sub_pools = mlxsw_sp_counter_sub_pools;
+	/* Allocation is based on bank count which should be
+	 * specified for each sub pool statically.
+ */ + base_index = 0; + for (i = 0; i < ARRAY_SIZE(mlxsw_sp_counter_sub_pools); i++) { + sub_pool = &pool->sub_pools[i]; + sub_pool->size = sub_pool->bank_count * + MLXSW_SP_COUNTER_POOL_BANK_SIZE; + sub_pool->base_index = base_index; + base_index += sub_pool->size; + /* The last bank can't be fully used */ + if (sub_pool->base_index + sub_pool->size > pool->pool_size) + sub_pool->size = pool->pool_size - sub_pool->base_index; + } + + mlxsw_sp->counter_pool = pool; + return 0; + +err_usage_alloc: + kfree(pool); + return err; +} + +void mlxsw_sp_counter_pool_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool; + + WARN_ON(find_first_bit(pool->usage, pool->pool_size) != + pool->pool_size); + kfree(pool->usage); + kfree(pool); +} + +int mlxsw_sp_counter_alloc(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_counter_sub_pool_id sub_pool_id, + unsigned int *p_counter_index) +{ + struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool; + struct mlxsw_sp_counter_sub_pool *sub_pool; + unsigned int entry_index; + unsigned int stop_index; + int i; + + sub_pool = &mlxsw_sp_counter_sub_pools[sub_pool_id]; + stop_index = sub_pool->base_index + sub_pool->size; + entry_index = sub_pool->base_index; + + entry_index = find_next_zero_bit(pool->usage, stop_index, entry_index); + if (entry_index == stop_index) + return -ENOBUFS; + /* The sub-pools can contain non-integer number of entries + * so we must check for overflow + */ + if (entry_index + sub_pool->entry_size > stop_index) + return -ENOBUFS; + for (i = 0; i < sub_pool->entry_size; i++) + __set_bit(entry_index + i, pool->usage); + + *p_counter_index = entry_index; + return 0; +} + +void mlxsw_sp_counter_free(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_counter_sub_pool_id sub_pool_id, + unsigned int counter_index) +{ + struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool; + struct mlxsw_sp_counter_sub_pool *sub_pool; + int i; + + if (WARN_ON(counter_index >= pool->pool_size)) + return; + sub_pool = &mlxsw_sp_counter_sub_pools[sub_pool_id]; + for (i = 0; i < sub_pool->entry_size; i++) + __clear_bit(counter_index + i, pool->usage); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h new file mode 100644 index 0000000..fd34d0a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h @@ -0,0 +1,54 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Arkadi Sharshevsky + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_SPECTRUM_CNT_H +#define _MLXSW_SPECTRUM_CNT_H + +#include "spectrum.h" + +enum mlxsw_sp_counter_sub_pool_id { + MLXSW_SP_COUNTER_SUB_POOL_FLOW, + MLXSW_SP_COUNTER_SUB_POOL_RIF, +}; + +int mlxsw_sp_counter_alloc(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_counter_sub_pool_id sub_pool_id, + unsigned int *p_counter_index); +void mlxsw_sp_counter_free(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_counter_sub_pool_id sub_pool_id, + unsigned int counter_index); +int mlxsw_sp_counter_pool_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_counter_pool_fini(struct mlxsw_sp *mlxsw_sp); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c new file mode 100644 index 0000000..b6ed7f7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c @@ -0,0 +1,486 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Ido Schimmel + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/bitops.h>
+#include <net/dcbnl.h>
+
+#include "spectrum.h"
+#include "reg.h"
+
+static u8 mlxsw_sp_dcbnl_getdcbx(struct net_device __always_unused *dev)
+{
+	return DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE;
+}
+
+static u8 mlxsw_sp_dcbnl_setdcbx(struct net_device __always_unused *dev,
+				 u8 mode)
+{
+	return (mode != (DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE)) ? 1 : 0;
+}
+
+static int mlxsw_sp_dcbnl_ieee_getets(struct net_device *dev,
+				      struct ieee_ets *ets)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	memcpy(ets, mlxsw_sp_port->dcb.ets, sizeof(*ets));
+
+	return 0;
+}
+
+static int mlxsw_sp_port_ets_validate(struct mlxsw_sp_port *mlxsw_sp_port,
+				      struct ieee_ets *ets)
+{
+	struct net_device *dev = mlxsw_sp_port->dev;
+	bool has_ets_tc = false;
+	int i, tx_bw_sum = 0;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		switch (ets->tc_tsa[i]) {
+		case IEEE_8021QAZ_TSA_STRICT:
+			break;
+		case IEEE_8021QAZ_TSA_ETS:
+			has_ets_tc = true;
+			tx_bw_sum += ets->tc_tx_bw[i];
+			break;
+		default:
+			netdev_err(dev, "Only strict priority and ETS are supported\n");
+			return -EINVAL;
+		}
+
+		if (ets->prio_tc[i] >= IEEE_8021QAZ_MAX_TCS) {
+			netdev_err(dev, "Invalid TC\n");
+			return -EINVAL;
+		}
+	}
+
+	if (has_ets_tc && tx_bw_sum != 100) {
+		netdev_err(dev, "Total ETS bandwidth should equal 100\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_port_pg_prio_map(struct mlxsw_sp_port *mlxsw_sp_port,
+				     u8 *prio_tc)
+{
+	char pptb_pl[MLXSW_REG_PPTB_LEN];
+	int i;
+
+	mlxsw_reg_pptb_pack(pptb_pl, mlxsw_sp_port->local_port);
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+		mlxsw_reg_pptb_prio_to_buff_pack(pptb_pl, i, prio_tc[i]);
+
+	return mlxsw_reg_write(mlxsw_sp_port->mlxsw_sp->core, MLXSW_REG(pptb),
+			       pptb_pl);
+}
+
+static bool mlxsw_sp_ets_has_pg(u8 *prio_tc, u8 pg)
+{
+	int i;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+		if (prio_tc[i] == pg)
+			return true;
+	return false;
+}
+
+static int mlxsw_sp_port_pg_destroy(struct mlxsw_sp_port *mlxsw_sp_port,
+				    u8 *old_prio_tc, u8 *new_prio_tc)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char pbmc_pl[MLXSW_REG_PBMC_LEN];
+	int err, i;
+
+	mlxsw_reg_pbmc_pack(pbmc_pl, mlxsw_sp_port->local_port, 0, 0);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl);
+	if (err)
+		return err;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		u8 pg = old_prio_tc[i];
+
+		if (!mlxsw_sp_ets_has_pg(new_prio_tc, pg))
+			mlxsw_reg_pbmc_lossy_buffer_pack(pbmc_pl, pg, 0);
+	}
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl);
+}
+
+static int mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				      struct ieee_ets *ets)
+{
+	bool pause_en = mlxsw_sp_port_is_pause_en(mlxsw_sp_port);
+	struct ieee_ets *my_ets = mlxsw_sp_port->dcb.ets;
+	struct net_device *dev = mlxsw_sp_port->dev;
+	int err;
+
+	/* Create the required PGs, but don't destroy existing ones, as
+	 * traffic is still directed to them.
+ */ + err = __mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu, + ets->prio_tc, pause_en, + mlxsw_sp_port->dcb.pfc); + if (err) { + netdev_err(dev, "Failed to configure port's headroom\n"); + return err; + } + + err = mlxsw_sp_port_pg_prio_map(mlxsw_sp_port, ets->prio_tc); + if (err) { + netdev_err(dev, "Failed to set PG-priority mapping\n"); + goto err_port_prio_pg_map; + } + + err = mlxsw_sp_port_pg_destroy(mlxsw_sp_port, my_ets->prio_tc, + ets->prio_tc); + if (err) + netdev_warn(dev, "Failed to remove ununsed PGs\n"); + + return 0; + +err_port_prio_pg_map: + mlxsw_sp_port_pg_destroy(mlxsw_sp_port, ets->prio_tc, my_ets->prio_tc); + return err; +} + +static int __mlxsw_sp_dcbnl_ieee_setets(struct mlxsw_sp_port *mlxsw_sp_port, + struct ieee_ets *ets) +{ + struct ieee_ets *my_ets = mlxsw_sp_port->dcb.ets; + struct net_device *dev = mlxsw_sp_port->dev; + int i, err; + + /* Egress configuration. */ + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + bool dwrr = ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS; + u8 weight = ets->tc_tx_bw[i]; + + err = mlxsw_sp_port_ets_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HIERARCY_SUBGROUP, i, + 0, dwrr, weight); + if (err) { + netdev_err(dev, "Failed to link subgroup ETS element %d to group\n", + i); + goto err_port_ets_set; + } + } + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + err = mlxsw_sp_port_prio_tc_set(mlxsw_sp_port, i, + ets->prio_tc[i]); + if (err) { + netdev_err(dev, "Failed to map prio %d to TC %d\n", i, + ets->prio_tc[i]); + goto err_port_prio_tc_set; + } + } + + /* Ingress configuration. */ + err = mlxsw_sp_port_headroom_set(mlxsw_sp_port, ets); + if (err) + goto err_port_headroom_set; + + return 0; + +err_port_headroom_set: + i = IEEE_8021QAZ_MAX_TCS; +err_port_prio_tc_set: + for (i--; i >= 0; i--) + mlxsw_sp_port_prio_tc_set(mlxsw_sp_port, i, my_ets->prio_tc[i]); + i = IEEE_8021QAZ_MAX_TCS; +err_port_ets_set: + for (i--; i >= 0; i--) { + bool dwrr = my_ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS; + u8 weight = my_ets->tc_tx_bw[i]; + + err = mlxsw_sp_port_ets_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HIERARCY_SUBGROUP, i, + 0, dwrr, weight); + } + return err; +} + +static int mlxsw_sp_dcbnl_ieee_setets(struct net_device *dev, + struct ieee_ets *ets) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + int err; + + err = mlxsw_sp_port_ets_validate(mlxsw_sp_port, ets); + if (err) + return err; + + err = __mlxsw_sp_dcbnl_ieee_setets(mlxsw_sp_port, ets); + if (err) + return err; + + memcpy(mlxsw_sp_port->dcb.ets, ets, sizeof(*ets)); + mlxsw_sp_port->dcb.ets->ets_cap = IEEE_8021QAZ_MAX_TCS; + + return 0; +} + +static int mlxsw_sp_dcbnl_ieee_getmaxrate(struct net_device *dev, + struct ieee_maxrate *maxrate) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + + memcpy(maxrate, mlxsw_sp_port->dcb.maxrate, sizeof(*maxrate)); + + return 0; +} + +static int mlxsw_sp_dcbnl_ieee_setmaxrate(struct net_device *dev, + struct ieee_maxrate *maxrate) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct ieee_maxrate *my_maxrate = mlxsw_sp_port->dcb.maxrate; + int err, i; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, + MLXSW_REG_QEEC_HIERARCY_SUBGROUP, + i, 0, + maxrate->tc_maxrate[i]); + if (err) { + netdev_err(dev, "Failed to set maxrate for TC %d\n", i); + goto err_port_ets_maxrate_set; + } + } + + memcpy(mlxsw_sp_port->dcb.maxrate, maxrate, sizeof(*maxrate)); + + return 0; + +err_port_ets_maxrate_set: + for (i--; i >= 0; i--) + mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port, + 
MLXSW_REG_QEEC_HIERARCY_SUBGROUP, + i, 0, my_maxrate->tc_maxrate[i]); + return err; +} + +static int mlxsw_sp_port_pfc_cnt_get(struct mlxsw_sp_port *mlxsw_sp_port, + u8 prio) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct ieee_pfc *my_pfc = mlxsw_sp_port->dcb.pfc; + char ppcnt_pl[MLXSW_REG_PPCNT_LEN]; + int err; + + mlxsw_reg_ppcnt_pack(ppcnt_pl, mlxsw_sp_port->local_port, + MLXSW_REG_PPCNT_PRIO_CNT, prio); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ppcnt), ppcnt_pl); + if (err) + return err; + + my_pfc->requests[prio] = mlxsw_reg_ppcnt_tx_pause_get(ppcnt_pl); + my_pfc->indications[prio] = mlxsw_reg_ppcnt_rx_pause_get(ppcnt_pl); + + return 0; +} + +static int mlxsw_sp_dcbnl_ieee_getpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + int err, i; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + err = mlxsw_sp_port_pfc_cnt_get(mlxsw_sp_port, i); + if (err) { + netdev_err(dev, "Failed to get PFC count for priority %d\n", + i); + return err; + } + } + + memcpy(pfc, mlxsw_sp_port->dcb.pfc, sizeof(*pfc)); + + return 0; +} + +static int mlxsw_sp_port_pfc_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct ieee_pfc *pfc) +{ + char pfcc_pl[MLXSW_REG_PFCC_LEN]; + + mlxsw_reg_pfcc_pack(pfcc_pl, mlxsw_sp_port->local_port); + mlxsw_reg_pfcc_pprx_set(pfcc_pl, mlxsw_sp_port->link.rx_pause); + mlxsw_reg_pfcc_pptx_set(pfcc_pl, mlxsw_sp_port->link.tx_pause); + mlxsw_reg_pfcc_prio_pack(pfcc_pl, pfc->pfc_en); + + return mlxsw_reg_write(mlxsw_sp_port->mlxsw_sp->core, MLXSW_REG(pfcc), + pfcc_pl); +} + +static int mlxsw_sp_dcbnl_ieee_setpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + bool pause_en = mlxsw_sp_port_is_pause_en(mlxsw_sp_port); + int err; + + if (pause_en && pfc->pfc_en) { + netdev_err(dev, "PAUSE frames already enabled on port\n"); + return -EINVAL; + } + + err = __mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu, + mlxsw_sp_port->dcb.ets->prio_tc, + pause_en, pfc); + if (err) { + netdev_err(dev, "Failed to configure port's headroom for PFC\n"); + return err; + } + + err = mlxsw_sp_port_pfc_set(mlxsw_sp_port, pfc); + if (err) { + netdev_err(dev, "Failed to configure PFC\n"); + goto err_port_pfc_set; + } + + memcpy(mlxsw_sp_port->dcb.pfc, pfc, sizeof(*pfc)); + mlxsw_sp_port->dcb.pfc->pfc_cap = IEEE_8021QAZ_MAX_TCS; + + return 0; + +err_port_pfc_set: + __mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu, + mlxsw_sp_port->dcb.ets->prio_tc, pause_en, + mlxsw_sp_port->dcb.pfc); + return err; +} + +static const struct dcbnl_rtnl_ops mlxsw_sp_dcbnl_ops = { + .ieee_getets = mlxsw_sp_dcbnl_ieee_getets, + .ieee_setets = mlxsw_sp_dcbnl_ieee_setets, + .ieee_getmaxrate = mlxsw_sp_dcbnl_ieee_getmaxrate, + .ieee_setmaxrate = mlxsw_sp_dcbnl_ieee_setmaxrate, + .ieee_getpfc = mlxsw_sp_dcbnl_ieee_getpfc, + .ieee_setpfc = mlxsw_sp_dcbnl_ieee_setpfc, + + .getdcbx = mlxsw_sp_dcbnl_getdcbx, + .setdcbx = mlxsw_sp_dcbnl_setdcbx, +}; + +static int mlxsw_sp_port_ets_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + mlxsw_sp_port->dcb.ets = kzalloc(sizeof(*mlxsw_sp_port->dcb.ets), + GFP_KERNEL); + if (!mlxsw_sp_port->dcb.ets) + return -ENOMEM; + + mlxsw_sp_port->dcb.ets->ets_cap = IEEE_8021QAZ_MAX_TCS; + + return 0; +} + +static void mlxsw_sp_port_ets_fini(struct mlxsw_sp_port *mlxsw_sp_port) +{ + kfree(mlxsw_sp_port->dcb.ets); +} + +static int mlxsw_sp_port_maxrate_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + int i; + + mlxsw_sp_port->dcb.maxrate = 
kmalloc(sizeof(*mlxsw_sp_port->dcb.maxrate), + GFP_KERNEL); + if (!mlxsw_sp_port->dcb.maxrate) + return -ENOMEM; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + mlxsw_sp_port->dcb.maxrate->tc_maxrate[i] = MLXSW_REG_QEEC_MAS_DIS; + + return 0; +} + +static void mlxsw_sp_port_maxrate_fini(struct mlxsw_sp_port *mlxsw_sp_port) +{ + kfree(mlxsw_sp_port->dcb.maxrate); +} + +static int mlxsw_sp_port_pfc_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + mlxsw_sp_port->dcb.pfc = kzalloc(sizeof(*mlxsw_sp_port->dcb.pfc), + GFP_KERNEL); + if (!mlxsw_sp_port->dcb.pfc) + return -ENOMEM; + + mlxsw_sp_port->dcb.pfc->pfc_cap = IEEE_8021QAZ_MAX_TCS; + + return 0; +} + +static void mlxsw_sp_port_pfc_fini(struct mlxsw_sp_port *mlxsw_sp_port) +{ + kfree(mlxsw_sp_port->dcb.pfc); +} + +int mlxsw_sp_port_dcb_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + int err; + + err = mlxsw_sp_port_ets_init(mlxsw_sp_port); + if (err) + return err; + err = mlxsw_sp_port_maxrate_init(mlxsw_sp_port); + if (err) + goto err_port_maxrate_init; + err = mlxsw_sp_port_pfc_init(mlxsw_sp_port); + if (err) + goto err_port_pfc_init; + + mlxsw_sp_port->dev->dcbnl_ops = &mlxsw_sp_dcbnl_ops; + + return 0; + +err_port_pfc_init: + mlxsw_sp_port_maxrate_fini(mlxsw_sp_port); +err_port_maxrate_init: + mlxsw_sp_port_ets_fini(mlxsw_sp_port); + return err; +} + +void mlxsw_sp_port_dcb_fini(struct mlxsw_sp_port *mlxsw_sp_port) +{ + mlxsw_sp_port_pfc_fini(mlxsw_sp_port); + mlxsw_sp_port_maxrate_fini(mlxsw_sp_port); + mlxsw_sp_port_ets_fini(mlxsw_sp_port); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c new file mode 100644 index 0000000..f56fa18 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c @@ -0,0 +1,1337 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Arkadi Sharshevsky + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "spectrum.h" +#include "spectrum_dpipe.h" +#include "spectrum_router.h" + +enum mlxsw_sp_field_metadata_id { + MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT, + MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD, + MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP, + MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX, + MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE, + MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX, +}; + +static struct devlink_dpipe_field mlxsw_sp_dpipe_fields_metadata[] = { + { + .name = "erif_port", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT, + .bitwidth = 32, + .mapping_type = DEVLINK_DPIPE_FIELD_MAPPING_TYPE_IFINDEX, + }, + { + .name = "l3_forward", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD, + .bitwidth = 1, + }, + { + .name = "l3_drop", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP, + .bitwidth = 1, + }, + { + .name = "adj_index", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX, + .bitwidth = 32, + }, + { + .name = "adj_size", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE, + .bitwidth = 32, + }, + { + .name = "adj_hash_index", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX, + .bitwidth = 32, + }, +}; + +enum mlxsw_sp_dpipe_header_id { + MLXSW_SP_DPIPE_HEADER_METADATA, +}; + +static struct devlink_dpipe_header mlxsw_sp_dpipe_header_metadata = { + .name = "mlxsw_meta", + .id = MLXSW_SP_DPIPE_HEADER_METADATA, + .fields = mlxsw_sp_dpipe_fields_metadata, + .fields_count = ARRAY_SIZE(mlxsw_sp_dpipe_fields_metadata), +}; + +static struct devlink_dpipe_header *mlxsw_dpipe_headers[] = { + &mlxsw_sp_dpipe_header_metadata, + &devlink_dpipe_header_ethernet, + &devlink_dpipe_header_ipv4, + &devlink_dpipe_header_ipv6, +}; + +static struct devlink_dpipe_headers mlxsw_sp_dpipe_headers = { + .headers = mlxsw_dpipe_headers, + .headers_count = ARRAY_SIZE(mlxsw_dpipe_headers), +}; + +static int mlxsw_sp_dpipe_table_erif_actions_dump(void *priv, + struct sk_buff *skb) +{ + struct devlink_dpipe_action action = {0}; + int err; + + action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action.header = &mlxsw_sp_dpipe_header_metadata; + action.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD; + + err = devlink_dpipe_action_put(skb, &action); + if (err) + return err; + + action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action.header = &mlxsw_sp_dpipe_header_metadata; + action.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP; + + return devlink_dpipe_action_put(skb, &action); +} + +static int mlxsw_sp_dpipe_table_erif_matches_dump(void *priv, + struct sk_buff *skb) +{ + struct devlink_dpipe_match match = {0}; + + match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match.header = &mlxsw_sp_dpipe_header_metadata; + match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT; + + return devlink_dpipe_match_put(skb, &match); +} + +static void +mlxsw_sp_erif_match_action_prepare(struct devlink_dpipe_match *match, + struct devlink_dpipe_action *action) +{ + action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action->header = 
&mlxsw_sp_dpipe_header_metadata; + action->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD; + + match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match->header = &mlxsw_sp_dpipe_header_metadata; + match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT; +} + +static int mlxsw_sp_erif_entry_prepare(struct devlink_dpipe_entry *entry, + struct devlink_dpipe_value *match_value, + struct devlink_dpipe_match *match, + struct devlink_dpipe_value *action_value, + struct devlink_dpipe_action *action) +{ + entry->match_values = match_value; + entry->match_values_count = 1; + + entry->action_values = action_value; + entry->action_values_count = 1; + + match_value->match = match; + match_value->value_size = sizeof(u32); + match_value->value = kmalloc(match_value->value_size, GFP_KERNEL); + if (!match_value->value) + return -ENOMEM; + + action_value->action = action; + action_value->value_size = sizeof(u32); + action_value->value = kmalloc(action_value->value_size, GFP_KERNEL); + if (!action_value->value) + goto err_action_alloc; + return 0; + +err_action_alloc: + kfree(match_value->value); + return -ENOMEM; +} + +static int mlxsw_sp_erif_entry_get(struct mlxsw_sp *mlxsw_sp, + struct devlink_dpipe_entry *entry, + struct mlxsw_sp_rif *rif, + bool counters_enabled) +{ + u32 *action_value; + u32 *rif_value; + u64 cnt; + int err; + + /* Set Match RIF index */ + rif_value = entry->match_values->value; + *rif_value = mlxsw_sp_rif_index(rif); + entry->match_values->mapping_value = mlxsw_sp_rif_dev_ifindex(rif); + entry->match_values->mapping_valid = true; + + /* Set Action Forwarding */ + action_value = entry->action_values->value; + *action_value = 1; + + entry->counter_valid = false; + entry->counter = 0; + entry->index = mlxsw_sp_rif_index(rif); + + if (!counters_enabled) + return 0; + + err = mlxsw_sp_rif_counter_value_get(mlxsw_sp, rif, + MLXSW_SP_RIF_COUNTER_EGRESS, + &cnt); + if (!err) { + entry->counter = cnt; + entry->counter_valid = true; + } + return 0; +} + +static int +mlxsw_sp_dpipe_table_erif_entries_dump(void *priv, bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct devlink_dpipe_value match_value, action_value; + struct devlink_dpipe_action action = {0}; + struct devlink_dpipe_match match = {0}; + struct devlink_dpipe_entry entry = {0}; + struct mlxsw_sp *mlxsw_sp = priv; + unsigned int rif_count; + int i, j; + int err; + + memset(&match_value, 0, sizeof(match_value)); + memset(&action_value, 0, sizeof(action_value)); + + mlxsw_sp_erif_match_action_prepare(&match, &action); + err = mlxsw_sp_erif_entry_prepare(&entry, &match_value, &match, + &action_value, &action); + if (err) + return err; + + rif_count = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); + rtnl_lock(); + i = 0; +start_again: + err = devlink_dpipe_entry_ctx_prepare(dump_ctx); + if (err) + return err; + j = 0; + for (; i < rif_count; i++) { + struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i); + + if (!rif) + continue; + err = mlxsw_sp_erif_entry_get(mlxsw_sp, &entry, rif, + counters_enabled); + if (err) + goto err_entry_get; + err = devlink_dpipe_entry_ctx_append(dump_ctx, &entry); + if (err) { + if (err == -EMSGSIZE) { + if (!j) + goto err_entry_append; + break; + } + goto err_entry_append; + } + j++; + } + + devlink_dpipe_entry_ctx_close(dump_ctx); + if (i != rif_count) + goto start_again; + rtnl_unlock(); + + devlink_dpipe_entry_clear(&entry); + return 0; +err_entry_append: +err_entry_get: + rtnl_unlock(); + devlink_dpipe_entry_clear(&entry); + return err; +} + +static int 
mlxsw_sp_dpipe_table_erif_counters_update(void *priv, bool enable) +{ + struct mlxsw_sp *mlxsw_sp = priv; + int i; + + rtnl_lock(); + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) { + struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i); + + if (!rif) + continue; + if (enable) + mlxsw_sp_rif_counter_alloc(mlxsw_sp, rif, + MLXSW_SP_RIF_COUNTER_EGRESS); + else + mlxsw_sp_rif_counter_free(mlxsw_sp, rif, + MLXSW_SP_RIF_COUNTER_EGRESS); + } + rtnl_unlock(); + return 0; +} + +static u64 mlxsw_sp_dpipe_table_erif_size_get(void *priv) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + return MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); +} + +static struct devlink_dpipe_table_ops mlxsw_sp_erif_ops = { + .matches_dump = mlxsw_sp_dpipe_table_erif_matches_dump, + .actions_dump = mlxsw_sp_dpipe_table_erif_actions_dump, + .entries_dump = mlxsw_sp_dpipe_table_erif_entries_dump, + .counters_set_update = mlxsw_sp_dpipe_table_erif_counters_update, + .size_get = mlxsw_sp_dpipe_table_erif_size_get, +}; + +static int mlxsw_sp_dpipe_erif_table_init(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + + return devlink_dpipe_table_register(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_ERIF, + &mlxsw_sp_erif_ops, + mlxsw_sp, false); +} + +static void mlxsw_sp_dpipe_erif_table_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + + devlink_dpipe_table_unregister(devlink, MLXSW_SP_DPIPE_TABLE_NAME_ERIF); +} + +static int mlxsw_sp_dpipe_table_host_matches_dump(struct sk_buff *skb, int type) +{ + struct devlink_dpipe_match match = {0}; + int err; + + match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match.header = &mlxsw_sp_dpipe_header_metadata; + match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT; + + err = devlink_dpipe_match_put(skb, &match); + if (err) + return err; + + switch (type) { + case AF_INET: + match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match.header = &devlink_dpipe_header_ipv4; + match.field_id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP; + break; + case AF_INET6: + match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match.header = &devlink_dpipe_header_ipv6; + match.field_id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP; + break; + default: + WARN_ON(1); + return -EINVAL; + } + + return devlink_dpipe_match_put(skb, &match); +} + +static int +mlxsw_sp_dpipe_table_host4_matches_dump(void *priv, struct sk_buff *skb) +{ + return mlxsw_sp_dpipe_table_host_matches_dump(skb, AF_INET); +} + +static int +mlxsw_sp_dpipe_table_host_actions_dump(void *priv, struct sk_buff *skb) +{ + struct devlink_dpipe_action action = {0}; + + action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action.header = &devlink_dpipe_header_ethernet; + action.field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC; + + return devlink_dpipe_action_put(skb, &action); +} + +enum mlxsw_sp_dpipe_table_host_match { + MLXSW_SP_DPIPE_TABLE_HOST_MATCH_RIF, + MLXSW_SP_DPIPE_TABLE_HOST_MATCH_DIP, + MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT, +}; + +static void +mlxsw_sp_dpipe_table_host_match_action_prepare(struct devlink_dpipe_match *matches, + struct devlink_dpipe_action *action, + int type) +{ + struct devlink_dpipe_match *match; + + match = &matches[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_RIF]; + match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match->header = &mlxsw_sp_dpipe_header_metadata; + match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT; + + match = &matches[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_DIP]; + match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; 
+ switch (type) { + case AF_INET: + match->header = &devlink_dpipe_header_ipv4; + match->field_id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP; + break; + case AF_INET6: + match->header = &devlink_dpipe_header_ipv6; + match->field_id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP; + break; + default: + WARN_ON(1); + return; + } + + action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action->header = &devlink_dpipe_header_ethernet; + action->field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC; +} + +static int +mlxsw_sp_dpipe_table_host_entry_prepare(struct devlink_dpipe_entry *entry, + struct devlink_dpipe_value *match_values, + struct devlink_dpipe_match *matches, + struct devlink_dpipe_value *action_value, + struct devlink_dpipe_action *action, + int type) +{ + struct devlink_dpipe_value *match_value; + struct devlink_dpipe_match *match; + + entry->match_values = match_values; + entry->match_values_count = MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT; + + entry->action_values = action_value; + entry->action_values_count = 1; + + match = &matches[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_RIF]; + match_value = &match_values[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_RIF]; + + match_value->match = match; + match_value->value_size = sizeof(u32); + match_value->value = kmalloc(match_value->value_size, GFP_KERNEL); + if (!match_value->value) + return -ENOMEM; + + match = &matches[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_DIP]; + match_value = &match_values[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_DIP]; + + match_value->match = match; + switch (type) { + case AF_INET: + match_value->value_size = sizeof(u32); + break; + case AF_INET6: + match_value->value_size = sizeof(struct in6_addr); + break; + default: + WARN_ON(1); + return -EINVAL; + } + + match_value->value = kmalloc(match_value->value_size, GFP_KERNEL); + if (!match_value->value) + return -ENOMEM; + + action_value->action = action; + action_value->value_size = sizeof(u64); + action_value->value = kmalloc(action_value->value_size, GFP_KERNEL); + if (!action_value->value) + return -ENOMEM; + + return 0; +} + +static void +__mlxsw_sp_dpipe_table_host_entry_fill(struct devlink_dpipe_entry *entry, + struct mlxsw_sp_rif *rif, + unsigned char *ha, void *dip) +{ + struct devlink_dpipe_value *value; + u32 *rif_value; + u8 *ha_value; + + /* Set Match RIF index */ + value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_RIF]; + + rif_value = value->value; + *rif_value = mlxsw_sp_rif_index(rif); + value->mapping_value = mlxsw_sp_rif_dev_ifindex(rif); + value->mapping_valid = true; + + /* Set Match DIP */ + value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_DIP]; + memcpy(value->value, dip, value->value_size); + + /* Set Action DMAC */ + value = entry->action_values; + ha_value = value->value; + ether_addr_copy(ha_value, ha); +} + +static void +mlxsw_sp_dpipe_table_host4_entry_fill(struct devlink_dpipe_entry *entry, + struct mlxsw_sp_neigh_entry *neigh_entry, + struct mlxsw_sp_rif *rif) +{ + unsigned char *ha; + u32 dip; + + ha = mlxsw_sp_neigh_entry_ha(neigh_entry); + dip = mlxsw_sp_neigh4_entry_dip(neigh_entry); + __mlxsw_sp_dpipe_table_host_entry_fill(entry, rif, ha, &dip); +} + +static void +mlxsw_sp_dpipe_table_host6_entry_fill(struct devlink_dpipe_entry *entry, + struct mlxsw_sp_neigh_entry *neigh_entry, + struct mlxsw_sp_rif *rif) +{ + struct in6_addr *dip; + unsigned char *ha; + + ha = mlxsw_sp_neigh_entry_ha(neigh_entry); + dip = mlxsw_sp_neigh6_entry_dip(neigh_entry); + + __mlxsw_sp_dpipe_table_host_entry_fill(entry, rif, ha, dip); +} + +static void +mlxsw_sp_dpipe_table_host_entry_fill(struct 
mlxsw_sp *mlxsw_sp, + struct devlink_dpipe_entry *entry, + struct mlxsw_sp_neigh_entry *neigh_entry, + struct mlxsw_sp_rif *rif, + int type) +{ + int err; + + switch (type) { + case AF_INET: + mlxsw_sp_dpipe_table_host4_entry_fill(entry, neigh_entry, rif); + break; + case AF_INET6: + mlxsw_sp_dpipe_table_host6_entry_fill(entry, neigh_entry, rif); + break; + default: + WARN_ON(1); + return; + } + + err = mlxsw_sp_neigh_counter_get(mlxsw_sp, neigh_entry, + &entry->counter); + if (!err) + entry->counter_valid = true; +} + +static int +mlxsw_sp_dpipe_table_host_entries_get(struct mlxsw_sp *mlxsw_sp, + struct devlink_dpipe_entry *entry, + bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx, + int type) +{ + int rif_neigh_count = 0; + int rif_neigh_skip = 0; + int neigh_count = 0; + int rif_count; + int i, j; + int err; + + rtnl_lock(); + i = 0; + rif_count = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); +start_again: + err = devlink_dpipe_entry_ctx_prepare(dump_ctx); + if (err) + goto err_ctx_prepare; + j = 0; + rif_neigh_skip = rif_neigh_count; + for (; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) { + struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i); + struct mlxsw_sp_neigh_entry *neigh_entry; + + if (!rif) + continue; + + rif_neigh_count = 0; + mlxsw_sp_rif_neigh_for_each(neigh_entry, rif) { + int neigh_type = mlxsw_sp_neigh_entry_type(neigh_entry); + + if (neigh_type != type) + continue; + + if (neigh_type == AF_INET6 && + mlxsw_sp_neigh_ipv6_ignore(neigh_entry)) + continue; + + if (rif_neigh_count < rif_neigh_skip) + goto skip; + + mlxsw_sp_dpipe_table_host_entry_fill(mlxsw_sp, entry, + neigh_entry, rif, + type); + entry->index = neigh_count; + err = devlink_dpipe_entry_ctx_append(dump_ctx, entry); + if (err) { + if (err == -EMSGSIZE) { + if (!j) + goto err_entry_append; + else + goto out; + } + goto err_entry_append; + } + neigh_count++; + j++; +skip: + rif_neigh_count++; + } + rif_neigh_skip = 0; + } +out: + devlink_dpipe_entry_ctx_close(dump_ctx); + if (i != rif_count) + goto start_again; + + rtnl_unlock(); + return 0; + +err_ctx_prepare: +err_entry_append: + rtnl_unlock(); + return err; +} + +static int +mlxsw_sp_dpipe_table_host_entries_dump(struct mlxsw_sp *mlxsw_sp, + bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx, + int type) +{ + struct devlink_dpipe_value match_values[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT]; + struct devlink_dpipe_match matches[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT]; + struct devlink_dpipe_value action_value; + struct devlink_dpipe_action action = {0}; + struct devlink_dpipe_entry entry = {0}; + int err; + + memset(matches, 0, MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT * + sizeof(matches[0])); + memset(match_values, 0, MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT * + sizeof(match_values[0])); + memset(&action_value, 0, sizeof(action_value)); + + mlxsw_sp_dpipe_table_host_match_action_prepare(matches, &action, type); + err = mlxsw_sp_dpipe_table_host_entry_prepare(&entry, match_values, + matches, &action_value, + &action, type); + if (err) + goto out; + + err = mlxsw_sp_dpipe_table_host_entries_get(mlxsw_sp, &entry, + counters_enabled, dump_ctx, + type); +out: + devlink_dpipe_entry_clear(&entry); + return err; +} + +static int +mlxsw_sp_dpipe_table_host4_entries_dump(void *priv, bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + return mlxsw_sp_dpipe_table_host_entries_dump(mlxsw_sp, + counters_enabled, + dump_ctx, AF_INET); +} + +static void 
+mlxsw_sp_dpipe_table_host_counters_update(struct mlxsw_sp *mlxsw_sp, + bool enable, int type) +{ + int i; + + rtnl_lock(); + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) { + struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i); + struct mlxsw_sp_neigh_entry *neigh_entry; + + if (!rif) + continue; + mlxsw_sp_rif_neigh_for_each(neigh_entry, rif) { + int neigh_type = mlxsw_sp_neigh_entry_type(neigh_entry); + + if (neigh_type != type) + continue; + + if (neigh_type == AF_INET6 && + mlxsw_sp_neigh_ipv6_ignore(neigh_entry)) + continue; + + mlxsw_sp_neigh_entry_counter_update(mlxsw_sp, + neigh_entry, + enable); + } + } + rtnl_unlock(); +} + +static int mlxsw_sp_dpipe_table_host4_counters_update(void *priv, bool enable) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + mlxsw_sp_dpipe_table_host_counters_update(mlxsw_sp, enable, AF_INET); + return 0; +} + +static u64 +mlxsw_sp_dpipe_table_host_size_get(struct mlxsw_sp *mlxsw_sp, int type) +{ + u64 size = 0; + int i; + + rtnl_lock(); + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) { + struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i); + struct mlxsw_sp_neigh_entry *neigh_entry; + + if (!rif) + continue; + mlxsw_sp_rif_neigh_for_each(neigh_entry, rif) { + int neigh_type = mlxsw_sp_neigh_entry_type(neigh_entry); + + if (neigh_type != type) + continue; + + if (neigh_type == AF_INET6 && + mlxsw_sp_neigh_ipv6_ignore(neigh_entry)) + continue; + + size++; + } + } + rtnl_unlock(); + + return size; +} + +static u64 mlxsw_sp_dpipe_table_host4_size_get(void *priv) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + return mlxsw_sp_dpipe_table_host_size_get(mlxsw_sp, AF_INET); +} + +static struct devlink_dpipe_table_ops mlxsw_sp_host4_ops = { + .matches_dump = mlxsw_sp_dpipe_table_host4_matches_dump, + .actions_dump = mlxsw_sp_dpipe_table_host_actions_dump, + .entries_dump = mlxsw_sp_dpipe_table_host4_entries_dump, + .counters_set_update = mlxsw_sp_dpipe_table_host4_counters_update, + .size_get = mlxsw_sp_dpipe_table_host4_size_get, +}; + +#define MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_HOST4 1 + +static int mlxsw_sp_dpipe_host4_table_init(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + int err; + + err = devlink_dpipe_table_register(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_HOST4, + &mlxsw_sp_host4_ops, + mlxsw_sp, false); + if (err) + return err; + + err = devlink_dpipe_table_resource_set(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_HOST4, + MLXSW_SP_RESOURCE_KVD_HASH_SINGLE, + MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_HOST4); + if (err) + goto err_resource_set; + + return 0; + +err_resource_set: + devlink_dpipe_table_unregister(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_HOST4); + return err; +} + +static void mlxsw_sp_dpipe_host4_table_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + + devlink_dpipe_table_unregister(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_HOST4); +} + +static int +mlxsw_sp_dpipe_table_host6_matches_dump(void *priv, struct sk_buff *skb) +{ + return mlxsw_sp_dpipe_table_host_matches_dump(skb, AF_INET6); +} + +static int +mlxsw_sp_dpipe_table_host6_entries_dump(void *priv, bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + return mlxsw_sp_dpipe_table_host_entries_dump(mlxsw_sp, + counters_enabled, + dump_ctx, AF_INET6); +} + +static int mlxsw_sp_dpipe_table_host6_counters_update(void *priv, bool enable) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + 
mlxsw_sp_dpipe_table_host_counters_update(mlxsw_sp, enable, AF_INET6); + return 0; +} + +static u64 mlxsw_sp_dpipe_table_host6_size_get(void *priv) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + return mlxsw_sp_dpipe_table_host_size_get(mlxsw_sp, AF_INET6); +} + +static struct devlink_dpipe_table_ops mlxsw_sp_host6_ops = { + .matches_dump = mlxsw_sp_dpipe_table_host6_matches_dump, + .actions_dump = mlxsw_sp_dpipe_table_host_actions_dump, + .entries_dump = mlxsw_sp_dpipe_table_host6_entries_dump, + .counters_set_update = mlxsw_sp_dpipe_table_host6_counters_update, + .size_get = mlxsw_sp_dpipe_table_host6_size_get, +}; + +#define MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_HOST6 2 + +static int mlxsw_sp_dpipe_host6_table_init(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + int err; + + err = devlink_dpipe_table_register(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_HOST6, + &mlxsw_sp_host6_ops, + mlxsw_sp, false); + if (err) + return err; + + err = devlink_dpipe_table_resource_set(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_HOST6, + MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE, + MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_HOST6); + if (err) + goto err_resource_set; + + return 0; + +err_resource_set: + devlink_dpipe_table_unregister(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_HOST6); + return err; +} + +static void mlxsw_sp_dpipe_host6_table_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + + devlink_dpipe_table_unregister(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_HOST6); +} + +static int mlxsw_sp_dpipe_table_adj_matches_dump(void *priv, + struct sk_buff *skb) +{ + struct devlink_dpipe_match match = {0}; + int err; + + match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match.header = &mlxsw_sp_dpipe_header_metadata; + match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX; + + err = devlink_dpipe_match_put(skb, &match); + if (err) + return err; + + match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match.header = &mlxsw_sp_dpipe_header_metadata; + match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE; + + err = devlink_dpipe_match_put(skb, &match); + if (err) + return err; + + match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match.header = &mlxsw_sp_dpipe_header_metadata; + match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX; + + return devlink_dpipe_match_put(skb, &match); +} + +static int mlxsw_sp_dpipe_table_adj_actions_dump(void *priv, + struct sk_buff *skb) +{ + struct devlink_dpipe_action action = {0}; + int err; + + action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action.header = &devlink_dpipe_header_ethernet; + action.field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC; + + err = devlink_dpipe_action_put(skb, &action); + if (err) + return err; + + action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action.header = &mlxsw_sp_dpipe_header_metadata; + action.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT; + + return devlink_dpipe_action_put(skb, &action); +} + +static u64 mlxsw_sp_dpipe_table_adj_size(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_nexthop *nh; + u64 size = 0; + + mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) + if (mlxsw_sp_nexthop_offload(nh) && + !mlxsw_sp_nexthop_group_has_ipip(nh)) + size++; + return size; +} + +enum mlxsw_sp_dpipe_table_adj_match { + MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX, + MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE, + MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX, + MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT, +}; + +enum mlxsw_sp_dpipe_table_adj_action { + MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC, + 
MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT, + MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT, +}; + +static void +mlxsw_sp_dpipe_table_adj_match_action_prepare(struct devlink_dpipe_match *matches, + struct devlink_dpipe_action *actions) +{ + struct devlink_dpipe_action *action; + struct devlink_dpipe_match *match; + + match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX]; + match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match->header = &mlxsw_sp_dpipe_header_metadata; + match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX; + + match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE]; + match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match->header = &mlxsw_sp_dpipe_header_metadata; + match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE; + + match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX]; + match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match->header = &mlxsw_sp_dpipe_header_metadata; + match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX; + + action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC]; + action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action->header = &devlink_dpipe_header_ethernet; + action->field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC; + + action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT]; + action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action->header = &mlxsw_sp_dpipe_header_metadata; + action->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT; +} + +static int +mlxsw_sp_dpipe_table_adj_entry_prepare(struct devlink_dpipe_entry *entry, + struct devlink_dpipe_value *match_values, + struct devlink_dpipe_match *matches, + struct devlink_dpipe_value *action_values, + struct devlink_dpipe_action *actions) +{ struct devlink_dpipe_value *action_value; + struct devlink_dpipe_value *match_value; + struct devlink_dpipe_action *action; + struct devlink_dpipe_match *match; + + entry->match_values = match_values; + entry->match_values_count = MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT; + + entry->action_values = action_values; + entry->action_values_count = MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT; + + match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX]; + match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX]; + + match_value->match = match; + match_value->value_size = sizeof(u32); + match_value->value = kmalloc(match_value->value_size, GFP_KERNEL); + if (!match_value->value) + return -ENOMEM; + + match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE]; + match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE]; + + match_value->match = match; + match_value->value_size = sizeof(u32); + match_value->value = kmalloc(match_value->value_size, GFP_KERNEL); + if (!match_value->value) + return -ENOMEM; + + match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX]; + match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX]; + + match_value->match = match; + match_value->value_size = sizeof(u32); + match_value->value = kmalloc(match_value->value_size, GFP_KERNEL); + if (!match_value->value) + return -ENOMEM; + + action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC]; + action_value = &action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC]; + + action_value->action = action; + action_value->value_size = sizeof(u64); + action_value->value = kmalloc(action_value->value_size, GFP_KERNEL); + if (!action_value->value) + return -ENOMEM; + + action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT]; + action_value = &action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT]; + + 
action_value->action = action; + action_value->value_size = sizeof(u32); + action_value->value = kmalloc(action_value->value_size, GFP_KERNEL); + if (!action_value->value) + return -ENOMEM; + + return 0; +} + +static void +__mlxsw_sp_dpipe_table_adj_entry_fill(struct devlink_dpipe_entry *entry, + u32 adj_index, u32 adj_size, + u32 adj_hash_index, unsigned char *ha, + struct mlxsw_sp_rif *rif) +{ + struct devlink_dpipe_value *value; + u32 *p_rif_value; + u32 *p_index; + + value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX]; + p_index = value->value; + *p_index = adj_index; + + value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE]; + p_index = value->value; + *p_index = adj_size; + + value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX]; + p_index = value->value; + *p_index = adj_hash_index; + + value = &entry->action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC]; + ether_addr_copy(value->value, ha); + + value = &entry->action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT]; + p_rif_value = value->value; + *p_rif_value = mlxsw_sp_rif_index(rif); + value->mapping_value = mlxsw_sp_rif_dev_ifindex(rif); + value->mapping_valid = true; +} + +static void mlxsw_sp_dpipe_table_adj_entry_fill(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh, + struct devlink_dpipe_entry *entry) +{ + struct mlxsw_sp_rif *rif = mlxsw_sp_nexthop_rif(nh); + unsigned char *ha = mlxsw_sp_nexthop_ha(nh); + u32 adj_hash_index = 0; + u32 adj_index = 0; + u32 adj_size = 0; + int err; + + mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_size, &adj_hash_index); + __mlxsw_sp_dpipe_table_adj_entry_fill(entry, adj_index, adj_size, + adj_hash_index, ha, rif); + err = mlxsw_sp_nexthop_counter_get(mlxsw_sp, nh, &entry->counter); + if (!err) + entry->counter_valid = true; +} + +static int +mlxsw_sp_dpipe_table_adj_entries_get(struct mlxsw_sp *mlxsw_sp, + struct devlink_dpipe_entry *entry, + bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct mlxsw_sp_nexthop *nh; + int entry_index = 0; + int nh_count_max; + int nh_count = 0; + int nh_skip; + int j; + int err; + + rtnl_lock(); + nh_count_max = mlxsw_sp_dpipe_table_adj_size(mlxsw_sp); +start_again: + err = devlink_dpipe_entry_ctx_prepare(dump_ctx); + if (err) + goto err_ctx_prepare; + j = 0; + nh_skip = nh_count; + nh_count = 0; + mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) { + if (!mlxsw_sp_nexthop_offload(nh) || + mlxsw_sp_nexthop_group_has_ipip(nh)) + continue; + + if (nh_count < nh_skip) + goto skip; + + mlxsw_sp_dpipe_table_adj_entry_fill(mlxsw_sp, nh, entry); + entry->index = entry_index; + err = devlink_dpipe_entry_ctx_append(dump_ctx, entry); + if (err) { + if (err == -EMSGSIZE) { + if (!j) + goto err_entry_append; + break; + } + goto err_entry_append; + } + entry_index++; + j++; +skip: + nh_count++; + } + + devlink_dpipe_entry_ctx_close(dump_ctx); + if (nh_count != nh_count_max) + goto start_again; + rtnl_unlock(); + + return 0; + +err_ctx_prepare: +err_entry_append: + rtnl_unlock(); + return err; +} + +static int +mlxsw_sp_dpipe_table_adj_entries_dump(void *priv, bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct devlink_dpipe_value action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT]; + struct devlink_dpipe_value match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT]; + struct devlink_dpipe_action actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT]; + struct devlink_dpipe_match matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT]; + struct devlink_dpipe_entry entry = {0}; + struct 
mlxsw_sp *mlxsw_sp = priv; + int err; + + memset(matches, 0, MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT * + sizeof(matches[0])); + memset(match_values, 0, MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT * + sizeof(match_values[0])); + memset(actions, 0, MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT * + sizeof(actions[0])); + memset(action_values, 0, MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT * + sizeof(action_values[0])); + + mlxsw_sp_dpipe_table_adj_match_action_prepare(matches, actions); + err = mlxsw_sp_dpipe_table_adj_entry_prepare(&entry, + match_values, matches, + action_values, actions); + if (err) + goto out; + + err = mlxsw_sp_dpipe_table_adj_entries_get(mlxsw_sp, &entry, + counters_enabled, dump_ctx); +out: + devlink_dpipe_entry_clear(&entry); + return err; +} + +static int mlxsw_sp_dpipe_table_adj_counters_update(void *priv, bool enable) +{ + struct mlxsw_sp *mlxsw_sp = priv; + struct mlxsw_sp_nexthop *nh; + u32 adj_hash_index = 0; + u32 adj_index = 0; + u32 adj_size = 0; + + mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) { + if (!mlxsw_sp_nexthop_offload(nh) || + mlxsw_sp_nexthop_group_has_ipip(nh)) + continue; + + mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_size, + &adj_hash_index); + if (enable) + mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh); + else + mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh); + mlxsw_sp_nexthop_update(mlxsw_sp, + adj_index + adj_hash_index, nh); + } + return 0; +} + +static u64 +mlxsw_sp_dpipe_table_adj_size_get(void *priv) +{ + struct mlxsw_sp *mlxsw_sp = priv; + u64 size; + + rtnl_lock(); + size = mlxsw_sp_dpipe_table_adj_size(mlxsw_sp); + rtnl_unlock(); + + return size; +} + +static struct devlink_dpipe_table_ops mlxsw_sp_dpipe_table_adj_ops = { + .matches_dump = mlxsw_sp_dpipe_table_adj_matches_dump, + .actions_dump = mlxsw_sp_dpipe_table_adj_actions_dump, + .entries_dump = mlxsw_sp_dpipe_table_adj_entries_dump, + .counters_set_update = mlxsw_sp_dpipe_table_adj_counters_update, + .size_get = mlxsw_sp_dpipe_table_adj_size_get, +}; + +#define MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_ADJ 1 + +static int mlxsw_sp_dpipe_adj_table_init(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + int err; + + err = devlink_dpipe_table_register(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_ADJ, + &mlxsw_sp_dpipe_table_adj_ops, + mlxsw_sp, false); + if (err) + return err; + + err = devlink_dpipe_table_resource_set(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_ADJ, + MLXSW_SP_RESOURCE_KVD_LINEAR, + MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_ADJ); + if (err) + goto err_resource_set; + + return 0; + +err_resource_set: + devlink_dpipe_table_unregister(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_ADJ); + return err; +} + +static void mlxsw_sp_dpipe_adj_table_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + + devlink_dpipe_table_unregister(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_ADJ); +} + +int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + int err; + + err = devlink_dpipe_headers_register(devlink, + &mlxsw_sp_dpipe_headers); + if (err) + return err; + err = mlxsw_sp_dpipe_erif_table_init(mlxsw_sp); + if (err) + goto err_erif_table_init; + + err = mlxsw_sp_dpipe_host4_table_init(mlxsw_sp); + if (err) + goto err_host4_table_init; + + err = mlxsw_sp_dpipe_host6_table_init(mlxsw_sp); + if (err) + goto err_host6_table_init; + + err = mlxsw_sp_dpipe_adj_table_init(mlxsw_sp); + if (err) + goto err_adj_table_init; + + return 0; +err_adj_table_init: + mlxsw_sp_dpipe_host6_table_fini(mlxsw_sp); 
+err_host6_table_init: + mlxsw_sp_dpipe_host4_table_fini(mlxsw_sp); +err_host4_table_init: + mlxsw_sp_dpipe_erif_table_fini(mlxsw_sp); +err_erif_table_init: + devlink_dpipe_headers_unregister(priv_to_devlink(mlxsw_sp->core)); + return err; +} + +void mlxsw_sp_dpipe_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + + mlxsw_sp_dpipe_adj_table_fini(mlxsw_sp); + mlxsw_sp_dpipe_host6_table_fini(mlxsw_sp); + mlxsw_sp_dpipe_host4_table_fini(mlxsw_sp); + mlxsw_sp_dpipe_erif_table_fini(mlxsw_sp); + devlink_dpipe_headers_unregister(devlink); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h new file mode 100644 index 0000000..815d543 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h @@ -0,0 +1,61 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Arkadi Sharshevsky + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef _MLXSW_PIPELINE_H_
+#define _MLXSW_PIPELINE_H_
+
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+
+int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_dpipe_fini(struct mlxsw_sp *mlxsw_sp);
+
+#else
+
+static inline int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp)
+{
+	return 0;
+}
+
+static inline void mlxsw_sp_dpipe_fini(struct mlxsw_sp *mlxsw_sp)
+{
+}
+
+#endif
+
+#define MLXSW_SP_DPIPE_TABLE_NAME_ERIF "mlxsw_erif"
+#define MLXSW_SP_DPIPE_TABLE_NAME_HOST4 "mlxsw_host4"
+#define MLXSW_SP_DPIPE_TABLE_NAME_HOST6 "mlxsw_host6"
+#define MLXSW_SP_DPIPE_TABLE_NAME_ADJ "mlxsw_adj"
+
+#endif /* _MLXSW_PIPELINE_H_*/
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_fid.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_fid.c
new file mode 100644
index 0000000..54262af
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_fid.c
@@ -0,0 +1,992 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_fid.c
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Ido Schimmel
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#include +#include +#include +#include +#include +#include + +#include "spectrum.h" +#include "reg.h" + +struct mlxsw_sp_fid_family; + +struct mlxsw_sp_fid_core { + struct mlxsw_sp_fid_family *fid_family_arr[MLXSW_SP_FID_TYPE_MAX]; + unsigned int *port_fid_mappings; +}; + +struct mlxsw_sp_fid { + struct list_head list; + struct mlxsw_sp_rif *rif; + unsigned int ref_count; + u16 fid_index; + struct mlxsw_sp_fid_family *fid_family; +}; + +struct mlxsw_sp_fid_8021q { + struct mlxsw_sp_fid common; + u16 vid; +}; + +struct mlxsw_sp_fid_8021d { + struct mlxsw_sp_fid common; + int br_ifindex; +}; + +struct mlxsw_sp_flood_table { + enum mlxsw_sp_flood_type packet_type; + enum mlxsw_reg_sfgc_bridge_type bridge_type; + enum mlxsw_flood_table_type table_type; + int table_index; +}; + +struct mlxsw_sp_fid_ops { + void (*setup)(struct mlxsw_sp_fid *fid, const void *arg); + int (*configure)(struct mlxsw_sp_fid *fid); + void (*deconfigure)(struct mlxsw_sp_fid *fid); + int (*index_alloc)(struct mlxsw_sp_fid *fid, const void *arg, + u16 *p_fid_index); + bool (*compare)(const struct mlxsw_sp_fid *fid, + const void *arg); + u16 (*flood_index)(const struct mlxsw_sp_fid *fid); + int (*port_vid_map)(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *port, u16 vid); + void (*port_vid_unmap)(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *port, u16 vid); +}; + +struct mlxsw_sp_fid_family { + enum mlxsw_sp_fid_type type; + size_t fid_size; + u16 start_index; + u16 end_index; + struct list_head fids_list; + unsigned long *fids_bitmap; + const struct mlxsw_sp_flood_table *flood_tables; + int nr_flood_tables; + enum mlxsw_sp_rif_type rif_type; + const struct mlxsw_sp_fid_ops *ops; + struct mlxsw_sp *mlxsw_sp; +}; + +static const int mlxsw_sp_sfgc_uc_packet_types[MLXSW_REG_SFGC_TYPE_MAX] = { + [MLXSW_REG_SFGC_TYPE_UNKNOWN_UNICAST] = 1, +}; + +static const int mlxsw_sp_sfgc_bc_packet_types[MLXSW_REG_SFGC_TYPE_MAX] = { + [MLXSW_REG_SFGC_TYPE_BROADCAST] = 1, + [MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_NON_IP] = 1, + [MLXSW_REG_SFGC_TYPE_IPV4_LINK_LOCAL] = 1, + [MLXSW_REG_SFGC_TYPE_IPV6_ALL_HOST] = 1, + [MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV6] = 1, +}; + +static const int mlxsw_sp_sfgc_mc_packet_types[MLXSW_REG_SFGC_TYPE_MAX] = { + [MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV4] = 1, +}; + +static const int *mlxsw_sp_packet_type_sfgc_types[] = { + [MLXSW_SP_FLOOD_TYPE_UC] = mlxsw_sp_sfgc_uc_packet_types, + [MLXSW_SP_FLOOD_TYPE_BC] = mlxsw_sp_sfgc_bc_packet_types, + [MLXSW_SP_FLOOD_TYPE_MC] = mlxsw_sp_sfgc_mc_packet_types, +}; + +static const struct mlxsw_sp_flood_table * +mlxsw_sp_fid_flood_table_lookup(const struct mlxsw_sp_fid *fid, + enum mlxsw_sp_flood_type packet_type) +{ + struct mlxsw_sp_fid_family *fid_family = fid->fid_family; + int i; + + for (i = 0; i < fid_family->nr_flood_tables; i++) { + if (fid_family->flood_tables[i].packet_type != packet_type) + continue; + return &fid_family->flood_tables[i]; + } + + return NULL; +} + +int mlxsw_sp_fid_flood_set(struct mlxsw_sp_fid *fid, + enum mlxsw_sp_flood_type packet_type, u8 local_port, + bool member) +{ + struct mlxsw_sp_fid_family *fid_family = fid->fid_family; + const struct mlxsw_sp_fid_ops *ops = fid_family->ops; + const struct mlxsw_sp_flood_table *flood_table; + char *sftr_pl; + int err; + + if (WARN_ON(!fid_family->flood_tables || !ops->flood_index)) + return -EINVAL; + + flood_table = mlxsw_sp_fid_flood_table_lookup(fid, packet_type); + if (!flood_table) + return -ESRCH; + + sftr_pl = kmalloc(MLXSW_REG_SFTR_LEN, GFP_KERNEL); + if 
(!sftr_pl) + return -ENOMEM; + + mlxsw_reg_sftr_pack(sftr_pl, flood_table->table_index, + ops->flood_index(fid), flood_table->table_type, 1, + local_port, member); + err = mlxsw_reg_write(fid_family->mlxsw_sp->core, MLXSW_REG(sftr), + sftr_pl); + kfree(sftr_pl); + return err; +} + +int mlxsw_sp_fid_port_vid_map(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *mlxsw_sp_port, u16 vid) +{ + if (WARN_ON(!fid->fid_family->ops->port_vid_map)) + return -EINVAL; + return fid->fid_family->ops->port_vid_map(fid, mlxsw_sp_port, vid); +} + +void mlxsw_sp_fid_port_vid_unmap(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *mlxsw_sp_port, u16 vid) +{ + fid->fid_family->ops->port_vid_unmap(fid, mlxsw_sp_port, vid); +} + +enum mlxsw_sp_rif_type mlxsw_sp_fid_rif_type(const struct mlxsw_sp_fid *fid) +{ + return fid->fid_family->rif_type; +} + +u16 mlxsw_sp_fid_index(const struct mlxsw_sp_fid *fid) +{ + return fid->fid_index; +} + +enum mlxsw_sp_fid_type mlxsw_sp_fid_type(const struct mlxsw_sp_fid *fid) +{ + return fid->fid_family->type; +} + +void mlxsw_sp_fid_rif_set(struct mlxsw_sp_fid *fid, struct mlxsw_sp_rif *rif) +{ + fid->rif = rif; +} + +enum mlxsw_sp_rif_type +mlxsw_sp_fid_type_rif_type(const struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_fid_type type) +{ + struct mlxsw_sp_fid_core *fid_core = mlxsw_sp->fid_core; + + return fid_core->fid_family_arr[type]->rif_type; +} + +static struct mlxsw_sp_fid_8021q * +mlxsw_sp_fid_8021q_fid(const struct mlxsw_sp_fid *fid) +{ + return container_of(fid, struct mlxsw_sp_fid_8021q, common); +} + +u16 mlxsw_sp_fid_8021q_vid(const struct mlxsw_sp_fid *fid) +{ + return mlxsw_sp_fid_8021q_fid(fid)->vid; +} + +static void mlxsw_sp_fid_8021q_setup(struct mlxsw_sp_fid *fid, const void *arg) +{ + u16 vid = *(u16 *) arg; + + mlxsw_sp_fid_8021q_fid(fid)->vid = vid; +} + +static enum mlxsw_reg_sfmr_op mlxsw_sp_sfmr_op(bool valid) +{ + return valid ? 
MLXSW_REG_SFMR_OP_CREATE_FID : + MLXSW_REG_SFMR_OP_DESTROY_FID; +} + +static int mlxsw_sp_fid_op(struct mlxsw_sp *mlxsw_sp, u16 fid_index, + u16 fid_offset, bool valid) +{ + char sfmr_pl[MLXSW_REG_SFMR_LEN]; + + mlxsw_reg_sfmr_pack(sfmr_pl, mlxsw_sp_sfmr_op(valid), fid_index, + fid_offset); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfmr), sfmr_pl); +} + +static int mlxsw_sp_fid_vid_map(struct mlxsw_sp *mlxsw_sp, u16 fid_index, + u16 vid, bool valid) +{ + enum mlxsw_reg_svfa_mt mt = MLXSW_REG_SVFA_MT_VID_TO_FID; + char svfa_pl[MLXSW_REG_SVFA_LEN]; + + mlxsw_reg_svfa_pack(svfa_pl, 0, mt, valid, fid_index, vid); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(svfa), svfa_pl); +} + +static int __mlxsw_sp_fid_port_vid_map(struct mlxsw_sp *mlxsw_sp, u16 fid_index, + u8 local_port, u16 vid, bool valid) +{ + enum mlxsw_reg_svfa_mt mt = MLXSW_REG_SVFA_MT_PORT_VID_TO_FID; + char svfa_pl[MLXSW_REG_SVFA_LEN]; + + mlxsw_reg_svfa_pack(svfa_pl, local_port, mt, valid, fid_index, vid); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(svfa), svfa_pl); +} + +static int mlxsw_sp_fid_8021q_configure(struct mlxsw_sp_fid *fid) +{ + struct mlxsw_sp *mlxsw_sp = fid->fid_family->mlxsw_sp; + struct mlxsw_sp_fid_8021q *fid_8021q; + int err; + + err = mlxsw_sp_fid_op(mlxsw_sp, fid->fid_index, fid->fid_index, true); + if (err) + return err; + + fid_8021q = mlxsw_sp_fid_8021q_fid(fid); + err = mlxsw_sp_fid_vid_map(mlxsw_sp, fid->fid_index, fid_8021q->vid, + true); + if (err) + goto err_fid_map; + + return 0; + +err_fid_map: + mlxsw_sp_fid_op(mlxsw_sp, fid->fid_index, 0, false); + return err; +} + +static void mlxsw_sp_fid_8021q_deconfigure(struct mlxsw_sp_fid *fid) +{ + struct mlxsw_sp *mlxsw_sp = fid->fid_family->mlxsw_sp; + struct mlxsw_sp_fid_8021q *fid_8021q; + + fid_8021q = mlxsw_sp_fid_8021q_fid(fid); + mlxsw_sp_fid_vid_map(mlxsw_sp, fid->fid_index, fid_8021q->vid, false); + mlxsw_sp_fid_op(mlxsw_sp, fid->fid_index, 0, false); +} + +static int mlxsw_sp_fid_8021q_index_alloc(struct mlxsw_sp_fid *fid, + const void *arg, u16 *p_fid_index) +{ + struct mlxsw_sp_fid_family *fid_family = fid->fid_family; + u16 vid = *(u16 *) arg; + + /* Use 1:1 mapping for simplicity although not a must */ + if (vid < fid_family->start_index || vid > fid_family->end_index) + return -EINVAL; + *p_fid_index = vid; + + return 0; +} + +static bool +mlxsw_sp_fid_8021q_compare(const struct mlxsw_sp_fid *fid, const void *arg) +{ + u16 vid = *(u16 *) arg; + + return mlxsw_sp_fid_8021q_fid(fid)->vid == vid; +} + +static u16 mlxsw_sp_fid_8021q_flood_index(const struct mlxsw_sp_fid *fid) +{ + return fid->fid_index; +} + +static int mlxsw_sp_fid_8021q_port_vid_map(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *mlxsw_sp_port, + u16 vid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + + /* In case there are no {Port, VID} => FID mappings on the port, + * we can use the global VID => FID mapping we created when the + * FID was configured. 
+ */ + if (mlxsw_sp->fid_core->port_fid_mappings[local_port] == 0) + return 0; + return __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index, local_port, + vid, true); +} + +static void +mlxsw_sp_fid_8021q_port_vid_unmap(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *mlxsw_sp_port, u16 vid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + + if (mlxsw_sp->fid_core->port_fid_mappings[local_port] == 0) + return; + __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index, local_port, vid, + false); +} + +static const struct mlxsw_sp_fid_ops mlxsw_sp_fid_8021q_ops = { + .setup = mlxsw_sp_fid_8021q_setup, + .configure = mlxsw_sp_fid_8021q_configure, + .deconfigure = mlxsw_sp_fid_8021q_deconfigure, + .index_alloc = mlxsw_sp_fid_8021q_index_alloc, + .compare = mlxsw_sp_fid_8021q_compare, + .flood_index = mlxsw_sp_fid_8021q_flood_index, + .port_vid_map = mlxsw_sp_fid_8021q_port_vid_map, + .port_vid_unmap = mlxsw_sp_fid_8021q_port_vid_unmap, +}; + +static const struct mlxsw_sp_flood_table mlxsw_sp_fid_8021q_flood_tables[] = { + { + .packet_type = MLXSW_SP_FLOOD_TYPE_UC, + .bridge_type = MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID, + .table_type = MLXSW_REG_SFGC_TABLE_TYPE_FID_OFFSET, + .table_index = 0, + }, + { + .packet_type = MLXSW_SP_FLOOD_TYPE_MC, + .bridge_type = MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID, + .table_type = MLXSW_REG_SFGC_TABLE_TYPE_FID_OFFSET, + .table_index = 1, + }, + { + .packet_type = MLXSW_SP_FLOOD_TYPE_BC, + .bridge_type = MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID, + .table_type = MLXSW_REG_SFGC_TABLE_TYPE_FID_OFFSET, + .table_index = 2, + }, +}; + +/* Range and flood configuration must match mlxsw_config_profile */ +static const struct mlxsw_sp_fid_family mlxsw_sp_fid_8021q_family = { + .type = MLXSW_SP_FID_TYPE_8021Q, + .fid_size = sizeof(struct mlxsw_sp_fid_8021q), + .start_index = 1, + .end_index = VLAN_VID_MASK, + .flood_tables = mlxsw_sp_fid_8021q_flood_tables, + .nr_flood_tables = ARRAY_SIZE(mlxsw_sp_fid_8021q_flood_tables), + .rif_type = MLXSW_SP_RIF_TYPE_VLAN, + .ops = &mlxsw_sp_fid_8021q_ops, +}; + +static struct mlxsw_sp_fid_8021d * +mlxsw_sp_fid_8021d_fid(const struct mlxsw_sp_fid *fid) +{ + return container_of(fid, struct mlxsw_sp_fid_8021d, common); +} + +static void mlxsw_sp_fid_8021d_setup(struct mlxsw_sp_fid *fid, const void *arg) +{ + int br_ifindex = *(int *) arg; + + mlxsw_sp_fid_8021d_fid(fid)->br_ifindex = br_ifindex; +} + +static int mlxsw_sp_fid_8021d_configure(struct mlxsw_sp_fid *fid) +{ + struct mlxsw_sp_fid_family *fid_family = fid->fid_family; + + return mlxsw_sp_fid_op(fid_family->mlxsw_sp, fid->fid_index, 0, true); +} + +static void mlxsw_sp_fid_8021d_deconfigure(struct mlxsw_sp_fid *fid) +{ + mlxsw_sp_fid_op(fid->fid_family->mlxsw_sp, fid->fid_index, 0, false); +} + +static int mlxsw_sp_fid_8021d_index_alloc(struct mlxsw_sp_fid *fid, + const void *arg, u16 *p_fid_index) +{ + struct mlxsw_sp_fid_family *fid_family = fid->fid_family; + u16 nr_fids, fid_index; + + nr_fids = fid_family->end_index - fid_family->start_index + 1; + fid_index = find_first_zero_bit(fid_family->fids_bitmap, nr_fids); + if (fid_index == nr_fids) + return -ENOBUFS; + *p_fid_index = fid_family->start_index + fid_index; + + return 0; +} + +static bool +mlxsw_sp_fid_8021d_compare(const struct mlxsw_sp_fid *fid, const void *arg) +{ + int br_ifindex = *(int *) arg; + + return mlxsw_sp_fid_8021d_fid(fid)->br_ifindex == br_ifindex; +} + +static u16 mlxsw_sp_fid_8021d_flood_index(const struct mlxsw_sp_fid *fid) +{ + return fid->fid_index - 
fid->fid_family->start_index; +} + +static int mlxsw_sp_port_vp_mode_trans(struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + int err; + + list_for_each_entry(mlxsw_sp_port_vlan, &mlxsw_sp_port->vlans_list, + list) { + struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid; + u16 vid = mlxsw_sp_port_vlan->vid; + + if (!fid) + continue; + + err = __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index, + mlxsw_sp_port->local_port, + vid, true); + if (err) + goto err_fid_port_vid_map; + } + + err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true); + if (err) + goto err_port_vp_mode_set; + + return 0; + +err_port_vp_mode_set: +err_fid_port_vid_map: + list_for_each_entry_continue_reverse(mlxsw_sp_port_vlan, + &mlxsw_sp_port->vlans_list, list) { + struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid; + u16 vid = mlxsw_sp_port_vlan->vid; + + if (!fid) + continue; + + __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index, + mlxsw_sp_port->local_port, vid, + false); + } + return err; +} + +static void mlxsw_sp_port_vlan_mode_trans(struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, false); + + list_for_each_entry_reverse(mlxsw_sp_port_vlan, + &mlxsw_sp_port->vlans_list, list) { + struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid; + u16 vid = mlxsw_sp_port_vlan->vid; + + if (!fid) + continue; + + __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index, + mlxsw_sp_port->local_port, vid, + false); + } +} + +static int mlxsw_sp_fid_8021d_port_vid_map(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *mlxsw_sp_port, + u16 vid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + int err; + + err = __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index, + mlxsw_sp_port->local_port, vid, true); + if (err) + return err; + + if (mlxsw_sp->fid_core->port_fid_mappings[local_port]++ == 0) { + err = mlxsw_sp_port_vp_mode_trans(mlxsw_sp_port); + if (err) + goto err_port_vp_mode_trans; + } + + return 0; + +err_port_vp_mode_trans: + mlxsw_sp->fid_core->port_fid_mappings[local_port]--; + __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index, + mlxsw_sp_port->local_port, vid, false); + return err; +} + +static void +mlxsw_sp_fid_8021d_port_vid_unmap(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *mlxsw_sp_port, u16 vid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + + if (mlxsw_sp->fid_core->port_fid_mappings[local_port] == 1) + mlxsw_sp_port_vlan_mode_trans(mlxsw_sp_port); + mlxsw_sp->fid_core->port_fid_mappings[local_port]--; + __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index, + mlxsw_sp_port->local_port, vid, false); +} + +static const struct mlxsw_sp_fid_ops mlxsw_sp_fid_8021d_ops = { + .setup = mlxsw_sp_fid_8021d_setup, + .configure = mlxsw_sp_fid_8021d_configure, + .deconfigure = mlxsw_sp_fid_8021d_deconfigure, + .index_alloc = mlxsw_sp_fid_8021d_index_alloc, + .compare = mlxsw_sp_fid_8021d_compare, + .flood_index = mlxsw_sp_fid_8021d_flood_index, + .port_vid_map = mlxsw_sp_fid_8021d_port_vid_map, + .port_vid_unmap = mlxsw_sp_fid_8021d_port_vid_unmap, +}; + +static const struct mlxsw_sp_flood_table mlxsw_sp_fid_8021d_flood_tables[] = { + { + .packet_type = MLXSW_SP_FLOOD_TYPE_UC, + .bridge_type = MLXSW_REG_SFGC_BRIDGE_TYPE_VFID, + .table_type = MLXSW_REG_SFGC_TABLE_TYPE_FID, + 
.table_index = 0, + }, + { + .packet_type = MLXSW_SP_FLOOD_TYPE_MC, + .bridge_type = MLXSW_REG_SFGC_BRIDGE_TYPE_VFID, + .table_type = MLXSW_REG_SFGC_TABLE_TYPE_FID, + .table_index = 1, + }, + { + .packet_type = MLXSW_SP_FLOOD_TYPE_BC, + .bridge_type = MLXSW_REG_SFGC_BRIDGE_TYPE_VFID, + .table_type = MLXSW_REG_SFGC_TABLE_TYPE_FID, + .table_index = 2, + }, +}; + +/* Range and flood configuration must match mlxsw_config_profile */ +static const struct mlxsw_sp_fid_family mlxsw_sp_fid_8021d_family = { + .type = MLXSW_SP_FID_TYPE_8021D, + .fid_size = sizeof(struct mlxsw_sp_fid_8021d), + .start_index = VLAN_N_VID, + .end_index = VLAN_N_VID + MLXSW_SP_FID_8021D_MAX - 1, + .flood_tables = mlxsw_sp_fid_8021d_flood_tables, + .nr_flood_tables = ARRAY_SIZE(mlxsw_sp_fid_8021d_flood_tables), + .rif_type = MLXSW_SP_RIF_TYPE_FID, + .ops = &mlxsw_sp_fid_8021d_ops, +}; + +static int mlxsw_sp_fid_rfid_configure(struct mlxsw_sp_fid *fid) +{ + /* rFIDs are allocated by the device during init */ + return 0; +} + +static void mlxsw_sp_fid_rfid_deconfigure(struct mlxsw_sp_fid *fid) +{ +} + +static int mlxsw_sp_fid_rfid_index_alloc(struct mlxsw_sp_fid *fid, + const void *arg, u16 *p_fid_index) +{ + u16 rif_index = *(u16 *) arg; + + *p_fid_index = fid->fid_family->start_index + rif_index; + + return 0; +} + +static bool mlxsw_sp_fid_rfid_compare(const struct mlxsw_sp_fid *fid, + const void *arg) +{ + u16 rif_index = *(u16 *) arg; + + return fid->fid_index == rif_index + fid->fid_family->start_index; +} + +static int mlxsw_sp_fid_rfid_port_vid_map(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *mlxsw_sp_port, + u16 vid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + int err; + + /* We only need to transition the port to virtual mode since + * {Port, VID} => FID is done by the firmware upon RIF creation. 
+ */ + if (mlxsw_sp->fid_core->port_fid_mappings[local_port]++ == 0) { + err = mlxsw_sp_port_vp_mode_trans(mlxsw_sp_port); + if (err) + goto err_port_vp_mode_trans; + } + + return 0; + +err_port_vp_mode_trans: + mlxsw_sp->fid_core->port_fid_mappings[local_port]--; + return err; +} + +static void +mlxsw_sp_fid_rfid_port_vid_unmap(struct mlxsw_sp_fid *fid, + struct mlxsw_sp_port *mlxsw_sp_port, u16 vid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u8 local_port = mlxsw_sp_port->local_port; + + if (mlxsw_sp->fid_core->port_fid_mappings[local_port] == 1) + mlxsw_sp_port_vlan_mode_trans(mlxsw_sp_port); + mlxsw_sp->fid_core->port_fid_mappings[local_port]--; +} + +static const struct mlxsw_sp_fid_ops mlxsw_sp_fid_rfid_ops = { + .configure = mlxsw_sp_fid_rfid_configure, + .deconfigure = mlxsw_sp_fid_rfid_deconfigure, + .index_alloc = mlxsw_sp_fid_rfid_index_alloc, + .compare = mlxsw_sp_fid_rfid_compare, + .port_vid_map = mlxsw_sp_fid_rfid_port_vid_map, + .port_vid_unmap = mlxsw_sp_fid_rfid_port_vid_unmap, +}; + +#define MLXSW_SP_RFID_BASE (15 * 1024) +#define MLXSW_SP_RFID_MAX 1024 + +static const struct mlxsw_sp_fid_family mlxsw_sp_fid_rfid_family = { + .type = MLXSW_SP_FID_TYPE_RFID, + .fid_size = sizeof(struct mlxsw_sp_fid), + .start_index = MLXSW_SP_RFID_BASE, + .end_index = MLXSW_SP_RFID_BASE + MLXSW_SP_RFID_MAX - 1, + .rif_type = MLXSW_SP_RIF_TYPE_SUBPORT, + .ops = &mlxsw_sp_fid_rfid_ops, +}; + +static int mlxsw_sp_fid_dummy_configure(struct mlxsw_sp_fid *fid) +{ + struct mlxsw_sp *mlxsw_sp = fid->fid_family->mlxsw_sp; + + return mlxsw_sp_fid_op(mlxsw_sp, fid->fid_index, 0, true); +} + +static void mlxsw_sp_fid_dummy_deconfigure(struct mlxsw_sp_fid *fid) +{ + mlxsw_sp_fid_op(fid->fid_family->mlxsw_sp, fid->fid_index, 0, false); +} + +static int mlxsw_sp_fid_dummy_index_alloc(struct mlxsw_sp_fid *fid, + const void *arg, u16 *p_fid_index) +{ + *p_fid_index = fid->fid_family->start_index; + + return 0; +} + +static bool mlxsw_sp_fid_dummy_compare(const struct mlxsw_sp_fid *fid, + const void *arg) +{ + return true; +} + +static const struct mlxsw_sp_fid_ops mlxsw_sp_fid_dummy_ops = { + .configure = mlxsw_sp_fid_dummy_configure, + .deconfigure = mlxsw_sp_fid_dummy_deconfigure, + .index_alloc = mlxsw_sp_fid_dummy_index_alloc, + .compare = mlxsw_sp_fid_dummy_compare, +}; + +static const struct mlxsw_sp_fid_family mlxsw_sp_fid_dummy_family = { + .type = MLXSW_SP_FID_TYPE_DUMMY, + .fid_size = sizeof(struct mlxsw_sp_fid), + .start_index = MLXSW_SP_RFID_BASE - 1, + .end_index = MLXSW_SP_RFID_BASE - 1, + .ops = &mlxsw_sp_fid_dummy_ops, +}; + +static const struct mlxsw_sp_fid_family *mlxsw_sp_fid_family_arr[] = { + [MLXSW_SP_FID_TYPE_8021Q] = &mlxsw_sp_fid_8021q_family, + [MLXSW_SP_FID_TYPE_8021D] = &mlxsw_sp_fid_8021d_family, + [MLXSW_SP_FID_TYPE_RFID] = &mlxsw_sp_fid_rfid_family, + [MLXSW_SP_FID_TYPE_DUMMY] = &mlxsw_sp_fid_dummy_family, +}; + +static struct mlxsw_sp_fid *mlxsw_sp_fid_get(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_fid_type type, + const void *arg) +{ + struct mlxsw_sp_fid_family *fid_family; + struct mlxsw_sp_fid *fid; + u16 fid_index; + int err; + + fid_family = mlxsw_sp->fid_core->fid_family_arr[type]; + list_for_each_entry(fid, &fid_family->fids_list, list) { + if (!fid->fid_family->ops->compare(fid, arg)) + continue; + fid->ref_count++; + return fid; + } + + fid = kzalloc(fid_family->fid_size, GFP_KERNEL); + if (!fid) + return ERR_PTR(-ENOMEM); + fid->fid_family = fid_family; + + err = fid->fid_family->ops->index_alloc(fid, arg, &fid_index); + if (err) + goto 
err_index_alloc; + fid->fid_index = fid_index; + __set_bit(fid_index - fid_family->start_index, fid_family->fids_bitmap); + + if (fid->fid_family->ops->setup) + fid->fid_family->ops->setup(fid, arg); + + err = fid->fid_family->ops->configure(fid); + if (err) + goto err_configure; + + list_add(&fid->list, &fid_family->fids_list); + fid->ref_count++; + return fid; + +err_configure: + __clear_bit(fid_index - fid_family->start_index, + fid_family->fids_bitmap); +err_index_alloc: + kfree(fid); + return ERR_PTR(err); +} + +void mlxsw_sp_fid_put(struct mlxsw_sp_fid *fid) +{ + struct mlxsw_sp_fid_family *fid_family = fid->fid_family; + + if (--fid->ref_count == 1 && fid->rif) { + /* Destroy the associated RIF and let it drop the last + * reference on the FID. + */ + return mlxsw_sp_rif_destroy(fid->rif); + } else if (fid->ref_count == 0) { + list_del(&fid->list); + fid->fid_family->ops->deconfigure(fid); + __clear_bit(fid->fid_index - fid_family->start_index, + fid_family->fids_bitmap); + kfree(fid); + } +} + +struct mlxsw_sp_fid *mlxsw_sp_fid_8021q_get(struct mlxsw_sp *mlxsw_sp, u16 vid) +{ + return mlxsw_sp_fid_get(mlxsw_sp, MLXSW_SP_FID_TYPE_8021Q, &vid); +} + +struct mlxsw_sp_fid *mlxsw_sp_fid_8021d_get(struct mlxsw_sp *mlxsw_sp, + int br_ifindex) +{ + return mlxsw_sp_fid_get(mlxsw_sp, MLXSW_SP_FID_TYPE_8021D, &br_ifindex); +} + +struct mlxsw_sp_fid *mlxsw_sp_fid_rfid_get(struct mlxsw_sp *mlxsw_sp, + u16 rif_index) +{ + return mlxsw_sp_fid_get(mlxsw_sp, MLXSW_SP_FID_TYPE_RFID, &rif_index); +} + +struct mlxsw_sp_fid *mlxsw_sp_fid_dummy_get(struct mlxsw_sp *mlxsw_sp) +{ + return mlxsw_sp_fid_get(mlxsw_sp, MLXSW_SP_FID_TYPE_DUMMY, NULL); +} + +static int +mlxsw_sp_fid_flood_table_init(struct mlxsw_sp_fid_family *fid_family, + const struct mlxsw_sp_flood_table *flood_table) +{ + enum mlxsw_sp_flood_type packet_type = flood_table->packet_type; + const int *sfgc_packet_types; + int i; + + sfgc_packet_types = mlxsw_sp_packet_type_sfgc_types[packet_type]; + for (i = 0; i < MLXSW_REG_SFGC_TYPE_MAX; i++) { + struct mlxsw_sp *mlxsw_sp = fid_family->mlxsw_sp; + char sfgc_pl[MLXSW_REG_SFGC_LEN]; + int err; + + if (!sfgc_packet_types[i]) + continue; + mlxsw_reg_sfgc_pack(sfgc_pl, i, flood_table->bridge_type, + flood_table->table_type, + flood_table->table_index); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfgc), sfgc_pl); + if (err) + return err; + } + + return 0; +} + +static int +mlxsw_sp_fid_flood_tables_init(struct mlxsw_sp_fid_family *fid_family) +{ + int i; + + for (i = 0; i < fid_family->nr_flood_tables; i++) { + const struct mlxsw_sp_flood_table *flood_table; + int err; + + flood_table = &fid_family->flood_tables[i]; + err = mlxsw_sp_fid_flood_table_init(fid_family, flood_table); + if (err) + return err; + } + + return 0; +} + +static int mlxsw_sp_fid_family_register(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_fid_family *tmpl) +{ + u16 nr_fids = tmpl->end_index - tmpl->start_index + 1; + struct mlxsw_sp_fid_family *fid_family; + int err; + + fid_family = kmemdup(tmpl, sizeof(*fid_family), GFP_KERNEL); + if (!fid_family) + return -ENOMEM; + + fid_family->mlxsw_sp = mlxsw_sp; + INIT_LIST_HEAD(&fid_family->fids_list); + fid_family->fids_bitmap = kcalloc(BITS_TO_LONGS(nr_fids), + sizeof(unsigned long), GFP_KERNEL); + if (!fid_family->fids_bitmap) { + err = -ENOMEM; + goto err_alloc_fids_bitmap; + } + + if (fid_family->flood_tables) { + err = mlxsw_sp_fid_flood_tables_init(fid_family); + if (err) + goto err_fid_flood_tables_init; + } + + mlxsw_sp->fid_core->fid_family_arr[tmpl->type] = 
fid_family; + + return 0; + +err_fid_flood_tables_init: + kfree(fid_family->fids_bitmap); +err_alloc_fids_bitmap: + kfree(fid_family); + return err; +} + +static void +mlxsw_sp_fid_family_unregister(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fid_family *fid_family) +{ + mlxsw_sp->fid_core->fid_family_arr[fid_family->type] = NULL; + kfree(fid_family->fids_bitmap); + WARN_ON_ONCE(!list_empty(&fid_family->fids_list)); + kfree(fid_family); +} + +int mlxsw_sp_port_fids_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + + /* Track number of FIDs configured on the port with mapping type + * PORT_VID_TO_FID, so that we know when to transition the port + * back to non-virtual (VLAN) mode. + */ + mlxsw_sp->fid_core->port_fid_mappings[mlxsw_sp_port->local_port] = 0; + + return mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, false); +} + +void mlxsw_sp_port_fids_fini(struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + + mlxsw_sp->fid_core->port_fid_mappings[mlxsw_sp_port->local_port] = 0; +} + +int mlxsw_sp_fids_init(struct mlxsw_sp *mlxsw_sp) +{ + unsigned int max_ports = mlxsw_core_max_ports(mlxsw_sp->core); + struct mlxsw_sp_fid_core *fid_core; + int err, i; + + fid_core = kzalloc(sizeof(*mlxsw_sp->fid_core), GFP_KERNEL); + if (!fid_core) + return -ENOMEM; + mlxsw_sp->fid_core = fid_core; + + fid_core->port_fid_mappings = kcalloc(max_ports, sizeof(unsigned int), + GFP_KERNEL); + if (!fid_core->port_fid_mappings) { + err = -ENOMEM; + goto err_alloc_port_fid_mappings; + } + + for (i = 0; i < MLXSW_SP_FID_TYPE_MAX; i++) { + err = mlxsw_sp_fid_family_register(mlxsw_sp, + mlxsw_sp_fid_family_arr[i]); + + if (err) + goto err_fid_ops_register; + } + + return 0; + +err_fid_ops_register: + for (i--; i >= 0; i--) { + struct mlxsw_sp_fid_family *fid_family; + + fid_family = fid_core->fid_family_arr[i]; + mlxsw_sp_fid_family_unregister(mlxsw_sp, fid_family); + } + kfree(fid_core->port_fid_mappings); +err_alloc_port_fid_mappings: + kfree(fid_core); + return err; +} + +void mlxsw_sp_fids_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_fid_core *fid_core = mlxsw_sp->fid_core; + int i; + + for (i = 0; i < MLXSW_SP_FID_TYPE_MAX; i++) + mlxsw_sp_fid_family_unregister(mlxsw_sp, + fid_core->fid_family_arr[i]); + kfree(fid_core->port_fid_mappings); + kfree(fid_core); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c new file mode 100644 index 0000000..89dbf56 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -0,0 +1,495 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "spectrum.h" +#include "core_acl_flex_keys.h" + +static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_acl_rule_info *rulei, + struct tcf_exts *exts) +{ + const struct tc_action *a; + LIST_HEAD(actions); + int err; + + if (!tcf_exts_has_actions(exts)) + return 0; + + /* Count action is inserted first */ + err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei); + if (err) + return err; + + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + if (is_tcf_gact_ok(a)) { + err = mlxsw_sp_acl_rulei_act_terminate(rulei); + if (err) + return err; + } else if (is_tcf_gact_shot(a)) { + err = mlxsw_sp_acl_rulei_act_drop(rulei); + if (err) + return err; + } else if (is_tcf_gact_trap(a)) { + err = mlxsw_sp_acl_rulei_act_trap(rulei); + if (err) + return err; + } else if (is_tcf_gact_goto_chain(a)) { + u32 chain_index = tcf_gact_goto_chain_index(a); + struct mlxsw_sp_acl_ruleset *ruleset; + u16 group_id; + + ruleset = mlxsw_sp_acl_ruleset_lookup(mlxsw_sp, block, + chain_index, + MLXSW_SP_ACL_PROFILE_FLOWER); + if (IS_ERR(ruleset)) + return PTR_ERR(ruleset); + + group_id = mlxsw_sp_acl_ruleset_group_id(ruleset); + err = mlxsw_sp_acl_rulei_act_jump(rulei, group_id); + if (err) + return err; + } else if (is_tcf_mirred_egress_redirect(a)) { + struct net_device *out_dev; + struct mlxsw_sp_fid *fid; + u16 fid_index; + + fid = mlxsw_sp_acl_dummy_fid(mlxsw_sp); + fid_index = mlxsw_sp_fid_index(fid); + err = mlxsw_sp_acl_rulei_act_fid_set(mlxsw_sp, rulei, + fid_index); + if (err) + return err; + + out_dev = tcf_mirred_dev(a); + err = mlxsw_sp_acl_rulei_act_fwd(mlxsw_sp, rulei, + out_dev); + if (err) + return err; + } else if (is_tcf_mirred_egress_mirror(a)) { + struct net_device *out_dev = tcf_mirred_dev(a); + + err = mlxsw_sp_acl_rulei_act_mirror(mlxsw_sp, rulei, + block, out_dev); + if (err) + return err; + } else if (is_tcf_vlan(a)) { + u16 proto = be16_to_cpu(tcf_vlan_push_proto(a)); + u32 action = tcf_vlan_action(a); + u8 prio = tcf_vlan_push_prio(a); + u16 vid = tcf_vlan_push_vid(a); + + return mlxsw_sp_acl_rulei_act_vlan(mlxsw_sp, rulei, + action, vid, + proto, prio); + } else { + dev_err(mlxsw_sp->bus_info->dev, "Unsupported action\n"); + return -EOPNOTSUPP; + } + } + return 0; +} + +static void mlxsw_sp_flower_parse_ipv4(struct mlxsw_sp_acl_rule_info *rulei, + struct tc_cls_flower_offload *f) +{ + struct flow_dissector_key_ipv4_addrs *key = + 
skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + f->key); + struct flow_dissector_key_ipv4_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + f->mask); + + mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_SRC_IP4, + ntohl(key->src), ntohl(mask->src)); + mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_DST_IP4, + ntohl(key->dst), ntohl(mask->dst)); +} + +static void mlxsw_sp_flower_parse_ipv6(struct mlxsw_sp_acl_rule_info *rulei, + struct tc_cls_flower_offload *f) +{ + struct flow_dissector_key_ipv6_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + f->key); + struct flow_dissector_key_ipv6_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + f->mask); + size_t addr_half_size = sizeof(key->src) / 2; + + mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_SRC_IP6_HI, + &key->src.s6_addr[0], + &mask->src.s6_addr[0], + addr_half_size); + mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_SRC_IP6_LO, + &key->src.s6_addr[addr_half_size], + &mask->src.s6_addr[addr_half_size], + addr_half_size); + mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_DST_IP6_HI, + &key->dst.s6_addr[0], + &mask->dst.s6_addr[0], + addr_half_size); + mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_DST_IP6_LO, + &key->dst.s6_addr[addr_half_size], + &mask->dst.s6_addr[addr_half_size], + addr_half_size); +} + +static int mlxsw_sp_flower_parse_ports(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + struct tc_cls_flower_offload *f, + u8 ip_proto) +{ + struct flow_dissector_key_ports *key, *mask; + + if (!dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_PORTS)) + return 0; + + if (ip_proto != IPPROTO_TCP && ip_proto != IPPROTO_UDP) { + dev_err(mlxsw_sp->bus_info->dev, "Only UDP and TCP keys are supported\n"); + return -EINVAL; + } + + key = skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_PORTS, + f->key); + mask = skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_PORTS, + f->mask); + mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_DST_L4_PORT, + ntohs(key->dst), ntohs(mask->dst)); + mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_SRC_L4_PORT, + ntohs(key->src), ntohs(mask->src)); + return 0; +} + +static int mlxsw_sp_flower_parse_tcp(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + struct tc_cls_flower_offload *f, + u8 ip_proto) +{ + struct flow_dissector_key_tcp *key, *mask; + + if (!dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_TCP)) + return 0; + + if (ip_proto != IPPROTO_TCP) { + dev_err(mlxsw_sp->bus_info->dev, "TCP keys supported only for TCP\n"); + return -EINVAL; + } + + key = skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_TCP, + f->key); + mask = skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_TCP, + f->mask); + mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_TCP_FLAGS, + ntohs(key->flags), ntohs(mask->flags)); + return 0; +} + +static int mlxsw_sp_flower_parse_ip(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_rule_info *rulei, + struct tc_cls_flower_offload *f, + u16 n_proto) +{ + struct flow_dissector_key_ip *key, *mask; + + if (!dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_IP)) + return 0; + + if (n_proto != ETH_P_IP && n_proto != ETH_P_IPV6) { + dev_err(mlxsw_sp->bus_info->dev, "IP keys supported only for IPv4/6\n"); + return -EINVAL; + } + + key = skb_flow_dissector_target(f->dissector, + 
FLOW_DISSECTOR_KEY_IP, + f->key); + mask = skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_IP, + f->mask); + mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_IP_TTL_, + key->ttl, mask->ttl); + + mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_IP_ECN, + key->tos & 0x3, mask->tos & 0x3); + + mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_IP_DSCP, + key->tos >> 6, mask->tos >> 6); + + return 0; +} + +static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_acl_rule_info *rulei, + struct tc_cls_flower_offload *f) +{ + u16 n_proto_mask = 0; + u16 n_proto_key = 0; + u16 addr_type = 0; + u8 ip_proto = 0; + int err; + + if (f->dissector->used_keys & + ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_PORTS) | + BIT(FLOW_DISSECTOR_KEY_TCP) | + BIT(FLOW_DISSECTOR_KEY_IP) | + BIT(FLOW_DISSECTOR_KEY_VLAN))) { + dev_err(mlxsw_sp->bus_info->dev, "Unsupported key\n"); + return -EOPNOTSUPP; + } + + mlxsw_sp_acl_rulei_priority(rulei, f->common.prio); + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CONTROL)) { + struct flow_dissector_key_control *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_CONTROL, + f->key); + addr_type = key->addr_type; + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_dissector_key_basic *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_BASIC, + f->key); + struct flow_dissector_key_basic *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_BASIC, + f->mask); + n_proto_key = ntohs(key->n_proto); + n_proto_mask = ntohs(mask->n_proto); + + if (n_proto_key == ETH_P_ALL) { + n_proto_key = 0; + n_proto_mask = 0; + } + mlxsw_sp_acl_rulei_keymask_u32(rulei, + MLXSW_AFK_ELEMENT_ETHERTYPE, + n_proto_key, n_proto_mask); + + ip_proto = key->ip_proto; + mlxsw_sp_acl_rulei_keymask_u32(rulei, + MLXSW_AFK_ELEMENT_IP_PROTO, + key->ip_proto, mask->ip_proto); + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct flow_dissector_key_eth_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + f->key); + struct flow_dissector_key_eth_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + f->mask); + + mlxsw_sp_acl_rulei_keymask_buf(rulei, + MLXSW_AFK_ELEMENT_DMAC, + key->dst, mask->dst, + sizeof(key->dst)); + mlxsw_sp_acl_rulei_keymask_buf(rulei, + MLXSW_AFK_ELEMENT_SMAC, + key->src, mask->src, + sizeof(key->src)); + } + + if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_VLAN)) { + struct flow_dissector_key_vlan *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_VLAN, + f->key); + struct flow_dissector_key_vlan *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_VLAN, + f->mask); + if (mask->vlan_id != 0) + mlxsw_sp_acl_rulei_keymask_u32(rulei, + MLXSW_AFK_ELEMENT_VID, + key->vlan_id, + mask->vlan_id); + if (mask->vlan_priority != 0) + mlxsw_sp_acl_rulei_keymask_u32(rulei, + MLXSW_AFK_ELEMENT_PCP, + key->vlan_priority, + mask->vlan_priority); + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) + mlxsw_sp_flower_parse_ipv4(rulei, f); + + if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) + mlxsw_sp_flower_parse_ipv6(rulei, f); + + err = mlxsw_sp_flower_parse_ports(mlxsw_sp, rulei, f, ip_proto); + if (err) 
+ return err; + err = mlxsw_sp_flower_parse_tcp(mlxsw_sp, rulei, f, ip_proto); + if (err) + return err; + + err = mlxsw_sp_flower_parse_ip(mlxsw_sp, rulei, f, n_proto_key & n_proto_mask); + if (err) + return err; + + return mlxsw_sp_flower_parse_actions(mlxsw_sp, block, rulei, f->exts); +} + +int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct tc_cls_flower_offload *f) +{ + struct mlxsw_sp_acl_rule_info *rulei; + struct mlxsw_sp_acl_ruleset *ruleset; + struct mlxsw_sp_acl_rule *rule; + int err; + + ruleset = mlxsw_sp_acl_ruleset_get(mlxsw_sp, block, + f->common.chain_index, + MLXSW_SP_ACL_PROFILE_FLOWER); + if (IS_ERR(ruleset)) + return PTR_ERR(ruleset); + + rule = mlxsw_sp_acl_rule_create(mlxsw_sp, ruleset, f->cookie); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto err_rule_create; + } + + rulei = mlxsw_sp_acl_rule_rulei(rule); + err = mlxsw_sp_flower_parse(mlxsw_sp, block, rulei, f); + if (err) + goto err_flower_parse; + + err = mlxsw_sp_acl_rulei_commit(rulei); + if (err) + goto err_rulei_commit; + + err = mlxsw_sp_acl_rule_add(mlxsw_sp, rule); + if (err) + goto err_rule_add; + + mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset); + return 0; + +err_rule_add: +err_rulei_commit: +err_flower_parse: + mlxsw_sp_acl_rule_destroy(mlxsw_sp, rule); +err_rule_create: + mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset); + return err; +} + +void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct tc_cls_flower_offload *f) +{ + struct mlxsw_sp_acl_ruleset *ruleset; + struct mlxsw_sp_acl_rule *rule; + + ruleset = mlxsw_sp_acl_ruleset_get(mlxsw_sp, block, + f->common.chain_index, + MLXSW_SP_ACL_PROFILE_FLOWER); + if (IS_ERR(ruleset)) + return; + + rule = mlxsw_sp_acl_rule_lookup(mlxsw_sp, ruleset, f->cookie); + if (rule) { + mlxsw_sp_acl_rule_del(mlxsw_sp, rule); + mlxsw_sp_acl_rule_destroy(mlxsw_sp, rule); + } + + mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset); +} + +int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_block *block, + struct tc_cls_flower_offload *f) +{ + struct mlxsw_sp_acl_ruleset *ruleset; + struct mlxsw_sp_acl_rule *rule; + u64 packets; + u64 lastuse; + u64 bytes; + int err; + + ruleset = mlxsw_sp_acl_ruleset_get(mlxsw_sp, block, + f->common.chain_index, + MLXSW_SP_ACL_PROFILE_FLOWER); + if (WARN_ON(IS_ERR(ruleset))) + return -EINVAL; + + rule = mlxsw_sp_acl_rule_lookup(mlxsw_sp, ruleset, f->cookie); + if (!rule) + return -EINVAL; + + err = mlxsw_sp_acl_rule_get_stats(mlxsw_sp, rule, &packets, &bytes, + &lastuse); + if (err) + goto err_rule_get_stats; + + tcf_exts_stats_update(f->exts, bytes, packets, lastuse); + + mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset); + return 0; + +err_rule_get_stats: + mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c new file mode 100644 index 0000000..98d896c --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c @@ -0,0 +1,369 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c + * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017-2018 Petr Machata + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "spectrum_ipip.h" + +struct ip_tunnel_parm +mlxsw_sp_ipip_netdev_parms4(const struct net_device *ol_dev) +{ + struct ip_tunnel *tun = netdev_priv(ol_dev); + + return tun->parms; +} + +struct __ip6_tnl_parm +mlxsw_sp_ipip_netdev_parms6(const struct net_device *ol_dev) +{ + struct ip6_tnl *tun = netdev_priv(ol_dev); + + return tun->parms; +} + +static bool mlxsw_sp_ipip_parms4_has_ikey(struct ip_tunnel_parm parms) +{ + return !!(parms.i_flags & TUNNEL_KEY); +} + +static bool mlxsw_sp_ipip_parms4_has_okey(struct ip_tunnel_parm parms) +{ + return !!(parms.o_flags & TUNNEL_KEY); +} + +static u32 mlxsw_sp_ipip_parms4_ikey(struct ip_tunnel_parm parms) +{ + return mlxsw_sp_ipip_parms4_has_ikey(parms) ? + be32_to_cpu(parms.i_key) : 0; +} + +static u32 mlxsw_sp_ipip_parms4_okey(struct ip_tunnel_parm parms) +{ + return mlxsw_sp_ipip_parms4_has_okey(parms) ? 
+ be32_to_cpu(parms.o_key) : 0; +} + +static union mlxsw_sp_l3addr +mlxsw_sp_ipip_parms4_saddr(struct ip_tunnel_parm parms) +{ + return (union mlxsw_sp_l3addr) { .addr4 = parms.iph.saddr }; +} + +static union mlxsw_sp_l3addr +mlxsw_sp_ipip_parms6_saddr(struct __ip6_tnl_parm parms) +{ + return (union mlxsw_sp_l3addr) { .addr6 = parms.laddr }; +} + +static union mlxsw_sp_l3addr +mlxsw_sp_ipip_parms4_daddr(struct ip_tunnel_parm parms) +{ + return (union mlxsw_sp_l3addr) { .addr4 = parms.iph.daddr }; +} + +static union mlxsw_sp_l3addr +mlxsw_sp_ipip_parms6_daddr(struct __ip6_tnl_parm parms) +{ + return (union mlxsw_sp_l3addr) { .addr6 = parms.raddr }; +} + +union mlxsw_sp_l3addr +mlxsw_sp_ipip_netdev_saddr(enum mlxsw_sp_l3proto proto, + const struct net_device *ol_dev) +{ + struct ip_tunnel_parm parms4; + struct __ip6_tnl_parm parms6; + + switch (proto) { + case MLXSW_SP_L3_PROTO_IPV4: + parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev); + return mlxsw_sp_ipip_parms4_saddr(parms4); + case MLXSW_SP_L3_PROTO_IPV6: + parms6 = mlxsw_sp_ipip_netdev_parms6(ol_dev); + return mlxsw_sp_ipip_parms6_saddr(parms6); + } + + WARN_ON(1); + return (union mlxsw_sp_l3addr) {0}; +} + +static __be32 mlxsw_sp_ipip_netdev_daddr4(const struct net_device *ol_dev) +{ + + struct ip_tunnel_parm parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev); + + return mlxsw_sp_ipip_parms4_daddr(parms4).addr4; +} + +static union mlxsw_sp_l3addr +mlxsw_sp_ipip_netdev_daddr(enum mlxsw_sp_l3proto proto, + const struct net_device *ol_dev) +{ + struct ip_tunnel_parm parms4; + struct __ip6_tnl_parm parms6; + + switch (proto) { + case MLXSW_SP_L3_PROTO_IPV4: + parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev); + return mlxsw_sp_ipip_parms4_daddr(parms4); + case MLXSW_SP_L3_PROTO_IPV6: + parms6 = mlxsw_sp_ipip_netdev_parms6(ol_dev); + return mlxsw_sp_ipip_parms6_daddr(parms6); + } + + WARN_ON(1); + return (union mlxsw_sp_l3addr) {0}; +} + +bool mlxsw_sp_l3addr_is_zero(union mlxsw_sp_l3addr addr) +{ + union mlxsw_sp_l3addr naddr = {0}; + + return !memcmp(&addr, &naddr, sizeof(naddr)); +} + +static int +mlxsw_sp_ipip_nexthop_update_gre4(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb); + __be32 daddr4 = mlxsw_sp_ipip_netdev_daddr4(ipip_entry->ol_dev); + char ratr_pl[MLXSW_REG_RATR_LEN]; + + mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY, + true, MLXSW_REG_RATR_TYPE_IPIP, + adj_index, rif_index); + mlxsw_reg_ratr_ipip4_entry_pack(ratr_pl, be32_to_cpu(daddr4)); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl); +} + +static int +mlxsw_sp_ipip_fib_entry_op_gre4_rtdp(struct mlxsw_sp *mlxsw_sp, + u32 tunnel_index, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb); + char rtdp_pl[MLXSW_REG_RTDP_LEN]; + struct ip_tunnel_parm parms; + unsigned int type_check; + bool has_ikey; + u32 daddr4; + u32 ikey; + + parms = mlxsw_sp_ipip_netdev_parms4(ipip_entry->ol_dev); + has_ikey = mlxsw_sp_ipip_parms4_has_ikey(parms); + ikey = mlxsw_sp_ipip_parms4_ikey(parms); + + mlxsw_reg_rtdp_pack(rtdp_pl, MLXSW_REG_RTDP_TYPE_IPIP, tunnel_index); + + type_check = has_ikey ? + MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE_KEY : + MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE; + + /* Linux demuxes tunnels based on packet SIP (which must match tunnel + * remote IP). Thus configure decap so that it filters out packets that + * are not IPv4 or have the wrong SIP. 
IPIP_DECAP_ERROR trap is + * generated for packets that fail this criterion. Linux then handles + * such packets in slow path and generates ICMP destination unreachable. + */ + daddr4 = be32_to_cpu(mlxsw_sp_ipip_netdev_daddr4(ipip_entry->ol_dev)); + mlxsw_reg_rtdp_ipip4_pack(rtdp_pl, rif_index, + MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV4, + type_check, has_ikey, daddr4, ikey); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtdp), rtdp_pl); +} + +static int +mlxsw_sp_ipip_fib_entry_op_gre4_ralue(struct mlxsw_sp *mlxsw_sp, + u32 dip, u8 prefix_len, u16 ul_vr_id, + enum mlxsw_reg_ralue_op op, + u32 tunnel_index) +{ + char ralue_pl[MLXSW_REG_RALUE_LEN]; + + mlxsw_reg_ralue_pack4(ralue_pl, MLXSW_REG_RALXX_PROTOCOL_IPV4, op, + ul_vr_id, prefix_len, dip); + mlxsw_reg_ralue_act_ip2me_tun_pack(ralue_pl, tunnel_index); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl); +} + +static int mlxsw_sp_ipip_fib_entry_op_gre4(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + enum mlxsw_reg_ralue_op op, + u32 tunnel_index) +{ + u16 ul_vr_id = mlxsw_sp_ipip_lb_ul_vr_id(ipip_entry->ol_lb); + __be32 dip; + int err; + + err = mlxsw_sp_ipip_fib_entry_op_gre4_rtdp(mlxsw_sp, tunnel_index, + ipip_entry); + if (err) + return err; + + dip = mlxsw_sp_ipip_netdev_saddr(MLXSW_SP_L3_PROTO_IPV4, + ipip_entry->ol_dev).addr4; + return mlxsw_sp_ipip_fib_entry_op_gre4_ralue(mlxsw_sp, be32_to_cpu(dip), + 32, ul_vr_id, op, + tunnel_index); +} + +static bool mlxsw_sp_ipip_tunnel_complete(enum mlxsw_sp_l3proto proto, + const struct net_device *ol_dev) +{ + union mlxsw_sp_l3addr saddr = mlxsw_sp_ipip_netdev_saddr(proto, ol_dev); + union mlxsw_sp_l3addr daddr = mlxsw_sp_ipip_netdev_daddr(proto, ol_dev); + + /* Tunnels with unset local or remote address are valid in Linux and + * used for lightweight tunnels (LWT) and Non-Broadcast Multi-Access + * (NBMA) tunnels. In principle these can be offloaded, but the driver + * currently doesn't support this. So punt. + */ + return !mlxsw_sp_l3addr_is_zero(saddr) && + !mlxsw_sp_l3addr_is_zero(daddr); +} + +static bool mlxsw_sp_ipip_can_offload_gre4(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *ol_dev, + enum mlxsw_sp_l3proto ol_proto) +{ + struct ip_tunnel *tunnel = netdev_priv(ol_dev); + __be16 okflags = TUNNEL_KEY; /* We can't offload any other features. */ + bool inherit_ttl = tunnel->parms.iph.ttl == 0; + bool inherit_tos = tunnel->parms.iph.tos & 0x1; + + return (tunnel->parms.i_flags & ~okflags) == 0 && + (tunnel->parms.o_flags & ~okflags) == 0 && + inherit_ttl && inherit_tos && + mlxsw_sp_ipip_tunnel_complete(MLXSW_SP_L3_PROTO_IPV4, ol_dev); +} + +static struct mlxsw_sp_rif_ipip_lb_config +mlxsw_sp_ipip_ol_loopback_config_gre4(struct mlxsw_sp *mlxsw_sp, + const struct net_device *ol_dev) +{ + struct ip_tunnel_parm parms = mlxsw_sp_ipip_netdev_parms4(ol_dev); + enum mlxsw_reg_ritr_loopback_ipip_type lb_ipipt; + + lb_ipipt = mlxsw_sp_ipip_parms4_has_okey(parms) ? 
+ MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_KEY_IN_IP : + MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_IN_IP; + return (struct mlxsw_sp_rif_ipip_lb_config){ + .lb_ipipt = lb_ipipt, + .okey = mlxsw_sp_ipip_parms4_okey(parms), + .ul_protocol = MLXSW_SP_L3_PROTO_IPV4, + .saddr = mlxsw_sp_ipip_netdev_saddr(MLXSW_SP_L3_PROTO_IPV4, + ol_dev), + }; +} + +static int +mlxsw_sp_ipip_ol_netdev_change_gre4(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + struct netlink_ext_ack *extack) +{ + union mlxsw_sp_l3addr old_saddr, new_saddr; + union mlxsw_sp_l3addr old_daddr, new_daddr; + struct ip_tunnel_parm new_parms; + bool update_tunnel = false; + bool update_decap = false; + bool update_nhs = false; + int err = 0; + + new_parms = mlxsw_sp_ipip_netdev_parms4(ipip_entry->ol_dev); + + new_saddr = mlxsw_sp_ipip_parms4_saddr(new_parms); + old_saddr = mlxsw_sp_ipip_parms4_saddr(ipip_entry->parms4); + new_daddr = mlxsw_sp_ipip_parms4_daddr(new_parms); + old_daddr = mlxsw_sp_ipip_parms4_daddr(ipip_entry->parms4); + + if (!mlxsw_sp_l3addr_eq(&new_saddr, &old_saddr)) { + u16 ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ipip_entry->ol_dev); + + /* Since the local address has changed, if there is another + * tunnel with a matching saddr, both need to be demoted. + */ + if (mlxsw_sp_ipip_demote_tunnel_by_saddr(mlxsw_sp, + MLXSW_SP_L3_PROTO_IPV4, + new_saddr, ul_tb_id, + ipip_entry)) { + mlxsw_sp_ipip_entry_demote_tunnel(mlxsw_sp, ipip_entry); + return 0; + } + + update_tunnel = true; + } else if ((mlxsw_sp_ipip_parms4_okey(ipip_entry->parms4) != + mlxsw_sp_ipip_parms4_okey(new_parms)) || + ipip_entry->parms4.link != new_parms.link) { + update_tunnel = true; + } else if (!mlxsw_sp_l3addr_eq(&new_daddr, &old_daddr)) { + update_nhs = true; + } else if (mlxsw_sp_ipip_parms4_ikey(ipip_entry->parms4) != + mlxsw_sp_ipip_parms4_ikey(new_parms)) { + update_decap = true; + } + + if (update_tunnel) + err = __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry, + true, true, true, + extack); + else if (update_nhs) + err = __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry, + false, false, true, + extack); + else if (update_decap) + err = __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry, + false, false, false, + extack); + + ipip_entry->parms4 = new_parms; + return err; +} + +static const struct mlxsw_sp_ipip_ops mlxsw_sp_ipip_gre4_ops = { + .dev_type = ARPHRD_IPGRE, + .ul_proto = MLXSW_SP_L3_PROTO_IPV4, + .nexthop_update = mlxsw_sp_ipip_nexthop_update_gre4, + .fib_entry_op = mlxsw_sp_ipip_fib_entry_op_gre4, + .can_offload = mlxsw_sp_ipip_can_offload_gre4, + .ol_loopback_config = mlxsw_sp_ipip_ol_loopback_config_gre4, + .ol_netdev_change = mlxsw_sp_ipip_ol_netdev_change_gre4, +}; + +const struct mlxsw_sp_ipip_ops *mlxsw_sp_ipip_ops_arr[] = { + [MLXSW_SP_IPIP_TYPE_GRE4] = &mlxsw_sp_ipip_gre4_ops, +}; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h new file mode 100644 index 0000000..6909d86 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h @@ -0,0 +1,97 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h + * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017-2018 Petr Machata + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_IPIP_H_ +#define _MLXSW_IPIP_H_ + +#include "spectrum_router.h" +#include +#include + +struct ip_tunnel_parm +mlxsw_sp_ipip_netdev_parms4(const struct net_device *ol_dev); +struct __ip6_tnl_parm +mlxsw_sp_ipip_netdev_parms6(const struct net_device *ol_dev); + +union mlxsw_sp_l3addr +mlxsw_sp_ipip_netdev_saddr(enum mlxsw_sp_l3proto proto, + const struct net_device *ol_dev); + +bool mlxsw_sp_l3addr_is_zero(union mlxsw_sp_l3addr addr); + +enum mlxsw_sp_ipip_type { + MLXSW_SP_IPIP_TYPE_GRE4, + MLXSW_SP_IPIP_TYPE_MAX, +}; + +struct mlxsw_sp_ipip_entry { + enum mlxsw_sp_ipip_type ipipt; + struct net_device *ol_dev; /* Overlay. */ + struct mlxsw_sp_rif_ipip_lb *ol_lb; + struct mlxsw_sp_fib_entry *decap_fib_entry; + struct list_head ipip_list_node; + union { + struct ip_tunnel_parm parms4; + }; +}; + +struct mlxsw_sp_ipip_ops { + int dev_type; + enum mlxsw_sp_l3proto ul_proto; /* Underlay. */ + + int (*nexthop_update)(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_ipip_entry *ipip_entry); + + bool (*can_offload)(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *ol_dev, + enum mlxsw_sp_l3proto ol_proto); + + /* Return a configuration for creating an overlay loopback RIF. 
*/ + struct mlxsw_sp_rif_ipip_lb_config + (*ol_loopback_config)(struct mlxsw_sp *mlxsw_sp, + const struct net_device *ol_dev); + + int (*fib_entry_op)(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + enum mlxsw_reg_ralue_op op, + u32 tunnel_index); + + int (*ol_netdev_change)(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + struct netlink_ext_ack *extack); +}; + +extern const struct mlxsw_sp_ipip_ops *mlxsw_sp_ipip_ops_arr[]; + +#endif /* _MLXSW_IPIP_H_*/ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c new file mode 100644 index 0000000..fe4327f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c @@ -0,0 +1,454 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include + +#include "spectrum.h" + +#define MLXSW_SP_KVDL_SINGLE_BASE 0 +#define MLXSW_SP_KVDL_SINGLE_SIZE 16384 +#define MLXSW_SP_KVDL_SINGLE_END \ + (MLXSW_SP_KVDL_SINGLE_SIZE + MLXSW_SP_KVDL_SINGLE_BASE - 1) + +#define MLXSW_SP_KVDL_CHUNKS_BASE \ + (MLXSW_SP_KVDL_SINGLE_BASE + MLXSW_SP_KVDL_SINGLE_SIZE) +#define MLXSW_SP_KVDL_CHUNKS_SIZE 49152 +#define MLXSW_SP_KVDL_CHUNKS_END \ + (MLXSW_SP_KVDL_CHUNKS_SIZE + MLXSW_SP_KVDL_CHUNKS_BASE - 1) + +#define MLXSW_SP_KVDL_LARGE_CHUNKS_BASE \ + (MLXSW_SP_KVDL_CHUNKS_BASE + MLXSW_SP_KVDL_CHUNKS_SIZE) +#define MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE \ + (MLXSW_SP_KVD_LINEAR_SIZE - MLXSW_SP_KVDL_LARGE_CHUNKS_BASE) +#define MLXSW_SP_KVDL_LARGE_CHUNKS_END \ + (MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE + MLXSW_SP_KVDL_LARGE_CHUNKS_BASE - 1) + +#define MLXSW_SP_KVDL_SINGLE_ALLOC_SIZE 1 +#define MLXSW_SP_KVDL_CHUNKS_ALLOC_SIZE 32 +#define MLXSW_SP_KVDL_LARGE_CHUNKS_ALLOC_SIZE 512 + +struct mlxsw_sp_kvdl_part_info { + unsigned int part_index; + unsigned int start_index; + unsigned int end_index; + unsigned int alloc_size; + enum mlxsw_sp_resource_id resource_id; +}; + +enum mlxsw_sp_kvdl_part_id { + MLXSW_SP_KVDL_PART_ID_SINGLE, + MLXSW_SP_KVDL_PART_ID_CHUNKS, + MLXSW_SP_KVDL_PART_ID_LARGE_CHUNKS, +}; + +#define MLXSW_SP_KVDL_PART_INFO(id) \ +[MLXSW_SP_KVDL_PART_ID_##id] = { \ + .start_index = MLXSW_SP_KVDL_##id##_BASE, \ + .end_index = MLXSW_SP_KVDL_##id##_END, \ + .alloc_size = MLXSW_SP_KVDL_##id##_ALLOC_SIZE, \ + .resource_id = MLXSW_SP_RESOURCE_KVD_LINEAR_##id, \ +} + +static const struct mlxsw_sp_kvdl_part_info mlxsw_sp_kvdl_parts_info[] = { + MLXSW_SP_KVDL_PART_INFO(SINGLE), + MLXSW_SP_KVDL_PART_INFO(CHUNKS), + MLXSW_SP_KVDL_PART_INFO(LARGE_CHUNKS), +}; + +#define MLXSW_SP_KVDL_PARTS_INFO_LEN ARRAY_SIZE(mlxsw_sp_kvdl_parts_info) + +struct mlxsw_sp_kvdl_part { + struct mlxsw_sp_kvdl_part_info info; + unsigned long usage[0]; /* Entries */ +}; + +struct mlxsw_sp_kvdl { + struct mlxsw_sp_kvdl_part *parts[MLXSW_SP_KVDL_PARTS_INFO_LEN]; +}; + +static struct mlxsw_sp_kvdl_part * +mlxsw_sp_kvdl_alloc_size_part(struct mlxsw_sp_kvdl *kvdl, + unsigned int alloc_size) +{ + struct mlxsw_sp_kvdl_part *part, *min_part = NULL; + int i; + + for (i = 0; i < MLXSW_SP_KVDL_PARTS_INFO_LEN; i++) { + part = kvdl->parts[i]; + if (alloc_size <= part->info.alloc_size && + (!min_part || + part->info.alloc_size <= min_part->info.alloc_size)) + min_part = part; + } + + return min_part ?: ERR_PTR(-ENOBUFS); +} + +static struct mlxsw_sp_kvdl_part * +mlxsw_sp_kvdl_index_part(struct mlxsw_sp_kvdl *kvdl, u32 kvdl_index) +{ + struct mlxsw_sp_kvdl_part *part; + int i; + + for (i = 0; i < MLXSW_SP_KVDL_PARTS_INFO_LEN; i++) { + part = kvdl->parts[i]; + if (kvdl_index >= part->info.start_index && + kvdl_index <= part->info.end_index) + return part; + } + + return ERR_PTR(-EINVAL); +} + +static u32 +mlxsw_sp_entry_index_kvdl_index(const struct mlxsw_sp_kvdl_part_info *info, + unsigned int entry_index) +{ + return info->start_index + entry_index * info->alloc_size; +} + +static unsigned int +mlxsw_sp_kvdl_index_entry_index(const struct mlxsw_sp_kvdl_part_info *info, + u32 kvdl_index) +{ + return (kvdl_index - info->start_index) / info->alloc_size; +} + +static int mlxsw_sp_kvdl_part_alloc(struct mlxsw_sp_kvdl_part *part, + u32 *p_kvdl_index) +{ + const struct mlxsw_sp_kvdl_part_info *info = &part->info; + unsigned int entry_index, nr_entries; + + nr_entries = (info->end_index - info->start_index + 1) / + info->alloc_size; + entry_index = find_first_zero_bit(part->usage, nr_entries); + if 
(entry_index == nr_entries) + return -ENOBUFS; + __set_bit(entry_index, part->usage); + + *p_kvdl_index = mlxsw_sp_entry_index_kvdl_index(info, entry_index); + + return 0; +} + +static void mlxsw_sp_kvdl_part_free(struct mlxsw_sp_kvdl_part *part, + u32 kvdl_index) +{ + const struct mlxsw_sp_kvdl_part_info *info = &part->info; + unsigned int entry_index; + + entry_index = mlxsw_sp_kvdl_index_entry_index(info, kvdl_index); + __clear_bit(entry_index, part->usage); +} + +int mlxsw_sp_kvdl_alloc(struct mlxsw_sp *mlxsw_sp, unsigned int entry_count, + u32 *p_entry_index) +{ + struct mlxsw_sp_kvdl_part *part; + + /* Find partition with smallest allocation size satisfying the + * requested size. + */ + part = mlxsw_sp_kvdl_alloc_size_part(mlxsw_sp->kvdl, entry_count); + if (IS_ERR(part)) + return PTR_ERR(part); + + return mlxsw_sp_kvdl_part_alloc(part, p_entry_index); +} + +void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index) +{ + struct mlxsw_sp_kvdl_part *part; + + part = mlxsw_sp_kvdl_index_part(mlxsw_sp->kvdl, entry_index); + if (IS_ERR(part)) + return; + mlxsw_sp_kvdl_part_free(part, entry_index); +} + +int mlxsw_sp_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp, + unsigned int entry_count, + unsigned int *p_alloc_size) +{ + struct mlxsw_sp_kvdl_part *part; + + part = mlxsw_sp_kvdl_alloc_size_part(mlxsw_sp->kvdl, entry_count); + if (IS_ERR(part)) + return PTR_ERR(part); + + *p_alloc_size = part->info.alloc_size; + + return 0; +} + +static void mlxsw_sp_kvdl_part_update(struct mlxsw_sp_kvdl_part *part, + struct mlxsw_sp_kvdl_part *part_prev, + unsigned int size) +{ + + if (!part_prev) { + part->info.end_index = size - 1; + } else { + part->info.start_index = part_prev->info.end_index + 1; + part->info.end_index = part->info.start_index + size - 1; + } +} + +static struct mlxsw_sp_kvdl_part * +mlxsw_sp_kvdl_part_init(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_kvdl_part_info *info, + struct mlxsw_sp_kvdl_part *part_prev) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + struct mlxsw_sp_kvdl_part *part; + bool need_update = true; + unsigned int nr_entries; + size_t usage_size; + u64 resource_size; + int err; + + err = devlink_resource_size_get(devlink, info->resource_id, + &resource_size); + if (err) { + need_update = false; + resource_size = info->end_index - info->start_index + 1; + } + + nr_entries = div_u64(resource_size, info->alloc_size); + usage_size = BITS_TO_LONGS(nr_entries) * sizeof(unsigned long); + part = kzalloc(sizeof(*part) + usage_size, GFP_KERNEL); + if (!part) + return ERR_PTR(-ENOMEM); + + memcpy(&part->info, info, sizeof(part->info)); + + if (need_update) + mlxsw_sp_kvdl_part_update(part, part_prev, resource_size); + return part; +} + +static void mlxsw_sp_kvdl_part_fini(struct mlxsw_sp_kvdl_part *part) +{ + kfree(part); +} + +static int mlxsw_sp_kvdl_parts_init(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_kvdl *kvdl = mlxsw_sp->kvdl; + const struct mlxsw_sp_kvdl_part_info *info; + struct mlxsw_sp_kvdl_part *part_prev = NULL; + int err, i; + + for (i = 0; i < MLXSW_SP_KVDL_PARTS_INFO_LEN; i++) { + info = &mlxsw_sp_kvdl_parts_info[i]; + kvdl->parts[i] = mlxsw_sp_kvdl_part_init(mlxsw_sp, info, + part_prev); + if (IS_ERR(kvdl->parts[i])) { + err = PTR_ERR(kvdl->parts[i]); + goto err_kvdl_part_init; + } + part_prev = kvdl->parts[i]; + } + return 0; + +err_kvdl_part_init: + for (i--; i >= 0; i--) + mlxsw_sp_kvdl_part_fini(kvdl->parts[i]); + return err; +} + +static void mlxsw_sp_kvdl_parts_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct 
mlxsw_sp_kvdl *kvdl = mlxsw_sp->kvdl; + int i; + + for (i = 0; i < MLXSW_SP_KVDL_PARTS_INFO_LEN; i++) + mlxsw_sp_kvdl_part_fini(kvdl->parts[i]); +} + +static u64 mlxsw_sp_kvdl_part_occ(struct mlxsw_sp_kvdl_part *part) +{ + const struct mlxsw_sp_kvdl_part_info *info = &part->info; + unsigned int nr_entries; + int bit = -1; + u64 occ = 0; + + nr_entries = (info->end_index - + info->start_index + 1) / + info->alloc_size; + while ((bit = find_next_bit(part->usage, nr_entries, bit + 1)) + < nr_entries) + occ += info->alloc_size; + return occ; +} + +static u64 mlxsw_sp_kvdl_occ_get(void *priv) +{ + const struct mlxsw_sp *mlxsw_sp = priv; + u64 occ = 0; + int i; + + for (i = 0; i < MLXSW_SP_KVDL_PARTS_INFO_LEN; i++) + occ += mlxsw_sp_kvdl_part_occ(mlxsw_sp->kvdl->parts[i]); + + return occ; +} + +static u64 mlxsw_sp_kvdl_single_occ_get(void *priv) +{ + const struct mlxsw_sp *mlxsw_sp = priv; + struct mlxsw_sp_kvdl_part *part; + + part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_SINGLE]; + return mlxsw_sp_kvdl_part_occ(part); +} + +static u64 mlxsw_sp_kvdl_chunks_occ_get(void *priv) +{ + const struct mlxsw_sp *mlxsw_sp = priv; + struct mlxsw_sp_kvdl_part *part; + + part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_CHUNKS]; + return mlxsw_sp_kvdl_part_occ(part); +} + +static u64 mlxsw_sp_kvdl_large_chunks_occ_get(void *priv) +{ + const struct mlxsw_sp *mlxsw_sp = priv; + struct mlxsw_sp_kvdl_part *part; + + part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_LARGE_CHUNKS]; + return mlxsw_sp_kvdl_part_occ(part); +} + +int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_core); + static struct devlink_resource_size_params size_params; + u32 kvdl_max_size; + int err; + + kvdl_max_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) - + MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE) - + MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE); + + devlink_resource_size_params_init(&size_params, 0, kvdl_max_size, + MLXSW_SP_KVDL_SINGLE_ALLOC_SIZE, + DEVLINK_RESOURCE_UNIT_ENTRY); + err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_SINGLES, + MLXSW_SP_KVDL_SINGLE_SIZE, + MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE, + MLXSW_SP_RESOURCE_KVD_LINEAR, + &size_params); + if (err) + return err; + + devlink_resource_size_params_init(&size_params, 0, kvdl_max_size, + MLXSW_SP_KVDL_CHUNKS_ALLOC_SIZE, + DEVLINK_RESOURCE_UNIT_ENTRY); + err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_CHUNKS, + MLXSW_SP_KVDL_CHUNKS_SIZE, + MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS, + MLXSW_SP_RESOURCE_KVD_LINEAR, + &size_params); + if (err) + return err; + + devlink_resource_size_params_init(&size_params, 0, kvdl_max_size, + MLXSW_SP_KVDL_LARGE_CHUNKS_ALLOC_SIZE, + DEVLINK_RESOURCE_UNIT_ENTRY); + err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_LARGE_CHUNKS, + MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE, + MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS, + MLXSW_SP_RESOURCE_KVD_LINEAR, + &size_params); + return err; +} + +int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + struct mlxsw_sp_kvdl *kvdl; + int err; + + kvdl = kzalloc(sizeof(*mlxsw_sp->kvdl), GFP_KERNEL); + if (!kvdl) + return -ENOMEM; + mlxsw_sp->kvdl = kvdl; + + err = mlxsw_sp_kvdl_parts_init(mlxsw_sp); + if (err) + goto err_kvdl_parts_init; + + devlink_resource_occ_get_register(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR, + mlxsw_sp_kvdl_occ_get, + mlxsw_sp); + devlink_resource_occ_get_register(devlink, 
+ MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE, + mlxsw_sp_kvdl_single_occ_get, + mlxsw_sp); + devlink_resource_occ_get_register(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS, + mlxsw_sp_kvdl_chunks_occ_get, + mlxsw_sp); + devlink_resource_occ_get_register(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS, + mlxsw_sp_kvdl_large_chunks_occ_get, + mlxsw_sp); + + return 0; + +err_kvdl_parts_init: + kfree(mlxsw_sp->kvdl); + return err; +} + +void mlxsw_sp_kvdl_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + + devlink_resource_occ_get_unregister(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS); + devlink_resource_occ_get_unregister(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS); + devlink_resource_occ_get_unregister(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE); + devlink_resource_occ_get_unregister(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR); + mlxsw_sp_kvdl_parts_fini(mlxsw_sp); + kfree(mlxsw_sp->kvdl); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c new file mode 100644 index 0000000..a825396 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c @@ -0,0 +1,1080 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include + +#include "spectrum_mr.h" +#include "spectrum_router.h" + +struct mlxsw_sp_mr { + const struct mlxsw_sp_mr_ops *mr_ops; + void *catchall_route_priv; + struct delayed_work stats_update_dw; + struct list_head table_list; +#define MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL 5000 /* ms */ + unsigned long priv[0]; + /* priv has to be always the last item */ +}; + +struct mlxsw_sp_mr_vif; +struct mlxsw_sp_mr_vif_ops { + bool (*is_regular)(const struct mlxsw_sp_mr_vif *vif); +}; + +struct mlxsw_sp_mr_vif { + struct net_device *dev; + const struct mlxsw_sp_rif *rif; + unsigned long vif_flags; + + /* A list of route_vif_entry structs that point to routes that the VIF + * instance is used as one of the egress VIFs + */ + struct list_head route_evif_list; + + /* A list of route_vif_entry structs that point to routes that the VIF + * instance is used as an ingress VIF + */ + struct list_head route_ivif_list; + + /* Protocol specific operations for a VIF */ + const struct mlxsw_sp_mr_vif_ops *ops; +}; + +struct mlxsw_sp_mr_route_vif_entry { + struct list_head vif_node; + struct list_head route_node; + struct mlxsw_sp_mr_vif *mr_vif; + struct mlxsw_sp_mr_route *mr_route; +}; + +struct mlxsw_sp_mr_table; +struct mlxsw_sp_mr_table_ops { + bool (*is_route_valid)(const struct mlxsw_sp_mr_table *mr_table, + const struct mr_mfc *mfc); + void (*key_create)(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_key *key, + struct mr_mfc *mfc); + bool (*is_route_starg)(const struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_mr_route *mr_route); +}; + +struct mlxsw_sp_mr_table { + struct list_head node; + enum mlxsw_sp_l3proto proto; + struct mlxsw_sp *mlxsw_sp; + u32 vr_id; + struct mlxsw_sp_mr_vif vifs[MAXVIFS]; + struct list_head route_list; + struct rhashtable route_ht; + const struct mlxsw_sp_mr_table_ops *ops; + char catchall_route_priv[0]; + /* catchall_route_priv has to be always the last item */ +}; + +struct mlxsw_sp_mr_route { + struct list_head node; + struct rhash_head ht_node; + struct mlxsw_sp_mr_route_key key; + enum mlxsw_sp_mr_route_action route_action; + u16 min_mtu; + struct mr_mfc *mfc; + void *route_priv; + const struct mlxsw_sp_mr_table *mr_table; + /* A list of route_vif_entry structs that point to the egress VIFs */ + struct list_head evif_list; + /* A route_vif_entry struct that point to the ingress VIF */ + struct mlxsw_sp_mr_route_vif_entry ivif; +}; + +static const struct rhashtable_params mlxsw_sp_mr_route_ht_params = { + .key_len = sizeof(struct mlxsw_sp_mr_route_key), + .key_offset = offsetof(struct mlxsw_sp_mr_route, key), + .head_offset = offsetof(struct mlxsw_sp_mr_route, ht_node), + .automatic_shrinking = true, +}; + +static bool mlxsw_sp_mr_vif_valid(const struct mlxsw_sp_mr_vif *vif) +{ + return vif->ops->is_regular(vif) && vif->dev && vif->rif; +} + +static bool mlxsw_sp_mr_vif_exists(const struct mlxsw_sp_mr_vif *vif) +{ + return vif->dev; +} + +static bool +mlxsw_sp_mr_route_ivif_in_evifs(const struct mlxsw_sp_mr_route *mr_route) +{ + vifi_t ivif = mr_route->mfc->mfc_parent; + + return mr_route->mfc->mfc_un.res.ttls[ivif] != 255; +} + +static int +mlxsw_sp_mr_route_valid_evifs_num(const struct mlxsw_sp_mr_route *mr_route) +{ + struct mlxsw_sp_mr_route_vif_entry *rve; + int valid_evifs; + + valid_evifs = 0; + list_for_each_entry(rve, &mr_route->evif_list, route_node) + if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) + valid_evifs++; + return valid_evifs; +} + +static enum mlxsw_sp_mr_route_action +mlxsw_sp_mr_route_action(const struct 
mlxsw_sp_mr_route *mr_route) +{ + struct mlxsw_sp_mr_route_vif_entry *rve; + + /* If the ingress port is not regular and resolved, trap the route */ + if (!mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif)) + return MLXSW_SP_MR_ROUTE_ACTION_TRAP; + + /* The kernel does not match a (*,G) route that the ingress interface is + * not one of the egress interfaces, so trap these kind of routes. + */ + if (mr_route->mr_table->ops->is_route_starg(mr_route->mr_table, + mr_route) && + !mlxsw_sp_mr_route_ivif_in_evifs(mr_route)) + return MLXSW_SP_MR_ROUTE_ACTION_TRAP; + + /* If the route has no valid eVIFs, trap it. */ + if (!mlxsw_sp_mr_route_valid_evifs_num(mr_route)) + return MLXSW_SP_MR_ROUTE_ACTION_TRAP; + + /* If one of the eVIFs has no RIF, trap-and-forward the route as there + * is some more routing to do in software too. + */ + list_for_each_entry(rve, &mr_route->evif_list, route_node) + if (mlxsw_sp_mr_vif_exists(rve->mr_vif) && !rve->mr_vif->rif) + return MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD; + + return MLXSW_SP_MR_ROUTE_ACTION_FORWARD; +} + +static enum mlxsw_sp_mr_route_prio +mlxsw_sp_mr_route_prio(const struct mlxsw_sp_mr_route *mr_route) +{ + return mr_route->mr_table->ops->is_route_starg(mr_route->mr_table, + mr_route) ? + MLXSW_SP_MR_ROUTE_PRIO_STARG : MLXSW_SP_MR_ROUTE_PRIO_SG; +} + +static int mlxsw_sp_mr_route_evif_link(struct mlxsw_sp_mr_route *mr_route, + struct mlxsw_sp_mr_vif *mr_vif) +{ + struct mlxsw_sp_mr_route_vif_entry *rve; + + rve = kzalloc(sizeof(*rve), GFP_KERNEL); + if (!rve) + return -ENOMEM; + rve->mr_route = mr_route; + rve->mr_vif = mr_vif; + list_add_tail(&rve->route_node, &mr_route->evif_list); + list_add_tail(&rve->vif_node, &mr_vif->route_evif_list); + return 0; +} + +static void +mlxsw_sp_mr_route_evif_unlink(struct mlxsw_sp_mr_route_vif_entry *rve) +{ + list_del(&rve->route_node); + list_del(&rve->vif_node); + kfree(rve); +} + +static void mlxsw_sp_mr_route_ivif_link(struct mlxsw_sp_mr_route *mr_route, + struct mlxsw_sp_mr_vif *mr_vif) +{ + mr_route->ivif.mr_route = mr_route; + mr_route->ivif.mr_vif = mr_vif; + list_add_tail(&mr_route->ivif.vif_node, &mr_vif->route_ivif_list); +} + +static void mlxsw_sp_mr_route_ivif_unlink(struct mlxsw_sp_mr_route *mr_route) +{ + list_del(&mr_route->ivif.vif_node); +} + +static int +mlxsw_sp_mr_route_info_create(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route, + struct mlxsw_sp_mr_route_info *route_info) +{ + struct mlxsw_sp_mr_route_vif_entry *rve; + u16 *erif_indices; + u16 irif_index; + u16 erif = 0; + + erif_indices = kmalloc_array(MAXVIFS, sizeof(*erif_indices), + GFP_KERNEL); + if (!erif_indices) + return -ENOMEM; + + list_for_each_entry(rve, &mr_route->evif_list, route_node) { + if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) { + u16 rifi = mlxsw_sp_rif_index(rve->mr_vif->rif); + + erif_indices[erif++] = rifi; + } + } + + if (mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif)) + irif_index = mlxsw_sp_rif_index(mr_route->ivif.mr_vif->rif); + else + irif_index = 0; + + route_info->irif_index = irif_index; + route_info->erif_indices = erif_indices; + route_info->min_mtu = mr_route->min_mtu; + route_info->route_action = mr_route->route_action; + route_info->erif_num = erif; + return 0; +} + +static void +mlxsw_sp_mr_route_info_destroy(struct mlxsw_sp_mr_route_info *route_info) +{ + kfree(route_info->erif_indices); +} + +static int mlxsw_sp_mr_route_write(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route, + bool replace) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + struct 
mlxsw_sp_mr_route_info route_info; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + int err; + + err = mlxsw_sp_mr_route_info_create(mr_table, mr_route, &route_info); + if (err) + return err; + + if (!replace) { + struct mlxsw_sp_mr_route_params route_params; + + mr_route->route_priv = kzalloc(mr->mr_ops->route_priv_size, + GFP_KERNEL); + if (!mr_route->route_priv) { + err = -ENOMEM; + goto out; + } + + route_params.key = mr_route->key; + route_params.value = route_info; + route_params.prio = mlxsw_sp_mr_route_prio(mr_route); + err = mr->mr_ops->route_create(mlxsw_sp, mr->priv, + mr_route->route_priv, + &route_params); + if (err) + kfree(mr_route->route_priv); + } else { + err = mr->mr_ops->route_update(mlxsw_sp, mr_route->route_priv, + &route_info); + } +out: + mlxsw_sp_mr_route_info_destroy(&route_info); + return err; +} + +static void mlxsw_sp_mr_route_erase(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + + mr->mr_ops->route_destroy(mlxsw_sp, mr->priv, mr_route->route_priv); + kfree(mr_route->route_priv); +} + +static struct mlxsw_sp_mr_route * +mlxsw_sp_mr_route_create(struct mlxsw_sp_mr_table *mr_table, + struct mr_mfc *mfc) +{ + struct mlxsw_sp_mr_route_vif_entry *rve, *tmp; + struct mlxsw_sp_mr_route *mr_route; + int err = 0; + int i; + + /* Allocate and init a new route and fill it with parameters */ + mr_route = kzalloc(sizeof(*mr_route), GFP_KERNEL); + if (!mr_route) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&mr_route->evif_list); + + /* Find min_mtu and link iVIF and eVIFs */ + mr_route->min_mtu = ETH_MAX_MTU; + mr_cache_hold(mfc); + mr_route->mfc = mfc; + mr_table->ops->key_create(mr_table, &mr_route->key, mr_route->mfc); + + mr_route->mr_table = mr_table; + for (i = 0; i < MAXVIFS; i++) { + if (mfc->mfc_un.res.ttls[i] != 255) { + err = mlxsw_sp_mr_route_evif_link(mr_route, + &mr_table->vifs[i]); + if (err) + goto err; + if (mr_table->vifs[i].dev && + mr_table->vifs[i].dev->mtu < mr_route->min_mtu) + mr_route->min_mtu = mr_table->vifs[i].dev->mtu; + } + } + mlxsw_sp_mr_route_ivif_link(mr_route, + &mr_table->vifs[mfc->mfc_parent]); + + mr_route->route_action = mlxsw_sp_mr_route_action(mr_route); + return mr_route; +err: + mr_cache_put(mfc); + list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node) + mlxsw_sp_mr_route_evif_unlink(rve); + kfree(mr_route); + return ERR_PTR(err); +} + +static void mlxsw_sp_mr_route_destroy(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route) +{ + struct mlxsw_sp_mr_route_vif_entry *rve, *tmp; + + mlxsw_sp_mr_route_ivif_unlink(mr_route); + mr_cache_put(mr_route->mfc); + list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node) + mlxsw_sp_mr_route_evif_unlink(rve); + kfree(mr_route); +} + +static void mlxsw_sp_mr_mfc_offload_set(struct mlxsw_sp_mr_route *mr_route, + bool offload) +{ + if (offload) + mr_route->mfc->mfc_flags |= MFC_OFFLOAD; + else + mr_route->mfc->mfc_flags &= ~MFC_OFFLOAD; +} + +static void mlxsw_sp_mr_mfc_offload_update(struct mlxsw_sp_mr_route *mr_route) +{ + bool offload; + + offload = mr_route->route_action != MLXSW_SP_MR_ROUTE_ACTION_TRAP; + mlxsw_sp_mr_mfc_offload_set(mr_route, offload); +} + +static void __mlxsw_sp_mr_route_del(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route) +{ + mlxsw_sp_mr_mfc_offload_set(mr_route, false); + mlxsw_sp_mr_route_erase(mr_table, mr_route); + rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node, + 
mlxsw_sp_mr_route_ht_params); + list_del(&mr_route->node); + mlxsw_sp_mr_route_destroy(mr_table, mr_route); +} + +int mlxsw_sp_mr_route_add(struct mlxsw_sp_mr_table *mr_table, + struct mr_mfc *mfc, bool replace) +{ + struct mlxsw_sp_mr_route *mr_orig_route = NULL; + struct mlxsw_sp_mr_route *mr_route; + int err; + + if (!mr_table->ops->is_route_valid(mr_table, mfc)) + return -EINVAL; + + /* Create a new route */ + mr_route = mlxsw_sp_mr_route_create(mr_table, mfc); + if (IS_ERR(mr_route)) + return PTR_ERR(mr_route); + + /* Find any route with a matching key */ + mr_orig_route = rhashtable_lookup_fast(&mr_table->route_ht, + &mr_route->key, + mlxsw_sp_mr_route_ht_params); + if (replace) { + /* On replace case, make the route point to the new route_priv. + */ + if (WARN_ON(!mr_orig_route)) { + err = -ENOENT; + goto err_no_orig_route; + } + mr_route->route_priv = mr_orig_route->route_priv; + } else if (mr_orig_route) { + /* On non replace case, if another route with the same key was + * found, abort, as duplicate routes are used for proxy routes. + */ + dev_warn(mr_table->mlxsw_sp->bus_info->dev, + "Offloading proxy routes is not supported.\n"); + err = -EINVAL; + goto err_duplicate_route; + } + + /* Put it in the table data-structures */ + list_add_tail(&mr_route->node, &mr_table->route_list); + err = rhashtable_insert_fast(&mr_table->route_ht, + &mr_route->ht_node, + mlxsw_sp_mr_route_ht_params); + if (err) + goto err_rhashtable_insert; + + /* Write the route to the hardware */ + err = mlxsw_sp_mr_route_write(mr_table, mr_route, replace); + if (err) + goto err_mr_route_write; + + /* Destroy the original route */ + if (replace) { + rhashtable_remove_fast(&mr_table->route_ht, + &mr_orig_route->ht_node, + mlxsw_sp_mr_route_ht_params); + list_del(&mr_orig_route->node); + mlxsw_sp_mr_route_destroy(mr_table, mr_orig_route); + } + + mlxsw_sp_mr_mfc_offload_update(mr_route); + return 0; + +err_mr_route_write: + rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node, + mlxsw_sp_mr_route_ht_params); +err_rhashtable_insert: + list_del(&mr_route->node); +err_no_orig_route: +err_duplicate_route: + mlxsw_sp_mr_route_destroy(mr_table, mr_route); + return err; +} + +void mlxsw_sp_mr_route_del(struct mlxsw_sp_mr_table *mr_table, + struct mr_mfc *mfc) +{ + struct mlxsw_sp_mr_route *mr_route; + struct mlxsw_sp_mr_route_key key; + + mr_table->ops->key_create(mr_table, &key, mfc); + mr_route = rhashtable_lookup_fast(&mr_table->route_ht, &key, + mlxsw_sp_mr_route_ht_params); + if (mr_route) + __mlxsw_sp_mr_route_del(mr_table, mr_route); +} + +/* Should be called after the VIF struct is updated */ +static int +mlxsw_sp_mr_route_ivif_resolve(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_vif_entry *rve) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + enum mlxsw_sp_mr_route_action route_action; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + u16 irif_index; + int err; + + route_action = mlxsw_sp_mr_route_action(rve->mr_route); + if (route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP) + return 0; + + /* rve->mr_vif->rif is guaranteed to be valid at this stage */ + irif_index = mlxsw_sp_rif_index(rve->mr_vif->rif); + err = mr->mr_ops->route_irif_update(mlxsw_sp, rve->mr_route->route_priv, + irif_index); + if (err) + return err; + + err = mr->mr_ops->route_action_update(mlxsw_sp, + rve->mr_route->route_priv, + route_action); + if (err) + /* No need to rollback here because the iRIF change only takes + * place after the action has been updated. 
+ */ + return err; + + rve->mr_route->route_action = route_action; + mlxsw_sp_mr_mfc_offload_update(rve->mr_route); + return 0; +} + +static void +mlxsw_sp_mr_route_ivif_unresolve(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_vif_entry *rve) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + + mr->mr_ops->route_action_update(mlxsw_sp, rve->mr_route->route_priv, + MLXSW_SP_MR_ROUTE_ACTION_TRAP); + rve->mr_route->route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP; + mlxsw_sp_mr_mfc_offload_update(rve->mr_route); +} + +/* Should be called after the RIF struct is updated */ +static int +mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_vif_entry *rve) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + enum mlxsw_sp_mr_route_action route_action; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + u16 erif_index = 0; + int err; + + /* Update the route action, as the new eVIF can be a tunnel or a pimreg + * device which will require updating the action. + */ + route_action = mlxsw_sp_mr_route_action(rve->mr_route); + if (route_action != rve->mr_route->route_action) { + err = mr->mr_ops->route_action_update(mlxsw_sp, + rve->mr_route->route_priv, + route_action); + if (err) + return err; + } + + /* Add the eRIF */ + if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) { + erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif); + err = mr->mr_ops->route_erif_add(mlxsw_sp, + rve->mr_route->route_priv, + erif_index); + if (err) + goto err_route_erif_add; + } + + /* Update the minimum MTU */ + if (rve->mr_vif->dev->mtu < rve->mr_route->min_mtu) { + rve->mr_route->min_mtu = rve->mr_vif->dev->mtu; + err = mr->mr_ops->route_min_mtu_update(mlxsw_sp, + rve->mr_route->route_priv, + rve->mr_route->min_mtu); + if (err) + goto err_route_min_mtu_update; + } + + rve->mr_route->route_action = route_action; + mlxsw_sp_mr_mfc_offload_update(rve->mr_route); + return 0; + +err_route_min_mtu_update: + if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) + mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv, + erif_index); +err_route_erif_add: + if (route_action != rve->mr_route->route_action) + mr->mr_ops->route_action_update(mlxsw_sp, + rve->mr_route->route_priv, + rve->mr_route->route_action); + return err; +} + +/* Should be called before the RIF struct is updated */ +static void +mlxsw_sp_mr_route_evif_unresolve(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_vif_entry *rve) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + enum mlxsw_sp_mr_route_action route_action; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + u16 rifi; + + /* If the unresolved RIF was not valid, no need to delete it */ + if (!mlxsw_sp_mr_vif_valid(rve->mr_vif)) + return; + + /* Update the route action: if there is only one valid eVIF in the + * route, set the action to trap as the VIF deletion will lead to zero + * valid eVIFs. On any other case, use the mlxsw_sp_mr_route_action to + * determine the route action. 
+ */ + if (mlxsw_sp_mr_route_valid_evifs_num(rve->mr_route) == 1) + route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP; + else + route_action = mlxsw_sp_mr_route_action(rve->mr_route); + if (route_action != rve->mr_route->route_action) + mr->mr_ops->route_action_update(mlxsw_sp, + rve->mr_route->route_priv, + route_action); + + /* Delete the erif from the route */ + rifi = mlxsw_sp_rif_index(rve->mr_vif->rif); + mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv, rifi); + rve->mr_route->route_action = route_action; + mlxsw_sp_mr_mfc_offload_update(rve->mr_route); +} + +static int mlxsw_sp_mr_vif_resolve(struct mlxsw_sp_mr_table *mr_table, + struct net_device *dev, + struct mlxsw_sp_mr_vif *mr_vif, + unsigned long vif_flags, + const struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_mr_route_vif_entry *irve, *erve; + int err; + + /* Update the VIF */ + mr_vif->dev = dev; + mr_vif->rif = rif; + mr_vif->vif_flags = vif_flags; + + /* Update all routes where this VIF is used as an unresolved iRIF */ + list_for_each_entry(irve, &mr_vif->route_ivif_list, vif_node) { + err = mlxsw_sp_mr_route_ivif_resolve(mr_table, irve); + if (err) + goto err_irif_unresolve; + } + + /* Update all routes where this VIF is used as an unresolved eRIF */ + list_for_each_entry(erve, &mr_vif->route_evif_list, vif_node) { + err = mlxsw_sp_mr_route_evif_resolve(mr_table, erve); + if (err) + goto err_erif_unresolve; + } + return 0; + +err_erif_unresolve: + list_for_each_entry_from_reverse(erve, &mr_vif->route_evif_list, + vif_node) + mlxsw_sp_mr_route_evif_unresolve(mr_table, erve); +err_irif_unresolve: + list_for_each_entry_from_reverse(irve, &mr_vif->route_ivif_list, + vif_node) + mlxsw_sp_mr_route_ivif_unresolve(mr_table, irve); + mr_vif->rif = NULL; + return err; +} + +static void mlxsw_sp_mr_vif_unresolve(struct mlxsw_sp_mr_table *mr_table, + struct net_device *dev, + struct mlxsw_sp_mr_vif *mr_vif) +{ + struct mlxsw_sp_mr_route_vif_entry *rve; + + /* Update all routes where this VIF is used as an unresolved eRIF */ + list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node) + mlxsw_sp_mr_route_evif_unresolve(mr_table, rve); + + /* Update all routes where this VIF is used as an unresolved iRIF */ + list_for_each_entry(rve, &mr_vif->route_ivif_list, vif_node) + mlxsw_sp_mr_route_ivif_unresolve(mr_table, rve); + + /* Update the VIF */ + mr_vif->dev = dev; + mr_vif->rif = NULL; +} + +int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table, + struct net_device *dev, vifi_t vif_index, + unsigned long vif_flags, const struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index]; + + if (WARN_ON(vif_index >= MAXVIFS)) + return -EINVAL; + if (mr_vif->dev) + return -EEXIST; + return mlxsw_sp_mr_vif_resolve(mr_table, dev, mr_vif, vif_flags, rif); +} + +void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index) +{ + struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index]; + + if (WARN_ON(vif_index >= MAXVIFS)) + return; + if (WARN_ON(!mr_vif->dev)) + return; + mlxsw_sp_mr_vif_unresolve(mr_table, NULL, mr_vif); +} + +static struct mlxsw_sp_mr_vif * +mlxsw_sp_mr_dev_vif_lookup(struct mlxsw_sp_mr_table *mr_table, + const struct net_device *dev) +{ + vifi_t vif_index; + + for (vif_index = 0; vif_index < MAXVIFS; vif_index++) + if (mr_table->vifs[vif_index].dev == dev) + return &mr_table->vifs[vif_index]; + return NULL; +} + +int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif) +{ + const struct net_device *rif_dev = 
mlxsw_sp_rif_dev(rif); + struct mlxsw_sp_mr_vif *mr_vif; + + if (!rif_dev) + return 0; + + mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev); + if (!mr_vif) + return 0; + return mlxsw_sp_mr_vif_resolve(mr_table, mr_vif->dev, mr_vif, + mr_vif->vif_flags, rif); +} + +void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif) +{ + const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif); + struct mlxsw_sp_mr_vif *mr_vif; + + if (!rif_dev) + return; + + mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev); + if (!mr_vif) + return; + mlxsw_sp_mr_vif_unresolve(mr_table, mr_vif->dev, mr_vif); +} + +void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif, int mtu) +{ + const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif); + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + struct mlxsw_sp_mr_route_vif_entry *rve; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + struct mlxsw_sp_mr_vif *mr_vif; + + if (!rif_dev) + return; + + /* Search for a VIF that use that RIF */ + mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev); + if (!mr_vif) + return; + + /* Update all the routes that uses that VIF as eVIF */ + list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node) { + if (mtu < rve->mr_route->min_mtu) { + rve->mr_route->min_mtu = mtu; + mr->mr_ops->route_min_mtu_update(mlxsw_sp, + rve->mr_route->route_priv, + mtu); + } + } +} + +/* Protocol specific functions */ +static bool +mlxsw_sp_mr_route4_validate(const struct mlxsw_sp_mr_table *mr_table, + const struct mr_mfc *c) +{ + struct mfc_cache *mfc = (struct mfc_cache *) c; + + /* If the route is a (*,*) route, abort, as these kind of routes are + * used for proxy routes. + */ + if (mfc->mfc_origin == htonl(INADDR_ANY) && + mfc->mfc_mcastgrp == htonl(INADDR_ANY)) { + dev_warn(mr_table->mlxsw_sp->bus_info->dev, + "Offloading proxy routes is not supported.\n"); + return false; + } + return true; +} + +static void mlxsw_sp_mr_route4_key(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_key *key, + struct mr_mfc *c) +{ + const struct mfc_cache *mfc = (struct mfc_cache *) c; + bool starg; + + starg = (mfc->mfc_origin == htonl(INADDR_ANY)); + + memset(key, 0, sizeof(*key)); + key->vrid = mr_table->vr_id; + key->proto = MLXSW_SP_L3_PROTO_IPV4; + key->group.addr4 = mfc->mfc_mcastgrp; + key->group_mask.addr4 = htonl(0xffffffff); + key->source.addr4 = mfc->mfc_origin; + key->source_mask.addr4 = htonl(starg ? 0 : 0xffffffff); +} + +static bool mlxsw_sp_mr_route4_starg(const struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_mr_route *mr_route) +{ + return mr_route->key.source_mask.addr4 == htonl(INADDR_ANY); +} + +static bool mlxsw_sp_mr_vif4_is_regular(const struct mlxsw_sp_mr_vif *vif) +{ + return !(vif->vif_flags & (VIFF_TUNNEL | VIFF_REGISTER)); +} + +static bool +mlxsw_sp_mr_route6_validate(const struct mlxsw_sp_mr_table *mr_table, + const struct mr_mfc *c) +{ + struct mfc6_cache *mfc = (struct mfc6_cache *) c; + + /* If the route is a (*,*) route, abort, as these kind of routes are + * used for proxy routes. 
+ */ + if (ipv6_addr_any(&mfc->mf6c_origin) && + ipv6_addr_any(&mfc->mf6c_mcastgrp)) { + dev_warn(mr_table->mlxsw_sp->bus_info->dev, + "Offloading proxy routes is not supported.\n"); + return false; + } + return true; +} + +static void mlxsw_sp_mr_route6_key(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_key *key, + struct mr_mfc *c) +{ + const struct mfc6_cache *mfc = (struct mfc6_cache *) c; + + memset(key, 0, sizeof(*key)); + key->vrid = mr_table->vr_id; + key->proto = MLXSW_SP_L3_PROTO_IPV6; + key->group.addr6 = mfc->mf6c_mcastgrp; + memset(&key->group_mask.addr6, 0xff, sizeof(key->group_mask.addr6)); + key->source.addr6 = mfc->mf6c_origin; + if (!ipv6_addr_any(&mfc->mf6c_origin)) + memset(&key->source_mask.addr6, 0xff, + sizeof(key->source_mask.addr6)); +} + +static bool mlxsw_sp_mr_route6_starg(const struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_mr_route *mr_route) +{ + return ipv6_addr_any(&mr_route->key.source_mask.addr6); +} + +static bool mlxsw_sp_mr_vif6_is_regular(const struct mlxsw_sp_mr_vif *vif) +{ + return !(vif->vif_flags & MIFF_REGISTER); +} + +static struct +mlxsw_sp_mr_vif_ops mlxsw_sp_mr_vif_ops_arr[] = { + { + .is_regular = mlxsw_sp_mr_vif4_is_regular, + }, + { + .is_regular = mlxsw_sp_mr_vif6_is_regular, + }, +}; + +static struct +mlxsw_sp_mr_table_ops mlxsw_sp_mr_table_ops_arr[] = { + { + .is_route_valid = mlxsw_sp_mr_route4_validate, + .key_create = mlxsw_sp_mr_route4_key, + .is_route_starg = mlxsw_sp_mr_route4_starg, + }, + { + .is_route_valid = mlxsw_sp_mr_route6_validate, + .key_create = mlxsw_sp_mr_route6_key, + .is_route_starg = mlxsw_sp_mr_route6_starg, + }, + +}; + +struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp, + u32 vr_id, + enum mlxsw_sp_l3proto proto) +{ + struct mlxsw_sp_mr_route_params catchall_route_params = { + .prio = MLXSW_SP_MR_ROUTE_PRIO_CATCHALL, + .key = { + .vrid = vr_id, + .proto = proto, + }, + .value = { + .route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP, + } + }; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + struct mlxsw_sp_mr_table *mr_table; + int err; + int i; + + mr_table = kzalloc(sizeof(*mr_table) + mr->mr_ops->route_priv_size, + GFP_KERNEL); + if (!mr_table) + return ERR_PTR(-ENOMEM); + + mr_table->vr_id = vr_id; + mr_table->mlxsw_sp = mlxsw_sp; + mr_table->proto = proto; + mr_table->ops = &mlxsw_sp_mr_table_ops_arr[proto]; + INIT_LIST_HEAD(&mr_table->route_list); + + err = rhashtable_init(&mr_table->route_ht, + &mlxsw_sp_mr_route_ht_params); + if (err) + goto err_route_rhashtable_init; + + for (i = 0; i < MAXVIFS; i++) { + INIT_LIST_HEAD(&mr_table->vifs[i].route_evif_list); + INIT_LIST_HEAD(&mr_table->vifs[i].route_ivif_list); + mr_table->vifs[i].ops = &mlxsw_sp_mr_vif_ops_arr[proto]; + } + + err = mr->mr_ops->route_create(mlxsw_sp, mr->priv, + mr_table->catchall_route_priv, + &catchall_route_params); + if (err) + goto err_ops_route_create; + list_add_tail(&mr_table->node, &mr->table_list); + return mr_table; + +err_ops_route_create: + rhashtable_destroy(&mr_table->route_ht); +err_route_rhashtable_init: + kfree(mr_table); + return ERR_PTR(err); +} + +void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + + WARN_ON(!mlxsw_sp_mr_table_empty(mr_table)); + list_del(&mr_table->node); + mr->mr_ops->route_destroy(mlxsw_sp, mr->priv, + &mr_table->catchall_route_priv); + rhashtable_destroy(&mr_table->route_ht); + kfree(mr_table); +} + +void mlxsw_sp_mr_table_flush(struct 
mlxsw_sp_mr_table *mr_table) +{ + struct mlxsw_sp_mr_route *mr_route, *tmp; + int i; + + list_for_each_entry_safe(mr_route, tmp, &mr_table->route_list, node) + __mlxsw_sp_mr_route_del(mr_table, mr_route); + + for (i = 0; i < MAXVIFS; i++) { + mr_table->vifs[i].dev = NULL; + mr_table->vifs[i].rif = NULL; + } +} + +bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table) +{ + int i; + + for (i = 0; i < MAXVIFS; i++) + if (mr_table->vifs[i].dev) + return false; + return list_empty(&mr_table->route_list); +} + +static void mlxsw_sp_mr_route_stats_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_route *mr_route) +{ + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + u64 packets, bytes; + + if (mr_route->route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP) + return; + + mr->mr_ops->route_stats(mlxsw_sp, mr_route->route_priv, &packets, + &bytes); + + if (mr_route->mfc->mfc_un.res.pkt != packets) + mr_route->mfc->mfc_un.res.lastuse = jiffies; + mr_route->mfc->mfc_un.res.pkt = packets; + mr_route->mfc->mfc_un.res.bytes = bytes; +} + +static void mlxsw_sp_mr_stats_update(struct work_struct *work) +{ + struct mlxsw_sp_mr *mr = container_of(work, struct mlxsw_sp_mr, + stats_update_dw.work); + struct mlxsw_sp_mr_table *mr_table; + struct mlxsw_sp_mr_route *mr_route; + unsigned long interval; + + rtnl_lock(); + list_for_each_entry(mr_table, &mr->table_list, node) + list_for_each_entry(mr_route, &mr_table->route_list, node) + mlxsw_sp_mr_route_stats_update(mr_table->mlxsw_sp, + mr_route); + rtnl_unlock(); + + interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL); + mlxsw_core_schedule_dw(&mr->stats_update_dw, interval); +} + +int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_mr_ops *mr_ops) +{ + struct mlxsw_sp_mr *mr; + unsigned long interval; + int err; + + mr = kzalloc(sizeof(*mr) + mr_ops->priv_size, GFP_KERNEL); + if (!mr) + return -ENOMEM; + mr->mr_ops = mr_ops; + mlxsw_sp->mr = mr; + INIT_LIST_HEAD(&mr->table_list); + + err = mr_ops->init(mlxsw_sp, mr->priv); + if (err) + goto err; + + /* Create the delayed work for counter updates */ + INIT_DELAYED_WORK(&mr->stats_update_dw, mlxsw_sp_mr_stats_update); + interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL); + mlxsw_core_schedule_dw(&mr->stats_update_dw, interval); + return 0; +err: + kfree(mr); + return err; +} + +void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + + cancel_delayed_work_sync(&mr->stats_update_dw); + mr->mr_ops->fini(mr->priv); + kfree(mr); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h new file mode 100644 index 0000000..7c864a8 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h @@ -0,0 +1,135 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_SPECTRUM_MCROUTER_H +#define _MLXSW_SPECTRUM_MCROUTER_H + +#include +#include +#include "spectrum_router.h" +#include "spectrum.h" + +enum mlxsw_sp_mr_route_action { + MLXSW_SP_MR_ROUTE_ACTION_FORWARD, + MLXSW_SP_MR_ROUTE_ACTION_TRAP, + MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD, +}; + +enum mlxsw_sp_mr_route_prio { + MLXSW_SP_MR_ROUTE_PRIO_SG, + MLXSW_SP_MR_ROUTE_PRIO_STARG, + MLXSW_SP_MR_ROUTE_PRIO_CATCHALL, + __MLXSW_SP_MR_ROUTE_PRIO_MAX +}; + +#define MLXSW_SP_MR_ROUTE_PRIO_MAX (__MLXSW_SP_MR_ROUTE_PRIO_MAX - 1) + +struct mlxsw_sp_mr_route_key { + int vrid; + enum mlxsw_sp_l3proto proto; + union mlxsw_sp_l3addr group; + union mlxsw_sp_l3addr group_mask; + union mlxsw_sp_l3addr source; + union mlxsw_sp_l3addr source_mask; +}; + +struct mlxsw_sp_mr_route_info { + enum mlxsw_sp_mr_route_action route_action; + u16 irif_index; + u16 *erif_indices; + size_t erif_num; + u16 min_mtu; +}; + +struct mlxsw_sp_mr_route_params { + struct mlxsw_sp_mr_route_key key; + struct mlxsw_sp_mr_route_info value; + enum mlxsw_sp_mr_route_prio prio; +}; + +struct mlxsw_sp_mr_ops { + int priv_size; + int route_priv_size; + int (*init)(struct mlxsw_sp *mlxsw_sp, void *priv); + int (*route_create)(struct mlxsw_sp *mlxsw_sp, void *priv, + void *route_priv, + struct mlxsw_sp_mr_route_params *route_params); + int (*route_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + struct mlxsw_sp_mr_route_info *route_info); + int (*route_stats)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + u64 *packets, u64 *bytes); + int (*route_action_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + enum mlxsw_sp_mr_route_action route_action); + int (*route_min_mtu_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + u16 min_mtu); + int (*route_irif_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + u16 irif_index); + int (*route_erif_add)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + u16 erif_index); + int (*route_erif_del)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + u16 erif_index); + void (*route_destroy)(struct mlxsw_sp *mlxsw_sp, void *priv, + void *route_priv); + void (*fini)(void *priv); +}; + +struct mlxsw_sp_mr; +struct mlxsw_sp_mr_table; + +int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_mr_ops *mr_ops); +void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp); +int mlxsw_sp_mr_route_add(struct mlxsw_sp_mr_table 
*mr_table, + struct mr_mfc *mfc, bool replace); +void mlxsw_sp_mr_route_del(struct mlxsw_sp_mr_table *mr_table, + struct mr_mfc *mfc); +int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table, + struct net_device *dev, vifi_t vif_index, + unsigned long vif_flags, + const struct mlxsw_sp_rif *rif); +void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index); +int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif); +void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif); +void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif, int mtu); +struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp, + u32 tb_id, + enum mlxsw_sp_l3proto proto); +void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table); +void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table); +bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c new file mode 100644 index 0000000..4f4c0d3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c @@ -0,0 +1,879 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include + +#include "spectrum_mr_tcam.h" +#include "reg.h" +#include "spectrum.h" +#include "core_acl_flex_actions.h" +#include "spectrum_mr.h" + +struct mlxsw_sp_mr_tcam_region { + struct mlxsw_sp *mlxsw_sp; + enum mlxsw_reg_rtar_key_type rtar_key_type; + struct parman *parman; + struct parman_prio *parman_prios; +}; + +struct mlxsw_sp_mr_tcam { + struct mlxsw_sp_mr_tcam_region tcam_regions[MLXSW_SP_L3_PROTO_MAX]; +}; + +/* This struct maps to one RIGR2 register entry */ +struct mlxsw_sp_mr_erif_sublist { + struct list_head list; + u32 rigr2_kvdl_index; + int num_erifs; + u16 erif_indices[MLXSW_REG_RIGR2_MAX_ERIFS]; + bool synced; +}; + +struct mlxsw_sp_mr_tcam_erif_list { + struct list_head erif_sublists; + u32 kvdl_index; +}; + +static bool +mlxsw_sp_mr_erif_sublist_full(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_erif_sublist *erif_sublist) +{ + int erif_list_entries = MLXSW_CORE_RES_GET(mlxsw_sp->core, + MC_ERIF_LIST_ENTRIES); + + return erif_sublist->num_erifs == erif_list_entries; +} + +static void +mlxsw_sp_mr_erif_list_init(struct mlxsw_sp_mr_tcam_erif_list *erif_list) +{ + INIT_LIST_HEAD(&erif_list->erif_sublists); +} + +#define MLXSW_SP_KVDL_RIGR2_SIZE 1 + +static struct mlxsw_sp_mr_erif_sublist * +mlxsw_sp_mr_erif_sublist_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_erif_list *erif_list) +{ + struct mlxsw_sp_mr_erif_sublist *erif_sublist; + int err; + + erif_sublist = kzalloc(sizeof(*erif_sublist), GFP_KERNEL); + if (!erif_sublist) + return ERR_PTR(-ENOMEM); + err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_RIGR2_SIZE, + &erif_sublist->rigr2_kvdl_index); + if (err) { + kfree(erif_sublist); + return ERR_PTR(err); + } + + list_add_tail(&erif_sublist->list, &erif_list->erif_sublists); + return erif_sublist; +} + +static void +mlxsw_sp_mr_erif_sublist_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_erif_sublist *erif_sublist) +{ + list_del(&erif_sublist->list); + mlxsw_sp_kvdl_free(mlxsw_sp, erif_sublist->rigr2_kvdl_index); + kfree(erif_sublist); +} + +static int +mlxsw_sp_mr_erif_list_add(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_erif_list *erif_list, + u16 erif_index) +{ + struct mlxsw_sp_mr_erif_sublist *sublist; + + /* If either there is no erif_entry or the last one is full, allocate a + * new one. 
+ */ + if (list_empty(&erif_list->erif_sublists)) { + sublist = mlxsw_sp_mr_erif_sublist_create(mlxsw_sp, erif_list); + if (IS_ERR(sublist)) + return PTR_ERR(sublist); + erif_list->kvdl_index = sublist->rigr2_kvdl_index; + } else { + sublist = list_last_entry(&erif_list->erif_sublists, + struct mlxsw_sp_mr_erif_sublist, + list); + sublist->synced = false; + if (mlxsw_sp_mr_erif_sublist_full(mlxsw_sp, sublist)) { + sublist = mlxsw_sp_mr_erif_sublist_create(mlxsw_sp, + erif_list); + if (IS_ERR(sublist)) + return PTR_ERR(sublist); + } + } + + /* Add the eRIF to the last entry's last index */ + sublist->erif_indices[sublist->num_erifs++] = erif_index; + return 0; +} + +static void +mlxsw_sp_mr_erif_list_flush(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_erif_list *erif_list) +{ + struct mlxsw_sp_mr_erif_sublist *erif_sublist, *tmp; + + list_for_each_entry_safe(erif_sublist, tmp, &erif_list->erif_sublists, + list) + mlxsw_sp_mr_erif_sublist_destroy(mlxsw_sp, erif_sublist); +} + +static int +mlxsw_sp_mr_erif_list_commit(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_erif_list *erif_list) +{ + struct mlxsw_sp_mr_erif_sublist *curr_sublist; + char rigr2_pl[MLXSW_REG_RIGR2_LEN]; + int err; + int i; + + list_for_each_entry(curr_sublist, &erif_list->erif_sublists, list) { + if (curr_sublist->synced) + continue; + + /* If the sublist is not the last one, pack the next index */ + if (list_is_last(&curr_sublist->list, + &erif_list->erif_sublists)) { + mlxsw_reg_rigr2_pack(rigr2_pl, + curr_sublist->rigr2_kvdl_index, + false, 0); + } else { + struct mlxsw_sp_mr_erif_sublist *next_sublist; + + next_sublist = list_next_entry(curr_sublist, list); + mlxsw_reg_rigr2_pack(rigr2_pl, + curr_sublist->rigr2_kvdl_index, + true, + next_sublist->rigr2_kvdl_index); + } + + /* Pack all the erifs */ + for (i = 0; i < curr_sublist->num_erifs; i++) { + u16 erif_index = curr_sublist->erif_indices[i]; + + mlxsw_reg_rigr2_erif_entry_pack(rigr2_pl, i, true, + erif_index); + } + + /* Write the entry */ + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rigr2), + rigr2_pl); + if (err) + /* No need of a rollback here because this + * hardware entry should not be pointed yet. 
+ */ + return err; + curr_sublist->synced = true; + } + return 0; +} + +static void mlxsw_sp_mr_erif_list_move(struct mlxsw_sp_mr_tcam_erif_list *to, + struct mlxsw_sp_mr_tcam_erif_list *from) +{ + list_splice(&from->erif_sublists, &to->erif_sublists); + to->kvdl_index = from->kvdl_index; +} + +struct mlxsw_sp_mr_tcam_route { + struct mlxsw_sp_mr_tcam_erif_list erif_list; + struct mlxsw_afa_block *afa_block; + u32 counter_index; + struct parman_item parman_item; + struct parman_prio *parman_prio; + enum mlxsw_sp_mr_route_action action; + struct mlxsw_sp_mr_route_key key; + u16 irif_index; + u16 min_mtu; +}; + +static struct mlxsw_afa_block * +mlxsw_sp_mr_tcam_afa_block_create(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_mr_route_action route_action, + u16 irif_index, u32 counter_index, + u16 min_mtu, + struct mlxsw_sp_mr_tcam_erif_list *erif_list) +{ + struct mlxsw_afa_block *afa_block; + int err; + + afa_block = mlxsw_afa_block_create(mlxsw_sp->afa); + if (!afa_block) + return ERR_PTR(-ENOMEM); + + err = mlxsw_afa_block_append_allocated_counter(afa_block, + counter_index); + if (err) + goto err; + + switch (route_action) { + case MLXSW_SP_MR_ROUTE_ACTION_TRAP: + err = mlxsw_afa_block_append_trap(afa_block, + MLXSW_TRAP_ID_ACL1); + if (err) + goto err; + break; + case MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD: + case MLXSW_SP_MR_ROUTE_ACTION_FORWARD: + /* If we are about to append a multicast router action, commit + * the erif_list. + */ + err = mlxsw_sp_mr_erif_list_commit(mlxsw_sp, erif_list); + if (err) + goto err; + + err = mlxsw_afa_block_append_mcrouter(afa_block, irif_index, + min_mtu, false, + erif_list->kvdl_index); + if (err) + goto err; + + if (route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD) { + err = mlxsw_afa_block_append_trap_and_forward(afa_block, + MLXSW_TRAP_ID_ACL2); + if (err) + goto err; + } + break; + default: + err = -EINVAL; + goto err; + } + + err = mlxsw_afa_block_commit(afa_block); + if (err) + goto err; + return afa_block; +err: + mlxsw_afa_block_destroy(afa_block); + return ERR_PTR(err); +} + +static void +mlxsw_sp_mr_tcam_afa_block_destroy(struct mlxsw_afa_block *afa_block) +{ + mlxsw_afa_block_destroy(afa_block); +} + +static int mlxsw_sp_mr_tcam_route_replace(struct mlxsw_sp *mlxsw_sp, + struct parman_item *parman_item, + struct mlxsw_sp_mr_route_key *key, + struct mlxsw_afa_block *afa_block) +{ + char rmft2_pl[MLXSW_REG_RMFT2_LEN]; + + switch (key->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + mlxsw_reg_rmft2_ipv4_pack(rmft2_pl, true, parman_item->index, + key->vrid, + MLXSW_REG_RMFT2_IRIF_MASK_IGNORE, 0, + ntohl(key->group.addr4), + ntohl(key->group_mask.addr4), + ntohl(key->source.addr4), + ntohl(key->source_mask.addr4), + mlxsw_afa_block_first_set(afa_block)); + break; + case MLXSW_SP_L3_PROTO_IPV6: + mlxsw_reg_rmft2_ipv6_pack(rmft2_pl, true, parman_item->index, + key->vrid, + MLXSW_REG_RMFT2_IRIF_MASK_IGNORE, 0, + key->group.addr6, + key->group_mask.addr6, + key->source.addr6, + key->source_mask.addr6, + mlxsw_afa_block_first_set(afa_block)); + } + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rmft2), rmft2_pl); +} + +static int mlxsw_sp_mr_tcam_route_remove(struct mlxsw_sp *mlxsw_sp, int vrid, + struct mlxsw_sp_mr_route_key *key, + struct parman_item *parman_item) +{ + struct in6_addr zero_addr = IN6ADDR_ANY_INIT; + char rmft2_pl[MLXSW_REG_RMFT2_LEN]; + + switch (key->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + mlxsw_reg_rmft2_ipv4_pack(rmft2_pl, false, parman_item->index, + vrid, 0, 0, 0, 0, 0, 0, NULL); + break; + case MLXSW_SP_L3_PROTO_IPV6: 
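+ /* Mirror the IPv4 case above: removal re-writes the TCAM entry as inactive, with zeroed match keys and no action set. */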
+ mlxsw_reg_rmft2_ipv6_pack(rmft2_pl, false, parman_item->index, + vrid, 0, 0, zero_addr, zero_addr, + zero_addr, zero_addr, NULL); + break; + } + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rmft2), rmft2_pl); +} + +static int +mlxsw_sp_mr_tcam_erif_populate(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_erif_list *erif_list, + struct mlxsw_sp_mr_route_info *route_info) +{ + int err; + int i; + + for (i = 0; i < route_info->erif_num; i++) { + u16 erif_index = route_info->erif_indices[i]; + + err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, erif_list, + erif_index); + if (err) + return err; + } + return 0; +} + +static struct mlxsw_sp_mr_tcam_region * +mlxsw_sp_mr_tcam_protocol_region(struct mlxsw_sp_mr_tcam *mr_tcam, + enum mlxsw_sp_l3proto proto) +{ + return &mr_tcam->tcam_regions[proto]; +} + +static int +mlxsw_sp_mr_tcam_route_parman_item_add(struct mlxsw_sp_mr_tcam *mr_tcam, + struct mlxsw_sp_mr_tcam_route *route, + enum mlxsw_sp_mr_route_prio prio) +{ + struct mlxsw_sp_mr_tcam_region *tcam_region; + int err; + + tcam_region = mlxsw_sp_mr_tcam_protocol_region(mr_tcam, + route->key.proto); + err = parman_item_add(tcam_region->parman, + &tcam_region->parman_prios[prio], + &route->parman_item); + if (err) + return err; + + route->parman_prio = &tcam_region->parman_prios[prio]; + return 0; +} + +static void +mlxsw_sp_mr_tcam_route_parman_item_remove(struct mlxsw_sp_mr_tcam *mr_tcam, + struct mlxsw_sp_mr_tcam_route *route) +{ + struct mlxsw_sp_mr_tcam_region *tcam_region; + + tcam_region = mlxsw_sp_mr_tcam_protocol_region(mr_tcam, + route->key.proto); + + parman_item_remove(tcam_region->parman, + route->parman_prio, &route->parman_item); +} + +static int +mlxsw_sp_mr_tcam_route_create(struct mlxsw_sp *mlxsw_sp, void *priv, + void *route_priv, + struct mlxsw_sp_mr_route_params *route_params) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_sp_mr_tcam *mr_tcam = priv; + int err; + + route->key = route_params->key; + route->irif_index = route_params->value.irif_index; + route->min_mtu = route_params->value.min_mtu; + route->action = route_params->value.route_action; + + /* Create the egress RIFs list */ + mlxsw_sp_mr_erif_list_init(&route->erif_list); + err = mlxsw_sp_mr_tcam_erif_populate(mlxsw_sp, &route->erif_list, + &route_params->value); + if (err) + goto err_erif_populate; + + /* Create the flow counter */ + err = mlxsw_sp_flow_counter_alloc(mlxsw_sp, &route->counter_index); + if (err) + goto err_counter_alloc; + + /* Create the flexible action block */ + route->afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, + route->action, + route->irif_index, + route->counter_index, + route->min_mtu, + &route->erif_list); + if (IS_ERR(route->afa_block)) { + err = PTR_ERR(route->afa_block); + goto err_afa_block_create; + } + + /* Allocate place in the TCAM */ + err = mlxsw_sp_mr_tcam_route_parman_item_add(mr_tcam, route, + route_params->prio); + if (err) + goto err_parman_item_add; + + /* Write the route to the TCAM */ + err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item, + &route->key, route->afa_block); + if (err) + goto err_route_replace; + return 0; + +err_route_replace: + mlxsw_sp_mr_tcam_route_parman_item_remove(mr_tcam, route); +err_parman_item_add: + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); +err_afa_block_create: + mlxsw_sp_flow_counter_free(mlxsw_sp, route->counter_index); +err_erif_populate: +err_counter_alloc: + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list); + return err; +} + +static void 
mlxsw_sp_mr_tcam_route_destroy(struct mlxsw_sp *mlxsw_sp, + void *priv, void *route_priv) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_sp_mr_tcam *mr_tcam = priv; + + mlxsw_sp_mr_tcam_route_remove(mlxsw_sp, route->key.vrid, + &route->key, &route->parman_item); + mlxsw_sp_mr_tcam_route_parman_item_remove(mr_tcam, route); + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); + mlxsw_sp_flow_counter_free(mlxsw_sp, route->counter_index); + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list); +} + +static int mlxsw_sp_mr_tcam_route_stats(struct mlxsw_sp *mlxsw_sp, + void *route_priv, u64 *packets, + u64 *bytes) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + + return mlxsw_sp_flow_counter_get(mlxsw_sp, route->counter_index, + packets, bytes); +} + +static int +mlxsw_sp_mr_tcam_route_action_update(struct mlxsw_sp *mlxsw_sp, + void *route_priv, + enum mlxsw_sp_mr_route_action route_action) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_afa_block *afa_block; + int err; + + /* Create a new flexible action block */ + afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, route_action, + route->irif_index, + route->counter_index, + route->min_mtu, + &route->erif_list); + if (IS_ERR(afa_block)) + return PTR_ERR(afa_block); + + /* Update the TCAM route entry */ + err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item, + &route->key, afa_block); + if (err) + goto err; + + /* Delete the old one */ + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); + route->afa_block = afa_block; + route->action = route_action; + return 0; +err: + mlxsw_sp_mr_tcam_afa_block_destroy(afa_block); + return err; +} + +static int mlxsw_sp_mr_tcam_route_min_mtu_update(struct mlxsw_sp *mlxsw_sp, + void *route_priv, u16 min_mtu) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_afa_block *afa_block; + int err; + + /* Create a new flexible action block */ + afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, + route->action, + route->irif_index, + route->counter_index, + min_mtu, + &route->erif_list); + if (IS_ERR(afa_block)) + return PTR_ERR(afa_block); + + /* Update the TCAM route entry */ + err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item, + &route->key, afa_block); + if (err) + goto err; + + /* Delete the old one */ + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); + route->afa_block = afa_block; + route->min_mtu = min_mtu; + return 0; +err: + mlxsw_sp_mr_tcam_afa_block_destroy(afa_block); + return err; +} + +static int mlxsw_sp_mr_tcam_route_irif_update(struct mlxsw_sp *mlxsw_sp, + void *route_priv, u16 irif_index) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + + if (route->action != MLXSW_SP_MR_ROUTE_ACTION_TRAP) + return -EINVAL; + route->irif_index = irif_index; + return 0; +} + +static int mlxsw_sp_mr_tcam_route_erif_add(struct mlxsw_sp *mlxsw_sp, + void *route_priv, u16 erif_index) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + int err; + + err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, &route->erif_list, + erif_index); + if (err) + return err; + + /* Commit the action only if the route action is not TRAP */ + if (route->action != MLXSW_SP_MR_ROUTE_ACTION_TRAP) + return mlxsw_sp_mr_erif_list_commit(mlxsw_sp, + &route->erif_list); + return 0; +} + +static int mlxsw_sp_mr_tcam_route_erif_del(struct mlxsw_sp *mlxsw_sp, + void *route_priv, u16 erif_index) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_sp_mr_erif_sublist *erif_sublist; + struct 
mlxsw_sp_mr_tcam_erif_list erif_list; + struct mlxsw_afa_block *afa_block; + int err; + int i; + + /* Create a copy of the original erif_list without the deleted entry */ + mlxsw_sp_mr_erif_list_init(&erif_list); + list_for_each_entry(erif_sublist, &route->erif_list.erif_sublists, list) { + for (i = 0; i < erif_sublist->num_erifs; i++) { + u16 curr_erif = erif_sublist->erif_indices[i]; + + if (curr_erif == erif_index) + continue; + err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, &erif_list, + curr_erif); + if (err) + goto err_erif_list_add; + } + } + + /* Create the flexible action block pointing to the new erif_list */ + afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, route->action, + route->irif_index, + route->counter_index, + route->min_mtu, + &erif_list); + if (IS_ERR(afa_block)) { + err = PTR_ERR(afa_block); + goto err_afa_block_create; + } + + /* Update the TCAM route entry */ + err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item, + &route->key, afa_block); + if (err) + goto err_route_write; + + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list); + route->afa_block = afa_block; + mlxsw_sp_mr_erif_list_move(&route->erif_list, &erif_list); + return 0; + +err_route_write: + mlxsw_sp_mr_tcam_afa_block_destroy(afa_block); +err_afa_block_create: +err_erif_list_add: + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &erif_list); + return err; +} + +static int +mlxsw_sp_mr_tcam_route_update(struct mlxsw_sp *mlxsw_sp, void *route_priv, + struct mlxsw_sp_mr_route_info *route_info) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_sp_mr_tcam_erif_list erif_list; + struct mlxsw_afa_block *afa_block; + int err; + + /* Create a new erif_list */ + mlxsw_sp_mr_erif_list_init(&erif_list); + err = mlxsw_sp_mr_tcam_erif_populate(mlxsw_sp, &erif_list, route_info); + if (err) + goto err_erif_populate; + + /* Create the flexible action block pointing to the new erif_list */ + afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, + route_info->route_action, + route_info->irif_index, + route->counter_index, + route_info->min_mtu, + &erif_list); + if (IS_ERR(afa_block)) { + err = PTR_ERR(afa_block); + goto err_afa_block_create; + } + + /* Update the TCAM route entry */ + err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item, + &route->key, afa_block); + if (err) + goto err_route_write; + + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list); + route->afa_block = afa_block; + mlxsw_sp_mr_erif_list_move(&route->erif_list, &erif_list); + route->action = route_info->route_action; + route->irif_index = route_info->irif_index; + route->min_mtu = route_info->min_mtu; + return 0; + +err_route_write: + mlxsw_sp_mr_tcam_afa_block_destroy(afa_block); +err_afa_block_create: +err_erif_populate: + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &erif_list); + return err; +} + +#define MLXSW_SP_MR_TCAM_REGION_BASE_COUNT 16 +#define MLXSW_SP_MR_TCAM_REGION_RESIZE_STEP 16 + +static int +mlxsw_sp_mr_tcam_region_alloc(struct mlxsw_sp_mr_tcam_region *mr_tcam_region) +{ + struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp; + char rtar_pl[MLXSW_REG_RTAR_LEN]; + + mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_ALLOCATE, + mr_tcam_region->rtar_key_type, + MLXSW_SP_MR_TCAM_REGION_BASE_COUNT); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl); +} + +static void +mlxsw_sp_mr_tcam_region_free(struct mlxsw_sp_mr_tcam_region *mr_tcam_region) +{ + struct mlxsw_sp 
*mlxsw_sp = mr_tcam_region->mlxsw_sp; + char rtar_pl[MLXSW_REG_RTAR_LEN]; + + mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_DEALLOCATE, + mr_tcam_region->rtar_key_type, 0); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl); +} + +static int mlxsw_sp_mr_tcam_region_parman_resize(void *priv, + unsigned long new_count) +{ + struct mlxsw_sp_mr_tcam_region *mr_tcam_region = priv; + struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp; + char rtar_pl[MLXSW_REG_RTAR_LEN]; + u64 max_tcam_rules; + + max_tcam_rules = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_TCAM_RULES); + if (new_count > max_tcam_rules) + return -EINVAL; + mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_RESIZE, + mr_tcam_region->rtar_key_type, new_count); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl); +} + +static void mlxsw_sp_mr_tcam_region_parman_move(void *priv, + unsigned long from_index, + unsigned long to_index, + unsigned long count) +{ + struct mlxsw_sp_mr_tcam_region *mr_tcam_region = priv; + struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp; + char rrcr_pl[MLXSW_REG_RRCR_LEN]; + + mlxsw_reg_rrcr_pack(rrcr_pl, MLXSW_REG_RRCR_OP_MOVE, + from_index, count, + mr_tcam_region->rtar_key_type, to_index); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rrcr), rrcr_pl); +} + +static const struct parman_ops mlxsw_sp_mr_tcam_region_parman_ops = { + .base_count = MLXSW_SP_MR_TCAM_REGION_BASE_COUNT, + .resize_step = MLXSW_SP_MR_TCAM_REGION_RESIZE_STEP, + .resize = mlxsw_sp_mr_tcam_region_parman_resize, + .move = mlxsw_sp_mr_tcam_region_parman_move, + .algo = PARMAN_ALGO_TYPE_LSORT, +}; + +static int +mlxsw_sp_mr_tcam_region_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_region *mr_tcam_region, + enum mlxsw_reg_rtar_key_type rtar_key_type) +{ + struct parman_prio *parman_prios; + struct parman *parman; + int err; + int i; + + mr_tcam_region->rtar_key_type = rtar_key_type; + mr_tcam_region->mlxsw_sp = mlxsw_sp; + + err = mlxsw_sp_mr_tcam_region_alloc(mr_tcam_region); + if (err) + return err; + + parman = parman_create(&mlxsw_sp_mr_tcam_region_parman_ops, + mr_tcam_region); + if (!parman) { + err = -ENOMEM; + goto err_parman_create; + } + mr_tcam_region->parman = parman; + + parman_prios = kmalloc_array(MLXSW_SP_MR_ROUTE_PRIO_MAX + 1, + sizeof(*parman_prios), GFP_KERNEL); + if (!parman_prios) { + err = -ENOMEM; + goto err_parman_prios_alloc; + } + mr_tcam_region->parman_prios = parman_prios; + + for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++) + parman_prio_init(mr_tcam_region->parman, + &mr_tcam_region->parman_prios[i], i); + return 0; + +err_parman_prios_alloc: + parman_destroy(parman); +err_parman_create: + mlxsw_sp_mr_tcam_region_free(mr_tcam_region); + return err; +} + +static void +mlxsw_sp_mr_tcam_region_fini(struct mlxsw_sp_mr_tcam_region *mr_tcam_region) +{ + int i; + + for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++) + parman_prio_fini(&mr_tcam_region->parman_prios[i]); + kfree(mr_tcam_region->parman_prios); + parman_destroy(mr_tcam_region->parman); + mlxsw_sp_mr_tcam_region_free(mr_tcam_region); +} + +static int mlxsw_sp_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv) +{ + struct mlxsw_sp_mr_tcam *mr_tcam = priv; + struct mlxsw_sp_mr_tcam_region *region = &mr_tcam->tcam_regions[0]; + u32 rtar_key; + int err; + + if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MC_ERIF_LIST_ENTRIES) || + !MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_MAX_TCAM_RULES)) + return -EIO; + + rtar_key = MLXSW_REG_RTAR_KEY_TYPE_IPV4_MULTICAST; + err = mlxsw_sp_mr_tcam_region_init(mlxsw_sp, + 
&region[MLXSW_SP_L3_PROTO_IPV4], + rtar_key); + if (err) + return err; + + rtar_key = MLXSW_REG_RTAR_KEY_TYPE_IPV6_MULTICAST; + err = mlxsw_sp_mr_tcam_region_init(mlxsw_sp, + &region[MLXSW_SP_L3_PROTO_IPV6], + rtar_key); + if (err) + goto err_ipv6_region_init; + + return 0; + +err_ipv6_region_init: + mlxsw_sp_mr_tcam_region_fini(&region[MLXSW_SP_L3_PROTO_IPV4]); + return err; +} + +static void mlxsw_sp_mr_tcam_fini(void *priv) +{ + struct mlxsw_sp_mr_tcam *mr_tcam = priv; + struct mlxsw_sp_mr_tcam_region *region = &mr_tcam->tcam_regions[0]; + + mlxsw_sp_mr_tcam_region_fini(&region[MLXSW_SP_L3_PROTO_IPV6]); + mlxsw_sp_mr_tcam_region_fini(&region[MLXSW_SP_L3_PROTO_IPV4]); +} + +const struct mlxsw_sp_mr_ops mlxsw_sp_mr_tcam_ops = { + .priv_size = sizeof(struct mlxsw_sp_mr_tcam), + .route_priv_size = sizeof(struct mlxsw_sp_mr_tcam_route), + .init = mlxsw_sp_mr_tcam_init, + .route_create = mlxsw_sp_mr_tcam_route_create, + .route_update = mlxsw_sp_mr_tcam_route_update, + .route_stats = mlxsw_sp_mr_tcam_route_stats, + .route_action_update = mlxsw_sp_mr_tcam_route_action_update, + .route_min_mtu_update = mlxsw_sp_mr_tcam_route_min_mtu_update, + .route_irif_update = mlxsw_sp_mr_tcam_route_irif_update, + .route_erif_add = mlxsw_sp_mr_tcam_route_erif_add, + .route_erif_del = mlxsw_sp_mr_tcam_route_erif_del, + .route_destroy = mlxsw_sp_mr_tcam_route_destroy, + .fini = mlxsw_sp_mr_tcam_fini, +}; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h new file mode 100644 index 0000000..f9b59ee --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h @@ -0,0 +1,43 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#ifndef _MLXSW_SPECTRUM_MCROUTER_TCAM_H +#define _MLXSW_SPECTRUM_MCROUTER_TCAM_H + +#include "spectrum.h" +#include "spectrum_mr.h" + +extern const struct mlxsw_sp_mr_ops mlxsw_sp_mr_tcam_ops; + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c new file mode 100644 index 0000000..91262b0 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c @@ -0,0 +1,764 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Nogah Frankel + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#include "spectrum.h" +#include "reg.h" + +#define MLXSW_SP_PRIO_BAND_TO_TCLASS(band) (IEEE_8021QAZ_MAX_TCS - band - 1) +#define MLXSW_SP_PRIO_CHILD_TO_TCLASS(child) \ + MLXSW_SP_PRIO_BAND_TO_TCLASS((child - 1)) + +enum mlxsw_sp_qdisc_type { + MLXSW_SP_QDISC_NO_QDISC, + MLXSW_SP_QDISC_RED, + MLXSW_SP_QDISC_PRIO, +}; + +struct mlxsw_sp_qdisc_ops { + enum mlxsw_sp_qdisc_type type; + int (*check_params)(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params); + int (*replace)(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, void *params); + int (*destroy)(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc); + int (*get_stats)(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + struct tc_qopt_offload_stats *stats_ptr); + int (*get_xstats)(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *xstats_ptr); + void (*clean_stats)(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc); + /* unoffload - to be used for a qdisc that stops being offloaded without + * being destroyed. 
+ */ + void (*unoffload)(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, void *params); +}; + +struct mlxsw_sp_qdisc { + u32 handle; + u8 tclass_num; + u8 prio_bitmap; + union { + struct red_stats red; + } xstats_base; + struct mlxsw_sp_qdisc_stats { + u64 tx_bytes; + u64 tx_packets; + u64 drops; + u64 overlimits; + u64 backlog; + } stats_base; + + struct mlxsw_sp_qdisc_ops *ops; +}; + +static bool +mlxsw_sp_qdisc_compare(struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, u32 handle, + enum mlxsw_sp_qdisc_type type) +{ + return mlxsw_sp_qdisc && mlxsw_sp_qdisc->ops && + mlxsw_sp_qdisc->ops->type == type && + mlxsw_sp_qdisc->handle == handle; +} + +static struct mlxsw_sp_qdisc * +mlxsw_sp_qdisc_find(struct mlxsw_sp_port *mlxsw_sp_port, u32 parent, + bool root_only) +{ + int tclass, child_index; + + if (parent == TC_H_ROOT) + return mlxsw_sp_port->root_qdisc; + + if (root_only || !mlxsw_sp_port->root_qdisc || + !mlxsw_sp_port->root_qdisc->ops || + TC_H_MAJ(parent) != mlxsw_sp_port->root_qdisc->handle || + TC_H_MIN(parent) > IEEE_8021QAZ_MAX_TCS) + return NULL; + + child_index = TC_H_MIN(parent); + tclass = MLXSW_SP_PRIO_CHILD_TO_TCLASS(child_index); + return &mlxsw_sp_port->tclass_qdiscs[tclass]; +} + +static struct mlxsw_sp_qdisc * +mlxsw_sp_qdisc_find_by_handle(struct mlxsw_sp_port *mlxsw_sp_port, u32 handle) +{ + int i; + + if (mlxsw_sp_port->root_qdisc->handle == handle) + return mlxsw_sp_port->root_qdisc; + + if (mlxsw_sp_port->root_qdisc->handle == TC_H_UNSPEC) + return NULL; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + if (mlxsw_sp_port->tclass_qdiscs[i].handle == handle) + return &mlxsw_sp_port->tclass_qdiscs[i]; + + return NULL; +} + +static int +mlxsw_sp_qdisc_destroy(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc) +{ + int err = 0; + + if (!mlxsw_sp_qdisc) + return 0; + + if (mlxsw_sp_qdisc->ops && mlxsw_sp_qdisc->ops->destroy) + err = mlxsw_sp_qdisc->ops->destroy(mlxsw_sp_port, + mlxsw_sp_qdisc); + + mlxsw_sp_qdisc->handle = TC_H_UNSPEC; + mlxsw_sp_qdisc->ops = NULL; + return err; +} + +static int +mlxsw_sp_qdisc_replace(struct mlxsw_sp_port *mlxsw_sp_port, u32 handle, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + struct mlxsw_sp_qdisc_ops *ops, void *params) +{ + int err; + + if (mlxsw_sp_qdisc->ops && mlxsw_sp_qdisc->ops->type != ops->type) + /* In case this location contained a different qdisc of the + * same type we can override the old qdisc configuration. + * Otherwise, we need to remove the old qdisc before setting the + * new one. 
+ */ + mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc); + err = ops->check_params(mlxsw_sp_port, mlxsw_sp_qdisc, params); + if (err) + goto err_bad_param; + + err = ops->replace(mlxsw_sp_port, mlxsw_sp_qdisc, params); + if (err) + goto err_config; + + if (mlxsw_sp_qdisc->handle != handle) { + mlxsw_sp_qdisc->ops = ops; + if (ops->clean_stats) + ops->clean_stats(mlxsw_sp_port, mlxsw_sp_qdisc); + } + + mlxsw_sp_qdisc->handle = handle; + return 0; + +err_bad_param: +err_config: + if (mlxsw_sp_qdisc->handle == handle && ops->unoffload) + ops->unoffload(mlxsw_sp_port, mlxsw_sp_qdisc, params); + + mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc); + return err; +} + +static int +mlxsw_sp_qdisc_get_stats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + struct tc_qopt_offload_stats *stats_ptr) +{ + if (mlxsw_sp_qdisc && mlxsw_sp_qdisc->ops && + mlxsw_sp_qdisc->ops->get_stats) + return mlxsw_sp_qdisc->ops->get_stats(mlxsw_sp_port, + mlxsw_sp_qdisc, + stats_ptr); + + return -EOPNOTSUPP; +} + +static int +mlxsw_sp_qdisc_get_xstats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *xstats_ptr) +{ + if (mlxsw_sp_qdisc && mlxsw_sp_qdisc->ops && + mlxsw_sp_qdisc->ops->get_xstats) + return mlxsw_sp_qdisc->ops->get_xstats(mlxsw_sp_port, + mlxsw_sp_qdisc, + xstats_ptr); + + return -EOPNOTSUPP; +} + +static void +mlxsw_sp_qdisc_bstats_per_priority_get(struct mlxsw_sp_port_xstats *xstats, + u8 prio_bitmap, u64 *tx_packets, + u64 *tx_bytes) +{ + int i; + + *tx_packets = 0; + *tx_bytes = 0; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + if (prio_bitmap & BIT(i)) { + *tx_packets += xstats->tx_packets[i]; + *tx_bytes += xstats->tx_bytes[i]; + } + } +} + +static int +mlxsw_sp_tclass_congestion_enable(struct mlxsw_sp_port *mlxsw_sp_port, + int tclass_num, u32 min, u32 max, + u32 probability, bool is_ecn) +{ + char cwtpm_cmd[MLXSW_REG_CWTPM_LEN]; + char cwtp_cmd[MLXSW_REG_CWTP_LEN]; + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + int err; + + mlxsw_reg_cwtp_pack(cwtp_cmd, mlxsw_sp_port->local_port, tclass_num); + mlxsw_reg_cwtp_profile_pack(cwtp_cmd, MLXSW_REG_CWTP_DEFAULT_PROFILE, + roundup(min, MLXSW_REG_CWTP_MIN_VALUE), + roundup(max, MLXSW_REG_CWTP_MIN_VALUE), + probability); + + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtp), cwtp_cmd); + if (err) + return err; + + mlxsw_reg_cwtpm_pack(cwtpm_cmd, mlxsw_sp_port->local_port, tclass_num, + MLXSW_REG_CWTP_DEFAULT_PROFILE, true, is_ecn); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtpm), cwtpm_cmd); +} + +static int +mlxsw_sp_tclass_congestion_disable(struct mlxsw_sp_port *mlxsw_sp_port, + int tclass_num) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char cwtpm_cmd[MLXSW_REG_CWTPM_LEN]; + + mlxsw_reg_cwtpm_pack(cwtpm_cmd, mlxsw_sp_port->local_port, tclass_num, + MLXSW_REG_CWTPM_RESET_PROFILE, false, false); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtpm), cwtpm_cmd); +} + +static void +mlxsw_sp_setup_tc_qdisc_red_clean_stats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc) +{ + u8 tclass_num = mlxsw_sp_qdisc->tclass_num; + struct mlxsw_sp_qdisc_stats *stats_base; + struct mlxsw_sp_port_xstats *xstats; + struct red_stats *red_base; + + xstats = &mlxsw_sp_port->periodic_hw_stats.xstats; + stats_base = &mlxsw_sp_qdisc->stats_base; + red_base = &mlxsw_sp_qdisc->xstats_base.red; + + mlxsw_sp_qdisc_bstats_per_priority_get(xstats, + mlxsw_sp_qdisc->prio_bitmap, + &stats_base->tx_packets, + &stats_base->tx_bytes); + 
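/* Snapshot the current hardware ECN/WRED/tail-drop counters as the base line, so later get_stats/get_xstats calls report only the delta accumulated since this reset. */ +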
red_base->prob_mark = xstats->ecn; + red_base->prob_drop = xstats->wred_drop[tclass_num]; + red_base->pdrop = xstats->tail_drop[tclass_num]; + + stats_base->overlimits = red_base->prob_drop + red_base->prob_mark; + stats_base->drops = red_base->prob_drop + red_base->pdrop; + + stats_base->backlog = 0; +} + +static int +mlxsw_sp_qdisc_red_destroy(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc) +{ + struct mlxsw_sp_qdisc *root_qdisc = mlxsw_sp_port->root_qdisc; + + if (root_qdisc != mlxsw_sp_qdisc) + root_qdisc->stats_base.backlog -= + mlxsw_sp_qdisc->stats_base.backlog; + + return mlxsw_sp_tclass_congestion_disable(mlxsw_sp_port, + mlxsw_sp_qdisc->tclass_num); +} + +static int +mlxsw_sp_qdisc_red_check_params(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct tc_red_qopt_offload_params *p = params; + + if (p->min > p->max) { + dev_err(mlxsw_sp->bus_info->dev, + "spectrum: RED: min %u is bigger then max %u\n", p->min, + p->max); + return -EINVAL; + } + if (p->max > MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_BUFFER_SIZE)) { + dev_err(mlxsw_sp->bus_info->dev, + "spectrum: RED: max value %u is too big\n", p->max); + return -EINVAL; + } + if (p->min == 0 || p->max == 0) { + dev_err(mlxsw_sp->bus_info->dev, + "spectrum: RED: 0 value is illegal for min and max\n"); + return -EINVAL; + } + return 0; +} + +static int +mlxsw_sp_qdisc_red_replace(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct tc_red_qopt_offload_params *p = params; + u8 tclass_num = mlxsw_sp_qdisc->tclass_num; + u32 min, max; + u64 prob; + + /* calculate probability in percentage */ + prob = p->probability; + prob *= 100; + prob = DIV_ROUND_UP(prob, 1 << 16); + prob = DIV_ROUND_UP(prob, 1 << 16); + min = mlxsw_sp_bytes_cells(mlxsw_sp, p->min); + max = mlxsw_sp_bytes_cells(mlxsw_sp, p->max); + return mlxsw_sp_tclass_congestion_enable(mlxsw_sp_port, tclass_num, min, + max, prob, p->is_ecn); +} + +static void +mlxsw_sp_qdisc_red_unoffload(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params) +{ + struct tc_red_qopt_offload_params *p = params; + u64 backlog; + + backlog = mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, + mlxsw_sp_qdisc->stats_base.backlog); + p->qstats->backlog -= backlog; + mlxsw_sp_qdisc->stats_base.backlog = 0; +} + +static int +mlxsw_sp_qdisc_get_red_xstats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *xstats_ptr) +{ + struct red_stats *xstats_base = &mlxsw_sp_qdisc->xstats_base.red; + u8 tclass_num = mlxsw_sp_qdisc->tclass_num; + struct mlxsw_sp_port_xstats *xstats; + struct red_stats *res = xstats_ptr; + int early_drops, marks, pdrops; + + xstats = &mlxsw_sp_port->periodic_hw_stats.xstats; + + early_drops = xstats->wred_drop[tclass_num] - xstats_base->prob_drop; + marks = xstats->ecn - xstats_base->prob_mark; + pdrops = xstats->tail_drop[tclass_num] - xstats_base->pdrop; + + res->pdrop += pdrops; + res->prob_drop += early_drops; + res->prob_mark += marks; + + xstats_base->pdrop += pdrops; + xstats_base->prob_drop += early_drops; + xstats_base->prob_mark += marks; + return 0; +} + +static int +mlxsw_sp_qdisc_get_red_stats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + struct tc_qopt_offload_stats *stats_ptr) +{ + u64 tx_bytes, tx_packets, overlimits, drops, 
backlog; + u8 tclass_num = mlxsw_sp_qdisc->tclass_num; + struct mlxsw_sp_qdisc_stats *stats_base; + struct mlxsw_sp_port_xstats *xstats; + + xstats = &mlxsw_sp_port->periodic_hw_stats.xstats; + stats_base = &mlxsw_sp_qdisc->stats_base; + + mlxsw_sp_qdisc_bstats_per_priority_get(xstats, + mlxsw_sp_qdisc->prio_bitmap, + &tx_packets, &tx_bytes); + tx_bytes = tx_bytes - stats_base->tx_bytes; + tx_packets = tx_packets - stats_base->tx_packets; + + overlimits = xstats->wred_drop[tclass_num] + xstats->ecn - + stats_base->overlimits; + drops = xstats->wred_drop[tclass_num] + xstats->tail_drop[tclass_num] - + stats_base->drops; + backlog = xstats->backlog[tclass_num]; + + _bstats_update(stats_ptr->bstats, tx_bytes, tx_packets); + stats_ptr->qstats->overlimits += overlimits; + stats_ptr->qstats->drops += drops; + stats_ptr->qstats->backlog += + mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, + backlog) - + mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, + stats_base->backlog); + + stats_base->backlog = backlog; + stats_base->drops += drops; + stats_base->overlimits += overlimits; + stats_base->tx_bytes += tx_bytes; + stats_base->tx_packets += tx_packets; + return 0; +} + +#define MLXSW_SP_PORT_DEFAULT_TCLASS 0 + +static struct mlxsw_sp_qdisc_ops mlxsw_sp_qdisc_ops_red = { + .type = MLXSW_SP_QDISC_RED, + .check_params = mlxsw_sp_qdisc_red_check_params, + .replace = mlxsw_sp_qdisc_red_replace, + .unoffload = mlxsw_sp_qdisc_red_unoffload, + .destroy = mlxsw_sp_qdisc_red_destroy, + .get_stats = mlxsw_sp_qdisc_get_red_stats, + .get_xstats = mlxsw_sp_qdisc_get_red_xstats, + .clean_stats = mlxsw_sp_setup_tc_qdisc_red_clean_stats, +}; + +int mlxsw_sp_setup_tc_red(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_red_qopt_offload *p) +{ + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc; + + mlxsw_sp_qdisc = mlxsw_sp_qdisc_find(mlxsw_sp_port, p->parent, false); + if (!mlxsw_sp_qdisc) + return -EOPNOTSUPP; + + if (p->command == TC_RED_REPLACE) + return mlxsw_sp_qdisc_replace(mlxsw_sp_port, p->handle, + mlxsw_sp_qdisc, + &mlxsw_sp_qdisc_ops_red, + &p->set); + + if (!mlxsw_sp_qdisc_compare(mlxsw_sp_qdisc, p->handle, + MLXSW_SP_QDISC_RED)) + return -EOPNOTSUPP; + + switch (p->command) { + case TC_RED_DESTROY: + return mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc); + case TC_RED_XSTATS: + return mlxsw_sp_qdisc_get_xstats(mlxsw_sp_port, mlxsw_sp_qdisc, + p->xstats); + case TC_RED_STATS: + return mlxsw_sp_qdisc_get_stats(mlxsw_sp_port, mlxsw_sp_qdisc, + &p->stats); + default: + return -EOPNOTSUPP; + } +} + +static int +mlxsw_sp_qdisc_prio_destroy(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc) +{ + int i; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + mlxsw_sp_port_prio_tc_set(mlxsw_sp_port, i, + MLXSW_SP_PORT_DEFAULT_TCLASS); + mlxsw_sp_qdisc_destroy(mlxsw_sp_port, + &mlxsw_sp_port->tclass_qdiscs[i]); + mlxsw_sp_port->tclass_qdiscs[i].prio_bitmap = 0; + } + + return 0; +} + +static int +mlxsw_sp_qdisc_prio_check_params(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params) +{ + struct tc_prio_qopt_offload_params *p = params; + + if (p->bands > IEEE_8021QAZ_MAX_TCS) + return -EOPNOTSUPP; + + return 0; +} + +static int +mlxsw_sp_qdisc_prio_replace(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params) +{ + struct tc_prio_qopt_offload_params *p = params; + struct mlxsw_sp_qdisc *child_qdisc; + int tclass, i, band, backlog; + u8 old_priomap; + int err; + + for (band = 0; band < p->bands; band++) { + tclass = 
MLXSW_SP_PRIO_BAND_TO_TCLASS(band); + child_qdisc = &mlxsw_sp_port->tclass_qdiscs[tclass]; + old_priomap = child_qdisc->prio_bitmap; + child_qdisc->prio_bitmap = 0; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + if (p->priomap[i] == band) { + child_qdisc->prio_bitmap |= BIT(i); + if (BIT(i) & old_priomap) + continue; + err = mlxsw_sp_port_prio_tc_set(mlxsw_sp_port, + i, tclass); + if (err) + return err; + } + } + if (old_priomap != child_qdisc->prio_bitmap && + child_qdisc->ops && child_qdisc->ops->clean_stats) { + backlog = child_qdisc->stats_base.backlog; + child_qdisc->ops->clean_stats(mlxsw_sp_port, + child_qdisc); + child_qdisc->stats_base.backlog = backlog; + } + } + for (; band < IEEE_8021QAZ_MAX_TCS; band++) { + tclass = MLXSW_SP_PRIO_BAND_TO_TCLASS(band); + child_qdisc = &mlxsw_sp_port->tclass_qdiscs[tclass]; + child_qdisc->prio_bitmap = 0; + mlxsw_sp_qdisc_destroy(mlxsw_sp_port, child_qdisc); + } + return 0; +} + +static void +mlxsw_sp_qdisc_prio_unoffload(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + void *params) +{ + struct tc_prio_qopt_offload_params *p = params; + u64 backlog; + + backlog = mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, + mlxsw_sp_qdisc->stats_base.backlog); + p->qstats->backlog -= backlog; +} + +static int +mlxsw_sp_qdisc_get_prio_stats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + struct tc_qopt_offload_stats *stats_ptr) +{ + u64 tx_bytes, tx_packets, drops = 0, backlog = 0; + struct mlxsw_sp_qdisc_stats *stats_base; + struct mlxsw_sp_port_xstats *xstats; + struct rtnl_link_stats64 *stats; + int i; + + xstats = &mlxsw_sp_port->periodic_hw_stats.xstats; + stats = &mlxsw_sp_port->periodic_hw_stats.stats; + stats_base = &mlxsw_sp_qdisc->stats_base; + + tx_bytes = stats->tx_bytes - stats_base->tx_bytes; + tx_packets = stats->tx_packets - stats_base->tx_packets; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + drops += xstats->tail_drop[i]; + drops += xstats->wred_drop[i]; + backlog += xstats->backlog[i]; + } + drops = drops - stats_base->drops; + + _bstats_update(stats_ptr->bstats, tx_bytes, tx_packets); + stats_ptr->qstats->drops += drops; + stats_ptr->qstats->backlog += + mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, + backlog) - + mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp, + stats_base->backlog); + stats_base->backlog = backlog; + stats_base->drops += drops; + stats_base->tx_bytes += tx_bytes; + stats_base->tx_packets += tx_packets; + return 0; +} + +static void +mlxsw_sp_setup_tc_qdisc_prio_clean_stats(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc) +{ + struct mlxsw_sp_qdisc_stats *stats_base; + struct mlxsw_sp_port_xstats *xstats; + struct rtnl_link_stats64 *stats; + int i; + + xstats = &mlxsw_sp_port->periodic_hw_stats.xstats; + stats = &mlxsw_sp_port->periodic_hw_stats.stats; + stats_base = &mlxsw_sp_qdisc->stats_base; + + stats_base->tx_packets = stats->tx_packets; + stats_base->tx_bytes = stats->tx_bytes; + + stats_base->drops = 0; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + stats_base->drops += xstats->tail_drop[i]; + stats_base->drops += xstats->wred_drop[i]; + } + + mlxsw_sp_qdisc->stats_base.backlog = 0; +} + +static struct mlxsw_sp_qdisc_ops mlxsw_sp_qdisc_ops_prio = { + .type = MLXSW_SP_QDISC_PRIO, + .check_params = mlxsw_sp_qdisc_prio_check_params, + .replace = mlxsw_sp_qdisc_prio_replace, + .unoffload = mlxsw_sp_qdisc_prio_unoffload, + .destroy = mlxsw_sp_qdisc_prio_destroy, + .get_stats = mlxsw_sp_qdisc_get_prio_stats, + .clean_stats 
= mlxsw_sp_setup_tc_qdisc_prio_clean_stats, +}; + +/* Grafting is not supported in mlxsw. It will result in un-offloading of the + * grafted qdisc as well as the qdisc in the qdisc new location. + * (However, if the graft is to the location where the qdisc is already at, it + * will be ignored completely and won't cause un-offloading). + */ +static int +mlxsw_sp_qdisc_prio_graft(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, + struct tc_prio_qopt_offload_graft_params *p) +{ + int tclass_num = MLXSW_SP_PRIO_BAND_TO_TCLASS(p->band); + struct mlxsw_sp_qdisc *old_qdisc; + + /* Check if the grafted qdisc is already in its "new" location. If so - + * nothing needs to be done. + */ + if (p->band < IEEE_8021QAZ_MAX_TCS && + mlxsw_sp_port->tclass_qdiscs[tclass_num].handle == p->child_handle) + return 0; + + /* See if the grafted qdisc is already offloaded on any tclass. If so, + * unoffload it. + */ + old_qdisc = mlxsw_sp_qdisc_find_by_handle(mlxsw_sp_port, + p->child_handle); + if (old_qdisc) + mlxsw_sp_qdisc_destroy(mlxsw_sp_port, old_qdisc); + + mlxsw_sp_qdisc_destroy(mlxsw_sp_port, + &mlxsw_sp_port->tclass_qdiscs[tclass_num]); + return -EOPNOTSUPP; +} + +int mlxsw_sp_setup_tc_prio(struct mlxsw_sp_port *mlxsw_sp_port, + struct tc_prio_qopt_offload *p) +{ + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc; + + mlxsw_sp_qdisc = mlxsw_sp_qdisc_find(mlxsw_sp_port, p->parent, true); + if (!mlxsw_sp_qdisc) + return -EOPNOTSUPP; + + if (p->command == TC_PRIO_REPLACE) + return mlxsw_sp_qdisc_replace(mlxsw_sp_port, p->handle, + mlxsw_sp_qdisc, + &mlxsw_sp_qdisc_ops_prio, + &p->replace_params); + + if (!mlxsw_sp_qdisc_compare(mlxsw_sp_qdisc, p->handle, + MLXSW_SP_QDISC_PRIO)) + return -EOPNOTSUPP; + + switch (p->command) { + case TC_PRIO_DESTROY: + return mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc); + case TC_PRIO_STATS: + return mlxsw_sp_qdisc_get_stats(mlxsw_sp_port, mlxsw_sp_qdisc, + &p->stats); + case TC_PRIO_GRAFT: + return mlxsw_sp_qdisc_prio_graft(mlxsw_sp_port, mlxsw_sp_qdisc, + &p->graft_params); + default: + return -EOPNOTSUPP; + } +} + +int mlxsw_sp_tc_qdisc_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp_qdisc *mlxsw_sp_qdisc; + int i; + + mlxsw_sp_qdisc = kzalloc(sizeof(*mlxsw_sp_qdisc), GFP_KERNEL); + if (!mlxsw_sp_qdisc) + goto err_root_qdisc_init; + + mlxsw_sp_port->root_qdisc = mlxsw_sp_qdisc; + mlxsw_sp_port->root_qdisc->prio_bitmap = 0xff; + mlxsw_sp_port->root_qdisc->tclass_num = MLXSW_SP_PORT_DEFAULT_TCLASS; + + mlxsw_sp_qdisc = kzalloc(sizeof(*mlxsw_sp_qdisc) * IEEE_8021QAZ_MAX_TCS, + GFP_KERNEL); + if (!mlxsw_sp_qdisc) + goto err_tclass_qdiscs_init; + + mlxsw_sp_port->tclass_qdiscs = mlxsw_sp_qdisc; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + mlxsw_sp_port->tclass_qdiscs[i].tclass_num = i; + + return 0; + +err_tclass_qdiscs_init: + kfree(mlxsw_sp_port->root_qdisc); +err_root_qdisc_init: + return -ENOMEM; +} + +void mlxsw_sp_tc_qdisc_fini(struct mlxsw_sp_port *mlxsw_sp_port) +{ + kfree(mlxsw_sp_port->tclass_qdiscs); + kfree(mlxsw_sp_port->root_qdisc); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c new file mode 100644 index 0000000..1904c03 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -0,0 +1,7305 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c + * Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2016 Jiri Pirko + * Copyright (c) 2016 Ido Schimmel + * Copyright (c) 2016 Yotam Gigi + * Copyright (c) 2017-2018 Petr Machata + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "spectrum.h" +#include "core.h" +#include "reg.h" +#include "spectrum_cnt.h" +#include "spectrum_dpipe.h" +#include "spectrum_ipip.h" +#include "spectrum_mr.h" +#include "spectrum_mr_tcam.h" +#include "spectrum_router.h" +#include "spectrum_span.h" + +struct mlxsw_sp_fib; +struct mlxsw_sp_vr; +struct mlxsw_sp_lpm_tree; +struct mlxsw_sp_rif_ops; + +struct mlxsw_sp_router { + struct mlxsw_sp *mlxsw_sp; + struct mlxsw_sp_rif **rifs; + struct mlxsw_sp_vr *vrs; + struct rhashtable neigh_ht; + struct rhashtable nexthop_group_ht; + struct rhashtable nexthop_ht; + struct list_head nexthop_list; + struct { + /* One tree for each protocol: IPv4 and IPv6 */ + struct mlxsw_sp_lpm_tree *proto_trees[2]; + struct mlxsw_sp_lpm_tree *trees; + unsigned int tree_count; + } lpm; + struct { + struct delayed_work dw; + unsigned long interval; /* ms */ + } neighs_update; + struct delayed_work nexthop_probe_dw; +#define MLXSW_SP_UNRESOLVED_NH_PROBE_INTERVAL 5000 /* ms */ + struct list_head nexthop_neighs_list; + struct list_head ipip_list; + bool aborted; + struct notifier_block fib_nb; + struct notifier_block netevent_nb; + const struct mlxsw_sp_rif_ops **rif_ops_arr; + const struct mlxsw_sp_ipip_ops **ipip_ops_arr; +}; + +struct mlxsw_sp_rif { + struct list_head nexthop_list; + struct list_head neigh_list; + struct net_device *dev; + struct mlxsw_sp_fid *fid; + unsigned char addr[ETH_ALEN]; + int mtu; + u16 rif_index; + u16 vr_id; + const struct mlxsw_sp_rif_ops *ops; + struct mlxsw_sp *mlxsw_sp; + + unsigned int counter_ingress; + bool counter_ingress_valid; + unsigned int counter_egress; + bool counter_egress_valid; +}; + +struct mlxsw_sp_rif_params { + struct net_device *dev; + union { + u16 system_port; + u16 lag_id; + }; + u16 vid; + bool lag; +}; + +struct mlxsw_sp_rif_subport { + struct mlxsw_sp_rif common; + union { + u16 system_port; + u16 lag_id; + }; + u16 vid; + bool lag; +}; + +struct mlxsw_sp_rif_ipip_lb { + struct mlxsw_sp_rif common; + struct mlxsw_sp_rif_ipip_lb_config lb_config; + u16 ul_vr_id; /* Reserved for Spectrum-2. 
*/ +}; + +struct mlxsw_sp_rif_params_ipip_lb { + struct mlxsw_sp_rif_params common; + struct mlxsw_sp_rif_ipip_lb_config lb_config; +}; + +struct mlxsw_sp_rif_ops { + enum mlxsw_sp_rif_type type; + size_t rif_size; + + void (*setup)(struct mlxsw_sp_rif *rif, + const struct mlxsw_sp_rif_params *params); + int (*configure)(struct mlxsw_sp_rif *rif); + void (*deconfigure)(struct mlxsw_sp_rif *rif); + struct mlxsw_sp_fid * (*fid_get)(struct mlxsw_sp_rif *rif); +}; + +static void mlxsw_sp_lpm_tree_hold(struct mlxsw_sp_lpm_tree *lpm_tree); +static void mlxsw_sp_lpm_tree_put(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_lpm_tree *lpm_tree); +static int mlxsw_sp_vr_lpm_tree_bind(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_fib *fib, + u8 tree_id); +static int mlxsw_sp_vr_lpm_tree_unbind(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_fib *fib); + +static unsigned int * +mlxsw_sp_rif_p_counter_get(struct mlxsw_sp_rif *rif, + enum mlxsw_sp_rif_counter_dir dir) +{ + switch (dir) { + case MLXSW_SP_RIF_COUNTER_EGRESS: + return &rif->counter_egress; + case MLXSW_SP_RIF_COUNTER_INGRESS: + return &rif->counter_ingress; + } + return NULL; +} + +static bool +mlxsw_sp_rif_counter_valid_get(struct mlxsw_sp_rif *rif, + enum mlxsw_sp_rif_counter_dir dir) +{ + switch (dir) { + case MLXSW_SP_RIF_COUNTER_EGRESS: + return rif->counter_egress_valid; + case MLXSW_SP_RIF_COUNTER_INGRESS: + return rif->counter_ingress_valid; + } + return false; +} + +static void +mlxsw_sp_rif_counter_valid_set(struct mlxsw_sp_rif *rif, + enum mlxsw_sp_rif_counter_dir dir, + bool valid) +{ + switch (dir) { + case MLXSW_SP_RIF_COUNTER_EGRESS: + rif->counter_egress_valid = valid; + break; + case MLXSW_SP_RIF_COUNTER_INGRESS: + rif->counter_ingress_valid = valid; + break; + } +} + +static int mlxsw_sp_rif_counter_edit(struct mlxsw_sp *mlxsw_sp, u16 rif_index, + unsigned int counter_index, bool enable, + enum mlxsw_sp_rif_counter_dir dir) +{ + char ritr_pl[MLXSW_REG_RITR_LEN]; + bool is_egress = false; + int err; + + if (dir == MLXSW_SP_RIF_COUNTER_EGRESS) + is_egress = true; + mlxsw_reg_ritr_rif_pack(ritr_pl, rif_index); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); + if (err) + return err; + + mlxsw_reg_ritr_counter_pack(ritr_pl, counter_index, enable, + is_egress); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); +} + +int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif, + enum mlxsw_sp_rif_counter_dir dir, u64 *cnt) +{ + char ricnt_pl[MLXSW_REG_RICNT_LEN]; + unsigned int *p_counter_index; + bool valid; + int err; + + valid = mlxsw_sp_rif_counter_valid_get(rif, dir); + if (!valid) + return -EINVAL; + + p_counter_index = mlxsw_sp_rif_p_counter_get(rif, dir); + if (!p_counter_index) + return -EINVAL; + mlxsw_reg_ricnt_pack(ricnt_pl, *p_counter_index, + MLXSW_REG_RICNT_OPCODE_NOP); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ricnt), ricnt_pl); + if (err) + return err; + *cnt = mlxsw_reg_ricnt_good_unicast_packets_get(ricnt_pl); + return 0; +} + +static int mlxsw_sp_rif_counter_clear(struct mlxsw_sp *mlxsw_sp, + unsigned int counter_index) +{ + char ricnt_pl[MLXSW_REG_RICNT_LEN]; + + mlxsw_reg_ricnt_pack(ricnt_pl, counter_index, + MLXSW_REG_RICNT_OPCODE_CLEAR); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ricnt), ricnt_pl); +} + +int mlxsw_sp_rif_counter_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif, + enum mlxsw_sp_rif_counter_dir dir) +{ + unsigned int *p_counter_index; + int err; + + p_counter_index = 
mlxsw_sp_rif_p_counter_get(rif, dir); + if (!p_counter_index) + return -EINVAL; + err = mlxsw_sp_counter_alloc(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_RIF, + p_counter_index); + if (err) + return err; + + err = mlxsw_sp_rif_counter_clear(mlxsw_sp, *p_counter_index); + if (err) + goto err_counter_clear; + + err = mlxsw_sp_rif_counter_edit(mlxsw_sp, rif->rif_index, + *p_counter_index, true, dir); + if (err) + goto err_counter_edit; + mlxsw_sp_rif_counter_valid_set(rif, dir, true); + return 0; + +err_counter_edit: +err_counter_clear: + mlxsw_sp_counter_free(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_RIF, + *p_counter_index); + return err; +} + +void mlxsw_sp_rif_counter_free(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif, + enum mlxsw_sp_rif_counter_dir dir) +{ + unsigned int *p_counter_index; + + if (!mlxsw_sp_rif_counter_valid_get(rif, dir)) + return; + + p_counter_index = mlxsw_sp_rif_p_counter_get(rif, dir); + if (WARN_ON(!p_counter_index)) + return; + mlxsw_sp_rif_counter_edit(mlxsw_sp, rif->rif_index, + *p_counter_index, false, dir); + mlxsw_sp_counter_free(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_RIF, + *p_counter_index); + mlxsw_sp_rif_counter_valid_set(rif, dir, false); +} + +static void mlxsw_sp_rif_counters_alloc(struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct devlink *devlink; + + devlink = priv_to_devlink(mlxsw_sp->core); + if (!devlink_dpipe_table_counter_enabled(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_ERIF)) + return; + mlxsw_sp_rif_counter_alloc(mlxsw_sp, rif, MLXSW_SP_RIF_COUNTER_EGRESS); +} + +static void mlxsw_sp_rif_counters_free(struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + + mlxsw_sp_rif_counter_free(mlxsw_sp, rif, MLXSW_SP_RIF_COUNTER_EGRESS); +} + +static struct mlxsw_sp_rif * +mlxsw_sp_rif_find_by_dev(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *dev); + +#define MLXSW_SP_PREFIX_COUNT (sizeof(struct in6_addr) * BITS_PER_BYTE + 1) + +struct mlxsw_sp_prefix_usage { + DECLARE_BITMAP(b, MLXSW_SP_PREFIX_COUNT); +}; + +#define mlxsw_sp_prefix_usage_for_each(prefix, prefix_usage) \ + for_each_set_bit(prefix, (prefix_usage)->b, MLXSW_SP_PREFIX_COUNT) + +static bool +mlxsw_sp_prefix_usage_eq(struct mlxsw_sp_prefix_usage *prefix_usage1, + struct mlxsw_sp_prefix_usage *prefix_usage2) +{ + return !memcmp(prefix_usage1, prefix_usage2, sizeof(*prefix_usage1)); +} + +static void +mlxsw_sp_prefix_usage_cpy(struct mlxsw_sp_prefix_usage *prefix_usage1, + struct mlxsw_sp_prefix_usage *prefix_usage2) +{ + memcpy(prefix_usage1, prefix_usage2, sizeof(*prefix_usage1)); +} + +static void +mlxsw_sp_prefix_usage_set(struct mlxsw_sp_prefix_usage *prefix_usage, + unsigned char prefix_len) +{ + set_bit(prefix_len, prefix_usage->b); +} + +static void +mlxsw_sp_prefix_usage_clear(struct mlxsw_sp_prefix_usage *prefix_usage, + unsigned char prefix_len) +{ + clear_bit(prefix_len, prefix_usage->b); +} + +struct mlxsw_sp_fib_key { + unsigned char addr[sizeof(struct in6_addr)]; + unsigned char prefix_len; +}; + +enum mlxsw_sp_fib_entry_type { + MLXSW_SP_FIB_ENTRY_TYPE_REMOTE, + MLXSW_SP_FIB_ENTRY_TYPE_LOCAL, + MLXSW_SP_FIB_ENTRY_TYPE_TRAP, + + /* This is a special case of local delivery, where a packet should be + * decapsulated on reception. Note that there is no corresponding ENCAP, + * because that's a type of next hop, not of FIB entry. (There can be + * several next hops in a REMOTE entry, and some of them may be + * encapsulating entries.) 
+ */ + MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP, +}; + +struct mlxsw_sp_nexthop_group; + +struct mlxsw_sp_fib_node { + struct list_head entry_list; + struct list_head list; + struct rhash_head ht_node; + struct mlxsw_sp_fib *fib; + struct mlxsw_sp_fib_key key; +}; + +struct mlxsw_sp_fib_entry_decap { + struct mlxsw_sp_ipip_entry *ipip_entry; + u32 tunnel_index; +}; + +struct mlxsw_sp_fib_entry { + struct list_head list; + struct mlxsw_sp_fib_node *fib_node; + enum mlxsw_sp_fib_entry_type type; + struct list_head nexthop_group_node; + struct mlxsw_sp_nexthop_group *nh_group; + struct mlxsw_sp_fib_entry_decap decap; /* Valid for decap entries. */ +}; + +struct mlxsw_sp_fib4_entry { + struct mlxsw_sp_fib_entry common; + u32 tb_id; + u32 prio; + u8 tos; + u8 type; +}; + +struct mlxsw_sp_fib6_entry { + struct mlxsw_sp_fib_entry common; + struct list_head rt6_list; + unsigned int nrt6; +}; + +struct mlxsw_sp_rt6 { + struct list_head list; + struct rt6_info *rt; +}; + +struct mlxsw_sp_lpm_tree { + u8 id; /* tree ID */ + unsigned int ref_count; + enum mlxsw_sp_l3proto proto; + unsigned long prefix_ref_count[MLXSW_SP_PREFIX_COUNT]; + struct mlxsw_sp_prefix_usage prefix_usage; +}; + +struct mlxsw_sp_fib { + struct rhashtable ht; + struct list_head node_list; + struct mlxsw_sp_vr *vr; + struct mlxsw_sp_lpm_tree *lpm_tree; + enum mlxsw_sp_l3proto proto; +}; + +struct mlxsw_sp_vr { + u16 id; /* virtual router ID */ + u32 tb_id; /* kernel fib table id */ + unsigned int rif_count; + struct mlxsw_sp_fib *fib4; + struct mlxsw_sp_fib *fib6; + struct mlxsw_sp_mr_table *mr_table[MLXSW_SP_L3_PROTO_MAX]; +}; + +static const struct rhashtable_params mlxsw_sp_fib_ht_params; + +static struct mlxsw_sp_fib *mlxsw_sp_fib_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_vr *vr, + enum mlxsw_sp_l3proto proto) +{ + struct mlxsw_sp_lpm_tree *lpm_tree; + struct mlxsw_sp_fib *fib; + int err; + + lpm_tree = mlxsw_sp->router->lpm.proto_trees[proto]; + fib = kzalloc(sizeof(*fib), GFP_KERNEL); + if (!fib) + return ERR_PTR(-ENOMEM); + err = rhashtable_init(&fib->ht, &mlxsw_sp_fib_ht_params); + if (err) + goto err_rhashtable_init; + INIT_LIST_HEAD(&fib->node_list); + fib->proto = proto; + fib->vr = vr; + fib->lpm_tree = lpm_tree; + mlxsw_sp_lpm_tree_hold(lpm_tree); + err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, fib, lpm_tree->id); + if (err) + goto err_lpm_tree_bind; + return fib; + +err_lpm_tree_bind: + mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree); +err_rhashtable_init: + kfree(fib); + return ERR_PTR(err); +} + +static void mlxsw_sp_fib_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib *fib) +{ + mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, fib); + mlxsw_sp_lpm_tree_put(mlxsw_sp, fib->lpm_tree); + WARN_ON(!list_empty(&fib->node_list)); + rhashtable_destroy(&fib->ht); + kfree(fib); +} + +static struct mlxsw_sp_lpm_tree * +mlxsw_sp_lpm_tree_find_unused(struct mlxsw_sp *mlxsw_sp) +{ + static struct mlxsw_sp_lpm_tree *lpm_tree; + int i; + + for (i = 0; i < mlxsw_sp->router->lpm.tree_count; i++) { + lpm_tree = &mlxsw_sp->router->lpm.trees[i]; + if (lpm_tree->ref_count == 0) + return lpm_tree; + } + return NULL; +} + +static int mlxsw_sp_lpm_tree_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_lpm_tree *lpm_tree) +{ + char ralta_pl[MLXSW_REG_RALTA_LEN]; + + mlxsw_reg_ralta_pack(ralta_pl, true, + (enum mlxsw_reg_ralxx_protocol) lpm_tree->proto, + lpm_tree->id); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralta), ralta_pl); +} + +static void mlxsw_sp_lpm_tree_free(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_lpm_tree *lpm_tree) +{ + 
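mlxsw_sp_fib_create() above follows the usual constructor shape: perform each step in order and, on failure, jump to a label that undoes only the steps that already succeeded, in reverse. A minimal userspace sketch of that unwinding pattern, using hypothetical names:

#include <stdlib.h>

struct table {
        int *slots;
        int nr_slots;
};

static struct table *table_create(int nr_slots)
{
        struct table *tbl;

        tbl = calloc(1, sizeof(*tbl));
        if (!tbl)
                return NULL;

        tbl->slots = calloc(nr_slots, sizeof(*tbl->slots));
        if (!tbl->slots)
                goto err_slots_alloc;
        tbl->nr_slots = nr_slots;
        return tbl;

err_slots_alloc:
        free(tbl);
        return NULL;
}

static void table_destroy(struct table *tbl)
{
        /* teardown mirrors construction, in reverse order */
        free(tbl->slots);
        free(tbl);
}

int main(void)
{
        struct table *tbl = table_create(8);

        if (tbl)
                table_destroy(tbl);
        return 0;
}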
char ralta_pl[MLXSW_REG_RALTA_LEN]; + + mlxsw_reg_ralta_pack(ralta_pl, false, + (enum mlxsw_reg_ralxx_protocol) lpm_tree->proto, + lpm_tree->id); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralta), ralta_pl); +} + +static int +mlxsw_sp_lpm_tree_left_struct_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_prefix_usage *prefix_usage, + struct mlxsw_sp_lpm_tree *lpm_tree) +{ + char ralst_pl[MLXSW_REG_RALST_LEN]; + u8 root_bin = 0; + u8 prefix; + u8 last_prefix = MLXSW_REG_RALST_BIN_NO_CHILD; + + mlxsw_sp_prefix_usage_for_each(prefix, prefix_usage) + root_bin = prefix; + + mlxsw_reg_ralst_pack(ralst_pl, root_bin, lpm_tree->id); + mlxsw_sp_prefix_usage_for_each(prefix, prefix_usage) { + if (prefix == 0) + continue; + mlxsw_reg_ralst_bin_pack(ralst_pl, prefix, last_prefix, + MLXSW_REG_RALST_BIN_NO_CHILD); + last_prefix = prefix; + } + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralst), ralst_pl); +} + +static struct mlxsw_sp_lpm_tree * +mlxsw_sp_lpm_tree_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_prefix_usage *prefix_usage, + enum mlxsw_sp_l3proto proto) +{ + struct mlxsw_sp_lpm_tree *lpm_tree; + int err; + + lpm_tree = mlxsw_sp_lpm_tree_find_unused(mlxsw_sp); + if (!lpm_tree) + return ERR_PTR(-EBUSY); + lpm_tree->proto = proto; + err = mlxsw_sp_lpm_tree_alloc(mlxsw_sp, lpm_tree); + if (err) + return ERR_PTR(err); + + err = mlxsw_sp_lpm_tree_left_struct_set(mlxsw_sp, prefix_usage, + lpm_tree); + if (err) + goto err_left_struct_set; + memcpy(&lpm_tree->prefix_usage, prefix_usage, + sizeof(lpm_tree->prefix_usage)); + memset(&lpm_tree->prefix_ref_count, 0, + sizeof(lpm_tree->prefix_ref_count)); + lpm_tree->ref_count = 1; + return lpm_tree; + +err_left_struct_set: + mlxsw_sp_lpm_tree_free(mlxsw_sp, lpm_tree); + return ERR_PTR(err); +} + +static void mlxsw_sp_lpm_tree_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_lpm_tree *lpm_tree) +{ + mlxsw_sp_lpm_tree_free(mlxsw_sp, lpm_tree); +} + +static struct mlxsw_sp_lpm_tree * +mlxsw_sp_lpm_tree_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_prefix_usage *prefix_usage, + enum mlxsw_sp_l3proto proto) +{ + struct mlxsw_sp_lpm_tree *lpm_tree; + int i; + + for (i = 0; i < mlxsw_sp->router->lpm.tree_count; i++) { + lpm_tree = &mlxsw_sp->router->lpm.trees[i]; + if (lpm_tree->ref_count != 0 && + lpm_tree->proto == proto && + mlxsw_sp_prefix_usage_eq(&lpm_tree->prefix_usage, + prefix_usage)) { + mlxsw_sp_lpm_tree_hold(lpm_tree); + return lpm_tree; + } + } + return mlxsw_sp_lpm_tree_create(mlxsw_sp, prefix_usage, proto); +} + +static void mlxsw_sp_lpm_tree_hold(struct mlxsw_sp_lpm_tree *lpm_tree) +{ + lpm_tree->ref_count++; +} + +static void mlxsw_sp_lpm_tree_put(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_lpm_tree *lpm_tree) +{ + if (--lpm_tree->ref_count == 0) + mlxsw_sp_lpm_tree_destroy(mlxsw_sp, lpm_tree); +} + +#define MLXSW_SP_LPM_TREE_MIN 1 /* tree 0 is reserved */ + +static int mlxsw_sp_lpm_init(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_prefix_usage req_prefix_usage = {{ 0 } }; + struct mlxsw_sp_lpm_tree *lpm_tree; + u64 max_trees; + int err, i; + + if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_LPM_TREES)) + return -EIO; + + max_trees = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_LPM_TREES); + mlxsw_sp->router->lpm.tree_count = max_trees - MLXSW_SP_LPM_TREE_MIN; + mlxsw_sp->router->lpm.trees = kcalloc(mlxsw_sp->router->lpm.tree_count, + sizeof(struct mlxsw_sp_lpm_tree), + GFP_KERNEL); + if (!mlxsw_sp->router->lpm.trees) + return -ENOMEM; + + for (i = 0; i < mlxsw_sp->router->lpm.tree_count; i++) { + lpm_tree = 
&mlxsw_sp->router->lpm.trees[i]; + lpm_tree->id = i + MLXSW_SP_LPM_TREE_MIN; + } + + lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage, + MLXSW_SP_L3_PROTO_IPV4); + if (IS_ERR(lpm_tree)) { + err = PTR_ERR(lpm_tree); + goto err_ipv4_tree_get; + } + mlxsw_sp->router->lpm.proto_trees[MLXSW_SP_L3_PROTO_IPV4] = lpm_tree; + + lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage, + MLXSW_SP_L3_PROTO_IPV6); + if (IS_ERR(lpm_tree)) { + err = PTR_ERR(lpm_tree); + goto err_ipv6_tree_get; + } + mlxsw_sp->router->lpm.proto_trees[MLXSW_SP_L3_PROTO_IPV6] = lpm_tree; + + return 0; + +err_ipv6_tree_get: + lpm_tree = mlxsw_sp->router->lpm.proto_trees[MLXSW_SP_L3_PROTO_IPV4]; + mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree); +err_ipv4_tree_get: + kfree(mlxsw_sp->router->lpm.trees); + return err; +} + +static void mlxsw_sp_lpm_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_lpm_tree *lpm_tree; + + lpm_tree = mlxsw_sp->router->lpm.proto_trees[MLXSW_SP_L3_PROTO_IPV6]; + mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree); + + lpm_tree = mlxsw_sp->router->lpm.proto_trees[MLXSW_SP_L3_PROTO_IPV4]; + mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree); + + kfree(mlxsw_sp->router->lpm.trees); +} + +static bool mlxsw_sp_vr_is_used(const struct mlxsw_sp_vr *vr) +{ + return !!vr->fib4 || !!vr->fib6 || + !!vr->mr_table[MLXSW_SP_L3_PROTO_IPV4] || + !!vr->mr_table[MLXSW_SP_L3_PROTO_IPV6]; +} + +static struct mlxsw_sp_vr *mlxsw_sp_vr_find_unused(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_vr *vr; + int i; + + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) { + vr = &mlxsw_sp->router->vrs[i]; + if (!mlxsw_sp_vr_is_used(vr)) + return vr; + } + return NULL; +} + +static int mlxsw_sp_vr_lpm_tree_bind(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_fib *fib, u8 tree_id) +{ + char raltb_pl[MLXSW_REG_RALTB_LEN]; + + mlxsw_reg_raltb_pack(raltb_pl, fib->vr->id, + (enum mlxsw_reg_ralxx_protocol) fib->proto, + tree_id); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raltb), raltb_pl); +} + +static int mlxsw_sp_vr_lpm_tree_unbind(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_fib *fib) +{ + char raltb_pl[MLXSW_REG_RALTB_LEN]; + + /* Bind to tree 0 which is default */ + mlxsw_reg_raltb_pack(raltb_pl, fib->vr->id, + (enum mlxsw_reg_ralxx_protocol) fib->proto, 0); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raltb), raltb_pl); +} + +static u32 mlxsw_sp_fix_tb_id(u32 tb_id) +{ + /* For our purpose, squash main, default and local tables into one */ + if (tb_id == RT_TABLE_LOCAL || tb_id == RT_TABLE_DEFAULT) + tb_id = RT_TABLE_MAIN; + return tb_id; +} + +static struct mlxsw_sp_vr *mlxsw_sp_vr_find(struct mlxsw_sp *mlxsw_sp, + u32 tb_id) +{ + struct mlxsw_sp_vr *vr; + int i; + + tb_id = mlxsw_sp_fix_tb_id(tb_id); + + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) { + vr = &mlxsw_sp->router->vrs[i]; + if (mlxsw_sp_vr_is_used(vr) && vr->tb_id == tb_id) + return vr; + } + return NULL; +} + +static struct mlxsw_sp_fib *mlxsw_sp_vr_fib(const struct mlxsw_sp_vr *vr, + enum mlxsw_sp_l3proto proto) +{ + switch (proto) { + case MLXSW_SP_L3_PROTO_IPV4: + return vr->fib4; + case MLXSW_SP_L3_PROTO_IPV6: + return vr->fib6; + } + return NULL; +} + +static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp, + u32 tb_id, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_mr_table *mr4_table, *mr6_table; + struct mlxsw_sp_fib *fib4; + struct mlxsw_sp_fib *fib6; + struct mlxsw_sp_vr *vr; + int err; + + vr = mlxsw_sp_vr_find_unused(mlxsw_sp); + if (!vr) { + NL_SET_ERR_MSG_MOD(extack, 
"Exceeded number of supported virtual routers"); + return ERR_PTR(-EBUSY); + } + fib4 = mlxsw_sp_fib_create(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV4); + if (IS_ERR(fib4)) + return ERR_CAST(fib4); + fib6 = mlxsw_sp_fib_create(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV6); + if (IS_ERR(fib6)) { + err = PTR_ERR(fib6); + goto err_fib6_create; + } + mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id, + MLXSW_SP_L3_PROTO_IPV4); + if (IS_ERR(mr4_table)) { + err = PTR_ERR(mr4_table); + goto err_mr4_table_create; + } + mr6_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id, + MLXSW_SP_L3_PROTO_IPV6); + if (IS_ERR(mr6_table)) { + err = PTR_ERR(mr6_table); + goto err_mr6_table_create; + } + + vr->fib4 = fib4; + vr->fib6 = fib6; + vr->mr_table[MLXSW_SP_L3_PROTO_IPV4] = mr4_table; + vr->mr_table[MLXSW_SP_L3_PROTO_IPV6] = mr6_table; + vr->tb_id = tb_id; + return vr; + +err_mr6_table_create: + mlxsw_sp_mr_table_destroy(mr4_table); +err_mr4_table_create: + mlxsw_sp_fib_destroy(mlxsw_sp, fib6); +err_fib6_create: + mlxsw_sp_fib_destroy(mlxsw_sp, fib4); + return ERR_PTR(err); +} + +static void mlxsw_sp_vr_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_vr *vr) +{ + mlxsw_sp_mr_table_destroy(vr->mr_table[MLXSW_SP_L3_PROTO_IPV6]); + vr->mr_table[MLXSW_SP_L3_PROTO_IPV6] = NULL; + mlxsw_sp_mr_table_destroy(vr->mr_table[MLXSW_SP_L3_PROTO_IPV4]); + vr->mr_table[MLXSW_SP_L3_PROTO_IPV4] = NULL; + mlxsw_sp_fib_destroy(mlxsw_sp, vr->fib6); + vr->fib6 = NULL; + mlxsw_sp_fib_destroy(mlxsw_sp, vr->fib4); + vr->fib4 = NULL; +} + +static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_vr *vr; + + tb_id = mlxsw_sp_fix_tb_id(tb_id); + vr = mlxsw_sp_vr_find(mlxsw_sp, tb_id); + if (!vr) + vr = mlxsw_sp_vr_create(mlxsw_sp, tb_id, extack); + return vr; +} + +static void mlxsw_sp_vr_put(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_vr *vr) +{ + if (!vr->rif_count && list_empty(&vr->fib4->node_list) && + list_empty(&vr->fib6->node_list) && + mlxsw_sp_mr_table_empty(vr->mr_table[MLXSW_SP_L3_PROTO_IPV4]) && + mlxsw_sp_mr_table_empty(vr->mr_table[MLXSW_SP_L3_PROTO_IPV6])) + mlxsw_sp_vr_destroy(mlxsw_sp, vr); +} + +static bool +mlxsw_sp_vr_lpm_tree_should_replace(struct mlxsw_sp_vr *vr, + enum mlxsw_sp_l3proto proto, u8 tree_id) +{ + struct mlxsw_sp_fib *fib = mlxsw_sp_vr_fib(vr, proto); + + if (!mlxsw_sp_vr_is_used(vr)) + return false; + if (fib->lpm_tree->id == tree_id) + return true; + return false; +} + +static int mlxsw_sp_vr_lpm_tree_replace(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib *fib, + struct mlxsw_sp_lpm_tree *new_tree) +{ + struct mlxsw_sp_lpm_tree *old_tree = fib->lpm_tree; + int err; + + fib->lpm_tree = new_tree; + mlxsw_sp_lpm_tree_hold(new_tree); + err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, fib, new_tree->id); + if (err) + goto err_tree_bind; + mlxsw_sp_lpm_tree_put(mlxsw_sp, old_tree); + return 0; + +err_tree_bind: + mlxsw_sp_lpm_tree_put(mlxsw_sp, new_tree); + fib->lpm_tree = old_tree; + return err; +} + +static int mlxsw_sp_vrs_lpm_tree_replace(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib *fib, + struct mlxsw_sp_lpm_tree *new_tree) +{ + enum mlxsw_sp_l3proto proto = fib->proto; + struct mlxsw_sp_lpm_tree *old_tree; + u8 old_id, new_id = new_tree->id; + struct mlxsw_sp_vr *vr; + int i, err; + + old_tree = mlxsw_sp->router->lpm.proto_trees[proto]; + old_id = old_tree->id; + + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) { + vr = &mlxsw_sp->router->vrs[i]; + if (!mlxsw_sp_vr_lpm_tree_should_replace(vr, proto, 
old_id)) + continue; + err = mlxsw_sp_vr_lpm_tree_replace(mlxsw_sp, + mlxsw_sp_vr_fib(vr, proto), + new_tree); + if (err) + goto err_tree_replace; + } + + memcpy(new_tree->prefix_ref_count, old_tree->prefix_ref_count, + sizeof(new_tree->prefix_ref_count)); + mlxsw_sp->router->lpm.proto_trees[proto] = new_tree; + mlxsw_sp_lpm_tree_put(mlxsw_sp, old_tree); + + return 0; + +err_tree_replace: + for (i--; i >= 0; i--) { + if (!mlxsw_sp_vr_lpm_tree_should_replace(vr, proto, new_id)) + continue; + mlxsw_sp_vr_lpm_tree_replace(mlxsw_sp, + mlxsw_sp_vr_fib(vr, proto), + old_tree); + } + return err; +} + +static int mlxsw_sp_vrs_init(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_vr *vr; + u64 max_vrs; + int i; + + if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_VRS)) + return -EIO; + + max_vrs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); + mlxsw_sp->router->vrs = kcalloc(max_vrs, sizeof(struct mlxsw_sp_vr), + GFP_KERNEL); + if (!mlxsw_sp->router->vrs) + return -ENOMEM; + + for (i = 0; i < max_vrs; i++) { + vr = &mlxsw_sp->router->vrs[i]; + vr->id = i; + } + + return 0; +} + +static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp); + +static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp) +{ + /* At this stage we're guaranteed not to have new incoming + * FIB notifications and the work queue is free from FIBs + * sitting on top of mlxsw netdevs. However, we can still + * have other FIBs queued. Flush the queue before flushing + * the device's tables. No need for locks, as we're the only + * writer. + */ + mlxsw_core_flush_owq(); + mlxsw_sp_router_fib_flush(mlxsw_sp); + kfree(mlxsw_sp->router->vrs); +} + +static struct net_device * +__mlxsw_sp_ipip_netdev_ul_dev_get(const struct net_device *ol_dev) +{ + struct ip_tunnel *tun = netdev_priv(ol_dev); + struct net *net = dev_net(ol_dev); + + return __dev_get_by_index(net, tun->parms.link); +} + +u32 mlxsw_sp_ipip_dev_ul_tb_id(const struct net_device *ol_dev) +{ + struct net_device *d = __mlxsw_sp_ipip_netdev_ul_dev_get(ol_dev); + + if (d) + return l3mdev_fib_table(d) ? : RT_TABLE_MAIN; + else + return l3mdev_fib_table(ol_dev) ? 
: RT_TABLE_MAIN; +} + +static struct mlxsw_sp_rif * +mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_rif_params *params, + struct netlink_ext_ack *extack); + +static struct mlxsw_sp_rif_ipip_lb * +mlxsw_sp_ipip_ol_ipip_lb_create(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_ipip_type ipipt, + struct net_device *ol_dev, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_rif_params_ipip_lb lb_params; + const struct mlxsw_sp_ipip_ops *ipip_ops; + struct mlxsw_sp_rif *rif; + + ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipipt]; + lb_params = (struct mlxsw_sp_rif_params_ipip_lb) { + .common.dev = ol_dev, + .common.lag = false, + .lb_config = ipip_ops->ol_loopback_config(mlxsw_sp, ol_dev), + }; + + rif = mlxsw_sp_rif_create(mlxsw_sp, &lb_params.common, extack); + if (IS_ERR(rif)) + return ERR_CAST(rif); + return container_of(rif, struct mlxsw_sp_rif_ipip_lb, common); +} + +static struct mlxsw_sp_ipip_entry * +mlxsw_sp_ipip_entry_alloc(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_ipip_type ipipt, + struct net_device *ol_dev) +{ + const struct mlxsw_sp_ipip_ops *ipip_ops; + struct mlxsw_sp_ipip_entry *ipip_entry; + struct mlxsw_sp_ipip_entry *ret = NULL; + + ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipipt]; + ipip_entry = kzalloc(sizeof(*ipip_entry), GFP_KERNEL); + if (!ipip_entry) + return ERR_PTR(-ENOMEM); + + ipip_entry->ol_lb = mlxsw_sp_ipip_ol_ipip_lb_create(mlxsw_sp, ipipt, + ol_dev, NULL); + if (IS_ERR(ipip_entry->ol_lb)) { + ret = ERR_CAST(ipip_entry->ol_lb); + goto err_ol_ipip_lb_create; + } + + ipip_entry->ipipt = ipipt; + ipip_entry->ol_dev = ol_dev; + + switch (ipip_ops->ul_proto) { + case MLXSW_SP_L3_PROTO_IPV4: + ipip_entry->parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev); + break; + case MLXSW_SP_L3_PROTO_IPV6: + WARN_ON(1); + break; + } + + return ipip_entry; + +err_ol_ipip_lb_create: + kfree(ipip_entry); + return ret; +} + +static void +mlxsw_sp_ipip_entry_dealloc(struct mlxsw_sp_ipip_entry *ipip_entry) +{ + mlxsw_sp_rif_destroy(&ipip_entry->ol_lb->common); + kfree(ipip_entry); +} + +static bool +mlxsw_sp_ipip_entry_saddr_matches(struct mlxsw_sp *mlxsw_sp, + const enum mlxsw_sp_l3proto ul_proto, + union mlxsw_sp_l3addr saddr, + u32 ul_tb_id, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + u32 tun_ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ipip_entry->ol_dev); + enum mlxsw_sp_ipip_type ipipt = ipip_entry->ipipt; + union mlxsw_sp_l3addr tun_saddr; + + if (mlxsw_sp->router->ipip_ops_arr[ipipt]->ul_proto != ul_proto) + return false; + + tun_saddr = mlxsw_sp_ipip_netdev_saddr(ul_proto, ipip_entry->ol_dev); + return tun_ul_tb_id == ul_tb_id && + mlxsw_sp_l3addr_eq(&tun_saddr, &saddr); +} + +static int +mlxsw_sp_fib_entry_decap_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + u32 tunnel_index; + int err; + + err = mlxsw_sp_kvdl_alloc(mlxsw_sp, 1, &tunnel_index); + if (err) + return err; + + ipip_entry->decap_fib_entry = fib_entry; + fib_entry->decap.ipip_entry = ipip_entry; + fib_entry->decap.tunnel_index = tunnel_index; + return 0; +} + +static void mlxsw_sp_fib_entry_decap_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) +{ + /* Unlink this node from the IPIP entry that it's the decap entry of. 
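mlxsw_sp_ipip_dev_ul_tb_id() above uses the GNU C "x ?: y" form, which yields x when x is nonzero and y otherwise, evaluating x only once; here it means "use the FIB table the device is bound to, or fall back to the main table". A small standalone illustration, where 254 stands in for RT_TABLE_MAIN:

#include <stdio.h>

static unsigned int ul_table_id(unsigned int bound_table)
{
        /* GNU extension: same as bound_table ? bound_table : 254,
         * but bound_table is evaluated only once
         */
        return bound_table ?: 254;
}

int main(void)
{
        printf("%u %u\n", ul_table_id(0), ul_table_id(10));     /* 254 10 */
        return 0;
}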
*/ + fib_entry->decap.ipip_entry->decap_fib_entry = NULL; + fib_entry->decap.ipip_entry = NULL; + mlxsw_sp_kvdl_free(mlxsw_sp, fib_entry->decap.tunnel_index); +} + +static struct mlxsw_sp_fib_node * +mlxsw_sp_fib_node_lookup(struct mlxsw_sp_fib *fib, const void *addr, + size_t addr_len, unsigned char prefix_len); +static int mlxsw_sp_fib_entry_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry); + +static void +mlxsw_sp_ipip_entry_demote_decap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + struct mlxsw_sp_fib_entry *fib_entry = ipip_entry->decap_fib_entry; + + mlxsw_sp_fib_entry_decap_fini(mlxsw_sp, fib_entry); + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP; + + mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry); +} + +static void +mlxsw_sp_ipip_entry_promote_decap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + struct mlxsw_sp_fib_entry *decap_fib_entry) +{ + if (mlxsw_sp_fib_entry_decap_init(mlxsw_sp, decap_fib_entry, + ipip_entry)) + return; + decap_fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP; + + if (mlxsw_sp_fib_entry_update(mlxsw_sp, decap_fib_entry)) + mlxsw_sp_ipip_entry_demote_decap(mlxsw_sp, ipip_entry); +} + +/* Given an IPIP entry, find the corresponding decap route. */ +static struct mlxsw_sp_fib_entry * +mlxsw_sp_ipip_entry_find_decap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + static struct mlxsw_sp_fib_node *fib_node; + const struct mlxsw_sp_ipip_ops *ipip_ops; + struct mlxsw_sp_fib_entry *fib_entry; + unsigned char saddr_prefix_len; + union mlxsw_sp_l3addr saddr; + struct mlxsw_sp_fib *ul_fib; + struct mlxsw_sp_vr *ul_vr; + const void *saddrp; + size_t saddr_len; + u32 ul_tb_id; + u32 saddr4; + + ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt]; + + ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ipip_entry->ol_dev); + ul_vr = mlxsw_sp_vr_find(mlxsw_sp, ul_tb_id); + if (!ul_vr) + return NULL; + + ul_fib = mlxsw_sp_vr_fib(ul_vr, ipip_ops->ul_proto); + saddr = mlxsw_sp_ipip_netdev_saddr(ipip_ops->ul_proto, + ipip_entry->ol_dev); + + switch (ipip_ops->ul_proto) { + case MLXSW_SP_L3_PROTO_IPV4: + saddr4 = be32_to_cpu(saddr.addr4); + saddrp = &saddr4; + saddr_len = 4; + saddr_prefix_len = 32; + break; + case MLXSW_SP_L3_PROTO_IPV6: + WARN_ON(1); + return NULL; + } + + fib_node = mlxsw_sp_fib_node_lookup(ul_fib, saddrp, saddr_len, + saddr_prefix_len); + if (!fib_node || list_empty(&fib_node->entry_list)) + return NULL; + + fib_entry = list_first_entry(&fib_node->entry_list, + struct mlxsw_sp_fib_entry, list); + if (fib_entry->type != MLXSW_SP_FIB_ENTRY_TYPE_TRAP) + return NULL; + + return fib_entry; +} + +static struct mlxsw_sp_ipip_entry * +mlxsw_sp_ipip_entry_create(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_ipip_type ipipt, + struct net_device *ol_dev) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + + ipip_entry = mlxsw_sp_ipip_entry_alloc(mlxsw_sp, ipipt, ol_dev); + if (IS_ERR(ipip_entry)) + return ipip_entry; + + list_add_tail(&ipip_entry->ipip_list_node, + &mlxsw_sp->router->ipip_list); + + return ipip_entry; +} + +static void +mlxsw_sp_ipip_entry_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + list_del(&ipip_entry->ipip_list_node); + mlxsw_sp_ipip_entry_dealloc(ipip_entry); +} + +static bool +mlxsw_sp_ipip_entry_matches_decap(struct mlxsw_sp *mlxsw_sp, + const struct net_device *ul_dev, + enum mlxsw_sp_l3proto ul_proto, + union mlxsw_sp_l3addr ul_dip, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + u32 ul_tb_id = 
l3mdev_fib_table(ul_dev) ? : RT_TABLE_MAIN; + enum mlxsw_sp_ipip_type ipipt = ipip_entry->ipipt; + struct net_device *ipip_ul_dev; + + if (mlxsw_sp->router->ipip_ops_arr[ipipt]->ul_proto != ul_proto) + return false; + + ipip_ul_dev = __mlxsw_sp_ipip_netdev_ul_dev_get(ipip_entry->ol_dev); + return mlxsw_sp_ipip_entry_saddr_matches(mlxsw_sp, ul_proto, ul_dip, + ul_tb_id, ipip_entry) && + (!ipip_ul_dev || ipip_ul_dev == ul_dev); +} + +/* Given decap parameters, find the corresponding IPIP entry. */ +static struct mlxsw_sp_ipip_entry * +mlxsw_sp_ipip_entry_find_by_decap(struct mlxsw_sp *mlxsw_sp, + const struct net_device *ul_dev, + enum mlxsw_sp_l3proto ul_proto, + union mlxsw_sp_l3addr ul_dip) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + + list_for_each_entry(ipip_entry, &mlxsw_sp->router->ipip_list, + ipip_list_node) + if (mlxsw_sp_ipip_entry_matches_decap(mlxsw_sp, ul_dev, + ul_proto, ul_dip, + ipip_entry)) + return ipip_entry; + + return NULL; +} + +static bool mlxsw_sp_netdev_ipip_type(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *dev, + enum mlxsw_sp_ipip_type *p_type) +{ + struct mlxsw_sp_router *router = mlxsw_sp->router; + const struct mlxsw_sp_ipip_ops *ipip_ops; + enum mlxsw_sp_ipip_type ipipt; + + for (ipipt = 0; ipipt < MLXSW_SP_IPIP_TYPE_MAX; ++ipipt) { + ipip_ops = router->ipip_ops_arr[ipipt]; + if (dev->type == ipip_ops->dev_type) { + if (p_type) + *p_type = ipipt; + return true; + } + } + return false; +} + +bool mlxsw_sp_netdev_is_ipip_ol(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *dev) +{ + return mlxsw_sp_netdev_ipip_type(mlxsw_sp, dev, NULL); +} + +static struct mlxsw_sp_ipip_entry * +mlxsw_sp_ipip_entry_find_by_ol_dev(struct mlxsw_sp *mlxsw_sp, + const struct net_device *ol_dev) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + + list_for_each_entry(ipip_entry, &mlxsw_sp->router->ipip_list, + ipip_list_node) + if (ipip_entry->ol_dev == ol_dev) + return ipip_entry; + + return NULL; +} + +static struct mlxsw_sp_ipip_entry * +mlxsw_sp_ipip_entry_find_by_ul_dev(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *ul_dev, + struct mlxsw_sp_ipip_entry *start) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + + ipip_entry = list_prepare_entry(start, &mlxsw_sp->router->ipip_list, + ipip_list_node); + list_for_each_entry_continue(ipip_entry, &mlxsw_sp->router->ipip_list, + ipip_list_node) { + struct net_device *ipip_ul_dev = + __mlxsw_sp_ipip_netdev_ul_dev_get(ipip_entry->ol_dev); + + if (ipip_ul_dev == ul_dev) + return ipip_entry; + } + + return NULL; +} + +bool mlxsw_sp_netdev_is_ipip_ul(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *dev) +{ + return mlxsw_sp_ipip_entry_find_by_ul_dev(mlxsw_sp, dev, NULL); +} + +static bool mlxsw_sp_netdevice_ipip_can_offload(struct mlxsw_sp *mlxsw_sp, + const struct net_device *ol_dev, + enum mlxsw_sp_ipip_type ipipt) +{ + const struct mlxsw_sp_ipip_ops *ops + = mlxsw_sp->router->ipip_ops_arr[ipipt]; + + /* For deciding whether decap should be offloaded, we don't care about + * overlay protocol, so ask whether either one is supported. 
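mlxsw_sp_ipip_entry_find_by_ul_dev() above is written as a resumable search: the previous match is passed back in as `start', so repeated calls visit every tunnel sharing one underlay device (mlxsw_sp_netdevice_ipip_ul_event() below loops exactly this way). A plain-C sketch of the same shape over a singly linked list, with illustrative names:

#include <stddef.h>
#include <stdio.h>

struct entry {
        int ul_ifindex;
        struct entry *next;
};

/* return the next matching entry after `start' (or from the head when
 * start is NULL), or NULL when there is none
 */
static struct entry *find_by_ul(struct entry *head, int ul_ifindex,
                                struct entry *start)
{
        struct entry *e = start ? start->next : head;

        for (; e; e = e->next)
                if (e->ul_ifindex == ul_ifindex)
                        return e;
        return NULL;
}

int main(void)
{
        struct entry c = { 7, NULL }, b = { 3, &c }, a = { 7, &b };
        struct entry *it = NULL;
        int n = 0;

        while ((it = find_by_ul(&a, 7, it)))
                n++;
        printf("%d\n", n);      /* prints 2 */
        return 0;
}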
+ */ + return ops->can_offload(mlxsw_sp, ol_dev, MLXSW_SP_L3_PROTO_IPV4) || + ops->can_offload(mlxsw_sp, ol_dev, MLXSW_SP_L3_PROTO_IPV6); +} + +static int mlxsw_sp_netdevice_ipip_ol_reg_event(struct mlxsw_sp *mlxsw_sp, + struct net_device *ol_dev) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + enum mlxsw_sp_l3proto ul_proto; + enum mlxsw_sp_ipip_type ipipt; + union mlxsw_sp_l3addr saddr; + u32 ul_tb_id; + + mlxsw_sp_netdev_ipip_type(mlxsw_sp, ol_dev, &ipipt); + if (mlxsw_sp_netdevice_ipip_can_offload(mlxsw_sp, ol_dev, ipipt)) { + ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ol_dev); + ul_proto = mlxsw_sp->router->ipip_ops_arr[ipipt]->ul_proto; + saddr = mlxsw_sp_ipip_netdev_saddr(ul_proto, ol_dev); + if (!mlxsw_sp_ipip_demote_tunnel_by_saddr(mlxsw_sp, ul_proto, + saddr, ul_tb_id, + NULL)) { + ipip_entry = mlxsw_sp_ipip_entry_create(mlxsw_sp, ipipt, + ol_dev); + if (IS_ERR(ipip_entry)) + return PTR_ERR(ipip_entry); + } + } + + return 0; +} + +static void mlxsw_sp_netdevice_ipip_ol_unreg_event(struct mlxsw_sp *mlxsw_sp, + struct net_device *ol_dev) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + + ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev); + if (ipip_entry) + mlxsw_sp_ipip_entry_destroy(mlxsw_sp, ipip_entry); +} + +static void +mlxsw_sp_ipip_entry_ol_up_event(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + struct mlxsw_sp_fib_entry *decap_fib_entry; + + decap_fib_entry = mlxsw_sp_ipip_entry_find_decap(mlxsw_sp, ipip_entry); + if (decap_fib_entry) + mlxsw_sp_ipip_entry_promote_decap(mlxsw_sp, ipip_entry, + decap_fib_entry); +} + +static int +mlxsw_sp_rif_ipip_lb_op(struct mlxsw_sp_rif_ipip_lb *lb_rif, + struct mlxsw_sp_vr *ul_vr, bool enable) +{ + struct mlxsw_sp_rif_ipip_lb_config lb_cf = lb_rif->lb_config; + struct mlxsw_sp_rif *rif = &lb_rif->common; + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + char ritr_pl[MLXSW_REG_RITR_LEN]; + u32 saddr4; + + switch (lb_cf.ul_protocol) { + case MLXSW_SP_L3_PROTO_IPV4: + saddr4 = be32_to_cpu(lb_cf.saddr.addr4); + mlxsw_reg_ritr_pack(ritr_pl, enable, MLXSW_REG_RITR_LOOPBACK_IF, + rif->rif_index, rif->vr_id, rif->dev->mtu); + mlxsw_reg_ritr_loopback_ipip4_pack(ritr_pl, lb_cf.lb_ipipt, + MLXSW_REG_RITR_LOOPBACK_IPIP_OPTIONS_GRE_KEY_PRESET, + ul_vr->id, saddr4, lb_cf.okey); + break; + + case MLXSW_SP_L3_PROTO_IPV6: + return -EAFNOSUPPORT; + } + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); +} + +static int mlxsw_sp_netdevice_ipip_ol_update_mtu(struct mlxsw_sp *mlxsw_sp, + struct net_device *ol_dev) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + struct mlxsw_sp_rif_ipip_lb *lb_rif; + struct mlxsw_sp_vr *ul_vr; + int err = 0; + + ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev); + if (ipip_entry) { + lb_rif = ipip_entry->ol_lb; + ul_vr = &mlxsw_sp->router->vrs[lb_rif->ul_vr_id]; + err = mlxsw_sp_rif_ipip_lb_op(lb_rif, ul_vr, true); + if (err) + goto out; + lb_rif->common.mtu = ol_dev->mtu; + } + +out: + return err; +} + +static void mlxsw_sp_netdevice_ipip_ol_up_event(struct mlxsw_sp *mlxsw_sp, + struct net_device *ol_dev) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + + ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev); + if (ipip_entry) + mlxsw_sp_ipip_entry_ol_up_event(mlxsw_sp, ipip_entry); +} + +static void +mlxsw_sp_ipip_entry_ol_down_event(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + if (ipip_entry->decap_fib_entry) + mlxsw_sp_ipip_entry_demote_decap(mlxsw_sp, ipip_entry); +} + +static void 
mlxsw_sp_netdevice_ipip_ol_down_event(struct mlxsw_sp *mlxsw_sp, + struct net_device *ol_dev) +{ + struct mlxsw_sp_ipip_entry *ipip_entry; + + ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev); + if (ipip_entry) + mlxsw_sp_ipip_entry_ol_down_event(mlxsw_sp, ipip_entry); +} + +static void mlxsw_sp_nexthop_rif_migrate(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *old_rif, + struct mlxsw_sp_rif *new_rif); +static int +mlxsw_sp_ipip_entry_ol_lb_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + bool keep_encap, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_rif_ipip_lb *old_lb_rif = ipip_entry->ol_lb; + struct mlxsw_sp_rif_ipip_lb *new_lb_rif; + + new_lb_rif = mlxsw_sp_ipip_ol_ipip_lb_create(mlxsw_sp, + ipip_entry->ipipt, + ipip_entry->ol_dev, + extack); + if (IS_ERR(new_lb_rif)) + return PTR_ERR(new_lb_rif); + ipip_entry->ol_lb = new_lb_rif; + + if (keep_encap) + mlxsw_sp_nexthop_rif_migrate(mlxsw_sp, &old_lb_rif->common, + &new_lb_rif->common); + + mlxsw_sp_rif_destroy(&old_lb_rif->common); + + return 0; +} + +static void mlxsw_sp_nexthop_rif_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif); + +/** + * Update the offload related to an IPIP entry. This always updates decap, and + * in addition to that it also: + * @recreate_loopback: recreates the associated loopback RIF + * @keep_encap: updates next hops that use the tunnel netdevice. This is only + * relevant when recreate_loopback is true. + * @update_nexthops: updates next hops, keeping the current loopback RIF. This + * is only relevant when recreate_loopback is false. + */ +int __mlxsw_sp_ipip_entry_update_tunnel(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + bool recreate_loopback, + bool keep_encap, + bool update_nexthops, + struct netlink_ext_ack *extack) +{ + int err; + + /* RIFs can't be edited, so to update loopback, we need to destroy and + * recreate it. That creates a window of opportunity where RALUE and + * RATR registers end up referencing a RIF that's already gone. RATRs + * are handled in mlxsw_sp_ipip_entry_ol_lb_update(), and to take care + * of RALUE, demote the decap route back. + */ + if (ipip_entry->decap_fib_entry) + mlxsw_sp_ipip_entry_demote_decap(mlxsw_sp, ipip_entry); + + if (recreate_loopback) { + err = mlxsw_sp_ipip_entry_ol_lb_update(mlxsw_sp, ipip_entry, + keep_encap, extack); + if (err) + return err; + } else if (update_nexthops) { + mlxsw_sp_nexthop_rif_update(mlxsw_sp, + &ipip_entry->ol_lb->common); + } + + if (ipip_entry->ol_dev->flags & IFF_UP) + mlxsw_sp_ipip_entry_ol_up_event(mlxsw_sp, ipip_entry); + + return 0; +} + +static int mlxsw_sp_netdevice_ipip_ol_vrf_event(struct mlxsw_sp *mlxsw_sp, + struct net_device *ol_dev, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_ipip_entry *ipip_entry = + mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev); + enum mlxsw_sp_l3proto ul_proto; + union mlxsw_sp_l3addr saddr; + u32 ul_tb_id; + + if (!ipip_entry) + return 0; + + /* For flat configuration cases, moving overlay to a different VRF might + * cause local address conflict, and the conflicting tunnels need to be + * demoted. 
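The three booleans taken by __mlxsw_sp_ipip_entry_update_tunnel() above are driven by the event handlers that follow: moving the overlay device into a VRF recreates only the loopback, moving the underlay recreates it while migrating encap next hops, and underlay up/down merely refreshes next hops. A compact restatement of those combinations (a sketch mirroring the calls below, not a driver structure):

#include <stdbool.h>

struct ipip_update_args {
        bool recreate_loopback;
        bool keep_encap;
        bool update_nexthops;
};

/* overlay device moved into a VRF */
static const struct ipip_update_args ol_vrf_move = { true, false, false };
/* underlay device moved into a VRF */
static const struct ipip_update_args ul_vrf_move = { true, true, false };
/* underlay device went up or down */
static const struct ipip_update_args ul_up_down = { false, false, true };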
+ */ + ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ol_dev); + ul_proto = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt]->ul_proto; + saddr = mlxsw_sp_ipip_netdev_saddr(ul_proto, ol_dev); + if (mlxsw_sp_ipip_demote_tunnel_by_saddr(mlxsw_sp, ul_proto, + saddr, ul_tb_id, + ipip_entry)) { + mlxsw_sp_ipip_entry_demote_tunnel(mlxsw_sp, ipip_entry); + return 0; + } + + return __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry, + true, false, false, extack); +} + +static int +mlxsw_sp_netdevice_ipip_ul_vrf_event(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + struct net_device *ul_dev, + struct netlink_ext_ack *extack) +{ + return __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry, + true, true, false, extack); +} + +static int +mlxsw_sp_netdevice_ipip_ul_up_event(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + struct net_device *ul_dev) +{ + return __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry, + false, false, true, NULL); +} + +static int +mlxsw_sp_netdevice_ipip_ul_down_event(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + struct net_device *ul_dev) +{ + /* A down underlay device causes encapsulated packets to not be + * forwarded, but decap still works. So refresh next hops without + * touching anything else. + */ + return __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry, + false, false, true, NULL); +} + +static int +mlxsw_sp_netdevice_ipip_ol_change_event(struct mlxsw_sp *mlxsw_sp, + struct net_device *ol_dev, + struct netlink_ext_ack *extack) +{ + const struct mlxsw_sp_ipip_ops *ipip_ops; + struct mlxsw_sp_ipip_entry *ipip_entry; + int err; + + ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev); + if (!ipip_entry) + /* A change might make a tunnel eligible for offloading, but + * that is currently not implemented. What falls to slow path + * stays there. + */ + return 0; + + /* A change might make a tunnel not eligible for offloading. */ + if (!mlxsw_sp_netdevice_ipip_can_offload(mlxsw_sp, ol_dev, + ipip_entry->ipipt)) { + mlxsw_sp_ipip_entry_demote_tunnel(mlxsw_sp, ipip_entry); + return 0; + } + + ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt]; + err = ipip_ops->ol_netdev_change(mlxsw_sp, ipip_entry, extack); + return err; +} + +void mlxsw_sp_ipip_entry_demote_tunnel(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + struct net_device *ol_dev = ipip_entry->ol_dev; + + if (ol_dev->flags & IFF_UP) + mlxsw_sp_ipip_entry_ol_down_event(mlxsw_sp, ipip_entry); + mlxsw_sp_ipip_entry_destroy(mlxsw_sp, ipip_entry); +} + +/* The configuration where several tunnels have the same local address in the + * same underlay table needs special treatment in the HW. That is currently not + * implemented in the driver. This function finds and demotes the first tunnel + * with a given source address, except the one passed in in the argument + * `except'. 
+ */ +bool +mlxsw_sp_ipip_demote_tunnel_by_saddr(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_l3proto ul_proto, + union mlxsw_sp_l3addr saddr, + u32 ul_tb_id, + const struct mlxsw_sp_ipip_entry *except) +{ + struct mlxsw_sp_ipip_entry *ipip_entry, *tmp; + + list_for_each_entry_safe(ipip_entry, tmp, &mlxsw_sp->router->ipip_list, + ipip_list_node) { + if (ipip_entry != except && + mlxsw_sp_ipip_entry_saddr_matches(mlxsw_sp, ul_proto, saddr, + ul_tb_id, ipip_entry)) { + mlxsw_sp_ipip_entry_demote_tunnel(mlxsw_sp, ipip_entry); + return true; + } + } + + return false; +} + +static void mlxsw_sp_ipip_demote_tunnel_by_ul_netdev(struct mlxsw_sp *mlxsw_sp, + struct net_device *ul_dev) +{ + struct mlxsw_sp_ipip_entry *ipip_entry, *tmp; + + list_for_each_entry_safe(ipip_entry, tmp, &mlxsw_sp->router->ipip_list, + ipip_list_node) { + struct net_device *ipip_ul_dev = + __mlxsw_sp_ipip_netdev_ul_dev_get(ipip_entry->ol_dev); + + if (ipip_ul_dev == ul_dev) + mlxsw_sp_ipip_entry_demote_tunnel(mlxsw_sp, ipip_entry); + } +} + +int mlxsw_sp_netdevice_ipip_ol_event(struct mlxsw_sp *mlxsw_sp, + struct net_device *ol_dev, + unsigned long event, + struct netdev_notifier_info *info) +{ + struct netdev_notifier_changeupper_info *chup; + struct netlink_ext_ack *extack; + + switch (event) { + case NETDEV_REGISTER: + return mlxsw_sp_netdevice_ipip_ol_reg_event(mlxsw_sp, ol_dev); + case NETDEV_UNREGISTER: + mlxsw_sp_netdevice_ipip_ol_unreg_event(mlxsw_sp, ol_dev); + return 0; + case NETDEV_UP: + mlxsw_sp_netdevice_ipip_ol_up_event(mlxsw_sp, ol_dev); + return 0; + case NETDEV_DOWN: + mlxsw_sp_netdevice_ipip_ol_down_event(mlxsw_sp, ol_dev); + return 0; + case NETDEV_CHANGEUPPER: + chup = container_of(info, typeof(*chup), info); + extack = info->extack; + if (netif_is_l3_master(chup->upper_dev)) + return mlxsw_sp_netdevice_ipip_ol_vrf_event(mlxsw_sp, + ol_dev, + extack); + return 0; + case NETDEV_CHANGE: + extack = info->extack; + return mlxsw_sp_netdevice_ipip_ol_change_event(mlxsw_sp, + ol_dev, extack); + case NETDEV_CHANGEMTU: + return mlxsw_sp_netdevice_ipip_ol_update_mtu(mlxsw_sp, ol_dev); + } + return 0; +} + +static int +__mlxsw_sp_netdevice_ipip_ul_event(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + struct net_device *ul_dev, + unsigned long event, + struct netdev_notifier_info *info) +{ + struct netdev_notifier_changeupper_info *chup; + struct netlink_ext_ack *extack; + + switch (event) { + case NETDEV_CHANGEUPPER: + chup = container_of(info, typeof(*chup), info); + extack = info->extack; + if (netif_is_l3_master(chup->upper_dev)) + return mlxsw_sp_netdevice_ipip_ul_vrf_event(mlxsw_sp, + ipip_entry, + ul_dev, + extack); + break; + + case NETDEV_UP: + return mlxsw_sp_netdevice_ipip_ul_up_event(mlxsw_sp, ipip_entry, + ul_dev); + case NETDEV_DOWN: + return mlxsw_sp_netdevice_ipip_ul_down_event(mlxsw_sp, + ipip_entry, + ul_dev); + } + return 0; +} + +int +mlxsw_sp_netdevice_ipip_ul_event(struct mlxsw_sp *mlxsw_sp, + struct net_device *ul_dev, + unsigned long event, + struct netdev_notifier_info *info) +{ + struct mlxsw_sp_ipip_entry *ipip_entry = NULL; + int err; + + while ((ipip_entry = mlxsw_sp_ipip_entry_find_by_ul_dev(mlxsw_sp, + ul_dev, + ipip_entry))) { + err = __mlxsw_sp_netdevice_ipip_ul_event(mlxsw_sp, ipip_entry, + ul_dev, event, info); + if (err) { + mlxsw_sp_ipip_demote_tunnel_by_ul_netdev(mlxsw_sp, + ul_dev); + return err; + } + } + + return 0; +} + +struct mlxsw_sp_neigh_key { + struct neighbour *n; +}; + +struct mlxsw_sp_neigh_entry { + struct list_head rif_list_node; + struct 
rhash_head ht_node; + struct mlxsw_sp_neigh_key key; + u16 rif; + bool connected; + unsigned char ha[ETH_ALEN]; + struct list_head nexthop_list; /* list of nexthops using + * this neigh entry + */ + struct list_head nexthop_neighs_list_node; + unsigned int counter_index; + bool counter_valid; +}; + +static const struct rhashtable_params mlxsw_sp_neigh_ht_params = { + .key_offset = offsetof(struct mlxsw_sp_neigh_entry, key), + .head_offset = offsetof(struct mlxsw_sp_neigh_entry, ht_node), + .key_len = sizeof(struct mlxsw_sp_neigh_key), +}; + +struct mlxsw_sp_neigh_entry * +mlxsw_sp_rif_neigh_next(struct mlxsw_sp_rif *rif, + struct mlxsw_sp_neigh_entry *neigh_entry) +{ + if (!neigh_entry) { + if (list_empty(&rif->neigh_list)) + return NULL; + else + return list_first_entry(&rif->neigh_list, + typeof(*neigh_entry), + rif_list_node); + } + if (list_is_last(&neigh_entry->rif_list_node, &rif->neigh_list)) + return NULL; + return list_next_entry(neigh_entry, rif_list_node); +} + +int mlxsw_sp_neigh_entry_type(struct mlxsw_sp_neigh_entry *neigh_entry) +{ + return neigh_entry->key.n->tbl->family; +} + +unsigned char * +mlxsw_sp_neigh_entry_ha(struct mlxsw_sp_neigh_entry *neigh_entry) +{ + return neigh_entry->ha; +} + +u32 mlxsw_sp_neigh4_entry_dip(struct mlxsw_sp_neigh_entry *neigh_entry) +{ + struct neighbour *n; + + n = neigh_entry->key.n; + return ntohl(*((__be32 *) n->primary_key)); +} + +struct in6_addr * +mlxsw_sp_neigh6_entry_dip(struct mlxsw_sp_neigh_entry *neigh_entry) +{ + struct neighbour *n; + + n = neigh_entry->key.n; + return (struct in6_addr *) &n->primary_key; +} + +int mlxsw_sp_neigh_counter_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + u64 *p_counter) +{ + if (!neigh_entry->counter_valid) + return -EINVAL; + + return mlxsw_sp_flow_counter_get(mlxsw_sp, neigh_entry->counter_index, + p_counter, NULL); +} + +static struct mlxsw_sp_neigh_entry * +mlxsw_sp_neigh_entry_alloc(struct mlxsw_sp *mlxsw_sp, struct neighbour *n, + u16 rif) +{ + struct mlxsw_sp_neigh_entry *neigh_entry; + + neigh_entry = kzalloc(sizeof(*neigh_entry), GFP_KERNEL); + if (!neigh_entry) + return NULL; + + neigh_entry->key.n = n; + neigh_entry->rif = rif; + INIT_LIST_HEAD(&neigh_entry->nexthop_list); + + return neigh_entry; +} + +static void mlxsw_sp_neigh_entry_free(struct mlxsw_sp_neigh_entry *neigh_entry) +{ + kfree(neigh_entry); +} + +static int +mlxsw_sp_neigh_entry_insert(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry) +{ + return rhashtable_insert_fast(&mlxsw_sp->router->neigh_ht, + &neigh_entry->ht_node, + mlxsw_sp_neigh_ht_params); +} + +static void +mlxsw_sp_neigh_entry_remove(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry) +{ + rhashtable_remove_fast(&mlxsw_sp->router->neigh_ht, + &neigh_entry->ht_node, + mlxsw_sp_neigh_ht_params); +} + +static bool +mlxsw_sp_neigh_counter_should_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry) +{ + struct devlink *devlink; + const char *table_name; + + switch (mlxsw_sp_neigh_entry_type(neigh_entry)) { + case AF_INET: + table_name = MLXSW_SP_DPIPE_TABLE_NAME_HOST4; + break; + case AF_INET6: + table_name = MLXSW_SP_DPIPE_TABLE_NAME_HOST6; + break; + default: + WARN_ON(1); + return false; + } + + devlink = priv_to_devlink(mlxsw_sp->core); + return devlink_dpipe_table_counter_enabled(devlink, table_name); +} + +static void +mlxsw_sp_neigh_counter_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry) +{ + if 
(!mlxsw_sp_neigh_counter_should_alloc(mlxsw_sp, neigh_entry)) + return; + + if (mlxsw_sp_flow_counter_alloc(mlxsw_sp, &neigh_entry->counter_index)) + return; + + neigh_entry->counter_valid = true; +} + +static void +mlxsw_sp_neigh_counter_free(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry) +{ + if (!neigh_entry->counter_valid) + return; + mlxsw_sp_flow_counter_free(mlxsw_sp, + neigh_entry->counter_index); + neigh_entry->counter_valid = false; +} + +static struct mlxsw_sp_neigh_entry * +mlxsw_sp_neigh_entry_create(struct mlxsw_sp *mlxsw_sp, struct neighbour *n) +{ + struct mlxsw_sp_neigh_entry *neigh_entry; + struct mlxsw_sp_rif *rif; + int err; + + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, n->dev); + if (!rif) + return ERR_PTR(-EINVAL); + + neigh_entry = mlxsw_sp_neigh_entry_alloc(mlxsw_sp, n, rif->rif_index); + if (!neigh_entry) + return ERR_PTR(-ENOMEM); + + err = mlxsw_sp_neigh_entry_insert(mlxsw_sp, neigh_entry); + if (err) + goto err_neigh_entry_insert; + + mlxsw_sp_neigh_counter_alloc(mlxsw_sp, neigh_entry); + list_add(&neigh_entry->rif_list_node, &rif->neigh_list); + + return neigh_entry; + +err_neigh_entry_insert: + mlxsw_sp_neigh_entry_free(neigh_entry); + return ERR_PTR(err); +} + +static void +mlxsw_sp_neigh_entry_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry) +{ + list_del(&neigh_entry->rif_list_node); + mlxsw_sp_neigh_counter_free(mlxsw_sp, neigh_entry); + mlxsw_sp_neigh_entry_remove(mlxsw_sp, neigh_entry); + mlxsw_sp_neigh_entry_free(neigh_entry); +} + +static struct mlxsw_sp_neigh_entry * +mlxsw_sp_neigh_entry_lookup(struct mlxsw_sp *mlxsw_sp, struct neighbour *n) +{ + struct mlxsw_sp_neigh_key key; + + key.n = n; + return rhashtable_lookup_fast(&mlxsw_sp->router->neigh_ht, + &key, mlxsw_sp_neigh_ht_params); +} + +static void +mlxsw_sp_router_neighs_update_interval_init(struct mlxsw_sp *mlxsw_sp) +{ + unsigned long interval; + +#if IS_ENABLED(CONFIG_IPV6) + interval = min_t(unsigned long, + NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME), + NEIGH_VAR(&nd_tbl.parms, DELAY_PROBE_TIME)); +#else + interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME); +#endif + mlxsw_sp->router->neighs_update.interval = jiffies_to_msecs(interval); +} + +static void mlxsw_sp_router_neigh_ent_ipv4_process(struct mlxsw_sp *mlxsw_sp, + char *rauhtd_pl, + int ent_index) +{ + struct net_device *dev; + struct neighbour *n; + __be32 dipn; + u32 dip; + u16 rif; + + mlxsw_reg_rauhtd_ent_ipv4_unpack(rauhtd_pl, ent_index, &rif, &dip); + + if (!mlxsw_sp->router->rifs[rif]) { + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Incorrect RIF in neighbour entry\n"); + return; + } + + dipn = htonl(dip); + dev = mlxsw_sp->router->rifs[rif]->dev; + n = neigh_lookup(&arp_tbl, &dipn, dev); + if (!n) + return; + + netdev_dbg(dev, "Updating neighbour with IP=%pI4h\n", &dip); + neigh_event_send(n, NULL); + neigh_release(n); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void mlxsw_sp_router_neigh_ent_ipv6_process(struct mlxsw_sp *mlxsw_sp, + char *rauhtd_pl, + int rec_index) +{ + struct net_device *dev; + struct neighbour *n; + struct in6_addr dip; + u16 rif; + + mlxsw_reg_rauhtd_ent_ipv6_unpack(rauhtd_pl, rec_index, &rif, + (char *) &dip); + + if (!mlxsw_sp->router->rifs[rif]) { + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Incorrect RIF in neighbour entry\n"); + return; + } + + dev = mlxsw_sp->router->rifs[rif]->dev; + n = neigh_lookup(&nd_tbl, &dip, dev); + if (!n) + return; + + netdev_dbg(dev, "Updating neighbour with IP=%pI6c\n", &dip); + neigh_event_send(n, NULL); + 
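The polling interval set up in mlxsw_sp_router_neighs_update_interval_init() above is simply the smaller of the IPv4 and IPv6 DELAY_PROBE_TIME values, converted from jiffies to milliseconds. A userspace sketch of that computation, assuming an illustrative HZ of 250 (the plain multiply below matches jiffies_to_msecs() only when HZ divides 1000):

#include <stdio.h>

#define HZ 250  /* illustrative, not a kernel configuration value */

static unsigned long jiffies_to_msecs_sketch(unsigned long j)
{
        return j * (1000 / HZ);
}

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long arp_delay = 5 * HZ;       /* 5 s in jiffies */
        unsigned long nd_delay = 3 * HZ;        /* 3 s in jiffies */

        printf("%lu\n", jiffies_to_msecs_sketch(min_ul(arp_delay, nd_delay)));
        return 0;       /* prints 3000 */
}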
neigh_release(n); +} +#else +static void mlxsw_sp_router_neigh_ent_ipv6_process(struct mlxsw_sp *mlxsw_sp, + char *rauhtd_pl, + int rec_index) +{ +} +#endif + +static void mlxsw_sp_router_neigh_rec_ipv4_process(struct mlxsw_sp *mlxsw_sp, + char *rauhtd_pl, + int rec_index) +{ + u8 num_entries; + int i; + + num_entries = mlxsw_reg_rauhtd_ipv4_rec_num_entries_get(rauhtd_pl, + rec_index); + /* Hardware starts counting at 0, so add 1. */ + num_entries++; + + /* Each record consists of several neighbour entries. */ + for (i = 0; i < num_entries; i++) { + int ent_index; + + ent_index = rec_index * MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC + i; + mlxsw_sp_router_neigh_ent_ipv4_process(mlxsw_sp, rauhtd_pl, + ent_index); + } + +} + +static void mlxsw_sp_router_neigh_rec_ipv6_process(struct mlxsw_sp *mlxsw_sp, + char *rauhtd_pl, + int rec_index) +{ + /* One record contains one entry. */ + mlxsw_sp_router_neigh_ent_ipv6_process(mlxsw_sp, rauhtd_pl, + rec_index); +} + +static void mlxsw_sp_router_neigh_rec_process(struct mlxsw_sp *mlxsw_sp, + char *rauhtd_pl, int rec_index) +{ + switch (mlxsw_reg_rauhtd_rec_type_get(rauhtd_pl, rec_index)) { + case MLXSW_REG_RAUHTD_TYPE_IPV4: + mlxsw_sp_router_neigh_rec_ipv4_process(mlxsw_sp, rauhtd_pl, + rec_index); + break; + case MLXSW_REG_RAUHTD_TYPE_IPV6: + mlxsw_sp_router_neigh_rec_ipv6_process(mlxsw_sp, rauhtd_pl, + rec_index); + break; + } +} + +static bool mlxsw_sp_router_rauhtd_is_full(char *rauhtd_pl) +{ + u8 num_rec, last_rec_index, num_entries; + + num_rec = mlxsw_reg_rauhtd_num_rec_get(rauhtd_pl); + last_rec_index = num_rec - 1; + + if (num_rec < MLXSW_REG_RAUHTD_REC_MAX_NUM) + return false; + if (mlxsw_reg_rauhtd_rec_type_get(rauhtd_pl, last_rec_index) == + MLXSW_REG_RAUHTD_TYPE_IPV6) + return true; + + num_entries = mlxsw_reg_rauhtd_ipv4_rec_num_entries_get(rauhtd_pl, + last_rec_index); + if (++num_entries == MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC) + return true; + return false; +} + +static int +__mlxsw_sp_router_neighs_update_rauhtd(struct mlxsw_sp *mlxsw_sp, + char *rauhtd_pl, + enum mlxsw_reg_rauhtd_type type) +{ + int i, num_rec; + int err; + + /* Make sure the neighbour's netdev isn't removed in the + * process. 
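The dump processing above flattens (record, entry) pairs into one entry index and compensates for the hardware reporting entry counts starting at zero. A tiny sketch of that arithmetic; the 4 entries per record below is only an illustrative stand-in for MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC:

#include <stdio.h>

#define ENT_PER_REC 4   /* illustrative stand-in, not the register definition */

static int flat_entry_index(int rec_index, int i)
{
        return rec_index * ENT_PER_REC + i;
}

int main(void)
{
        int hw_num_entries = 2;                 /* hardware encodes "3" as 2 */
        int num_entries = hw_num_entries + 1;   /* hence the "+ 1" above */
        int i;

        for (i = 0; i < num_entries; i++)
                printf("%d ", flat_entry_index(1, i));  /* 4 5 6 */
        printf("\n");
        return 0;
}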
+ */ + rtnl_lock(); + do { + mlxsw_reg_rauhtd_pack(rauhtd_pl, type); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(rauhtd), + rauhtd_pl); + if (err) { + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to dump neighbour table\n"); + break; + } + num_rec = mlxsw_reg_rauhtd_num_rec_get(rauhtd_pl); + for (i = 0; i < num_rec; i++) + mlxsw_sp_router_neigh_rec_process(mlxsw_sp, rauhtd_pl, + i); + } while (mlxsw_sp_router_rauhtd_is_full(rauhtd_pl)); + rtnl_unlock(); + + return err; +} + +static int mlxsw_sp_router_neighs_update_rauhtd(struct mlxsw_sp *mlxsw_sp) +{ + enum mlxsw_reg_rauhtd_type type; + char *rauhtd_pl; + int err; + + rauhtd_pl = kmalloc(MLXSW_REG_RAUHTD_LEN, GFP_KERNEL); + if (!rauhtd_pl) + return -ENOMEM; + + type = MLXSW_REG_RAUHTD_TYPE_IPV4; + err = __mlxsw_sp_router_neighs_update_rauhtd(mlxsw_sp, rauhtd_pl, type); + if (err) + goto out; + + type = MLXSW_REG_RAUHTD_TYPE_IPV6; + err = __mlxsw_sp_router_neighs_update_rauhtd(mlxsw_sp, rauhtd_pl, type); +out: + kfree(rauhtd_pl); + return err; +} + +static void mlxsw_sp_router_neighs_update_nh(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_neigh_entry *neigh_entry; + + /* Take RTNL mutex here to prevent lists from changes */ + rtnl_lock(); + list_for_each_entry(neigh_entry, &mlxsw_sp->router->nexthop_neighs_list, + nexthop_neighs_list_node) + /* If this neigh have nexthops, make the kernel think this neigh + * is active regardless of the traffic. + */ + neigh_event_send(neigh_entry->key.n, NULL); + rtnl_unlock(); +} + +static void +mlxsw_sp_router_neighs_update_work_schedule(struct mlxsw_sp *mlxsw_sp) +{ + unsigned long interval = mlxsw_sp->router->neighs_update.interval; + + mlxsw_core_schedule_dw(&mlxsw_sp->router->neighs_update.dw, + msecs_to_jiffies(interval)); +} + +static void mlxsw_sp_router_neighs_update_work(struct work_struct *work) +{ + struct mlxsw_sp_router *router; + int err; + + router = container_of(work, struct mlxsw_sp_router, + neighs_update.dw.work); + err = mlxsw_sp_router_neighs_update_rauhtd(router->mlxsw_sp); + if (err) + dev_err(router->mlxsw_sp->bus_info->dev, "Could not update kernel for neigh activity"); + + mlxsw_sp_router_neighs_update_nh(router->mlxsw_sp); + + mlxsw_sp_router_neighs_update_work_schedule(router->mlxsw_sp); +} + +static void mlxsw_sp_router_probe_unresolved_nexthops(struct work_struct *work) +{ + struct mlxsw_sp_neigh_entry *neigh_entry; + struct mlxsw_sp_router *router; + + router = container_of(work, struct mlxsw_sp_router, + nexthop_probe_dw.work); + /* Iterate over nexthop neighbours, find those who are unresolved and + * send arp on them. This solves the chicken-egg problem when + * the nexthop wouldn't get offloaded until the neighbor is resolved + * but it wouldn't get resolved ever in case traffic is flowing in HW + * using different nexthop. + * + * Take RTNL mutex here to prevent lists from changes. + */ + rtnl_lock(); + list_for_each_entry(neigh_entry, &router->nexthop_neighs_list, + nexthop_neighs_list_node) + if (!neigh_entry->connected) + neigh_event_send(neigh_entry->key.n, NULL); + rtnl_unlock(); + + mlxsw_core_schedule_dw(&router->nexthop_probe_dw, + MLXSW_SP_UNRESOLVED_NH_PROBE_INTERVAL); +} + +static void +mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + bool removing); + +static enum mlxsw_reg_rauht_op mlxsw_sp_rauht_op(bool adding) +{ + return adding ? 
MLXSW_REG_RAUHT_OP_WRITE_ADD : + MLXSW_REG_RAUHT_OP_WRITE_DELETE; +} + +static void +mlxsw_sp_router_neigh_entry_op4(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + enum mlxsw_reg_rauht_op op) +{ + struct neighbour *n = neigh_entry->key.n; + u32 dip = ntohl(*((__be32 *) n->primary_key)); + char rauht_pl[MLXSW_REG_RAUHT_LEN]; + + mlxsw_reg_rauht_pack4(rauht_pl, op, neigh_entry->rif, neigh_entry->ha, + dip); + if (neigh_entry->counter_valid) + mlxsw_reg_rauht_pack_counter(rauht_pl, + neigh_entry->counter_index); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rauht), rauht_pl); +} + +static void +mlxsw_sp_router_neigh_entry_op6(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + enum mlxsw_reg_rauht_op op) +{ + struct neighbour *n = neigh_entry->key.n; + char rauht_pl[MLXSW_REG_RAUHT_LEN]; + const char *dip = n->primary_key; + + mlxsw_reg_rauht_pack6(rauht_pl, op, neigh_entry->rif, neigh_entry->ha, + dip); + if (neigh_entry->counter_valid) + mlxsw_reg_rauht_pack_counter(rauht_pl, + neigh_entry->counter_index); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rauht), rauht_pl); +} + +bool mlxsw_sp_neigh_ipv6_ignore(struct mlxsw_sp_neigh_entry *neigh_entry) +{ + struct neighbour *n = neigh_entry->key.n; + + /* Packets with a link-local destination address are trapped + * after LPM lookup and never reach the neighbour table, so + * there is no need to program such neighbours to the device. + */ + if (ipv6_addr_type((struct in6_addr *) &n->primary_key) & + IPV6_ADDR_LINKLOCAL) + return true; + return false; +} + +static void +mlxsw_sp_neigh_entry_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + bool adding) +{ + if (!adding && !neigh_entry->connected) + return; + neigh_entry->connected = adding; + if (neigh_entry->key.n->tbl->family == AF_INET) { + mlxsw_sp_router_neigh_entry_op4(mlxsw_sp, neigh_entry, + mlxsw_sp_rauht_op(adding)); + } else if (neigh_entry->key.n->tbl->family == AF_INET6) { + if (mlxsw_sp_neigh_ipv6_ignore(neigh_entry)) + return; + mlxsw_sp_router_neigh_entry_op6(mlxsw_sp, neigh_entry, + mlxsw_sp_rauht_op(adding)); + } else { + WARN_ON_ONCE(1); + } +} + +void +mlxsw_sp_neigh_entry_counter_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + bool adding) +{ + if (adding) + mlxsw_sp_neigh_counter_alloc(mlxsw_sp, neigh_entry); + else + mlxsw_sp_neigh_counter_free(mlxsw_sp, neigh_entry); + mlxsw_sp_neigh_entry_update(mlxsw_sp, neigh_entry, true); +} + +struct mlxsw_sp_netevent_work { + struct work_struct work; + struct mlxsw_sp *mlxsw_sp; + struct neighbour *n; +}; + +static void mlxsw_sp_router_neigh_event_work(struct work_struct *work) +{ + struct mlxsw_sp_netevent_work *net_work = + container_of(work, struct mlxsw_sp_netevent_work, work); + struct mlxsw_sp *mlxsw_sp = net_work->mlxsw_sp; + struct mlxsw_sp_neigh_entry *neigh_entry; + struct neighbour *n = net_work->n; + unsigned char ha[ETH_ALEN]; + bool entry_connected; + u8 nud_state, dead; + + /* If these parameters are changed after we release the lock, + * then we are guaranteed to receive another event letting us + * know about it. 
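mlxsw_sp_neigh_ipv6_ignore() above skips fe80::/10 destinations because such packets are trapped after the LPM lookup and never need a programmed neighbour. A userspace sketch of the same link-local test, without the kernel's ipv6_addr_type() helper:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <stdio.h>

static bool ipv6_is_link_local(const struct in6_addr *a)
{
        /* link-local unicast: the first 10 bits are 1111 1110 10 */
        return a->s6_addr[0] == 0xfe && (a->s6_addr[1] & 0xc0) == 0x80;
}

int main(void)
{
        struct in6_addr a;

        inet_pton(AF_INET6, "fe80::1", &a);
        printf("%d\n", ipv6_is_link_local(&a));         /* 1 */
        inet_pton(AF_INET6, "2001:db8::1", &a);
        printf("%d\n", ipv6_is_link_local(&a));         /* 0 */
        return 0;
}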
+ */ + read_lock_bh(&n->lock); + memcpy(ha, n->ha, ETH_ALEN); + nud_state = n->nud_state; + dead = n->dead; + read_unlock_bh(&n->lock); + + rtnl_lock(); + mlxsw_sp_span_respin(mlxsw_sp); + + entry_connected = nud_state & NUD_VALID && !dead; + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n); + if (!entry_connected && !neigh_entry) + goto out; + if (!neigh_entry) { + neigh_entry = mlxsw_sp_neigh_entry_create(mlxsw_sp, n); + if (IS_ERR(neigh_entry)) + goto out; + } + + memcpy(neigh_entry->ha, ha, ETH_ALEN); + mlxsw_sp_neigh_entry_update(mlxsw_sp, neigh_entry, entry_connected); + mlxsw_sp_nexthop_neigh_update(mlxsw_sp, neigh_entry, !entry_connected); + + if (!neigh_entry->connected && list_empty(&neigh_entry->nexthop_list)) + mlxsw_sp_neigh_entry_destroy(mlxsw_sp, neigh_entry); + +out: + rtnl_unlock(); + neigh_release(n); + kfree(net_work); +} + +static int mlxsw_sp_mp_hash_init(struct mlxsw_sp *mlxsw_sp); + +static void mlxsw_sp_router_mp_hash_event_work(struct work_struct *work) +{ + struct mlxsw_sp_netevent_work *net_work = + container_of(work, struct mlxsw_sp_netevent_work, work); + struct mlxsw_sp *mlxsw_sp = net_work->mlxsw_sp; + + mlxsw_sp_mp_hash_init(mlxsw_sp); + kfree(net_work); +} + +static int mlxsw_sp_router_netevent_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct mlxsw_sp_netevent_work *net_work; + struct mlxsw_sp_port *mlxsw_sp_port; + struct mlxsw_sp_router *router; + struct mlxsw_sp *mlxsw_sp; + unsigned long interval; + struct neigh_parms *p; + struct neighbour *n; + struct net *net; + + switch (event) { + case NETEVENT_DELAY_PROBE_TIME_UPDATE: + p = ptr; + + /* We don't care about changes in the default table. */ + if (!p->dev || (p->tbl->family != AF_INET && + p->tbl->family != AF_INET6)) + return NOTIFY_DONE; + + /* We are in atomic context and can't take RTNL mutex, + * so use RCU variant to walk the device chain. + */ + mlxsw_sp_port = mlxsw_sp_port_lower_dev_hold(p->dev); + if (!mlxsw_sp_port) + return NOTIFY_DONE; + + mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + interval = jiffies_to_msecs(NEIGH_VAR(p, DELAY_PROBE_TIME)); + mlxsw_sp->router->neighs_update.interval = interval; + + mlxsw_sp_port_dev_put(mlxsw_sp_port); + break; + case NETEVENT_NEIGH_UPDATE: + n = ptr; + + if (n->tbl->family != AF_INET && n->tbl->family != AF_INET6) + return NOTIFY_DONE; + + mlxsw_sp_port = mlxsw_sp_port_lower_dev_hold(n->dev); + if (!mlxsw_sp_port) + return NOTIFY_DONE; + + net_work = kzalloc(sizeof(*net_work), GFP_ATOMIC); + if (!net_work) { + mlxsw_sp_port_dev_put(mlxsw_sp_port); + return NOTIFY_BAD; + } + + INIT_WORK(&net_work->work, mlxsw_sp_router_neigh_event_work); + net_work->mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + net_work->n = n; + + /* Take a reference to ensure the neighbour won't be + * destructed until we drop the reference in delayed + * work. 
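The work item above programs or unprograms the neighbour from a snapshot of its state: it is treated as connected only while the NUD state is one of the valid states and the entry has not been marked dead. A sketch of that decision with illustrative state bits (not the kernel's NUD_* definitions):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum {
        NUD_SKETCH_REACHABLE = 1 << 0,
        NUD_SKETCH_STALE     = 1 << 1,
        NUD_SKETCH_FAILED    = 1 << 2,
};

#define NUD_SKETCH_VALID (NUD_SKETCH_REACHABLE | NUD_SKETCH_STALE)

static bool entry_connected(uint8_t nud_state, bool dead)
{
        return (nud_state & NUD_SKETCH_VALID) && !dead;
}

int main(void)
{
        printf("%d %d\n",
               entry_connected(NUD_SKETCH_REACHABLE, false),
               entry_connected(NUD_SKETCH_FAILED, false));      /* 1 0 */
        return 0;
}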
+ */ + neigh_clone(n); + mlxsw_core_schedule_work(&net_work->work); + mlxsw_sp_port_dev_put(mlxsw_sp_port); + break; + case NETEVENT_IPV4_MPATH_HASH_UPDATE: + case NETEVENT_IPV6_MPATH_HASH_UPDATE: + net = ptr; + + if (!net_eq(net, &init_net)) + return NOTIFY_DONE; + + net_work = kzalloc(sizeof(*net_work), GFP_ATOMIC); + if (!net_work) + return NOTIFY_BAD; + + router = container_of(nb, struct mlxsw_sp_router, netevent_nb); + INIT_WORK(&net_work->work, mlxsw_sp_router_mp_hash_event_work); + net_work->mlxsw_sp = router->mlxsw_sp; + mlxsw_core_schedule_work(&net_work->work); + break; + } + + return NOTIFY_DONE; +} + +static int mlxsw_sp_neigh_init(struct mlxsw_sp *mlxsw_sp) +{ + int err; + + err = rhashtable_init(&mlxsw_sp->router->neigh_ht, + &mlxsw_sp_neigh_ht_params); + if (err) + return err; + + /* Initialize the polling interval according to the default + * table. + */ + mlxsw_sp_router_neighs_update_interval_init(mlxsw_sp); + + /* Create the delayed works for the activity_update */ + INIT_DELAYED_WORK(&mlxsw_sp->router->neighs_update.dw, + mlxsw_sp_router_neighs_update_work); + INIT_DELAYED_WORK(&mlxsw_sp->router->nexthop_probe_dw, + mlxsw_sp_router_probe_unresolved_nexthops); + mlxsw_core_schedule_dw(&mlxsw_sp->router->neighs_update.dw, 0); + mlxsw_core_schedule_dw(&mlxsw_sp->router->nexthop_probe_dw, 0); + return 0; +} + +static void mlxsw_sp_neigh_fini(struct mlxsw_sp *mlxsw_sp) +{ + cancel_delayed_work_sync(&mlxsw_sp->router->neighs_update.dw); + cancel_delayed_work_sync(&mlxsw_sp->router->nexthop_probe_dw); + rhashtable_destroy(&mlxsw_sp->router->neigh_ht); +} + +static void mlxsw_sp_neigh_rif_gone_sync(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_neigh_entry *neigh_entry, *tmp; + + list_for_each_entry_safe(neigh_entry, tmp, &rif->neigh_list, + rif_list_node) { + mlxsw_sp_neigh_entry_update(mlxsw_sp, neigh_entry, false); + mlxsw_sp_neigh_entry_destroy(mlxsw_sp, neigh_entry); + } +} + +enum mlxsw_sp_nexthop_type { + MLXSW_SP_NEXTHOP_TYPE_ETH, + MLXSW_SP_NEXTHOP_TYPE_IPIP, +}; + +struct mlxsw_sp_nexthop_key { + struct fib_nh *fib_nh; +}; + +struct mlxsw_sp_nexthop { + struct list_head neigh_list_node; /* member of neigh entry list */ + struct list_head rif_list_node; + struct list_head router_list_node; + struct mlxsw_sp_nexthop_group *nh_grp; /* pointer back to the group + * this belongs to + */ + struct rhash_head ht_node; + struct mlxsw_sp_nexthop_key key; + unsigned char gw_addr[sizeof(struct in6_addr)]; + int ifindex; + int nh_weight; + int norm_nh_weight; + int num_adj_entries; + struct mlxsw_sp_rif *rif; + u8 should_offload:1, /* set indicates this neigh is connected and + * should be put to KVD linear area of this group. + */ + offloaded:1, /* set in case the neigh is actually put into + * KVD linear area of this group. 
+ */ + update:1; /* set indicates that MAC of this neigh should be + * updated in HW + */ + enum mlxsw_sp_nexthop_type type; + union { + struct mlxsw_sp_neigh_entry *neigh_entry; + struct mlxsw_sp_ipip_entry *ipip_entry; + }; + unsigned int counter_index; + bool counter_valid; +}; + +struct mlxsw_sp_nexthop_group { + void *priv; + struct rhash_head ht_node; + struct list_head fib_list; /* list of fib entries that use this group */ + struct neigh_table *neigh_tbl; + u8 adj_index_valid:1, + gateway:1; /* routes using the group use a gateway */ + u32 adj_index; + u16 ecmp_size; + u16 count; + int sum_norm_weight; + struct mlxsw_sp_nexthop nexthops[0]; +#define nh_rif nexthops[0].rif +}; + +void mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + struct devlink *devlink; + + devlink = priv_to_devlink(mlxsw_sp->core); + if (!devlink_dpipe_table_counter_enabled(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_ADJ)) + return; + + if (mlxsw_sp_flow_counter_alloc(mlxsw_sp, &nh->counter_index)) + return; + + nh->counter_valid = true; +} + +void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + if (!nh->counter_valid) + return; + mlxsw_sp_flow_counter_free(mlxsw_sp, nh->counter_index); + nh->counter_valid = false; +} + +int mlxsw_sp_nexthop_counter_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh, u64 *p_counter) +{ + if (!nh->counter_valid) + return -EINVAL; + + return mlxsw_sp_flow_counter_get(mlxsw_sp, nh->counter_index, + p_counter, NULL); +} + +struct mlxsw_sp_nexthop *mlxsw_sp_nexthop_next(struct mlxsw_sp_router *router, + struct mlxsw_sp_nexthop *nh) +{ + if (!nh) { + if (list_empty(&router->nexthop_list)) + return NULL; + else + return list_first_entry(&router->nexthop_list, + typeof(*nh), router_list_node); + } + if (list_is_last(&nh->router_list_node, &router->nexthop_list)) + return NULL; + return list_next_entry(nh, router_list_node); +} + +bool mlxsw_sp_nexthop_offload(struct mlxsw_sp_nexthop *nh) +{ + return nh->offloaded; +} + +unsigned char *mlxsw_sp_nexthop_ha(struct mlxsw_sp_nexthop *nh) +{ + if (!nh->offloaded) + return NULL; + return nh->neigh_entry->ha; +} + +int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index, + u32 *p_adj_size, u32 *p_adj_hash_index) +{ + struct mlxsw_sp_nexthop_group *nh_grp = nh->nh_grp; + u32 adj_hash_index = 0; + int i; + + if (!nh->offloaded || !nh_grp->adj_index_valid) + return -EINVAL; + + *p_adj_index = nh_grp->adj_index; + *p_adj_size = nh_grp->ecmp_size; + + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh_iter = &nh_grp->nexthops[i]; + + if (nh_iter == nh) + break; + if (nh_iter->offloaded) + adj_hash_index += nh_iter->num_adj_entries; + } + + *p_adj_hash_index = adj_hash_index; + return 0; +} + +struct mlxsw_sp_rif *mlxsw_sp_nexthop_rif(struct mlxsw_sp_nexthop *nh) +{ + return nh->rif; +} + +bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh) +{ + struct mlxsw_sp_nexthop_group *nh_grp = nh->nh_grp; + int i; + + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh_iter = &nh_grp->nexthops[i]; + + if (nh_iter->type == MLXSW_SP_NEXTHOP_TYPE_IPIP) + return true; + } + return false; +} + +static struct fib_info * +mlxsw_sp_nexthop4_group_fi(const struct mlxsw_sp_nexthop_group *nh_grp) +{ + return nh_grp->priv; +} + +struct mlxsw_sp_nexthop_group_cmp_arg { + enum mlxsw_sp_l3proto proto; + union { + struct fib_info *fi; + struct mlxsw_sp_fib6_entry *fib6_entry; + }; +}; + +static bool 
+mlxsw_sp_nexthop6_group_has_nexthop(const struct mlxsw_sp_nexthop_group *nh_grp, + const struct in6_addr *gw, int ifindex, + int weight) +{ + int i; + + for (i = 0; i < nh_grp->count; i++) { + const struct mlxsw_sp_nexthop *nh; + + nh = &nh_grp->nexthops[i]; + if (nh->ifindex == ifindex && nh->nh_weight == weight && + ipv6_addr_equal(gw, (struct in6_addr *) nh->gw_addr)) + return true; + } + + return false; +} + +static bool +mlxsw_sp_nexthop6_group_cmp(const struct mlxsw_sp_nexthop_group *nh_grp, + const struct mlxsw_sp_fib6_entry *fib6_entry) +{ + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + + if (nh_grp->count != fib6_entry->nrt6) + return false; + + list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { + struct in6_addr *gw; + int ifindex, weight; + + ifindex = mlxsw_sp_rt6->rt->dst.dev->ifindex; + weight = mlxsw_sp_rt6->rt->rt6i_nh_weight; + gw = &mlxsw_sp_rt6->rt->rt6i_gateway; + if (!mlxsw_sp_nexthop6_group_has_nexthop(nh_grp, gw, ifindex, + weight)) + return false; + } + + return true; +} + +static int +mlxsw_sp_nexthop_group_cmp(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct mlxsw_sp_nexthop_group_cmp_arg *cmp_arg = arg->key; + const struct mlxsw_sp_nexthop_group *nh_grp = ptr; + + switch (cmp_arg->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + return cmp_arg->fi != mlxsw_sp_nexthop4_group_fi(nh_grp); + case MLXSW_SP_L3_PROTO_IPV6: + return !mlxsw_sp_nexthop6_group_cmp(nh_grp, + cmp_arg->fib6_entry); + default: + WARN_ON(1); + return 1; + } +} + +static int +mlxsw_sp_nexthop_group_type(const struct mlxsw_sp_nexthop_group *nh_grp) +{ + return nh_grp->neigh_tbl->family; +} + +static u32 mlxsw_sp_nexthop_group_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct mlxsw_sp_nexthop_group *nh_grp = data; + const struct mlxsw_sp_nexthop *nh; + struct fib_info *fi; + unsigned int val; + int i; + + switch (mlxsw_sp_nexthop_group_type(nh_grp)) { + case AF_INET: + fi = mlxsw_sp_nexthop4_group_fi(nh_grp); + return jhash(&fi, sizeof(fi), seed); + case AF_INET6: + val = nh_grp->count; + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + val ^= nh->ifindex; + } + return jhash(&val, sizeof(val), seed); + default: + WARN_ON(1); + return 0; + } +} + +static u32 +mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed) +{ + unsigned int val = fib6_entry->nrt6; + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + struct net_device *dev; + + list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { + dev = mlxsw_sp_rt6->rt->dst.dev; + val ^= dev->ifindex; + } + + return jhash(&val, sizeof(val), seed); +} + +static u32 +mlxsw_sp_nexthop_group_hash(const void *data, u32 len, u32 seed) +{ + const struct mlxsw_sp_nexthop_group_cmp_arg *cmp_arg = data; + + switch (cmp_arg->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + return jhash(&cmp_arg->fi, sizeof(cmp_arg->fi), seed); + case MLXSW_SP_L3_PROTO_IPV6: + return mlxsw_sp_nexthop6_group_hash(cmp_arg->fib6_entry, seed); + default: + WARN_ON(1); + return 0; + } +} + +static const struct rhashtable_params mlxsw_sp_nexthop_group_ht_params = { + .head_offset = offsetof(struct mlxsw_sp_nexthop_group, ht_node), + .hashfn = mlxsw_sp_nexthop_group_hash, + .obj_hashfn = mlxsw_sp_nexthop_group_hash_obj, + .obj_cmpfn = mlxsw_sp_nexthop_group_cmp, +}; + +static int mlxsw_sp_nexthop_group_insert(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + if (mlxsw_sp_nexthop_group_type(nh_grp) == AF_INET6 && + !nh_grp->gateway) + return 0; + + return 
rhashtable_insert_fast(&mlxsw_sp->router->nexthop_group_ht, + &nh_grp->ht_node, + mlxsw_sp_nexthop_group_ht_params); +} + +static void mlxsw_sp_nexthop_group_remove(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + if (mlxsw_sp_nexthop_group_type(nh_grp) == AF_INET6 && + !nh_grp->gateway) + return; + + rhashtable_remove_fast(&mlxsw_sp->router->nexthop_group_ht, + &nh_grp->ht_node, + mlxsw_sp_nexthop_group_ht_params); +} + +static struct mlxsw_sp_nexthop_group * +mlxsw_sp_nexthop4_group_lookup(struct mlxsw_sp *mlxsw_sp, + struct fib_info *fi) +{ + struct mlxsw_sp_nexthop_group_cmp_arg cmp_arg; + + cmp_arg.proto = MLXSW_SP_L3_PROTO_IPV4; + cmp_arg.fi = fi; + return rhashtable_lookup_fast(&mlxsw_sp->router->nexthop_group_ht, + &cmp_arg, + mlxsw_sp_nexthop_group_ht_params); +} + +static struct mlxsw_sp_nexthop_group * +mlxsw_sp_nexthop6_group_lookup(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib6_entry *fib6_entry) +{ + struct mlxsw_sp_nexthop_group_cmp_arg cmp_arg; + + cmp_arg.proto = MLXSW_SP_L3_PROTO_IPV6; + cmp_arg.fib6_entry = fib6_entry; + return rhashtable_lookup_fast(&mlxsw_sp->router->nexthop_group_ht, + &cmp_arg, + mlxsw_sp_nexthop_group_ht_params); +} + +static const struct rhashtable_params mlxsw_sp_nexthop_ht_params = { + .key_offset = offsetof(struct mlxsw_sp_nexthop, key), + .head_offset = offsetof(struct mlxsw_sp_nexthop, ht_node), + .key_len = sizeof(struct mlxsw_sp_nexthop_key), +}; + +static int mlxsw_sp_nexthop_insert(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + return rhashtable_insert_fast(&mlxsw_sp->router->nexthop_ht, + &nh->ht_node, mlxsw_sp_nexthop_ht_params); +} + +static void mlxsw_sp_nexthop_remove(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + rhashtable_remove_fast(&mlxsw_sp->router->nexthop_ht, &nh->ht_node, + mlxsw_sp_nexthop_ht_params); +} + +static struct mlxsw_sp_nexthop * +mlxsw_sp_nexthop_lookup(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_key key) +{ + return rhashtable_lookup_fast(&mlxsw_sp->router->nexthop_ht, &key, + mlxsw_sp_nexthop_ht_params); +} + +static int mlxsw_sp_adj_index_mass_update_vr(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_fib *fib, + u32 adj_index, u16 ecmp_size, + u32 new_adj_index, + u16 new_ecmp_size) +{ + char raleu_pl[MLXSW_REG_RALEU_LEN]; + + mlxsw_reg_raleu_pack(raleu_pl, + (enum mlxsw_reg_ralxx_protocol) fib->proto, + fib->vr->id, adj_index, ecmp_size, new_adj_index, + new_ecmp_size); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raleu), raleu_pl); +} + +static int mlxsw_sp_adj_index_mass_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp, + u32 old_adj_index, u16 old_ecmp_size) +{ + struct mlxsw_sp_fib_entry *fib_entry; + struct mlxsw_sp_fib *fib = NULL; + int err; + + list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) { + if (fib == fib_entry->fib_node->fib) + continue; + fib = fib_entry->fib_node->fib; + err = mlxsw_sp_adj_index_mass_update_vr(mlxsw_sp, fib, + old_adj_index, + old_ecmp_size, + nh_grp->adj_index, + nh_grp->ecmp_size); + if (err) + return err; + } + return 0; +} + +static int __mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_nexthop *nh) +{ + struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry; + char ratr_pl[MLXSW_REG_RATR_LEN]; + + mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY, + true, MLXSW_REG_RATR_TYPE_ETHERNET, + adj_index, neigh_entry->rif); + mlxsw_reg_ratr_eth_entry_pack(ratr_pl, neigh_entry->ha); + if (nh->counter_valid) 
+ mlxsw_reg_ratr_counter_pack(ratr_pl, nh->counter_index, true); + else + mlxsw_reg_ratr_counter_pack(ratr_pl, 0, false); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl); +} + +int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_nexthop *nh) +{ + int i; + + for (i = 0; i < nh->num_adj_entries; i++) { + int err; + + err = __mlxsw_sp_nexthop_update(mlxsw_sp, adj_index + i, nh); + if (err) + return err; + } + + return 0; +} + +static int __mlxsw_sp_nexthop_ipip_update(struct mlxsw_sp *mlxsw_sp, + u32 adj_index, + struct mlxsw_sp_nexthop *nh) +{ + const struct mlxsw_sp_ipip_ops *ipip_ops; + + ipip_ops = mlxsw_sp->router->ipip_ops_arr[nh->ipip_entry->ipipt]; + return ipip_ops->nexthop_update(mlxsw_sp, adj_index, nh->ipip_entry); +} + +static int mlxsw_sp_nexthop_ipip_update(struct mlxsw_sp *mlxsw_sp, + u32 adj_index, + struct mlxsw_sp_nexthop *nh) +{ + int i; + + for (i = 0; i < nh->num_adj_entries; i++) { + int err; + + err = __mlxsw_sp_nexthop_ipip_update(mlxsw_sp, adj_index + i, + nh); + if (err) + return err; + } + + return 0; +} + +static int +mlxsw_sp_nexthop_group_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp, + bool reallocate) +{ + u32 adj_index = nh_grp->adj_index; /* base */ + struct mlxsw_sp_nexthop *nh; + int i; + int err; + + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + + if (!nh->should_offload) { + nh->offloaded = 0; + continue; + } + + if (nh->update || reallocate) { + switch (nh->type) { + case MLXSW_SP_NEXTHOP_TYPE_ETH: + err = mlxsw_sp_nexthop_update + (mlxsw_sp, adj_index, nh); + break; + case MLXSW_SP_NEXTHOP_TYPE_IPIP: + err = mlxsw_sp_nexthop_ipip_update + (mlxsw_sp, adj_index, nh); + break; + } + if (err) + return err; + nh->update = 0; + nh->offloaded = 1; + } + adj_index += nh->num_adj_entries; + } + return 0; +} + +static bool +mlxsw_sp_fib_node_entry_is_first(const struct mlxsw_sp_fib_node *fib_node, + const struct mlxsw_sp_fib_entry *fib_entry); + +static int +mlxsw_sp_nexthop_fib_entries_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + struct mlxsw_sp_fib_entry *fib_entry; + int err; + + list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) { + if (!mlxsw_sp_fib_node_entry_is_first(fib_entry->fib_node, + fib_entry)) + continue; + err = mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry); + if (err) + return err; + } + return 0; +} + +static void +mlxsw_sp_fib_entry_offload_refresh(struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op, int err); + +static void +mlxsw_sp_nexthop_fib_entries_refresh(struct mlxsw_sp_nexthop_group *nh_grp) +{ + enum mlxsw_reg_ralue_op op = MLXSW_REG_RALUE_OP_WRITE_WRITE; + struct mlxsw_sp_fib_entry *fib_entry; + + list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) { + if (!mlxsw_sp_fib_node_entry_is_first(fib_entry->fib_node, + fib_entry)) + continue; + mlxsw_sp_fib_entry_offload_refresh(fib_entry, op, 0); + } +} + +static void mlxsw_sp_adj_grp_size_round_up(u16 *p_adj_grp_size) +{ + /* Valid sizes for an adjacency group are: + * 1-64, 512, 1024, 2048 and 4096. 
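+ * For example, a request for 70 entries is rounded up to 512, while a
+ * request for up to 64 entries is left unchanged.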
+ */ + if (*p_adj_grp_size <= 64) + return; + else if (*p_adj_grp_size <= 512) + *p_adj_grp_size = 512; + else if (*p_adj_grp_size <= 1024) + *p_adj_grp_size = 1024; + else if (*p_adj_grp_size <= 2048) + *p_adj_grp_size = 2048; + else + *p_adj_grp_size = 4096; +} + +static void mlxsw_sp_adj_grp_size_round_down(u16 *p_adj_grp_size, + unsigned int alloc_size) +{ + if (alloc_size >= 4096) + *p_adj_grp_size = 4096; + else if (alloc_size >= 2048) + *p_adj_grp_size = 2048; + else if (alloc_size >= 1024) + *p_adj_grp_size = 1024; + else if (alloc_size >= 512) + *p_adj_grp_size = 512; +} + +static int mlxsw_sp_fix_adj_grp_size(struct mlxsw_sp *mlxsw_sp, + u16 *p_adj_grp_size) +{ + unsigned int alloc_size; + int err; + + /* Round up the requested group size to the next size supported + * by the device and make sure the request can be satisfied. + */ + mlxsw_sp_adj_grp_size_round_up(p_adj_grp_size); + err = mlxsw_sp_kvdl_alloc_size_query(mlxsw_sp, *p_adj_grp_size, + &alloc_size); + if (err) + return err; + /* It is possible the allocation results in more allocated + * entries than requested. Try to use as much of them as + * possible. + */ + mlxsw_sp_adj_grp_size_round_down(p_adj_grp_size, alloc_size); + + return 0; +} + +static void +mlxsw_sp_nexthop_group_normalize(struct mlxsw_sp_nexthop_group *nh_grp) +{ + int i, g = 0, sum_norm_weight = 0; + struct mlxsw_sp_nexthop *nh; + + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + + if (!nh->should_offload) + continue; + if (g > 0) + g = gcd(nh->nh_weight, g); + else + g = nh->nh_weight; + } + + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + + if (!nh->should_offload) + continue; + nh->norm_nh_weight = nh->nh_weight / g; + sum_norm_weight += nh->norm_nh_weight; + } + + nh_grp->sum_norm_weight = sum_norm_weight; +} + +static void +mlxsw_sp_nexthop_group_rebalance(struct mlxsw_sp_nexthop_group *nh_grp) +{ + int total = nh_grp->sum_norm_weight; + u16 ecmp_size = nh_grp->ecmp_size; + int i, weight = 0, lower_bound = 0; + + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; + int upper_bound; + + if (!nh->should_offload) + continue; + weight += nh->norm_nh_weight; + upper_bound = DIV_ROUND_CLOSEST(ecmp_size * weight, total); + nh->num_adj_entries = upper_bound - lower_bound; + lower_bound = upper_bound; + } +} + +static void +mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + u16 ecmp_size, old_ecmp_size; + struct mlxsw_sp_nexthop *nh; + bool offload_change = false; + u32 adj_index; + bool old_adj_index_valid; + u32 old_adj_index; + int i; + int err; + + if (!nh_grp->gateway) { + mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp); + return; + } + + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + + if (nh->should_offload != nh->offloaded) { + offload_change = true; + if (nh->should_offload) + nh->update = 1; + } + } + if (!offload_change) { + /* Nothing was added or removed, so no need to reallocate. Just + * update MAC on existing adjacency indexes. + */ + err = mlxsw_sp_nexthop_group_update(mlxsw_sp, nh_grp, false); + if (err) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n"); + goto set_trap; + } + return; + } + mlxsw_sp_nexthop_group_normalize(nh_grp); + if (!nh_grp->sum_norm_weight) + /* No neigh of this group is connected so we just set + * the trap and let everthing flow through kernel. 
+ */ + goto set_trap; + + ecmp_size = nh_grp->sum_norm_weight; + err = mlxsw_sp_fix_adj_grp_size(mlxsw_sp, &ecmp_size); + if (err) + /* No valid allocation size available. */ + goto set_trap; + + err = mlxsw_sp_kvdl_alloc(mlxsw_sp, ecmp_size, &adj_index); + if (err) { + /* We ran out of KVD linear space, just set the + * trap and let everything flow through kernel. + */ + dev_warn(mlxsw_sp->bus_info->dev, "Failed to allocate KVD linear area for nexthop group.\n"); + goto set_trap; + } + old_adj_index_valid = nh_grp->adj_index_valid; + old_adj_index = nh_grp->adj_index; + old_ecmp_size = nh_grp->ecmp_size; + nh_grp->adj_index_valid = 1; + nh_grp->adj_index = adj_index; + nh_grp->ecmp_size = ecmp_size; + mlxsw_sp_nexthop_group_rebalance(nh_grp); + err = mlxsw_sp_nexthop_group_update(mlxsw_sp, nh_grp, true); + if (err) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n"); + goto set_trap; + } + + if (!old_adj_index_valid) { + /* The trap was set for fib entries, so we have to call + * fib entry update to unset it and use adjacency index. + */ + err = mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp); + if (err) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to add adjacency index to fib entries.\n"); + goto set_trap; + } + return; + } + + err = mlxsw_sp_adj_index_mass_update(mlxsw_sp, nh_grp, + old_adj_index, old_ecmp_size); + mlxsw_sp_kvdl_free(mlxsw_sp, old_adj_index); + if (err) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to mass-update adjacency index for nexthop group.\n"); + goto set_trap; + } + + /* Offload state within the group changed, so update the flags. */ + mlxsw_sp_nexthop_fib_entries_refresh(nh_grp); + + return; + +set_trap: + old_adj_index_valid = nh_grp->adj_index_valid; + nh_grp->adj_index_valid = 0; + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + nh->offloaded = 0; + } + err = mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp); + if (err) + dev_warn(mlxsw_sp->bus_info->dev, "Failed to set traps for fib entries.\n"); + if (old_adj_index_valid) + mlxsw_sp_kvdl_free(mlxsw_sp, nh_grp->adj_index); +} + +static void __mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp_nexthop *nh, + bool removing) +{ + if (!removing) + nh->should_offload = 1; + else + nh->should_offload = 0; + nh->update = 1; +} + +static void +mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + bool removing) +{ + struct mlxsw_sp_nexthop *nh; + + list_for_each_entry(nh, &neigh_entry->nexthop_list, + neigh_list_node) { + __mlxsw_sp_nexthop_neigh_update(nh, removing); + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp); + } +} + +static void mlxsw_sp_nexthop_rif_init(struct mlxsw_sp_nexthop *nh, + struct mlxsw_sp_rif *rif) +{ + if (nh->rif) + return; + + nh->rif = rif; + list_add(&nh->rif_list_node, &rif->nexthop_list); +} + +static void mlxsw_sp_nexthop_rif_fini(struct mlxsw_sp_nexthop *nh) +{ + if (!nh->rif) + return; + + list_del(&nh->rif_list_node); + nh->rif = NULL; +} + +static int mlxsw_sp_nexthop_neigh_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + struct mlxsw_sp_neigh_entry *neigh_entry; + struct neighbour *n; + u8 nud_state, dead; + int err; + + if (!nh->nh_grp->gateway || nh->neigh_entry) + return 0; + + /* Take a reference of neigh here ensuring that neigh would + * not be destructed before the nexthop entry is finished. + * The reference is taken either in neigh_lookup() or + * in neigh_create() in case n is not found. 
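+ * It is dropped in mlxsw_sp_nexthop_neigh_fini() or, if initialization
+ * fails, by neigh_release() in the error path below.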
+ */ + n = neigh_lookup(nh->nh_grp->neigh_tbl, &nh->gw_addr, nh->rif->dev); + if (!n) { + n = neigh_create(nh->nh_grp->neigh_tbl, &nh->gw_addr, + nh->rif->dev); + if (IS_ERR(n)) + return PTR_ERR(n); + neigh_event_send(n, NULL); + } + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n); + if (!neigh_entry) { + neigh_entry = mlxsw_sp_neigh_entry_create(mlxsw_sp, n); + if (IS_ERR(neigh_entry)) { + err = -EINVAL; + goto err_neigh_entry_create; + } + } + + /* If that is the first nexthop connected to that neigh, add to + * nexthop_neighs_list + */ + if (list_empty(&neigh_entry->nexthop_list)) + list_add_tail(&neigh_entry->nexthop_neighs_list_node, + &mlxsw_sp->router->nexthop_neighs_list); + + nh->neigh_entry = neigh_entry; + list_add_tail(&nh->neigh_list_node, &neigh_entry->nexthop_list); + read_lock_bh(&n->lock); + nud_state = n->nud_state; + dead = n->dead; + read_unlock_bh(&n->lock); + __mlxsw_sp_nexthop_neigh_update(nh, !(nud_state & NUD_VALID && !dead)); + + return 0; + +err_neigh_entry_create: + neigh_release(n); + return err; +} + +static void mlxsw_sp_nexthop_neigh_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry; + struct neighbour *n; + + if (!neigh_entry) + return; + n = neigh_entry->key.n; + + __mlxsw_sp_nexthop_neigh_update(nh, true); + list_del(&nh->neigh_list_node); + nh->neigh_entry = NULL; + + /* If that is the last nexthop connected to that neigh, remove from + * nexthop_neighs_list + */ + if (list_empty(&neigh_entry->nexthop_list)) + list_del(&neigh_entry->nexthop_neighs_list_node); + + if (!neigh_entry->connected && list_empty(&neigh_entry->nexthop_list)) + mlxsw_sp_neigh_entry_destroy(mlxsw_sp, neigh_entry); + + neigh_release(n); +} + +static bool mlxsw_sp_ipip_netdev_ul_up(struct net_device *ol_dev) +{ + struct net_device *ul_dev = __mlxsw_sp_ipip_netdev_ul_dev_get(ol_dev); + + return ul_dev ? 
(ul_dev->flags & IFF_UP) : true; +} + +static void mlxsw_sp_nexthop_ipip_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh, + struct mlxsw_sp_ipip_entry *ipip_entry) +{ + bool removing; + + if (!nh->nh_grp->gateway || nh->ipip_entry) + return; + + nh->ipip_entry = ipip_entry; + removing = !mlxsw_sp_ipip_netdev_ul_up(ipip_entry->ol_dev); + __mlxsw_sp_nexthop_neigh_update(nh, removing); + mlxsw_sp_nexthop_rif_init(nh, &ipip_entry->ol_lb->common); +} + +static void mlxsw_sp_nexthop_ipip_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + struct mlxsw_sp_ipip_entry *ipip_entry = nh->ipip_entry; + + if (!ipip_entry) + return; + + __mlxsw_sp_nexthop_neigh_update(nh, true); + nh->ipip_entry = NULL; +} + +static bool mlxsw_sp_nexthop4_ipip_type(const struct mlxsw_sp *mlxsw_sp, + const struct fib_nh *fib_nh, + enum mlxsw_sp_ipip_type *p_ipipt) +{ + struct net_device *dev = fib_nh->nh_dev; + + return dev && + fib_nh->nh_parent->fib_type == RTN_UNICAST && + mlxsw_sp_netdev_ipip_type(mlxsw_sp, dev, p_ipipt); +} + +static void mlxsw_sp_nexthop_type_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + switch (nh->type) { + case MLXSW_SP_NEXTHOP_TYPE_ETH: + mlxsw_sp_nexthop_neigh_fini(mlxsw_sp, nh); + mlxsw_sp_nexthop_rif_fini(nh); + break; + case MLXSW_SP_NEXTHOP_TYPE_IPIP: + mlxsw_sp_nexthop_rif_fini(nh); + mlxsw_sp_nexthop_ipip_fini(mlxsw_sp, nh); + break; + } +} + +static int mlxsw_sp_nexthop4_type_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh, + struct fib_nh *fib_nh) +{ + const struct mlxsw_sp_ipip_ops *ipip_ops; + struct net_device *dev = fib_nh->nh_dev; + struct mlxsw_sp_ipip_entry *ipip_entry; + struct mlxsw_sp_rif *rif; + int err; + + ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, dev); + if (ipip_entry) { + ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt]; + if (ipip_ops->can_offload(mlxsw_sp, dev, + MLXSW_SP_L3_PROTO_IPV4)) { + nh->type = MLXSW_SP_NEXTHOP_TYPE_IPIP; + mlxsw_sp_nexthop_ipip_init(mlxsw_sp, nh, ipip_entry); + return 0; + } + } + + nh->type = MLXSW_SP_NEXTHOP_TYPE_ETH; + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); + if (!rif) + return 0; + + mlxsw_sp_nexthop_rif_init(nh, rif); + err = mlxsw_sp_nexthop_neigh_init(mlxsw_sp, nh); + if (err) + goto err_neigh_init; + + return 0; + +err_neigh_init: + mlxsw_sp_nexthop_rif_fini(nh); + return err; +} + +static void mlxsw_sp_nexthop4_type_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + mlxsw_sp_nexthop_type_fini(mlxsw_sp, nh); +} + +static int mlxsw_sp_nexthop4_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp, + struct mlxsw_sp_nexthop *nh, + struct fib_nh *fib_nh) +{ + struct net_device *dev = fib_nh->nh_dev; + struct in_device *in_dev; + int err; + + nh->nh_grp = nh_grp; + nh->key.fib_nh = fib_nh; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight = fib_nh->nh_weight; +#else + nh->nh_weight = 1; +#endif + memcpy(&nh->gw_addr, &fib_nh->nh_gw, sizeof(fib_nh->nh_gw)); + err = mlxsw_sp_nexthop_insert(mlxsw_sp, nh); + if (err) + return err; + + mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh); + list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list); + + if (!dev) + return 0; + + in_dev = __in_dev_get_rtnl(dev); + if (in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + fib_nh->nh_flags & RTNH_F_LINKDOWN) + return 0; + + err = mlxsw_sp_nexthop4_type_init(mlxsw_sp, nh, fib_nh); + if (err) + goto err_nexthop_neigh_init; + + return 0; + +err_nexthop_neigh_init: + mlxsw_sp_nexthop_remove(mlxsw_sp, nh); + 
return err; +} + +static void mlxsw_sp_nexthop4_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + mlxsw_sp_nexthop4_type_fini(mlxsw_sp, nh); + list_del(&nh->router_list_node); + mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh); + mlxsw_sp_nexthop_remove(mlxsw_sp, nh); +} + +static void mlxsw_sp_nexthop4_event(struct mlxsw_sp *mlxsw_sp, + unsigned long event, struct fib_nh *fib_nh) +{ + struct mlxsw_sp_nexthop_key key; + struct mlxsw_sp_nexthop *nh; + + if (mlxsw_sp->router->aborted) + return; + + key.fib_nh = fib_nh; + nh = mlxsw_sp_nexthop_lookup(mlxsw_sp, key); + if (WARN_ON_ONCE(!nh)) + return; + + switch (event) { + case FIB_EVENT_NH_ADD: + mlxsw_sp_nexthop4_type_init(mlxsw_sp, nh, fib_nh); + break; + case FIB_EVENT_NH_DEL: + mlxsw_sp_nexthop4_type_fini(mlxsw_sp, nh); + break; + } + + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp); +} + +static void mlxsw_sp_nexthop_rif_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_nexthop *nh; + bool removing; + + list_for_each_entry(nh, &rif->nexthop_list, rif_list_node) { + switch (nh->type) { + case MLXSW_SP_NEXTHOP_TYPE_ETH: + removing = false; + break; + case MLXSW_SP_NEXTHOP_TYPE_IPIP: + removing = !mlxsw_sp_ipip_netdev_ul_up(rif->dev); + break; + default: + WARN_ON(1); + continue; + } + + __mlxsw_sp_nexthop_neigh_update(nh, removing); + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp); + } +} + +static void mlxsw_sp_nexthop_rif_migrate(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *old_rif, + struct mlxsw_sp_rif *new_rif) +{ + struct mlxsw_sp_nexthop *nh; + + list_splice_init(&old_rif->nexthop_list, &new_rif->nexthop_list); + list_for_each_entry(nh, &new_rif->nexthop_list, rif_list_node) + nh->rif = new_rif; + mlxsw_sp_nexthop_rif_update(mlxsw_sp, new_rif); +} + +static void mlxsw_sp_nexthop_rif_gone_sync(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_nexthop *nh, *tmp; + + list_for_each_entry_safe(nh, tmp, &rif->nexthop_list, rif_list_node) { + mlxsw_sp_nexthop_type_fini(mlxsw_sp, nh); + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp); + } +} + +static bool mlxsw_sp_fi_is_gateway(const struct mlxsw_sp *mlxsw_sp, + const struct fib_info *fi) +{ + return fi->fib_nh->nh_scope == RT_SCOPE_LINK || + mlxsw_sp_nexthop4_ipip_type(mlxsw_sp, fi->fib_nh, NULL); +} + +static struct mlxsw_sp_nexthop_group * +mlxsw_sp_nexthop4_group_create(struct mlxsw_sp *mlxsw_sp, struct fib_info *fi) +{ + struct mlxsw_sp_nexthop_group *nh_grp; + struct mlxsw_sp_nexthop *nh; + struct fib_nh *fib_nh; + size_t alloc_size; + int i; + int err; + + alloc_size = sizeof(*nh_grp) + + fi->fib_nhs * sizeof(struct mlxsw_sp_nexthop); + nh_grp = kzalloc(alloc_size, GFP_KERNEL); + if (!nh_grp) + return ERR_PTR(-ENOMEM); + nh_grp->priv = fi; + INIT_LIST_HEAD(&nh_grp->fib_list); + nh_grp->neigh_tbl = &arp_tbl; + + nh_grp->gateway = mlxsw_sp_fi_is_gateway(mlxsw_sp, fi); + nh_grp->count = fi->fib_nhs; + fib_info_hold(fi); + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + fib_nh = &fi->fib_nh[i]; + err = mlxsw_sp_nexthop4_init(mlxsw_sp, nh_grp, nh, fib_nh); + if (err) + goto err_nexthop4_init; + } + err = mlxsw_sp_nexthop_group_insert(mlxsw_sp, nh_grp); + if (err) + goto err_nexthop_group_insert; + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp); + return nh_grp; + +err_nexthop_group_insert: +err_nexthop4_init: + for (i--; i >= 0; i--) { + nh = &nh_grp->nexthops[i]; + mlxsw_sp_nexthop4_fini(mlxsw_sp, nh); + } + fib_info_put(fi); + kfree(nh_grp); + return ERR_PTR(err); +} + 
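+/* A worked example of the weight handling that the above call to
+ * mlxsw_sp_nexthop_group_refresh() performs, assuming a group with two
+ * resolved (offloadable) nexthops of configured weights 2 and 4:
+ * mlxsw_sp_nexthop_group_normalize() divides by gcd(2, 4) = 2, giving
+ * normalized weights 1 and 2 and sum_norm_weight = 3. ecmp_size starts
+ * at 3 and, being <= 64, is kept as-is by mlxsw_sp_fix_adj_grp_size()
+ * (assuming the KVD allocator satisfies a 3-entry request exactly).
+ * mlxsw_sp_nexthop_group_rebalance() then hands out adjacency entries
+ * from a running upper bound: DIV_ROUND_CLOSEST(3 * 1, 3) = 1 entry for
+ * the first nexthop and DIV_ROUND_CLOSEST(3 * 3, 3) - 1 = 2 entries for
+ * the second, so traffic is hashed across the group in the configured
+ * 1:2 (i.e. 2:4) ratio.
+ */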
+static void +mlxsw_sp_nexthop4_group_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + struct mlxsw_sp_nexthop *nh; + int i; + + mlxsw_sp_nexthop_group_remove(mlxsw_sp, nh_grp); + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + mlxsw_sp_nexthop4_fini(mlxsw_sp, nh); + } + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp); + WARN_ON_ONCE(nh_grp->adj_index_valid); + fib_info_put(mlxsw_sp_nexthop4_group_fi(nh_grp)); + kfree(nh_grp); +} + +static int mlxsw_sp_nexthop4_group_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + struct fib_info *fi) +{ + struct mlxsw_sp_nexthop_group *nh_grp; + + nh_grp = mlxsw_sp_nexthop4_group_lookup(mlxsw_sp, fi); + if (!nh_grp) { + nh_grp = mlxsw_sp_nexthop4_group_create(mlxsw_sp, fi); + if (IS_ERR(nh_grp)) + return PTR_ERR(nh_grp); + } + list_add_tail(&fib_entry->nexthop_group_node, &nh_grp->fib_list); + fib_entry->nh_group = nh_grp; + return 0; +} + +static void mlxsw_sp_nexthop4_group_put(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; + + list_del(&fib_entry->nexthop_group_node); + if (!list_empty(&nh_grp->fib_list)) + return; + mlxsw_sp_nexthop4_group_destroy(mlxsw_sp, nh_grp); +} + +static bool +mlxsw_sp_fib4_entry_should_offload(const struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_fib4_entry *fib4_entry; + + fib4_entry = container_of(fib_entry, struct mlxsw_sp_fib4_entry, + common); + return !fib4_entry->tos; +} + +static bool +mlxsw_sp_fib_entry_should_offload(const struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_nexthop_group *nh_group = fib_entry->nh_group; + + switch (fib_entry->fib_node->fib->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + if (!mlxsw_sp_fib4_entry_should_offload(fib_entry)) + return false; + break; + case MLXSW_SP_L3_PROTO_IPV6: + break; + } + + switch (fib_entry->type) { + case MLXSW_SP_FIB_ENTRY_TYPE_REMOTE: + return !!nh_group->adj_index_valid; + case MLXSW_SP_FIB_ENTRY_TYPE_LOCAL: + return !!nh_group->nh_rif; + case MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP: + return true; + default: + return false; + } +} + +static struct mlxsw_sp_nexthop * +mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp, + const struct mlxsw_sp_rt6 *mlxsw_sp_rt6) +{ + int i; + + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; + struct rt6_info *rt = mlxsw_sp_rt6->rt; + + if (nh->rif && nh->rif->dev == rt->dst.dev && + ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr, + &rt->rt6i_gateway)) + return nh; + continue; + } + + return NULL; +} + +static void +mlxsw_sp_fib4_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; + int i; + + if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL || + fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP) { + nh_grp->nexthops->key.fib_nh->nh_flags |= RTNH_F_OFFLOAD; + return; + } + + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; + + if (nh->offloaded) + nh->key.fib_nh->nh_flags |= RTNH_F_OFFLOAD; + else + nh->key.fib_nh->nh_flags &= ~RTNH_F_OFFLOAD; + } +} + +static void +mlxsw_sp_fib4_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; + int i; + + if (!list_is_singular(&nh_grp->fib_list)) + return; + + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; + + 
nh->key.fib_nh->nh_flags &= ~RTNH_F_OFFLOAD; + } +} + +static void +mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_fib6_entry *fib6_entry; + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + + fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry, + common); + + if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) { + list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, + list)->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; + return; + } + + list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { + struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; + struct mlxsw_sp_nexthop *nh; + + nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6); + if (nh && nh->offloaded) + mlxsw_sp_rt6->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; + else + mlxsw_sp_rt6->rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; + } +} + +static void +mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_fib6_entry *fib6_entry; + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + + fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry, + common); + list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { + struct rt6_info *rt = mlxsw_sp_rt6->rt; + + rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; + } +} + +static void mlxsw_sp_fib_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) +{ + switch (fib_entry->fib_node->fib->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + mlxsw_sp_fib4_entry_offload_set(fib_entry); + break; + case MLXSW_SP_L3_PROTO_IPV6: + mlxsw_sp_fib6_entry_offload_set(fib_entry); + break; + } +} + +static void +mlxsw_sp_fib_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) +{ + switch (fib_entry->fib_node->fib->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + mlxsw_sp_fib4_entry_offload_unset(fib_entry); + break; + case MLXSW_SP_L3_PROTO_IPV6: + mlxsw_sp_fib6_entry_offload_unset(fib_entry); + break; + } +} + +static void +mlxsw_sp_fib_entry_offload_refresh(struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op, int err) +{ + switch (op) { + case MLXSW_REG_RALUE_OP_WRITE_DELETE: + return mlxsw_sp_fib_entry_offload_unset(fib_entry); + case MLXSW_REG_RALUE_OP_WRITE_WRITE: + if (err) + return; + if (mlxsw_sp_fib_entry_should_offload(fib_entry)) + mlxsw_sp_fib_entry_offload_set(fib_entry); + else + mlxsw_sp_fib_entry_offload_unset(fib_entry); + return; + default: + return; + } +} + +static void +mlxsw_sp_fib_entry_ralue_pack(char *ralue_pl, + const struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op) +{ + struct mlxsw_sp_fib *fib = fib_entry->fib_node->fib; + enum mlxsw_reg_ralxx_protocol proto; + u32 *p_dip; + + proto = (enum mlxsw_reg_ralxx_protocol) fib->proto; + + switch (fib->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + p_dip = (u32 *) fib_entry->fib_node->key.addr; + mlxsw_reg_ralue_pack4(ralue_pl, proto, op, fib->vr->id, + fib_entry->fib_node->key.prefix_len, + *p_dip); + break; + case MLXSW_SP_L3_PROTO_IPV6: + mlxsw_reg_ralue_pack6(ralue_pl, proto, op, fib->vr->id, + fib_entry->fib_node->key.prefix_len, + fib_entry->fib_node->key.addr); + break; + } +} + +static int mlxsw_sp_fib_entry_op_remote(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op) +{ + char ralue_pl[MLXSW_REG_RALUE_LEN]; + enum mlxsw_reg_ralue_trap_action trap_action; + u16 trap_id = 0; + u32 adjacency_index = 0; + u16 ecmp_size = 0; + + /* In case the nexthop group adjacency index is valid, use it + * with provided ECMP size. Otherwise, setup trap and pass + * traffic to kernel. 
+ */ + if (mlxsw_sp_fib_entry_should_offload(fib_entry)) { + trap_action = MLXSW_REG_RALUE_TRAP_ACTION_NOP; + adjacency_index = fib_entry->nh_group->adj_index; + ecmp_size = fib_entry->nh_group->ecmp_size; + } else { + trap_action = MLXSW_REG_RALUE_TRAP_ACTION_TRAP; + trap_id = MLXSW_TRAP_ID_RTR_INGRESS0; + } + + mlxsw_sp_fib_entry_ralue_pack(ralue_pl, fib_entry, op); + mlxsw_reg_ralue_act_remote_pack(ralue_pl, trap_action, trap_id, + adjacency_index, ecmp_size); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl); +} + +static int mlxsw_sp_fib_entry_op_local(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op) +{ + struct mlxsw_sp_rif *rif = fib_entry->nh_group->nh_rif; + enum mlxsw_reg_ralue_trap_action trap_action; + char ralue_pl[MLXSW_REG_RALUE_LEN]; + u16 trap_id = 0; + u16 rif_index = 0; + + if (mlxsw_sp_fib_entry_should_offload(fib_entry)) { + trap_action = MLXSW_REG_RALUE_TRAP_ACTION_NOP; + rif_index = rif->rif_index; + } else { + trap_action = MLXSW_REG_RALUE_TRAP_ACTION_TRAP; + trap_id = MLXSW_TRAP_ID_RTR_INGRESS0; + } + + mlxsw_sp_fib_entry_ralue_pack(ralue_pl, fib_entry, op); + mlxsw_reg_ralue_act_local_pack(ralue_pl, trap_action, trap_id, + rif_index); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl); +} + +static int mlxsw_sp_fib_entry_op_trap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op) +{ + char ralue_pl[MLXSW_REG_RALUE_LEN]; + + mlxsw_sp_fib_entry_ralue_pack(ralue_pl, fib_entry, op); + mlxsw_reg_ralue_act_ip2me_pack(ralue_pl); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl); +} + +static int +mlxsw_sp_fib_entry_op_ipip_decap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op) +{ + struct mlxsw_sp_ipip_entry *ipip_entry = fib_entry->decap.ipip_entry; + const struct mlxsw_sp_ipip_ops *ipip_ops; + + if (WARN_ON(!ipip_entry)) + return -EINVAL; + + ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt]; + return ipip_ops->fib_entry_op(mlxsw_sp, ipip_entry, op, + fib_entry->decap.tunnel_index); +} + +static int __mlxsw_sp_fib_entry_op(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op) +{ + switch (fib_entry->type) { + case MLXSW_SP_FIB_ENTRY_TYPE_REMOTE: + return mlxsw_sp_fib_entry_op_remote(mlxsw_sp, fib_entry, op); + case MLXSW_SP_FIB_ENTRY_TYPE_LOCAL: + return mlxsw_sp_fib_entry_op_local(mlxsw_sp, fib_entry, op); + case MLXSW_SP_FIB_ENTRY_TYPE_TRAP: + return mlxsw_sp_fib_entry_op_trap(mlxsw_sp, fib_entry, op); + case MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP: + return mlxsw_sp_fib_entry_op_ipip_decap(mlxsw_sp, + fib_entry, op); + } + return -EINVAL; +} + +static int mlxsw_sp_fib_entry_op(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op) +{ + int err = __mlxsw_sp_fib_entry_op(mlxsw_sp, fib_entry, op); + + mlxsw_sp_fib_entry_offload_refresh(fib_entry, op, err); + + return err; +} + +static int mlxsw_sp_fib_entry_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) +{ + return mlxsw_sp_fib_entry_op(mlxsw_sp, fib_entry, + MLXSW_REG_RALUE_OP_WRITE_WRITE); +} + +static int mlxsw_sp_fib_entry_del(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) +{ + return mlxsw_sp_fib_entry_op(mlxsw_sp, fib_entry, + MLXSW_REG_RALUE_OP_WRITE_DELETE); +} + +static int +mlxsw_sp_fib4_entry_type_set(struct mlxsw_sp *mlxsw_sp, + const struct fib_entry_notifier_info *fen_info, + 
struct mlxsw_sp_fib_entry *fib_entry) +{ + union mlxsw_sp_l3addr dip = { .addr4 = htonl(fen_info->dst) }; + struct net_device *dev = fen_info->fi->fib_dev; + struct mlxsw_sp_ipip_entry *ipip_entry; + struct fib_info *fi = fen_info->fi; + + switch (fen_info->type) { + case RTN_LOCAL: + ipip_entry = mlxsw_sp_ipip_entry_find_by_decap(mlxsw_sp, dev, + MLXSW_SP_L3_PROTO_IPV4, dip); + if (ipip_entry && ipip_entry->ol_dev->flags & IFF_UP) { + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP; + return mlxsw_sp_fib_entry_decap_init(mlxsw_sp, + fib_entry, + ipip_entry); + } + /* fall through */ + case RTN_BROADCAST: + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP; + return 0; + case RTN_UNREACHABLE: /* fall through */ + case RTN_BLACKHOLE: /* fall through */ + case RTN_PROHIBIT: + /* Packets hitting these routes need to be trapped, but + * can do so with a lower priority than packets directed + * at the host, so use action type local instead of trap. + */ + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; + return 0; + case RTN_UNICAST: + if (mlxsw_sp_fi_is_gateway(mlxsw_sp, fi)) + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE; + else + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; + return 0; + default: + return -EINVAL; + } +} + +static struct mlxsw_sp_fib4_entry * +mlxsw_sp_fib4_entry_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_node *fib_node, + const struct fib_entry_notifier_info *fen_info) +{ + struct mlxsw_sp_fib4_entry *fib4_entry; + struct mlxsw_sp_fib_entry *fib_entry; + int err; + + fib4_entry = kzalloc(sizeof(*fib4_entry), GFP_KERNEL); + if (!fib4_entry) + return ERR_PTR(-ENOMEM); + fib_entry = &fib4_entry->common; + + err = mlxsw_sp_fib4_entry_type_set(mlxsw_sp, fen_info, fib_entry); + if (err) + goto err_fib4_entry_type_set; + + err = mlxsw_sp_nexthop4_group_get(mlxsw_sp, fib_entry, fen_info->fi); + if (err) + goto err_nexthop4_group_get; + + fib4_entry->prio = fen_info->fi->fib_priority; + fib4_entry->tb_id = fen_info->tb_id; + fib4_entry->type = fen_info->type; + fib4_entry->tos = fen_info->tos; + + fib_entry->fib_node = fib_node; + + return fib4_entry; + +err_nexthop4_group_get: +err_fib4_entry_type_set: + kfree(fib4_entry); + return ERR_PTR(err); +} + +static void mlxsw_sp_fib4_entry_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib4_entry *fib4_entry) +{ + mlxsw_sp_nexthop4_group_put(mlxsw_sp, &fib4_entry->common); + kfree(fib4_entry); +} + +static struct mlxsw_sp_fib4_entry * +mlxsw_sp_fib4_entry_lookup(struct mlxsw_sp *mlxsw_sp, + const struct fib_entry_notifier_info *fen_info) +{ + struct mlxsw_sp_fib4_entry *fib4_entry; + struct mlxsw_sp_fib_node *fib_node; + struct mlxsw_sp_fib *fib; + struct mlxsw_sp_vr *vr; + + vr = mlxsw_sp_vr_find(mlxsw_sp, fen_info->tb_id); + if (!vr) + return NULL; + fib = mlxsw_sp_vr_fib(vr, MLXSW_SP_L3_PROTO_IPV4); + + fib_node = mlxsw_sp_fib_node_lookup(fib, &fen_info->dst, + sizeof(fen_info->dst), + fen_info->dst_len); + if (!fib_node) + return NULL; + + list_for_each_entry(fib4_entry, &fib_node->entry_list, common.list) { + if (fib4_entry->tb_id == fen_info->tb_id && + fib4_entry->tos == fen_info->tos && + fib4_entry->type == fen_info->type && + mlxsw_sp_nexthop4_group_fi(fib4_entry->common.nh_group) == + fen_info->fi) { + return fib4_entry; + } + } + + return NULL; +} + +static const struct rhashtable_params mlxsw_sp_fib_ht_params = { + .key_offset = offsetof(struct mlxsw_sp_fib_node, key), + .head_offset = offsetof(struct mlxsw_sp_fib_node, ht_node), + .key_len = sizeof(struct mlxsw_sp_fib_key), + 
.automatic_shrinking = true, +}; + +static int mlxsw_sp_fib_node_insert(struct mlxsw_sp_fib *fib, + struct mlxsw_sp_fib_node *fib_node) +{ + return rhashtable_insert_fast(&fib->ht, &fib_node->ht_node, + mlxsw_sp_fib_ht_params); +} + +static void mlxsw_sp_fib_node_remove(struct mlxsw_sp_fib *fib, + struct mlxsw_sp_fib_node *fib_node) +{ + rhashtable_remove_fast(&fib->ht, &fib_node->ht_node, + mlxsw_sp_fib_ht_params); +} + +static struct mlxsw_sp_fib_node * +mlxsw_sp_fib_node_lookup(struct mlxsw_sp_fib *fib, const void *addr, + size_t addr_len, unsigned char prefix_len) +{ + struct mlxsw_sp_fib_key key; + + memset(&key, 0, sizeof(key)); + memcpy(key.addr, addr, addr_len); + key.prefix_len = prefix_len; + return rhashtable_lookup_fast(&fib->ht, &key, mlxsw_sp_fib_ht_params); +} + +static struct mlxsw_sp_fib_node * +mlxsw_sp_fib_node_create(struct mlxsw_sp_fib *fib, const void *addr, + size_t addr_len, unsigned char prefix_len) +{ + struct mlxsw_sp_fib_node *fib_node; + + fib_node = kzalloc(sizeof(*fib_node), GFP_KERNEL); + if (!fib_node) + return NULL; + + INIT_LIST_HEAD(&fib_node->entry_list); + list_add(&fib_node->list, &fib->node_list); + memcpy(fib_node->key.addr, addr, addr_len); + fib_node->key.prefix_len = prefix_len; + + return fib_node; +} + +static void mlxsw_sp_fib_node_destroy(struct mlxsw_sp_fib_node *fib_node) +{ + list_del(&fib_node->list); + WARN_ON(!list_empty(&fib_node->entry_list)); + kfree(fib_node); +} + +static bool +mlxsw_sp_fib_node_entry_is_first(const struct mlxsw_sp_fib_node *fib_node, + const struct mlxsw_sp_fib_entry *fib_entry) +{ + return list_first_entry(&fib_node->entry_list, + struct mlxsw_sp_fib_entry, list) == fib_entry; +} + +static int mlxsw_sp_fib_lpm_tree_link(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_node *fib_node) +{ + struct mlxsw_sp_prefix_usage req_prefix_usage; + struct mlxsw_sp_fib *fib = fib_node->fib; + struct mlxsw_sp_lpm_tree *lpm_tree; + int err; + + lpm_tree = mlxsw_sp->router->lpm.proto_trees[fib->proto]; + if (lpm_tree->prefix_ref_count[fib_node->key.prefix_len] != 0) + goto out; + + mlxsw_sp_prefix_usage_cpy(&req_prefix_usage, &lpm_tree->prefix_usage); + mlxsw_sp_prefix_usage_set(&req_prefix_usage, fib_node->key.prefix_len); + lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage, + fib->proto); + if (IS_ERR(lpm_tree)) + return PTR_ERR(lpm_tree); + + err = mlxsw_sp_vrs_lpm_tree_replace(mlxsw_sp, fib, lpm_tree); + if (err) + goto err_lpm_tree_replace; + +out: + lpm_tree->prefix_ref_count[fib_node->key.prefix_len]++; + return 0; + +err_lpm_tree_replace: + mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree); + return err; +} + +static void mlxsw_sp_fib_lpm_tree_unlink(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_node *fib_node) +{ + struct mlxsw_sp_lpm_tree *lpm_tree = fib_node->fib->lpm_tree; + struct mlxsw_sp_prefix_usage req_prefix_usage; + struct mlxsw_sp_fib *fib = fib_node->fib; + int err; + + if (--lpm_tree->prefix_ref_count[fib_node->key.prefix_len] != 0) + return; + /* Try to construct a new LPM tree from the current prefix usage + * minus the unused one. If we fail, continue using the old one. 
+ */ + mlxsw_sp_prefix_usage_cpy(&req_prefix_usage, &lpm_tree->prefix_usage); + mlxsw_sp_prefix_usage_clear(&req_prefix_usage, + fib_node->key.prefix_len); + lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage, + fib->proto); + if (IS_ERR(lpm_tree)) + return; + + err = mlxsw_sp_vrs_lpm_tree_replace(mlxsw_sp, fib, lpm_tree); + if (err) + goto err_lpm_tree_replace; + + return; + +err_lpm_tree_replace: + mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree); +} + +static int mlxsw_sp_fib_node_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_node *fib_node, + struct mlxsw_sp_fib *fib) +{ + int err; + + err = mlxsw_sp_fib_node_insert(fib, fib_node); + if (err) + return err; + fib_node->fib = fib; + + err = mlxsw_sp_fib_lpm_tree_link(mlxsw_sp, fib_node); + if (err) + goto err_fib_lpm_tree_link; + + return 0; + +err_fib_lpm_tree_link: + fib_node->fib = NULL; + mlxsw_sp_fib_node_remove(fib, fib_node); + return err; +} + +static void mlxsw_sp_fib_node_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_node *fib_node) +{ + struct mlxsw_sp_fib *fib = fib_node->fib; + + mlxsw_sp_fib_lpm_tree_unlink(mlxsw_sp, fib_node); + fib_node->fib = NULL; + mlxsw_sp_fib_node_remove(fib, fib_node); +} + +static struct mlxsw_sp_fib_node * +mlxsw_sp_fib_node_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id, const void *addr, + size_t addr_len, unsigned char prefix_len, + enum mlxsw_sp_l3proto proto) +{ + struct mlxsw_sp_fib_node *fib_node; + struct mlxsw_sp_fib *fib; + struct mlxsw_sp_vr *vr; + int err; + + vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id, NULL); + if (IS_ERR(vr)) + return ERR_CAST(vr); + fib = mlxsw_sp_vr_fib(vr, proto); + + fib_node = mlxsw_sp_fib_node_lookup(fib, addr, addr_len, prefix_len); + if (fib_node) + return fib_node; + + fib_node = mlxsw_sp_fib_node_create(fib, addr, addr_len, prefix_len); + if (!fib_node) { + err = -ENOMEM; + goto err_fib_node_create; + } + + err = mlxsw_sp_fib_node_init(mlxsw_sp, fib_node, fib); + if (err) + goto err_fib_node_init; + + return fib_node; + +err_fib_node_init: + mlxsw_sp_fib_node_destroy(fib_node); +err_fib_node_create: + mlxsw_sp_vr_put(mlxsw_sp, vr); + return ERR_PTR(err); +} + +static void mlxsw_sp_fib_node_put(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_node *fib_node) +{ + struct mlxsw_sp_vr *vr = fib_node->fib->vr; + + if (!list_empty(&fib_node->entry_list)) + return; + mlxsw_sp_fib_node_fini(mlxsw_sp, fib_node); + mlxsw_sp_fib_node_destroy(fib_node); + mlxsw_sp_vr_put(mlxsw_sp, vr); +} + +static struct mlxsw_sp_fib4_entry * +mlxsw_sp_fib4_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, + const struct mlxsw_sp_fib4_entry *new4_entry) +{ + struct mlxsw_sp_fib4_entry *fib4_entry; + + list_for_each_entry(fib4_entry, &fib_node->entry_list, common.list) { + if (fib4_entry->tb_id > new4_entry->tb_id) + continue; + if (fib4_entry->tb_id != new4_entry->tb_id) + break; + if (fib4_entry->tos > new4_entry->tos) + continue; + if (fib4_entry->prio >= new4_entry->prio || + fib4_entry->tos < new4_entry->tos) + return fib4_entry; + } + + return NULL; +} + +static int +mlxsw_sp_fib4_node_list_append(struct mlxsw_sp_fib4_entry *fib4_entry, + struct mlxsw_sp_fib4_entry *new4_entry) +{ + struct mlxsw_sp_fib_node *fib_node; + + if (WARN_ON(!fib4_entry)) + return -EINVAL; + + fib_node = fib4_entry->common.fib_node; + list_for_each_entry_from(fib4_entry, &fib_node->entry_list, + common.list) { + if (fib4_entry->tb_id != new4_entry->tb_id || + fib4_entry->tos != new4_entry->tos || + fib4_entry->prio != new4_entry->prio) + break; + } + + 
list_add_tail(&new4_entry->common.list, &fib4_entry->common.list); + return 0; +} + +static int +mlxsw_sp_fib4_node_list_insert(struct mlxsw_sp_fib4_entry *new4_entry, + bool replace, bool append) +{ + struct mlxsw_sp_fib_node *fib_node = new4_entry->common.fib_node; + struct mlxsw_sp_fib4_entry *fib4_entry; + + fib4_entry = mlxsw_sp_fib4_node_entry_find(fib_node, new4_entry); + + if (append) + return mlxsw_sp_fib4_node_list_append(fib4_entry, new4_entry); + if (replace && WARN_ON(!fib4_entry)) + return -EINVAL; + + /* Insert new entry before replaced one, so that we can later + * remove the second. + */ + if (fib4_entry) { + list_add_tail(&new4_entry->common.list, + &fib4_entry->common.list); + } else { + struct mlxsw_sp_fib4_entry *last; + + list_for_each_entry(last, &fib_node->entry_list, common.list) { + if (new4_entry->tb_id > last->tb_id) + break; + fib4_entry = last; + } + + if (fib4_entry) + list_add(&new4_entry->common.list, + &fib4_entry->common.list); + else + list_add(&new4_entry->common.list, + &fib_node->entry_list); + } + + return 0; +} + +static void +mlxsw_sp_fib4_node_list_remove(struct mlxsw_sp_fib4_entry *fib4_entry) +{ + list_del(&fib4_entry->common.list); +} + +static int mlxsw_sp_fib_node_entry_add(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_fib_node *fib_node = fib_entry->fib_node; + + if (!mlxsw_sp_fib_node_entry_is_first(fib_node, fib_entry)) + return 0; + + /* To prevent packet loss, overwrite the previously offloaded + * entry. + */ + if (!list_is_singular(&fib_node->entry_list)) { + enum mlxsw_reg_ralue_op op = MLXSW_REG_RALUE_OP_WRITE_DELETE; + struct mlxsw_sp_fib_entry *n = list_next_entry(fib_entry, list); + + mlxsw_sp_fib_entry_offload_refresh(n, op, 0); + } + + return mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry); +} + +static void mlxsw_sp_fib_node_entry_del(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_fib_node *fib_node = fib_entry->fib_node; + + if (!mlxsw_sp_fib_node_entry_is_first(fib_node, fib_entry)) + return; + + /* Promote the next entry by overwriting the deleted entry */ + if (!list_is_singular(&fib_node->entry_list)) { + struct mlxsw_sp_fib_entry *n = list_next_entry(fib_entry, list); + enum mlxsw_reg_ralue_op op = MLXSW_REG_RALUE_OP_WRITE_DELETE; + + mlxsw_sp_fib_entry_update(mlxsw_sp, n); + mlxsw_sp_fib_entry_offload_refresh(fib_entry, op, 0); + return; + } + + mlxsw_sp_fib_entry_del(mlxsw_sp, fib_entry); +} + +static int mlxsw_sp_fib4_node_entry_link(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib4_entry *fib4_entry, + bool replace, bool append) +{ + int err; + + err = mlxsw_sp_fib4_node_list_insert(fib4_entry, replace, append); + if (err) + return err; + + err = mlxsw_sp_fib_node_entry_add(mlxsw_sp, &fib4_entry->common); + if (err) + goto err_fib_node_entry_add; + + return 0; + +err_fib_node_entry_add: + mlxsw_sp_fib4_node_list_remove(fib4_entry); + return err; +} + +static void +mlxsw_sp_fib4_node_entry_unlink(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib4_entry *fib4_entry) +{ + mlxsw_sp_fib_node_entry_del(mlxsw_sp, &fib4_entry->common); + mlxsw_sp_fib4_node_list_remove(fib4_entry); + + if (fib4_entry->common.type == MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP) + mlxsw_sp_fib_entry_decap_fini(mlxsw_sp, &fib4_entry->common); +} + +static void mlxsw_sp_fib4_entry_replace(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib4_entry *fib4_entry, + bool replace) +{ + struct mlxsw_sp_fib_node *fib_node = fib4_entry->common.fib_node; + struct mlxsw_sp_fib4_entry 
*replaced; + + if (!replace) + return; + + /* We inserted the new entry before replaced one */ + replaced = list_next_entry(fib4_entry, common.list); + + mlxsw_sp_fib4_node_entry_unlink(mlxsw_sp, replaced); + mlxsw_sp_fib4_entry_destroy(mlxsw_sp, replaced); + mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); +} + +static int +mlxsw_sp_router_fib4_add(struct mlxsw_sp *mlxsw_sp, + const struct fib_entry_notifier_info *fen_info, + bool replace, bool append) +{ + struct mlxsw_sp_fib4_entry *fib4_entry; + struct mlxsw_sp_fib_node *fib_node; + int err; + + if (mlxsw_sp->router->aborted) + return 0; + + fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, fen_info->tb_id, + &fen_info->dst, sizeof(fen_info->dst), + fen_info->dst_len, + MLXSW_SP_L3_PROTO_IPV4); + if (IS_ERR(fib_node)) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to get FIB node\n"); + return PTR_ERR(fib_node); + } + + fib4_entry = mlxsw_sp_fib4_entry_create(mlxsw_sp, fib_node, fen_info); + if (IS_ERR(fib4_entry)) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to create FIB entry\n"); + err = PTR_ERR(fib4_entry); + goto err_fib4_entry_create; + } + + err = mlxsw_sp_fib4_node_entry_link(mlxsw_sp, fib4_entry, replace, + append); + if (err) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to link FIB entry to node\n"); + goto err_fib4_node_entry_link; + } + + mlxsw_sp_fib4_entry_replace(mlxsw_sp, fib4_entry, replace); + + return 0; + +err_fib4_node_entry_link: + mlxsw_sp_fib4_entry_destroy(mlxsw_sp, fib4_entry); +err_fib4_entry_create: + mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); + return err; +} + +static void mlxsw_sp_router_fib4_del(struct mlxsw_sp *mlxsw_sp, + struct fib_entry_notifier_info *fen_info) +{ + struct mlxsw_sp_fib4_entry *fib4_entry; + struct mlxsw_sp_fib_node *fib_node; + + if (mlxsw_sp->router->aborted) + return; + + fib4_entry = mlxsw_sp_fib4_entry_lookup(mlxsw_sp, fen_info); + if (WARN_ON(!fib4_entry)) + return; + fib_node = fib4_entry->common.fib_node; + + mlxsw_sp_fib4_node_entry_unlink(mlxsw_sp, fib4_entry); + mlxsw_sp_fib4_entry_destroy(mlxsw_sp, fib4_entry); + mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); +} + +static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt) +{ + /* Packets with link-local destination IP arriving to the router + * are trapped to the CPU, so no need to program specific routes + * for them. + */ + if (ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LINKLOCAL) + return true; + + /* Multicast routes aren't supported, so ignore them. Neighbour + * Discovery packets are specifically trapped. + */ + if (ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) + return true; + + /* Cloned routes are irrelevant in the forwarding path. */ + if (rt->rt6i_flags & RTF_CACHE) + return true; + + return false; +} + +static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt) +{ + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + + mlxsw_sp_rt6 = kzalloc(sizeof(*mlxsw_sp_rt6), GFP_KERNEL); + if (!mlxsw_sp_rt6) + return ERR_PTR(-ENOMEM); + + /* In case of route replace, replaced route is deleted with + * no notification. Take reference to prevent accessing freed + * memory. 
+ */ + mlxsw_sp_rt6->rt = rt; + rt6_hold(rt); + + return mlxsw_sp_rt6; +} + +#if IS_ENABLED(CONFIG_IPV6) +static void mlxsw_sp_rt6_release(struct rt6_info *rt) +{ + rt6_release(rt); +} +#else +static void mlxsw_sp_rt6_release(struct rt6_info *rt) +{ +} +#endif + +static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) +{ + mlxsw_sp_rt6_release(mlxsw_sp_rt6->rt); + kfree(mlxsw_sp_rt6); +} + +static bool mlxsw_sp_fib6_rt_can_mp(const struct rt6_info *rt) +{ + /* RTF_CACHE routes are ignored */ + return (rt->rt6i_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; +} + +static struct rt6_info * +mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) +{ + return list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, + list)->rt; +} + +static struct mlxsw_sp_fib6_entry * +mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, + const struct rt6_info *nrt, bool replace) +{ + struct mlxsw_sp_fib6_entry *fib6_entry; + + if (!mlxsw_sp_fib6_rt_can_mp(nrt) || replace) + return NULL; + + list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { + struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + + /* RT6_TABLE_LOCAL and RT6_TABLE_MAIN share the same + * virtual router. + */ + if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id) + continue; + if (rt->rt6i_table->tb6_id != nrt->rt6i_table->tb6_id) + break; + if (rt->rt6i_metric < nrt->rt6i_metric) + continue; + if (rt->rt6i_metric == nrt->rt6i_metric && + mlxsw_sp_fib6_rt_can_mp(rt)) + return fib6_entry; + if (rt->rt6i_metric > nrt->rt6i_metric) + break; + } + + return NULL; +} + +static struct mlxsw_sp_rt6 * +mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry, + const struct rt6_info *rt) +{ + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + + list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { + if (mlxsw_sp_rt6->rt == rt) + return mlxsw_sp_rt6; + } + + return NULL; +} + +static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp, + const struct rt6_info *rt, + enum mlxsw_sp_ipip_type *ret) +{ + return rt->dst.dev && + mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->dst.dev, ret); +} + +static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp, + struct mlxsw_sp_nexthop *nh, + const struct rt6_info *rt) +{ + const struct mlxsw_sp_ipip_ops *ipip_ops; + struct mlxsw_sp_ipip_entry *ipip_entry; + struct net_device *dev = rt->dst.dev; + struct mlxsw_sp_rif *rif; + int err; + + ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, dev); + if (ipip_entry) { + ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt]; + if (ipip_ops->can_offload(mlxsw_sp, dev, + MLXSW_SP_L3_PROTO_IPV6)) { + nh->type = MLXSW_SP_NEXTHOP_TYPE_IPIP; + mlxsw_sp_nexthop_ipip_init(mlxsw_sp, nh, ipip_entry); + return 0; + } + } + + nh->type = MLXSW_SP_NEXTHOP_TYPE_ETH; + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); + if (!rif) + return 0; + mlxsw_sp_nexthop_rif_init(nh, rif); + + err = mlxsw_sp_nexthop_neigh_init(mlxsw_sp, nh); + if (err) + goto err_nexthop_neigh_init; + + return 0; + +err_nexthop_neigh_init: + mlxsw_sp_nexthop_rif_fini(nh); + return err; +} + +static void mlxsw_sp_nexthop6_type_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + mlxsw_sp_nexthop_type_fini(mlxsw_sp, nh); +} + +static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp, + struct mlxsw_sp_nexthop *nh, + const struct rt6_info *rt) +{ + struct net_device *dev = rt->dst.dev; + + nh->nh_grp 
= nh_grp; + nh->nh_weight = rt->rt6i_nh_weight; + memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr)); + mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh); + + list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list); + + if (!dev) + return 0; + nh->ifindex = dev->ifindex; + + return mlxsw_sp_nexthop6_type_init(mlxsw_sp, nh_grp, nh, rt); +} + +static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + mlxsw_sp_nexthop6_type_fini(mlxsw_sp, nh); + list_del(&nh->router_list_node); + mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh); +} + +static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp, + const struct rt6_info *rt) +{ + return rt->rt6i_flags & RTF_GATEWAY || + mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL); +} + +static struct mlxsw_sp_nexthop_group * +mlxsw_sp_nexthop6_group_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib6_entry *fib6_entry) +{ + struct mlxsw_sp_nexthop_group *nh_grp; + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + struct mlxsw_sp_nexthop *nh; + size_t alloc_size; + int i = 0; + int err; + + alloc_size = sizeof(*nh_grp) + + fib6_entry->nrt6 * sizeof(struct mlxsw_sp_nexthop); + nh_grp = kzalloc(alloc_size, GFP_KERNEL); + if (!nh_grp) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&nh_grp->fib_list); +#if IS_ENABLED(CONFIG_IPV6) + nh_grp->neigh_tbl = &nd_tbl; +#endif + mlxsw_sp_rt6 = list_first_entry(&fib6_entry->rt6_list, + struct mlxsw_sp_rt6, list); + nh_grp->gateway = mlxsw_sp_rt6_is_gateway(mlxsw_sp, mlxsw_sp_rt6->rt); + nh_grp->count = fib6_entry->nrt6; + for (i = 0; i < nh_grp->count; i++) { + struct rt6_info *rt = mlxsw_sp_rt6->rt; + + nh = &nh_grp->nexthops[i]; + err = mlxsw_sp_nexthop6_init(mlxsw_sp, nh_grp, nh, rt); + if (err) + goto err_nexthop6_init; + mlxsw_sp_rt6 = list_next_entry(mlxsw_sp_rt6, list); + } + + err = mlxsw_sp_nexthop_group_insert(mlxsw_sp, nh_grp); + if (err) + goto err_nexthop_group_insert; + + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp); + return nh_grp; + +err_nexthop_group_insert: +err_nexthop6_init: + for (i--; i >= 0; i--) { + nh = &nh_grp->nexthops[i]; + mlxsw_sp_nexthop6_fini(mlxsw_sp, nh); + } + kfree(nh_grp); + return ERR_PTR(err); +} + +static void +mlxsw_sp_nexthop6_group_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + struct mlxsw_sp_nexthop *nh; + int i = nh_grp->count; + + mlxsw_sp_nexthop_group_remove(mlxsw_sp, nh_grp); + for (i--; i >= 0; i--) { + nh = &nh_grp->nexthops[i]; + mlxsw_sp_nexthop6_fini(mlxsw_sp, nh); + } + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp); + WARN_ON(nh_grp->adj_index_valid); + kfree(nh_grp); +} + +static int mlxsw_sp_nexthop6_group_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib6_entry *fib6_entry) +{ + struct mlxsw_sp_nexthop_group *nh_grp; + + nh_grp = mlxsw_sp_nexthop6_group_lookup(mlxsw_sp, fib6_entry); + if (!nh_grp) { + nh_grp = mlxsw_sp_nexthop6_group_create(mlxsw_sp, fib6_entry); + if (IS_ERR(nh_grp)) + return PTR_ERR(nh_grp); + } + + list_add_tail(&fib6_entry->common.nexthop_group_node, + &nh_grp->fib_list); + fib6_entry->common.nh_group = nh_grp; + + return 0; +} + +static void mlxsw_sp_nexthop6_group_put(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; + + list_del(&fib_entry->nexthop_group_node); + if (!list_empty(&nh_grp->fib_list)) + return; + mlxsw_sp_nexthop6_group_destroy(mlxsw_sp, nh_grp); +} + +static int +mlxsw_sp_nexthop6_group_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib6_entry 
*fib6_entry) +{ + struct mlxsw_sp_nexthop_group *old_nh_grp = fib6_entry->common.nh_group; + int err; + + fib6_entry->common.nh_group = NULL; + list_del(&fib6_entry->common.nexthop_group_node); + + err = mlxsw_sp_nexthop6_group_get(mlxsw_sp, fib6_entry); + if (err) + goto err_nexthop6_group_get; + + /* In case this entry is offloaded, then the adjacency index + * currently associated with it in the device's table is that + * of the old group. Start using the new one instead. + */ + err = mlxsw_sp_fib_node_entry_add(mlxsw_sp, &fib6_entry->common); + if (err) + goto err_fib_node_entry_add; + + if (list_empty(&old_nh_grp->fib_list)) + mlxsw_sp_nexthop6_group_destroy(mlxsw_sp, old_nh_grp); + + return 0; + +err_fib_node_entry_add: + mlxsw_sp_nexthop6_group_put(mlxsw_sp, &fib6_entry->common); +err_nexthop6_group_get: + list_add_tail(&fib6_entry->common.nexthop_group_node, + &old_nh_grp->fib_list); + fib6_entry->common.nh_group = old_nh_grp; + return err; +} + +static int +mlxsw_sp_fib6_entry_nexthop_add(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib6_entry *fib6_entry, + struct rt6_info *rt) +{ + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + int err; + + mlxsw_sp_rt6 = mlxsw_sp_rt6_create(rt); + if (IS_ERR(mlxsw_sp_rt6)) + return PTR_ERR(mlxsw_sp_rt6); + + list_add_tail(&mlxsw_sp_rt6->list, &fib6_entry->rt6_list); + fib6_entry->nrt6++; + + err = mlxsw_sp_nexthop6_group_update(mlxsw_sp, fib6_entry); + if (err) + goto err_nexthop6_group_update; + + return 0; + +err_nexthop6_group_update: + fib6_entry->nrt6--; + list_del(&mlxsw_sp_rt6->list); + mlxsw_sp_rt6_destroy(mlxsw_sp_rt6); + return err; +} + +static void +mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib6_entry *fib6_entry, + struct rt6_info *rt) +{ + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + + mlxsw_sp_rt6 = mlxsw_sp_fib6_entry_rt_find(fib6_entry, rt); + if (WARN_ON(!mlxsw_sp_rt6)) + return; + + fib6_entry->nrt6--; + list_del(&mlxsw_sp_rt6->list); + mlxsw_sp_nexthop6_group_update(mlxsw_sp, fib6_entry); + mlxsw_sp_rt6_destroy(mlxsw_sp_rt6); +} + +static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + const struct rt6_info *rt) +{ + /* Packets hitting RTF_REJECT routes need to be discarded by the + * stack. We can rely on their destination device not having a + * RIF (it's the loopback device) and can thus use action type + * local, which will cause them to be trapped with a lower + * priority than packets that need to be locally received. 
+ */ + if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP; + else if (rt->rt6i_flags & RTF_REJECT) + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; + else if (mlxsw_sp_rt6_is_gateway(mlxsw_sp, rt)) + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE; + else + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL; +} + +static void +mlxsw_sp_fib6_entry_rt_destroy_all(struct mlxsw_sp_fib6_entry *fib6_entry) +{ + struct mlxsw_sp_rt6 *mlxsw_sp_rt6, *tmp; + + list_for_each_entry_safe(mlxsw_sp_rt6, tmp, &fib6_entry->rt6_list, + list) { + fib6_entry->nrt6--; + list_del(&mlxsw_sp_rt6->list); + mlxsw_sp_rt6_destroy(mlxsw_sp_rt6); + } +} + +static struct mlxsw_sp_fib6_entry * +mlxsw_sp_fib6_entry_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_node *fib_node, + struct rt6_info *rt) +{ + struct mlxsw_sp_fib6_entry *fib6_entry; + struct mlxsw_sp_fib_entry *fib_entry; + struct mlxsw_sp_rt6 *mlxsw_sp_rt6; + int err; + + fib6_entry = kzalloc(sizeof(*fib6_entry), GFP_KERNEL); + if (!fib6_entry) + return ERR_PTR(-ENOMEM); + fib_entry = &fib6_entry->common; + + mlxsw_sp_rt6 = mlxsw_sp_rt6_create(rt); + if (IS_ERR(mlxsw_sp_rt6)) { + err = PTR_ERR(mlxsw_sp_rt6); + goto err_rt6_create; + } + + mlxsw_sp_fib6_entry_type_set(mlxsw_sp, fib_entry, mlxsw_sp_rt6->rt); + + INIT_LIST_HEAD(&fib6_entry->rt6_list); + list_add_tail(&mlxsw_sp_rt6->list, &fib6_entry->rt6_list); + fib6_entry->nrt6 = 1; + err = mlxsw_sp_nexthop6_group_get(mlxsw_sp, fib6_entry); + if (err) + goto err_nexthop6_group_get; + + fib_entry->fib_node = fib_node; + + return fib6_entry; + +err_nexthop6_group_get: + list_del(&mlxsw_sp_rt6->list); + mlxsw_sp_rt6_destroy(mlxsw_sp_rt6); +err_rt6_create: + kfree(fib6_entry); + return ERR_PTR(err); +} + +static void mlxsw_sp_fib6_entry_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib6_entry *fib6_entry) +{ + mlxsw_sp_nexthop6_group_put(mlxsw_sp, &fib6_entry->common); + mlxsw_sp_fib6_entry_rt_destroy_all(fib6_entry); + WARN_ON(fib6_entry->nrt6); + kfree(fib6_entry); +} + +static struct mlxsw_sp_fib6_entry * +mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, + const struct rt6_info *nrt, bool replace) +{ + struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL; + + list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { + struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + + if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id) + continue; + if (rt->rt6i_table->tb6_id != nrt->rt6i_table->tb6_id) + break; + if (replace && rt->rt6i_metric == nrt->rt6i_metric) { + if (mlxsw_sp_fib6_rt_can_mp(rt) == + mlxsw_sp_fib6_rt_can_mp(nrt)) + return fib6_entry; + if (mlxsw_sp_fib6_rt_can_mp(nrt)) + fallback = fallback ?: fib6_entry; + } + if (rt->rt6i_metric > nrt->rt6i_metric) + return fallback ?: fib6_entry; + } + + return fallback; +} + +static int +mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry, + bool replace) +{ + struct mlxsw_sp_fib_node *fib_node = new6_entry->common.fib_node; + struct rt6_info *nrt = mlxsw_sp_fib6_entry_rt(new6_entry); + struct mlxsw_sp_fib6_entry *fib6_entry; + + fib6_entry = mlxsw_sp_fib6_node_entry_find(fib_node, nrt, replace); + + if (replace && WARN_ON(!fib6_entry)) + return -EINVAL; + + if (fib6_entry) { + list_add_tail(&new6_entry->common.list, + &fib6_entry->common.list); + } else { + struct mlxsw_sp_fib6_entry *last; + + list_for_each_entry(last, &fib_node->entry_list, common.list) { + struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(last); + + if 
(nrt->rt6i_table->tb6_id > rt->rt6i_table->tb6_id) + break; + fib6_entry = last; + } + + if (fib6_entry) + list_add(&new6_entry->common.list, + &fib6_entry->common.list); + else + list_add(&new6_entry->common.list, + &fib_node->entry_list); + } + + return 0; +} + +static void +mlxsw_sp_fib6_node_list_remove(struct mlxsw_sp_fib6_entry *fib6_entry) +{ + list_del(&fib6_entry->common.list); +} + +static int mlxsw_sp_fib6_node_entry_link(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib6_entry *fib6_entry, + bool replace) +{ + int err; + + err = mlxsw_sp_fib6_node_list_insert(fib6_entry, replace); + if (err) + return err; + + err = mlxsw_sp_fib_node_entry_add(mlxsw_sp, &fib6_entry->common); + if (err) + goto err_fib_node_entry_add; + + return 0; + +err_fib_node_entry_add: + mlxsw_sp_fib6_node_list_remove(fib6_entry); + return err; +} + +static void +mlxsw_sp_fib6_node_entry_unlink(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib6_entry *fib6_entry) +{ + mlxsw_sp_fib_node_entry_del(mlxsw_sp, &fib6_entry->common); + mlxsw_sp_fib6_node_list_remove(fib6_entry); +} + +static struct mlxsw_sp_fib6_entry * +mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp, + const struct rt6_info *rt) +{ + struct mlxsw_sp_fib6_entry *fib6_entry; + struct mlxsw_sp_fib_node *fib_node; + struct mlxsw_sp_fib *fib; + struct mlxsw_sp_vr *vr; + + vr = mlxsw_sp_vr_find(mlxsw_sp, rt->rt6i_table->tb6_id); + if (!vr) + return NULL; + fib = mlxsw_sp_vr_fib(vr, MLXSW_SP_L3_PROTO_IPV6); + + fib_node = mlxsw_sp_fib_node_lookup(fib, &rt->rt6i_dst.addr, + sizeof(rt->rt6i_dst.addr), + rt->rt6i_dst.plen); + if (!fib_node) + return NULL; + + list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { + struct rt6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + + if (rt->rt6i_table->tb6_id == iter_rt->rt6i_table->tb6_id && + rt->rt6i_metric == iter_rt->rt6i_metric && + mlxsw_sp_fib6_entry_rt_find(fib6_entry, rt)) + return fib6_entry; + } + + return NULL; +} + +static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib6_entry *fib6_entry, + bool replace) +{ + struct mlxsw_sp_fib_node *fib_node = fib6_entry->common.fib_node; + struct mlxsw_sp_fib6_entry *replaced; + + if (!replace) + return; + + replaced = list_next_entry(fib6_entry, common.list); + + mlxsw_sp_fib6_node_entry_unlink(mlxsw_sp, replaced); + mlxsw_sp_fib6_entry_destroy(mlxsw_sp, replaced); + mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); +} + +static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, + struct rt6_info *rt, bool replace) +{ + struct mlxsw_sp_fib6_entry *fib6_entry; + struct mlxsw_sp_fib_node *fib_node; + int err; + + if (mlxsw_sp->router->aborted) + return 0; + + if (rt->rt6i_src.plen) + return -EINVAL; + + if (mlxsw_sp_fib6_rt_should_ignore(rt)) + return 0; + + fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, rt->rt6i_table->tb6_id, + &rt->rt6i_dst.addr, + sizeof(rt->rt6i_dst.addr), + rt->rt6i_dst.plen, + MLXSW_SP_L3_PROTO_IPV6); + if (IS_ERR(fib_node)) + return PTR_ERR(fib_node); + + /* Before creating a new entry, try to append route to an existing + * multipath entry. 
+ */ + fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, replace); + if (fib6_entry) { + err = mlxsw_sp_fib6_entry_nexthop_add(mlxsw_sp, fib6_entry, rt); + if (err) + goto err_fib6_entry_nexthop_add; + return 0; + } + + fib6_entry = mlxsw_sp_fib6_entry_create(mlxsw_sp, fib_node, rt); + if (IS_ERR(fib6_entry)) { + err = PTR_ERR(fib6_entry); + goto err_fib6_entry_create; + } + + err = mlxsw_sp_fib6_node_entry_link(mlxsw_sp, fib6_entry, replace); + if (err) + goto err_fib6_node_entry_link; + + mlxsw_sp_fib6_entry_replace(mlxsw_sp, fib6_entry, replace); + + return 0; + +err_fib6_node_entry_link: + mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry); +err_fib6_entry_create: +err_fib6_entry_nexthop_add: + mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); + return err; +} + +static void mlxsw_sp_router_fib6_del(struct mlxsw_sp *mlxsw_sp, + struct rt6_info *rt) +{ + struct mlxsw_sp_fib6_entry *fib6_entry; + struct mlxsw_sp_fib_node *fib_node; + + if (mlxsw_sp->router->aborted) + return; + + if (mlxsw_sp_fib6_rt_should_ignore(rt)) + return; + + fib6_entry = mlxsw_sp_fib6_entry_lookup(mlxsw_sp, rt); + if (WARN_ON(!fib6_entry)) + return; + + /* If route is part of a multipath entry, but not the last one + * removed, then only reduce its nexthop group. + */ + if (!list_is_singular(&fib6_entry->rt6_list)) { + mlxsw_sp_fib6_entry_nexthop_del(mlxsw_sp, fib6_entry, rt); + return; + } + + fib_node = fib6_entry->common.fib_node; + + mlxsw_sp_fib6_node_entry_unlink(mlxsw_sp, fib6_entry); + mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry); + mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); +} + +static int __mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_reg_ralxx_protocol proto, + u8 tree_id) +{ + char ralta_pl[MLXSW_REG_RALTA_LEN]; + char ralst_pl[MLXSW_REG_RALST_LEN]; + int i, err; + + mlxsw_reg_ralta_pack(ralta_pl, true, proto, tree_id); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralta), ralta_pl); + if (err) + return err; + + mlxsw_reg_ralst_pack(ralst_pl, 0xff, tree_id); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralst), ralst_pl); + if (err) + return err; + + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) { + struct mlxsw_sp_vr *vr = &mlxsw_sp->router->vrs[i]; + char raltb_pl[MLXSW_REG_RALTB_LEN]; + char ralue_pl[MLXSW_REG_RALUE_LEN]; + + mlxsw_reg_raltb_pack(raltb_pl, vr->id, proto, tree_id); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raltb), + raltb_pl); + if (err) + return err; + + mlxsw_reg_ralue_pack(ralue_pl, proto, + MLXSW_REG_RALUE_OP_WRITE_WRITE, vr->id, 0); + mlxsw_reg_ralue_act_ip2me_pack(ralue_pl); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), + ralue_pl); + if (err) + return err; + } + + return 0; +} + +static struct mlxsw_sp_mr_table * +mlxsw_sp_router_fibmr_family_to_table(struct mlxsw_sp_vr *vr, int family) +{ + if (family == RTNL_FAMILY_IPMR) + return vr->mr_table[MLXSW_SP_L3_PROTO_IPV4]; + else + return vr->mr_table[MLXSW_SP_L3_PROTO_IPV6]; +} + +static int mlxsw_sp_router_fibmr_add(struct mlxsw_sp *mlxsw_sp, + struct mfc_entry_notifier_info *men_info, + bool replace) +{ + struct mlxsw_sp_mr_table *mrt; + struct mlxsw_sp_vr *vr; + + if (mlxsw_sp->router->aborted) + return 0; + + vr = mlxsw_sp_vr_get(mlxsw_sp, men_info->tb_id, NULL); + if (IS_ERR(vr)) + return PTR_ERR(vr); + + mrt = mlxsw_sp_router_fibmr_family_to_table(vr, men_info->info.family); + return mlxsw_sp_mr_route_add(mrt, men_info->mfc, replace); +} + +static void mlxsw_sp_router_fibmr_del(struct mlxsw_sp *mlxsw_sp, + struct mfc_entry_notifier_info 
*men_info) +{ + struct mlxsw_sp_mr_table *mrt; + struct mlxsw_sp_vr *vr; + + if (mlxsw_sp->router->aborted) + return; + + vr = mlxsw_sp_vr_find(mlxsw_sp, men_info->tb_id); + if (WARN_ON(!vr)) + return; + + mrt = mlxsw_sp_router_fibmr_family_to_table(vr, men_info->info.family); + mlxsw_sp_mr_route_del(mrt, men_info->mfc); + mlxsw_sp_vr_put(mlxsw_sp, vr); +} + +static int +mlxsw_sp_router_fibmr_vif_add(struct mlxsw_sp *mlxsw_sp, + struct vif_entry_notifier_info *ven_info) +{ + struct mlxsw_sp_mr_table *mrt; + struct mlxsw_sp_rif *rif; + struct mlxsw_sp_vr *vr; + + if (mlxsw_sp->router->aborted) + return 0; + + vr = mlxsw_sp_vr_get(mlxsw_sp, ven_info->tb_id, NULL); + if (IS_ERR(vr)) + return PTR_ERR(vr); + + mrt = mlxsw_sp_router_fibmr_family_to_table(vr, ven_info->info.family); + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, ven_info->dev); + return mlxsw_sp_mr_vif_add(mrt, ven_info->dev, + ven_info->vif_index, + ven_info->vif_flags, rif); +} + +static void +mlxsw_sp_router_fibmr_vif_del(struct mlxsw_sp *mlxsw_sp, + struct vif_entry_notifier_info *ven_info) +{ + struct mlxsw_sp_mr_table *mrt; + struct mlxsw_sp_vr *vr; + + if (mlxsw_sp->router->aborted) + return; + + vr = mlxsw_sp_vr_find(mlxsw_sp, ven_info->tb_id); + if (WARN_ON(!vr)) + return; + + mrt = mlxsw_sp_router_fibmr_family_to_table(vr, ven_info->info.family); + mlxsw_sp_mr_vif_del(mrt, ven_info->vif_index); + mlxsw_sp_vr_put(mlxsw_sp, vr); +} + +static int mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp) +{ + enum mlxsw_reg_ralxx_protocol proto = MLXSW_REG_RALXX_PROTOCOL_IPV4; + int err; + + err = __mlxsw_sp_router_set_abort_trap(mlxsw_sp, proto, + MLXSW_SP_LPM_TREE_MIN); + if (err) + return err; + + /* The multicast router code does not need an abort trap as by default, + * packets that don't match any routes are trapped to the CPU. + */ + + proto = MLXSW_REG_RALXX_PROTOCOL_IPV6; + return __mlxsw_sp_router_set_abort_trap(mlxsw_sp, proto, + MLXSW_SP_LPM_TREE_MIN + 1); +} + +static void mlxsw_sp_fib4_node_flush(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_node *fib_node) +{ + struct mlxsw_sp_fib4_entry *fib4_entry, *tmp; + + list_for_each_entry_safe(fib4_entry, tmp, &fib_node->entry_list, + common.list) { + bool do_break = &tmp->common.list == &fib_node->entry_list; + + mlxsw_sp_fib4_node_entry_unlink(mlxsw_sp, fib4_entry); + mlxsw_sp_fib4_entry_destroy(mlxsw_sp, fib4_entry); + mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); + /* Break when entry list is empty and node was freed. + * Otherwise, we'll access freed memory in the next + * iteration. 
+ */ + if (do_break) + break; + } +} + +static void mlxsw_sp_fib6_node_flush(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_node *fib_node) +{ + struct mlxsw_sp_fib6_entry *fib6_entry, *tmp; + + list_for_each_entry_safe(fib6_entry, tmp, &fib_node->entry_list, + common.list) { + bool do_break = &tmp->common.list == &fib_node->entry_list; + + mlxsw_sp_fib6_node_entry_unlink(mlxsw_sp, fib6_entry); + mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry); + mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); + if (do_break) + break; + } +} + +static void mlxsw_sp_fib_node_flush(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_node *fib_node) +{ + switch (fib_node->fib->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + mlxsw_sp_fib4_node_flush(mlxsw_sp, fib_node); + break; + case MLXSW_SP_L3_PROTO_IPV6: + mlxsw_sp_fib6_node_flush(mlxsw_sp, fib_node); + break; + } +} + +static void mlxsw_sp_vr_fib_flush(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_vr *vr, + enum mlxsw_sp_l3proto proto) +{ + struct mlxsw_sp_fib *fib = mlxsw_sp_vr_fib(vr, proto); + struct mlxsw_sp_fib_node *fib_node, *tmp; + + list_for_each_entry_safe(fib_node, tmp, &fib->node_list, list) { + bool do_break = &tmp->list == &fib->node_list; + + mlxsw_sp_fib_node_flush(mlxsw_sp, fib_node); + if (do_break) + break; + } +} + +static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp) +{ + int i, j; + + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) { + struct mlxsw_sp_vr *vr = &mlxsw_sp->router->vrs[i]; + + if (!mlxsw_sp_vr_is_used(vr)) + continue; + + for (j = 0; j < MLXSW_SP_L3_PROTO_MAX; j++) + mlxsw_sp_mr_table_flush(vr->mr_table[j]); + mlxsw_sp_vr_fib_flush(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV4); + + /* If virtual router was only used for IPv4, then it's no + * longer used. + */ + if (!mlxsw_sp_vr_is_used(vr)) + continue; + mlxsw_sp_vr_fib_flush(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV6); + } +} + +static void mlxsw_sp_router_fib_abort(struct mlxsw_sp *mlxsw_sp) +{ + int err; + + if (mlxsw_sp->router->aborted) + return; + dev_warn(mlxsw_sp->bus_info->dev, "FIB abort triggered. 
Note that FIB entries are no longer being offloaded to this device.\n"); + mlxsw_sp_router_fib_flush(mlxsw_sp); + mlxsw_sp->router->aborted = true; + err = mlxsw_sp_router_set_abort_trap(mlxsw_sp); + if (err) + dev_warn(mlxsw_sp->bus_info->dev, "Failed to set abort trap.\n"); +} + +struct mlxsw_sp_fib_event_work { + struct work_struct work; + union { + struct fib6_entry_notifier_info fen6_info; + struct fib_entry_notifier_info fen_info; + struct fib_rule_notifier_info fr_info; + struct fib_nh_notifier_info fnh_info; + struct mfc_entry_notifier_info men_info; + struct vif_entry_notifier_info ven_info; + }; + struct mlxsw_sp *mlxsw_sp; + unsigned long event; +}; + +static void mlxsw_sp_router_fib4_event_work(struct work_struct *work) +{ + struct mlxsw_sp_fib_event_work *fib_work = + container_of(work, struct mlxsw_sp_fib_event_work, work); + struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp; + bool replace, append; + int err; + + /* Protect internal structures from changes */ + rtnl_lock(); + mlxsw_sp_span_respin(mlxsw_sp); + + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_APPEND: /* fall through */ + case FIB_EVENT_ENTRY_ADD: + replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE; + append = fib_work->event == FIB_EVENT_ENTRY_APPEND; + err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info, + replace, append); + if (err) + mlxsw_sp_router_fib_abort(mlxsw_sp); + fib_info_put(fib_work->fen_info.fi); + break; + case FIB_EVENT_ENTRY_DEL: + mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info); + fib_info_put(fib_work->fen_info.fi); + break; + case FIB_EVENT_RULE_ADD: + /* if we get here, a rule was added that we do not support. + * just do the fib_abort + */ + mlxsw_sp_router_fib_abort(mlxsw_sp); + break; + case FIB_EVENT_NH_ADD: /* fall through */ + case FIB_EVENT_NH_DEL: + mlxsw_sp_nexthop4_event(mlxsw_sp, fib_work->event, + fib_work->fnh_info.fib_nh); + fib_info_put(fib_work->fnh_info.fib_nh->nh_parent); + break; + } + rtnl_unlock(); + kfree(fib_work); +} + +static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) +{ + struct mlxsw_sp_fib_event_work *fib_work = + container_of(work, struct mlxsw_sp_fib_event_work, work); + struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp; + bool replace; + int err; + + rtnl_lock(); + mlxsw_sp_span_respin(mlxsw_sp); + + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_ADD: + replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE; + err = mlxsw_sp_router_fib6_add(mlxsw_sp, + fib_work->fen6_info.rt, replace); + if (err) + mlxsw_sp_router_fib_abort(mlxsw_sp); + mlxsw_sp_rt6_release(fib_work->fen6_info.rt); + break; + case FIB_EVENT_ENTRY_DEL: + mlxsw_sp_router_fib6_del(mlxsw_sp, fib_work->fen6_info.rt); + mlxsw_sp_rt6_release(fib_work->fen6_info.rt); + break; + case FIB_EVENT_RULE_ADD: + /* if we get here, a rule was added that we do not support. 
+ * just do the fib_abort + */ + mlxsw_sp_router_fib_abort(mlxsw_sp); + break; + } + rtnl_unlock(); + kfree(fib_work); +} + +static void mlxsw_sp_router_fibmr_event_work(struct work_struct *work) +{ + struct mlxsw_sp_fib_event_work *fib_work = + container_of(work, struct mlxsw_sp_fib_event_work, work); + struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp; + bool replace; + int err; + + rtnl_lock(); + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_ADD: + replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE; + + err = mlxsw_sp_router_fibmr_add(mlxsw_sp, &fib_work->men_info, + replace); + if (err) + mlxsw_sp_router_fib_abort(mlxsw_sp); + mr_cache_put(fib_work->men_info.mfc); + break; + case FIB_EVENT_ENTRY_DEL: + mlxsw_sp_router_fibmr_del(mlxsw_sp, &fib_work->men_info); + mr_cache_put(fib_work->men_info.mfc); + break; + case FIB_EVENT_VIF_ADD: + err = mlxsw_sp_router_fibmr_vif_add(mlxsw_sp, + &fib_work->ven_info); + if (err) + mlxsw_sp_router_fib_abort(mlxsw_sp); + dev_put(fib_work->ven_info.dev); + break; + case FIB_EVENT_VIF_DEL: + mlxsw_sp_router_fibmr_vif_del(mlxsw_sp, + &fib_work->ven_info); + dev_put(fib_work->ven_info.dev); + break; + case FIB_EVENT_RULE_ADD: + /* if we get here, a rule was added that we do not support. + * just do the fib_abort + */ + mlxsw_sp_router_fib_abort(mlxsw_sp); + break; + } + rtnl_unlock(); + kfree(fib_work); +} + +static void mlxsw_sp_router_fib4_event(struct mlxsw_sp_fib_event_work *fib_work, + struct fib_notifier_info *info) +{ + struct fib_entry_notifier_info *fen_info; + struct fib_nh_notifier_info *fnh_info; + + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_APPEND: /* fall through */ + case FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_DEL: + fen_info = container_of(info, struct fib_entry_notifier_info, + info); + fib_work->fen_info = *fen_info; + /* Take reference on fib_info to prevent it from being + * freed while work is queued. Release it afterwards. 
+ */ + fib_info_hold(fib_work->fen_info.fi); + break; + case FIB_EVENT_NH_ADD: /* fall through */ + case FIB_EVENT_NH_DEL: + fnh_info = container_of(info, struct fib_nh_notifier_info, + info); + fib_work->fnh_info = *fnh_info; + fib_info_hold(fib_work->fnh_info.fib_nh->nh_parent); + break; + } +} + +static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work, + struct fib_notifier_info *info) +{ + struct fib6_entry_notifier_info *fen6_info; + + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_DEL: + fen6_info = container_of(info, struct fib6_entry_notifier_info, + info); + fib_work->fen6_info = *fen6_info; + rt6_hold(fib_work->fen6_info.rt); + break; + } +} + +static void +mlxsw_sp_router_fibmr_event(struct mlxsw_sp_fib_event_work *fib_work, + struct fib_notifier_info *info) +{ + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_DEL: + memcpy(&fib_work->men_info, info, sizeof(fib_work->men_info)); + mr_cache_hold(fib_work->men_info.mfc); + break; + case FIB_EVENT_VIF_ADD: /* fall through */ + case FIB_EVENT_VIF_DEL: + memcpy(&fib_work->ven_info, info, sizeof(fib_work->ven_info)); + dev_hold(fib_work->ven_info.dev); + break; + } +} + +static int mlxsw_sp_router_fib_rule_event(unsigned long event, + struct fib_notifier_info *info, + struct mlxsw_sp *mlxsw_sp) +{ + struct netlink_ext_ack *extack = info->extack; + struct fib_rule_notifier_info *fr_info; + struct fib_rule *rule; + int err = 0; + + /* nothing to do at the moment */ + if (event == FIB_EVENT_RULE_DEL) + return 0; + + if (mlxsw_sp->router->aborted) + return 0; + + fr_info = container_of(info, struct fib_rule_notifier_info, info); + rule = fr_info->rule; + + switch (info->family) { + case AF_INET: + if (!fib4_rule_default(rule) && !rule->l3mdev) + err = -1; + break; + case AF_INET6: + if (!fib6_rule_default(rule) && !rule->l3mdev) + err = -1; + break; + case RTNL_FAMILY_IPMR: + if (!ipmr_rule_default(rule) && !rule->l3mdev) + err = -1; + break; + case RTNL_FAMILY_IP6MR: + if (!ip6mr_rule_default(rule) && !rule->l3mdev) + err = -1; + break; + } + + if (err < 0) + NL_SET_ERR_MSG_MOD(extack, "FIB rules not supported. 
Aborting offload"); + + return err; +} + +/* Called with rcu_read_lock() */ +static int mlxsw_sp_router_fib_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct mlxsw_sp_fib_event_work *fib_work; + struct fib_notifier_info *info = ptr; + struct mlxsw_sp_router *router; + int err; + + if (!net_eq(info->net, &init_net) || + (info->family != AF_INET && info->family != AF_INET6 && + info->family != RTNL_FAMILY_IPMR && + info->family != RTNL_FAMILY_IP6MR)) + return NOTIFY_DONE; + + router = container_of(nb, struct mlxsw_sp_router, fib_nb); + + switch (event) { + case FIB_EVENT_RULE_ADD: /* fall through */ + case FIB_EVENT_RULE_DEL: + err = mlxsw_sp_router_fib_rule_event(event, info, + router->mlxsw_sp); + if (!err) + return NOTIFY_DONE; + } + + fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC); + if (WARN_ON(!fib_work)) + return NOTIFY_BAD; + + fib_work->mlxsw_sp = router->mlxsw_sp; + fib_work->event = event; + + switch (info->family) { + case AF_INET: + INIT_WORK(&fib_work->work, mlxsw_sp_router_fib4_event_work); + mlxsw_sp_router_fib4_event(fib_work, info); + break; + case AF_INET6: + INIT_WORK(&fib_work->work, mlxsw_sp_router_fib6_event_work); + mlxsw_sp_router_fib6_event(fib_work, info); + break; + case RTNL_FAMILY_IP6MR: + case RTNL_FAMILY_IPMR: + INIT_WORK(&fib_work->work, mlxsw_sp_router_fibmr_event_work); + mlxsw_sp_router_fibmr_event(fib_work, info); + break; + } + + mlxsw_core_schedule_work(&fib_work->work); + + return NOTIFY_DONE; +} + +static struct mlxsw_sp_rif * +mlxsw_sp_rif_find_by_dev(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *dev) +{ + int i; + + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) + if (mlxsw_sp->router->rifs[i] && + mlxsw_sp->router->rifs[i]->dev == dev) + return mlxsw_sp->router->rifs[i]; + + return NULL; +} + +static int mlxsw_sp_router_rif_disable(struct mlxsw_sp *mlxsw_sp, u16 rif) +{ + char ritr_pl[MLXSW_REG_RITR_LEN]; + int err; + + mlxsw_reg_ritr_rif_pack(ritr_pl, rif); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); + if (WARN_ON_ONCE(err)) + return err; + + mlxsw_reg_ritr_enable_set(ritr_pl, false); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); +} + +static void mlxsw_sp_router_rif_gone_sync(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif) +{ + mlxsw_sp_router_rif_disable(mlxsw_sp, rif->rif_index); + mlxsw_sp_nexthop_rif_gone_sync(mlxsw_sp, rif); + mlxsw_sp_neigh_rif_gone_sync(mlxsw_sp, rif); +} + +static bool +mlxsw_sp_rif_should_config(struct mlxsw_sp_rif *rif, struct net_device *dev, + unsigned long event) +{ + struct inet6_dev *inet6_dev; + bool addr_list_empty = true; + struct in_device *idev; + + switch (event) { + case NETDEV_UP: + return rif == NULL; + case NETDEV_DOWN: + idev = __in_dev_get_rtnl(dev); + if (idev && idev->ifa_list) + addr_list_empty = false; + + inet6_dev = __in6_dev_get(dev); + if (addr_list_empty && inet6_dev && + !list_empty(&inet6_dev->addr_list)) + addr_list_empty = false; + + if (rif && addr_list_empty && + !netif_is_l3_slave(rif->dev)) + return true; + /* It is possible we already removed the RIF ourselves + * if it was assigned to a netdev that is now a bridge + * or LAG slave. 
+ */ + return false; + } + + return false; +} + +static enum mlxsw_sp_rif_type +mlxsw_sp_dev_rif_type(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *dev) +{ + enum mlxsw_sp_fid_type type; + + if (mlxsw_sp_netdev_ipip_type(mlxsw_sp, dev, NULL)) + return MLXSW_SP_RIF_TYPE_IPIP_LB; + + /* Otherwise RIF type is derived from the type of the underlying FID. */ + if (is_vlan_dev(dev) && netif_is_bridge_master(vlan_dev_real_dev(dev))) + type = MLXSW_SP_FID_TYPE_8021Q; + else if (netif_is_bridge_master(dev) && br_vlan_enabled(dev)) + type = MLXSW_SP_FID_TYPE_8021Q; + else if (netif_is_bridge_master(dev)) + type = MLXSW_SP_FID_TYPE_8021D; + else + type = MLXSW_SP_FID_TYPE_RFID; + + return mlxsw_sp_fid_type_rif_type(mlxsw_sp, type); +} + +static int mlxsw_sp_rif_index_alloc(struct mlxsw_sp *mlxsw_sp, u16 *p_rif_index) +{ + int i; + + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) { + if (!mlxsw_sp->router->rifs[i]) { + *p_rif_index = i; + return 0; + } + } + + return -ENOBUFS; +} + +static struct mlxsw_sp_rif *mlxsw_sp_rif_alloc(size_t rif_size, u16 rif_index, + u16 vr_id, + struct net_device *l3_dev) +{ + struct mlxsw_sp_rif *rif; + + rif = kzalloc(rif_size, GFP_KERNEL); + if (!rif) + return NULL; + + INIT_LIST_HEAD(&rif->nexthop_list); + INIT_LIST_HEAD(&rif->neigh_list); + ether_addr_copy(rif->addr, l3_dev->dev_addr); + rif->mtu = l3_dev->mtu; + rif->vr_id = vr_id; + rif->dev = l3_dev; + rif->rif_index = rif_index; + + return rif; +} + +struct mlxsw_sp_rif *mlxsw_sp_rif_by_index(const struct mlxsw_sp *mlxsw_sp, + u16 rif_index) +{ + return mlxsw_sp->router->rifs[rif_index]; +} + +u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif) +{ + return rif->rif_index; +} + +u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *lb_rif) +{ + return lb_rif->common.rif_index; +} + +u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *lb_rif) +{ + return lb_rif->ul_vr_id; +} + +int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif) +{ + return rif->dev->ifindex; +} + +const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif) +{ + return rif->dev; +} + +static struct mlxsw_sp_rif * +mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_rif_params *params, + struct netlink_ext_ack *extack) +{ + u32 tb_id = l3mdev_fib_table(params->dev); + const struct mlxsw_sp_rif_ops *ops; + struct mlxsw_sp_fid *fid = NULL; + enum mlxsw_sp_rif_type type; + struct mlxsw_sp_rif *rif; + struct mlxsw_sp_vr *vr; + u16 rif_index; + int i, err; + + type = mlxsw_sp_dev_rif_type(mlxsw_sp, params->dev); + ops = mlxsw_sp->router->rif_ops_arr[type]; + + vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id ? 
: RT_TABLE_MAIN, extack); + if (IS_ERR(vr)) + return ERR_CAST(vr); + vr->rif_count++; + + err = mlxsw_sp_rif_index_alloc(mlxsw_sp, &rif_index); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported router interfaces"); + goto err_rif_index_alloc; + } + + rif = mlxsw_sp_rif_alloc(ops->rif_size, rif_index, vr->id, params->dev); + if (!rif) { + err = -ENOMEM; + goto err_rif_alloc; + } + rif->mlxsw_sp = mlxsw_sp; + rif->ops = ops; + + if (ops->fid_get) { + fid = ops->fid_get(rif); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + goto err_fid_get; + } + rif->fid = fid; + } + + if (ops->setup) + ops->setup(rif, params); + + err = ops->configure(rif); + if (err) + goto err_configure; + + for (i = 0; i < MLXSW_SP_L3_PROTO_MAX; i++) { + err = mlxsw_sp_mr_rif_add(vr->mr_table[i], rif); + if (err) + goto err_mr_rif_add; + } + + mlxsw_sp_rif_counters_alloc(rif); + mlxsw_sp->router->rifs[rif_index] = rif; + + return rif; + +err_mr_rif_add: + for (i--; i >= 0; i--) + mlxsw_sp_mr_rif_del(vr->mr_table[i], rif); + ops->deconfigure(rif); +err_configure: + if (fid) + mlxsw_sp_fid_put(fid); +err_fid_get: + kfree(rif); +err_rif_alloc: +err_rif_index_alloc: + vr->rif_count--; + mlxsw_sp_vr_put(mlxsw_sp, vr); + return ERR_PTR(err); +} + +void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif) +{ + const struct mlxsw_sp_rif_ops *ops = rif->ops; + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct mlxsw_sp_fid *fid = rif->fid; + struct mlxsw_sp_vr *vr; + int i; + + mlxsw_sp_router_rif_gone_sync(mlxsw_sp, rif); + vr = &mlxsw_sp->router->vrs[rif->vr_id]; + + mlxsw_sp->router->rifs[rif->rif_index] = NULL; + mlxsw_sp_rif_counters_free(rif); + for (i = 0; i < MLXSW_SP_L3_PROTO_MAX; i++) + mlxsw_sp_mr_rif_del(vr->mr_table[i], rif); + ops->deconfigure(rif); + if (fid) + /* Loopback RIFs are not associated with a FID. 
*/ + mlxsw_sp_fid_put(fid); + kfree(rif); + vr->rif_count--; + mlxsw_sp_vr_put(mlxsw_sp, vr); +} + +static void +mlxsw_sp_rif_subport_params_init(struct mlxsw_sp_rif_params *params, + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan) +{ + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port; + + params->vid = mlxsw_sp_port_vlan->vid; + params->lag = mlxsw_sp_port->lagged; + if (params->lag) + params->lag_id = mlxsw_sp_port->lag_id; + else + params->system_port = mlxsw_sp_port->local_port; +} + +static int +mlxsw_sp_port_vlan_router_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan, + struct net_device *l3_dev, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port; + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + u16 vid = mlxsw_sp_port_vlan->vid; + struct mlxsw_sp_rif *rif; + struct mlxsw_sp_fid *fid; + int err; + + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev); + if (!rif) { + struct mlxsw_sp_rif_params params = { + .dev = l3_dev, + }; + + mlxsw_sp_rif_subport_params_init(¶ms, mlxsw_sp_port_vlan); + rif = mlxsw_sp_rif_create(mlxsw_sp, ¶ms, extack); + if (IS_ERR(rif)) + return PTR_ERR(rif); + } + + /* FID was already created, just take a reference */ + fid = rif->ops->fid_get(rif); + err = mlxsw_sp_fid_port_vid_map(fid, mlxsw_sp_port, vid); + if (err) + goto err_fid_port_vid_map; + + err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, false); + if (err) + goto err_port_vid_learning_set; + + err = mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, + BR_STATE_FORWARDING); + if (err) + goto err_port_vid_stp_set; + + mlxsw_sp_port_vlan->fid = fid; + + return 0; + +err_port_vid_stp_set: + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true); +err_port_vid_learning_set: + mlxsw_sp_fid_port_vid_unmap(fid, mlxsw_sp_port, vid); +err_fid_port_vid_map: + mlxsw_sp_fid_put(fid); + return err; +} + +void +mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan) +{ + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port; + struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid; + u16 vid = mlxsw_sp_port_vlan->vid; + + if (WARN_ON(mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_RFID)) + return; + + mlxsw_sp_port_vlan->fid = NULL; + mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, BR_STATE_BLOCKING); + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true); + mlxsw_sp_fid_port_vid_unmap(fid, mlxsw_sp_port, vid); + /* If router port holds the last reference on the rFID, then the + * associated Sub-port RIF will be destroyed. 
+ */ + mlxsw_sp_fid_put(fid); +} + +static int mlxsw_sp_inetaddr_port_vlan_event(struct net_device *l3_dev, + struct net_device *port_dev, + unsigned long event, u16 vid, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(port_dev); + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid); + if (WARN_ON(!mlxsw_sp_port_vlan)) + return -EINVAL; + + switch (event) { + case NETDEV_UP: + return mlxsw_sp_port_vlan_router_join(mlxsw_sp_port_vlan, + l3_dev, extack); + case NETDEV_DOWN: + mlxsw_sp_port_vlan_router_leave(mlxsw_sp_port_vlan); + break; + } + + return 0; +} + +static int mlxsw_sp_inetaddr_port_event(struct net_device *port_dev, + unsigned long event, + struct netlink_ext_ack *extack) +{ + if (netif_is_bridge_port(port_dev) || + netif_is_lag_port(port_dev) || + netif_is_ovs_port(port_dev)) + return 0; + + return mlxsw_sp_inetaddr_port_vlan_event(port_dev, port_dev, event, 1, + extack); +} + +static int __mlxsw_sp_inetaddr_lag_event(struct net_device *l3_dev, + struct net_device *lag_dev, + unsigned long event, u16 vid, + struct netlink_ext_ack *extack) +{ + struct net_device *port_dev; + struct list_head *iter; + int err; + + netdev_for_each_lower_dev(lag_dev, port_dev, iter) { + if (mlxsw_sp_port_dev_check(port_dev)) { + err = mlxsw_sp_inetaddr_port_vlan_event(l3_dev, + port_dev, + event, vid, + extack); + if (err) + return err; + } + } + + return 0; +} + +static int mlxsw_sp_inetaddr_lag_event(struct net_device *lag_dev, + unsigned long event, + struct netlink_ext_ack *extack) +{ + if (netif_is_bridge_port(lag_dev)) + return 0; + + return __mlxsw_sp_inetaddr_lag_event(lag_dev, lag_dev, event, 1, + extack); +} + +static int mlxsw_sp_inetaddr_bridge_event(struct net_device *l3_dev, + unsigned long event, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(l3_dev); + struct mlxsw_sp_rif_params params = { + .dev = l3_dev, + }; + struct mlxsw_sp_rif *rif; + + switch (event) { + case NETDEV_UP: + rif = mlxsw_sp_rif_create(mlxsw_sp, ¶ms, extack); + if (IS_ERR(rif)) + return PTR_ERR(rif); + break; + case NETDEV_DOWN: + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev); + mlxsw_sp_rif_destroy(rif); + break; + } + + return 0; +} + +static int mlxsw_sp_inetaddr_vlan_event(struct net_device *vlan_dev, + unsigned long event, + struct netlink_ext_ack *extack) +{ + struct net_device *real_dev = vlan_dev_real_dev(vlan_dev); + u16 vid = vlan_dev_vlan_id(vlan_dev); + + if (netif_is_bridge_port(vlan_dev)) + return 0; + + if (mlxsw_sp_port_dev_check(real_dev)) + return mlxsw_sp_inetaddr_port_vlan_event(vlan_dev, real_dev, + event, vid, extack); + else if (netif_is_lag_master(real_dev)) + return __mlxsw_sp_inetaddr_lag_event(vlan_dev, real_dev, event, + vid, extack); + else if (netif_is_bridge_master(real_dev) && br_vlan_enabled(real_dev)) + return mlxsw_sp_inetaddr_bridge_event(vlan_dev, event, extack); + + return 0; +} + +static int __mlxsw_sp_inetaddr_event(struct net_device *dev, + unsigned long event, + struct netlink_ext_ack *extack) +{ + if (mlxsw_sp_port_dev_check(dev)) + return mlxsw_sp_inetaddr_port_event(dev, event, extack); + else if (netif_is_lag_master(dev)) + return mlxsw_sp_inetaddr_lag_event(dev, event, extack); + else if (netif_is_bridge_master(dev)) + return mlxsw_sp_inetaddr_bridge_event(dev, event, extack); + else if (is_vlan_dev(dev)) + return mlxsw_sp_inetaddr_vlan_event(dev, event, extack); + else + return 0; +} + +int 
mlxsw_sp_inetaddr_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *) ptr; + struct net_device *dev = ifa->ifa_dev->dev; + struct mlxsw_sp *mlxsw_sp; + struct mlxsw_sp_rif *rif; + int err = 0; + + /* NETDEV_UP event is handled by mlxsw_sp_inetaddr_valid_event */ + if (event == NETDEV_UP) + goto out; + + mlxsw_sp = mlxsw_sp_lower_get(dev); + if (!mlxsw_sp) + goto out; + + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); + if (!mlxsw_sp_rif_should_config(rif, dev, event)) + goto out; + + err = __mlxsw_sp_inetaddr_event(dev, event, NULL); +out: + return notifier_from_errno(err); +} + +int mlxsw_sp_inetaddr_valid_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct in_validator_info *ivi = (struct in_validator_info *) ptr; + struct net_device *dev = ivi->ivi_dev->dev; + struct mlxsw_sp *mlxsw_sp; + struct mlxsw_sp_rif *rif; + int err = 0; + + mlxsw_sp = mlxsw_sp_lower_get(dev); + if (!mlxsw_sp) + goto out; + + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); + if (!mlxsw_sp_rif_should_config(rif, dev, event)) + goto out; + + err = __mlxsw_sp_inetaddr_event(dev, event, ivi->extack); +out: + return notifier_from_errno(err); +} + +struct mlxsw_sp_inet6addr_event_work { + struct work_struct work; + struct net_device *dev; + unsigned long event; +}; + +static void mlxsw_sp_inet6addr_event_work(struct work_struct *work) +{ + struct mlxsw_sp_inet6addr_event_work *inet6addr_work = + container_of(work, struct mlxsw_sp_inet6addr_event_work, work); + struct net_device *dev = inet6addr_work->dev; + unsigned long event = inet6addr_work->event; + struct mlxsw_sp *mlxsw_sp; + struct mlxsw_sp_rif *rif; + + rtnl_lock(); + mlxsw_sp = mlxsw_sp_lower_get(dev); + if (!mlxsw_sp) + goto out; + + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); + if (!mlxsw_sp_rif_should_config(rif, dev, event)) + goto out; + + __mlxsw_sp_inetaddr_event(dev, event, NULL); +out: + rtnl_unlock(); + dev_put(dev); + kfree(inet6addr_work); +} + +/* Called with rcu_read_lock() */ +int mlxsw_sp_inet6addr_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *if6 = (struct inet6_ifaddr *) ptr; + struct mlxsw_sp_inet6addr_event_work *inet6addr_work; + struct net_device *dev = if6->idev->dev; + + /* NETDEV_UP event is handled by mlxsw_sp_inet6addr_valid_event */ + if (event == NETDEV_UP) + return NOTIFY_DONE; + + if (!mlxsw_sp_port_dev_lower_find_rcu(dev)) + return NOTIFY_DONE; + + inet6addr_work = kzalloc(sizeof(*inet6addr_work), GFP_ATOMIC); + if (!inet6addr_work) + return NOTIFY_BAD; + + INIT_WORK(&inet6addr_work->work, mlxsw_sp_inet6addr_event_work); + inet6addr_work->dev = dev; + inet6addr_work->event = event; + dev_hold(dev); + mlxsw_core_schedule_work(&inet6addr_work->work); + + return NOTIFY_DONE; +} + +int mlxsw_sp_inet6addr_valid_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct in6_validator_info *i6vi = (struct in6_validator_info *) ptr; + struct net_device *dev = i6vi->i6vi_dev->dev; + struct mlxsw_sp *mlxsw_sp; + struct mlxsw_sp_rif *rif; + int err = 0; + + mlxsw_sp = mlxsw_sp_lower_get(dev); + if (!mlxsw_sp) + goto out; + + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); + if (!mlxsw_sp_rif_should_config(rif, dev, event)) + goto out; + + err = __mlxsw_sp_inetaddr_event(dev, event, i6vi->extack); +out: + return notifier_from_errno(err); +} + +static int mlxsw_sp_rif_edit(struct mlxsw_sp *mlxsw_sp, u16 rif_index, + const char *mac, int mtu) +{ + char 
ritr_pl[MLXSW_REG_RITR_LEN]; + int err; + + mlxsw_reg_ritr_rif_pack(ritr_pl, rif_index); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); + if (err) + return err; + + mlxsw_reg_ritr_mtu_set(ritr_pl, mtu); + mlxsw_reg_ritr_if_mac_memcpy_to(ritr_pl, mac); + mlxsw_reg_ritr_op_set(ritr_pl, MLXSW_REG_RITR_RIF_CREATE); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); +} + +int mlxsw_sp_netdevice_router_port_event(struct net_device *dev) +{ + struct mlxsw_sp *mlxsw_sp; + struct mlxsw_sp_rif *rif; + u16 fid_index; + int err; + + mlxsw_sp = mlxsw_sp_lower_get(dev); + if (!mlxsw_sp) + return 0; + + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); + if (!rif) + return 0; + fid_index = mlxsw_sp_fid_index(rif->fid); + + err = mlxsw_sp_rif_fdb_op(mlxsw_sp, rif->addr, fid_index, false); + if (err) + return err; + + err = mlxsw_sp_rif_edit(mlxsw_sp, rif->rif_index, dev->dev_addr, + dev->mtu); + if (err) + goto err_rif_edit; + + err = mlxsw_sp_rif_fdb_op(mlxsw_sp, dev->dev_addr, fid_index, true); + if (err) + goto err_rif_fdb_op; + + if (rif->mtu != dev->mtu) { + struct mlxsw_sp_vr *vr; + int i; + + /* The RIF is relevant only to its mr_table instance, as unlike + * unicast routing, in multicast routing a RIF cannot be shared + * between several multicast routing tables. + */ + vr = &mlxsw_sp->router->vrs[rif->vr_id]; + for (i = 0; i < MLXSW_SP_L3_PROTO_MAX; i++) + mlxsw_sp_mr_rif_mtu_update(vr->mr_table[i], + rif, dev->mtu); + } + + ether_addr_copy(rif->addr, dev->dev_addr); + rif->mtu = dev->mtu; + + netdev_dbg(dev, "Updated RIF=%d\n", rif->rif_index); + + return 0; + +err_rif_fdb_op: + mlxsw_sp_rif_edit(mlxsw_sp, rif->rif_index, rif->addr, rif->mtu); +err_rif_edit: + mlxsw_sp_rif_fdb_op(mlxsw_sp, rif->addr, fid_index, true); + return err; +} + +static int mlxsw_sp_port_vrf_join(struct mlxsw_sp *mlxsw_sp, + struct net_device *l3_dev, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_rif *rif; + + /* If netdev is already associated with a RIF, then we need to + * destroy it and create a new one with the new virtual router ID. 
+ */ + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev); + if (rif) + __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN, extack); + + return __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_UP, extack); +} + +static void mlxsw_sp_port_vrf_leave(struct mlxsw_sp *mlxsw_sp, + struct net_device *l3_dev) +{ + struct mlxsw_sp_rif *rif; + + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev); + if (!rif) + return; + __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN, NULL); +} + +int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event, + struct netdev_notifier_changeupper_info *info) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(l3_dev); + int err = 0; + + if (!mlxsw_sp) + return 0; + + switch (event) { + case NETDEV_PRECHANGEUPPER: + return 0; + case NETDEV_CHANGEUPPER: + if (info->linking) { + struct netlink_ext_ack *extack; + + extack = netdev_notifier_info_to_extack(&info->info); + err = mlxsw_sp_port_vrf_join(mlxsw_sp, l3_dev, extack); + } else { + mlxsw_sp_port_vrf_leave(mlxsw_sp, l3_dev); + } + break; + } + + return err; +} + +static struct mlxsw_sp_rif_subport * +mlxsw_sp_rif_subport_rif(const struct mlxsw_sp_rif *rif) +{ + return container_of(rif, struct mlxsw_sp_rif_subport, common); +} + +static void mlxsw_sp_rif_subport_setup(struct mlxsw_sp_rif *rif, + const struct mlxsw_sp_rif_params *params) +{ + struct mlxsw_sp_rif_subport *rif_subport; + + rif_subport = mlxsw_sp_rif_subport_rif(rif); + rif_subport->vid = params->vid; + rif_subport->lag = params->lag; + if (params->lag) + rif_subport->lag_id = params->lag_id; + else + rif_subport->system_port = params->system_port; +} + +static int mlxsw_sp_rif_subport_op(struct mlxsw_sp_rif *rif, bool enable) +{ + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct mlxsw_sp_rif_subport *rif_subport; + char ritr_pl[MLXSW_REG_RITR_LEN]; + + rif_subport = mlxsw_sp_rif_subport_rif(rif); + mlxsw_reg_ritr_pack(ritr_pl, enable, MLXSW_REG_RITR_SP_IF, + rif->rif_index, rif->vr_id, rif->dev->mtu); + mlxsw_reg_ritr_mac_pack(ritr_pl, rif->dev->dev_addr); + mlxsw_reg_ritr_sp_if_pack(ritr_pl, rif_subport->lag, + rif_subport->lag ? 
rif_subport->lag_id : + rif_subport->system_port, + rif_subport->vid); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); +} + +static int mlxsw_sp_rif_subport_configure(struct mlxsw_sp_rif *rif) +{ + int err; + + err = mlxsw_sp_rif_subport_op(rif, true); + if (err) + return err; + + err = mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(rif->fid), true); + if (err) + goto err_rif_fdb_op; + + mlxsw_sp_fid_rif_set(rif->fid, rif); + return 0; + +err_rif_fdb_op: + mlxsw_sp_rif_subport_op(rif, false); + return err; +} + +static void mlxsw_sp_rif_subport_deconfigure(struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_fid *fid = rif->fid; + + mlxsw_sp_fid_rif_set(fid, NULL); + mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(fid), false); + mlxsw_sp_rif_subport_op(rif, false); +} + +static struct mlxsw_sp_fid * +mlxsw_sp_rif_subport_fid_get(struct mlxsw_sp_rif *rif) +{ + return mlxsw_sp_fid_rfid_get(rif->mlxsw_sp, rif->rif_index); +} + +static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_subport_ops = { + .type = MLXSW_SP_RIF_TYPE_SUBPORT, + .rif_size = sizeof(struct mlxsw_sp_rif_subport), + .setup = mlxsw_sp_rif_subport_setup, + .configure = mlxsw_sp_rif_subport_configure, + .deconfigure = mlxsw_sp_rif_subport_deconfigure, + .fid_get = mlxsw_sp_rif_subport_fid_get, +}; + +static int mlxsw_sp_rif_vlan_fid_op(struct mlxsw_sp_rif *rif, + enum mlxsw_reg_ritr_if_type type, + u16 vid_fid, bool enable) +{ + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + char ritr_pl[MLXSW_REG_RITR_LEN]; + + mlxsw_reg_ritr_pack(ritr_pl, enable, type, rif->rif_index, rif->vr_id, + rif->dev->mtu); + mlxsw_reg_ritr_mac_pack(ritr_pl, rif->dev->dev_addr); + mlxsw_reg_ritr_fid_set(ritr_pl, type, vid_fid); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); +} + +u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp) +{ + return mlxsw_core_max_ports(mlxsw_sp->core) + 1; +} + +static int mlxsw_sp_rif_vlan_configure(struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + u16 vid = mlxsw_sp_fid_8021q_vid(rif->fid); + int err; + + err = mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_VLAN_IF, vid, true); + if (err) + return err; + + err = mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC, + mlxsw_sp_router_port(mlxsw_sp), true); + if (err) + goto err_fid_mc_flood_set; + + err = mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC, + mlxsw_sp_router_port(mlxsw_sp), true); + if (err) + goto err_fid_bc_flood_set; + + err = mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(rif->fid), true); + if (err) + goto err_rif_fdb_op; + + mlxsw_sp_fid_rif_set(rif->fid, rif); + return 0; + +err_rif_fdb_op: + mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC, + mlxsw_sp_router_port(mlxsw_sp), false); +err_fid_bc_flood_set: + mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC, + mlxsw_sp_router_port(mlxsw_sp), false); +err_fid_mc_flood_set: + mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_VLAN_IF, vid, false); + return err; +} + +static void mlxsw_sp_rif_vlan_deconfigure(struct mlxsw_sp_rif *rif) +{ + u16 vid = mlxsw_sp_fid_8021q_vid(rif->fid); + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct mlxsw_sp_fid *fid = rif->fid; + + mlxsw_sp_fid_rif_set(fid, NULL); + mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(fid), false); + mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC, + mlxsw_sp_router_port(mlxsw_sp), false); + mlxsw_sp_fid_flood_set(rif->fid, 
MLXSW_SP_FLOOD_TYPE_MC, + mlxsw_sp_router_port(mlxsw_sp), false); + mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_VLAN_IF, vid, false); +} + +static struct mlxsw_sp_fid * +mlxsw_sp_rif_vlan_fid_get(struct mlxsw_sp_rif *rif) +{ + u16 vid = is_vlan_dev(rif->dev) ? vlan_dev_vlan_id(rif->dev) : 1; + + return mlxsw_sp_fid_8021q_get(rif->mlxsw_sp, vid); +} + +static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_vlan_ops = { + .type = MLXSW_SP_RIF_TYPE_VLAN, + .rif_size = sizeof(struct mlxsw_sp_rif), + .configure = mlxsw_sp_rif_vlan_configure, + .deconfigure = mlxsw_sp_rif_vlan_deconfigure, + .fid_get = mlxsw_sp_rif_vlan_fid_get, +}; + +static int mlxsw_sp_rif_fid_configure(struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + u16 fid_index = mlxsw_sp_fid_index(rif->fid); + int err; + + err = mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_FID_IF, fid_index, + true); + if (err) + return err; + + err = mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC, + mlxsw_sp_router_port(mlxsw_sp), true); + if (err) + goto err_fid_mc_flood_set; + + err = mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC, + mlxsw_sp_router_port(mlxsw_sp), true); + if (err) + goto err_fid_bc_flood_set; + + err = mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(rif->fid), true); + if (err) + goto err_rif_fdb_op; + + mlxsw_sp_fid_rif_set(rif->fid, rif); + return 0; + +err_rif_fdb_op: + mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC, + mlxsw_sp_router_port(mlxsw_sp), false); +err_fid_bc_flood_set: + mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC, + mlxsw_sp_router_port(mlxsw_sp), false); +err_fid_mc_flood_set: + mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_FID_IF, fid_index, false); + return err; +} + +static void mlxsw_sp_rif_fid_deconfigure(struct mlxsw_sp_rif *rif) +{ + u16 fid_index = mlxsw_sp_fid_index(rif->fid); + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct mlxsw_sp_fid *fid = rif->fid; + + mlxsw_sp_fid_rif_set(fid, NULL); + mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr, + mlxsw_sp_fid_index(fid), false); + mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC, + mlxsw_sp_router_port(mlxsw_sp), false); + mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC, + mlxsw_sp_router_port(mlxsw_sp), false); + mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_FID_IF, fid_index, false); +} + +static struct mlxsw_sp_fid * +mlxsw_sp_rif_fid_fid_get(struct mlxsw_sp_rif *rif) +{ + return mlxsw_sp_fid_8021d_get(rif->mlxsw_sp, rif->dev->ifindex); +} + +static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_fid_ops = { + .type = MLXSW_SP_RIF_TYPE_FID, + .rif_size = sizeof(struct mlxsw_sp_rif), + .configure = mlxsw_sp_rif_fid_configure, + .deconfigure = mlxsw_sp_rif_fid_deconfigure, + .fid_get = mlxsw_sp_rif_fid_fid_get, +}; + +static struct mlxsw_sp_rif_ipip_lb * +mlxsw_sp_rif_ipip_lb_rif(struct mlxsw_sp_rif *rif) +{ + return container_of(rif, struct mlxsw_sp_rif_ipip_lb, common); +} + +static void +mlxsw_sp_rif_ipip_lb_setup(struct mlxsw_sp_rif *rif, + const struct mlxsw_sp_rif_params *params) +{ + struct mlxsw_sp_rif_params_ipip_lb *params_lb; + struct mlxsw_sp_rif_ipip_lb *rif_lb; + + params_lb = container_of(params, struct mlxsw_sp_rif_params_ipip_lb, + common); + rif_lb = mlxsw_sp_rif_ipip_lb_rif(rif); + rif_lb->lb_config = params_lb->lb_config; +} + +static int +mlxsw_sp_rif_ipip_lb_configure(struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_rif_ipip_lb *lb_rif = mlxsw_sp_rif_ipip_lb_rif(rif); + u32 ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(rif->dev); + struct 
mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct mlxsw_sp_vr *ul_vr; + int err; + + ul_vr = mlxsw_sp_vr_get(mlxsw_sp, ul_tb_id, NULL); + if (IS_ERR(ul_vr)) + return PTR_ERR(ul_vr); + + err = mlxsw_sp_rif_ipip_lb_op(lb_rif, ul_vr, true); + if (err) + goto err_loopback_op; + + lb_rif->ul_vr_id = ul_vr->id; + ++ul_vr->rif_count; + return 0; + +err_loopback_op: + mlxsw_sp_vr_put(mlxsw_sp, ul_vr); + return err; +} + +static void mlxsw_sp_rif_ipip_lb_deconfigure(struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_rif_ipip_lb *lb_rif = mlxsw_sp_rif_ipip_lb_rif(rif); + struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp; + struct mlxsw_sp_vr *ul_vr; + + ul_vr = &mlxsw_sp->router->vrs[lb_rif->ul_vr_id]; + mlxsw_sp_rif_ipip_lb_op(lb_rif, ul_vr, false); + + --ul_vr->rif_count; + mlxsw_sp_vr_put(mlxsw_sp, ul_vr); +} + +static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_ipip_lb_ops = { + .type = MLXSW_SP_RIF_TYPE_IPIP_LB, + .rif_size = sizeof(struct mlxsw_sp_rif_ipip_lb), + .setup = mlxsw_sp_rif_ipip_lb_setup, + .configure = mlxsw_sp_rif_ipip_lb_configure, + .deconfigure = mlxsw_sp_rif_ipip_lb_deconfigure, +}; + +static const struct mlxsw_sp_rif_ops *mlxsw_sp_rif_ops_arr[] = { + [MLXSW_SP_RIF_TYPE_SUBPORT] = &mlxsw_sp_rif_subport_ops, + [MLXSW_SP_RIF_TYPE_VLAN] = &mlxsw_sp_rif_vlan_ops, + [MLXSW_SP_RIF_TYPE_FID] = &mlxsw_sp_rif_fid_ops, + [MLXSW_SP_RIF_TYPE_IPIP_LB] = &mlxsw_sp_rif_ipip_lb_ops, +}; + +static int mlxsw_sp_rifs_init(struct mlxsw_sp *mlxsw_sp) +{ + u64 max_rifs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); + + mlxsw_sp->router->rifs = kcalloc(max_rifs, + sizeof(struct mlxsw_sp_rif *), + GFP_KERNEL); + if (!mlxsw_sp->router->rifs) + return -ENOMEM; + + mlxsw_sp->router->rif_ops_arr = mlxsw_sp_rif_ops_arr; + + return 0; +} + +static void mlxsw_sp_rifs_fini(struct mlxsw_sp *mlxsw_sp) +{ + int i; + + for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) + WARN_ON_ONCE(mlxsw_sp->router->rifs[i]); + + kfree(mlxsw_sp->router->rifs); +} + +static int +mlxsw_sp_ipip_config_tigcr(struct mlxsw_sp *mlxsw_sp) +{ + char tigcr_pl[MLXSW_REG_TIGCR_LEN]; + + mlxsw_reg_tigcr_pack(tigcr_pl, true, 0); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(tigcr), tigcr_pl); +} + +static int mlxsw_sp_ipips_init(struct mlxsw_sp *mlxsw_sp) +{ + mlxsw_sp->router->ipip_ops_arr = mlxsw_sp_ipip_ops_arr; + INIT_LIST_HEAD(&mlxsw_sp->router->ipip_list); + return mlxsw_sp_ipip_config_tigcr(mlxsw_sp); +} + +static void mlxsw_sp_ipips_fini(struct mlxsw_sp *mlxsw_sp) +{ + WARN_ON(!list_empty(&mlxsw_sp->router->ipip_list)); +} + +static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb) +{ + struct mlxsw_sp_router *router; + + /* Flush pending FIB notifications and then flush the device's + * table before requesting another dump. The FIB notification + * block is unregistered, so no need to take RTNL. 
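+ * mlxsw_core_flush_owq() waits for FIB notifications that were already + * queued for processing, so the device flush below starts from a + * consistent state.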
+ */ + mlxsw_core_flush_owq(); + router = container_of(nb, struct mlxsw_sp_router, fib_nb); + mlxsw_sp_router_fib_flush(router->mlxsw_sp); +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH +static void mlxsw_sp_mp_hash_header_set(char *recr2_pl, int header) +{ + mlxsw_reg_recr2_outer_header_enables_set(recr2_pl, header, true); +} + +static void mlxsw_sp_mp_hash_field_set(char *recr2_pl, int field) +{ + mlxsw_reg_recr2_outer_header_fields_enable_set(recr2_pl, field, true); +} + +static void mlxsw_sp_mp4_hash_init(char *recr2_pl) +{ + bool only_l3 = !init_net.ipv4.sysctl_fib_multipath_hash_policy; + + mlxsw_sp_mp_hash_header_set(recr2_pl, + MLXSW_REG_RECR2_IPV4_EN_NOT_TCP_NOT_UDP); + mlxsw_sp_mp_hash_header_set(recr2_pl, MLXSW_REG_RECR2_IPV4_EN_TCP_UDP); + mlxsw_reg_recr2_ipv4_sip_enable(recr2_pl); + mlxsw_reg_recr2_ipv4_dip_enable(recr2_pl); + if (only_l3) + return; + mlxsw_sp_mp_hash_header_set(recr2_pl, MLXSW_REG_RECR2_TCP_UDP_EN_IPV4); + mlxsw_sp_mp_hash_field_set(recr2_pl, MLXSW_REG_RECR2_IPV4_PROTOCOL); + mlxsw_sp_mp_hash_field_set(recr2_pl, MLXSW_REG_RECR2_TCP_UDP_SPORT); + mlxsw_sp_mp_hash_field_set(recr2_pl, MLXSW_REG_RECR2_TCP_UDP_DPORT); +} + +static void mlxsw_sp_mp6_hash_init(char *recr2_pl) +{ + bool only_l3 = !ip6_multipath_hash_policy(&init_net); + + mlxsw_sp_mp_hash_header_set(recr2_pl, + MLXSW_REG_RECR2_IPV6_EN_NOT_TCP_NOT_UDP); + mlxsw_sp_mp_hash_header_set(recr2_pl, MLXSW_REG_RECR2_IPV6_EN_TCP_UDP); + mlxsw_reg_recr2_ipv6_sip_enable(recr2_pl); + mlxsw_reg_recr2_ipv6_dip_enable(recr2_pl); + mlxsw_sp_mp_hash_field_set(recr2_pl, MLXSW_REG_RECR2_IPV6_NEXT_HEADER); + if (only_l3) { + mlxsw_sp_mp_hash_field_set(recr2_pl, + MLXSW_REG_RECR2_IPV6_FLOW_LABEL); + } else { + mlxsw_sp_mp_hash_header_set(recr2_pl, + MLXSW_REG_RECR2_TCP_UDP_EN_IPV6); + mlxsw_sp_mp_hash_field_set(recr2_pl, + MLXSW_REG_RECR2_TCP_UDP_SPORT); + mlxsw_sp_mp_hash_field_set(recr2_pl, + MLXSW_REG_RECR2_TCP_UDP_DPORT); + } +} + +static int mlxsw_sp_mp_hash_init(struct mlxsw_sp *mlxsw_sp) +{ + char recr2_pl[MLXSW_REG_RECR2_LEN]; + u32 seed; + + get_random_bytes(&seed, sizeof(seed)); + mlxsw_reg_recr2_pack(recr2_pl, seed); + mlxsw_sp_mp4_hash_init(recr2_pl); + mlxsw_sp_mp6_hash_init(recr2_pl); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(recr2), recr2_pl); +} +#else +static int mlxsw_sp_mp_hash_init(struct mlxsw_sp *mlxsw_sp) +{ + return 0; +} +#endif + +static int mlxsw_sp_dscp_init(struct mlxsw_sp *mlxsw_sp) +{ + char rdpm_pl[MLXSW_REG_RDPM_LEN]; + unsigned int i; + + MLXSW_REG_ZERO(rdpm, rdpm_pl); + + /* HW is determining switch priority based on DSCP-bits, but the + * kernel is still doing that based on the ToS. Since there's a + * mismatch in bits we need to make sure to translate the right + * value ToS would observe, skipping the 2 least-significant ECN bits. 
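+ * For example, DSCP 46 (i == 46 in the loop below) is handed to + * rt_tos2priority() as 46 << 2 == 184, i.e. with the DSCP value placed + * in the upper six bits of the ToS byte.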
+ */ + for (i = 0; i < MLXSW_REG_RDPM_DSCP_ENTRY_REC_MAX_COUNT; i++) + mlxsw_reg_rdpm_pack(rdpm_pl, i, rt_tos2priority(i << 2)); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rdpm), rdpm_pl); +} + +static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) +{ + char rgcr_pl[MLXSW_REG_RGCR_LEN]; + u64 max_rifs; + int err; + + if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_RIFS)) + return -EIO; + max_rifs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); + + mlxsw_reg_rgcr_pack(rgcr_pl, true, true); + mlxsw_reg_rgcr_max_router_interfaces_set(rgcr_pl, max_rifs); + mlxsw_reg_rgcr_usp_set(rgcr_pl, true); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rgcr), rgcr_pl); + if (err) + return err; + return 0; +} + +static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) +{ + char rgcr_pl[MLXSW_REG_RGCR_LEN]; + + mlxsw_reg_rgcr_pack(rgcr_pl, false, false); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rgcr), rgcr_pl); +} + +int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_router *router; + int err; + + router = kzalloc(sizeof(*mlxsw_sp->router), GFP_KERNEL); + if (!router) + return -ENOMEM; + mlxsw_sp->router = router; + router->mlxsw_sp = mlxsw_sp; + + INIT_LIST_HEAD(&mlxsw_sp->router->nexthop_neighs_list); + err = __mlxsw_sp_router_init(mlxsw_sp); + if (err) + goto err_router_init; + + err = mlxsw_sp_rifs_init(mlxsw_sp); + if (err) + goto err_rifs_init; + + err = mlxsw_sp_ipips_init(mlxsw_sp); + if (err) + goto err_ipips_init; + + err = rhashtable_init(&mlxsw_sp->router->nexthop_ht, + &mlxsw_sp_nexthop_ht_params); + if (err) + goto err_nexthop_ht_init; + + err = rhashtable_init(&mlxsw_sp->router->nexthop_group_ht, + &mlxsw_sp_nexthop_group_ht_params); + if (err) + goto err_nexthop_group_ht_init; + + INIT_LIST_HEAD(&mlxsw_sp->router->nexthop_list); + err = mlxsw_sp_lpm_init(mlxsw_sp); + if (err) + goto err_lpm_init; + + err = mlxsw_sp_mr_init(mlxsw_sp, &mlxsw_sp_mr_tcam_ops); + if (err) + goto err_mr_init; + + err = mlxsw_sp_vrs_init(mlxsw_sp); + if (err) + goto err_vrs_init; + + err = mlxsw_sp_neigh_init(mlxsw_sp); + if (err) + goto err_neigh_init; + + mlxsw_sp->router->netevent_nb.notifier_call = + mlxsw_sp_router_netevent_event; + err = register_netevent_notifier(&mlxsw_sp->router->netevent_nb); + if (err) + goto err_register_netevent_notifier; + + err = mlxsw_sp_mp_hash_init(mlxsw_sp); + if (err) + goto err_mp_hash_init; + + err = mlxsw_sp_dscp_init(mlxsw_sp); + if (err) + goto err_dscp_init; + + mlxsw_sp->router->fib_nb.notifier_call = mlxsw_sp_router_fib_event; + err = register_fib_notifier(&mlxsw_sp->router->fib_nb, + mlxsw_sp_router_fib_dump_flush); + if (err) + goto err_register_fib_notifier; + + return 0; + +err_register_fib_notifier: +err_dscp_init: +err_mp_hash_init: + unregister_netevent_notifier(&mlxsw_sp->router->netevent_nb); +err_register_netevent_notifier: + mlxsw_sp_neigh_fini(mlxsw_sp); +err_neigh_init: + mlxsw_sp_vrs_fini(mlxsw_sp); +err_vrs_init: + mlxsw_sp_mr_fini(mlxsw_sp); +err_mr_init: + mlxsw_sp_lpm_fini(mlxsw_sp); +err_lpm_init: + rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht); +err_nexthop_group_ht_init: + rhashtable_destroy(&mlxsw_sp->router->nexthop_ht); +err_nexthop_ht_init: + mlxsw_sp_ipips_fini(mlxsw_sp); +err_ipips_init: + mlxsw_sp_rifs_fini(mlxsw_sp); +err_rifs_init: + __mlxsw_sp_router_fini(mlxsw_sp); +err_router_init: + kfree(mlxsw_sp->router); + return err; +} + +void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) +{ + unregister_fib_notifier(&mlxsw_sp->router->fib_nb); + 
unregister_netevent_notifier(&mlxsw_sp->router->netevent_nb); + mlxsw_sp_neigh_fini(mlxsw_sp); + mlxsw_sp_vrs_fini(mlxsw_sp); + mlxsw_sp_mr_fini(mlxsw_sp); + mlxsw_sp_lpm_fini(mlxsw_sp); + rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht); + rhashtable_destroy(&mlxsw_sp->router->nexthop_ht); + mlxsw_sp_ipips_fini(mlxsw_sp); + mlxsw_sp_rifs_fini(mlxsw_sp); + __mlxsw_sp_router_fini(mlxsw_sp); + kfree(mlxsw_sp->router); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h new file mode 100644 index 0000000..a01edcf --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -0,0 +1,149 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Arkadi Sharshevsky + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_ROUTER_H_ +#define _MLXSW_ROUTER_H_ + +#include "spectrum.h" +#include "reg.h" + +enum mlxsw_sp_l3proto { + MLXSW_SP_L3_PROTO_IPV4, + MLXSW_SP_L3_PROTO_IPV6, +#define MLXSW_SP_L3_PROTO_MAX (MLXSW_SP_L3_PROTO_IPV6 + 1) +}; + +union mlxsw_sp_l3addr { + __be32 addr4; + struct in6_addr addr6; +}; + +struct mlxsw_sp_rif_ipip_lb; +struct mlxsw_sp_rif_ipip_lb_config { + enum mlxsw_reg_ritr_loopback_ipip_type lb_ipipt; + u32 okey; + enum mlxsw_sp_l3proto ul_protocol; /* Underlay. 
*/ + union mlxsw_sp_l3addr saddr; +}; + +enum mlxsw_sp_rif_counter_dir { + MLXSW_SP_RIF_COUNTER_INGRESS, + MLXSW_SP_RIF_COUNTER_EGRESS, +}; + +struct mlxsw_sp_neigh_entry; +struct mlxsw_sp_nexthop; +struct mlxsw_sp_ipip_entry; + +struct mlxsw_sp_rif *mlxsw_sp_rif_by_index(const struct mlxsw_sp *mlxsw_sp, + u16 rif_index); +u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif); +u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif); +u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif); +u32 mlxsw_sp_ipip_dev_ul_tb_id(const struct net_device *ol_dev); +int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif); +u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp); +const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif); +int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif, + enum mlxsw_sp_rif_counter_dir dir, + u64 *cnt); +void mlxsw_sp_rif_counter_free(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif, + enum mlxsw_sp_rif_counter_dir dir); +int mlxsw_sp_rif_counter_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_rif *rif, + enum mlxsw_sp_rif_counter_dir dir); +struct mlxsw_sp_neigh_entry * +mlxsw_sp_rif_neigh_next(struct mlxsw_sp_rif *rif, + struct mlxsw_sp_neigh_entry *neigh_entry); +int mlxsw_sp_neigh_entry_type(struct mlxsw_sp_neigh_entry *neigh_entry); +unsigned char * +mlxsw_sp_neigh_entry_ha(struct mlxsw_sp_neigh_entry *neigh_entry); +u32 mlxsw_sp_neigh4_entry_dip(struct mlxsw_sp_neigh_entry *neigh_entry); +struct in6_addr * +mlxsw_sp_neigh6_entry_dip(struct mlxsw_sp_neigh_entry *neigh_entry); + +#define mlxsw_sp_rif_neigh_for_each(neigh_entry, rif) \ + for (neigh_entry = mlxsw_sp_rif_neigh_next(rif, NULL); neigh_entry; \ + neigh_entry = mlxsw_sp_rif_neigh_next(rif, neigh_entry)) +int mlxsw_sp_neigh_counter_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + u64 *p_counter); +void +mlxsw_sp_neigh_entry_counter_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + bool adding); +bool mlxsw_sp_neigh_ipv6_ignore(struct mlxsw_sp_neigh_entry *neigh_entry); +int __mlxsw_sp_ipip_entry_update_tunnel(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry, + bool recreate_loopback, + bool keep_encap, + bool update_nexthops, + struct netlink_ext_ack *extack); +void mlxsw_sp_ipip_entry_demote_tunnel(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_ipip_entry *ipip_entry); +bool +mlxsw_sp_ipip_demote_tunnel_by_saddr(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_l3proto ul_proto, + union mlxsw_sp_l3addr saddr, + u32 ul_tb_id, + const struct mlxsw_sp_ipip_entry *except); +struct mlxsw_sp_nexthop *mlxsw_sp_nexthop_next(struct mlxsw_sp_router *router, + struct mlxsw_sp_nexthop *nh); +bool mlxsw_sp_nexthop_offload(struct mlxsw_sp_nexthop *nh); +unsigned char *mlxsw_sp_nexthop_ha(struct mlxsw_sp_nexthop *nh); +int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index, + u32 *p_adj_size, u32 *p_adj_hash_index); +struct mlxsw_sp_rif *mlxsw_sp_nexthop_rif(struct mlxsw_sp_nexthop *nh); +bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh); +#define mlxsw_sp_nexthop_for_each(nh, router) \ + for (nh = mlxsw_sp_nexthop_next(router, NULL); nh; \ + nh = mlxsw_sp_nexthop_next(router, nh)) +int mlxsw_sp_nexthop_counter_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh, u64 *p_counter); +int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_nexthop *nh); +void 
mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh); +void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh); + +static inline bool mlxsw_sp_l3addr_eq(const union mlxsw_sp_l3addr *addr1, + const union mlxsw_sp_l3addr *addr2) +{ + return !memcmp(addr1, addr2, sizeof(*addr1)); +} + +#endif /* _MLXSW_ROUTER_H_*/ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c new file mode 100644 index 0000000..65a7770 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -0,0 +1,824 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/mlxsw_span.c + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * Copyright (c) 2018 Petr Machata + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include + +#include "spectrum.h" +#include "spectrum_span.h" +#include "spectrum_ipip.h" + +int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp) +{ + int i; + + if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_SPAN)) + return -EIO; + + mlxsw_sp->span.entries_count = MLXSW_CORE_RES_GET(mlxsw_sp->core, + MAX_SPAN); + mlxsw_sp->span.entries = kcalloc(mlxsw_sp->span.entries_count, + sizeof(struct mlxsw_sp_span_entry), + GFP_KERNEL); + if (!mlxsw_sp->span.entries) + return -ENOMEM; + + for (i = 0; i < mlxsw_sp->span.entries_count; i++) { + struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i]; + + INIT_LIST_HEAD(&curr->bound_ports_list); + curr->id = i; + } + + return 0; +} + +void mlxsw_sp_span_fini(struct mlxsw_sp *mlxsw_sp) +{ + int i; + + for (i = 0; i < mlxsw_sp->span.entries_count; i++) { + struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i]; + + WARN_ON_ONCE(!list_empty(&curr->bound_ports_list)); + } + kfree(mlxsw_sp->span.entries); +} + +static int +mlxsw_sp_span_entry_phys_parms(const struct net_device *to_dev, + struct mlxsw_sp_span_parms *sparmsp) +{ + sparmsp->dest_port = netdev_priv(to_dev); + return 0; +} + +static int +mlxsw_sp_span_entry_phys_configure(struct mlxsw_sp_span_entry *span_entry, + struct mlxsw_sp_span_parms sparms) +{ + struct mlxsw_sp_port *dest_port = sparms.dest_port; + struct mlxsw_sp *mlxsw_sp = dest_port->mlxsw_sp; + u8 local_port = dest_port->local_port; + char mpat_pl[MLXSW_REG_MPAT_LEN]; + int pa_id = span_entry->id; + + /* Create a new port analayzer entry for local_port. */ + mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true, + MLXSW_REG_MPAT_SPAN_TYPE_LOCAL_ETH); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpat), mpat_pl); +} + +static void +mlxsw_sp_span_entry_deconfigure_common(struct mlxsw_sp_span_entry *span_entry, + enum mlxsw_reg_mpat_span_type span_type) +{ + struct mlxsw_sp_port *dest_port = span_entry->parms.dest_port; + struct mlxsw_sp *mlxsw_sp = dest_port->mlxsw_sp; + u8 local_port = dest_port->local_port; + char mpat_pl[MLXSW_REG_MPAT_LEN]; + int pa_id = span_entry->id; + + mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, false, span_type); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpat), mpat_pl); +} + +static void +mlxsw_sp_span_entry_phys_deconfigure(struct mlxsw_sp_span_entry *span_entry) +{ + mlxsw_sp_span_entry_deconfigure_common(span_entry, + MLXSW_REG_MPAT_SPAN_TYPE_LOCAL_ETH); +} + +static const +struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_phys = { + .can_handle = mlxsw_sp_port_dev_check, + .parms = mlxsw_sp_span_entry_phys_parms, + .configure = mlxsw_sp_span_entry_phys_configure, + .deconfigure = mlxsw_sp_span_entry_phys_deconfigure, +}; + +static int mlxsw_sp_span_dmac(struct neigh_table *tbl, + const void *pkey, + struct net_device *l3edev, + unsigned char dmac[ETH_ALEN]) +{ + struct neighbour *neigh = neigh_lookup(tbl, pkey, l3edev); + int err = 0; + + if (!neigh) { + neigh = neigh_create(tbl, pkey, l3edev); + if (IS_ERR(neigh)) + return PTR_ERR(neigh); + } + + neigh_event_send(neigh, NULL); + + read_lock_bh(&neigh->lock); + if ((neigh->nud_state & NUD_VALID) && !neigh->dead) + memcpy(dmac, neigh->ha, ETH_ALEN); + else + err = -ENOENT; + read_unlock_bh(&neigh->lock); + + neigh_release(neigh); + return err; +} + +static int +mlxsw_sp_span_entry_unoffloadable(struct mlxsw_sp_span_parms *sparmsp) +{ + sparmsp->dest_port = NULL; + return 0; +} + +static __maybe_unused int +mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev, + union 
mlxsw_sp_l3addr saddr, + union mlxsw_sp_l3addr daddr, + union mlxsw_sp_l3addr gw, + __u8 ttl, + struct neigh_table *tbl, + struct mlxsw_sp_span_parms *sparmsp) +{ + unsigned char dmac[ETH_ALEN]; + + if (mlxsw_sp_l3addr_is_zero(gw)) + gw = daddr; + + if (!l3edev || !mlxsw_sp_port_dev_check(l3edev) || + mlxsw_sp_span_dmac(tbl, &gw, l3edev, dmac)) + return mlxsw_sp_span_entry_unoffloadable(sparmsp); + + sparmsp->dest_port = netdev_priv(l3edev); + sparmsp->ttl = ttl; + memcpy(sparmsp->dmac, dmac, ETH_ALEN); + memcpy(sparmsp->smac, l3edev->dev_addr, ETH_ALEN); + sparmsp->saddr = saddr; + sparmsp->daddr = daddr; + return 0; +} + +#if IS_ENABLED(CONFIG_NET_IPGRE) +static struct net_device * +mlxsw_sp_span_gretap4_route(const struct net_device *to_dev, + __be32 *saddrp, __be32 *daddrp) +{ + struct ip_tunnel *tun = netdev_priv(to_dev); + struct net_device *dev = NULL; + struct ip_tunnel_parm parms; + struct rtable *rt = NULL; + struct flowi4 fl4; + + /* We assume "dev" stays valid after rt is put. */ + ASSERT_RTNL(); + + parms = mlxsw_sp_ipip_netdev_parms4(to_dev); + ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp, + 0, 0, parms.link, tun->fwmark); + + rt = ip_route_output_key(tun->net, &fl4); + if (IS_ERR(rt)) + return NULL; + + if (rt->rt_type != RTN_UNICAST) + goto out; + + dev = rt->dst.dev; + *saddrp = fl4.saddr; + *daddrp = rt->rt_gateway; + +out: + ip_rt_put(rt); + return dev; +} + +static int +mlxsw_sp_span_entry_gretap4_parms(const struct net_device *to_dev, + struct mlxsw_sp_span_parms *sparmsp) +{ + struct ip_tunnel_parm tparm = mlxsw_sp_ipip_netdev_parms4(to_dev); + union mlxsw_sp_l3addr saddr = { .addr4 = tparm.iph.saddr }; + union mlxsw_sp_l3addr daddr = { .addr4 = tparm.iph.daddr }; + bool inherit_tos = tparm.iph.tos & 0x1; + bool inherit_ttl = !tparm.iph.ttl; + union mlxsw_sp_l3addr gw = daddr; + struct net_device *l3edev; + + if (!(to_dev->flags & IFF_UP) || + /* Reject tunnels with GRE keys, checksums, etc. */ + tparm.i_flags || tparm.o_flags || + /* Require a fixed TTL and a TOS copied from the mirrored packet. */ + inherit_ttl || !inherit_tos || + /* A destination address may not be "any". */ + mlxsw_sp_l3addr_is_zero(daddr)) + return mlxsw_sp_span_entry_unoffloadable(sparmsp); + + l3edev = mlxsw_sp_span_gretap4_route(to_dev, &saddr.addr4, &gw.addr4); + return mlxsw_sp_span_entry_tunnel_parms_common(l3edev, saddr, daddr, gw, + tparm.iph.ttl, + &arp_tbl, sparmsp); +} + +static int +mlxsw_sp_span_entry_gretap4_configure(struct mlxsw_sp_span_entry *span_entry, + struct mlxsw_sp_span_parms sparms) +{ + struct mlxsw_sp_port *dest_port = sparms.dest_port; + struct mlxsw_sp *mlxsw_sp = dest_port->mlxsw_sp; + u8 local_port = dest_port->local_port; + char mpat_pl[MLXSW_REG_MPAT_LEN]; + int pa_id = span_entry->id; + + /* Create a new port analayzer entry for local_port. 
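+ * The entry is of the remote (Ethernet over L3) type: mirrored packets + * are encapsulated with the SMAC/DMAC, underlay IPv4 addresses and TTL + * that the .parms callback resolved above.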
*/ + mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true, + MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3); + mlxsw_reg_mpat_eth_rspan_l2_pack(mpat_pl, + MLXSW_REG_MPAT_ETH_RSPAN_VERSION_NO_HEADER, + sparms.dmac, false); + mlxsw_reg_mpat_eth_rspan_l3_ipv4_pack(mpat_pl, + sparms.ttl, sparms.smac, + be32_to_cpu(sparms.saddr.addr4), + be32_to_cpu(sparms.daddr.addr4)); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpat), mpat_pl); +} + +static void +mlxsw_sp_span_entry_gretap4_deconfigure(struct mlxsw_sp_span_entry *span_entry) +{ + mlxsw_sp_span_entry_deconfigure_common(span_entry, + MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3); +} + +static const struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_gretap4 = { + .can_handle = is_gretap_dev, + .parms = mlxsw_sp_span_entry_gretap4_parms, + .configure = mlxsw_sp_span_entry_gretap4_configure, + .deconfigure = mlxsw_sp_span_entry_gretap4_deconfigure, +}; +#endif + +#if IS_ENABLED(CONFIG_IPV6_GRE) +static struct net_device * +mlxsw_sp_span_gretap6_route(const struct net_device *to_dev, + struct in6_addr *saddrp, + struct in6_addr *daddrp) +{ + struct ip6_tnl *t = netdev_priv(to_dev); + struct flowi6 fl6 = t->fl.u.ip6; + struct net_device *dev = NULL; + struct dst_entry *dst; + struct rt6_info *rt6; + + /* We assume "dev" stays valid after dst is released. */ + ASSERT_RTNL(); + + fl6.flowi6_mark = t->parms.fwmark; + if (!ip6_tnl_xmit_ctl(t, &fl6.saddr, &fl6.daddr)) + return NULL; + + dst = ip6_route_output(t->net, NULL, &fl6); + if (!dst || dst->error) + goto out; + + rt6 = container_of(dst, struct rt6_info, dst); + + dev = dst->dev; + *saddrp = fl6.saddr; + *daddrp = rt6->rt6i_gateway; + +out: + dst_release(dst); + return dev; +} + +static int +mlxsw_sp_span_entry_gretap6_parms(const struct net_device *to_dev, + struct mlxsw_sp_span_parms *sparmsp) +{ + struct __ip6_tnl_parm tparm = mlxsw_sp_ipip_netdev_parms6(to_dev); + bool inherit_tos = tparm.flags & IP6_TNL_F_USE_ORIG_TCLASS; + union mlxsw_sp_l3addr saddr = { .addr6 = tparm.laddr }; + union mlxsw_sp_l3addr daddr = { .addr6 = tparm.raddr }; + bool inherit_ttl = !tparm.hop_limit; + union mlxsw_sp_l3addr gw = daddr; + struct net_device *l3edev; + + if (!(to_dev->flags & IFF_UP) || + /* Reject tunnels with GRE keys, checksums, etc. */ + tparm.i_flags || tparm.o_flags || + /* Require a fixed TTL and a TOS copied from the mirrored packet. */ + inherit_ttl || !inherit_tos || + /* A destination address may not be "any". */ + mlxsw_sp_l3addr_is_zero(daddr)) + return mlxsw_sp_span_entry_unoffloadable(sparmsp); + + l3edev = mlxsw_sp_span_gretap6_route(to_dev, &saddr.addr6, &gw.addr6); + return mlxsw_sp_span_entry_tunnel_parms_common(l3edev, saddr, daddr, gw, + tparm.hop_limit, + &nd_tbl, sparmsp); +} + +static int +mlxsw_sp_span_entry_gretap6_configure(struct mlxsw_sp_span_entry *span_entry, + struct mlxsw_sp_span_parms sparms) +{ + struct mlxsw_sp_port *dest_port = sparms.dest_port; + struct mlxsw_sp *mlxsw_sp = dest_port->mlxsw_sp; + u8 local_port = dest_port->local_port; + char mpat_pl[MLXSW_REG_MPAT_LEN]; + int pa_id = span_entry->id; + + /* Create a new port analayzer entry for local_port. 
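+ * As in the gretap4 variant above, but the underlay addresses are IPv6 + * and are packed with the rspan_l3_ipv6 helper.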
*/ + mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true, + MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3); + mlxsw_reg_mpat_eth_rspan_l2_pack(mpat_pl, + MLXSW_REG_MPAT_ETH_RSPAN_VERSION_NO_HEADER, + sparms.dmac, false); + mlxsw_reg_mpat_eth_rspan_l3_ipv6_pack(mpat_pl, sparms.ttl, sparms.smac, + sparms.saddr.addr6, + sparms.daddr.addr6); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpat), mpat_pl); +} + +static void +mlxsw_sp_span_entry_gretap6_deconfigure(struct mlxsw_sp_span_entry *span_entry) +{ + mlxsw_sp_span_entry_deconfigure_common(span_entry, + MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3); +} + +static const +struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_gretap6 = { + .can_handle = is_ip6gretap_dev, + .parms = mlxsw_sp_span_entry_gretap6_parms, + .configure = mlxsw_sp_span_entry_gretap6_configure, + .deconfigure = mlxsw_sp_span_entry_gretap6_deconfigure, +}; +#endif + +static const +struct mlxsw_sp_span_entry_ops *const mlxsw_sp_span_entry_types[] = { + &mlxsw_sp_span_entry_ops_phys, +#if IS_ENABLED(CONFIG_NET_IPGRE) + &mlxsw_sp_span_entry_ops_gretap4, +#endif +#if IS_ENABLED(CONFIG_IPV6_GRE) + &mlxsw_sp_span_entry_ops_gretap6, +#endif +}; + +static int +mlxsw_sp_span_entry_nop_parms(const struct net_device *to_dev, + struct mlxsw_sp_span_parms *sparmsp) +{ + return mlxsw_sp_span_entry_unoffloadable(sparmsp); +} + +static int +mlxsw_sp_span_entry_nop_configure(struct mlxsw_sp_span_entry *span_entry, + struct mlxsw_sp_span_parms sparms) +{ + return 0; +} + +static void +mlxsw_sp_span_entry_nop_deconfigure(struct mlxsw_sp_span_entry *span_entry) +{ +} + +static const struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_nop = { + .parms = mlxsw_sp_span_entry_nop_parms, + .configure = mlxsw_sp_span_entry_nop_configure, + .deconfigure = mlxsw_sp_span_entry_nop_deconfigure, +}; + +static void +mlxsw_sp_span_entry_configure(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_span_entry *span_entry, + struct mlxsw_sp_span_parms sparms) +{ + if (sparms.dest_port) { + if (sparms.dest_port->mlxsw_sp != mlxsw_sp) { + netdev_err(span_entry->to_dev, "Cannot mirror to %s, which belongs to a different mlxsw instance", + sparms.dest_port->dev->name); + sparms.dest_port = NULL; + } else if (span_entry->ops->configure(span_entry, sparms)) { + netdev_err(span_entry->to_dev, "Failed to offload mirror to %s", + sparms.dest_port->dev->name); + sparms.dest_port = NULL; + } + } + + span_entry->parms = sparms; +} + +static void +mlxsw_sp_span_entry_deconfigure(struct mlxsw_sp_span_entry *span_entry) +{ + if (span_entry->parms.dest_port) + span_entry->ops->deconfigure(span_entry); +} + +static struct mlxsw_sp_span_entry * +mlxsw_sp_span_entry_create(struct mlxsw_sp *mlxsw_sp, + const struct net_device *to_dev, + const struct mlxsw_sp_span_entry_ops *ops, + struct mlxsw_sp_span_parms sparms) +{ + struct mlxsw_sp_span_entry *span_entry = NULL; + int i; + + /* find a free entry to use */ + for (i = 0; i < mlxsw_sp->span.entries_count; i++) { + if (!mlxsw_sp->span.entries[i].ref_count) { + span_entry = &mlxsw_sp->span.entries[i]; + break; + } + } + if (!span_entry) + return NULL; + + span_entry->ops = ops; + span_entry->ref_count = 1; + span_entry->to_dev = to_dev; + mlxsw_sp_span_entry_configure(mlxsw_sp, span_entry, sparms); + + return span_entry; +} + +static void mlxsw_sp_span_entry_destroy(struct mlxsw_sp_span_entry *span_entry) +{ + mlxsw_sp_span_entry_deconfigure(span_entry); +} + +struct mlxsw_sp_span_entry * +mlxsw_sp_span_entry_find_by_port(struct mlxsw_sp *mlxsw_sp, + const struct net_device *to_dev) 
+{ + int i; + + for (i = 0; i < mlxsw_sp->span.entries_count; i++) { + struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i]; + + if (curr->ref_count && curr->to_dev == to_dev) + return curr; + } + return NULL; +} + +void mlxsw_sp_span_entry_invalidate(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_span_entry *span_entry) +{ + mlxsw_sp_span_entry_deconfigure(span_entry); + span_entry->ops = &mlxsw_sp_span_entry_ops_nop; +} + +static struct mlxsw_sp_span_entry * +mlxsw_sp_span_entry_find_by_id(struct mlxsw_sp *mlxsw_sp, int span_id) +{ + int i; + + for (i = 0; i < mlxsw_sp->span.entries_count; i++) { + struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i]; + + if (curr->ref_count && curr->id == span_id) + return curr; + } + return NULL; +} + +static struct mlxsw_sp_span_entry * +mlxsw_sp_span_entry_get(struct mlxsw_sp *mlxsw_sp, + const struct net_device *to_dev, + const struct mlxsw_sp_span_entry_ops *ops, + struct mlxsw_sp_span_parms sparms) +{ + struct mlxsw_sp_span_entry *span_entry; + + span_entry = mlxsw_sp_span_entry_find_by_port(mlxsw_sp, to_dev); + if (span_entry) { + /* Already exists, just take a reference */ + span_entry->ref_count++; + return span_entry; + } + + return mlxsw_sp_span_entry_create(mlxsw_sp, to_dev, ops, sparms); +} + +static int mlxsw_sp_span_entry_put(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_span_entry *span_entry) +{ + WARN_ON(!span_entry->ref_count); + if (--span_entry->ref_count == 0) + mlxsw_sp_span_entry_destroy(span_entry); + return 0; +} + +static bool mlxsw_sp_span_is_egress_mirror(struct mlxsw_sp_port *port) +{ + struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; + struct mlxsw_sp_span_inspected_port *p; + int i; + + for (i = 0; i < mlxsw_sp->span.entries_count; i++) { + struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i]; + + list_for_each_entry(p, &curr->bound_ports_list, list) + if (p->local_port == port->local_port && + p->type == MLXSW_SP_SPAN_EGRESS) + return true; + } + + return false; +} + +static int mlxsw_sp_span_mtu_to_buffsize(const struct mlxsw_sp *mlxsw_sp, + int mtu) +{ + return mlxsw_sp_bytes_cells(mlxsw_sp, mtu * 5 / 2) + 1; +} + +int mlxsw_sp_span_port_mtu_update(struct mlxsw_sp_port *port, u16 mtu) +{ + struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; + char sbib_pl[MLXSW_REG_SBIB_LEN]; + int err; + + /* If port is egress mirrored, the shared buffer size should be + * updated according to the mtu value + */ + if (mlxsw_sp_span_is_egress_mirror(port)) { + u32 buffsize = mlxsw_sp_span_mtu_to_buffsize(mlxsw_sp, mtu); + + mlxsw_reg_sbib_pack(sbib_pl, port->local_port, buffsize); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); + if (err) { + netdev_err(port->dev, "Could not update shared buffer for mirroring\n"); + return err; + } + } + + return 0; +} + +static struct mlxsw_sp_span_inspected_port * +mlxsw_sp_span_entry_bound_port_find(struct mlxsw_sp_span_entry *span_entry, + enum mlxsw_sp_span_type type, + struct mlxsw_sp_port *port, + bool bind) +{ + struct mlxsw_sp_span_inspected_port *p; + + list_for_each_entry(p, &span_entry->bound_ports_list, list) + if (type == p->type && + port->local_port == p->local_port && + bind == p->bound) + return p; + return NULL; +} + +static int +mlxsw_sp_span_inspected_port_bind(struct mlxsw_sp_port *port, + struct mlxsw_sp_span_entry *span_entry, + enum mlxsw_sp_span_type type, + bool bind) +{ + struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; + char mpar_pl[MLXSW_REG_MPAR_LEN]; + int pa_id = span_entry->id; + + /* bind the port to the SPAN entry */ + 
mlxsw_reg_mpar_pack(mpar_pl, port->local_port, + (enum mlxsw_reg_mpar_i_e)type, bind, pa_id); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpar), mpar_pl); +} + +static int +mlxsw_sp_span_inspected_port_add(struct mlxsw_sp_port *port, + struct mlxsw_sp_span_entry *span_entry, + enum mlxsw_sp_span_type type, + bool bind) +{ + struct mlxsw_sp_span_inspected_port *inspected_port; + struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; + char sbib_pl[MLXSW_REG_SBIB_LEN]; + int i; + int err; + + /* A given (source port, direction) can only be bound to one analyzer, + * so if a binding is requested, check for conflicts. + */ + if (bind) + for (i = 0; i < mlxsw_sp->span.entries_count; i++) { + struct mlxsw_sp_span_entry *curr = + &mlxsw_sp->span.entries[i]; + + if (mlxsw_sp_span_entry_bound_port_find(curr, type, + port, bind)) + return -EEXIST; + } + + /* if it is an egress SPAN, bind a shared buffer to it */ + if (type == MLXSW_SP_SPAN_EGRESS) { + u32 buffsize = mlxsw_sp_span_mtu_to_buffsize(mlxsw_sp, + port->dev->mtu); + + mlxsw_reg_sbib_pack(sbib_pl, port->local_port, buffsize); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); + if (err) { + netdev_err(port->dev, "Could not create shared buffer for mirroring\n"); + return err; + } + } + + if (bind) { + err = mlxsw_sp_span_inspected_port_bind(port, span_entry, type, + true); + if (err) + goto err_port_bind; + } + + inspected_port = kzalloc(sizeof(*inspected_port), GFP_KERNEL); + if (!inspected_port) { + err = -ENOMEM; + goto err_inspected_port_alloc; + } + inspected_port->local_port = port->local_port; + inspected_port->type = type; + inspected_port->bound = bind; + list_add_tail(&inspected_port->list, &span_entry->bound_ports_list); + + return 0; + +err_inspected_port_alloc: + if (bind) + mlxsw_sp_span_inspected_port_bind(port, span_entry, type, + false); +err_port_bind: + if (type == MLXSW_SP_SPAN_EGRESS) { + mlxsw_reg_sbib_pack(sbib_pl, port->local_port, 0); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); + } + return err; +} + +static void +mlxsw_sp_span_inspected_port_del(struct mlxsw_sp_port *port, + struct mlxsw_sp_span_entry *span_entry, + enum mlxsw_sp_span_type type, + bool bind) +{ + struct mlxsw_sp_span_inspected_port *inspected_port; + struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; + char sbib_pl[MLXSW_REG_SBIB_LEN]; + + inspected_port = mlxsw_sp_span_entry_bound_port_find(span_entry, type, + port, bind); + if (!inspected_port) + return; + + if (bind) + mlxsw_sp_span_inspected_port_bind(port, span_entry, type, + false); + /* remove the SBIB buffer if it was egress SPAN */ + if (type == MLXSW_SP_SPAN_EGRESS) { + mlxsw_reg_sbib_pack(sbib_pl, port->local_port, 0); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl); + } + + mlxsw_sp_span_entry_put(mlxsw_sp, span_entry); + + list_del(&inspected_port->list); + kfree(inspected_port); +} + +static const struct mlxsw_sp_span_entry_ops * +mlxsw_sp_span_entry_ops(struct mlxsw_sp *mlxsw_sp, + const struct net_device *to_dev) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(mlxsw_sp_span_entry_types); ++i) + if (mlxsw_sp_span_entry_types[i]->can_handle(to_dev)) + return mlxsw_sp_span_entry_types[i]; + + return NULL; +} + +int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from, + const struct net_device *to_dev, + enum mlxsw_sp_span_type type, bool bind, + int *p_span_id) +{ + struct mlxsw_sp *mlxsw_sp = from->mlxsw_sp; + const struct mlxsw_sp_span_entry_ops *ops; + struct mlxsw_sp_span_parms sparms = {NULL}; + struct mlxsw_sp_span_entry *span_entry; + int err; + + 
ops = mlxsw_sp_span_entry_ops(mlxsw_sp, to_dev); + if (!ops) { + netdev_err(to_dev, "Cannot mirror to %s", to_dev->name); + return -EOPNOTSUPP; + } + + err = ops->parms(to_dev, &sparms); + if (err) + return err; + + span_entry = mlxsw_sp_span_entry_get(mlxsw_sp, to_dev, ops, sparms); + if (!span_entry) + return -ENOENT; + + netdev_dbg(from->dev, "Adding inspected port to SPAN entry %d\n", + span_entry->id); + + err = mlxsw_sp_span_inspected_port_add(from, span_entry, type, bind); + if (err) + goto err_port_bind; + + *p_span_id = span_entry->id; + return 0; + +err_port_bind: + mlxsw_sp_span_entry_put(mlxsw_sp, span_entry); + return err; +} + +void mlxsw_sp_span_mirror_del(struct mlxsw_sp_port *from, int span_id, + enum mlxsw_sp_span_type type, bool bind) +{ + struct mlxsw_sp_span_entry *span_entry; + + span_entry = mlxsw_sp_span_entry_find_by_id(from->mlxsw_sp, span_id); + if (!span_entry) { + netdev_err(from->dev, "no span entry found\n"); + return; + } + + netdev_dbg(from->dev, "removing inspected port from SPAN entry %d\n", + span_entry->id); + mlxsw_sp_span_inspected_port_del(from, span_entry, type, bind); +} + +void mlxsw_sp_span_respin(struct mlxsw_sp *mlxsw_sp) +{ + int i; + int err; + + ASSERT_RTNL(); + for (i = 0; i < mlxsw_sp->span.entries_count; i++) { + struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i]; + struct mlxsw_sp_span_parms sparms = {NULL}; + + if (!curr->ref_count) + continue; + + err = curr->ops->parms(curr->to_dev, &sparms); + if (err) + continue; + + if (memcmp(&sparms, &curr->parms, sizeof(sparms))) { + mlxsw_sp_span_entry_deconfigure(curr); + mlxsw_sp_span_entry_configure(mlxsw_sp, curr, sparms); + } + } +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h new file mode 100644 index 0000000..4b87ec2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h @@ -0,0 +1,107 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/mlxsw_span.h + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_SPECTRUM_SPAN_H +#define _MLXSW_SPECTRUM_SPAN_H + +#include +#include + +#include "spectrum_router.h" + +struct mlxsw_sp; +struct mlxsw_sp_port; + +enum mlxsw_sp_span_type { + MLXSW_SP_SPAN_EGRESS, + MLXSW_SP_SPAN_INGRESS +}; + +struct mlxsw_sp_span_inspected_port { + struct list_head list; + enum mlxsw_sp_span_type type; + u8 local_port; + + /* Whether this is a directly bound mirror (port-to-port) or an ACL. */ + bool bound; +}; + +struct mlxsw_sp_span_parms { + struct mlxsw_sp_port *dest_port; /* NULL for unoffloaded SPAN. */ + unsigned int ttl; + unsigned char dmac[ETH_ALEN]; + unsigned char smac[ETH_ALEN]; + union mlxsw_sp_l3addr daddr; + union mlxsw_sp_l3addr saddr; +}; + +struct mlxsw_sp_span_entry_ops; + +struct mlxsw_sp_span_entry { + const struct net_device *to_dev; + const struct mlxsw_sp_span_entry_ops *ops; + struct mlxsw_sp_span_parms parms; + struct list_head bound_ports_list; + int ref_count; + int id; +}; + +struct mlxsw_sp_span_entry_ops { + bool (*can_handle)(const struct net_device *to_dev); + int (*parms)(const struct net_device *to_dev, + struct mlxsw_sp_span_parms *sparmsp); + int (*configure)(struct mlxsw_sp_span_entry *span_entry, + struct mlxsw_sp_span_parms sparms); + void (*deconfigure)(struct mlxsw_sp_span_entry *span_entry); +}; + +int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_span_fini(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_span_respin(struct mlxsw_sp *mlxsw_sp); + +int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from, + const struct net_device *to_dev, + enum mlxsw_sp_span_type type, + bool bind, int *p_span_id); +void mlxsw_sp_span_mirror_del(struct mlxsw_sp_port *from, int span_id, + enum mlxsw_sp_span_type type, bool bind); +struct mlxsw_sp_span_entry * +mlxsw_sp_span_entry_find_by_port(struct mlxsw_sp *mlxsw_sp, + const struct net_device *to_dev); + +void mlxsw_sp_span_entry_invalidate(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_span_entry *span_entry); + +int mlxsw_sp_span_port_mtu_update(struct mlxsw_sp_port *port, u16 mtu); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c new file mode 100644 index 0000000..4ed0118 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -0,0 +1,2360 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015 Jiri Pirko + * Copyright (c) 2015 Ido Schimmel + * Copyright (c) 2015 Elad Raz + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "spectrum_router.h" +#include "spectrum.h" +#include "core.h" +#include "reg.h" + +struct mlxsw_sp_bridge_ops; + +struct mlxsw_sp_bridge { + struct mlxsw_sp *mlxsw_sp; + struct { + struct delayed_work dw; +#define MLXSW_SP_DEFAULT_LEARNING_INTERVAL 100 + unsigned int interval; /* ms */ + } fdb_notify; +#define MLXSW_SP_MIN_AGEING_TIME 10 +#define MLXSW_SP_MAX_AGEING_TIME 1000000 +#define MLXSW_SP_DEFAULT_AGEING_TIME 300 + u32 ageing_time; + bool vlan_enabled_exists; + struct list_head bridges_list; + DECLARE_BITMAP(mids_bitmap, MLXSW_SP_MID_MAX); + const struct mlxsw_sp_bridge_ops *bridge_8021q_ops; + const struct mlxsw_sp_bridge_ops *bridge_8021d_ops; +}; + +struct mlxsw_sp_bridge_device { + struct net_device *dev; + struct list_head list; + struct list_head ports_list; + struct list_head mids_list; + u8 vlan_enabled:1, + multicast_enabled:1, + mrouter:1; + const struct mlxsw_sp_bridge_ops *ops; +}; + +struct mlxsw_sp_bridge_port { + struct net_device *dev; + struct mlxsw_sp_bridge_device *bridge_device; + struct list_head list; + struct list_head vlans_list; + unsigned int ref_count; + u8 stp_state; + unsigned long flags; + bool mrouter; + bool lagged; + union { + u16 lag_id; + u16 system_port; + }; +}; + +struct mlxsw_sp_bridge_vlan { + struct list_head list; + struct list_head port_vlan_list; + u16 vid; +}; + +struct mlxsw_sp_bridge_ops { + int (*port_join)(struct mlxsw_sp_bridge_device *bridge_device, + struct mlxsw_sp_bridge_port *bridge_port, + struct mlxsw_sp_port *mlxsw_sp_port, + struct netlink_ext_ack *extack); + void (*port_leave)(struct mlxsw_sp_bridge_device *bridge_device, + struct mlxsw_sp_bridge_port *bridge_port, + struct mlxsw_sp_port *mlxsw_sp_port); + struct mlxsw_sp_fid * + (*fid_get)(struct mlxsw_sp_bridge_device *bridge_device, + u16 vid); +}; + +static int +mlxsw_sp_bridge_port_fdb_flush(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_bridge_port *bridge_port, + u16 fid_index); + +static void +mlxsw_sp_bridge_port_mdb_flush(struct 
mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port); + +static void +mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_device + *bridge_device); + +static void +mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port, + bool add); + +static struct mlxsw_sp_bridge_device * +mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge, + const struct net_device *br_dev) +{ + struct mlxsw_sp_bridge_device *bridge_device; + + list_for_each_entry(bridge_device, &bridge->bridges_list, list) + if (bridge_device->dev == br_dev) + return bridge_device; + + return NULL; +} + +bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *br_dev) +{ + return !!mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev); +} + +static struct mlxsw_sp_bridge_device * +mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge, + struct net_device *br_dev) +{ + struct device *dev = bridge->mlxsw_sp->bus_info->dev; + struct mlxsw_sp_bridge_device *bridge_device; + bool vlan_enabled = br_vlan_enabled(br_dev); + + if (vlan_enabled && bridge->vlan_enabled_exists) { + dev_err(dev, "Only one VLAN-aware bridge is supported\n"); + return ERR_PTR(-EINVAL); + } + + bridge_device = kzalloc(sizeof(*bridge_device), GFP_KERNEL); + if (!bridge_device) + return ERR_PTR(-ENOMEM); + + bridge_device->dev = br_dev; + bridge_device->vlan_enabled = vlan_enabled; + bridge_device->multicast_enabled = br_multicast_enabled(br_dev); + bridge_device->mrouter = br_multicast_router(br_dev); + INIT_LIST_HEAD(&bridge_device->ports_list); + if (vlan_enabled) { + bridge->vlan_enabled_exists = true; + bridge_device->ops = bridge->bridge_8021q_ops; + } else { + bridge_device->ops = bridge->bridge_8021d_ops; + } + INIT_LIST_HEAD(&bridge_device->mids_list); + list_add(&bridge_device->list, &bridge->bridges_list); + + return bridge_device; +} + +static void +mlxsw_sp_bridge_device_destroy(struct mlxsw_sp_bridge *bridge, + struct mlxsw_sp_bridge_device *bridge_device) +{ + list_del(&bridge_device->list); + if (bridge_device->vlan_enabled) + bridge->vlan_enabled_exists = false; + WARN_ON(!list_empty(&bridge_device->ports_list)); + WARN_ON(!list_empty(&bridge_device->mids_list)); + kfree(bridge_device); +} + +static struct mlxsw_sp_bridge_device * +mlxsw_sp_bridge_device_get(struct mlxsw_sp_bridge *bridge, + struct net_device *br_dev) +{ + struct mlxsw_sp_bridge_device *bridge_device; + + bridge_device = mlxsw_sp_bridge_device_find(bridge, br_dev); + if (bridge_device) + return bridge_device; + + return mlxsw_sp_bridge_device_create(bridge, br_dev); +} + +static void +mlxsw_sp_bridge_device_put(struct mlxsw_sp_bridge *bridge, + struct mlxsw_sp_bridge_device *bridge_device) +{ + if (list_empty(&bridge_device->ports_list)) + mlxsw_sp_bridge_device_destroy(bridge, bridge_device); +} + +static struct mlxsw_sp_bridge_port * +__mlxsw_sp_bridge_port_find(const struct mlxsw_sp_bridge_device *bridge_device, + const struct net_device *brport_dev) +{ + struct mlxsw_sp_bridge_port *bridge_port; + + list_for_each_entry(bridge_port, &bridge_device->ports_list, list) { + if (bridge_port->dev == brport_dev) + return bridge_port; + } + + return NULL; +} + +static struct mlxsw_sp_bridge_port * +mlxsw_sp_bridge_port_find(struct mlxsw_sp_bridge *bridge, + struct net_device *brport_dev) +{ + struct net_device *br_dev = netdev_master_upper_dev_get(brport_dev); + struct mlxsw_sp_bridge_device 
*bridge_device; + + if (!br_dev) + return NULL; + + bridge_device = mlxsw_sp_bridge_device_find(bridge, br_dev); + if (!bridge_device) + return NULL; + + return __mlxsw_sp_bridge_port_find(bridge_device, brport_dev); +} + +static struct mlxsw_sp_bridge_port * +mlxsw_sp_bridge_port_create(struct mlxsw_sp_bridge_device *bridge_device, + struct net_device *brport_dev) +{ + struct mlxsw_sp_bridge_port *bridge_port; + struct mlxsw_sp_port *mlxsw_sp_port; + + bridge_port = kzalloc(sizeof(*bridge_port), GFP_KERNEL); + if (!bridge_port) + return NULL; + + mlxsw_sp_port = mlxsw_sp_port_dev_lower_find(brport_dev); + bridge_port->lagged = mlxsw_sp_port->lagged; + if (bridge_port->lagged) + bridge_port->lag_id = mlxsw_sp_port->lag_id; + else + bridge_port->system_port = mlxsw_sp_port->local_port; + bridge_port->dev = brport_dev; + bridge_port->bridge_device = bridge_device; + bridge_port->stp_state = BR_STATE_DISABLED; + bridge_port->flags = BR_LEARNING | BR_FLOOD | BR_LEARNING_SYNC | + BR_MCAST_FLOOD; + INIT_LIST_HEAD(&bridge_port->vlans_list); + list_add(&bridge_port->list, &bridge_device->ports_list); + bridge_port->ref_count = 1; + + return bridge_port; +} + +static void +mlxsw_sp_bridge_port_destroy(struct mlxsw_sp_bridge_port *bridge_port) +{ + list_del(&bridge_port->list); + WARN_ON(!list_empty(&bridge_port->vlans_list)); + kfree(bridge_port); +} + +static bool +mlxsw_sp_bridge_port_should_destroy(const struct mlxsw_sp_bridge_port * + bridge_port) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(bridge_port->dev); + + /* In case ports were pulled from out of a bridged LAG, then + * it's possible the reference count isn't zero, yet the bridge + * port should be destroyed, as it's no longer an upper of ours. + */ + if (!mlxsw_sp && list_empty(&bridge_port->vlans_list)) + return true; + else if (bridge_port->ref_count == 0) + return true; + else + return false; +} + +static struct mlxsw_sp_bridge_port * +mlxsw_sp_bridge_port_get(struct mlxsw_sp_bridge *bridge, + struct net_device *brport_dev) +{ + struct net_device *br_dev = netdev_master_upper_dev_get(brport_dev); + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_bridge_port *bridge_port; + int err; + + bridge_port = mlxsw_sp_bridge_port_find(bridge, brport_dev); + if (bridge_port) { + bridge_port->ref_count++; + return bridge_port; + } + + bridge_device = mlxsw_sp_bridge_device_get(bridge, br_dev); + if (IS_ERR(bridge_device)) + return ERR_CAST(bridge_device); + + bridge_port = mlxsw_sp_bridge_port_create(bridge_device, brport_dev); + if (!bridge_port) { + err = -ENOMEM; + goto err_bridge_port_create; + } + + return bridge_port; + +err_bridge_port_create: + mlxsw_sp_bridge_device_put(bridge, bridge_device); + return ERR_PTR(err); +} + +static void mlxsw_sp_bridge_port_put(struct mlxsw_sp_bridge *bridge, + struct mlxsw_sp_bridge_port *bridge_port) +{ + struct mlxsw_sp_bridge_device *bridge_device; + + bridge_port->ref_count--; + if (!mlxsw_sp_bridge_port_should_destroy(bridge_port)) + return; + bridge_device = bridge_port->bridge_device; + mlxsw_sp_bridge_port_destroy(bridge_port); + mlxsw_sp_bridge_device_put(bridge, bridge_device); +} + +static struct mlxsw_sp_port_vlan * +mlxsw_sp_port_vlan_find_by_bridge(struct mlxsw_sp_port *mlxsw_sp_port, + const struct mlxsw_sp_bridge_device * + bridge_device, + u16 vid) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + list_for_each_entry(mlxsw_sp_port_vlan, &mlxsw_sp_port->vlans_list, + list) { + if (!mlxsw_sp_port_vlan->bridge_port) + continue; + if 
(mlxsw_sp_port_vlan->bridge_port->bridge_device != + bridge_device) + continue; + if (bridge_device->vlan_enabled && + mlxsw_sp_port_vlan->vid != vid) + continue; + return mlxsw_sp_port_vlan; + } + + return NULL; +} + +static struct mlxsw_sp_port_vlan* +mlxsw_sp_port_vlan_find_by_fid(struct mlxsw_sp_port *mlxsw_sp_port, + u16 fid_index) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + list_for_each_entry(mlxsw_sp_port_vlan, &mlxsw_sp_port->vlans_list, + list) { + struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid; + + if (fid && mlxsw_sp_fid_index(fid) == fid_index) + return mlxsw_sp_port_vlan; + } + + return NULL; +} + +static struct mlxsw_sp_bridge_vlan * +mlxsw_sp_bridge_vlan_find(const struct mlxsw_sp_bridge_port *bridge_port, + u16 vid) +{ + struct mlxsw_sp_bridge_vlan *bridge_vlan; + + list_for_each_entry(bridge_vlan, &bridge_port->vlans_list, list) { + if (bridge_vlan->vid == vid) + return bridge_vlan; + } + + return NULL; +} + +static struct mlxsw_sp_bridge_vlan * +mlxsw_sp_bridge_vlan_create(struct mlxsw_sp_bridge_port *bridge_port, u16 vid) +{ + struct mlxsw_sp_bridge_vlan *bridge_vlan; + + bridge_vlan = kzalloc(sizeof(*bridge_vlan), GFP_KERNEL); + if (!bridge_vlan) + return NULL; + + INIT_LIST_HEAD(&bridge_vlan->port_vlan_list); + bridge_vlan->vid = vid; + list_add(&bridge_vlan->list, &bridge_port->vlans_list); + + return bridge_vlan; +} + +static void +mlxsw_sp_bridge_vlan_destroy(struct mlxsw_sp_bridge_vlan *bridge_vlan) +{ + list_del(&bridge_vlan->list); + WARN_ON(!list_empty(&bridge_vlan->port_vlan_list)); + kfree(bridge_vlan); +} + +static struct mlxsw_sp_bridge_vlan * +mlxsw_sp_bridge_vlan_get(struct mlxsw_sp_bridge_port *bridge_port, u16 vid) +{ + struct mlxsw_sp_bridge_vlan *bridge_vlan; + + bridge_vlan = mlxsw_sp_bridge_vlan_find(bridge_port, vid); + if (bridge_vlan) + return bridge_vlan; + + return mlxsw_sp_bridge_vlan_create(bridge_port, vid); +} + +static void mlxsw_sp_bridge_vlan_put(struct mlxsw_sp_bridge_vlan *bridge_vlan) +{ + if (list_empty(&bridge_vlan->port_vlan_list)) + mlxsw_sp_bridge_vlan_destroy(bridge_vlan); +} + +static void mlxsw_sp_port_bridge_flags_get(struct mlxsw_sp_bridge *bridge, + struct net_device *dev, + unsigned long *brport_flags) +{ + struct mlxsw_sp_bridge_port *bridge_port; + + bridge_port = mlxsw_sp_bridge_port_find(bridge, dev); + if (WARN_ON(!bridge_port)) + return; + + memcpy(brport_flags, &bridge_port->flags, sizeof(*brport_flags)); +} + +static int mlxsw_sp_port_attr_get(struct net_device *dev, + struct switchdev_attr *attr) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + + switch (attr->id) { + case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: + attr->u.ppid.id_len = sizeof(mlxsw_sp->base_mac); + memcpy(&attr->u.ppid.id, &mlxsw_sp->base_mac, + attr->u.ppid.id_len); + break; + case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: + mlxsw_sp_port_bridge_flags_get(mlxsw_sp->bridge, attr->orig_dev, + &attr->u.brport_flags); + break; + case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT: + attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD | + BR_MCAST_FLOOD; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int +mlxsw_sp_port_bridge_vlan_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_vlan *bridge_vlan, + u8 state) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + list_for_each_entry(mlxsw_sp_port_vlan, &bridge_vlan->port_vlan_list, + bridge_vlan_node) { + if (mlxsw_sp_port_vlan->mlxsw_sp_port != mlxsw_sp_port) + continue; + return 
mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, + bridge_vlan->vid, state); + } + + return 0; +} + +static int mlxsw_sp_port_attr_stp_state_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_trans *trans, + struct net_device *orig_dev, + u8 state) +{ + struct mlxsw_sp_bridge_port *bridge_port; + struct mlxsw_sp_bridge_vlan *bridge_vlan; + int err; + + if (switchdev_trans_ph_prepare(trans)) + return 0; + + /* It's possible we failed to enslave the port, yet this + * operation is executed due to it being deferred. + */ + bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge, + orig_dev); + if (!bridge_port) + return 0; + + list_for_each_entry(bridge_vlan, &bridge_port->vlans_list, list) { + err = mlxsw_sp_port_bridge_vlan_stp_set(mlxsw_sp_port, + bridge_vlan, state); + if (err) + goto err_port_bridge_vlan_stp_set; + } + + bridge_port->stp_state = state; + + return 0; + +err_port_bridge_vlan_stp_set: + list_for_each_entry_continue_reverse(bridge_vlan, + &bridge_port->vlans_list, list) + mlxsw_sp_port_bridge_vlan_stp_set(mlxsw_sp_port, bridge_vlan, + bridge_port->stp_state); + return err; +} + +static int +mlxsw_sp_port_bridge_vlan_flood_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_vlan *bridge_vlan, + enum mlxsw_sp_flood_type packet_type, + bool member) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + list_for_each_entry(mlxsw_sp_port_vlan, &bridge_vlan->port_vlan_list, + bridge_vlan_node) { + if (mlxsw_sp_port_vlan->mlxsw_sp_port != mlxsw_sp_port) + continue; + return mlxsw_sp_fid_flood_set(mlxsw_sp_port_vlan->fid, + packet_type, + mlxsw_sp_port->local_port, + member); + } + + return 0; +} + +static int +mlxsw_sp_bridge_port_flood_table_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port, + enum mlxsw_sp_flood_type packet_type, + bool member) +{ + struct mlxsw_sp_bridge_vlan *bridge_vlan; + int err; + + list_for_each_entry(bridge_vlan, &bridge_port->vlans_list, list) { + err = mlxsw_sp_port_bridge_vlan_flood_set(mlxsw_sp_port, + bridge_vlan, + packet_type, + member); + if (err) + goto err_port_bridge_vlan_flood_set; + } + + return 0; + +err_port_bridge_vlan_flood_set: + list_for_each_entry_continue_reverse(bridge_vlan, + &bridge_port->vlans_list, list) + mlxsw_sp_port_bridge_vlan_flood_set(mlxsw_sp_port, bridge_vlan, + packet_type, !member); + return err; +} + +static int +mlxsw_sp_port_bridge_vlan_learning_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_vlan *bridge_vlan, + bool set) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + u16 vid = bridge_vlan->vid; + + list_for_each_entry(mlxsw_sp_port_vlan, &bridge_vlan->port_vlan_list, + bridge_vlan_node) { + if (mlxsw_sp_port_vlan->mlxsw_sp_port != mlxsw_sp_port) + continue; + return mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, set); + } + + return 0; +} + +static int +mlxsw_sp_bridge_port_learning_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port, + bool set) +{ + struct mlxsw_sp_bridge_vlan *bridge_vlan; + int err; + + list_for_each_entry(bridge_vlan, &bridge_port->vlans_list, list) { + err = mlxsw_sp_port_bridge_vlan_learning_set(mlxsw_sp_port, + bridge_vlan, set); + if (err) + goto err_port_bridge_vlan_learning_set; + } + + return 0; + +err_port_bridge_vlan_learning_set: + list_for_each_entry_continue_reverse(bridge_vlan, + &bridge_port->vlans_list, list) + mlxsw_sp_port_bridge_vlan_learning_set(mlxsw_sp_port, + bridge_vlan, !set); + return err; +} + +static int 
mlxsw_sp_port_attr_br_flags_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_trans *trans, + struct net_device *orig_dev, + unsigned long brport_flags) +{ + struct mlxsw_sp_bridge_port *bridge_port; + int err; + + if (switchdev_trans_ph_prepare(trans)) + return 0; + + bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge, + orig_dev); + if (!bridge_port) + return 0; + + err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, + MLXSW_SP_FLOOD_TYPE_UC, + brport_flags & BR_FLOOD); + if (err) + return err; + + err = mlxsw_sp_bridge_port_learning_set(mlxsw_sp_port, bridge_port, + brport_flags & BR_LEARNING); + if (err) + return err; + + if (bridge_port->bridge_device->multicast_enabled) + goto out; + + err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, + MLXSW_SP_FLOOD_TYPE_MC, + brport_flags & + BR_MCAST_FLOOD); + if (err) + return err; + +out: + memcpy(&bridge_port->flags, &brport_flags, sizeof(brport_flags)); + return 0; +} + +static int mlxsw_sp_ageing_set(struct mlxsw_sp *mlxsw_sp, u32 ageing_time) +{ + char sfdat_pl[MLXSW_REG_SFDAT_LEN]; + int err; + + mlxsw_reg_sfdat_pack(sfdat_pl, ageing_time); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfdat), sfdat_pl); + if (err) + return err; + mlxsw_sp->bridge->ageing_time = ageing_time; + return 0; +} + +static int mlxsw_sp_port_attr_br_ageing_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_trans *trans, + unsigned long ageing_clock_t) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock_t); + u32 ageing_time = jiffies_to_msecs(ageing_jiffies) / 1000; + + if (switchdev_trans_ph_prepare(trans)) { + if (ageing_time < MLXSW_SP_MIN_AGEING_TIME || + ageing_time > MLXSW_SP_MAX_AGEING_TIME) + return -ERANGE; + else + return 0; + } + + return mlxsw_sp_ageing_set(mlxsw_sp, ageing_time); +} + +static int mlxsw_sp_port_attr_br_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_trans *trans, + struct net_device *orig_dev, + bool vlan_enabled) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_bridge_device *bridge_device; + + if (!switchdev_trans_ph_prepare(trans)) + return 0; + + bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev); + if (WARN_ON(!bridge_device)) + return -EINVAL; + + if (bridge_device->vlan_enabled == vlan_enabled) + return 0; + + netdev_err(bridge_device->dev, "VLAN filtering can't be changed for existing bridge\n"); + return -EINVAL; +} + +static int mlxsw_sp_port_attr_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_trans *trans, + struct net_device *orig_dev, + bool is_port_mrouter) +{ + struct mlxsw_sp_bridge_port *bridge_port; + int err; + + if (switchdev_trans_ph_prepare(trans)) + return 0; + + bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge, + orig_dev); + if (!bridge_port) + return 0; + + if (!bridge_port->bridge_device->multicast_enabled) + goto out; + + err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, + MLXSW_SP_FLOOD_TYPE_MC, + is_port_mrouter); + if (err) + return err; + + mlxsw_sp_port_mrouter_update_mdb(mlxsw_sp_port, bridge_port, + is_port_mrouter); +out: + bridge_port->mrouter = is_port_mrouter; + return 0; +} + +static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port) +{ + const struct mlxsw_sp_bridge_device *bridge_device; + + bridge_device = bridge_port->bridge_device; + return bridge_device->multicast_enabled ? 
bridge_port->mrouter : + bridge_port->flags & BR_MCAST_FLOOD; +} + +static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_trans *trans, + struct net_device *orig_dev, + bool mc_disabled) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_bridge_port *bridge_port; + int err; + + if (switchdev_trans_ph_prepare(trans)) + return 0; + + /* It's possible we failed to enslave the port, yet this + * operation is executed due to it being deferred. + */ + bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev); + if (!bridge_device) + return 0; + + if (bridge_device->multicast_enabled != !mc_disabled) { + bridge_device->multicast_enabled = !mc_disabled; + mlxsw_sp_bridge_mdb_mc_enable_sync(mlxsw_sp_port, + bridge_device); + } + + list_for_each_entry(bridge_port, &bridge_device->ports_list, list) { + enum mlxsw_sp_flood_type packet_type = MLXSW_SP_FLOOD_TYPE_MC; + bool member = mlxsw_sp_mc_flood(bridge_port); + + err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, + bridge_port, + packet_type, member); + if (err) + return err; + } + + bridge_device->multicast_enabled = !mc_disabled; + + return 0; +} + +static int mlxsw_sp_smid_router_port_set(struct mlxsw_sp *mlxsw_sp, + u16 mid_idx, bool add) +{ + char *smid_pl; + int err; + + smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL); + if (!smid_pl) + return -ENOMEM; + + mlxsw_reg_smid_pack(smid_pl, mid_idx, + mlxsw_sp_router_port(mlxsw_sp), add); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl); + kfree(smid_pl); + return err; +} + +static void +mlxsw_sp_bridge_mrouter_update_mdb(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_bridge_device *bridge_device, + bool add) +{ + struct mlxsw_sp_mid *mid; + + list_for_each_entry(mid, &bridge_device->mids_list, list) + mlxsw_sp_smid_router_port_set(mlxsw_sp, mid->mid, add); +} + +static int +mlxsw_sp_port_attr_br_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_trans *trans, + struct net_device *orig_dev, + bool is_mrouter) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_bridge_device *bridge_device; + + if (switchdev_trans_ph_prepare(trans)) + return 0; + + /* It's possible we failed to enslave the port, yet this + * operation is executed due to it being deferred. 
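+	 * In that case the bridge device lookup below returns NULL and we
+	 * bail out without touching the hardware.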
+ */ + bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev); + if (!bridge_device) + return 0; + + if (bridge_device->mrouter != is_mrouter) + mlxsw_sp_bridge_mrouter_update_mdb(mlxsw_sp, bridge_device, + is_mrouter); + bridge_device->mrouter = is_mrouter; + return 0; +} + +static int mlxsw_sp_port_attr_set(struct net_device *dev, + const struct switchdev_attr *attr, + struct switchdev_trans *trans) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + int err; + + switch (attr->id) { + case SWITCHDEV_ATTR_ID_PORT_STP_STATE: + err = mlxsw_sp_port_attr_stp_state_set(mlxsw_sp_port, trans, + attr->orig_dev, + attr->u.stp_state); + break; + case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: + err = mlxsw_sp_port_attr_br_flags_set(mlxsw_sp_port, trans, + attr->orig_dev, + attr->u.brport_flags); + break; + case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: + err = mlxsw_sp_port_attr_br_ageing_set(mlxsw_sp_port, trans, + attr->u.ageing_time); + break; + case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: + err = mlxsw_sp_port_attr_br_vlan_set(mlxsw_sp_port, trans, + attr->orig_dev, + attr->u.vlan_filtering); + break; + case SWITCHDEV_ATTR_ID_PORT_MROUTER: + err = mlxsw_sp_port_attr_mrouter_set(mlxsw_sp_port, trans, + attr->orig_dev, + attr->u.mrouter); + break; + case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED: + err = mlxsw_sp_port_mc_disabled_set(mlxsw_sp_port, trans, + attr->orig_dev, + attr->u.mc_disabled); + break; + case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER: + err = mlxsw_sp_port_attr_br_mrouter_set(mlxsw_sp_port, trans, + attr->orig_dev, + attr->u.mrouter); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static int +mlxsw_sp_port_vlan_fid_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan, + struct mlxsw_sp_bridge_port *bridge_port) +{ + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port; + struct mlxsw_sp_bridge_device *bridge_device; + u8 local_port = mlxsw_sp_port->local_port; + u16 vid = mlxsw_sp_port_vlan->vid; + struct mlxsw_sp_fid *fid; + int err; + + bridge_device = bridge_port->bridge_device; + fid = bridge_device->ops->fid_get(bridge_device, vid); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + err = mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_UC, local_port, + bridge_port->flags & BR_FLOOD); + if (err) + goto err_fid_uc_flood_set; + + err = mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_MC, local_port, + mlxsw_sp_mc_flood(bridge_port)); + if (err) + goto err_fid_mc_flood_set; + + err = mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_BC, local_port, + true); + if (err) + goto err_fid_bc_flood_set; + + err = mlxsw_sp_fid_port_vid_map(fid, mlxsw_sp_port, vid); + if (err) + goto err_fid_port_vid_map; + + mlxsw_sp_port_vlan->fid = fid; + + return 0; + +err_fid_port_vid_map: + mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_BC, local_port, false); +err_fid_bc_flood_set: + mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_MC, local_port, false); +err_fid_mc_flood_set: + mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_UC, local_port, false); +err_fid_uc_flood_set: + mlxsw_sp_fid_put(fid); + return err; +} + +static void +mlxsw_sp_port_vlan_fid_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan) +{ + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port; + struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid; + u8 local_port = mlxsw_sp_port->local_port; + u16 vid = mlxsw_sp_port_vlan->vid; + + mlxsw_sp_port_vlan->fid = NULL; + mlxsw_sp_fid_port_vid_unmap(fid, mlxsw_sp_port, vid); + mlxsw_sp_fid_flood_set(fid, 
MLXSW_SP_FLOOD_TYPE_BC, local_port, false); + mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_MC, local_port, false); + mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_UC, local_port, false); + mlxsw_sp_fid_put(fid); +} + +static u16 +mlxsw_sp_port_pvid_determine(const struct mlxsw_sp_port *mlxsw_sp_port, + u16 vid, bool is_pvid) +{ + if (is_pvid) + return vid; + else if (mlxsw_sp_port->pvid == vid) + return 0; /* Dis-allow untagged packets */ + else + return mlxsw_sp_port->pvid; +} + +static int +mlxsw_sp_port_vlan_bridge_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan, + struct mlxsw_sp_bridge_port *bridge_port) +{ + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port; + struct mlxsw_sp_bridge_vlan *bridge_vlan; + u16 vid = mlxsw_sp_port_vlan->vid; + int err; + + /* No need to continue if only VLAN flags were changed */ + if (mlxsw_sp_port_vlan->bridge_port) + return 0; + + err = mlxsw_sp_port_vlan_fid_join(mlxsw_sp_port_vlan, bridge_port); + if (err) + return err; + + err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, + bridge_port->flags & BR_LEARNING); + if (err) + goto err_port_vid_learning_set; + + err = mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, + bridge_port->stp_state); + if (err) + goto err_port_vid_stp_set; + + bridge_vlan = mlxsw_sp_bridge_vlan_get(bridge_port, vid); + if (!bridge_vlan) { + err = -ENOMEM; + goto err_bridge_vlan_get; + } + + list_add(&mlxsw_sp_port_vlan->bridge_vlan_node, + &bridge_vlan->port_vlan_list); + + mlxsw_sp_bridge_port_get(mlxsw_sp_port->mlxsw_sp->bridge, + bridge_port->dev); + mlxsw_sp_port_vlan->bridge_port = bridge_port; + + return 0; + +err_bridge_vlan_get: + mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, BR_STATE_DISABLED); +err_port_vid_stp_set: + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, false); +err_port_vid_learning_set: + mlxsw_sp_port_vlan_fid_leave(mlxsw_sp_port_vlan); + return err; +} + +void +mlxsw_sp_port_vlan_bridge_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan) +{ + struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port; + struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid; + struct mlxsw_sp_bridge_vlan *bridge_vlan; + struct mlxsw_sp_bridge_port *bridge_port; + u16 vid = mlxsw_sp_port_vlan->vid; + bool last_port, last_vlan; + + if (WARN_ON(mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_8021Q && + mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_8021D)) + return; + + bridge_port = mlxsw_sp_port_vlan->bridge_port; + last_vlan = list_is_singular(&bridge_port->vlans_list); + bridge_vlan = mlxsw_sp_bridge_vlan_find(bridge_port, vid); + last_port = list_is_singular(&bridge_vlan->port_vlan_list); + + list_del(&mlxsw_sp_port_vlan->bridge_vlan_node); + mlxsw_sp_bridge_vlan_put(bridge_vlan); + mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, BR_STATE_DISABLED); + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, false); + if (last_port) + mlxsw_sp_bridge_port_fdb_flush(mlxsw_sp_port->mlxsw_sp, + bridge_port, + mlxsw_sp_fid_index(fid)); + if (last_vlan) + mlxsw_sp_bridge_port_mdb_flush(mlxsw_sp_port, bridge_port); + + mlxsw_sp_port_vlan_fid_leave(mlxsw_sp_port_vlan); + + mlxsw_sp_bridge_port_put(mlxsw_sp_port->mlxsw_sp->bridge, bridge_port); + mlxsw_sp_port_vlan->bridge_port = NULL; +} + +static int +mlxsw_sp_bridge_port_vlan_add(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port, + u16 vid, bool is_untagged, bool is_pvid) +{ + u16 pvid = mlxsw_sp_port_pvid_determine(mlxsw_sp_port, vid, is_pvid); + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + u16 old_pvid = 
mlxsw_sp_port->pvid; + int err; + + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_get(mlxsw_sp_port, vid); + if (IS_ERR(mlxsw_sp_port_vlan)) + return PTR_ERR(mlxsw_sp_port_vlan); + + err = mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, true, + is_untagged); + if (err) + goto err_port_vlan_set; + + err = mlxsw_sp_port_pvid_set(mlxsw_sp_port, pvid); + if (err) + goto err_port_pvid_set; + + err = mlxsw_sp_port_vlan_bridge_join(mlxsw_sp_port_vlan, bridge_port); + if (err) + goto err_port_vlan_bridge_join; + + return 0; + +err_port_vlan_bridge_join: + mlxsw_sp_port_pvid_set(mlxsw_sp_port, old_pvid); +err_port_pvid_set: + mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, false, false); +err_port_vlan_set: + mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan); + return err; +} + +static int mlxsw_sp_port_vlans_add(struct mlxsw_sp_port *mlxsw_sp_port, + const struct switchdev_obj_port_vlan *vlan, + struct switchdev_trans *trans) +{ + bool flag_untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; + bool flag_pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct net_device *orig_dev = vlan->obj.orig_dev; + struct mlxsw_sp_bridge_port *bridge_port; + u16 vid; + + if (switchdev_trans_ph_prepare(trans)) + return 0; + + bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev); + if (WARN_ON(!bridge_port)) + return -EINVAL; + + if (!bridge_port->bridge_device->vlan_enabled) + return 0; + + for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) { + int err; + + err = mlxsw_sp_bridge_port_vlan_add(mlxsw_sp_port, bridge_port, + vid, flag_untagged, + flag_pvid); + if (err) + return err; + } + + return 0; +} + +static enum mlxsw_reg_sfdf_flush_type mlxsw_sp_fdb_flush_type(bool lagged) +{ + return lagged ? MLXSW_REG_SFDF_FLUSH_PER_LAG_AND_FID : + MLXSW_REG_SFDF_FLUSH_PER_PORT_AND_FID; +} + +static int +mlxsw_sp_bridge_port_fdb_flush(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_bridge_port *bridge_port, + u16 fid_index) +{ + bool lagged = bridge_port->lagged; + char sfdf_pl[MLXSW_REG_SFDF_LEN]; + u16 system_port; + + system_port = lagged ? bridge_port->lag_id : bridge_port->system_port; + mlxsw_reg_sfdf_pack(sfdf_pl, mlxsw_sp_fdb_flush_type(lagged)); + mlxsw_reg_sfdf_fid_set(sfdf_pl, fid_index); + mlxsw_reg_sfdf_port_fid_system_port_set(sfdf_pl, system_port); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfdf), sfdf_pl); +} + +static enum mlxsw_reg_sfd_rec_policy mlxsw_sp_sfd_rec_policy(bool dynamic) +{ + return dynamic ? MLXSW_REG_SFD_REC_POLICY_DYNAMIC_ENTRY_INGRESS : + MLXSW_REG_SFD_REC_POLICY_STATIC_ENTRY; +} + +static enum mlxsw_reg_sfd_op mlxsw_sp_sfd_op(bool adding) +{ + return adding ? 
MLXSW_REG_SFD_OP_WRITE_EDIT : + MLXSW_REG_SFD_OP_WRITE_REMOVE; +} + +static int __mlxsw_sp_port_fdb_uc_op(struct mlxsw_sp *mlxsw_sp, u8 local_port, + const char *mac, u16 fid, bool adding, + enum mlxsw_reg_sfd_rec_action action, + bool dynamic) +{ + char *sfd_pl; + u8 num_rec; + int err; + + sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL); + if (!sfd_pl) + return -ENOMEM; + + mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0); + mlxsw_reg_sfd_uc_pack(sfd_pl, 0, mlxsw_sp_sfd_rec_policy(dynamic), + mac, fid, action, local_port); + num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl); + if (err) + goto out; + + if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl)) + err = -EBUSY; + +out: + kfree(sfd_pl); + return err; +} + +static int mlxsw_sp_port_fdb_uc_op(struct mlxsw_sp *mlxsw_sp, u8 local_port, + const char *mac, u16 fid, bool adding, + bool dynamic) +{ + return __mlxsw_sp_port_fdb_uc_op(mlxsw_sp, local_port, mac, fid, adding, + MLXSW_REG_SFD_REC_ACTION_NOP, dynamic); +} + +int mlxsw_sp_rif_fdb_op(struct mlxsw_sp *mlxsw_sp, const char *mac, u16 fid, + bool adding) +{ + return __mlxsw_sp_port_fdb_uc_op(mlxsw_sp, 0, mac, fid, adding, + MLXSW_REG_SFD_REC_ACTION_FORWARD_IP_ROUTER, + false); +} + +static int mlxsw_sp_port_fdb_uc_lag_op(struct mlxsw_sp *mlxsw_sp, u16 lag_id, + const char *mac, u16 fid, u16 lag_vid, + bool adding, bool dynamic) +{ + char *sfd_pl; + u8 num_rec; + int err; + + sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL); + if (!sfd_pl) + return -ENOMEM; + + mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0); + mlxsw_reg_sfd_uc_lag_pack(sfd_pl, 0, mlxsw_sp_sfd_rec_policy(dynamic), + mac, fid, MLXSW_REG_SFD_REC_ACTION_NOP, + lag_vid, lag_id); + num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl); + if (err) + goto out; + + if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl)) + err = -EBUSY; + +out: + kfree(sfd_pl); + return err; +} + +static int +mlxsw_sp_port_fdb_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_notifier_fdb_info *fdb_info, bool adding) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct net_device *orig_dev = fdb_info->info.dev; + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_bridge_port *bridge_port; + u16 fid_index, vid; + + bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev); + if (!bridge_port) + return -EINVAL; + + bridge_device = bridge_port->bridge_device; + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_bridge(mlxsw_sp_port, + bridge_device, + fdb_info->vid); + if (!mlxsw_sp_port_vlan) + return 0; + + fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid); + vid = mlxsw_sp_port_vlan->vid; + + if (!bridge_port->lagged) + return mlxsw_sp_port_fdb_uc_op(mlxsw_sp, + bridge_port->system_port, + fdb_info->addr, fid_index, + adding, false); + else + return mlxsw_sp_port_fdb_uc_lag_op(mlxsw_sp, + bridge_port->lag_id, + fdb_info->addr, fid_index, + vid, adding, false); +} + +static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr, + u16 fid, u16 mid_idx, bool adding) +{ + char *sfd_pl; + u8 num_rec; + int err; + + sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL); + if (!sfd_pl) + return -ENOMEM; + + mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0); + mlxsw_reg_sfd_mc_pack(sfd_pl, 0, addr, fid, + MLXSW_REG_SFD_REC_ACTION_NOP, mid_idx); + num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl); + err = 
mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl); + if (err) + goto out; + + if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl)) + err = -EBUSY; + +out: + kfree(sfd_pl); + return err; +} + +static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx, + long *ports_bitmap, + bool set_router_port) +{ + char *smid_pl; + int err, i; + + smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL); + if (!smid_pl) + return -ENOMEM; + + mlxsw_reg_smid_pack(smid_pl, mid_idx, 0, false); + for (i = 1; i < mlxsw_core_max_ports(mlxsw_sp->core); i++) { + if (mlxsw_sp->ports[i]) + mlxsw_reg_smid_port_mask_set(smid_pl, i, 1); + } + + mlxsw_reg_smid_port_mask_set(smid_pl, + mlxsw_sp_router_port(mlxsw_sp), 1); + + for_each_set_bit(i, ports_bitmap, mlxsw_core_max_ports(mlxsw_sp->core)) + mlxsw_reg_smid_port_set(smid_pl, i, 1); + + mlxsw_reg_smid_port_set(smid_pl, mlxsw_sp_router_port(mlxsw_sp), + set_router_port); + + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl); + kfree(smid_pl); + return err; +} + +static int mlxsw_sp_port_smid_set(struct mlxsw_sp_port *mlxsw_sp_port, + u16 mid_idx, bool add) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char *smid_pl; + int err; + + smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL); + if (!smid_pl) + return -ENOMEM; + + mlxsw_reg_smid_pack(smid_pl, mid_idx, mlxsw_sp_port->local_port, add); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl); + kfree(smid_pl); + return err; +} + +static struct +mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp_bridge_device *bridge_device, + const unsigned char *addr, + u16 fid) +{ + struct mlxsw_sp_mid *mid; + + list_for_each_entry(mid, &bridge_device->mids_list, list) { + if (ether_addr_equal(mid->addr, addr) && mid->fid == fid) + return mid; + } + return NULL; +} + +static void +mlxsw_sp_bridge_port_get_ports_bitmap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_bridge_port *bridge_port, + unsigned long *ports_bitmap) +{ + struct mlxsw_sp_port *mlxsw_sp_port; + u64 max_lag_members, i; + int lag_id; + + if (!bridge_port->lagged) { + set_bit(bridge_port->system_port, ports_bitmap); + } else { + max_lag_members = MLXSW_CORE_RES_GET(mlxsw_sp->core, + MAX_LAG_MEMBERS); + lag_id = bridge_port->lag_id; + for (i = 0; i < max_lag_members; i++) { + mlxsw_sp_port = mlxsw_sp_port_lagged_get(mlxsw_sp, + lag_id, i); + if (mlxsw_sp_port) + set_bit(mlxsw_sp_port->local_port, + ports_bitmap); + } + } +} + +static void +mlxsw_sp_mc_get_mrouters_bitmap(unsigned long *flood_bitmap, + struct mlxsw_sp_bridge_device *bridge_device, + struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_bridge_port *bridge_port; + + list_for_each_entry(bridge_port, &bridge_device->ports_list, list) { + if (bridge_port->mrouter) { + mlxsw_sp_bridge_port_get_ports_bitmap(mlxsw_sp, + bridge_port, + flood_bitmap); + } + } +} + +static bool +mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mid *mid, + struct mlxsw_sp_bridge_device *bridge_device) +{ + long *flood_bitmap; + int num_of_ports; + int alloc_size; + u16 mid_idx; + int err; + + mid_idx = find_first_zero_bit(mlxsw_sp->bridge->mids_bitmap, + MLXSW_SP_MID_MAX); + if (mid_idx == MLXSW_SP_MID_MAX) + return false; + + num_of_ports = mlxsw_core_max_ports(mlxsw_sp->core); + alloc_size = sizeof(long) * BITS_TO_LONGS(num_of_ports); + flood_bitmap = kzalloc(alloc_size, GFP_KERNEL); + if (!flood_bitmap) + return false; + + bitmap_copy(flood_bitmap, mid->ports_in_mid, num_of_ports); + mlxsw_sp_mc_get_mrouters_bitmap(flood_bitmap, bridge_device, mlxsw_sp); + 
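+	/* The flood bitmap is the union of the group's member ports and all
+	 * mrouter ports in the bridge. Program it into the free SMID index
+	 * found above and only then add the SFD MC record pointing at it.
+	 */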
+ mid->mid = mid_idx; + err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap, + bridge_device->mrouter); + kfree(flood_bitmap); + if (err) + return false; + + err = mlxsw_sp_port_mdb_op(mlxsw_sp, mid->addr, mid->fid, mid_idx, + true); + if (err) + return false; + + set_bit(mid_idx, mlxsw_sp->bridge->mids_bitmap); + mid->in_hw = true; + return true; +} + +static int mlxsw_sp_mc_remove_mdb_entry(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mid *mid) +{ + if (!mid->in_hw) + return 0; + + clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap); + mid->in_hw = false; + return mlxsw_sp_port_mdb_op(mlxsw_sp, mid->addr, mid->fid, mid->mid, + false); +} + +static struct +mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_bridge_device *bridge_device, + const unsigned char *addr, + u16 fid) +{ + struct mlxsw_sp_mid *mid; + size_t alloc_size; + + mid = kzalloc(sizeof(*mid), GFP_KERNEL); + if (!mid) + return NULL; + + alloc_size = sizeof(unsigned long) * + BITS_TO_LONGS(mlxsw_core_max_ports(mlxsw_sp->core)); + + mid->ports_in_mid = kzalloc(alloc_size, GFP_KERNEL); + if (!mid->ports_in_mid) + goto err_ports_in_mid_alloc; + + ether_addr_copy(mid->addr, addr); + mid->fid = fid; + mid->in_hw = false; + + if (!bridge_device->multicast_enabled) + goto out; + + if (!mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid, bridge_device)) + goto err_write_mdb_entry; + +out: + list_add_tail(&mid->list, &bridge_device->mids_list); + return mid; + +err_write_mdb_entry: + kfree(mid->ports_in_mid); +err_ports_in_mid_alloc: + kfree(mid); + return NULL; +} + +static int mlxsw_sp_port_remove_from_mid(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mid *mid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + int err = 0; + + clear_bit(mlxsw_sp_port->local_port, mid->ports_in_mid); + if (bitmap_empty(mid->ports_in_mid, + mlxsw_core_max_ports(mlxsw_sp->core))) { + err = mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid); + list_del(&mid->list); + kfree(mid->ports_in_mid); + kfree(mid); + } + return err; +} + +static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, + const struct switchdev_obj_port_mdb *mdb, + struct switchdev_trans *trans) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct net_device *orig_dev = mdb->obj.orig_dev; + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + struct net_device *dev = mlxsw_sp_port->dev; + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_bridge_port *bridge_port; + struct mlxsw_sp_mid *mid; + u16 fid_index; + int err = 0; + + if (switchdev_trans_ph_prepare(trans)) + return 0; + + bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev); + if (!bridge_port) + return 0; + + bridge_device = bridge_port->bridge_device; + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_bridge(mlxsw_sp_port, + bridge_device, + mdb->vid); + if (!mlxsw_sp_port_vlan) + return 0; + + fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid); + + mid = __mlxsw_sp_mc_get(bridge_device, mdb->addr, fid_index); + if (!mid) { + mid = __mlxsw_sp_mc_alloc(mlxsw_sp, bridge_device, mdb->addr, + fid_index); + if (!mid) { + netdev_err(dev, "Unable to allocate MC group\n"); + return -ENOMEM; + } + } + set_bit(mlxsw_sp_port->local_port, mid->ports_in_mid); + + if (!bridge_device->multicast_enabled) + return 0; + + if (bridge_port->mrouter) + return 0; + + err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true); + if (err) { + netdev_err(dev, "Unable to set SMID\n"); + goto err_out; + } + + return 0; + +err_out: + 
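+	/* Drop the bit set above; if this port was the group's only member,
+	 * the MDB entry itself is removed from the device and freed as well.
+	 */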
mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid); + return err; +} + +static void +mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_device + *bridge_device) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_mid *mid; + bool mc_enabled; + + mc_enabled = bridge_device->multicast_enabled; + + list_for_each_entry(mid, &bridge_device->mids_list, list) { + if (mc_enabled) + mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid, + bridge_device); + else + mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid); + } +} + +static void +mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port, + bool add) +{ + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_mid *mid; + + bridge_device = bridge_port->bridge_device; + + list_for_each_entry(mid, &bridge_device->mids_list, list) { + if (!test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid)) + mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, add); + } +} + +static int mlxsw_sp_port_obj_add(struct net_device *dev, + const struct switchdev_obj *obj, + struct switchdev_trans *trans) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + int err = 0; + + switch (obj->id) { + case SWITCHDEV_OBJ_ID_PORT_VLAN: + err = mlxsw_sp_port_vlans_add(mlxsw_sp_port, + SWITCHDEV_OBJ_PORT_VLAN(obj), + trans); + break; + case SWITCHDEV_OBJ_ID_PORT_MDB: + err = mlxsw_sp_port_mdb_add(mlxsw_sp_port, + SWITCHDEV_OBJ_PORT_MDB(obj), + trans); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static void +mlxsw_sp_bridge_port_vlan_del(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port, u16 vid) +{ + u16 pvid = mlxsw_sp_port->pvid == vid ? 0 : vid; + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid); + if (WARN_ON(!mlxsw_sp_port_vlan)) + return; + + mlxsw_sp_port_vlan_bridge_leave(mlxsw_sp_port_vlan); + mlxsw_sp_port_pvid_set(mlxsw_sp_port, pvid); + mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, false, false); + mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan); +} + +static int mlxsw_sp_port_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port, + const struct switchdev_obj_port_vlan *vlan) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct net_device *orig_dev = vlan->obj.orig_dev; + struct mlxsw_sp_bridge_port *bridge_port; + u16 vid; + + bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev); + if (WARN_ON(!bridge_port)) + return -EINVAL; + + if (!bridge_port->bridge_device->vlan_enabled) + return 0; + + for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) + mlxsw_sp_bridge_port_vlan_del(mlxsw_sp_port, bridge_port, vid); + + return 0; +} + +static int +__mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port, + struct mlxsw_sp_mid *mid) +{ + struct net_device *dev = mlxsw_sp_port->dev; + int err; + + if (bridge_port->bridge_device->multicast_enabled && + !bridge_port->mrouter) { + err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false); + if (err) + netdev_err(dev, "Unable to remove port from SMID\n"); + } + + err = mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid); + if (err) + netdev_err(dev, "Unable to remove MC SFD\n"); + + return err; +} + +static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, + const struct switchdev_obj_port_mdb *mdb) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct net_device *orig_dev = 
mdb->obj.orig_dev; + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + struct mlxsw_sp_bridge_device *bridge_device; + struct net_device *dev = mlxsw_sp_port->dev; + struct mlxsw_sp_bridge_port *bridge_port; + struct mlxsw_sp_mid *mid; + u16 fid_index; + + bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev); + if (!bridge_port) + return 0; + + bridge_device = bridge_port->bridge_device; + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_bridge(mlxsw_sp_port, + bridge_device, + mdb->vid); + if (!mlxsw_sp_port_vlan) + return 0; + + fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid); + + mid = __mlxsw_sp_mc_get(bridge_device, mdb->addr, fid_index); + if (!mid) { + netdev_err(dev, "Unable to remove port from MC DB\n"); + return -EINVAL; + } + + return __mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port, mid); +} + +static void +mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port) +{ + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_mid *mid, *tmp; + + bridge_device = bridge_port->bridge_device; + + list_for_each_entry_safe(mid, tmp, &bridge_device->mids_list, list) { + if (test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid)) { + __mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port, + mid); + } else if (bridge_device->multicast_enabled && + bridge_port->mrouter) { + mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false); + } + } +} + +static int mlxsw_sp_port_obj_del(struct net_device *dev, + const struct switchdev_obj *obj) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + int err = 0; + + switch (obj->id) { + case SWITCHDEV_OBJ_ID_PORT_VLAN: + err = mlxsw_sp_port_vlans_del(mlxsw_sp_port, + SWITCHDEV_OBJ_PORT_VLAN(obj)); + break; + case SWITCHDEV_OBJ_ID_PORT_MDB: + err = mlxsw_sp_port_mdb_del(mlxsw_sp_port, + SWITCHDEV_OBJ_PORT_MDB(obj)); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static struct mlxsw_sp_port *mlxsw_sp_lag_rep_port(struct mlxsw_sp *mlxsw_sp, + u16 lag_id) +{ + struct mlxsw_sp_port *mlxsw_sp_port; + u64 max_lag_members; + int i; + + max_lag_members = MLXSW_CORE_RES_GET(mlxsw_sp->core, + MAX_LAG_MEMBERS); + for (i = 0; i < max_lag_members; i++) { + mlxsw_sp_port = mlxsw_sp_port_lagged_get(mlxsw_sp, lag_id, i); + if (mlxsw_sp_port) + return mlxsw_sp_port; + } + return NULL; +} + +static const struct switchdev_ops mlxsw_sp_port_switchdev_ops = { + .switchdev_port_attr_get = mlxsw_sp_port_attr_get, + .switchdev_port_attr_set = mlxsw_sp_port_attr_set, + .switchdev_port_obj_add = mlxsw_sp_port_obj_add, + .switchdev_port_obj_del = mlxsw_sp_port_obj_del, +}; + +static int +mlxsw_sp_bridge_8021q_port_join(struct mlxsw_sp_bridge_device *bridge_device, + struct mlxsw_sp_bridge_port *bridge_port, + struct mlxsw_sp_port *mlxsw_sp_port, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + if (is_vlan_dev(bridge_port->dev)) { + NL_SET_ERR_MSG_MOD(extack, "Can not enslave a VLAN device to a VLAN-aware bridge"); + return -EINVAL; + } + + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, 1); + if (WARN_ON(!mlxsw_sp_port_vlan)) + return -EINVAL; + + /* Let VLAN-aware bridge take care of its own VLANs */ + mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan); + + return 0; +} + +static void +mlxsw_sp_bridge_8021q_port_leave(struct mlxsw_sp_bridge_device *bridge_device, + struct mlxsw_sp_bridge_port *bridge_port, + struct mlxsw_sp_port *mlxsw_sp_port) +{ + mlxsw_sp_port_vlan_get(mlxsw_sp_port, 1); + /* Make sure 
untagged frames are allowed to ingress */ + mlxsw_sp_port_pvid_set(mlxsw_sp_port, 1); +} + +static struct mlxsw_sp_fid * +mlxsw_sp_bridge_8021q_fid_get(struct mlxsw_sp_bridge_device *bridge_device, + u16 vid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(bridge_device->dev); + + return mlxsw_sp_fid_8021q_get(mlxsw_sp, vid); +} + +static const struct mlxsw_sp_bridge_ops mlxsw_sp_bridge_8021q_ops = { + .port_join = mlxsw_sp_bridge_8021q_port_join, + .port_leave = mlxsw_sp_bridge_8021q_port_leave, + .fid_get = mlxsw_sp_bridge_8021q_fid_get, +}; + +static bool +mlxsw_sp_port_is_br_member(const struct mlxsw_sp_port *mlxsw_sp_port, + const struct net_device *br_dev) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + + list_for_each_entry(mlxsw_sp_port_vlan, &mlxsw_sp_port->vlans_list, + list) { + if (mlxsw_sp_port_vlan->bridge_port && + mlxsw_sp_port_vlan->bridge_port->bridge_device->dev == + br_dev) + return true; + } + + return false; +} + +static int +mlxsw_sp_bridge_8021d_port_join(struct mlxsw_sp_bridge_device *bridge_device, + struct mlxsw_sp_bridge_port *bridge_port, + struct mlxsw_sp_port *mlxsw_sp_port, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + struct net_device *dev = bridge_port->dev; + u16 vid; + + vid = is_vlan_dev(dev) ? vlan_dev_vlan_id(dev) : 1; + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid); + if (WARN_ON(!mlxsw_sp_port_vlan)) + return -EINVAL; + + if (mlxsw_sp_port_is_br_member(mlxsw_sp_port, bridge_device->dev)) { + NL_SET_ERR_MSG_MOD(extack, "Can not bridge VLAN uppers of the same port"); + return -EINVAL; + } + + /* Port is no longer usable as a router interface */ + if (mlxsw_sp_port_vlan->fid) + mlxsw_sp_port_vlan_router_leave(mlxsw_sp_port_vlan); + + return mlxsw_sp_port_vlan_bridge_join(mlxsw_sp_port_vlan, bridge_port); +} + +static void +mlxsw_sp_bridge_8021d_port_leave(struct mlxsw_sp_bridge_device *bridge_device, + struct mlxsw_sp_bridge_port *bridge_port, + struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + struct net_device *dev = bridge_port->dev; + u16 vid; + + vid = is_vlan_dev(dev) ? 
vlan_dev_vlan_id(dev) : 1; + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid); + if (WARN_ON(!mlxsw_sp_port_vlan)) + return; + + mlxsw_sp_port_vlan_bridge_leave(mlxsw_sp_port_vlan); +} + +static struct mlxsw_sp_fid * +mlxsw_sp_bridge_8021d_fid_get(struct mlxsw_sp_bridge_device *bridge_device, + u16 vid) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(bridge_device->dev); + + return mlxsw_sp_fid_8021d_get(mlxsw_sp, bridge_device->dev->ifindex); +} + +static const struct mlxsw_sp_bridge_ops mlxsw_sp_bridge_8021d_ops = { + .port_join = mlxsw_sp_bridge_8021d_port_join, + .port_leave = mlxsw_sp_bridge_8021d_port_leave, + .fid_get = mlxsw_sp_bridge_8021d_fid_get, +}; + +int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port, + struct net_device *brport_dev, + struct net_device *br_dev, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_bridge_port *bridge_port; + int err; + + bridge_port = mlxsw_sp_bridge_port_get(mlxsw_sp->bridge, brport_dev); + if (IS_ERR(bridge_port)) + return PTR_ERR(bridge_port); + bridge_device = bridge_port->bridge_device; + + err = bridge_device->ops->port_join(bridge_device, bridge_port, + mlxsw_sp_port, extack); + if (err) + goto err_port_join; + + return 0; + +err_port_join: + mlxsw_sp_bridge_port_put(mlxsw_sp->bridge, bridge_port); + return err; +} + +void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port, + struct net_device *brport_dev, + struct net_device *br_dev) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_bridge_port *bridge_port; + + bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev); + if (!bridge_device) + return; + bridge_port = __mlxsw_sp_bridge_port_find(bridge_device, brport_dev); + if (!bridge_port) + return; + + bridge_device->ops->port_leave(bridge_device, bridge_port, + mlxsw_sp_port); + mlxsw_sp_bridge_port_put(mlxsw_sp->bridge, bridge_port); +} + +static void +mlxsw_sp_fdb_call_notifiers(enum switchdev_notifier_type type, + const char *mac, u16 vid, + struct net_device *dev) +{ + struct switchdev_notifier_fdb_info info; + + info.addr = mac; + info.vid = vid; + call_switchdev_notifiers(type, dev, &info.info); +} + +static void mlxsw_sp_fdb_notify_mac_process(struct mlxsw_sp *mlxsw_sp, + char *sfn_pl, int rec_index, + bool adding) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_bridge_port *bridge_port; + struct mlxsw_sp_port *mlxsw_sp_port; + enum switchdev_notifier_type type; + char mac[ETH_ALEN]; + u8 local_port; + u16 vid, fid; + bool do_notification = true; + int err; + + mlxsw_reg_sfn_mac_unpack(sfn_pl, rec_index, mac, &fid, &local_port); + mlxsw_sp_port = mlxsw_sp->ports[local_port]; + if (!mlxsw_sp_port) { + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Incorrect local port in FDB notification\n"); + goto just_remove; + } + + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_fid(mlxsw_sp_port, fid); + if (!mlxsw_sp_port_vlan) { + netdev_err(mlxsw_sp_port->dev, "Failed to find a matching {Port, VID} following FDB notification\n"); + goto just_remove; + } + + bridge_port = mlxsw_sp_port_vlan->bridge_port; + if (!bridge_port) { + netdev_err(mlxsw_sp_port->dev, "{Port, VID} not associated with a bridge\n"); + goto just_remove; + } + + bridge_device = bridge_port->bridge_device; + vid = bridge_device->vlan_enabled ? 
mlxsw_sp_port_vlan->vid : 0; + +do_fdb_op: + err = mlxsw_sp_port_fdb_uc_op(mlxsw_sp, local_port, mac, fid, + adding, true); + if (err) { + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to set FDB entry\n"); + return; + } + + if (!do_notification) + return; + type = adding ? SWITCHDEV_FDB_ADD_TO_BRIDGE : SWITCHDEV_FDB_DEL_TO_BRIDGE; + mlxsw_sp_fdb_call_notifiers(type, mac, vid, bridge_port->dev); + + return; + +just_remove: + adding = false; + do_notification = false; + goto do_fdb_op; +} + +static void mlxsw_sp_fdb_notify_mac_lag_process(struct mlxsw_sp *mlxsw_sp, + char *sfn_pl, int rec_index, + bool adding) +{ + struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_bridge_port *bridge_port; + struct mlxsw_sp_port *mlxsw_sp_port; + enum switchdev_notifier_type type; + char mac[ETH_ALEN]; + u16 lag_vid = 0; + u16 lag_id; + u16 vid, fid; + bool do_notification = true; + int err; + + mlxsw_reg_sfn_mac_lag_unpack(sfn_pl, rec_index, mac, &fid, &lag_id); + mlxsw_sp_port = mlxsw_sp_lag_rep_port(mlxsw_sp, lag_id); + if (!mlxsw_sp_port) { + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Cannot find port representor for LAG\n"); + goto just_remove; + } + + mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_fid(mlxsw_sp_port, fid); + if (!mlxsw_sp_port_vlan) { + netdev_err(mlxsw_sp_port->dev, "Failed to find a matching {Port, VID} following FDB notification\n"); + goto just_remove; + } + + bridge_port = mlxsw_sp_port_vlan->bridge_port; + if (!bridge_port) { + netdev_err(mlxsw_sp_port->dev, "{Port, VID} not associated with a bridge\n"); + goto just_remove; + } + + bridge_device = bridge_port->bridge_device; + vid = bridge_device->vlan_enabled ? mlxsw_sp_port_vlan->vid : 0; + lag_vid = mlxsw_sp_port_vlan->vid; + +do_fdb_op: + err = mlxsw_sp_port_fdb_uc_lag_op(mlxsw_sp, lag_id, mac, fid, lag_vid, + adding, true); + if (err) { + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to set FDB entry\n"); + return; + } + + if (!do_notification) + return; + type = adding ? 
SWITCHDEV_FDB_ADD_TO_BRIDGE : SWITCHDEV_FDB_DEL_TO_BRIDGE; + mlxsw_sp_fdb_call_notifiers(type, mac, vid, bridge_port->dev); + + return; + +just_remove: + adding = false; + do_notification = false; + goto do_fdb_op; +} + +static void mlxsw_sp_fdb_notify_rec_process(struct mlxsw_sp *mlxsw_sp, + char *sfn_pl, int rec_index) +{ + switch (mlxsw_reg_sfn_rec_type_get(sfn_pl, rec_index)) { + case MLXSW_REG_SFN_REC_TYPE_LEARNED_MAC: + mlxsw_sp_fdb_notify_mac_process(mlxsw_sp, sfn_pl, + rec_index, true); + break; + case MLXSW_REG_SFN_REC_TYPE_AGED_OUT_MAC: + mlxsw_sp_fdb_notify_mac_process(mlxsw_sp, sfn_pl, + rec_index, false); + break; + case MLXSW_REG_SFN_REC_TYPE_LEARNED_MAC_LAG: + mlxsw_sp_fdb_notify_mac_lag_process(mlxsw_sp, sfn_pl, + rec_index, true); + break; + case MLXSW_REG_SFN_REC_TYPE_AGED_OUT_MAC_LAG: + mlxsw_sp_fdb_notify_mac_lag_process(mlxsw_sp, sfn_pl, + rec_index, false); + break; + } +} + +static void mlxsw_sp_fdb_notify_work_schedule(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_bridge *bridge = mlxsw_sp->bridge; + + mlxsw_core_schedule_dw(&bridge->fdb_notify.dw, + msecs_to_jiffies(bridge->fdb_notify.interval)); +} + +static void mlxsw_sp_fdb_notify_work(struct work_struct *work) +{ + struct mlxsw_sp_bridge *bridge; + struct mlxsw_sp *mlxsw_sp; + char *sfn_pl; + u8 num_rec; + int i; + int err; + + sfn_pl = kmalloc(MLXSW_REG_SFN_LEN, GFP_KERNEL); + if (!sfn_pl) + return; + + bridge = container_of(work, struct mlxsw_sp_bridge, fdb_notify.dw.work); + mlxsw_sp = bridge->mlxsw_sp; + + rtnl_lock(); + mlxsw_reg_sfn_pack(sfn_pl); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(sfn), sfn_pl); + if (err) { + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to get FDB notifications\n"); + goto out; + } + num_rec = mlxsw_reg_sfn_num_rec_get(sfn_pl); + for (i = 0; i < num_rec; i++) + mlxsw_sp_fdb_notify_rec_process(mlxsw_sp, sfn_pl, i); + +out: + rtnl_unlock(); + kfree(sfn_pl); + mlxsw_sp_fdb_notify_work_schedule(mlxsw_sp); +} + +struct mlxsw_sp_switchdev_event_work { + struct work_struct work; + struct switchdev_notifier_fdb_info fdb_info; + struct net_device *dev; + unsigned long event; +}; + +static void mlxsw_sp_switchdev_event_work(struct work_struct *work) +{ + struct mlxsw_sp_switchdev_event_work *switchdev_work = + container_of(work, struct mlxsw_sp_switchdev_event_work, work); + struct net_device *dev = switchdev_work->dev; + struct switchdev_notifier_fdb_info *fdb_info; + struct mlxsw_sp_port *mlxsw_sp_port; + int err; + + rtnl_lock(); + mlxsw_sp_port = mlxsw_sp_port_dev_lower_find(dev); + if (!mlxsw_sp_port) + goto out; + + switch (switchdev_work->event) { + case SWITCHDEV_FDB_ADD_TO_DEVICE: + fdb_info = &switchdev_work->fdb_info; + err = mlxsw_sp_port_fdb_set(mlxsw_sp_port, fdb_info, true); + if (err) + break; + mlxsw_sp_fdb_call_notifiers(SWITCHDEV_FDB_OFFLOADED, + fdb_info->addr, + fdb_info->vid, dev); + break; + case SWITCHDEV_FDB_DEL_TO_DEVICE: + fdb_info = &switchdev_work->fdb_info; + mlxsw_sp_port_fdb_set(mlxsw_sp_port, fdb_info, false); + break; + } + +out: + rtnl_unlock(); + kfree(switchdev_work->fdb_info.addr); + kfree(switchdev_work); + dev_put(dev); +} + +/* Called under rcu_read_lock() */ +static int mlxsw_sp_switchdev_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); + struct mlxsw_sp_switchdev_event_work *switchdev_work; + struct switchdev_notifier_fdb_info *fdb_info = ptr; + + if (!mlxsw_sp_port_dev_lower_find_rcu(dev)) + return NOTIFY_DONE; + + switchdev_work = 
kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); + if (!switchdev_work) + return NOTIFY_BAD; + + INIT_WORK(&switchdev_work->work, mlxsw_sp_switchdev_event_work); + switchdev_work->dev = dev; + switchdev_work->event = event; + + switch (event) { + case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */ + case SWITCHDEV_FDB_DEL_TO_DEVICE: + memcpy(&switchdev_work->fdb_info, ptr, + sizeof(switchdev_work->fdb_info)); + switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC); + if (!switchdev_work->fdb_info.addr) + goto err_addr_alloc; + ether_addr_copy((u8 *)switchdev_work->fdb_info.addr, + fdb_info->addr); + /* Take a reference on the device. This can be either + * upper device containig mlxsw_sp_port or just a + * mlxsw_sp_port + */ + dev_hold(dev); + break; + default: + kfree(switchdev_work); + return NOTIFY_DONE; + } + + mlxsw_core_schedule_work(&switchdev_work->work); + + return NOTIFY_DONE; + +err_addr_alloc: + kfree(switchdev_work); + return NOTIFY_BAD; +} + +static struct notifier_block mlxsw_sp_switchdev_notifier = { + .notifier_call = mlxsw_sp_switchdev_event, +}; + +static int mlxsw_sp_fdb_init(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_bridge *bridge = mlxsw_sp->bridge; + int err; + + err = mlxsw_sp_ageing_set(mlxsw_sp, MLXSW_SP_DEFAULT_AGEING_TIME); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to set default ageing time\n"); + return err; + } + + err = register_switchdev_notifier(&mlxsw_sp_switchdev_notifier); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to register switchdev notifier\n"); + return err; + } + + INIT_DELAYED_WORK(&bridge->fdb_notify.dw, mlxsw_sp_fdb_notify_work); + bridge->fdb_notify.interval = MLXSW_SP_DEFAULT_LEARNING_INTERVAL; + mlxsw_sp_fdb_notify_work_schedule(mlxsw_sp); + return 0; +} + +static void mlxsw_sp_fdb_fini(struct mlxsw_sp *mlxsw_sp) +{ + cancel_delayed_work_sync(&mlxsw_sp->bridge->fdb_notify.dw); + unregister_switchdev_notifier(&mlxsw_sp_switchdev_notifier); + +} + +int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_bridge *bridge; + + bridge = kzalloc(sizeof(*mlxsw_sp->bridge), GFP_KERNEL); + if (!bridge) + return -ENOMEM; + mlxsw_sp->bridge = bridge; + bridge->mlxsw_sp = mlxsw_sp; + + INIT_LIST_HEAD(&mlxsw_sp->bridge->bridges_list); + + bridge->bridge_8021q_ops = &mlxsw_sp_bridge_8021q_ops; + bridge->bridge_8021d_ops = &mlxsw_sp_bridge_8021d_ops; + + return mlxsw_sp_fdb_init(mlxsw_sp); +} + +void mlxsw_sp_switchdev_fini(struct mlxsw_sp *mlxsw_sp) +{ + mlxsw_sp_fdb_fini(mlxsw_sp); + WARN_ON(!list_empty(&mlxsw_sp->bridge->bridges_list)); + kfree(mlxsw_sp->bridge); +} + +void mlxsw_sp_port_switchdev_init(struct mlxsw_sp_port *mlxsw_sp_port) +{ + mlxsw_sp_port->dev->switchdev_ops = &mlxsw_sp_port_switchdev_ops; +} + +void mlxsw_sp_port_switchdev_fini(struct mlxsw_sp_port *mlxsw_sp_port) +{ +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchib.c b/drivers/net/ethernet/mellanox/mlxsw/switchib.c new file mode 100644 index 0000000..c698ec4 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/switchib.c @@ -0,0 +1,604 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/switchib.c + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Elad Raz + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pci.h" +#include "core.h" +#include "reg.h" +#include "port.h" +#include "trap.h" +#include "txheader.h" +#include "ib.h" + +static const char mlxsw_sib_driver_name[] = "mlxsw_switchib"; +static const char mlxsw_sib2_driver_name[] = "mlxsw_switchib2"; + +struct mlxsw_sib_port; + +struct mlxsw_sib { + struct mlxsw_sib_port **ports; + struct mlxsw_core *core; + const struct mlxsw_bus_info *bus_info; +}; + +struct mlxsw_sib_port { + struct mlxsw_sib *mlxsw_sib; + u8 local_port; + struct { + u8 module; + } mapping; +}; + +/* tx_v1_hdr_version + * Tx header version. + * Must be set to 1. + */ +MLXSW_ITEM32(tx_v1, hdr, version, 0x00, 28, 4); + +/* tx_v1_hdr_ctl + * Packet control type. + * 0 - Ethernet control (e.g. EMADs, LACP) + * 1 - Ethernet data + */ +MLXSW_ITEM32(tx_v1, hdr, ctl, 0x00, 26, 2); + +/* tx_v1_hdr_proto + * Packet protocol type. Must be set to 1 (Ethernet). + */ +MLXSW_ITEM32(tx_v1, hdr, proto, 0x00, 21, 3); + +/* tx_v1_hdr_swid + * Switch partition ID. Must be set to 0. + */ +MLXSW_ITEM32(tx_v1, hdr, swid, 0x00, 12, 3); + +/* tx_v1_hdr_control_tclass + * Indicates if the packet should use the control TClass and not one + * of the data TClasses. + */ +MLXSW_ITEM32(tx_v1, hdr, control_tclass, 0x00, 6, 1); + +/* tx_v1_hdr_port_mid + * Destination local port for unicast packets. + * Destination multicast ID for multicast packets. + * + * Control packets are directed to a specific egress port, while data + * packets are transmitted through the CPU port (0) into the switch partition, + * where forwarding rules are applied. 
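+ * mlxsw_sib_tx_v1_hdr_construct() below always sets the control packet
+ * type, so this field carries the egress local port.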
+ */ +MLXSW_ITEM32(tx_v1, hdr, port_mid, 0x04, 16, 16); + +/* tx_v1_hdr_type + * 0 - Data packets + * 6 - Control packets + */ +MLXSW_ITEM32(tx_v1, hdr, type, 0x0C, 0, 4); + +static void +mlxsw_sib_tx_v1_hdr_construct(struct sk_buff *skb, + const struct mlxsw_tx_info *tx_info) +{ + char *txhdr = skb_push(skb, MLXSW_TXHDR_LEN); + + memset(txhdr, 0, MLXSW_TXHDR_LEN); + + mlxsw_tx_v1_hdr_version_set(txhdr, MLXSW_TXHDR_VERSION_1); + mlxsw_tx_v1_hdr_ctl_set(txhdr, MLXSW_TXHDR_ETH_CTL); + mlxsw_tx_v1_hdr_proto_set(txhdr, MLXSW_TXHDR_PROTO_ETH); + mlxsw_tx_v1_hdr_swid_set(txhdr, 0); + mlxsw_tx_v1_hdr_control_tclass_set(txhdr, 1); + mlxsw_tx_v1_hdr_port_mid_set(txhdr, tx_info->local_port); + mlxsw_tx_v1_hdr_type_set(txhdr, MLXSW_TXHDR_TYPE_CONTROL); +} + +static int +mlxsw_sib_port_admin_status_set(struct mlxsw_sib_port *mlxsw_sib_port, + bool is_up) +{ + struct mlxsw_sib *mlxsw_sib = mlxsw_sib_port->mlxsw_sib; + char paos_pl[MLXSW_REG_PAOS_LEN]; + + mlxsw_reg_paos_pack(paos_pl, mlxsw_sib_port->local_port, + is_up ? MLXSW_PORT_ADMIN_STATUS_UP : + MLXSW_PORT_ADMIN_STATUS_DOWN); + return mlxsw_reg_write(mlxsw_sib->core, MLXSW_REG(paos), paos_pl); +} + +static int mlxsw_sib_port_mtu_set(struct mlxsw_sib_port *mlxsw_sib_port, + u16 mtu) +{ + struct mlxsw_sib *mlxsw_sib = mlxsw_sib_port->mlxsw_sib; + char pmtu_pl[MLXSW_REG_PMTU_LEN]; + int max_mtu; + int err; + + mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sib_port->local_port, 0); + err = mlxsw_reg_query(mlxsw_sib->core, MLXSW_REG(pmtu), pmtu_pl); + if (err) + return err; + max_mtu = mlxsw_reg_pmtu_max_mtu_get(pmtu_pl); + + if (mtu > max_mtu) + return -EINVAL; + + mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sib_port->local_port, mtu); + return mlxsw_reg_write(mlxsw_sib->core, MLXSW_REG(pmtu), pmtu_pl); +} + +static int mlxsw_sib_port_set(struct mlxsw_sib_port *mlxsw_sib_port, u8 port) +{ + struct mlxsw_sib *mlxsw_sib = mlxsw_sib_port->mlxsw_sib; + char plib_pl[MLXSW_REG_PLIB_LEN] = {0}; + int err; + + mlxsw_reg_plib_local_port_set(plib_pl, mlxsw_sib_port->local_port); + mlxsw_reg_plib_ib_port_set(plib_pl, port); + err = mlxsw_reg_write(mlxsw_sib->core, MLXSW_REG(plib), plib_pl); + return err; +} + +static int mlxsw_sib_port_swid_set(struct mlxsw_sib_port *mlxsw_sib_port, + u8 swid) +{ + struct mlxsw_sib *mlxsw_sib = mlxsw_sib_port->mlxsw_sib; + char pspa_pl[MLXSW_REG_PSPA_LEN]; + + mlxsw_reg_pspa_pack(pspa_pl, swid, mlxsw_sib_port->local_port); + return mlxsw_reg_write(mlxsw_sib->core, MLXSW_REG(pspa), pspa_pl); +} + +static int mlxsw_sib_port_module_info_get(struct mlxsw_sib *mlxsw_sib, + u8 local_port, u8 *p_module, + u8 *p_width) +{ + char pmlp_pl[MLXSW_REG_PMLP_LEN]; + int err; + + mlxsw_reg_pmlp_pack(pmlp_pl, local_port); + err = mlxsw_reg_query(mlxsw_sib->core, MLXSW_REG(pmlp), pmlp_pl); + if (err) + return err; + *p_module = mlxsw_reg_pmlp_module_get(pmlp_pl, 0); + *p_width = mlxsw_reg_pmlp_width_get(pmlp_pl); + return 0; +} + +static int mlxsw_sib_port_speed_set(struct mlxsw_sib_port *mlxsw_sib_port, + u16 speed, u16 width) +{ + struct mlxsw_sib *mlxsw_sib = mlxsw_sib_port->mlxsw_sib; + char ptys_pl[MLXSW_REG_PTYS_LEN]; + + mlxsw_reg_ptys_ib_pack(ptys_pl, mlxsw_sib_port->local_port, speed, + width); + return mlxsw_reg_write(mlxsw_sib->core, MLXSW_REG(ptys), ptys_pl); +} + +static bool mlxsw_sib_port_created(struct mlxsw_sib *mlxsw_sib, u8 local_port) +{ + return mlxsw_sib->ports[local_port] != NULL; +} + +static int __mlxsw_sib_port_create(struct mlxsw_sib *mlxsw_sib, u8 local_port, + u8 module, u8 width) +{ + struct mlxsw_sib_port *mlxsw_sib_port; + int err; + 
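+ /* Bring the IB port up in stages: SWID mapping, front-panel IB port number, speed/width, MTU and finally admin state; the error labels below fall through to disable the SWID mapping and free the port. */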
+ mlxsw_sib_port = kzalloc(sizeof(*mlxsw_sib_port), GFP_KERNEL); + if (!mlxsw_sib_port) + return -ENOMEM; + mlxsw_sib_port->mlxsw_sib = mlxsw_sib; + mlxsw_sib_port->local_port = local_port; + mlxsw_sib_port->mapping.module = module; + + err = mlxsw_sib_port_swid_set(mlxsw_sib_port, 0); + if (err) { + dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to set SWID\n", + mlxsw_sib_port->local_port); + goto err_port_swid_set; + } + + /* Expose the IB port number as it's front panel name */ + err = mlxsw_sib_port_set(mlxsw_sib_port, module + 1); + if (err) { + dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to set IB port\n", + mlxsw_sib_port->local_port); + goto err_port_ib_set; + } + + /* Supports all speeds from SDR to FDR (bitmask) and support bus width + * of 1x, 2x and 4x (3 bits bitmask) + */ + err = mlxsw_sib_port_speed_set(mlxsw_sib_port, + MLXSW_REG_PTYS_IB_SPEED_EDR - 1, + BIT(3) - 1); + if (err) { + dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to set speed\n", + mlxsw_sib_port->local_port); + goto err_port_speed_set; + } + + /* Change to the maximum MTU the device supports, the SMA will take + * care of the active MTU + */ + err = mlxsw_sib_port_mtu_set(mlxsw_sib_port, MLXSW_IB_DEFAULT_MTU); + if (err) { + dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to set MTU\n", + mlxsw_sib_port->local_port); + goto err_port_mtu_set; + } + + err = mlxsw_sib_port_admin_status_set(mlxsw_sib_port, true); + if (err) { + dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to change admin state to UP\n", + mlxsw_sib_port->local_port); + goto err_port_admin_set; + } + + mlxsw_core_port_ib_set(mlxsw_sib->core, mlxsw_sib_port->local_port, + mlxsw_sib_port); + mlxsw_sib->ports[local_port] = mlxsw_sib_port; + return 0; + +err_port_admin_set: +err_port_mtu_set: +err_port_speed_set: +err_port_ib_set: + mlxsw_sib_port_swid_set(mlxsw_sib_port, MLXSW_PORT_SWID_DISABLED_PORT); +err_port_swid_set: + kfree(mlxsw_sib_port); + return err; +} + +static int mlxsw_sib_port_create(struct mlxsw_sib *mlxsw_sib, u8 local_port, + u8 module, u8 width) +{ + int err; + + err = mlxsw_core_port_init(mlxsw_sib->core, local_port); + if (err) { + dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to init core port\n", + local_port); + return err; + } + err = __mlxsw_sib_port_create(mlxsw_sib, local_port, module, width); + if (err) + goto err_port_create; + + return 0; + +err_port_create: + mlxsw_core_port_fini(mlxsw_sib->core, local_port); + return err; +} + +static void __mlxsw_sib_port_remove(struct mlxsw_sib *mlxsw_sib, u8 local_port) +{ + struct mlxsw_sib_port *mlxsw_sib_port = mlxsw_sib->ports[local_port]; + + mlxsw_core_port_clear(mlxsw_sib->core, local_port, mlxsw_sib); + mlxsw_sib->ports[local_port] = NULL; + mlxsw_sib_port_admin_status_set(mlxsw_sib_port, false); + mlxsw_sib_port_swid_set(mlxsw_sib_port, MLXSW_PORT_SWID_DISABLED_PORT); + kfree(mlxsw_sib_port); +} + +static void mlxsw_sib_port_remove(struct mlxsw_sib *mlxsw_sib, u8 local_port) +{ + __mlxsw_sib_port_remove(mlxsw_sib, local_port); + mlxsw_core_port_fini(mlxsw_sib->core, local_port); +} + +static void mlxsw_sib_ports_remove(struct mlxsw_sib *mlxsw_sib) +{ + int i; + + for (i = 1; i < MLXSW_PORT_MAX_IB_PORTS; i++) + if (mlxsw_sib_port_created(mlxsw_sib, i)) + mlxsw_sib_port_remove(mlxsw_sib, i); + kfree(mlxsw_sib->ports); +} + +static int mlxsw_sib_ports_create(struct mlxsw_sib *mlxsw_sib) +{ + size_t alloc_size; + u8 module, width; + int i; + int err; + + alloc_size = sizeof(struct mlxsw_sib_port *) * MLXSW_PORT_MAX_IB_PORTS; + mlxsw_sib->ports = 
kzalloc(alloc_size, GFP_KERNEL); + if (!mlxsw_sib->ports) + return -ENOMEM; + + for (i = 1; i < MLXSW_PORT_MAX_IB_PORTS; i++) { + err = mlxsw_sib_port_module_info_get(mlxsw_sib, i, &module, + &width); + if (err) + goto err_port_module_info_get; + if (!width) + continue; + err = mlxsw_sib_port_create(mlxsw_sib, i, module, width); + if (err) + goto err_port_create; + } + return 0; + +err_port_create: +err_port_module_info_get: + for (i--; i >= 1; i--) + if (mlxsw_sib_port_created(mlxsw_sib, i)) + mlxsw_sib_port_remove(mlxsw_sib, i); + kfree(mlxsw_sib->ports); + return err; +} + +static void +mlxsw_sib_pude_ib_event_func(struct mlxsw_sib_port *mlxsw_sib_port, + enum mlxsw_reg_pude_oper_status status) +{ + if (status == MLXSW_PORT_OPER_STATUS_UP) + pr_info("ib link for port %d - up\n", + mlxsw_sib_port->mapping.module + 1); + else + pr_info("ib link for port %d - down\n", + mlxsw_sib_port->mapping.module + 1); +} + +static void mlxsw_sib_pude_event_func(const struct mlxsw_reg_info *reg, + char *pude_pl, void *priv) +{ + struct mlxsw_sib *mlxsw_sib = priv; + struct mlxsw_sib_port *mlxsw_sib_port; + enum mlxsw_reg_pude_oper_status status; + u8 local_port; + + local_port = mlxsw_reg_pude_local_port_get(pude_pl); + mlxsw_sib_port = mlxsw_sib->ports[local_port]; + if (!mlxsw_sib_port) { + dev_warn(mlxsw_sib->bus_info->dev, "Port %d: Link event received for non-existent port\n", + local_port); + return; + } + + status = mlxsw_reg_pude_oper_status_get(pude_pl); + mlxsw_sib_pude_ib_event_func(mlxsw_sib_port, status); +} + +static const struct mlxsw_listener mlxsw_sib_listener[] = { + MLXSW_EVENTL(mlxsw_sib_pude_event_func, PUDE, EMAD), +}; + +static int mlxsw_sib_taps_init(struct mlxsw_sib *mlxsw_sib) +{ + int i; + int err; + + for (i = 0; i < ARRAY_SIZE(mlxsw_sib_listener); i++) { + err = mlxsw_core_trap_register(mlxsw_sib->core, + &mlxsw_sib_listener[i], + mlxsw_sib); + if (err) + goto err_rx_listener_register; + } + + return 0; + +err_rx_listener_register: + for (i--; i >= 0; i--) { + mlxsw_core_trap_unregister(mlxsw_sib->core, + &mlxsw_sib_listener[i], + mlxsw_sib); + } + + return err; +} + +static void mlxsw_sib_traps_fini(struct mlxsw_sib *mlxsw_sib) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mlxsw_sib_listener); i++) { + mlxsw_core_trap_unregister(mlxsw_sib->core, + &mlxsw_sib_listener[i], mlxsw_sib); + } +} + +static int mlxsw_sib_basic_trap_groups_set(struct mlxsw_core *mlxsw_core) +{ + char htgt_pl[MLXSW_REG_HTGT_LEN]; + + mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_EMAD, + MLXSW_REG_HTGT_INVALID_POLICER, + MLXSW_REG_HTGT_DEFAULT_PRIORITY, + MLXSW_REG_HTGT_DEFAULT_TC); + mlxsw_reg_htgt_swid_set(htgt_pl, MLXSW_PORT_SWID_ALL_SWIDS); + mlxsw_reg_htgt_local_path_rdq_set(htgt_pl, + MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SIB_EMAD); + return mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl); +} + +static int mlxsw_sib_init(struct mlxsw_core *mlxsw_core, + const struct mlxsw_bus_info *mlxsw_bus_info) +{ + struct mlxsw_sib *mlxsw_sib = mlxsw_core_driver_priv(mlxsw_core); + int err; + + mlxsw_sib->core = mlxsw_core; + mlxsw_sib->bus_info = mlxsw_bus_info; + + err = mlxsw_sib_ports_create(mlxsw_sib); + if (err) { + dev_err(mlxsw_sib->bus_info->dev, "Failed to create ports\n"); + return err; + } + + err = mlxsw_sib_taps_init(mlxsw_sib); + if (err) { + dev_err(mlxsw_sib->bus_info->dev, "Failed to set traps\n"); + goto err_traps_init_err; + } + + return 0; + +err_traps_init_err: + mlxsw_sib_ports_remove(mlxsw_sib); + return err; +} + +static void mlxsw_sib_fini(struct mlxsw_core *mlxsw_core) +{ + 
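+ /* Tear down in reverse order of mlxsw_sib_init(): unregister the traps first, then remove the ports. */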
struct mlxsw_sib *mlxsw_sib = mlxsw_core_driver_priv(mlxsw_core); + + mlxsw_sib_traps_fini(mlxsw_sib); + mlxsw_sib_ports_remove(mlxsw_sib); +} + +static const struct mlxsw_config_profile mlxsw_sib_config_profile = { + .used_max_system_port = 1, + .max_system_port = 48000, + .used_max_ib_mc = 1, + .max_ib_mc = 27, + .used_max_pkey = 1, + .max_pkey = 32, + .swid_config = { + { + .used_type = 1, + .type = MLXSW_PORT_SWID_TYPE_IB, + } + }, +}; + +static struct mlxsw_driver mlxsw_sib_driver = { + .kind = mlxsw_sib_driver_name, + .priv_size = sizeof(struct mlxsw_sib), + .init = mlxsw_sib_init, + .fini = mlxsw_sib_fini, + .basic_trap_groups_set = mlxsw_sib_basic_trap_groups_set, + .txhdr_construct = mlxsw_sib_tx_v1_hdr_construct, + .txhdr_len = MLXSW_TXHDR_LEN, + .profile = &mlxsw_sib_config_profile, +}; + +static struct mlxsw_driver mlxsw_sib2_driver = { + .kind = mlxsw_sib2_driver_name, + .priv_size = sizeof(struct mlxsw_sib), + .init = mlxsw_sib_init, + .fini = mlxsw_sib_fini, + .basic_trap_groups_set = mlxsw_sib_basic_trap_groups_set, + .txhdr_construct = mlxsw_sib_tx_v1_hdr_construct, + .txhdr_len = MLXSW_TXHDR_LEN, + .profile = &mlxsw_sib_config_profile, +}; + +static const struct pci_device_id mlxsw_sib_pci_id_table[] = { + {PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_SWITCHIB), 0}, + {0, }, +}; + +static struct pci_driver mlxsw_sib_pci_driver = { + .name = mlxsw_sib_driver_name, + .id_table = mlxsw_sib_pci_id_table, +}; + +static const struct pci_device_id mlxsw_sib2_pci_id_table[] = { + {PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_SWITCHIB2), 0}, + {0, }, +}; + +static struct pci_driver mlxsw_sib2_pci_driver = { + .name = mlxsw_sib2_driver_name, + .id_table = mlxsw_sib2_pci_id_table, +}; + +static int __init mlxsw_sib_module_init(void) +{ + int err; + + err = mlxsw_core_driver_register(&mlxsw_sib_driver); + if (err) + return err; + + err = mlxsw_core_driver_register(&mlxsw_sib2_driver); + if (err) + goto err_sib2_driver_register; + + err = mlxsw_pci_driver_register(&mlxsw_sib_pci_driver); + if (err) + goto err_sib_pci_driver_register; + + err = mlxsw_pci_driver_register(&mlxsw_sib2_pci_driver); + if (err) + goto err_sib2_pci_driver_register; + + return 0; + +err_sib2_pci_driver_register: + mlxsw_pci_driver_unregister(&mlxsw_sib_pci_driver); +err_sib_pci_driver_register: + mlxsw_core_driver_unregister(&mlxsw_sib2_driver); +err_sib2_driver_register: + mlxsw_core_driver_unregister(&mlxsw_sib_driver); + return err; +} + +static void __exit mlxsw_sib_module_exit(void) +{ + mlxsw_pci_driver_unregister(&mlxsw_sib2_pci_driver); + mlxsw_pci_driver_unregister(&mlxsw_sib_pci_driver); + mlxsw_core_driver_unregister(&mlxsw_sib2_driver); + mlxsw_core_driver_unregister(&mlxsw_sib_driver); +} + +module_init(mlxsw_sib_module_init); +module_exit(mlxsw_sib_module_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Elad Raz "); +MODULE_DESCRIPTION("Mellanox SwitchIB and SwitchIB-2 driver"); +MODULE_ALIAS("mlxsw_switchib2"); +MODULE_DEVICE_TABLE(pci, mlxsw_sib_pci_id_table); +MODULE_DEVICE_TABLE(pci, mlxsw_sib2_pci_id_table); diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c new file mode 100644 index 0000000..a655c58 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c @@ -0,0 +1,1764 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/switchx2.c + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2015 Jiri Pirko + * Copyright (c) 2015 Ido Schimmel + * Copyright (c) 2015-2016 Elad Raz + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pci.h" +#include "core.h" +#include "reg.h" +#include "port.h" +#include "trap.h" +#include "txheader.h" +#include "ib.h" + +static const char mlxsw_sx_driver_name[] = "mlxsw_switchx2"; +static const char mlxsw_sx_driver_version[] = "1.0"; + +struct mlxsw_sx_port; + +struct mlxsw_sx { + struct mlxsw_sx_port **ports; + struct mlxsw_core *core; + const struct mlxsw_bus_info *bus_info; + u8 hw_id[ETH_ALEN]; +}; + +struct mlxsw_sx_port_pcpu_stats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; + u32 tx_dropped; +}; + +struct mlxsw_sx_port { + struct net_device *dev; + struct mlxsw_sx_port_pcpu_stats __percpu *pcpu_stats; + struct mlxsw_sx *mlxsw_sx; + u8 local_port; + struct { + u8 module; + } mapping; +}; + +/* tx_hdr_version + * Tx header version. + * Must be set to 0. + */ +MLXSW_ITEM32(tx, hdr, version, 0x00, 28, 4); + +/* tx_hdr_ctl + * Packet control type. + * 0 - Ethernet control (e.g. EMADs, LACP) + * 1 - Ethernet data + */ +MLXSW_ITEM32(tx, hdr, ctl, 0x00, 26, 2); + +/* tx_hdr_proto + * Packet protocol type. Must be set to 1 (Ethernet). + */ +MLXSW_ITEM32(tx, hdr, proto, 0x00, 21, 3); + +/* tx_hdr_etclass + * Egress TClass to be used on the egress device on the egress port. + * The MSB is specified in the 'ctclass3' field. + * Range is 0-15, where 15 is the highest priority. + */ +MLXSW_ITEM32(tx, hdr, etclass, 0x00, 18, 3); + +/* tx_hdr_swid + * Switch partition ID. + */ +MLXSW_ITEM32(tx, hdr, swid, 0x00, 12, 3); + +/* tx_hdr_port_mid + * Destination local port for unicast packets. + * Destination multicast ID for multicast packets. 
+ * + * Control packets are directed to a specific egress port, while data + * packets are transmitted through the CPU port (0) into the switch partition, + * where forwarding rules are applied. + */ +MLXSW_ITEM32(tx, hdr, port_mid, 0x04, 16, 16); + +/* tx_hdr_ctclass3 + * See field 'etclass'. + */ +MLXSW_ITEM32(tx, hdr, ctclass3, 0x04, 14, 1); + +/* tx_hdr_rdq + * RDQ for control packets sent to remote CPU. + * Must be set to 0x1F for EMADs, otherwise 0. + */ +MLXSW_ITEM32(tx, hdr, rdq, 0x04, 9, 5); + +/* tx_hdr_cpu_sig + * Signature control for packets going to CPU. Must be set to 0. + */ +MLXSW_ITEM32(tx, hdr, cpu_sig, 0x04, 0, 9); + +/* tx_hdr_sig + * Stacking protocl signature. Must be set to 0xE0E0. + */ +MLXSW_ITEM32(tx, hdr, sig, 0x0C, 16, 16); + +/* tx_hdr_stclass + * Stacking TClass. + */ +MLXSW_ITEM32(tx, hdr, stclass, 0x0C, 13, 3); + +/* tx_hdr_emad + * EMAD bit. Must be set for EMADs. + */ +MLXSW_ITEM32(tx, hdr, emad, 0x0C, 5, 1); + +/* tx_hdr_type + * 0 - Data packets + * 6 - Control packets + */ +MLXSW_ITEM32(tx, hdr, type, 0x0C, 0, 4); + +static void mlxsw_sx_txhdr_construct(struct sk_buff *skb, + const struct mlxsw_tx_info *tx_info) +{ + char *txhdr = skb_push(skb, MLXSW_TXHDR_LEN); + bool is_emad = tx_info->is_emad; + + memset(txhdr, 0, MLXSW_TXHDR_LEN); + + /* We currently set default values for the egress tclass (QoS). */ + mlxsw_tx_hdr_version_set(txhdr, MLXSW_TXHDR_VERSION_0); + mlxsw_tx_hdr_ctl_set(txhdr, MLXSW_TXHDR_ETH_CTL); + mlxsw_tx_hdr_proto_set(txhdr, MLXSW_TXHDR_PROTO_ETH); + mlxsw_tx_hdr_etclass_set(txhdr, is_emad ? MLXSW_TXHDR_ETCLASS_6 : + MLXSW_TXHDR_ETCLASS_5); + mlxsw_tx_hdr_swid_set(txhdr, 0); + mlxsw_tx_hdr_port_mid_set(txhdr, tx_info->local_port); + mlxsw_tx_hdr_ctclass3_set(txhdr, MLXSW_TXHDR_CTCLASS3); + mlxsw_tx_hdr_rdq_set(txhdr, is_emad ? MLXSW_TXHDR_RDQ_EMAD : + MLXSW_TXHDR_RDQ_OTHER); + mlxsw_tx_hdr_cpu_sig_set(txhdr, MLXSW_TXHDR_CPU_SIG); + mlxsw_tx_hdr_sig_set(txhdr, MLXSW_TXHDR_SIG); + mlxsw_tx_hdr_stclass_set(txhdr, MLXSW_TXHDR_STCLASS_NONE); + mlxsw_tx_hdr_emad_set(txhdr, is_emad ? MLXSW_TXHDR_EMAD : + MLXSW_TXHDR_NOT_EMAD); + mlxsw_tx_hdr_type_set(txhdr, MLXSW_TXHDR_TYPE_CONTROL); +} + +static int mlxsw_sx_port_admin_status_set(struct mlxsw_sx_port *mlxsw_sx_port, + bool is_up) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char paos_pl[MLXSW_REG_PAOS_LEN]; + + mlxsw_reg_paos_pack(paos_pl, mlxsw_sx_port->local_port, + is_up ? MLXSW_PORT_ADMIN_STATUS_UP : + MLXSW_PORT_ADMIN_STATUS_DOWN); + return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(paos), paos_pl); +} + +static int mlxsw_sx_port_oper_status_get(struct mlxsw_sx_port *mlxsw_sx_port, + bool *p_is_up) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char paos_pl[MLXSW_REG_PAOS_LEN]; + u8 oper_status; + int err; + + mlxsw_reg_paos_pack(paos_pl, mlxsw_sx_port->local_port, 0); + err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(paos), paos_pl); + if (err) + return err; + oper_status = mlxsw_reg_paos_oper_status_get(paos_pl); + *p_is_up = oper_status == MLXSW_PORT_ADMIN_STATUS_UP ? 
true : false; + return 0; +} + +static int __mlxsw_sx_port_mtu_set(struct mlxsw_sx_port *mlxsw_sx_port, + u16 mtu) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char pmtu_pl[MLXSW_REG_PMTU_LEN]; + int max_mtu; + int err; + + mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sx_port->local_port, 0); + err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(pmtu), pmtu_pl); + if (err) + return err; + max_mtu = mlxsw_reg_pmtu_max_mtu_get(pmtu_pl); + + if (mtu > max_mtu) + return -EINVAL; + + mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sx_port->local_port, mtu); + return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(pmtu), pmtu_pl); +} + +static int mlxsw_sx_port_mtu_eth_set(struct mlxsw_sx_port *mlxsw_sx_port, + u16 mtu) +{ + mtu += MLXSW_TXHDR_LEN + ETH_HLEN; + return __mlxsw_sx_port_mtu_set(mlxsw_sx_port, mtu); +} + +static int mlxsw_sx_port_mtu_ib_set(struct mlxsw_sx_port *mlxsw_sx_port, + u16 mtu) +{ + return __mlxsw_sx_port_mtu_set(mlxsw_sx_port, mtu); +} + +static int mlxsw_sx_port_ib_port_set(struct mlxsw_sx_port *mlxsw_sx_port, + u8 ib_port) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char plib_pl[MLXSW_REG_PLIB_LEN] = {0}; + int err; + + mlxsw_reg_plib_local_port_set(plib_pl, mlxsw_sx_port->local_port); + mlxsw_reg_plib_ib_port_set(plib_pl, ib_port); + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(plib), plib_pl); + return err; +} + +static int mlxsw_sx_port_swid_set(struct mlxsw_sx_port *mlxsw_sx_port, u8 swid) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char pspa_pl[MLXSW_REG_PSPA_LEN]; + + mlxsw_reg_pspa_pack(pspa_pl, swid, mlxsw_sx_port->local_port); + return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(pspa), pspa_pl); +} + +static int +mlxsw_sx_port_system_port_mapping_set(struct mlxsw_sx_port *mlxsw_sx_port) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char sspr_pl[MLXSW_REG_SSPR_LEN]; + + mlxsw_reg_sspr_pack(sspr_pl, mlxsw_sx_port->local_port); + return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sspr), sspr_pl); +} + +static int mlxsw_sx_port_module_info_get(struct mlxsw_sx *mlxsw_sx, + u8 local_port, u8 *p_module, + u8 *p_width) +{ + char pmlp_pl[MLXSW_REG_PMLP_LEN]; + int err; + + mlxsw_reg_pmlp_pack(pmlp_pl, local_port); + err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(pmlp), pmlp_pl); + if (err) + return err; + *p_module = mlxsw_reg_pmlp_module_get(pmlp_pl, 0); + *p_width = mlxsw_reg_pmlp_width_get(pmlp_pl); + return 0; +} + +static int mlxsw_sx_port_open(struct net_device *dev) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + int err; + + err = mlxsw_sx_port_admin_status_set(mlxsw_sx_port, true); + if (err) + return err; + netif_start_queue(dev); + return 0; +} + +static int mlxsw_sx_port_stop(struct net_device *dev) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + + netif_stop_queue(dev); + return mlxsw_sx_port_admin_status_set(mlxsw_sx_port, false); +} + +static netdev_tx_t mlxsw_sx_port_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + struct mlxsw_sx_port_pcpu_stats *pcpu_stats; + const struct mlxsw_tx_info tx_info = { + .local_port = mlxsw_sx_port->local_port, + .is_emad = false, + }; + u64 len; + int err; + + if (mlxsw_core_skb_transmit_busy(mlxsw_sx->core, &tx_info)) + return NETDEV_TX_BUSY; + + if (unlikely(skb_headroom(skb) < MLXSW_TXHDR_LEN)) { + struct sk_buff *skb_orig = skb; + + skb = skb_realloc_headroom(skb, MLXSW_TXHDR_LEN); + if (!skb) { + 
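+ /* Headroom reallocation for the Tx header failed; count the drop and free the original skb. */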
this_cpu_inc(mlxsw_sx_port->pcpu_stats->tx_dropped); + dev_kfree_skb_any(skb_orig); + return NETDEV_TX_OK; + } + dev_consume_skb_any(skb_orig); + } + mlxsw_sx_txhdr_construct(skb, &tx_info); + /* TX header is consumed by HW on the way so we shouldn't count its + * bytes as being sent. + */ + len = skb->len - MLXSW_TXHDR_LEN; + /* Due to a race we might fail here because of a full queue. In that + * unlikely case we simply drop the packet. + */ + err = mlxsw_core_skb_transmit(mlxsw_sx->core, skb, &tx_info); + + if (!err) { + pcpu_stats = this_cpu_ptr(mlxsw_sx_port->pcpu_stats); + u64_stats_update_begin(&pcpu_stats->syncp); + pcpu_stats->tx_packets++; + pcpu_stats->tx_bytes += len; + u64_stats_update_end(&pcpu_stats->syncp); + } else { + this_cpu_inc(mlxsw_sx_port->pcpu_stats->tx_dropped); + dev_kfree_skb_any(skb); + } + return NETDEV_TX_OK; +} + +static int mlxsw_sx_port_change_mtu(struct net_device *dev, int mtu) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + int err; + + err = mlxsw_sx_port_mtu_eth_set(mlxsw_sx_port, mtu); + if (err) + return err; + dev->mtu = mtu; + return 0; +} + +static void +mlxsw_sx_port_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + struct mlxsw_sx_port_pcpu_stats *p; + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + u32 tx_dropped = 0; + unsigned int start; + int i; + + for_each_possible_cpu(i) { + p = per_cpu_ptr(mlxsw_sx_port->pcpu_stats, i); + do { + start = u64_stats_fetch_begin_irq(&p->syncp); + rx_packets = p->rx_packets; + rx_bytes = p->rx_bytes; + tx_packets = p->tx_packets; + tx_bytes = p->tx_bytes; + } while (u64_stats_fetch_retry_irq(&p->syncp, start)); + + stats->rx_packets += rx_packets; + stats->rx_bytes += rx_bytes; + stats->tx_packets += tx_packets; + stats->tx_bytes += tx_bytes; + /* tx_dropped is u32, updated without syncp protection. 
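+ * A plain sum of the per-CPU values is therefore good enough for reporting.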
*/ + tx_dropped += p->tx_dropped; + } + stats->tx_dropped = tx_dropped; +} + +static int mlxsw_sx_port_get_phys_port_name(struct net_device *dev, char *name, + size_t len) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + int err; + + err = snprintf(name, len, "p%d", mlxsw_sx_port->mapping.module + 1); + if (err >= len) + return -EINVAL; + + return 0; +} + +static const struct net_device_ops mlxsw_sx_port_netdev_ops = { + .ndo_open = mlxsw_sx_port_open, + .ndo_stop = mlxsw_sx_port_stop, + .ndo_start_xmit = mlxsw_sx_port_xmit, + .ndo_change_mtu = mlxsw_sx_port_change_mtu, + .ndo_get_stats64 = mlxsw_sx_port_get_stats64, + .ndo_get_phys_port_name = mlxsw_sx_port_get_phys_port_name, +}; + +static void mlxsw_sx_port_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + + strlcpy(drvinfo->driver, mlxsw_sx_driver_name, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, mlxsw_sx_driver_version, + sizeof(drvinfo->version)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", + mlxsw_sx->bus_info->fw_rev.major, + mlxsw_sx->bus_info->fw_rev.minor, + mlxsw_sx->bus_info->fw_rev.subminor); + strlcpy(drvinfo->bus_info, mlxsw_sx->bus_info->device_name, + sizeof(drvinfo->bus_info)); +} + +struct mlxsw_sx_port_hw_stats { + char str[ETH_GSTRING_LEN]; + u64 (*getter)(const char *payload); +}; + +static const struct mlxsw_sx_port_hw_stats mlxsw_sx_port_hw_stats[] = { + { + .str = "a_frames_transmitted_ok", + .getter = mlxsw_reg_ppcnt_a_frames_transmitted_ok_get, + }, + { + .str = "a_frames_received_ok", + .getter = mlxsw_reg_ppcnt_a_frames_received_ok_get, + }, + { + .str = "a_frame_check_sequence_errors", + .getter = mlxsw_reg_ppcnt_a_frame_check_sequence_errors_get, + }, + { + .str = "a_alignment_errors", + .getter = mlxsw_reg_ppcnt_a_alignment_errors_get, + }, + { + .str = "a_octets_transmitted_ok", + .getter = mlxsw_reg_ppcnt_a_octets_transmitted_ok_get, + }, + { + .str = "a_octets_received_ok", + .getter = mlxsw_reg_ppcnt_a_octets_received_ok_get, + }, + { + .str = "a_multicast_frames_xmitted_ok", + .getter = mlxsw_reg_ppcnt_a_multicast_frames_xmitted_ok_get, + }, + { + .str = "a_broadcast_frames_xmitted_ok", + .getter = mlxsw_reg_ppcnt_a_broadcast_frames_xmitted_ok_get, + }, + { + .str = "a_multicast_frames_received_ok", + .getter = mlxsw_reg_ppcnt_a_multicast_frames_received_ok_get, + }, + { + .str = "a_broadcast_frames_received_ok", + .getter = mlxsw_reg_ppcnt_a_broadcast_frames_received_ok_get, + }, + { + .str = "a_in_range_length_errors", + .getter = mlxsw_reg_ppcnt_a_in_range_length_errors_get, + }, + { + .str = "a_out_of_range_length_field", + .getter = mlxsw_reg_ppcnt_a_out_of_range_length_field_get, + }, + { + .str = "a_frame_too_long_errors", + .getter = mlxsw_reg_ppcnt_a_frame_too_long_errors_get, + }, + { + .str = "a_symbol_error_during_carrier", + .getter = mlxsw_reg_ppcnt_a_symbol_error_during_carrier_get, + }, + { + .str = "a_mac_control_frames_transmitted", + .getter = mlxsw_reg_ppcnt_a_mac_control_frames_transmitted_get, + }, + { + .str = "a_mac_control_frames_received", + .getter = mlxsw_reg_ppcnt_a_mac_control_frames_received_get, + }, + { + .str = "a_unsupported_opcodes_received", + .getter = mlxsw_reg_ppcnt_a_unsupported_opcodes_received_get, + }, + { + .str = "a_pause_mac_ctrl_frames_received", + .getter = mlxsw_reg_ppcnt_a_pause_mac_ctrl_frames_received_get, + }, + { + .str = "a_pause_mac_ctrl_frames_xmitted", + 
.getter = mlxsw_reg_ppcnt_a_pause_mac_ctrl_frames_transmitted_get, + }, +}; + +#define MLXSW_SX_PORT_HW_STATS_LEN ARRAY_SIZE(mlxsw_sx_port_hw_stats) + +static void mlxsw_sx_port_get_strings(struct net_device *dev, + u32 stringset, u8 *data) +{ + u8 *p = data; + int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < MLXSW_SX_PORT_HW_STATS_LEN; i++) { + memcpy(p, mlxsw_sx_port_hw_stats[i].str, + ETH_GSTRING_LEN); + p += ETH_GSTRING_LEN; + } + break; + } +} + +static void mlxsw_sx_port_get_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char ppcnt_pl[MLXSW_REG_PPCNT_LEN]; + int i; + int err; + + mlxsw_reg_ppcnt_pack(ppcnt_pl, mlxsw_sx_port->local_port, + MLXSW_REG_PPCNT_IEEE_8023_CNT, 0); + err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(ppcnt), ppcnt_pl); + for (i = 0; i < MLXSW_SX_PORT_HW_STATS_LEN; i++) + data[i] = !err ? mlxsw_sx_port_hw_stats[i].getter(ppcnt_pl) : 0; +} + +static int mlxsw_sx_port_get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return MLXSW_SX_PORT_HW_STATS_LEN; + default: + return -EOPNOTSUPP; + } +} + +struct mlxsw_sx_port_link_mode { + u32 mask; + u32 supported; + u32 advertised; + u32 speed; +}; + +static const struct mlxsw_sx_port_link_mode mlxsw_sx_port_link_mode[] = { + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_100BASE_T, + .supported = SUPPORTED_100baseT_Full, + .advertised = ADVERTISED_100baseT_Full, + .speed = 100, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_100BASE_TX, + .speed = 100, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_SGMII | + MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX, + .supported = SUPPORTED_1000baseKX_Full, + .advertised = ADVERTISED_1000baseKX_Full, + .speed = 1000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_10GBASE_T, + .supported = SUPPORTED_10000baseT_Full, + .advertised = ADVERTISED_10000baseT_Full, + .speed = 10000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CX4 | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4, + .supported = SUPPORTED_10000baseKX4_Full, + .advertised = ADVERTISED_10000baseKX4_Full, + .speed = 10000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_ER_LR, + .supported = SUPPORTED_10000baseKR_Full, + .advertised = ADVERTISED_10000baseKR_Full, + .speed = 10000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_20GBASE_KR2, + .supported = SUPPORTED_20000baseKR2_Full, + .advertised = ADVERTISED_20000baseKR2_Full, + .speed = 20000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4, + .supported = SUPPORTED_40000baseCR4_Full, + .advertised = ADVERTISED_40000baseCR4_Full, + .speed = 40000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4, + .supported = SUPPORTED_40000baseKR4_Full, + .advertised = ADVERTISED_40000baseKR4_Full, + .speed = 40000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4, + .supported = SUPPORTED_40000baseSR4_Full, + .advertised = ADVERTISED_40000baseSR4_Full, + .speed = 40000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_40GBASE_LR4_ER4, + .supported = SUPPORTED_40000baseLR4_Full, + .advertised = ADVERTISED_40000baseLR4_Full, + .speed = 40000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_25GBASE_CR | + MLXSW_REG_PTYS_ETH_SPEED_25GBASE_KR | + MLXSW_REG_PTYS_ETH_SPEED_25GBASE_SR, + .speed = 25000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_50GBASE_KR4 | + 
MLXSW_REG_PTYS_ETH_SPEED_50GBASE_CR2 | + MLXSW_REG_PTYS_ETH_SPEED_50GBASE_KR2, + .speed = 50000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4, + .supported = SUPPORTED_56000baseKR4_Full, + .advertised = ADVERTISED_56000baseKR4_Full, + .speed = 56000, + }, + { + .mask = MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_LR4_ER4, + .speed = 100000, + }, +}; + +#define MLXSW_SX_PORT_LINK_MODE_LEN ARRAY_SIZE(mlxsw_sx_port_link_mode) +#define MLXSW_SX_PORT_BASE_SPEED 10000 /* Mb/s */ + +static u32 mlxsw_sx_from_ptys_supported_port(u32 ptys_eth_proto) +{ + if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4 | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 | + MLXSW_REG_PTYS_ETH_SPEED_SGMII)) + return SUPPORTED_FIBRE; + + if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4 | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4 | + MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX)) + return SUPPORTED_Backplane; + return 0; +} + +static u32 mlxsw_sx_from_ptys_supported_link(u32 ptys_eth_proto) +{ + u32 modes = 0; + int i; + + for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) { + if (ptys_eth_proto & mlxsw_sx_port_link_mode[i].mask) + modes |= mlxsw_sx_port_link_mode[i].supported; + } + return modes; +} + +static u32 mlxsw_sx_from_ptys_advert_link(u32 ptys_eth_proto) +{ + u32 modes = 0; + int i; + + for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) { + if (ptys_eth_proto & mlxsw_sx_port_link_mode[i].mask) + modes |= mlxsw_sx_port_link_mode[i].advertised; + } + return modes; +} + +static void mlxsw_sx_from_ptys_speed_duplex(bool carrier_ok, u32 ptys_eth_proto, + struct ethtool_link_ksettings *cmd) +{ + u32 speed = SPEED_UNKNOWN; + u8 duplex = DUPLEX_UNKNOWN; + int i; + + if (!carrier_ok) + goto out; + + for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) { + if (ptys_eth_proto & mlxsw_sx_port_link_mode[i].mask) { + speed = mlxsw_sx_port_link_mode[i].speed; + duplex = DUPLEX_FULL; + break; + } + } +out: + cmd->base.speed = speed; + cmd->base.duplex = duplex; +} + +static u8 mlxsw_sx_port_connector_port(u32 ptys_eth_proto) +{ + if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 | + MLXSW_REG_PTYS_ETH_SPEED_SGMII)) + return PORT_FIBRE; + + if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4)) + return PORT_DA; + + if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR | + MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4 | + MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4 | + MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4)) + return PORT_NONE; + + return PORT_OTHER; +} + +static int +mlxsw_sx_port_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char ptys_pl[MLXSW_REG_PTYS_LEN]; + u32 eth_proto_cap; + u32 eth_proto_admin; + u32 eth_proto_oper; + u32 supported, advertising, lp_advertising; + int err; + + mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sx_port->local_port, 0, false); + err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(ptys), ptys_pl); + if (err) { + netdev_err(dev, "Failed to get proto"); + return err; + } + 
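+ /* Convert the PTYS capability, admin and operational masks into the legacy ethtool link-mode bitmaps reported below. */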
mlxsw_reg_ptys_eth_unpack(ptys_pl, &eth_proto_cap, + &eth_proto_admin, &eth_proto_oper); + + supported = mlxsw_sx_from_ptys_supported_port(eth_proto_cap) | + mlxsw_sx_from_ptys_supported_link(eth_proto_cap) | + SUPPORTED_Pause | SUPPORTED_Asym_Pause; + advertising = mlxsw_sx_from_ptys_advert_link(eth_proto_admin); + mlxsw_sx_from_ptys_speed_duplex(netif_carrier_ok(dev), + eth_proto_oper, cmd); + + eth_proto_oper = eth_proto_oper ? eth_proto_oper : eth_proto_cap; + cmd->base.port = mlxsw_sx_port_connector_port(eth_proto_oper); + lp_advertising = mlxsw_sx_from_ptys_advert_link(eth_proto_oper); + + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, + supported); + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, + advertising); + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.lp_advertising, + lp_advertising); + + return 0; +} + +static u32 mlxsw_sx_to_ptys_advert_link(u32 advertising) +{ + u32 ptys_proto = 0; + int i; + + for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) { + if (advertising & mlxsw_sx_port_link_mode[i].advertised) + ptys_proto |= mlxsw_sx_port_link_mode[i].mask; + } + return ptys_proto; +} + +static u32 mlxsw_sx_to_ptys_speed(u32 speed) +{ + u32 ptys_proto = 0; + int i; + + for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) { + if (speed == mlxsw_sx_port_link_mode[i].speed) + ptys_proto |= mlxsw_sx_port_link_mode[i].mask; + } + return ptys_proto; +} + +static u32 mlxsw_sx_to_ptys_upper_speed(u32 upper_speed) +{ + u32 ptys_proto = 0; + int i; + + for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) { + if (mlxsw_sx_port_link_mode[i].speed <= upper_speed) + ptys_proto |= mlxsw_sx_port_link_mode[i].mask; + } + return ptys_proto; +} + +static int +mlxsw_sx_port_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char ptys_pl[MLXSW_REG_PTYS_LEN]; + u32 speed; + u32 eth_proto_new; + u32 eth_proto_cap; + u32 eth_proto_admin; + u32 advertising; + bool is_up; + int err; + + speed = cmd->base.speed; + + ethtool_convert_link_mode_to_legacy_u32(&advertising, + cmd->link_modes.advertising); + + eth_proto_new = cmd->base.autoneg == AUTONEG_ENABLE ?
+ mlxsw_sx_to_ptys_advert_link(advertising) : + mlxsw_sx_to_ptys_speed(speed); + + mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sx_port->local_port, 0, false); + err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(ptys), ptys_pl); + if (err) { + netdev_err(dev, "Failed to get proto"); + return err; + } + mlxsw_reg_ptys_eth_unpack(ptys_pl, &eth_proto_cap, &eth_proto_admin, + NULL); + + eth_proto_new = eth_proto_new & eth_proto_cap; + if (!eth_proto_new) { + netdev_err(dev, "Not supported proto admin requested"); + return -EINVAL; + } + if (eth_proto_new == eth_proto_admin) + return 0; + + mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sx_port->local_port, + eth_proto_new, true); + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(ptys), ptys_pl); + if (err) { + netdev_err(dev, "Failed to set proto admin"); + return err; + } + + err = mlxsw_sx_port_oper_status_get(mlxsw_sx_port, &is_up); + if (err) { + netdev_err(dev, "Failed to get oper status"); + return err; + } + if (!is_up) + return 0; + + err = mlxsw_sx_port_admin_status_set(mlxsw_sx_port, false); + if (err) { + netdev_err(dev, "Failed to set admin status"); + return err; + } + + err = mlxsw_sx_port_admin_status_set(mlxsw_sx_port, true); + if (err) { + netdev_err(dev, "Failed to set admin status"); + return err; + } + + return 0; +} + +static const struct ethtool_ops mlxsw_sx_port_ethtool_ops = { + .get_drvinfo = mlxsw_sx_port_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_strings = mlxsw_sx_port_get_strings, + .get_ethtool_stats = mlxsw_sx_port_get_stats, + .get_sset_count = mlxsw_sx_port_get_sset_count, + .get_link_ksettings = mlxsw_sx_port_get_link_ksettings, + .set_link_ksettings = mlxsw_sx_port_set_link_ksettings, +}; + +static int mlxsw_sx_port_attr_get(struct net_device *dev, + struct switchdev_attr *attr) +{ + struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev); + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + + switch (attr->id) { + case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: + attr->u.ppid.id_len = sizeof(mlxsw_sx->hw_id); + memcpy(&attr->u.ppid.id, &mlxsw_sx->hw_id, attr->u.ppid.id_len); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static const struct switchdev_ops mlxsw_sx_port_switchdev_ops = { + .switchdev_port_attr_get = mlxsw_sx_port_attr_get, +}; + +static int mlxsw_sx_hw_id_get(struct mlxsw_sx *mlxsw_sx) +{ + char spad_pl[MLXSW_REG_SPAD_LEN] = {0}; + int err; + + err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(spad), spad_pl); + if (err) + return err; + mlxsw_reg_spad_base_mac_memcpy_from(spad_pl, mlxsw_sx->hw_id); + return 0; +} + +static int mlxsw_sx_port_dev_addr_get(struct mlxsw_sx_port *mlxsw_sx_port) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + struct net_device *dev = mlxsw_sx_port->dev; + char ppad_pl[MLXSW_REG_PPAD_LEN]; + int err; + + mlxsw_reg_ppad_pack(ppad_pl, false, 0); + err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(ppad), ppad_pl); + if (err) + return err; + mlxsw_reg_ppad_mac_memcpy_from(ppad_pl, dev->dev_addr); + /* The last byte value in base mac address is guaranteed + * to be such it does not overflow when adding local_port + * value.
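+ * This gives each front panel port a unique MAC derived from the switch base MAC address.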
+ */ + dev->dev_addr[ETH_ALEN - 1] += mlxsw_sx_port->local_port; + return 0; +} + +static int mlxsw_sx_port_stp_state_set(struct mlxsw_sx_port *mlxsw_sx_port, + u16 vid, enum mlxsw_reg_spms_state state) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char *spms_pl; + int err; + + spms_pl = kmalloc(MLXSW_REG_SPMS_LEN, GFP_KERNEL); + if (!spms_pl) + return -ENOMEM; + mlxsw_reg_spms_pack(spms_pl, mlxsw_sx_port->local_port); + mlxsw_reg_spms_vid_pack(spms_pl, vid, state); + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(spms), spms_pl); + kfree(spms_pl); + return err; +} + +static int mlxsw_sx_port_ib_speed_set(struct mlxsw_sx_port *mlxsw_sx_port, + u16 speed, u16 width) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char ptys_pl[MLXSW_REG_PTYS_LEN]; + + mlxsw_reg_ptys_ib_pack(ptys_pl, mlxsw_sx_port->local_port, speed, + width); + return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(ptys), ptys_pl); +} + +static int +mlxsw_sx_port_speed_by_width_set(struct mlxsw_sx_port *mlxsw_sx_port, u8 width) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + u32 upper_speed = MLXSW_SX_PORT_BASE_SPEED * width; + char ptys_pl[MLXSW_REG_PTYS_LEN]; + u32 eth_proto_admin; + + eth_proto_admin = mlxsw_sx_to_ptys_upper_speed(upper_speed); + mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sx_port->local_port, + eth_proto_admin, true); + return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(ptys), ptys_pl); +} + +static int +mlxsw_sx_port_mac_learning_mode_set(struct mlxsw_sx_port *mlxsw_sx_port, + enum mlxsw_reg_spmlr_learn_mode mode) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx; + char spmlr_pl[MLXSW_REG_SPMLR_LEN]; + + mlxsw_reg_spmlr_pack(spmlr_pl, mlxsw_sx_port->local_port, mode); + return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(spmlr), spmlr_pl); +} + +static int __mlxsw_sx_port_eth_create(struct mlxsw_sx *mlxsw_sx, u8 local_port, + u8 module, u8 width) +{ + struct mlxsw_sx_port *mlxsw_sx_port; + struct net_device *dev; + int err; + + dev = alloc_etherdev(sizeof(struct mlxsw_sx_port)); + if (!dev) + return -ENOMEM; + SET_NETDEV_DEV(dev, mlxsw_sx->bus_info->dev); + mlxsw_sx_port = netdev_priv(dev); + mlxsw_sx_port->dev = dev; + mlxsw_sx_port->mlxsw_sx = mlxsw_sx; + mlxsw_sx_port->local_port = local_port; + mlxsw_sx_port->mapping.module = module; + + mlxsw_sx_port->pcpu_stats = + netdev_alloc_pcpu_stats(struct mlxsw_sx_port_pcpu_stats); + if (!mlxsw_sx_port->pcpu_stats) { + err = -ENOMEM; + goto err_alloc_stats; + } + + dev->netdev_ops = &mlxsw_sx_port_netdev_ops; + dev->ethtool_ops = &mlxsw_sx_port_ethtool_ops; + dev->switchdev_ops = &mlxsw_sx_port_switchdev_ops; + + err = mlxsw_sx_port_dev_addr_get(mlxsw_sx_port); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Unable to get port mac address\n", + mlxsw_sx_port->local_port); + goto err_dev_addr_get; + } + + netif_carrier_off(dev); + + dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_LLTX | NETIF_F_SG | + NETIF_F_VLAN_CHALLENGED; + + dev->min_mtu = 0; + dev->max_mtu = ETH_MAX_MTU; + + /* Each packet needs to have a Tx header (metadata) on top all other + * headers. 
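+ * Reserving the headroom up front lets the xmit path push the Tx header without reallocating the skb in the common case.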
+ */ + dev->needed_headroom = MLXSW_TXHDR_LEN; + + err = mlxsw_sx_port_system_port_mapping_set(mlxsw_sx_port); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set system port mapping\n", + mlxsw_sx_port->local_port); + goto err_port_system_port_mapping_set; + } + + err = mlxsw_sx_port_swid_set(mlxsw_sx_port, 0); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set SWID\n", + mlxsw_sx_port->local_port); + goto err_port_swid_set; + } + + err = mlxsw_sx_port_speed_by_width_set(mlxsw_sx_port, width); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set speed\n", + mlxsw_sx_port->local_port); + goto err_port_speed_set; + } + + err = mlxsw_sx_port_mtu_eth_set(mlxsw_sx_port, ETH_DATA_LEN); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set MTU\n", + mlxsw_sx_port->local_port); + goto err_port_mtu_set; + } + + err = mlxsw_sx_port_admin_status_set(mlxsw_sx_port, false); + if (err) + goto err_port_admin_status_set; + + err = mlxsw_sx_port_stp_state_set(mlxsw_sx_port, + MLXSW_PORT_DEFAULT_VID, + MLXSW_REG_SPMS_STATE_FORWARDING); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set STP state\n", + mlxsw_sx_port->local_port); + goto err_port_stp_state_set; + } + + err = mlxsw_sx_port_mac_learning_mode_set(mlxsw_sx_port, + MLXSW_REG_SPMLR_LEARN_MODE_DISABLE); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set MAC learning mode\n", + mlxsw_sx_port->local_port); + goto err_port_mac_learning_mode_set; + } + + err = register_netdev(dev); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to register netdev\n", + mlxsw_sx_port->local_port); + goto err_register_netdev; + } + + mlxsw_core_port_eth_set(mlxsw_sx->core, mlxsw_sx_port->local_port, + mlxsw_sx_port, dev, false, 0); + mlxsw_sx->ports[local_port] = mlxsw_sx_port; + return 0; + +err_register_netdev: +err_port_mac_learning_mode_set: +err_port_stp_state_set: +err_port_admin_status_set: +err_port_mtu_set: +err_port_speed_set: + mlxsw_sx_port_swid_set(mlxsw_sx_port, MLXSW_PORT_SWID_DISABLED_PORT); +err_port_swid_set: +err_port_system_port_mapping_set: +err_dev_addr_get: + free_percpu(mlxsw_sx_port->pcpu_stats); +err_alloc_stats: + free_netdev(dev); + return err; +} + +static int mlxsw_sx_port_eth_create(struct mlxsw_sx *mlxsw_sx, u8 local_port, + u8 module, u8 width) +{ + int err; + + err = mlxsw_core_port_init(mlxsw_sx->core, local_port); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to init core port\n", + local_port); + return err; + } + err = __mlxsw_sx_port_eth_create(mlxsw_sx, local_port, module, width); + if (err) + goto err_port_create; + + return 0; + +err_port_create: + mlxsw_core_port_fini(mlxsw_sx->core, local_port); + return err; +} + +static void __mlxsw_sx_port_eth_remove(struct mlxsw_sx *mlxsw_sx, u8 local_port) +{ + struct mlxsw_sx_port *mlxsw_sx_port = mlxsw_sx->ports[local_port]; + + mlxsw_core_port_clear(mlxsw_sx->core, local_port, mlxsw_sx); + unregister_netdev(mlxsw_sx_port->dev); /* This calls ndo_stop */ + mlxsw_sx->ports[local_port] = NULL; + mlxsw_sx_port_swid_set(mlxsw_sx_port, MLXSW_PORT_SWID_DISABLED_PORT); + free_percpu(mlxsw_sx_port->pcpu_stats); + free_netdev(mlxsw_sx_port->dev); +} + +static bool mlxsw_sx_port_created(struct mlxsw_sx *mlxsw_sx, u8 local_port) +{ + return mlxsw_sx->ports[local_port] != NULL; +} + +static int __mlxsw_sx_port_ib_create(struct mlxsw_sx *mlxsw_sx, u8 local_port, + u8 module, u8 width) +{ + struct mlxsw_sx_port *mlxsw_sx_port; + int err; + + 
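+ /* IB ports are not backed by a netdev; only the private port structure is allocated and the port is registered with the core as an IB port. */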
mlxsw_sx_port = kzalloc(sizeof(*mlxsw_sx_port), GFP_KERNEL); + if (!mlxsw_sx_port) + return -ENOMEM; + mlxsw_sx_port->mlxsw_sx = mlxsw_sx; + mlxsw_sx_port->local_port = local_port; + mlxsw_sx_port->mapping.module = module; + + err = mlxsw_sx_port_system_port_mapping_set(mlxsw_sx_port); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set system port mapping\n", + mlxsw_sx_port->local_port); + goto err_port_system_port_mapping_set; + } + + /* Adding port to Infiniband swid (1) */ + err = mlxsw_sx_port_swid_set(mlxsw_sx_port, 1); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set SWID\n", + mlxsw_sx_port->local_port); + goto err_port_swid_set; + } + + /* Expose the IB port number as it's front panel name */ + err = mlxsw_sx_port_ib_port_set(mlxsw_sx_port, module + 1); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set IB port\n", + mlxsw_sx_port->local_port); + goto err_port_ib_set; + } + + /* Supports all speeds from SDR to FDR (bitmask) and support bus width + * of 1x, 2x and 4x (3 bits bitmask) + */ + err = mlxsw_sx_port_ib_speed_set(mlxsw_sx_port, + MLXSW_REG_PTYS_IB_SPEED_EDR - 1, + BIT(3) - 1); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set speed\n", + mlxsw_sx_port->local_port); + goto err_port_speed_set; + } + + /* Change to the maximum MTU the device supports, the SMA will take + * care of the active MTU + */ + err = mlxsw_sx_port_mtu_ib_set(mlxsw_sx_port, MLXSW_IB_DEFAULT_MTU); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set MTU\n", + mlxsw_sx_port->local_port); + goto err_port_mtu_set; + } + + err = mlxsw_sx_port_admin_status_set(mlxsw_sx_port, true); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to change admin state to UP\n", + mlxsw_sx_port->local_port); + goto err_port_admin_set; + } + + mlxsw_core_port_ib_set(mlxsw_sx->core, mlxsw_sx_port->local_port, + mlxsw_sx_port); + mlxsw_sx->ports[local_port] = mlxsw_sx_port; + return 0; + +err_port_admin_set: +err_port_mtu_set: +err_port_speed_set: +err_port_ib_set: + mlxsw_sx_port_swid_set(mlxsw_sx_port, MLXSW_PORT_SWID_DISABLED_PORT); +err_port_swid_set: +err_port_system_port_mapping_set: + kfree(mlxsw_sx_port); + return err; +} + +static void __mlxsw_sx_port_ib_remove(struct mlxsw_sx *mlxsw_sx, u8 local_port) +{ + struct mlxsw_sx_port *mlxsw_sx_port = mlxsw_sx->ports[local_port]; + + mlxsw_core_port_clear(mlxsw_sx->core, local_port, mlxsw_sx); + mlxsw_sx->ports[local_port] = NULL; + mlxsw_sx_port_admin_status_set(mlxsw_sx_port, false); + mlxsw_sx_port_swid_set(mlxsw_sx_port, MLXSW_PORT_SWID_DISABLED_PORT); + kfree(mlxsw_sx_port); +} + +static void __mlxsw_sx_port_remove(struct mlxsw_sx *mlxsw_sx, u8 local_port) +{ + enum devlink_port_type port_type = + mlxsw_core_port_type_get(mlxsw_sx->core, local_port); + + if (port_type == DEVLINK_PORT_TYPE_ETH) + __mlxsw_sx_port_eth_remove(mlxsw_sx, local_port); + else if (port_type == DEVLINK_PORT_TYPE_IB) + __mlxsw_sx_port_ib_remove(mlxsw_sx, local_port); +} + +static void mlxsw_sx_port_remove(struct mlxsw_sx *mlxsw_sx, u8 local_port) +{ + __mlxsw_sx_port_remove(mlxsw_sx, local_port); + mlxsw_core_port_fini(mlxsw_sx->core, local_port); +} + +static void mlxsw_sx_ports_remove(struct mlxsw_sx *mlxsw_sx) +{ + int i; + + for (i = 1; i < mlxsw_core_max_ports(mlxsw_sx->core); i++) + if (mlxsw_sx_port_created(mlxsw_sx, i)) + mlxsw_sx_port_remove(mlxsw_sx, i); + kfree(mlxsw_sx->ports); +} + +static int mlxsw_sx_ports_create(struct mlxsw_sx *mlxsw_sx) +{ + unsigned int 
max_ports = mlxsw_core_max_ports(mlxsw_sx->core); + size_t alloc_size; + u8 module, width; + int i; + int err; + + alloc_size = sizeof(struct mlxsw_sx_port *) * max_ports; + mlxsw_sx->ports = kzalloc(alloc_size, GFP_KERNEL); + if (!mlxsw_sx->ports) + return -ENOMEM; + + for (i = 1; i < max_ports; i++) { + err = mlxsw_sx_port_module_info_get(mlxsw_sx, i, &module, + &width); + if (err) + goto err_port_module_info_get; + if (!width) + continue; + err = mlxsw_sx_port_eth_create(mlxsw_sx, i, module, width); + if (err) + goto err_port_create; + } + return 0; + +err_port_create: +err_port_module_info_get: + for (i--; i >= 1; i--) + if (mlxsw_sx_port_created(mlxsw_sx, i)) + mlxsw_sx_port_remove(mlxsw_sx, i); + kfree(mlxsw_sx->ports); + return err; +} + +static void mlxsw_sx_pude_eth_event_func(struct mlxsw_sx_port *mlxsw_sx_port, + enum mlxsw_reg_pude_oper_status status) +{ + if (status == MLXSW_PORT_OPER_STATUS_UP) { + netdev_info(mlxsw_sx_port->dev, "link up\n"); + netif_carrier_on(mlxsw_sx_port->dev); + } else { + netdev_info(mlxsw_sx_port->dev, "link down\n"); + netif_carrier_off(mlxsw_sx_port->dev); + } +} + +static void mlxsw_sx_pude_ib_event_func(struct mlxsw_sx_port *mlxsw_sx_port, + enum mlxsw_reg_pude_oper_status status) +{ + if (status == MLXSW_PORT_OPER_STATUS_UP) + pr_info("ib link for port %d - up\n", + mlxsw_sx_port->mapping.module + 1); + else + pr_info("ib link for port %d - down\n", + mlxsw_sx_port->mapping.module + 1); +} + +static void mlxsw_sx_pude_event_func(const struct mlxsw_reg_info *reg, + char *pude_pl, void *priv) +{ + struct mlxsw_sx *mlxsw_sx = priv; + struct mlxsw_sx_port *mlxsw_sx_port; + enum mlxsw_reg_pude_oper_status status; + enum devlink_port_type port_type; + u8 local_port; + + local_port = mlxsw_reg_pude_local_port_get(pude_pl); + mlxsw_sx_port = mlxsw_sx->ports[local_port]; + if (!mlxsw_sx_port) { + dev_warn(mlxsw_sx->bus_info->dev, "Port %d: Link event received for non-existent port\n", + local_port); + return; + } + + status = mlxsw_reg_pude_oper_status_get(pude_pl); + port_type = mlxsw_core_port_type_get(mlxsw_sx->core, local_port); + if (port_type == DEVLINK_PORT_TYPE_ETH) + mlxsw_sx_pude_eth_event_func(mlxsw_sx_port, status); + else if (port_type == DEVLINK_PORT_TYPE_IB) + mlxsw_sx_pude_ib_event_func(mlxsw_sx_port, status); +} + +static void mlxsw_sx_rx_listener_func(struct sk_buff *skb, u8 local_port, + void *priv) +{ + struct mlxsw_sx *mlxsw_sx = priv; + struct mlxsw_sx_port *mlxsw_sx_port = mlxsw_sx->ports[local_port]; + struct mlxsw_sx_port_pcpu_stats *pcpu_stats; + + if (unlikely(!mlxsw_sx_port)) { + dev_warn_ratelimited(mlxsw_sx->bus_info->dev, "Port %d: skb received for non-existent port\n", + local_port); + return; + } + + skb->dev = mlxsw_sx_port->dev; + + pcpu_stats = this_cpu_ptr(mlxsw_sx_port->pcpu_stats); + u64_stats_update_begin(&pcpu_stats->syncp); + pcpu_stats->rx_packets++; + pcpu_stats->rx_bytes += skb->len; + u64_stats_update_end(&pcpu_stats->syncp); + + skb->protocol = eth_type_trans(skb, skb->dev); + netif_receive_skb(skb); +} + +static int mlxsw_sx_port_type_set(struct mlxsw_core *mlxsw_core, u8 local_port, + enum devlink_port_type new_type) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_core_driver_priv(mlxsw_core); + u8 module, width; + int err; + + if (new_type == DEVLINK_PORT_TYPE_AUTO) + return -EOPNOTSUPP; + + __mlxsw_sx_port_remove(mlxsw_sx, local_port); + err = mlxsw_sx_port_module_info_get(mlxsw_sx, local_port, &module, + &width); + if (err) + goto err_port_module_info_get; + + if (new_type == DEVLINK_PORT_TYPE_ETH) + err = 
__mlxsw_sx_port_eth_create(mlxsw_sx, local_port, module, + width); + else if (new_type == DEVLINK_PORT_TYPE_IB) + err = __mlxsw_sx_port_ib_create(mlxsw_sx, local_port, module, + width); + +err_port_module_info_get: + return err; +} + +#define MLXSW_SX_RXL(_trap_id) \ + MLXSW_RXL(mlxsw_sx_rx_listener_func, _trap_id, TRAP_TO_CPU, \ + false, SX2_RX, FORWARD) + +static const struct mlxsw_listener mlxsw_sx_listener[] = { + MLXSW_EVENTL(mlxsw_sx_pude_event_func, PUDE, EMAD), + MLXSW_SX_RXL(FDB_MC), + MLXSW_SX_RXL(STP), + MLXSW_SX_RXL(LACP), + MLXSW_SX_RXL(EAPOL), + MLXSW_SX_RXL(LLDP), + MLXSW_SX_RXL(MMRP), + MLXSW_SX_RXL(MVRP), + MLXSW_SX_RXL(RPVST), + MLXSW_SX_RXL(DHCP), + MLXSW_SX_RXL(IGMP_QUERY), + MLXSW_SX_RXL(IGMP_V1_REPORT), + MLXSW_SX_RXL(IGMP_V2_REPORT), + MLXSW_SX_RXL(IGMP_V2_LEAVE), + MLXSW_SX_RXL(IGMP_V3_REPORT), +}; + +static int mlxsw_sx_traps_init(struct mlxsw_sx *mlxsw_sx) +{ + char htgt_pl[MLXSW_REG_HTGT_LEN]; + int i; + int err; + + mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_SX2_RX, + MLXSW_REG_HTGT_INVALID_POLICER, + MLXSW_REG_HTGT_DEFAULT_PRIORITY, + MLXSW_REG_HTGT_DEFAULT_TC); + mlxsw_reg_htgt_local_path_rdq_set(htgt_pl, + MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_RX); + + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(htgt), htgt_pl); + if (err) + return err; + + mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_SX2_CTRL, + MLXSW_REG_HTGT_INVALID_POLICER, + MLXSW_REG_HTGT_DEFAULT_PRIORITY, + MLXSW_REG_HTGT_DEFAULT_TC); + mlxsw_reg_htgt_local_path_rdq_set(htgt_pl, + MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_CTRL); + + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(htgt), htgt_pl); + if (err) + return err; + + for (i = 0; i < ARRAY_SIZE(mlxsw_sx_listener); i++) { + err = mlxsw_core_trap_register(mlxsw_sx->core, + &mlxsw_sx_listener[i], + mlxsw_sx); + if (err) + goto err_listener_register; + + } + return 0; + +err_listener_register: + for (i--; i >= 0; i--) { + mlxsw_core_trap_unregister(mlxsw_sx->core, + &mlxsw_sx_listener[i], + mlxsw_sx); + } + return err; +} + +static void mlxsw_sx_traps_fini(struct mlxsw_sx *mlxsw_sx) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mlxsw_sx_listener); i++) { + mlxsw_core_trap_unregister(mlxsw_sx->core, + &mlxsw_sx_listener[i], + mlxsw_sx); + } +} + +static int mlxsw_sx_flood_init(struct mlxsw_sx *mlxsw_sx) +{ + char sfgc_pl[MLXSW_REG_SFGC_LEN]; + char sgcr_pl[MLXSW_REG_SGCR_LEN]; + char *sftr_pl; + int err; + + /* Configure a flooding table, which includes only CPU port. */ + sftr_pl = kmalloc(MLXSW_REG_SFTR_LEN, GFP_KERNEL); + if (!sftr_pl) + return -ENOMEM; + mlxsw_reg_sftr_pack(sftr_pl, 0, 0, MLXSW_REG_SFGC_TABLE_TYPE_SINGLE, 0, + MLXSW_PORT_CPU_PORT, true); + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sftr), sftr_pl); + kfree(sftr_pl); + if (err) + return err; + + /* Flood different packet types using the flooding table. 
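+ * All entries below point at the same single-entry table (index 0), so unknown traffic is flooded only to the CPU port.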
*/ + mlxsw_reg_sfgc_pack(sfgc_pl, + MLXSW_REG_SFGC_TYPE_UNKNOWN_UNICAST, + MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID, + MLXSW_REG_SFGC_TABLE_TYPE_SINGLE, + 0); + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sfgc), sfgc_pl); + if (err) + return err; + + mlxsw_reg_sfgc_pack(sfgc_pl, + MLXSW_REG_SFGC_TYPE_BROADCAST, + MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID, + MLXSW_REG_SFGC_TABLE_TYPE_SINGLE, + 0); + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sfgc), sfgc_pl); + if (err) + return err; + + mlxsw_reg_sfgc_pack(sfgc_pl, + MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_NON_IP, + MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID, + MLXSW_REG_SFGC_TABLE_TYPE_SINGLE, + 0); + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sfgc), sfgc_pl); + if (err) + return err; + + mlxsw_reg_sfgc_pack(sfgc_pl, + MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV6, + MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID, + MLXSW_REG_SFGC_TABLE_TYPE_SINGLE, + 0); + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sfgc), sfgc_pl); + if (err) + return err; + + mlxsw_reg_sfgc_pack(sfgc_pl, + MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV4, + MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID, + MLXSW_REG_SFGC_TABLE_TYPE_SINGLE, + 0); + err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sfgc), sfgc_pl); + if (err) + return err; + + mlxsw_reg_sgcr_pack(sgcr_pl, true); + return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sgcr), sgcr_pl); +} + +static int mlxsw_sx_basic_trap_groups_set(struct mlxsw_core *mlxsw_core) +{ + char htgt_pl[MLXSW_REG_HTGT_LEN]; + + mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_EMAD, + MLXSW_REG_HTGT_INVALID_POLICER, + MLXSW_REG_HTGT_DEFAULT_PRIORITY, + MLXSW_REG_HTGT_DEFAULT_TC); + mlxsw_reg_htgt_swid_set(htgt_pl, MLXSW_PORT_SWID_ALL_SWIDS); + mlxsw_reg_htgt_local_path_rdq_set(htgt_pl, + MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_EMAD); + return mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl); +} + +static int mlxsw_sx_init(struct mlxsw_core *mlxsw_core, + const struct mlxsw_bus_info *mlxsw_bus_info) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_core_driver_priv(mlxsw_core); + int err; + + mlxsw_sx->core = mlxsw_core; + mlxsw_sx->bus_info = mlxsw_bus_info; + + err = mlxsw_sx_hw_id_get(mlxsw_sx); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Failed to get switch HW ID\n"); + return err; + } + + err = mlxsw_sx_ports_create(mlxsw_sx); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Failed to create ports\n"); + return err; + } + + err = mlxsw_sx_traps_init(mlxsw_sx); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Failed to set traps\n"); + goto err_listener_register; + } + + err = mlxsw_sx_flood_init(mlxsw_sx); + if (err) { + dev_err(mlxsw_sx->bus_info->dev, "Failed to initialize flood tables\n"); + goto err_flood_init; + } + + return 0; + +err_flood_init: + mlxsw_sx_traps_fini(mlxsw_sx); +err_listener_register: + mlxsw_sx_ports_remove(mlxsw_sx); + return err; +} + +static void mlxsw_sx_fini(struct mlxsw_core *mlxsw_core) +{ + struct mlxsw_sx *mlxsw_sx = mlxsw_core_driver_priv(mlxsw_core); + + mlxsw_sx_traps_fini(mlxsw_sx); + mlxsw_sx_ports_remove(mlxsw_sx); +} + +static const struct mlxsw_config_profile mlxsw_sx_config_profile = { + .used_max_vepa_channels = 1, + .max_vepa_channels = 0, + .used_max_mid = 1, + .max_mid = 7000, + .used_max_pgt = 1, + .max_pgt = 0, + .used_max_system_port = 1, + .max_system_port = 48000, + .used_max_vlan_groups = 1, + .max_vlan_groups = 127, + .used_max_regions = 1, + .max_regions = 400, + .used_flood_tables = 1, + .max_flood_tables = 2, + .max_vid_flood_tables = 1, + .used_flood_mode = 1, + .flood_mode = 3, + .used_max_ib_mc = 1, + 
.max_ib_mc = 6, + .used_max_pkey = 1, + .max_pkey = 0, + .swid_config = { + { + .used_type = 1, + .type = MLXSW_PORT_SWID_TYPE_ETH, + }, + { + .used_type = 1, + .type = MLXSW_PORT_SWID_TYPE_IB, + } + }, +}; + +static struct mlxsw_driver mlxsw_sx_driver = { + .kind = mlxsw_sx_driver_name, + .priv_size = sizeof(struct mlxsw_sx), + .init = mlxsw_sx_init, + .fini = mlxsw_sx_fini, + .basic_trap_groups_set = mlxsw_sx_basic_trap_groups_set, + .txhdr_construct = mlxsw_sx_txhdr_construct, + .txhdr_len = MLXSW_TXHDR_LEN, + .profile = &mlxsw_sx_config_profile, + .port_type_set = mlxsw_sx_port_type_set, +}; + +static const struct pci_device_id mlxsw_sx_pci_id_table[] = { + {PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_SWITCHX2), 0}, + {0, }, +}; + +static struct pci_driver mlxsw_sx_pci_driver = { + .name = mlxsw_sx_driver_name, + .id_table = mlxsw_sx_pci_id_table, +}; + +static int __init mlxsw_sx_module_init(void) +{ + int err; + + err = mlxsw_core_driver_register(&mlxsw_sx_driver); + if (err) + return err; + + err = mlxsw_pci_driver_register(&mlxsw_sx_pci_driver); + if (err) + goto err_pci_driver_register; + + return 0; + +err_pci_driver_register: + mlxsw_core_driver_unregister(&mlxsw_sx_driver); + return err; +} + +static void __exit mlxsw_sx_module_exit(void) +{ + mlxsw_pci_driver_unregister(&mlxsw_sx_pci_driver); + mlxsw_core_driver_unregister(&mlxsw_sx_driver); +} + +module_init(mlxsw_sx_module_init); +module_exit(mlxsw_sx_module_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("Mellanox SwitchX-2 driver"); +MODULE_DEVICE_TABLE(pci, mlxsw_sx_pci_id_table); diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h new file mode 100644 index 0000000..399e9d6 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h @@ -0,0 +1,108 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/trap.h + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * Copyright (c) 2015 Elad Raz + * Copyright (c) 2015 Jiri Pirko + * Copyright (c) 2015 Ido Schimmel + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _MLXSW_TRAP_H +#define _MLXSW_TRAP_H + +enum { + /* Ethernet EMAD and FDB miss */ + MLXSW_TRAP_ID_FDB_MC = 0x01, + MLXSW_TRAP_ID_ETHEMAD = 0x05, + /* L2 traps for specific packet types */ + MLXSW_TRAP_ID_STP = 0x10, + MLXSW_TRAP_ID_LACP = 0x11, + MLXSW_TRAP_ID_EAPOL = 0x12, + MLXSW_TRAP_ID_LLDP = 0x13, + MLXSW_TRAP_ID_MMRP = 0x14, + MLXSW_TRAP_ID_MVRP = 0x15, + MLXSW_TRAP_ID_RPVST = 0x16, + MLXSW_TRAP_ID_DHCP = 0x19, + MLXSW_TRAP_ID_IGMP_QUERY = 0x30, + MLXSW_TRAP_ID_IGMP_V1_REPORT = 0x31, + MLXSW_TRAP_ID_IGMP_V2_REPORT = 0x32, + MLXSW_TRAP_ID_IGMP_V2_LEAVE = 0x33, + MLXSW_TRAP_ID_IGMP_V3_REPORT = 0x34, + MLXSW_TRAP_ID_PKT_SAMPLE = 0x38, + MLXSW_TRAP_ID_FID_MISS = 0x3D, + MLXSW_TRAP_ID_ARPBC = 0x50, + MLXSW_TRAP_ID_ARPUC = 0x51, + MLXSW_TRAP_ID_MTUERROR = 0x52, + MLXSW_TRAP_ID_TTLERROR = 0x53, + MLXSW_TRAP_ID_LBERROR = 0x54, + MLXSW_TRAP_ID_IPV4_OSPF = 0x55, + MLXSW_TRAP_ID_IPV4_PIM = 0x58, + MLXSW_TRAP_ID_RPF = 0x5C, + MLXSW_TRAP_ID_IP2ME = 0x5F, + MLXSW_TRAP_ID_IPV6_UNSPECIFIED_ADDRESS = 0x60, + MLXSW_TRAP_ID_IPV6_LINK_LOCAL_DEST = 0x61, + MLXSW_TRAP_ID_IPV6_LINK_LOCAL_SRC = 0x62, + MLXSW_TRAP_ID_IPV6_ALL_NODES_LINK = 0x63, + MLXSW_TRAP_ID_IPV6_OSPF = 0x64, + MLXSW_TRAP_ID_IPV6_MLDV12_LISTENER_QUERY = 0x65, + MLXSW_TRAP_ID_IPV6_MLDV1_LISTENER_REPORT = 0x66, + MLXSW_TRAP_ID_IPV6_MLDV1_LISTENER_DONE = 0x67, + MLXSW_TRAP_ID_IPV6_MLDV2_LISTENER_REPORT = 0x68, + MLXSW_TRAP_ID_IPV6_DHCP = 0x69, + MLXSW_TRAP_ID_IPV6_ALL_ROUTERS_LINK = 0x6F, + MLXSW_TRAP_ID_RTR_INGRESS0 = 0x70, + MLXSW_TRAP_ID_IPV6_PIM = 0x79, + MLXSW_TRAP_ID_IPV4_BGP = 0x88, + MLXSW_TRAP_ID_IPV6_BGP = 0x89, + MLXSW_TRAP_ID_L3_IPV6_ROUTER_SOLICITATION = 0x8A, + MLXSW_TRAP_ID_L3_IPV6_ROUTER_ADVERTISMENT = 0x8B, + MLXSW_TRAP_ID_L3_IPV6_NEIGHBOR_SOLICITATION = 0x8C, + MLXSW_TRAP_ID_L3_IPV6_NEIGHBOR_ADVERTISMENT = 0x8D, + MLXSW_TRAP_ID_L3_IPV6_REDIRECTION = 0x8E, + MLXSW_TRAP_ID_HOST_MISS_IPV4 = 0x90, + MLXSW_TRAP_ID_IPV6_MC_LINK_LOCAL_DEST = 0x91, + MLXSW_TRAP_ID_HOST_MISS_IPV6 = 0x92, + MLXSW_TRAP_ID_IPIP_DECAP_ERROR = 0xB1, + MLXSW_TRAP_ID_ROUTER_ALERT_IPV4 = 0xD6, + MLXSW_TRAP_ID_ROUTER_ALERT_IPV6 = 0xD7, + MLXSW_TRAP_ID_ACL0 = 0x1C0, + /* Multicast trap used for routes with trap action */ + MLXSW_TRAP_ID_ACL1 = 0x1C1, + /* Multicast trap used for routes with trap-and-forward action */ + MLXSW_TRAP_ID_ACL2 = 0x1C2, + + MLXSW_TRAP_ID_MAX = 0x1FF +}; + +enum mlxsw_event_trap_id { + /* Port Up/Down event generated by hardware */ + MLXSW_TRAP_ID_PUDE = 0x8, +}; + +#endif /* _MLXSW_TRAP_H */ diff --git a/drivers/net/ethernet/mellanox/mlxsw/txheader.h b/drivers/net/ethernet/mellanox/mlxsw/txheader.h new file mode 100644 index 0000000..fdf9472 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/txheader.h @@ -0,0 +1,81 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/txheader.h + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2015 Ido Schimmel + * Copyright (c) 2015 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_TXHEADER_H +#define _MLXSW_TXHEADER_H + +#define MLXSW_TXHDR_LEN 0x10 +#define MLXSW_TXHDR_VERSION_0 0 +#define MLXSW_TXHDR_VERSION_1 1 + +enum { + MLXSW_TXHDR_ETH_CTL, + MLXSW_TXHDR_ETH_DATA, +}; + +#define MLXSW_TXHDR_PROTO_ETH 1 + +enum { + MLXSW_TXHDR_ETCLASS_0, + MLXSW_TXHDR_ETCLASS_1, + MLXSW_TXHDR_ETCLASS_2, + MLXSW_TXHDR_ETCLASS_3, + MLXSW_TXHDR_ETCLASS_4, + MLXSW_TXHDR_ETCLASS_5, + MLXSW_TXHDR_ETCLASS_6, + MLXSW_TXHDR_ETCLASS_7, +}; + +enum { + MLXSW_TXHDR_RDQ_OTHER, + MLXSW_TXHDR_RDQ_EMAD = 0x1f, +}; + +#define MLXSW_TXHDR_CTCLASS3 0 +#define MLXSW_TXHDR_CPU_SIG 0 +#define MLXSW_TXHDR_SIG 0xE0E0 +#define MLXSW_TXHDR_STCLASS_NONE 0 + +enum { + MLXSW_TXHDR_NOT_EMAD, + MLXSW_TXHDR_EMAD, +}; + +enum { + MLXSW_TXHDR_TYPE_DATA, + MLXSW_TXHDR_TYPE_CONTROL = 6, +}; + +#endif diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig new file mode 100644 index 0000000..0ee2490 --- /dev/null +++ b/drivers/net/ethernet/qlogic/Kconfig @@ -0,0 +1,124 @@ +# +# QLogic network device configuration +# + +config NET_VENDOR_QLOGIC + bool "QLogic devices" + default y + depends on PCI + ---help--- + If you have a network (Ethernet) card belonging to this class, say Y. + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about QLogic cards. If you say Y, you will be asked for + your specific card in the following questions. + +if NET_VENDOR_QLOGIC + +config QLA3XXX + tristate "QLogic QLA3XXX Network Driver Support" + depends on PCI + ---help--- + This driver supports QLogic ISP3XXX gigabit Ethernet cards. + + To compile this driver as a module, choose M here: the module + will be called qla3xxx. 
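+# Illustrative .config fragment (assumed, not part of the upstream Kconfig):
+# with NET_VENDOR_QLOGIC enabled, the tristate drivers declared in this file
+# can be built as modules, for example:
+#   CONFIG_NET_VENDOR_QLOGIC=y
+#   CONFIG_QLA3XXX=m
+#   CONFIG_QLCNIC=m
+#   CONFIG_QED=m
+#   CONFIG_QEDE=m
+# The symbol names come from the entries in this file; QEDE additionally
+# depends on QED, as declared below.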
+ +config QLCNIC + tristate "QLOGIC QLCNIC 1/10Gb Converged Ethernet NIC Support" + depends on PCI + select FW_LOADER + ---help--- + This driver supports QLogic QLE8240 and QLE8242 Converged Ethernet + devices. + +config QLCNIC_SRIOV + bool "QLOGIC QLCNIC 83XX family SR-IOV Support" + depends on QLCNIC && PCI_IOV + default y + ---help--- + This configuration parameter enables Single Root Input Output + Virtualization support for QLE83XX Converged Ethernet devices. + This allows for virtual function acceleration in virtualized + environments. + +config QLCNIC_DCB + bool "QLOGIC QLCNIC 82XX and 83XX family DCB Support" + depends on QLCNIC && DCB + default y + ---help--- + This configuration parameter enables DCB support in QLE83XX + and QLE82XX Converged Ethernet devices. This allows for DCB + get operations support through rtNetlink interface. Only CEE + mode of DCB is supported. PG and PFC values are related only + to Tx. + +config QLCNIC_HWMON + bool "QLOGIC QLCNIC 82XX and 83XX family HWMON support" + depends on QLCNIC && HWMON && !(QLCNIC=y && HWMON=m) + default y + ---help--- + This configuration parameter can be used to read the + board temperature in Converged Ethernet devices + supported by qlcnic. + + This data is available via the hwmon sysfs interface. + +config QLGE + tristate "QLogic QLGE 10Gb Ethernet Driver Support" + depends on PCI + ---help--- + This driver supports QLogic ISP8XXX 10Gb Ethernet cards. + + To compile this driver as a module, choose M here: the module + will be called qlge. + +config NETXEN_NIC + tristate "NetXen Multi port (1/10) Gigabit Ethernet NIC" + depends on PCI + select FW_LOADER + ---help--- + This enables the support for NetXen's Gigabit Ethernet card. + +config QED + tristate "QLogic QED 25/40/100Gb core driver" + depends on PCI + select ZLIB_INFLATE + select CRC8 + ---help--- + This enables the support for ... + +config QED_LL2 + bool + +config QED_SRIOV + bool "QLogic QED 25/40/100Gb SR-IOV support" + depends on QED && PCI_IOV + default y + ---help--- + This configuration parameter enables Single Root Input Output + Virtualization support for QED devices. + This allows for virtual function acceleration in virtualized + environments. + +config QEDE + tristate "QLogic QED 25/40/100Gb Ethernet NIC" + depends on QED + imply PTP_1588_CLOCK + ---help--- + This enables the support for ... + +config QED_RDMA + bool + +config QED_ISCSI + bool + +config QED_FCOE + bool + +config QED_OOO + bool + +endif # NET_VENDOR_QLOGIC diff --git a/drivers/net/ethernet/qlogic/Makefile b/drivers/net/ethernet/qlogic/Makefile new file mode 100644 index 0000000..6cd2e33 --- /dev/null +++ b/drivers/net/ethernet/qlogic/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the QLogic network device drivers. +# + +obj-$(CONFIG_QLA3XXX) += qla3xxx.o +obj-$(CONFIG_QLCNIC) += qlcnic/ +obj-$(CONFIG_QLGE) += qlge/ +obj-$(CONFIG_NETXEN_NIC) += netxen/ +obj-$(CONFIG_QED) += qed/ +obj-$(CONFIG_QEDE)+= qede/ diff --git a/drivers/net/ethernet/qlogic/netxen/Makefile b/drivers/net/ethernet/qlogic/netxen/Makefile new file mode 100644 index 0000000..e14e60c --- /dev/null +++ b/drivers/net/ethernet/qlogic/netxen/Makefile @@ -0,0 +1,27 @@ +# Copyright (C) 2003 - 2009 NetXen, Inc. +# Copyright (C) 2009 - QLogic Corporation. +# All rights reserved. 
+# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# The full GNU General Public License is included in this distribution +# in the file called "COPYING". +# +# + + +obj-$(CONFIG_NETXEN_NIC) := netxen_nic.o + +netxen_nic-y := netxen_nic_hw.o netxen_nic_main.o netxen_nic_init.o \ + netxen_nic_ethtool.o netxen_nic_ctx.o diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic.h b/drivers/net/ethernet/qlogic/netxen/netxen_nic.h new file mode 100644 index 0000000..0a5e204 --- /dev/null +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic.h @@ -0,0 +1,1889 @@ +/* + * Copyright (C) 2003 - 2009 NetXen, Inc. + * Copyright (C) 2009 - QLogic Corporation. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * The full GNU General Public License is included in this distribution + * in the file called "COPYING". 
+ * + */ + +#ifndef _NETXEN_NIC_H_ +#define _NETXEN_NIC_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include "netxen_nic_hdr.h" +#include "netxen_nic_hw.h" + +#define _NETXEN_NIC_LINUX_MAJOR 4 +#define _NETXEN_NIC_LINUX_MINOR 0 +#define _NETXEN_NIC_LINUX_SUBVERSION 82 +#define NETXEN_NIC_LINUX_VERSIONID "4.0.82" + +#define NETXEN_VERSION_CODE(a, b, c) (((a) << 24) + ((b) << 16) + (c)) +#define _major(v) (((v) >> 24) & 0xff) +#define _minor(v) (((v) >> 16) & 0xff) +#define _build(v) ((v) & 0xffff) + +/* version in image has weird encoding: + * 7:0 - major + * 15:8 - minor + * 31:16 - build (little endian) + */ +#define NETXEN_DECODE_VERSION(v) \ + NETXEN_VERSION_CODE(((v) & 0xff), (((v) >> 8) & 0xff), ((v) >> 16)) + +#define NETXEN_NUM_FLASH_SECTORS (64) +#define NETXEN_FLASH_SECTOR_SIZE (64 * 1024) +#define NETXEN_FLASH_TOTAL_SIZE (NETXEN_NUM_FLASH_SECTORS \ + * NETXEN_FLASH_SECTOR_SIZE) + +#define RCV_DESC_RINGSIZE(rds_ring) \ + (sizeof(struct rcv_desc) * (rds_ring)->num_desc) +#define RCV_BUFF_RINGSIZE(rds_ring) \ + (sizeof(struct netxen_rx_buffer) * rds_ring->num_desc) +#define STATUS_DESC_RINGSIZE(sds_ring) \ + (sizeof(struct status_desc) * (sds_ring)->num_desc) +#define TX_BUFF_RINGSIZE(tx_ring) \ + (sizeof(struct netxen_cmd_buffer) * tx_ring->num_desc) +#define TX_DESC_RINGSIZE(tx_ring) \ + (sizeof(struct cmd_desc_type0) * tx_ring->num_desc) + +#define find_diff_among(a,b,range) ((a)<(b)?((b)-(a)):((b)+(range)-(a))) + +#define NETXEN_RCV_PRODUCER_OFFSET 0 +#define NETXEN_RCV_PEG_DB_ID 2 +#define NETXEN_HOST_DUMMY_DMA_SIZE 1024 +#define FLASH_SUCCESS 0 + +#define ADDR_IN_WINDOW1(off) \ + ((off > NETXEN_CRB_PCIX_HOST2) && (off < NETXEN_CRB_MAX)) ? 
1 : 0 + +#define ADDR_IN_RANGE(addr, low, high) \ + (((addr) < (high)) && ((addr) >= (low))) + +/* + * normalize a 64MB crb address to 32MB PCI window + * To use NETXEN_CRB_NORMALIZE, window _must_ be set to 1 + */ +#define NETXEN_CRB_NORMAL(reg) \ + ((reg) - NETXEN_CRB_PCIX_HOST2 + NETXEN_CRB_PCIX_HOST) + +#define NETXEN_CRB_NORMALIZE(adapter, reg) \ + pci_base_offset(adapter, NETXEN_CRB_NORMAL(reg)) + +#define DB_NORMALIZE(adapter, off) \ + (adapter->ahw.db_base + (off)) + +#define NX_P2_C0 0x24 +#define NX_P2_C1 0x25 +#define NX_P3_A0 0x30 +#define NX_P3_A2 0x30 +#define NX_P3_B0 0x40 +#define NX_P3_B1 0x41 +#define NX_P3_B2 0x42 +#define NX_P3P_A0 0x50 + +#define NX_IS_REVISION_P2(REVISION) (REVISION <= NX_P2_C1) +#define NX_IS_REVISION_P3(REVISION) (REVISION >= NX_P3_A0) +#define NX_IS_REVISION_P3P(REVISION) (REVISION >= NX_P3P_A0) + +#define FIRST_PAGE_GROUP_START 0 +#define FIRST_PAGE_GROUP_END 0x100000 + +#define SECOND_PAGE_GROUP_START 0x6000000 +#define SECOND_PAGE_GROUP_END 0x68BC000 + +#define THIRD_PAGE_GROUP_START 0x70E4000 +#define THIRD_PAGE_GROUP_END 0x8000000 + +#define FIRST_PAGE_GROUP_SIZE FIRST_PAGE_GROUP_END - FIRST_PAGE_GROUP_START +#define SECOND_PAGE_GROUP_SIZE SECOND_PAGE_GROUP_END - SECOND_PAGE_GROUP_START +#define THIRD_PAGE_GROUP_SIZE THIRD_PAGE_GROUP_END - THIRD_PAGE_GROUP_START + +#define P2_MAX_MTU (8000) +#define P3_MAX_MTU (9600) +#define NX_ETHERMTU 1500 +#define NX_MAX_ETHERHDR 32 /* This contains some padding */ + +#define NX_P2_RX_BUF_MAX_LEN 1760 +#define NX_P3_RX_BUF_MAX_LEN (NX_MAX_ETHERHDR + NX_ETHERMTU) +#define NX_P2_RX_JUMBO_BUF_MAX_LEN (NX_MAX_ETHERHDR + P2_MAX_MTU) +#define NX_P3_RX_JUMBO_BUF_MAX_LEN (NX_MAX_ETHERHDR + P3_MAX_MTU) +#define NX_CT_DEFAULT_RX_BUF_LEN 2048 +#define NX_LRO_BUFFER_EXTRA 2048 + +#define NX_RX_LRO_BUFFER_LENGTH (8060) + +/* + * Maximum number of ring contexts + */ +#define MAX_RING_CTX 1 + +/* Opcodes to be used with the commands */ +#define TX_ETHER_PKT 0x01 +#define TX_TCP_PKT 0x02 +#define TX_UDP_PKT 0x03 +#define TX_IP_PKT 0x04 +#define TX_TCP_LSO 0x05 +#define TX_TCP_LSO6 0x06 +#define TX_IPSEC 0x07 +#define TX_IPSEC_CMD 0x0a +#define TX_TCPV6_PKT 0x0b +#define TX_UDPV6_PKT 0x0c + +/* The following opcodes are for internal consumption. */ +#define NETXEN_CONTROL_OP 0x10 +#define PEGNET_REQUEST 0x11 + +#define MAX_NUM_CARDS 4 + +#define NETXEN_MAX_FRAGS_PER_TX 14 +#define MAX_TSO_HEADER_DESC 2 +#define MGMT_CMD_DESC_RESV 4 +#define TX_STOP_THRESH ((MAX_SKB_FRAGS >> 2) + MAX_TSO_HEADER_DESC \ + + MGMT_CMD_DESC_RESV) +#define NX_MAX_TX_TIMEOUTS 2 + +/* + * Following are the states of the Phantom. Phantom will set them and + * Host will read to check if the fields are correct. 
+ */ +#define PHAN_INITIALIZE_START 0xff00 +#define PHAN_INITIALIZE_FAILED 0xffff +#define PHAN_INITIALIZE_COMPLETE 0xff01 + +/* Host writes the following to notify that it has done the init-handshake */ +#define PHAN_INITIALIZE_ACK 0xf00f + +#define NUM_RCV_DESC_RINGS 3 +#define NUM_STS_DESC_RINGS 4 + +#define RCV_RING_NORMAL 0 +#define RCV_RING_JUMBO 1 +#define RCV_RING_LRO 2 + +#define MIN_CMD_DESCRIPTORS 64 +#define MIN_RCV_DESCRIPTORS 64 +#define MIN_JUMBO_DESCRIPTORS 32 + +#define MAX_CMD_DESCRIPTORS 1024 +#define MAX_RCV_DESCRIPTORS_1G 4096 +#define MAX_RCV_DESCRIPTORS_10G 8192 +#define MAX_JUMBO_RCV_DESCRIPTORS_1G 512 +#define MAX_JUMBO_RCV_DESCRIPTORS_10G 1024 +#define MAX_LRO_RCV_DESCRIPTORS 8 + +#define DEFAULT_RCV_DESCRIPTORS_1G 2048 +#define DEFAULT_RCV_DESCRIPTORS_10G 4096 + +#define NETXEN_CTX_SIGNATURE 0xdee0 +#define NETXEN_CTX_SIGNATURE_V2 0x0002dee0 +#define NETXEN_CTX_RESET 0xbad0 +#define NETXEN_CTX_D3_RESET 0xacc0 +#define NETXEN_RCV_PRODUCER(ringid) (ringid) + +#define PHAN_PEG_RCV_INITIALIZED 0xff01 +#define PHAN_PEG_RCV_START_INITIALIZE 0xff00 + +#define get_next_index(index, length) \ + (((index) + 1) & ((length) - 1)) + +#define get_index_range(index,length,count) \ + (((index) + (count)) & ((length) - 1)) + +#define MPORT_SINGLE_FUNCTION_MODE 0x1111 +#define MPORT_MULTI_FUNCTION_MODE 0x2222 + +#define NX_MAX_PCI_FUNC 8 + +/* + * NetXen host-peg signal message structure + * + * Bit 0-1 : peg_id => 0x2 for tx and 01 for rx + * Bit 2 : priv_id => must be 1 + * Bit 3-17 : count => for doorbell + * Bit 18-27 : ctx_id => Context id + * Bit 28-31 : opcode + */ + +typedef u32 netxen_ctx_msg; + +#define netxen_set_msg_peg_id(config_word, val) \ + ((config_word) &= ~3, (config_word) |= val & 3) +#define netxen_set_msg_privid(config_word) \ + ((config_word) |= 1 << 2) +#define netxen_set_msg_count(config_word, val) \ + ((config_word) &= ~(0x7fff<<3), (config_word) |= (val & 0x7fff) << 3) +#define netxen_set_msg_ctxid(config_word, val) \ + ((config_word) &= ~(0x3ff<<18), (config_word) |= (val & 0x3ff) << 18) +#define netxen_set_msg_opcode(config_word, val) \ + ((config_word) &= ~(0xf<<28), (config_word) |= (val & 0xf) << 28) + +struct netxen_rcv_ring { + __le64 addr; + __le32 size; + __le32 rsrvd; +}; + +struct netxen_sts_ring { + __le64 addr; + __le32 size; + __le16 msi_index; + __le16 rsvd; +} ; + +struct netxen_ring_ctx { + + /* one command ring */ + __le64 cmd_consumer_offset; + __le64 cmd_ring_addr; + __le32 cmd_ring_size; + __le32 rsrvd; + + /* three receive rings */ + struct netxen_rcv_ring rcv_rings[NUM_RCV_DESC_RINGS]; + + __le64 sts_ring_addr; + __le32 sts_ring_size; + + __le32 ctx_id; + + __le64 rsrvd_2[3]; + __le32 sts_ring_count; + __le32 rsrvd_3; + struct netxen_sts_ring sts_rings[NUM_STS_DESC_RINGS]; + +} __attribute__ ((aligned(64))); + +/* + * Following data structures describe the descriptors that will be used. + * Added fileds of tcpHdrSize and ipHdrSize, The driver needs to do it only when + * we are doing LSO (above the 1500 size packet) only. 
+ */ + +/* + * The size of reference handle been changed to 16 bits to pass the MSS fields + * for the LSO packet + */ + +#define FLAGS_CHECKSUM_ENABLED 0x01 +#define FLAGS_LSO_ENABLED 0x02 +#define FLAGS_IPSEC_SA_ADD 0x04 +#define FLAGS_IPSEC_SA_DELETE 0x08 +#define FLAGS_VLAN_TAGGED 0x10 +#define FLAGS_VLAN_OOB 0x40 + +#define netxen_set_tx_vlan_tci(cmd_desc, v) \ + (cmd_desc)->vlan_TCI = cpu_to_le16(v); + +#define netxen_set_cmd_desc_port(cmd_desc, var) \ + ((cmd_desc)->port_ctxid |= ((var) & 0x0F)) +#define netxen_set_cmd_desc_ctxid(cmd_desc, var) \ + ((cmd_desc)->port_ctxid |= ((var) << 4 & 0xF0)) + +#define netxen_set_tx_port(_desc, _port) \ + (_desc)->port_ctxid = ((_port) & 0xf) | (((_port) << 4) & 0xf0) + +#define netxen_set_tx_flags_opcode(_desc, _flags, _opcode) \ + (_desc)->flags_opcode = \ + cpu_to_le16(((_flags) & 0x7f) | (((_opcode) & 0x3f) << 7)) + +#define netxen_set_tx_frags_len(_desc, _frags, _len) \ + (_desc)->nfrags__length = \ + cpu_to_le32(((_frags) & 0xff) | (((_len) & 0xffffff) << 8)) + +struct cmd_desc_type0 { + u8 tcp_hdr_offset; /* For LSO only */ + u8 ip_hdr_offset; /* For LSO only */ + __le16 flags_opcode; /* 15:13 unused, 12:7 opcode, 6:0 flags */ + __le32 nfrags__length; /* 31:8 total len, 7:0 frag count */ + + __le64 addr_buffer2; + + __le16 reference_handle; + __le16 mss; + u8 port_ctxid; /* 7:4 ctxid 3:0 port */ + u8 total_hdr_length; /* LSO only : MAC+IP+TCP Hdr size */ + __le16 conn_id; /* IPSec offoad only */ + + __le64 addr_buffer3; + __le64 addr_buffer1; + + __le16 buffer_length[4]; + + __le64 addr_buffer4; + + __le32 reserved2; + __le16 reserved; + __le16 vlan_TCI; + +} __attribute__ ((aligned(64))); + +/* Note: sizeof(rcv_desc) should always be a multiple of 2 */ +struct rcv_desc { + __le16 reference_handle; + __le16 reserved; + __le32 buffer_length; /* allocated buffer length (usually 2K) */ + __le64 addr_buffer; +}; + +/* opcode field in status_desc */ +#define NETXEN_NIC_SYN_OFFLOAD 0x03 +#define NETXEN_NIC_RXPKT_DESC 0x04 +#define NETXEN_OLD_RXPKT_DESC 0x3f +#define NETXEN_NIC_RESPONSE_DESC 0x05 +#define NETXEN_NIC_LRO_DESC 0x12 + +/* for status field in status_desc */ +#define STATUS_NEED_CKSUM (1) +#define STATUS_CKSUM_OK (2) + +/* owner bits of status_desc */ +#define STATUS_OWNER_HOST (0x1ULL << 56) +#define STATUS_OWNER_PHANTOM (0x2ULL << 56) + +/* Status descriptor: + 0-3 port, 4-7 status, 8-11 type, 12-27 total_length + 28-43 reference_handle, 44-47 protocol, 48-52 pkt_offset + 53-55 desc_cnt, 56-57 owner, 58-63 opcode + */ +#define netxen_get_sts_port(sts_data) \ + ((sts_data) & 0x0F) +#define netxen_get_sts_status(sts_data) \ + (((sts_data) >> 4) & 0x0F) +#define netxen_get_sts_type(sts_data) \ + (((sts_data) >> 8) & 0x0F) +#define netxen_get_sts_totallength(sts_data) \ + (((sts_data) >> 12) & 0xFFFF) +#define netxen_get_sts_refhandle(sts_data) \ + (((sts_data) >> 28) & 0xFFFF) +#define netxen_get_sts_prot(sts_data) \ + (((sts_data) >> 44) & 0x0F) +#define netxen_get_sts_pkt_offset(sts_data) \ + (((sts_data) >> 48) & 0x1F) +#define netxen_get_sts_desc_cnt(sts_data) \ + (((sts_data) >> 53) & 0x7) +#define netxen_get_sts_opcode(sts_data) \ + (((sts_data) >> 58) & 0x03F) + +#define netxen_get_lro_sts_refhandle(sts_data) \ + ((sts_data) & 0x0FFFF) +#define netxen_get_lro_sts_length(sts_data) \ + (((sts_data) >> 16) & 0x0FFFF) +#define netxen_get_lro_sts_l2_hdr_offset(sts_data) \ + (((sts_data) >> 32) & 0x0FF) +#define netxen_get_lro_sts_l4_hdr_offset(sts_data) \ + (((sts_data) >> 40) & 0x0FF) +#define netxen_get_lro_sts_timestamp(sts_data) 
\ + (((sts_data) >> 48) & 0x1) +#define netxen_get_lro_sts_type(sts_data) \ + (((sts_data) >> 49) & 0x7) +#define netxen_get_lro_sts_push_flag(sts_data) \ + (((sts_data) >> 52) & 0x1) +#define netxen_get_lro_sts_seq_number(sts_data) \ + ((sts_data) & 0x0FFFFFFFF) +#define netxen_get_lro_sts_mss(sts_data1) \ + ((sts_data1 >> 32) & 0x0FFFF) + + +struct status_desc { + __le64 status_desc_data[2]; +} __attribute__ ((aligned(16))); + +/* UNIFIED ROMIMAGE *************************/ +#define NX_UNI_DIR_SECT_PRODUCT_TBL 0x0 +#define NX_UNI_DIR_SECT_BOOTLD 0x6 +#define NX_UNI_DIR_SECT_FW 0x7 + +/*Offsets */ +#define NX_UNI_CHIP_REV_OFF 10 +#define NX_UNI_FLAGS_OFF 11 +#define NX_UNI_BIOS_VERSION_OFF 12 +#define NX_UNI_BOOTLD_IDX_OFF 27 +#define NX_UNI_FIRMWARE_IDX_OFF 29 + +struct uni_table_desc{ + uint32_t findex; + uint32_t num_entries; + uint32_t entry_size; + uint32_t reserved[5]; +}; + +struct uni_data_desc{ + uint32_t findex; + uint32_t size; + uint32_t reserved[5]; +}; + +/* UNIFIED ROMIMAGE *************************/ + +/* The version of the main data structure */ +#define NETXEN_BDINFO_VERSION 1 + +/* Magic number to let user know flash is programmed */ +#define NETXEN_BDINFO_MAGIC 0x12345678 + +/* Max number of Gig ports on a Phantom board */ +#define NETXEN_MAX_PORTS 4 + +#define NETXEN_BRDTYPE_P1_BD 0x0000 +#define NETXEN_BRDTYPE_P1_SB 0x0001 +#define NETXEN_BRDTYPE_P1_SMAX 0x0002 +#define NETXEN_BRDTYPE_P1_SOCK 0x0003 + +#define NETXEN_BRDTYPE_P2_SOCK_31 0x0008 +#define NETXEN_BRDTYPE_P2_SOCK_35 0x0009 +#define NETXEN_BRDTYPE_P2_SB35_4G 0x000a +#define NETXEN_BRDTYPE_P2_SB31_10G 0x000b +#define NETXEN_BRDTYPE_P2_SB31_2G 0x000c + +#define NETXEN_BRDTYPE_P2_SB31_10G_IMEZ 0x000d +#define NETXEN_BRDTYPE_P2_SB31_10G_HMEZ 0x000e +#define NETXEN_BRDTYPE_P2_SB31_10G_CX4 0x000f + +#define NETXEN_BRDTYPE_P3_REF_QG 0x0021 +#define NETXEN_BRDTYPE_P3_HMEZ 0x0022 +#define NETXEN_BRDTYPE_P3_10G_CX4_LP 0x0023 +#define NETXEN_BRDTYPE_P3_4_GB 0x0024 +#define NETXEN_BRDTYPE_P3_IMEZ 0x0025 +#define NETXEN_BRDTYPE_P3_10G_SFP_PLUS 0x0026 +#define NETXEN_BRDTYPE_P3_10000_BASE_T 0x0027 +#define NETXEN_BRDTYPE_P3_XG_LOM 0x0028 +#define NETXEN_BRDTYPE_P3_4_GB_MM 0x0029 +#define NETXEN_BRDTYPE_P3_10G_SFP_CT 0x002a +#define NETXEN_BRDTYPE_P3_10G_SFP_QT 0x002b +#define NETXEN_BRDTYPE_P3_10G_CX4 0x0031 +#define NETXEN_BRDTYPE_P3_10G_XFP 0x0032 +#define NETXEN_BRDTYPE_P3_10G_TP 0x0080 + +/* Flash memory map */ +#define NETXEN_CRBINIT_START 0 /* crbinit section */ +#define NETXEN_BRDCFG_START 0x4000 /* board config */ +#define NETXEN_INITCODE_START 0x6000 /* pegtune code */ +#define NETXEN_BOOTLD_START 0x10000 /* bootld */ +#define NETXEN_IMAGE_START 0x43000 /* compressed image */ +#define NETXEN_SECONDARY_START 0x200000 /* backup images */ +#define NETXEN_PXE_START 0x3E0000 /* PXE boot rom */ +#define NETXEN_USER_START 0x3E8000 /* Firmware info */ +#define NETXEN_FIXED_START 0x3F0000 /* backup of crbinit */ +#define NETXEN_USER_START_OLD NETXEN_PXE_START /* very old flash */ + +#define NX_OLD_MAC_ADDR_OFFSET (NETXEN_USER_START) +#define NX_FW_VERSION_OFFSET (NETXEN_USER_START+0x408) +#define NX_FW_SIZE_OFFSET (NETXEN_USER_START+0x40c) +#define NX_FW_MAC_ADDR_OFFSET (NETXEN_USER_START+0x418) +#define NX_FW_SERIAL_NUM_OFFSET (NETXEN_USER_START+0x81c) +#define NX_BIOS_VERSION_OFFSET (NETXEN_USER_START+0x83c) + +#define NX_HDR_VERSION_OFFSET (NETXEN_BRDCFG_START) +#define NX_BRDTYPE_OFFSET (NETXEN_BRDCFG_START+0x8) +#define NX_FW_MAGIC_OFFSET (NETXEN_BRDCFG_START+0x128) + +#define NX_FW_MIN_SIZE (0x3fffff) +#define 
NX_P2_MN_ROMIMAGE 0 +#define NX_P3_CT_ROMIMAGE 1 +#define NX_P3_MN_ROMIMAGE 2 +#define NX_UNIFIED_ROMIMAGE 3 +#define NX_FLASH_ROMIMAGE 4 +#define NX_UNKNOWN_ROMIMAGE 0xff + +#define NX_P2_MN_ROMIMAGE_NAME "nxromimg.bin" +#define NX_P3_CT_ROMIMAGE_NAME "nx3fwct.bin" +#define NX_P3_MN_ROMIMAGE_NAME "nx3fwmn.bin" +#define NX_UNIFIED_ROMIMAGE_NAME "phanfw.bin" +#define NX_FLASH_ROMIMAGE_NAME "flash" + +extern char netxen_nic_driver_name[]; + +/* Number of status descriptors to handle per interrupt */ +#define MAX_STATUS_HANDLE (64) + +/* + * netxen_skb_frag{} is to contain mapping info for each SG list. This + * has to be freed when DMA is complete. This is part of netxen_tx_buffer{}. + */ +struct netxen_skb_frag { + u64 dma; + u64 length; +}; + +struct netxen_recv_crb { + u32 crb_rcv_producer[NUM_RCV_DESC_RINGS]; + u32 crb_sts_consumer[NUM_STS_DESC_RINGS]; + u32 sw_int_mask[NUM_STS_DESC_RINGS]; +}; + +/* Following defines are for the state of the buffers */ +#define NETXEN_BUFFER_FREE 0 +#define NETXEN_BUFFER_BUSY 1 + +/* + * There will be one netxen_buffer per skb packet. These will be + * used to save the dma info for pci_unmap_page() + */ +struct netxen_cmd_buffer { + struct sk_buff *skb; + struct netxen_skb_frag frag_array[MAX_SKB_FRAGS + 1]; + u32 frag_count; +}; + +/* In rx_buffer, we do not need multiple fragments as is a single buffer */ +struct netxen_rx_buffer { + struct list_head list; + struct sk_buff *skb; + u64 dma; + u16 ref_handle; + u16 state; +}; + +/* Board types */ +#define NETXEN_NIC_GBE 0x01 +#define NETXEN_NIC_XGBE 0x02 + +/* + * One hardware_context{} per adapter + * contains interrupt info as well shared hardware info. + */ +struct netxen_hardware_context { + void __iomem *pci_base0; + void __iomem *pci_base1; + void __iomem *pci_base2; + void __iomem *db_base; + void __iomem *ocm_win_crb; + + unsigned long db_len; + unsigned long pci_len0; + + u32 ocm_win; + u32 crb_win; + + rwlock_t crb_lock; + spinlock_t mem_lock; + + u8 cut_through; + u8 revision_id; + u8 pci_func; + u8 linkup; + u16 port_type; + u16 board_type; +}; + +#define MINIMUM_ETHERNET_FRAME_SIZE 64 /* With FCS */ +#define ETHERNET_FCS_SIZE 4 + +struct netxen_adapter_stats { + u64 xmitcalled; + u64 xmitfinished; + u64 rxdropped; + u64 txdropped; + u64 csummed; + u64 rx_pkts; + u64 lro_pkts; + u64 rxbytes; + u64 txbytes; +}; + +/* + * Rcv Descriptor Context. One such per Rcv Descriptor. There may + * be one Rcv Descriptor for normal packets, one for jumbo and may be others. + */ +struct nx_host_rds_ring { + u32 producer; + u32 num_desc; + u32 dma_size; + u32 skb_size; + u32 flags; + void __iomem *crb_rcv_producer; + struct rcv_desc *desc_head; + struct netxen_rx_buffer *rx_buf_arr; + struct list_head free_list; + spinlock_t lock; + dma_addr_t phys_addr; +}; + +struct nx_host_sds_ring { + u32 consumer; + u32 num_desc; + void __iomem *crb_sts_consumer; + void __iomem *crb_intr_mask; + + struct status_desc *desc_head; + struct netxen_adapter *adapter; + struct napi_struct napi; + struct list_head free_list[NUM_RCV_DESC_RINGS]; + + int irq; + + dma_addr_t phys_addr; + char name[IFNAMSIZ+4]; +}; + +struct nx_host_tx_ring { + u32 producer; + __le32 *hw_consumer; + u32 sw_consumer; + void __iomem *crb_cmd_producer; + void __iomem *crb_cmd_consumer; + u32 num_desc; + + struct netdev_queue *txq; + + struct netxen_cmd_buffer *cmd_buf_arr; + struct cmd_desc_type0 *desc_head; + dma_addr_t phys_addr; +}; + +/* + * Receive context. There is one such structure per instance of the + * receive processing. 
Any state information that is relevant to + * the receive, and is must be in this structure. The global data may be + * present elsewhere. + */ +struct netxen_recv_context { + u32 state; + u16 context_id; + u16 virt_port; + + struct nx_host_rds_ring *rds_rings; + struct nx_host_sds_ring *sds_rings; + + struct netxen_ring_ctx *hwctx; + dma_addr_t phys_addr; +}; + +struct _cdrp_cmd { + u32 cmd; + u32 arg1; + u32 arg2; + u32 arg3; +}; + +struct netxen_cmd_args { + struct _cdrp_cmd req; + struct _cdrp_cmd rsp; +}; + +/* New HW context creation */ + +#define NX_OS_CRB_RETRY_COUNT 4000 +#define NX_CDRP_SIGNATURE_MAKE(pcifn, version) \ + (((pcifn) & 0xff) | (((version) & 0xff) << 8) | (0xcafe << 16)) + +#define NX_CDRP_CLEAR 0x00000000 +#define NX_CDRP_CMD_BIT 0x80000000 + +/* + * All responses must have the NX_CDRP_CMD_BIT cleared + * in the crb NX_CDRP_CRB_OFFSET. + */ +#define NX_CDRP_FORM_RSP(rsp) (rsp) +#define NX_CDRP_IS_RSP(rsp) (((rsp) & NX_CDRP_CMD_BIT) == 0) + +#define NX_CDRP_RSP_OK 0x00000001 +#define NX_CDRP_RSP_FAIL 0x00000002 +#define NX_CDRP_RSP_TIMEOUT 0x00000003 + +/* + * All commands must have the NX_CDRP_CMD_BIT set in + * the crb NX_CDRP_CRB_OFFSET. + */ +#define NX_CDRP_FORM_CMD(cmd) (NX_CDRP_CMD_BIT | (cmd)) +#define NX_CDRP_IS_CMD(cmd) (((cmd) & NX_CDRP_CMD_BIT) != 0) + +#define NX_CDRP_CMD_SUBMIT_CAPABILITIES 0x00000001 +#define NX_CDRP_CMD_READ_MAX_RDS_PER_CTX 0x00000002 +#define NX_CDRP_CMD_READ_MAX_SDS_PER_CTX 0x00000003 +#define NX_CDRP_CMD_READ_MAX_RULES_PER_CTX 0x00000004 +#define NX_CDRP_CMD_READ_MAX_RX_CTX 0x00000005 +#define NX_CDRP_CMD_READ_MAX_TX_CTX 0x00000006 +#define NX_CDRP_CMD_CREATE_RX_CTX 0x00000007 +#define NX_CDRP_CMD_DESTROY_RX_CTX 0x00000008 +#define NX_CDRP_CMD_CREATE_TX_CTX 0x00000009 +#define NX_CDRP_CMD_DESTROY_TX_CTX 0x0000000a +#define NX_CDRP_CMD_SETUP_STATISTICS 0x0000000e +#define NX_CDRP_CMD_GET_STATISTICS 0x0000000f +#define NX_CDRP_CMD_DELETE_STATISTICS 0x00000010 +#define NX_CDRP_CMD_SET_MTU 0x00000012 +#define NX_CDRP_CMD_READ_PHY 0x00000013 +#define NX_CDRP_CMD_WRITE_PHY 0x00000014 +#define NX_CDRP_CMD_READ_HW_REG 0x00000015 +#define NX_CDRP_CMD_GET_FLOW_CTL 0x00000016 +#define NX_CDRP_CMD_SET_FLOW_CTL 0x00000017 +#define NX_CDRP_CMD_READ_MAX_MTU 0x00000018 +#define NX_CDRP_CMD_READ_MAX_LRO 0x00000019 +#define NX_CDRP_CMD_CONFIGURE_TOE 0x0000001a +#define NX_CDRP_CMD_FUNC_ATTRIB 0x0000001b +#define NX_CDRP_CMD_READ_PEXQ_PARAMETERS 0x0000001c +#define NX_CDRP_CMD_GET_LIC_CAPABILITIES 0x0000001d +#define NX_CDRP_CMD_READ_MAX_LRO_PER_BOARD 0x0000001e +#define NX_CDRP_CMD_CONFIG_GBE_PORT 0x0000001f +#define NX_CDRP_CMD_MAX 0x00000020 + +#define NX_RCODE_SUCCESS 0 +#define NX_RCODE_NO_HOST_MEM 1 +#define NX_RCODE_NO_HOST_RESOURCE 2 +#define NX_RCODE_NO_CARD_CRB 3 +#define NX_RCODE_NO_CARD_MEM 4 +#define NX_RCODE_NO_CARD_RESOURCE 5 +#define NX_RCODE_INVALID_ARGS 6 +#define NX_RCODE_INVALID_ACTION 7 +#define NX_RCODE_INVALID_STATE 8 +#define NX_RCODE_NOT_SUPPORTED 9 +#define NX_RCODE_NOT_PERMITTED 10 +#define NX_RCODE_NOT_READY 11 +#define NX_RCODE_DOES_NOT_EXIST 12 +#define NX_RCODE_ALREADY_EXISTS 13 +#define NX_RCODE_BAD_SIGNATURE 14 +#define NX_RCODE_CMD_NOT_IMPL 15 +#define NX_RCODE_CMD_INVALID 16 +#define NX_RCODE_TIMEOUT 17 +#define NX_RCODE_CMD_FAILED 18 +#define NX_RCODE_MAX_EXCEEDED 19 +#define NX_RCODE_MAX 20 + +#define NX_DESTROY_CTX_RESET 0 +#define NX_DESTROY_CTX_D3_RESET 1 +#define NX_DESTROY_CTX_MAX 2 + +/* + * Capabilities + */ +#define NX_CAP_BIT(class, bit) (1 << bit) +#define NX_CAP0_LEGACY_CONTEXT NX_CAP_BIT(0, 0) +#define 
NX_CAP0_MULTI_CONTEXT NX_CAP_BIT(0, 1) +#define NX_CAP0_LEGACY_MN NX_CAP_BIT(0, 2) +#define NX_CAP0_LEGACY_MS NX_CAP_BIT(0, 3) +#define NX_CAP0_CUT_THROUGH NX_CAP_BIT(0, 4) +#define NX_CAP0_LRO NX_CAP_BIT(0, 5) +#define NX_CAP0_LSO NX_CAP_BIT(0, 6) +#define NX_CAP0_JUMBO_CONTIGUOUS NX_CAP_BIT(0, 7) +#define NX_CAP0_LRO_CONTIGUOUS NX_CAP_BIT(0, 8) +#define NX_CAP0_HW_LRO NX_CAP_BIT(0, 10) +#define NX_CAP0_HW_LRO_MSS NX_CAP_BIT(0, 21) + +/* + * Context state + */ +#define NX_HOST_CTX_STATE_FREED 0 +#define NX_HOST_CTX_STATE_ALLOCATED 1 +#define NX_HOST_CTX_STATE_ACTIVE 2 +#define NX_HOST_CTX_STATE_DISABLED 3 +#define NX_HOST_CTX_STATE_QUIESCED 4 +#define NX_HOST_CTX_STATE_MAX 5 + +/* + * Rx context + */ + +typedef struct { + __le64 host_phys_addr; /* Ring base addr */ + __le32 ring_size; /* Ring entries */ + __le16 msi_index; + __le16 rsvd; /* Padding */ +} nx_hostrq_sds_ring_t; + +typedef struct { + __le64 host_phys_addr; /* Ring base addr */ + __le64 buff_size; /* Packet buffer size */ + __le32 ring_size; /* Ring entries */ + __le32 ring_kind; /* Class of ring */ +} nx_hostrq_rds_ring_t; + +typedef struct { + __le64 host_rsp_dma_addr; /* Response dma'd here */ + __le32 capabilities[4]; /* Flag bit vector */ + __le32 host_int_crb_mode; /* Interrupt crb usage */ + __le32 host_rds_crb_mode; /* RDS crb usage */ + /* These ring offsets are relative to data[0] below */ + __le32 rds_ring_offset; /* Offset to RDS config */ + __le32 sds_ring_offset; /* Offset to SDS config */ + __le16 num_rds_rings; /* Count of RDS rings */ + __le16 num_sds_rings; /* Count of SDS rings */ + __le16 rsvd1; /* Padding */ + __le16 rsvd2; /* Padding */ + u8 reserved[128]; /* reserve space for future expansion*/ + /* MUST BE 64-bit aligned. + The following is packed: + - N hostrq_rds_rings + - N hostrq_sds_rings */ + char data[0]; +} nx_hostrq_rx_ctx_t; + +typedef struct { + __le32 host_producer_crb; /* Crb to use */ + __le32 rsvd1; /* Padding */ +} nx_cardrsp_rds_ring_t; + +typedef struct { + __le32 host_consumer_crb; /* Crb to use */ + __le32 interrupt_crb; /* Crb to use */ +} nx_cardrsp_sds_ring_t; + +typedef struct { + /* These ring offsets are relative to data[0] below */ + __le32 rds_ring_offset; /* Offset to RDS config */ + __le32 sds_ring_offset; /* Offset to SDS config */ + __le32 host_ctx_state; /* Starting State */ + __le32 num_fn_per_port; /* How many PCI fn share the port */ + __le16 num_rds_rings; /* Count of RDS rings */ + __le16 num_sds_rings; /* Count of SDS rings */ + __le16 context_id; /* Handle for context */ + u8 phys_port; /* Physical id of port */ + u8 virt_port; /* Virtual/Logical id of port */ + u8 reserved[128]; /* save space for future expansion */ + /* MUST BE 64-bit aligned. 
+ The following is packed: + - N cardrsp_rds_rings + - N cardrs_sds_rings */ + char data[0]; +} nx_cardrsp_rx_ctx_t; + +#define SIZEOF_HOSTRQ_RX(HOSTRQ_RX, rds_rings, sds_rings) \ + (sizeof(HOSTRQ_RX) + \ + (rds_rings)*(sizeof(nx_hostrq_rds_ring_t)) + \ + (sds_rings)*(sizeof(nx_hostrq_sds_ring_t))) + +#define SIZEOF_CARDRSP_RX(CARDRSP_RX, rds_rings, sds_rings) \ + (sizeof(CARDRSP_RX) + \ + (rds_rings)*(sizeof(nx_cardrsp_rds_ring_t)) + \ + (sds_rings)*(sizeof(nx_cardrsp_sds_ring_t))) + +/* + * Tx context + */ + +typedef struct { + __le64 host_phys_addr; /* Ring base addr */ + __le32 ring_size; /* Ring entries */ + __le32 rsvd; /* Padding */ +} nx_hostrq_cds_ring_t; + +typedef struct { + __le64 host_rsp_dma_addr; /* Response dma'd here */ + __le64 cmd_cons_dma_addr; /* */ + __le64 dummy_dma_addr; /* */ + __le32 capabilities[4]; /* Flag bit vector */ + __le32 host_int_crb_mode; /* Interrupt crb usage */ + __le32 rsvd1; /* Padding */ + __le16 rsvd2; /* Padding */ + __le16 interrupt_ctl; + __le16 msi_index; + __le16 rsvd3; /* Padding */ + nx_hostrq_cds_ring_t cds_ring; /* Desc of cds ring */ + u8 reserved[128]; /* future expansion */ +} nx_hostrq_tx_ctx_t; + +typedef struct { + __le32 host_producer_crb; /* Crb to use */ + __le32 interrupt_crb; /* Crb to use */ +} nx_cardrsp_cds_ring_t; + +typedef struct { + __le32 host_ctx_state; /* Starting state */ + __le16 context_id; /* Handle for context */ + u8 phys_port; /* Physical id of port */ + u8 virt_port; /* Virtual/Logical id of port */ + nx_cardrsp_cds_ring_t cds_ring; /* Card cds settings */ + u8 reserved[128]; /* future expansion */ +} nx_cardrsp_tx_ctx_t; + +#define SIZEOF_HOSTRQ_TX(HOSTRQ_TX) (sizeof(HOSTRQ_TX)) +#define SIZEOF_CARDRSP_TX(CARDRSP_TX) (sizeof(CARDRSP_TX)) + +/* CRB */ + +#define NX_HOST_RDS_CRB_MODE_UNIQUE 0 +#define NX_HOST_RDS_CRB_MODE_SHARED 1 +#define NX_HOST_RDS_CRB_MODE_CUSTOM 2 +#define NX_HOST_RDS_CRB_MODE_MAX 3 + +#define NX_HOST_INT_CRB_MODE_UNIQUE 0 +#define NX_HOST_INT_CRB_MODE_SHARED 1 +#define NX_HOST_INT_CRB_MODE_NORX 2 +#define NX_HOST_INT_CRB_MODE_NOTX 3 +#define NX_HOST_INT_CRB_MODE_NORXTX 4 + + +/* MAC */ + +#define MC_COUNT_P2 16 +#define MC_COUNT_P3 38 + +#define NETXEN_MAC_NOOP 0 +#define NETXEN_MAC_ADD 1 +#define NETXEN_MAC_DEL 2 + +typedef struct nx_mac_list_s { + struct list_head list; + uint8_t mac_addr[ETH_ALEN+2]; +} nx_mac_list_t; + +struct nx_ip_list { + struct list_head list; + __be32 ip_addr; + bool master; +}; + +/* + * Interrupt coalescing defaults. The defaults are for 1500 MTU. It is + * adjusted based on configured MTU. 
+ */ +#define NETXEN_DEFAULT_INTR_COALESCE_RX_TIME_US 3 +#define NETXEN_DEFAULT_INTR_COALESCE_RX_PACKETS 256 +#define NETXEN_DEFAULT_INTR_COALESCE_TX_PACKETS 64 +#define NETXEN_DEFAULT_INTR_COALESCE_TX_TIME_US 4 + +#define NETXEN_NIC_INTR_DEFAULT 0x04 + +typedef union { + struct { + uint16_t rx_packets; + uint16_t rx_time_us; + uint16_t tx_packets; + uint16_t tx_time_us; + } data; + uint64_t word; +} nx_nic_intr_coalesce_data_t; + +typedef struct { + uint16_t stats_time_us; + uint16_t rate_sample_time; + uint16_t flags; + uint16_t rsvd_1; + uint32_t low_threshold; + uint32_t high_threshold; + nx_nic_intr_coalesce_data_t normal; + nx_nic_intr_coalesce_data_t low; + nx_nic_intr_coalesce_data_t high; + nx_nic_intr_coalesce_data_t irq; +} nx_nic_intr_coalesce_t; + +#define NX_HOST_REQUEST 0x13 +#define NX_NIC_REQUEST 0x14 + +#define NX_MAC_EVENT 0x1 + +#define NX_IP_UP 2 +#define NX_IP_DOWN 3 + +/* + * Driver --> Firmware + */ +#define NX_NIC_H2C_OPCODE_START 0 +#define NX_NIC_H2C_OPCODE_CONFIG_RSS 1 +#define NX_NIC_H2C_OPCODE_CONFIG_RSS_TBL 2 +#define NX_NIC_H2C_OPCODE_CONFIG_INTR_COALESCE 3 +#define NX_NIC_H2C_OPCODE_CONFIG_LED 4 +#define NX_NIC_H2C_OPCODE_CONFIG_PROMISCUOUS 5 +#define NX_NIC_H2C_OPCODE_CONFIG_L2_MAC 6 +#define NX_NIC_H2C_OPCODE_LRO_REQUEST 7 +#define NX_NIC_H2C_OPCODE_GET_SNMP_STATS 8 +#define NX_NIC_H2C_OPCODE_PROXY_START_REQUEST 9 +#define NX_NIC_H2C_OPCODE_PROXY_STOP_REQUEST 10 +#define NX_NIC_H2C_OPCODE_PROXY_SET_MTU 11 +#define NX_NIC_H2C_OPCODE_PROXY_SET_VPORT_MISS_MODE 12 +#define NX_NIC_H2C_OPCODE_GET_FINGER_PRINT_REQUEST 13 +#define NX_NIC_H2C_OPCODE_INSTALL_LICENSE_REQUEST 14 +#define NX_NIC_H2C_OPCODE_GET_LICENSE_CAPABILITY_REQUEST 15 +#define NX_NIC_H2C_OPCODE_GET_NET_STATS 16 +#define NX_NIC_H2C_OPCODE_PROXY_UPDATE_P2V 17 +#define NX_NIC_H2C_OPCODE_CONFIG_IPADDR 18 +#define NX_NIC_H2C_OPCODE_CONFIG_LOOPBACK 19 +#define NX_NIC_H2C_OPCODE_PROXY_STOP_DONE 20 +#define NX_NIC_H2C_OPCODE_GET_LINKEVENT 21 +#define NX_NIC_C2C_OPCODE 22 +#define NX_NIC_H2C_OPCODE_CONFIG_BRIDGING 23 +#define NX_NIC_H2C_OPCODE_CONFIG_HW_LRO 24 +#define NX_NIC_H2C_OPCODE_LAST 25 + +/* + * Firmware --> Driver + */ + +#define NX_NIC_C2H_OPCODE_START 128 +#define NX_NIC_C2H_OPCODE_CONFIG_RSS_RESPONSE 129 +#define NX_NIC_C2H_OPCODE_CONFIG_RSS_TBL_RESPONSE 130 +#define NX_NIC_C2H_OPCODE_CONFIG_MAC_RESPONSE 131 +#define NX_NIC_C2H_OPCODE_CONFIG_PROMISCUOUS_RESPONSE 132 +#define NX_NIC_C2H_OPCODE_CONFIG_L2_MAC_RESPONSE 133 +#define NX_NIC_C2H_OPCODE_LRO_DELETE_RESPONSE 134 +#define NX_NIC_C2H_OPCODE_LRO_ADD_FAILURE_RESPONSE 135 +#define NX_NIC_C2H_OPCODE_GET_SNMP_STATS 136 +#define NX_NIC_C2H_OPCODE_GET_FINGER_PRINT_REPLY 137 +#define NX_NIC_C2H_OPCODE_INSTALL_LICENSE_REPLY 138 +#define NX_NIC_C2H_OPCODE_GET_LICENSE_CAPABILITIES_REPLY 139 +#define NX_NIC_C2H_OPCODE_GET_NET_STATS_RESPONSE 140 +#define NX_NIC_C2H_OPCODE_GET_LINKEVENT_RESPONSE 141 +#define NX_NIC_C2H_OPCODE_LAST 142 + +#define VPORT_MISS_MODE_DROP 0 /* drop all unmatched */ +#define VPORT_MISS_MODE_ACCEPT_ALL 1 /* accept all packets */ +#define VPORT_MISS_MODE_ACCEPT_MULTI 2 /* accept unmatched multicast */ + +#define NX_NIC_LRO_REQUEST_FIRST 0 +#define NX_NIC_LRO_REQUEST_ADD_FLOW 1 +#define NX_NIC_LRO_REQUEST_DELETE_FLOW 2 +#define NX_NIC_LRO_REQUEST_TIMER 3 +#define NX_NIC_LRO_REQUEST_CLEANUP 4 +#define NX_NIC_LRO_REQUEST_ADD_FLOW_SCHEDULED 5 +#define NX_TOE_LRO_REQUEST_ADD_FLOW 6 +#define NX_TOE_LRO_REQUEST_ADD_FLOW_RESPONSE 7 +#define NX_TOE_LRO_REQUEST_DELETE_FLOW 8 +#define NX_TOE_LRO_REQUEST_DELETE_FLOW_RESPONSE 9 +#define 
NX_TOE_LRO_REQUEST_TIMER 10 +#define NX_NIC_LRO_REQUEST_LAST 11 + +#define NX_FW_CAPABILITY_LINK_NOTIFICATION (1 << 5) +#define NX_FW_CAPABILITY_SWITCHING (1 << 6) +#define NX_FW_CAPABILITY_PEXQ (1 << 7) +#define NX_FW_CAPABILITY_BDG (1 << 8) +#define NX_FW_CAPABILITY_FVLANTX (1 << 9) +#define NX_FW_CAPABILITY_HW_LRO (1 << 10) +#define NX_FW_CAPABILITY_GBE_LINK_CFG (1 << 11) +#define NX_FW_CAPABILITY_MORE_CAPS (1 << 31) +#define NX_FW_CAPABILITY_2_LRO_MAX_TCP_SEG (1 << 2) + +/* module types */ +#define LINKEVENT_MODULE_NOT_PRESENT 1 +#define LINKEVENT_MODULE_OPTICAL_UNKNOWN 2 +#define LINKEVENT_MODULE_OPTICAL_SRLR 3 +#define LINKEVENT_MODULE_OPTICAL_LRM 4 +#define LINKEVENT_MODULE_OPTICAL_SFP_1G 5 +#define LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLE 6 +#define LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLELEN 7 +#define LINKEVENT_MODULE_TWINAX 8 + +#define LINKSPEED_10GBPS 10000 +#define LINKSPEED_1GBPS 1000 +#define LINKSPEED_100MBPS 100 +#define LINKSPEED_10MBPS 10 + +#define LINKSPEED_ENCODED_10MBPS 0 +#define LINKSPEED_ENCODED_100MBPS 1 +#define LINKSPEED_ENCODED_1GBPS 2 + +#define LINKEVENT_AUTONEG_DISABLED 0 +#define LINKEVENT_AUTONEG_ENABLED 1 + +#define LINKEVENT_HALF_DUPLEX 0 +#define LINKEVENT_FULL_DUPLEX 1 + +#define LINKEVENT_LINKSPEED_MBPS 0 +#define LINKEVENT_LINKSPEED_ENCODED 1 + +#define AUTO_FW_RESET_ENABLED 0xEF10AF12 +#define AUTO_FW_RESET_DISABLED 0xDCBAAF12 + +/* firmware response header: + * 63:58 - message type + * 57:56 - owner + * 55:53 - desc count + * 52:48 - reserved + * 47:40 - completion id + * 39:32 - opcode + * 31:16 - error code + * 15:00 - reserved + */ +#define netxen_get_nic_msgtype(msg_hdr) \ + ((msg_hdr >> 58) & 0x3F) +#define netxen_get_nic_msg_compid(msg_hdr) \ + ((msg_hdr >> 40) & 0xFF) +#define netxen_get_nic_msg_opcode(msg_hdr) \ + ((msg_hdr >> 32) & 0xFF) +#define netxen_get_nic_msg_errcode(msg_hdr) \ + ((msg_hdr >> 16) & 0xFFFF) + +typedef struct { + union { + struct { + u64 hdr; + u64 body[7]; + }; + u64 words[8]; + }; +} nx_fw_msg_t; + +typedef struct { + __le64 qhdr; + __le64 req_hdr; + __le64 words[6]; +} nx_nic_req_t; + +typedef struct { + u8 op; + u8 tag; + u8 mac_addr[6]; +} nx_mac_req_t; + +#define MAX_PENDING_DESC_BLOCK_SIZE 64 + +#define NETXEN_NIC_MSI_ENABLED 0x02 +#define NETXEN_NIC_MSIX_ENABLED 0x04 +#define NETXEN_NIC_LRO_ENABLED 0x08 +#define NETXEN_NIC_LRO_DISABLED 0x00 +#define NETXEN_NIC_BRIDGE_ENABLED 0X10 +#define NETXEN_NIC_DIAG_ENABLED 0x20 +#define NETXEN_FW_RESET_OWNER 0x40 +#define NETXEN_FW_MSS_CAP 0x80 +#define NETXEN_IS_MSI_FAMILY(adapter) \ + ((adapter)->flags & (NETXEN_NIC_MSI_ENABLED | NETXEN_NIC_MSIX_ENABLED)) + +#define MSIX_ENTRIES_PER_ADAPTER NUM_STS_DESC_RINGS +#define NETXEN_MSIX_TBL_SPACE 8192 +#define NETXEN_PCI_REG_MSIX_TBL 0x44 + +#define NETXEN_DB_MAPSIZE_BYTES 0x1000 + +#define NETXEN_ADAPTER_UP_MAGIC 777 +#define NETXEN_NIC_PEG_TUNE 0 + +#define __NX_FW_ATTACHED 0 +#define __NX_DEV_UP 1 +#define __NX_RESETTING 2 + +/* Mini Coredump FW supported version */ +#define NX_MD_SUPPORT_MAJOR 4 +#define NX_MD_SUPPORT_MINOR 0 +#define NX_MD_SUPPORT_SUBVERSION 579 + +#define LSW(x) ((uint16_t)(x)) +#define LSD(x) ((uint32_t)((uint64_t)(x))) +#define MSD(x) ((uint32_t)((((uint64_t)(x)) >> 16) >> 16)) + +/* Mini Coredump mask level */ +#define NX_DUMP_MASK_MIN 0x03 +#define NX_DUMP_MASK_DEF 0x1f +#define NX_DUMP_MASK_MAX 0xff + +/* Mini Coredump CDRP commands */ +#define NX_CDRP_CMD_TEMP_SIZE 0x0000002f +#define NX_CDRP_CMD_GET_TEMP_HDR 0x00000030 + + +#define NX_DUMP_STATE_ARRAY_LEN 16 +#define NX_DUMP_CAP_SIZE_ARRAY_LEN 
8 + +/* Mini Coredump sysfs entries flags*/ +#define NX_FORCE_FW_DUMP_KEY 0xdeadfeed +#define NX_ENABLE_FW_DUMP 0xaddfeed +#define NX_DISABLE_FW_DUMP 0xbadfeed +#define NX_FORCE_FW_RESET 0xdeaddead + + +/* Fw dump levels */ +static const u32 FW_DUMP_LEVELS[] = { 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff }; + +/* Flash read/write address */ +#define NX_FW_DUMP_REG1 0x00130060 +#define NX_FW_DUMP_REG2 0x001e0000 +#define NX_FLASH_SEM2_LK 0x0013C010 +#define NX_FLASH_SEM2_ULK 0x0013C014 +#define NX_FLASH_LOCK_ID 0x001B2100 +#define FLASH_ROM_WINDOW 0x42110030 +#define FLASH_ROM_DATA 0x42150000 + +/* Mini Coredump register read/write routine */ +#define NX_RD_DUMP_REG(addr, bar0, data) do { \ + writel((addr & 0xFFFF0000), (void __iomem *) (bar0 + \ + NX_FW_DUMP_REG1)); \ + readl((void __iomem *) (bar0 + NX_FW_DUMP_REG1)); \ + *data = readl((void __iomem *) (bar0 + NX_FW_DUMP_REG2 + \ + LSW(addr))); \ +} while (0) + +#define NX_WR_DUMP_REG(addr, bar0, data) do { \ + writel((addr & 0xFFFF0000), (void __iomem *) (bar0 + \ + NX_FW_DUMP_REG1)); \ + readl((void __iomem *) (bar0 + NX_FW_DUMP_REG1)); \ + writel(data, (void __iomem *) (bar0 + NX_FW_DUMP_REG2 + LSW(addr)));\ + readl((void __iomem *) (bar0 + NX_FW_DUMP_REG2 + LSW(addr))); \ +} while (0) + + +/* +Entry Type Defines +*/ + +#define RDNOP 0 +#define RDCRB 1 +#define RDMUX 2 +#define QUEUE 3 +#define BOARD 4 +#define RDSRE 5 +#define RDOCM 6 +#define PREGS 7 +#define L1DTG 8 +#define L1ITG 9 +#define CACHE 10 + +#define L1DAT 11 +#define L1INS 12 +#define RDSTK 13 +#define RDCON 14 + +#define L2DTG 21 +#define L2ITG 22 +#define L2DAT 23 +#define L2INS 24 +#define RDOC3 25 + +#define MEMBK 32 + +#define RDROM 71 +#define RDMEM 72 +#define RDMN 73 + +#define INFOR 81 +#define CNTRL 98 + +#define TLHDR 99 +#define RDEND 255 + +#define PRIMQ 103 +#define SQG2Q 104 +#define SQG3Q 105 + +/* +* Opcodes for Control Entries. +* These Flags are bit fields. 
+*/ +#define NX_DUMP_WCRB 0x01 +#define NX_DUMP_RWCRB 0x02 +#define NX_DUMP_ANDCRB 0x04 +#define NX_DUMP_ORCRB 0x08 +#define NX_DUMP_POLLCRB 0x10 +#define NX_DUMP_RD_SAVE 0x20 +#define NX_DUMP_WRT_SAVED 0x40 +#define NX_DUMP_MOD_SAVE_ST 0x80 + +/* Driver Flags */ +#define NX_DUMP_SKIP 0x80 /* driver skipped this entry */ +#define NX_DUMP_SIZE_ERR 0x40 /*entry size vs capture size mismatch*/ + +#define NX_PCI_READ_32(ADDR) readl((ADDR)) +#define NX_PCI_WRITE_32(DATA, ADDR) writel(DATA, (ADDR)) + + + +struct netxen_minidump { + u32 pos; /* position in the dump buffer */ + u8 fw_supports_md; /* FW supports Mini cordump */ + u8 has_valid_dump; /* indicates valid dump */ + u8 md_capture_mask; /* driver capture mask */ + u8 md_enabled; /* Turn Mini Coredump on/off */ + u32 md_dump_size; /* Total FW Mini Coredump size */ + u32 md_capture_size; /* FW dump capture size */ + u32 md_template_size; /* FW template size */ + u32 md_template_ver; /* FW template version */ + u64 md_timestamp; /* FW Mini dump timestamp */ + void *md_template; /* FW template will be stored */ + void *md_capture_buff; /* FW dump will be stored */ +}; + + + +struct netxen_minidump_template_hdr { + u32 entry_type; + u32 first_entry_offset; + u32 size_of_template; + u32 capture_mask; + u32 num_of_entries; + u32 version; + u32 driver_timestamp; + u32 checksum; + u32 driver_capture_mask; + u32 driver_info_word2; + u32 driver_info_word3; + u32 driver_info_word4; + u32 saved_state_array[NX_DUMP_STATE_ARRAY_LEN]; + u32 capture_size_array[NX_DUMP_CAP_SIZE_ARRAY_LEN]; + u32 rsvd[0]; +}; + +/* Common Entry Header: Common to All Entry Types */ +/* + * Driver Code is for driver to write some info about the entry. + * Currently not used. + */ + +struct netxen_common_entry_hdr { + u32 entry_type; + u32 entry_size; + u32 entry_capture_size; + union { + struct { + u8 entry_capture_mask; + u8 entry_code; + u8 driver_code; + u8 driver_flags; + }; + u32 entry_ctrl_word; + }; +}; + + +/* Generic Entry Including Header */ +struct netxen_minidump_entry { + struct netxen_common_entry_hdr hdr; + u32 entry_data00; + u32 entry_data01; + u32 entry_data02; + u32 entry_data03; + u32 entry_data04; + u32 entry_data05; + u32 entry_data06; + u32 entry_data07; +}; + +/* Read ROM Header */ +struct netxen_minidump_entry_rdrom { + struct netxen_common_entry_hdr h; + union { + struct { + u32 select_addr_reg; + }; + u32 rsvd_0; + }; + union { + struct { + u8 addr_stride; + u8 addr_cnt; + u16 data_size; + }; + u32 rsvd_1; + }; + union { + struct { + u32 op_count; + }; + u32 rsvd_2; + }; + union { + struct { + u32 read_addr_reg; + }; + u32 rsvd_3; + }; + union { + struct { + u32 write_mask; + }; + u32 rsvd_4; + }; + union { + struct { + u32 read_mask; + }; + u32 rsvd_5; + }; + u32 read_addr; + u32 read_data_size; +}; + + +/* Read CRB and Control Entry Header */ +struct netxen_minidump_entry_crb { + struct netxen_common_entry_hdr h; + u32 addr; + union { + struct { + u8 addr_stride; + u8 state_index_a; + u16 poll_timeout; + }; + u32 addr_cntrl; + }; + u32 data_size; + u32 op_count; + union { + struct { + u8 opcode; + u8 state_index_v; + u8 shl; + u8 shr; + }; + u32 control_value; + }; + u32 value_1; + u32 value_2; + u32 value_3; +}; + +/* Read Memory and MN Header */ +struct netxen_minidump_entry_rdmem { + struct netxen_common_entry_hdr h; + union { + struct { + u32 select_addr_reg; + }; + u32 rsvd_0; + }; + union { + struct { + u8 addr_stride; + u8 addr_cnt; + u16 data_size; + }; + u32 rsvd_1; + }; + union { + struct { + u32 op_count; + }; + u32 rsvd_2; + }; + union 
{ + struct { + u32 read_addr_reg; + }; + u32 rsvd_3; + }; + union { + struct { + u32 cntrl_addr_reg; + }; + u32 rsvd_4; + }; + union { + struct { + u8 wr_byte0; + u8 wr_byte1; + u8 poll_mask; + u8 poll_cnt; + }; + u32 rsvd_5; + }; + u32 read_addr; + u32 read_data_size; +}; + +/* Read Cache L1 and L2 Header */ +struct netxen_minidump_entry_cache { + struct netxen_common_entry_hdr h; + u32 tag_reg_addr; + union { + struct { + u16 tag_value_stride; + u16 init_tag_value; + }; + u32 select_addr_cntrl; + }; + u32 data_size; + u32 op_count; + u32 control_addr; + union { + struct { + u16 write_value; + u8 poll_mask; + u8 poll_wait; + }; + u32 control_value; + }; + u32 read_addr; + union { + struct { + u8 read_addr_stride; + u8 read_addr_cnt; + u16 rsvd_1; + }; + u32 read_addr_cntrl; + }; +}; + +/* Read OCM Header */ +struct netxen_minidump_entry_rdocm { + struct netxen_common_entry_hdr h; + u32 rsvd_0; + union { + struct { + u32 rsvd_1; + }; + u32 select_addr_cntrl; + }; + u32 data_size; + u32 op_count; + u32 rsvd_2; + u32 rsvd_3; + u32 read_addr; + union { + struct { + u32 read_addr_stride; + }; + u32 read_addr_cntrl; + }; +}; + +/* Read MUX Header */ +struct netxen_minidump_entry_mux { + struct netxen_common_entry_hdr h; + u32 select_addr; + union { + struct { + u32 rsvd_0; + }; + u32 select_addr_cntrl; + }; + u32 data_size; + u32 op_count; + u32 select_value; + u32 select_value_stride; + u32 read_addr; + u32 rsvd_1; +}; + +/* Read Queue Header */ +struct netxen_minidump_entry_queue { + struct netxen_common_entry_hdr h; + u32 select_addr; + union { + struct { + u16 queue_id_stride; + u16 rsvd_0; + }; + u32 select_addr_cntrl; + }; + u32 data_size; + u32 op_count; + u32 rsvd_1; + u32 rsvd_2; + u32 read_addr; + union { + struct { + u8 read_addr_stride; + u8 read_addr_cnt; + u16 rsvd_3; + }; + u32 read_addr_cntrl; + }; +}; + +struct netxen_dummy_dma { + void *addr; + dma_addr_t phys_addr; +}; + +struct netxen_adapter { + struct netxen_hardware_context ahw; + + struct net_device *netdev; + struct pci_dev *pdev; + struct list_head mac_list; + struct list_head ip_list; + + spinlock_t tx_clean_lock; + + u16 num_txd; + u16 num_rxd; + u16 num_jumbo_rxd; + u16 num_lro_rxd; + + u8 max_rds_rings; + u8 max_sds_rings; + u8 driver_mismatch; + u8 msix_supported; + u8 __pad; + u8 pci_using_dac; + u8 portnum; + u8 physical_port; + + u8 mc_enabled; + u8 max_mc_count; + u8 rss_supported; + u8 link_changed; + u8 fw_wait_cnt; + u8 fw_fail_cnt; + u8 tx_timeo_cnt; + u8 need_fw_reset; + + u8 has_link_events; + u8 fw_type; + u16 tx_context_id; + u16 mtu; + u16 is_up; + + u16 link_speed; + u16 link_duplex; + u16 link_autoneg; + u16 module_type; + + u32 capabilities; + u32 flags; + u32 irq; + u32 temp; + + u32 int_vec_bit; + u32 heartbit; + + u8 mac_addr[ETH_ALEN]; + + struct netxen_adapter_stats stats; + + struct netxen_recv_context recv_ctx; + struct nx_host_tx_ring *tx_ring; + + int (*macaddr_set) (struct netxen_adapter *, u8 *); + int (*set_mtu) (struct netxen_adapter *, int); + int (*set_promisc) (struct netxen_adapter *, u32); + void (*set_multi) (struct net_device *); + int (*phy_read) (struct netxen_adapter *, u32 reg, u32 *); + int (*phy_write) (struct netxen_adapter *, u32 reg, u32 val); + int (*init_port) (struct netxen_adapter *, int); + int (*stop_port) (struct netxen_adapter *); + + u32 (*crb_read)(struct netxen_adapter *, ulong); + int (*crb_write)(struct netxen_adapter *, ulong, u32); + + int (*pci_mem_read)(struct netxen_adapter *, u64, u64 *); + int (*pci_mem_write)(struct netxen_adapter *, u64, u64); + + 
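	/*
	 * Chip-revision specific hardware access hooks (continued below with
	 * pci_set_window/io_read/io_write).  netxen_setup_hwops(), declared
	 * later in this header, installs the P2 or P3 implementations, and
	 * the NXRD32/NXWR32 and NXRDIO/NXWRIO macros below dispatch through
	 * these pointers.
	 */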
int (*pci_set_window)(struct netxen_adapter *, u64, u32 *); + + u32 (*io_read)(struct netxen_adapter *, void __iomem *); + void (*io_write)(struct netxen_adapter *, void __iomem *, u32); + + void __iomem *tgt_mask_reg; + void __iomem *pci_int_reg; + void __iomem *tgt_status_reg; + void __iomem *crb_int_state_reg; + void __iomem *isr_int_vec; + + struct msix_entry msix_entries[MSIX_ENTRIES_PER_ADAPTER]; + + struct netxen_dummy_dma dummy_dma; + + struct delayed_work fw_work; + + struct work_struct tx_timeout_task; + + nx_nic_intr_coalesce_t coal; + + unsigned long state; + __le32 file_prd_off; /*File fw product offset*/ + u32 fw_version; + const struct firmware *fw; + struct netxen_minidump mdump; /* mdump ptr */ + int fw_mdump_rdy; /* for mdump ready */ +}; + +int nx_fw_cmd_query_phy(struct netxen_adapter *adapter, u32 reg, u32 *val); +int nx_fw_cmd_set_phy(struct netxen_adapter *adapter, u32 reg, u32 val); + +#define NXRD32(adapter, off) \ + (adapter->crb_read(adapter, off)) +#define NXWR32(adapter, off, val) \ + (adapter->crb_write(adapter, off, val)) +#define NXRDIO(adapter, addr) \ + (adapter->io_read(adapter, addr)) +#define NXWRIO(adapter, addr, val) \ + (adapter->io_write(adapter, addr, val)) + +int netxen_pcie_sem_lock(struct netxen_adapter *, int, u32); +void netxen_pcie_sem_unlock(struct netxen_adapter *, int); + +#define netxen_rom_lock(a) \ + netxen_pcie_sem_lock((a), 2, NETXEN_ROM_LOCK_ID) +#define netxen_rom_unlock(a) \ + netxen_pcie_sem_unlock((a), 2) +#define netxen_phy_lock(a) \ + netxen_pcie_sem_lock((a), 3, NETXEN_PHY_LOCK_ID) +#define netxen_phy_unlock(a) \ + netxen_pcie_sem_unlock((a), 3) +#define netxen_api_lock(a) \ + netxen_pcie_sem_lock((a), 5, 0) +#define netxen_api_unlock(a) \ + netxen_pcie_sem_unlock((a), 5) +#define netxen_sw_lock(a) \ + netxen_pcie_sem_lock((a), 6, 0) +#define netxen_sw_unlock(a) \ + netxen_pcie_sem_unlock((a), 6) +#define crb_win_lock(a) \ + netxen_pcie_sem_lock((a), 7, NETXEN_CRB_WIN_LOCK_ID) +#define crb_win_unlock(a) \ + netxen_pcie_sem_unlock((a), 7) + +int netxen_nic_get_board_info(struct netxen_adapter *adapter); +int netxen_nic_wol_supported(struct netxen_adapter *adapter); + +/* Functions from netxen_nic_init.c */ +int netxen_init_dummy_dma(struct netxen_adapter *adapter); +void netxen_free_dummy_dma(struct netxen_adapter *adapter); + +int netxen_check_flash_fw_compatibility(struct netxen_adapter *adapter); +int netxen_phantom_init(struct netxen_adapter *adapter, int pegtune_val); +int netxen_load_firmware(struct netxen_adapter *adapter); +int netxen_need_fw_reset(struct netxen_adapter *adapter); +void netxen_request_firmware(struct netxen_adapter *adapter); +void netxen_release_firmware(struct netxen_adapter *adapter); +int netxen_pinit_from_rom(struct netxen_adapter *adapter); + +int netxen_rom_fast_read(struct netxen_adapter *adapter, int addr, int *valp); +int netxen_rom_fast_read_words(struct netxen_adapter *adapter, int addr, + u8 *bytes, size_t size); +int netxen_rom_fast_write_words(struct netxen_adapter *adapter, int addr, + u8 *bytes, size_t size); +int netxen_flash_unlock(struct netxen_adapter *adapter); +int netxen_backup_crbinit(struct netxen_adapter *adapter); +int netxen_flash_erase_secondary(struct netxen_adapter *adapter); +int netxen_flash_erase_primary(struct netxen_adapter *adapter); +void netxen_halt_pegs(struct netxen_adapter *adapter); + +int netxen_rom_se(struct netxen_adapter *adapter, int addr); + +int netxen_alloc_sw_resources(struct netxen_adapter *adapter); +void netxen_free_sw_resources(struct 
netxen_adapter *adapter); + +void netxen_setup_hwops(struct netxen_adapter *adapter); +void __iomem *netxen_get_ioaddr(struct netxen_adapter *, u32); + +int netxen_alloc_hw_resources(struct netxen_adapter *adapter); +void netxen_free_hw_resources(struct netxen_adapter *adapter); + +void netxen_release_rx_buffers(struct netxen_adapter *adapter); +void netxen_release_tx_buffers(struct netxen_adapter *adapter); + +int netxen_init_firmware(struct netxen_adapter *adapter); +void netxen_nic_clear_stats(struct netxen_adapter *adapter); +void netxen_watchdog_task(struct work_struct *work); +void netxen_post_rx_buffers(struct netxen_adapter *adapter, u32 ringid, + struct nx_host_rds_ring *rds_ring); +int netxen_process_cmd_ring(struct netxen_adapter *adapter); +int netxen_process_rcv_ring(struct nx_host_sds_ring *sds_ring, int max); + +void netxen_p3_free_mac_list(struct netxen_adapter *adapter); +int netxen_config_intr_coalesce(struct netxen_adapter *adapter); +int netxen_config_rss(struct netxen_adapter *adapter, int enable); +int netxen_config_ipaddr(struct netxen_adapter *adapter, __be32 ip, int cmd); +int netxen_linkevent_request(struct netxen_adapter *adapter, int enable); +void netxen_advert_link_change(struct netxen_adapter *adapter, int linkup); +void netxen_pci_camqm_read_2M(struct netxen_adapter *, u64, u64 *); +void netxen_pci_camqm_write_2M(struct netxen_adapter *, u64, u64); + +int nx_fw_cmd_set_gbe_port(struct netxen_adapter *adapter, + u32 speed, u32 duplex, u32 autoneg); +int nx_fw_cmd_set_mtu(struct netxen_adapter *adapter, int mtu); +int netxen_nic_change_mtu(struct net_device *netdev, int new_mtu); +int netxen_config_hw_lro(struct netxen_adapter *adapter, int enable); +int netxen_config_bridged_mode(struct netxen_adapter *adapter, int enable); +int netxen_send_lro_cleanup(struct netxen_adapter *adapter); +int netxen_setup_minidump(struct netxen_adapter *adapter); +void netxen_dump_fw(struct netxen_adapter *adapter); +void netxen_nic_update_cmd_producer(struct netxen_adapter *adapter, + struct nx_host_tx_ring *tx_ring); + +/* Functions from netxen_nic_main.c */ +int netxen_nic_reset_context(struct netxen_adapter *); + +int nx_dev_request_reset(struct netxen_adapter *adapter); + +/* + * NetXen Board information + */ + +#define NETXEN_MAX_SHORT_NAME 32 +struct netxen_brdinfo { + int brdtype; /* type of board */ + long ports; /* max no of physical ports */ + char short_name[NETXEN_MAX_SHORT_NAME]; +}; + +struct netxen_dimm_cfg { + u8 presence; + u8 mem_type; + u8 dimm_type; + u32 size; +}; + +static const struct netxen_brdinfo netxen_boards[] = { + {NETXEN_BRDTYPE_P2_SB31_10G_CX4, 1, "XGb CX4"}, + {NETXEN_BRDTYPE_P2_SB31_10G_HMEZ, 1, "XGb HMEZ"}, + {NETXEN_BRDTYPE_P2_SB31_10G_IMEZ, 2, "XGb IMEZ"}, + {NETXEN_BRDTYPE_P2_SB31_10G, 1, "XGb XFP"}, + {NETXEN_BRDTYPE_P2_SB35_4G, 4, "Quad Gb"}, + {NETXEN_BRDTYPE_P2_SB31_2G, 2, "Dual Gb"}, + {NETXEN_BRDTYPE_P3_REF_QG, 4, "Reference Quad Gig "}, + {NETXEN_BRDTYPE_P3_HMEZ, 2, "Dual XGb HMEZ"}, + {NETXEN_BRDTYPE_P3_10G_CX4_LP, 2, "Dual XGb CX4 LP"}, + {NETXEN_BRDTYPE_P3_4_GB, 4, "Quad Gig LP"}, + {NETXEN_BRDTYPE_P3_IMEZ, 2, "Dual XGb IMEZ"}, + {NETXEN_BRDTYPE_P3_10G_SFP_PLUS, 2, "Dual XGb SFP+ LP"}, + {NETXEN_BRDTYPE_P3_10000_BASE_T, 1, "XGB 10G BaseT LP"}, + {NETXEN_BRDTYPE_P3_XG_LOM, 2, "Dual XGb LOM"}, + {NETXEN_BRDTYPE_P3_4_GB_MM, 4, "NX3031 Gigabit Ethernet"}, + {NETXEN_BRDTYPE_P3_10G_SFP_CT, 2, "NX3031 10 Gigabit Ethernet"}, + {NETXEN_BRDTYPE_P3_10G_SFP_QT, 2, "Quanta Dual XGb SFP+"}, + {NETXEN_BRDTYPE_P3_10G_CX4, 2, "Reference Dual CX4 
Option"}, + {NETXEN_BRDTYPE_P3_10G_XFP, 1, "Reference Single XFP Option"} +}; + +#define NUM_SUPPORTED_BOARDS ARRAY_SIZE(netxen_boards) + +static inline int netxen_nic_get_brd_name_by_type(u32 type, char *name) +{ + int i, found = 0; + for (i = 0; i < NUM_SUPPORTED_BOARDS; ++i) { + if (netxen_boards[i].brdtype == type) { + strcpy(name, netxen_boards[i].short_name); + found = 1; + break; + } + } + + if (!found) { + strcpy(name, "Unknown"); + return -EINVAL; + } + + return 0; +} + +static inline u32 netxen_tx_avail(struct nx_host_tx_ring *tx_ring) +{ + smp_mb(); + return find_diff_among(tx_ring->producer, + tx_ring->sw_consumer, tx_ring->num_desc); + +} + +int netxen_get_flash_mac_addr(struct netxen_adapter *adapter, u64 *mac); +int netxen_p3_get_mac_addr(struct netxen_adapter *adapter, u64 *mac); +void netxen_change_ringparam(struct netxen_adapter *adapter); +int netxen_rom_fast_read(struct netxen_adapter *adapter, int addr, int *valp); + +extern const struct ethtool_ops netxen_nic_ethtool_ops; + +#endif /* __NETXEN_NIC_H_ */ diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c new file mode 100644 index 0000000..6cec2a6 --- /dev/null +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c @@ -0,0 +1,938 @@ +/* + * Copyright (C) 2003 - 2009 NetXen, Inc. + * Copyright (C) 2009 - QLogic Corporation. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * The full GNU General Public License is included in this distribution + * in the file called "COPYING". 
+ * + */ + +#include "netxen_nic_hw.h" +#include "netxen_nic.h" + +#define NXHAL_VERSION 1 + +static u32 +netxen_poll_rsp(struct netxen_adapter *adapter) +{ + u32 rsp = NX_CDRP_RSP_OK; + int timeout = 0; + + do { + /* give atleast 1ms for firmware to respond */ + msleep(1); + + if (++timeout > NX_OS_CRB_RETRY_COUNT) + return NX_CDRP_RSP_TIMEOUT; + + rsp = NXRD32(adapter, NX_CDRP_CRB_OFFSET); + } while (!NX_CDRP_IS_RSP(rsp)); + + return rsp; +} + +static u32 +netxen_issue_cmd(struct netxen_adapter *adapter, struct netxen_cmd_args *cmd) +{ + u32 rsp; + u32 signature = 0; + u32 rcode = NX_RCODE_SUCCESS; + + signature = NX_CDRP_SIGNATURE_MAKE(adapter->ahw.pci_func, + NXHAL_VERSION); + /* Acquire semaphore before accessing CRB */ + if (netxen_api_lock(adapter)) + return NX_RCODE_TIMEOUT; + + NXWR32(adapter, NX_SIGN_CRB_OFFSET, signature); + + NXWR32(adapter, NX_ARG1_CRB_OFFSET, cmd->req.arg1); + + NXWR32(adapter, NX_ARG2_CRB_OFFSET, cmd->req.arg2); + + NXWR32(adapter, NX_ARG3_CRB_OFFSET, cmd->req.arg3); + + NXWR32(adapter, NX_CDRP_CRB_OFFSET, NX_CDRP_FORM_CMD(cmd->req.cmd)); + + rsp = netxen_poll_rsp(adapter); + + if (rsp == NX_CDRP_RSP_TIMEOUT) { + printk(KERN_ERR "%s: card response timeout.\n", + netxen_nic_driver_name); + + rcode = NX_RCODE_TIMEOUT; + } else if (rsp == NX_CDRP_RSP_FAIL) { + rcode = NXRD32(adapter, NX_ARG1_CRB_OFFSET); + + printk(KERN_ERR "%s: failed card response code:0x%x\n", + netxen_nic_driver_name, rcode); + } else if (rsp == NX_CDRP_RSP_OK) { + cmd->rsp.cmd = NX_RCODE_SUCCESS; + if (cmd->rsp.arg2) + cmd->rsp.arg2 = NXRD32(adapter, NX_ARG2_CRB_OFFSET); + if (cmd->rsp.arg3) + cmd->rsp.arg3 = NXRD32(adapter, NX_ARG3_CRB_OFFSET); + } + + if (cmd->rsp.arg1) + cmd->rsp.arg1 = NXRD32(adapter, NX_ARG1_CRB_OFFSET); + /* Release semaphore */ + netxen_api_unlock(adapter); + + return rcode; +} + +static int +netxen_get_minidump_template_size(struct netxen_adapter *adapter) +{ + struct netxen_cmd_args cmd; + memset(&cmd, 0, sizeof(cmd)); + cmd.req.cmd = NX_CDRP_CMD_TEMP_SIZE; + memset(&cmd.rsp, 1, sizeof(struct _cdrp_cmd)); + netxen_issue_cmd(adapter, &cmd); + if (cmd.rsp.cmd != NX_RCODE_SUCCESS) { + dev_info(&adapter->pdev->dev, + "Can't get template size %d\n", cmd.rsp.cmd); + return -EIO; + } + adapter->mdump.md_template_size = cmd.rsp.arg2; + adapter->mdump.md_template_ver = cmd.rsp.arg3; + return 0; +} + +static int +netxen_get_minidump_template(struct netxen_adapter *adapter) +{ + dma_addr_t md_template_addr; + void *addr; + u32 size; + struct netxen_cmd_args cmd; + size = adapter->mdump.md_template_size; + + if (size == 0) { + dev_err(&adapter->pdev->dev, "Can not capture Minidump " + "template. 
Invalid template size.\n"); + return NX_RCODE_INVALID_ARGS; + } + + addr = pci_zalloc_consistent(adapter->pdev, size, &md_template_addr); + if (!addr) { + dev_err(&adapter->pdev->dev, "Unable to allocate dmable memory for template.\n"); + return -ENOMEM; + } + + memset(&cmd, 0, sizeof(cmd)); + memset(&cmd.rsp, 1, sizeof(struct _cdrp_cmd)); + cmd.req.cmd = NX_CDRP_CMD_GET_TEMP_HDR; + cmd.req.arg1 = LSD(md_template_addr); + cmd.req.arg2 = MSD(md_template_addr); + cmd.req.arg3 |= size; + netxen_issue_cmd(adapter, &cmd); + + if ((cmd.rsp.cmd == NX_RCODE_SUCCESS) && (size == cmd.rsp.arg2)) { + memcpy(adapter->mdump.md_template, addr, size); + } else { + dev_err(&adapter->pdev->dev, "Failed to get minidump template, " + "err_code : %d, requested_size : %d, actual_size : %d\n ", + cmd.rsp.cmd, size, cmd.rsp.arg2); + } + pci_free_consistent(adapter->pdev, size, addr, md_template_addr); + return 0; +} + +static u32 +netxen_check_template_checksum(struct netxen_adapter *adapter) +{ + u64 sum = 0 ; + u32 *buff = adapter->mdump.md_template; + int count = adapter->mdump.md_template_size/sizeof(uint32_t) ; + + while (count-- > 0) + sum += *buff++ ; + while (sum >> 32) + sum = (sum & 0xFFFFFFFF) + (sum >> 32) ; + + return ~sum; +} + +int +netxen_setup_minidump(struct netxen_adapter *adapter) +{ + int err = 0, i; + u32 *template, *tmp_buf; + err = netxen_get_minidump_template_size(adapter); + if (err) { + adapter->mdump.fw_supports_md = 0; + if ((err == NX_RCODE_CMD_INVALID) || + (err == NX_RCODE_CMD_NOT_IMPL)) { + dev_info(&adapter->pdev->dev, + "Flashed firmware version does not support minidump, " + "minimum version required is [ %u.%u.%u ].\n ", + NX_MD_SUPPORT_MAJOR, NX_MD_SUPPORT_MINOR, + NX_MD_SUPPORT_SUBVERSION); + } + return err; + } + + if (!adapter->mdump.md_template_size) { + dev_err(&adapter->pdev->dev, "Error : Invalid template size " + ",should be non-zero.\n"); + return -EIO; + } + adapter->mdump.md_template = + kmalloc(adapter->mdump.md_template_size, GFP_KERNEL); + + if (!adapter->mdump.md_template) + return -ENOMEM; + + err = netxen_get_minidump_template(adapter); + if (err) { + if (err == NX_RCODE_CMD_NOT_IMPL) + adapter->mdump.fw_supports_md = 0; + goto free_template; + } + + if (netxen_check_template_checksum(adapter)) { + dev_err(&adapter->pdev->dev, "Minidump template checksum Error\n"); + err = -EIO; + goto free_template; + } + + adapter->mdump.md_capture_mask = NX_DUMP_MASK_DEF; + tmp_buf = (u32 *) adapter->mdump.md_template; + template = (u32 *) adapter->mdump.md_template; + for (i = 0; i < adapter->mdump.md_template_size/sizeof(u32); i++) + *template++ = __le32_to_cpu(*tmp_buf++); + adapter->mdump.md_capture_buff = NULL; + adapter->mdump.fw_supports_md = 1; + adapter->mdump.md_enabled = 0; + + return err; + +free_template: + kfree(adapter->mdump.md_template); + adapter->mdump.md_template = NULL; + return err; +} + + +int +nx_fw_cmd_set_mtu(struct netxen_adapter *adapter, int mtu) +{ + u32 rcode = NX_RCODE_SUCCESS; + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + struct netxen_cmd_args cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.req.cmd = NX_CDRP_CMD_SET_MTU; + cmd.req.arg1 = recv_ctx->context_id; + cmd.req.arg2 = mtu; + cmd.req.arg3 = 0; + + if (recv_ctx->state == NX_HOST_CTX_STATE_ACTIVE) + rcode = netxen_issue_cmd(adapter, &cmd); + + if (rcode != NX_RCODE_SUCCESS) + return -EIO; + + return 0; +} + +int +nx_fw_cmd_set_gbe_port(struct netxen_adapter *adapter, + u32 speed, u32 duplex, u32 autoneg) +{ + struct netxen_cmd_args cmd; + + memset(&cmd, 0, sizeof(cmd)); 
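	/*
	 * Like nx_fw_cmd_set_mtu() above, this helper is a single CDRP
	 * transaction carried out by netxen_issue_cmd(): the caller fills a
	 * zeroed struct netxen_cmd_args, then netxen_issue_cmd() takes
	 * netxen_api_lock(), writes arg1..arg3 and the command code into the
	 * CDRP CRB registers and polls for the firmware response.  A minimal
	 * caller (using a hypothetical NX_CDRP_CMD_FOO) would look like:
	 *
	 *	struct netxen_cmd_args cmd;
	 *
	 *	memset(&cmd, 0, sizeof(cmd));
	 *	cmd.req.cmd = NX_CDRP_CMD_FOO;
	 *	cmd.req.arg1 = arg;
	 *	if (netxen_issue_cmd(adapter, &cmd) != NX_RCODE_SUCCESS)
	 *		return -EIO;
	 */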
+ cmd.req.cmd = NX_CDRP_CMD_CONFIG_GBE_PORT; + cmd.req.arg1 = speed; + cmd.req.arg2 = duplex; + cmd.req.arg3 = autoneg; + return netxen_issue_cmd(adapter, &cmd); +} + +static int +nx_fw_cmd_create_rx_ctx(struct netxen_adapter *adapter) +{ + void *addr; + nx_hostrq_rx_ctx_t *prq; + nx_cardrsp_rx_ctx_t *prsp; + nx_hostrq_rds_ring_t *prq_rds; + nx_hostrq_sds_ring_t *prq_sds; + nx_cardrsp_rds_ring_t *prsp_rds; + nx_cardrsp_sds_ring_t *prsp_sds; + struct nx_host_rds_ring *rds_ring; + struct nx_host_sds_ring *sds_ring; + struct netxen_cmd_args cmd; + + dma_addr_t hostrq_phys_addr, cardrsp_phys_addr; + u64 phys_addr; + + int i, nrds_rings, nsds_rings; + size_t rq_size, rsp_size; + u32 cap, reg, val; + + int err; + + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + + nrds_rings = adapter->max_rds_rings; + nsds_rings = adapter->max_sds_rings; + + rq_size = + SIZEOF_HOSTRQ_RX(nx_hostrq_rx_ctx_t, nrds_rings, nsds_rings); + rsp_size = + SIZEOF_CARDRSP_RX(nx_cardrsp_rx_ctx_t, nrds_rings, nsds_rings); + + addr = pci_alloc_consistent(adapter->pdev, + rq_size, &hostrq_phys_addr); + if (addr == NULL) + return -ENOMEM; + prq = addr; + + addr = pci_alloc_consistent(adapter->pdev, + rsp_size, &cardrsp_phys_addr); + if (addr == NULL) { + err = -ENOMEM; + goto out_free_rq; + } + prsp = addr; + + prq->host_rsp_dma_addr = cpu_to_le64(cardrsp_phys_addr); + + cap = (NX_CAP0_LEGACY_CONTEXT | NX_CAP0_LEGACY_MN); + cap |= (NX_CAP0_JUMBO_CONTIGUOUS | NX_CAP0_LRO_CONTIGUOUS); + + if (adapter->flags & NETXEN_FW_MSS_CAP) + cap |= NX_CAP0_HW_LRO_MSS; + + prq->capabilities[0] = cpu_to_le32(cap); + prq->host_int_crb_mode = + cpu_to_le32(NX_HOST_INT_CRB_MODE_SHARED); + prq->host_rds_crb_mode = + cpu_to_le32(NX_HOST_RDS_CRB_MODE_UNIQUE); + + prq->num_rds_rings = cpu_to_le16(nrds_rings); + prq->num_sds_rings = cpu_to_le16(nsds_rings); + prq->rds_ring_offset = cpu_to_le32(0); + + val = le32_to_cpu(prq->rds_ring_offset) + + (sizeof(nx_hostrq_rds_ring_t) * nrds_rings); + prq->sds_ring_offset = cpu_to_le32(val); + + prq_rds = (nx_hostrq_rds_ring_t *)(prq->data + + le32_to_cpu(prq->rds_ring_offset)); + + for (i = 0; i < nrds_rings; i++) { + + rds_ring = &recv_ctx->rds_rings[i]; + + prq_rds[i].host_phys_addr = cpu_to_le64(rds_ring->phys_addr); + prq_rds[i].ring_size = cpu_to_le32(rds_ring->num_desc); + prq_rds[i].ring_kind = cpu_to_le32(i); + prq_rds[i].buff_size = cpu_to_le64(rds_ring->dma_size); + } + + prq_sds = (nx_hostrq_sds_ring_t *)(prq->data + + le32_to_cpu(prq->sds_ring_offset)); + + for (i = 0; i < nsds_rings; i++) { + + sds_ring = &recv_ctx->sds_rings[i]; + + prq_sds[i].host_phys_addr = cpu_to_le64(sds_ring->phys_addr); + prq_sds[i].ring_size = cpu_to_le32(sds_ring->num_desc); + prq_sds[i].msi_index = cpu_to_le16(i); + } + + phys_addr = hostrq_phys_addr; + memset(&cmd, 0, sizeof(cmd)); + cmd.req.arg1 = (u32)(phys_addr >> 32); + cmd.req.arg2 = (u32)(phys_addr & 0xffffffff); + cmd.req.arg3 = rq_size; + cmd.req.cmd = NX_CDRP_CMD_CREATE_RX_CTX; + err = netxen_issue_cmd(adapter, &cmd); + if (err) { + printk(KERN_WARNING + "Failed to create rx ctx in firmware%d\n", err); + goto out_free_rsp; + } + + + prsp_rds = ((nx_cardrsp_rds_ring_t *) + &prsp->data[le32_to_cpu(prsp->rds_ring_offset)]); + + for (i = 0; i < le16_to_cpu(prsp->num_rds_rings); i++) { + rds_ring = &recv_ctx->rds_rings[i]; + + reg = le32_to_cpu(prsp_rds[i].host_producer_crb); + rds_ring->crb_rcv_producer = netxen_get_ioaddr(adapter, + NETXEN_NIC_REG(reg - 0x200)); + } + + prsp_sds = ((nx_cardrsp_sds_ring_t *) + 
&prsp->data[le32_to_cpu(prsp->sds_ring_offset)]); + + for (i = 0; i < le16_to_cpu(prsp->num_sds_rings); i++) { + sds_ring = &recv_ctx->sds_rings[i]; + + reg = le32_to_cpu(prsp_sds[i].host_consumer_crb); + sds_ring->crb_sts_consumer = netxen_get_ioaddr(adapter, + NETXEN_NIC_REG(reg - 0x200)); + + reg = le32_to_cpu(prsp_sds[i].interrupt_crb); + sds_ring->crb_intr_mask = netxen_get_ioaddr(adapter, + NETXEN_NIC_REG(reg - 0x200)); + } + + recv_ctx->state = le32_to_cpu(prsp->host_ctx_state); + recv_ctx->context_id = le16_to_cpu(prsp->context_id); + recv_ctx->virt_port = prsp->virt_port; + +out_free_rsp: + pci_free_consistent(adapter->pdev, rsp_size, prsp, cardrsp_phys_addr); +out_free_rq: + pci_free_consistent(adapter->pdev, rq_size, prq, hostrq_phys_addr); + return err; +} + +static void +nx_fw_cmd_destroy_rx_ctx(struct netxen_adapter *adapter) +{ + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + struct netxen_cmd_args cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.req.arg1 = recv_ctx->context_id; + cmd.req.arg2 = NX_DESTROY_CTX_RESET; + cmd.req.arg3 = 0; + cmd.req.cmd = NX_CDRP_CMD_DESTROY_RX_CTX; + + if (netxen_issue_cmd(adapter, &cmd)) { + printk(KERN_WARNING + "%s: Failed to destroy rx ctx in firmware\n", + netxen_nic_driver_name); + } +} + +static int +nx_fw_cmd_create_tx_ctx(struct netxen_adapter *adapter) +{ + nx_hostrq_tx_ctx_t *prq; + nx_hostrq_cds_ring_t *prq_cds; + nx_cardrsp_tx_ctx_t *prsp; + void *rq_addr, *rsp_addr; + size_t rq_size, rsp_size; + u32 temp; + int err = 0; + u64 offset, phys_addr; + dma_addr_t rq_phys_addr, rsp_phys_addr; + struct nx_host_tx_ring *tx_ring = adapter->tx_ring; + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + struct netxen_cmd_args cmd; + + rq_size = SIZEOF_HOSTRQ_TX(nx_hostrq_tx_ctx_t); + rq_addr = pci_alloc_consistent(adapter->pdev, + rq_size, &rq_phys_addr); + if (!rq_addr) + return -ENOMEM; + + rsp_size = SIZEOF_CARDRSP_TX(nx_cardrsp_tx_ctx_t); + rsp_addr = pci_alloc_consistent(adapter->pdev, + rsp_size, &rsp_phys_addr); + if (!rsp_addr) { + err = -ENOMEM; + goto out_free_rq; + } + + memset(rq_addr, 0, rq_size); + prq = rq_addr; + + memset(rsp_addr, 0, rsp_size); + prsp = rsp_addr; + + prq->host_rsp_dma_addr = cpu_to_le64(rsp_phys_addr); + + temp = (NX_CAP0_LEGACY_CONTEXT | NX_CAP0_LEGACY_MN | NX_CAP0_LSO); + prq->capabilities[0] = cpu_to_le32(temp); + + prq->host_int_crb_mode = + cpu_to_le32(NX_HOST_INT_CRB_MODE_SHARED); + + prq->interrupt_ctl = 0; + prq->msi_index = 0; + + prq->dummy_dma_addr = cpu_to_le64(adapter->dummy_dma.phys_addr); + + offset = recv_ctx->phys_addr + sizeof(struct netxen_ring_ctx); + prq->cmd_cons_dma_addr = cpu_to_le64(offset); + + prq_cds = &prq->cds_ring; + + prq_cds->host_phys_addr = cpu_to_le64(tx_ring->phys_addr); + prq_cds->ring_size = cpu_to_le32(tx_ring->num_desc); + + phys_addr = rq_phys_addr; + memset(&cmd, 0, sizeof(cmd)); + cmd.req.arg1 = (u32)(phys_addr >> 32); + cmd.req.arg2 = ((u32)phys_addr & 0xffffffff); + cmd.req.arg3 = rq_size; + cmd.req.cmd = NX_CDRP_CMD_CREATE_TX_CTX; + err = netxen_issue_cmd(adapter, &cmd); + + if (err == NX_RCODE_SUCCESS) { + temp = le32_to_cpu(prsp->cds_ring.host_producer_crb); + tx_ring->crb_cmd_producer = netxen_get_ioaddr(adapter, + NETXEN_NIC_REG(temp - 0x200)); +#if 0 + adapter->tx_state = + le32_to_cpu(prsp->host_ctx_state); +#endif + adapter->tx_context_id = + le16_to_cpu(prsp->context_id); + } else { + printk(KERN_WARNING + "Failed to create tx ctx in firmware%d\n", err); + err = -EIO; + } + + pci_free_consistent(adapter->pdev, rsp_size, rsp_addr, 
rsp_phys_addr); + +out_free_rq: + pci_free_consistent(adapter->pdev, rq_size, rq_addr, rq_phys_addr); + + return err; +} + +static void +nx_fw_cmd_destroy_tx_ctx(struct netxen_adapter *adapter) +{ + struct netxen_cmd_args cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.req.arg1 = adapter->tx_context_id; + cmd.req.arg2 = NX_DESTROY_CTX_RESET; + cmd.req.arg3 = 0; + cmd.req.cmd = NX_CDRP_CMD_DESTROY_TX_CTX; + if (netxen_issue_cmd(adapter, &cmd)) { + printk(KERN_WARNING + "%s: Failed to destroy tx ctx in firmware\n", + netxen_nic_driver_name); + } +} + +int +nx_fw_cmd_query_phy(struct netxen_adapter *adapter, u32 reg, u32 *val) +{ + u32 rcode; + struct netxen_cmd_args cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.req.arg1 = reg; + cmd.req.arg2 = 0; + cmd.req.arg3 = 0; + cmd.req.cmd = NX_CDRP_CMD_READ_PHY; + cmd.rsp.arg1 = 1; + rcode = netxen_issue_cmd(adapter, &cmd); + if (rcode != NX_RCODE_SUCCESS) + return -EIO; + + if (val == NULL) + return -EIO; + + *val = cmd.rsp.arg1; + return 0; +} + +int +nx_fw_cmd_set_phy(struct netxen_adapter *adapter, u32 reg, u32 val) +{ + u32 rcode; + struct netxen_cmd_args cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.req.arg1 = reg; + cmd.req.arg2 = val; + cmd.req.arg3 = 0; + cmd.req.cmd = NX_CDRP_CMD_WRITE_PHY; + rcode = netxen_issue_cmd(adapter, &cmd); + if (rcode != NX_RCODE_SUCCESS) + return -EIO; + + return 0; +} + +static u64 ctx_addr_sig_regs[][3] = { + {NETXEN_NIC_REG(0x188), NETXEN_NIC_REG(0x18c), NETXEN_NIC_REG(0x1c0)}, + {NETXEN_NIC_REG(0x190), NETXEN_NIC_REG(0x194), NETXEN_NIC_REG(0x1c4)}, + {NETXEN_NIC_REG(0x198), NETXEN_NIC_REG(0x19c), NETXEN_NIC_REG(0x1c8)}, + {NETXEN_NIC_REG(0x1a0), NETXEN_NIC_REG(0x1a4), NETXEN_NIC_REG(0x1cc)} +}; + +#define CRB_CTX_ADDR_REG_LO(FUNC_ID) (ctx_addr_sig_regs[FUNC_ID][0]) +#define CRB_CTX_ADDR_REG_HI(FUNC_ID) (ctx_addr_sig_regs[FUNC_ID][2]) +#define CRB_CTX_SIGNATURE_REG(FUNC_ID) (ctx_addr_sig_regs[FUNC_ID][1]) + +#define lower32(x) ((u32)((x) & 0xffffffff)) +#define upper32(x) ((u32)(((u64)(x) >> 32) & 0xffffffff)) + +static struct netxen_recv_crb recv_crb_registers[] = { + /* Instance 0 */ + { + /* crb_rcv_producer: */ + { + NETXEN_NIC_REG(0x100), + /* Jumbo frames */ + NETXEN_NIC_REG(0x110), + /* LRO */ + NETXEN_NIC_REG(0x120) + }, + /* crb_sts_consumer: */ + { + NETXEN_NIC_REG(0x138), + NETXEN_NIC_REG_2(0x000), + NETXEN_NIC_REG_2(0x004), + NETXEN_NIC_REG_2(0x008), + }, + /* sw_int_mask */ + { + CRB_SW_INT_MASK_0, + NETXEN_NIC_REG_2(0x044), + NETXEN_NIC_REG_2(0x048), + NETXEN_NIC_REG_2(0x04c), + }, + }, + /* Instance 1 */ + { + /* crb_rcv_producer: */ + { + NETXEN_NIC_REG(0x144), + /* Jumbo frames */ + NETXEN_NIC_REG(0x154), + /* LRO */ + NETXEN_NIC_REG(0x164) + }, + /* crb_sts_consumer: */ + { + NETXEN_NIC_REG(0x17c), + NETXEN_NIC_REG_2(0x020), + NETXEN_NIC_REG_2(0x024), + NETXEN_NIC_REG_2(0x028), + }, + /* sw_int_mask */ + { + CRB_SW_INT_MASK_1, + NETXEN_NIC_REG_2(0x064), + NETXEN_NIC_REG_2(0x068), + NETXEN_NIC_REG_2(0x06c), + }, + }, + /* Instance 2 */ + { + /* crb_rcv_producer: */ + { + NETXEN_NIC_REG(0x1d8), + /* Jumbo frames */ + NETXEN_NIC_REG(0x1f8), + /* LRO */ + NETXEN_NIC_REG(0x208) + }, + /* crb_sts_consumer: */ + { + NETXEN_NIC_REG(0x220), + NETXEN_NIC_REG_2(0x03c), + NETXEN_NIC_REG_2(0x03c), + NETXEN_NIC_REG_2(0x03c), + }, + /* sw_int_mask */ + { + CRB_SW_INT_MASK_2, + NETXEN_NIC_REG_2(0x03c), + NETXEN_NIC_REG_2(0x03c), + NETXEN_NIC_REG_2(0x03c), + }, + }, + /* Instance 3 */ + { + /* crb_rcv_producer: */ + { + NETXEN_NIC_REG(0x22c), + /* Jumbo frames */ + NETXEN_NIC_REG(0x23c), + /* LRO */ + 
NETXEN_NIC_REG(0x24c) + }, + /* crb_sts_consumer: */ + { + NETXEN_NIC_REG(0x264), + NETXEN_NIC_REG_2(0x03c), + NETXEN_NIC_REG_2(0x03c), + NETXEN_NIC_REG_2(0x03c), + }, + /* sw_int_mask */ + { + CRB_SW_INT_MASK_3, + NETXEN_NIC_REG_2(0x03c), + NETXEN_NIC_REG_2(0x03c), + NETXEN_NIC_REG_2(0x03c), + }, + }, +}; + +static int +netxen_init_old_ctx(struct netxen_adapter *adapter) +{ + struct netxen_recv_context *recv_ctx; + struct nx_host_rds_ring *rds_ring; + struct nx_host_sds_ring *sds_ring; + struct nx_host_tx_ring *tx_ring; + int ring; + int port = adapter->portnum; + struct netxen_ring_ctx *hwctx; + u32 signature; + + tx_ring = adapter->tx_ring; + recv_ctx = &adapter->recv_ctx; + hwctx = recv_ctx->hwctx; + + hwctx->cmd_ring_addr = cpu_to_le64(tx_ring->phys_addr); + hwctx->cmd_ring_size = cpu_to_le32(tx_ring->num_desc); + + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + + hwctx->rcv_rings[ring].addr = + cpu_to_le64(rds_ring->phys_addr); + hwctx->rcv_rings[ring].size = + cpu_to_le32(rds_ring->num_desc); + } + + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + + if (ring == 0) { + hwctx->sts_ring_addr = cpu_to_le64(sds_ring->phys_addr); + hwctx->sts_ring_size = cpu_to_le32(sds_ring->num_desc); + } + hwctx->sts_rings[ring].addr = cpu_to_le64(sds_ring->phys_addr); + hwctx->sts_rings[ring].size = cpu_to_le32(sds_ring->num_desc); + hwctx->sts_rings[ring].msi_index = cpu_to_le16(ring); + } + hwctx->sts_ring_count = cpu_to_le32(adapter->max_sds_rings); + + signature = (adapter->max_sds_rings > 1) ? + NETXEN_CTX_SIGNATURE_V2 : NETXEN_CTX_SIGNATURE; + + NXWR32(adapter, CRB_CTX_ADDR_REG_LO(port), + lower32(recv_ctx->phys_addr)); + NXWR32(adapter, CRB_CTX_ADDR_REG_HI(port), + upper32(recv_ctx->phys_addr)); + NXWR32(adapter, CRB_CTX_SIGNATURE_REG(port), + signature | port); + return 0; +} + +int netxen_alloc_hw_resources(struct netxen_adapter *adapter) +{ + void *addr; + int err = 0; + int ring; + struct netxen_recv_context *recv_ctx; + struct nx_host_rds_ring *rds_ring; + struct nx_host_sds_ring *sds_ring; + struct nx_host_tx_ring *tx_ring; + + struct pci_dev *pdev = adapter->pdev; + struct net_device *netdev = adapter->netdev; + int port = adapter->portnum; + + recv_ctx = &adapter->recv_ctx; + tx_ring = adapter->tx_ring; + + addr = pci_alloc_consistent(pdev, + sizeof(struct netxen_ring_ctx) + sizeof(uint32_t), + &recv_ctx->phys_addr); + if (addr == NULL) { + dev_err(&pdev->dev, "failed to allocate hw context\n"); + return -ENOMEM; + } + + memset(addr, 0, sizeof(struct netxen_ring_ctx)); + recv_ctx->hwctx = addr; + recv_ctx->hwctx->ctx_id = cpu_to_le32(port); + recv_ctx->hwctx->cmd_consumer_offset = + cpu_to_le64(recv_ctx->phys_addr + + sizeof(struct netxen_ring_ctx)); + tx_ring->hw_consumer = + (__le32 *)(((char *)addr) + sizeof(struct netxen_ring_ctx)); + + /* cmd desc ring */ + addr = pci_alloc_consistent(pdev, TX_DESC_RINGSIZE(tx_ring), + &tx_ring->phys_addr); + + if (addr == NULL) { + dev_err(&pdev->dev, "%s: failed to allocate tx desc ring\n", + netdev->name); + err = -ENOMEM; + goto err_out_free; + } + + tx_ring->desc_head = addr; + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + addr = pci_alloc_consistent(adapter->pdev, + RCV_DESC_RINGSIZE(rds_ring), + &rds_ring->phys_addr); + if (addr == NULL) { + dev_err(&pdev->dev, + "%s: failed to allocate rds ring [%d]\n", + netdev->name, ring); + err = -ENOMEM; + goto err_out_free; + } + 
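		/*
		 * On P2 silicon the ring producer/consumer CRB locations come
		 * from the static recv_crb_registers[] table above; on P3 the
		 * firmware returns them through nx_fw_cmd_create_rx_ctx() and
		 * nx_fw_cmd_create_tx_ctx().
		 */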
rds_ring->desc_head = addr; + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + rds_ring->crb_rcv_producer = + netxen_get_ioaddr(adapter, + recv_crb_registers[port].crb_rcv_producer[ring]); + } + + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + + addr = pci_alloc_consistent(adapter->pdev, + STATUS_DESC_RINGSIZE(sds_ring), + &sds_ring->phys_addr); + if (addr == NULL) { + dev_err(&pdev->dev, + "%s: failed to allocate sds ring [%d]\n", + netdev->name, ring); + err = -ENOMEM; + goto err_out_free; + } + sds_ring->desc_head = addr; + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + sds_ring->crb_sts_consumer = + netxen_get_ioaddr(adapter, + recv_crb_registers[port].crb_sts_consumer[ring]); + + sds_ring->crb_intr_mask = + netxen_get_ioaddr(adapter, + recv_crb_registers[port].sw_int_mask[ring]); + } + } + + + if (!NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + if (test_and_set_bit(__NX_FW_ATTACHED, &adapter->state)) + goto done; + err = nx_fw_cmd_create_rx_ctx(adapter); + if (err) + goto err_out_free; + err = nx_fw_cmd_create_tx_ctx(adapter); + if (err) + goto err_out_free; + } else { + err = netxen_init_old_ctx(adapter); + if (err) + goto err_out_free; + } + +done: + return 0; + +err_out_free: + netxen_free_hw_resources(adapter); + return err; +} + +void netxen_free_hw_resources(struct netxen_adapter *adapter) +{ + struct netxen_recv_context *recv_ctx; + struct nx_host_rds_ring *rds_ring; + struct nx_host_sds_ring *sds_ring; + struct nx_host_tx_ring *tx_ring; + int ring; + + int port = adapter->portnum; + + if (!NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + if (!test_and_clear_bit(__NX_FW_ATTACHED, &adapter->state)) + goto done; + + nx_fw_cmd_destroy_rx_ctx(adapter); + nx_fw_cmd_destroy_tx_ctx(adapter); + } else { + netxen_api_lock(adapter); + NXWR32(adapter, CRB_CTX_SIGNATURE_REG(port), + NETXEN_CTX_D3_RESET | port); + netxen_api_unlock(adapter); + } + + /* Allow dma queues to drain after context reset */ + msleep(20); + +done: + recv_ctx = &adapter->recv_ctx; + + if (recv_ctx->hwctx != NULL) { + pci_free_consistent(adapter->pdev, + sizeof(struct netxen_ring_ctx) + + sizeof(uint32_t), + recv_ctx->hwctx, + recv_ctx->phys_addr); + recv_ctx->hwctx = NULL; + } + + tx_ring = adapter->tx_ring; + if (tx_ring->desc_head != NULL) { + pci_free_consistent(adapter->pdev, + TX_DESC_RINGSIZE(tx_ring), + tx_ring->desc_head, tx_ring->phys_addr); + tx_ring->desc_head = NULL; + } + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + + if (rds_ring->desc_head != NULL) { + pci_free_consistent(adapter->pdev, + RCV_DESC_RINGSIZE(rds_ring), + rds_ring->desc_head, + rds_ring->phys_addr); + rds_ring->desc_head = NULL; + } + } + + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + + if (sds_ring->desc_head != NULL) { + pci_free_consistent(adapter->pdev, + STATUS_DESC_RINGSIZE(sds_ring), + sds_ring->desc_head, + sds_ring->phys_addr); + sds_ring->desc_head = NULL; + } + } +} + diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c new file mode 100644 index 0000000..3157f97 --- /dev/null +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ethtool.c @@ -0,0 +1,964 @@ +/* + * Copyright (C) 2003 - 2009 NetXen, Inc. + * Copyright (C) 2009 - QLogic Corporation. + * All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * The full GNU General Public License is included in this distribution + * in the file called "COPYING". + * + */ + +#include +#include +#include +#include +#include +#include + +#include "netxen_nic.h" +#include "netxen_nic_hw.h" + +struct netxen_nic_stats { + char stat_string[ETH_GSTRING_LEN]; + int sizeof_stat; + int stat_offset; +}; + +#define NETXEN_NIC_STAT(m) sizeof(((struct netxen_adapter *)0)->m), \ + offsetof(struct netxen_adapter, m) + +#define NETXEN_NIC_PORT_WINDOW 0x10000 +#define NETXEN_NIC_INVALID_DATA 0xDEADBEEF + +static const struct netxen_nic_stats netxen_nic_gstrings_stats[] = { + {"xmit_called", NETXEN_NIC_STAT(stats.xmitcalled)}, + {"xmit_finished", NETXEN_NIC_STAT(stats.xmitfinished)}, + {"rx_dropped", NETXEN_NIC_STAT(stats.rxdropped)}, + {"tx_dropped", NETXEN_NIC_STAT(stats.txdropped)}, + {"csummed", NETXEN_NIC_STAT(stats.csummed)}, + {"rx_pkts", NETXEN_NIC_STAT(stats.rx_pkts)}, + {"lro_pkts", NETXEN_NIC_STAT(stats.lro_pkts)}, + {"rx_bytes", NETXEN_NIC_STAT(stats.rxbytes)}, + {"tx_bytes", NETXEN_NIC_STAT(stats.txbytes)}, +}; + +#define NETXEN_NIC_STATS_LEN ARRAY_SIZE(netxen_nic_gstrings_stats) + +static const char netxen_nic_gstrings_test[][ETH_GSTRING_LEN] = { + "Register_Test_on_offline", + "Link_Test_on_offline" +}; + +#define NETXEN_NIC_TEST_LEN ARRAY_SIZE(netxen_nic_gstrings_test) + +#define NETXEN_NIC_REGS_COUNT 30 +#define NETXEN_NIC_REGS_LEN (NETXEN_NIC_REGS_COUNT * sizeof(__le32)) +#define NETXEN_MAX_EEPROM_LEN 1024 + +static int netxen_nic_get_eeprom_len(struct net_device *dev) +{ + return NETXEN_FLASH_TOTAL_SIZE; +} + +static void +netxen_nic_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + u32 fw_major = 0; + u32 fw_minor = 0; + u32 fw_build = 0; + + strlcpy(drvinfo->driver, netxen_nic_driver_name, + sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, NETXEN_NIC_LINUX_VERSIONID, + sizeof(drvinfo->version)); + fw_major = NXRD32(adapter, NETXEN_FW_VERSION_MAJOR); + fw_minor = NXRD32(adapter, NETXEN_FW_VERSION_MINOR); + fw_build = NXRD32(adapter, NETXEN_FW_VERSION_SUB); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", fw_major, fw_minor, fw_build); + + strlcpy(drvinfo->bus_info, pci_name(adapter->pdev), + sizeof(drvinfo->bus_info)); +} + +static int +netxen_nic_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + int check_sfp_module = 0; + u32 supported, advertising; + + /* read which mode */ + if (adapter->ahw.port_type == NETXEN_NIC_GBE) { + supported = (SUPPORTED_10baseT_Half | + SUPPORTED_10baseT_Full | + SUPPORTED_100baseT_Half | + SUPPORTED_100baseT_Full | + SUPPORTED_1000baseT_Half | + SUPPORTED_1000baseT_Full); + + advertising = (ADVERTISED_100baseT_Half | + ADVERTISED_100baseT_Full | + ADVERTISED_1000baseT_Half | + 
ADVERTISED_1000baseT_Full); + + cmd->base.port = PORT_TP; + + cmd->base.speed = adapter->link_speed; + cmd->base.duplex = adapter->link_duplex; + cmd->base.autoneg = adapter->link_autoneg; + + } else if (adapter->ahw.port_type == NETXEN_NIC_XGBE) { + u32 val; + + val = NXRD32(adapter, NETXEN_PORT_MODE_ADDR); + if (val == NETXEN_PORT_MODE_802_3_AP) { + supported = SUPPORTED_1000baseT_Full; + advertising = ADVERTISED_1000baseT_Full; + } else { + supported = SUPPORTED_10000baseT_Full; + advertising = ADVERTISED_10000baseT_Full; + } + + if (netif_running(dev) && adapter->has_link_events) { + cmd->base.speed = adapter->link_speed; + cmd->base.autoneg = adapter->link_autoneg; + cmd->base.duplex = adapter->link_duplex; + goto skip; + } + + cmd->base.port = PORT_TP; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + u16 pcifn = adapter->ahw.pci_func; + + val = NXRD32(adapter, P3_LINK_SPEED_REG(pcifn)); + cmd->base.speed = P3_LINK_SPEED_MHZ * + P3_LINK_SPEED_VAL(pcifn, val); + } else + cmd->base.speed = SPEED_10000; + + cmd->base.duplex = DUPLEX_FULL; + cmd->base.autoneg = AUTONEG_DISABLE; + } else + return -EIO; + +skip: + cmd->base.phy_address = adapter->physical_port; + + switch (adapter->ahw.board_type) { + case NETXEN_BRDTYPE_P2_SB35_4G: + case NETXEN_BRDTYPE_P2_SB31_2G: + case NETXEN_BRDTYPE_P3_REF_QG: + case NETXEN_BRDTYPE_P3_4_GB: + case NETXEN_BRDTYPE_P3_4_GB_MM: + + supported |= SUPPORTED_Autoneg; + advertising |= ADVERTISED_Autoneg; + case NETXEN_BRDTYPE_P2_SB31_10G_CX4: + case NETXEN_BRDTYPE_P3_10G_CX4: + case NETXEN_BRDTYPE_P3_10G_CX4_LP: + case NETXEN_BRDTYPE_P3_10000_BASE_T: + supported |= SUPPORTED_TP; + advertising |= ADVERTISED_TP; + cmd->base.port = PORT_TP; + cmd->base.autoneg = (adapter->ahw.board_type == + NETXEN_BRDTYPE_P2_SB31_10G_CX4) ? 
+ (AUTONEG_DISABLE) : (adapter->link_autoneg); + break; + case NETXEN_BRDTYPE_P2_SB31_10G_HMEZ: + case NETXEN_BRDTYPE_P2_SB31_10G_IMEZ: + case NETXEN_BRDTYPE_P3_IMEZ: + case NETXEN_BRDTYPE_P3_XG_LOM: + case NETXEN_BRDTYPE_P3_HMEZ: + supported |= SUPPORTED_MII; + advertising |= ADVERTISED_MII; + cmd->base.port = PORT_MII; + cmd->base.autoneg = AUTONEG_DISABLE; + break; + case NETXEN_BRDTYPE_P3_10G_SFP_PLUS: + case NETXEN_BRDTYPE_P3_10G_SFP_CT: + case NETXEN_BRDTYPE_P3_10G_SFP_QT: + advertising |= ADVERTISED_TP; + supported |= SUPPORTED_TP; + check_sfp_module = netif_running(dev) && + adapter->has_link_events; + case NETXEN_BRDTYPE_P2_SB31_10G: + case NETXEN_BRDTYPE_P3_10G_XFP: + supported |= SUPPORTED_FIBRE; + advertising |= ADVERTISED_FIBRE; + cmd->base.port = PORT_FIBRE; + cmd->base.autoneg = AUTONEG_DISABLE; + break; + case NETXEN_BRDTYPE_P3_10G_TP: + if (adapter->ahw.port_type == NETXEN_NIC_XGBE) { + cmd->base.autoneg = AUTONEG_DISABLE; + supported |= (SUPPORTED_FIBRE | SUPPORTED_TP); + advertising |= + (ADVERTISED_FIBRE | ADVERTISED_TP); + cmd->base.port = PORT_FIBRE; + check_sfp_module = netif_running(dev) && + adapter->has_link_events; + } else { + supported |= (SUPPORTED_TP | SUPPORTED_Autoneg); + advertising |= + (ADVERTISED_TP | ADVERTISED_Autoneg); + cmd->base.port = PORT_TP; + } + break; + default: + printk(KERN_ERR "netxen-nic: Unsupported board model %d\n", + adapter->ahw.board_type); + return -EIO; + } + + if (check_sfp_module) { + switch (adapter->module_type) { + case LINKEVENT_MODULE_OPTICAL_UNKNOWN: + case LINKEVENT_MODULE_OPTICAL_SRLR: + case LINKEVENT_MODULE_OPTICAL_LRM: + case LINKEVENT_MODULE_OPTICAL_SFP_1G: + cmd->base.port = PORT_FIBRE; + break; + case LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLE: + case LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLELEN: + case LINKEVENT_MODULE_TWINAX: + cmd->base.port = PORT_TP; + break; + default: + cmd->base.port = -1; + } + } + + if (!netif_running(dev) || !adapter->ahw.linkup) { + cmd->base.duplex = DUPLEX_UNKNOWN; + cmd->base.speed = SPEED_UNKNOWN; + } + + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, + supported); + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, + advertising); + + return 0; +} + +static int +netxen_nic_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + u32 speed = cmd->base.speed; + int ret; + + if (adapter->ahw.port_type != NETXEN_NIC_GBE) + return -EOPNOTSUPP; + + if (!(adapter->capabilities & NX_FW_CAPABILITY_GBE_LINK_CFG)) + return -EOPNOTSUPP; + + ret = nx_fw_cmd_set_gbe_port(adapter, speed, cmd->base.duplex, + cmd->base.autoneg); + if (ret == NX_RCODE_NOT_SUPPORTED) + return -EOPNOTSUPP; + else if (ret) + return -EIO; + + adapter->link_speed = speed; + adapter->link_duplex = cmd->base.duplex; + adapter->link_autoneg = cmd->base.autoneg; + + if (!netif_running(dev)) + return 0; + + dev->netdev_ops->ndo_stop(dev); + return dev->netdev_ops->ndo_open(dev); +} + +static int netxen_nic_get_regs_len(struct net_device *dev) +{ + return NETXEN_NIC_REGS_LEN; +} + +static void +netxen_nic_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *p) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + struct nx_host_sds_ring *sds_ring; + u32 *regs_buff = p; + int ring, i = 0; + int port = adapter->physical_port; + + memset(p, 0, NETXEN_NIC_REGS_LEN); + + regs->version = (1 << 24) | (adapter->ahw.revision_id << 16) 
| + (adapter->pdev)->device; + + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + return; + + regs_buff[i++] = NXRD32(adapter, CRB_CMDPEG_STATE); + regs_buff[i++] = NXRD32(adapter, CRB_RCVPEG_STATE); + regs_buff[i++] = NXRD32(adapter, CRB_FW_CAPABILITIES_1); + regs_buff[i++] = NXRDIO(adapter, adapter->crb_int_state_reg); + regs_buff[i++] = NXRD32(adapter, NX_CRB_DEV_REF_COUNT); + regs_buff[i++] = NXRD32(adapter, NX_CRB_DEV_STATE); + regs_buff[i++] = NXRD32(adapter, NETXEN_PEG_ALIVE_COUNTER); + regs_buff[i++] = NXRD32(adapter, NETXEN_PEG_HALT_STATUS1); + regs_buff[i++] = NXRD32(adapter, NETXEN_PEG_HALT_STATUS2); + + regs_buff[i++] = NXRD32(adapter, NETXEN_CRB_PEG_NET_0+0x3c); + regs_buff[i++] = NXRD32(adapter, NETXEN_CRB_PEG_NET_1+0x3c); + regs_buff[i++] = NXRD32(adapter, NETXEN_CRB_PEG_NET_2+0x3c); + regs_buff[i++] = NXRD32(adapter, NETXEN_CRB_PEG_NET_3+0x3c); + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + + regs_buff[i++] = NXRD32(adapter, NETXEN_CRB_PEG_NET_4+0x3c); + i += 2; + + regs_buff[i++] = NXRD32(adapter, CRB_XG_STATE_P3); + regs_buff[i++] = le32_to_cpu(*(adapter->tx_ring->hw_consumer)); + + } else { + i++; + + regs_buff[i++] = NXRD32(adapter, + NETXEN_NIU_XGE_CONFIG_0+(0x10000*port)); + regs_buff[i++] = NXRD32(adapter, + NETXEN_NIU_XGE_CONFIG_1+(0x10000*port)); + + regs_buff[i++] = NXRD32(adapter, CRB_XG_STATE); + regs_buff[i++] = NXRDIO(adapter, + adapter->tx_ring->crb_cmd_consumer); + } + + regs_buff[i++] = NXRDIO(adapter, adapter->tx_ring->crb_cmd_producer); + + regs_buff[i++] = NXRDIO(adapter, + recv_ctx->rds_rings[0].crb_rcv_producer); + regs_buff[i++] = NXRDIO(adapter, + recv_ctx->rds_rings[1].crb_rcv_producer); + + regs_buff[i++] = adapter->max_sds_rings; + + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &(recv_ctx->sds_rings[ring]); + regs_buff[i++] = NXRDIO(adapter, + sds_ring->crb_sts_consumer); + } +} + +static u32 netxen_nic_test_link(struct net_device *dev) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + u32 val, port; + + port = adapter->physical_port; + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + val = NXRD32(adapter, CRB_XG_STATE_P3); + val = XG_LINK_STATE_P3(adapter->ahw.pci_func, val); + return (val == XG_LINK_UP_P3) ? 0 : 1; + } else { + val = NXRD32(adapter, CRB_XG_STATE); + val = (val >> port*8) & 0xff; + return (val == XG_LINK_UP) ? 
0 : 1; + } +} + +static int +netxen_nic_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, + u8 *bytes) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + int offset; + int ret; + + if (eeprom->len == 0) + return -EINVAL; + + eeprom->magic = (adapter->pdev)->vendor | + ((adapter->pdev)->device << 16); + offset = eeprom->offset; + + ret = netxen_rom_fast_read_words(adapter, offset, bytes, + eeprom->len); + if (ret < 0) + return ret; + + return 0; +} + +static void +netxen_nic_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ring) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + + ring->rx_pending = adapter->num_rxd; + ring->rx_jumbo_pending = adapter->num_jumbo_rxd; + ring->rx_jumbo_pending += adapter->num_lro_rxd; + ring->tx_pending = adapter->num_txd; + + if (adapter->ahw.port_type == NETXEN_NIC_GBE) { + ring->rx_max_pending = MAX_RCV_DESCRIPTORS_1G; + ring->rx_jumbo_max_pending = MAX_JUMBO_RCV_DESCRIPTORS_1G; + } else { + ring->rx_max_pending = MAX_RCV_DESCRIPTORS_10G; + ring->rx_jumbo_max_pending = MAX_JUMBO_RCV_DESCRIPTORS_10G; + } + + ring->tx_max_pending = MAX_CMD_DESCRIPTORS; +} + +static u32 +netxen_validate_ringparam(u32 val, u32 min, u32 max, char *r_name) +{ + u32 num_desc; + num_desc = max(val, min); + num_desc = min(num_desc, max); + num_desc = roundup_pow_of_two(num_desc); + + if (val != num_desc) { + printk(KERN_INFO "%s: setting %s ring size %d instead of %d\n", + netxen_nic_driver_name, r_name, num_desc, val); + } + + return num_desc; +} + +static int +netxen_nic_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ring) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + u16 max_rcv_desc = MAX_RCV_DESCRIPTORS_10G; + u16 max_jumbo_desc = MAX_JUMBO_RCV_DESCRIPTORS_10G; + u16 num_rxd, num_jumbo_rxd, num_txd; + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return -EOPNOTSUPP; + + if (ring->rx_mini_pending) + return -EOPNOTSUPP; + + if (adapter->ahw.port_type == NETXEN_NIC_GBE) { + max_rcv_desc = MAX_RCV_DESCRIPTORS_1G; + max_jumbo_desc = MAX_JUMBO_RCV_DESCRIPTORS_10G; + } + + num_rxd = netxen_validate_ringparam(ring->rx_pending, + MIN_RCV_DESCRIPTORS, max_rcv_desc, "rx"); + + num_jumbo_rxd = netxen_validate_ringparam(ring->rx_jumbo_pending, + MIN_JUMBO_DESCRIPTORS, max_jumbo_desc, "rx jumbo"); + + num_txd = netxen_validate_ringparam(ring->tx_pending, + MIN_CMD_DESCRIPTORS, MAX_CMD_DESCRIPTORS, "tx"); + + if (num_rxd == adapter->num_rxd && num_txd == adapter->num_txd && + num_jumbo_rxd == adapter->num_jumbo_rxd) + return 0; + + adapter->num_rxd = num_rxd; + adapter->num_jumbo_rxd = num_jumbo_rxd; + adapter->num_txd = num_txd; + + return netxen_nic_reset_context(adapter); +} + +static void +netxen_nic_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + __u32 val; + int port = adapter->physical_port; + + pause->autoneg = 0; + + if (adapter->ahw.port_type == NETXEN_NIC_GBE) { + if ((port < 0) || (port >= NETXEN_NIU_MAX_GBE_PORTS)) + return; + /* get flow control settings */ + val = NXRD32(adapter, NETXEN_NIU_GB_MAC_CONFIG_0(port)); + pause->rx_pause = netxen_gb_get_rx_flowctl(val); + val = NXRD32(adapter, NETXEN_NIU_GB_PAUSE_CTL); + switch (port) { + case 0: + pause->tx_pause = !(netxen_gb_get_gb0_mask(val)); + break; + case 1: + pause->tx_pause = !(netxen_gb_get_gb1_mask(val)); + break; + case 2: + pause->tx_pause = !(netxen_gb_get_gb2_mask(val)); + break; + case 3: + default: + pause->tx_pause = 
!(netxen_gb_get_gb3_mask(val)); + break; + } + } else if (adapter->ahw.port_type == NETXEN_NIC_XGBE) { + if ((port < 0) || (port >= NETXEN_NIU_MAX_XG_PORTS)) + return; + pause->rx_pause = 1; + val = NXRD32(adapter, NETXEN_NIU_XG_PAUSE_CTL); + if (port == 0) + pause->tx_pause = !(netxen_xg_get_xg0_mask(val)); + else + pause->tx_pause = !(netxen_xg_get_xg1_mask(val)); + } else { + printk(KERN_ERR"%s: Unknown board type: %x\n", + netxen_nic_driver_name, adapter->ahw.port_type); + } +} + +static int +netxen_nic_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + __u32 val; + int port = adapter->physical_port; + + /* not supported */ + if (pause->autoneg) + return -EINVAL; + + /* read mode */ + if (adapter->ahw.port_type == NETXEN_NIC_GBE) { + if ((port < 0) || (port >= NETXEN_NIU_MAX_GBE_PORTS)) + return -EIO; + /* set flow control */ + val = NXRD32(adapter, NETXEN_NIU_GB_MAC_CONFIG_0(port)); + + if (pause->rx_pause) + netxen_gb_rx_flowctl(val); + else + netxen_gb_unset_rx_flowctl(val); + + NXWR32(adapter, NETXEN_NIU_GB_MAC_CONFIG_0(port), + val); + /* set autoneg */ + val = NXRD32(adapter, NETXEN_NIU_GB_PAUSE_CTL); + switch (port) { + case 0: + if (pause->tx_pause) + netxen_gb_unset_gb0_mask(val); + else + netxen_gb_set_gb0_mask(val); + break; + case 1: + if (pause->tx_pause) + netxen_gb_unset_gb1_mask(val); + else + netxen_gb_set_gb1_mask(val); + break; + case 2: + if (pause->tx_pause) + netxen_gb_unset_gb2_mask(val); + else + netxen_gb_set_gb2_mask(val); + break; + case 3: + default: + if (pause->tx_pause) + netxen_gb_unset_gb3_mask(val); + else + netxen_gb_set_gb3_mask(val); + break; + } + NXWR32(adapter, NETXEN_NIU_GB_PAUSE_CTL, val); + } else if (adapter->ahw.port_type == NETXEN_NIC_XGBE) { + if ((port < 0) || (port >= NETXEN_NIU_MAX_XG_PORTS)) + return -EIO; + val = NXRD32(adapter, NETXEN_NIU_XG_PAUSE_CTL); + if (port == 0) { + if (pause->tx_pause) + netxen_xg_unset_xg0_mask(val); + else + netxen_xg_set_xg0_mask(val); + } else { + if (pause->tx_pause) + netxen_xg_unset_xg1_mask(val); + else + netxen_xg_set_xg1_mask(val); + } + NXWR32(adapter, NETXEN_NIU_XG_PAUSE_CTL, val); + } else { + printk(KERN_ERR "%s: Unknown board type: %x\n", + netxen_nic_driver_name, + adapter->ahw.port_type); + } + return 0; +} + +static int netxen_nic_reg_test(struct net_device *dev) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + u32 data_read, data_written; + + data_read = NXRD32(adapter, NETXEN_PCIX_PH_REG(0)); + if ((data_read & 0xffff) != adapter->pdev->vendor) + return 1; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + return 0; + + data_written = (u32)0xa5a5a5a5; + + NXWR32(adapter, CRB_SCRATCHPAD_TEST, data_written); + data_read = NXRD32(adapter, CRB_SCRATCHPAD_TEST); + if (data_written != data_read) + return 1; + + return 0; +} + +static int netxen_get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_TEST: + return NETXEN_NIC_TEST_LEN; + case ETH_SS_STATS: + return NETXEN_NIC_STATS_LEN; + default: + return -EOPNOTSUPP; + } +} + +static void +netxen_nic_diag_test(struct net_device *dev, struct ethtool_test *eth_test, + u64 *data) +{ + memset(data, 0, sizeof(uint64_t) * NETXEN_NIC_TEST_LEN); + if ((data[0] = netxen_nic_reg_test(dev))) + eth_test->flags |= ETH_TEST_FL_FAILED; + /* link test */ + if ((data[1] = (u64) netxen_nic_test_link(dev))) + eth_test->flags |= ETH_TEST_FL_FAILED; +} + +static void +netxen_nic_get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + 
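	/*
	 * The string order here must match the order in which
	 * netxen_nic_diag_test() and netxen_nic_get_ethtool_stats() fill
	 * their result arrays (netxen_nic_gstrings_test[] and
	 * netxen_nic_gstrings_stats[] respectively).
	 */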
int index; + + switch (stringset) { + case ETH_SS_TEST: + memcpy(data, *netxen_nic_gstrings_test, + NETXEN_NIC_TEST_LEN * ETH_GSTRING_LEN); + break; + case ETH_SS_STATS: + for (index = 0; index < NETXEN_NIC_STATS_LEN; index++) { + memcpy(data + index * ETH_GSTRING_LEN, + netxen_nic_gstrings_stats[index].stat_string, + ETH_GSTRING_LEN); + } + break; + } +} + +static void +netxen_nic_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + int index; + + for (index = 0; index < NETXEN_NIC_STATS_LEN; index++) { + char *p = + (char *)adapter + + netxen_nic_gstrings_stats[index].stat_offset; + data[index] = + (netxen_nic_gstrings_stats[index].sizeof_stat == + sizeof(u64)) ? *(u64 *) p : *(u32 *) p; + } +} + +static void +netxen_nic_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + u32 wol_cfg = 0; + + wol->supported = 0; + wol->wolopts = 0; + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return; + + wol_cfg = NXRD32(adapter, NETXEN_WOL_CONFIG_NV); + if (wol_cfg & (1UL << adapter->portnum)) + wol->supported |= WAKE_MAGIC; + + wol_cfg = NXRD32(adapter, NETXEN_WOL_CONFIG); + if (wol_cfg & (1UL << adapter->portnum)) + wol->wolopts |= WAKE_MAGIC; +} + +static int +netxen_nic_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + u32 wol_cfg = 0; + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return -EOPNOTSUPP; + + if (wol->wolopts & ~WAKE_MAGIC) + return -EOPNOTSUPP; + + wol_cfg = NXRD32(adapter, NETXEN_WOL_CONFIG_NV); + if (!(wol_cfg & (1 << adapter->portnum))) + return -EOPNOTSUPP; + + wol_cfg = NXRD32(adapter, NETXEN_WOL_CONFIG); + if (wol->wolopts & WAKE_MAGIC) + wol_cfg |= 1UL << adapter->portnum; + else + wol_cfg &= ~(1UL << adapter->portnum); + NXWR32(adapter, NETXEN_WOL_CONFIG, wol_cfg); + + return 0; +} + +/* + * Set the coalescing parameters. Currently only normal is supported. + * If rx_coalesce_usecs == 0 or rx_max_coalesced_frames == 0 then set the + * firmware coalescing to default. + */ +static int netxen_set_intr_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ethcoal) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + + if (!NX_IS_REVISION_P3(adapter->ahw.revision_id)) + return -EINVAL; + + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + return -EINVAL; + + /* + * Return Error if unsupported values or + * unsupported parameters are set. 
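	 * All values are limited to 16 bits, and the adaptive, IRQ-based and
	 * rate-sampled low/high parameters are rejected because the firmware
	 * interface only takes plain rx/tx time and frame-count values.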
+ */ + if (ethcoal->rx_coalesce_usecs > 0xffff || + ethcoal->rx_max_coalesced_frames > 0xffff || + ethcoal->tx_coalesce_usecs > 0xffff || + ethcoal->tx_max_coalesced_frames > 0xffff || + ethcoal->rx_coalesce_usecs_irq || + ethcoal->rx_max_coalesced_frames_irq || + ethcoal->tx_coalesce_usecs_irq || + ethcoal->tx_max_coalesced_frames_irq || + ethcoal->stats_block_coalesce_usecs || + ethcoal->use_adaptive_rx_coalesce || + ethcoal->use_adaptive_tx_coalesce || + ethcoal->pkt_rate_low || + ethcoal->rx_coalesce_usecs_low || + ethcoal->rx_max_coalesced_frames_low || + ethcoal->tx_coalesce_usecs_low || + ethcoal->tx_max_coalesced_frames_low || + ethcoal->pkt_rate_high || + ethcoal->rx_coalesce_usecs_high || + ethcoal->rx_max_coalesced_frames_high || + ethcoal->tx_coalesce_usecs_high || + ethcoal->tx_max_coalesced_frames_high) + return -EINVAL; + + if (!ethcoal->rx_coalesce_usecs || + !ethcoal->rx_max_coalesced_frames) { + adapter->coal.flags = NETXEN_NIC_INTR_DEFAULT; + adapter->coal.normal.data.rx_time_us = + NETXEN_DEFAULT_INTR_COALESCE_RX_TIME_US; + adapter->coal.normal.data.rx_packets = + NETXEN_DEFAULT_INTR_COALESCE_RX_PACKETS; + } else { + adapter->coal.flags = 0; + adapter->coal.normal.data.rx_time_us = + ethcoal->rx_coalesce_usecs; + adapter->coal.normal.data.rx_packets = + ethcoal->rx_max_coalesced_frames; + } + adapter->coal.normal.data.tx_time_us = ethcoal->tx_coalesce_usecs; + adapter->coal.normal.data.tx_packets = + ethcoal->tx_max_coalesced_frames; + + netxen_config_intr_coalesce(adapter); + + return 0; +} + +static int netxen_get_intr_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ethcoal) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + + if (!NX_IS_REVISION_P3(adapter->ahw.revision_id)) + return -EINVAL; + + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + return -EINVAL; + + ethcoal->rx_coalesce_usecs = adapter->coal.normal.data.rx_time_us; + ethcoal->tx_coalesce_usecs = adapter->coal.normal.data.tx_time_us; + ethcoal->rx_max_coalesced_frames = + adapter->coal.normal.data.rx_packets; + ethcoal->tx_max_coalesced_frames = + adapter->coal.normal.data.tx_packets; + + return 0; +} + +static int +netxen_get_dump_flag(struct net_device *netdev, struct ethtool_dump *dump) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + struct netxen_minidump *mdump = &adapter->mdump; + if (adapter->fw_mdump_rdy) + dump->len = mdump->md_dump_size; + else + dump->len = 0; + + if (!mdump->md_enabled) + dump->flag = ETH_FW_DUMP_DISABLE; + else + dump->flag = mdump->md_capture_mask; + + dump->version = adapter->fw_version; + return 0; +} + +static int +netxen_set_dump(struct net_device *netdev, struct ethtool_dump *val) +{ + int i; + struct netxen_adapter *adapter = netdev_priv(netdev); + struct netxen_minidump *mdump = &adapter->mdump; + + switch (val->flag) { + case NX_FORCE_FW_DUMP_KEY: + if (!mdump->md_enabled) { + netdev_info(netdev, "FW dump not enabled\n"); + return 0; + } + if (adapter->fw_mdump_rdy) { + netdev_info(netdev, "Previous dump not cleared, not forcing dump\n"); + return 0; + } + netdev_info(netdev, "Forcing a fw dump\n"); + nx_dev_request_reset(adapter); + break; + case NX_DISABLE_FW_DUMP: + if (mdump->md_enabled) { + netdev_info(netdev, "Disabling FW Dump\n"); + mdump->md_enabled = 0; + } + break; + case NX_ENABLE_FW_DUMP: + if (!mdump->md_enabled) { + netdev_info(netdev, "Enabling FW dump\n"); + mdump->md_enabled = 1; + } + break; + case NX_FORCE_FW_RESET: + netdev_info(netdev, "Forcing FW reset\n"); + nx_dev_request_reset(adapter); + 
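The coalescing handlers above accept only the "normal" parameters: every other ethtool_coalesce field must be zero, accepted values are capped at 16 bits, and a zero rx_coalesce_usecs or rx_max_coalesced_frames selects the firmware defaults. A small stand-alone sketch of that default-versus-explicit choice follows; the default numbers are placeholders, since the NETXEN_DEFAULT_INTR_COALESCE_RX_* constants are defined elsewhere in the driver.

#include <stdio.h>

struct demo_coal {
	unsigned int rx_usecs;
	unsigned int rx_frames;
};

/* Mirrors the decision in netxen_set_intr_coalesce(): a zero in either RX
 * field falls back to the firmware defaults, otherwise the pair is taken as
 * given (the real handler has already rejected values above 0xffff and any
 * unsupported ethtool_coalesce field with -EINVAL).
 */
static struct demo_coal demo_pick_rx_coalesce(unsigned int usecs,
					      unsigned int frames)
{
	/* Placeholder defaults; the driver's real constants live in its headers. */
	const struct demo_coal fw_default = { 3, 256 };

	if (!usecs || !frames)
		return fw_default;
	return (struct demo_coal){ usecs, frames };
}

int main(void)
{
	struct demo_coal a = demo_pick_rx_coalesce(0, 0);	/* firmware defaults */
	struct demo_coal b = demo_pick_rx_coalesce(8, 128);	/* programmed as given */

	printf("defaults: %u us / %u frames\n", a.rx_usecs, a.rx_frames);
	printf("explicit: %u us / %u frames\n", b.rx_usecs, b.rx_frames);
	return 0;
}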
adapter->flags &= ~NETXEN_FW_RESET_OWNER; + break; + default: + for (i = 0; i < ARRAY_SIZE(FW_DUMP_LEVELS); i++) { + if (val->flag == FW_DUMP_LEVELS[i]) { + mdump->md_capture_mask = val->flag; + netdev_info(netdev, + "Driver mask changed to: 0x%x\n", + mdump->md_capture_mask); + return 0; + } + } + netdev_info(netdev, + "Invalid dump level: 0x%x\n", val->flag); + return -EINVAL; + } + + return 0; +} + +static int +netxen_get_dump_data(struct net_device *netdev, struct ethtool_dump *dump, + void *buffer) +{ + int i, copy_sz; + u32 *hdr_ptr, *data; + struct netxen_adapter *adapter = netdev_priv(netdev); + struct netxen_minidump *mdump = &adapter->mdump; + + + if (!adapter->fw_mdump_rdy) { + netdev_info(netdev, "Dump not available\n"); + return -EINVAL; + } + /* Copy template header first */ + copy_sz = mdump->md_template_size; + hdr_ptr = (u32 *) mdump->md_template; + data = buffer; + for (i = 0; i < copy_sz/sizeof(u32); i++) + *data++ = cpu_to_le32(*hdr_ptr++); + + /* Copy captured dump data */ + memcpy(buffer + copy_sz, + mdump->md_capture_buff + mdump->md_template_size, + mdump->md_capture_size); + dump->len = copy_sz + mdump->md_capture_size; + dump->flag = mdump->md_capture_mask; + + /* Free dump area once data has been captured */ + vfree(mdump->md_capture_buff); + mdump->md_capture_buff = NULL; + adapter->fw_mdump_rdy = 0; + netdev_info(netdev, "extracted the fw dump Successfully\n"); + return 0; +} + +const struct ethtool_ops netxen_nic_ethtool_ops = { + .get_drvinfo = netxen_nic_get_drvinfo, + .get_regs_len = netxen_nic_get_regs_len, + .get_regs = netxen_nic_get_regs, + .get_link = ethtool_op_get_link, + .get_eeprom_len = netxen_nic_get_eeprom_len, + .get_eeprom = netxen_nic_get_eeprom, + .get_ringparam = netxen_nic_get_ringparam, + .set_ringparam = netxen_nic_set_ringparam, + .get_pauseparam = netxen_nic_get_pauseparam, + .set_pauseparam = netxen_nic_set_pauseparam, + .get_wol = netxen_nic_get_wol, + .set_wol = netxen_nic_set_wol, + .self_test = netxen_nic_diag_test, + .get_strings = netxen_nic_get_strings, + .get_ethtool_stats = netxen_nic_get_ethtool_stats, + .get_sset_count = netxen_get_sset_count, + .get_coalesce = netxen_get_intr_coalesce, + .set_coalesce = netxen_set_intr_coalesce, + .get_dump_flag = netxen_get_dump_flag, + .get_dump_data = netxen_get_dump_data, + .set_dump = netxen_set_dump, + .get_link_ksettings = netxen_nic_get_link_ksettings, + .set_link_ksettings = netxen_nic_set_link_ksettings, +}; diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hdr.h b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hdr.h new file mode 100644 index 0000000..a310c2f --- /dev/null +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hdr.h @@ -0,0 +1,1079 @@ +/* + * Copyright (C) 2003 - 2009 NetXen, Inc. + * Copyright (C) 2009 - QLogic Corporation. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ * + * The full GNU General Public License is included in this distribution + * in the file called "COPYING". + * + */ + +#ifndef __NETXEN_NIC_HDR_H_ +#define __NETXEN_NIC_HDR_H_ + +#include +#include + +/* + * The basic unit of access when reading/writing control registers. + */ + +typedef __le32 netxen_crbword_t; /* single word in CRB space */ + +enum { + NETXEN_HW_H0_CH_HUB_ADR = 0x05, + NETXEN_HW_H1_CH_HUB_ADR = 0x0E, + NETXEN_HW_H2_CH_HUB_ADR = 0x03, + NETXEN_HW_H3_CH_HUB_ADR = 0x01, + NETXEN_HW_H4_CH_HUB_ADR = 0x06, + NETXEN_HW_H5_CH_HUB_ADR = 0x07, + NETXEN_HW_H6_CH_HUB_ADR = 0x08 +}; + +/* Hub 0 */ +enum { + NETXEN_HW_MN_CRB_AGT_ADR = 0x15, + NETXEN_HW_MS_CRB_AGT_ADR = 0x25 +}; + +/* Hub 1 */ +enum { + NETXEN_HW_PS_CRB_AGT_ADR = 0x73, + NETXEN_HW_SS_CRB_AGT_ADR = 0x20, + NETXEN_HW_RPMX3_CRB_AGT_ADR = 0x0b, + NETXEN_HW_QMS_CRB_AGT_ADR = 0x00, + NETXEN_HW_SQGS0_CRB_AGT_ADR = 0x01, + NETXEN_HW_SQGS1_CRB_AGT_ADR = 0x02, + NETXEN_HW_SQGS2_CRB_AGT_ADR = 0x03, + NETXEN_HW_SQGS3_CRB_AGT_ADR = 0x04, + NETXEN_HW_C2C0_CRB_AGT_ADR = 0x58, + NETXEN_HW_C2C1_CRB_AGT_ADR = 0x59, + NETXEN_HW_C2C2_CRB_AGT_ADR = 0x5a, + NETXEN_HW_RPMX2_CRB_AGT_ADR = 0x0a, + NETXEN_HW_RPMX4_CRB_AGT_ADR = 0x0c, + NETXEN_HW_RPMX7_CRB_AGT_ADR = 0x0f, + NETXEN_HW_RPMX9_CRB_AGT_ADR = 0x12, + NETXEN_HW_SMB_CRB_AGT_ADR = 0x18 +}; + +/* Hub 2 */ +enum { + NETXEN_HW_NIU_CRB_AGT_ADR = 0x31, + NETXEN_HW_I2C0_CRB_AGT_ADR = 0x19, + NETXEN_HW_I2C1_CRB_AGT_ADR = 0x29, + + NETXEN_HW_SN_CRB_AGT_ADR = 0x10, + NETXEN_HW_I2Q_CRB_AGT_ADR = 0x20, + NETXEN_HW_LPC_CRB_AGT_ADR = 0x22, + NETXEN_HW_ROMUSB_CRB_AGT_ADR = 0x21, + NETXEN_HW_QM_CRB_AGT_ADR = 0x66, + NETXEN_HW_SQG0_CRB_AGT_ADR = 0x60, + NETXEN_HW_SQG1_CRB_AGT_ADR = 0x61, + NETXEN_HW_SQG2_CRB_AGT_ADR = 0x62, + NETXEN_HW_SQG3_CRB_AGT_ADR = 0x63, + NETXEN_HW_RPMX1_CRB_AGT_ADR = 0x09, + NETXEN_HW_RPMX5_CRB_AGT_ADR = 0x0d, + NETXEN_HW_RPMX6_CRB_AGT_ADR = 0x0e, + NETXEN_HW_RPMX8_CRB_AGT_ADR = 0x11 +}; + +/* Hub 3 */ +enum { + NETXEN_HW_PH_CRB_AGT_ADR = 0x1A, + NETXEN_HW_SRE_CRB_AGT_ADR = 0x50, + NETXEN_HW_EG_CRB_AGT_ADR = 0x51, + NETXEN_HW_RPMX0_CRB_AGT_ADR = 0x08 +}; + +/* Hub 4 */ +enum { + NETXEN_HW_PEGN0_CRB_AGT_ADR = 0x40, + NETXEN_HW_PEGN1_CRB_AGT_ADR, + NETXEN_HW_PEGN2_CRB_AGT_ADR, + NETXEN_HW_PEGN3_CRB_AGT_ADR, + NETXEN_HW_PEGNI_CRB_AGT_ADR, + NETXEN_HW_PEGND_CRB_AGT_ADR, + NETXEN_HW_PEGNC_CRB_AGT_ADR, + NETXEN_HW_PEGR0_CRB_AGT_ADR, + NETXEN_HW_PEGR1_CRB_AGT_ADR, + NETXEN_HW_PEGR2_CRB_AGT_ADR, + NETXEN_HW_PEGR3_CRB_AGT_ADR, + NETXEN_HW_PEGN4_CRB_AGT_ADR +}; + +/* Hub 5 */ +enum { + NETXEN_HW_PEGS0_CRB_AGT_ADR = 0x40, + NETXEN_HW_PEGS1_CRB_AGT_ADR, + NETXEN_HW_PEGS2_CRB_AGT_ADR, + NETXEN_HW_PEGS3_CRB_AGT_ADR, + NETXEN_HW_PEGSI_CRB_AGT_ADR, + NETXEN_HW_PEGSD_CRB_AGT_ADR, + NETXEN_HW_PEGSC_CRB_AGT_ADR +}; + +/* Hub 6 */ +enum { + NETXEN_HW_CAS0_CRB_AGT_ADR = 0x46, + NETXEN_HW_CAS1_CRB_AGT_ADR = 0x47, + NETXEN_HW_CAS2_CRB_AGT_ADR = 0x48, + NETXEN_HW_CAS3_CRB_AGT_ADR = 0x49, + NETXEN_HW_NCM_CRB_AGT_ADR = 0x16, + NETXEN_HW_TMR_CRB_AGT_ADR = 0x17, + NETXEN_HW_XDMA_CRB_AGT_ADR = 0x05, + NETXEN_HW_OCM0_CRB_AGT_ADR = 0x06, + NETXEN_HW_OCM1_CRB_AGT_ADR = 0x07 +}; + +/* Floaters - non existent modules */ +#define NETXEN_HW_EFC_RPMX0_CRB_AGT_ADR 0x67 + +/* This field defines PCI/X adr [25:20] of agents on the CRB */ +enum { + NETXEN_HW_PX_MAP_CRB_PH = 0, + NETXEN_HW_PX_MAP_CRB_PS, + NETXEN_HW_PX_MAP_CRB_MN, + NETXEN_HW_PX_MAP_CRB_MS, + NETXEN_HW_PX_MAP_CRB_PGR1, + NETXEN_HW_PX_MAP_CRB_SRE, + NETXEN_HW_PX_MAP_CRB_NIU, + NETXEN_HW_PX_MAP_CRB_QMN, + NETXEN_HW_PX_MAP_CRB_SQN0, + 
NETXEN_HW_PX_MAP_CRB_SQN1, + NETXEN_HW_PX_MAP_CRB_SQN2, + NETXEN_HW_PX_MAP_CRB_SQN3, + NETXEN_HW_PX_MAP_CRB_QMS, + NETXEN_HW_PX_MAP_CRB_SQS0, + NETXEN_HW_PX_MAP_CRB_SQS1, + NETXEN_HW_PX_MAP_CRB_SQS2, + NETXEN_HW_PX_MAP_CRB_SQS3, + NETXEN_HW_PX_MAP_CRB_PGN0, + NETXEN_HW_PX_MAP_CRB_PGN1, + NETXEN_HW_PX_MAP_CRB_PGN2, + NETXEN_HW_PX_MAP_CRB_PGN3, + NETXEN_HW_PX_MAP_CRB_PGND, + NETXEN_HW_PX_MAP_CRB_PGNI, + NETXEN_HW_PX_MAP_CRB_PGS0, + NETXEN_HW_PX_MAP_CRB_PGS1, + NETXEN_HW_PX_MAP_CRB_PGS2, + NETXEN_HW_PX_MAP_CRB_PGS3, + NETXEN_HW_PX_MAP_CRB_PGSD, + NETXEN_HW_PX_MAP_CRB_PGSI, + NETXEN_HW_PX_MAP_CRB_SN, + NETXEN_HW_PX_MAP_CRB_PGR2, + NETXEN_HW_PX_MAP_CRB_EG, + NETXEN_HW_PX_MAP_CRB_PH2, + NETXEN_HW_PX_MAP_CRB_PS2, + NETXEN_HW_PX_MAP_CRB_CAM, + NETXEN_HW_PX_MAP_CRB_CAS0, + NETXEN_HW_PX_MAP_CRB_CAS1, + NETXEN_HW_PX_MAP_CRB_CAS2, + NETXEN_HW_PX_MAP_CRB_C2C0, + NETXEN_HW_PX_MAP_CRB_C2C1, + NETXEN_HW_PX_MAP_CRB_TIMR, + NETXEN_HW_PX_MAP_CRB_PGR3, + NETXEN_HW_PX_MAP_CRB_RPMX1, + NETXEN_HW_PX_MAP_CRB_RPMX2, + NETXEN_HW_PX_MAP_CRB_RPMX3, + NETXEN_HW_PX_MAP_CRB_RPMX4, + NETXEN_HW_PX_MAP_CRB_RPMX5, + NETXEN_HW_PX_MAP_CRB_RPMX6, + NETXEN_HW_PX_MAP_CRB_RPMX7, + NETXEN_HW_PX_MAP_CRB_XDMA, + NETXEN_HW_PX_MAP_CRB_I2Q, + NETXEN_HW_PX_MAP_CRB_ROMUSB, + NETXEN_HW_PX_MAP_CRB_CAS3, + NETXEN_HW_PX_MAP_CRB_RPMX0, + NETXEN_HW_PX_MAP_CRB_RPMX8, + NETXEN_HW_PX_MAP_CRB_RPMX9, + NETXEN_HW_PX_MAP_CRB_OCM0, + NETXEN_HW_PX_MAP_CRB_OCM1, + NETXEN_HW_PX_MAP_CRB_SMB, + NETXEN_HW_PX_MAP_CRB_I2C0, + NETXEN_HW_PX_MAP_CRB_I2C1, + NETXEN_HW_PX_MAP_CRB_LPC, + NETXEN_HW_PX_MAP_CRB_PGNC, + NETXEN_HW_PX_MAP_CRB_PGR0 +}; + +/* This field defines CRB adr [31:20] of the agents */ + +#define NETXEN_HW_CRB_HUB_AGT_ADR_MN \ + ((NETXEN_HW_H0_CH_HUB_ADR << 7) | NETXEN_HW_MN_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PH \ + ((NETXEN_HW_H0_CH_HUB_ADR << 7) | NETXEN_HW_PH_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_MS \ + ((NETXEN_HW_H0_CH_HUB_ADR << 7) | NETXEN_HW_MS_CRB_AGT_ADR) + +#define NETXEN_HW_CRB_HUB_AGT_ADR_PS \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_PS_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SS \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_SS_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_RPMX3 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_RPMX3_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_QMS \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_QMS_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SQS0 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_SQGS0_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SQS1 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_SQGS1_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SQS2 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_SQGS2_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SQS3 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_SQGS3_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_C2C0 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_C2C0_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_C2C1 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_C2C1_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_RPMX2 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_RPMX2_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_RPMX4 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_RPMX4_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_RPMX7 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_RPMX7_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_RPMX9 \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | NETXEN_HW_RPMX9_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SMB \ + ((NETXEN_HW_H1_CH_HUB_ADR << 7) | 
NETXEN_HW_SMB_CRB_AGT_ADR) + +#define NETXEN_HW_CRB_HUB_AGT_ADR_NIU \ + ((NETXEN_HW_H2_CH_HUB_ADR << 7) | NETXEN_HW_NIU_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_I2C0 \ + ((NETXEN_HW_H2_CH_HUB_ADR << 7) | NETXEN_HW_I2C0_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_I2C1 \ + ((NETXEN_HW_H2_CH_HUB_ADR << 7) | NETXEN_HW_I2C1_CRB_AGT_ADR) + +#define NETXEN_HW_CRB_HUB_AGT_ADR_SRE \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_SRE_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_EG \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_EG_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_RPMX0 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_RPMX0_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_QMN \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_QM_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SQN0 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_SQG0_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SQN1 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_SQG1_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SQN2 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_SQG2_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SQN3 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_SQG3_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_RPMX1 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_RPMX1_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_RPMX5 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_RPMX5_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_RPMX6 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_RPMX6_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_RPMX8 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_RPMX8_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_CAS0 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_CAS0_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_CAS1 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_CAS1_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_CAS2 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_CAS2_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_CAS3 \ + ((NETXEN_HW_H3_CH_HUB_ADR << 7) | NETXEN_HW_CAS3_CRB_AGT_ADR) + +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGNI \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGNI_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGND \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGND_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGN0 \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGN0_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGN1 \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGN1_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGN2 \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGN2_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGN3 \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGN3_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGN4 \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGN4_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGNC \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGNC_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGR0 \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGR0_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGR1 \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGR1_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGR2 \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGR2_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGR3 \ + ((NETXEN_HW_H4_CH_HUB_ADR << 7) | NETXEN_HW_PEGR3_CRB_AGT_ADR) + +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGSI \ + ((NETXEN_HW_H5_CH_HUB_ADR << 7) | NETXEN_HW_PEGSI_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGSD \ + ((NETXEN_HW_H5_CH_HUB_ADR << 7) | NETXEN_HW_PEGSD_CRB_AGT_ADR) +#define 
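The NETXEN_HW_CRB_HUB_AGT_ADR_* constants above all follow the same composition rule: the hub number sits above a 7-bit agent number. A stand-alone sketch, checked against the NIU case (hub H2 = 0x03, agent 0x31):

#include <stdio.h>

/* (hub << 7) | agent, the pattern used by the NETXEN_HW_CRB_HUB_AGT_ADR_* macros. */
static unsigned int demo_hub_agt_adr(unsigned int hub, unsigned int agent)
{
	return (hub << 7) | agent;
}

int main(void)
{
	/* NETXEN_HW_H2_CH_HUB_ADR = 0x03, NETXEN_HW_NIU_CRB_AGT_ADR = 0x31:
	 * (0x03 << 7) | 0x31 = 0x1b1, i.e. NETXEN_HW_CRB_HUB_AGT_ADR_NIU.
	 */
	printf("NIU hub/agent address: 0x%x\n", demo_hub_agt_adr(0x03, 0x31));
	return 0;
}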
NETXEN_HW_CRB_HUB_AGT_ADR_PGS0 \ + ((NETXEN_HW_H5_CH_HUB_ADR << 7) | NETXEN_HW_PEGS0_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGS1 \ + ((NETXEN_HW_H5_CH_HUB_ADR << 7) | NETXEN_HW_PEGS1_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGS2 \ + ((NETXEN_HW_H5_CH_HUB_ADR << 7) | NETXEN_HW_PEGS2_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGS3 \ + ((NETXEN_HW_H5_CH_HUB_ADR << 7) | NETXEN_HW_PEGS3_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_PGSC \ + ((NETXEN_HW_H5_CH_HUB_ADR << 7) | NETXEN_HW_PEGSC_CRB_AGT_ADR) + +#define NETXEN_HW_CRB_HUB_AGT_ADR_CAM \ + ((NETXEN_HW_H6_CH_HUB_ADR << 7) | NETXEN_HW_NCM_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_TIMR \ + ((NETXEN_HW_H6_CH_HUB_ADR << 7) | NETXEN_HW_TMR_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_XDMA \ + ((NETXEN_HW_H6_CH_HUB_ADR << 7) | NETXEN_HW_XDMA_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_SN \ + ((NETXEN_HW_H6_CH_HUB_ADR << 7) | NETXEN_HW_SN_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_I2Q \ + ((NETXEN_HW_H6_CH_HUB_ADR << 7) | NETXEN_HW_I2Q_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_ROMUSB \ + ((NETXEN_HW_H6_CH_HUB_ADR << 7) | NETXEN_HW_ROMUSB_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_OCM0 \ + ((NETXEN_HW_H6_CH_HUB_ADR << 7) | NETXEN_HW_OCM0_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_OCM1 \ + ((NETXEN_HW_H6_CH_HUB_ADR << 7) | NETXEN_HW_OCM1_CRB_AGT_ADR) +#define NETXEN_HW_CRB_HUB_AGT_ADR_LPC \ + ((NETXEN_HW_H6_CH_HUB_ADR << 7) | NETXEN_HW_LPC_CRB_AGT_ADR) + +#define NETXEN_SRE_MISC (NETXEN_CRB_SRE + 0x0002c) +#define NETXEN_SRE_INT_STATUS (NETXEN_CRB_SRE + 0x00034) +#define NETXEN_SRE_PBI_ACTIVE_STATUS (NETXEN_CRB_SRE + 0x01014) +#define NETXEN_SRE_L1RE_CTL (NETXEN_CRB_SRE + 0x03000) +#define NETXEN_SRE_L2RE_CTL (NETXEN_CRB_SRE + 0x05000) +#define NETXEN_SRE_BUF_CTL (NETXEN_CRB_SRE + 0x01000) + +#define NETXEN_DMA_BASE(U) (NETXEN_CRB_PCIX_MD + 0x20000 + ((U)<<16)) +#define NETXEN_DMA_COMMAND(U) (NETXEN_DMA_BASE(U) + 0x00008) + +#define NETXEN_I2Q_CLR_PCI_HI (NETXEN_CRB_I2Q + 0x00034) + +#define PEG_NETWORK_BASE(N) (NETXEN_CRB_PEG_NET_0 + (((N)&3) << 20)) +#define CRB_REG_EX_PC 0x3c + +#define ROMUSB_GLB (NETXEN_CRB_ROMUSB + 0x00000) +#define ROMUSB_ROM (NETXEN_CRB_ROMUSB + 0x10000) + +#define NETXEN_ROMUSB_GLB_STATUS (ROMUSB_GLB + 0x0004) +#define NETXEN_ROMUSB_GLB_SW_RESET (ROMUSB_GLB + 0x0008) +#define NETXEN_ROMUSB_GLB_PAD_GPIO_I (ROMUSB_GLB + 0x000c) +#define NETXEN_ROMUSB_GLB_CAS_RST (ROMUSB_GLB + 0x0038) +#define NETXEN_ROMUSB_GLB_TEST_MUX_SEL (ROMUSB_GLB + 0x0044) +#define NETXEN_ROMUSB_GLB_PEGTUNE_DONE (ROMUSB_GLB + 0x005c) +#define NETXEN_ROMUSB_GLB_CHIP_CLK_CTRL (ROMUSB_GLB + 0x00A8) + +#define NETXEN_ROMUSB_GPIO(n) (ROMUSB_GLB + 0x60 + (4 * (n))) + +#define NETXEN_ROMUSB_ROM_INSTR_OPCODE (ROMUSB_ROM + 0x0004) +#define NETXEN_ROMUSB_ROM_ADDRESS (ROMUSB_ROM + 0x0008) +#define NETXEN_ROMUSB_ROM_WDATA (ROMUSB_ROM + 0x000c) +#define NETXEN_ROMUSB_ROM_ABYTE_CNT (ROMUSB_ROM + 0x0010) +#define NETXEN_ROMUSB_ROM_DUMMY_BYTE_CNT (ROMUSB_ROM + 0x0014) +#define NETXEN_ROMUSB_ROM_RDATA (ROMUSB_ROM + 0x0018) + +/* Lock IDs for ROM lock */ +#define ROM_LOCK_DRIVER 0x0d417340 + +/****************************************************************************** +* +* Definitions specific to M25P flash +* +******************************************************************************* +* Instructions +*/ +#define M25P_INSTR_WREN 0x06 +#define M25P_INSTR_WRDI 0x04 +#define M25P_INSTR_RDID 0x9f +#define M25P_INSTR_RDSR 0x05 +#define M25P_INSTR_WRSR 0x01 +#define M25P_INSTR_READ 0x03 +#define 
M25P_INSTR_FAST_READ 0x0b +#define M25P_INSTR_PP 0x02 +#define M25P_INSTR_SE 0xd8 +#define M25P_INSTR_BE 0xc7 +#define M25P_INSTR_DP 0xb9 +#define M25P_INSTR_RES 0xab + +/* all are 1MB windows */ + +#define NETXEN_PCI_CRB_WINDOWSIZE 0x00100000 +#define NETXEN_PCI_CRB_WINDOW(A) \ + (NETXEN_PCI_CRBSPACE + (A)*NETXEN_PCI_CRB_WINDOWSIZE) + +#define NETXEN_CRB_NIU NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_NIU) +#define NETXEN_CRB_SRE NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_SRE) +#define NETXEN_CRB_ROMUSB \ + NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_ROMUSB) +#define NETXEN_CRB_I2Q NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_I2Q) +#define NETXEN_CRB_I2C0 NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_I2C0) +#define NETXEN_CRB_SMB NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_SMB) +#define NETXEN_CRB_MAX NETXEN_PCI_CRB_WINDOW(64) + +#define NETXEN_CRB_PCIX_HOST NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_PH) +#define NETXEN_CRB_PCIX_HOST2 NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_PH2) +#define NETXEN_CRB_PEG_NET_0 NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_PGN0) +#define NETXEN_CRB_PEG_NET_1 NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_PGN1) +#define NETXEN_CRB_PEG_NET_2 NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_PGN2) +#define NETXEN_CRB_PEG_NET_3 NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_PGN3) +#define NETXEN_CRB_PEG_NET_4 NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_SQS2) +#define NETXEN_CRB_PEG_NET_D NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_PGND) +#define NETXEN_CRB_PEG_NET_I NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_PGNI) +#define NETXEN_CRB_DDR_NET NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_MN) +#define NETXEN_CRB_QDR_NET NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_SN) + +#define NETXEN_CRB_PCIX_MD NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_PS) +#define NETXEN_CRB_PCIE NETXEN_CRB_PCIX_MD + +#define ISR_INT_VECTOR (NETXEN_PCIX_PS_REG(PCIX_INT_VECTOR)) +#define ISR_INT_MASK (NETXEN_PCIX_PS_REG(PCIX_INT_MASK)) +#define ISR_INT_MASK_SLOW (NETXEN_PCIX_PS_REG(PCIX_INT_MASK)) +#define ISR_INT_TARGET_STATUS (NETXEN_PCIX_PS_REG(PCIX_TARGET_STATUS)) +#define ISR_INT_TARGET_MASK (NETXEN_PCIX_PS_REG(PCIX_TARGET_MASK)) +#define ISR_INT_TARGET_STATUS_F1 (NETXEN_PCIX_PS_REG(PCIX_TARGET_STATUS_F1)) +#define ISR_INT_TARGET_MASK_F1 (NETXEN_PCIX_PS_REG(PCIX_TARGET_MASK_F1)) +#define ISR_INT_TARGET_STATUS_F2 (NETXEN_PCIX_PS_REG(PCIX_TARGET_STATUS_F2)) +#define ISR_INT_TARGET_MASK_F2 (NETXEN_PCIX_PS_REG(PCIX_TARGET_MASK_F2)) +#define ISR_INT_TARGET_STATUS_F3 (NETXEN_PCIX_PS_REG(PCIX_TARGET_STATUS_F3)) +#define ISR_INT_TARGET_MASK_F3 (NETXEN_PCIX_PS_REG(PCIX_TARGET_MASK_F3)) +#define ISR_INT_TARGET_STATUS_F4 (NETXEN_PCIX_PS_REG(PCIX_TARGET_STATUS_F4)) +#define ISR_INT_TARGET_MASK_F4 (NETXEN_PCIX_PS_REG(PCIX_TARGET_MASK_F4)) +#define ISR_INT_TARGET_STATUS_F5 (NETXEN_PCIX_PS_REG(PCIX_TARGET_STATUS_F5)) +#define ISR_INT_TARGET_MASK_F5 (NETXEN_PCIX_PS_REG(PCIX_TARGET_MASK_F5)) +#define ISR_INT_TARGET_STATUS_F6 (NETXEN_PCIX_PS_REG(PCIX_TARGET_STATUS_F6)) +#define ISR_INT_TARGET_MASK_F6 (NETXEN_PCIX_PS_REG(PCIX_TARGET_MASK_F6)) +#define ISR_INT_TARGET_STATUS_F7 (NETXEN_PCIX_PS_REG(PCIX_TARGET_STATUS_F7)) +#define ISR_INT_TARGET_MASK_F7 (NETXEN_PCIX_PS_REG(PCIX_TARGET_MASK_F7)) + +#define NETXEN_PCI_MAPSIZE 128 +#define NETXEN_PCI_DDR_NET (0x00000000UL) +#define NETXEN_PCI_QDR_NET (0x04000000UL) +#define NETXEN_PCI_DIRECT_CRB (0x04400000UL) +#define NETXEN_PCI_CAMQM (0x04800000UL) +#define NETXEN_PCI_CAMQM_MAX (0x04ffffffUL) +#define NETXEN_PCI_OCM0 (0x05000000UL) +#define NETXEN_PCI_OCM0_MAX (0x050fffffUL) +#define 
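The window macro above gives every agent in the PX map a 1 MB slice of CRB space, so an agent's window base is NETXEN_PCI_CRBSPACE plus its map index times 0x100000 (NETXEN_PCI_CRBSPACE itself is defined with the PCI map constants just below). A stand-alone sketch for the NIU, which is index 6 in the PX map enum:

#include <stdio.h>

#define DEMO_PCI_CRBSPACE	0x06000000UL	/* NETXEN_PCI_CRBSPACE */
#define DEMO_CRB_WINDOWSIZE	0x00100000UL	/* 1 MB per agent */
#define DEMO_CRB_WINDOW(a)	(DEMO_PCI_CRBSPACE + (a) * DEMO_CRB_WINDOWSIZE)

int main(void)
{
	unsigned int niu = 6;	/* NETXEN_HW_PX_MAP_CRB_NIU */

	/* Expected: 0x06600000, the base that the NETXEN_NIU_* offsets build on. */
	printf("NIU CRB window base: 0x%08lx\n", DEMO_CRB_WINDOW(niu));
	return 0;
}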
NETXEN_PCI_OCM1 (0x05100000UL) +#define NETXEN_PCI_OCM1_MAX (0x051fffffUL) +#define NETXEN_PCI_CRBSPACE (0x06000000UL) +#define NETXEN_PCI_128MB_SIZE (0x08000000UL) +#define NETXEN_PCI_32MB_SIZE (0x02000000UL) +#define NETXEN_PCI_2MB_SIZE (0x00200000UL) + +#define NETXEN_PCI_MN_2M (0) +#define NETXEN_PCI_MS_2M (0x80000) +#define NETXEN_PCI_OCM0_2M (0x000c0000UL) +#define NETXEN_PCI_CAMQM_2M_BASE (0x000ff800UL) +#define NETXEN_PCI_CAMQM_2M_END (0x04800800UL) + +#define NETXEN_CRB_CAM NETXEN_PCI_CRB_WINDOW(NETXEN_HW_PX_MAP_CRB_CAM) + +#define NETXEN_ADDR_DDR_NET (0x0000000000000000ULL) +#define NETXEN_ADDR_DDR_NET_MAX (0x000000000fffffffULL) +#define NETXEN_ADDR_OCM0 (0x0000000200000000ULL) +#define NETXEN_ADDR_OCM0_MAX (0x00000002000fffffULL) +#define NETXEN_ADDR_OCM1 (0x0000000200400000ULL) +#define NETXEN_ADDR_OCM1_MAX (0x00000002004fffffULL) +#define NETXEN_ADDR_QDR_NET (0x0000000300000000ULL) +#define NETXEN_ADDR_QDR_NET_MAX_P2 (0x00000003003fffffULL) +#define NETXEN_ADDR_QDR_NET_MAX_P3 (0x0000000303ffffffULL) + +/* + * Register offsets for MN + */ +#define NETXEN_MIU_CONTROL (0x000) +#define NETXEN_MIU_MN_CONTROL (NETXEN_CRB_DDR_NET+NETXEN_MIU_CONTROL) + + /* 200ms delay in each loop */ +#define NETXEN_NIU_PHY_WAITLEN 200000 + /* 10 seconds before we give up */ +#define NETXEN_NIU_PHY_WAITMAX 50 +#define NETXEN_NIU_MAX_GBE_PORTS 4 +#define NETXEN_NIU_MAX_XG_PORTS 2 + +#define NETXEN_NIU_MODE (NETXEN_CRB_NIU + 0x00000) + +#define NETXEN_NIU_XG_SINGLE_TERM (NETXEN_CRB_NIU + 0x00004) +#define NETXEN_NIU_XG_DRIVE_HI (NETXEN_CRB_NIU + 0x00008) +#define NETXEN_NIU_XG_DRIVE_LO (NETXEN_CRB_NIU + 0x0000c) +#define NETXEN_NIU_XG_DTX (NETXEN_CRB_NIU + 0x00010) +#define NETXEN_NIU_XG_DEQ (NETXEN_CRB_NIU + 0x00014) +#define NETXEN_NIU_XG_WORD_ALIGN (NETXEN_CRB_NIU + 0x00018) +#define NETXEN_NIU_XG_RESET (NETXEN_CRB_NIU + 0x0001c) +#define NETXEN_NIU_XG_POWER_DOWN (NETXEN_CRB_NIU + 0x00020) +#define NETXEN_NIU_XG_RESET_PLL (NETXEN_CRB_NIU + 0x00024) +#define NETXEN_NIU_XG_SERDES_LOOPBACK (NETXEN_CRB_NIU + 0x00028) +#define NETXEN_NIU_XG_DO_BYTE_ALIGN (NETXEN_CRB_NIU + 0x0002c) +#define NETXEN_NIU_XG_TX_ENABLE (NETXEN_CRB_NIU + 0x00030) +#define NETXEN_NIU_XG_RX_ENABLE (NETXEN_CRB_NIU + 0x00034) +#define NETXEN_NIU_XG_STATUS (NETXEN_CRB_NIU + 0x00038) +#define NETXEN_NIU_XG_PAUSE_THRESHOLD (NETXEN_CRB_NIU + 0x0003c) +#define NETXEN_NIU_INT_MASK (NETXEN_CRB_NIU + 0x00040) +#define NETXEN_NIU_ACTIVE_INT (NETXEN_CRB_NIU + 0x00044) +#define NETXEN_NIU_MASKABLE_INT (NETXEN_CRB_NIU + 0x00048) + +#define NETXEN_NIU_STRAP_VALUE_SAVE_HIGHER (NETXEN_CRB_NIU + 0x0004c) + +#define NETXEN_NIU_GB_SERDES_RESET (NETXEN_CRB_NIU + 0x00050) +#define NETXEN_NIU_GB0_GMII_MODE (NETXEN_CRB_NIU + 0x00054) +#define NETXEN_NIU_GB0_MII_MODE (NETXEN_CRB_NIU + 0x00058) +#define NETXEN_NIU_GB1_GMII_MODE (NETXEN_CRB_NIU + 0x0005c) +#define NETXEN_NIU_GB1_MII_MODE (NETXEN_CRB_NIU + 0x00060) +#define NETXEN_NIU_GB2_GMII_MODE (NETXEN_CRB_NIU + 0x00064) +#define NETXEN_NIU_GB2_MII_MODE (NETXEN_CRB_NIU + 0x00068) +#define NETXEN_NIU_GB3_GMII_MODE (NETXEN_CRB_NIU + 0x0006c) +#define NETXEN_NIU_GB3_MII_MODE (NETXEN_CRB_NIU + 0x00070) +#define NETXEN_NIU_REMOTE_LOOPBACK (NETXEN_CRB_NIU + 0x00074) +#define NETXEN_NIU_GB0_HALF_DUPLEX (NETXEN_CRB_NIU + 0x00078) +#define NETXEN_NIU_GB1_HALF_DUPLEX (NETXEN_CRB_NIU + 0x0007c) +#define NETXEN_NIU_RESET_SYS_FIFOS (NETXEN_CRB_NIU + 0x00088) +#define NETXEN_NIU_GB_CRC_DROP (NETXEN_CRB_NIU + 0x0008c) +#define NETXEN_NIU_GB_DROP_WRONGADDR (NETXEN_CRB_NIU + 0x00090) +#define NETXEN_NIU_TEST_MUX_CTL 
(NETXEN_CRB_NIU + 0x00094) +#define NETXEN_NIU_XG_PAUSE_CTL (NETXEN_CRB_NIU + 0x00098) +#define NETXEN_NIU_XG_PAUSE_LEVEL (NETXEN_CRB_NIU + 0x000dc) +#define NETXEN_NIU_FRAME_COUNT_SELECT (NETXEN_CRB_NIU + 0x000ac) +#define NETXEN_NIU_FRAME_COUNT (NETXEN_CRB_NIU + 0x000b0) +#define NETXEN_NIU_XG_SEL (NETXEN_CRB_NIU + 0x00128) +#define NETXEN_NIU_GB_PAUSE_CTL (NETXEN_CRB_NIU + 0x0030c) + +#define NETXEN_NIU_FULL_LEVEL_XG (NETXEN_CRB_NIU + 0x00450) + +#define NETXEN_NIU_XG1_RESET (NETXEN_CRB_NIU + 0x0011c) +#define NETXEN_NIU_XG1_POWER_DOWN (NETXEN_CRB_NIU + 0x00120) +#define NETXEN_NIU_XG1_RESET_PLL (NETXEN_CRB_NIU + 0x00124) + +#define NETXEN_MAC_ADDR_CNTL_REG (NETXEN_CRB_NIU + 0x1000) + +#define NETXEN_MULTICAST_ADDR_HI_0 (NETXEN_CRB_NIU + 0x1010) +#define NETXEN_MULTICAST_ADDR_HI_1 (NETXEN_CRB_NIU + 0x1014) +#define NETXEN_MULTICAST_ADDR_HI_2 (NETXEN_CRB_NIU + 0x1018) +#define NETXEN_MULTICAST_ADDR_HI_3 (NETXEN_CRB_NIU + 0x101c) + +#define NETXEN_UNICAST_ADDR_BASE (NETXEN_CRB_NIU + 0x1080) +#define NETXEN_MULTICAST_ADDR_BASE (NETXEN_CRB_NIU + 0x1100) + +#define NETXEN_NIU_GB_MAC_CONFIG_0(I) \ + (NETXEN_CRB_NIU + 0x30000 + (I)*0x10000) +#define NETXEN_NIU_GB_MAC_CONFIG_1(I) \ + (NETXEN_CRB_NIU + 0x30004 + (I)*0x10000) +#define NETXEN_NIU_GB_MAC_IPG_IFG(I) \ + (NETXEN_CRB_NIU + 0x30008 + (I)*0x10000) +#define NETXEN_NIU_GB_HALF_DUPLEX_CTRL(I) \ + (NETXEN_CRB_NIU + 0x3000c + (I)*0x10000) +#define NETXEN_NIU_GB_MAX_FRAME_SIZE(I) \ + (NETXEN_CRB_NIU + 0x30010 + (I)*0x10000) +#define NETXEN_NIU_GB_TEST_REG(I) \ + (NETXEN_CRB_NIU + 0x3001c + (I)*0x10000) +#define NETXEN_NIU_GB_MII_MGMT_CONFIG(I) \ + (NETXEN_CRB_NIU + 0x30020 + (I)*0x10000) +#define NETXEN_NIU_GB_MII_MGMT_COMMAND(I) \ + (NETXEN_CRB_NIU + 0x30024 + (I)*0x10000) +#define NETXEN_NIU_GB_MII_MGMT_ADDR(I) \ + (NETXEN_CRB_NIU + 0x30028 + (I)*0x10000) +#define NETXEN_NIU_GB_MII_MGMT_CTRL(I) \ + (NETXEN_CRB_NIU + 0x3002c + (I)*0x10000) +#define NETXEN_NIU_GB_MII_MGMT_STATUS(I) \ + (NETXEN_CRB_NIU + 0x30030 + (I)*0x10000) +#define NETXEN_NIU_GB_MII_MGMT_INDICATE(I) \ + (NETXEN_CRB_NIU + 0x30034 + (I)*0x10000) +#define NETXEN_NIU_GB_INTERFACE_CTRL(I) \ + (NETXEN_CRB_NIU + 0x30038 + (I)*0x10000) +#define NETXEN_NIU_GB_INTERFACE_STATUS(I) \ + (NETXEN_CRB_NIU + 0x3003c + (I)*0x10000) +#define NETXEN_NIU_GB_STATION_ADDR_0(I) \ + (NETXEN_CRB_NIU + 0x30040 + (I)*0x10000) +#define NETXEN_NIU_GB_STATION_ADDR_1(I) \ + (NETXEN_CRB_NIU + 0x30044 + (I)*0x10000) + +#define NETXEN_NIU_XGE_CONFIG_0 (NETXEN_CRB_NIU + 0x70000) +#define NETXEN_NIU_XGE_CONFIG_1 (NETXEN_CRB_NIU + 0x70004) +#define NETXEN_NIU_XGE_IPG (NETXEN_CRB_NIU + 0x70008) +#define NETXEN_NIU_XGE_STATION_ADDR_0_HI (NETXEN_CRB_NIU + 0x7000c) +#define NETXEN_NIU_XGE_STATION_ADDR_0_1 (NETXEN_CRB_NIU + 0x70010) +#define NETXEN_NIU_XGE_STATION_ADDR_1_LO (NETXEN_CRB_NIU + 0x70014) +#define NETXEN_NIU_XGE_STATUS (NETXEN_CRB_NIU + 0x70018) +#define NETXEN_NIU_XGE_MAX_FRAME_SIZE (NETXEN_CRB_NIU + 0x7001c) +#define NETXEN_NIU_XGE_PAUSE_FRAME_VALUE (NETXEN_CRB_NIU + 0x70020) +#define NETXEN_NIU_XGE_TX_BYTE_CNT (NETXEN_CRB_NIU + 0x70024) +#define NETXEN_NIU_XGE_TX_FRAME_CNT (NETXEN_CRB_NIU + 0x70028) +#define NETXEN_NIU_XGE_RX_BYTE_CNT (NETXEN_CRB_NIU + 0x7002c) +#define NETXEN_NIU_XGE_RX_FRAME_CNT (NETXEN_CRB_NIU + 0x70030) +#define NETXEN_NIU_XGE_AGGR_ERROR_CNT (NETXEN_CRB_NIU + 0x70034) +#define NETXEN_NIU_XGE_MULTICAST_FRAME_CNT (NETXEN_CRB_NIU + 0x70038) +#define NETXEN_NIU_XGE_UNICAST_FRAME_CNT (NETXEN_CRB_NIU + 0x7003c) +#define NETXEN_NIU_XGE_CRC_ERROR_CNT (NETXEN_CRB_NIU + 0x70040) +#define 
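The GbE MAC register macros above repeat the same pattern for all four ports: each port gets a 0x10000-byte register block starting at NIU base + 0x30000. A stand-alone sketch that prints the MAC_CONFIG_0 address of each port, using the NIU window base derived from the CRB window layout:

#include <stdio.h>

#define DEMO_CRB_NIU		0x06600000UL	/* NETXEN_CRB_NIU */
#define DEMO_GB_MAC_CONFIG_0(i)	(DEMO_CRB_NIU + 0x30000 + (i) * 0x10000UL)

int main(void)
{
	int port;

	for (port = 0; port < 4; port++)	/* NETXEN_NIU_MAX_GBE_PORTS */
		printf("GB%d MAC_CONFIG_0: 0x%08lx\n",
		       port, DEMO_GB_MAC_CONFIG_0(port));
	return 0;
}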
NETXEN_NIU_XGE_OVERSIZE_FRAME_ERR (NETXEN_CRB_NIU + 0x70044) +#define NETXEN_NIU_XGE_UNDERSIZE_FRAME_ERR (NETXEN_CRB_NIU + 0x70048) +#define NETXEN_NIU_XGE_LOCAL_ERROR_CNT (NETXEN_CRB_NIU + 0x7004c) +#define NETXEN_NIU_XGE_REMOTE_ERROR_CNT (NETXEN_CRB_NIU + 0x70050) +#define NETXEN_NIU_XGE_CONTROL_CHAR_CNT (NETXEN_CRB_NIU + 0x70054) +#define NETXEN_NIU_XGE_PAUSE_FRAME_CNT (NETXEN_CRB_NIU + 0x70058) +#define NETXEN_NIU_XG1_CONFIG_0 (NETXEN_CRB_NIU + 0x80000) +#define NETXEN_NIU_XG1_CONFIG_1 (NETXEN_CRB_NIU + 0x80004) +#define NETXEN_NIU_XG1_IPG (NETXEN_CRB_NIU + 0x80008) +#define NETXEN_NIU_XG1_STATION_ADDR_0_HI (NETXEN_CRB_NIU + 0x8000c) +#define NETXEN_NIU_XG1_STATION_ADDR_0_1 (NETXEN_CRB_NIU + 0x80010) +#define NETXEN_NIU_XG1_STATION_ADDR_1_LO (NETXEN_CRB_NIU + 0x80014) +#define NETXEN_NIU_XG1_STATUS (NETXEN_CRB_NIU + 0x80018) +#define NETXEN_NIU_XG1_MAX_FRAME_SIZE (NETXEN_CRB_NIU + 0x8001c) +#define NETXEN_NIU_XG1_PAUSE_FRAME_VALUE (NETXEN_CRB_NIU + 0x80020) +#define NETXEN_NIU_XG1_TX_BYTE_CNT (NETXEN_CRB_NIU + 0x80024) +#define NETXEN_NIU_XG1_TX_FRAME_CNT (NETXEN_CRB_NIU + 0x80028) +#define NETXEN_NIU_XG1_RX_BYTE_CNT (NETXEN_CRB_NIU + 0x8002c) +#define NETXEN_NIU_XG1_RX_FRAME_CNT (NETXEN_CRB_NIU + 0x80030) +#define NETXEN_NIU_XG1_AGGR_ERROR_CNT (NETXEN_CRB_NIU + 0x80034) +#define NETXEN_NIU_XG1_MULTICAST_FRAME_CNT (NETXEN_CRB_NIU + 0x80038) +#define NETXEN_NIU_XG1_UNICAST_FRAME_CNT (NETXEN_CRB_NIU + 0x8003c) +#define NETXEN_NIU_XG1_CRC_ERROR_CNT (NETXEN_CRB_NIU + 0x80040) +#define NETXEN_NIU_XG1_OVERSIZE_FRAME_ERR (NETXEN_CRB_NIU + 0x80044) +#define NETXEN_NIU_XG1_UNDERSIZE_FRAME_ERR (NETXEN_CRB_NIU + 0x80048) +#define NETXEN_NIU_XG1_LOCAL_ERROR_CNT (NETXEN_CRB_NIU + 0x8004c) +#define NETXEN_NIU_XG1_REMOTE_ERROR_CNT (NETXEN_CRB_NIU + 0x80050) +#define NETXEN_NIU_XG1_CONTROL_CHAR_CNT (NETXEN_CRB_NIU + 0x80054) +#define NETXEN_NIU_XG1_PAUSE_FRAME_CNT (NETXEN_CRB_NIU + 0x80058) + +/* P3 802.3ap */ +#define NETXEN_NIU_AP_MAC_CONFIG_0(I) (NETXEN_CRB_NIU+0xa0000+(I)*0x10000) +#define NETXEN_NIU_AP_MAC_CONFIG_1(I) (NETXEN_CRB_NIU+0xa0004+(I)*0x10000) +#define NETXEN_NIU_AP_MAC_IPG_IFG(I) (NETXEN_CRB_NIU+0xa0008+(I)*0x10000) +#define NETXEN_NIU_AP_HALF_DUPLEX_CTRL(I) (NETXEN_CRB_NIU+0xa000c+(I)*0x10000) +#define NETXEN_NIU_AP_MAX_FRAME_SIZE(I) (NETXEN_CRB_NIU+0xa0010+(I)*0x10000) +#define NETXEN_NIU_AP_TEST_REG(I) (NETXEN_CRB_NIU+0xa001c+(I)*0x10000) +#define NETXEN_NIU_AP_MII_MGMT_CONFIG(I) (NETXEN_CRB_NIU+0xa0020+(I)*0x10000) +#define NETXEN_NIU_AP_MII_MGMT_COMMAND(I) (NETXEN_CRB_NIU+0xa0024+(I)*0x10000) +#define NETXEN_NIU_AP_MII_MGMT_ADDR(I) (NETXEN_CRB_NIU+0xa0028+(I)*0x10000) +#define NETXEN_NIU_AP_MII_MGMT_CTRL(I) (NETXEN_CRB_NIU+0xa002c+(I)*0x10000) +#define NETXEN_NIU_AP_MII_MGMT_STATUS(I) (NETXEN_CRB_NIU+0xa0030+(I)*0x10000) +#define NETXEN_NIU_AP_MII_MGMT_INDICATE(I) (NETXEN_CRB_NIU+0xa0034+(I)*0x10000) +#define NETXEN_NIU_AP_INTERFACE_CTRL(I) (NETXEN_CRB_NIU+0xa0038+(I)*0x10000) +#define NETXEN_NIU_AP_INTERFACE_STATUS(I) (NETXEN_CRB_NIU+0xa003c+(I)*0x10000) +#define NETXEN_NIU_AP_STATION_ADDR_0(I) (NETXEN_CRB_NIU+0xa0040+(I)*0x10000) +#define NETXEN_NIU_AP_STATION_ADDR_1(I) (NETXEN_CRB_NIU+0xa0044+(I)*0x10000) + + +#define TEST_AGT_CTRL (0x00) + +#define TA_CTL_START 1 +#define TA_CTL_ENABLE 2 +#define TA_CTL_WRITE 4 +#define TA_CTL_BUSY 8 + +/* + * Register offsets for MN + */ +#define MIU_TEST_AGT_BASE (0x90) + +#define MIU_TEST_AGT_ADDR_LO (0x04) +#define MIU_TEST_AGT_ADDR_HI (0x08) +#define MIU_TEST_AGT_WRDATA_LO (0x10) +#define MIU_TEST_AGT_WRDATA_HI (0x14) +#define 
MIU_TEST_AGT_RDDATA_LO (0x18) +#define MIU_TEST_AGT_RDDATA_HI (0x1c) + +#define MIU_TEST_AGT_ADDR_MASK 0xfffffff8 +#define MIU_TEST_AGT_UPPER_ADDR(off) (0) + +/* + * Register offsets for MS + */ +#define SIU_TEST_AGT_BASE (0x60) + +#define SIU_TEST_AGT_ADDR_LO (0x04) +#define SIU_TEST_AGT_ADDR_HI (0x18) +#define SIU_TEST_AGT_WRDATA_LO (0x08) +#define SIU_TEST_AGT_WRDATA_HI (0x0c) +#define SIU_TEST_AGT_WRDATA(i) (0x08+(4*(i))) +#define SIU_TEST_AGT_RDDATA_LO (0x10) +#define SIU_TEST_AGT_RDDATA_HI (0x14) +#define SIU_TEST_AGT_RDDATA(i) (0x10+(4*(i))) + +#define SIU_TEST_AGT_ADDR_MASK 0x3ffff8 +#define SIU_TEST_AGT_UPPER_ADDR(off) ((off)>>22) + +/* XG Link status */ +#define XG_LINK_UP 0x10 +#define XG_LINK_DOWN 0x20 + +#define XG_LINK_UP_P3 0x01 +#define XG_LINK_DOWN_P3 0x02 +#define XG_LINK_STATE_P3_MASK 0xf +#define XG_LINK_STATE_P3(pcifn,val) \ + (((val) >> ((pcifn) * 4)) & XG_LINK_STATE_P3_MASK) + +#define P3_LINK_SPEED_MHZ 100 +#define P3_LINK_SPEED_MASK 0xff +#define P3_LINK_SPEED_REG(pcifn) \ + (CRB_PF_LINK_SPEED_1 + (((pcifn) / 4) * 4)) +#define P3_LINK_SPEED_VAL(pcifn, reg) \ + (((reg) >> (8 * ((pcifn) & 0x3))) & P3_LINK_SPEED_MASK) + +#define NETXEN_CAM_RAM_BASE (NETXEN_CRB_CAM + 0x02000) +#define NETXEN_CAM_RAM(reg) (NETXEN_CAM_RAM_BASE + (reg)) +#define NETXEN_FW_VERSION_MAJOR (NETXEN_CAM_RAM(0x150)) +#define NETXEN_FW_VERSION_MINOR (NETXEN_CAM_RAM(0x154)) +#define NETXEN_FW_VERSION_SUB (NETXEN_CAM_RAM(0x158)) +#define NETXEN_ROM_LOCK_ID (NETXEN_CAM_RAM(0x100)) +#define NETXEN_PHY_LOCK_ID (NETXEN_CAM_RAM(0x120)) +#define NETXEN_CRB_WIN_LOCK_ID (NETXEN_CAM_RAM(0x124)) + +#define NIC_CRB_BASE (NETXEN_CAM_RAM(0x200)) +#define NIC_CRB_BASE_2 (NETXEN_CAM_RAM(0x700)) +#define NETXEN_NIC_REG(X) (NIC_CRB_BASE+(X)) +#define NETXEN_NIC_REG_2(X) (NIC_CRB_BASE_2+(X)) +#define NETXEN_INTR_MODE_REG NETXEN_NIC_REG(0x44) +#define NETXEN_MSI_MODE 0x1 +#define NETXEN_INTX_MODE 0x2 + +#define NX_CDRP_CRB_OFFSET (NETXEN_NIC_REG(0x18)) +#define NX_ARG1_CRB_OFFSET (NETXEN_NIC_REG(0x1c)) +#define NX_ARG2_CRB_OFFSET (NETXEN_NIC_REG(0x20)) +#define NX_ARG3_CRB_OFFSET (NETXEN_NIC_REG(0x24)) +#define NX_SIGN_CRB_OFFSET (NETXEN_NIC_REG(0x28)) + +#define CRB_HOST_DUMMY_BUF_ADDR_HI (NETXEN_NIC_REG(0x3c)) +#define CRB_HOST_DUMMY_BUF_ADDR_LO (NETXEN_NIC_REG(0x40)) + +#define CRB_CMDPEG_STATE (NETXEN_NIC_REG(0x50)) +#define CRB_RCVPEG_STATE (NETXEN_NIC_REG(0x13c)) + +#define CRB_XG_STATE (NETXEN_NIC_REG(0x94)) +#define CRB_XG_STATE_P3 (NETXEN_NIC_REG(0x98)) +#define CRB_PF_LINK_SPEED_1 (NETXEN_NIC_REG(0xe8)) +#define CRB_PF_LINK_SPEED_2 (NETXEN_NIC_REG(0xec)) + +#define CRB_MPORT_MODE (NETXEN_NIC_REG(0xc4)) +#define CRB_DMA_SHIFT (NETXEN_NIC_REG(0xcc)) +#define CRB_INT_VECTOR (NETXEN_NIC_REG(0xd4)) + +#define CRB_CMD_PRODUCER_OFFSET (NETXEN_NIC_REG(0x08)) +#define CRB_CMD_CONSUMER_OFFSET (NETXEN_NIC_REG(0x0c)) +#define CRB_CMD_PRODUCER_OFFSET_1 (NETXEN_NIC_REG(0x1ac)) +#define CRB_CMD_CONSUMER_OFFSET_1 (NETXEN_NIC_REG(0x1b0)) +#define CRB_CMD_PRODUCER_OFFSET_2 (NETXEN_NIC_REG(0x1b8)) +#define CRB_CMD_CONSUMER_OFFSET_2 (NETXEN_NIC_REG(0x1bc)) +#define CRB_CMD_PRODUCER_OFFSET_3 (NETXEN_NIC_REG(0x1d0)) +#define CRB_CMD_CONSUMER_OFFSET_3 (NETXEN_NIC_REG(0x1d4)) +#define CRB_TEMP_STATE (NETXEN_NIC_REG(0x1b4)) + +#define CRB_V2P_0 (NETXEN_NIC_REG(0x290)) +#define CRB_V2P(port) (CRB_V2P_0+((port)*4)) +#define CRB_DRIVER_VERSION (NETXEN_NIC_REG(0x2a0)) + +#define CRB_SW_INT_MASK_0 (NETXEN_NIC_REG(0x1d8)) +#define CRB_SW_INT_MASK_1 (NETXEN_NIC_REG(0x1e0)) +#define CRB_SW_INT_MASK_2 (NETXEN_NIC_REG(0x1e4)) +#define 
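The link state and link speed macros above unpack per-PCI-function fields from shared registers: CRB_XG_STATE_P3 holds a 4-bit state per function, and each CRB_PF_LINK_SPEED_* register holds an 8-bit speed per function that is scaled by P3_LINK_SPEED_MHZ (100). A stand-alone sketch with made-up register values:

#include <stdio.h>
#include <stdint.h>

/* Same bit extraction as XG_LINK_STATE_P3() and P3_LINK_SPEED_VAL(). */
#define DEMO_LINK_STATE(pcifn, val)	(((val) >> ((pcifn) * 4)) & 0xf)
#define DEMO_LINK_SPEED(pcifn, reg)	(((reg) >> (8 * ((pcifn) & 0x3))) & 0xff)

int main(void)
{
	uint32_t xg_state = 0x00000021;		/* made up: fn0 = 0x1 (up), fn1 = 0x2 (down) */
	uint32_t speed_reg = 0x0000640a;	/* made up: fn0 = 10, fn1 = 100 */

	printf("fn0: state %lu, speed %lu Mbit/s\n",
	       (unsigned long)DEMO_LINK_STATE(0, xg_state),
	       (unsigned long)DEMO_LINK_SPEED(0, speed_reg) * 100);
	printf("fn1: state %lu, speed %lu Mbit/s\n",
	       (unsigned long)DEMO_LINK_STATE(1, xg_state),
	       (unsigned long)DEMO_LINK_SPEED(1, speed_reg) * 100);
	return 0;
}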
CRB_SW_INT_MASK_3 (NETXEN_NIC_REG(0x1e8)) + +#define CRB_FW_CAPABILITIES_1 (NETXEN_CAM_RAM(0x128)) +#define CRB_FW_CAPABILITIES_2 (NETXEN_CAM_RAM(0x12c)) +#define CRB_MAC_BLOCK_START (NETXEN_CAM_RAM(0x1c0)) + +/* + * capabilities register, can be used to selectively enable/disable features + * for backward compatibility + */ +#define CRB_NIC_CAPABILITIES_HOST NETXEN_NIC_REG(0x1a8) +#define CRB_NIC_MSI_MODE_HOST NETXEN_NIC_REG(0x270) + +#define INTR_SCHEME_PERPORT 0x1 +#define MSI_MODE_MULTIFUNC 0x1 + +/* used for ethtool tests */ +#define CRB_SCRATCHPAD_TEST NETXEN_NIC_REG(0x280) + +/* + * CrbPortPhanCntrHi/Lo is used to pass the address of HostPhantomIndex address + * which can be read by the Phantom host to get producer/consumer indexes from + * Phantom/Casper. If it is not HOST_SHARED_MEMORY, then the following + * registers will be used for the addresses of the ring's shared memory + * on the Phantom. + */ + +#define nx_get_temp_val(x) ((x) >> 16) +#define nx_get_temp_state(x) ((x) & 0xffff) +#define nx_encode_temp(val, state) (((val) << 16) | (state)) + +/* + * Temperature control. + */ +enum { + NX_TEMP_NORMAL = 0x1, /* Normal operating range */ + NX_TEMP_WARN, /* Sound alert, temperature getting high */ + NX_TEMP_PANIC /* Fatal error, hardware has shut down. */ +}; + +/* Lock IDs for PHY lock */ +#define PHY_LOCK_DRIVER 0x44524956 + +/* Used for PS PCI Memory access */ +#define PCIX_PS_OP_ADDR_LO (0x10000) +/* via CRB (PS side only) */ +#define PCIX_PS_OP_ADDR_HI (0x10004) + +#define PCIX_INT_VECTOR (0x10100) +#define PCIX_INT_MASK (0x10104) + +#define PCIX_CRB_WINDOW (0x10210) +#define PCIX_CRB_WINDOW_F0 (0x10210) +#define PCIX_CRB_WINDOW_F1 (0x10230) +#define PCIX_CRB_WINDOW_F2 (0x10250) +#define PCIX_CRB_WINDOW_F3 (0x10270) +#define PCIX_CRB_WINDOW_F4 (0x102ac) +#define PCIX_CRB_WINDOW_F5 (0x102bc) +#define PCIX_CRB_WINDOW_F6 (0x102cc) +#define PCIX_CRB_WINDOW_F7 (0x102dc) +#define PCIE_CRB_WINDOW_REG(func) (((func) < 4) ? \ + (PCIX_CRB_WINDOW_F0 + (0x20 * (func))) :\ + (PCIX_CRB_WINDOW_F4 + (0x10 * ((func)-4)))) + +#define PCIX_MN_WINDOW (0x10200) +#define PCIX_MN_WINDOW_F0 (0x10200) +#define PCIX_MN_WINDOW_F1 (0x10220) +#define PCIX_MN_WINDOW_F2 (0x10240) +#define PCIX_MN_WINDOW_F3 (0x10260) +#define PCIX_MN_WINDOW_F4 (0x102a0) +#define PCIX_MN_WINDOW_F5 (0x102b0) +#define PCIX_MN_WINDOW_F6 (0x102c0) +#define PCIX_MN_WINDOW_F7 (0x102d0) +#define PCIE_MN_WINDOW_REG(func) (((func) < 4) ? \ + (PCIX_MN_WINDOW_F0 + (0x20 * (func))) :\ + (PCIX_MN_WINDOW_F4 + (0x10 * ((func)-4)))) + +#define PCIX_SN_WINDOW (0x10208) +#define PCIX_SN_WINDOW_F0 (0x10208) +#define PCIX_SN_WINDOW_F1 (0x10228) +#define PCIX_SN_WINDOW_F2 (0x10248) +#define PCIX_SN_WINDOW_F3 (0x10268) +#define PCIX_SN_WINDOW_F4 (0x102a8) +#define PCIX_SN_WINDOW_F5 (0x102b8) +#define PCIX_SN_WINDOW_F6 (0x102c8) +#define PCIX_SN_WINDOW_F7 (0x102d8) +#define PCIE_SN_WINDOW_REG(func) (((func) < 4) ? 
\ + (PCIX_SN_WINDOW_F0 + (0x20 * (func))) :\ + (PCIX_SN_WINDOW_F4 + (0x10 * ((func)-4)))) + +#define PCIX_OCM_WINDOW (0x10800) +#define PCIX_OCM_WINDOW_REG(func) (PCIX_OCM_WINDOW + 0x20 * (func)) + +#define PCIX_TARGET_STATUS (0x10118) +#define PCIX_TARGET_STATUS_F1 (0x10160) +#define PCIX_TARGET_STATUS_F2 (0x10164) +#define PCIX_TARGET_STATUS_F3 (0x10168) +#define PCIX_TARGET_STATUS_F4 (0x10360) +#define PCIX_TARGET_STATUS_F5 (0x10364) +#define PCIX_TARGET_STATUS_F6 (0x10368) +#define PCIX_TARGET_STATUS_F7 (0x1036c) + +#define PCIX_TARGET_MASK (0x10128) +#define PCIX_TARGET_MASK_F1 (0x10170) +#define PCIX_TARGET_MASK_F2 (0x10174) +#define PCIX_TARGET_MASK_F3 (0x10178) +#define PCIX_TARGET_MASK_F4 (0x10370) +#define PCIX_TARGET_MASK_F5 (0x10374) +#define PCIX_TARGET_MASK_F6 (0x10378) +#define PCIX_TARGET_MASK_F7 (0x1037c) + +#define PCIX_MSI_F0 (0x13000) +#define PCIX_MSI_F1 (0x13004) +#define PCIX_MSI_F2 (0x13008) +#define PCIX_MSI_F3 (0x1300c) +#define PCIX_MSI_F4 (0x13010) +#define PCIX_MSI_F5 (0x13014) +#define PCIX_MSI_F6 (0x13018) +#define PCIX_MSI_F7 (0x1301c) +#define PCIX_MSI_F(i) (0x13000+((i)*4)) + +#define PCIX_PS_MEM_SPACE (0x90000) + +#define NETXEN_PCIX_PH_REG(reg) (NETXEN_CRB_PCIE + (reg)) +#define NETXEN_PCIX_PS_REG(reg) (NETXEN_CRB_PCIX_MD + (reg)) + +#define NETXEN_PCIE_REG(reg) (NETXEN_CRB_PCIE + (reg)) + +#define PCIE_MAX_DMA_XFER_SIZE (0x1404c) + +#define PCIE_DCR 0x00d8 + +#define PCIE_SEM0_LOCK (0x1c000) +#define PCIE_SEM0_UNLOCK (0x1c004) +#define PCIE_SEM1_LOCK (0x1c008) +#define PCIE_SEM1_UNLOCK (0x1c00c) +#define PCIE_SEM2_LOCK (0x1c010) /* Flash lock */ +#define PCIE_SEM2_UNLOCK (0x1c014) /* Flash unlock */ +#define PCIE_SEM3_LOCK (0x1c018) /* Phy lock */ +#define PCIE_SEM3_UNLOCK (0x1c01c) /* Phy unlock */ +#define PCIE_SEM4_LOCK (0x1c020) +#define PCIE_SEM4_UNLOCK (0x1c024) +#define PCIE_SEM5_LOCK (0x1c028) /* API lock */ +#define PCIE_SEM5_UNLOCK (0x1c02c) /* API unlock */ +#define PCIE_SEM6_LOCK (0x1c030) /* sw lock */ +#define PCIE_SEM6_UNLOCK (0x1c034) /* sw unlock */ +#define PCIE_SEM7_LOCK (0x1c038) /* crb win lock */ +#define PCIE_SEM7_UNLOCK (0x1c03c) /* crbwin unlock*/ +#define PCIE_SEM_LOCK(N) (PCIE_SEM0_LOCK + 8*(N)) +#define PCIE_SEM_UNLOCK(N) (PCIE_SEM0_UNLOCK + 8*(N)) + +#define PCIE_SETUP_FUNCTION (0x12040) +#define PCIE_SETUP_FUNCTION2 (0x12048) +#define PCIE_MISCCFG_RC (0x1206c) +#define PCIE_TGT_SPLIT_CHICKEN (0x12080) +#define PCIE_CHICKEN3 (0x120c8) + +#define ISR_INT_STATE_REG (NETXEN_PCIX_PS_REG(PCIE_MISCCFG_RC)) +#define PCIE_MAX_MASTER_SPLIT (0x14048) + +#define NETXEN_PORT_MODE_NONE 0 +#define NETXEN_PORT_MODE_XG 1 +#define NETXEN_PORT_MODE_GB 2 +#define NETXEN_PORT_MODE_802_3_AP 3 +#define NETXEN_PORT_MODE_AUTO_NEG 4 +#define NETXEN_PORT_MODE_AUTO_NEG_1G 5 +#define NETXEN_PORT_MODE_AUTO_NEG_XG 6 +#define NETXEN_PORT_MODE_ADDR (NETXEN_CAM_RAM(0x24)) +#define NETXEN_WOL_PORT_MODE (NETXEN_CAM_RAM(0x198)) + +#define NETXEN_WOL_CONFIG_NV (NETXEN_CAM_RAM(0x184)) +#define NETXEN_WOL_CONFIG (NETXEN_CAM_RAM(0x188)) + +#define NX_PEG_TUNE_MN_PRESENT 0x1 +#define NX_PEG_TUNE_CAPABILITY (NETXEN_CAM_RAM(0x02c)) + +#define NETXEN_DMA_WATCHDOG_CTRL (NETXEN_CAM_RAM(0x14)) +#define NETXEN_PEG_ALIVE_COUNTER (NETXEN_CAM_RAM(0xb0)) +#define NETXEN_PEG_HALT_STATUS1 (NETXEN_CAM_RAM(0xa8)) +#define NETXEN_PEG_HALT_STATUS2 (NETXEN_CAM_RAM(0xac)) +#define NX_CRB_DEV_REF_COUNT (NETXEN_CAM_RAM(0x138)) +#define NX_CRB_DEV_STATE (NETXEN_CAM_RAM(0x140)) +#define NETXEN_ULA_KEY (NETXEN_CAM_RAM(0x178)) + +/* MiniDIMM related macros */ +#define 
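The PCIE_SEM* definitions above lay the hardware semaphores out as register pairs: semaphore N has its lock register at PCIE_SEM0_LOCK + 8*N and its unlock register 4 bytes later, with semaphore 2 used as the flash lock and semaphore 7 as the CRB-window lock. A stand-alone sketch of that numbering (the acquire/release helpers themselves appear later, in netxen_nic_hw.c):

#include <stdio.h>

#define DEMO_SEM0_LOCK		0x1c000u	/* PCIE_SEM0_LOCK */
#define DEMO_SEM_LOCK(n)	(DEMO_SEM0_LOCK + 8u * (n))
#define DEMO_SEM_UNLOCK(n)	(DEMO_SEM_LOCK(n) + 4u)

int main(void)
{
	/* Expected: 0x1c010/0x1c014 (flash) and 0x1c038/0x1c03c (CRB window). */
	printf("flash lock/unlock:      0x%x / 0x%x\n",
	       DEMO_SEM_LOCK(2), DEMO_SEM_UNLOCK(2));
	printf("crb-window lock/unlock: 0x%x / 0x%x\n",
	       DEMO_SEM_LOCK(7), DEMO_SEM_UNLOCK(7));
	return 0;
}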
NETXEN_DIMM_CAPABILITY (NETXEN_CAM_RAM(0x258)) +#define NETXEN_DIMM_PRESENT 0x1 +#define NETXEN_DIMM_MEMTYPE_DDR2_SDRAM 0x2 +#define NETXEN_DIMM_SIZE 0x4 +#define NETXEN_DIMM_MEMTYPE(VAL) ((VAL >> 3) & 0xf) +#define NETXEN_DIMM_NUMROWS(VAL) ((VAL >> 7) & 0xf) +#define NETXEN_DIMM_NUMCOLS(VAL) ((VAL >> 11) & 0xf) +#define NETXEN_DIMM_NUMRANKS(VAL) ((VAL >> 15) & 0x3) +#define NETXEN_DIMM_DATAWIDTH(VAL) ((VAL >> 18) & 0x3) +#define NETXEN_DIMM_NUMBANKS(VAL) ((VAL >> 21) & 0xf) +#define NETXEN_DIMM_TYPE(VAL) ((VAL >> 25) & 0x3f) +#define NETXEN_DIMM_VALID_FLAG 0x80000000 + +#define NETXEN_DIMM_MEM_DDR2_SDRAM 0x8 + +#define NETXEN_DIMM_STD_MEM_SIZE 512 + +#define NETXEN_DIMM_TYPE_RDIMM 0x1 +#define NETXEN_DIMM_TYPE_UDIMM 0x2 +#define NETXEN_DIMM_TYPE_SO_DIMM 0x4 +#define NETXEN_DIMM_TYPE_Micro_DIMM 0x8 +#define NETXEN_DIMM_TYPE_Mini_RDIMM 0x10 +#define NETXEN_DIMM_TYPE_Mini_UDIMM 0x20 + +/* Device State */ +#define NX_DEV_COLD 1 +#define NX_DEV_INITALIZING 2 +#define NX_DEV_READY 3 +#define NX_DEV_NEED_RESET 4 +#define NX_DEV_NEED_QUISCENT 5 +#define NX_DEV_NEED_AER 6 +#define NX_DEV_FAILED 7 + +#define NX_RCODE_DRIVER_INFO 0x20000000 +#define NX_RCODE_DRIVER_CAN_RELOAD 0x40000000 +#define NX_RCODE_FATAL_ERROR 0x80000000 +#define NX_FWERROR_PEGNUM(code) ((code) & 0xff) +#define NX_FWERROR_CODE(code) ((code >> 8) & 0xfffff) +#define NX_FWERROR_PEGSTAT1(code) ((code >> 8) & 0x1fffff) + +#define FW_POLL_DELAY (2 * HZ) +#define FW_FAIL_THRESH 3 +#define FW_POLL_THRESH 10 + +#define ISR_MSI_INT_TRIGGER(FUNC) (NETXEN_PCIX_PS_REG(PCIX_MSI_F(FUNC))) +#define ISR_LEGACY_INT_TRIGGERED(VAL) (((VAL) & 0x300) == 0x200) + +/* + * PCI Interrupt Vector Values. + */ +#define PCIX_INT_VECTOR_BIT_F0 0x0080 +#define PCIX_INT_VECTOR_BIT_F1 0x0100 +#define PCIX_INT_VECTOR_BIT_F2 0x0200 +#define PCIX_INT_VECTOR_BIT_F3 0x0400 +#define PCIX_INT_VECTOR_BIT_F4 0x0800 +#define PCIX_INT_VECTOR_BIT_F5 0x1000 +#define PCIX_INT_VECTOR_BIT_F6 0x2000 +#define PCIX_INT_VECTOR_BIT_F7 0x4000 + +struct netxen_legacy_intr_set { + uint32_t int_vec_bit; + uint32_t tgt_status_reg; + uint32_t tgt_mask_reg; + uint32_t pci_int_reg; +}; + +#define NX_LEGACY_INTR_CONFIG \ +{ \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F0, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK, \ + .pci_int_reg = ISR_MSI_INT_TRIGGER(0) }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F1, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F1, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F1, \ + .pci_int_reg = ISR_MSI_INT_TRIGGER(1) }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F2, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F2, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F2, \ + .pci_int_reg = ISR_MSI_INT_TRIGGER(2) }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F3, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F3, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F3, \ + .pci_int_reg = ISR_MSI_INT_TRIGGER(3) }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F4, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F4, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F4, \ + .pci_int_reg = ISR_MSI_INT_TRIGGER(4) }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F5, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F5, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F5, \ + .pci_int_reg = ISR_MSI_INT_TRIGGER(5) }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F6, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F6, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F6, \ + .pci_int_reg = ISR_MSI_INT_TRIGGER(6) }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F7, \ + 
.tgt_status_reg = ISR_INT_TARGET_STATUS_F7, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F7, \ + .pci_int_reg = ISR_MSI_INT_TRIGGER(7) }, \ +} + +#endif /* __NETXEN_NIC_HDR_H_ */ diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c new file mode 100644 index 0000000..1cd39c9 --- /dev/null +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c @@ -0,0 +1,2570 @@ +/* + * Copyright (C) 2003 - 2009 NetXen, Inc. + * Copyright (C) 2009 - QLogic Corporation. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * The full GNU General Public License is included in this distribution + * in the file called "COPYING". + * + */ + +#include +#include +#include "netxen_nic.h" +#include "netxen_nic_hw.h" + +#include + +#define MASK(n) ((1ULL<<(n))-1) +#define MN_WIN(addr) (((addr & 0x1fc0000) >> 1) | ((addr >> 25) & 0x3ff)) +#define OCM_WIN(addr) (((addr & 0x1ff0000) >> 1) | ((addr >> 25) & 0x3ff)) +#define MS_WIN(addr) (addr & 0x0ffc0000) + +#define GET_MEM_OFFS_2M(addr) (addr & MASK(18)) + +#define CRB_BLK(off) ((off >> 20) & 0x3f) +#define CRB_SUBBLK(off) ((off >> 16) & 0xf) +#define CRB_WINDOW_2M (0x130060) +#define CRB_HI(off) ((crb_hub_agt[CRB_BLK(off)] << 20) | ((off) & 0xf0000)) +#define CRB_INDIRECT_2M (0x1e0000UL) + +static void netxen_nic_io_write_128M(struct netxen_adapter *adapter, + void __iomem *addr, u32 data); +static u32 netxen_nic_io_read_128M(struct netxen_adapter *adapter, + void __iomem *addr); + +#define PCI_OFFSET_FIRST_RANGE(adapter, off) \ + ((adapter)->ahw.pci_base0 + (off)) +#define PCI_OFFSET_SECOND_RANGE(adapter, off) \ + ((adapter)->ahw.pci_base1 + (off) - SECOND_PAGE_GROUP_START) +#define PCI_OFFSET_THIRD_RANGE(adapter, off) \ + ((adapter)->ahw.pci_base2 + (off) - THIRD_PAGE_GROUP_START) + +static void __iomem *pci_base_offset(struct netxen_adapter *adapter, + unsigned long off) +{ + if (ADDR_IN_RANGE(off, FIRST_PAGE_GROUP_START, FIRST_PAGE_GROUP_END)) + return PCI_OFFSET_FIRST_RANGE(adapter, off); + + if (ADDR_IN_RANGE(off, SECOND_PAGE_GROUP_START, SECOND_PAGE_GROUP_END)) + return PCI_OFFSET_SECOND_RANGE(adapter, off); + + if (ADDR_IN_RANGE(off, THIRD_PAGE_GROUP_START, THIRD_PAGE_GROUP_END)) + return PCI_OFFSET_THIRD_RANGE(adapter, off); + + return NULL; +} + +static crb_128M_2M_block_map_t +crb_128M_2M_map[64] __cacheline_aligned_in_smp = { + {{{0, 0, 0, 0} } }, /* 0: PCI */ + {{{1, 0x0100000, 0x0102000, 0x120000}, /* 1: PCIE */ + {1, 0x0110000, 0x0120000, 0x130000}, + {1, 0x0120000, 0x0122000, 0x124000}, + {1, 0x0130000, 0x0132000, 0x126000}, + {1, 0x0140000, 0x0142000, 0x128000}, + {1, 0x0150000, 0x0152000, 0x12a000}, + {1, 0x0160000, 0x0170000, 0x110000}, + {1, 0x0170000, 0x0172000, 0x12e000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 
0x000000}, + {1, 0x01e0000, 0x01e0800, 0x122000}, + {0, 0x0000000, 0x0000000, 0x000000} } }, + {{{1, 0x0200000, 0x0210000, 0x180000} } },/* 2: MN */ + {{{0, 0, 0, 0} } }, /* 3: */ + {{{1, 0x0400000, 0x0401000, 0x169000} } },/* 4: P2NR1 */ + {{{1, 0x0500000, 0x0510000, 0x140000} } },/* 5: SRE */ + {{{1, 0x0600000, 0x0610000, 0x1c0000} } },/* 6: NIU */ + {{{1, 0x0700000, 0x0704000, 0x1b8000} } },/* 7: QM */ + {{{1, 0x0800000, 0x0802000, 0x170000}, /* 8: SQM0 */ + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {1, 0x08f0000, 0x08f2000, 0x172000} } }, + {{{1, 0x0900000, 0x0902000, 0x174000}, /* 9: SQM1*/ + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {1, 0x09f0000, 0x09f2000, 0x176000} } }, + {{{0, 0x0a00000, 0x0a02000, 0x178000}, /* 10: SQM2*/ + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {1, 0x0af0000, 0x0af2000, 0x17a000} } }, + {{{0, 0x0b00000, 0x0b02000, 0x17c000}, /* 11: SQM3*/ + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {1, 0x0bf0000, 0x0bf2000, 0x17e000} } }, + {{{1, 0x0c00000, 0x0c04000, 0x1d4000} } },/* 12: I2Q */ + {{{1, 0x0d00000, 0x0d04000, 0x1a4000} } },/* 13: TMR */ + {{{1, 0x0e00000, 0x0e04000, 0x1a0000} } },/* 14: ROMUSB */ + {{{1, 0x0f00000, 0x0f01000, 0x164000} } },/* 15: PEG4 */ + {{{0, 0x1000000, 0x1004000, 0x1a8000} } },/* 16: XDMA */ + {{{1, 0x1100000, 0x1101000, 0x160000} } },/* 17: PEG0 */ + {{{1, 0x1200000, 0x1201000, 0x161000} } },/* 18: PEG1 */ + {{{1, 0x1300000, 0x1301000, 0x162000} } },/* 19: PEG2 */ + {{{1, 0x1400000, 0x1401000, 0x163000} } },/* 20: PEG3 */ + {{{1, 0x1500000, 0x1501000, 0x165000} 
} },/* 21: P2ND */ + {{{1, 0x1600000, 0x1601000, 0x166000} } },/* 22: P2NI */ + {{{0, 0, 0, 0} } }, /* 23: */ + {{{0, 0, 0, 0} } }, /* 24: */ + {{{0, 0, 0, 0} } }, /* 25: */ + {{{0, 0, 0, 0} } }, /* 26: */ + {{{0, 0, 0, 0} } }, /* 27: */ + {{{0, 0, 0, 0} } }, /* 28: */ + {{{1, 0x1d00000, 0x1d10000, 0x190000} } },/* 29: MS */ + {{{1, 0x1e00000, 0x1e01000, 0x16a000} } },/* 30: P2NR2 */ + {{{1, 0x1f00000, 0x1f10000, 0x150000} } },/* 31: EPG */ + {{{0} } }, /* 32: PCI */ + {{{1, 0x2100000, 0x2102000, 0x120000}, /* 33: PCIE */ + {1, 0x2110000, 0x2120000, 0x130000}, + {1, 0x2120000, 0x2122000, 0x124000}, + {1, 0x2130000, 0x2132000, 0x126000}, + {1, 0x2140000, 0x2142000, 0x128000}, + {1, 0x2150000, 0x2152000, 0x12a000}, + {1, 0x2160000, 0x2170000, 0x110000}, + {1, 0x2170000, 0x2172000, 0x12e000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000} } }, + {{{1, 0x2200000, 0x2204000, 0x1b0000} } },/* 34: CAM */ + {{{0} } }, /* 35: */ + {{{0} } }, /* 36: */ + {{{0} } }, /* 37: */ + {{{0} } }, /* 38: */ + {{{0} } }, /* 39: */ + {{{1, 0x2800000, 0x2804000, 0x1a4000} } },/* 40: TMR */ + {{{1, 0x2900000, 0x2901000, 0x16b000} } },/* 41: P2NR3 */ + {{{1, 0x2a00000, 0x2a00400, 0x1ac400} } },/* 42: RPMX1 */ + {{{1, 0x2b00000, 0x2b00400, 0x1ac800} } },/* 43: RPMX2 */ + {{{1, 0x2c00000, 0x2c00400, 0x1acc00} } },/* 44: RPMX3 */ + {{{1, 0x2d00000, 0x2d00400, 0x1ad000} } },/* 45: RPMX4 */ + {{{1, 0x2e00000, 0x2e00400, 0x1ad400} } },/* 46: RPMX5 */ + {{{1, 0x2f00000, 0x2f00400, 0x1ad800} } },/* 47: RPMX6 */ + {{{1, 0x3000000, 0x3000400, 0x1adc00} } },/* 48: RPMX7 */ + {{{0, 0x3100000, 0x3104000, 0x1a8000} } },/* 49: XDMA */ + {{{1, 0x3200000, 0x3204000, 0x1d4000} } },/* 50: I2Q */ + {{{1, 0x3300000, 0x3304000, 0x1a0000} } },/* 51: ROMUSB */ + {{{0} } }, /* 52: */ + {{{1, 0x3500000, 0x3500400, 0x1ac000} } },/* 53: RPMX0 */ + {{{1, 0x3600000, 0x3600400, 0x1ae000} } },/* 54: RPMX8 */ + {{{1, 0x3700000, 0x3700400, 0x1ae400} } },/* 55: RPMX9 */ + {{{1, 0x3800000, 0x3804000, 0x1d0000} } },/* 56: OCM0 */ + {{{1, 0x3900000, 0x3904000, 0x1b4000} } },/* 57: CRYPTO */ + {{{1, 0x3a00000, 0x3a04000, 0x1d8000} } },/* 58: SMB */ + {{{0} } }, /* 59: I2C0 */ + {{{0} } }, /* 60: I2C1 */ + {{{1, 0x3d00000, 0x3d04000, 0x1d8000} } },/* 61: LPC */ + {{{1, 0x3e00000, 0x3e01000, 0x167000} } },/* 62: P2NC */ + {{{1, 0x3f00000, 0x3f01000, 0x168000} } } /* 63: P2NR0 */ +}; + +/* + * top 12 bits of crb internal address (hub, agent) + */ +static unsigned crb_hub_agt[64] = +{ + 0, + NETXEN_HW_CRB_HUB_AGT_ADR_PS, + NETXEN_HW_CRB_HUB_AGT_ADR_MN, + NETXEN_HW_CRB_HUB_AGT_ADR_MS, + 0, + NETXEN_HW_CRB_HUB_AGT_ADR_SRE, + NETXEN_HW_CRB_HUB_AGT_ADR_NIU, + NETXEN_HW_CRB_HUB_AGT_ADR_QMN, + NETXEN_HW_CRB_HUB_AGT_ADR_SQN0, + NETXEN_HW_CRB_HUB_AGT_ADR_SQN1, + NETXEN_HW_CRB_HUB_AGT_ADR_SQN2, + NETXEN_HW_CRB_HUB_AGT_ADR_SQN3, + NETXEN_HW_CRB_HUB_AGT_ADR_I2Q, + NETXEN_HW_CRB_HUB_AGT_ADR_TIMR, + NETXEN_HW_CRB_HUB_AGT_ADR_ROMUSB, + NETXEN_HW_CRB_HUB_AGT_ADR_PGN4, + NETXEN_HW_CRB_HUB_AGT_ADR_XDMA, + NETXEN_HW_CRB_HUB_AGT_ADR_PGN0, + NETXEN_HW_CRB_HUB_AGT_ADR_PGN1, + NETXEN_HW_CRB_HUB_AGT_ADR_PGN2, + NETXEN_HW_CRB_HUB_AGT_ADR_PGN3, + NETXEN_HW_CRB_HUB_AGT_ADR_PGND, + NETXEN_HW_CRB_HUB_AGT_ADR_PGNI, + NETXEN_HW_CRB_HUB_AGT_ADR_PGS0, + NETXEN_HW_CRB_HUB_AGT_ADR_PGS1, + NETXEN_HW_CRB_HUB_AGT_ADR_PGS2, + 
NETXEN_HW_CRB_HUB_AGT_ADR_PGS3, + 0, + NETXEN_HW_CRB_HUB_AGT_ADR_PGSI, + NETXEN_HW_CRB_HUB_AGT_ADR_SN, + 0, + NETXEN_HW_CRB_HUB_AGT_ADR_EG, + 0, + NETXEN_HW_CRB_HUB_AGT_ADR_PS, + NETXEN_HW_CRB_HUB_AGT_ADR_CAM, + 0, + 0, + 0, + 0, + 0, + NETXEN_HW_CRB_HUB_AGT_ADR_TIMR, + 0, + NETXEN_HW_CRB_HUB_AGT_ADR_RPMX1, + NETXEN_HW_CRB_HUB_AGT_ADR_RPMX2, + NETXEN_HW_CRB_HUB_AGT_ADR_RPMX3, + NETXEN_HW_CRB_HUB_AGT_ADR_RPMX4, + NETXEN_HW_CRB_HUB_AGT_ADR_RPMX5, + NETXEN_HW_CRB_HUB_AGT_ADR_RPMX6, + NETXEN_HW_CRB_HUB_AGT_ADR_RPMX7, + NETXEN_HW_CRB_HUB_AGT_ADR_XDMA, + NETXEN_HW_CRB_HUB_AGT_ADR_I2Q, + NETXEN_HW_CRB_HUB_AGT_ADR_ROMUSB, + 0, + NETXEN_HW_CRB_HUB_AGT_ADR_RPMX0, + NETXEN_HW_CRB_HUB_AGT_ADR_RPMX8, + NETXEN_HW_CRB_HUB_AGT_ADR_RPMX9, + NETXEN_HW_CRB_HUB_AGT_ADR_OCM0, + 0, + NETXEN_HW_CRB_HUB_AGT_ADR_SMB, + NETXEN_HW_CRB_HUB_AGT_ADR_I2C0, + NETXEN_HW_CRB_HUB_AGT_ADR_I2C1, + 0, + NETXEN_HW_CRB_HUB_AGT_ADR_PGNC, + 0, +}; + +/* PCI Windowing for DDR regions. */ + +#define NETXEN_WINDOW_ONE 0x2000000 /*CRB Window: bit 25 of CRB address */ + +#define NETXEN_PCIE_SEM_TIMEOUT 10000 + +static int netxen_nic_set_mtu_xgb(struct netxen_adapter *adapter, int new_mtu); + +int +netxen_pcie_sem_lock(struct netxen_adapter *adapter, int sem, u32 id_reg) +{ + int done = 0, timeout = 0; + + while (!done) { + done = NXRD32(adapter, NETXEN_PCIE_REG(PCIE_SEM_LOCK(sem))); + if (done == 1) + break; + if (++timeout >= NETXEN_PCIE_SEM_TIMEOUT) + return -EIO; + msleep(1); + } + + if (id_reg) + NXWR32(adapter, id_reg, adapter->portnum); + + return 0; +} + +void +netxen_pcie_sem_unlock(struct netxen_adapter *adapter, int sem) +{ + NXRD32(adapter, NETXEN_PCIE_REG(PCIE_SEM_UNLOCK(sem))); +} + +static int netxen_niu_xg_init_port(struct netxen_adapter *adapter, int port) +{ + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + NXWR32(adapter, NETXEN_NIU_XGE_CONFIG_1+(0x10000*port), 0x1447); + NXWR32(adapter, NETXEN_NIU_XGE_CONFIG_0+(0x10000*port), 0x5); + } + + return 0; +} + +/* Disable an XG interface */ +static int netxen_niu_disable_xg_port(struct netxen_adapter *adapter) +{ + __u32 mac_cfg; + u32 port = adapter->physical_port; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + return 0; + + if (port >= NETXEN_NIU_MAX_XG_PORTS) + return -EINVAL; + + mac_cfg = 0; + if (NXWR32(adapter, + NETXEN_NIU_XGE_CONFIG_0 + (0x10000 * port), mac_cfg)) + return -EIO; + return 0; +} + +#define NETXEN_UNICAST_ADDR(port, index) \ + (NETXEN_UNICAST_ADDR_BASE+(port*32)+(index*8)) +#define NETXEN_MCAST_ADDR(port, index) \ + (NETXEN_MULTICAST_ADDR_BASE+(port*0x80)+(index*8)) +#define MAC_HI(addr) \ + ((addr[2] << 16) | (addr[1] << 8) | (addr[0])) +#define MAC_LO(addr) \ + ((addr[5] << 16) | (addr[4] << 8) | (addr[3])) + +static int netxen_p2_nic_set_promisc(struct netxen_adapter *adapter, u32 mode) +{ + u32 mac_cfg; + u32 cnt = 0; + __u32 reg = 0x0200; + u32 port = adapter->physical_port; + u16 board_type = adapter->ahw.board_type; + + if (port >= NETXEN_NIU_MAX_XG_PORTS) + return -EINVAL; + + mac_cfg = NXRD32(adapter, NETXEN_NIU_XGE_CONFIG_0 + (0x10000 * port)); + mac_cfg &= ~0x4; + NXWR32(adapter, NETXEN_NIU_XGE_CONFIG_0 + (0x10000 * port), mac_cfg); + + if ((board_type == NETXEN_BRDTYPE_P2_SB31_10G_IMEZ) || + (board_type == NETXEN_BRDTYPE_P2_SB31_10G_HMEZ)) + reg = (0x20 << port); + + NXWR32(adapter, NETXEN_NIU_FRAME_COUNT_SELECT, reg); + + mdelay(10); + + while (NXRD32(adapter, NETXEN_NIU_FRAME_COUNT) && ++cnt < 20) + mdelay(10); + + if (cnt < 20) { + + reg = NXRD32(adapter, + NETXEN_NIU_XGE_CONFIG_1 + (0x10000 * port)); + + if (mode == 
NETXEN_NIU_PROMISC_MODE) + reg = (reg | 0x2000UL); + else + reg = (reg & ~0x2000UL); + + if (mode == NETXEN_NIU_ALLMULTI_MODE) + reg = (reg | 0x1000UL); + else + reg = (reg & ~0x1000UL); + + NXWR32(adapter, + NETXEN_NIU_XGE_CONFIG_1 + (0x10000 * port), reg); + } + + mac_cfg |= 0x4; + NXWR32(adapter, NETXEN_NIU_XGE_CONFIG_0 + (0x10000 * port), mac_cfg); + + return 0; +} + +static int netxen_p2_nic_set_mac_addr(struct netxen_adapter *adapter, u8 *addr) +{ + u32 mac_hi, mac_lo; + u32 reg_hi, reg_lo; + + u8 phy = adapter->physical_port; + + if (phy >= NETXEN_NIU_MAX_XG_PORTS) + return -EINVAL; + + mac_lo = ((u32)addr[0] << 16) | ((u32)addr[1] << 24); + mac_hi = addr[2] | ((u32)addr[3] << 8) | + ((u32)addr[4] << 16) | ((u32)addr[5] << 24); + + reg_lo = NETXEN_NIU_XGE_STATION_ADDR_0_1 + (0x10000 * phy); + reg_hi = NETXEN_NIU_XGE_STATION_ADDR_0_HI + (0x10000 * phy); + + /* write twice to flush */ + if (NXWR32(adapter, reg_lo, mac_lo) || NXWR32(adapter, reg_hi, mac_hi)) + return -EIO; + if (NXWR32(adapter, reg_lo, mac_lo) || NXWR32(adapter, reg_hi, mac_hi)) + return -EIO; + + return 0; +} + +static int +netxen_nic_enable_mcast_filter(struct netxen_adapter *adapter) +{ + u32 val = 0; + u16 port = adapter->physical_port; + u8 *addr = adapter->mac_addr; + + if (adapter->mc_enabled) + return 0; + + val = NXRD32(adapter, NETXEN_MAC_ADDR_CNTL_REG); + val |= (1UL << (28+port)); + NXWR32(adapter, NETXEN_MAC_ADDR_CNTL_REG, val); + + /* add broadcast addr to filter */ + val = 0xffffff; + NXWR32(adapter, NETXEN_UNICAST_ADDR(port, 0), val); + NXWR32(adapter, NETXEN_UNICAST_ADDR(port, 0)+4, val); + + /* add station addr to filter */ + val = MAC_HI(addr); + NXWR32(adapter, NETXEN_UNICAST_ADDR(port, 1), val); + val = MAC_LO(addr); + NXWR32(adapter, NETXEN_UNICAST_ADDR(port, 1)+4, val); + + adapter->mc_enabled = 1; + return 0; +} + +static int +netxen_nic_disable_mcast_filter(struct netxen_adapter *adapter) +{ + u32 val = 0; + u16 port = adapter->physical_port; + u8 *addr = adapter->mac_addr; + + if (!adapter->mc_enabled) + return 0; + + val = NXRD32(adapter, NETXEN_MAC_ADDR_CNTL_REG); + val &= ~(1UL << (28+port)); + NXWR32(adapter, NETXEN_MAC_ADDR_CNTL_REG, val); + + val = MAC_HI(addr); + NXWR32(adapter, NETXEN_UNICAST_ADDR(port, 0), val); + val = MAC_LO(addr); + NXWR32(adapter, NETXEN_UNICAST_ADDR(port, 0)+4, val); + + NXWR32(adapter, NETXEN_UNICAST_ADDR(port, 1), 0); + NXWR32(adapter, NETXEN_UNICAST_ADDR(port, 1)+4, 0); + + adapter->mc_enabled = 0; + return 0; +} + +static int +netxen_nic_set_mcast_addr(struct netxen_adapter *adapter, + int index, u8 *addr) +{ + u32 hi = 0, lo = 0; + u16 port = adapter->physical_port; + + lo = MAC_LO(addr); + hi = MAC_HI(addr); + + NXWR32(adapter, NETXEN_MCAST_ADDR(port, index), hi); + NXWR32(adapter, NETXEN_MCAST_ADDR(port, index)+4, lo); + + return 0; +} + +static void netxen_p2_nic_set_multi(struct net_device *netdev) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + struct netdev_hw_addr *ha; + u8 null_addr[ETH_ALEN]; + int i; + + eth_zero_addr(null_addr); + + if (netdev->flags & IFF_PROMISC) { + + adapter->set_promisc(adapter, + NETXEN_NIU_PROMISC_MODE); + + /* Full promiscuous mode */ + netxen_nic_disable_mcast_filter(adapter); + + return; + } + + if (netdev_mc_empty(netdev)) { + adapter->set_promisc(adapter, + NETXEN_NIU_NON_PROMISC_MODE); + netxen_nic_disable_mcast_filter(adapter); + return; + } + + adapter->set_promisc(adapter, NETXEN_NIU_ALLMULTI_MODE); + if (netdev->flags & IFF_ALLMULTI || + netdev_mc_count(netdev) > adapter->max_mc_count) { + 
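/*
 * A short worked example of the filter-entry packing used just above,
 * assuming a (hypothetical) station address of 00:0e:1e:aa:bb:cc.  The
 * MAC_HI()/MAC_LO() macros split the six bytes across two 32-bit CRB
 * words, three bytes each, lowest-numbered byte in the least-significant
 * position:
 *
 *   MAC_HI(addr) == (0x1e << 16) | (0x0e << 8) | 0x00 == 0x001e0e00
 *   MAC_LO(addr) == (0xcc << 16) | (0xbb << 8) | 0xaa == 0x00ccbbaa
 *
 * When the multicast list overflows adapter->max_mc_count (or
 * IFF_ALLMULTI is set), this branch skips that per-address programming,
 * drops to hardware all-multicast mode and disables the filter instead.
 */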
netxen_nic_disable_mcast_filter(adapter); + return; + } + + netxen_nic_enable_mcast_filter(adapter); + + i = 0; + netdev_for_each_mc_addr(ha, netdev) + netxen_nic_set_mcast_addr(adapter, i++, ha->addr); + + /* Clear out remaining addresses */ + while (i < adapter->max_mc_count) + netxen_nic_set_mcast_addr(adapter, i++, null_addr); +} + +static int +netxen_send_cmd_descs(struct netxen_adapter *adapter, + struct cmd_desc_type0 *cmd_desc_arr, int nr_desc) +{ + u32 i, producer, consumer; + struct netxen_cmd_buffer *pbuf; + struct cmd_desc_type0 *cmd_desc; + struct nx_host_tx_ring *tx_ring; + + i = 0; + + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + return -EIO; + + tx_ring = adapter->tx_ring; + __netif_tx_lock_bh(tx_ring->txq); + + producer = tx_ring->producer; + consumer = tx_ring->sw_consumer; + + if (nr_desc >= netxen_tx_avail(tx_ring)) { + netif_tx_stop_queue(tx_ring->txq); + smp_mb(); + if (netxen_tx_avail(tx_ring) > nr_desc) { + if (netxen_tx_avail(tx_ring) > TX_STOP_THRESH) + netif_tx_wake_queue(tx_ring->txq); + } else { + __netif_tx_unlock_bh(tx_ring->txq); + return -EBUSY; + } + } + + do { + cmd_desc = &cmd_desc_arr[i]; + + pbuf = &tx_ring->cmd_buf_arr[producer]; + pbuf->skb = NULL; + pbuf->frag_count = 0; + + memcpy(&tx_ring->desc_head[producer], + &cmd_desc_arr[i], sizeof(struct cmd_desc_type0)); + + producer = get_next_index(producer, tx_ring->num_desc); + i++; + + } while (i != nr_desc); + + tx_ring->producer = producer; + + netxen_nic_update_cmd_producer(adapter, tx_ring); + + __netif_tx_unlock_bh(tx_ring->txq); + + return 0; +} + +static int +nx_p3_sre_macaddr_change(struct netxen_adapter *adapter, u8 *addr, unsigned op) +{ + nx_nic_req_t req; + nx_mac_req_t *mac_req; + u64 word; + + memset(&req, 0, sizeof(nx_nic_req_t)); + req.qhdr = cpu_to_le64(NX_NIC_REQUEST << 23); + + word = NX_MAC_EVENT | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + mac_req = (nx_mac_req_t *)&req.words[0]; + mac_req->op = op; + memcpy(mac_req->mac_addr, addr, ETH_ALEN); + + return netxen_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); +} + +static int nx_p3_nic_add_mac(struct netxen_adapter *adapter, + const u8 *addr, struct list_head *del_list) +{ + struct list_head *head; + nx_mac_list_t *cur; + + /* look up if already exists */ + list_for_each(head, del_list) { + cur = list_entry(head, nx_mac_list_t, list); + + if (ether_addr_equal(addr, cur->mac_addr)) { + list_move_tail(head, &adapter->mac_list); + return 0; + } + } + + cur = kzalloc(sizeof(nx_mac_list_t), GFP_ATOMIC); + if (cur == NULL) + return -ENOMEM; + + memcpy(cur->mac_addr, addr, ETH_ALEN); + list_add_tail(&cur->list, &adapter->mac_list); + return nx_p3_sre_macaddr_change(adapter, + cur->mac_addr, NETXEN_MAC_ADD); +} + +static void netxen_p3_nic_set_multi(struct net_device *netdev) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + struct netdev_hw_addr *ha; + static const u8 bcast_addr[ETH_ALEN] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }; + u32 mode = VPORT_MISS_MODE_DROP; + LIST_HEAD(del_list); + struct list_head *head; + nx_mac_list_t *cur; + + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + return; + + list_splice_tail_init(&adapter->mac_list, &del_list); + + nx_p3_nic_add_mac(adapter, adapter->mac_addr, &del_list); + nx_p3_nic_add_mac(adapter, bcast_addr, &del_list); + + if (netdev->flags & IFF_PROMISC) { + mode = VPORT_MISS_MODE_ACCEPT_ALL; + goto send_fw_cmd; + } + + if ((netdev->flags & IFF_ALLMULTI) || + (netdev_mc_count(netdev) > adapter->max_mc_count)) { + mode = 
VPORT_MISS_MODE_ACCEPT_MULTI; + goto send_fw_cmd; + } + + if (!netdev_mc_empty(netdev)) { + netdev_for_each_mc_addr(ha, netdev) + nx_p3_nic_add_mac(adapter, ha->addr, &del_list); + } + +send_fw_cmd: + adapter->set_promisc(adapter, mode); + head = &del_list; + while (!list_empty(head)) { + cur = list_entry(head->next, nx_mac_list_t, list); + + nx_p3_sre_macaddr_change(adapter, + cur->mac_addr, NETXEN_MAC_DEL); + list_del(&cur->list); + kfree(cur); + } +} + +static int netxen_p3_nic_set_promisc(struct netxen_adapter *adapter, u32 mode) +{ + nx_nic_req_t req; + u64 word; + + memset(&req, 0, sizeof(nx_nic_req_t)); + + req.qhdr = cpu_to_le64(NX_HOST_REQUEST << 23); + + word = NX_NIC_H2C_OPCODE_PROXY_SET_VPORT_MISS_MODE | + ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + req.words[0] = cpu_to_le64(mode); + + return netxen_send_cmd_descs(adapter, + (struct cmd_desc_type0 *)&req, 1); +} + +void netxen_p3_free_mac_list(struct netxen_adapter *adapter) +{ + nx_mac_list_t *cur; + struct list_head *head = &adapter->mac_list; + + while (!list_empty(head)) { + cur = list_entry(head->next, nx_mac_list_t, list); + nx_p3_sre_macaddr_change(adapter, + cur->mac_addr, NETXEN_MAC_DEL); + list_del(&cur->list); + kfree(cur); + } +} + +static int netxen_p3_nic_set_mac_addr(struct netxen_adapter *adapter, u8 *addr) +{ + /* assuming caller has already copied new addr to netdev */ + netxen_p3_nic_set_multi(adapter->netdev); + return 0; +} + +#define NETXEN_CONFIG_INTR_COALESCE 3 + +/* + * Send the interrupt coalescing parameter set by ethtool to the card. + */ +int netxen_config_intr_coalesce(struct netxen_adapter *adapter) +{ + nx_nic_req_t req; + u64 word[6]; + int rv, i; + + memset(&req, 0, sizeof(nx_nic_req_t)); + memset(word, 0, sizeof(word)); + + req.qhdr = cpu_to_le64(NX_HOST_REQUEST << 23); + + word[0] = NETXEN_CONFIG_INTR_COALESCE | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word[0]); + + memcpy(&word[0], &adapter->coal, sizeof(adapter->coal)); + for (i = 0; i < 6; i++) + req.words[i] = cpu_to_le64(word[i]); + + rv = netxen_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) { + printk(KERN_ERR "ERROR. Could not send " + "interrupt coalescing parameters\n"); + } + + return rv; +} + +int netxen_config_hw_lro(struct netxen_adapter *adapter, int enable) +{ + nx_nic_req_t req; + u64 word; + int rv = 0; + + if (!test_bit(__NX_FW_ATTACHED, &adapter->state)) + return 0; + + memset(&req, 0, sizeof(nx_nic_req_t)); + + req.qhdr = cpu_to_le64(NX_HOST_REQUEST << 23); + + word = NX_NIC_H2C_OPCODE_CONFIG_HW_LRO | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + req.words[0] = cpu_to_le64(enable); + + rv = netxen_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) { + printk(KERN_ERR "ERROR. Could not send " + "configure hw lro request\n"); + } + + return rv; +} + +int netxen_config_bridged_mode(struct netxen_adapter *adapter, int enable) +{ + nx_nic_req_t req; + u64 word; + int rv = 0; + + if (!!(adapter->flags & NETXEN_NIC_BRIDGE_ENABLED) == enable) + return rv; + + memset(&req, 0, sizeof(nx_nic_req_t)); + + req.qhdr = cpu_to_le64(NX_HOST_REQUEST << 23); + + word = NX_NIC_H2C_OPCODE_CONFIG_BRIDGING | + ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + req.words[0] = cpu_to_le64(enable); + + rv = netxen_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) { + printk(KERN_ERR "ERROR. 
Could not send " + "configure bridge mode request\n"); + } + + adapter->flags ^= NETXEN_NIC_BRIDGE_ENABLED; + + return rv; +} + + +#define RSS_HASHTYPE_IP_TCP 0x3 + +int netxen_config_rss(struct netxen_adapter *adapter, int enable) +{ + nx_nic_req_t req; + u64 word; + int i, rv; + + static const u64 key[] = { + 0xbeac01fa6a42b73bULL, 0x8030f20c77cb2da3ULL, + 0xae7b30b4d0ca2bcbULL, 0x43a38fb04167253dULL, + 0x255b0ec26d5a56daULL + }; + + + memset(&req, 0, sizeof(nx_nic_req_t)); + req.qhdr = cpu_to_le64(NX_HOST_REQUEST << 23); + + word = NX_NIC_H2C_OPCODE_CONFIG_RSS | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + /* + * RSS request: + * bits 3-0: hash_method + * 5-4: hash_type_ipv4 + * 7-6: hash_type_ipv6 + * 8: enable + * 9: use indirection table + * 47-10: reserved + * 63-48: indirection table mask + */ + word = ((u64)(RSS_HASHTYPE_IP_TCP & 0x3) << 4) | + ((u64)(RSS_HASHTYPE_IP_TCP & 0x3) << 6) | + ((u64)(enable & 0x1) << 8) | + ((0x7ULL) << 48); + req.words[0] = cpu_to_le64(word); + for (i = 0; i < ARRAY_SIZE(key); i++) + req.words[i+1] = cpu_to_le64(key[i]); + + + rv = netxen_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) { + printk(KERN_ERR "%s: could not configure RSS\n", + adapter->netdev->name); + } + + return rv; +} + +int netxen_config_ipaddr(struct netxen_adapter *adapter, __be32 ip, int cmd) +{ + nx_nic_req_t req; + u64 word; + int rv; + + memset(&req, 0, sizeof(nx_nic_req_t)); + req.qhdr = cpu_to_le64(NX_HOST_REQUEST << 23); + + word = NX_NIC_H2C_OPCODE_CONFIG_IPADDR | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + req.words[0] = cpu_to_le64(cmd); + memcpy(&req.words[1], &ip, sizeof(u32)); + + rv = netxen_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) { + printk(KERN_ERR "%s: could not notify %s IP 0x%x request\n", + adapter->netdev->name, + (cmd == NX_IP_UP) ? 
"Add" : "Remove", ip); + } + return rv; +} + +int netxen_linkevent_request(struct netxen_adapter *adapter, int enable) +{ + nx_nic_req_t req; + u64 word; + int rv; + + memset(&req, 0, sizeof(nx_nic_req_t)); + req.qhdr = cpu_to_le64(NX_HOST_REQUEST << 23); + + word = NX_NIC_H2C_OPCODE_GET_LINKEVENT | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + req.words[0] = cpu_to_le64(enable | (enable << 8)); + + rv = netxen_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) { + printk(KERN_ERR "%s: could not configure link notification\n", + adapter->netdev->name); + } + + return rv; +} + +int netxen_send_lro_cleanup(struct netxen_adapter *adapter) +{ + nx_nic_req_t req; + u64 word; + int rv; + + if (!test_bit(__NX_FW_ATTACHED, &adapter->state)) + return 0; + + memset(&req, 0, sizeof(nx_nic_req_t)); + req.qhdr = cpu_to_le64(NX_HOST_REQUEST << 23); + + word = NX_NIC_H2C_OPCODE_LRO_REQUEST | + ((u64)adapter->portnum << 16) | + ((u64)NX_NIC_LRO_REQUEST_CLEANUP << 56) ; + + req.req_hdr = cpu_to_le64(word); + + rv = netxen_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) { + printk(KERN_ERR "%s: could not cleanup lro flows\n", + adapter->netdev->name); + } + return rv; +} + +/* + * netxen_nic_change_mtu - Change the Maximum Transfer Unit + * @returns 0 on success, negative on failure + */ + +#define MTU_FUDGE_FACTOR 100 + +int netxen_nic_change_mtu(struct net_device *netdev, int mtu) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + if (adapter->set_mtu) + rc = adapter->set_mtu(adapter, mtu); + + if (!rc) + netdev->mtu = mtu; + + return rc; +} + +static int netxen_get_flash_block(struct netxen_adapter *adapter, int base, + int size, __le32 * buf) +{ + int i, v, addr; + __le32 *ptr32; + int ret; + + addr = base; + ptr32 = buf; + for (i = 0; i < size / sizeof(u32); i++) { + ret = netxen_rom_fast_read(adapter, addr, &v); + if (ret) + return ret; + + *ptr32 = cpu_to_le32(v); + ptr32++; + addr += sizeof(u32); + } + if ((char *)buf + size > (char *)ptr32) { + __le32 local; + ret = netxen_rom_fast_read(adapter, addr, &v); + if (ret) + return ret; + local = cpu_to_le32(v); + memcpy(ptr32, &local, (char *)buf + size - (char *)ptr32); + } + + return 0; +} + +int netxen_get_flash_mac_addr(struct netxen_adapter *adapter, u64 *mac) +{ + __le32 *pmac = (__le32 *) mac; + u32 offset; + + offset = NX_FW_MAC_ADDR_OFFSET + (adapter->portnum * sizeof(u64)); + + if (netxen_get_flash_block(adapter, offset, sizeof(u64), pmac) == -1) + return -1; + + if (*mac == ~0ULL) { + + offset = NX_OLD_MAC_ADDR_OFFSET + + (adapter->portnum * sizeof(u64)); + + if (netxen_get_flash_block(adapter, + offset, sizeof(u64), pmac) == -1) + return -1; + + if (*mac == ~0ULL) + return -1; + } + return 0; +} + +int netxen_p3_get_mac_addr(struct netxen_adapter *adapter, u64 *mac) +{ + uint32_t crbaddr, mac_hi, mac_lo; + int pci_func = adapter->ahw.pci_func; + + crbaddr = CRB_MAC_BLOCK_START + + (4 * ((pci_func/2) * 3)) + (4 * (pci_func & 1)); + + mac_lo = NXRD32(adapter, crbaddr); + mac_hi = NXRD32(adapter, crbaddr+4); + + if (pci_func & 1) + *mac = le64_to_cpu((mac_lo >> 16) | ((u64)mac_hi << 16)); + else + *mac = le64_to_cpu((u64)mac_lo | ((u64)mac_hi << 32)); + + return 0; +} + +/* + * Changes the CRB window to the specified window. 
+ */ +static void +netxen_nic_pci_set_crbwindow_128M(struct netxen_adapter *adapter, + u32 window) +{ + void __iomem *offset; + int count = 10; + u8 func = adapter->ahw.pci_func; + + if (adapter->ahw.crb_win == window) + return; + + offset = PCI_OFFSET_SECOND_RANGE(adapter, + NETXEN_PCIX_PH_REG(PCIE_CRB_WINDOW_REG(func))); + + writel(window, offset); + do { + if (window == readl(offset)) + break; + + if (printk_ratelimit()) + dev_warn(&adapter->pdev->dev, + "failed to set CRB window to %d\n", + (window == NETXEN_WINDOW_ONE)); + udelay(1); + + } while (--count > 0); + + if (count > 0) + adapter->ahw.crb_win = window; +} + +/* + * Returns < 0 if off is not valid, + * 1 if window access is needed. 'off' is set to offset from + * CRB space in 128M pci map + * 0 if no window access is needed. 'off' is set to 2M addr + * In: 'off' is offset from base in 128M pci map + */ +static int +netxen_nic_pci_get_crb_addr_2M(struct netxen_adapter *adapter, + ulong off, void __iomem **addr) +{ + crb_128M_2M_sub_block_map_t *m; + + + if ((off >= NETXEN_CRB_MAX) || (off < NETXEN_PCI_CRBSPACE)) + return -EINVAL; + + off -= NETXEN_PCI_CRBSPACE; + + /* + * Try direct map + */ + m = &crb_128M_2M_map[CRB_BLK(off)].sub_block[CRB_SUBBLK(off)]; + + if (m->valid && (m->start_128M <= off) && (m->end_128M > off)) { + *addr = adapter->ahw.pci_base0 + m->start_2M + + (off - m->start_128M); + return 0; + } + + /* + * Not in direct map, use crb window + */ + *addr = adapter->ahw.pci_base0 + CRB_INDIRECT_2M + + (off & MASK(16)); + return 1; +} + +/* + * In: 'off' is offset from CRB space in 128M pci map + * Out: 'off' is 2M pci map addr + * side effect: lock crb window + */ +static void +netxen_nic_pci_set_crbwindow_2M(struct netxen_adapter *adapter, ulong off) +{ + u32 window; + void __iomem *addr = adapter->ahw.pci_base0 + CRB_WINDOW_2M; + + off -= NETXEN_PCI_CRBSPACE; + + window = CRB_HI(off); + + writel(window, addr); + if (readl(addr) != window) { + if (printk_ratelimit()) + dev_warn(&adapter->pdev->dev, + "failed to set CRB window to %d off 0x%lx\n", + window, off); + } +} + +static void __iomem * +netxen_nic_map_indirect_address_128M(struct netxen_adapter *adapter, + ulong win_off, void __iomem **mem_ptr) +{ + ulong off = win_off; + void __iomem *addr; + resource_size_t mem_base; + + if (ADDR_IN_WINDOW1(win_off)) + off = NETXEN_CRB_NORMAL(win_off); + + addr = pci_base_offset(adapter, off); + if (addr) + return addr; + + if (adapter->ahw.pci_len0 == 0) + off -= NETXEN_PCI_CRBSPACE; + + mem_base = pci_resource_start(adapter->pdev, 0); + *mem_ptr = ioremap(mem_base + (off & PAGE_MASK), PAGE_SIZE); + if (*mem_ptr) + addr = *mem_ptr + (off & (PAGE_SIZE - 1)); + + return addr; +} + +static int +netxen_nic_hw_write_wx_128M(struct netxen_adapter *adapter, ulong off, u32 data) +{ + unsigned long flags; + void __iomem *addr, *mem_ptr = NULL; + + addr = netxen_nic_map_indirect_address_128M(adapter, off, &mem_ptr); + if (!addr) + return -EIO; + + if (ADDR_IN_WINDOW1(off)) { /* Window 1 */ + netxen_nic_io_write_128M(adapter, addr, data); + } else { /* Window 0 */ + write_lock_irqsave(&adapter->ahw.crb_lock, flags); + netxen_nic_pci_set_crbwindow_128M(adapter, 0); + writel(data, addr); + netxen_nic_pci_set_crbwindow_128M(adapter, + NETXEN_WINDOW_ONE); + write_unlock_irqrestore(&adapter->ahw.crb_lock, flags); + } + + if (mem_ptr) + iounmap(mem_ptr); + + return 0; +} + +static u32 +netxen_nic_hw_read_wx_128M(struct netxen_adapter *adapter, ulong off) +{ + unsigned long flags; + void __iomem *addr, *mem_ptr = NULL; + u32 data; + + 
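/*
 * Rough shape of the 128M CRB access path used below: offsets that
 * decode into CRB window 1 are read straight through the mapped BAR,
 * serialised only by the shared side of ahw.crb_lock; window-0 offsets
 * take the exclusive side, switch the per-function CRB window register
 * to window 0, perform the readl(), and restore NETXEN_WINDOW_ONE
 * before releasing the lock.  The write path above follows the same
 * split with writel().
 */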
addr = netxen_nic_map_indirect_address_128M(adapter, off, &mem_ptr); + if (!addr) + return -EIO; + + if (ADDR_IN_WINDOW1(off)) { /* Window 1 */ + data = netxen_nic_io_read_128M(adapter, addr); + } else { /* Window 0 */ + write_lock_irqsave(&adapter->ahw.crb_lock, flags); + netxen_nic_pci_set_crbwindow_128M(adapter, 0); + data = readl(addr); + netxen_nic_pci_set_crbwindow_128M(adapter, + NETXEN_WINDOW_ONE); + write_unlock_irqrestore(&adapter->ahw.crb_lock, flags); + } + + if (mem_ptr) + iounmap(mem_ptr); + + return data; +} + +static int +netxen_nic_hw_write_wx_2M(struct netxen_adapter *adapter, ulong off, u32 data) +{ + unsigned long flags; + int rv; + void __iomem *addr = NULL; + + rv = netxen_nic_pci_get_crb_addr_2M(adapter, off, &addr); + + if (rv == 0) { + writel(data, addr); + return 0; + } + + if (rv > 0) { + /* indirect access */ + write_lock_irqsave(&adapter->ahw.crb_lock, flags); + crb_win_lock(adapter); + netxen_nic_pci_set_crbwindow_2M(adapter, off); + writel(data, addr); + crb_win_unlock(adapter); + write_unlock_irqrestore(&adapter->ahw.crb_lock, flags); + return 0; + } + + dev_err(&adapter->pdev->dev, + "%s: invalid offset: 0x%016lx\n", __func__, off); + dump_stack(); + return -EIO; +} + +static u32 +netxen_nic_hw_read_wx_2M(struct netxen_adapter *adapter, ulong off) +{ + unsigned long flags; + int rv; + u32 data; + void __iomem *addr = NULL; + + rv = netxen_nic_pci_get_crb_addr_2M(adapter, off, &addr); + + if (rv == 0) + return readl(addr); + + if (rv > 0) { + /* indirect access */ + write_lock_irqsave(&adapter->ahw.crb_lock, flags); + crb_win_lock(adapter); + netxen_nic_pci_set_crbwindow_2M(adapter, off); + data = readl(addr); + crb_win_unlock(adapter); + write_unlock_irqrestore(&adapter->ahw.crb_lock, flags); + return data; + } + + dev_err(&adapter->pdev->dev, + "%s: invalid offset: 0x%016lx\n", __func__, off); + dump_stack(); + return -1; +} + +/* window 1 registers only */ +static void netxen_nic_io_write_128M(struct netxen_adapter *adapter, + void __iomem *addr, u32 data) +{ + read_lock(&adapter->ahw.crb_lock); + writel(data, addr); + read_unlock(&adapter->ahw.crb_lock); +} + +static u32 netxen_nic_io_read_128M(struct netxen_adapter *adapter, + void __iomem *addr) +{ + u32 val; + + read_lock(&adapter->ahw.crb_lock); + val = readl(addr); + read_unlock(&adapter->ahw.crb_lock); + + return val; +} + +static void netxen_nic_io_write_2M(struct netxen_adapter *adapter, + void __iomem *addr, u32 data) +{ + writel(data, addr); +} + +static u32 netxen_nic_io_read_2M(struct netxen_adapter *adapter, + void __iomem *addr) +{ + return readl(addr); +} + +void __iomem * +netxen_get_ioaddr(struct netxen_adapter *adapter, u32 offset) +{ + void __iomem *addr = NULL; + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + if ((offset < NETXEN_CRB_PCIX_HOST2) && + (offset > NETXEN_CRB_PCIX_HOST)) + addr = PCI_OFFSET_SECOND_RANGE(adapter, offset); + else + addr = NETXEN_CRB_NORMALIZE(adapter, offset); + } else { + WARN_ON(netxen_nic_pci_get_crb_addr_2M(adapter, + offset, &addr)); + } + + return addr; +} + +static int +netxen_nic_pci_set_window_128M(struct netxen_adapter *adapter, + u64 addr, u32 *start) +{ + if (ADDR_IN_RANGE(addr, NETXEN_ADDR_OCM0, NETXEN_ADDR_OCM0_MAX)) { + *start = (addr - NETXEN_ADDR_OCM0 + NETXEN_PCI_OCM0); + return 0; + } else if (ADDR_IN_RANGE(addr, + NETXEN_ADDR_OCM1, NETXEN_ADDR_OCM1_MAX)) { + *start = (addr - NETXEN_ADDR_OCM1 + NETXEN_PCI_OCM1); + return 0; + } + + return -EIO; +} + +static int +netxen_nic_pci_set_window_2M(struct netxen_adapter *adapter, + u64 
addr, u32 *start) +{ + u32 window; + + window = OCM_WIN(addr); + + writel(window, adapter->ahw.ocm_win_crb); + /* read back to flush */ + readl(adapter->ahw.ocm_win_crb); + + adapter->ahw.ocm_win = window; + *start = NETXEN_PCI_OCM0_2M + GET_MEM_OFFS_2M(addr); + return 0; +} + +static int +netxen_nic_pci_mem_access_direct(struct netxen_adapter *adapter, u64 off, + u64 *data, int op) +{ + void __iomem *addr, *mem_ptr = NULL; + resource_size_t mem_base; + int ret; + u32 start; + + spin_lock(&adapter->ahw.mem_lock); + + ret = adapter->pci_set_window(adapter, off, &start); + if (ret != 0) + goto unlock; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + addr = adapter->ahw.pci_base0 + start; + } else { + addr = pci_base_offset(adapter, start); + if (addr) + goto noremap; + + mem_base = pci_resource_start(adapter->pdev, 0) + + (start & PAGE_MASK); + mem_ptr = ioremap(mem_base, PAGE_SIZE); + if (mem_ptr == NULL) { + ret = -EIO; + goto unlock; + } + + addr = mem_ptr + (start & (PAGE_SIZE-1)); + } +noremap: + if (op == 0) /* read */ + *data = readq(addr); + else /* write */ + writeq(*data, addr); + +unlock: + spin_unlock(&adapter->ahw.mem_lock); + + if (mem_ptr) + iounmap(mem_ptr); + return ret; +} + +void +netxen_pci_camqm_read_2M(struct netxen_adapter *adapter, u64 off, u64 *data) +{ + void __iomem *addr = adapter->ahw.pci_base0 + + NETXEN_PCI_CAMQM_2M_BASE + (off - NETXEN_PCI_CAMQM); + + spin_lock(&adapter->ahw.mem_lock); + *data = readq(addr); + spin_unlock(&adapter->ahw.mem_lock); +} + +void +netxen_pci_camqm_write_2M(struct netxen_adapter *adapter, u64 off, u64 data) +{ + void __iomem *addr = adapter->ahw.pci_base0 + + NETXEN_PCI_CAMQM_2M_BASE + (off - NETXEN_PCI_CAMQM); + + spin_lock(&adapter->ahw.mem_lock); + writeq(data, addr); + spin_unlock(&adapter->ahw.mem_lock); +} + +#define MAX_CTL_CHECK 1000 + +static int +netxen_nic_pci_mem_write_128M(struct netxen_adapter *adapter, + u64 off, u64 data) +{ + int j, ret; + u32 temp, off_lo, off_hi, addr_hi, data_hi, data_lo; + void __iomem *mem_crb; + + /* Only 64-bit aligned access */ + if (off & 7) + return -EIO; + + /* P2 has different SIU and MIU test agent base addr */ + if (ADDR_IN_RANGE(off, NETXEN_ADDR_QDR_NET, + NETXEN_ADDR_QDR_NET_MAX_P2)) { + mem_crb = pci_base_offset(adapter, + NETXEN_CRB_QDR_NET+SIU_TEST_AGT_BASE); + addr_hi = SIU_TEST_AGT_ADDR_HI; + data_lo = SIU_TEST_AGT_WRDATA_LO; + data_hi = SIU_TEST_AGT_WRDATA_HI; + off_lo = off & SIU_TEST_AGT_ADDR_MASK; + off_hi = SIU_TEST_AGT_UPPER_ADDR(off); + goto correct; + } + + if (ADDR_IN_RANGE(off, NETXEN_ADDR_DDR_NET, NETXEN_ADDR_DDR_NET_MAX)) { + mem_crb = pci_base_offset(adapter, + NETXEN_CRB_DDR_NET+MIU_TEST_AGT_BASE); + addr_hi = MIU_TEST_AGT_ADDR_HI; + data_lo = MIU_TEST_AGT_WRDATA_LO; + data_hi = MIU_TEST_AGT_WRDATA_HI; + off_lo = off & MIU_TEST_AGT_ADDR_MASK; + off_hi = 0; + goto correct; + } + + if (ADDR_IN_RANGE(off, NETXEN_ADDR_OCM0, NETXEN_ADDR_OCM0_MAX) || + ADDR_IN_RANGE(off, NETXEN_ADDR_OCM1, NETXEN_ADDR_OCM1_MAX)) { + if (adapter->ahw.pci_len0 != 0) { + return netxen_nic_pci_mem_access_direct(adapter, + off, &data, 1); + } + } + + return -EIO; + +correct: + spin_lock(&adapter->ahw.mem_lock); + netxen_nic_pci_set_crbwindow_128M(adapter, 0); + + writel(off_lo, (mem_crb + MIU_TEST_AGT_ADDR_LO)); + writel(off_hi, (mem_crb + addr_hi)); + writel(data & 0xffffffff, (mem_crb + data_lo)); + writel((data >> 32) & 0xffffffff, (mem_crb + data_hi)); + writel((TA_CTL_ENABLE | TA_CTL_WRITE), (mem_crb + TEST_AGT_CTRL)); + writel((TA_CTL_START | TA_CTL_ENABLE | TA_CTL_WRITE), + 
(mem_crb + TEST_AGT_CTRL)); + + for (j = 0; j < MAX_CTL_CHECK; j++) { + temp = readl((mem_crb + TEST_AGT_CTRL)); + if ((temp & TA_CTL_BUSY) == 0) + break; + } + + if (j >= MAX_CTL_CHECK) { + if (printk_ratelimit()) + dev_err(&adapter->pdev->dev, + "failed to write through agent\n"); + ret = -EIO; + } else + ret = 0; + + netxen_nic_pci_set_crbwindow_128M(adapter, NETXEN_WINDOW_ONE); + spin_unlock(&adapter->ahw.mem_lock); + return ret; +} + +static int +netxen_nic_pci_mem_read_128M(struct netxen_adapter *adapter, + u64 off, u64 *data) +{ + int j, ret; + u32 temp, off_lo, off_hi, addr_hi, data_hi, data_lo; + u64 val; + void __iomem *mem_crb; + + /* Only 64-bit aligned access */ + if (off & 7) + return -EIO; + + /* P2 has different SIU and MIU test agent base addr */ + if (ADDR_IN_RANGE(off, NETXEN_ADDR_QDR_NET, + NETXEN_ADDR_QDR_NET_MAX_P2)) { + mem_crb = pci_base_offset(adapter, + NETXEN_CRB_QDR_NET+SIU_TEST_AGT_BASE); + addr_hi = SIU_TEST_AGT_ADDR_HI; + data_lo = SIU_TEST_AGT_RDDATA_LO; + data_hi = SIU_TEST_AGT_RDDATA_HI; + off_lo = off & SIU_TEST_AGT_ADDR_MASK; + off_hi = SIU_TEST_AGT_UPPER_ADDR(off); + goto correct; + } + + if (ADDR_IN_RANGE(off, NETXEN_ADDR_DDR_NET, NETXEN_ADDR_DDR_NET_MAX)) { + mem_crb = pci_base_offset(adapter, + NETXEN_CRB_DDR_NET+MIU_TEST_AGT_BASE); + addr_hi = MIU_TEST_AGT_ADDR_HI; + data_lo = MIU_TEST_AGT_RDDATA_LO; + data_hi = MIU_TEST_AGT_RDDATA_HI; + off_lo = off & MIU_TEST_AGT_ADDR_MASK; + off_hi = 0; + goto correct; + } + + if (ADDR_IN_RANGE(off, NETXEN_ADDR_OCM0, NETXEN_ADDR_OCM0_MAX) || + ADDR_IN_RANGE(off, NETXEN_ADDR_OCM1, NETXEN_ADDR_OCM1_MAX)) { + if (adapter->ahw.pci_len0 != 0) { + return netxen_nic_pci_mem_access_direct(adapter, + off, data, 0); + } + } + + return -EIO; + +correct: + spin_lock(&adapter->ahw.mem_lock); + netxen_nic_pci_set_crbwindow_128M(adapter, 0); + + writel(off_lo, (mem_crb + MIU_TEST_AGT_ADDR_LO)); + writel(off_hi, (mem_crb + addr_hi)); + writel(TA_CTL_ENABLE, (mem_crb + TEST_AGT_CTRL)); + writel((TA_CTL_START|TA_CTL_ENABLE), (mem_crb + TEST_AGT_CTRL)); + + for (j = 0; j < MAX_CTL_CHECK; j++) { + temp = readl(mem_crb + TEST_AGT_CTRL); + if ((temp & TA_CTL_BUSY) == 0) + break; + } + + if (j >= MAX_CTL_CHECK) { + if (printk_ratelimit()) + dev_err(&adapter->pdev->dev, + "failed to read through agent\n"); + ret = -EIO; + } else { + + temp = readl(mem_crb + data_hi); + val = ((u64)temp << 32); + val |= readl(mem_crb + data_lo); + *data = val; + ret = 0; + } + + netxen_nic_pci_set_crbwindow_128M(adapter, NETXEN_WINDOW_ONE); + spin_unlock(&adapter->ahw.mem_lock); + + return ret; +} + +static int +netxen_nic_pci_mem_write_2M(struct netxen_adapter *adapter, + u64 off, u64 data) +{ + int j, ret; + u32 temp, off8; + void __iomem *mem_crb; + + /* Only 64-bit aligned access */ + if (off & 7) + return -EIO; + + /* P3 onward, test agent base for MIU and SIU is same */ + if (ADDR_IN_RANGE(off, NETXEN_ADDR_QDR_NET, + NETXEN_ADDR_QDR_NET_MAX_P3)) { + mem_crb = netxen_get_ioaddr(adapter, + NETXEN_CRB_QDR_NET+MIU_TEST_AGT_BASE); + goto correct; + } + + if (ADDR_IN_RANGE(off, NETXEN_ADDR_DDR_NET, NETXEN_ADDR_DDR_NET_MAX)) { + mem_crb = netxen_get_ioaddr(adapter, + NETXEN_CRB_DDR_NET+MIU_TEST_AGT_BASE); + goto correct; + } + + if (ADDR_IN_RANGE(off, NETXEN_ADDR_OCM0, NETXEN_ADDR_OCM0_MAX)) + return netxen_nic_pci_mem_access_direct(adapter, off, &data, 1); + + return -EIO; + +correct: + off8 = off & 0xfffffff8; + + spin_lock(&adapter->ahw.mem_lock); + + writel(off8, (mem_crb + MIU_TEST_AGT_ADDR_LO)); + writel(0, (mem_crb + MIU_TEST_AGT_ADDR_HI)); + + 
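/*
 * Test-agent handshake, as in the 128M variants above: program the
 * 64-bit-aligned address (LO, then HI), load the write data, start the
 * agent with TA_CTL_START | TA_CTL_ENABLE | TA_CTL_WRITE, then poll
 * TEST_AGT_CTRL up to MAX_CTL_CHECK times for TA_CTL_BUSY to clear.
 * A transaction that never goes idle is reported as -EIO.
 */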
writel(data & 0xffffffff, + mem_crb + MIU_TEST_AGT_WRDATA_LO); + writel((data >> 32) & 0xffffffff, + mem_crb + MIU_TEST_AGT_WRDATA_HI); + + writel((TA_CTL_ENABLE | TA_CTL_WRITE), (mem_crb + TEST_AGT_CTRL)); + writel((TA_CTL_START | TA_CTL_ENABLE | TA_CTL_WRITE), + (mem_crb + TEST_AGT_CTRL)); + + for (j = 0; j < MAX_CTL_CHECK; j++) { + temp = readl(mem_crb + TEST_AGT_CTRL); + if ((temp & TA_CTL_BUSY) == 0) + break; + } + + if (j >= MAX_CTL_CHECK) { + if (printk_ratelimit()) + dev_err(&adapter->pdev->dev, + "failed to write through agent\n"); + ret = -EIO; + } else + ret = 0; + + spin_unlock(&adapter->ahw.mem_lock); + + return ret; +} + +static int +netxen_nic_pci_mem_read_2M(struct netxen_adapter *adapter, + u64 off, u64 *data) +{ + int j, ret; + u32 temp, off8; + u64 val; + void __iomem *mem_crb; + + /* Only 64-bit aligned access */ + if (off & 7) + return -EIO; + + /* P3 onward, test agent base for MIU and SIU is same */ + if (ADDR_IN_RANGE(off, NETXEN_ADDR_QDR_NET, + NETXEN_ADDR_QDR_NET_MAX_P3)) { + mem_crb = netxen_get_ioaddr(adapter, + NETXEN_CRB_QDR_NET+MIU_TEST_AGT_BASE); + goto correct; + } + + if (ADDR_IN_RANGE(off, NETXEN_ADDR_DDR_NET, NETXEN_ADDR_DDR_NET_MAX)) { + mem_crb = netxen_get_ioaddr(adapter, + NETXEN_CRB_DDR_NET+MIU_TEST_AGT_BASE); + goto correct; + } + + if (ADDR_IN_RANGE(off, NETXEN_ADDR_OCM0, NETXEN_ADDR_OCM0_MAX)) { + return netxen_nic_pci_mem_access_direct(adapter, + off, data, 0); + } + + return -EIO; + +correct: + off8 = off & 0xfffffff8; + + spin_lock(&adapter->ahw.mem_lock); + + writel(off8, (mem_crb + MIU_TEST_AGT_ADDR_LO)); + writel(0, (mem_crb + MIU_TEST_AGT_ADDR_HI)); + writel(TA_CTL_ENABLE, (mem_crb + TEST_AGT_CTRL)); + writel((TA_CTL_START | TA_CTL_ENABLE), (mem_crb + TEST_AGT_CTRL)); + + for (j = 0; j < MAX_CTL_CHECK; j++) { + temp = readl(mem_crb + TEST_AGT_CTRL); + if ((temp & TA_CTL_BUSY) == 0) + break; + } + + if (j >= MAX_CTL_CHECK) { + if (printk_ratelimit()) + dev_err(&adapter->pdev->dev, + "failed to read through agent\n"); + ret = -EIO; + } else { + val = (u64)(readl(mem_crb + MIU_TEST_AGT_RDDATA_HI)) << 32; + val |= readl(mem_crb + MIU_TEST_AGT_RDDATA_LO); + *data = val; + ret = 0; + } + + spin_unlock(&adapter->ahw.mem_lock); + + return ret; +} + +void +netxen_setup_hwops(struct netxen_adapter *adapter) +{ + adapter->init_port = netxen_niu_xg_init_port; + adapter->stop_port = netxen_niu_disable_xg_port; + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + adapter->crb_read = netxen_nic_hw_read_wx_128M, + adapter->crb_write = netxen_nic_hw_write_wx_128M, + adapter->pci_set_window = netxen_nic_pci_set_window_128M, + adapter->pci_mem_read = netxen_nic_pci_mem_read_128M, + adapter->pci_mem_write = netxen_nic_pci_mem_write_128M, + adapter->io_read = netxen_nic_io_read_128M, + adapter->io_write = netxen_nic_io_write_128M, + + adapter->macaddr_set = netxen_p2_nic_set_mac_addr; + adapter->set_multi = netxen_p2_nic_set_multi; + adapter->set_mtu = netxen_nic_set_mtu_xgb; + adapter->set_promisc = netxen_p2_nic_set_promisc; + + } else { + adapter->crb_read = netxen_nic_hw_read_wx_2M, + adapter->crb_write = netxen_nic_hw_write_wx_2M, + adapter->pci_set_window = netxen_nic_pci_set_window_2M, + adapter->pci_mem_read = netxen_nic_pci_mem_read_2M, + adapter->pci_mem_write = netxen_nic_pci_mem_write_2M, + adapter->io_read = netxen_nic_io_read_2M, + adapter->io_write = netxen_nic_io_write_2M, + + adapter->set_mtu = nx_fw_cmd_set_mtu; + adapter->set_promisc = netxen_p3_nic_set_promisc; + adapter->macaddr_set = netxen_p3_nic_set_mac_addr; + adapter->set_multi = 
netxen_p3_nic_set_multi; + + adapter->phy_read = nx_fw_cmd_query_phy; + adapter->phy_write = nx_fw_cmd_set_phy; + } +} + +int netxen_nic_get_board_info(struct netxen_adapter *adapter) +{ + int offset, board_type, magic; + struct pci_dev *pdev = adapter->pdev; + + offset = NX_FW_MAGIC_OFFSET; + if (netxen_rom_fast_read(adapter, offset, &magic)) + return -EIO; + + if (magic != NETXEN_BDINFO_MAGIC) { + dev_err(&pdev->dev, "invalid board config, magic=%08x\n", + magic); + return -EIO; + } + + offset = NX_BRDTYPE_OFFSET; + if (netxen_rom_fast_read(adapter, offset, &board_type)) + return -EIO; + + if (board_type == NETXEN_BRDTYPE_P3_4_GB_MM) { + u32 gpio = NXRD32(adapter, NETXEN_ROMUSB_GLB_PAD_GPIO_I); + if ((gpio & 0x8000) == 0) + board_type = NETXEN_BRDTYPE_P3_10G_TP; + } + + adapter->ahw.board_type = board_type; + + switch (board_type) { + case NETXEN_BRDTYPE_P2_SB35_4G: + adapter->ahw.port_type = NETXEN_NIC_GBE; + break; + case NETXEN_BRDTYPE_P2_SB31_10G: + case NETXEN_BRDTYPE_P2_SB31_10G_IMEZ: + case NETXEN_BRDTYPE_P2_SB31_10G_HMEZ: + case NETXEN_BRDTYPE_P2_SB31_10G_CX4: + case NETXEN_BRDTYPE_P3_HMEZ: + case NETXEN_BRDTYPE_P3_XG_LOM: + case NETXEN_BRDTYPE_P3_10G_CX4: + case NETXEN_BRDTYPE_P3_10G_CX4_LP: + case NETXEN_BRDTYPE_P3_IMEZ: + case NETXEN_BRDTYPE_P3_10G_SFP_PLUS: + case NETXEN_BRDTYPE_P3_10G_SFP_CT: + case NETXEN_BRDTYPE_P3_10G_SFP_QT: + case NETXEN_BRDTYPE_P3_10G_XFP: + case NETXEN_BRDTYPE_P3_10000_BASE_T: + adapter->ahw.port_type = NETXEN_NIC_XGBE; + break; + case NETXEN_BRDTYPE_P1_BD: + case NETXEN_BRDTYPE_P1_SB: + case NETXEN_BRDTYPE_P1_SMAX: + case NETXEN_BRDTYPE_P1_SOCK: + case NETXEN_BRDTYPE_P3_REF_QG: + case NETXEN_BRDTYPE_P3_4_GB: + case NETXEN_BRDTYPE_P3_4_GB_MM: + adapter->ahw.port_type = NETXEN_NIC_GBE; + break; + case NETXEN_BRDTYPE_P3_10G_TP: + adapter->ahw.port_type = (adapter->portnum < 2) ? 
+ NETXEN_NIC_XGBE : NETXEN_NIC_GBE; + break; + default: + dev_err(&pdev->dev, "unknown board type %x\n", board_type); + adapter->ahw.port_type = NETXEN_NIC_XGBE; + break; + } + + return 0; +} + +/* NIU access sections */ +static int netxen_nic_set_mtu_xgb(struct netxen_adapter *adapter, int new_mtu) +{ + new_mtu += MTU_FUDGE_FACTOR; + if (adapter->physical_port == 0) + NXWR32(adapter, NETXEN_NIU_XGE_MAX_FRAME_SIZE, new_mtu); + else + NXWR32(adapter, NETXEN_NIU_XG1_MAX_FRAME_SIZE, new_mtu); + return 0; +} + +void netxen_nic_set_link_parameters(struct netxen_adapter *adapter) +{ + __u32 status; + __u32 autoneg; + __u32 port_mode; + + if (!netif_carrier_ok(adapter->netdev)) { + adapter->link_speed = 0; + adapter->link_duplex = -1; + adapter->link_autoneg = AUTONEG_ENABLE; + return; + } + + if (adapter->ahw.port_type == NETXEN_NIC_GBE) { + port_mode = NXRD32(adapter, NETXEN_PORT_MODE_ADDR); + if (port_mode == NETXEN_PORT_MODE_802_3_AP) { + adapter->link_speed = SPEED_1000; + adapter->link_duplex = DUPLEX_FULL; + adapter->link_autoneg = AUTONEG_DISABLE; + return; + } + + if (adapter->phy_read && + adapter->phy_read(adapter, + NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS, + &status) == 0) { + if (netxen_get_phy_link(status)) { + switch (netxen_get_phy_speed(status)) { + case 0: + adapter->link_speed = SPEED_10; + break; + case 1: + adapter->link_speed = SPEED_100; + break; + case 2: + adapter->link_speed = SPEED_1000; + break; + default: + adapter->link_speed = 0; + break; + } + switch (netxen_get_phy_duplex(status)) { + case 0: + adapter->link_duplex = DUPLEX_HALF; + break; + case 1: + adapter->link_duplex = DUPLEX_FULL; + break; + default: + adapter->link_duplex = -1; + break; + } + if (adapter->phy_read && + adapter->phy_read(adapter, + NETXEN_NIU_GB_MII_MGMT_ADDR_AUTONEG, + &autoneg) == 0) + adapter->link_autoneg = autoneg; + } else + goto link_down; + } else { + link_down: + adapter->link_speed = 0; + adapter->link_duplex = -1; + } + } +} + +int +netxen_nic_wol_supported(struct netxen_adapter *adapter) +{ + u32 wol_cfg; + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return 0; + + wol_cfg = NXRD32(adapter, NETXEN_WOL_CONFIG_NV); + if (wol_cfg & (1UL << adapter->portnum)) { + wol_cfg = NXRD32(adapter, NETXEN_WOL_CONFIG); + if (wol_cfg & (1 << adapter->portnum)) + return 1; + } + + return 0; +} + +static u32 netxen_md_cntrl(struct netxen_adapter *adapter, + struct netxen_minidump_template_hdr *template_hdr, + struct netxen_minidump_entry_crb *crtEntry) +{ + int loop_cnt, i, rv = 0, timeout_flag; + u32 op_count, stride; + u32 opcode, read_value, addr; + unsigned long timeout, timeout_jiffies; + addr = crtEntry->addr; + op_count = crtEntry->op_count; + stride = crtEntry->addr_stride; + + for (loop_cnt = 0; loop_cnt < op_count; loop_cnt++) { + for (i = 0; i < sizeof(crtEntry->opcode) * 8; i++) { + opcode = (crtEntry->opcode & (0x1 << i)); + if (opcode) { + switch (opcode) { + case NX_DUMP_WCRB: + NX_WR_DUMP_REG(addr, + adapter->ahw.pci_base0, + crtEntry->value_1); + break; + case NX_DUMP_RWCRB: + NX_RD_DUMP_REG(addr, + adapter->ahw.pci_base0, + &read_value); + NX_WR_DUMP_REG(addr, + adapter->ahw.pci_base0, + read_value); + break; + case NX_DUMP_ANDCRB: + NX_RD_DUMP_REG(addr, + adapter->ahw.pci_base0, + &read_value); + read_value &= crtEntry->value_2; + NX_WR_DUMP_REG(addr, + adapter->ahw.pci_base0, + read_value); + break; + case NX_DUMP_ORCRB: + NX_RD_DUMP_REG(addr, + adapter->ahw.pci_base0, + &read_value); + read_value |= crtEntry->value_3; + NX_WR_DUMP_REG(addr, + adapter->ahw.pci_base0, + 
read_value); + break; + case NX_DUMP_POLLCRB: + timeout = crtEntry->poll_timeout; + NX_RD_DUMP_REG(addr, + adapter->ahw.pci_base0, + &read_value); + timeout_jiffies = + msecs_to_jiffies(timeout) + jiffies; + for (timeout_flag = 0; + !timeout_flag + && ((read_value & crtEntry->value_2) + != crtEntry->value_1);) { + if (time_after(jiffies, + timeout_jiffies)) + timeout_flag = 1; + NX_RD_DUMP_REG(addr, + adapter->ahw.pci_base0, + &read_value); + } + + if (timeout_flag) { + dev_err(&adapter->pdev->dev, "%s : " + "Timeout in poll_crb control operation.\n" + , __func__); + return -1; + } + break; + case NX_DUMP_RD_SAVE: + /* Decide which address to use */ + if (crtEntry->state_index_a) + addr = + template_hdr->saved_state_array + [crtEntry->state_index_a]; + NX_RD_DUMP_REG(addr, + adapter->ahw.pci_base0, + &read_value); + template_hdr->saved_state_array + [crtEntry->state_index_v] + = read_value; + break; + case NX_DUMP_WRT_SAVED: + /* Decide which value to use */ + if (crtEntry->state_index_v) + read_value = + template_hdr->saved_state_array + [crtEntry->state_index_v]; + else + read_value = crtEntry->value_1; + + /* Decide which address to use */ + if (crtEntry->state_index_a) + addr = + template_hdr->saved_state_array + [crtEntry->state_index_a]; + + NX_WR_DUMP_REG(addr, + adapter->ahw.pci_base0, + read_value); + break; + case NX_DUMP_MOD_SAVE_ST: + read_value = + template_hdr->saved_state_array + [crtEntry->state_index_v]; + read_value <<= crtEntry->shl; + read_value >>= crtEntry->shr; + if (crtEntry->value_2) + read_value &= + crtEntry->value_2; + read_value |= crtEntry->value_3; + read_value += crtEntry->value_1; + /* Write value back to state area.*/ + template_hdr->saved_state_array + [crtEntry->state_index_v] + = read_value; + break; + default: + rv = 1; + break; + } + } + } + addr = addr + stride; + } + return rv; +} + +/* Read memory or MN */ +static u32 +netxen_md_rdmem(struct netxen_adapter *adapter, + struct netxen_minidump_entry_rdmem + *memEntry, u64 *data_buff) +{ + u64 addr, value = 0; + int i = 0, loop_cnt; + + addr = (u64)memEntry->read_addr; + loop_cnt = memEntry->read_data_size; /* This is size in bytes */ + loop_cnt /= sizeof(value); + + for (i = 0; i < loop_cnt; i++) { + if (netxen_nic_pci_mem_read_2M(adapter, addr, &value)) + goto out; + *data_buff++ = value; + addr += sizeof(value); + } +out: + return i * sizeof(value); +} + +/* Read CRB operation */ +static u32 netxen_md_rd_crb(struct netxen_adapter *adapter, + struct netxen_minidump_entry_crb + *crbEntry, u32 *data_buff) +{ + int loop_cnt; + u32 op_count, addr, stride, value; + + addr = crbEntry->addr; + op_count = crbEntry->op_count; + stride = crbEntry->addr_stride; + + for (loop_cnt = 0; loop_cnt < op_count; loop_cnt++) { + NX_RD_DUMP_REG(addr, adapter->ahw.pci_base0, &value); + *data_buff++ = addr; + *data_buff++ = value; + addr = addr + stride; + } + return loop_cnt * (2 * sizeof(u32)); +} + +/* Read ROM */ +static u32 +netxen_md_rdrom(struct netxen_adapter *adapter, + struct netxen_minidump_entry_rdrom + *romEntry, __le32 *data_buff) +{ + int i, count = 0; + u32 size, lck_val; + u32 val; + u32 fl_addr, waddr, raddr; + fl_addr = romEntry->read_addr; + size = romEntry->read_data_size/4; +lock_try: + lck_val = readl((void __iomem *)(adapter->ahw.pci_base0 + + NX_FLASH_SEM2_LK)); + if (!lck_val && count < MAX_CTL_CHECK) { + msleep(20); + count++; + goto lock_try; + } + writel(adapter->ahw.pci_func, (void __iomem *)(adapter->ahw.pci_base0 + + NX_FLASH_LOCK_ID)); + for (i = 0; i < size; i++) { + waddr = fl_addr & 
0xFFFF0000; + NX_WR_DUMP_REG(FLASH_ROM_WINDOW, adapter->ahw.pci_base0, waddr); + raddr = FLASH_ROM_DATA + (fl_addr & 0x0000FFFF); + NX_RD_DUMP_REG(raddr, adapter->ahw.pci_base0, &val); + *data_buff++ = cpu_to_le32(val); + fl_addr += sizeof(val); + } + readl((void __iomem *)(adapter->ahw.pci_base0 + NX_FLASH_SEM2_ULK)); + return romEntry->read_data_size; +} + +/* Handle L2 Cache */ +static u32 +netxen_md_L2Cache(struct netxen_adapter *adapter, + struct netxen_minidump_entry_cache + *cacheEntry, u32 *data_buff) +{ + int loop_cnt, i, k, timeout_flag = 0; + u32 addr, read_addr, read_value, cntrl_addr, tag_reg_addr; + u32 tag_value, read_cnt; + u8 cntl_value_w, cntl_value_r; + unsigned long timeout, timeout_jiffies; + + loop_cnt = cacheEntry->op_count; + read_addr = cacheEntry->read_addr; + cntrl_addr = cacheEntry->control_addr; + cntl_value_w = (u32) cacheEntry->write_value; + tag_reg_addr = cacheEntry->tag_reg_addr; + tag_value = cacheEntry->init_tag_value; + read_cnt = cacheEntry->read_addr_cnt; + + for (i = 0; i < loop_cnt; i++) { + NX_WR_DUMP_REG(tag_reg_addr, adapter->ahw.pci_base0, tag_value); + if (cntl_value_w) + NX_WR_DUMP_REG(cntrl_addr, adapter->ahw.pci_base0, + (u32)cntl_value_w); + if (cacheEntry->poll_mask) { + timeout = cacheEntry->poll_wait; + NX_RD_DUMP_REG(cntrl_addr, adapter->ahw.pci_base0, + &cntl_value_r); + timeout_jiffies = msecs_to_jiffies(timeout) + jiffies; + for (timeout_flag = 0; !timeout_flag && + ((cntl_value_r & cacheEntry->poll_mask) != 0);) { + if (time_after(jiffies, timeout_jiffies)) + timeout_flag = 1; + NX_RD_DUMP_REG(cntrl_addr, + adapter->ahw.pci_base0, + &cntl_value_r); + } + if (timeout_flag) { + dev_err(&adapter->pdev->dev, + "Timeout in processing L2 Tag poll.\n"); + return -1; + } + } + addr = read_addr; + for (k = 0; k < read_cnt; k++) { + NX_RD_DUMP_REG(addr, adapter->ahw.pci_base0, + &read_value); + *data_buff++ = read_value; + addr += cacheEntry->read_addr_stride; + } + tag_value += cacheEntry->tag_value_stride; + } + return read_cnt * loop_cnt * sizeof(read_value); +} + + +/* Handle L1 Cache */ +static u32 netxen_md_L1Cache(struct netxen_adapter *adapter, + struct netxen_minidump_entry_cache + *cacheEntry, u32 *data_buff) +{ + int i, k, loop_cnt; + u32 addr, read_addr, read_value, cntrl_addr, tag_reg_addr; + u32 tag_value, read_cnt; + u8 cntl_value_w; + + loop_cnt = cacheEntry->op_count; + read_addr = cacheEntry->read_addr; + cntrl_addr = cacheEntry->control_addr; + cntl_value_w = (u32) cacheEntry->write_value; + tag_reg_addr = cacheEntry->tag_reg_addr; + tag_value = cacheEntry->init_tag_value; + read_cnt = cacheEntry->read_addr_cnt; + + for (i = 0; i < loop_cnt; i++) { + NX_WR_DUMP_REG(tag_reg_addr, adapter->ahw.pci_base0, tag_value); + NX_WR_DUMP_REG(cntrl_addr, adapter->ahw.pci_base0, + (u32) cntl_value_w); + addr = read_addr; + for (k = 0; k < read_cnt; k++) { + NX_RD_DUMP_REG(addr, + adapter->ahw.pci_base0, + &read_value); + *data_buff++ = read_value; + addr += cacheEntry->read_addr_stride; + } + tag_value += cacheEntry->tag_value_stride; + } + return read_cnt * loop_cnt * sizeof(read_value); +} + +/* Reading OCM memory */ +static u32 +netxen_md_rdocm(struct netxen_adapter *adapter, + struct netxen_minidump_entry_rdocm + *ocmEntry, u32 *data_buff) +{ + int i, loop_cnt; + u32 value; + void __iomem *addr; + addr = (ocmEntry->read_addr + adapter->ahw.pci_base0); + loop_cnt = ocmEntry->op_count; + + for (i = 0; i < loop_cnt; i++) { + value = readl(addr); + *data_buff++ = value; + addr += ocmEntry->read_addr_stride; + } + return i * sizeof(u32); 
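/*
 * Like the other netxen_md_* capture helpers, the return value here is
 * the number of bytes actually placed in the capture buffer; the
 * template parser below feeds it to netxen_md_entry_err_chk(), which
 * flags the entry with NX_DUMP_SIZE_ERR when it disagrees with the
 * template's entry_capture_size (or NX_DUMP_SKIP for a negative size).
 */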
+} + +/* Read MUX data */ +static u32 +netxen_md_rdmux(struct netxen_adapter *adapter, struct netxen_minidump_entry_mux + *muxEntry, u32 *data_buff) +{ + int loop_cnt = 0; + u32 read_addr, read_value, select_addr, sel_value; + + read_addr = muxEntry->read_addr; + sel_value = muxEntry->select_value; + select_addr = muxEntry->select_addr; + + for (loop_cnt = 0; loop_cnt < muxEntry->op_count; loop_cnt++) { + NX_WR_DUMP_REG(select_addr, adapter->ahw.pci_base0, sel_value); + NX_RD_DUMP_REG(read_addr, adapter->ahw.pci_base0, &read_value); + *data_buff++ = sel_value; + *data_buff++ = read_value; + sel_value += muxEntry->select_value_stride; + } + return loop_cnt * (2 * sizeof(u32)); +} + +/* Handling Queue State Reads */ +static u32 +netxen_md_rdqueue(struct netxen_adapter *adapter, + struct netxen_minidump_entry_queue + *queueEntry, u32 *data_buff) +{ + int loop_cnt, k; + u32 queue_id, read_addr, read_value, read_stride, select_addr, read_cnt; + + read_cnt = queueEntry->read_addr_cnt; + read_stride = queueEntry->read_addr_stride; + select_addr = queueEntry->select_addr; + + for (loop_cnt = 0, queue_id = 0; loop_cnt < queueEntry->op_count; + loop_cnt++) { + NX_WR_DUMP_REG(select_addr, adapter->ahw.pci_base0, queue_id); + read_addr = queueEntry->read_addr; + for (k = 0; k < read_cnt; k++) { + NX_RD_DUMP_REG(read_addr, adapter->ahw.pci_base0, + &read_value); + *data_buff++ = read_value; + read_addr += read_stride; + } + queue_id += queueEntry->queue_id_stride; + } + return loop_cnt * (read_cnt * sizeof(read_value)); +} + + +/* +* We catch an error where driver does not read +* as much data as we expect from the entry. +*/ + +static int netxen_md_entry_err_chk(struct netxen_adapter *adapter, + struct netxen_minidump_entry *entry, int esize) +{ + if (esize < 0) { + entry->hdr.driver_flags |= NX_DUMP_SKIP; + return esize; + } + if (esize != entry->hdr.entry_capture_size) { + entry->hdr.entry_capture_size = esize; + entry->hdr.driver_flags |= NX_DUMP_SIZE_ERR; + dev_info(&adapter->pdev->dev, + "Invalidate dump, Type:%d\tMask:%d\tSize:%dCap_size:%d\n", + entry->hdr.entry_type, entry->hdr.entry_capture_mask, + esize, entry->hdr.entry_capture_size); + dev_info(&adapter->pdev->dev, "Aborting further dump capture\n"); + } + return 0; +} + +static int netxen_parse_md_template(struct netxen_adapter *adapter) +{ + int num_of_entries, buff_level, e_cnt, esize; + int end_cnt = 0, rv = 0, sane_start = 0, sane_end = 0; + char *dbuff; + void *template_buff = adapter->mdump.md_template; + char *dump_buff = adapter->mdump.md_capture_buff; + int capture_mask = adapter->mdump.md_capture_mask; + struct netxen_minidump_template_hdr *template_hdr; + struct netxen_minidump_entry *entry; + + if ((capture_mask & 0x3) != 0x3) { + dev_err(&adapter->pdev->dev, "Capture mask %02x below minimum needed " + "for valid firmware dump\n", capture_mask); + return -EINVAL; + } + template_hdr = (struct netxen_minidump_template_hdr *) template_buff; + num_of_entries = template_hdr->num_of_entries; + entry = (struct netxen_minidump_entry *) ((char *) template_buff + + template_hdr->first_entry_offset); + memcpy(dump_buff, template_buff, adapter->mdump.md_template_size); + dump_buff = dump_buff + adapter->mdump.md_template_size; + + if (template_hdr->entry_type == TLHDR) + sane_start = 1; + + for (e_cnt = 0, buff_level = 0; e_cnt < num_of_entries; e_cnt++) { + if (!(entry->hdr.entry_capture_mask & capture_mask)) { + entry->hdr.driver_flags |= NX_DUMP_SKIP; + entry = (struct netxen_minidump_entry *) + ((char *) entry + 
entry->hdr.entry_size); + continue; + } + switch (entry->hdr.entry_type) { + case RDNOP: + entry->hdr.driver_flags |= NX_DUMP_SKIP; + break; + case RDEND: + entry->hdr.driver_flags |= NX_DUMP_SKIP; + if (!sane_end) + end_cnt = e_cnt; + sane_end += 1; + break; + case CNTRL: + rv = netxen_md_cntrl(adapter, + template_hdr, (void *)entry); + if (rv) + entry->hdr.driver_flags |= NX_DUMP_SKIP; + break; + case RDCRB: + dbuff = dump_buff + buff_level; + esize = netxen_md_rd_crb(adapter, + (void *) entry, (void *) dbuff); + rv = netxen_md_entry_err_chk + (adapter, entry, esize); + if (rv < 0) + break; + buff_level += esize; + break; + case RDMN: + case RDMEM: + dbuff = dump_buff + buff_level; + esize = netxen_md_rdmem(adapter, + (void *) entry, (void *) dbuff); + rv = netxen_md_entry_err_chk + (adapter, entry, esize); + if (rv < 0) + break; + buff_level += esize; + break; + case BOARD: + case RDROM: + dbuff = dump_buff + buff_level; + esize = netxen_md_rdrom(adapter, + (void *) entry, (void *) dbuff); + rv = netxen_md_entry_err_chk + (adapter, entry, esize); + if (rv < 0) + break; + buff_level += esize; + break; + case L2ITG: + case L2DTG: + case L2DAT: + case L2INS: + dbuff = dump_buff + buff_level; + esize = netxen_md_L2Cache(adapter, + (void *) entry, (void *) dbuff); + rv = netxen_md_entry_err_chk + (adapter, entry, esize); + if (rv < 0) + break; + buff_level += esize; + break; + case L1DAT: + case L1INS: + dbuff = dump_buff + buff_level; + esize = netxen_md_L1Cache(adapter, + (void *) entry, (void *) dbuff); + rv = netxen_md_entry_err_chk + (adapter, entry, esize); + if (rv < 0) + break; + buff_level += esize; + break; + case RDOCM: + dbuff = dump_buff + buff_level; + esize = netxen_md_rdocm(adapter, + (void *) entry, (void *) dbuff); + rv = netxen_md_entry_err_chk + (adapter, entry, esize); + if (rv < 0) + break; + buff_level += esize; + break; + case RDMUX: + dbuff = dump_buff + buff_level; + esize = netxen_md_rdmux(adapter, + (void *) entry, (void *) dbuff); + rv = netxen_md_entry_err_chk + (adapter, entry, esize); + if (rv < 0) + break; + buff_level += esize; + break; + case QUEUE: + dbuff = dump_buff + buff_level; + esize = netxen_md_rdqueue(adapter, + (void *) entry, (void *) dbuff); + rv = netxen_md_entry_err_chk + (adapter, entry, esize); + if (rv < 0) + break; + buff_level += esize; + break; + default: + entry->hdr.driver_flags |= NX_DUMP_SKIP; + break; + } + /* Next entry in the template */ + entry = (struct netxen_minidump_entry *) + ((char *) entry + entry->hdr.entry_size); + } + if (!sane_start || sane_end > 1) { + dev_err(&adapter->pdev->dev, + "Firmware minidump template configuration error.\n"); + } + return 0; +} + +static int +netxen_collect_minidump(struct netxen_adapter *adapter) +{ + int ret = 0; + struct netxen_minidump_template_hdr *hdr; + hdr = (struct netxen_minidump_template_hdr *) + adapter->mdump.md_template; + hdr->driver_capture_mask = adapter->mdump.md_capture_mask; + hdr->driver_timestamp = ktime_get_seconds(); + hdr->driver_info_word2 = adapter->fw_version; + hdr->driver_info_word3 = NXRD32(adapter, CRB_DRIVER_VERSION); + ret = netxen_parse_md_template(adapter); + if (ret) + return ret; + + return ret; +} + + +void +netxen_dump_fw(struct netxen_adapter *adapter) +{ + struct netxen_minidump_template_hdr *hdr; + int i, k, data_size = 0; + u32 capture_mask; + hdr = (struct netxen_minidump_template_hdr *) + adapter->mdump.md_template; + capture_mask = adapter->mdump.md_capture_mask; + + for (i = 0x2, k = 1; (i & NX_DUMP_MASK_MAX); i <<= 1, k++) { + if (i & 
capture_mask) + data_size += hdr->capture_size_array[k]; + } + if (!data_size) { + dev_err(&adapter->pdev->dev, + "Invalid cap sizes for capture_mask=0x%x\n", + adapter->mdump.md_capture_mask); + return; + } + adapter->mdump.md_capture_size = data_size; + adapter->mdump.md_dump_size = adapter->mdump.md_template_size + + adapter->mdump.md_capture_size; + if (!adapter->mdump.md_capture_buff) { + adapter->mdump.md_capture_buff = + vzalloc(adapter->mdump.md_dump_size); + if (!adapter->mdump.md_capture_buff) + return; + + if (netxen_collect_minidump(adapter)) { + adapter->mdump.has_valid_dump = 0; + adapter->mdump.md_dump_size = 0; + vfree(adapter->mdump.md_capture_buff); + adapter->mdump.md_capture_buff = NULL; + dev_err(&adapter->pdev->dev, + "Error in collecting firmware minidump.\n"); + } else { + adapter->mdump.md_timestamp = jiffies; + adapter->mdump.has_valid_dump = 1; + adapter->fw_mdump_rdy = 1; + dev_info(&adapter->pdev->dev, "%s Successfully " + "collected fw dump.\n", adapter->netdev->name); + } + + } else { + dev_info(&adapter->pdev->dev, + "Cannot overwrite previously collected " + "firmware minidump.\n"); + adapter->fw_mdump_rdy = 1; + return; + } +} diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.h b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.h new file mode 100644 index 0000000..7433c4d --- /dev/null +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.h @@ -0,0 +1,285 @@ +/* + * Copyright (C) 2003 - 2009 NetXen, Inc. + * Copyright (C) 2009 - QLogic Corporation. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * The full GNU General Public License is included in this distribution + * in the file called "COPYING". 
+ * + */ + +#ifndef __NETXEN_NIC_HW_H_ +#define __NETXEN_NIC_HW_H_ + +/* Hardware memory size of 128 meg */ +#define NETXEN_MEMADDR_MAX (128 * 1024 * 1024) + +struct netxen_adapter; + +#define NETXEN_PCI_MAPSIZE_BYTES (NETXEN_PCI_MAPSIZE << 20) + +void netxen_nic_set_link_parameters(struct netxen_adapter *adapter); + +/* Nibble or Byte mode for phy interface (GbE mode only) */ + +#define _netxen_crb_get_bit(var, bit) ((var >> bit) & 0x1) + +/* + * NIU GB MAC Config Register 0 (applies to GB0, GB1, GB2, GB3) + * + * Bit 0 : enable_tx => 1:enable frame xmit, 0:disable + * Bit 1 : tx_synced => R/O: xmit enable synched to xmit stream + * Bit 2 : enable_rx => 1:enable frame recv, 0:disable + * Bit 3 : rx_synced => R/O: recv enable synched to recv stream + * Bit 4 : tx_flowctl => 1:enable pause frame generation, 0:disable + * Bit 5 : rx_flowctl => 1:act on recv'd pause frames, 0:ignore + * Bit 8 : loopback => 1:loop MAC xmits to MAC recvs, 0:normal + * Bit 16: tx_reset_pb => 1:reset frame xmit protocol blk, 0:no-op + * Bit 17: rx_reset_pb => 1:reset frame recv protocol blk, 0:no-op + * Bit 18: tx_reset_mac => 1:reset data/ctl multiplexer blk, 0:no-op + * Bit 19: rx_reset_mac => 1:reset ctl frames & timers blk, 0:no-op + * Bit 31: soft_reset => 1:reset the MAC and the SERDES, 0:no-op + */ + +#define netxen_gb_tx_flowctl(config_word) \ + ((config_word) |= 1 << 4) +#define netxen_gb_rx_flowctl(config_word) \ + ((config_word) |= 1 << 5) +#define netxen_gb_tx_reset_pb(config_word) \ + ((config_word) |= 1 << 16) +#define netxen_gb_rx_reset_pb(config_word) \ + ((config_word) |= 1 << 17) +#define netxen_gb_tx_reset_mac(config_word) \ + ((config_word) |= 1 << 18) +#define netxen_gb_rx_reset_mac(config_word) \ + ((config_word) |= 1 << 19) + +#define netxen_gb_unset_tx_flowctl(config_word) \ + ((config_word) &= ~(1 << 4)) +#define netxen_gb_unset_rx_flowctl(config_word) \ + ((config_word) &= ~(1 << 5)) + +#define netxen_gb_get_tx_synced(config_word) \ + _netxen_crb_get_bit((config_word), 1) +#define netxen_gb_get_rx_synced(config_word) \ + _netxen_crb_get_bit((config_word), 3) +#define netxen_gb_get_tx_flowctl(config_word) \ + _netxen_crb_get_bit((config_word), 4) +#define netxen_gb_get_rx_flowctl(config_word) \ + _netxen_crb_get_bit((config_word), 5) +#define netxen_gb_get_soft_reset(config_word) \ + _netxen_crb_get_bit((config_word), 31) + +#define netxen_gb_get_stationaddress_low(config_word) ((config_word) >> 16) + +#define netxen_gb_set_mii_mgmt_clockselect(config_word, val) \ + ((config_word) |= ((val) & 0x07)) +#define netxen_gb_mii_mgmt_reset(config_word) \ + ((config_word) |= 1 << 31) +#define netxen_gb_mii_mgmt_unset(config_word) \ + ((config_word) &= ~(1 << 31)) + +/* + * NIU GB MII Mgmt Command Register (applies to GB0, GB1, GB2, GB3) + * Bit 0 : read_cycle => 1:perform single read cycle, 0:no-op + * Bit 1 : scan_cycle => 1:perform continuous read cycles, 0:no-op + */ + +#define netxen_gb_mii_mgmt_set_read_cycle(config_word) \ + ((config_word) |= 1 << 0) +#define netxen_gb_mii_mgmt_reg_addr(config_word, val) \ + ((config_word) |= ((val) & 0x1F)) +#define netxen_gb_mii_mgmt_phy_addr(config_word, val) \ + ((config_word) |= (((val) & 0x1F) << 8)) + +/* + * NIU GB MII Mgmt Indicators Register (applies to GB0, GB1, GB2, GB3) + * Read-only register. 
+ * Bit 0 : busy => 1:performing an MII mgmt cycle, 0:idle + * Bit 1 : scanning => 1:scan operation in progress, 0:idle + * Bit 2 : notvalid => :mgmt result data not yet valid, 0:idle + */ +#define netxen_get_gb_mii_mgmt_busy(config_word) \ + _netxen_crb_get_bit(config_word, 0) +#define netxen_get_gb_mii_mgmt_scanning(config_word) \ + _netxen_crb_get_bit(config_word, 1) +#define netxen_get_gb_mii_mgmt_notvalid(config_word) \ + _netxen_crb_get_bit(config_word, 2) +/* + * NIU XG Pause Ctl Register + * + * Bit 0 : xg0_mask => 1:disable tx pause frames + * Bit 1 : xg0_request => 1:request single pause frame + * Bit 2 : xg0_on_off => 1:request is pause on, 0:off + * Bit 3 : xg1_mask => 1:disable tx pause frames + * Bit 4 : xg1_request => 1:request single pause frame + * Bit 5 : xg1_on_off => 1:request is pause on, 0:off + */ + +#define netxen_xg_set_xg0_mask(config_word) \ + ((config_word) |= 1 << 0) +#define netxen_xg_set_xg1_mask(config_word) \ + ((config_word) |= 1 << 3) + +#define netxen_xg_get_xg0_mask(config_word) \ + _netxen_crb_get_bit((config_word), 0) +#define netxen_xg_get_xg1_mask(config_word) \ + _netxen_crb_get_bit((config_word), 3) + +#define netxen_xg_unset_xg0_mask(config_word) \ + ((config_word) &= ~(1 << 0)) +#define netxen_xg_unset_xg1_mask(config_word) \ + ((config_word) &= ~(1 << 3)) + +/* + * NIU XG Pause Ctl Register + * + * Bit 0 : xg0_mask => 1:disable tx pause frames + * Bit 1 : xg0_request => 1:request single pause frame + * Bit 2 : xg0_on_off => 1:request is pause on, 0:off + * Bit 3 : xg1_mask => 1:disable tx pause frames + * Bit 4 : xg1_request => 1:request single pause frame + * Bit 5 : xg1_on_off => 1:request is pause on, 0:off + */ +#define netxen_gb_set_gb0_mask(config_word) \ + ((config_word) |= 1 << 0) +#define netxen_gb_set_gb1_mask(config_word) \ + ((config_word) |= 1 << 2) +#define netxen_gb_set_gb2_mask(config_word) \ + ((config_word) |= 1 << 4) +#define netxen_gb_set_gb3_mask(config_word) \ + ((config_word) |= 1 << 6) + +#define netxen_gb_get_gb0_mask(config_word) \ + _netxen_crb_get_bit((config_word), 0) +#define netxen_gb_get_gb1_mask(config_word) \ + _netxen_crb_get_bit((config_word), 2) +#define netxen_gb_get_gb2_mask(config_word) \ + _netxen_crb_get_bit((config_word), 4) +#define netxen_gb_get_gb3_mask(config_word) \ + _netxen_crb_get_bit((config_word), 6) + +#define netxen_gb_unset_gb0_mask(config_word) \ + ((config_word) &= ~(1 << 0)) +#define netxen_gb_unset_gb1_mask(config_word) \ + ((config_word) &= ~(1 << 2)) +#define netxen_gb_unset_gb2_mask(config_word) \ + ((config_word) &= ~(1 << 4)) +#define netxen_gb_unset_gb3_mask(config_word) \ + ((config_word) &= ~(1 << 6)) + + +/* + * PHY-Specific MII control/status registers. 
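+ * Addresses 0-15 below are the standard IEEE 802.3 Clause 22 MII registers;
+ * addresses 16 and above are vendor-specific extensions of the attached PHY.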
+ */ +#define NETXEN_NIU_GB_MII_MGMT_ADDR_CONTROL 0 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_STATUS 1 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_ID_0 2 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_ID_1 3 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_AUTONEG 4 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_LNKPART 5 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_AUTONEG_MORE 6 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_NEXTPAGE_XMIT 7 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_LNKPART_NEXTPAGE 8 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_1000BT_CONTROL 9 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_1000BT_STATUS 10 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_EXTENDED_STATUS 15 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_CONTROL 16 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS 17 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_INT_ENABLE 18 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_INT_STATUS 19 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_CONTROL_MORE 20 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_RECV_ERROR_COUNT 21 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_LED_CONTROL 24 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_LED_OVERRIDE 25 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_CONTROL_MORE_YET 26 +#define NETXEN_NIU_GB_MII_MGMT_ADDR_PHY_STATUS_MORE 27 + +/* + * PHY-Specific Status Register (reg 17). + * + * Bit 0 : jabber => 1:jabber detected, 0:not + * Bit 1 : polarity => 1:polarity reversed, 0:normal + * Bit 2 : recvpause => 1:receive pause enabled, 0:disabled + * Bit 3 : xmitpause => 1:transmit pause enabled, 0:disabled + * Bit 4 : energydetect => 1:sleep, 0:active + * Bit 5 : downshift => 1:downshift, 0:no downshift + * Bit 6 : crossover => 1:MDIX (crossover), 0:MDI (no crossover) + * Bits 7-9 : cablelen => not valid in 10Mb/s mode + * 0:<50m, 1:50-80m, 2:80-110m, 3:110-140m, 4:>140m + * Bit 10 : link => 1:link up, 0:link down + * Bit 11 : resolved => 1:speed and duplex resolved, 0:not yet + * Bit 12 : pagercvd => 1:page received, 0:page not received + * Bit 13 : duplex => 1:full duplex, 0:half duplex + * Bits 14-15 : speed => 0:10Mb/s, 1:100Mb/s, 2:1000Mb/s, 3:rsvd + */ + +#define netxen_get_phy_speed(config_word) (((config_word) >> 14) & 0x03) + +#define netxen_set_phy_speed(config_word, val) \ + ((config_word) |= ((val & 0x03) << 14)) +#define netxen_set_phy_duplex(config_word) \ + ((config_word) |= 1 << 13) +#define netxen_clear_phy_duplex(config_word) \ + ((config_word) &= ~(1 << 13)) + +#define netxen_get_phy_link(config_word) \ + _netxen_crb_get_bit(config_word, 10) +#define netxen_get_phy_duplex(config_word) \ + _netxen_crb_get_bit(config_word, 13) + +/* + * NIU Mode Register. 
+ * Bit 0 : enable FibreChannel + * Bit 1 : enable 10/100/1000 Ethernet + * Bit 2 : enable 10Gb Ethernet + */ + +#define netxen_get_niu_enable_ge(config_word) \ + _netxen_crb_get_bit(config_word, 1) + +#define NETXEN_NIU_NON_PROMISC_MODE 0 +#define NETXEN_NIU_PROMISC_MODE 1 +#define NETXEN_NIU_ALLMULTI_MODE 2 + +/* + * NIU XG MAC Config Register + * + * Bit 0 : tx_enable => 1:enable frame xmit, 0:disable + * Bit 2 : rx_enable => 1:enable frame recv, 0:disable + * Bit 4 : soft_reset => 1:reset the MAC , 0:no-op + * Bit 27: xaui_framer_reset + * Bit 28: xaui_rx_reset + * Bit 29: xaui_tx_reset + * Bit 30: xg_ingress_afifo_reset + * Bit 31: xg_egress_afifo_reset + */ + +#define netxen_xg_soft_reset(config_word) \ + ((config_word) |= 1 << 4) + +typedef struct { + unsigned valid; + unsigned start_128M; + unsigned end_128M; + unsigned start_2M; +} crb_128M_2M_sub_block_map_t; + +typedef struct { + crb_128M_2M_sub_block_map_t sub_block[16]; +} crb_128M_2M_block_map_t; + +#endif /* __NETXEN_NIC_HW_H_ */ diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c new file mode 100644 index 0000000..0ea141e --- /dev/null +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_init.c @@ -0,0 +1,1928 @@ +/* + * Copyright (C) 2003 - 2009 NetXen, Inc. + * Copyright (C) 2009 - QLogic Corporation. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * The full GNU General Public License is included in this distribution + * in the file called "COPYING". 
+ * + */ + +#include +#include +#include +#include +#include +#include "netxen_nic.h" +#include "netxen_nic_hw.h" + +struct crb_addr_pair { + u32 addr; + u32 data; +}; + +#define NETXEN_MAX_CRB_XFORM 60 +static unsigned int crb_addr_xform[NETXEN_MAX_CRB_XFORM]; +#define NETXEN_ADDR_ERROR (0xffffffff) + +#define crb_addr_transform(name) \ + crb_addr_xform[NETXEN_HW_PX_MAP_CRB_##name] = \ + NETXEN_HW_CRB_HUB_AGT_ADR_##name << 20 + +#define NETXEN_NIC_XDMA_RESET 0x8000ff + +static void +netxen_post_rx_buffers_nodb(struct netxen_adapter *adapter, + struct nx_host_rds_ring *rds_ring); +static int netxen_p3_has_mn(struct netxen_adapter *adapter); + +static void crb_addr_transform_setup(void) +{ + crb_addr_transform(XDMA); + crb_addr_transform(TIMR); + crb_addr_transform(SRE); + crb_addr_transform(SQN3); + crb_addr_transform(SQN2); + crb_addr_transform(SQN1); + crb_addr_transform(SQN0); + crb_addr_transform(SQS3); + crb_addr_transform(SQS2); + crb_addr_transform(SQS1); + crb_addr_transform(SQS0); + crb_addr_transform(RPMX7); + crb_addr_transform(RPMX6); + crb_addr_transform(RPMX5); + crb_addr_transform(RPMX4); + crb_addr_transform(RPMX3); + crb_addr_transform(RPMX2); + crb_addr_transform(RPMX1); + crb_addr_transform(RPMX0); + crb_addr_transform(ROMUSB); + crb_addr_transform(SN); + crb_addr_transform(QMN); + crb_addr_transform(QMS); + crb_addr_transform(PGNI); + crb_addr_transform(PGND); + crb_addr_transform(PGN3); + crb_addr_transform(PGN2); + crb_addr_transform(PGN1); + crb_addr_transform(PGN0); + crb_addr_transform(PGSI); + crb_addr_transform(PGSD); + crb_addr_transform(PGS3); + crb_addr_transform(PGS2); + crb_addr_transform(PGS1); + crb_addr_transform(PGS0); + crb_addr_transform(PS); + crb_addr_transform(PH); + crb_addr_transform(NIU); + crb_addr_transform(I2Q); + crb_addr_transform(EG); + crb_addr_transform(MN); + crb_addr_transform(MS); + crb_addr_transform(CAS2); + crb_addr_transform(CAS1); + crb_addr_transform(CAS0); + crb_addr_transform(CAM); + crb_addr_transform(C2C1); + crb_addr_transform(C2C0); + crb_addr_transform(SMB); + crb_addr_transform(OCM0); + crb_addr_transform(I2C0); +} + +void netxen_release_rx_buffers(struct netxen_adapter *adapter) +{ + struct netxen_recv_context *recv_ctx; + struct nx_host_rds_ring *rds_ring; + struct netxen_rx_buffer *rx_buf; + int i, ring; + + recv_ctx = &adapter->recv_ctx; + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + for (i = 0; i < rds_ring->num_desc; ++i) { + rx_buf = &(rds_ring->rx_buf_arr[i]); + if (rx_buf->state == NETXEN_BUFFER_FREE) + continue; + pci_unmap_single(adapter->pdev, + rx_buf->dma, + rds_ring->dma_size, + PCI_DMA_FROMDEVICE); + if (rx_buf->skb != NULL) + dev_kfree_skb_any(rx_buf->skb); + } + } +} + +void netxen_release_tx_buffers(struct netxen_adapter *adapter) +{ + struct netxen_cmd_buffer *cmd_buf; + struct netxen_skb_frag *buffrag; + int i, j; + struct nx_host_tx_ring *tx_ring = adapter->tx_ring; + + spin_lock_bh(&adapter->tx_clean_lock); + cmd_buf = tx_ring->cmd_buf_arr; + for (i = 0; i < tx_ring->num_desc; i++) { + buffrag = cmd_buf->frag_array; + if (buffrag->dma) { + pci_unmap_single(adapter->pdev, buffrag->dma, + buffrag->length, PCI_DMA_TODEVICE); + buffrag->dma = 0ULL; + } + for (j = 1; j < cmd_buf->frag_count; j++) { + buffrag++; + if (buffrag->dma) { + pci_unmap_page(adapter->pdev, buffrag->dma, + buffrag->length, + PCI_DMA_TODEVICE); + buffrag->dma = 0ULL; + } + } + if (cmd_buf->skb) { + dev_kfree_skb_any(cmd_buf->skb); + cmd_buf->skb = NULL; + } + cmd_buf++; + } + 
spin_unlock_bh(&adapter->tx_clean_lock); +} + +void netxen_free_sw_resources(struct netxen_adapter *adapter) +{ + struct netxen_recv_context *recv_ctx; + struct nx_host_rds_ring *rds_ring; + struct nx_host_tx_ring *tx_ring; + int ring; + + recv_ctx = &adapter->recv_ctx; + + if (recv_ctx->rds_rings == NULL) + goto skip_rds; + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + vfree(rds_ring->rx_buf_arr); + rds_ring->rx_buf_arr = NULL; + } + kfree(recv_ctx->rds_rings); + +skip_rds: + if (adapter->tx_ring == NULL) + return; + + tx_ring = adapter->tx_ring; + vfree(tx_ring->cmd_buf_arr); + kfree(tx_ring); + adapter->tx_ring = NULL; +} + +int netxen_alloc_sw_resources(struct netxen_adapter *adapter) +{ + struct netxen_recv_context *recv_ctx; + struct nx_host_rds_ring *rds_ring; + struct nx_host_sds_ring *sds_ring; + struct nx_host_tx_ring *tx_ring; + struct netxen_rx_buffer *rx_buf; + int ring, i; + + struct netxen_cmd_buffer *cmd_buf_arr; + struct net_device *netdev = adapter->netdev; + + tx_ring = kzalloc(sizeof(struct nx_host_tx_ring), GFP_KERNEL); + if (tx_ring == NULL) + return -ENOMEM; + + adapter->tx_ring = tx_ring; + + tx_ring->num_desc = adapter->num_txd; + tx_ring->txq = netdev_get_tx_queue(netdev, 0); + + cmd_buf_arr = vzalloc(TX_BUFF_RINGSIZE(tx_ring)); + if (cmd_buf_arr == NULL) + goto err_out; + + tx_ring->cmd_buf_arr = cmd_buf_arr; + + recv_ctx = &adapter->recv_ctx; + + rds_ring = kcalloc(adapter->max_rds_rings, + sizeof(struct nx_host_rds_ring), GFP_KERNEL); + if (rds_ring == NULL) + goto err_out; + + recv_ctx->rds_rings = rds_ring; + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + switch (ring) { + case RCV_RING_NORMAL: + rds_ring->num_desc = adapter->num_rxd; + if (adapter->ahw.cut_through) { + rds_ring->dma_size = + NX_CT_DEFAULT_RX_BUF_LEN; + rds_ring->skb_size = + NX_CT_DEFAULT_RX_BUF_LEN; + } else { + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + rds_ring->dma_size = + NX_P3_RX_BUF_MAX_LEN; + else + rds_ring->dma_size = + NX_P2_RX_BUF_MAX_LEN; + rds_ring->skb_size = + rds_ring->dma_size + NET_IP_ALIGN; + } + break; + + case RCV_RING_JUMBO: + rds_ring->num_desc = adapter->num_jumbo_rxd; + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + rds_ring->dma_size = + NX_P3_RX_JUMBO_BUF_MAX_LEN; + else + rds_ring->dma_size = + NX_P2_RX_JUMBO_BUF_MAX_LEN; + + if (adapter->capabilities & NX_CAP0_HW_LRO) + rds_ring->dma_size += NX_LRO_BUFFER_EXTRA; + + rds_ring->skb_size = + rds_ring->dma_size + NET_IP_ALIGN; + break; + + case RCV_RING_LRO: + rds_ring->num_desc = adapter->num_lro_rxd; + rds_ring->dma_size = NX_RX_LRO_BUFFER_LENGTH; + rds_ring->skb_size = rds_ring->dma_size + NET_IP_ALIGN; + break; + + } + rds_ring->rx_buf_arr = vzalloc(RCV_BUFF_RINGSIZE(rds_ring)); + if (rds_ring->rx_buf_arr == NULL) + /* free whatever was already allocated */ + goto err_out; + + INIT_LIST_HEAD(&rds_ring->free_list); + /* + * Now go through all of them, set reference handles + * and put them in the queues. 
+ */ + rx_buf = rds_ring->rx_buf_arr; + for (i = 0; i < rds_ring->num_desc; i++) { + list_add_tail(&rx_buf->list, + &rds_ring->free_list); + rx_buf->ref_handle = i; + rx_buf->state = NETXEN_BUFFER_FREE; + rx_buf++; + } + spin_lock_init(&rds_ring->lock); + } + + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + sds_ring->irq = adapter->msix_entries[ring].vector; + sds_ring->adapter = adapter; + sds_ring->num_desc = adapter->num_rxd; + + for (i = 0; i < NUM_RCV_DESC_RINGS; i++) + INIT_LIST_HEAD(&sds_ring->free_list[i]); + } + + return 0; + +err_out: + netxen_free_sw_resources(adapter); + return -ENOMEM; +} + +/* + * netxen_decode_crb_addr(0 - utility to translate from internal Phantom CRB + * address to external PCI CRB address. + */ +static u32 netxen_decode_crb_addr(u32 addr) +{ + int i; + u32 base_addr, offset, pci_base; + + crb_addr_transform_setup(); + + pci_base = NETXEN_ADDR_ERROR; + base_addr = addr & 0xfff00000; + offset = addr & 0x000fffff; + + for (i = 0; i < NETXEN_MAX_CRB_XFORM; i++) { + if (crb_addr_xform[i] == base_addr) { + pci_base = i << 20; + break; + } + } + if (pci_base == NETXEN_ADDR_ERROR) + return pci_base; + else + return pci_base + offset; +} + +#define NETXEN_MAX_ROM_WAIT_USEC 100 + +static int netxen_wait_rom_done(struct netxen_adapter *adapter) +{ + long timeout = 0; + long done = 0; + + cond_resched(); + + while (done == 0) { + done = NXRD32(adapter, NETXEN_ROMUSB_GLB_STATUS); + done &= 2; + if (++timeout >= NETXEN_MAX_ROM_WAIT_USEC) { + dev_err(&adapter->pdev->dev, + "Timeout reached waiting for rom done"); + return -EIO; + } + udelay(1); + } + return 0; +} + +static int do_rom_fast_read(struct netxen_adapter *adapter, + int addr, int *valp) +{ + NXWR32(adapter, NETXEN_ROMUSB_ROM_ADDRESS, addr); + NXWR32(adapter, NETXEN_ROMUSB_ROM_DUMMY_BYTE_CNT, 0); + NXWR32(adapter, NETXEN_ROMUSB_ROM_ABYTE_CNT, 3); + NXWR32(adapter, NETXEN_ROMUSB_ROM_INSTR_OPCODE, 0xb); + if (netxen_wait_rom_done(adapter)) { + printk("Error waiting for rom done\n"); + return -EIO; + } + /* reset abyte_cnt and dummy_byte_cnt */ + NXWR32(adapter, NETXEN_ROMUSB_ROM_ABYTE_CNT, 0); + udelay(10); + NXWR32(adapter, NETXEN_ROMUSB_ROM_DUMMY_BYTE_CNT, 0); + + *valp = NXRD32(adapter, NETXEN_ROMUSB_ROM_RDATA); + return 0; +} + +static int do_rom_fast_read_words(struct netxen_adapter *adapter, int addr, + u8 *bytes, size_t size) +{ + int addridx; + int ret = 0; + + for (addridx = addr; addridx < (addr + size); addridx += 4) { + int v; + ret = do_rom_fast_read(adapter, addridx, &v); + if (ret != 0) + break; + *(__le32 *)bytes = cpu_to_le32(v); + bytes += 4; + } + + return ret; +} + +int +netxen_rom_fast_read_words(struct netxen_adapter *adapter, int addr, + u8 *bytes, size_t size) +{ + int ret; + + ret = netxen_rom_lock(adapter); + if (ret < 0) + return ret; + + ret = do_rom_fast_read_words(adapter, addr, bytes, size); + + netxen_rom_unlock(adapter); + return ret; +} + +int netxen_rom_fast_read(struct netxen_adapter *adapter, int addr, int *valp) +{ + int ret; + + if (netxen_rom_lock(adapter) != 0) + return -EIO; + + ret = do_rom_fast_read(adapter, addr, valp); + netxen_rom_unlock(adapter); + return ret; +} + +#define NETXEN_BOARDTYPE 0x4008 +#define NETXEN_BOARDNUM 0x400c +#define NETXEN_CHIPNUM 0x4010 + +int netxen_pinit_from_rom(struct netxen_adapter *adapter) +{ + int addr, val; + int i, n, init_delay = 0; + struct crb_addr_pair *buf; + unsigned offset; + u32 off; + + /* resetall */ + netxen_rom_lock(adapter); + NXWR32(adapter, 
NETXEN_ROMUSB_GLB_SW_RESET, 0xfeffffff); + netxen_rom_unlock(adapter); + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + if (netxen_rom_fast_read(adapter, 0, &n) != 0 || + (n != 0xcafecafe) || + netxen_rom_fast_read(adapter, 4, &n) != 0) { + printk(KERN_ERR "%s: ERROR Reading crb_init area: " + "n: %08x\n", netxen_nic_driver_name, n); + return -EIO; + } + offset = n & 0xffffU; + n = (n >> 16) & 0xffffU; + } else { + if (netxen_rom_fast_read(adapter, 0, &n) != 0 || + !(n & 0x80000000)) { + printk(KERN_ERR "%s: ERROR Reading crb_init area: " + "n: %08x\n", netxen_nic_driver_name, n); + return -EIO; + } + offset = 1; + n &= ~0x80000000; + } + + if (n >= 1024) { + printk(KERN_ERR "%s:n=0x%x Error! NetXen card flash not" + " initialized.\n", __func__, n); + return -EIO; + } + + buf = kcalloc(n, sizeof(struct crb_addr_pair), GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + for (i = 0; i < n; i++) { + if (netxen_rom_fast_read(adapter, 8*i + 4*offset, &val) != 0 || + netxen_rom_fast_read(adapter, 8*i + 4*offset + 4, &addr) != 0) { + kfree(buf); + return -EIO; + } + + buf[i].addr = addr; + buf[i].data = val; + + } + + for (i = 0; i < n; i++) { + + off = netxen_decode_crb_addr(buf[i].addr); + if (off == NETXEN_ADDR_ERROR) { + printk(KERN_ERR"CRB init value out of range %x\n", + buf[i].addr); + continue; + } + off += NETXEN_PCI_CRBSPACE; + + if (off & 1) + continue; + + /* skipping cold reboot MAGIC */ + if (off == NETXEN_CAM_RAM(0x1fc)) + continue; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + if (off == (NETXEN_CRB_I2C0 + 0x1c)) + continue; + /* do not reset PCI */ + if (off == (ROMUSB_GLB + 0xbc)) + continue; + if (off == (ROMUSB_GLB + 0xa8)) + continue; + if (off == (ROMUSB_GLB + 0xc8)) /* core clock */ + continue; + if (off == (ROMUSB_GLB + 0x24)) /* MN clock */ + continue; + if (off == (ROMUSB_GLB + 0x1c)) /* MS clock */ + continue; + if ((off & 0x0ff00000) == NETXEN_CRB_DDR_NET) + continue; + if (off == (NETXEN_CRB_PEG_NET_1 + 0x18) && + !NX_IS_REVISION_P3P(adapter->ahw.revision_id)) + buf[i].data = 0x1020; + /* skip the function enable register */ + if (off == NETXEN_PCIE_REG(PCIE_SETUP_FUNCTION)) + continue; + if (off == NETXEN_PCIE_REG(PCIE_SETUP_FUNCTION2)) + continue; + if ((off & 0x0ff00000) == NETXEN_CRB_SMB) + continue; + } + + init_delay = 1; + /* After writing this register, HW needs time for CRB */ + /* to quiet down (else crb_window returns 0xffffffff) */ + if (off == NETXEN_ROMUSB_GLB_SW_RESET) { + init_delay = 1000; + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + /* hold xdma in reset also */ + buf[i].data = NETXEN_NIC_XDMA_RESET; + buf[i].data = 0x8000ff; + } + } + + NXWR32(adapter, off, buf[i].data); + + msleep(init_delay); + } + kfree(buf); + + /* disable_peg_cache_all */ + + /* unreset_net_cache */ + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + val = NXRD32(adapter, NETXEN_ROMUSB_GLB_SW_RESET); + NXWR32(adapter, NETXEN_ROMUSB_GLB_SW_RESET, (val & 0xffffff0f)); + } + + /* p2dn replyCount */ + NXWR32(adapter, NETXEN_CRB_PEG_NET_D + 0xec, 0x1e); + /* disable_peg_cache 0 */ + NXWR32(adapter, NETXEN_CRB_PEG_NET_D + 0x4c, 8); + /* disable_peg_cache 1 */ + NXWR32(adapter, NETXEN_CRB_PEG_NET_I + 0x4c, 8); + + /* peg_clr_all */ + + /* peg_clr 0 */ + NXWR32(adapter, NETXEN_CRB_PEG_NET_0 + 0x8, 0); + NXWR32(adapter, NETXEN_CRB_PEG_NET_0 + 0xc, 0); + /* peg_clr 1 */ + NXWR32(adapter, NETXEN_CRB_PEG_NET_1 + 0x8, 0); + NXWR32(adapter, NETXEN_CRB_PEG_NET_1 + 0xc, 0); + /* peg_clr 2 */ + NXWR32(adapter, NETXEN_CRB_PEG_NET_2 + 0x8, 0); + NXWR32(adapter, 
NETXEN_CRB_PEG_NET_2 + 0xc, 0); + /* peg_clr 3 */ + NXWR32(adapter, NETXEN_CRB_PEG_NET_3 + 0x8, 0); + NXWR32(adapter, NETXEN_CRB_PEG_NET_3 + 0xc, 0); + return 0; +} + +static struct uni_table_desc *nx_get_table_desc(const u8 *unirom, int section) +{ + uint32_t i; + struct uni_table_desc *directory = (struct uni_table_desc *) &unirom[0]; + __le32 entries = cpu_to_le32(directory->num_entries); + + for (i = 0; i < entries; i++) { + + __le32 offs = cpu_to_le32(directory->findex) + + (i * cpu_to_le32(directory->entry_size)); + __le32 tab_type = cpu_to_le32(*((u32 *)&unirom[offs] + 8)); + + if (tab_type == section) + return (struct uni_table_desc *) &unirom[offs]; + } + + return NULL; +} + +#define QLCNIC_FILEHEADER_SIZE (14 * 4) + +static int +netxen_nic_validate_header(struct netxen_adapter *adapter) +{ + const u8 *unirom = adapter->fw->data; + struct uni_table_desc *directory = (struct uni_table_desc *) &unirom[0]; + u32 fw_file_size = adapter->fw->size; + u32 tab_size; + __le32 entries; + __le32 entry_size; + + if (fw_file_size < QLCNIC_FILEHEADER_SIZE) + return -EINVAL; + + entries = cpu_to_le32(directory->num_entries); + entry_size = cpu_to_le32(directory->entry_size); + tab_size = cpu_to_le32(directory->findex) + (entries * entry_size); + + if (fw_file_size < tab_size) + return -EINVAL; + + return 0; +} + +static int +netxen_nic_validate_bootld(struct netxen_adapter *adapter) +{ + struct uni_table_desc *tab_desc; + struct uni_data_desc *descr; + const u8 *unirom = adapter->fw->data; + __le32 idx = cpu_to_le32(*((int *)&unirom[adapter->file_prd_off] + + NX_UNI_BOOTLD_IDX_OFF)); + u32 offs; + u32 tab_size; + u32 data_size; + + tab_desc = nx_get_table_desc(unirom, NX_UNI_DIR_SECT_BOOTLD); + + if (!tab_desc) + return -EINVAL; + + tab_size = cpu_to_le32(tab_desc->findex) + + (cpu_to_le32(tab_desc->entry_size) * (idx + 1)); + + if (adapter->fw->size < tab_size) + return -EINVAL; + + offs = cpu_to_le32(tab_desc->findex) + + (cpu_to_le32(tab_desc->entry_size) * (idx)); + descr = (struct uni_data_desc *)&unirom[offs]; + + data_size = cpu_to_le32(descr->findex) + cpu_to_le32(descr->size); + + if (adapter->fw->size < data_size) + return -EINVAL; + + return 0; +} + +static int +netxen_nic_validate_fw(struct netxen_adapter *adapter) +{ + struct uni_table_desc *tab_desc; + struct uni_data_desc *descr; + const u8 *unirom = adapter->fw->data; + __le32 idx = cpu_to_le32(*((int *)&unirom[adapter->file_prd_off] + + NX_UNI_FIRMWARE_IDX_OFF)); + u32 offs; + u32 tab_size; + u32 data_size; + + tab_desc = nx_get_table_desc(unirom, NX_UNI_DIR_SECT_FW); + + if (!tab_desc) + return -EINVAL; + + tab_size = cpu_to_le32(tab_desc->findex) + + (cpu_to_le32(tab_desc->entry_size) * (idx + 1)); + + if (adapter->fw->size < tab_size) + return -EINVAL; + + offs = cpu_to_le32(tab_desc->findex) + + (cpu_to_le32(tab_desc->entry_size) * (idx)); + descr = (struct uni_data_desc *)&unirom[offs]; + data_size = cpu_to_le32(descr->findex) + cpu_to_le32(descr->size); + + if (adapter->fw->size < data_size) + return -EINVAL; + + return 0; +} + + +static int +netxen_nic_validate_product_offs(struct netxen_adapter *adapter) +{ + struct uni_table_desc *ptab_descr; + const u8 *unirom = adapter->fw->data; + int mn_present = (NX_IS_REVISION_P2(adapter->ahw.revision_id)) ? 
+ 1 : netxen_p3_has_mn(adapter); + __le32 entries; + __le32 entry_size; + u32 tab_size; + u32 i; + + ptab_descr = nx_get_table_desc(unirom, NX_UNI_DIR_SECT_PRODUCT_TBL); + if (ptab_descr == NULL) + return -EINVAL; + + entries = cpu_to_le32(ptab_descr->num_entries); + entry_size = cpu_to_le32(ptab_descr->entry_size); + tab_size = cpu_to_le32(ptab_descr->findex) + (entries * entry_size); + + if (adapter->fw->size < tab_size) + return -EINVAL; + +nomn: + for (i = 0; i < entries; i++) { + + __le32 flags, file_chiprev, offs; + u8 chiprev = adapter->ahw.revision_id; + uint32_t flagbit; + + offs = cpu_to_le32(ptab_descr->findex) + + (i * cpu_to_le32(ptab_descr->entry_size)); + flags = cpu_to_le32(*((int *)&unirom[offs] + NX_UNI_FLAGS_OFF)); + file_chiprev = cpu_to_le32(*((int *)&unirom[offs] + + NX_UNI_CHIP_REV_OFF)); + + flagbit = mn_present ? 1 : 2; + + if ((chiprev == file_chiprev) && + ((1ULL << flagbit) & flags)) { + adapter->file_prd_off = offs; + return 0; + } + } + + if (mn_present && NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + mn_present = 0; + goto nomn; + } + + return -EINVAL; +} + +static int +netxen_nic_validate_unified_romimage(struct netxen_adapter *adapter) +{ + if (netxen_nic_validate_header(adapter)) { + dev_err(&adapter->pdev->dev, + "unified image: header validation failed\n"); + return -EINVAL; + } + + if (netxen_nic_validate_product_offs(adapter)) { + dev_err(&adapter->pdev->dev, + "unified image: product validation failed\n"); + return -EINVAL; + } + + if (netxen_nic_validate_bootld(adapter)) { + dev_err(&adapter->pdev->dev, + "unified image: bootld validation failed\n"); + return -EINVAL; + } + + if (netxen_nic_validate_fw(adapter)) { + dev_err(&adapter->pdev->dev, + "unified image: firmware validation failed\n"); + return -EINVAL; + } + + return 0; +} + +static struct uni_data_desc *nx_get_data_desc(struct netxen_adapter *adapter, + u32 section, u32 idx_offset) +{ + const u8 *unirom = adapter->fw->data; + int idx = cpu_to_le32(*((int *)&unirom[adapter->file_prd_off] + + idx_offset)); + struct uni_table_desc *tab_desc; + __le32 offs; + + tab_desc = nx_get_table_desc(unirom, section); + + if (tab_desc == NULL) + return NULL; + + offs = cpu_to_le32(tab_desc->findex) + + (cpu_to_le32(tab_desc->entry_size) * idx); + + return (struct uni_data_desc *)&unirom[offs]; +} + +static u8 * +nx_get_bootld_offs(struct netxen_adapter *adapter) +{ + u32 offs = NETXEN_BOOTLD_START; + + if (adapter->fw_type == NX_UNIFIED_ROMIMAGE) + offs = cpu_to_le32((nx_get_data_desc(adapter, + NX_UNI_DIR_SECT_BOOTLD, + NX_UNI_BOOTLD_IDX_OFF))->findex); + + return (u8 *)&adapter->fw->data[offs]; +} + +static u8 * +nx_get_fw_offs(struct netxen_adapter *adapter) +{ + u32 offs = NETXEN_IMAGE_START; + + if (adapter->fw_type == NX_UNIFIED_ROMIMAGE) + offs = cpu_to_le32((nx_get_data_desc(adapter, + NX_UNI_DIR_SECT_FW, + NX_UNI_FIRMWARE_IDX_OFF))->findex); + + return (u8 *)&adapter->fw->data[offs]; +} + +static __le32 +nx_get_fw_size(struct netxen_adapter *adapter) +{ + if (adapter->fw_type == NX_UNIFIED_ROMIMAGE) + return cpu_to_le32((nx_get_data_desc(adapter, + NX_UNI_DIR_SECT_FW, + NX_UNI_FIRMWARE_IDX_OFF))->size); + else + return cpu_to_le32( + *(u32 *)&adapter->fw->data[NX_FW_SIZE_OFFSET]); +} + +static __le32 +nx_get_fw_version(struct netxen_adapter *adapter) +{ + struct uni_data_desc *fw_data_desc; + const struct firmware *fw = adapter->fw; + __le32 major, minor, sub; + const u8 *ver_str; + int i, ret = 0; + + if (adapter->fw_type == NX_UNIFIED_ROMIMAGE) { + + fw_data_desc = nx_get_data_desc(adapter, 
+ NX_UNI_DIR_SECT_FW, NX_UNI_FIRMWARE_IDX_OFF); + ver_str = fw->data + cpu_to_le32(fw_data_desc->findex) + + cpu_to_le32(fw_data_desc->size) - 17; + + for (i = 0; i < 12; i++) { + if (!strncmp(&ver_str[i], "REV=", 4)) { + ret = sscanf(&ver_str[i+4], "%u.%u.%u ", + &major, &minor, &sub); + break; + } + } + + if (ret != 3) + return 0; + + return major + (minor << 8) + (sub << 16); + + } else + return cpu_to_le32(*(u32 *)&fw->data[NX_FW_VERSION_OFFSET]); +} + +static __le32 +nx_get_bios_version(struct netxen_adapter *adapter) +{ + const struct firmware *fw = adapter->fw; + __le32 bios_ver, prd_off = adapter->file_prd_off; + + if (adapter->fw_type == NX_UNIFIED_ROMIMAGE) { + bios_ver = cpu_to_le32(*((u32 *) (&fw->data[prd_off]) + + NX_UNI_BIOS_VERSION_OFF)); + return (bios_ver << 16) + ((bios_ver >> 8) & 0xff00) + + (bios_ver >> 24); + } else + return cpu_to_le32(*(u32 *)&fw->data[NX_BIOS_VERSION_OFFSET]); + +} + +int +netxen_need_fw_reset(struct netxen_adapter *adapter) +{ + u32 count, old_count; + u32 val, version, major, minor, build; + int i, timeout; + u8 fw_type; + + /* NX2031 firmware doesn't support heartbit */ + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return 1; + + if (adapter->need_fw_reset) + return 1; + + /* last attempt had failed */ + if (NXRD32(adapter, CRB_CMDPEG_STATE) == PHAN_INITIALIZE_FAILED) + return 1; + + old_count = NXRD32(adapter, NETXEN_PEG_ALIVE_COUNTER); + + for (i = 0; i < 10; i++) { + + timeout = msleep_interruptible(200); + if (timeout) { + NXWR32(adapter, CRB_CMDPEG_STATE, + PHAN_INITIALIZE_FAILED); + return -EINTR; + } + + count = NXRD32(adapter, NETXEN_PEG_ALIVE_COUNTER); + if (count != old_count) + break; + } + + /* firmware is dead */ + if (count == old_count) + return 1; + + /* check if we have got newer or different file firmware */ + if (adapter->fw) { + + val = nx_get_fw_version(adapter); + + version = NETXEN_DECODE_VERSION(val); + + major = NXRD32(adapter, NETXEN_FW_VERSION_MAJOR); + minor = NXRD32(adapter, NETXEN_FW_VERSION_MINOR); + build = NXRD32(adapter, NETXEN_FW_VERSION_SUB); + + if (version > NETXEN_VERSION_CODE(major, minor, build)) + return 1; + + if (version == NETXEN_VERSION_CODE(major, minor, build) && + adapter->fw_type != NX_UNIFIED_ROMIMAGE) { + + val = NXRD32(adapter, NETXEN_MIU_MN_CONTROL); + fw_type = (val & 0x4) ? + NX_P3_CT_ROMIMAGE : NX_P3_MN_ROMIMAGE; + + if (adapter->fw_type != fw_type) + return 1; + } + } + + return 0; +} + +#define NETXEN_MIN_P3_FW_SUPP NETXEN_VERSION_CODE(4, 0, 505) + +int +netxen_check_flash_fw_compatibility(struct netxen_adapter *adapter) +{ + u32 flash_fw_ver, min_fw_ver; + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return 0; + + if (netxen_rom_fast_read(adapter, + NX_FW_VERSION_OFFSET, (int *)&flash_fw_ver)) { + dev_err(&adapter->pdev->dev, "Unable to read flash fw" + "version\n"); + return -EIO; + } + + flash_fw_ver = NETXEN_DECODE_VERSION(flash_fw_ver); + min_fw_ver = NETXEN_MIN_P3_FW_SUPP; + if (flash_fw_ver >= min_fw_ver) + return 0; + + dev_info(&adapter->pdev->dev, "Flash fw[%d.%d.%d] is < min fw supported" + "[4.0.505]. 
Please update firmware on flash\n", + _major(flash_fw_ver), _minor(flash_fw_ver), + _build(flash_fw_ver)); + return -EINVAL; +} + +static char *fw_name[] = { + NX_P2_MN_ROMIMAGE_NAME, + NX_P3_CT_ROMIMAGE_NAME, + NX_P3_MN_ROMIMAGE_NAME, + NX_UNIFIED_ROMIMAGE_NAME, + NX_FLASH_ROMIMAGE_NAME, +}; + +int +netxen_load_firmware(struct netxen_adapter *adapter) +{ + u64 *ptr64; + u32 i, flashaddr, size; + const struct firmware *fw = adapter->fw; + struct pci_dev *pdev = adapter->pdev; + + dev_info(&pdev->dev, "loading firmware from %s\n", + fw_name[adapter->fw_type]); + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + NXWR32(adapter, NETXEN_ROMUSB_GLB_CAS_RST, 1); + + if (fw) { + __le64 data; + + size = (NETXEN_IMAGE_START - NETXEN_BOOTLD_START) / 8; + + ptr64 = (u64 *)nx_get_bootld_offs(adapter); + flashaddr = NETXEN_BOOTLD_START; + + for (i = 0; i < size; i++) { + data = cpu_to_le64(ptr64[i]); + + if (adapter->pci_mem_write(adapter, flashaddr, data)) + return -EIO; + + flashaddr += 8; + } + + size = (__force u32)nx_get_fw_size(adapter) / 8; + + ptr64 = (u64 *)nx_get_fw_offs(adapter); + flashaddr = NETXEN_IMAGE_START; + + for (i = 0; i < size; i++) { + data = cpu_to_le64(ptr64[i]); + + if (adapter->pci_mem_write(adapter, + flashaddr, data)) + return -EIO; + + flashaddr += 8; + } + + size = (__force u32)nx_get_fw_size(adapter) % 8; + if (size) { + data = cpu_to_le64(ptr64[i]); + + if (adapter->pci_mem_write(adapter, + flashaddr, data)) + return -EIO; + } + + } else { + u64 data; + u32 hi, lo; + + size = (NETXEN_IMAGE_START - NETXEN_BOOTLD_START) / 8; + flashaddr = NETXEN_BOOTLD_START; + + for (i = 0; i < size; i++) { + if (netxen_rom_fast_read(adapter, + flashaddr, (int *)&lo) != 0) + return -EIO; + if (netxen_rom_fast_read(adapter, + flashaddr + 4, (int *)&hi) != 0) + return -EIO; + + /* hi, lo are already in host endian byteorder */ + data = (((u64)hi << 32) | lo); + + if (adapter->pci_mem_write(adapter, + flashaddr, data)) + return -EIO; + + flashaddr += 8; + } + } + msleep(1); + + if (NX_IS_REVISION_P3P(adapter->ahw.revision_id)) { + NXWR32(adapter, NETXEN_CRB_PEG_NET_0 + 0x18, 0x1020); + NXWR32(adapter, NETXEN_ROMUSB_GLB_SW_RESET, 0x80001e); + } else if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + NXWR32(adapter, NETXEN_ROMUSB_GLB_SW_RESET, 0x80001d); + else { + NXWR32(adapter, NETXEN_ROMUSB_GLB_CHIP_CLK_CTRL, 0x3fff); + NXWR32(adapter, NETXEN_ROMUSB_GLB_CAS_RST, 0); + } + + return 0; +} + +static int +netxen_validate_firmware(struct netxen_adapter *adapter) +{ + __le32 val; + __le32 flash_fw_ver; + u32 file_fw_ver, min_ver, bios; + struct pci_dev *pdev = adapter->pdev; + const struct firmware *fw = adapter->fw; + u8 fw_type = adapter->fw_type; + u32 crbinit_fix_fw; + + if (fw_type == NX_UNIFIED_ROMIMAGE) { + if (netxen_nic_validate_unified_romimage(adapter)) + return -EINVAL; + } else { + val = cpu_to_le32(*(u32 *)&fw->data[NX_FW_MAGIC_OFFSET]); + if ((__force u32)val != NETXEN_BDINFO_MAGIC) + return -EINVAL; + + if (fw->size < NX_FW_MIN_SIZE) + return -EINVAL; + } + + val = nx_get_fw_version(adapter); + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + min_ver = NETXEN_MIN_P3_FW_SUPP; + else + min_ver = NETXEN_VERSION_CODE(3, 4, 216); + + file_fw_ver = NETXEN_DECODE_VERSION(val); + + if ((_major(file_fw_ver) > _NETXEN_NIC_LINUX_MAJOR) || + (file_fw_ver < min_ver)) { + dev_err(&pdev->dev, + "%s: firmware version %d.%d.%d unsupported\n", + fw_name[fw_type], _major(file_fw_ver), _minor(file_fw_ver), + _build(file_fw_ver)); + return -EINVAL; + } + val = nx_get_bios_version(adapter); + 
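+	/* the BIOS version embedded in the firmware image must match the one programmed in flash */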
netxen_rom_fast_read(adapter, NX_BIOS_VERSION_OFFSET, (int *)&bios); + if ((__force u32)val != bios) { + dev_err(&pdev->dev, "%s: firmware bios is incompatible\n", + fw_name[fw_type]); + return -EINVAL; + } + + if (netxen_rom_fast_read(adapter, + NX_FW_VERSION_OFFSET, (int *)&flash_fw_ver)) { + dev_err(&pdev->dev, "Unable to read flash fw version\n"); + return -EIO; + } + flash_fw_ver = NETXEN_DECODE_VERSION(flash_fw_ver); + + /* New fw from file is not allowed, if fw on flash is < 4.0.554 */ + crbinit_fix_fw = NETXEN_VERSION_CODE(4, 0, 554); + if (file_fw_ver >= crbinit_fix_fw && flash_fw_ver < crbinit_fix_fw && + NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + dev_err(&pdev->dev, "Incompatibility detected between driver " + "and firmware version on flash. This configuration " + "is not recommended. Please update the firmware on " + "flash immediately\n"); + return -EINVAL; + } + + /* check if flashed firmware is newer only for no-mn and P2 case*/ + if (!netxen_p3_has_mn(adapter) || + NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + if (flash_fw_ver > file_fw_ver) { + dev_info(&pdev->dev, "%s: firmware is older than flash\n", + fw_name[fw_type]); + return -EINVAL; + } + } + + NXWR32(adapter, NETXEN_CAM_RAM(0x1fc), NETXEN_BDINFO_MAGIC); + return 0; +} + +static void +nx_get_next_fwtype(struct netxen_adapter *adapter) +{ + u8 fw_type; + + switch (adapter->fw_type) { + case NX_UNKNOWN_ROMIMAGE: + fw_type = NX_UNIFIED_ROMIMAGE; + break; + + case NX_UNIFIED_ROMIMAGE: + if (NX_IS_REVISION_P3P(adapter->ahw.revision_id)) + fw_type = NX_FLASH_ROMIMAGE; + else if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + fw_type = NX_P2_MN_ROMIMAGE; + else if (netxen_p3_has_mn(adapter)) + fw_type = NX_P3_MN_ROMIMAGE; + else + fw_type = NX_P3_CT_ROMIMAGE; + break; + + case NX_P3_MN_ROMIMAGE: + fw_type = NX_P3_CT_ROMIMAGE; + break; + + case NX_P2_MN_ROMIMAGE: + case NX_P3_CT_ROMIMAGE: + default: + fw_type = NX_FLASH_ROMIMAGE; + break; + } + + adapter->fw_type = fw_type; +} + +static int +netxen_p3_has_mn(struct netxen_adapter *adapter) +{ + u32 capability, flashed_ver; + capability = 0; + + /* NX2031 always had MN */ + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return 1; + + netxen_rom_fast_read(adapter, + NX_FW_VERSION_OFFSET, (int *)&flashed_ver); + flashed_ver = NETXEN_DECODE_VERSION(flashed_ver); + + if (flashed_ver >= NETXEN_VERSION_CODE(4, 0, 220)) { + + capability = NXRD32(adapter, NX_PEG_TUNE_CAPABILITY); + if (capability & NX_PEG_TUNE_MN_PRESENT) + return 1; + } + return 0; +} + +void netxen_request_firmware(struct netxen_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + int rc = 0; + + adapter->fw_type = NX_UNKNOWN_ROMIMAGE; + +next: + nx_get_next_fwtype(adapter); + + if (adapter->fw_type == NX_FLASH_ROMIMAGE) { + adapter->fw = NULL; + } else { + rc = request_firmware(&adapter->fw, + fw_name[adapter->fw_type], &pdev->dev); + if (rc != 0) + goto next; + + rc = netxen_validate_firmware(adapter); + if (rc != 0) { + release_firmware(adapter->fw); + msleep(1); + goto next; + } + } +} + + +void +netxen_release_firmware(struct netxen_adapter *adapter) +{ + release_firmware(adapter->fw); + adapter->fw = NULL; +} + +int netxen_init_dummy_dma(struct netxen_adapter *adapter) +{ + u64 addr; + u32 hi, lo; + + if (!NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return 0; + + adapter->dummy_dma.addr = pci_alloc_consistent(adapter->pdev, + NETXEN_HOST_DUMMY_DMA_SIZE, + &adapter->dummy_dma.phys_addr); + if (adapter->dummy_dma.addr == NULL) { + dev_err(&adapter->pdev->dev, + "ERROR: Could not 
allocate dummy DMA memory\n"); + return -ENOMEM; + } + + addr = (uint64_t) adapter->dummy_dma.phys_addr; + hi = (addr >> 32) & 0xffffffff; + lo = addr & 0xffffffff; + + NXWR32(adapter, CRB_HOST_DUMMY_BUF_ADDR_HI, hi); + NXWR32(adapter, CRB_HOST_DUMMY_BUF_ADDR_LO, lo); + + return 0; +} + +/* + * NetXen DMA watchdog control: + * + * Bit 0 : enabled => R/O: 1 watchdog active, 0 inactive + * Bit 1 : disable_request => 1 req disable dma watchdog + * Bit 2 : enable_request => 1 req enable dma watchdog + * Bit 3-31 : unused + */ +void netxen_free_dummy_dma(struct netxen_adapter *adapter) +{ + int i = 100; + u32 ctrl; + + if (!NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return; + + if (!adapter->dummy_dma.addr) + return; + + ctrl = NXRD32(adapter, NETXEN_DMA_WATCHDOG_CTRL); + if ((ctrl & 0x1) != 0) { + NXWR32(adapter, NETXEN_DMA_WATCHDOG_CTRL, (ctrl | 0x2)); + + while ((ctrl & 0x1) != 0) { + + msleep(50); + + ctrl = NXRD32(adapter, NETXEN_DMA_WATCHDOG_CTRL); + + if (--i == 0) + break; + } + } + + if (i) { + pci_free_consistent(adapter->pdev, + NETXEN_HOST_DUMMY_DMA_SIZE, + adapter->dummy_dma.addr, + adapter->dummy_dma.phys_addr); + adapter->dummy_dma.addr = NULL; + } else + dev_err(&adapter->pdev->dev, "dma_watchdog_shutdown failed\n"); +} + +int netxen_phantom_init(struct netxen_adapter *adapter, int pegtune_val) +{ + u32 val = 0; + int retries = 60; + + if (pegtune_val) + return 0; + + do { + val = NXRD32(adapter, CRB_CMDPEG_STATE); + switch (val) { + case PHAN_INITIALIZE_COMPLETE: + case PHAN_INITIALIZE_ACK: + return 0; + case PHAN_INITIALIZE_FAILED: + goto out_err; + default: + break; + } + + msleep(500); + + } while (--retries); + + NXWR32(adapter, CRB_CMDPEG_STATE, PHAN_INITIALIZE_FAILED); + +out_err: + dev_warn(&adapter->pdev->dev, "firmware init failed\n"); + return -EIO; +} + +static int +netxen_receive_peg_ready(struct netxen_adapter *adapter) +{ + u32 val = 0; + int retries = 2000; + + do { + val = NXRD32(adapter, CRB_RCVPEG_STATE); + + if (val == PHAN_PEG_RCV_INITIALIZED) + return 0; + + msleep(10); + + } while (--retries); + + pr_err("Receive Peg initialization not complete, state: 0x%x.\n", val); + return -EIO; +} + +int netxen_init_firmware(struct netxen_adapter *adapter) +{ + int err; + + err = netxen_receive_peg_ready(adapter); + if (err) + return err; + + NXWR32(adapter, CRB_NIC_CAPABILITIES_HOST, INTR_SCHEME_PERPORT); + NXWR32(adapter, CRB_MPORT_MODE, MPORT_MULTI_FUNCTION_MODE); + NXWR32(adapter, CRB_CMDPEG_STATE, PHAN_INITIALIZE_ACK); + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + NXWR32(adapter, CRB_NIC_MSI_MODE_HOST, MSI_MODE_MULTIFUNC); + + return err; +} + +static void +netxen_handle_linkevent(struct netxen_adapter *adapter, nx_fw_msg_t *msg) +{ + u32 cable_OUI; + u16 cable_len; + u16 link_speed; + u8 link_status, module, duplex, autoneg; + struct net_device *netdev = adapter->netdev; + + adapter->has_link_events = 1; + + cable_OUI = msg->body[1] & 0xffffffff; + cable_len = (msg->body[1] >> 32) & 0xffff; + link_speed = (msg->body[1] >> 48) & 0xffff; + + link_status = msg->body[2] & 0xff; + duplex = (msg->body[2] >> 16) & 0xff; + autoneg = (msg->body[2] >> 24) & 0xff; + + module = (msg->body[2] >> 8) & 0xff; + if (module == LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLE) { + printk(KERN_INFO "%s: unsupported cable: OUI 0x%x, length %d\n", + netdev->name, cable_OUI, cable_len); + } else if (module == LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLELEN) { + printk(KERN_INFO "%s: unsupported cable length %d\n", + netdev->name, cable_len); + } + + /* update link parameters */ + 
if (duplex == LINKEVENT_FULL_DUPLEX) + adapter->link_duplex = DUPLEX_FULL; + else + adapter->link_duplex = DUPLEX_HALF; + adapter->module_type = module; + adapter->link_autoneg = autoneg; + adapter->link_speed = link_speed; + + netxen_advert_link_change(adapter, link_status); +} + +static void +netxen_handle_fw_message(int desc_cnt, int index, + struct nx_host_sds_ring *sds_ring) +{ + nx_fw_msg_t msg; + struct status_desc *desc; + int i = 0, opcode; + + while (desc_cnt > 0 && i < 8) { + desc = &sds_ring->desc_head[index]; + msg.words[i++] = le64_to_cpu(desc->status_desc_data[0]); + msg.words[i++] = le64_to_cpu(desc->status_desc_data[1]); + + index = get_next_index(index, sds_ring->num_desc); + desc_cnt--; + } + + opcode = netxen_get_nic_msg_opcode(msg.body[0]); + switch (opcode) { + case NX_NIC_C2H_OPCODE_GET_LINKEVENT_RESPONSE: + netxen_handle_linkevent(sds_ring->adapter, &msg); + break; + default: + break; + } +} + +static int +netxen_alloc_rx_skb(struct netxen_adapter *adapter, + struct nx_host_rds_ring *rds_ring, + struct netxen_rx_buffer *buffer) +{ + struct sk_buff *skb; + dma_addr_t dma; + struct pci_dev *pdev = adapter->pdev; + + buffer->skb = netdev_alloc_skb(adapter->netdev, rds_ring->skb_size); + if (!buffer->skb) + return 1; + + skb = buffer->skb; + + if (!adapter->ahw.cut_through) + skb_reserve(skb, 2); + + dma = pci_map_single(pdev, skb->data, + rds_ring->dma_size, PCI_DMA_FROMDEVICE); + + if (pci_dma_mapping_error(pdev, dma)) { + dev_kfree_skb_any(skb); + buffer->skb = NULL; + return 1; + } + + buffer->skb = skb; + buffer->dma = dma; + buffer->state = NETXEN_BUFFER_BUSY; + + return 0; +} + +static struct sk_buff *netxen_process_rxbuf(struct netxen_adapter *adapter, + struct nx_host_rds_ring *rds_ring, u16 index, u16 cksum) +{ + struct netxen_rx_buffer *buffer; + struct sk_buff *skb; + + buffer = &rds_ring->rx_buf_arr[index]; + + pci_unmap_single(adapter->pdev, buffer->dma, rds_ring->dma_size, + PCI_DMA_FROMDEVICE); + + skb = buffer->skb; + if (!skb) + goto no_skb; + + if (likely((adapter->netdev->features & NETIF_F_RXCSUM) + && cksum == STATUS_CKSUM_OK)) { + adapter->stats.csummed++; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else + skb->ip_summed = CHECKSUM_NONE; + + buffer->skb = NULL; +no_skb: + buffer->state = NETXEN_BUFFER_FREE; + return skb; +} + +static struct netxen_rx_buffer * +netxen_process_rcv(struct netxen_adapter *adapter, + struct nx_host_sds_ring *sds_ring, + int ring, u64 sts_data0) +{ + struct net_device *netdev = adapter->netdev; + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + struct netxen_rx_buffer *buffer; + struct sk_buff *skb; + struct nx_host_rds_ring *rds_ring; + int index, length, cksum, pkt_offset; + + if (unlikely(ring >= adapter->max_rds_rings)) + return NULL; + + rds_ring = &recv_ctx->rds_rings[ring]; + + index = netxen_get_sts_refhandle(sts_data0); + if (unlikely(index >= rds_ring->num_desc)) + return NULL; + + buffer = &rds_ring->rx_buf_arr[index]; + + length = netxen_get_sts_totallength(sts_data0); + cksum = netxen_get_sts_status(sts_data0); + pkt_offset = netxen_get_sts_pkt_offset(sts_data0); + + skb = netxen_process_rxbuf(adapter, rds_ring, index, cksum); + if (!skb) + return buffer; + + if (length > rds_ring->skb_size) + skb_put(skb, rds_ring->skb_size); + else + skb_put(skb, length); + + + if (pkt_offset) + skb_pull(skb, pkt_offset); + + skb->protocol = eth_type_trans(skb, netdev); + + napi_gro_receive(&sds_ring->napi, skb); + + adapter->stats.rx_pkts++; + adapter->stats.rxbytes += length; + + return buffer; +} + 
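The LRO completion path below (netxen_process_lro) patches the firmware-aggregated packet's IP header in place: the new total length is written into iph->tot_len, and the header checksum is adjusted incrementally with csum_replace2() instead of being recomputed over the whole header. A minimal stand-alone sketch of that incremental update in the RFC 1624 form HC' = ~(~HC + ~m + m'); the helper name is illustrative only (it is not part of the driver), and all three 16-bit values are assumed to be in the same byte order:

#include <stdint.h>

/* Update a ones'-complement checksum when a single 16-bit word of the
 * checksummed data changes from old_word to new_word (the checksum field
 * itself is not the word being changed), in the spirit of csum_replace2().
 */
uint16_t csum16_replace(uint16_t check, uint16_t old_word, uint16_t new_word)
{
	uint32_t sum = (uint16_t)~check;	/* ~HC  */

	sum += (uint16_t)~old_word;		/* + ~m */
	sum += new_word;			/* + m' */
	sum = (sum & 0xffff) + (sum >> 16);	/* fold end-around carries */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;			/* HC'  */
}

Called as csum16_replace(check, old_tot_len, new_tot_len), this mirrors the csum_replace2(&iph->check, iph->tot_len, htons(length)) call in netxen_process_lro() below.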
+#define TCP_HDR_SIZE 20 +#define TCP_TS_OPTION_SIZE 12 +#define TCP_TS_HDR_SIZE (TCP_HDR_SIZE + TCP_TS_OPTION_SIZE) + +static struct netxen_rx_buffer * +netxen_process_lro(struct netxen_adapter *adapter, + struct nx_host_sds_ring *sds_ring, + int ring, u64 sts_data0, u64 sts_data1) +{ + struct net_device *netdev = adapter->netdev; + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + struct netxen_rx_buffer *buffer; + struct sk_buff *skb; + struct nx_host_rds_ring *rds_ring; + struct iphdr *iph; + struct tcphdr *th; + bool push, timestamp; + int l2_hdr_offset, l4_hdr_offset; + int index; + u16 lro_length, length, data_offset; + u32 seq_number; + u8 vhdr_len = 0; + + if (unlikely(ring >= adapter->max_rds_rings)) + return NULL; + + rds_ring = &recv_ctx->rds_rings[ring]; + + index = netxen_get_lro_sts_refhandle(sts_data0); + if (unlikely(index >= rds_ring->num_desc)) + return NULL; + + buffer = &rds_ring->rx_buf_arr[index]; + + timestamp = netxen_get_lro_sts_timestamp(sts_data0); + lro_length = netxen_get_lro_sts_length(sts_data0); + l2_hdr_offset = netxen_get_lro_sts_l2_hdr_offset(sts_data0); + l4_hdr_offset = netxen_get_lro_sts_l4_hdr_offset(sts_data0); + push = netxen_get_lro_sts_push_flag(sts_data0); + seq_number = netxen_get_lro_sts_seq_number(sts_data1); + + skb = netxen_process_rxbuf(adapter, rds_ring, index, STATUS_CKSUM_OK); + if (!skb) + return buffer; + + if (timestamp) + data_offset = l4_hdr_offset + TCP_TS_HDR_SIZE; + else + data_offset = l4_hdr_offset + TCP_HDR_SIZE; + + skb_put(skb, lro_length + data_offset); + + skb_pull(skb, l2_hdr_offset); + skb->protocol = eth_type_trans(skb, netdev); + + if (skb->protocol == htons(ETH_P_8021Q)) + vhdr_len = VLAN_HLEN; + iph = (struct iphdr *)(skb->data + vhdr_len); + th = (struct tcphdr *)((skb->data + vhdr_len) + (iph->ihl << 2)); + + length = (iph->ihl << 2) + (th->doff << 2) + lro_length; + csum_replace2(&iph->check, iph->tot_len, htons(length)); + iph->tot_len = htons(length); + th->psh = push; + th->seq = htonl(seq_number); + + length = skb->len; + + if (adapter->flags & NETXEN_FW_MSS_CAP) + skb_shinfo(skb)->gso_size = netxen_get_lro_sts_mss(sts_data1); + + netif_receive_skb(skb); + + adapter->stats.lro_pkts++; + adapter->stats.rxbytes += length; + + return buffer; +} + +#define netxen_merge_rx_buffers(list, head) \ + do { list_splice_tail_init(list, head); } while (0); + +int +netxen_process_rcv_ring(struct nx_host_sds_ring *sds_ring, int max) +{ + struct netxen_adapter *adapter = sds_ring->adapter; + + struct list_head *cur; + + struct status_desc *desc; + struct netxen_rx_buffer *rxbuf; + + u32 consumer = sds_ring->consumer; + + int count = 0; + u64 sts_data0, sts_data1; + int opcode, ring = 0, desc_cnt; + + while (count < max) { + desc = &sds_ring->desc_head[consumer]; + sts_data0 = le64_to_cpu(desc->status_desc_data[0]); + + if (!(sts_data0 & STATUS_OWNER_HOST)) + break; + + desc_cnt = netxen_get_sts_desc_cnt(sts_data0); + + opcode = netxen_get_sts_opcode(sts_data0); + + switch (opcode) { + case NETXEN_NIC_RXPKT_DESC: + case NETXEN_OLD_RXPKT_DESC: + case NETXEN_NIC_SYN_OFFLOAD: + ring = netxen_get_sts_type(sts_data0); + rxbuf = netxen_process_rcv(adapter, sds_ring, + ring, sts_data0); + break; + case NETXEN_NIC_LRO_DESC: + ring = netxen_get_lro_sts_type(sts_data0); + sts_data1 = le64_to_cpu(desc->status_desc_data[1]); + rxbuf = netxen_process_lro(adapter, sds_ring, + ring, sts_data0, sts_data1); + break; + case NETXEN_NIC_RESPONSE_DESC: + netxen_handle_fw_message(desc_cnt, consumer, sds_ring); + default: + goto skip; + 
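+ /* response and unrecognized descriptors carry no rx buffer; the skip path below only returns descriptor ownership to the firmware */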
} + + WARN_ON(desc_cnt > 1); + + if (rxbuf) + list_add_tail(&rxbuf->list, &sds_ring->free_list[ring]); + +skip: + for (; desc_cnt > 0; desc_cnt--) { + desc = &sds_ring->desc_head[consumer]; + desc->status_desc_data[0] = + cpu_to_le64(STATUS_OWNER_PHANTOM); + consumer = get_next_index(consumer, sds_ring->num_desc); + } + count++; + } + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + struct nx_host_rds_ring *rds_ring = + &adapter->recv_ctx.rds_rings[ring]; + + if (!list_empty(&sds_ring->free_list[ring])) { + list_for_each(cur, &sds_ring->free_list[ring]) { + rxbuf = list_entry(cur, + struct netxen_rx_buffer, list); + netxen_alloc_rx_skb(adapter, rds_ring, rxbuf); + } + spin_lock(&rds_ring->lock); + netxen_merge_rx_buffers(&sds_ring->free_list[ring], + &rds_ring->free_list); + spin_unlock(&rds_ring->lock); + } + + netxen_post_rx_buffers_nodb(adapter, rds_ring); + } + + if (count) { + sds_ring->consumer = consumer; + NXWRIO(adapter, sds_ring->crb_sts_consumer, consumer); + } + + return count; +} + +/* Process Command status ring */ +int netxen_process_cmd_ring(struct netxen_adapter *adapter) +{ + u32 sw_consumer, hw_consumer; + int count = 0, i; + struct netxen_cmd_buffer *buffer; + struct pci_dev *pdev = adapter->pdev; + struct net_device *netdev = adapter->netdev; + struct netxen_skb_frag *frag; + int done = 0; + struct nx_host_tx_ring *tx_ring = adapter->tx_ring; + + if (!spin_trylock_bh(&adapter->tx_clean_lock)) + return 1; + + sw_consumer = tx_ring->sw_consumer; + hw_consumer = le32_to_cpu(*(tx_ring->hw_consumer)); + + while (sw_consumer != hw_consumer) { + buffer = &tx_ring->cmd_buf_arr[sw_consumer]; + if (buffer->skb) { + frag = &buffer->frag_array[0]; + pci_unmap_single(pdev, frag->dma, frag->length, + PCI_DMA_TODEVICE); + frag->dma = 0ULL; + for (i = 1; i < buffer->frag_count; i++) { + frag++; /* Get the next frag */ + pci_unmap_page(pdev, frag->dma, frag->length, + PCI_DMA_TODEVICE); + frag->dma = 0ULL; + } + + adapter->stats.xmitfinished++; + dev_kfree_skb_any(buffer->skb); + buffer->skb = NULL; + } + + sw_consumer = get_next_index(sw_consumer, tx_ring->num_desc); + if (++count >= MAX_STATUS_HANDLE) + break; + } + + tx_ring->sw_consumer = sw_consumer; + + if (count && netif_running(netdev)) { + smp_mb(); + + if (netif_queue_stopped(netdev) && netif_carrier_ok(netdev)) + if (netxen_tx_avail(tx_ring) > TX_STOP_THRESH) + netif_wake_queue(netdev); + adapter->tx_timeo_cnt = 0; + } + /* + * If everything is freed up to consumer then check if the ring is full + * If the ring is full then check if more needs to be freed and + * schedule the call back again. + * + * This happens when there are 2 CPUs. One could be freeing and the + * other filling it. If the ring is full when we get out of here and + * the card has already interrupted the host then the host can miss the + * interrupt. + * + * There is still a possible race condition and the host could miss an + * interrupt. The card has to take care of this. 
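+ * The hardware consumer is reread below, after the clean loop, so that
+ * "done" also reflects completions that arrived while the ring was being
+ * processed.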
+ */ + hw_consumer = le32_to_cpu(*(tx_ring->hw_consumer)); + done = (sw_consumer == hw_consumer); + spin_unlock_bh(&adapter->tx_clean_lock); + + return done; +} + +void +netxen_post_rx_buffers(struct netxen_adapter *adapter, u32 ringid, + struct nx_host_rds_ring *rds_ring) +{ + struct rcv_desc *pdesc; + struct netxen_rx_buffer *buffer; + int producer, count = 0; + netxen_ctx_msg msg = 0; + struct list_head *head; + + producer = rds_ring->producer; + + head = &rds_ring->free_list; + while (!list_empty(head)) { + + buffer = list_entry(head->next, struct netxen_rx_buffer, list); + + if (!buffer->skb) { + if (netxen_alloc_rx_skb(adapter, rds_ring, buffer)) + break; + } + + count++; + list_del(&buffer->list); + + /* make a rcv descriptor */ + pdesc = &rds_ring->desc_head[producer]; + pdesc->addr_buffer = cpu_to_le64(buffer->dma); + pdesc->reference_handle = cpu_to_le16(buffer->ref_handle); + pdesc->buffer_length = cpu_to_le32(rds_ring->dma_size); + + producer = get_next_index(producer, rds_ring->num_desc); + } + + if (count) { + rds_ring->producer = producer; + NXWRIO(adapter, rds_ring->crb_rcv_producer, + (producer-1) & (rds_ring->num_desc-1)); + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + /* + * Write a doorbell msg to tell phanmon of change in + * receive ring producer + * Only for firmware version < 4.0.0 + */ + netxen_set_msg_peg_id(msg, NETXEN_RCV_PEG_DB_ID); + netxen_set_msg_privid(msg); + netxen_set_msg_count(msg, + ((producer - 1) & + (rds_ring->num_desc - 1))); + netxen_set_msg_ctxid(msg, adapter->portnum); + netxen_set_msg_opcode(msg, NETXEN_RCV_PRODUCER(ringid)); + NXWRIO(adapter, DB_NORMALIZE(adapter, + NETXEN_RCV_PRODUCER_OFFSET), msg); + } + } +} + +static void +netxen_post_rx_buffers_nodb(struct netxen_adapter *adapter, + struct nx_host_rds_ring *rds_ring) +{ + struct rcv_desc *pdesc; + struct netxen_rx_buffer *buffer; + int producer, count = 0; + struct list_head *head; + + if (!spin_trylock(&rds_ring->lock)) + return; + + producer = rds_ring->producer; + + head = &rds_ring->free_list; + while (!list_empty(head)) { + + buffer = list_entry(head->next, struct netxen_rx_buffer, list); + + if (!buffer->skb) { + if (netxen_alloc_rx_skb(adapter, rds_ring, buffer)) + break; + } + + count++; + list_del(&buffer->list); + + /* make a rcv descriptor */ + pdesc = &rds_ring->desc_head[producer]; + pdesc->reference_handle = cpu_to_le16(buffer->ref_handle); + pdesc->buffer_length = cpu_to_le32(rds_ring->dma_size); + pdesc->addr_buffer = cpu_to_le64(buffer->dma); + + producer = get_next_index(producer, rds_ring->num_desc); + } + + if (count) { + rds_ring->producer = producer; + NXWRIO(adapter, rds_ring->crb_rcv_producer, + (producer - 1) & (rds_ring->num_desc - 1)); + } + spin_unlock(&rds_ring->lock); +} + +void netxen_nic_clear_stats(struct netxen_adapter *adapter) +{ + memset(&adapter->stats, 0, sizeof(adapter->stats)); +} + diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c new file mode 100644 index 0000000..8259e83 --- /dev/null +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -0,0 +1,3532 @@ +/* + * Copyright (C) 2003 - 2009 NetXen, Inc. + * Copyright (C) 2009 - QLogic Corporation. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * The full GNU General Public License is included in this distribution + * in the file called "COPYING". + * + */ + +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/interrupt.h> +#include "netxen_nic_hw.h" + +#include "netxen_nic.h" + +#include <linux/dma-mapping.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <linux/inetdevice.h> +#include <linux/sysfs.h> +#include <linux/aer.h> + +MODULE_DESCRIPTION("QLogic/NetXen (1/10) GbE Intelligent Ethernet Driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(NETXEN_NIC_LINUX_VERSIONID); +MODULE_FIRMWARE(NX_UNIFIED_ROMIMAGE_NAME); + +char netxen_nic_driver_name[] = "netxen_nic"; +static char netxen_nic_driver_string[] = "QLogic/NetXen Network Driver v" + NETXEN_NIC_LINUX_VERSIONID; + +static int port_mode = NETXEN_PORT_MODE_AUTO_NEG; + +/* Default to restricted 1G auto-neg mode */ +static int wol_port_mode = 5; + +static int use_msi = 1; + +static int use_msi_x = 1; + +static int auto_fw_reset = AUTO_FW_RESET_ENABLED; +module_param(auto_fw_reset, int, 0644); +MODULE_PARM_DESC(auto_fw_reset, "Auto firmware reset (0=disabled, 1=enabled)"); + +static int netxen_nic_probe(struct pci_dev *pdev, + const struct pci_device_id *ent); +static void netxen_nic_remove(struct pci_dev *pdev); +static int netxen_nic_open(struct net_device *netdev); +static int netxen_nic_close(struct net_device *netdev); +static netdev_tx_t netxen_nic_xmit_frame(struct sk_buff *, + struct net_device *); +static void netxen_tx_timeout(struct net_device *netdev); +static void netxen_tx_timeout_task(struct work_struct *work); +static void netxen_fw_poll_work(struct work_struct *work); +static void netxen_schedule_work(struct netxen_adapter *adapter, + work_func_t func, int delay); +static void netxen_cancel_fw_work(struct netxen_adapter *adapter); +static int netxen_nic_poll(struct napi_struct *napi, int budget); +#ifdef CONFIG_NET_POLL_CONTROLLER +static void netxen_nic_poll_controller(struct net_device *netdev); +#endif + +static void netxen_create_sysfs_entries(struct netxen_adapter *adapter); +static void netxen_remove_sysfs_entries(struct netxen_adapter *adapter); +static void netxen_create_diag_entries(struct netxen_adapter *adapter); +static void netxen_remove_diag_entries(struct netxen_adapter *adapter); +static int nx_dev_request_aer(struct netxen_adapter *adapter); +static int nx_decr_dev_ref_cnt(struct netxen_adapter *adapter); +static int netxen_can_start_firmware(struct netxen_adapter *adapter); + +static irqreturn_t netxen_intr(int irq, void *data); +static irqreturn_t netxen_msi_intr(int irq, void *data); +static irqreturn_t netxen_msix_intr(int irq, void *data); + +static void netxen_free_ip_list(struct netxen_adapter *, bool); +static void netxen_restore_indev_addr(struct net_device *dev, unsigned long); +static void netxen_nic_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *stats); +static int netxen_nic_set_mac(struct net_device *netdev, void *p); + +/* PCI Device ID Table */ +#define ENTRY(device) \ + {PCI_DEVICE(PCI_VENDOR_ID_NETXEN, (device)), \ + .class = PCI_CLASS_NETWORK_ETHERNET << 8, .class_mask = ~0} + +static const struct pci_device_id netxen_pci_tbl[] = { + ENTRY(PCI_DEVICE_ID_NX2031_10GXSR), + ENTRY(PCI_DEVICE_ID_NX2031_10GCX4), + ENTRY(PCI_DEVICE_ID_NX2031_4GCU), +
ENTRY(PCI_DEVICE_ID_NX2031_IMEZ), + ENTRY(PCI_DEVICE_ID_NX2031_HMEZ), + ENTRY(PCI_DEVICE_ID_NX2031_XG_MGMT), + ENTRY(PCI_DEVICE_ID_NX2031_XG_MGMT2), + ENTRY(PCI_DEVICE_ID_NX3031), + {0,} +}; + +MODULE_DEVICE_TABLE(pci, netxen_pci_tbl); + +static uint32_t crb_cmd_producer[4] = { + CRB_CMD_PRODUCER_OFFSET, CRB_CMD_PRODUCER_OFFSET_1, + CRB_CMD_PRODUCER_OFFSET_2, CRB_CMD_PRODUCER_OFFSET_3 +}; + +void +netxen_nic_update_cmd_producer(struct netxen_adapter *adapter, + struct nx_host_tx_ring *tx_ring) +{ + NXWRIO(adapter, tx_ring->crb_cmd_producer, tx_ring->producer); +} + +static uint32_t crb_cmd_consumer[4] = { + CRB_CMD_CONSUMER_OFFSET, CRB_CMD_CONSUMER_OFFSET_1, + CRB_CMD_CONSUMER_OFFSET_2, CRB_CMD_CONSUMER_OFFSET_3 +}; + +static inline void +netxen_nic_update_cmd_consumer(struct netxen_adapter *adapter, + struct nx_host_tx_ring *tx_ring) +{ + NXWRIO(adapter, tx_ring->crb_cmd_consumer, tx_ring->sw_consumer); +} + +static uint32_t msi_tgt_status[8] = { + ISR_INT_TARGET_STATUS, ISR_INT_TARGET_STATUS_F1, + ISR_INT_TARGET_STATUS_F2, ISR_INT_TARGET_STATUS_F3, + ISR_INT_TARGET_STATUS_F4, ISR_INT_TARGET_STATUS_F5, + ISR_INT_TARGET_STATUS_F6, ISR_INT_TARGET_STATUS_F7 +}; + +static struct netxen_legacy_intr_set legacy_intr[] = NX_LEGACY_INTR_CONFIG; + +static inline void netxen_nic_disable_int(struct nx_host_sds_ring *sds_ring) +{ + struct netxen_adapter *adapter = sds_ring->adapter; + + NXWRIO(adapter, sds_ring->crb_intr_mask, 0); +} + +static inline void netxen_nic_enable_int(struct nx_host_sds_ring *sds_ring) +{ + struct netxen_adapter *adapter = sds_ring->adapter; + + NXWRIO(adapter, sds_ring->crb_intr_mask, 0x1); + + if (!NETXEN_IS_MSI_FAMILY(adapter)) + NXWRIO(adapter, adapter->tgt_mask_reg, 0xfbff); +} + +static int +netxen_alloc_sds_rings(struct netxen_recv_context *recv_ctx, int count) +{ + int size = sizeof(struct nx_host_sds_ring) * count; + + recv_ctx->sds_rings = kzalloc(size, GFP_KERNEL); + + return recv_ctx->sds_rings == NULL; +} + +static void +netxen_free_sds_rings(struct netxen_recv_context *recv_ctx) +{ + kfree(recv_ctx->sds_rings); + recv_ctx->sds_rings = NULL; +} + +static int +netxen_napi_add(struct netxen_adapter *adapter, struct net_device *netdev) +{ + int ring; + struct nx_host_sds_ring *sds_ring; + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + + if (netxen_alloc_sds_rings(recv_ctx, adapter->max_sds_rings)) + return -ENOMEM; + + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + netif_napi_add(netdev, &sds_ring->napi, + netxen_nic_poll, NAPI_POLL_WEIGHT); + } + + return 0; +} + +static void +netxen_napi_del(struct netxen_adapter *adapter) +{ + int ring; + struct nx_host_sds_ring *sds_ring; + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + netif_napi_del(&sds_ring->napi); + } + + netxen_free_sds_rings(&adapter->recv_ctx); +} + +static void +netxen_napi_enable(struct netxen_adapter *adapter) +{ + int ring; + struct nx_host_sds_ring *sds_ring; + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + napi_enable(&sds_ring->napi); + netxen_nic_enable_int(sds_ring); + } +} + +static void +netxen_napi_disable(struct netxen_adapter *adapter) +{ + int ring; + struct nx_host_sds_ring *sds_ring; + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + + for (ring = 0; ring < adapter->max_sds_rings; 
ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + netxen_nic_disable_int(sds_ring); + napi_synchronize(&sds_ring->napi); + napi_disable(&sds_ring->napi); + } +} + +static int nx_set_dma_mask(struct netxen_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + uint64_t mask, cmask; + + adapter->pci_using_dac = 0; + + mask = DMA_BIT_MASK(32); + cmask = DMA_BIT_MASK(32); + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { +#ifndef CONFIG_IA64 + mask = DMA_BIT_MASK(35); +#endif + } else { + mask = DMA_BIT_MASK(39); + cmask = mask; + } + + if (pci_set_dma_mask(pdev, mask) == 0 && + pci_set_consistent_dma_mask(pdev, cmask) == 0) { + adapter->pci_using_dac = 1; + return 0; + } + + return -EIO; +} + +/* Update addressable range if firmware supports it */ +static int +nx_update_dma_mask(struct netxen_adapter *adapter) +{ + int change, shift, err; + uint64_t mask, old_mask, old_cmask; + struct pci_dev *pdev = adapter->pdev; + + change = 0; + + shift = NXRD32(adapter, CRB_DMA_SHIFT); + if (shift > 32) + return 0; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id) && (shift > 9)) + change = 1; + else if ((adapter->ahw.revision_id == NX_P2_C1) && (shift <= 4)) + change = 1; + + if (change) { + old_mask = pdev->dma_mask; + old_cmask = pdev->dev.coherent_dma_mask; + + mask = DMA_BIT_MASK(32+shift); + + err = pci_set_dma_mask(pdev, mask); + if (err) + goto err_out; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + + err = pci_set_consistent_dma_mask(pdev, mask); + if (err) + goto err_out; + } + dev_info(&pdev->dev, "using %d-bit dma mask\n", 32+shift); + } + + return 0; + +err_out: + pci_set_dma_mask(pdev, old_mask); + pci_set_consistent_dma_mask(pdev, old_cmask); + return err; +} + +static int +netxen_check_hw_init(struct netxen_adapter *adapter, int first_boot) +{ + u32 val, timeout; + + if (first_boot == 0x55555555) { + /* This is the first boot after power up */ + NXWR32(adapter, NETXEN_CAM_RAM(0x1fc), NETXEN_BDINFO_MAGIC); + + if (!NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return 0; + + /* PCI bus master workaround */ + first_boot = NXRD32(adapter, NETXEN_PCIE_REG(0x4)); + if (!(first_boot & 0x4)) { + first_boot |= 0x4; + NXWR32(adapter, NETXEN_PCIE_REG(0x4), first_boot); + NXRD32(adapter, NETXEN_PCIE_REG(0x4)); + } + + /* This is the first boot after power up */ + first_boot = NXRD32(adapter, NETXEN_ROMUSB_GLB_SW_RESET); + if (first_boot != 0x80000f) { + /* clear the register for future unloads/loads */ + NXWR32(adapter, NETXEN_CAM_RAM(0x1fc), 0); + return -EIO; + } + + /* Start P2 boot loader */ + val = NXRD32(adapter, NETXEN_ROMUSB_GLB_PEGTUNE_DONE); + NXWR32(adapter, NETXEN_ROMUSB_GLB_PEGTUNE_DONE, val | 0x1); + timeout = 0; + do { + msleep(1); + val = NXRD32(adapter, NETXEN_CAM_RAM(0x1fc)); + + if (++timeout > 5000) + return -EIO; + + } while (val == NETXEN_BDINFO_MAGIC); + } + return 0; +} + +static void netxen_set_port_mode(struct netxen_adapter *adapter) +{ + u32 val, data; + + val = adapter->ahw.board_type; + if ((val == NETXEN_BRDTYPE_P3_HMEZ) || + (val == NETXEN_BRDTYPE_P3_XG_LOM)) { + if (port_mode == NETXEN_PORT_MODE_802_3_AP) { + data = NETXEN_PORT_MODE_802_3_AP; + NXWR32(adapter, NETXEN_PORT_MODE_ADDR, data); + } else if (port_mode == NETXEN_PORT_MODE_XG) { + data = NETXEN_PORT_MODE_XG; + NXWR32(adapter, NETXEN_PORT_MODE_ADDR, data); + } else if (port_mode == NETXEN_PORT_MODE_AUTO_NEG_1G) { + data = NETXEN_PORT_MODE_AUTO_NEG_1G; + NXWR32(adapter, NETXEN_PORT_MODE_ADDR, data); + } else if (port_mode == NETXEN_PORT_MODE_AUTO_NEG_XG) { + data = 
NETXEN_PORT_MODE_AUTO_NEG_XG; + NXWR32(adapter, NETXEN_PORT_MODE_ADDR, data); + } else { + data = NETXEN_PORT_MODE_AUTO_NEG; + NXWR32(adapter, NETXEN_PORT_MODE_ADDR, data); + } + + if ((wol_port_mode != NETXEN_PORT_MODE_802_3_AP) && + (wol_port_mode != NETXEN_PORT_MODE_XG) && + (wol_port_mode != NETXEN_PORT_MODE_AUTO_NEG_1G) && + (wol_port_mode != NETXEN_PORT_MODE_AUTO_NEG_XG)) { + wol_port_mode = NETXEN_PORT_MODE_AUTO_NEG; + } + NXWR32(adapter, NETXEN_WOL_PORT_MODE, wol_port_mode); + } +} + +#define PCI_CAP_ID_GEN 0x10 + +static void netxen_pcie_strap_init(struct netxen_adapter *adapter) +{ + u32 pdevfuncsave; + u32 c8c9value = 0; + u32 chicken = 0; + u32 control = 0; + int i, pos; + struct pci_dev *pdev; + + pdev = adapter->pdev; + + chicken = NXRD32(adapter, NETXEN_PCIE_REG(PCIE_CHICKEN3)); + /* clear chicken3.25:24 */ + chicken &= 0xFCFFFFFF; + /* + * if gen1 and B0, set F1020 - if gen 2, do nothing + * if gen2 set to F1000 + */ + pos = pci_find_capability(pdev, PCI_CAP_ID_GEN); + if (pos == 0xC0) { + pci_read_config_dword(pdev, pos + 0x10, &control); + if ((control & 0x000F0000) != 0x00020000) { + /* set chicken3.24 if gen1 */ + chicken |= 0x01000000; + } + dev_info(&adapter->pdev->dev, "Gen2 strapping detected\n"); + c8c9value = 0xF1000; + } else { + /* set chicken3.24 if gen1 */ + chicken |= 0x01000000; + dev_info(&adapter->pdev->dev, "Gen1 strapping detected\n"); + if (adapter->ahw.revision_id == NX_P3_B0) + c8c9value = 0xF1020; + else + c8c9value = 0; + } + + NXWR32(adapter, NETXEN_PCIE_REG(PCIE_CHICKEN3), chicken); + + if (!c8c9value) + return; + + pdevfuncsave = pdev->devfn; + if (pdevfuncsave & 0x07) + return; + + for (i = 0; i < 8; i++) { + pci_read_config_dword(pdev, pos + 8, &control); + pci_read_config_dword(pdev, pos + 8, &control); + pci_write_config_dword(pdev, pos + 8, c8c9value); + pdev->devfn++; + } + pdev->devfn = pdevfuncsave; +} + +static void netxen_set_msix_bit(struct pci_dev *pdev, int enable) +{ + u32 control; + + if (pdev->msix_cap) { + pci_read_config_dword(pdev, pdev->msix_cap, &control); + if (enable) + control |= PCI_MSIX_FLAGS_ENABLE; + else + control = 0; + pci_write_config_dword(pdev, pdev->msix_cap, control); + } +} + +static void netxen_init_msix_entries(struct netxen_adapter *adapter, int count) +{ + int i; + + for (i = 0; i < count; i++) + adapter->msix_entries[i].entry = i; +} + +static int +netxen_read_mac_addr(struct netxen_adapter *adapter) +{ + int i; + unsigned char *p; + u64 mac_addr; + struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + if (netxen_p3_get_mac_addr(adapter, &mac_addr) != 0) + return -EIO; + } else { + if (netxen_get_flash_mac_addr(adapter, &mac_addr) != 0) + return -EIO; + } + + p = (unsigned char *)&mac_addr; + for (i = 0; i < 6; i++) + netdev->dev_addr[i] = *(p + 5 - i); + + memcpy(adapter->mac_addr, netdev->dev_addr, netdev->addr_len); + + /* set station address */ + + if (!is_valid_ether_addr(netdev->dev_addr)) + dev_warn(&pdev->dev, "Bad MAC address %pM.\n", netdev->dev_addr); + + return 0; +} + +static int netxen_nic_set_mac(struct net_device *netdev, void *p) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + struct sockaddr *addr = p; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + if (netif_running(netdev)) { + netif_device_detach(netdev); + netxen_napi_disable(adapter); + } + + memcpy(adapter->mac_addr, addr->sa_data, netdev->addr_len); + memcpy(netdev->dev_addr, addr->sa_data, 
netdev->addr_len); + adapter->macaddr_set(adapter, addr->sa_data); + + if (netif_running(netdev)) { + netif_device_attach(netdev); + netxen_napi_enable(adapter); + } + return 0; +} + +static void netxen_set_multicast_list(struct net_device *dev) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + + adapter->set_multi(dev); +} + +static netdev_features_t netxen_fix_features(struct net_device *dev, + netdev_features_t features) +{ + if (!(features & NETIF_F_RXCSUM)) { + netdev_info(dev, "disabling LRO as RXCSUM is off\n"); + + features &= ~NETIF_F_LRO; + } + + return features; +} + +static int netxen_set_features(struct net_device *dev, + netdev_features_t features) +{ + struct netxen_adapter *adapter = netdev_priv(dev); + int hw_lro; + + if (!((dev->features ^ features) & NETIF_F_LRO)) + return 0; + + hw_lro = (features & NETIF_F_LRO) ? NETXEN_NIC_LRO_ENABLED + : NETXEN_NIC_LRO_DISABLED; + + if (netxen_config_hw_lro(adapter, hw_lro)) + return -EIO; + + if (!(features & NETIF_F_LRO) && netxen_send_lro_cleanup(adapter)) + return -EIO; + + return 0; +} + +static const struct net_device_ops netxen_netdev_ops = { + .ndo_open = netxen_nic_open, + .ndo_stop = netxen_nic_close, + .ndo_start_xmit = netxen_nic_xmit_frame, + .ndo_get_stats64 = netxen_nic_get_stats, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_rx_mode = netxen_set_multicast_list, + .ndo_set_mac_address = netxen_nic_set_mac, + .ndo_change_mtu = netxen_nic_change_mtu, + .ndo_tx_timeout = netxen_tx_timeout, + .ndo_fix_features = netxen_fix_features, + .ndo_set_features = netxen_set_features, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = netxen_nic_poll_controller, +#endif +}; + +static inline bool netxen_function_zero(struct pci_dev *pdev) +{ + return (PCI_FUNC(pdev->devfn) == 0) ? 
true : false; +} + +static inline void netxen_set_interrupt_mode(struct netxen_adapter *adapter, + u32 mode) +{ + NXWR32(adapter, NETXEN_INTR_MODE_REG, mode); +} + +static inline u32 netxen_get_interrupt_mode(struct netxen_adapter *adapter) +{ + return NXRD32(adapter, NETXEN_INTR_MODE_REG); +} + +static void +netxen_initialize_interrupt_registers(struct netxen_adapter *adapter) +{ + struct netxen_legacy_intr_set *legacy_intrp; + u32 tgt_status_reg, int_state_reg; + + if (adapter->ahw.revision_id >= NX_P3_B0) + legacy_intrp = &legacy_intr[adapter->ahw.pci_func]; + else + legacy_intrp = &legacy_intr[0]; + + tgt_status_reg = legacy_intrp->tgt_status_reg; + int_state_reg = ISR_INT_STATE_REG; + + adapter->int_vec_bit = legacy_intrp->int_vec_bit; + adapter->tgt_status_reg = netxen_get_ioaddr(adapter, tgt_status_reg); + adapter->tgt_mask_reg = netxen_get_ioaddr(adapter, + legacy_intrp->tgt_mask_reg); + adapter->pci_int_reg = netxen_get_ioaddr(adapter, + legacy_intrp->pci_int_reg); + adapter->isr_int_vec = netxen_get_ioaddr(adapter, ISR_INT_VECTOR); + + if (adapter->ahw.revision_id >= NX_P3_B1) + adapter->crb_int_state_reg = netxen_get_ioaddr(adapter, + int_state_reg); + else + adapter->crb_int_state_reg = netxen_get_ioaddr(adapter, + CRB_INT_VECTOR); +} + +static int netxen_setup_msi_interrupts(struct netxen_adapter *adapter, + int num_msix) +{ + struct pci_dev *pdev = adapter->pdev; + u32 value; + int err; + + if (adapter->msix_supported) { + netxen_init_msix_entries(adapter, num_msix); + err = pci_enable_msix_range(pdev, adapter->msix_entries, + num_msix, num_msix); + if (err > 0) { + adapter->flags |= NETXEN_NIC_MSIX_ENABLED; + netxen_set_msix_bit(pdev, 1); + + if (adapter->rss_supported) + adapter->max_sds_rings = num_msix; + + dev_info(&pdev->dev, "using msi-x interrupts\n"); + return 0; + } + /* fall through for msi */ + } + + if (use_msi && !pci_enable_msi(pdev)) { + value = msi_tgt_status[adapter->ahw.pci_func]; + adapter->flags |= NETXEN_NIC_MSI_ENABLED; + adapter->tgt_status_reg = netxen_get_ioaddr(adapter, value); + adapter->msix_entries[0].vector = pdev->irq; + dev_info(&pdev->dev, "using msi interrupts\n"); + return 0; + } + + dev_err(&pdev->dev, "Failed to acquire MSI-X/MSI interrupt vector\n"); + return -EIO; +} + +static int netxen_setup_intr(struct netxen_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + int num_msix; + + if (adapter->rss_supported) + num_msix = (num_online_cpus() >= MSIX_ENTRIES_PER_ADAPTER) ? 
+ MSIX_ENTRIES_PER_ADAPTER : 2; + else + num_msix = 1; + + adapter->max_sds_rings = 1; + adapter->flags &= ~(NETXEN_NIC_MSI_ENABLED | NETXEN_NIC_MSIX_ENABLED); + + netxen_initialize_interrupt_registers(adapter); + netxen_set_msix_bit(pdev, 0); + + if (netxen_function_zero(pdev)) { + if (!netxen_setup_msi_interrupts(adapter, num_msix)) + netxen_set_interrupt_mode(adapter, NETXEN_MSI_MODE); + else + netxen_set_interrupt_mode(adapter, NETXEN_INTX_MODE); + } else { + if (netxen_get_interrupt_mode(adapter) == NETXEN_MSI_MODE && + netxen_setup_msi_interrupts(adapter, num_msix)) { + dev_err(&pdev->dev, "Co-existence of MSI-X/MSI and INTx interrupts is not supported\n"); + return -EIO; + } + } + + if (!NETXEN_IS_MSI_FAMILY(adapter)) { + adapter->msix_entries[0].vector = pdev->irq; + dev_info(&pdev->dev, "using legacy interrupts\n"); + } + return 0; +} + +static void +netxen_teardown_intr(struct netxen_adapter *adapter) +{ + if (adapter->flags & NETXEN_NIC_MSIX_ENABLED) + pci_disable_msix(adapter->pdev); + if (adapter->flags & NETXEN_NIC_MSI_ENABLED) + pci_disable_msi(adapter->pdev); +} + +static void +netxen_cleanup_pci_map(struct netxen_adapter *adapter) +{ + if (adapter->ahw.db_base != NULL) + iounmap(adapter->ahw.db_base); + if (adapter->ahw.pci_base0 != NULL) + iounmap(adapter->ahw.pci_base0); + if (adapter->ahw.pci_base1 != NULL) + iounmap(adapter->ahw.pci_base1); + if (adapter->ahw.pci_base2 != NULL) + iounmap(adapter->ahw.pci_base2); +} + +static int +netxen_setup_pci_map(struct netxen_adapter *adapter) +{ + void __iomem *db_ptr = NULL; + + resource_size_t mem_base, db_base; + unsigned long mem_len, db_len = 0; + + struct pci_dev *pdev = adapter->pdev; + int pci_func = adapter->ahw.pci_func; + struct netxen_hardware_context *ahw = &adapter->ahw; + + int err = 0; + + /* + * Set the CRB window to invalid. If any register in window 0 is + * accessed it should set the window to 0 and then reset it to 1. 
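+	 *
+	 * Accordingly, the cached crb_win/ocm_win selections are set to -1
+	 * just below, marking them invalid so that the first CRB/OCM access
+	 * reprograms the window instead of reusing a stale cached value.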
+ */ + adapter->ahw.crb_win = -1; + adapter->ahw.ocm_win = -1; + + /* remap phys address */ + mem_base = pci_resource_start(pdev, 0); /* 0 is for BAR 0 */ + mem_len = pci_resource_len(pdev, 0); + + /* 128 Meg of memory */ + if (mem_len == NETXEN_PCI_128MB_SIZE) { + + ahw->pci_base0 = ioremap(mem_base, FIRST_PAGE_GROUP_SIZE); + ahw->pci_base1 = ioremap(mem_base + SECOND_PAGE_GROUP_START, + SECOND_PAGE_GROUP_SIZE); + ahw->pci_base2 = ioremap(mem_base + THIRD_PAGE_GROUP_START, + THIRD_PAGE_GROUP_SIZE); + if (ahw->pci_base0 == NULL || ahw->pci_base1 == NULL || + ahw->pci_base2 == NULL) { + dev_err(&pdev->dev, "failed to map PCI bar 0\n"); + err = -EIO; + goto err_out; + } + + ahw->pci_len0 = FIRST_PAGE_GROUP_SIZE; + + } else if (mem_len == NETXEN_PCI_32MB_SIZE) { + + ahw->pci_base1 = ioremap(mem_base, SECOND_PAGE_GROUP_SIZE); + ahw->pci_base2 = ioremap(mem_base + THIRD_PAGE_GROUP_START - + SECOND_PAGE_GROUP_START, THIRD_PAGE_GROUP_SIZE); + if (ahw->pci_base1 == NULL || ahw->pci_base2 == NULL) { + dev_err(&pdev->dev, "failed to map PCI bar 0\n"); + err = -EIO; + goto err_out; + } + + } else if (mem_len == NETXEN_PCI_2MB_SIZE) { + + ahw->pci_base0 = pci_ioremap_bar(pdev, 0); + if (ahw->pci_base0 == NULL) { + dev_err(&pdev->dev, "failed to map PCI bar 0\n"); + return -EIO; + } + ahw->pci_len0 = mem_len; + } else { + return -EIO; + } + + netxen_setup_hwops(adapter); + + dev_info(&pdev->dev, "%dMB memory map\n", (int)(mem_len>>20)); + + if (NX_IS_REVISION_P3P(adapter->ahw.revision_id)) { + adapter->ahw.ocm_win_crb = netxen_get_ioaddr(adapter, + NETXEN_PCIX_PS_REG(PCIX_OCM_WINDOW_REG(pci_func))); + + } else if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + adapter->ahw.ocm_win_crb = netxen_get_ioaddr(adapter, + NETXEN_PCIX_PS_REG(PCIE_MN_WINDOW_REG(pci_func))); + } + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + goto skip_doorbell; + + db_base = pci_resource_start(pdev, 4); /* doorbell is on bar 4 */ + db_len = pci_resource_len(pdev, 4); + + if (db_len == 0) { + printk(KERN_ERR "%s: doorbell is disabled\n", + netxen_nic_driver_name); + err = -EIO; + goto err_out; + } + + db_ptr = ioremap(db_base, NETXEN_DB_MAPSIZE_BYTES); + if (!db_ptr) { + printk(KERN_ERR "%s: Failed to allocate doorbell map.", + netxen_nic_driver_name); + err = -EIO; + goto err_out; + } + +skip_doorbell: + adapter->ahw.db_base = db_ptr; + adapter->ahw.db_len = db_len; + return 0; + +err_out: + netxen_cleanup_pci_map(adapter); + return err; +} + +static void +netxen_check_options(struct netxen_adapter *adapter) +{ + u32 fw_major, fw_minor, fw_build, prev_fw_version; + char brd_name[NETXEN_MAX_SHORT_NAME]; + char serial_num[32]; + int i, offset, val, err; + __le32 *ptr32; + struct pci_dev *pdev = adapter->pdev; + + adapter->driver_mismatch = 0; + + ptr32 = (__le32 *)&serial_num; + offset = NX_FW_SERIAL_NUM_OFFSET; + for (i = 0; i < 8; i++) { + err = netxen_rom_fast_read(adapter, offset, &val); + if (err) { + dev_err(&pdev->dev, "error reading board info\n"); + adapter->driver_mismatch = 1; + return; + } + ptr32[i] = cpu_to_le32(val); + offset += sizeof(u32); + } + + fw_major = NXRD32(adapter, NETXEN_FW_VERSION_MAJOR); + fw_minor = NXRD32(adapter, NETXEN_FW_VERSION_MINOR); + fw_build = NXRD32(adapter, NETXEN_FW_VERSION_SUB); + prev_fw_version = adapter->fw_version; + adapter->fw_version = NETXEN_VERSION_CODE(fw_major, fw_minor, fw_build); + + /* Get FW Mini Coredump template and store it */ + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + if (adapter->mdump.md_template == NULL || + adapter->fw_version > 
prev_fw_version) { + kfree(adapter->mdump.md_template); + adapter->mdump.md_template = NULL; + err = netxen_setup_minidump(adapter); + if (err) + dev_err(&adapter->pdev->dev, + "Failed to setup minidump rcode = %d\n", err); + } + } + + if (adapter->portnum == 0) { + if (netxen_nic_get_brd_name_by_type(adapter->ahw.board_type, + brd_name)) + strcpy(serial_num, "Unknown"); + + pr_info("%s: %s Board S/N %s Chip rev 0x%x\n", + module_name(THIS_MODULE), + brd_name, serial_num, adapter->ahw.revision_id); + } + + if (adapter->fw_version < NETXEN_VERSION_CODE(3, 4, 216)) { + adapter->driver_mismatch = 1; + dev_warn(&pdev->dev, "firmware version %d.%d.%d unsupported\n", + fw_major, fw_minor, fw_build); + return; + } + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + i = NXRD32(adapter, NETXEN_SRE_MISC); + adapter->ahw.cut_through = (i & 0x8000) ? 1 : 0; + } + + dev_info(&pdev->dev, "Driver v%s, firmware v%d.%d.%d [%s]\n", + NETXEN_NIC_LINUX_VERSIONID, fw_major, fw_minor, fw_build, + adapter->ahw.cut_through ? "cut-through" : "legacy"); + + if (adapter->fw_version >= NETXEN_VERSION_CODE(4, 0, 222)) + adapter->capabilities = NXRD32(adapter, CRB_FW_CAPABILITIES_1); + + if (adapter->ahw.port_type == NETXEN_NIC_XGBE) { + adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_10G; + adapter->num_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_10G; + } else if (adapter->ahw.port_type == NETXEN_NIC_GBE) { + adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_1G; + adapter->num_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_1G; + } + + adapter->msix_supported = 0; + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + adapter->msix_supported = !!use_msi_x; + adapter->rss_supported = !!use_msi_x; + } else { + u32 flashed_ver = 0; + netxen_rom_fast_read(adapter, + NX_FW_VERSION_OFFSET, (int *)&flashed_ver); + flashed_ver = NETXEN_DECODE_VERSION(flashed_ver); + + if (flashed_ver >= NETXEN_VERSION_CODE(3, 4, 336)) { + switch (adapter->ahw.board_type) { + case NETXEN_BRDTYPE_P2_SB31_10G: + case NETXEN_BRDTYPE_P2_SB31_10G_CX4: + adapter->msix_supported = !!use_msi_x; + adapter->rss_supported = !!use_msi_x; + break; + default: + break; + } + } + } + + adapter->num_txd = MAX_CMD_DESCRIPTORS; + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + adapter->num_lro_rxd = MAX_LRO_RCV_DESCRIPTORS; + adapter->max_rds_rings = 3; + } else { + adapter->num_lro_rxd = 0; + adapter->max_rds_rings = 2; + } +} + +static int +netxen_start_firmware(struct netxen_adapter *adapter) +{ + int val, err, first_boot; + struct pci_dev *pdev = adapter->pdev; + + /* required for NX2031 dummy dma */ + err = nx_set_dma_mask(adapter); + if (err) + return err; + + err = netxen_can_start_firmware(adapter); + + if (err < 0) + return err; + + if (!err) + goto wait_init; + + first_boot = NXRD32(adapter, NETXEN_CAM_RAM(0x1fc)); + + err = netxen_check_hw_init(adapter, first_boot); + if (err) { + dev_err(&pdev->dev, "error in init HW init sequence\n"); + return err; + } + + netxen_request_firmware(adapter); + + err = netxen_need_fw_reset(adapter); + if (err < 0) + goto err_out; + if (err == 0) + goto pcie_strap_init; + + if (first_boot != 0x55555555) { + NXWR32(adapter, CRB_CMDPEG_STATE, 0); + netxen_pinit_from_rom(adapter); + msleep(1); + } + + NXWR32(adapter, CRB_DMA_SHIFT, 0x55555555); + NXWR32(adapter, NETXEN_PEG_HALT_STATUS1, 0); + NXWR32(adapter, NETXEN_PEG_HALT_STATUS2, 0); + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + netxen_set_port_mode(adapter); + + err = netxen_load_firmware(adapter); + if (err) + goto err_out; + + netxen_release_firmware(adapter); + + if 
(NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + + /* Initialize multicast addr pool owners */ + val = 0x7654; + if (adapter->ahw.port_type == NETXEN_NIC_XGBE) + val |= 0x0f000000; + NXWR32(adapter, NETXEN_MAC_ADDR_CNTL_REG, val); + + } + + err = netxen_init_dummy_dma(adapter); + if (err) + goto err_out; + + /* + * Tell the hardware our version number. + */ + val = (_NETXEN_NIC_LINUX_MAJOR << 16) + | ((_NETXEN_NIC_LINUX_MINOR << 8)) + | (_NETXEN_NIC_LINUX_SUBVERSION); + NXWR32(adapter, CRB_DRIVER_VERSION, val); + +pcie_strap_init: + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + netxen_pcie_strap_init(adapter); + +wait_init: + /* Handshake with the card before we register the devices. */ + err = netxen_phantom_init(adapter, NETXEN_NIC_PEG_TUNE); + if (err) { + netxen_free_dummy_dma(adapter); + goto err_out; + } + + NXWR32(adapter, NX_CRB_DEV_STATE, NX_DEV_READY); + + nx_update_dma_mask(adapter); + + netxen_check_options(adapter); + + adapter->need_fw_reset = 0; + + /* fall through and release firmware */ + +err_out: + netxen_release_firmware(adapter); + return err; +} + +static int +netxen_nic_request_irq(struct netxen_adapter *adapter) +{ + irq_handler_t handler; + struct nx_host_sds_ring *sds_ring; + int err, ring; + + unsigned long flags = 0; + struct net_device *netdev = adapter->netdev; + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + + if (adapter->flags & NETXEN_NIC_MSIX_ENABLED) + handler = netxen_msix_intr; + else if (adapter->flags & NETXEN_NIC_MSI_ENABLED) + handler = netxen_msi_intr; + else { + flags |= IRQF_SHARED; + handler = netxen_intr; + } + adapter->irq = netdev->irq; + + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + sprintf(sds_ring->name, "%s[%d]", netdev->name, ring); + err = request_irq(sds_ring->irq, handler, + flags, sds_ring->name, sds_ring); + if (err) + return err; + } + + return 0; +} + +static void +netxen_nic_free_irq(struct netxen_adapter *adapter) +{ + int ring; + struct nx_host_sds_ring *sds_ring; + + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + free_irq(sds_ring->irq, sds_ring); + } +} + +static void +netxen_nic_init_coalesce_defaults(struct netxen_adapter *adapter) +{ + adapter->coal.flags = NETXEN_NIC_INTR_DEFAULT; + adapter->coal.normal.data.rx_time_us = + NETXEN_DEFAULT_INTR_COALESCE_RX_TIME_US; + adapter->coal.normal.data.rx_packets = + NETXEN_DEFAULT_INTR_COALESCE_RX_PACKETS; + adapter->coal.normal.data.tx_time_us = + NETXEN_DEFAULT_INTR_COALESCE_TX_TIME_US; + adapter->coal.normal.data.tx_packets = + NETXEN_DEFAULT_INTR_COALESCE_TX_PACKETS; +} + +/* with rtnl_lock */ +static int +__netxen_nic_up(struct netxen_adapter *adapter, struct net_device *netdev) +{ + int err; + + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + return -EIO; + + err = adapter->init_port(adapter, adapter->physical_port); + if (err) { + printk(KERN_ERR "%s: Failed to initialize port %d\n", + netxen_nic_driver_name, adapter->portnum); + return err; + } + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + adapter->macaddr_set(adapter, adapter->mac_addr); + + adapter->set_multi(netdev); + adapter->set_mtu(adapter, netdev->mtu); + + adapter->ahw.linkup = 0; + + if (adapter->max_sds_rings > 1) + netxen_config_rss(adapter, 1); + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + netxen_config_intr_coalesce(adapter); + + if (netdev->features & NETIF_F_LRO) + netxen_config_hw_lro(adapter, 
NETXEN_NIC_LRO_ENABLED); + + netxen_napi_enable(adapter); + + if (adapter->capabilities & NX_FW_CAPABILITY_LINK_NOTIFICATION) + netxen_linkevent_request(adapter, 1); + else + netxen_nic_set_link_parameters(adapter); + + set_bit(__NX_DEV_UP, &adapter->state); + return 0; +} + +/* Usage: During resume and firmware recovery module.*/ + +static inline int +netxen_nic_up(struct netxen_adapter *adapter, struct net_device *netdev) +{ + int err = 0; + + rtnl_lock(); + if (netif_running(netdev)) + err = __netxen_nic_up(adapter, netdev); + rtnl_unlock(); + + return err; +} + +/* with rtnl_lock */ +static void +__netxen_nic_down(struct netxen_adapter *adapter, struct net_device *netdev) +{ + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + return; + + if (!test_and_clear_bit(__NX_DEV_UP, &adapter->state)) + return; + + smp_mb(); + netif_carrier_off(netdev); + netif_tx_disable(netdev); + + if (adapter->capabilities & NX_FW_CAPABILITY_LINK_NOTIFICATION) + netxen_linkevent_request(adapter, 0); + + if (adapter->stop_port) + adapter->stop_port(adapter); + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + netxen_p3_free_mac_list(adapter); + + adapter->set_promisc(adapter, NETXEN_NIU_NON_PROMISC_MODE); + + netxen_napi_disable(adapter); + + netxen_release_tx_buffers(adapter); +} + +/* Usage: During suspend and firmware recovery module */ + +static inline void +netxen_nic_down(struct netxen_adapter *adapter, struct net_device *netdev) +{ + rtnl_lock(); + if (netif_running(netdev)) + __netxen_nic_down(adapter, netdev); + rtnl_unlock(); + +} + +static int +netxen_nic_attach(struct netxen_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; + int err, ring; + struct nx_host_rds_ring *rds_ring; + struct nx_host_tx_ring *tx_ring; + u32 capab2; + + if (adapter->is_up == NETXEN_ADAPTER_UP_MAGIC) + return 0; + + err = netxen_init_firmware(adapter); + if (err) + return err; + + adapter->flags &= ~NETXEN_FW_MSS_CAP; + if (adapter->capabilities & NX_FW_CAPABILITY_MORE_CAPS) { + capab2 = NXRD32(adapter, CRB_FW_CAPABILITIES_2); + if (capab2 & NX_FW_CAPABILITY_2_LRO_MAX_TCP_SEG) + adapter->flags |= NETXEN_FW_MSS_CAP; + } + + err = netxen_napi_add(adapter, netdev); + if (err) + return err; + + err = netxen_alloc_sw_resources(adapter); + if (err) { + printk(KERN_ERR "%s: Error in setting sw resources\n", + netdev->name); + return err; + } + + err = netxen_alloc_hw_resources(adapter); + if (err) { + printk(KERN_ERR "%s: Error in setting hw resources\n", + netdev->name); + goto err_out_free_sw; + } + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + tx_ring = adapter->tx_ring; + tx_ring->crb_cmd_producer = netxen_get_ioaddr(adapter, + crb_cmd_producer[adapter->portnum]); + tx_ring->crb_cmd_consumer = netxen_get_ioaddr(adapter, + crb_cmd_consumer[adapter->portnum]); + + tx_ring->producer = 0; + tx_ring->sw_consumer = 0; + + netxen_nic_update_cmd_producer(adapter, tx_ring); + netxen_nic_update_cmd_consumer(adapter, tx_ring); + } + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &adapter->recv_ctx.rds_rings[ring]; + netxen_post_rx_buffers(adapter, ring, rds_ring); + } + + err = netxen_nic_request_irq(adapter); + if (err) { + dev_err(&pdev->dev, "%s: failed to setup interrupt\n", + netdev->name); + goto err_out_free_rxbuf; + } + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + netxen_nic_init_coalesce_defaults(adapter); + + netxen_create_sysfs_entries(adapter); + + adapter->is_up = NETXEN_ADAPTER_UP_MAGIC; + return 0; + +err_out_free_rxbuf: + 
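+	/*
+	 * Error unwind for netxen_nic_attach(): give back any RX buffers
+	 * already posted, then free the HW and SW ring resources in the
+	 * reverse order of their allocation above.
+	 */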
netxen_release_rx_buffers(adapter); + netxen_free_hw_resources(adapter); +err_out_free_sw: + netxen_free_sw_resources(adapter); + return err; +} + +static void +netxen_nic_detach(struct netxen_adapter *adapter) +{ + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + return; + + netxen_remove_sysfs_entries(adapter); + + netxen_free_hw_resources(adapter); + netxen_release_rx_buffers(adapter); + netxen_nic_free_irq(adapter); + netxen_napi_del(adapter); + netxen_free_sw_resources(adapter); + + adapter->is_up = 0; +} + +int +netxen_nic_reset_context(struct netxen_adapter *adapter) +{ + int err = 0; + struct net_device *netdev = adapter->netdev; + + if (test_and_set_bit(__NX_RESETTING, &adapter->state)) + return -EBUSY; + + if (adapter->is_up == NETXEN_ADAPTER_UP_MAGIC) { + + netif_device_detach(netdev); + + if (netif_running(netdev)) + __netxen_nic_down(adapter, netdev); + + netxen_nic_detach(adapter); + + if (netif_running(netdev)) { + err = netxen_nic_attach(adapter); + if (!err) + err = __netxen_nic_up(adapter, netdev); + + if (err) + goto done; + } + + netif_device_attach(netdev); + } + +done: + clear_bit(__NX_RESETTING, &adapter->state); + return err; +} + +static int +netxen_setup_netdev(struct netxen_adapter *adapter, + struct net_device *netdev) +{ + int err = 0; + struct pci_dev *pdev = adapter->pdev; + + adapter->mc_enabled = 0; + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + adapter->max_mc_count = 38; + else + adapter->max_mc_count = 16; + + netdev->netdev_ops = &netxen_netdev_ops; + netdev->watchdog_timeo = 5*HZ; + + netxen_nic_change_mtu(netdev, netdev->mtu); + + netdev->ethtool_ops = &netxen_nic_ethtool_ops; + + netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | + NETIF_F_RXCSUM; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + netdev->hw_features |= NETIF_F_IPV6_CSUM | NETIF_F_TSO6; + + netdev->vlan_features |= netdev->hw_features; + + if (adapter->pci_using_dac) { + netdev->features |= NETIF_F_HIGHDMA; + netdev->vlan_features |= NETIF_F_HIGHDMA; + } + + if (adapter->capabilities & NX_FW_CAPABILITY_FVLANTX) + netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX; + + if (adapter->capabilities & NX_FW_CAPABILITY_HW_LRO) + netdev->hw_features |= NETIF_F_LRO; + + netdev->features |= netdev->hw_features; + + netdev->irq = adapter->msix_entries[0].vector; + + INIT_WORK(&adapter->tx_timeout_task, netxen_tx_timeout_task); + + if (netxen_read_mac_addr(adapter)) + dev_warn(&pdev->dev, "failed to read mac addr\n"); + + netif_carrier_off(netdev); + + err = register_netdev(netdev); + if (err) { + dev_err(&pdev->dev, "failed to register net device\n"); + return err; + } + + return 0; +} + +#define NETXEN_ULA_ADAPTER_KEY (0xdaddad01) +#define NETXEN_NON_ULA_ADAPTER_KEY (0xdaddad00) + +static void netxen_read_ula_info(struct netxen_adapter *adapter) +{ + u32 temp; + + /* Print ULA info only once for an adapter */ + if (adapter->portnum != 0) + return; + + temp = NXRD32(adapter, NETXEN_ULA_KEY); + switch (temp) { + case NETXEN_ULA_ADAPTER_KEY: + dev_info(&adapter->pdev->dev, "ULA adapter"); + break; + case NETXEN_NON_ULA_ADAPTER_KEY: + dev_info(&adapter->pdev->dev, "non ULA adapter"); + break; + default: + break; + } + + return; +} + +#ifdef CONFIG_PCIEAER +static void netxen_mask_aer_correctable(struct netxen_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + struct pci_dev *root = pdev->bus->self; + u32 aer_pos; + + /* root bus? 
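+	 * A device that sits directly on the root bus has no upstream
+	 * port whose AER correctable-error mask could be adjusted, so
+	 * there is nothing to do.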
*/ + if (!root) + return; + + if (adapter->ahw.board_type != NETXEN_BRDTYPE_P3_4_GB_MM && + adapter->ahw.board_type != NETXEN_BRDTYPE_P3_10G_TP) + return; + + if (pci_pcie_type(root) != PCI_EXP_TYPE_ROOT_PORT) + return; + + aer_pos = pci_find_ext_capability(root, PCI_EXT_CAP_ID_ERR); + if (!aer_pos) + return; + + pci_write_config_dword(root, aer_pos + PCI_ERR_COR_MASK, 0xffff); +} +#endif + +static int +netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct net_device *netdev = NULL; + struct netxen_adapter *adapter = NULL; + int i = 0, err; + int pci_func_id = PCI_FUNC(pdev->devfn); + uint8_t revision_id; + u32 val; + + if (pdev->revision >= NX_P3_A0 && pdev->revision <= NX_P3_B1) { + pr_warn("%s: chip revisions between 0x%x-0x%x will not be enabled\n", + module_name(THIS_MODULE), NX_P3_A0, NX_P3_B1); + return -ENODEV; + } + + if ((err = pci_enable_device(pdev))) + return err; + + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + err = -ENODEV; + goto err_out_disable_pdev; + } + + if ((err = pci_request_regions(pdev, netxen_nic_driver_name))) + goto err_out_disable_pdev; + + if (NX_IS_REVISION_P3(pdev->revision)) + pci_enable_pcie_error_reporting(pdev); + + pci_set_master(pdev); + + netdev = alloc_etherdev(sizeof(struct netxen_adapter)); + if(!netdev) { + err = -ENOMEM; + goto err_out_free_res; + } + + SET_NETDEV_DEV(netdev, &pdev->dev); + + adapter = netdev_priv(netdev); + adapter->netdev = netdev; + adapter->pdev = pdev; + adapter->ahw.pci_func = pci_func_id; + + revision_id = pdev->revision; + adapter->ahw.revision_id = revision_id; + + rwlock_init(&adapter->ahw.crb_lock); + spin_lock_init(&adapter->ahw.mem_lock); + + spin_lock_init(&adapter->tx_clean_lock); + INIT_LIST_HEAD(&adapter->mac_list); + INIT_LIST_HEAD(&adapter->ip_list); + + err = netxen_setup_pci_map(adapter); + if (err) + goto err_out_free_netdev; + + /* This will be reset for mezz cards */ + adapter->portnum = pci_func_id; + + err = netxen_nic_get_board_info(adapter); + if (err) { + dev_err(&pdev->dev, "Error getting board config info.\n"); + goto err_out_iounmap; + } + +#ifdef CONFIG_PCIEAER + netxen_mask_aer_correctable(adapter); +#endif + + /* Mezz cards have PCI function 0,2,3 enabled */ + switch (adapter->ahw.board_type) { + case NETXEN_BRDTYPE_P2_SB31_10G_IMEZ: + case NETXEN_BRDTYPE_P2_SB31_10G_HMEZ: + if (pci_func_id >= 2) + adapter->portnum = pci_func_id - 2; + break; + default: + break; + } + + err = netxen_check_flash_fw_compatibility(adapter); + if (err) + goto err_out_iounmap; + + if (adapter->portnum == 0) { + val = NXRD32(adapter, NX_CRB_DEV_REF_COUNT); + if (val != 0xffffffff && val != 0) { + NXWR32(adapter, NX_CRB_DEV_REF_COUNT, 0); + adapter->need_fw_reset = 1; + } + } + + err = netxen_start_firmware(adapter); + if (err) + goto err_out_decr_ref; + + /* + * See if the firmware gave us a virtual-physical port mapping. 
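+	 * On P2 parts CRB_V2P() reports the physical port assigned to this
+	 * function; a value of 0x55555555 means "no mapping", in which case
+	 * the default of adapter->portnum is kept.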
+ */ + adapter->physical_port = adapter->portnum; + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + i = NXRD32(adapter, CRB_V2P(adapter->portnum)); + if (i != 0x55555555) + adapter->physical_port = i; + } + + /* MTU range: 0 - 8000 (P2) or 9600 (P3) */ + netdev->min_mtu = 0; + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + netdev->max_mtu = P3_MAX_MTU; + else + netdev->max_mtu = P2_MAX_MTU; + + netxen_nic_clear_stats(adapter); + + err = netxen_setup_intr(adapter); + + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to setup interrupts, error = %d\n", err); + goto err_out_disable_msi; + } + + netxen_read_ula_info(adapter); + + err = netxen_setup_netdev(adapter, netdev); + if (err) + goto err_out_disable_msi; + + pci_set_drvdata(pdev, adapter); + + netxen_schedule_work(adapter, netxen_fw_poll_work, FW_POLL_DELAY); + + switch (adapter->ahw.port_type) { + case NETXEN_NIC_GBE: + dev_info(&adapter->pdev->dev, "%s: GbE port initialized\n", + adapter->netdev->name); + break; + case NETXEN_NIC_XGBE: + dev_info(&adapter->pdev->dev, "%s: XGbE port initialized\n", + adapter->netdev->name); + break; + } + + netxen_create_diag_entries(adapter); + + return 0; + +err_out_disable_msi: + netxen_teardown_intr(adapter); + + netxen_free_dummy_dma(adapter); + +err_out_decr_ref: + nx_decr_dev_ref_cnt(adapter); + +err_out_iounmap: + netxen_cleanup_pci_map(adapter); + +err_out_free_netdev: + free_netdev(netdev); + +err_out_free_res: + pci_release_regions(pdev); + +err_out_disable_pdev: + pci_disable_device(pdev); + return err; +} + +static +void netxen_cleanup_minidump(struct netxen_adapter *adapter) +{ + kfree(adapter->mdump.md_template); + adapter->mdump.md_template = NULL; + + if (adapter->mdump.md_capture_buff) { + vfree(adapter->mdump.md_capture_buff); + adapter->mdump.md_capture_buff = NULL; + } +} + +static void netxen_nic_remove(struct pci_dev *pdev) +{ + struct netxen_adapter *adapter; + struct net_device *netdev; + + adapter = pci_get_drvdata(pdev); + if (adapter == NULL) + return; + + netdev = adapter->netdev; + + netxen_cancel_fw_work(adapter); + + unregister_netdev(netdev); + + cancel_work_sync(&adapter->tx_timeout_task); + + netxen_free_ip_list(adapter, false); + netxen_nic_detach(adapter); + + nx_decr_dev_ref_cnt(adapter); + + if (adapter->portnum == 0) + netxen_free_dummy_dma(adapter); + + clear_bit(__NX_RESETTING, &adapter->state); + + netxen_teardown_intr(adapter); + netxen_set_interrupt_mode(adapter, 0); + netxen_remove_diag_entries(adapter); + + netxen_cleanup_pci_map(adapter); + + netxen_release_firmware(adapter); + + if (NX_IS_REVISION_P3(pdev->revision)) { + netxen_cleanup_minidump(adapter); + pci_disable_pcie_error_reporting(pdev); + } + + pci_release_regions(pdev); + pci_disable_device(pdev); + + free_netdev(netdev); +} + +static void netxen_nic_detach_func(struct netxen_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + + netif_device_detach(netdev); + + netxen_cancel_fw_work(adapter); + + if (netif_running(netdev)) + netxen_nic_down(adapter, netdev); + + cancel_work_sync(&adapter->tx_timeout_task); + + netxen_nic_detach(adapter); + + if (adapter->portnum == 0) + netxen_free_dummy_dma(adapter); + + nx_decr_dev_ref_cnt(adapter); + + clear_bit(__NX_RESETTING, &adapter->state); +} + +static int netxen_nic_attach_func(struct pci_dev *pdev) +{ + struct netxen_adapter *adapter = pci_get_drvdata(pdev); + struct net_device *netdev = adapter->netdev; + int err; + + err = pci_enable_device(pdev); + if (err) + return err; + + pci_set_power_state(pdev, PCI_D0); + 
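+	/*
+	 * Common re-attach path shared by resume and the AER slot reset
+	 * handler: restore PCI config space, invalidate the cached CRB/OCM
+	 * windows, restart the firmware, and bring the data path back up
+	 * if the interface was running.
+	 */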
pci_set_master(pdev); + pci_restore_state(pdev); + + adapter->ahw.crb_win = -1; + adapter->ahw.ocm_win = -1; + + err = netxen_start_firmware(adapter); + if (err) { + dev_err(&pdev->dev, "failed to start firmware\n"); + return err; + } + + if (netif_running(netdev)) { + err = netxen_nic_attach(adapter); + if (err) + goto err_out; + + err = netxen_nic_up(adapter, netdev); + if (err) + goto err_out_detach; + + netxen_restore_indev_addr(netdev, NETDEV_UP); + } + + netif_device_attach(netdev); + netxen_schedule_work(adapter, netxen_fw_poll_work, FW_POLL_DELAY); + return 0; + +err_out_detach: + netxen_nic_detach(adapter); +err_out: + nx_decr_dev_ref_cnt(adapter); + return err; +} + +static pci_ers_result_t netxen_io_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct netxen_adapter *adapter = pci_get_drvdata(pdev); + + if (state == pci_channel_io_perm_failure) + return PCI_ERS_RESULT_DISCONNECT; + + if (nx_dev_request_aer(adapter)) + return PCI_ERS_RESULT_RECOVERED; + + netxen_nic_detach_func(adapter); + + pci_disable_device(pdev); + + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t netxen_io_slot_reset(struct pci_dev *pdev) +{ + int err = 0; + + err = netxen_nic_attach_func(pdev); + + return err ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; +} + +static void netxen_io_resume(struct pci_dev *pdev) +{ + pci_cleanup_aer_uncorrect_error_status(pdev); +} + +static void netxen_nic_shutdown(struct pci_dev *pdev) +{ + struct netxen_adapter *adapter = pci_get_drvdata(pdev); + + netxen_nic_detach_func(adapter); + + if (pci_save_state(pdev)) + return; + + if (netxen_nic_wol_supported(adapter)) { + pci_enable_wake(pdev, PCI_D3cold, 1); + pci_enable_wake(pdev, PCI_D3hot, 1); + } + + pci_disable_device(pdev); +} + +#ifdef CONFIG_PM +static int +netxen_nic_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct netxen_adapter *adapter = pci_get_drvdata(pdev); + int retval; + + netxen_nic_detach_func(adapter); + + retval = pci_save_state(pdev); + if (retval) + return retval; + + if (netxen_nic_wol_supported(adapter)) { + pci_enable_wake(pdev, PCI_D3cold, 1); + pci_enable_wake(pdev, PCI_D3hot, 1); + } + + pci_disable_device(pdev); + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + + return 0; +} + +static int +netxen_nic_resume(struct pci_dev *pdev) +{ + return netxen_nic_attach_func(pdev); +} +#endif + +static int netxen_nic_open(struct net_device *netdev) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + int err = 0; + + if (adapter->driver_mismatch) + return -EIO; + + err = netxen_nic_attach(adapter); + if (err) + return err; + + err = __netxen_nic_up(adapter, netdev); + if (err) + goto err_out; + + netif_start_queue(netdev); + + return 0; + +err_out: + netxen_nic_detach(adapter); + return err; +} + +/* + * netxen_nic_close - Disables a network interface entry point + */ +static int netxen_nic_close(struct net_device *netdev) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + + __netxen_nic_down(adapter, netdev); + return 0; +} + +static void +netxen_tso_check(struct net_device *netdev, + struct nx_host_tx_ring *tx_ring, + struct cmd_desc_type0 *first_desc, + struct sk_buff *skb) +{ + u8 opcode = TX_ETHER_PKT; + __be16 protocol = skb->protocol; + u16 flags = 0, vid = 0; + u32 producer; + int copied, offset, copy_len, hdr_len = 0, tso = 0, vlan_oob = 0; + struct cmd_desc_type0 *hwdesc; + struct vlan_ethhdr *vh; + + if (protocol == cpu_to_be16(ETH_P_8021Q)) { + + vh = (struct vlan_ethhdr *)skb->data; + protocol = 
vh->h_vlan_encapsulated_proto; + flags = FLAGS_VLAN_TAGGED; + + } else if (skb_vlan_tag_present(skb)) { + flags = FLAGS_VLAN_OOB; + vid = skb_vlan_tag_get(skb); + netxen_set_tx_vlan_tci(first_desc, vid); + vlan_oob = 1; + } + + if ((netdev->features & (NETIF_F_TSO | NETIF_F_TSO6)) && + skb_shinfo(skb)->gso_size > 0) { + + hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + + first_desc->mss = cpu_to_le16(skb_shinfo(skb)->gso_size); + first_desc->total_hdr_length = hdr_len; + if (vlan_oob) { + first_desc->total_hdr_length += VLAN_HLEN; + first_desc->tcp_hdr_offset = VLAN_HLEN; + first_desc->ip_hdr_offset = VLAN_HLEN; + /* Only in case of TSO on vlan device */ + flags |= FLAGS_VLAN_TAGGED; + } + + opcode = (protocol == cpu_to_be16(ETH_P_IPV6)) ? + TX_TCP_LSO6 : TX_TCP_LSO; + tso = 1; + + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + u8 l4proto; + + if (protocol == cpu_to_be16(ETH_P_IP)) { + l4proto = ip_hdr(skb)->protocol; + + if (l4proto == IPPROTO_TCP) + opcode = TX_TCP_PKT; + else if(l4proto == IPPROTO_UDP) + opcode = TX_UDP_PKT; + } else if (protocol == cpu_to_be16(ETH_P_IPV6)) { + l4proto = ipv6_hdr(skb)->nexthdr; + + if (l4proto == IPPROTO_TCP) + opcode = TX_TCPV6_PKT; + else if(l4proto == IPPROTO_UDP) + opcode = TX_UDPV6_PKT; + } + } + + first_desc->tcp_hdr_offset += skb_transport_offset(skb); + first_desc->ip_hdr_offset += skb_network_offset(skb); + netxen_set_tx_flags_opcode(first_desc, flags, opcode); + + if (!tso) + return; + + /* For LSO, we need to copy the MAC/IP/TCP headers into + * the descriptor ring + */ + producer = tx_ring->producer; + copied = 0; + offset = 2; + + if (vlan_oob) { + /* Create a TSO vlan header template for firmware */ + + hwdesc = &tx_ring->desc_head[producer]; + tx_ring->cmd_buf_arr[producer].skb = NULL; + + copy_len = min((int)sizeof(struct cmd_desc_type0) - offset, + hdr_len + VLAN_HLEN); + + vh = (struct vlan_ethhdr *)((char *)hwdesc + 2); + skb_copy_from_linear_data(skb, vh, 12); + vh->h_vlan_proto = htons(ETH_P_8021Q); + vh->h_vlan_TCI = htons(vid); + skb_copy_from_linear_data_offset(skb, 12, + (char *)vh + 16, copy_len - 16); + + copied = copy_len - VLAN_HLEN; + offset = 0; + + producer = get_next_index(producer, tx_ring->num_desc); + } + + while (copied < hdr_len) { + + copy_len = min((int)sizeof(struct cmd_desc_type0) - offset, + (hdr_len - copied)); + + hwdesc = &tx_ring->desc_head[producer]; + tx_ring->cmd_buf_arr[producer].skb = NULL; + + skb_copy_from_linear_data_offset(skb, copied, + (char *)hwdesc + offset, copy_len); + + copied += copy_len; + offset = 0; + + producer = get_next_index(producer, tx_ring->num_desc); + } + + tx_ring->producer = producer; + barrier(); +} + +static int +netxen_map_tx_skb(struct pci_dev *pdev, + struct sk_buff *skb, struct netxen_cmd_buffer *pbuf) +{ + struct netxen_skb_frag *nf; + struct skb_frag_struct *frag; + int i, nr_frags; + dma_addr_t map; + + nr_frags = skb_shinfo(skb)->nr_frags; + nf = &pbuf->frag_array[0]; + + map = pci_map_single(pdev, skb->data, + skb_headlen(skb), PCI_DMA_TODEVICE); + if (pci_dma_mapping_error(pdev, map)) + goto out_err; + + nf->dma = map; + nf->length = skb_headlen(skb); + + for (i = 0; i < nr_frags; i++) { + frag = &skb_shinfo(skb)->frags[i]; + nf = &pbuf->frag_array[i+1]; + + map = skb_frag_dma_map(&pdev->dev, frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + if (dma_mapping_error(&pdev->dev, map)) + goto unwind; + + nf->dma = map; + nf->length = skb_frag_size(frag); + } + + return 0; + +unwind: + while (--i >= 0) { + nf = &pbuf->frag_array[i+1]; + pci_unmap_page(pdev, 
nf->dma, nf->length, PCI_DMA_TODEVICE); + nf->dma = 0ULL; + } + + nf = &pbuf->frag_array[0]; + pci_unmap_single(pdev, nf->dma, skb_headlen(skb), PCI_DMA_TODEVICE); + nf->dma = 0ULL; + +out_err: + return -ENOMEM; +} + +static inline void +netxen_clear_cmddesc(u64 *desc) +{ + desc[0] = 0ULL; + desc[2] = 0ULL; +} + +static netdev_tx_t +netxen_nic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + struct nx_host_tx_ring *tx_ring = adapter->tx_ring; + struct netxen_cmd_buffer *pbuf; + struct netxen_skb_frag *buffrag; + struct cmd_desc_type0 *hwdesc, *first_desc; + struct pci_dev *pdev; + int i, k; + int delta = 0; + struct skb_frag_struct *frag; + + u32 producer; + int frag_count, no_of_desc; + u32 num_txd = tx_ring->num_desc; + + frag_count = skb_shinfo(skb)->nr_frags + 1; + + /* 14 frags supported for normal packet and + * 32 frags supported for TSO packet + */ + if (!skb_is_gso(skb) && frag_count > NETXEN_MAX_FRAGS_PER_TX) { + + for (i = 0; i < (frag_count - NETXEN_MAX_FRAGS_PER_TX); i++) { + frag = &skb_shinfo(skb)->frags[i]; + delta += skb_frag_size(frag); + } + + if (!__pskb_pull_tail(skb, delta)) + goto drop_packet; + + frag_count = 1 + skb_shinfo(skb)->nr_frags; + } + /* 4 fragments per cmd des */ + no_of_desc = (frag_count + 3) >> 2; + + if (unlikely(netxen_tx_avail(tx_ring) <= TX_STOP_THRESH)) { + netif_stop_queue(netdev); + smp_mb(); + if (netxen_tx_avail(tx_ring) > TX_STOP_THRESH) + netif_start_queue(netdev); + else + return NETDEV_TX_BUSY; + } + + producer = tx_ring->producer; + pbuf = &tx_ring->cmd_buf_arr[producer]; + + pdev = adapter->pdev; + + if (netxen_map_tx_skb(pdev, skb, pbuf)) + goto drop_packet; + + pbuf->skb = skb; + pbuf->frag_count = frag_count; + + first_desc = hwdesc = &tx_ring->desc_head[producer]; + netxen_clear_cmddesc((u64 *)hwdesc); + + netxen_set_tx_frags_len(first_desc, frag_count, skb->len); + netxen_set_tx_port(first_desc, adapter->portnum); + + for (i = 0; i < frag_count; i++) { + + k = i % 4; + + if ((k == 0) && (i > 0)) { + /* move to next desc.*/ + producer = get_next_index(producer, num_txd); + hwdesc = &tx_ring->desc_head[producer]; + netxen_clear_cmddesc((u64 *)hwdesc); + tx_ring->cmd_buf_arr[producer].skb = NULL; + } + + buffrag = &pbuf->frag_array[i]; + + hwdesc->buffer_length[k] = cpu_to_le16(buffrag->length); + switch (k) { + case 0: + hwdesc->addr_buffer1 = cpu_to_le64(buffrag->dma); + break; + case 1: + hwdesc->addr_buffer2 = cpu_to_le64(buffrag->dma); + break; + case 2: + hwdesc->addr_buffer3 = cpu_to_le64(buffrag->dma); + break; + case 3: + hwdesc->addr_buffer4 = cpu_to_le64(buffrag->dma); + break; + } + } + + tx_ring->producer = get_next_index(producer, num_txd); + + netxen_tso_check(netdev, tx_ring, first_desc, skb); + + adapter->stats.txbytes += skb->len; + adapter->stats.xmitcalled++; + + netxen_nic_update_cmd_producer(adapter, tx_ring); + + return NETDEV_TX_OK; + +drop_packet: + adapter->stats.txdropped++; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; +} + +static int netxen_nic_check_temp(struct netxen_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + uint32_t temp, temp_state, temp_val; + int rv = 0; + + temp = NXRD32(adapter, CRB_TEMP_STATE); + + temp_state = nx_get_temp_state(temp); + temp_val = nx_get_temp_val(temp); + + if (temp_state == NX_TEMP_PANIC) { + printk(KERN_ALERT + "%s: Device temperature %d degrees C exceeds" + " maximum allowed. 
Hardware has been shut down.\n", + netdev->name, temp_val); + rv = 1; + } else if (temp_state == NX_TEMP_WARN) { + if (adapter->temp == NX_TEMP_NORMAL) { + printk(KERN_ALERT + "%s: Device temperature %d degrees C " + "exceeds operating range." + " Immediate action needed.\n", + netdev->name, temp_val); + } + } else { + if (adapter->temp == NX_TEMP_WARN) { + printk(KERN_INFO + "%s: Device temperature is now %d degrees C" + " in normal range.\n", netdev->name, + temp_val); + } + } + adapter->temp = temp_state; + return rv; +} + +void netxen_advert_link_change(struct netxen_adapter *adapter, int linkup) +{ + struct net_device *netdev = adapter->netdev; + + if (adapter->ahw.linkup && !linkup) { + printk(KERN_INFO "%s: %s NIC Link is down\n", + netxen_nic_driver_name, netdev->name); + adapter->ahw.linkup = 0; + if (netif_running(netdev)) { + netif_carrier_off(netdev); + netif_stop_queue(netdev); + } + adapter->link_changed = !adapter->has_link_events; + } else if (!adapter->ahw.linkup && linkup) { + printk(KERN_INFO "%s: %s NIC Link is up\n", + netxen_nic_driver_name, netdev->name); + adapter->ahw.linkup = 1; + if (netif_running(netdev)) { + netif_carrier_on(netdev); + netif_wake_queue(netdev); + } + adapter->link_changed = !adapter->has_link_events; + } +} + +static void netxen_nic_handle_phy_intr(struct netxen_adapter *adapter) +{ + u32 val, port, linkup; + + port = adapter->physical_port; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + val = NXRD32(adapter, CRB_XG_STATE_P3); + val = XG_LINK_STATE_P3(adapter->ahw.pci_func, val); + linkup = (val == XG_LINK_UP_P3); + } else { + val = NXRD32(adapter, CRB_XG_STATE); + val = (val >> port*8) & 0xff; + linkup = (val == XG_LINK_UP); + } + + netxen_advert_link_change(adapter, linkup); +} + +static void netxen_tx_timeout(struct net_device *netdev) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + + if (test_bit(__NX_RESETTING, &adapter->state)) + return; + + dev_err(&netdev->dev, "transmit timeout, resetting.\n"); + schedule_work(&adapter->tx_timeout_task); +} + +static void netxen_tx_timeout_task(struct work_struct *work) +{ + struct netxen_adapter *adapter = + container_of(work, struct netxen_adapter, tx_timeout_task); + + if (!netif_running(adapter->netdev)) + return; + + if (test_and_set_bit(__NX_RESETTING, &adapter->state)) + return; + + if (++adapter->tx_timeo_cnt >= NX_MAX_TX_TIMEOUTS) + goto request_reset; + + rtnl_lock(); + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { + /* try to scrub interrupt */ + netxen_napi_disable(adapter); + + netxen_napi_enable(adapter); + + netif_wake_queue(adapter->netdev); + + clear_bit(__NX_RESETTING, &adapter->state); + } else { + clear_bit(__NX_RESETTING, &adapter->state); + if (netxen_nic_reset_context(adapter)) { + rtnl_unlock(); + goto request_reset; + } + } + netif_trans_update(adapter->netdev); + rtnl_unlock(); + return; + +request_reset: + adapter->need_fw_reset = 1; + clear_bit(__NX_RESETTING, &adapter->state); +} + +static void netxen_nic_get_stats(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + + stats->rx_packets = adapter->stats.rx_pkts + adapter->stats.lro_pkts; + stats->tx_packets = adapter->stats.xmitfinished; + stats->rx_bytes = adapter->stats.rxbytes; + stats->tx_bytes = adapter->stats.txbytes; + stats->rx_dropped = adapter->stats.rxdropped; + stats->tx_dropped = adapter->stats.txdropped; +} + +static irqreturn_t netxen_intr(int irq, void *data) +{ + struct nx_host_sds_ring *sds_ring = data; + 
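+	/*
+	 * Legacy INTx handler: the line may be shared (IRQF_SHARED), so
+	 * first verify via the target status vector that this interrupt is
+	 * ours, then acknowledge it and let NAPI do the actual processing.
+	 */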
struct netxen_adapter *adapter = sds_ring->adapter; + u32 status = 0; + + status = readl(adapter->isr_int_vec); + + if (!(status & adapter->int_vec_bit)) + return IRQ_NONE; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + /* check interrupt state machine, to be sure */ + status = readl(adapter->crb_int_state_reg); + if (!ISR_LEGACY_INT_TRIGGERED(status)) + return IRQ_NONE; + + } else { + unsigned long our_int = 0; + + our_int = readl(adapter->crb_int_state_reg); + + /* not our interrupt */ + if (!test_and_clear_bit((7 + adapter->portnum), &our_int)) + return IRQ_NONE; + + /* claim interrupt */ + writel((our_int & 0xffffffff), adapter->crb_int_state_reg); + + /* clear interrupt */ + netxen_nic_disable_int(sds_ring); + } + + writel(0xffffffff, adapter->tgt_status_reg); + /* read twice to ensure write is flushed */ + readl(adapter->isr_int_vec); + readl(adapter->isr_int_vec); + + napi_schedule(&sds_ring->napi); + + return IRQ_HANDLED; +} + +static irqreturn_t netxen_msi_intr(int irq, void *data) +{ + struct nx_host_sds_ring *sds_ring = data; + struct netxen_adapter *adapter = sds_ring->adapter; + + /* clear interrupt */ + writel(0xffffffff, adapter->tgt_status_reg); + + napi_schedule(&sds_ring->napi); + return IRQ_HANDLED; +} + +static irqreturn_t netxen_msix_intr(int irq, void *data) +{ + struct nx_host_sds_ring *sds_ring = data; + + napi_schedule(&sds_ring->napi); + return IRQ_HANDLED; +} + +static int netxen_nic_poll(struct napi_struct *napi, int budget) +{ + struct nx_host_sds_ring *sds_ring = + container_of(napi, struct nx_host_sds_ring, napi); + + struct netxen_adapter *adapter = sds_ring->adapter; + + int tx_complete; + int work_done; + + tx_complete = netxen_process_cmd_ring(adapter); + + work_done = netxen_process_rcv_ring(sds_ring, budget); + + if (!tx_complete) + work_done = budget; + + if (work_done < budget) { + napi_complete_done(&sds_ring->napi, work_done); + if (test_bit(__NX_DEV_UP, &adapter->state)) + netxen_nic_enable_int(sds_ring); + } + + return work_done; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void netxen_nic_poll_controller(struct net_device *netdev) +{ + int ring; + struct nx_host_sds_ring *sds_ring; + struct netxen_adapter *adapter = netdev_priv(netdev); + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + + disable_irq(adapter->irq); + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + netxen_intr(adapter->irq, sds_ring); + } + enable_irq(adapter->irq); +} +#endif + +static int +nx_incr_dev_ref_cnt(struct netxen_adapter *adapter) +{ + int count; + if (netxen_api_lock(adapter)) + return -EIO; + + count = NXRD32(adapter, NX_CRB_DEV_REF_COUNT); + + NXWR32(adapter, NX_CRB_DEV_REF_COUNT, ++count); + + netxen_api_unlock(adapter); + return count; +} + +static int +nx_decr_dev_ref_cnt(struct netxen_adapter *adapter) +{ + int count, state; + if (netxen_api_lock(adapter)) + return -EIO; + + count = NXRD32(adapter, NX_CRB_DEV_REF_COUNT); + WARN_ON(count == 0); + + NXWR32(adapter, NX_CRB_DEV_REF_COUNT, --count); + state = NXRD32(adapter, NX_CRB_DEV_STATE); + + if (count == 0 && state != NX_DEV_FAILED) + NXWR32(adapter, NX_CRB_DEV_STATE, NX_DEV_COLD); + + netxen_api_unlock(adapter); + return count; +} + +static int +nx_dev_request_aer(struct netxen_adapter *adapter) +{ + u32 state; + int ret = -EINVAL; + + if (netxen_api_lock(adapter)) + return ret; + + state = NXRD32(adapter, NX_CRB_DEV_STATE); + + if (state == NX_DEV_NEED_AER) + ret = 0; + else if (state == NX_DEV_READY) { + NXWR32(adapter, 
NX_CRB_DEV_STATE, NX_DEV_NEED_AER); + ret = 0; + } + + netxen_api_unlock(adapter); + return ret; +} + +int +nx_dev_request_reset(struct netxen_adapter *adapter) +{ + u32 state; + int ret = -EINVAL; + + if (netxen_api_lock(adapter)) + return ret; + + state = NXRD32(adapter, NX_CRB_DEV_STATE); + + if (state == NX_DEV_NEED_RESET || state == NX_DEV_FAILED) + ret = 0; + else if (state != NX_DEV_INITALIZING && state != NX_DEV_NEED_AER) { + NXWR32(adapter, NX_CRB_DEV_STATE, NX_DEV_NEED_RESET); + adapter->flags |= NETXEN_FW_RESET_OWNER; + ret = 0; + } + + netxen_api_unlock(adapter); + + return ret; +} + +static int +netxen_can_start_firmware(struct netxen_adapter *adapter) +{ + int count; + int can_start = 0; + + if (netxen_api_lock(adapter)) { + nx_incr_dev_ref_cnt(adapter); + return -1; + } + + count = NXRD32(adapter, NX_CRB_DEV_REF_COUNT); + + if ((count < 0) || (count >= NX_MAX_PCI_FUNC)) + count = 0; + + if (count == 0) { + can_start = 1; + NXWR32(adapter, NX_CRB_DEV_STATE, NX_DEV_INITALIZING); + } + + NXWR32(adapter, NX_CRB_DEV_REF_COUNT, ++count); + + netxen_api_unlock(adapter); + + return can_start; +} + +static void +netxen_schedule_work(struct netxen_adapter *adapter, + work_func_t func, int delay) +{ + INIT_DELAYED_WORK(&adapter->fw_work, func); + schedule_delayed_work(&adapter->fw_work, delay); +} + +static void +netxen_cancel_fw_work(struct netxen_adapter *adapter) +{ + while (test_and_set_bit(__NX_RESETTING, &adapter->state)) + msleep(10); + + cancel_delayed_work_sync(&adapter->fw_work); +} + +static void +netxen_attach_work(struct work_struct *work) +{ + struct netxen_adapter *adapter = container_of(work, + struct netxen_adapter, fw_work.work); + struct net_device *netdev = adapter->netdev; + int err = 0; + + if (netif_running(netdev)) { + err = netxen_nic_attach(adapter); + if (err) + goto done; + + err = netxen_nic_up(adapter, netdev); + if (err) { + netxen_nic_detach(adapter); + goto done; + } + + netxen_restore_indev_addr(netdev, NETDEV_UP); + } + + netif_device_attach(netdev); + +done: + adapter->fw_fail_cnt = 0; + clear_bit(__NX_RESETTING, &adapter->state); + netxen_schedule_work(adapter, netxen_fw_poll_work, FW_POLL_DELAY); +} + +static void +netxen_fwinit_work(struct work_struct *work) +{ + struct netxen_adapter *adapter = container_of(work, + struct netxen_adapter, fw_work.work); + int dev_state; + int count; + dev_state = NXRD32(adapter, NX_CRB_DEV_STATE); + if (adapter->flags & NETXEN_FW_RESET_OWNER) { + count = NXRD32(adapter, NX_CRB_DEV_REF_COUNT); + WARN_ON(count == 0); + if (count == 1) { + if (adapter->mdump.md_enabled) { + rtnl_lock(); + netxen_dump_fw(adapter); + rtnl_unlock(); + } + adapter->flags &= ~NETXEN_FW_RESET_OWNER; + if (netxen_api_lock(adapter)) { + clear_bit(__NX_RESETTING, &adapter->state); + NXWR32(adapter, NX_CRB_DEV_STATE, + NX_DEV_FAILED); + return; + } + count = NXRD32(adapter, NX_CRB_DEV_REF_COUNT); + NXWR32(adapter, NX_CRB_DEV_REF_COUNT, --count); + NXWR32(adapter, NX_CRB_DEV_STATE, NX_DEV_COLD); + dev_state = NX_DEV_COLD; + netxen_api_unlock(adapter); + } + } + + switch (dev_state) { + case NX_DEV_COLD: + case NX_DEV_READY: + if (!netxen_start_firmware(adapter)) { + netxen_schedule_work(adapter, netxen_attach_work, 0); + return; + } + break; + + case NX_DEV_NEED_RESET: + case NX_DEV_INITALIZING: + netxen_schedule_work(adapter, + netxen_fwinit_work, 2 * FW_POLL_DELAY); + return; + + case NX_DEV_FAILED: + default: + nx_incr_dev_ref_cnt(adapter); + break; + } + + if (netxen_api_lock(adapter)) { + clear_bit(__NX_RESETTING, &adapter->state); + 
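netxen_fwinit_work() above decides what to do purely from the NX_CRB_DEV_STATE value: NX_DEV_COLD/NX_DEV_READY mean this function may start the firmware, NX_DEV_NEED_RESET/NX_DEV_INITALIZING mean another function owns the bring-up so poll again later, and NX_DEV_FAILED (or anything unexpected) means give up and mark the device failed. A compressed userspace sketch of that decision table (the enum values here are stand-ins for the driver's NX_DEV_* constants; not part of the patch):

#include <stdio.h>

enum dev_state { DEV_COLD, DEV_READY, DEV_NEED_RESET, DEV_INITIALIZING, DEV_FAILED };
enum fwinit_action { START_FIRMWARE, POLL_AGAIN, GIVE_UP };

/* mirrors the switch (dev_state) in netxen_fwinit_work() */
static enum fwinit_action fwinit_decide(enum dev_state state)
{
	switch (state) {
	case DEV_COLD:
	case DEV_READY:
		return START_FIRMWARE;   /* we own the bring-up        */
	case DEV_NEED_RESET:
	case DEV_INITIALIZING:
		return POLL_AGAIN;       /* another function owns it   */
	case DEV_FAILED:
	default:
		return GIVE_UP;          /* device is marked as failed */
	}
}

int main(void)
{
	printf("COLD -> %d, INITIALIZING -> %d, FAILED -> %d\n",
	       fwinit_decide(DEV_COLD), fwinit_decide(DEV_INITIALIZING),
	       fwinit_decide(DEV_FAILED));
	return 0;
}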
return; + } + NXWR32(adapter, NX_CRB_DEV_STATE, NX_DEV_FAILED); + netxen_api_unlock(adapter); + dev_err(&adapter->pdev->dev, "%s: Device initialization Failed\n", + adapter->netdev->name); + + clear_bit(__NX_RESETTING, &adapter->state); +} + +static void +netxen_detach_work(struct work_struct *work) +{ + struct netxen_adapter *adapter = container_of(work, + struct netxen_adapter, fw_work.work); + struct net_device *netdev = adapter->netdev; + int ref_cnt = 0, delay; + u32 status; + + netif_device_detach(netdev); + + netxen_nic_down(adapter, netdev); + + rtnl_lock(); + netxen_nic_detach(adapter); + rtnl_unlock(); + + status = NXRD32(adapter, NETXEN_PEG_HALT_STATUS1); + + if (status & NX_RCODE_FATAL_ERROR) + goto err_ret; + + if (adapter->temp == NX_TEMP_PANIC) + goto err_ret; + + if (!(adapter->flags & NETXEN_FW_RESET_OWNER)) + ref_cnt = nx_decr_dev_ref_cnt(adapter); + + if (ref_cnt == -EIO) + goto err_ret; + + delay = (ref_cnt == 0) ? 0 : (2 * FW_POLL_DELAY); + + adapter->fw_wait_cnt = 0; + netxen_schedule_work(adapter, netxen_fwinit_work, delay); + + return; + +err_ret: + clear_bit(__NX_RESETTING, &adapter->state); +} + +static int +netxen_check_health(struct netxen_adapter *adapter) +{ + u32 state, heartbit; + u32 peg_status; + struct net_device *netdev = adapter->netdev; + + state = NXRD32(adapter, NX_CRB_DEV_STATE); + if (state == NX_DEV_NEED_AER) + return 0; + + if (netxen_nic_check_temp(adapter)) + goto detach; + + if (adapter->need_fw_reset) { + if (nx_dev_request_reset(adapter)) + return 0; + goto detach; + } + + /* NX_DEV_NEED_RESET, this state can be marked in two cases + * 1. Tx timeout 2. Fw hang + * Send request to destroy context in case of tx timeout only + * and doesn't required in case of Fw hang + */ + if (state == NX_DEV_NEED_RESET || state == NX_DEV_FAILED) { + adapter->need_fw_reset = 1; + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + goto detach; + } + + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return 0; + + heartbit = NXRD32(adapter, NETXEN_PEG_ALIVE_COUNTER); + if (heartbit != adapter->heartbit) { + adapter->heartbit = heartbit; + adapter->fw_fail_cnt = 0; + if (adapter->need_fw_reset) + goto detach; + return 0; + } + + if (++adapter->fw_fail_cnt < FW_FAIL_THRESH) + return 0; + + if (nx_dev_request_reset(adapter)) + return 0; + + clear_bit(__NX_FW_ATTACHED, &adapter->state); + + dev_err(&netdev->dev, "firmware hang detected\n"); + peg_status = NXRD32(adapter, NETXEN_PEG_HALT_STATUS1); + dev_err(&adapter->pdev->dev, "Dumping hw/fw registers\n" + "PEG_HALT_STATUS1: 0x%x, PEG_HALT_STATUS2: 0x%x,\n" + "PEG_NET_0_PC: 0x%x, PEG_NET_1_PC: 0x%x,\n" + "PEG_NET_2_PC: 0x%x, PEG_NET_3_PC: 0x%x,\n" + "PEG_NET_4_PC: 0x%x\n", + peg_status, + NXRD32(adapter, NETXEN_PEG_HALT_STATUS2), + NXRD32(adapter, NETXEN_CRB_PEG_NET_0 + 0x3c), + NXRD32(adapter, NETXEN_CRB_PEG_NET_1 + 0x3c), + NXRD32(adapter, NETXEN_CRB_PEG_NET_2 + 0x3c), + NXRD32(adapter, NETXEN_CRB_PEG_NET_3 + 0x3c), + NXRD32(adapter, NETXEN_CRB_PEG_NET_4 + 0x3c)); + if (NX_FWERROR_PEGSTAT1(peg_status) == 0x67) + dev_err(&adapter->pdev->dev, + "Firmware aborted with error code 0x00006700. 
" + "Device is being reset.\n"); +detach: + if ((auto_fw_reset == AUTO_FW_RESET_ENABLED) && + !test_and_set_bit(__NX_RESETTING, &adapter->state)) + netxen_schedule_work(adapter, netxen_detach_work, 0); + return 1; +} + +static void +netxen_fw_poll_work(struct work_struct *work) +{ + struct netxen_adapter *adapter = container_of(work, + struct netxen_adapter, fw_work.work); + + if (test_bit(__NX_RESETTING, &adapter->state)) + goto reschedule; + + if (test_bit(__NX_DEV_UP, &adapter->state) && + !(adapter->capabilities & NX_FW_CAPABILITY_LINK_NOTIFICATION)) { + if (!adapter->has_link_events) { + + netxen_nic_handle_phy_intr(adapter); + + if (adapter->link_changed) + netxen_nic_set_link_parameters(adapter); + } + } + + if (netxen_check_health(adapter)) + return; + +reschedule: + netxen_schedule_work(adapter, netxen_fw_poll_work, FW_POLL_DELAY); +} + +static ssize_t +netxen_store_bridged_mode(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct net_device *net = to_net_dev(dev); + struct netxen_adapter *adapter = netdev_priv(net); + unsigned long new; + int ret = -EINVAL; + + if (!(adapter->capabilities & NX_FW_CAPABILITY_BDG)) + goto err_out; + + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + goto err_out; + + if (kstrtoul(buf, 2, &new)) + goto err_out; + + if (!netxen_config_bridged_mode(adapter, !!new)) + ret = len; + +err_out: + return ret; +} + +static ssize_t +netxen_show_bridged_mode(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *net = to_net_dev(dev); + struct netxen_adapter *adapter; + int bridged_mode = 0; + + adapter = netdev_priv(net); + + if (adapter->capabilities & NX_FW_CAPABILITY_BDG) + bridged_mode = !!(adapter->flags & NETXEN_NIC_BRIDGE_ENABLED); + + return sprintf(buf, "%d\n", bridged_mode); +} + +static const struct device_attribute dev_attr_bridged_mode = { + .attr = { .name = "bridged_mode", .mode = 0644 }, + .show = netxen_show_bridged_mode, + .store = netxen_store_bridged_mode, +}; + +static ssize_t +netxen_store_diag_mode(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct netxen_adapter *adapter = dev_get_drvdata(dev); + unsigned long new; + + if (kstrtoul(buf, 2, &new)) + return -EINVAL; + + if (!!new != !!(adapter->flags & NETXEN_NIC_DIAG_ENABLED)) + adapter->flags ^= NETXEN_NIC_DIAG_ENABLED; + + return len; +} + +static ssize_t +netxen_show_diag_mode(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct netxen_adapter *adapter = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", + !!(adapter->flags & NETXEN_NIC_DIAG_ENABLED)); +} + +static const struct device_attribute dev_attr_diag_mode = { + .attr = { .name = "diag_mode", .mode = 0644 }, + .show = netxen_show_diag_mode, + .store = netxen_store_diag_mode, +}; + +static int +netxen_sysfs_validate_crb(struct netxen_adapter *adapter, + loff_t offset, size_t size) +{ + size_t crb_size = 4; + + if (!(adapter->flags & NETXEN_NIC_DIAG_ENABLED)) + return -EIO; + + if (offset < NETXEN_PCI_CRBSPACE) { + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return -EINVAL; + + if (ADDR_IN_RANGE(offset, NETXEN_PCI_CAMQM, + NETXEN_PCI_CAMQM_2M_END)) + crb_size = 8; + else + return -EINVAL; + } + + if ((size != crb_size) || (offset & (crb_size-1))) + return -EINVAL; + + return 0; +} + +static ssize_t +netxen_sysfs_read_crb(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, size_t size) +{ + struct device *dev = kobj_to_dev(kobj); + 
struct netxen_adapter *adapter = dev_get_drvdata(dev); + u32 data; + u64 qmdata; + int ret; + + ret = netxen_sysfs_validate_crb(adapter, offset, size); + if (ret != 0) + return ret; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id) && + ADDR_IN_RANGE(offset, NETXEN_PCI_CAMQM, + NETXEN_PCI_CAMQM_2M_END)) { + netxen_pci_camqm_read_2M(adapter, offset, &qmdata); + memcpy(buf, &qmdata, size); + } else { + data = NXRD32(adapter, offset); + memcpy(buf, &data, size); + } + + return size; +} + +static ssize_t +netxen_sysfs_write_crb(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, size_t size) +{ + struct device *dev = kobj_to_dev(kobj); + struct netxen_adapter *adapter = dev_get_drvdata(dev); + u32 data; + u64 qmdata; + int ret; + + ret = netxen_sysfs_validate_crb(adapter, offset, size); + if (ret != 0) + return ret; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id) && + ADDR_IN_RANGE(offset, NETXEN_PCI_CAMQM, + NETXEN_PCI_CAMQM_2M_END)) { + memcpy(&qmdata, buf, size); + netxen_pci_camqm_write_2M(adapter, offset, qmdata); + } else { + memcpy(&data, buf, size); + NXWR32(adapter, offset, data); + } + + return size; +} + +static int +netxen_sysfs_validate_mem(struct netxen_adapter *adapter, + loff_t offset, size_t size) +{ + if (!(adapter->flags & NETXEN_NIC_DIAG_ENABLED)) + return -EIO; + + if ((size != 8) || (offset & 0x7)) + return -EIO; + + return 0; +} + +static ssize_t +netxen_sysfs_read_mem(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, size_t size) +{ + struct device *dev = kobj_to_dev(kobj); + struct netxen_adapter *adapter = dev_get_drvdata(dev); + u64 data; + int ret; + + ret = netxen_sysfs_validate_mem(adapter, offset, size); + if (ret != 0) + return ret; + + if (adapter->pci_mem_read(adapter, offset, &data)) + return -EIO; + + memcpy(buf, &data, size); + + return size; +} + +static ssize_t netxen_sysfs_write_mem(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t offset, size_t size) +{ + struct device *dev = kobj_to_dev(kobj); + struct netxen_adapter *adapter = dev_get_drvdata(dev); + u64 data; + int ret; + + ret = netxen_sysfs_validate_mem(adapter, offset, size); + if (ret != 0) + return ret; + + memcpy(&data, buf, size); + + if (adapter->pci_mem_write(adapter, offset, data)) + return -EIO; + + return size; +} + + +static const struct bin_attribute bin_attr_crb = { + .attr = { .name = "crb", .mode = 0644 }, + .size = 0, + .read = netxen_sysfs_read_crb, + .write = netxen_sysfs_write_crb, +}; + +static const struct bin_attribute bin_attr_mem = { + .attr = { .name = "mem", .mode = 0644 }, + .size = 0, + .read = netxen_sysfs_read_mem, + .write = netxen_sysfs_write_mem, +}; + +static ssize_t +netxen_sysfs_read_dimm(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, size_t size) +{ + struct device *dev = kobj_to_dev(kobj); + struct netxen_adapter *adapter = dev_get_drvdata(dev); + struct net_device *netdev = adapter->netdev; + struct netxen_dimm_cfg dimm; + u8 dw, rows, cols, banks, ranks; + u32 val; + + if (size < attr->size) { + netdev_err(netdev, "Invalid size\n"); + return -EINVAL; + } + + memset(&dimm, 0, sizeof(struct netxen_dimm_cfg)); + val = NXRD32(adapter, NETXEN_DIMM_CAPABILITY); + + /* Checks if DIMM info is valid. 
*/ + if (val & NETXEN_DIMM_VALID_FLAG) { + netdev_err(netdev, "Invalid DIMM flag\n"); + dimm.presence = 0xff; + goto out; + } + + rows = NETXEN_DIMM_NUMROWS(val); + cols = NETXEN_DIMM_NUMCOLS(val); + ranks = NETXEN_DIMM_NUMRANKS(val); + banks = NETXEN_DIMM_NUMBANKS(val); + dw = NETXEN_DIMM_DATAWIDTH(val); + + dimm.presence = (val & NETXEN_DIMM_PRESENT); + + /* Checks if DIMM info is present. */ + if (!dimm.presence) { + netdev_err(netdev, "DIMM not present\n"); + goto out; + } + + dimm.dimm_type = NETXEN_DIMM_TYPE(val); + + switch (dimm.dimm_type) { + case NETXEN_DIMM_TYPE_RDIMM: + case NETXEN_DIMM_TYPE_UDIMM: + case NETXEN_DIMM_TYPE_SO_DIMM: + case NETXEN_DIMM_TYPE_Micro_DIMM: + case NETXEN_DIMM_TYPE_Mini_RDIMM: + case NETXEN_DIMM_TYPE_Mini_UDIMM: + break; + default: + netdev_err(netdev, "Invalid DIMM type %x\n", dimm.dimm_type); + goto out; + } + + if (val & NETXEN_DIMM_MEMTYPE_DDR2_SDRAM) + dimm.mem_type = NETXEN_DIMM_MEM_DDR2_SDRAM; + else + dimm.mem_type = NETXEN_DIMM_MEMTYPE(val); + + if (val & NETXEN_DIMM_SIZE) { + dimm.size = NETXEN_DIMM_STD_MEM_SIZE; + goto out; + } + + if (!rows) { + netdev_err(netdev, "Invalid no of rows %x\n", rows); + goto out; + } + + if (!cols) { + netdev_err(netdev, "Invalid no of columns %x\n", cols); + goto out; + } + + if (!banks) { + netdev_err(netdev, "Invalid no of banks %x\n", banks); + goto out; + } + + ranks += 1; + + switch (dw) { + case 0x0: + dw = 32; + break; + case 0x1: + dw = 33; + break; + case 0x2: + dw = 36; + break; + case 0x3: + dw = 64; + break; + case 0x4: + dw = 72; + break; + case 0x5: + dw = 80; + break; + case 0x6: + dw = 128; + break; + case 0x7: + dw = 144; + break; + default: + netdev_err(netdev, "Invalid data-width %x\n", dw); + goto out; + } + + dimm.size = ((1 << rows) * (1 << cols) * dw * banks * ranks) / 8; + /* Size returned in MB. 
*/ + dimm.size = (dimm.size) / 0x100000; +out: + memcpy(buf, &dimm, sizeof(struct netxen_dimm_cfg)); + return sizeof(struct netxen_dimm_cfg); + +} + +static const struct bin_attribute bin_attr_dimm = { + .attr = { .name = "dimm", .mode = 0644 }, + .size = sizeof(struct netxen_dimm_cfg), + .read = netxen_sysfs_read_dimm, +}; + + +static void +netxen_create_sysfs_entries(struct netxen_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + + if (adapter->capabilities & NX_FW_CAPABILITY_BDG) { + /* bridged_mode control */ + if (device_create_file(dev, &dev_attr_bridged_mode)) { + dev_warn(dev, + "failed to create bridged_mode sysfs entry\n"); + } + } +} + +static void +netxen_remove_sysfs_entries(struct netxen_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + + if (adapter->capabilities & NX_FW_CAPABILITY_BDG) + device_remove_file(dev, &dev_attr_bridged_mode); +} + +static void +netxen_create_diag_entries(struct netxen_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + struct device *dev; + + dev = &pdev->dev; + if (device_create_file(dev, &dev_attr_diag_mode)) + dev_info(dev, "failed to create diag_mode sysfs entry\n"); + if (device_create_bin_file(dev, &bin_attr_crb)) + dev_info(dev, "failed to create crb sysfs entry\n"); + if (device_create_bin_file(dev, &bin_attr_mem)) + dev_info(dev, "failed to create mem sysfs entry\n"); + if (device_create_bin_file(dev, &bin_attr_dimm)) + dev_info(dev, "failed to create dimm sysfs entry\n"); +} + + +static void +netxen_remove_diag_entries(struct netxen_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + struct device *dev = &pdev->dev; + + device_remove_file(dev, &dev_attr_diag_mode); + device_remove_bin_file(dev, &bin_attr_crb); + device_remove_bin_file(dev, &bin_attr_mem); + device_remove_bin_file(dev, &bin_attr_dimm); +} + +#ifdef CONFIG_INET + +#define is_netxen_netdev(dev) (dev->netdev_ops == &netxen_netdev_ops) + +static int +netxen_destip_supported(struct netxen_adapter *adapter) +{ + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) + return 0; + + if (adapter->ahw.cut_through) + return 0; + + return 1; +} + +static void +netxen_free_ip_list(struct netxen_adapter *adapter, bool master) +{ + struct nx_ip_list *cur, *tmp_cur; + + list_for_each_entry_safe(cur, tmp_cur, &adapter->ip_list, list) { + if (master) { + if (cur->master) { + netxen_config_ipaddr(adapter, cur->ip_addr, + NX_IP_DOWN); + list_del(&cur->list); + kfree(cur); + } + } else { + netxen_config_ipaddr(adapter, cur->ip_addr, NX_IP_DOWN); + list_del(&cur->list); + kfree(cur); + } + } +} + +static bool +netxen_list_config_ip(struct netxen_adapter *adapter, + struct in_ifaddr *ifa, unsigned long event) +{ + struct net_device *dev; + struct nx_ip_list *cur, *tmp_cur; + struct list_head *head; + bool ret = false; + + dev = ifa->ifa_dev ? 
ifa->ifa_dev->dev : NULL; + + if (dev == NULL) + goto out; + + switch (event) { + case NX_IP_UP: + list_for_each(head, &adapter->ip_list) { + cur = list_entry(head, struct nx_ip_list, list); + + if (cur->ip_addr == ifa->ifa_address) + goto out; + } + + cur = kzalloc(sizeof(struct nx_ip_list), GFP_ATOMIC); + if (cur == NULL) + goto out; + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + cur->master = !!netif_is_bond_master(dev); + cur->ip_addr = ifa->ifa_address; + list_add_tail(&cur->list, &adapter->ip_list); + netxen_config_ipaddr(adapter, ifa->ifa_address, NX_IP_UP); + ret = true; + break; + case NX_IP_DOWN: + list_for_each_entry_safe(cur, tmp_cur, + &adapter->ip_list, list) { + if (cur->ip_addr == ifa->ifa_address) { + list_del(&cur->list); + kfree(cur); + netxen_config_ipaddr(adapter, ifa->ifa_address, + NX_IP_DOWN); + ret = true; + break; + } + } + } +out: + return ret; +} + +static void +netxen_config_indev_addr(struct netxen_adapter *adapter, + struct net_device *dev, unsigned long event) +{ + struct in_device *indev; + + if (!netxen_destip_supported(adapter)) + return; + + indev = in_dev_get(dev); + if (!indev) + return; + + for_ifa(indev) { + switch (event) { + case NETDEV_UP: + netxen_list_config_ip(adapter, ifa, NX_IP_UP); + break; + case NETDEV_DOWN: + netxen_list_config_ip(adapter, ifa, NX_IP_DOWN); + break; + default: + break; + } + } endfor_ifa(indev); + + in_dev_put(indev); +} + +static void +netxen_restore_indev_addr(struct net_device *netdev, unsigned long event) + +{ + struct netxen_adapter *adapter = netdev_priv(netdev); + struct nx_ip_list *pos, *tmp_pos; + unsigned long ip_event; + + ip_event = (event == NETDEV_UP) ? NX_IP_UP : NX_IP_DOWN; + netxen_config_indev_addr(adapter, netdev, event); + + list_for_each_entry_safe(pos, tmp_pos, &adapter->ip_list, list) { + netxen_config_ipaddr(adapter, pos->ip_addr, ip_event); + } +} + +static inline bool +netxen_config_checkdev(struct net_device *dev) +{ + struct netxen_adapter *adapter; + + if (!is_netxen_netdev(dev)) + return false; + adapter = netdev_priv(dev); + if (!adapter) + return false; + if (!netxen_destip_supported(adapter)) + return false; + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + return false; + + return true; +} + +/** + * netxen_config_master - configure addresses based on master + * @dev: netxen device + * @event: netdev event + */ +static void netxen_config_master(struct net_device *dev, unsigned long event) +{ + struct net_device *master, *slave; + struct netxen_adapter *adapter = netdev_priv(dev); + + rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + /* + * This is the case where the netxen nic is being + * enslaved and is dev_open()ed in bond_enslave() + * Now we should program the bond's (and its vlans') + * addresses in the netxen NIC. + */ + if (master && netif_is_bond_master(master) && + !netif_is_bond_slave(dev)) { + netxen_config_indev_addr(adapter, master, event); + for_each_netdev_rcu(&init_net, slave) + if (is_vlan_dev(slave) && + vlan_dev_real_dev(slave) == master) + netxen_config_indev_addr(adapter, slave, event); + } + rcu_read_unlock(); + /* + * This is the case where the netxen nic is being + * released and is dev_close()ed in bond_release() + * just before IFF_BONDING is stripped. 
+ */ + if (!master && dev->priv_flags & IFF_BONDING) + netxen_free_ip_list(adapter, true); +} + +static int netxen_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netxen_adapter *adapter; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net_device *orig_dev = dev; + struct net_device *slave; + +recheck: + if (dev == NULL) + goto done; + + if (is_vlan_dev(dev)) { + dev = vlan_dev_real_dev(dev); + goto recheck; + } + if (event == NETDEV_UP || event == NETDEV_DOWN) { + /* If this is a bonding device, look for netxen-based slaves*/ + if (netif_is_bond_master(dev)) { + rcu_read_lock(); + for_each_netdev_in_bond_rcu(dev, slave) { + if (!netxen_config_checkdev(slave)) + continue; + adapter = netdev_priv(slave); + netxen_config_indev_addr(adapter, + orig_dev, event); + } + rcu_read_unlock(); + } else { + if (!netxen_config_checkdev(dev)) + goto done; + adapter = netdev_priv(dev); + /* Act only if the actual netxen is the target */ + if (orig_dev == dev) + netxen_config_master(dev, event); + netxen_config_indev_addr(adapter, orig_dev, event); + } + } +done: + return NOTIFY_DONE; +} + +static int +netxen_inetaddr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netxen_adapter *adapter; + struct net_device *dev, *slave; + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + unsigned long ip_event; + + dev = ifa->ifa_dev ? ifa->ifa_dev->dev : NULL; + ip_event = (event == NETDEV_UP) ? NX_IP_UP : NX_IP_DOWN; +recheck: + if (dev == NULL) + goto done; + + if (is_vlan_dev(dev)) { + dev = vlan_dev_real_dev(dev); + goto recheck; + } + if (event == NETDEV_UP || event == NETDEV_DOWN) { + /* If this is a bonding device, look for netxen-based slaves*/ + if (netif_is_bond_master(dev)) { + rcu_read_lock(); + for_each_netdev_in_bond_rcu(dev, slave) { + if (!netxen_config_checkdev(slave)) + continue; + adapter = netdev_priv(slave); + netxen_list_config_ip(adapter, ifa, ip_event); + } + rcu_read_unlock(); + } else { + if (!netxen_config_checkdev(dev)) + goto done; + adapter = netdev_priv(dev); + netxen_list_config_ip(adapter, ifa, ip_event); + } + } +done: + return NOTIFY_DONE; +} + +static struct notifier_block netxen_netdev_cb = { + .notifier_call = netxen_netdev_event, +}; + +static struct notifier_block netxen_inetaddr_cb = { + .notifier_call = netxen_inetaddr_event, +}; +#else +static void +netxen_restore_indev_addr(struct net_device *dev, unsigned long event) +{ } +static void +netxen_free_ip_list(struct netxen_adapter *adapter, bool master) +{ } +#endif + +static const struct pci_error_handlers netxen_err_handler = { + .error_detected = netxen_io_error_detected, + .slot_reset = netxen_io_slot_reset, + .resume = netxen_io_resume, +}; + +static struct pci_driver netxen_driver = { + .name = netxen_nic_driver_name, + .id_table = netxen_pci_tbl, + .probe = netxen_nic_probe, + .remove = netxen_nic_remove, +#ifdef CONFIG_PM + .suspend = netxen_nic_suspend, + .resume = netxen_nic_resume, +#endif + .shutdown = netxen_nic_shutdown, + .err_handler = &netxen_err_handler +}; + +static int __init netxen_init_module(void) +{ + printk(KERN_INFO "%s\n", netxen_nic_driver_string); + +#ifdef CONFIG_INET + register_netdevice_notifier(&netxen_netdev_cb); + register_inetaddr_notifier(&netxen_inetaddr_cb); +#endif + return pci_register_driver(&netxen_driver); +} + +module_init(netxen_init_module); + +static void __exit netxen_exit_module(void) +{ + pci_unregister_driver(&netxen_driver); + +#ifdef CONFIG_INET + 
unregister_inetaddr_notifier(&netxen_inetaddr_cb); + unregister_netdevice_notifier(&netxen_netdev_cb); +#endif +} + +module_exit(netxen_exit_module); diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile new file mode 100644 index 0000000..c70cf2a --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_QED) := qed.o + +qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \ + qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \ + qed_selftest.o qed_dcbx.o qed_debug.o qed_ptp.o +qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o +qed-$(CONFIG_QED_LL2) += qed_ll2.o +qed-$(CONFIG_QED_RDMA) += qed_roce.o qed_rdma.o qed_iwarp.o +qed-$(CONFIG_QED_ISCSI) += qed_iscsi.o +qed-$(CONFIG_QED_FCOE) += qed_fcoe.o +qed-$(CONFIG_QED_OOO) += qed_ooo.o diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h new file mode 100644 index 0000000..e07460a --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -0,0 +1,857 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _QED_H +#define _QED_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed_debug.h" +#include "qed_hsi.h" + +extern const struct qed_common_ops qed_common_ops_pass; + +#define QED_MAJOR_VERSION 8 +#define QED_MINOR_VERSION 33 +#define QED_REVISION_VERSION 0 +#define QED_ENGINEERING_VERSION 20 + +#define QED_VERSION \ + ((QED_MAJOR_VERSION << 24) | (QED_MINOR_VERSION << 16) | \ + (QED_REVISION_VERSION << 8) | QED_ENGINEERING_VERSION) + +#define STORM_FW_VERSION \ + ((FW_MAJOR_VERSION << 24) | (FW_MINOR_VERSION << 16) | \ + (FW_REVISION_VERSION << 8) | FW_ENGINEERING_VERSION) + +#define MAX_HWFNS_PER_DEVICE (4) +#define NAME_SIZE 16 +#define VER_SIZE 16 + +#define QED_WFQ_UNIT 100 + +#define QED_WID_SIZE (1024) +#define QED_MIN_WIDS (4) +#define QED_PF_DEMS_SIZE (4) + +/* cau states */ +enum qed_coalescing_mode { + QED_COAL_MODE_DISABLE, + QED_COAL_MODE_ENABLE +}; + +enum qed_nvm_cmd { + QED_PUT_FILE_BEGIN = DRV_MSG_CODE_NVM_PUT_FILE_BEGIN, + QED_PUT_FILE_DATA = DRV_MSG_CODE_NVM_PUT_FILE_DATA, + QED_NVM_WRITE_NVRAM = DRV_MSG_CODE_NVM_WRITE_NVRAM, + QED_GET_MCP_NVM_RESP = 0xFFFFFF00 +}; + +struct qed_eth_cb_ops; +struct qed_dev_info; +union qed_mcp_protocol_stats; +enum qed_mcp_protocol_type; + +/* helpers */ +#define QED_MFW_GET_FIELD(name, field) \ + (((name) & (field ## _MASK)) >> (field ## _SHIFT)) + +#define QED_MFW_SET_FIELD(name, field, value) \ + do { \ + (name) &= ~(field ## _MASK); \ + (name) |= (((value) << (field ## _SHIFT)) & (field ## _MASK));\ + } while (0) + +static inline u32 qed_db_addr(u32 cid, u32 DEMS) +{ + u32 db_addr = FIELD_VALUE(DB_LEGACY_ADDR_DEMS, DEMS) | + (cid * QED_PF_DEMS_SIZE); + + return db_addr; +} + +static inline u32 qed_db_addr_vf(u32 cid, u32 DEMS) +{ + u32 db_addr = FIELD_VALUE(DB_LEGACY_ADDR_DEMS, DEMS) | + FIELD_VALUE(DB_LEGACY_ADDR_ICID, cid); + + return db_addr; +} + +#define ALIGNED_TYPE_SIZE(type_name, p_hwfn) \ + ((sizeof(type_name) + (u32)(1 << (p_hwfn->cdev->cache_shift)) - 1) & \ + ~((1 << (p_hwfn->cdev->cache_shift)) - 1)) + +#define for_each_hwfn(cdev, i) for (i = 0; i < cdev->num_hwfns; i++) + +#define D_TRINE(val, cond1, cond2, true1, true2, def) \ + (val == (cond1) ? true1 : \ + (val == (cond2) ? 
true2 : def)) + +/* forward */ +struct qed_ptt_pool; +struct qed_spq; +struct qed_sb_info; +struct qed_sb_attn_info; +struct qed_cxt_mngr; +struct qed_sb_sp_info; +struct qed_ll2_info; +struct qed_mcp_info; + +struct qed_rt_data { + u32 *init_val; + bool *b_valid; +}; + +enum qed_tunn_mode { + QED_MODE_L2GENEVE_TUNN, + QED_MODE_IPGENEVE_TUNN, + QED_MODE_L2GRE_TUNN, + QED_MODE_IPGRE_TUNN, + QED_MODE_VXLAN_TUNN, +}; + +enum qed_tunn_clss { + QED_TUNN_CLSS_MAC_VLAN, + QED_TUNN_CLSS_MAC_VNI, + QED_TUNN_CLSS_INNER_MAC_VLAN, + QED_TUNN_CLSS_INNER_MAC_VNI, + QED_TUNN_CLSS_MAC_VLAN_DUAL_STAGE, + MAX_QED_TUNN_CLSS, +}; + +struct qed_tunn_update_type { + bool b_update_mode; + bool b_mode_enabled; + enum qed_tunn_clss tun_cls; +}; + +struct qed_tunn_update_udp_port { + bool b_update_port; + u16 port; +}; + +struct qed_tunnel_info { + struct qed_tunn_update_type vxlan; + struct qed_tunn_update_type l2_geneve; + struct qed_tunn_update_type ip_geneve; + struct qed_tunn_update_type l2_gre; + struct qed_tunn_update_type ip_gre; + + struct qed_tunn_update_udp_port vxlan_port; + struct qed_tunn_update_udp_port geneve_port; + + bool b_update_rx_cls; + bool b_update_tx_cls; +}; + +struct qed_tunn_start_params { + unsigned long tunn_mode; + u16 vxlan_udp_port; + u16 geneve_udp_port; + u8 update_vxlan_udp_port; + u8 update_geneve_udp_port; + u8 tunn_clss_vxlan; + u8 tunn_clss_l2geneve; + u8 tunn_clss_ipgeneve; + u8 tunn_clss_l2gre; + u8 tunn_clss_ipgre; +}; + +struct qed_tunn_update_params { + unsigned long tunn_mode_update_mask; + unsigned long tunn_mode; + u16 vxlan_udp_port; + u16 geneve_udp_port; + u8 update_rx_pf_clss; + u8 update_tx_pf_clss; + u8 update_vxlan_udp_port; + u8 update_geneve_udp_port; + u8 tunn_clss_vxlan; + u8 tunn_clss_l2geneve; + u8 tunn_clss_ipgeneve; + u8 tunn_clss_l2gre; + u8 tunn_clss_ipgre; +}; + +/* The PCI personality is not quite synonymous to protocol ID: + * 1. All personalities need CORE connections + * 2. The Ethernet personality may support also the RoCE/iWARP protocol + */ +enum qed_pci_personality { + QED_PCI_ETH, + QED_PCI_FCOE, + QED_PCI_ISCSI, + QED_PCI_ETH_ROCE, + QED_PCI_ETH_IWARP, + QED_PCI_ETH_RDMA, + QED_PCI_DEFAULT, /* default in shmem */ +}; + +/* All VFs are symmetric, all counters are PF + all VFs */ +struct qed_qm_iids { + u32 cids; + u32 vf_cids; + u32 tids; +}; + +/* HW / FW resources, output of features supported below, most information + * is received from MFW. 
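The QED_MFW_GET_FIELD()/QED_MFW_SET_FIELD() helpers earlier in this header assume every field is described by a FOO_MASK/FOO_SHIFT pair, with the mask already shifted into position. A small standalone re-creation of that pattern with a made-up field (macro and field names here are demo placeholders, not the driver's; not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* hypothetical field occupying bits 8..15 of a configuration word */
#define DEMO_FIELD_MASK   0x0000ff00
#define DEMO_FIELD_SHIFT  8

#define GET_FIELD_DEMO(name, field) \
	(((name) & (field ## _MASK)) >> (field ## _SHIFT))

#define SET_FIELD_DEMO(name, field, value)                                     \
	do {                                                                   \
		(name) &= ~(field ## _MASK);                                   \
		(name) |= (((value) << (field ## _SHIFT)) & (field ## _MASK)); \
	} while (0)

int main(void)
{
	uint32_t reg = 0xdeadbeef;

	SET_FIELD_DEMO(reg, DEMO_FIELD, 0x42);
	printf("reg=0x%08x field=0x%02x\n",
	       reg, (unsigned int)GET_FIELD_DEMO(reg, DEMO_FIELD));
	/* field reads back as 0x42; bits outside the mask are untouched */
	return 0;
}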
+ */ +enum qed_resources { + QED_SB, + QED_L2_QUEUE, + QED_VPORT, + QED_RSS_ENG, + QED_PQ, + QED_RL, + QED_MAC, + QED_VLAN, + QED_RDMA_CNQ_RAM, + QED_ILT, + QED_LL2_QUEUE, + QED_CMDQS_CQS, + QED_RDMA_STATS_QUEUE, + QED_BDQ, + QED_MAX_RESC, +}; + +enum QED_FEATURE { + QED_PF_L2_QUE, + QED_VF, + QED_RDMA_CNQ, + QED_ISCSI_CQ, + QED_FCOE_CQ, + QED_VF_L2_QUE, + QED_MAX_FEATURES, +}; + +enum QED_PORT_MODE { + QED_PORT_MODE_DE_2X40G, + QED_PORT_MODE_DE_2X50G, + QED_PORT_MODE_DE_1X100G, + QED_PORT_MODE_DE_4X10G_F, + QED_PORT_MODE_DE_4X10G_E, + QED_PORT_MODE_DE_4X20G, + QED_PORT_MODE_DE_1X40G, + QED_PORT_MODE_DE_2X25G, + QED_PORT_MODE_DE_1X25G, + QED_PORT_MODE_DE_4X25G, + QED_PORT_MODE_DE_2X10G, +}; + +enum qed_dev_cap { + QED_DEV_CAP_ETH, + QED_DEV_CAP_FCOE, + QED_DEV_CAP_ISCSI, + QED_DEV_CAP_ROCE, + QED_DEV_CAP_IWARP, +}; + +enum qed_wol_support { + QED_WOL_SUPPORT_NONE, + QED_WOL_SUPPORT_PME, +}; + +struct qed_hw_info { + /* PCI personality */ + enum qed_pci_personality personality; +#define QED_IS_RDMA_PERSONALITY(dev) \ + ((dev)->hw_info.personality == QED_PCI_ETH_ROCE || \ + (dev)->hw_info.personality == QED_PCI_ETH_IWARP || \ + (dev)->hw_info.personality == QED_PCI_ETH_RDMA) +#define QED_IS_ROCE_PERSONALITY(dev) \ + ((dev)->hw_info.personality == QED_PCI_ETH_ROCE || \ + (dev)->hw_info.personality == QED_PCI_ETH_RDMA) +#define QED_IS_IWARP_PERSONALITY(dev) \ + ((dev)->hw_info.personality == QED_PCI_ETH_IWARP || \ + (dev)->hw_info.personality == QED_PCI_ETH_RDMA) +#define QED_IS_L2_PERSONALITY(dev) \ + ((dev)->hw_info.personality == QED_PCI_ETH || \ + QED_IS_RDMA_PERSONALITY(dev)) +#define QED_IS_FCOE_PERSONALITY(dev) \ + ((dev)->hw_info.personality == QED_PCI_FCOE) +#define QED_IS_ISCSI_PERSONALITY(dev) \ + ((dev)->hw_info.personality == QED_PCI_ISCSI) + + /* Resource Allocation scheme results */ + u32 resc_start[QED_MAX_RESC]; + u32 resc_num[QED_MAX_RESC]; + u32 feat_num[QED_MAX_FEATURES]; + +#define RESC_START(_p_hwfn, resc) ((_p_hwfn)->hw_info.resc_start[resc]) +#define RESC_NUM(_p_hwfn, resc) ((_p_hwfn)->hw_info.resc_num[resc]) +#define RESC_END(_p_hwfn, resc) (RESC_START(_p_hwfn, resc) + \ + RESC_NUM(_p_hwfn, resc)) +#define FEAT_NUM(_p_hwfn, resc) ((_p_hwfn)->hw_info.feat_num[resc]) + + /* Amount of traffic classes HW supports */ + u8 num_hw_tc; + + /* Amount of TCs which should be active according to DCBx or upper + * layer driver configuration. + */ + u8 num_active_tc; + u8 offload_tc; + + u32 concrete_fid; + u16 opaque_fid; + u16 ovlan; + u32 part_num[4]; + + unsigned char hw_mac_addr[ETH_ALEN]; + u64 node_wwn; + u64 port_wwn; + + u16 num_fcoe_conns; + + struct qed_igu_info *p_igu_info; + + u32 port_mode; + u32 hw_mode; + unsigned long device_capabilities; + u16 mtu; + + enum qed_wol_support b_wol_support; +}; + +/* maximun size of read/write commands (HW limit) */ +#define DMAE_MAX_RW_SIZE 0x2000 + +struct qed_dmae_info { + /* Mutex for synchronizing access to functions */ + struct mutex mutex; + + u8 channel; + + dma_addr_t completion_word_phys_addr; + + /* The memory location where the DMAE writes the completion + * value when an operation is finished on this context. 
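The RESC_START()/RESC_NUM()/RESC_END() macros above describe each resource as a contiguous [start, start + num) range carved out for this PF, while FEAT_NUM() is simply the per-feature count. A tiny numeric illustration of the same bookkeeping (arbitrary numbers; not part of the patch):

#include <stdio.h>

int main(void)
{
	/* say this PF was assigned 16 vports starting at absolute index 8 */
	unsigned int resc_start = 8, resc_num = 16;

	/* mirrors RESC_END(p_hwfn, resc) = RESC_START + RESC_NUM */
	unsigned int resc_end = resc_start + resc_num;

	printf("vports owned by this PF: [%u, %u)\n", resc_start, resc_end);
	/* prints: vports owned by this PF: [8, 24) */
	return 0;
}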
+ */ + u32 *p_completion_word; + + dma_addr_t intermediate_buffer_phys_addr; + + /* An intermediate buffer for DMAE operations that use virtual + * addresses - data is DMA'd to/from this buffer and then + * memcpy'd to/from the virtual address + */ + u32 *p_intermediate_buffer; + + dma_addr_t dmae_cmd_phys_addr; + struct dmae_cmd *p_dmae_cmd; +}; + +struct qed_wfq_data { + /* when feature is configured for at least 1 vport */ + u32 min_speed; + bool configured; +}; + +struct qed_qm_info { + struct init_qm_pq_params *qm_pq_params; + struct init_qm_vport_params *qm_vport_params; + struct init_qm_port_params *qm_port_params; + u16 start_pq; + u8 start_vport; + u16 pure_lb_pq; + u16 offload_pq; + u16 low_latency_pq; + u16 pure_ack_pq; + u16 ooo_pq; + u16 first_vf_pq; + u16 first_mcos_pq; + u16 first_rl_pq; + u16 num_pqs; + u16 num_vf_pqs; + u8 num_vports; + u8 max_phys_tcs_per_port; + u8 ooo_tc; + bool pf_rl_en; + bool pf_wfq_en; + bool vport_rl_en; + bool vport_wfq_en; + u8 pf_wfq; + u32 pf_rl; + struct qed_wfq_data *wfq_data; + u8 num_pf_rls; +}; + +struct storm_stats { + u32 address; + u32 len; +}; + +struct qed_storm_stats { + struct storm_stats mstats; + struct storm_stats pstats; + struct storm_stats tstats; + struct storm_stats ustats; +}; + +struct qed_fw_data { + struct fw_ver_info *fw_ver_info; + const u8 *modes_tree_buf; + union init_op *init_ops; + const u32 *arr_data; + u32 init_ops_size; +}; + +enum BAR_ID { + BAR_ID_0, /* used for GRC */ + BAR_ID_1 /* Used for doorbells */ +}; + +struct qed_nvm_image_info { + u32 num_images; + struct bist_nvm_image_att *image_att; +}; + +#define DRV_MODULE_VERSION \ + __stringify(QED_MAJOR_VERSION) "." \ + __stringify(QED_MINOR_VERSION) "." \ + __stringify(QED_REVISION_VERSION) "." \ + __stringify(QED_ENGINEERING_VERSION) + +struct qed_simd_fp_handler { + void *token; + void (*func)(void *); +}; + +struct qed_hwfn { + struct qed_dev *cdev; + u8 my_id; /* ID inside the PF */ +#define IS_LEAD_HWFN(edev) (!((edev)->my_id)) + u8 rel_pf_id; /* Relative to engine*/ + u8 abs_pf_id; +#define QED_PATH_ID(_p_hwfn) \ + (QED_IS_K2((_p_hwfn)->cdev) ? 0 : ((_p_hwfn)->abs_pf_id & 1)) + u8 port_id; + bool b_active; + + u32 dp_module; + u8 dp_level; + char name[NAME_SIZE]; + + bool first_on_engine; + bool hw_init_done; + + u8 num_funcs_on_engine; + u8 enabled_func_idx; + + /* BAR access */ + void __iomem *regview; + void __iomem *doorbells; + u64 db_phys_addr; + unsigned long db_size; + + /* PTT pool */ + struct qed_ptt_pool *p_ptt_pool; + + /* HW info */ + struct qed_hw_info hw_info; + + /* rt_array (for init-tool) */ + struct qed_rt_data rt_data; + + /* SPQ */ + struct qed_spq *p_spq; + + /* EQ */ + struct qed_eq *p_eq; + + /* Consolidate Q*/ + struct qed_consq *p_consq; + + /* Slow-Path definitions */ + struct tasklet_struct *sp_dpc; + bool b_sp_dpc_enabled; + + struct qed_ptt *p_main_ptt; + struct qed_ptt *p_dpc_ptt; + + /* PTP will be used only by the leading function. + * Usage of all PTP-apis should be synchronized as result. 
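DRV_MODULE_VERSION above glues the four numeric version components into a dotted string at preprocessing time via __stringify; with the values defined earlier in this header it expands to "8.33.0.20". A userspace re-creation of the same two-level stringification trick (demo macro names; not part of the patch):

#include <stdio.h>

/* two-level expansion, equivalent in effect to the kernel's __stringify() */
#define STR_(x) #x
#define STR(x)  STR_(x)

#define DEMO_MAJOR 8
#define DEMO_MINOR 33
#define DEMO_REV   0
#define DEMO_ENG   20

#define DEMO_VERSION \
	STR(DEMO_MAJOR) "." STR(DEMO_MINOR) "." STR(DEMO_REV) "." STR(DEMO_ENG)

int main(void)
{
	printf("%s\n", DEMO_VERSION);   /* prints: 8.33.0.20 */
	return 0;
}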
+ */ + struct qed_ptt *p_ptp_ptt; + + struct qed_sb_sp_info *p_sp_sb; + struct qed_sb_attn_info *p_sb_attn; + + /* Protocol related */ + bool using_ll2; + struct qed_ll2_info *p_ll2_info; + struct qed_ooo_info *p_ooo_info; + struct qed_rdma_info *p_rdma_info; + struct qed_iscsi_info *p_iscsi_info; + struct qed_fcoe_info *p_fcoe_info; + struct qed_pf_params pf_params; + + bool b_rdma_enabled_in_prs; + u32 rdma_prs_search_reg; + + struct qed_cxt_mngr *p_cxt_mngr; + + /* Flag indicating whether interrupts are enabled or not*/ + bool b_int_enabled; + bool b_int_requested; + + /* True if the driver requests for the link */ + bool b_drv_link_init; + + struct qed_vf_iov *vf_iov_info; + struct qed_pf_iov *pf_iov_info; + struct qed_mcp_info *mcp_info; + + struct qed_dcbx_info *p_dcbx_info; + + struct qed_dmae_info dmae_info; + + /* QM init */ + struct qed_qm_info qm_info; + struct qed_storm_stats storm_stats; + + /* Buffer for unzipping firmware data */ + void *unzip_buf; + + struct dbg_tools_data dbg_info; + + /* PWM region specific data */ + u16 wid_count; + u32 dpi_size; + u32 dpi_count; + + /* This is used to calculate the doorbell address */ + u32 dpi_start_offset; + + /* If one of the following is set then EDPM shouldn't be used */ + u8 dcbx_no_edpm; + u8 db_bar_no_edpm; + + /* L2-related */ + struct qed_l2_info *p_l2_info; + + /* Nvm images number and attributes */ + struct qed_nvm_image_info nvm_info; + + struct qed_ptt *p_arfs_ptt; + + struct qed_simd_fp_handler simd_proto_handler[64]; + +#ifdef CONFIG_QED_SRIOV + struct workqueue_struct *iov_wq; + struct delayed_work iov_task; + unsigned long iov_task_flags; +#endif + + struct z_stream_s *stream; +}; + +struct pci_params { + int pm_cap; + + unsigned long mem_start; + unsigned long mem_end; + unsigned int irq; + u8 pf_num; +}; + +struct qed_int_param { + u32 int_mode; + u8 num_vectors; + u8 min_msix_cnt; /* for minimal functionality */ +}; + +struct qed_int_params { + struct qed_int_param in; + struct qed_int_param out; + struct msix_entry *msix_table; + bool fp_initialized; + u8 fp_msix_base; + u8 fp_msix_cnt; + u8 rdma_msix_base; + u8 rdma_msix_cnt; +}; + +struct qed_dbg_feature { + struct dentry *dentry; + u8 *dump_buf; + u32 buf_size; + u32 dumped_dwords; +}; + +struct qed_dbg_params { + struct qed_dbg_feature features[DBG_FEATURE_NUM]; + u8 engine_for_debug; + bool print_data; +}; + +struct qed_dev { + u32 dp_module; + u8 dp_level; + char name[NAME_SIZE]; + + enum qed_dev_type type; +/* Translate type/revision combo into the proper conditions */ +#define QED_IS_BB(dev) ((dev)->type == QED_DEV_TYPE_BB) +#define QED_IS_BB_B0(dev) (QED_IS_BB(dev) && \ + CHIP_REV_IS_B0(dev)) +#define QED_IS_AH(dev) ((dev)->type == QED_DEV_TYPE_AH) +#define QED_IS_K2(dev) QED_IS_AH(dev) + + u16 vendor_id; + u16 device_id; +#define QED_DEV_ID_MASK 0xff00 +#define QED_DEV_ID_MASK_BB 0x1600 +#define QED_DEV_ID_MASK_AH 0x8000 + + u16 chip_num; +#define CHIP_NUM_MASK 0xffff +#define CHIP_NUM_SHIFT 16 + + u16 chip_rev; +#define CHIP_REV_MASK 0xf +#define CHIP_REV_SHIFT 12 +#define CHIP_REV_IS_B0(_cdev) ((_cdev)->chip_rev == 1) + + u16 chip_metal; +#define CHIP_METAL_MASK 0xff +#define CHIP_METAL_SHIFT 4 + + u16 chip_bond_id; +#define CHIP_BOND_ID_MASK 0xf +#define CHIP_BOND_ID_SHIFT 0 + + u8 num_engines; + u8 num_ports_in_engine; + u8 num_funcs_in_port; + + u8 path_id; + enum qed_mf_mode mf_mode; +#define IS_MF_DEFAULT(_p_hwfn) (((_p_hwfn)->cdev)->mf_mode == QED_MF_DEFAULT) +#define IS_MF_SI(_p_hwfn) (((_p_hwfn)->cdev)->mf_mode == QED_MF_NPAR) +#define 
IS_MF_SD(_p_hwfn) (((_p_hwfn)->cdev)->mf_mode == QED_MF_OVLAN) + + int pcie_width; + int pcie_speed; + + /* Add MF related configuration */ + u8 mcp_rev; + u8 boot_mode; + + /* WoL related configurations */ + u8 wol_config; + u8 wol_mac[ETH_ALEN]; + + u32 int_mode; + enum qed_coalescing_mode int_coalescing_mode; + u16 rx_coalesce_usecs; + u16 tx_coalesce_usecs; + + /* Start Bar offset of first hwfn */ + void __iomem *regview; + void __iomem *doorbells; + u64 db_phys_addr; + unsigned long db_size; + + /* PCI */ + u8 cache_shift; + + /* Init */ + const struct iro *iro_arr; +#define IRO (p_hwfn->cdev->iro_arr) + + /* HW functions */ + u8 num_hwfns; + struct qed_hwfn hwfns[MAX_HWFNS_PER_DEVICE]; + + /* SRIOV */ + struct qed_hw_sriov_info *p_iov_info; +#define IS_QED_SRIOV(cdev) (!!(cdev)->p_iov_info) + struct qed_tunnel_info tunnel; + bool b_is_vf; + u32 drv_type; + struct qed_eth_stats *reset_stats; + struct qed_fw_data *fw_data; + + u32 mcp_nvm_resp; + + /* Linux specific here */ + struct qede_dev *edev; + struct pci_dev *pdev; + u32 flags; +#define QED_FLAG_STORAGE_STARTED (BIT(0)) + int msg_enable; + + struct pci_params pci_params; + + struct qed_int_params int_params; + + u8 protocol; +#define IS_QED_ETH_IF(cdev) ((cdev)->protocol == QED_PROTOCOL_ETH) +#define IS_QED_FCOE_IF(cdev) ((cdev)->protocol == QED_PROTOCOL_FCOE) + + /* Callbacks to protocol driver */ + union { + struct qed_common_cb_ops *common; + struct qed_eth_cb_ops *eth; + struct qed_fcoe_cb_ops *fcoe; + struct qed_iscsi_cb_ops *iscsi; + } protocol_ops; + void *ops_cookie; + + struct qed_dbg_params dbg_params; + +#ifdef CONFIG_QED_LL2 + struct qed_cb_ll2_info *ll2; + u8 ll2_mac_address[ETH_ALEN]; +#endif + DECLARE_HASHTABLE(connections, 10); + const struct firmware *firmware; + + u32 rdma_max_sge; + u32 rdma_max_inline; + u32 rdma_max_srq_sge; + u16 tunn_feature_mask; +}; + +#define NUM_OF_VFS(dev) (QED_IS_BB(dev) ? MAX_NUM_VFS_BB \ + : MAX_NUM_VFS_K2) +#define NUM_OF_L2_QUEUES(dev) (QED_IS_BB(dev) ? MAX_NUM_L2_QUEUES_BB \ + : MAX_NUM_L2_QUEUES_K2) +#define NUM_OF_PORTS(dev) (QED_IS_BB(dev) ? MAX_NUM_PORTS_BB \ + : MAX_NUM_PORTS_K2) +#define NUM_OF_SBS(dev) (QED_IS_BB(dev) ? MAX_SB_PER_PATH_BB \ + : MAX_SB_PER_PATH_K2) +#define NUM_OF_ENG_PFS(dev) (QED_IS_BB(dev) ? MAX_NUM_PFS_BB \ + : MAX_NUM_PFS_K2) + +/** + * @brief qed_concrete_to_sw_fid - get the sw function id from + * the concrete value. 
+ * + * @param concrete_fid + * + * @return inline u8 + */ +static inline u8 qed_concrete_to_sw_fid(struct qed_dev *cdev, + u32 concrete_fid) +{ + u8 vfid = GET_FIELD(concrete_fid, PXP_CONCRETE_FID_VFID); + u8 pfid = GET_FIELD(concrete_fid, PXP_CONCRETE_FID_PFID); + u8 vf_valid = GET_FIELD(concrete_fid, + PXP_CONCRETE_FID_VFVALID); + u8 sw_fid; + + if (vf_valid) + sw_fid = vfid + MAX_NUM_PFS; + else + sw_fid = pfid; + + return sw_fid; +} + +#define PKT_LB_TC 9 +#define MAX_NUM_VOQS_E4 20 + +int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate); +void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, + struct qed_ptt *p_ptt, + u32 min_pf_rate); + +void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); +int qed_device_num_engines(struct qed_dev *cdev); +int qed_device_get_port_id(struct qed_dev *cdev); +void qed_set_fw_mac_addr(__le16 *fw_msb, + __le16 *fw_mid, __le16 *fw_lsb, u8 *mac); + +#define QED_LEADING_HWFN(dev) (&dev->hwfns[0]) + +/* Flags for indication of required queues */ +#define PQ_FLAGS_RLS (BIT(0)) +#define PQ_FLAGS_MCOS (BIT(1)) +#define PQ_FLAGS_LB (BIT(2)) +#define PQ_FLAGS_OOO (BIT(3)) +#define PQ_FLAGS_ACK (BIT(4)) +#define PQ_FLAGS_OFLD (BIT(5)) +#define PQ_FLAGS_VFS (BIT(6)) +#define PQ_FLAGS_LLT (BIT(7)) + +/* physical queue index for cm context intialization */ +u16 qed_get_cm_pq_idx(struct qed_hwfn *p_hwfn, u32 pq_flags); +u16 qed_get_cm_pq_idx_mcos(struct qed_hwfn *p_hwfn, u8 tc); +u16 qed_get_cm_pq_idx_vf(struct qed_hwfn *p_hwfn, u16 vf); + +#define QED_LEADING_HWFN(dev) (&dev->hwfns[0]) + +/* Other Linux specific common definitions */ +#define DP_NAME(cdev) ((cdev)->name) + +#define REG_ADDR(cdev, offset) (void __iomem *)((u8 __iomem *)\ + (cdev->regview) + \ + (offset)) + +#define REG_RD(cdev, offset) readl(REG_ADDR(cdev, offset)) +#define REG_WR(cdev, offset, val) writel((u32)val, REG_ADDR(cdev, offset)) +#define REG_WR16(cdev, offset, val) writew((u16)val, REG_ADDR(cdev, offset)) + +#define DOORBELL(cdev, db_addr, val) \ + writel((u32)val, (void __iomem *)((u8 __iomem *)\ + (cdev->doorbells) + (db_addr))) + +/* Prototypes */ +int qed_fill_dev_info(struct qed_dev *cdev, + struct qed_dev_info *dev_info); +void qed_link_update(struct qed_hwfn *hwfn); +u32 qed_unzip_data(struct qed_hwfn *p_hwfn, + u32 input_len, u8 *input_buf, + u32 max_size, u8 *unzip_buf); +void qed_get_protocol_stats(struct qed_dev *cdev, + enum qed_mcp_protocol_type type, + union qed_mcp_protocol_stats *stats); +int qed_slowpath_irq_req(struct qed_hwfn *hwfn); +void qed_slowpath_irq_sync(struct qed_hwfn *p_hwfn); + +#endif /* _QED_H */ diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c new file mode 100644 index 0000000..820b226 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -0,0 +1,2553 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
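qed_concrete_to_sw_fid() in the header above flattens the concrete FID into one software index: a physical function keeps its PF id, while a virtual function is placed after all PFs (vfid + MAX_NUM_PFS). A toy version of that mapping (the PF-count constant here is a placeholder for the real MAX_NUM_PFS; not part of the patch):

#include <stdbool.h>
#include <stdio.h>

#define DEMO_MAX_NUM_PFS 16   /* placeholder for the real MAX_NUM_PFS */

static unsigned int sw_fid(bool vf_valid, unsigned int pfid, unsigned int vfid)
{
	return vf_valid ? vfid + DEMO_MAX_NUM_PFS : pfid;
}

int main(void)
{
	printf("PF 3 -> %u, VF 5 -> %u\n",
	       sw_fid(false, 3, 0), sw_fid(true, 0, 5));
	/* prints: PF 3 -> 3, VF 5 -> 21 */
	return 0;
}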
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_cxt.h" +#include "qed_dev_api.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_init_ops.h" +#include "qed_reg_addr.h" +#include "qed_sriov.h" + +/* Max number of connection types in HW (DQ/CDU etc.) */ +#define MAX_CONN_TYPES PROTOCOLID_COMMON +#define NUM_TASK_TYPES 2 +#define NUM_TASK_PF_SEGMENTS 4 +#define NUM_TASK_VF_SEGMENTS 1 + +/* QM constants */ +#define QM_PQ_ELEMENT_SIZE 4 /* in bytes */ + +/* Doorbell-Queue constants */ +#define DQ_RANGE_SHIFT 4 +#define DQ_RANGE_ALIGN BIT(DQ_RANGE_SHIFT) + +/* Searcher constants */ +#define SRC_MIN_NUM_ELEMS 256 + +/* Timers constants */ +#define TM_SHIFT 7 +#define TM_ALIGN BIT(TM_SHIFT) +#define TM_ELEM_SIZE 4 + +#define ILT_DEFAULT_HW_P_SIZE 4 + +#define ILT_PAGE_IN_BYTES(hw_p_size) (1U << ((hw_p_size) + 12)) +#define ILT_CFG_REG(cli, reg) PSWRQ2_REG_ ## cli ## _ ## reg ## _RT_OFFSET + +/* ILT entry structure */ +#define ILT_ENTRY_PHY_ADDR_MASK (~0ULL >> 12) +#define ILT_ENTRY_PHY_ADDR_SHIFT 0 +#define ILT_ENTRY_VALID_MASK 0x1ULL +#define ILT_ENTRY_VALID_SHIFT 52 +#define ILT_ENTRY_IN_REGS 2 +#define ILT_REG_SIZE_IN_BYTES 4 + +/* connection context union */ +union conn_context { + struct e4_core_conn_context core_ctx; + struct e4_eth_conn_context eth_ctx; + struct e4_iscsi_conn_context iscsi_ctx; + struct e4_fcoe_conn_context fcoe_ctx; + struct e4_roce_conn_context roce_ctx; +}; + +/* TYPE-0 task context - iSCSI, FCOE */ +union type0_task_context { + struct e4_iscsi_task_context iscsi_ctx; + struct e4_fcoe_task_context fcoe_ctx; +}; + +/* TYPE-1 task context - ROCE */ +union type1_task_context { + struct e4_rdma_task_context roce_ctx; +}; + +struct src_ent { + u8 opaque[56]; + u64 next; +}; + +#define CDUT_SEG_ALIGNMET 3 /* in 4k chunks */ +#define CDUT_SEG_ALIGNMET_IN_BYTES BIT(CDUT_SEG_ALIGNMET + 12) + +#define CONN_CXT_SIZE(p_hwfn) \ + ALIGNED_TYPE_SIZE(union conn_context, p_hwfn) + +#define SRQ_CXT_SIZE (sizeof(struct rdma_srq_context)) + +#define TYPE0_TASK_CXT_SIZE(p_hwfn) \ + ALIGNED_TYPE_SIZE(union type0_task_context, p_hwfn) + +/* Alignment is inherent to the type1_task_context structure */ +#define TYPE1_TASK_CXT_SIZE(p_hwfn) sizeof(union type1_task_context) + +/* PF per protocl configuration object */ +#define TASK_SEGMENTS (NUM_TASK_PF_SEGMENTS + NUM_TASK_VF_SEGMENTS) +#define TASK_SEGMENT_VF (NUM_TASK_PF_SEGMENTS) + +struct qed_tid_seg { + u32 count; + u8 type; + bool has_fl_mem; +}; + +struct qed_conn_type_cfg { + u32 cid_count; + u32 cids_per_vf; + struct qed_tid_seg tid_seg[TASK_SEGMENTS]; +}; + +/* ILT Client configuration, Per connection type (protocol) resources. 
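ILT_PAGE_IN_BYTES() above turns the hardware page-size code into bytes as 1 << (code + 12), i.e. the code counts doublings of 4 KB; with ILT_DEFAULT_HW_P_SIZE of 4 that gives a 64 KB ILT page, and the number of contexts per page is simply the page size divided by the aligned context size. A quick numeric check (userspace C; the context size used here is hypothetical; not part of the patch):

#include <stdio.h>

/* mirrors ILT_PAGE_IN_BYTES(hw_p_size) = 1U << ((hw_p_size) + 12) */
static unsigned int ilt_page_bytes(unsigned int hw_p_size_code)
{
	return 1U << (hw_p_size_code + 12);
}

int main(void)
{
	unsigned int page = ilt_page_bytes(4);   /* default code 4          */
	unsigned int cxt_size = 320;             /* hypothetical context size */

	printf("page=%u bytes, %u contexts per ILT page\n",
	       page, page / cxt_size);
	/* prints: page=65536 bytes, 204 contexts per ILT page */
	return 0;
}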
*/ +#define ILT_CLI_PF_BLOCKS (1 + NUM_TASK_PF_SEGMENTS * 2) +#define ILT_CLI_VF_BLOCKS (1 + NUM_TASK_VF_SEGMENTS * 2) +#define CDUC_BLK (0) +#define SRQ_BLK (0) +#define CDUT_SEG_BLK(n) (1 + (u8)(n)) +#define CDUT_FL_SEG_BLK(n, X) (1 + (n) + NUM_TASK_ ## X ## _SEGMENTS) + +enum ilt_clients { + ILT_CLI_CDUC, + ILT_CLI_CDUT, + ILT_CLI_QM, + ILT_CLI_TM, + ILT_CLI_SRC, + ILT_CLI_TSDM, + ILT_CLI_MAX +}; + +struct ilt_cfg_pair { + u32 reg; + u32 val; +}; + +struct qed_ilt_cli_blk { + u32 total_size; /* 0 means not active */ + u32 real_size_in_page; + u32 start_line; + u32 dynamic_line_cnt; +}; + +struct qed_ilt_client_cfg { + bool active; + + /* ILT boundaries */ + struct ilt_cfg_pair first; + struct ilt_cfg_pair last; + struct ilt_cfg_pair p_size; + + /* ILT client blocks for PF */ + struct qed_ilt_cli_blk pf_blks[ILT_CLI_PF_BLOCKS]; + u32 pf_total_lines; + + /* ILT client blocks for VFs */ + struct qed_ilt_cli_blk vf_blks[ILT_CLI_VF_BLOCKS]; + u32 vf_total_lines; +}; + +/* Per Path - + * ILT shadow table + * Protocol acquired CID lists + * PF start line in ILT + */ +struct qed_dma_mem { + dma_addr_t p_phys; + void *p_virt; + size_t size; +}; + +struct qed_cid_acquired_map { + u32 start_cid; + u32 max_count; + unsigned long *cid_map; +}; + +struct qed_cxt_mngr { + /* Per protocl configuration */ + struct qed_conn_type_cfg conn_cfg[MAX_CONN_TYPES]; + + /* computed ILT structure */ + struct qed_ilt_client_cfg clients[ILT_CLI_MAX]; + + /* Task type sizes */ + u32 task_type_size[NUM_TASK_TYPES]; + + /* total number of VFs for this hwfn - + * ALL VFs are symmetric in terms of HW resources + */ + u32 vf_count; + + /* Acquired CIDs */ + struct qed_cid_acquired_map acquired[MAX_CONN_TYPES]; + + struct qed_cid_acquired_map + acquired_vf[MAX_CONN_TYPES][MAX_NUM_VFS]; + + /* ILT shadow table */ + struct qed_dma_mem *ilt_shadow; + u32 pf_start_line; + + /* Mutex for a dynamic ILT allocation */ + struct mutex mutex; + + /* SRC T2 */ + struct qed_dma_mem *t2; + u32 t2_num_pages; + u64 first_free; + u64 last_free; + + /* total number of SRQ's for this hwfn */ + u32 srq_count; + + /* Maximal number of L2 steering filters */ + u32 arfs_count; +}; +static bool src_proto(enum protocol_type type) +{ + return type == PROTOCOLID_ISCSI || + type == PROTOCOLID_FCOE || + type == PROTOCOLID_IWARP; +} + +static bool tm_cid_proto(enum protocol_type type) +{ + return type == PROTOCOLID_ISCSI || + type == PROTOCOLID_FCOE || + type == PROTOCOLID_ROCE || + type == PROTOCOLID_IWARP; +} + +static bool tm_tid_proto(enum protocol_type type) +{ + return type == PROTOCOLID_FCOE; +} + +/* counts the iids for the CDU/CDUC ILT client configuration */ +struct qed_cdu_iids { + u32 pf_cids; + u32 per_vf_cids; +}; + +static void qed_cxt_cdu_iids(struct qed_cxt_mngr *p_mngr, + struct qed_cdu_iids *iids) +{ + u32 type; + + for (type = 0; type < MAX_CONN_TYPES; type++) { + iids->pf_cids += p_mngr->conn_cfg[type].cid_count; + iids->per_vf_cids += p_mngr->conn_cfg[type].cids_per_vf; + } +} + +/* counts the iids for the Searcher block configuration */ +struct qed_src_iids { + u32 pf_cids; + u32 per_vf_cids; +}; + +static void qed_cxt_src_iids(struct qed_cxt_mngr *p_mngr, + struct qed_src_iids *iids) +{ + u32 i; + + for (i = 0; i < MAX_CONN_TYPES; i++) { + if (!src_proto(i)) + continue; + + iids->pf_cids += p_mngr->conn_cfg[i].cid_count; + iids->per_vf_cids += p_mngr->conn_cfg[i].cids_per_vf; + } + + /* Add L2 filtering filters in addition */ + iids->pf_cids += p_mngr->arfs_count; +} + +/* counts the iids for the Timers block configuration */ 
+struct qed_tm_iids { + u32 pf_cids; + u32 pf_tids[NUM_TASK_PF_SEGMENTS]; /* per segment */ + u32 pf_tids_total; + u32 per_vf_cids; + u32 per_vf_tids; +}; + +static void qed_cxt_tm_iids(struct qed_hwfn *p_hwfn, + struct qed_cxt_mngr *p_mngr, + struct qed_tm_iids *iids) +{ + bool tm_vf_required = false; + bool tm_required = false; + int i, j; + + /* Timers is a special case -> we don't count how many cids require + * timers but what's the max cid that will be used by the timer block. + * therefore we traverse in reverse order, and once we hit a protocol + * that requires the timers memory, we'll sum all the protocols up + * to that one. + */ + for (i = MAX_CONN_TYPES - 1; i >= 0; i--) { + struct qed_conn_type_cfg *p_cfg = &p_mngr->conn_cfg[i]; + + if (tm_cid_proto(i) || tm_required) { + if (p_cfg->cid_count) + tm_required = true; + + iids->pf_cids += p_cfg->cid_count; + } + + if (tm_cid_proto(i) || tm_vf_required) { + if (p_cfg->cids_per_vf) + tm_vf_required = true; + + iids->per_vf_cids += p_cfg->cids_per_vf; + } + + if (tm_tid_proto(i)) { + struct qed_tid_seg *segs = p_cfg->tid_seg; + + /* for each segment there is at most one + * protocol for which count is not 0. + */ + for (j = 0; j < NUM_TASK_PF_SEGMENTS; j++) + iids->pf_tids[j] += segs[j].count; + + /* The last array elelment is for the VFs. As for PF + * segments there can be only one protocol for + * which this value is not 0. + */ + iids->per_vf_tids += segs[NUM_TASK_PF_SEGMENTS].count; + } + } + + iids->pf_cids = roundup(iids->pf_cids, TM_ALIGN); + iids->per_vf_cids = roundup(iids->per_vf_cids, TM_ALIGN); + iids->per_vf_tids = roundup(iids->per_vf_tids, TM_ALIGN); + + for (iids->pf_tids_total = 0, j = 0; j < NUM_TASK_PF_SEGMENTS; j++) { + iids->pf_tids[j] = roundup(iids->pf_tids[j], TM_ALIGN); + iids->pf_tids_total += iids->pf_tids[j]; + } +} + +static void qed_cxt_qm_iids(struct qed_hwfn *p_hwfn, + struct qed_qm_iids *iids) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + struct qed_tid_seg *segs; + u32 vf_cids = 0, type, j; + u32 vf_tids = 0; + + for (type = 0; type < MAX_CONN_TYPES; type++) { + iids->cids += p_mngr->conn_cfg[type].cid_count; + vf_cids += p_mngr->conn_cfg[type].cids_per_vf; + + segs = p_mngr->conn_cfg[type].tid_seg; + /* for each segment there is at most one + * protocol for which count is not 0. + */ + for (j = 0; j < NUM_TASK_PF_SEGMENTS; j++) + iids->tids += segs[j].count; + + /* The last array elelment is for the VFs. As for PF + * segments there can be only one protocol for + * which this value is not 0. + */ + vf_tids += segs[NUM_TASK_PF_SEGMENTS].count; + } + + iids->vf_cids += vf_cids * p_mngr->vf_count; + iids->tids += vf_tids * p_mngr->vf_count; + + DP_VERBOSE(p_hwfn, QED_MSG_ILT, + "iids: CIDS %08x vf_cids %08x tids %08x vf_tids %08x\n", + iids->cids, iids->vf_cids, iids->tids, vf_tids); +} + +static struct qed_tid_seg *qed_cxt_tid_seg_info(struct qed_hwfn *p_hwfn, + u32 seg) +{ + struct qed_cxt_mngr *p_cfg = p_hwfn->p_cxt_mngr; + u32 i; + + /* Find the protocol with tid count > 0 for this segment. + * Note: there can only be one and this is already validated. 
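+ * (The counts are filled in by qed_cxt_set_proto_tid_count() based on the
+ * PF personality, so only one protocol populates any given segment.)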
+ */ + for (i = 0; i < MAX_CONN_TYPES; i++) + if (p_cfg->conn_cfg[i].tid_seg[seg].count) + return &p_cfg->conn_cfg[i].tid_seg[seg]; + return NULL; +} + +static void qed_cxt_set_srq_count(struct qed_hwfn *p_hwfn, u32 num_srqs) +{ + struct qed_cxt_mngr *p_mgr = p_hwfn->p_cxt_mngr; + + p_mgr->srq_count = num_srqs; +} + +static u32 qed_cxt_get_srq_count(struct qed_hwfn *p_hwfn) +{ + struct qed_cxt_mngr *p_mgr = p_hwfn->p_cxt_mngr; + + return p_mgr->srq_count; +} + +/* set the iids count per protocol */ +static void qed_cxt_set_proto_cid_count(struct qed_hwfn *p_hwfn, + enum protocol_type type, + u32 cid_count, u32 vf_cid_cnt) +{ + struct qed_cxt_mngr *p_mgr = p_hwfn->p_cxt_mngr; + struct qed_conn_type_cfg *p_conn = &p_mgr->conn_cfg[type]; + + p_conn->cid_count = roundup(cid_count, DQ_RANGE_ALIGN); + p_conn->cids_per_vf = roundup(vf_cid_cnt, DQ_RANGE_ALIGN); + + if (type == PROTOCOLID_ROCE) { + u32 page_sz = p_mgr->clients[ILT_CLI_CDUC].p_size.val; + u32 cxt_size = CONN_CXT_SIZE(p_hwfn); + u32 elems_per_page = ILT_PAGE_IN_BYTES(page_sz) / cxt_size; + u32 align = elems_per_page * DQ_RANGE_ALIGN; + + p_conn->cid_count = roundup(p_conn->cid_count, align); + } +} + +u32 qed_cxt_get_proto_cid_count(struct qed_hwfn *p_hwfn, + enum protocol_type type, u32 *vf_cid) +{ + if (vf_cid) + *vf_cid = p_hwfn->p_cxt_mngr->conn_cfg[type].cids_per_vf; + + return p_hwfn->p_cxt_mngr->conn_cfg[type].cid_count; +} + +u32 qed_cxt_get_proto_cid_start(struct qed_hwfn *p_hwfn, + enum protocol_type type) +{ + return p_hwfn->p_cxt_mngr->acquired[type].start_cid; +} + +u32 qed_cxt_get_proto_tid_count(struct qed_hwfn *p_hwfn, + enum protocol_type type) +{ + u32 cnt = 0; + int i; + + for (i = 0; i < TASK_SEGMENTS; i++) + cnt += p_hwfn->p_cxt_mngr->conn_cfg[type].tid_seg[i].count; + + return cnt; +} + +static void qed_cxt_set_proto_tid_count(struct qed_hwfn *p_hwfn, + enum protocol_type proto, + u8 seg, + u8 seg_type, u32 count, bool has_fl) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + struct qed_tid_seg *p_seg = &p_mngr->conn_cfg[proto].tid_seg[seg]; + + p_seg->count = count; + p_seg->has_fl_mem = has_fl; + p_seg->type = seg_type; +} + +static void qed_ilt_cli_blk_fill(struct qed_ilt_client_cfg *p_cli, + struct qed_ilt_cli_blk *p_blk, + u32 start_line, u32 total_size, u32 elem_size) +{ + u32 ilt_size = ILT_PAGE_IN_BYTES(p_cli->p_size.val); + + /* verify thatits called only once for each block */ + if (p_blk->total_size) + return; + + p_blk->total_size = total_size; + p_blk->real_size_in_page = 0; + if (elem_size) + p_blk->real_size_in_page = (ilt_size / elem_size) * elem_size; + p_blk->start_line = start_line; +} + +static void qed_ilt_cli_adv_line(struct qed_hwfn *p_hwfn, + struct qed_ilt_client_cfg *p_cli, + struct qed_ilt_cli_blk *p_blk, + u32 *p_line, enum ilt_clients client_id) +{ + if (!p_blk->total_size) + return; + + if (!p_cli->active) + p_cli->first.val = *p_line; + + p_cli->active = true; + *p_line += DIV_ROUND_UP(p_blk->total_size, p_blk->real_size_in_page); + p_cli->last.val = *p_line - 1; + + DP_VERBOSE(p_hwfn, QED_MSG_ILT, + "ILT[Client %d] - Lines: [%08x - %08x]. 
Block - Size %08x [Real %08x] Start line %d\n", + client_id, p_cli->first.val, + p_cli->last.val, p_blk->total_size, + p_blk->real_size_in_page, p_blk->start_line); +} + +static u32 qed_ilt_get_dynamic_line_cnt(struct qed_hwfn *p_hwfn, + enum ilt_clients ilt_client) +{ + u32 cid_count = p_hwfn->p_cxt_mngr->conn_cfg[PROTOCOLID_ROCE].cid_count; + struct qed_ilt_client_cfg *p_cli; + u32 lines_to_skip = 0; + u32 cxts_per_p; + + if (ilt_client == ILT_CLI_CDUC) { + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUC]; + + cxts_per_p = ILT_PAGE_IN_BYTES(p_cli->p_size.val) / + (u32) CONN_CXT_SIZE(p_hwfn); + + lines_to_skip = cid_count / cxts_per_p; + } + + return lines_to_skip; +} + +static struct qed_ilt_client_cfg *qed_cxt_set_cli(struct qed_ilt_client_cfg + *p_cli) +{ + p_cli->active = false; + p_cli->first.val = 0; + p_cli->last.val = 0; + return p_cli; +} + +static struct qed_ilt_cli_blk *qed_cxt_set_blk(struct qed_ilt_cli_blk *p_blk) +{ + p_blk->total_size = 0; + return p_blk; +} + +int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn, u32 *line_count) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 curr_line, total, i, task_size, line; + struct qed_ilt_client_cfg *p_cli; + struct qed_ilt_cli_blk *p_blk; + struct qed_cdu_iids cdu_iids; + struct qed_src_iids src_iids; + struct qed_qm_iids qm_iids; + struct qed_tm_iids tm_iids; + struct qed_tid_seg *p_seg; + + memset(&qm_iids, 0, sizeof(qm_iids)); + memset(&cdu_iids, 0, sizeof(cdu_iids)); + memset(&src_iids, 0, sizeof(src_iids)); + memset(&tm_iids, 0, sizeof(tm_iids)); + + p_mngr->pf_start_line = RESC_START(p_hwfn, QED_ILT); + + DP_VERBOSE(p_hwfn, QED_MSG_ILT, + "hwfn [%d] - Set context manager starting line to be 0x%08x\n", + p_hwfn->my_id, p_hwfn->p_cxt_mngr->pf_start_line); + + /* CDUC */ + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_CDUC]); + + curr_line = p_mngr->pf_start_line; + + /* CDUC PF */ + p_cli->pf_total_lines = 0; + + /* get the counters for the CDUC and QM clients */ + qed_cxt_cdu_iids(p_mngr, &cdu_iids); + + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[CDUC_BLK]); + + total = cdu_iids.pf_cids * CONN_CXT_SIZE(p_hwfn); + + qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, + total, CONN_CXT_SIZE(p_hwfn)); + + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, ILT_CLI_CDUC); + p_cli->pf_total_lines = curr_line - p_blk->start_line; + + p_blk->dynamic_line_cnt = qed_ilt_get_dynamic_line_cnt(p_hwfn, + ILT_CLI_CDUC); + + /* CDUC VF */ + p_blk = qed_cxt_set_blk(&p_cli->vf_blks[CDUC_BLK]); + total = cdu_iids.per_vf_cids * CONN_CXT_SIZE(p_hwfn); + + qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, + total, CONN_CXT_SIZE(p_hwfn)); + + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, ILT_CLI_CDUC); + p_cli->vf_total_lines = curr_line - p_blk->start_line; + + for (i = 1; i < p_mngr->vf_count; i++) + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_CDUC); + + /* CDUT PF */ + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_CDUT]); + p_cli->first.val = curr_line; + + /* first the 'working' task memory */ + for (i = 0; i < NUM_TASK_PF_SEGMENTS; i++) { + p_seg = qed_cxt_tid_seg_info(p_hwfn, i); + if (!p_seg || p_seg->count == 0) + continue; + + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[CDUT_SEG_BLK(i)]); + total = p_seg->count * p_mngr->task_type_size[p_seg->type]; + qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, total, + p_mngr->task_type_size[p_seg->type]); + + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_CDUT); + } + + /* next the 'init' task memory (forced load memory) */ + for (i = 0; i < 
NUM_TASK_PF_SEGMENTS; i++) { + p_seg = qed_cxt_tid_seg_info(p_hwfn, i); + if (!p_seg || p_seg->count == 0) + continue; + + p_blk = + qed_cxt_set_blk(&p_cli->pf_blks[CDUT_FL_SEG_BLK(i, PF)]); + + if (!p_seg->has_fl_mem) { + /* The segment is active (total size pf 'working' + * memory is > 0) but has no FL (forced-load, Init) + * memory. Thus: + * + * 1. The total-size in the corrsponding FL block of + * the ILT client is set to 0 - No ILT line are + * provisioned and no ILT memory allocated. + * + * 2. The start-line of said block is set to the + * start line of the matching working memory + * block in the ILT client. This is later used to + * configure the CDU segment offset registers and + * results in an FL command for TIDs of this + * segement behaves as regular load commands + * (loading TIDs from the working memory). + */ + line = p_cli->pf_blks[CDUT_SEG_BLK(i)].start_line; + + qed_ilt_cli_blk_fill(p_cli, p_blk, line, 0, 0); + continue; + } + total = p_seg->count * p_mngr->task_type_size[p_seg->type]; + + qed_ilt_cli_blk_fill(p_cli, p_blk, + curr_line, total, + p_mngr->task_type_size[p_seg->type]); + + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_CDUT); + } + p_cli->pf_total_lines = curr_line - p_cli->pf_blks[0].start_line; + + /* CDUT VF */ + p_seg = qed_cxt_tid_seg_info(p_hwfn, TASK_SEGMENT_VF); + if (p_seg && p_seg->count) { + /* Stricly speaking we need to iterate over all VF + * task segment types, but a VF has only 1 segment + */ + + /* 'working' memory */ + total = p_seg->count * p_mngr->task_type_size[p_seg->type]; + + p_blk = qed_cxt_set_blk(&p_cli->vf_blks[CDUT_SEG_BLK(0)]); + qed_ilt_cli_blk_fill(p_cli, p_blk, + curr_line, total, + p_mngr->task_type_size[p_seg->type]); + + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_CDUT); + + /* 'init' memory */ + p_blk = + qed_cxt_set_blk(&p_cli->vf_blks[CDUT_FL_SEG_BLK(0, VF)]); + if (!p_seg->has_fl_mem) { + /* see comment above */ + line = p_cli->vf_blks[CDUT_SEG_BLK(0)].start_line; + qed_ilt_cli_blk_fill(p_cli, p_blk, line, 0, 0); + } else { + task_size = p_mngr->task_type_size[p_seg->type]; + qed_ilt_cli_blk_fill(p_cli, p_blk, + curr_line, total, task_size); + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_CDUT); + } + p_cli->vf_total_lines = curr_line - + p_cli->vf_blks[0].start_line; + + /* Now for the rest of the VFs */ + for (i = 1; i < p_mngr->vf_count; i++) { + p_blk = &p_cli->vf_blks[CDUT_SEG_BLK(0)]; + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_CDUT); + + p_blk = &p_cli->vf_blks[CDUT_FL_SEG_BLK(0, VF)]; + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_CDUT); + } + } + + /* QM */ + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_QM]); + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[0]); + + qed_cxt_qm_iids(p_hwfn, &qm_iids); + total = qed_qm_pf_mem_size(qm_iids.cids, + qm_iids.vf_cids, qm_iids.tids, + p_hwfn->qm_info.num_pqs, + p_hwfn->qm_info.num_vf_pqs); + + DP_VERBOSE(p_hwfn, + QED_MSG_ILT, + "QM ILT Info, (cids=%d, vf_cids=%d, tids=%d, num_pqs=%d, num_vf_pqs=%d, memory_size=%d)\n", + qm_iids.cids, + qm_iids.vf_cids, + qm_iids.tids, + p_hwfn->qm_info.num_pqs, p_hwfn->qm_info.num_vf_pqs, total); + + qed_ilt_cli_blk_fill(p_cli, p_blk, + curr_line, total * 0x1000, + QM_PQ_ELEMENT_SIZE); + + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, ILT_CLI_QM); + p_cli->pf_total_lines = curr_line - p_blk->start_line; + + /* SRC */ + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_SRC]); + qed_cxt_src_iids(p_mngr, &src_iids); + + /* Both 
the PF and VFs searcher connections are stored in the per PF + * database. Thus sum the PF searcher cids and all the VFs searcher + * cids. + */ + total = src_iids.pf_cids + src_iids.per_vf_cids * p_mngr->vf_count; + if (total) { + u32 local_max = max_t(u32, total, + SRC_MIN_NUM_ELEMS); + + total = roundup_pow_of_two(local_max); + + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[0]); + qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, + total * sizeof(struct src_ent), + sizeof(struct src_ent)); + + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_SRC); + p_cli->pf_total_lines = curr_line - p_blk->start_line; + } + + /* TM PF */ + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_TM]); + qed_cxt_tm_iids(p_hwfn, p_mngr, &tm_iids); + total = tm_iids.pf_cids + tm_iids.pf_tids_total; + if (total) { + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[0]); + qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, + total * TM_ELEM_SIZE, TM_ELEM_SIZE); + + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_TM); + p_cli->pf_total_lines = curr_line - p_blk->start_line; + } + + /* TM VF */ + total = tm_iids.per_vf_cids + tm_iids.per_vf_tids; + if (total) { + p_blk = qed_cxt_set_blk(&p_cli->vf_blks[0]); + qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, + total * TM_ELEM_SIZE, TM_ELEM_SIZE); + + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_TM); + + p_cli->vf_total_lines = curr_line - p_blk->start_line; + for (i = 1; i < p_mngr->vf_count; i++) + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_TM); + } + + /* TSDM (SRQ CONTEXT) */ + total = qed_cxt_get_srq_count(p_hwfn); + + if (total) { + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_TSDM]); + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[SRQ_BLK]); + qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, + total * SRQ_CXT_SIZE, SRQ_CXT_SIZE); + + qed_ilt_cli_adv_line(p_hwfn, p_cli, p_blk, &curr_line, + ILT_CLI_TSDM); + p_cli->pf_total_lines = curr_line - p_blk->start_line; + } + + *line_count = curr_line - p_hwfn->p_cxt_mngr->pf_start_line; + + if (curr_line - p_hwfn->p_cxt_mngr->pf_start_line > + RESC_NUM(p_hwfn, QED_ILT)) + return -EINVAL; + + return 0; +} + +u32 qed_cxt_cfg_ilt_compute_excess(struct qed_hwfn *p_hwfn, u32 used_lines) +{ + struct qed_ilt_client_cfg *p_cli; + u32 excess_lines, available_lines; + struct qed_cxt_mngr *p_mngr; + u32 ilt_page_size, elem_size; + struct qed_tid_seg *p_seg; + int i; + + available_lines = RESC_NUM(p_hwfn, QED_ILT); + excess_lines = used_lines - available_lines; + + if (!excess_lines) + return 0; + + if (!QED_IS_RDMA_PERSONALITY(p_hwfn)) + return 0; + + p_mngr = p_hwfn->p_cxt_mngr; + p_cli = &p_mngr->clients[ILT_CLI_CDUT]; + ilt_page_size = ILT_PAGE_IN_BYTES(p_cli->p_size.val); + + for (i = 0; i < NUM_TASK_PF_SEGMENTS; i++) { + p_seg = qed_cxt_tid_seg_info(p_hwfn, i); + if (!p_seg || p_seg->count == 0) + continue; + + elem_size = p_mngr->task_type_size[p_seg->type]; + if (!elem_size) + continue; + + return (ilt_page_size / elem_size) * excess_lines; + } + + DP_NOTICE(p_hwfn, "failed computing excess ILT lines\n"); + return 0; +} + +static void qed_cxt_src_t2_free(struct qed_hwfn *p_hwfn) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 i; + + if (!p_mngr->t2) + return; + + for (i = 0; i < p_mngr->t2_num_pages; i++) + if (p_mngr->t2[i].p_virt) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + p_mngr->t2[i].size, + p_mngr->t2[i].p_virt, + p_mngr->t2[i].p_phys); + + kfree(p_mngr->t2); + p_mngr->t2 = NULL; +} + +static int qed_cxt_src_t2_alloc(struct qed_hwfn *p_hwfn) +{ + struct 
qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 conn_num, total_size, ent_per_page, psz, i; + struct qed_ilt_client_cfg *p_src; + struct qed_src_iids src_iids; + struct qed_dma_mem *p_t2; + int rc; + + memset(&src_iids, 0, sizeof(src_iids)); + + /* if the SRC ILT client is inactive - there are no connection + * requiring the searcer, leave. + */ + p_src = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_SRC]; + if (!p_src->active) + return 0; + + qed_cxt_src_iids(p_mngr, &src_iids); + conn_num = src_iids.pf_cids + src_iids.per_vf_cids * p_mngr->vf_count; + total_size = conn_num * sizeof(struct src_ent); + + /* use the same page size as the SRC ILT client */ + psz = ILT_PAGE_IN_BYTES(p_src->p_size.val); + p_mngr->t2_num_pages = DIV_ROUND_UP(total_size, psz); + + /* allocate t2 */ + p_mngr->t2 = kcalloc(p_mngr->t2_num_pages, sizeof(struct qed_dma_mem), + GFP_KERNEL); + if (!p_mngr->t2) { + rc = -ENOMEM; + goto t2_fail; + } + + /* allocate t2 pages */ + for (i = 0; i < p_mngr->t2_num_pages; i++) { + u32 size = min_t(u32, total_size, psz); + void **p_virt = &p_mngr->t2[i].p_virt; + + *p_virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + size, + &p_mngr->t2[i].p_phys, GFP_KERNEL); + if (!p_mngr->t2[i].p_virt) { + rc = -ENOMEM; + goto t2_fail; + } + memset(*p_virt, 0, size); + p_mngr->t2[i].size = size; + total_size -= size; + } + + /* Set the t2 pointers */ + + /* entries per page - must be a power of two */ + ent_per_page = psz / sizeof(struct src_ent); + + p_mngr->first_free = (u64) p_mngr->t2[0].p_phys; + + p_t2 = &p_mngr->t2[(conn_num - 1) / ent_per_page]; + p_mngr->last_free = (u64) p_t2->p_phys + + ((conn_num - 1) & (ent_per_page - 1)) * sizeof(struct src_ent); + + for (i = 0; i < p_mngr->t2_num_pages; i++) { + u32 ent_num = min_t(u32, + ent_per_page, + conn_num); + struct src_ent *entries = p_mngr->t2[i].p_virt; + u64 p_ent_phys = (u64) p_mngr->t2[i].p_phys, val; + u32 j; + + for (j = 0; j < ent_num - 1; j++) { + val = p_ent_phys + (j + 1) * sizeof(struct src_ent); + entries[j].next = cpu_to_be64(val); + } + + if (i < p_mngr->t2_num_pages - 1) + val = (u64) p_mngr->t2[i + 1].p_phys; + else + val = 0; + entries[j].next = cpu_to_be64(val); + + conn_num -= ent_num; + } + + return 0; + +t2_fail: + qed_cxt_src_t2_free(p_hwfn); + return rc; +} + +#define for_each_ilt_valid_client(pos, clients) \ + for (pos = 0; pos < ILT_CLI_MAX; pos++) \ + if (!clients[pos].active) { \ + continue; \ + } else \ + +/* Total number of ILT lines used by this PF */ +static u32 qed_cxt_ilt_shadow_size(struct qed_ilt_client_cfg *ilt_clients) +{ + u32 size = 0; + u32 i; + + for_each_ilt_valid_client(i, ilt_clients) + size += (ilt_clients[i].last.val - ilt_clients[i].first.val + 1); + + return size; +} + +static void qed_ilt_shadow_free(struct qed_hwfn *p_hwfn) +{ + struct qed_ilt_client_cfg *p_cli = p_hwfn->p_cxt_mngr->clients; + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 ilt_size, i; + + ilt_size = qed_cxt_ilt_shadow_size(p_cli); + + for (i = 0; p_mngr->ilt_shadow && i < ilt_size; i++) { + struct qed_dma_mem *p_dma = &p_mngr->ilt_shadow[i]; + + if (p_dma->p_virt) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + p_dma->size, p_dma->p_virt, + p_dma->p_phys); + p_dma->p_virt = NULL; + } + kfree(p_mngr->ilt_shadow); +} + +static int qed_ilt_blk_alloc(struct qed_hwfn *p_hwfn, + struct qed_ilt_cli_blk *p_blk, + enum ilt_clients ilt_client, + u32 start_line_offset) +{ + struct qed_dma_mem *ilt_shadow = p_hwfn->p_cxt_mngr->ilt_shadow; + u32 lines, line, sz_left, lines_to_skip = 0; + + /* Special handling for RoCE that 
supports dynamic allocation */ + if (QED_IS_RDMA_PERSONALITY(p_hwfn) && + ((ilt_client == ILT_CLI_CDUT) || ilt_client == ILT_CLI_TSDM)) + return 0; + + lines_to_skip = p_blk->dynamic_line_cnt; + + if (!p_blk->total_size) + return 0; + + sz_left = p_blk->total_size; + lines = DIV_ROUND_UP(sz_left, p_blk->real_size_in_page) - lines_to_skip; + line = p_blk->start_line + start_line_offset - + p_hwfn->p_cxt_mngr->pf_start_line + lines_to_skip; + + for (; lines; lines--) { + dma_addr_t p_phys; + void *p_virt; + u32 size; + + size = min_t(u32, sz_left, p_blk->real_size_in_page); + p_virt = dma_zalloc_coherent(&p_hwfn->cdev->pdev->dev, size, + &p_phys, GFP_KERNEL); + if (!p_virt) + return -ENOMEM; + + ilt_shadow[line].p_phys = p_phys; + ilt_shadow[line].p_virt = p_virt; + ilt_shadow[line].size = size; + + DP_VERBOSE(p_hwfn, QED_MSG_ILT, + "ILT shadow: Line [%d] Physical 0x%llx Virtual %p Size %d\n", + line, (u64)p_phys, p_virt, size); + + sz_left -= size; + line++; + } + + return 0; +} + +static int qed_ilt_shadow_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + struct qed_ilt_client_cfg *clients = p_mngr->clients; + struct qed_ilt_cli_blk *p_blk; + u32 size, i, j, k; + int rc; + + size = qed_cxt_ilt_shadow_size(clients); + p_mngr->ilt_shadow = kcalloc(size, sizeof(struct qed_dma_mem), + GFP_KERNEL); + if (!p_mngr->ilt_shadow) { + rc = -ENOMEM; + goto ilt_shadow_fail; + } + + DP_VERBOSE(p_hwfn, QED_MSG_ILT, + "Allocated 0x%x bytes for ilt shadow\n", + (u32)(size * sizeof(struct qed_dma_mem))); + + for_each_ilt_valid_client(i, clients) { + for (j = 0; j < ILT_CLI_PF_BLOCKS; j++) { + p_blk = &clients[i].pf_blks[j]; + rc = qed_ilt_blk_alloc(p_hwfn, p_blk, i, 0); + if (rc) + goto ilt_shadow_fail; + } + for (k = 0; k < p_mngr->vf_count; k++) { + for (j = 0; j < ILT_CLI_VF_BLOCKS; j++) { + u32 lines = clients[i].vf_total_lines * k; + + p_blk = &clients[i].vf_blks[j]; + rc = qed_ilt_blk_alloc(p_hwfn, p_blk, i, lines); + if (rc) + goto ilt_shadow_fail; + } + } + } + + return 0; + +ilt_shadow_fail: + qed_ilt_shadow_free(p_hwfn); + return rc; +} + +static void qed_cid_map_free(struct qed_hwfn *p_hwfn) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 type, vf; + + for (type = 0; type < MAX_CONN_TYPES; type++) { + kfree(p_mngr->acquired[type].cid_map); + p_mngr->acquired[type].max_count = 0; + p_mngr->acquired[type].start_cid = 0; + + for (vf = 0; vf < MAX_NUM_VFS; vf++) { + kfree(p_mngr->acquired_vf[type][vf].cid_map); + p_mngr->acquired_vf[type][vf].max_count = 0; + p_mngr->acquired_vf[type][vf].start_cid = 0; + } + } +} + +static int +qed_cid_map_alloc_single(struct qed_hwfn *p_hwfn, + u32 type, + u32 cid_start, + u32 cid_count, struct qed_cid_acquired_map *p_map) +{ + u32 size; + + if (!cid_count) + return 0; + + size = DIV_ROUND_UP(cid_count, + sizeof(unsigned long) * BITS_PER_BYTE) * + sizeof(unsigned long); + p_map->cid_map = kzalloc(size, GFP_KERNEL); + if (!p_map->cid_map) + return -ENOMEM; + + p_map->max_count = cid_count; + p_map->start_cid = cid_start; + + DP_VERBOSE(p_hwfn, QED_MSG_CXT, + "Type %08x start: %08x count %08x\n", + type, p_map->start_cid, p_map->max_count); + + return 0; +} + +static int qed_cid_map_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 start_cid = 0, vf_start_cid = 0; + u32 type, vf; + + for (type = 0; type < MAX_CONN_TYPES; type++) { + struct qed_conn_type_cfg *p_cfg = &p_mngr->conn_cfg[type]; + struct qed_cid_acquired_map *p_map; + + /* Handle PF maps */ + p_map = 
&p_mngr->acquired[type]; + if (qed_cid_map_alloc_single(p_hwfn, type, start_cid, + p_cfg->cid_count, p_map)) + goto cid_map_fail; + + /* Handle VF maps */ + for (vf = 0; vf < MAX_NUM_VFS; vf++) { + p_map = &p_mngr->acquired_vf[type][vf]; + if (qed_cid_map_alloc_single(p_hwfn, type, + vf_start_cid, + p_cfg->cids_per_vf, p_map)) + goto cid_map_fail; + } + + start_cid += p_cfg->cid_count; + vf_start_cid += p_cfg->cids_per_vf; + } + + return 0; + +cid_map_fail: + qed_cid_map_free(p_hwfn); + return -ENOMEM; +} + +int qed_cxt_mngr_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_ilt_client_cfg *clients; + struct qed_cxt_mngr *p_mngr; + u32 i; + + p_mngr = kzalloc(sizeof(*p_mngr), GFP_KERNEL); + if (!p_mngr) + return -ENOMEM; + + /* Initialize ILT client registers */ + clients = p_mngr->clients; + clients[ILT_CLI_CDUC].first.reg = ILT_CFG_REG(CDUC, FIRST_ILT); + clients[ILT_CLI_CDUC].last.reg = ILT_CFG_REG(CDUC, LAST_ILT); + clients[ILT_CLI_CDUC].p_size.reg = ILT_CFG_REG(CDUC, P_SIZE); + + clients[ILT_CLI_QM].first.reg = ILT_CFG_REG(QM, FIRST_ILT); + clients[ILT_CLI_QM].last.reg = ILT_CFG_REG(QM, LAST_ILT); + clients[ILT_CLI_QM].p_size.reg = ILT_CFG_REG(QM, P_SIZE); + + clients[ILT_CLI_TM].first.reg = ILT_CFG_REG(TM, FIRST_ILT); + clients[ILT_CLI_TM].last.reg = ILT_CFG_REG(TM, LAST_ILT); + clients[ILT_CLI_TM].p_size.reg = ILT_CFG_REG(TM, P_SIZE); + + clients[ILT_CLI_SRC].first.reg = ILT_CFG_REG(SRC, FIRST_ILT); + clients[ILT_CLI_SRC].last.reg = ILT_CFG_REG(SRC, LAST_ILT); + clients[ILT_CLI_SRC].p_size.reg = ILT_CFG_REG(SRC, P_SIZE); + + clients[ILT_CLI_CDUT].first.reg = ILT_CFG_REG(CDUT, FIRST_ILT); + clients[ILT_CLI_CDUT].last.reg = ILT_CFG_REG(CDUT, LAST_ILT); + clients[ILT_CLI_CDUT].p_size.reg = ILT_CFG_REG(CDUT, P_SIZE); + + clients[ILT_CLI_TSDM].first.reg = ILT_CFG_REG(TSDM, FIRST_ILT); + clients[ILT_CLI_TSDM].last.reg = ILT_CFG_REG(TSDM, LAST_ILT); + clients[ILT_CLI_TSDM].p_size.reg = ILT_CFG_REG(TSDM, P_SIZE); + /* default ILT page size for all clients is 64K */ + for (i = 0; i < ILT_CLI_MAX; i++) + p_mngr->clients[i].p_size.val = ILT_DEFAULT_HW_P_SIZE; + + /* Initialize task sizes */ + p_mngr->task_type_size[0] = TYPE0_TASK_CXT_SIZE(p_hwfn); + p_mngr->task_type_size[1] = TYPE1_TASK_CXT_SIZE(p_hwfn); + + if (p_hwfn->cdev->p_iov_info) + p_mngr->vf_count = p_hwfn->cdev->p_iov_info->total_vfs; + /* Initialize the dynamic ILT allocation mutex */ + mutex_init(&p_mngr->mutex); + + /* Set the cxt mangr pointer priori to further allocations */ + p_hwfn->p_cxt_mngr = p_mngr; + + return 0; +} + +int qed_cxt_tables_alloc(struct qed_hwfn *p_hwfn) +{ + int rc; + + /* Allocate the ILT shadow table */ + rc = qed_ilt_shadow_alloc(p_hwfn); + if (rc) + goto tables_alloc_fail; + + /* Allocate the T2 table */ + rc = qed_cxt_src_t2_alloc(p_hwfn); + if (rc) + goto tables_alloc_fail; + + /* Allocate and initialize the acquired cids bitmaps */ + rc = qed_cid_map_alloc(p_hwfn); + if (rc) + goto tables_alloc_fail; + + return 0; + +tables_alloc_fail: + qed_cxt_mngr_free(p_hwfn); + return rc; +} + +void qed_cxt_mngr_free(struct qed_hwfn *p_hwfn) +{ + if (!p_hwfn->p_cxt_mngr) + return; + + qed_cid_map_free(p_hwfn); + qed_cxt_src_t2_free(p_hwfn); + qed_ilt_shadow_free(p_hwfn); + kfree(p_hwfn->p_cxt_mngr); + + p_hwfn->p_cxt_mngr = NULL; +} + +void qed_cxt_mngr_setup(struct qed_hwfn *p_hwfn) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + struct qed_cid_acquired_map *p_map; + struct qed_conn_type_cfg *p_cfg; + int type; + u32 len; + + /* Reset acquired cids */ + for (type = 0; type < MAX_CONN_TYPES; type++) { + 
u32 vf; + + p_cfg = &p_mngr->conn_cfg[type]; + if (p_cfg->cid_count) { + p_map = &p_mngr->acquired[type]; + len = DIV_ROUND_UP(p_map->max_count, + sizeof(unsigned long) * + BITS_PER_BYTE) * + sizeof(unsigned long); + memset(p_map->cid_map, 0, len); + } + + if (!p_cfg->cids_per_vf) + continue; + + for (vf = 0; vf < MAX_NUM_VFS; vf++) { + p_map = &p_mngr->acquired_vf[type][vf]; + len = DIV_ROUND_UP(p_map->max_count, + sizeof(unsigned long) * + BITS_PER_BYTE) * + sizeof(unsigned long); + memset(p_map->cid_map, 0, len); + } + } +} + +/* CDU Common */ +#define CDUC_CXT_SIZE_SHIFT \ + CDU_REG_CID_ADDR_PARAMS_CONTEXT_SIZE_SHIFT + +#define CDUC_CXT_SIZE_MASK \ + (CDU_REG_CID_ADDR_PARAMS_CONTEXT_SIZE >> CDUC_CXT_SIZE_SHIFT) + +#define CDUC_BLOCK_WASTE_SHIFT \ + CDU_REG_CID_ADDR_PARAMS_BLOCK_WASTE_SHIFT + +#define CDUC_BLOCK_WASTE_MASK \ + (CDU_REG_CID_ADDR_PARAMS_BLOCK_WASTE >> CDUC_BLOCK_WASTE_SHIFT) + +#define CDUC_NCIB_SHIFT \ + CDU_REG_CID_ADDR_PARAMS_NCIB_SHIFT + +#define CDUC_NCIB_MASK \ + (CDU_REG_CID_ADDR_PARAMS_NCIB >> CDUC_NCIB_SHIFT) + +#define CDUT_TYPE0_CXT_SIZE_SHIFT \ + CDU_REG_SEGMENT0_PARAMS_T0_TID_SIZE_SHIFT + +#define CDUT_TYPE0_CXT_SIZE_MASK \ + (CDU_REG_SEGMENT0_PARAMS_T0_TID_SIZE >> \ + CDUT_TYPE0_CXT_SIZE_SHIFT) + +#define CDUT_TYPE0_BLOCK_WASTE_SHIFT \ + CDU_REG_SEGMENT0_PARAMS_T0_TID_BLOCK_WASTE_SHIFT + +#define CDUT_TYPE0_BLOCK_WASTE_MASK \ + (CDU_REG_SEGMENT0_PARAMS_T0_TID_BLOCK_WASTE >> \ + CDUT_TYPE0_BLOCK_WASTE_SHIFT) + +#define CDUT_TYPE0_NCIB_SHIFT \ + CDU_REG_SEGMENT0_PARAMS_T0_NUM_TIDS_IN_BLOCK_SHIFT + +#define CDUT_TYPE0_NCIB_MASK \ + (CDU_REG_SEGMENT0_PARAMS_T0_NUM_TIDS_IN_BLOCK >> \ + CDUT_TYPE0_NCIB_SHIFT) + +#define CDUT_TYPE1_CXT_SIZE_SHIFT \ + CDU_REG_SEGMENT1_PARAMS_T1_TID_SIZE_SHIFT + +#define CDUT_TYPE1_CXT_SIZE_MASK \ + (CDU_REG_SEGMENT1_PARAMS_T1_TID_SIZE >> \ + CDUT_TYPE1_CXT_SIZE_SHIFT) + +#define CDUT_TYPE1_BLOCK_WASTE_SHIFT \ + CDU_REG_SEGMENT1_PARAMS_T1_TID_BLOCK_WASTE_SHIFT + +#define CDUT_TYPE1_BLOCK_WASTE_MASK \ + (CDU_REG_SEGMENT1_PARAMS_T1_TID_BLOCK_WASTE >> \ + CDUT_TYPE1_BLOCK_WASTE_SHIFT) + +#define CDUT_TYPE1_NCIB_SHIFT \ + CDU_REG_SEGMENT1_PARAMS_T1_NUM_TIDS_IN_BLOCK_SHIFT + +#define CDUT_TYPE1_NCIB_MASK \ + (CDU_REG_SEGMENT1_PARAMS_T1_NUM_TIDS_IN_BLOCK >> \ + CDUT_TYPE1_NCIB_SHIFT) + +static void qed_cdu_init_common(struct qed_hwfn *p_hwfn) +{ + u32 page_sz, elems_per_page, block_waste, cxt_size, cdu_params = 0; + + /* CDUC - connection configuration */ + page_sz = p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUC].p_size.val; + cxt_size = CONN_CXT_SIZE(p_hwfn); + elems_per_page = ILT_PAGE_IN_BYTES(page_sz) / cxt_size; + block_waste = ILT_PAGE_IN_BYTES(page_sz) - elems_per_page * cxt_size; + + SET_FIELD(cdu_params, CDUC_CXT_SIZE, cxt_size); + SET_FIELD(cdu_params, CDUC_BLOCK_WASTE, block_waste); + SET_FIELD(cdu_params, CDUC_NCIB, elems_per_page); + STORE_RT_REG(p_hwfn, CDU_REG_CID_ADDR_PARAMS_RT_OFFSET, cdu_params); + + /* CDUT - type-0 tasks configuration */ + page_sz = p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUT].p_size.val; + cxt_size = p_hwfn->p_cxt_mngr->task_type_size[0]; + elems_per_page = ILT_PAGE_IN_BYTES(page_sz) / cxt_size; + block_waste = ILT_PAGE_IN_BYTES(page_sz) - elems_per_page * cxt_size; + + /* cxt size and block-waste are multipes of 8 */ + cdu_params = 0; + SET_FIELD(cdu_params, CDUT_TYPE0_CXT_SIZE, (cxt_size >> 3)); + SET_FIELD(cdu_params, CDUT_TYPE0_BLOCK_WASTE, (block_waste >> 3)); + SET_FIELD(cdu_params, CDUT_TYPE0_NCIB, elems_per_page); + STORE_RT_REG(p_hwfn, CDU_REG_SEGMENT0_PARAMS_RT_OFFSET, cdu_params); + + /* CDUT - 
type-1 tasks configuration */ + cxt_size = p_hwfn->p_cxt_mngr->task_type_size[1]; + elems_per_page = ILT_PAGE_IN_BYTES(page_sz) / cxt_size; + block_waste = ILT_PAGE_IN_BYTES(page_sz) - elems_per_page * cxt_size; + + /* cxt size and block-waste are multipes of 8 */ + cdu_params = 0; + SET_FIELD(cdu_params, CDUT_TYPE1_CXT_SIZE, (cxt_size >> 3)); + SET_FIELD(cdu_params, CDUT_TYPE1_BLOCK_WASTE, (block_waste >> 3)); + SET_FIELD(cdu_params, CDUT_TYPE1_NCIB, elems_per_page); + STORE_RT_REG(p_hwfn, CDU_REG_SEGMENT1_PARAMS_RT_OFFSET, cdu_params); +} + +/* CDU PF */ +#define CDU_SEG_REG_TYPE_SHIFT CDU_SEG_TYPE_OFFSET_REG_TYPE_SHIFT +#define CDU_SEG_REG_TYPE_MASK 0x1 +#define CDU_SEG_REG_OFFSET_SHIFT 0 +#define CDU_SEG_REG_OFFSET_MASK CDU_SEG_TYPE_OFFSET_REG_OFFSET_MASK + +static void qed_cdu_init_pf(struct qed_hwfn *p_hwfn) +{ + struct qed_ilt_client_cfg *p_cli; + struct qed_tid_seg *p_seg; + u32 cdu_seg_params, offset; + int i; + + static const u32 rt_type_offset_arr[] = { + CDU_REG_PF_SEG0_TYPE_OFFSET_RT_OFFSET, + CDU_REG_PF_SEG1_TYPE_OFFSET_RT_OFFSET, + CDU_REG_PF_SEG2_TYPE_OFFSET_RT_OFFSET, + CDU_REG_PF_SEG3_TYPE_OFFSET_RT_OFFSET + }; + + static const u32 rt_type_offset_fl_arr[] = { + CDU_REG_PF_FL_SEG0_TYPE_OFFSET_RT_OFFSET, + CDU_REG_PF_FL_SEG1_TYPE_OFFSET_RT_OFFSET, + CDU_REG_PF_FL_SEG2_TYPE_OFFSET_RT_OFFSET, + CDU_REG_PF_FL_SEG3_TYPE_OFFSET_RT_OFFSET + }; + + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUT]; + + /* There are initializations only for CDUT during pf Phase */ + for (i = 0; i < NUM_TASK_PF_SEGMENTS; i++) { + /* Segment 0 */ + p_seg = qed_cxt_tid_seg_info(p_hwfn, i); + if (!p_seg) + continue; + + /* Note: start_line is already adjusted for the CDU + * segment register granularity, so we just need to + * divide. Adjustment is implicit as we assume ILT + * Page size is larger than 32K! 
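+ * E.g. with the default ILT page size (ILT_DEFAULT_HW_P_SIZE = 4, i.e.
+ * 64KB pages) and CDUT_SEG_ALIGNMET_IN_BYTES = 32KB, every ILT line the
+ * segment starts past p_cli->first.val adds 2 to the offset below.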
+ */ + offset = (ILT_PAGE_IN_BYTES(p_cli->p_size.val) * + (p_cli->pf_blks[CDUT_SEG_BLK(i)].start_line - + p_cli->first.val)) / CDUT_SEG_ALIGNMET_IN_BYTES; + + cdu_seg_params = 0; + SET_FIELD(cdu_seg_params, CDU_SEG_REG_TYPE, p_seg->type); + SET_FIELD(cdu_seg_params, CDU_SEG_REG_OFFSET, offset); + STORE_RT_REG(p_hwfn, rt_type_offset_arr[i], cdu_seg_params); + + offset = (ILT_PAGE_IN_BYTES(p_cli->p_size.val) * + (p_cli->pf_blks[CDUT_FL_SEG_BLK(i, PF)].start_line - + p_cli->first.val)) / CDUT_SEG_ALIGNMET_IN_BYTES; + + cdu_seg_params = 0; + SET_FIELD(cdu_seg_params, CDU_SEG_REG_TYPE, p_seg->type); + SET_FIELD(cdu_seg_params, CDU_SEG_REG_OFFSET, offset); + STORE_RT_REG(p_hwfn, rt_type_offset_fl_arr[i], cdu_seg_params); + } +} + +void qed_qm_init_pf(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool is_pf_loading) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + struct qed_qm_pf_rt_init_params params; + struct qed_mcp_link_state *p_link; + struct qed_qm_iids iids; + + memset(&iids, 0, sizeof(iids)); + qed_cxt_qm_iids(p_hwfn, &iids); + + p_link = &QED_LEADING_HWFN(p_hwfn->cdev)->mcp_info->link_output; + + memset(¶ms, 0, sizeof(params)); + params.port_id = p_hwfn->port_id; + params.pf_id = p_hwfn->rel_pf_id; + params.max_phys_tcs_per_port = qm_info->max_phys_tcs_per_port; + params.is_pf_loading = is_pf_loading; + params.num_pf_cids = iids.cids; + params.num_vf_cids = iids.vf_cids; + params.num_tids = iids.tids; + params.start_pq = qm_info->start_pq; + params.num_pf_pqs = qm_info->num_pqs - qm_info->num_vf_pqs; + params.num_vf_pqs = qm_info->num_vf_pqs; + params.start_vport = qm_info->start_vport; + params.num_vports = qm_info->num_vports; + params.pf_wfq = qm_info->pf_wfq; + params.pf_rl = qm_info->pf_rl; + params.link_speed = p_link->speed; + params.pq_params = qm_info->qm_pq_params; + params.vport_params = qm_info->qm_vport_params; + + qed_qm_pf_rt_init(p_hwfn, p_ptt, ¶ms); +} + +/* CM PF */ +void qed_cm_init_pf(struct qed_hwfn *p_hwfn) +{ + /* XCM pure-LB queue */ + STORE_RT_REG(p_hwfn, XCM_REG_CON_PHY_Q3_RT_OFFSET, + qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_LB)); +} + +/* DQ PF */ +static void qed_dq_init_pf(struct qed_hwfn *p_hwfn) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 dq_pf_max_cid = 0, dq_vf_max_cid = 0; + + dq_pf_max_cid += (p_mngr->conn_cfg[0].cid_count >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_0_RT_OFFSET, dq_pf_max_cid); + + dq_vf_max_cid += (p_mngr->conn_cfg[0].cids_per_vf >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_0_RT_OFFSET, dq_vf_max_cid); + + dq_pf_max_cid += (p_mngr->conn_cfg[1].cid_count >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_1_RT_OFFSET, dq_pf_max_cid); + + dq_vf_max_cid += (p_mngr->conn_cfg[1].cids_per_vf >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_1_RT_OFFSET, dq_vf_max_cid); + + dq_pf_max_cid += (p_mngr->conn_cfg[2].cid_count >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_2_RT_OFFSET, dq_pf_max_cid); + + dq_vf_max_cid += (p_mngr->conn_cfg[2].cids_per_vf >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_2_RT_OFFSET, dq_vf_max_cid); + + dq_pf_max_cid += (p_mngr->conn_cfg[3].cid_count >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_3_RT_OFFSET, dq_pf_max_cid); + + dq_vf_max_cid += (p_mngr->conn_cfg[3].cids_per_vf >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_3_RT_OFFSET, dq_vf_max_cid); + + dq_pf_max_cid += (p_mngr->conn_cfg[4].cid_count >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, 
DORQ_REG_PF_MAX_ICID_4_RT_OFFSET, dq_pf_max_cid); + + dq_vf_max_cid += (p_mngr->conn_cfg[4].cids_per_vf >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_4_RT_OFFSET, dq_vf_max_cid); + + dq_pf_max_cid += (p_mngr->conn_cfg[5].cid_count >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_5_RT_OFFSET, dq_pf_max_cid); + + dq_vf_max_cid += (p_mngr->conn_cfg[5].cids_per_vf >> DQ_RANGE_SHIFT); + STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_5_RT_OFFSET, dq_vf_max_cid); + + /* Connection types 6 & 7 are not in use, yet they must be configured + * as the highest possible connection. Not configuring them means the + * defaults will be used, and with a large number of cids a bug may + * occur, if the defaults will be smaller than dq_pf_max_cid / + * dq_vf_max_cid. + */ + STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_6_RT_OFFSET, dq_pf_max_cid); + STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_6_RT_OFFSET, dq_vf_max_cid); + + STORE_RT_REG(p_hwfn, DORQ_REG_PF_MAX_ICID_7_RT_OFFSET, dq_pf_max_cid); + STORE_RT_REG(p_hwfn, DORQ_REG_VF_MAX_ICID_7_RT_OFFSET, dq_vf_max_cid); +} + +static void qed_ilt_bounds_init(struct qed_hwfn *p_hwfn) +{ + struct qed_ilt_client_cfg *ilt_clients; + int i; + + ilt_clients = p_hwfn->p_cxt_mngr->clients; + for_each_ilt_valid_client(i, ilt_clients) { + STORE_RT_REG(p_hwfn, + ilt_clients[i].first.reg, + ilt_clients[i].first.val); + STORE_RT_REG(p_hwfn, + ilt_clients[i].last.reg, ilt_clients[i].last.val); + STORE_RT_REG(p_hwfn, + ilt_clients[i].p_size.reg, + ilt_clients[i].p_size.val); + } +} + +static void qed_ilt_vf_bounds_init(struct qed_hwfn *p_hwfn) +{ + struct qed_ilt_client_cfg *p_cli; + u32 blk_factor; + + /* For simplicty we set the 'block' to be an ILT page */ + if (p_hwfn->cdev->p_iov_info) { + struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info; + + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_VF_BASE_RT_OFFSET, + p_iov->first_vf_in_pf); + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_VF_LAST_ILT_RT_OFFSET, + p_iov->first_vf_in_pf + p_iov->total_vfs); + } + + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUC]; + blk_factor = ilog2(ILT_PAGE_IN_BYTES(p_cli->p_size.val) >> 10); + if (p_cli->active) { + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_CDUC_BLOCKS_FACTOR_RT_OFFSET, + blk_factor); + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_CDUC_NUMBER_OF_PF_BLOCKS_RT_OFFSET, + p_cli->pf_total_lines); + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_CDUC_VF_BLOCKS_RT_OFFSET, + p_cli->vf_total_lines); + } + + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUT]; + blk_factor = ilog2(ILT_PAGE_IN_BYTES(p_cli->p_size.val) >> 10); + if (p_cli->active) { + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_CDUT_BLOCKS_FACTOR_RT_OFFSET, + blk_factor); + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_CDUT_NUMBER_OF_PF_BLOCKS_RT_OFFSET, + p_cli->pf_total_lines); + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_CDUT_VF_BLOCKS_RT_OFFSET, + p_cli->vf_total_lines); + } + + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_TM]; + blk_factor = ilog2(ILT_PAGE_IN_BYTES(p_cli->p_size.val) >> 10); + if (p_cli->active) { + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_TM_BLOCKS_FACTOR_RT_OFFSET, blk_factor); + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_TM_NUMBER_OF_PF_BLOCKS_RT_OFFSET, + p_cli->pf_total_lines); + STORE_RT_REG(p_hwfn, + PSWRQ2_REG_TM_VF_BLOCKS_RT_OFFSET, + p_cli->vf_total_lines); + } +} + +/* ILT (PSWRQ2) PF */ +static void qed_ilt_init_pf(struct qed_hwfn *p_hwfn) +{ + struct qed_ilt_client_cfg *clients; + struct qed_cxt_mngr *p_mngr; + struct qed_dma_mem *p_shdw; + u32 line, rt_offst, i; + + qed_ilt_bounds_init(p_hwfn); + qed_ilt_vf_bounds_init(p_hwfn); + + p_mngr = p_hwfn->p_cxt_mngr; 
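+
+ /* Each ILT line is programmed as a single 64-bit entry spread over two
+  * RT registers (ILT_ENTRY_IN_REGS): the physical address >> 12 in the
+  * low field and the valid bit at position 52 (see ILT_ENTRY_* above).
+  */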
+ p_shdw = p_mngr->ilt_shadow; + clients = p_hwfn->p_cxt_mngr->clients; + + for_each_ilt_valid_client(i, clients) { + /** Client's 1st val and RT array are absolute, ILT shadows' + * lines are relative. + */ + line = clients[i].first.val - p_mngr->pf_start_line; + rt_offst = PSWRQ2_REG_ILT_MEMORY_RT_OFFSET + + clients[i].first.val * ILT_ENTRY_IN_REGS; + + for (; line <= clients[i].last.val - p_mngr->pf_start_line; + line++, rt_offst += ILT_ENTRY_IN_REGS) { + u64 ilt_hw_entry = 0; + + /** p_virt could be NULL incase of dynamic + * allocation + */ + if (p_shdw[line].p_virt) { + SET_FIELD(ilt_hw_entry, ILT_ENTRY_VALID, 1ULL); + SET_FIELD(ilt_hw_entry, ILT_ENTRY_PHY_ADDR, + (p_shdw[line].p_phys >> 12)); + + DP_VERBOSE(p_hwfn, QED_MSG_ILT, + "Setting RT[0x%08x] from ILT[0x%08x] [Client is %d] to Physical addr: 0x%llx\n", + rt_offst, line, i, + (u64)(p_shdw[line].p_phys >> 12)); + } + + STORE_RT_REG_AGG(p_hwfn, rt_offst, ilt_hw_entry); + } + } +} + +/* SRC (Searcher) PF */ +static void qed_src_init_pf(struct qed_hwfn *p_hwfn) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 rounded_conn_num, conn_num, conn_max; + struct qed_src_iids src_iids; + + memset(&src_iids, 0, sizeof(src_iids)); + qed_cxt_src_iids(p_mngr, &src_iids); + conn_num = src_iids.pf_cids + src_iids.per_vf_cids * p_mngr->vf_count; + if (!conn_num) + return; + + conn_max = max_t(u32, conn_num, SRC_MIN_NUM_ELEMS); + rounded_conn_num = roundup_pow_of_two(conn_max); + + STORE_RT_REG(p_hwfn, SRC_REG_COUNTFREE_RT_OFFSET, conn_num); + STORE_RT_REG(p_hwfn, SRC_REG_NUMBER_HASH_BITS_RT_OFFSET, + ilog2(rounded_conn_num)); + + STORE_RT_REG_AGG(p_hwfn, SRC_REG_FIRSTFREE_RT_OFFSET, + p_hwfn->p_cxt_mngr->first_free); + STORE_RT_REG_AGG(p_hwfn, SRC_REG_LASTFREE_RT_OFFSET, + p_hwfn->p_cxt_mngr->last_free); +} + +/* Timers PF */ +#define TM_CFG_NUM_IDS_SHIFT 0 +#define TM_CFG_NUM_IDS_MASK 0xFFFFULL +#define TM_CFG_PRE_SCAN_OFFSET_SHIFT 16 +#define TM_CFG_PRE_SCAN_OFFSET_MASK 0x1FFULL +#define TM_CFG_PARENT_PF_SHIFT 25 +#define TM_CFG_PARENT_PF_MASK 0x7ULL + +#define TM_CFG_CID_PRE_SCAN_ROWS_SHIFT 30 +#define TM_CFG_CID_PRE_SCAN_ROWS_MASK 0x1FFULL + +#define TM_CFG_TID_OFFSET_SHIFT 30 +#define TM_CFG_TID_OFFSET_MASK 0x7FFFFULL +#define TM_CFG_TID_PRE_SCAN_ROWS_SHIFT 49 +#define TM_CFG_TID_PRE_SCAN_ROWS_MASK 0x1FFULL + +static void qed_tm_init_pf(struct qed_hwfn *p_hwfn) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 active_seg_mask = 0, tm_offset, rt_reg; + struct qed_tm_iids tm_iids; + u64 cfg_word; + u8 i; + + memset(&tm_iids, 0, sizeof(tm_iids)); + qed_cxt_tm_iids(p_hwfn, p_mngr, &tm_iids); + + /* @@@TBD No pre-scan for now */ + + /* Note: We assume consecutive VFs for a PF */ + for (i = 0; i < p_mngr->vf_count; i++) { + cfg_word = 0; + SET_FIELD(cfg_word, TM_CFG_NUM_IDS, tm_iids.per_vf_cids); + SET_FIELD(cfg_word, TM_CFG_PRE_SCAN_OFFSET, 0); + SET_FIELD(cfg_word, TM_CFG_PARENT_PF, p_hwfn->rel_pf_id); + SET_FIELD(cfg_word, TM_CFG_CID_PRE_SCAN_ROWS, 0); + rt_reg = TM_REG_CONFIG_CONN_MEM_RT_OFFSET + + (sizeof(cfg_word) / sizeof(u32)) * + (p_hwfn->cdev->p_iov_info->first_vf_in_pf + i); + STORE_RT_REG_AGG(p_hwfn, rt_reg, cfg_word); + } + + cfg_word = 0; + SET_FIELD(cfg_word, TM_CFG_NUM_IDS, tm_iids.pf_cids); + SET_FIELD(cfg_word, TM_CFG_PRE_SCAN_OFFSET, 0); + SET_FIELD(cfg_word, TM_CFG_PARENT_PF, 0); /* n/a for PF */ + SET_FIELD(cfg_word, TM_CFG_CID_PRE_SCAN_ROWS, 0); /* scan all */ + + rt_reg = TM_REG_CONFIG_CONN_MEM_RT_OFFSET + + (sizeof(cfg_word) / sizeof(u32)) * + (NUM_OF_VFS(p_hwfn->cdev) + p_hwfn->rel_pf_id); + 
STORE_RT_REG_AGG(p_hwfn, rt_reg, cfg_word); + + /* enale scan */ + STORE_RT_REG(p_hwfn, TM_REG_PF_ENABLE_CONN_RT_OFFSET, + tm_iids.pf_cids ? 0x1 : 0x0); + + /* @@@TBD how to enable the scan for the VFs */ + + tm_offset = tm_iids.per_vf_cids; + + /* Note: We assume consecutive VFs for a PF */ + for (i = 0; i < p_mngr->vf_count; i++) { + cfg_word = 0; + SET_FIELD(cfg_word, TM_CFG_NUM_IDS, tm_iids.per_vf_tids); + SET_FIELD(cfg_word, TM_CFG_PRE_SCAN_OFFSET, 0); + SET_FIELD(cfg_word, TM_CFG_PARENT_PF, p_hwfn->rel_pf_id); + SET_FIELD(cfg_word, TM_CFG_TID_OFFSET, tm_offset); + SET_FIELD(cfg_word, TM_CFG_TID_PRE_SCAN_ROWS, (u64) 0); + + rt_reg = TM_REG_CONFIG_TASK_MEM_RT_OFFSET + + (sizeof(cfg_word) / sizeof(u32)) * + (p_hwfn->cdev->p_iov_info->first_vf_in_pf + i); + + STORE_RT_REG_AGG(p_hwfn, rt_reg, cfg_word); + } + + tm_offset = tm_iids.pf_cids; + for (i = 0; i < NUM_TASK_PF_SEGMENTS; i++) { + cfg_word = 0; + SET_FIELD(cfg_word, TM_CFG_NUM_IDS, tm_iids.pf_tids[i]); + SET_FIELD(cfg_word, TM_CFG_PRE_SCAN_OFFSET, 0); + SET_FIELD(cfg_word, TM_CFG_PARENT_PF, 0); + SET_FIELD(cfg_word, TM_CFG_TID_OFFSET, tm_offset); + SET_FIELD(cfg_word, TM_CFG_TID_PRE_SCAN_ROWS, (u64) 0); + + rt_reg = TM_REG_CONFIG_TASK_MEM_RT_OFFSET + + (sizeof(cfg_word) / sizeof(u32)) * + (NUM_OF_VFS(p_hwfn->cdev) + + p_hwfn->rel_pf_id * NUM_TASK_PF_SEGMENTS + i); + + STORE_RT_REG_AGG(p_hwfn, rt_reg, cfg_word); + active_seg_mask |= (tm_iids.pf_tids[i] ? BIT(i) : 0); + + tm_offset += tm_iids.pf_tids[i]; + } + + if (QED_IS_RDMA_PERSONALITY(p_hwfn)) + active_seg_mask = 0; + + STORE_RT_REG(p_hwfn, TM_REG_PF_ENABLE_TASK_RT_OFFSET, active_seg_mask); + + /* @@@TBD how to enable the scan for the VFs */ +} + +static void qed_prs_init_common(struct qed_hwfn *p_hwfn) +{ + if ((p_hwfn->hw_info.personality == QED_PCI_FCOE) && + p_hwfn->pf_params.fcoe_pf_params.is_target) + STORE_RT_REG(p_hwfn, + PRS_REG_SEARCH_RESP_INITIATOR_TYPE_RT_OFFSET, 0); +} + +static void qed_prs_init_pf(struct qed_hwfn *p_hwfn) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + struct qed_conn_type_cfg *p_fcoe; + struct qed_tid_seg *p_tid; + + p_fcoe = &p_mngr->conn_cfg[PROTOCOLID_FCOE]; + + /* If FCoE is active set the MAX OX_ID (tid) in the Parser */ + if (!p_fcoe->cid_count) + return; + + p_tid = &p_fcoe->tid_seg[QED_CXT_FCOE_TID_SEG]; + if (p_hwfn->pf_params.fcoe_pf_params.is_target) { + STORE_RT_REG_AGG(p_hwfn, + PRS_REG_TASK_ID_MAX_TARGET_PF_RT_OFFSET, + p_tid->count); + } else { + STORE_RT_REG_AGG(p_hwfn, + PRS_REG_TASK_ID_MAX_INITIATOR_PF_RT_OFFSET, + p_tid->count); + } +} + +void qed_cxt_hw_init_common(struct qed_hwfn *p_hwfn) +{ + qed_cdu_init_common(p_hwfn); + qed_prs_init_common(p_hwfn); +} + +void qed_cxt_hw_init_pf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + qed_qm_init_pf(p_hwfn, p_ptt, true); + qed_cm_init_pf(p_hwfn); + qed_dq_init_pf(p_hwfn); + qed_cdu_init_pf(p_hwfn); + qed_ilt_init_pf(p_hwfn); + qed_src_init_pf(p_hwfn); + qed_tm_init_pf(p_hwfn); + qed_prs_init_pf(p_hwfn); +} + +int _qed_cxt_acquire_cid(struct qed_hwfn *p_hwfn, + enum protocol_type type, u32 *p_cid, u8 vfid) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + struct qed_cid_acquired_map *p_map; + u32 rel_cid; + + if (type >= MAX_CONN_TYPES) { + DP_NOTICE(p_hwfn, "Invalid protocol type %d", type); + return -EINVAL; + } + + if (vfid >= MAX_NUM_VFS && vfid != QED_CXT_PF_CID) { + DP_NOTICE(p_hwfn, "VF [%02x] is out of range\n", vfid); + return -EINVAL; + } + + /* Determine the right map to take this CID from */ + if (vfid == QED_CXT_PF_CID) + p_map = 
&p_mngr->acquired[type]; + else + p_map = &p_mngr->acquired_vf[type][vfid]; + + if (!p_map->cid_map) { + DP_NOTICE(p_hwfn, "Invalid protocol type %d", type); + return -EINVAL; + } + + rel_cid = find_first_zero_bit(p_map->cid_map, p_map->max_count); + + if (rel_cid >= p_map->max_count) { + DP_NOTICE(p_hwfn, "no CID available for protocol %d\n", type); + return -EINVAL; + } + + __set_bit(rel_cid, p_map->cid_map); + + *p_cid = rel_cid + p_map->start_cid; + + DP_VERBOSE(p_hwfn, QED_MSG_CXT, + "Acquired cid 0x%08x [rel. %08x] vfid %02x type %d\n", + *p_cid, rel_cid, vfid, type); + + return 0; +} + +int qed_cxt_acquire_cid(struct qed_hwfn *p_hwfn, + enum protocol_type type, u32 *p_cid) +{ + return _qed_cxt_acquire_cid(p_hwfn, type, p_cid, QED_CXT_PF_CID); +} + +static bool qed_cxt_test_cid_acquired(struct qed_hwfn *p_hwfn, + u32 cid, + u8 vfid, + enum protocol_type *p_type, + struct qed_cid_acquired_map **pp_map) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 rel_cid; + + /* Iterate over protocols and find matching cid range */ + for (*p_type = 0; *p_type < MAX_CONN_TYPES; (*p_type)++) { + if (vfid == QED_CXT_PF_CID) + *pp_map = &p_mngr->acquired[*p_type]; + else + *pp_map = &p_mngr->acquired_vf[*p_type][vfid]; + + if (!((*pp_map)->cid_map)) + continue; + if (cid >= (*pp_map)->start_cid && + cid < (*pp_map)->start_cid + (*pp_map)->max_count) + break; + } + + if (*p_type == MAX_CONN_TYPES) { + DP_NOTICE(p_hwfn, "Invalid CID %d vfid %02x", cid, vfid); + goto fail; + } + + rel_cid = cid - (*pp_map)->start_cid; + if (!test_bit(rel_cid, (*pp_map)->cid_map)) { + DP_NOTICE(p_hwfn, "CID %d [vifd %02x] not acquired", + cid, vfid); + goto fail; + } + + return true; +fail: + *p_type = MAX_CONN_TYPES; + *pp_map = NULL; + return false; +} + +void _qed_cxt_release_cid(struct qed_hwfn *p_hwfn, u32 cid, u8 vfid) +{ + struct qed_cid_acquired_map *p_map = NULL; + enum protocol_type type; + bool b_acquired; + u32 rel_cid; + + if (vfid != QED_CXT_PF_CID && vfid > MAX_NUM_VFS) { + DP_NOTICE(p_hwfn, + "Trying to return incorrect CID belonging to VF %02x\n", + vfid); + return; + } + + /* Test acquired and find matching per-protocol map */ + b_acquired = qed_cxt_test_cid_acquired(p_hwfn, cid, vfid, + &type, &p_map); + + if (!b_acquired) + return; + + rel_cid = cid - p_map->start_cid; + clear_bit(rel_cid, p_map->cid_map); + + DP_VERBOSE(p_hwfn, QED_MSG_CXT, + "Released CID 0x%08x [rel. 
%08x] vfid %02x type %d\n", + cid, rel_cid, vfid, type); +} + +void qed_cxt_release_cid(struct qed_hwfn *p_hwfn, u32 cid) +{ + _qed_cxt_release_cid(p_hwfn, cid, QED_CXT_PF_CID); +} + +int qed_cxt_get_cid_info(struct qed_hwfn *p_hwfn, struct qed_cxt_info *p_info) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + struct qed_cid_acquired_map *p_map = NULL; + u32 conn_cxt_size, hw_p_size, cxts_per_p, line; + enum protocol_type type; + bool b_acquired; + + /* Test acquired and find matching per-protocol map */ + b_acquired = qed_cxt_test_cid_acquired(p_hwfn, p_info->iid, + QED_CXT_PF_CID, &type, &p_map); + + if (!b_acquired) + return -EINVAL; + + /* set the protocl type */ + p_info->type = type; + + /* compute context virtual pointer */ + hw_p_size = p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUC].p_size.val; + + conn_cxt_size = CONN_CXT_SIZE(p_hwfn); + cxts_per_p = ILT_PAGE_IN_BYTES(hw_p_size) / conn_cxt_size; + line = p_info->iid / cxts_per_p; + + /* Make sure context is allocated (dynamic allocation) */ + if (!p_mngr->ilt_shadow[line].p_virt) + return -EINVAL; + + p_info->p_cxt = p_mngr->ilt_shadow[line].p_virt + + p_info->iid % cxts_per_p * conn_cxt_size; + + DP_VERBOSE(p_hwfn, (QED_MSG_ILT | QED_MSG_CXT), + "Accessing ILT shadow[%d]: CXT pointer is at %p (for iid %d)\n", + p_info->iid / cxts_per_p, p_info->p_cxt, p_info->iid); + + return 0; +} + +static void qed_rdma_set_pf_params(struct qed_hwfn *p_hwfn, + struct qed_rdma_pf_params *p_params, + u32 num_tasks) +{ + u32 num_cons, num_qps, num_srqs; + enum protocol_type proto; + + num_srqs = min_t(u32, 32 * 1024, p_params->num_srqs); + + if (p_hwfn->mcp_info->func_info.protocol == QED_PCI_ETH_RDMA) { + DP_NOTICE(p_hwfn, + "Current day drivers don't support RoCE & iWARP simultaneously on the same PF. Default to RoCE-only\n"); + p_hwfn->hw_info.personality = QED_PCI_ETH_ROCE; + } + + switch (p_hwfn->hw_info.personality) { + case QED_PCI_ETH_IWARP: + /* Each QP requires one connection */ + num_cons = min_t(u32, IWARP_MAX_QPS, p_params->num_qps); + proto = PROTOCOLID_IWARP; + break; + case QED_PCI_ETH_ROCE: + num_qps = min_t(u32, ROCE_MAX_QPS, p_params->num_qps); + num_cons = num_qps * 2; /* each QP requires two connections */ + proto = PROTOCOLID_ROCE; + break; + default: + return; + } + + if (num_cons && num_tasks) { + qed_cxt_set_proto_cid_count(p_hwfn, proto, num_cons, 0); + + /* Deliberatly passing ROCE for tasks id. This is because + * iWARP / RoCE share the task id. 
+ */ + qed_cxt_set_proto_tid_count(p_hwfn, PROTOCOLID_ROCE, + QED_CXT_ROCE_TID_SEG, 1, + num_tasks, false); + qed_cxt_set_srq_count(p_hwfn, num_srqs); + } else { + DP_INFO(p_hwfn->cdev, + "RDMA personality used without setting params!\n"); + } +} + +int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn, u32 rdma_tasks) +{ + /* Set the number of required CORE connections */ + u32 core_cids = 1; /* SPQ */ + + if (p_hwfn->using_ll2) + core_cids += 4; + qed_cxt_set_proto_cid_count(p_hwfn, PROTOCOLID_CORE, core_cids, 0); + + switch (p_hwfn->hw_info.personality) { + case QED_PCI_ETH_RDMA: + case QED_PCI_ETH_IWARP: + case QED_PCI_ETH_ROCE: + { + qed_rdma_set_pf_params(p_hwfn, + &p_hwfn-> + pf_params.rdma_pf_params, + rdma_tasks); + /* no need for break since RoCE coexist with Ethernet */ + } + case QED_PCI_ETH: + { + struct qed_eth_pf_params *p_params = + &p_hwfn->pf_params.eth_pf_params; + + if (!p_params->num_vf_cons) + p_params->num_vf_cons = + ETH_PF_PARAMS_VF_CONS_DEFAULT; + qed_cxt_set_proto_cid_count(p_hwfn, PROTOCOLID_ETH, + p_params->num_cons, + p_params->num_vf_cons); + p_hwfn->p_cxt_mngr->arfs_count = p_params->num_arfs_filters; + break; + } + case QED_PCI_FCOE: + { + struct qed_fcoe_pf_params *p_params; + + p_params = &p_hwfn->pf_params.fcoe_pf_params; + + if (p_params->num_cons && p_params->num_tasks) { + qed_cxt_set_proto_cid_count(p_hwfn, + PROTOCOLID_FCOE, + p_params->num_cons, + 0); + + qed_cxt_set_proto_tid_count(p_hwfn, PROTOCOLID_FCOE, + QED_CXT_FCOE_TID_SEG, 0, + p_params->num_tasks, true); + } else { + DP_INFO(p_hwfn->cdev, + "Fcoe personality used without setting params!\n"); + } + break; + } + case QED_PCI_ISCSI: + { + struct qed_iscsi_pf_params *p_params; + + p_params = &p_hwfn->pf_params.iscsi_pf_params; + + if (p_params->num_cons && p_params->num_tasks) { + qed_cxt_set_proto_cid_count(p_hwfn, + PROTOCOLID_ISCSI, + p_params->num_cons, + 0); + + qed_cxt_set_proto_tid_count(p_hwfn, + PROTOCOLID_ISCSI, + QED_CXT_ISCSI_TID_SEG, + 0, + p_params->num_tasks, + true); + } else { + DP_INFO(p_hwfn->cdev, + "Iscsi personality used without setting params!\n"); + } + break; + } + default: + return -EINVAL; + } + + return 0; +} + +int qed_cxt_get_tid_mem_info(struct qed_hwfn *p_hwfn, + struct qed_tid_mem *p_info) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + u32 proto, seg, total_lines, i, shadow_line; + struct qed_ilt_client_cfg *p_cli; + struct qed_ilt_cli_blk *p_fl_seg; + struct qed_tid_seg *p_seg_info; + + /* Verify the personality */ + switch (p_hwfn->hw_info.personality) { + case QED_PCI_FCOE: + proto = PROTOCOLID_FCOE; + seg = QED_CXT_FCOE_TID_SEG; + break; + case QED_PCI_ISCSI: + proto = PROTOCOLID_ISCSI; + seg = QED_CXT_ISCSI_TID_SEG; + break; + default: + return -EINVAL; + } + + p_cli = &p_mngr->clients[ILT_CLI_CDUT]; + if (!p_cli->active) + return -EINVAL; + + p_seg_info = &p_mngr->conn_cfg[proto].tid_seg[seg]; + if (!p_seg_info->has_fl_mem) + return -EINVAL; + + p_fl_seg = &p_cli->pf_blks[CDUT_FL_SEG_BLK(seg, PF)]; + total_lines = DIV_ROUND_UP(p_fl_seg->total_size, + p_fl_seg->real_size_in_page); + + for (i = 0; i < total_lines; i++) { + shadow_line = i + p_fl_seg->start_line - + p_hwfn->p_cxt_mngr->pf_start_line; + p_info->blocks[i] = p_mngr->ilt_shadow[shadow_line].p_virt; + } + p_info->waste = ILT_PAGE_IN_BYTES(p_cli->p_size.val) - + p_fl_seg->real_size_in_page; + p_info->tid_size = p_mngr->task_type_size[p_seg_info->type]; + p_info->num_tids_per_block = p_fl_seg->real_size_in_page / + p_info->tid_size; + + return 0; +} + +/* This function is very RoCE oriented, 
if another protocol in the future + * will want this feature we'll need to modify the function to be more generic + */ +int +qed_cxt_dynamic_ilt_alloc(struct qed_hwfn *p_hwfn, + enum qed_cxt_elem_type elem_type, u32 iid) +{ + u32 reg_offset, shadow_line, elem_size, hw_p_size, elems_per_p, line; + struct qed_ilt_client_cfg *p_cli; + struct qed_ilt_cli_blk *p_blk; + struct qed_ptt *p_ptt; + dma_addr_t p_phys; + u64 ilt_hw_entry; + void *p_virt; + int rc = 0; + + switch (elem_type) { + case QED_ELEM_CXT: + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUC]; + elem_size = CONN_CXT_SIZE(p_hwfn); + p_blk = &p_cli->pf_blks[CDUC_BLK]; + break; + case QED_ELEM_SRQ: + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_TSDM]; + elem_size = SRQ_CXT_SIZE; + p_blk = &p_cli->pf_blks[SRQ_BLK]; + break; + case QED_ELEM_TASK: + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUT]; + elem_size = TYPE1_TASK_CXT_SIZE(p_hwfn); + p_blk = &p_cli->pf_blks[CDUT_SEG_BLK(QED_CXT_ROCE_TID_SEG)]; + break; + default: + DP_NOTICE(p_hwfn, "-EINVALID elem type = %d", elem_type); + return -EINVAL; + } + + /* Calculate line in ilt */ + hw_p_size = p_cli->p_size.val; + elems_per_p = ILT_PAGE_IN_BYTES(hw_p_size) / elem_size; + line = p_blk->start_line + (iid / elems_per_p); + shadow_line = line - p_hwfn->p_cxt_mngr->pf_start_line; + + /* If line is already allocated, do nothing, otherwise allocate it and + * write it to the PSWRQ2 registers. + * This section can be run in parallel from different contexts and thus + * a mutex protection is needed. + */ + + mutex_lock(&p_hwfn->p_cxt_mngr->mutex); + + if (p_hwfn->p_cxt_mngr->ilt_shadow[shadow_line].p_virt) + goto out0; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_NOTICE(p_hwfn, + "QED_TIME_OUT on ptt acquire - dynamic allocation"); + rc = -EBUSY; + goto out0; + } + + p_virt = dma_zalloc_coherent(&p_hwfn->cdev->pdev->dev, + p_blk->real_size_in_page, &p_phys, + GFP_KERNEL); + if (!p_virt) { + rc = -ENOMEM; + goto out1; + } + + /* configuration of refTagMask to 0xF is required for RoCE DIF MR only, + * to compensate for a HW bug, but it is configured even if DIF is not + * enabled. This is harmless and allows us to avoid a dedicated API. We + * configure the field for all of the contexts on the newly allocated + * page. 
+ */ + if (elem_type == QED_ELEM_TASK) { + u32 elem_i; + u8 *elem_start = (u8 *)p_virt; + union type1_task_context *elem; + + for (elem_i = 0; elem_i < elems_per_p; elem_i++) { + elem = (union type1_task_context *)elem_start; + SET_FIELD(elem->roce_ctx.tdif_context.flags1, + TDIF_TASK_CONTEXT_REF_TAG_MASK, 0xf); + elem_start += TYPE1_TASK_CXT_SIZE(p_hwfn); + } + } + + p_hwfn->p_cxt_mngr->ilt_shadow[shadow_line].p_virt = p_virt; + p_hwfn->p_cxt_mngr->ilt_shadow[shadow_line].p_phys = p_phys; + p_hwfn->p_cxt_mngr->ilt_shadow[shadow_line].size = + p_blk->real_size_in_page; + + /* compute absolute offset */ + reg_offset = PSWRQ2_REG_ILT_MEMORY + + (line * ILT_REG_SIZE_IN_BYTES * ILT_ENTRY_IN_REGS); + + ilt_hw_entry = 0; + SET_FIELD(ilt_hw_entry, ILT_ENTRY_VALID, 1ULL); + SET_FIELD(ilt_hw_entry, + ILT_ENTRY_PHY_ADDR, + (p_hwfn->p_cxt_mngr->ilt_shadow[shadow_line].p_phys >> 12)); + + /* Write via DMAE since the PSWRQ2_REG_ILT_MEMORY line is a wide-bus */ + qed_dmae_host2grc(p_hwfn, p_ptt, (u64) (uintptr_t)&ilt_hw_entry, + reg_offset, sizeof(ilt_hw_entry) / sizeof(u32), 0); + + if (elem_type == QED_ELEM_CXT) { + u32 last_cid_allocated = (1 + (iid / elems_per_p)) * + elems_per_p; + + /* Update the relevant register in the parser */ + qed_wr(p_hwfn, p_ptt, PRS_REG_ROCE_DEST_QP_MAX_PF, + last_cid_allocated - 1); + + if (!p_hwfn->b_rdma_enabled_in_prs) { + /* Enable RDMA search */ + qed_wr(p_hwfn, p_ptt, p_hwfn->rdma_prs_search_reg, 1); + p_hwfn->b_rdma_enabled_in_prs = true; + } + } + +out1: + qed_ptt_release(p_hwfn, p_ptt); +out0: + mutex_unlock(&p_hwfn->p_cxt_mngr->mutex); + + return rc; +} + +/* This function is very RoCE oriented, if another protocol in the future + * will want this feature we'll need to modify the function to be more generic + */ +static int +qed_cxt_free_ilt_range(struct qed_hwfn *p_hwfn, + enum qed_cxt_elem_type elem_type, + u32 start_iid, u32 count) +{ + u32 start_line, end_line, shadow_start_line, shadow_end_line; + u32 reg_offset, elem_size, hw_p_size, elems_per_p; + struct qed_ilt_client_cfg *p_cli; + struct qed_ilt_cli_blk *p_blk; + u32 end_iid = start_iid + count; + struct qed_ptt *p_ptt; + u64 ilt_hw_entry = 0; + u32 i; + + switch (elem_type) { + case QED_ELEM_CXT: + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUC]; + elem_size = CONN_CXT_SIZE(p_hwfn); + p_blk = &p_cli->pf_blks[CDUC_BLK]; + break; + case QED_ELEM_SRQ: + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_TSDM]; + elem_size = SRQ_CXT_SIZE; + p_blk = &p_cli->pf_blks[SRQ_BLK]; + break; + case QED_ELEM_TASK: + p_cli = &p_hwfn->p_cxt_mngr->clients[ILT_CLI_CDUT]; + elem_size = TYPE1_TASK_CXT_SIZE(p_hwfn); + p_blk = &p_cli->pf_blks[CDUT_SEG_BLK(QED_CXT_ROCE_TID_SEG)]; + break; + default: + DP_NOTICE(p_hwfn, "-EINVALID elem type = %d", elem_type); + return -EINVAL; + } + + /* Calculate line in ilt */ + hw_p_size = p_cli->p_size.val; + elems_per_p = ILT_PAGE_IN_BYTES(hw_p_size) / elem_size; + start_line = p_blk->start_line + (start_iid / elems_per_p); + end_line = p_blk->start_line + (end_iid / elems_per_p); + if (((end_iid + 1) / elems_per_p) != (end_iid / elems_per_p)) + end_line--; + + shadow_start_line = start_line - p_hwfn->p_cxt_mngr->pf_start_line; + shadow_end_line = end_line - p_hwfn->p_cxt_mngr->pf_start_line; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_NOTICE(p_hwfn, + "QED_TIME_OUT on ptt acquire - dynamic allocation"); + return -EBUSY; + } + + for (i = shadow_start_line; i < shadow_end_line; i++) { + if (!p_hwfn->p_cxt_mngr->ilt_shadow[i].p_virt) + continue; + + 
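+ /* Release the page backing this shadow line and clear its ILT
+ * entry over DMAE; ilt_hw_entry was left zeroed above for exactly
+ * this purpose.
+ */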
dma_free_coherent(&p_hwfn->cdev->pdev->dev, + p_hwfn->p_cxt_mngr->ilt_shadow[i].size, + p_hwfn->p_cxt_mngr->ilt_shadow[i].p_virt, + p_hwfn->p_cxt_mngr->ilt_shadow[i].p_phys); + + p_hwfn->p_cxt_mngr->ilt_shadow[i].p_virt = NULL; + p_hwfn->p_cxt_mngr->ilt_shadow[i].p_phys = 0; + p_hwfn->p_cxt_mngr->ilt_shadow[i].size = 0; + + /* compute absolute offset */ + reg_offset = PSWRQ2_REG_ILT_MEMORY + + ((start_line++) * ILT_REG_SIZE_IN_BYTES * + ILT_ENTRY_IN_REGS); + + /* Write via DMAE since the PSWRQ2_REG_ILT_MEMORY line is a + * wide-bus. + */ + qed_dmae_host2grc(p_hwfn, p_ptt, + (u64) (uintptr_t) &ilt_hw_entry, + reg_offset, + sizeof(ilt_hw_entry) / sizeof(u32), + 0); + } + + qed_ptt_release(p_hwfn, p_ptt); + + return 0; +} + +int qed_cxt_free_proto_ilt(struct qed_hwfn *p_hwfn, enum protocol_type proto) +{ + int rc; + u32 cid; + + /* Free Connection CXT */ + rc = qed_cxt_free_ilt_range(p_hwfn, QED_ELEM_CXT, + qed_cxt_get_proto_cid_start(p_hwfn, + proto), + qed_cxt_get_proto_cid_count(p_hwfn, + proto, &cid)); + + if (rc) + return rc; + + /* Free Task CXT ( Intentionally RoCE as task-id is shared between + * RoCE and iWARP ) + */ + proto = PROTOCOLID_ROCE; + rc = qed_cxt_free_ilt_range(p_hwfn, QED_ELEM_TASK, 0, + qed_cxt_get_proto_tid_count(p_hwfn, proto)); + if (rc) + return rc; + + /* Free TSDM CXT */ + rc = qed_cxt_free_ilt_range(p_hwfn, QED_ELEM_SRQ, 0, + qed_cxt_get_srq_count(p_hwfn)); + + return rc; +} + +int qed_cxt_get_task_ctx(struct qed_hwfn *p_hwfn, + u32 tid, u8 ctx_type, void **pp_task_ctx) +{ + struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; + struct qed_ilt_client_cfg *p_cli; + struct qed_tid_seg *p_seg_info; + struct qed_ilt_cli_blk *p_seg; + u32 num_tids_per_block; + u32 tid_size, ilt_idx; + u32 total_lines; + u32 proto, seg; + + /* Verify the personality */ + switch (p_hwfn->hw_info.personality) { + case QED_PCI_FCOE: + proto = PROTOCOLID_FCOE; + seg = QED_CXT_FCOE_TID_SEG; + break; + case QED_PCI_ISCSI: + proto = PROTOCOLID_ISCSI; + seg = QED_CXT_ISCSI_TID_SEG; + break; + default: + return -EINVAL; + } + + p_cli = &p_mngr->clients[ILT_CLI_CDUT]; + if (!p_cli->active) + return -EINVAL; + + p_seg_info = &p_mngr->conn_cfg[proto].tid_seg[seg]; + + if (ctx_type == QED_CTX_WORKING_MEM) { + p_seg = &p_cli->pf_blks[CDUT_SEG_BLK(seg)]; + } else if (ctx_type == QED_CTX_FL_MEM) { + if (!p_seg_info->has_fl_mem) + return -EINVAL; + p_seg = &p_cli->pf_blks[CDUT_FL_SEG_BLK(seg, PF)]; + } else { + return -EINVAL; + } + total_lines = DIV_ROUND_UP(p_seg->total_size, p_seg->real_size_in_page); + tid_size = p_mngr->task_type_size[p_seg_info->type]; + num_tids_per_block = p_seg->real_size_in_page / tid_size; + + if (total_lines < tid / num_tids_per_block) + return -EINVAL; + + ilt_idx = tid / num_tids_per_block + p_seg->start_line - + p_mngr->pf_start_line; + *pp_task_ctx = (u8 *)p_mngr->ilt_shadow[ilt_idx].p_virt + + (tid % num_tids_per_block) * tid_size; + + return 0; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h new file mode 100644 index 0000000..a4e9586 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h @@ -0,0 +1,244 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_CXT_H +#define _QED_CXT_H + +#include +#include +#include +#include "qed_hsi.h" +#include "qed.h" + +struct qed_cxt_info { + void *p_cxt; + u32 iid; + enum protocol_type type; +}; + +#define MAX_TID_BLOCKS 512 +struct qed_tid_mem { + u32 tid_size; + u32 num_tids_per_block; + u32 waste; + u8 *blocks[MAX_TID_BLOCKS]; /* 4K */ +}; + +/** + * @brief qedo_cid_get_cxt_info - Returns the context info for a specific cid + * + * + * @param p_hwfn + * @param p_info in/out + * + * @return int + */ +int qed_cxt_get_cid_info(struct qed_hwfn *p_hwfn, + struct qed_cxt_info *p_info); + +/** + * @brief qed_cxt_get_tid_mem_info + * + * @param p_hwfn + * @param p_info + * + * @return int + */ +int qed_cxt_get_tid_mem_info(struct qed_hwfn *p_hwfn, + struct qed_tid_mem *p_info); + +#define QED_CXT_ISCSI_TID_SEG PROTOCOLID_ISCSI +#define QED_CXT_ROCE_TID_SEG PROTOCOLID_ROCE +#define QED_CXT_FCOE_TID_SEG PROTOCOLID_FCOE +enum qed_cxt_elem_type { + QED_ELEM_CXT, + QED_ELEM_SRQ, + QED_ELEM_TASK +}; + +u32 qed_cxt_get_proto_cid_count(struct qed_hwfn *p_hwfn, + enum protocol_type type, u32 *vf_cid); + +/** + * @brief qed_cxt_set_pf_params - Set the PF params for cxt init + * + * @param p_hwfn + * @param rdma_tasks - requested maximum + * @return int + */ +int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn, u32 rdma_tasks); + +/** + * @brief qed_cxt_cfg_ilt_compute - compute ILT init parameters + * + * @param p_hwfn + * @param last_line + * + * @return int + */ +int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn, u32 *last_line); + +/** + * @brief qed_cxt_cfg_ilt_compute_excess - how many lines can be decreased + * + * @param p_hwfn + * @param used_lines + */ +u32 qed_cxt_cfg_ilt_compute_excess(struct qed_hwfn *p_hwfn, u32 used_lines); + +/** + * @brief qed_cxt_mngr_alloc - Allocate and init the context manager struct + * + * @param p_hwfn + * + * @return int + */ +int qed_cxt_mngr_alloc(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_cxt_mngr_free + * + * @param p_hwfn + */ +void qed_cxt_mngr_free(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_cxt_tables_alloc - Allocate ILT shadow, Searcher T2, acquired map + * + * @param p_hwfn + * + * @return int + */ +int qed_cxt_tables_alloc(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_cxt_mngr_setup - Reset the acquired CIDs + * + 
* @param p_hwfn + */ +void qed_cxt_mngr_setup(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_cxt_hw_init_common - Initailze ILT and DQ, common phase, per path. + * + * + * + * @param p_hwfn + */ +void qed_cxt_hw_init_common(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_cxt_hw_init_pf - Initailze ILT and DQ, PF phase, per path. + * + * @param p_hwfn + * @param p_ptt + */ +void qed_cxt_hw_init_pf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +/** + * @brief qed_qm_init_pf - Initailze the QM PF phase, per path + * + * @param p_hwfn + * @param p_ptt + * @param is_pf_loading + */ +void qed_qm_init_pf(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool is_pf_loading); + +/** + * @brief Reconfigures QM pf on the fly + * + * @param p_hwfn + * @param p_ptt + * + * @return int + */ +int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +#define QED_CXT_PF_CID (0xff) + +/** + * @brief qed_cxt_release - Release a cid + * + * @param p_hwfn + * @param cid + */ +void qed_cxt_release_cid(struct qed_hwfn *p_hwfn, u32 cid); + +/** + * @brief qed_cxt_release - Release a cid belonging to a vf-queue + * + * @param p_hwfn + * @param cid + * @param vfid - engine relative index. QED_CXT_PF_CID if belongs to PF + */ +void _qed_cxt_release_cid(struct qed_hwfn *p_hwfn, u32 cid, u8 vfid); + +/** + * @brief qed_cxt_acquire - Acquire a new cid of a specific protocol type + * + * @param p_hwfn + * @param type + * @param p_cid + * + * @return int + */ +int qed_cxt_acquire_cid(struct qed_hwfn *p_hwfn, + enum protocol_type type, u32 *p_cid); + +/** + * @brief _qed_cxt_acquire - Acquire a new cid of a specific protocol type + * for a vf-queue + * + * @param p_hwfn + * @param type + * @param p_cid + * @param vfid - engine relative index. QED_CXT_PF_CID if belongs to PF + * + * @return int + */ +int _qed_cxt_acquire_cid(struct qed_hwfn *p_hwfn, + enum protocol_type type, u32 *p_cid, u8 vfid); + +int qed_cxt_dynamic_ilt_alloc(struct qed_hwfn *p_hwfn, + enum qed_cxt_elem_type elem_type, u32 iid); +u32 qed_cxt_get_proto_tid_count(struct qed_hwfn *p_hwfn, + enum protocol_type type); +u32 qed_cxt_get_proto_cid_start(struct qed_hwfn *p_hwfn, + enum protocol_type type); +int qed_cxt_free_proto_ilt(struct qed_hwfn *p_hwfn, enum protocol_type proto); + +#define QED_CTX_WORKING_MEM 0 +#define QED_CTX_FL_MEM 1 +int qed_cxt_get_task_ctx(struct qed_hwfn *p_hwfn, + u32 tid, u8 ctx_type, void **task_ctx); +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c new file mode 100644 index 0000000..449777f --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -0,0 +1,2408 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_cxt.h" +#include "qed_dcbx.h" +#include "qed_hsi.h" +#include "qed_sp.h" +#include "qed_sriov.h" +#include "qed_rdma.h" +#ifdef CONFIG_DCB +#include +#endif + +#define QED_DCBX_MAX_MIB_READ_TRY (100) +#define QED_ETH_TYPE_DEFAULT (0) +#define QED_ETH_TYPE_ROCE (0x8915) +#define QED_UDP_PORT_TYPE_ROCE_V2 (0x12B7) +#define QED_ETH_TYPE_FCOE (0x8906) +#define QED_TCP_PORT_ISCSI (0xCBC) + +#define QED_DCBX_INVALID_PRIORITY 0xFF + +/* Get Traffic Class from priority traffic class table, 4 bits represent + * the traffic class corresponding to the priority. + */ +#define QED_DCBX_PRIO2TC(prio_tc_tbl, prio) \ + ((u32)(prio_tc_tbl >> ((7 - prio) * 4)) & 0x7) + +static const struct qed_dcbx_app_metadata qed_dcbx_app_update[] = { + {DCBX_PROTOCOL_ISCSI, "ISCSI", QED_PCI_ISCSI}, + {DCBX_PROTOCOL_FCOE, "FCOE", QED_PCI_FCOE}, + {DCBX_PROTOCOL_ROCE, "ROCE", QED_PCI_ETH_ROCE}, + {DCBX_PROTOCOL_ROCE_V2, "ROCE_V2", QED_PCI_ETH_ROCE}, + {DCBX_PROTOCOL_ETH, "ETH", QED_PCI_ETH}, +}; + +static bool qed_dcbx_app_ethtype(u32 app_info_bitmap) +{ + return !!(QED_MFW_GET_FIELD(app_info_bitmap, DCBX_APP_SF) == + DCBX_APP_SF_ETHTYPE); +} + +static bool qed_dcbx_ieee_app_ethtype(u32 app_info_bitmap) +{ + u8 mfw_val = QED_MFW_GET_FIELD(app_info_bitmap, DCBX_APP_SF_IEEE); + + /* Old MFW */ + if (mfw_val == DCBX_APP_SF_IEEE_RESERVED) + return qed_dcbx_app_ethtype(app_info_bitmap); + + return !!(mfw_val == DCBX_APP_SF_IEEE_ETHTYPE); +} + +static bool qed_dcbx_app_port(u32 app_info_bitmap) +{ + return !!(QED_MFW_GET_FIELD(app_info_bitmap, DCBX_APP_SF) == + DCBX_APP_SF_PORT); +} + +static bool qed_dcbx_ieee_app_port(u32 app_info_bitmap, u8 type) +{ + u8 mfw_val = QED_MFW_GET_FIELD(app_info_bitmap, DCBX_APP_SF_IEEE); + + /* Old MFW */ + if (mfw_val == DCBX_APP_SF_IEEE_RESERVED) + return qed_dcbx_app_port(app_info_bitmap); + + return !!(mfw_val == type || mfw_val == DCBX_APP_SF_IEEE_TCP_UDP_PORT); +} + +static bool qed_dcbx_default_tlv(u32 app_info_bitmap, u16 proto_id, bool ieee) +{ + bool ethtype; + + if (ieee) + ethtype = qed_dcbx_ieee_app_ethtype(app_info_bitmap); + else + ethtype = qed_dcbx_app_ethtype(app_info_bitmap); + + return !!(ethtype && (proto_id == QED_ETH_TYPE_DEFAULT)); +} + +static bool qed_dcbx_iscsi_tlv(u32 app_info_bitmap, u16 proto_id, bool ieee) +{ + bool port; + + if (ieee) + port = qed_dcbx_ieee_app_port(app_info_bitmap, + DCBX_APP_SF_IEEE_TCP_PORT); + else + port = qed_dcbx_app_port(app_info_bitmap); + + return !!(port && (proto_id == QED_TCP_PORT_ISCSI)); +} + +static bool qed_dcbx_fcoe_tlv(u32 app_info_bitmap, u16 proto_id, bool ieee) +{ + bool ethtype; + + if (ieee) + ethtype = qed_dcbx_ieee_app_ethtype(app_info_bitmap); + else + ethtype = 
qed_dcbx_app_ethtype(app_info_bitmap); + + return !!(ethtype && (proto_id == QED_ETH_TYPE_FCOE)); +} + +static bool qed_dcbx_roce_tlv(u32 app_info_bitmap, u16 proto_id, bool ieee) +{ + bool ethtype; + + if (ieee) + ethtype = qed_dcbx_ieee_app_ethtype(app_info_bitmap); + else + ethtype = qed_dcbx_app_ethtype(app_info_bitmap); + + return !!(ethtype && (proto_id == QED_ETH_TYPE_ROCE)); +} + +static bool qed_dcbx_roce_v2_tlv(u32 app_info_bitmap, u16 proto_id, bool ieee) +{ + bool port; + + if (ieee) + port = qed_dcbx_ieee_app_port(app_info_bitmap, + DCBX_APP_SF_IEEE_UDP_PORT); + else + port = qed_dcbx_app_port(app_info_bitmap); + + return !!(port && (proto_id == QED_UDP_PORT_TYPE_ROCE_V2)); +} + +static void +qed_dcbx_dp_protocol(struct qed_hwfn *p_hwfn, struct qed_dcbx_results *p_data) +{ + enum dcbx_protocol_type id; + int i; + + DP_VERBOSE(p_hwfn, QED_MSG_DCB, "DCBX negotiated: %d\n", + p_data->dcbx_enabled); + + for (i = 0; i < ARRAY_SIZE(qed_dcbx_app_update); i++) { + id = qed_dcbx_app_update[i].id; + + DP_VERBOSE(p_hwfn, QED_MSG_DCB, + "%s info: update %d, enable %d, prio %d, tc %d, num_tc %d\n", + qed_dcbx_app_update[i].name, p_data->arr[id].update, + p_data->arr[id].enable, p_data->arr[id].priority, + p_data->arr[id].tc, p_hwfn->hw_info.num_active_tc); + } +} + +static void +qed_dcbx_set_params(struct qed_dcbx_results *p_data, + struct qed_hw_info *p_info, + bool enable, + u8 prio, + u8 tc, + enum dcbx_protocol_type type, + enum qed_pci_personality personality) +{ + /* PF update ramrod data */ + p_data->arr[type].enable = enable; + p_data->arr[type].priority = prio; + p_data->arr[type].tc = tc; + if (enable) + p_data->arr[type].update = UPDATE_DCB; + else + p_data->arr[type].update = DONT_UPDATE_DCB_DSCP; + + /* QM reconf data */ + if (p_info->personality == personality) + p_info->offload_tc = tc; +} + +/* Update app protocol data and hw_info fields with the TLV info */ +static void +qed_dcbx_update_app_info(struct qed_dcbx_results *p_data, + struct qed_hwfn *p_hwfn, + bool enable, + u8 prio, u8 tc, enum dcbx_protocol_type type) +{ + struct qed_hw_info *p_info = &p_hwfn->hw_info; + enum qed_pci_personality personality; + enum dcbx_protocol_type id; + char *name; + int i; + + for (i = 0; i < ARRAY_SIZE(qed_dcbx_app_update); i++) { + id = qed_dcbx_app_update[i].id; + + if (type != id) + continue; + + personality = qed_dcbx_app_update[i].personality; + name = qed_dcbx_app_update[i].name; + + qed_dcbx_set_params(p_data, p_info, enable, + prio, tc, type, personality); + } +} + +static bool +qed_dcbx_get_app_protocol_type(struct qed_hwfn *p_hwfn, + u32 app_prio_bitmap, + u16 id, enum dcbx_protocol_type *type, bool ieee) +{ + if (qed_dcbx_fcoe_tlv(app_prio_bitmap, id, ieee)) { + *type = DCBX_PROTOCOL_FCOE; + } else if (qed_dcbx_roce_tlv(app_prio_bitmap, id, ieee)) { + *type = DCBX_PROTOCOL_ROCE; + } else if (qed_dcbx_iscsi_tlv(app_prio_bitmap, id, ieee)) { + *type = DCBX_PROTOCOL_ISCSI; + } else if (qed_dcbx_default_tlv(app_prio_bitmap, id, ieee)) { + *type = DCBX_PROTOCOL_ETH; + } else if (qed_dcbx_roce_v2_tlv(app_prio_bitmap, id, ieee)) { + *type = DCBX_PROTOCOL_ROCE_V2; + } else { + *type = DCBX_MAX_PROTOCOL_TYPE; + DP_ERR(p_hwfn, + "No action required, App TLV id = 0x%x app_prio_bitmap = 0x%x\n", + id, app_prio_bitmap); + return false; + } + + return true; +} + +/* Parse app TLV's to update TC information in hw_info structure for + * reconfiguring QM. Get protocol specific data for PF update ramrod command. 
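+ * Each APP TLV carries a priority bitmap; the lowest set bit is used as
+ * the priority, and QED_DCBX_PRIO2TC() then selects the 4-bit nibble of
+ * the priority-to-TC table for it. For example, with an illustrative
+ * pri_tc_tbl of 0x00112233, priorities 0-1 map to TC 0 and priorities
+ * 6-7 map to TC 3.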
+ */ +static int +qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, + struct qed_dcbx_results *p_data, + struct dcbx_app_priority_entry *p_tbl, + u32 pri_tc_tbl, int count, u8 dcbx_version) +{ + enum dcbx_protocol_type type; + u8 tc, priority_map; + bool enable, ieee; + u16 protocol_id; + int priority; + int i; + + DP_VERBOSE(p_hwfn, QED_MSG_DCB, "Num APP entries = %d\n", count); + + ieee = (dcbx_version == DCBX_CONFIG_VERSION_IEEE); + /* Parse APP TLV */ + for (i = 0; i < count; i++) { + protocol_id = QED_MFW_GET_FIELD(p_tbl[i].entry, + DCBX_APP_PROTOCOL_ID); + priority_map = QED_MFW_GET_FIELD(p_tbl[i].entry, + DCBX_APP_PRI_MAP); + priority = ffs(priority_map) - 1; + if (priority < 0) { + DP_ERR(p_hwfn, "Invalid priority\n"); + return -EINVAL; + } + + tc = QED_DCBX_PRIO2TC(pri_tc_tbl, priority); + if (qed_dcbx_get_app_protocol_type(p_hwfn, p_tbl[i].entry, + protocol_id, &type, ieee)) { + /* ETH always have the enable bit reset, as it gets + * vlan information per packet. For other protocols, + * should be set according to the dcbx_enabled + * indication, but we only got here if there was an + * app tlv for the protocol, so dcbx must be enabled. + */ + enable = !(type == DCBX_PROTOCOL_ETH); + + qed_dcbx_update_app_info(p_data, p_hwfn, enable, + priority, tc, type); + } + } + + /* Update ramrod protocol data and hw_info fields + * with default info when corresponding APP TLV's are not detected. + * The enabled field has a different logic for ethernet as only for + * ethernet dcb should disabled by default, as the information arrives + * from the OS (unless an explicit app tlv was present). + */ + tc = p_data->arr[DCBX_PROTOCOL_ETH].tc; + priority = p_data->arr[DCBX_PROTOCOL_ETH].priority; + for (type = 0; type < DCBX_MAX_PROTOCOL_TYPE; type++) { + if (p_data->arr[type].update) + continue; + + enable = (type == DCBX_PROTOCOL_ETH) ? false : !!dcbx_version; + qed_dcbx_update_app_info(p_data, p_hwfn, enable, + priority, tc, type); + } + + return 0; +} + +/* Parse app TLV's to update TC information in hw_info structure for + * reconfiguring QM. Get protocol specific data for PF update ramrod command. 
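+ * The operational MIB supplies the negotiated DCBX version, the APP
+ * priority table and the ETS feature block; the processed results are
+ * cached in p_hwfn->p_dcbx_info->results for the later PF update ramrod.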
+ */ +static int qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn) +{ + struct dcbx_app_priority_feature *p_app; + struct dcbx_app_priority_entry *p_tbl; + struct qed_dcbx_results data = { 0 }; + struct dcbx_ets_feature *p_ets; + struct qed_hw_info *p_info; + u32 pri_tc_tbl, flags; + u8 dcbx_version; + int num_entries; + int rc = 0; + + flags = p_hwfn->p_dcbx_info->operational.flags; + dcbx_version = QED_MFW_GET_FIELD(flags, DCBX_CONFIG_VERSION); + + p_app = &p_hwfn->p_dcbx_info->operational.features.app; + p_tbl = p_app->app_pri_tbl; + + p_ets = &p_hwfn->p_dcbx_info->operational.features.ets; + pri_tc_tbl = p_ets->pri_tc_tbl[0]; + + p_info = &p_hwfn->hw_info; + num_entries = QED_MFW_GET_FIELD(p_app->flags, DCBX_APP_NUM_ENTRIES); + + rc = qed_dcbx_process_tlv(p_hwfn, &data, p_tbl, pri_tc_tbl, + num_entries, dcbx_version); + if (rc) + return rc; + + p_info->num_active_tc = QED_MFW_GET_FIELD(p_ets->flags, + DCBX_ETS_MAX_TCS); + p_hwfn->qm_info.ooo_tc = QED_MFW_GET_FIELD(p_ets->flags, DCBX_OOO_TC); + data.pf_id = p_hwfn->rel_pf_id; + data.dcbx_enabled = !!dcbx_version; + + qed_dcbx_dp_protocol(p_hwfn, &data); + + memcpy(&p_hwfn->p_dcbx_info->results, &data, + sizeof(struct qed_dcbx_results)); + + return 0; +} + +static int +qed_dcbx_copy_mib(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_dcbx_mib_meta_data *p_data, + enum qed_mib_read_type type) +{ + u32 prefix_seq_num, suffix_seq_num; + int read_count = 0; + int rc = 0; + + /* The data is considered to be valid only if both sequence numbers are + * the same. + */ + do { + if (type == QED_DCBX_REMOTE_LLDP_MIB) { + qed_memcpy_from(p_hwfn, p_ptt, p_data->lldp_remote, + p_data->addr, p_data->size); + prefix_seq_num = p_data->lldp_remote->prefix_seq_num; + suffix_seq_num = p_data->lldp_remote->suffix_seq_num; + } else { + qed_memcpy_from(p_hwfn, p_ptt, p_data->mib, + p_data->addr, p_data->size); + prefix_seq_num = p_data->mib->prefix_seq_num; + suffix_seq_num = p_data->mib->suffix_seq_num; + } + read_count++; + + DP_VERBOSE(p_hwfn, + QED_MSG_DCB, + "mib type = %d, try count = %d prefix seq num = %d suffix seq num = %d\n", + type, read_count, prefix_seq_num, suffix_seq_num); + } while ((prefix_seq_num != suffix_seq_num) && + (read_count < QED_DCBX_MAX_MIB_READ_TRY)); + + if (read_count >= QED_DCBX_MAX_MIB_READ_TRY) { + DP_ERR(p_hwfn, + "MIB read err, mib type = %d, try count = %d prefix seq num = %d suffix seq num = %d\n", + type, read_count, prefix_seq_num, suffix_seq_num); + rc = -EIO; + } + + return rc; +} + +static void +qed_dcbx_get_priority_info(struct qed_hwfn *p_hwfn, + struct qed_dcbx_app_prio *p_prio, + struct qed_dcbx_results *p_results) +{ + u8 val; + + p_prio->roce = QED_DCBX_INVALID_PRIORITY; + p_prio->roce_v2 = QED_DCBX_INVALID_PRIORITY; + p_prio->iscsi = QED_DCBX_INVALID_PRIORITY; + p_prio->fcoe = QED_DCBX_INVALID_PRIORITY; + + if (p_results->arr[DCBX_PROTOCOL_ROCE].update && + p_results->arr[DCBX_PROTOCOL_ROCE].enable) + p_prio->roce = p_results->arr[DCBX_PROTOCOL_ROCE].priority; + + if (p_results->arr[DCBX_PROTOCOL_ROCE_V2].update && + p_results->arr[DCBX_PROTOCOL_ROCE_V2].enable) { + val = p_results->arr[DCBX_PROTOCOL_ROCE_V2].priority; + p_prio->roce_v2 = val; + } + + if (p_results->arr[DCBX_PROTOCOL_ISCSI].update && + p_results->arr[DCBX_PROTOCOL_ISCSI].enable) + p_prio->iscsi = p_results->arr[DCBX_PROTOCOL_ISCSI].priority; + + if (p_results->arr[DCBX_PROTOCOL_FCOE].update && + p_results->arr[DCBX_PROTOCOL_FCOE].enable) + p_prio->fcoe = p_results->arr[DCBX_PROTOCOL_FCOE].priority; + + if 
(p_results->arr[DCBX_PROTOCOL_ETH].update && + p_results->arr[DCBX_PROTOCOL_ETH].enable) + p_prio->eth = p_results->arr[DCBX_PROTOCOL_ETH].priority; + + DP_VERBOSE(p_hwfn, QED_MSG_DCB, + "Priorities: iscsi %d, roce %d, roce v2 %d, fcoe %d, eth %d\n", + p_prio->iscsi, p_prio->roce, p_prio->roce_v2, p_prio->fcoe, + p_prio->eth); +} + +static void +qed_dcbx_get_app_data(struct qed_hwfn *p_hwfn, + struct dcbx_app_priority_feature *p_app, + struct dcbx_app_priority_entry *p_tbl, + struct qed_dcbx_params *p_params, bool ieee) +{ + struct qed_app_entry *entry; + u8 pri_map; + int i; + + p_params->app_willing = QED_MFW_GET_FIELD(p_app->flags, + DCBX_APP_WILLING); + p_params->app_valid = QED_MFW_GET_FIELD(p_app->flags, DCBX_APP_ENABLED); + p_params->app_error = QED_MFW_GET_FIELD(p_app->flags, DCBX_APP_ERROR); + p_params->num_app_entries = QED_MFW_GET_FIELD(p_app->flags, + DCBX_APP_NUM_ENTRIES); + for (i = 0; i < DCBX_MAX_APP_PROTOCOL; i++) { + entry = &p_params->app_entry[i]; + if (ieee) { + u8 sf_ieee; + u32 val; + + sf_ieee = QED_MFW_GET_FIELD(p_tbl[i].entry, + DCBX_APP_SF_IEEE); + switch (sf_ieee) { + case DCBX_APP_SF_IEEE_RESERVED: + /* Old MFW */ + val = QED_MFW_GET_FIELD(p_tbl[i].entry, + DCBX_APP_SF); + entry->sf_ieee = val ? + QED_DCBX_SF_IEEE_TCP_UDP_PORT : + QED_DCBX_SF_IEEE_ETHTYPE; + break; + case DCBX_APP_SF_IEEE_ETHTYPE: + entry->sf_ieee = QED_DCBX_SF_IEEE_ETHTYPE; + break; + case DCBX_APP_SF_IEEE_TCP_PORT: + entry->sf_ieee = QED_DCBX_SF_IEEE_TCP_PORT; + break; + case DCBX_APP_SF_IEEE_UDP_PORT: + entry->sf_ieee = QED_DCBX_SF_IEEE_UDP_PORT; + break; + case DCBX_APP_SF_IEEE_TCP_UDP_PORT: + entry->sf_ieee = QED_DCBX_SF_IEEE_TCP_UDP_PORT; + break; + } + } else { + entry->ethtype = !(QED_MFW_GET_FIELD(p_tbl[i].entry, + DCBX_APP_SF)); + } + + pri_map = QED_MFW_GET_FIELD(p_tbl[i].entry, DCBX_APP_PRI_MAP); + entry->prio = ffs(pri_map) - 1; + entry->proto_id = QED_MFW_GET_FIELD(p_tbl[i].entry, + DCBX_APP_PROTOCOL_ID); + qed_dcbx_get_app_protocol_type(p_hwfn, p_tbl[i].entry, + entry->proto_id, + &entry->proto_type, ieee); + } + + DP_VERBOSE(p_hwfn, QED_MSG_DCB, + "APP params: willing %d, valid %d error = %d\n", + p_params->app_willing, p_params->app_valid, + p_params->app_error); +} + +static void +qed_dcbx_get_pfc_data(struct qed_hwfn *p_hwfn, + u32 pfc, struct qed_dcbx_params *p_params) +{ + u8 pfc_map; + + p_params->pfc.willing = QED_MFW_GET_FIELD(pfc, DCBX_PFC_WILLING); + p_params->pfc.max_tc = QED_MFW_GET_FIELD(pfc, DCBX_PFC_CAPS); + p_params->pfc.enabled = QED_MFW_GET_FIELD(pfc, DCBX_PFC_ENABLED); + pfc_map = QED_MFW_GET_FIELD(pfc, DCBX_PFC_PRI_EN_BITMAP); + p_params->pfc.prio[0] = !!(pfc_map & DCBX_PFC_PRI_EN_BITMAP_PRI_0); + p_params->pfc.prio[1] = !!(pfc_map & DCBX_PFC_PRI_EN_BITMAP_PRI_1); + p_params->pfc.prio[2] = !!(pfc_map & DCBX_PFC_PRI_EN_BITMAP_PRI_2); + p_params->pfc.prio[3] = !!(pfc_map & DCBX_PFC_PRI_EN_BITMAP_PRI_3); + p_params->pfc.prio[4] = !!(pfc_map & DCBX_PFC_PRI_EN_BITMAP_PRI_4); + p_params->pfc.prio[5] = !!(pfc_map & DCBX_PFC_PRI_EN_BITMAP_PRI_5); + p_params->pfc.prio[6] = !!(pfc_map & DCBX_PFC_PRI_EN_BITMAP_PRI_6); + p_params->pfc.prio[7] = !!(pfc_map & DCBX_PFC_PRI_EN_BITMAP_PRI_7); + + DP_VERBOSE(p_hwfn, QED_MSG_DCB, + "PFC params: willing %d, pfc_bitmap %u max_tc = %u enabled = %d\n", + p_params->pfc.willing, pfc_map, p_params->pfc.max_tc, + p_params->pfc.enabled); +} + +static void +qed_dcbx_get_ets_data(struct qed_hwfn *p_hwfn, + struct dcbx_ets_feature *p_ets, + struct qed_dcbx_params *p_params) +{ + u32 bw_map[2], tsa_map[2], pri_map; + int i; + + 
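+ /* Unpack the ETS feature: the willing/enabled/CBS flags and the max TC
+ * count come from p_ets->flags, followed by the per-TC bandwidth, TSA
+ * and priority tables further below.
+ */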
p_params->ets_willing = QED_MFW_GET_FIELD(p_ets->flags, + DCBX_ETS_WILLING); + p_params->ets_enabled = QED_MFW_GET_FIELD(p_ets->flags, + DCBX_ETS_ENABLED); + p_params->ets_cbs = QED_MFW_GET_FIELD(p_ets->flags, DCBX_ETS_CBS); + p_params->max_ets_tc = QED_MFW_GET_FIELD(p_ets->flags, + DCBX_ETS_MAX_TCS); + DP_VERBOSE(p_hwfn, QED_MSG_DCB, + "ETS params: willing %d, enabled = %d ets_cbs %d pri_tc_tbl_0 %x max_ets_tc %d\n", + p_params->ets_willing, p_params->ets_enabled, + p_params->ets_cbs, p_ets->pri_tc_tbl[0], + p_params->max_ets_tc); + + if (p_params->ets_enabled && !p_params->max_ets_tc) { + p_params->max_ets_tc = QED_MAX_PFC_PRIORITIES; + DP_VERBOSE(p_hwfn, QED_MSG_DCB, + "ETS params: max_ets_tc is forced to %d\n", + p_params->max_ets_tc); + } + + /* 8 bit tsa and bw data corresponding to each of the 8 TC's are + * encoded in a type u32 array of size 2. + */ + bw_map[0] = be32_to_cpu(p_ets->tc_bw_tbl[0]); + bw_map[1] = be32_to_cpu(p_ets->tc_bw_tbl[1]); + tsa_map[0] = be32_to_cpu(p_ets->tc_tsa_tbl[0]); + tsa_map[1] = be32_to_cpu(p_ets->tc_tsa_tbl[1]); + pri_map = p_ets->pri_tc_tbl[0]; + for (i = 0; i < QED_MAX_PFC_PRIORITIES; i++) { + p_params->ets_tc_bw_tbl[i] = ((u8 *)bw_map)[i]; + p_params->ets_tc_tsa_tbl[i] = ((u8 *)tsa_map)[i]; + p_params->ets_pri_tc_tbl[i] = QED_DCBX_PRIO2TC(pri_map, i); + DP_VERBOSE(p_hwfn, QED_MSG_DCB, + "elem %d bw_tbl %x tsa_tbl %x\n", + i, p_params->ets_tc_bw_tbl[i], + p_params->ets_tc_tsa_tbl[i]); + } +} + +static void +qed_dcbx_get_common_params(struct qed_hwfn *p_hwfn, + struct dcbx_app_priority_feature *p_app, + struct dcbx_app_priority_entry *p_tbl, + struct dcbx_ets_feature *p_ets, + u32 pfc, struct qed_dcbx_params *p_params, bool ieee) +{ + qed_dcbx_get_app_data(p_hwfn, p_app, p_tbl, p_params, ieee); + qed_dcbx_get_ets_data(p_hwfn, p_ets, p_params); + qed_dcbx_get_pfc_data(p_hwfn, pfc, p_params); +} + +static void +qed_dcbx_get_local_params(struct qed_hwfn *p_hwfn, struct qed_dcbx_get *params) +{ + struct dcbx_features *p_feat; + + p_feat = &p_hwfn->p_dcbx_info->local_admin.features; + qed_dcbx_get_common_params(p_hwfn, &p_feat->app, + p_feat->app.app_pri_tbl, &p_feat->ets, + p_feat->pfc, ¶ms->local.params, false); + params->local.valid = true; +} + +static void +qed_dcbx_get_remote_params(struct qed_hwfn *p_hwfn, struct qed_dcbx_get *params) +{ + struct dcbx_features *p_feat; + + p_feat = &p_hwfn->p_dcbx_info->remote.features; + qed_dcbx_get_common_params(p_hwfn, &p_feat->app, + p_feat->app.app_pri_tbl, &p_feat->ets, + p_feat->pfc, ¶ms->remote.params, false); + params->remote.valid = true; +} + +static void +qed_dcbx_get_operational_params(struct qed_hwfn *p_hwfn, + struct qed_dcbx_get *params) +{ + struct qed_dcbx_operational_params *p_operational; + struct qed_dcbx_results *p_results; + struct dcbx_features *p_feat; + bool enabled, err; + u32 flags; + bool val; + + flags = p_hwfn->p_dcbx_info->operational.flags; + + /* If DCBx version is non zero, then negotiation + * was successfuly performed + */ + p_operational = ¶ms->operational; + enabled = !!(QED_MFW_GET_FIELD(flags, DCBX_CONFIG_VERSION) != + DCBX_CONFIG_VERSION_DISABLED); + if (!enabled) { + p_operational->enabled = enabled; + p_operational->valid = false; + DP_VERBOSE(p_hwfn, QED_MSG_DCB, "Dcbx is disabled\n"); + return; + } + + p_feat = &p_hwfn->p_dcbx_info->operational.features; + p_results = &p_hwfn->p_dcbx_info->results; + + val = !!(QED_MFW_GET_FIELD(flags, DCBX_CONFIG_VERSION) == + DCBX_CONFIG_VERSION_IEEE); + p_operational->ieee = val; + val = !!(QED_MFW_GET_FIELD(flags, 
DCBX_CONFIG_VERSION) == + DCBX_CONFIG_VERSION_CEE); + p_operational->cee = val; + + val = !!(QED_MFW_GET_FIELD(flags, DCBX_CONFIG_VERSION) == + DCBX_CONFIG_VERSION_STATIC); + p_operational->local = val; + + DP_VERBOSE(p_hwfn, QED_MSG_DCB, + "Version support: ieee %d, cee %d, static %d\n", + p_operational->ieee, p_operational->cee, + p_operational->local); + + qed_dcbx_get_common_params(p_hwfn, &p_feat->app, + p_feat->app.app_pri_tbl, &p_feat->ets, + p_feat->pfc, ¶ms->operational.params, + p_operational->ieee); + qed_dcbx_get_priority_info(p_hwfn, &p_operational->app_prio, p_results); + err = QED_MFW_GET_FIELD(p_feat->app.flags, DCBX_APP_ERROR); + p_operational->err = err; + p_operational->enabled = enabled; + p_operational->valid = true; +} + +static void +qed_dcbx_get_local_lldp_params(struct qed_hwfn *p_hwfn, + struct qed_dcbx_get *params) +{ + struct lldp_config_params_s *p_local; + + p_local = &p_hwfn->p_dcbx_info->lldp_local[LLDP_NEAREST_BRIDGE]; + + memcpy(params->lldp_local.local_chassis_id, p_local->local_chassis_id, + ARRAY_SIZE(p_local->local_chassis_id)); + memcpy(params->lldp_local.local_port_id, p_local->local_port_id, + ARRAY_SIZE(p_local->local_port_id)); +} + +static void +qed_dcbx_get_remote_lldp_params(struct qed_hwfn *p_hwfn, + struct qed_dcbx_get *params) +{ + struct lldp_status_params_s *p_remote; + + p_remote = &p_hwfn->p_dcbx_info->lldp_remote[LLDP_NEAREST_BRIDGE]; + + memcpy(params->lldp_remote.peer_chassis_id, p_remote->peer_chassis_id, + ARRAY_SIZE(p_remote->peer_chassis_id)); + memcpy(params->lldp_remote.peer_port_id, p_remote->peer_port_id, + ARRAY_SIZE(p_remote->peer_port_id)); +} + +static int +qed_dcbx_get_params(struct qed_hwfn *p_hwfn, struct qed_dcbx_get *p_params, + enum qed_mib_read_type type) +{ + switch (type) { + case QED_DCBX_REMOTE_MIB: + qed_dcbx_get_remote_params(p_hwfn, p_params); + break; + case QED_DCBX_LOCAL_MIB: + qed_dcbx_get_local_params(p_hwfn, p_params); + break; + case QED_DCBX_OPERATIONAL_MIB: + qed_dcbx_get_operational_params(p_hwfn, p_params); + break; + case QED_DCBX_REMOTE_LLDP_MIB: + qed_dcbx_get_remote_lldp_params(p_hwfn, p_params); + break; + case QED_DCBX_LOCAL_LLDP_MIB: + qed_dcbx_get_local_lldp_params(p_hwfn, p_params); + break; + default: + DP_ERR(p_hwfn, "MIB read err, unknown mib type %d\n", type); + return -EINVAL; + } + + return 0; +} + +static int +qed_dcbx_read_local_lldp_mib(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_dcbx_mib_meta_data data; + int rc = 0; + + memset(&data, 0, sizeof(data)); + data.addr = p_hwfn->mcp_info->port_addr + offsetof(struct public_port, + lldp_config_params); + data.lldp_local = p_hwfn->p_dcbx_info->lldp_local; + data.size = sizeof(struct lldp_config_params_s); + qed_memcpy_from(p_hwfn, p_ptt, data.lldp_local, data.addr, data.size); + + return rc; +} + +static int +qed_dcbx_read_remote_lldp_mib(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_mib_read_type type) +{ + struct qed_dcbx_mib_meta_data data; + int rc = 0; + + memset(&data, 0, sizeof(data)); + data.addr = p_hwfn->mcp_info->port_addr + offsetof(struct public_port, + lldp_status_params); + data.lldp_remote = p_hwfn->p_dcbx_info->lldp_remote; + data.size = sizeof(struct lldp_status_params_s); + rc = qed_dcbx_copy_mib(p_hwfn, p_ptt, &data, type); + + return rc; +} + +static int +qed_dcbx_read_operational_mib(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_mib_read_type type) +{ + struct qed_dcbx_mib_meta_data data; + int rc = 0; + + memset(&data, 0, sizeof(data)); + data.addr = 
p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, operational_dcbx_mib); + data.mib = &p_hwfn->p_dcbx_info->operational; + data.size = sizeof(struct dcbx_mib); + rc = qed_dcbx_copy_mib(p_hwfn, p_ptt, &data, type); + + return rc; +} + +static int +qed_dcbx_read_remote_mib(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, enum qed_mib_read_type type) +{ + struct qed_dcbx_mib_meta_data data; + int rc = 0; + + memset(&data, 0, sizeof(data)); + data.addr = p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, remote_dcbx_mib); + data.mib = &p_hwfn->p_dcbx_info->remote; + data.size = sizeof(struct dcbx_mib); + rc = qed_dcbx_copy_mib(p_hwfn, p_ptt, &data, type); + + return rc; +} + +static int +qed_dcbx_read_local_mib(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_dcbx_mib_meta_data data; + int rc = 0; + + memset(&data, 0, sizeof(data)); + data.addr = p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, local_admin_dcbx_mib); + data.local_admin = &p_hwfn->p_dcbx_info->local_admin; + data.size = sizeof(struct dcbx_local_params); + qed_memcpy_from(p_hwfn, p_ptt, data.local_admin, data.addr, data.size); + + return rc; +} + +static int qed_dcbx_read_mib(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, enum qed_mib_read_type type) +{ + int rc = -EINVAL; + + switch (type) { + case QED_DCBX_OPERATIONAL_MIB: + rc = qed_dcbx_read_operational_mib(p_hwfn, p_ptt, type); + break; + case QED_DCBX_REMOTE_MIB: + rc = qed_dcbx_read_remote_mib(p_hwfn, p_ptt, type); + break; + case QED_DCBX_LOCAL_MIB: + rc = qed_dcbx_read_local_mib(p_hwfn, p_ptt); + break; + case QED_DCBX_REMOTE_LLDP_MIB: + rc = qed_dcbx_read_remote_lldp_mib(p_hwfn, p_ptt, type); + break; + case QED_DCBX_LOCAL_LLDP_MIB: + rc = qed_dcbx_read_local_lldp_mib(p_hwfn, p_ptt); + break; + default: + DP_ERR(p_hwfn, "MIB read err, unknown mib type %d\n", type); + } + + return rc; +} + +void qed_dcbx_aen(struct qed_hwfn *hwfn, u32 mib_type) +{ + struct qed_common_cb_ops *op = hwfn->cdev->protocol_ops.common; + void *cookie = hwfn->cdev->ops_cookie; + + if (cookie && op->dcbx_aen) + op->dcbx_aen(cookie, &hwfn->p_dcbx_info->get, mib_type); +} + +/* Read updated MIB. + * Reconfigure QM and invoke PF update ramrod command if operational MIB + * change is detected. + */ +int +qed_dcbx_mib_update_event(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, enum qed_mib_read_type type) +{ + int rc = 0; + + rc = qed_dcbx_read_mib(p_hwfn, p_ptt, type); + if (rc) + return rc; + + if (type == QED_DCBX_OPERATIONAL_MIB) { + rc = qed_dcbx_process_mib_info(p_hwfn); + if (!rc) { + /* reconfigure tcs of QM queues according + * to negotiation results + */ + qed_qm_reconf(p_hwfn, p_ptt); + + /* update storm FW with negotiation results */ + qed_sp_pf_update(p_hwfn); + + /* for roce PFs, we may want to enable/disable DPM + * when DCBx change occurs + */ + if (p_hwfn->hw_info.personality == + QED_PCI_ETH_ROCE) + qed_roce_dpm_dcbx(p_hwfn, p_ptt); + } + } + + qed_dcbx_get_params(p_hwfn, &p_hwfn->p_dcbx_info->get, type); + + if (type == QED_DCBX_OPERATIONAL_MIB) { + struct qed_dcbx_results *p_data; + u16 val; + + /* Configure in NIG which protocols support EDPM and should + * honor PFC. 
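+ * For example (the TC numbers here are illustrative), with RoCE on TC 2
+ * and RoCE-v2 on TC 3 the per-TC enable bitmap computed below is 0xC
+ * before it is shifted into the TX_EDPM_TC_EN field.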
+ */ + p_data = &p_hwfn->p_dcbx_info->results; + val = (0x1 << p_data->arr[DCBX_PROTOCOL_ROCE].tc) | + (0x1 << p_data->arr[DCBX_PROTOCOL_ROCE_V2].tc); + val <<= NIG_REG_TX_EDPM_CTRL_TX_EDPM_TC_EN_SHIFT; + val |= NIG_REG_TX_EDPM_CTRL_TX_EDPM_EN; + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_EDPM_CTRL, val); + } + + qed_dcbx_aen(p_hwfn, type); + + return rc; +} + +int qed_dcbx_info_alloc(struct qed_hwfn *p_hwfn) +{ + p_hwfn->p_dcbx_info = kzalloc(sizeof(*p_hwfn->p_dcbx_info), GFP_KERNEL); + if (!p_hwfn->p_dcbx_info) + return -ENOMEM; + + return 0; +} + +void qed_dcbx_info_free(struct qed_hwfn *p_hwfn) +{ + kfree(p_hwfn->p_dcbx_info); + p_hwfn->p_dcbx_info = NULL; +} + +static void qed_dcbx_update_protocol_data(struct protocol_dcb_data *p_data, + struct qed_dcbx_results *p_src, + enum dcbx_protocol_type type) +{ + p_data->dcb_enable_flag = p_src->arr[type].enable; + p_data->dcb_priority = p_src->arr[type].priority; + p_data->dcb_tc = p_src->arr[type].tc; +} + +/* Set pf update ramrod command params */ +void qed_dcbx_set_pf_update_params(struct qed_dcbx_results *p_src, + struct pf_update_ramrod_data *p_dest) +{ + struct protocol_dcb_data *p_dcb_data; + u8 update_flag; + + update_flag = p_src->arr[DCBX_PROTOCOL_FCOE].update; + p_dest->update_fcoe_dcb_data_mode = update_flag; + + update_flag = p_src->arr[DCBX_PROTOCOL_ROCE].update; + p_dest->update_roce_dcb_data_mode = update_flag; + + update_flag = p_src->arr[DCBX_PROTOCOL_ROCE_V2].update; + p_dest->update_rroce_dcb_data_mode = update_flag; + + update_flag = p_src->arr[DCBX_PROTOCOL_ISCSI].update; + p_dest->update_iscsi_dcb_data_mode = update_flag; + update_flag = p_src->arr[DCBX_PROTOCOL_ETH].update; + p_dest->update_eth_dcb_data_mode = update_flag; + + p_dcb_data = &p_dest->fcoe_dcb_data; + qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_FCOE); + p_dcb_data = &p_dest->roce_dcb_data; + qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_ROCE); + p_dcb_data = &p_dest->rroce_dcb_data; + qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_ROCE_V2); + p_dcb_data = &p_dest->iscsi_dcb_data; + qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_ISCSI); + p_dcb_data = &p_dest->eth_dcb_data; + qed_dcbx_update_protocol_data(p_dcb_data, p_src, DCBX_PROTOCOL_ETH); +} + +#ifdef CONFIG_DCB +static int qed_dcbx_query_params(struct qed_hwfn *p_hwfn, + struct qed_dcbx_get *p_get, + enum qed_mib_read_type type) +{ + struct qed_ptt *p_ptt; + int rc; + + if (IS_VF(p_hwfn->cdev)) + return -EINVAL; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EBUSY; + + rc = qed_dcbx_read_mib(p_hwfn, p_ptt, type); + if (rc) + goto out; + + rc = qed_dcbx_get_params(p_hwfn, p_get, type); + +out: + qed_ptt_release(p_hwfn, p_ptt); + return rc; +} + +static void +qed_dcbx_set_pfc_data(struct qed_hwfn *p_hwfn, + u32 *pfc, struct qed_dcbx_params *p_params) +{ + u8 pfc_map = 0; + int i; + + *pfc &= ~DCBX_PFC_ERROR_MASK; + + if (p_params->pfc.willing) + *pfc |= DCBX_PFC_WILLING_MASK; + else + *pfc &= ~DCBX_PFC_WILLING_MASK; + + if (p_params->pfc.enabled) + *pfc |= DCBX_PFC_ENABLED_MASK; + else + *pfc &= ~DCBX_PFC_ENABLED_MASK; + + *pfc &= ~DCBX_PFC_CAPS_MASK; + *pfc |= (u32)p_params->pfc.max_tc << DCBX_PFC_CAPS_SHIFT; + + for (i = 0; i < QED_MAX_PFC_PRIORITIES; i++) + if (p_params->pfc.prio[i]) + pfc_map |= BIT(i); + + *pfc &= ~DCBX_PFC_PRI_EN_BITMAP_MASK; + *pfc |= (pfc_map << DCBX_PFC_PRI_EN_BITMAP_SHIFT); + + DP_VERBOSE(p_hwfn, QED_MSG_DCB, "pfc = 0x%x\n", *pfc); +} + +static void +qed_dcbx_set_ets_data(struct qed_hwfn *p_hwfn, + 
struct dcbx_ets_feature *p_ets, + struct qed_dcbx_params *p_params) +{ + u8 *bw_map, *tsa_map; + u32 val; + int i; + + if (p_params->ets_willing) + p_ets->flags |= DCBX_ETS_WILLING_MASK; + else + p_ets->flags &= ~DCBX_ETS_WILLING_MASK; + + if (p_params->ets_cbs) + p_ets->flags |= DCBX_ETS_CBS_MASK; + else + p_ets->flags &= ~DCBX_ETS_CBS_MASK; + + if (p_params->ets_enabled) + p_ets->flags |= DCBX_ETS_ENABLED_MASK; + else + p_ets->flags &= ~DCBX_ETS_ENABLED_MASK; + + p_ets->flags &= ~DCBX_ETS_MAX_TCS_MASK; + p_ets->flags |= (u32)p_params->max_ets_tc << DCBX_ETS_MAX_TCS_SHIFT; + + bw_map = (u8 *)&p_ets->tc_bw_tbl[0]; + tsa_map = (u8 *)&p_ets->tc_tsa_tbl[0]; + p_ets->pri_tc_tbl[0] = 0; + for (i = 0; i < QED_MAX_PFC_PRIORITIES; i++) { + bw_map[i] = p_params->ets_tc_bw_tbl[i]; + tsa_map[i] = p_params->ets_tc_tsa_tbl[i]; + /* Copy the priority value to the corresponding 4 bits in the + * traffic class table. + */ + val = (((u32)p_params->ets_pri_tc_tbl[i]) << ((7 - i) * 4)); + p_ets->pri_tc_tbl[0] |= val; + } + for (i = 0; i < 2; i++) { + p_ets->tc_bw_tbl[i] = cpu_to_be32(p_ets->tc_bw_tbl[i]); + p_ets->tc_tsa_tbl[i] = cpu_to_be32(p_ets->tc_tsa_tbl[i]); + } +} + +static void +qed_dcbx_set_app_data(struct qed_hwfn *p_hwfn, + struct dcbx_app_priority_feature *p_app, + struct qed_dcbx_params *p_params, bool ieee) +{ + u32 *entry; + int i; + + if (p_params->app_willing) + p_app->flags |= DCBX_APP_WILLING_MASK; + else + p_app->flags &= ~DCBX_APP_WILLING_MASK; + + if (p_params->app_valid) + p_app->flags |= DCBX_APP_ENABLED_MASK; + else + p_app->flags &= ~DCBX_APP_ENABLED_MASK; + + p_app->flags &= ~DCBX_APP_NUM_ENTRIES_MASK; + p_app->flags |= (u32)p_params->num_app_entries << + DCBX_APP_NUM_ENTRIES_SHIFT; + + for (i = 0; i < DCBX_MAX_APP_PROTOCOL; i++) { + entry = &p_app->app_pri_tbl[i].entry; + *entry = 0; + if (ieee) { + *entry &= ~(DCBX_APP_SF_IEEE_MASK | DCBX_APP_SF_MASK); + switch (p_params->app_entry[i].sf_ieee) { + case QED_DCBX_SF_IEEE_ETHTYPE: + *entry |= ((u32)DCBX_APP_SF_IEEE_ETHTYPE << + DCBX_APP_SF_IEEE_SHIFT); + *entry |= ((u32)DCBX_APP_SF_ETHTYPE << + DCBX_APP_SF_SHIFT); + break; + case QED_DCBX_SF_IEEE_TCP_PORT: + *entry |= ((u32)DCBX_APP_SF_IEEE_TCP_PORT << + DCBX_APP_SF_IEEE_SHIFT); + *entry |= ((u32)DCBX_APP_SF_PORT << + DCBX_APP_SF_SHIFT); + break; + case QED_DCBX_SF_IEEE_UDP_PORT: + *entry |= ((u32)DCBX_APP_SF_IEEE_UDP_PORT << + DCBX_APP_SF_IEEE_SHIFT); + *entry |= ((u32)DCBX_APP_SF_PORT << + DCBX_APP_SF_SHIFT); + break; + case QED_DCBX_SF_IEEE_TCP_UDP_PORT: + *entry |= ((u32)DCBX_APP_SF_IEEE_TCP_UDP_PORT << + DCBX_APP_SF_IEEE_SHIFT); + *entry |= ((u32)DCBX_APP_SF_PORT << + DCBX_APP_SF_SHIFT); + break; + } + } else { + *entry &= ~DCBX_APP_SF_MASK; + if (p_params->app_entry[i].ethtype) + *entry |= ((u32)DCBX_APP_SF_ETHTYPE << + DCBX_APP_SF_SHIFT); + else + *entry |= ((u32)DCBX_APP_SF_PORT << + DCBX_APP_SF_SHIFT); + } + + *entry &= ~DCBX_APP_PROTOCOL_ID_MASK; + *entry |= ((u32)p_params->app_entry[i].proto_id << + DCBX_APP_PROTOCOL_ID_SHIFT); + *entry &= ~DCBX_APP_PRI_MAP_MASK; + *entry |= ((u32)(p_params->app_entry[i].prio) << + DCBX_APP_PRI_MAP_SHIFT); + } +} + +static void +qed_dcbx_set_local_params(struct qed_hwfn *p_hwfn, + struct dcbx_local_params *local_admin, + struct qed_dcbx_set *params) +{ + bool ieee = false; + + local_admin->flags = 0; + memcpy(&local_admin->features, + &p_hwfn->p_dcbx_info->operational.features, + sizeof(local_admin->features)); + + if (params->enabled) { + local_admin->config = params->ver_num; + ieee = !!(params->ver_num & DCBX_CONFIG_VERSION_IEEE); + } 
else { + local_admin->config = DCBX_CONFIG_VERSION_DISABLED; + } + + DP_VERBOSE(p_hwfn, QED_MSG_DCB, "Dcbx version = %d\n", + local_admin->config); + + if (params->override_flags & QED_DCBX_OVERRIDE_PFC_CFG) + qed_dcbx_set_pfc_data(p_hwfn, &local_admin->features.pfc, + ¶ms->config.params); + + if (params->override_flags & QED_DCBX_OVERRIDE_ETS_CFG) + qed_dcbx_set_ets_data(p_hwfn, &local_admin->features.ets, + ¶ms->config.params); + + if (params->override_flags & QED_DCBX_OVERRIDE_APP_CFG) + qed_dcbx_set_app_data(p_hwfn, &local_admin->features.app, + ¶ms->config.params, ieee); +} + +int qed_dcbx_config_params(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + struct qed_dcbx_set *params, bool hw_commit) +{ + struct dcbx_local_params local_admin; + struct qed_dcbx_mib_meta_data data; + u32 resp = 0, param = 0; + int rc = 0; + + if (!hw_commit) { + memcpy(&p_hwfn->p_dcbx_info->set, params, + sizeof(struct qed_dcbx_set)); + return 0; + } + + /* clear set-parmas cache */ + memset(&p_hwfn->p_dcbx_info->set, 0, sizeof(p_hwfn->p_dcbx_info->set)); + + memset(&local_admin, 0, sizeof(local_admin)); + qed_dcbx_set_local_params(p_hwfn, &local_admin, params); + + data.addr = p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, local_admin_dcbx_mib); + data.local_admin = &local_admin; + data.size = sizeof(struct dcbx_local_params); + qed_memcpy_to(p_hwfn, p_ptt, data.addr, data.local_admin, data.size); + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_SET_DCBX, + 1 << DRV_MB_PARAM_LLDP_SEND_SHIFT, &resp, ¶m); + if (rc) + DP_NOTICE(p_hwfn, "Failed to send DCBX update request\n"); + + return rc; +} + +int qed_dcbx_get_config_params(struct qed_hwfn *p_hwfn, + struct qed_dcbx_set *params) +{ + struct qed_dcbx_get *dcbx_info; + int rc; + + if (p_hwfn->p_dcbx_info->set.config.valid) { + memcpy(params, &p_hwfn->p_dcbx_info->set, + sizeof(struct qed_dcbx_set)); + return 0; + } + + dcbx_info = kzalloc(sizeof(*dcbx_info), GFP_KERNEL); + if (!dcbx_info) + return -ENOMEM; + + rc = qed_dcbx_query_params(p_hwfn, dcbx_info, QED_DCBX_OPERATIONAL_MIB); + if (rc) { + kfree(dcbx_info); + return rc; + } + + p_hwfn->p_dcbx_info->set.override_flags = 0; + p_hwfn->p_dcbx_info->set.ver_num = DCBX_CONFIG_VERSION_DISABLED; + if (dcbx_info->operational.cee) + p_hwfn->p_dcbx_info->set.ver_num |= DCBX_CONFIG_VERSION_CEE; + if (dcbx_info->operational.ieee) + p_hwfn->p_dcbx_info->set.ver_num |= DCBX_CONFIG_VERSION_IEEE; + if (dcbx_info->operational.local) + p_hwfn->p_dcbx_info->set.ver_num |= DCBX_CONFIG_VERSION_STATIC; + + p_hwfn->p_dcbx_info->set.enabled = dcbx_info->operational.enabled; + memcpy(&p_hwfn->p_dcbx_info->set.config.params, + &dcbx_info->operational.params, + sizeof(struct qed_dcbx_admin_params)); + p_hwfn->p_dcbx_info->set.config.valid = true; + + memcpy(params, &p_hwfn->p_dcbx_info->set, sizeof(struct qed_dcbx_set)); + + kfree(dcbx_info); + + return 0; +} + +static struct qed_dcbx_get *qed_dcbnl_get_dcbx(struct qed_hwfn *hwfn, + enum qed_mib_read_type type) +{ + struct qed_dcbx_get *dcbx_info; + + dcbx_info = kzalloc(sizeof(*dcbx_info), GFP_ATOMIC); + if (!dcbx_info) + return NULL; + + if (qed_dcbx_query_params(hwfn, dcbx_info, type)) { + kfree(dcbx_info); + return NULL; + } + + if ((type == QED_DCBX_OPERATIONAL_MIB) && + !dcbx_info->operational.enabled) { + DP_INFO(hwfn, "DCBX is not enabled/operational\n"); + kfree(dcbx_info); + return NULL; + } + + return dcbx_info; +} + +static u8 qed_dcbnl_getstate(struct qed_dev *cdev) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + 
bool enabled; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return 0; + + enabled = dcbx_info->operational.enabled; + DP_VERBOSE(hwfn, QED_MSG_DCB, "DCB state = %d\n", enabled); + kfree(dcbx_info); + + return enabled; +} + +static u8 qed_dcbnl_setstate(struct qed_dev *cdev, u8 state) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_set dcbx_set; + struct qed_ptt *ptt; + int rc; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "DCB state = %d\n", state); + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return 1; + + dcbx_set.enabled = !!state; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return 1; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); + + return rc ? 1 : 0; +} + +static void qed_dcbnl_getpgtccfgtx(struct qed_dev *cdev, int tc, u8 *prio_type, + u8 *pgid, u8 *bw_pct, u8 *up_map) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "tc = %d\n", tc); + *prio_type = *pgid = *bw_pct = *up_map = 0; + if (tc < 0 || tc >= QED_MAX_PFC_PRIORITIES) { + DP_INFO(hwfn, "Invalid tc %d\n", tc); + return; + } + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return; + + *pgid = dcbx_info->operational.params.ets_pri_tc_tbl[tc]; + kfree(dcbx_info); +} + +static void qed_dcbnl_getpgbwgcfgtx(struct qed_dev *cdev, int pgid, u8 *bw_pct) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + + *bw_pct = 0; + DP_VERBOSE(hwfn, QED_MSG_DCB, "pgid = %d\n", pgid); + if (pgid < 0 || pgid >= QED_MAX_PFC_PRIORITIES) { + DP_INFO(hwfn, "Invalid pgid %d\n", pgid); + return; + } + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return; + + *bw_pct = dcbx_info->operational.params.ets_tc_bw_tbl[pgid]; + DP_VERBOSE(hwfn, QED_MSG_DCB, "bw_pct = %d\n", *bw_pct); + kfree(dcbx_info); +} + +static void qed_dcbnl_getpgtccfgrx(struct qed_dev *cdev, int tc, u8 *prio, + u8 *bwg_id, u8 *bw_pct, u8 *up_map) +{ + DP_INFO(QED_LEADING_HWFN(cdev), "Rx ETS is not supported\n"); + *prio = *bwg_id = *bw_pct = *up_map = 0; +} + +static void qed_dcbnl_getpgbwgcfgrx(struct qed_dev *cdev, + int bwg_id, u8 *bw_pct) +{ + DP_INFO(QED_LEADING_HWFN(cdev), "Rx ETS is not supported\n"); + *bw_pct = 0; +} + +static void qed_dcbnl_getpfccfg(struct qed_dev *cdev, + int priority, u8 *setting) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "priority = %d\n", priority); + if (priority < 0 || priority >= QED_MAX_PFC_PRIORITIES) { + DP_INFO(hwfn, "Invalid priority %d\n", priority); + return; + } + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return; + + *setting = dcbx_info->operational.params.pfc.prio[priority]; + DP_VERBOSE(hwfn, QED_MSG_DCB, "setting = %d\n", *setting); + kfree(dcbx_info); +} + +static void qed_dcbnl_setpfccfg(struct qed_dev *cdev, int priority, u8 setting) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_set dcbx_set; + struct qed_ptt *ptt; + int rc; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "priority = %d setting = %d\n", + priority, setting); + if (priority < 0 || priority >= QED_MAX_PFC_PRIORITIES) { + DP_INFO(hwfn, "Invalid priority %d\n", priority); + return; + } + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return; + + 
dcbx_set.override_flags |= QED_DCBX_OVERRIDE_PFC_CFG; + dcbx_set.config.params.pfc.prio[priority] = !!setting; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); +} + +static u8 qed_dcbnl_getcap(struct qed_dev *cdev, int capid, u8 *cap) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + int rc = 0; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "capid = %d\n", capid); + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return 1; + + switch (capid) { + case DCB_CAP_ATTR_PG: + case DCB_CAP_ATTR_PFC: + case DCB_CAP_ATTR_UP2TC: + case DCB_CAP_ATTR_GSP: + *cap = true; + break; + case DCB_CAP_ATTR_PG_TCS: + case DCB_CAP_ATTR_PFC_TCS: + *cap = 0x80; + break; + case DCB_CAP_ATTR_DCBX: + *cap = (DCB_CAP_DCBX_LLD_MANAGED | DCB_CAP_DCBX_VER_CEE | + DCB_CAP_DCBX_VER_IEEE | DCB_CAP_DCBX_STATIC); + break; + default: + *cap = false; + rc = 1; + } + + DP_VERBOSE(hwfn, QED_MSG_DCB, "id = %d caps = %d\n", capid, *cap); + kfree(dcbx_info); + + return rc; +} + +static int qed_dcbnl_getnumtcs(struct qed_dev *cdev, int tcid, u8 *num) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + int rc = 0; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "tcid = %d\n", tcid); + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return -EINVAL; + + switch (tcid) { + case DCB_NUMTCS_ATTR_PG: + *num = dcbx_info->operational.params.max_ets_tc; + break; + case DCB_NUMTCS_ATTR_PFC: + *num = dcbx_info->operational.params.pfc.max_tc; + break; + default: + rc = -EINVAL; + } + + kfree(dcbx_info); + DP_VERBOSE(hwfn, QED_MSG_DCB, "numtcs = %d\n", *num); + + return rc; +} + +static u8 qed_dcbnl_getpfcstate(struct qed_dev *cdev) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + bool enabled; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return 0; + + enabled = dcbx_info->operational.params.pfc.enabled; + DP_VERBOSE(hwfn, QED_MSG_DCB, "pfc state = %d\n", enabled); + kfree(dcbx_info); + + return enabled; +} + +static u8 qed_dcbnl_getdcbx(struct qed_dev *cdev) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + u8 mode = 0; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return 0; + + if (dcbx_info->operational.enabled) + mode |= DCB_CAP_DCBX_LLD_MANAGED; + if (dcbx_info->operational.ieee) + mode |= DCB_CAP_DCBX_VER_IEEE; + if (dcbx_info->operational.cee) + mode |= DCB_CAP_DCBX_VER_CEE; + if (dcbx_info->operational.local) + mode |= DCB_CAP_DCBX_STATIC; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "dcb mode = %d\n", mode); + kfree(dcbx_info); + + return mode; +} + +static void qed_dcbnl_setpgtccfgtx(struct qed_dev *cdev, + int tc, + u8 pri_type, u8 pgid, u8 bw_pct, u8 up_map) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_set dcbx_set; + struct qed_ptt *ptt; + int rc; + + DP_VERBOSE(hwfn, QED_MSG_DCB, + "tc = %d pri_type = %d pgid = %d bw_pct = %d up_map = %d\n", + tc, pri_type, pgid, bw_pct, up_map); + + if (tc < 0 || tc >= QED_MAX_PFC_PRIORITIES) { + DP_INFO(hwfn, "Invalid tc %d\n", tc); + return; + } + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return; + + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_ETS_CFG; + dcbx_set.config.params.ets_pri_tc_tbl[tc] = pgid; + + ptt = qed_ptt_acquire(hwfn); + if 
(!ptt) + return; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); +} + +static void qed_dcbnl_setpgtccfgrx(struct qed_dev *cdev, int prio, + u8 pri_type, u8 pgid, u8 bw_pct, u8 up_map) +{ + DP_INFO(QED_LEADING_HWFN(cdev), "Rx ETS is not supported\n"); +} + +static void qed_dcbnl_setpgbwgcfgtx(struct qed_dev *cdev, int pgid, u8 bw_pct) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_set dcbx_set; + struct qed_ptt *ptt; + int rc; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "pgid = %d bw_pct = %d\n", pgid, bw_pct); + if (pgid < 0 || pgid >= QED_MAX_PFC_PRIORITIES) { + DP_INFO(hwfn, "Invalid pgid %d\n", pgid); + return; + } + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return; + + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_ETS_CFG; + dcbx_set.config.params.ets_tc_bw_tbl[pgid] = bw_pct; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); +} + +static void qed_dcbnl_setpgbwgcfgrx(struct qed_dev *cdev, int pgid, u8 bw_pct) +{ + DP_INFO(QED_LEADING_HWFN(cdev), "Rx ETS is not supported\n"); +} + +static u8 qed_dcbnl_setall(struct qed_dev *cdev) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_set dcbx_set; + struct qed_ptt *ptt; + int rc; + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return 1; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return 1; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 1); + + qed_ptt_release(hwfn, ptt); + + return rc; +} + +static int qed_dcbnl_setnumtcs(struct qed_dev *cdev, int tcid, u8 num) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_set dcbx_set; + struct qed_ptt *ptt; + int rc; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "tcid = %d num = %d\n", tcid, num); + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return 1; + + switch (tcid) { + case DCB_NUMTCS_ATTR_PG: + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_ETS_CFG; + dcbx_set.config.params.max_ets_tc = num; + break; + case DCB_NUMTCS_ATTR_PFC: + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_PFC_CFG; + dcbx_set.config.params.pfc.max_tc = num; + break; + default: + DP_INFO(hwfn, "Invalid tcid %d\n", tcid); + return -EINVAL; + } + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EINVAL; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); + + return 0; +} + +static void qed_dcbnl_setpfcstate(struct qed_dev *cdev, u8 state) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_set dcbx_set; + struct qed_ptt *ptt; + int rc; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "new state = %d\n", state); + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return; + + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_PFC_CFG; + dcbx_set.config.params.pfc.enabled = !!state; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); +} + +static int qed_dcbnl_getapp(struct qed_dev *cdev, u8 idtype, u16 idval) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + struct qed_app_entry *entry; + bool ethtype; + u8 prio = 0; + int i; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return -EINVAL; + + ethtype = 
!!(idtype == DCB_APP_IDTYPE_ETHTYPE); + for (i = 0; i < QED_DCBX_MAX_APP_PROTOCOL; i++) { + entry = &dcbx_info->operational.params.app_entry[i]; + if ((entry->ethtype == ethtype) && (entry->proto_id == idval)) { + prio = entry->prio; + break; + } + } + + if (i == QED_DCBX_MAX_APP_PROTOCOL) { + DP_ERR(cdev, "App entry (%d, %d) not found\n", idtype, idval); + kfree(dcbx_info); + return -EINVAL; + } + + kfree(dcbx_info); + + return prio; +} + +static int qed_dcbnl_setapp(struct qed_dev *cdev, + u8 idtype, u16 idval, u8 pri_map) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_set dcbx_set; + struct qed_app_entry *entry; + struct qed_ptt *ptt; + bool ethtype; + int rc, i; + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return -EINVAL; + + ethtype = !!(idtype == DCB_APP_IDTYPE_ETHTYPE); + for (i = 0; i < QED_DCBX_MAX_APP_PROTOCOL; i++) { + entry = &dcbx_set.config.params.app_entry[i]; + if ((entry->ethtype == ethtype) && (entry->proto_id == idval)) + break; + /* First empty slot */ + if (!entry->proto_id) { + dcbx_set.config.params.num_app_entries++; + break; + } + } + + if (i == QED_DCBX_MAX_APP_PROTOCOL) { + DP_ERR(cdev, "App table is full\n"); + return -EBUSY; + } + + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_APP_CFG; + dcbx_set.config.params.app_entry[i].ethtype = ethtype; + dcbx_set.config.params.app_entry[i].proto_id = idval; + dcbx_set.config.params.app_entry[i].prio = pri_map; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EBUSY; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); + + return rc; +} + +static u8 qed_dcbnl_setdcbx(struct qed_dev *cdev, u8 mode) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_set dcbx_set; + struct qed_ptt *ptt; + int rc; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "new mode = %x\n", mode); + + if (!(mode & DCB_CAP_DCBX_VER_IEEE) && + !(mode & DCB_CAP_DCBX_VER_CEE) && !(mode & DCB_CAP_DCBX_STATIC)) { + DP_INFO(hwfn, "Allowed modes are cee, ieee or static\n"); + return 1; + } + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return 1; + + dcbx_set.ver_num = 0; + if (mode & DCB_CAP_DCBX_VER_CEE) { + dcbx_set.ver_num |= DCBX_CONFIG_VERSION_CEE; + dcbx_set.enabled = true; + } + + if (mode & DCB_CAP_DCBX_VER_IEEE) { + dcbx_set.ver_num |= DCBX_CONFIG_VERSION_IEEE; + dcbx_set.enabled = true; + } + + if (mode & DCB_CAP_DCBX_STATIC) { + dcbx_set.ver_num |= DCBX_CONFIG_VERSION_STATIC; + dcbx_set.enabled = true; + } + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return 1; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); + + return rc; +} + +static u8 qed_dcbnl_getfeatcfg(struct qed_dev *cdev, int featid, u8 *flags) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "Feature id = %d\n", featid); + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return 1; + + *flags = 0; + switch (featid) { + case DCB_FEATCFG_ATTR_PG: + if (dcbx_info->operational.params.ets_enabled) + *flags = DCB_FEATCFG_ENABLE; + else + *flags = DCB_FEATCFG_ERROR; + break; + case DCB_FEATCFG_ATTR_PFC: + if (dcbx_info->operational.params.pfc.enabled) + *flags = DCB_FEATCFG_ENABLE; + else + *flags = DCB_FEATCFG_ERROR; + break; + case DCB_FEATCFG_ATTR_APP: + if (dcbx_info->operational.params.app_valid) + *flags = DCB_FEATCFG_ENABLE; + else + *flags = 
DCB_FEATCFG_ERROR; + break; + default: + DP_INFO(hwfn, "Invalid feature-ID %d\n", featid); + kfree(dcbx_info); + return 1; + } + + DP_VERBOSE(hwfn, QED_MSG_DCB, "flags = %d\n", *flags); + kfree(dcbx_info); + + return 0; +} + +static u8 qed_dcbnl_setfeatcfg(struct qed_dev *cdev, int featid, u8 flags) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_set dcbx_set; + bool enabled, willing; + struct qed_ptt *ptt; + int rc; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "featid = %d flags = %d\n", + featid, flags); + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return 1; + + enabled = !!(flags & DCB_FEATCFG_ENABLE); + willing = !!(flags & DCB_FEATCFG_WILLING); + switch (featid) { + case DCB_FEATCFG_ATTR_PG: + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_ETS_CFG; + dcbx_set.config.params.ets_enabled = enabled; + dcbx_set.config.params.ets_willing = willing; + break; + case DCB_FEATCFG_ATTR_PFC: + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_PFC_CFG; + dcbx_set.config.params.pfc.enabled = enabled; + dcbx_set.config.params.pfc.willing = willing; + break; + case DCB_FEATCFG_ATTR_APP: + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_APP_CFG; + dcbx_set.config.params.app_willing = willing; + break; + default: + DP_INFO(hwfn, "Invalid feature-ID %d\n", featid); + return 1; + } + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return 1; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); + + return 0; +} + +static int qed_dcbnl_peer_getappinfo(struct qed_dev *cdev, + struct dcb_peer_app_info *info, + u16 *app_count) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_REMOTE_MIB); + if (!dcbx_info) + return -EINVAL; + + info->willing = dcbx_info->remote.params.app_willing; + info->error = dcbx_info->remote.params.app_error; + *app_count = dcbx_info->remote.params.num_app_entries; + kfree(dcbx_info); + + return 0; +} + +static int qed_dcbnl_peer_getapptable(struct qed_dev *cdev, + struct dcb_app *table) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + int i; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_REMOTE_MIB); + if (!dcbx_info) + return -EINVAL; + + for (i = 0; i < dcbx_info->remote.params.num_app_entries; i++) { + if (dcbx_info->remote.params.app_entry[i].ethtype) + table[i].selector = DCB_APP_IDTYPE_ETHTYPE; + else + table[i].selector = DCB_APP_IDTYPE_PORTNUM; + table[i].priority = dcbx_info->remote.params.app_entry[i].prio; + table[i].protocol = + dcbx_info->remote.params.app_entry[i].proto_id; + } + + kfree(dcbx_info); + + return 0; +} + +static int qed_dcbnl_cee_peer_getpfc(struct qed_dev *cdev, struct cee_pfc *pfc) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + int i; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_REMOTE_MIB); + if (!dcbx_info) + return -EINVAL; + + for (i = 0; i < QED_MAX_PFC_PRIORITIES; i++) + if (dcbx_info->remote.params.pfc.prio[i]) + pfc->pfc_en |= BIT(i); + + pfc->tcs_supported = dcbx_info->remote.params.pfc.max_tc; + DP_VERBOSE(hwfn, QED_MSG_DCB, "pfc state = %d tcs_supported = %d\n", + pfc->pfc_en, pfc->tcs_supported); + kfree(dcbx_info); + + return 0; +} + +static int qed_dcbnl_cee_peer_getpg(struct qed_dev *cdev, struct cee_pg *pg) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + int i; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_REMOTE_MIB); + if 
(!dcbx_info) + return -EINVAL; + + pg->willing = dcbx_info->remote.params.ets_willing; + for (i = 0; i < QED_MAX_PFC_PRIORITIES; i++) { + pg->pg_bw[i] = dcbx_info->remote.params.ets_tc_bw_tbl[i]; + pg->prio_pg[i] = dcbx_info->remote.params.ets_pri_tc_tbl[i]; + } + + DP_VERBOSE(hwfn, QED_MSG_DCB, "willing = %d", pg->willing); + kfree(dcbx_info); + + return 0; +} + +static int qed_dcbnl_get_ieee_pfc(struct qed_dev *cdev, + struct ieee_pfc *pfc, bool remote) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_params *params; + struct qed_dcbx_get *dcbx_info; + int rc, i; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return -EINVAL; + + if (!dcbx_info->operational.ieee) { + DP_INFO(hwfn, "DCBX is not enabled/operational in IEEE mode\n"); + kfree(dcbx_info); + return -EINVAL; + } + + if (remote) { + memset(dcbx_info, 0, sizeof(*dcbx_info)); + rc = qed_dcbx_query_params(hwfn, dcbx_info, + QED_DCBX_REMOTE_MIB); + if (rc) { + kfree(dcbx_info); + return -EINVAL; + } + + params = &dcbx_info->remote.params; + } else { + params = &dcbx_info->operational.params; + } + + pfc->pfc_cap = params->pfc.max_tc; + pfc->pfc_en = 0; + for (i = 0; i < QED_MAX_PFC_PRIORITIES; i++) + if (params->pfc.prio[i]) + pfc->pfc_en |= BIT(i); + + kfree(dcbx_info); + + return 0; +} + +static int qed_dcbnl_ieee_getpfc(struct qed_dev *cdev, struct ieee_pfc *pfc) +{ + return qed_dcbnl_get_ieee_pfc(cdev, pfc, false); +} + +static int qed_dcbnl_ieee_setpfc(struct qed_dev *cdev, struct ieee_pfc *pfc) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + struct qed_dcbx_set dcbx_set; + struct qed_ptt *ptt; + int rc, i; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return -EINVAL; + + if (!dcbx_info->operational.ieee) { + DP_INFO(hwfn, "DCBX is not enabled/operational in IEEE mode\n"); + kfree(dcbx_info); + return -EINVAL; + } + + kfree(dcbx_info); + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return -EINVAL; + + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_PFC_CFG; + for (i = 0; i < QED_MAX_PFC_PRIORITIES; i++) + dcbx_set.config.params.pfc.prio[i] = !!(pfc->pfc_en & BIT(i)); + + dcbx_set.config.params.pfc.max_tc = pfc->pfc_cap; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EINVAL; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); + + return rc; +} + +static int qed_dcbnl_get_ieee_ets(struct qed_dev *cdev, + struct ieee_ets *ets, bool remote) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + struct qed_dcbx_params *params; + int rc; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return -EINVAL; + + if (!dcbx_info->operational.ieee) { + DP_INFO(hwfn, "DCBX is not enabled/operational in IEEE mode\n"); + kfree(dcbx_info); + return -EINVAL; + } + + if (remote) { + memset(dcbx_info, 0, sizeof(*dcbx_info)); + rc = qed_dcbx_query_params(hwfn, dcbx_info, + QED_DCBX_REMOTE_MIB); + if (rc) { + kfree(dcbx_info); + return -EINVAL; + } + + params = &dcbx_info->remote.params; + } else { + params = &dcbx_info->operational.params; + } + + ets->ets_cap = params->max_ets_tc; + ets->willing = params->ets_willing; + ets->cbs = params->ets_cbs; + memcpy(ets->tc_tx_bw, params->ets_tc_bw_tbl, sizeof(ets->tc_tx_bw)); + memcpy(ets->tc_tsa, params->ets_tc_tsa_tbl, sizeof(ets->tc_tsa)); + memcpy(ets->prio_tc, params->ets_pri_tc_tbl, 
sizeof(ets->prio_tc)); + kfree(dcbx_info); + + return 0; +} + +static int qed_dcbnl_ieee_getets(struct qed_dev *cdev, struct ieee_ets *ets) +{ + return qed_dcbnl_get_ieee_ets(cdev, ets, false); +} + +static int qed_dcbnl_ieee_setets(struct qed_dev *cdev, struct ieee_ets *ets) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + struct qed_dcbx_set dcbx_set; + struct qed_ptt *ptt; + int rc; + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return -EINVAL; + + if (!dcbx_info->operational.ieee) { + DP_INFO(hwfn, "DCBX is not enabled/operational in IEEE mode\n"); + kfree(dcbx_info); + return -EINVAL; + } + + kfree(dcbx_info); + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return -EINVAL; + + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_ETS_CFG; + dcbx_set.config.params.max_ets_tc = ets->ets_cap; + dcbx_set.config.params.ets_willing = ets->willing; + dcbx_set.config.params.ets_cbs = ets->cbs; + memcpy(dcbx_set.config.params.ets_tc_bw_tbl, ets->tc_tx_bw, + sizeof(ets->tc_tx_bw)); + memcpy(dcbx_set.config.params.ets_tc_tsa_tbl, ets->tc_tsa, + sizeof(ets->tc_tsa)); + memcpy(dcbx_set.config.params.ets_pri_tc_tbl, ets->prio_tc, + sizeof(ets->prio_tc)); + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EINVAL; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); + + return rc; +} + +static int +qed_dcbnl_ieee_peer_getets(struct qed_dev *cdev, struct ieee_ets *ets) +{ + return qed_dcbnl_get_ieee_ets(cdev, ets, true); +} + +static int +qed_dcbnl_ieee_peer_getpfc(struct qed_dev *cdev, struct ieee_pfc *pfc) +{ + return qed_dcbnl_get_ieee_pfc(cdev, pfc, true); +} + +static int qed_get_sf_ieee_value(u8 selector, u8 *sf_ieee) +{ + switch (selector) { + case IEEE_8021QAZ_APP_SEL_ETHERTYPE: + *sf_ieee = QED_DCBX_SF_IEEE_ETHTYPE; + break; + case IEEE_8021QAZ_APP_SEL_STREAM: + *sf_ieee = QED_DCBX_SF_IEEE_TCP_PORT; + break; + case IEEE_8021QAZ_APP_SEL_DGRAM: + *sf_ieee = QED_DCBX_SF_IEEE_UDP_PORT; + break; + case IEEE_8021QAZ_APP_SEL_ANY: + *sf_ieee = QED_DCBX_SF_IEEE_TCP_UDP_PORT; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int qed_dcbnl_ieee_getapp(struct qed_dev *cdev, struct dcb_app *app) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + struct qed_app_entry *entry; + u8 prio = 0; + u8 sf_ieee; + int i; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "selector = %d protocol = %d\n", + app->selector, app->protocol); + + if (qed_get_sf_ieee_value(app->selector, &sf_ieee)) { + DP_INFO(cdev, "Invalid selector field value %d\n", + app->selector); + return -EINVAL; + } + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return -EINVAL; + + if (!dcbx_info->operational.ieee) { + DP_INFO(hwfn, "DCBX is not enabled/operational in IEEE mode\n"); + kfree(dcbx_info); + return -EINVAL; + } + + for (i = 0; i < QED_DCBX_MAX_APP_PROTOCOL; i++) { + entry = &dcbx_info->operational.params.app_entry[i]; + if ((entry->sf_ieee == sf_ieee) && + (entry->proto_id == app->protocol)) { + prio = entry->prio; + break; + } + } + + if (i == QED_DCBX_MAX_APP_PROTOCOL) { + DP_ERR(cdev, "App entry (%d, %d) not found\n", app->selector, + app->protocol); + kfree(dcbx_info); + return -EINVAL; + } + + app->priority = ffs(prio) - 1; + + kfree(dcbx_info); + + return 0; +} + +static int qed_dcbnl_ieee_setapp(struct qed_dev *cdev, struct dcb_app *app) +{ + struct qed_hwfn *hwfn = 
QED_LEADING_HWFN(cdev); + struct qed_dcbx_get *dcbx_info; + struct qed_dcbx_set dcbx_set; + struct qed_app_entry *entry; + struct qed_ptt *ptt; + u8 sf_ieee; + int rc, i; + + DP_VERBOSE(hwfn, QED_MSG_DCB, "selector = %d protocol = %d pri = %d\n", + app->selector, app->protocol, app->priority); + if (app->priority >= QED_MAX_PFC_PRIORITIES) { + DP_INFO(hwfn, "Invalid priority %d\n", app->priority); + return -EINVAL; + } + + if (qed_get_sf_ieee_value(app->selector, &sf_ieee)) { + DP_INFO(cdev, "Invalid selector field value %d\n", + app->selector); + return -EINVAL; + } + + dcbx_info = qed_dcbnl_get_dcbx(hwfn, QED_DCBX_OPERATIONAL_MIB); + if (!dcbx_info) + return -EINVAL; + + if (!dcbx_info->operational.ieee) { + DP_INFO(hwfn, "DCBX is not enabled/operational in IEEE mode\n"); + kfree(dcbx_info); + return -EINVAL; + } + + kfree(dcbx_info); + + memset(&dcbx_set, 0, sizeof(dcbx_set)); + rc = qed_dcbx_get_config_params(hwfn, &dcbx_set); + if (rc) + return -EINVAL; + + for (i = 0; i < QED_DCBX_MAX_APP_PROTOCOL; i++) { + entry = &dcbx_set.config.params.app_entry[i]; + if ((entry->sf_ieee == sf_ieee) && + (entry->proto_id == app->protocol)) + break; + /* First empty slot */ + if (!entry->proto_id) { + dcbx_set.config.params.num_app_entries++; + break; + } + } + + if (i == QED_DCBX_MAX_APP_PROTOCOL) { + DP_ERR(cdev, "App table is full\n"); + return -EBUSY; + } + + dcbx_set.override_flags |= QED_DCBX_OVERRIDE_APP_CFG; + dcbx_set.config.params.app_entry[i].sf_ieee = sf_ieee; + dcbx_set.config.params.app_entry[i].proto_id = app->protocol; + dcbx_set.config.params.app_entry[i].prio = BIT(app->priority); + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EBUSY; + + rc = qed_dcbx_config_params(hwfn, ptt, &dcbx_set, 0); + + qed_ptt_release(hwfn, ptt); + + return rc; +} + +const struct qed_eth_dcbnl_ops qed_dcbnl_ops_pass = { + .getstate = qed_dcbnl_getstate, + .setstate = qed_dcbnl_setstate, + .getpgtccfgtx = qed_dcbnl_getpgtccfgtx, + .getpgbwgcfgtx = qed_dcbnl_getpgbwgcfgtx, + .getpgtccfgrx = qed_dcbnl_getpgtccfgrx, + .getpgbwgcfgrx = qed_dcbnl_getpgbwgcfgrx, + .getpfccfg = qed_dcbnl_getpfccfg, + .setpfccfg = qed_dcbnl_setpfccfg, + .getcap = qed_dcbnl_getcap, + .getnumtcs = qed_dcbnl_getnumtcs, + .getpfcstate = qed_dcbnl_getpfcstate, + .getdcbx = qed_dcbnl_getdcbx, + .setpgtccfgtx = qed_dcbnl_setpgtccfgtx, + .setpgtccfgrx = qed_dcbnl_setpgtccfgrx, + .setpgbwgcfgtx = qed_dcbnl_setpgbwgcfgtx, + .setpgbwgcfgrx = qed_dcbnl_setpgbwgcfgrx, + .setall = qed_dcbnl_setall, + .setnumtcs = qed_dcbnl_setnumtcs, + .setpfcstate = qed_dcbnl_setpfcstate, + .setapp = qed_dcbnl_setapp, + .setdcbx = qed_dcbnl_setdcbx, + .setfeatcfg = qed_dcbnl_setfeatcfg, + .getfeatcfg = qed_dcbnl_getfeatcfg, + .getapp = qed_dcbnl_getapp, + .peer_getappinfo = qed_dcbnl_peer_getappinfo, + .peer_getapptable = qed_dcbnl_peer_getapptable, + .cee_peer_getpfc = qed_dcbnl_cee_peer_getpfc, + .cee_peer_getpg = qed_dcbnl_cee_peer_getpg, + .ieee_getpfc = qed_dcbnl_ieee_getpfc, + .ieee_setpfc = qed_dcbnl_ieee_setpfc, + .ieee_getets = qed_dcbnl_ieee_getets, + .ieee_setets = qed_dcbnl_ieee_setets, + .ieee_peer_getpfc = qed_dcbnl_ieee_peer_getpfc, + .ieee_peer_getets = qed_dcbnl_ieee_peer_getets, + .ieee_getapp = qed_dcbnl_ieee_getapp, + .ieee_setapp = qed_dcbnl_ieee_setapp, +}; + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h new file mode 100644 index 0000000..5feb90e --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h @@ -0,0 +1,126 @@ +/* QLogic qed NIC Driver + * Copyright 
(c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_DCBX_H +#define _QED_DCBX_H +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" + +#define DCBX_CONFIG_MAX_APP_PROTOCOL 4 + +enum qed_mib_read_type { + QED_DCBX_OPERATIONAL_MIB, + QED_DCBX_REMOTE_MIB, + QED_DCBX_LOCAL_MIB, + QED_DCBX_REMOTE_LLDP_MIB, + QED_DCBX_LOCAL_LLDP_MIB +}; + +struct qed_dcbx_app_data { + bool enable; /* DCB enabled */ + u8 update; /* Update indication */ + u8 priority; /* Priority */ + u8 tc; /* Traffic Class */ +}; + +#define QED_DCBX_VERSION_DISABLED 0 +#define QED_DCBX_VERSION_IEEE 1 +#define QED_DCBX_VERSION_CEE 2 + +struct qed_dcbx_set { +#define QED_DCBX_OVERRIDE_STATE BIT(0) +#define QED_DCBX_OVERRIDE_PFC_CFG BIT(1) +#define QED_DCBX_OVERRIDE_ETS_CFG BIT(2) +#define QED_DCBX_OVERRIDE_APP_CFG BIT(3) +#define QED_DCBX_OVERRIDE_DSCP_CFG BIT(4) + u32 override_flags; + bool enabled; + struct qed_dcbx_admin_params config; + u32 ver_num; +}; + +struct qed_dcbx_results { + bool dcbx_enabled; + u8 pf_id; + struct qed_dcbx_app_data arr[DCBX_MAX_PROTOCOL_TYPE]; +}; + +struct qed_dcbx_app_metadata { + enum dcbx_protocol_type id; + char *name; + enum qed_pci_personality personality; +}; + +struct qed_dcbx_info { + struct lldp_status_params_s lldp_remote[LLDP_MAX_LLDP_AGENTS]; + struct lldp_config_params_s lldp_local[LLDP_MAX_LLDP_AGENTS]; + struct dcbx_local_params local_admin; + struct qed_dcbx_results results; + struct dcbx_mib operational; + struct dcbx_mib remote; + struct qed_dcbx_set set; + struct qed_dcbx_get get; + u8 dcbx_cap; +}; + +struct qed_dcbx_mib_meta_data { + struct lldp_config_params_s *lldp_local; + struct lldp_status_params_s *lldp_remote; + struct dcbx_local_params *local_admin; + struct dcbx_mib *mib; + size_t size; + u32 addr; +}; + +#ifdef CONFIG_DCB +int qed_dcbx_get_config_params(struct qed_hwfn *, struct qed_dcbx_set *); + +int qed_dcbx_config_params(struct qed_hwfn *, + struct qed_ptt *, struct qed_dcbx_set *, bool); +#endif + +/* QED local interface routines */ +int +qed_dcbx_mib_update_event(struct qed_hwfn *, + struct qed_ptt *, enum qed_mib_read_type); + +int qed_dcbx_info_alloc(struct qed_hwfn *p_hwfn); 
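+/* Releases the per-hwfn DCBX state allocated by qed_dcbx_info_alloc() */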
+void qed_dcbx_info_free(struct qed_hwfn *p_hwfn); +void qed_dcbx_set_pf_update_params(struct qed_dcbx_results *p_src, + struct pf_update_ramrod_data *p_dest); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c new file mode 100644 index 0000000..4926c55 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -0,0 +1,8099 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" + +/* Memory groups enum */ +enum mem_groups { + MEM_GROUP_PXP_MEM, + MEM_GROUP_DMAE_MEM, + MEM_GROUP_CM_MEM, + MEM_GROUP_QM_MEM, + MEM_GROUP_DORQ_MEM, + MEM_GROUP_BRB_RAM, + MEM_GROUP_BRB_MEM, + MEM_GROUP_PRS_MEM, + MEM_GROUP_IOR, + MEM_GROUP_BTB_RAM, + MEM_GROUP_CONN_CFC_MEM, + MEM_GROUP_TASK_CFC_MEM, + MEM_GROUP_CAU_PI, + MEM_GROUP_CAU_MEM, + MEM_GROUP_PXP_ILT, + MEM_GROUP_TM_MEM, + MEM_GROUP_SDM_MEM, + MEM_GROUP_PBUF, + MEM_GROUP_RAM, + MEM_GROUP_MULD_MEM, + MEM_GROUP_BTB_MEM, + MEM_GROUP_RDIF_CTX, + MEM_GROUP_TDIF_CTX, + MEM_GROUP_CFC_MEM, + MEM_GROUP_IGU_MEM, + MEM_GROUP_IGU_MSIX, + MEM_GROUP_CAU_SB, + MEM_GROUP_BMB_RAM, + MEM_GROUP_BMB_MEM, + MEM_GROUPS_NUM +}; + +/* Memory groups names */ +static const char * const s_mem_group_names[] = { + "PXP_MEM", + "DMAE_MEM", + "CM_MEM", + "QM_MEM", + "DORQ_MEM", + "BRB_RAM", + "BRB_MEM", + "PRS_MEM", + "IOR", + "BTB_RAM", + "CONN_CFC_MEM", + "TASK_CFC_MEM", + "CAU_PI", + "CAU_MEM", + "PXP_ILT", + "TM_MEM", + "SDM_MEM", + "PBUF", + "RAM", + "MULD_MEM", + "BTB_MEM", + "RDIF_CTX", + "TDIF_CTX", + "CFC_MEM", + "IGU_MEM", + "IGU_MSIX", + "CAU_SB", + "BMB_RAM", + "BMB_MEM", +}; + +/* Idle check conditions */ + +static u32 cond5(const u32 *r, const u32 *imm) +{ + return ((r[0] & imm[0]) != imm[1]) && ((r[1] & imm[2]) != imm[3]); +} + +static u32 cond7(const u32 *r, const u32 *imm) +{ + return ((r[0] >> imm[0]) & imm[1]) != imm[2]; +} + +static u32 cond6(const u32 *r, const u32 *imm) +{ + return (r[0] & imm[0]) != imm[1]; +} + +static u32 cond9(const u32 *r, const u32 *imm) +{ + return ((r[0] & imm[0]) >> imm[1]) != + (((r[0] & imm[2]) >> imm[3]) | ((r[1] & imm[4]) << imm[5])); +} + +static u32 cond10(const u32 *r, const u32 *imm) +{ + return ((r[0] & imm[0]) >> imm[1]) != (r[0] & imm[2]); +} + +static u32 cond4(const u32 *r, const u32 *imm) +{ + return (r[0] & ~imm[0]) != imm[1]; +} + +static u32 cond0(const u32 *r, const u32 *imm) +{ + return (r[0] & ~r[1]) != imm[0]; +} + +static u32 cond1(const u32 *r, const u32 *imm) +{ + return r[0] != imm[0]; +} + +static u32 cond11(const u32 *r, const u32 *imm) +{ + return r[0] != r[1] && r[2] == imm[0]; +} + +static u32 cond12(const u32 *r, const u32 *imm) +{ + return r[0] != r[1] && r[2] > imm[0]; +} + +static u32 cond3(const u32 *r, const u32 *imm) +{ + return r[0] != r[1]; +} + +static u32 cond13(const u32 *r, const u32 *imm) +{ + return r[0] & imm[0]; +} + +static u32 cond8(const u32 *r, const u32 *imm) +{ + return r[0] < (r[1] - imm[0]); +} + +static u32 cond2(const u32 *r, const u32 *imm) +{ + return r[0] > imm[0]; +} + +/* Array of Idle Check conditions */ +static u32(*cond_arr[]) (const u32 *r, const u32 *imm) = { + cond0, + cond1, + cond2, + cond3, + cond4, + cond5, + cond6, + cond7, + cond8, + cond9, + cond10, + 
cond11, + cond12, + cond13, +}; + +/******************************* Data Types **********************************/ + +enum platform_ids { + PLATFORM_ASIC, + PLATFORM_RESERVED, + PLATFORM_RESERVED2, + PLATFORM_RESERVED3, + MAX_PLATFORM_IDS +}; + +struct chip_platform_defs { + u8 num_ports; + u8 num_pfs; + u8 num_vfs; +}; + +/* Chip constant definitions */ +struct chip_defs { + const char *name; + struct chip_platform_defs per_platform[MAX_PLATFORM_IDS]; +}; + +/* Platform constant definitions */ +struct platform_defs { + const char *name; + u32 delay_factor; + u32 dmae_thresh; + u32 log_thresh; +}; + +/* Storm constant definitions. + * Addresses are in bytes, sizes are in quad-regs. + */ +struct storm_defs { + char letter; + enum block_id block_id; + enum dbg_bus_clients dbg_client_id[MAX_CHIP_IDS]; + bool has_vfc; + u32 sem_fast_mem_addr; + u32 sem_frame_mode_addr; + u32 sem_slow_enable_addr; + u32 sem_slow_mode_addr; + u32 sem_slow_mode1_conf_addr; + u32 sem_sync_dbg_empty_addr; + u32 sem_slow_dbg_empty_addr; + u32 cm_ctx_wr_addr; + u32 cm_conn_ag_ctx_lid_size; + u32 cm_conn_ag_ctx_rd_addr; + u32 cm_conn_st_ctx_lid_size; + u32 cm_conn_st_ctx_rd_addr; + u32 cm_task_ag_ctx_lid_size; + u32 cm_task_ag_ctx_rd_addr; + u32 cm_task_st_ctx_lid_size; + u32 cm_task_st_ctx_rd_addr; +}; + +/* Block constant definitions */ +struct block_defs { + const char *name; + bool exists[MAX_CHIP_IDS]; + bool associated_to_storm; + + /* Valid only if associated_to_storm is true */ + u32 storm_id; + enum dbg_bus_clients dbg_client_id[MAX_CHIP_IDS]; + u32 dbg_select_addr; + u32 dbg_enable_addr; + u32 dbg_shift_addr; + u32 dbg_force_valid_addr; + u32 dbg_force_frame_addr; + bool has_reset_bit; + + /* If true, block is taken out of reset before dump */ + bool unreset; + enum dbg_reset_regs reset_reg; + + /* Bit offset in reset register */ + u8 reset_bit_offset; +}; + +/* Reset register definitions */ +struct reset_reg_defs { + u32 addr; + bool exists[MAX_CHIP_IDS]; + u32 unreset_val[MAX_CHIP_IDS]; +}; + +struct grc_param_defs { + u32 default_val[MAX_CHIP_IDS]; + u32 min; + u32 max; + bool is_preset; + bool is_persistent; + u32 exclude_all_preset_val; + u32 crash_preset_val; +}; + +/* Address is in 128b units. Width is in bits. 
*/ +struct rss_mem_defs { + const char *mem_name; + const char *type_name; + u32 addr; + u32 entry_width; + u32 num_entries[MAX_CHIP_IDS]; +}; + +struct vfc_ram_defs { + const char *mem_name; + const char *type_name; + u32 base_row; + u32 num_rows; +}; + +struct big_ram_defs { + const char *instance_name; + enum mem_groups mem_group_id; + enum mem_groups ram_mem_group_id; + enum dbg_grc_params grc_param; + u32 addr_reg_addr; + u32 data_reg_addr; + u32 is_256b_reg_addr; + u32 is_256b_bit_offset[MAX_CHIP_IDS]; + u32 ram_size[MAX_CHIP_IDS]; /* In dwords */ +}; + +struct phy_defs { + const char *phy_name; + + /* PHY base GRC address */ + u32 base_addr; + + /* Relative address of indirect TBUS address register (bits 0..7) */ + u32 tbus_addr_lo_addr; + + /* Relative address of indirect TBUS address register (bits 8..10) */ + u32 tbus_addr_hi_addr; + + /* Relative address of indirect TBUS data register (bits 0..7) */ + u32 tbus_data_lo_addr; + + /* Relative address of indirect TBUS data register (bits 8..11) */ + u32 tbus_data_hi_addr; +}; + +/******************************** Constants **********************************/ + +#define MAX_LCIDS 320 +#define MAX_LTIDS 320 + +#define NUM_IOR_SETS 2 +#define IORS_PER_SET 176 +#define IOR_SET_OFFSET(set_id) ((set_id) * 256) + +#define BYTES_IN_DWORD sizeof(u32) + +/* In the macros below, size and offset are specified in bits */ +#define CEIL_DWORDS(size) DIV_ROUND_UP(size, 32) +#define FIELD_BIT_OFFSET(type, field) type ## _ ## field ## _ ## OFFSET +#define FIELD_BIT_SIZE(type, field) type ## _ ## field ## _ ## SIZE +#define FIELD_DWORD_OFFSET(type, field) \ + (int)(FIELD_BIT_OFFSET(type, field) / 32) +#define FIELD_DWORD_SHIFT(type, field) (FIELD_BIT_OFFSET(type, field) % 32) +#define FIELD_BIT_MASK(type, field) \ + (((1 << FIELD_BIT_SIZE(type, field)) - 1) << \ + FIELD_DWORD_SHIFT(type, field)) + +#define SET_VAR_FIELD(var, type, field, val) \ + do { \ + var[FIELD_DWORD_OFFSET(type, field)] &= \ + (~FIELD_BIT_MASK(type, field)); \ + var[FIELD_DWORD_OFFSET(type, field)] |= \ + (val) << FIELD_DWORD_SHIFT(type, field); \ + } while (0) + +#define ARR_REG_WR(dev, ptt, addr, arr, arr_size) \ + do { \ + for (i = 0; i < (arr_size); i++) \ + qed_wr(dev, ptt, addr, (arr)[i]); \ + } while (0) + +#define ARR_REG_RD(dev, ptt, addr, arr, arr_size) \ + do { \ + for (i = 0; i < (arr_size); i++) \ + (arr)[i] = qed_rd(dev, ptt, addr); \ + } while (0) + +#define DWORDS_TO_BYTES(dwords) ((dwords) * BYTES_IN_DWORD) +#define BYTES_TO_DWORDS(bytes) ((bytes) / BYTES_IN_DWORD) + +/* Extra lines include a signature line + optional latency events line */ +#define NUM_EXTRA_DBG_LINES(block_desc) \ + (1 + ((block_desc)->has_latency_events ? 
1 : 0)) +#define NUM_DBG_LINES(block_desc) \ + ((block_desc)->num_of_lines + NUM_EXTRA_DBG_LINES(block_desc)) + +#define RAM_LINES_TO_DWORDS(lines) ((lines) * 2) +#define RAM_LINES_TO_BYTES(lines) \ + DWORDS_TO_BYTES(RAM_LINES_TO_DWORDS(lines)) + +#define REG_DUMP_LEN_SHIFT 24 +#define MEM_DUMP_ENTRY_SIZE_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_dump_mem)) + +#define IDLE_CHK_RULE_SIZE_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_idle_chk_rule)) + +#define IDLE_CHK_RESULT_HDR_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_idle_chk_result_hdr)) + +#define IDLE_CHK_RESULT_REG_HDR_DWORDS \ + BYTES_TO_DWORDS(sizeof(struct dbg_idle_chk_result_reg_hdr)) + +#define IDLE_CHK_MAX_ENTRIES_SIZE 32 + +/* The sizes and offsets below are specified in bits */ +#define VFC_CAM_CMD_STRUCT_SIZE 64 +#define VFC_CAM_CMD_ROW_OFFSET 48 +#define VFC_CAM_CMD_ROW_SIZE 9 +#define VFC_CAM_ADDR_STRUCT_SIZE 16 +#define VFC_CAM_ADDR_OP_OFFSET 0 +#define VFC_CAM_ADDR_OP_SIZE 4 +#define VFC_CAM_RESP_STRUCT_SIZE 256 +#define VFC_RAM_ADDR_STRUCT_SIZE 16 +#define VFC_RAM_ADDR_OP_OFFSET 0 +#define VFC_RAM_ADDR_OP_SIZE 2 +#define VFC_RAM_ADDR_ROW_OFFSET 2 +#define VFC_RAM_ADDR_ROW_SIZE 10 +#define VFC_RAM_RESP_STRUCT_SIZE 256 + +#define VFC_CAM_CMD_DWORDS CEIL_DWORDS(VFC_CAM_CMD_STRUCT_SIZE) +#define VFC_CAM_ADDR_DWORDS CEIL_DWORDS(VFC_CAM_ADDR_STRUCT_SIZE) +#define VFC_CAM_RESP_DWORDS CEIL_DWORDS(VFC_CAM_RESP_STRUCT_SIZE) +#define VFC_RAM_CMD_DWORDS VFC_CAM_CMD_DWORDS +#define VFC_RAM_ADDR_DWORDS CEIL_DWORDS(VFC_RAM_ADDR_STRUCT_SIZE) +#define VFC_RAM_RESP_DWORDS CEIL_DWORDS(VFC_RAM_RESP_STRUCT_SIZE) + +#define NUM_VFC_RAM_TYPES 4 + +#define VFC_CAM_NUM_ROWS 512 + +#define VFC_OPCODE_CAM_RD 14 +#define VFC_OPCODE_RAM_RD 0 + +#define NUM_RSS_MEM_TYPES 5 + +#define NUM_BIG_RAM_TYPES 3 + +#define NUM_PHY_TBUS_ADDRESSES 2048 +#define PHY_DUMP_SIZE_DWORDS (NUM_PHY_TBUS_ADDRESSES / 2) + +#define RESET_REG_UNRESET_OFFSET 4 + +#define STALL_DELAY_MS 500 + +#define STATIC_DEBUG_LINE_DWORDS 9 + +#define NUM_COMMON_GLOBAL_PARAMS 8 + +#define FW_IMG_MAIN 1 + +#define REG_FIFO_ELEMENT_DWORDS 2 +#define REG_FIFO_DEPTH_ELEMENTS 32 +#define REG_FIFO_DEPTH_DWORDS \ + (REG_FIFO_ELEMENT_DWORDS * REG_FIFO_DEPTH_ELEMENTS) + +#define IGU_FIFO_ELEMENT_DWORDS 4 +#define IGU_FIFO_DEPTH_ELEMENTS 64 +#define IGU_FIFO_DEPTH_DWORDS \ + (IGU_FIFO_ELEMENT_DWORDS * IGU_FIFO_DEPTH_ELEMENTS) + +#define PROTECTION_OVERRIDE_ELEMENT_DWORDS 2 +#define PROTECTION_OVERRIDE_DEPTH_ELEMENTS 20 +#define PROTECTION_OVERRIDE_DEPTH_DWORDS \ + (PROTECTION_OVERRIDE_DEPTH_ELEMENTS * \ + PROTECTION_OVERRIDE_ELEMENT_DWORDS) + +#define MCP_SPAD_TRACE_OFFSIZE_ADDR \ + (MCP_REG_SCRATCH + \ + offsetof(struct static_init, sections[SPAD_SECTION_TRACE])) + +#define EMPTY_FW_VERSION_STR "???_???_???_???" +#define EMPTY_FW_IMAGE_STR "???????????????" 
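The FIELD_DWORD_OFFSET/FIELD_DWORD_SHIFT/FIELD_BIT_MASK/SET_VAR_FIELD helpers defined above turn a field's bit offset and bit width into a dword index, an in-dword shift and a mask. A minimal stand-alone sketch of that arithmetic follows; it reuses the VFC CAM command row geometry from this section (offset 48, width 9 bits), but the DEMO_*/demo names are illustrative only and are not part of the driver.

/* Stand-alone illustration of the bit-field packing performed by
 * SET_VAR_FIELD(); DEMO_ROW mirrors VFC_CAM_CMD_ROW_OFFSET/SIZE.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_ROW_OFFSET		48	/* bit offset within the command */
#define DEMO_ROW_SIZE		9	/* field width in bits */

#define DEMO_DWORD_OFFSET	(DEMO_ROW_OFFSET / 32)	/* -> dword 1 */
#define DEMO_DWORD_SHIFT	(DEMO_ROW_OFFSET % 32)	/* -> shift 16 */
#define DEMO_BIT_MASK \
	(((1u << DEMO_ROW_SIZE) - 1) << DEMO_DWORD_SHIFT)	/* -> 0x01ff0000 */

int main(void)
{
	uint32_t cmd[2] = { 0, 0 };	/* 64-bit VFC command viewed as two dwords */
	uint32_t row = 0x1a5;		/* 9-bit row index to program */

	/* Same read-modify-write sequence that SET_VAR_FIELD() expands to */
	cmd[DEMO_DWORD_OFFSET] &= ~DEMO_BIT_MASK;
	cmd[DEMO_DWORD_OFFSET] |= row << DEMO_DWORD_SHIFT;

	printf("cmd[1] = 0x%08x\n", cmd[1]);	/* prints 0x01a50000 */
	return 0;
}

In the driver, the same operation would be written against these constants as SET_VAR_FIELD(cam_cmd, VFC_CAM_CMD, ROW, row), with FIELD_BIT_OFFSET()/FIELD_BIT_SIZE() resolving to VFC_CAM_CMD_ROW_OFFSET and VFC_CAM_CMD_ROW_SIZE.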
+ +/***************************** Constant Arrays *******************************/ + +struct dbg_array { + const u32 *ptr; + u32 size_in_dwords; +}; + +/* Debug arrays */ +static struct dbg_array s_dbg_arrays[MAX_BIN_DBG_BUFFER_TYPE] = { {NULL} }; + +/* Chip constant definitions array */ +static struct chip_defs s_chip_defs[MAX_CHIP_IDS] = { + { "bb", + {{MAX_NUM_PORTS_BB, MAX_NUM_PFS_BB, MAX_NUM_VFS_BB}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} } }, + { "ah", + {{MAX_NUM_PORTS_K2, MAX_NUM_PFS_K2, MAX_NUM_VFS_K2}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} } }, + { "reserved", + {{0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} } } +}; + +/* Storm constant definitions array */ +static struct storm_defs s_storm_defs[] = { + /* Tstorm */ + {'T', BLOCK_TSEM, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, + DBG_BUS_CLIENT_RBCT}, true, + TSEM_REG_FAST_MEMORY, + TSEM_REG_DBG_FRAME_MODE_BB_K2, TSEM_REG_SLOW_DBG_ACTIVE_BB_K2, + TSEM_REG_SLOW_DBG_MODE_BB_K2, TSEM_REG_DBG_MODE1_CFG_BB_K2, + TSEM_REG_SYNC_DBG_EMPTY, TSEM_REG_SLOW_DBG_EMPTY_BB_K2, + TCM_REG_CTX_RBC_ACCS, + 4, TCM_REG_AGG_CON_CTX, + 16, TCM_REG_SM_CON_CTX, + 2, TCM_REG_AGG_TASK_CTX, + 4, TCM_REG_SM_TASK_CTX}, + + /* Mstorm */ + {'M', BLOCK_MSEM, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM, + DBG_BUS_CLIENT_RBCM}, false, + MSEM_REG_FAST_MEMORY, + MSEM_REG_DBG_FRAME_MODE_BB_K2, MSEM_REG_SLOW_DBG_ACTIVE_BB_K2, + MSEM_REG_SLOW_DBG_MODE_BB_K2, MSEM_REG_DBG_MODE1_CFG_BB_K2, + MSEM_REG_SYNC_DBG_EMPTY, MSEM_REG_SLOW_DBG_EMPTY_BB_K2, + MCM_REG_CTX_RBC_ACCS, + 1, MCM_REG_AGG_CON_CTX, + 10, MCM_REG_SM_CON_CTX, + 2, MCM_REG_AGG_TASK_CTX, + 7, MCM_REG_SM_TASK_CTX}, + + /* Ustorm */ + {'U', BLOCK_USEM, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, + DBG_BUS_CLIENT_RBCU}, false, + USEM_REG_FAST_MEMORY, + USEM_REG_DBG_FRAME_MODE_BB_K2, USEM_REG_SLOW_DBG_ACTIVE_BB_K2, + USEM_REG_SLOW_DBG_MODE_BB_K2, USEM_REG_DBG_MODE1_CFG_BB_K2, + USEM_REG_SYNC_DBG_EMPTY, USEM_REG_SLOW_DBG_EMPTY_BB_K2, + UCM_REG_CTX_RBC_ACCS, + 2, UCM_REG_AGG_CON_CTX, + 13, UCM_REG_SM_CON_CTX, + 3, UCM_REG_AGG_TASK_CTX, + 3, UCM_REG_SM_TASK_CTX}, + + /* Xstorm */ + {'X', BLOCK_XSEM, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, + DBG_BUS_CLIENT_RBCX}, false, + XSEM_REG_FAST_MEMORY, + XSEM_REG_DBG_FRAME_MODE_BB_K2, XSEM_REG_SLOW_DBG_ACTIVE_BB_K2, + XSEM_REG_SLOW_DBG_MODE_BB_K2, XSEM_REG_DBG_MODE1_CFG_BB_K2, + XSEM_REG_SYNC_DBG_EMPTY, XSEM_REG_SLOW_DBG_EMPTY_BB_K2, + XCM_REG_CTX_RBC_ACCS, + 9, XCM_REG_AGG_CON_CTX, + 15, XCM_REG_SM_CON_CTX, + 0, 0, + 0, 0}, + + /* Ystorm */ + {'Y', BLOCK_YSEM, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY, + DBG_BUS_CLIENT_RBCY}, false, + YSEM_REG_FAST_MEMORY, + YSEM_REG_DBG_FRAME_MODE_BB_K2, YSEM_REG_SLOW_DBG_ACTIVE_BB_K2, + YSEM_REG_SLOW_DBG_MODE_BB_K2, YSEM_REG_DBG_MODE1_CFG_BB_K2, + YSEM_REG_SYNC_DBG_EMPTY, TSEM_REG_SLOW_DBG_EMPTY_BB_K2, + YCM_REG_CTX_RBC_ACCS, + 2, YCM_REG_AGG_CON_CTX, + 3, YCM_REG_SM_CON_CTX, + 2, YCM_REG_AGG_TASK_CTX, + 12, YCM_REG_SM_TASK_CTX}, + + /* Pstorm */ + {'P', BLOCK_PSEM, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, + DBG_BUS_CLIENT_RBCS}, true, + PSEM_REG_FAST_MEMORY, + PSEM_REG_DBG_FRAME_MODE_BB_K2, PSEM_REG_SLOW_DBG_ACTIVE_BB_K2, + PSEM_REG_SLOW_DBG_MODE_BB_K2, PSEM_REG_DBG_MODE1_CFG_BB_K2, + PSEM_REG_SYNC_DBG_EMPTY, PSEM_REG_SLOW_DBG_EMPTY_BB_K2, + PCM_REG_CTX_RBC_ACCS, + 0, 0, + 10, PCM_REG_SM_CON_CTX, + 0, 0, + 0, 0} +}; + +/* Block definitions array */ + +static struct block_defs block_grc_defs = { + "grc", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN}, + GRC_REG_DBG_SELECT, 
GRC_REG_DBG_DWORD_ENABLE, + GRC_REG_DBG_SHIFT, GRC_REG_DBG_FORCE_VALID, + GRC_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_UA, 1 +}; + +static struct block_defs block_miscs_defs = { + "miscs", {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_misc_defs = { + "misc", {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_dbu_defs = { + "dbu", {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_pglue_b_defs = { + "pglue_b", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCH, DBG_BUS_CLIENT_RBCH, DBG_BUS_CLIENT_RBCH}, + PGLUE_B_REG_DBG_SELECT, PGLUE_B_REG_DBG_DWORD_ENABLE, + PGLUE_B_REG_DBG_SHIFT, PGLUE_B_REG_DBG_FORCE_VALID, + PGLUE_B_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV, 1 +}; + +static struct block_defs block_cnig_defs = { + "cnig", + {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW, + DBG_BUS_CLIENT_RBCW}, + CNIG_REG_DBG_SELECT_K2_E5, CNIG_REG_DBG_DWORD_ENABLE_K2_E5, + CNIG_REG_DBG_SHIFT_K2_E5, CNIG_REG_DBG_FORCE_VALID_K2_E5, + CNIG_REG_DBG_FORCE_FRAME_K2_E5, + true, false, DBG_RESET_REG_MISCS_PL_HV, 0 +}; + +static struct block_defs block_cpmu_defs = { + "cpmu", {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 8 +}; + +static struct block_defs block_ncsi_defs = { + "ncsi", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, + NCSI_REG_DBG_SELECT, NCSI_REG_DBG_DWORD_ENABLE, + NCSI_REG_DBG_SHIFT, NCSI_REG_DBG_FORCE_VALID, + NCSI_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_HV, 5 +}; + +static struct block_defs block_opte_defs = { + "opte", {true, true, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 4 +}; + +static struct block_defs block_bmb_defs = { + "bmb", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCB, DBG_BUS_CLIENT_RBCB}, + BMB_REG_DBG_SELECT, BMB_REG_DBG_DWORD_ENABLE, + BMB_REG_DBG_SHIFT, BMB_REG_DBG_FORCE_VALID, + BMB_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISCS_PL_UA, 7 +}; + +static struct block_defs block_pcie_defs = { + "pcie", + {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH, + DBG_BUS_CLIENT_RBCH}, + PCIE_REG_DBG_COMMON_SELECT_K2_E5, + PCIE_REG_DBG_COMMON_DWORD_ENABLE_K2_E5, + PCIE_REG_DBG_COMMON_SHIFT_K2_E5, + PCIE_REG_DBG_COMMON_FORCE_VALID_K2_E5, + PCIE_REG_DBG_COMMON_FORCE_FRAME_K2_E5, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_mcp_defs = { + "mcp", {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_mcp2_defs = { + "mcp2", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, + MCP2_REG_DBG_SELECT, MCP2_REG_DBG_DWORD_ENABLE, + MCP2_REG_DBG_SHIFT, MCP2_REG_DBG_FORCE_VALID, + MCP2_REG_DBG_FORCE_FRAME, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct 
block_defs block_pswhst_defs = { + "pswhst", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWHST_REG_DBG_SELECT, PSWHST_REG_DBG_DWORD_ENABLE, + PSWHST_REG_DBG_SHIFT, PSWHST_REG_DBG_FORCE_VALID, + PSWHST_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 0 +}; + +static struct block_defs block_pswhst2_defs = { + "pswhst2", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWHST2_REG_DBG_SELECT, PSWHST2_REG_DBG_DWORD_ENABLE, + PSWHST2_REG_DBG_SHIFT, PSWHST2_REG_DBG_FORCE_VALID, + PSWHST2_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 0 +}; + +static struct block_defs block_pswrd_defs = { + "pswrd", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRD_REG_DBG_SELECT, PSWRD_REG_DBG_DWORD_ENABLE, + PSWRD_REG_DBG_SHIFT, PSWRD_REG_DBG_FORCE_VALID, + PSWRD_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 2 +}; + +static struct block_defs block_pswrd2_defs = { + "pswrd2", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRD2_REG_DBG_SELECT, PSWRD2_REG_DBG_DWORD_ENABLE, + PSWRD2_REG_DBG_SHIFT, PSWRD2_REG_DBG_FORCE_VALID, + PSWRD2_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 2 +}; + +static struct block_defs block_pswwr_defs = { + "pswwr", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWWR_REG_DBG_SELECT, PSWWR_REG_DBG_DWORD_ENABLE, + PSWWR_REG_DBG_SHIFT, PSWWR_REG_DBG_FORCE_VALID, + PSWWR_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 3 +}; + +static struct block_defs block_pswwr2_defs = { + "pswwr2", {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISC_PL_HV, 3 +}; + +static struct block_defs block_pswrq_defs = { + "pswrq", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRQ_REG_DBG_SELECT, PSWRQ_REG_DBG_DWORD_ENABLE, + PSWRQ_REG_DBG_SHIFT, PSWRQ_REG_DBG_FORCE_VALID, + PSWRQ_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 1 +}; + +static struct block_defs block_pswrq2_defs = { + "pswrq2", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PSWRQ2_REG_DBG_SELECT, PSWRQ2_REG_DBG_DWORD_ENABLE, + PSWRQ2_REG_DBG_SHIFT, PSWRQ2_REG_DBG_FORCE_VALID, + PSWRQ2_REG_DBG_FORCE_FRAME, + true, false, DBG_RESET_REG_MISC_PL_HV, 1 +}; + +static struct block_defs block_pglcs_defs = { + "pglcs", + {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH, + DBG_BUS_CLIENT_RBCH}, + PGLCS_REG_DBG_SELECT_K2_E5, PGLCS_REG_DBG_DWORD_ENABLE_K2_E5, + PGLCS_REG_DBG_SHIFT_K2_E5, PGLCS_REG_DBG_FORCE_VALID_K2_E5, + PGLCS_REG_DBG_FORCE_FRAME_K2_E5, + true, false, DBG_RESET_REG_MISCS_PL_HV, 2 +}; + +static struct block_defs block_ptu_defs = { + "ptu", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + PTU_REG_DBG_SELECT, PTU_REG_DBG_DWORD_ENABLE, + PTU_REG_DBG_SHIFT, PTU_REG_DBG_FORCE_VALID, + PTU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 20 +}; + +static struct block_defs block_dmae_defs = { + "dmae", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + DMAE_REG_DBG_SELECT, DMAE_REG_DBG_DWORD_ENABLE, + DMAE_REG_DBG_SHIFT, 
DMAE_REG_DBG_FORCE_VALID, + DMAE_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 28 +}; + +static struct block_defs block_tcm_defs = { + "tcm", + {true, true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + TCM_REG_DBG_SELECT, TCM_REG_DBG_DWORD_ENABLE, + TCM_REG_DBG_SHIFT, TCM_REG_DBG_FORCE_VALID, + TCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 5 +}; + +static struct block_defs block_mcm_defs = { + "mcm", + {true, true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM, DBG_BUS_CLIENT_RBCM}, + MCM_REG_DBG_SELECT, MCM_REG_DBG_DWORD_ENABLE, + MCM_REG_DBG_SHIFT, MCM_REG_DBG_FORCE_VALID, + MCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 3 +}; + +static struct block_defs block_ucm_defs = { + "ucm", + {true, true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + UCM_REG_DBG_SELECT, UCM_REG_DBG_DWORD_ENABLE, + UCM_REG_DBG_SHIFT, UCM_REG_DBG_FORCE_VALID, + UCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 8 +}; + +static struct block_defs block_xcm_defs = { + "xcm", + {true, true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XCM_REG_DBG_SELECT, XCM_REG_DBG_DWORD_ENABLE, + XCM_REG_DBG_SHIFT, XCM_REG_DBG_FORCE_VALID, + XCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 19 +}; + +static struct block_defs block_ycm_defs = { + "ycm", + {true, true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY, DBG_BUS_CLIENT_RBCY}, + YCM_REG_DBG_SELECT, YCM_REG_DBG_DWORD_ENABLE, + YCM_REG_DBG_SHIFT, YCM_REG_DBG_FORCE_VALID, + YCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 5 +}; + +static struct block_defs block_pcm_defs = { + "pcm", + {true, true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + PCM_REG_DBG_SELECT, PCM_REG_DBG_DWORD_ENABLE, + PCM_REG_DBG_SHIFT, PCM_REG_DBG_FORCE_VALID, + PCM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 4 +}; + +static struct block_defs block_qm_defs = { + "qm", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCQ, DBG_BUS_CLIENT_RBCQ}, + QM_REG_DBG_SELECT, QM_REG_DBG_DWORD_ENABLE, + QM_REG_DBG_SHIFT, QM_REG_DBG_FORCE_VALID, + QM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 16 +}; + +static struct block_defs block_tm_defs = { + "tm", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + TM_REG_DBG_SELECT, TM_REG_DBG_DWORD_ENABLE, + TM_REG_DBG_SHIFT, TM_REG_DBG_FORCE_VALID, + TM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 17 +}; + +static struct block_defs block_dorq_defs = { + "dorq", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY, DBG_BUS_CLIENT_RBCY}, + DORQ_REG_DBG_SELECT, DORQ_REG_DBG_DWORD_ENABLE, + DORQ_REG_DBG_SHIFT, DORQ_REG_DBG_FORCE_VALID, + DORQ_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 18 +}; + +static struct block_defs block_brb_defs = { + "brb", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR}, + BRB_REG_DBG_SELECT, BRB_REG_DBG_DWORD_ENABLE, + BRB_REG_DBG_SHIFT, BRB_REG_DBG_FORCE_VALID, + BRB_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 0 +}; + +static struct block_defs block_src_defs = { + "src", + {true, true, 
true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + SRC_REG_DBG_SELECT, SRC_REG_DBG_DWORD_ENABLE, + SRC_REG_DBG_SHIFT, SRC_REG_DBG_FORCE_VALID, + SRC_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 2 +}; + +static struct block_defs block_prs_defs = { + "prs", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR}, + PRS_REG_DBG_SELECT, PRS_REG_DBG_DWORD_ENABLE, + PRS_REG_DBG_SHIFT, PRS_REG_DBG_FORCE_VALID, + PRS_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 1 +}; + +static struct block_defs block_tsdm_defs = { + "tsdm", + {true, true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + TSDM_REG_DBG_SELECT, TSDM_REG_DBG_DWORD_ENABLE, + TSDM_REG_DBG_SHIFT, TSDM_REG_DBG_FORCE_VALID, + TSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 3 +}; + +static struct block_defs block_msdm_defs = { + "msdm", + {true, true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM, DBG_BUS_CLIENT_RBCM}, + MSDM_REG_DBG_SELECT, MSDM_REG_DBG_DWORD_ENABLE, + MSDM_REG_DBG_SHIFT, MSDM_REG_DBG_FORCE_VALID, + MSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 6 +}; + +static struct block_defs block_usdm_defs = { + "usdm", + {true, true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + USDM_REG_DBG_SELECT, USDM_REG_DBG_DWORD_ENABLE, + USDM_REG_DBG_SHIFT, USDM_REG_DBG_FORCE_VALID, + USDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 7 +}; + +static struct block_defs block_xsdm_defs = { + "xsdm", + {true, true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XSDM_REG_DBG_SELECT, XSDM_REG_DBG_DWORD_ENABLE, + XSDM_REG_DBG_SHIFT, XSDM_REG_DBG_FORCE_VALID, + XSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 20 +}; + +static struct block_defs block_ysdm_defs = { + "ysdm", + {true, true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY, DBG_BUS_CLIENT_RBCY}, + YSDM_REG_DBG_SELECT, YSDM_REG_DBG_DWORD_ENABLE, + YSDM_REG_DBG_SHIFT, YSDM_REG_DBG_FORCE_VALID, + YSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 8 +}; + +static struct block_defs block_psdm_defs = { + "psdm", + {true, true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + PSDM_REG_DBG_SELECT, PSDM_REG_DBG_DWORD_ENABLE, + PSDM_REG_DBG_SHIFT, PSDM_REG_DBG_FORCE_VALID, + PSDM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 7 +}; + +static struct block_defs block_tsem_defs = { + "tsem", + {true, true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + TSEM_REG_DBG_SELECT, TSEM_REG_DBG_DWORD_ENABLE, + TSEM_REG_DBG_SHIFT, TSEM_REG_DBG_FORCE_VALID, + TSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 4 +}; + +static struct block_defs block_msem_defs = { + "msem", + {true, true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM, DBG_BUS_CLIENT_RBCM}, + MSEM_REG_DBG_SELECT, MSEM_REG_DBG_DWORD_ENABLE, + MSEM_REG_DBG_SHIFT, MSEM_REG_DBG_FORCE_VALID, + MSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 9 +}; + +static struct block_defs block_usem_defs = { + "usem", + {true, true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + 
USEM_REG_DBG_SELECT, USEM_REG_DBG_DWORD_ENABLE, + USEM_REG_DBG_SHIFT, USEM_REG_DBG_FORCE_VALID, + USEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 9 +}; + +static struct block_defs block_xsem_defs = { + "xsem", + {true, true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XSEM_REG_DBG_SELECT, XSEM_REG_DBG_DWORD_ENABLE, + XSEM_REG_DBG_SHIFT, XSEM_REG_DBG_FORCE_VALID, + XSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 21 +}; + +static struct block_defs block_ysem_defs = { + "ysem", + {true, true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY, DBG_BUS_CLIENT_RBCY}, + YSEM_REG_DBG_SELECT, YSEM_REG_DBG_DWORD_ENABLE, + YSEM_REG_DBG_SHIFT, YSEM_REG_DBG_FORCE_VALID, + YSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 11 +}; + +static struct block_defs block_psem_defs = { + "psem", + {true, true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + PSEM_REG_DBG_SELECT, PSEM_REG_DBG_DWORD_ENABLE, + PSEM_REG_DBG_SHIFT, PSEM_REG_DBG_FORCE_VALID, + PSEM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 10 +}; + +static struct block_defs block_rss_defs = { + "rss", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + RSS_REG_DBG_SELECT, RSS_REG_DBG_DWORD_ENABLE, + RSS_REG_DBG_SHIFT, RSS_REG_DBG_FORCE_VALID, + RSS_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 18 +}; + +static struct block_defs block_tmld_defs = { + "tmld", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM, DBG_BUS_CLIENT_RBCM}, + TMLD_REG_DBG_SELECT, TMLD_REG_DBG_DWORD_ENABLE, + TMLD_REG_DBG_SHIFT, TMLD_REG_DBG_FORCE_VALID, + TMLD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 13 +}; + +static struct block_defs block_muld_defs = { + "muld", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + MULD_REG_DBG_SELECT, MULD_REG_DBG_DWORD_ENABLE, + MULD_REG_DBG_SHIFT, MULD_REG_DBG_FORCE_VALID, + MULD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 14 +}; + +static struct block_defs block_yuld_defs = { + "yuld", + {true, true, false}, false, 0, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, + MAX_DBG_BUS_CLIENTS}, + YULD_REG_DBG_SELECT_BB_K2, YULD_REG_DBG_DWORD_ENABLE_BB_K2, + YULD_REG_DBG_SHIFT_BB_K2, YULD_REG_DBG_FORCE_VALID_BB_K2, + YULD_REG_DBG_FORCE_FRAME_BB_K2, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, + 15 +}; + +static struct block_defs block_xyld_defs = { + "xyld", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + XYLD_REG_DBG_SELECT, XYLD_REG_DBG_DWORD_ENABLE, + XYLD_REG_DBG_SHIFT, XYLD_REG_DBG_FORCE_VALID, + XYLD_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 12 +}; + +static struct block_defs block_ptld_defs = { + "ptld", + {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCT}, + PTLD_REG_DBG_SELECT_E5, PTLD_REG_DBG_DWORD_ENABLE_E5, + PTLD_REG_DBG_SHIFT_E5, PTLD_REG_DBG_FORCE_VALID_E5, + PTLD_REG_DBG_FORCE_FRAME_E5, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, + 28 +}; + +static struct block_defs block_ypld_defs = { + "ypld", + {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCS}, + YPLD_REG_DBG_SELECT_E5, YPLD_REG_DBG_DWORD_ENABLE_E5, + YPLD_REG_DBG_SHIFT_E5, 
YPLD_REG_DBG_FORCE_VALID_E5, + YPLD_REG_DBG_FORCE_FRAME_E5, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, + 27 +}; + +static struct block_defs block_prm_defs = { + "prm", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM, DBG_BUS_CLIENT_RBCM}, + PRM_REG_DBG_SELECT, PRM_REG_DBG_DWORD_ENABLE, + PRM_REG_DBG_SHIFT, PRM_REG_DBG_FORCE_VALID, + PRM_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 21 +}; + +static struct block_defs block_pbf_pb1_defs = { + "pbf_pb1", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV, DBG_BUS_CLIENT_RBCV}, + PBF_PB1_REG_DBG_SELECT, PBF_PB1_REG_DBG_DWORD_ENABLE, + PBF_PB1_REG_DBG_SHIFT, PBF_PB1_REG_DBG_FORCE_VALID, + PBF_PB1_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + 11 +}; + +static struct block_defs block_pbf_pb2_defs = { + "pbf_pb2", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV, DBG_BUS_CLIENT_RBCV}, + PBF_PB2_REG_DBG_SELECT, PBF_PB2_REG_DBG_DWORD_ENABLE, + PBF_PB2_REG_DBG_SHIFT, PBF_PB2_REG_DBG_FORCE_VALID, + PBF_PB2_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + 12 +}; + +static struct block_defs block_rpb_defs = { + "rpb", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM, DBG_BUS_CLIENT_RBCM}, + RPB_REG_DBG_SELECT, RPB_REG_DBG_DWORD_ENABLE, + RPB_REG_DBG_SHIFT, RPB_REG_DBG_FORCE_VALID, + RPB_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 13 +}; + +static struct block_defs block_btb_defs = { + "btb", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCV, DBG_BUS_CLIENT_RBCV}, + BTB_REG_DBG_SELECT, BTB_REG_DBG_DWORD_ENABLE, + BTB_REG_DBG_SHIFT, BTB_REG_DBG_FORCE_VALID, + BTB_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 10 +}; + +static struct block_defs block_pbf_defs = { + "pbf", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV, DBG_BUS_CLIENT_RBCV}, + PBF_REG_DBG_SELECT, PBF_REG_DBG_DWORD_ENABLE, + PBF_REG_DBG_SHIFT, PBF_REG_DBG_FORCE_VALID, + PBF_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 15 +}; + +static struct block_defs block_rdif_defs = { + "rdif", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM, DBG_BUS_CLIENT_RBCM}, + RDIF_REG_DBG_SELECT, RDIF_REG_DBG_DWORD_ENABLE, + RDIF_REG_DBG_SHIFT, RDIF_REG_DBG_FORCE_VALID, + RDIF_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 16 +}; + +static struct block_defs block_tdif_defs = { + "tdif", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + TDIF_REG_DBG_SELECT, TDIF_REG_DBG_DWORD_ENABLE, + TDIF_REG_DBG_SHIFT, TDIF_REG_DBG_FORCE_VALID, + TDIF_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 17 +}; + +static struct block_defs block_cdu_defs = { + "cdu", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + CDU_REG_DBG_SELECT, CDU_REG_DBG_DWORD_ENABLE, + CDU_REG_DBG_SHIFT, CDU_REG_DBG_FORCE_VALID, + CDU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 23 +}; + +static struct block_defs block_ccfc_defs = { + "ccfc", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + CCFC_REG_DBG_SELECT, CCFC_REG_DBG_DWORD_ENABLE, + CCFC_REG_DBG_SHIFT, CCFC_REG_DBG_FORCE_VALID, + CCFC_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 24 +}; + +static struct block_defs 
block_tcfc_defs = { + "tcfc", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + TCFC_REG_DBG_SELECT, TCFC_REG_DBG_DWORD_ENABLE, + TCFC_REG_DBG_SHIFT, TCFC_REG_DBG_FORCE_VALID, + TCFC_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 25 +}; + +static struct block_defs block_igu_defs = { + "igu", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + IGU_REG_DBG_SELECT, IGU_REG_DBG_DWORD_ENABLE, + IGU_REG_DBG_SHIFT, IGU_REG_DBG_FORCE_VALID, + IGU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, 27 +}; + +static struct block_defs block_cau_defs = { + "cau", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + CAU_REG_DBG_SELECT, CAU_REG_DBG_DWORD_ENABLE, + CAU_REG_DBG_SHIFT, CAU_REG_DBG_FORCE_VALID, + CAU_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 19 +}; + +static struct block_defs block_rgfs_defs = { + "rgfs", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 29 +}; + +static struct block_defs block_rgsrc_defs = { + "rgsrc", + {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, + RGSRC_REG_DBG_SELECT_E5, RGSRC_REG_DBG_DWORD_ENABLE_E5, + RGSRC_REG_DBG_SHIFT_E5, RGSRC_REG_DBG_FORCE_VALID_E5, + RGSRC_REG_DBG_FORCE_FRAME_E5, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + 30 +}; + +static struct block_defs block_tgfs_defs = { + "tgfs", {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, 30 +}; + +static struct block_defs block_tgsrc_defs = { + "tgsrc", + {false, false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCV}, + TGSRC_REG_DBG_SELECT_E5, TGSRC_REG_DBG_DWORD_ENABLE_E5, + TGSRC_REG_DBG_SHIFT_E5, TGSRC_REG_DBG_FORCE_VALID_E5, + TGSRC_REG_DBG_FORCE_FRAME_E5, + true, true, DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + 31 +}; + +static struct block_defs block_umac_defs = { + "umac", + {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ, + DBG_BUS_CLIENT_RBCZ}, + UMAC_REG_DBG_SELECT_K2_E5, UMAC_REG_DBG_DWORD_ENABLE_K2_E5, + UMAC_REG_DBG_SHIFT_K2_E5, UMAC_REG_DBG_FORCE_VALID_K2_E5, + UMAC_REG_DBG_FORCE_FRAME_K2_E5, + true, false, DBG_RESET_REG_MISCS_PL_HV, 6 +}; + +static struct block_defs block_xmac_defs = { + "xmac", {true, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_dbg_defs = { + "dbg", {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, true, DBG_RESET_REG_MISC_PL_PDA_VAUX, 3 +}; + +static struct block_defs block_nig_defs = { + "nig", + {true, true, true}, false, 0, + {DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN}, + NIG_REG_DBG_SELECT, NIG_REG_DBG_DWORD_ENABLE, + NIG_REG_DBG_SHIFT, NIG_REG_DBG_FORCE_VALID, + NIG_REG_DBG_FORCE_FRAME, + true, true, DBG_RESET_REG_MISC_PL_PDA_VAUX, 0 +}; + +static struct block_defs block_wol_defs = { + "wol", + {false, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, + WOL_REG_DBG_SELECT_K2_E5, WOL_REG_DBG_DWORD_ENABLE_K2_E5, + WOL_REG_DBG_SHIFT_K2_E5, 
WOL_REG_DBG_FORCE_VALID_K2_E5, + WOL_REG_DBG_FORCE_FRAME_K2_E5, + true, true, DBG_RESET_REG_MISC_PL_PDA_VAUX, 7 +}; + +static struct block_defs block_bmbn_defs = { + "bmbn", + {false, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCB, + DBG_BUS_CLIENT_RBCB}, + BMBN_REG_DBG_SELECT_K2_E5, BMBN_REG_DBG_DWORD_ENABLE_K2_E5, + BMBN_REG_DBG_SHIFT_K2_E5, BMBN_REG_DBG_FORCE_VALID_K2_E5, + BMBN_REG_DBG_FORCE_FRAME_K2_E5, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_ipc_defs = { + "ipc", {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_UA, 8 +}; + +static struct block_defs block_nwm_defs = { + "nwm", + {false, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW, DBG_BUS_CLIENT_RBCW}, + NWM_REG_DBG_SELECT_K2_E5, NWM_REG_DBG_DWORD_ENABLE_K2_E5, + NWM_REG_DBG_SHIFT_K2_E5, NWM_REG_DBG_FORCE_VALID_K2_E5, + NWM_REG_DBG_FORCE_FRAME_K2_E5, + true, false, DBG_RESET_REG_MISCS_PL_HV_2, 0 +}; + +static struct block_defs block_nws_defs = { + "nws", + {false, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW, DBG_BUS_CLIENT_RBCW}, + NWS_REG_DBG_SELECT_K2_E5, NWS_REG_DBG_DWORD_ENABLE_K2_E5, + NWS_REG_DBG_SHIFT_K2_E5, NWS_REG_DBG_FORCE_VALID_K2_E5, + NWS_REG_DBG_FORCE_FRAME_K2_E5, + true, false, DBG_RESET_REG_MISCS_PL_HV, 12 +}; + +static struct block_defs block_ms_defs = { + "ms", + {false, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, + MS_REG_DBG_SELECT_K2_E5, MS_REG_DBG_DWORD_ENABLE_K2_E5, + MS_REG_DBG_SHIFT_K2_E5, MS_REG_DBG_FORCE_VALID_K2_E5, + MS_REG_DBG_FORCE_FRAME_K2_E5, + true, false, DBG_RESET_REG_MISCS_PL_HV, 13 +}; + +static struct block_defs block_phy_pcie_defs = { + "phy_pcie", + {false, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH, + DBG_BUS_CLIENT_RBCH}, + PCIE_REG_DBG_COMMON_SELECT_K2_E5, + PCIE_REG_DBG_COMMON_DWORD_ENABLE_K2_E5, + PCIE_REG_DBG_COMMON_SHIFT_K2_E5, + PCIE_REG_DBG_COMMON_FORCE_VALID_K2_E5, + PCIE_REG_DBG_COMMON_FORCE_FRAME_K2_E5, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_led_defs = { + "led", {false, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 14 +}; + +static struct block_defs block_avs_wrap_defs = { + "avs_wrap", {false, true, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_UA, 11 +}; + +static struct block_defs block_pxpreqbus_defs = { + "pxpreqbus", {false, false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_misc_aeu_defs = { + "misc_aeu", {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_bar0_map_defs = { + "bar0_map", {true, true, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs *s_block_defs[MAX_BLOCK_ID] = { + &block_grc_defs, + &block_miscs_defs, + &block_misc_defs, + &block_dbu_defs, + &block_pglue_b_defs, + &block_cnig_defs, + &block_cpmu_defs, + &block_ncsi_defs, + &block_opte_defs, + &block_bmb_defs, + 
&block_pcie_defs, + &block_mcp_defs, + &block_mcp2_defs, + &block_pswhst_defs, + &block_pswhst2_defs, + &block_pswrd_defs, + &block_pswrd2_defs, + &block_pswwr_defs, + &block_pswwr2_defs, + &block_pswrq_defs, + &block_pswrq2_defs, + &block_pglcs_defs, + &block_dmae_defs, + &block_ptu_defs, + &block_tcm_defs, + &block_mcm_defs, + &block_ucm_defs, + &block_xcm_defs, + &block_ycm_defs, + &block_pcm_defs, + &block_qm_defs, + &block_tm_defs, + &block_dorq_defs, + &block_brb_defs, + &block_src_defs, + &block_prs_defs, + &block_tsdm_defs, + &block_msdm_defs, + &block_usdm_defs, + &block_xsdm_defs, + &block_ysdm_defs, + &block_psdm_defs, + &block_tsem_defs, + &block_msem_defs, + &block_usem_defs, + &block_xsem_defs, + &block_ysem_defs, + &block_psem_defs, + &block_rss_defs, + &block_tmld_defs, + &block_muld_defs, + &block_yuld_defs, + &block_xyld_defs, + &block_ptld_defs, + &block_ypld_defs, + &block_prm_defs, + &block_pbf_pb1_defs, + &block_pbf_pb2_defs, + &block_rpb_defs, + &block_btb_defs, + &block_pbf_defs, + &block_rdif_defs, + &block_tdif_defs, + &block_cdu_defs, + &block_ccfc_defs, + &block_tcfc_defs, + &block_igu_defs, + &block_cau_defs, + &block_rgfs_defs, + &block_rgsrc_defs, + &block_tgfs_defs, + &block_tgsrc_defs, + &block_umac_defs, + &block_xmac_defs, + &block_dbg_defs, + &block_nig_defs, + &block_wol_defs, + &block_bmbn_defs, + &block_ipc_defs, + &block_nwm_defs, + &block_nws_defs, + &block_ms_defs, + &block_phy_pcie_defs, + &block_led_defs, + &block_avs_wrap_defs, + &block_pxpreqbus_defs, + &block_misc_aeu_defs, + &block_bar0_map_defs, +}; + +static struct platform_defs s_platform_defs[] = { + {"asic", 1, 256, 32768}, + {"reserved", 0, 0, 0}, + {"reserved2", 0, 0, 0}, + {"reserved3", 0, 0, 0} +}; + +static struct grc_param_defs s_grc_param_defs[] = { + /* DBG_GRC_PARAM_DUMP_TSTORM */ + {{1, 1, 1}, 0, 1, false, false, 1, 1}, + + /* DBG_GRC_PARAM_DUMP_MSTORM */ + {{1, 1, 1}, 0, 1, false, false, 1, 1}, + + /* DBG_GRC_PARAM_DUMP_USTORM */ + {{1, 1, 1}, 0, 1, false, false, 1, 1}, + + /* DBG_GRC_PARAM_DUMP_XSTORM */ + {{1, 1, 1}, 0, 1, false, false, 1, 1}, + + /* DBG_GRC_PARAM_DUMP_YSTORM */ + {{1, 1, 1}, 0, 1, false, false, 1, 1}, + + /* DBG_GRC_PARAM_DUMP_PSTORM */ + {{1, 1, 1}, 0, 1, false, false, 1, 1}, + + /* DBG_GRC_PARAM_DUMP_REGS */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_RAM */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_PBUF */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_IOR */ + {{0, 0, 0}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_VFC */ + {{0, 0, 0}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_CM_CTX */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_ILT */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_RSS */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_CAU */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_QM */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_MCP */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_MCP_TRACE_META_SIZE */ + {{1, 1, 1}, 1, 0xffffffff, false, true, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_CFC */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_IGU */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_BRB */ + {{0, 0, 0}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_BTB */ + {{0, 0, 0}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_BMB */ + {{0, 0, 0}, 0, 1, false, false, 0, 1}, + + /* 
DBG_GRC_PARAM_DUMP_NIG */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_MULD */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_PRS */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_DMAE */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_TM */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_SDM */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_DIF */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_STATIC */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_UNSTALL */ + {{0, 0, 0}, 0, 1, false, false, 0, 0}, + + /* DBG_GRC_PARAM_NUM_LCIDS */ + {{MAX_LCIDS, MAX_LCIDS, MAX_LCIDS}, 1, MAX_LCIDS, false, false, + MAX_LCIDS, MAX_LCIDS}, + + /* DBG_GRC_PARAM_NUM_LTIDS */ + {{MAX_LTIDS, MAX_LTIDS, MAX_LTIDS}, 1, MAX_LTIDS, false, false, + MAX_LTIDS, MAX_LTIDS}, + + /* DBG_GRC_PARAM_EXCLUDE_ALL */ + {{0, 0, 0}, 0, 1, true, false, 0, 0}, + + /* DBG_GRC_PARAM_CRASH */ + {{0, 0, 0}, 0, 1, true, false, 0, 0}, + + /* DBG_GRC_PARAM_PARITY_SAFE */ + {{0, 0, 0}, 0, 1, false, false, 1, 0}, + + /* DBG_GRC_PARAM_DUMP_CM */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_DUMP_PHY */ + {{1, 1, 1}, 0, 1, false, false, 0, 1}, + + /* DBG_GRC_PARAM_NO_MCP */ + {{0, 0, 0}, 0, 1, false, false, 0, 0}, + + /* DBG_GRC_PARAM_NO_FW_VER */ + {{0, 0, 0}, 0, 1, false, false, 0, 0} +}; + +static struct rss_mem_defs s_rss_mem_defs[] = { + { "rss_mem_cid", "rss_cid", 0, 32, + {256, 320, 512} }, + + { "rss_mem_key_msb", "rss_key", 1024, 256, + {128, 208, 257} }, + + { "rss_mem_key_lsb", "rss_key", 2048, 64, + {128, 208, 257} }, + + { "rss_mem_info", "rss_info", 3072, 16, + {128, 208, 256} }, + + { "rss_mem_ind", "rss_ind", 4096, 16, + {16384, 26624, 32768} } +}; + +static struct vfc_ram_defs s_vfc_ram_defs[] = { + {"vfc_ram_tt1", "vfc_ram", 0, 512}, + {"vfc_ram_mtt2", "vfc_ram", 512, 128}, + {"vfc_ram_stt2", "vfc_ram", 640, 32}, + {"vfc_ram_ro_vect", "vfc_ram", 672, 32} +}; + +static struct big_ram_defs s_big_ram_defs[] = { + { "BRB", MEM_GROUP_BRB_MEM, MEM_GROUP_BRB_RAM, DBG_GRC_PARAM_DUMP_BRB, + BRB_REG_BIG_RAM_ADDRESS, BRB_REG_BIG_RAM_DATA, + MISC_REG_BLOCK_256B_EN, {0, 0, 0}, + {153600, 180224, 282624} }, + + { "BTB", MEM_GROUP_BTB_MEM, MEM_GROUP_BTB_RAM, DBG_GRC_PARAM_DUMP_BTB, + BTB_REG_BIG_RAM_ADDRESS, BTB_REG_BIG_RAM_DATA, + MISC_REG_BLOCK_256B_EN, {0, 1, 1}, + {92160, 117760, 168960} }, + + { "BMB", MEM_GROUP_BMB_MEM, MEM_GROUP_BMB_RAM, DBG_GRC_PARAM_DUMP_BMB, + BMB_REG_BIG_RAM_ADDRESS, BMB_REG_BIG_RAM_DATA, + MISCS_REG_BLOCK_256B_EN, {0, 0, 0}, + {36864, 36864, 36864} } +}; + +static struct reset_reg_defs s_reset_regs_defs[] = { + /* DBG_RESET_REG_MISCS_PL_UA */ + { MISCS_REG_RESET_PL_UA, + {true, true, true}, {0x0, 0x0, 0x0} }, + + /* DBG_RESET_REG_MISCS_PL_HV */ + { MISCS_REG_RESET_PL_HV, + {true, true, true}, {0x0, 0x400, 0x600} }, + + /* DBG_RESET_REG_MISCS_PL_HV_2 */ + { MISCS_REG_RESET_PL_HV_2_K2_E5, + {false, true, true}, {0x0, 0x0, 0x0} }, + + /* DBG_RESET_REG_MISC_PL_UA */ + { MISC_REG_RESET_PL_UA, + {true, true, true}, {0x0, 0x0, 0x0} }, + + /* DBG_RESET_REG_MISC_PL_HV */ + { MISC_REG_RESET_PL_HV, + {true, true, true}, {0x0, 0x0, 0x0} }, + + /* DBG_RESET_REG_MISC_PL_PDA_VMAIN_1 */ + { MISC_REG_RESET_PL_PDA_VMAIN_1, + {true, true, true}, {0x4404040, 0x4404040, 0x404040} }, + + /* DBG_RESET_REG_MISC_PL_PDA_VMAIN_2 */ + { MISC_REG_RESET_PL_PDA_VMAIN_2, + {true, true, true}, {0x7, 0x7c00007, 0x5c08007} }, + + /* DBG_RESET_REG_MISC_PL_PDA_VAUX */ + { 
MISC_REG_RESET_PL_PDA_VAUX, + {true, true, true}, {0x2, 0x2, 0x2} }, +}; + +static struct phy_defs s_phy_defs[] = { + {"nw_phy", NWS_REG_NWS_CMU_K2, + PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_7_0_K2_E5, + PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_15_8_K2_E5, + PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_7_0_K2_E5, + PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_11_8_K2_E5}, + {"sgmii_phy", MS_REG_MS_CMU_K2_E5, + PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X132_K2_E5, + PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X133_K2_E5, + PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X130_K2_E5, + PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X131_K2_E5}, + {"pcie_phy0", PHY_PCIE_REG_PHY0_K2_E5, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X132_K2_E5, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X133_K2_E5, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X130_K2_E5, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X131_K2_E5}, + {"pcie_phy1", PHY_PCIE_REG_PHY1_K2_E5, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X132_K2_E5, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X133_K2_E5, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X130_K2_E5, + PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X131_K2_E5}, +}; + +/**************************** Private Functions ******************************/ + +/* Reads and returns a single dword from the specified unaligned buffer */ +static u32 qed_read_unaligned_dword(u8 *buf) +{ + u32 dword; + + memcpy((u8 *)&dword, buf, sizeof(dword)); + return dword; +} + +/* Returns the value of the specified GRC param */ +static u32 qed_grc_get_param(struct qed_hwfn *p_hwfn, + enum dbg_grc_params grc_param) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + return dev_data->grc.param_val[grc_param]; +} + +/* Initializes the GRC parameters */ +static void qed_dbg_grc_init_params(struct qed_hwfn *p_hwfn) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + if (!dev_data->grc.params_initialized) { + qed_dbg_grc_set_params_default(p_hwfn); + dev_data->grc.params_initialized = 1; + } +} + +/* Initializes debug data for the specified device */ +static enum dbg_status qed_dbg_dev_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + if (dev_data->initialized) + return DBG_STATUS_OK; + + if (QED_IS_K2(p_hwfn->cdev)) { + dev_data->chip_id = CHIP_K2; + dev_data->mode_enable[MODE_K2] = 1; + } else if (QED_IS_BB_B0(p_hwfn->cdev)) { + dev_data->chip_id = CHIP_BB; + dev_data->mode_enable[MODE_BB] = 1; + } else { + return DBG_STATUS_UNKNOWN_CHIP; + } + + dev_data->platform_id = PLATFORM_ASIC; + dev_data->mode_enable[MODE_ASIC] = 1; + + /* Initializes the GRC parameters */ + qed_dbg_grc_init_params(p_hwfn); + + dev_data->use_dmae = true; + dev_data->num_regs_read = 0; + dev_data->initialized = 1; + + return DBG_STATUS_OK; +} + +static struct dbg_bus_block *get_dbg_bus_block_desc(struct qed_hwfn *p_hwfn, + enum block_id block_id) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + return (struct dbg_bus_block *)&dbg_bus_blocks[block_id * + MAX_CHIP_IDS + + dev_data->chip_id]; +} + +/* Reads the FW info structure for the specified Storm from the chip, + * and writes it to the specified fw_info pointer. + */ +static void qed_read_fw_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u8 storm_id, struct fw_info *fw_info) +{ + struct storm_defs *storm = &s_storm_defs[storm_id]; + struct fw_info_location fw_info_location; + u32 addr, i, *dest; + + memset(&fw_info_location, 0, sizeof(fw_info_location)); + memset(fw_info, 0, sizeof(*fw_info)); + + /* Read first the address that points to fw_info location. + * The address is located in the last line of the Storm RAM. 
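+ * The location record holds the fw_info GRC address and size; both the
+ * record and (if the size is valid) fw_info itself are read dword by dword.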
+ */ + addr = storm->sem_fast_mem_addr + SEM_FAST_REG_INT_RAM + + DWORDS_TO_BYTES(SEM_FAST_REG_INT_RAM_SIZE_BB_K2) - + sizeof(fw_info_location); + dest = (u32 *)&fw_info_location; + + for (i = 0; i < BYTES_TO_DWORDS(sizeof(fw_info_location)); + i++, addr += BYTES_IN_DWORD) + dest[i] = qed_rd(p_hwfn, p_ptt, addr); + + /* Read FW version info from Storm RAM */ + if (fw_info_location.size > 0 && fw_info_location.size <= + sizeof(*fw_info)) { + addr = fw_info_location.grc_addr; + dest = (u32 *)fw_info; + for (i = 0; i < BYTES_TO_DWORDS(fw_info_location.size); + i++, addr += BYTES_IN_DWORD) + dest[i] = qed_rd(p_hwfn, p_ptt, addr); + } +} + +/* Dumps the specified string to the specified buffer. + * Returns the dumped size in bytes. + */ +static u32 qed_dump_str(char *dump_buf, bool dump, const char *str) +{ + if (dump) + strcpy(dump_buf, str); + + return (u32)strlen(str) + 1; +} + +/* Dumps zeros to align the specified buffer to dwords. + * Returns the dumped size in bytes. + */ +static u32 qed_dump_align(char *dump_buf, bool dump, u32 byte_offset) +{ + u8 offset_in_dword, align_size; + + offset_in_dword = (u8)(byte_offset & 0x3); + align_size = offset_in_dword ? BYTES_IN_DWORD - offset_in_dword : 0; + + if (dump && align_size) + memset(dump_buf, 0, align_size); + + return align_size; +} + +/* Writes the specified string param to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_str_param(u32 *dump_buf, + bool dump, + const char *param_name, const char *param_val) +{ + char *char_buf = (char *)dump_buf; + u32 offset = 0; + + /* Dump param name */ + offset += qed_dump_str(char_buf + offset, dump, param_name); + + /* Indicate a string param value */ + if (dump) + *(char_buf + offset) = 1; + offset++; + + /* Dump param value */ + offset += qed_dump_str(char_buf + offset, dump, param_val); + + /* Align buffer to next dword */ + offset += qed_dump_align(char_buf + offset, dump, offset); + + return BYTES_TO_DWORDS(offset); +} + +/* Writes the specified numeric param to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_num_param(u32 *dump_buf, + bool dump, const char *param_name, u32 param_val) +{ + char *char_buf = (char *)dump_buf; + u32 offset = 0; + + /* Dump param name */ + offset += qed_dump_str(char_buf + offset, dump, param_name); + + /* Indicate a numeric param value */ + if (dump) + *(char_buf + offset) = 0; + offset++; + + /* Align buffer to next dword */ + offset += qed_dump_align(char_buf + offset, dump, offset); + + /* Dump param value (and change offset from bytes to dwords) */ + offset = BYTES_TO_DWORDS(offset); + if (dump) + *(dump_buf + offset) = param_val; + offset++; + + return offset; +} + +/* Reads the FW version and writes it as a param to the specified buffer. + * Returns the dumped size in dwords. 
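+ * Emits "fw-version" and "fw-image" as string params and "fw-timestamp" as a
+ * numeric param. The EMPTY_* defaults are kept when FW info is not read
+ * (e.g. all Storms are in reset or DBG_GRC_PARAM_NO_FW_VER is set).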
+ */ +static u32 qed_dump_fw_ver_param(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + char fw_ver_str[16] = EMPTY_FW_VERSION_STR; + char fw_img_str[16] = EMPTY_FW_IMAGE_STR; + struct fw_info fw_info = { {0}, {0} }; + u32 offset = 0; + + if (dump && !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_FW_VER)) { + /* Read FW image/version from PRAM in a non-reset SEMI */ + bool found = false; + u8 storm_id; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS && !found; + storm_id++) { + struct storm_defs *storm = &s_storm_defs[storm_id]; + + /* Read FW version/image */ + if (dev_data->block_in_reset[storm->block_id]) + continue; + + /* Read FW info for the current Storm */ + qed_read_fw_info(p_hwfn, p_ptt, storm_id, &fw_info); + + /* Create FW version/image strings */ + if (snprintf(fw_ver_str, sizeof(fw_ver_str), + "%d_%d_%d_%d", fw_info.ver.num.major, + fw_info.ver.num.minor, fw_info.ver.num.rev, + fw_info.ver.num.eng) < 0) + DP_NOTICE(p_hwfn, + "Unexpected debug error: invalid FW version string\n"); + switch (fw_info.ver.image_id) { + case FW_IMG_MAIN: + strcpy(fw_img_str, "main"); + break; + default: + strcpy(fw_img_str, "unknown"); + break; + } + + found = true; + } + } + + /* Dump FW version, image and timestamp */ + offset += qed_dump_str_param(dump_buf + offset, + dump, "fw-version", fw_ver_str); + offset += qed_dump_str_param(dump_buf + offset, + dump, "fw-image", fw_img_str); + offset += qed_dump_num_param(dump_buf + offset, + dump, + "fw-timestamp", fw_info.ver.timestamp); + + return offset; +} + +/* Reads the MFW version and writes it as a param to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_mfw_ver_param(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + char mfw_ver_str[16] = EMPTY_FW_VERSION_STR; + + if (dump && + !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_FW_VER)) { + u32 global_section_offsize, global_section_addr, mfw_ver; + u32 public_data_addr, global_section_offsize_addr; + + /* Find MCP public data GRC address. Needs to be ORed with + * MCP_REG_SCRATCH due to a HW bug. + */ + public_data_addr = qed_rd(p_hwfn, + p_ptt, + MISC_REG_SHARED_MEM_ADDR) | + MCP_REG_SCRATCH; + + /* Find MCP public global section offset */ + global_section_offsize_addr = public_data_addr + + offsetof(struct mcp_public_data, + sections) + + sizeof(offsize_t) * PUBLIC_GLOBAL; + global_section_offsize = qed_rd(p_hwfn, p_ptt, + global_section_offsize_addr); + global_section_addr = + MCP_REG_SCRATCH + + (global_section_offsize & OFFSIZE_OFFSET_MASK) * 4; + + /* Read MFW version from MCP public global section */ + mfw_ver = qed_rd(p_hwfn, p_ptt, + global_section_addr + + offsetof(struct public_global, mfw_ver)); + + /* Dump MFW version param */ + if (snprintf(mfw_ver_str, sizeof(mfw_ver_str), "%d_%d_%d_%d", + (u8)(mfw_ver >> 24), (u8)(mfw_ver >> 16), + (u8)(mfw_ver >> 8), (u8)mfw_ver) < 0) + DP_NOTICE(p_hwfn, + "Unexpected debug error: invalid MFW version string\n"); + } + + return qed_dump_str_param(dump_buf, dump, "mfw-version", mfw_ver_str); +} + +/* Writes a section header to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_dump_section_hdr(u32 *dump_buf, + bool dump, const char *name, u32 num_params) +{ + return qed_dump_num_param(dump_buf, dump, name, num_params); +} + +/* Writes the common global params to the specified buffer. + * Returns the dumped size in dwords. 
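+ * Emits the "global_params" section header (common plus caller-specific
+ * param count) followed by the FW/MFW versions, tools-version, chip,
+ * platform and pci-func params.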
+ */ +static u32 qed_dump_common_global_params(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + u8 num_specific_global_params) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0; + u8 num_params; + + /* Dump global params section header */ + num_params = NUM_COMMON_GLOBAL_PARAMS + num_specific_global_params; + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "global_params", num_params); + + /* Store params */ + offset += qed_dump_fw_ver_param(p_hwfn, p_ptt, dump_buf + offset, dump); + offset += qed_dump_mfw_ver_param(p_hwfn, + p_ptt, dump_buf + offset, dump); + offset += qed_dump_num_param(dump_buf + offset, + dump, "tools-version", TOOLS_VERSION); + offset += qed_dump_str_param(dump_buf + offset, + dump, + "chip", + s_chip_defs[dev_data->chip_id].name); + offset += qed_dump_str_param(dump_buf + offset, + dump, + "platform", + s_platform_defs[dev_data->platform_id]. + name); + offset += + qed_dump_num_param(dump_buf + offset, dump, "pci-func", + p_hwfn->abs_pf_id); + + return offset; +} + +/* Writes the "last" section (including CRC) to the specified buffer at the + * given offset. Returns the dumped size in dwords. + */ +static u32 qed_dump_last_section(u32 *dump_buf, u32 offset, bool dump) +{ + u32 start_offset = offset; + + /* Dump CRC section header */ + offset += qed_dump_section_hdr(dump_buf + offset, dump, "last", 0); + + /* Calculate CRC32 and add it to the dword after the "last" section */ + if (dump) + *(dump_buf + offset) = ~crc32(0xffffffff, + (u8 *)dump_buf, + DWORDS_TO_BYTES(offset)); + + offset++; + + return offset - start_offset; +} + +/* Update blocks reset state */ +static void qed_update_blocks_reset_state(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 reg_val[MAX_DBG_RESET_REGS] = { 0 }; + u32 i; + + /* Read reset registers */ + for (i = 0; i < MAX_DBG_RESET_REGS; i++) + if (s_reset_regs_defs[i].exists[dev_data->chip_id]) + reg_val[i] = qed_rd(p_hwfn, + p_ptt, s_reset_regs_defs[i].addr); + + /* Check if blocks are in reset */ + for (i = 0; i < MAX_BLOCK_ID; i++) { + struct block_defs *block = s_block_defs[i]; + + dev_data->block_in_reset[i] = block->has_reset_bit && + !(reg_val[block->reset_reg] & BIT(block->reset_bit_offset)); + } +} + +/* Enable / disable the Debug block */ +static void qed_bus_enable_dbg_block(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool enable) +{ + qed_wr(p_hwfn, p_ptt, DBG_REG_DBG_BLOCK_ON, enable ? 1 : 0); +} + +/* Resets the Debug block */ +static void qed_bus_reset_dbg_block(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 dbg_reset_reg_addr, old_reset_reg_val, new_reset_reg_val; + struct block_defs *dbg_block = s_block_defs[BLOCK_DBG]; + + dbg_reset_reg_addr = s_reset_regs_defs[dbg_block->reset_reg].addr; + old_reset_reg_val = qed_rd(p_hwfn, p_ptt, dbg_reset_reg_addr); + new_reset_reg_val = + old_reset_reg_val & ~BIT(dbg_block->reset_bit_offset); + + qed_wr(p_hwfn, p_ptt, dbg_reset_reg_addr, new_reset_reg_val); + qed_wr(p_hwfn, p_ptt, dbg_reset_reg_addr, old_reset_reg_val); +} + +static void qed_bus_set_framing_mode(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum dbg_bus_frame_modes mode) +{ + qed_wr(p_hwfn, p_ptt, DBG_REG_FRAMING_MODE, (u8)mode); +} + +/* Enable / disable Debug Bus clients according to the specified mask + * (1 = enable, 0 = disable). 
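+ * Bit i of client_mask corresponds to client i, e.g. a mask of 0x5 enables
+ * clients 0 and 2 and disables all others.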
+ */ +static void qed_bus_enable_clients(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 client_mask) +{ + qed_wr(p_hwfn, p_ptt, DBG_REG_CLIENT_ENABLE, client_mask); +} + +static bool qed_is_mode_match(struct qed_hwfn *p_hwfn, u16 *modes_buf_offset) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + bool arg1, arg2; + const u32 *ptr; + u8 tree_val; + + /* Get next element from modes tree buffer */ + ptr = s_dbg_arrays[BIN_BUF_DBG_MODE_TREE].ptr; + tree_val = ((u8 *)ptr)[(*modes_buf_offset)++]; + + switch (tree_val) { + case INIT_MODE_OP_NOT: + return !qed_is_mode_match(p_hwfn, modes_buf_offset); + case INIT_MODE_OP_OR: + case INIT_MODE_OP_AND: + arg1 = qed_is_mode_match(p_hwfn, modes_buf_offset); + arg2 = qed_is_mode_match(p_hwfn, modes_buf_offset); + return (tree_val == INIT_MODE_OP_OR) ? (arg1 || + arg2) : (arg1 && arg2); + default: + return dev_data->mode_enable[tree_val - MAX_INIT_MODE_OPS] > 0; + } +} + +/* Returns true if the specified entity (indicated by GRC param) should be + * included in the dump, false otherwise. + */ +static bool qed_grc_is_included(struct qed_hwfn *p_hwfn, + enum dbg_grc_params grc_param) +{ + return qed_grc_get_param(p_hwfn, grc_param) > 0; +} + +/* Returns true of the specified Storm should be included in the dump, false + * otherwise. + */ +static bool qed_grc_is_storm_included(struct qed_hwfn *p_hwfn, + enum dbg_storms storm) +{ + return qed_grc_get_param(p_hwfn, (enum dbg_grc_params)storm) > 0; +} + +/* Returns true if the specified memory should be included in the dump, false + * otherwise. + */ +static bool qed_grc_is_mem_included(struct qed_hwfn *p_hwfn, + enum block_id block_id, u8 mem_group_id) +{ + struct block_defs *block = s_block_defs[block_id]; + u8 i; + + /* Check Storm match */ + if (block->associated_to_storm && + !qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)block->storm_id)) + return false; + + for (i = 0; i < NUM_BIG_RAM_TYPES; i++) { + struct big_ram_defs *big_ram = &s_big_ram_defs[i]; + + if (mem_group_id == big_ram->mem_group_id || + mem_group_id == big_ram->ram_mem_group_id) + return qed_grc_is_included(p_hwfn, big_ram->grc_param); + } + + switch (mem_group_id) { + case MEM_GROUP_PXP_ILT: + case MEM_GROUP_PXP_MEM: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_PXP); + case MEM_GROUP_RAM: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_RAM); + case MEM_GROUP_PBUF: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_PBUF); + case MEM_GROUP_CAU_MEM: + case MEM_GROUP_CAU_SB: + case MEM_GROUP_CAU_PI: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CAU); + case MEM_GROUP_QM_MEM: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_QM); + case MEM_GROUP_CFC_MEM: + case MEM_GROUP_CONN_CFC_MEM: + case MEM_GROUP_TASK_CFC_MEM: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CFC) || + qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CM_CTX); + case MEM_GROUP_IGU_MEM: + case MEM_GROUP_IGU_MSIX: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_IGU); + case MEM_GROUP_MULD_MEM: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_MULD); + case MEM_GROUP_PRS_MEM: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_PRS); + case MEM_GROUP_DMAE_MEM: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_DMAE); + case MEM_GROUP_TM_MEM: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_TM); + case MEM_GROUP_SDM_MEM: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_SDM); + case MEM_GROUP_TDIF_CTX: + case MEM_GROUP_RDIF_CTX: + return qed_grc_is_included(p_hwfn, 
DBG_GRC_PARAM_DUMP_DIF); + case MEM_GROUP_CM_MEM: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CM); + case MEM_GROUP_IOR: + return qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_IOR); + default: + return true; + } +} + +/* Stalls all Storms */ +static void qed_grc_stall_storms(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool stall) +{ + u32 reg_addr; + u8 storm_id; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (!qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id)) + continue; + + reg_addr = s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_STALL_0_BB_K2; + qed_wr(p_hwfn, p_ptt, reg_addr, stall ? 1 : 0); + } + + msleep(STALL_DELAY_MS); +} + +/* Takes all blocks out of reset */ +static void qed_grc_unreset_blocks(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 reg_val[MAX_DBG_RESET_REGS] = { 0 }; + u32 block_id, i; + + /* Fill reset regs values */ + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + struct block_defs *block = s_block_defs[block_id]; + + if (block->exists[dev_data->chip_id] && block->has_reset_bit && + block->unreset) + reg_val[block->reset_reg] |= + BIT(block->reset_bit_offset); + } + + /* Write reset registers */ + for (i = 0; i < MAX_DBG_RESET_REGS; i++) { + if (!s_reset_regs_defs[i].exists[dev_data->chip_id]) + continue; + + reg_val[i] |= + s_reset_regs_defs[i].unreset_val[dev_data->chip_id]; + + if (reg_val[i]) + qed_wr(p_hwfn, + p_ptt, + s_reset_regs_defs[i].addr + + RESET_REG_UNRESET_OFFSET, reg_val[i]); + } +} + +/* Returns the attention block data of the specified block */ +static const struct dbg_attn_block_type_data * +qed_get_block_attn_data(enum block_id block_id, enum dbg_attn_type attn_type) +{ + const struct dbg_attn_block *base_attn_block_arr = + (const struct dbg_attn_block *) + s_dbg_arrays[BIN_BUF_DBG_ATTN_BLOCKS].ptr; + + return &base_attn_block_arr[block_id].per_type_data[attn_type]; +} + +/* Returns the attention registers of the specified block */ +static const struct dbg_attn_reg * +qed_get_block_attn_regs(enum block_id block_id, enum dbg_attn_type attn_type, + u8 *num_attn_regs) +{ + const struct dbg_attn_block_type_data *block_type_data = + qed_get_block_attn_data(block_id, attn_type); + + *num_attn_regs = block_type_data->num_regs; + + return &((const struct dbg_attn_reg *) + s_dbg_arrays[BIN_BUF_DBG_ATTN_REGS].ptr)[block_type_data-> + regs_offset]; +} + +/* For each block, clear the status of all parities */ +static void qed_grc_clear_all_prty(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + const struct dbg_attn_reg *attn_reg_arr; + u8 reg_idx, num_attn_regs; + u32 block_id; + + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + if (dev_data->block_in_reset[block_id]) + continue; + + attn_reg_arr = qed_get_block_attn_regs((enum block_id)block_id, + ATTN_TYPE_PARITY, + &num_attn_regs); + + for (reg_idx = 0; reg_idx < num_attn_regs; reg_idx++) { + const struct dbg_attn_reg *reg_data = + &attn_reg_arr[reg_idx]; + u16 modes_buf_offset; + bool eval_mode; + + /* Check mode */ + eval_mode = GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + modes_buf_offset = + GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + + /* If Mode match: clear parity status */ + if (!eval_mode || + qed_is_mode_match(p_hwfn, &modes_buf_offset)) + qed_rd(p_hwfn, p_ptt, + DWORDS_TO_BYTES(reg_data-> + sts_clr_address)); + } + } +} + +/* Dumps GRC 
registers section header. Returns the dumped size in dwords. + * The following parameters are dumped: + * - count: no. of dumped entries + * - split: split type + * - id: split ID (dumped only if split_id >= 0) + * - param_name: user parameter value (dumped only if param_name != NULL + * and param_val != NULL). + */ +static u32 qed_grc_dump_regs_hdr(u32 *dump_buf, + bool dump, + u32 num_reg_entries, + const char *split_type, + int split_id, + const char *param_name, const char *param_val) +{ + u8 num_params = 2 + (split_id >= 0 ? 1 : 0) + (param_name ? 1 : 0); + u32 offset = 0; + + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "grc_regs", num_params); + offset += qed_dump_num_param(dump_buf + offset, + dump, "count", num_reg_entries); + offset += qed_dump_str_param(dump_buf + offset, + dump, "split", split_type); + if (split_id >= 0) + offset += qed_dump_num_param(dump_buf + offset, + dump, "id", split_id); + if (param_name && param_val) + offset += qed_dump_str_param(dump_buf + offset, + dump, param_name, param_val); + + return offset; +} + +/* Reads the specified registers into the specified buffer. + * The addr and len arguments are specified in dwords. + */ +void qed_read_regs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *buf, u32 addr, u32 len) +{ + u32 i; + + for (i = 0; i < len; i++) + buf[i] = qed_rd(p_hwfn, p_ptt, DWORDS_TO_BYTES(addr + i)); +} + +/* Dumps the GRC registers in the specified address range. + * Returns the dumped size in dwords. + * The addr and len arguments are specified in dwords. + */ +static u32 qed_grc_dump_addr_range(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 addr, u32 len, bool wide_bus) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + if (!dump) + return len; + + /* Print log if needed */ + dev_data->num_regs_read += len; + if (dev_data->num_regs_read >= + s_platform_defs[dev_data->platform_id].log_thresh) { + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "Dumping %d registers...\n", + dev_data->num_regs_read); + dev_data->num_regs_read = 0; + } + + /* Try reading using DMAE */ + if (dev_data->use_dmae && + (len >= s_platform_defs[dev_data->platform_id].dmae_thresh || + wide_bus)) { + if (!qed_dmae_grc2host(p_hwfn, p_ptt, DWORDS_TO_BYTES(addr), + (u64)(uintptr_t)(dump_buf), len, 0)) + return len; + dev_data->use_dmae = 0; + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "Failed reading from chip using DMAE, using GRC instead\n"); + } + + /* Read registers */ + qed_read_regs(p_hwfn, p_ptt, dump_buf, addr, len); + + return len; +} + +/* Dumps GRC registers sequence header. Returns the dumped size in dwords. + * The addr and len arguments are specified in dwords. + */ +static u32 qed_grc_dump_reg_entry_hdr(u32 *dump_buf, + bool dump, u32 addr, u32 len) +{ + if (dump) + *dump_buf = addr | (len << REG_DUMP_LEN_SHIFT); + + return 1; +} + +/* Dumps GRC registers sequence. Returns the dumped size in dwords. + * The addr and len arguments are specified in dwords. + */ +static u32 qed_grc_dump_reg_entry(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 addr, u32 len, bool wide_bus) +{ + u32 offset = 0; + + offset += qed_grc_dump_reg_entry_hdr(dump_buf, dump, addr, len); + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, addr, len, wide_bus); + + return offset; +} + +/* Dumps GRC registers sequence with skip cycle. + * Returns the dumped size in dwords. + * - addr: start GRC address in dwords + * - total_len: total no. 
of dwords to dump + * - read_len: no. consecutive dwords to read + * - skip_len: no. of dwords to skip (and fill with zeros) + */ +static u32 qed_grc_dump_reg_entry_skip(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + u32 addr, + u32 total_len, + u32 read_len, u32 skip_len) +{ + u32 offset = 0, reg_offset = 0; + + offset += qed_grc_dump_reg_entry_hdr(dump_buf, dump, addr, total_len); + + if (!dump) + return offset + total_len; + + while (reg_offset < total_len) { + u32 curr_len = min_t(u32, read_len, total_len - reg_offset); + + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, addr, curr_len, false); + reg_offset += curr_len; + addr += curr_len; + + if (reg_offset < total_len) { + curr_len = min_t(u32, skip_len, total_len - skip_len); + memset(dump_buf + offset, 0, DWORDS_TO_BYTES(curr_len)); + offset += curr_len; + reg_offset += curr_len; + addr += curr_len; + } + } + + return offset; +} + +/* Dumps GRC registers entries. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_regs_entries(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct dbg_array input_regs_arr, + u32 *dump_buf, + bool dump, + bool block_enable[MAX_BLOCK_ID], + u32 *num_dumped_reg_entries) +{ + u32 i, offset = 0, input_offset = 0; + bool mode_match = true; + + *num_dumped_reg_entries = 0; + + while (input_offset < input_regs_arr.size_in_dwords) { + const struct dbg_dump_cond_hdr *cond_hdr = + (const struct dbg_dump_cond_hdr *) + &input_regs_arr.ptr[input_offset++]; + u16 modes_buf_offset; + bool eval_mode; + + /* Check mode/block */ + eval_mode = GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + if (eval_mode) { + modes_buf_offset = + GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + mode_match = qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (!mode_match || !block_enable[cond_hdr->block_id]) { + input_offset += cond_hdr->data_size; + continue; + } + + for (i = 0; i < cond_hdr->data_size; i++, input_offset++) { + const struct dbg_dump_reg *reg = + (const struct dbg_dump_reg *) + &input_regs_arr.ptr[input_offset]; + u32 addr, len; + bool wide_bus; + + addr = GET_FIELD(reg->data, DBG_DUMP_REG_ADDRESS); + len = GET_FIELD(reg->data, DBG_DUMP_REG_LENGTH); + wide_bus = GET_FIELD(reg->data, DBG_DUMP_REG_WIDE_BUS); + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + len, + wide_bus); + (*num_dumped_reg_entries)++; + } + } + + return offset; +} + +/* Dumps GRC registers entries. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_split_data(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct dbg_array input_regs_arr, + u32 *dump_buf, + bool dump, + bool block_enable[MAX_BLOCK_ID], + const char *split_type_name, + u32 split_id, + const char *param_name, + const char *param_val) +{ + u32 num_dumped_reg_entries, offset; + + /* Calculate register dump header size (and skip it for now) */ + offset = qed_grc_dump_regs_hdr(dump_buf, + false, + 0, + split_type_name, + split_id, param_name, param_val); + + /* Dump registers */ + offset += qed_grc_dump_regs_entries(p_hwfn, + p_ptt, + input_regs_arr, + dump_buf + offset, + dump, + block_enable, + &num_dumped_reg_entries); + + /* Write register dump header */ + if (dump && num_dumped_reg_entries > 0) + qed_grc_dump_regs_hdr(dump_buf, + dump, + num_dumped_reg_entries, + split_type_name, + split_id, param_name, param_val); + + return num_dumped_reg_entries > 0 ? 
offset : 0; +} + +/* Dumps registers according to the input registers array. Returns the dumped + * size in dwords. + */ +static u32 qed_grc_dump_registers(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + bool block_enable[MAX_BLOCK_ID], + const char *param_name, const char *param_val) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + struct chip_platform_defs *chip_platform; + u32 offset = 0, input_offset = 0; + struct chip_defs *chip; + u8 port_id, pf_id, vf_id; + u16 fid; + + chip = &s_chip_defs[dev_data->chip_id]; + chip_platform = &chip->per_platform[dev_data->platform_id]; + + while (input_offset < + s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].size_in_dwords) { + const struct dbg_dump_split_hdr *split_hdr; + struct dbg_array curr_input_regs_arr; + u32 split_data_size; + u8 split_type_id; + + split_hdr = + (const struct dbg_dump_split_hdr *) + &s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr[input_offset++]; + split_type_id = + GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID); + split_data_size = + GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_DATA_SIZE); + curr_input_regs_arr.ptr = + &s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr[input_offset]; + curr_input_regs_arr.size_in_dwords = split_data_size; + + switch (split_type_id) { + case SPLIT_TYPE_NONE: + offset += qed_grc_dump_split_data(p_hwfn, + p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, + block_enable, + "eng", + (u32)(-1), + param_name, + param_val); + break; + + case SPLIT_TYPE_PORT: + for (port_id = 0; port_id < chip_platform->num_ports; + port_id++) { + if (dump) + qed_port_pretend(p_hwfn, p_ptt, + port_id); + offset += + qed_grc_dump_split_data(p_hwfn, p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, block_enable, + "port", port_id, + param_name, + param_val); + } + break; + + case SPLIT_TYPE_PF: + case SPLIT_TYPE_PORT_PF: + for (pf_id = 0; pf_id < chip_platform->num_pfs; + pf_id++) { + u8 pfid_shift = + PXP_PRETEND_CONCRETE_FID_PFID_SHIFT; + + if (dump) { + fid = pf_id << pfid_shift; + qed_fid_pretend(p_hwfn, p_ptt, fid); + } + + offset += + qed_grc_dump_split_data(p_hwfn, + p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, + block_enable, + "pf", + pf_id, + param_name, + param_val); + } + break; + + case SPLIT_TYPE_VF: + for (vf_id = 0; vf_id < chip_platform->num_vfs; + vf_id++) { + u8 vfvalid_shift = + PXP_PRETEND_CONCRETE_FID_VFVALID_SHIFT; + u8 vfid_shift = + PXP_PRETEND_CONCRETE_FID_VFID_SHIFT; + + if (dump) { + fid = BIT(vfvalid_shift) | + (vf_id << vfid_shift); + qed_fid_pretend(p_hwfn, p_ptt, fid); + } + + offset += + qed_grc_dump_split_data(p_hwfn, p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, block_enable, + "vf", vf_id, + param_name, + param_val); + } + break; + + default: + break; + } + + input_offset += split_data_size; + } + + /* Pretend to original PF */ + if (dump) { + fid = p_hwfn->rel_pf_id << PXP_PRETEND_CONCRETE_FID_PFID_SHIFT; + qed_fid_pretend(p_hwfn, p_ptt, fid); + } + + return offset; +} + +/* Dump reset registers. Returns the dumped size in dwords. 
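+ * Each reset register that exists on the current chip is dumped as a
+ * single-dword entry; the section header is rewritten afterwards with the
+ * final register count.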
*/ +static u32 qed_grc_dump_reset_regs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 i, offset = 0, num_regs = 0; + + /* Calculate header size */ + offset += qed_grc_dump_regs_hdr(dump_buf, + false, 0, "eng", -1, NULL, NULL); + + /* Write reset registers */ + for (i = 0; i < MAX_DBG_RESET_REGS; i++) { + if (!s_reset_regs_defs[i].exists[dev_data->chip_id]) + continue; + + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + BYTES_TO_DWORDS + (s_reset_regs_defs[i].addr), 1, + false); + num_regs++; + } + + /* Write header */ + if (dump) + qed_grc_dump_regs_hdr(dump_buf, + true, num_regs, "eng", -1, NULL, NULL); + + return offset; +} + +/* Dump registers that are modified during GRC Dump and therefore must be + * dumped first. Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 block_id, offset = 0, num_reg_entries = 0; + const struct dbg_attn_reg *attn_reg_arr; + u8 storm_id, reg_idx, num_attn_regs; + + /* Calculate header size */ + offset += qed_grc_dump_regs_hdr(dump_buf, + false, 0, "eng", -1, NULL, NULL); + + /* Write parity registers */ + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + if (dev_data->block_in_reset[block_id] && dump) + continue; + + attn_reg_arr = qed_get_block_attn_regs((enum block_id)block_id, + ATTN_TYPE_PARITY, + &num_attn_regs); + + for (reg_idx = 0; reg_idx < num_attn_regs; reg_idx++) { + const struct dbg_attn_reg *reg_data = + &attn_reg_arr[reg_idx]; + u16 modes_buf_offset; + bool eval_mode; + u32 addr; + + /* Check mode */ + eval_mode = GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + modes_buf_offset = + GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + if (eval_mode && + !qed_is_mode_match(p_hwfn, &modes_buf_offset)) + continue; + + /* Mode match: read & dump registers */ + addr = reg_data->mask_address; + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + 1, false); + addr = GET_FIELD(reg_data->data, + DBG_ATTN_REG_STS_ADDRESS); + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + 1, false); + num_reg_entries += 2; + } + } + + /* Write Storm stall status registers */ + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + struct storm_defs *storm = &s_storm_defs[storm_id]; + u32 addr; + + if (dev_data->block_in_reset[storm->block_id] && dump) + continue; + + addr = + BYTES_TO_DWORDS(s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_STALLED); + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + 1, + false); + num_reg_entries++; + } + + /* Write header */ + if (dump) + qed_grc_dump_regs_hdr(dump_buf, + true, + num_reg_entries, "eng", -1, NULL, NULL); + + return offset; +} + +/* Dumps registers that can't be represented in the debug arrays */ +static u32 qed_grc_dump_special_regs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + u32 offset = 0, addr; + + offset += qed_grc_dump_regs_hdr(dump_buf, + dump, 2, "eng", -1, NULL, NULL); + + /* Dump R/TDIF_REG_DEBUG_ERROR_INFO_SIZE (every 8'th register should be + * skipped). 
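+ * The entries are dumped with read_len = 7 and skip_len = 1, so every
+ * eighth dword is zero-filled instead of being read.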
+ */ + addr = BYTES_TO_DWORDS(RDIF_REG_DEBUG_ERROR_INFO); + offset += qed_grc_dump_reg_entry_skip(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + RDIF_REG_DEBUG_ERROR_INFO_SIZE, + 7, + 1); + addr = BYTES_TO_DWORDS(TDIF_REG_DEBUG_ERROR_INFO); + offset += + qed_grc_dump_reg_entry_skip(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + TDIF_REG_DEBUG_ERROR_INFO_SIZE, + 7, + 1); + + return offset; +} + +/* Dumps a GRC memory header (section and params). Returns the dumped size in + * dwords. The following parameters are dumped: + * - name: dumped only if it's not NULL. + * - addr: in dwords, dumped only if name is NULL. + * - len: in dwords, always dumped. + * - width: dumped if it's not zero. + * - packed: dumped only if it's not false. + * - mem_group: always dumped. + * - is_storm: true only if the memory is related to a Storm. + * - storm_letter: valid only if is_storm is true. + * + */ +static u32 qed_grc_dump_mem_hdr(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + bool dump, + const char *name, + u32 addr, + u32 len, + u32 bit_width, + bool packed, + const char *mem_group, + bool is_storm, char storm_letter) +{ + u8 num_params = 3; + u32 offset = 0; + char buf[64]; + + if (!len) + DP_NOTICE(p_hwfn, + "Unexpected GRC Dump error: dumped memory size must be non-zero\n"); + + if (bit_width) + num_params++; + if (packed) + num_params++; + + /* Dump section header */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "grc_mem", num_params); + + if (name) { + /* Dump name */ + if (is_storm) { + strcpy(buf, "?STORM_"); + buf[0] = storm_letter; + strcpy(buf + strlen(buf), name); + } else { + strcpy(buf, name); + } + + offset += qed_dump_str_param(dump_buf + offset, + dump, "name", buf); + } else { + /* Dump address */ + u32 addr_in_bytes = DWORDS_TO_BYTES(addr); + + offset += qed_dump_num_param(dump_buf + offset, + dump, "addr", addr_in_bytes); + } + + /* Dump len */ + offset += qed_dump_num_param(dump_buf + offset, dump, "len", len); + + /* Dump bit width */ + if (bit_width) + offset += qed_dump_num_param(dump_buf + offset, + dump, "width", bit_width); + + /* Dump packed */ + if (packed) + offset += qed_dump_num_param(dump_buf + offset, + dump, "packed", 1); + + /* Dump reg type */ + if (is_storm) { + strcpy(buf, "?STORM_"); + buf[0] = storm_letter; + strcpy(buf + strlen(buf), mem_group); + } else { + strcpy(buf, mem_group); + } + + offset += qed_dump_str_param(dump_buf + offset, dump, "type", buf); + + return offset; +} + +/* Dumps a single GRC memory. If name is NULL, the memory is stored by address. + * Returns the dumped size in dwords. + * The addr and len arguments are specified in dwords. + */ +static u32 qed_grc_dump_mem(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + const char *name, + u32 addr, + u32 len, + bool wide_bus, + u32 bit_width, + bool packed, + const char *mem_group, + bool is_storm, char storm_letter) +{ + u32 offset = 0; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + name, + addr, + len, + bit_width, + packed, + mem_group, is_storm, storm_letter); + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, addr, len, wide_bus); + + return offset; +} + +/* Dumps GRC memories entries. Returns the dumped size in dwords. 
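+ * CCFC/TCFC memory lengths are scaled by the configured NUM_LCIDS/NUM_LTIDS
+ * GRC params, and Storm-associated memories are tagged with the Storm letter.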
*/ +static u32 qed_grc_dump_mem_entries(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct dbg_array input_mems_arr, + u32 *dump_buf, bool dump) +{ + u32 i, offset = 0, input_offset = 0; + bool mode_match = true; + + while (input_offset < input_mems_arr.size_in_dwords) { + const struct dbg_dump_cond_hdr *cond_hdr; + u16 modes_buf_offset; + u32 num_entries; + bool eval_mode; + + cond_hdr = (const struct dbg_dump_cond_hdr *) + &input_mems_arr.ptr[input_offset++]; + num_entries = cond_hdr->data_size / MEM_DUMP_ENTRY_SIZE_DWORDS; + + /* Check required mode */ + eval_mode = GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + if (eval_mode) { + modes_buf_offset = + GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + mode_match = qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (!mode_match) { + input_offset += cond_hdr->data_size; + continue; + } + + for (i = 0; i < num_entries; + i++, input_offset += MEM_DUMP_ENTRY_SIZE_DWORDS) { + const struct dbg_dump_mem *mem = + (const struct dbg_dump_mem *) + &input_mems_arr.ptr[input_offset]; + u8 mem_group_id = GET_FIELD(mem->dword0, + DBG_DUMP_MEM_MEM_GROUP_ID); + bool is_storm = false, mem_wide_bus; + enum dbg_grc_params grc_param; + char storm_letter = 'a'; + enum block_id block_id; + u32 mem_addr, mem_len; + + if (mem_group_id >= MEM_GROUPS_NUM) { + DP_NOTICE(p_hwfn, "Invalid mem_group_id\n"); + return 0; + } + + block_id = (enum block_id)cond_hdr->block_id; + if (!qed_grc_is_mem_included(p_hwfn, + block_id, + mem_group_id)) + continue; + + mem_addr = GET_FIELD(mem->dword0, DBG_DUMP_MEM_ADDRESS); + mem_len = GET_FIELD(mem->dword1, DBG_DUMP_MEM_LENGTH); + mem_wide_bus = GET_FIELD(mem->dword1, + DBG_DUMP_MEM_WIDE_BUS); + + /* Update memory length for CCFC/TCFC memories + * according to number of LCIDs/LTIDs. + */ + if (mem_group_id == MEM_GROUP_CONN_CFC_MEM) { + if (mem_len % MAX_LCIDS) { + DP_NOTICE(p_hwfn, + "Invalid CCFC connection memory size\n"); + return 0; + } + + grc_param = DBG_GRC_PARAM_NUM_LCIDS; + mem_len = qed_grc_get_param(p_hwfn, grc_param) * + (mem_len / MAX_LCIDS); + } else if (mem_group_id == MEM_GROUP_TASK_CFC_MEM) { + if (mem_len % MAX_LTIDS) { + DP_NOTICE(p_hwfn, + "Invalid TCFC task memory size\n"); + return 0; + } + + grc_param = DBG_GRC_PARAM_NUM_LTIDS; + mem_len = qed_grc_get_param(p_hwfn, grc_param) * + (mem_len / MAX_LTIDS); + } + + /* If memory is associated with Storm, update Storm + * details. + */ + if (s_block_defs + [cond_hdr->block_id]->associated_to_storm) { + is_storm = true; + storm_letter = + s_storm_defs[s_block_defs + [cond_hdr->block_id]-> + storm_id].letter; + } + + /* Dump memory */ + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + NULL, + mem_addr, + mem_len, + mem_wide_bus, + 0, + false, + s_mem_group_names[mem_group_id], + is_storm, + storm_letter); + } + } + + return offset; +} + +/* Dumps GRC memories according to the input array dump_mem. + * Returns the dumped size in dwords. 
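+ * Only SPLIT_TYPE_NONE input headers are handled; split memories are
+ * reported as unsupported.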
+ */ +static u32 qed_grc_dump_memories(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + u32 offset = 0, input_offset = 0; + + while (input_offset < + s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].size_in_dwords) { + const struct dbg_dump_split_hdr *split_hdr; + struct dbg_array curr_input_mems_arr; + u32 split_data_size; + u8 split_type_id; + + split_hdr = (const struct dbg_dump_split_hdr *) + &s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr[input_offset++]; + split_type_id = + GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID); + split_data_size = + GET_FIELD(split_hdr->hdr, + DBG_DUMP_SPLIT_HDR_DATA_SIZE); + curr_input_mems_arr.ptr = + &s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr[input_offset]; + curr_input_mems_arr.size_in_dwords = split_data_size; + + switch (split_type_id) { + case SPLIT_TYPE_NONE: + offset += qed_grc_dump_mem_entries(p_hwfn, + p_ptt, + curr_input_mems_arr, + dump_buf + offset, + dump); + break; + + default: + DP_NOTICE(p_hwfn, + "Dumping split memories is currently not supported\n"); + break; + } + + input_offset += split_data_size; + } + + return offset; +} + +/* Dumps GRC context data for the specified Storm. + * Returns the dumped size in dwords. + * The lid_size argument is specified in quad-regs. + */ +static u32 qed_grc_dump_ctx_data(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + const char *name, + u32 num_lids, + u32 lid_size, + u32 rd_reg_addr, + u8 storm_id) +{ + struct storm_defs *storm = &s_storm_defs[storm_id]; + u32 i, lid, total_size, offset = 0; + + if (!lid_size) + return 0; + + lid_size *= BYTES_IN_DWORD; + total_size = num_lids * lid_size; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + name, + 0, + total_size, + lid_size * 32, + false, name, true, storm->letter); + + if (!dump) + return offset + total_size; + + /* Dump context data */ + for (lid = 0; lid < num_lids; lid++) { + for (i = 0; i < lid_size; i++, offset++) { + qed_wr(p_hwfn, + p_ptt, storm->cm_ctx_wr_addr, (i << 9) | lid); + *(dump_buf + offset) = qed_rd(p_hwfn, + p_ptt, rd_reg_addr); + } + } + + return offset; +} + +/* Dumps GRC contexts. Returns the dumped size in dwords. 
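+ * For each included Storm, the Conn AG/ST and Task AG/ST contexts are dumped,
+ * sized according to the NUM_LCIDS/NUM_LTIDS GRC parameters.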
*/ +static u32 qed_grc_dump_ctx(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + enum dbg_grc_params grc_param; + u32 offset = 0; + u8 storm_id; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + struct storm_defs *storm = &s_storm_defs[storm_id]; + + if (!qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id)) + continue; + + /* Dump Conn AG context size */ + grc_param = DBG_GRC_PARAM_NUM_LCIDS; + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "CONN_AG_CTX", + qed_grc_get_param(p_hwfn, + grc_param), + storm->cm_conn_ag_ctx_lid_size, + storm->cm_conn_ag_ctx_rd_addr, + storm_id); + + /* Dump Conn ST context size */ + grc_param = DBG_GRC_PARAM_NUM_LCIDS; + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "CONN_ST_CTX", + qed_grc_get_param(p_hwfn, + grc_param), + storm->cm_conn_st_ctx_lid_size, + storm->cm_conn_st_ctx_rd_addr, + storm_id); + + /* Dump Task AG context size */ + grc_param = DBG_GRC_PARAM_NUM_LTIDS; + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "TASK_AG_CTX", + qed_grc_get_param(p_hwfn, + grc_param), + storm->cm_task_ag_ctx_lid_size, + storm->cm_task_ag_ctx_rd_addr, + storm_id); + + /* Dump Task ST context size */ + grc_param = DBG_GRC_PARAM_NUM_LTIDS; + offset += + qed_grc_dump_ctx_data(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + "TASK_ST_CTX", + qed_grc_get_param(p_hwfn, + grc_param), + storm->cm_task_st_ctx_lid_size, + storm->cm_task_st_ctx_rd_addr, + storm_id); + } + + return offset; +} + +/* Dumps GRC IORs data. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_iors(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + char buf[10] = "IOR_SET_?"; + u32 addr, offset = 0; + u8 storm_id, set_id; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + struct storm_defs *storm = &s_storm_defs[storm_id]; + + if (!qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id)) + continue; + + for (set_id = 0; set_id < NUM_IOR_SETS; set_id++) { + addr = BYTES_TO_DWORDS(storm->sem_fast_mem_addr + + SEM_FAST_REG_STORM_REG_FILE) + + IOR_SET_OFFSET(set_id); + buf[strlen(buf) - 1] = '0' + set_id; + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + buf, + addr, + IORS_PER_SET, + false, + 32, + false, + "ior", + true, + storm->letter); + } + } + + return offset; +} + +/* Dump VFC CAM. Returns the dumped size in dwords. 
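+ * Each CAM row is read indirectly: a CAM read command and the row address are
+ * written through the Storm SEM_FAST VFC registers, and VFC_CAM_RESP_DWORDS
+ * dwords are read back per row.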
*/ +static u32 qed_grc_dump_vfc_cam(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump, u8 storm_id) +{ + u32 total_size = VFC_CAM_NUM_ROWS * VFC_CAM_RESP_DWORDS; + struct storm_defs *storm = &s_storm_defs[storm_id]; + u32 cam_addr[VFC_CAM_ADDR_DWORDS] = { 0 }; + u32 cam_cmd[VFC_CAM_CMD_DWORDS] = { 0 }; + u32 row, i, offset = 0; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + "vfc_cam", + 0, + total_size, + 256, + false, "vfc_cam", true, storm->letter); + + if (!dump) + return offset + total_size; + + /* Prepare CAM address */ + SET_VAR_FIELD(cam_addr, VFC_CAM_ADDR, OP, VFC_OPCODE_CAM_RD); + + for (row = 0; row < VFC_CAM_NUM_ROWS; + row++, offset += VFC_CAM_RESP_DWORDS) { + /* Write VFC CAM command */ + SET_VAR_FIELD(cam_cmd, VFC_CAM_CMD, ROW, row); + ARR_REG_WR(p_hwfn, + p_ptt, + storm->sem_fast_mem_addr + SEM_FAST_REG_VFC_DATA_WR, + cam_cmd, VFC_CAM_CMD_DWORDS); + + /* Write VFC CAM address */ + ARR_REG_WR(p_hwfn, + p_ptt, + storm->sem_fast_mem_addr + SEM_FAST_REG_VFC_ADDR, + cam_addr, VFC_CAM_ADDR_DWORDS); + + /* Read VFC CAM read response */ + ARR_REG_RD(p_hwfn, + p_ptt, + storm->sem_fast_mem_addr + SEM_FAST_REG_VFC_DATA_RD, + dump_buf + offset, VFC_CAM_RESP_DWORDS); + } + + return offset; +} + +/* Dump VFC RAM. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_vfc_ram(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + u8 storm_id, struct vfc_ram_defs *ram_defs) +{ + u32 total_size = ram_defs->num_rows * VFC_RAM_RESP_DWORDS; + struct storm_defs *storm = &s_storm_defs[storm_id]; + u32 ram_addr[VFC_RAM_ADDR_DWORDS] = { 0 }; + u32 ram_cmd[VFC_RAM_CMD_DWORDS] = { 0 }; + u32 row, i, offset = 0; + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + ram_defs->mem_name, + 0, + total_size, + 256, + false, + ram_defs->type_name, + true, storm->letter); + + /* Prepare RAM address */ + SET_VAR_FIELD(ram_addr, VFC_RAM_ADDR, OP, VFC_OPCODE_RAM_RD); + + if (!dump) + return offset + total_size; + + for (row = ram_defs->base_row; + row < ram_defs->base_row + ram_defs->num_rows; + row++, offset += VFC_RAM_RESP_DWORDS) { + /* Write VFC RAM command */ + ARR_REG_WR(p_hwfn, + p_ptt, + storm->sem_fast_mem_addr + SEM_FAST_REG_VFC_DATA_WR, + ram_cmd, VFC_RAM_CMD_DWORDS); + + /* Write VFC RAM address */ + SET_VAR_FIELD(ram_addr, VFC_RAM_ADDR, ROW, row); + ARR_REG_WR(p_hwfn, + p_ptt, + storm->sem_fast_mem_addr + SEM_FAST_REG_VFC_ADDR, + ram_addr, VFC_RAM_ADDR_DWORDS); + + /* Read VFC RAM read response */ + ARR_REG_RD(p_hwfn, + p_ptt, + storm->sem_fast_mem_addr + SEM_FAST_REG_VFC_DATA_RD, + dump_buf + offset, VFC_RAM_RESP_DWORDS); + } + + return offset; +} + +/* Dumps GRC VFC data. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_vfc(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u8 storm_id, i; + u32 offset = 0; + + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + if (!qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id) || + !s_storm_defs[storm_id].has_vfc || + (storm_id == DBG_PSTORM_ID && dev_data->platform_id != + PLATFORM_ASIC)) + continue; + + /* Read CAM */ + offset += qed_grc_dump_vfc_cam(p_hwfn, + p_ptt, + dump_buf + offset, + dump, storm_id); + + /* Read RAM */ + for (i = 0; i < NUM_VFC_RAM_TYPES; i++) + offset += qed_grc_dump_vfc_ram(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + storm_id, + &s_vfc_ram_defs[i]); + } + + return offset; +} + +/* Dumps GRC RSS data. 
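+ * Each RSS memory is read indirectly: the row address is written to
+ * RSS_REG_RSS_RAM_ADDR and up to RSS_REG_RSS_RAM_DATA_SIZE dwords are read
+ * from RSS_REG_RSS_RAM_DATA per row.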
Returns the dumped size in dwords. */ +static u32 qed_grc_dump_rss(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 offset = 0; + u8 rss_mem_id; + + for (rss_mem_id = 0; rss_mem_id < NUM_RSS_MEM_TYPES; rss_mem_id++) { + u32 rss_addr, num_entries, total_dwords; + struct rss_mem_defs *rss_defs; + u32 addr, num_dwords_to_read; + bool packed; + + rss_defs = &s_rss_mem_defs[rss_mem_id]; + rss_addr = rss_defs->addr; + num_entries = rss_defs->num_entries[dev_data->chip_id]; + total_dwords = (num_entries * rss_defs->entry_width) / 32; + packed = (rss_defs->entry_width == 16); + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + rss_defs->mem_name, + 0, + total_dwords, + rss_defs->entry_width, + packed, + rss_defs->type_name, false, 0); + + /* Dump RSS data */ + if (!dump) { + offset += total_dwords; + continue; + } + + addr = BYTES_TO_DWORDS(RSS_REG_RSS_RAM_DATA); + while (total_dwords) { + num_dwords_to_read = min_t(u32, + RSS_REG_RSS_RAM_DATA_SIZE, + total_dwords); + qed_wr(p_hwfn, p_ptt, RSS_REG_RSS_RAM_ADDR, rss_addr); + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + num_dwords_to_read, + false); + total_dwords -= num_dwords_to_read; + rss_addr++; + } + } + + return offset; +} + +/* Dumps GRC Big RAM. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_big_ram(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump, u8 big_ram_id) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 block_size, ram_size, offset = 0, reg_val, i; + char mem_name[12] = "???_BIG_RAM"; + char type_name[8] = "???_RAM"; + struct big_ram_defs *big_ram; + + big_ram = &s_big_ram_defs[big_ram_id]; + ram_size = big_ram->ram_size[dev_data->chip_id]; + + reg_val = qed_rd(p_hwfn, p_ptt, big_ram->is_256b_reg_addr); + block_size = reg_val & + BIT(big_ram->is_256b_bit_offset[dev_data->chip_id]) ? 
256 + : 128; + + strscpy(type_name, big_ram->instance_name, sizeof(type_name)); + strscpy(mem_name, big_ram->instance_name, sizeof(mem_name)); + + /* Dump memory header */ + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + mem_name, + 0, + ram_size, + block_size * 8, + false, type_name, false, 0); + + /* Read and dump Big RAM data */ + if (!dump) + return offset + ram_size; + + /* Dump Big RAM */ + for (i = 0; i < DIV_ROUND_UP(ram_size, BRB_REG_BIG_RAM_DATA_SIZE); + i++) { + u32 addr, len; + + qed_wr(p_hwfn, p_ptt, big_ram->addr_reg_addr, i); + addr = BYTES_TO_DWORDS(big_ram->data_reg_addr); + len = BRB_REG_BIG_RAM_DATA_SIZE; + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + len, + false); + } + + return offset; +} + +static u32 qed_grc_dump_mcp(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + bool block_enable[MAX_BLOCK_ID] = { 0 }; + u32 offset = 0, addr; + bool halted = false; + + /* Halt MCP */ + if (dump && !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_MCP)) { + halted = !qed_mcp_halt(p_hwfn, p_ptt); + if (!halted) + DP_NOTICE(p_hwfn, "MCP halt failed!\n"); + } + + /* Dump MCP scratchpad */ + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + NULL, + BYTES_TO_DWORDS(MCP_REG_SCRATCH), + MCP_REG_SCRATCH_SIZE_BB_K2, + false, 0, false, "MCP", false, 0); + + /* Dump MCP cpu_reg_file */ + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + NULL, + BYTES_TO_DWORDS(MCP_REG_CPU_REG_FILE), + MCP_REG_CPU_REG_FILE_SIZE, + false, 0, false, "MCP", false, 0); + + /* Dump MCP registers */ + block_enable[BLOCK_MCP] = true; + offset += qed_grc_dump_registers(p_hwfn, + p_ptt, + dump_buf + offset, + dump, block_enable, "block", "MCP"); + + /* Dump required non-MCP registers */ + offset += qed_grc_dump_regs_hdr(dump_buf + offset, + dump, 1, "eng", -1, "block", "MCP"); + addr = BYTES_TO_DWORDS(MISC_REG_SHARED_MEM_ADDR); + offset += qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + 1, + false); + + /* Release MCP */ + if (halted && qed_mcp_resume(p_hwfn, p_ptt)) + DP_NOTICE(p_hwfn, "Failed to resume MCP after halt!\n"); + + return offset; +} + +/* Dumps the tbus indirect memory for all PHYs. 
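+ * For each PHY, the tbus address is selected through the address lo/hi
+ * registers and two data bytes (lo/hi) are read per address.
+ * Returns the dumped size in dwords.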
*/ +static u32 qed_grc_dump_phy(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + u32 offset = 0, tbus_lo_offset, tbus_hi_offset; + char mem_name[32]; + u8 phy_id; + + for (phy_id = 0; phy_id < ARRAY_SIZE(s_phy_defs); phy_id++) { + u32 addr_lo_addr, addr_hi_addr, data_lo_addr, data_hi_addr; + struct phy_defs *phy_defs; + u8 *bytes_buf; + + phy_defs = &s_phy_defs[phy_id]; + addr_lo_addr = phy_defs->base_addr + + phy_defs->tbus_addr_lo_addr; + addr_hi_addr = phy_defs->base_addr + + phy_defs->tbus_addr_hi_addr; + data_lo_addr = phy_defs->base_addr + + phy_defs->tbus_data_lo_addr; + data_hi_addr = phy_defs->base_addr + + phy_defs->tbus_data_hi_addr; + + if (snprintf(mem_name, sizeof(mem_name), "tbus_%s", + phy_defs->phy_name) < 0) + DP_NOTICE(p_hwfn, + "Unexpected debug error: invalid PHY memory name\n"); + + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + mem_name, + 0, + PHY_DUMP_SIZE_DWORDS, + 16, true, mem_name, false, 0); + + if (!dump) { + offset += PHY_DUMP_SIZE_DWORDS; + continue; + } + + bytes_buf = (u8 *)(dump_buf + offset); + for (tbus_hi_offset = 0; + tbus_hi_offset < (NUM_PHY_TBUS_ADDRESSES >> 8); + tbus_hi_offset++) { + qed_wr(p_hwfn, p_ptt, addr_hi_addr, tbus_hi_offset); + for (tbus_lo_offset = 0; tbus_lo_offset < 256; + tbus_lo_offset++) { + qed_wr(p_hwfn, + p_ptt, addr_lo_addr, tbus_lo_offset); + *(bytes_buf++) = (u8)qed_rd(p_hwfn, + p_ptt, + data_lo_addr); + *(bytes_buf++) = (u8)qed_rd(p_hwfn, + p_ptt, + data_hi_addr); + } + } + + offset += PHY_DUMP_SIZE_DWORDS; + } + + return offset; +} + +static void qed_config_dbg_line(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum block_id block_id, + u8 line_id, + u8 enable_mask, + u8 right_shift, + u8 force_valid_mask, u8 force_frame_mask) +{ + struct block_defs *block = s_block_defs[block_id]; + + qed_wr(p_hwfn, p_ptt, block->dbg_select_addr, line_id); + qed_wr(p_hwfn, p_ptt, block->dbg_enable_addr, enable_mask); + qed_wr(p_hwfn, p_ptt, block->dbg_shift_addr, right_shift); + qed_wr(p_hwfn, p_ptt, block->dbg_force_valid_addr, force_valid_mask); + qed_wr(p_hwfn, p_ptt, block->dbg_force_frame_addr, force_frame_mask); +} + +/* Dumps Static Debug data. Returns the dumped size in dwords. 
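+ * All block debug outputs are disabled first; then, for every block that has
+ * a debug bus client on this chip, each static debug line is selected and
+ * read through DBG_REG_CALENDAR_OUT_DATA. Blocks in reset are dumped as
+ * zeros, and nothing is dumped while a debug bus recording is in progress.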
*/ +static u32 qed_grc_dump_static_debug(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 block_id, line_id, offset = 0; + + /* Don't dump static debug if a debug bus recording is in progress */ + if (dump && qed_rd(p_hwfn, p_ptt, DBG_REG_DBG_BLOCK_ON)) + return 0; + + if (dump) { + /* Disable all blocks debug output */ + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + struct block_defs *block = s_block_defs[block_id]; + + if (block->dbg_client_id[dev_data->chip_id] != + MAX_DBG_BUS_CLIENTS) + qed_wr(p_hwfn, p_ptt, block->dbg_enable_addr, + 0); + } + + qed_bus_reset_dbg_block(p_hwfn, p_ptt); + qed_bus_set_framing_mode(p_hwfn, + p_ptt, DBG_BUS_FRAME_MODE_8HW_0ST); + qed_wr(p_hwfn, + p_ptt, DBG_REG_DEBUG_TARGET, DBG_BUS_TARGET_ID_INT_BUF); + qed_wr(p_hwfn, p_ptt, DBG_REG_FULL_MODE, 1); + qed_bus_enable_dbg_block(p_hwfn, p_ptt, true); + } + + /* Dump all static debug lines for each relevant block */ + for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) { + struct block_defs *block = s_block_defs[block_id]; + struct dbg_bus_block *block_desc; + u32 block_dwords, addr, len; + u8 dbg_client_id; + + if (block->dbg_client_id[dev_data->chip_id] == + MAX_DBG_BUS_CLIENTS) + continue; + + block_desc = get_dbg_bus_block_desc(p_hwfn, + (enum block_id)block_id); + block_dwords = NUM_DBG_LINES(block_desc) * + STATIC_DEBUG_LINE_DWORDS; + + /* Dump static section params */ + offset += qed_grc_dump_mem_hdr(p_hwfn, + dump_buf + offset, + dump, + block->name, + 0, + block_dwords, + 32, false, "STATIC", false, 0); + + if (!dump) { + offset += block_dwords; + continue; + } + + /* If all lines are invalid - dump zeros */ + if (dev_data->block_in_reset[block_id]) { + memset(dump_buf + offset, 0, + DWORDS_TO_BYTES(block_dwords)); + offset += block_dwords; + continue; + } + + /* Enable block's client */ + dbg_client_id = block->dbg_client_id[dev_data->chip_id]; + qed_bus_enable_clients(p_hwfn, + p_ptt, + BIT(dbg_client_id)); + + addr = BYTES_TO_DWORDS(DBG_REG_CALENDAR_OUT_DATA); + len = STATIC_DEBUG_LINE_DWORDS; + for (line_id = 0; line_id < (u32)NUM_DBG_LINES(block_desc); + line_id++) { + /* Configure debug line ID */ + qed_config_dbg_line(p_hwfn, + p_ptt, + (enum block_id)block_id, + (u8)line_id, 0xf, 0, 0, 0); + + /* Read debug line info */ + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + len, + true); + } + + /* Disable block's client and debug output */ + qed_bus_enable_clients(p_hwfn, p_ptt, 0); + qed_wr(p_hwfn, p_ptt, block->dbg_enable_addr, 0); + } + + if (dump) { + qed_bus_enable_dbg_block(p_hwfn, p_ptt, false); + qed_bus_enable_clients(p_hwfn, p_ptt, 0); + } + + return offset; +} + +/* Performs GRC Dump to the specified buffer. + * Returns the dumped size in dwords. 
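+ * The dump is built section by section: global params, reset registers,
+ * block registers, memories, MCP, contexts, RSS, Big RAM, IORs, VFC, PHY tbus
+ * and static debug data, followed by the last section. Storms are stalled
+ * while IOR/VFC data is read, and parities are masked via the MFW (when MCP
+ * access is allowed) for the duration of the dump.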
+ */ +static enum dbg_status qed_grc_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + bool parities_masked = false; + u8 i, port_mode = 0; + u32 offset = 0; + + *num_dumped_dwords = 0; + + if (dump) { + /* Find port mode */ + switch (qed_rd(p_hwfn, p_ptt, MISC_REG_PORT_MODE)) { + case 0: + port_mode = 1; + break; + case 1: + port_mode = 2; + break; + case 2: + port_mode = 4; + break; + } + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + } + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 4); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "grc-dump"); + offset += qed_dump_num_param(dump_buf + offset, + dump, + "num-lcids", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LCIDS)); + offset += qed_dump_num_param(dump_buf + offset, + dump, + "num-ltids", + qed_grc_get_param(p_hwfn, + DBG_GRC_PARAM_NUM_LTIDS)); + offset += qed_dump_num_param(dump_buf + offset, + dump, "num-ports", port_mode); + + /* Dump reset registers (dumped before taking blocks out of reset ) */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_REGS)) + offset += qed_grc_dump_reset_regs(p_hwfn, + p_ptt, + dump_buf + offset, dump); + + /* Take all blocks out of reset (using reset registers) */ + if (dump) { + qed_grc_unreset_blocks(p_hwfn, p_ptt); + qed_update_blocks_reset_state(p_hwfn, p_ptt); + } + + /* Disable all parities using MFW command */ + if (dump && + !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_MCP)) { + parities_masked = !qed_mcp_mask_parities(p_hwfn, p_ptt, 1); + if (!parities_masked) { + DP_NOTICE(p_hwfn, + "Failed to mask parities using MFW\n"); + if (qed_grc_get_param + (p_hwfn, DBG_GRC_PARAM_PARITY_SAFE)) + return DBG_STATUS_MCP_COULD_NOT_MASK_PRTY; + } + } + + /* Dump modified registers (dumped before modifying them) */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_REGS)) + offset += qed_grc_dump_modified_regs(p_hwfn, + p_ptt, + dump_buf + offset, dump); + + /* Stall storms */ + if (dump && + (qed_grc_is_included(p_hwfn, + DBG_GRC_PARAM_DUMP_IOR) || + qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_VFC))) + qed_grc_stall_storms(p_hwfn, p_ptt, true); + + /* Dump all regs */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_REGS)) { + bool block_enable[MAX_BLOCK_ID]; + + /* Dump all blocks except MCP */ + for (i = 0; i < MAX_BLOCK_ID; i++) + block_enable[i] = true; + block_enable[BLOCK_MCP] = false; + offset += qed_grc_dump_registers(p_hwfn, + p_ptt, + dump_buf + + offset, + dump, + block_enable, NULL, NULL); + + /* Dump special registers */ + offset += qed_grc_dump_special_regs(p_hwfn, + p_ptt, + dump_buf + offset, dump); + } + + /* Dump memories */ + offset += qed_grc_dump_memories(p_hwfn, p_ptt, dump_buf + offset, dump); + + /* Dump MCP */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_MCP)) + offset += qed_grc_dump_mcp(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump context */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_CM_CTX)) + offset += qed_grc_dump_ctx(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump RSS memories */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_RSS)) + offset += qed_grc_dump_rss(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump Big RAM */ + for (i = 0; i < NUM_BIG_RAM_TYPES; i++) + if (qed_grc_is_included(p_hwfn, s_big_ram_defs[i].grc_param)) + offset += qed_grc_dump_big_ram(p_hwfn, + p_ptt, + 
dump_buf + offset, + dump, i); + + /* Dump IORs */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_IOR)) + offset += qed_grc_dump_iors(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump VFC */ + if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_VFC)) + offset += qed_grc_dump_vfc(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump PHY tbus */ + if (qed_grc_is_included(p_hwfn, + DBG_GRC_PARAM_DUMP_PHY) && dev_data->chip_id == + CHIP_K2 && dev_data->platform_id == PLATFORM_ASIC) + offset += qed_grc_dump_phy(p_hwfn, + p_ptt, dump_buf + offset, dump); + + /* Dump static debug data */ + if (qed_grc_is_included(p_hwfn, + DBG_GRC_PARAM_DUMP_STATIC) && + dev_data->bus.state == DBG_BUS_STATE_IDLE) + offset += qed_grc_dump_static_debug(p_hwfn, + p_ptt, + dump_buf + offset, dump); + + /* Dump last section */ + offset += qed_dump_last_section(dump_buf, offset, dump); + + if (dump) { + /* Unstall storms */ + if (qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_UNSTALL)) + qed_grc_stall_storms(p_hwfn, p_ptt, false); + + /* Clear parity status */ + qed_grc_clear_all_prty(p_hwfn, p_ptt); + + /* Enable all parities using MFW command */ + if (parities_masked) + qed_mcp_mask_parities(p_hwfn, p_ptt, 0); + } + + *num_dumped_dwords = offset; + + return DBG_STATUS_OK; +} + +/* Writes the specified failing Idle Check rule to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_idle_chk_dump_failure(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 * + dump_buf, + bool dump, + u16 rule_id, + const struct dbg_idle_chk_rule *rule, + u16 fail_entry_id, u32 *cond_reg_values) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + const struct dbg_idle_chk_cond_reg *cond_regs; + const struct dbg_idle_chk_info_reg *info_regs; + u32 i, next_reg_offset = 0, offset = 0; + struct dbg_idle_chk_result_hdr *hdr; + const union dbg_idle_chk_reg *regs; + u8 reg_id; + + hdr = (struct dbg_idle_chk_result_hdr *)dump_buf; + regs = &((const union dbg_idle_chk_reg *) + s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_REGS].ptr)[rule->reg_offset]; + cond_regs = &regs[0].cond_reg; + info_regs = &regs[rule->num_cond_regs].info_reg; + + /* Dump rule data */ + if (dump) { + memset(hdr, 0, sizeof(*hdr)); + hdr->rule_id = rule_id; + hdr->mem_entry_id = fail_entry_id; + hdr->severity = rule->severity; + hdr->num_dumped_cond_regs = rule->num_cond_regs; + } + + offset += IDLE_CHK_RESULT_HDR_DWORDS; + + /* Dump condition register values */ + for (reg_id = 0; reg_id < rule->num_cond_regs; reg_id++) { + const struct dbg_idle_chk_cond_reg *reg = &cond_regs[reg_id]; + struct dbg_idle_chk_result_reg_hdr *reg_hdr; + + reg_hdr = (struct dbg_idle_chk_result_reg_hdr *) + (dump_buf + offset); + + /* Write register header */ + if (!dump) { + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS + + reg->entry_size; + continue; + } + + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS; + memset(reg_hdr, 0, sizeof(*reg_hdr)); + reg_hdr->start_entry = reg->start_entry; + reg_hdr->size = reg->entry_size; + SET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM, + reg->num_entries > 1 || reg->start_entry > 0 ? 
1 : 0); + SET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID, reg_id); + + /* Write register values */ + for (i = 0; i < reg_hdr->size; i++, next_reg_offset++, offset++) + dump_buf[offset] = cond_reg_values[next_reg_offset]; + } + + /* Dump info register values */ + for (reg_id = 0; reg_id < rule->num_info_regs; reg_id++) { + const struct dbg_idle_chk_info_reg *reg = &info_regs[reg_id]; + u32 block_id; + + /* Check if register's block is in reset */ + if (!dump) { + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS + reg->size; + continue; + } + + block_id = GET_FIELD(reg->data, DBG_IDLE_CHK_INFO_REG_BLOCK_ID); + if (block_id >= MAX_BLOCK_ID) { + DP_NOTICE(p_hwfn, "Invalid block_id\n"); + return 0; + } + + if (!dev_data->block_in_reset[block_id]) { + struct dbg_idle_chk_result_reg_hdr *reg_hdr; + bool wide_bus, eval_mode, mode_match = true; + u16 modes_buf_offset; + u32 addr; + + reg_hdr = (struct dbg_idle_chk_result_reg_hdr *) + (dump_buf + offset); + + /* Check mode */ + eval_mode = GET_FIELD(reg->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + if (eval_mode) { + modes_buf_offset = + GET_FIELD(reg->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + mode_match = + qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (!mode_match) + continue; + + addr = GET_FIELD(reg->data, + DBG_IDLE_CHK_INFO_REG_ADDRESS); + wide_bus = GET_FIELD(reg->data, + DBG_IDLE_CHK_INFO_REG_WIDE_BUS); + + /* Write register header */ + offset += IDLE_CHK_RESULT_REG_HDR_DWORDS; + hdr->num_dumped_info_regs++; + memset(reg_hdr, 0, sizeof(*reg_hdr)); + reg_hdr->size = reg->size; + SET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID, + rule->num_cond_regs + reg_id); + + /* Write register values */ + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + reg->size, wide_bus); + } + } + + return offset; +} + +/* Dumps idle check rule entries. Returns the dumped size in dwords. */ +static u32 +qed_idle_chk_dump_rule_entries(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump, + const struct dbg_idle_chk_rule *input_rules, + u32 num_input_rules, u32 *num_failing_rules) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 cond_reg_values[IDLE_CHK_MAX_ENTRIES_SIZE]; + u32 i, offset = 0; + u16 entry_id; + u8 reg_id; + + *num_failing_rules = 0; + + for (i = 0; i < num_input_rules; i++) { + const struct dbg_idle_chk_cond_reg *cond_regs; + const struct dbg_idle_chk_rule *rule; + const union dbg_idle_chk_reg *regs; + u16 num_reg_entries = 1; + bool check_rule = true; + const u32 *imm_values; + + rule = &input_rules[i]; + regs = &((const union dbg_idle_chk_reg *) + s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_REGS].ptr) + [rule->reg_offset]; + cond_regs = &regs[0].cond_reg; + imm_values = &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_IMMS].ptr + [rule->imm_offset]; + + /* Check if all condition register blocks are out of reset, and + * find maximal number of entries (all condition registers that + * are memories must have the same size, which is > 1). 
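+ * If any of these blocks is in reset, the rule is skipped. Otherwise, each
+ * entry of every condition register is read, and the rule's condition
+ * function decides whether that entry is dumped as a failure.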
+ */ + for (reg_id = 0; reg_id < rule->num_cond_regs && check_rule; + reg_id++) { + u32 block_id = + GET_FIELD(cond_regs[reg_id].data, + DBG_IDLE_CHK_COND_REG_BLOCK_ID); + + if (block_id >= MAX_BLOCK_ID) { + DP_NOTICE(p_hwfn, "Invalid block_id\n"); + return 0; + } + + check_rule = !dev_data->block_in_reset[block_id]; + if (cond_regs[reg_id].num_entries > num_reg_entries) + num_reg_entries = cond_regs[reg_id].num_entries; + } + + if (!check_rule && dump) + continue; + + if (!dump) { + u32 entry_dump_size = + qed_idle_chk_dump_failure(p_hwfn, + p_ptt, + dump_buf + offset, + false, + rule->rule_id, + rule, + 0, + NULL); + + offset += num_reg_entries * entry_dump_size; + (*num_failing_rules) += num_reg_entries; + continue; + } + + /* Go over all register entries (number of entries is the same + * for all condition registers). + */ + for (entry_id = 0; entry_id < num_reg_entries; entry_id++) { + u32 next_reg_offset = 0; + + /* Read current entry of all condition registers */ + for (reg_id = 0; reg_id < rule->num_cond_regs; + reg_id++) { + const struct dbg_idle_chk_cond_reg *reg = + &cond_regs[reg_id]; + u32 padded_entry_size, addr; + bool wide_bus; + + /* Find GRC address (if it's a memory, the + * address of the specific entry is calculated). + */ + addr = GET_FIELD(reg->data, + DBG_IDLE_CHK_COND_REG_ADDRESS); + wide_bus = + GET_FIELD(reg->data, + DBG_IDLE_CHK_COND_REG_WIDE_BUS); + if (reg->num_entries > 1 || + reg->start_entry > 0) { + padded_entry_size = + reg->entry_size > 1 ? + roundup_pow_of_two(reg->entry_size) : + 1; + addr += (reg->start_entry + entry_id) * + padded_entry_size; + } + + /* Read registers */ + if (next_reg_offset + reg->entry_size >= + IDLE_CHK_MAX_ENTRIES_SIZE) { + DP_NOTICE(p_hwfn, + "idle check registers entry is too large\n"); + return 0; + } + + next_reg_offset += + qed_grc_dump_addr_range(p_hwfn, p_ptt, + cond_reg_values + + next_reg_offset, + dump, addr, + reg->entry_size, + wide_bus); + } + + /* Call rule condition function. + * If returns true, it's a failure. + */ + if ((*cond_arr[rule->cond_id]) (cond_reg_values, + imm_values)) { + offset += qed_idle_chk_dump_failure(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + rule->rule_id, + rule, + entry_id, + cond_reg_values); + (*num_failing_rules)++; + } + } + } + + return offset; +} + +/* Performs Idle Check Dump to the specified buffer. + * Returns the dumped size in dwords. 
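+ * The "num_rules" parameter of the idle_chk section is dumped as 0 first and
+ * overwritten with the actual number of failing rules once all rules have
+ * been evaluated.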
+ */ +static u32 qed_idle_chk_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + u32 num_failing_rules_offset, offset = 0, input_offset = 0; + u32 num_failing_rules = 0; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "idle-chk"); + + /* Dump idle check section header with a single parameter */ + offset += qed_dump_section_hdr(dump_buf + offset, dump, "idle_chk", 1); + num_failing_rules_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "num_rules", 0); + + while (input_offset < + s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES].size_in_dwords) { + const struct dbg_idle_chk_cond_hdr *cond_hdr = + (const struct dbg_idle_chk_cond_hdr *) + &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES].ptr + [input_offset++]; + bool eval_mode, mode_match = true; + u32 curr_failing_rules; + u16 modes_buf_offset; + + /* Check mode */ + eval_mode = GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + if (eval_mode) { + modes_buf_offset = + GET_FIELD(cond_hdr->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + mode_match = qed_is_mode_match(p_hwfn, + &modes_buf_offset); + } + + if (mode_match) { + offset += + qed_idle_chk_dump_rule_entries(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + (const struct dbg_idle_chk_rule *) + &s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES]. + ptr[input_offset], + cond_hdr->data_size / IDLE_CHK_RULE_SIZE_DWORDS, + &curr_failing_rules); + num_failing_rules += curr_failing_rules; + } + + input_offset += cond_hdr->data_size; + } + + /* Overwrite num_rules parameter */ + if (dump) + qed_dump_num_param(dump_buf + num_failing_rules_offset, + dump, "num_rules", num_failing_rules); + + /* Dump last section */ + offset += qed_dump_last_section(dump_buf, offset, dump); + + return offset; +} + +/* Finds the meta data image in NVRAM */ +static enum dbg_status qed_find_nvram_image(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 image_type, + u32 *nvram_offset_bytes, + u32 *nvram_size_bytes) +{ + u32 ret_mcp_resp, ret_mcp_param, ret_txn_size; + struct mcp_file_att file_att; + int nvm_result; + + /* Call NVRAM get file command */ + nvm_result = qed_mcp_nvm_rd_cmd(p_hwfn, + p_ptt, + DRV_MSG_CODE_NVM_GET_FILE_ATT, + image_type, + &ret_mcp_resp, + &ret_mcp_param, + &ret_txn_size, (u32 *)&file_att); + + /* Check response */ + if (nvm_result || + (ret_mcp_resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_NVM_OK) + return DBG_STATUS_NVRAM_GET_IMAGE_FAILED; + + /* Update return values */ + *nvram_offset_bytes = file_att.nvm_start_addr; + *nvram_size_bytes = file_att.len; + + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "find_nvram_image: found NVRAM image of type %d in NVRAM offset %d bytes with size %d bytes\n", + image_type, *nvram_offset_bytes, *nvram_size_bytes); + + /* Check alignment */ + if (*nvram_size_bytes & 0x3) + return DBG_STATUS_NON_ALIGNED_NVRAM_IMAGE; + + return DBG_STATUS_OK; +} + +/* Reads data from NVRAM */ +static enum dbg_status qed_nvram_read(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 nvram_offset_bytes, + u32 nvram_size_bytes, u32 *ret_buf) +{ + u32 ret_mcp_resp, ret_mcp_param, ret_read_size, bytes_to_copy; + s32 bytes_left = nvram_size_bytes; + u32 read_offset = 0; + + DP_VERBOSE(p_hwfn, + QED_MSG_DEBUG, + "nvram_read: reading image of size %d bytes from NVRAM\n", + nvram_size_bytes); + + do { + bytes_to_copy = + (bytes_left > + MCP_DRV_NVM_BUF_LEN) ? 
MCP_DRV_NVM_BUF_LEN : bytes_left; + + /* Call NVRAM read command */ + if (qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, + DRV_MSG_CODE_NVM_READ_NVRAM, + (nvram_offset_bytes + + read_offset) | + (bytes_to_copy << + DRV_MB_PARAM_NVM_LEN_OFFSET), + &ret_mcp_resp, &ret_mcp_param, + &ret_read_size, + (u32 *)((u8 *)ret_buf + read_offset))) + return DBG_STATUS_NVRAM_READ_FAILED; + + /* Check response */ + if ((ret_mcp_resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_NVM_OK) + return DBG_STATUS_NVRAM_READ_FAILED; + + /* Update read offset */ + read_offset += ret_read_size; + bytes_left -= ret_read_size; + } while (bytes_left > 0); + + return DBG_STATUS_OK; +} + +/* Get info on the MCP Trace data in the scratchpad: + * - trace_data_grc_addr (OUT): trace data GRC address in bytes + * - trace_data_size (OUT): trace data size in bytes (without the header) + */ +static enum dbg_status qed_mcp_trace_get_data_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *trace_data_grc_addr, + u32 *trace_data_size) +{ + u32 spad_trace_offsize, signature; + + /* Read trace section offsize structure from MCP scratchpad */ + spad_trace_offsize = qed_rd(p_hwfn, p_ptt, MCP_SPAD_TRACE_OFFSIZE_ADDR); + + /* Extract trace section address from offsize (in scratchpad) */ + *trace_data_grc_addr = + MCP_REG_SCRATCH + SECTION_OFFSET(spad_trace_offsize); + + /* Read signature from MCP trace section */ + signature = qed_rd(p_hwfn, p_ptt, + *trace_data_grc_addr + + offsetof(struct mcp_trace, signature)); + + if (signature != MFW_TRACE_SIGNATURE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Read trace size from MCP trace section */ + *trace_data_size = qed_rd(p_hwfn, + p_ptt, + *trace_data_grc_addr + + offsetof(struct mcp_trace, size)); + + return DBG_STATUS_OK; +} + +/* Reads MCP trace meta data image from NVRAM + * - running_bundle_id (OUT): running bundle ID (invalid when loaded from file) + * - trace_meta_offset (OUT): trace meta offset in NVRAM in bytes (invalid when + * loaded from file). + * - trace_meta_size (OUT): size in bytes of the trace meta data. + */ +static enum dbg_status qed_mcp_trace_get_meta_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 trace_data_size_bytes, + u32 *running_bundle_id, + u32 *trace_meta_offset, + u32 *trace_meta_size) +{ + u32 spad_trace_offsize, nvram_image_type, running_mfw_addr; + + /* Read MCP trace section offsize structure from MCP scratchpad */ + spad_trace_offsize = qed_rd(p_hwfn, p_ptt, MCP_SPAD_TRACE_OFFSIZE_ADDR); + + /* Find running bundle ID */ + running_mfw_addr = + MCP_REG_SCRATCH + SECTION_OFFSET(spad_trace_offsize) + + QED_SECTION_SIZE(spad_trace_offsize) + trace_data_size_bytes; + *running_bundle_id = qed_rd(p_hwfn, p_ptt, running_mfw_addr); + if (*running_bundle_id > 1) + return DBG_STATUS_INVALID_NVRAM_BUNDLE; + + /* Find image in NVRAM */ + nvram_image_type = + (*running_bundle_id == + DIR_ID_1) ? 
NVM_TYPE_MFW_TRACE1 : NVM_TYPE_MFW_TRACE2; + return qed_find_nvram_image(p_hwfn, + p_ptt, + nvram_image_type, + trace_meta_offset, trace_meta_size); +} + +/* Reads the MCP Trace meta data from NVRAM into the specified buffer */ +static enum dbg_status qed_mcp_trace_read_meta(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 nvram_offset_in_bytes, + u32 size_in_bytes, u32 *buf) +{ + u8 modules_num, module_len, i, *byte_buf = (u8 *)buf; + enum dbg_status status; + u32 signature; + + /* Read meta data from NVRAM */ + status = qed_nvram_read(p_hwfn, + p_ptt, + nvram_offset_in_bytes, size_in_bytes, buf); + if (status != DBG_STATUS_OK) + return status; + + /* Extract and check first signature */ + signature = qed_read_unaligned_dword(byte_buf); + byte_buf += sizeof(signature); + if (signature != NVM_MAGIC_VALUE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Extract number of modules */ + modules_num = *(byte_buf++); + + /* Skip all modules */ + for (i = 0; i < modules_num; i++) { + module_len = *(byte_buf++); + byte_buf += module_len; + } + + /* Extract and check second signature */ + signature = qed_read_unaligned_dword(byte_buf); + byte_buf += sizeof(signature); + if (signature != NVM_MAGIC_VALUE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + return DBG_STATUS_OK; +} + +/* Dump MCP Trace */ +static enum dbg_status qed_mcp_trace_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 trace_data_grc_addr, trace_data_size_bytes, trace_data_size_dwords; + u32 trace_meta_size_dwords = 0, running_bundle_id, offset = 0; + u32 trace_meta_offset_bytes = 0, trace_meta_size_bytes = 0; + enum dbg_status status; + bool mcp_access; + int halted = 0; + + *num_dumped_dwords = 0; + + mcp_access = !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_MCP); + + /* Get trace data info */ + status = qed_mcp_trace_get_data_info(p_hwfn, + p_ptt, + &trace_data_grc_addr, + &trace_data_size_bytes); + if (status != DBG_STATUS_OK) + return status; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "mcp-trace"); + + /* Halt MCP while reading from scratchpad so the read data will be + * consistent. if halt fails, MCP trace is taken anyway, with a small + * risk that it may be corrupt. + */ + if (dump && mcp_access) { + halted = !qed_mcp_halt(p_hwfn, p_ptt); + if (!halted) + DP_NOTICE(p_hwfn, "MCP halt failed!\n"); + } + + /* Find trace data size */ + trace_data_size_dwords = + DIV_ROUND_UP(trace_data_size_bytes + sizeof(struct mcp_trace), + BYTES_IN_DWORD); + + /* Dump trace data section header and param */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "mcp_trace_data", 1); + offset += qed_dump_num_param(dump_buf + offset, + dump, "size", trace_data_size_dwords); + + /* Read trace data from scratchpad into dump buffer */ + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + BYTES_TO_DWORDS(trace_data_grc_addr), + trace_data_size_dwords, false); + + /* Resume MCP (only if halt succeeded) */ + if (halted && qed_mcp_resume(p_hwfn, p_ptt)) + DP_NOTICE(p_hwfn, "Failed to resume MCP after halt!\n"); + + /* Dump trace meta section header */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "mcp_trace_meta", 1); + + /* If MCP Trace meta size parameter was set, use it. + * Otherwise, read trace meta. + * trace_meta_size_bytes is dword-aligned. 
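+ * During an actual dump (with MCP access), the meta offset and size are
+ * re-read from the running MFW bundle in NVRAM; if that lookup fails, a zero
+ * meta size is dumped and the meta image is skipped.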
+ */ + trace_meta_size_bytes = + qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_MCP_TRACE_META_SIZE); + if ((!trace_meta_size_bytes || dump) && mcp_access) { + status = qed_mcp_trace_get_meta_info(p_hwfn, + p_ptt, + trace_data_size_bytes, + &running_bundle_id, + &trace_meta_offset_bytes, + &trace_meta_size_bytes); + if (status == DBG_STATUS_OK) + trace_meta_size_dwords = + BYTES_TO_DWORDS(trace_meta_size_bytes); + } + + /* Dump trace meta size param */ + offset += qed_dump_num_param(dump_buf + offset, + dump, "size", trace_meta_size_dwords); + + /* Read trace meta image into dump buffer */ + if (dump && trace_meta_size_dwords) + status = qed_mcp_trace_read_meta(p_hwfn, + p_ptt, + trace_meta_offset_bytes, + trace_meta_size_bytes, + dump_buf + offset); + if (status == DBG_STATUS_OK) + offset += trace_meta_size_dwords; + + /* Dump last section */ + offset += qed_dump_last_section(dump_buf, offset, dump); + + *num_dumped_dwords = offset; + + /* If no mcp access, indicate that the dump doesn't contain the meta + * data from NVRAM. + */ + return mcp_access ? status : DBG_STATUS_NVRAM_GET_IMAGE_FAILED; +} + +/* Dump GRC FIFO */ +static enum dbg_status qed_reg_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 dwords_read, size_param_offset, offset = 0, addr, len; + bool fifo_has_data; + + *num_dumped_dwords = 0; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "reg-fifo"); + + /* Dump fifo data section header and param. The size param is 0 for + * now, and is overwritten after reading the FIFO. + */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "reg_fifo_data", 1); + size_param_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "size", 0); + + if (!dump) { + /* FIFO max size is REG_FIFO_DEPTH_DWORDS. There is no way to + * test how much data is available, except for reading it. + */ + offset += REG_FIFO_DEPTH_DWORDS; + goto out; + } + + fifo_has_data = qed_rd(p_hwfn, p_ptt, + GRC_REG_TRACE_FIFO_VALID_DATA) > 0; + + /* Pull available data from fifo. Use DMAE since this is widebus memory + * and must be accessed atomically. Test for dwords_read not passing + * buffer size since more entries could be added to the buffer as we are + * emptying it. 
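+ * Elements are read REG_FIFO_ELEMENT_DWORDS at a time until the FIFO reports
+ * no more valid data or REG_FIFO_DEPTH_DWORDS dwords have been read; the
+ * section "size" parameter is then overwritten with the actual count.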
+ */ + addr = BYTES_TO_DWORDS(GRC_REG_TRACE_FIFO); + len = REG_FIFO_ELEMENT_DWORDS; + for (dwords_read = 0; + fifo_has_data && dwords_read < REG_FIFO_DEPTH_DWORDS; + dwords_read += REG_FIFO_ELEMENT_DWORDS) { + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + true, + addr, + len, + true); + fifo_has_data = qed_rd(p_hwfn, p_ptt, + GRC_REG_TRACE_FIFO_VALID_DATA) > 0; + } + + qed_dump_num_param(dump_buf + size_param_offset, dump, "size", + dwords_read); +out: + /* Dump last section */ + offset += qed_dump_last_section(dump_buf, offset, dump); + + *num_dumped_dwords = offset; + + return DBG_STATUS_OK; +} + +/* Dump IGU FIFO */ +static enum dbg_status qed_igu_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, u32 *num_dumped_dwords) +{ + u32 dwords_read, size_param_offset, offset = 0, addr, len; + bool fifo_has_data; + + *num_dumped_dwords = 0; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "igu-fifo"); + + /* Dump fifo data section header and param. The size param is 0 for + * now, and is overwritten after reading the FIFO. + */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "igu_fifo_data", 1); + size_param_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "size", 0); + + if (!dump) { + /* FIFO max size is IGU_FIFO_DEPTH_DWORDS. There is no way to + * test how much data is available, except for reading it. + */ + offset += IGU_FIFO_DEPTH_DWORDS; + goto out; + } + + fifo_has_data = qed_rd(p_hwfn, p_ptt, + IGU_REG_ERROR_HANDLING_DATA_VALID) > 0; + + /* Pull available data from fifo. Use DMAE since this is widebus memory + * and must be accessed atomically. Test for dwords_read not passing + * buffer size since more entries could be added to the buffer as we are + * emptying it. + */ + addr = BYTES_TO_DWORDS(IGU_REG_ERROR_HANDLING_MEMORY); + len = IGU_FIFO_ELEMENT_DWORDS; + for (dwords_read = 0; + fifo_has_data && dwords_read < IGU_FIFO_DEPTH_DWORDS; + dwords_read += IGU_FIFO_ELEMENT_DWORDS) { + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + true, + addr, + len, + true); + fifo_has_data = qed_rd(p_hwfn, p_ptt, + IGU_REG_ERROR_HANDLING_DATA_VALID) > 0; + } + + qed_dump_num_param(dump_buf + size_param_offset, dump, "size", + dwords_read); +out: + /* Dump last section */ + offset += qed_dump_last_section(dump_buf, offset, dump); + + *num_dumped_dwords = offset; + + return DBG_STATUS_OK; +} + +/* Protection Override dump */ +static enum dbg_status qed_protection_override_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + bool dump, + u32 *num_dumped_dwords) +{ + u32 size_param_offset, override_window_dwords, offset = 0, addr; + + *num_dumped_dwords = 0; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "protection-override"); + + /* Dump data section header and param. The size param is 0 for now, + * and is overwritten after reading the data. 
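+ * The number of valid override window elements is read from
+ * GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW, and each element spans
+ * PROTECTION_OVERRIDE_ELEMENT_DWORDS dwords.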
+ */ + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "protection_override_data", 1); + size_param_offset = offset; + offset += qed_dump_num_param(dump_buf + offset, dump, "size", 0); + + if (!dump) { + offset += PROTECTION_OVERRIDE_DEPTH_DWORDS; + goto out; + } + + /* Add override window info to buffer */ + override_window_dwords = + qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * + PROTECTION_OVERRIDE_ELEMENT_DWORDS; + addr = BYTES_TO_DWORDS(GRC_REG_PROTECTION_OVERRIDE_WINDOW); + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + true, + addr, + override_window_dwords, + true); + qed_dump_num_param(dump_buf + size_param_offset, dump, "size", + override_window_dwords); +out: + /* Dump last section */ + offset += qed_dump_last_section(dump_buf, offset, dump); + + *num_dumped_dwords = offset; + + return DBG_STATUS_OK; +} + +/* Performs FW Asserts Dump to the specified buffer. + * Returns the dumped size in dwords. + */ +static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + struct fw_asserts_ram_section *asserts; + char storm_letter_str[2] = "?"; + struct fw_info fw_info; + u32 offset = 0; + u8 storm_id; + + /* Dump global params */ + offset += qed_dump_common_global_params(p_hwfn, + p_ptt, + dump_buf + offset, dump, 1); + offset += qed_dump_str_param(dump_buf + offset, + dump, "dump-type", "fw-asserts"); + + /* Find Storm dump size */ + for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + u32 fw_asserts_section_addr, next_list_idx_addr, next_list_idx; + struct storm_defs *storm = &s_storm_defs[storm_id]; + u32 last_list_idx, addr; + + if (dev_data->block_in_reset[storm->block_id]) + continue; + + /* Read FW info for the current Storm */ + qed_read_fw_info(p_hwfn, p_ptt, storm_id, &fw_info); + + asserts = &fw_info.fw_asserts_section; + + /* Dump FW Asserts section header and params */ + storm_letter_str[0] = storm->letter; + offset += qed_dump_section_hdr(dump_buf + offset, + dump, "fw_asserts", 2); + offset += qed_dump_str_param(dump_buf + offset, + dump, "storm", storm_letter_str); + offset += qed_dump_num_param(dump_buf + offset, + dump, + "size", + asserts->list_element_dword_size); + + /* Read and dump FW Asserts data */ + if (!dump) { + offset += asserts->list_element_dword_size; + continue; + } + + fw_asserts_section_addr = storm->sem_fast_mem_addr + + SEM_FAST_REG_INT_RAM + + RAM_LINES_TO_BYTES(asserts->section_ram_line_offset); + next_list_idx_addr = fw_asserts_section_addr + + DWORDS_TO_BYTES(asserts->list_next_index_dword_offset); + next_list_idx = qed_rd(p_hwfn, p_ptt, next_list_idx_addr); + last_list_idx = (next_list_idx > 0 ? 
+ next_list_idx : + asserts->list_num_elements) - 1; + addr = BYTES_TO_DWORDS(fw_asserts_section_addr) + + asserts->list_dword_offset + + last_list_idx * asserts->list_element_dword_size; + offset += + qed_grc_dump_addr_range(p_hwfn, p_ptt, + dump_buf + offset, + dump, addr, + asserts->list_element_dword_size, + false); + } + + /* Dump last section */ + offset += qed_dump_last_section(dump_buf, offset, dump); + + return offset; +} + +/***************************** Public Functions *******************************/ + +enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr) +{ + struct bin_buffer_hdr *buf_array = (struct bin_buffer_hdr *)bin_ptr; + u8 buf_id; + + /* convert binary data to debug arrays */ + for (buf_id = 0; buf_id < MAX_BIN_DBG_BUFFER_TYPE; buf_id++) { + s_dbg_arrays[buf_id].ptr = + (u32 *)(bin_ptr + buf_array[buf_id].offset); + s_dbg_arrays[buf_id].size_in_dwords = + BYTES_TO_DWORDS(buf_array[buf_id].length); + } + + return DBG_STATUS_OK; +} + +/* Assign default GRC param values */ +void qed_dbg_grc_set_params_default(struct qed_hwfn *p_hwfn) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 i; + + for (i = 0; i < MAX_DBG_GRC_PARAMS; i++) + if (!s_grc_param_defs[i].is_persistent) + dev_data->grc.param_val[i] = + s_grc_param_defs[i].default_val[dev_data->chip_id]; +} + +enum dbg_status qed_dbg_grc_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + + if (status != DBG_STATUS_OK) + return status; + + if (!s_dbg_arrays[BIN_BUF_DBG_MODE_TREE].ptr || + !s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr || + !s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr || + !s_dbg_arrays[BIN_BUF_DBG_ATTN_BLOCKS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_ATTN_REGS].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + + return qed_grc_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_grc_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + *num_dumped_dwords = 0; + + status = qed_dbg_grc_get_dump_buf_size(p_hwfn, + p_ptt, + &needed_buf_size_in_dwords); + if (status != DBG_STATUS_OK) + return status; + + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* GRC Dump */ + status = qed_grc_dump(p_hwfn, p_ptt, dump_buf, true, num_dumped_dwords); + + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return status; +} + +enum dbg_status qed_dbg_idle_chk_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + struct idle_chk_data *idle_chk; + enum dbg_status status; + + idle_chk = &dev_data->idle_chk; + *buf_size = 0; + + status = qed_dbg_dev_init(p_hwfn, p_ptt); + if (status != DBG_STATUS_OK) + return status; + + if (!s_dbg_arrays[BIN_BUF_DBG_MODE_TREE].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_REGS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_IMMS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_RULES].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + + if (!idle_chk->buf_size_set) { + idle_chk->buf_size = qed_idle_chk_dump(p_hwfn, + p_ptt, NULL, false); + idle_chk->buf_size_set = true; + } + + *buf_size = idle_chk->buf_size; + + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_idle_chk_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 
buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + *num_dumped_dwords = 0; + + status = qed_dbg_idle_chk_get_dump_buf_size(p_hwfn, + p_ptt, + &needed_buf_size_in_dwords); + if (status != DBG_STATUS_OK) + return status; + + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + /* Idle Check Dump */ + *num_dumped_dwords = qed_idle_chk_dump(p_hwfn, p_ptt, dump_buf, true); + + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_mcp_trace_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + + if (status != DBG_STATUS_OK) + return status; + + return qed_mcp_trace_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_mcp_trace_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + status = + qed_dbg_mcp_trace_get_dump_buf_size(p_hwfn, + p_ptt, + &needed_buf_size_in_dwords); + if (status != DBG_STATUS_OK && status != + DBG_STATUS_NVRAM_GET_IMAGE_FAILED) + return status; + + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + /* Perform dump */ + status = qed_mcp_trace_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); + + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return status; +} + +enum dbg_status qed_dbg_reg_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + + if (status != DBG_STATUS_OK) + return status; + + return qed_reg_fifo_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_reg_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + *num_dumped_dwords = 0; + + status = qed_dbg_reg_fifo_get_dump_buf_size(p_hwfn, + p_ptt, + &needed_buf_size_in_dwords); + if (status != DBG_STATUS_OK) + return status; + + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + status = qed_reg_fifo_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); + + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return status; +} + +enum dbg_status qed_dbg_igu_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + + if (status != DBG_STATUS_OK) + return status; + + return qed_igu_fifo_dump(p_hwfn, p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_igu_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords; + enum dbg_status status; + + *num_dumped_dwords = 0; + + status = qed_dbg_igu_fifo_get_dump_buf_size(p_hwfn, + p_ptt, + &needed_buf_size_in_dwords); 
+ if (status != DBG_STATUS_OK) + return status; + + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + status = qed_igu_fifo_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return status; +} + +enum dbg_status +qed_dbg_protection_override_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + + if (status != DBG_STATUS_OK) + return status; + + return qed_protection_override_dump(p_hwfn, + p_ptt, NULL, false, buf_size); +} + +enum dbg_status qed_dbg_protection_override_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords, *p_size = &needed_buf_size_in_dwords; + enum dbg_status status; + + *num_dumped_dwords = 0; + + status = + qed_dbg_protection_override_get_dump_buf_size(p_hwfn, + p_ptt, + p_size); + if (status != DBG_STATUS_OK) + return status; + + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + status = qed_protection_override_dump(p_hwfn, + p_ptt, + dump_buf, + true, num_dumped_dwords); + + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return status; +} + +enum dbg_status qed_dbg_fw_asserts_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + + *buf_size = 0; + + if (status != DBG_STATUS_OK) + return status; + + /* Update reset state */ + qed_update_blocks_reset_state(p_hwfn, p_ptt); + + *buf_size = qed_fw_asserts_dump(p_hwfn, p_ptt, NULL, false); + + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_fw_asserts_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords) +{ + u32 needed_buf_size_in_dwords, *p_size = &needed_buf_size_in_dwords; + enum dbg_status status; + + *num_dumped_dwords = 0; + + status = + qed_dbg_fw_asserts_get_dump_buf_size(p_hwfn, + p_ptt, + p_size); + if (status != DBG_STATUS_OK) + return status; + + if (buf_size_in_dwords < needed_buf_size_in_dwords) + return DBG_STATUS_DUMP_BUF_TOO_SMALL; + + *num_dumped_dwords = qed_fw_asserts_dump(p_hwfn, p_ptt, dump_buf, true); + + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return DBG_STATUS_OK; +} + +enum dbg_status qed_dbg_read_attn(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum block_id block_id, + enum dbg_attn_type attn_type, + bool clear_status, + struct dbg_attn_block_result *results) +{ + enum dbg_status status = qed_dbg_dev_init(p_hwfn, p_ptt); + u8 reg_idx, num_attn_regs, num_result_regs = 0; + const struct dbg_attn_reg *attn_reg_arr; + + if (status != DBG_STATUS_OK) + return status; + + if (!s_dbg_arrays[BIN_BUF_DBG_MODE_TREE].ptr || + !s_dbg_arrays[BIN_BUF_DBG_ATTN_BLOCKS].ptr || + !s_dbg_arrays[BIN_BUF_DBG_ATTN_REGS].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + + attn_reg_arr = qed_get_block_attn_regs(block_id, + attn_type, &num_attn_regs); + + for (reg_idx = 0; reg_idx < num_attn_regs; reg_idx++) { + const struct dbg_attn_reg *reg_data = &attn_reg_arr[reg_idx]; + struct dbg_attn_reg_result 
*reg_result; + u32 sts_addr, sts_val; + u16 modes_buf_offset; + bool eval_mode; + + /* Check mode */ + eval_mode = GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_EVAL_MODE) > 0; + modes_buf_offset = GET_FIELD(reg_data->mode.data, + DBG_MODE_HDR_MODES_BUF_OFFSET); + if (eval_mode && !qed_is_mode_match(p_hwfn, &modes_buf_offset)) + continue; + + /* Mode match - read attention status register */ + sts_addr = DWORDS_TO_BYTES(clear_status ? + reg_data->sts_clr_address : + GET_FIELD(reg_data->data, + DBG_ATTN_REG_STS_ADDRESS)); + sts_val = qed_rd(p_hwfn, p_ptt, sts_addr); + if (!sts_val) + continue; + + /* Non-zero attention status - add to results */ + reg_result = &results->reg_results[num_result_regs]; + SET_FIELD(reg_result->data, + DBG_ATTN_REG_RESULT_STS_ADDRESS, sts_addr); + SET_FIELD(reg_result->data, + DBG_ATTN_REG_RESULT_NUM_REG_ATTN, + GET_FIELD(reg_data->data, DBG_ATTN_REG_NUM_REG_ATTN)); + reg_result->block_attn_offset = reg_data->block_attn_offset; + reg_result->sts_val = sts_val; + reg_result->mask_val = qed_rd(p_hwfn, + p_ptt, + DWORDS_TO_BYTES + (reg_data->mask_address)); + num_result_regs++; + } + + results->block_id = (u8)block_id; + results->names_offset = + qed_get_block_attn_data(block_id, attn_type)->names_offset; + SET_FIELD(results->data, DBG_ATTN_BLOCK_RESULT_ATTN_TYPE, attn_type); + SET_FIELD(results->data, + DBG_ATTN_BLOCK_RESULT_NUM_REGS, num_result_regs); + + return DBG_STATUS_OK; +} + +/******************************* Data Types **********************************/ + +struct block_info { + const char *name; + enum block_id id; +}; + +struct mcp_trace_format { + u32 data; +#define MCP_TRACE_FORMAT_MODULE_MASK 0x0000ffff +#define MCP_TRACE_FORMAT_MODULE_SHIFT 0 +#define MCP_TRACE_FORMAT_LEVEL_MASK 0x00030000 +#define MCP_TRACE_FORMAT_LEVEL_SHIFT 16 +#define MCP_TRACE_FORMAT_P1_SIZE_MASK 0x000c0000 +#define MCP_TRACE_FORMAT_P1_SIZE_SHIFT 18 +#define MCP_TRACE_FORMAT_P2_SIZE_MASK 0x00300000 +#define MCP_TRACE_FORMAT_P2_SIZE_SHIFT 20 +#define MCP_TRACE_FORMAT_P3_SIZE_MASK 0x00c00000 +#define MCP_TRACE_FORMAT_P3_SIZE_SHIFT 22 +#define MCP_TRACE_FORMAT_LEN_MASK 0xff000000 +#define MCP_TRACE_FORMAT_LEN_SHIFT 24 + + char *format_str; +}; + +/* Meta data structure, generated by a perl script during MFW build. therefore, + * the structs mcp_trace_meta and mcp_trace_format are duplicated in the perl + * script. 
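Each mcp_trace_format entry packs the module index, the trace level, the sizes of up to three parameters and the length of the format string into its single data dword through the mask/shift pairs above. A hedged decode sketch (the helper name is an illustrative assumption; the parser later in this file open-codes the same (data & MASK) >> SHIFT pattern):

/* Illustrative decode of one mcp_trace_format descriptor */
static void example_decode_trace_format(const struct mcp_trace_format *fmt)
{
	u16 module = (fmt->data & MCP_TRACE_FORMAT_MODULE_MASK) >>
		     MCP_TRACE_FORMAT_MODULE_SHIFT;
	u8 level = (fmt->data & MCP_TRACE_FORMAT_LEVEL_MASK) >>
		   MCP_TRACE_FORMAT_LEVEL_SHIFT;
	u8 p1_size = (fmt->data & MCP_TRACE_FORMAT_P1_SIZE_MASK) >>
		     MCP_TRACE_FORMAT_P1_SIZE_SHIFT;
	u8 fmt_len = (fmt->data & MCP_TRACE_FORMAT_LEN_MASK) >>
		     MCP_TRACE_FORMAT_LEN_SHIFT;

	/* module and level index into the module/level name tables;
	 * p1_size is a 2-bit code (0..3), where 3 stands for 4 bytes.
	 */
	pr_debug("module %u level %u p1_size %u fmt_len %u: %s\n",
		 module, level, p1_size, fmt_len, fmt->format_str);
}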
+ */ +struct mcp_trace_meta { + u32 modules_num; + char **modules; + u32 formats_num; + struct mcp_trace_format *formats; +}; + +/* REG fifo element */ +struct reg_fifo_element { + u64 data; +#define REG_FIFO_ELEMENT_ADDRESS_SHIFT 0 +#define REG_FIFO_ELEMENT_ADDRESS_MASK 0x7fffff +#define REG_FIFO_ELEMENT_ACCESS_SHIFT 23 +#define REG_FIFO_ELEMENT_ACCESS_MASK 0x1 +#define REG_FIFO_ELEMENT_PF_SHIFT 24 +#define REG_FIFO_ELEMENT_PF_MASK 0xf +#define REG_FIFO_ELEMENT_VF_SHIFT 28 +#define REG_FIFO_ELEMENT_VF_MASK 0xff +#define REG_FIFO_ELEMENT_PORT_SHIFT 36 +#define REG_FIFO_ELEMENT_PORT_MASK 0x3 +#define REG_FIFO_ELEMENT_PRIVILEGE_SHIFT 38 +#define REG_FIFO_ELEMENT_PRIVILEGE_MASK 0x3 +#define REG_FIFO_ELEMENT_PROTECTION_SHIFT 40 +#define REG_FIFO_ELEMENT_PROTECTION_MASK 0x7 +#define REG_FIFO_ELEMENT_MASTER_SHIFT 43 +#define REG_FIFO_ELEMENT_MASTER_MASK 0xf +#define REG_FIFO_ELEMENT_ERROR_SHIFT 47 +#define REG_FIFO_ELEMENT_ERROR_MASK 0x1f +}; + +/* IGU fifo element */ +struct igu_fifo_element { + u32 dword0; +#define IGU_FIFO_ELEMENT_DWORD0_FID_SHIFT 0 +#define IGU_FIFO_ELEMENT_DWORD0_FID_MASK 0xff +#define IGU_FIFO_ELEMENT_DWORD0_IS_PF_SHIFT 8 +#define IGU_FIFO_ELEMENT_DWORD0_IS_PF_MASK 0x1 +#define IGU_FIFO_ELEMENT_DWORD0_SOURCE_SHIFT 9 +#define IGU_FIFO_ELEMENT_DWORD0_SOURCE_MASK 0xf +#define IGU_FIFO_ELEMENT_DWORD0_ERR_TYPE_SHIFT 13 +#define IGU_FIFO_ELEMENT_DWORD0_ERR_TYPE_MASK 0xf +#define IGU_FIFO_ELEMENT_DWORD0_CMD_ADDR_SHIFT 17 +#define IGU_FIFO_ELEMENT_DWORD0_CMD_ADDR_MASK 0x7fff + u32 dword1; + u32 dword2; +#define IGU_FIFO_ELEMENT_DWORD12_IS_WR_CMD_SHIFT 0 +#define IGU_FIFO_ELEMENT_DWORD12_IS_WR_CMD_MASK 0x1 +#define IGU_FIFO_ELEMENT_DWORD12_WR_DATA_SHIFT 1 +#define IGU_FIFO_ELEMENT_DWORD12_WR_DATA_MASK 0xffffffff + u32 reserved; +}; + +struct igu_fifo_wr_data { + u32 data; +#define IGU_FIFO_WR_DATA_PROD_CONS_SHIFT 0 +#define IGU_FIFO_WR_DATA_PROD_CONS_MASK 0xffffff +#define IGU_FIFO_WR_DATA_UPDATE_FLAG_SHIFT 24 +#define IGU_FIFO_WR_DATA_UPDATE_FLAG_MASK 0x1 +#define IGU_FIFO_WR_DATA_EN_DIS_INT_FOR_SB_SHIFT 25 +#define IGU_FIFO_WR_DATA_EN_DIS_INT_FOR_SB_MASK 0x3 +#define IGU_FIFO_WR_DATA_SEGMENT_SHIFT 27 +#define IGU_FIFO_WR_DATA_SEGMENT_MASK 0x1 +#define IGU_FIFO_WR_DATA_TIMER_MASK_SHIFT 28 +#define IGU_FIFO_WR_DATA_TIMER_MASK_MASK 0x1 +#define IGU_FIFO_WR_DATA_CMD_TYPE_SHIFT 31 +#define IGU_FIFO_WR_DATA_CMD_TYPE_MASK 0x1 +}; + +struct igu_fifo_cleanup_wr_data { + u32 data; +#define IGU_FIFO_CLEANUP_WR_DATA_RESERVED_SHIFT 0 +#define IGU_FIFO_CLEANUP_WR_DATA_RESERVED_MASK 0x7ffffff +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_VAL_SHIFT 27 +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_VAL_MASK 0x1 +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_TYPE_SHIFT 28 +#define IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_TYPE_MASK 0x7 +#define IGU_FIFO_CLEANUP_WR_DATA_CMD_TYPE_SHIFT 31 +#define IGU_FIFO_CLEANUP_WR_DATA_CMD_TYPE_MASK 0x1 +}; + +/* Protection override element */ +struct protection_override_element { + u64 data; +#define PROTECTION_OVERRIDE_ELEMENT_ADDRESS_SHIFT 0 +#define PROTECTION_OVERRIDE_ELEMENT_ADDRESS_MASK 0x7fffff +#define PROTECTION_OVERRIDE_ELEMENT_WINDOW_SIZE_SHIFT 23 +#define PROTECTION_OVERRIDE_ELEMENT_WINDOW_SIZE_MASK 0xffffff +#define PROTECTION_OVERRIDE_ELEMENT_READ_SHIFT 47 +#define PROTECTION_OVERRIDE_ELEMENT_READ_MASK 0x1 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_SHIFT 48 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_MASK 0x1 +#define PROTECTION_OVERRIDE_ELEMENT_READ_PROTECTION_SHIFT 49 +#define PROTECTION_OVERRIDE_ELEMENT_READ_PROTECTION_MASK 0x7 +#define 
PROTECTION_OVERRIDE_ELEMENT_WRITE_PROTECTION_SHIFT 52 +#define PROTECTION_OVERRIDE_ELEMENT_WRITE_PROTECTION_MASK 0x7 +}; + +enum igu_fifo_sources { + IGU_SRC_PXP0, + IGU_SRC_PXP1, + IGU_SRC_PXP2, + IGU_SRC_PXP3, + IGU_SRC_PXP4, + IGU_SRC_PXP5, + IGU_SRC_PXP6, + IGU_SRC_PXP7, + IGU_SRC_CAU, + IGU_SRC_ATTN, + IGU_SRC_GRC +}; + +enum igu_fifo_addr_types { + IGU_ADDR_TYPE_MSIX_MEM, + IGU_ADDR_TYPE_WRITE_PBA, + IGU_ADDR_TYPE_WRITE_INT_ACK, + IGU_ADDR_TYPE_WRITE_ATTN_BITS, + IGU_ADDR_TYPE_READ_INT, + IGU_ADDR_TYPE_WRITE_PROD_UPDATE, + IGU_ADDR_TYPE_RESERVED +}; + +struct igu_fifo_addr_data { + u16 start_addr; + u16 end_addr; + char *desc; + char *vf_desc; + enum igu_fifo_addr_types type; +}; + +/******************************** Constants **********************************/ + +#define MAX_MSG_LEN 1024 + +#define MCP_TRACE_MAX_MODULE_LEN 8 +#define MCP_TRACE_FORMAT_MAX_PARAMS 3 +#define MCP_TRACE_FORMAT_PARAM_WIDTH \ + (MCP_TRACE_FORMAT_P2_SIZE_SHIFT - MCP_TRACE_FORMAT_P1_SIZE_SHIFT) + +#define REG_FIFO_ELEMENT_ADDR_FACTOR 4 +#define REG_FIFO_ELEMENT_IS_PF_VF_VAL 127 + +#define PROTECTION_OVERRIDE_ELEMENT_ADDR_FACTOR 4 + +/***************************** Constant Arrays *******************************/ + +struct user_dbg_array { + const u32 *ptr; + u32 size_in_dwords; +}; + +/* Debug arrays */ +static struct user_dbg_array +s_user_dbg_arrays[MAX_BIN_DBG_BUFFER_TYPE] = { {NULL} }; + +/* Block names array */ +static struct block_info s_block_info_arr[] = { + {"grc", BLOCK_GRC}, + {"miscs", BLOCK_MISCS}, + {"misc", BLOCK_MISC}, + {"dbu", BLOCK_DBU}, + {"pglue_b", BLOCK_PGLUE_B}, + {"cnig", BLOCK_CNIG}, + {"cpmu", BLOCK_CPMU}, + {"ncsi", BLOCK_NCSI}, + {"opte", BLOCK_OPTE}, + {"bmb", BLOCK_BMB}, + {"pcie", BLOCK_PCIE}, + {"mcp", BLOCK_MCP}, + {"mcp2", BLOCK_MCP2}, + {"pswhst", BLOCK_PSWHST}, + {"pswhst2", BLOCK_PSWHST2}, + {"pswrd", BLOCK_PSWRD}, + {"pswrd2", BLOCK_PSWRD2}, + {"pswwr", BLOCK_PSWWR}, + {"pswwr2", BLOCK_PSWWR2}, + {"pswrq", BLOCK_PSWRQ}, + {"pswrq2", BLOCK_PSWRQ2}, + {"pglcs", BLOCK_PGLCS}, + {"ptu", BLOCK_PTU}, + {"dmae", BLOCK_DMAE}, + {"tcm", BLOCK_TCM}, + {"mcm", BLOCK_MCM}, + {"ucm", BLOCK_UCM}, + {"xcm", BLOCK_XCM}, + {"ycm", BLOCK_YCM}, + {"pcm", BLOCK_PCM}, + {"qm", BLOCK_QM}, + {"tm", BLOCK_TM}, + {"dorq", BLOCK_DORQ}, + {"brb", BLOCK_BRB}, + {"src", BLOCK_SRC}, + {"prs", BLOCK_PRS}, + {"tsdm", BLOCK_TSDM}, + {"msdm", BLOCK_MSDM}, + {"usdm", BLOCK_USDM}, + {"xsdm", BLOCK_XSDM}, + {"ysdm", BLOCK_YSDM}, + {"psdm", BLOCK_PSDM}, + {"tsem", BLOCK_TSEM}, + {"msem", BLOCK_MSEM}, + {"usem", BLOCK_USEM}, + {"xsem", BLOCK_XSEM}, + {"ysem", BLOCK_YSEM}, + {"psem", BLOCK_PSEM}, + {"rss", BLOCK_RSS}, + {"tmld", BLOCK_TMLD}, + {"muld", BLOCK_MULD}, + {"yuld", BLOCK_YULD}, + {"xyld", BLOCK_XYLD}, + {"ptld", BLOCK_PTLD}, + {"ypld", BLOCK_YPLD}, + {"prm", BLOCK_PRM}, + {"pbf_pb1", BLOCK_PBF_PB1}, + {"pbf_pb2", BLOCK_PBF_PB2}, + {"rpb", BLOCK_RPB}, + {"btb", BLOCK_BTB}, + {"pbf", BLOCK_PBF}, + {"rdif", BLOCK_RDIF}, + {"tdif", BLOCK_TDIF}, + {"cdu", BLOCK_CDU}, + {"ccfc", BLOCK_CCFC}, + {"tcfc", BLOCK_TCFC}, + {"igu", BLOCK_IGU}, + {"cau", BLOCK_CAU}, + {"rgfs", BLOCK_RGFS}, + {"rgsrc", BLOCK_RGSRC}, + {"tgfs", BLOCK_TGFS}, + {"tgsrc", BLOCK_TGSRC}, + {"umac", BLOCK_UMAC}, + {"xmac", BLOCK_XMAC}, + {"dbg", BLOCK_DBG}, + {"nig", BLOCK_NIG}, + {"wol", BLOCK_WOL}, + {"bmbn", BLOCK_BMBN}, + {"ipc", BLOCK_IPC}, + {"nwm", BLOCK_NWM}, + {"nws", BLOCK_NWS}, + {"ms", BLOCK_MS}, + {"phy_pcie", BLOCK_PHY_PCIE}, + {"led", BLOCK_LED}, + {"avs_wrap", BLOCK_AVS_WRAP}, + {"pxpreqbus", BLOCK_PXPREQBUS}, + 
{"misc_aeu", BLOCK_MISC_AEU}, + {"bar0_map", BLOCK_BAR0_MAP} +}; + +/* Status string array */ +static const char * const s_status_str[] = { + /* DBG_STATUS_OK */ + "Operation completed successfully", + + /* DBG_STATUS_APP_VERSION_NOT_SET */ + "Debug application version wasn't set", + + /* DBG_STATUS_UNSUPPORTED_APP_VERSION */ + "Unsupported debug application version", + + /* DBG_STATUS_DBG_BLOCK_NOT_RESET */ + "The debug block wasn't reset since the last recording", + + /* DBG_STATUS_INVALID_ARGS */ + "Invalid arguments", + + /* DBG_STATUS_OUTPUT_ALREADY_SET */ + "The debug output was already set", + + /* DBG_STATUS_INVALID_PCI_BUF_SIZE */ + "Invalid PCI buffer size", + + /* DBG_STATUS_PCI_BUF_ALLOC_FAILED */ + "PCI buffer allocation failed", + + /* DBG_STATUS_PCI_BUF_NOT_ALLOCATED */ + "A PCI buffer wasn't allocated", + + /* DBG_STATUS_TOO_MANY_INPUTS */ + "Too many inputs were enabled. Enabled less inputs, or set 'unifyInputs' to true", + + /* DBG_STATUS_INPUT_OVERLAP */ + "Overlapping debug bus inputs", + + /* DBG_STATUS_HW_ONLY_RECORDING */ + "Cannot record Storm data since the entire recording cycle is used by HW", + + /* DBG_STATUS_STORM_ALREADY_ENABLED */ + "The Storm was already enabled", + + /* DBG_STATUS_STORM_NOT_ENABLED */ + "The specified Storm wasn't enabled", + + /* DBG_STATUS_BLOCK_ALREADY_ENABLED */ + "The block was already enabled", + + /* DBG_STATUS_BLOCK_NOT_ENABLED */ + "The specified block wasn't enabled", + + /* DBG_STATUS_NO_INPUT_ENABLED */ + "No input was enabled for recording", + + /* DBG_STATUS_NO_FILTER_TRIGGER_64B */ + "Filters and triggers are not allowed when recording in 64b units", + + /* DBG_STATUS_FILTER_ALREADY_ENABLED */ + "The filter was already enabled", + + /* DBG_STATUS_TRIGGER_ALREADY_ENABLED */ + "The trigger was already enabled", + + /* DBG_STATUS_TRIGGER_NOT_ENABLED */ + "The trigger wasn't enabled", + + /* DBG_STATUS_CANT_ADD_CONSTRAINT */ + "A constraint can be added only after a filter was enabled or a trigger state was added", + + /* DBG_STATUS_TOO_MANY_TRIGGER_STATES */ + "Cannot add more than 3 trigger states", + + /* DBG_STATUS_TOO_MANY_CONSTRAINTS */ + "Cannot add more than 4 constraints per filter or trigger state", + + /* DBG_STATUS_RECORDING_NOT_STARTED */ + "The recording wasn't started", + + /* DBG_STATUS_DATA_DIDNT_TRIGGER */ + "A trigger was configured, but it didn't trigger", + + /* DBG_STATUS_NO_DATA_RECORDED */ + "No data was recorded", + + /* DBG_STATUS_DUMP_BUF_TOO_SMALL */ + "Dump buffer is too small", + + /* DBG_STATUS_DUMP_NOT_CHUNK_ALIGNED */ + "Dumped data is not aligned to chunks", + + /* DBG_STATUS_UNKNOWN_CHIP */ + "Unknown chip", + + /* DBG_STATUS_VIRT_MEM_ALLOC_FAILED */ + "Failed allocating virtual memory", + + /* DBG_STATUS_BLOCK_IN_RESET */ + "The input block is in reset", + + /* DBG_STATUS_INVALID_TRACE_SIGNATURE */ + "Invalid MCP trace signature found in NVRAM", + + /* DBG_STATUS_INVALID_NVRAM_BUNDLE */ + "Invalid bundle ID found in NVRAM", + + /* DBG_STATUS_NVRAM_GET_IMAGE_FAILED */ + "Failed getting NVRAM image", + + /* DBG_STATUS_NON_ALIGNED_NVRAM_IMAGE */ + "NVRAM image is not dword-aligned", + + /* DBG_STATUS_NVRAM_READ_FAILED */ + "Failed reading from NVRAM", + + /* DBG_STATUS_IDLE_CHK_PARSE_FAILED */ + "Idle check parsing failed", + + /* DBG_STATUS_MCP_TRACE_BAD_DATA */ + "MCP Trace data is corrupt", + + /* DBG_STATUS_MCP_TRACE_NO_META */ + "Dump doesn't contain meta data - it must be provided in image file", + + /* DBG_STATUS_MCP_COULD_NOT_HALT */ + "Failed to halt MCP", + + /* 
DBG_STATUS_MCP_COULD_NOT_RESUME */ + "Failed to resume MCP after halt", + + /* DBG_STATUS_RESERVED2 */ + "Reserved debug status - shouldn't be returned", + + /* DBG_STATUS_SEMI_FIFO_NOT_EMPTY */ + "Failed to empty SEMI sync FIFO", + + /* DBG_STATUS_IGU_FIFO_BAD_DATA */ + "IGU FIFO data is corrupt", + + /* DBG_STATUS_MCP_COULD_NOT_MASK_PRTY */ + "MCP failed to mask parities", + + /* DBG_STATUS_FW_ASSERTS_PARSE_FAILED */ + "FW Asserts parsing failed", + + /* DBG_STATUS_REG_FIFO_BAD_DATA */ + "GRC FIFO data is corrupt", + + /* DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA */ + "Protection Override data is corrupt", + + /* DBG_STATUS_DBG_ARRAY_NOT_SET */ + "Debug arrays were not set (when using binary files, dbg_set_bin_ptr must be called)", + + /* DBG_STATUS_FILTER_BUG */ + "Debug Bus filtering requires the -unifyInputs option (due to a HW bug)", + + /* DBG_STATUS_NON_MATCHING_LINES */ + "Non-matching debug lines - all lines must be of the same type (either 128b or 256b)", + + /* DBG_STATUS_INVALID_TRIGGER_DWORD_OFFSET */ + "The selected trigger dword offset wasn't enabled in the recorded HW block", + + /* DBG_STATUS_DBG_BUS_IN_USE */ + "The debug bus is in use" +}; + +/* Idle check severity names array */ +static const char * const s_idle_chk_severity_str[] = { + "Error", + "Error if no traffic", + "Warning" +}; + +/* MCP Trace level names array */ +static const char * const s_mcp_trace_level_str[] = { + "ERROR", + "TRACE", + "DEBUG" +}; + +/* Access type names array */ +static const char * const s_access_strs[] = { + "read", + "write" +}; + +/* Privilege type names array */ +static const char * const s_privilege_strs[] = { + "VF", + "PDA", + "HV", + "UA" +}; + +/* Protection type names array */ +static const char * const s_protection_strs[] = { + "(default)", + "(default)", + "(default)", + "(default)", + "override VF", + "override PDA", + "override HV", + "override UA" +}; + +/* Master type names array */ +static const char * const s_master_strs[] = { + "???", + "pxp", + "mcp", + "msdm", + "psdm", + "ysdm", + "usdm", + "tsdm", + "xsdm", + "dbu", + "dmae", + "???", + "???", + "???", + "???", + "???" 
+}; + +/* REG FIFO error messages array */ +static const char * const s_reg_fifo_error_strs[] = { + "grc timeout", + "address doesn't belong to any block", + "reserved address in block or write to read-only address", + "privilege/protection mismatch", + "path isolation error" +}; + +/* IGU FIFO sources array */ +static const char * const s_igu_fifo_source_strs[] = { + "TSTORM", + "MSTORM", + "USTORM", + "XSTORM", + "YSTORM", + "PSTORM", + "PCIE", + "NIG_QM_PBF", + "CAU", + "ATTN", + "GRC", +}; + +/* IGU FIFO error messages */ +static const char * const s_igu_fifo_error_strs[] = { + "no error", + "length error", + "function disabled", + "VF sent command to attnetion address", + "host sent prod update command", + "read of during interrupt register while in MIMD mode", + "access to PXP BAR reserved address", + "producer update command to attention index", + "unknown error", + "SB index not valid", + "SB relative index and FID not found", + "FID not match", + "command with error flag asserted (PCI error or CAU discard)", + "VF sent cleanup and RF cleanup is disabled", + "cleanup command on type bigger than 4" +}; + +/* IGU FIFO address data */ +static const struct igu_fifo_addr_data s_igu_fifo_addr_data[] = { + {0x0, 0x101, "MSI-X Memory", NULL, + IGU_ADDR_TYPE_MSIX_MEM}, + {0x102, 0x1ff, "reserved", NULL, + IGU_ADDR_TYPE_RESERVED}, + {0x200, 0x200, "Write PBA[0:63]", NULL, + IGU_ADDR_TYPE_WRITE_PBA}, + {0x201, 0x201, "Write PBA[64:127]", "reserved", + IGU_ADDR_TYPE_WRITE_PBA}, + {0x202, 0x202, "Write PBA[128]", "reserved", + IGU_ADDR_TYPE_WRITE_PBA}, + {0x203, 0x3ff, "reserved", NULL, + IGU_ADDR_TYPE_RESERVED}, + {0x400, 0x5ef, "Write interrupt acknowledgment", NULL, + IGU_ADDR_TYPE_WRITE_INT_ACK}, + {0x5f0, 0x5f0, "Attention bits update", NULL, + IGU_ADDR_TYPE_WRITE_ATTN_BITS}, + {0x5f1, 0x5f1, "Attention bits set", NULL, + IGU_ADDR_TYPE_WRITE_ATTN_BITS}, + {0x5f2, 0x5f2, "Attention bits clear", NULL, + IGU_ADDR_TYPE_WRITE_ATTN_BITS}, + {0x5f3, 0x5f3, "Read interrupt 0:63 with mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f4, 0x5f4, "Read interrupt 0:31 with mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f5, 0x5f5, "Read interrupt 32:63 with mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f6, 0x5f6, "Read interrupt 0:63 without mask", NULL, + IGU_ADDR_TYPE_READ_INT}, + {0x5f7, 0x5ff, "reserved", NULL, + IGU_ADDR_TYPE_RESERVED}, + {0x600, 0x7ff, "Producer update", NULL, + IGU_ADDR_TYPE_WRITE_PROD_UPDATE} +}; + +/******************************** Variables **********************************/ + +/* MCP Trace meta data array - used in case the dump doesn't contain the + * meta data (e.g. due to no NVRAM access). + */ +static struct user_dbg_array s_mcp_trace_meta_arr = { NULL, 0 }; + +/* Parsed MCP Trace meta data info, based on MCP trace meta array */ +static struct mcp_trace_meta s_mcp_trace_meta; +static bool s_mcp_trace_meta_valid; + +/* Temporary buffer, used for print size calculations */ +static char s_temp_buf[MAX_MSG_LEN]; + +/**************************** Private Functions ******************************/ + +static u32 qed_cyclic_add(u32 a, u32 b, u32 size) +{ + return (a + b) % size; +} + +static u32 qed_cyclic_sub(u32 a, u32 b, u32 size) +{ + return (size + a - b) % size; +} + +/* Reads the specified number of bytes from the specified cyclic buffer (up to 4 + * bytes) and returns them as a dword value. the specified buffer offset is + * updated. 
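The modular helpers above make the wrap-around explicit: qed_cyclic_add() advances an offset modulo the buffer size, and qed_cyclic_sub() yields how many bytes sit between producer and consumer. A small worked example of a read that crosses the wrap point (the values are invented, and the result assumes a little-endian host, as the function's own comment does):

/* Illustration only: a 4-byte read that wraps around a 16-byte buffer */
static void example_cyclic_read(void)
{
	u8 buf[16] = { 0 };
	u32 offset = 14, val;

	buf[14] = 0xaa;
	buf[15] = 0xbb;
	buf[0] = 0xcc;
	buf[1] = 0xdd;

	/* Bytes are copied in buffer order into increasing addresses of
	 * val, so a little-endian host reads the dword back as 0xddccbbaa.
	 */
	val = qed_read_from_cyclic_buf(buf, &offset, sizeof(buf), 4);

	/* The offset wrapped: (14 + 4) % 16 == 2 */
	WARN_ON(offset != 2 || val != 0xddccbbaa);
}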
+ */ +static u32 qed_read_from_cyclic_buf(void *buf, + u32 *offset, + u32 buf_size, u8 num_bytes_to_read) +{ + u8 i, *val_ptr, *bytes_buf = (u8 *)buf; + u32 val = 0; + + val_ptr = (u8 *)&val; + + /* Assume running on a LITTLE ENDIAN and the buffer is network order + * (BIG ENDIAN), as high order bytes are placed in lower memory address. + */ + for (i = 0; i < num_bytes_to_read; i++) { + val_ptr[i] = bytes_buf[*offset]; + *offset = qed_cyclic_add(*offset, 1, buf_size); + } + + return val; +} + +/* Reads and returns the next byte from the specified buffer. + * The specified buffer offset is updated. + */ +static u8 qed_read_byte_from_buf(void *buf, u32 *offset) +{ + return ((u8 *)buf)[(*offset)++]; +} + +/* Reads and returns the next dword from the specified buffer. + * The specified buffer offset is updated. + */ +static u32 qed_read_dword_from_buf(void *buf, u32 *offset) +{ + u32 dword_val = *(u32 *)&((u8 *)buf)[*offset]; + + *offset += 4; + + return dword_val; +} + +/* Reads the next string from the specified buffer, and copies it to the + * specified pointer. The specified buffer offset is updated. + */ +static void qed_read_str_from_buf(void *buf, u32 *offset, u32 size, char *dest) +{ + const char *source_str = &((const char *)buf)[*offset]; + + strncpy(dest, source_str, size); + dest[size - 1] = '\0'; + *offset += size; +} + +/* Returns a pointer to the specified offset (in bytes) of the specified buffer. + * If the specified buffer in NULL, a temporary buffer pointer is returned. + */ +static char *qed_get_buf_ptr(void *buf, u32 offset) +{ + return buf ? (char *)buf + offset : s_temp_buf; +} + +/* Reads a param from the specified buffer. Returns the number of dwords read. + * If the returned str_param is NULL, the param is numeric and its value is + * returned in num_param. + * Otheriwise, the param is a string and its pointer is returned in str_param. + */ +static u32 qed_read_param(u32 *dump_buf, + const char **param_name, + const char **param_str_val, u32 *param_num_val) +{ + char *char_buf = (char *)dump_buf; + size_t offset = 0; + + /* Extract param name */ + *param_name = char_buf; + offset += strlen(*param_name) + 1; + + /* Check param type */ + if (*(char_buf + offset++)) { + /* String param */ + *param_str_val = char_buf + offset; + *param_num_val = 0; + offset += strlen(*param_str_val) + 1; + if (offset & 0x3) + offset += (4 - (offset & 0x3)); + } else { + /* Numeric param */ + *param_str_val = NULL; + if (offset & 0x3) + offset += (4 - (offset & 0x3)); + *param_num_val = *(u32 *)(char_buf + offset); + offset += 4; + } + + return (u32)offset / 4; +} + +/* Reads a section header from the specified buffer. + * Returns the number of dwords read. + */ +static u32 qed_read_section_hdr(u32 *dump_buf, + const char **section_name, + u32 *num_section_params) +{ + const char *param_str_val; + + return qed_read_param(dump_buf, + section_name, ¶m_str_val, num_section_params); +} + +/* Reads section params from the specified buffer and prints them to the results + * buffer. Returns the number of dwords read. 
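qed_read_param() above expects every parameter as a NUL-terminated name, a one-byte type flag, and then either a dword-aligned NUL-terminated string value or a dword-aligned little-endian u32 value. A hand-built example of the numeric form (the bytes are invented for illustration):

/* A numeric parameter "size" with value 16, laid out as qed_read_param()
 * expects (12 bytes, i.e. 3 dwords):
 *
 *   bytes 0..4  : "size\0"     parameter name
 *   byte  5     : 0x00         type flag (0 = numeric)
 *   bytes 6..7  : padding up to dword alignment
 *   bytes 8..11 : 0x00000010   little-endian u32 value
 */
static const u8 example_numeric_param[12] = {
	's', 'i', 'z', 'e', '\0',	/* name */
	0x00,				/* numeric type flag */
	0x00, 0x00,			/* alignment padding */
	0x10, 0x00, 0x00, 0x00,		/* value = 16 */
};

Fed to qed_read_param(), this buffer yields param_name "size", a NULL param_str_val, a param_num_val of 16, and a return value of 3 consumed dwords.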
+ */ +static u32 qed_print_section_params(u32 *dump_buf, + u32 num_section_params, + char *results_buf, u32 *num_chars_printed) +{ + u32 i, dump_offset = 0, results_offset = 0; + + for (i = 0; i < num_section_params; i++) { + const char *param_name, *param_str_val; + u32 param_num_val = 0; + + dump_offset += qed_read_param(dump_buf + dump_offset, + ¶m_name, + ¶m_str_val, ¶m_num_val); + + if (param_str_val) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "%s: %s\n", param_name, param_str_val); + else if (strcmp(param_name, "fw-timestamp")) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "%s: %d\n", param_name, param_num_val); + } + + results_offset += sprintf(qed_get_buf_ptr(results_buf, results_offset), + "\n"); + + *num_chars_printed = results_offset; + + return dump_offset; +} + +/* Parses the idle check rules and returns the number of characters printed. + * In case of parsing error, returns 0. + */ +static u32 qed_parse_idle_chk_dump_rules(u32 *dump_buf, + u32 *dump_buf_end, + u32 num_rules, + bool print_fw_idle_chk, + char *results_buf, + u32 *num_errors, u32 *num_warnings) +{ + /* Offset in results_buf in bytes */ + u32 results_offset = 0; + + u32 rule_idx; + u16 i, j; + + *num_errors = 0; + *num_warnings = 0; + + /* Go over dumped results */ + for (rule_idx = 0; rule_idx < num_rules && dump_buf < dump_buf_end; + rule_idx++) { + const struct dbg_idle_chk_rule_parsing_data *rule_parsing_data; + struct dbg_idle_chk_result_hdr *hdr; + const char *parsing_str, *lsi_msg; + u32 parsing_str_offset; + bool has_fw_msg; + u8 curr_reg_id; + + hdr = (struct dbg_idle_chk_result_hdr *)dump_buf; + rule_parsing_data = + (const struct dbg_idle_chk_rule_parsing_data *) + &s_user_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_PARSING_DATA]. + ptr[hdr->rule_id]; + parsing_str_offset = + GET_FIELD(rule_parsing_data->data, + DBG_IDLE_CHK_RULE_PARSING_DATA_STR_OFFSET); + has_fw_msg = + GET_FIELD(rule_parsing_data->data, + DBG_IDLE_CHK_RULE_PARSING_DATA_HAS_FW_MSG) > 0; + parsing_str = + &((const char *) + s_user_dbg_arrays[BIN_BUF_DBG_PARSING_STRINGS].ptr) + [parsing_str_offset]; + lsi_msg = parsing_str; + curr_reg_id = 0; + + if (hdr->severity >= MAX_DBG_IDLE_CHK_SEVERITY_TYPES) + return 0; + + /* Skip rule header */ + dump_buf += BYTES_TO_DWORDS(sizeof(*hdr)); + + /* Update errors/warnings count */ + if (hdr->severity == IDLE_CHK_SEVERITY_ERROR || + hdr->severity == IDLE_CHK_SEVERITY_ERROR_NO_TRAFFIC) + (*num_errors)++; + else + (*num_warnings)++; + + /* Print rule severity */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "%s: ", + s_idle_chk_severity_str[hdr->severity]); + + /* Print rule message */ + if (has_fw_msg) + parsing_str += strlen(parsing_str) + 1; + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "%s.", + has_fw_msg && + print_fw_idle_chk ? 
parsing_str : lsi_msg); + parsing_str += strlen(parsing_str) + 1; + + /* Print register values */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), " Registers:"); + for (i = 0; + i < hdr->num_dumped_cond_regs + hdr->num_dumped_info_regs; + i++) { + struct dbg_idle_chk_result_reg_hdr *reg_hdr; + bool is_mem; + u8 reg_id; + + reg_hdr = + (struct dbg_idle_chk_result_reg_hdr *)dump_buf; + is_mem = GET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM); + reg_id = GET_FIELD(reg_hdr->data, + DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID); + + /* Skip reg header */ + dump_buf += BYTES_TO_DWORDS(sizeof(*reg_hdr)); + + /* Skip register names until the required reg_id is + * reached. + */ + for (; reg_id > curr_reg_id; + curr_reg_id++, + parsing_str += strlen(parsing_str) + 1); + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), " %s", + parsing_str); + if (i < hdr->num_dumped_cond_regs && is_mem) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "[%d]", hdr->mem_entry_id + + reg_hdr->start_entry); + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), "="); + for (j = 0; j < reg_hdr->size; j++, dump_buf++) { + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "0x%x", *dump_buf); + if (j < reg_hdr->size - 1) + results_offset += + sprintf(qed_get_buf_ptr + (results_buf, + results_offset), ","); + } + } + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, results_offset), "\n"); + } + + /* Check if end of dump buffer was exceeded */ + if (dump_buf > dump_buf_end) + return 0; + + return results_offset; +} + +/* Parses an idle check dump buffer. + * If result_buf is not NULL, the idle check results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_idle_chk_dump(u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *parsed_results_bytes, + u32 *num_errors, + u32 *num_warnings) +{ + const char *section_name, *param_name, *param_str_val; + u32 *dump_buf_end = dump_buf + num_dumped_dwords; + u32 num_section_params = 0, num_rules; + + /* Offset in results_buf in bytes */ + u32 results_offset = 0; + + *parsed_results_bytes = 0; + *num_errors = 0; + *num_warnings = 0; + + if (!s_user_dbg_arrays[BIN_BUF_DBG_PARSING_STRINGS].ptr || + !s_user_dbg_arrays[BIN_BUF_DBG_IDLE_CHK_PARSING_DATA].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read idle_chk section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "idle_chk") || num_section_params != 1) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, &num_rules); + if (strcmp(param_name, "num_rules")) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + if (num_rules) { + u32 rules_print_size; + + /* Print FW output */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "FW_IDLE_CHECK:\n"); + rules_print_size = + qed_parse_idle_chk_dump_rules(dump_buf, + dump_buf_end, + num_rules, + true, + results_buf ? 
+ results_buf + + results_offset : + NULL, + num_errors, + num_warnings); + results_offset += rules_print_size; + if (!rules_print_size) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + + /* Print LSI output */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nLSI_IDLE_CHECK:\n"); + rules_print_size = + qed_parse_idle_chk_dump_rules(dump_buf, + dump_buf_end, + num_rules, + false, + results_buf ? + results_buf + + results_offset : + NULL, + num_errors, + num_warnings); + results_offset += rules_print_size; + if (!rules_print_size) + return DBG_STATUS_IDLE_CHK_PARSE_FAILED; + } + + /* Print errors/warnings count */ + if (*num_errors) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nIdle Check failed!!! (with %d errors and %d warnings)\n", + *num_errors, *num_warnings); + else if (*num_warnings) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nIdle Check completed successfully (with %d warnings)\n", + *num_warnings); + else + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\nIdle Check completed successfully\n"); + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + + return DBG_STATUS_OK; +} + +/* Frees the specified MCP Trace meta data */ +static void qed_mcp_trace_free_meta(struct qed_hwfn *p_hwfn, + struct mcp_trace_meta *meta) +{ + u32 i; + + s_mcp_trace_meta_valid = false; + + /* Release modules */ + if (meta->modules) { + for (i = 0; i < meta->modules_num; i++) + kfree(meta->modules[i]); + kfree(meta->modules); + } + + /* Release formats */ + if (meta->formats) { + for (i = 0; i < meta->formats_num; i++) + kfree(meta->formats[i].format_str); + kfree(meta->formats); + } +} + +/* Allocates and fills MCP Trace meta data based on the specified meta data + * dump buffer. + * Returns debug status code. + */ +static enum dbg_status qed_mcp_trace_alloc_meta(struct qed_hwfn *p_hwfn, + const u32 *meta_buf, + struct mcp_trace_meta *meta) +{ + u8 *meta_buf_bytes = (u8 *)meta_buf; + u32 offset = 0, signature, i; + + /* Free the previous meta before loading a new one. */ + if (s_mcp_trace_meta_valid) + qed_mcp_trace_free_meta(p_hwfn, meta); + + memset(meta, 0, sizeof(*meta)); + + /* Read first signature */ + signature = qed_read_dword_from_buf(meta_buf_bytes, &offset); + if (signature != NVM_MAGIC_VALUE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Read no. of modules and allocate memory for their pointers */ + meta->modules_num = qed_read_byte_from_buf(meta_buf_bytes, &offset); + meta->modules = kzalloc(meta->modules_num * sizeof(char *), GFP_KERNEL); + if (!meta->modules) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + /* Allocate and read all module strings */ + for (i = 0; i < meta->modules_num; i++) { + u8 module_len = qed_read_byte_from_buf(meta_buf_bytes, &offset); + + *(meta->modules + i) = kzalloc(module_len, GFP_KERNEL); + if (!(*(meta->modules + i))) { + /* Update number of modules to be released */ + meta->modules_num = i ? 
i - 1 : 0; + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + } + + qed_read_str_from_buf(meta_buf_bytes, &offset, module_len, + *(meta->modules + i)); + if (module_len > MCP_TRACE_MAX_MODULE_LEN) + (*(meta->modules + i))[MCP_TRACE_MAX_MODULE_LEN] = '\0'; + } + + /* Read second signature */ + signature = qed_read_dword_from_buf(meta_buf_bytes, &offset); + if (signature != NVM_MAGIC_VALUE) + return DBG_STATUS_INVALID_TRACE_SIGNATURE; + + /* Read number of formats and allocate memory for all formats */ + meta->formats_num = qed_read_dword_from_buf(meta_buf_bytes, &offset); + meta->formats = kzalloc(meta->formats_num * + sizeof(struct mcp_trace_format), + GFP_KERNEL); + if (!meta->formats) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + /* Allocate and read all strings */ + for (i = 0; i < meta->formats_num; i++) { + struct mcp_trace_format *format_ptr = &meta->formats[i]; + u8 format_len; + + format_ptr->data = qed_read_dword_from_buf(meta_buf_bytes, + &offset); + format_len = + (format_ptr->data & + MCP_TRACE_FORMAT_LEN_MASK) >> MCP_TRACE_FORMAT_LEN_SHIFT; + format_ptr->format_str = kzalloc(format_len, GFP_KERNEL); + if (!format_ptr->format_str) { + /* Update number of modules to be released */ + meta->formats_num = i ? i - 1 : 0; + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + } + + qed_read_str_from_buf(meta_buf_bytes, + &offset, + format_len, format_ptr->format_str); + } + + s_mcp_trace_meta_valid = true; + return DBG_STATUS_OK; +} + +/* Parses an MCP trace buffer. If result_buf is not NULL, the MCP Trace results + * are printed to it. The parsing status is returned. + * Arguments: + * trace_buf - MCP trace cyclic buffer + * trace_buf_size - MCP trace cyclic buffer size in bytes + * data_offset - offset in bytes of the data to parse in the MCP trace cyclic + * buffer. + * data_size - size in bytes of data to parse. + * parsed_buf - destination buffer for parsed data. + * parsed_bytes - size of parsed data in bytes. 
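Like the other parsers in this file, the function below can be called twice: once with a NULL destination to learn the required text size (qed_get_buf_ptr() then redirects the sprintf() calls into the static scratch buffer), and once with a real buffer. A hedged sketch of that double call, assuming the trace meta data has already been loaded (the wrapper name and the vzalloc()-based allocation are illustrative):

/* Illustrative two-pass use of the MCP trace buffer parser */
static enum dbg_status example_format_trace(u8 *trace, u32 trace_size,
					    char **text_out)
{
	u32 needed_bytes;
	enum dbg_status rc;
	char *text;

	/* Pass 1: NULL destination, only the required size is of interest */
	rc = qed_parse_mcp_trace_buf(trace, trace_size, 0, trace_size,
				     NULL, &needed_bytes);
	if (rc != DBG_STATUS_OK)
		return rc;

	text = vzalloc(needed_bytes);
	if (!text)
		return DBG_STATUS_VIRT_MEM_ALLOC_FAILED;

	/* Pass 2: same arguments, real destination buffer */
	rc = qed_parse_mcp_trace_buf(trace, trace_size, 0, trace_size,
				     text, &needed_bytes);
	if (rc != DBG_STATUS_OK)
		vfree(text);
	else
		*text_out = text;

	return rc;
}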
+ */ +static enum dbg_status qed_parse_mcp_trace_buf(u8 *trace_buf, + u32 trace_buf_size, + u32 data_offset, + u32 data_size, + char *parsed_buf, + u32 *parsed_bytes) +{ + u32 param_mask, param_shift; + enum dbg_status status; + + *parsed_bytes = 0; + + if (!s_mcp_trace_meta_valid) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + + status = DBG_STATUS_OK; + + while (data_size) { + struct mcp_trace_format *format_ptr; + u8 format_level, format_module; + u32 params[3] = { 0, 0, 0 }; + u32 header, format_idx, i; + + if (data_size < MFW_TRACE_ENTRY_SIZE) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + + header = qed_read_from_cyclic_buf(trace_buf, + &data_offset, + trace_buf_size, + MFW_TRACE_ENTRY_SIZE); + data_size -= MFW_TRACE_ENTRY_SIZE; + format_idx = header & MFW_TRACE_EVENTID_MASK; + + /* Skip message if its index doesn't exist in the meta data */ + if (format_idx > s_mcp_trace_meta.formats_num) { + u8 format_size = + (u8)((header & MFW_TRACE_PRM_SIZE_MASK) >> + MFW_TRACE_PRM_SIZE_SHIFT); + + if (data_size < format_size) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + + data_offset = qed_cyclic_add(data_offset, + format_size, + trace_buf_size); + data_size -= format_size; + continue; + } + + format_ptr = &s_mcp_trace_meta.formats[format_idx]; + + for (i = 0, + param_mask = MCP_TRACE_FORMAT_P1_SIZE_MASK, + param_shift = MCP_TRACE_FORMAT_P1_SIZE_SHIFT; + i < MCP_TRACE_FORMAT_MAX_PARAMS; + i++, + param_mask <<= MCP_TRACE_FORMAT_PARAM_WIDTH, + param_shift += MCP_TRACE_FORMAT_PARAM_WIDTH) { + /* Extract param size (0..3) */ + u8 param_size = (u8)((format_ptr->data & param_mask) >> + param_shift); + + /* If the param size is zero, there are no other + * parameters. + */ + if (!param_size) + break; + + /* Size is encoded using 2 bits, where 3 is used to + * encode 4. + */ + if (param_size == 3) + param_size = 4; + + if (data_size < param_size) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + + params[i] = qed_read_from_cyclic_buf(trace_buf, + &data_offset, + trace_buf_size, + param_size); + data_size -= param_size; + } + + format_level = (u8)((format_ptr->data & + MCP_TRACE_FORMAT_LEVEL_MASK) >> + MCP_TRACE_FORMAT_LEVEL_SHIFT); + format_module = (u8)((format_ptr->data & + MCP_TRACE_FORMAT_MODULE_MASK) >> + MCP_TRACE_FORMAT_MODULE_SHIFT); + if (format_level >= ARRAY_SIZE(s_mcp_trace_level_str)) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + + /* Print current message to results buffer */ + *parsed_bytes += + sprintf(qed_get_buf_ptr(parsed_buf, *parsed_bytes), + "%s %-8s: ", + s_mcp_trace_level_str[format_level], + s_mcp_trace_meta.modules[format_module]); + *parsed_bytes += + sprintf(qed_get_buf_ptr(parsed_buf, *parsed_bytes), + format_ptr->format_str, + params[0], params[1], params[2]); + } + + /* Add string NULL terminator */ + (*parsed_bytes)++; + + return status; +} + +/* Parses an MCP Trace dump buffer. + * If result_buf is not NULL, the MCP Trace results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_bytes. + * The parsing status is returned. 
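Inside each trace entry, the parameter sizes come from three 2-bit fields of the format descriptor: 0 terminates the parameter list, and the code 3 stands for a 4-byte parameter, which is why the loop above maps 3 to 4. A worked decode of one hypothetical descriptor:

/* Worked example: a descriptor with P1 = 1 byte, P2 = 4 bytes (encoded
 * as 3) and P3 = 0 (no third parameter).
 */
static void example_param_sizes(void)
{
	u32 data = (1 << MCP_TRACE_FORMAT_P1_SIZE_SHIFT) |
		   (3 << MCP_TRACE_FORMAT_P2_SIZE_SHIFT);
	u32 mask = MCP_TRACE_FORMAT_P1_SIZE_MASK;
	u32 shift = MCP_TRACE_FORMAT_P1_SIZE_SHIFT;
	u8 i;

	for (i = 0; i < MCP_TRACE_FORMAT_MAX_PARAMS; i++,
	     mask <<= MCP_TRACE_FORMAT_PARAM_WIDTH,
	     shift += MCP_TRACE_FORMAT_PARAM_WIDTH) {
		u8 size = (u8)((data & mask) >> shift);

		if (!size)		/* 0 terminates the parameter list */
			break;
		if (size == 3)		/* 2-bit code 3 stands for 4 bytes */
			size = 4;
		pr_debug("param %u: %u bytes\n", i, size);
	}
}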
+ */ +static enum dbg_status qed_parse_mcp_trace_dump(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + char *parsed_buf, + u32 *parsed_bytes) +{ + const char *section_name, *param_name, *param_str_val; + u32 data_size, trace_data_dwords, trace_meta_dwords; + u32 offset, results_offset, parsed_buf_bytes; + u32 param_num_val, num_section_params; + struct mcp_trace *trace; + enum dbg_status status; + const u32 *meta_buf; + u8 *trace_buf; + + *parsed_bytes = 0; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + parsed_buf, &results_offset); + + /* Read trace_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "mcp_trace_data") || num_section_params != 1) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, ¶m_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + trace_data_dwords = param_num_val; + + /* Prepare trace info */ + trace = (struct mcp_trace *)dump_buf; + trace_buf = (u8 *)dump_buf + sizeof(*trace); + offset = trace->trace_oldest; + data_size = qed_cyclic_sub(trace->trace_prod, offset, trace->size); + dump_buf += trace_data_dwords; + + /* Read meta_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "mcp_trace_meta")) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, ¶m_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_MCP_TRACE_BAD_DATA; + trace_meta_dwords = param_num_val; + + /* Choose meta data buffer */ + if (!trace_meta_dwords) { + /* Dump doesn't include meta data */ + if (!s_mcp_trace_meta_arr.ptr) + return DBG_STATUS_MCP_TRACE_NO_META; + meta_buf = s_mcp_trace_meta_arr.ptr; + } else { + /* Dump includes meta data */ + meta_buf = dump_buf; + } + + /* Allocate meta data memory */ + status = qed_mcp_trace_alloc_meta(p_hwfn, meta_buf, &s_mcp_trace_meta); + if (status != DBG_STATUS_OK) + return status; + + status = qed_parse_mcp_trace_buf(trace_buf, + trace->size, + offset, + data_size, + parsed_buf ? + parsed_buf + results_offset : + NULL, + &parsed_buf_bytes); + if (status != DBG_STATUS_OK) + return status; + + *parsed_bytes = results_offset + parsed_buf_bytes; + + return DBG_STATUS_OK; +} + +/* Parses a Reg FIFO dump buffer. + * If result_buf is not NULL, the Reg FIFO results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. 
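All of these parsers consume the same envelope: a "global_params" section first, followed by one or more feature sections whose parameters (typically a "size" in dwords) describe the raw payload that follows. A generic sketch of walking just the headers with the helpers above (illustrative only, since it does not skip the payload dwords, which the real parsers do by advancing dump_buf by the reported size):

/* Illustrative generic walk over the section/param headers of a dump */
static void example_walk_sections(u32 *dump_buf, u32 num_sections)
{
	u32 i, j;

	for (i = 0; i < num_sections; i++) {
		const char *section_name;
		u32 num_section_params;

		dump_buf += qed_read_section_hdr(dump_buf, &section_name,
						 &num_section_params);
		pr_debug("section '%s' with %u params\n",
			 section_name, num_section_params);

		for (j = 0; j < num_section_params; j++) {
			const char *param_name, *param_str_val;
			u32 param_num_val;

			dump_buf += qed_read_param(dump_buf, &param_name,
						   &param_str_val,
						   &param_num_val);
			if (param_str_val)
				pr_debug("  %s = %s\n", param_name,
					 param_str_val);
			else
				pr_debug("  %s = %u\n", param_name,
					 param_num_val);
		}
	}
}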
+ */ +static enum dbg_status qed_parse_reg_fifo_dump(u32 *dump_buf, + char *results_buf, + u32 *parsed_results_bytes) +{ + const char *section_name, *param_name, *param_str_val; + u32 param_num_val, num_section_params, num_elements; + struct reg_fifo_element *elements; + u8 i, j, err_val, vf_val; + u32 results_offset = 0; + char vf_str[4]; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_REG_FIFO_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read reg_fifo_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "reg_fifo_data")) + return DBG_STATUS_REG_FIFO_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, ¶m_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_REG_FIFO_BAD_DATA; + if (param_num_val % REG_FIFO_ELEMENT_DWORDS) + return DBG_STATUS_REG_FIFO_BAD_DATA; + num_elements = param_num_val / REG_FIFO_ELEMENT_DWORDS; + elements = (struct reg_fifo_element *)dump_buf; + + /* Decode elements */ + for (i = 0; i < num_elements; i++) { + bool err_printed = false; + + /* Discover if element belongs to a VF or a PF */ + vf_val = GET_FIELD(elements[i].data, REG_FIFO_ELEMENT_VF); + if (vf_val == REG_FIFO_ELEMENT_IS_PF_VF_VAL) + sprintf(vf_str, "%s", "N/A"); + else + sprintf(vf_str, "%d", vf_val); + + /* Add parsed element to parsed buffer */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "raw: 0x%016llx, address: 0x%07x, access: %-5s, pf: %2d, vf: %s, port: %d, privilege: %-3s, protection: %-12s, master: %-4s, errors: ", + elements[i].data, + (u32)GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_ADDRESS) * + REG_FIFO_ELEMENT_ADDR_FACTOR, + s_access_strs[GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_ACCESS)], + (u32)GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PF), + vf_str, + (u32)GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PORT), + s_privilege_strs[GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PRIVILEGE)], + s_protection_strs[GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PROTECTION)], + s_master_strs[GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_MASTER)]); + + /* Print errors */ + for (j = 0, + err_val = GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_ERROR); + j < ARRAY_SIZE(s_reg_fifo_error_strs); + j++, err_val >>= 1) { + if (err_val & 0x1) { + if (err_printed) + results_offset += + sprintf(qed_get_buf_ptr + (results_buf, + results_offset), ", "); + results_offset += + sprintf(qed_get_buf_ptr + (results_buf, results_offset), "%s", + s_reg_fifo_error_strs[j]); + err_printed = true; + } + } + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, results_offset), "\n"); + } + + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "fifo contained %d elements", num_elements); + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + + return DBG_STATUS_OK; +} + +static enum dbg_status qed_parse_igu_fifo_element(struct igu_fifo_element + *element, char + *results_buf, + u32 *results_offset) +{ + const struct igu_fifo_addr_data *found_addr = NULL; + u8 source, err_type, i, is_cleanup; + char parsed_addr_data[32]; + char parsed_wr_data[256]; + u32 wr_data, prod_cons; + bool is_wr_cmd, is_pf; + u16 cmd_addr; + u64 dword12; + + /* Dword12 (dword index 1 and 2) contains 
bits 32..95 of the + * FIFO element. + */ + dword12 = ((u64)element->dword2 << 32) | element->dword1; + is_wr_cmd = GET_FIELD(dword12, IGU_FIFO_ELEMENT_DWORD12_IS_WR_CMD); + is_pf = GET_FIELD(element->dword0, IGU_FIFO_ELEMENT_DWORD0_IS_PF); + cmd_addr = GET_FIELD(element->dword0, IGU_FIFO_ELEMENT_DWORD0_CMD_ADDR); + source = GET_FIELD(element->dword0, IGU_FIFO_ELEMENT_DWORD0_SOURCE); + err_type = GET_FIELD(element->dword0, IGU_FIFO_ELEMENT_DWORD0_ERR_TYPE); + + if (source >= ARRAY_SIZE(s_igu_fifo_source_strs)) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + if (err_type >= ARRAY_SIZE(s_igu_fifo_error_strs)) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + + /* Find address data */ + for (i = 0; i < ARRAY_SIZE(s_igu_fifo_addr_data) && !found_addr; i++) { + const struct igu_fifo_addr_data *curr_addr = + &s_igu_fifo_addr_data[i]; + + if (cmd_addr >= curr_addr->start_addr && cmd_addr <= + curr_addr->end_addr) + found_addr = curr_addr; + } + + if (!found_addr) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + + /* Prepare parsed address data */ + switch (found_addr->type) { + case IGU_ADDR_TYPE_MSIX_MEM: + sprintf(parsed_addr_data, " vector_num = 0x%x", cmd_addr / 2); + break; + case IGU_ADDR_TYPE_WRITE_INT_ACK: + case IGU_ADDR_TYPE_WRITE_PROD_UPDATE: + sprintf(parsed_addr_data, + " SB = 0x%x", cmd_addr - found_addr->start_addr); + break; + default: + parsed_addr_data[0] = '\0'; + } + + if (!is_wr_cmd) { + parsed_wr_data[0] = '\0'; + goto out; + } + + /* Prepare parsed write data */ + wr_data = GET_FIELD(dword12, IGU_FIFO_ELEMENT_DWORD12_WR_DATA); + prod_cons = GET_FIELD(wr_data, IGU_FIFO_WR_DATA_PROD_CONS); + is_cleanup = GET_FIELD(wr_data, IGU_FIFO_WR_DATA_CMD_TYPE); + + if (source == IGU_SRC_ATTN) { + sprintf(parsed_wr_data, "prod: 0x%x, ", prod_cons); + } else { + if (is_cleanup) { + u8 cleanup_val, cleanup_type; + + cleanup_val = + GET_FIELD(wr_data, + IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_VAL); + cleanup_type = + GET_FIELD(wr_data, + IGU_FIFO_CLEANUP_WR_DATA_CLEANUP_TYPE); + + sprintf(parsed_wr_data, + "cmd_type: cleanup, cleanup_val: %s, cleanup_type : %d, ", + cleanup_val ? "set" : "clear", + cleanup_type); + } else { + u8 update_flag, en_dis_int_for_sb, segment; + u8 timer_mask; + + update_flag = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_UPDATE_FLAG); + en_dis_int_for_sb = + GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_EN_DIS_INT_FOR_SB); + segment = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_SEGMENT); + timer_mask = GET_FIELD(wr_data, + IGU_FIFO_WR_DATA_TIMER_MASK); + + sprintf(parsed_wr_data, + "cmd_type: prod/cons update, prod/cons: 0x%x, update_flag: %s, en_dis_int_for_sb : %s, segment : %s, timer_mask = %d, ", + prod_cons, + update_flag ? "update" : "nop", + en_dis_int_for_sb ? + (en_dis_int_for_sb == 1 ? "disable" : "nop") : + "enable", + segment ? "attn" : "regular", + timer_mask); + } + } +out: + /* Add parsed element to parsed buffer */ + *results_offset += sprintf(qed_get_buf_ptr(results_buf, + *results_offset), + "raw: 0x%01x%08x%08x, %s: %d, source : %s, type : %s, cmd_addr : 0x%x(%s%s), %serror: %s\n", + element->dword2, element->dword1, + element->dword0, + is_pf ? "pf" : "vf", + GET_FIELD(element->dword0, + IGU_FIFO_ELEMENT_DWORD0_FID), + s_igu_fifo_source_strs[source], + is_wr_cmd ? "wr" : "rd", + cmd_addr, + (!is_pf && found_addr->vf_desc) + ? found_addr->vf_desc + : found_addr->desc, + parsed_addr_data, + parsed_wr_data, + s_igu_fifo_error_strs[err_type]); + + return DBG_STATUS_OK; +} + +/* Parses an IGU FIFO dump buffer. + * If result_buf is not NULL, the IGU FIFO results are printed to it. 
+ * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_igu_fifo_dump(u32 *dump_buf, + char *results_buf, + u32 *parsed_results_bytes) +{ + const char *section_name, *param_name, *param_str_val; + u32 param_num_val, num_section_params, num_elements; + struct igu_fifo_element *elements; + enum dbg_status status; + u32 results_offset = 0; + u8 i; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read igu_fifo_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "igu_fifo_data")) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, ¶m_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + if (param_num_val % IGU_FIFO_ELEMENT_DWORDS) + return DBG_STATUS_IGU_FIFO_BAD_DATA; + num_elements = param_num_val / IGU_FIFO_ELEMENT_DWORDS; + elements = (struct igu_fifo_element *)dump_buf; + + /* Decode elements */ + for (i = 0; i < num_elements; i++) { + status = qed_parse_igu_fifo_element(&elements[i], + results_buf, + &results_offset); + if (status != DBG_STATUS_OK) + return status; + } + + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "fifo contained %d elements", num_elements); + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + + return DBG_STATUS_OK; +} + +static enum dbg_status +qed_parse_protection_override_dump(u32 *dump_buf, + char *results_buf, + u32 *parsed_results_bytes) +{ + const char *section_name, *param_name, *param_str_val; + u32 param_num_val, num_section_params, num_elements; + struct protection_override_element *elements; + u32 results_offset = 0; + u8 i; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + /* Read protection_override_data section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "protection_override_data")) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + dump_buf += qed_read_param(dump_buf, + ¶m_name, ¶m_str_val, ¶m_num_val); + if (strcmp(param_name, "size")) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + if (param_num_val % PROTECTION_OVERRIDE_ELEMENT_DWORDS) + return DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA; + num_elements = param_num_val / PROTECTION_OVERRIDE_ELEMENT_DWORDS; + elements = (struct protection_override_element *)dump_buf; + + /* Decode elements */ + for (i = 0; i < num_elements; i++) { + u32 address = GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_ADDRESS) * + PROTECTION_OVERRIDE_ELEMENT_ADDR_FACTOR; + + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "window %2d, address: 0x%07x, size: %7d regs, read: %d, write: %d, read protection: %-12s, write protection: %-12s\n", + i, address, + (u32)GET_FIELD(elements[i].data, + 
PROTECTION_OVERRIDE_ELEMENT_WINDOW_SIZE), + (u32)GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_READ), + (u32)GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_WRITE), + s_protection_strs[GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_READ_PROTECTION)], + s_protection_strs[GET_FIELD(elements[i].data, + PROTECTION_OVERRIDE_ELEMENT_WRITE_PROTECTION)]); + } + + results_offset += sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "protection override contained %d elements", + num_elements); + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + + return DBG_STATUS_OK; +} + +/* Parses a FW Asserts dump buffer. + * If result_buf is not NULL, the FW Asserts results are printed to it. + * In any case, the required results buffer size is assigned to + * parsed_results_bytes. + * The parsing status is returned. + */ +static enum dbg_status qed_parse_fw_asserts_dump(u32 *dump_buf, + char *results_buf, + u32 *parsed_results_bytes) +{ + u32 num_section_params, param_num_val, i, results_offset = 0; + const char *param_name, *param_str_val, *section_name; + bool last_section_found = false; + + *parsed_results_bytes = 0; + + /* Read global_params section */ + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, &num_section_params); + if (strcmp(section_name, "global_params")) + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + + /* Print global params */ + dump_buf += qed_print_section_params(dump_buf, + num_section_params, + results_buf, &results_offset); + + while (!last_section_found) { + dump_buf += qed_read_section_hdr(dump_buf, + §ion_name, + &num_section_params); + if (!strcmp(section_name, "fw_asserts")) { + /* Extract params */ + const char *storm_letter = NULL; + u32 storm_dump_size = 0; + + for (i = 0; i < num_section_params; i++) { + dump_buf += qed_read_param(dump_buf, + ¶m_name, + ¶m_str_val, + ¶m_num_val); + if (!strcmp(param_name, "storm")) + storm_letter = param_str_val; + else if (!strcmp(param_name, "size")) + storm_dump_size = param_num_val; + else + return + DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + } + + if (!storm_letter || !storm_dump_size) + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + + /* Print data */ + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "\n%sSTORM_ASSERT: size=%d\n", + storm_letter, storm_dump_size); + for (i = 0; i < storm_dump_size; i++, dump_buf++) + results_offset += + sprintf(qed_get_buf_ptr(results_buf, + results_offset), + "%08x\n", *dump_buf); + } else if (!strcmp(section_name, "last")) { + last_section_found = true; + } else { + return DBG_STATUS_FW_ASSERTS_PARSE_FAILED; + } + } + + /* Add 1 for string NULL termination */ + *parsed_results_bytes = results_offset + 1; + + return DBG_STATUS_OK; +} + +/***************************** Public Functions *******************************/ + +enum dbg_status qed_dbg_user_set_bin_ptr(const u8 * const bin_ptr) +{ + struct bin_buffer_hdr *buf_array = (struct bin_buffer_hdr *)bin_ptr; + u8 buf_id; + + /* Convert binary data to debug arrays */ + for (buf_id = 0; buf_id < MAX_BIN_DBG_BUFFER_TYPE; buf_id++) { + s_user_dbg_arrays[buf_id].ptr = + (u32 *)(bin_ptr + buf_array[buf_id].offset); + s_user_dbg_arrays[buf_id].size_in_dwords = + BYTES_TO_DWORDS(buf_array[buf_id].length); + } + + return DBG_STATUS_OK; +} + +const char *qed_dbg_get_status_str(enum dbg_status status) +{ + return (status < + MAX_DBG_STATUS) ? 
s_status_str[status] : "Invalid debug status"; +} + +enum dbg_status qed_get_idle_chk_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + u32 num_errors, num_warnings; + + return qed_parse_idle_chk_dump(dump_buf, + num_dumped_dwords, + NULL, + results_buf_size, + &num_errors, &num_warnings); +} + +enum dbg_status qed_print_idle_chk_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *num_errors, + u32 *num_warnings) +{ + u32 parsed_buf_size; + + return qed_parse_idle_chk_dump(dump_buf, + num_dumped_dwords, + results_buf, + &parsed_buf_size, + num_errors, num_warnings); +} + +void qed_dbg_mcp_trace_set_meta_data(u32 *data, u32 size) +{ + s_mcp_trace_meta_arr.ptr = data; + s_mcp_trace_meta_arr.size_in_dwords = size; +} + +enum dbg_status qed_get_mcp_trace_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_mcp_trace_dump(p_hwfn, + dump_buf, NULL, results_buf_size); +} + +enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_mcp_trace_dump(p_hwfn, + dump_buf, + results_buf, &parsed_buf_size); +} + +enum dbg_status qed_print_mcp_trace_line(u8 *dump_buf, + u32 num_dumped_bytes, + char *results_buf) +{ + u32 parsed_bytes; + + return qed_parse_mcp_trace_buf(dump_buf, + num_dumped_bytes, + 0, + num_dumped_bytes, + results_buf, &parsed_bytes); +} + +enum dbg_status qed_get_reg_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_reg_fifo_dump(dump_buf, NULL, results_buf_size); +} + +enum dbg_status qed_print_reg_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_reg_fifo_dump(dump_buf, results_buf, &parsed_buf_size); +} + +enum dbg_status qed_get_igu_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_igu_fifo_dump(dump_buf, NULL, results_buf_size); +} + +enum dbg_status qed_print_igu_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_igu_fifo_dump(dump_buf, results_buf, &parsed_buf_size); +} + +enum dbg_status +qed_get_protection_override_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_protection_override_dump(dump_buf, + NULL, results_buf_size); +} + +enum dbg_status qed_print_protection_override_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_protection_override_dump(dump_buf, + results_buf, + &parsed_buf_size); +} + +enum dbg_status qed_get_fw_asserts_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size) +{ + return qed_parse_fw_asserts_dump(dump_buf, NULL, results_buf_size); +} + +enum dbg_status qed_print_fw_asserts_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 parsed_buf_size; + + return qed_parse_fw_asserts_dump(dump_buf, + results_buf, &parsed_buf_size); +} + +enum dbg_status qed_dbg_parse_attn(struct qed_hwfn *p_hwfn, + struct 
dbg_attn_block_result *results) +{ + struct user_dbg_array *block_attn, *pstrings; + const u32 *block_attn_name_offsets; + enum dbg_attn_type attn_type; + const char *block_name; + u8 num_regs, i, j; + + num_regs = GET_FIELD(results->data, DBG_ATTN_BLOCK_RESULT_NUM_REGS); + attn_type = (enum dbg_attn_type) + GET_FIELD(results->data, + DBG_ATTN_BLOCK_RESULT_ATTN_TYPE); + block_name = s_block_info_arr[results->block_id].name; + + if (!s_user_dbg_arrays[BIN_BUF_DBG_ATTN_INDEXES].ptr || + !s_user_dbg_arrays[BIN_BUF_DBG_ATTN_NAME_OFFSETS].ptr || + !s_user_dbg_arrays[BIN_BUF_DBG_PARSING_STRINGS].ptr) + return DBG_STATUS_DBG_ARRAY_NOT_SET; + + block_attn = &s_user_dbg_arrays[BIN_BUF_DBG_ATTN_NAME_OFFSETS]; + block_attn_name_offsets = &block_attn->ptr[results->names_offset]; + + /* Go over registers with a non-zero attention status */ + for (i = 0; i < num_regs; i++) { + struct dbg_attn_bit_mapping *bit_mapping; + struct dbg_attn_reg_result *reg_result; + u8 num_reg_attn, bit_idx = 0; + + reg_result = &results->reg_results[i]; + num_reg_attn = GET_FIELD(reg_result->data, + DBG_ATTN_REG_RESULT_NUM_REG_ATTN); + block_attn = &s_user_dbg_arrays[BIN_BUF_DBG_ATTN_INDEXES]; + bit_mapping = &((struct dbg_attn_bit_mapping *) + block_attn->ptr)[reg_result->block_attn_offset]; + + pstrings = &s_user_dbg_arrays[BIN_BUF_DBG_PARSING_STRINGS]; + + /* Go over attention status bits */ + for (j = 0; j < num_reg_attn; j++) { + u16 attn_idx_val = GET_FIELD(bit_mapping[j].data, + DBG_ATTN_BIT_MAPPING_VAL); + const char *attn_name, *attn_type_str, *masked_str; + u32 attn_name_offset, sts_addr; + + /* Check if bit mask should be advanced (due to unused + * bits). + */ + if (GET_FIELD(bit_mapping[j].data, + DBG_ATTN_BIT_MAPPING_IS_UNUSED_BIT_CNT)) { + bit_idx += (u8)attn_idx_val; + continue; + } + + /* Check current bit index */ + if (!(reg_result->sts_val & BIT(bit_idx))) { + bit_idx++; + continue; + } + + /* Find attention name */ + attn_name_offset = + block_attn_name_offsets[attn_idx_val]; + attn_name = &((const char *) + pstrings->ptr)[attn_name_offset]; + attn_type_str = attn_type == ATTN_TYPE_INTERRUPT ? + "Interrupt" : "Parity"; + masked_str = reg_result->mask_val & BIT(bit_idx) ? 
+ " [masked]" : ""; + sts_addr = GET_FIELD(reg_result->data, + DBG_ATTN_REG_RESULT_STS_ADDRESS); + DP_NOTICE(p_hwfn, + "%s (%s) : %s [address 0x%08x, bit %d]%s\n", + block_name, attn_type_str, attn_name, + sts_addr, bit_idx, masked_str); + + bit_idx++; + } + } + + return DBG_STATUS_OK; +} + +/* Wrapper for unifying the idle_chk and mcp_trace api */ +static enum dbg_status +qed_print_idle_chk_results_wrapper(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf) +{ + u32 num_errors, num_warnnings; + + return qed_print_idle_chk_results(p_hwfn, dump_buf, num_dumped_dwords, + results_buf, &num_errors, + &num_warnnings); +} + +/* Feature meta data lookup table */ +static struct { + char *name; + enum dbg_status (*get_size)(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *size); + enum dbg_status (*perform_dump)(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, + u32 buf_size, u32 *dumped_dwords); + enum dbg_status (*print_results)(struct qed_hwfn *p_hwfn, + u32 *dump_buf, u32 num_dumped_dwords, + char *results_buf); + enum dbg_status (*results_buf_size)(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +} qed_features_lookup[] = { + { + "grc", qed_dbg_grc_get_dump_buf_size, + qed_dbg_grc_dump, NULL, NULL}, { + "idle_chk", + qed_dbg_idle_chk_get_dump_buf_size, + qed_dbg_idle_chk_dump, + qed_print_idle_chk_results_wrapper, + qed_get_idle_chk_results_buf_size}, { + "mcp_trace", + qed_dbg_mcp_trace_get_dump_buf_size, + qed_dbg_mcp_trace_dump, qed_print_mcp_trace_results, + qed_get_mcp_trace_results_buf_size}, { + "reg_fifo", + qed_dbg_reg_fifo_get_dump_buf_size, + qed_dbg_reg_fifo_dump, qed_print_reg_fifo_results, + qed_get_reg_fifo_results_buf_size}, { + "igu_fifo", + qed_dbg_igu_fifo_get_dump_buf_size, + qed_dbg_igu_fifo_dump, qed_print_igu_fifo_results, + qed_get_igu_fifo_results_buf_size}, { + "protection_override", + qed_dbg_protection_override_get_dump_buf_size, + qed_dbg_protection_override_dump, + qed_print_protection_override_results, + qed_get_protection_override_results_buf_size}, { + "fw_asserts", + qed_dbg_fw_asserts_get_dump_buf_size, + qed_dbg_fw_asserts_dump, + qed_print_fw_asserts_results, + qed_get_fw_asserts_results_buf_size},}; + +static void qed_dbg_print_feature(u8 *p_text_buf, u32 text_size) +{ + u32 i, precision = 80; + + if (!p_text_buf) + return; + + pr_notice("\n%.*s", precision, p_text_buf); + for (i = precision; i < text_size; i += precision) + pr_cont("%.*s", precision, p_text_buf + i); + pr_cont("\n"); +} + +#define QED_RESULTS_BUF_MIN_SIZE 16 +/* Generic function for decoding debug feature info */ +static enum dbg_status format_feature(struct qed_hwfn *p_hwfn, + enum qed_dbg_features feature_idx) +{ + struct qed_dbg_feature *feature = + &p_hwfn->cdev->dbg_params.features[feature_idx]; + u32 text_size_bytes, null_char_pos, i; + enum dbg_status rc; + char *text_buf; + + /* Check if feature supports formatting capability */ + if (!qed_features_lookup[feature_idx].results_buf_size) + return DBG_STATUS_OK; + + /* Obtain size of formatted output */ + rc = qed_features_lookup[feature_idx]. 
+ results_buf_size(p_hwfn, (u32 *)feature->dump_buf, + feature->dumped_dwords, &text_size_bytes); + if (rc != DBG_STATUS_OK) + return rc; + + /* Make sure that the allocated size is a multiple of dword (4 bytes) */ + null_char_pos = text_size_bytes - 1; + text_size_bytes = (text_size_bytes + 3) & ~0x3; + + if (text_size_bytes < QED_RESULTS_BUF_MIN_SIZE) { + DP_NOTICE(p_hwfn->cdev, + "formatted size of feature was too small %d. Aborting\n", + text_size_bytes); + return DBG_STATUS_INVALID_ARGS; + } + + /* Allocate temp text buf */ + text_buf = vzalloc(text_size_bytes); + if (!text_buf) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + /* Decode feature opcodes to string on temp buf */ + rc = qed_features_lookup[feature_idx]. + print_results(p_hwfn, (u32 *)feature->dump_buf, + feature->dumped_dwords, text_buf); + if (rc != DBG_STATUS_OK) { + vfree(text_buf); + return rc; + } + + /* Replace the original null character with a '\n' character. + * The bytes that were added as a result of the dword alignment are also + * padded with '\n' characters. + */ + for (i = null_char_pos; i < text_size_bytes; i++) + text_buf[i] = '\n'; + + /* Dump printable feature to log */ + if (p_hwfn->cdev->dbg_params.print_data) + qed_dbg_print_feature(text_buf, text_size_bytes); + + /* Free the old dump_buf and point the dump_buf to the newly allocated + * and formatted text buffer. + */ + vfree(feature->dump_buf); + feature->dump_buf = text_buf; + feature->buf_size = text_size_bytes; + feature->dumped_dwords = text_size_bytes / 4; + return rc; +} + +/* Generic function for performing the dump of a debug feature. */ +static enum dbg_status qed_dbg_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_dbg_features feature_idx) +{ + struct qed_dbg_feature *feature = + &p_hwfn->cdev->dbg_params.features[feature_idx]; + u32 buf_size_dwords; + enum dbg_status rc; + + DP_NOTICE(p_hwfn->cdev, "Collecting a debug feature [\"%s\"]\n", + qed_features_lookup[feature_idx].name); + + /* Dump_buf may already be allocated; need to free it (this can happen if dump + * was called but file was never read). + * We can't use the buffer as is since size may have changed. + */ + if (feature->dump_buf) { + vfree(feature->dump_buf); + feature->dump_buf = NULL; + } + + /* Get buffer size from hsi, allocate accordingly, and perform the + * dump. + */ + rc = qed_features_lookup[feature_idx].get_size(p_hwfn, p_ptt, + &buf_size_dwords); + if (rc != DBG_STATUS_OK && rc != DBG_STATUS_NVRAM_GET_IMAGE_FAILED) + return rc; + feature->buf_size = buf_size_dwords * sizeof(u32); + feature->dump_buf = vmalloc(feature->buf_size); + if (!feature->dump_buf) + return DBG_STATUS_VIRT_MEM_ALLOC_FAILED; + + rc = qed_features_lookup[feature_idx]. + perform_dump(p_hwfn, p_ptt, (u32 *)feature->dump_buf, + feature->buf_size / sizeof(u32), + &feature->dumped_dwords); + + /* If mcp is stuck we get DBG_STATUS_NVRAM_GET_IMAGE_FAILED error. + * In this case the buffer holds valid binary data, but we won't be able + * to parse it (since parsing relies on data in NVRAM which is only + * accessible when MFW is responsive). Skip the formatting but return + * success so that binary data is provided.
+ */ + if (rc == DBG_STATUS_NVRAM_GET_IMAGE_FAILED) + return DBG_STATUS_OK; + + if (rc != DBG_STATUS_OK) + return rc; + + /* Format output */ + rc = format_feature(p_hwfn, feature_idx); + return rc; +} + +int qed_dbg_grc(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_GRC, num_dumped_bytes); +} + +int qed_dbg_grc_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_GRC); +} + +int qed_dbg_idle_chk(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_IDLE_CHK, + num_dumped_bytes); +} + +int qed_dbg_idle_chk_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_IDLE_CHK); +} + +int qed_dbg_reg_fifo(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_REG_FIFO, + num_dumped_bytes); +} + +int qed_dbg_reg_fifo_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_REG_FIFO); +} + +int qed_dbg_igu_fifo(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_IGU_FIFO, + num_dumped_bytes); +} + +int qed_dbg_igu_fifo_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_IGU_FIFO); +} + +int qed_dbg_protection_override(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_PROTECTION_OVERRIDE, + num_dumped_bytes); +} + +int qed_dbg_protection_override_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_PROTECTION_OVERRIDE); +} + +int qed_dbg_fw_asserts(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_FW_ASSERTS, + num_dumped_bytes); +} + +int qed_dbg_fw_asserts_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_FW_ASSERTS); +} + +int qed_dbg_mcp_trace(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes) +{ + return qed_dbg_feature(cdev, buffer, DBG_FEATURE_MCP_TRACE, + num_dumped_bytes); +} + +int qed_dbg_mcp_trace_size(struct qed_dev *cdev) +{ + return qed_dbg_feature_size(cdev, DBG_FEATURE_MCP_TRACE); +} + +/* Defines the amount of bytes allocated for recording the length of debugfs + * feature buffer. + */ +#define REGDUMP_HEADER_SIZE sizeof(u32) +#define REGDUMP_HEADER_FEATURE_SHIFT 24 +#define REGDUMP_HEADER_ENGINE_SHIFT 31 +#define REGDUMP_HEADER_OMIT_ENGINE_SHIFT 30 +enum debug_print_features { + OLD_MODE = 0, + IDLE_CHK = 1, + GRC_DUMP = 2, + MCP_TRACE = 3, + REG_FIFO = 4, + PROTECTION_OVERRIDE = 5, + IGU_FIFO = 6, + PHY = 7, + FW_ASSERTS = 8, +}; + +static u32 qed_calc_regdump_header(enum debug_print_features feature, + int engine, u32 feature_size, u8 omit_engine) +{ + /* Insert the engine, feature and mode inside the header and combine it + * with feature size. 
+ */ + return feature_size | (feature << REGDUMP_HEADER_FEATURE_SHIFT) | + (omit_engine << REGDUMP_HEADER_OMIT_ENGINE_SHIFT) | + (engine << REGDUMP_HEADER_ENGINE_SHIFT); +} + +int qed_dbg_all_data(struct qed_dev *cdev, void *buffer) +{ + u8 cur_engine, omit_engine = 0, org_engine; + u32 offset = 0, feature_size; + int rc; + + if (cdev->num_hwfns == 1) + omit_engine = 1; + + org_engine = qed_get_debug_engine(cdev); + for (cur_engine = 0; cur_engine < cdev->num_hwfns; cur_engine++) { + /* Collect idle_chks and grcDump for each hw function */ + DP_VERBOSE(cdev, QED_MSG_DEBUG, + "obtaining idle_chk and grcdump for current engine\n"); + qed_set_debug_engine(cdev, cur_engine); + + /* First idle_chk */ + rc = qed_dbg_idle_chk(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(IDLE_CHK, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_idle_chk failed. rc = %d\n", rc); + } + + /* Second idle_chk */ + rc = qed_dbg_idle_chk(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(IDLE_CHK, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_idle_chk failed. rc = %d\n", rc); + } + + /* reg_fifo dump */ + rc = qed_dbg_reg_fifo(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(REG_FIFO, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_reg_fifo failed. rc = %d\n", rc); + } + + /* igu_fifo dump */ + rc = qed_dbg_igu_fifo(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(IGU_FIFO, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_igu_fifo failed. rc = %d", rc); + } + + /* protection_override dump */ + rc = qed_dbg_protection_override(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, + &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(PROTECTION_OVERRIDE, + cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, + "qed_dbg_protection_override failed. rc = %d\n", + rc); + } + + /* fw_asserts dump */ + rc = qed_dbg_fw_asserts(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(FW_ASSERTS, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_fw_asserts failed. rc = %d\n", + rc); + } + + /* GRC dump - must be last because when mcp stuck it will + * clutter idle_chk, reg_fifo, ... + */ + rc = qed_dbg_grc(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(GRC_DUMP, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_grc failed. 
rc = %d", rc); + } + } + + qed_set_debug_engine(cdev, org_engine); + /* mcp_trace */ + rc = qed_dbg_mcp_trace(cdev, (u8 *)buffer + offset + + REGDUMP_HEADER_SIZE, &feature_size); + if (!rc) { + *(u32 *)((u8 *)buffer + offset) = + qed_calc_regdump_header(MCP_TRACE, cur_engine, + feature_size, omit_engine); + offset += (feature_size + REGDUMP_HEADER_SIZE); + } else { + DP_ERR(cdev, "qed_dbg_mcp_trace failed. rc = %d\n", rc); + } + + return 0; +} + +int qed_dbg_all_data_size(struct qed_dev *cdev) +{ + u8 cur_engine, org_engine; + u32 regs_len = 0; + + org_engine = qed_get_debug_engine(cdev); + for (cur_engine = 0; cur_engine < cdev->num_hwfns; cur_engine++) { + /* Engine specific */ + DP_VERBOSE(cdev, QED_MSG_DEBUG, + "calculating idle_chk and grcdump register length for current engine\n"); + qed_set_debug_engine(cdev, cur_engine); + regs_len += REGDUMP_HEADER_SIZE + qed_dbg_idle_chk_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_idle_chk_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_grc_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_reg_fifo_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_igu_fifo_size(cdev) + + REGDUMP_HEADER_SIZE + + qed_dbg_protection_override_size(cdev) + + REGDUMP_HEADER_SIZE + qed_dbg_fw_asserts_size(cdev); + } + + qed_set_debug_engine(cdev, org_engine); + + /* Engine common */ + regs_len += REGDUMP_HEADER_SIZE + qed_dbg_mcp_trace_size(cdev); + + return regs_len; +} + +int qed_dbg_feature(struct qed_dev *cdev, void *buffer, + enum qed_dbg_features feature, u32 *num_dumped_bytes) +{ + struct qed_hwfn *p_hwfn = + &cdev->hwfns[cdev->dbg_params.engine_for_debug]; + struct qed_dbg_feature *qed_feature = + &cdev->dbg_params.features[feature]; + enum dbg_status dbg_rc; + struct qed_ptt *p_ptt; + int rc = 0; + + /* Acquire ptt */ + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EINVAL; + + /* Get dump */ + dbg_rc = qed_dbg_dump(p_hwfn, p_ptt, feature); + if (dbg_rc != DBG_STATUS_OK) { + DP_VERBOSE(cdev, QED_MSG_DEBUG, "%s\n", + qed_dbg_get_status_str(dbg_rc)); + *num_dumped_bytes = 0; + rc = -EINVAL; + goto out; + } + + DP_VERBOSE(cdev, QED_MSG_DEBUG, + "copying debugfs feature to external buffer\n"); + memcpy(buffer, qed_feature->dump_buf, qed_feature->buf_size); + *num_dumped_bytes = cdev->dbg_params.features[feature].dumped_dwords * + 4; + +out: + qed_ptt_release(p_hwfn, p_ptt); + return rc; +} + +int qed_dbg_feature_size(struct qed_dev *cdev, enum qed_dbg_features feature) +{ + struct qed_hwfn *p_hwfn = + &cdev->hwfns[cdev->dbg_params.engine_for_debug]; + struct qed_ptt *p_ptt = qed_ptt_acquire(p_hwfn); + struct qed_dbg_feature *qed_feature = + &cdev->dbg_params.features[feature]; + u32 buf_size_dwords; + enum dbg_status rc; + + if (!p_ptt) + return -EINVAL; + + rc = qed_features_lookup[feature].get_size(p_hwfn, p_ptt, + &buf_size_dwords); + if (rc != DBG_STATUS_OK) + buf_size_dwords = 0; + + qed_ptt_release(p_hwfn, p_ptt); + qed_feature->buf_size = buf_size_dwords * sizeof(u32); + return qed_feature->buf_size; +} + +u8 qed_get_debug_engine(struct qed_dev *cdev) +{ + return cdev->dbg_params.engine_for_debug; +} + +void qed_set_debug_engine(struct qed_dev *cdev, int engine_number) +{ + DP_VERBOSE(cdev, QED_MSG_DEBUG, "set debug engine to %d\n", + engine_number); + cdev->dbg_params.engine_for_debug = engine_number; +} + +void qed_dbg_pf_init(struct qed_dev *cdev) +{ + const u8 *dbg_values; + + /* Debug values are after init values. + * The offset is the first dword of the file. 
+ */ + dbg_values = cdev->firmware->data + *(u32 *)cdev->firmware->data; + qed_dbg_set_bin_ptr((u8 *)dbg_values); + qed_dbg_user_set_bin_ptr((u8 *)dbg_values); +} + +void qed_dbg_pf_exit(struct qed_dev *cdev) +{ + struct qed_dbg_feature *feature = NULL; + enum qed_dbg_features feature_idx; + + /* Debug features' buffers may be allocated if debug feature was used + * but dump wasn't called. + */ + for (feature_idx = 0; feature_idx < DBG_FEATURE_NUM; feature_idx++) { + feature = &cdev->dbg_params.features[feature_idx]; + if (feature->dump_buf) { + vfree(feature->dump_buf); + feature->dump_buf = NULL; + } + } +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.h b/drivers/net/ethernet/qlogic/qed/qed_debug.h new file mode 100644 index 0000000..ea1cc8e --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.h @@ -0,0 +1,57 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef _QED_DEBUGFS_H +#define _QED_DEBUGFS_H + +enum qed_dbg_features { + DBG_FEATURE_GRC, + DBG_FEATURE_IDLE_CHK, + DBG_FEATURE_MCP_TRACE, + DBG_FEATURE_REG_FIFO, + DBG_FEATURE_IGU_FIFO, + DBG_FEATURE_PROTECTION_OVERRIDE, + DBG_FEATURE_FW_ASSERTS, + DBG_FEATURE_NUM +}; + +/* Forward Declaration */ +struct qed_dev; + +int qed_dbg_grc(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes); +int qed_dbg_grc_size(struct qed_dev *cdev); +int qed_dbg_idle_chk(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_idle_chk_size(struct qed_dev *cdev); +int qed_dbg_reg_fifo(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_reg_fifo_size(struct qed_dev *cdev); +int qed_dbg_igu_fifo(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_igu_fifo_size(struct qed_dev *cdev); +int qed_dbg_protection_override(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_protection_override_size(struct qed_dev *cdev); +int qed_dbg_fw_asserts(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_fw_asserts_size(struct qed_dev *cdev); +int qed_dbg_mcp_trace(struct qed_dev *cdev, void *buffer, + u32 *num_dumped_bytes); +int qed_dbg_mcp_trace_size(struct qed_dev *cdev); +int qed_dbg_all_data(struct qed_dev *cdev, void *buffer); +int qed_dbg_all_data_size(struct qed_dev *cdev); +u8 qed_get_debug_engine(struct qed_dev *cdev); +void qed_set_debug_engine(struct qed_dev *cdev, int engine_number); +int qed_dbg_feature(struct qed_dev *cdev, void *buffer, + enum qed_dbg_features feature, u32 *num_dumped_bytes); +int qed_dbg_feature_size(struct qed_dev *cdev, enum qed_dbg_features feature); + +void qed_dbg_pf_init(struct qed_dev *cdev); +void qed_dbg_pf_exit(struct qed_dev *cdev); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c new file mode 100644 index 0000000..d2ad5e9 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -0,0 +1,4262 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_cxt.h" +#include "qed_dcbx.h" +#include "qed_dev_api.h" +#include "qed_fcoe.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_init_ops.h" +#include "qed_int.h" +#include "qed_iscsi.h" +#include "qed_ll2.h" +#include "qed_mcp.h" +#include "qed_ooo.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" +#include "qed_sriov.h" +#include "qed_vf.h" +#include "qed_rdma.h" + +static DEFINE_SPINLOCK(qm_lock); + +#define QED_MIN_DPIS (4) +#define QED_MIN_PWM_REGION (QED_WID_SIZE * QED_MIN_DPIS) + +static u32 qed_hw_bar_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, enum BAR_ID bar_id) +{ + u32 bar_reg = (bar_id == BAR_ID_0 ? + PGLUE_B_REG_PF_BAR0_SIZE : PGLUE_B_REG_PF_BAR1_SIZE); + u32 val; + + if (IS_VF(p_hwfn->cdev)) + return qed_vf_hw_bar_size(p_hwfn, bar_id); + + val = qed_rd(p_hwfn, p_ptt, bar_reg); + if (val) + return 1 << (val + 15); + + /* Old MFW initialized above registered only conditionally */ + if (p_hwfn->cdev->num_hwfns > 1) { + DP_INFO(p_hwfn, + "BAR size not configured. Assuming BAR size of 256kB for GRC and 512kB for DB\n"); + return BAR_ID_0 ? 256 * 1024 : 512 * 1024; + } else { + DP_INFO(p_hwfn, + "BAR size not configured. 
Assuming BAR size of 512kB for GRC and 512kB for DB\n"); + return 512 * 1024; + } +} + +void qed_init_dp(struct qed_dev *cdev, u32 dp_module, u8 dp_level) +{ + u32 i; + + cdev->dp_level = dp_level; + cdev->dp_module = dp_module; + for (i = 0; i < MAX_HWFNS_PER_DEVICE; i++) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + p_hwfn->dp_level = dp_level; + p_hwfn->dp_module = dp_module; + } +} + +void qed_init_struct(struct qed_dev *cdev) +{ + u8 i; + + for (i = 0; i < MAX_HWFNS_PER_DEVICE; i++) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + p_hwfn->cdev = cdev; + p_hwfn->my_id = i; + p_hwfn->b_active = false; + + mutex_init(&p_hwfn->dmae_info.mutex); + } + + /* hwfn 0 is always active */ + cdev->hwfns[0].b_active = true; + + /* set the default cache alignment to 128 */ + cdev->cache_shift = 7; +} + +static void qed_qm_info_free(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + + kfree(qm_info->qm_pq_params); + qm_info->qm_pq_params = NULL; + kfree(qm_info->qm_vport_params); + qm_info->qm_vport_params = NULL; + kfree(qm_info->qm_port_params); + qm_info->qm_port_params = NULL; + kfree(qm_info->wfq_data); + qm_info->wfq_data = NULL; +} + +void qed_resc_free(struct qed_dev *cdev) +{ + int i; + + if (IS_VF(cdev)) { + for_each_hwfn(cdev, i) + qed_l2_free(&cdev->hwfns[i]); + return; + } + + kfree(cdev->fw_data); + cdev->fw_data = NULL; + + kfree(cdev->reset_stats); + cdev->reset_stats = NULL; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + qed_cxt_mngr_free(p_hwfn); + qed_qm_info_free(p_hwfn); + qed_spq_free(p_hwfn); + qed_eq_free(p_hwfn); + qed_consq_free(p_hwfn); + qed_int_free(p_hwfn); +#ifdef CONFIG_QED_LL2 + qed_ll2_free(p_hwfn); +#endif + if (p_hwfn->hw_info.personality == QED_PCI_FCOE) + qed_fcoe_free(p_hwfn); + + if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) { + qed_iscsi_free(p_hwfn); + qed_ooo_free(p_hwfn); + } + qed_iov_free(p_hwfn); + qed_l2_free(p_hwfn); + qed_dmae_info_free(p_hwfn); + qed_dcbx_info_free(p_hwfn); + } +} + +/******************** QM initialization *******************/ +#define ACTIVE_TCS_BMAP 0x9f +#define ACTIVE_TCS_BMAP_4PORT_K2 0xf + +/* determines the physical queue flags for a given PF. */ +static u32 qed_get_pq_flags(struct qed_hwfn *p_hwfn) +{ + u32 flags; + + /* common flags */ + flags = PQ_FLAGS_LB; + + /* feature flags */ + if (IS_QED_SRIOV(p_hwfn->cdev)) + flags |= PQ_FLAGS_VFS; + + /* protocol flags */ + switch (p_hwfn->hw_info.personality) { + case QED_PCI_ETH: + flags |= PQ_FLAGS_MCOS; + break; + case QED_PCI_FCOE: + flags |= PQ_FLAGS_OFLD; + break; + case QED_PCI_ISCSI: + flags |= PQ_FLAGS_ACK | PQ_FLAGS_OOO | PQ_FLAGS_OFLD; + break; + case QED_PCI_ETH_ROCE: + flags |= PQ_FLAGS_MCOS | PQ_FLAGS_OFLD | PQ_FLAGS_LLT; + break; + case QED_PCI_ETH_IWARP: + flags |= PQ_FLAGS_MCOS | PQ_FLAGS_ACK | PQ_FLAGS_OOO | + PQ_FLAGS_OFLD; + break; + default: + DP_ERR(p_hwfn, + "unknown personality %d\n", p_hwfn->hw_info.personality); + return 0; + } + + return flags; +} + +/* Getters for resource amounts necessary for qm initialization */ +u8 qed_init_qm_get_num_tcs(struct qed_hwfn *p_hwfn) +{ + return p_hwfn->hw_info.num_hw_tc; +} + +u16 qed_init_qm_get_num_vfs(struct qed_hwfn *p_hwfn) +{ + return IS_QED_SRIOV(p_hwfn->cdev) ? 
+ p_hwfn->cdev->p_iov_info->total_vfs : 0; +} + +#define NUM_DEFAULT_RLS 1 + +u16 qed_init_qm_get_num_pf_rls(struct qed_hwfn *p_hwfn) +{ + u16 num_pf_rls, num_vfs = qed_init_qm_get_num_vfs(p_hwfn); + + /* num RLs can't exceed resource amount of rls or vports */ + num_pf_rls = (u16) min_t(u32, RESC_NUM(p_hwfn, QED_RL), + RESC_NUM(p_hwfn, QED_VPORT)); + + /* Make sure after we reserve there's something left */ + if (num_pf_rls < num_vfs + NUM_DEFAULT_RLS) + return 0; + + /* subtract rls necessary for VFs and one default one for the PF */ + num_pf_rls -= num_vfs + NUM_DEFAULT_RLS; + + return num_pf_rls; +} + +u16 qed_init_qm_get_num_vports(struct qed_hwfn *p_hwfn) +{ + u32 pq_flags = qed_get_pq_flags(p_hwfn); + + /* all pqs share the same vport, except for vfs and pf_rl pqs */ + return (!!(PQ_FLAGS_RLS & pq_flags)) * + qed_init_qm_get_num_pf_rls(p_hwfn) + + (!!(PQ_FLAGS_VFS & pq_flags)) * + qed_init_qm_get_num_vfs(p_hwfn) + 1; +} + +/* calc amount of PQs according to the requested flags */ +u16 qed_init_qm_get_num_pqs(struct qed_hwfn *p_hwfn) +{ + u32 pq_flags = qed_get_pq_flags(p_hwfn); + + return (!!(PQ_FLAGS_RLS & pq_flags)) * + qed_init_qm_get_num_pf_rls(p_hwfn) + + (!!(PQ_FLAGS_MCOS & pq_flags)) * + qed_init_qm_get_num_tcs(p_hwfn) + + (!!(PQ_FLAGS_LB & pq_flags)) + (!!(PQ_FLAGS_OOO & pq_flags)) + + (!!(PQ_FLAGS_ACK & pq_flags)) + (!!(PQ_FLAGS_OFLD & pq_flags)) + + (!!(PQ_FLAGS_LLT & pq_flags)) + + (!!(PQ_FLAGS_VFS & pq_flags)) * qed_init_qm_get_num_vfs(p_hwfn); +} + +/* initialize the top level QM params */ +static void qed_init_qm_params(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + bool four_port; + + /* pq and vport bases for this PF */ + qm_info->start_pq = (u16) RESC_START(p_hwfn, QED_PQ); + qm_info->start_vport = (u8) RESC_START(p_hwfn, QED_VPORT); + + /* rate limiting and weighted fair queueing are always enabled */ + qm_info->vport_rl_en = true; + qm_info->vport_wfq_en = true; + + /* TC config is different for AH 4 port */ + four_port = p_hwfn->cdev->num_ports_in_engine == MAX_NUM_PORTS_K2; + + /* in AH 4 port we have fewer TCs per port */ + qm_info->max_phys_tcs_per_port = four_port ? NUM_PHYS_TCS_4PORT_K2 : + NUM_OF_PHYS_TCS; + + /* unless MFW indicated otherwise, ooo_tc == 3 for + * AH 4-port and 4 otherwise. + */ + if (!qm_info->ooo_tc) + qm_info->ooo_tc = four_port ? DCBX_TCP_OOO_K2_4PORT_TC : + DCBX_TCP_OOO_TC; +} + +/* initialize qm vport params */ +static void qed_init_qm_vport_params(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + u8 i; + + /* all vports participate in weighted fair queueing */ + for (i = 0; i < qed_init_qm_get_num_vports(p_hwfn); i++) + qm_info->qm_vport_params[i].vport_wfq = 1; +} + +/* initialize qm port params */ +static void qed_init_qm_port_params(struct qed_hwfn *p_hwfn) +{ + /* Initialize qm port parameters */ + u8 i, active_phys_tcs, num_ports = p_hwfn->cdev->num_ports_in_engine; + + /* indicate how ooo and high pri traffic is dealt with */ + active_phys_tcs = num_ports == MAX_NUM_PORTS_K2 ? + ACTIVE_TCS_BMAP_4PORT_K2 : + ACTIVE_TCS_BMAP; + + for (i = 0; i < num_ports; i++) { + struct init_qm_port_params *p_qm_port = + &p_hwfn->qm_info.qm_port_params[i]; + + p_qm_port->active = 1; + p_qm_port->active_phys_tcs = active_phys_tcs; + p_qm_port->num_pbf_cmd_lines = PBF_MAX_CMD_LINES / num_ports; + p_qm_port->num_btb_blocks = BTB_MAX_BLOCKS / num_ports; + } +} + +/* Reset the params which must be reset for qm init. 
QM init may be called as + * a result of flows other than driver load (e.g. dcbx renegotiation). Other + * params may be affected by the init but would simply recalculate to the same + * values. The allocations made for QM init, ports, vports, pqs and vfqs are not + * affected as these amounts stay the same. + */ +static void qed_init_qm_reset_params(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + + qm_info->num_pqs = 0; + qm_info->num_vports = 0; + qm_info->num_pf_rls = 0; + qm_info->num_vf_pqs = 0; + qm_info->first_vf_pq = 0; + qm_info->first_mcos_pq = 0; + qm_info->first_rl_pq = 0; +} + +static void qed_init_qm_advance_vport(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + + qm_info->num_vports++; + + if (qm_info->num_vports > qed_init_qm_get_num_vports(p_hwfn)) + DP_ERR(p_hwfn, + "vport overflow! qm_info->num_vports %d, qm_init_get_num_vports() %d\n", + qm_info->num_vports, qed_init_qm_get_num_vports(p_hwfn)); +} + +/* initialize a single pq and manage qm_info resources accounting. + * The pq_init_flags param determines whether the PQ is rate limited + * (for VF or PF) and whether a new vport is allocated to the pq or not + * (i.e. vport will be shared). + */ + +/* flags for pq init */ +#define PQ_INIT_SHARE_VPORT (1 << 0) +#define PQ_INIT_PF_RL (1 << 1) +#define PQ_INIT_VF_RL (1 << 2) + +/* defines for pq init */ +#define PQ_INIT_DEFAULT_WRR_GROUP 1 +#define PQ_INIT_DEFAULT_TC 0 +#define PQ_INIT_OFLD_TC (p_hwfn->hw_info.offload_tc) + +static void qed_init_qm_pq(struct qed_hwfn *p_hwfn, + struct qed_qm_info *qm_info, + u8 tc, u32 pq_init_flags) +{ + u16 pq_idx = qm_info->num_pqs, max_pq = qed_init_qm_get_num_pqs(p_hwfn); + + if (pq_idx > max_pq) + DP_ERR(p_hwfn, + "pq overflow! pq %d, max pq %d\n", pq_idx, max_pq); + + /* init pq params */ + qm_info->qm_pq_params[pq_idx].port_id = p_hwfn->port_id; + qm_info->qm_pq_params[pq_idx].vport_id = qm_info->start_vport + + qm_info->num_vports; + qm_info->qm_pq_params[pq_idx].tc_id = tc; + qm_info->qm_pq_params[pq_idx].wrr_group = PQ_INIT_DEFAULT_WRR_GROUP; + qm_info->qm_pq_params[pq_idx].rl_valid = + (pq_init_flags & PQ_INIT_PF_RL || pq_init_flags & PQ_INIT_VF_RL); + + /* qm params accounting */ + qm_info->num_pqs++; + if (!(pq_init_flags & PQ_INIT_SHARE_VPORT)) + qm_info->num_vports++; + + if (pq_init_flags & PQ_INIT_PF_RL) + qm_info->num_pf_rls++; + + if (qm_info->num_vports > qed_init_qm_get_num_vports(p_hwfn)) + DP_ERR(p_hwfn, + "vport overflow! qm_info->num_vports %d, qm_init_get_num_vports() %d\n", + qm_info->num_vports, qed_init_qm_get_num_vports(p_hwfn)); + + if (qm_info->num_pf_rls > qed_init_qm_get_num_pf_rls(p_hwfn)) + DP_ERR(p_hwfn, + "rl overflow! 
qm_info->num_pf_rls %d, qm_init_get_num_pf_rls() %d\n", + qm_info->num_pf_rls, qed_init_qm_get_num_pf_rls(p_hwfn)); +} + +/* get pq index according to PQ_FLAGS */ +static u16 *qed_init_qm_get_idx_from_flags(struct qed_hwfn *p_hwfn, + u32 pq_flags) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + + /* Can't have multiple flags set here */ + if (bitmap_weight((unsigned long *)&pq_flags, sizeof(pq_flags)) > 1) + goto err; + + switch (pq_flags) { + case PQ_FLAGS_RLS: + return &qm_info->first_rl_pq; + case PQ_FLAGS_MCOS: + return &qm_info->first_mcos_pq; + case PQ_FLAGS_LB: + return &qm_info->pure_lb_pq; + case PQ_FLAGS_OOO: + return &qm_info->ooo_pq; + case PQ_FLAGS_ACK: + return &qm_info->pure_ack_pq; + case PQ_FLAGS_OFLD: + return &qm_info->offload_pq; + case PQ_FLAGS_LLT: + return &qm_info->low_latency_pq; + case PQ_FLAGS_VFS: + return &qm_info->first_vf_pq; + default: + goto err; + } + +err: + DP_ERR(p_hwfn, "BAD pq flags %d\n", pq_flags); + return NULL; +} + +/* save pq index in qm info */ +static void qed_init_qm_set_idx(struct qed_hwfn *p_hwfn, + u32 pq_flags, u16 pq_val) +{ + u16 *base_pq_idx = qed_init_qm_get_idx_from_flags(p_hwfn, pq_flags); + + *base_pq_idx = p_hwfn->qm_info.start_pq + pq_val; +} + +/* get tx pq index, with the PQ TX base already set (ready for context init) */ +u16 qed_get_cm_pq_idx(struct qed_hwfn *p_hwfn, u32 pq_flags) +{ + u16 *base_pq_idx = qed_init_qm_get_idx_from_flags(p_hwfn, pq_flags); + + return *base_pq_idx + CM_TX_PQ_BASE; +} + +u16 qed_get_cm_pq_idx_mcos(struct qed_hwfn *p_hwfn, u8 tc) +{ + u8 max_tc = qed_init_qm_get_num_tcs(p_hwfn); + + if (tc > max_tc) + DP_ERR(p_hwfn, "tc %d must be smaller than %d\n", tc, max_tc); + + return qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_MCOS) + tc; +} + +u16 qed_get_cm_pq_idx_vf(struct qed_hwfn *p_hwfn, u16 vf) +{ + u16 max_vf = qed_init_qm_get_num_vfs(p_hwfn); + + if (vf > max_vf) + DP_ERR(p_hwfn, "vf %d must be smaller than %d\n", vf, max_vf); + + return qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_VFS) + vf; +} + +u16 qed_get_cm_pq_idx_rl(struct qed_hwfn *p_hwfn, u8 rl) +{ + u16 max_rl = qed_init_qm_get_num_pf_rls(p_hwfn); + + if (rl > max_rl) + DP_ERR(p_hwfn, "rl %d must be smaller than %d\n", rl, max_rl); + + return qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_RLS) + rl; +} + +/* Functions for creating specific types of pqs */ +static void qed_init_qm_lb_pq(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + + if (!(qed_get_pq_flags(p_hwfn) & PQ_FLAGS_LB)) + return; + + qed_init_qm_set_idx(p_hwfn, PQ_FLAGS_LB, qm_info->num_pqs); + qed_init_qm_pq(p_hwfn, qm_info, PURE_LB_TC, PQ_INIT_SHARE_VPORT); +} + +static void qed_init_qm_ooo_pq(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + + if (!(qed_get_pq_flags(p_hwfn) & PQ_FLAGS_OOO)) + return; + + qed_init_qm_set_idx(p_hwfn, PQ_FLAGS_OOO, qm_info->num_pqs); + qed_init_qm_pq(p_hwfn, qm_info, qm_info->ooo_tc, PQ_INIT_SHARE_VPORT); +} + +static void qed_init_qm_pure_ack_pq(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + + if (!(qed_get_pq_flags(p_hwfn) & PQ_FLAGS_ACK)) + return; + + qed_init_qm_set_idx(p_hwfn, PQ_FLAGS_ACK, qm_info->num_pqs); + qed_init_qm_pq(p_hwfn, qm_info, PQ_INIT_OFLD_TC, PQ_INIT_SHARE_VPORT); +} + +static void qed_init_qm_offload_pq(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + + if (!(qed_get_pq_flags(p_hwfn) & PQ_FLAGS_OFLD)) + return; + + qed_init_qm_set_idx(p_hwfn, PQ_FLAGS_OFLD, qm_info->num_pqs); + qed_init_qm_pq(p_hwfn, qm_info, 
PQ_INIT_OFLD_TC, PQ_INIT_SHARE_VPORT); +} + +static void qed_init_qm_low_latency_pq(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + + if (!(qed_get_pq_flags(p_hwfn) & PQ_FLAGS_LLT)) + return; + + qed_init_qm_set_idx(p_hwfn, PQ_FLAGS_LLT, qm_info->num_pqs); + qed_init_qm_pq(p_hwfn, qm_info, PQ_INIT_OFLD_TC, PQ_INIT_SHARE_VPORT); +} + +static void qed_init_qm_mcos_pqs(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + u8 tc_idx; + + if (!(qed_get_pq_flags(p_hwfn) & PQ_FLAGS_MCOS)) + return; + + qed_init_qm_set_idx(p_hwfn, PQ_FLAGS_MCOS, qm_info->num_pqs); + for (tc_idx = 0; tc_idx < qed_init_qm_get_num_tcs(p_hwfn); tc_idx++) + qed_init_qm_pq(p_hwfn, qm_info, tc_idx, PQ_INIT_SHARE_VPORT); +} + +static void qed_init_qm_vf_pqs(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + u16 vf_idx, num_vfs = qed_init_qm_get_num_vfs(p_hwfn); + + if (!(qed_get_pq_flags(p_hwfn) & PQ_FLAGS_VFS)) + return; + + qed_init_qm_set_idx(p_hwfn, PQ_FLAGS_VFS, qm_info->num_pqs); + qm_info->num_vf_pqs = num_vfs; + for (vf_idx = 0; vf_idx < num_vfs; vf_idx++) + qed_init_qm_pq(p_hwfn, + qm_info, PQ_INIT_DEFAULT_TC, PQ_INIT_VF_RL); +} + +static void qed_init_qm_rl_pqs(struct qed_hwfn *p_hwfn) +{ + u16 pf_rls_idx, num_pf_rls = qed_init_qm_get_num_pf_rls(p_hwfn); + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + + if (!(qed_get_pq_flags(p_hwfn) & PQ_FLAGS_RLS)) + return; + + qed_init_qm_set_idx(p_hwfn, PQ_FLAGS_RLS, qm_info->num_pqs); + for (pf_rls_idx = 0; pf_rls_idx < num_pf_rls; pf_rls_idx++) + qed_init_qm_pq(p_hwfn, qm_info, PQ_INIT_OFLD_TC, PQ_INIT_PF_RL); +} + +static void qed_init_qm_pq_params(struct qed_hwfn *p_hwfn) +{ + /* rate limited pqs, must come first (FW assumption) */ + qed_init_qm_rl_pqs(p_hwfn); + + /* pqs for multi cos */ + qed_init_qm_mcos_pqs(p_hwfn); + + /* pure loopback pq */ + qed_init_qm_lb_pq(p_hwfn); + + /* out of order pq */ + qed_init_qm_ooo_pq(p_hwfn); + + /* pure ack pq */ + qed_init_qm_pure_ack_pq(p_hwfn); + + /* pq for offloaded protocol */ + qed_init_qm_offload_pq(p_hwfn); + + /* low latency pq */ + qed_init_qm_low_latency_pq(p_hwfn); + + /* done sharing vports */ + qed_init_qm_advance_vport(p_hwfn); + + /* pqs for vfs */ + qed_init_qm_vf_pqs(p_hwfn); +} + +/* compare values of getters against resources amounts */ +static int qed_init_qm_sanity(struct qed_hwfn *p_hwfn) +{ + if (qed_init_qm_get_num_vports(p_hwfn) > RESC_NUM(p_hwfn, QED_VPORT)) { + DP_ERR(p_hwfn, "requested amount of vports exceeds resource\n"); + return -EINVAL; + } + + if (qed_init_qm_get_num_pqs(p_hwfn) > RESC_NUM(p_hwfn, QED_PQ)) { + DP_ERR(p_hwfn, "requested amount of pqs exceeds resource\n"); + return -EINVAL; + } + + return 0; +} + +static void qed_dp_init_qm_params(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + struct init_qm_vport_params *vport; + struct init_qm_port_params *port; + struct init_qm_pq_params *pq; + int i, tc; + + /* top level params */ + DP_VERBOSE(p_hwfn, + NETIF_MSG_HW, + "qm init top level params: start_pq %d, start_vport %d, pure_lb_pq %d, offload_pq %d, pure_ack_pq %d\n", + qm_info->start_pq, + qm_info->start_vport, + qm_info->pure_lb_pq, + qm_info->offload_pq, qm_info->pure_ack_pq); + DP_VERBOSE(p_hwfn, + NETIF_MSG_HW, + "ooo_pq %d, first_vf_pq %d, num_pqs %d, num_vf_pqs %d, num_vports %d, max_phys_tcs_per_port %d\n", + qm_info->ooo_pq, + qm_info->first_vf_pq, + qm_info->num_pqs, + qm_info->num_vf_pqs, + qm_info->num_vports, qm_info->max_phys_tcs_per_port); + 
DP_VERBOSE(p_hwfn, + NETIF_MSG_HW, + "pf_rl_en %d, pf_wfq_en %d, vport_rl_en %d, vport_wfq_en %d, pf_wfq %d, pf_rl %d, num_pf_rls %d, pq_flags %x\n", + qm_info->pf_rl_en, + qm_info->pf_wfq_en, + qm_info->vport_rl_en, + qm_info->vport_wfq_en, + qm_info->pf_wfq, + qm_info->pf_rl, + qm_info->num_pf_rls, qed_get_pq_flags(p_hwfn)); + + /* port table */ + for (i = 0; i < p_hwfn->cdev->num_ports_in_engine; i++) { + port = &(qm_info->qm_port_params[i]); + DP_VERBOSE(p_hwfn, + NETIF_MSG_HW, + "port idx %d, active %d, active_phys_tcs %d, num_pbf_cmd_lines %d, num_btb_blocks %d, reserved %d\n", + i, + port->active, + port->active_phys_tcs, + port->num_pbf_cmd_lines, + port->num_btb_blocks, port->reserved); + } + + /* vport table */ + for (i = 0; i < qm_info->num_vports; i++) { + vport = &(qm_info->qm_vport_params[i]); + DP_VERBOSE(p_hwfn, + NETIF_MSG_HW, + "vport idx %d, vport_rl %d, wfq %d, first_tx_pq_id [ ", + qm_info->start_vport + i, + vport->vport_rl, vport->vport_wfq); + for (tc = 0; tc < NUM_OF_TCS; tc++) + DP_VERBOSE(p_hwfn, + NETIF_MSG_HW, + "%d ", vport->first_tx_pq_id[tc]); + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, "]\n"); + } + + /* pq table */ + for (i = 0; i < qm_info->num_pqs; i++) { + pq = &(qm_info->qm_pq_params[i]); + DP_VERBOSE(p_hwfn, + NETIF_MSG_HW, + "pq idx %d, port %d, vport_id %d, tc %d, wrr_grp %d, rl_valid %d\n", + qm_info->start_pq + i, + pq->port_id, + pq->vport_id, + pq->tc_id, pq->wrr_group, pq->rl_valid); + } +} + +static void qed_init_qm_info(struct qed_hwfn *p_hwfn) +{ + /* reset params required for init run */ + qed_init_qm_reset_params(p_hwfn); + + /* init QM top level params */ + qed_init_qm_params(p_hwfn); + + /* init QM port params */ + qed_init_qm_port_params(p_hwfn); + + /* init QM vport params */ + qed_init_qm_vport_params(p_hwfn); + + /* init QM physical queue params */ + qed_init_qm_pq_params(p_hwfn); + + /* display all that init */ + qed_dp_init_qm_params(p_hwfn); +} + +/* This function reconfigures the QM pf on the fly. + * For this purpose we: + * 1. reconfigure the QM database + * 2. set new values to runtime array + * 3. send an sdm_qm_cmd through the rbc interface to stop the QM + * 4. activate init tool in QM_PF stage + * 5. 
send an sdm_qm_cmd through rbc interface to release the QM + */ +int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + bool b_rc; + int rc; + + /* initialize qed's qm data structure */ + qed_init_qm_info(p_hwfn); + + /* stop PF's qm queues */ + spin_lock_bh(&qm_lock); + b_rc = qed_send_qm_stop_cmd(p_hwfn, p_ptt, false, true, + qm_info->start_pq, qm_info->num_pqs); + spin_unlock_bh(&qm_lock); + if (!b_rc) + return -EINVAL; + + /* clear the QM_PF runtime phase leftovers from previous init */ + qed_init_clear_rt_data(p_hwfn); + + /* prepare QM portion of runtime array */ + qed_qm_init_pf(p_hwfn, p_ptt, false); + + /* activate init tool on runtime array */ + rc = qed_init_run(p_hwfn, p_ptt, PHASE_QM_PF, p_hwfn->rel_pf_id, + p_hwfn->hw_info.hw_mode); + if (rc) + return rc; + + /* start PF's qm queues */ + spin_lock_bh(&qm_lock); + b_rc = qed_send_qm_stop_cmd(p_hwfn, p_ptt, true, true, + qm_info->start_pq, qm_info->num_pqs); + spin_unlock_bh(&qm_lock); + if (!b_rc) + return -EINVAL; + + return 0; +} + +static int qed_alloc_qm_data(struct qed_hwfn *p_hwfn) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + int rc; + + rc = qed_init_qm_sanity(p_hwfn); + if (rc) + goto alloc_err; + + qm_info->qm_pq_params = kzalloc(sizeof(*qm_info->qm_pq_params) * + qed_init_qm_get_num_pqs(p_hwfn), + GFP_KERNEL); + if (!qm_info->qm_pq_params) + goto alloc_err; + + qm_info->qm_vport_params = kzalloc(sizeof(*qm_info->qm_vport_params) * + qed_init_qm_get_num_vports(p_hwfn), + GFP_KERNEL); + if (!qm_info->qm_vport_params) + goto alloc_err; + + qm_info->qm_port_params = kzalloc(sizeof(*qm_info->qm_port_params) * + p_hwfn->cdev->num_ports_in_engine, + GFP_KERNEL); + if (!qm_info->qm_port_params) + goto alloc_err; + + qm_info->wfq_data = kzalloc(sizeof(*qm_info->wfq_data) * + qed_init_qm_get_num_vports(p_hwfn), + GFP_KERNEL); + if (!qm_info->wfq_data) + goto alloc_err; + + return 0; + +alloc_err: + DP_NOTICE(p_hwfn, "Failed to allocate memory for QM params\n"); + qed_qm_info_free(p_hwfn); + return -ENOMEM; +} + +int qed_resc_alloc(struct qed_dev *cdev) +{ + u32 rdma_tasks, excess_tasks; + u32 line_count; + int i, rc = 0; + + if (IS_VF(cdev)) { + for_each_hwfn(cdev, i) { + rc = qed_l2_alloc(&cdev->hwfns[i]); + if (rc) + return rc; + } + return rc; + } + + cdev->fw_data = kzalloc(sizeof(*cdev->fw_data), GFP_KERNEL); + if (!cdev->fw_data) + return -ENOMEM; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + u32 n_eqes, num_cons; + + /* First allocate the context manager structure */ + rc = qed_cxt_mngr_alloc(p_hwfn); + if (rc) + goto alloc_err; + + /* Set the HW cid/tid numbers (in the contest manager) + * Must be done prior to any further computations. + */ + rc = qed_cxt_set_pf_params(p_hwfn, RDMA_MAX_TIDS); + if (rc) + goto alloc_err; + + rc = qed_alloc_qm_data(p_hwfn); + if (rc) + goto alloc_err; + + /* init qm info */ + qed_init_qm_info(p_hwfn); + + /* Compute the ILT client partition */ + rc = qed_cxt_cfg_ilt_compute(p_hwfn, &line_count); + if (rc) { + DP_NOTICE(p_hwfn, + "too many ILT lines; re-computing with less lines\n"); + /* In case there are not enough ILT lines we reduce the + * number of RDMA tasks and re-compute. 
+ */ + excess_tasks = + qed_cxt_cfg_ilt_compute_excess(p_hwfn, line_count); + if (!excess_tasks) + goto alloc_err; + + rdma_tasks = RDMA_MAX_TIDS - excess_tasks; + rc = qed_cxt_set_pf_params(p_hwfn, rdma_tasks); + if (rc) + goto alloc_err; + + rc = qed_cxt_cfg_ilt_compute(p_hwfn, &line_count); + if (rc) { + DP_ERR(p_hwfn, + "failed ILT compute. Requested too many lines: %u\n", + line_count); + + goto alloc_err; + } + } + + /* CID map / ILT shadow table / T2 + * The table sizes are determined by the computations above + */ + rc = qed_cxt_tables_alloc(p_hwfn); + if (rc) + goto alloc_err; + + /* SPQ, must follow ILT because it initializes SPQ context */ + rc = qed_spq_alloc(p_hwfn); + if (rc) + goto alloc_err; + + /* SP status block allocation */ + p_hwfn->p_dpc_ptt = qed_get_reserved_ptt(p_hwfn, + RESERVED_PTT_DPC); + + rc = qed_int_alloc(p_hwfn, p_hwfn->p_main_ptt); + if (rc) + goto alloc_err; + + rc = qed_iov_alloc(p_hwfn); + if (rc) + goto alloc_err; + + /* EQ */ + n_eqes = qed_chain_get_capacity(&p_hwfn->p_spq->chain); + if (QED_IS_RDMA_PERSONALITY(p_hwfn)) { + enum protocol_type rdma_proto; + + if (QED_IS_ROCE_PERSONALITY(p_hwfn)) + rdma_proto = PROTOCOLID_ROCE; + else + rdma_proto = PROTOCOLID_IWARP; + + num_cons = qed_cxt_get_proto_cid_count(p_hwfn, + rdma_proto, + NULL) * 2; + n_eqes += num_cons + 2 * MAX_NUM_VFS_BB; + } else if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) { + num_cons = + qed_cxt_get_proto_cid_count(p_hwfn, + PROTOCOLID_ISCSI, + NULL); + n_eqes += 2 * num_cons; + } + + if (n_eqes > 0xFFFF) { + DP_ERR(p_hwfn, + "Cannot allocate 0x%x EQ elements. The maximum of a u16 chain is 0x%x\n", + n_eqes, 0xFFFF); + goto alloc_no_mem; + } + + rc = qed_eq_alloc(p_hwfn, (u16) n_eqes); + if (rc) + goto alloc_err; + + rc = qed_consq_alloc(p_hwfn); + if (rc) + goto alloc_err; + + rc = qed_l2_alloc(p_hwfn); + if (rc) + goto alloc_err; + +#ifdef CONFIG_QED_LL2 + if (p_hwfn->using_ll2) { + rc = qed_ll2_alloc(p_hwfn); + if (rc) + goto alloc_err; + } +#endif + + if (p_hwfn->hw_info.personality == QED_PCI_FCOE) { + rc = qed_fcoe_alloc(p_hwfn); + if (rc) + goto alloc_err; + } + + if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) { + rc = qed_iscsi_alloc(p_hwfn); + if (rc) + goto alloc_err; + rc = qed_ooo_alloc(p_hwfn); + if (rc) + goto alloc_err; + } + + /* DMA info initialization */ + rc = qed_dmae_info_alloc(p_hwfn); + if (rc) + goto alloc_err; + + /* DCBX initialization */ + rc = qed_dcbx_info_alloc(p_hwfn); + if (rc) + goto alloc_err; + } + + cdev->reset_stats = kzalloc(sizeof(*cdev->reset_stats), GFP_KERNEL); + if (!cdev->reset_stats) + goto alloc_no_mem; + + return 0; + +alloc_no_mem: + rc = -ENOMEM; +alloc_err: + qed_resc_free(cdev); + return rc; +} + +void qed_resc_setup(struct qed_dev *cdev) +{ + int i; + + if (IS_VF(cdev)) { + for_each_hwfn(cdev, i) + qed_l2_setup(&cdev->hwfns[i]); + return; + } + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + qed_cxt_mngr_setup(p_hwfn); + qed_spq_setup(p_hwfn); + qed_eq_setup(p_hwfn); + qed_consq_setup(p_hwfn); + + /* Read shadow of current MFW mailbox */ + qed_mcp_read_mb(p_hwfn, p_hwfn->p_main_ptt); + memcpy(p_hwfn->mcp_info->mfw_mb_shadow, + p_hwfn->mcp_info->mfw_mb_cur, + p_hwfn->mcp_info->mfw_mb_length); + + qed_int_setup(p_hwfn, p_hwfn->p_main_ptt); + + qed_l2_setup(p_hwfn); + qed_iov_setup(p_hwfn); +#ifdef CONFIG_QED_LL2 + if (p_hwfn->using_ll2) + qed_ll2_setup(p_hwfn); +#endif + if (p_hwfn->hw_info.personality == QED_PCI_FCOE) + qed_fcoe_setup(p_hwfn); + + if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) { +
qed_iscsi_setup(p_hwfn); + qed_ooo_setup(p_hwfn); + } + } +} + +#define FINAL_CLEANUP_POLL_CNT (100) +#define FINAL_CLEANUP_POLL_TIME (10) +int qed_final_cleanup(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 id, bool is_vf) +{ + u32 command = 0, addr, count = FINAL_CLEANUP_POLL_CNT; + int rc = -EBUSY; + + addr = GTT_BAR0_MAP_REG_USDM_RAM + + USTORM_FLR_FINAL_ACK_OFFSET(p_hwfn->rel_pf_id); + + if (is_vf) + id += 0x10; + + command |= X_FINAL_CLEANUP_AGG_INT << + SDM_AGG_INT_COMP_PARAMS_AGG_INT_INDEX_SHIFT; + command |= 1 << SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_ENABLE_SHIFT; + command |= id << SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_BIT_SHIFT; + command |= SDM_COMP_TYPE_AGG_INT << SDM_OP_GEN_COMP_TYPE_SHIFT; + + /* Make sure notification is not set before initiating final cleanup */ + if (REG_RD(p_hwfn, addr)) { + DP_NOTICE(p_hwfn, + "Unexpected; Found final cleanup notification before initiating final cleanup\n"); + REG_WR(p_hwfn, addr, 0); + } + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Sending final cleanup for PFVF[%d] [Command %08x\n]", + id, command); + + qed_wr(p_hwfn, p_ptt, XSDM_REG_OPERATION_GEN, command); + + /* Poll until completion */ + while (!REG_RD(p_hwfn, addr) && count--) + msleep(FINAL_CLEANUP_POLL_TIME); + + if (REG_RD(p_hwfn, addr)) + rc = 0; + else + DP_NOTICE(p_hwfn, + "Failed to receive FW final cleanup notification\n"); + + /* Cleanup afterwards */ + REG_WR(p_hwfn, addr, 0); + + return rc; +} + +static int qed_calc_hw_mode(struct qed_hwfn *p_hwfn) +{ + int hw_mode = 0; + + if (QED_IS_BB_B0(p_hwfn->cdev)) { + hw_mode |= 1 << MODE_BB; + } else if (QED_IS_AH(p_hwfn->cdev)) { + hw_mode |= 1 << MODE_K2; + } else { + DP_NOTICE(p_hwfn, "Unknown chip type %#x\n", + p_hwfn->cdev->type); + return -EINVAL; + } + + switch (p_hwfn->cdev->num_ports_in_engine) { + case 1: + hw_mode |= 1 << MODE_PORTS_PER_ENG_1; + break; + case 2: + hw_mode |= 1 << MODE_PORTS_PER_ENG_2; + break; + case 4: + hw_mode |= 1 << MODE_PORTS_PER_ENG_4; + break; + default: + DP_NOTICE(p_hwfn, "num_ports_in_engine = %d not supported\n", + p_hwfn->cdev->num_ports_in_engine); + return -EINVAL; + } + + switch (p_hwfn->cdev->mf_mode) { + case QED_MF_DEFAULT: + case QED_MF_NPAR: + hw_mode |= 1 << MODE_MF_SI; + break; + case QED_MF_OVLAN: + hw_mode |= 1 << MODE_MF_SD; + break; + default: + DP_NOTICE(p_hwfn, "Unsupported MF mode, init as DEFAULT\n"); + hw_mode |= 1 << MODE_MF_SI; + } + + hw_mode |= 1 << MODE_ASIC; + + if (p_hwfn->cdev->num_hwfns > 1) + hw_mode |= 1 << MODE_100G; + + p_hwfn->hw_info.hw_mode = hw_mode; + + DP_VERBOSE(p_hwfn, (NETIF_MSG_PROBE | NETIF_MSG_IFUP), + "Configuring function for hw_mode: 0x%08x\n", + p_hwfn->hw_info.hw_mode); + + return 0; +} + +/* Init run time data for all PFs on an engine. 
*/ +static void qed_init_cau_rt_data(struct qed_dev *cdev) +{ + u32 offset = CAU_REG_SB_VAR_MEMORY_RT_OFFSET; + int i, igu_sb_id; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + struct qed_igu_info *p_igu_info; + struct qed_igu_block *p_block; + struct cau_sb_entry sb_entry; + + p_igu_info = p_hwfn->hw_info.p_igu_info; + + for (igu_sb_id = 0; + igu_sb_id < QED_MAPPING_MEMORY_SIZE(cdev); igu_sb_id++) { + p_block = &p_igu_info->entry[igu_sb_id]; + + if (!p_block->is_pf) + continue; + + qed_init_cau_sb_entry(p_hwfn, &sb_entry, + p_block->function_id, 0, 0); + STORE_RT_REG_AGG(p_hwfn, offset + igu_sb_id * 2, + sb_entry); + } + } +} + +static void qed_init_cache_line_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 val, wr_mbs, cache_line_size; + + val = qed_rd(p_hwfn, p_ptt, PSWRQ2_REG_WR_MBS0); + switch (val) { + case 0: + wr_mbs = 128; + break; + case 1: + wr_mbs = 256; + break; + case 2: + wr_mbs = 512; + break; + default: + DP_INFO(p_hwfn, + "Unexpected value of PSWRQ2_REG_WR_MBS0 [0x%x]. Avoid configuring PGLUE_B_REG_CACHE_LINE_SIZE.\n", + val); + return; + } + + cache_line_size = min_t(u32, L1_CACHE_BYTES, wr_mbs); + switch (cache_line_size) { + case 32: + val = 0; + break; + case 64: + val = 1; + break; + case 128: + val = 2; + break; + case 256: + val = 3; + break; + default: + DP_INFO(p_hwfn, + "Unexpected value of cache line size [0x%x]. Avoid configuring PGLUE_B_REG_CACHE_LINE_SIZE.\n", + cache_line_size); + } + + if (L1_CACHE_BYTES > wr_mbs) + DP_INFO(p_hwfn, + "The cache line size for padding is suboptimal for performance [OS cache line size 0x%x, wr mbs 0x%x]\n", + L1_CACHE_BYTES, wr_mbs); + + STORE_RT_REG(p_hwfn, PGLUE_REG_B_CACHE_LINE_SIZE_RT_OFFSET, val); + if (val > 0) { + STORE_RT_REG(p_hwfn, PSWRQ2_REG_DRAM_ALIGN_WR_RT_OFFSET, val); + STORE_RT_REG(p_hwfn, PSWRQ2_REG_DRAM_ALIGN_RD_RT_OFFSET, val); + } +} + +static int qed_hw_init_common(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, int hw_mode) +{ + struct qed_qm_info *qm_info = &p_hwfn->qm_info; + struct qed_qm_common_rt_init_params params; + struct qed_dev *cdev = p_hwfn->cdev; + u8 vf_id, max_num_vfs; + u16 num_pfs, pf_id; + u32 concrete_fid; + int rc = 0; + + qed_init_cau_rt_data(cdev); + + /* Program GTT windows */ + qed_gtt_init(p_hwfn); + + if (p_hwfn->mcp_info) { + if (p_hwfn->mcp_info->func_info.bandwidth_max) + qm_info->pf_rl_en = true; + if (p_hwfn->mcp_info->func_info.bandwidth_min) + qm_info->pf_wfq_en = true; + } + + memset(¶ms, 0, sizeof(params)); + params.max_ports_per_engine = p_hwfn->cdev->num_ports_in_engine; + params.max_phys_tcs_per_port = qm_info->max_phys_tcs_per_port; + params.pf_rl_en = qm_info->pf_rl_en; + params.pf_wfq_en = qm_info->pf_wfq_en; + params.vport_rl_en = qm_info->vport_rl_en; + params.vport_wfq_en = qm_info->vport_wfq_en; + params.port_params = qm_info->qm_port_params; + + qed_qm_common_rt_init(p_hwfn, ¶ms); + + qed_cxt_hw_init_common(p_hwfn); + + qed_init_cache_line_size(p_hwfn, p_ptt); + + rc = qed_init_run(p_hwfn, p_ptt, PHASE_ENGINE, ANY_PHASE_ID, hw_mode); + if (rc) + return rc; + + qed_wr(p_hwfn, p_ptt, PSWRQ2_REG_L2P_VALIDATE_VFID, 0); + qed_wr(p_hwfn, p_ptt, PGLUE_B_REG_USE_CLIENTID_IN_TAG, 1); + + if (QED_IS_BB(p_hwfn->cdev)) { + num_pfs = NUM_OF_ENG_PFS(p_hwfn->cdev); + for (pf_id = 0; pf_id < num_pfs; pf_id++) { + qed_fid_pretend(p_hwfn, p_ptt, pf_id); + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_ROCE, 0x0); + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_TCP, 0x0); + } + /* pretend to original PF */ + qed_fid_pretend(p_hwfn, p_ptt, 
p_hwfn->rel_pf_id); + } + + max_num_vfs = QED_IS_AH(cdev) ? MAX_NUM_VFS_K2 : MAX_NUM_VFS_BB; + for (vf_id = 0; vf_id < max_num_vfs; vf_id++) { + concrete_fid = qed_vfid_to_concrete(p_hwfn, vf_id); + qed_fid_pretend(p_hwfn, p_ptt, (u16) concrete_fid); + qed_wr(p_hwfn, p_ptt, CCFC_REG_STRONG_ENABLE_VF, 0x1); + qed_wr(p_hwfn, p_ptt, CCFC_REG_WEAK_ENABLE_VF, 0x0); + qed_wr(p_hwfn, p_ptt, TCFC_REG_STRONG_ENABLE_VF, 0x1); + qed_wr(p_hwfn, p_ptt, TCFC_REG_WEAK_ENABLE_VF, 0x0); + } + /* pretend to original PF */ + qed_fid_pretend(p_hwfn, p_ptt, p_hwfn->rel_pf_id); + + return rc; +} + +static int +qed_hw_init_dpi_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 pwm_region_size, u32 n_cpus) +{ + u32 dpi_bit_shift, dpi_count, dpi_page_size; + u32 min_dpis; + u32 n_wids; + + /* Calculate DPI size */ + n_wids = max_t(u32, QED_MIN_WIDS, n_cpus); + dpi_page_size = QED_WID_SIZE * roundup_pow_of_two(n_wids); + dpi_page_size = (dpi_page_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + dpi_bit_shift = ilog2(dpi_page_size / 4096); + dpi_count = pwm_region_size / dpi_page_size; + + min_dpis = p_hwfn->pf_params.rdma_pf_params.min_dpis; + min_dpis = max_t(u32, QED_MIN_DPIS, min_dpis); + + p_hwfn->dpi_size = dpi_page_size; + p_hwfn->dpi_count = dpi_count; + + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_DPI_BIT_SHIFT, dpi_bit_shift); + + if (dpi_count < min_dpis) + return -EINVAL; + + return 0; +} + +enum QED_ROCE_EDPM_MODE { + QED_ROCE_EDPM_MODE_ENABLE = 0, + QED_ROCE_EDPM_MODE_FORCE_ON = 1, + QED_ROCE_EDPM_MODE_DISABLE = 2, +}; + +static int +qed_hw_init_pf_doorbell_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 pwm_regsize, norm_regsize; + u32 non_pwm_conn, min_addr_reg1; + u32 db_bar_size, n_cpus = 1; + u32 roce_edpm_mode; + u32 pf_dems_shift; + int rc = 0; + u8 cond; + + db_bar_size = qed_hw_bar_size(p_hwfn, p_ptt, BAR_ID_1); + if (p_hwfn->cdev->num_hwfns > 1) + db_bar_size /= 2; + + /* Calculate doorbell regions */ + non_pwm_conn = qed_cxt_get_proto_cid_start(p_hwfn, PROTOCOLID_CORE) + + qed_cxt_get_proto_cid_count(p_hwfn, PROTOCOLID_CORE, + NULL) + + qed_cxt_get_proto_cid_count(p_hwfn, PROTOCOLID_ETH, + NULL); + norm_regsize = roundup(QED_PF_DEMS_SIZE * non_pwm_conn, PAGE_SIZE); + min_addr_reg1 = norm_regsize / 4096; + pwm_regsize = db_bar_size - norm_regsize; + + /* Check that the normal and PWM sizes are valid */ + if (db_bar_size < norm_regsize) { + DP_ERR(p_hwfn->cdev, + "Doorbell BAR size 0x%x is too small (normal region is 0x%0x )\n", + db_bar_size, norm_regsize); + return -EINVAL; + } + + if (pwm_regsize < QED_MIN_PWM_REGION) { + DP_ERR(p_hwfn->cdev, + "PWM region size 0x%0x is too small. Should be at least 0x%0x (Doorbell BAR size is 0x%x and normal region size is 0x%0x)\n", + pwm_regsize, + QED_MIN_PWM_REGION, db_bar_size, norm_regsize); + return -EINVAL; + } + + /* Calculate number of DPIs */ + roce_edpm_mode = p_hwfn->pf_params.rdma_pf_params.roce_edpm_mode; + if ((roce_edpm_mode == QED_ROCE_EDPM_MODE_ENABLE) || + ((roce_edpm_mode == QED_ROCE_EDPM_MODE_FORCE_ON))) { + /* Either EDPM is mandatory, or we are attempting to allocate a + * WID per CPU. + */ + n_cpus = num_present_cpus(); + rc = qed_hw_init_dpi_size(p_hwfn, p_ptt, pwm_regsize, n_cpus); + } + + cond = (rc && (roce_edpm_mode == QED_ROCE_EDPM_MODE_ENABLE)) || + (roce_edpm_mode == QED_ROCE_EDPM_MODE_DISABLE); + if (cond || p_hwfn->dcbx_no_edpm) { + /* Either EDPM is disabled from user configuration, or it is + * disabled via DCBx, or it is not mandatory and we failed to + * allocated a WID per CPU. 
+ */ + n_cpus = 1; + rc = qed_hw_init_dpi_size(p_hwfn, p_ptt, pwm_regsize, n_cpus); + + if (cond) + qed_rdma_dpm_bar(p_hwfn, p_ptt); + } + + p_hwfn->wid_count = (u16) n_cpus; + + DP_INFO(p_hwfn, + "doorbell bar: normal_region_size=%d, pwm_region_size=%d, dpi_size=%d, dpi_count=%d, roce_edpm=%s\n", + norm_regsize, + pwm_regsize, + p_hwfn->dpi_size, + p_hwfn->dpi_count, + ((p_hwfn->dcbx_no_edpm) || (p_hwfn->db_bar_no_edpm)) ? + "disabled" : "enabled"); + + if (rc) { + DP_ERR(p_hwfn, + "Failed to allocate enough DPIs. Allocated %d but the current minimum is %d.\n", + p_hwfn->dpi_count, + p_hwfn->pf_params.rdma_pf_params.min_dpis); + return -EINVAL; + } + + p_hwfn->dpi_start_offset = norm_regsize; + + /* DEMS size is configured log2 of DWORDs, hence the division by 4 */ + pf_dems_shift = ilog2(QED_PF_DEMS_SIZE / 4); + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_ICID_BIT_SHIFT_NORM, pf_dems_shift); + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_MIN_ADDR_REG1, min_addr_reg1); + + return 0; +} + +static int qed_hw_init_port(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, int hw_mode) +{ + int rc = 0; + + rc = qed_init_run(p_hwfn, p_ptt, PHASE_PORT, p_hwfn->port_id, hw_mode); + if (rc) + return rc; + + qed_wr(p_hwfn, p_ptt, PGLUE_B_REG_MASTER_WRITE_PAD_ENABLE, 0); + + return 0; +} + +static int qed_hw_init_pf(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_tunnel_info *p_tunn, + int hw_mode, + bool b_hw_start, + enum qed_int_mode int_mode, + bool allow_npar_tx_switch) +{ + u8 rel_pf_id = p_hwfn->rel_pf_id; + int rc = 0; + + if (p_hwfn->mcp_info) { + struct qed_mcp_function_info *p_info; + + p_info = &p_hwfn->mcp_info->func_info; + if (p_info->bandwidth_min) + p_hwfn->qm_info.pf_wfq = p_info->bandwidth_min; + + /* Update rate limit once we'll actually have a link */ + p_hwfn->qm_info.pf_rl = 100000; + } + + qed_cxt_hw_init_pf(p_hwfn, p_ptt); + + qed_int_igu_init_rt(p_hwfn); + + /* Set VLAN in NIG if needed */ + if (hw_mode & BIT(MODE_MF_SD)) { + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, "Configuring LLH_FUNC_TAG\n"); + STORE_RT_REG(p_hwfn, NIG_REG_LLH_FUNC_TAG_EN_RT_OFFSET, 1); + STORE_RT_REG(p_hwfn, NIG_REG_LLH_FUNC_TAG_VALUE_RT_OFFSET, + p_hwfn->hw_info.ovlan); + } + + /* Enable classification by MAC if needed */ + if (hw_mode & BIT(MODE_MF_SI)) { + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "Configuring TAGMAC_CLS_TYPE\n"); + STORE_RT_REG(p_hwfn, + NIG_REG_LLH_FUNC_TAGMAC_CLS_TYPE_RT_OFFSET, 1); + } + + /* Protocol Configuration */ + STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_TCP_RT_OFFSET, + (p_hwfn->hw_info.personality == QED_PCI_ISCSI) ? 1 : 0); + STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_FCOE_RT_OFFSET, + (p_hwfn->hw_info.personality == QED_PCI_FCOE) ? 1 : 0); + STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_ROCE_RT_OFFSET, 0); + + /* Cleanup chip from previous driver if such remains exist */ + rc = qed_final_cleanup(p_hwfn, p_ptt, rel_pf_id, false); + if (rc) + return rc; + + /* Sanity check before the PF init sequence that uses DMAE */ + rc = qed_dmae_sanity(p_hwfn, p_ptt, "pf_phase"); + if (rc) + return rc; + + /* PF Init sequence */ + rc = qed_init_run(p_hwfn, p_ptt, PHASE_PF, rel_pf_id, hw_mode); + if (rc) + return rc; + + /* QM_PF Init sequence (may be invoked separately e.g. 
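/*
 * A hedged sketch (not part of the patch) of how qed_hw_init_pf_doorbell_bar()
 * above partitions the doorbell BAR: a "normal" region of one DEMS-sized
 * doorbell per non-PWM connection, rounded up to a page, and a PWM region
 * holding the DPIs.  EX_PF_DEMS_SIZE and EX_MIN_PWM_REGION are assumed values
 * standing in for the driver's constants.
 */
#include <stdint.h>

#define EX_PAGE_SIZE		4096u
#define EX_PF_DEMS_SIZE		4u			/* assumed QED_PF_DEMS_SIZE */
#define EX_MIN_PWM_REGION	(4u * EX_PAGE_SIZE)	/* assumed QED_MIN_PWM_REGION */

/*
 * Returns 0 and fills the two region sizes, or -1 if the BAR cannot hold
 * either region -- mirroring the two error checks in the driver.
 */
static int ex_split_db_bar(uint32_t db_bar_size, uint32_t non_pwm_conn,
			   uint32_t *norm_regsize, uint32_t *pwm_regsize)
{
	uint32_t norm = EX_PF_DEMS_SIZE * non_pwm_conn;

	/* Round the normal region up to a whole page */
	norm = (norm + EX_PAGE_SIZE - 1) & ~(EX_PAGE_SIZE - 1);

	if (db_bar_size < norm)
		return -1;
	if (db_bar_size - norm < EX_MIN_PWM_REGION)
		return -1;

	*norm_regsize = norm;
	*pwm_regsize = db_bar_size - norm;
	return 0;
}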
for DCB) */ + rc = qed_init_run(p_hwfn, p_ptt, PHASE_QM_PF, rel_pf_id, hw_mode); + if (rc) + return rc; + + /* Pure runtime initializations - directly to the HW */ + qed_int_igu_init_pure_rt(p_hwfn, p_ptt, true, true); + + rc = qed_hw_init_pf_doorbell_bar(p_hwfn, p_ptt); + if (rc) + return rc; + + if (b_hw_start) { + /* enable interrupts */ + qed_int_igu_enable(p_hwfn, p_ptt, int_mode); + + /* send function start command */ + rc = qed_sp_pf_start(p_hwfn, p_ptt, p_tunn, + p_hwfn->cdev->mf_mode, + allow_npar_tx_switch); + if (rc) { + DP_NOTICE(p_hwfn, "Function start ramrod failed\n"); + return rc; + } + if (p_hwfn->hw_info.personality == QED_PCI_FCOE) { + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_TAG1, BIT(2)); + qed_wr(p_hwfn, p_ptt, + PRS_REG_PKT_LEN_STAT_TAGS_NOT_COUNTED_FIRST, + 0x100); + } + } + return rc; +} + +static int qed_change_pci_hwfn(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u8 enable) +{ + u32 delay_idx = 0, val, set_val = enable ? 1 : 0; + + /* Change PF in PXP */ + qed_wr(p_hwfn, p_ptt, + PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, set_val); + + /* wait until value is set - try for 1 second every 50us */ + for (delay_idx = 0; delay_idx < 20000; delay_idx++) { + val = qed_rd(p_hwfn, p_ptt, + PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER); + if (val == set_val) + break; + + usleep_range(50, 60); + } + + if (val != set_val) { + DP_NOTICE(p_hwfn, + "PFID_ENABLE_MASTER wasn't changed after a second\n"); + return -EAGAIN; + } + + return 0; +} + +static void qed_reset_mb_shadow(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_main_ptt) +{ + /* Read shadow of current MFW mailbox */ + qed_mcp_read_mb(p_hwfn, p_main_ptt); + memcpy(p_hwfn->mcp_info->mfw_mb_shadow, + p_hwfn->mcp_info->mfw_mb_cur, p_hwfn->mcp_info->mfw_mb_length); +} + +static void +qed_fill_load_req_params(struct qed_load_req_params *p_load_req, + struct qed_drv_load_params *p_drv_load) +{ + memset(p_load_req, 0, sizeof(*p_load_req)); + + p_load_req->drv_role = p_drv_load->is_crash_kernel ? 
+ QED_DRV_ROLE_KDUMP : QED_DRV_ROLE_OS; + p_load_req->timeout_val = p_drv_load->mfw_timeout_val; + p_load_req->avoid_eng_reset = p_drv_load->avoid_eng_reset; + p_load_req->override_force_load = p_drv_load->override_force_load; +} + +static int qed_vf_start(struct qed_hwfn *p_hwfn, + struct qed_hw_init_params *p_params) +{ + if (p_params->p_tunn) { + qed_vf_set_vf_start_tunn_update_param(p_params->p_tunn); + qed_vf_pf_tunnel_param_update(p_hwfn, p_params->p_tunn); + } + + p_hwfn->b_int_enabled = true; + + return 0; +} + +int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params) +{ + struct qed_load_req_params load_req_params; + u32 load_code, param, drv_mb_param; + bool b_default_mtu = true; + struct qed_hwfn *p_hwfn; + int rc = 0, mfw_rc, i; + + if ((p_params->int_mode == QED_INT_MODE_MSI) && (cdev->num_hwfns > 1)) { + DP_NOTICE(cdev, "MSI mode is not supported for CMT devices\n"); + return -EINVAL; + } + + if (IS_PF(cdev)) { + rc = qed_init_fw_data(cdev, p_params->bin_fw_data); + if (rc) + return rc; + } + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + /* If management didn't provide a default, set one of our own */ + if (!p_hwfn->hw_info.mtu) { + p_hwfn->hw_info.mtu = 1500; + b_default_mtu = false; + } + + if (IS_VF(cdev)) { + qed_vf_start(p_hwfn, p_params); + continue; + } + + /* Enable DMAE in PXP */ + rc = qed_change_pci_hwfn(p_hwfn, p_hwfn->p_main_ptt, true); + + rc = qed_calc_hw_mode(p_hwfn); + if (rc) + return rc; + + qed_fill_load_req_params(&load_req_params, + p_params->p_drv_load_params); + rc = qed_mcp_load_req(p_hwfn, p_hwfn->p_main_ptt, + &load_req_params); + if (rc) { + DP_NOTICE(p_hwfn, "Failed sending a LOAD_REQ command\n"); + return rc; + } + + load_code = load_req_params.load_code; + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "Load request was sent. Load code: 0x%x\n", + load_code); + + qed_mcp_set_capabilities(p_hwfn, p_hwfn->p_main_ptt); + + qed_reset_mb_shadow(p_hwfn, p_hwfn->p_main_ptt); + + p_hwfn->first_on_engine = (load_code == + FW_MSG_CODE_DRV_LOAD_ENGINE); + + switch (load_code) { + case FW_MSG_CODE_DRV_LOAD_ENGINE: + rc = qed_hw_init_common(p_hwfn, p_hwfn->p_main_ptt, + p_hwfn->hw_info.hw_mode); + if (rc) + break; + /* Fall into */ + case FW_MSG_CODE_DRV_LOAD_PORT: + rc = qed_hw_init_port(p_hwfn, p_hwfn->p_main_ptt, + p_hwfn->hw_info.hw_mode); + if (rc) + break; + + /* Fall into */ + case FW_MSG_CODE_DRV_LOAD_FUNCTION: + rc = qed_hw_init_pf(p_hwfn, p_hwfn->p_main_ptt, + p_params->p_tunn, + p_hwfn->hw_info.hw_mode, + p_params->b_hw_start, + p_params->int_mode, + p_params->allow_npar_tx_switch); + break; + default: + DP_NOTICE(p_hwfn, + "Unexpected load code [0x%08x]", load_code); + rc = -EINVAL; + break; + } + + if (rc) + DP_NOTICE(p_hwfn, + "init phase failed for loadcode 0x%x (rc %d)\n", + load_code, rc); + + /* ACK mfw regardless of success or failure of initialization */ + mfw_rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt, + DRV_MSG_CODE_LOAD_DONE, + 0, &load_code, ¶m); + if (rc) + return rc; + if (mfw_rc) { + DP_NOTICE(p_hwfn, "Failed sending LOAD_DONE command\n"); + return mfw_rc; + } + + /* Check if there is a DID mismatch between nvm-cfg/efuse */ + if (param & FW_MB_PARAM_LOAD_DONE_DID_EFUSE_ERROR) + DP_NOTICE(p_hwfn, + "warning: device configuration is not supported on this board type. 
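/*
 * An illustrative sketch of the intentional fall-through in the load_code
 * switch of qed_hw_init() above: the MFW's answer decides how much of the
 * init cascade this PF runs -- engine-level init only for the first loader
 * on the engine, port-level init for the first loader on the port, PF-level
 * init always.  The enum and flag names below are invented for the example.
 */
enum ex_load_code { EX_LOAD_ENGINE, EX_LOAD_PORT, EX_LOAD_FUNCTION };

#define EX_PHASE_ENGINE	0x1u
#define EX_PHASE_PORT	0x2u
#define EX_PHASE_PF	0x4u

static unsigned int ex_phases_for_load_code(enum ex_load_code code)
{
	switch (code) {
	case EX_LOAD_ENGINE:
		return EX_PHASE_ENGINE | EX_PHASE_PORT | EX_PHASE_PF;
	case EX_LOAD_PORT:
		return EX_PHASE_PORT | EX_PHASE_PF;
	case EX_LOAD_FUNCTION:
	default:
		return EX_PHASE_PF;
	}
}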
The device may not function as expected.\n"); + + /* send DCBX attention request command */ + DP_VERBOSE(p_hwfn, + QED_MSG_DCB, + "sending phony dcbx set command to trigger DCBx attention handling\n"); + mfw_rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt, + DRV_MSG_CODE_SET_DCBX, + 1 << DRV_MB_PARAM_DCBX_NOTIFY_SHIFT, + &load_code, ¶m); + if (mfw_rc) { + DP_NOTICE(p_hwfn, + "Failed to send DCBX attention request\n"); + return mfw_rc; + } + + p_hwfn->hw_init_done = true; + } + + if (IS_PF(cdev)) { + p_hwfn = QED_LEADING_HWFN(cdev); + drv_mb_param = STORM_FW_VERSION; + rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt, + DRV_MSG_CODE_OV_UPDATE_STORM_FW_VER, + drv_mb_param, &load_code, ¶m); + if (rc) + DP_INFO(p_hwfn, "Failed to update firmware version\n"); + + if (!b_default_mtu) { + rc = qed_mcp_ov_update_mtu(p_hwfn, p_hwfn->p_main_ptt, + p_hwfn->hw_info.mtu); + if (rc) + DP_INFO(p_hwfn, + "Failed to update default mtu\n"); + } + + rc = qed_mcp_ov_update_driver_state(p_hwfn, + p_hwfn->p_main_ptt, + QED_OV_DRIVER_STATE_DISABLED); + if (rc) + DP_INFO(p_hwfn, "Failed to update driver state\n"); + + rc = qed_mcp_ov_update_eswitch(p_hwfn, p_hwfn->p_main_ptt, + QED_OV_ESWITCH_VEB); + if (rc) + DP_INFO(p_hwfn, "Failed to update eswitch mode\n"); + } + + return 0; +} + +#define QED_HW_STOP_RETRY_LIMIT (10) +static void qed_hw_timers_stop(struct qed_dev *cdev, + struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + int i; + + /* close timers */ + qed_wr(p_hwfn, p_ptt, TM_REG_PF_ENABLE_CONN, 0x0); + qed_wr(p_hwfn, p_ptt, TM_REG_PF_ENABLE_TASK, 0x0); + + for (i = 0; i < QED_HW_STOP_RETRY_LIMIT; i++) { + if ((!qed_rd(p_hwfn, p_ptt, + TM_REG_PF_SCAN_ACTIVE_CONN)) && + (!qed_rd(p_hwfn, p_ptt, TM_REG_PF_SCAN_ACTIVE_TASK))) + break; + + /* Dependent on number of connection/tasks, possibly + * 1ms sleep is required between polls + */ + usleep_range(1000, 2000); + } + + if (i < QED_HW_STOP_RETRY_LIMIT) + return; + + DP_NOTICE(p_hwfn, + "Timers linear scans are not over [Connection %02x Tasks %02x]\n", + (u8)qed_rd(p_hwfn, p_ptt, TM_REG_PF_SCAN_ACTIVE_CONN), + (u8)qed_rd(p_hwfn, p_ptt, TM_REG_PF_SCAN_ACTIVE_TASK)); +} + +void qed_hw_timers_stop_all(struct qed_dev *cdev) +{ + int j; + + for_each_hwfn(cdev, j) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[j]; + struct qed_ptt *p_ptt = p_hwfn->p_main_ptt; + + qed_hw_timers_stop(cdev, p_hwfn, p_ptt); + } +} + +int qed_hw_stop(struct qed_dev *cdev) +{ + struct qed_hwfn *p_hwfn; + struct qed_ptt *p_ptt; + int rc, rc2 = 0; + int j; + + for_each_hwfn(cdev, j) { + p_hwfn = &cdev->hwfns[j]; + p_ptt = p_hwfn->p_main_ptt; + + DP_VERBOSE(p_hwfn, NETIF_MSG_IFDOWN, "Stopping hw/fw\n"); + + if (IS_VF(cdev)) { + qed_vf_pf_int_cleanup(p_hwfn); + rc = qed_vf_pf_reset(p_hwfn); + if (rc) { + DP_NOTICE(p_hwfn, + "qed_vf_pf_reset failed. rc = %d.\n", + rc); + rc2 = -EINVAL; + } + continue; + } + + /* mark the hw as uninitialized... */ + p_hwfn->hw_init_done = false; + + /* Send unload command to MCP */ + rc = qed_mcp_unload_req(p_hwfn, p_ptt); + if (rc) { + DP_NOTICE(p_hwfn, + "Failed sending a UNLOAD_REQ command. rc = %d.\n", + rc); + rc2 = -EINVAL; + } + + qed_slowpath_irq_sync(p_hwfn); + + /* After this point no MFW attentions are expected, e.g. prevent + * race between pf stop and dcbx pf update. + */ + rc = qed_sp_pf_stop(p_hwfn); + if (rc) { + DP_NOTICE(p_hwfn, + "Failed to close PF against FW [rc = %d]. 
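/*
 * A small userspace sketch of the bounded polling loop used by
 * qed_hw_timers_stop() above: disable the block, then poll a "scan active"
 * indication up to a retry limit with ~1ms sleeps, and let the caller report
 * if the scans never drained.  The callback type is invented for the example;
 * the driver reads TM_REG_PF_SCAN_ACTIVE_* directly.
 */
#include <stdbool.h>
#include <unistd.h>

#define EX_STOP_RETRY_LIMIT 10

static bool ex_poll_until_idle(bool (*scan_active)(void *ctx), void *ctx)
{
	int i;

	for (i = 0; i < EX_STOP_RETRY_LIMIT; i++) {
		if (!scan_active(ctx))
			return true;	/* scans drained */
		usleep(1000);		/* ~1ms between polls, as in the driver */
	}
	return false;			/* caller logs a notice, as the driver does */
}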
Continue to stop HW to prevent illegal host access by the device.\n", + rc); + rc2 = -EINVAL; + } + + qed_wr(p_hwfn, p_ptt, + NIG_REG_RX_LLH_BRB_GATE_DNTFWD_PERPF, 0x1); + + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_TCP, 0x0); + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_UDP, 0x0); + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_FCOE, 0x0); + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_ROCE, 0x0); + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_OPENFLOW, 0x0); + + qed_hw_timers_stop(cdev, p_hwfn, p_ptt); + + /* Disable Attention Generation */ + qed_int_igu_disable_int(p_hwfn, p_ptt); + + qed_wr(p_hwfn, p_ptt, IGU_REG_LEADING_EDGE_LATCH, 0); + qed_wr(p_hwfn, p_ptt, IGU_REG_TRAILING_EDGE_LATCH, 0); + + qed_int_igu_init_pure_rt(p_hwfn, p_ptt, false, true); + + /* Need to wait 1ms to guarantee SBs are cleared */ + usleep_range(1000, 2000); + + /* Disable PF in HW blocks */ + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_DB_ENABLE, 0); + qed_wr(p_hwfn, p_ptt, QM_REG_PF_EN, 0); + + qed_mcp_unload_done(p_hwfn, p_ptt); + if (rc) { + DP_NOTICE(p_hwfn, + "Failed sending a UNLOAD_DONE command. rc = %d.\n", + rc); + rc2 = -EINVAL; + } + } + + if (IS_PF(cdev)) { + p_hwfn = QED_LEADING_HWFN(cdev); + p_ptt = QED_LEADING_HWFN(cdev)->p_main_ptt; + + /* Disable DMAE in PXP - in CMT, this should only be done for + * first hw-function, and only after all transactions have + * stopped for all active hw-functions. + */ + rc = qed_change_pci_hwfn(p_hwfn, p_ptt, false); + if (rc) { + DP_NOTICE(p_hwfn, + "qed_change_pci_hwfn failed. rc = %d.\n", rc); + rc2 = -EINVAL; + } + } + + return rc2; +} + +int qed_hw_stop_fastpath(struct qed_dev *cdev) +{ + int j; + + for_each_hwfn(cdev, j) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[j]; + struct qed_ptt *p_ptt; + + if (IS_VF(cdev)) { + qed_vf_pf_int_cleanup(p_hwfn); + continue; + } + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EAGAIN; + + DP_VERBOSE(p_hwfn, + NETIF_MSG_IFDOWN, "Shutting down the fastpath\n"); + + qed_wr(p_hwfn, p_ptt, + NIG_REG_RX_LLH_BRB_GATE_DNTFWD_PERPF, 0x1); + + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_TCP, 0x0); + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_UDP, 0x0); + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_FCOE, 0x0); + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_ROCE, 0x0); + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_OPENFLOW, 0x0); + + qed_int_igu_init_pure_rt(p_hwfn, p_ptt, false, false); + + /* Need to wait 1ms to guarantee SBs are cleared */ + usleep_range(1000, 2000); + qed_ptt_release(p_hwfn, p_ptt); + } + + return 0; +} + +int qed_hw_start_fastpath(struct qed_hwfn *p_hwfn) +{ + struct qed_ptt *p_ptt; + + if (IS_VF(p_hwfn->cdev)) + return 0; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EAGAIN; + + /* If roce info is allocated it means roce is initialized and should + * be enabled in searcher. 
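/*
 * A hedged sketch of the PTT usage pattern visible in qed_hw_stop_fastpath()
 * and qed_hw_start_fastpath() above: acquire a PTT window, bail out with
 * -EAGAIN if none is free, perform the register accesses, always release the
 * window.  The wrapper and its callback are invented for the example and
 * assume the driver's struct qed_hwfn/struct qed_ptt types; only
 * qed_ptt_acquire()/qed_ptt_release() are the driver's own calls, used as in
 * the functions above.
 */
static int ex_with_ptt(struct qed_hwfn *p_hwfn,
		       void (*body)(struct qed_hwfn *, struct qed_ptt *))
{
	struct qed_ptt *p_ptt = qed_ptt_acquire(p_hwfn);

	if (!p_ptt)
		return -EAGAIN;	/* all PTT windows busy; caller may retry */

	body(p_hwfn, p_ptt);	/* register reads/writes go through p_ptt */

	qed_ptt_release(p_hwfn, p_ptt);
	return 0;
}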
+ */ + if (p_hwfn->p_rdma_info && + p_hwfn->b_rdma_enabled_in_prs) + qed_wr(p_hwfn, p_ptt, p_hwfn->rdma_prs_search_reg, 0x1); + + /* Re-open incoming traffic */ + qed_wr(p_hwfn, p_ptt, NIG_REG_RX_LLH_BRB_GATE_DNTFWD_PERPF, 0x0); + qed_ptt_release(p_hwfn, p_ptt); + + return 0; +} + +/* Free hwfn memory and resources acquired in hw_hwfn_prepare */ +static void qed_hw_hwfn_free(struct qed_hwfn *p_hwfn) +{ + qed_ptt_pool_free(p_hwfn); + kfree(p_hwfn->hw_info.p_igu_info); + p_hwfn->hw_info.p_igu_info = NULL; +} + +/* Setup bar access */ +static void qed_hw_hwfn_prepare(struct qed_hwfn *p_hwfn) +{ + /* clear indirect access */ + if (QED_IS_AH(p_hwfn->cdev)) { + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_E8_F0_K2, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_EC_F0_K2, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_F0_F0_K2, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_F4_F0_K2, 0); + } else { + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_88_F0_BB, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_8C_F0_BB, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_90_F0_BB, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_94_F0_BB, 0); + } + + /* Clean Previous errors if such exist */ + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_WAS_ERROR_PF_31_0_CLR, 1 << p_hwfn->abs_pf_id); + + /* enable internal target-read */ + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_INTERNAL_PFID_ENABLE_TARGET_READ, 1); +} + +static void get_function_id(struct qed_hwfn *p_hwfn) +{ + /* ME Register */ + p_hwfn->hw_info.opaque_fid = (u16) REG_RD(p_hwfn, + PXP_PF_ME_OPAQUE_ADDR); + + p_hwfn->hw_info.concrete_fid = REG_RD(p_hwfn, PXP_PF_ME_CONCRETE_ADDR); + + p_hwfn->abs_pf_id = (p_hwfn->hw_info.concrete_fid >> 16) & 0xf; + p_hwfn->rel_pf_id = GET_FIELD(p_hwfn->hw_info.concrete_fid, + PXP_CONCRETE_FID_PFID); + p_hwfn->port_id = GET_FIELD(p_hwfn->hw_info.concrete_fid, + PXP_CONCRETE_FID_PORT); + + DP_VERBOSE(p_hwfn, NETIF_MSG_PROBE, + "Read ME register: Concrete 0x%08x Opaque 0x%04x\n", + p_hwfn->hw_info.concrete_fid, p_hwfn->hw_info.opaque_fid); +} + +static void qed_hw_set_feat(struct qed_hwfn *p_hwfn) +{ + u32 *feat_num = p_hwfn->hw_info.feat_num; + struct qed_sb_cnt_info sb_cnt; + u32 non_l2_sbs = 0; + + memset(&sb_cnt, 0, sizeof(sb_cnt)); + qed_int_get_num_sbs(p_hwfn, &sb_cnt); + + if (IS_ENABLED(CONFIG_QED_RDMA) && + QED_IS_RDMA_PERSONALITY(p_hwfn)) { + /* Roce CNQ each requires: 1 status block + 1 CNQ. We divide + * the status blocks equally between L2 / RoCE but with + * consideration as to how many l2 queues / cnqs we have. 
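/*
 * A minimal sketch of the GET_FIELD()-style decoding used by
 * get_function_id() above.  The masks and shifts below are made-up stand-ins;
 * the real GET_FIELD macro and the PXP_CONCRETE_FID_PFID/PORT definitions
 * live in the HSI headers.  Only the shift-then-mask idiom is the point.
 */
#include <stdint.h>

#define EX_FID_PFID_MASK	0xfu	/* assumed field layout */
#define EX_FID_PFID_SHIFT	0
#define EX_FID_PORT_MASK	0xfu	/* assumed field layout */
#define EX_FID_PORT_SHIFT	4

#define EX_GET_FIELD(value, name) \
	(((value) >> name##_SHIFT) & name##_MASK)

static void ex_decode_concrete_fid(uint32_t concrete_fid,
				   uint8_t *rel_pf_id, uint8_t *port_id)
{
	*rel_pf_id = EX_GET_FIELD(concrete_fid, EX_FID_PFID);
	*port_id = EX_GET_FIELD(concrete_fid, EX_FID_PORT);
}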
+ */ + feat_num[QED_RDMA_CNQ] = + min_t(u32, sb_cnt.cnt / 2, + RESC_NUM(p_hwfn, QED_RDMA_CNQ_RAM)); + + non_l2_sbs = feat_num[QED_RDMA_CNQ]; + } + if (QED_IS_L2_PERSONALITY(p_hwfn)) { + /* Start by allocating VF queues, then PF's */ + feat_num[QED_VF_L2_QUE] = min_t(u32, + RESC_NUM(p_hwfn, QED_L2_QUEUE), + sb_cnt.iov_cnt); + feat_num[QED_PF_L2_QUE] = min_t(u32, + sb_cnt.cnt - non_l2_sbs, + RESC_NUM(p_hwfn, + QED_L2_QUEUE) - + FEAT_NUM(p_hwfn, + QED_VF_L2_QUE)); + } + + if (QED_IS_FCOE_PERSONALITY(p_hwfn)) + feat_num[QED_FCOE_CQ] = min_t(u32, sb_cnt.cnt, + RESC_NUM(p_hwfn, + QED_CMDQS_CQS)); + + if (QED_IS_ISCSI_PERSONALITY(p_hwfn)) + feat_num[QED_ISCSI_CQ] = min_t(u32, sb_cnt.cnt, + RESC_NUM(p_hwfn, + QED_CMDQS_CQS)); + DP_VERBOSE(p_hwfn, + NETIF_MSG_PROBE, + "#PF_L2_QUEUES=%d VF_L2_QUEUES=%d #ROCE_CNQ=%d FCOE_CQ=%d ISCSI_CQ=%d #SBS=%d\n", + (int)FEAT_NUM(p_hwfn, QED_PF_L2_QUE), + (int)FEAT_NUM(p_hwfn, QED_VF_L2_QUE), + (int)FEAT_NUM(p_hwfn, QED_RDMA_CNQ), + (int)FEAT_NUM(p_hwfn, QED_FCOE_CQ), + (int)FEAT_NUM(p_hwfn, QED_ISCSI_CQ), + (int)sb_cnt.cnt); +} + +const char *qed_hw_get_resc_name(enum qed_resources res_id) +{ + switch (res_id) { + case QED_L2_QUEUE: + return "L2_QUEUE"; + case QED_VPORT: + return "VPORT"; + case QED_RSS_ENG: + return "RSS_ENG"; + case QED_PQ: + return "PQ"; + case QED_RL: + return "RL"; + case QED_MAC: + return "MAC"; + case QED_VLAN: + return "VLAN"; + case QED_RDMA_CNQ_RAM: + return "RDMA_CNQ_RAM"; + case QED_ILT: + return "ILT"; + case QED_LL2_QUEUE: + return "LL2_QUEUE"; + case QED_CMDQS_CQS: + return "CMDQS_CQS"; + case QED_RDMA_STATS_QUEUE: + return "RDMA_STATS_QUEUE"; + case QED_BDQ: + return "BDQ"; + case QED_SB: + return "SB"; + default: + return "UNKNOWN_RESOURCE"; + } +} + +static int +__qed_hw_set_soft_resc_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_resources res_id, + u32 resc_max_val, u32 *p_mcp_resp) +{ + int rc; + + rc = qed_mcp_set_resc_max_val(p_hwfn, p_ptt, res_id, + resc_max_val, p_mcp_resp); + if (rc) { + DP_NOTICE(p_hwfn, + "MFW response failure for a max value setting of resource %d [%s]\n", + res_id, qed_hw_get_resc_name(res_id)); + return rc; + } + + if (*p_mcp_resp != FW_MSG_CODE_RESOURCE_ALLOC_OK) + DP_INFO(p_hwfn, + "Failed to set the max value of resource %d [%s]. mcp_resp = 0x%08x.\n", + res_id, qed_hw_get_resc_name(res_id), *p_mcp_resp); + + return 0; +} + +static int +qed_hw_set_soft_resc_size(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + bool b_ah = QED_IS_AH(p_hwfn->cdev); + u32 resc_max_val, mcp_resp; + u8 res_id; + int rc; + + for (res_id = 0; res_id < QED_MAX_RESC; res_id++) { + switch (res_id) { + case QED_LL2_QUEUE: + resc_max_val = MAX_NUM_LL2_RX_QUEUES; + break; + case QED_RDMA_CNQ_RAM: + /* No need for a case for QED_CMDQS_CQS since + * CNQ/CMDQS are the same resource. + */ + resc_max_val = NUM_OF_GLOBAL_QUEUES; + break; + case QED_RDMA_STATS_QUEUE: + resc_max_val = b_ah ? RDMA_NUM_STATISTIC_COUNTERS_K2 + : RDMA_NUM_STATISTIC_COUNTERS_BB; + break; + case QED_BDQ: + resc_max_val = BDQ_NUM_RESOURCES; + break; + default: + continue; + } + + rc = __qed_hw_set_soft_resc_size(p_hwfn, p_ptt, res_id, + resc_max_val, &mcp_resp); + if (rc) + return rc; + + /* There's no point to continue to the next resource if the + * command is not supported by the MFW. + * We do continue if the command is supported but the resource + * is unknown to the MFW. Such a resource will be later + * configured with the default allocation values. 
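/*
 * A standalone sketch of the status-block budgeting done by
 * qed_hw_set_feat() above for an RDMA+L2 personality: RoCE CNQs get at most
 * half of the PF's SBs (capped by the CNQ RAM resource), VF L2 queues are
 * served first from the IOV SBs, and PF L2 queues take what remains.
 * Parameter names are invented for the example.
 */
#include <stdint.h>

static uint32_t ex_min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }

struct ex_feat_split {
	uint32_t rdma_cnq;
	uint32_t vf_l2;
	uint32_t pf_l2;
};

static struct ex_feat_split ex_split_sbs(uint32_t pf_sbs, uint32_t iov_sbs,
					 uint32_t cnq_ram, uint32_t l2_queues)
{
	struct ex_feat_split s;

	s.rdma_cnq = ex_min_u32(pf_sbs / 2, cnq_ram);
	s.vf_l2 = ex_min_u32(l2_queues, iov_sbs);
	s.pf_l2 = ex_min_u32(pf_sbs - s.rdma_cnq, l2_queues - s.vf_l2);
	return s;
}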
+ */ + if (mcp_resp == FW_MSG_CODE_UNSUPPORTED) + return -EINVAL; + } + + return 0; +} + +static +int qed_hw_get_dflt_resc(struct qed_hwfn *p_hwfn, + enum qed_resources res_id, + u32 *p_resc_num, u32 *p_resc_start) +{ + u8 num_funcs = p_hwfn->num_funcs_on_engine; + bool b_ah = QED_IS_AH(p_hwfn->cdev); + + switch (res_id) { + case QED_L2_QUEUE: + *p_resc_num = (b_ah ? MAX_NUM_L2_QUEUES_K2 : + MAX_NUM_L2_QUEUES_BB) / num_funcs; + break; + case QED_VPORT: + *p_resc_num = (b_ah ? MAX_NUM_VPORTS_K2 : + MAX_NUM_VPORTS_BB) / num_funcs; + break; + case QED_RSS_ENG: + *p_resc_num = (b_ah ? ETH_RSS_ENGINE_NUM_K2 : + ETH_RSS_ENGINE_NUM_BB) / num_funcs; + break; + case QED_PQ: + *p_resc_num = (b_ah ? MAX_QM_TX_QUEUES_K2 : + MAX_QM_TX_QUEUES_BB) / num_funcs; + *p_resc_num &= ~0x7; /* The granularity of the PQs is 8 */ + break; + case QED_RL: + *p_resc_num = MAX_QM_GLOBAL_RLS / num_funcs; + break; + case QED_MAC: + case QED_VLAN: + /* Each VFC resource can accommodate both a MAC and a VLAN */ + *p_resc_num = ETH_NUM_MAC_FILTERS / num_funcs; + break; + case QED_ILT: + *p_resc_num = (b_ah ? PXP_NUM_ILT_RECORDS_K2 : + PXP_NUM_ILT_RECORDS_BB) / num_funcs; + break; + case QED_LL2_QUEUE: + *p_resc_num = MAX_NUM_LL2_RX_QUEUES / num_funcs; + break; + case QED_RDMA_CNQ_RAM: + case QED_CMDQS_CQS: + /* CNQ/CMDQS are the same resource */ + *p_resc_num = NUM_OF_GLOBAL_QUEUES / num_funcs; + break; + case QED_RDMA_STATS_QUEUE: + *p_resc_num = (b_ah ? RDMA_NUM_STATISTIC_COUNTERS_K2 : + RDMA_NUM_STATISTIC_COUNTERS_BB) / num_funcs; + break; + case QED_BDQ: + if (p_hwfn->hw_info.personality != QED_PCI_ISCSI && + p_hwfn->hw_info.personality != QED_PCI_FCOE) + *p_resc_num = 0; + else + *p_resc_num = 1; + break; + case QED_SB: + /* Since we want its value to reflect whether MFW supports + * the new scheme, have a default of 0. + */ + *p_resc_num = 0; + break; + default: + return -EINVAL; + } + + switch (res_id) { + case QED_BDQ: + if (!*p_resc_num) + *p_resc_start = 0; + else if (p_hwfn->cdev->num_ports_in_engine == 4) + *p_resc_start = p_hwfn->port_id; + else if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) + *p_resc_start = p_hwfn->port_id; + else if (p_hwfn->hw_info.personality == QED_PCI_FCOE) + *p_resc_start = p_hwfn->port_id + 2; + break; + default: + *p_resc_start = *p_resc_num * p_hwfn->enabled_func_idx; + break; + } + + return 0; +} + +static int __qed_hw_set_resc_info(struct qed_hwfn *p_hwfn, + enum qed_resources res_id) +{ + u32 dflt_resc_num = 0, dflt_resc_start = 0; + u32 mcp_resp, *p_resc_num, *p_resc_start; + int rc; + + p_resc_num = &RESC_NUM(p_hwfn, res_id); + p_resc_start = &RESC_START(p_hwfn, res_id); + + rc = qed_hw_get_dflt_resc(p_hwfn, res_id, &dflt_resc_num, + &dflt_resc_start); + if (rc) { + DP_ERR(p_hwfn, + "Failed to get default amount for resource %d [%s]\n", + res_id, qed_hw_get_resc_name(res_id)); + return rc; + } + + rc = qed_mcp_get_resc_info(p_hwfn, p_hwfn->p_main_ptt, res_id, + &mcp_resp, p_resc_num, p_resc_start); + if (rc) { + DP_NOTICE(p_hwfn, + "MFW response failure for an allocation request for resource %d [%s]\n", + res_id, qed_hw_get_resc_name(res_id)); + return rc; + } + + /* Default driver values are applied in the following cases: + * - The resource allocation MB command is not supported by the MFW + * - There is an internal error in the MFW while processing the request + * - The resource ID is unknown to the MFW + */ + if (mcp_resp != FW_MSG_CODE_RESOURCE_ALLOC_OK) { + DP_INFO(p_hwfn, + "Failed to receive allocation info for resource %d [%s]. mcp_resp = 0x%x. 
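/*
 * A compact sketch of the default (no-MFW) resource carving in
 * qed_hw_get_dflt_resc() above: an even split of the engine-wide total
 * across the enabled PFs, with PQs trimmed to the hardware granularity of 8,
 * and each PF's slice starting at num * its enabled-function index.
 */
#include <stdint.h>
#include <stdbool.h>

static void ex_dflt_resc(uint32_t engine_total, uint8_t num_funcs,
			 uint8_t enabled_func_idx, bool is_pq,
			 uint32_t *num, uint32_t *start)
{
	*num = engine_total / num_funcs;
	if (is_pq)
		*num &= ~0x7u;	/* PQ granularity is 8 */
	*start = *num * enabled_func_idx;
}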
Applying default values [%d,%d].\n", + res_id, + qed_hw_get_resc_name(res_id), + mcp_resp, dflt_resc_num, dflt_resc_start); + *p_resc_num = dflt_resc_num; + *p_resc_start = dflt_resc_start; + goto out; + } + +out: + /* PQs have to divide by 8 [that's the HW granularity]. + * Reduce number so it would fit. + */ + if ((res_id == QED_PQ) && ((*p_resc_num % 8) || (*p_resc_start % 8))) { + DP_INFO(p_hwfn, + "PQs need to align by 8; Number %08x --> %08x, Start %08x --> %08x\n", + *p_resc_num, + (*p_resc_num) & ~0x7, + *p_resc_start, (*p_resc_start) & ~0x7); + *p_resc_num &= ~0x7; + *p_resc_start &= ~0x7; + } + + return 0; +} + +static int qed_hw_set_resc_info(struct qed_hwfn *p_hwfn) +{ + int rc; + u8 res_id; + + for (res_id = 0; res_id < QED_MAX_RESC; res_id++) { + rc = __qed_hw_set_resc_info(p_hwfn, res_id); + if (rc) + return rc; + } + + return 0; +} + +static int qed_hw_get_resc(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_resc_unlock_params resc_unlock_params; + struct qed_resc_lock_params resc_lock_params; + bool b_ah = QED_IS_AH(p_hwfn->cdev); + u8 res_id; + int rc; + + /* Setting the max values of the soft resources and the following + * resources allocation queries should be atomic. Since several PFs can + * run in parallel - a resource lock is needed. + * If either the resource lock or resource set value commands are not + * supported - skip the the max values setting, release the lock if + * needed, and proceed to the queries. Other failures, including a + * failure to acquire the lock, will cause this function to fail. + */ + qed_mcp_resc_lock_default_init(&resc_lock_params, &resc_unlock_params, + QED_RESC_LOCK_RESC_ALLOC, false); + + rc = qed_mcp_resc_lock(p_hwfn, p_ptt, &resc_lock_params); + if (rc && rc != -EINVAL) { + return rc; + } else if (rc == -EINVAL) { + DP_INFO(p_hwfn, + "Skip the max values setting of the soft resources since the resource lock is not supported by the MFW\n"); + } else if (!rc && !resc_lock_params.b_granted) { + DP_NOTICE(p_hwfn, + "Failed to acquire the resource lock for the resource allocation commands\n"); + return -EBUSY; + } else { + rc = qed_hw_set_soft_resc_size(p_hwfn, p_ptt); + if (rc && rc != -EINVAL) { + DP_NOTICE(p_hwfn, + "Failed to set the max values of the soft resources\n"); + goto unlock_and_exit; + } else if (rc == -EINVAL) { + DP_INFO(p_hwfn, + "Skip the max values setting of the soft resources since it is not supported by the MFW\n"); + rc = qed_mcp_resc_unlock(p_hwfn, p_ptt, + &resc_unlock_params); + if (rc) + DP_INFO(p_hwfn, + "Failed to release the resource lock for the resource allocation commands\n"); + } + } + + rc = qed_hw_set_resc_info(p_hwfn); + if (rc) + goto unlock_and_exit; + + if (resc_lock_params.b_granted && !resc_unlock_params.b_released) { + rc = qed_mcp_resc_unlock(p_hwfn, p_ptt, &resc_unlock_params); + if (rc) + DP_INFO(p_hwfn, + "Failed to release the resource lock for the resource allocation commands\n"); + } + + /* Sanity for ILT */ + if ((b_ah && (RESC_END(p_hwfn, QED_ILT) > PXP_NUM_ILT_RECORDS_K2)) || + (!b_ah && (RESC_END(p_hwfn, QED_ILT) > PXP_NUM_ILT_RECORDS_BB))) { + DP_NOTICE(p_hwfn, "Can't assign ILT pages [%08x,...,%08x]\n", + RESC_START(p_hwfn, QED_ILT), + RESC_END(p_hwfn, QED_ILT) - 1); + return -EINVAL; + } + + /* This will also learn the number of SBs from MFW */ + if (qed_int_igu_reset_cam(p_hwfn, p_ptt)) + return -EINVAL; + + qed_hw_set_feat(p_hwfn); + + for (res_id = 0; res_id < QED_MAX_RESC; res_id++) + DP_VERBOSE(p_hwfn, NETIF_MSG_PROBE, "%s = %d start = %d\n", + 
qed_hw_get_resc_name(res_id), + RESC_NUM(p_hwfn, res_id), + RESC_START(p_hwfn, res_id)); + + return 0; + +unlock_and_exit: + if (resc_lock_params.b_granted && !resc_unlock_params.b_released) + qed_mcp_resc_unlock(p_hwfn, p_ptt, &resc_unlock_params); + return rc; +} + +static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 port_cfg_addr, link_temp, nvm_cfg_addr, device_capabilities; + u32 nvm_cfg1_offset, mf_mode, addr, generic_cont0, core_cfg; + struct qed_mcp_link_capabilities *p_caps; + struct qed_mcp_link_params *link; + + /* Read global nvm_cfg address */ + nvm_cfg_addr = qed_rd(p_hwfn, p_ptt, MISC_REG_GEN_PURP_CR0); + + /* Verify MCP has initialized it */ + if (!nvm_cfg_addr) { + DP_NOTICE(p_hwfn, "Shared memory not initialized\n"); + return -EINVAL; + } + + /* Read nvm_cfg1 (Notice this is just offset, and not offsize (TBD) */ + nvm_cfg1_offset = qed_rd(p_hwfn, p_ptt, nvm_cfg_addr + 4); + + addr = MCP_REG_SCRATCH + nvm_cfg1_offset + + offsetof(struct nvm_cfg1, glob) + + offsetof(struct nvm_cfg1_glob, core_cfg); + + core_cfg = qed_rd(p_hwfn, p_ptt, addr); + + switch ((core_cfg & NVM_CFG1_GLOB_NETWORK_PORT_MODE_MASK) >> + NVM_CFG1_GLOB_NETWORK_PORT_MODE_OFFSET) { + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_BB_2X40G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_2X40G; + break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_2X50G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_2X50G; + break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_BB_1X100G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_1X100G; + break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_4X10G_F: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_4X10G_F; + break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_BB_4X10G_E: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_4X10G_E; + break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_BB_4X20G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_4X20G; + break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_1X40G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_1X40G; + break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_2X25G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_2X25G; + break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_2X10G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_2X10G; + break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_1X25G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_1X25G; + break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_4X25G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_4X25G; + break; + default: + DP_NOTICE(p_hwfn, "Unknown port mode in 0x%08x\n", core_cfg); + break; + } + + /* Read default link configuration */ + link = &p_hwfn->mcp_info->link_input; + p_caps = &p_hwfn->mcp_info->link_capabilities; + port_cfg_addr = MCP_REG_SCRATCH + nvm_cfg1_offset + + offsetof(struct nvm_cfg1, port[MFW_PORT(p_hwfn)]); + link_temp = qed_rd(p_hwfn, p_ptt, + port_cfg_addr + + offsetof(struct nvm_cfg1_port, speed_cap_mask)); + link_temp &= NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_MASK; + link->speed.advertised_speeds = link_temp; + + link_temp = link->speed.advertised_speeds; + p_hwfn->mcp_info->link_capabilities.speed_capabilities = link_temp; + + link_temp = qed_rd(p_hwfn, p_ptt, + port_cfg_addr + + offsetof(struct nvm_cfg1_port, link_settings)); + switch ((link_temp & NVM_CFG1_PORT_DRV_LINK_SPEED_MASK) >> + NVM_CFG1_PORT_DRV_LINK_SPEED_OFFSET) { + case NVM_CFG1_PORT_DRV_LINK_SPEED_AUTONEG: + link->speed.autoneg = true; + break; + case NVM_CFG1_PORT_DRV_LINK_SPEED_1G: + link->speed.forced_speed = 1000; + break; + case NVM_CFG1_PORT_DRV_LINK_SPEED_10G: + link->speed.forced_speed = 10000; + 
break; + case NVM_CFG1_PORT_DRV_LINK_SPEED_25G: + link->speed.forced_speed = 25000; + break; + case NVM_CFG1_PORT_DRV_LINK_SPEED_40G: + link->speed.forced_speed = 40000; + break; + case NVM_CFG1_PORT_DRV_LINK_SPEED_50G: + link->speed.forced_speed = 50000; + break; + case NVM_CFG1_PORT_DRV_LINK_SPEED_BB_100G: + link->speed.forced_speed = 100000; + break; + default: + DP_NOTICE(p_hwfn, "Unknown Speed in 0x%08x\n", link_temp); + } + + p_hwfn->mcp_info->link_capabilities.default_speed_autoneg = + link->speed.autoneg; + + link_temp &= NVM_CFG1_PORT_DRV_FLOW_CONTROL_MASK; + link_temp >>= NVM_CFG1_PORT_DRV_FLOW_CONTROL_OFFSET; + link->pause.autoneg = !!(link_temp & + NVM_CFG1_PORT_DRV_FLOW_CONTROL_AUTONEG); + link->pause.forced_rx = !!(link_temp & + NVM_CFG1_PORT_DRV_FLOW_CONTROL_RX); + link->pause.forced_tx = !!(link_temp & + NVM_CFG1_PORT_DRV_FLOW_CONTROL_TX); + link->loopback_mode = 0; + + if (p_hwfn->mcp_info->capabilities & FW_MB_PARAM_FEATURE_SUPPORT_EEE) { + link_temp = qed_rd(p_hwfn, p_ptt, port_cfg_addr + + offsetof(struct nvm_cfg1_port, ext_phy)); + link_temp &= NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_MASK; + link_temp >>= NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_OFFSET; + p_caps->default_eee = QED_MCP_EEE_ENABLED; + link->eee.enable = true; + switch (link_temp) { + case NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_DISABLED: + p_caps->default_eee = QED_MCP_EEE_DISABLED; + link->eee.enable = false; + break; + case NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_BALANCED: + p_caps->eee_lpi_timer = EEE_TX_TIMER_USEC_BALANCED_TIME; + break; + case NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_AGGRESSIVE: + p_caps->eee_lpi_timer = + EEE_TX_TIMER_USEC_AGGRESSIVE_TIME; + break; + case NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_LOW_LATENCY: + p_caps->eee_lpi_timer = EEE_TX_TIMER_USEC_LATENCY_TIME; + break; + } + + link->eee.tx_lpi_timer = p_caps->eee_lpi_timer; + link->eee.tx_lpi_enable = link->eee.enable; + link->eee.adv_caps = QED_EEE_1G_ADV | QED_EEE_10G_ADV; + } else { + p_caps->default_eee = QED_MCP_EEE_UNSUPPORTED; + } + + DP_VERBOSE(p_hwfn, + NETIF_MSG_LINK, + "Read default link: Speed 0x%08x, Adv. 
Speed 0x%08x, AN: 0x%02x, PAUSE AN: 0x%02x EEE: %02x [%08x usec]\n", + link->speed.forced_speed, + link->speed.advertised_speeds, + link->speed.autoneg, + link->pause.autoneg, + p_caps->default_eee, p_caps->eee_lpi_timer); + + /* Read Multi-function information from shmem */ + addr = MCP_REG_SCRATCH + nvm_cfg1_offset + + offsetof(struct nvm_cfg1, glob) + + offsetof(struct nvm_cfg1_glob, generic_cont0); + + generic_cont0 = qed_rd(p_hwfn, p_ptt, addr); + + mf_mode = (generic_cont0 & NVM_CFG1_GLOB_MF_MODE_MASK) >> + NVM_CFG1_GLOB_MF_MODE_OFFSET; + + switch (mf_mode) { + case NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED: + p_hwfn->cdev->mf_mode = QED_MF_OVLAN; + break; + case NVM_CFG1_GLOB_MF_MODE_NPAR1_0: + p_hwfn->cdev->mf_mode = QED_MF_NPAR; + break; + case NVM_CFG1_GLOB_MF_MODE_DEFAULT: + p_hwfn->cdev->mf_mode = QED_MF_DEFAULT; + break; + } + DP_INFO(p_hwfn, "Multi function mode is %08x\n", + p_hwfn->cdev->mf_mode); + + /* Read Multi-function information from shmem */ + addr = MCP_REG_SCRATCH + nvm_cfg1_offset + + offsetof(struct nvm_cfg1, glob) + + offsetof(struct nvm_cfg1_glob, device_capabilities); + + device_capabilities = qed_rd(p_hwfn, p_ptt, addr); + if (device_capabilities & NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ETHERNET) + __set_bit(QED_DEV_CAP_ETH, + &p_hwfn->hw_info.device_capabilities); + if (device_capabilities & NVM_CFG1_GLOB_DEVICE_CAPABILITIES_FCOE) + __set_bit(QED_DEV_CAP_FCOE, + &p_hwfn->hw_info.device_capabilities); + if (device_capabilities & NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ISCSI) + __set_bit(QED_DEV_CAP_ISCSI, + &p_hwfn->hw_info.device_capabilities); + if (device_capabilities & NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ROCE) + __set_bit(QED_DEV_CAP_ROCE, + &p_hwfn->hw_info.device_capabilities); + + return qed_mcp_fill_shmem_func_info(p_hwfn, p_ptt); +} + +static void qed_get_num_funcs(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u8 num_funcs, enabled_func_idx = p_hwfn->rel_pf_id; + u32 reg_function_hide, tmp, eng_mask, low_pfs_mask; + struct qed_dev *cdev = p_hwfn->cdev; + + num_funcs = QED_IS_AH(cdev) ? MAX_NUM_PFS_K2 : MAX_NUM_PFS_BB; + + /* Bit 0 of MISCS_REG_FUNCTION_HIDE indicates whether the bypass values + * in the other bits are selected. + * Bits 1-15 are for functions 1-15, respectively, and their value is + * '0' only for enabled functions (function 0 always exists and + * enabled). + * In case of CMT, only the "even" functions are enabled, and thus the + * number of functions for both hwfns is learnt from the same bits. 
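/*
 * A simplified sketch of the bit counting described in the comment above and
 * performed by qed_get_num_funcs(): a '0' bit in MISCS_REG_FUNCTION_HIDE
 * means the function is enabled, and eng_mask keeps only the bits that belong
 * to this engine.  The driver additionally special-cases function 0 (always
 * enabled) and the even/odd function split of CMT devices, and derives the
 * PF's index among the enabled functions with the same popcount idea over the
 * hidden lower-numbered functions.
 */
#include <stdint.h>

static uint8_t ex_popcount(uint32_t v)
{
	uint8_t n = 0;

	for (; v; v >>= 1)
		n += v & 1;
	return n;
}

static uint8_t ex_num_enabled_funcs(uint32_t function_hide, uint32_t eng_mask)
{
	/* invert so '1' means enabled, then keep this engine's bits only */
	return ex_popcount(~function_hide & eng_mask);
}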
+ */ + reg_function_hide = qed_rd(p_hwfn, p_ptt, MISCS_REG_FUNCTION_HIDE); + + if (reg_function_hide & 0x1) { + if (QED_IS_BB(cdev)) { + if (QED_PATH_ID(p_hwfn) && cdev->num_hwfns == 1) { + num_funcs = 0; + eng_mask = 0xaaaa; + } else { + num_funcs = 1; + eng_mask = 0x5554; + } + } else { + num_funcs = 1; + eng_mask = 0xfffe; + } + + /* Get the number of the enabled functions on the engine */ + tmp = (reg_function_hide ^ 0xffffffff) & eng_mask; + while (tmp) { + if (tmp & 0x1) + num_funcs++; + tmp >>= 0x1; + } + + /* Get the PF index within the enabled functions */ + low_pfs_mask = (0x1 << p_hwfn->abs_pf_id) - 1; + tmp = reg_function_hide & eng_mask & low_pfs_mask; + while (tmp) { + if (tmp & 0x1) + enabled_func_idx--; + tmp >>= 0x1; + } + } + + p_hwfn->num_funcs_on_engine = num_funcs; + p_hwfn->enabled_func_idx = enabled_func_idx; + + DP_VERBOSE(p_hwfn, + NETIF_MSG_PROBE, + "PF [rel_id %d, abs_id %d] occupies index %d within the %d enabled functions on the engine\n", + p_hwfn->rel_pf_id, + p_hwfn->abs_pf_id, + p_hwfn->enabled_func_idx, p_hwfn->num_funcs_on_engine); +} + +static void qed_hw_info_port_num_bb(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 port_mode; + + port_mode = qed_rd(p_hwfn, p_ptt, CNIG_REG_NW_PORT_MODE_BB_B0); + + if (port_mode < 3) { + p_hwfn->cdev->num_ports_in_engine = 1; + } else if (port_mode <= 5) { + p_hwfn->cdev->num_ports_in_engine = 2; + } else { + DP_NOTICE(p_hwfn, "PORT MODE: %d not supported\n", + p_hwfn->cdev->num_ports_in_engine); + + /* Default num_ports_in_engine to something */ + p_hwfn->cdev->num_ports_in_engine = 1; + } +} + +static void qed_hw_info_port_num_ah(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 port; + int i; + + p_hwfn->cdev->num_ports_in_engine = 0; + + for (i = 0; i < MAX_NUM_PORTS_K2; i++) { + port = qed_rd(p_hwfn, p_ptt, + CNIG_REG_NIG_PORT0_CONF_K2 + (i * 4)); + if (port & 1) + p_hwfn->cdev->num_ports_in_engine++; + } + + if (!p_hwfn->cdev->num_ports_in_engine) { + DP_NOTICE(p_hwfn, "All NIG ports are inactive\n"); + + /* Default num_ports_in_engine to something */ + p_hwfn->cdev->num_ports_in_engine = 1; + } +} + +static void qed_hw_info_port_num(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + if (QED_IS_BB(p_hwfn->cdev)) + qed_hw_info_port_num_bb(p_hwfn, p_ptt); + else + qed_hw_info_port_num_ah(p_hwfn, p_ptt); +} + +static void qed_get_eee_caps(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_mcp_link_capabilities *p_caps; + u32 eee_status; + + p_caps = &p_hwfn->mcp_info->link_capabilities; + if (p_caps->default_eee == QED_MCP_EEE_UNSUPPORTED) + return; + + p_caps->eee_speed_caps = 0; + eee_status = qed_rd(p_hwfn, p_ptt, p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, eee_status)); + eee_status = (eee_status & EEE_SUPPORTED_SPEED_MASK) >> + EEE_SUPPORTED_SPEED_OFFSET; + + if (eee_status & EEE_1G_SUPPORTED) + p_caps->eee_speed_caps |= QED_EEE_1G_ADV; + if (eee_status & EEE_10G_ADV) + p_caps->eee_speed_caps |= QED_EEE_10G_ADV; +} + +static int +qed_get_hw_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_pci_personality personality) +{ + int rc; + + /* Since all information is common, only first hwfns should do this */ + if (IS_LEAD_HWFN(p_hwfn)) { + rc = qed_iov_hw_info(p_hwfn); + if (rc) + return rc; + } + + qed_hw_info_port_num(p_hwfn, p_ptt); + + qed_mcp_get_capabilities(p_hwfn, p_ptt); + + qed_hw_get_nvm_info(p_hwfn, p_ptt); + + rc = qed_int_igu_read_cam(p_hwfn, p_ptt); + if (rc) + return rc; + + if (qed_mcp_is_init(p_hwfn)) + 
ether_addr_copy(p_hwfn->hw_info.hw_mac_addr, + p_hwfn->mcp_info->func_info.mac); + else + eth_random_addr(p_hwfn->hw_info.hw_mac_addr); + + if (qed_mcp_is_init(p_hwfn)) { + if (p_hwfn->mcp_info->func_info.ovlan != QED_MCP_VLAN_UNSET) + p_hwfn->hw_info.ovlan = + p_hwfn->mcp_info->func_info.ovlan; + + qed_mcp_cmd_port_init(p_hwfn, p_ptt); + + qed_get_eee_caps(p_hwfn, p_ptt); + } + + if (qed_mcp_is_init(p_hwfn)) { + enum qed_pci_personality protocol; + + protocol = p_hwfn->mcp_info->func_info.protocol; + p_hwfn->hw_info.personality = protocol; + } + + p_hwfn->hw_info.num_hw_tc = NUM_PHYS_TCS_4PORT_K2; + p_hwfn->hw_info.num_active_tc = 1; + + qed_get_num_funcs(p_hwfn, p_ptt); + + if (qed_mcp_is_init(p_hwfn)) + p_hwfn->hw_info.mtu = p_hwfn->mcp_info->func_info.mtu; + + return qed_hw_get_resc(p_hwfn, p_ptt); +} + +static int qed_get_dev_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_dev *cdev = p_hwfn->cdev; + u16 device_id_mask; + u32 tmp; + + /* Read Vendor Id / Device Id */ + pci_read_config_word(cdev->pdev, PCI_VENDOR_ID, &cdev->vendor_id); + pci_read_config_word(cdev->pdev, PCI_DEVICE_ID, &cdev->device_id); + + /* Determine type */ + device_id_mask = cdev->device_id & QED_DEV_ID_MASK; + switch (device_id_mask) { + case QED_DEV_ID_MASK_BB: + cdev->type = QED_DEV_TYPE_BB; + break; + case QED_DEV_ID_MASK_AH: + cdev->type = QED_DEV_TYPE_AH; + break; + default: + DP_NOTICE(p_hwfn, "Unknown device id 0x%x\n", cdev->device_id); + return -EBUSY; + } + + cdev->chip_num = (u16)qed_rd(p_hwfn, p_ptt, MISCS_REG_CHIP_NUM); + cdev->chip_rev = (u16)qed_rd(p_hwfn, p_ptt, MISCS_REG_CHIP_REV); + + MASK_FIELD(CHIP_REV, cdev->chip_rev); + + /* Learn number of HW-functions */ + tmp = qed_rd(p_hwfn, p_ptt, MISCS_REG_CMT_ENABLED_FOR_PAIR); + + if (tmp & (1 << p_hwfn->rel_pf_id)) { + DP_NOTICE(cdev->hwfns, "device in CMT mode\n"); + cdev->num_hwfns = 2; + } else { + cdev->num_hwfns = 1; + } + + cdev->chip_bond_id = qed_rd(p_hwfn, p_ptt, + MISCS_REG_CHIP_TEST_REG) >> 4; + MASK_FIELD(CHIP_BOND_ID, cdev->chip_bond_id); + cdev->chip_metal = (u16)qed_rd(p_hwfn, p_ptt, MISCS_REG_CHIP_METAL); + MASK_FIELD(CHIP_METAL, cdev->chip_metal); + + DP_INFO(cdev->hwfns, + "Chip details - %s %c%d, Num: %04x Rev: %04x Bond id: %04x Metal: %04x\n", + QED_IS_BB(cdev) ? 
"BB" : "AH", + 'A' + cdev->chip_rev, + (int)cdev->chip_metal, + cdev->chip_num, cdev->chip_rev, + cdev->chip_bond_id, cdev->chip_metal); + + return 0; +} + +static void qed_nvm_info_free(struct qed_hwfn *p_hwfn) +{ + kfree(p_hwfn->nvm_info.image_att); + p_hwfn->nvm_info.image_att = NULL; +} + +static int qed_hw_prepare_single(struct qed_hwfn *p_hwfn, + void __iomem *p_regview, + void __iomem *p_doorbells, + enum qed_pci_personality personality) +{ + int rc = 0; + + /* Split PCI bars evenly between hwfns */ + p_hwfn->regview = p_regview; + p_hwfn->doorbells = p_doorbells; + + if (IS_VF(p_hwfn->cdev)) + return qed_vf_hw_prepare(p_hwfn); + + /* Validate that chip access is feasible */ + if (REG_RD(p_hwfn, PXP_PF_ME_OPAQUE_ADDR) == 0xffffffff) { + DP_ERR(p_hwfn, + "Reading the ME register returns all Fs; Preventing further chip access\n"); + return -EINVAL; + } + + get_function_id(p_hwfn); + + /* Allocate PTT pool */ + rc = qed_ptt_pool_alloc(p_hwfn); + if (rc) + goto err0; + + /* Allocate the main PTT */ + p_hwfn->p_main_ptt = qed_get_reserved_ptt(p_hwfn, RESERVED_PTT_MAIN); + + /* First hwfn learns basic information, e.g., number of hwfns */ + if (!p_hwfn->my_id) { + rc = qed_get_dev_info(p_hwfn, p_hwfn->p_main_ptt); + if (rc) + goto err1; + } + + qed_hw_hwfn_prepare(p_hwfn); + + /* Initialize MCP structure */ + rc = qed_mcp_cmd_init(p_hwfn, p_hwfn->p_main_ptt); + if (rc) { + DP_NOTICE(p_hwfn, "Failed initializing mcp command\n"); + goto err1; + } + + /* Read the device configuration information from the HW and SHMEM */ + rc = qed_get_hw_info(p_hwfn, p_hwfn->p_main_ptt, personality); + if (rc) { + DP_NOTICE(p_hwfn, "Failed to get HW information\n"); + goto err2; + } + + /* Sending a mailbox to the MFW should be done after qed_get_hw_info() + * is called as it sets the ports number in an engine. 
+ */ + if (IS_LEAD_HWFN(p_hwfn)) { + rc = qed_mcp_initiate_pf_flr(p_hwfn, p_hwfn->p_main_ptt); + if (rc) + DP_NOTICE(p_hwfn, "Failed to initiate PF FLR\n"); + } + + /* NVRAM info initialization and population */ + if (IS_LEAD_HWFN(p_hwfn)) { + rc = qed_mcp_nvm_info_populate(p_hwfn); + if (rc) { + DP_NOTICE(p_hwfn, + "Failed to populate nvm info shadow\n"); + goto err2; + } + } + + /* Allocate the init RT array and initialize the init-ops engine */ + rc = qed_init_alloc(p_hwfn); + if (rc) + goto err3; + + return rc; +err3: + if (IS_LEAD_HWFN(p_hwfn)) + qed_nvm_info_free(p_hwfn); +err2: + if (IS_LEAD_HWFN(p_hwfn)) + qed_iov_free_hw_info(p_hwfn->cdev); + qed_mcp_free(p_hwfn); +err1: + qed_hw_hwfn_free(p_hwfn); +err0: + return rc; +} + +int qed_hw_prepare(struct qed_dev *cdev, + int personality) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + int rc; + + /* Store the precompiled init data ptrs */ + if (IS_PF(cdev)) + qed_init_iro_array(cdev); + + /* Initialize the first hwfn - will learn number of hwfns */ + rc = qed_hw_prepare_single(p_hwfn, + cdev->regview, + cdev->doorbells, personality); + if (rc) + return rc; + + personality = p_hwfn->hw_info.personality; + + /* Initialize the rest of the hwfns */ + if (cdev->num_hwfns > 1) { + void __iomem *p_regview, *p_doorbell; + u8 __iomem *addr; + + /* adjust bar offset for second engine */ + addr = cdev->regview + + qed_hw_bar_size(p_hwfn, p_hwfn->p_main_ptt, + BAR_ID_0) / 2; + p_regview = addr; + + addr = cdev->doorbells + + qed_hw_bar_size(p_hwfn, p_hwfn->p_main_ptt, + BAR_ID_1) / 2; + p_doorbell = addr; + + /* prepare second hw function */ + rc = qed_hw_prepare_single(&cdev->hwfns[1], p_regview, + p_doorbell, personality); + + /* in case of error, need to free the previously + * initiliazed hwfn 0. 
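/*
 * A minimal sketch of the CMT BAR split performed in qed_hw_prepare() above:
 * the second hw-function's register and doorbell views simply start half-way
 * into the device's BAR mappings, so each engine gets an equal share.  Types
 * are simplified for the example (the driver uses __iomem pointers and
 * qed_hw_bar_size()).
 */
#include <stdint.h>

static void *ex_second_hwfn_view(void *bar_base, uint32_t bar_size)
{
	/* hwfn[1] uses the upper half of the BAR; hwfn[0] keeps the base */
	return (uint8_t *)bar_base + bar_size / 2;
}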
+ */ + if (rc) { + if (IS_PF(cdev)) { + qed_init_free(p_hwfn); + qed_nvm_info_free(p_hwfn); + qed_mcp_free(p_hwfn); + qed_hw_hwfn_free(p_hwfn); + } + } + } + + return rc; +} + +void qed_hw_remove(struct qed_dev *cdev) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + int i; + + if (IS_PF(cdev)) + qed_mcp_ov_update_driver_state(p_hwfn, p_hwfn->p_main_ptt, + QED_OV_DRIVER_STATE_NOT_LOADED); + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + if (IS_VF(cdev)) { + qed_vf_pf_release(p_hwfn); + continue; + } + + qed_init_free(p_hwfn); + qed_hw_hwfn_free(p_hwfn); + qed_mcp_free(p_hwfn); + } + + qed_iov_free_hw_info(cdev); + + qed_nvm_info_free(p_hwfn); +} + +static void qed_chain_free_next_ptr(struct qed_dev *cdev, + struct qed_chain *p_chain) +{ + void *p_virt = p_chain->p_virt_addr, *p_virt_next = NULL; + dma_addr_t p_phys = p_chain->p_phys_addr, p_phys_next = 0; + struct qed_chain_next *p_next; + u32 size, i; + + if (!p_virt) + return; + + size = p_chain->elem_size * p_chain->usable_per_page; + + for (i = 0; i < p_chain->page_cnt; i++) { + if (!p_virt) + break; + + p_next = (struct qed_chain_next *)((u8 *)p_virt + size); + p_virt_next = p_next->next_virt; + p_phys_next = HILO_DMA_REGPAIR(p_next->next_phys); + + dma_free_coherent(&cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, p_virt, p_phys); + + p_virt = p_virt_next; + p_phys = p_phys_next; + } +} + +static void qed_chain_free_single(struct qed_dev *cdev, + struct qed_chain *p_chain) +{ + if (!p_chain->p_virt_addr) + return; + + dma_free_coherent(&cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + p_chain->p_virt_addr, p_chain->p_phys_addr); +} + +static void qed_chain_free_pbl(struct qed_dev *cdev, struct qed_chain *p_chain) +{ + void **pp_virt_addr_tbl = p_chain->pbl.pp_virt_addr_tbl; + u32 page_cnt = p_chain->page_cnt, i, pbl_size; + u8 *p_pbl_virt = p_chain->pbl_sp.p_virt_table; + + if (!pp_virt_addr_tbl) + return; + + if (!p_pbl_virt) + goto out; + + for (i = 0; i < page_cnt; i++) { + if (!pp_virt_addr_tbl[i]) + break; + + dma_free_coherent(&cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + pp_virt_addr_tbl[i], + *(dma_addr_t *)p_pbl_virt); + + p_pbl_virt += QED_CHAIN_PBL_ENTRY_SIZE; + } + + pbl_size = page_cnt * QED_CHAIN_PBL_ENTRY_SIZE; + + if (!p_chain->b_external_pbl) + dma_free_coherent(&cdev->pdev->dev, + pbl_size, + p_chain->pbl_sp.p_virt_table, + p_chain->pbl_sp.p_phys_table); +out: + vfree(p_chain->pbl.pp_virt_addr_tbl); + p_chain->pbl.pp_virt_addr_tbl = NULL; +} + +void qed_chain_free(struct qed_dev *cdev, struct qed_chain *p_chain) +{ + switch (p_chain->mode) { + case QED_CHAIN_MODE_NEXT_PTR: + qed_chain_free_next_ptr(cdev, p_chain); + break; + case QED_CHAIN_MODE_SINGLE: + qed_chain_free_single(cdev, p_chain); + break; + case QED_CHAIN_MODE_PBL: + qed_chain_free_pbl(cdev, p_chain); + break; + } +} + +static int +qed_chain_alloc_sanity_check(struct qed_dev *cdev, + enum qed_chain_cnt_type cnt_type, + size_t elem_size, u32 page_cnt) +{ + u64 chain_size = ELEMS_PER_PAGE(elem_size) * page_cnt; + + /* The actual chain size can be larger than the maximal possible value + * after rounding up the requested elements number to pages, and after + * taking into acount the unusuable elements (next-ptr elements). + * The size of a "u16" chain can be (U16_MAX + 1) since the chain + * size/capacity fields are of a u32 type. 
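/*
 * A standalone sketch of the overflow check in qed_chain_alloc_sanity_check()
 * above.  ELEMS_PER_PAGE is assumed here to be simply page size divided by
 * element size; the real macro (and the next-ptr element reserved per page in
 * next-ptr mode) lives in the chain header.
 */
#include <stdint.h>
#include <stddef.h>

#define EX_CHAIN_PAGE_SIZE 4096u	/* assumed QED_CHAIN_PAGE_SIZE */

static int ex_chain_size_ok(int is_u16_counter, size_t elem_size,
			    uint32_t page_cnt)
{
	uint64_t chain_size =
		(EX_CHAIN_PAGE_SIZE / elem_size) * (uint64_t)page_cnt;

	/* A u16 chain may hold up to U16_MAX + 1 elements because the chain's
	 * size/capacity fields themselves are u32; a u32 chain up to U32_MAX.
	 */
	if (is_u16_counter)
		return chain_size <= (uint64_t)UINT16_MAX + 1;
	return chain_size <= UINT32_MAX;
}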
+ */ + if ((cnt_type == QED_CHAIN_CNT_TYPE_U16 && + chain_size > ((u32)U16_MAX + 1)) || + (cnt_type == QED_CHAIN_CNT_TYPE_U32 && chain_size > U32_MAX)) { + DP_NOTICE(cdev, + "The actual chain size (0x%llx) is larger than the maximal possible value\n", + chain_size); + return -EINVAL; + } + + return 0; +} + +static int +qed_chain_alloc_next_ptr(struct qed_dev *cdev, struct qed_chain *p_chain) +{ + void *p_virt = NULL, *p_virt_prev = NULL; + dma_addr_t p_phys = 0; + u32 i; + + for (i = 0; i < p_chain->page_cnt; i++) { + p_virt = dma_alloc_coherent(&cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + &p_phys, GFP_KERNEL); + if (!p_virt) + return -ENOMEM; + + if (i == 0) { + qed_chain_init_mem(p_chain, p_virt, p_phys); + qed_chain_reset(p_chain); + } else { + qed_chain_init_next_ptr_elem(p_chain, p_virt_prev, + p_virt, p_phys); + } + + p_virt_prev = p_virt; + } + /* Last page's next element should point to the beginning of the + * chain. + */ + qed_chain_init_next_ptr_elem(p_chain, p_virt_prev, + p_chain->p_virt_addr, + p_chain->p_phys_addr); + + return 0; +} + +static int +qed_chain_alloc_single(struct qed_dev *cdev, struct qed_chain *p_chain) +{ + dma_addr_t p_phys = 0; + void *p_virt = NULL; + + p_virt = dma_alloc_coherent(&cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, &p_phys, GFP_KERNEL); + if (!p_virt) + return -ENOMEM; + + qed_chain_init_mem(p_chain, p_virt, p_phys); + qed_chain_reset(p_chain); + + return 0; +} + +static int +qed_chain_alloc_pbl(struct qed_dev *cdev, + struct qed_chain *p_chain, + struct qed_chain_ext_pbl *ext_pbl) +{ + u32 page_cnt = p_chain->page_cnt, size, i; + dma_addr_t p_phys = 0, p_pbl_phys = 0; + void **pp_virt_addr_tbl = NULL; + u8 *p_pbl_virt = NULL; + void *p_virt = NULL; + + size = page_cnt * sizeof(*pp_virt_addr_tbl); + pp_virt_addr_tbl = vzalloc(size); + if (!pp_virt_addr_tbl) + return -ENOMEM; + + /* The allocation of the PBL table is done with its full size, since it + * is expected to be successive. + * qed_chain_init_pbl_mem() is called even in a case of an allocation + * failure, since pp_virt_addr_tbl was previously allocated, and it + * should be saved to allow its freeing during the error flow. 
+ */ + size = page_cnt * QED_CHAIN_PBL_ENTRY_SIZE; + + if (!ext_pbl) { + p_pbl_virt = dma_alloc_coherent(&cdev->pdev->dev, + size, &p_pbl_phys, GFP_KERNEL); + } else { + p_pbl_virt = ext_pbl->p_pbl_virt; + p_pbl_phys = ext_pbl->p_pbl_phys; + p_chain->b_external_pbl = true; + } + + qed_chain_init_pbl_mem(p_chain, p_pbl_virt, p_pbl_phys, + pp_virt_addr_tbl); + if (!p_pbl_virt) + return -ENOMEM; + + for (i = 0; i < page_cnt; i++) { + p_virt = dma_alloc_coherent(&cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + &p_phys, GFP_KERNEL); + if (!p_virt) + return -ENOMEM; + + if (i == 0) { + qed_chain_init_mem(p_chain, p_virt, p_phys); + qed_chain_reset(p_chain); + } + + /* Fill the PBL table with the physical address of the page */ + *(dma_addr_t *)p_pbl_virt = p_phys; + /* Keep the virtual address of the page */ + p_chain->pbl.pp_virt_addr_tbl[i] = p_virt; + + p_pbl_virt += QED_CHAIN_PBL_ENTRY_SIZE; + } + + return 0; +} + +int qed_chain_alloc(struct qed_dev *cdev, + enum qed_chain_use_mode intended_use, + enum qed_chain_mode mode, + enum qed_chain_cnt_type cnt_type, + u32 num_elems, + size_t elem_size, + struct qed_chain *p_chain, + struct qed_chain_ext_pbl *ext_pbl) +{ + u32 page_cnt; + int rc = 0; + + if (mode == QED_CHAIN_MODE_SINGLE) + page_cnt = 1; + else + page_cnt = QED_CHAIN_PAGE_CNT(num_elems, elem_size, mode); + + rc = qed_chain_alloc_sanity_check(cdev, cnt_type, elem_size, page_cnt); + if (rc) { + DP_NOTICE(cdev, + "Cannot allocate a chain with the given arguments:\n"); + DP_NOTICE(cdev, + "[use_mode %d, mode %d, cnt_type %d, num_elems %d, elem_size %zu]\n", + intended_use, mode, cnt_type, num_elems, elem_size); + return rc; + } + + qed_chain_init_params(p_chain, page_cnt, (u8) elem_size, intended_use, + mode, cnt_type); + + switch (mode) { + case QED_CHAIN_MODE_NEXT_PTR: + rc = qed_chain_alloc_next_ptr(cdev, p_chain); + break; + case QED_CHAIN_MODE_SINGLE: + rc = qed_chain_alloc_single(cdev, p_chain); + break; + case QED_CHAIN_MODE_PBL: + rc = qed_chain_alloc_pbl(cdev, p_chain, ext_pbl); + break; + } + if (rc) + goto nomem; + + return 0; + +nomem: + qed_chain_free(cdev, p_chain); + return rc; +} + +int qed_fw_l2_queue(struct qed_hwfn *p_hwfn, u16 src_id, u16 *dst_id) +{ + if (src_id >= RESC_NUM(p_hwfn, QED_L2_QUEUE)) { + u16 min, max; + + min = (u16) RESC_START(p_hwfn, QED_L2_QUEUE); + max = min + RESC_NUM(p_hwfn, QED_L2_QUEUE); + DP_NOTICE(p_hwfn, + "l2_queue id [%d] is not valid, available indices [%d - %d]\n", + src_id, min, max); + + return -EINVAL; + } + + *dst_id = RESC_START(p_hwfn, QED_L2_QUEUE) + src_id; + + return 0; +} + +int qed_fw_vport(struct qed_hwfn *p_hwfn, u8 src_id, u8 *dst_id) +{ + if (src_id >= RESC_NUM(p_hwfn, QED_VPORT)) { + u8 min, max; + + min = (u8)RESC_START(p_hwfn, QED_VPORT); + max = min + RESC_NUM(p_hwfn, QED_VPORT); + DP_NOTICE(p_hwfn, + "vport id [%d] is not valid, available indices [%d - %d]\n", + src_id, min, max); + + return -EINVAL; + } + + *dst_id = RESC_START(p_hwfn, QED_VPORT) + src_id; + + return 0; +} + +int qed_fw_rss_eng(struct qed_hwfn *p_hwfn, u8 src_id, u8 *dst_id) +{ + if (src_id >= RESC_NUM(p_hwfn, QED_RSS_ENG)) { + u8 min, max; + + min = (u8)RESC_START(p_hwfn, QED_RSS_ENG); + max = min + RESC_NUM(p_hwfn, QED_RSS_ENG); + DP_NOTICE(p_hwfn, + "rss_eng id [%d] is not valid, available indices [%d - %d]\n", + src_id, min, max); + + return -EINVAL; + } + + *dst_id = RESC_START(p_hwfn, QED_RSS_ENG) + src_id; + + return 0; +} + +static void qed_llh_mac_to_filter(u32 *p_high, u32 *p_low, + u8 *p_filter) +{ + *p_high = p_filter[1] | (p_filter[0] << 8); + 
*p_low = p_filter[5] | (p_filter[4] << 8) | + (p_filter[3] << 16) | (p_filter[2] << 24); +} + +int qed_llh_add_mac_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 *p_filter) +{ + u32 high = 0, low = 0, en; + int i; + + if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn))) + return 0; + + qed_llh_mac_to_filter(&high, &low, p_filter); + + /* Find a free entry and utilize it */ + for (i = 0; i < NIG_REG_LLH_FUNC_FILTER_EN_SIZE; i++) { + en = qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32)); + if (en) + continue; + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + 2 * i * sizeof(u32), low); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + (2 * i + 1) * sizeof(u32), high); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_MODE + i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE + + i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32), 1); + break; + } + if (i >= NIG_REG_LLH_FUNC_FILTER_EN_SIZE) { + DP_NOTICE(p_hwfn, + "Failed to find an empty LLH filter to utilize\n"); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "mac: %pM is added at %d\n", + p_filter, i); + + return 0; +} + +void qed_llh_remove_mac_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 *p_filter) +{ + u32 high = 0, low = 0; + int i; + + if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn))) + return; + + qed_llh_mac_to_filter(&high, &low, p_filter); + + /* Find the entry and clean it */ + for (i = 0; i < NIG_REG_LLH_FUNC_FILTER_EN_SIZE; i++) { + if (qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + 2 * i * sizeof(u32)) != low) + continue; + if (qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + (2 * i + 1) * sizeof(u32)) != high) + continue; + + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + 2 * i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + (2 * i + 1) * sizeof(u32), 0); + + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "mac: %pM is removed from %d\n", + p_filter, i); + break; + } + if (i >= NIG_REG_LLH_FUNC_FILTER_EN_SIZE) + DP_NOTICE(p_hwfn, "Tried to remove a non-configured filter\n"); +} + +int +qed_llh_add_protocol_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 source_port_or_eth_type, + u16 dest_port, enum qed_llh_port_filter_type_t type) +{ + u32 high = 0, low = 0, en; + int i; + + if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn))) + return 0; + + switch (type) { + case QED_LLH_FILTER_ETHERTYPE: + high = source_port_or_eth_type; + break; + case QED_LLH_FILTER_TCP_SRC_PORT: + case QED_LLH_FILTER_UDP_SRC_PORT: + low = source_port_or_eth_type << 16; + break; + case QED_LLH_FILTER_TCP_DEST_PORT: + case QED_LLH_FILTER_UDP_DEST_PORT: + low = dest_port; + break; + case QED_LLH_FILTER_TCP_SRC_AND_DEST_PORT: + case QED_LLH_FILTER_UDP_SRC_AND_DEST_PORT: + low = (source_port_or_eth_type << 16) | dest_port; + break; + default: + DP_NOTICE(p_hwfn, + "Non valid LLH protocol filter type %d\n", type); + return -EINVAL; + } + /* Find a free entry and utilize it */ + for (i = 0; i < NIG_REG_LLH_FUNC_FILTER_EN_SIZE; i++) { + en = qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32)); + if (en) + continue; + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + 2 * i * sizeof(u32), low); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + (2 * i + 1) * sizeof(u32), high); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_MODE + i * 
sizeof(u32), 1); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE + + i * sizeof(u32), 1 << type); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32), 1); + break; + } + if (i >= NIG_REG_LLH_FUNC_FILTER_EN_SIZE) { + DP_NOTICE(p_hwfn, + "Failed to find an empty LLH filter to utilize\n"); + return -EINVAL; + } + switch (type) { + case QED_LLH_FILTER_ETHERTYPE: + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "ETH type %x is added at %d\n", + source_port_or_eth_type, i); + break; + case QED_LLH_FILTER_TCP_SRC_PORT: + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "TCP src port %x is added at %d\n", + source_port_or_eth_type, i); + break; + case QED_LLH_FILTER_UDP_SRC_PORT: + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "UDP src port %x is added at %d\n", + source_port_or_eth_type, i); + break; + case QED_LLH_FILTER_TCP_DEST_PORT: + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "TCP dst port %x is added at %d\n", dest_port, i); + break; + case QED_LLH_FILTER_UDP_DEST_PORT: + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "UDP dst port %x is added at %d\n", dest_port, i); + break; + case QED_LLH_FILTER_TCP_SRC_AND_DEST_PORT: + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "TCP src/dst ports %x/%x are added at %d\n", + source_port_or_eth_type, dest_port, i); + break; + case QED_LLH_FILTER_UDP_SRC_AND_DEST_PORT: + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "UDP src/dst ports %x/%x are added at %d\n", + source_port_or_eth_type, dest_port, i); + break; + } + return 0; +} + +void +qed_llh_remove_protocol_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 source_port_or_eth_type, + u16 dest_port, + enum qed_llh_port_filter_type_t type) +{ + u32 high = 0, low = 0; + int i; + + if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn))) + return; + + switch (type) { + case QED_LLH_FILTER_ETHERTYPE: + high = source_port_or_eth_type; + break; + case QED_LLH_FILTER_TCP_SRC_PORT: + case QED_LLH_FILTER_UDP_SRC_PORT: + low = source_port_or_eth_type << 16; + break; + case QED_LLH_FILTER_TCP_DEST_PORT: + case QED_LLH_FILTER_UDP_DEST_PORT: + low = dest_port; + break; + case QED_LLH_FILTER_TCP_SRC_AND_DEST_PORT: + case QED_LLH_FILTER_UDP_SRC_AND_DEST_PORT: + low = (source_port_or_eth_type << 16) | dest_port; + break; + default: + DP_NOTICE(p_hwfn, + "Non valid LLH protocol filter type %d\n", type); + return; + } + + for (i = 0; i < NIG_REG_LLH_FUNC_FILTER_EN_SIZE; i++) { + if (!qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32))) + continue; + if (!qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_MODE + i * sizeof(u32))) + continue; + if (!(qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE + + i * sizeof(u32)) & BIT(type))) + continue; + if (qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + 2 * i * sizeof(u32)) != low) + continue; + if (qed_rd(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + (2 * i + 1) * sizeof(u32)) != high) + continue; + + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_EN + i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_MODE + i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE + + i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + 2 * i * sizeof(u32), 0); + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_FILTER_VALUE + + (2 * i + 1) * sizeof(u32), 0); + break; + } + + if (i >= NIG_REG_LLH_FUNC_FILTER_EN_SIZE) + DP_NOTICE(p_hwfn, "Tried to remove a non-configured filter\n"); +} + +static int qed_set_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u32 hw_addr, void *p_eth_qzone, + size_t eth_qzone_size, 
u8 timeset) +{ + struct coalescing_timeset *p_coal_timeset; + + if (p_hwfn->cdev->int_coalescing_mode != QED_COAL_MODE_ENABLE) { + DP_NOTICE(p_hwfn, "Coalescing configuration not enabled\n"); + return -EINVAL; + } + + p_coal_timeset = p_eth_qzone; + memset(p_eth_qzone, 0, eth_qzone_size); + SET_FIELD(p_coal_timeset->value, COALESCING_TIMESET_TIMESET, timeset); + SET_FIELD(p_coal_timeset->value, COALESCING_TIMESET_VALID, 1); + qed_memcpy_to(p_hwfn, p_ptt, hw_addr, p_eth_qzone, eth_qzone_size); + + return 0; +} + +int qed_set_queue_coalesce(u16 rx_coal, u16 tx_coal, void *p_handle) +{ + struct qed_queue_cid *p_cid = p_handle; + struct qed_hwfn *p_hwfn; + struct qed_ptt *p_ptt; + int rc = 0; + + p_hwfn = p_cid->p_owner; + + if (IS_VF(p_hwfn->cdev)) + return qed_vf_pf_set_coalesce(p_hwfn, rx_coal, tx_coal, p_cid); + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EAGAIN; + + if (rx_coal) { + rc = qed_set_rxq_coalesce(p_hwfn, p_ptt, rx_coal, p_cid); + if (rc) + goto out; + p_hwfn->cdev->rx_coalesce_usecs = rx_coal; + } + + if (tx_coal) { + rc = qed_set_txq_coalesce(p_hwfn, p_ptt, tx_coal, p_cid); + if (rc) + goto out; + p_hwfn->cdev->tx_coalesce_usecs = tx_coal; + } +out: + qed_ptt_release(p_hwfn, p_ptt); + return rc; +} + +int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 coalesce, struct qed_queue_cid *p_cid) +{ + struct ustorm_eth_queue_zone eth_qzone; + u8 timeset, timer_res; + u32 address; + int rc; + + /* Coalesce = (timeset << timer-resolution), timeset is 7bit wide */ + if (coalesce <= 0x7F) { + timer_res = 0; + } else if (coalesce <= 0xFF) { + timer_res = 1; + } else if (coalesce <= 0x1FF) { + timer_res = 2; + } else { + DP_ERR(p_hwfn, "Invalid coalesce value - %d\n", coalesce); + return -EINVAL; + } + timeset = (u8)(coalesce >> timer_res); + + rc = qed_int_set_timer_res(p_hwfn, p_ptt, timer_res, + p_cid->sb_igu_id, false); + if (rc) + goto out; + + address = BAR0_MAP_REG_USDM_RAM + + USTORM_ETH_QUEUE_ZONE_OFFSET(p_cid->abs.queue_id); + + rc = qed_set_coalesce(p_hwfn, p_ptt, address, &eth_qzone, + sizeof(struct ustorm_eth_queue_zone), timeset); + if (rc) + goto out; + +out: + return rc; +} + +int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 coalesce, struct qed_queue_cid *p_cid) +{ + struct xstorm_eth_queue_zone eth_qzone; + u8 timeset, timer_res; + u32 address; + int rc; + + /* Coalesce = (timeset << timer-resolution), timeset is 7bit wide */ + if (coalesce <= 0x7F) { + timer_res = 0; + } else if (coalesce <= 0xFF) { + timer_res = 1; + } else if (coalesce <= 0x1FF) { + timer_res = 2; + } else { + DP_ERR(p_hwfn, "Invalid coalesce value - %d\n", coalesce); + return -EINVAL; + } + timeset = (u8)(coalesce >> timer_res); + + rc = qed_int_set_timer_res(p_hwfn, p_ptt, timer_res, + p_cid->sb_igu_id, true); + if (rc) + goto out; + + address = BAR0_MAP_REG_XSDM_RAM + + XSTORM_ETH_QUEUE_ZONE_OFFSET(p_cid->abs.queue_id); + + rc = qed_set_coalesce(p_hwfn, p_ptt, address, &eth_qzone, + sizeof(struct xstorm_eth_queue_zone), timeset); +out: + return rc; +} + +/* Calculate final WFQ values for all vports and configure them.
+ * After this configuration each vport will have + * approx min rate = min_pf_rate * (vport_wfq / QED_WFQ_UNIT) + */ +static void qed_configure_wfq_for_all_vports(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 min_pf_rate) +{ + struct init_qm_vport_params *vport_params; + int i; + + vport_params = p_hwfn->qm_info.qm_vport_params; + + for (i = 0; i < p_hwfn->qm_info.num_vports; i++) { + u32 wfq_speed = p_hwfn->qm_info.wfq_data[i].min_speed; + + vport_params[i].vport_wfq = (wfq_speed * QED_WFQ_UNIT) / + min_pf_rate; + qed_init_vport_wfq(p_hwfn, p_ptt, + vport_params[i].first_tx_pq_id, + vport_params[i].vport_wfq); + } +} + +static void qed_init_wfq_default_param(struct qed_hwfn *p_hwfn, + u32 min_pf_rate) + +{ + int i; + + for (i = 0; i < p_hwfn->qm_info.num_vports; i++) + p_hwfn->qm_info.qm_vport_params[i].vport_wfq = 1; +} + +static void qed_disable_wfq_for_all_vports(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 min_pf_rate) +{ + struct init_qm_vport_params *vport_params; + int i; + + vport_params = p_hwfn->qm_info.qm_vport_params; + + for (i = 0; i < p_hwfn->qm_info.num_vports; i++) { + qed_init_wfq_default_param(p_hwfn, min_pf_rate); + qed_init_vport_wfq(p_hwfn, p_ptt, + vport_params[i].first_tx_pq_id, + vport_params[i].vport_wfq); + } +} + +/* This function performs several validations for WFQ + * configuration and required min rate for a given vport + * 1. req_rate must be greater than one percent of min_pf_rate. + * 2. req_rate should not cause other vports [not configured for WFQ explicitly] + * rates to get less than one percent of min_pf_rate. + * 3. total_req_min_rate [all vports min rate sum] shouldn't exceed min_pf_rate. + */ +static int qed_init_wfq_param(struct qed_hwfn *p_hwfn, + u16 vport_id, u32 req_rate, u32 min_pf_rate) +{ + u32 total_req_min_rate = 0, total_left_rate = 0, left_rate_per_vp = 0; + int non_requested_count = 0, req_count = 0, i, num_vports; + + num_vports = p_hwfn->qm_info.num_vports; + + /* Accounting for the vports which are configured for WFQ explicitly */ + for (i = 0; i < num_vports; i++) { + u32 tmp_speed; + + if ((i != vport_id) && + p_hwfn->qm_info.wfq_data[i].configured) { + req_count++; + tmp_speed = p_hwfn->qm_info.wfq_data[i].min_speed; + total_req_min_rate += tmp_speed; + } + } + + /* Include current vport data as well */ + req_count++; + total_req_min_rate += req_rate; + non_requested_count = num_vports - req_count; + + if (req_rate < min_pf_rate / QED_WFQ_UNIT) { + DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, + "Vport [%d] - Requested rate[%d Mbps] is less than one percent of configured PF min rate[%d Mbps]\n", + vport_id, req_rate, min_pf_rate); + return -EINVAL; + } + + if (num_vports > QED_WFQ_UNIT) { + DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, + "Number of vports is greater than %d\n", + QED_WFQ_UNIT); + return -EINVAL; + } + + if (total_req_min_rate > min_pf_rate) { + DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, + "Total requested min rate for all vports[%d Mbps] is greater than configured PF min rate[%d Mbps]\n", + total_req_min_rate, min_pf_rate); + return -EINVAL; + } + + total_left_rate = min_pf_rate - total_req_min_rate; + + left_rate_per_vp = total_left_rate / non_requested_count; + if (left_rate_per_vp < min_pf_rate / QED_WFQ_UNIT) { + DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, + "Non WFQ configured vports rate [%d Mbps] is less than one percent of configured PF min rate[%d Mbps]\n", + left_rate_per_vp, min_pf_rate); + return -EINVAL; + } + + p_hwfn->qm_info.wfq_data[vport_id].min_speed = req_rate; + p_hwfn->qm_info.wfq_data[vport_id].configured 
= true; + + for (i = 0; i < num_vports; i++) { + if (p_hwfn->qm_info.wfq_data[i].configured) + continue; + + p_hwfn->qm_info.wfq_data[i].min_speed = left_rate_per_vp; + } + + return 0; +} + +static int __qed_configure_vport_wfq(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 vp_id, u32 rate) +{ + struct qed_mcp_link_state *p_link; + int rc = 0; + + p_link = &p_hwfn->cdev->hwfns[0].mcp_info->link_output; + + if (!p_link->min_pf_rate) { + p_hwfn->qm_info.wfq_data[vp_id].min_speed = rate; + p_hwfn->qm_info.wfq_data[vp_id].configured = true; + return rc; + } + + rc = qed_init_wfq_param(p_hwfn, vp_id, rate, p_link->min_pf_rate); + + if (!rc) + qed_configure_wfq_for_all_vports(p_hwfn, p_ptt, + p_link->min_pf_rate); + else + DP_NOTICE(p_hwfn, + "Validation failed while configuring min rate\n"); + + return rc; +} + +static int __qed_configure_vp_wfq_on_link_change(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 min_pf_rate) +{ + bool use_wfq = false; + int rc = 0; + u16 i; + + /* Validate all pre configured vports for wfq */ + for (i = 0; i < p_hwfn->qm_info.num_vports; i++) { + u32 rate; + + if (!p_hwfn->qm_info.wfq_data[i].configured) + continue; + + rate = p_hwfn->qm_info.wfq_data[i].min_speed; + use_wfq = true; + + rc = qed_init_wfq_param(p_hwfn, i, rate, min_pf_rate); + if (rc) { + DP_NOTICE(p_hwfn, + "WFQ validation failed while configuring min rate\n"); + break; + } + } + + if (!rc && use_wfq) + qed_configure_wfq_for_all_vports(p_hwfn, p_ptt, min_pf_rate); + else + qed_disable_wfq_for_all_vports(p_hwfn, p_ptt, min_pf_rate); + + return rc; +} + +/* Main API for qed clients to configure vport min rate. + * vp_id - vport id in PF Range[0 - (total_num_vports_per_pf - 1)] + * rate - Speed in Mbps needs to be assigned to a given vport. + */ +int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate) +{ + int i, rc = -EINVAL; + + /* Currently not supported; Might change in future */ + if (cdev->num_hwfns > 1) { + DP_NOTICE(cdev, + "WFQ configuration is not supported for this device\n"); + return rc; + } + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + struct qed_ptt *p_ptt; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EBUSY; + + rc = __qed_configure_vport_wfq(p_hwfn, p_ptt, vp_id, rate); + + if (rc) { + qed_ptt_release(p_hwfn, p_ptt); + return rc; + } + + qed_ptt_release(p_hwfn, p_ptt); + } + + return rc; +} + +/* API to configure WFQ from mcp link change */ +void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, + struct qed_ptt *p_ptt, u32 min_pf_rate) +{ + int i; + + if (cdev->num_hwfns > 1) { + DP_VERBOSE(cdev, + NETIF_MSG_LINK, + "WFQ configuration is not supported for this device\n"); + return; + } + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + __qed_configure_vp_wfq_on_link_change(p_hwfn, p_ptt, + min_pf_rate); + } +} + +int __qed_configure_pf_max_bandwidth(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_link_state *p_link, + u8 max_bw) +{ + int rc = 0; + + p_hwfn->mcp_info->func_info.bandwidth_max = max_bw; + + if (!p_link->line_speed && (max_bw != 100)) + return rc; + + p_link->speed = (p_link->line_speed * max_bw) / 100; + p_hwfn->qm_info.pf_rl = p_link->speed; + + /* Since the limiter also affects Tx-switched traffic, we don't want it + * to limit such traffic in case there's no actual limit. + * In that case, set limit to imaginary high boundary. 
+ */ + if (max_bw == 100) + p_hwfn->qm_info.pf_rl = 100000; + + rc = qed_init_pf_rl(p_hwfn, p_ptt, p_hwfn->rel_pf_id, + p_hwfn->qm_info.pf_rl); + + DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, + "Configured MAX bandwidth to be %08x Mb/sec\n", + p_link->speed); + + return rc; +} + +/* Main API to configure PF max bandwidth where bw range is [1 - 100] */ +int qed_configure_pf_max_bandwidth(struct qed_dev *cdev, u8 max_bw) +{ + int i, rc = -EINVAL; + + if (max_bw < 1 || max_bw > 100) { + DP_NOTICE(cdev, "PF max bw valid range is [1-100]\n"); + return rc; + } + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + struct qed_hwfn *p_lead = QED_LEADING_HWFN(cdev); + struct qed_mcp_link_state *p_link; + struct qed_ptt *p_ptt; + + p_link = &p_lead->mcp_info->link_output; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EBUSY; + + rc = __qed_configure_pf_max_bandwidth(p_hwfn, p_ptt, + p_link, max_bw); + + qed_ptt_release(p_hwfn, p_ptt); + + if (rc) + break; + } + + return rc; +} + +int __qed_configure_pf_min_bandwidth(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_link_state *p_link, + u8 min_bw) +{ + int rc = 0; + + p_hwfn->mcp_info->func_info.bandwidth_min = min_bw; + p_hwfn->qm_info.pf_wfq = min_bw; + + if (!p_link->line_speed) + return rc; + + p_link->min_pf_rate = (p_link->line_speed * min_bw) / 100; + + rc = qed_init_pf_wfq(p_hwfn, p_ptt, p_hwfn->rel_pf_id, min_bw); + + DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, + "Configured MIN bandwidth to be %d Mb/sec\n", + p_link->min_pf_rate); + + return rc; +} + +/* Main API to configure PF min bandwidth where bw range is [1-100] */ +int qed_configure_pf_min_bandwidth(struct qed_dev *cdev, u8 min_bw) +{ + int i, rc = -EINVAL; + + if (min_bw < 1 || min_bw > 100) { + DP_NOTICE(cdev, "PF min bw valid range is [1-100]\n"); + return rc; + } + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + struct qed_hwfn *p_lead = QED_LEADING_HWFN(cdev); + struct qed_mcp_link_state *p_link; + struct qed_ptt *p_ptt; + + p_link = &p_lead->mcp_info->link_output; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EBUSY; + + rc = __qed_configure_pf_min_bandwidth(p_hwfn, p_ptt, + p_link, min_bw); + if (rc) { + qed_ptt_release(p_hwfn, p_ptt); + return rc; + } + + if (p_link->min_pf_rate) { + u32 min_rate = p_link->min_pf_rate; + + rc = __qed_configure_vp_wfq_on_link_change(p_hwfn, + p_ptt, + min_rate); + } + + qed_ptt_release(p_hwfn, p_ptt); + } + + return rc; +} + +void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_mcp_link_state *p_link; + + p_link = &p_hwfn->mcp_info->link_output; + + if (p_link->min_pf_rate) + qed_disable_wfq_for_all_vports(p_hwfn, p_ptt, + p_link->min_pf_rate); + + memset(p_hwfn->qm_info.wfq_data, 0, + sizeof(*p_hwfn->qm_info.wfq_data) * p_hwfn->qm_info.num_vports); +} + +int qed_device_num_engines(struct qed_dev *cdev) +{ + return QED_IS_BB(cdev) ? 
2 : 1; +} + +static int qed_device_num_ports(struct qed_dev *cdev) +{ + /* in CMT always only one port */ + if (cdev->num_hwfns > 1) + return 1; + + return cdev->num_ports_in_engine * qed_device_num_engines(cdev); +} + +int qed_device_get_port_id(struct qed_dev *cdev) +{ + return (QED_LEADING_HWFN(cdev)->abs_pf_id) % qed_device_num_ports(cdev); +} + +void qed_set_fw_mac_addr(__le16 *fw_msb, + __le16 *fw_mid, __le16 *fw_lsb, u8 *mac) +{ + ((u8 *)fw_msb)[0] = mac[1]; + ((u8 *)fw_msb)[1] = mac[0]; + ((u8 *)fw_mid)[0] = mac[3]; + ((u8 *)fw_mid)[1] = mac[2]; + ((u8 *)fw_lsb)[0] = mac[5]; + ((u8 *)fw_lsb)[1] = mac[4]; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h new file mode 100644 index 0000000..defdda1 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h @@ -0,0 +1,477 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_DEV_API_H +#define _QED_DEV_API_H + +#include +#include +#include +#include +#include +#include "qed_int.h" + +/** + * @brief qed_init_dp - initialize the debug level + * + * @param cdev + * @param dp_module + * @param dp_level + */ +void qed_init_dp(struct qed_dev *cdev, + u32 dp_module, + u8 dp_level); + +/** + * @brief qed_init_struct - initialize the device structure to + * its defaults + * + * @param cdev + */ +void qed_init_struct(struct qed_dev *cdev); + +/** + * @brief qed_resc_free - + * + * @param cdev + */ +void qed_resc_free(struct qed_dev *cdev); + +/** + * @brief qed_resc_alloc - + * + * @param cdev + * + * @return int + */ +int qed_resc_alloc(struct qed_dev *cdev); + +/** + * @brief qed_resc_setup - + * + * @param cdev + */ +void qed_resc_setup(struct qed_dev *cdev); + +enum qed_override_force_load { + QED_OVERRIDE_FORCE_LOAD_NONE, + QED_OVERRIDE_FORCE_LOAD_ALWAYS, + QED_OVERRIDE_FORCE_LOAD_NEVER, +}; + +struct qed_drv_load_params { + /* Indicates whether the driver is running over a crash kernel. + * As part of the load request, this will be used for providing the + * driver role to the MFW. + * In case of a crash kernel over PDA - this should be set to false. 
+ */ + bool is_crash_kernel; + + /* The timeout value that the MFW should use when locking the engine for + * the driver load process. + * A value of '0' means the default value, and '255' means no timeout. + */ + u8 mfw_timeout_val; +#define QED_LOAD_REQ_LOCK_TO_DEFAULT 0 +#define QED_LOAD_REQ_LOCK_TO_NONE 255 + + /* Avoid engine reset when first PF loads on it */ + bool avoid_eng_reset; + + /* Allow overriding the default force load behavior */ + enum qed_override_force_load override_force_load; +}; + +struct qed_hw_init_params { + /* Tunneling parameters */ + struct qed_tunnel_info *p_tunn; + + bool b_hw_start; + + /* Interrupt mode [msix, inta, etc.] to use */ + enum qed_int_mode int_mode; + + /* NPAR tx switching to be used for vports for tx-switching */ + bool allow_npar_tx_switch; + + /* Binary fw data pointer in binary fw file */ + const u8 *bin_fw_data; + + /* Driver load parameters */ + struct qed_drv_load_params *p_drv_load_params; +}; + +/** + * @brief qed_hw_init - + * + * @param cdev + * @param p_params + * + * @return int + */ +int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params); + +/** + * @brief qed_hw_timers_stop_all - stop the timers HW block + * + * @param cdev + * + * @return void + */ +void qed_hw_timers_stop_all(struct qed_dev *cdev); + +/** + * @brief qed_hw_stop - + * + * @param cdev + * + * @return int + */ +int qed_hw_stop(struct qed_dev *cdev); + +/** + * @brief qed_hw_stop_fastpath -should be called incase + * slowpath is still required for the device, + * but fastpath is not. + * + * @param cdev + * + * @return int + */ +int qed_hw_stop_fastpath(struct qed_dev *cdev); + +/** + * @brief qed_hw_start_fastpath -restart fastpath traffic, + * only if hw_stop_fastpath was called + * + * @param p_hwfn + * + * @return int + */ +int qed_hw_start_fastpath(struct qed_hwfn *p_hwfn); + + +/** + * @brief qed_hw_prepare - + * + * @param cdev + * @param personality - personality to initialize + * + * @return int + */ +int qed_hw_prepare(struct qed_dev *cdev, + int personality); + +/** + * @brief qed_hw_remove - + * + * @param cdev + */ +void qed_hw_remove(struct qed_dev *cdev); + +/** + * @brief qed_ptt_acquire - Allocate a PTT window + * + * Should be called at the entry point to the driver (at the beginning of an + * exported function) + * + * @param p_hwfn + * + * @return struct qed_ptt + */ +struct qed_ptt *qed_ptt_acquire(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_ptt_release - Release PTT Window + * + * Should be called at the end of a flow - at the end of the function that + * acquired the PTT. + * + * + * @param p_hwfn + * @param p_ptt + */ +void qed_ptt_release(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); +void qed_reset_vport_stats(struct qed_dev *cdev); + +enum qed_dmae_address_type_t { + QED_DMAE_ADDRESS_HOST_VIRT, + QED_DMAE_ADDRESS_HOST_PHYS, + QED_DMAE_ADDRESS_GRC +}; + +/* value of flags If QED_DMAE_FLAG_RW_REPL_SRC flag is set and the + * source is a block of length DMAE_MAX_RW_SIZE and the + * destination is larger, the source block will be duplicated as + * many times as required to fill the destination block. 
This is + * used mostly to write a zeroed buffer to destination address + * using DMA + */ +#define QED_DMAE_FLAG_RW_REPL_SRC 0x00000001 +#define QED_DMAE_FLAG_VF_SRC 0x00000002 +#define QED_DMAE_FLAG_VF_DST 0x00000004 +#define QED_DMAE_FLAG_COMPLETION_DST 0x00000008 + +struct qed_dmae_params { + u32 flags; /* consists of QED_DMAE_FLAG_* values */ + u8 src_vfid; + u8 dst_vfid; +}; + +/** + * @brief qed_dmae_host2grc - copy data from source addr to + * dmae registers using the given ptt + * + * @param p_hwfn + * @param p_ptt + * @param source_addr + * @param grc_addr (dmae_data_offset) + * @param size_in_dwords + * @param flags (one of the flags defined above) + */ +int +qed_dmae_host2grc(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u64 source_addr, + u32 grc_addr, + u32 size_in_dwords, + u32 flags); + + /** + * @brief qed_dmae_grc2host - Read data from dmae data offset + * to source address using the given ptt + * + * @param p_ptt + * @param grc_addr (dmae_data_offset) + * @param dest_addr + * @param size_in_dwords + * @param flags - one of the flags defined above + */ +int qed_dmae_grc2host(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u32 grc_addr, dma_addr_t dest_addr, u32 size_in_dwords, + u32 flags); + +/** + * @brief qed_dmae_host2host - copy data from to source address + * to a destination adress (for SRIOV) using the given ptt + * + * @param p_hwfn + * @param p_ptt + * @param source_addr + * @param dest_addr + * @param size_in_dwords + * @param params + */ +int qed_dmae_host2host(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + dma_addr_t source_addr, + dma_addr_t dest_addr, + u32 size_in_dwords, struct qed_dmae_params *p_params); + +/** + * @brief qed_chain_alloc - Allocate and initialize a chain + * + * @param p_hwfn + * @param intended_use + * @param mode + * @param num_elems + * @param elem_size + * @param p_chain + * @param ext_pbl - a possible external PBL + * + * @return int + */ +int +qed_chain_alloc(struct qed_dev *cdev, + enum qed_chain_use_mode intended_use, + enum qed_chain_mode mode, + enum qed_chain_cnt_type cnt_type, + u32 num_elems, + size_t elem_size, + struct qed_chain *p_chain, struct qed_chain_ext_pbl *ext_pbl); + +/** + * @brief qed_chain_free - Free chain DMA memory + * + * @param p_hwfn + * @param p_chain + */ +void qed_chain_free(struct qed_dev *cdev, struct qed_chain *p_chain); + +/** + * @@brief qed_fw_l2_queue - Get absolute L2 queue ID + * + * @param p_hwfn + * @param src_id - relative to p_hwfn + * @param dst_id - absolute per engine + * + * @return int + */ +int qed_fw_l2_queue(struct qed_hwfn *p_hwfn, + u16 src_id, + u16 *dst_id); + +/** + * @@brief qed_fw_vport - Get absolute vport ID + * + * @param p_hwfn + * @param src_id - relative to p_hwfn + * @param dst_id - absolute per engine + * + * @return int + */ +int qed_fw_vport(struct qed_hwfn *p_hwfn, + u8 src_id, + u8 *dst_id); + +/** + * @@brief qed_fw_rss_eng - Get absolute RSS engine ID + * + * @param p_hwfn + * @param src_id - relative to p_hwfn + * @param dst_id - absolute per engine + * + * @return int + */ +int qed_fw_rss_eng(struct qed_hwfn *p_hwfn, + u8 src_id, + u8 *dst_id); + +/** + * @brief qed_llh_add_mac_filter - configures a MAC filter in llh + * + * @param p_hwfn + * @param p_ptt + * @param p_filter - MAC to add + */ +int qed_llh_add_mac_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 *p_filter); + +/** + * @brief qed_llh_remove_mac_filter - removes a MAC filter from llh + * + * @param p_hwfn + * @param p_ptt + * @param p_filter - MAC to remove + */ +void 
qed_llh_remove_mac_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 *p_filter); + +enum qed_llh_port_filter_type_t { + QED_LLH_FILTER_ETHERTYPE, + QED_LLH_FILTER_TCP_SRC_PORT, + QED_LLH_FILTER_TCP_DEST_PORT, + QED_LLH_FILTER_TCP_SRC_AND_DEST_PORT, + QED_LLH_FILTER_UDP_SRC_PORT, + QED_LLH_FILTER_UDP_DEST_PORT, + QED_LLH_FILTER_UDP_SRC_AND_DEST_PORT +}; + +/** + * @brief qed_llh_add_protocol_filter - configures a protocol filter in llh + * + * @param p_hwfn + * @param p_ptt + * @param source_port_or_eth_type - source port or ethertype to add + * @param dest_port - destination port to add + * @param type - type of filters and comparing + */ +int +qed_llh_add_protocol_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 source_port_or_eth_type, + u16 dest_port, + enum qed_llh_port_filter_type_t type); + +/** + * @brief qed_llh_remove_protocol_filter - remove a protocol filter in llh + * + * @param p_hwfn + * @param p_ptt + * @param source_port_or_eth_type - source port or ethertype to add + * @param dest_port - destination port to add + * @param type - type of filters and comparing + */ +void +qed_llh_remove_protocol_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 source_port_or_eth_type, + u16 dest_port, + enum qed_llh_port_filter_type_t type); + +/** + * *@brief Cleanup of previous driver remains prior to load + * + * @param p_hwfn + * @param p_ptt + * @param id - For PF, engine-relative. For VF, PF-relative. + * @param is_vf - true iff cleanup is made for a VF. + * + * @return int + */ +int qed_final_cleanup(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 id, bool is_vf); + +/** + * @brief qed_get_queue_coalesce - Retrieve coalesce value for a given queue. + * + * @param p_hwfn + * @param p_coal - store coalesce value read from the hardware. + * @param p_handle + * + * @return int + **/ +int qed_get_queue_coalesce(struct qed_hwfn *p_hwfn, u16 *coal, void *handle); + +/** + * @brief qed_set_queue_coalesce - Configure coalesce parameters for Rx and + * Tx queue. The fact that we can configure coalescing to up to 511, but on + * varying accuracy [the bigger the value the less accurate] up to a mistake + * of 3usec for the highest values. + * While the API allows setting coalescing per-qid, all queues sharing a SB + * should be in same range [i.e., either 0-0x7f, 0x80-0xff or 0x100-0x1ff] + * otherwise configuration would break. + * + * + * @param rx_coal - Rx Coalesce value in micro seconds. + * @param tx_coal - TX Coalesce value in micro seconds. + * @param p_handle + * + * @return int + **/ +int +qed_set_queue_coalesce(u16 rx_coal, u16 tx_coal, void *p_handle); + + +const char *qed_hw_get_resc_name(enum qed_resources res_id); +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_fcoe.c b/drivers/net/ethernet/qlogic/qed/qed_fcoe.c new file mode 100644 index 0000000..2dc9b31 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_fcoe.c @@ -0,0 +1,1051 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define __PREVENT_DUMP_MEM_ARR__ +#define __PREVENT_PXP_GLOBAL_WIN__ +#include "qed.h" +#include "qed_cxt.h" +#include "qed_dev_api.h" +#include "qed_fcoe.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_int.h" +#include "qed_ll2.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" +#include "qed_sriov.h" +#include + +struct qed_fcoe_conn { + struct list_head list_entry; + bool free_on_delete; + + u16 conn_id; + u32 icid; + u32 fw_cid; + u8 layer_code; + + dma_addr_t sq_pbl_addr; + dma_addr_t sq_curr_page_addr; + dma_addr_t sq_next_page_addr; + dma_addr_t xferq_pbl_addr; + void *xferq_pbl_addr_virt_addr; + dma_addr_t xferq_addr[4]; + void *xferq_addr_virt_addr[4]; + dma_addr_t confq_pbl_addr; + void *confq_pbl_addr_virt_addr; + dma_addr_t confq_addr[2]; + void *confq_addr_virt_addr[2]; + + dma_addr_t terminate_params; + + u16 dst_mac_addr_lo; + u16 dst_mac_addr_mid; + u16 dst_mac_addr_hi; + u16 src_mac_addr_lo; + u16 src_mac_addr_mid; + u16 src_mac_addr_hi; + + u16 tx_max_fc_pay_len; + u16 e_d_tov_timer_val; + u16 rec_tov_timer_val; + u16 rx_max_fc_pay_len; + u16 vlan_tag; + u16 physical_q0; + + struct fc_addr_nw s_id; + u8 max_conc_seqs_c3; + struct fc_addr_nw d_id; + u8 flags; + u8 def_q_idx; +}; + +static int +qed_sp_fcoe_func_start(struct qed_hwfn *p_hwfn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr) +{ + struct qed_fcoe_pf_params *fcoe_pf_params = NULL; + struct fcoe_init_ramrod_params *p_ramrod = NULL; + struct fcoe_init_func_ramrod_data *p_data; + struct e4_fcoe_conn_context *p_cxt = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + struct qed_cxt_info cxt_info; + u32 dummy_cid; + int rc = 0; + u16 tmp; + u8 i; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + FCOE_RAMROD_CMD_ID_INIT_FUNC, + PROTOCOLID_FCOE, &init_data); + if (rc) + return rc; + + p_ramrod = 
&p_ent->ramrod.fcoe_init; + p_data = &p_ramrod->init_ramrod_data; + fcoe_pf_params = &p_hwfn->pf_params.fcoe_pf_params; + + /* Sanity */ + if (fcoe_pf_params->num_cqs > p_hwfn->hw_info.feat_num[QED_FCOE_CQ]) { + DP_ERR(p_hwfn, + "Cannot satisfy CQ amount. CQs requested %d, CQs available %d. Aborting function start\n", + fcoe_pf_params->num_cqs, + p_hwfn->hw_info.feat_num[QED_FCOE_CQ]); + return -EINVAL; + } + + p_data->mtu = cpu_to_le16(fcoe_pf_params->mtu); + tmp = cpu_to_le16(fcoe_pf_params->sq_num_pbl_pages); + p_data->sq_num_pages_in_pbl = tmp; + + rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_FCOE, &dummy_cid); + if (rc) + return rc; + + cxt_info.iid = dummy_cid; + rc = qed_cxt_get_cid_info(p_hwfn, &cxt_info); + if (rc) { + DP_NOTICE(p_hwfn, "Cannot find context info for dummy cid=%d\n", + dummy_cid); + return rc; + } + p_cxt = cxt_info.p_cxt; + SET_FIELD(p_cxt->tstorm_ag_context.flags3, + E4_TSTORM_FCOE_CONN_AG_CTX_DUMMY_TIMER_CF_EN, 1); + + fcoe_pf_params->dummy_icid = (u16)dummy_cid; + + tmp = cpu_to_le16(fcoe_pf_params->num_tasks); + p_data->func_params.num_tasks = tmp; + p_data->func_params.log_page_size = fcoe_pf_params->log_page_size; + p_data->func_params.debug_mode = fcoe_pf_params->debug_mode; + + DMA_REGPAIR_LE(p_data->q_params.glbl_q_params_addr, + fcoe_pf_params->glbl_q_params_addr); + + tmp = cpu_to_le16(fcoe_pf_params->cq_num_entries); + p_data->q_params.cq_num_entries = tmp; + + tmp = cpu_to_le16(fcoe_pf_params->cmdq_num_entries); + p_data->q_params.cmdq_num_entries = tmp; + + tmp = fcoe_pf_params->num_cqs; + p_data->q_params.num_queues = (u8)tmp; + + tmp = (u16)p_hwfn->hw_info.resc_start[QED_CMDQS_CQS]; + p_data->q_params.queue_relative_offset = (u8)tmp; + + for (i = 0; i < fcoe_pf_params->num_cqs; i++) { + u16 igu_sb_id; + + igu_sb_id = qed_get_igu_sb_id(p_hwfn, i); + tmp = cpu_to_le16(igu_sb_id); + p_data->q_params.cq_cmdq_sb_num_arr[i] = tmp; + } + + p_data->q_params.cq_sb_pi = fcoe_pf_params->gl_rq_pi; + p_data->q_params.cmdq_sb_pi = fcoe_pf_params->gl_cmd_pi; + + p_data->q_params.bdq_resource_id = (u8)RESC_START(p_hwfn, QED_BDQ); + + DMA_REGPAIR_LE(p_data->q_params.bdq_pbl_base_address[BDQ_ID_RQ], + fcoe_pf_params->bdq_pbl_base_addr[BDQ_ID_RQ]); + p_data->q_params.bdq_pbl_num_entries[BDQ_ID_RQ] = + fcoe_pf_params->bdq_pbl_num_entries[BDQ_ID_RQ]; + tmp = fcoe_pf_params->bdq_xoff_threshold[BDQ_ID_RQ]; + p_data->q_params.bdq_xoff_threshold[BDQ_ID_RQ] = cpu_to_le16(tmp); + tmp = fcoe_pf_params->bdq_xon_threshold[BDQ_ID_RQ]; + p_data->q_params.bdq_xon_threshold[BDQ_ID_RQ] = cpu_to_le16(tmp); + + DMA_REGPAIR_LE(p_data->q_params.bdq_pbl_base_address[BDQ_ID_IMM_DATA], + fcoe_pf_params->bdq_pbl_base_addr[BDQ_ID_IMM_DATA]); + p_data->q_params.bdq_pbl_num_entries[BDQ_ID_IMM_DATA] = + fcoe_pf_params->bdq_pbl_num_entries[BDQ_ID_IMM_DATA]; + tmp = fcoe_pf_params->bdq_xoff_threshold[BDQ_ID_IMM_DATA]; + p_data->q_params.bdq_xoff_threshold[BDQ_ID_IMM_DATA] = cpu_to_le16(tmp); + tmp = fcoe_pf_params->bdq_xon_threshold[BDQ_ID_IMM_DATA]; + p_data->q_params.bdq_xon_threshold[BDQ_ID_IMM_DATA] = cpu_to_le16(tmp); + tmp = fcoe_pf_params->rq_buffer_size; + p_data->q_params.rq_buffer_size = cpu_to_le16(tmp); + + if (fcoe_pf_params->is_target) { + SET_FIELD(p_data->q_params.q_validity, + SCSI_INIT_FUNC_QUEUES_RQ_VALID, 1); + if (p_data->q_params.bdq_pbl_num_entries[BDQ_ID_IMM_DATA]) + SET_FIELD(p_data->q_params.q_validity, + SCSI_INIT_FUNC_QUEUES_IMM_DATA_VALID, 1); + SET_FIELD(p_data->q_params.q_validity, + SCSI_INIT_FUNC_QUEUES_CMD_VALID, 1); + } else { + 
SET_FIELD(p_data->q_params.q_validity, + SCSI_INIT_FUNC_QUEUES_RQ_VALID, 1); + } + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + return rc; +} + +static int +qed_sp_fcoe_conn_offload(struct qed_hwfn *p_hwfn, + struct qed_fcoe_conn *p_conn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr) +{ + struct fcoe_conn_offload_ramrod_params *p_ramrod = NULL; + struct fcoe_conn_offload_ramrod_data *p_data; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + u16 physical_q0, tmp; + int rc; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_conn->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + FCOE_RAMROD_CMD_ID_OFFLOAD_CONN, + PROTOCOLID_FCOE, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.fcoe_conn_ofld; + p_data = &p_ramrod->offload_ramrod_data; + + /* Transmission PQ is the first of the PF */ + physical_q0 = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_OFLD); + p_conn->physical_q0 = cpu_to_le16(physical_q0); + p_data->physical_q0 = cpu_to_le16(physical_q0); + + p_data->conn_id = cpu_to_le16(p_conn->conn_id); + DMA_REGPAIR_LE(p_data->sq_pbl_addr, p_conn->sq_pbl_addr); + DMA_REGPAIR_LE(p_data->sq_curr_page_addr, p_conn->sq_curr_page_addr); + DMA_REGPAIR_LE(p_data->sq_next_page_addr, p_conn->sq_next_page_addr); + DMA_REGPAIR_LE(p_data->xferq_pbl_addr, p_conn->xferq_pbl_addr); + DMA_REGPAIR_LE(p_data->xferq_curr_page_addr, p_conn->xferq_addr[0]); + DMA_REGPAIR_LE(p_data->xferq_next_page_addr, p_conn->xferq_addr[1]); + + DMA_REGPAIR_LE(p_data->respq_pbl_addr, p_conn->confq_pbl_addr); + DMA_REGPAIR_LE(p_data->respq_curr_page_addr, p_conn->confq_addr[0]); + DMA_REGPAIR_LE(p_data->respq_next_page_addr, p_conn->confq_addr[1]); + + p_data->dst_mac_addr_lo = cpu_to_le16(p_conn->dst_mac_addr_lo); + p_data->dst_mac_addr_mid = cpu_to_le16(p_conn->dst_mac_addr_mid); + p_data->dst_mac_addr_hi = cpu_to_le16(p_conn->dst_mac_addr_hi); + p_data->src_mac_addr_lo = cpu_to_le16(p_conn->src_mac_addr_lo); + p_data->src_mac_addr_mid = cpu_to_le16(p_conn->src_mac_addr_mid); + p_data->src_mac_addr_hi = cpu_to_le16(p_conn->src_mac_addr_hi); + + tmp = cpu_to_le16(p_conn->tx_max_fc_pay_len); + p_data->tx_max_fc_pay_len = tmp; + tmp = cpu_to_le16(p_conn->e_d_tov_timer_val); + p_data->e_d_tov_timer_val = tmp; + tmp = cpu_to_le16(p_conn->rec_tov_timer_val); + p_data->rec_rr_tov_timer_val = tmp; + tmp = cpu_to_le16(p_conn->rx_max_fc_pay_len); + p_data->rx_max_fc_pay_len = tmp; + + p_data->vlan_tag = cpu_to_le16(p_conn->vlan_tag); + p_data->s_id.addr_hi = p_conn->s_id.addr_hi; + p_data->s_id.addr_mid = p_conn->s_id.addr_mid; + p_data->s_id.addr_lo = p_conn->s_id.addr_lo; + p_data->max_conc_seqs_c3 = p_conn->max_conc_seqs_c3; + p_data->d_id.addr_hi = p_conn->d_id.addr_hi; + p_data->d_id.addr_mid = p_conn->d_id.addr_mid; + p_data->d_id.addr_lo = p_conn->d_id.addr_lo; + p_data->flags = p_conn->flags; + p_data->def_q_idx = p_conn->def_q_idx; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int +qed_sp_fcoe_conn_destroy(struct qed_hwfn *p_hwfn, + struct qed_fcoe_conn *p_conn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr) +{ + struct fcoe_conn_terminate_ramrod_params *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_conn->icid; + init_data.opaque_fid = 
p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + FCOE_RAMROD_CMD_ID_TERMINATE_CONN, + PROTOCOLID_FCOE, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.fcoe_conn_terminate; + DMA_REGPAIR_LE(p_ramrod->terminate_ramrod_data.terminate_params_addr, + p_conn->terminate_params); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int +qed_sp_fcoe_func_stop(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr) +{ + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + u32 active_segs = 0; + int rc = 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_hwfn->pf_params.fcoe_pf_params.dummy_icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + FCOE_RAMROD_CMD_ID_DESTROY_FUNC, + PROTOCOLID_FCOE, &init_data); + if (rc) + return rc; + + active_segs = qed_rd(p_hwfn, p_ptt, TM_REG_PF_ENABLE_TASK); + active_segs &= ~BIT(QED_CXT_FCOE_TID_SEG); + qed_wr(p_hwfn, p_ptt, TM_REG_PF_ENABLE_TASK, active_segs); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int +qed_fcoe_allocate_connection(struct qed_hwfn *p_hwfn, + struct qed_fcoe_conn **p_out_conn) +{ + struct qed_fcoe_conn *p_conn = NULL; + void *p_addr; + u32 i; + + spin_lock_bh(&p_hwfn->p_fcoe_info->lock); + if (!list_empty(&p_hwfn->p_fcoe_info->free_list)) + p_conn = + list_first_entry(&p_hwfn->p_fcoe_info->free_list, + struct qed_fcoe_conn, list_entry); + if (p_conn) { + list_del(&p_conn->list_entry); + spin_unlock_bh(&p_hwfn->p_fcoe_info->lock); + *p_out_conn = p_conn; + return 0; + } + spin_unlock_bh(&p_hwfn->p_fcoe_info->lock); + + p_conn = kzalloc(sizeof(*p_conn), GFP_KERNEL); + if (!p_conn) + return -ENOMEM; + + p_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + &p_conn->xferq_pbl_addr, GFP_KERNEL); + if (!p_addr) + goto nomem_pbl_xferq; + p_conn->xferq_pbl_addr_virt_addr = p_addr; + + for (i = 0; i < ARRAY_SIZE(p_conn->xferq_addr); i++) { + p_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + &p_conn->xferq_addr[i], GFP_KERNEL); + if (!p_addr) + goto nomem_xferq; + p_conn->xferq_addr_virt_addr[i] = p_addr; + + p_addr = p_conn->xferq_pbl_addr_virt_addr; + ((dma_addr_t *)p_addr)[i] = p_conn->xferq_addr[i]; + } + + p_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + &p_conn->confq_pbl_addr, GFP_KERNEL); + if (!p_addr) + goto nomem_xferq; + p_conn->confq_pbl_addr_virt_addr = p_addr; + + for (i = 0; i < ARRAY_SIZE(p_conn->confq_addr); i++) { + p_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + &p_conn->confq_addr[i], GFP_KERNEL); + if (!p_addr) + goto nomem_confq; + p_conn->confq_addr_virt_addr[i] = p_addr; + + p_addr = p_conn->confq_pbl_addr_virt_addr; + ((dma_addr_t *)p_addr)[i] = p_conn->confq_addr[i]; + } + + p_conn->free_on_delete = true; + *p_out_conn = p_conn; + return 0; + +nomem_confq: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + p_conn->confq_pbl_addr_virt_addr, + p_conn->confq_pbl_addr); + for (i = 0; i < ARRAY_SIZE(p_conn->confq_addr); i++) + if (p_conn->confq_addr_virt_addr[i]) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + p_conn->confq_addr_virt_addr[i], + p_conn->confq_addr[i]); +nomem_xferq: + 
dma_free_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + p_conn->xferq_pbl_addr_virt_addr, + p_conn->xferq_pbl_addr); + for (i = 0; i < ARRAY_SIZE(p_conn->xferq_addr); i++) + if (p_conn->xferq_addr_virt_addr[i]) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + p_conn->xferq_addr_virt_addr[i], + p_conn->xferq_addr[i]); +nomem_pbl_xferq: + kfree(p_conn); + return -ENOMEM; +} + +static void qed_fcoe_free_connection(struct qed_hwfn *p_hwfn, + struct qed_fcoe_conn *p_conn) +{ + u32 i; + + if (!p_conn) + return; + + if (p_conn->confq_pbl_addr_virt_addr) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + p_conn->confq_pbl_addr_virt_addr, + p_conn->confq_pbl_addr); + + for (i = 0; i < ARRAY_SIZE(p_conn->confq_addr); i++) { + if (!p_conn->confq_addr_virt_addr[i]) + continue; + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + p_conn->confq_addr_virt_addr[i], + p_conn->confq_addr[i]); + } + + if (p_conn->xferq_pbl_addr_virt_addr) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + p_conn->xferq_pbl_addr_virt_addr, + p_conn->xferq_pbl_addr); + + for (i = 0; i < ARRAY_SIZE(p_conn->xferq_addr); i++) { + if (!p_conn->xferq_addr_virt_addr[i]) + continue; + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + QED_CHAIN_PAGE_SIZE, + p_conn->xferq_addr_virt_addr[i], + p_conn->xferq_addr[i]); + } + kfree(p_conn); +} + +static void __iomem *qed_fcoe_get_db_addr(struct qed_hwfn *p_hwfn, u32 cid) +{ + return (u8 __iomem *)p_hwfn->doorbells + + qed_db_addr(cid, DQ_DEMS_LEGACY); +} + +static void __iomem *qed_fcoe_get_primary_bdq_prod(struct qed_hwfn *p_hwfn, + u8 bdq_id) +{ + if (RESC_NUM(p_hwfn, QED_BDQ)) { + return (u8 __iomem *)p_hwfn->regview + + GTT_BAR0_MAP_REG_MSDM_RAM + + MSTORM_SCSI_BDQ_EXT_PROD_OFFSET(RESC_START(p_hwfn, + QED_BDQ), + bdq_id); + } else { + DP_NOTICE(p_hwfn, "BDQ is not allocated!\n"); + return NULL; + } +} + +static void __iomem *qed_fcoe_get_secondary_bdq_prod(struct qed_hwfn *p_hwfn, + u8 bdq_id) +{ + if (RESC_NUM(p_hwfn, QED_BDQ)) { + return (u8 __iomem *)p_hwfn->regview + + GTT_BAR0_MAP_REG_TSDM_RAM + + TSTORM_SCSI_BDQ_EXT_PROD_OFFSET(RESC_START(p_hwfn, + QED_BDQ), + bdq_id); + } else { + DP_NOTICE(p_hwfn, "BDQ is not allocated!\n"); + return NULL; + } +} + +int qed_fcoe_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_fcoe_info *p_fcoe_info; + + /* Allocate LL2's set struct */ + p_fcoe_info = kzalloc(sizeof(*p_fcoe_info), GFP_KERNEL); + if (!p_fcoe_info) { + DP_NOTICE(p_hwfn, "Failed to allocate qed_fcoe_info'\n"); + return -ENOMEM; + } + INIT_LIST_HEAD(&p_fcoe_info->free_list); + + p_hwfn->p_fcoe_info = p_fcoe_info; + return 0; +} + +void qed_fcoe_setup(struct qed_hwfn *p_hwfn) +{ + struct e4_fcoe_task_context *p_task_ctx = NULL; + int rc; + u32 i; + + spin_lock_init(&p_hwfn->p_fcoe_info->lock); + for (i = 0; i < p_hwfn->pf_params.fcoe_pf_params.num_tasks; i++) { + rc = qed_cxt_get_task_ctx(p_hwfn, i, + QED_CTX_WORKING_MEM, + (void **)&p_task_ctx); + if (rc) + continue; + + memset(p_task_ctx, 0, sizeof(struct e4_fcoe_task_context)); + SET_FIELD(p_task_ctx->timer_context.logical_client_0, + TIMERS_CONTEXT_VALIDLC0, 1); + SET_FIELD(p_task_ctx->timer_context.logical_client_1, + TIMERS_CONTEXT_VALIDLC1, 1); + SET_FIELD(p_task_ctx->tstorm_ag_context.flags0, + E4_TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE, 1); + } +} + +void qed_fcoe_free(struct qed_hwfn *p_hwfn) +{ + struct qed_fcoe_conn *p_conn = NULL; + + if (!p_hwfn->p_fcoe_info) + return; + + while (!list_empty(&p_hwfn->p_fcoe_info->free_list)) { + 
p_conn = list_first_entry(&p_hwfn->p_fcoe_info->free_list, + struct qed_fcoe_conn, list_entry); + if (!p_conn) + break; + list_del(&p_conn->list_entry); + qed_fcoe_free_connection(p_hwfn, p_conn); + } + + kfree(p_hwfn->p_fcoe_info); + p_hwfn->p_fcoe_info = NULL; +} + +static int +qed_fcoe_acquire_connection(struct qed_hwfn *p_hwfn, + struct qed_fcoe_conn *p_in_conn, + struct qed_fcoe_conn **p_out_conn) +{ + struct qed_fcoe_conn *p_conn = NULL; + int rc = 0; + u32 icid; + + spin_lock_bh(&p_hwfn->p_fcoe_info->lock); + rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_FCOE, &icid); + spin_unlock_bh(&p_hwfn->p_fcoe_info->lock); + if (rc) + return rc; + + /* Use input connection [if provided] or allocate a new one */ + if (p_in_conn) { + p_conn = p_in_conn; + } else { + rc = qed_fcoe_allocate_connection(p_hwfn, &p_conn); + if (rc) { + spin_lock_bh(&p_hwfn->p_fcoe_info->lock); + qed_cxt_release_cid(p_hwfn, icid); + spin_unlock_bh(&p_hwfn->p_fcoe_info->lock); + return rc; + } + } + + p_conn->icid = icid; + p_conn->fw_cid = (p_hwfn->hw_info.opaque_fid << 16) | icid; + *p_out_conn = p_conn; + + return rc; +} + +static void qed_fcoe_release_connection(struct qed_hwfn *p_hwfn, + struct qed_fcoe_conn *p_conn) +{ + spin_lock_bh(&p_hwfn->p_fcoe_info->lock); + list_add_tail(&p_conn->list_entry, &p_hwfn->p_fcoe_info->free_list); + qed_cxt_release_cid(p_hwfn, p_conn->icid); + spin_unlock_bh(&p_hwfn->p_fcoe_info->lock); +} + +static void _qed_fcoe_get_tstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_fcoe_stats *p_stats) +{ + struct fcoe_rx_stat tstats; + u32 tstats_addr; + + memset(&tstats, 0, sizeof(tstats)); + tstats_addr = BAR0_MAP_REG_TSDM_RAM + + TSTORM_FCOE_RX_STATS_OFFSET(p_hwfn->rel_pf_id); + qed_memcpy_from(p_hwfn, p_ptt, &tstats, tstats_addr, sizeof(tstats)); + + p_stats->fcoe_rx_byte_cnt = HILO_64_REGPAIR(tstats.fcoe_rx_byte_cnt); + p_stats->fcoe_rx_data_pkt_cnt = + HILO_64_REGPAIR(tstats.fcoe_rx_data_pkt_cnt); + p_stats->fcoe_rx_xfer_pkt_cnt = + HILO_64_REGPAIR(tstats.fcoe_rx_xfer_pkt_cnt); + p_stats->fcoe_rx_other_pkt_cnt = + HILO_64_REGPAIR(tstats.fcoe_rx_other_pkt_cnt); + + p_stats->fcoe_silent_drop_pkt_cmdq_full_cnt = + le32_to_cpu(tstats.fcoe_silent_drop_pkt_cmdq_full_cnt); + p_stats->fcoe_silent_drop_pkt_rq_full_cnt = + le32_to_cpu(tstats.fcoe_silent_drop_pkt_rq_full_cnt); + p_stats->fcoe_silent_drop_pkt_crc_error_cnt = + le32_to_cpu(tstats.fcoe_silent_drop_pkt_crc_error_cnt); + p_stats->fcoe_silent_drop_pkt_task_invalid_cnt = + le32_to_cpu(tstats.fcoe_silent_drop_pkt_task_invalid_cnt); + p_stats->fcoe_silent_drop_total_pkt_cnt = + le32_to_cpu(tstats.fcoe_silent_drop_total_pkt_cnt); +} + +static void _qed_fcoe_get_pstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_fcoe_stats *p_stats) +{ + struct fcoe_tx_stat pstats; + u32 pstats_addr; + + memset(&pstats, 0, sizeof(pstats)); + pstats_addr = BAR0_MAP_REG_PSDM_RAM + + PSTORM_FCOE_TX_STATS_OFFSET(p_hwfn->rel_pf_id); + qed_memcpy_from(p_hwfn, p_ptt, &pstats, pstats_addr, sizeof(pstats)); + + p_stats->fcoe_tx_byte_cnt = HILO_64_REGPAIR(pstats.fcoe_tx_byte_cnt); + p_stats->fcoe_tx_data_pkt_cnt = + HILO_64_REGPAIR(pstats.fcoe_tx_data_pkt_cnt); + p_stats->fcoe_tx_xfer_pkt_cnt = + HILO_64_REGPAIR(pstats.fcoe_tx_xfer_pkt_cnt); + p_stats->fcoe_tx_other_pkt_cnt = + HILO_64_REGPAIR(pstats.fcoe_tx_other_pkt_cnt); +} + +static int qed_fcoe_get_stats(struct qed_hwfn *p_hwfn, + struct qed_fcoe_stats *p_stats) +{ + struct qed_ptt *p_ptt; + + memset(p_stats, 0, sizeof(*p_stats)); + + p_ptt = qed_ptt_acquire(p_hwfn); + + if 
(!p_ptt) { + DP_ERR(p_hwfn, "Failed to acquire ptt\n"); + return -EINVAL; + } + + _qed_fcoe_get_tstats(p_hwfn, p_ptt, p_stats); + _qed_fcoe_get_pstats(p_hwfn, p_ptt, p_stats); + + qed_ptt_release(p_hwfn, p_ptt); + + return 0; +} + +struct qed_hash_fcoe_con { + struct hlist_node node; + struct qed_fcoe_conn *con; +}; + +static int qed_fill_fcoe_dev_info(struct qed_dev *cdev, + struct qed_dev_fcoe_info *info) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + int rc; + + memset(info, 0, sizeof(*info)); + rc = qed_fill_dev_info(cdev, &info->common); + + info->primary_dbq_rq_addr = + qed_fcoe_get_primary_bdq_prod(hwfn, BDQ_ID_RQ); + info->secondary_bdq_rq_addr = + qed_fcoe_get_secondary_bdq_prod(hwfn, BDQ_ID_RQ); + + info->wwpn = hwfn->mcp_info->func_info.wwn_port; + info->wwnn = hwfn->mcp_info->func_info.wwn_node; + + info->num_cqs = FEAT_NUM(hwfn, QED_FCOE_CQ); + + return rc; +} + +static void qed_register_fcoe_ops(struct qed_dev *cdev, + struct qed_fcoe_cb_ops *ops, void *cookie) +{ + cdev->protocol_ops.fcoe = ops; + cdev->ops_cookie = cookie; +} + +static struct qed_hash_fcoe_con *qed_fcoe_get_hash(struct qed_dev *cdev, + u32 handle) +{ + struct qed_hash_fcoe_con *hash_con = NULL; + + if (!(cdev->flags & QED_FLAG_STORAGE_STARTED)) + return NULL; + + hash_for_each_possible(cdev->connections, hash_con, node, handle) { + if (hash_con->con->icid == handle) + break; + } + + if (!hash_con || (hash_con->con->icid != handle)) + return NULL; + + return hash_con; +} + +static int qed_fcoe_stop(struct qed_dev *cdev) +{ + struct qed_ptt *p_ptt; + int rc; + + if (!(cdev->flags & QED_FLAG_STORAGE_STARTED)) { + DP_NOTICE(cdev, "fcoe already stopped\n"); + return 0; + } + + if (!hash_empty(cdev->connections)) { + DP_NOTICE(cdev, + "Can't stop fcoe - not all connections were returned\n"); + return -EINVAL; + } + + p_ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev)); + if (!p_ptt) + return -EAGAIN; + + /* Stop the fcoe */ + rc = qed_sp_fcoe_func_stop(QED_LEADING_HWFN(cdev), p_ptt, + QED_SPQ_MODE_EBLOCK, NULL); + cdev->flags &= ~QED_FLAG_STORAGE_STARTED; + qed_ptt_release(QED_LEADING_HWFN(cdev), p_ptt); + + return rc; +} + +static int qed_fcoe_start(struct qed_dev *cdev, struct qed_fcoe_tid *tasks) +{ + int rc; + + if (cdev->flags & QED_FLAG_STORAGE_STARTED) { + DP_NOTICE(cdev, "fcoe already started;\n"); + return 0; + } + + rc = qed_sp_fcoe_func_start(QED_LEADING_HWFN(cdev), + QED_SPQ_MODE_EBLOCK, NULL); + if (rc) { + DP_NOTICE(cdev, "Failed to start fcoe\n"); + return rc; + } + + cdev->flags |= QED_FLAG_STORAGE_STARTED; + hash_init(cdev->connections); + + if (tasks) { + struct qed_tid_mem *tid_info = kzalloc(sizeof(*tid_info), + GFP_ATOMIC); + + if (!tid_info) { + DP_NOTICE(cdev, + "Failed to allocate tasks information\n"); + qed_fcoe_stop(cdev); + return -ENOMEM; + } + + rc = qed_cxt_get_tid_mem_info(QED_LEADING_HWFN(cdev), tid_info); + if (rc) { + DP_NOTICE(cdev, "Failed to gather task information\n"); + qed_fcoe_stop(cdev); + kfree(tid_info); + return rc; + } + + /* Fill task information */ + tasks->size = tid_info->tid_size; + tasks->num_tids_per_block = tid_info->num_tids_per_block; + memcpy(tasks->blocks, tid_info->blocks, + MAX_TID_BLOCKS_FCOE * sizeof(u8 *)); + + kfree(tid_info); + } + + return 0; +} + +static int qed_fcoe_acquire_conn(struct qed_dev *cdev, + u32 *handle, + u32 *fw_cid, void __iomem **p_doorbell) +{ + struct qed_hash_fcoe_con *hash_con; + int rc; + + /* Allocate a hashed connection */ + hash_con = kzalloc(sizeof(*hash_con), GFP_KERNEL); + if (!hash_con) { + DP_NOTICE(cdev, "Failed 
to allocate hashed connection\n"); + return -ENOMEM; + } + + /* Acquire the connection */ + rc = qed_fcoe_acquire_connection(QED_LEADING_HWFN(cdev), NULL, + &hash_con->con); + if (rc) { + DP_NOTICE(cdev, "Failed to acquire Connection\n"); + kfree(hash_con); + return rc; + } + + /* Added the connection to hash table */ + *handle = hash_con->con->icid; + *fw_cid = hash_con->con->fw_cid; + hash_add(cdev->connections, &hash_con->node, *handle); + + if (p_doorbell) + *p_doorbell = qed_fcoe_get_db_addr(QED_LEADING_HWFN(cdev), + *handle); + + return 0; +} + +static int qed_fcoe_release_conn(struct qed_dev *cdev, u32 handle) +{ + struct qed_hash_fcoe_con *hash_con; + + hash_con = qed_fcoe_get_hash(cdev, handle); + if (!hash_con) { + DP_NOTICE(cdev, "Failed to find connection for handle %d\n", + handle); + return -EINVAL; + } + + hlist_del(&hash_con->node); + qed_fcoe_release_connection(QED_LEADING_HWFN(cdev), hash_con->con); + kfree(hash_con); + + return 0; +} + +static int qed_fcoe_offload_conn(struct qed_dev *cdev, + u32 handle, + struct qed_fcoe_params_offload *conn_info) +{ + struct qed_hash_fcoe_con *hash_con; + struct qed_fcoe_conn *con; + + hash_con = qed_fcoe_get_hash(cdev, handle); + if (!hash_con) { + DP_NOTICE(cdev, "Failed to find connection for handle %d\n", + handle); + return -EINVAL; + } + + /* Update the connection with information from the params */ + con = hash_con->con; + + con->sq_pbl_addr = conn_info->sq_pbl_addr; + con->sq_curr_page_addr = conn_info->sq_curr_page_addr; + con->sq_next_page_addr = conn_info->sq_next_page_addr; + con->tx_max_fc_pay_len = conn_info->tx_max_fc_pay_len; + con->e_d_tov_timer_val = conn_info->e_d_tov_timer_val; + con->rec_tov_timer_val = conn_info->rec_tov_timer_val; + con->rx_max_fc_pay_len = conn_info->rx_max_fc_pay_len; + con->vlan_tag = conn_info->vlan_tag; + con->max_conc_seqs_c3 = conn_info->max_conc_seqs_c3; + con->flags = conn_info->flags; + con->def_q_idx = conn_info->def_q_idx; + + con->src_mac_addr_hi = (conn_info->src_mac[5] << 8) | + conn_info->src_mac[4]; + con->src_mac_addr_mid = (conn_info->src_mac[3] << 8) | + conn_info->src_mac[2]; + con->src_mac_addr_lo = (conn_info->src_mac[1] << 8) | + conn_info->src_mac[0]; + con->dst_mac_addr_hi = (conn_info->dst_mac[5] << 8) | + conn_info->dst_mac[4]; + con->dst_mac_addr_mid = (conn_info->dst_mac[3] << 8) | + conn_info->dst_mac[2]; + con->dst_mac_addr_lo = (conn_info->dst_mac[1] << 8) | + conn_info->dst_mac[0]; + + con->s_id.addr_hi = conn_info->s_id.addr_hi; + con->s_id.addr_mid = conn_info->s_id.addr_mid; + con->s_id.addr_lo = conn_info->s_id.addr_lo; + con->d_id.addr_hi = conn_info->d_id.addr_hi; + con->d_id.addr_mid = conn_info->d_id.addr_mid; + con->d_id.addr_lo = conn_info->d_id.addr_lo; + + return qed_sp_fcoe_conn_offload(QED_LEADING_HWFN(cdev), con, + QED_SPQ_MODE_EBLOCK, NULL); +} + +static int qed_fcoe_destroy_conn(struct qed_dev *cdev, + u32 handle, dma_addr_t terminate_params) +{ + struct qed_hash_fcoe_con *hash_con; + struct qed_fcoe_conn *con; + + hash_con = qed_fcoe_get_hash(cdev, handle); + if (!hash_con) { + DP_NOTICE(cdev, "Failed to find connection for handle %d\n", + handle); + return -EINVAL; + } + + /* Update the connection with information from the params */ + con = hash_con->con; + con->terminate_params = terminate_params; + + return qed_sp_fcoe_conn_destroy(QED_LEADING_HWFN(cdev), con, + QED_SPQ_MODE_EBLOCK, NULL); +} + +static int qed_fcoe_stats(struct qed_dev *cdev, struct qed_fcoe_stats *stats) +{ + return qed_fcoe_get_stats(QED_LEADING_HWFN(cdev), stats); +} + 
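+/* Illustrative usage sketch: a storage driver bound to this device could
+ * read the FCoE counters through the qed_fcoe_ops table exported at the
+ * bottom of this file. The example_dump_fcoe_stats() wrapper and its
+ * pr_info() formatting are hypothetical; only qed_get_fcoe_ops(), the
+ * ->get_stats() callback and struct qed_fcoe_stats are taken from the
+ * surrounding code.
+ */
+#if 0	/* usage example only; not built */
+static void example_dump_fcoe_stats(struct qed_dev *cdev)
+{
+	const struct qed_fcoe_ops *ops = qed_get_fcoe_ops();
+	struct qed_fcoe_stats stats;
+
+	if (ops && !ops->get_stats(cdev, &stats))
+		pr_info("FCoE rx/tx data pkts: %llu/%llu\n",
+			(unsigned long long)stats.fcoe_rx_data_pkt_cnt,
+			(unsigned long long)stats.fcoe_tx_data_pkt_cnt);
+}
+#endif
+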
+void qed_get_protocol_stats_fcoe(struct qed_dev *cdev, + struct qed_mcp_fcoe_stats *stats) +{ + struct qed_fcoe_stats proto_stats; + + /* Retrieve FW statistics */ + memset(&proto_stats, 0, sizeof(proto_stats)); + if (qed_fcoe_stats(cdev, &proto_stats)) { + DP_VERBOSE(cdev, QED_MSG_STORAGE, + "Failed to collect FCoE statistics\n"); + return; + } + + /* Translate FW statistics into struct */ + stats->rx_pkts = proto_stats.fcoe_rx_data_pkt_cnt + + proto_stats.fcoe_rx_xfer_pkt_cnt + + proto_stats.fcoe_rx_other_pkt_cnt; + stats->tx_pkts = proto_stats.fcoe_tx_data_pkt_cnt + + proto_stats.fcoe_tx_xfer_pkt_cnt + + proto_stats.fcoe_tx_other_pkt_cnt; + stats->fcs_err = proto_stats.fcoe_silent_drop_pkt_crc_error_cnt; + + /* Request protocol driver to fill-in the rest */ + if (cdev->protocol_ops.fcoe && cdev->ops_cookie) { + struct qed_fcoe_cb_ops *ops = cdev->protocol_ops.fcoe; + void *cookie = cdev->ops_cookie; + + if (ops->get_login_failures) + stats->login_failure = ops->get_login_failures(cookie); + } +} + +static const struct qed_fcoe_ops qed_fcoe_ops_pass = { + .common = &qed_common_ops_pass, + .ll2 = &qed_ll2_ops_pass, + .fill_dev_info = &qed_fill_fcoe_dev_info, + .start = &qed_fcoe_start, + .stop = &qed_fcoe_stop, + .register_ops = &qed_register_fcoe_ops, + .acquire_conn = &qed_fcoe_acquire_conn, + .release_conn = &qed_fcoe_release_conn, + .offload_conn = &qed_fcoe_offload_conn, + .destroy_conn = &qed_fcoe_destroy_conn, + .get_stats = &qed_fcoe_stats, +}; + +const struct qed_fcoe_ops *qed_get_fcoe_ops(void) +{ + return &qed_fcoe_ops_pass; +} +EXPORT_SYMBOL(qed_get_fcoe_ops); + +void qed_put_fcoe_ops(void) +{ +} +EXPORT_SYMBOL(qed_put_fcoe_ops); diff --git a/drivers/net/ethernet/qlogic/qed/qed_fcoe.h b/drivers/net/ethernet/qlogic/qed/qed_fcoe.h new file mode 100644 index 0000000..027a76a --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_fcoe.h @@ -0,0 +1,79 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _QED_FCOE_H +#define _QED_FCOE_H +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_mcp.h" +#include "qed_sp.h" + +struct qed_fcoe_info { + spinlock_t lock; /* Connection resources. */ + struct list_head free_list; +}; + +#if IS_ENABLED(CONFIG_QED_FCOE) +int qed_fcoe_alloc(struct qed_hwfn *p_hwfn); + +void qed_fcoe_setup(struct qed_hwfn *p_hwfn); + +void qed_fcoe_free(struct qed_hwfn *p_hwfn); +void qed_get_protocol_stats_fcoe(struct qed_dev *cdev, + struct qed_mcp_fcoe_stats *stats); +#else /* CONFIG_QED_FCOE */ +static inline int qed_fcoe_alloc(struct qed_hwfn *p_hwfn) +{ + return -EINVAL; +} + +static inline void qed_fcoe_setup(struct qed_hwfn *p_hwfn) {} +static inline void qed_fcoe_free(struct qed_hwfn *p_hwfn) {} + +static inline void qed_get_protocol_stats_fcoe(struct qed_dev *cdev, + struct qed_mcp_fcoe_stats *stats) +{ +} +#endif /* CONFIG_QED_FCOE */ + +#ifdef CONFIG_QED_LL2 +extern const struct qed_common_ops qed_common_ops_pass; +extern const struct qed_ll2_ops qed_ll2_ops_pass; +#endif + +#endif /* _QED_FCOE_H */ diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h new file mode 100644 index 0000000..7f5ec42 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -0,0 +1,12814 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _QED_HSI_H +#define _QED_HSI_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct qed_hwfn; +struct qed_ptt; + +/* Opcodes for the event ring */ +enum common_event_opcode { + COMMON_EVENT_PF_START, + COMMON_EVENT_PF_STOP, + COMMON_EVENT_VF_START, + COMMON_EVENT_VF_STOP, + COMMON_EVENT_VF_PF_CHANNEL, + COMMON_EVENT_VF_FLR, + COMMON_EVENT_PF_UPDATE, + COMMON_EVENT_MALICIOUS_VF, + COMMON_EVENT_RL_UPDATE, + COMMON_EVENT_EMPTY, + MAX_COMMON_EVENT_OPCODE +}; + +/* Common Ramrod Command IDs */ +enum common_ramrod_cmd_id { + COMMON_RAMROD_UNUSED, + COMMON_RAMROD_PF_START, + COMMON_RAMROD_PF_STOP, + COMMON_RAMROD_VF_START, + COMMON_RAMROD_VF_STOP, + COMMON_RAMROD_PF_UPDATE, + COMMON_RAMROD_RL_UPDATE, + COMMON_RAMROD_EMPTY, + MAX_COMMON_RAMROD_CMD_ID +}; + +/* How ll2 should deal with packet upon errors */ +enum core_error_handle { + LL2_DROP_PACKET, + LL2_DO_NOTHING, + LL2_ASSERT, + MAX_CORE_ERROR_HANDLE +}; + +/* Opcodes for the event ring */ +enum core_event_opcode { + CORE_EVENT_TX_QUEUE_START, + CORE_EVENT_TX_QUEUE_STOP, + CORE_EVENT_RX_QUEUE_START, + CORE_EVENT_RX_QUEUE_STOP, + CORE_EVENT_RX_QUEUE_FLUSH, + CORE_EVENT_TX_QUEUE_UPDATE, + MAX_CORE_EVENT_OPCODE +}; + +/* The L4 pseudo checksum mode for Core */ +enum core_l4_pseudo_checksum_mode { + CORE_L4_PSEUDO_CSUM_CORRECT_LENGTH, + CORE_L4_PSEUDO_CSUM_ZERO_LENGTH, + MAX_CORE_L4_PSEUDO_CHECKSUM_MODE +}; + +/* Light-L2 RX Producers in Tstorm RAM */ +struct core_ll2_port_stats { + struct regpair gsi_invalid_hdr; + struct regpair gsi_invalid_pkt_length; + struct regpair gsi_unsupported_pkt_typ; + struct regpair gsi_crcchksm_error; +}; + +/* Ethernet TX Per Queue Stats */ +struct core_ll2_pstorm_per_queue_stat { + struct regpair sent_ucast_bytes; + struct regpair sent_mcast_bytes; + struct regpair sent_bcast_bytes; + struct regpair sent_ucast_pkts; + struct regpair sent_mcast_pkts; + struct regpair sent_bcast_pkts; +}; + +/* Light-L2 RX Producers in Tstorm RAM */ +struct core_ll2_rx_prod { + __le16 bd_prod; + __le16 cqe_prod; + __le32 reserved; +}; + +struct core_ll2_tstorm_per_queue_stat { + struct regpair packet_too_big_discard; + struct regpair no_buff_discard; +}; + +struct core_ll2_ustorm_per_queue_stat { + struct regpair rcv_ucast_bytes; + struct regpair rcv_mcast_bytes; + struct regpair rcv_bcast_bytes; + struct regpair rcv_ucast_pkts; + struct regpair rcv_mcast_pkts; + struct regpair rcv_bcast_pkts; +}; + +/* Core Ramrod Command IDs (light L2) */ +enum core_ramrod_cmd_id { + CORE_RAMROD_UNUSED, + CORE_RAMROD_RX_QUEUE_START, + CORE_RAMROD_TX_QUEUE_START, + CORE_RAMROD_RX_QUEUE_STOP, + CORE_RAMROD_TX_QUEUE_STOP, + CORE_RAMROD_RX_QUEUE_FLUSH, + CORE_RAMROD_TX_QUEUE_UPDATE, + MAX_CORE_RAMROD_CMD_ID +}; + +/* Core RX CQE Type for Light L2 */ +enum core_roce_flavor_type { + CORE_ROCE, + CORE_RROCE, + MAX_CORE_ROCE_FLAVOR_TYPE +}; + +/* Specifies how ll2 should deal with packets errors: packet_too_big and + * no_buff. 
+ */ +struct core_rx_action_on_error { + u8 error_type; +#define CORE_RX_ACTION_ON_ERROR_PACKET_TOO_BIG_MASK 0x3 +#define CORE_RX_ACTION_ON_ERROR_PACKET_TOO_BIG_SHIFT 0 +#define CORE_RX_ACTION_ON_ERROR_NO_BUFF_MASK 0x3 +#define CORE_RX_ACTION_ON_ERROR_NO_BUFF_SHIFT 2 +#define CORE_RX_ACTION_ON_ERROR_RESERVED_MASK 0xF +#define CORE_RX_ACTION_ON_ERROR_RESERVED_SHIFT 4 +}; + +/* Core RX BD for Light L2 */ +struct core_rx_bd { + struct regpair addr; + __le16 reserved[4]; +}; + +/* Core RX CM offload BD for Light L2 */ +struct core_rx_bd_with_buff_len { + struct regpair addr; + __le16 buff_length; + __le16 reserved[3]; +}; + +/* Core RX CM offload BD for Light L2 */ +union core_rx_bd_union { + struct core_rx_bd rx_bd; + struct core_rx_bd_with_buff_len rx_bd_with_len; +}; + +/* Opaque Data for Light L2 RX CQE */ +struct core_rx_cqe_opaque_data { + __le32 data[2]; +}; + +/* Core RX CQE Type for Light L2 */ +enum core_rx_cqe_type { + CORE_RX_CQE_ILLEGAL_TYPE, + CORE_RX_CQE_TYPE_REGULAR, + CORE_RX_CQE_TYPE_GSI_OFFLOAD, + CORE_RX_CQE_TYPE_SLOW_PATH, + MAX_CORE_RX_CQE_TYPE +}; + +/* Core RX CQE for Light L2 */ +struct core_rx_fast_path_cqe { + u8 type; + u8 placement_offset; + struct parsing_and_err_flags parse_flags; + __le16 packet_length; + __le16 vlan; + struct core_rx_cqe_opaque_data opaque_data; + struct parsing_err_flags err_flags; + __le16 reserved0; + __le32 reserved1[3]; +}; + +/* Core Rx CM offload CQE */ +struct core_rx_gsi_offload_cqe { + u8 type; + u8 data_length_error; + struct parsing_and_err_flags parse_flags; + __le16 data_length; + __le16 vlan; + __le32 src_mac_addrhi; + __le16 src_mac_addrlo; + __le16 qp_id; + __le32 src_qp; + __le32 reserved[3]; +}; + +/* Core RX CQE for Light L2 */ +struct core_rx_slow_path_cqe { + u8 type; + u8 ramrod_cmd_id; + __le16 echo; + struct core_rx_cqe_opaque_data opaque_data; + __le32 reserved1[5]; +}; + +/* Core RX CM offload BD for Light L2 */ +union core_rx_cqe_union { + struct core_rx_fast_path_cqe rx_cqe_fp; + struct core_rx_gsi_offload_cqe rx_cqe_gsi; + struct core_rx_slow_path_cqe rx_cqe_sp; +}; + +/* Ramrod data for rx queue start ramrod */ +struct core_rx_start_ramrod_data { + struct regpair bd_base; + struct regpair cqe_pbl_addr; + __le16 mtu; + __le16 sb_id; + u8 sb_index; + u8 complete_cqe_flg; + u8 complete_event_flg; + u8 drop_ttl0_flg; + __le16 num_of_pbl_pages; + u8 inner_vlan_stripping_en; + u8 report_outer_vlan; + u8 queue_id; + u8 main_func_queue; + u8 mf_si_bcast_accept_all; + u8 mf_si_mcast_accept_all; + struct core_rx_action_on_error action_on_error; + u8 gsi_offload_flag; + u8 reserved[6]; +}; + +/* Ramrod data for rx queue stop ramrod */ +struct core_rx_stop_ramrod_data { + u8 complete_cqe_flg; + u8 complete_event_flg; + u8 queue_id; + u8 reserved1; + __le16 reserved2[2]; +}; + +/* Flags for Core TX BD */ +struct core_tx_bd_data { + __le16 as_bitfield; +#define CORE_TX_BD_DATA_FORCE_VLAN_MODE_MASK 0x1 +#define CORE_TX_BD_DATA_FORCE_VLAN_MODE_SHIFT 0 +#define CORE_TX_BD_DATA_VLAN_INSERTION_MASK 0x1 +#define CORE_TX_BD_DATA_VLAN_INSERTION_SHIFT 1 +#define CORE_TX_BD_DATA_START_BD_MASK 0x1 +#define CORE_TX_BD_DATA_START_BD_SHIFT 2 +#define CORE_TX_BD_DATA_IP_CSUM_MASK 0x1 +#define CORE_TX_BD_DATA_IP_CSUM_SHIFT 3 +#define CORE_TX_BD_DATA_L4_CSUM_MASK 0x1 +#define CORE_TX_BD_DATA_L4_CSUM_SHIFT 4 +#define CORE_TX_BD_DATA_IPV6_EXT_MASK 0x1 +#define CORE_TX_BD_DATA_IPV6_EXT_SHIFT 5 +#define CORE_TX_BD_DATA_L4_PROTOCOL_MASK 0x1 +#define CORE_TX_BD_DATA_L4_PROTOCOL_SHIFT 6 +#define CORE_TX_BD_DATA_L4_PSEUDO_CSUM_MODE_MASK 0x1 +#define 
CORE_TX_BD_DATA_L4_PSEUDO_CSUM_MODE_SHIFT 7 +#define CORE_TX_BD_DATA_NBDS_MASK 0xF +#define CORE_TX_BD_DATA_NBDS_SHIFT 8 +#define CORE_TX_BD_DATA_ROCE_FLAV_MASK 0x1 +#define CORE_TX_BD_DATA_ROCE_FLAV_SHIFT 12 +#define CORE_TX_BD_DATA_IP_LEN_MASK 0x1 +#define CORE_TX_BD_DATA_IP_LEN_SHIFT 13 +#define CORE_TX_BD_DATA_DISABLE_STAG_INSERTION_MASK 0x1 +#define CORE_TX_BD_DATA_DISABLE_STAG_INSERTION_SHIFT 14 +#define CORE_TX_BD_DATA_RESERVED0_MASK 0x1 +#define CORE_TX_BD_DATA_RESERVED0_SHIFT 15 +}; + +/* Core TX BD for Light L2 */ +struct core_tx_bd { + struct regpair addr; + __le16 nbytes; + __le16 nw_vlan_or_lb_echo; + struct core_tx_bd_data bd_data; + __le16 bitfield1; +#define CORE_TX_BD_L4_HDR_OFFSET_W_MASK 0x3FFF +#define CORE_TX_BD_L4_HDR_OFFSET_W_SHIFT 0 +#define CORE_TX_BD_TX_DST_MASK 0x3 +#define CORE_TX_BD_TX_DST_SHIFT 14 +}; + +/* Light L2 TX Destination */ +enum core_tx_dest { + CORE_TX_DEST_NW, + CORE_TX_DEST_LB, + CORE_TX_DEST_RESERVED, + CORE_TX_DEST_DROP, + MAX_CORE_TX_DEST +}; + +/* Ramrod data for tx queue start ramrod */ +struct core_tx_start_ramrod_data { + struct regpair pbl_base_addr; + __le16 mtu; + __le16 sb_id; + u8 sb_index; + u8 stats_en; + u8 stats_id; + u8 conn_type; + __le16 pbl_size; + __le16 qm_pq_id; + u8 gsi_offload_flag; + u8 resrved[3]; +}; + +/* Ramrod data for tx queue stop ramrod */ +struct core_tx_stop_ramrod_data { + __le32 reserved0[2]; +}; + +/* Ramrod data for tx queue update ramrod */ +struct core_tx_update_ramrod_data { + u8 update_qm_pq_id_flg; + u8 reserved0; + __le16 qm_pq_id; + __le32 reserved1[1]; +}; + +/* Enum flag for what type of dcb data to update */ +enum dcb_dscp_update_mode { + DONT_UPDATE_DCB_DSCP, + UPDATE_DCB, + UPDATE_DSCP, + UPDATE_DCB_DSCP, + MAX_DCB_DSCP_UPDATE_MODE +}; + +/* The core storm context for the Ystorm */ +struct ystorm_core_conn_st_ctx { + __le32 reserved[4]; +}; + +/* The core storm context for the Pstorm */ +struct pstorm_core_conn_st_ctx { + __le32 reserved[4]; +}; + +/* Core Slowpath Connection storm context of Xstorm */ +struct xstorm_core_conn_st_ctx { + __le32 spq_base_lo; + __le32 spq_base_hi; + struct regpair consolid_base_addr; + __le16 spq_cons; + __le16 consolid_cons; + __le32 reserved0[55]; +}; + +struct e4_xstorm_core_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_XSTORM_CORE_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED1_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED1_SHIFT 1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED2_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED2_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_EXIST_IN_QM3_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_EXIST_IN_QM3_SHIFT 3 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED3_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED3_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED4_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED4_SHIFT 5 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED5_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED5_SHIFT 6 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED6_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED6_SHIFT 7 + u8 flags1; +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED7_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED7_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED8_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED8_SHIFT 1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED9_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED9_SHIFT 2 +#define 
E4_XSTORM_CORE_CONN_AG_CTX_BIT11_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT11_SHIFT 3 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT12_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT12_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT13_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT13_SHIFT 5 +#define E4_XSTORM_CORE_CONN_AG_CTX_TX_RULE_ACTIVE_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_TX_RULE_ACTIVE_SHIFT 6 +#define E4_XSTORM_CORE_CONN_AG_CTX_DQ_CF_ACTIVE_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_DQ_CF_ACTIVE_SHIFT 7 + u8 flags2; +#define E4_XSTORM_CORE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF0_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF1_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF2_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF3_SHIFT 6 + u8 flags3; +#define E4_XSTORM_CORE_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF4_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF5_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF6_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF7_SHIFT 6 + u8 flags4; +#define E4_XSTORM_CORE_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF8_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF9_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF10_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF11_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF11_SHIFT 6 + u8 flags5; +#define E4_XSTORM_CORE_CONN_AG_CTX_CF12_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF12_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF13_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF13_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF14_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF14_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF15_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF15_SHIFT 6 + u8 flags6; +#define E4_XSTORM_CORE_CONN_AG_CTX_CONSOLID_PROD_CF_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CONSOLID_PROD_CF_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF17_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF17_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_DQ_CF_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_DQ_CF_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_TERMINATE_CF_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_TERMINATE_CF_SHIFT 6 + u8 flags7; +#define E4_XSTORM_CORE_CONN_AG_CTX_FLUSH_Q0_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_FLUSH_Q0_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED10_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED10_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_SLOW_PATH_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_SLOW_PATH_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT 6 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT 7 + u8 flags8; +#define E4_XSTORM_CORE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF3EN_SHIFT 1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF4EN_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF5EN_SHIFT 3 
+#define E4_XSTORM_CORE_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF6EN_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF7EN_SHIFT 5 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF8EN_SHIFT 6 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF9EN_SHIFT 7 + u8 flags9; +#define E4_XSTORM_CORE_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF10EN_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF11EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF11EN_SHIFT 1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF12EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF12EN_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF13EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF13EN_SHIFT 3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF14EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF14EN_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF15EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF15EN_SHIFT 5 +#define E4_XSTORM_CORE_CONN_AG_CTX_CONSOLID_PROD_CF_EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CONSOLID_PROD_CF_EN_SHIFT 6 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF17EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF17EN_SHIFT 7 + u8 flags10; +#define E4_XSTORM_CORE_CONN_AG_CTX_DQ_CF_EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_DQ_CF_EN_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_TERMINATE_CF_EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_TERMINATE_CF_EN_SHIFT 1 +#define E4_XSTORM_CORE_CONN_AG_CTX_FLUSH_Q0_EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_FLUSH_Q0_EN_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED11_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED11_SHIFT 3 +#define E4_XSTORM_CORE_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF23EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF23EN_SHIFT 5 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED12_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED12_SHIFT 6 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED13_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED13_SHIFT 7 + u8 flags11; +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED14_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED14_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED15_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RESERVED15_SHIFT 1 +#define E4_XSTORM_CORE_CONN_AG_CTX_TX_DEC_RULE_EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_TX_DEC_RULE_EN_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE5EN_SHIFT 3 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE6EN_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE7EN_SHIFT 5 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED1_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED1_SHIFT 6 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE9EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE9EN_SHIFT 7 + u8 flags12; +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE10EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE10EN_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE11EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE11EN_SHIFT 1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED2_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED2_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED3_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED3_SHIFT 3 +#define 
E4_XSTORM_CORE_CONN_AG_CTX_RULE14EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE14EN_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE15EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE15EN_SHIFT 5 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE16EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE16EN_SHIFT 6 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE17EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE17EN_SHIFT 7 + u8 flags13; +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE18EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE18EN_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE19EN_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_RULE19EN_SHIFT 1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED4_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED4_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED5_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED5_SHIFT 3 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED6_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED6_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED7_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED7_SHIFT 5 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED8_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED8_SHIFT 6 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED9_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT16_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT16_SHIFT 0 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT17_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT17_SHIFT 1 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT18_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT18_SHIFT 2 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT19_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT19_SHIFT 3 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT20_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT20_SHIFT 4 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT21_MASK 0x1 +#define E4_XSTORM_CORE_CONN_AG_CTX_BIT21_SHIFT 5 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF23_MASK 0x3 +#define E4_XSTORM_CORE_CONN_AG_CTX_CF23_SHIFT 6 + u8 byte2; + __le16 physical_q0; + __le16 consolid_prod; + __le16 reserved16; + __le16 tx_bd_cons; + __le16 tx_bd_or_spq_prod; + __le16 updated_qm_pq_id; + __le16 conn_dpi; + u8 byte3; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le32 reg4; + __le32 reg5; + __le32 reg6; + __le16 word7; + __le16 word8; + __le16 word9; + __le16 word10; + __le32 reg7; + __le32 reg8; + __le32 reg9; + u8 byte7; + u8 byte8; + u8 byte9; + u8 byte10; + u8 byte11; + u8 byte12; + u8 byte13; + u8 byte14; + u8 byte15; + u8 e5_reserved; + __le16 word11; + __le32 reg10; + __le32 reg11; + __le32 reg12; + __le32 reg13; + __le32 reg14; + __le32 reg15; + __le32 reg16; + __le32 reg17; + __le32 reg18; + __le32 reg19; + __le16 word12; + __le16 word13; + __le16 word14; + __le16 word15; +}; + +struct e4_tstorm_core_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT2_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT2_SHIFT 2 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT3_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT3_SHIFT 3 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT4_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT4_SHIFT 4 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT5_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_BIT5_SHIFT 5 +#define 
E4_TSTORM_CORE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF0_SHIFT 6 + u8 flags1; +#define E4_TSTORM_CORE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF1_SHIFT 0 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF2_SHIFT 2 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF3_SHIFT 4 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF4_SHIFT 6 + u8 flags2; +#define E4_TSTORM_CORE_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF5_SHIFT 0 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF6_SHIFT 2 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF7_SHIFT 4 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF8_SHIFT 6 + u8 flags3; +#define E4_TSTORM_CORE_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF9_SHIFT 0 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF10_SHIFT 2 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT 4 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT 5 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT 6 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF3EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_CORE_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF4EN_SHIFT 0 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF5EN_SHIFT 1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF6EN_SHIFT 2 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF7EN_SHIFT 3 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF8EN_SHIFT 4 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF9EN_SHIFT 5 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_CF10EN_SHIFT 6 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags5; +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_TSTORM_CORE_CONN_AG_CTX_RULE8EN_SHIFT 7 + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le32 reg4; + __le32 reg5; + __le32 reg6; + __le32 reg7; + __le32 reg8; + u8 byte2; + u8 byte3; + __le16 word0; + u8 byte4; + u8 byte5; + __le16 word1; + __le16 word2; + __le16 word3; + __le32 reg9; + __le32 reg10; +}; + +struct e4_ustorm_core_conn_ag_ctx { + u8 reserved; + u8 byte1; + u8 flags0; +#define 
E4_USTORM_CORE_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_USTORM_CORE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_USTORM_CORE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_USTORM_CORE_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_USTORM_CORE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_USTORM_CORE_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_USTORM_CORE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_USTORM_CORE_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_USTORM_CORE_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_USTORM_CORE_CONN_AG_CTX_CF3_SHIFT 0 +#define E4_USTORM_CORE_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_USTORM_CORE_CONN_AG_CTX_CF4_SHIFT 2 +#define E4_USTORM_CORE_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_USTORM_CORE_CONN_AG_CTX_CF5_SHIFT 4 +#define E4_USTORM_CORE_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_USTORM_CORE_CONN_AG_CTX_CF6_SHIFT 6 + u8 flags2; +#define E4_USTORM_CORE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_USTORM_CORE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_USTORM_CORE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_USTORM_CORE_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_CF3EN_SHIFT 3 +#define E4_USTORM_CORE_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_CF4EN_SHIFT 4 +#define E4_USTORM_CORE_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_CF5EN_SHIFT 5 +#define E4_USTORM_CORE_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_CF6EN_SHIFT 6 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags3; +#define E4_USTORM_CORE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_USTORM_CORE_CONN_AG_CTX_RULE8EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le16 word1; + __le32 rx_producers; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le16 word2; + __le16 word3; +}; + +/* The core storm context for the Mstorm */ +struct mstorm_core_conn_st_ctx { + __le32 reserved[24]; +}; + +/* The core storm context for the Ustorm */ +struct ustorm_core_conn_st_ctx { + __le32 reserved[4]; +}; + +/* core connection context */ +struct e4_core_conn_context { + struct ystorm_core_conn_st_ctx ystorm_st_context; + struct regpair ystorm_st_padding[2]; + struct pstorm_core_conn_st_ctx pstorm_st_context; + struct regpair pstorm_st_padding[2]; + struct xstorm_core_conn_st_ctx xstorm_st_context; + struct e4_xstorm_core_conn_ag_ctx xstorm_ag_context; + struct e4_tstorm_core_conn_ag_ctx tstorm_ag_context; + struct e4_ustorm_core_conn_ag_ctx ustorm_ag_context; + struct mstorm_core_conn_st_ctx mstorm_st_context; + struct ustorm_core_conn_st_ctx ustorm_st_context; + struct regpair ustorm_st_padding[2]; +}; + +struct 
eth_mstorm_per_pf_stat { + struct regpair gre_discard_pkts; + struct regpair vxlan_discard_pkts; + struct regpair geneve_discard_pkts; + struct regpair lb_discard_pkts; +}; + +struct eth_mstorm_per_queue_stat { + struct regpair ttl0_discard; + struct regpair packet_too_big_discard; + struct regpair no_buff_discard; + struct regpair not_active_discard; + struct regpair tpa_coalesced_pkts; + struct regpair tpa_coalesced_events; + struct regpair tpa_aborts_num; + struct regpair tpa_coalesced_bytes; +}; + +/* Ethernet TX Per PF */ +struct eth_pstorm_per_pf_stat { + struct regpair sent_lb_ucast_bytes; + struct regpair sent_lb_mcast_bytes; + struct regpair sent_lb_bcast_bytes; + struct regpair sent_lb_ucast_pkts; + struct regpair sent_lb_mcast_pkts; + struct regpair sent_lb_bcast_pkts; + struct regpair sent_gre_bytes; + struct regpair sent_vxlan_bytes; + struct regpair sent_geneve_bytes; + struct regpair sent_gre_pkts; + struct regpair sent_vxlan_pkts; + struct regpair sent_geneve_pkts; + struct regpair gre_drop_pkts; + struct regpair vxlan_drop_pkts; + struct regpair geneve_drop_pkts; +}; + +/* Ethernet TX Per Queue Stats */ +struct eth_pstorm_per_queue_stat { + struct regpair sent_ucast_bytes; + struct regpair sent_mcast_bytes; + struct regpair sent_bcast_bytes; + struct regpair sent_ucast_pkts; + struct regpair sent_mcast_pkts; + struct regpair sent_bcast_pkts; + struct regpair error_drop_pkts; +}; + +/* ETH Rx producers data */ +struct eth_rx_rate_limit { + __le16 mult; + __le16 cnst; + u8 add_sub_cnst; + u8 reserved0; + __le16 reserved1; +}; + +struct eth_ustorm_per_pf_stat { + struct regpair rcv_lb_ucast_bytes; + struct regpair rcv_lb_mcast_bytes; + struct regpair rcv_lb_bcast_bytes; + struct regpair rcv_lb_ucast_pkts; + struct regpair rcv_lb_mcast_pkts; + struct regpair rcv_lb_bcast_pkts; + struct regpair rcv_gre_bytes; + struct regpair rcv_vxlan_bytes; + struct regpair rcv_geneve_bytes; + struct regpair rcv_gre_pkts; + struct regpair rcv_vxlan_pkts; + struct regpair rcv_geneve_pkts; +}; + +struct eth_ustorm_per_queue_stat { + struct regpair rcv_ucast_bytes; + struct regpair rcv_mcast_bytes; + struct regpair rcv_bcast_bytes; + struct regpair rcv_ucast_pkts; + struct regpair rcv_mcast_pkts; + struct regpair rcv_bcast_pkts; +}; + +/* Event Ring VF-PF Channel data */ +struct vf_pf_channel_eqe_data { + struct regpair msg_addr; +}; + +/* Event Ring malicious VF data */ +struct malicious_vf_eqe_data { + u8 vf_id; + u8 err_id; + __le16 reserved[3]; +}; + +/* Event Ring initial cleanup data */ +struct initial_cleanup_eqe_data { + u8 vf_id; + u8 reserved[7]; +}; + +/* Event Data Union */ +union event_ring_data { + u8 bytes[8]; + struct vf_pf_channel_eqe_data vf_pf_channel; + struct iscsi_eqe_data iscsi_info; + struct iscsi_connect_done_results iscsi_conn_done_info; + union rdma_eqe_data rdma_data; + struct malicious_vf_eqe_data malicious_vf; + struct initial_cleanup_eqe_data vf_init_cleanup; +}; + +/* Event Ring Entry */ +struct event_ring_entry { + u8 protocol_id; + u8 opcode; + __le16 reserved0; + __le16 echo; + u8 fw_return_code; + u8 flags; +#define EVENT_RING_ENTRY_ASYNC_MASK 0x1 +#define EVENT_RING_ENTRY_ASYNC_SHIFT 0 +#define EVENT_RING_ENTRY_RESERVED1_MASK 0x7F +#define EVENT_RING_ENTRY_RESERVED1_SHIFT 1 + union event_ring_data data; +}; + +/* Event Ring Next Page Address */ +struct event_ring_next_addr { + struct regpair addr; + __le32 reserved[2]; +}; + +/* Event Ring Element */ +union event_ring_element { + struct event_ring_entry entry; + struct event_ring_next_addr next_addr; +}; + 
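The flags byte of struct event_ring_entry above follows the same MASK/SHIFT convention used throughout this header. A minimal sketch of decoding one event ring element, assuming the GET_FIELD() helper from common_hsi.h ((value >> NAME_SHIFT) & NAME_MASK); the example_decode_eq_entry() wrapper itself is hypothetical.

/* Illustrative sketch: decoding a single event ring entry.
 * GET_FIELD() is assumed from common_hsi.h.
 */
static void example_decode_eq_entry(const struct event_ring_entry *entry)
{
	if (!GET_FIELD(entry->flags, EVENT_RING_ENTRY_ASYNC)) {
		/* Synchronous ramrod completion: 'echo' matches the echo
		 * posted in the ramrod_header and fw_return_code carries
		 * the firmware status for that slowpath element.
		 */
		pr_debug("EQE completion: echo 0x%04x rc %u\n",
			 le16_to_cpu(entry->echo), entry->fw_return_code);
		return;
	}

	/* Asynchronous event: protocol_id/opcode select the union member,
	 * e.g. COMMON_EVENT_MALICIOUS_VF -> data.malicious_vf.
	 */
	if (entry->opcode == COMMON_EVENT_MALICIOUS_VF)
		pr_debug("Malicious VF %u reported, error id %u\n",
			 entry->data.malicious_vf.vf_id,
			 entry->data.malicious_vf.err_id);
}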
+/* Ports mode */ +enum fw_flow_ctrl_mode { + flow_ctrl_pause, + flow_ctrl_pfc, + MAX_FW_FLOW_CTRL_MODE +}; + +/* GFT profile type */ +enum gft_profile_type { + GFT_PROFILE_TYPE_4_TUPLE, + GFT_PROFILE_TYPE_L4_DST_PORT, + GFT_PROFILE_TYPE_IP_DST_ADDR, + GFT_PROFILE_TYPE_IP_SRC_ADDR, + GFT_PROFILE_TYPE_TUNNEL_TYPE, + MAX_GFT_PROFILE_TYPE +}; + +/* Major and Minor hsi Versions */ +struct hsi_fp_ver_struct { + u8 minor_ver_arr[2]; + u8 major_ver_arr[2]; +}; + +enum iwarp_ll2_tx_queues { + IWARP_LL2_IN_ORDER_TX_QUEUE = 1, + IWARP_LL2_ALIGNED_TX_QUEUE, + IWARP_LL2_ALIGNED_RIGHT_TRIMMED_TX_QUEUE, + IWARP_LL2_ERROR, + MAX_IWARP_LL2_TX_QUEUES +}; + +/* Malicious VF error ID */ +enum malicious_vf_error_id { + MALICIOUS_VF_NO_ERROR, + VF_PF_CHANNEL_NOT_READY, + VF_ZONE_MSG_NOT_VALID, + VF_ZONE_FUNC_NOT_ENABLED, + ETH_PACKET_TOO_SMALL, + ETH_ILLEGAL_VLAN_MODE, + ETH_MTU_VIOLATION, + ETH_ILLEGAL_INBAND_TAGS, + ETH_VLAN_INSERT_AND_INBAND_VLAN, + ETH_ILLEGAL_NBDS, + ETH_FIRST_BD_WO_SOP, + ETH_INSUFFICIENT_BDS, + ETH_ILLEGAL_LSO_HDR_NBDS, + ETH_ILLEGAL_LSO_MSS, + ETH_ZERO_SIZE_BD, + ETH_ILLEGAL_LSO_HDR_LEN, + ETH_INSUFFICIENT_PAYLOAD, + ETH_EDPM_OUT_OF_SYNC, + ETH_TUNN_IPV6_EXT_NBD_ERR, + ETH_CONTROL_PACKET_VIOLATION, + ETH_ANTI_SPOOFING_ERR, + ETH_PACKET_SIZE_TOO_LARGE, + MAX_MALICIOUS_VF_ERROR_ID +}; + +/* Mstorm non-triggering VF zone */ +struct mstorm_non_trigger_vf_zone { + struct eth_mstorm_per_queue_stat eth_queue_stat; + struct eth_rx_prod_data eth_rx_queue_producers[ETH_MAX_NUM_RX_QUEUES_PER_VF_QUAD]; +}; + +/* Mstorm VF zone */ +struct mstorm_vf_zone { + struct mstorm_non_trigger_vf_zone non_trigger; +}; + +/* vlan header including TPID and TCI fields */ +struct vlan_header { + __le16 tpid; + __le16 tci; +}; + +/* outer tag configurations */ +struct outer_tag_config_struct { + u8 enable_stag_pri_change; + u8 pri_map_valid; + u8 reserved[2]; + struct vlan_header outer_tag; + u8 inner_to_outer_pri_map[8]; +}; + +/* personality per PF */ +enum personality_type { + BAD_PERSONALITY_TYP, + PERSONALITY_ISCSI, + PERSONALITY_FCOE, + PERSONALITY_RDMA_AND_ETH, + PERSONALITY_RDMA, + PERSONALITY_CORE, + PERSONALITY_ETH, + PERSONALITY_RESERVED, + MAX_PERSONALITY_TYPE +}; + +/* tunnel configuration */ +struct pf_start_tunnel_config { + u8 set_vxlan_udp_port_flg; + u8 set_geneve_udp_port_flg; + u8 tunnel_clss_vxlan; + u8 tunnel_clss_l2geneve; + u8 tunnel_clss_ipgeneve; + u8 tunnel_clss_l2gre; + u8 tunnel_clss_ipgre; + u8 reserved; + __le16 vxlan_udp_port; + __le16 geneve_udp_port; +}; + +/* Ramrod data for PF start ramrod */ +struct pf_start_ramrod_data { + struct regpair event_ring_pbl_addr; + struct regpair consolid_q_pbl_addr; + struct pf_start_tunnel_config tunnel_config; + __le16 event_ring_sb_id; + u8 base_vf_id; + u8 num_vfs; + u8 event_ring_num_pages; + u8 event_ring_sb_index; + u8 path_id; + u8 warning_as_error; + u8 dont_log_ramrods; + u8 personality; + __le16 log_type_mask; + u8 mf_mode; + u8 integ_phase; + u8 allow_npar_tx_switching; + u8 reserved0; + struct hsi_fp_ver_struct hsi_fp_ver; + struct outer_tag_config_struct outer_tag_config; +}; + +/* Data for port update ramrod */ +struct protocol_dcb_data { + u8 dcb_enable_flag; + u8 dscp_enable_flag; + u8 dcb_priority; + u8 dcb_tc; + u8 dscp_val; + u8 dcb_dont_add_vlan0; +}; + +/* Update tunnel configuration */ +struct pf_update_tunnel_config { + u8 update_rx_pf_clss; + u8 update_rx_def_ucast_clss; + u8 update_rx_def_non_ucast_clss; + u8 set_vxlan_udp_port_flg; + u8 set_geneve_udp_port_flg; + u8 tunnel_clss_vxlan; + u8 tunnel_clss_l2geneve; + u8 
tunnel_clss_ipgeneve; + u8 tunnel_clss_l2gre; + u8 tunnel_clss_ipgre; + __le16 vxlan_udp_port; + __le16 geneve_udp_port; + __le16 reserved; +}; + +/* Data for port update ramrod */ +struct pf_update_ramrod_data { + u8 update_eth_dcb_data_mode; + u8 update_fcoe_dcb_data_mode; + u8 update_iscsi_dcb_data_mode; + u8 update_roce_dcb_data_mode; + u8 update_rroce_dcb_data_mode; + u8 update_iwarp_dcb_data_mode; + u8 update_mf_vlan_flag; + u8 update_enable_stag_pri_change; + struct protocol_dcb_data eth_dcb_data; + struct protocol_dcb_data fcoe_dcb_data; + struct protocol_dcb_data iscsi_dcb_data; + struct protocol_dcb_data roce_dcb_data; + struct protocol_dcb_data rroce_dcb_data; + struct protocol_dcb_data iwarp_dcb_data; + __le16 mf_vlan; + u8 enable_stag_pri_change; + u8 reserved; + struct pf_update_tunnel_config tunnel_config; +}; + +/* Ports mode */ +enum ports_mode { + ENGX2_PORTX1, + ENGX2_PORTX2, + ENGX1_PORTX1, + ENGX1_PORTX2, + ENGX1_PORTX4, + MAX_PORTS_MODE +}; + +/* use to index in hsi_fp_[major|minor]_ver_arr per protocol */ +enum protocol_version_array_key { + ETH_VER_KEY = 0, + ROCE_VER_KEY, + MAX_PROTOCOL_VERSION_ARRAY_KEY +}; + +/* RDMA TX Stats */ +struct rdma_sent_stats { + struct regpair sent_bytes; + struct regpair sent_pkts; +}; + +/* Pstorm non-triggering VF zone */ +struct pstorm_non_trigger_vf_zone { + struct eth_pstorm_per_queue_stat eth_queue_stat; + struct rdma_sent_stats rdma_stats; +}; + +/* Pstorm VF zone */ +struct pstorm_vf_zone { + struct pstorm_non_trigger_vf_zone non_trigger; + struct regpair reserved[7]; +}; + +/* Ramrod Header of SPQE */ +struct ramrod_header { + __le32 cid; + u8 cmd_id; + u8 protocol_id; + __le16 echo; +}; + +/* RDMA RX Stats */ +struct rdma_rcv_stats { + struct regpair rcv_bytes; + struct regpair rcv_pkts; +}; + +/* Data for update QCN/DCQCN RL ramrod */ +struct rl_update_ramrod_data { + u8 qcn_update_param_flg; + u8 dcqcn_update_param_flg; + u8 rl_init_flg; + u8 rl_start_flg; + u8 rl_stop_flg; + u8 rl_id_first; + u8 rl_id_last; + u8 rl_dc_qcn_flg; + __le32 rl_bc_rate; + __le16 rl_max_rate; + __le16 rl_r_ai; + __le16 rl_r_hai; + __le16 dcqcn_g; + __le32 dcqcn_k_us; + __le32 dcqcn_timeuot_us; + __le32 qcn_timeuot_us; + __le32 reserved[2]; +}; + +/* Slowpath Element (SPQE) */ +struct slow_path_element { + struct ramrod_header hdr; + struct regpair data_ptr; +}; + +/* Tstorm non-triggering VF zone */ +struct tstorm_non_trigger_vf_zone { + struct rdma_rcv_stats rdma_stats; +}; + +struct tstorm_per_port_stat { + struct regpair trunc_error_discard; + struct regpair mac_error_discard; + struct regpair mftag_filter_discard; + struct regpair eth_mac_filter_discard; + struct regpair ll2_mac_filter_discard; + struct regpair ll2_conn_disabled_discard; + struct regpair iscsi_irregular_pkt; + struct regpair fcoe_irregular_pkt; + struct regpair roce_irregular_pkt; + struct regpair iwarp_irregular_pkt; + struct regpair eth_irregular_pkt; + struct regpair toe_irregular_pkt; + struct regpair preroce_irregular_pkt; + struct regpair eth_gre_tunn_filter_discard; + struct regpair eth_vxlan_tunn_filter_discard; + struct regpair eth_geneve_tunn_filter_discard; + struct regpair eth_gft_drop_pkt; +}; + +/* Tstorm VF zone */ +struct tstorm_vf_zone { + struct tstorm_non_trigger_vf_zone non_trigger; +}; + +/* Tunnel classification scheme */ +enum tunnel_clss { + TUNNEL_CLSS_MAC_VLAN = 0, + TUNNEL_CLSS_MAC_VNI, + TUNNEL_CLSS_INNER_MAC_VLAN, + TUNNEL_CLSS_INNER_MAC_VNI, + TUNNEL_CLSS_MAC_VLAN_DUAL_STAGE, + MAX_TUNNEL_CLSS +}; + +/* Ustorm non-triggering VF zone */ +struct 
ustorm_non_trigger_vf_zone { + struct eth_ustorm_per_queue_stat eth_queue_stat; + struct regpair vf_pf_msg_addr; +}; + +/* Ustorm triggering VF zone */ +struct ustorm_trigger_vf_zone { + u8 vf_pf_msg_valid; + u8 reserved[7]; +}; + +/* Ustorm VF zone */ +struct ustorm_vf_zone { + struct ustorm_non_trigger_vf_zone non_trigger; + struct ustorm_trigger_vf_zone trigger; +}; + +/* VF-PF channel data */ +struct vf_pf_channel_data { + __le32 ready; + u8 valid; + u8 reserved0; + __le16 reserved1; +}; + +/* Ramrod data for VF start ramrod */ +struct vf_start_ramrod_data { + u8 vf_id; + u8 enable_flr_ack; + __le16 opaque_fid; + u8 personality; + u8 reserved[7]; + struct hsi_fp_ver_struct hsi_fp_ver; + +}; + +/* Ramrod data for VF start ramrod */ +struct vf_stop_ramrod_data { + u8 vf_id; + u8 reserved0; + __le16 reserved1; + __le32 reserved2; +}; + +/* VF zone size mode */ +enum vf_zone_size_mode { + VF_ZONE_SIZE_MODE_DEFAULT, + VF_ZONE_SIZE_MODE_DOUBLE, + VF_ZONE_SIZE_MODE_QUAD, + MAX_VF_ZONE_SIZE_MODE +}; + +/* Attentions status block */ +struct atten_status_block { + __le32 atten_bits; + __le32 atten_ack; + __le16 reserved0; + __le16 sb_index; + __le32 reserved1; +}; + +/* DMAE command */ +struct dmae_cmd { + __le32 opcode; +#define DMAE_CMD_SRC_MASK 0x1 +#define DMAE_CMD_SRC_SHIFT 0 +#define DMAE_CMD_DST_MASK 0x3 +#define DMAE_CMD_DST_SHIFT 1 +#define DMAE_CMD_C_DST_MASK 0x1 +#define DMAE_CMD_C_DST_SHIFT 3 +#define DMAE_CMD_CRC_RESET_MASK 0x1 +#define DMAE_CMD_CRC_RESET_SHIFT 4 +#define DMAE_CMD_SRC_ADDR_RESET_MASK 0x1 +#define DMAE_CMD_SRC_ADDR_RESET_SHIFT 5 +#define DMAE_CMD_DST_ADDR_RESET_MASK 0x1 +#define DMAE_CMD_DST_ADDR_RESET_SHIFT 6 +#define DMAE_CMD_COMP_FUNC_MASK 0x1 +#define DMAE_CMD_COMP_FUNC_SHIFT 7 +#define DMAE_CMD_COMP_WORD_EN_MASK 0x1 +#define DMAE_CMD_COMP_WORD_EN_SHIFT 8 +#define DMAE_CMD_COMP_CRC_EN_MASK 0x1 +#define DMAE_CMD_COMP_CRC_EN_SHIFT 9 +#define DMAE_CMD_COMP_CRC_OFFSET_MASK 0x7 +#define DMAE_CMD_COMP_CRC_OFFSET_SHIFT 10 +#define DMAE_CMD_RESERVED1_MASK 0x1 +#define DMAE_CMD_RESERVED1_SHIFT 13 +#define DMAE_CMD_ENDIANITY_MODE_MASK 0x3 +#define DMAE_CMD_ENDIANITY_MODE_SHIFT 14 +#define DMAE_CMD_ERR_HANDLING_MASK 0x3 +#define DMAE_CMD_ERR_HANDLING_SHIFT 16 +#define DMAE_CMD_PORT_ID_MASK 0x3 +#define DMAE_CMD_PORT_ID_SHIFT 18 +#define DMAE_CMD_SRC_PF_ID_MASK 0xF +#define DMAE_CMD_SRC_PF_ID_SHIFT 20 +#define DMAE_CMD_DST_PF_ID_MASK 0xF +#define DMAE_CMD_DST_PF_ID_SHIFT 24 +#define DMAE_CMD_SRC_VF_ID_VALID_MASK 0x1 +#define DMAE_CMD_SRC_VF_ID_VALID_SHIFT 28 +#define DMAE_CMD_DST_VF_ID_VALID_MASK 0x1 +#define DMAE_CMD_DST_VF_ID_VALID_SHIFT 29 +#define DMAE_CMD_RESERVED2_MASK 0x3 +#define DMAE_CMD_RESERVED2_SHIFT 30 + __le32 src_addr_lo; + __le32 src_addr_hi; + __le32 dst_addr_lo; + __le32 dst_addr_hi; + __le16 length_dw; + __le16 opcode_b; +#define DMAE_CMD_SRC_VF_ID_MASK 0xFF +#define DMAE_CMD_SRC_VF_ID_SHIFT 0 +#define DMAE_CMD_DST_VF_ID_MASK 0xFF +#define DMAE_CMD_DST_VF_ID_SHIFT 8 + __le32 comp_addr_lo; + __le32 comp_addr_hi; + __le32 comp_val; + __le32 crc32; + __le32 crc_32_c; + __le16 crc16; + __le16 crc16_c; + __le16 crc10; + __le16 reserved; + __le16 xsum16; + __le16 xsum8; +}; + +enum dmae_cmd_comp_crc_en_enum { + dmae_cmd_comp_crc_disabled, + dmae_cmd_comp_crc_enabled, + MAX_DMAE_CMD_COMP_CRC_EN_ENUM +}; + +enum dmae_cmd_comp_func_enum { + dmae_cmd_comp_func_to_src, + dmae_cmd_comp_func_to_dst, + MAX_DMAE_CMD_COMP_FUNC_ENUM +}; + +enum dmae_cmd_comp_word_en_enum { + dmae_cmd_comp_word_disabled, + dmae_cmd_comp_word_enabled, + MAX_DMAE_CMD_COMP_WORD_EN_ENUM +}; + 
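The dmae_cmd opcode above is composed from the MASK/SHIFT pairs in the struct definition together with the source/destination enums defined immediately below. A minimal sketch of building such an opcode, assuming the SET_FIELD() helper from common_hsi.h (clear NAME_MASK << NAME_SHIFT, then OR in the shifted value); example_dmae_opcode() itself is hypothetical.

/* Illustrative sketch: a dmae_cmd opcode for a PCIe -> GRC copy with a
 * completion word directed back at the source function.
 * SET_FIELD() is assumed from common_hsi.h.
 */
static u32 example_dmae_opcode(u8 port_id, u8 pf_id)
{
	u32 opcode = 0;

	SET_FIELD(opcode, DMAE_CMD_SRC, dmae_cmd_src_pcie);
	SET_FIELD(opcode, DMAE_CMD_DST, dmae_cmd_dst_grc);
	SET_FIELD(opcode, DMAE_CMD_COMP_FUNC, dmae_cmd_comp_func_to_src);
	SET_FIELD(opcode, DMAE_CMD_COMP_WORD_EN, dmae_cmd_comp_word_enabled);
	SET_FIELD(opcode, DMAE_CMD_PORT_ID, port_id);	/* 2-bit field */
	SET_FIELD(opcode, DMAE_CMD_SRC_PF_ID, pf_id);	/* 4-bit fields */
	SET_FIELD(opcode, DMAE_CMD_DST_PF_ID, pf_id);

	/* A real command would store cpu_to_le32(opcode) into
	 * struct dmae_cmd.opcode before posting it.
	 */
	return opcode;
}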
+enum dmae_cmd_c_dst_enum { + dmae_cmd_c_dst_pcie, + dmae_cmd_c_dst_grc, + MAX_DMAE_CMD_C_DST_ENUM +}; + +enum dmae_cmd_dst_enum { + dmae_cmd_dst_none_0, + dmae_cmd_dst_pcie, + dmae_cmd_dst_grc, + dmae_cmd_dst_none_3, + MAX_DMAE_CMD_DST_ENUM +}; + +enum dmae_cmd_error_handling_enum { + dmae_cmd_error_handling_send_regular_comp, + dmae_cmd_error_handling_send_comp_with_err, + dmae_cmd_error_handling_dont_send_comp, + MAX_DMAE_CMD_ERROR_HANDLING_ENUM +}; + +enum dmae_cmd_src_enum { + dmae_cmd_src_pcie, + dmae_cmd_src_grc, + MAX_DMAE_CMD_SRC_ENUM +}; + +struct e4_mstorm_core_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_MSTORM_CORE_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_MSTORM_CORE_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_MSTORM_CORE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_MSTORM_CORE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_MSTORM_CORE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_MSTORM_CORE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_CORE_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_MSTORM_CORE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_MSTORM_CORE_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_MSTORM_CORE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_CORE_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_MSTORM_CORE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_MSTORM_CORE_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_MSTORM_CORE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_CORE_CONN_AG_CTX_RULE4EN_SHIFT 7 + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; +}; + +struct e4_ystorm_core_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_YSTORM_CORE_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_YSTORM_CORE_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_YSTORM_CORE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_CORE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_YSTORM_CORE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_YSTORM_CORE_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_YSTORM_CORE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_CORE_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_YSTORM_CORE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_YSTORM_CORE_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_YSTORM_CORE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_CORE_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_YSTORM_CORE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_CORE_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_YSTORM_CORE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_YSTORM_CORE_CONN_AG_CTX_RULE4EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le32 reg0; + __le32 reg1; + __le16 word1; + __le16 word2; + __le16 word3; + 
__le16 word4; + __le32 reg2; + __le32 reg3; +}; + +/* IGU cleanup command */ +struct igu_cleanup { + __le32 sb_id_and_flags; +#define IGU_CLEANUP_RESERVED0_MASK 0x7FFFFFF +#define IGU_CLEANUP_RESERVED0_SHIFT 0 +#define IGU_CLEANUP_CLEANUP_SET_MASK 0x1 +#define IGU_CLEANUP_CLEANUP_SET_SHIFT 27 +#define IGU_CLEANUP_CLEANUP_TYPE_MASK 0x7 +#define IGU_CLEANUP_CLEANUP_TYPE_SHIFT 28 +#define IGU_CLEANUP_COMMAND_TYPE_MASK 0x1 +#define IGU_CLEANUP_COMMAND_TYPE_SHIFT 31 + __le32 reserved1; +}; + +/* IGU firmware driver command */ +union igu_command { + struct igu_prod_cons_update prod_cons_update; + struct igu_cleanup cleanup; +}; + +/* IGU firmware driver command */ +struct igu_command_reg_ctrl { + __le16 opaque_fid; + __le16 igu_command_reg_ctrl_fields; +#define IGU_COMMAND_REG_CTRL_PXP_BAR_ADDR_MASK 0xFFF +#define IGU_COMMAND_REG_CTRL_PXP_BAR_ADDR_SHIFT 0 +#define IGU_COMMAND_REG_CTRL_RESERVED_MASK 0x7 +#define IGU_COMMAND_REG_CTRL_RESERVED_SHIFT 12 +#define IGU_COMMAND_REG_CTRL_COMMAND_TYPE_MASK 0x1 +#define IGU_COMMAND_REG_CTRL_COMMAND_TYPE_SHIFT 15 +}; + +/* IGU mapping line structure */ +struct igu_mapping_line { + __le32 igu_mapping_line_fields; +#define IGU_MAPPING_LINE_VALID_MASK 0x1 +#define IGU_MAPPING_LINE_VALID_SHIFT 0 +#define IGU_MAPPING_LINE_VECTOR_NUMBER_MASK 0xFF +#define IGU_MAPPING_LINE_VECTOR_NUMBER_SHIFT 1 +#define IGU_MAPPING_LINE_FUNCTION_NUMBER_MASK 0xFF +#define IGU_MAPPING_LINE_FUNCTION_NUMBER_SHIFT 9 +#define IGU_MAPPING_LINE_PF_VALID_MASK 0x1 +#define IGU_MAPPING_LINE_PF_VALID_SHIFT 17 +#define IGU_MAPPING_LINE_IPS_GROUP_MASK 0x3F +#define IGU_MAPPING_LINE_IPS_GROUP_SHIFT 18 +#define IGU_MAPPING_LINE_RESERVED_MASK 0xFF +#define IGU_MAPPING_LINE_RESERVED_SHIFT 24 +}; + +/* IGU MSIX line structure */ +struct igu_msix_vector { + struct regpair address; + __le32 data; + __le32 msix_vector_fields; +#define IGU_MSIX_VECTOR_MASK_BIT_MASK 0x1 +#define IGU_MSIX_VECTOR_MASK_BIT_SHIFT 0 +#define IGU_MSIX_VECTOR_RESERVED0_MASK 0x7FFF +#define IGU_MSIX_VECTOR_RESERVED0_SHIFT 1 +#define IGU_MSIX_VECTOR_STEERING_TAG_MASK 0xFF +#define IGU_MSIX_VECTOR_STEERING_TAG_SHIFT 16 +#define IGU_MSIX_VECTOR_RESERVED1_MASK 0xFF +#define IGU_MSIX_VECTOR_RESERVED1_SHIFT 24 +}; +/* per encapsulation type enabling flags */ +struct prs_reg_encapsulation_type_en { + u8 flags; +#define PRS_REG_ENCAPSULATION_TYPE_EN_ETH_OVER_GRE_ENABLE_MASK 0x1 +#define PRS_REG_ENCAPSULATION_TYPE_EN_ETH_OVER_GRE_ENABLE_SHIFT 0 +#define PRS_REG_ENCAPSULATION_TYPE_EN_IP_OVER_GRE_ENABLE_MASK 0x1 +#define PRS_REG_ENCAPSULATION_TYPE_EN_IP_OVER_GRE_ENABLE_SHIFT 1 +#define PRS_REG_ENCAPSULATION_TYPE_EN_VXLAN_ENABLE_MASK 0x1 +#define PRS_REG_ENCAPSULATION_TYPE_EN_VXLAN_ENABLE_SHIFT 2 +#define PRS_REG_ENCAPSULATION_TYPE_EN_T_TAG_ENABLE_MASK 0x1 +#define PRS_REG_ENCAPSULATION_TYPE_EN_T_TAG_ENABLE_SHIFT 3 +#define PRS_REG_ENCAPSULATION_TYPE_EN_ETH_OVER_GENEVE_ENABLE_MASK 0x1 +#define PRS_REG_ENCAPSULATION_TYPE_EN_ETH_OVER_GENEVE_ENABLE_SHIFT 4 +#define PRS_REG_ENCAPSULATION_TYPE_EN_IP_OVER_GENEVE_ENABLE_MASK 0x1 +#define PRS_REG_ENCAPSULATION_TYPE_EN_IP_OVER_GENEVE_ENABLE_SHIFT 5 +#define PRS_REG_ENCAPSULATION_TYPE_EN_RESERVED_MASK 0x3 +#define PRS_REG_ENCAPSULATION_TYPE_EN_RESERVED_SHIFT 6 +}; + +enum pxp_tph_st_hint { + TPH_ST_HINT_BIDIR, + TPH_ST_HINT_REQUESTER, + TPH_ST_HINT_TARGET, + TPH_ST_HINT_TARGET_PRIO, + MAX_PXP_TPH_ST_HINT +}; + +/* QM hardware structure of enable bypass credit mask */ +struct qm_rf_bypass_mask { + u8 flags; +#define QM_RF_BYPASS_MASK_LINEVOQ_MASK 0x1 +#define QM_RF_BYPASS_MASK_LINEVOQ_SHIFT 0 +#define 
QM_RF_BYPASS_MASK_RESERVED0_MASK 0x1 +#define QM_RF_BYPASS_MASK_RESERVED0_SHIFT 1 +#define QM_RF_BYPASS_MASK_PFWFQ_MASK 0x1 +#define QM_RF_BYPASS_MASK_PFWFQ_SHIFT 2 +#define QM_RF_BYPASS_MASK_VPWFQ_MASK 0x1 +#define QM_RF_BYPASS_MASK_VPWFQ_SHIFT 3 +#define QM_RF_BYPASS_MASK_PFRL_MASK 0x1 +#define QM_RF_BYPASS_MASK_PFRL_SHIFT 4 +#define QM_RF_BYPASS_MASK_VPQCNRL_MASK 0x1 +#define QM_RF_BYPASS_MASK_VPQCNRL_SHIFT 5 +#define QM_RF_BYPASS_MASK_FWPAUSE_MASK 0x1 +#define QM_RF_BYPASS_MASK_FWPAUSE_SHIFT 6 +#define QM_RF_BYPASS_MASK_RESERVED1_MASK 0x1 +#define QM_RF_BYPASS_MASK_RESERVED1_SHIFT 7 +}; + +/* QM hardware structure of opportunistic credit mask */ +struct qm_rf_opportunistic_mask { + __le16 flags; +#define QM_RF_OPPORTUNISTIC_MASK_LINEVOQ_MASK 0x1 +#define QM_RF_OPPORTUNISTIC_MASK_LINEVOQ_SHIFT 0 +#define QM_RF_OPPORTUNISTIC_MASK_BYTEVOQ_MASK 0x1 +#define QM_RF_OPPORTUNISTIC_MASK_BYTEVOQ_SHIFT 1 +#define QM_RF_OPPORTUNISTIC_MASK_PFWFQ_MASK 0x1 +#define QM_RF_OPPORTUNISTIC_MASK_PFWFQ_SHIFT 2 +#define QM_RF_OPPORTUNISTIC_MASK_VPWFQ_MASK 0x1 +#define QM_RF_OPPORTUNISTIC_MASK_VPWFQ_SHIFT 3 +#define QM_RF_OPPORTUNISTIC_MASK_PFRL_MASK 0x1 +#define QM_RF_OPPORTUNISTIC_MASK_PFRL_SHIFT 4 +#define QM_RF_OPPORTUNISTIC_MASK_VPQCNRL_MASK 0x1 +#define QM_RF_OPPORTUNISTIC_MASK_VPQCNRL_SHIFT 5 +#define QM_RF_OPPORTUNISTIC_MASK_FWPAUSE_MASK 0x1 +#define QM_RF_OPPORTUNISTIC_MASK_FWPAUSE_SHIFT 6 +#define QM_RF_OPPORTUNISTIC_MASK_RESERVED0_MASK 0x1 +#define QM_RF_OPPORTUNISTIC_MASK_RESERVED0_SHIFT 7 +#define QM_RF_OPPORTUNISTIC_MASK_QUEUEEMPTY_MASK 0x1 +#define QM_RF_OPPORTUNISTIC_MASK_QUEUEEMPTY_SHIFT 8 +#define QM_RF_OPPORTUNISTIC_MASK_RESERVED1_MASK 0x7F +#define QM_RF_OPPORTUNISTIC_MASK_RESERVED1_SHIFT 9 +}; + +/* QM hardware structure of QM map memory */ +struct qm_rf_pq_map_e4 { + __le32 reg; +#define QM_RF_PQ_MAP_E4_PQ_VALID_MASK 0x1 +#define QM_RF_PQ_MAP_E4_PQ_VALID_SHIFT 0 +#define QM_RF_PQ_MAP_E4_RL_ID_MASK 0xFF +#define QM_RF_PQ_MAP_E4_RL_ID_SHIFT 1 +#define QM_RF_PQ_MAP_E4_VP_PQ_ID_MASK 0x1FF +#define QM_RF_PQ_MAP_E4_VP_PQ_ID_SHIFT 9 +#define QM_RF_PQ_MAP_E4_VOQ_MASK 0x1F +#define QM_RF_PQ_MAP_E4_VOQ_SHIFT 18 +#define QM_RF_PQ_MAP_E4_WRR_WEIGHT_GROUP_MASK 0x3 +#define QM_RF_PQ_MAP_E4_WRR_WEIGHT_GROUP_SHIFT 23 +#define QM_RF_PQ_MAP_E4_RL_VALID_MASK 0x1 +#define QM_RF_PQ_MAP_E4_RL_VALID_SHIFT 25 +#define QM_RF_PQ_MAP_E4_RESERVED_MASK 0x3F +#define QM_RF_PQ_MAP_E4_RESERVED_SHIFT 26 +}; + +/* Completion params for aggregated interrupt completion */ +struct sdm_agg_int_comp_params { + __le16 params; +#define SDM_AGG_INT_COMP_PARAMS_AGG_INT_INDEX_MASK 0x3F +#define SDM_AGG_INT_COMP_PARAMS_AGG_INT_INDEX_SHIFT 0 +#define SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_ENABLE_MASK 0x1 +#define SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_ENABLE_SHIFT 6 +#define SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_BIT_MASK 0x1FF +#define SDM_AGG_INT_COMP_PARAMS_AGG_VECTOR_BIT_SHIFT 7 +}; + +/* SDM operation gen command (generate aggregative interrupt) */ +struct sdm_op_gen { + __le32 command; +#define SDM_OP_GEN_COMP_PARAM_MASK 0xFFFF +#define SDM_OP_GEN_COMP_PARAM_SHIFT 0 +#define SDM_OP_GEN_COMP_TYPE_MASK 0xF +#define SDM_OP_GEN_COMP_TYPE_SHIFT 16 +#define SDM_OP_GEN_RESERVED_MASK 0xFFF +#define SDM_OP_GEN_RESERVED_SHIFT 20 +}; + +/****************************************/ +/* Debug Tools HSI constants and macros */ +/****************************************/ + +enum block_addr { + GRCBASE_GRC = 0x50000, + GRCBASE_MISCS = 0x9000, + GRCBASE_MISC = 0x8000, + GRCBASE_DBU = 0xa000, + GRCBASE_PGLUE_B = 0x2a8000, + GRCBASE_CNIG = 0x218000, + 
GRCBASE_CPMU = 0x30000, + GRCBASE_NCSI = 0x40000, + GRCBASE_OPTE = 0x53000, + GRCBASE_BMB = 0x540000, + GRCBASE_PCIE = 0x54000, + GRCBASE_MCP = 0xe00000, + GRCBASE_MCP2 = 0x52000, + GRCBASE_PSWHST = 0x2a0000, + GRCBASE_PSWHST2 = 0x29e000, + GRCBASE_PSWRD = 0x29c000, + GRCBASE_PSWRD2 = 0x29d000, + GRCBASE_PSWWR = 0x29a000, + GRCBASE_PSWWR2 = 0x29b000, + GRCBASE_PSWRQ = 0x280000, + GRCBASE_PSWRQ2 = 0x240000, + GRCBASE_PGLCS = 0x0, + GRCBASE_DMAE = 0xc000, + GRCBASE_PTU = 0x560000, + GRCBASE_TCM = 0x1180000, + GRCBASE_MCM = 0x1200000, + GRCBASE_UCM = 0x1280000, + GRCBASE_XCM = 0x1000000, + GRCBASE_YCM = 0x1080000, + GRCBASE_PCM = 0x1100000, + GRCBASE_QM = 0x2f0000, + GRCBASE_TM = 0x2c0000, + GRCBASE_DORQ = 0x100000, + GRCBASE_BRB = 0x340000, + GRCBASE_SRC = 0x238000, + GRCBASE_PRS = 0x1f0000, + GRCBASE_TSDM = 0xfb0000, + GRCBASE_MSDM = 0xfc0000, + GRCBASE_USDM = 0xfd0000, + GRCBASE_XSDM = 0xf80000, + GRCBASE_YSDM = 0xf90000, + GRCBASE_PSDM = 0xfa0000, + GRCBASE_TSEM = 0x1700000, + GRCBASE_MSEM = 0x1800000, + GRCBASE_USEM = 0x1900000, + GRCBASE_XSEM = 0x1400000, + GRCBASE_YSEM = 0x1500000, + GRCBASE_PSEM = 0x1600000, + GRCBASE_RSS = 0x238800, + GRCBASE_TMLD = 0x4d0000, + GRCBASE_MULD = 0x4e0000, + GRCBASE_YULD = 0x4c8000, + GRCBASE_XYLD = 0x4c0000, + GRCBASE_PTLD = 0x5a0000, + GRCBASE_YPLD = 0x5c0000, + GRCBASE_PRM = 0x230000, + GRCBASE_PBF_PB1 = 0xda0000, + GRCBASE_PBF_PB2 = 0xda4000, + GRCBASE_RPB = 0x23c000, + GRCBASE_BTB = 0xdb0000, + GRCBASE_PBF = 0xd80000, + GRCBASE_RDIF = 0x300000, + GRCBASE_TDIF = 0x310000, + GRCBASE_CDU = 0x580000, + GRCBASE_CCFC = 0x2e0000, + GRCBASE_TCFC = 0x2d0000, + GRCBASE_IGU = 0x180000, + GRCBASE_CAU = 0x1c0000, + GRCBASE_RGFS = 0xf00000, + GRCBASE_RGSRC = 0x320000, + GRCBASE_TGFS = 0xd00000, + GRCBASE_TGSRC = 0x322000, + GRCBASE_UMAC = 0x51000, + GRCBASE_XMAC = 0x210000, + GRCBASE_DBG = 0x10000, + GRCBASE_NIG = 0x500000, + GRCBASE_WOL = 0x600000, + GRCBASE_BMBN = 0x610000, + GRCBASE_IPC = 0x20000, + GRCBASE_NWM = 0x800000, + GRCBASE_NWS = 0x700000, + GRCBASE_MS = 0x6a0000, + GRCBASE_PHY_PCIE = 0x620000, + GRCBASE_LED = 0x6b8000, + GRCBASE_AVS_WRAP = 0x6b0000, + GRCBASE_PXPREQBUS = 0x56000, + GRCBASE_MISC_AEU = 0x8000, + GRCBASE_BAR0_MAP = 0x1c00000, + MAX_BLOCK_ADDR +}; + +enum block_id { + BLOCK_GRC, + BLOCK_MISCS, + BLOCK_MISC, + BLOCK_DBU, + BLOCK_PGLUE_B, + BLOCK_CNIG, + BLOCK_CPMU, + BLOCK_NCSI, + BLOCK_OPTE, + BLOCK_BMB, + BLOCK_PCIE, + BLOCK_MCP, + BLOCK_MCP2, + BLOCK_PSWHST, + BLOCK_PSWHST2, + BLOCK_PSWRD, + BLOCK_PSWRD2, + BLOCK_PSWWR, + BLOCK_PSWWR2, + BLOCK_PSWRQ, + BLOCK_PSWRQ2, + BLOCK_PGLCS, + BLOCK_DMAE, + BLOCK_PTU, + BLOCK_TCM, + BLOCK_MCM, + BLOCK_UCM, + BLOCK_XCM, + BLOCK_YCM, + BLOCK_PCM, + BLOCK_QM, + BLOCK_TM, + BLOCK_DORQ, + BLOCK_BRB, + BLOCK_SRC, + BLOCK_PRS, + BLOCK_TSDM, + BLOCK_MSDM, + BLOCK_USDM, + BLOCK_XSDM, + BLOCK_YSDM, + BLOCK_PSDM, + BLOCK_TSEM, + BLOCK_MSEM, + BLOCK_USEM, + BLOCK_XSEM, + BLOCK_YSEM, + BLOCK_PSEM, + BLOCK_RSS, + BLOCK_TMLD, + BLOCK_MULD, + BLOCK_YULD, + BLOCK_XYLD, + BLOCK_PTLD, + BLOCK_YPLD, + BLOCK_PRM, + BLOCK_PBF_PB1, + BLOCK_PBF_PB2, + BLOCK_RPB, + BLOCK_BTB, + BLOCK_PBF, + BLOCK_RDIF, + BLOCK_TDIF, + BLOCK_CDU, + BLOCK_CCFC, + BLOCK_TCFC, + BLOCK_IGU, + BLOCK_CAU, + BLOCK_RGFS, + BLOCK_RGSRC, + BLOCK_TGFS, + BLOCK_TGSRC, + BLOCK_UMAC, + BLOCK_XMAC, + BLOCK_DBG, + BLOCK_NIG, + BLOCK_WOL, + BLOCK_BMBN, + BLOCK_IPC, + BLOCK_NWM, + BLOCK_NWS, + BLOCK_MS, + BLOCK_PHY_PCIE, + BLOCK_LED, + BLOCK_AVS_WRAP, + BLOCK_PXPREQBUS, + BLOCK_MISC_AEU, + BLOCK_BAR0_MAP, + MAX_BLOCK_ID +}; + +/* binary debug buffer types 
*/ +enum bin_dbg_buffer_type { + BIN_BUF_DBG_MODE_TREE, + BIN_BUF_DBG_DUMP_REG, + BIN_BUF_DBG_DUMP_MEM, + BIN_BUF_DBG_IDLE_CHK_REGS, + BIN_BUF_DBG_IDLE_CHK_IMMS, + BIN_BUF_DBG_IDLE_CHK_RULES, + BIN_BUF_DBG_IDLE_CHK_PARSING_DATA, + BIN_BUF_DBG_ATTN_BLOCKS, + BIN_BUF_DBG_ATTN_REGS, + BIN_BUF_DBG_ATTN_INDEXES, + BIN_BUF_DBG_ATTN_NAME_OFFSETS, + BIN_BUF_DBG_BUS_BLOCKS, + BIN_BUF_DBG_BUS_LINES, + BIN_BUF_DBG_BUS_BLOCKS_USER_DATA, + BIN_BUF_DBG_BUS_LINE_NAME_OFFSETS, + BIN_BUF_DBG_PARSING_STRINGS, + MAX_BIN_DBG_BUFFER_TYPE +}; + + +/* Attention bit mapping */ +struct dbg_attn_bit_mapping { + u16 data; +#define DBG_ATTN_BIT_MAPPING_VAL_MASK 0x7FFF +#define DBG_ATTN_BIT_MAPPING_VAL_SHIFT 0 +#define DBG_ATTN_BIT_MAPPING_IS_UNUSED_BIT_CNT_MASK 0x1 +#define DBG_ATTN_BIT_MAPPING_IS_UNUSED_BIT_CNT_SHIFT 15 +}; + +/* Attention block per-type data */ +struct dbg_attn_block_type_data { + u16 names_offset; + u16 reserved1; + u8 num_regs; + u8 reserved2; + u16 regs_offset; + +}; + +/* Block attentions */ +struct dbg_attn_block { + struct dbg_attn_block_type_data per_type_data[2]; +}; + +/* Attention register result */ +struct dbg_attn_reg_result { + u32 data; +#define DBG_ATTN_REG_RESULT_STS_ADDRESS_MASK 0xFFFFFF +#define DBG_ATTN_REG_RESULT_STS_ADDRESS_SHIFT 0 +#define DBG_ATTN_REG_RESULT_NUM_REG_ATTN_MASK 0xFF +#define DBG_ATTN_REG_RESULT_NUM_REG_ATTN_SHIFT 24 + u16 block_attn_offset; + u16 reserved; + u32 sts_val; + u32 mask_val; +}; + +/* Attention block result */ +struct dbg_attn_block_result { + u8 block_id; + u8 data; +#define DBG_ATTN_BLOCK_RESULT_ATTN_TYPE_MASK 0x3 +#define DBG_ATTN_BLOCK_RESULT_ATTN_TYPE_SHIFT 0 +#define DBG_ATTN_BLOCK_RESULT_NUM_REGS_MASK 0x3F +#define DBG_ATTN_BLOCK_RESULT_NUM_REGS_SHIFT 2 + u16 names_offset; + struct dbg_attn_reg_result reg_results[15]; +}; + +/* Mode header */ +struct dbg_mode_hdr { + u16 data; +#define DBG_MODE_HDR_EVAL_MODE_MASK 0x1 +#define DBG_MODE_HDR_EVAL_MODE_SHIFT 0 +#define DBG_MODE_HDR_MODES_BUF_OFFSET_MASK 0x7FFF +#define DBG_MODE_HDR_MODES_BUF_OFFSET_SHIFT 1 +}; + +/* Attention register */ +struct dbg_attn_reg { + struct dbg_mode_hdr mode; + u16 block_attn_offset; + u32 data; +#define DBG_ATTN_REG_STS_ADDRESS_MASK 0xFFFFFF +#define DBG_ATTN_REG_STS_ADDRESS_SHIFT 0 +#define DBG_ATTN_REG_NUM_REG_ATTN_MASK 0xFF +#define DBG_ATTN_REG_NUM_REG_ATTN_SHIFT 24 + u32 sts_clr_address; + u32 mask_address; +}; + +/* Attention types */ +enum dbg_attn_type { + ATTN_TYPE_INTERRUPT, + ATTN_TYPE_PARITY, + MAX_DBG_ATTN_TYPE +}; + +/* Debug Bus block data */ +struct dbg_bus_block { + u8 num_of_lines; + u8 has_latency_events; + u16 lines_offset; +}; + +/* Debug Bus block user data */ +struct dbg_bus_block_user_data { + u8 num_of_lines; + u8 has_latency_events; + u16 names_offset; +}; + +/* Block Debug line data */ +struct dbg_bus_line { + u8 data; +#define DBG_BUS_LINE_NUM_OF_GROUPS_MASK 0xF +#define DBG_BUS_LINE_NUM_OF_GROUPS_SHIFT 0 +#define DBG_BUS_LINE_IS_256B_MASK 0x1 +#define DBG_BUS_LINE_IS_256B_SHIFT 4 +#define DBG_BUS_LINE_RESERVED_MASK 0x7 +#define DBG_BUS_LINE_RESERVED_SHIFT 5 + u8 group_sizes; +}; + +/* Condition header for registers dump */ +struct dbg_dump_cond_hdr { + struct dbg_mode_hdr mode; /* Mode header */ + u8 block_id; /* block ID */ + u8 data_size; /* size in dwords of the data following this header */ +}; + +/* Memory data for registers dump */ +struct dbg_dump_mem { + u32 dword0; +#define DBG_DUMP_MEM_ADDRESS_MASK 0xFFFFFF +#define DBG_DUMP_MEM_ADDRESS_SHIFT 0 +#define DBG_DUMP_MEM_MEM_GROUP_ID_MASK 0xFF +#define DBG_DUMP_MEM_MEM_GROUP_ID_SHIFT 
24 + u32 dword1; +#define DBG_DUMP_MEM_LENGTH_MASK 0xFFFFFF +#define DBG_DUMP_MEM_LENGTH_SHIFT 0 +#define DBG_DUMP_MEM_WIDE_BUS_MASK 0x1 +#define DBG_DUMP_MEM_WIDE_BUS_SHIFT 24 +#define DBG_DUMP_MEM_RESERVED_MASK 0x7F +#define DBG_DUMP_MEM_RESERVED_SHIFT 25 +}; + +/* Register data for registers dump */ +struct dbg_dump_reg { + u32 data; +#define DBG_DUMP_REG_ADDRESS_MASK 0x7FFFFF +#define DBG_DUMP_REG_ADDRESS_SHIFT 0 +#define DBG_DUMP_REG_WIDE_BUS_MASK 0x1 +#define DBG_DUMP_REG_WIDE_BUS_SHIFT 23 +#define DBG_DUMP_REG_LENGTH_MASK 0xFF +#define DBG_DUMP_REG_LENGTH_SHIFT 24 +}; + +/* Split header for registers dump */ +struct dbg_dump_split_hdr { + u32 hdr; +#define DBG_DUMP_SPLIT_HDR_DATA_SIZE_MASK 0xFFFFFF +#define DBG_DUMP_SPLIT_HDR_DATA_SIZE_SHIFT 0 +#define DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID_MASK 0xFF +#define DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID_SHIFT 24 +}; + +/* Condition header for idle check */ +struct dbg_idle_chk_cond_hdr { + struct dbg_mode_hdr mode; /* Mode header */ + u16 data_size; /* size in dwords of the data following this header */ +}; + +/* Idle Check condition register */ +struct dbg_idle_chk_cond_reg { + u32 data; +#define DBG_IDLE_CHK_COND_REG_ADDRESS_MASK 0x7FFFFF +#define DBG_IDLE_CHK_COND_REG_ADDRESS_SHIFT 0 +#define DBG_IDLE_CHK_COND_REG_WIDE_BUS_MASK 0x1 +#define DBG_IDLE_CHK_COND_REG_WIDE_BUS_SHIFT 23 +#define DBG_IDLE_CHK_COND_REG_BLOCK_ID_MASK 0xFF +#define DBG_IDLE_CHK_COND_REG_BLOCK_ID_SHIFT 24 + u16 num_entries; + u8 entry_size; + u8 start_entry; +}; + +/* Idle Check info register */ +struct dbg_idle_chk_info_reg { + u32 data; +#define DBG_IDLE_CHK_INFO_REG_ADDRESS_MASK 0x7FFFFF +#define DBG_IDLE_CHK_INFO_REG_ADDRESS_SHIFT 0 +#define DBG_IDLE_CHK_INFO_REG_WIDE_BUS_MASK 0x1 +#define DBG_IDLE_CHK_INFO_REG_WIDE_BUS_SHIFT 23 +#define DBG_IDLE_CHK_INFO_REG_BLOCK_ID_MASK 0xFF +#define DBG_IDLE_CHK_INFO_REG_BLOCK_ID_SHIFT 24 + u16 size; /* register size in dwords */ + struct dbg_mode_hdr mode; /* Mode header */ +}; + +/* Idle Check register */ +union dbg_idle_chk_reg { + struct dbg_idle_chk_cond_reg cond_reg; /* condition register */ + struct dbg_idle_chk_info_reg info_reg; /* info register */ +}; + +/* Idle Check result header */ +struct dbg_idle_chk_result_hdr { + u16 rule_id; /* Failing rule index */ + u16 mem_entry_id; /* Failing memory entry index */ + u8 num_dumped_cond_regs; /* number of dumped condition registers */ + u8 num_dumped_info_regs; /* number of dumped condition registers */ + u8 severity; /* from dbg_idle_chk_severity_types enum */ + u8 reserved; +}; + +/* Idle Check result register header */ +struct dbg_idle_chk_result_reg_hdr { + u8 data; +#define DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM_MASK 0x1 +#define DBG_IDLE_CHK_RESULT_REG_HDR_IS_MEM_SHIFT 0 +#define DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID_MASK 0x7F +#define DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID_SHIFT 1 + u8 start_entry; /* index of the first checked entry */ + u16 size; /* register size in dwords */ +}; + +/* Idle Check rule */ +struct dbg_idle_chk_rule { + u16 rule_id; /* Idle Check rule ID */ + u8 severity; /* value from dbg_idle_chk_severity_types enum */ + u8 cond_id; /* Condition ID */ + u8 num_cond_regs; /* number of condition registers */ + u8 num_info_regs; /* number of info registers */ + u8 num_imms; /* number of immediates in the condition */ + u8 reserved1; + u16 reg_offset; /* offset of this rules registers in the idle check + * register array (in dbg_idle_chk_reg units). + */ + u16 imm_offset; /* offset of this rules immediate values in the + * immediate values array (in dwords). 
+ */ +}; + +/* Idle Check rule parsing data */ +struct dbg_idle_chk_rule_parsing_data { + u32 data; +#define DBG_IDLE_CHK_RULE_PARSING_DATA_HAS_FW_MSG_MASK 0x1 +#define DBG_IDLE_CHK_RULE_PARSING_DATA_HAS_FW_MSG_SHIFT 0 +#define DBG_IDLE_CHK_RULE_PARSING_DATA_STR_OFFSET_MASK 0x7FFFFFFF +#define DBG_IDLE_CHK_RULE_PARSING_DATA_STR_OFFSET_SHIFT 1 +}; + +/* Idle check severity types */ +enum dbg_idle_chk_severity_types { + /* idle check failure should cause an error */ + IDLE_CHK_SEVERITY_ERROR, + /* idle check failure should cause an error only if theres no traffic */ + IDLE_CHK_SEVERITY_ERROR_NO_TRAFFIC, + /* idle check failure should cause a warning */ + IDLE_CHK_SEVERITY_WARNING, + MAX_DBG_IDLE_CHK_SEVERITY_TYPES +}; + +/* Debug Bus block data */ +struct dbg_bus_block_data { + u16 data; +#define DBG_BUS_BLOCK_DATA_ENABLE_MASK_MASK 0xF +#define DBG_BUS_BLOCK_DATA_ENABLE_MASK_SHIFT 0 +#define DBG_BUS_BLOCK_DATA_RIGHT_SHIFT_MASK 0xF +#define DBG_BUS_BLOCK_DATA_RIGHT_SHIFT_SHIFT 4 +#define DBG_BUS_BLOCK_DATA_FORCE_VALID_MASK_MASK 0xF +#define DBG_BUS_BLOCK_DATA_FORCE_VALID_MASK_SHIFT 8 +#define DBG_BUS_BLOCK_DATA_FORCE_FRAME_MASK_MASK 0xF +#define DBG_BUS_BLOCK_DATA_FORCE_FRAME_MASK_SHIFT 12 + u8 line_num; + u8 hw_id; +}; + +/* Debug Bus Clients */ +enum dbg_bus_clients { + DBG_BUS_CLIENT_RBCN, + DBG_BUS_CLIENT_RBCP, + DBG_BUS_CLIENT_RBCR, + DBG_BUS_CLIENT_RBCT, + DBG_BUS_CLIENT_RBCU, + DBG_BUS_CLIENT_RBCF, + DBG_BUS_CLIENT_RBCX, + DBG_BUS_CLIENT_RBCS, + DBG_BUS_CLIENT_RBCH, + DBG_BUS_CLIENT_RBCZ, + DBG_BUS_CLIENT_OTHER_ENGINE, + DBG_BUS_CLIENT_TIMESTAMP, + DBG_BUS_CLIENT_CPU, + DBG_BUS_CLIENT_RBCY, + DBG_BUS_CLIENT_RBCQ, + DBG_BUS_CLIENT_RBCM, + DBG_BUS_CLIENT_RBCB, + DBG_BUS_CLIENT_RBCW, + DBG_BUS_CLIENT_RBCV, + MAX_DBG_BUS_CLIENTS +}; + +/* Debug Bus constraint operation types */ +enum dbg_bus_constraint_ops { + DBG_BUS_CONSTRAINT_OP_EQ, + DBG_BUS_CONSTRAINT_OP_NE, + DBG_BUS_CONSTRAINT_OP_LT, + DBG_BUS_CONSTRAINT_OP_LTC, + DBG_BUS_CONSTRAINT_OP_LE, + DBG_BUS_CONSTRAINT_OP_LEC, + DBG_BUS_CONSTRAINT_OP_GT, + DBG_BUS_CONSTRAINT_OP_GTC, + DBG_BUS_CONSTRAINT_OP_GE, + DBG_BUS_CONSTRAINT_OP_GEC, + MAX_DBG_BUS_CONSTRAINT_OPS +}; + +/* Debug Bus trigger state data */ +struct dbg_bus_trigger_state_data { + u8 data; +#define DBG_BUS_TRIGGER_STATE_DATA_BLOCK_SHIFTED_ENABLE_MASK_MASK 0xF +#define DBG_BUS_TRIGGER_STATE_DATA_BLOCK_SHIFTED_ENABLE_MASK_SHIFT 0 +#define DBG_BUS_TRIGGER_STATE_DATA_CONSTRAINT_DWORD_MASK_MASK 0xF +#define DBG_BUS_TRIGGER_STATE_DATA_CONSTRAINT_DWORD_MASK_SHIFT 4 +}; + +/* Debug Bus memory address */ +struct dbg_bus_mem_addr { + u32 lo; + u32 hi; +}; + +/* Debug Bus PCI buffer data */ +struct dbg_bus_pci_buf_data { + struct dbg_bus_mem_addr phys_addr; /* PCI buffer physical address */ + struct dbg_bus_mem_addr virt_addr; /* PCI buffer virtual address */ + u32 size; /* PCI buffer size in bytes */ +}; + +/* Debug Bus Storm EID range filter params */ +struct dbg_bus_storm_eid_range_params { + u8 min; /* Minimal event ID to filter on */ + u8 max; /* Maximal event ID to filter on */ +}; + +/* Debug Bus Storm EID mask filter params */ +struct dbg_bus_storm_eid_mask_params { + u8 val; /* Event ID value */ + u8 mask; /* Event ID mask. 1s in the mask = dont care bits. 
*/ +}; + +/* Debug Bus Storm EID filter params */ +union dbg_bus_storm_eid_params { + struct dbg_bus_storm_eid_range_params range; + struct dbg_bus_storm_eid_mask_params mask; +}; + +/* Debug Bus Storm data */ +struct dbg_bus_storm_data { + u8 enabled; + u8 mode; + u8 hw_id; + u8 eid_filter_en; + u8 eid_range_not_mask; + u8 cid_filter_en; + union dbg_bus_storm_eid_params eid_filter_params; + u32 cid; +}; + +/* Debug Bus data */ +struct dbg_bus_data { + u32 app_version; + u8 state; + u8 hw_dwords; + u16 hw_id_mask; + u8 num_enabled_blocks; + u8 num_enabled_storms; + u8 target; + u8 one_shot_en; + u8 grc_input_en; + u8 timestamp_input_en; + u8 filter_en; + u8 adding_filter; + u8 filter_pre_trigger; + u8 filter_post_trigger; + u16 reserved; + u8 trigger_en; + struct dbg_bus_trigger_state_data trigger_states[3]; + u8 next_trigger_state; + u8 next_constraint_id; + u8 unify_inputs; + u8 rcv_from_other_engine; + struct dbg_bus_pci_buf_data pci_buf; + struct dbg_bus_block_data blocks[88]; + struct dbg_bus_storm_data storms[6]; +}; + +/* Debug bus filter types */ +enum dbg_bus_filter_types { + DBG_BUS_FILTER_TYPE_OFF, + DBG_BUS_FILTER_TYPE_PRE, + DBG_BUS_FILTER_TYPE_POST, + DBG_BUS_FILTER_TYPE_ON, + MAX_DBG_BUS_FILTER_TYPES +}; + +/* Debug bus frame modes */ +enum dbg_bus_frame_modes { + DBG_BUS_FRAME_MODE_0HW_4ST = 0, /* 0 HW dwords, 4 Storm dwords */ + DBG_BUS_FRAME_MODE_4HW_0ST = 3, /* 4 HW dwords, 0 Storm dwords */ + DBG_BUS_FRAME_MODE_8HW_0ST = 4, /* 8 HW dwords, 0 Storm dwords */ + MAX_DBG_BUS_FRAME_MODES +}; + +/* Debug bus other engine mode */ +enum dbg_bus_other_engine_modes { + DBG_BUS_OTHER_ENGINE_MODE_NONE, + DBG_BUS_OTHER_ENGINE_MODE_DOUBLE_BW_TX, + DBG_BUS_OTHER_ENGINE_MODE_DOUBLE_BW_RX, + DBG_BUS_OTHER_ENGINE_MODE_CROSS_ENGINE_TX, + DBG_BUS_OTHER_ENGINE_MODE_CROSS_ENGINE_RX, + MAX_DBG_BUS_OTHER_ENGINE_MODES +}; + +/* Debug bus post-trigger recording types */ +enum dbg_bus_post_trigger_types { + DBG_BUS_POST_TRIGGER_RECORD, + DBG_BUS_POST_TRIGGER_DROP, + MAX_DBG_BUS_POST_TRIGGER_TYPES +}; + +/* Debug bus pre-trigger recording types */ +enum dbg_bus_pre_trigger_types { + DBG_BUS_PRE_TRIGGER_START_FROM_ZERO, + DBG_BUS_PRE_TRIGGER_NUM_CHUNKS, + DBG_BUS_PRE_TRIGGER_DROP, + MAX_DBG_BUS_PRE_TRIGGER_TYPES +}; + +/* Debug bus SEMI frame modes */ +enum dbg_bus_semi_frame_modes { + DBG_BUS_SEMI_FRAME_MODE_0SLOW_4FAST = 0, + DBG_BUS_SEMI_FRAME_MODE_4SLOW_0FAST = 3, + MAX_DBG_BUS_SEMI_FRAME_MODES +}; + +/* Debug bus states */ +enum dbg_bus_states { + DBG_BUS_STATE_IDLE, + DBG_BUS_STATE_READY, + DBG_BUS_STATE_RECORDING, + DBG_BUS_STATE_STOPPED, + MAX_DBG_BUS_STATES +}; + +/* Debug Bus Storm modes */ +enum dbg_bus_storm_modes { + DBG_BUS_STORM_MODE_PRINTF, + DBG_BUS_STORM_MODE_PRAM_ADDR, + DBG_BUS_STORM_MODE_DRA_RW, + DBG_BUS_STORM_MODE_DRA_W, + DBG_BUS_STORM_MODE_LD_ST_ADDR, + DBG_BUS_STORM_MODE_DRA_FSM, + DBG_BUS_STORM_MODE_RH, + DBG_BUS_STORM_MODE_FOC, + DBG_BUS_STORM_MODE_EXT_STORE, + MAX_DBG_BUS_STORM_MODES +}; + +/* Debug bus target IDs */ +enum dbg_bus_targets { + DBG_BUS_TARGET_ID_INT_BUF, + DBG_BUS_TARGET_ID_NIG, + DBG_BUS_TARGET_ID_PCI, + MAX_DBG_BUS_TARGETS +}; + +/* GRC Dump data */ +struct dbg_grc_data { + u8 params_initialized; + u8 reserved1; + u16 reserved2; + u32 param_val[48]; +}; + +/* Debug GRC params */ +enum dbg_grc_params { + DBG_GRC_PARAM_DUMP_TSTORM, + DBG_GRC_PARAM_DUMP_MSTORM, + DBG_GRC_PARAM_DUMP_USTORM, + DBG_GRC_PARAM_DUMP_XSTORM, + DBG_GRC_PARAM_DUMP_YSTORM, + DBG_GRC_PARAM_DUMP_PSTORM, + DBG_GRC_PARAM_DUMP_REGS, + DBG_GRC_PARAM_DUMP_RAM, + DBG_GRC_PARAM_DUMP_PBUF, + 
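/* Illustrative note, not part of the original header: param_val[] in
 * struct dbg_grc_data above is indexed by the enum dbg_grc_params being
 * defined here; MAX_DBG_GRC_PARAMS stays below the 48 entries reserved in
 * param_val[]. A hypothetical accessor sketch:
 *
 *	static inline u32 dbg_grc_param_val(const struct dbg_grc_data *grc,
 *					    enum dbg_grc_params param)
 *	{
 *		return grc->param_val[param];
 *	}
 */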
DBG_GRC_PARAM_DUMP_IOR, + DBG_GRC_PARAM_DUMP_VFC, + DBG_GRC_PARAM_DUMP_CM_CTX, + DBG_GRC_PARAM_DUMP_PXP, + DBG_GRC_PARAM_DUMP_RSS, + DBG_GRC_PARAM_DUMP_CAU, + DBG_GRC_PARAM_DUMP_QM, + DBG_GRC_PARAM_DUMP_MCP, + DBG_GRC_PARAM_MCP_TRACE_META_SIZE, + DBG_GRC_PARAM_DUMP_CFC, + DBG_GRC_PARAM_DUMP_IGU, + DBG_GRC_PARAM_DUMP_BRB, + DBG_GRC_PARAM_DUMP_BTB, + DBG_GRC_PARAM_DUMP_BMB, + DBG_GRC_PARAM_DUMP_NIG, + DBG_GRC_PARAM_DUMP_MULD, + DBG_GRC_PARAM_DUMP_PRS, + DBG_GRC_PARAM_DUMP_DMAE, + DBG_GRC_PARAM_DUMP_TM, + DBG_GRC_PARAM_DUMP_SDM, + DBG_GRC_PARAM_DUMP_DIF, + DBG_GRC_PARAM_DUMP_STATIC, + DBG_GRC_PARAM_UNSTALL, + DBG_GRC_PARAM_NUM_LCIDS, + DBG_GRC_PARAM_NUM_LTIDS, + DBG_GRC_PARAM_EXCLUDE_ALL, + DBG_GRC_PARAM_CRASH, + DBG_GRC_PARAM_PARITY_SAFE, + DBG_GRC_PARAM_DUMP_CM, + DBG_GRC_PARAM_DUMP_PHY, + DBG_GRC_PARAM_NO_MCP, + DBG_GRC_PARAM_NO_FW_VER, + MAX_DBG_GRC_PARAMS +}; + +/* Debug reset registers */ +enum dbg_reset_regs { + DBG_RESET_REG_MISCS_PL_UA, + DBG_RESET_REG_MISCS_PL_HV, + DBG_RESET_REG_MISCS_PL_HV_2, + DBG_RESET_REG_MISC_PL_UA, + DBG_RESET_REG_MISC_PL_HV, + DBG_RESET_REG_MISC_PL_PDA_VMAIN_1, + DBG_RESET_REG_MISC_PL_PDA_VMAIN_2, + DBG_RESET_REG_MISC_PL_PDA_VAUX, + MAX_DBG_RESET_REGS +}; + +/* Debug status codes */ +enum dbg_status { + DBG_STATUS_OK, + DBG_STATUS_APP_VERSION_NOT_SET, + DBG_STATUS_UNSUPPORTED_APP_VERSION, + DBG_STATUS_DBG_BLOCK_NOT_RESET, + DBG_STATUS_INVALID_ARGS, + DBG_STATUS_OUTPUT_ALREADY_SET, + DBG_STATUS_INVALID_PCI_BUF_SIZE, + DBG_STATUS_PCI_BUF_ALLOC_FAILED, + DBG_STATUS_PCI_BUF_NOT_ALLOCATED, + DBG_STATUS_TOO_MANY_INPUTS, + DBG_STATUS_INPUT_OVERLAP, + DBG_STATUS_HW_ONLY_RECORDING, + DBG_STATUS_STORM_ALREADY_ENABLED, + DBG_STATUS_STORM_NOT_ENABLED, + DBG_STATUS_BLOCK_ALREADY_ENABLED, + DBG_STATUS_BLOCK_NOT_ENABLED, + DBG_STATUS_NO_INPUT_ENABLED, + DBG_STATUS_NO_FILTER_TRIGGER_64B, + DBG_STATUS_FILTER_ALREADY_ENABLED, + DBG_STATUS_TRIGGER_ALREADY_ENABLED, + DBG_STATUS_TRIGGER_NOT_ENABLED, + DBG_STATUS_CANT_ADD_CONSTRAINT, + DBG_STATUS_TOO_MANY_TRIGGER_STATES, + DBG_STATUS_TOO_MANY_CONSTRAINTS, + DBG_STATUS_RECORDING_NOT_STARTED, + DBG_STATUS_DATA_DIDNT_TRIGGER, + DBG_STATUS_NO_DATA_RECORDED, + DBG_STATUS_DUMP_BUF_TOO_SMALL, + DBG_STATUS_DUMP_NOT_CHUNK_ALIGNED, + DBG_STATUS_UNKNOWN_CHIP, + DBG_STATUS_VIRT_MEM_ALLOC_FAILED, + DBG_STATUS_BLOCK_IN_RESET, + DBG_STATUS_INVALID_TRACE_SIGNATURE, + DBG_STATUS_INVALID_NVRAM_BUNDLE, + DBG_STATUS_NVRAM_GET_IMAGE_FAILED, + DBG_STATUS_NON_ALIGNED_NVRAM_IMAGE, + DBG_STATUS_NVRAM_READ_FAILED, + DBG_STATUS_IDLE_CHK_PARSE_FAILED, + DBG_STATUS_MCP_TRACE_BAD_DATA, + DBG_STATUS_MCP_TRACE_NO_META, + DBG_STATUS_MCP_COULD_NOT_HALT, + DBG_STATUS_MCP_COULD_NOT_RESUME, + DBG_STATUS_RESERVED2, + DBG_STATUS_SEMI_FIFO_NOT_EMPTY, + DBG_STATUS_IGU_FIFO_BAD_DATA, + DBG_STATUS_MCP_COULD_NOT_MASK_PRTY, + DBG_STATUS_FW_ASSERTS_PARSE_FAILED, + DBG_STATUS_REG_FIFO_BAD_DATA, + DBG_STATUS_PROTECTION_OVERRIDE_BAD_DATA, + DBG_STATUS_DBG_ARRAY_NOT_SET, + DBG_STATUS_FILTER_BUG, + DBG_STATUS_NON_MATCHING_LINES, + DBG_STATUS_INVALID_TRIGGER_DWORD_OFFSET, + DBG_STATUS_DBG_BUS_IN_USE, + MAX_DBG_STATUS +}; + +/* Debug Storms IDs */ +enum dbg_storms { + DBG_TSTORM_ID, + DBG_MSTORM_ID, + DBG_USTORM_ID, + DBG_XSTORM_ID, + DBG_YSTORM_ID, + DBG_PSTORM_ID, + MAX_DBG_STORMS +}; + +/* Idle Check data */ +struct idle_chk_data { + u32 buf_size; + u8 buf_size_set; + u8 reserved1; + u16 reserved2; +}; + +/* Debug Tools data (per HW function) */ +struct dbg_tools_data { + struct dbg_grc_data grc; + struct dbg_bus_data bus; + struct idle_chk_data idle_chk; + u8 mode_enable[40]; + 
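/* Illustrative note, not part of the original header: the two 88-entry
 * arrays held by struct dbg_tools_data (blocks[] above, inside
 * struct dbg_bus_data, and block_in_reset[] below) are sized to
 * MAX_BLOCK_ID, i.e. the 88 blocks enumerated in enum block_id, and are
 * indexed by block ID. */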
u8 block_in_reset[88]; + u8 chip_id; + u8 platform_id; + u8 initialized; + u8 use_dmae; + u32 num_regs_read; +}; + +/********************************/ +/* HSI Init Functions constants */ +/********************************/ + +/* Number of VLAN priorities */ +#define NUM_OF_VLAN_PRIORITIES 8 + +/* BRB RAM init requirements */ +struct init_brb_ram_req { + u32 guranteed_per_tc; + u32 headroom_per_tc; + u32 min_pkt_size; + u32 max_ports_per_engine; + u8 num_active_tcs[MAX_NUM_PORTS]; +}; + +/* ETS per-TC init requirements */ +struct init_ets_tc_req { + u8 use_sp; + u8 use_wfq; + u16 weight; +}; + +/* ETS init requirements */ +struct init_ets_req { + u32 mtu; + struct init_ets_tc_req tc_req[NUM_OF_TCS]; +}; + +/* NIG LB RL init requirements */ +struct init_nig_lb_rl_req { + u16 lb_mac_rate; + u16 lb_rate; + u32 mtu; + u16 tc_rate[NUM_OF_PHYS_TCS]; +}; + +/* NIG TC mapping for each priority */ +struct init_nig_pri_tc_map_entry { + u8 tc_id; + u8 valid; +}; + +/* NIG priority to TC map init requirements */ +struct init_nig_pri_tc_map_req { + struct init_nig_pri_tc_map_entry pri[NUM_OF_VLAN_PRIORITIES]; +}; + +/* QM per-port init parameters */ +struct init_qm_port_params { + u8 active; + u8 active_phys_tcs; + u16 num_pbf_cmd_lines; + u16 num_btb_blocks; + u16 reserved; +}; + +/* QM per-PQ init parameters */ +struct init_qm_pq_params { + u8 vport_id; + u8 tc_id; + u8 wrr_group; + u8 rl_valid; + u8 port_id; + u8 reserved0; + u16 reserved1; +}; + +/* QM per-vport init parameters */ +struct init_qm_vport_params { + u32 vport_rl; + u16 vport_wfq; + u16 first_tx_pq_id[NUM_OF_TCS]; +}; + +/**************************************/ +/* Init Tool HSI constants and macros */ +/**************************************/ + +/* Width of GRC address in bits (addresses are specified in dwords) */ +#define GRC_ADDR_BITS 23 +#define MAX_GRC_ADDR (BIT(GRC_ADDR_BITS) - 1) + +/* indicates an init that should be applied to any phase ID */ +#define ANY_PHASE_ID 0xffff + +/* Max size in dwords of a zipped array */ +#define MAX_ZIPPED_SIZE 8192 +enum chip_ids { + CHIP_BB, + CHIP_K2, + CHIP_RESERVED, + MAX_CHIP_IDS +}; + +struct fw_asserts_ram_section { + u16 section_ram_line_offset; + u16 section_ram_line_size; + u8 list_dword_offset; + u8 list_element_dword_size; + u8 list_num_elements; + u8 list_next_index_dword_offset; +}; + +struct fw_ver_num { + u8 major; + u8 minor; + u8 rev; + u8 eng; +}; + +struct fw_ver_info { + __le16 tools_ver; + u8 image_id; + u8 reserved1; + struct fw_ver_num num; + __le32 timestamp; + __le32 reserved2; +}; + +struct fw_info { + struct fw_ver_info ver; + struct fw_asserts_ram_section fw_asserts_section; +}; + +struct fw_info_location { + __le32 grc_addr; + __le32 size; +}; + +enum init_modes { + MODE_RESERVED, + MODE_BB, + MODE_K2, + MODE_ASIC, + MODE_RESERVED2, + MODE_RESERVED3, + MODE_RESERVED4, + MODE_RESERVED5, + MODE_SF, + MODE_MF_SD, + MODE_MF_SI, + MODE_PORTS_PER_ENG_1, + MODE_PORTS_PER_ENG_2, + MODE_PORTS_PER_ENG_4, + MODE_100G, + MODE_RESERVED6, + MAX_INIT_MODES +}; + +enum init_phases { + PHASE_ENGINE, + PHASE_PORT, + PHASE_PF, + PHASE_VF, + PHASE_QM_PF, + MAX_INIT_PHASES +}; + +enum init_split_types { + SPLIT_TYPE_NONE, + SPLIT_TYPE_PORT, + SPLIT_TYPE_PF, + SPLIT_TYPE_PORT_PF, + SPLIT_TYPE_VF, + MAX_INIT_SPLIT_TYPES +}; + +/* Binary buffer header */ +struct bin_buffer_hdr { + u32 offset; + u32 length; +}; + +/* Binary init buffer types */ +enum bin_init_buffer_type { + BIN_BUF_INIT_FW_VER_INFO, + BIN_BUF_INIT_CMD, + BIN_BUF_INIT_VAL, + BIN_BUF_INIT_MODE_TREE, + BIN_BUF_INIT_IRO, + 
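/* Illustrative sketch, not part of the original header: the enum being
 * defined here indexes a table of struct bin_buffer_hdr entries, each giving
 * the offset and length of one section of the firmware data image. Assuming
 * the conventional layout where that header table sits at the start of the
 * image (fw_data below is a hypothetical pointer to it):
 *
 *	const struct bin_buffer_hdr *hdrs =
 *		(const struct bin_buffer_hdr *)fw_data;
 *	const u8 *init_val = fw_data + hdrs[BIN_BUF_INIT_VAL].offset;
 *	u32 init_val_len = hdrs[BIN_BUF_INIT_VAL].length;
 */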
MAX_BIN_INIT_BUFFER_TYPE +}; + +/* init array header: raw */ +struct init_array_raw_hdr { + u32 data; +#define INIT_ARRAY_RAW_HDR_TYPE_MASK 0xF +#define INIT_ARRAY_RAW_HDR_TYPE_SHIFT 0 +#define INIT_ARRAY_RAW_HDR_PARAMS_MASK 0xFFFFFFF +#define INIT_ARRAY_RAW_HDR_PARAMS_SHIFT 4 +}; + +/* init array header: standard */ +struct init_array_standard_hdr { + u32 data; +#define INIT_ARRAY_STANDARD_HDR_TYPE_MASK 0xF +#define INIT_ARRAY_STANDARD_HDR_TYPE_SHIFT 0 +#define INIT_ARRAY_STANDARD_HDR_SIZE_MASK 0xFFFFFFF +#define INIT_ARRAY_STANDARD_HDR_SIZE_SHIFT 4 +}; + +/* init array header: zipped */ +struct init_array_zipped_hdr { + u32 data; +#define INIT_ARRAY_ZIPPED_HDR_TYPE_MASK 0xF +#define INIT_ARRAY_ZIPPED_HDR_TYPE_SHIFT 0 +#define INIT_ARRAY_ZIPPED_HDR_ZIPPED_SIZE_MASK 0xFFFFFFF +#define INIT_ARRAY_ZIPPED_HDR_ZIPPED_SIZE_SHIFT 4 +}; + +/* init array header: pattern */ +struct init_array_pattern_hdr { + u32 data; +#define INIT_ARRAY_PATTERN_HDR_TYPE_MASK 0xF +#define INIT_ARRAY_PATTERN_HDR_TYPE_SHIFT 0 +#define INIT_ARRAY_PATTERN_HDR_PATTERN_SIZE_MASK 0xF +#define INIT_ARRAY_PATTERN_HDR_PATTERN_SIZE_SHIFT 4 +#define INIT_ARRAY_PATTERN_HDR_REPETITIONS_MASK 0xFFFFFF +#define INIT_ARRAY_PATTERN_HDR_REPETITIONS_SHIFT 8 +}; + +/* init array header union */ +union init_array_hdr { + struct init_array_raw_hdr raw; + struct init_array_standard_hdr standard; + struct init_array_zipped_hdr zipped; + struct init_array_pattern_hdr pattern; +}; + +/* init array types */ +enum init_array_types { + INIT_ARR_STANDARD, + INIT_ARR_ZIPPED, + INIT_ARR_PATTERN, + MAX_INIT_ARRAY_TYPES +}; + +/* init operation: callback */ +struct init_callback_op { + u32 op_data; +#define INIT_CALLBACK_OP_OP_MASK 0xF +#define INIT_CALLBACK_OP_OP_SHIFT 0 +#define INIT_CALLBACK_OP_RESERVED_MASK 0xFFFFFFF +#define INIT_CALLBACK_OP_RESERVED_SHIFT 4 + u16 callback_id; + u16 block_id; +}; + +/* init operation: delay */ +struct init_delay_op { + u32 op_data; +#define INIT_DELAY_OP_OP_MASK 0xF +#define INIT_DELAY_OP_OP_SHIFT 0 +#define INIT_DELAY_OP_RESERVED_MASK 0xFFFFFFF +#define INIT_DELAY_OP_RESERVED_SHIFT 4 + u32 delay; +}; + +/* init operation: if_mode */ +struct init_if_mode_op { + u32 op_data; +#define INIT_IF_MODE_OP_OP_MASK 0xF +#define INIT_IF_MODE_OP_OP_SHIFT 0 +#define INIT_IF_MODE_OP_RESERVED1_MASK 0xFFF +#define INIT_IF_MODE_OP_RESERVED1_SHIFT 4 +#define INIT_IF_MODE_OP_CMD_OFFSET_MASK 0xFFFF +#define INIT_IF_MODE_OP_CMD_OFFSET_SHIFT 16 + u16 reserved2; + u16 modes_buf_offset; +}; + +/* init operation: if_phase */ +struct init_if_phase_op { + u32 op_data; +#define INIT_IF_PHASE_OP_OP_MASK 0xF +#define INIT_IF_PHASE_OP_OP_SHIFT 0 +#define INIT_IF_PHASE_OP_DMAE_ENABLE_MASK 0x1 +#define INIT_IF_PHASE_OP_DMAE_ENABLE_SHIFT 4 +#define INIT_IF_PHASE_OP_RESERVED1_MASK 0x7FF +#define INIT_IF_PHASE_OP_RESERVED1_SHIFT 5 +#define INIT_IF_PHASE_OP_CMD_OFFSET_MASK 0xFFFF +#define INIT_IF_PHASE_OP_CMD_OFFSET_SHIFT 16 + u32 phase_data; +#define INIT_IF_PHASE_OP_PHASE_MASK 0xFF +#define INIT_IF_PHASE_OP_PHASE_SHIFT 0 +#define INIT_IF_PHASE_OP_RESERVED2_MASK 0xFF +#define INIT_IF_PHASE_OP_RESERVED2_SHIFT 8 +#define INIT_IF_PHASE_OP_PHASE_ID_MASK 0xFFFF +#define INIT_IF_PHASE_OP_PHASE_ID_SHIFT 16 +}; + +/* init mode operators */ +enum init_mode_ops { + INIT_MODE_OP_NOT, + INIT_MODE_OP_OR, + INIT_MODE_OP_AND, + MAX_INIT_MODE_OPS +}; + +/* init operation: raw */ +struct init_raw_op { + u32 op_data; +#define INIT_RAW_OP_OP_MASK 0xF +#define INIT_RAW_OP_OP_SHIFT 0 +#define INIT_RAW_OP_PARAM1_MASK 0xFFFFFFF +#define INIT_RAW_OP_PARAM1_SHIFT 4 + u32 
param2; +}; + +/* init array params */ +struct init_op_array_params { + u16 size; + u16 offset; +}; + +/* Write init operation arguments */ +union init_write_args { + u32 inline_val; + u32 zeros_count; + u32 array_offset; + struct init_op_array_params runtime; +}; + +/* init operation: write */ +struct init_write_op { + u32 data; +#define INIT_WRITE_OP_OP_MASK 0xF +#define INIT_WRITE_OP_OP_SHIFT 0 +#define INIT_WRITE_OP_SOURCE_MASK 0x7 +#define INIT_WRITE_OP_SOURCE_SHIFT 4 +#define INIT_WRITE_OP_RESERVED_MASK 0x1 +#define INIT_WRITE_OP_RESERVED_SHIFT 7 +#define INIT_WRITE_OP_WIDE_BUS_MASK 0x1 +#define INIT_WRITE_OP_WIDE_BUS_SHIFT 8 +#define INIT_WRITE_OP_ADDRESS_MASK 0x7FFFFF +#define INIT_WRITE_OP_ADDRESS_SHIFT 9 + union init_write_args args; +}; + +/* init operation: read */ +struct init_read_op { + u32 op_data; +#define INIT_READ_OP_OP_MASK 0xF +#define INIT_READ_OP_OP_SHIFT 0 +#define INIT_READ_OP_POLL_TYPE_MASK 0xF +#define INIT_READ_OP_POLL_TYPE_SHIFT 4 +#define INIT_READ_OP_RESERVED_MASK 0x1 +#define INIT_READ_OP_RESERVED_SHIFT 8 +#define INIT_READ_OP_ADDRESS_MASK 0x7FFFFF +#define INIT_READ_OP_ADDRESS_SHIFT 9 + u32 expected_val; +}; + +/* Init operations union */ +union init_op { + struct init_raw_op raw; + struct init_write_op write; + struct init_read_op read; + struct init_if_mode_op if_mode; + struct init_if_phase_op if_phase; + struct init_callback_op callback; + struct init_delay_op delay; +}; + +/* Init command operation types */ +enum init_op_types { + INIT_OP_READ, + INIT_OP_WRITE, + INIT_OP_IF_MODE, + INIT_OP_IF_PHASE, + INIT_OP_DELAY, + INIT_OP_CALLBACK, + MAX_INIT_OP_TYPES +}; + +/* init polling types */ +enum init_poll_types { + INIT_POLL_NONE, + INIT_POLL_EQ, + INIT_POLL_OR, + INIT_POLL_AND, + MAX_INIT_POLL_TYPES +}; + +/* init source types */ +enum init_source_types { + INIT_SRC_INLINE, + INIT_SRC_ZEROS, + INIT_SRC_ARRAY, + INIT_SRC_RUNTIME, + MAX_INIT_SOURCE_TYPES +}; + +/* Internal RAM Offsets macro data */ +struct iro { + u32 base; + u16 m1; + u16 m2; + u16 m3; + u16 size; +}; + +/***************************** Public Functions *******************************/ + +/** + * @brief qed_dbg_set_bin_ptr - Sets a pointer to the binary data with debug + * arrays. + * + * @param bin_ptr - a pointer to the binary data with debug arrays. + */ +enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr); + +/** + * @brief qed_read_regs - Reads registers into a buffer (using GRC). + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf - Destination buffer. + * @param addr - Source GRC address in dwords. + * @param len - Number of registers to read. + */ +void qed_read_regs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *buf, u32 addr, u32 len); + +/** + * @brief qed_dbg_grc_set_params_default - Reverts all GRC parameters to their + * default value. + * + * @param p_hwfn - HW device data + */ +void qed_dbg_grc_set_params_default(struct qed_hwfn *p_hwfn); +/** + * @brief qed_dbg_grc_get_dump_buf_size - Returns the required buffer size for + * GRC Dump. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for the GRC Dump + * data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. 
+ */ +enum dbg_status qed_dbg_grc_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); + +/** + * @brief qed_dbg_grc_dump - Dumps GRC data into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the collected GRC data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified dump buffer is too small + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_grc_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); + +/** + * @brief qed_dbg_idle_chk_get_dump_buf_size - Returns the required buffer size + * for idle check results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for the idle check + * data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_idle_chk_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); + +/** + * @brief qed_dbg_idle_chk_dump - Performs idle check and writes the results + * into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the idle check data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_idle_chk_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); + +/** + * @brief qed_dbg_mcp_trace_get_dump_buf_size - Returns the required buffer size + * for mcp trace results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for mcp trace data. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the trace data in MCP scratchpad contain an invalid signature + * - the bundle ID in NVRAM is invalid + * - the trace meta data cannot be found (in NVRAM or image file) + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_mcp_trace_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); + +/** + * @brief qed_dbg_mcp_trace_dump - Performs mcp trace and writes the results + * into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the mcp trace data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. 
+ * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - the trace data in MCP scratchpad contain an invalid signature + * - the bundle ID in NVRAM is invalid + * - the trace meta data cannot be found (in NVRAM or image file) + * - the trace meta data cannot be read (from NVRAM or image file) + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_mcp_trace_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); + +/** + * @brief qed_dbg_reg_fifo_get_dump_buf_size - Returns the required buffer size + * for grc trace fifo results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for reg fifo data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_reg_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); + +/** + * @brief qed_dbg_reg_fifo_dump - Reads the reg fifo and writes the results into + * the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the reg fifo data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - DMAE transaction failed + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_reg_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); + +/** + * @brief qed_dbg_igu_fifo_get_dump_buf_size - Returns the required buffer size + * for the IGU fifo results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for the IGU fifo + * data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_igu_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); + +/** + * @brief qed_dbg_igu_fifo_dump - Reads the IGU fifo and writes the results into + * the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the IGU fifo data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - DMAE transaction failed + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_igu_fifo_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); + +/** + * @brief qed_dbg_protection_override_get_dump_buf_size - Returns the required + * buffer size for protection override window results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for protection + * override data. 
+ * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status +qed_dbg_protection_override_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_protection_override_dump - Reads protection override window + * entries and writes the results into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the protection override data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * - DMAE transaction failed + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_protection_override_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); +/** + * @brief qed_dbg_fw_asserts_get_dump_buf_size - Returns the required buffer + * size for FW Asserts results. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param buf_size - OUT: required buffer size (in dwords) for FW Asserts data. + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_fw_asserts_get_dump_buf_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *buf_size); +/** + * @brief qed_dbg_fw_asserts_dump - Reads the FW Asserts and writes the results + * into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param dump_buf - Pointer to write the FW Asserts data into. + * @param buf_size_in_dwords - Size of the specified buffer in dwords. + * @param num_dumped_dwords - OUT: number of dumped dwords. + * + * @return error if one of the following holds: + * - the version wasn't set + * - the specified buffer is too small + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_fw_asserts_dump(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, + u32 buf_size_in_dwords, + u32 *num_dumped_dwords); + +/** + * @brief qed_dbg_read_attn - Reads the attention registers of the specified + * block and type, and writes the results into the specified buffer. + * + * @param p_hwfn - HW device data + * @param p_ptt - Ptt window used for writing the registers. + * @param block - Block ID. + * @param attn_type - Attention type. + * @param clear_status - Indicates if the attention status should be cleared. + * @param results - OUT: Pointer to write the read results into + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. + */ +enum dbg_status qed_dbg_read_attn(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum block_id block, + enum dbg_attn_type attn_type, + bool clear_status, + struct dbg_attn_block_result *results); + +/** + * @brief qed_dbg_print_attn - Prints attention registers values in the + * specified results struct. + * + * @param p_hwfn + * @param results - Pointer to the attention read results + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. 
+ */ +enum dbg_status qed_dbg_print_attn(struct qed_hwfn *p_hwfn, + struct dbg_attn_block_result *results); + +/******************************** Constants **********************************/ + +#define MAX_NAME_LEN 16 + +/***************************** Public Functions *******************************/ + +/** + * @brief qed_dbg_user_set_bin_ptr - Sets a pointer to the binary data with + * debug arrays. + * + * @param bin_ptr - a pointer to the binary data with debug arrays. + */ +enum dbg_status qed_dbg_user_set_bin_ptr(const u8 * const bin_ptr); + +/** + * @brief qed_dbg_get_status_str - Returns a string for the specified status. + * + * @param status - a debug status code. + * + * @return a string for the specified status + */ +const char *qed_dbg_get_status_str(enum dbg_status status); + +/** + * @brief qed_get_idle_chk_results_buf_size - Returns the required buffer size + * for idle check results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - idle check dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_idle_chk_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); +/** + * @brief qed_print_idle_chk_results - Prints idle check results + * + * @param p_hwfn - HW device data + * @param dump_buf - idle check dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the idle check results. + * @param num_errors - OUT: number of errors found in idle check. + * @param num_warnings - OUT: number of warnings found in idle check. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_idle_chk_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf, + u32 *num_errors, + u32 *num_warnings); + +/** + * @brief qed_dbg_mcp_trace_set_meta_data - Sets a pointer to the MCP Trace + * meta data. + * + * Needed in case the MCP Trace dump doesn't contain the meta data (e.g. due to + * no NVRAM access). + * + * @param data - pointer to MCP Trace meta data + * @param size - size of MCP Trace meta data in dwords + */ +void qed_dbg_mcp_trace_set_meta_data(u32 *data, u32 size); + +/** + * @brief qed_get_mcp_trace_results_buf_size - Returns the required buffer size + * for MCP Trace results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - MCP Trace dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_mcp_trace_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); + +/** + * @brief qed_print_mcp_trace_results - Prints MCP Trace results + * + * @param p_hwfn - HW device data + * @param dump_buf - mcp trace dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the mcp trace results. + * + * @return error if the parsing fails, ok otherwise. 
+ */ +enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); + +/** + * @brief print_mcp_trace_line - Prints MCP Trace results for a single line + * + * @param dump_buf - mcp trace dump buffer, starting from the header. + * @param num_dumped_bytes - number of bytes that were dumped. + * @param results_buf - buffer for printing the mcp trace results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_mcp_trace_line(u8 *dump_buf, + u32 num_dumped_bytes, + char *results_buf); + +/** + * @brief qed_get_reg_fifo_results_buf_size - Returns the required buffer size + * for reg_fifo results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - reg fifo dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_reg_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); + +/** + * @brief qed_print_reg_fifo_results - Prints reg fifo results + * + * @param p_hwfn - HW device data + * @param dump_buf - reg fifo dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the reg fifo results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_reg_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); + +/** + * @brief qed_get_igu_fifo_results_buf_size - Returns the required buffer size + * for igu_fifo results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - IGU fifo dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_igu_fifo_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); + +/** + * @brief qed_print_igu_fifo_results - Prints IGU fifo results + * + * @param p_hwfn - HW device data + * @param dump_buf - IGU fifo dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the IGU fifo results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_igu_fifo_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); + +/** + * @brief qed_get_protection_override_results_buf_size - Returns the required + * buffer size for protection override results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - protection override dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status +qed_get_protection_override_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); + +/** + * @brief qed_print_protection_override_results - Prints protection override + * results. 
+ * + * @param p_hwfn - HW device data + * @param dump_buf - protection override dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the reg fifo results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_protection_override_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); + +/** + * @brief qed_get_fw_asserts_results_buf_size - Returns the required buffer size + * for FW Asserts results (in bytes). + * + * @param p_hwfn - HW device data + * @param dump_buf - FW Asserts dump buffer. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf_size - OUT: required buffer size (in bytes) for the parsed + * results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_get_fw_asserts_results_buf_size(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + u32 *results_buf_size); + +/** + * @brief qed_print_fw_asserts_results - Prints FW Asserts results + * + * @param p_hwfn - HW device data + * @param dump_buf - FW Asserts dump buffer, starting from the header. + * @param num_dumped_dwords - number of dwords that were dumped. + * @param results_buf - buffer for printing the FW Asserts results. + * + * @return error if the parsing fails, ok otherwise. + */ +enum dbg_status qed_print_fw_asserts_results(struct qed_hwfn *p_hwfn, + u32 *dump_buf, + u32 num_dumped_dwords, + char *results_buf); + +/** + * @brief qed_dbg_parse_attn - Parses and prints attention registers values in + * the specified results struct. + * + * @param p_hwfn - HW device data + * @param results - Pointer to the attention read results + * + * @return error if one of the following holds: + * - the version wasn't set + * Otherwise, returns ok. 
+ */ +enum dbg_status qed_dbg_parse_attn(struct qed_hwfn *p_hwfn, + struct dbg_attn_block_result *results); + +/* Debug Bus blocks */ +static const u32 dbg_bus_blocks[] = { + 0x0000000f, /* grc, bb, 15 lines */ + 0x0000000f, /* grc, k2, 15 lines */ + 0x00000000, + 0x00000000, /* miscs, bb, 0 lines */ + 0x00000000, /* miscs, k2, 0 lines */ + 0x00000000, + 0x00000000, /* misc, bb, 0 lines */ + 0x00000000, /* misc, k2, 0 lines */ + 0x00000000, + 0x00000000, /* dbu, bb, 0 lines */ + 0x00000000, /* dbu, k2, 0 lines */ + 0x00000000, + 0x000f0127, /* pglue_b, bb, 39 lines */ + 0x0036012a, /* pglue_b, k2, 42 lines */ + 0x00000000, + 0x00000000, /* cnig, bb, 0 lines */ + 0x00120102, /* cnig, k2, 2 lines */ + 0x00000000, + 0x00000000, /* cpmu, bb, 0 lines */ + 0x00000000, /* cpmu, k2, 0 lines */ + 0x00000000, + 0x00000001, /* ncsi, bb, 1 lines */ + 0x00000001, /* ncsi, k2, 1 lines */ + 0x00000000, + 0x00000000, /* opte, bb, 0 lines */ + 0x00000000, /* opte, k2, 0 lines */ + 0x00000000, + 0x00600085, /* bmb, bb, 133 lines */ + 0x00600085, /* bmb, k2, 133 lines */ + 0x00000000, + 0x00000000, /* pcie, bb, 0 lines */ + 0x00e50033, /* pcie, k2, 51 lines */ + 0x00000000, + 0x00000000, /* mcp, bb, 0 lines */ + 0x00000000, /* mcp, k2, 0 lines */ + 0x00000000, + 0x01180009, /* mcp2, bb, 9 lines */ + 0x01180009, /* mcp2, k2, 9 lines */ + 0x00000000, + 0x01210104, /* pswhst, bb, 4 lines */ + 0x01210104, /* pswhst, k2, 4 lines */ + 0x00000000, + 0x01250103, /* pswhst2, bb, 3 lines */ + 0x01250103, /* pswhst2, k2, 3 lines */ + 0x00000000, + 0x00340101, /* pswrd, bb, 1 lines */ + 0x00340101, /* pswrd, k2, 1 lines */ + 0x00000000, + 0x01280119, /* pswrd2, bb, 25 lines */ + 0x01280119, /* pswrd2, k2, 25 lines */ + 0x00000000, + 0x01410109, /* pswwr, bb, 9 lines */ + 0x01410109, /* pswwr, k2, 9 lines */ + 0x00000000, + 0x00000000, /* pswwr2, bb, 0 lines */ + 0x00000000, /* pswwr2, k2, 0 lines */ + 0x00000000, + 0x001c0001, /* pswrq, bb, 1 lines */ + 0x001c0001, /* pswrq, k2, 1 lines */ + 0x00000000, + 0x014a0015, /* pswrq2, bb, 21 lines */ + 0x014a0015, /* pswrq2, k2, 21 lines */ + 0x00000000, + 0x00000000, /* pglcs, bb, 0 lines */ + 0x00120006, /* pglcs, k2, 6 lines */ + 0x00000000, + 0x00100001, /* dmae, bb, 1 lines */ + 0x00100001, /* dmae, k2, 1 lines */ + 0x00000000, + 0x015f0105, /* ptu, bb, 5 lines */ + 0x015f0105, /* ptu, k2, 5 lines */ + 0x00000000, + 0x01640120, /* tcm, bb, 32 lines */ + 0x01640120, /* tcm, k2, 32 lines */ + 0x00000000, + 0x01640120, /* mcm, bb, 32 lines */ + 0x01640120, /* mcm, k2, 32 lines */ + 0x00000000, + 0x01640120, /* ucm, bb, 32 lines */ + 0x01640120, /* ucm, k2, 32 lines */ + 0x00000000, + 0x01640120, /* xcm, bb, 32 lines */ + 0x01640120, /* xcm, k2, 32 lines */ + 0x00000000, + 0x01640120, /* ycm, bb, 32 lines */ + 0x01640120, /* ycm, k2, 32 lines */ + 0x00000000, + 0x01640120, /* pcm, bb, 32 lines */ + 0x01640120, /* pcm, k2, 32 lines */ + 0x00000000, + 0x01840062, /* qm, bb, 98 lines */ + 0x01840062, /* qm, k2, 98 lines */ + 0x00000000, + 0x01e60021, /* tm, bb, 33 lines */ + 0x01e60021, /* tm, k2, 33 lines */ + 0x00000000, + 0x02070107, /* dorq, bb, 7 lines */ + 0x02070107, /* dorq, k2, 7 lines */ + 0x00000000, + 0x00600185, /* brb, bb, 133 lines */ + 0x00600185, /* brb, k2, 133 lines */ + 0x00000000, + 0x020e0019, /* src, bb, 25 lines */ + 0x020c001a, /* src, k2, 26 lines */ + 0x00000000, + 0x02270104, /* prs, bb, 4 lines */ + 0x02270104, /* prs, k2, 4 lines */ + 0x00000000, + 0x022b0133, /* tsdm, bb, 51 lines */ + 0x022b0133, /* tsdm, k2, 51 lines */ + 0x00000000, + 
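/* Illustrative note, not part of the original header: dbg_bus_blocks[] holds
 * one group of three dwords per enum block_id entry, one dword per
 * enum chip_ids value (BB, K2, and a reserved slot). Each dword appears to
 * pack the fields of struct dbg_bus_block: bits 0-7 num_of_lines, bits 8-15
 * has_latency_events, bits 16-31 lines_offset. For example, the pglue_b/bb
 * entry 0x000f0127 decodes to 0x27 = 39 lines (matching its comment) at
 * lines_offset 0x000f. A hypothetical decode helper:
 *
 *	static inline u8 dbg_bus_block_num_lines(enum block_id blk,
 *						 enum chip_ids chip)
 *	{
 *		return dbg_bus_blocks[blk * MAX_CHIP_IDS + chip] & 0xff;
 *	}
 */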
0x022b0133, /* msdm, bb, 51 lines */ + 0x022b0133, /* msdm, k2, 51 lines */ + 0x00000000, + 0x022b0133, /* usdm, bb, 51 lines */ + 0x022b0133, /* usdm, k2, 51 lines */ + 0x00000000, + 0x022b0133, /* xsdm, bb, 51 lines */ + 0x022b0133, /* xsdm, k2, 51 lines */ + 0x00000000, + 0x022b0133, /* ysdm, bb, 51 lines */ + 0x022b0133, /* ysdm, k2, 51 lines */ + 0x00000000, + 0x022b0133, /* psdm, bb, 51 lines */ + 0x022b0133, /* psdm, k2, 51 lines */ + 0x00000000, + 0x025e010c, /* tsem, bb, 12 lines */ + 0x025e010c, /* tsem, k2, 12 lines */ + 0x00000000, + 0x025e010c, /* msem, bb, 12 lines */ + 0x025e010c, /* msem, k2, 12 lines */ + 0x00000000, + 0x025e010c, /* usem, bb, 12 lines */ + 0x025e010c, /* usem, k2, 12 lines */ + 0x00000000, + 0x025e010c, /* xsem, bb, 12 lines */ + 0x025e010c, /* xsem, k2, 12 lines */ + 0x00000000, + 0x025e010c, /* ysem, bb, 12 lines */ + 0x025e010c, /* ysem, k2, 12 lines */ + 0x00000000, + 0x025e010c, /* psem, bb, 12 lines */ + 0x025e010c, /* psem, k2, 12 lines */ + 0x00000000, + 0x026a000d, /* rss, bb, 13 lines */ + 0x026a000d, /* rss, k2, 13 lines */ + 0x00000000, + 0x02770106, /* tmld, bb, 6 lines */ + 0x02770106, /* tmld, k2, 6 lines */ + 0x00000000, + 0x027d0106, /* muld, bb, 6 lines */ + 0x027d0106, /* muld, k2, 6 lines */ + 0x00000000, + 0x02770005, /* yuld, bb, 5 lines */ + 0x02770005, /* yuld, k2, 5 lines */ + 0x00000000, + 0x02830107, /* xyld, bb, 7 lines */ + 0x027d0107, /* xyld, k2, 7 lines */ + 0x00000000, + 0x00000000, /* ptld, bb, 0 lines */ + 0x00000000, /* ptld, k2, 0 lines */ + 0x00000000, + 0x00000000, /* ypld, bb, 0 lines */ + 0x00000000, /* ypld, k2, 0 lines */ + 0x00000000, + 0x028a010e, /* prm, bb, 14 lines */ + 0x02980110, /* prm, k2, 16 lines */ + 0x00000000, + 0x02a8000d, /* pbf_pb1, bb, 13 lines */ + 0x02a8000d, /* pbf_pb1, k2, 13 lines */ + 0x00000000, + 0x02a8000d, /* pbf_pb2, bb, 13 lines */ + 0x02a8000d, /* pbf_pb2, k2, 13 lines */ + 0x00000000, + 0x02a8000d, /* rpb, bb, 13 lines */ + 0x02a8000d, /* rpb, k2, 13 lines */ + 0x00000000, + 0x00600185, /* btb, bb, 133 lines */ + 0x00600185, /* btb, k2, 133 lines */ + 0x00000000, + 0x02b50117, /* pbf, bb, 23 lines */ + 0x02b50117, /* pbf, k2, 23 lines */ + 0x00000000, + 0x02cc0006, /* rdif, bb, 6 lines */ + 0x02cc0006, /* rdif, k2, 6 lines */ + 0x00000000, + 0x02d20006, /* tdif, bb, 6 lines */ + 0x02d20006, /* tdif, k2, 6 lines */ + 0x00000000, + 0x02d80003, /* cdu, bb, 3 lines */ + 0x02db000e, /* cdu, k2, 14 lines */ + 0x00000000, + 0x02e9010d, /* ccfc, bb, 13 lines */ + 0x02f60117, /* ccfc, k2, 23 lines */ + 0x00000000, + 0x02e9010d, /* tcfc, bb, 13 lines */ + 0x02f60117, /* tcfc, k2, 23 lines */ + 0x00000000, + 0x030d0133, /* igu, bb, 51 lines */ + 0x030d0133, /* igu, k2, 51 lines */ + 0x00000000, + 0x03400106, /* cau, bb, 6 lines */ + 0x03400106, /* cau, k2, 6 lines */ + 0x00000000, + 0x00000000, /* rgfs, bb, 0 lines */ + 0x00000000, /* rgfs, k2, 0 lines */ + 0x00000000, + 0x00000000, /* rgsrc, bb, 0 lines */ + 0x00000000, /* rgsrc, k2, 0 lines */ + 0x00000000, + 0x00000000, /* tgfs, bb, 0 lines */ + 0x00000000, /* tgfs, k2, 0 lines */ + 0x00000000, + 0x00000000, /* tgsrc, bb, 0 lines */ + 0x00000000, /* tgsrc, k2, 0 lines */ + 0x00000000, + 0x00000000, /* umac, bb, 0 lines */ + 0x00120006, /* umac, k2, 6 lines */ + 0x00000000, + 0x00000000, /* xmac, bb, 0 lines */ + 0x00000000, /* xmac, k2, 0 lines */ + 0x00000000, + 0x00000000, /* dbg, bb, 0 lines */ + 0x00000000, /* dbg, k2, 0 lines */ + 0x00000000, + 0x0346012b, /* nig, bb, 43 lines */ + 0x0346011d, /* nig, k2, 29 lines */ + 0x00000000, + 
0x00000000, /* wol, bb, 0 lines */ + 0x001c0002, /* wol, k2, 2 lines */ + 0x00000000, + 0x00000000, /* bmbn, bb, 0 lines */ + 0x00210008, /* bmbn, k2, 8 lines */ + 0x00000000, + 0x00000000, /* ipc, bb, 0 lines */ + 0x00000000, /* ipc, k2, 0 lines */ + 0x00000000, + 0x00000000, /* nwm, bb, 0 lines */ + 0x0371000b, /* nwm, k2, 11 lines */ + 0x00000000, + 0x00000000, /* nws, bb, 0 lines */ + 0x037c0009, /* nws, k2, 9 lines */ + 0x00000000, + 0x00000000, /* ms, bb, 0 lines */ + 0x00120004, /* ms, k2, 4 lines */ + 0x00000000, + 0x00000000, /* phy_pcie, bb, 0 lines */ + 0x00e5001a, /* phy_pcie, k2, 26 lines */ + 0x00000000, + 0x00000000, /* led, bb, 0 lines */ + 0x00000000, /* led, k2, 0 lines */ + 0x00000000, + 0x00000000, /* avs_wrap, bb, 0 lines */ + 0x00000000, /* avs_wrap, k2, 0 lines */ + 0x00000000, + 0x00000000, /* bar0_map, bb, 0 lines */ + 0x00000000, /* bar0_map, k2, 0 lines */ + 0x00000000, + 0x00000000, /* bar0_map, bb, 0 lines */ + 0x00000000, /* bar0_map, k2, 0 lines */ + 0x00000000, +}; + +/* Win 2 */ +#define GTT_BAR0_MAP_REG_IGU_CMD 0x00f000UL + +/* Win 3 */ +#define GTT_BAR0_MAP_REG_TSDM_RAM 0x010000UL + +/* Win 4 */ +#define GTT_BAR0_MAP_REG_MSDM_RAM 0x011000UL + +/* Win 5 */ +#define GTT_BAR0_MAP_REG_MSDM_RAM_1024 0x012000UL + +/* Win 6 */ +#define GTT_BAR0_MAP_REG_USDM_RAM 0x013000UL + +/* Win 7 */ +#define GTT_BAR0_MAP_REG_USDM_RAM_1024 0x014000UL + +/* Win 8 */ +#define GTT_BAR0_MAP_REG_USDM_RAM_2048 0x015000UL + +/* Win 9 */ +#define GTT_BAR0_MAP_REG_XSDM_RAM 0x016000UL + +/* Win 10 */ +#define GTT_BAR0_MAP_REG_YSDM_RAM 0x017000UL + +/* Win 11 */ +#define GTT_BAR0_MAP_REG_PSDM_RAM 0x018000UL + +/** + * @brief qed_qm_pf_mem_size - prepare QM ILT sizes + * + * Returns the required host memory size in 4KB units. + * Must be called before all QM init HSI functions. + * + * @param num_pf_cids - number of connections used by this PF + * @param num_vf_cids - number of connections used by VFs of this PF + * @param num_tids - number of tasks used by this PF + * @param num_pf_pqs - number of PQs used by this PF + * @param num_vf_pqs - number of PQs used by VFs of this PF + * + * @return The required host memory size in 4KB units. + */ +u32 qed_qm_pf_mem_size(u32 num_pf_cids, + u32 num_vf_cids, + u32 num_tids, u16 num_pf_pqs, u16 num_vf_pqs); + +struct qed_qm_common_rt_init_params { + u8 max_ports_per_engine; + u8 max_phys_tcs_per_port; + bool pf_rl_en; + bool pf_wfq_en; + bool vport_rl_en; + bool vport_wfq_en; + struct init_qm_port_params *port_params; +}; + +int qed_qm_common_rt_init(struct qed_hwfn *p_hwfn, + struct qed_qm_common_rt_init_params *p_params); + +struct qed_qm_pf_rt_init_params { + u8 port_id; + u8 pf_id; + u8 max_phys_tcs_per_port; + bool is_pf_loading; + u32 num_pf_cids; + u32 num_vf_cids; + u32 num_tids; + u16 start_pq; + u16 num_pf_pqs; + u16 num_vf_pqs; + u8 start_vport; + u8 num_vports; + u16 pf_wfq; + u32 pf_rl; + u32 link_speed; + struct init_qm_pq_params *pq_params; + struct init_qm_vport_params *vport_params; +}; + +int qed_qm_pf_rt_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_qm_pf_rt_init_params *p_params); + +/** + * @brief qed_init_pf_wfq - Initializes the WFQ weight of the specified PF + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers + * @param pf_id - PF ID + * @param pf_wfq - WFQ weight. Must be non-zero. + * + * @return 0 on success, -1 on error. 
+ */ +int qed_init_pf_wfq(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 pf_id, u16 pf_wfq); + +/** + * @brief qed_init_pf_rl - Initializes the rate limit of the specified PF + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers + * @param pf_id - PF ID + * @param pf_rl - rate limit in Mb/sec units + * + * @return 0 on success, -1 on error. + */ +int qed_init_pf_rl(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 pf_id, u32 pf_rl); + +/** + * @brief qed_init_vport_wfq - Initializes the WFQ weight of the specified VPORT + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers + * @param first_tx_pq_id - An array containing the first Tx PQ ID associated + * with the VPORT for each TC. This array is filled by + * qed_qm_pf_rt_init + * @param vport_wfq - WFQ weight. Must be non-zero. + * + * @return 0 on success, -1 on error. + */ +int qed_init_vport_wfq(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 first_tx_pq_id[NUM_OF_TCS], u16 vport_wfq); + +/** + * @brief qed_init_vport_rl - Initializes the rate limit of the specified VPORT + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers + * @param vport_id - VPORT ID + * @param vport_rl - rate limit in Mb/sec units + * @param link_speed - link speed in Mbps. + * + * @return 0 on success, -1 on error. + */ +int qed_init_vport_rl(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u8 vport_id, u32 vport_rl, u32 link_speed); + +/** + * @brief qed_send_qm_stop_cmd - Sends a stop command to the QM + * + * @param p_hwfn + * @param p_ptt + * @param is_release_cmd - true for release, false for stop. + * @param is_tx_pq - true for Tx PQs, false for Other PQs. + * @param start_pq - first PQ ID to stop + * @param num_pqs - Number of PQs to stop, starting from start_pq. + * + * @return bool, true if successful, false if a timeout occurred while waiting + * for the QM command to complete. + */ +bool qed_send_qm_stop_cmd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + bool is_release_cmd, + bool is_tx_pq, u16 start_pq, u16 num_pqs); + +/** + * @brief qed_set_vxlan_dest_port - initializes vxlan tunnel destination udp port + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers. + * @param dest_port - vxlan destination udp port. + */ +void qed_set_vxlan_dest_port(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 dest_port); + +/** + * @brief qed_set_vxlan_enable - enable or disable VXLAN tunnel in HW + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers. + * @param vxlan_enable - vxlan enable flag. + */ +void qed_set_vxlan_enable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool vxlan_enable); + +/** + * @brief qed_set_gre_enable - enable or disable GRE tunnel in HW + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers. + * @param eth_gre_enable - eth GRE enable flag. + * @param ip_gre_enable - IP GRE enable flag. + */ +void qed_set_gre_enable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + bool eth_gre_enable, bool ip_gre_enable); + +/** + * @brief qed_set_geneve_dest_port - initializes geneve tunnel destination udp port + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers. + * @param dest_port - geneve destination udp port.
+ */ +void qed_set_geneve_dest_port(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 dest_port); + +/** + * @brief qed_set_geneve_enable - enable or disable GENEVE tunnel in HW + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers. + * @param eth_geneve_enable - eth GENEVE enable flag. + * @param ip_geneve_enable - IP GENEVE enable flag. + */ +void qed_set_geneve_enable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + bool eth_geneve_enable, bool ip_geneve_enable); + +void qed_set_vxlan_no_l2_enable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool enable); + +/** + * @brief qed_gft_disable - Disable GFT + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers. + * @param pf_id - PF on which to disable GFT. + */ +void qed_gft_disable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u16 pf_id); + +/** + * @brief qed_gft_config - Enable and configure HW for GFT + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers. + * @param pf_id - PF on which to enable GFT. + * @param tcp - set profile for TCP packets. + * @param udp - set profile for UDP packets. + * @param ipv4 - set profile for IPv4 packets. + * @param ipv6 - set profile for IPv6 packets. + * @param profile_type - defines which packet header fields are matched. Use + * enum gft_profile_type. + */ +void qed_gft_config(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 pf_id, + bool tcp, + bool udp, + bool ipv4, bool ipv6, enum gft_profile_type profile_type); + +/** + * @brief qed_enable_context_validation - Enable and configure context + * validation. + * + * @param p_hwfn + * @param p_ptt - ptt window used for writing the registers. + */ +void qed_enable_context_validation(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief qed_calc_session_ctx_validation - Calculate the validation byte for a + * session context. + * + * @param p_ctx_mem - pointer to context memory. + * @param ctx_size - context size. + * @param ctx_type - context type. + * @param cid - context cid. + */ +void qed_calc_session_ctx_validation(void *p_ctx_mem, + u16 ctx_size, u8 ctx_type, u32 cid); + +/** + * @brief qed_calc_task_ctx_validation - Calculate the validation byte for a task + * context. + * + * @param p_ctx_mem - pointer to context memory. + * @param ctx_size - context size. + * @param ctx_type - context type. + * @param tid - context tid. + */ +void qed_calc_task_ctx_validation(void *p_ctx_mem, + u16 ctx_size, u8 ctx_type, u32 tid); + +/** + * @brief qed_memset_session_ctx - Memset session context to 0 while + * preserving validation bytes. + * + * @param p_ctx_mem - pointer to context memory. + * @param ctx_size - size to initialize. + * @param ctx_type - context type. + */ +void qed_memset_session_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type); + +/** + * @brief qed_memset_task_ctx - Memset task context to 0 while preserving + * validation bytes. + * + * @param p_ctx_mem - pointer to context memory. + * @param ctx_size - size to initialize. + * @param ctx_type - context type. + */ +void qed_memset_task_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type); + +/* Ystorm flow control mode.
Use enum fw_flow_ctrl_mode */ +#define YSTORM_FLOW_CONTROL_MODE_OFFSET (IRO[0].base) +#define YSTORM_FLOW_CONTROL_MODE_SIZE (IRO[0].size) + +/* Tstorm port statistics */ +#define TSTORM_PORT_STAT_OFFSET(port_id) \ + (IRO[1].base + ((port_id) * IRO[1].m1)) +#define TSTORM_PORT_STAT_SIZE (IRO[1].size) + +/* Tstorm ll2 port statistics */ +#define TSTORM_LL2_PORT_STAT_OFFSET(port_id) \ + (IRO[2].base + ((port_id) * IRO[2].m1)) +#define TSTORM_LL2_PORT_STAT_SIZE (IRO[2].size) + +/* Ustorm VF-PF Channel ready flag */ +#define USTORM_VF_PF_CHANNEL_READY_OFFSET(vf_id) \ + (IRO[3].base + ((vf_id) * IRO[3].m1)) +#define USTORM_VF_PF_CHANNEL_READY_SIZE (IRO[3].size) + +/* Ustorm Final flr cleanup ack */ +#define USTORM_FLR_FINAL_ACK_OFFSET(pf_id) \ + (IRO[4].base + ((pf_id) * IRO[4].m1)) +#define USTORM_FLR_FINAL_ACK_SIZE (IRO[4].size) + +/* Ustorm Event ring consumer */ +#define USTORM_EQE_CONS_OFFSET(pf_id) \ + (IRO[5].base + ((pf_id) * IRO[5].m1)) +#define USTORM_EQE_CONS_SIZE (IRO[5].size) + +/* Ustorm eth queue zone */ +#define USTORM_ETH_QUEUE_ZONE_OFFSET(queue_zone_id) \ + (IRO[6].base + ((queue_zone_id) * IRO[6].m1)) +#define USTORM_ETH_QUEUE_ZONE_SIZE (IRO[6].size) + +/* Ustorm Common Queue ring consumer */ +#define USTORM_COMMON_QUEUE_CONS_OFFSET(queue_zone_id) \ + (IRO[7].base + ((queue_zone_id) * IRO[7].m1)) +#define USTORM_COMMON_QUEUE_CONS_SIZE (IRO[7].size) + +/* Xstorm Integration Test Data */ +#define XSTORM_INTEG_TEST_DATA_OFFSET (IRO[8].base) +#define XSTORM_INTEG_TEST_DATA_SIZE (IRO[8].size) + +/* Ystorm Integration Test Data */ +#define YSTORM_INTEG_TEST_DATA_OFFSET (IRO[9].base) +#define YSTORM_INTEG_TEST_DATA_SIZE (IRO[9].size) + +/* Pstorm Integration Test Data */ +#define PSTORM_INTEG_TEST_DATA_OFFSET (IRO[10].base) +#define PSTORM_INTEG_TEST_DATA_SIZE (IRO[10].size) + +/* Tstorm Integration Test Data */ +#define TSTORM_INTEG_TEST_DATA_OFFSET (IRO[11].base) +#define TSTORM_INTEG_TEST_DATA_SIZE (IRO[11].size) + +/* Mstorm Integration Test Data */ +#define MSTORM_INTEG_TEST_DATA_OFFSET (IRO[12].base) +#define MSTORM_INTEG_TEST_DATA_SIZE (IRO[12].size) + +/* Ustorm Integration Test Data */ +#define USTORM_INTEG_TEST_DATA_OFFSET (IRO[13].base) +#define USTORM_INTEG_TEST_DATA_SIZE (IRO[13].size) + +/* Tstorm producers */ +#define TSTORM_LL2_RX_PRODS_OFFSET(core_rx_queue_id) \ + (IRO[14].base + ((core_rx_queue_id) * IRO[14].m1)) +#define TSTORM_LL2_RX_PRODS_SIZE (IRO[14].size) + +/* Tstorm LightL2 queue statistics */ +#define CORE_LL2_TSTORM_PER_QUEUE_STAT_OFFSET(core_rx_queue_id) \ + (IRO[15].base + ((core_rx_queue_id) * IRO[15].m1)) +#define CORE_LL2_TSTORM_PER_QUEUE_STAT_SIZE (IRO[15].size) + +/* Ustorm LiteL2 queue statistics */ +#define CORE_LL2_USTORM_PER_QUEUE_STAT_OFFSET(core_rx_queue_id) \ + (IRO[16].base + ((core_rx_queue_id) * IRO[16].m1)) +#define CORE_LL2_USTORM_PER_QUEUE_STAT_SIZE (IRO[16].size) + +/* Pstorm LiteL2 queue statistics */ +#define CORE_LL2_PSTORM_PER_QUEUE_STAT_OFFSET(core_tx_stats_id) \ + (IRO[17].base + ((core_tx_stats_id) * IRO[17].m1)) +#define CORE_LL2_PSTORM_PER_QUEUE_STAT_SIZE (IRO[17].size) + +/* Mstorm queue statistics */ +#define MSTORM_QUEUE_STAT_OFFSET(stat_counter_id) \ + (IRO[18].base + ((stat_counter_id) * IRO[18].m1)) +#define MSTORM_QUEUE_STAT_SIZE (IRO[18].size) + +/* Mstorm ETH PF queues producers */ +#define MSTORM_ETH_PF_PRODS_OFFSET(queue_id) \ + (IRO[19].base + ((queue_id) * IRO[19].m1)) +#define MSTORM_ETH_PF_PRODS_SIZE (IRO[19].size) + +/* Mstorm ETH VF queues producers offset in RAM. Used in default VF zone size + * mode. 
+ */ +#define MSTORM_ETH_VF_PRODS_OFFSET(vf_id, vf_queue_id) \ + (IRO[20].base + ((vf_id) * IRO[20].m1) + ((vf_queue_id) * IRO[20].m2)) +#define MSTORM_ETH_VF_PRODS_SIZE (IRO[20].size) + +/* TPA aggregation timeout in us resolution (on ASIC) */ +#define MSTORM_TPA_TIMEOUT_US_OFFSET (IRO[21].base) +#define MSTORM_TPA_TIMEOUT_US_SIZE (IRO[21].size) + +/* Mstorm pf statistics */ +#define MSTORM_ETH_PF_STAT_OFFSET(pf_id) \ + (IRO[22].base + ((pf_id) * IRO[22].m1)) +#define MSTORM_ETH_PF_STAT_SIZE (IRO[22].size) + +/* Ustorm queue statistics */ +#define USTORM_QUEUE_STAT_OFFSET(stat_counter_id) \ + (IRO[23].base + ((stat_counter_id) * IRO[23].m1)) +#define USTORM_QUEUE_STAT_SIZE (IRO[23].size) + +/* Ustorm pf statistics */ +#define USTORM_ETH_PF_STAT_OFFSET(pf_id) \ + (IRO[24].base + ((pf_id) * IRO[24].m1)) +#define USTORM_ETH_PF_STAT_SIZE (IRO[24].size) + +/* Pstorm queue statistics */ +#define PSTORM_QUEUE_STAT_OFFSET(stat_counter_id) \ + (IRO[25].base + ((stat_counter_id) * IRO[25].m1)) +#define PSTORM_QUEUE_STAT_SIZE (IRO[25].size) + +/* Pstorm pf statistics */ +#define PSTORM_ETH_PF_STAT_OFFSET(pf_id) \ + (IRO[26].base + ((pf_id) * IRO[26].m1)) +#define PSTORM_ETH_PF_STAT_SIZE (IRO[26].size) + +/* Control frame's EthType configuration for TX control frame security */ +#define PSTORM_CTL_FRAME_ETHTYPE_OFFSET(eth_type_id) \ + (IRO[27].base + ((eth_type_id) * IRO[27].m1)) +#define PSTORM_CTL_FRAME_ETHTYPE_SIZE (IRO[27].size) + +/* Tstorm last parser message */ +#define TSTORM_ETH_PRS_INPUT_OFFSET (IRO[28].base) +#define TSTORM_ETH_PRS_INPUT_SIZE (IRO[28].size) + +/* Tstorm Eth limit Rx rate */ +#define ETH_RX_RATE_LIMIT_OFFSET(pf_id) \ + (IRO[29].base + ((pf_id) * IRO[29].m1)) +#define ETH_RX_RATE_LIMIT_SIZE (IRO[29].size) + +/* Xstorm queue zone */ +#define XSTORM_ETH_QUEUE_ZONE_OFFSET(queue_id) \ + (IRO[30].base + ((queue_id) * IRO[30].m1)) +#define XSTORM_ETH_QUEUE_ZONE_SIZE (IRO[30].size) + +/* Ystorm cqe producer */ +#define YSTORM_TOE_CQ_PROD_OFFSET(rss_id) \ + (IRO[31].base + ((rss_id) * IRO[31].m1)) +#define YSTORM_TOE_CQ_PROD_SIZE (IRO[31].size) + +/* Ustorm cqe producer */ +#define USTORM_TOE_CQ_PROD_OFFSET(rss_id) \ + (IRO[32].base + ((rss_id) * IRO[32].m1)) +#define USTORM_TOE_CQ_PROD_SIZE (IRO[32].size) + +/* Ustorm grq producer */ +#define USTORM_TOE_GRQ_PROD_OFFSET(pf_id) \ + (IRO[33].base + ((pf_id) * IRO[33].m1)) +#define USTORM_TOE_GRQ_PROD_SIZE (IRO[33].size) + +/* Tstorm cmdq-cons of given command queue-id */ +#define TSTORM_SCSI_CMDQ_CONS_OFFSET(cmdq_queue_id) \ + (IRO[34].base + ((cmdq_queue_id) * IRO[34].m1)) +#define TSTORM_SCSI_CMDQ_CONS_SIZE (IRO[34].size) + +/* Tstorm (reflects M-Storm) bdq-external-producer of given function ID, + * BDqueue-id.
+ */ +#define TSTORM_SCSI_BDQ_EXT_PROD_OFFSET(func_id, bdq_id) \ + (IRO[35].base + ((func_id) * IRO[35].m1) + ((bdq_id) * IRO[35].m2)) +#define TSTORM_SCSI_BDQ_EXT_PROD_SIZE (IRO[35].size) + +/* Mstorm bdq-external-producer of given BDQ resource ID, BDqueue-id */ +#define MSTORM_SCSI_BDQ_EXT_PROD_OFFSET(func_id, bdq_id) \ + (IRO[36].base + ((func_id) * IRO[36].m1) + ((bdq_id) * IRO[36].m2)) +#define MSTORM_SCSI_BDQ_EXT_PROD_SIZE (IRO[36].size) + +/* Tstorm iSCSI RX stats */ +#define TSTORM_ISCSI_RX_STATS_OFFSET(pf_id) \ + (IRO[37].base + ((pf_id) * IRO[37].m1)) +#define TSTORM_ISCSI_RX_STATS_SIZE (IRO[37].size) + +/* Mstorm iSCSI RX stats */ +#define MSTORM_ISCSI_RX_STATS_OFFSET(pf_id) \ + (IRO[38].base + ((pf_id) * IRO[38].m1)) +#define MSTORM_ISCSI_RX_STATS_SIZE (IRO[38].size) + +/* Ustorm iSCSI RX stats */ +#define USTORM_ISCSI_RX_STATS_OFFSET(pf_id) \ + (IRO[39].base + ((pf_id) * IRO[39].m1)) +#define USTORM_ISCSI_RX_STATS_SIZE (IRO[39].size) + +/* Xstorm iSCSI TX stats */ +#define XSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \ + (IRO[40].base + ((pf_id) * IRO[40].m1)) +#define XSTORM_ISCSI_TX_STATS_SIZE (IRO[40].size) + +/* Ystorm iSCSI TX stats */ +#define YSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \ + (IRO[41].base + ((pf_id) * IRO[41].m1)) +#define YSTORM_ISCSI_TX_STATS_SIZE (IRO[41].size) + +/* Pstorm iSCSI TX stats */ +#define PSTORM_ISCSI_TX_STATS_OFFSET(pf_id) \ + (IRO[42].base + ((pf_id) * IRO[42].m1)) +#define PSTORM_ISCSI_TX_STATS_SIZE (IRO[42].size) + +/* Tstorm FCoE RX stats */ +#define TSTORM_FCOE_RX_STATS_OFFSET(pf_id) \ + (IRO[43].base + ((pf_id) * IRO[43].m1)) +#define TSTORM_FCOE_RX_STATS_SIZE (IRO[43].size) + +/* Pstorm FCoE TX stats */ +#define PSTORM_FCOE_TX_STATS_OFFSET(pf_id) \ + (IRO[44].base + ((pf_id) * IRO[44].m1)) +#define PSTORM_FCOE_TX_STATS_SIZE (IRO[44].size) + +/* Pstorm RDMA queue statistics */ +#define PSTORM_RDMA_QUEUE_STAT_OFFSET(rdma_stat_counter_id) \ + (IRO[45].base + ((rdma_stat_counter_id) * IRO[45].m1)) +#define PSTORM_RDMA_QUEUE_STAT_SIZE (IRO[45].size) + +/* Tstorm RDMA queue statistics */ +#define TSTORM_RDMA_QUEUE_STAT_OFFSET(rdma_stat_counter_id) \ + (IRO[46].base + ((rdma_stat_counter_id) * IRO[46].m1)) +#define TSTORM_RDMA_QUEUE_STAT_SIZE (IRO[46].size) + +/* Xstorm iWARP rxmit stats */ +#define XSTORM_IWARP_RXMIT_STATS_OFFSET(pf_id) \ + (IRO[47].base + ((pf_id) * IRO[47].m1)) +#define XSTORM_IWARP_RXMIT_STATS_SIZE (IRO[47].size) + +/* Tstorm RoCE Event Statistics */ +#define TSTORM_ROCE_EVENTS_STAT_OFFSET(roce_pf_id) \ + (IRO[48].base + ((roce_pf_id) * IRO[48].m1)) +#define TSTORM_ROCE_EVENTS_STAT_SIZE (IRO[48].size) + +/* DCQCN Received Statistics */ +#define YSTORM_ROCE_DCQCN_RECEIVED_STATS_OFFSET(roce_pf_id) \ + (IRO[49].base + ((roce_pf_id) * IRO[49].m1)) +#define YSTORM_ROCE_DCQCN_RECEIVED_STATS_SIZE (IRO[49].size) + +/* DCQCN Sent Statistics */ +#define PSTORM_ROCE_DCQCN_SENT_STATS_OFFSET(roce_pf_id) \ + (IRO[50].base + ((roce_pf_id) * IRO[50].m1)) +#define PSTORM_ROCE_DCQCN_SENT_STATS_SIZE (IRO[50].size) + +static const struct iro iro_arr[51] = { + {0x0, 0x0, 0x0, 0x0, 0x8}, + {0x4cb8, 0x88, 0x0, 0x0, 0x88}, + {0x6530, 0x20, 0x0, 0x0, 0x20}, + {0xb00, 0x8, 0x0, 0x0, 0x4}, + {0xa80, 0x8, 0x0, 0x0, 0x4}, + {0x0, 0x8, 0x0, 0x0, 0x2}, + {0x80, 0x8, 0x0, 0x0, 0x4}, + {0x84, 0x8, 0x0, 0x0, 0x2}, + {0x4c48, 0x0, 0x0, 0x0, 0x78}, + {0x3e38, 0x0, 0x0, 0x0, 0x78}, + {0x2b78, 0x0, 0x0, 0x0, 0x78}, + {0x4c40, 0x0, 0x0, 0x0, 0x78}, + {0x4998, 0x0, 0x0, 0x0, 0x78}, + {0x7f50, 0x0, 0x0, 0x0, 0x78}, + {0xa28, 0x8, 0x0, 0x0, 0x8}, + {0x6210, 0x10, 0x0, 0x0, 
0x10}, + {0xb820, 0x30, 0x0, 0x0, 0x30}, + {0x96c0, 0x30, 0x0, 0x0, 0x30}, + {0x4b68, 0x80, 0x0, 0x0, 0x40}, + {0x1f8, 0x4, 0x0, 0x0, 0x4}, + {0x53a8, 0x80, 0x4, 0x0, 0x4}, + {0xc7d0, 0x0, 0x0, 0x0, 0x4}, + {0x4ba8, 0x80, 0x0, 0x0, 0x20}, + {0x8158, 0x40, 0x0, 0x0, 0x30}, + {0xe770, 0x60, 0x0, 0x0, 0x60}, + {0x2d10, 0x80, 0x0, 0x0, 0x38}, + {0xf2b8, 0x78, 0x0, 0x0, 0x78}, + {0x1f8, 0x4, 0x0, 0x0, 0x4}, + {0xaf20, 0x0, 0x0, 0x0, 0xf0}, + {0xb010, 0x8, 0x0, 0x0, 0x8}, + {0x1f8, 0x8, 0x0, 0x0, 0x8}, + {0xac0, 0x8, 0x0, 0x0, 0x8}, + {0x2578, 0x8, 0x0, 0x0, 0x8}, + {0x24f8, 0x8, 0x0, 0x0, 0x8}, + {0x0, 0x8, 0x0, 0x0, 0x8}, + {0x400, 0x18, 0x8, 0x0, 0x8}, + {0xb78, 0x18, 0x8, 0x0, 0x2}, + {0xd898, 0x50, 0x0, 0x0, 0x3c}, + {0x12908, 0x18, 0x0, 0x0, 0x10}, + {0x11aa8, 0x40, 0x0, 0x0, 0x18}, + {0xa588, 0x50, 0x0, 0x0, 0x20}, + {0x8700, 0x40, 0x0, 0x0, 0x28}, + {0x10300, 0x18, 0x0, 0x0, 0x10}, + {0xde48, 0x48, 0x0, 0x0, 0x38}, + {0x10768, 0x20, 0x0, 0x0, 0x20}, + {0x2d48, 0x80, 0x0, 0x0, 0x10}, + {0x5048, 0x10, 0x0, 0x0, 0x10}, + {0xc9b8, 0x30, 0x0, 0x0, 0x10}, + {0xed90, 0x10, 0x0, 0x0, 0x10}, + {0xa3a0, 0x10, 0x0, 0x0, 0x10}, + {0x13108, 0x8, 0x0, 0x0, 0x8}, +}; + +/* Runtime array offsets */ +#define DORQ_REG_PF_MAX_ICID_0_RT_OFFSET 0 +#define DORQ_REG_PF_MAX_ICID_1_RT_OFFSET 1 +#define DORQ_REG_PF_MAX_ICID_2_RT_OFFSET 2 +#define DORQ_REG_PF_MAX_ICID_3_RT_OFFSET 3 +#define DORQ_REG_PF_MAX_ICID_4_RT_OFFSET 4 +#define DORQ_REG_PF_MAX_ICID_5_RT_OFFSET 5 +#define DORQ_REG_PF_MAX_ICID_6_RT_OFFSET 6 +#define DORQ_REG_PF_MAX_ICID_7_RT_OFFSET 7 +#define DORQ_REG_VF_MAX_ICID_0_RT_OFFSET 8 +#define DORQ_REG_VF_MAX_ICID_1_RT_OFFSET 9 +#define DORQ_REG_VF_MAX_ICID_2_RT_OFFSET 10 +#define DORQ_REG_VF_MAX_ICID_3_RT_OFFSET 11 +#define DORQ_REG_VF_MAX_ICID_4_RT_OFFSET 12 +#define DORQ_REG_VF_MAX_ICID_5_RT_OFFSET 13 +#define DORQ_REG_VF_MAX_ICID_6_RT_OFFSET 14 +#define DORQ_REG_VF_MAX_ICID_7_RT_OFFSET 15 +#define DORQ_REG_PF_WAKE_ALL_RT_OFFSET 16 +#define DORQ_REG_TAG1_ETHERTYPE_RT_OFFSET 17 +#define DORQ_REG_GLB_MAX_ICID_0_RT_OFFSET 18 +#define DORQ_REG_GLB_MAX_ICID_1_RT_OFFSET 19 +#define DORQ_REG_GLB_RANGE2CONN_TYPE_0_RT_OFFSET 20 +#define DORQ_REG_GLB_RANGE2CONN_TYPE_1_RT_OFFSET 21 +#define DORQ_REG_PRV_PF_MAX_ICID_2_RT_OFFSET 22 +#define DORQ_REG_PRV_PF_MAX_ICID_3_RT_OFFSET 23 +#define DORQ_REG_PRV_PF_MAX_ICID_4_RT_OFFSET 24 +#define DORQ_REG_PRV_PF_MAX_ICID_5_RT_OFFSET 25 +#define DORQ_REG_PRV_VF_MAX_ICID_2_RT_OFFSET 26 +#define DORQ_REG_PRV_VF_MAX_ICID_3_RT_OFFSET 27 +#define DORQ_REG_PRV_VF_MAX_ICID_4_RT_OFFSET 28 +#define DORQ_REG_PRV_VF_MAX_ICID_5_RT_OFFSET 29 +#define DORQ_REG_PRV_PF_RANGE2CONN_TYPE_2_RT_OFFSET 30 +#define DORQ_REG_PRV_PF_RANGE2CONN_TYPE_3_RT_OFFSET 31 +#define DORQ_REG_PRV_PF_RANGE2CONN_TYPE_4_RT_OFFSET 32 +#define DORQ_REG_PRV_PF_RANGE2CONN_TYPE_5_RT_OFFSET 33 +#define DORQ_REG_PRV_VF_RANGE2CONN_TYPE_2_RT_OFFSET 34 +#define DORQ_REG_PRV_VF_RANGE2CONN_TYPE_3_RT_OFFSET 35 +#define DORQ_REG_PRV_VF_RANGE2CONN_TYPE_4_RT_OFFSET 36 +#define DORQ_REG_PRV_VF_RANGE2CONN_TYPE_5_RT_OFFSET 37 +#define IGU_REG_PF_CONFIGURATION_RT_OFFSET 38 +#define IGU_REG_VF_CONFIGURATION_RT_OFFSET 39 +#define IGU_REG_ATTN_MSG_ADDR_L_RT_OFFSET 40 +#define IGU_REG_ATTN_MSG_ADDR_H_RT_OFFSET 41 +#define IGU_REG_LEADING_EDGE_LATCH_RT_OFFSET 42 +#define IGU_REG_TRAILING_EDGE_LATCH_RT_OFFSET 43 +#define CAU_REG_CQE_AGG_UNIT_SIZE_RT_OFFSET 44 +#define CAU_REG_SB_VAR_MEMORY_RT_OFFSET 45 +#define CAU_REG_SB_VAR_MEMORY_RT_SIZE 1024 +#define CAU_REG_SB_ADDR_MEMORY_RT_OFFSET 1069 +#define CAU_REG_SB_ADDR_MEMORY_RT_SIZE 1024 
+#define CAU_REG_PI_MEMORY_RT_OFFSET 2093 +#define CAU_REG_PI_MEMORY_RT_SIZE 4416 +#define PRS_REG_SEARCH_RESP_INITIATOR_TYPE_RT_OFFSET 6509 +#define PRS_REG_TASK_ID_MAX_INITIATOR_PF_RT_OFFSET 6510 +#define PRS_REG_TASK_ID_MAX_INITIATOR_VF_RT_OFFSET 6511 +#define PRS_REG_TASK_ID_MAX_TARGET_PF_RT_OFFSET 6512 +#define PRS_REG_TASK_ID_MAX_TARGET_VF_RT_OFFSET 6513 +#define PRS_REG_SEARCH_TCP_RT_OFFSET 6514 +#define PRS_REG_SEARCH_FCOE_RT_OFFSET 6515 +#define PRS_REG_SEARCH_ROCE_RT_OFFSET 6516 +#define PRS_REG_ROCE_DEST_QP_MAX_VF_RT_OFFSET 6517 +#define PRS_REG_ROCE_DEST_QP_MAX_PF_RT_OFFSET 6518 +#define PRS_REG_SEARCH_OPENFLOW_RT_OFFSET 6519 +#define PRS_REG_SEARCH_NON_IP_AS_OPENFLOW_RT_OFFSET 6520 +#define PRS_REG_OPENFLOW_SUPPORT_ONLY_KNOWN_OVER_IP_RT_OFFSET 6521 +#define PRS_REG_OPENFLOW_SEARCH_KEY_MASK_RT_OFFSET 6522 +#define PRS_REG_TAG_ETHERTYPE_0_RT_OFFSET 6523 +#define PRS_REG_LIGHT_L2_ETHERTYPE_EN_RT_OFFSET 6524 +#define SRC_REG_FIRSTFREE_RT_OFFSET 6525 +#define SRC_REG_FIRSTFREE_RT_SIZE 2 +#define SRC_REG_LASTFREE_RT_OFFSET 6527 +#define SRC_REG_LASTFREE_RT_SIZE 2 +#define SRC_REG_COUNTFREE_RT_OFFSET 6529 +#define SRC_REG_NUMBER_HASH_BITS_RT_OFFSET 6530 +#define PSWRQ2_REG_CDUT_P_SIZE_RT_OFFSET 6531 +#define PSWRQ2_REG_CDUC_P_SIZE_RT_OFFSET 6532 +#define PSWRQ2_REG_TM_P_SIZE_RT_OFFSET 6533 +#define PSWRQ2_REG_QM_P_SIZE_RT_OFFSET 6534 +#define PSWRQ2_REG_SRC_P_SIZE_RT_OFFSET 6535 +#define PSWRQ2_REG_TSDM_P_SIZE_RT_OFFSET 6536 +#define PSWRQ2_REG_TM_FIRST_ILT_RT_OFFSET 6537 +#define PSWRQ2_REG_TM_LAST_ILT_RT_OFFSET 6538 +#define PSWRQ2_REG_QM_FIRST_ILT_RT_OFFSET 6539 +#define PSWRQ2_REG_QM_LAST_ILT_RT_OFFSET 6540 +#define PSWRQ2_REG_SRC_FIRST_ILT_RT_OFFSET 6541 +#define PSWRQ2_REG_SRC_LAST_ILT_RT_OFFSET 6542 +#define PSWRQ2_REG_CDUC_FIRST_ILT_RT_OFFSET 6543 +#define PSWRQ2_REG_CDUC_LAST_ILT_RT_OFFSET 6544 +#define PSWRQ2_REG_CDUT_FIRST_ILT_RT_OFFSET 6545 +#define PSWRQ2_REG_CDUT_LAST_ILT_RT_OFFSET 6546 +#define PSWRQ2_REG_TSDM_FIRST_ILT_RT_OFFSET 6547 +#define PSWRQ2_REG_TSDM_LAST_ILT_RT_OFFSET 6548 +#define PSWRQ2_REG_TM_NUMBER_OF_PF_BLOCKS_RT_OFFSET 6549 +#define PSWRQ2_REG_CDUT_NUMBER_OF_PF_BLOCKS_RT_OFFSET 6550 +#define PSWRQ2_REG_CDUC_NUMBER_OF_PF_BLOCKS_RT_OFFSET 6551 +#define PSWRQ2_REG_TM_VF_BLOCKS_RT_OFFSET 6552 +#define PSWRQ2_REG_CDUT_VF_BLOCKS_RT_OFFSET 6553 +#define PSWRQ2_REG_CDUC_VF_BLOCKS_RT_OFFSET 6554 +#define PSWRQ2_REG_TM_BLOCKS_FACTOR_RT_OFFSET 6555 +#define PSWRQ2_REG_CDUT_BLOCKS_FACTOR_RT_OFFSET 6556 +#define PSWRQ2_REG_CDUC_BLOCKS_FACTOR_RT_OFFSET 6557 +#define PSWRQ2_REG_VF_BASE_RT_OFFSET 6558 +#define PSWRQ2_REG_VF_LAST_ILT_RT_OFFSET 6559 +#define PSWRQ2_REG_DRAM_ALIGN_WR_RT_OFFSET 6560 +#define PSWRQ2_REG_DRAM_ALIGN_RD_RT_OFFSET 6561 +#define PSWRQ2_REG_TGSRC_FIRST_ILT_RT_OFFSET 6562 +#define PSWRQ2_REG_RGSRC_FIRST_ILT_RT_OFFSET 6563 +#define PSWRQ2_REG_TGSRC_LAST_ILT_RT_OFFSET 6564 +#define PSWRQ2_REG_RGSRC_LAST_ILT_RT_OFFSET 6565 +#define PSWRQ2_REG_ILT_MEMORY_RT_OFFSET 6566 +#define PSWRQ2_REG_ILT_MEMORY_RT_SIZE 26414 +#define PGLUE_REG_B_VF_BASE_RT_OFFSET 32980 +#define PGLUE_REG_B_MSDM_OFFSET_MASK_B_RT_OFFSET 32981 +#define PGLUE_REG_B_MSDM_VF_SHIFT_B_RT_OFFSET 32982 +#define PGLUE_REG_B_CACHE_LINE_SIZE_RT_OFFSET 32983 +#define PGLUE_REG_B_PF_BAR0_SIZE_RT_OFFSET 32984 +#define PGLUE_REG_B_PF_BAR1_SIZE_RT_OFFSET 32985 +#define PGLUE_REG_B_VF_BAR1_SIZE_RT_OFFSET 32986 +#define TM_REG_VF_ENABLE_CONN_RT_OFFSET 32987 +#define TM_REG_PF_ENABLE_CONN_RT_OFFSET 32988 +#define TM_REG_PF_ENABLE_TASK_RT_OFFSET 32989 +#define 
TM_REG_GROUP_SIZE_RESOLUTION_CONN_RT_OFFSET 32990 +#define TM_REG_GROUP_SIZE_RESOLUTION_TASK_RT_OFFSET 32991 +#define TM_REG_CONFIG_CONN_MEM_RT_OFFSET 32992 +#define TM_REG_CONFIG_CONN_MEM_RT_SIZE 416 +#define TM_REG_CONFIG_TASK_MEM_RT_OFFSET 33408 +#define TM_REG_CONFIG_TASK_MEM_RT_SIZE 608 +#define QM_REG_MAXPQSIZE_0_RT_OFFSET 34016 +#define QM_REG_MAXPQSIZE_1_RT_OFFSET 34017 +#define QM_REG_MAXPQSIZE_2_RT_OFFSET 34018 +#define QM_REG_MAXPQSIZETXSEL_0_RT_OFFSET 34019 +#define QM_REG_MAXPQSIZETXSEL_1_RT_OFFSET 34020 +#define QM_REG_MAXPQSIZETXSEL_2_RT_OFFSET 34021 +#define QM_REG_MAXPQSIZETXSEL_3_RT_OFFSET 34022 +#define QM_REG_MAXPQSIZETXSEL_4_RT_OFFSET 34023 +#define QM_REG_MAXPQSIZETXSEL_5_RT_OFFSET 34024 +#define QM_REG_MAXPQSIZETXSEL_6_RT_OFFSET 34025 +#define QM_REG_MAXPQSIZETXSEL_7_RT_OFFSET 34026 +#define QM_REG_MAXPQSIZETXSEL_8_RT_OFFSET 34027 +#define QM_REG_MAXPQSIZETXSEL_9_RT_OFFSET 34028 +#define QM_REG_MAXPQSIZETXSEL_10_RT_OFFSET 34029 +#define QM_REG_MAXPQSIZETXSEL_11_RT_OFFSET 34030 +#define QM_REG_MAXPQSIZETXSEL_12_RT_OFFSET 34031 +#define QM_REG_MAXPQSIZETXSEL_13_RT_OFFSET 34032 +#define QM_REG_MAXPQSIZETXSEL_14_RT_OFFSET 34033 +#define QM_REG_MAXPQSIZETXSEL_15_RT_OFFSET 34034 +#define QM_REG_MAXPQSIZETXSEL_16_RT_OFFSET 34035 +#define QM_REG_MAXPQSIZETXSEL_17_RT_OFFSET 34036 +#define QM_REG_MAXPQSIZETXSEL_18_RT_OFFSET 34037 +#define QM_REG_MAXPQSIZETXSEL_19_RT_OFFSET 34038 +#define QM_REG_MAXPQSIZETXSEL_20_RT_OFFSET 34039 +#define QM_REG_MAXPQSIZETXSEL_21_RT_OFFSET 34040 +#define QM_REG_MAXPQSIZETXSEL_22_RT_OFFSET 34041 +#define QM_REG_MAXPQSIZETXSEL_23_RT_OFFSET 34042 +#define QM_REG_MAXPQSIZETXSEL_24_RT_OFFSET 34043 +#define QM_REG_MAXPQSIZETXSEL_25_RT_OFFSET 34044 +#define QM_REG_MAXPQSIZETXSEL_26_RT_OFFSET 34045 +#define QM_REG_MAXPQSIZETXSEL_27_RT_OFFSET 34046 +#define QM_REG_MAXPQSIZETXSEL_28_RT_OFFSET 34047 +#define QM_REG_MAXPQSIZETXSEL_29_RT_OFFSET 34048 +#define QM_REG_MAXPQSIZETXSEL_30_RT_OFFSET 34049 +#define QM_REG_MAXPQSIZETXSEL_31_RT_OFFSET 34050 +#define QM_REG_MAXPQSIZETXSEL_32_RT_OFFSET 34051 +#define QM_REG_MAXPQSIZETXSEL_33_RT_OFFSET 34052 +#define QM_REG_MAXPQSIZETXSEL_34_RT_OFFSET 34053 +#define QM_REG_MAXPQSIZETXSEL_35_RT_OFFSET 34054 +#define QM_REG_MAXPQSIZETXSEL_36_RT_OFFSET 34055 +#define QM_REG_MAXPQSIZETXSEL_37_RT_OFFSET 34056 +#define QM_REG_MAXPQSIZETXSEL_38_RT_OFFSET 34057 +#define QM_REG_MAXPQSIZETXSEL_39_RT_OFFSET 34058 +#define QM_REG_MAXPQSIZETXSEL_40_RT_OFFSET 34059 +#define QM_REG_MAXPQSIZETXSEL_41_RT_OFFSET 34060 +#define QM_REG_MAXPQSIZETXSEL_42_RT_OFFSET 34061 +#define QM_REG_MAXPQSIZETXSEL_43_RT_OFFSET 34062 +#define QM_REG_MAXPQSIZETXSEL_44_RT_OFFSET 34063 +#define QM_REG_MAXPQSIZETXSEL_45_RT_OFFSET 34064 +#define QM_REG_MAXPQSIZETXSEL_46_RT_OFFSET 34065 +#define QM_REG_MAXPQSIZETXSEL_47_RT_OFFSET 34066 +#define QM_REG_MAXPQSIZETXSEL_48_RT_OFFSET 34067 +#define QM_REG_MAXPQSIZETXSEL_49_RT_OFFSET 34068 +#define QM_REG_MAXPQSIZETXSEL_50_RT_OFFSET 34069 +#define QM_REG_MAXPQSIZETXSEL_51_RT_OFFSET 34070 +#define QM_REG_MAXPQSIZETXSEL_52_RT_OFFSET 34071 +#define QM_REG_MAXPQSIZETXSEL_53_RT_OFFSET 34072 +#define QM_REG_MAXPQSIZETXSEL_54_RT_OFFSET 34073 +#define QM_REG_MAXPQSIZETXSEL_55_RT_OFFSET 34074 +#define QM_REG_MAXPQSIZETXSEL_56_RT_OFFSET 34075 +#define QM_REG_MAXPQSIZETXSEL_57_RT_OFFSET 34076 +#define QM_REG_MAXPQSIZETXSEL_58_RT_OFFSET 34077 +#define QM_REG_MAXPQSIZETXSEL_59_RT_OFFSET 34078 +#define QM_REG_MAXPQSIZETXSEL_60_RT_OFFSET 34079 +#define QM_REG_MAXPQSIZETXSEL_61_RT_OFFSET 34080 +#define 
QM_REG_MAXPQSIZETXSEL_62_RT_OFFSET 34081 +#define QM_REG_MAXPQSIZETXSEL_63_RT_OFFSET 34082 +#define QM_REG_BASEADDROTHERPQ_RT_OFFSET 34083 +#define QM_REG_BASEADDROTHERPQ_RT_SIZE 128 +#define QM_REG_PTRTBLOTHER_RT_OFFSET 34211 +#define QM_REG_PTRTBLOTHER_RT_SIZE 256 +#define QM_REG_AFULLQMBYPTHRPFWFQ_RT_OFFSET 34467 +#define QM_REG_AFULLQMBYPTHRVPWFQ_RT_OFFSET 34468 +#define QM_REG_AFULLQMBYPTHRPFRL_RT_OFFSET 34469 +#define QM_REG_AFULLQMBYPTHRGLBLRL_RT_OFFSET 34470 +#define QM_REG_AFULLOPRTNSTCCRDMASK_RT_OFFSET 34471 +#define QM_REG_WRROTHERPQGRP_0_RT_OFFSET 34472 +#define QM_REG_WRROTHERPQGRP_1_RT_OFFSET 34473 +#define QM_REG_WRROTHERPQGRP_2_RT_OFFSET 34474 +#define QM_REG_WRROTHERPQGRP_3_RT_OFFSET 34475 +#define QM_REG_WRROTHERPQGRP_4_RT_OFFSET 34476 +#define QM_REG_WRROTHERPQGRP_5_RT_OFFSET 34477 +#define QM_REG_WRROTHERPQGRP_6_RT_OFFSET 34478 +#define QM_REG_WRROTHERPQGRP_7_RT_OFFSET 34479 +#define QM_REG_WRROTHERPQGRP_8_RT_OFFSET 34480 +#define QM_REG_WRROTHERPQGRP_9_RT_OFFSET 34481 +#define QM_REG_WRROTHERPQGRP_10_RT_OFFSET 34482 +#define QM_REG_WRROTHERPQGRP_11_RT_OFFSET 34483 +#define QM_REG_WRROTHERPQGRP_12_RT_OFFSET 34484 +#define QM_REG_WRROTHERPQGRP_13_RT_OFFSET 34485 +#define QM_REG_WRROTHERPQGRP_14_RT_OFFSET 34486 +#define QM_REG_WRROTHERPQGRP_15_RT_OFFSET 34487 +#define QM_REG_WRROTHERGRPWEIGHT_0_RT_OFFSET 34488 +#define QM_REG_WRROTHERGRPWEIGHT_1_RT_OFFSET 34489 +#define QM_REG_WRROTHERGRPWEIGHT_2_RT_OFFSET 34490 +#define QM_REG_WRROTHERGRPWEIGHT_3_RT_OFFSET 34491 +#define QM_REG_WRRTXGRPWEIGHT_0_RT_OFFSET 34492 +#define QM_REG_WRRTXGRPWEIGHT_1_RT_OFFSET 34493 +#define QM_REG_PQTX2PF_0_RT_OFFSET 34494 +#define QM_REG_PQTX2PF_1_RT_OFFSET 34495 +#define QM_REG_PQTX2PF_2_RT_OFFSET 34496 +#define QM_REG_PQTX2PF_3_RT_OFFSET 34497 +#define QM_REG_PQTX2PF_4_RT_OFFSET 34498 +#define QM_REG_PQTX2PF_5_RT_OFFSET 34499 +#define QM_REG_PQTX2PF_6_RT_OFFSET 34500 +#define QM_REG_PQTX2PF_7_RT_OFFSET 34501 +#define QM_REG_PQTX2PF_8_RT_OFFSET 34502 +#define QM_REG_PQTX2PF_9_RT_OFFSET 34503 +#define QM_REG_PQTX2PF_10_RT_OFFSET 34504 +#define QM_REG_PQTX2PF_11_RT_OFFSET 34505 +#define QM_REG_PQTX2PF_12_RT_OFFSET 34506 +#define QM_REG_PQTX2PF_13_RT_OFFSET 34507 +#define QM_REG_PQTX2PF_14_RT_OFFSET 34508 +#define QM_REG_PQTX2PF_15_RT_OFFSET 34509 +#define QM_REG_PQTX2PF_16_RT_OFFSET 34510 +#define QM_REG_PQTX2PF_17_RT_OFFSET 34511 +#define QM_REG_PQTX2PF_18_RT_OFFSET 34512 +#define QM_REG_PQTX2PF_19_RT_OFFSET 34513 +#define QM_REG_PQTX2PF_20_RT_OFFSET 34514 +#define QM_REG_PQTX2PF_21_RT_OFFSET 34515 +#define QM_REG_PQTX2PF_22_RT_OFFSET 34516 +#define QM_REG_PQTX2PF_23_RT_OFFSET 34517 +#define QM_REG_PQTX2PF_24_RT_OFFSET 34518 +#define QM_REG_PQTX2PF_25_RT_OFFSET 34519 +#define QM_REG_PQTX2PF_26_RT_OFFSET 34520 +#define QM_REG_PQTX2PF_27_RT_OFFSET 34521 +#define QM_REG_PQTX2PF_28_RT_OFFSET 34522 +#define QM_REG_PQTX2PF_29_RT_OFFSET 34523 +#define QM_REG_PQTX2PF_30_RT_OFFSET 34524 +#define QM_REG_PQTX2PF_31_RT_OFFSET 34525 +#define QM_REG_PQTX2PF_32_RT_OFFSET 34526 +#define QM_REG_PQTX2PF_33_RT_OFFSET 34527 +#define QM_REG_PQTX2PF_34_RT_OFFSET 34528 +#define QM_REG_PQTX2PF_35_RT_OFFSET 34529 +#define QM_REG_PQTX2PF_36_RT_OFFSET 34530 +#define QM_REG_PQTX2PF_37_RT_OFFSET 34531 +#define QM_REG_PQTX2PF_38_RT_OFFSET 34532 +#define QM_REG_PQTX2PF_39_RT_OFFSET 34533 +#define QM_REG_PQTX2PF_40_RT_OFFSET 34534 +#define QM_REG_PQTX2PF_41_RT_OFFSET 34535 +#define QM_REG_PQTX2PF_42_RT_OFFSET 34536 +#define QM_REG_PQTX2PF_43_RT_OFFSET 34537 +#define QM_REG_PQTX2PF_44_RT_OFFSET 34538 +#define 
QM_REG_PQTX2PF_45_RT_OFFSET 34539 +#define QM_REG_PQTX2PF_46_RT_OFFSET 34540 +#define QM_REG_PQTX2PF_47_RT_OFFSET 34541 +#define QM_REG_PQTX2PF_48_RT_OFFSET 34542 +#define QM_REG_PQTX2PF_49_RT_OFFSET 34543 +#define QM_REG_PQTX2PF_50_RT_OFFSET 34544 +#define QM_REG_PQTX2PF_51_RT_OFFSET 34545 +#define QM_REG_PQTX2PF_52_RT_OFFSET 34546 +#define QM_REG_PQTX2PF_53_RT_OFFSET 34547 +#define QM_REG_PQTX2PF_54_RT_OFFSET 34548 +#define QM_REG_PQTX2PF_55_RT_OFFSET 34549 +#define QM_REG_PQTX2PF_56_RT_OFFSET 34550 +#define QM_REG_PQTX2PF_57_RT_OFFSET 34551 +#define QM_REG_PQTX2PF_58_RT_OFFSET 34552 +#define QM_REG_PQTX2PF_59_RT_OFFSET 34553 +#define QM_REG_PQTX2PF_60_RT_OFFSET 34554 +#define QM_REG_PQTX2PF_61_RT_OFFSET 34555 +#define QM_REG_PQTX2PF_62_RT_OFFSET 34556 +#define QM_REG_PQTX2PF_63_RT_OFFSET 34557 +#define QM_REG_PQOTHER2PF_0_RT_OFFSET 34558 +#define QM_REG_PQOTHER2PF_1_RT_OFFSET 34559 +#define QM_REG_PQOTHER2PF_2_RT_OFFSET 34560 +#define QM_REG_PQOTHER2PF_3_RT_OFFSET 34561 +#define QM_REG_PQOTHER2PF_4_RT_OFFSET 34562 +#define QM_REG_PQOTHER2PF_5_RT_OFFSET 34563 +#define QM_REG_PQOTHER2PF_6_RT_OFFSET 34564 +#define QM_REG_PQOTHER2PF_7_RT_OFFSET 34565 +#define QM_REG_PQOTHER2PF_8_RT_OFFSET 34566 +#define QM_REG_PQOTHER2PF_9_RT_OFFSET 34567 +#define QM_REG_PQOTHER2PF_10_RT_OFFSET 34568 +#define QM_REG_PQOTHER2PF_11_RT_OFFSET 34569 +#define QM_REG_PQOTHER2PF_12_RT_OFFSET 34570 +#define QM_REG_PQOTHER2PF_13_RT_OFFSET 34571 +#define QM_REG_PQOTHER2PF_14_RT_OFFSET 34572 +#define QM_REG_PQOTHER2PF_15_RT_OFFSET 34573 +#define QM_REG_RLGLBLPERIOD_0_RT_OFFSET 34574 +#define QM_REG_RLGLBLPERIOD_1_RT_OFFSET 34575 +#define QM_REG_RLGLBLPERIODTIMER_0_RT_OFFSET 34576 +#define QM_REG_RLGLBLPERIODTIMER_1_RT_OFFSET 34577 +#define QM_REG_RLGLBLPERIODSEL_0_RT_OFFSET 34578 +#define QM_REG_RLGLBLPERIODSEL_1_RT_OFFSET 34579 +#define QM_REG_RLGLBLPERIODSEL_2_RT_OFFSET 34580 +#define QM_REG_RLGLBLPERIODSEL_3_RT_OFFSET 34581 +#define QM_REG_RLGLBLPERIODSEL_4_RT_OFFSET 34582 +#define QM_REG_RLGLBLPERIODSEL_5_RT_OFFSET 34583 +#define QM_REG_RLGLBLPERIODSEL_6_RT_OFFSET 34584 +#define QM_REG_RLGLBLPERIODSEL_7_RT_OFFSET 34585 +#define QM_REG_RLGLBLINCVAL_RT_OFFSET 34586 +#define QM_REG_RLGLBLINCVAL_RT_SIZE 256 +#define QM_REG_RLGLBLUPPERBOUND_RT_OFFSET 34842 +#define QM_REG_RLGLBLUPPERBOUND_RT_SIZE 256 +#define QM_REG_RLGLBLCRD_RT_OFFSET 35098 +#define QM_REG_RLGLBLCRD_RT_SIZE 256 +#define QM_REG_RLGLBLENABLE_RT_OFFSET 35354 +#define QM_REG_RLPFPERIOD_RT_OFFSET 35355 +#define QM_REG_RLPFPERIODTIMER_RT_OFFSET 35356 +#define QM_REG_RLPFINCVAL_RT_OFFSET 35357 +#define QM_REG_RLPFINCVAL_RT_SIZE 16 +#define QM_REG_RLPFUPPERBOUND_RT_OFFSET 35373 +#define QM_REG_RLPFUPPERBOUND_RT_SIZE 16 +#define QM_REG_RLPFCRD_RT_OFFSET 35389 +#define QM_REG_RLPFCRD_RT_SIZE 16 +#define QM_REG_RLPFENABLE_RT_OFFSET 35405 +#define QM_REG_RLPFVOQENABLE_RT_OFFSET 35406 +#define QM_REG_WFQPFWEIGHT_RT_OFFSET 35407 +#define QM_REG_WFQPFWEIGHT_RT_SIZE 16 +#define QM_REG_WFQPFUPPERBOUND_RT_OFFSET 35423 +#define QM_REG_WFQPFUPPERBOUND_RT_SIZE 16 +#define QM_REG_WFQPFCRD_RT_OFFSET 35439 +#define QM_REG_WFQPFCRD_RT_SIZE 256 +#define QM_REG_WFQPFENABLE_RT_OFFSET 35695 +#define QM_REG_WFQVPENABLE_RT_OFFSET 35696 +#define QM_REG_BASEADDRTXPQ_RT_OFFSET 35697 +#define QM_REG_BASEADDRTXPQ_RT_SIZE 512 +#define QM_REG_TXPQMAP_RT_OFFSET 36209 +#define QM_REG_TXPQMAP_RT_SIZE 512 +#define QM_REG_WFQVPWEIGHT_RT_OFFSET 36721 +#define QM_REG_WFQVPWEIGHT_RT_SIZE 512 +#define QM_REG_WFQVPCRD_RT_OFFSET 37233 +#define QM_REG_WFQVPCRD_RT_SIZE 512 +#define 
QM_REG_WFQVPMAP_RT_OFFSET 37745 +#define QM_REG_WFQVPMAP_RT_SIZE 512 +#define QM_REG_PTRTBLTX_RT_OFFSET 38257 +#define QM_REG_PTRTBLTX_RT_SIZE 1024 +#define QM_REG_WFQPFCRD_MSB_RT_OFFSET 39281 +#define QM_REG_WFQPFCRD_MSB_RT_SIZE 320 +#define QM_REG_VOQCRDLINE_RT_OFFSET 39601 +#define QM_REG_VOQCRDLINE_RT_SIZE 36 +#define QM_REG_VOQINITCRDLINE_RT_OFFSET 39637 +#define QM_REG_VOQINITCRDLINE_RT_SIZE 36 +#define QM_REG_RLPFVOQENABLE_MSB_RT_OFFSET 39673 +#define NIG_REG_TAG_ETHERTYPE_0_RT_OFFSET 39674 +#define NIG_REG_BRB_GATE_DNTFWD_PORT_RT_OFFSET 39675 +#define NIG_REG_OUTER_TAG_VALUE_LIST0_RT_OFFSET 39676 +#define NIG_REG_OUTER_TAG_VALUE_LIST1_RT_OFFSET 39677 +#define NIG_REG_OUTER_TAG_VALUE_LIST2_RT_OFFSET 39678 +#define NIG_REG_OUTER_TAG_VALUE_LIST3_RT_OFFSET 39679 +#define NIG_REG_LLH_FUNC_TAGMAC_CLS_TYPE_RT_OFFSET 39680 +#define NIG_REG_LLH_FUNC_TAG_EN_RT_OFFSET 39681 +#define NIG_REG_LLH_FUNC_TAG_EN_RT_SIZE 4 +#define NIG_REG_LLH_FUNC_TAG_VALUE_RT_OFFSET 39685 +#define NIG_REG_LLH_FUNC_TAG_VALUE_RT_SIZE 4 +#define NIG_REG_LLH_FUNC_FILTER_VALUE_RT_OFFSET 39689 +#define NIG_REG_LLH_FUNC_FILTER_VALUE_RT_SIZE 32 +#define NIG_REG_LLH_FUNC_FILTER_EN_RT_OFFSET 39721 +#define NIG_REG_LLH_FUNC_FILTER_EN_RT_SIZE 16 +#define NIG_REG_LLH_FUNC_FILTER_MODE_RT_OFFSET 39737 +#define NIG_REG_LLH_FUNC_FILTER_MODE_RT_SIZE 16 +#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_RT_OFFSET 39753 +#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_RT_SIZE 16 +#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_OFFSET 39769 +#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_SIZE 16 +#define NIG_REG_TX_EDPM_CTRL_RT_OFFSET 39785 +#define NIG_REG_ROCE_DUPLICATE_TO_HOST_RT_OFFSET 39786 +#define NIG_REG_PPF_TO_ENGINE_SEL_RT_OFFSET 39787 +#define NIG_REG_PPF_TO_ENGINE_SEL_RT_SIZE 8 +#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_VALUE_RT_OFFSET 39795 +#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_VALUE_RT_SIZE 1024 +#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_EN_RT_OFFSET 40819 +#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_EN_RT_SIZE 512 +#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_MODE_RT_OFFSET 41331 +#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_MODE_RT_SIZE 512 +#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_PROTOCOL_TYPE_RT_OFFSET 41843 +#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_PROTOCOL_TYPE_RT_SIZE 512 +#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_HDR_SEL_RT_OFFSET 42355 +#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_HDR_SEL_RT_SIZE 512 +#define NIG_REG_LLH_PF_CLS_FILTERS_MAP_RT_OFFSET 42867 +#define NIG_REG_LLH_PF_CLS_FILTERS_MAP_RT_SIZE 32 +#define CDU_REG_CID_ADDR_PARAMS_RT_OFFSET 42899 +#define CDU_REG_SEGMENT0_PARAMS_RT_OFFSET 42900 +#define CDU_REG_SEGMENT1_PARAMS_RT_OFFSET 42901 +#define CDU_REG_PF_SEG0_TYPE_OFFSET_RT_OFFSET 42902 +#define CDU_REG_PF_SEG1_TYPE_OFFSET_RT_OFFSET 42903 +#define CDU_REG_PF_SEG2_TYPE_OFFSET_RT_OFFSET 42904 +#define CDU_REG_PF_SEG3_TYPE_OFFSET_RT_OFFSET 42905 +#define CDU_REG_PF_FL_SEG0_TYPE_OFFSET_RT_OFFSET 42906 +#define CDU_REG_PF_FL_SEG1_TYPE_OFFSET_RT_OFFSET 42907 +#define CDU_REG_PF_FL_SEG2_TYPE_OFFSET_RT_OFFSET 42908 +#define CDU_REG_PF_FL_SEG3_TYPE_OFFSET_RT_OFFSET 42909 +#define CDU_REG_VF_SEG_TYPE_OFFSET_RT_OFFSET 42910 +#define CDU_REG_VF_FL_SEG_TYPE_OFFSET_RT_OFFSET 42911 +#define PBF_REG_TAG_ETHERTYPE_0_RT_OFFSET 42912 +#define PBF_REG_BTB_SHARED_AREA_SIZE_RT_OFFSET 42913 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ0_RT_OFFSET 42914 +#define PBF_REG_BTB_GUARANTEED_VOQ0_RT_OFFSET 42915 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ0_RT_OFFSET 42916 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ1_RT_OFFSET 42917 +#define PBF_REG_BTB_GUARANTEED_VOQ1_RT_OFFSET 42918 
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ1_RT_OFFSET 42919 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ2_RT_OFFSET 42920 +#define PBF_REG_BTB_GUARANTEED_VOQ2_RT_OFFSET 42921 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ2_RT_OFFSET 42922 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ3_RT_OFFSET 42923 +#define PBF_REG_BTB_GUARANTEED_VOQ3_RT_OFFSET 42924 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ3_RT_OFFSET 42925 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ4_RT_OFFSET 42926 +#define PBF_REG_BTB_GUARANTEED_VOQ4_RT_OFFSET 42927 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ4_RT_OFFSET 42928 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ5_RT_OFFSET 42929 +#define PBF_REG_BTB_GUARANTEED_VOQ5_RT_OFFSET 42930 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ5_RT_OFFSET 42931 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ6_RT_OFFSET 42932 +#define PBF_REG_BTB_GUARANTEED_VOQ6_RT_OFFSET 42933 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ6_RT_OFFSET 42934 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ7_RT_OFFSET 42935 +#define PBF_REG_BTB_GUARANTEED_VOQ7_RT_OFFSET 42936 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ7_RT_OFFSET 42937 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ8_RT_OFFSET 42938 +#define PBF_REG_BTB_GUARANTEED_VOQ8_RT_OFFSET 42939 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ8_RT_OFFSET 42940 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ9_RT_OFFSET 42941 +#define PBF_REG_BTB_GUARANTEED_VOQ9_RT_OFFSET 42942 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ9_RT_OFFSET 42943 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ10_RT_OFFSET 42944 +#define PBF_REG_BTB_GUARANTEED_VOQ10_RT_OFFSET 42945 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ10_RT_OFFSET 42946 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ11_RT_OFFSET 42947 +#define PBF_REG_BTB_GUARANTEED_VOQ11_RT_OFFSET 42948 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ11_RT_OFFSET 42949 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ12_RT_OFFSET 42950 +#define PBF_REG_BTB_GUARANTEED_VOQ12_RT_OFFSET 42951 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ12_RT_OFFSET 42952 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ13_RT_OFFSET 42953 +#define PBF_REG_BTB_GUARANTEED_VOQ13_RT_OFFSET 42954 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ13_RT_OFFSET 42955 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ14_RT_OFFSET 42956 +#define PBF_REG_BTB_GUARANTEED_VOQ14_RT_OFFSET 42957 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ14_RT_OFFSET 42958 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ15_RT_OFFSET 42959 +#define PBF_REG_BTB_GUARANTEED_VOQ15_RT_OFFSET 42960 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ15_RT_OFFSET 42961 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ16_RT_OFFSET 42962 +#define PBF_REG_BTB_GUARANTEED_VOQ16_RT_OFFSET 42963 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ16_RT_OFFSET 42964 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ17_RT_OFFSET 42965 +#define PBF_REG_BTB_GUARANTEED_VOQ17_RT_OFFSET 42966 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ17_RT_OFFSET 42967 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ18_RT_OFFSET 42968 +#define PBF_REG_BTB_GUARANTEED_VOQ18_RT_OFFSET 42969 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ18_RT_OFFSET 42970 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ19_RT_OFFSET 42971 +#define PBF_REG_BTB_GUARANTEED_VOQ19_RT_OFFSET 42972 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ19_RT_OFFSET 42973 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ20_RT_OFFSET 42974 +#define PBF_REG_BTB_GUARANTEED_VOQ20_RT_OFFSET 42975 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ20_RT_OFFSET 42976 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ21_RT_OFFSET 42977 +#define PBF_REG_BTB_GUARANTEED_VOQ21_RT_OFFSET 42978 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ21_RT_OFFSET 42979 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ22_RT_OFFSET 42980 +#define 
PBF_REG_BTB_GUARANTEED_VOQ22_RT_OFFSET 42981 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ22_RT_OFFSET 42982 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ23_RT_OFFSET 42983 +#define PBF_REG_BTB_GUARANTEED_VOQ23_RT_OFFSET 42984 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ23_RT_OFFSET 42985 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ24_RT_OFFSET 42986 +#define PBF_REG_BTB_GUARANTEED_VOQ24_RT_OFFSET 42987 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ24_RT_OFFSET 42988 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ25_RT_OFFSET 42989 +#define PBF_REG_BTB_GUARANTEED_VOQ25_RT_OFFSET 42990 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ25_RT_OFFSET 42991 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ26_RT_OFFSET 42992 +#define PBF_REG_BTB_GUARANTEED_VOQ26_RT_OFFSET 42993 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ26_RT_OFFSET 42994 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ27_RT_OFFSET 42995 +#define PBF_REG_BTB_GUARANTEED_VOQ27_RT_OFFSET 42996 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ27_RT_OFFSET 42997 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ28_RT_OFFSET 42998 +#define PBF_REG_BTB_GUARANTEED_VOQ28_RT_OFFSET 42999 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ28_RT_OFFSET 43000 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ29_RT_OFFSET 43001 +#define PBF_REG_BTB_GUARANTEED_VOQ29_RT_OFFSET 43002 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ29_RT_OFFSET 43003 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ30_RT_OFFSET 43004 +#define PBF_REG_BTB_GUARANTEED_VOQ30_RT_OFFSET 43005 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ30_RT_OFFSET 43006 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ31_RT_OFFSET 43007 +#define PBF_REG_BTB_GUARANTEED_VOQ31_RT_OFFSET 43008 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ31_RT_OFFSET 43009 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ32_RT_OFFSET 43010 +#define PBF_REG_BTB_GUARANTEED_VOQ32_RT_OFFSET 43011 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ32_RT_OFFSET 43012 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ33_RT_OFFSET 43013 +#define PBF_REG_BTB_GUARANTEED_VOQ33_RT_OFFSET 43014 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ33_RT_OFFSET 43015 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ34_RT_OFFSET 43016 +#define PBF_REG_BTB_GUARANTEED_VOQ34_RT_OFFSET 43017 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ34_RT_OFFSET 43018 +#define PBF_REG_YCMD_QS_NUM_LINES_VOQ35_RT_OFFSET 43019 +#define PBF_REG_BTB_GUARANTEED_VOQ35_RT_OFFSET 43020 +#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ35_RT_OFFSET 43021 +#define XCM_REG_CON_PHY_Q3_RT_OFFSET 43022 + +#define RUNTIME_ARRAY_SIZE 43023 + +/* Init Callbacks */ +#define DMAE_READY_CB 0 + +/* The eth storm context for the Tstorm */ +struct tstorm_eth_conn_st_ctx { + __le32 reserved[4]; +}; + +/* The eth storm context for the Pstorm */ +struct pstorm_eth_conn_st_ctx { + __le32 reserved[8]; +}; + +/* The eth storm context for the Xstorm */ +struct xstorm_eth_conn_st_ctx { + __le32 reserved[60]; +}; + +struct e4_xstorm_eth_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_XSTORM_ETH_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED1_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED1_SHIFT 1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED2_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED2_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_EXIST_IN_QM3_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_EXIST_IN_QM3_SHIFT 3 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED3_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED3_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED4_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED4_SHIFT 5 +#define 
E4_XSTORM_ETH_CONN_AG_CTX_RESERVED5_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED5_SHIFT 6 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED6_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED6_SHIFT 7 + u8 flags1; +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED7_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED7_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED8_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED8_SHIFT 1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED9_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED9_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_BIT11_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_BIT11_SHIFT 3 +#define E4_XSTORM_ETH_CONN_AG_CTX_E5_RESERVED2_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_E5_RESERVED2_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_E5_RESERVED3_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_E5_RESERVED3_SHIFT 5 +#define E4_XSTORM_ETH_CONN_AG_CTX_TX_RULE_ACTIVE_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_TX_RULE_ACTIVE_SHIFT 6 +#define E4_XSTORM_ETH_CONN_AG_CTX_DQ_CF_ACTIVE_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_DQ_CF_ACTIVE_SHIFT 7 + u8 flags2; +#define E4_XSTORM_ETH_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF0_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF1_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF2_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF3_SHIFT 6 + u8 flags3; +#define E4_XSTORM_ETH_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF4_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF5_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF6_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF7_SHIFT 6 + u8 flags4; +#define E4_XSTORM_ETH_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF8_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF9_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF10_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF11_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF11_SHIFT 6 + u8 flags5; +#define E4_XSTORM_ETH_CONN_AG_CTX_CF12_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF12_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF13_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF13_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF14_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF14_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF15_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF15_SHIFT 6 + u8 flags6; +#define E4_XSTORM_ETH_CONN_AG_CTX_GO_TO_BD_CONS_CF_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_GO_TO_BD_CONS_CF_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_MULTI_UNICAST_CF_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_MULTI_UNICAST_CF_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_DQ_CF_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_DQ_CF_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_TERMINATE_CF_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_TERMINATE_CF_SHIFT 6 + u8 flags7; +#define E4_XSTORM_ETH_CONN_AG_CTX_FLUSH_Q0_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_FLUSH_Q0_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED10_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED10_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_SLOW_PATH_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_SLOW_PATH_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF0EN_MASK 0x1 +#define 
E4_XSTORM_ETH_CONN_AG_CTX_CF0EN_SHIFT 6 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF1EN_SHIFT 7 + u8 flags8; +#define E4_XSTORM_ETH_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF3EN_SHIFT 1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF4EN_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF5EN_SHIFT 3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF6EN_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF7EN_SHIFT 5 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF8EN_SHIFT 6 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF9EN_SHIFT 7 + u8 flags9; +#define E4_XSTORM_ETH_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF10EN_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF11EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF11EN_SHIFT 1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF12EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF12EN_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF13EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF13EN_SHIFT 3 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF14EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF14EN_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF15EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_CF15EN_SHIFT 5 +#define E4_XSTORM_ETH_CONN_AG_CTX_GO_TO_BD_CONS_CF_EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_GO_TO_BD_CONS_CF_EN_SHIFT 6 +#define E4_XSTORM_ETH_CONN_AG_CTX_MULTI_UNICAST_CF_EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_MULTI_UNICAST_CF_EN_SHIFT 7 + u8 flags10; +#define E4_XSTORM_ETH_CONN_AG_CTX_DQ_CF_EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_DQ_CF_EN_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_TERMINATE_CF_EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_TERMINATE_CF_EN_SHIFT 1 +#define E4_XSTORM_ETH_CONN_AG_CTX_FLUSH_Q0_EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_FLUSH_Q0_EN_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED11_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED11_SHIFT 3 +#define E4_XSTORM_ETH_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_TPH_ENABLE_EN_RESERVED_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_TPH_ENABLE_EN_RESERVED_SHIFT 5 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED12_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED12_SHIFT 6 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED13_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED13_SHIFT 7 + u8 flags11; +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED14_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED14_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED15_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RESERVED15_SHIFT 1 +#define E4_XSTORM_ETH_CONN_AG_CTX_TX_DEC_RULE_EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_TX_DEC_RULE_EN_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE5EN_SHIFT 3 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE6EN_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE7EN_SHIFT 5 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED1_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED1_SHIFT 6 +#define 
E4_XSTORM_ETH_CONN_AG_CTX_RULE9EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE9EN_SHIFT 7 + u8 flags12; +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE10EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE10EN_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE11EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE11EN_SHIFT 1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED2_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED2_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED3_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED3_SHIFT 3 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE14EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE14EN_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE15EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE15EN_SHIFT 5 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE16EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE16EN_SHIFT 6 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE17EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE17EN_SHIFT 7 + u8 flags13; +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE18EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE18EN_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE19EN_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_RULE19EN_SHIFT 1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED4_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED4_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED5_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED5_SHIFT 3 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED6_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED6_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED7_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED7_SHIFT 5 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED8_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED8_SHIFT 6 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED9_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4_XSTORM_ETH_CONN_AG_CTX_EDPM_USE_EXT_HDR_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_EDPM_USE_EXT_HDR_SHIFT 0 +#define E4_XSTORM_ETH_CONN_AG_CTX_EDPM_SEND_RAW_L3L4_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_EDPM_SEND_RAW_L3L4_SHIFT 1 +#define E4_XSTORM_ETH_CONN_AG_CTX_EDPM_INBAND_PROP_HDR_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_EDPM_INBAND_PROP_HDR_SHIFT 2 +#define E4_XSTORM_ETH_CONN_AG_CTX_EDPM_SEND_EXT_TUNNEL_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_EDPM_SEND_EXT_TUNNEL_SHIFT 3 +#define E4_XSTORM_ETH_CONN_AG_CTX_L2_EDPM_ENABLE_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_L2_EDPM_ENABLE_SHIFT 4 +#define E4_XSTORM_ETH_CONN_AG_CTX_ROCE_EDPM_ENABLE_MASK 0x1 +#define E4_XSTORM_ETH_CONN_AG_CTX_ROCE_EDPM_ENABLE_SHIFT 5 +#define E4_XSTORM_ETH_CONN_AG_CTX_TPH_ENABLE_MASK 0x3 +#define E4_XSTORM_ETH_CONN_AG_CTX_TPH_ENABLE_SHIFT 6 + u8 edpm_event_id; + __le16 physical_q0; + __le16 e5_reserved1; + __le16 edpm_num_bds; + __le16 tx_bd_cons; + __le16 tx_bd_prod; + __le16 updated_qm_pq_id; + __le16 conn_dpi; + u8 byte3; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le32 reg4; + __le32 reg5; + __le32 reg6; + __le16 word7; + __le16 word8; + __le16 word9; + __le16 word10; + __le32 reg7; + __le32 reg8; + __le32 reg9; + u8 byte7; + u8 byte8; + u8 byte9; + u8 byte10; + u8 byte11; + u8 byte12; + u8 byte13; + u8 byte14; + u8 byte15; + u8 e5_reserved; + __le16 word11; + __le32 reg10; + __le32 reg11; + __le32 reg12; + __le32 reg13; + __le32 reg14; + __le32 reg15; + __le32 reg16; + __le32 reg17; + __le32 reg18; + __le32 reg19; + __le16 word12; + __le16 word13; + __le16 word14; + __le16 word15; +}; + 
+/* The eth storm context for the Ystorm */ +struct ystorm_eth_conn_st_ctx { + __le32 reserved[8]; +}; + +struct e4_ystorm_eth_conn_ag_ctx { + u8 byte0; + u8 state; + u8 flags0; +#define E4_YSTORM_ETH_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_YSTORM_ETH_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_YSTORM_ETH_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_ETH_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_YSTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_MASK 0x3 +#define E4_YSTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_SHIFT 2 +#define E4_YSTORM_ETH_CONN_AG_CTX_PMD_TERMINATE_CF_MASK 0x3 +#define E4_YSTORM_ETH_CONN_AG_CTX_PMD_TERMINATE_CF_SHIFT 4 +#define E4_YSTORM_ETH_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_YSTORM_ETH_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_YSTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_EN_MASK 0x1 +#define E4_YSTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_EN_SHIFT 0 +#define E4_YSTORM_ETH_CONN_AG_CTX_PMD_TERMINATE_CF_EN_MASK 0x1 +#define E4_YSTORM_ETH_CONN_AG_CTX_PMD_TERMINATE_CF_EN_SHIFT 1 +#define E4_YSTORM_ETH_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_YSTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_YSTORM_ETH_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_YSTORM_ETH_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_YSTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_YSTORM_ETH_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_YSTORM_ETH_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_YSTORM_ETH_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_YSTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT 7 + u8 tx_q0_int_coallecing_timeset; + u8 byte3; + __le16 word0; + __le32 terminate_spqe; + __le32 reg1; + __le16 tx_bd_cons_upd; + __le16 word2; + __le16 word3; + __le16 word4; + __le32 reg2; + __le32 reg3; +}; + +struct e4_tstorm_eth_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT2_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT2_SHIFT 2 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT3_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT3_SHIFT 3 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT4_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT4_SHIFT 4 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT5_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_BIT5_SHIFT 5 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF0_SHIFT 6 + u8 flags1; +#define E4_TSTORM_ETH_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF1_SHIFT 0 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF2_SHIFT 2 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF3_SHIFT 4 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF4_SHIFT 6 + u8 flags2; +#define E4_TSTORM_ETH_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF5_SHIFT 0 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF6_SHIFT 2 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF7_SHIFT 4 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF8_SHIFT 6 + u8 flags3; +#define E4_TSTORM_ETH_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF9_SHIFT 0 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF10_SHIFT 2 
+#define E4_TSTORM_ETH_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF0EN_SHIFT 4 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF1EN_SHIFT 5 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT 6 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF3EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_ETH_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF4EN_SHIFT 0 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF5EN_SHIFT 1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF6EN_SHIFT 2 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF7EN_SHIFT 3 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF8EN_SHIFT 4 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF9EN_SHIFT 5 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_CF10EN_SHIFT 6 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags5; +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_TSTORM_ETH_CONN_AG_CTX_RX_BD_EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_RX_BD_EN_SHIFT 5 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_TSTORM_ETH_CONN_AG_CTX_RULE8EN_SHIFT 7 + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le32 reg4; + __le32 reg5; + __le32 reg6; + __le32 reg7; + __le32 reg8; + u8 byte2; + u8 byte3; + __le16 rx_bd_cons; + u8 byte4; + u8 byte5; + __le16 rx_bd_prod; + __le16 word2; + __le16 word3; + __le32 reg9; + __le32 reg10; +}; + +struct e4_ustorm_eth_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_USTORM_ETH_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_USTORM_ETH_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_PMD_TERMINATE_CF_MASK 0x3 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_PMD_TERMINATE_CF_SHIFT 2 +#define E4_USTORM_ETH_CONN_AG_CTX_RX_PMD_TERMINATE_CF_MASK 0x3 +#define E4_USTORM_ETH_CONN_AG_CTX_RX_PMD_TERMINATE_CF_SHIFT 4 +#define E4_USTORM_ETH_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_USTORM_ETH_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_USTORM_ETH_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_USTORM_ETH_CONN_AG_CTX_CF3_SHIFT 0 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_MASK 0x3 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_SHIFT 2 +#define E4_USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_MASK 0x3 +#define E4_USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_SHIFT 4 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_MASK 0x3 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_SHIFT 6 + u8 flags2; +#define E4_USTORM_ETH_CONN_AG_CTX_TX_PMD_TERMINATE_CF_EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_PMD_TERMINATE_CF_EN_SHIFT 0 
+#define E4_USTORM_ETH_CONN_AG_CTX_RX_PMD_TERMINATE_CF_EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RX_PMD_TERMINATE_CF_EN_SHIFT 1 +#define E4_USTORM_ETH_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_USTORM_ETH_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_CF3EN_SHIFT 3 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_ARM_CF_EN_SHIFT 4 +#define E4_USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RX_ARM_CF_EN_SHIFT 5 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_TX_BD_CONS_UPD_CF_EN_SHIFT 6 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags3; +#define E4_USTORM_ETH_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_USTORM_ETH_CONN_AG_CTX_RULE8EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le16 tx_bd_cons; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 tx_int_coallecing_timeset; + __le16 tx_drv_bd_cons; + __le16 rx_drv_cqe_cons; +}; + +/* The eth storm context for the Ustorm */ +struct ustorm_eth_conn_st_ctx { + __le32 reserved[40]; +}; + +/* The eth storm context for the Mstorm */ +struct mstorm_eth_conn_st_ctx { + __le32 reserved[8]; +}; + +/* eth connection context */ +struct e4_eth_conn_context { + struct tstorm_eth_conn_st_ctx tstorm_st_context; + struct regpair tstorm_st_padding[2]; + struct pstorm_eth_conn_st_ctx pstorm_st_context; + struct xstorm_eth_conn_st_ctx xstorm_st_context; + struct e4_xstorm_eth_conn_ag_ctx xstorm_ag_context; + struct ystorm_eth_conn_st_ctx ystorm_st_context; + struct e4_ystorm_eth_conn_ag_ctx ystorm_ag_context; + struct e4_tstorm_eth_conn_ag_ctx tstorm_ag_context; + struct e4_ustorm_eth_conn_ag_ctx ustorm_ag_context; + struct ustorm_eth_conn_st_ctx ustorm_st_context; + struct mstorm_eth_conn_st_ctx mstorm_st_context; +}; + +/* Ethernet filter types: mac/vlan/pair */ +enum eth_error_code { + ETH_OK = 0x00, + ETH_FILTERS_MAC_ADD_FAIL_FULL, + ETH_FILTERS_MAC_ADD_FAIL_FULL_MTT2, + ETH_FILTERS_MAC_ADD_FAIL_DUP_MTT2, + ETH_FILTERS_MAC_ADD_FAIL_DUP_STT2, + ETH_FILTERS_MAC_DEL_FAIL_NOF, + ETH_FILTERS_MAC_DEL_FAIL_NOF_MTT2, + ETH_FILTERS_MAC_DEL_FAIL_NOF_STT2, + ETH_FILTERS_MAC_ADD_FAIL_ZERO_MAC, + ETH_FILTERS_VLAN_ADD_FAIL_FULL, + ETH_FILTERS_VLAN_ADD_FAIL_DUP, + ETH_FILTERS_VLAN_DEL_FAIL_NOF, + ETH_FILTERS_VLAN_DEL_FAIL_NOF_TT1, + ETH_FILTERS_PAIR_ADD_FAIL_DUP, + ETH_FILTERS_PAIR_ADD_FAIL_FULL, + ETH_FILTERS_PAIR_ADD_FAIL_FULL_MAC, + ETH_FILTERS_PAIR_DEL_FAIL_NOF, + ETH_FILTERS_PAIR_DEL_FAIL_NOF_TT1, + ETH_FILTERS_PAIR_ADD_FAIL_ZERO_MAC, + ETH_FILTERS_VNI_ADD_FAIL_FULL, + ETH_FILTERS_VNI_ADD_FAIL_DUP, + ETH_FILTERS_GFT_UPDATE_FAIL, + MAX_ETH_ERROR_CODE +}; + +/* Opcodes for the event ring */ +enum 
eth_event_opcode { + ETH_EVENT_UNUSED, + ETH_EVENT_VPORT_START, + ETH_EVENT_VPORT_UPDATE, + ETH_EVENT_VPORT_STOP, + ETH_EVENT_TX_QUEUE_START, + ETH_EVENT_TX_QUEUE_STOP, + ETH_EVENT_RX_QUEUE_START, + ETH_EVENT_RX_QUEUE_UPDATE, + ETH_EVENT_RX_QUEUE_STOP, + ETH_EVENT_FILTERS_UPDATE, + ETH_EVENT_RX_ADD_OPENFLOW_FILTER, + ETH_EVENT_RX_DELETE_OPENFLOW_FILTER, + ETH_EVENT_RX_CREATE_OPENFLOW_ACTION, + ETH_EVENT_RX_ADD_UDP_FILTER, + ETH_EVENT_RX_DELETE_UDP_FILTER, + ETH_EVENT_RX_CREATE_GFT_ACTION, + ETH_EVENT_RX_GFT_UPDATE_FILTER, + ETH_EVENT_TX_QUEUE_UPDATE, + MAX_ETH_EVENT_OPCODE +}; + +/* Classify rule types in E2/E3 */ +enum eth_filter_action { + ETH_FILTER_ACTION_UNUSED, + ETH_FILTER_ACTION_REMOVE, + ETH_FILTER_ACTION_ADD, + ETH_FILTER_ACTION_REMOVE_ALL, + MAX_ETH_FILTER_ACTION +}; + +/* Command for adding/removing a classification rule $$KEEP_ENDIANNESS$$ */ +struct eth_filter_cmd { + u8 type; + u8 vport_id; + u8 action; + u8 reserved0; + __le32 vni; + __le16 mac_lsb; + __le16 mac_mid; + __le16 mac_msb; + __le16 vlan_id; +}; + +/* $$KEEP_ENDIANNESS$$ */ +struct eth_filter_cmd_header { + u8 rx; + u8 tx; + u8 cmd_cnt; + u8 assert_on_error; + u8 reserved1[4]; +}; + +/* Ethernet filter types: mac/vlan/pair */ +enum eth_filter_type { + ETH_FILTER_TYPE_UNUSED, + ETH_FILTER_TYPE_MAC, + ETH_FILTER_TYPE_VLAN, + ETH_FILTER_TYPE_PAIR, + ETH_FILTER_TYPE_INNER_MAC, + ETH_FILTER_TYPE_INNER_VLAN, + ETH_FILTER_TYPE_INNER_PAIR, + ETH_FILTER_TYPE_INNER_MAC_VNI_PAIR, + ETH_FILTER_TYPE_MAC_VNI_PAIR, + ETH_FILTER_TYPE_VNI, + MAX_ETH_FILTER_TYPE +}; + +/* Eth IPv4 Fragment Type */ +enum eth_ipv4_frag_type { + ETH_IPV4_NOT_FRAG, + ETH_IPV4_FIRST_FRAG, + ETH_IPV4_NON_FIRST_FRAG, + MAX_ETH_IPV4_FRAG_TYPE +}; + +/* eth IPv4 Fragment Type */ +enum eth_ip_type { + ETH_IPV4, + ETH_IPV6, + MAX_ETH_IP_TYPE +}; + +/* Ethernet Ramrod Command IDs */ +enum eth_ramrod_cmd_id { + ETH_RAMROD_UNUSED, + ETH_RAMROD_VPORT_START, + ETH_RAMROD_VPORT_UPDATE, + ETH_RAMROD_VPORT_STOP, + ETH_RAMROD_RX_QUEUE_START, + ETH_RAMROD_RX_QUEUE_STOP, + ETH_RAMROD_TX_QUEUE_START, + ETH_RAMROD_TX_QUEUE_STOP, + ETH_RAMROD_FILTERS_UPDATE, + ETH_RAMROD_RX_QUEUE_UPDATE, + ETH_RAMROD_RX_CREATE_OPENFLOW_ACTION, + ETH_RAMROD_RX_ADD_OPENFLOW_FILTER, + ETH_RAMROD_RX_DELETE_OPENFLOW_FILTER, + ETH_RAMROD_RX_ADD_UDP_FILTER, + ETH_RAMROD_RX_DELETE_UDP_FILTER, + ETH_RAMROD_RX_CREATE_GFT_ACTION, + ETH_RAMROD_GFT_UPDATE_FILTER, + ETH_RAMROD_TX_QUEUE_UPDATE, + MAX_ETH_RAMROD_CMD_ID +}; + +/* Return code from eth sp ramrods */ +struct eth_return_code { + u8 value; +#define ETH_RETURN_CODE_ERR_CODE_MASK 0x1F +#define ETH_RETURN_CODE_ERR_CODE_SHIFT 0 +#define ETH_RETURN_CODE_RESERVED_MASK 0x3 +#define ETH_RETURN_CODE_RESERVED_SHIFT 5 +#define ETH_RETURN_CODE_RX_TX_MASK 0x1 +#define ETH_RETURN_CODE_RX_TX_SHIFT 7 +}; + +/* What to do in case an error occurs */ +enum eth_tx_err { + ETH_TX_ERR_DROP, + ETH_TX_ERR_ASSERT_MALICIOUS, + MAX_ETH_TX_ERR +}; + +/* Array of the different error type behaviors */ +struct eth_tx_err_vals { + __le16 values; +#define ETH_TX_ERR_VALS_ILLEGAL_VLAN_MODE_MASK 0x1 +#define ETH_TX_ERR_VALS_ILLEGAL_VLAN_MODE_SHIFT 0 +#define ETH_TX_ERR_VALS_PACKET_TOO_SMALL_MASK 0x1 +#define ETH_TX_ERR_VALS_PACKET_TOO_SMALL_SHIFT 1 +#define ETH_TX_ERR_VALS_ANTI_SPOOFING_ERR_MASK 0x1 +#define ETH_TX_ERR_VALS_ANTI_SPOOFING_ERR_SHIFT 2 +#define ETH_TX_ERR_VALS_ILLEGAL_INBAND_TAGS_MASK 0x1 +#define ETH_TX_ERR_VALS_ILLEGAL_INBAND_TAGS_SHIFT 3 +#define ETH_TX_ERR_VALS_VLAN_INSERTION_W_INBAND_TAG_MASK 0x1 +#define ETH_TX_ERR_VALS_VLAN_INSERTION_W_INBAND_TAG_SHIFT 4 
+#define ETH_TX_ERR_VALS_MTU_VIOLATION_MASK 0x1 +#define ETH_TX_ERR_VALS_MTU_VIOLATION_SHIFT 5 +#define ETH_TX_ERR_VALS_ILLEGAL_CONTROL_FRAME_MASK 0x1 +#define ETH_TX_ERR_VALS_ILLEGAL_CONTROL_FRAME_SHIFT 6 +#define ETH_TX_ERR_VALS_RESERVED_MASK 0x1FF +#define ETH_TX_ERR_VALS_RESERVED_SHIFT 7 +}; + +/* vport rss configuration data */ +struct eth_vport_rss_config { + __le16 capabilities; +#define ETH_VPORT_RSS_CONFIG_IPV4_CAPABILITY_MASK 0x1 +#define ETH_VPORT_RSS_CONFIG_IPV4_CAPABILITY_SHIFT 0 +#define ETH_VPORT_RSS_CONFIG_IPV6_CAPABILITY_MASK 0x1 +#define ETH_VPORT_RSS_CONFIG_IPV6_CAPABILITY_SHIFT 1 +#define ETH_VPORT_RSS_CONFIG_IPV4_TCP_CAPABILITY_MASK 0x1 +#define ETH_VPORT_RSS_CONFIG_IPV4_TCP_CAPABILITY_SHIFT 2 +#define ETH_VPORT_RSS_CONFIG_IPV6_TCP_CAPABILITY_MASK 0x1 +#define ETH_VPORT_RSS_CONFIG_IPV6_TCP_CAPABILITY_SHIFT 3 +#define ETH_VPORT_RSS_CONFIG_IPV4_UDP_CAPABILITY_MASK 0x1 +#define ETH_VPORT_RSS_CONFIG_IPV4_UDP_CAPABILITY_SHIFT 4 +#define ETH_VPORT_RSS_CONFIG_IPV6_UDP_CAPABILITY_MASK 0x1 +#define ETH_VPORT_RSS_CONFIG_IPV6_UDP_CAPABILITY_SHIFT 5 +#define ETH_VPORT_RSS_CONFIG_EN_5_TUPLE_CAPABILITY_MASK 0x1 +#define ETH_VPORT_RSS_CONFIG_EN_5_TUPLE_CAPABILITY_SHIFT 6 +#define ETH_VPORT_RSS_CONFIG_RESERVED0_MASK 0x1FF +#define ETH_VPORT_RSS_CONFIG_RESERVED0_SHIFT 7 + u8 rss_id; + u8 rss_mode; + u8 update_rss_key; + u8 update_rss_ind_table; + u8 update_rss_capabilities; + u8 tbl_size; + __le32 reserved2[2]; + __le16 indirection_table[ETH_RSS_IND_TABLE_ENTRIES_NUM]; + + __le32 rss_key[ETH_RSS_KEY_SIZE_REGS]; + __le32 reserved3[2]; +}; + +/* eth vport RSS mode */ +enum eth_vport_rss_mode { + ETH_VPORT_RSS_MODE_DISABLED, + ETH_VPORT_RSS_MODE_REGULAR, + MAX_ETH_VPORT_RSS_MODE +}; + +/* Command for setting classification flags for a vport $$KEEP_ENDIANNESS$$ */ +struct eth_vport_rx_mode { + __le16 state; +#define ETH_VPORT_RX_MODE_UCAST_DROP_ALL_MASK 0x1 +#define ETH_VPORT_RX_MODE_UCAST_DROP_ALL_SHIFT 0 +#define ETH_VPORT_RX_MODE_UCAST_ACCEPT_ALL_MASK 0x1 +#define ETH_VPORT_RX_MODE_UCAST_ACCEPT_ALL_SHIFT 1 +#define ETH_VPORT_RX_MODE_UCAST_ACCEPT_UNMATCHED_MASK 0x1 +#define ETH_VPORT_RX_MODE_UCAST_ACCEPT_UNMATCHED_SHIFT 2 +#define ETH_VPORT_RX_MODE_MCAST_DROP_ALL_MASK 0x1 +#define ETH_VPORT_RX_MODE_MCAST_DROP_ALL_SHIFT 3 +#define ETH_VPORT_RX_MODE_MCAST_ACCEPT_ALL_MASK 0x1 +#define ETH_VPORT_RX_MODE_MCAST_ACCEPT_ALL_SHIFT 4 +#define ETH_VPORT_RX_MODE_BCAST_ACCEPT_ALL_MASK 0x1 +#define ETH_VPORT_RX_MODE_BCAST_ACCEPT_ALL_SHIFT 5 +#define ETH_VPORT_RX_MODE_RESERVED1_MASK 0x3FF +#define ETH_VPORT_RX_MODE_RESERVED1_SHIFT 6 +}; + +/* Command for setting tpa parameters */ +struct eth_vport_tpa_param { + u8 tpa_ipv4_en_flg; + u8 tpa_ipv6_en_flg; + u8 tpa_ipv4_tunn_en_flg; + u8 tpa_ipv6_tunn_en_flg; + u8 tpa_pkt_split_flg; + u8 tpa_hdr_data_split_flg; + u8 tpa_gro_consistent_flg; + + u8 tpa_max_aggs_num; + + __le16 tpa_max_size; + __le16 tpa_min_size_to_start; + + __le16 tpa_min_size_to_cont; + u8 max_buff_num; + u8 reserved; +}; + +/* Command for setting classification flags for a vport $$KEEP_ENDIANNESS$$ */ +struct eth_vport_tx_mode { + __le16 state; +#define ETH_VPORT_TX_MODE_UCAST_DROP_ALL_MASK 0x1 +#define ETH_VPORT_TX_MODE_UCAST_DROP_ALL_SHIFT 0 +#define ETH_VPORT_TX_MODE_UCAST_ACCEPT_ALL_MASK 0x1 +#define ETH_VPORT_TX_MODE_UCAST_ACCEPT_ALL_SHIFT 1 +#define ETH_VPORT_TX_MODE_MCAST_DROP_ALL_MASK 0x1 +#define ETH_VPORT_TX_MODE_MCAST_DROP_ALL_SHIFT 2 +#define ETH_VPORT_TX_MODE_MCAST_ACCEPT_ALL_MASK 0x1 +#define ETH_VPORT_TX_MODE_MCAST_ACCEPT_ALL_SHIFT 3 +#define 
ETH_VPORT_TX_MODE_BCAST_ACCEPT_ALL_MASK 0x1 +#define ETH_VPORT_TX_MODE_BCAST_ACCEPT_ALL_SHIFT 4 +#define ETH_VPORT_TX_MODE_RESERVED1_MASK 0x7FF +#define ETH_VPORT_TX_MODE_RESERVED1_SHIFT 5 +}; + +/* GFT filter update action type */ +enum gft_filter_update_action { + GFT_ADD_FILTER, + GFT_DELETE_FILTER, + MAX_GFT_FILTER_UPDATE_ACTION +}; + +/* Ramrod data for rx add openflow filter */ +struct rx_add_openflow_filter_data { + __le16 action_icid; + u8 priority; + u8 reserved0; + __le32 tenant_id; + __le16 dst_mac_hi; + __le16 dst_mac_mid; + __le16 dst_mac_lo; + __le16 src_mac_hi; + __le16 src_mac_mid; + __le16 src_mac_lo; + __le16 vlan_id; + __le16 l2_eth_type; + u8 ipv4_dscp; + u8 ipv4_frag_type; + u8 ipv4_over_ip; + u8 tenant_id_exists; + __le32 ipv4_dst_addr; + __le32 ipv4_src_addr; + __le16 l4_dst_port; + __le16 l4_src_port; +}; + +/* Ramrod data for rx create gft action */ +struct rx_create_gft_action_data { + u8 vport_id; + u8 reserved[7]; +}; + +/* Ramrod data for rx create openflow action */ +struct rx_create_openflow_action_data { + u8 vport_id; + u8 reserved[7]; +}; + +/* Ramrod data for rx queue start ramrod */ +struct rx_queue_start_ramrod_data { + __le16 rx_queue_id; + __le16 num_of_pbl_pages; + __le16 bd_max_bytes; + __le16 sb_id; + u8 sb_index; + u8 vport_id; + u8 default_rss_queue_flg; + u8 complete_cqe_flg; + u8 complete_event_flg; + u8 stats_counter_id; + u8 pin_context; + u8 pxp_tph_valid_bd; + u8 pxp_tph_valid_pkt; + u8 pxp_st_hint; + + __le16 pxp_st_index; + u8 pmd_mode; + + u8 notify_en; + u8 toggle_val; + + u8 vf_rx_prod_index; + u8 vf_rx_prod_use_zone_a; + u8 reserved[5]; + __le16 reserved1; + struct regpair cqe_pbl_addr; + struct regpair bd_base; + struct regpair reserved2; +}; + +/* Ramrod data for rx queue stop ramrod */ +struct rx_queue_stop_ramrod_data { + __le16 rx_queue_id; + u8 complete_cqe_flg; + u8 complete_event_flg; + u8 vport_id; + u8 reserved[3]; +}; + +/* Ramrod data for rx queue update ramrod */ +struct rx_queue_update_ramrod_data { + __le16 rx_queue_id; + u8 complete_cqe_flg; + u8 complete_event_flg; + u8 vport_id; + u8 set_default_rss_queue; + u8 reserved[3]; + u8 reserved1; + u8 reserved2; + u8 reserved3; + __le16 reserved4; + __le16 reserved5; + struct regpair reserved6; +}; + +/* Ramrod data for rx Add UDP Filter */ +struct rx_udp_filter_data { + __le16 action_icid; + __le16 vlan_id; + u8 ip_type; + u8 tenant_id_exists; + __le16 reserved1; + __le32 ip_dst_addr[4]; + __le32 ip_src_addr[4]; + __le16 udp_dst_port; + __le16 udp_src_port; + __le32 tenant_id; +}; + +/* Add or delete GFT filter - filter is packet header of type of packet wished + * to pass certain FW flow. 
+ */ +struct rx_update_gft_filter_data { + struct regpair pkt_hdr_addr; + __le16 pkt_hdr_length; + __le16 action_icid; + __le16 rx_qid; + __le16 flow_id; + __le16 vport_id; + u8 action_icid_valid; + u8 rx_qid_valid; + u8 flow_id_valid; + u8 filter_action; + u8 assert_on_error; + u8 inner_vlan_removal_en; +}; + +/* Ramrod data for rx queue start ramrod */ +struct tx_queue_start_ramrod_data { + __le16 sb_id; + u8 sb_index; + u8 vport_id; + u8 reserved0; + u8 stats_counter_id; + __le16 qm_pq_id; + u8 flags; +#define TX_QUEUE_START_RAMROD_DATA_DISABLE_OPPORTUNISTIC_MASK 0x1 +#define TX_QUEUE_START_RAMROD_DATA_DISABLE_OPPORTUNISTIC_SHIFT 0 +#define TX_QUEUE_START_RAMROD_DATA_TEST_MODE_PKT_DUP_MASK 0x1 +#define TX_QUEUE_START_RAMROD_DATA_TEST_MODE_PKT_DUP_SHIFT 1 +#define TX_QUEUE_START_RAMROD_DATA_TEST_MODE_TX_DEST_MASK 0x1 +#define TX_QUEUE_START_RAMROD_DATA_TEST_MODE_TX_DEST_SHIFT 2 +#define TX_QUEUE_START_RAMROD_DATA_PMD_MODE_MASK 0x1 +#define TX_QUEUE_START_RAMROD_DATA_PMD_MODE_SHIFT 3 +#define TX_QUEUE_START_RAMROD_DATA_NOTIFY_EN_MASK 0x1 +#define TX_QUEUE_START_RAMROD_DATA_NOTIFY_EN_SHIFT 4 +#define TX_QUEUE_START_RAMROD_DATA_PIN_CONTEXT_MASK 0x1 +#define TX_QUEUE_START_RAMROD_DATA_PIN_CONTEXT_SHIFT 5 +#define TX_QUEUE_START_RAMROD_DATA_RESERVED1_MASK 0x3 +#define TX_QUEUE_START_RAMROD_DATA_RESERVED1_SHIFT 6 + u8 pxp_st_hint; + u8 pxp_tph_valid_bd; + u8 pxp_tph_valid_pkt; + __le16 pxp_st_index; + __le16 comp_agg_size; + __le16 queue_zone_id; + __le16 reserved2; + __le16 pbl_size; + __le16 tx_queue_id; + __le16 same_as_last_id; + __le16 reserved[3]; + struct regpair pbl_base_addr; + struct regpair bd_cons_address; +}; + +/* Ramrod data for tx queue stop ramrod */ +struct tx_queue_stop_ramrod_data { + __le16 reserved[4]; +}; + +/* Ramrod data for tx queue update ramrod */ +struct tx_queue_update_ramrod_data { + __le16 update_qm_pq_id_flg; + __le16 qm_pq_id; + __le32 reserved0; + struct regpair reserved1[5]; +}; + +/* Ramrod data for vport update ramrod */ +struct vport_filter_update_ramrod_data { + struct eth_filter_cmd_header filter_cmd_hdr; + struct eth_filter_cmd filter_cmds[ETH_FILTER_RULES_COUNT]; +}; + +/* Ramrod data for vport start ramrod */ +struct vport_start_ramrod_data { + u8 vport_id; + u8 sw_fid; + __le16 mtu; + u8 drop_ttl0_en; + u8 inner_vlan_removal_en; + struct eth_vport_rx_mode rx_mode; + struct eth_vport_tx_mode tx_mode; + struct eth_vport_tpa_param tpa_param; + __le16 default_vlan; + u8 tx_switching_en; + u8 anti_spoofing_en; + + u8 default_vlan_en; + + u8 handle_ptp_pkts; + u8 silent_vlan_removal_en; + u8 untagged; + struct eth_tx_err_vals tx_err_behav; + + u8 zero_placement_offset; + u8 ctl_frame_mac_check_en; + u8 ctl_frame_ethtype_check_en; + u8 reserved[1]; +}; + +/* Ramrod data for vport stop ramrod */ +struct vport_stop_ramrod_data { + u8 vport_id; + u8 reserved[7]; +}; + +/* Ramrod data for vport update ramrod */ +struct vport_update_ramrod_data_cmn { + u8 vport_id; + u8 update_rx_active_flg; + u8 rx_active_flg; + u8 update_tx_active_flg; + u8 tx_active_flg; + u8 update_rx_mode_flg; + u8 update_tx_mode_flg; + u8 update_approx_mcast_flg; + + u8 update_rss_flg; + u8 update_inner_vlan_removal_en_flg; + + u8 inner_vlan_removal_en; + u8 update_tpa_param_flg; + u8 update_tpa_en_flg; + u8 update_tx_switching_en_flg; + + u8 tx_switching_en; + u8 update_anti_spoofing_en_flg; + + u8 anti_spoofing_en; + u8 update_handle_ptp_pkts; + + u8 handle_ptp_pkts; + u8 update_default_vlan_en_flg; + + u8 default_vlan_en; + + u8 update_default_vlan_flg; + + __le16 default_vlan; + u8 
update_accept_any_vlan_flg; + + u8 accept_any_vlan; + u8 silent_vlan_removal_en; + u8 update_mtu_flg; + + __le16 mtu; + u8 update_ctl_frame_checks_en_flg; + u8 ctl_frame_mac_check_en; + u8 ctl_frame_ethtype_check_en; + u8 reserved[15]; +}; + +struct vport_update_ramrod_mcast { + __le32 bins[ETH_MULTICAST_MAC_BINS_IN_REGS]; +}; + +/* Ramrod data for vport update ramrod */ +struct vport_update_ramrod_data { + struct vport_update_ramrod_data_cmn common; + + struct eth_vport_rx_mode rx_mode; + struct eth_vport_tx_mode tx_mode; + __le32 reserved[3]; + struct eth_vport_tpa_param tpa_param; + struct vport_update_ramrod_mcast approx_mcast; + struct eth_vport_rss_config rss_config; +}; + +struct e4_xstorm_eth_conn_ag_ctx_dq_ext_ldpart { + u8 reserved0; + u8 state; + u8 flags0; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EXIST_IN_QM0_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EXIST_IN_QM0_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED1_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED1_SHIFT 1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED2_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED2_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EXIST_IN_QM3_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EXIST_IN_QM3_SHIFT 3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED3_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED3_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED4_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED4_SHIFT 5 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED5_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED5_SHIFT 6 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED6_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED6_SHIFT 7 + u8 flags1; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED7_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED7_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED8_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED8_SHIFT 1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED9_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED9_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_BIT11_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_BIT11_SHIFT 3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_E5_RESERVED2_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_E5_RESERVED2_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_E5_RESERVED3_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_E5_RESERVED3_SHIFT 5 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TX_RULE_ACTIVE_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TX_RULE_ACTIVE_SHIFT 6 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_ACTIVE_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_ACTIVE_SHIFT 7 + u8 flags2; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF0_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF0_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF1_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF1_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF2_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF2_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF3_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF3_SHIFT 6 + u8 flags3; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF4_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF4_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF5_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF5_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF6_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF6_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF7_MASK 
0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF7_SHIFT 6 + u8 flags4; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF8_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF8_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF9_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF9_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF10_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF10_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF11_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF11_SHIFT 6 + u8 flags5; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF12_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF12_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF13_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF13_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF14_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF14_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF15_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF15_SHIFT 6 + u8 flags6; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_GO_TO_BD_CONS_CF_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_GO_TO_BD_CONS_CF_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_MULTI_UNICAST_CF_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_MULTI_UNICAST_CF_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TERMINATE_CF_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TERMINATE_CF_SHIFT 6 + u8 flags7; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_FLUSH_Q0_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_FLUSH_Q0_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED10_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED10_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_SLOW_PATH_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_SLOW_PATH_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF0EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF0EN_SHIFT 6 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF1EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF1EN_SHIFT 7 + u8 flags8; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF2EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF2EN_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF3EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF3EN_SHIFT 1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF4EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF4EN_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF5EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF5EN_SHIFT 3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF6EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF6EN_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF7EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF7EN_SHIFT 5 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF8EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF8EN_SHIFT 6 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF9EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF9EN_SHIFT 7 + u8 flags9; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF10EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF10EN_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF11EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF11EN_SHIFT 1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF12EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF12EN_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF13EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF13EN_SHIFT 3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF14EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF14EN_SHIFT 4 +#define 
E4XSTORMETHCONNAGCTXDQEXTLDPART_CF15EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_CF15EN_SHIFT 5 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_GO_TO_BD_CONS_CF_EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_GO_TO_BD_CONS_CF_EN_SHIFT 6 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_MULTI_UNICAST_CF_EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_MULTI_UNICAST_CF_EN_SHIFT 7 + u8 flags10; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_EN_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TERMINATE_CF_EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TERMINATE_CF_EN_SHIFT 1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_FLUSH_Q0_EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_FLUSH_Q0_EN_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED11_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED11_SHIFT 3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_SLOW_PATH_EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_SLOW_PATH_EN_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TPH_ENABLE_EN_RESERVED_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TPH_ENABLE_EN_RESERVED_SHIFT 5 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED12_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED12_SHIFT 6 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED13_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED13_SHIFT 7 + u8 flags11; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED14_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED14_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED15_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED15_SHIFT 1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TX_DEC_RULE_EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TX_DEC_RULE_EN_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE5EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE5EN_SHIFT 3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE6EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE6EN_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE7EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE7EN_SHIFT 5 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED1_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED1_SHIFT 6 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE9EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE9EN_SHIFT 7 + u8 flags12; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE10EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE10EN_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE11EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE11EN_SHIFT 1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED2_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED2_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED3_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED3_SHIFT 3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE14EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE14EN_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE15EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE15EN_SHIFT 5 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE16EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE16EN_SHIFT 6 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE17EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE17EN_SHIFT 7 + u8 flags13; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE18EN_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE18EN_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE19EN_MASK 0x1 +#define 
E4XSTORMETHCONNAGCTXDQEXTLDPART_RULE19EN_SHIFT 1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED4_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED4_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED5_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED5_SHIFT 3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED6_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED6_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED7_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED7_SHIFT 5 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED8_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED8_SHIFT 6 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED9_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_USE_EXT_HDR_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_USE_EXT_HDR_SHIFT 0 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_SEND_RAW_L3L4_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_SEND_RAW_L3L4_SHIFT 1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_INBAND_PROP_HDR_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_INBAND_PROP_HDR_SHIFT 2 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_SEND_EXT_TUNNEL_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_SEND_EXT_TUNNEL_SHIFT 3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_L2_EDPM_ENABLE_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_L2_EDPM_ENABLE_SHIFT 4 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_ROCE_EDPM_ENABLE_MASK 0x1 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_ROCE_EDPM_ENABLE_SHIFT 5 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TPH_ENABLE_MASK 0x3 +#define E4XSTORMETHCONNAGCTXDQEXTLDPART_TPH_ENABLE_SHIFT 6 + u8 edpm_event_id; + __le16 physical_q0; + __le16 e5_reserved1; + __le16 edpm_num_bds; + __le16 tx_bd_cons; + __le16 tx_bd_prod; + __le16 updated_qm_pq_id; + __le16 conn_dpi; + u8 byte3; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le32 reg4; +}; + +struct e4_mstorm_eth_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_MSTORM_ETH_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_MSTORM_ETH_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_MSTORM_ETH_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_MSTORM_ETH_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_MSTORM_ETH_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_MSTORM_ETH_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_MSTORM_ETH_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_MSTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_MSTORM_ETH_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_MSTORM_ETH_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_MSTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_MSTORM_ETH_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT 7 + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; +}; + +struct e4_xstorm_eth_hw_conn_ag_ctx { + 
u8 reserved0; + u8 state; + u8 flags0; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED1_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED1_SHIFT 1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED2_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED2_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM3_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM3_SHIFT 3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED3_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED3_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED4_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED4_SHIFT 5 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED5_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED5_SHIFT 6 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED6_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED6_SHIFT 7 + u8 flags1; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED7_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED7_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED8_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED8_SHIFT 1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED9_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED9_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_BIT11_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_BIT11_SHIFT 3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_E5_RESERVED2_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_E5_RESERVED2_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_E5_RESERVED3_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_E5_RESERVED3_SHIFT 5 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TX_RULE_ACTIVE_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TX_RULE_ACTIVE_SHIFT 6 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_ACTIVE_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_ACTIVE_SHIFT 7 + u8 flags2; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF0_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF1_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF2_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF3_SHIFT 6 + u8 flags3; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF4_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF5_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF6_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF7_SHIFT 6 + u8 flags4; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF8_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF9_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF10_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF11_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF11_SHIFT 6 + u8 flags5; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF12_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF12_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF13_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF13_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF14_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF14_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF15_MASK 0x3 +#define 
E4_XSTORM_ETH_HW_CONN_AG_CTX_CF15_SHIFT 6 + u8 flags6; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_SHIFT 6 + u8 flags7; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED10_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED10_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF0EN_SHIFT 6 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF1EN_SHIFT 7 + u8 flags8; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF2EN_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF3EN_SHIFT 1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF4EN_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF5EN_SHIFT 3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF6EN_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF7EN_SHIFT 5 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF8EN_SHIFT 6 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF9EN_SHIFT 7 + u8 flags9; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF10EN_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF11EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF11EN_SHIFT 1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF12EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF12EN_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF13EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF13EN_SHIFT 3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF14EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF14EN_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF15EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_CF15EN_SHIFT 5 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_EN_SHIFT 6 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_EN_SHIFT 7 + u8 flags10; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_EN_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_EN_SHIFT 1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_EN_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED11_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED11_SHIFT 3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_EN_RESERVED_MASK 0x1 +#define 
E4_XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_EN_RESERVED_SHIFT 5 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED12_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED12_SHIFT 6 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED13_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED13_SHIFT 7 + u8 flags11; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED14_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED14_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED15_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RESERVED15_SHIFT 1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TX_DEC_RULE_EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TX_DEC_RULE_EN_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE5EN_SHIFT 3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE6EN_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE7EN_SHIFT 5 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED1_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED1_SHIFT 6 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE9EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE9EN_SHIFT 7 + u8 flags12; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE10EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE10EN_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE11EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE11EN_SHIFT 1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED2_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED2_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED3_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED3_SHIFT 3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE14EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE14EN_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE15EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE15EN_SHIFT 5 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE16EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE16EN_SHIFT 6 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE17EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE17EN_SHIFT 7 + u8 flags13; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE18EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE18EN_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE19EN_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_RULE19EN_SHIFT 1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED4_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED4_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED5_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED5_SHIFT 3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED6_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED6_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED7_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED7_SHIFT 5 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED8_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED8_SHIFT 6 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED9_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EDPM_USE_EXT_HDR_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EDPM_USE_EXT_HDR_SHIFT 0 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_RAW_L3L4_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_RAW_L3L4_SHIFT 1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EDPM_INBAND_PROP_HDR_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EDPM_INBAND_PROP_HDR_SHIFT 2 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_EXT_TUNNEL_MASK 0x1 
+#define E4_XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_EXT_TUNNEL_SHIFT 3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_L2_EDPM_ENABLE_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_L2_EDPM_ENABLE_SHIFT 4 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_ROCE_EDPM_ENABLE_MASK 0x1 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_ROCE_EDPM_ENABLE_SHIFT 5 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_MASK 0x3 +#define E4_XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_SHIFT 6 + u8 edpm_event_id; + __le16 physical_q0; + __le16 e5_reserved1; + __le16 edpm_num_bds; + __le16 tx_bd_cons; + __le16 tx_bd_prod; + __le16 updated_qm_pq_id; + __le16 conn_dpi; +}; + +/* GFT CAM line struct */ +struct gft_cam_line { + __le32 camline; +#define GFT_CAM_LINE_VALID_MASK 0x1 +#define GFT_CAM_LINE_VALID_SHIFT 0 +#define GFT_CAM_LINE_DATA_MASK 0x3FFF +#define GFT_CAM_LINE_DATA_SHIFT 1 +#define GFT_CAM_LINE_MASK_BITS_MASK 0x3FFF +#define GFT_CAM_LINE_MASK_BITS_SHIFT 15 +#define GFT_CAM_LINE_RESERVED1_MASK 0x7 +#define GFT_CAM_LINE_RESERVED1_SHIFT 29 +}; + +/* GFT CAM line struct with fields breakout */ +struct gft_cam_line_mapped { + __le32 camline; +#define GFT_CAM_LINE_MAPPED_VALID_MASK 0x1 +#define GFT_CAM_LINE_MAPPED_VALID_SHIFT 0 +#define GFT_CAM_LINE_MAPPED_IP_VERSION_MASK 0x1 +#define GFT_CAM_LINE_MAPPED_IP_VERSION_SHIFT 1 +#define GFT_CAM_LINE_MAPPED_TUNNEL_IP_VERSION_MASK 0x1 +#define GFT_CAM_LINE_MAPPED_TUNNEL_IP_VERSION_SHIFT 2 +#define GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_MASK 0xF +#define GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_SHIFT 3 +#define GFT_CAM_LINE_MAPPED_TUNNEL_TYPE_MASK 0xF +#define GFT_CAM_LINE_MAPPED_TUNNEL_TYPE_SHIFT 7 +#define GFT_CAM_LINE_MAPPED_PF_ID_MASK 0xF +#define GFT_CAM_LINE_MAPPED_PF_ID_SHIFT 11 +#define GFT_CAM_LINE_MAPPED_IP_VERSION_MASK_MASK 0x1 +#define GFT_CAM_LINE_MAPPED_IP_VERSION_MASK_SHIFT 15 +#define GFT_CAM_LINE_MAPPED_TUNNEL_IP_VERSION_MASK_MASK 0x1 +#define GFT_CAM_LINE_MAPPED_TUNNEL_IP_VERSION_MASK_SHIFT 16 +#define GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_MASK_MASK 0xF +#define GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_MASK_SHIFT 17 +#define GFT_CAM_LINE_MAPPED_TUNNEL_TYPE_MASK_MASK 0xF +#define GFT_CAM_LINE_MAPPED_TUNNEL_TYPE_MASK_SHIFT 21 +#define GFT_CAM_LINE_MAPPED_PF_ID_MASK_MASK 0xF +#define GFT_CAM_LINE_MAPPED_PF_ID_MASK_SHIFT 25 +#define GFT_CAM_LINE_MAPPED_RESERVED1_MASK 0x7 +#define GFT_CAM_LINE_MAPPED_RESERVED1_SHIFT 29 +}; + +union gft_cam_line_union { + struct gft_cam_line cam_line; + struct gft_cam_line_mapped cam_line_mapped; +}; + +/* Used in gft_profile_key: Indication for ip version */ +enum gft_profile_ip_version { + GFT_PROFILE_IPV4 = 0, + GFT_PROFILE_IPV6 = 1, + MAX_GFT_PROFILE_IP_VERSION +}; + +/* Profile key struct for GFT logic in Prs */ +struct gft_profile_key { + __le16 profile_key; +#define GFT_PROFILE_KEY_IP_VERSION_MASK 0x1 +#define GFT_PROFILE_KEY_IP_VERSION_SHIFT 0 +#define GFT_PROFILE_KEY_TUNNEL_IP_VERSION_MASK 0x1 +#define GFT_PROFILE_KEY_TUNNEL_IP_VERSION_SHIFT 1 +#define GFT_PROFILE_KEY_UPPER_PROTOCOL_TYPE_MASK 0xF +#define GFT_PROFILE_KEY_UPPER_PROTOCOL_TYPE_SHIFT 2 +#define GFT_PROFILE_KEY_TUNNEL_TYPE_MASK 0xF +#define GFT_PROFILE_KEY_TUNNEL_TYPE_SHIFT 6 +#define GFT_PROFILE_KEY_PF_ID_MASK 0xF +#define GFT_PROFILE_KEY_PF_ID_SHIFT 10 +#define GFT_PROFILE_KEY_RESERVED0_MASK 0x3 +#define GFT_PROFILE_KEY_RESERVED0_SHIFT 14 +}; + +/* Used in gft_profile_key: Indication for tunnel type */ +enum gft_profile_tunnel_type { + GFT_PROFILE_NO_TUNNEL = 0, + GFT_PROFILE_VXLAN_TUNNEL = 1, + GFT_PROFILE_GRE_MAC_OR_NVGRE_TUNNEL = 2, + GFT_PROFILE_GRE_IP_TUNNEL = 3, + 
GFT_PROFILE_GENEVE_MAC_TUNNEL = 4, + GFT_PROFILE_GENEVE_IP_TUNNEL = 5, + MAX_GFT_PROFILE_TUNNEL_TYPE +}; + +/* Used in gft_profile_key: Indication for protocol type */ +enum gft_profile_upper_protocol_type { + GFT_PROFILE_ROCE_PROTOCOL = 0, + GFT_PROFILE_RROCE_PROTOCOL = 1, + GFT_PROFILE_FCOE_PROTOCOL = 2, + GFT_PROFILE_ICMP_PROTOCOL = 3, + GFT_PROFILE_ARP_PROTOCOL = 4, + GFT_PROFILE_USER_TCP_SRC_PORT_1_INNER = 5, + GFT_PROFILE_USER_TCP_DST_PORT_1_INNER = 6, + GFT_PROFILE_TCP_PROTOCOL = 7, + GFT_PROFILE_USER_UDP_DST_PORT_1_INNER = 8, + GFT_PROFILE_USER_UDP_DST_PORT_2_OUTER = 9, + GFT_PROFILE_UDP_PROTOCOL = 10, + GFT_PROFILE_USER_IP_1_INNER = 11, + GFT_PROFILE_USER_IP_2_OUTER = 12, + GFT_PROFILE_USER_ETH_1_INNER = 13, + GFT_PROFILE_USER_ETH_2_OUTER = 14, + GFT_PROFILE_RAW = 15, + MAX_GFT_PROFILE_UPPER_PROTOCOL_TYPE +}; + +/* GFT RAM line struct */ +struct gft_ram_line { + __le32 lo; +#define GFT_RAM_LINE_VLAN_SELECT_MASK 0x3 +#define GFT_RAM_LINE_VLAN_SELECT_SHIFT 0 +#define GFT_RAM_LINE_TUNNEL_ENTROPHY_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_ENTROPHY_SHIFT 2 +#define GFT_RAM_LINE_TUNNEL_TTL_EQUAL_ONE_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_TTL_EQUAL_ONE_SHIFT 3 +#define GFT_RAM_LINE_TUNNEL_TTL_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_TTL_SHIFT 4 +#define GFT_RAM_LINE_TUNNEL_ETHERTYPE_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_ETHERTYPE_SHIFT 5 +#define GFT_RAM_LINE_TUNNEL_DST_PORT_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_DST_PORT_SHIFT 6 +#define GFT_RAM_LINE_TUNNEL_SRC_PORT_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_SRC_PORT_SHIFT 7 +#define GFT_RAM_LINE_TUNNEL_DSCP_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_DSCP_SHIFT 8 +#define GFT_RAM_LINE_TUNNEL_OVER_IP_PROTOCOL_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_OVER_IP_PROTOCOL_SHIFT 9 +#define GFT_RAM_LINE_TUNNEL_DST_IP_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_DST_IP_SHIFT 10 +#define GFT_RAM_LINE_TUNNEL_SRC_IP_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_SRC_IP_SHIFT 11 +#define GFT_RAM_LINE_TUNNEL_PRIORITY_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_PRIORITY_SHIFT 12 +#define GFT_RAM_LINE_TUNNEL_PROVIDER_VLAN_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_PROVIDER_VLAN_SHIFT 13 +#define GFT_RAM_LINE_TUNNEL_VLAN_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_VLAN_SHIFT 14 +#define GFT_RAM_LINE_TUNNEL_DST_MAC_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_DST_MAC_SHIFT 15 +#define GFT_RAM_LINE_TUNNEL_SRC_MAC_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_SRC_MAC_SHIFT 16 +#define GFT_RAM_LINE_TTL_EQUAL_ONE_MASK 0x1 +#define GFT_RAM_LINE_TTL_EQUAL_ONE_SHIFT 17 +#define GFT_RAM_LINE_TTL_MASK 0x1 +#define GFT_RAM_LINE_TTL_SHIFT 18 +#define GFT_RAM_LINE_ETHERTYPE_MASK 0x1 +#define GFT_RAM_LINE_ETHERTYPE_SHIFT 19 +#define GFT_RAM_LINE_RESERVED0_MASK 0x1 +#define GFT_RAM_LINE_RESERVED0_SHIFT 20 +#define GFT_RAM_LINE_TCP_FLAG_FIN_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_FIN_SHIFT 21 +#define GFT_RAM_LINE_TCP_FLAG_SYN_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_SYN_SHIFT 22 +#define GFT_RAM_LINE_TCP_FLAG_RST_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_RST_SHIFT 23 +#define GFT_RAM_LINE_TCP_FLAG_PSH_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_PSH_SHIFT 24 +#define GFT_RAM_LINE_TCP_FLAG_ACK_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_ACK_SHIFT 25 +#define GFT_RAM_LINE_TCP_FLAG_URG_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_URG_SHIFT 26 +#define GFT_RAM_LINE_TCP_FLAG_ECE_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_ECE_SHIFT 27 +#define GFT_RAM_LINE_TCP_FLAG_CWR_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_CWR_SHIFT 28 +#define GFT_RAM_LINE_TCP_FLAG_NS_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_NS_SHIFT 29 +#define GFT_RAM_LINE_DST_PORT_MASK 0x1 +#define 
GFT_RAM_LINE_DST_PORT_SHIFT 30 +#define GFT_RAM_LINE_SRC_PORT_MASK 0x1 +#define GFT_RAM_LINE_SRC_PORT_SHIFT 31 + __le32 hi; +#define GFT_RAM_LINE_DSCP_MASK 0x1 +#define GFT_RAM_LINE_DSCP_SHIFT 0 +#define GFT_RAM_LINE_OVER_IP_PROTOCOL_MASK 0x1 +#define GFT_RAM_LINE_OVER_IP_PROTOCOL_SHIFT 1 +#define GFT_RAM_LINE_DST_IP_MASK 0x1 +#define GFT_RAM_LINE_DST_IP_SHIFT 2 +#define GFT_RAM_LINE_SRC_IP_MASK 0x1 +#define GFT_RAM_LINE_SRC_IP_SHIFT 3 +#define GFT_RAM_LINE_PRIORITY_MASK 0x1 +#define GFT_RAM_LINE_PRIORITY_SHIFT 4 +#define GFT_RAM_LINE_PROVIDER_VLAN_MASK 0x1 +#define GFT_RAM_LINE_PROVIDER_VLAN_SHIFT 5 +#define GFT_RAM_LINE_VLAN_MASK 0x1 +#define GFT_RAM_LINE_VLAN_SHIFT 6 +#define GFT_RAM_LINE_DST_MAC_MASK 0x1 +#define GFT_RAM_LINE_DST_MAC_SHIFT 7 +#define GFT_RAM_LINE_SRC_MAC_MASK 0x1 +#define GFT_RAM_LINE_SRC_MAC_SHIFT 8 +#define GFT_RAM_LINE_TENANT_ID_MASK 0x1 +#define GFT_RAM_LINE_TENANT_ID_SHIFT 9 +#define GFT_RAM_LINE_RESERVED1_MASK 0x3FFFFF +#define GFT_RAM_LINE_RESERVED1_SHIFT 10 +}; + +/* Used in the first 2 bits for gft_ram_line: Indication for vlan mask */ +enum gft_vlan_select { + INNER_PROVIDER_VLAN = 0, + INNER_VLAN = 1, + OUTER_PROVIDER_VLAN = 2, + OUTER_VLAN = 3, + MAX_GFT_VLAN_SELECT +}; + +/* The rdma task context of Mstorm */ +struct ystorm_rdma_task_st_ctx { + struct regpair temp[4]; +}; + +struct e4_ystorm_rdma_task_ag_ctx { + u8 reserved; + u8 byte1; + __le16 msem_ctx_upd_seq; + u8 flags0; +#define E4_YSTORM_RDMA_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define E4_YSTORM_RDMA_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define E4_YSTORM_RDMA_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define E4_YSTORM_RDMA_TASK_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_YSTORM_RDMA_TASK_AG_CTX_VALID_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_VALID_SHIFT 6 +#define E4_YSTORM_RDMA_TASK_AG_CTX_DIF_FIRST_IO_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_DIF_FIRST_IO_SHIFT 7 + u8 flags1; +#define E4_YSTORM_RDMA_TASK_AG_CTX_CF0_MASK 0x3 +#define E4_YSTORM_RDMA_TASK_AG_CTX_CF0_SHIFT 0 +#define E4_YSTORM_RDMA_TASK_AG_CTX_CF1_MASK 0x3 +#define E4_YSTORM_RDMA_TASK_AG_CTX_CF1_SHIFT 2 +#define E4_YSTORM_RDMA_TASK_AG_CTX_CF2SPECIAL_MASK 0x3 +#define E4_YSTORM_RDMA_TASK_AG_CTX_CF2SPECIAL_SHIFT 4 +#define E4_YSTORM_RDMA_TASK_AG_CTX_CF0EN_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_CF0EN_SHIFT 6 +#define E4_YSTORM_RDMA_TASK_AG_CTX_CF1EN_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_CF1EN_SHIFT 7 + u8 flags2; +#define E4_YSTORM_RDMA_TASK_AG_CTX_BIT4_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_BIT4_SHIFT 0 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE0EN_SHIFT 1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE1EN_SHIFT 2 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE2EN_SHIFT 3 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE3EN_SHIFT 4 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE4EN_SHIFT 5 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE5EN_SHIFT 6 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define E4_YSTORM_RDMA_TASK_AG_CTX_RULE6EN_SHIFT 7 + u8 key; + __le32 mw_cnt; + u8 ref_cnt_seq; + u8 ctx_upd_seq; + __le16 dif_flags; + __le16 tx_ref_count; + __le16 last_used_ltid; + __le16 parent_mr_lo; + __le16 parent_mr_hi; + __le32 fbo_lo; + __le32 
fbo_hi; +}; + +struct e4_mstorm_rdma_task_ag_ctx { + u8 reserved; + u8 byte1; + __le16 icid; + u8 flags0; +#define E4_MSTORM_RDMA_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define E4_MSTORM_RDMA_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define E4_MSTORM_RDMA_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define E4_MSTORM_RDMA_TASK_AG_CTX_BIT1_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_MSTORM_RDMA_TASK_AG_CTX_BIT2_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_BIT2_SHIFT 6 +#define E4_MSTORM_RDMA_TASK_AG_CTX_DIF_FIRST_IO_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_DIF_FIRST_IO_SHIFT 7 + u8 flags1; +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF0_MASK 0x3 +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF0_SHIFT 0 +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF1_SHIFT 2 +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF2_SHIFT 4 +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF0EN_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF0EN_SHIFT 6 +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF1EN_SHIFT 7 + u8 flags2; +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_CF2EN_SHIFT 0 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE0EN_SHIFT 1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE1EN_SHIFT 2 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE2EN_SHIFT 3 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE3EN_SHIFT 4 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE4EN_SHIFT 5 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE5EN_SHIFT 6 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define E4_MSTORM_RDMA_TASK_AG_CTX_RULE6EN_SHIFT 7 + u8 key; + __le32 mw_cnt; + u8 ref_cnt_seq; + u8 ctx_upd_seq; + __le16 dif_flags; + __le16 tx_ref_count; + __le16 last_used_ltid; + __le16 parent_mr_lo; + __le16 parent_mr_hi; + __le32 fbo_lo; + __le32 fbo_hi; +}; + +/* The roce task context of Mstorm */ +struct mstorm_rdma_task_st_ctx { + struct regpair temp[4]; +}; + +/* The roce task context of Ustorm */ +struct ustorm_rdma_task_st_ctx { + struct regpair temp[2]; +}; + +struct e4_ustorm_rdma_task_ag_ctx { + u8 reserved; + u8 state; + __le16 icid; + u8 flags0; +#define E4_USTORM_RDMA_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define E4_USTORM_RDMA_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define E4_USTORM_RDMA_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_RUNT_VALID_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_RUNT_VALID_SHIFT 5 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_WRITE_RESULT_CF_MASK 0x3 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_WRITE_RESULT_CF_SHIFT 6 + u8 flags1; +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_RESULT_TOGGLE_BIT_MASK 0x3 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_RESULT_TOGGLE_BIT_SHIFT 0 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_TX_IO_FLG_MASK 0x3 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_TX_IO_FLG_SHIFT 2 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_BLOCK_SIZE_MASK 0x3 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_BLOCK_SIZE_SHIFT 4 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_ERROR_CF_MASK 0x3 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_ERROR_CF_SHIFT 6 + u8 flags2; +#define 
E4_USTORM_RDMA_TASK_AG_CTX_DIF_WRITE_RESULT_CF_EN_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_WRITE_RESULT_CF_EN_SHIFT 0 +#define E4_USTORM_RDMA_TASK_AG_CTX_RESERVED2_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RESERVED2_SHIFT 1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RESERVED3_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RESERVED3_SHIFT 2 +#define E4_USTORM_RDMA_TASK_AG_CTX_RESERVED4_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RESERVED4_SHIFT 3 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT 4 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE0EN_SHIFT 5 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE1EN_SHIFT 6 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE2EN_SHIFT 7 + u8 flags3; +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE3EN_SHIFT 0 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE4EN_SHIFT 1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE5EN_SHIFT 2 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_RDMA_TASK_AG_CTX_RULE6EN_SHIFT 3 +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_ERROR_TYPE_MASK 0xF +#define E4_USTORM_RDMA_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT 4 + __le32 dif_err_intervals; + __le32 dif_error_1st_interval; + __le32 sq_cons; + __le32 dif_runt_value; + __le32 sge_index; + __le32 reg5; + u8 byte2; + u8 byte3; + __le16 word1; + __le16 word2; + __le16 word3; + __le32 reg6; + __le32 reg7; +}; + +/* RDMA task context */ +struct e4_rdma_task_context { + struct ystorm_rdma_task_st_ctx ystorm_st_context; + struct e4_ystorm_rdma_task_ag_ctx ystorm_ag_context; + struct tdif_task_context tdif_context; + struct e4_mstorm_rdma_task_ag_ctx mstorm_ag_context; + struct mstorm_rdma_task_st_ctx mstorm_st_context; + struct rdif_task_context rdif_context; + struct ustorm_rdma_task_st_ctx ustorm_st_context; + struct regpair ustorm_st_padding[2]; + struct e4_ustorm_rdma_task_ag_ctx ustorm_ag_context; +}; + +/* rdma function init ramrod data */ +struct rdma_close_func_ramrod_data { + u8 cnq_start_offset; + u8 num_cnqs; + u8 vf_id; + u8 vf_valid; + u8 reserved[4]; +}; + +/* rdma function init CNQ parameters */ +struct rdma_cnq_params { + __le16 sb_num; + u8 sb_index; + u8 num_pbl_pages; + __le32 reserved; + struct regpair pbl_base_addr; + __le16 queue_zone_num; + u8 reserved1[6]; +}; + +/* rdma create cq ramrod data */ +struct rdma_create_cq_ramrod_data { + struct regpair cq_handle; + struct regpair pbl_addr; + __le32 max_cqes; + __le16 pbl_num_pages; + __le16 dpi; + u8 is_two_level_pbl; + u8 cnq_id; + u8 pbl_log_page_size; + u8 toggle_bit; + __le16 int_timeout; + __le16 reserved1; +}; + +/* rdma deregister tid ramrod data */ +struct rdma_deregister_tid_ramrod_data { + __le32 itid; + __le32 reserved; +}; + +/* rdma destroy cq output params */ +struct rdma_destroy_cq_output_params { + __le16 cnq_num; + __le16 reserved0; + __le32 reserved1; +}; + +/* rdma destroy cq ramrod data */ +struct rdma_destroy_cq_ramrod_data { + struct regpair output_params_addr; +}; + +/* RDMA slow path EQ cmd IDs */ +enum rdma_event_opcode { + RDMA_EVENT_UNUSED, + RDMA_EVENT_FUNC_INIT, + RDMA_EVENT_FUNC_CLOSE, + RDMA_EVENT_REGISTER_MR, + RDMA_EVENT_DEREGISTER_MR, + RDMA_EVENT_CREATE_CQ, + RDMA_EVENT_RESIZE_CQ, + RDMA_EVENT_DESTROY_CQ, + RDMA_EVENT_CREATE_SRQ, + 
RDMA_EVENT_MODIFY_SRQ, + RDMA_EVENT_DESTROY_SRQ, + MAX_RDMA_EVENT_OPCODE +}; + +/* RDMA FW return code for slow path ramrods */ +enum rdma_fw_return_code { + RDMA_RETURN_OK = 0, + RDMA_RETURN_REGISTER_MR_BAD_STATE_ERR, + RDMA_RETURN_DEREGISTER_MR_BAD_STATE_ERR, + RDMA_RETURN_RESIZE_CQ_ERR, + RDMA_RETURN_NIG_DRAIN_REQ, + MAX_RDMA_FW_RETURN_CODE +}; + +/* rdma function init header */ +struct rdma_init_func_hdr { + u8 cnq_start_offset; + u8 num_cnqs; + u8 cq_ring_mode; + u8 vf_id; + u8 vf_valid; + u8 relaxed_ordering; + __le16 first_reg_srq_id; + __le32 reg_srq_base_addr; + __le32 reserved; +}; + +/* rdma function init ramrod data */ +struct rdma_init_func_ramrod_data { + struct rdma_init_func_hdr params_header; + struct rdma_cnq_params cnq_params[NUM_OF_GLOBAL_QUEUES]; +}; + +/* RDMA ramrod command IDs */ +enum rdma_ramrod_cmd_id { + RDMA_RAMROD_UNUSED, + RDMA_RAMROD_FUNC_INIT, + RDMA_RAMROD_FUNC_CLOSE, + RDMA_RAMROD_REGISTER_MR, + RDMA_RAMROD_DEREGISTER_MR, + RDMA_RAMROD_CREATE_CQ, + RDMA_RAMROD_RESIZE_CQ, + RDMA_RAMROD_DESTROY_CQ, + RDMA_RAMROD_CREATE_SRQ, + RDMA_RAMROD_MODIFY_SRQ, + RDMA_RAMROD_DESTROY_SRQ, + MAX_RDMA_RAMROD_CMD_ID +}; + +/* rdma register tid ramrod data */ +struct rdma_register_tid_ramrod_data { + __le16 flags; +#define RDMA_REGISTER_TID_RAMROD_DATA_PAGE_SIZE_LOG_MASK 0x1F +#define RDMA_REGISTER_TID_RAMROD_DATA_PAGE_SIZE_LOG_SHIFT 0 +#define RDMA_REGISTER_TID_RAMROD_DATA_TWO_LEVEL_PBL_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_TWO_LEVEL_PBL_SHIFT 5 +#define RDMA_REGISTER_TID_RAMROD_DATA_ZERO_BASED_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_ZERO_BASED_SHIFT 6 +#define RDMA_REGISTER_TID_RAMROD_DATA_PHY_MR_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_PHY_MR_SHIFT 7 +#define RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_READ_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_READ_SHIFT 8 +#define RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_WRITE_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_WRITE_SHIFT 9 +#define RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_ATOMIC_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_ATOMIC_SHIFT 10 +#define RDMA_REGISTER_TID_RAMROD_DATA_LOCAL_WRITE_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_LOCAL_WRITE_SHIFT 11 +#define RDMA_REGISTER_TID_RAMROD_DATA_LOCAL_READ_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_LOCAL_READ_SHIFT 12 +#define RDMA_REGISTER_TID_RAMROD_DATA_ENABLE_MW_BIND_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_ENABLE_MW_BIND_SHIFT 13 +#define RDMA_REGISTER_TID_RAMROD_DATA_RESERVED_MASK 0x3 +#define RDMA_REGISTER_TID_RAMROD_DATA_RESERVED_SHIFT 14 + u8 flags1; +#define RDMA_REGISTER_TID_RAMROD_DATA_PBL_PAGE_SIZE_LOG_MASK 0x1F +#define RDMA_REGISTER_TID_RAMROD_DATA_PBL_PAGE_SIZE_LOG_SHIFT 0 +#define RDMA_REGISTER_TID_RAMROD_DATA_TID_TYPE_MASK 0x7 +#define RDMA_REGISTER_TID_RAMROD_DATA_TID_TYPE_SHIFT 5 + u8 flags2; +#define RDMA_REGISTER_TID_RAMROD_DATA_DMA_MR_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_DMA_MR_SHIFT 0 +#define RDMA_REGISTER_TID_RAMROD_DATA_DIF_ON_HOST_FLG_MASK 0x1 +#define RDMA_REGISTER_TID_RAMROD_DATA_DIF_ON_HOST_FLG_SHIFT 1 +#define RDMA_REGISTER_TID_RAMROD_DATA_RESERVED1_MASK 0x3F +#define RDMA_REGISTER_TID_RAMROD_DATA_RESERVED1_SHIFT 2 + u8 key; + u8 length_hi; + u8 vf_id; + u8 vf_valid; + __le16 pd; + __le16 reserved2; + __le32 length_lo; + __le32 itid; + __le32 reserved3; + struct regpair va; + struct regpair pbl_base; + struct regpair dif_error_addr; + struct regpair dif_runt_addr; + __le32 reserved4[2]; +}; + +/* rdma resize cq output params */ +struct rdma_resize_cq_output_params { + __le32 
old_cq_cons; + __le32 old_cq_prod; +}; + +/* rdma resize cq ramrod data */ +struct rdma_resize_cq_ramrod_data { + u8 flags; +#define RDMA_RESIZE_CQ_RAMROD_DATA_TOGGLE_BIT_MASK 0x1 +#define RDMA_RESIZE_CQ_RAMROD_DATA_TOGGLE_BIT_SHIFT 0 +#define RDMA_RESIZE_CQ_RAMROD_DATA_IS_TWO_LEVEL_PBL_MASK 0x1 +#define RDMA_RESIZE_CQ_RAMROD_DATA_IS_TWO_LEVEL_PBL_SHIFT 1 +#define RDMA_RESIZE_CQ_RAMROD_DATA_RESERVED_MASK 0x3F +#define RDMA_RESIZE_CQ_RAMROD_DATA_RESERVED_SHIFT 2 + u8 pbl_log_page_size; + __le16 pbl_num_pages; + __le32 max_cqes; + struct regpair pbl_addr; + struct regpair output_params_addr; +}; + +/* The rdma storm context of Mstorm */ +struct rdma_srq_context { + struct regpair temp[8]; +}; + +/* rdma create qp requester ramrod data */ +struct rdma_srq_create_ramrod_data { + u8 flags; +#define RDMA_SRQ_CREATE_RAMROD_DATA_XRC_FLAG_MASK 0x1 +#define RDMA_SRQ_CREATE_RAMROD_DATA_XRC_FLAG_SHIFT 0 +#define RDMA_SRQ_CREATE_RAMROD_DATA_RESERVED_KEY_EN_MASK 0x1 +#define RDMA_SRQ_CREATE_RAMROD_DATA_RESERVED_KEY_EN_SHIFT 1 +#define RDMA_SRQ_CREATE_RAMROD_DATA_RESERVED1_MASK 0x3F +#define RDMA_SRQ_CREATE_RAMROD_DATA_RESERVED1_SHIFT 2 + u8 reserved2; + __le16 xrc_domain; + __le32 xrc_srq_cq_cid; + struct regpair pbl_base_addr; + __le16 pages_in_srq_pbl; + __le16 pd_id; + struct rdma_srq_id srq_id; + __le16 page_size; + __le16 reserved3; + __le32 reserved4; + struct regpair producers_addr; +}; + +/* rdma create qp requester ramrod data */ +struct rdma_srq_destroy_ramrod_data { + struct rdma_srq_id srq_id; + __le32 reserved; +}; + +/* rdma create qp requester ramrod data */ +struct rdma_srq_modify_ramrod_data { + struct rdma_srq_id srq_id; + __le32 wqe_limit; +}; + +/* RDMA Tid type enumeration (for register_tid ramrod) */ +enum rdma_tid_type { + RDMA_TID_REGISTERED_MR, + RDMA_TID_FMR, + RDMA_TID_MW_TYPE1, + RDMA_TID_MW_TYPE2A, + MAX_RDMA_TID_TYPE +}; + +struct rdma_xrc_srq_context { + struct regpair temp[9]; +}; + +struct e4_tstorm_rdma_task_ag_ctx { + u8 byte0; + u8 byte1; + __le16 word0; + u8 flags0; +#define E4_TSTORM_RDMA_TASK_AG_CTX_NIBBLE0_MASK 0xF +#define E4_TSTORM_RDMA_TASK_AG_CTX_NIBBLE0_SHIFT 0 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT0_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT0_SHIFT 4 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT1_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT2_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT2_SHIFT 6 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT3_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT3_SHIFT 7 + u8 flags1; +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT4_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT4_SHIFT 0 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT5_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_BIT5_SHIFT 1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF0_MASK 0x3 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF0_SHIFT 2 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF1_MASK 0x3 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF1_SHIFT 4 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF2_MASK 0x3 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF2_SHIFT 6 + u8 flags2; +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF3_MASK 0x3 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF3_SHIFT 0 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF4_MASK 0x3 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF4_SHIFT 2 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF5_MASK 0x3 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF5_SHIFT 4 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF6_MASK 0x3 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF6_SHIFT 6 + u8 flags3; +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF7_MASK 0x3 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF7_SHIFT 0 +#define 
E4_TSTORM_RDMA_TASK_AG_CTX_CF0EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF0EN_SHIFT 2 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF1EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF1EN_SHIFT 3 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF2EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF2EN_SHIFT 4 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF3EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF3EN_SHIFT 5 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF4EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF4EN_SHIFT 6 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF5EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF5EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF6EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF6EN_SHIFT 0 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF7EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_CF7EN_SHIFT 1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE0EN_SHIFT 2 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE1EN_SHIFT 3 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE2EN_SHIFT 4 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE3EN_SHIFT 5 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE4EN_SHIFT 6 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_RDMA_TASK_AG_CTX_RULE5EN_SHIFT 7 + u8 byte2; + __le16 word1; + __le32 reg0; + u8 byte3; + u8 byte4; + __le16 word2; + __le16 word3; + __le16 word4; + __le32 reg1; + __le32 reg2; +}; + +struct e4_ustorm_rdma_conn_ag_ctx { + u8 reserved; + u8 byte1; + u8 flags0; +#define E4_USTORM_RDMA_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_USTORM_RDMA_CONN_AG_CTX_DIF_ERROR_REPORTED_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_DIF_ERROR_REPORTED_SHIFT 1 +#define E4_USTORM_RDMA_CONN_AG_CTX_FLUSH_Q0_CF_MASK 0x3 +#define E4_USTORM_RDMA_CONN_AG_CTX_FLUSH_Q0_CF_SHIFT 2 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_USTORM_RDMA_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF3_SHIFT 0 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_ARM_SE_CF_MASK 0x3 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_ARM_SE_CF_SHIFT 2 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_ARM_CF_MASK 0x3 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_ARM_CF_SHIFT 4 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF6_SHIFT 6 + u8 flags2; +#define E4_USTORM_RDMA_CONN_AG_CTX_FLUSH_Q0_CF_EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_FLUSH_Q0_CF_EN_SHIFT 0 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF3EN_SHIFT 3 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_ARM_SE_CF_EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_ARM_SE_CF_EN_SHIFT 4 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_ARM_CF_EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_ARM_CF_EN_SHIFT 5 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_CF6EN_SHIFT 6 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_SE_EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_SE_EN_SHIFT 7 + u8 flags3; +#define 
E4_USTORM_RDMA_CONN_AG_CTX_CQ_EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_CQ_EN_SHIFT 0 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_USTORM_RDMA_CONN_AG_CTX_RULE8EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 conn_dpi; + __le16 word1; + __le32 cq_cons; + __le32 cq_se_prod; + __le32 cq_prod; + __le32 reg3; + __le16 int_timeout; + __le16 word3; +}; + +struct e4_xstorm_roce_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_XSTORM_ROCE_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT2_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT2_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_EXIST_IN_QM3_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_EXIST_IN_QM3_SHIFT 3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT4_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT4_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT5_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT5_SHIFT 5 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT6_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT6_SHIFT 6 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT7_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT7_SHIFT 7 + u8 flags1; +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT8_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT8_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT9_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT9_SHIFT 1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT10_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT10_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT11_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT11_SHIFT 3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT12_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT12_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_MSEM_FLUSH_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_MSEM_FLUSH_SHIFT 5 +#define E4_XSTORM_ROCE_CONN_AG_CTX_MSDM_FLUSH_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_MSDM_FLUSH_SHIFT 6 +#define E4_XSTORM_ROCE_CONN_AG_CTX_YSTORM_FLUSH_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_YSTORM_FLUSH_SHIFT 7 + u8 flags2; +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF0_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF1_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF2_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF3_SHIFT 6 + u8 flags3; +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF4_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF5_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF6_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_FLUSH_Q0_CF_MASK 0x3 +#define 
E4_XSTORM_ROCE_CONN_AG_CTX_FLUSH_Q0_CF_SHIFT 6 + u8 flags4; +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF8_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF9_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF10_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF11_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF11_SHIFT 6 + u8 flags5; +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF12_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF12_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF13_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF13_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF14_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF14_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF15_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF15_SHIFT 6 + u8 flags6; +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF16_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF16_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF17_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF17_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF18_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF18_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF19_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF19_SHIFT 6 + u8 flags7; +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF20_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF20_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF21_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF21_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_SLOW_PATH_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_SLOW_PATH_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF0EN_SHIFT 6 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF1EN_SHIFT 7 + u8 flags8; +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF2EN_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF3EN_SHIFT 1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF4EN_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF5EN_SHIFT 3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF6EN_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_FLUSH_Q0_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_FLUSH_Q0_CF_EN_SHIFT 5 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF8EN_SHIFT 6 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF9EN_SHIFT 7 + u8 flags9; +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF10EN_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF11EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF11EN_SHIFT 1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF12EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF12EN_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF13EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF13EN_SHIFT 3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF14EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF14EN_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF15EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF15EN_SHIFT 5 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF16EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF16EN_SHIFT 6 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF17EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF17EN_SHIFT 7 + u8 flags10; +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF18EN_MASK 0x1 +#define 
E4_XSTORM_ROCE_CONN_AG_CTX_CF18EN_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF19EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF19EN_SHIFT 1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF20EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF20EN_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF21EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF21EN_SHIFT 3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF23EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF23EN_SHIFT 5 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE0EN_SHIFT 6 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE1EN_SHIFT 7 + u8 flags11; +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE2EN_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE3EN_SHIFT 1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE4EN_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE5EN_SHIFT 3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE6EN_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE7EN_SHIFT 5 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED1_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED1_SHIFT 6 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE9EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE9EN_SHIFT 7 + u8 flags12; +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE10EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE10EN_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE11EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE11EN_SHIFT 1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED2_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED2_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED3_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED3_SHIFT 3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE14EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE14EN_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE15EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE15EN_SHIFT 5 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE16EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE16EN_SHIFT 6 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE17EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE17EN_SHIFT 7 + u8 flags13; +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE18EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE18EN_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE19EN_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RULE19EN_SHIFT 1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED4_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED4_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED5_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED5_SHIFT 3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED6_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED6_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED7_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED7_SHIFT 5 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED8_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED8_SHIFT 6 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED9_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4_XSTORM_ROCE_CONN_AG_CTX_MIGRATION_MASK 0x1 +#define 
E4_XSTORM_ROCE_CONN_AG_CTX_MIGRATION_SHIFT 0 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT17_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_BIT17_SHIFT 1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_DPM_PORT_NUM_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_DPM_PORT_NUM_SHIFT 2 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RESERVED_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_RESERVED_SHIFT 4 +#define E4_XSTORM_ROCE_CONN_AG_CTX_ROCE_EDPM_ENABLE_MASK 0x1 +#define E4_XSTORM_ROCE_CONN_AG_CTX_ROCE_EDPM_ENABLE_SHIFT 5 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF23_MASK 0x3 +#define E4_XSTORM_ROCE_CONN_AG_CTX_CF23_SHIFT 6 + u8 byte2; + __le16 physical_q0; + __le16 word1; + __le16 word2; + __le16 word3; + __le16 word4; + __le16 word5; + __le16 conn_dpi; + u8 byte3; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 snd_nxt_psn; + __le32 reg4; + __le32 reg5; + __le32 reg6; +}; + +struct e4_tstorm_roce_conn_ag_ctx { + u8 reserved0; + u8 byte1; + u8 flags0; +#define E4_TSTORM_ROCE_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_TSTORM_ROCE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_BIT2_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_BIT2_SHIFT 2 +#define E4_TSTORM_ROCE_CONN_AG_CTX_BIT3_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_BIT3_SHIFT 3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_BIT4_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_BIT4_SHIFT 4 +#define E4_TSTORM_ROCE_CONN_AG_CTX_BIT5_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_BIT5_SHIFT 5 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF0_SHIFT 6 + u8 flags1; +#define E4_TSTORM_ROCE_CONN_AG_CTX_MSTORM_FLUSH_CF_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_MSTORM_FLUSH_CF_SHIFT 0 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF2_SHIFT 2 +#define E4_TSTORM_ROCE_CONN_AG_CTX_TIMER_STOP_ALL_CF_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_TIMER_STOP_ALL_CF_SHIFT 4 +#define E4_TSTORM_ROCE_CONN_AG_CTX_FLUSH_Q0_CF_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_FLUSH_Q0_CF_SHIFT 6 + u8 flags2; +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF5_SHIFT 0 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF6_SHIFT 2 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF7_SHIFT 4 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF8_SHIFT 6 + u8 flags3; +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF9_SHIFT 0 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF10_SHIFT 2 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF0EN_SHIFT 4 +#define E4_TSTORM_ROCE_CONN_AG_CTX_MSTORM_FLUSH_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_MSTORM_FLUSH_CF_EN_SHIFT 5 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF2EN_SHIFT 6 +#define E4_TSTORM_ROCE_CONN_AG_CTX_TIMER_STOP_ALL_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_TIMER_STOP_ALL_CF_EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_ROCE_CONN_AG_CTX_FLUSH_Q0_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_FLUSH_Q0_CF_EN_SHIFT 0 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF5EN_SHIFT 1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF6EN_SHIFT 2 
+#define E4_TSTORM_ROCE_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF7EN_SHIFT 3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF8EN_SHIFT 4 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF9EN_SHIFT 5 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_CF10EN_SHIFT 6 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags5; +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_TSTORM_ROCE_CONN_AG_CTX_RULE8EN_SHIFT 7 + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le32 reg4; + __le32 reg5; + __le32 reg6; + __le32 reg7; + __le32 reg8; + u8 byte2; + u8 byte3; + __le16 word0; + u8 byte4; + u8 byte5; + __le16 word1; + __le16 word2; + __le16 word3; + __le32 reg9; + __le32 reg10; +}; + +/* The roce storm context of Ystorm */ +struct ystorm_roce_conn_st_ctx { + struct regpair temp[2]; +}; + +/* The roce storm context of Mstorm */ +struct pstorm_roce_conn_st_ctx { + struct regpair temp[16]; +}; + +/* The roce storm context of Xstorm */ +struct xstorm_roce_conn_st_ctx { + struct regpair temp[24]; +}; + +/* The roce storm context of Tstorm */ +struct tstorm_roce_conn_st_ctx { + struct regpair temp[30]; +}; + +/* The roce storm context of Mstorm */ +struct mstorm_roce_conn_st_ctx { + struct regpair temp[6]; +}; + +/* The roce storm context of Ystorm */ +struct ustorm_roce_conn_st_ctx { + struct regpair temp[12]; +}; + +/* roce connection context */ +struct e4_roce_conn_context { + struct ystorm_roce_conn_st_ctx ystorm_st_context; + struct regpair ystorm_st_padding[2]; + struct pstorm_roce_conn_st_ctx pstorm_st_context; + struct xstorm_roce_conn_st_ctx xstorm_st_context; + struct e4_xstorm_roce_conn_ag_ctx xstorm_ag_context; + struct e4_tstorm_roce_conn_ag_ctx tstorm_ag_context; + struct timers_context timer_context; + struct e4_ustorm_rdma_conn_ag_ctx ustorm_ag_context; + struct tstorm_roce_conn_st_ctx tstorm_st_context; + struct regpair tstorm_st_padding[2]; + struct mstorm_roce_conn_st_ctx mstorm_st_context; + struct regpair mstorm_st_padding[2]; + struct ustorm_roce_conn_st_ctx ustorm_st_context; +}; + +/* roce create qp requester ramrod data */ +struct roce_create_qp_req_ramrod_data { + __le16 flags; +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_ROCE_FLAVOR_MASK 0x3 +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_ROCE_FLAVOR_SHIFT 0 +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_FMR_AND_RESERVED_EN_MASK 0x1 +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_FMR_AND_RESERVED_EN_SHIFT 2 +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_SIGNALED_COMP_MASK 0x1 +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_SIGNALED_COMP_SHIFT 3 +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_PRI_MASK 0x7 +#define 
ROCE_CREATE_QP_REQ_RAMROD_DATA_PRI_SHIFT 4 +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_XRC_FLAG_MASK 0x1 +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_XRC_FLAG_SHIFT 7 +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT_MASK 0xF +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT_SHIFT 8 +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_RNR_NAK_CNT_MASK 0xF +#define ROCE_CREATE_QP_REQ_RAMROD_DATA_RNR_NAK_CNT_SHIFT 12 + u8 max_ord; + u8 traffic_class; + u8 hop_limit; + u8 orq_num_pages; + __le16 p_key; + __le32 flow_label; + __le32 dst_qp_id; + __le32 ack_timeout_val; + __le32 initial_psn; + __le16 mtu; + __le16 pd; + __le16 sq_num_pages; + __le16 low_latency_phy_queue; + struct regpair sq_pbl_addr; + struct regpair orq_pbl_addr; + __le16 local_mac_addr[3]; + __le16 remote_mac_addr[3]; + __le16 vlan_id; + __le16 udp_src_port; + __le32 src_gid[4]; + __le32 dst_gid[4]; + __le32 cq_cid; + struct regpair qp_handle_for_cqe; + struct regpair qp_handle_for_async; + u8 stats_counter_id; + u8 reserved3[7]; + __le16 regular_latency_phy_queue; + __le16 dpi; +}; + +/* roce create qp responder ramrod data */ +struct roce_create_qp_resp_ramrod_data { + __le32 flags; +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_ROCE_FLAVOR_MASK 0x3 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_ROCE_FLAVOR_SHIFT 0 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RDMA_RD_EN_MASK 0x1 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RDMA_RD_EN_SHIFT 2 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RDMA_WR_EN_MASK 0x1 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RDMA_WR_EN_SHIFT 3 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_ATOMIC_EN_MASK 0x1 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_ATOMIC_EN_SHIFT 4 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_SRQ_FLG_MASK 0x1 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_SRQ_FLG_SHIFT 5 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_E2E_FLOW_CONTROL_EN_MASK 0x1 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_E2E_FLOW_CONTROL_EN_SHIFT 6 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RESERVED_KEY_EN_MASK 0x1 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RESERVED_KEY_EN_SHIFT 7 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_PRI_MASK 0x7 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_PRI_SHIFT 8 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER_MASK 0x1F +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER_SHIFT 11 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_XRC_FLAG_MASK 0x1 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_XRC_FLAG_SHIFT 16 +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RESERVED_MASK 0x7FFF +#define ROCE_CREATE_QP_RESP_RAMROD_DATA_RESERVED_SHIFT 17 + __le16 xrc_domain; + u8 max_ird; + u8 traffic_class; + u8 hop_limit; + u8 irq_num_pages; + __le16 p_key; + __le32 flow_label; + __le32 dst_qp_id; + u8 stats_counter_id; + u8 reserved1; + __le16 mtu; + __le32 initial_psn; + __le16 pd; + __le16 rq_num_pages; + struct rdma_srq_id srq_id; + struct regpair rq_pbl_addr; + struct regpair irq_pbl_addr; + __le16 local_mac_addr[3]; + __le16 remote_mac_addr[3]; + __le16 vlan_id; + __le16 udp_src_port; + __le32 src_gid[4]; + __le32 dst_gid[4]; + struct regpair qp_handle_for_cqe; + struct regpair qp_handle_for_async; + __le16 low_latency_phy_queue; + u8 reserved2[2]; + __le32 cq_cid; + __le16 regular_latency_phy_queue; + __le16 dpi; +}; + +/* roce DCQCN received statistics */ +struct roce_dcqcn_received_stats { + struct regpair ecn_pkt_rcv; + struct regpair cnp_pkt_rcv; +}; + +/* roce DCQCN sent statistics */ +struct roce_dcqcn_sent_stats { + struct regpair cnp_pkt_sent; +}; + +/* RoCE destroy qp requester output params */ +struct roce_destroy_qp_req_output_params { + __le32 num_bound_mw; + 
__le32 cq_prod; +}; + +/* RoCE destroy qp requester ramrod data */ +struct roce_destroy_qp_req_ramrod_data { + struct regpair output_params_addr; +}; + +/* RoCE destroy qp responder output params */ +struct roce_destroy_qp_resp_output_params { + __le32 num_invalidated_mw; + __le32 cq_prod; +}; + +/* RoCE destroy qp responder ramrod data */ +struct roce_destroy_qp_resp_ramrod_data { + struct regpair output_params_addr; +}; + +/* roce special events statistics */ +struct roce_events_stats { + __le16 silent_drops; + __le16 rnr_naks_sent; + __le32 retransmit_count; + __le32 icrc_error_count; + __le32 reserved; +}; + +/* ROCE slow path EQ cmd IDs */ +enum roce_event_opcode { + ROCE_EVENT_CREATE_QP = 11, + ROCE_EVENT_MODIFY_QP, + ROCE_EVENT_QUERY_QP, + ROCE_EVENT_DESTROY_QP, + ROCE_EVENT_CREATE_UD_QP, + ROCE_EVENT_DESTROY_UD_QP, + MAX_ROCE_EVENT_OPCODE +}; + +/* roce func init ramrod data */ +struct roce_init_func_params { + u8 ll2_queue_id; + u8 cnp_vlan_priority; + u8 cnp_dscp; + u8 reserved; + __le32 cnp_send_timeout; +}; + +/* roce func init ramrod data */ +struct roce_init_func_ramrod_data { + struct rdma_init_func_ramrod_data rdma; + struct roce_init_func_params roce; +}; + +/* roce modify qp requester ramrod data */ +struct roce_modify_qp_req_ramrod_data { + __le16 flags; +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_MOVE_TO_ERR_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_MOVE_TO_ERR_FLG_SHIFT 0 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_MOVE_TO_SQD_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_MOVE_TO_SQD_FLG_SHIFT 1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_EN_SQD_ASYNC_NOTIFY_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_EN_SQD_ASYNC_NOTIFY_SHIFT 2 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_P_KEY_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_P_KEY_FLG_SHIFT 3 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_ADDRESS_VECTOR_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_ADDRESS_VECTOR_FLG_SHIFT 4 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_MAX_ORD_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_MAX_ORD_FLG_SHIFT 5 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_RNR_NAK_CNT_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_RNR_NAK_CNT_FLG_SHIFT 6 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT_FLG_SHIFT 7 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_ACK_TIMEOUT_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_ACK_TIMEOUT_FLG_SHIFT 8 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_PRI_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_PRI_FLG_SHIFT 9 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_PRI_MASK 0x7 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_PRI_SHIFT 10 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_PHYSICAL_QUEUES_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_PHYSICAL_QUEUES_FLG_SHIFT 13 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_RESERVED1_MASK 0x3 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_RESERVED1_SHIFT 14 + u8 fields; +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT_MASK 0xF +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT_SHIFT 0 +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_RNR_NAK_CNT_MASK 0xF +#define ROCE_MODIFY_QP_REQ_RAMROD_DATA_RNR_NAK_CNT_SHIFT 4 + u8 max_ord; + u8 traffic_class; + u8 hop_limit; + __le16 p_key; + __le32 flow_label; + __le32 ack_timeout_val; + __le16 mtu; + __le16 reserved2; + __le32 reserved3[2]; + __le16 low_latency_phy_queue; + __le16 regular_latency_phy_queue; + __le32 src_gid[4]; + __le32 dst_gid[4]; +}; + +/* roce modify qp responder ramrod data */ +struct 
roce_modify_qp_resp_ramrod_data { + __le16 flags; +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_MOVE_TO_ERR_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_MOVE_TO_ERR_FLG_SHIFT 0 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_RD_EN_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_RD_EN_SHIFT 1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_WR_EN_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_WR_EN_SHIFT 2 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_ATOMIC_EN_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_ATOMIC_EN_SHIFT 3 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_P_KEY_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_P_KEY_FLG_SHIFT 4 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_ADDRESS_VECTOR_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_ADDRESS_VECTOR_FLG_SHIFT 5 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_MAX_IRD_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_MAX_IRD_FLG_SHIFT 6 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_PRI_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_PRI_FLG_SHIFT 7 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER_FLG_SHIFT 8 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_OPS_EN_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_OPS_EN_FLG_SHIFT 9 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_PHYSICAL_QUEUES_FLG_MASK 0x1 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_PHYSICAL_QUEUES_FLG_SHIFT 10 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_RESERVED1_MASK 0x1F +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_RESERVED1_SHIFT 11 + u8 fields; +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_PRI_MASK 0x7 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_PRI_SHIFT 0 +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER_MASK 0x1F +#define ROCE_MODIFY_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER_SHIFT 3 + u8 max_ird; + u8 traffic_class; + u8 hop_limit; + __le16 p_key; + __le32 flow_label; + __le16 mtu; + __le16 low_latency_phy_queue; + __le16 regular_latency_phy_queue; + u8 reserved2[6]; + __le32 src_gid[4]; + __le32 dst_gid[4]; +}; + +/* RoCE query qp requester output params */ +struct roce_query_qp_req_output_params { + __le32 psn; + __le32 flags; +#define ROCE_QUERY_QP_REQ_OUTPUT_PARAMS_ERR_FLG_MASK 0x1 +#define ROCE_QUERY_QP_REQ_OUTPUT_PARAMS_ERR_FLG_SHIFT 0 +#define ROCE_QUERY_QP_REQ_OUTPUT_PARAMS_SQ_DRAINING_FLG_MASK 0x1 +#define ROCE_QUERY_QP_REQ_OUTPUT_PARAMS_SQ_DRAINING_FLG_SHIFT 1 +#define ROCE_QUERY_QP_REQ_OUTPUT_PARAMS_RESERVED0_MASK 0x3FFFFFFF +#define ROCE_QUERY_QP_REQ_OUTPUT_PARAMS_RESERVED0_SHIFT 2 +}; + +/* RoCE query qp requester ramrod data */ +struct roce_query_qp_req_ramrod_data { + struct regpair output_params_addr; +}; + +/* RoCE query qp responder output params */ +struct roce_query_qp_resp_output_params { + __le32 psn; + __le32 err_flag; +#define ROCE_QUERY_QP_RESP_OUTPUT_PARAMS_ERROR_FLG_MASK 0x1 +#define ROCE_QUERY_QP_RESP_OUTPUT_PARAMS_ERROR_FLG_SHIFT 0 +#define ROCE_QUERY_QP_RESP_OUTPUT_PARAMS_RESERVED0_MASK 0x7FFFFFFF +#define ROCE_QUERY_QP_RESP_OUTPUT_PARAMS_RESERVED0_SHIFT 1 +}; + +/* RoCE query qp responder ramrod data */ +struct roce_query_qp_resp_ramrod_data { + struct regpair output_params_addr; +}; + +/* ROCE ramrod command IDs */ +enum roce_ramrod_cmd_id { + ROCE_RAMROD_CREATE_QP = 11, + ROCE_RAMROD_MODIFY_QP, + ROCE_RAMROD_QUERY_QP, + ROCE_RAMROD_DESTROY_QP, + ROCE_RAMROD_CREATE_UD_QP, + ROCE_RAMROD_DESTROY_UD_QP, + MAX_ROCE_RAMROD_CMD_ID +}; + +struct e4_xstorm_roce_conn_ag_ctx_dq_ext_ld_part { + u8 reserved0; + u8 state; + u8 flags0; +#define 
E4XSTORMROCECONNAGCTXDQEXTLDPART_EXIST_IN_QM0_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_EXIST_IN_QM0_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT1_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT1_SHIFT 1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT2_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT2_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_EXIST_IN_QM3_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_EXIST_IN_QM3_SHIFT 3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT4_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT4_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT5_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT5_SHIFT 5 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT6_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT6_SHIFT 6 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT7_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT7_SHIFT 7 + u8 flags1; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT8_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT8_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT9_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT9_SHIFT 1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT10_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT10_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT11_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT11_SHIFT 3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT12_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT12_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_MSEM_FLUSH_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_MSEM_FLUSH_SHIFT 5 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_MSDM_FLUSH_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_MSDM_FLUSH_SHIFT 6 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_YSTORM_FLUSH_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_YSTORM_FLUSH_SHIFT 7 + u8 flags2; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF0_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF0_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF1_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF1_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF2_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF2_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF3_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF3_SHIFT 6 + u8 flags3; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF4_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF4_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF5_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF5_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF6_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF6_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_FLUSH_Q0_CF_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_FLUSH_Q0_CF_SHIFT 6 + u8 flags4; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF8_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF8_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF9_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF9_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF10_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF10_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF11_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF11_SHIFT 6 + u8 flags5; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF12_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF12_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF13_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF13_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF14_MASK 0x3 +#define 
E4XSTORMROCECONNAGCTXDQEXTLDPART_CF14_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF15_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF15_SHIFT 6 + u8 flags6; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF16_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF16_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF17_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF17_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF18_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF18_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF19_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF19_SHIFT 6 + u8 flags7; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF20_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF20_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF21_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF21_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_SLOW_PATH_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_SLOW_PATH_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF0EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF0EN_SHIFT 6 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF1EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF1EN_SHIFT 7 + u8 flags8; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF2EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF2EN_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF3EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF3EN_SHIFT 1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF4EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF4EN_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF5EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF5EN_SHIFT 3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF6EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF6EN_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_FLUSH_Q0_CF_EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_FLUSH_Q0_CF_EN_SHIFT 5 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF8EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF8EN_SHIFT 6 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF9EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF9EN_SHIFT 7 + u8 flags9; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF10EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF10EN_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF11EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF11EN_SHIFT 1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF12EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF12EN_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF13EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF13EN_SHIFT 3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF14EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF14EN_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF15EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF15EN_SHIFT 5 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF16EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF16EN_SHIFT 6 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF17EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF17EN_SHIFT 7 + u8 flags10; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF18EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF18EN_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF19EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF19EN_SHIFT 1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF20EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF20EN_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF21EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF21EN_SHIFT 3 +#define 
E4XSTORMROCECONNAGCTXDQEXTLDPART_SLOW_PATH_EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_SLOW_PATH_EN_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF23EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF23EN_SHIFT 5 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE0EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE0EN_SHIFT 6 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE1EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE1EN_SHIFT 7 + u8 flags11; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE2EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE2EN_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE3EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE3EN_SHIFT 1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE4EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE4EN_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE5EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE5EN_SHIFT 3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE6EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE6EN_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE7EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE7EN_SHIFT 5 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED1_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED1_SHIFT 6 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE9EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE9EN_SHIFT 7 + u8 flags12; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE10EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE10EN_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE11EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE11EN_SHIFT 1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED2_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED2_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED3_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED3_SHIFT 3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE14EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE14EN_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE15EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE15EN_SHIFT 5 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE16EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE16EN_SHIFT 6 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE17EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE17EN_SHIFT 7 + u8 flags13; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE18EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE18EN_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE19EN_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RULE19EN_SHIFT 1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED4_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED4_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED5_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED5_SHIFT 3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED6_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED6_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED7_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED7_SHIFT 5 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED8_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED8_SHIFT 6 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED9_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_MIGRATION_MASK 0x1 +#define 
E4XSTORMROCECONNAGCTXDQEXTLDPART_MIGRATION_SHIFT 0 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT17_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_BIT17_SHIFT 1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_DPM_PORT_NUM_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_DPM_PORT_NUM_SHIFT 2 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RESERVED_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_RESERVED_SHIFT 4 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_ROCE_EDPM_ENABLE_MASK 0x1 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_ROCE_EDPM_ENABLE_SHIFT 5 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF23_MASK 0x3 +#define E4XSTORMROCECONNAGCTXDQEXTLDPART_CF23_SHIFT 6 + u8 byte2; + __le16 physical_q0; + __le16 word1; + __le16 word2; + __le16 word3; + __le16 word4; + __le16 word5; + __le16 conn_dpi; + u8 byte3; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 snd_nxt_psn; + __le32 reg4; +}; + +struct e4_mstorm_roce_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_MSTORM_ROCE_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_MSTORM_ROCE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_MSTORM_ROCE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_MSTORM_ROCE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_MSTORM_ROCE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_MSTORM_ROCE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_MSTORM_ROCE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_ROCE_CONN_AG_CTX_RULE4EN_SHIFT 7 + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; +}; + +struct e4_mstorm_roce_req_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define 
E4_MSTORM_ROCE_REQ_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_ROCE_REQ_CONN_AG_CTX_RULE4EN_SHIFT 7 + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; +}; + +struct e4_mstorm_roce_resp_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_ROCE_RESP_CONN_AG_CTX_RULE4EN_SHIFT 7 + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; +}; + +struct e4_tstorm_roce_req_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RX_ERROR_OCCURRED_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RX_ERROR_OCCURRED_SHIFT 1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TX_CQE_ERROR_OCCURRED_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TX_CQE_ERROR_OCCURRED_SHIFT 2 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_BIT3_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_BIT3_SHIFT 3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_MSTORM_FLUSH_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_MSTORM_FLUSH_SHIFT 4 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_CACHED_ORQ_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_CACHED_ORQ_SHIFT 5 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TIMER_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TIMER_CF_SHIFT 6 + u8 flags1; +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_MSTORM_FLUSH_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_MSTORM_FLUSH_CF_SHIFT 0 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_SQ_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_SQ_CF_SHIFT 2 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TIMER_STOP_ALL_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TIMER_STOP_ALL_CF_SHIFT 4 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_Q0_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_Q0_CF_SHIFT 6 + u8 flags2; +#define 
E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FORCE_COMP_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FORCE_COMP_CF_SHIFT 0 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_SET_TIMER_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_SET_TIMER_CF_SHIFT 2 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TX_ASYNC_ERROR_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TX_ASYNC_ERROR_CF_SHIFT 4 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RXMIT_DONE_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RXMIT_DONE_CF_SHIFT 6 + u8 flags3; +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_ERROR_SCAN_COMPLETED_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_ERROR_SCAN_COMPLETED_CF_SHIFT 0 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_SQ_DRAIN_COMPLETED_CF_MASK 0x3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_SQ_DRAIN_COMPLETED_CF_SHIFT 2 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TIMER_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TIMER_CF_EN_SHIFT 4 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_MSTORM_FLUSH_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_MSTORM_FLUSH_CF_EN_SHIFT 5 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_SQ_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_SQ_CF_EN_SHIFT 6 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TIMER_STOP_ALL_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TIMER_STOP_ALL_CF_EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_Q0_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_Q0_CF_EN_SHIFT 0 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FORCE_COMP_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_FORCE_COMP_CF_EN_SHIFT 1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_SET_TIMER_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_SET_TIMER_CF_EN_SHIFT 2 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TX_ASYNC_ERROR_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_TX_ASYNC_ERROR_CF_EN_SHIFT 3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RXMIT_DONE_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RXMIT_DONE_CF_EN_SHIFT 4 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_ERROR_SCAN_COMPLETED_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_ERROR_SCAN_COMPLETED_CF_EN_SHIFT 5 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_SQ_DRAIN_COMPLETED_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_SQ_DRAIN_COMPLETED_CF_EN_SHIFT 6 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags5; +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_SND_SQ_CONS_EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_SND_SQ_CONS_EN_SHIFT 5 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_TSTORM_ROCE_REQ_CONN_AG_CTX_RULE8EN_SHIFT 7 + __le32 reg0; + __le32 snd_nxt_psn; + __le32 snd_max_psn; + __le32 orq_prod; + __le32 reg4; + __le32 reg5; + __le32 reg6; + __le32 reg7; + __le32 reg8; + u8 tx_cqe_error_type; + u8 orq_cache_idx; + __le16 snd_sq_cons_th; + 
u8 byte4; + u8 byte5; + __le16 snd_sq_cons; + __le16 conn_dpi; + __le16 force_comp_cons; + __le32 reg9; + __le32 reg10; +}; + +struct e4_tstorm_roce_resp_conn_ag_ctx { + u8 byte0; + u8 state; + u8 flags0; +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RX_ERROR_NOTIFY_REQUESTER_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RX_ERROR_NOTIFY_REQUESTER_SHIFT 1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_BIT2_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_BIT2_SHIFT 2 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_BIT3_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_BIT3_SHIFT 3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_MSTORM_FLUSH_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_MSTORM_FLUSH_SHIFT 4 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_BIT5_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_BIT5_SHIFT 5 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF0_SHIFT 6 + u8 flags1; +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_MSTORM_FLUSH_CF_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_MSTORM_FLUSH_CF_SHIFT 0 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_TX_ERROR_CF_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_TX_ERROR_CF_SHIFT 2 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF3_SHIFT 4 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_FLUSH_Q0_CF_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_FLUSH_Q0_CF_SHIFT 6 + u8 flags2; +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RX_ERROR_CF_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RX_ERROR_CF_SHIFT 0 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF6_SHIFT 2 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF7_SHIFT 4 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF8_SHIFT 6 + u8 flags3; +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF9_SHIFT 0 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF10_SHIFT 2 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF0EN_SHIFT 4 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_MSTORM_FLUSH_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_MSTORM_FLUSH_CF_EN_SHIFT 5 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_TX_ERROR_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_TX_ERROR_CF_EN_SHIFT 6 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF3EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_FLUSH_Q0_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_FLUSH_Q0_CF_EN_SHIFT 0 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RX_ERROR_CF_EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RX_ERROR_CF_EN_SHIFT 1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF6EN_SHIFT 2 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF7EN_SHIFT 3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF8EN_SHIFT 4 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF9EN_SHIFT 5 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_CF10EN_SHIFT 6 +#define 
E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags5; +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RQ_RULE_EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RQ_RULE_EN_SHIFT 5 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_TSTORM_ROCE_RESP_CONN_AG_CTX_RULE8EN_SHIFT 7 + __le32 psn_and_rxmit_id_echo; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le32 reg4; + __le32 reg5; + __le32 reg6; + __le32 reg7; + __le32 reg8; + u8 tx_async_error_type; + u8 byte3; + __le16 rq_cons; + u8 byte4; + u8 byte5; + __le16 rq_prod; + __le16 conn_dpi; + __le16 irq_cons; + __le32 num_invlidated_mw; + __le32 reg10; +}; + +struct e4_ustorm_roce_req_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF3_SHIFT 0 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF4_SHIFT 2 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF5_SHIFT 4 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF6_SHIFT 6 + u8 flags2; +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF3EN_SHIFT 3 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF4EN_SHIFT 4 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF5EN_SHIFT 5 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_CF6EN_SHIFT 6 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags3; +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_SHIFT 1 
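/*
 * Illustrative sketch (not taken from this patch): every "flagsN" byte in
 * the aggregative-context structures above packs several sub-fields, each
 * described by a <NAME>_MASK / <NAME>_SHIFT pair.  The helpers below are
 * hypothetical and exist only to show how such a pair is typically read and
 * written; they assume the kernel u8 type from <linux/types.h>, as the
 * surrounding structures already do.
 */
static inline u8 e4_ctx_get_field(u8 flags, u8 mask, u8 shift)
{
	/* Shift the sub-field down to bit 0, then drop the neighbouring bits. */
	return (flags >> shift) & mask;
}

static inline u8 e4_ctx_set_field(u8 flags, u8 mask, u8 shift, u8 val)
{
	/* Clear the sub-field, then OR in the new value truncated to its width. */
	flags &= ~(mask << shift);
	return flags | ((val & mask) << shift);
}

/*
 * Usage sketch, again purely illustrative: enable RULE2EN in flags3 of
 * struct e4_ustorm_roce_req_conn_ag_ctx.
 *
 *	ctx->flags3 = e4_ctx_set_field(ctx->flags3,
 *				       E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_MASK,
 *				       E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_SHIFT,
 *				       1);
 */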
+#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_USTORM_ROCE_REQ_CONN_AG_CTX_RULE8EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le16 word2; + __le16 word3; +}; + +struct e4_ustorm_roce_resp_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF3_SHIFT 0 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF4_SHIFT 2 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF5_SHIFT 4 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF6_SHIFT 6 + u8 flags2; +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF3EN_SHIFT 3 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF4EN_SHIFT 4 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF5EN_SHIFT 5 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_CF6EN_SHIFT 6 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags3; +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE7EN_MASK 
0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_USTORM_ROCE_RESP_CONN_AG_CTX_RULE8EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le16 word2; + __le16 word3; +}; + +struct e4_xstorm_roce_req_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED1_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED1_SHIFT 1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED2_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED2_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_EXIST_IN_QM3_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_EXIST_IN_QM3_SHIFT 3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED3_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED3_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED4_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED4_SHIFT 5 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED5_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED5_SHIFT 6 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED6_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED6_SHIFT 7 + u8 flags1; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED7_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED7_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED8_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED8_SHIFT 1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_BIT10_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_BIT10_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_BIT11_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_BIT11_SHIFT 3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_BIT12_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_BIT12_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_BIT13_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_BIT13_SHIFT 5 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_ERROR_STATE_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_ERROR_STATE_SHIFT 6 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_YSTORM_FLUSH_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_YSTORM_FLUSH_SHIFT 7 + u8 flags2; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF0_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF1_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF2_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF3_SHIFT 6 + u8 flags3; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SQ_FLUSH_CF_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SQ_FLUSH_CF_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RX_ERROR_CF_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RX_ERROR_CF_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SND_RXMIT_CF_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SND_RXMIT_CF_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_Q0_CF_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_Q0_CF_SHIFT 6 + u8 flags4; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_DIF_ERROR_CF_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_DIF_ERROR_CF_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SCAN_SQ_FOR_COMP_CF_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SCAN_SQ_FOR_COMP_CF_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF10_MASK 0x3 +#define 
E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF10_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF11_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF11_SHIFT 6 + u8 flags5; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF12_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF12_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF13_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF13_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_FMR_ENDED_CF_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_FMR_ENDED_CF_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF15_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF15_SHIFT 6 + u8 flags6; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF16_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF16_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF17_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF17_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF18_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF18_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF19_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF19_SHIFT 6 + u8 flags7; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF20_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF20_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF21_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF21_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SLOW_PATH_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SLOW_PATH_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF0EN_SHIFT 6 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF1EN_SHIFT 7 + u8 flags8; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF2EN_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF3EN_SHIFT 1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SQ_FLUSH_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SQ_FLUSH_CF_EN_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RX_ERROR_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RX_ERROR_CF_EN_SHIFT 3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SND_RXMIT_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SND_RXMIT_CF_EN_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_Q0_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_FLUSH_Q0_CF_EN_SHIFT 5 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_DIF_ERROR_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_DIF_ERROR_CF_EN_SHIFT 6 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SCAN_SQ_FOR_COMP_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SCAN_SQ_FOR_COMP_CF_EN_SHIFT 7 + u8 flags9; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF10EN_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF11EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF11EN_SHIFT 1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF12EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF12EN_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF13EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF13EN_SHIFT 3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_FME_ENDED_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_FME_ENDED_CF_EN_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF15EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF15EN_SHIFT 5 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF16EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF16EN_SHIFT 6 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF17EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF17EN_SHIFT 7 + u8 
flags10; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF18EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF18EN_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF19EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF19EN_SHIFT 1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF20EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF20EN_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF21EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF21EN_SHIFT 3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF23EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF23EN_SHIFT 5 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE0EN_SHIFT 6 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE1EN_SHIFT 7 + u8 flags11; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE3EN_SHIFT 1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE4EN_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE5EN_SHIFT 3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE6EN_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_E2E_CREDIT_RULE_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_E2E_CREDIT_RULE_EN_SHIFT 5 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED1_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED1_SHIFT 6 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE9EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE9EN_SHIFT 7 + u8 flags12; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SQ_PROD_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_SQ_PROD_EN_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE11EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE11EN_SHIFT 1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED2_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED2_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED3_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED3_SHIFT 3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_INV_FENCE_RULE_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_INV_FENCE_RULE_EN_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE15EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE15EN_SHIFT 5 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_ORQ_FENCE_RULE_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_ORQ_FENCE_RULE_EN_SHIFT 6 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_MAX_ORD_RULE_EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_MAX_ORD_RULE_EN_SHIFT 7 + u8 flags13; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE18EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE18EN_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE19EN_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RULE19EN_SHIFT 1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED4_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED4_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED5_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED5_SHIFT 3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED6_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED6_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED7_MASK 0x1 +#define 
E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED7_SHIFT 5 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED8_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED8_SHIFT 6 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED9_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_MIGRATION_FLAG_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_MIGRATION_FLAG_SHIFT 0 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_BIT17_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_BIT17_SHIFT 1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_DPM_PORT_NUM_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_DPM_PORT_NUM_SHIFT 2 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_RESERVED_SHIFT 4 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_ROCE_EDPM_ENABLE_MASK 0x1 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_ROCE_EDPM_ENABLE_SHIFT 5 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF23_MASK 0x3 +#define E4_XSTORM_ROCE_REQ_CONN_AG_CTX_CF23_SHIFT 6 + u8 byte2; + __le16 physical_q0; + __le16 word1; + __le16 sq_cmp_cons; + __le16 sq_cons; + __le16 sq_prod; + __le16 dif_error_first_sq_cons; + __le16 conn_dpi; + u8 dif_error_sge_index; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 lsn; + __le32 ssn; + __le32 snd_una_psn; + __le32 snd_nxt_psn; + __le32 dif_error_offset; + __le32 orq_cons_th; + __le32 orq_cons; +}; + +struct e4_xstorm_roce_resp_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED1_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED1_SHIFT 1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED2_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED2_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_EXIST_IN_QM3_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_EXIST_IN_QM3_SHIFT 3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED3_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED3_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED4_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED4_SHIFT 5 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED5_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED5_SHIFT 6 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED6_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED6_SHIFT 7 + u8 flags1; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED7_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED7_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED8_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RESERVED8_SHIFT 1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT10_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT10_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT11_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT11_SHIFT 3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT12_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT12_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT13_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT13_SHIFT 5 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_ERROR_STATE_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_ERROR_STATE_SHIFT 6 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_YSTORM_FLUSH_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_YSTORM_FLUSH_SHIFT 7 + u8 flags2; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF0_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF1_MASK 
0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF1_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF2_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF3_SHIFT 6 + u8 flags3; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RXMIT_CF_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RXMIT_CF_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RX_ERROR_CF_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RX_ERROR_CF_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_FORCE_ACK_CF_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_FORCE_ACK_CF_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_FLUSH_Q0_CF_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_FLUSH_Q0_CF_SHIFT 6 + u8 flags4; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF8_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF9_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF10_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF11_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF11_SHIFT 6 + u8 flags5; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF12_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF12_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF13_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF13_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF14_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF14_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF15_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF15_SHIFT 6 + u8 flags6; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF16_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF16_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF17_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF17_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF18_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF18_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF19_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF19_SHIFT 6 + u8 flags7; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF20_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF20_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF21_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF21_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_SLOW_PATH_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_SLOW_PATH_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF0EN_SHIFT 6 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF1EN_SHIFT 7 + u8 flags8; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF2EN_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF3EN_SHIFT 1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RXMIT_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RXMIT_CF_EN_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RX_ERROR_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RX_ERROR_CF_EN_SHIFT 3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_FORCE_ACK_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_FORCE_ACK_CF_EN_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_FLUSH_Q0_CF_EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_FLUSH_Q0_CF_EN_SHIFT 5 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF8EN_SHIFT 6 +#define 
E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF9EN_SHIFT 7 + u8 flags9; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF10EN_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF11EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF11EN_SHIFT 1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF12EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF12EN_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF13EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF13EN_SHIFT 3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF14EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF14EN_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF15EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF15EN_SHIFT 5 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF16EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF16EN_SHIFT 6 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF17EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF17EN_SHIFT 7 + u8 flags10; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF18EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF18EN_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF19EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF19EN_SHIFT 1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF20EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF20EN_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF21EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF21EN_SHIFT 3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF23EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF23EN_SHIFT 5 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE0EN_SHIFT 6 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE1EN_SHIFT 7 + u8 flags11; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE2EN_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE3EN_SHIFT 1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE4EN_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE5EN_SHIFT 3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE6EN_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE7EN_SHIFT 5 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED1_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED1_SHIFT 6 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE9EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE9EN_SHIFT 7 + u8 flags12; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_IRQ_PROD_RULE_EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_IRQ_PROD_RULE_EN_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE11EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE11EN_SHIFT 1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED2_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED2_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED3_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED3_SHIFT 3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE14EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE14EN_SHIFT 4 +#define 
E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE15EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE15EN_SHIFT 5 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE16EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE16EN_SHIFT 6 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE17EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE17EN_SHIFT 7 + u8 flags13; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE18EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE18EN_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE19EN_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_RULE19EN_SHIFT 1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED4_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED4_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED5_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED5_SHIFT 3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED6_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED6_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED7_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED7_SHIFT 5 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED8_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED8_SHIFT 6 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED9_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT16_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT16_SHIFT 0 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT17_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT17_SHIFT 1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT18_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT18_SHIFT 2 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT19_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT19_SHIFT 3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT20_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT20_SHIFT 4 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT21_MASK 0x1 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_BIT21_SHIFT 5 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF23_MASK 0x3 +#define E4_XSTORM_ROCE_RESP_CONN_AG_CTX_CF23_SHIFT 6 + u8 byte2; + __le16 physical_q0; + __le16 irq_prod_shadow; + __le16 word2; + __le16 irq_cons; + __le16 irq_prod; + __le16 e5_reserved1; + __le16 conn_dpi; + u8 rxmit_opcode; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 rxmit_psn_and_id; + __le32 rxmit_bytes_length; + __le32 psn; + __le32 reg3; + __le32 reg4; + __le32 reg5; + __le32 msn_and_syndrome; +}; + +struct e4_ystorm_roce_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_YSTORM_ROCE_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_YSTORM_ROCE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_YSTORM_ROCE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_YSTORM_ROCE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define 
E4_YSTORM_ROCE_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_YSTORM_ROCE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_YSTORM_ROCE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_YSTORM_ROCE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_YSTORM_ROCE_CONN_AG_CTX_RULE4EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le32 reg0; + __le32 reg1; + __le16 word1; + __le16 word2; + __le16 word3; + __le16 word4; + __le32 reg2; + __le32 reg3; +}; + +struct e4_ystorm_roce_req_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_YSTORM_ROCE_REQ_CONN_AG_CTX_RULE4EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le32 reg0; + __le32 reg1; + __le16 word1; + __le16 word2; + __le16 word3; + __le16 word4; + __le32 reg2; + __le32 reg3; +}; + +struct e4_ystorm_roce_resp_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define 
E4_YSTORM_ROCE_RESP_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_YSTORM_ROCE_RESP_CONN_AG_CTX_RULE4EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le32 reg0; + __le32 reg1; + __le16 word1; + __le16 word2; + __le16 word3; + __le16 word4; + __le32 reg2; + __le32 reg3; +}; + +/* Roce doorbell data */ +enum roce_flavor { + PLAIN_ROCE, + RROCE_IPV4, + RROCE_IPV6, + MAX_ROCE_FLAVOR +}; + +/* The iwarp storm context of Ystorm */ +struct ystorm_iwarp_conn_st_ctx { + __le32 reserved[4]; +}; + +/* The iwarp storm context of Pstorm */ +struct pstorm_iwarp_conn_st_ctx { + __le32 reserved[36]; +}; + +/* The iwarp storm context of Xstorm */ +struct xstorm_iwarp_conn_st_ctx { + __le32 reserved[48]; +}; + +struct e4_xstorm_iwarp_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_XSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM1_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM1_SHIFT 1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM2_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM2_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM3_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM3_SHIFT 3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT4_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT4_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RESERVED2_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RESERVED2_SHIFT 5 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT6_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT6_SHIFT 6 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT7_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT7_SHIFT 7 + u8 flags1; +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT8_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT8_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT9_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT9_SHIFT 1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT10_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT10_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT11_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT11_SHIFT 3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT12_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT12_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT13_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT13_SHIFT 5 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT14_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT14_SHIFT 6 +#define E4_XSTORM_IWARP_CONN_AG_CTX_YSTORM_FLUSH_OR_REWIND_SND_MAX_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_YSTORM_FLUSH_OR_REWIND_SND_MAX_SHIFT 7 + u8 flags2; +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF0_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF1_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF2_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_TIMER_STOP_ALL_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_TIMER_STOP_ALL_SHIFT 6 + u8 flags3; +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF4_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF5_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF6_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF7_SHIFT 6 + u8 flags4; +#define 
E4_XSTORM_IWARP_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF8_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF9_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF10_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF11_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF11_SHIFT 6 + u8 flags5; +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF12_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF12_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF13_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF13_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SQ_FLUSH_CF_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SQ_FLUSH_CF_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF15_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF15_SHIFT 6 + u8 flags6; +#define E4_XSTORM_IWARP_CONN_AG_CTX_MPA_OR_ERROR_WAKEUP_TRIGGER_CF_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_MPA_OR_ERROR_WAKEUP_TRIGGER_CF_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF17_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF17_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF18_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF18_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_DQ_FLUSH_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_DQ_FLUSH_SHIFT 6 + u8 flags7; +#define E4_XSTORM_IWARP_CONN_AG_CTX_FLUSH_Q0_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_FLUSH_Q0_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_FLUSH_Q1_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_FLUSH_Q1_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SLOW_PATH_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SLOW_PATH_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF0EN_SHIFT 6 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF1EN_SHIFT 7 + u8 flags8; +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF2EN_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_TIMER_STOP_ALL_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_TIMER_STOP_ALL_EN_SHIFT 1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF4EN_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF5EN_SHIFT 3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF6EN_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF7EN_SHIFT 5 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF8EN_SHIFT 6 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF9EN_SHIFT 7 + u8 flags9; +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF10EN_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF11EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF11EN_SHIFT 1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF12EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF12EN_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF13EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF13EN_SHIFT 3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SQ_FLUSH_CF_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SQ_FLUSH_CF_EN_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF15EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF15EN_SHIFT 5 +#define E4_XSTORM_IWARP_CONN_AG_CTX_MPA_OR_ERROR_WAKEUP_TRIGGER_CF_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_MPA_OR_ERROR_WAKEUP_TRIGGER_CF_EN_SHIFT 6 +#define 
E4_XSTORM_IWARP_CONN_AG_CTX_CF17EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF17EN_SHIFT 7 + u8 flags10; +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF18EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_CF18EN_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_DQ_FLUSH_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_DQ_FLUSH_EN_SHIFT 1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_FLUSH_Q0_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_FLUSH_Q0_EN_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_FLUSH_Q1_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_FLUSH_Q1_EN_SHIFT 3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SEND_TERMINATE_CF_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SEND_TERMINATE_CF_EN_SHIFT 5 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE0EN_SHIFT 6 +#define E4_XSTORM_IWARP_CONN_AG_CTX_MORE_TO_SEND_RULE_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_MORE_TO_SEND_RULE_EN_SHIFT 7 + u8 flags11; +#define E4_XSTORM_IWARP_CONN_AG_CTX_TX_BLOCKED_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_TX_BLOCKED_EN_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE3EN_SHIFT 1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RESERVED3_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RESERVED3_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE5EN_SHIFT 3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE6EN_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE7EN_SHIFT 5 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED1_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED1_SHIFT 6 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE9EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE9EN_SHIFT 7 + u8 flags12; +#define E4_XSTORM_IWARP_CONN_AG_CTX_SQ_NOT_EMPTY_RULE_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SQ_NOT_EMPTY_RULE_EN_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE11EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE11EN_SHIFT 1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED2_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED2_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED3_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED3_SHIFT 3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SQ_FENCE_RULE_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SQ_FENCE_RULE_EN_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE15EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE15EN_SHIFT 5 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE16EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE16EN_SHIFT 6 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE17EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE17EN_SHIFT 7 + u8 flags13; +#define E4_XSTORM_IWARP_CONN_AG_CTX_IRQ_NOT_EMPTY_RULE_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_IRQ_NOT_EMPTY_RULE_EN_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_HQ_NOT_FULL_RULE_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_HQ_NOT_FULL_RULE_EN_SHIFT 1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_ORQ_RD_FENCE_RULE_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_ORQ_RD_FENCE_RULE_EN_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE21EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_RULE21EN_SHIFT 3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED6_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED6_SHIFT 4 +#define 
E4_XSTORM_IWARP_CONN_AG_CTX_ORQ_NOT_FULL_RULE_EN_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_ORQ_NOT_FULL_RULE_EN_SHIFT 5 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED8_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED8_SHIFT 6 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED9_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT16_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT16_SHIFT 0 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT17_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT17_SHIFT 1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT18_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_BIT18_SHIFT 2 +#define E4_XSTORM_IWARP_CONN_AG_CTX_E5_RESERVED1_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_E5_RESERVED1_SHIFT 3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_E5_RESERVED2_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_E5_RESERVED2_SHIFT 4 +#define E4_XSTORM_IWARP_CONN_AG_CTX_E5_RESERVED3_MASK 0x1 +#define E4_XSTORM_IWARP_CONN_AG_CTX_E5_RESERVED3_SHIFT 5 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SEND_TERMINATE_CF_MASK 0x3 +#define E4_XSTORM_IWARP_CONN_AG_CTX_SEND_TERMINATE_CF_SHIFT 6 + u8 byte2; + __le16 physical_q0; + __le16 physical_q1; + __le16 sq_comp_cons; + __le16 sq_tx_cons; + __le16 sq_prod; + __le16 word5; + __le16 conn_dpi; + u8 byte3; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 more_to_send_seq; + __le32 reg4; + __le32 rewinded_snd_max_or_term_opcode; + __le32 rd_msn; + __le16 irq_prod_via_msdm; + __le16 irq_cons; + __le16 hq_cons_th_or_mpa_data; + __le16 hq_cons; + __le32 atom_msn; + __le32 orq_cons; + __le32 orq_cons_th; + u8 byte7; + u8 wqe_data_pad_bytes; + u8 max_ord; + u8 former_hq_prod; + u8 irq_prod_via_msem; + u8 byte12; + u8 max_pkt_pdu_size_lo; + u8 max_pkt_pdu_size_hi; + u8 byte15; + u8 e5_reserved; + __le16 e5_reserved4; + __le32 reg10; + __le32 reg11; + __le32 shared_queue_page_addr_lo; + __le32 shared_queue_page_addr_hi; + __le32 reg14; + __le32 reg15; + __le32 reg16; + __le32 reg17; +}; + +struct e4_tstorm_iwarp_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_TSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_TSTORM_IWARP_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_BIT2_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_BIT2_SHIFT 2 +#define E4_TSTORM_IWARP_CONN_AG_CTX_MSTORM_FLUSH_OR_TERMINATE_SENT_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_MSTORM_FLUSH_OR_TERMINATE_SENT_SHIFT 3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_BIT4_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_BIT4_SHIFT 4 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CACHED_ORQ_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CACHED_ORQ_SHIFT 5 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF0_SHIFT 6 + u8 flags1; +#define E4_TSTORM_IWARP_CONN_AG_CTX_RQ_POST_CF_MASK 0x3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RQ_POST_CF_SHIFT 0 +#define E4_TSTORM_IWARP_CONN_AG_CTX_MPA_TIMEOUT_CF_MASK 0x3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_MPA_TIMEOUT_CF_SHIFT 2 +#define E4_TSTORM_IWARP_CONN_AG_CTX_TIMER_STOP_ALL_MASK 0x3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_TIMER_STOP_ALL_SHIFT 4 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF4_SHIFT 6 + u8 flags2; +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF5_SHIFT 0 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF6_MASK 0x3 +#define 
E4_TSTORM_IWARP_CONN_AG_CTX_CF6_SHIFT 2 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF7_SHIFT 4 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF8_SHIFT 6 + u8 flags3; +#define E4_TSTORM_IWARP_CONN_AG_CTX_FLUSH_Q0_AND_TCP_HANDSHAKE_COMPLETE_MASK 0x3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_FLUSH_Q0_AND_TCP_HANDSHAKE_COMPLETE_SHIFT 0 +#define E4_TSTORM_IWARP_CONN_AG_CTX_FLUSH_OR_ERROR_DETECTED_MASK 0x3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_FLUSH_OR_ERROR_DETECTED_SHIFT 2 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF0EN_SHIFT 4 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RQ_POST_CF_EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RQ_POST_CF_EN_SHIFT 5 +#define E4_TSTORM_IWARP_CONN_AG_CTX_MPA_TIMEOUT_CF_EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_MPA_TIMEOUT_CF_EN_SHIFT 6 +#define E4_TSTORM_IWARP_CONN_AG_CTX_TIMER_STOP_ALL_EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_TIMER_STOP_ALL_EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF4EN_SHIFT 0 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF5EN_SHIFT 1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF6EN_SHIFT 2 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF7EN_SHIFT 3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_CF8EN_SHIFT 4 +#define E4_TSTORM_IWARP_CONN_AG_CTX_FLUSH_Q0_AND_TCP_HANDSHAKE_COMPL_EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_FLUSH_Q0_AND_TCP_HANDSHAKE_COMPL_EN_SHIFT 5 +#define E4_TSTORM_IWARP_CONN_AG_CTX_FLUSH_OR_ERROR_DETECTED_EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_FLUSH_OR_ERROR_DETECTED_EN_SHIFT 6 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags5; +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_TSTORM_IWARP_CONN_AG_CTX_SND_SQ_CONS_RULE_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_SND_SQ_CONS_RULE_SHIFT 5 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_TSTORM_IWARP_CONN_AG_CTX_RULE8EN_SHIFT 7 + __le32 reg0; + __le32 reg1; + __le32 unaligned_nxt_seq; + __le32 reg3; + __le32 reg4; + __le32 reg5; + __le32 reg6; + __le32 reg7; + __le32 reg8; + u8 orq_cache_idx; + u8 hq_prod; + __le16 sq_tx_cons_th; + u8 orq_prod; + u8 irq_cons; + __le16 sq_tx_cons; + __le16 conn_dpi; + __le16 rq_prod; + __le32 snd_seq; + __le32 last_hq_sequence; +}; + +/* The iwarp storm context of Tstorm */ +struct tstorm_iwarp_conn_st_ctx { + __le32 reserved[60]; +}; + +/* The iwarp storm context of Mstorm */ +struct mstorm_iwarp_conn_st_ctx { + __le32 reserved[32]; +}; + +/* The iwarp storm context of Ustorm */ +struct ustorm_iwarp_conn_st_ctx { + __le32 reserved[24]; +}; + +/* iwarp connection context */ 
+struct e4_iwarp_conn_context { + struct ystorm_iwarp_conn_st_ctx ystorm_st_context; + struct regpair ystorm_st_padding[2]; + struct pstorm_iwarp_conn_st_ctx pstorm_st_context; + struct regpair pstorm_st_padding[2]; + struct xstorm_iwarp_conn_st_ctx xstorm_st_context; + struct e4_xstorm_iwarp_conn_ag_ctx xstorm_ag_context; + struct e4_tstorm_iwarp_conn_ag_ctx tstorm_ag_context; + struct timers_context timer_context; + struct e4_ustorm_rdma_conn_ag_ctx ustorm_ag_context; + struct tstorm_iwarp_conn_st_ctx tstorm_st_context; + struct regpair tstorm_st_padding[2]; + struct mstorm_iwarp_conn_st_ctx mstorm_st_context; + struct ustorm_iwarp_conn_st_ctx ustorm_st_context; +}; + +/* iWARP create QP params passed by driver to FW in CreateQP Request Ramrod */ +struct iwarp_create_qp_ramrod_data { + u8 flags; +#define IWARP_CREATE_QP_RAMROD_DATA_FMR_AND_RESERVED_EN_MASK 0x1 +#define IWARP_CREATE_QP_RAMROD_DATA_FMR_AND_RESERVED_EN_SHIFT 0 +#define IWARP_CREATE_QP_RAMROD_DATA_SIGNALED_COMP_MASK 0x1 +#define IWARP_CREATE_QP_RAMROD_DATA_SIGNALED_COMP_SHIFT 1 +#define IWARP_CREATE_QP_RAMROD_DATA_RDMA_RD_EN_MASK 0x1 +#define IWARP_CREATE_QP_RAMROD_DATA_RDMA_RD_EN_SHIFT 2 +#define IWARP_CREATE_QP_RAMROD_DATA_RDMA_WR_EN_MASK 0x1 +#define IWARP_CREATE_QP_RAMROD_DATA_RDMA_WR_EN_SHIFT 3 +#define IWARP_CREATE_QP_RAMROD_DATA_ATOMIC_EN_MASK 0x1 +#define IWARP_CREATE_QP_RAMROD_DATA_ATOMIC_EN_SHIFT 4 +#define IWARP_CREATE_QP_RAMROD_DATA_SRQ_FLG_MASK 0x1 +#define IWARP_CREATE_QP_RAMROD_DATA_SRQ_FLG_SHIFT 5 +#define IWARP_CREATE_QP_RAMROD_DATA_LOW_LATENCY_QUEUE_EN_MASK 0x1 +#define IWARP_CREATE_QP_RAMROD_DATA_LOW_LATENCY_QUEUE_EN_SHIFT 6 +#define IWARP_CREATE_QP_RAMROD_DATA_RESERVED0_MASK 0x1 +#define IWARP_CREATE_QP_RAMROD_DATA_RESERVED0_SHIFT 7 + u8 reserved1; + __le16 pd; + __le16 sq_num_pages; + __le16 rq_num_pages; + __le32 reserved3[2]; + struct regpair qp_handle_for_cqe; + struct rdma_srq_id srq_id; + __le32 cq_cid_for_sq; + __le32 cq_cid_for_rq; + __le16 dpi; + __le16 physical_q0; + __le16 physical_q1; + u8 reserved2[6]; +}; + +/* iWARP completion queue types */ +enum iwarp_eqe_async_opcode { + IWARP_EVENT_TYPE_ASYNC_CONNECT_COMPLETE, + IWARP_EVENT_TYPE_ASYNC_ENHANCED_MPA_REPLY_ARRIVED, + IWARP_EVENT_TYPE_ASYNC_MPA_HANDSHAKE_COMPLETE, + IWARP_EVENT_TYPE_ASYNC_CID_CLEANED, + IWARP_EVENT_TYPE_ASYNC_EXCEPTION_DETECTED, + IWARP_EVENT_TYPE_ASYNC_QP_IN_ERROR_STATE, + IWARP_EVENT_TYPE_ASYNC_CQ_OVERFLOW, + MAX_IWARP_EQE_ASYNC_OPCODE +}; + +struct iwarp_eqe_data_mpa_async_completion { + __le16 ulp_data_len; + u8 reserved[6]; +}; + +struct iwarp_eqe_data_tcp_async_completion { + __le16 ulp_data_len; + u8 mpa_handshake_mode; + u8 reserved[5]; +}; + +/* iWARP completion queue types */ +enum iwarp_eqe_sync_opcode { + IWARP_EVENT_TYPE_TCP_OFFLOAD = + 11, + IWARP_EVENT_TYPE_MPA_OFFLOAD, + IWARP_EVENT_TYPE_MPA_OFFLOAD_SEND_RTR, + IWARP_EVENT_TYPE_CREATE_QP, + IWARP_EVENT_TYPE_QUERY_QP, + IWARP_EVENT_TYPE_MODIFY_QP, + IWARP_EVENT_TYPE_DESTROY_QP, + IWARP_EVENT_TYPE_ABORT_TCP_OFFLOAD, + MAX_IWARP_EQE_SYNC_OPCODE +}; + +/* iWARP EQE completion status */ +enum iwarp_fw_return_code { + IWARP_CONN_ERROR_TCP_CONNECT_INVALID_PACKET = 5, + IWARP_CONN_ERROR_TCP_CONNECTION_RST, + IWARP_CONN_ERROR_TCP_CONNECT_TIMEOUT, + IWARP_CONN_ERROR_MPA_ERROR_REJECT, + IWARP_CONN_ERROR_MPA_NOT_SUPPORTED_VER, + IWARP_CONN_ERROR_MPA_RST, + IWARP_CONN_ERROR_MPA_FIN, + IWARP_CONN_ERROR_MPA_RTR_MISMATCH, + IWARP_CONN_ERROR_MPA_INSUF_IRD, + IWARP_CONN_ERROR_MPA_INVALID_PACKET, + IWARP_CONN_ERROR_MPA_LOCAL_ERROR, + IWARP_CONN_ERROR_MPA_TIMEOUT, + 
IWARP_CONN_ERROR_MPA_TERMINATE, + IWARP_QP_IN_ERROR_GOOD_CLOSE, + IWARP_QP_IN_ERROR_BAD_CLOSE, + IWARP_EXCEPTION_DETECTED_LLP_CLOSED, + IWARP_EXCEPTION_DETECTED_LLP_RESET, + IWARP_EXCEPTION_DETECTED_IRQ_FULL, + IWARP_EXCEPTION_DETECTED_RQ_EMPTY, + IWARP_EXCEPTION_DETECTED_SRQ_EMPTY, + IWARP_EXCEPTION_DETECTED_SRQ_LIMIT, + IWARP_EXCEPTION_DETECTED_LLP_TIMEOUT, + IWARP_EXCEPTION_DETECTED_REMOTE_PROTECTION_ERROR, + IWARP_EXCEPTION_DETECTED_CQ_OVERFLOW, + IWARP_EXCEPTION_DETECTED_LOCAL_CATASTROPHIC, + IWARP_EXCEPTION_DETECTED_LOCAL_ACCESS_ERROR, + IWARP_EXCEPTION_DETECTED_REMOTE_OPERATION_ERROR, + IWARP_EXCEPTION_DETECTED_TERMINATE_RECEIVED, + MAX_IWARP_FW_RETURN_CODE +}; + +/* unaligned opaque data received from LL2 */ +struct iwarp_init_func_params { + u8 ll2_ooo_q_index; + u8 reserved1[7]; +}; + +/* iwarp func init ramrod data */ +struct iwarp_init_func_ramrod_data { + struct rdma_init_func_ramrod_data rdma; + struct tcp_init_params tcp; + struct iwarp_init_func_params iwarp; +}; + +/* iWARP QP - possible states to transition to */ +enum iwarp_modify_qp_new_state_type { + IWARP_MODIFY_QP_STATE_CLOSING = 1, + IWARP_MODIFY_QP_STATE_ERROR = 2, + MAX_IWARP_MODIFY_QP_NEW_STATE_TYPE +}; + +/* iwarp modify qp responder ramrod data */ +struct iwarp_modify_qp_ramrod_data { + __le16 transition_to_state; + __le16 flags; +#define IWARP_MODIFY_QP_RAMROD_DATA_RDMA_RD_EN_MASK 0x1 +#define IWARP_MODIFY_QP_RAMROD_DATA_RDMA_RD_EN_SHIFT 0 +#define IWARP_MODIFY_QP_RAMROD_DATA_RDMA_WR_EN_MASK 0x1 +#define IWARP_MODIFY_QP_RAMROD_DATA_RDMA_WR_EN_SHIFT 1 +#define IWARP_MODIFY_QP_RAMROD_DATA_ATOMIC_EN_MASK 0x1 +#define IWARP_MODIFY_QP_RAMROD_DATA_ATOMIC_EN_SHIFT 2 +#define IWARP_MODIFY_QP_RAMROD_DATA_STATE_TRANS_EN_MASK 0x1 +#define IWARP_MODIFY_QP_RAMROD_DATA_STATE_TRANS_EN_SHIFT 3 +#define IWARP_MODIFY_QP_RAMROD_DATA_RDMA_OPS_EN_FLG_MASK 0x1 +#define IWARP_MODIFY_QP_RAMROD_DATA_RDMA_OPS_EN_FLG_SHIFT 4 +#define IWARP_MODIFY_QP_RAMROD_DATA_PHYSICAL_QUEUE_FLG_MASK 0x1 +#define IWARP_MODIFY_QP_RAMROD_DATA_PHYSICAL_QUEUE_FLG_SHIFT 5 +#define IWARP_MODIFY_QP_RAMROD_DATA_RESERVED_MASK 0x3FF +#define IWARP_MODIFY_QP_RAMROD_DATA_RESERVED_SHIFT 6 + __le16 physical_q0; + __le16 physical_q1; + __le32 reserved1[10]; +}; + +/* MPA params for Enhanced mode */ +struct mpa_rq_params { + __le32 ird; + __le32 ord; +}; + +/* MPA host Address-Len for private data */ +struct mpa_ulp_buffer { + struct regpair addr; + __le16 len; + __le16 reserved[3]; +}; + +/* iWARP MPA offload params common to Basic and Enhanced modes */ +struct mpa_outgoing_params { + u8 crc_needed; + u8 reject; + u8 reserved[6]; + struct mpa_rq_params out_rq; + struct mpa_ulp_buffer outgoing_ulp_buffer; +}; + +/* iWARP MPA offload params passed by driver to FW in MPA Offload Request + * Ramrod. 
+ */ +struct iwarp_mpa_offload_ramrod_data { + struct mpa_outgoing_params common; + __le32 tcp_cid; + u8 mode; + u8 tcp_connect_side; + u8 rtr_pref; +#define IWARP_MPA_OFFLOAD_RAMROD_DATA_RTR_SUPPORTED_MASK 0x7 +#define IWARP_MPA_OFFLOAD_RAMROD_DATA_RTR_SUPPORTED_SHIFT 0 +#define IWARP_MPA_OFFLOAD_RAMROD_DATA_RESERVED1_MASK 0x1F +#define IWARP_MPA_OFFLOAD_RAMROD_DATA_RESERVED1_SHIFT 3 + u8 reserved2; + struct mpa_ulp_buffer incoming_ulp_buffer; + struct regpair async_eqe_output_buf; + struct regpair handle_for_async; + struct regpair shared_queue_addr; + __le16 rcv_wnd; + u8 stats_counter_id; + u8 reserved3[13]; +}; + +/* iWARP TCP connection offload params passed by driver to FW */ +struct iwarp_offload_params { + struct mpa_ulp_buffer incoming_ulp_buffer; + struct regpair async_eqe_output_buf; + struct regpair handle_for_async; + __le16 physical_q0; + __le16 physical_q1; + u8 stats_counter_id; + u8 mpa_mode; + u8 reserved[10]; +}; + +/* iWARP query QP output params */ +struct iwarp_query_qp_output_params { + __le32 flags; +#define IWARP_QUERY_QP_OUTPUT_PARAMS_ERROR_FLG_MASK 0x1 +#define IWARP_QUERY_QP_OUTPUT_PARAMS_ERROR_FLG_SHIFT 0 +#define IWARP_QUERY_QP_OUTPUT_PARAMS_RESERVED0_MASK 0x7FFFFFFF +#define IWARP_QUERY_QP_OUTPUT_PARAMS_RESERVED0_SHIFT 1 + u8 reserved1[4]; +}; + +/* iWARP query QP ramrod data */ +struct iwarp_query_qp_ramrod_data { + struct regpair output_params_addr; +}; + +/* iWARP Ramrod Command IDs */ +enum iwarp_ramrod_cmd_id { + IWARP_RAMROD_CMD_ID_TCP_OFFLOAD = 11, + IWARP_RAMROD_CMD_ID_MPA_OFFLOAD, + IWARP_RAMROD_CMD_ID_MPA_OFFLOAD_SEND_RTR, + IWARP_RAMROD_CMD_ID_CREATE_QP, + IWARP_RAMROD_CMD_ID_QUERY_QP, + IWARP_RAMROD_CMD_ID_MODIFY_QP, + IWARP_RAMROD_CMD_ID_DESTROY_QP, + IWARP_RAMROD_CMD_ID_ABORT_TCP_OFFLOAD, + MAX_IWARP_RAMROD_CMD_ID +}; + +/* Per PF iWARP retransmit path statistics */ +struct iwarp_rxmit_stats_drv { + struct regpair tx_go_to_slow_start_event_cnt; + struct regpair tx_fast_retransmit_event_cnt; +}; + +/* iWARP and TCP connection offload params passed by driver to FW in iWARP + * offload ramrod. 
+ */ +struct iwarp_tcp_offload_ramrod_data { + struct iwarp_offload_params iwarp; + struct tcp_offload_params_opt2 tcp; +}; + +/* iWARP MPA negotiation types */ +enum mpa_negotiation_mode { + MPA_NEGOTIATION_TYPE_BASIC = 1, + MPA_NEGOTIATION_TYPE_ENHANCED = 2, + MAX_MPA_NEGOTIATION_MODE +}; + +/* iWARP MPA Enhanced mode RTR types */ +enum mpa_rtr_type { + MPA_RTR_TYPE_NONE = 0, + MPA_RTR_TYPE_ZERO_SEND = 1, + MPA_RTR_TYPE_ZERO_WRITE = 2, + MPA_RTR_TYPE_ZERO_SEND_AND_WRITE = 3, + MPA_RTR_TYPE_ZERO_READ = 4, + MPA_RTR_TYPE_ZERO_SEND_AND_READ = 5, + MPA_RTR_TYPE_ZERO_WRITE_AND_READ = 6, + MPA_RTR_TYPE_ZERO_SEND_AND_WRITE_AND_READ = 7, + MAX_MPA_RTR_TYPE +}; + +/* unaligned opaque data received from LL2 */ +struct unaligned_opaque_data { + __le16 first_mpa_offset; + u8 tcp_payload_offset; + u8 flags; +#define UNALIGNED_OPAQUE_DATA_PKT_REACHED_WIN_RIGHT_EDGE_MASK 0x1 +#define UNALIGNED_OPAQUE_DATA_PKT_REACHED_WIN_RIGHT_EDGE_SHIFT 0 +#define UNALIGNED_OPAQUE_DATA_CONNECTION_CLOSED_MASK 0x1 +#define UNALIGNED_OPAQUE_DATA_CONNECTION_CLOSED_SHIFT 1 +#define UNALIGNED_OPAQUE_DATA_RESERVED_MASK 0x3F +#define UNALIGNED_OPAQUE_DATA_RESERVED_SHIFT 2 + __le32 cid; +}; + +struct e4_mstorm_iwarp_conn_ag_ctx { + u8 reserved; + u8 state; + u8 flags0; +#define E4_MSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_MSTORM_IWARP_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_INV_STAG_DONE_CF_MASK 0x3 +#define E4_MSTORM_IWARP_CONN_AG_CTX_INV_STAG_DONE_CF_SHIFT 2 +#define E4_MSTORM_IWARP_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_IWARP_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_MSTORM_IWARP_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_IWARP_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_MSTORM_IWARP_CONN_AG_CTX_INV_STAG_DONE_CF_EN_MASK 0x1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_INV_STAG_DONE_CF_EN_SHIFT 0 +#define E4_MSTORM_IWARP_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_MSTORM_IWARP_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_MSTORM_IWARP_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_MSTORM_IWARP_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_MSTORM_IWARP_CONN_AG_CTX_RCQ_CONS_EN_MASK 0x1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_RCQ_CONS_EN_SHIFT 6 +#define E4_MSTORM_IWARP_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_IWARP_CONN_AG_CTX_RULE4EN_SHIFT 7 + __le16 rcq_cons; + __le16 rcq_cons_th; + __le32 reg0; + __le32 reg1; +}; + +struct e4_ustorm_iwarp_conn_ag_ctx { + u8 reserved; + u8 byte1; + u8 flags0; +#define E4_USTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_USTORM_IWARP_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_USTORM_IWARP_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF3_SHIFT 0 +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_ARM_SE_CF_MASK 0x3 +#define 
E4_USTORM_IWARP_CONN_AG_CTX_CQ_ARM_SE_CF_SHIFT 2 +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_ARM_CF_MASK 0x3 +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_ARM_CF_SHIFT 4 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF6_SHIFT 6 + u8 flags2; +#define E4_USTORM_IWARP_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF3EN_SHIFT 3 +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_ARM_SE_CF_EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_ARM_SE_CF_EN_SHIFT 4 +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_ARM_CF_EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_ARM_CF_EN_SHIFT 5 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CF6EN_SHIFT 6 +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_SE_EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_SE_EN_SHIFT 7 + u8 flags3; +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_CQ_EN_SHIFT 0 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_USTORM_IWARP_CONN_AG_CTX_RULE8EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le16 word1; + __le32 cq_cons; + __le32 cq_se_prod; + __le32 cq_prod; + __le32 reg3; + __le16 word2; + __le16 word3; +}; + +struct e4_ystorm_iwarp_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_YSTORM_IWARP_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_YSTORM_IWARP_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_YSTORM_IWARP_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_YSTORM_IWARP_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_YSTORM_IWARP_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_YSTORM_IWARP_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_IWARP_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_YSTORM_IWARP_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define 
E4_YSTORM_IWARP_CONN_AG_CTX_RULE4EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le32 reg0; + __le32 reg1; + __le16 word1; + __le16 word2; + __le16 word3; + __le16 word4; + __le32 reg2; + __le32 reg3; +}; + +/* The fcoe storm context of Ystorm */ +struct ystorm_fcoe_conn_st_ctx { + u8 func_mode; + u8 cos; + u8 conf_version; + u8 eth_hdr_size; + __le16 stat_ram_addr; + __le16 mtu; + __le16 max_fc_payload_len; + __le16 tx_max_fc_pay_len; + u8 fcp_cmd_size; + u8 fcp_rsp_size; + __le16 mss; + struct regpair reserved; + __le16 min_frame_size; + u8 protection_info_flags; +#define YSTORM_FCOE_CONN_ST_CTX_SUPPORT_PROTECTION_MASK 0x1 +#define YSTORM_FCOE_CONN_ST_CTX_SUPPORT_PROTECTION_SHIFT 0 +#define YSTORM_FCOE_CONN_ST_CTX_VALID_MASK 0x1 +#define YSTORM_FCOE_CONN_ST_CTX_VALID_SHIFT 1 +#define YSTORM_FCOE_CONN_ST_CTX_RESERVED1_MASK 0x3F +#define YSTORM_FCOE_CONN_ST_CTX_RESERVED1_SHIFT 2 + u8 dst_protection_per_mss; + u8 src_protection_per_mss; + u8 ptu_log_page_size; + u8 flags; +#define YSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_MASK 0x1 +#define YSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_SHIFT 0 +#define YSTORM_FCOE_CONN_ST_CTX_OUTER_VLAN_FLAG_MASK 0x1 +#define YSTORM_FCOE_CONN_ST_CTX_OUTER_VLAN_FLAG_SHIFT 1 +#define YSTORM_FCOE_CONN_ST_CTX_RSRV_MASK 0x3F +#define YSTORM_FCOE_CONN_ST_CTX_RSRV_SHIFT 2 + u8 fcp_xfer_size; +}; + +/* FCoE 16-bits vlan structure */ +struct fcoe_vlan_fields { + __le16 fields; +#define FCOE_VLAN_FIELDS_VID_MASK 0xFFF +#define FCOE_VLAN_FIELDS_VID_SHIFT 0 +#define FCOE_VLAN_FIELDS_CLI_MASK 0x1 +#define FCOE_VLAN_FIELDS_CLI_SHIFT 12 +#define FCOE_VLAN_FIELDS_PRI_MASK 0x7 +#define FCOE_VLAN_FIELDS_PRI_SHIFT 13 +}; + +/* FCoE 16-bits vlan union */ +union fcoe_vlan_field_union { + struct fcoe_vlan_fields fields; + __le16 val; +}; + +/* FCoE 16-bits vlan, vif union */ +union fcoe_vlan_vif_field_union { + union fcoe_vlan_field_union vlan; + __le16 vif; +}; + +/* Ethernet context section */ +struct pstorm_fcoe_eth_context_section { + u8 remote_addr_3; + u8 remote_addr_2; + u8 remote_addr_1; + u8 remote_addr_0; + u8 local_addr_1; + u8 local_addr_0; + u8 remote_addr_5; + u8 remote_addr_4; + u8 local_addr_5; + u8 local_addr_4; + u8 local_addr_3; + u8 local_addr_2; + union fcoe_vlan_vif_field_union vif_outer_vlan; + __le16 vif_outer_eth_type; + union fcoe_vlan_vif_field_union inner_vlan; + __le16 inner_eth_type; +}; + +/* The fcoe storm context of Pstorm */ +struct pstorm_fcoe_conn_st_ctx { + u8 func_mode; + u8 cos; + u8 conf_version; + u8 rsrv; + __le16 stat_ram_addr; + __le16 mss; + struct regpair abts_cleanup_addr; + struct pstorm_fcoe_eth_context_section eth; + u8 sid_2; + u8 sid_1; + u8 sid_0; + u8 flags; +#define PSTORM_FCOE_CONN_ST_CTX_VNTAG_VLAN_MASK 0x1 +#define PSTORM_FCOE_CONN_ST_CTX_VNTAG_VLAN_SHIFT 0 +#define PSTORM_FCOE_CONN_ST_CTX_SUPPORT_REC_RR_TOV_MASK 0x1 +#define PSTORM_FCOE_CONN_ST_CTX_SUPPORT_REC_RR_TOV_SHIFT 1 +#define PSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_MASK 0x1 +#define PSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_SHIFT 2 +#define PSTORM_FCOE_CONN_ST_CTX_OUTER_VLAN_FLAG_MASK 0x1 +#define PSTORM_FCOE_CONN_ST_CTX_OUTER_VLAN_FLAG_SHIFT 3 +#define PSTORM_FCOE_CONN_ST_CTX_SINGLE_VLAN_FLAG_MASK 0x1 +#define PSTORM_FCOE_CONN_ST_CTX_SINGLE_VLAN_FLAG_SHIFT 4 +#define PSTORM_FCOE_CONN_ST_CTX_RESERVED_MASK 0x7 +#define PSTORM_FCOE_CONN_ST_CTX_RESERVED_SHIFT 5 + u8 did_2; + u8 did_1; + u8 did_0; + u8 src_mac_index; + __le16 rec_rr_tov_val; + u8 q_relative_offset; + u8 reserved1; +}; + +/* The fcoe storm context of Xstorm */ +struct xstorm_fcoe_conn_st_ctx { + u8 
func_mode; + u8 src_mac_index; + u8 conf_version; + u8 cached_wqes_avail; + __le16 stat_ram_addr; + u8 flags; +#define XSTORM_FCOE_CONN_ST_CTX_SQ_DEFERRED_MASK 0x1 +#define XSTORM_FCOE_CONN_ST_CTX_SQ_DEFERRED_SHIFT 0 +#define XSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_MASK 0x1 +#define XSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_SHIFT 1 +#define XSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_ORIG_MASK 0x1 +#define XSTORM_FCOE_CONN_ST_CTX_INNER_VLAN_FLAG_ORIG_SHIFT 2 +#define XSTORM_FCOE_CONN_ST_CTX_LAST_QUEUE_HANDLED_MASK 0x3 +#define XSTORM_FCOE_CONN_ST_CTX_LAST_QUEUE_HANDLED_SHIFT 3 +#define XSTORM_FCOE_CONN_ST_CTX_RSRV_MASK 0x7 +#define XSTORM_FCOE_CONN_ST_CTX_RSRV_SHIFT 5 + u8 cached_wqes_offset; + u8 reserved2; + u8 eth_hdr_size; + u8 seq_id; + u8 max_conc_seqs; + __le16 num_pages_in_pbl; + __le16 reserved; + struct regpair sq_pbl_addr; + struct regpair sq_curr_page_addr; + struct regpair sq_next_page_addr; + struct regpair xferq_pbl_addr; + struct regpair xferq_curr_page_addr; + struct regpair xferq_next_page_addr; + struct regpair respq_pbl_addr; + struct regpair respq_curr_page_addr; + struct regpair respq_next_page_addr; + __le16 mtu; + __le16 tx_max_fc_pay_len; + __le16 max_fc_payload_len; + __le16 min_frame_size; + __le16 sq_pbl_next_index; + __le16 respq_pbl_next_index; + u8 fcp_cmd_byte_credit; + u8 fcp_rsp_byte_credit; + __le16 protection_info; +#define XSTORM_FCOE_CONN_ST_CTX_PROTECTION_PERF_MASK 0x1 +#define XSTORM_FCOE_CONN_ST_CTX_PROTECTION_PERF_SHIFT 0 +#define XSTORM_FCOE_CONN_ST_CTX_SUPPORT_PROTECTION_MASK 0x1 +#define XSTORM_FCOE_CONN_ST_CTX_SUPPORT_PROTECTION_SHIFT 1 +#define XSTORM_FCOE_CONN_ST_CTX_VALID_MASK 0x1 +#define XSTORM_FCOE_CONN_ST_CTX_VALID_SHIFT 2 +#define XSTORM_FCOE_CONN_ST_CTX_FRAME_PROT_ALIGNED_MASK 0x1 +#define XSTORM_FCOE_CONN_ST_CTX_FRAME_PROT_ALIGNED_SHIFT 3 +#define XSTORM_FCOE_CONN_ST_CTX_RESERVED3_MASK 0xF +#define XSTORM_FCOE_CONN_ST_CTX_RESERVED3_SHIFT 4 +#define XSTORM_FCOE_CONN_ST_CTX_DST_PROTECTION_PER_MSS_MASK 0xFF +#define XSTORM_FCOE_CONN_ST_CTX_DST_PROTECTION_PER_MSS_SHIFT 8 + __le16 xferq_pbl_next_index; + __le16 page_size; + u8 mid_seq; + u8 fcp_xfer_byte_credit; + u8 reserved1[2]; + struct fcoe_wqe cached_wqes[16]; +}; + +struct e4_xstorm_fcoe_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_XSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED1_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED1_SHIFT 1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED2_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED2_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM3_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM3_SHIFT 3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED3_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED3_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED4_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED4_SHIFT 5 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED5_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED5_SHIFT 6 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED6_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED6_SHIFT 7 + u8 flags1; +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED7_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED7_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED8_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED8_SHIFT 1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED9_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED9_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT11_MASK 
0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT11_SHIFT 3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT12_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT12_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT13_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT13_SHIFT 5 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT14_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT14_SHIFT 6 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT15_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT15_SHIFT 7 + u8 flags2; +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF0_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF1_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF2_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF3_SHIFT 6 + u8 flags3; +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF4_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF5_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF6_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF7_SHIFT 6 + u8 flags4; +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF8_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF9_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF10_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF11_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF11_SHIFT 6 + u8 flags5; +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF12_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF12_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF13_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF13_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF14_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF14_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF15_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF15_SHIFT 6 + u8 flags6; +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF16_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF16_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF17_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF17_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF18_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF18_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_DQ_CF_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_DQ_CF_SHIFT 6 + u8 flags7; +#define E4_XSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED10_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED10_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_SLOW_PATH_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_SLOW_PATH_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF0EN_SHIFT 6 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF1EN_SHIFT 7 + u8 flags8; +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF2EN_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF3EN_SHIFT 1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF4EN_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF5EN_SHIFT 3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF6EN_SHIFT 4 +#define 
E4_XSTORM_FCOE_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF7EN_SHIFT 5 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF8EN_SHIFT 6 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF9EN_SHIFT 7 + u8 flags9; +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF10EN_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF11EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF11EN_SHIFT 1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF12EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF12EN_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF13EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF13EN_SHIFT 3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF14EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF14EN_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF15EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF15EN_SHIFT 5 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF16EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF16EN_SHIFT 6 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF17EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF17EN_SHIFT 7 + u8 flags10; +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF18EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF18EN_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_DQ_CF_EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_DQ_CF_EN_SHIFT 1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_EN_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED11_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED11_SHIFT 3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF23EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF23EN_SHIFT 5 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED12_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED12_SHIFT 6 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED13_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED13_SHIFT 7 + u8 flags11; +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED14_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED14_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED15_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED15_SHIFT 1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED16_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESERVED16_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE5EN_SHIFT 3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE6EN_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE7EN_SHIFT 5 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED1_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED1_SHIFT 6 +#define E4_XSTORM_FCOE_CONN_AG_CTX_XFERQ_DECISION_EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_XFERQ_DECISION_EN_SHIFT 7 + u8 flags12; +#define E4_XSTORM_FCOE_CONN_AG_CTX_SQ_DECISION_EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_SQ_DECISION_EN_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE11EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE11EN_SHIFT 1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED2_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED2_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED3_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED3_SHIFT 3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE14EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE14EN_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE15EN_MASK 0x1 
+#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE15EN_SHIFT 5 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE16EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE16EN_SHIFT 6 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE17EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE17EN_SHIFT 7 + u8 flags13; +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESPQ_DECISION_EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RESPQ_DECISION_EN_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE19EN_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_RULE19EN_SHIFT 1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED4_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED4_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED5_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED5_SHIFT 3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED6_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED6_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED7_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED7_SHIFT 5 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED8_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED8_SHIFT 6 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED9_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT16_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT16_SHIFT 0 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT17_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT17_SHIFT 1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT18_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT18_SHIFT 2 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT19_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT19_SHIFT 3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT20_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT20_SHIFT 4 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT21_MASK 0x1 +#define E4_XSTORM_FCOE_CONN_AG_CTX_BIT21_SHIFT 5 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF23_MASK 0x3 +#define E4_XSTORM_FCOE_CONN_AG_CTX_CF23_SHIFT 6 + u8 byte2; + __le16 physical_q0; + __le16 word1; + __le16 word2; + __le16 sq_cons; + __le16 sq_prod; + __le16 xferq_prod; + __le16 xferq_cons; + u8 byte3; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 remain_io; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le32 reg4; + __le32 reg5; + __le32 reg6; + __le16 respq_prod; + __le16 respq_cons; + __le16 word9; + __le16 word10; + __le32 reg7; + __le32 reg8; +}; + +/* The fcoe storm context of Ustorm */ +struct ustorm_fcoe_conn_st_ctx { + struct regpair respq_pbl_addr; + __le16 num_pages_in_pbl; + u8 ptu_log_page_size; + u8 log_page_size; + __le16 respq_prod; + u8 reserved[2]; +}; + +struct e4_tstorm_fcoe_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_TSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_TSTORM_FCOE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_BIT2_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_BIT2_SHIFT 2 +#define E4_TSTORM_FCOE_CONN_AG_CTX_BIT3_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_BIT3_SHIFT 3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_BIT4_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_BIT4_SHIFT 4 +#define E4_TSTORM_FCOE_CONN_AG_CTX_BIT5_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_BIT5_SHIFT 5 +#define E4_TSTORM_FCOE_CONN_AG_CTX_DUMMY_TIMER_CF_MASK 0x3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_DUMMY_TIMER_CF_SHIFT 6 + u8 flags1; +#define E4_TSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_CF_MASK 0x3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_CF_SHIFT 0 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF2_MASK 0x3 +#define 
E4_TSTORM_FCOE_CONN_AG_CTX_CF2_SHIFT 2 +#define E4_TSTORM_FCOE_CONN_AG_CTX_TIMER_STOP_ALL_CF_MASK 0x3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_TIMER_STOP_ALL_CF_SHIFT 4 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF4_SHIFT 6 + u8 flags2; +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF5_SHIFT 0 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF6_SHIFT 2 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF7_SHIFT 4 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF8_SHIFT 6 + u8 flags3; +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF9_SHIFT 0 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF10_SHIFT 2 +#define E4_TSTORM_FCOE_CONN_AG_CTX_DUMMY_TIMER_CF_EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_DUMMY_TIMER_CF_EN_SHIFT 4 +#define E4_TSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_CF_EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_FLUSH_Q0_CF_EN_SHIFT 5 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF2EN_SHIFT 6 +#define E4_TSTORM_FCOE_CONN_AG_CTX_TIMER_STOP_ALL_CF_EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_TIMER_STOP_ALL_CF_EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF4EN_SHIFT 0 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF5EN_SHIFT 1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF6EN_SHIFT 2 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF7EN_SHIFT 3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF8EN_SHIFT 4 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF9EN_SHIFT 5 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_CF10EN_SHIFT 6 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags5; +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_TSTORM_FCOE_CONN_AG_CTX_RULE8EN_SHIFT 7 + __le32 reg0; + __le32 reg1; +}; + +struct e4_ustorm_fcoe_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_USTORM_FCOE_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_USTORM_FCOE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF1_SHIFT 4 +#define 
E4_USTORM_FCOE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_USTORM_FCOE_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF3_SHIFT 0 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF4_SHIFT 2 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF5_SHIFT 4 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF6_SHIFT 6 + u8 flags2; +#define E4_USTORM_FCOE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF3EN_SHIFT 3 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF4EN_SHIFT 4 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF5EN_SHIFT 5 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_CF6EN_SHIFT 6 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags3; +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_USTORM_FCOE_CONN_AG_CTX_RULE8EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le16 word2; + __le16 word3; +}; + +/* The fcoe storm context of Tstorm */ +struct tstorm_fcoe_conn_st_ctx { + __le16 stat_ram_addr; + __le16 rx_max_fc_payload_len; + __le16 e_d_tov_val; + u8 flags; +#define TSTORM_FCOE_CONN_ST_CTX_INC_SEQ_CNT_MASK 0x1 +#define TSTORM_FCOE_CONN_ST_CTX_INC_SEQ_CNT_SHIFT 0 +#define TSTORM_FCOE_CONN_ST_CTX_SUPPORT_CONF_MASK 0x1 +#define TSTORM_FCOE_CONN_ST_CTX_SUPPORT_CONF_SHIFT 1 +#define TSTORM_FCOE_CONN_ST_CTX_DEF_Q_IDX_MASK 0x3F +#define TSTORM_FCOE_CONN_ST_CTX_DEF_Q_IDX_SHIFT 2 + u8 timers_cleanup_invocation_cnt; + __le32 reserved1[2]; + __le32 dst_mac_address_bytes_0_to_3; + __le16 dst_mac_address_bytes_4_to_5; + __le16 ramrod_echo; + u8 flags1; +#define TSTORM_FCOE_CONN_ST_CTX_MODE_MASK 0x3 +#define TSTORM_FCOE_CONN_ST_CTX_MODE_SHIFT 0 +#define TSTORM_FCOE_CONN_ST_CTX_RESERVED_MASK 0x3F +#define TSTORM_FCOE_CONN_ST_CTX_RESERVED_SHIFT 2 + u8 cq_relative_offset; + u8 cmdq_relative_offset; + u8 bdq_resource_id; + u8 reserved0[4]; +}; + +struct e4_mstorm_fcoe_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_MSTORM_FCOE_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_MSTORM_FCOE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_BIT1_SHIFT 1 +#define 
E4_MSTORM_FCOE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_MSTORM_FCOE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_MSTORM_FCOE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_MSTORM_FCOE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_MSTORM_FCOE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_MSTORM_FCOE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_FCOE_CONN_AG_CTX_RULE4EN_SHIFT 7 + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; +}; + +/* Fast path part of the fcoe storm context of Mstorm */ +struct fcoe_mstorm_fcoe_conn_st_ctx_fp { + __le16 xfer_prod; + u8 num_cqs; + u8 reserved1; + u8 protection_info; +#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_SUPPORT_PROTECTION_MASK 0x1 +#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_SUPPORT_PROTECTION_SHIFT 0 +#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_VALID_MASK 0x1 +#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_VALID_SHIFT 1 +#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_RESERVED0_MASK 0x3F +#define FCOE_MSTORM_FCOE_CONN_ST_CTX_FP_RESERVED0_SHIFT 2 + u8 q_relative_offset; + u8 reserved2[2]; +}; + +/* Non fast path part of the fcoe storm context of Mstorm */ +struct fcoe_mstorm_fcoe_conn_st_ctx_non_fp { + __le16 conn_id; + __le16 stat_ram_addr; + __le16 num_pages_in_pbl; + u8 ptu_log_page_size; + u8 log_page_size; + __le16 unsolicited_cq_count; + __le16 cmdq_count; + u8 bdq_resource_id; + u8 reserved0[3]; + struct regpair xferq_pbl_addr; + struct regpair reserved1; + struct regpair reserved2[3]; +}; + +/* The fcoe storm context of Mstorm */ +struct mstorm_fcoe_conn_st_ctx { + struct fcoe_mstorm_fcoe_conn_st_ctx_fp fp; + struct fcoe_mstorm_fcoe_conn_st_ctx_non_fp non_fp; +}; + +/* fcoe connection context */ +struct e4_fcoe_conn_context { + struct ystorm_fcoe_conn_st_ctx ystorm_st_context; + struct pstorm_fcoe_conn_st_ctx pstorm_st_context; + struct regpair pstorm_st_padding[2]; + struct xstorm_fcoe_conn_st_ctx xstorm_st_context; + struct e4_xstorm_fcoe_conn_ag_ctx xstorm_ag_context; + struct regpair xstorm_ag_padding[6]; + struct ustorm_fcoe_conn_st_ctx ustorm_st_context; + struct regpair ustorm_st_padding[2]; + struct e4_tstorm_fcoe_conn_ag_ctx tstorm_ag_context; + struct regpair tstorm_ag_padding[2]; + struct timers_context timer_context; + struct e4_ustorm_fcoe_conn_ag_ctx ustorm_ag_context; + struct tstorm_fcoe_conn_st_ctx tstorm_st_context; + struct e4_mstorm_fcoe_conn_ag_ctx mstorm_ag_context; + struct mstorm_fcoe_conn_st_ctx mstorm_st_context; +}; + +/* FCoE connection offload params passed by driver to FW in FCoE offload + * ramrod. + */ +struct fcoe_conn_offload_ramrod_params { + struct fcoe_conn_offload_ramrod_data offload_ramrod_data; +}; + +/* FCoE connection terminate params passed by driver to FW in FCoE terminate + * conn ramrod. 
+ */ +struct fcoe_conn_terminate_ramrod_params { + struct fcoe_conn_terminate_ramrod_data terminate_ramrod_data; +}; + +/* FCoE event type */ +enum fcoe_event_type { + FCOE_EVENT_INIT_FUNC, + FCOE_EVENT_DESTROY_FUNC, + FCOE_EVENT_STAT_FUNC, + FCOE_EVENT_OFFLOAD_CONN, + FCOE_EVENT_TERMINATE_CONN, + FCOE_EVENT_ERROR, + MAX_FCOE_EVENT_TYPE +}; + +/* FCoE init params passed by driver to FW in FCoE init ramrod */ +struct fcoe_init_ramrod_params { + struct fcoe_init_func_ramrod_data init_ramrod_data; +}; + +/* FCoE ramrod Command IDs */ +enum fcoe_ramrod_cmd_id { + FCOE_RAMROD_CMD_ID_INIT_FUNC, + FCOE_RAMROD_CMD_ID_DESTROY_FUNC, + FCOE_RAMROD_CMD_ID_STAT_FUNC, + FCOE_RAMROD_CMD_ID_OFFLOAD_CONN, + FCOE_RAMROD_CMD_ID_TERMINATE_CONN, + MAX_FCOE_RAMROD_CMD_ID +}; + +/* FCoE statistics params buffer passed by driver to FW in FCoE statistics + * ramrod. + */ +struct fcoe_stat_ramrod_params { + struct fcoe_stat_ramrod_data stat_ramrod_data; +}; + +struct e4_ystorm_fcoe_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_YSTORM_FCOE_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_YSTORM_FCOE_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_YSTORM_FCOE_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_YSTORM_FCOE_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_YSTORM_FCOE_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_YSTORM_FCOE_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_YSTORM_FCOE_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_YSTORM_FCOE_CONN_AG_CTX_RULE4EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le32 reg0; + __le32 reg1; + __le16 word1; + __le16 word2; + __le16 word3; + __le16 word4; + __le32 reg2; + __le32 reg3; +}; + +/* The iscsi storm connection context of Ystorm */ +struct ystorm_iscsi_conn_st_ctx { + __le32 reserved[8]; +}; + +/* Combined iSCSI and TCP storm connection of Pstorm */ +struct pstorm_iscsi_tcp_conn_st_ctx { + __le32 tcp[32]; + __le32 iscsi[4]; +}; + +/* The combined tcp and iscsi storm context of Xstorm */ +struct xstorm_iscsi_tcp_conn_st_ctx { + __le32 reserved_tcp[4]; + __le32 reserved_iscsi[44]; +}; + +struct e4_xstorm_iscsi_conn_ag_ctx { + u8 cdu_validation; + u8 state; + u8 flags0; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_EXIST_IN_QM1_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_EXIST_IN_QM1_SHIFT 1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RESERVED1_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RESERVED1_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_EXIST_IN_QM3_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_EXIST_IN_QM3_SHIFT 3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT4_MASK 0x1 +#define 
E4_XSTORM_ISCSI_CONN_AG_CTX_BIT4_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RESERVED2_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RESERVED2_SHIFT 5 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT6_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT6_SHIFT 6 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT7_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT7_SHIFT 7 + u8 flags1; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT8_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT8_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT9_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT9_SHIFT 1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT10_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT10_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT11_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT11_SHIFT 3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT12_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT12_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT13_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT13_SHIFT 5 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT14_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT14_SHIFT 6 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_TX_TRUNCATE_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_TX_TRUNCATE_SHIFT 7 + u8 flags2; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF0_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF1_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF2_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_SHIFT 6 + u8 flags3; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF4_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF5_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF6_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF7_SHIFT 6 + u8 flags4; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF8_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF9_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF9_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF10_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF11_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF11_SHIFT 6 + u8 flags5; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF12_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF12_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF13_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF13_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF14_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF14_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_UPDATE_STATE_TO_BASE_CF_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_UPDATE_STATE_TO_BASE_CF_SHIFT 6 + u8 flags6; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF16_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF16_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF17_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF17_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF18_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF18_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_DQ_FLUSH_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_DQ_FLUSH_SHIFT 6 + u8 flags7; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_MST_XCM_Q0_FLUSH_CF_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_MST_XCM_Q0_FLUSH_CF_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_UST_XCM_Q1_FLUSH_CF_MASK 0x3 +#define 
E4_XSTORM_ISCSI_CONN_AG_CTX_UST_XCM_Q1_FLUSH_CF_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_SLOW_PATH_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_SLOW_PATH_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF0EN_SHIFT 6 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF1EN_SHIFT 7 + u8 flags8; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF2EN_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_EN_SHIFT 1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF4EN_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF5EN_SHIFT 3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF6EN_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF7EN_SHIFT 5 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF8EN_SHIFT 6 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF9EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF9EN_SHIFT 7 + u8 flags9; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF10EN_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF11EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF11EN_SHIFT 1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF12EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF12EN_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF13EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF13EN_SHIFT 3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF14EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF14EN_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_UPDATE_STATE_TO_BASE_CF_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_UPDATE_STATE_TO_BASE_CF_EN_SHIFT 5 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF16EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF16EN_SHIFT 6 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF17EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF17EN_SHIFT 7 + u8 flags10; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF18EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_CF18EN_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_DQ_FLUSH_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_DQ_FLUSH_EN_SHIFT 1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_MST_XCM_Q0_FLUSH_CF_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_MST_XCM_Q0_FLUSH_CF_EN_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_UST_XCM_Q1_FLUSH_CF_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_UST_XCM_Q1_FLUSH_CF_EN_SHIFT 3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_PROC_ONLY_CLEANUP_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_PROC_ONLY_CLEANUP_EN_SHIFT 5 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE0EN_SHIFT 6 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_MORE_TO_SEND_DEC_RULE_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_MORE_TO_SEND_DEC_RULE_EN_SHIFT 7 + u8 flags11; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_TX_BLOCKED_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_TX_BLOCKED_EN_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE3EN_SHIFT 1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RESERVED3_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RESERVED3_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define 
E4_XSTORM_ISCSI_CONN_AG_CTX_RULE5EN_SHIFT 3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE6EN_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE7EN_SHIFT 5 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED1_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED1_SHIFT 6 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE9EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE9EN_SHIFT 7 + u8 flags12; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_SQ_DEC_RULE_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_SQ_DEC_RULE_EN_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE11EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE11EN_SHIFT 1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED2_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED2_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED3_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED3_SHIFT 3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE14EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE14EN_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE15EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE15EN_SHIFT 5 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE16EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE16EN_SHIFT 6 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE17EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_RULE17EN_SHIFT 7 + u8 flags13; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_R2TQ_DEC_RULE_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_R2TQ_DEC_RULE_EN_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_HQ_DEC_RULE_EN_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_HQ_DEC_RULE_EN_SHIFT 1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED4_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED4_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED5_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED5_SHIFT 3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED6_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED6_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED7_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED7_SHIFT 5 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED8_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED8_SHIFT 6 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED9_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT16_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT16_SHIFT 0 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT17_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT17_SHIFT 1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT18_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT18_SHIFT 2 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT19_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT19_SHIFT 3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT20_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_BIT20_SHIFT 4 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_DUMMY_READ_DONE_MASK 0x1 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_DUMMY_READ_DONE_SHIFT 5 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_PROC_ONLY_CLEANUP_MASK 0x3 +#define E4_XSTORM_ISCSI_CONN_AG_CTX_PROC_ONLY_CLEANUP_SHIFT 6 + u8 byte2; + __le16 physical_q0; + __le16 physical_q1; + __le16 dummy_dorq_var; + __le16 sq_cons; + __le16 sq_prod; + __le16 word5; + __le16 slow_io_total_data_tx_update; + u8 byte3; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 more_to_send_seq; + __le32 reg4; + __le32 reg5; + __le32 hq_scan_next_relevant_ack; + __le16 r2tq_prod; + __le16 r2tq_cons; + __le16 hq_prod; + 
__le16 hq_cons; + __le32 remain_seq; + __le32 bytes_to_next_pdu; + __le32 hq_tcp_seq; + u8 byte7; + u8 byte8; + u8 byte9; + u8 byte10; + u8 byte11; + u8 byte12; + u8 byte13; + u8 byte14; + u8 byte15; + u8 e5_reserved; + __le16 word11; + __le32 reg10; + __le32 reg11; + __le32 exp_stat_sn; + __le32 ongoing_fast_rxmit_seq; + __le32 reg14; + __le32 reg15; + __le32 reg16; + __le32 reg17; +}; + +struct e4_tstorm_iscsi_conn_ag_ctx { + u8 reserved0; + u8 state; + u8 flags0; +#define E4_TSTORM_ISCSI_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_BIT2_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_BIT2_SHIFT 2 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_BIT3_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_BIT3_SHIFT 3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_BIT4_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_BIT4_SHIFT 4 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_BIT5_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_BIT5_SHIFT 5 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF0_SHIFT 6 + u8 flags1; +#define E4_TSTORM_ISCSI_CONN_AG_CTX_P2T_FLUSH_CF_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_P2T_FLUSH_CF_SHIFT 0 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_M2T_FLUSH_CF_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_M2T_FLUSH_CF_SHIFT 2 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_SHIFT 4 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF4_SHIFT 6 + u8 flags2; +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF5_SHIFT 0 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF6_SHIFT 2 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF7_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF7_SHIFT 4 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF8_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF8_SHIFT 6 + u8 flags3; +#define E4_TSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q0_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q0_SHIFT 0 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF10_MASK 0x3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF10_SHIFT 2 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF0EN_SHIFT 4 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_P2T_FLUSH_CF_EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_P2T_FLUSH_CF_EN_SHIFT 5 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_M2T_FLUSH_CF_EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_M2T_FLUSH_CF_EN_SHIFT 6 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF4EN_SHIFT 0 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF5EN_SHIFT 1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF6EN_SHIFT 2 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF7EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF7EN_SHIFT 3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF8EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF8EN_SHIFT 4 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q0_EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q0_EN_SHIFT 5 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF10EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_CF10EN_SHIFT 6 +#define 
E4_TSTORM_ISCSI_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags5; +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE5EN_SHIFT 4 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_TSTORM_ISCSI_CONN_AG_CTX_RULE8EN_SHIFT 7 + __le32 reg0; + __le32 reg1; + __le32 rx_tcp_checksum_err_cnt; + __le32 reg3; + __le32 reg4; + __le32 reg5; + __le32 reg6; + __le32 reg7; + __le32 reg8; + u8 cid_offload_cnt; + u8 byte3; + __le16 word0; +}; + +struct e4_ustorm_iscsi_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_USTORM_ISCSI_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_USTORM_ISCSI_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF3_MASK 0x3 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF3_SHIFT 0 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF4_MASK 0x3 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF4_SHIFT 2 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF5_MASK 0x3 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF5_SHIFT 4 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF6_MASK 0x3 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF6_SHIFT 6 + u8 flags2; +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF3EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF3EN_SHIFT 3 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF4EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF4EN_SHIFT 4 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF5EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF5EN_SHIFT 5 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF6EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_CF6EN_SHIFT 6 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE0EN_SHIFT 7 + u8 flags3; +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE1EN_SHIFT 0 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE2EN_SHIFT 1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE3EN_SHIFT 2 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE4EN_SHIFT 3 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE5EN_SHIFT 4 
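/* The u8 flagsN members in these E4_*STORM aggregative-context structures
 * are packed bit-fields; each sub-field is described by a *_MASK (field
 * width) and *_SHIFT (bit position) pair and is meant to be accessed with
 * the driver's GET_FIELD()/SET_FIELD() helpers (used further down in
 * qed_hw.c in this patch).  A minimal sketch of that convention, for
 * illustration only - the EX_* names below are not defined by the driver:
 */
#define EX_GET_FIELD(value, name) \
	(((value) >> name##_SHIFT) & name##_MASK)
#define EX_SET_FIELD(value, name, flag)				\
	do {							\
		(value) &= ~(name##_MASK << name##_SHIFT);	\
		(value) |= ((flag) & name##_MASK) << name##_SHIFT; \
	} while (0)
/* e.g. EX_SET_FIELD(ag_ctx->flags1, E4_USTORM_ISCSI_CONN_AG_CTX_CF3, 1); */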
+#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE6EN_SHIFT 5 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE7EN_SHIFT 6 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE8EN_MASK 0x1 +#define E4_USTORM_ISCSI_CONN_AG_CTX_RULE8EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le16 word2; + __le16 word3; +}; + +/* The iscsi storm connection context of Tstorm */ +struct tstorm_iscsi_conn_st_ctx { + __le32 reserved[44]; +}; + +struct e4_mstorm_iscsi_conn_ag_ctx { + u8 reserved; + u8 state; + u8 flags0; +#define E4_MSTORM_ISCSI_CONN_AG_CTX_BIT0_MASK 0x1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_ISCSI_CONN_AG_CTX_RULE4EN_SHIFT 7 + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; +}; + +/* Combined iSCSI and TCP storm connection of Mstorm */ +struct mstorm_iscsi_tcp_conn_st_ctx { + __le32 reserved_tcp[20]; + __le32 reserved_iscsi[12]; +}; + +/* The iscsi storm context of Ustorm */ +struct ustorm_iscsi_conn_st_ctx { + __le32 reserved[52]; +}; + +/* iscsi connection context */ +struct e4_iscsi_conn_context { + struct ystorm_iscsi_conn_st_ctx ystorm_st_context; + struct pstorm_iscsi_tcp_conn_st_ctx pstorm_st_context; + struct regpair pstorm_st_padding[2]; + struct pb_context xpb2_context; + struct xstorm_iscsi_tcp_conn_st_ctx xstorm_st_context; + struct regpair xstorm_st_padding[2]; + struct e4_xstorm_iscsi_conn_ag_ctx xstorm_ag_context; + struct e4_tstorm_iscsi_conn_ag_ctx tstorm_ag_context; + struct regpair tstorm_ag_padding[2]; + struct timers_context timer_context; + struct e4_ustorm_iscsi_conn_ag_ctx ustorm_ag_context; + struct pb_context upb_context; + struct tstorm_iscsi_conn_st_ctx tstorm_st_context; + struct regpair tstorm_st_padding[2]; + struct e4_mstorm_iscsi_conn_ag_ctx mstorm_ag_context; + struct mstorm_iscsi_tcp_conn_st_ctx mstorm_st_context; + struct ustorm_iscsi_conn_st_ctx ustorm_st_context; +}; + +/* iSCSI init params passed by driver to FW in iSCSI init ramrod */ +struct iscsi_init_ramrod_params { + struct iscsi_spe_func_init iscsi_init_spe; + struct tcp_init_params tcp_init; +}; + +struct e4_ystorm_iscsi_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define E4_YSTORM_ISCSI_CONN_AG_CTX_BIT0_MASK 
0x1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_BIT0_SHIFT 0 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_BIT1_SHIFT 1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF0_MASK 0x3 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF0_SHIFT 2 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF1_MASK 0x3 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF1_SHIFT 4 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF2_MASK 0x3 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF0EN_MASK 0x1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF0EN_SHIFT 0 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF1EN_MASK 0x1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF1EN_SHIFT 1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF2EN_MASK 0x1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_CF2EN_SHIFT 2 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define E4_YSTORM_ISCSI_CONN_AG_CTX_RULE4EN_SHIFT 7 + u8 byte2; + u8 byte3; + __le16 word0; + __le32 reg0; + __le32 reg1; + __le16 word1; + __le16 word2; + __le16 word3; + __le16 word4; + __le32 reg2; + __le32 reg3; +}; + +#define MFW_TRACE_SIGNATURE 0x25071946 + +/* The trace in the buffer */ +#define MFW_TRACE_EVENTID_MASK 0x00ffff +#define MFW_TRACE_PRM_SIZE_MASK 0x0f0000 +#define MFW_TRACE_PRM_SIZE_SHIFT 16 +#define MFW_TRACE_ENTRY_SIZE 3 + +struct mcp_trace { + u32 signature; /* Help to identify that the trace is valid */ + u32 size; /* the size of the trace buffer in bytes */ + u32 curr_level; /* 2 - all will be written to the buffer + * 1 - debug trace will not be written + * 0 - just errors will be written to the buffer + */ + u32 modules_mask[2]; /* a bit per module, 1 means write it, 0 means + * mask it. + */ + + /* Warning: the following pointers are assumed to be 32bits as they are + * used only in the MFW. + */ + u32 trace_prod; /* The next trace will be written to this offset */ + u32 trace_oldest; /* The oldest valid trace starts at this offset + * (usually very close after the current producer). + */ +}; + +#define VF_MAX_STATIC 192 + +#define MCP_GLOB_PATH_MAX 2 +#define MCP_PORT_MAX 2 +#define MCP_GLOB_PORT_MAX 4 +#define MCP_GLOB_FUNC_MAX 16 + +typedef u32 offsize_t; /* In DWORDS !!! 
*/ +/* Offset from the beginning of the MCP scratchpad */ +#define OFFSIZE_OFFSET_SHIFT 0 +#define OFFSIZE_OFFSET_MASK 0x0000ffff +/* Size of specific element (not the whole array if any) */ +#define OFFSIZE_SIZE_SHIFT 16 +#define OFFSIZE_SIZE_MASK 0xffff0000 + +#define SECTION_OFFSET(_offsize) ((((_offsize & \ + OFFSIZE_OFFSET_MASK) >> \ + OFFSIZE_OFFSET_SHIFT) << 2)) + +#define QED_SECTION_SIZE(_offsize) (((_offsize & \ + OFFSIZE_SIZE_MASK) >> \ + OFFSIZE_SIZE_SHIFT) << 2) + +#define SECTION_ADDR(_offsize, idx) (MCP_REG_SCRATCH + \ + SECTION_OFFSET(_offsize) + \ + (QED_SECTION_SIZE(_offsize) * idx)) + +#define SECTION_OFFSIZE_ADDR(_pub_base, _section) \ + (_pub_base + offsetof(struct mcp_public_data, sections[_section])) + +/* PHY configuration */ +struct eth_phy_cfg { + u32 speed; +#define ETH_SPEED_AUTONEG 0 +#define ETH_SPEED_SMARTLINQ 0x8 + + u32 pause; +#define ETH_PAUSE_NONE 0x0 +#define ETH_PAUSE_AUTONEG 0x1 +#define ETH_PAUSE_RX 0x2 +#define ETH_PAUSE_TX 0x4 + + u32 adv_speed; + u32 loopback_mode; +#define ETH_LOOPBACK_NONE (0) +#define ETH_LOOPBACK_INT_PHY (1) +#define ETH_LOOPBACK_EXT_PHY (2) +#define ETH_LOOPBACK_EXT (3) +#define ETH_LOOPBACK_MAC (4) + + u32 eee_cfg; +#define EEE_CFG_EEE_ENABLED BIT(0) +#define EEE_CFG_TX_LPI BIT(1) +#define EEE_CFG_ADV_SPEED_1G BIT(2) +#define EEE_CFG_ADV_SPEED_10G BIT(3) +#define EEE_TX_TIMER_USEC_MASK (0xfffffff0) +#define EEE_TX_TIMER_USEC_OFFSET 4 +#define EEE_TX_TIMER_USEC_BALANCED_TIME (0xa00) +#define EEE_TX_TIMER_USEC_AGGRESSIVE_TIME (0x100) +#define EEE_TX_TIMER_USEC_LATENCY_TIME (0x6000) + + u32 feature_config_flags; +#define ETH_EEE_MODE_ADV_LPI (1 << 0) +}; + +struct port_mf_cfg { + u32 dynamic_cfg; +#define PORT_MF_CFG_OV_TAG_MASK 0x0000ffff +#define PORT_MF_CFG_OV_TAG_SHIFT 0 +#define PORT_MF_CFG_OV_TAG_DEFAULT PORT_MF_CFG_OV_TAG_MASK + + u32 reserved[1]; +}; + +struct eth_stats { + u64 r64; + u64 r127; + u64 r255; + u64 r511; + u64 r1023; + u64 r1518; + + union { + struct { + u64 r1522; + u64 r2047; + u64 r4095; + u64 r9216; + u64 r16383; + } bb0; + struct { + u64 unused1; + u64 r1519_to_max; + u64 unused2; + u64 unused3; + u64 unused4; + } ah0; + } u0; + + u64 rfcs; + u64 rxcf; + u64 rxpf; + u64 rxpp; + u64 raln; + u64 rfcr; + u64 rovr; + u64 rjbr; + u64 rund; + u64 rfrg; + u64 t64; + u64 t127; + u64 t255; + u64 t511; + u64 t1023; + u64 t1518; + + union { + struct { + u64 t2047; + u64 t4095; + u64 t9216; + u64 t16383; + } bb1; + struct { + u64 t1519_to_max; + u64 unused6; + u64 unused7; + u64 unused8; + } ah1; + } u1; + + u64 txpf; + u64 txpp; + + union { + struct { + u64 tlpiec; + u64 tncl; + } bb2; + struct { + u64 unused9; + u64 unused10; + } ah2; + } u2; + + u64 rbyte; + u64 rxuca; + u64 rxmca; + u64 rxbca; + u64 rxpok; + u64 tbyte; + u64 txuca; + u64 txmca; + u64 txbca; + u64 txcf; +}; + +struct brb_stats { + u64 brb_truncate[8]; + u64 brb_discard[8]; +}; + +struct port_stats { + struct brb_stats brb; + struct eth_stats eth; +}; + +struct couple_mode_teaming { + u8 port_cmt[MCP_GLOB_PORT_MAX]; +#define PORT_CMT_IN_TEAM (1 << 0) + +#define PORT_CMT_PORT_ROLE (1 << 1) +#define PORT_CMT_PORT_INACTIVE (0 << 1) +#define PORT_CMT_PORT_ACTIVE (1 << 1) + +#define PORT_CMT_TEAM_MASK (1 << 2) +#define PORT_CMT_TEAM0 (0 << 2) +#define PORT_CMT_TEAM1 (1 << 2) +}; + +#define LLDP_CHASSIS_ID_STAT_LEN 4 +#define LLDP_PORT_ID_STAT_LEN 4 +#define DCBX_MAX_APP_PROTOCOL 32 +#define MAX_SYSTEM_LLDP_TLV_DATA 32 + +enum _lldp_agent { + LLDP_NEAREST_BRIDGE = 0, + LLDP_NEAREST_NON_TPMR_BRIDGE, + LLDP_NEAREST_CUSTOMER_BRIDGE, + 
LLDP_MAX_LLDP_AGENTS +}; + +struct lldp_config_params_s { + u32 config; +#define LLDP_CONFIG_TX_INTERVAL_MASK 0x000000ff +#define LLDP_CONFIG_TX_INTERVAL_SHIFT 0 +#define LLDP_CONFIG_HOLD_MASK 0x00000f00 +#define LLDP_CONFIG_HOLD_SHIFT 8 +#define LLDP_CONFIG_MAX_CREDIT_MASK 0x0000f000 +#define LLDP_CONFIG_MAX_CREDIT_SHIFT 12 +#define LLDP_CONFIG_ENABLE_RX_MASK 0x40000000 +#define LLDP_CONFIG_ENABLE_RX_SHIFT 30 +#define LLDP_CONFIG_ENABLE_TX_MASK 0x80000000 +#define LLDP_CONFIG_ENABLE_TX_SHIFT 31 + u32 local_chassis_id[LLDP_CHASSIS_ID_STAT_LEN]; + u32 local_port_id[LLDP_PORT_ID_STAT_LEN]; +}; + +struct lldp_status_params_s { + u32 prefix_seq_num; + u32 status; + u32 peer_chassis_id[LLDP_CHASSIS_ID_STAT_LEN]; + u32 peer_port_id[LLDP_PORT_ID_STAT_LEN]; + u32 suffix_seq_num; +}; + +struct dcbx_ets_feature { + u32 flags; +#define DCBX_ETS_ENABLED_MASK 0x00000001 +#define DCBX_ETS_ENABLED_SHIFT 0 +#define DCBX_ETS_WILLING_MASK 0x00000002 +#define DCBX_ETS_WILLING_SHIFT 1 +#define DCBX_ETS_ERROR_MASK 0x00000004 +#define DCBX_ETS_ERROR_SHIFT 2 +#define DCBX_ETS_CBS_MASK 0x00000008 +#define DCBX_ETS_CBS_SHIFT 3 +#define DCBX_ETS_MAX_TCS_MASK 0x000000f0 +#define DCBX_ETS_MAX_TCS_SHIFT 4 +#define DCBX_OOO_TC_MASK 0x00000f00 +#define DCBX_OOO_TC_SHIFT 8 + u32 pri_tc_tbl[1]; +#define DCBX_TCP_OOO_TC (4) + +#define NIG_ETS_ISCSI_OOO_CLIENT_OFFSET (DCBX_TCP_OOO_TC + 1) +#define DCBX_CEE_STRICT_PRIORITY 0xf + u32 tc_bw_tbl[2]; + u32 tc_tsa_tbl[2]; +#define DCBX_ETS_TSA_STRICT 0 +#define DCBX_ETS_TSA_CBS 1 +#define DCBX_ETS_TSA_ETS 2 +}; + +#define DCBX_TCP_OOO_TC (4) +#define DCBX_TCP_OOO_K2_4PORT_TC (3) + +struct dcbx_app_priority_entry { + u32 entry; +#define DCBX_APP_PRI_MAP_MASK 0x000000ff +#define DCBX_APP_PRI_MAP_SHIFT 0 +#define DCBX_APP_PRI_0 0x01 +#define DCBX_APP_PRI_1 0x02 +#define DCBX_APP_PRI_2 0x04 +#define DCBX_APP_PRI_3 0x08 +#define DCBX_APP_PRI_4 0x10 +#define DCBX_APP_PRI_5 0x20 +#define DCBX_APP_PRI_6 0x40 +#define DCBX_APP_PRI_7 0x80 +#define DCBX_APP_SF_MASK 0x00000300 +#define DCBX_APP_SF_SHIFT 8 +#define DCBX_APP_SF_ETHTYPE 0 +#define DCBX_APP_SF_PORT 1 +#define DCBX_APP_SF_IEEE_MASK 0x0000f000 +#define DCBX_APP_SF_IEEE_SHIFT 12 +#define DCBX_APP_SF_IEEE_RESERVED 0 +#define DCBX_APP_SF_IEEE_ETHTYPE 1 +#define DCBX_APP_SF_IEEE_TCP_PORT 2 +#define DCBX_APP_SF_IEEE_UDP_PORT 3 +#define DCBX_APP_SF_IEEE_TCP_UDP_PORT 4 + +#define DCBX_APP_PROTOCOL_ID_MASK 0xffff0000 +#define DCBX_APP_PROTOCOL_ID_SHIFT 16 +}; + +struct dcbx_app_priority_feature { + u32 flags; +#define DCBX_APP_ENABLED_MASK 0x00000001 +#define DCBX_APP_ENABLED_SHIFT 0 +#define DCBX_APP_WILLING_MASK 0x00000002 +#define DCBX_APP_WILLING_SHIFT 1 +#define DCBX_APP_ERROR_MASK 0x00000004 +#define DCBX_APP_ERROR_SHIFT 2 +#define DCBX_APP_MAX_TCS_MASK 0x0000f000 +#define DCBX_APP_MAX_TCS_SHIFT 12 +#define DCBX_APP_NUM_ENTRIES_MASK 0x00ff0000 +#define DCBX_APP_NUM_ENTRIES_SHIFT 16 + struct dcbx_app_priority_entry app_pri_tbl[DCBX_MAX_APP_PROTOCOL]; +}; + +struct dcbx_features { + struct dcbx_ets_feature ets; + u32 pfc; +#define DCBX_PFC_PRI_EN_BITMAP_MASK 0x000000ff +#define DCBX_PFC_PRI_EN_BITMAP_SHIFT 0 +#define DCBX_PFC_PRI_EN_BITMAP_PRI_0 0x01 +#define DCBX_PFC_PRI_EN_BITMAP_PRI_1 0x02 +#define DCBX_PFC_PRI_EN_BITMAP_PRI_2 0x04 +#define DCBX_PFC_PRI_EN_BITMAP_PRI_3 0x08 +#define DCBX_PFC_PRI_EN_BITMAP_PRI_4 0x10 +#define DCBX_PFC_PRI_EN_BITMAP_PRI_5 0x20 +#define DCBX_PFC_PRI_EN_BITMAP_PRI_6 0x40 +#define DCBX_PFC_PRI_EN_BITMAP_PRI_7 0x80 + +#define DCBX_PFC_FLAGS_MASK 0x0000ff00 +#define DCBX_PFC_FLAGS_SHIFT 8 +#define 
DCBX_PFC_CAPS_MASK 0x00000f00 +#define DCBX_PFC_CAPS_SHIFT 8 +#define DCBX_PFC_MBC_MASK 0x00004000 +#define DCBX_PFC_MBC_SHIFT 14 +#define DCBX_PFC_WILLING_MASK 0x00008000 +#define DCBX_PFC_WILLING_SHIFT 15 +#define DCBX_PFC_ENABLED_MASK 0x00010000 +#define DCBX_PFC_ENABLED_SHIFT 16 +#define DCBX_PFC_ERROR_MASK 0x00020000 +#define DCBX_PFC_ERROR_SHIFT 17 + + struct dcbx_app_priority_feature app; +}; + +struct dcbx_local_params { + u32 config; +#define DCBX_CONFIG_VERSION_MASK 0x00000007 +#define DCBX_CONFIG_VERSION_SHIFT 0 +#define DCBX_CONFIG_VERSION_DISABLED 0 +#define DCBX_CONFIG_VERSION_IEEE 1 +#define DCBX_CONFIG_VERSION_CEE 2 +#define DCBX_CONFIG_VERSION_STATIC 4 + + u32 flags; + struct dcbx_features features; +}; + +struct dcbx_mib { + u32 prefix_seq_num; + u32 flags; + struct dcbx_features features; + u32 suffix_seq_num; +}; + +struct lldp_system_tlvs_buffer_s { + u16 valid; + u16 length; + u32 data[MAX_SYSTEM_LLDP_TLV_DATA]; +}; + +struct dcb_dscp_map { + u32 flags; +#define DCB_DSCP_ENABLE_MASK 0x1 +#define DCB_DSCP_ENABLE_SHIFT 0 +#define DCB_DSCP_ENABLE 1 + u32 dscp_pri_map[8]; +}; + +struct public_global { + u32 max_path; + u32 max_ports; +#define MODE_1P 1 +#define MODE_2P 2 +#define MODE_3P 3 +#define MODE_4P 4 + u32 debug_mb_offset; + u32 phymod_dbg_mb_offset; + struct couple_mode_teaming cmt; + s32 internal_temperature; + u32 mfw_ver; + u32 running_bundle_id; + s32 external_temperature; + u32 mdump_reason; +}; + +struct fw_flr_mb { + u32 aggint; + u32 opgen_addr; + u32 accum_ack; +}; + +struct public_path { + struct fw_flr_mb flr_mb; + u32 mcp_vf_disabled[VF_MAX_STATIC / 32]; + + u32 process_kill; +#define PROCESS_KILL_COUNTER_MASK 0x0000ffff +#define PROCESS_KILL_COUNTER_SHIFT 0 +#define PROCESS_KILL_GLOB_AEU_BIT_MASK 0xffff0000 +#define PROCESS_KILL_GLOB_AEU_BIT_SHIFT 16 +#define GLOBAL_AEU_BIT(aeu_reg_id, aeu_bit) (aeu_reg_id * 32 + aeu_bit) +}; + +struct public_port { + u32 validity_map; + + u32 link_status; +#define LINK_STATUS_LINK_UP 0x00000001 +#define LINK_STATUS_SPEED_AND_DUPLEX_MASK 0x0000001e +#define LINK_STATUS_SPEED_AND_DUPLEX_1000THD (1 << 1) +#define LINK_STATUS_SPEED_AND_DUPLEX_1000TFD (2 << 1) +#define LINK_STATUS_SPEED_AND_DUPLEX_10G (3 << 1) +#define LINK_STATUS_SPEED_AND_DUPLEX_20G (4 << 1) +#define LINK_STATUS_SPEED_AND_DUPLEX_40G (5 << 1) +#define LINK_STATUS_SPEED_AND_DUPLEX_50G (6 << 1) +#define LINK_STATUS_SPEED_AND_DUPLEX_100G (7 << 1) +#define LINK_STATUS_SPEED_AND_DUPLEX_25G (8 << 1) + +#define LINK_STATUS_AUTO_NEGOTIATE_ENABLED 0x00000020 + +#define LINK_STATUS_AUTO_NEGOTIATE_COMPLETE 0x00000040 +#define LINK_STATUS_PARALLEL_DETECTION_USED 0x00000080 + +#define LINK_STATUS_PFC_ENABLED 0x00000100 +#define LINK_STATUS_LINK_PARTNER_1000TFD_CAPABLE 0x00000200 +#define LINK_STATUS_LINK_PARTNER_1000THD_CAPABLE 0x00000400 +#define LINK_STATUS_LINK_PARTNER_10G_CAPABLE 0x00000800 +#define LINK_STATUS_LINK_PARTNER_20G_CAPABLE 0x00001000 +#define LINK_STATUS_LINK_PARTNER_40G_CAPABLE 0x00002000 +#define LINK_STATUS_LINK_PARTNER_50G_CAPABLE 0x00004000 +#define LINK_STATUS_LINK_PARTNER_100G_CAPABLE 0x00008000 +#define LINK_STATUS_LINK_PARTNER_25G_CAPABLE 0x00010000 + +#define LINK_STATUS_LINK_PARTNER_FLOW_CONTROL_MASK 0x000C0000 +#define LINK_STATUS_LINK_PARTNER_NOT_PAUSE_CAPABLE (0 << 18) +#define LINK_STATUS_LINK_PARTNER_SYMMETRIC_PAUSE (1 << 18) +#define LINK_STATUS_LINK_PARTNER_ASYMMETRIC_PAUSE (2 << 18) +#define LINK_STATUS_LINK_PARTNER_BOTH_PAUSE (3 << 18) + +#define LINK_STATUS_SFP_TX_FAULT 0x00100000 +#define LINK_STATUS_TX_FLOW_CONTROL_ENABLED 
0x00200000 +#define LINK_STATUS_RX_FLOW_CONTROL_ENABLED 0x00400000 +#define LINK_STATUS_RX_SIGNAL_PRESENT 0x00800000 +#define LINK_STATUS_MAC_LOCAL_FAULT 0x01000000 +#define LINK_STATUS_MAC_REMOTE_FAULT 0x02000000 +#define LINK_STATUS_UNSUPPORTED_SPD_REQ 0x04000000 + + u32 link_status1; + u32 ext_phy_fw_version; + u32 drv_phy_cfg_addr; + + u32 port_stx; + + u32 stat_nig_timer; + + struct port_mf_cfg port_mf_config; + struct port_stats stats; + + u32 media_type; +#define MEDIA_UNSPECIFIED 0x0 +#define MEDIA_SFPP_10G_FIBER 0x1 +#define MEDIA_XFP_FIBER 0x2 +#define MEDIA_DA_TWINAX 0x3 +#define MEDIA_BASE_T 0x4 +#define MEDIA_SFP_1G_FIBER 0x5 +#define MEDIA_MODULE_FIBER 0x6 +#define MEDIA_KR 0xf0 +#define MEDIA_NOT_PRESENT 0xff + + u32 lfa_status; + u32 link_change_count; + + struct lldp_config_params_s lldp_config_params[LLDP_MAX_LLDP_AGENTS]; + struct lldp_status_params_s lldp_status_params[LLDP_MAX_LLDP_AGENTS]; + struct lldp_system_tlvs_buffer_s system_lldp_tlvs_buf; + + /* DCBX related MIB */ + struct dcbx_local_params local_admin_dcbx_mib; + struct dcbx_mib remote_dcbx_mib; + struct dcbx_mib operational_dcbx_mib; + + u32 reserved[2]; + u32 transceiver_data; +#define ETH_TRANSCEIVER_STATE_MASK 0x000000FF +#define ETH_TRANSCEIVER_STATE_SHIFT 0x00000000 +#define ETH_TRANSCEIVER_STATE_UNPLUGGED 0x00000000 +#define ETH_TRANSCEIVER_STATE_PRESENT 0x00000001 +#define ETH_TRANSCEIVER_STATE_VALID 0x00000003 +#define ETH_TRANSCEIVER_STATE_UPDATING 0x00000008 + + u32 wol_info; + u32 wol_pkt_len; + u32 wol_pkt_details; + struct dcb_dscp_map dcb_dscp_map; + + u32 eee_status; +#define EEE_ACTIVE_BIT BIT(0) +#define EEE_LD_ADV_STATUS_MASK 0x000000f0 +#define EEE_LD_ADV_STATUS_OFFSET 4 +#define EEE_1G_ADV BIT(1) +#define EEE_10G_ADV BIT(2) +#define EEE_LP_ADV_STATUS_MASK 0x00000f00 +#define EEE_LP_ADV_STATUS_OFFSET 8 +#define EEE_SUPPORTED_SPEED_MASK 0x0000f000 +#define EEE_SUPPORTED_SPEED_OFFSET 12 +#define EEE_1G_SUPPORTED BIT(1) +#define EEE_10G_SUPPORTED BIT(2) + + u32 eee_remote; +#define EEE_REMOTE_TW_TX_MASK 0x0000ffff +#define EEE_REMOTE_TW_TX_OFFSET 0 +#define EEE_REMOTE_TW_RX_MASK 0xffff0000 +#define EEE_REMOTE_TW_RX_OFFSET 16 +}; + +struct public_func { + u32 reserved0[2]; + + u32 mtu_size; + + u32 reserved[7]; + + u32 config; +#define FUNC_MF_CFG_FUNC_HIDE 0x00000001 +#define FUNC_MF_CFG_PAUSE_ON_HOST_RING 0x00000002 +#define FUNC_MF_CFG_PAUSE_ON_HOST_RING_SHIFT 0x00000001 + +#define FUNC_MF_CFG_PROTOCOL_MASK 0x000000f0 +#define FUNC_MF_CFG_PROTOCOL_SHIFT 4 +#define FUNC_MF_CFG_PROTOCOL_ETHERNET 0x00000000 +#define FUNC_MF_CFG_PROTOCOL_ISCSI 0x00000010 +#define FUNC_MF_CFG_PROTOCOL_FCOE 0x00000020 +#define FUNC_MF_CFG_PROTOCOL_ROCE 0x00000030 +#define FUNC_MF_CFG_PROTOCOL_MAX 0x00000030 + +#define FUNC_MF_CFG_MIN_BW_MASK 0x0000ff00 +#define FUNC_MF_CFG_MIN_BW_SHIFT 8 +#define FUNC_MF_CFG_MIN_BW_DEFAULT 0x00000000 +#define FUNC_MF_CFG_MAX_BW_MASK 0x00ff0000 +#define FUNC_MF_CFG_MAX_BW_SHIFT 16 +#define FUNC_MF_CFG_MAX_BW_DEFAULT 0x00640000 + + u32 status; +#define FUNC_STATUS_VLINK_DOWN 0x00000001 + + u32 mac_upper; +#define FUNC_MF_CFG_UPPERMAC_MASK 0x0000ffff +#define FUNC_MF_CFG_UPPERMAC_SHIFT 0 +#define FUNC_MF_CFG_UPPERMAC_DEFAULT FUNC_MF_CFG_UPPERMAC_MASK + u32 mac_lower; +#define FUNC_MF_CFG_LOWERMAC_DEFAULT 0xffffffff + + u32 fcoe_wwn_port_name_upper; + u32 fcoe_wwn_port_name_lower; + + u32 fcoe_wwn_node_name_upper; + u32 fcoe_wwn_node_name_lower; + + u32 ovlan_stag; +#define FUNC_MF_CFG_OV_STAG_MASK 0x0000ffff +#define FUNC_MF_CFG_OV_STAG_SHIFT 0 +#define FUNC_MF_CFG_OV_STAG_DEFAULT 
FUNC_MF_CFG_OV_STAG_MASK + + u32 pf_allocation; + + u32 preserve_data; + + u32 driver_last_activity_ts; + + u32 drv_ack_vf_disabled[VF_MAX_STATIC / 32]; + + u32 drv_id; +#define DRV_ID_PDA_COMP_VER_MASK 0x0000ffff +#define DRV_ID_PDA_COMP_VER_SHIFT 0 + +#define LOAD_REQ_HSI_VERSION 2 +#define DRV_ID_MCP_HSI_VER_MASK 0x00ff0000 +#define DRV_ID_MCP_HSI_VER_SHIFT 16 +#define DRV_ID_MCP_HSI_VER_CURRENT (LOAD_REQ_HSI_VERSION << \ + DRV_ID_MCP_HSI_VER_SHIFT) + +#define DRV_ID_DRV_TYPE_MASK 0x7f000000 +#define DRV_ID_DRV_TYPE_SHIFT 24 +#define DRV_ID_DRV_TYPE_UNKNOWN (0 << DRV_ID_DRV_TYPE_SHIFT) +#define DRV_ID_DRV_TYPE_LINUX (1 << DRV_ID_DRV_TYPE_SHIFT) + +#define DRV_ID_DRV_INIT_HW_MASK 0x80000000 +#define DRV_ID_DRV_INIT_HW_SHIFT 31 +#define DRV_ID_DRV_INIT_HW_FLAG (1 << DRV_ID_DRV_INIT_HW_SHIFT) +}; + +struct mcp_mac { + u32 mac_upper; + u32 mac_lower; +}; + +struct mcp_val64 { + u32 lo; + u32 hi; +}; + +struct mcp_file_att { + u32 nvm_start_addr; + u32 len; +}; + +struct bist_nvm_image_att { + u32 return_code; + u32 image_type; + u32 nvm_start_addr; + u32 len; +}; + +#define MCP_DRV_VER_STR_SIZE 16 +#define MCP_DRV_VER_STR_SIZE_DWORD (MCP_DRV_VER_STR_SIZE / sizeof(u32)) +#define MCP_DRV_NVM_BUF_LEN 32 +struct drv_version_stc { + u32 version; + u8 name[MCP_DRV_VER_STR_SIZE - 4]; +}; + +struct lan_stats_stc { + u64 ucast_rx_pkts; + u64 ucast_tx_pkts; + u32 fcs_err; + u32 rserved; +}; + +struct fcoe_stats_stc { + u64 rx_pkts; + u64 tx_pkts; + u32 fcs_err; + u32 login_failure; +}; + +struct ocbb_data_stc { + u32 ocbb_host_addr; + u32 ocsd_host_addr; + u32 ocsd_req_update_interval; +}; + +#define MAX_NUM_OF_SENSORS 7 +struct temperature_status_stc { + u32 num_of_sensors; + u32 sensor[MAX_NUM_OF_SENSORS]; +}; + +/* crash dump configuration header */ +struct mdump_config_stc { + u32 version; + u32 config; + u32 epoc; + u32 num_of_logs; + u32 valid_logs; +}; + +enum resource_id_enum { + RESOURCE_NUM_SB_E = 0, + RESOURCE_NUM_L2_QUEUE_E = 1, + RESOURCE_NUM_VPORT_E = 2, + RESOURCE_NUM_VMQ_E = 3, + RESOURCE_FACTOR_NUM_RSS_PF_E = 4, + RESOURCE_FACTOR_RSS_PER_VF_E = 5, + RESOURCE_NUM_RL_E = 6, + RESOURCE_NUM_PQ_E = 7, + RESOURCE_NUM_VF_E = 8, + RESOURCE_VFC_FILTER_E = 9, + RESOURCE_ILT_E = 10, + RESOURCE_CQS_E = 11, + RESOURCE_GFT_PROFILES_E = 12, + RESOURCE_NUM_TC_E = 13, + RESOURCE_NUM_RSS_ENGINES_E = 14, + RESOURCE_LL2_QUEUE_E = 15, + RESOURCE_RDMA_STATS_QUEUE_E = 16, + RESOURCE_BDQ_E = 17, + RESOURCE_MAX_NUM, + RESOURCE_NUM_INVALID = 0xFFFFFFFF +}; + +/* Resource ID is to be filled by the driver in the MB request + * Size, offset & flags to be filled by the MFW in the MB response + */ +struct resource_info { + enum resource_id_enum res_id; + u32 size; /* number of allocated resources */ + u32 offset; /* Offset of the 1st resource */ + u32 vf_size; + u32 vf_offset; + u32 flags; +#define RESOURCE_ELEMENT_STRICT (1 << 0) +}; + +#define DRV_ROLE_NONE 0 +#define DRV_ROLE_PREBOOT 1 +#define DRV_ROLE_OS 2 +#define DRV_ROLE_KDUMP 3 + +struct load_req_stc { + u32 drv_ver_0; + u32 drv_ver_1; + u32 fw_ver; + u32 misc0; +#define LOAD_REQ_ROLE_MASK 0x000000FF +#define LOAD_REQ_ROLE_SHIFT 0 +#define LOAD_REQ_LOCK_TO_MASK 0x0000FF00 +#define LOAD_REQ_LOCK_TO_SHIFT 8 +#define LOAD_REQ_LOCK_TO_DEFAULT 0 +#define LOAD_REQ_LOCK_TO_NONE 255 +#define LOAD_REQ_FORCE_MASK 0x000F0000 +#define LOAD_REQ_FORCE_SHIFT 16 +#define LOAD_REQ_FORCE_NONE 0 +#define LOAD_REQ_FORCE_PF 1 +#define LOAD_REQ_FORCE_ALL 2 +#define LOAD_REQ_FLAGS0_MASK 0x00F00000 +#define LOAD_REQ_FLAGS0_SHIFT 20 +#define LOAD_REQ_FLAGS0_AVOID_RESET (0x1 << 0) 
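/* Note that, unlike the storm-context fields earlier in this file, the MFW
 * HSI masks here are already positioned (e.g. LOAD_REQ_LOCK_TO_MASK is
 * 0x0000FF00), so a field is packed as ((val << SHIFT) & MASK).  An
 * illustrative composition of misc0 for an OS-role load request with the
 * default lock timeout; the EX_* helper name is not part of the patch:
 */
#define EX_LOAD_REQ_MISC0(role, lock_to, force)				  \
	((((role) << LOAD_REQ_ROLE_SHIFT) & LOAD_REQ_ROLE_MASK) |	  \
	 (((lock_to) << LOAD_REQ_LOCK_TO_SHIFT) & LOAD_REQ_LOCK_TO_MASK) | \
	 (((force) << LOAD_REQ_FORCE_SHIFT) & LOAD_REQ_FORCE_MASK))
/* e.g. EX_LOAD_REQ_MISC0(DRV_ROLE_OS, LOAD_REQ_LOCK_TO_DEFAULT,
 *			  LOAD_REQ_FORCE_NONE)
 */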
+}; + +struct load_rsp_stc { + u32 drv_ver_0; + u32 drv_ver_1; + u32 fw_ver; + u32 misc0; +#define LOAD_RSP_ROLE_MASK 0x000000FF +#define LOAD_RSP_ROLE_SHIFT 0 +#define LOAD_RSP_HSI_MASK 0x0000FF00 +#define LOAD_RSP_HSI_SHIFT 8 +#define LOAD_RSP_FLAGS0_MASK 0x000F0000 +#define LOAD_RSP_FLAGS0_SHIFT 16 +#define LOAD_RSP_FLAGS0_DRV_EXISTS (0x1 << 0) +}; + +union drv_union_data { + u32 ver_str[MCP_DRV_VER_STR_SIZE_DWORD]; + struct mcp_mac wol_mac; + + struct eth_phy_cfg drv_phy_cfg; + + struct mcp_val64 val64; + + u8 raw_data[MCP_DRV_NVM_BUF_LEN]; + + struct mcp_file_att file_att; + + u32 ack_vf_disabled[VF_MAX_STATIC / 32]; + + struct drv_version_stc drv_version; + + struct lan_stats_stc lan_stats; + struct fcoe_stats_stc fcoe_stats; + struct ocbb_data_stc ocbb_info; + struct temperature_status_stc temp_info; + struct resource_info resource; + struct bist_nvm_image_att nvm_image_att; + struct mdump_config_stc mdump_config; +}; + +struct public_drv_mb { + u32 drv_mb_header; +#define DRV_MSG_CODE_MASK 0xffff0000 +#define DRV_MSG_CODE_LOAD_REQ 0x10000000 +#define DRV_MSG_CODE_LOAD_DONE 0x11000000 +#define DRV_MSG_CODE_INIT_HW 0x12000000 +#define DRV_MSG_CODE_CANCEL_LOAD_REQ 0x13000000 +#define DRV_MSG_CODE_UNLOAD_REQ 0x20000000 +#define DRV_MSG_CODE_UNLOAD_DONE 0x21000000 +#define DRV_MSG_CODE_INIT_PHY 0x22000000 +#define DRV_MSG_CODE_LINK_RESET 0x23000000 +#define DRV_MSG_CODE_SET_DCBX 0x25000000 +#define DRV_MSG_CODE_OV_UPDATE_CURR_CFG 0x26000000 +#define DRV_MSG_CODE_OV_UPDATE_BUS_NUM 0x27000000 +#define DRV_MSG_CODE_OV_UPDATE_BOOT_PROGRESS 0x28000000 +#define DRV_MSG_CODE_OV_UPDATE_STORM_FW_VER 0x29000000 +#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE 0x31000000 +#define DRV_MSG_CODE_BW_UPDATE_ACK 0x32000000 +#define DRV_MSG_CODE_OV_UPDATE_MTU 0x33000000 +#define DRV_MSG_GET_RESOURCE_ALLOC_MSG 0x34000000 +#define DRV_MSG_SET_RESOURCE_VALUE_MSG 0x35000000 +#define DRV_MSG_CODE_OV_UPDATE_WOL 0x38000000 +#define DRV_MSG_CODE_OV_UPDATE_ESWITCH_MODE 0x39000000 + +#define DRV_MSG_CODE_BW_UPDATE_ACK 0x32000000 +#define DRV_MSG_CODE_NIG_DRAIN 0x30000000 +#define DRV_MSG_CODE_S_TAG_UPDATE_ACK 0x3b000000 +#define DRV_MSG_CODE_INITIATE_PF_FLR 0x02010000 +#define DRV_MSG_CODE_VF_DISABLED_DONE 0xc0000000 +#define DRV_MSG_CODE_CFG_VF_MSIX 0xc0010000 +#define DRV_MSG_CODE_CFG_PF_VFS_MSIX 0xc0020000 +#define DRV_MSG_CODE_NVM_PUT_FILE_BEGIN 0x00010000 +#define DRV_MSG_CODE_NVM_PUT_FILE_DATA 0x00020000 +#define DRV_MSG_CODE_NVM_GET_FILE_ATT 0x00030000 +#define DRV_MSG_CODE_NVM_READ_NVRAM 0x00050000 +#define DRV_MSG_CODE_NVM_WRITE_NVRAM 0x00060000 +#define DRV_MSG_CODE_MCP_RESET 0x00090000 +#define DRV_MSG_CODE_SET_VERSION 0x000f0000 +#define DRV_MSG_CODE_MCP_HALT 0x00100000 +#define DRV_MSG_CODE_SET_VMAC 0x00110000 +#define DRV_MSG_CODE_GET_VMAC 0x00120000 +#define DRV_MSG_CODE_VMAC_TYPE_SHIFT 4 +#define DRV_MSG_CODE_VMAC_TYPE_MASK 0x30 +#define DRV_MSG_CODE_VMAC_TYPE_MAC 1 +#define DRV_MSG_CODE_VMAC_TYPE_WWNN 2 +#define DRV_MSG_CODE_VMAC_TYPE_WWPN 3 + +#define DRV_MSG_CODE_GET_STATS 0x00130000 +#define DRV_MSG_CODE_STATS_TYPE_LAN 1 +#define DRV_MSG_CODE_STATS_TYPE_FCOE 2 +#define DRV_MSG_CODE_STATS_TYPE_ISCSI 3 +#define DRV_MSG_CODE_STATS_TYPE_RDMA 4 + +#define DRV_MSG_CODE_MASK_PARITIES 0x001a0000 + +#define DRV_MSG_CODE_BIST_TEST 0x001e0000 +#define DRV_MSG_CODE_SET_LED_MODE 0x00200000 +#define DRV_MSG_CODE_RESOURCE_CMD 0x00230000 + +#define RESOURCE_CMD_REQ_RESC_MASK 0x0000001F +#define RESOURCE_CMD_REQ_RESC_SHIFT 0 +#define RESOURCE_CMD_REQ_OPCODE_MASK 0x000000E0 +#define RESOURCE_CMD_REQ_OPCODE_SHIFT 5 
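/* A drv_mb_header command word carries the message code in its upper
 * 16 bits (DRV_MSG_CODE_MASK) and a rolling sequence number in the lower
 * 16 bits (DRV_MSG_SEQ_NUMBER_MASK, defined a little further below).  An
 * illustrative composition, assuming the caller maintains its own sequence
 * counter; the EX_* helper name is not part of the patch:
 */
#define EX_DRV_MB_HEADER(cmd, seq)					\
	(((cmd) & DRV_MSG_CODE_MASK) | ((seq) & DRV_MSG_SEQ_NUMBER_MASK))
/* e.g. EX_DRV_MB_HEADER(DRV_MSG_CODE_LOAD_REQ, drv_mb_seq) */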
+#define RESOURCE_OPCODE_REQ 1 +#define RESOURCE_OPCODE_REQ_WO_AGING 2 +#define RESOURCE_OPCODE_REQ_W_AGING 3 +#define RESOURCE_OPCODE_RELEASE 4 +#define RESOURCE_OPCODE_FORCE_RELEASE 5 +#define RESOURCE_CMD_REQ_AGE_MASK 0x0000FF00 +#define RESOURCE_CMD_REQ_AGE_SHIFT 8 + +#define RESOURCE_CMD_RSP_OWNER_MASK 0x000000FF +#define RESOURCE_CMD_RSP_OWNER_SHIFT 0 +#define RESOURCE_CMD_RSP_OPCODE_MASK 0x00000700 +#define RESOURCE_CMD_RSP_OPCODE_SHIFT 8 +#define RESOURCE_OPCODE_GNT 1 +#define RESOURCE_OPCODE_BUSY 2 +#define RESOURCE_OPCODE_RELEASED 3 +#define RESOURCE_OPCODE_RELEASED_PREVIOUS 4 +#define RESOURCE_OPCODE_WRONG_OWNER 5 +#define RESOURCE_OPCODE_UNKNOWN_CMD 255 + +#define RESOURCE_DUMP 0 + +#define DRV_MSG_CODE_GET_PF_RDMA_PROTOCOL 0x002b0000 +#define DRV_MSG_CODE_OS_WOL 0x002e0000 + +#define DRV_MSG_CODE_FEATURE_SUPPORT 0x00300000 +#define DRV_MSG_CODE_GET_MFW_FEATURE_SUPPORT 0x00310000 +#define DRV_MSG_SEQ_NUMBER_MASK 0x0000ffff + + u32 drv_mb_param; +#define DRV_MB_PARAM_UNLOAD_WOL_UNKNOWN 0x00000000 +#define DRV_MB_PARAM_UNLOAD_WOL_MCP 0x00000001 +#define DRV_MB_PARAM_UNLOAD_WOL_DISABLED 0x00000002 +#define DRV_MB_PARAM_UNLOAD_WOL_ENABLED 0x00000003 +#define DRV_MB_PARAM_DCBX_NOTIFY_MASK 0x000000FF +#define DRV_MB_PARAM_DCBX_NOTIFY_SHIFT 3 + +#define DRV_MB_PARAM_NVM_LEN_OFFSET 24 + +#define DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_SHIFT 0 +#define DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_MASK 0x000000FF +#define DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_SHIFT 8 +#define DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_MASK 0x0000FF00 +#define DRV_MB_PARAM_LLDP_SEND_MASK 0x00000001 +#define DRV_MB_PARAM_LLDP_SEND_SHIFT 0 + +#define DRV_MB_PARAM_OV_CURR_CFG_SHIFT 0 +#define DRV_MB_PARAM_OV_CURR_CFG_MASK 0x0000000F +#define DRV_MB_PARAM_OV_CURR_CFG_NONE 0 +#define DRV_MB_PARAM_OV_CURR_CFG_OS 1 +#define DRV_MB_PARAM_OV_CURR_CFG_VENDOR_SPEC 2 +#define DRV_MB_PARAM_OV_CURR_CFG_OTHER 3 + +#define DRV_MB_PARAM_OV_STORM_FW_VER_SHIFT 0 +#define DRV_MB_PARAM_OV_STORM_FW_VER_MASK 0xFFFFFFFF +#define DRV_MB_PARAM_OV_STORM_FW_VER_MAJOR_MASK 0xFF000000 +#define DRV_MB_PARAM_OV_STORM_FW_VER_MINOR_MASK 0x00FF0000 +#define DRV_MB_PARAM_OV_STORM_FW_VER_BUILD_MASK 0x0000FF00 +#define DRV_MB_PARAM_OV_STORM_FW_VER_DROP_MASK 0x000000FF + +#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_SHIFT 0 +#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_MASK 0xF +#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_UNKNOWN 0x1 +#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_NOT_LOADED 0x2 +#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_LOADING 0x3 +#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_DISABLED 0x4 +#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_ACTIVE 0x5 + +#define DRV_MB_PARAM_OV_MTU_SIZE_SHIFT 0 +#define DRV_MB_PARAM_OV_MTU_SIZE_MASK 0xFFFFFFFF + +#define DRV_MB_PARAM_WOL_MASK (DRV_MB_PARAM_WOL_DEFAULT | \ + DRV_MB_PARAM_WOL_DISABLED | \ + DRV_MB_PARAM_WOL_ENABLED) +#define DRV_MB_PARAM_WOL_DEFAULT DRV_MB_PARAM_UNLOAD_WOL_MCP +#define DRV_MB_PARAM_WOL_DISABLED DRV_MB_PARAM_UNLOAD_WOL_DISABLED +#define DRV_MB_PARAM_WOL_ENABLED DRV_MB_PARAM_UNLOAD_WOL_ENABLED + +#define DRV_MB_PARAM_ESWITCH_MODE_MASK (DRV_MB_PARAM_ESWITCH_MODE_NONE | \ + DRV_MB_PARAM_ESWITCH_MODE_VEB | \ + DRV_MB_PARAM_ESWITCH_MODE_VEPA) +#define DRV_MB_PARAM_ESWITCH_MODE_NONE 0x0 +#define DRV_MB_PARAM_ESWITCH_MODE_VEB 0x1 +#define DRV_MB_PARAM_ESWITCH_MODE_VEPA 0x2 + +#define DRV_MB_PARAM_SET_LED_MODE_OPER 0x0 +#define DRV_MB_PARAM_SET_LED_MODE_ON 0x1 +#define DRV_MB_PARAM_SET_LED_MODE_OFF 0x2 + + /* Resource Allocation params - Driver version support */ +#define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_MASK 
0xFFFF0000 +#define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_SHIFT 16 +#define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MINOR_MASK 0x0000FFFF +#define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MINOR_SHIFT 0 + +#define DRV_MB_PARAM_BIST_REGISTER_TEST 1 +#define DRV_MB_PARAM_BIST_CLOCK_TEST 2 +#define DRV_MB_PARAM_BIST_NVM_TEST_NUM_IMAGES 3 +#define DRV_MB_PARAM_BIST_NVM_TEST_IMAGE_BY_INDEX 4 + +#define DRV_MB_PARAM_BIST_RC_UNKNOWN 0 +#define DRV_MB_PARAM_BIST_RC_PASSED 1 +#define DRV_MB_PARAM_BIST_RC_FAILED 2 +#define DRV_MB_PARAM_BIST_RC_INVALID_PARAMETER 3 + +#define DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT 0 +#define DRV_MB_PARAM_BIST_TEST_INDEX_MASK 0x000000FF +#define DRV_MB_PARAM_BIST_TEST_IMAGE_INDEX_SHIFT 8 +#define DRV_MB_PARAM_BIST_TEST_IMAGE_INDEX_MASK 0x0000FF00 + +#define DRV_MB_PARAM_FEATURE_SUPPORT_PORT_MASK 0x0000FFFF +#define DRV_MB_PARAM_FEATURE_SUPPORT_PORT_OFFSET 0 +#define DRV_MB_PARAM_FEATURE_SUPPORT_PORT_EEE 0x00000002 + + u32 fw_mb_header; +#define FW_MSG_CODE_MASK 0xffff0000 +#define FW_MSG_CODE_UNSUPPORTED 0x00000000 +#define FW_MSG_CODE_DRV_LOAD_ENGINE 0x10100000 +#define FW_MSG_CODE_DRV_LOAD_PORT 0x10110000 +#define FW_MSG_CODE_DRV_LOAD_FUNCTION 0x10120000 +#define FW_MSG_CODE_DRV_LOAD_REFUSED_PDA 0x10200000 +#define FW_MSG_CODE_DRV_LOAD_REFUSED_HSI_1 0x10210000 +#define FW_MSG_CODE_DRV_LOAD_REFUSED_DIAG 0x10220000 +#define FW_MSG_CODE_DRV_LOAD_REFUSED_HSI 0x10230000 +#define FW_MSG_CODE_DRV_LOAD_REFUSED_REQUIRES_FORCE 0x10300000 +#define FW_MSG_CODE_DRV_LOAD_REFUSED_REJECT 0x10310000 +#define FW_MSG_CODE_DRV_LOAD_DONE 0x11100000 +#define FW_MSG_CODE_DRV_UNLOAD_ENGINE 0x20110000 +#define FW_MSG_CODE_DRV_UNLOAD_PORT 0x20120000 +#define FW_MSG_CODE_DRV_UNLOAD_FUNCTION 0x20130000 +#define FW_MSG_CODE_DRV_UNLOAD_DONE 0x21100000 +#define FW_MSG_CODE_RESOURCE_ALLOC_OK 0x34000000 +#define FW_MSG_CODE_RESOURCE_ALLOC_UNKNOWN 0x35000000 +#define FW_MSG_CODE_RESOURCE_ALLOC_DEPRECATED 0x36000000 +#define FW_MSG_CODE_S_TAG_UPDATE_ACK_DONE 0x3b000000 +#define FW_MSG_CODE_DRV_CFG_VF_MSIX_DONE 0xb0010000 + +#define FW_MSG_CODE_NVM_OK 0x00010000 +#define FW_MSG_CODE_NVM_PUT_FILE_FINISH_OK 0x00400000 +#define FW_MSG_CODE_PHY_OK 0x00110000 +#define FW_MSG_CODE_OK 0x00160000 +#define FW_MSG_CODE_ERROR 0x00170000 + +#define FW_MSG_CODE_OS_WOL_SUPPORTED 0x00800000 +#define FW_MSG_CODE_OS_WOL_NOT_SUPPORTED 0x00810000 +#define FW_MSG_CODE_DRV_CFG_PF_VFS_MSIX_DONE 0x00870000 +#define FW_MSG_SEQ_NUMBER_MASK 0x0000ffff + + u32 fw_mb_param; +#define FW_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_MASK 0xFFFF0000 +#define FW_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_SHIFT 16 +#define FW_MB_PARAM_RESOURCE_ALLOC_VERSION_MINOR_MASK 0x0000FFFF +#define FW_MB_PARAM_RESOURCE_ALLOC_VERSION_MINOR_SHIFT 0 + + /* get pf rdma protocol command responce */ +#define FW_MB_PARAM_GET_PF_RDMA_NONE 0x0 +#define FW_MB_PARAM_GET_PF_RDMA_ROCE 0x1 +#define FW_MB_PARAM_GET_PF_RDMA_IWARP 0x2 +#define FW_MB_PARAM_GET_PF_RDMA_BOTH 0x3 + +/* get MFW feature support response */ +#define FW_MB_PARAM_FEATURE_SUPPORT_EEE 0x00000002 + +#define FW_MB_PARAM_LOAD_DONE_DID_EFUSE_ERROR (1 << 0) + + u32 drv_pulse_mb; +#define DRV_PULSE_SEQ_MASK 0x00007fff +#define DRV_PULSE_SYSTEM_TIME_MASK 0xffff0000 +#define DRV_PULSE_ALWAYS_ALIVE 0x00008000 + + u32 mcp_pulse_mb; +#define MCP_PULSE_SEQ_MASK 0x00007fff +#define MCP_PULSE_ALWAYS_ALIVE 0x00008000 +#define MCP_EVENT_MASK 0xffff0000 +#define MCP_EVENT_OTHER_DRIVER_RESET_REQ 0x00010000 + + union drv_union_data union_data; +}; + +enum MFW_DRV_MSG_TYPE { + MFW_DRV_MSG_LINK_CHANGE, + 
MFW_DRV_MSG_FLR_FW_ACK_FAILED, + MFW_DRV_MSG_VF_DISABLED, + MFW_DRV_MSG_LLDP_DATA_UPDATED, + MFW_DRV_MSG_DCBX_REMOTE_MIB_UPDATED, + MFW_DRV_MSG_DCBX_OPERATIONAL_MIB_UPDATED, + MFW_DRV_MSG_RESERVED4, + MFW_DRV_MSG_BW_UPDATE, + MFW_DRV_MSG_S_TAG_UPDATE, + MFW_DRV_MSG_GET_LAN_STATS, + MFW_DRV_MSG_GET_FCOE_STATS, + MFW_DRV_MSG_GET_ISCSI_STATS, + MFW_DRV_MSG_GET_RDMA_STATS, + MFW_DRV_MSG_BW_UPDATE10, + MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE, + MFW_DRV_MSG_BW_UPDATE11, + MFW_DRV_MSG_MAX +}; + +#define MFW_DRV_MSG_MAX_DWORDS(msgs) (((msgs - 1) >> 2) + 1) +#define MFW_DRV_MSG_DWORD(msg_id) (msg_id >> 2) +#define MFW_DRV_MSG_OFFSET(msg_id) ((msg_id & 0x3) << 3) +#define MFW_DRV_MSG_MASK(msg_id) (0xff << MFW_DRV_MSG_OFFSET(msg_id)) + +struct public_mfw_mb { + u32 sup_msgs; + u32 msg[MFW_DRV_MSG_MAX_DWORDS(MFW_DRV_MSG_MAX)]; + u32 ack[MFW_DRV_MSG_MAX_DWORDS(MFW_DRV_MSG_MAX)]; +}; + +enum public_sections { + PUBLIC_DRV_MB, + PUBLIC_MFW_MB, + PUBLIC_GLOBAL, + PUBLIC_PATH, + PUBLIC_PORT, + PUBLIC_FUNC, + PUBLIC_MAX_SECTIONS +}; + +struct mcp_public_data { + u32 num_sections; + u32 sections[PUBLIC_MAX_SECTIONS]; + struct public_drv_mb drv_mb[MCP_GLOB_FUNC_MAX]; + struct public_mfw_mb mfw_mb[MCP_GLOB_FUNC_MAX]; + struct public_global global; + struct public_path path[MCP_GLOB_PATH_MAX]; + struct public_port port[MCP_GLOB_PORT_MAX]; + struct public_func func[MCP_GLOB_FUNC_MAX]; +}; + +struct nvm_cfg_mac_address { + u32 mac_addr_hi; +#define NVM_CFG_MAC_ADDRESS_HI_MASK 0x0000FFFF +#define NVM_CFG_MAC_ADDRESS_HI_OFFSET 0 + u32 mac_addr_lo; +}; + +struct nvm_cfg1_glob { + u32 generic_cont0; +#define NVM_CFG1_GLOB_MF_MODE_MASK 0x00000FF0 +#define NVM_CFG1_GLOB_MF_MODE_OFFSET 4 +#define NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED 0x0 +#define NVM_CFG1_GLOB_MF_MODE_DEFAULT 0x1 +#define NVM_CFG1_GLOB_MF_MODE_SPIO4 0x2 +#define NVM_CFG1_GLOB_MF_MODE_NPAR1_0 0x3 +#define NVM_CFG1_GLOB_MF_MODE_NPAR1_5 0x4 +#define NVM_CFG1_GLOB_MF_MODE_NPAR2_0 0x5 +#define NVM_CFG1_GLOB_MF_MODE_BD 0x6 +#define NVM_CFG1_GLOB_MF_MODE_UFP 0x7 + u32 engineering_change[3]; + u32 manufacturing_id; + u32 serial_number[4]; + u32 pcie_cfg; + u32 mgmt_traffic; + u32 core_cfg; +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_MASK 0x000000FF +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_OFFSET 0 +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_BB_2X40G 0x0 +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_2X50G 0x1 +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_BB_1X100G 0x2 +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_4X10G_F 0x3 +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_BB_4X10G_E 0x4 +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_BB_4X20G 0x5 +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_1X40G 0xB +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_2X25G 0xC +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_1X25G 0xD +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_4X25G 0xE +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_2X10G 0xF + + u32 e_lane_cfg1; + u32 e_lane_cfg2; + u32 f_lane_cfg1; + u32 f_lane_cfg2; + u32 mps10_preemphasis; + u32 mps10_driver_current; + u32 mps25_preemphasis; + u32 mps25_driver_current; + u32 pci_id; + u32 pci_subsys_id; + u32 bar; + u32 mps10_txfir_main; + u32 mps10_txfir_post; + u32 mps25_txfir_main; + u32 mps25_txfir_post; + u32 manufacture_ver; + u32 manufacture_time; + u32 led_global_settings; + u32 generic_cont1; + u32 mbi_version; +#define NVM_CFG1_GLOB_MBI_VERSION_0_MASK 0x000000FF +#define NVM_CFG1_GLOB_MBI_VERSION_0_OFFSET 0 +#define NVM_CFG1_GLOB_MBI_VERSION_1_MASK 0x0000FF00 +#define NVM_CFG1_GLOB_MBI_VERSION_1_OFFSET 8 +#define NVM_CFG1_GLOB_MBI_VERSION_2_MASK 0x00FF0000 +#define 
NVM_CFG1_GLOB_MBI_VERSION_2_OFFSET 16 + u32 mbi_date; + u32 misc_sig; + u32 device_capabilities; +#define NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ETHERNET 0x1 +#define NVM_CFG1_GLOB_DEVICE_CAPABILITIES_FCOE 0x2 +#define NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ISCSI 0x4 +#define NVM_CFG1_GLOB_DEVICE_CAPABILITIES_ROCE 0x8 + u32 power_dissipated; + u32 power_consumed; + u32 efi_version; + u32 multi_network_modes_capability; + u32 reserved[41]; +}; + +struct nvm_cfg1_path { + u32 reserved[30]; +}; + +struct nvm_cfg1_port { + u32 reserved__m_relocated_to_option_123; + u32 reserved__m_relocated_to_option_124; + u32 generic_cont0; +#define NVM_CFG1_PORT_DCBX_MODE_MASK 0x000F0000 +#define NVM_CFG1_PORT_DCBX_MODE_OFFSET 16 +#define NVM_CFG1_PORT_DCBX_MODE_DISABLED 0x0 +#define NVM_CFG1_PORT_DCBX_MODE_IEEE 0x1 +#define NVM_CFG1_PORT_DCBX_MODE_CEE 0x2 +#define NVM_CFG1_PORT_DCBX_MODE_DYNAMIC 0x3 +#define NVM_CFG1_PORT_DEFAULT_ENABLED_PROTOCOLS_MASK 0x00F00000 +#define NVM_CFG1_PORT_DEFAULT_ENABLED_PROTOCOLS_OFFSET 20 +#define NVM_CFG1_PORT_DEFAULT_ENABLED_PROTOCOLS_ETHERNET 0x1 +#define NVM_CFG1_PORT_DEFAULT_ENABLED_PROTOCOLS_FCOE 0x2 +#define NVM_CFG1_PORT_DEFAULT_ENABLED_PROTOCOLS_ISCSI 0x4 + u32 pcie_cfg; + u32 features; + u32 speed_cap_mask; +#define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_MASK 0x0000FFFF +#define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_OFFSET 0 +#define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G 0x1 +#define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G 0x2 +#define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G 0x8 +#define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G 0x10 +#define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G 0x20 +#define NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G 0x40 + u32 link_settings; +#define NVM_CFG1_PORT_DRV_LINK_SPEED_MASK 0x0000000F +#define NVM_CFG1_PORT_DRV_LINK_SPEED_OFFSET 0 +#define NVM_CFG1_PORT_DRV_LINK_SPEED_AUTONEG 0x0 +#define NVM_CFG1_PORT_DRV_LINK_SPEED_1G 0x1 +#define NVM_CFG1_PORT_DRV_LINK_SPEED_10G 0x2 +#define NVM_CFG1_PORT_DRV_LINK_SPEED_25G 0x4 +#define NVM_CFG1_PORT_DRV_LINK_SPEED_40G 0x5 +#define NVM_CFG1_PORT_DRV_LINK_SPEED_50G 0x6 +#define NVM_CFG1_PORT_DRV_LINK_SPEED_BB_100G 0x7 +#define NVM_CFG1_PORT_DRV_LINK_SPEED_SMARTLINQ 0x8 +#define NVM_CFG1_PORT_DRV_FLOW_CONTROL_MASK 0x00000070 +#define NVM_CFG1_PORT_DRV_FLOW_CONTROL_OFFSET 4 +#define NVM_CFG1_PORT_DRV_FLOW_CONTROL_AUTONEG 0x1 +#define NVM_CFG1_PORT_DRV_FLOW_CONTROL_RX 0x2 +#define NVM_CFG1_PORT_DRV_FLOW_CONTROL_TX 0x4 + u32 phy_cfg; + u32 mgmt_traffic; + + u32 ext_phy; + /* EEE power saving mode */ +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_MASK 0x00FF0000 +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_OFFSET 16 +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_DISABLED 0x0 +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_BALANCED 0x1 +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_AGGRESSIVE 0x2 +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_LOW_LATENCY 0x3 + + u32 mba_cfg1; + u32 mba_cfg2; + u32 vf_cfg; + struct nvm_cfg_mac_address lldp_mac_address; + u32 led_port_settings; + u32 transceiver_00; + u32 device_ids; + u32 board_cfg; + u32 mnm_10g_cap; + u32 mnm_10g_ctrl; + u32 mnm_10g_misc; + u32 mnm_25g_cap; + u32 mnm_25g_ctrl; + u32 mnm_25g_misc; + u32 mnm_40g_cap; + u32 mnm_40g_ctrl; + u32 mnm_40g_misc; + u32 mnm_50g_cap; + u32 mnm_50g_ctrl; + u32 mnm_50g_misc; + u32 mnm_100g_cap; + u32 mnm_100g_ctrl; + u32 mnm_100g_misc; + u32 reserved[116]; +}; + +struct nvm_cfg1_func { + struct nvm_cfg_mac_address mac_address; + u32 rsrv1; + u32 rsrv2; + u32 device_id; + u32 cmn_cfg; + u32 pci_cfg; + struct 
nvm_cfg_mac_address fcoe_node_wwn_mac_addr; + struct nvm_cfg_mac_address fcoe_port_wwn_mac_addr; + u32 preboot_generic_cfg; + u32 reserved[8]; +}; + +struct nvm_cfg1 { + struct nvm_cfg1_glob glob; + struct nvm_cfg1_path path[MCP_GLOB_PATH_MAX]; + struct nvm_cfg1_port port[MCP_GLOB_PORT_MAX]; + struct nvm_cfg1_func func[MCP_GLOB_FUNC_MAX]; +}; + +enum spad_sections { + SPAD_SECTION_TRACE, + SPAD_SECTION_NVM_CFG, + SPAD_SECTION_PUBLIC, + SPAD_SECTION_PRIVATE, + SPAD_SECTION_MAX +}; + +#define MCP_TRACE_SIZE 2048 /* 2kb */ + +/* This section is located at a fixed location in the beginning of the + * scratchpad, to ensure that the MCP trace is not run over during MFW upgrade. + * All the rest of data has a floating location which differs from version to + * version, and is pointed by the mcp_meta_data below. + * Moreover, the spad_layout section is part of the MFW firmware, and is loaded + * with it from nvram in order to clear this portion. + */ +struct static_init { + u32 num_sections; + offsize_t sections[SPAD_SECTION_MAX]; +#define SECTION(_sec_) (*((offsize_t *)(STRUCT_OFFSET(sections[_sec_])))) + + struct mcp_trace trace; +#define MCP_TRACE_P ((struct mcp_trace *)(STRUCT_OFFSET(trace))) + u8 trace_buffer[MCP_TRACE_SIZE]; +#define MCP_TRACE_BUF ((u8 *)(STRUCT_OFFSET(trace_buffer))) + /* running_mfw has the same definition as in nvm_map.h. + * This bit indicate both the running dir, and the running bundle. + * It is set once when the LIM is loaded. + */ + u32 running_mfw; +#define RUNNING_MFW (*((u32 *)(STRUCT_OFFSET(running_mfw)))) + u32 build_time; +#define MFW_BUILD_TIME (*((u32 *)(STRUCT_OFFSET(build_time)))) + u32 reset_type; +#define RESET_TYPE (*((u32 *)(STRUCT_OFFSET(reset_type)))) + u32 mfw_secure_mode; +#define MFW_SECURE_MODE (*((u32 *)(STRUCT_OFFSET(mfw_secure_mode)))) + u16 pme_status_pf_bitmap; +#define PME_STATUS_PF_BITMAP (*((u16 *)(STRUCT_OFFSET(pme_status_pf_bitmap)))) + u16 pme_enable_pf_bitmap; +#define PME_ENABLE_PF_BITMAP (*((u16 *)(STRUCT_OFFSET(pme_enable_pf_bitmap)))) + u32 mim_nvm_addr; + u32 mim_start_addr; + u32 ah_pcie_link_params; +#define AH_PCIE_LINK_PARAMS_LINK_SPEED_MASK (0x000000ff) +#define AH_PCIE_LINK_PARAMS_LINK_SPEED_SHIFT (0) +#define AH_PCIE_LINK_PARAMS_LINK_WIDTH_MASK (0x0000ff00) +#define AH_PCIE_LINK_PARAMS_LINK_WIDTH_SHIFT (8) +#define AH_PCIE_LINK_PARAMS_ASPM_MODE_MASK (0x00ff0000) +#define AH_PCIE_LINK_PARAMS_ASPM_MODE_SHIFT (16) +#define AH_PCIE_LINK_PARAMS_ASPM_CAP_MASK (0xff000000) +#define AH_PCIE_LINK_PARAMS_ASPM_CAP_SHIFT (24) +#define AH_PCIE_LINK_PARAMS (*((u32 *)(STRUCT_OFFSET(ah_pcie_link_params)))) + + u32 rsrv_persist[5]; /* Persist reserved for MFW upgrades */ +}; + +#define NVM_MAGIC_VALUE 0x669955aa + +enum nvm_image_type { + NVM_TYPE_TIM1 = 0x01, + NVM_TYPE_TIM2 = 0x02, + NVM_TYPE_MIM1 = 0x03, + NVM_TYPE_MIM2 = 0x04, + NVM_TYPE_MBA = 0x05, + NVM_TYPE_MODULES_PN = 0x06, + NVM_TYPE_VPD = 0x07, + NVM_TYPE_MFW_TRACE1 = 0x08, + NVM_TYPE_MFW_TRACE2 = 0x09, + NVM_TYPE_NVM_CFG1 = 0x0a, + NVM_TYPE_L2B = 0x0b, + NVM_TYPE_DIR1 = 0x0c, + NVM_TYPE_EAGLE_FW1 = 0x0d, + NVM_TYPE_FALCON_FW1 = 0x0e, + NVM_TYPE_PCIE_FW1 = 0x0f, + NVM_TYPE_HW_SET = 0x10, + NVM_TYPE_LIM = 0x11, + NVM_TYPE_AVS_FW1 = 0x12, + NVM_TYPE_DIR2 = 0x13, + NVM_TYPE_CCM = 0x14, + NVM_TYPE_EAGLE_FW2 = 0x15, + NVM_TYPE_FALCON_FW2 = 0x16, + NVM_TYPE_PCIE_FW2 = 0x17, + NVM_TYPE_AVS_FW2 = 0x18, + NVM_TYPE_INIT_HW = 0x19, + NVM_TYPE_DEFAULT_CFG = 0x1a, + NVM_TYPE_MDUMP = 0x1b, + NVM_TYPE_META = 0x1c, + NVM_TYPE_ISCSI_CFG = 0x1d, + NVM_TYPE_FCOE_CFG = 0x1f, + NVM_TYPE_ETH_PHY_FW1 = 
0x20, + NVM_TYPE_ETH_PHY_FW2 = 0x21, + NVM_TYPE_MAX, +}; + +#define DIR_ID_1 (0) + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c new file mode 100644 index 0000000..fca2dbd --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c @@ -0,0 +1,877 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_reg_addr.h" +#include "qed_sriov.h" + +#define QED_BAR_ACQUIRE_TIMEOUT 1000 + +/* Invalid values */ +#define QED_BAR_INVALID_OFFSET (cpu_to_le32(-1)) + +struct qed_ptt { + struct list_head list_entry; + unsigned int idx; + struct pxp_ptt_entry pxp; + u8 hwfn_id; +}; + +struct qed_ptt_pool { + struct list_head free_list; + spinlock_t lock; /* ptt synchronized access */ + struct qed_ptt ptts[PXP_EXTERNAL_BAR_PF_WINDOW_NUM]; +}; + +int qed_ptt_pool_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_ptt_pool *p_pool = kmalloc(sizeof(*p_pool), GFP_KERNEL); + int i; + + if (!p_pool) + return -ENOMEM; + + INIT_LIST_HEAD(&p_pool->free_list); + for (i = 0; i < PXP_EXTERNAL_BAR_PF_WINDOW_NUM; i++) { + p_pool->ptts[i].idx = i; + p_pool->ptts[i].pxp.offset = QED_BAR_INVALID_OFFSET; + p_pool->ptts[i].pxp.pretend.control = 0; + p_pool->ptts[i].hwfn_id = p_hwfn->my_id; + if (i >= RESERVED_PTT_MAX) + list_add(&p_pool->ptts[i].list_entry, + &p_pool->free_list); + } + + p_hwfn->p_ptt_pool = p_pool; + spin_lock_init(&p_pool->lock); + + return 0; +} + +void qed_ptt_invalidate(struct qed_hwfn *p_hwfn) +{ + struct qed_ptt *p_ptt; + int i; + + for (i = 0; i < PXP_EXTERNAL_BAR_PF_WINDOW_NUM; i++) { + p_ptt = &p_hwfn->p_ptt_pool->ptts[i]; + p_ptt->pxp.offset = QED_BAR_INVALID_OFFSET; + } +} + +void qed_ptt_pool_free(struct qed_hwfn *p_hwfn) +{ + kfree(p_hwfn->p_ptt_pool); + p_hwfn->p_ptt_pool = NULL; +} + +struct qed_ptt *qed_ptt_acquire(struct qed_hwfn *p_hwfn) +{ + struct qed_ptt *p_ptt; + unsigned int i; + + /* Take the free PTT from the list */ + for (i = 0; i < QED_BAR_ACQUIRE_TIMEOUT; i++) { + 
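	/* Each attempt takes the pool lock and tries to pop a free PTT
	 * window; on failure it sleeps ~1-2 ms and retries, for at most
	 * QED_BAR_ACQUIRE_TIMEOUT iterations.  A typical caller pattern
	 * (illustrative only):
	 *
	 *	struct qed_ptt *p_ptt = qed_ptt_acquire(p_hwfn);
	 *
	 *	if (!p_ptt)
	 *		return -EBUSY;
	 *	val = qed_rd(p_hwfn, p_ptt, hw_addr);
	 *	qed_ptt_release(p_hwfn, p_ptt);
	 */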
spin_lock_bh(&p_hwfn->p_ptt_pool->lock); + + if (!list_empty(&p_hwfn->p_ptt_pool->free_list)) { + p_ptt = list_first_entry(&p_hwfn->p_ptt_pool->free_list, + struct qed_ptt, list_entry); + list_del(&p_ptt->list_entry); + + spin_unlock_bh(&p_hwfn->p_ptt_pool->lock); + + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "allocated ptt %d\n", p_ptt->idx); + return p_ptt; + } + + spin_unlock_bh(&p_hwfn->p_ptt_pool->lock); + usleep_range(1000, 2000); + } + + DP_NOTICE(p_hwfn, "PTT acquire timeout - failed to allocate PTT\n"); + return NULL; +} + +void qed_ptt_release(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + spin_lock_bh(&p_hwfn->p_ptt_pool->lock); + list_add(&p_ptt->list_entry, &p_hwfn->p_ptt_pool->free_list); + spin_unlock_bh(&p_hwfn->p_ptt_pool->lock); +} + +u32 qed_ptt_get_hw_addr(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + /* The HW is using DWORDS and we need to translate it to Bytes */ + return le32_to_cpu(p_ptt->pxp.offset) << 2; +} + +static u32 qed_ptt_config_addr(struct qed_ptt *p_ptt) +{ + return PXP_PF_WINDOW_ADMIN_PER_PF_START + + p_ptt->idx * sizeof(struct pxp_ptt_entry); +} + +u32 qed_ptt_get_bar_addr(struct qed_ptt *p_ptt) +{ + return PXP_EXTERNAL_BAR_PF_WINDOW_START + + p_ptt->idx * PXP_EXTERNAL_BAR_PF_WINDOW_SINGLE_SIZE; +} + +void qed_ptt_set_win(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 new_hw_addr) +{ + u32 prev_hw_addr; + + prev_hw_addr = qed_ptt_get_hw_addr(p_hwfn, p_ptt); + + if (new_hw_addr == prev_hw_addr) + return; + + /* Update PTT entery in admin window */ + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "Updating PTT entry %d to offset 0x%x\n", + p_ptt->idx, new_hw_addr); + + /* The HW is using DWORDS and the address is in Bytes */ + p_ptt->pxp.offset = cpu_to_le32(new_hw_addr >> 2); + + REG_WR(p_hwfn, + qed_ptt_config_addr(p_ptt) + + offsetof(struct pxp_ptt_entry, offset), + le32_to_cpu(p_ptt->pxp.offset)); +} + +static u32 qed_set_ptt(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 hw_addr) +{ + u32 win_hw_addr = qed_ptt_get_hw_addr(p_hwfn, p_ptt); + u32 offset; + + offset = hw_addr - win_hw_addr; + + if (p_ptt->hwfn_id != p_hwfn->my_id) + DP_NOTICE(p_hwfn, + "ptt[%d] of hwfn[%02x] is used by hwfn[%02x]!\n", + p_ptt->idx, p_ptt->hwfn_id, p_hwfn->my_id); + + /* Verify the address is within the window */ + if (hw_addr < win_hw_addr || + offset >= PXP_EXTERNAL_BAR_PF_WINDOW_SINGLE_SIZE) { + qed_ptt_set_win(p_hwfn, p_ptt, hw_addr); + offset = 0; + } + + return qed_ptt_get_bar_addr(p_ptt) + offset; +} + +struct qed_ptt *qed_get_reserved_ptt(struct qed_hwfn *p_hwfn, + enum reserved_ptts ptt_idx) +{ + if (ptt_idx >= RESERVED_PTT_MAX) { + DP_NOTICE(p_hwfn, + "Requested PTT %d is out of range\n", ptt_idx); + return NULL; + } + + return &p_hwfn->p_ptt_pool->ptts[ptt_idx]; +} + +void qed_wr(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 hw_addr, u32 val) +{ + u32 bar_addr = qed_set_ptt(p_hwfn, p_ptt, hw_addr); + + REG_WR(p_hwfn, bar_addr, val); + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "bar_addr 0x%x, hw_addr 0x%x, val 0x%x\n", + bar_addr, hw_addr, val); +} + +u32 qed_rd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 hw_addr) +{ + u32 bar_addr = qed_set_ptt(p_hwfn, p_ptt, hw_addr); + u32 val = REG_RD(p_hwfn, bar_addr); + + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "bar_addr 0x%x, hw_addr 0x%x, val 0x%x\n", + bar_addr, hw_addr, val); + + return val; +} + +static void qed_memcpy_hw(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + void *addr, u32 hw_addr, size_t n, bool to_device) +{ + u32 dw_count, *host_addr, hw_offset; + size_t quota, done = 0; + u32 
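/* Note on the window arithmetic above: pxp.offset holds the window base in
 * dwords while callers pass byte addresses, hence the ">> 2" when programming
 * the window in qed_ptt_set_win() and the "<< 2" when reading it back in
 * qed_ptt_get_hw_addr(). A GRC byte address of 0x1f0104, for example, is
 * stored as 0x7c041 and reconstructed as 0x1f0104.
 */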
__iomem *reg_addr; + + while (done < n) { + quota = min_t(size_t, n - done, + PXP_EXTERNAL_BAR_PF_WINDOW_SINGLE_SIZE); + + if (IS_PF(p_hwfn->cdev)) { + qed_ptt_set_win(p_hwfn, p_ptt, hw_addr + done); + hw_offset = qed_ptt_get_bar_addr(p_ptt); + } else { + hw_offset = hw_addr + done; + } + + dw_count = quota / 4; + host_addr = (u32 *)((u8 *)addr + done); + reg_addr = (u32 __iomem *)REG_ADDR(p_hwfn, hw_offset); + if (to_device) + while (dw_count--) + DIRECT_REG_WR(reg_addr++, *host_addr++); + else + while (dw_count--) + *host_addr++ = DIRECT_REG_RD(reg_addr++); + + done += quota; + } +} + +void qed_memcpy_from(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, void *dest, u32 hw_addr, size_t n) +{ + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "hw_addr 0x%x, dest %p hw_addr 0x%x, size %lu\n", + hw_addr, dest, hw_addr, (unsigned long)n); + + qed_memcpy_hw(p_hwfn, p_ptt, dest, hw_addr, n, false); +} + +void qed_memcpy_to(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 hw_addr, void *src, size_t n) +{ + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "hw_addr 0x%x, hw_addr 0x%x, src %p size %lu\n", + hw_addr, hw_addr, src, (unsigned long)n); + + qed_memcpy_hw(p_hwfn, p_ptt, src, hw_addr, n, true); +} + +void qed_fid_pretend(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u16 fid) +{ + u16 control = 0; + + SET_FIELD(control, PXP_PRETEND_CMD_IS_CONCRETE, 1); + SET_FIELD(control, PXP_PRETEND_CMD_PRETEND_FUNCTION, 1); + + /* Every pretend undos previous pretends, including + * previous port pretend. + */ + SET_FIELD(control, PXP_PRETEND_CMD_PORT, 0); + SET_FIELD(control, PXP_PRETEND_CMD_USE_PORT, 0); + SET_FIELD(control, PXP_PRETEND_CMD_PRETEND_PORT, 1); + + if (!GET_FIELD(fid, PXP_CONCRETE_FID_VFVALID)) + fid = GET_FIELD(fid, PXP_CONCRETE_FID_PFID); + + p_ptt->pxp.pretend.control = cpu_to_le16(control); + p_ptt->pxp.pretend.fid.concrete_fid.fid = cpu_to_le16(fid); + + REG_WR(p_hwfn, + qed_ptt_config_addr(p_ptt) + + offsetof(struct pxp_ptt_entry, pretend), + *(u32 *)&p_ptt->pxp.pretend); +} + +void qed_port_pretend(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 port_id) +{ + u16 control = 0; + + SET_FIELD(control, PXP_PRETEND_CMD_PORT, port_id); + SET_FIELD(control, PXP_PRETEND_CMD_USE_PORT, 1); + SET_FIELD(control, PXP_PRETEND_CMD_PRETEND_PORT, 1); + + p_ptt->pxp.pretend.control = cpu_to_le16(control); + + REG_WR(p_hwfn, + qed_ptt_config_addr(p_ptt) + + offsetof(struct pxp_ptt_entry, pretend), + *(u32 *)&p_ptt->pxp.pretend); +} + +void qed_port_unpretend(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u16 control = 0; + + SET_FIELD(control, PXP_PRETEND_CMD_PORT, 0); + SET_FIELD(control, PXP_PRETEND_CMD_USE_PORT, 0); + SET_FIELD(control, PXP_PRETEND_CMD_PRETEND_PORT, 1); + + p_ptt->pxp.pretend.control = cpu_to_le16(control); + + REG_WR(p_hwfn, + qed_ptt_config_addr(p_ptt) + + offsetof(struct pxp_ptt_entry, pretend), + *(u32 *)&p_ptt->pxp.pretend); +} + +u32 qed_vfid_to_concrete(struct qed_hwfn *p_hwfn, u8 vfid) +{ + u32 concrete_fid = 0; + + SET_FIELD(concrete_fid, PXP_CONCRETE_FID_PFID, p_hwfn->rel_pf_id); + SET_FIELD(concrete_fid, PXP_CONCRETE_FID_VFID, vfid); + SET_FIELD(concrete_fid, PXP_CONCRETE_FID_VFVALID, 1); + + return concrete_fid; +} + +/* DMAE */ +static void qed_dmae_opcode(struct qed_hwfn *p_hwfn, + const u8 is_src_type_grc, + const u8 is_dst_type_grc, + struct qed_dmae_params *p_params) +{ + u16 opcode_b = 0; + u32 opcode = 0; + + /* Whether the source is the PCIe or the GRC. + * 0- The source is the PCIe + * 1- The source is the GRC. + */ + opcode |= (is_src_type_grc ? 
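/* Illustrative sketch (not a driver function): qed_port_pretend() and
 * qed_port_unpretend() above are intended to bracket accesses made on behalf
 * of another port through the same PTT window. "hyp_port_reg" is a
 * hypothetical per-port GRC register offset used only for the example.
 */
static u32 example_read_as_port(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
				u8 peer_port, u32 hyp_port_reg)
{
	u32 val;

	qed_port_pretend(p_hwfn, p_ptt, peer_port);
	val = qed_rd(p_hwfn, p_ptt, hyp_port_reg);
	qed_port_unpretend(p_hwfn, p_ptt);

	return val;
}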
DMAE_CMD_SRC_MASK_GRC + : DMAE_CMD_SRC_MASK_PCIE) << + DMAE_CMD_SRC_SHIFT; + opcode |= ((p_hwfn->rel_pf_id & DMAE_CMD_SRC_PF_ID_MASK) << + DMAE_CMD_SRC_PF_ID_SHIFT); + + /* The destination of the DMA can be: 0-None 1-PCIe 2-GRC 3-None */ + opcode |= (is_dst_type_grc ? DMAE_CMD_DST_MASK_GRC + : DMAE_CMD_DST_MASK_PCIE) << + DMAE_CMD_DST_SHIFT; + opcode |= ((p_hwfn->rel_pf_id & DMAE_CMD_DST_PF_ID_MASK) << + DMAE_CMD_DST_PF_ID_SHIFT); + + /* Whether to write a completion word to the completion destination: + * 0-Do not write a completion word + * 1-Write the completion word + */ + opcode |= (DMAE_CMD_COMP_WORD_EN_MASK << DMAE_CMD_COMP_WORD_EN_SHIFT); + opcode |= (DMAE_CMD_SRC_ADDR_RESET_MASK << + DMAE_CMD_SRC_ADDR_RESET_SHIFT); + + if (p_params->flags & QED_DMAE_FLAG_COMPLETION_DST) + opcode |= (1 << DMAE_CMD_COMP_FUNC_SHIFT); + + opcode |= (DMAE_CMD_ENDIANITY << DMAE_CMD_ENDIANITY_MODE_SHIFT); + + opcode |= ((p_hwfn->port_id) << DMAE_CMD_PORT_ID_SHIFT); + + /* reset source address in next go */ + opcode |= (DMAE_CMD_SRC_ADDR_RESET_MASK << + DMAE_CMD_SRC_ADDR_RESET_SHIFT); + + /* reset dest address in next go */ + opcode |= (DMAE_CMD_DST_ADDR_RESET_MASK << + DMAE_CMD_DST_ADDR_RESET_SHIFT); + + /* SRC/DST VFID: all 1's - pf, otherwise VF id */ + if (p_params->flags & QED_DMAE_FLAG_VF_SRC) { + opcode |= 1 << DMAE_CMD_SRC_VF_ID_VALID_SHIFT; + opcode_b |= p_params->src_vfid << DMAE_CMD_SRC_VF_ID_SHIFT; + } else { + opcode_b |= DMAE_CMD_SRC_VF_ID_MASK << + DMAE_CMD_SRC_VF_ID_SHIFT; + } + + if (p_params->flags & QED_DMAE_FLAG_VF_DST) { + opcode |= 1 << DMAE_CMD_DST_VF_ID_VALID_SHIFT; + opcode_b |= p_params->dst_vfid << DMAE_CMD_DST_VF_ID_SHIFT; + } else { + opcode_b |= DMAE_CMD_DST_VF_ID_MASK << DMAE_CMD_DST_VF_ID_SHIFT; + } + + p_hwfn->dmae_info.p_dmae_cmd->opcode = cpu_to_le32(opcode); + p_hwfn->dmae_info.p_dmae_cmd->opcode_b = cpu_to_le16(opcode_b); +} + +u32 qed_dmae_idx_to_go_cmd(u8 idx) +{ + /* All the DMAE 'go' registers form an array in internal memory */ + return DMAE_REG_GO_C0 + (idx << 2); +} + +static int qed_dmae_post_command(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct dmae_cmd *p_command = p_hwfn->dmae_info.p_dmae_cmd; + u8 idx_cmd = p_hwfn->dmae_info.channel, i; + int qed_status = 0; + + /* verify address is not NULL */ + if ((((!p_command->dst_addr_lo) && (!p_command->dst_addr_hi)) || + ((!p_command->src_addr_lo) && (!p_command->src_addr_hi)))) { + DP_NOTICE(p_hwfn, + "source or destination address 0 idx_cmd=%d\n" + "opcode = [0x%08x,0x%04x] len=0x%x src=0x%x:%x dst=0x%x:%x\n", + idx_cmd, + le32_to_cpu(p_command->opcode), + le16_to_cpu(p_command->opcode_b), + le16_to_cpu(p_command->length_dw), + le32_to_cpu(p_command->src_addr_hi), + le32_to_cpu(p_command->src_addr_lo), + le32_to_cpu(p_command->dst_addr_hi), + le32_to_cpu(p_command->dst_addr_lo)); + + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, + NETIF_MSG_HW, + "Posting DMAE command [idx %d]: opcode = [0x%08x,0x%04x] len=0x%x src=0x%x:%x dst=0x%x:%x\n", + idx_cmd, + le32_to_cpu(p_command->opcode), + le16_to_cpu(p_command->opcode_b), + le16_to_cpu(p_command->length_dw), + le32_to_cpu(p_command->src_addr_hi), + le32_to_cpu(p_command->src_addr_lo), + le32_to_cpu(p_command->dst_addr_hi), + le32_to_cpu(p_command->dst_addr_lo)); + + /* Copy the command to DMAE - need to do it before every call + * for source/dest address no reset. + * The first 9 DWs are the command registers, the 10 DW is the + * GO register, and the rest are result registers + * (which are read only by the client). 
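 *
 * Worked numbers: with DMAE_CMD_SIZE = 14 and DMAE_CMD_SIZE_TO_FILL = 14 - 5
 * = 9 (see qed_hw.h), the loop below copies dwords 0..8 of the command
 * structure and writes zero to dwords 9..13, so the GO and result registers
 * never carry stale values from a previous command.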
+ */ + for (i = 0; i < DMAE_CMD_SIZE; i++) { + u32 data = (i < DMAE_CMD_SIZE_TO_FILL) ? + *(((u32 *)p_command) + i) : 0; + + qed_wr(p_hwfn, p_ptt, + DMAE_REG_CMD_MEM + + (idx_cmd * DMAE_CMD_SIZE * sizeof(u32)) + + (i * sizeof(u32)), data); + } + + qed_wr(p_hwfn, p_ptt, qed_dmae_idx_to_go_cmd(idx_cmd), DMAE_GO_VALUE); + + return qed_status; +} + +int qed_dmae_info_alloc(struct qed_hwfn *p_hwfn) +{ + dma_addr_t *p_addr = &p_hwfn->dmae_info.completion_word_phys_addr; + struct dmae_cmd **p_cmd = &p_hwfn->dmae_info.p_dmae_cmd; + u32 **p_buff = &p_hwfn->dmae_info.p_intermediate_buffer; + u32 **p_comp = &p_hwfn->dmae_info.p_completion_word; + + *p_comp = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(u32), p_addr, GFP_KERNEL); + if (!*p_comp) + goto err; + + p_addr = &p_hwfn->dmae_info.dmae_cmd_phys_addr; + *p_cmd = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct dmae_cmd), + p_addr, GFP_KERNEL); + if (!*p_cmd) + goto err; + + p_addr = &p_hwfn->dmae_info.intermediate_buffer_phys_addr; + *p_buff = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(u32) * DMAE_MAX_RW_SIZE, + p_addr, GFP_KERNEL); + if (!*p_buff) + goto err; + + p_hwfn->dmae_info.channel = p_hwfn->rel_pf_id; + + return 0; +err: + qed_dmae_info_free(p_hwfn); + return -ENOMEM; +} + +void qed_dmae_info_free(struct qed_hwfn *p_hwfn) +{ + dma_addr_t p_phys; + + /* Just make sure no one is in the middle */ + mutex_lock(&p_hwfn->dmae_info.mutex); + + if (p_hwfn->dmae_info.p_completion_word) { + p_phys = p_hwfn->dmae_info.completion_word_phys_addr; + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(u32), + p_hwfn->dmae_info.p_completion_word, p_phys); + p_hwfn->dmae_info.p_completion_word = NULL; + } + + if (p_hwfn->dmae_info.p_dmae_cmd) { + p_phys = p_hwfn->dmae_info.dmae_cmd_phys_addr; + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct dmae_cmd), + p_hwfn->dmae_info.p_dmae_cmd, p_phys); + p_hwfn->dmae_info.p_dmae_cmd = NULL; + } + + if (p_hwfn->dmae_info.p_intermediate_buffer) { + p_phys = p_hwfn->dmae_info.intermediate_buffer_phys_addr; + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(u32) * DMAE_MAX_RW_SIZE, + p_hwfn->dmae_info.p_intermediate_buffer, + p_phys); + p_hwfn->dmae_info.p_intermediate_buffer = NULL; + } + + mutex_unlock(&p_hwfn->dmae_info.mutex); +} + +static int qed_dmae_operation_wait(struct qed_hwfn *p_hwfn) +{ + u32 wait_cnt_limit = 10000, wait_cnt = 0; + int qed_status = 0; + + barrier(); + while (*p_hwfn->dmae_info.p_completion_word != DMAE_COMPLETION_VAL) { + udelay(DMAE_MIN_WAIT_TIME); + if (++wait_cnt > wait_cnt_limit) { + DP_NOTICE(p_hwfn->cdev, + "Timed-out waiting for operation to complete. 
Completion word is 0x%08x expected 0x%08x.\n", + *p_hwfn->dmae_info.p_completion_word, + DMAE_COMPLETION_VAL); + qed_status = -EBUSY; + break; + } + + /* to sync the completion_word since we are not + * using the volatile keyword for p_completion_word + */ + barrier(); + } + + if (qed_status == 0) + *p_hwfn->dmae_info.p_completion_word = 0; + + return qed_status; +} + +static int qed_dmae_execute_sub_operation(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u64 src_addr, + u64 dst_addr, + u8 src_type, + u8 dst_type, + u32 length_dw) +{ + dma_addr_t phys = p_hwfn->dmae_info.intermediate_buffer_phys_addr; + struct dmae_cmd *cmd = p_hwfn->dmae_info.p_dmae_cmd; + int qed_status = 0; + + switch (src_type) { + case QED_DMAE_ADDRESS_GRC: + case QED_DMAE_ADDRESS_HOST_PHYS: + cmd->src_addr_hi = cpu_to_le32(upper_32_bits(src_addr)); + cmd->src_addr_lo = cpu_to_le32(lower_32_bits(src_addr)); + break; + /* for virtual source addresses we use the intermediate buffer. */ + case QED_DMAE_ADDRESS_HOST_VIRT: + cmd->src_addr_hi = cpu_to_le32(upper_32_bits(phys)); + cmd->src_addr_lo = cpu_to_le32(lower_32_bits(phys)); + memcpy(&p_hwfn->dmae_info.p_intermediate_buffer[0], + (void *)(uintptr_t)src_addr, + length_dw * sizeof(u32)); + break; + default: + return -EINVAL; + } + + switch (dst_type) { + case QED_DMAE_ADDRESS_GRC: + case QED_DMAE_ADDRESS_HOST_PHYS: + cmd->dst_addr_hi = cpu_to_le32(upper_32_bits(dst_addr)); + cmd->dst_addr_lo = cpu_to_le32(lower_32_bits(dst_addr)); + break; + /* for virtual source addresses we use the intermediate buffer. */ + case QED_DMAE_ADDRESS_HOST_VIRT: + cmd->dst_addr_hi = cpu_to_le32(upper_32_bits(phys)); + cmd->dst_addr_lo = cpu_to_le32(lower_32_bits(phys)); + break; + default: + return -EINVAL; + } + + cmd->length_dw = cpu_to_le16((u16)length_dw); + + qed_dmae_post_command(p_hwfn, p_ptt); + + qed_status = qed_dmae_operation_wait(p_hwfn); + + if (qed_status) { + DP_NOTICE(p_hwfn, + "qed_dmae_host2grc: Wait Failed. 
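/* Note on qed_dmae_operation_wait() above: the poll allows roughly
 * wait_cnt_limit (10000) * DMAE_MIN_WAIT_TIME (2 us) = 20 ms for the engine
 * to write DMAE_COMPLETION_VAL before giving up with -EBUSY.
 */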
source_addr 0x%llx, grc_addr 0x%llx, size_in_dwords 0x%x\n", + src_addr, dst_addr, length_dw); + return qed_status; + } + + if (dst_type == QED_DMAE_ADDRESS_HOST_VIRT) + memcpy((void *)(uintptr_t)(dst_addr), + &p_hwfn->dmae_info.p_intermediate_buffer[0], + length_dw * sizeof(u32)); + + return 0; +} + +static int qed_dmae_execute_command(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u64 src_addr, u64 dst_addr, + u8 src_type, u8 dst_type, + u32 size_in_dwords, + struct qed_dmae_params *p_params) +{ + dma_addr_t phys = p_hwfn->dmae_info.completion_word_phys_addr; + u16 length_cur = 0, i = 0, cnt_split = 0, length_mod = 0; + struct dmae_cmd *cmd = p_hwfn->dmae_info.p_dmae_cmd; + u64 src_addr_split = 0, dst_addr_split = 0; + u16 length_limit = DMAE_MAX_RW_SIZE; + int qed_status = 0; + u32 offset = 0; + + qed_dmae_opcode(p_hwfn, + (src_type == QED_DMAE_ADDRESS_GRC), + (dst_type == QED_DMAE_ADDRESS_GRC), + p_params); + + cmd->comp_addr_lo = cpu_to_le32(lower_32_bits(phys)); + cmd->comp_addr_hi = cpu_to_le32(upper_32_bits(phys)); + cmd->comp_val = cpu_to_le32(DMAE_COMPLETION_VAL); + + /* Check if the grc_addr is valid like < MAX_GRC_OFFSET */ + cnt_split = size_in_dwords / length_limit; + length_mod = size_in_dwords % length_limit; + + src_addr_split = src_addr; + dst_addr_split = dst_addr; + + for (i = 0; i <= cnt_split; i++) { + offset = length_limit * i; + + if (!(p_params->flags & QED_DMAE_FLAG_RW_REPL_SRC)) { + if (src_type == QED_DMAE_ADDRESS_GRC) + src_addr_split = src_addr + offset; + else + src_addr_split = src_addr + (offset * 4); + } + + if (dst_type == QED_DMAE_ADDRESS_GRC) + dst_addr_split = dst_addr + offset; + else + dst_addr_split = dst_addr + (offset * 4); + + length_cur = (cnt_split == i) ? length_mod : length_limit; + + /* might be zero on last iteration */ + if (!length_cur) + continue; + + qed_status = qed_dmae_execute_sub_operation(p_hwfn, + p_ptt, + src_addr_split, + dst_addr_split, + src_type, + dst_type, + length_cur); + if (qed_status) { + DP_NOTICE(p_hwfn, + "qed_dmae_execute_sub_operation Failed with error 0x%x. 
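/* Worked example for the split performed by qed_dmae_execute_command(),
 * assuming DMAE_MAX_RW_SIZE is 0x2000 dwords (the actual value comes from
 * qed_hsi.h): a 0x4800-dword transfer gives cnt_split = 2 and
 * length_mod = 0x800, i.e. three sub-operations of 0x2000, 0x2000 and 0x800
 * dwords. A transfer that is an exact multiple of the limit ends with a
 * zero-length iteration, which the "if (!length_cur) continue;" check skips.
 */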
source_addr 0x%llx, destination addr 0x%llx, size_in_dwords 0x%x\n", + qed_status, src_addr, dst_addr, length_cur); + break; + } + } + + return qed_status; +} + +int qed_dmae_host2grc(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u64 source_addr, u32 grc_addr, u32 size_in_dwords, u32 flags) +{ + u32 grc_addr_in_dw = grc_addr / sizeof(u32); + struct qed_dmae_params params; + int rc; + + memset(¶ms, 0, sizeof(struct qed_dmae_params)); + params.flags = flags; + + mutex_lock(&p_hwfn->dmae_info.mutex); + + rc = qed_dmae_execute_command(p_hwfn, p_ptt, source_addr, + grc_addr_in_dw, + QED_DMAE_ADDRESS_HOST_VIRT, + QED_DMAE_ADDRESS_GRC, + size_in_dwords, ¶ms); + + mutex_unlock(&p_hwfn->dmae_info.mutex); + + return rc; +} + +int qed_dmae_grc2host(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 grc_addr, + dma_addr_t dest_addr, u32 size_in_dwords, u32 flags) +{ + u32 grc_addr_in_dw = grc_addr / sizeof(u32); + struct qed_dmae_params params; + int rc; + + memset(¶ms, 0, sizeof(struct qed_dmae_params)); + params.flags = flags; + + mutex_lock(&p_hwfn->dmae_info.mutex); + + rc = qed_dmae_execute_command(p_hwfn, p_ptt, grc_addr_in_dw, + dest_addr, QED_DMAE_ADDRESS_GRC, + QED_DMAE_ADDRESS_HOST_VIRT, + size_in_dwords, ¶ms); + + mutex_unlock(&p_hwfn->dmae_info.mutex); + + return rc; +} + +int qed_dmae_host2host(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + dma_addr_t source_addr, + dma_addr_t dest_addr, + u32 size_in_dwords, struct qed_dmae_params *p_params) +{ + int rc; + + mutex_lock(&(p_hwfn->dmae_info.mutex)); + + rc = qed_dmae_execute_command(p_hwfn, p_ptt, source_addr, + dest_addr, + QED_DMAE_ADDRESS_HOST_PHYS, + QED_DMAE_ADDRESS_HOST_PHYS, + size_in_dwords, p_params); + + mutex_unlock(&(p_hwfn->dmae_info.mutex)); + + return rc; +} + +int qed_dmae_sanity(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, const char *phase) +{ + u32 size = PAGE_SIZE / 2, val; + struct qed_dmae_params params; + int rc = 0; + dma_addr_t p_phys; + void *p_virt; + u32 *p_tmp; + + p_virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + 2 * size, &p_phys, GFP_KERNEL); + if (!p_virt) { + DP_NOTICE(p_hwfn, + "DMAE sanity [%s]: failed to allocate memory\n", + phase); + return -ENOMEM; + } + + /* Fill the bottom half of the allocated memory with a known pattern */ + for (p_tmp = (u32 *)p_virt; + p_tmp < (u32 *)((u8 *)p_virt + size); p_tmp++) { + /* Save the address itself as the value */ + val = (u32)(uintptr_t)p_tmp; + *p_tmp = val; + } + + /* Zero the top half of the allocated memory */ + memset((u8 *)p_virt + size, 0, size); + + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "DMAE sanity [%s]: src_addr={phys 0x%llx, virt %p}, dst_addr={phys 0x%llx, virt %p}, size 0x%x\n", + phase, + (u64)p_phys, + p_virt, (u64)(p_phys + size), (u8 *)p_virt + size, size); + + memset(¶ms, 0, sizeof(params)); + rc = qed_dmae_host2host(p_hwfn, p_ptt, p_phys, p_phys + size, + size / 4 /* size_in_dwords */, ¶ms); + if (rc) { + DP_NOTICE(p_hwfn, + "DMAE sanity [%s]: qed_dmae_host2host() failed. 
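/* Illustrative sketch (not a driver function): qed_dmae_host2grc() above takes
 * a host *virtual* address (the data is staged through the intermediate
 * buffer), so callers pass a kernel pointer cast to u64. "hyp_grc_dest" is a
 * hypothetical, dword-aligned GRC destination used only for the example.
 */
static int example_push_table(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
			      u32 hyp_grc_dest)
{
	u32 table[16] = { 0 };	/* 16 dwords of payload */

	return qed_dmae_host2grc(p_hwfn, p_ptt, (u64)(uintptr_t)table,
				 hyp_grc_dest, ARRAY_SIZE(table), 0);
}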
rc = %d.\n", + phase, rc); + goto out; + } + + /* Verify that the top half of the allocated memory has the pattern */ + for (p_tmp = (u32 *)((u8 *)p_virt + size); + p_tmp < (u32 *)((u8 *)p_virt + (2 * size)); p_tmp++) { + /* The corresponding address in the bottom half */ + val = (u32)(uintptr_t)p_tmp - size; + + if (*p_tmp != val) { + DP_NOTICE(p_hwfn, + "DMAE sanity [%s]: addr={phys 0x%llx, virt %p}, read_val 0x%08x, expected_val 0x%08x\n", + phase, + (u64)p_phys + ((u8 *)p_tmp - (u8 *)p_virt), + p_tmp, *p_tmp, val); + rc = -EINVAL; + goto out; + } + } + +out: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, 2 * size, p_virt, p_phys); + return rc; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.h b/drivers/net/ethernet/qlogic/qed/qed_hw.h new file mode 100644 index 0000000..8db2839 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_hw.h @@ -0,0 +1,306 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _QED_HW_H +#define _QED_HW_H + +#include +#include +#include +#include +#include "qed.h" +#include "qed_dev_api.h" + +/* Forward decleration */ +struct qed_ptt; + +enum reserved_ptts { + RESERVED_PTT_EDIAG, + RESERVED_PTT_USER_SPACE, + RESERVED_PTT_MAIN, + RESERVED_PTT_DPC, + RESERVED_PTT_MAX +}; + +enum _dmae_cmd_dst_mask { + DMAE_CMD_DST_MASK_NONE = 0, + DMAE_CMD_DST_MASK_PCIE = 1, + DMAE_CMD_DST_MASK_GRC = 2 +}; + +enum _dmae_cmd_src_mask { + DMAE_CMD_SRC_MASK_PCIE = 0, + DMAE_CMD_SRC_MASK_GRC = 1 +}; + +enum _dmae_cmd_crc_mask { + DMAE_CMD_COMP_CRC_EN_MASK_NONE = 0, + DMAE_CMD_COMP_CRC_EN_MASK_SET = 1 +}; + +/* definitions for DMA constants */ +#define DMAE_GO_VALUE 0x1 + +#define DMAE_COMPLETION_VAL 0xD1AE +#define DMAE_CMD_ENDIANITY 0x2 + +#define DMAE_CMD_SIZE 14 +#define DMAE_CMD_SIZE_TO_FILL (DMAE_CMD_SIZE - 5) +#define DMAE_MIN_WAIT_TIME 0x2 +#define DMAE_MAX_CLIENTS 32 + +/** + * @brief qed_gtt_init - Initialize GTT windows + * + * @param p_hwfn + */ +void qed_gtt_init(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_ptt_invalidate - Forces all ptt entries to be re-configured + * + * @param p_hwfn + */ +void qed_ptt_invalidate(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_ptt_pool_alloc - Allocate and initialize PTT pool + * + * @param p_hwfn + * + * @return struct _qed_status - success (0), negative - error. + */ +int qed_ptt_pool_alloc(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_ptt_pool_free - + * + * @param p_hwfn + */ +void qed_ptt_pool_free(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_ptt_get_hw_addr - Get PTT's GRC/HW address + * + * @param p_hwfn + * @param p_ptt + * + * @return u32 + */ +u32 qed_ptt_get_hw_addr(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief qed_ptt_get_bar_addr - Get PPT's external BAR address + * + * @param p_hwfn + * @param p_ptt + * + * @return u32 + */ +u32 qed_ptt_get_bar_addr(struct qed_ptt *p_ptt); + +/** + * @brief qed_ptt_set_win - Set PTT Window's GRC BAR address + * + * @param p_hwfn + * @param new_hw_addr + * @param p_ptt + */ +void qed_ptt_set_win(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 new_hw_addr); + +/** + * @brief qed_get_reserved_ptt - Get a specific reserved PTT + * + * @param p_hwfn + * @param ptt_idx + * + * @return struct qed_ptt * + */ +struct qed_ptt *qed_get_reserved_ptt(struct qed_hwfn *p_hwfn, + enum reserved_ptts ptt_idx); + +/** + * @brief qed_wr - Write value to BAR using the given ptt + * + * @param p_hwfn + * @param p_ptt + * @param val + * @param hw_addr + */ +void qed_wr(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 hw_addr, + u32 val); + +/** + * @brief qed_rd - Read value from BAR using the given ptt + * + * @param p_hwfn + * @param p_ptt + * @param val + * @param hw_addr + */ +u32 qed_rd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 hw_addr); + +/** + * @brief qed_memcpy_from - copy n bytes from BAR using the given + * ptt + * + * @param p_hwfn + * @param p_ptt + * @param dest + * @param hw_addr + * @param n + */ +void qed_memcpy_from(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + void *dest, + u32 hw_addr, + size_t n); + +/** + * @brief qed_memcpy_to - copy n bytes to BAR using the given + * ptt + * + * @param p_hwfn + * @param p_ptt + * @param hw_addr + * @param src + * @param n + */ +void qed_memcpy_to(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 hw_addr, + void *src, + size_t n); +/** + * @brief qed_fid_pretend - pretend to another function when + * accessing the ptt window. 
There is no way to unpretend + * a function. The only way to cancel a pretend is to + * pretend back to the original function. + * + * @param p_hwfn + * @param p_ptt + * @param fid - fid field of pxp_pretend structure. Can contain + * either pf / vf, port/path fields are don't care. + */ +void qed_fid_pretend(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 fid); + +/** + * @brief qed_port_pretend - pretend to another port when + * accessing the ptt window + * + * @param p_hwfn + * @param p_ptt + * @param port_id - the port to pretend to + */ +void qed_port_pretend(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u8 port_id); + +/** + * @brief qed_port_unpretend - cancel any previously set port + * pretend + * + * @param p_hwfn + * @param p_ptt + */ +void qed_port_unpretend(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief qed_vfid_to_concrete - build a concrete FID for a + * given VF ID + * + * @param p_hwfn + * @param p_ptt + * @param vfid + */ +u32 qed_vfid_to_concrete(struct qed_hwfn *p_hwfn, u8 vfid); + +/** + * @brief qed_dmae_idx_to_go_cmd - map the idx to dmae cmd + * this is declared here since other files will require it. + * @param idx + */ +u32 qed_dmae_idx_to_go_cmd(u8 idx); + +/** + * @brief qed_dmae_info_alloc - Init the dmae_info structure + * which is part of p_hwfn. + * @param p_hwfn + */ +int qed_dmae_info_alloc(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_dmae_info_free - Free the dmae_info structure + * which is part of p_hwfn + * + * @param p_hwfn + */ +void qed_dmae_info_free(struct qed_hwfn *p_hwfn); + +union qed_qm_pq_params { + struct { + u8 q_idx; + } iscsi; + + struct { + u8 tc; + } core; + + struct { + u8 is_vf; + u8 vf_id; + u8 tc; + } eth; + + struct { + u8 dcqcn; + u8 qpid; /* roce relative */ + } roce; +}; + +int qed_init_fw_data(struct qed_dev *cdev, + const u8 *fw_data); + +int qed_dmae_sanity(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, const char *phase); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c new file mode 100644 index 0000000..1365da7 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c @@ -0,0 +1,1511 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_init_ops.h" +#include "qed_reg_addr.h" + +#define CDU_VALIDATION_DEFAULT_CFG 61 + +static u16 con_region_offsets[3][NUM_OF_CONNECTION_TYPES_E4] = { + {400, 336, 352, 304, 304, 384, 416, 352}, /* region 3 offsets */ + {528, 496, 416, 448, 448, 512, 544, 480}, /* region 4 offsets */ + {608, 544, 496, 512, 576, 592, 624, 560} /* region 5 offsets */ +}; + +static u16 task_region_offsets[1][NUM_OF_CONNECTION_TYPES_E4] = { + {240, 240, 112, 0, 0, 0, 0, 96} /* region 1 offsets */ +}; + +/* General constants */ +#define QM_PQ_MEM_4KB(pq_size) (pq_size ? DIV_ROUND_UP((pq_size + 1) * \ + QM_PQ_ELEMENT_SIZE, \ + 0x1000) : 0) +#define QM_PQ_SIZE_256B(pq_size) (pq_size ? DIV_ROUND_UP(pq_size, \ + 0x100) - 1 : 0) +#define QM_INVALID_PQ_ID 0xffff + +/* Feature enable */ +#define QM_BYPASS_EN 1 +#define QM_BYTE_CRD_EN 1 + +/* Other PQ constants */ +#define QM_OTHER_PQS_PER_PF 4 + +/* WFQ constants */ + +/* Upper bound in MB, 10 * burst size of 1ms in 50Gbps */ +#define QM_WFQ_UPPER_BOUND 62500000 + +/* Bit of VOQ in WFQ VP PQ map */ +#define QM_WFQ_VP_PQ_VOQ_SHIFT 0 + +/* Bit of PF in WFQ VP PQ map */ +#define QM_WFQ_VP_PQ_PF_E4_SHIFT 5 + +/* 0x9000 = 4*9*1024 */ +#define QM_WFQ_INC_VAL(weight) ((weight) * 0x9000) + +/* Max WFQ increment value is 0.7 * upper bound */ +#define QM_WFQ_MAX_INC_VAL ((QM_WFQ_UPPER_BOUND * 7) / 10) + +/* RL constants */ + +/* Period in us */ +#define QM_RL_PERIOD 5 + +/* Period in 25MHz cycles */ +#define QM_RL_PERIOD_CLK_25M (25 * QM_RL_PERIOD) + +/* RL increment value - rate is specified in mbps */ +#define QM_RL_INC_VAL(rate) ({ \ + typeof(rate) __rate = (rate); \ + max_t(u32, \ + (u32)(((__rate ? 
__rate : 1000000) * QM_RL_PERIOD * 101) / \ + (8 * 100)), \ + 1); }) + +/* PF RL Upper bound is set to 10 * burst size of 1ms in 50Gbps */ +#define QM_PF_RL_UPPER_BOUND 62500000 + +/* Max PF RL increment value is 0.7 * upper bound */ +#define QM_PF_RL_MAX_INC_VAL ((QM_PF_RL_UPPER_BOUND * 7) / 10) + +/* Vport RL Upper bound, link speed is in Mpbs */ +#define QM_VP_RL_UPPER_BOUND(speed) ((u32)max_t(u32, \ + QM_RL_INC_VAL(speed), \ + 9700 + 1000)) + +/* Max Vport RL increment value is the Vport RL upper bound */ +#define QM_VP_RL_MAX_INC_VAL(speed) QM_VP_RL_UPPER_BOUND(speed) + +/* Vport RL credit threshold in case of QM bypass */ +#define QM_VP_RL_BYPASS_THRESH_SPEED (QM_VP_RL_UPPER_BOUND(10000) - 1) + +/* AFullOprtnstcCrdMask constants */ +#define QM_OPPOR_LINE_VOQ_DEF 1 +#define QM_OPPOR_FW_STOP_DEF 0 +#define QM_OPPOR_PQ_EMPTY_DEF 1 + +/* Command Queue constants */ + +/* Pure LB CmdQ lines (+spare) */ +#define PBF_CMDQ_PURE_LB_LINES 150 + +#define PBF_CMDQ_LINES_E5_RSVD_RATIO 8 + +#define PBF_CMDQ_LINES_RT_OFFSET(ext_voq) \ + (PBF_REG_YCMD_QS_NUM_LINES_VOQ0_RT_OFFSET + \ + (ext_voq) * (PBF_REG_YCMD_QS_NUM_LINES_VOQ1_RT_OFFSET - \ + PBF_REG_YCMD_QS_NUM_LINES_VOQ0_RT_OFFSET)) + +#define PBF_BTB_GUARANTEED_RT_OFFSET(ext_voq) \ + (PBF_REG_BTB_GUARANTEED_VOQ0_RT_OFFSET + \ + (ext_voq) * (PBF_REG_BTB_GUARANTEED_VOQ1_RT_OFFSET - \ + PBF_REG_BTB_GUARANTEED_VOQ0_RT_OFFSET)) + +#define QM_VOQ_LINE_CRD(pbf_cmd_lines) \ + ((((pbf_cmd_lines) - 4) * 2) | QM_LINE_CRD_REG_SIGN_BIT) + +/* BTB: blocks constants (block size = 256B) */ + +/* 256B blocks in 9700B packet */ +#define BTB_JUMBO_PKT_BLOCKS 38 + +/* Headroom per-port */ +#define BTB_HEADROOM_BLOCKS BTB_JUMBO_PKT_BLOCKS +#define BTB_PURE_LB_FACTOR 10 + +/* Factored (hence really 0.7) */ +#define BTB_PURE_LB_RATIO 7 + +/* QM stop command constants */ +#define QM_STOP_PQ_MASK_WIDTH 32 +#define QM_STOP_CMD_ADDR 2 +#define QM_STOP_CMD_STRUCT_SIZE 2 +#define QM_STOP_CMD_PAUSE_MASK_OFFSET 0 +#define QM_STOP_CMD_PAUSE_MASK_SHIFT 0 +#define QM_STOP_CMD_PAUSE_MASK_MASK -1 +#define QM_STOP_CMD_GROUP_ID_OFFSET 1 +#define QM_STOP_CMD_GROUP_ID_SHIFT 16 +#define QM_STOP_CMD_GROUP_ID_MASK 15 +#define QM_STOP_CMD_PQ_TYPE_OFFSET 1 +#define QM_STOP_CMD_PQ_TYPE_SHIFT 24 +#define QM_STOP_CMD_PQ_TYPE_MASK 1 +#define QM_STOP_CMD_MAX_POLL_COUNT 100 +#define QM_STOP_CMD_POLL_PERIOD_US 500 + +/* QM command macros */ +#define QM_CMD_STRUCT_SIZE(cmd) cmd ## _STRUCT_SIZE +#define QM_CMD_SET_FIELD(var, cmd, field, value) \ + SET_FIELD(var[cmd ## _ ## field ## _OFFSET], \ + cmd ## _ ## field, \ + value) + +#define QM_INIT_TX_PQ_MAP(p_hwfn, map, chip, pq_id, rl_valid, vp_pq_id, rl_id, \ + ext_voq, wrr) \ + do { \ + typeof(map) __map; \ + memset(&__map, 0, sizeof(__map)); \ + SET_FIELD(__map.reg, QM_RF_PQ_MAP_ ## chip ## _PQ_VALID, 1); \ + SET_FIELD(__map.reg, QM_RF_PQ_MAP_ ## chip ## _RL_VALID, \ + rl_valid); \ + SET_FIELD(__map.reg, QM_RF_PQ_MAP_ ## chip ## _VP_PQ_ID, \ + vp_pq_id); \ + SET_FIELD(__map.reg, QM_RF_PQ_MAP_ ## chip ## _RL_ID, rl_id); \ + SET_FIELD(__map.reg, QM_RF_PQ_MAP_ ## chip ## _VOQ, ext_voq); \ + SET_FIELD(__map.reg, \ + QM_RF_PQ_MAP_ ## chip ## _WRR_WEIGHT_GROUP, wrr); \ + STORE_RT_REG(p_hwfn, QM_REG_TXPQMAP_RT_OFFSET + (pq_id), \ + *((u32 *)&__map)); \ + (map) = __map; \ + } while (0) + +#define WRITE_PQ_INFO_TO_RAM 1 +#define PQ_INFO_ELEMENT(vp, pf, tc, port, rl_valid, rl) \ + (((vp) << 0) | ((pf) << 12) | ((tc) << 16) | ((port) << 20) | \ + ((rl_valid) << 22) | ((rl) << 24)) +#define PQ_INFO_RAM_GRC_ADDRESS(pq_id) \ + (XSEM_REG_FAST_MEMORY + 
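/* Worked numbers for QM_RL_INC_VAL() above: the rate is in Mbps, so with
 * QM_RL_PERIOD = 5 us the increment is roughly the byte count per period plus
 * a 1% margin, e.g. QM_RL_INC_VAL(25000) = (25000 * 5 * 101) / 800 = 15781.
 * A rate of 0 is treated as 1000000 Mbps and the result is clamped to at
 * least 1.
 */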
SEM_FAST_REG_INT_RAM + 21776 + (pq_id) * 4) + +/******************** INTERNAL IMPLEMENTATION *********************/ + +/* Returns the external VOQ number */ +static u8 qed_get_ext_voq(struct qed_hwfn *p_hwfn, + u8 port_id, u8 tc, u8 max_phys_tcs_per_port) +{ + if (tc == PURE_LB_TC) + return NUM_OF_PHYS_TCS * MAX_NUM_PORTS_BB + port_id; + else + return port_id * max_phys_tcs_per_port + tc; +} + +/* Prepare PF RL enable/disable runtime init values */ +static void qed_enable_pf_rl(struct qed_hwfn *p_hwfn, bool pf_rl_en) +{ + STORE_RT_REG(p_hwfn, QM_REG_RLPFENABLE_RT_OFFSET, pf_rl_en ? 1 : 0); + if (pf_rl_en) { + u8 num_ext_voqs = MAX_NUM_VOQS_E4; + u64 voq_bit_mask = ((u64)1 << num_ext_voqs) - 1; + + /* Enable RLs for all VOQs */ + STORE_RT_REG(p_hwfn, + QM_REG_RLPFVOQENABLE_RT_OFFSET, + (u32)voq_bit_mask); + if (num_ext_voqs >= 32) + STORE_RT_REG(p_hwfn, QM_REG_RLPFVOQENABLE_MSB_RT_OFFSET, + (u32)(voq_bit_mask >> 32)); + + /* Write RL period */ + STORE_RT_REG(p_hwfn, + QM_REG_RLPFPERIOD_RT_OFFSET, QM_RL_PERIOD_CLK_25M); + STORE_RT_REG(p_hwfn, + QM_REG_RLPFPERIODTIMER_RT_OFFSET, + QM_RL_PERIOD_CLK_25M); + + /* Set credit threshold for QM bypass flow */ + if (QM_BYPASS_EN) + STORE_RT_REG(p_hwfn, + QM_REG_AFULLQMBYPTHRPFRL_RT_OFFSET, + QM_PF_RL_UPPER_BOUND); + } +} + +/* Prepare PF WFQ enable/disable runtime init values */ +static void qed_enable_pf_wfq(struct qed_hwfn *p_hwfn, bool pf_wfq_en) +{ + STORE_RT_REG(p_hwfn, QM_REG_WFQPFENABLE_RT_OFFSET, pf_wfq_en ? 1 : 0); + + /* Set credit threshold for QM bypass flow */ + if (pf_wfq_en && QM_BYPASS_EN) + STORE_RT_REG(p_hwfn, + QM_REG_AFULLQMBYPTHRPFWFQ_RT_OFFSET, + QM_WFQ_UPPER_BOUND); +} + +/* Prepare VPORT RL enable/disable runtime init values */ +static void qed_enable_vport_rl(struct qed_hwfn *p_hwfn, bool vport_rl_en) +{ + STORE_RT_REG(p_hwfn, QM_REG_RLGLBLENABLE_RT_OFFSET, + vport_rl_en ? 1 : 0); + if (vport_rl_en) { + /* Write RL period (use timer 0 only) */ + STORE_RT_REG(p_hwfn, + QM_REG_RLGLBLPERIOD_0_RT_OFFSET, + QM_RL_PERIOD_CLK_25M); + STORE_RT_REG(p_hwfn, + QM_REG_RLGLBLPERIODTIMER_0_RT_OFFSET, + QM_RL_PERIOD_CLK_25M); + + /* Set credit threshold for QM bypass flow */ + if (QM_BYPASS_EN) + STORE_RT_REG(p_hwfn, + QM_REG_AFULLQMBYPTHRGLBLRL_RT_OFFSET, + QM_VP_RL_BYPASS_THRESH_SPEED); + } +} + +/* Prepare VPORT WFQ enable/disable runtime init values */ +static void qed_enable_vport_wfq(struct qed_hwfn *p_hwfn, bool vport_wfq_en) +{ + STORE_RT_REG(p_hwfn, QM_REG_WFQVPENABLE_RT_OFFSET, + vport_wfq_en ? 1 : 0); + + /* Set credit threshold for QM bypass flow */ + if (vport_wfq_en && QM_BYPASS_EN) + STORE_RT_REG(p_hwfn, + QM_REG_AFULLQMBYPTHRVPWFQ_RT_OFFSET, + QM_WFQ_UPPER_BOUND); +} + +/* Prepare runtime init values to allocate PBF command queue lines for + * the specified VOQ. + */ +static void qed_cmdq_lines_voq_rt_init(struct qed_hwfn *p_hwfn, + u8 ext_voq, u16 cmdq_lines) +{ + u32 qm_line_crd = QM_VOQ_LINE_CRD(cmdq_lines); + + OVERWRITE_RT_REG(p_hwfn, PBF_CMDQ_LINES_RT_OFFSET(ext_voq), + (u32)cmdq_lines); + STORE_RT_REG(p_hwfn, QM_REG_VOQCRDLINE_RT_OFFSET + ext_voq, + qm_line_crd); + STORE_RT_REG(p_hwfn, QM_REG_VOQINITCRDLINE_RT_OFFSET + ext_voq, + qm_line_crd); +} + +/* Prepare runtime init values to allocate PBF command queue lines. 
*/ +static void qed_cmdq_lines_rt_init( + struct qed_hwfn *p_hwfn, + u8 max_ports_per_engine, + u8 max_phys_tcs_per_port, + struct init_qm_port_params port_params[MAX_NUM_PORTS]) +{ + u8 tc, ext_voq, port_id, num_tcs_in_port; + u8 num_ext_voqs = MAX_NUM_VOQS_E4; + + /* Clear PBF lines of all VOQs */ + for (ext_voq = 0; ext_voq < num_ext_voqs; ext_voq++) + STORE_RT_REG(p_hwfn, PBF_CMDQ_LINES_RT_OFFSET(ext_voq), 0); + + for (port_id = 0; port_id < max_ports_per_engine; port_id++) { + u16 phys_lines, phys_lines_per_tc; + + if (!port_params[port_id].active) + continue; + + /* Find number of command queue lines to divide between the + * active physical TCs. In E5, 1/8 of the lines are reserved. + * the lines for pure LB TC are subtracted. + */ + phys_lines = port_params[port_id].num_pbf_cmd_lines; + phys_lines -= PBF_CMDQ_PURE_LB_LINES; + + /* Find #lines per active physical TC */ + num_tcs_in_port = 0; + for (tc = 0; tc < max_phys_tcs_per_port; tc++) + if (((port_params[port_id].active_phys_tcs >> + tc) & 0x1) == 1) + num_tcs_in_port++; + phys_lines_per_tc = phys_lines / num_tcs_in_port; + + /* Init registers per active TC */ + for (tc = 0; tc < max_phys_tcs_per_port; tc++) { + ext_voq = qed_get_ext_voq(p_hwfn, + port_id, + tc, max_phys_tcs_per_port); + if (((port_params[port_id].active_phys_tcs >> + tc) & 0x1) == 1) + qed_cmdq_lines_voq_rt_init(p_hwfn, + ext_voq, + phys_lines_per_tc); + } + + /* Init registers for pure LB TC */ + ext_voq = qed_get_ext_voq(p_hwfn, + port_id, + PURE_LB_TC, max_phys_tcs_per_port); + qed_cmdq_lines_voq_rt_init(p_hwfn, + ext_voq, PBF_CMDQ_PURE_LB_LINES); + } +} + +static void qed_btb_blocks_rt_init( + struct qed_hwfn *p_hwfn, + u8 max_ports_per_engine, + u8 max_phys_tcs_per_port, + struct init_qm_port_params port_params[MAX_NUM_PORTS]) +{ + u32 usable_blocks, pure_lb_blocks, phys_blocks; + u8 tc, ext_voq, port_id, num_tcs_in_port; + + for (port_id = 0; port_id < max_ports_per_engine; port_id++) { + if (!port_params[port_id].active) + continue; + + /* Subtract headroom blocks */ + usable_blocks = port_params[port_id].num_btb_blocks - + BTB_HEADROOM_BLOCKS; + + /* Find blocks per physical TC. Use factor to avoid floating + * arithmethic. 
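/* Worked example for qed_cmdq_lines_rt_init() above, using a hypothetical
 * port with 3400 PBF command lines and 4 active physical TCs: the pure-LB VOQ
 * keeps PBF_CMDQ_PURE_LB_LINES = 150 lines and each physical TC gets
 * (3400 - 150) / 4 = 812 lines.
 */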
+ */ + num_tcs_in_port = 0; + for (tc = 0; tc < NUM_OF_PHYS_TCS; tc++) + if (((port_params[port_id].active_phys_tcs >> + tc) & 0x1) == 1) + num_tcs_in_port++; + + pure_lb_blocks = (usable_blocks * BTB_PURE_LB_FACTOR) / + (num_tcs_in_port * BTB_PURE_LB_FACTOR + + BTB_PURE_LB_RATIO); + pure_lb_blocks = max_t(u32, BTB_JUMBO_PKT_BLOCKS, + pure_lb_blocks / BTB_PURE_LB_FACTOR); + phys_blocks = (usable_blocks - pure_lb_blocks) / + num_tcs_in_port; + + /* Init physical TCs */ + for (tc = 0; tc < NUM_OF_PHYS_TCS; tc++) { + if (((port_params[port_id].active_phys_tcs >> + tc) & 0x1) == 1) { + ext_voq = + qed_get_ext_voq(p_hwfn, + port_id, + tc, + max_phys_tcs_per_port); + STORE_RT_REG(p_hwfn, + PBF_BTB_GUARANTEED_RT_OFFSET + (ext_voq), phys_blocks); + } + } + + /* Init pure LB TC */ + ext_voq = qed_get_ext_voq(p_hwfn, + port_id, + PURE_LB_TC, max_phys_tcs_per_port); + STORE_RT_REG(p_hwfn, PBF_BTB_GUARANTEED_RT_OFFSET(ext_voq), + pure_lb_blocks); + } +} + +/* Prepare Tx PQ mapping runtime init values for the specified PF */ +static void qed_tx_pq_map_rt_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_qm_pf_rt_init_params *p_params, + u32 base_mem_addr_4kb) +{ + u32 tx_pq_vf_mask[MAX_QM_TX_QUEUES / QM_PF_QUEUE_GROUP_SIZE] = { 0 }; + struct init_qm_vport_params *vport_params = p_params->vport_params; + u32 num_tx_pq_vf_masks = MAX_QM_TX_QUEUES / QM_PF_QUEUE_GROUP_SIZE; + u16 num_pqs, first_pq_group, last_pq_group, i, j, pq_id, pq_group; + struct init_qm_pq_params *pq_params = p_params->pq_params; + u32 pq_mem_4kb, vport_pq_mem_4kb, mem_addr_4kb; + + num_pqs = p_params->num_pf_pqs + p_params->num_vf_pqs; + + first_pq_group = p_params->start_pq / QM_PF_QUEUE_GROUP_SIZE; + last_pq_group = (p_params->start_pq + num_pqs - 1) / + QM_PF_QUEUE_GROUP_SIZE; + + pq_mem_4kb = QM_PQ_MEM_4KB(p_params->num_pf_cids); + vport_pq_mem_4kb = QM_PQ_MEM_4KB(p_params->num_vf_cids); + mem_addr_4kb = base_mem_addr_4kb; + + /* Set mapping from PQ group to PF */ + for (pq_group = first_pq_group; pq_group <= last_pq_group; pq_group++) + STORE_RT_REG(p_hwfn, QM_REG_PQTX2PF_0_RT_OFFSET + pq_group, + (u32)(p_params->pf_id)); + + /* Set PQ sizes */ + STORE_RT_REG(p_hwfn, QM_REG_MAXPQSIZE_0_RT_OFFSET, + QM_PQ_SIZE_256B(p_params->num_pf_cids)); + STORE_RT_REG(p_hwfn, QM_REG_MAXPQSIZE_1_RT_OFFSET, + QM_PQ_SIZE_256B(p_params->num_vf_cids)); + + /* Go over all Tx PQs */ + for (i = 0, pq_id = p_params->start_pq; i < num_pqs; i++, pq_id++) { + u8 ext_voq, vport_id_in_pf, tc_id = pq_params[i].tc_id; + u32 max_qm_global_rls = MAX_QM_GLOBAL_RLS; + struct qm_rf_pq_map_e4 tx_pq_map; + bool is_vf_pq, rl_valid; + u16 *p_first_tx_pq_id; + + ext_voq = qed_get_ext_voq(p_hwfn, + pq_params[i].port_id, + tc_id, + p_params->max_phys_tcs_per_port); + is_vf_pq = (i >= p_params->num_pf_pqs); + rl_valid = pq_params[i].rl_valid > 0; + + /* Update first Tx PQ of VPORT/TC */ + vport_id_in_pf = pq_params[i].vport_id - p_params->start_vport; + p_first_tx_pq_id = + &vport_params[vport_id_in_pf].first_tx_pq_id[tc_id]; + if (*p_first_tx_pq_id == QM_INVALID_PQ_ID) { + u32 map_val = + (ext_voq << QM_WFQ_VP_PQ_VOQ_SHIFT) | + (p_params->pf_id << QM_WFQ_VP_PQ_PF_E4_SHIFT); + + /* Create new VP PQ */ + *p_first_tx_pq_id = pq_id; + + /* Map VP PQ to VOQ and PF */ + STORE_RT_REG(p_hwfn, + QM_REG_WFQVPMAP_RT_OFFSET + + *p_first_tx_pq_id, + map_val); + } + + /* Check RL ID */ + if (rl_valid && pq_params[i].vport_id >= max_qm_global_rls) { + DP_NOTICE(p_hwfn, + "Invalid VPORT ID for rate limiter configuration\n"); + rl_valid = false; + } + + /* Prepare PQ 
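/* Worked example for qed_btb_blocks_rt_init() above, with a hypothetical 1000
 * usable blocks (after the 38-block headroom) and 4 active TCs:
 * (1000 * 10) / (4 * 10 + 7) = 212, 212 / 10 = 21, clamped up to
 * BTB_JUMBO_PKT_BLOCKS = 38 pure-LB blocks, leaving (1000 - 38) / 4 = 240
 * guaranteed blocks per physical TC.
 */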
map entry */ + QM_INIT_TX_PQ_MAP(p_hwfn, + tx_pq_map, + E4, + pq_id, + rl_valid ? 1 : 0, + *p_first_tx_pq_id, + rl_valid ? pq_params[i].vport_id : 0, + ext_voq, pq_params[i].wrr_group); + + /* Set PQ base address */ + STORE_RT_REG(p_hwfn, + QM_REG_BASEADDRTXPQ_RT_OFFSET + pq_id, + mem_addr_4kb); + + /* Clear PQ pointer table entry (64 bit) */ + if (p_params->is_pf_loading) + for (j = 0; j < 2; j++) + STORE_RT_REG(p_hwfn, + QM_REG_PTRTBLTX_RT_OFFSET + + (pq_id * 2) + j, 0); + + /* Write PQ info to RAM */ + if (WRITE_PQ_INFO_TO_RAM != 0) { + u32 pq_info = 0; + + pq_info = PQ_INFO_ELEMENT(*p_first_tx_pq_id, + p_params->pf_id, + tc_id, + pq_params[i].port_id, + rl_valid ? 1 : 0, + rl_valid ? + pq_params[i].vport_id : 0); + qed_wr(p_hwfn, p_ptt, PQ_INFO_RAM_GRC_ADDRESS(pq_id), + pq_info); + } + + /* If VF PQ, add indication to PQ VF mask */ + if (is_vf_pq) { + tx_pq_vf_mask[pq_id / + QM_PF_QUEUE_GROUP_SIZE] |= + BIT((pq_id % QM_PF_QUEUE_GROUP_SIZE)); + mem_addr_4kb += vport_pq_mem_4kb; + } else { + mem_addr_4kb += pq_mem_4kb; + } + } + + /* Store Tx PQ VF mask to size select register */ + for (i = 0; i < num_tx_pq_vf_masks; i++) + if (tx_pq_vf_mask[i]) + STORE_RT_REG(p_hwfn, + QM_REG_MAXPQSIZETXSEL_0_RT_OFFSET + i, + tx_pq_vf_mask[i]); +} + +/* Prepare Other PQ mapping runtime init values for the specified PF */ +static void qed_other_pq_map_rt_init(struct qed_hwfn *p_hwfn, + u8 pf_id, + bool is_pf_loading, + u32 num_pf_cids, + u32 num_tids, u32 base_mem_addr_4kb) +{ + u32 pq_size, pq_mem_4kb, mem_addr_4kb; + u16 i, j, pq_id, pq_group; + + /* A single other PQ group is used in each PF, where PQ group i is used + * in PF i. + */ + pq_group = pf_id; + pq_size = num_pf_cids + num_tids; + pq_mem_4kb = QM_PQ_MEM_4KB(pq_size); + mem_addr_4kb = base_mem_addr_4kb; + + /* Map PQ group to PF */ + STORE_RT_REG(p_hwfn, QM_REG_PQOTHER2PF_0_RT_OFFSET + pq_group, + (u32)(pf_id)); + + /* Set PQ sizes */ + STORE_RT_REG(p_hwfn, QM_REG_MAXPQSIZE_2_RT_OFFSET, + QM_PQ_SIZE_256B(pq_size)); + + for (i = 0, pq_id = pf_id * QM_PF_QUEUE_GROUP_SIZE; + i < QM_OTHER_PQS_PER_PF; i++, pq_id++) { + /* Set PQ base address */ + STORE_RT_REG(p_hwfn, + QM_REG_BASEADDROTHERPQ_RT_OFFSET + pq_id, + mem_addr_4kb); + + /* Clear PQ pointer table entry */ + if (is_pf_loading) + for (j = 0; j < 2; j++) + STORE_RT_REG(p_hwfn, + QM_REG_PTRTBLOTHER_RT_OFFSET + + (pq_id * 2) + j, 0); + + mem_addr_4kb += pq_mem_4kb; + } +} + +/* Prepare PF WFQ runtime init values for the specified PF. + * Return -1 on error. + */ +static int qed_pf_wfq_rt_init(struct qed_hwfn *p_hwfn, + + struct qed_qm_pf_rt_init_params *p_params) +{ + u16 num_tx_pqs = p_params->num_pf_pqs + p_params->num_vf_pqs; + struct init_qm_pq_params *pq_params = p_params->pq_params; + u32 inc_val, crd_reg_offset; + u8 ext_voq; + u16 i; + + inc_val = QM_WFQ_INC_VAL(p_params->pf_wfq); + if (!inc_val || inc_val > QM_WFQ_MAX_INC_VAL) { + DP_NOTICE(p_hwfn, "Invalid PF WFQ weight configuration\n"); + return -1; + } + + for (i = 0; i < num_tx_pqs; i++) { + ext_voq = qed_get_ext_voq(p_hwfn, + pq_params[i].port_id, + pq_params[i].tc_id, + p_params->max_phys_tcs_per_port); + crd_reg_offset = + (p_params->pf_id < MAX_NUM_PFS_BB ? 
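/* Worked bound for the PF WFQ check above: QM_WFQ_INC_VAL(weight) is
 * weight * 0x9000 (36864) and QM_WFQ_MAX_INC_VAL is 0.7 * 62500000 =
 * 43750000, so qed_pf_wfq_rt_init() accepts weights in the range 1..1186.
 */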
+ QM_REG_WFQPFCRD_RT_OFFSET : + QM_REG_WFQPFCRD_MSB_RT_OFFSET) + + ext_voq * MAX_NUM_PFS_BB + + (p_params->pf_id % MAX_NUM_PFS_BB); + OVERWRITE_RT_REG(p_hwfn, + crd_reg_offset, (u32)QM_WFQ_CRD_REG_SIGN_BIT); + } + + STORE_RT_REG(p_hwfn, + QM_REG_WFQPFUPPERBOUND_RT_OFFSET + p_params->pf_id, + QM_WFQ_UPPER_BOUND | (u32)QM_WFQ_CRD_REG_SIGN_BIT); + STORE_RT_REG(p_hwfn, QM_REG_WFQPFWEIGHT_RT_OFFSET + p_params->pf_id, + inc_val); + + return 0; +} + +/* Prepare PF RL runtime init values for the specified PF. + * Return -1 on error. + */ +static int qed_pf_rl_rt_init(struct qed_hwfn *p_hwfn, u8 pf_id, u32 pf_rl) +{ + u32 inc_val = QM_RL_INC_VAL(pf_rl); + + if (inc_val > QM_PF_RL_MAX_INC_VAL) { + DP_NOTICE(p_hwfn, "Invalid PF rate limit configuration\n"); + return -1; + } + + STORE_RT_REG(p_hwfn, + QM_REG_RLPFCRD_RT_OFFSET + pf_id, + (u32)QM_RL_CRD_REG_SIGN_BIT); + STORE_RT_REG(p_hwfn, + QM_REG_RLPFUPPERBOUND_RT_OFFSET + pf_id, + QM_PF_RL_UPPER_BOUND | (u32)QM_RL_CRD_REG_SIGN_BIT); + STORE_RT_REG(p_hwfn, QM_REG_RLPFINCVAL_RT_OFFSET + pf_id, inc_val); + + return 0; +} + +/* Prepare VPORT WFQ runtime init values for the specified VPORTs. + * Return -1 on error. + */ +static int qed_vp_wfq_rt_init(struct qed_hwfn *p_hwfn, + u8 num_vports, + struct init_qm_vport_params *vport_params) +{ + u16 vport_pq_id; + u32 inc_val; + u8 tc, i; + + /* Go over all PF VPORTs */ + for (i = 0; i < num_vports; i++) { + if (!vport_params[i].vport_wfq) + continue; + + inc_val = QM_WFQ_INC_VAL(vport_params[i].vport_wfq); + if (inc_val > QM_WFQ_MAX_INC_VAL) { + DP_NOTICE(p_hwfn, + "Invalid VPORT WFQ weight configuration\n"); + return -1; + } + + /* Each VPORT can have several VPORT PQ IDs for various TCs */ + for (tc = 0; tc < NUM_OF_TCS; tc++) { + vport_pq_id = vport_params[i].first_tx_pq_id[tc]; + if (vport_pq_id != QM_INVALID_PQ_ID) { + STORE_RT_REG(p_hwfn, + QM_REG_WFQVPCRD_RT_OFFSET + + vport_pq_id, + (u32)QM_WFQ_CRD_REG_SIGN_BIT); + STORE_RT_REG(p_hwfn, + QM_REG_WFQVPWEIGHT_RT_OFFSET + + vport_pq_id, inc_val); + } + } + } + + return 0; +} + +/* Prepare VPORT RL runtime init values for the specified VPORTs. + * Return -1 on error. + */ +static int qed_vport_rl_rt_init(struct qed_hwfn *p_hwfn, + u8 start_vport, + u8 num_vports, + u32 link_speed, + struct init_qm_vport_params *vport_params) +{ + u8 i, vport_id; + u32 inc_val; + + if (start_vport + num_vports >= MAX_QM_GLOBAL_RLS) { + DP_NOTICE(p_hwfn, + "Invalid VPORT ID for rate limiter configuration\n"); + return -1; + } + + /* Go over all PF VPORTs */ + for (i = 0, vport_id = start_vport; i < num_vports; i++, vport_id++) { + inc_val = QM_RL_INC_VAL(vport_params[i].vport_rl ? 
+ vport_params[i].vport_rl : + link_speed); + if (inc_val > QM_VP_RL_MAX_INC_VAL(link_speed)) { + DP_NOTICE(p_hwfn, + "Invalid VPORT rate-limit configuration\n"); + return -1; + } + + STORE_RT_REG(p_hwfn, QM_REG_RLGLBLCRD_RT_OFFSET + vport_id, + (u32)QM_RL_CRD_REG_SIGN_BIT); + STORE_RT_REG(p_hwfn, + QM_REG_RLGLBLUPPERBOUND_RT_OFFSET + vport_id, + QM_VP_RL_UPPER_BOUND(link_speed) | + (u32)QM_RL_CRD_REG_SIGN_BIT); + STORE_RT_REG(p_hwfn, QM_REG_RLGLBLINCVAL_RT_OFFSET + vport_id, + inc_val); + } + + return 0; +} + +static bool qed_poll_on_qm_cmd_ready(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 reg_val, i; + + for (i = 0, reg_val = 0; i < QM_STOP_CMD_MAX_POLL_COUNT && !reg_val; + i++) { + udelay(QM_STOP_CMD_POLL_PERIOD_US); + reg_val = qed_rd(p_hwfn, p_ptt, QM_REG_SDMCMDREADY); + } + + /* Check if timeout while waiting for SDM command ready */ + if (i == QM_STOP_CMD_MAX_POLL_COUNT) { + DP_VERBOSE(p_hwfn, NETIF_MSG_HW, + "Timeout when waiting for QM SDM command ready signal\n"); + return false; + } + + return true; +} + +static bool qed_send_qm_cmd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 cmd_addr, u32 cmd_data_lsb, u32 cmd_data_msb) +{ + if (!qed_poll_on_qm_cmd_ready(p_hwfn, p_ptt)) + return false; + + qed_wr(p_hwfn, p_ptt, QM_REG_SDMCMDADDR, cmd_addr); + qed_wr(p_hwfn, p_ptt, QM_REG_SDMCMDDATALSB, cmd_data_lsb); + qed_wr(p_hwfn, p_ptt, QM_REG_SDMCMDDATAMSB, cmd_data_msb); + qed_wr(p_hwfn, p_ptt, QM_REG_SDMCMDGO, 1); + qed_wr(p_hwfn, p_ptt, QM_REG_SDMCMDGO, 0); + + return qed_poll_on_qm_cmd_ready(p_hwfn, p_ptt); +} + +/******************** INTERFACE IMPLEMENTATION *********************/ + +u32 qed_qm_pf_mem_size(u32 num_pf_cids, + u32 num_vf_cids, + u32 num_tids, u16 num_pf_pqs, u16 num_vf_pqs) +{ + return QM_PQ_MEM_4KB(num_pf_cids) * num_pf_pqs + + QM_PQ_MEM_4KB(num_vf_cids) * num_vf_pqs + + QM_PQ_MEM_4KB(num_pf_cids + num_tids) * QM_OTHER_PQS_PER_PF; +} + +int qed_qm_common_rt_init(struct qed_hwfn *p_hwfn, + struct qed_qm_common_rt_init_params *p_params) +{ + /* Init AFullOprtnstcCrdMask */ + u32 mask = (QM_OPPOR_LINE_VOQ_DEF << + QM_RF_OPPORTUNISTIC_MASK_LINEVOQ_SHIFT) | + (QM_BYTE_CRD_EN << QM_RF_OPPORTUNISTIC_MASK_BYTEVOQ_SHIFT) | + (p_params->pf_wfq_en << + QM_RF_OPPORTUNISTIC_MASK_PFWFQ_SHIFT) | + (p_params->vport_wfq_en << + QM_RF_OPPORTUNISTIC_MASK_VPWFQ_SHIFT) | + (p_params->pf_rl_en << + QM_RF_OPPORTUNISTIC_MASK_PFRL_SHIFT) | + (p_params->vport_rl_en << + QM_RF_OPPORTUNISTIC_MASK_VPQCNRL_SHIFT) | + (QM_OPPOR_FW_STOP_DEF << + QM_RF_OPPORTUNISTIC_MASK_FWPAUSE_SHIFT) | + (QM_OPPOR_PQ_EMPTY_DEF << + QM_RF_OPPORTUNISTIC_MASK_QUEUEEMPTY_SHIFT); + + STORE_RT_REG(p_hwfn, QM_REG_AFULLOPRTNSTCCRDMASK_RT_OFFSET, mask); + + /* Enable/disable PF RL */ + qed_enable_pf_rl(p_hwfn, p_params->pf_rl_en); + + /* Enable/disable PF WFQ */ + qed_enable_pf_wfq(p_hwfn, p_params->pf_wfq_en); + + /* Enable/disable VPORT RL */ + qed_enable_vport_rl(p_hwfn, p_params->vport_rl_en); + + /* Enable/disable VPORT WFQ */ + qed_enable_vport_wfq(p_hwfn, p_params->vport_wfq_en); + + /* Init PBF CMDQ line credit */ + qed_cmdq_lines_rt_init(p_hwfn, + p_params->max_ports_per_engine, + p_params->max_phys_tcs_per_port, + p_params->port_params); + + /* Init BTB blocks in PBF */ + qed_btb_blocks_rt_init(p_hwfn, + p_params->max_ports_per_engine, + p_params->max_phys_tcs_per_port, + p_params->port_params); + + return 0; +} + +int qed_qm_pf_rt_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_qm_pf_rt_init_params *p_params) +{ + struct init_qm_vport_params *vport_params = 
p_params->vport_params; + u32 other_mem_size_4kb = QM_PQ_MEM_4KB(p_params->num_pf_cids + + p_params->num_tids) * + QM_OTHER_PQS_PER_PF; + u8 tc, i; + + /* Clear first Tx PQ ID array for each VPORT */ + for (i = 0; i < p_params->num_vports; i++) + for (tc = 0; tc < NUM_OF_TCS; tc++) + vport_params[i].first_tx_pq_id[tc] = QM_INVALID_PQ_ID; + + /* Map Other PQs (if any) */ + qed_other_pq_map_rt_init(p_hwfn, + p_params->pf_id, + p_params->is_pf_loading, p_params->num_pf_cids, + p_params->num_tids, 0); + + /* Map Tx PQs */ + qed_tx_pq_map_rt_init(p_hwfn, p_ptt, p_params, other_mem_size_4kb); + + /* Init PF WFQ */ + if (p_params->pf_wfq) + if (qed_pf_wfq_rt_init(p_hwfn, p_params)) + return -1; + + /* Init PF RL */ + if (qed_pf_rl_rt_init(p_hwfn, p_params->pf_id, p_params->pf_rl)) + return -1; + + /* Set VPORT WFQ */ + if (qed_vp_wfq_rt_init(p_hwfn, p_params->num_vports, vport_params)) + return -1; + + /* Set VPORT RL */ + if (qed_vport_rl_rt_init(p_hwfn, p_params->start_vport, + p_params->num_vports, p_params->link_speed, + vport_params)) + return -1; + + return 0; +} + +int qed_init_pf_wfq(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 pf_id, u16 pf_wfq) +{ + u32 inc_val = QM_WFQ_INC_VAL(pf_wfq); + + if (!inc_val || inc_val > QM_WFQ_MAX_INC_VAL) { + DP_NOTICE(p_hwfn, "Invalid PF WFQ weight configuration\n"); + return -1; + } + + qed_wr(p_hwfn, p_ptt, QM_REG_WFQPFWEIGHT + pf_id * 4, inc_val); + + return 0; +} + +int qed_init_pf_rl(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 pf_id, u32 pf_rl) +{ + u32 inc_val = QM_RL_INC_VAL(pf_rl); + + if (inc_val > QM_PF_RL_MAX_INC_VAL) { + DP_NOTICE(p_hwfn, "Invalid PF rate limit configuration\n"); + return -1; + } + + qed_wr(p_hwfn, + p_ptt, QM_REG_RLPFCRD + pf_id * 4, (u32)QM_RL_CRD_REG_SIGN_BIT); + qed_wr(p_hwfn, p_ptt, QM_REG_RLPFINCVAL + pf_id * 4, inc_val); + + return 0; +} + +int qed_init_vport_wfq(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 first_tx_pq_id[NUM_OF_TCS], u16 vport_wfq) +{ + u16 vport_pq_id; + u32 inc_val; + u8 tc; + + inc_val = QM_WFQ_INC_VAL(vport_wfq); + if (!inc_val || inc_val > QM_WFQ_MAX_INC_VAL) { + DP_NOTICE(p_hwfn, "Invalid VPORT WFQ weight configuration\n"); + return -1; + } + + for (tc = 0; tc < NUM_OF_TCS; tc++) { + vport_pq_id = first_tx_pq_id[tc]; + if (vport_pq_id != QM_INVALID_PQ_ID) + qed_wr(p_hwfn, + p_ptt, + QM_REG_WFQVPWEIGHT + vport_pq_id * 4, inc_val); + } + + return 0; +} + +int qed_init_vport_rl(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u8 vport_id, u32 vport_rl, u32 link_speed) +{ + u32 inc_val, max_qm_global_rls = MAX_QM_GLOBAL_RLS; + + if (vport_id >= max_qm_global_rls) { + DP_NOTICE(p_hwfn, + "Invalid VPORT ID for rate limiter configuration\n"); + return -1; + } + + inc_val = QM_RL_INC_VAL(vport_rl ? vport_rl : link_speed); + if (inc_val > QM_VP_RL_MAX_INC_VAL(link_speed)) { + DP_NOTICE(p_hwfn, "Invalid VPORT rate-limit configuration\n"); + return -1; + } + + qed_wr(p_hwfn, + p_ptt, + QM_REG_RLGLBLCRD + vport_id * 4, (u32)QM_RL_CRD_REG_SIGN_BIT); + qed_wr(p_hwfn, p_ptt, QM_REG_RLGLBLINCVAL + vport_id * 4, inc_val); + + return 0; +} + +bool qed_send_qm_stop_cmd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + bool is_release_cmd, + bool is_tx_pq, u16 start_pq, u16 num_pqs) +{ + u32 cmd_arr[QM_CMD_STRUCT_SIZE(QM_STOP_CMD)] = { 0 }; + u32 pq_mask = 0, last_pq, pq_id; + + last_pq = start_pq + num_pqs - 1; + + /* Set command's PQ type */ + QM_CMD_SET_FIELD(cmd_arr, QM_STOP_CMD, PQ_TYPE, is_tx_pq ? 
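/* Note on the PQ loop that follows in qed_send_qm_stop_cmd(): PQs are batched
 * into QM_STOP_PQ_MASK_WIDTH (32) wide groups, and a command is written when
 * either the group's last bit or the final PQ is reached. Stopping PQs 30..40,
 * for example, issues two commands: group 0 with mask bits 30-31 and group 1
 * with mask bits 0-8.
 */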
0 : 1); + + /* Go over requested PQs */ + for (pq_id = start_pq; pq_id <= last_pq; pq_id++) { + /* Set PQ bit in mask (stop command only) */ + if (!is_release_cmd) + pq_mask |= BIT((pq_id % QM_STOP_PQ_MASK_WIDTH)); + + /* If last PQ or end of PQ mask, write command */ + if ((pq_id == last_pq) || + (pq_id % QM_STOP_PQ_MASK_WIDTH == + (QM_STOP_PQ_MASK_WIDTH - 1))) { + QM_CMD_SET_FIELD(cmd_arr, + QM_STOP_CMD, PAUSE_MASK, pq_mask); + QM_CMD_SET_FIELD(cmd_arr, + QM_STOP_CMD, + GROUP_ID, + pq_id / QM_STOP_PQ_MASK_WIDTH); + if (!qed_send_qm_cmd(p_hwfn, p_ptt, QM_STOP_CMD_ADDR, + cmd_arr[0], cmd_arr[1])) + return false; + pq_mask = 0; + } + } + + return true; +} + + +#define SET_TUNNEL_TYPE_ENABLE_BIT(var, offset, enable) \ + do { \ + typeof(var) *__p_var = &(var); \ + typeof(offset) __offset = offset; \ + *__p_var = (*__p_var & ~BIT(__offset)) | \ + ((enable) ? BIT(__offset) : 0); \ + } while (0) +#define PRS_ETH_TUNN_OUTPUT_FORMAT -188897008 +#define PRS_ETH_OUTPUT_FORMAT -46832 + +void qed_set_vxlan_dest_port(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 dest_port) +{ + /* Update PRS register */ + qed_wr(p_hwfn, p_ptt, PRS_REG_VXLAN_PORT, dest_port); + + /* Update NIG register */ + qed_wr(p_hwfn, p_ptt, NIG_REG_VXLAN_CTRL, dest_port); + + /* Update PBF register */ + qed_wr(p_hwfn, p_ptt, PBF_REG_VXLAN_PORT, dest_port); +} + +void qed_set_vxlan_enable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool vxlan_enable) +{ + u32 reg_val; + u8 shift; + + /* Update PRS register */ + reg_val = qed_rd(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN); + shift = PRS_REG_ENCAPSULATION_TYPE_EN_VXLAN_ENABLE_SHIFT; + SET_TUNNEL_TYPE_ENABLE_BIT(reg_val, shift, vxlan_enable); + qed_wr(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN, reg_val); + if (reg_val) { + reg_val = + qed_rd(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0_BB_K2); + + /* Update output only if tunnel blocks not included. */ + if (reg_val == (u32)PRS_ETH_OUTPUT_FORMAT) + qed_wr(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0_BB_K2, + (u32)PRS_ETH_TUNN_OUTPUT_FORMAT); + } + + /* Update NIG register */ + reg_val = qed_rd(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE); + shift = NIG_REG_ENC_TYPE_ENABLE_VXLAN_ENABLE_SHIFT; + SET_TUNNEL_TYPE_ENABLE_BIT(reg_val, shift, vxlan_enable); + qed_wr(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE, reg_val); + + /* Update DORQ register */ + qed_wr(p_hwfn, + p_ptt, DORQ_REG_L2_EDPM_TUNNEL_VXLAN_EN, vxlan_enable ? 1 : 0); +} + +void qed_set_gre_enable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + bool eth_gre_enable, bool ip_gre_enable) +{ + u32 reg_val; + u8 shift; + + /* Update PRS register */ + reg_val = qed_rd(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN); + shift = PRS_REG_ENCAPSULATION_TYPE_EN_ETH_OVER_GRE_ENABLE_SHIFT; + SET_TUNNEL_TYPE_ENABLE_BIT(reg_val, shift, eth_gre_enable); + shift = PRS_REG_ENCAPSULATION_TYPE_EN_IP_OVER_GRE_ENABLE_SHIFT; + SET_TUNNEL_TYPE_ENABLE_BIT(reg_val, shift, ip_gre_enable); + qed_wr(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN, reg_val); + if (reg_val) { + reg_val = + qed_rd(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0_BB_K2); + + /* Update output only if tunnel blocks not included. 
*/ + if (reg_val == (u32)PRS_ETH_OUTPUT_FORMAT) + qed_wr(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0_BB_K2, + (u32)PRS_ETH_TUNN_OUTPUT_FORMAT); + } + + /* Update NIG register */ + reg_val = qed_rd(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE); + shift = NIG_REG_ENC_TYPE_ENABLE_ETH_OVER_GRE_ENABLE_SHIFT; + SET_TUNNEL_TYPE_ENABLE_BIT(reg_val, shift, eth_gre_enable); + shift = NIG_REG_ENC_TYPE_ENABLE_IP_OVER_GRE_ENABLE_SHIFT; + SET_TUNNEL_TYPE_ENABLE_BIT(reg_val, shift, ip_gre_enable); + qed_wr(p_hwfn, p_ptt, NIG_REG_ENC_TYPE_ENABLE, reg_val); + + /* Update DORQ registers */ + qed_wr(p_hwfn, + p_ptt, + DORQ_REG_L2_EDPM_TUNNEL_GRE_ETH_EN, eth_gre_enable ? 1 : 0); + qed_wr(p_hwfn, + p_ptt, DORQ_REG_L2_EDPM_TUNNEL_GRE_IP_EN, ip_gre_enable ? 1 : 0); +} + +void qed_set_geneve_dest_port(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 dest_port) +{ + /* Update PRS register */ + qed_wr(p_hwfn, p_ptt, PRS_REG_NGE_PORT, dest_port); + + /* Update NIG register */ + qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_PORT, dest_port); + + /* Update PBF register */ + qed_wr(p_hwfn, p_ptt, PBF_REG_NGE_PORT, dest_port); +} + +void qed_set_geneve_enable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + bool eth_geneve_enable, bool ip_geneve_enable) +{ + u32 reg_val; + u8 shift; + + /* Update PRS register */ + reg_val = qed_rd(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN); + shift = PRS_REG_ENCAPSULATION_TYPE_EN_ETH_OVER_GENEVE_ENABLE_SHIFT; + SET_TUNNEL_TYPE_ENABLE_BIT(reg_val, shift, eth_geneve_enable); + shift = PRS_REG_ENCAPSULATION_TYPE_EN_IP_OVER_GENEVE_ENABLE_SHIFT; + SET_TUNNEL_TYPE_ENABLE_BIT(reg_val, shift, ip_geneve_enable); + qed_wr(p_hwfn, p_ptt, PRS_REG_ENCAPSULATION_TYPE_EN, reg_val); + if (reg_val) { + reg_val = + qed_rd(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0_BB_K2); + + /* Update output only if tunnel blocks not included. */ + if (reg_val == (u32)PRS_ETH_OUTPUT_FORMAT) + qed_wr(p_hwfn, p_ptt, PRS_REG_OUTPUT_FORMAT_4_0_BB_K2, + (u32)PRS_ETH_TUNN_OUTPUT_FORMAT); + } + + /* Update NIG register */ + qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_ETH_ENABLE, + eth_geneve_enable ? 1 : 0); + qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_IP_ENABLE, ip_geneve_enable ? 1 : 0); + + /* EDPM with geneve tunnel not supported in BB */ + if (QED_IS_BB_B0(p_hwfn->cdev)) + return; + + /* Update DORQ registers */ + qed_wr(p_hwfn, + p_ptt, + DORQ_REG_L2_EDPM_TUNNEL_NGE_ETH_EN_K2_E5, + eth_geneve_enable ? 1 : 0); + qed_wr(p_hwfn, + p_ptt, + DORQ_REG_L2_EDPM_TUNNEL_NGE_IP_EN_K2_E5, + ip_geneve_enable ? 
1 : 0); +} + +#define PRS_ETH_VXLAN_NO_L2_ENABLE_OFFSET 4 +#define PRS_ETH_VXLAN_NO_L2_OUTPUT_FORMAT -927094512 + +void qed_set_vxlan_no_l2_enable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool enable) +{ + u32 reg_val, cfg_mask; + + /* read PRS config register */ + reg_val = qed_rd(p_hwfn, p_ptt, PRS_REG_MSG_INFO); + + /* set VXLAN_NO_L2_ENABLE mask */ + cfg_mask = BIT(PRS_ETH_VXLAN_NO_L2_ENABLE_OFFSET); + + if (enable) { + /* set VXLAN_NO_L2_ENABLE flag */ + reg_val |= cfg_mask; + + /* update PRS FIC register */ + qed_wr(p_hwfn, + p_ptt, + PRS_REG_OUTPUT_FORMAT_4_0_BB_K2, + (u32)PRS_ETH_VXLAN_NO_L2_OUTPUT_FORMAT); + } else { + /* clear VXLAN_NO_L2_ENABLE flag */ + reg_val &= ~cfg_mask; + } + + /* write PRS config register */ + qed_wr(p_hwfn, p_ptt, PRS_REG_MSG_INFO, reg_val); +} + +#define T_ETH_PACKET_ACTION_GFT_EVENTID 23 +#define PARSER_ETH_CONN_GFT_ACTION_CM_HDR 272 +#define T_ETH_PACKET_MATCH_RFS_EVENTID 25 +#define PARSER_ETH_CONN_CM_HDR 0 +#define CAM_LINE_SIZE sizeof(u32) +#define RAM_LINE_SIZE sizeof(u64) +#define REG_SIZE sizeof(u32) + +void qed_gft_disable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u16 pf_id) +{ + /* Disable gft search for PF */ + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_GFT, 0); + + /* Clean ram & cam for next gft session */ + + /* Zero camline */ + qed_wr(p_hwfn, p_ptt, PRS_REG_GFT_CAM + CAM_LINE_SIZE * pf_id, 0); + + /* Zero ramline */ + qed_wr(p_hwfn, + p_ptt, PRS_REG_GFT_PROFILE_MASK_RAM + RAM_LINE_SIZE * pf_id, 0); + qed_wr(p_hwfn, + p_ptt, + PRS_REG_GFT_PROFILE_MASK_RAM + RAM_LINE_SIZE * pf_id + REG_SIZE, + 0); +} + +void qed_set_gft_event_id_cm_hdr(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 rfs_cm_hdr_event_id; + + /* Set RFS event ID to be awakened i Tstorm By Prs */ + rfs_cm_hdr_event_id = qed_rd(p_hwfn, p_ptt, PRS_REG_CM_HDR_GFT); + rfs_cm_hdr_event_id |= T_ETH_PACKET_ACTION_GFT_EVENTID << + PRS_REG_CM_HDR_GFT_EVENT_ID_SHIFT; + rfs_cm_hdr_event_id |= PARSER_ETH_CONN_GFT_ACTION_CM_HDR << + PRS_REG_CM_HDR_GFT_CM_HDR_SHIFT; + qed_wr(p_hwfn, p_ptt, PRS_REG_CM_HDR_GFT, rfs_cm_hdr_event_id); +} + +void qed_gft_config(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 pf_id, + bool tcp, + bool udp, + bool ipv4, bool ipv6, enum gft_profile_type profile_type) +{ + u32 reg_val, cam_line, ram_line_lo, ram_line_hi; + + if (!ipv6 && !ipv4) + DP_NOTICE(p_hwfn, + "gft_config: must accept at least on of - ipv4 or ipv6'\n"); + if (!tcp && !udp) + DP_NOTICE(p_hwfn, + "gft_config: must accept at least on of - udp or tcp\n"); + if (profile_type >= MAX_GFT_PROFILE_TYPE) + DP_NOTICE(p_hwfn, "gft_config: unsupported gft_profile_type\n"); + + /* Set RFS event ID to be awakened i Tstorm By Prs */ + reg_val = T_ETH_PACKET_MATCH_RFS_EVENTID << + PRS_REG_CM_HDR_GFT_EVENT_ID_SHIFT; + reg_val |= PARSER_ETH_CONN_CM_HDR << PRS_REG_CM_HDR_GFT_CM_HDR_SHIFT; + qed_wr(p_hwfn, p_ptt, PRS_REG_CM_HDR_GFT, reg_val); + + /* Do not load context only cid in PRS on match. */ + qed_wr(p_hwfn, p_ptt, PRS_REG_LOAD_L2_FILTER, 0); + + /* Do not use tenant ID exist bit for gft search */ + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_TENANT_ID, 0); + + /* Set Cam */ + cam_line = 0; + SET_FIELD(cam_line, GFT_CAM_LINE_MAPPED_VALID, 1); + + /* Filters are per PF!! 
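Each PF owns its own CAM line and RAM line (indexed by pf_id), and the CAM entry is qualified with the PF ID so a match only applies to this PF.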
*/ + SET_FIELD(cam_line, + GFT_CAM_LINE_MAPPED_PF_ID_MASK, + GFT_CAM_LINE_MAPPED_PF_ID_MASK_MASK); + SET_FIELD(cam_line, GFT_CAM_LINE_MAPPED_PF_ID, pf_id); + + if (!(tcp && udp)) { + SET_FIELD(cam_line, + GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_MASK, + GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_MASK_MASK); + if (tcp) + SET_FIELD(cam_line, + GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE, + GFT_PROFILE_TCP_PROTOCOL); + else + SET_FIELD(cam_line, + GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE, + GFT_PROFILE_UDP_PROTOCOL); + } + + if (!(ipv4 && ipv6)) { + SET_FIELD(cam_line, GFT_CAM_LINE_MAPPED_IP_VERSION_MASK, 1); + if (ipv4) + SET_FIELD(cam_line, + GFT_CAM_LINE_MAPPED_IP_VERSION, + GFT_PROFILE_IPV4); + else + SET_FIELD(cam_line, + GFT_CAM_LINE_MAPPED_IP_VERSION, + GFT_PROFILE_IPV6); + } + + /* Write characteristics to cam */ + qed_wr(p_hwfn, p_ptt, PRS_REG_GFT_CAM + CAM_LINE_SIZE * pf_id, + cam_line); + cam_line = + qed_rd(p_hwfn, p_ptt, PRS_REG_GFT_CAM + CAM_LINE_SIZE * pf_id); + + /* Write line to RAM - compare to filter 4 tuple */ + ram_line_lo = 0; + ram_line_hi = 0; + + /* Tunnel type */ + SET_FIELD(ram_line_lo, GFT_RAM_LINE_TUNNEL_DST_PORT, 1); + SET_FIELD(ram_line_lo, GFT_RAM_LINE_TUNNEL_OVER_IP_PROTOCOL, 1); + + if (profile_type == GFT_PROFILE_TYPE_4_TUPLE) { + SET_FIELD(ram_line_hi, GFT_RAM_LINE_DST_IP, 1); + SET_FIELD(ram_line_hi, GFT_RAM_LINE_SRC_IP, 1); + SET_FIELD(ram_line_hi, GFT_RAM_LINE_OVER_IP_PROTOCOL, 1); + SET_FIELD(ram_line_lo, GFT_RAM_LINE_ETHERTYPE, 1); + SET_FIELD(ram_line_lo, GFT_RAM_LINE_SRC_PORT, 1); + SET_FIELD(ram_line_lo, GFT_RAM_LINE_DST_PORT, 1); + } else if (profile_type == GFT_PROFILE_TYPE_L4_DST_PORT) { + SET_FIELD(ram_line_hi, GFT_RAM_LINE_OVER_IP_PROTOCOL, 1); + SET_FIELD(ram_line_lo, GFT_RAM_LINE_ETHERTYPE, 1); + SET_FIELD(ram_line_lo, GFT_RAM_LINE_DST_PORT, 1); + } else if (profile_type == GFT_PROFILE_TYPE_IP_DST_ADDR) { + SET_FIELD(ram_line_hi, GFT_RAM_LINE_DST_IP, 1); + SET_FIELD(ram_line_lo, GFT_RAM_LINE_ETHERTYPE, 1); + } else if (profile_type == GFT_PROFILE_TYPE_IP_SRC_ADDR) { + SET_FIELD(ram_line_hi, GFT_RAM_LINE_SRC_IP, 1); + SET_FIELD(ram_line_lo, GFT_RAM_LINE_ETHERTYPE, 1); + } else if (profile_type == GFT_PROFILE_TYPE_TUNNEL_TYPE) { + SET_FIELD(ram_line_lo, GFT_RAM_LINE_TUNNEL_ETHERTYPE, 1); + } + + qed_wr(p_hwfn, + p_ptt, + PRS_REG_GFT_PROFILE_MASK_RAM + RAM_LINE_SIZE * pf_id, + ram_line_lo); + qed_wr(p_hwfn, + p_ptt, + PRS_REG_GFT_PROFILE_MASK_RAM + RAM_LINE_SIZE * pf_id + REG_SIZE, + ram_line_hi); + + /* Set default profile so that no filter match will happen */ + qed_wr(p_hwfn, + p_ptt, + PRS_REG_GFT_PROFILE_MASK_RAM + RAM_LINE_SIZE * + PRS_GFT_CAM_LINES_NO_MATCH, 0xffffffff); + qed_wr(p_hwfn, + p_ptt, + PRS_REG_GFT_PROFILE_MASK_RAM + RAM_LINE_SIZE * + PRS_GFT_CAM_LINES_NO_MATCH + REG_SIZE, 0x3ff); + + /* Enable gft search */ + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_GFT, 1); +} + +DECLARE_CRC8_TABLE(cdu_crc8_table); + +/* Calculate and return CDU validation byte per connection type/region/cid */ +static u8 qed_calc_cdu_validation_byte(u8 conn_type, u8 region, u32 cid) +{ + const u8 validation_cfg = CDU_VALIDATION_DEFAULT_CFG; + u8 crc, validation_byte = 0; + static u8 crc8_table_valid; /* automatically initialized to 0 */ + u32 validation_string = 0; + u32 data_to_crc; + + if (!crc8_table_valid) { + crc8_populate_msb(cdu_crc8_table, 0x07); + crc8_table_valid = 1; + } + + /* The CRC is calculated on the String-to-compress: + * [31:8] = {CID[31:20],CID[11:0]} + * [7:4] = Region + * [3:0] = Type + */ + if ((validation_cfg >> 
CDU_CONTEXT_VALIDATION_CFG_USE_CID) & 1) + validation_string |= (cid & 0xFFF00000) | ((cid & 0xFFF) << 8); + + if ((validation_cfg >> CDU_CONTEXT_VALIDATION_CFG_USE_REGION) & 1) + validation_string |= ((region & 0xF) << 4); + + if ((validation_cfg >> CDU_CONTEXT_VALIDATION_CFG_USE_TYPE) & 1) + validation_string |= (conn_type & 0xF); + + /* Convert to big-endian and calculate CRC8 */ + data_to_crc = be32_to_cpu(validation_string); + + crc = crc8(cdu_crc8_table, + (u8 *)&data_to_crc, sizeof(data_to_crc), CRC8_INIT_VALUE); + + /* The validation byte [7:0] is composed: + * for type A validation + * [7] = active configuration bit + * [6:0] = crc[6:0] + * + * for type B validation + * [7] = active configuration bit + * [6:3] = connection_type[3:0] + * [2:0] = crc[2:0] + */ + validation_byte |= + ((validation_cfg >> + CDU_CONTEXT_VALIDATION_CFG_USE_ACTIVE) & 1) << 7; + + if ((validation_cfg >> + CDU_CONTEXT_VALIDATION_CFG_VALIDATION_TYPE_SHIFT) & 1) + validation_byte |= ((conn_type & 0xF) << 3) | (crc & 0x7); + else + validation_byte |= crc & 0x7F; + + return validation_byte; +} + +/* Calcualte and set validation bytes for session context */ +void qed_calc_session_ctx_validation(void *p_ctx_mem, + u16 ctx_size, u8 ctx_type, u32 cid) +{ + u8 *x_val_ptr, *t_val_ptr, *u_val_ptr, *p_ctx; + + p_ctx = (u8 * const)p_ctx_mem; + x_val_ptr = &p_ctx[con_region_offsets[0][ctx_type]]; + t_val_ptr = &p_ctx[con_region_offsets[1][ctx_type]]; + u_val_ptr = &p_ctx[con_region_offsets[2][ctx_type]]; + + memset(p_ctx, 0, ctx_size); + + *x_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 3, cid); + *t_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 4, cid); + *u_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 5, cid); +} + +/* Calcualte and set validation bytes for task context */ +void qed_calc_task_ctx_validation(void *p_ctx_mem, + u16 ctx_size, u8 ctx_type, u32 tid) +{ + u8 *p_ctx, *region1_val_ptr; + + p_ctx = (u8 * const)p_ctx_mem; + region1_val_ptr = &p_ctx[task_region_offsets[0][ctx_type]]; + + memset(p_ctx, 0, ctx_size); + + *region1_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 1, tid); +} + +/* Memset session context to 0 while preserving validation bytes */ +void qed_memset_session_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type) +{ + u8 *x_val_ptr, *t_val_ptr, *u_val_ptr, *p_ctx; + u8 x_val, t_val, u_val; + + p_ctx = (u8 * const)p_ctx_mem; + x_val_ptr = &p_ctx[con_region_offsets[0][ctx_type]]; + t_val_ptr = &p_ctx[con_region_offsets[1][ctx_type]]; + u_val_ptr = &p_ctx[con_region_offsets[2][ctx_type]]; + + x_val = *x_val_ptr; + t_val = *t_val_ptr; + u_val = *u_val_ptr; + + memset(p_ctx, 0, ctx_size); + + *x_val_ptr = x_val; + *t_val_ptr = t_val; + *u_val_ptr = u_val; +} + +/* Memset task context to 0 while preserving validation bytes */ +void qed_memset_task_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type) +{ + u8 *p_ctx, *region1_val_ptr; + u8 region1_val; + + p_ctx = (u8 * const)p_ctx_mem; + region1_val_ptr = &p_ctx[task_region_offsets[0][ctx_type]]; + + region1_val = *region1_val_ptr; + + memset(p_ctx, 0, ctx_size); + + *region1_val_ptr = region1_val; +} + +/* Enable and configure context validation */ +void qed_enable_context_validation(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 ctx_validation; + + /* Enable validation for connection region 3: CCFC_CTX_VALID0[31:24] */ + ctx_validation = CDU_VALIDATION_DEFAULT_CFG << 24; + qed_wr(p_hwfn, p_ptt, CDU_REG_CCFC_CTX_VALID0, ctx_validation); + + /* Enable validation for connection region 5: CCFC_CTX_VALID1[15:8] */ + ctx_validation = 
CDU_VALIDATION_DEFAULT_CFG << 8; + qed_wr(p_hwfn, p_ptt, CDU_REG_CCFC_CTX_VALID1, ctx_validation); + + /* Enable validation for connection region 1: TCFC_CTX_VALID0[15:8] */ + ctx_validation = CDU_VALIDATION_DEFAULT_CFG << 8; + qed_wr(p_hwfn, p_ptt, CDU_REG_TCFC_CTX_VALID0, ctx_validation); +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c new file mode 100644 index 0000000..3bb76da --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c @@ -0,0 +1,590 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_init_ops.h" +#include "qed_reg_addr.h" +#include "qed_sriov.h" + +#define QED_INIT_MAX_POLL_COUNT 100 +#define QED_INIT_POLL_PERIOD_US 500 + +static u32 pxp_global_win[] = { + 0, + 0, + 0x1c02, /* win 2: addr=0x1c02000, size=4096 bytes */ + 0x1c80, /* win 3: addr=0x1c80000, size=4096 bytes */ + 0x1d00, /* win 4: addr=0x1d00000, size=4096 bytes */ + 0x1d01, /* win 5: addr=0x1d01000, size=4096 bytes */ + 0x1d80, /* win 6: addr=0x1d80000, size=4096 bytes */ + 0x1d81, /* win 7: addr=0x1d81000, size=4096 bytes */ + 0x1d82, /* win 8: addr=0x1d82000, size=4096 bytes */ + 0x1e00, /* win 9: addr=0x1e00000, size=4096 bytes */ + 0x1e80, /* win 10: addr=0x1e80000, size=4096 bytes */ + 0x1f00, /* win 11: addr=0x1f00000, size=4096 bytes */ + 0, + 0, + 0, + 0, + 0, + 0, + 0, +}; + +void qed_init_iro_array(struct qed_dev *cdev) +{ + cdev->iro_arr = iro_arr; +} + +/* Runtime configuration helpers */ +void qed_init_clear_rt_data(struct qed_hwfn *p_hwfn) +{ + int i; + + for (i = 0; i < RUNTIME_ARRAY_SIZE; i++) + p_hwfn->rt_data.b_valid[i] = false; +} + +void qed_init_store_rt_reg(struct qed_hwfn *p_hwfn, u32 rt_offset, u32 val) +{ + p_hwfn->rt_data.init_val[rt_offset] = val; + p_hwfn->rt_data.b_valid[rt_offset] = true; +} + +void qed_init_store_rt_agg(struct qed_hwfn *p_hwfn, + u32 rt_offset, u32 *p_val, size_t size) +{ + size_t i; + + for (i = 0; i < size / sizeof(u32); i++) { + p_hwfn->rt_data.init_val[rt_offset + i] = p_val[i]; + p_hwfn->rt_data.b_valid[rt_offset + i] = true; + } +} + +static int qed_init_rt(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 addr, u16 rt_offset, u16 size, bool b_must_dmae) +{ + u32 *p_init_val = &p_hwfn->rt_data.init_val[rt_offset]; + bool *p_valid = &p_hwfn->rt_data.b_valid[rt_offset]; + u16 i, segment; + int rc = 0; + + /* Since not all RT entries are initialized, go over the RT and + * for each segment of initialized values use DMA. + */ + for (i = 0; i < size; i++) { + if (!p_valid[i]) + continue; + + /* In case there isn't any wide-bus configuration here, + * simply write the data instead of using dmae. 
+ */ + if (!b_must_dmae) { + qed_wr(p_hwfn, p_ptt, addr + (i << 2), p_init_val[i]); + continue; + } + + /* Start of a new segment */ + for (segment = 1; i + segment < size; segment++) + if (!p_valid[i + segment]) + break; + + rc = qed_dmae_host2grc(p_hwfn, p_ptt, + (uintptr_t)(p_init_val + i), + addr + (i << 2), segment, 0); + if (rc) + return rc; + + /* Jump over the entire segment, including invalid entry */ + i += segment; + } + + return rc; +} + +int qed_init_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_rt_data *rt_data = &p_hwfn->rt_data; + + if (IS_VF(p_hwfn->cdev)) + return 0; + + rt_data->b_valid = kzalloc(sizeof(bool) * RUNTIME_ARRAY_SIZE, + GFP_KERNEL); + if (!rt_data->b_valid) + return -ENOMEM; + + rt_data->init_val = kzalloc(sizeof(u32) * RUNTIME_ARRAY_SIZE, + GFP_KERNEL); + if (!rt_data->init_val) { + kfree(rt_data->b_valid); + rt_data->b_valid = NULL; + return -ENOMEM; + } + + return 0; +} + +void qed_init_free(struct qed_hwfn *p_hwfn) +{ + kfree(p_hwfn->rt_data.init_val); + p_hwfn->rt_data.init_val = NULL; + kfree(p_hwfn->rt_data.b_valid); + p_hwfn->rt_data.b_valid = NULL; +} + +static int qed_init_array_dmae(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 addr, + u32 dmae_data_offset, + u32 size, + const u32 *buf, + bool b_must_dmae, + bool b_can_dmae) +{ + int rc = 0; + + /* Perform DMAE only for lengthy enough sections or for wide-bus */ + if (!b_can_dmae || (!b_must_dmae && (size < 16))) { + const u32 *data = buf + dmae_data_offset; + u32 i; + + for (i = 0; i < size; i++) + qed_wr(p_hwfn, p_ptt, addr + (i << 2), data[i]); + } else { + rc = qed_dmae_host2grc(p_hwfn, p_ptt, + (uintptr_t)(buf + dmae_data_offset), + addr, size, 0); + } + + return rc; +} + +static int qed_init_fill_dmae(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 addr, u32 fill, u32 fill_count) +{ + static u32 zero_buffer[DMAE_MAX_RW_SIZE]; + + memset(zero_buffer, 0, sizeof(u32) * DMAE_MAX_RW_SIZE); + + /* invoke the DMAE virtual/physical buffer API with + * 1. DMAE init channel + * 2. addr, + * 3. p_hwfb->temp_data, + * 4. 
fill_count + */ + + return qed_dmae_host2grc(p_hwfn, p_ptt, + (uintptr_t)(&zero_buffer[0]), + addr, fill_count, QED_DMAE_FLAG_RW_REPL_SRC); +} + +static void qed_init_fill(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 addr, u32 fill, u32 fill_count) +{ + u32 i; + + for (i = 0; i < fill_count; i++, addr += sizeof(u32)) + qed_wr(p_hwfn, p_ptt, addr, fill); +} + +static int qed_init_cmd_array(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct init_write_op *cmd, + bool b_must_dmae, bool b_can_dmae) +{ + u32 dmae_array_offset = le32_to_cpu(cmd->args.array_offset); + u32 data = le32_to_cpu(cmd->data); + u32 addr = GET_FIELD(data, INIT_WRITE_OP_ADDRESS) << 2; + + u32 offset, output_len, input_len, max_size; + struct qed_dev *cdev = p_hwfn->cdev; + union init_array_hdr *hdr; + const u32 *array_data; + int rc = 0; + u32 size; + + array_data = cdev->fw_data->arr_data; + + hdr = (union init_array_hdr *)(array_data + dmae_array_offset); + data = le32_to_cpu(hdr->raw.data); + switch (GET_FIELD(data, INIT_ARRAY_RAW_HDR_TYPE)) { + case INIT_ARR_ZIPPED: + offset = dmae_array_offset + 1; + input_len = GET_FIELD(data, + INIT_ARRAY_ZIPPED_HDR_ZIPPED_SIZE); + max_size = MAX_ZIPPED_SIZE * 4; + memset(p_hwfn->unzip_buf, 0, max_size); + + output_len = qed_unzip_data(p_hwfn, input_len, + (u8 *)&array_data[offset], + max_size, (u8 *)p_hwfn->unzip_buf); + if (output_len) { + rc = qed_init_array_dmae(p_hwfn, p_ptt, addr, 0, + output_len, + p_hwfn->unzip_buf, + b_must_dmae, b_can_dmae); + } else { + DP_NOTICE(p_hwfn, "Failed to unzip dmae data\n"); + rc = -EINVAL; + } + break; + case INIT_ARR_PATTERN: + { + u32 repeats = GET_FIELD(data, + INIT_ARRAY_PATTERN_HDR_REPETITIONS); + u32 i; + + size = GET_FIELD(data, INIT_ARRAY_PATTERN_HDR_PATTERN_SIZE); + + for (i = 0; i < repeats; i++, addr += size << 2) { + rc = qed_init_array_dmae(p_hwfn, p_ptt, addr, + dmae_array_offset + 1, + size, array_data, + b_must_dmae, b_can_dmae); + if (rc) + break; + } + break; + } + case INIT_ARR_STANDARD: + size = GET_FIELD(data, INIT_ARRAY_STANDARD_HDR_SIZE); + rc = qed_init_array_dmae(p_hwfn, p_ptt, addr, + dmae_array_offset + 1, + size, array_data, + b_must_dmae, b_can_dmae); + break; + } + + return rc; +} + +/* init_ops write command */ +static int qed_init_cmd_wr(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct init_write_op *p_cmd, bool b_can_dmae) +{ + u32 data = le32_to_cpu(p_cmd->data); + bool b_must_dmae = GET_FIELD(data, INIT_WRITE_OP_WIDE_BUS); + u32 addr = GET_FIELD(data, INIT_WRITE_OP_ADDRESS) << 2; + union init_write_args *arg = &p_cmd->args; + int rc = 0; + + /* Sanitize */ + if (b_must_dmae && !b_can_dmae) { + DP_NOTICE(p_hwfn, + "Need to write to %08x for Wide-bus but DMAE isn't allowed\n", + addr); + return -EINVAL; + } + + switch (GET_FIELD(data, INIT_WRITE_OP_SOURCE)) { + case INIT_SRC_INLINE: + data = le32_to_cpu(p_cmd->args.inline_val); + qed_wr(p_hwfn, p_ptt, addr, data); + break; + case INIT_SRC_ZEROS: + data = le32_to_cpu(p_cmd->args.zeros_count); + if (b_must_dmae || (b_can_dmae && (data >= 64))) + rc = qed_init_fill_dmae(p_hwfn, p_ptt, addr, 0, data); + else + qed_init_fill(p_hwfn, p_ptt, addr, 0, data); + break; + case INIT_SRC_ARRAY: + rc = qed_init_cmd_array(p_hwfn, p_ptt, p_cmd, + b_must_dmae, b_can_dmae); + break; + case INIT_SRC_RUNTIME: + qed_init_rt(p_hwfn, p_ptt, addr, + le16_to_cpu(arg->runtime.offset), + le16_to_cpu(arg->runtime.size), + b_must_dmae); + break; + } + + return rc; +} + +static inline bool comp_eq(u32 val, u32 expected_val) +{ + return val == expected_val; +} + 
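+/* Poll comparators for qed_init_cmd_rd() below: INIT_POLL_EQ uses
+ * comp_eq() (exact match), INIT_POLL_AND uses comp_and() (all expected
+ * bits set) and INIT_POLL_OR uses comp_or() (true when val or
+ * expected_val is non-zero).
+ */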
+static inline bool comp_and(u32 val, u32 expected_val) +{ + return (val & expected_val) == expected_val; +} + +static inline bool comp_or(u32 val, u32 expected_val) +{ + return (val | expected_val) > 0; +} + +/* init_ops read/poll commands */ +static void qed_init_cmd_rd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, struct init_read_op *cmd) +{ + bool (*comp_check)(u32 val, u32 expected_val); + u32 delay = QED_INIT_POLL_PERIOD_US, val; + u32 data, addr, poll; + int i; + + data = le32_to_cpu(cmd->op_data); + addr = GET_FIELD(data, INIT_READ_OP_ADDRESS) << 2; + poll = GET_FIELD(data, INIT_READ_OP_POLL_TYPE); + + + val = qed_rd(p_hwfn, p_ptt, addr); + + if (poll == INIT_POLL_NONE) + return; + + switch (poll) { + case INIT_POLL_EQ: + comp_check = comp_eq; + break; + case INIT_POLL_OR: + comp_check = comp_or; + break; + case INIT_POLL_AND: + comp_check = comp_and; + break; + default: + DP_ERR(p_hwfn, "Invalid poll comparison type %08x\n", + cmd->op_data); + return; + } + + data = le32_to_cpu(cmd->expected_val); + for (i = 0; + i < QED_INIT_MAX_POLL_COUNT && !comp_check(val, data); + i++) { + udelay(delay); + val = qed_rd(p_hwfn, p_ptt, addr); + } + + if (i == QED_INIT_MAX_POLL_COUNT) { + DP_ERR(p_hwfn, + "Timeout when polling reg: 0x%08x [ Waiting-for: %08x Got: %08x (comparsion %08x)]\n", + addr, le32_to_cpu(cmd->expected_val), + val, le32_to_cpu(cmd->op_data)); + } +} + +/* init_ops callbacks entry point */ +static int qed_init_cmd_cb(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct init_callback_op *p_cmd) +{ + int rc; + + switch (p_cmd->callback_id) { + case DMAE_READY_CB: + rc = qed_dmae_sanity(p_hwfn, p_ptt, "engine_phase"); + break; + default: + DP_NOTICE(p_hwfn, "Unexpected init op callback ID %d\n", + p_cmd->callback_id); + return -EINVAL; + } + + return rc; +} + +static u8 qed_init_cmd_mode_match(struct qed_hwfn *p_hwfn, + u16 *p_offset, int modes) +{ + struct qed_dev *cdev = p_hwfn->cdev; + const u8 *modes_tree_buf; + u8 arg1, arg2, tree_val; + + modes_tree_buf = cdev->fw_data->modes_tree_buf; + tree_val = modes_tree_buf[(*p_offset)++]; + switch (tree_val) { + case INIT_MODE_OP_NOT: + return qed_init_cmd_mode_match(p_hwfn, p_offset, modes) ^ 1; + case INIT_MODE_OP_OR: + arg1 = qed_init_cmd_mode_match(p_hwfn, p_offset, modes); + arg2 = qed_init_cmd_mode_match(p_hwfn, p_offset, modes); + return arg1 | arg2; + case INIT_MODE_OP_AND: + arg1 = qed_init_cmd_mode_match(p_hwfn, p_offset, modes); + arg2 = qed_init_cmd_mode_match(p_hwfn, p_offset, modes); + return arg1 & arg2; + default: + tree_val -= MAX_INIT_MODE_OPS; + return (modes & BIT(tree_val)) ? 
1 : 0; + } +} + +static u32 qed_init_cmd_mode(struct qed_hwfn *p_hwfn, + struct init_if_mode_op *p_cmd, int modes) +{ + u16 offset = le16_to_cpu(p_cmd->modes_buf_offset); + + if (qed_init_cmd_mode_match(p_hwfn, &offset, modes)) + return 0; + else + return GET_FIELD(le32_to_cpu(p_cmd->op_data), + INIT_IF_MODE_OP_CMD_OFFSET); +} + +static u32 qed_init_cmd_phase(struct qed_hwfn *p_hwfn, + struct init_if_phase_op *p_cmd, + u32 phase, u32 phase_id) +{ + u32 data = le32_to_cpu(p_cmd->phase_data); + u32 op_data = le32_to_cpu(p_cmd->op_data); + + if (!(GET_FIELD(data, INIT_IF_PHASE_OP_PHASE) == phase && + (GET_FIELD(data, INIT_IF_PHASE_OP_PHASE_ID) == ANY_PHASE_ID || + GET_FIELD(data, INIT_IF_PHASE_OP_PHASE_ID) == phase_id))) + return GET_FIELD(op_data, INIT_IF_PHASE_OP_CMD_OFFSET); + else + return 0; +} + +int qed_init_run(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, int phase, int phase_id, int modes) +{ + struct qed_dev *cdev = p_hwfn->cdev; + u32 cmd_num, num_init_ops; + union init_op *init_ops; + bool b_dmae = false; + int rc = 0; + + num_init_ops = cdev->fw_data->init_ops_size; + init_ops = cdev->fw_data->init_ops; + + p_hwfn->unzip_buf = kzalloc(MAX_ZIPPED_SIZE * 4, GFP_ATOMIC); + if (!p_hwfn->unzip_buf) + return -ENOMEM; + + for (cmd_num = 0; cmd_num < num_init_ops; cmd_num++) { + union init_op *cmd = &init_ops[cmd_num]; + u32 data = le32_to_cpu(cmd->raw.op_data); + + switch (GET_FIELD(data, INIT_CALLBACK_OP_OP)) { + case INIT_OP_WRITE: + rc = qed_init_cmd_wr(p_hwfn, p_ptt, &cmd->write, + b_dmae); + break; + case INIT_OP_READ: + qed_init_cmd_rd(p_hwfn, p_ptt, &cmd->read); + break; + case INIT_OP_IF_MODE: + cmd_num += qed_init_cmd_mode(p_hwfn, &cmd->if_mode, + modes); + break; + case INIT_OP_IF_PHASE: + cmd_num += qed_init_cmd_phase(p_hwfn, &cmd->if_phase, + phase, phase_id); + b_dmae = GET_FIELD(data, INIT_IF_PHASE_OP_DMAE_ENABLE); + break; + case INIT_OP_DELAY: + /* qed_init_run is always invoked from + * sleep-able context + */ + udelay(le32_to_cpu(cmd->delay.delay)); + break; + + case INIT_OP_CALLBACK: + rc = qed_init_cmd_cb(p_hwfn, p_ptt, &cmd->callback); + break; + } + + if (rc) + break; + } + + kfree(p_hwfn->unzip_buf); + p_hwfn->unzip_buf = NULL; + return rc; +} + +void qed_gtt_init(struct qed_hwfn *p_hwfn) +{ + u32 gtt_base; + u32 i; + + /* Set the global windows */ + gtt_base = PXP_PF_WINDOW_ADMIN_START + PXP_PF_WINDOW_ADMIN_GLOBAL_START; + + for (i = 0; i < ARRAY_SIZE(pxp_global_win); i++) + if (pxp_global_win[i]) + REG_WR(p_hwfn, gtt_base + i * PXP_GLOBAL_ENTRY_SIZE, + pxp_global_win[i]); +} + +int qed_init_fw_data(struct qed_dev *cdev, const u8 *data) +{ + struct qed_fw_data *fw = cdev->fw_data; + struct bin_buffer_hdr *buf_hdr; + u32 offset, len; + + if (!data) { + DP_NOTICE(cdev, "Invalid fw data\n"); + return -EINVAL; + } + + /* First Dword contains metadata and should be skipped */ + buf_hdr = (struct bin_buffer_hdr *)data; + + offset = buf_hdr[BIN_BUF_INIT_FW_VER_INFO].offset; + fw->fw_ver_info = (struct fw_ver_info *)(data + offset); + + offset = buf_hdr[BIN_BUF_INIT_CMD].offset; + fw->init_ops = (union init_op *)(data + offset); + + offset = buf_hdr[BIN_BUF_INIT_VAL].offset; + fw->arr_data = (u32 *)(data + offset); + + offset = buf_hdr[BIN_BUF_INIT_MODE_TREE].offset; + fw->modes_tree_buf = (u8 *)(data + offset); + len = buf_hdr[BIN_BUF_INIT_CMD].length; + fw->init_ops_size = len / sizeof(struct init_raw_op); + + return 0; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_ops.h b/drivers/net/ethernet/qlogic/qed/qed_init_ops.h new file mode 100644 index 
0000000..555dd08 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_init_ops.h @@ -0,0 +1,134 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_INIT_OPS_H +#define _QED_INIT_OPS_H + +#include +#include +#include "qed.h" + +/** + * @brief qed_init_iro_array - init iro_arr. + * + * + * @param cdev + */ +void qed_init_iro_array(struct qed_dev *cdev); + +/** + * @brief qed_init_run - Run the init-sequence. + * + * + * @param p_hwfn + * @param p_ptt + * @param phase + * @param phase_id + * @param modes + * @return _qed_status_t + */ +int qed_init_run(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + int phase, + int phase_id, + int modes); + +/** + * @brief qed_init_hwfn_allocate - Allocate RT array, Store 'values' ptrs. + * + * + * @param p_hwfn + * + * @return _qed_status_t + */ +int qed_init_alloc(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_init_hwfn_deallocate + * + * + * @param p_hwfn + */ +void qed_init_free(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_init_clear_rt_data - Clears the runtime init array. + * + * + * @param p_hwfn + */ +void qed_init_clear_rt_data(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_init_store_rt_reg - Store a configuration value in the RT array. + * + * + * @param p_hwfn + * @param rt_offset + * @param val + */ +void qed_init_store_rt_reg(struct qed_hwfn *p_hwfn, + u32 rt_offset, + u32 val); + +#define STORE_RT_REG(hwfn, offset, val) \ + qed_init_store_rt_reg(hwfn, offset, val) + +#define OVERWRITE_RT_REG(hwfn, offset, val) \ + qed_init_store_rt_reg(hwfn, offset, val) + +/** + * @brief + * + * + * @param p_hwfn + * @param rt_offset + * @param val + * @param size + */ +void qed_init_store_rt_agg(struct qed_hwfn *p_hwfn, + u32 rt_offset, + u32 *val, + size_t size); + +#define STORE_RT_REG_AGG(hwfn, offset, val) \ + qed_init_store_rt_agg(hwfn, offset, (u32 *)&val, sizeof(val)) + +/** + * @brief + * Initialize GTT global windows and set admin window + * related params of GTT/PTT to default values. 
+ * + * @param p_hwfn + */ +void qed_gtt_init(struct qed_hwfn *p_hwfn); +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c b/drivers/net/ethernet/qlogic/qed/qed_int.c new file mode 100644 index 0000000..af3a28e --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_int.c @@ -0,0 +1,2234 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_init_ops.h" +#include "qed_int.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" +#include "qed_sriov.h" +#include "qed_vf.h" + +struct qed_pi_info { + qed_int_comp_cb_t comp_cb; + void *cookie; +}; + +struct qed_sb_sp_info { + struct qed_sb_info sb_info; + + /* per protocol index data */ + struct qed_pi_info pi_info_arr[PIS_PER_SB_E4]; +}; + +enum qed_attention_type { + QED_ATTN_TYPE_ATTN, + QED_ATTN_TYPE_PARITY, +}; + +#define SB_ATTN_ALIGNED_SIZE(p_hwfn) \ + ALIGNED_TYPE_SIZE(struct atten_status_block, p_hwfn) + +struct aeu_invert_reg_bit { + char bit_name[30]; + +#define ATTENTION_PARITY (1 << 0) + +#define ATTENTION_LENGTH_MASK (0x00000ff0) +#define ATTENTION_LENGTH_SHIFT (4) +#define ATTENTION_LENGTH(flags) (((flags) & ATTENTION_LENGTH_MASK) >> \ + ATTENTION_LENGTH_SHIFT) +#define ATTENTION_SINGLE BIT(ATTENTION_LENGTH_SHIFT) +#define ATTENTION_PAR (ATTENTION_SINGLE | ATTENTION_PARITY) +#define ATTENTION_PAR_INT ((2 << ATTENTION_LENGTH_SHIFT) | \ + ATTENTION_PARITY) + +/* Multiple bits start with this offset */ +#define ATTENTION_OFFSET_MASK (0x000ff000) +#define ATTENTION_OFFSET_SHIFT (12) + +#define ATTENTION_BB_MASK (0x00700000) +#define ATTENTION_BB_SHIFT (20) +#define ATTENTION_BB(value) (value << ATTENTION_BB_SHIFT) +#define ATTENTION_BB_DIFFERENT BIT(23) + + unsigned int flags; + + /* Callback to call if attention will be triggered */ + int (*cb)(struct qed_hwfn *p_hwfn); + + enum block_id block_index; +}; + +struct aeu_invert_reg { + struct aeu_invert_reg_bit bits[32]; +}; + +#define MAX_ATTN_GRPS (8) +#define NUM_ATTN_REGS (9) + +/* Specific HW attention callbacks */ +static int 
qed_mcp_attn_cb(struct qed_hwfn *p_hwfn) +{ + u32 tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, MCP_REG_CPU_STATE); + + /* This might occur on certain instances; Log it once then mask it */ + DP_INFO(p_hwfn->cdev, "MCP_REG_CPU_STATE: %08x - Masking...\n", + tmp); + qed_wr(p_hwfn, p_hwfn->p_dpc_ptt, MCP_REG_CPU_EVENT_MASK, + 0xffffffff); + + return 0; +} + +#define QED_PSWHST_ATTENTION_INCORRECT_ACCESS (0x1) +#define ATTENTION_INCORRECT_ACCESS_WR_MASK (0x1) +#define ATTENTION_INCORRECT_ACCESS_WR_SHIFT (0) +#define ATTENTION_INCORRECT_ACCESS_CLIENT_MASK (0xf) +#define ATTENTION_INCORRECT_ACCESS_CLIENT_SHIFT (1) +#define ATTENTION_INCORRECT_ACCESS_VF_VALID_MASK (0x1) +#define ATTENTION_INCORRECT_ACCESS_VF_VALID_SHIFT (5) +#define ATTENTION_INCORRECT_ACCESS_VF_ID_MASK (0xff) +#define ATTENTION_INCORRECT_ACCESS_VF_ID_SHIFT (6) +#define ATTENTION_INCORRECT_ACCESS_PF_ID_MASK (0xf) +#define ATTENTION_INCORRECT_ACCESS_PF_ID_SHIFT (14) +#define ATTENTION_INCORRECT_ACCESS_BYTE_EN_MASK (0xff) +#define ATTENTION_INCORRECT_ACCESS_BYTE_EN_SHIFT (18) +static int qed_pswhst_attn_cb(struct qed_hwfn *p_hwfn) +{ + u32 tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PSWHST_REG_INCORRECT_ACCESS_VALID); + + if (tmp & QED_PSWHST_ATTENTION_INCORRECT_ACCESS) { + u32 addr, data, length; + + addr = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PSWHST_REG_INCORRECT_ACCESS_ADDRESS); + data = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PSWHST_REG_INCORRECT_ACCESS_DATA); + length = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PSWHST_REG_INCORRECT_ACCESS_LENGTH); + + DP_INFO(p_hwfn->cdev, + "Incorrect access to %08x of length %08x - PF [%02x] VF [%04x] [valid %02x] client [%02x] write [%02x] Byte-Enable [%04x] [%08x]\n", + addr, length, + (u8) GET_FIELD(data, ATTENTION_INCORRECT_ACCESS_PF_ID), + (u8) GET_FIELD(data, ATTENTION_INCORRECT_ACCESS_VF_ID), + (u8) GET_FIELD(data, + ATTENTION_INCORRECT_ACCESS_VF_VALID), + (u8) GET_FIELD(data, + ATTENTION_INCORRECT_ACCESS_CLIENT), + (u8) GET_FIELD(data, ATTENTION_INCORRECT_ACCESS_WR), + (u8) GET_FIELD(data, + ATTENTION_INCORRECT_ACCESS_BYTE_EN), + data); + } + + return 0; +} + +#define QED_GRC_ATTENTION_VALID_BIT (1 << 0) +#define QED_GRC_ATTENTION_ADDRESS_MASK (0x7fffff) +#define QED_GRC_ATTENTION_ADDRESS_SHIFT (0) +#define QED_GRC_ATTENTION_RDWR_BIT (1 << 23) +#define QED_GRC_ATTENTION_MASTER_MASK (0xf) +#define QED_GRC_ATTENTION_MASTER_SHIFT (24) +#define QED_GRC_ATTENTION_PF_MASK (0xf) +#define QED_GRC_ATTENTION_PF_SHIFT (0) +#define QED_GRC_ATTENTION_VF_MASK (0xff) +#define QED_GRC_ATTENTION_VF_SHIFT (4) +#define QED_GRC_ATTENTION_PRIV_MASK (0x3) +#define QED_GRC_ATTENTION_PRIV_SHIFT (14) +#define QED_GRC_ATTENTION_PRIV_VF (0) +static const char *attn_master_to_str(u8 master) +{ + switch (master) { + case 1: return "PXP"; + case 2: return "MCP"; + case 3: return "MSDM"; + case 4: return "PSDM"; + case 5: return "YSDM"; + case 6: return "USDM"; + case 7: return "TSDM"; + case 8: return "XSDM"; + case 9: return "DBU"; + case 10: return "DMAE"; + default: + return "Unknown"; + } +} + +static int qed_grc_attn_cb(struct qed_hwfn *p_hwfn) +{ + u32 tmp, tmp2; + + /* We've already cleared the timeout interrupt register, so we learn + * of interrupts via the validity register + */ + tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + GRC_REG_TIMEOUT_ATTN_ACCESS_VALID); + if (!(tmp & QED_GRC_ATTENTION_VALID_BIT)) + goto out; + + /* Read the GRC timeout information */ + tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + GRC_REG_TIMEOUT_ATTN_ACCESS_DATA_0); + tmp2 = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + GRC_REG_TIMEOUT_ATTN_ACCESS_DATA_1); + + 
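/* DATA_0 (tmp) holds the offending access - address, master and
+ * read/write direction; DATA_1 (tmp2) identifies the initiator PF/VF
+ * and its privilege level. */
+ 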
DP_INFO(p_hwfn->cdev, + "GRC timeout [%08x:%08x] - %s Address [%08x] [Master %s] [PF: %02x %s %02x]\n", + tmp2, tmp, + (tmp & QED_GRC_ATTENTION_RDWR_BIT) ? "Write to" : "Read from", + GET_FIELD(tmp, QED_GRC_ATTENTION_ADDRESS) << 2, + attn_master_to_str(GET_FIELD(tmp, QED_GRC_ATTENTION_MASTER)), + GET_FIELD(tmp2, QED_GRC_ATTENTION_PF), + (GET_FIELD(tmp2, QED_GRC_ATTENTION_PRIV) == + QED_GRC_ATTENTION_PRIV_VF) ? "VF" : "(Ireelevant)", + GET_FIELD(tmp2, QED_GRC_ATTENTION_VF)); + +out: + /* Regardles of anything else, clean the validity bit */ + qed_wr(p_hwfn, p_hwfn->p_dpc_ptt, + GRC_REG_TIMEOUT_ATTN_ACCESS_VALID, 0); + return 0; +} + +#define PGLUE_ATTENTION_VALID (1 << 29) +#define PGLUE_ATTENTION_RD_VALID (1 << 26) +#define PGLUE_ATTENTION_DETAILS_PFID_MASK (0xf) +#define PGLUE_ATTENTION_DETAILS_PFID_SHIFT (20) +#define PGLUE_ATTENTION_DETAILS_VF_VALID_MASK (0x1) +#define PGLUE_ATTENTION_DETAILS_VF_VALID_SHIFT (19) +#define PGLUE_ATTENTION_DETAILS_VFID_MASK (0xff) +#define PGLUE_ATTENTION_DETAILS_VFID_SHIFT (24) +#define PGLUE_ATTENTION_DETAILS2_WAS_ERR_MASK (0x1) +#define PGLUE_ATTENTION_DETAILS2_WAS_ERR_SHIFT (21) +#define PGLUE_ATTENTION_DETAILS2_BME_MASK (0x1) +#define PGLUE_ATTENTION_DETAILS2_BME_SHIFT (22) +#define PGLUE_ATTENTION_DETAILS2_FID_EN_MASK (0x1) +#define PGLUE_ATTENTION_DETAILS2_FID_EN_SHIFT (23) +#define PGLUE_ATTENTION_ICPL_VALID (1 << 23) +#define PGLUE_ATTENTION_ZLR_VALID (1 << 25) +#define PGLUE_ATTENTION_ILT_VALID (1 << 23) +static int qed_pglub_rbc_attn_cb(struct qed_hwfn *p_hwfn) +{ + u32 tmp; + + tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_TX_ERR_WR_DETAILS2); + if (tmp & PGLUE_ATTENTION_VALID) { + u32 addr_lo, addr_hi, details; + + addr_lo = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_TX_ERR_WR_ADD_31_0); + addr_hi = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_TX_ERR_WR_ADD_63_32); + details = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_TX_ERR_WR_DETAILS); + + DP_INFO(p_hwfn, + "Illegal write by chip to [%08x:%08x] blocked.\n" + "Details: %08x [PFID %02x, VFID %02x, VF_VALID %02x]\n" + "Details2 %08x [Was_error %02x BME deassert %02x FID_enable deassert %02x]\n", + addr_hi, addr_lo, details, + (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_PFID), + (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_VFID), + GET_FIELD(details, + PGLUE_ATTENTION_DETAILS_VF_VALID) ? 1 : 0, + tmp, + GET_FIELD(tmp, + PGLUE_ATTENTION_DETAILS2_WAS_ERR) ? 1 : 0, + GET_FIELD(tmp, + PGLUE_ATTENTION_DETAILS2_BME) ? 1 : 0, + GET_FIELD(tmp, + PGLUE_ATTENTION_DETAILS2_FID_EN) ? 1 : 0); + } + + tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_TX_ERR_RD_DETAILS2); + if (tmp & PGLUE_ATTENTION_RD_VALID) { + u32 addr_lo, addr_hi, details; + + addr_lo = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_TX_ERR_RD_ADD_31_0); + addr_hi = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_TX_ERR_RD_ADD_63_32); + details = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_TX_ERR_RD_DETAILS); + + DP_INFO(p_hwfn, + "Illegal read by chip from [%08x:%08x] blocked.\n" + " Details: %08x [PFID %02x, VFID %02x, VF_VALID %02x]\n" + " Details2 %08x [Was_error %02x BME deassert %02x FID_enable deassert %02x]\n", + addr_hi, addr_lo, details, + (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_PFID), + (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_VFID), + GET_FIELD(details, + PGLUE_ATTENTION_DETAILS_VF_VALID) ? 1 : 0, + tmp, + GET_FIELD(tmp, PGLUE_ATTENTION_DETAILS2_WAS_ERR) ? 1 + : 0, + GET_FIELD(tmp, PGLUE_ATTENTION_DETAILS2_BME) ? 1 : 0, + GET_FIELD(tmp, PGLUE_ATTENTION_DETAILS2_FID_EN) ? 
1 + : 0); + } + + tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_TX_ERR_WR_DETAILS_ICPL); + if (tmp & PGLUE_ATTENTION_ICPL_VALID) + DP_INFO(p_hwfn, "ICPL error - %08x\n", tmp); + + tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_MASTER_ZLR_ERR_DETAILS); + if (tmp & PGLUE_ATTENTION_ZLR_VALID) { + u32 addr_hi, addr_lo; + + addr_lo = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_MASTER_ZLR_ERR_ADD_31_0); + addr_hi = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_MASTER_ZLR_ERR_ADD_63_32); + + DP_INFO(p_hwfn, "ZLR eror - %08x [Address %08x:%08x]\n", + tmp, addr_hi, addr_lo); + } + + tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_VF_ILT_ERR_DETAILS2); + if (tmp & PGLUE_ATTENTION_ILT_VALID) { + u32 addr_hi, addr_lo, details; + + addr_lo = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_VF_ILT_ERR_ADD_31_0); + addr_hi = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_VF_ILT_ERR_ADD_63_32); + details = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_VF_ILT_ERR_DETAILS); + + DP_INFO(p_hwfn, + "ILT error - Details %08x Details2 %08x [Address %08x:%08x]\n", + details, tmp, addr_hi, addr_lo); + } + + /* Clear the indications */ + qed_wr(p_hwfn, p_hwfn->p_dpc_ptt, + PGLUE_B_REG_LATCHED_ERRORS_CLR, (1 << 2)); + + return 0; +} + +#define QED_DORQ_ATTENTION_REASON_MASK (0xfffff) +#define QED_DORQ_ATTENTION_OPAQUE_MASK (0xffff) +#define QED_DORQ_ATTENTION_SIZE_MASK (0x7f) +#define QED_DORQ_ATTENTION_SIZE_SHIFT (16) +static int qed_dorq_attn_cb(struct qed_hwfn *p_hwfn) +{ + u32 reason; + + reason = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, DORQ_REG_DB_DROP_REASON) & + QED_DORQ_ATTENTION_REASON_MASK; + if (reason) { + u32 details = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + DORQ_REG_DB_DROP_DETAILS); + + DP_INFO(p_hwfn->cdev, + "DORQ db_drop: address 0x%08x Opaque FID 0x%04x Size [bytes] 0x%08x Reason: 0x%08x\n", + qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + DORQ_REG_DB_DROP_DETAILS_ADDRESS), + (u16)(details & QED_DORQ_ATTENTION_OPAQUE_MASK), + GET_FIELD(details, QED_DORQ_ATTENTION_SIZE) * 4, + reason); + } + + return -EINVAL; +} + +/* Instead of major changes to the data-structure, we have a some 'special' + * identifiers for sources that changed meaning between adapters. 
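+ * On BB adapters, the NWS/NWM attention bits are remapped to the
+ * per-port CNIG entries via qed_int_aeu_translate().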
+ */ +enum aeu_invert_reg_special_type { + AEU_INVERT_REG_SPECIAL_CNIG_0, + AEU_INVERT_REG_SPECIAL_CNIG_1, + AEU_INVERT_REG_SPECIAL_CNIG_2, + AEU_INVERT_REG_SPECIAL_CNIG_3, + AEU_INVERT_REG_SPECIAL_MAX, +}; + +static struct aeu_invert_reg_bit +aeu_descs_special[AEU_INVERT_REG_SPECIAL_MAX] = { + {"CNIG port 0", ATTENTION_SINGLE, NULL, BLOCK_CNIG}, + {"CNIG port 1", ATTENTION_SINGLE, NULL, BLOCK_CNIG}, + {"CNIG port 2", ATTENTION_SINGLE, NULL, BLOCK_CNIG}, + {"CNIG port 3", ATTENTION_SINGLE, NULL, BLOCK_CNIG}, +}; + +/* Notice aeu_invert_reg must be defined in the same order of bits as HW; */ +static struct aeu_invert_reg aeu_descs[NUM_ATTN_REGS] = { + { + { /* After Invert 1 */ + {"GPIO0 function%d", + (32 << ATTENTION_LENGTH_SHIFT), NULL, MAX_BLOCK_ID}, + } + }, + + { + { /* After Invert 2 */ + {"PGLUE config_space", ATTENTION_SINGLE, + NULL, MAX_BLOCK_ID}, + {"PGLUE misc_flr", ATTENTION_SINGLE, + NULL, MAX_BLOCK_ID}, + {"PGLUE B RBC", ATTENTION_PAR_INT, + qed_pglub_rbc_attn_cb, BLOCK_PGLUE_B}, + {"PGLUE misc_mctp", ATTENTION_SINGLE, + NULL, MAX_BLOCK_ID}, + {"Flash event", ATTENTION_SINGLE, NULL, MAX_BLOCK_ID}, + {"SMB event", ATTENTION_SINGLE, NULL, MAX_BLOCK_ID}, + {"Main Power", ATTENTION_SINGLE, NULL, MAX_BLOCK_ID}, + {"SW timers #%d", (8 << ATTENTION_LENGTH_SHIFT) | + (1 << ATTENTION_OFFSET_SHIFT), + NULL, MAX_BLOCK_ID}, + {"PCIE glue/PXP VPD %d", + (16 << ATTENTION_LENGTH_SHIFT), NULL, BLOCK_PGLCS}, + } + }, + + { + { /* After Invert 3 */ + {"General Attention %d", + (32 << ATTENTION_LENGTH_SHIFT), NULL, MAX_BLOCK_ID}, + } + }, + + { + { /* After Invert 4 */ + {"General Attention 32", ATTENTION_SINGLE, + NULL, MAX_BLOCK_ID}, + {"General Attention %d", + (2 << ATTENTION_LENGTH_SHIFT) | + (33 << ATTENTION_OFFSET_SHIFT), NULL, MAX_BLOCK_ID}, + {"General Attention 35", ATTENTION_SINGLE, + NULL, MAX_BLOCK_ID}, + {"NWS Parity", + ATTENTION_PAR | ATTENTION_BB_DIFFERENT | + ATTENTION_BB(AEU_INVERT_REG_SPECIAL_CNIG_0), + NULL, BLOCK_NWS}, + {"NWS Interrupt", + ATTENTION_SINGLE | ATTENTION_BB_DIFFERENT | + ATTENTION_BB(AEU_INVERT_REG_SPECIAL_CNIG_1), + NULL, BLOCK_NWS}, + {"NWM Parity", + ATTENTION_PAR | ATTENTION_BB_DIFFERENT | + ATTENTION_BB(AEU_INVERT_REG_SPECIAL_CNIG_2), + NULL, BLOCK_NWM}, + {"NWM Interrupt", + ATTENTION_SINGLE | ATTENTION_BB_DIFFERENT | + ATTENTION_BB(AEU_INVERT_REG_SPECIAL_CNIG_3), + NULL, BLOCK_NWM}, + {"MCP CPU", ATTENTION_SINGLE, + qed_mcp_attn_cb, MAX_BLOCK_ID}, + {"MCP Watchdog timer", ATTENTION_SINGLE, + NULL, MAX_BLOCK_ID}, + {"MCP M2P", ATTENTION_SINGLE, NULL, MAX_BLOCK_ID}, + {"AVS stop status ready", ATTENTION_SINGLE, + NULL, MAX_BLOCK_ID}, + {"MSTAT", ATTENTION_PAR_INT, NULL, MAX_BLOCK_ID}, + {"MSTAT per-path", ATTENTION_PAR_INT, + NULL, MAX_BLOCK_ID}, + {"Reserved %d", (6 << ATTENTION_LENGTH_SHIFT), + NULL, MAX_BLOCK_ID}, + {"NIG", ATTENTION_PAR_INT, NULL, BLOCK_NIG}, + {"BMB/OPTE/MCP", ATTENTION_PAR_INT, NULL, BLOCK_BMB}, + {"BTB", ATTENTION_PAR_INT, NULL, BLOCK_BTB}, + {"BRB", ATTENTION_PAR_INT, NULL, BLOCK_BRB}, + {"PRS", ATTENTION_PAR_INT, NULL, BLOCK_PRS}, + } + }, + + { + { /* After Invert 5 */ + {"SRC", ATTENTION_PAR_INT, NULL, BLOCK_SRC}, + {"PB Client1", ATTENTION_PAR_INT, NULL, BLOCK_PBF_PB1}, + {"PB Client2", ATTENTION_PAR_INT, NULL, BLOCK_PBF_PB2}, + {"RPB", ATTENTION_PAR_INT, NULL, BLOCK_RPB}, + {"PBF", ATTENTION_PAR_INT, NULL, BLOCK_PBF}, + {"QM", ATTENTION_PAR_INT, NULL, BLOCK_QM}, + {"TM", ATTENTION_PAR_INT, NULL, BLOCK_TM}, + {"MCM", ATTENTION_PAR_INT, NULL, BLOCK_MCM}, + {"MSDM", ATTENTION_PAR_INT, NULL, BLOCK_MSDM}, + {"MSEM", 
ATTENTION_PAR_INT, NULL, BLOCK_MSEM}, + {"PCM", ATTENTION_PAR_INT, NULL, BLOCK_PCM}, + {"PSDM", ATTENTION_PAR_INT, NULL, BLOCK_PSDM}, + {"PSEM", ATTENTION_PAR_INT, NULL, BLOCK_PSEM}, + {"TCM", ATTENTION_PAR_INT, NULL, BLOCK_TCM}, + {"TSDM", ATTENTION_PAR_INT, NULL, BLOCK_TSDM}, + {"TSEM", ATTENTION_PAR_INT, NULL, BLOCK_TSEM}, + } + }, + + { + { /* After Invert 6 */ + {"UCM", ATTENTION_PAR_INT, NULL, BLOCK_UCM}, + {"USDM", ATTENTION_PAR_INT, NULL, BLOCK_USDM}, + {"USEM", ATTENTION_PAR_INT, NULL, BLOCK_USEM}, + {"XCM", ATTENTION_PAR_INT, NULL, BLOCK_XCM}, + {"XSDM", ATTENTION_PAR_INT, NULL, BLOCK_XSDM}, + {"XSEM", ATTENTION_PAR_INT, NULL, BLOCK_XSEM}, + {"YCM", ATTENTION_PAR_INT, NULL, BLOCK_YCM}, + {"YSDM", ATTENTION_PAR_INT, NULL, BLOCK_YSDM}, + {"YSEM", ATTENTION_PAR_INT, NULL, BLOCK_YSEM}, + {"XYLD", ATTENTION_PAR_INT, NULL, BLOCK_XYLD}, + {"TMLD", ATTENTION_PAR_INT, NULL, BLOCK_TMLD}, + {"MYLD", ATTENTION_PAR_INT, NULL, BLOCK_MULD}, + {"YULD", ATTENTION_PAR_INT, NULL, BLOCK_YULD}, + {"DORQ", ATTENTION_PAR_INT, + qed_dorq_attn_cb, BLOCK_DORQ}, + {"DBG", ATTENTION_PAR_INT, NULL, BLOCK_DBG}, + {"IPC", ATTENTION_PAR_INT, NULL, BLOCK_IPC}, + } + }, + + { + { /* After Invert 7 */ + {"CCFC", ATTENTION_PAR_INT, NULL, BLOCK_CCFC}, + {"CDU", ATTENTION_PAR_INT, NULL, BLOCK_CDU}, + {"DMAE", ATTENTION_PAR_INT, NULL, BLOCK_DMAE}, + {"IGU", ATTENTION_PAR_INT, NULL, BLOCK_IGU}, + {"ATC", ATTENTION_PAR_INT, NULL, MAX_BLOCK_ID}, + {"CAU", ATTENTION_PAR_INT, NULL, BLOCK_CAU}, + {"PTU", ATTENTION_PAR_INT, NULL, BLOCK_PTU}, + {"PRM", ATTENTION_PAR_INT, NULL, BLOCK_PRM}, + {"TCFC", ATTENTION_PAR_INT, NULL, BLOCK_TCFC}, + {"RDIF", ATTENTION_PAR_INT, NULL, BLOCK_RDIF}, + {"TDIF", ATTENTION_PAR_INT, NULL, BLOCK_TDIF}, + {"RSS", ATTENTION_PAR_INT, NULL, BLOCK_RSS}, + {"MISC", ATTENTION_PAR_INT, NULL, BLOCK_MISC}, + {"MISCS", ATTENTION_PAR_INT, NULL, BLOCK_MISCS}, + {"PCIE", ATTENTION_PAR, NULL, BLOCK_PCIE}, + {"Vaux PCI core", ATTENTION_SINGLE, NULL, BLOCK_PGLCS}, + {"PSWRQ", ATTENTION_PAR_INT, NULL, BLOCK_PSWRQ}, + } + }, + + { + { /* After Invert 8 */ + {"PSWRQ (pci_clk)", ATTENTION_PAR_INT, + NULL, BLOCK_PSWRQ2}, + {"PSWWR", ATTENTION_PAR_INT, NULL, BLOCK_PSWWR}, + {"PSWWR (pci_clk)", ATTENTION_PAR_INT, + NULL, BLOCK_PSWWR2}, + {"PSWRD", ATTENTION_PAR_INT, NULL, BLOCK_PSWRD}, + {"PSWRD (pci_clk)", ATTENTION_PAR_INT, + NULL, BLOCK_PSWRD2}, + {"PSWHST", ATTENTION_PAR_INT, + qed_pswhst_attn_cb, BLOCK_PSWHST}, + {"PSWHST (pci_clk)", ATTENTION_PAR_INT, + NULL, BLOCK_PSWHST2}, + {"GRC", ATTENTION_PAR_INT, + qed_grc_attn_cb, BLOCK_GRC}, + {"CPMU", ATTENTION_PAR_INT, NULL, BLOCK_CPMU}, + {"NCSI", ATTENTION_PAR_INT, NULL, BLOCK_NCSI}, + {"MSEM PRAM", ATTENTION_PAR, NULL, MAX_BLOCK_ID}, + {"PSEM PRAM", ATTENTION_PAR, NULL, MAX_BLOCK_ID}, + {"TSEM PRAM", ATTENTION_PAR, NULL, MAX_BLOCK_ID}, + {"USEM PRAM", ATTENTION_PAR, NULL, MAX_BLOCK_ID}, + {"XSEM PRAM", ATTENTION_PAR, NULL, MAX_BLOCK_ID}, + {"YSEM PRAM", ATTENTION_PAR, NULL, MAX_BLOCK_ID}, + {"pxp_misc_mps", ATTENTION_PAR, NULL, BLOCK_PGLCS}, + {"PCIE glue/PXP Exp. 
ROM", ATTENTION_SINGLE, + NULL, BLOCK_PGLCS}, + {"PERST_B assertion", ATTENTION_SINGLE, + NULL, MAX_BLOCK_ID}, + {"PERST_B deassertion", ATTENTION_SINGLE, + NULL, MAX_BLOCK_ID}, + {"Reserved %d", (2 << ATTENTION_LENGTH_SHIFT), + NULL, MAX_BLOCK_ID}, + } + }, + + { + { /* After Invert 9 */ + {"MCP Latched memory", ATTENTION_PAR, + NULL, MAX_BLOCK_ID}, + {"MCP Latched scratchpad cache", ATTENTION_SINGLE, + NULL, MAX_BLOCK_ID}, + {"MCP Latched ump_tx", ATTENTION_PAR, + NULL, MAX_BLOCK_ID}, + {"MCP Latched scratchpad", ATTENTION_PAR, + NULL, MAX_BLOCK_ID}, + {"Reserved %d", (28 << ATTENTION_LENGTH_SHIFT), + NULL, MAX_BLOCK_ID}, + } + }, +}; + +static struct aeu_invert_reg_bit * +qed_int_aeu_translate(struct qed_hwfn *p_hwfn, + struct aeu_invert_reg_bit *p_bit) +{ + if (!QED_IS_BB(p_hwfn->cdev)) + return p_bit; + + if (!(p_bit->flags & ATTENTION_BB_DIFFERENT)) + return p_bit; + + return &aeu_descs_special[(p_bit->flags & ATTENTION_BB_MASK) >> + ATTENTION_BB_SHIFT]; +} + +static bool qed_int_is_parity_flag(struct qed_hwfn *p_hwfn, + struct aeu_invert_reg_bit *p_bit) +{ + return !!(qed_int_aeu_translate(p_hwfn, p_bit)->flags & + ATTENTION_PARITY); +} + +#define ATTN_STATE_BITS (0xfff) +#define ATTN_BITS_MASKABLE (0x3ff) +struct qed_sb_attn_info { + /* Virtual & Physical address of the SB */ + struct atten_status_block *sb_attn; + dma_addr_t sb_phys; + + /* Last seen running index */ + u16 index; + + /* A mask of the AEU bits resulting in a parity error */ + u32 parity_mask[NUM_ATTN_REGS]; + + /* A pointer to the attention description structure */ + struct aeu_invert_reg *p_aeu_desc; + + /* Previously asserted attentions, which are still unasserted */ + u16 known_attn; + + /* Cleanup address for the link's general hw attention */ + u32 mfw_attn_addr; +}; + +static inline u16 qed_attn_update_idx(struct qed_hwfn *p_hwfn, + struct qed_sb_attn_info *p_sb_desc) +{ + u16 rc = 0, index; + + /* Make certain HW write took affect */ + mmiowb(); + + index = le16_to_cpu(p_sb_desc->sb_attn->sb_index); + if (p_sb_desc->index != index) { + p_sb_desc->index = index; + rc = QED_SB_ATT_IDX; + } + + /* Make certain we got a consistent view with HW */ + mmiowb(); + + return rc; +} + +/** + * @brief qed_int_assertion - handles asserted attention bits + * + * @param p_hwfn + * @param asserted_bits newly asserted bits + * @return int + */ +static int qed_int_assertion(struct qed_hwfn *p_hwfn, u16 asserted_bits) +{ + struct qed_sb_attn_info *sb_attn_sw = p_hwfn->p_sb_attn; + u32 igu_mask; + + /* Mask the source of the attention in the IGU */ + igu_mask = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, IGU_REG_ATTENTION_ENABLE); + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, "IGU mask: 0x%08x --> 0x%08x\n", + igu_mask, igu_mask & ~(asserted_bits & ATTN_BITS_MASKABLE)); + igu_mask &= ~(asserted_bits & ATTN_BITS_MASKABLE); + qed_wr(p_hwfn, p_hwfn->p_dpc_ptt, IGU_REG_ATTENTION_ENABLE, igu_mask); + + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, + "inner known ATTN state: 0x%04x --> 0x%04x\n", + sb_attn_sw->known_attn, + sb_attn_sw->known_attn | asserted_bits); + sb_attn_sw->known_attn |= asserted_bits; + + /* Handle MCP events */ + if (asserted_bits & 0x100) { + qed_mcp_handle_events(p_hwfn, p_hwfn->p_dpc_ptt); + /* Clean the MCP attention */ + qed_wr(p_hwfn, p_hwfn->p_dpc_ptt, + sb_attn_sw->mfw_attn_addr, 0); + } + + DIRECT_REG_WR((u8 __iomem *)p_hwfn->regview + + GTT_BAR0_MAP_REG_IGU_CMD + + ((IGU_CMD_ATTN_BIT_SET_UPPER - + IGU_CMD_INT_ACK_BASE) << 3), + (u32)asserted_bits); + + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, "set cmd IGU: 0x%04x\n", + asserted_bits); + 
+ return 0; +} + +static void qed_int_attn_print(struct qed_hwfn *p_hwfn, + enum block_id id, + enum dbg_attn_type type, bool b_clear) +{ + struct dbg_attn_block_result attn_results; + enum dbg_status status; + + memset(&attn_results, 0, sizeof(attn_results)); + + status = qed_dbg_read_attn(p_hwfn, p_hwfn->p_dpc_ptt, id, type, + b_clear, &attn_results); + if (status != DBG_STATUS_OK) + DP_NOTICE(p_hwfn, + "Failed to parse attention information [status: %s]\n", + qed_dbg_get_status_str(status)); + else + qed_dbg_parse_attn(p_hwfn, &attn_results); +} + +/** + * @brief qed_int_deassertion_aeu_bit - handles the effects of a single + * cause of the attention + * + * @param p_hwfn + * @param p_aeu - descriptor of an AEU bit which caused the attention + * @param aeu_en_reg - register offset of the AEU enable reg. which configured + * this bit to this group. + * @param bit_index - index of this bit in the aeu_en_reg + * + * @return int + */ +static int +qed_int_deassertion_aeu_bit(struct qed_hwfn *p_hwfn, + struct aeu_invert_reg_bit *p_aeu, + u32 aeu_en_reg, + const char *p_bit_name, u32 bitmask) +{ + bool b_fatal = false; + int rc = -EINVAL; + u32 val; + + DP_INFO(p_hwfn, "Deasserted attention `%s'[%08x]\n", + p_bit_name, bitmask); + + /* Call callback before clearing the interrupt status */ + if (p_aeu->cb) { + DP_INFO(p_hwfn, "`%s (attention)': Calling Callback function\n", + p_bit_name); + rc = p_aeu->cb(p_hwfn); + } + + if (rc) + b_fatal = true; + + /* Print HW block interrupt registers */ + if (p_aeu->block_index != MAX_BLOCK_ID) + qed_int_attn_print(p_hwfn, p_aeu->block_index, + ATTN_TYPE_INTERRUPT, !b_fatal); + + + /* If the attention is benign, no need to prevent it */ + if (!rc) + goto out; + + /* Prevent this Attention from being asserted in the future */ + val = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, aeu_en_reg); + qed_wr(p_hwfn, p_hwfn->p_dpc_ptt, aeu_en_reg, (val & ~bitmask)); + DP_INFO(p_hwfn, "`%s' - Disabled future attentions\n", + p_bit_name); + +out: + return rc; +} + +/** + * @brief qed_int_deassertion_parity - handle a single parity AEU source + * + * @param p_hwfn + * @param p_aeu - descriptor of an AEU bit which caused the parity + * @param aeu_en_reg - address of the AEU enable register + * @param bit_index + */ +static void qed_int_deassertion_parity(struct qed_hwfn *p_hwfn, + struct aeu_invert_reg_bit *p_aeu, + u32 aeu_en_reg, u8 bit_index) +{ + u32 block_id = p_aeu->block_index, mask, val; + + DP_NOTICE(p_hwfn->cdev, + "%s parity attention is set [address 0x%08x, bit %d]\n", + p_aeu->bit_name, aeu_en_reg, bit_index); + + if (block_id != MAX_BLOCK_ID) { + qed_int_attn_print(p_hwfn, block_id, ATTN_TYPE_PARITY, false); + + /* In BB, there's a single parity bit for several blocks */ + if (block_id == BLOCK_BTB) { + qed_int_attn_print(p_hwfn, BLOCK_OPTE, + ATTN_TYPE_PARITY, false); + qed_int_attn_print(p_hwfn, BLOCK_MCP, + ATTN_TYPE_PARITY, false); + } + } + + /* Prevent this parity error from being re-asserted */ + mask = ~BIT(bit_index); + val = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, aeu_en_reg); + qed_wr(p_hwfn, p_hwfn->p_dpc_ptt, aeu_en_reg, val & mask); + DP_INFO(p_hwfn, "`%s' - Disabled future parity errors\n", + p_aeu->bit_name); +} + +/** + * @brief - handles deassertion of previously asserted attentions. 
+ * + * @param p_hwfn + * @param deasserted_bits - newly deasserted bits + * @return int + * + */ +static int qed_int_deassertion(struct qed_hwfn *p_hwfn, + u16 deasserted_bits) +{ + struct qed_sb_attn_info *sb_attn_sw = p_hwfn->p_sb_attn; + u32 aeu_inv_arr[NUM_ATTN_REGS], aeu_mask, aeu_en, en; + u8 i, j, k, bit_idx; + int rc = 0; + + /* Read the attention registers in the AEU */ + for (i = 0; i < NUM_ATTN_REGS; i++) { + aeu_inv_arr[i] = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, + MISC_REG_AEU_AFTER_INVERT_1_IGU + + i * 0x4); + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, + "Deasserted bits [%d]: %08x\n", + i, aeu_inv_arr[i]); + } + + /* Find parity attentions first */ + for (i = 0; i < NUM_ATTN_REGS; i++) { + struct aeu_invert_reg *p_aeu = &sb_attn_sw->p_aeu_desc[i]; + u32 parities; + + aeu_en = MISC_REG_AEU_ENABLE1_IGU_OUT_0 + i * sizeof(u32); + en = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, aeu_en); + + /* Skip register in which no parity bit is currently set */ + parities = sb_attn_sw->parity_mask[i] & aeu_inv_arr[i] & en; + if (!parities) + continue; + + for (j = 0, bit_idx = 0; bit_idx < 32; j++) { + struct aeu_invert_reg_bit *p_bit = &p_aeu->bits[j]; + + if (qed_int_is_parity_flag(p_hwfn, p_bit) && + !!(parities & BIT(bit_idx))) + qed_int_deassertion_parity(p_hwfn, p_bit, + aeu_en, bit_idx); + + bit_idx += ATTENTION_LENGTH(p_bit->flags); + } + } + + /* Find non-parity cause for attention and act */ + for (k = 0; k < MAX_ATTN_GRPS; k++) { + struct aeu_invert_reg_bit *p_aeu; + + /* Handle only groups whose attention is currently deasserted */ + if (!(deasserted_bits & (1 << k))) + continue; + + for (i = 0; i < NUM_ATTN_REGS; i++) { + u32 bits; + + aeu_en = MISC_REG_AEU_ENABLE1_IGU_OUT_0 + + i * sizeof(u32) + + k * sizeof(u32) * NUM_ATTN_REGS; + + en = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, aeu_en); + bits = aeu_inv_arr[i] & en; + + /* Skip if no bit from this group is currently set */ + if (!bits) + continue; + + /* Find all set bits from current register which belong + * to current group, making them responsible for the + * previous assertion. + */ + for (j = 0, bit_idx = 0; bit_idx < 32; j++) { + long unsigned int bitmask; + u8 bit, bit_len; + + p_aeu = &sb_attn_sw->p_aeu_desc[i].bits[j]; + p_aeu = qed_int_aeu_translate(p_hwfn, p_aeu); + + bit = bit_idx; + bit_len = ATTENTION_LENGTH(p_aeu->flags); + if (qed_int_is_parity_flag(p_hwfn, p_aeu)) { + /* Skip Parity */ + bit++; + bit_len--; + } + + bitmask = bits & (((1 << bit_len) - 1) << bit); + bitmask >>= bit; + + if (bitmask) { + u32 flags = p_aeu->flags; + char bit_name[30]; + u8 num; + + num = (u8)find_first_bit(&bitmask, + bit_len); + + /* Some bits represent more than a + * a single interrupt. Correctly print + * their name. + */ + if (ATTENTION_LENGTH(flags) > 2 || + ((flags & ATTENTION_PAR_INT) && + ATTENTION_LENGTH(flags) > 1)) + snprintf(bit_name, 30, + p_aeu->bit_name, num); + else + strncpy(bit_name, + p_aeu->bit_name, 30); + + /* We now need to pass bitmask in its + * correct position. 
+ */ + bitmask <<= bit; + + /* Handle source of the attention */ + qed_int_deassertion_aeu_bit(p_hwfn, + p_aeu, + aeu_en, + bit_name, + bitmask); + } + + bit_idx += ATTENTION_LENGTH(p_aeu->flags); + } + } + } + + /* Clear IGU indication for the deasserted bits */ + DIRECT_REG_WR((u8 __iomem *)p_hwfn->regview + + GTT_BAR0_MAP_REG_IGU_CMD + + ((IGU_CMD_ATTN_BIT_CLR_UPPER - + IGU_CMD_INT_ACK_BASE) << 3), + ~((u32)deasserted_bits)); + + /* Unmask deasserted attentions in IGU */ + aeu_mask = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, IGU_REG_ATTENTION_ENABLE); + aeu_mask |= (deasserted_bits & ATTN_BITS_MASKABLE); + qed_wr(p_hwfn, p_hwfn->p_dpc_ptt, IGU_REG_ATTENTION_ENABLE, aeu_mask); + + /* Clear deassertion from inner state */ + sb_attn_sw->known_attn &= ~deasserted_bits; + + return rc; +} + +static int qed_int_attentions(struct qed_hwfn *p_hwfn) +{ + struct qed_sb_attn_info *p_sb_attn_sw = p_hwfn->p_sb_attn; + struct atten_status_block *p_sb_attn = p_sb_attn_sw->sb_attn; + u32 attn_bits = 0, attn_acks = 0; + u16 asserted_bits, deasserted_bits; + __le16 index; + int rc = 0; + + /* Read current attention bits/acks - safeguard against attentions + * by guaranting work on a synchronized timeframe + */ + do { + index = p_sb_attn->sb_index; + attn_bits = le32_to_cpu(p_sb_attn->atten_bits); + attn_acks = le32_to_cpu(p_sb_attn->atten_ack); + } while (index != p_sb_attn->sb_index); + p_sb_attn->sb_index = index; + + /* Attention / Deassertion are meaningful (and in correct state) + * only when they differ and consistent with known state - deassertion + * when previous attention & current ack, and assertion when current + * attention with no previous attention + */ + asserted_bits = (attn_bits & ~attn_acks & ATTN_STATE_BITS) & + ~p_sb_attn_sw->known_attn; + deasserted_bits = (~attn_bits & attn_acks & ATTN_STATE_BITS) & + p_sb_attn_sw->known_attn; + + if ((asserted_bits & ~0x100) || (deasserted_bits & ~0x100)) { + DP_INFO(p_hwfn, + "Attention: Index: 0x%04x, Bits: 0x%08x, Acks: 0x%08x, asserted: 0x%04x, De-asserted 0x%04x [Prev. known: 0x%04x]\n", + index, attn_bits, attn_acks, asserted_bits, + deasserted_bits, p_sb_attn_sw->known_attn); + } else if (asserted_bits == 0x100) { + DP_INFO(p_hwfn, "MFW indication via attention\n"); + } else { + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, + "MFW indication [deassertion]\n"); + } + + if (asserted_bits) { + rc = qed_int_assertion(p_hwfn, asserted_bits); + if (rc) + return rc; + } + + if (deasserted_bits) + rc = qed_int_deassertion(p_hwfn, deasserted_bits); + + return rc; +} + +static void qed_sb_ack_attn(struct qed_hwfn *p_hwfn, + void __iomem *igu_addr, u32 ack_cons) +{ + struct igu_prod_cons_update igu_ack = { 0 }; + + igu_ack.sb_id_and_flags = + ((ack_cons << IGU_PROD_CONS_UPDATE_SB_INDEX_SHIFT) | + (1 << IGU_PROD_CONS_UPDATE_UPDATE_FLAG_SHIFT) | + (IGU_INT_NOP << IGU_PROD_CONS_UPDATE_ENABLE_INT_SHIFT) | + (IGU_SEG_ACCESS_ATTN << + IGU_PROD_CONS_UPDATE_SEGMENT_ACCESS_SHIFT)); + + DIRECT_REG_WR(igu_addr, igu_ack.sb_id_and_flags); + + /* Both segments (interrupts & acks) are written to same place address; + * Need to guarantee all commands will be received (in-order) by HW. 
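A small worked example of the assertion/deassertion bookkeeping in qed_int_attentions() above, with register values assumed purely for illustration (attn_bits = 0x104, attn_acks = 0x004, known_attn = 0x004):

    asserted_bits   = (0x104 & ~0x004 & ATTN_STATE_BITS) & ~0x004 = 0x100
    deasserted_bits = (~0x104 & 0x004 & ATTN_STATE_BITS) &  0x004 = 0x000

Bit 8 (0x100, the MFW indication) is newly asserted, while bit 2 is both asserted and already acked, so it is neither newly asserted nor deasserted; with these values the function logs "MFW indication via attention" and calls qed_int_assertion() only.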
+ */ + mmiowb(); + barrier(); +} + +void qed_int_sp_dpc(unsigned long hwfn_cookie) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)hwfn_cookie; + struct qed_pi_info *pi_info = NULL; + struct qed_sb_attn_info *sb_attn; + struct qed_sb_info *sb_info; + int arr_size; + u16 rc = 0; + + if (!p_hwfn->p_sp_sb) { + DP_ERR(p_hwfn->cdev, "DPC called - no p_sp_sb\n"); + return; + } + + sb_info = &p_hwfn->p_sp_sb->sb_info; + arr_size = ARRAY_SIZE(p_hwfn->p_sp_sb->pi_info_arr); + if (!sb_info) { + DP_ERR(p_hwfn->cdev, + "Status block is NULL - cannot ack interrupts\n"); + return; + } + + if (!p_hwfn->p_sb_attn) { + DP_ERR(p_hwfn->cdev, "DPC called - no p_sb_attn"); + return; + } + sb_attn = p_hwfn->p_sb_attn; + + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, "DPC Called! (hwfn %p %d)\n", + p_hwfn, p_hwfn->my_id); + + /* Disable ack for def status block. Required both for msix + + * inta in non-mask mode, in inta does no harm. + */ + qed_sb_ack(sb_info, IGU_INT_DISABLE, 0); + + /* Gather Interrupts/Attentions information */ + if (!sb_info->sb_virt) { + DP_ERR(p_hwfn->cdev, + "Interrupt Status block is NULL - cannot check for new interrupts!\n"); + } else { + u32 tmp_index = sb_info->sb_ack; + + rc = qed_sb_update_sb_idx(sb_info); + DP_VERBOSE(p_hwfn->cdev, NETIF_MSG_INTR, + "Interrupt indices: 0x%08x --> 0x%08x\n", + tmp_index, sb_info->sb_ack); + } + + if (!sb_attn || !sb_attn->sb_attn) { + DP_ERR(p_hwfn->cdev, + "Attentions Status block is NULL - cannot check for new attentions!\n"); + } else { + u16 tmp_index = sb_attn->index; + + rc |= qed_attn_update_idx(p_hwfn, sb_attn); + DP_VERBOSE(p_hwfn->cdev, NETIF_MSG_INTR, + "Attention indices: 0x%08x --> 0x%08x\n", + tmp_index, sb_attn->index); + } + + /* Check if we expect interrupts at this time. if not just ack them */ + if (!(rc & QED_SB_EVENT_MASK)) { + qed_sb_ack(sb_info, IGU_INT_ENABLE, 1); + return; + } + + /* Check the validity of the DPC ptt. If not ack interrupts and fail */ + if (!p_hwfn->p_dpc_ptt) { + DP_NOTICE(p_hwfn->cdev, "Failed to allocate PTT\n"); + qed_sb_ack(sb_info, IGU_INT_ENABLE, 1); + return; + } + + if (rc & QED_SB_ATT_IDX) + qed_int_attentions(p_hwfn); + + if (rc & QED_SB_IDX) { + int pi; + + /* Look for a free index */ + for (pi = 0; pi < arr_size; pi++) { + pi_info = &p_hwfn->p_sp_sb->pi_info_arr[pi]; + if (pi_info->comp_cb) + pi_info->comp_cb(p_hwfn, pi_info->cookie); + } + } + + if (sb_attn && (rc & QED_SB_ATT_IDX)) + /* This should be done before the interrupts are enabled, + * since otherwise a new attention will be generated. 
+ */ + qed_sb_ack_attn(p_hwfn, sb_info->igu_addr, sb_attn->index); + + qed_sb_ack(sb_info, IGU_INT_ENABLE, 1); +} + +static void qed_int_sb_attn_free(struct qed_hwfn *p_hwfn) +{ + struct qed_sb_attn_info *p_sb = p_hwfn->p_sb_attn; + + if (!p_sb) + return; + + if (p_sb->sb_attn) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + SB_ATTN_ALIGNED_SIZE(p_hwfn), + p_sb->sb_attn, p_sb->sb_phys); + kfree(p_sb); + p_hwfn->p_sb_attn = NULL; +} + +static void qed_int_sb_attn_setup(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct qed_sb_attn_info *sb_info = p_hwfn->p_sb_attn; + + memset(sb_info->sb_attn, 0, sizeof(*sb_info->sb_attn)); + + sb_info->index = 0; + sb_info->known_attn = 0; + + /* Configure Attention Status Block in IGU */ + qed_wr(p_hwfn, p_ptt, IGU_REG_ATTN_MSG_ADDR_L, + lower_32_bits(p_hwfn->p_sb_attn->sb_phys)); + qed_wr(p_hwfn, p_ptt, IGU_REG_ATTN_MSG_ADDR_H, + upper_32_bits(p_hwfn->p_sb_attn->sb_phys)); +} + +static void qed_int_sb_attn_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + void *sb_virt_addr, dma_addr_t sb_phy_addr) +{ + struct qed_sb_attn_info *sb_info = p_hwfn->p_sb_attn; + int i, j, k; + + sb_info->sb_attn = sb_virt_addr; + sb_info->sb_phys = sb_phy_addr; + + /* Set the pointer to the AEU descriptors */ + sb_info->p_aeu_desc = aeu_descs; + + /* Calculate Parity Masks */ + memset(sb_info->parity_mask, 0, sizeof(u32) * NUM_ATTN_REGS); + for (i = 0; i < NUM_ATTN_REGS; i++) { + /* j is array index, k is bit index */ + for (j = 0, k = 0; k < 32; j++) { + struct aeu_invert_reg_bit *p_aeu; + + p_aeu = &aeu_descs[i].bits[j]; + if (qed_int_is_parity_flag(p_hwfn, p_aeu)) + sb_info->parity_mask[i] |= 1 << k; + + k += ATTENTION_LENGTH(p_aeu->flags); + } + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, + "Attn Mask [Reg %d]: 0x%08x\n", + i, sb_info->parity_mask[i]); + } + + /* Set the address of cleanup for the mcp attention */ + sb_info->mfw_attn_addr = (p_hwfn->rel_pf_id << 3) + + MISC_REG_AEU_GENERAL_ATTN_0; + + qed_int_sb_attn_setup(p_hwfn, p_ptt); +} + +static int qed_int_sb_attn_alloc(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct qed_dev *cdev = p_hwfn->cdev; + struct qed_sb_attn_info *p_sb; + dma_addr_t p_phys = 0; + void *p_virt; + + /* SB struct */ + p_sb = kmalloc(sizeof(*p_sb), GFP_KERNEL); + if (!p_sb) + return -ENOMEM; + + /* SB ring */ + p_virt = dma_alloc_coherent(&cdev->pdev->dev, + SB_ATTN_ALIGNED_SIZE(p_hwfn), + &p_phys, GFP_KERNEL); + + if (!p_virt) { + kfree(p_sb); + return -ENOMEM; + } + + /* Attention setup */ + p_hwfn->p_sb_attn = p_sb; + qed_int_sb_attn_init(p_hwfn, p_ptt, p_virt, p_phys); + + return 0; +} + +/* coalescing timeout = timeset << (timer_res + 1) */ +#define QED_CAU_DEF_RX_USECS 24 +#define QED_CAU_DEF_TX_USECS 48 + +void qed_init_cau_sb_entry(struct qed_hwfn *p_hwfn, + struct cau_sb_entry *p_sb_entry, + u8 pf_id, u16 vf_number, u8 vf_valid) +{ + struct qed_dev *cdev = p_hwfn->cdev; + u32 cau_state; + u8 timer_res; + + memset(p_sb_entry, 0, sizeof(*p_sb_entry)); + + SET_FIELD(p_sb_entry->params, CAU_SB_ENTRY_PF_NUMBER, pf_id); + SET_FIELD(p_sb_entry->params, CAU_SB_ENTRY_VF_NUMBER, vf_number); + SET_FIELD(p_sb_entry->params, CAU_SB_ENTRY_VF_VALID, vf_valid); + SET_FIELD(p_sb_entry->params, CAU_SB_ENTRY_SB_TIMESET0, 0x7F); + SET_FIELD(p_sb_entry->params, CAU_SB_ENTRY_SB_TIMESET1, 0x7F); + + cau_state = CAU_HC_DISABLE_STATE; + + if (cdev->int_coalescing_mode == QED_COAL_MODE_ENABLE) { + cau_state = CAU_HC_ENABLE_STATE; + if (!cdev->rx_coalesce_usecs) + cdev->rx_coalesce_usecs = QED_CAU_DEF_RX_USECS; + if 
(!cdev->tx_coalesce_usecs) + cdev->tx_coalesce_usecs = QED_CAU_DEF_TX_USECS; + } + + /* Coalesce = (timeset << timer-res), timeset is 7bit wide */ + if (cdev->rx_coalesce_usecs <= 0x7F) + timer_res = 0; + else if (cdev->rx_coalesce_usecs <= 0xFF) + timer_res = 1; + else + timer_res = 2; + SET_FIELD(p_sb_entry->params, CAU_SB_ENTRY_TIMER_RES0, timer_res); + + if (cdev->tx_coalesce_usecs <= 0x7F) + timer_res = 0; + else if (cdev->tx_coalesce_usecs <= 0xFF) + timer_res = 1; + else + timer_res = 2; + SET_FIELD(p_sb_entry->params, CAU_SB_ENTRY_TIMER_RES1, timer_res); + + SET_FIELD(p_sb_entry->data, CAU_SB_ENTRY_STATE0, cau_state); + SET_FIELD(p_sb_entry->data, CAU_SB_ENTRY_STATE1, cau_state); +} + +static void qed_int_cau_conf_pi(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 igu_sb_id, + u32 pi_index, + enum qed_coalescing_fsm coalescing_fsm, + u8 timeset) +{ + struct cau_pi_entry pi_entry; + u32 sb_offset, pi_offset; + + if (IS_VF(p_hwfn->cdev)) + return; + + sb_offset = igu_sb_id * PIS_PER_SB_E4; + memset(&pi_entry, 0, sizeof(struct cau_pi_entry)); + + SET_FIELD(pi_entry.prod, CAU_PI_ENTRY_PI_TIMESET, timeset); + if (coalescing_fsm == QED_COAL_RX_STATE_MACHINE) + SET_FIELD(pi_entry.prod, CAU_PI_ENTRY_FSM_SEL, 0); + else + SET_FIELD(pi_entry.prod, CAU_PI_ENTRY_FSM_SEL, 1); + + pi_offset = sb_offset + pi_index; + if (p_hwfn->hw_init_done) { + qed_wr(p_hwfn, p_ptt, + CAU_REG_PI_MEMORY + pi_offset * sizeof(u32), + *((u32 *)&(pi_entry))); + } else { + STORE_RT_REG(p_hwfn, + CAU_REG_PI_MEMORY_RT_OFFSET + pi_offset, + *((u32 *)&(pi_entry))); + } +} + +void qed_int_cau_conf_sb(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + dma_addr_t sb_phys, + u16 igu_sb_id, u16 vf_number, u8 vf_valid) +{ + struct cau_sb_entry sb_entry; + + qed_init_cau_sb_entry(p_hwfn, &sb_entry, p_hwfn->rel_pf_id, + vf_number, vf_valid); + + if (p_hwfn->hw_init_done) { + /* Wide-bus, initialize via DMAE */ + u64 phys_addr = (u64)sb_phys; + + qed_dmae_host2grc(p_hwfn, p_ptt, (u64)(uintptr_t)&phys_addr, + CAU_REG_SB_ADDR_MEMORY + + igu_sb_id * sizeof(u64), 2, 0); + qed_dmae_host2grc(p_hwfn, p_ptt, (u64)(uintptr_t)&sb_entry, + CAU_REG_SB_VAR_MEMORY + + igu_sb_id * sizeof(u64), 2, 0); + } else { + /* Initialize Status Block Address */ + STORE_RT_REG_AGG(p_hwfn, + CAU_REG_SB_ADDR_MEMORY_RT_OFFSET + + igu_sb_id * 2, + sb_phys); + + STORE_RT_REG_AGG(p_hwfn, + CAU_REG_SB_VAR_MEMORY_RT_OFFSET + + igu_sb_id * 2, + sb_entry); + } + + /* Configure pi coalescing if set */ + if (p_hwfn->cdev->int_coalescing_mode == QED_COAL_MODE_ENABLE) { + u8 num_tc = p_hwfn->hw_info.num_hw_tc; + u8 timeset, timer_res; + u8 i; + + /* timeset = (coalesce >> timer-res), timeset is 7bit wide */ + if (p_hwfn->cdev->rx_coalesce_usecs <= 0x7F) + timer_res = 0; + else if (p_hwfn->cdev->rx_coalesce_usecs <= 0xFF) + timer_res = 1; + else + timer_res = 2; + timeset = (u8)(p_hwfn->cdev->rx_coalesce_usecs >> timer_res); + qed_int_cau_conf_pi(p_hwfn, p_ptt, igu_sb_id, RX_PI, + QED_COAL_RX_STATE_MACHINE, timeset); + + if (p_hwfn->cdev->tx_coalesce_usecs <= 0x7F) + timer_res = 0; + else if (p_hwfn->cdev->tx_coalesce_usecs <= 0xFF) + timer_res = 1; + else + timer_res = 2; + timeset = (u8)(p_hwfn->cdev->tx_coalesce_usecs >> timer_res); + for (i = 0; i < num_tc; i++) { + qed_int_cau_conf_pi(p_hwfn, p_ptt, + igu_sb_id, TX_PI(i), + QED_COAL_TX_STATE_MACHINE, + timeset); + } + } +} + +void qed_int_sb_setup(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, struct qed_sb_info *sb_info) +{ + /* zero status block and ack counter */ + sb_info->sb_ack = 0; + 
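The timer-resolution selection in qed_init_cau_sb_entry() and qed_int_cau_conf_sb() above packs the coalescing period into a 7-bit timeset plus a small timer_res exponent. A quick illustration, assuming rx_coalesce_usecs = 200:

    200 > 0x7f and 200 <= 0xff   ->  timer_res = 1
    timeset = 200 >> 1 = 100     (fits in the 7-bit field)

whereas the default QED_CAU_DEF_RX_USECS = 24 stays at timer_res = 0 with timeset = 24.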
memset(sb_info->sb_virt, 0, sizeof(*sb_info->sb_virt)); + + if (IS_PF(p_hwfn->cdev)) + qed_int_cau_conf_sb(p_hwfn, p_ptt, sb_info->sb_phys, + sb_info->igu_sb_id, 0, 0); +} + +struct qed_igu_block *qed_get_igu_free_sb(struct qed_hwfn *p_hwfn, bool b_is_pf) +{ + struct qed_igu_block *p_block; + u16 igu_id; + + for (igu_id = 0; igu_id < QED_MAPPING_MEMORY_SIZE(p_hwfn->cdev); + igu_id++) { + p_block = &p_hwfn->hw_info.p_igu_info->entry[igu_id]; + + if (!(p_block->status & QED_IGU_STATUS_VALID) || + !(p_block->status & QED_IGU_STATUS_FREE)) + continue; + + if (!!(p_block->status & QED_IGU_STATUS_PF) == b_is_pf) + return p_block; + } + + return NULL; +} + +static u16 qed_get_pf_igu_sb_id(struct qed_hwfn *p_hwfn, u16 vector_id) +{ + struct qed_igu_block *p_block; + u16 igu_id; + + for (igu_id = 0; igu_id < QED_MAPPING_MEMORY_SIZE(p_hwfn->cdev); + igu_id++) { + p_block = &p_hwfn->hw_info.p_igu_info->entry[igu_id]; + + if (!(p_block->status & QED_IGU_STATUS_VALID) || + !p_block->is_pf || + p_block->vector_number != vector_id) + continue; + + return igu_id; + } + + return QED_SB_INVALID_IDX; +} + +u16 qed_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id) +{ + u16 igu_sb_id; + + /* Assuming continuous set of IGU SBs dedicated for given PF */ + if (sb_id == QED_SP_SB_ID) + igu_sb_id = p_hwfn->hw_info.p_igu_info->igu_dsb_id; + else if (IS_PF(p_hwfn->cdev)) + igu_sb_id = qed_get_pf_igu_sb_id(p_hwfn, sb_id + 1); + else + igu_sb_id = qed_vf_get_igu_sb_id(p_hwfn, sb_id); + + if (sb_id == QED_SP_SB_ID) + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, + "Slowpath SB index in IGU is 0x%04x\n", igu_sb_id); + else + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, + "SB [%04x] <--> IGU SB [%04x]\n", sb_id, igu_sb_id); + + return igu_sb_id; +} + +int qed_int_sb_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_sb_info *sb_info, + void *sb_virt_addr, dma_addr_t sb_phy_addr, u16 sb_id) +{ + sb_info->sb_virt = sb_virt_addr; + sb_info->sb_phys = sb_phy_addr; + + sb_info->igu_sb_id = qed_get_igu_sb_id(p_hwfn, sb_id); + + if (sb_id != QED_SP_SB_ID) { + if (IS_PF(p_hwfn->cdev)) { + struct qed_igu_info *p_info; + struct qed_igu_block *p_block; + + p_info = p_hwfn->hw_info.p_igu_info; + p_block = &p_info->entry[sb_info->igu_sb_id]; + + p_block->sb_info = sb_info; + p_block->status &= ~QED_IGU_STATUS_FREE; + p_info->usage.free_cnt--; + } else { + qed_vf_set_sb_info(p_hwfn, sb_id, sb_info); + } + } + + sb_info->cdev = p_hwfn->cdev; + + /* The igu address will hold the absolute address that needs to be + * written to for a specific status block + */ + if (IS_PF(p_hwfn->cdev)) { + sb_info->igu_addr = (u8 __iomem *)p_hwfn->regview + + GTT_BAR0_MAP_REG_IGU_CMD + + (sb_info->igu_sb_id << 3); + } else { + sb_info->igu_addr = (u8 __iomem *)p_hwfn->regview + + PXP_VF_BAR0_START_IGU + + ((IGU_CMD_INT_ACK_BASE + + sb_info->igu_sb_id) << 3); + } + + sb_info->flags |= QED_SB_INFO_INIT; + + qed_int_sb_setup(p_hwfn, p_ptt, sb_info); + + return 0; +} + +int qed_int_sb_release(struct qed_hwfn *p_hwfn, + struct qed_sb_info *sb_info, u16 sb_id) +{ + struct qed_igu_block *p_block; + struct qed_igu_info *p_info; + + if (!sb_info) + return 0; + + /* zero status block and ack counter */ + sb_info->sb_ack = 0; + memset(sb_info->sb_virt, 0, sizeof(*sb_info->sb_virt)); + + if (IS_VF(p_hwfn->cdev)) { + qed_vf_set_sb_info(p_hwfn, sb_id, NULL); + return 0; + } + + p_info = p_hwfn->hw_info.p_igu_info; + p_block = &p_info->entry[sb_info->igu_sb_id]; + + /* Vector 0 is reserved to Default SB */ + if (!p_block->vector_number) { + DP_ERR(p_hwfn, "Do Not free sp 
sb using this function"); + return -EINVAL; + } + + /* Lose reference to client's SB info, and fix counters */ + p_block->sb_info = NULL; + p_block->status |= QED_IGU_STATUS_FREE; + p_info->usage.free_cnt++; + + return 0; +} + +static void qed_int_sp_sb_free(struct qed_hwfn *p_hwfn) +{ + struct qed_sb_sp_info *p_sb = p_hwfn->p_sp_sb; + + if (!p_sb) + return; + + if (p_sb->sb_info.sb_virt) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + SB_ALIGNED_SIZE(p_hwfn), + p_sb->sb_info.sb_virt, + p_sb->sb_info.sb_phys); + kfree(p_sb); + p_hwfn->p_sp_sb = NULL; +} + +static int qed_int_sp_sb_alloc(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_sb_sp_info *p_sb; + dma_addr_t p_phys = 0; + void *p_virt; + + /* SB struct */ + p_sb = kmalloc(sizeof(*p_sb), GFP_KERNEL); + if (!p_sb) + return -ENOMEM; + + /* SB ring */ + p_virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + SB_ALIGNED_SIZE(p_hwfn), + &p_phys, GFP_KERNEL); + if (!p_virt) { + kfree(p_sb); + return -ENOMEM; + } + + /* Status Block setup */ + p_hwfn->p_sp_sb = p_sb; + qed_int_sb_init(p_hwfn, p_ptt, &p_sb->sb_info, p_virt, + p_phys, QED_SP_SB_ID); + + memset(p_sb->pi_info_arr, 0, sizeof(p_sb->pi_info_arr)); + + return 0; +} + +int qed_int_register_cb(struct qed_hwfn *p_hwfn, + qed_int_comp_cb_t comp_cb, + void *cookie, u8 *sb_idx, __le16 **p_fw_cons) +{ + struct qed_sb_sp_info *p_sp_sb = p_hwfn->p_sp_sb; + int rc = -ENOMEM; + u8 pi; + + /* Look for a free index */ + for (pi = 0; pi < ARRAY_SIZE(p_sp_sb->pi_info_arr); pi++) { + if (p_sp_sb->pi_info_arr[pi].comp_cb) + continue; + + p_sp_sb->pi_info_arr[pi].comp_cb = comp_cb; + p_sp_sb->pi_info_arr[pi].cookie = cookie; + *sb_idx = pi; + *p_fw_cons = &p_sp_sb->sb_info.sb_virt->pi_array[pi]; + rc = 0; + break; + } + + return rc; +} + +int qed_int_unregister_cb(struct qed_hwfn *p_hwfn, u8 pi) +{ + struct qed_sb_sp_info *p_sp_sb = p_hwfn->p_sp_sb; + + if (p_sp_sb->pi_info_arr[pi].comp_cb == NULL) + return -ENOMEM; + + p_sp_sb->pi_info_arr[pi].comp_cb = NULL; + p_sp_sb->pi_info_arr[pi].cookie = NULL; + + return 0; +} + +u16 qed_int_get_sp_sb_id(struct qed_hwfn *p_hwfn) +{ + return p_hwfn->p_sp_sb->sb_info.igu_sb_id; +} + +void qed_int_igu_enable_int(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, enum qed_int_mode int_mode) +{ + u32 igu_pf_conf = IGU_PF_CONF_FUNC_EN | IGU_PF_CONF_ATTN_BIT_EN; + + p_hwfn->cdev->int_mode = int_mode; + switch (p_hwfn->cdev->int_mode) { + case QED_INT_MODE_INTA: + igu_pf_conf |= IGU_PF_CONF_INT_LINE_EN; + igu_pf_conf |= IGU_PF_CONF_SINGLE_ISR_EN; + break; + + case QED_INT_MODE_MSI: + igu_pf_conf |= IGU_PF_CONF_MSI_MSIX_EN; + igu_pf_conf |= IGU_PF_CONF_SINGLE_ISR_EN; + break; + + case QED_INT_MODE_MSIX: + igu_pf_conf |= IGU_PF_CONF_MSI_MSIX_EN; + break; + case QED_INT_MODE_POLL: + break; + } + + qed_wr(p_hwfn, p_ptt, IGU_REG_PF_CONFIGURATION, igu_pf_conf); +} + +static void qed_int_igu_enable_attn(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + + /* Configure AEU signal change to produce attentions */ + qed_wr(p_hwfn, p_ptt, IGU_REG_ATTENTION_ENABLE, 0); + qed_wr(p_hwfn, p_ptt, IGU_REG_LEADING_EDGE_LATCH, 0xfff); + qed_wr(p_hwfn, p_ptt, IGU_REG_TRAILING_EDGE_LATCH, 0xfff); + qed_wr(p_hwfn, p_ptt, IGU_REG_ATTENTION_ENABLE, 0xfff); + + /* Flush the writes to IGU */ + mmiowb(); + + /* Unmask AEU signals toward IGU */ + qed_wr(p_hwfn, p_ptt, MISC_REG_AEU_MASK_ATTN_IGU, 0xff); +} + +int +qed_int_igu_enable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, enum qed_int_mode int_mode) +{ + int rc = 0; + + qed_int_igu_enable_attn(p_hwfn, p_ptt); + + if 
((int_mode != QED_INT_MODE_INTA) || IS_LEAD_HWFN(p_hwfn)) { + rc = qed_slowpath_irq_req(p_hwfn); + if (rc) { + DP_NOTICE(p_hwfn, "Slowpath IRQ request failed\n"); + return -EINVAL; + } + p_hwfn->b_int_requested = true; + } + /* Enable interrupt Generation */ + qed_int_igu_enable_int(p_hwfn, p_ptt, int_mode); + p_hwfn->b_int_enabled = 1; + + return rc; +} + +void qed_int_igu_disable_int(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + p_hwfn->b_int_enabled = 0; + + if (IS_VF(p_hwfn->cdev)) + return; + + qed_wr(p_hwfn, p_ptt, IGU_REG_PF_CONFIGURATION, 0); +} + +#define IGU_CLEANUP_SLEEP_LENGTH (1000) +static void qed_int_igu_cleanup_sb(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 igu_sb_id, + bool cleanup_set, u16 opaque_fid) +{ + u32 cmd_ctrl = 0, val = 0, sb_bit = 0, sb_bit_addr = 0, data = 0; + u32 pxp_addr = IGU_CMD_INT_ACK_BASE + igu_sb_id; + u32 sleep_cnt = IGU_CLEANUP_SLEEP_LENGTH; + + /* Set the data field */ + SET_FIELD(data, IGU_CLEANUP_CLEANUP_SET, cleanup_set ? 1 : 0); + SET_FIELD(data, IGU_CLEANUP_CLEANUP_TYPE, 0); + SET_FIELD(data, IGU_CLEANUP_COMMAND_TYPE, IGU_COMMAND_TYPE_SET); + + /* Set the control register */ + SET_FIELD(cmd_ctrl, IGU_CTRL_REG_PXP_ADDR, pxp_addr); + SET_FIELD(cmd_ctrl, IGU_CTRL_REG_FID, opaque_fid); + SET_FIELD(cmd_ctrl, IGU_CTRL_REG_TYPE, IGU_CTRL_CMD_TYPE_WR); + + qed_wr(p_hwfn, p_ptt, IGU_REG_COMMAND_REG_32LSB_DATA, data); + + barrier(); + + qed_wr(p_hwfn, p_ptt, IGU_REG_COMMAND_REG_CTRL, cmd_ctrl); + + /* Flush the write to IGU */ + mmiowb(); + + /* calculate where to read the status bit from */ + sb_bit = 1 << (igu_sb_id % 32); + sb_bit_addr = igu_sb_id / 32 * sizeof(u32); + + sb_bit_addr += IGU_REG_CLEANUP_STATUS_0; + + /* Now wait for the command to complete */ + do { + val = qed_rd(p_hwfn, p_ptt, sb_bit_addr); + + if ((val & sb_bit) == (cleanup_set ? 
sb_bit : 0)) + break; + + usleep_range(5000, 10000); + } while (--sleep_cnt); + + if (!sleep_cnt) + DP_NOTICE(p_hwfn, + "Timeout waiting for clear status 0x%08x [for sb %d]\n", + val, igu_sb_id); +} + +void qed_int_igu_init_pure_rt_single(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 igu_sb_id, u16 opaque, bool b_set) +{ + struct qed_igu_block *p_block; + int pi, i; + + p_block = &p_hwfn->hw_info.p_igu_info->entry[igu_sb_id]; + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, + "Cleaning SB [%04x]: func_id= %d is_pf = %d vector_num = 0x%0x\n", + igu_sb_id, + p_block->function_id, + p_block->is_pf, p_block->vector_number); + + /* Set */ + if (b_set) + qed_int_igu_cleanup_sb(p_hwfn, p_ptt, igu_sb_id, 1, opaque); + + /* Clear */ + qed_int_igu_cleanup_sb(p_hwfn, p_ptt, igu_sb_id, 0, opaque); + + /* Wait for the IGU SB to cleanup */ + for (i = 0; i < IGU_CLEANUP_SLEEP_LENGTH; i++) { + u32 val; + + val = qed_rd(p_hwfn, p_ptt, + IGU_REG_WRITE_DONE_PENDING + + ((igu_sb_id / 32) * 4)); + if (val & BIT((igu_sb_id % 32))) + usleep_range(10, 20); + else + break; + } + if (i == IGU_CLEANUP_SLEEP_LENGTH) + DP_NOTICE(p_hwfn, + "Failed SB[0x%08x] still appearing in WRITE_DONE_PENDING\n", + igu_sb_id); + + /* Clear the CAU for the SB */ + for (pi = 0; pi < 12; pi++) + qed_wr(p_hwfn, p_ptt, + CAU_REG_PI_MEMORY + (igu_sb_id * 12 + pi) * 4, 0); +} + +void qed_int_igu_init_pure_rt(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + bool b_set, bool b_slowpath) +{ + struct qed_igu_info *p_info = p_hwfn->hw_info.p_igu_info; + struct qed_igu_block *p_block; + u16 igu_sb_id = 0; + u32 val = 0; + + val = qed_rd(p_hwfn, p_ptt, IGU_REG_BLOCK_CONFIGURATION); + val |= IGU_REG_BLOCK_CONFIGURATION_VF_CLEANUP_EN; + val &= ~IGU_REG_BLOCK_CONFIGURATION_PXP_TPH_INTERFACE_EN; + qed_wr(p_hwfn, p_ptt, IGU_REG_BLOCK_CONFIGURATION, val); + + for (igu_sb_id = 0; + igu_sb_id < QED_MAPPING_MEMORY_SIZE(p_hwfn->cdev); igu_sb_id++) { + p_block = &p_info->entry[igu_sb_id]; + + if (!(p_block->status & QED_IGU_STATUS_VALID) || + !p_block->is_pf || + (p_block->status & QED_IGU_STATUS_DSB)) + continue; + + qed_int_igu_init_pure_rt_single(p_hwfn, p_ptt, igu_sb_id, + p_hwfn->hw_info.opaque_fid, + b_set); + } + + if (b_slowpath) + qed_int_igu_init_pure_rt_single(p_hwfn, p_ptt, + p_info->igu_dsb_id, + p_hwfn->hw_info.opaque_fid, + b_set); +} + +int qed_int_igu_reset_cam(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_igu_info *p_info = p_hwfn->hw_info.p_igu_info; + struct qed_igu_block *p_block; + int pf_sbs, vf_sbs; + u16 igu_sb_id; + u32 val, rval; + + if (!RESC_NUM(p_hwfn, QED_SB)) { + p_info->b_allow_pf_vf_change = false; + } else { + /* Use the numbers the MFW have provided - + * don't forget MFW accounts for the default SB as well. + */ + p_info->b_allow_pf_vf_change = true; + + if (p_info->usage.cnt != RESC_NUM(p_hwfn, QED_SB) - 1) { + DP_INFO(p_hwfn, + "MFW notifies of 0x%04x PF SBs; IGU indicates of only 0x%04x\n", + RESC_NUM(p_hwfn, QED_SB) - 1, + p_info->usage.cnt); + p_info->usage.cnt = RESC_NUM(p_hwfn, QED_SB) - 1; + } + + if (IS_PF_SRIOV(p_hwfn)) { + u16 vfs = p_hwfn->cdev->p_iov_info->total_vfs; + + if (vfs != p_info->usage.iov_cnt) + DP_VERBOSE(p_hwfn, + NETIF_MSG_INTR, + "0x%04x VF SBs in IGU CAM != PCI configuration 0x%04x\n", + p_info->usage.iov_cnt, vfs); + + /* At this point we know how many SBs we have totally + * in IGU + number of PF SBs. So we can validate that + * we'd have sufficient for VF. 
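A numeric illustration of the VF SB sufficiency check that follows, with counts assumed purely for illustration: if free_cnt + free_cnt_iov = 24 while the PF keeps cnt = 16 SBs for itself, at most 24 - 16 = 8 SBs remain for VFs, so a configuration with total_vfs = 16 would hit the "Not enough SBs for VFs" path below and fail with -EINVAL.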
+ */ + if (vfs > p_info->usage.free_cnt + + p_info->usage.free_cnt_iov - p_info->usage.cnt) { + DP_NOTICE(p_hwfn, + "Not enough SBs for VFs - 0x%04x SBs, from which %04x PFs and %04x are required\n", + p_info->usage.free_cnt + + p_info->usage.free_cnt_iov, + p_info->usage.cnt, vfs); + return -EINVAL; + } + + /* Currently cap the number of VFs SBs by the + * number of VFs. + */ + p_info->usage.iov_cnt = vfs; + } + } + + /* Mark all SBs as free, now in the right PF/VFs division */ + p_info->usage.free_cnt = p_info->usage.cnt; + p_info->usage.free_cnt_iov = p_info->usage.iov_cnt; + p_info->usage.orig = p_info->usage.cnt; + p_info->usage.iov_orig = p_info->usage.iov_cnt; + + /* We now proceed to re-configure the IGU cam to reflect the initial + * configuration. We can start with the Default SB. + */ + pf_sbs = p_info->usage.cnt; + vf_sbs = p_info->usage.iov_cnt; + + for (igu_sb_id = p_info->igu_dsb_id; + igu_sb_id < QED_MAPPING_MEMORY_SIZE(p_hwfn->cdev); igu_sb_id++) { + p_block = &p_info->entry[igu_sb_id]; + val = 0; + + if (!(p_block->status & QED_IGU_STATUS_VALID)) + continue; + + if (p_block->status & QED_IGU_STATUS_DSB) { + p_block->function_id = p_hwfn->rel_pf_id; + p_block->is_pf = 1; + p_block->vector_number = 0; + p_block->status = QED_IGU_STATUS_VALID | + QED_IGU_STATUS_PF | + QED_IGU_STATUS_DSB; + } else if (pf_sbs) { + pf_sbs--; + p_block->function_id = p_hwfn->rel_pf_id; + p_block->is_pf = 1; + p_block->vector_number = p_info->usage.cnt - pf_sbs; + p_block->status = QED_IGU_STATUS_VALID | + QED_IGU_STATUS_PF | + QED_IGU_STATUS_FREE; + } else if (vf_sbs) { + p_block->function_id = + p_hwfn->cdev->p_iov_info->first_vf_in_pf + + p_info->usage.iov_cnt - vf_sbs; + p_block->is_pf = 0; + p_block->vector_number = 0; + p_block->status = QED_IGU_STATUS_VALID | + QED_IGU_STATUS_FREE; + vf_sbs--; + } else { + p_block->function_id = 0; + p_block->is_pf = 0; + p_block->vector_number = 0; + } + + SET_FIELD(val, IGU_MAPPING_LINE_FUNCTION_NUMBER, + p_block->function_id); + SET_FIELD(val, IGU_MAPPING_LINE_PF_VALID, p_block->is_pf); + SET_FIELD(val, IGU_MAPPING_LINE_VECTOR_NUMBER, + p_block->vector_number); + + /* VF entries would be enabled when VF is initializaed */ + SET_FIELD(val, IGU_MAPPING_LINE_VALID, p_block->is_pf); + + rval = qed_rd(p_hwfn, p_ptt, + IGU_REG_MAPPING_MEMORY + sizeof(u32) * igu_sb_id); + + if (rval != val) { + qed_wr(p_hwfn, p_ptt, + IGU_REG_MAPPING_MEMORY + + sizeof(u32) * igu_sb_id, val); + + DP_VERBOSE(p_hwfn, + NETIF_MSG_INTR, + "IGU reset: [SB 0x%04x] func_id = %d is_pf = %d vector_num = 0x%x [%08x -> %08x]\n", + igu_sb_id, + p_block->function_id, + p_block->is_pf, + p_block->vector_number, rval, val); + } + } + + return 0; +} + +static void qed_int_igu_read_cam_block(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 igu_sb_id) +{ + u32 val = qed_rd(p_hwfn, p_ptt, + IGU_REG_MAPPING_MEMORY + sizeof(u32) * igu_sb_id); + struct qed_igu_block *p_block; + + p_block = &p_hwfn->hw_info.p_igu_info->entry[igu_sb_id]; + + /* Fill the block information */ + p_block->function_id = GET_FIELD(val, IGU_MAPPING_LINE_FUNCTION_NUMBER); + p_block->is_pf = GET_FIELD(val, IGU_MAPPING_LINE_PF_VALID); + p_block->vector_number = GET_FIELD(val, IGU_MAPPING_LINE_VECTOR_NUMBER); + p_block->igu_sb_id = igu_sb_id; +} + +int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_igu_info *p_igu_info; + struct qed_igu_block *p_block; + u32 min_vf = 0, max_vf = 0; + u16 igu_sb_id; + + p_hwfn->hw_info.p_igu_info = kzalloc(sizeof(*p_igu_info), GFP_KERNEL); + if 
(!p_hwfn->hw_info.p_igu_info) + return -ENOMEM; + + p_igu_info = p_hwfn->hw_info.p_igu_info; + + /* Distinguish between existent and non-existent default SB */ + p_igu_info->igu_dsb_id = QED_SB_INVALID_IDX; + + /* Find the range of VF ids whose SB belong to this PF */ + if (p_hwfn->cdev->p_iov_info) { + struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info; + + min_vf = p_iov->first_vf_in_pf; + max_vf = p_iov->first_vf_in_pf + p_iov->total_vfs; + } + + for (igu_sb_id = 0; + igu_sb_id < QED_MAPPING_MEMORY_SIZE(p_hwfn->cdev); igu_sb_id++) { + /* Read current entry; Notice it might not belong to this PF */ + qed_int_igu_read_cam_block(p_hwfn, p_ptt, igu_sb_id); + p_block = &p_igu_info->entry[igu_sb_id]; + + if ((p_block->is_pf) && + (p_block->function_id == p_hwfn->rel_pf_id)) { + p_block->status = QED_IGU_STATUS_PF | + QED_IGU_STATUS_VALID | + QED_IGU_STATUS_FREE; + + if (p_igu_info->igu_dsb_id != QED_SB_INVALID_IDX) + p_igu_info->usage.cnt++; + } else if (!(p_block->is_pf) && + (p_block->function_id >= min_vf) && + (p_block->function_id < max_vf)) { + /* Available for VFs of this PF */ + p_block->status = QED_IGU_STATUS_VALID | + QED_IGU_STATUS_FREE; + + if (p_igu_info->igu_dsb_id != QED_SB_INVALID_IDX) + p_igu_info->usage.iov_cnt++; + } + + /* Mark the First entry belonging to the PF or its VFs + * as the default SB [we'll reset IGU prior to first usage]. + */ + if ((p_block->status & QED_IGU_STATUS_VALID) && + (p_igu_info->igu_dsb_id == QED_SB_INVALID_IDX)) { + p_igu_info->igu_dsb_id = igu_sb_id; + p_block->status |= QED_IGU_STATUS_DSB; + } + + /* limit number of prints by having each PF print only its + * entries with the exception of PF0 which would print + * everything. + */ + if ((p_block->status & QED_IGU_STATUS_VALID) || + (p_hwfn->abs_pf_id == 0)) { + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, + "IGU_BLOCK: [SB 0x%04x] func_id = %d is_pf = %d vector_num = 0x%x\n", + igu_sb_id, p_block->function_id, + p_block->is_pf, p_block->vector_number); + } + } + + if (p_igu_info->igu_dsb_id == QED_SB_INVALID_IDX) { + DP_NOTICE(p_hwfn, + "IGU CAM returned invalid values igu_dsb_id=0x%x\n", + p_igu_info->igu_dsb_id); + return -EINVAL; + } + + /* All non default SB are considered free at this point */ + p_igu_info->usage.free_cnt = p_igu_info->usage.cnt; + p_igu_info->usage.free_cnt_iov = p_igu_info->usage.iov_cnt; + + DP_VERBOSE(p_hwfn, NETIF_MSG_INTR, + "igu_dsb_id=0x%x, num Free SBs - PF: %04x VF: %04x [might change after resource allocation]\n", + p_igu_info->igu_dsb_id, + p_igu_info->usage.cnt, p_igu_info->usage.iov_cnt); + + return 0; +} + +/** + * @brief Initialize igu runtime registers + * + * @param p_hwfn + */ +void qed_int_igu_init_rt(struct qed_hwfn *p_hwfn) +{ + u32 igu_pf_conf = IGU_PF_CONF_FUNC_EN; + + STORE_RT_REG(p_hwfn, IGU_REG_PF_CONFIGURATION_RT_OFFSET, igu_pf_conf); +} + +u64 qed_int_igu_read_sisr_reg(struct qed_hwfn *p_hwfn) +{ + u32 lsb_igu_cmd_addr = IGU_REG_SISR_MDPC_WMASK_LSB_UPPER - + IGU_CMD_INT_ACK_BASE; + u32 msb_igu_cmd_addr = IGU_REG_SISR_MDPC_WMASK_MSB_UPPER - + IGU_CMD_INT_ACK_BASE; + u32 intr_status_hi = 0, intr_status_lo = 0; + u64 intr_status = 0; + + intr_status_lo = REG_RD(p_hwfn, + GTT_BAR0_MAP_REG_IGU_CMD + + lsb_igu_cmd_addr * 8); + intr_status_hi = REG_RD(p_hwfn, + GTT_BAR0_MAP_REG_IGU_CMD + + msb_igu_cmd_addr * 8); + intr_status = ((u64)intr_status_hi << 32) + (u64)intr_status_lo; + + return intr_status; +} + +static void qed_int_sp_dpc_setup(struct qed_hwfn *p_hwfn) +{ + tasklet_init(p_hwfn->sp_dpc, + qed_int_sp_dpc, (unsigned long)p_hwfn); + 
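A concrete illustration of how qed_int_igu_read_cam() above fills the counters, with a CAM layout assumed purely for illustration: if this PF owns CAM lines 16..31, line 16 is the first valid entry and becomes the default SB (igu_dsb_id = 16, QED_IGU_STATUS_DSB), and only lines 17..31 are counted, giving usage.cnt = 15. This is also why qed_int_igu_reset_cam() compares usage.cnt against RESC_NUM(p_hwfn, QED_SB) - 1, since the MFW count includes the default SB.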
p_hwfn->b_sp_dpc_enabled = true; +} + +static int qed_int_sp_dpc_alloc(struct qed_hwfn *p_hwfn) +{ + p_hwfn->sp_dpc = kmalloc(sizeof(*p_hwfn->sp_dpc), GFP_KERNEL); + if (!p_hwfn->sp_dpc) + return -ENOMEM; + + return 0; +} + +static void qed_int_sp_dpc_free(struct qed_hwfn *p_hwfn) +{ + kfree(p_hwfn->sp_dpc); + p_hwfn->sp_dpc = NULL; +} + +int qed_int_alloc(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + int rc = 0; + + rc = qed_int_sp_dpc_alloc(p_hwfn); + if (rc) + return rc; + + rc = qed_int_sp_sb_alloc(p_hwfn, p_ptt); + if (rc) + return rc; + + rc = qed_int_sb_attn_alloc(p_hwfn, p_ptt); + + return rc; +} + +void qed_int_free(struct qed_hwfn *p_hwfn) +{ + qed_int_sp_sb_free(p_hwfn); + qed_int_sb_attn_free(p_hwfn); + qed_int_sp_dpc_free(p_hwfn); +} + +void qed_int_setup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + qed_int_sb_setup(p_hwfn, p_ptt, &p_hwfn->p_sp_sb->sb_info); + qed_int_sb_attn_setup(p_hwfn, p_ptt); + qed_int_sp_dpc_setup(p_hwfn); +} + +void qed_int_get_num_sbs(struct qed_hwfn *p_hwfn, + struct qed_sb_cnt_info *p_sb_cnt_info) +{ + struct qed_igu_info *info = p_hwfn->hw_info.p_igu_info; + + if (!info || !p_sb_cnt_info) + return; + + memcpy(p_sb_cnt_info, &info->usage, sizeof(*p_sb_cnt_info)); +} + +void qed_int_disable_post_isr_release(struct qed_dev *cdev) +{ + int i; + + for_each_hwfn(cdev, i) + cdev->hwfns[i].b_int_requested = false; +} + +int qed_int_set_timer_res(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u8 timer_res, u16 sb_id, bool tx) +{ + struct cau_sb_entry sb_entry; + int rc; + + if (!p_hwfn->hw_init_done) { + DP_ERR(p_hwfn, "hardware not initialized yet\n"); + return -EINVAL; + } + + rc = qed_dmae_grc2host(p_hwfn, p_ptt, CAU_REG_SB_VAR_MEMORY + + sb_id * sizeof(u64), + (u64)(uintptr_t)&sb_entry, 2, 0); + if (rc) { + DP_ERR(p_hwfn, "dmae_grc2host failed %d\n", rc); + return rc; + } + + if (tx) + SET_FIELD(sb_entry.params, CAU_SB_ENTRY_TIMER_RES1, timer_res); + else + SET_FIELD(sb_entry.params, CAU_SB_ENTRY_TIMER_RES0, timer_res); + + rc = qed_dmae_host2grc(p_hwfn, p_ptt, + (u64)(uintptr_t)&sb_entry, + CAU_REG_SB_VAR_MEMORY + + sb_id * sizeof(u64), 2, 0); + if (rc) { + DP_ERR(p_hwfn, "dmae_host2grc failed %d\n", rc); + return rc; + } + + return rc; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.h b/drivers/net/ethernet/qlogic/qed/qed_int.h new file mode 100644 index 0000000..54b4ee0 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_int.h @@ -0,0 +1,424 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_INT_H +#define _QED_INT_H + +#include +#include +#include "qed.h" + +/* Fields of IGU PF CONFIGRATION REGISTER */ +#define IGU_PF_CONF_FUNC_EN (0x1 << 0) /* function enable */ +#define IGU_PF_CONF_MSI_MSIX_EN (0x1 << 1) /* MSI/MSIX enable */ +#define IGU_PF_CONF_INT_LINE_EN (0x1 << 2) /* INT enable */ +#define IGU_PF_CONF_ATTN_BIT_EN (0x1 << 3) /* attention enable */ +#define IGU_PF_CONF_SINGLE_ISR_EN (0x1 << 4) /* single ISR mode enable */ +#define IGU_PF_CONF_SIMD_MODE (0x1 << 5) /* simd all ones mode */ +/* Fields of IGU VF CONFIGRATION REGISTER */ +#define IGU_VF_CONF_FUNC_EN (0x1 << 0) /* function enable */ +#define IGU_VF_CONF_MSI_MSIX_EN (0x1 << 1) /* MSI/MSIX enable */ +#define IGU_VF_CONF_SINGLE_ISR_EN (0x1 << 4) /* single ISR mode enable */ +#define IGU_VF_CONF_PARENT_MASK (0xF) /* Parent PF */ +#define IGU_VF_CONF_PARENT_SHIFT 5 /* Parent PF */ + +/* Igu control commands + */ +enum igu_ctrl_cmd { + IGU_CTRL_CMD_TYPE_RD, + IGU_CTRL_CMD_TYPE_WR, + MAX_IGU_CTRL_CMD +}; + +/* Control register for the IGU command register + */ +struct igu_ctrl_reg { + u32 ctrl_data; +#define IGU_CTRL_REG_FID_MASK 0xFFFF /* Opaque_FID */ +#define IGU_CTRL_REG_FID_SHIFT 0 +#define IGU_CTRL_REG_PXP_ADDR_MASK 0xFFF /* Command address */ +#define IGU_CTRL_REG_PXP_ADDR_SHIFT 16 +#define IGU_CTRL_REG_RESERVED_MASK 0x1 +#define IGU_CTRL_REG_RESERVED_SHIFT 28 +#define IGU_CTRL_REG_TYPE_MASK 0x1 /* use enum igu_ctrl_cmd */ +#define IGU_CTRL_REG_TYPE_SHIFT 31 +}; + +enum qed_coalescing_fsm { + QED_COAL_RX_STATE_MACHINE, + QED_COAL_TX_STATE_MACHINE +}; + +/** + * @brief qed_int_igu_enable_int - enable device interrupts + * + * @param p_hwfn + * @param p_ptt + * @param int_mode - interrupt mode to use + */ +void qed_int_igu_enable_int(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_int_mode int_mode); + +/** + * @brief qed_int_igu_disable_int - disable device interrupts + * + * @param p_hwfn + * @param p_ptt + */ +void qed_int_igu_disable_int(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief qed_int_igu_read_sisr_reg - Reads the single isr multiple dpc + * register from igu. + * + * @param p_hwfn + * + * @return u64 + */ +u64 qed_int_igu_read_sisr_reg(struct qed_hwfn *p_hwfn); + +#define QED_SP_SB_ID 0xffff +/** + * @brief qed_int_sb_init - Initializes the sb_info structure. + * + * once the structure is initialized it can be passed to sb related functions. + * + * @param p_hwfn + * @param p_ptt + * @param sb_info points to an uninitialized (but + * allocated) sb_info structure + * @param sb_virt_addr + * @param sb_phy_addr + * @param sb_id the sb_id to be used (zero based in driver) + * should use QED_SP_SB_ID for SP Status block + * + * @return int + */ +int qed_int_sb_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_sb_info *sb_info, + void *sb_virt_addr, + dma_addr_t sb_phy_addr, + u16 sb_id); +/** + * @brief qed_int_sb_setup - Setup the sb. 
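An illustrative caller of qed_int_sb_init() documented above, sketched after qed_int_sp_sb_alloc() in qed_int.c earlier in this patch; p_hwfn, p_ptt and a struct qed_sb_info named sb_info are assumed to be in scope:

    void *virt;
    dma_addr_t phys;

    virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
                              SB_ALIGNED_SIZE(p_hwfn), &phys, GFP_KERNEL);
    if (virt)
            qed_int_sb_init(p_hwfn, p_ptt, &sb_info, virt, phys, QED_SP_SB_ID);

Non-slowpath status blocks pass a zero-based sb_id instead of QED_SP_SB_ID.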
+ * + * @param p_hwfn + * @param p_ptt + * @param sb_info initialized sb_info structure + */ +void qed_int_sb_setup(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_sb_info *sb_info); + +/** + * @brief qed_int_sb_release - releases the sb_info structure. + * + * once the structure is released, it's memory can be freed + * + * @param p_hwfn + * @param sb_info points to an allocated sb_info structure + * @param sb_id the sb_id to be used (zero based in driver) + * should never be equal to QED_SP_SB_ID + * (SP Status block) + * + * @return int + */ +int qed_int_sb_release(struct qed_hwfn *p_hwfn, + struct qed_sb_info *sb_info, + u16 sb_id); + +/** + * @brief qed_int_sp_dpc - To be called when an interrupt is received on the + * default status block. + * + * @param p_hwfn - pointer to hwfn + * + */ +void qed_int_sp_dpc(unsigned long hwfn_cookie); + +/** + * @brief qed_int_get_num_sbs - get the number of status + * blocks configured for this funciton in the igu. + * + * @param p_hwfn + * @param p_sb_cnt_info + * + * @return int - number of status blocks configured + */ +void qed_int_get_num_sbs(struct qed_hwfn *p_hwfn, + struct qed_sb_cnt_info *p_sb_cnt_info); + +/** + * @brief qed_int_disable_post_isr_release - performs the cleanup post ISR + * release. The API need to be called after releasing all slowpath IRQs + * of the device. + * + * @param cdev + * + */ +void qed_int_disable_post_isr_release(struct qed_dev *cdev); + +#define QED_CAU_DEF_RX_TIMER_RES 0 +#define QED_CAU_DEF_TX_TIMER_RES 0 + +#define QED_SB_ATT_IDX 0x0001 +#define QED_SB_EVENT_MASK 0x0003 + +#define SB_ALIGNED_SIZE(p_hwfn) \ + ALIGNED_TYPE_SIZE(struct status_block_e4, p_hwfn) + +#define QED_SB_INVALID_IDX 0xffff + +struct qed_igu_block { + u8 status; +#define QED_IGU_STATUS_FREE 0x01 +#define QED_IGU_STATUS_VALID 0x02 +#define QED_IGU_STATUS_PF 0x04 +#define QED_IGU_STATUS_DSB 0x08 + + u8 vector_number; + u8 function_id; + u8 is_pf; + + /* Index inside IGU [meant for back reference] */ + u16 igu_sb_id; + + struct qed_sb_info *sb_info; +}; + +struct qed_igu_info { + struct qed_igu_block entry[MAX_TOT_SB_PER_PATH]; + u16 igu_dsb_id; + + struct qed_sb_cnt_info usage; + + bool b_allow_pf_vf_change; +}; + +/** + * @brief - Make sure the IGU CAM reflects the resources provided by MFW + * + * @param p_hwfn + * @param p_ptt + */ +int qed_int_igu_reset_cam(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +/** + * @brief Translate the weakly-defined client sb-id into an IGU sb-id + * + * @param p_hwfn + * @param sb_id - user provided sb_id + * + * @return an index inside IGU CAM where the SB resides + */ +u16 qed_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id); + +/** + * @brief return a pointer to an unused valid SB + * + * @param p_hwfn + * @param b_is_pf - true iff we want a SB belonging to a PF + * + * @return point to an igu_block, NULL if none is available + */ +struct qed_igu_block *qed_get_igu_free_sb(struct qed_hwfn *p_hwfn, + bool b_is_pf); + +void qed_int_igu_init_pure_rt(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + bool b_set, + bool b_slowpath); + +void qed_int_igu_init_rt(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_int_igu_read_cam - Reads the IGU CAM. + * This function needs to be called during hardware + * prepare. It reads the info from igu cam to know which + * status block is the default / base status block etc. 
+ * + * @param p_hwfn + * @param p_ptt + * + * @return int + */ +int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +typedef int (*qed_int_comp_cb_t)(struct qed_hwfn *p_hwfn, + void *cookie); +/** + * @brief qed_int_register_cb - Register callback func for + * slowhwfn statusblock. + * + * Every protocol that uses the slowhwfn status block + * should register a callback function that will be called + * once there is an update of the sp status block. + * + * @param p_hwfn + * @param comp_cb - function to be called when there is an + * interrupt on the sp sb + * + * @param cookie - passed to the callback function + * @param sb_idx - OUT parameter which gives the chosen index + * for this protocol. + * @param p_fw_cons - pointer to the actual address of the + * consumer for this protocol. + * + * @return int + */ +int qed_int_register_cb(struct qed_hwfn *p_hwfn, + qed_int_comp_cb_t comp_cb, + void *cookie, + u8 *sb_idx, + __le16 **p_fw_cons); + +/** + * @brief qed_int_unregister_cb - Unregisters callback + * function from sp sb. + * Partner of qed_int_register_cb -> should be called + * when no longer required. + * + * @param p_hwfn + * @param pi + * + * @return int + */ +int qed_int_unregister_cb(struct qed_hwfn *p_hwfn, + u8 pi); + +/** + * @brief qed_int_get_sp_sb_id - Get the slowhwfn sb id. + * + * @param p_hwfn + * + * @return u16 + */ +u16 qed_int_get_sp_sb_id(struct qed_hwfn *p_hwfn); + +/** + * @brief Status block cleanup. Should be called for each status + * block that will be used -> both PF / VF + * + * @param p_hwfn + * @param p_ptt + * @param igu_sb_id - igu status block id + * @param opaque - opaque fid of the sb owner. + * @param b_set - set(1) / clear(0) + */ +void qed_int_igu_init_pure_rt_single(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 igu_sb_id, + u16 opaque, + bool b_set); + +/** + * @brief qed_int_cau_conf - configure cau for a given status + * block + * + * @param p_hwfn + * @param ptt + * @param sb_phys + * @param igu_sb_id + * @param vf_number + * @param vf_valid + */ +void qed_int_cau_conf_sb(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + dma_addr_t sb_phys, + u16 igu_sb_id, + u16 vf_number, + u8 vf_valid); + +/** + * @brief qed_int_alloc + * + * @param p_hwfn + * @param p_ptt + * + * @return int + */ +int qed_int_alloc(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief qed_int_free + * + * @param p_hwfn + */ +void qed_int_free(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_int_setup + * + * @param p_hwfn + * @param p_ptt + */ +void qed_int_setup(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief - Enable Interrupt & Attention for hw function + * + * @param p_hwfn + * @param p_ptt + * @param int_mode + * + * @return int + */ +int qed_int_igu_enable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + enum qed_int_mode int_mode); + +/** + * @brief - Initialize CAU status block entry + * + * @param p_hwfn + * @param p_sb_entry + * @param pf_id + * @param vf_number + * @param vf_valid + */ +void qed_init_cau_sb_entry(struct qed_hwfn *p_hwfn, + struct cau_sb_entry *p_sb_entry, + u8 pf_id, + u16 vf_number, + u8 vf_valid); + +int qed_int_set_timer_res(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u8 timer_res, u16 sb_id, bool tx); + +#define QED_MAPPING_MEMORY_SIZE(dev) (NUM_OF_SBS(dev)) + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c new file mode 100644 index 0000000..c0d4a54 --- /dev/null +++ 
b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c @@ -0,0 +1,1460 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_cxt.h" +#include "qed_dev_api.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_int.h" +#include "qed_iscsi.h" +#include "qed_ll2.h" +#include "qed_mcp.h" +#include "qed_sp.h" +#include "qed_sriov.h" +#include "qed_reg_addr.h" + +struct qed_iscsi_conn { + struct list_head list_entry; + bool free_on_delete; + + u16 conn_id; + u32 icid; + u32 fw_cid; + + u8 layer_code; + u8 offl_flags; + u8 connect_mode; + u32 initial_ack; + dma_addr_t sq_pbl_addr; + struct qed_chain r2tq; + struct qed_chain xhq; + struct qed_chain uhq; + + struct tcp_upload_params *tcp_upload_params_virt_addr; + dma_addr_t tcp_upload_params_phys_addr; + struct scsi_terminate_extra_params *queue_cnts_virt_addr; + dma_addr_t queue_cnts_phys_addr; + dma_addr_t syn_phy_addr; + + u16 syn_ip_payload_length; + u8 local_mac[6]; + u8 remote_mac[6]; + u16 vlan_id; + u16 tcp_flags; + u8 ip_version; + u32 remote_ip[4]; + u32 local_ip[4]; + u8 ka_max_probe_cnt; + u8 dup_ack_theshold; + u32 rcv_next; + u32 snd_una; + u32 snd_next; + u32 snd_max; + u32 snd_wnd; + u32 rcv_wnd; + u32 snd_wl1; + u32 cwnd; + u32 ss_thresh; + u16 srtt; + u16 rtt_var; + u32 ts_recent; + u32 ts_recent_age; + u32 total_rt; + u32 ka_timeout_delta; + u32 rt_timeout_delta; + u8 dup_ack_cnt; + u8 snd_wnd_probe_cnt; + u8 ka_probe_cnt; + u8 rt_cnt; + u32 flow_label; + u32 ka_timeout; + u32 ka_interval; + u32 max_rt_time; + u32 initial_rcv_wnd; + u8 ttl; + u8 tos_or_tc; + u16 remote_port; + u16 local_port; + u16 mss; + u8 snd_wnd_scale; + u8 rcv_wnd_scale; + u16 da_timeout_value; + u8 ack_frequency; + + u8 update_flag; + u8 default_cq; + u32 max_seq_size; + u32 max_recv_pdu_length; + u32 max_send_pdu_length; + u32 first_seq_length; + u32 exp_stat_sn; + u32 stat_sn; + u16 physical_q0; + u16 physical_q1; + u8 abortive_dsconnect; +}; + +static int +qed_iscsi_async_event(struct qed_hwfn *p_hwfn, + 
u8 fw_event_code, + u16 echo, union event_ring_data *data, u8 fw_return_code) +{ + if (p_hwfn->p_iscsi_info->event_cb) { + struct qed_iscsi_info *p_iscsi = p_hwfn->p_iscsi_info; + + return p_iscsi->event_cb(p_iscsi->event_context, + fw_event_code, data); + } else { + DP_NOTICE(p_hwfn, "iSCSI async completion is not set\n"); + return -EINVAL; + } +} + +static int +qed_sp_iscsi_func_start(struct qed_hwfn *p_hwfn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr, + void *event_context, iscsi_event_cb_t async_event_cb) +{ + struct iscsi_init_ramrod_params *p_ramrod = NULL; + struct scsi_init_func_queues *p_queue = NULL; + struct qed_iscsi_pf_params *p_params = NULL; + struct iscsi_spe_func_init *p_init = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = 0; + u32 dval; + u16 val; + u8 i; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ISCSI_RAMROD_CMD_ID_INIT_FUNC, + PROTOCOLID_ISCSI, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.iscsi_init; + p_init = &p_ramrod->iscsi_init_spe; + p_params = &p_hwfn->pf_params.iscsi_pf_params; + p_queue = &p_init->q_params; + + /* Sanity */ + if (p_params->num_queues > p_hwfn->hw_info.feat_num[QED_ISCSI_CQ]) { + DP_ERR(p_hwfn, + "Cannot satisfy CQ amount. Queues requested %d, CQs available %d. Aborting function start\n", + p_params->num_queues, + p_hwfn->hw_info.feat_num[QED_ISCSI_CQ]); + return -EINVAL; + } + + SET_FIELD(p_init->hdr.flags, + ISCSI_SLOW_PATH_HDR_LAYER_CODE, ISCSI_SLOW_PATH_LAYER_CODE); + p_init->hdr.op_code = ISCSI_RAMROD_CMD_ID_INIT_FUNC; + + val = p_params->half_way_close_timeout; + p_init->half_way_close_timeout = cpu_to_le16(val); + p_init->num_sq_pages_in_ring = p_params->num_sq_pages_in_ring; + p_init->num_r2tq_pages_in_ring = p_params->num_r2tq_pages_in_ring; + p_init->num_uhq_pages_in_ring = p_params->num_uhq_pages_in_ring; + p_init->ll2_rx_queue_id = p_hwfn->hw_info.resc_start[QED_LL2_QUEUE] + + p_params->ll2_ooo_queue_id; + + p_init->func_params.log_page_size = p_params->log_page_size; + val = p_params->num_tasks; + p_init->func_params.num_tasks = cpu_to_le16(val); + p_init->debug_mode.flags = p_params->debug_mode; + + DMA_REGPAIR_LE(p_queue->glbl_q_params_addr, + p_params->glbl_q_params_addr); + + val = p_params->cq_num_entries; + p_queue->cq_num_entries = cpu_to_le16(val); + val = p_params->cmdq_num_entries; + p_queue->cmdq_num_entries = cpu_to_le16(val); + p_queue->num_queues = p_params->num_queues; + dval = (u8)p_hwfn->hw_info.resc_start[QED_CMDQS_CQS]; + p_queue->queue_relative_offset = (u8)dval; + p_queue->cq_sb_pi = p_params->gl_rq_pi; + p_queue->cmdq_sb_pi = p_params->gl_cmd_pi; + + for (i = 0; i < p_params->num_queues; i++) { + val = qed_get_igu_sb_id(p_hwfn, i); + p_queue->cq_cmdq_sb_num_arr[i] = cpu_to_le16(val); + } + + p_queue->bdq_resource_id = (u8)RESC_START(p_hwfn, QED_BDQ); + + DMA_REGPAIR_LE(p_queue->bdq_pbl_base_address[BDQ_ID_RQ], + p_params->bdq_pbl_base_addr[BDQ_ID_RQ]); + p_queue->bdq_pbl_num_entries[BDQ_ID_RQ] = + p_params->bdq_pbl_num_entries[BDQ_ID_RQ]; + val = p_params->bdq_xoff_threshold[BDQ_ID_RQ]; + p_queue->bdq_xoff_threshold[BDQ_ID_RQ] = cpu_to_le16(val); + val = p_params->bdq_xon_threshold[BDQ_ID_RQ]; + p_queue->bdq_xon_threshold[BDQ_ID_RQ] = cpu_to_le16(val); + + 
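The cq_cmdq_sb_num_arr loop above reuses qed_get_igu_sb_id() from qed_int.c earlier in this patch: for a PF, CQ i is bound to the IGU CAM line whose vector_number equals i + 1, since vector 0 is reserved for the default (slowpath) status block.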
DMA_REGPAIR_LE(p_queue->bdq_pbl_base_address[BDQ_ID_IMM_DATA], + p_params->bdq_pbl_base_addr[BDQ_ID_IMM_DATA]); + p_queue->bdq_pbl_num_entries[BDQ_ID_IMM_DATA] = + p_params->bdq_pbl_num_entries[BDQ_ID_IMM_DATA]; + val = p_params->bdq_xoff_threshold[BDQ_ID_IMM_DATA]; + p_queue->bdq_xoff_threshold[BDQ_ID_IMM_DATA] = cpu_to_le16(val); + val = p_params->bdq_xon_threshold[BDQ_ID_IMM_DATA]; + p_queue->bdq_xon_threshold[BDQ_ID_IMM_DATA] = cpu_to_le16(val); + val = p_params->rq_buffer_size; + p_queue->rq_buffer_size = cpu_to_le16(val); + if (p_params->is_target) { + SET_FIELD(p_queue->q_validity, + SCSI_INIT_FUNC_QUEUES_RQ_VALID, 1); + if (p_queue->bdq_pbl_num_entries[BDQ_ID_IMM_DATA]) + SET_FIELD(p_queue->q_validity, + SCSI_INIT_FUNC_QUEUES_IMM_DATA_VALID, 1); + SET_FIELD(p_queue->q_validity, + SCSI_INIT_FUNC_QUEUES_CMD_VALID, 1); + } else { + SET_FIELD(p_queue->q_validity, + SCSI_INIT_FUNC_QUEUES_RQ_VALID, 1); + } + p_ramrod->tcp_init.two_msl_timer = cpu_to_le32(p_params->two_msl_timer); + val = p_params->tx_sws_timer; + p_ramrod->tcp_init.tx_sws_timer = cpu_to_le16(val); + p_ramrod->tcp_init.max_fin_rt = p_params->max_fin_rt; + + p_hwfn->p_iscsi_info->event_context = event_context; + p_hwfn->p_iscsi_info->event_cb = async_event_cb; + + qed_spq_register_async_cb(p_hwfn, PROTOCOLID_ISCSI, + qed_iscsi_async_event); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_iscsi_conn_offload(struct qed_hwfn *p_hwfn, + struct qed_iscsi_conn *p_conn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr) +{ + struct iscsi_spe_conn_offload *p_ramrod = NULL; + struct tcp_offload_params_opt2 *p_tcp2 = NULL; + struct tcp_offload_params *p_tcp = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + dma_addr_t r2tq_pbl_addr; + dma_addr_t xhq_pbl_addr; + dma_addr_t uhq_pbl_addr; + u16 physical_q; + int rc = 0; + u32 dval; + u16 wval; + u16 *p; + u8 i; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_conn->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ISCSI_RAMROD_CMD_ID_OFFLOAD_CONN, + PROTOCOLID_ISCSI, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.iscsi_conn_offload; + + /* Transmission PQ is the first of the PF */ + physical_q = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_OFLD); + p_conn->physical_q0 = cpu_to_le16(physical_q); + p_ramrod->iscsi.physical_q0 = cpu_to_le16(physical_q); + + /* iSCSI Pure-ACK PQ */ + physical_q = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_ACK); + p_conn->physical_q1 = cpu_to_le16(physical_q); + p_ramrod->iscsi.physical_q1 = cpu_to_le16(physical_q); + + p_ramrod->hdr.op_code = ISCSI_RAMROD_CMD_ID_OFFLOAD_CONN; + SET_FIELD(p_ramrod->hdr.flags, ISCSI_SLOW_PATH_HDR_LAYER_CODE, + p_conn->layer_code); + + p_ramrod->conn_id = cpu_to_le16(p_conn->conn_id); + p_ramrod->fw_cid = cpu_to_le32(p_conn->icid); + + DMA_REGPAIR_LE(p_ramrod->iscsi.sq_pbl_addr, p_conn->sq_pbl_addr); + + r2tq_pbl_addr = qed_chain_get_pbl_phys(&p_conn->r2tq); + DMA_REGPAIR_LE(p_ramrod->iscsi.r2tq_pbl_addr, r2tq_pbl_addr); + + xhq_pbl_addr = qed_chain_get_pbl_phys(&p_conn->xhq); + DMA_REGPAIR_LE(p_ramrod->iscsi.xhq_pbl_addr, xhq_pbl_addr); + + uhq_pbl_addr = qed_chain_get_pbl_phys(&p_conn->uhq); + DMA_REGPAIR_LE(p_ramrod->iscsi.uhq_pbl_addr, uhq_pbl_addr); + + p_ramrod->iscsi.initial_ack = cpu_to_le32(p_conn->initial_ack); + p_ramrod->iscsi.flags = p_conn->offl_flags; + 
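In the branch a few lines below, the ISCSI_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B flag selects which TCP parameter block is filled: when it is clear, the full tcp_offload_params structure carries the complete TCP state of an already-established connection (sequence numbers, windows, timers); when it is set, the slimmer option-2 block is used, carrying mainly addressing, keep-alive settings and the stored SYN buffer (syn_phy_addr / syn_ip_payload_length), with the remaining TCP state apparently kept on chip, as the flag name suggests.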
p_ramrod->iscsi.default_cq = p_conn->default_cq; + p_ramrod->iscsi.stat_sn = cpu_to_le32(p_conn->stat_sn); + + if (!GET_FIELD(p_ramrod->iscsi.flags, + ISCSI_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B)) { + p_tcp = &p_ramrod->tcp; + + p = (u16 *)p_conn->local_mac; + p_tcp->local_mac_addr_hi = swab16(get_unaligned(p)); + p_tcp->local_mac_addr_mid = swab16(get_unaligned(p + 1)); + p_tcp->local_mac_addr_lo = swab16(get_unaligned(p + 2)); + + p = (u16 *)p_conn->remote_mac; + p_tcp->remote_mac_addr_hi = swab16(get_unaligned(p)); + p_tcp->remote_mac_addr_mid = swab16(get_unaligned(p + 1)); + p_tcp->remote_mac_addr_lo = swab16(get_unaligned(p + 2)); + + p_tcp->vlan_id = cpu_to_le16(p_conn->vlan_id); + + p_tcp->flags = cpu_to_le16(p_conn->tcp_flags); + p_tcp->ip_version = p_conn->ip_version; + for (i = 0; i < 4; i++) { + dval = p_conn->remote_ip[i]; + p_tcp->remote_ip[i] = cpu_to_le32(dval); + dval = p_conn->local_ip[i]; + p_tcp->local_ip[i] = cpu_to_le32(dval); + } + p_tcp->ka_max_probe_cnt = p_conn->ka_max_probe_cnt; + p_tcp->dup_ack_theshold = p_conn->dup_ack_theshold; + + p_tcp->rcv_next = cpu_to_le32(p_conn->rcv_next); + p_tcp->snd_una = cpu_to_le32(p_conn->snd_una); + p_tcp->snd_next = cpu_to_le32(p_conn->snd_next); + p_tcp->snd_max = cpu_to_le32(p_conn->snd_max); + p_tcp->snd_wnd = cpu_to_le32(p_conn->snd_wnd); + p_tcp->rcv_wnd = cpu_to_le32(p_conn->rcv_wnd); + p_tcp->snd_wl1 = cpu_to_le32(p_conn->snd_wl1); + p_tcp->cwnd = cpu_to_le32(p_conn->cwnd); + p_tcp->ss_thresh = cpu_to_le32(p_conn->ss_thresh); + p_tcp->srtt = cpu_to_le16(p_conn->srtt); + p_tcp->rtt_var = cpu_to_le16(p_conn->rtt_var); + p_tcp->ts_recent = cpu_to_le32(p_conn->ts_recent); + p_tcp->ts_recent_age = cpu_to_le32(p_conn->ts_recent_age); + p_tcp->total_rt = cpu_to_le32(p_conn->total_rt); + dval = p_conn->ka_timeout_delta; + p_tcp->ka_timeout_delta = cpu_to_le32(dval); + dval = p_conn->rt_timeout_delta; + p_tcp->rt_timeout_delta = cpu_to_le32(dval); + p_tcp->dup_ack_cnt = p_conn->dup_ack_cnt; + p_tcp->snd_wnd_probe_cnt = p_conn->snd_wnd_probe_cnt; + p_tcp->ka_probe_cnt = p_conn->ka_probe_cnt; + p_tcp->rt_cnt = p_conn->rt_cnt; + p_tcp->flow_label = cpu_to_le32(p_conn->flow_label); + p_tcp->ka_timeout = cpu_to_le32(p_conn->ka_timeout); + p_tcp->ka_interval = cpu_to_le32(p_conn->ka_interval); + p_tcp->max_rt_time = cpu_to_le32(p_conn->max_rt_time); + dval = p_conn->initial_rcv_wnd; + p_tcp->initial_rcv_wnd = cpu_to_le32(dval); + p_tcp->ttl = p_conn->ttl; + p_tcp->tos_or_tc = p_conn->tos_or_tc; + p_tcp->remote_port = cpu_to_le16(p_conn->remote_port); + p_tcp->local_port = cpu_to_le16(p_conn->local_port); + p_tcp->mss = cpu_to_le16(p_conn->mss); + p_tcp->snd_wnd_scale = p_conn->snd_wnd_scale; + p_tcp->rcv_wnd_scale = p_conn->rcv_wnd_scale; + wval = p_conn->da_timeout_value; + p_tcp->da_timeout_value = cpu_to_le16(wval); + p_tcp->ack_frequency = p_conn->ack_frequency; + p_tcp->connect_mode = p_conn->connect_mode; + } else { + p_tcp2 = + &((struct iscsi_spe_conn_offload_option2 *)p_ramrod)->tcp; + + p = (u16 *)p_conn->local_mac; + p_tcp2->local_mac_addr_hi = swab16(get_unaligned(p)); + p_tcp2->local_mac_addr_mid = swab16(get_unaligned(p + 1)); + p_tcp2->local_mac_addr_lo = swab16(get_unaligned(p + 2)); + + p = (u16 *)p_conn->remote_mac; + p_tcp2->remote_mac_addr_hi = swab16(get_unaligned(p)); + p_tcp2->remote_mac_addr_mid = swab16(get_unaligned(p + 1)); + p_tcp2->remote_mac_addr_lo = swab16(get_unaligned(p + 2)); + + p_tcp2->vlan_id = cpu_to_le16(p_conn->vlan_id); + p_tcp2->flags = cpu_to_le16(p_conn->tcp_flags); + + p_tcp2->ip_version = 
p_conn->ip_version; + for (i = 0; i < 4; i++) { + dval = p_conn->remote_ip[i]; + p_tcp2->remote_ip[i] = cpu_to_le32(dval); + dval = p_conn->local_ip[i]; + p_tcp2->local_ip[i] = cpu_to_le32(dval); + } + + p_tcp2->flow_label = cpu_to_le32(p_conn->flow_label); + p_tcp2->ttl = p_conn->ttl; + p_tcp2->tos_or_tc = p_conn->tos_or_tc; + p_tcp2->remote_port = cpu_to_le16(p_conn->remote_port); + p_tcp2->local_port = cpu_to_le16(p_conn->local_port); + p_tcp2->mss = cpu_to_le16(p_conn->mss); + p_tcp2->rcv_wnd_scale = p_conn->rcv_wnd_scale; + p_tcp2->connect_mode = p_conn->connect_mode; + wval = p_conn->syn_ip_payload_length; + p_tcp2->syn_ip_payload_length = cpu_to_le16(wval); + p_tcp2->syn_phy_addr_lo = DMA_LO_LE(p_conn->syn_phy_addr); + p_tcp2->syn_phy_addr_hi = DMA_HI_LE(p_conn->syn_phy_addr); + p_tcp2->cwnd = cpu_to_le32(p_conn->cwnd); + p_tcp2->ka_max_probe_cnt = p_conn->ka_probe_cnt; + p_tcp2->ka_timeout = cpu_to_le32(p_conn->ka_timeout); + p_tcp2->max_rt_time = cpu_to_le32(p_conn->max_rt_time); + p_tcp2->ka_interval = cpu_to_le32(p_conn->ka_interval); + } + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_iscsi_conn_update(struct qed_hwfn *p_hwfn, + struct qed_iscsi_conn *p_conn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr) +{ + struct iscsi_conn_update_ramrod_params *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + u32 dval; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_conn->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ISCSI_RAMROD_CMD_ID_UPDATE_CONN, + PROTOCOLID_ISCSI, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.iscsi_conn_update; + p_ramrod->hdr.op_code = ISCSI_RAMROD_CMD_ID_UPDATE_CONN; + SET_FIELD(p_ramrod->hdr.flags, + ISCSI_SLOW_PATH_HDR_LAYER_CODE, p_conn->layer_code); + + p_ramrod->conn_id = cpu_to_le16(p_conn->conn_id); + p_ramrod->fw_cid = cpu_to_le32(p_conn->icid); + p_ramrod->flags = p_conn->update_flag; + p_ramrod->max_seq_size = cpu_to_le32(p_conn->max_seq_size); + dval = p_conn->max_recv_pdu_length; + p_ramrod->max_recv_pdu_length = cpu_to_le32(dval); + dval = p_conn->max_send_pdu_length; + p_ramrod->max_send_pdu_length = cpu_to_le32(dval); + dval = p_conn->first_seq_length; + p_ramrod->first_seq_length = cpu_to_le32(dval); + p_ramrod->exp_stat_sn = cpu_to_le32(p_conn->exp_stat_sn); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int +qed_sp_iscsi_mac_update(struct qed_hwfn *p_hwfn, + struct qed_iscsi_conn *p_conn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr) +{ + struct iscsi_spe_conn_mac_update *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + u8 ucval; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_conn->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ISCSI_RAMROD_CMD_ID_MAC_UPDATE, + PROTOCOLID_ISCSI, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.iscsi_conn_mac_update; + p_ramrod->hdr.op_code = ISCSI_RAMROD_CMD_ID_MAC_UPDATE; + SET_FIELD(p_ramrod->hdr.flags, + ISCSI_SLOW_PATH_HDR_LAYER_CODE, p_conn->layer_code); + + p_ramrod->conn_id = cpu_to_le16(p_conn->conn_id); + p_ramrod->fw_cid = 
cpu_to_le32(p_conn->icid); + ucval = p_conn->remote_mac[1]; + ((u8 *)(&p_ramrod->remote_mac_addr_hi))[0] = ucval; + ucval = p_conn->remote_mac[0]; + ((u8 *)(&p_ramrod->remote_mac_addr_hi))[1] = ucval; + ucval = p_conn->remote_mac[3]; + ((u8 *)(&p_ramrod->remote_mac_addr_mid))[0] = ucval; + ucval = p_conn->remote_mac[2]; + ((u8 *)(&p_ramrod->remote_mac_addr_mid))[1] = ucval; + ucval = p_conn->remote_mac[5]; + ((u8 *)(&p_ramrod->remote_mac_addr_lo))[0] = ucval; + ucval = p_conn->remote_mac[4]; + ((u8 *)(&p_ramrod->remote_mac_addr_lo))[1] = ucval; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_iscsi_conn_terminate(struct qed_hwfn *p_hwfn, + struct qed_iscsi_conn *p_conn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr) +{ + struct iscsi_spe_conn_termination *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_conn->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ISCSI_RAMROD_CMD_ID_TERMINATION_CONN, + PROTOCOLID_ISCSI, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.iscsi_conn_terminate; + p_ramrod->hdr.op_code = ISCSI_RAMROD_CMD_ID_TERMINATION_CONN; + SET_FIELD(p_ramrod->hdr.flags, + ISCSI_SLOW_PATH_HDR_LAYER_CODE, p_conn->layer_code); + + p_ramrod->conn_id = cpu_to_le16(p_conn->conn_id); + p_ramrod->fw_cid = cpu_to_le32(p_conn->icid); + p_ramrod->abortive = p_conn->abortive_dsconnect; + + DMA_REGPAIR_LE(p_ramrod->query_params_addr, + p_conn->tcp_upload_params_phys_addr); + DMA_REGPAIR_LE(p_ramrod->queue_cnts_addr, p_conn->queue_cnts_phys_addr); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_iscsi_conn_clear_sq(struct qed_hwfn *p_hwfn, + struct qed_iscsi_conn *p_conn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr) +{ + struct iscsi_slow_path_hdr *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_conn->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ISCSI_RAMROD_CMD_ID_CLEAR_SQ, + PROTOCOLID_ISCSI, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.iscsi_empty; + p_ramrod->op_code = ISCSI_RAMROD_CMD_ID_CLEAR_SQ; + SET_FIELD(p_ramrod->flags, + ISCSI_SLOW_PATH_HDR_LAYER_CODE, p_conn->layer_code); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_iscsi_func_stop(struct qed_hwfn *p_hwfn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_addr) +{ + struct iscsi_spe_func_dstry *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_addr; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ISCSI_RAMROD_CMD_ID_DESTROY_FUNC, + PROTOCOLID_ISCSI, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.iscsi_destroy; + p_ramrod->hdr.op_code = ISCSI_RAMROD_CMD_ID_DESTROY_FUNC; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + 
qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_ISCSI); + return rc; +} + +static void __iomem *qed_iscsi_get_db_addr(struct qed_hwfn *p_hwfn, u32 cid) +{ + return (u8 __iomem *)p_hwfn->doorbells + + qed_db_addr(cid, DQ_DEMS_LEGACY); +} + +static void __iomem *qed_iscsi_get_primary_bdq_prod(struct qed_hwfn *p_hwfn, + u8 bdq_id) +{ + if (RESC_NUM(p_hwfn, QED_BDQ)) { + return (u8 __iomem *)p_hwfn->regview + + GTT_BAR0_MAP_REG_MSDM_RAM + + MSTORM_SCSI_BDQ_EXT_PROD_OFFSET(RESC_START(p_hwfn, + QED_BDQ), + bdq_id); + } else { + DP_NOTICE(p_hwfn, "BDQ is not allocated!\n"); + return NULL; + } +} + +static void __iomem *qed_iscsi_get_secondary_bdq_prod(struct qed_hwfn *p_hwfn, + u8 bdq_id) +{ + if (RESC_NUM(p_hwfn, QED_BDQ)) { + return (u8 __iomem *)p_hwfn->regview + + GTT_BAR0_MAP_REG_TSDM_RAM + + TSTORM_SCSI_BDQ_EXT_PROD_OFFSET(RESC_START(p_hwfn, + QED_BDQ), + bdq_id); + } else { + DP_NOTICE(p_hwfn, "BDQ is not allocated!\n"); + return NULL; + } +} + +static int qed_iscsi_setup_connection(struct qed_iscsi_conn *p_conn) +{ + if (!p_conn->queue_cnts_virt_addr) + goto nomem; + memset(p_conn->queue_cnts_virt_addr, 0, + sizeof(*p_conn->queue_cnts_virt_addr)); + + if (!p_conn->tcp_upload_params_virt_addr) + goto nomem; + memset(p_conn->tcp_upload_params_virt_addr, 0, + sizeof(*p_conn->tcp_upload_params_virt_addr)); + + if (!p_conn->r2tq.p_virt_addr) + goto nomem; + qed_chain_pbl_zero_mem(&p_conn->r2tq); + + if (!p_conn->uhq.p_virt_addr) + goto nomem; + qed_chain_pbl_zero_mem(&p_conn->uhq); + + if (!p_conn->xhq.p_virt_addr) + goto nomem; + qed_chain_pbl_zero_mem(&p_conn->xhq); + + return 0; +nomem: + return -ENOMEM; +} + +static int qed_iscsi_allocate_connection(struct qed_hwfn *p_hwfn, + struct qed_iscsi_conn **p_out_conn) +{ + u16 uhq_num_elements = 0, xhq_num_elements = 0, r2tq_num_elements = 0; + struct scsi_terminate_extra_params *p_q_cnts = NULL; + struct qed_iscsi_pf_params *p_params = NULL; + struct tcp_upload_params *p_tcp = NULL; + struct qed_iscsi_conn *p_conn = NULL; + int rc = 0; + + /* Try finding a free connection that can be used */ + spin_lock_bh(&p_hwfn->p_iscsi_info->lock); + if (!list_empty(&p_hwfn->p_iscsi_info->free_list)) + p_conn = list_first_entry(&p_hwfn->p_iscsi_info->free_list, + struct qed_iscsi_conn, list_entry); + if (p_conn) { + list_del(&p_conn->list_entry); + spin_unlock_bh(&p_hwfn->p_iscsi_info->lock); + *p_out_conn = p_conn; + return 0; + } + spin_unlock_bh(&p_hwfn->p_iscsi_info->lock); + + /* Need to allocate a new connection */ + p_params = &p_hwfn->pf_params.iscsi_pf_params; + + p_conn = kzalloc(sizeof(*p_conn), GFP_KERNEL); + if (!p_conn) + return -ENOMEM; + + p_q_cnts = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*p_q_cnts), + &p_conn->queue_cnts_phys_addr, + GFP_KERNEL); + if (!p_q_cnts) + goto nomem_queue_cnts_param; + p_conn->queue_cnts_virt_addr = p_q_cnts; + + p_tcp = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*p_tcp), + &p_conn->tcp_upload_params_phys_addr, + GFP_KERNEL); + if (!p_tcp) + goto nomem_upload_param; + p_conn->tcp_upload_params_virt_addr = p_tcp; + + r2tq_num_elements = p_params->num_r2tq_pages_in_ring * + QED_CHAIN_PAGE_SIZE / 0x80; + rc = qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + r2tq_num_elements, 0x80, &p_conn->r2tq, NULL); + if (rc) + goto nomem_r2tq; + + uhq_num_elements = p_params->num_uhq_pages_in_ring * + QED_CHAIN_PAGE_SIZE / sizeof(struct iscsi_uhqe); + rc = qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + 
QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + uhq_num_elements, + sizeof(struct iscsi_uhqe), &p_conn->uhq, NULL); + if (rc) + goto nomem_uhq; + + xhq_num_elements = uhq_num_elements; + rc = qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + xhq_num_elements, + sizeof(struct iscsi_xhqe), &p_conn->xhq, NULL); + if (rc) + goto nomem; + + p_conn->free_on_delete = true; + *p_out_conn = p_conn; + return 0; + +nomem: + qed_chain_free(p_hwfn->cdev, &p_conn->uhq); +nomem_uhq: + qed_chain_free(p_hwfn->cdev, &p_conn->r2tq); +nomem_r2tq: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct tcp_upload_params), + p_conn->tcp_upload_params_virt_addr, + p_conn->tcp_upload_params_phys_addr); +nomem_upload_param: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct scsi_terminate_extra_params), + p_conn->queue_cnts_virt_addr, + p_conn->queue_cnts_phys_addr); +nomem_queue_cnts_param: + kfree(p_conn); + + return -ENOMEM; +} + +static int qed_iscsi_acquire_connection(struct qed_hwfn *p_hwfn, + struct qed_iscsi_conn *p_in_conn, + struct qed_iscsi_conn **p_out_conn) +{ + struct qed_iscsi_conn *p_conn = NULL; + int rc = 0; + u32 icid; + + spin_lock_bh(&p_hwfn->p_iscsi_info->lock); + rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_ISCSI, &icid); + spin_unlock_bh(&p_hwfn->p_iscsi_info->lock); + if (rc) + return rc; + + /* Use input connection or allocate a new one */ + if (p_in_conn) + p_conn = p_in_conn; + else + rc = qed_iscsi_allocate_connection(p_hwfn, &p_conn); + + if (!rc) + rc = qed_iscsi_setup_connection(p_conn); + + if (rc) { + spin_lock_bh(&p_hwfn->p_iscsi_info->lock); + qed_cxt_release_cid(p_hwfn, icid); + spin_unlock_bh(&p_hwfn->p_iscsi_info->lock); + return rc; + } + + p_conn->icid = icid; + p_conn->conn_id = (u16)icid; + p_conn->fw_cid = (p_hwfn->hw_info.opaque_fid << 16) | icid; + + *p_out_conn = p_conn; + + return rc; +} + +static void qed_iscsi_release_connection(struct qed_hwfn *p_hwfn, + struct qed_iscsi_conn *p_conn) +{ + spin_lock_bh(&p_hwfn->p_iscsi_info->lock); + list_add_tail(&p_conn->list_entry, &p_hwfn->p_iscsi_info->free_list); + qed_cxt_release_cid(p_hwfn, p_conn->icid); + spin_unlock_bh(&p_hwfn->p_iscsi_info->lock); +} + +void qed_iscsi_free_connection(struct qed_hwfn *p_hwfn, + struct qed_iscsi_conn *p_conn) +{ + qed_chain_free(p_hwfn->cdev, &p_conn->xhq); + qed_chain_free(p_hwfn->cdev, &p_conn->uhq); + qed_chain_free(p_hwfn->cdev, &p_conn->r2tq); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct tcp_upload_params), + p_conn->tcp_upload_params_virt_addr, + p_conn->tcp_upload_params_phys_addr); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct scsi_terminate_extra_params), + p_conn->queue_cnts_virt_addr, + p_conn->queue_cnts_phys_addr); + kfree(p_conn); +} + +int qed_iscsi_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_iscsi_info *p_iscsi_info; + + p_iscsi_info = kzalloc(sizeof(*p_iscsi_info), GFP_KERNEL); + if (!p_iscsi_info) + return -ENOMEM; + + INIT_LIST_HEAD(&p_iscsi_info->free_list); + + p_hwfn->p_iscsi_info = p_iscsi_info; + return 0; +} + +void qed_iscsi_setup(struct qed_hwfn *p_hwfn) +{ + spin_lock_init(&p_hwfn->p_iscsi_info->lock); +} + +void qed_iscsi_free(struct qed_hwfn *p_hwfn) +{ + struct qed_iscsi_conn *p_conn = NULL; + + if (!p_hwfn->p_iscsi_info) + return; + + while (!list_empty(&p_hwfn->p_iscsi_info->free_list)) { + p_conn = list_first_entry(&p_hwfn->p_iscsi_info->free_list, + struct qed_iscsi_conn, list_entry); + if (p_conn) { + list_del(&p_conn->list_entry); + 
qed_iscsi_free_connection(p_hwfn, p_conn); + } + } + + kfree(p_hwfn->p_iscsi_info); + p_hwfn->p_iscsi_info = NULL; +} + +static void _qed_iscsi_get_tstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_iscsi_stats *p_stats) +{ + struct tstorm_iscsi_stats_drv tstats; + u32 tstats_addr; + + memset(&tstats, 0, sizeof(tstats)); + tstats_addr = BAR0_MAP_REG_TSDM_RAM + + TSTORM_ISCSI_RX_STATS_OFFSET(p_hwfn->rel_pf_id); + qed_memcpy_from(p_hwfn, p_ptt, &tstats, tstats_addr, sizeof(tstats)); + + p_stats->iscsi_rx_bytes_cnt = + HILO_64_REGPAIR(tstats.iscsi_rx_bytes_cnt); + p_stats->iscsi_rx_packet_cnt = + HILO_64_REGPAIR(tstats.iscsi_rx_packet_cnt); + p_stats->iscsi_rx_new_ooo_isle_events_cnt = + HILO_64_REGPAIR(tstats.iscsi_rx_new_ooo_isle_events_cnt); + p_stats->iscsi_cmdq_threshold_cnt = + le32_to_cpu(tstats.iscsi_cmdq_threshold_cnt); + p_stats->iscsi_rq_threshold_cnt = + le32_to_cpu(tstats.iscsi_rq_threshold_cnt); + p_stats->iscsi_immq_threshold_cnt = + le32_to_cpu(tstats.iscsi_immq_threshold_cnt); +} + +static void _qed_iscsi_get_mstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_iscsi_stats *p_stats) +{ + struct mstorm_iscsi_stats_drv mstats; + u32 mstats_addr; + + memset(&mstats, 0, sizeof(mstats)); + mstats_addr = BAR0_MAP_REG_MSDM_RAM + + MSTORM_ISCSI_RX_STATS_OFFSET(p_hwfn->rel_pf_id); + qed_memcpy_from(p_hwfn, p_ptt, &mstats, mstats_addr, sizeof(mstats)); + + p_stats->iscsi_rx_dropped_pdus_task_not_valid = + HILO_64_REGPAIR(mstats.iscsi_rx_dropped_pdus_task_not_valid); +} + +static void _qed_iscsi_get_ustats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_iscsi_stats *p_stats) +{ + struct ustorm_iscsi_stats_drv ustats; + u32 ustats_addr; + + memset(&ustats, 0, sizeof(ustats)); + ustats_addr = BAR0_MAP_REG_USDM_RAM + + USTORM_ISCSI_RX_STATS_OFFSET(p_hwfn->rel_pf_id); + qed_memcpy_from(p_hwfn, p_ptt, &ustats, ustats_addr, sizeof(ustats)); + + p_stats->iscsi_rx_data_pdu_cnt = + HILO_64_REGPAIR(ustats.iscsi_rx_data_pdu_cnt); + p_stats->iscsi_rx_r2t_pdu_cnt = + HILO_64_REGPAIR(ustats.iscsi_rx_r2t_pdu_cnt); + p_stats->iscsi_rx_total_pdu_cnt = + HILO_64_REGPAIR(ustats.iscsi_rx_total_pdu_cnt); +} + +static void _qed_iscsi_get_xstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_iscsi_stats *p_stats) +{ + struct xstorm_iscsi_stats_drv xstats; + u32 xstats_addr; + + memset(&xstats, 0, sizeof(xstats)); + xstats_addr = BAR0_MAP_REG_XSDM_RAM + + XSTORM_ISCSI_TX_STATS_OFFSET(p_hwfn->rel_pf_id); + qed_memcpy_from(p_hwfn, p_ptt, &xstats, xstats_addr, sizeof(xstats)); + + p_stats->iscsi_tx_go_to_slow_start_event_cnt = + HILO_64_REGPAIR(xstats.iscsi_tx_go_to_slow_start_event_cnt); + p_stats->iscsi_tx_fast_retransmit_event_cnt = + HILO_64_REGPAIR(xstats.iscsi_tx_fast_retransmit_event_cnt); +} + +static void _qed_iscsi_get_ystats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_iscsi_stats *p_stats) +{ + struct ystorm_iscsi_stats_drv ystats; + u32 ystats_addr; + + memset(&ystats, 0, sizeof(ystats)); + ystats_addr = BAR0_MAP_REG_YSDM_RAM + + YSTORM_ISCSI_TX_STATS_OFFSET(p_hwfn->rel_pf_id); + qed_memcpy_from(p_hwfn, p_ptt, &ystats, ystats_addr, sizeof(ystats)); + + p_stats->iscsi_tx_data_pdu_cnt = + HILO_64_REGPAIR(ystats.iscsi_tx_data_pdu_cnt); + p_stats->iscsi_tx_r2t_pdu_cnt = + HILO_64_REGPAIR(ystats.iscsi_tx_r2t_pdu_cnt); + p_stats->iscsi_tx_total_pdu_cnt = + HILO_64_REGPAIR(ystats.iscsi_tx_total_pdu_cnt); +} + +static void _qed_iscsi_get_pstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_iscsi_stats 
*p_stats) +{ + struct pstorm_iscsi_stats_drv pstats; + u32 pstats_addr; + + memset(&pstats, 0, sizeof(pstats)); + pstats_addr = BAR0_MAP_REG_PSDM_RAM + + PSTORM_ISCSI_TX_STATS_OFFSET(p_hwfn->rel_pf_id); + qed_memcpy_from(p_hwfn, p_ptt, &pstats, pstats_addr, sizeof(pstats)); + + p_stats->iscsi_tx_bytes_cnt = + HILO_64_REGPAIR(pstats.iscsi_tx_bytes_cnt); + p_stats->iscsi_tx_packet_cnt = + HILO_64_REGPAIR(pstats.iscsi_tx_packet_cnt); +} + +static int qed_iscsi_get_stats(struct qed_hwfn *p_hwfn, + struct qed_iscsi_stats *stats) +{ + struct qed_ptt *p_ptt; + + memset(stats, 0, sizeof(*stats)); + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_ERR(p_hwfn, "Failed to acquire ptt\n"); + return -EAGAIN; + } + + _qed_iscsi_get_tstats(p_hwfn, p_ptt, stats); + _qed_iscsi_get_mstats(p_hwfn, p_ptt, stats); + _qed_iscsi_get_ustats(p_hwfn, p_ptt, stats); + + _qed_iscsi_get_xstats(p_hwfn, p_ptt, stats); + _qed_iscsi_get_ystats(p_hwfn, p_ptt, stats); + _qed_iscsi_get_pstats(p_hwfn, p_ptt, stats); + + qed_ptt_release(p_hwfn, p_ptt); + + return 0; +} + +struct qed_hash_iscsi_con { + struct hlist_node node; + struct qed_iscsi_conn *con; +}; + +static int qed_fill_iscsi_dev_info(struct qed_dev *cdev, + struct qed_dev_iscsi_info *info) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + + int rc; + + memset(info, 0, sizeof(*info)); + rc = qed_fill_dev_info(cdev, &info->common); + + info->primary_dbq_rq_addr = + qed_iscsi_get_primary_bdq_prod(hwfn, BDQ_ID_RQ); + info->secondary_bdq_rq_addr = + qed_iscsi_get_secondary_bdq_prod(hwfn, BDQ_ID_RQ); + + info->num_cqs = FEAT_NUM(hwfn, QED_ISCSI_CQ); + + return rc; +} + +static void qed_register_iscsi_ops(struct qed_dev *cdev, + struct qed_iscsi_cb_ops *ops, void *cookie) +{ + cdev->protocol_ops.iscsi = ops; + cdev->ops_cookie = cookie; +} + +static struct qed_hash_iscsi_con *qed_iscsi_get_hash(struct qed_dev *cdev, + u32 handle) +{ + struct qed_hash_iscsi_con *hash_con = NULL; + + if (!(cdev->flags & QED_FLAG_STORAGE_STARTED)) + return NULL; + + hash_for_each_possible(cdev->connections, hash_con, node, handle) { + if (hash_con->con->icid == handle) + break; + } + + if (!hash_con || (hash_con->con->icid != handle)) + return NULL; + + return hash_con; +} + +static int qed_iscsi_stop(struct qed_dev *cdev) +{ + int rc; + + if (!(cdev->flags & QED_FLAG_STORAGE_STARTED)) { + DP_NOTICE(cdev, "iscsi already stopped\n"); + return 0; + } + + if (!hash_empty(cdev->connections)) { + DP_NOTICE(cdev, + "Can't stop iscsi - not all connections were returned\n"); + return -EINVAL; + } + + /* Stop the iscsi */ + rc = qed_sp_iscsi_func_stop(QED_LEADING_HWFN(cdev), + QED_SPQ_MODE_EBLOCK, NULL); + cdev->flags &= ~QED_FLAG_STORAGE_STARTED; + + return rc; +} + +static int qed_iscsi_start(struct qed_dev *cdev, + struct qed_iscsi_tid *tasks, + void *event_context, + iscsi_event_cb_t async_event_cb) +{ + int rc; + struct qed_tid_mem *tid_info; + + if (cdev->flags & QED_FLAG_STORAGE_STARTED) { + DP_NOTICE(cdev, "iscsi already started;\n"); + return 0; + } + + rc = qed_sp_iscsi_func_start(QED_LEADING_HWFN(cdev), + QED_SPQ_MODE_EBLOCK, NULL, event_context, + async_event_cb); + if (rc) { + DP_NOTICE(cdev, "Failed to start iscsi\n"); + return rc; + } + + cdev->flags |= QED_FLAG_STORAGE_STARTED; + hash_init(cdev->connections); + + if (!tasks) + return 0; + + tid_info = kzalloc(sizeof(*tid_info), GFP_KERNEL); + + if (!tid_info) { + qed_iscsi_stop(cdev); + return -ENOMEM; + } + + rc = qed_cxt_get_tid_mem_info(QED_LEADING_HWFN(cdev), + tid_info); + if (rc) { + DP_NOTICE(cdev, "Failed to gather 
task information\n"); + qed_iscsi_stop(cdev); + kfree(tid_info); + return rc; + } + + /* Fill task information */ + tasks->size = tid_info->tid_size; + tasks->num_tids_per_block = tid_info->num_tids_per_block; + memcpy(tasks->blocks, tid_info->blocks, + MAX_TID_BLOCKS_ISCSI * sizeof(u8 *)); + + kfree(tid_info); + + return 0; +} + +static int qed_iscsi_acquire_conn(struct qed_dev *cdev, + u32 *handle, + u32 *fw_cid, void __iomem **p_doorbell) +{ + struct qed_hash_iscsi_con *hash_con; + int rc; + + /* Allocate a hashed connection */ + hash_con = kzalloc(sizeof(*hash_con), GFP_ATOMIC); + if (!hash_con) + return -ENOMEM; + + /* Acquire the connection */ + rc = qed_iscsi_acquire_connection(QED_LEADING_HWFN(cdev), NULL, + &hash_con->con); + if (rc) { + DP_NOTICE(cdev, "Failed to acquire Connection\n"); + kfree(hash_con); + return rc; + } + + /* Added the connection to hash table */ + *handle = hash_con->con->icid; + *fw_cid = hash_con->con->fw_cid; + hash_add(cdev->connections, &hash_con->node, *handle); + + if (p_doorbell) + *p_doorbell = qed_iscsi_get_db_addr(QED_LEADING_HWFN(cdev), + *handle); + + return 0; +} + +static int qed_iscsi_release_conn(struct qed_dev *cdev, u32 handle) +{ + struct qed_hash_iscsi_con *hash_con; + + hash_con = qed_iscsi_get_hash(cdev, handle); + if (!hash_con) { + DP_NOTICE(cdev, "Failed to find connection for handle %d\n", + handle); + return -EINVAL; + } + + hlist_del(&hash_con->node); + qed_iscsi_release_connection(QED_LEADING_HWFN(cdev), hash_con->con); + kfree(hash_con); + + return 0; +} + +static int qed_iscsi_offload_conn(struct qed_dev *cdev, + u32 handle, + struct qed_iscsi_params_offload *conn_info) +{ + struct qed_hash_iscsi_con *hash_con; + struct qed_iscsi_conn *con; + + hash_con = qed_iscsi_get_hash(cdev, handle); + if (!hash_con) { + DP_NOTICE(cdev, "Failed to find connection for handle %d\n", + handle); + return -EINVAL; + } + + /* Update the connection with information from the params */ + con = hash_con->con; + + ether_addr_copy(con->local_mac, conn_info->src.mac); + ether_addr_copy(con->remote_mac, conn_info->dst.mac); + memcpy(con->local_ip, conn_info->src.ip, sizeof(con->local_ip)); + memcpy(con->remote_ip, conn_info->dst.ip, sizeof(con->remote_ip)); + con->local_port = conn_info->src.port; + con->remote_port = conn_info->dst.port; + + con->layer_code = conn_info->layer_code; + con->sq_pbl_addr = conn_info->sq_pbl_addr; + con->initial_ack = conn_info->initial_ack; + con->vlan_id = conn_info->vlan_id; + con->tcp_flags = conn_info->tcp_flags; + con->ip_version = conn_info->ip_version; + con->default_cq = conn_info->default_cq; + con->ka_max_probe_cnt = conn_info->ka_max_probe_cnt; + con->dup_ack_theshold = conn_info->dup_ack_theshold; + con->rcv_next = conn_info->rcv_next; + con->snd_una = conn_info->snd_una; + con->snd_next = conn_info->snd_next; + con->snd_max = conn_info->snd_max; + con->snd_wnd = conn_info->snd_wnd; + con->rcv_wnd = conn_info->rcv_wnd; + con->snd_wl1 = conn_info->snd_wl1; + con->cwnd = conn_info->cwnd; + con->ss_thresh = conn_info->ss_thresh; + con->srtt = conn_info->srtt; + con->rtt_var = conn_info->rtt_var; + con->ts_recent = conn_info->ts_recent; + con->ts_recent_age = conn_info->ts_recent_age; + con->total_rt = conn_info->total_rt; + con->ka_timeout_delta = conn_info->ka_timeout_delta; + con->rt_timeout_delta = conn_info->rt_timeout_delta; + con->dup_ack_cnt = conn_info->dup_ack_cnt; + con->snd_wnd_probe_cnt = conn_info->snd_wnd_probe_cnt; + con->ka_probe_cnt = conn_info->ka_probe_cnt; + con->rt_cnt = conn_info->rt_cnt; + 
con->flow_label = conn_info->flow_label; + con->ka_timeout = conn_info->ka_timeout; + con->ka_interval = conn_info->ka_interval; + con->max_rt_time = conn_info->max_rt_time; + con->initial_rcv_wnd = conn_info->initial_rcv_wnd; + con->ttl = conn_info->ttl; + con->tos_or_tc = conn_info->tos_or_tc; + con->remote_port = conn_info->remote_port; + con->local_port = conn_info->local_port; + con->mss = conn_info->mss; + con->snd_wnd_scale = conn_info->snd_wnd_scale; + con->rcv_wnd_scale = conn_info->rcv_wnd_scale; + con->da_timeout_value = conn_info->da_timeout_value; + con->ack_frequency = conn_info->ack_frequency; + + /* Set default values on other connection fields */ + con->offl_flags = 0x1; + + return qed_sp_iscsi_conn_offload(QED_LEADING_HWFN(cdev), con, + QED_SPQ_MODE_EBLOCK, NULL); +} + +static int qed_iscsi_update_conn(struct qed_dev *cdev, + u32 handle, + struct qed_iscsi_params_update *conn_info) +{ + struct qed_hash_iscsi_con *hash_con; + struct qed_iscsi_conn *con; + + hash_con = qed_iscsi_get_hash(cdev, handle); + if (!hash_con) { + DP_NOTICE(cdev, "Failed to find connection for handle %d\n", + handle); + return -EINVAL; + } + + /* Update the connection with information from the params */ + con = hash_con->con; + con->update_flag = conn_info->update_flag; + con->max_seq_size = conn_info->max_seq_size; + con->max_recv_pdu_length = conn_info->max_recv_pdu_length; + con->max_send_pdu_length = conn_info->max_send_pdu_length; + con->first_seq_length = conn_info->first_seq_length; + con->exp_stat_sn = conn_info->exp_stat_sn; + + return qed_sp_iscsi_conn_update(QED_LEADING_HWFN(cdev), con, + QED_SPQ_MODE_EBLOCK, NULL); +} + +static int qed_iscsi_clear_conn_sq(struct qed_dev *cdev, u32 handle) +{ + struct qed_hash_iscsi_con *hash_con; + + hash_con = qed_iscsi_get_hash(cdev, handle); + if (!hash_con) { + DP_NOTICE(cdev, "Failed to find connection for handle %d\n", + handle); + return -EINVAL; + } + + return qed_sp_iscsi_conn_clear_sq(QED_LEADING_HWFN(cdev), + hash_con->con, + QED_SPQ_MODE_EBLOCK, NULL); +} + +static int qed_iscsi_destroy_conn(struct qed_dev *cdev, + u32 handle, u8 abrt_conn) +{ + struct qed_hash_iscsi_con *hash_con; + + hash_con = qed_iscsi_get_hash(cdev, handle); + if (!hash_con) { + DP_NOTICE(cdev, "Failed to find connection for handle %d\n", + handle); + return -EINVAL; + } + + hash_con->con->abortive_dsconnect = abrt_conn; + + return qed_sp_iscsi_conn_terminate(QED_LEADING_HWFN(cdev), + hash_con->con, + QED_SPQ_MODE_EBLOCK, NULL); +} + +static int qed_iscsi_stats(struct qed_dev *cdev, struct qed_iscsi_stats *stats) +{ + return qed_iscsi_get_stats(QED_LEADING_HWFN(cdev), stats); +} + +static int qed_iscsi_change_mac(struct qed_dev *cdev, + u32 handle, const u8 *mac) +{ + struct qed_hash_iscsi_con *hash_con; + + hash_con = qed_iscsi_get_hash(cdev, handle); + if (!hash_con) { + DP_NOTICE(cdev, "Failed to find connection for handle %d\n", + handle); + return -EINVAL; + } + + return qed_sp_iscsi_mac_update(QED_LEADING_HWFN(cdev), + hash_con->con, + QED_SPQ_MODE_EBLOCK, NULL); +} + +void qed_get_protocol_stats_iscsi(struct qed_dev *cdev, + struct qed_mcp_iscsi_stats *stats) +{ + struct qed_iscsi_stats proto_stats; + + /* Retrieve FW statistics */ + memset(&proto_stats, 0, sizeof(proto_stats)); + if (qed_iscsi_stats(cdev, &proto_stats)) { + DP_VERBOSE(cdev, QED_MSG_STORAGE, + "Failed to collect ISCSI statistics\n"); + return; + } + + /* Translate FW statistics into struct */ + stats->rx_pdus = proto_stats.iscsi_rx_total_pdu_cnt; + stats->tx_pdus = 
proto_stats.iscsi_tx_total_pdu_cnt; + stats->rx_bytes = proto_stats.iscsi_rx_bytes_cnt; + stats->tx_bytes = proto_stats.iscsi_tx_bytes_cnt; +} + +static const struct qed_iscsi_ops qed_iscsi_ops_pass = { + .common = &qed_common_ops_pass, + .ll2 = &qed_ll2_ops_pass, + .fill_dev_info = &qed_fill_iscsi_dev_info, + .register_ops = &qed_register_iscsi_ops, + .start = &qed_iscsi_start, + .stop = &qed_iscsi_stop, + .acquire_conn = &qed_iscsi_acquire_conn, + .release_conn = &qed_iscsi_release_conn, + .offload_conn = &qed_iscsi_offload_conn, + .update_conn = &qed_iscsi_update_conn, + .destroy_conn = &qed_iscsi_destroy_conn, + .clear_sq = &qed_iscsi_clear_conn_sq, + .get_stats = &qed_iscsi_stats, + .change_mac = &qed_iscsi_change_mac, +}; + +const struct qed_iscsi_ops *qed_get_iscsi_ops(void) +{ + return &qed_iscsi_ops_pass; +} +EXPORT_SYMBOL(qed_get_iscsi_ops); + +void qed_put_iscsi_ops(void) +{ +} +EXPORT_SYMBOL(qed_put_iscsi_ops); diff --git a/drivers/net/ethernet/qlogic/qed/qed_iscsi.h b/drivers/net/ethernet/qlogic/qed/qed_iscsi.h new file mode 100644 index 0000000..225c75b --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_iscsi.h @@ -0,0 +1,89 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_ISCSI_H +#define _QED_ISCSI_H +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_mcp.h" +#include "qed_sp.h" + +struct qed_iscsi_info { + spinlock_t lock; /* Connection resources. */ + struct list_head free_list; + u16 max_num_outstanding_tasks; + void *event_context; + iscsi_event_cb_t event_cb; +}; + +#ifdef CONFIG_QED_LL2 +extern const struct qed_ll2_ops qed_ll2_ops_pass; +#endif + +#if IS_ENABLED(CONFIG_QED_ISCSI) +int qed_iscsi_alloc(struct qed_hwfn *p_hwfn); + +void qed_iscsi_setup(struct qed_hwfn *p_hwfn); + +void qed_iscsi_free(struct qed_hwfn *p_hwfn); + +/** + * @brief - Fills provided statistics struct with statistics. + * + * @param cdev + * @param stats - points to struct that will be filled with statistics. 
+ */ +void qed_get_protocol_stats_iscsi(struct qed_dev *cdev, + struct qed_mcp_iscsi_stats *stats); +#else /* IS_ENABLED(CONFIG_QED_ISCSI) */ +static inline int qed_iscsi_alloc(struct qed_hwfn *p_hwfn) +{ + return -EINVAL; +} + +static inline void qed_iscsi_setup(struct qed_hwfn *p_hwfn) {} + +static inline void qed_iscsi_free(struct qed_hwfn *p_hwfn) {} + +static inline void +qed_get_protocol_stats_iscsi(struct qed_dev *cdev, + struct qed_mcp_iscsi_stats *stats) {} +#endif /* IS_ENABLED(CONFIG_QED_ISCSI) */ + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c new file mode 100644 index 0000000..2a2b101 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -0,0 +1,3186 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include "qed_cxt.h" +#include "qed_hw.h" +#include "qed_ll2.h" +#include "qed_rdma.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" +#include "qed_ooo.h" + +#define QED_IWARP_ORD_DEFAULT 32 +#define QED_IWARP_IRD_DEFAULT 32 +#define QED_IWARP_MAX_FW_MSS 4120 + +#define QED_EP_SIG 0xecabcdef + +struct mpa_v2_hdr { + __be16 ird; + __be16 ord; +}; + +#define MPA_V2_PEER2PEER_MODEL 0x8000 +#define MPA_V2_SEND_RTR 0x4000 /* on ird */ +#define MPA_V2_READ_RTR 0x4000 /* on ord */ +#define MPA_V2_WRITE_RTR 0x8000 +#define MPA_V2_IRD_ORD_MASK 0x3FFF + +#define MPA_REV2(_mpa_rev) ((_mpa_rev) == MPA_NEGOTIATION_TYPE_ENHANCED) + +#define QED_IWARP_INVALID_TCP_CID 0xffffffff +#define QED_IWARP_RCV_WND_SIZE_DEF (256 * 1024) +#define QED_IWARP_RCV_WND_SIZE_MIN (0xffff) +#define TIMESTAMP_HEADER_SIZE (12) +#define QED_IWARP_MAX_FIN_RT_DEFAULT (2) + +#define QED_IWARP_TS_EN BIT(0) +#define QED_IWARP_DA_EN BIT(1) +#define QED_IWARP_PARAM_CRC_NEEDED (1) +#define QED_IWARP_PARAM_P2P (1) + +#define QED_IWARP_DEF_MAX_RT_TIME (0) +#define QED_IWARP_DEF_CWND_FACTOR (4) +#define QED_IWARP_DEF_KA_MAX_PROBE_CNT (5) +#define QED_IWARP_DEF_KA_TIMEOUT (1200000) /* 20 min */ +#define QED_IWARP_DEF_KA_INTERVAL (1000) /* 1 sec */ + +static int qed_iwarp_async_event(struct qed_hwfn *p_hwfn, + u8 fw_event_code, u16 echo, + union event_ring_data *data, + u8 fw_return_code); + +/* Override devinfo with iWARP specific values */ +void qed_iwarp_init_devinfo(struct qed_hwfn *p_hwfn) +{ + struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev; + + dev->max_inline = IWARP_REQ_MAX_INLINE_DATA_SIZE; + dev->max_qp = min_t(u32, + IWARP_MAX_QPS, + p_hwfn->p_rdma_info->num_qps) - + QED_IWARP_PREALLOC_CNT; + + dev->max_cq = dev->max_qp; + + dev->max_qp_resp_rd_atomic_resc = QED_IWARP_IRD_DEFAULT; + dev->max_qp_req_rd_atomic_resc = QED_IWARP_ORD_DEFAULT; +} + +void qed_iwarp_init_hw(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + p_hwfn->rdma_prs_search_reg = PRS_REG_SEARCH_TCP; + qed_wr(p_hwfn, p_ptt, p_hwfn->rdma_prs_search_reg, 1); + p_hwfn->b_rdma_enabled_in_prs = true; +} + +/* We have two cid maps, one for tcp which should be used only from passive + * syn processing and replacing a pre-allocated ep in the list. The second + * for active tcp and for QPs. 
+ */ +static void qed_iwarp_cid_cleaned(struct qed_hwfn *p_hwfn, u32 cid) +{ + cid -= qed_cxt_get_proto_cid_start(p_hwfn, p_hwfn->p_rdma_info->proto); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + + if (cid < QED_IWARP_PREALLOC_CNT) + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map, + cid); + else + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->cid_map, cid); + + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +void +qed_iwarp_init_fw_ramrod(struct qed_hwfn *p_hwfn, + struct iwarp_init_func_ramrod_data *p_ramrod) +{ + p_ramrod->iwarp.ll2_ooo_q_index = + RESC_START(p_hwfn, QED_LL2_QUEUE) + + p_hwfn->p_rdma_info->iwarp.ll2_ooo_handle; + + p_ramrod->tcp.max_fin_rt = QED_IWARP_MAX_FIN_RT_DEFAULT; + + return; +} + +static int qed_iwarp_alloc_cid(struct qed_hwfn *p_hwfn, u32 *cid) +{ + int rc; + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, &p_hwfn->p_rdma_info->cid_map, cid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + if (rc) { + DP_NOTICE(p_hwfn, "Failed in allocating iwarp cid\n"); + return rc; + } + *cid += qed_cxt_get_proto_cid_start(p_hwfn, p_hwfn->p_rdma_info->proto); + + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_CXT, *cid); + if (rc) + qed_iwarp_cid_cleaned(p_hwfn, *cid); + + return rc; +} + +static void qed_iwarp_set_tcp_cid(struct qed_hwfn *p_hwfn, u32 cid) +{ + cid -= qed_cxt_get_proto_cid_start(p_hwfn, p_hwfn->p_rdma_info->proto); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_set_id(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map, cid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +/* This function allocates a cid for passive tcp (called from syn receive) + * the reason it's separate from the regular cid allocation is because it + * is assured that these cids already have ilt allocated. 
They are preallocated + * to ensure that we won't need to allocate memory during syn processing + */ +static int qed_iwarp_alloc_tcp_cid(struct qed_hwfn *p_hwfn, u32 *cid) +{ + int rc; + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + + rc = qed_rdma_bmap_alloc_id(p_hwfn, + &p_hwfn->p_rdma_info->tcp_cid_map, cid); + + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "can't allocate iwarp tcp cid max-count=%d\n", + p_hwfn->p_rdma_info->tcp_cid_map.max_count); + + *cid = QED_IWARP_INVALID_TCP_CID; + return rc; + } + + *cid += qed_cxt_get_proto_cid_start(p_hwfn, + p_hwfn->p_rdma_info->proto); + return 0; +} + +int qed_iwarp_create_qp(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + struct qed_rdma_create_qp_out_params *out_params) +{ + struct iwarp_create_qp_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + u16 physical_queue; + u32 cid; + int rc; + + qp->shared_queue = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + IWARP_SHARED_QUEUE_PAGE_SIZE, + &qp->shared_queue_phys_addr, + GFP_KERNEL); + if (!qp->shared_queue) + return -ENOMEM; + + out_params->sq_pbl_virt = (u8 *)qp->shared_queue + + IWARP_SHARED_QUEUE_PAGE_SQ_PBL_OFFSET; + out_params->sq_pbl_phys = qp->shared_queue_phys_addr + + IWARP_SHARED_QUEUE_PAGE_SQ_PBL_OFFSET; + out_params->rq_pbl_virt = (u8 *)qp->shared_queue + + IWARP_SHARED_QUEUE_PAGE_RQ_PBL_OFFSET; + out_params->rq_pbl_phys = qp->shared_queue_phys_addr + + IWARP_SHARED_QUEUE_PAGE_RQ_PBL_OFFSET; + + rc = qed_iwarp_alloc_cid(p_hwfn, &cid); + if (rc) + goto err1; + + qp->icid = (u16)cid; + + memset(&init_data, 0, sizeof(init_data)); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.cid = qp->icid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + IWARP_RAMROD_CMD_ID_CREATE_QP, + PROTOCOLID_IWARP, &init_data); + if (rc) + goto err2; + + p_ramrod = &p_ent->ramrod.iwarp_create_qp; + + SET_FIELD(p_ramrod->flags, + IWARP_CREATE_QP_RAMROD_DATA_FMR_AND_RESERVED_EN, + qp->fmr_and_reserved_lkey); + + SET_FIELD(p_ramrod->flags, + IWARP_CREATE_QP_RAMROD_DATA_SIGNALED_COMP, qp->signal_all); + + SET_FIELD(p_ramrod->flags, + IWARP_CREATE_QP_RAMROD_DATA_RDMA_RD_EN, + qp->incoming_rdma_read_en); + + SET_FIELD(p_ramrod->flags, + IWARP_CREATE_QP_RAMROD_DATA_RDMA_WR_EN, + qp->incoming_rdma_write_en); + + SET_FIELD(p_ramrod->flags, + IWARP_CREATE_QP_RAMROD_DATA_ATOMIC_EN, + qp->incoming_atomic_en); + + SET_FIELD(p_ramrod->flags, + IWARP_CREATE_QP_RAMROD_DATA_SRQ_FLG, qp->use_srq); + + p_ramrod->pd = qp->pd; + p_ramrod->sq_num_pages = qp->sq_num_pages; + p_ramrod->rq_num_pages = qp->rq_num_pages; + + p_ramrod->qp_handle_for_cqe.hi = cpu_to_le32(qp->qp_handle.hi); + p_ramrod->qp_handle_for_cqe.lo = cpu_to_le32(qp->qp_handle.lo); + + p_ramrod->cq_cid_for_sq = + cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | qp->sq_cq_id); + p_ramrod->cq_cid_for_rq = + cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | qp->rq_cq_id); + + p_ramrod->dpi = cpu_to_le16(qp->dpi); + + physical_queue = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_OFLD); + p_ramrod->physical_q0 = cpu_to_le16(physical_queue); + physical_queue = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_ACK); + p_ramrod->physical_q1 = cpu_to_le16(physical_queue); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err2; + + return rc; + +err2: + qed_iwarp_cid_cleaned(p_hwfn, cid); +err1: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + IWARP_SHARED_QUEUE_PAGE_SIZE, + qp->shared_queue, qp->shared_queue_phys_addr); + + return 
rc; +} + +static int qed_iwarp_modify_fw(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) +{ + struct iwarp_modify_qp_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + int rc; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + IWARP_RAMROD_CMD_ID_MODIFY_QP, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.iwarp_modify_qp; + SET_FIELD(p_ramrod->flags, IWARP_MODIFY_QP_RAMROD_DATA_STATE_TRANS_EN, + 0x1); + if (qp->iwarp_state == QED_IWARP_QP_STATE_CLOSING) + p_ramrod->transition_to_state = IWARP_MODIFY_QP_STATE_CLOSING; + else + p_ramrod->transition_to_state = IWARP_MODIFY_QP_STATE_ERROR; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "QP(0x%x)rc=%d\n", qp->icid, rc); + + return rc; +} + +enum qed_iwarp_qp_state qed_roce2iwarp_state(enum qed_roce_qp_state state) +{ + switch (state) { + case QED_ROCE_QP_STATE_RESET: + case QED_ROCE_QP_STATE_INIT: + case QED_ROCE_QP_STATE_RTR: + return QED_IWARP_QP_STATE_IDLE; + case QED_ROCE_QP_STATE_RTS: + return QED_IWARP_QP_STATE_RTS; + case QED_ROCE_QP_STATE_SQD: + return QED_IWARP_QP_STATE_CLOSING; + case QED_ROCE_QP_STATE_ERR: + return QED_IWARP_QP_STATE_ERROR; + case QED_ROCE_QP_STATE_SQE: + return QED_IWARP_QP_STATE_TERMINATE; + default: + return QED_IWARP_QP_STATE_ERROR; + } +} + +static enum qed_roce_qp_state +qed_iwarp2roce_state(enum qed_iwarp_qp_state state) +{ + switch (state) { + case QED_IWARP_QP_STATE_IDLE: + return QED_ROCE_QP_STATE_INIT; + case QED_IWARP_QP_STATE_RTS: + return QED_ROCE_QP_STATE_RTS; + case QED_IWARP_QP_STATE_TERMINATE: + return QED_ROCE_QP_STATE_SQE; + case QED_IWARP_QP_STATE_CLOSING: + return QED_ROCE_QP_STATE_SQD; + case QED_IWARP_QP_STATE_ERROR: + return QED_ROCE_QP_STATE_ERR; + default: + return QED_ROCE_QP_STATE_ERR; + } +} + +const char *iwarp_state_names[] = { + "IDLE", + "RTS", + "TERMINATE", + "CLOSING", + "ERROR", +}; + +int +qed_iwarp_modify_qp(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + enum qed_iwarp_qp_state new_state, bool internal) +{ + enum qed_iwarp_qp_state prev_iw_state; + bool modify_fw = false; + int rc = 0; + + /* modify QP can be called from upper-layer or as a result of async + * RST/FIN... therefore need to protect + */ + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.qp_lock); + prev_iw_state = qp->iwarp_state; + + if (prev_iw_state == new_state) { + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.qp_lock); + return 0; + } + + switch (prev_iw_state) { + case QED_IWARP_QP_STATE_IDLE: + switch (new_state) { + case QED_IWARP_QP_STATE_RTS: + qp->iwarp_state = QED_IWARP_QP_STATE_RTS; + break; + case QED_IWARP_QP_STATE_ERROR: + qp->iwarp_state = QED_IWARP_QP_STATE_ERROR; + if (!internal) + modify_fw = true; + break; + default: + break; + } + break; + case QED_IWARP_QP_STATE_RTS: + switch (new_state) { + case QED_IWARP_QP_STATE_CLOSING: + if (!internal) + modify_fw = true; + + qp->iwarp_state = QED_IWARP_QP_STATE_CLOSING; + break; + case QED_IWARP_QP_STATE_ERROR: + if (!internal) + modify_fw = true; + qp->iwarp_state = QED_IWARP_QP_STATE_ERROR; + break; + default: + break; + } + break; + case QED_IWARP_QP_STATE_ERROR: + switch (new_state) { + case QED_IWARP_QP_STATE_IDLE: + + qp->iwarp_state = new_state; + break; + case QED_IWARP_QP_STATE_CLOSING: + /* could happen due to race... do nothing.... 
*/ + break; + default: + rc = -EINVAL; + } + break; + case QED_IWARP_QP_STATE_TERMINATE: + case QED_IWARP_QP_STATE_CLOSING: + qp->iwarp_state = new_state; + break; + default: + break; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "QP(0x%x) %s --> %s%s\n", + qp->icid, + iwarp_state_names[prev_iw_state], + iwarp_state_names[qp->iwarp_state], + internal ? "internal" : ""); + + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.qp_lock); + + if (modify_fw) + rc = qed_iwarp_modify_fw(p_hwfn, qp); + + return rc; +} + +int qed_iwarp_fw_destroy(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) +{ + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + int rc; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + IWARP_RAMROD_CMD_ID_DESTROY_QP, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) + return rc; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "QP(0x%x) rc = %d\n", qp->icid, rc); + + return rc; +} + +static void qed_iwarp_destroy_ep(struct qed_hwfn *p_hwfn, + struct qed_iwarp_ep *ep, + bool remove_from_active_list) +{ + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*ep->ep_buffer_virt), + ep->ep_buffer_virt, ep->ep_buffer_phys); + + if (remove_from_active_list) { + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + list_del(&ep->list_entry); + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + } + + if (ep->qp) + ep->qp->ep = NULL; + + kfree(ep); +} + +int qed_iwarp_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) +{ + struct qed_iwarp_ep *ep = qp->ep; + int wait_count = 0; + int rc = 0; + + if (qp->iwarp_state != QED_IWARP_QP_STATE_ERROR) { + rc = qed_iwarp_modify_qp(p_hwfn, qp, + QED_IWARP_QP_STATE_ERROR, false); + if (rc) + return rc; + } + + /* Make sure ep is closed before returning and freeing memory. 
*/ + if (ep) { + while (ep->state != QED_IWARP_EP_CLOSED && wait_count++ < 200) + msleep(100); + + if (ep->state != QED_IWARP_EP_CLOSED) + DP_NOTICE(p_hwfn, "ep state close timeout state=%x\n", + ep->state); + + qed_iwarp_destroy_ep(p_hwfn, ep, false); + } + + rc = qed_iwarp_fw_destroy(p_hwfn, qp); + + if (qp->shared_queue) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + IWARP_SHARED_QUEUE_PAGE_SIZE, + qp->shared_queue, qp->shared_queue_phys_addr); + + return rc; +} + +static int +qed_iwarp_create_ep(struct qed_hwfn *p_hwfn, struct qed_iwarp_ep **ep_out) +{ + struct qed_iwarp_ep *ep; + int rc; + + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (!ep) + return -ENOMEM; + + ep->state = QED_IWARP_EP_INIT; + + ep->ep_buffer_virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*ep->ep_buffer_virt), + &ep->ep_buffer_phys, + GFP_KERNEL); + if (!ep->ep_buffer_virt) { + rc = -ENOMEM; + goto err; + } + + ep->sig = QED_EP_SIG; + + *ep_out = ep; + + return 0; + +err: + kfree(ep); + return rc; +} + +static void +qed_iwarp_print_tcp_ramrod(struct qed_hwfn *p_hwfn, + struct iwarp_tcp_offload_ramrod_data *p_tcp_ramrod) +{ + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "local_mac=%x %x %x, remote_mac=%x %x %x\n", + p_tcp_ramrod->tcp.local_mac_addr_lo, + p_tcp_ramrod->tcp.local_mac_addr_mid, + p_tcp_ramrod->tcp.local_mac_addr_hi, + p_tcp_ramrod->tcp.remote_mac_addr_lo, + p_tcp_ramrod->tcp.remote_mac_addr_mid, + p_tcp_ramrod->tcp.remote_mac_addr_hi); + + if (p_tcp_ramrod->tcp.ip_version == TCP_IPV4) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "local_ip=%pI4h:%x, remote_ip=%pI4h:%x, vlan=%x\n", + p_tcp_ramrod->tcp.local_ip, + p_tcp_ramrod->tcp.local_port, + p_tcp_ramrod->tcp.remote_ip, + p_tcp_ramrod->tcp.remote_port, + p_tcp_ramrod->tcp.vlan_id); + } else { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "local_ip=%pI6:%x, remote_ip=%pI6:%x, vlan=%x\n", + p_tcp_ramrod->tcp.local_ip, + p_tcp_ramrod->tcp.local_port, + p_tcp_ramrod->tcp.remote_ip, + p_tcp_ramrod->tcp.remote_port, + p_tcp_ramrod->tcp.vlan_id); + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "flow_label=%x, ttl=%x, tos_or_tc=%x, mss=%x, rcv_wnd_scale=%x, connect_mode=%x, flags=%x\n", + p_tcp_ramrod->tcp.flow_label, + p_tcp_ramrod->tcp.ttl, + p_tcp_ramrod->tcp.tos_or_tc, + p_tcp_ramrod->tcp.mss, + p_tcp_ramrod->tcp.rcv_wnd_scale, + p_tcp_ramrod->tcp.connect_mode, + p_tcp_ramrod->tcp.flags); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "syn_ip_payload_length=%x, lo=%x, hi=%x\n", + p_tcp_ramrod->tcp.syn_ip_payload_length, + p_tcp_ramrod->tcp.syn_phy_addr_lo, + p_tcp_ramrod->tcp.syn_phy_addr_hi); +} + +static int +qed_iwarp_tcp_offload(struct qed_hwfn *p_hwfn, struct qed_iwarp_ep *ep) +{ + struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp; + struct iwarp_tcp_offload_ramrod_data *p_tcp_ramrod; + struct tcp_offload_params_opt2 *tcp; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + dma_addr_t async_output_phys; + dma_addr_t in_pdata_phys; + u16 physical_q; + u8 tcp_flags; + int rc; + int i; + + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = ep->tcp_cid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + if (ep->connect_mode == TCP_CONNECT_PASSIVE) + init_data.comp_mode = QED_SPQ_MODE_CB; + else + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + IWARP_RAMROD_CMD_ID_TCP_OFFLOAD, + PROTOCOLID_IWARP, &init_data); + if (rc) + return rc; + + p_tcp_ramrod = &p_ent->ramrod.iwarp_tcp_offload; + + in_pdata_phys = ep->ep_buffer_phys + + offsetof(struct qed_iwarp_ep_memory, in_pdata); + 
DMA_REGPAIR_LE(p_tcp_ramrod->iwarp.incoming_ulp_buffer.addr, + in_pdata_phys); + + p_tcp_ramrod->iwarp.incoming_ulp_buffer.len = + cpu_to_le16(sizeof(ep->ep_buffer_virt->in_pdata)); + + async_output_phys = ep->ep_buffer_phys + + offsetof(struct qed_iwarp_ep_memory, async_output); + DMA_REGPAIR_LE(p_tcp_ramrod->iwarp.async_eqe_output_buf, + async_output_phys); + + p_tcp_ramrod->iwarp.handle_for_async.hi = cpu_to_le32(PTR_HI(ep)); + p_tcp_ramrod->iwarp.handle_for_async.lo = cpu_to_le32(PTR_LO(ep)); + + physical_q = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_OFLD); + p_tcp_ramrod->iwarp.physical_q0 = cpu_to_le16(physical_q); + physical_q = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_ACK); + p_tcp_ramrod->iwarp.physical_q1 = cpu_to_le16(physical_q); + p_tcp_ramrod->iwarp.mpa_mode = iwarp_info->mpa_rev; + + tcp = &p_tcp_ramrod->tcp; + qed_set_fw_mac_addr(&tcp->remote_mac_addr_hi, + &tcp->remote_mac_addr_mid, + &tcp->remote_mac_addr_lo, ep->remote_mac_addr); + qed_set_fw_mac_addr(&tcp->local_mac_addr_hi, &tcp->local_mac_addr_mid, + &tcp->local_mac_addr_lo, ep->local_mac_addr); + + tcp->vlan_id = cpu_to_le16(ep->cm_info.vlan); + + tcp_flags = p_hwfn->p_rdma_info->iwarp.tcp_flags; + tcp->flags = 0; + SET_FIELD(tcp->flags, TCP_OFFLOAD_PARAMS_OPT2_TS_EN, + !!(tcp_flags & QED_IWARP_TS_EN)); + + SET_FIELD(tcp->flags, TCP_OFFLOAD_PARAMS_OPT2_DA_EN, + !!(tcp_flags & QED_IWARP_DA_EN)); + + tcp->ip_version = ep->cm_info.ip_version; + + for (i = 0; i < 4; i++) { + tcp->remote_ip[i] = cpu_to_le32(ep->cm_info.remote_ip[i]); + tcp->local_ip[i] = cpu_to_le32(ep->cm_info.local_ip[i]); + } + + tcp->remote_port = cpu_to_le16(ep->cm_info.remote_port); + tcp->local_port = cpu_to_le16(ep->cm_info.local_port); + tcp->mss = cpu_to_le16(ep->mss); + tcp->flow_label = 0; + tcp->ttl = 0x40; + tcp->tos_or_tc = 0; + + tcp->max_rt_time = QED_IWARP_DEF_MAX_RT_TIME; + tcp->cwnd = QED_IWARP_DEF_CWND_FACTOR * tcp->mss; + tcp->ka_max_probe_cnt = QED_IWARP_DEF_KA_MAX_PROBE_CNT; + tcp->ka_timeout = QED_IWARP_DEF_KA_TIMEOUT; + tcp->ka_interval = QED_IWARP_DEF_KA_INTERVAL; + + tcp->rcv_wnd_scale = (u8)p_hwfn->p_rdma_info->iwarp.rcv_wnd_scale; + tcp->connect_mode = ep->connect_mode; + + if (ep->connect_mode == TCP_CONNECT_PASSIVE) { + tcp->syn_ip_payload_length = + cpu_to_le16(ep->syn_ip_payload_length); + tcp->syn_phy_addr_hi = DMA_HI_LE(ep->syn_phy_addr); + tcp->syn_phy_addr_lo = DMA_LO_LE(ep->syn_phy_addr); + } + + qed_iwarp_print_tcp_ramrod(p_hwfn, p_tcp_ramrod); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "EP(0x%x) Offload completed rc=%d\n", ep->tcp_cid, rc); + + return rc; +} + +static void +qed_iwarp_mpa_received(struct qed_hwfn *p_hwfn, struct qed_iwarp_ep *ep) +{ + struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp; + struct qed_iwarp_cm_event_params params; + struct mpa_v2_hdr *mpa_v2; + union async_output *async_data; + u16 mpa_ord, mpa_ird; + u8 mpa_hdr_size = 0; + u8 mpa_rev; + + async_data = &ep->ep_buffer_virt->async_output; + + mpa_rev = async_data->mpa_request.mpa_handshake_mode; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "private_data_len=%x handshake_mode=%x private_data=(%x)\n", + async_data->mpa_request.ulp_data_len, + mpa_rev, *((u32 *)(ep->ep_buffer_virt->in_pdata))); + + if (mpa_rev == MPA_NEGOTIATION_TYPE_ENHANCED) { + /* Read ord/ird values from private data buffer */ + mpa_v2 = (struct mpa_v2_hdr *)ep->ep_buffer_virt->in_pdata; + mpa_hdr_size = sizeof(*mpa_v2); + + mpa_ord = ntohs(mpa_v2->ord); + mpa_ird = ntohs(mpa_v2->ird); + + /* Temprary store in cm_info incoming ord/ird 
requested, later + * replace with negotiated value during accept + */ + ep->cm_info.ord = (u8)min_t(u16, + (mpa_ord & MPA_V2_IRD_ORD_MASK), + QED_IWARP_ORD_DEFAULT); + + ep->cm_info.ird = (u8)min_t(u16, + (mpa_ird & MPA_V2_IRD_ORD_MASK), + QED_IWARP_IRD_DEFAULT); + + /* Peer2Peer negotiation */ + ep->rtr_type = MPA_RTR_TYPE_NONE; + if (mpa_ird & MPA_V2_PEER2PEER_MODEL) { + if (mpa_ord & MPA_V2_WRITE_RTR) + ep->rtr_type |= MPA_RTR_TYPE_ZERO_WRITE; + + if (mpa_ord & MPA_V2_READ_RTR) + ep->rtr_type |= MPA_RTR_TYPE_ZERO_READ; + + if (mpa_ird & MPA_V2_SEND_RTR) + ep->rtr_type |= MPA_RTR_TYPE_ZERO_SEND; + + ep->rtr_type &= iwarp_info->rtr_type; + + /* if we're left with no match send our capabilities */ + if (ep->rtr_type == MPA_RTR_TYPE_NONE) + ep->rtr_type = iwarp_info->rtr_type; + } + + ep->mpa_rev = MPA_NEGOTIATION_TYPE_ENHANCED; + } else { + ep->cm_info.ord = QED_IWARP_ORD_DEFAULT; + ep->cm_info.ird = QED_IWARP_IRD_DEFAULT; + ep->mpa_rev = MPA_NEGOTIATION_TYPE_BASIC; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "MPA_NEGOTIATE (v%d): ORD: 0x%x IRD: 0x%x rtr:0x%x ulp_data_len = %x mpa_hdr_size = %x\n", + mpa_rev, ep->cm_info.ord, ep->cm_info.ird, ep->rtr_type, + async_data->mpa_request.ulp_data_len, mpa_hdr_size); + + /* Strip mpa v2 hdr from private data before sending to upper layer */ + ep->cm_info.private_data = ep->ep_buffer_virt->in_pdata + mpa_hdr_size; + + ep->cm_info.private_data_len = async_data->mpa_request.ulp_data_len - + mpa_hdr_size; + + params.event = QED_IWARP_EVENT_MPA_REQUEST; + params.cm_info = &ep->cm_info; + params.ep_context = ep; + params.status = 0; + + ep->state = QED_IWARP_EP_MPA_REQ_RCVD; + ep->event_cb(ep->cb_context, ¶ms); +} + +static int +qed_iwarp_mpa_offload(struct qed_hwfn *p_hwfn, struct qed_iwarp_ep *ep) +{ + struct iwarp_mpa_offload_ramrod_data *p_mpa_ramrod; + struct qed_iwarp_info *iwarp_info; + struct qed_sp_init_data init_data; + dma_addr_t async_output_phys; + struct qed_spq_entry *p_ent; + dma_addr_t out_pdata_phys; + dma_addr_t in_pdata_phys; + struct qed_rdma_qp *qp; + bool reject; + int rc; + + if (!ep) + return -EINVAL; + + qp = ep->qp; + reject = !qp; + + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = reject ? 
ep->tcp_cid : qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + + if (ep->connect_mode == TCP_CONNECT_ACTIVE) + init_data.comp_mode = QED_SPQ_MODE_CB; + else + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + IWARP_RAMROD_CMD_ID_MPA_OFFLOAD, + PROTOCOLID_IWARP, &init_data); + if (rc) + return rc; + + p_mpa_ramrod = &p_ent->ramrod.iwarp_mpa_offload; + out_pdata_phys = ep->ep_buffer_phys + + offsetof(struct qed_iwarp_ep_memory, out_pdata); + DMA_REGPAIR_LE(p_mpa_ramrod->common.outgoing_ulp_buffer.addr, + out_pdata_phys); + p_mpa_ramrod->common.outgoing_ulp_buffer.len = + ep->cm_info.private_data_len; + p_mpa_ramrod->common.crc_needed = p_hwfn->p_rdma_info->iwarp.crc_needed; + + p_mpa_ramrod->common.out_rq.ord = ep->cm_info.ord; + p_mpa_ramrod->common.out_rq.ird = ep->cm_info.ird; + + p_mpa_ramrod->tcp_cid = p_hwfn->hw_info.opaque_fid << 16 | ep->tcp_cid; + + in_pdata_phys = ep->ep_buffer_phys + + offsetof(struct qed_iwarp_ep_memory, in_pdata); + p_mpa_ramrod->tcp_connect_side = ep->connect_mode; + DMA_REGPAIR_LE(p_mpa_ramrod->incoming_ulp_buffer.addr, + in_pdata_phys); + p_mpa_ramrod->incoming_ulp_buffer.len = + cpu_to_le16(sizeof(ep->ep_buffer_virt->in_pdata)); + async_output_phys = ep->ep_buffer_phys + + offsetof(struct qed_iwarp_ep_memory, async_output); + DMA_REGPAIR_LE(p_mpa_ramrod->async_eqe_output_buf, + async_output_phys); + p_mpa_ramrod->handle_for_async.hi = cpu_to_le32(PTR_HI(ep)); + p_mpa_ramrod->handle_for_async.lo = cpu_to_le32(PTR_LO(ep)); + + if (!reject) { + DMA_REGPAIR_LE(p_mpa_ramrod->shared_queue_addr, + qp->shared_queue_phys_addr); + p_mpa_ramrod->stats_counter_id = + RESC_START(p_hwfn, QED_RDMA_STATS_QUEUE) + qp->stats_queue; + } else { + p_mpa_ramrod->common.reject = 1; + } + + iwarp_info = &p_hwfn->p_rdma_info->iwarp; + p_mpa_ramrod->rcv_wnd = iwarp_info->rcv_wnd_size; + p_mpa_ramrod->mode = ep->mpa_rev; + SET_FIELD(p_mpa_ramrod->rtr_pref, + IWARP_MPA_OFFLOAD_RAMROD_DATA_RTR_SUPPORTED, ep->rtr_type); + + ep->state = QED_IWARP_EP_MPA_OFFLOADED; + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (!reject) + ep->cid = qp->icid; /* Now they're migrated. */ + + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "QP(0x%x) EP(0x%x) MPA Offload rc = %d IRD=0x%x ORD=0x%x rtr_type=%d mpa_rev=%d reject=%d\n", + reject ? 
0xffff : qp->icid, + ep->tcp_cid, + rc, + ep->cm_info.ird, + ep->cm_info.ord, ep->rtr_type, ep->mpa_rev, reject); + return rc; +} + +static void +qed_iwarp_return_ep(struct qed_hwfn *p_hwfn, struct qed_iwarp_ep *ep) +{ + ep->state = QED_IWARP_EP_INIT; + if (ep->qp) + ep->qp->ep = NULL; + ep->qp = NULL; + memset(&ep->cm_info, 0, sizeof(ep->cm_info)); + + if (ep->tcp_cid == QED_IWARP_INVALID_TCP_CID) { + /* We don't care about the return code, it's ok if tcp_cid + * remains invalid...in this case we'll defer allocation + */ + qed_iwarp_alloc_tcp_cid(p_hwfn, &ep->tcp_cid); + } + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + + list_del(&ep->list_entry); + list_add_tail(&ep->list_entry, + &p_hwfn->p_rdma_info->iwarp.ep_free_list); + + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); +} + +void +qed_iwarp_parse_private_data(struct qed_hwfn *p_hwfn, struct qed_iwarp_ep *ep) +{ + struct mpa_v2_hdr *mpa_v2_params; + union async_output *async_data; + u16 mpa_ird, mpa_ord; + u8 mpa_data_size = 0; + + if (MPA_REV2(p_hwfn->p_rdma_info->iwarp.mpa_rev)) { + mpa_v2_params = + (struct mpa_v2_hdr *)(ep->ep_buffer_virt->in_pdata); + mpa_data_size = sizeof(*mpa_v2_params); + mpa_ird = ntohs(mpa_v2_params->ird); + mpa_ord = ntohs(mpa_v2_params->ord); + + ep->cm_info.ird = (u8)(mpa_ord & MPA_V2_IRD_ORD_MASK); + ep->cm_info.ord = (u8)(mpa_ird & MPA_V2_IRD_ORD_MASK); + } + async_data = &ep->ep_buffer_virt->async_output; + + ep->cm_info.private_data = ep->ep_buffer_virt->in_pdata + mpa_data_size; + ep->cm_info.private_data_len = async_data->mpa_response.ulp_data_len - + mpa_data_size; +} + +void +qed_iwarp_mpa_reply_arrived(struct qed_hwfn *p_hwfn, struct qed_iwarp_ep *ep) +{ + struct qed_iwarp_cm_event_params params; + + if (ep->connect_mode == TCP_CONNECT_PASSIVE) { + DP_NOTICE(p_hwfn, + "MPA reply event not expected on passive side!\n"); + return; + } + + params.event = QED_IWARP_EVENT_ACTIVE_MPA_REPLY; + + qed_iwarp_parse_private_data(p_hwfn, ep); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "MPA_NEGOTIATE (v%d): ORD: 0x%x IRD: 0x%x\n", + ep->mpa_rev, ep->cm_info.ord, ep->cm_info.ird); + + params.cm_info = &ep->cm_info; + params.ep_context = ep; + params.status = 0; + + ep->mpa_reply_processed = true; + + ep->event_cb(ep->cb_context, ¶ms); +} + +#define QED_IWARP_CONNECT_MODE_STRING(ep) \ + ((ep)->connect_mode == TCP_CONNECT_PASSIVE) ? 
"Passive" : "Active" + +/* Called as a result of the event: + * IWARP_EVENT_TYPE_ASYNC_MPA_HANDSHAKE_COMPLETE + */ +static void +qed_iwarp_mpa_complete(struct qed_hwfn *p_hwfn, + struct qed_iwarp_ep *ep, u8 fw_return_code) +{ + struct qed_iwarp_cm_event_params params; + + if (ep->connect_mode == TCP_CONNECT_ACTIVE) + params.event = QED_IWARP_EVENT_ACTIVE_COMPLETE; + else + params.event = QED_IWARP_EVENT_PASSIVE_COMPLETE; + + if (ep->connect_mode == TCP_CONNECT_ACTIVE && !ep->mpa_reply_processed) + qed_iwarp_parse_private_data(p_hwfn, ep); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "MPA_NEGOTIATE (v%d): ORD: 0x%x IRD: 0x%x\n", + ep->mpa_rev, ep->cm_info.ord, ep->cm_info.ird); + + params.cm_info = &ep->cm_info; + + params.ep_context = ep; + + ep->state = QED_IWARP_EP_CLOSED; + + switch (fw_return_code) { + case RDMA_RETURN_OK: + ep->qp->max_rd_atomic_req = ep->cm_info.ord; + ep->qp->max_rd_atomic_resp = ep->cm_info.ird; + qed_iwarp_modify_qp(p_hwfn, ep->qp, QED_IWARP_QP_STATE_RTS, 1); + ep->state = QED_IWARP_EP_ESTABLISHED; + params.status = 0; + break; + case IWARP_CONN_ERROR_MPA_TIMEOUT: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA timeout\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->cid); + params.status = -EBUSY; + break; + case IWARP_CONN_ERROR_MPA_ERROR_REJECT: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA Reject\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->cid); + params.status = -ECONNREFUSED; + break; + case IWARP_CONN_ERROR_MPA_RST: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA reset(tcp cid: 0x%x)\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->cid, + ep->tcp_cid); + params.status = -ECONNRESET; + break; + case IWARP_CONN_ERROR_MPA_FIN: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA received FIN\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->cid); + params.status = -ECONNREFUSED; + break; + case IWARP_CONN_ERROR_MPA_INSUF_IRD: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA insufficient ird\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->cid); + params.status = -ECONNREFUSED; + break; + case IWARP_CONN_ERROR_MPA_RTR_MISMATCH: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA RTR MISMATCH\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->cid); + params.status = -ECONNREFUSED; + break; + case IWARP_CONN_ERROR_MPA_INVALID_PACKET: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA Invalid Packet\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->cid); + params.status = -ECONNREFUSED; + break; + case IWARP_CONN_ERROR_MPA_LOCAL_ERROR: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA Local Error\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->cid); + params.status = -ECONNREFUSED; + break; + case IWARP_CONN_ERROR_MPA_TERMINATE: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA TERMINATE\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->cid); + params.status = -ECONNREFUSED; + break; + default: + params.status = -ECONNRESET; + break; + } + + ep->event_cb(ep->cb_context, ¶ms); + + /* on passive side, if there is no associated QP (REJECT) we need to + * return the ep to the pool, (in the regular case we add an element + * in accept instead of this one. + * In both cases we need to remove it from the ep_list. 
+ */ + if (fw_return_code != RDMA_RETURN_OK) { + ep->tcp_cid = QED_IWARP_INVALID_TCP_CID; + if ((ep->connect_mode == TCP_CONNECT_PASSIVE) && + (!ep->qp)) { /* Rejected */ + qed_iwarp_return_ep(p_hwfn, ep); + } else { + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + list_del(&ep->list_entry); + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + } + } +} + +static void +qed_iwarp_mpa_v2_set_private(struct qed_hwfn *p_hwfn, + struct qed_iwarp_ep *ep, u8 *mpa_data_size) +{ + struct mpa_v2_hdr *mpa_v2_params; + u16 mpa_ird, mpa_ord; + + *mpa_data_size = 0; + if (MPA_REV2(ep->mpa_rev)) { + mpa_v2_params = + (struct mpa_v2_hdr *)ep->ep_buffer_virt->out_pdata; + *mpa_data_size = sizeof(*mpa_v2_params); + + mpa_ird = (u16)ep->cm_info.ird; + mpa_ord = (u16)ep->cm_info.ord; + + if (ep->rtr_type != MPA_RTR_TYPE_NONE) { + mpa_ird |= MPA_V2_PEER2PEER_MODEL; + + if (ep->rtr_type & MPA_RTR_TYPE_ZERO_SEND) + mpa_ird |= MPA_V2_SEND_RTR; + + if (ep->rtr_type & MPA_RTR_TYPE_ZERO_WRITE) + mpa_ord |= MPA_V2_WRITE_RTR; + + if (ep->rtr_type & MPA_RTR_TYPE_ZERO_READ) + mpa_ord |= MPA_V2_READ_RTR; + } + + mpa_v2_params->ird = htons(mpa_ird); + mpa_v2_params->ord = htons(mpa_ord); + + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA_NEGOTIATE Header: [%x ord:%x ird] %x ord:%x ird:%x peer2peer:%x rtr_send:%x rtr_write:%x rtr_read:%x\n", + mpa_v2_params->ird, + mpa_v2_params->ord, + *((u32 *)mpa_v2_params), + mpa_ord & MPA_V2_IRD_ORD_MASK, + mpa_ird & MPA_V2_IRD_ORD_MASK, + !!(mpa_ird & MPA_V2_PEER2PEER_MODEL), + !!(mpa_ird & MPA_V2_SEND_RTR), + !!(mpa_ord & MPA_V2_WRITE_RTR), + !!(mpa_ord & MPA_V2_READ_RTR)); + } +} + +int qed_iwarp_connect(void *rdma_cxt, + struct qed_iwarp_connect_in *iparams, + struct qed_iwarp_connect_out *oparams) +{ + struct qed_hwfn *p_hwfn = rdma_cxt; + struct qed_iwarp_info *iwarp_info; + struct qed_iwarp_ep *ep; + u8 mpa_data_size = 0; + u8 ts_hdr_size = 0; + u32 cid; + int rc; + + if ((iparams->cm_info.ord > QED_IWARP_ORD_DEFAULT) || + (iparams->cm_info.ird > QED_IWARP_IRD_DEFAULT)) { + DP_NOTICE(p_hwfn, + "QP(0x%x) ERROR: Invalid ord(0x%x)/ird(0x%x)\n", + iparams->qp->icid, iparams->cm_info.ord, + iparams->cm_info.ird); + + return -EINVAL; + } + + iwarp_info = &p_hwfn->p_rdma_info->iwarp; + + /* Allocate ep object */ + rc = qed_iwarp_alloc_cid(p_hwfn, &cid); + if (rc) + return rc; + + rc = qed_iwarp_create_ep(p_hwfn, &ep); + if (rc) + goto err; + + ep->tcp_cid = cid; + + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + list_add_tail(&ep->list_entry, &p_hwfn->p_rdma_info->iwarp.ep_list); + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + + ep->qp = iparams->qp; + ep->qp->ep = ep; + ether_addr_copy(ep->remote_mac_addr, iparams->remote_mac_addr); + ether_addr_copy(ep->local_mac_addr, iparams->local_mac_addr); + memcpy(&ep->cm_info, &iparams->cm_info, sizeof(ep->cm_info)); + + ep->cm_info.ord = iparams->cm_info.ord; + ep->cm_info.ird = iparams->cm_info.ird; + + ep->rtr_type = iwarp_info->rtr_type; + if (!iwarp_info->peer2peer) + ep->rtr_type = MPA_RTR_TYPE_NONE; + + if ((ep->rtr_type & MPA_RTR_TYPE_ZERO_READ) && (ep->cm_info.ord == 0)) + ep->cm_info.ord = 1; + + ep->mpa_rev = iwarp_info->mpa_rev; + + qed_iwarp_mpa_v2_set_private(p_hwfn, ep, &mpa_data_size); + + ep->cm_info.private_data = ep->ep_buffer_virt->out_pdata; + ep->cm_info.private_data_len = iparams->cm_info.private_data_len + + mpa_data_size; + + memcpy((u8 *)ep->ep_buffer_virt->out_pdata + mpa_data_size, + iparams->cm_info.private_data, + iparams->cm_info.private_data_len); + + if (p_hwfn->p_rdma_info->iwarp.tcp_flags 
& QED_IWARP_TS_EN) + ts_hdr_size = TIMESTAMP_HEADER_SIZE; + + ep->mss = iparams->mss - ts_hdr_size; + ep->mss = min_t(u16, QED_IWARP_MAX_FW_MSS, ep->mss); + + ep->event_cb = iparams->event_cb; + ep->cb_context = iparams->cb_context; + ep->connect_mode = TCP_CONNECT_ACTIVE; + + oparams->ep_context = ep; + + rc = qed_iwarp_tcp_offload(p_hwfn, ep); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "QP(0x%x) EP(0x%x) rc = %d\n", + iparams->qp->icid, ep->tcp_cid, rc); + + if (rc) { + qed_iwarp_destroy_ep(p_hwfn, ep, true); + goto err; + } + + return rc; +err: + qed_iwarp_cid_cleaned(p_hwfn, cid); + + return rc; +} + +static struct qed_iwarp_ep *qed_iwarp_get_free_ep(struct qed_hwfn *p_hwfn) +{ + struct qed_iwarp_ep *ep = NULL; + int rc; + + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + + if (list_empty(&p_hwfn->p_rdma_info->iwarp.ep_free_list)) { + DP_ERR(p_hwfn, "Ep list is empty\n"); + goto out; + } + + ep = list_first_entry(&p_hwfn->p_rdma_info->iwarp.ep_free_list, + struct qed_iwarp_ep, list_entry); + + /* in some cases we could have failed allocating a tcp cid when added + * from accept / failure... retry now..this is not the common case. + */ + if (ep->tcp_cid == QED_IWARP_INVALID_TCP_CID) { + rc = qed_iwarp_alloc_tcp_cid(p_hwfn, &ep->tcp_cid); + + /* if we fail we could look for another entry with a valid + * tcp_cid, but since we don't expect to reach this anyway + * it's not worth the handling + */ + if (rc) { + ep->tcp_cid = QED_IWARP_INVALID_TCP_CID; + ep = NULL; + goto out; + } + } + + list_del(&ep->list_entry); + +out: + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + return ep; +} + +#define QED_IWARP_MAX_CID_CLEAN_TIME 100 +#define QED_IWARP_MAX_NO_PROGRESS_CNT 5 + +/* This function waits for all the bits of a bmap to be cleared, as long as + * there is progress ( i.e. the number of bits left to be cleared decreases ) + * the function continues. 
+ */ +static int +qed_iwarp_wait_cid_map_cleared(struct qed_hwfn *p_hwfn, struct qed_bmap *bmap) +{ + int prev_weight = 0; + int wait_count = 0; + int weight = 0; + + weight = bitmap_weight(bmap->bitmap, bmap->max_count); + prev_weight = weight; + + while (weight) { + msleep(QED_IWARP_MAX_CID_CLEAN_TIME); + + weight = bitmap_weight(bmap->bitmap, bmap->max_count); + + if (prev_weight == weight) { + wait_count++; + } else { + prev_weight = weight; + wait_count = 0; + } + + if (wait_count > QED_IWARP_MAX_NO_PROGRESS_CNT) { + DP_NOTICE(p_hwfn, + "%s bitmap wait timed out (%d cids pending)\n", + bmap->name, weight); + return -EBUSY; + } + } + return 0; +} + +static int qed_iwarp_wait_for_all_cids(struct qed_hwfn *p_hwfn) +{ + int rc; + int i; + + rc = qed_iwarp_wait_cid_map_cleared(p_hwfn, + &p_hwfn->p_rdma_info->tcp_cid_map); + if (rc) + return rc; + + /* Now free the tcp cids from the main cid map */ + for (i = 0; i < QED_IWARP_PREALLOC_CNT; i++) + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->cid_map, i); + + /* Now wait for all cids to be completed */ + return qed_iwarp_wait_cid_map_cleared(p_hwfn, + &p_hwfn->p_rdma_info->cid_map); +} + +static void qed_iwarp_free_prealloc_ep(struct qed_hwfn *p_hwfn) +{ + struct qed_iwarp_ep *ep; + + while (!list_empty(&p_hwfn->p_rdma_info->iwarp.ep_free_list)) { + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + + ep = list_first_entry(&p_hwfn->p_rdma_info->iwarp.ep_free_list, + struct qed_iwarp_ep, list_entry); + + if (!ep) { + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + break; + } + list_del(&ep->list_entry); + + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + + if (ep->tcp_cid != QED_IWARP_INVALID_TCP_CID) + qed_iwarp_cid_cleaned(p_hwfn, ep->tcp_cid); + + qed_iwarp_destroy_ep(p_hwfn, ep, false); + } +} + +static int qed_iwarp_prealloc_ep(struct qed_hwfn *p_hwfn, bool init) +{ + struct qed_iwarp_ep *ep; + int rc = 0; + int count; + u32 cid; + int i; + + count = init ? QED_IWARP_PREALLOC_CNT : 1; + for (i = 0; i < count; i++) { + rc = qed_iwarp_create_ep(p_hwfn, &ep); + if (rc) + return rc; + + /* During initialization we allocate from the main pool, + * afterwards we allocate only from the tcp_cid. + */ + if (init) { + rc = qed_iwarp_alloc_cid(p_hwfn, &cid); + if (rc) + goto err; + qed_iwarp_set_tcp_cid(p_hwfn, cid); + } else { + /* We don't care about the return code, it's ok if + * tcp_cid remains invalid...in this case we'll + * defer allocation + */ + qed_iwarp_alloc_tcp_cid(p_hwfn, &cid); + } + + ep->tcp_cid = cid; + + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + list_add_tail(&ep->list_entry, + &p_hwfn->p_rdma_info->iwarp.ep_free_list); + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + } + + return rc; + +err: + qed_iwarp_destroy_ep(p_hwfn, ep, false); + + return rc; +} + +int qed_iwarp_alloc(struct qed_hwfn *p_hwfn) +{ + int rc; + + /* Allocate bitmap for tcp cid. 
These are used by passive side
+	 * to ensure it can allocate a tcp cid during dpc that was
+	 * pre-acquired and doesn't require dynamic allocation of ilt
+	 */
+	rc = qed_rdma_bmap_alloc(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map,
+				 QED_IWARP_PREALLOC_CNT, "TCP_CID");
+	if (rc) {
+		DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+			   "Failed to allocate tcp cid, rc = %d\n", rc);
+		return rc;
+	}
+
+	INIT_LIST_HEAD(&p_hwfn->p_rdma_info->iwarp.ep_free_list);
+	spin_lock_init(&p_hwfn->p_rdma_info->iwarp.iw_lock);
+
+	rc = qed_iwarp_prealloc_ep(p_hwfn, true);
+	if (rc)
+		return rc;
+
+	return qed_ooo_alloc(p_hwfn);
+}
+
+void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn)
+{
+	struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+
+	qed_ooo_free(p_hwfn);
+	qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map, 1);
+	kfree(iwarp_info->mpa_bufs);
+	kfree(iwarp_info->partial_fpdus);
+	kfree(iwarp_info->mpa_intermediate_buf);
+}
+
+int qed_iwarp_accept(void *rdma_cxt, struct qed_iwarp_accept_in *iparams)
+{
+	struct qed_hwfn *p_hwfn = rdma_cxt;
+	struct qed_iwarp_ep *ep;
+	u8 mpa_data_size = 0;
+	int rc;
+
+	ep = iparams->ep_context;
+	if (!ep) {
+		DP_ERR(p_hwfn, "Ep Context received in accept is NULL\n");
+		return -EINVAL;
+	}
+
+	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "QP(0x%x) EP(0x%x)\n",
+		   iparams->qp->icid, ep->tcp_cid);
+
+	if ((iparams->ord > QED_IWARP_ORD_DEFAULT) ||
+	    (iparams->ird > QED_IWARP_IRD_DEFAULT)) {
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_RDMA,
+			   "QP(0x%x) EP(0x%x) ERROR: Invalid ord(0x%x)/ird(0x%x)\n",
+			   iparams->qp->icid,
+			   ep->tcp_cid, iparams->ord, iparams->ird);
+		return -EINVAL;
+	}
+
+	qed_iwarp_prealloc_ep(p_hwfn, false);
+
+	ep->cb_context = iparams->cb_context;
+	ep->qp = iparams->qp;
+	ep->qp->ep = ep;
+
+	if (ep->mpa_rev == MPA_NEGOTIATION_TYPE_ENHANCED) {
+		/* Negotiate ord/ird: if upperlayer requested ord larger than
+		 * ird advertised by remote, we need to decrease our ord
+		 */
+		if (iparams->ord > ep->cm_info.ird)
+			iparams->ord = ep->cm_info.ird;
+
+		if ((ep->rtr_type & MPA_RTR_TYPE_ZERO_READ) &&
+		    (iparams->ird == 0))
+			iparams->ird = 1;
+	}
+
+	/* Update cm_info ord/ird to be negotiated values */
+	ep->cm_info.ord = iparams->ord;
+	ep->cm_info.ird = iparams->ird;
+
+	qed_iwarp_mpa_v2_set_private(p_hwfn, ep, &mpa_data_size);
+
+	ep->cm_info.private_data = ep->ep_buffer_virt->out_pdata;
+	ep->cm_info.private_data_len = iparams->private_data_len +
+				       mpa_data_size;
+
+	memcpy((u8 *)ep->ep_buffer_virt->out_pdata + mpa_data_size,
+	       iparams->private_data, iparams->private_data_len);
+
+	rc = qed_iwarp_mpa_offload(p_hwfn, ep);
+	if (rc)
+		qed_iwarp_modify_qp(p_hwfn,
+				    iparams->qp, QED_IWARP_QP_STATE_ERROR, 1);
+
+	return rc;
+}
+
+int qed_iwarp_reject(void *rdma_cxt, struct qed_iwarp_reject_in *iparams)
+{
+	struct qed_hwfn *p_hwfn = rdma_cxt;
+	struct qed_iwarp_ep *ep;
+	u8 mpa_data_size = 0;
+
+	ep = iparams->ep_context;
+	if (!ep) {
+		DP_ERR(p_hwfn, "Ep Context received in reject is NULL\n");
+		return -EINVAL;
+	}
+
+	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "EP(0x%x)\n", ep->tcp_cid);
+
+	ep->cb_context = iparams->cb_context;
+	ep->qp = NULL;
+
+	qed_iwarp_mpa_v2_set_private(p_hwfn, ep, &mpa_data_size);
+
+	ep->cm_info.private_data = ep->ep_buffer_virt->out_pdata;
+	ep->cm_info.private_data_len = iparams->private_data_len +
+				       mpa_data_size;
+
+	memcpy((u8 *)ep->ep_buffer_virt->out_pdata + mpa_data_size,
+	       iparams->private_data, iparams->private_data_len);
+
+	return qed_iwarp_mpa_offload(p_hwfn, ep);
+}
+
+static void
+qed_iwarp_print_cm_info(struct qed_hwfn *p_hwfn,
+			struct 
qed_iwarp_cm_info *cm_info) +{ + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "ip_version = %d\n", + cm_info->ip_version); + + if (cm_info->ip_version == QED_TCP_IPV4) + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "remote_ip %pI4h:%x, local_ip %pI4h:%x vlan=%x\n", + cm_info->remote_ip, cm_info->remote_port, + cm_info->local_ip, cm_info->local_port, + cm_info->vlan); + else + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "remote_ip %pI6:%x, local_ip %pI6:%x vlan=%x\n", + cm_info->remote_ip, cm_info->remote_port, + cm_info->local_ip, cm_info->local_port, + cm_info->vlan); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "private_data_len = %x ord = %d, ird = %d\n", + cm_info->private_data_len, cm_info->ord, cm_info->ird); +} + +static int +qed_iwarp_ll2_post_rx(struct qed_hwfn *p_hwfn, + struct qed_iwarp_ll2_buff *buf, u8 handle) +{ + int rc; + + rc = qed_ll2_post_rx_buffer(p_hwfn, handle, buf->data_phys_addr, + (u16)buf->buff_size, buf, 1); + if (rc) { + DP_NOTICE(p_hwfn, + "Failed to repost rx buffer to ll2 rc = %d, handle=%d\n", + rc, handle); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, buf->buff_size, + buf->data, buf->data_phys_addr); + kfree(buf); + } + + return rc; +} + +static bool +qed_iwarp_ep_exists(struct qed_hwfn *p_hwfn, struct qed_iwarp_cm_info *cm_info) +{ + struct qed_iwarp_ep *ep = NULL; + bool found = false; + + list_for_each_entry(ep, + &p_hwfn->p_rdma_info->iwarp.ep_list, + list_entry) { + if ((ep->cm_info.local_port == cm_info->local_port) && + (ep->cm_info.remote_port == cm_info->remote_port) && + (ep->cm_info.vlan == cm_info->vlan) && + !memcmp(&ep->cm_info.local_ip, cm_info->local_ip, + sizeof(cm_info->local_ip)) && + !memcmp(&ep->cm_info.remote_ip, cm_info->remote_ip, + sizeof(cm_info->remote_ip))) { + found = true; + break; + } + } + + if (found) { + DP_NOTICE(p_hwfn, + "SYN received on active connection - dropping\n"); + qed_iwarp_print_cm_info(p_hwfn, cm_info); + + return true; + } + + return false; +} + +static struct qed_iwarp_listener * +qed_iwarp_get_listener(struct qed_hwfn *p_hwfn, + struct qed_iwarp_cm_info *cm_info) +{ + struct qed_iwarp_listener *listener = NULL; + static const u32 ip_zero[4] = { 0, 0, 0, 0 }; + bool found = false; + + qed_iwarp_print_cm_info(p_hwfn, cm_info); + + list_for_each_entry(listener, + &p_hwfn->p_rdma_info->iwarp.listen_list, + list_entry) { + if (listener->port == cm_info->local_port) { + if (!memcmp(listener->ip_addr, + ip_zero, sizeof(ip_zero))) { + found = true; + break; + } + + if (!memcmp(listener->ip_addr, + cm_info->local_ip, + sizeof(cm_info->local_ip)) && + (listener->vlan == cm_info->vlan)) { + found = true; + break; + } + } + } + + if (found) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "listener found = %p\n", + listener); + return listener; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "listener not found\n"); + return NULL; +} + +static int +qed_iwarp_parse_rx_pkt(struct qed_hwfn *p_hwfn, + struct qed_iwarp_cm_info *cm_info, + void *buf, + u8 *remote_mac_addr, + u8 *local_mac_addr, + int *payload_len, int *tcp_start_offset) +{ + struct vlan_ethhdr *vethh; + bool vlan_valid = false; + struct ipv6hdr *ip6h; + struct ethhdr *ethh; + struct tcphdr *tcph; + struct iphdr *iph; + int eth_hlen; + int ip_hlen; + int eth_type; + int i; + + ethh = buf; + eth_type = ntohs(ethh->h_proto); + if (eth_type == ETH_P_8021Q) { + vlan_valid = true; + vethh = (struct vlan_ethhdr *)ethh; + cm_info->vlan = ntohs(vethh->h_vlan_TCI) & VLAN_VID_MASK; + eth_type = ntohs(vethh->h_vlan_encapsulated_proto); + } + + eth_hlen = ETH_HLEN + (vlan_valid ? 
sizeof(u32) : 0);
+
+	ether_addr_copy(remote_mac_addr, ethh->h_source);
+	ether_addr_copy(local_mac_addr, ethh->h_dest);
+
+	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "eth_type =%d source mac: %pM\n",
+		   eth_type, ethh->h_source);
+
+	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "eth_hlen=%d destination mac: %pM\n",
+		   eth_hlen, ethh->h_dest);
+
+	iph = (struct iphdr *)((u8 *)(ethh) + eth_hlen);
+
+	if (eth_type == ETH_P_IP) {
+		if (iph->protocol != IPPROTO_TCP) {
+			DP_NOTICE(p_hwfn,
+				  "Unexpected ip protocol on ll2 %x\n",
+				  iph->protocol);
+			return -EINVAL;
+		}
+
+		cm_info->local_ip[0] = ntohl(iph->daddr);
+		cm_info->remote_ip[0] = ntohl(iph->saddr);
+		cm_info->ip_version = TCP_IPV4;
+
+		ip_hlen = (iph->ihl) * sizeof(u32);
+		*payload_len = ntohs(iph->tot_len) - ip_hlen;
+	} else if (eth_type == ETH_P_IPV6) {
+		ip6h = (struct ipv6hdr *)iph;
+
+		if (ip6h->nexthdr != IPPROTO_TCP) {
+			DP_NOTICE(p_hwfn,
+				  "Unexpected ip protocol on ll2 %x\n",
+				  ip6h->nexthdr);
+			return -EINVAL;
+		}
+
+		for (i = 0; i < 4; i++) {
+			cm_info->local_ip[i] =
+			    ntohl(ip6h->daddr.in6_u.u6_addr32[i]);
+			cm_info->remote_ip[i] =
+			    ntohl(ip6h->saddr.in6_u.u6_addr32[i]);
+		}
+		cm_info->ip_version = TCP_IPV6;
+
+		ip_hlen = sizeof(*ip6h);
+		*payload_len = ntohs(ip6h->payload_len);
+	} else {
+		DP_NOTICE(p_hwfn, "Unexpected ethertype on ll2 %x\n", eth_type);
+		return -EINVAL;
+	}
+
+	tcph = (struct tcphdr *)((u8 *)iph + ip_hlen);
+
+	if (!tcph->syn) {
+		DP_NOTICE(p_hwfn,
+			  "Only SYN type packet expected on this ll2 conn, iph->ihl=%d source=%d dest=%d\n",
+			  iph->ihl, tcph->source, tcph->dest);
+		return -EINVAL;
+	}
+
+	cm_info->local_port = ntohs(tcph->dest);
+	cm_info->remote_port = ntohs(tcph->source);
+
+	qed_iwarp_print_cm_info(p_hwfn, cm_info);
+
+	*tcp_start_offset = eth_hlen + ip_hlen;
+
+	return 0;
+}
+
+static struct qed_iwarp_fpdu *qed_iwarp_get_curr_fpdu(struct qed_hwfn *p_hwfn,
+						      u16 cid)
+{
+	struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+	struct qed_iwarp_fpdu *partial_fpdu;
+	u32 idx;
+
+	idx = cid - qed_cxt_get_proto_cid_start(p_hwfn, PROTOCOLID_IWARP);
+	if (idx >= iwarp_info->max_num_partial_fpdus) {
+		DP_ERR(p_hwfn, "Invalid cid %x max_num_partial_fpdus=%x\n", cid,
+		       iwarp_info->max_num_partial_fpdus);
+		return NULL;
+	}
+
+	partial_fpdu = &iwarp_info->partial_fpdus[idx];
+
+	return partial_fpdu;
+}
+
+enum qed_iwarp_mpa_pkt_type {
+	QED_IWARP_MPA_PKT_PACKED,
+	QED_IWARP_MPA_PKT_PARTIAL,
+	QED_IWARP_MPA_PKT_UNALIGNED
+};
+
+#define QED_IWARP_INVALID_FPDU_LENGTH 0xffff
+#define QED_IWARP_MPA_FPDU_LENGTH_SIZE (2)
+#define QED_IWARP_MPA_CRC32_DIGEST_SIZE (4)
+
+/* Pad to multiple of 4 */
+#define QED_IWARP_PDU_DATA_LEN_WITH_PAD(data_len) ALIGN(data_len, 4)
+#define QED_IWARP_FPDU_LEN_WITH_PAD(_mpa_len) \
+	(QED_IWARP_PDU_DATA_LEN_WITH_PAD((_mpa_len) + \
+					 QED_IWARP_MPA_FPDU_LENGTH_SIZE) + \
+	 QED_IWARP_MPA_CRC32_DIGEST_SIZE)
+
+/* fpdu can be fragmented over maximum 3 bds: header, partial mpa, unaligned */
+#define QED_IWARP_MAX_BDS_PER_FPDU 3
+
+static const char * const pkt_type_str[] = {
+	"QED_IWARP_MPA_PKT_PACKED",
+	"QED_IWARP_MPA_PKT_PARTIAL",
+	"QED_IWARP_MPA_PKT_UNALIGNED"
+};
+
+static int
+qed_iwarp_recycle_pkt(struct qed_hwfn *p_hwfn,
+		      struct qed_iwarp_fpdu *fpdu,
+		      struct qed_iwarp_ll2_buff *buf);
+
+static enum qed_iwarp_mpa_pkt_type
+qed_iwarp_mpa_classify(struct qed_hwfn *p_hwfn,
+		       struct qed_iwarp_fpdu *fpdu,
+		       u16 tcp_payload_len, u8 *mpa_data)
+{
+	enum qed_iwarp_mpa_pkt_type pkt_type;
+	u16 mpa_len;
+
+	if (fpdu->incomplete_bytes) {
+		pkt_type = QED_IWARP_MPA_PKT_UNALIGNED;
+		goto out;
+	}
+
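+	/* A new FPDU starts here: its first two bytes carry the MPA length.
+	 * QED_IWARP_FPDU_LEN_WITH_PAD() adds the length field itself, pads to
+	 * a 4-byte multiple and accounts for the CRC32 digest to get the
+	 * on-wire size, which is compared against the TCP payload below to
+	 * classify the packet as packed or partial.
+	 */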
/* special case of one byte remaining... + * lower byte will be read next packet + */ + if (tcp_payload_len == 1) { + fpdu->fpdu_length = *mpa_data << BITS_PER_BYTE; + pkt_type = QED_IWARP_MPA_PKT_PARTIAL; + goto out; + } + + mpa_len = ntohs(*((u16 *)(mpa_data))); + fpdu->fpdu_length = QED_IWARP_FPDU_LEN_WITH_PAD(mpa_len); + + if (fpdu->fpdu_length <= tcp_payload_len) + pkt_type = QED_IWARP_MPA_PKT_PACKED; + else + pkt_type = QED_IWARP_MPA_PKT_PARTIAL; + +out: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "MPA_ALIGN: %s: fpdu_length=0x%x tcp_payload_len:0x%x\n", + pkt_type_str[pkt_type], fpdu->fpdu_length, tcp_payload_len); + + return pkt_type; +} + +static void +qed_iwarp_init_fpdu(struct qed_iwarp_ll2_buff *buf, + struct qed_iwarp_fpdu *fpdu, + struct unaligned_opaque_data *pkt_data, + u16 tcp_payload_size, u8 placement_offset) +{ + fpdu->mpa_buf = buf; + fpdu->pkt_hdr = buf->data_phys_addr + placement_offset; + fpdu->pkt_hdr_size = pkt_data->tcp_payload_offset; + fpdu->mpa_frag = buf->data_phys_addr + pkt_data->first_mpa_offset; + fpdu->mpa_frag_virt = (u8 *)(buf->data) + pkt_data->first_mpa_offset; + + if (tcp_payload_size == 1) + fpdu->incomplete_bytes = QED_IWARP_INVALID_FPDU_LENGTH; + else if (tcp_payload_size < fpdu->fpdu_length) + fpdu->incomplete_bytes = fpdu->fpdu_length - tcp_payload_size; + else + fpdu->incomplete_bytes = 0; /* complete fpdu */ + + fpdu->mpa_frag_len = fpdu->fpdu_length - fpdu->incomplete_bytes; +} + +static int +qed_iwarp_cp_pkt(struct qed_hwfn *p_hwfn, + struct qed_iwarp_fpdu *fpdu, + struct unaligned_opaque_data *pkt_data, + struct qed_iwarp_ll2_buff *buf, u16 tcp_payload_size) +{ + u8 *tmp_buf = p_hwfn->p_rdma_info->iwarp.mpa_intermediate_buf; + int rc; + + /* need to copy the data from the partial packet stored in fpdu + * to the new buf, for this we also need to move the data currently + * placed on the buf. The assumption is that the buffer is big enough + * since fpdu_length <= mss, we use an intermediate buffer since + * we may need to copy the new data to an overlapping location + */ + if ((fpdu->mpa_frag_len + tcp_payload_size) > (u16)buf->buff_size) { + DP_ERR(p_hwfn, + "MPA ALIGN: Unexpected: buffer is not large enough for split fpdu buff_size = %d mpa_frag_len = %d, tcp_payload_size = %d, incomplete_bytes = %d\n", + buf->buff_size, fpdu->mpa_frag_len, + tcp_payload_size, fpdu->incomplete_bytes); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "MPA ALIGN Copying fpdu: [%p, %d] [%p, %d]\n", + fpdu->mpa_frag_virt, fpdu->mpa_frag_len, + (u8 *)(buf->data) + pkt_data->first_mpa_offset, + tcp_payload_size); + + memcpy(tmp_buf, fpdu->mpa_frag_virt, fpdu->mpa_frag_len); + memcpy(tmp_buf + fpdu->mpa_frag_len, + (u8 *)(buf->data) + pkt_data->first_mpa_offset, + tcp_payload_size); + + rc = qed_iwarp_recycle_pkt(p_hwfn, fpdu, fpdu->mpa_buf); + if (rc) + return rc; + + /* If we managed to post the buffer copy the data to the new buffer + * o/w this will occur in the next round... 
+ */ + memcpy((u8 *)(buf->data), tmp_buf, + fpdu->mpa_frag_len + tcp_payload_size); + + fpdu->mpa_buf = buf; + /* fpdu->pkt_hdr remains as is */ + /* fpdu->mpa_frag is overridden with new buf */ + fpdu->mpa_frag = buf->data_phys_addr; + fpdu->mpa_frag_virt = buf->data; + fpdu->mpa_frag_len += tcp_payload_size; + + fpdu->incomplete_bytes -= tcp_payload_size; + + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA ALIGN: split fpdu buff_size = %d mpa_frag_len = %d, tcp_payload_size = %d, incomplete_bytes = %d\n", + buf->buff_size, fpdu->mpa_frag_len, tcp_payload_size, + fpdu->incomplete_bytes); + + return 0; +} + +static void +qed_iwarp_update_fpdu_length(struct qed_hwfn *p_hwfn, + struct qed_iwarp_fpdu *fpdu, u8 *mpa_data) +{ + u16 mpa_len; + + /* Update incomplete packets if needed */ + if (fpdu->incomplete_bytes == QED_IWARP_INVALID_FPDU_LENGTH) { + /* Missing lower byte is now available */ + mpa_len = fpdu->fpdu_length | *mpa_data; + fpdu->fpdu_length = QED_IWARP_FPDU_LEN_WITH_PAD(mpa_len); + /* one byte of hdr */ + fpdu->mpa_frag_len = 1; + fpdu->incomplete_bytes = fpdu->fpdu_length - 1; + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA_ALIGN: Partial header mpa_len=%x fpdu_length=%x incomplete_bytes=%x\n", + mpa_len, fpdu->fpdu_length, fpdu->incomplete_bytes); + } +} + +#define QED_IWARP_IS_RIGHT_EDGE(_curr_pkt) \ + (GET_FIELD((_curr_pkt)->flags, \ + UNALIGNED_OPAQUE_DATA_PKT_REACHED_WIN_RIGHT_EDGE)) + +/* This function is used to recycle a buffer using the ll2 drop option. It + * uses the mechanism to ensure that all buffers posted to tx before this one + * were completed. The buffer sent here will be sent as a cookie in the tx + * completion function and can then be reposted to rx chain when done. The flow + * that requires this is the flow where a FPDU splits over more than 3 tcp + * segments. In this case the driver needs to re-post a rx buffer instead of + * the one received, but driver can't simply repost a buffer it copied from + * as there is a case where the buffer was originally a packed FPDU, and is + * partially posted to FW. Driver needs to ensure FW is done with it. 
+ */ +static int +qed_iwarp_recycle_pkt(struct qed_hwfn *p_hwfn, + struct qed_iwarp_fpdu *fpdu, + struct qed_iwarp_ll2_buff *buf) +{ + struct qed_ll2_tx_pkt_info tx_pkt; + u8 ll2_handle; + int rc; + + memset(&tx_pkt, 0, sizeof(tx_pkt)); + tx_pkt.num_of_bds = 1; + tx_pkt.tx_dest = QED_LL2_TX_DEST_DROP; + tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2; + tx_pkt.first_frag = fpdu->pkt_hdr; + tx_pkt.first_frag_len = fpdu->pkt_hdr_size; + buf->piggy_buf = NULL; + tx_pkt.cookie = buf; + + ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle; + + rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true); + if (rc) + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Can't drop packet rc=%d\n", rc); + + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA_ALIGN: send drop tx packet [%lx, 0x%x], buf=%p, rc=%d\n", + (unsigned long int)tx_pkt.first_frag, + tx_pkt.first_frag_len, buf, rc); + + return rc; +} + +static int +qed_iwarp_win_right_edge(struct qed_hwfn *p_hwfn, struct qed_iwarp_fpdu *fpdu) +{ + struct qed_ll2_tx_pkt_info tx_pkt; + u8 ll2_handle; + int rc; + + memset(&tx_pkt, 0, sizeof(tx_pkt)); + tx_pkt.num_of_bds = 1; + tx_pkt.tx_dest = QED_LL2_TX_DEST_LB; + tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2; + + tx_pkt.first_frag = fpdu->pkt_hdr; + tx_pkt.first_frag_len = fpdu->pkt_hdr_size; + tx_pkt.enable_ip_cksum = true; + tx_pkt.enable_l4_cksum = true; + tx_pkt.calc_ip_len = true; + /* vlan overload with enum iwarp_ll2_tx_queues */ + tx_pkt.vlan = IWARP_LL2_ALIGNED_RIGHT_TRIMMED_TX_QUEUE; + + ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle; + + rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true); + if (rc) + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Can't send right edge rc=%d\n", rc); + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA_ALIGN: Sent right edge FPDU num_bds=%d [%lx, 0x%x], rc=%d\n", + tx_pkt.num_of_bds, + (unsigned long int)tx_pkt.first_frag, + tx_pkt.first_frag_len, rc); + + return rc; +} + +static int +qed_iwarp_send_fpdu(struct qed_hwfn *p_hwfn, + struct qed_iwarp_fpdu *fpdu, + struct unaligned_opaque_data *curr_pkt, + struct qed_iwarp_ll2_buff *buf, + u16 tcp_payload_size, enum qed_iwarp_mpa_pkt_type pkt_type) +{ + struct qed_ll2_tx_pkt_info tx_pkt; + u8 ll2_handle; + int rc; + + memset(&tx_pkt, 0, sizeof(tx_pkt)); + + /* An unaligned packet means it's split over two tcp segments. So the + * complete packet requires 3 bds, one for the header, one for the + * part of the fpdu of the first tcp segment, and the last fragment + * will point to the remainder of the fpdu. A packed pdu, requires only + * two bds, one for the header and one for the data. + */ + tx_pkt.num_of_bds = (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED) ? 3 : 2; + tx_pkt.tx_dest = QED_LL2_TX_DEST_LB; + tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2; /* offset in words */ + + /* Send the mpa_buf only with the last fpdu (in case of packed) */ + if (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED || + tcp_payload_size <= fpdu->fpdu_length) + tx_pkt.cookie = fpdu->mpa_buf; + + tx_pkt.first_frag = fpdu->pkt_hdr; + tx_pkt.first_frag_len = fpdu->pkt_hdr_size; + tx_pkt.enable_ip_cksum = true; + tx_pkt.enable_l4_cksum = true; + tx_pkt.calc_ip_len = true; + /* vlan overload with enum iwarp_ll2_tx_queues */ + tx_pkt.vlan = IWARP_LL2_ALIGNED_TX_QUEUE; + + /* special case of unaligned packet and not packed, need to send + * both buffers as cookie to release. 
+ */ + if (tcp_payload_size == fpdu->incomplete_bytes) + fpdu->mpa_buf->piggy_buf = buf; + + ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle; + + /* Set first fragment to header */ + rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true); + if (rc) + goto out; + + /* Set second fragment to first part of packet */ + rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn, ll2_handle, + fpdu->mpa_frag, + fpdu->mpa_frag_len); + if (rc) + goto out; + + if (!fpdu->incomplete_bytes) + goto out; + + /* Set third fragment to second part of the packet */ + rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn, + ll2_handle, + buf->data_phys_addr + + curr_pkt->first_mpa_offset, + fpdu->incomplete_bytes); +out: + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA_ALIGN: Sent FPDU num_bds=%d first_frag_len=%x, mpa_frag_len=0x%x, incomplete_bytes:0x%x rc=%d\n", + tx_pkt.num_of_bds, + tx_pkt.first_frag_len, + fpdu->mpa_frag_len, + fpdu->incomplete_bytes, rc); + + return rc; +} + +static void +qed_iwarp_mpa_get_data(struct qed_hwfn *p_hwfn, + struct unaligned_opaque_data *curr_pkt, + u32 opaque_data0, u32 opaque_data1) +{ + u64 opaque_data; + + opaque_data = HILO_64(opaque_data1, opaque_data0); + *curr_pkt = *((struct unaligned_opaque_data *)&opaque_data); + + curr_pkt->first_mpa_offset = curr_pkt->tcp_payload_offset + + le16_to_cpu(curr_pkt->first_mpa_offset); + curr_pkt->cid = le32_to_cpu(curr_pkt->cid); +} + +/* This function is called when an unaligned or incomplete MPA packet arrives + * driver needs to align the packet, perhaps using previous data and send + * it down to FW once it is aligned. + */ +static int +qed_iwarp_process_mpa_pkt(struct qed_hwfn *p_hwfn, + struct qed_iwarp_ll2_mpa_buf *mpa_buf) +{ + struct unaligned_opaque_data *curr_pkt = &mpa_buf->data; + struct qed_iwarp_ll2_buff *buf = mpa_buf->ll2_buf; + enum qed_iwarp_mpa_pkt_type pkt_type; + struct qed_iwarp_fpdu *fpdu; + int rc = -EINVAL; + u8 *mpa_data; + + fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, curr_pkt->cid & 0xffff); + if (!fpdu) { /* something corrupt with cid, post rx back */ + DP_ERR(p_hwfn, "Invalid cid, drop and post back to rx cid=%x\n", + curr_pkt->cid); + goto err; + } + + do { + mpa_data = ((u8 *)(buf->data) + curr_pkt->first_mpa_offset); + + pkt_type = qed_iwarp_mpa_classify(p_hwfn, fpdu, + mpa_buf->tcp_payload_len, + mpa_data); + + switch (pkt_type) { + case QED_IWARP_MPA_PKT_PARTIAL: + qed_iwarp_init_fpdu(buf, fpdu, + curr_pkt, + mpa_buf->tcp_payload_len, + mpa_buf->placement_offset); + + if (!QED_IWARP_IS_RIGHT_EDGE(curr_pkt)) { + mpa_buf->tcp_payload_len = 0; + break; + } + + rc = qed_iwarp_win_right_edge(p_hwfn, fpdu); + + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Can't send FPDU:reset rc=%d\n", rc); + memset(fpdu, 0, sizeof(*fpdu)); + break; + } + + mpa_buf->tcp_payload_len = 0; + break; + case QED_IWARP_MPA_PKT_PACKED: + qed_iwarp_init_fpdu(buf, fpdu, + curr_pkt, + mpa_buf->tcp_payload_len, + mpa_buf->placement_offset); + + rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf, + mpa_buf->tcp_payload_len, + pkt_type); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Can't send FPDU:reset rc=%d\n", rc); + memset(fpdu, 0, sizeof(*fpdu)); + break; + } + + mpa_buf->tcp_payload_len -= fpdu->fpdu_length; + curr_pkt->first_mpa_offset += fpdu->fpdu_length; + break; + case QED_IWARP_MPA_PKT_UNALIGNED: + qed_iwarp_update_fpdu_length(p_hwfn, fpdu, mpa_data); + if (mpa_buf->tcp_payload_len < fpdu->incomplete_bytes) { + /* special handling of fpdu split over more + * than 2 segments + */ + if (QED_IWARP_IS_RIGHT_EDGE(curr_pkt)) 
{ + rc = qed_iwarp_win_right_edge(p_hwfn, + fpdu); + /* packet will be re-processed later */ + if (rc) + return rc; + } + + rc = qed_iwarp_cp_pkt(p_hwfn, fpdu, curr_pkt, + buf, + mpa_buf->tcp_payload_len); + if (rc) /* packet will be re-processed later */ + return rc; + + mpa_buf->tcp_payload_len = 0; + break; + } + + rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf, + mpa_buf->tcp_payload_len, + pkt_type); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Can't send FPDU:delay rc=%d\n", rc); + /* don't reset fpdu -> we need it for next + * classify + */ + break; + } + + mpa_buf->tcp_payload_len -= fpdu->incomplete_bytes; + curr_pkt->first_mpa_offset += fpdu->incomplete_bytes; + /* The framed PDU was sent - no more incomplete bytes */ + fpdu->incomplete_bytes = 0; + break; + } + } while (mpa_buf->tcp_payload_len && !rc); + + return rc; + +err: + qed_iwarp_ll2_post_rx(p_hwfn, + buf, + p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle); + return rc; +} + +static void qed_iwarp_process_pending_pkts(struct qed_hwfn *p_hwfn) +{ + struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp; + struct qed_iwarp_ll2_mpa_buf *mpa_buf = NULL; + int rc; + + while (!list_empty(&iwarp_info->mpa_buf_pending_list)) { + mpa_buf = list_first_entry(&iwarp_info->mpa_buf_pending_list, + struct qed_iwarp_ll2_mpa_buf, + list_entry); + + rc = qed_iwarp_process_mpa_pkt(p_hwfn, mpa_buf); + + /* busy means break and continue processing later, don't + * remove the buf from the pending list. + */ + if (rc == -EBUSY) + break; + + list_del(&mpa_buf->list_entry); + list_add_tail(&mpa_buf->list_entry, &iwarp_info->mpa_buf_list); + + if (rc) { /* different error, don't continue */ + DP_NOTICE(p_hwfn, "process pkts failed rc=%d\n", rc); + break; + } + } +} + +static void +qed_iwarp_ll2_comp_mpa_pkt(void *cxt, struct qed_ll2_comp_rx_data *data) +{ + struct qed_iwarp_ll2_mpa_buf *mpa_buf; + struct qed_iwarp_info *iwarp_info; + struct qed_hwfn *p_hwfn = cxt; + + iwarp_info = &p_hwfn->p_rdma_info->iwarp; + mpa_buf = list_first_entry(&iwarp_info->mpa_buf_list, + struct qed_iwarp_ll2_mpa_buf, list_entry); + if (!mpa_buf) { + DP_ERR(p_hwfn, "No free mpa buf\n"); + goto err; + } + + list_del(&mpa_buf->list_entry); + qed_iwarp_mpa_get_data(p_hwfn, &mpa_buf->data, + data->opaque_data_0, data->opaque_data_1); + + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "LL2 MPA CompRx payload_len:0x%x\tfirst_mpa_offset:0x%x\ttcp_payload_offset:0x%x\tflags:0x%x\tcid:0x%x\n", + data->length.packet_length, mpa_buf->data.first_mpa_offset, + mpa_buf->data.tcp_payload_offset, mpa_buf->data.flags, + mpa_buf->data.cid); + + mpa_buf->ll2_buf = data->cookie; + mpa_buf->tcp_payload_len = data->length.packet_length - + mpa_buf->data.first_mpa_offset; + mpa_buf->data.first_mpa_offset += data->u.placement_offset; + mpa_buf->placement_offset = data->u.placement_offset; + + list_add_tail(&mpa_buf->list_entry, &iwarp_info->mpa_buf_pending_list); + + qed_iwarp_process_pending_pkts(p_hwfn); + return; +err: + qed_iwarp_ll2_post_rx(p_hwfn, data->cookie, + iwarp_info->ll2_mpa_handle); +} + +static void +qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data) +{ + struct qed_iwarp_ll2_buff *buf = data->cookie; + struct qed_iwarp_listener *listener; + struct qed_ll2_tx_pkt_info tx_pkt; + struct qed_iwarp_cm_info cm_info; + struct qed_hwfn *p_hwfn = cxt; + u8 remote_mac_addr[ETH_ALEN]; + u8 local_mac_addr[ETH_ALEN]; + struct qed_iwarp_ep *ep; + int tcp_start_offset; + u8 ts_hdr_size = 0; + u8 ll2_syn_handle; + int payload_len; + u32 hdr_size; + int rc; + + 
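+	/* SYN completion flow: parse the packet into a 4-tuple, look for a
+	 * matching listener and, if one exists and no ep already tracks this
+	 * tuple, take a pre-allocated ep and post a TCP offload ramrod.
+	 * Otherwise the SYN is posted back to the chip or the rx buffer is
+	 * simply reposted.
+	 */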
memset(&cm_info, 0, sizeof(cm_info)); + ll2_syn_handle = p_hwfn->p_rdma_info->iwarp.ll2_syn_handle; + + /* Check if packet was received with errors... */ + if (data->err_flags) { + DP_NOTICE(p_hwfn, "Error received on SYN packet: 0x%x\n", + data->err_flags); + goto err; + } + + if (GET_FIELD(data->parse_flags, + PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED) && + GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMERROR)) { + DP_NOTICE(p_hwfn, "Syn packet received with checksum error\n"); + goto err; + } + + rc = qed_iwarp_parse_rx_pkt(p_hwfn, &cm_info, (u8 *)(buf->data) + + data->u.placement_offset, remote_mac_addr, + local_mac_addr, &payload_len, + &tcp_start_offset); + if (rc) + goto err; + + /* Check if there is a listener for this 4-tuple+vlan */ + listener = qed_iwarp_get_listener(p_hwfn, &cm_info); + if (!listener) { + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "SYN received on tuple not listened on parse_flags=%d packet len=%d\n", + data->parse_flags, data->length.packet_length); + + memset(&tx_pkt, 0, sizeof(tx_pkt)); + tx_pkt.num_of_bds = 1; + tx_pkt.l4_hdr_offset_w = (data->length.packet_length) >> 2; + tx_pkt.tx_dest = QED_LL2_TX_DEST_LB; + tx_pkt.first_frag = buf->data_phys_addr + + data->u.placement_offset; + tx_pkt.first_frag_len = data->length.packet_length; + tx_pkt.cookie = buf; + + rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_syn_handle, + &tx_pkt, true); + + if (rc) { + DP_NOTICE(p_hwfn, + "Can't post SYN back to chip rc=%d\n", rc); + goto err; + } + return; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Received syn on listening port\n"); + /* There may be an open ep on this connection if this is a syn + * retrasnmit... need to make sure there isn't... + */ + if (qed_iwarp_ep_exists(p_hwfn, &cm_info)) + goto err; + + ep = qed_iwarp_get_free_ep(p_hwfn); + if (!ep) + goto err; + + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + list_add_tail(&ep->list_entry, &p_hwfn->p_rdma_info->iwarp.ep_list); + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + + ether_addr_copy(ep->remote_mac_addr, remote_mac_addr); + ether_addr_copy(ep->local_mac_addr, local_mac_addr); + + memcpy(&ep->cm_info, &cm_info, sizeof(ep->cm_info)); + + if (p_hwfn->p_rdma_info->iwarp.tcp_flags & QED_IWARP_TS_EN) + ts_hdr_size = TIMESTAMP_HEADER_SIZE; + + hdr_size = ((cm_info.ip_version == QED_TCP_IPV4) ? 
40 : 60) + + ts_hdr_size; + ep->mss = p_hwfn->p_rdma_info->iwarp.max_mtu - hdr_size; + ep->mss = min_t(u16, QED_IWARP_MAX_FW_MSS, ep->mss); + + ep->event_cb = listener->event_cb; + ep->cb_context = listener->cb_context; + ep->connect_mode = TCP_CONNECT_PASSIVE; + + ep->syn = buf; + ep->syn_ip_payload_length = (u16)payload_len; + ep->syn_phy_addr = buf->data_phys_addr + data->u.placement_offset + + tcp_start_offset; + + rc = qed_iwarp_tcp_offload(p_hwfn, ep); + if (rc) { + qed_iwarp_return_ep(p_hwfn, ep); + goto err; + } + + return; +err: + qed_iwarp_ll2_post_rx(p_hwfn, buf, ll2_syn_handle); +} + +static void qed_iwarp_ll2_rel_rx_pkt(void *cxt, u8 connection_handle, + void *cookie, dma_addr_t rx_buf_addr, + bool b_last_packet) +{ + struct qed_iwarp_ll2_buff *buffer = cookie; + struct qed_hwfn *p_hwfn = cxt; + + dma_free_coherent(&p_hwfn->cdev->pdev->dev, buffer->buff_size, + buffer->data, buffer->data_phys_addr); + kfree(buffer); +} + +static void qed_iwarp_ll2_comp_tx_pkt(void *cxt, u8 connection_handle, + void *cookie, dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet) +{ + struct qed_iwarp_ll2_buff *buffer = cookie; + struct qed_iwarp_ll2_buff *piggy; + struct qed_hwfn *p_hwfn = cxt; + + if (!buffer) /* can happen in packed mpa unaligned... */ + return; + + /* this was originally an rx packet, post it back */ + piggy = buffer->piggy_buf; + if (piggy) { + buffer->piggy_buf = NULL; + qed_iwarp_ll2_post_rx(p_hwfn, piggy, connection_handle); + } + + qed_iwarp_ll2_post_rx(p_hwfn, buffer, connection_handle); + + if (connection_handle == p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle) + qed_iwarp_process_pending_pkts(p_hwfn); + + return; +} + +static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle, + void *cookie, dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet) +{ + struct qed_iwarp_ll2_buff *buffer = cookie; + struct qed_hwfn *p_hwfn = cxt; + + if (!buffer) + return; + + if (buffer->piggy_buf) { + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + buffer->piggy_buf->buff_size, + buffer->piggy_buf->data, + buffer->piggy_buf->data_phys_addr); + + kfree(buffer->piggy_buf); + } + + dma_free_coherent(&p_hwfn->cdev->pdev->dev, buffer->buff_size, + buffer->data, buffer->data_phys_addr); + + kfree(buffer); +} + +/* The only slowpath for iwarp ll2 is unalign flush. When this completion + * is received, need to reset the FPDU. 
+ */ +void +qed_iwarp_ll2_slowpath(void *cxt, + u8 connection_handle, + u32 opaque_data_0, u32 opaque_data_1) +{ + struct unaligned_opaque_data unalign_data; + struct qed_hwfn *p_hwfn = cxt; + struct qed_iwarp_fpdu *fpdu; + + qed_iwarp_mpa_get_data(p_hwfn, &unalign_data, + opaque_data_0, opaque_data_1); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "(0x%x) Flush fpdu\n", + unalign_data.cid); + + fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, (u16)unalign_data.cid); + if (fpdu) + memset(fpdu, 0, sizeof(*fpdu)); +} + +static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp; + int rc = 0; + + if (iwarp_info->ll2_syn_handle != QED_IWARP_HANDLE_INVAL) { + rc = qed_ll2_terminate_connection(p_hwfn, + iwarp_info->ll2_syn_handle); + if (rc) + DP_INFO(p_hwfn, "Failed to terminate syn connection\n"); + + qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_syn_handle); + iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL; + } + + if (iwarp_info->ll2_ooo_handle != QED_IWARP_HANDLE_INVAL) { + rc = qed_ll2_terminate_connection(p_hwfn, + iwarp_info->ll2_ooo_handle); + if (rc) + DP_INFO(p_hwfn, "Failed to terminate ooo connection\n"); + + qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_ooo_handle); + iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL; + } + + if (iwarp_info->ll2_mpa_handle != QED_IWARP_HANDLE_INVAL) { + rc = qed_ll2_terminate_connection(p_hwfn, + iwarp_info->ll2_mpa_handle); + if (rc) + DP_INFO(p_hwfn, "Failed to terminate mpa connection\n"); + + qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_mpa_handle); + iwarp_info->ll2_mpa_handle = QED_IWARP_HANDLE_INVAL; + } + + qed_llh_remove_mac_filter(p_hwfn, + p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr); + return rc; +} + +static int +qed_iwarp_ll2_alloc_buffers(struct qed_hwfn *p_hwfn, + int num_rx_bufs, int buff_size, u8 ll2_handle) +{ + struct qed_iwarp_ll2_buff *buffer; + int rc = 0; + int i; + + for (i = 0; i < num_rx_bufs; i++) { + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); + if (!buffer) { + rc = -ENOMEM; + break; + } + + buffer->data = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + buff_size, + &buffer->data_phys_addr, + GFP_KERNEL); + if (!buffer->data) { + kfree(buffer); + rc = -ENOMEM; + break; + } + + buffer->buff_size = buff_size; + rc = qed_iwarp_ll2_post_rx(p_hwfn, buffer, ll2_handle); + if (rc) + /* buffers will be deallocated by qed_ll2 */ + break; + } + return rc; +} + +#define QED_IWARP_MAX_BUF_SIZE(mtu) \ + ALIGN((mtu) + ETH_HLEN + 2 * VLAN_HLEN + 2 + ETH_CACHE_LINE_SIZE, \ + ETH_CACHE_LINE_SIZE) + +static int +qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn, + struct qed_rdma_start_in_params *params, + struct qed_ptt *p_ptt) +{ + struct qed_iwarp_info *iwarp_info; + struct qed_ll2_acquire_data data; + struct qed_ll2_cbs cbs; + u32 mpa_buff_size; + u16 n_ooo_bufs; + int rc = 0; + int i; + + iwarp_info = &p_hwfn->p_rdma_info->iwarp; + iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL; + iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL; + iwarp_info->ll2_mpa_handle = QED_IWARP_HANDLE_INVAL; + + iwarp_info->max_mtu = params->max_mtu; + + ether_addr_copy(p_hwfn->p_rdma_info->iwarp.mac_addr, params->mac_addr); + + rc = qed_llh_add_mac_filter(p_hwfn, p_ptt, params->mac_addr); + if (rc) + return rc; + + /* Start SYN connection */ + cbs.rx_comp_cb = qed_iwarp_ll2_comp_syn_pkt; + cbs.rx_release_cb = qed_iwarp_ll2_rel_rx_pkt; + cbs.tx_comp_cb = qed_iwarp_ll2_comp_tx_pkt; + cbs.tx_release_cb = qed_iwarp_ll2_rel_tx_pkt; + cbs.cookie = p_hwfn; + + 
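+	/* The SYN, OOO and unaligned MPA LL2 connections acquired below all
+	 * share this cbs struct; rx_comp_cb and slowpath_cb are overridden
+	 * before the unaligned MPA connection is acquired.
+	 */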
memset(&data, 0, sizeof(data)); + data.input.conn_type = QED_LL2_TYPE_IWARP; + data.input.mtu = QED_IWARP_MAX_SYN_PKT_SIZE; + data.input.rx_num_desc = QED_IWARP_LL2_SYN_RX_SIZE; + data.input.tx_num_desc = QED_IWARP_LL2_SYN_TX_SIZE; + data.input.tx_max_bds_per_packet = 1; /* will never be fragmented */ + data.input.tx_tc = PKT_LB_TC; + data.input.tx_dest = QED_LL2_TX_DEST_LB; + data.p_connection_handle = &iwarp_info->ll2_syn_handle; + data.cbs = &cbs; + + rc = qed_ll2_acquire_connection(p_hwfn, &data); + if (rc) { + DP_NOTICE(p_hwfn, "Failed to acquire LL2 connection\n"); + qed_llh_remove_mac_filter(p_hwfn, p_ptt, params->mac_addr); + return rc; + } + + rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_syn_handle); + if (rc) { + DP_NOTICE(p_hwfn, "Failed to establish LL2 connection\n"); + goto err; + } + + rc = qed_iwarp_ll2_alloc_buffers(p_hwfn, + QED_IWARP_LL2_SYN_RX_SIZE, + QED_IWARP_MAX_SYN_PKT_SIZE, + iwarp_info->ll2_syn_handle); + if (rc) + goto err; + + /* Start OOO connection */ + data.input.conn_type = QED_LL2_TYPE_OOO; + data.input.mtu = params->max_mtu; + + n_ooo_bufs = (QED_IWARP_MAX_OOO * QED_IWARP_RCV_WND_SIZE_DEF) / + iwarp_info->max_mtu; + n_ooo_bufs = min_t(u32, n_ooo_bufs, QED_IWARP_LL2_OOO_MAX_RX_SIZE); + + data.input.rx_num_desc = n_ooo_bufs; + data.input.rx_num_ooo_buffers = n_ooo_bufs; + + data.input.tx_max_bds_per_packet = 1; /* will never be fragmented */ + data.input.tx_num_desc = QED_IWARP_LL2_OOO_DEF_TX_SIZE; + data.p_connection_handle = &iwarp_info->ll2_ooo_handle; + + rc = qed_ll2_acquire_connection(p_hwfn, &data); + if (rc) + goto err; + + rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_ooo_handle); + if (rc) + goto err; + + /* Start Unaligned MPA connection */ + cbs.rx_comp_cb = qed_iwarp_ll2_comp_mpa_pkt; + cbs.slowpath_cb = qed_iwarp_ll2_slowpath; + + memset(&data, 0, sizeof(data)); + data.input.conn_type = QED_LL2_TYPE_IWARP; + data.input.mtu = params->max_mtu; + /* FW requires that once a packet arrives OOO, it must have at + * least 2 rx buffers available on the unaligned connection + * for handling the case that it is a partial fpdu. + */ + data.input.rx_num_desc = n_ooo_bufs * 2; + data.input.tx_num_desc = data.input.rx_num_desc; + data.input.tx_max_bds_per_packet = QED_IWARP_MAX_BDS_PER_FPDU; + data.p_connection_handle = &iwarp_info->ll2_mpa_handle; + data.input.secondary_queue = true; + data.cbs = &cbs; + + rc = qed_ll2_acquire_connection(p_hwfn, &data); + if (rc) + goto err; + + rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_mpa_handle); + if (rc) + goto err; + + mpa_buff_size = QED_IWARP_MAX_BUF_SIZE(params->max_mtu); + rc = qed_iwarp_ll2_alloc_buffers(p_hwfn, + data.input.rx_num_desc, + mpa_buff_size, + iwarp_info->ll2_mpa_handle); + if (rc) + goto err; + + iwarp_info->partial_fpdus = kcalloc((u16)p_hwfn->p_rdma_info->num_qps, + sizeof(*iwarp_info->partial_fpdus), + GFP_KERNEL); + if (!iwarp_info->partial_fpdus) + goto err; + + iwarp_info->max_num_partial_fpdus = (u16)p_hwfn->p_rdma_info->num_qps; + + iwarp_info->mpa_intermediate_buf = kzalloc(mpa_buff_size, GFP_KERNEL); + if (!iwarp_info->mpa_intermediate_buf) + goto err; + + /* The mpa_bufs array serves for pending RX packets received on the + * mpa ll2 that don't have place on the tx ring and require later + * processing. 
We can't fail on allocation of such a struct therefore + * we allocate enough to take care of all rx packets + */ + iwarp_info->mpa_bufs = kcalloc(data.input.rx_num_desc, + sizeof(*iwarp_info->mpa_bufs), + GFP_KERNEL); + if (!iwarp_info->mpa_bufs) + goto err; + + INIT_LIST_HEAD(&iwarp_info->mpa_buf_pending_list); + INIT_LIST_HEAD(&iwarp_info->mpa_buf_list); + for (i = 0; i < data.input.rx_num_desc; i++) + list_add_tail(&iwarp_info->mpa_bufs[i].list_entry, + &iwarp_info->mpa_buf_list); + return rc; +err: + qed_iwarp_ll2_stop(p_hwfn, p_ptt); + + return rc; +} + +int qed_iwarp_setup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + struct qed_rdma_start_in_params *params) +{ + struct qed_iwarp_info *iwarp_info; + u32 rcv_wnd_size; + + iwarp_info = &p_hwfn->p_rdma_info->iwarp; + + iwarp_info->tcp_flags = QED_IWARP_TS_EN; + rcv_wnd_size = QED_IWARP_RCV_WND_SIZE_DEF; + + /* value 0 is used for ilog2(QED_IWARP_RCV_WND_SIZE_MIN) */ + iwarp_info->rcv_wnd_scale = ilog2(rcv_wnd_size) - + ilog2(QED_IWARP_RCV_WND_SIZE_MIN); + iwarp_info->rcv_wnd_size = rcv_wnd_size >> iwarp_info->rcv_wnd_scale; + iwarp_info->crc_needed = QED_IWARP_PARAM_CRC_NEEDED; + iwarp_info->mpa_rev = MPA_NEGOTIATION_TYPE_ENHANCED; + + iwarp_info->peer2peer = QED_IWARP_PARAM_P2P; + + iwarp_info->rtr_type = MPA_RTR_TYPE_ZERO_SEND | + MPA_RTR_TYPE_ZERO_WRITE | + MPA_RTR_TYPE_ZERO_READ; + + spin_lock_init(&p_hwfn->p_rdma_info->iwarp.qp_lock); + INIT_LIST_HEAD(&p_hwfn->p_rdma_info->iwarp.ep_list); + INIT_LIST_HEAD(&p_hwfn->p_rdma_info->iwarp.listen_list); + + qed_spq_register_async_cb(p_hwfn, PROTOCOLID_IWARP, + qed_iwarp_async_event); + qed_ooo_setup(p_hwfn); + + return qed_iwarp_ll2_start(p_hwfn, params, p_ptt); +} + +int qed_iwarp_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + int rc; + + qed_iwarp_free_prealloc_ep(p_hwfn); + rc = qed_iwarp_wait_for_all_cids(p_hwfn); + if (rc) + return rc; + + qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_IWARP); + + return qed_iwarp_ll2_stop(p_hwfn, p_ptt); +} + +void qed_iwarp_qp_in_error(struct qed_hwfn *p_hwfn, + struct qed_iwarp_ep *ep, u8 fw_return_code) +{ + struct qed_iwarp_cm_event_params params; + + qed_iwarp_modify_qp(p_hwfn, ep->qp, QED_IWARP_QP_STATE_ERROR, true); + + params.event = QED_IWARP_EVENT_CLOSE; + params.ep_context = ep; + params.cm_info = &ep->cm_info; + params.status = (fw_return_code == IWARP_QP_IN_ERROR_GOOD_CLOSE) ? 
+ 0 : -ECONNRESET; + + ep->state = QED_IWARP_EP_CLOSED; + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + list_del(&ep->list_entry); + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + + ep->event_cb(ep->cb_context, &params); +} + +void qed_iwarp_exception_received(struct qed_hwfn *p_hwfn, + struct qed_iwarp_ep *ep, int fw_ret_code) +{ + struct qed_iwarp_cm_event_params params; + bool event_cb = false; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "EP(0x%x) fw_ret_code=%d\n", + ep->cid, fw_ret_code); + + switch (fw_ret_code) { + case IWARP_EXCEPTION_DETECTED_LLP_CLOSED: + params.status = 0; + params.event = QED_IWARP_EVENT_DISCONNECT; + event_cb = true; + break; + case IWARP_EXCEPTION_DETECTED_LLP_RESET: + params.status = -ECONNRESET; + params.event = QED_IWARP_EVENT_DISCONNECT; + event_cb = true; + break; + case IWARP_EXCEPTION_DETECTED_RQ_EMPTY: + params.event = QED_IWARP_EVENT_RQ_EMPTY; + event_cb = true; + break; + case IWARP_EXCEPTION_DETECTED_IRQ_FULL: + params.event = QED_IWARP_EVENT_IRQ_FULL; + event_cb = true; + break; + case IWARP_EXCEPTION_DETECTED_LLP_TIMEOUT: + params.event = QED_IWARP_EVENT_LLP_TIMEOUT; + event_cb = true; + break; + case IWARP_EXCEPTION_DETECTED_REMOTE_PROTECTION_ERROR: + params.event = QED_IWARP_EVENT_REMOTE_PROTECTION_ERROR; + event_cb = true; + break; + case IWARP_EXCEPTION_DETECTED_CQ_OVERFLOW: + params.event = QED_IWARP_EVENT_CQ_OVERFLOW; + event_cb = true; + break; + case IWARP_EXCEPTION_DETECTED_LOCAL_CATASTROPHIC: + params.event = QED_IWARP_EVENT_QP_CATASTROPHIC; + event_cb = true; + break; + case IWARP_EXCEPTION_DETECTED_LOCAL_ACCESS_ERROR: + params.event = QED_IWARP_EVENT_LOCAL_ACCESS_ERROR; + event_cb = true; + break; + case IWARP_EXCEPTION_DETECTED_REMOTE_OPERATION_ERROR: + params.event = QED_IWARP_EVENT_REMOTE_OPERATION_ERROR; + event_cb = true; + break; + case IWARP_EXCEPTION_DETECTED_TERMINATE_RECEIVED: + params.event = QED_IWARP_EVENT_TERMINATE_RECEIVED; + event_cb = true; + break; + default: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Unhandled exception received...fw_ret_code=%d\n", + fw_ret_code); + break; + } + + if (event_cb) { + params.ep_context = ep; + params.cm_info = &ep->cm_info; + ep->event_cb(ep->cb_context, &params); + } +} + +static void +qed_iwarp_tcp_connect_unsuccessful(struct qed_hwfn *p_hwfn, + struct qed_iwarp_ep *ep, u8 fw_return_code) +{ + struct qed_iwarp_cm_event_params params; + + memset(&params, 0, sizeof(params)); + params.event = QED_IWARP_EVENT_ACTIVE_COMPLETE; + params.ep_context = ep; + params.cm_info = &ep->cm_info; + ep->state = QED_IWARP_EP_CLOSED; + + switch (fw_return_code) { + case IWARP_CONN_ERROR_TCP_CONNECT_INVALID_PACKET: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "%s(0x%x) TCP connect got invalid packet\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->tcp_cid); + params.status = -ECONNRESET; + break; + case IWARP_CONN_ERROR_TCP_CONNECTION_RST: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "%s(0x%x) TCP Connection Reset\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->tcp_cid); + params.status = -ECONNRESET; + break; + case IWARP_CONN_ERROR_TCP_CONNECT_TIMEOUT: + DP_NOTICE(p_hwfn, "%s(0x%x) TCP timeout\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->tcp_cid); + params.status = -EBUSY; + break; + case IWARP_CONN_ERROR_MPA_NOT_SUPPORTED_VER: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA not supported VER\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->tcp_cid); + params.status = -ECONNREFUSED; + break; + case IWARP_CONN_ERROR_MPA_INVALID_PACKET: + DP_NOTICE(p_hwfn, "%s(0x%x) MPA Invalid Packet\n", + QED_IWARP_CONNECT_MODE_STRING(ep), ep->tcp_cid); +
params.status = -ECONNRESET; + break; + default: + DP_ERR(p_hwfn, + "%s(0x%x) Unexpected return code tcp connect: %d\n", + QED_IWARP_CONNECT_MODE_STRING(ep), + ep->tcp_cid, fw_return_code); + params.status = -ECONNRESET; + break; + } + + if (ep->connect_mode == TCP_CONNECT_PASSIVE) { + ep->tcp_cid = QED_IWARP_INVALID_TCP_CID; + qed_iwarp_return_ep(p_hwfn, ep); + } else { + ep->event_cb(ep->cb_context, &params); + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + list_del(&ep->list_entry); + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + } +} + +void +qed_iwarp_connect_complete(struct qed_hwfn *p_hwfn, + struct qed_iwarp_ep *ep, u8 fw_return_code) +{ + u8 ll2_syn_handle = p_hwfn->p_rdma_info->iwarp.ll2_syn_handle; + + if (ep->connect_mode == TCP_CONNECT_PASSIVE) { + /* Done with the SYN packet, post back to ll2 rx */ + qed_iwarp_ll2_post_rx(p_hwfn, ep->syn, ll2_syn_handle); + + ep->syn = NULL; + + /* If connect failed - upper layer doesn't know about it */ + if (fw_return_code == RDMA_RETURN_OK) + qed_iwarp_mpa_received(p_hwfn, ep); + else + qed_iwarp_tcp_connect_unsuccessful(p_hwfn, ep, + fw_return_code); + } else { + if (fw_return_code == RDMA_RETURN_OK) + qed_iwarp_mpa_offload(p_hwfn, ep); + else + qed_iwarp_tcp_connect_unsuccessful(p_hwfn, ep, + fw_return_code); + } +} + +static inline bool +qed_iwarp_check_ep_ok(struct qed_hwfn *p_hwfn, struct qed_iwarp_ep *ep) +{ + if (!ep || (ep->sig != QED_EP_SIG)) { + DP_ERR(p_hwfn, "ERROR ON ASYNC ep=%p\n", ep); + return false; + } + + return true; +} + +static int qed_iwarp_async_event(struct qed_hwfn *p_hwfn, + u8 fw_event_code, u16 echo, + union event_ring_data *data, + u8 fw_return_code) +{ + struct regpair *fw_handle = &data->rdma_data.async_handle; + struct qed_iwarp_ep *ep = NULL; + u16 cid; + + ep = (struct qed_iwarp_ep *)(uintptr_t)HILO_64(fw_handle->hi, + fw_handle->lo); + + switch (fw_event_code) { + case IWARP_EVENT_TYPE_ASYNC_CONNECT_COMPLETE: + /* Async completion after TCP 3-way handshake */ + if (!qed_iwarp_check_ep_ok(p_hwfn, ep)) + return -EINVAL; + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "EP(0x%x) IWARP_EVENT_TYPE_ASYNC_CONNECT_COMPLETE fw_ret_code=%d\n", + ep->tcp_cid, fw_return_code); + qed_iwarp_connect_complete(p_hwfn, ep, fw_return_code); + break; + case IWARP_EVENT_TYPE_ASYNC_EXCEPTION_DETECTED: + if (!qed_iwarp_check_ep_ok(p_hwfn, ep)) + return -EINVAL; + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "QP(0x%x) IWARP_EVENT_TYPE_ASYNC_EXCEPTION_DETECTED fw_ret_code=%d\n", + ep->cid, fw_return_code); + qed_iwarp_exception_received(p_hwfn, ep, fw_return_code); + break; + case IWARP_EVENT_TYPE_ASYNC_QP_IN_ERROR_STATE: + /* Async completion for Close Connection ramrod */ + if (!qed_iwarp_check_ep_ok(p_hwfn, ep)) + return -EINVAL; + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "QP(0x%x) IWARP_EVENT_TYPE_ASYNC_QP_IN_ERROR_STATE fw_ret_code=%d\n", + ep->cid, fw_return_code); + qed_iwarp_qp_in_error(p_hwfn, ep, fw_return_code); + break; + case IWARP_EVENT_TYPE_ASYNC_ENHANCED_MPA_REPLY_ARRIVED: + /* Async event for active side only */ + if (!qed_iwarp_check_ep_ok(p_hwfn, ep)) + return -EINVAL; + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "QP(0x%x) IWARP_EVENT_TYPE_ASYNC_MPA_HANDSHAKE_MPA_REPLY_ARRIVED fw_ret_code=%d\n", + ep->cid, fw_return_code); + qed_iwarp_mpa_reply_arrived(p_hwfn, ep); + break; + case IWARP_EVENT_TYPE_ASYNC_MPA_HANDSHAKE_COMPLETE: + if (!qed_iwarp_check_ep_ok(p_hwfn, ep)) + return -EINVAL; + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "QP(0x%x) IWARP_EVENT_TYPE_ASYNC_MPA_HANDSHAKE_COMPLETE fw_ret_code=%d\n", + ep->cid, fw_return_code); +
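qed_iwarp_async_event() recovers the endpoint pointer that was handed to firmware by gluing the hi/lo halves of the regpair back into one 64-bit value. A small sketch of that reconstruction, assuming the usual (hi << 32) | lo layout and using an illustrative struct rather than the driver's struct regpair:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

struct regpair_example {	/* illustrative stand-in for struct regpair */
	uint32_t lo;
	uint32_t hi;
};

static uint64_t hilo_64(uint32_t hi, uint32_t lo)
{
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	struct regpair_example h = { .lo = 0x9abcdef0u, .hi = 0x12345678u };
	uint64_t handle = hilo_64(h.hi, h.lo);

	/* In the driver this value is cast back to a struct qed_iwarp_ep *. */
	printf("handle = 0x%016" PRIx64 "\n", handle);
	return 0;
}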
qed_iwarp_mpa_complete(p_hwfn, ep, fw_return_code); + break; + case IWARP_EVENT_TYPE_ASYNC_CID_CLEANED: + cid = (u16)le32_to_cpu(fw_handle->lo); + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "(0x%x)IWARP_EVENT_TYPE_ASYNC_CID_CLEANED\n", cid); + qed_iwarp_cid_cleaned(p_hwfn, cid); + + break; + case IWARP_EVENT_TYPE_ASYNC_CQ_OVERFLOW: + DP_NOTICE(p_hwfn, "IWARP_EVENT_TYPE_ASYNC_CQ_OVERFLOW\n"); + + p_hwfn->p_rdma_info->events.affiliated_event( + p_hwfn->p_rdma_info->events.context, + QED_IWARP_EVENT_CQ_OVERFLOW, + (void *)fw_handle); + break; + default: + DP_ERR(p_hwfn, "Received unexpected async iwarp event %d\n", + fw_event_code); + return -EINVAL; + } + return 0; +} + +int +qed_iwarp_create_listen(void *rdma_cxt, + struct qed_iwarp_listen_in *iparams, + struct qed_iwarp_listen_out *oparams) +{ + struct qed_hwfn *p_hwfn = rdma_cxt; + struct qed_iwarp_listener *listener; + + listener = kzalloc(sizeof(*listener), GFP_KERNEL); + if (!listener) + return -ENOMEM; + + listener->ip_version = iparams->ip_version; + memcpy(listener->ip_addr, iparams->ip_addr, sizeof(listener->ip_addr)); + listener->port = iparams->port; + listener->vlan = iparams->vlan; + + listener->event_cb = iparams->event_cb; + listener->cb_context = iparams->cb_context; + listener->max_backlog = iparams->max_backlog; + oparams->handle = listener; + + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + list_add_tail(&listener->list_entry, + &p_hwfn->p_rdma_info->iwarp.listen_list); + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "callback=%p handle=%p ip=%x:%x:%x:%x port=0x%x vlan=0x%x\n", + listener->event_cb, + listener, + listener->ip_addr[0], + listener->ip_addr[1], + listener->ip_addr[2], + listener->ip_addr[3], listener->port, listener->vlan); + + return 0; +} + +int qed_iwarp_destroy_listen(void *rdma_cxt, void *handle) +{ + struct qed_iwarp_listener *listener = handle; + struct qed_hwfn *p_hwfn = rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "handle=%p\n", handle); + + spin_lock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + list_del(&listener->list_entry); + spin_unlock_bh(&p_hwfn->p_rdma_info->iwarp.iw_lock); + + kfree(listener); + + return 0; +} + +int qed_iwarp_send_rtr(void *rdma_cxt, struct qed_iwarp_send_rtr_in *iparams) +{ + struct qed_hwfn *p_hwfn = rdma_cxt; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + struct qed_iwarp_ep *ep; + struct qed_rdma_qp *qp; + int rc; + + ep = iparams->ep_context; + if (!ep) { + DP_ERR(p_hwfn, "Ep Context receive in send_rtr is NULL\n"); + return -EINVAL; + } + + qp = ep->qp; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "QP(0x%x) EP(0x%x)\n", + qp->icid, ep->tcp_cid); + + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_CB; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + IWARP_RAMROD_CMD_ID_MPA_OFFLOAD_SEND_RTR, + PROTOCOLID_IWARP, &init_data); + + if (rc) + return rc; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = 0x%x\n", rc); + + return rc; +} + +void +qed_iwarp_query_qp(struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *out_params) +{ + out_params->state = qed_iwarp2roce_state(qp->iwarp_state); +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h new file mode 100644 index 0000000..b8f612d --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h @@ -0,0 +1,232 @@ +/* QLogic qed NIC Driver + * Copyright (c) 
2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _QED_IWARP_H +#define _QED_IWARP_H + +enum qed_iwarp_qp_state { + QED_IWARP_QP_STATE_IDLE, + QED_IWARP_QP_STATE_RTS, + QED_IWARP_QP_STATE_TERMINATE, + QED_IWARP_QP_STATE_CLOSING, + QED_IWARP_QP_STATE_ERROR, +}; + +enum qed_iwarp_qp_state qed_roce2iwarp_state(enum qed_roce_qp_state state); + +#define QED_IWARP_PREALLOC_CNT (256) + +#define QED_IWARP_LL2_SYN_TX_SIZE (128) +#define QED_IWARP_LL2_SYN_RX_SIZE (256) +#define QED_IWARP_MAX_SYN_PKT_SIZE (128) + +#define QED_IWARP_LL2_OOO_DEF_TX_SIZE (256) +#define QED_IWARP_MAX_OOO (16) +#define QED_IWARP_LL2_OOO_MAX_RX_SIZE (16384) + +#define QED_IWARP_HANDLE_INVAL (0xff) + +struct qed_iwarp_ll2_buff { + struct qed_iwarp_ll2_buff *piggy_buf; + void *data; + dma_addr_t data_phys_addr; + u32 buff_size; +}; + +struct qed_iwarp_ll2_mpa_buf { + struct list_head list_entry; + struct qed_iwarp_ll2_buff *ll2_buf; + struct unaligned_opaque_data data; + u16 tcp_payload_len; + u8 placement_offset; +}; + +/* In some cases a fpdu will arrive with only one byte of the header, in this + * case the fpdu_length will be partial (contain only higher byte and + * incomplete bytes will contain the invalid value + */ +#define QED_IWARP_INVALID_INCOMPLETE_BYTES 0xffff + +struct qed_iwarp_fpdu { + struct qed_iwarp_ll2_buff *mpa_buf; + void *mpa_frag_virt; + dma_addr_t mpa_frag; + dma_addr_t pkt_hdr; + u16 mpa_frag_len; + u16 fpdu_length; + u16 incomplete_bytes; + u8 pkt_hdr_size; +}; + +struct qed_iwarp_info { + struct list_head listen_list; /* qed_iwarp_listener */ + struct list_head ep_list; /* qed_iwarp_ep */ + struct list_head ep_free_list; /* pre-allocated ep's */ + struct list_head mpa_buf_list; /* list of mpa_bufs */ + struct list_head mpa_buf_pending_list; + spinlock_t iw_lock; /* for iwarp resources */ + spinlock_t qp_lock; /* for teardown races */ + u32 rcv_wnd_scale; + u16 rcv_wnd_size; + u16 max_mtu; + u8 mac_addr[ETH_ALEN]; + u8 crc_needed; + u8 tcp_flags; + u8 ll2_syn_handle; + u8 ll2_ooo_handle; + u8 ll2_mpa_handle; + u8 peer2peer; + enum mpa_negotiation_mode mpa_rev; + enum mpa_rtr_type rtr_type; + struct qed_iwarp_fpdu *partial_fpdus; + struct qed_iwarp_ll2_mpa_buf *mpa_bufs; + u8 *mpa_intermediate_buf; + 
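As the comment above QED_IWARP_INVALID_INCOMPLETE_BYTES notes, an FPDU can arrive with only one byte of its length header, so the parser has to remember what it already has and how many bytes are still missing. A simplified sketch of that bookkeeping (framing details such as byte order and exactly what the length field covers are glossed over):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define INVALID_INCOMPLETE 0xffff

struct fpdu_state {
	uint8_t hdr[2];
	uint8_t hdr_have;	/* header bytes collected so far */
	uint16_t fpdu_length;	/* valid once hdr_have == 2      */
	uint16_t incomplete;	/* payload bytes still expected  */
};

static void feed(struct fpdu_state *s, const uint8_t *buf, size_t len)
{
	while (s->hdr_have < 2 && len) {	/* finish the length header first */
		s->hdr[s->hdr_have++] = *buf++;
		len--;
	}
	if (s->hdr_have < 2) {
		s->incomplete = INVALID_INCOMPLETE;	/* length not known yet */
		return;
	}
	s->fpdu_length = (uint16_t)((s->hdr[0] << 8) | s->hdr[1]);
	s->incomplete = (uint16_t)(s->fpdu_length > len ?
				   s->fpdu_length - len : 0);
}

int main(void)
{
	struct fpdu_state s;
	uint8_t first[] = { 0x00 };			/* only 1 header byte arrives */
	uint8_t second[] = { 0x10, 1, 2, 3, 4 };	/* rest of header + 4 bytes   */

	memset(&s, 0, sizeof(s));
	feed(&s, first, sizeof(first));
	printf("after 1st buffer: incomplete=0x%x\n", s.incomplete);
	feed(&s, second, sizeof(second));
	printf("after 2nd buffer: length=%u still_missing=%u\n",
	       s.fpdu_length, s.incomplete);
	return 0;
}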
u16 max_num_partial_fpdus; +}; + +enum qed_iwarp_ep_state { + QED_IWARP_EP_INIT, + QED_IWARP_EP_MPA_REQ_RCVD, + QED_IWARP_EP_MPA_OFFLOADED, + QED_IWARP_EP_ESTABLISHED, + QED_IWARP_EP_CLOSED +}; + +union async_output { + struct iwarp_eqe_data_mpa_async_completion mpa_response; + struct iwarp_eqe_data_tcp_async_completion mpa_request; +}; + +#define QED_MAX_PRIV_DATA_LEN (512) +struct qed_iwarp_ep_memory { + u8 in_pdata[QED_MAX_PRIV_DATA_LEN]; + u8 out_pdata[QED_MAX_PRIV_DATA_LEN]; + union async_output async_output; +}; + +/* Endpoint structure represents a TCP connection. This connection can be + * associated with a QP or not (in which case QP==NULL) + */ +struct qed_iwarp_ep { + struct list_head list_entry; + struct qed_rdma_qp *qp; + struct qed_iwarp_ep_memory *ep_buffer_virt; + dma_addr_t ep_buffer_phys; + enum qed_iwarp_ep_state state; + int sig; + struct qed_iwarp_cm_info cm_info; + enum tcp_connect_mode connect_mode; + enum mpa_rtr_type rtr_type; + enum mpa_negotiation_mode mpa_rev; + u32 tcp_cid; + u32 cid; + u16 mss; + u8 remote_mac_addr[6]; + u8 local_mac_addr[6]; + bool mpa_reply_processed; + + /* For Passive side - syn packet related data */ + u16 syn_ip_payload_length; + struct qed_iwarp_ll2_buff *syn; + dma_addr_t syn_phy_addr; + + /* The event_cb function is called for asynchrounous events associated + * with the ep. It is initialized at different entry points depending + * on whether the ep is the tcp connection active side or passive side + * The cb_context is passed to the event_cb function. + */ + iwarp_event_handler event_cb; + void *cb_context; +}; + +struct qed_iwarp_listener { + struct list_head list_entry; + + /* The event_cb function is called for connection requests. + * The cb_context is passed to the event_cb function. + */ + iwarp_event_handler event_cb; + void *cb_context; + u32 max_backlog; + u32 ip_addr[4]; + u16 port; + u16 vlan; + u8 ip_version; +}; + +int qed_iwarp_alloc(struct qed_hwfn *p_hwfn); + +int qed_iwarp_setup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + struct qed_rdma_start_in_params *params); + +void qed_iwarp_init_fw_ramrod(struct qed_hwfn *p_hwfn, + struct iwarp_init_func_ramrod_data *p_ramrod); + +int qed_iwarp_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn); + +void qed_iwarp_init_devinfo(struct qed_hwfn *p_hwfn); + +void qed_iwarp_init_hw(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +int qed_iwarp_create_qp(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + struct qed_rdma_create_qp_out_params *out_params); + +int qed_iwarp_modify_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp, + enum qed_iwarp_qp_state new_state, bool internal); + +int qed_iwarp_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp); + +int qed_iwarp_fw_destroy(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp); + +void qed_iwarp_query_qp(struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *out_params); + +int +qed_iwarp_connect(void *rdma_cxt, + struct qed_iwarp_connect_in *iparams, + struct qed_iwarp_connect_out *oparams); + +int +qed_iwarp_create_listen(void *rdma_cxt, + struct qed_iwarp_listen_in *iparams, + struct qed_iwarp_listen_out *oparams); + +int qed_iwarp_accept(void *rdma_cxt, struct qed_iwarp_accept_in *iparams); + +int qed_iwarp_reject(void *rdma_cxt, struct qed_iwarp_reject_in *iparams); +int qed_iwarp_destroy_listen(void *rdma_cxt, void *handle); + +int qed_iwarp_send_rtr(void *rdma_cxt, struct qed_iwarp_send_rtr_in *iparams); + +#endif diff --git 
a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c new file mode 100644 index 0000000..8667799 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -0,0 +1,2900 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include +#include "qed_cxt.h" +#include "qed_dev_api.h" +#include +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_int.h" +#include "qed_l2.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" +#include "qed_sriov.h" + + +#define QED_MAX_SGES_NUM 16 +#define CRC32_POLY 0x1edc6f41 + +struct qed_l2_info { + u32 queues; + unsigned long **pp_qid_usage; + + /* The lock is meant to synchronize access to the qid usage */ + struct mutex lock; +}; + +int qed_l2_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_l2_info *p_l2_info; + unsigned long **pp_qids; + u32 i; + + if (!QED_IS_L2_PERSONALITY(p_hwfn)) + return 0; + + p_l2_info = kzalloc(sizeof(*p_l2_info), GFP_KERNEL); + if (!p_l2_info) + return -ENOMEM; + p_hwfn->p_l2_info = p_l2_info; + + if (IS_PF(p_hwfn->cdev)) { + p_l2_info->queues = RESC_NUM(p_hwfn, QED_L2_QUEUE); + } else { + u8 rx = 0, tx = 0; + + qed_vf_get_num_rxqs(p_hwfn, &rx); + qed_vf_get_num_txqs(p_hwfn, &tx); + + p_l2_info->queues = max_t(u8, rx, tx); + } + + pp_qids = kzalloc(sizeof(unsigned long *) * p_l2_info->queues, + GFP_KERNEL); + if (!pp_qids) + return -ENOMEM; + p_l2_info->pp_qid_usage = pp_qids; + + for (i = 0; i < p_l2_info->queues; i++) { + pp_qids[i] = kzalloc(MAX_QUEUES_PER_QZONE / 8, GFP_KERNEL); + if (!pp_qids[i]) + return -ENOMEM; + } + + return 0; +} + +void qed_l2_setup(struct qed_hwfn *p_hwfn) +{ + if (!QED_IS_L2_PERSONALITY(p_hwfn)) + return; + + mutex_init(&p_hwfn->p_l2_info->lock); +} + +void qed_l2_free(struct qed_hwfn *p_hwfn) +{ + u32 i; + + if (!QED_IS_L2_PERSONALITY(p_hwfn)) + return; + + if (!p_hwfn->p_l2_info) + return; + + if (!p_hwfn->p_l2_info->pp_qid_usage) + goto out_l2_info; + + /* Free until hit first uninitialized entry */ + for (i = 0; i < 
p_hwfn->p_l2_info->queues; i++) { + if (!p_hwfn->p_l2_info->pp_qid_usage[i]) + break; + kfree(p_hwfn->p_l2_info->pp_qid_usage[i]); + } + + kfree(p_hwfn->p_l2_info->pp_qid_usage); + +out_l2_info: + kfree(p_hwfn->p_l2_info); + p_hwfn->p_l2_info = NULL; +} + +static bool qed_eth_queue_qid_usage_add(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid) +{ + struct qed_l2_info *p_l2_info = p_hwfn->p_l2_info; + u16 queue_id = p_cid->rel.queue_id; + bool b_rc = true; + u8 first; + + mutex_lock(&p_l2_info->lock); + + if (queue_id >= p_l2_info->queues) { + DP_NOTICE(p_hwfn, + "Requested to increase usage for qzone %04x out of %08x\n", + queue_id, p_l2_info->queues); + b_rc = false; + goto out; + } + + first = (u8)find_first_zero_bit(p_l2_info->pp_qid_usage[queue_id], + MAX_QUEUES_PER_QZONE); + if (first >= MAX_QUEUES_PER_QZONE) { + b_rc = false; + goto out; + } + + __set_bit(first, p_l2_info->pp_qid_usage[queue_id]); + p_cid->qid_usage_idx = first; + +out: + mutex_unlock(&p_l2_info->lock); + return b_rc; +} + +static void qed_eth_queue_qid_usage_del(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid) +{ + mutex_lock(&p_hwfn->p_l2_info->lock); + + clear_bit(p_cid->qid_usage_idx, + p_hwfn->p_l2_info->pp_qid_usage[p_cid->rel.queue_id]); + + mutex_unlock(&p_hwfn->p_l2_info->lock); +} + +void qed_eth_queue_cid_release(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid) +{ + bool b_legacy_vf = !!(p_cid->vf_legacy & QED_QCID_LEGACY_VF_CID); + + if (IS_PF(p_hwfn->cdev) && !b_legacy_vf) + _qed_cxt_release_cid(p_hwfn, p_cid->cid, p_cid->vfid); + + /* For PF's VFs we maintain the index inside queue-zone in IOV */ + if (p_cid->vfid == QED_QUEUE_CID_SELF) + qed_eth_queue_qid_usage_del(p_hwfn, p_cid); + + vfree(p_cid); +} + +/* The internal is only meant to be directly called by PFs initializeing CIDs + * for their VFs. + */ +static struct qed_queue_cid * +_qed_eth_queue_to_cid(struct qed_hwfn *p_hwfn, + u16 opaque_fid, + u32 cid, + struct qed_queue_start_common_params *p_params, + bool b_is_rx, + struct qed_queue_cid_vf_params *p_vf_params) +{ + struct qed_queue_cid *p_cid; + int rc; + + p_cid = vzalloc(sizeof(*p_cid)); + if (!p_cid) + return NULL; + + p_cid->opaque_fid = opaque_fid; + p_cid->cid = cid; + p_cid->p_owner = p_hwfn; + + /* Fill in parameters */ + p_cid->rel.vport_id = p_params->vport_id; + p_cid->rel.queue_id = p_params->queue_id; + p_cid->rel.stats_id = p_params->stats_id; + p_cid->sb_igu_id = p_params->p_sb->igu_sb_id; + p_cid->b_is_rx = b_is_rx; + p_cid->sb_idx = p_params->sb_idx; + + /* Fill-in bits related to VFs' queues if information was provided */ + if (p_vf_params) { + p_cid->vfid = p_vf_params->vfid; + p_cid->vf_qid = p_vf_params->vf_qid; + p_cid->vf_legacy = p_vf_params->vf_legacy; + } else { + p_cid->vfid = QED_QUEUE_CID_SELF; + } + + /* Don't try calculating the absolute indices for VFs */ + if (IS_VF(p_hwfn->cdev)) { + p_cid->abs = p_cid->rel; + goto out; + } + + /* Calculate the engine-absolute indices of the resources. + * This would guarantee they're valid later on. + * In some cases [SBs] we already have the right values. + */ + rc = qed_fw_vport(p_hwfn, p_cid->rel.vport_id, &p_cid->abs.vport_id); + if (rc) + goto fail; + + rc = qed_fw_l2_queue(p_hwfn, p_cid->rel.queue_id, &p_cid->abs.queue_id); + if (rc) + goto fail; + + /* In case of a PF configuring its VF's queues, the stats-id is already + * absolute [since there's a single index that's suitable per-VF]. 
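qed_eth_queue_qid_usage_add() and qed_eth_queue_qid_usage_del() implement a small per-queue-zone id allocator: take the first clear bit in the zone's bitmap, hand that index out, and clear the bit again on release. A userspace sketch of the same idea with a 32-bit word standing in for the bitmap (the real code uses find_first_zero_bit()/__set_bit() under p_l2_info->lock and MAX_QUEUES_PER_QZONE rather than the size assumed here):

#include <stdio.h>
#include <stdint.h>

#define MAX_QIDS_PER_ZONE 32	/* illustrative; not the driver's constant */

static int qid_alloc(uint32_t *usage)
{
	int i;

	for (i = 0; i < MAX_QIDS_PER_ZONE; i++) {
		if (!(*usage & (1u << i))) {
			*usage |= 1u << i;	/* like __set_bit() */
			return i;
		}
	}
	return -1;			/* zone exhausted */
}

static void qid_free(uint32_t *usage, int qid)
{
	*usage &= ~(1u << qid);		/* like clear_bit() */
}

int main(void)
{
	uint32_t zone = 0;
	int a = qid_alloc(&zone);
	int b = qid_alloc(&zone);

	printf("allocated qids %d and %d\n", a, b);		/* 0 and 1  */
	qid_free(&zone, a);
	printf("after free, next qid is %d\n", qid_alloc(&zone));	/* 0 again */
	return 0;
}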
+ */ + if (p_cid->vfid == QED_QUEUE_CID_SELF) { + rc = qed_fw_vport(p_hwfn, p_cid->rel.stats_id, + &p_cid->abs.stats_id); + if (rc) + goto fail; + } else { + p_cid->abs.stats_id = p_cid->rel.stats_id; + } + +out: + /* VF-images have provided the qid_usage_idx on their own. + * Otherwise, we need to allocate a unique one. + */ + if (!p_vf_params) { + if (!qed_eth_queue_qid_usage_add(p_hwfn, p_cid)) + goto fail; + } else { + p_cid->qid_usage_idx = p_vf_params->qid_usage_idx; + } + + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "opaque_fid: %04x CID %08x vport %02x [%02x] qzone %04x.%02x [%04x] stats %02x [%02x] SB %04x PI %02x\n", + p_cid->opaque_fid, + p_cid->cid, + p_cid->rel.vport_id, + p_cid->abs.vport_id, + p_cid->rel.queue_id, + p_cid->qid_usage_idx, + p_cid->abs.queue_id, + p_cid->rel.stats_id, + p_cid->abs.stats_id, p_cid->sb_igu_id, p_cid->sb_idx); + + return p_cid; + +fail: + vfree(p_cid); + return NULL; +} + +struct qed_queue_cid * +qed_eth_queue_to_cid(struct qed_hwfn *p_hwfn, + u16 opaque_fid, + struct qed_queue_start_common_params *p_params, + bool b_is_rx, + struct qed_queue_cid_vf_params *p_vf_params) +{ + struct qed_queue_cid *p_cid; + u8 vfid = QED_CXT_PF_CID; + bool b_legacy_vf = false; + u32 cid = 0; + + /* In case of legacy VFs, The CID can be derived from the additional + * VF parameters - the VF assumes queue X uses CID X, so we can simply + * use the vf_qid for this purpose as well. + */ + if (p_vf_params) { + vfid = p_vf_params->vfid; + + if (p_vf_params->vf_legacy & QED_QCID_LEGACY_VF_CID) { + b_legacy_vf = true; + cid = p_vf_params->vf_qid; + } + } + + /* Get a unique firmware CID for this queue, in case it's a PF. + * VF's don't need a CID as the queue configuration will be done + * by PF. + */ + if (IS_PF(p_hwfn->cdev) && !b_legacy_vf) { + if (_qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_ETH, + &cid, vfid)) { + DP_NOTICE(p_hwfn, "Failed to acquire cid\n"); + return NULL; + } + } + + p_cid = _qed_eth_queue_to_cid(p_hwfn, opaque_fid, cid, + p_params, b_is_rx, p_vf_params); + if (!p_cid && IS_PF(p_hwfn->cdev) && !b_legacy_vf) + _qed_cxt_release_cid(p_hwfn, cid, vfid); + + return p_cid; +} + +static struct qed_queue_cid * +qed_eth_queue_to_cid_pf(struct qed_hwfn *p_hwfn, + u16 opaque_fid, + bool b_is_rx, + struct qed_queue_start_common_params *p_params) +{ + return qed_eth_queue_to_cid(p_hwfn, opaque_fid, p_params, b_is_rx, + NULL); +} + +int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_start_params *p_params) +{ + struct vport_start_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + u8 abs_vport_id = 0; + int rc = -EINVAL; + u16 rx_mode = 0; + + rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id); + if (rc) + return rc; + + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_params->opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_VPORT_START, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.vport_start; + p_ramrod->vport_id = abs_vport_id; + + p_ramrod->mtu = cpu_to_le16(p_params->mtu); + p_ramrod->handle_ptp_pkts = p_params->handle_ptp_pkts; + p_ramrod->inner_vlan_removal_en = p_params->remove_inner_vlan; + p_ramrod->drop_ttl0_en = p_params->drop_ttl0; + p_ramrod->untagged = p_params->only_untagged; + + SET_FIELD(rx_mode, ETH_VPORT_RX_MODE_UCAST_DROP_ALL, 1); + SET_FIELD(rx_mode, ETH_VPORT_RX_MODE_MCAST_DROP_ALL, 1); + + 
p_ramrod->rx_mode.state = cpu_to_le16(rx_mode); + + /* TPA related fields */ + memset(&p_ramrod->tpa_param, 0, sizeof(struct eth_vport_tpa_param)); + + p_ramrod->tpa_param.max_buff_num = p_params->max_buffers_per_cqe; + + switch (p_params->tpa_mode) { + case QED_TPA_MODE_GRO: + p_ramrod->tpa_param.tpa_max_aggs_num = ETH_TPA_MAX_AGGS_NUM; + p_ramrod->tpa_param.tpa_max_size = (u16)-1; + p_ramrod->tpa_param.tpa_min_size_to_cont = p_params->mtu / 2; + p_ramrod->tpa_param.tpa_min_size_to_start = p_params->mtu / 2; + p_ramrod->tpa_param.tpa_ipv4_en_flg = 1; + p_ramrod->tpa_param.tpa_ipv6_en_flg = 1; + p_ramrod->tpa_param.tpa_pkt_split_flg = 1; + p_ramrod->tpa_param.tpa_gro_consistent_flg = 1; + break; + default: + break; + } + + p_ramrod->tx_switching_en = p_params->tx_switching; + + p_ramrod->ctl_frame_mac_check_en = !!p_params->check_mac; + p_ramrod->ctl_frame_ethtype_check_en = !!p_params->check_ethtype; + + /* Software Function ID in hwfn (PFs are 0 - 15, VFs are 16 - 135) */ + p_ramrod->sw_fid = qed_concrete_to_sw_fid(p_hwfn->cdev, + p_params->concrete_fid); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_vport_start(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_start_params *p_params) +{ + if (IS_VF(p_hwfn->cdev)) { + return qed_vf_pf_vport_start(p_hwfn, p_params->vport_id, + p_params->mtu, + p_params->remove_inner_vlan, + p_params->tpa_mode, + p_params->max_buffers_per_cqe, + p_params->only_untagged); + } + + return qed_sp_eth_vport_start(p_hwfn, p_params); +} + +static int +qed_sp_vport_update_rss(struct qed_hwfn *p_hwfn, + struct vport_update_ramrod_data *p_ramrod, + struct qed_rss_params *p_rss) +{ + struct eth_vport_rss_config *p_config; + u16 capabilities = 0; + int i, table_size; + int rc = 0; + + if (!p_rss) { + p_ramrod->common.update_rss_flg = 0; + return rc; + } + p_config = &p_ramrod->rss_config; + + BUILD_BUG_ON(QED_RSS_IND_TABLE_SIZE != ETH_RSS_IND_TABLE_ENTRIES_NUM); + + rc = qed_fw_rss_eng(p_hwfn, p_rss->rss_eng_id, &p_config->rss_id); + if (rc) + return rc; + + p_ramrod->common.update_rss_flg = p_rss->update_rss_config; + p_config->update_rss_capabilities = p_rss->update_rss_capabilities; + p_config->update_rss_ind_table = p_rss->update_rss_ind_table; + p_config->update_rss_key = p_rss->update_rss_key; + + p_config->rss_mode = p_rss->rss_enable ? 
+ ETH_VPORT_RSS_MODE_REGULAR : + ETH_VPORT_RSS_MODE_DISABLED; + + SET_FIELD(capabilities, + ETH_VPORT_RSS_CONFIG_IPV4_CAPABILITY, + !!(p_rss->rss_caps & QED_RSS_IPV4)); + SET_FIELD(capabilities, + ETH_VPORT_RSS_CONFIG_IPV6_CAPABILITY, + !!(p_rss->rss_caps & QED_RSS_IPV6)); + SET_FIELD(capabilities, + ETH_VPORT_RSS_CONFIG_IPV4_TCP_CAPABILITY, + !!(p_rss->rss_caps & QED_RSS_IPV4_TCP)); + SET_FIELD(capabilities, + ETH_VPORT_RSS_CONFIG_IPV6_TCP_CAPABILITY, + !!(p_rss->rss_caps & QED_RSS_IPV6_TCP)); + SET_FIELD(capabilities, + ETH_VPORT_RSS_CONFIG_IPV4_UDP_CAPABILITY, + !!(p_rss->rss_caps & QED_RSS_IPV4_UDP)); + SET_FIELD(capabilities, + ETH_VPORT_RSS_CONFIG_IPV6_UDP_CAPABILITY, + !!(p_rss->rss_caps & QED_RSS_IPV6_UDP)); + p_config->tbl_size = p_rss->rss_table_size_log; + + p_config->capabilities = cpu_to_le16(capabilities); + + DP_VERBOSE(p_hwfn, NETIF_MSG_IFUP, + "update rss flag %d, rss_mode = %d, update_caps = %d, capabilities = %d, update_ind = %d, update_rss_key = %d\n", + p_ramrod->common.update_rss_flg, + p_config->rss_mode, + p_config->update_rss_capabilities, + p_config->capabilities, + p_config->update_rss_ind_table, p_config->update_rss_key); + + table_size = min_t(int, QED_RSS_IND_TABLE_SIZE, + 1 << p_config->tbl_size); + for (i = 0; i < table_size; i++) { + struct qed_queue_cid *p_queue = p_rss->rss_ind_table[i]; + + if (!p_queue) + return -EINVAL; + + p_config->indirection_table[i] = + cpu_to_le16(p_queue->abs.queue_id); + } + + DP_VERBOSE(p_hwfn, NETIF_MSG_IFUP, + "Configured RSS indirection table [%d entries]:\n", + table_size); + for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i += 0x10) { + DP_VERBOSE(p_hwfn, + NETIF_MSG_IFUP, + "%04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x %04x\n", + le16_to_cpu(p_config->indirection_table[i]), + le16_to_cpu(p_config->indirection_table[i + 1]), + le16_to_cpu(p_config->indirection_table[i + 2]), + le16_to_cpu(p_config->indirection_table[i + 3]), + le16_to_cpu(p_config->indirection_table[i + 4]), + le16_to_cpu(p_config->indirection_table[i + 5]), + le16_to_cpu(p_config->indirection_table[i + 6]), + le16_to_cpu(p_config->indirection_table[i + 7]), + le16_to_cpu(p_config->indirection_table[i + 8]), + le16_to_cpu(p_config->indirection_table[i + 9]), + le16_to_cpu(p_config->indirection_table[i + 10]), + le16_to_cpu(p_config->indirection_table[i + 11]), + le16_to_cpu(p_config->indirection_table[i + 12]), + le16_to_cpu(p_config->indirection_table[i + 13]), + le16_to_cpu(p_config->indirection_table[i + 14]), + le16_to_cpu(p_config->indirection_table[i + 15])); + } + + for (i = 0; i < 10; i++) + p_config->rss_key[i] = cpu_to_le32(p_rss->rss_key[i]); + + return rc; +} + +static void +qed_sp_update_accept_mode(struct qed_hwfn *p_hwfn, + struct vport_update_ramrod_data *p_ramrod, + struct qed_filter_accept_flags accept_flags) +{ + p_ramrod->common.update_rx_mode_flg = + accept_flags.update_rx_mode_config; + + p_ramrod->common.update_tx_mode_flg = + accept_flags.update_tx_mode_config; + + /* Set Rx mode accept flags */ + if (p_ramrod->common.update_rx_mode_flg) { + u8 accept_filter = accept_flags.rx_accept_filter; + u16 state = 0; + + SET_FIELD(state, ETH_VPORT_RX_MODE_UCAST_DROP_ALL, + !(!!(accept_filter & QED_ACCEPT_UCAST_MATCHED) || + !!(accept_filter & QED_ACCEPT_UCAST_UNMATCHED))); + + SET_FIELD(state, ETH_VPORT_RX_MODE_UCAST_ACCEPT_UNMATCHED, + !!(accept_filter & QED_ACCEPT_UCAST_UNMATCHED)); + + SET_FIELD(state, ETH_VPORT_RX_MODE_MCAST_DROP_ALL, + !(!!(accept_filter & QED_ACCEPT_MCAST_MATCHED) || + !!(accept_filter & 
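qed_sp_vport_update_rss() copies min(QED_RSS_IND_TABLE_SIZE, 1 << tbl_size) entries into the ramrod, each holding the absolute id of the rx queue behind that slot. A sketch of how such an indirection table is typically seeded, spreading a handful of rx queues round-robin across the slots (the sizes below are assumed for illustration):

#include <stdio.h>
#include <stdint.h>

#define IND_TABLE_SIZE 128	/* assumed table array size */

int main(void)
{
	uint16_t table[IND_TABLE_SIZE];
	unsigned int tbl_size_log = 7;		/* 1 << 7 = 128 slots */
	unsigned int num_rx_queues = 6;
	unsigned int slots = 1u << tbl_size_log;
	unsigned int i;

	if (slots > IND_TABLE_SIZE)
		slots = IND_TABLE_SIZE;		/* same min() capping as above */

	for (i = 0; i < slots; i++)
		table[i] = (uint16_t)(i % num_rx_queues);

	printf("slot 0 -> q%u, slot 1 -> q%u, slot %u -> q%u\n",
	       table[0], table[1], slots - 1, table[slots - 1]);
	return 0;
}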
QED_ACCEPT_MCAST_UNMATCHED))); + + SET_FIELD(state, ETH_VPORT_RX_MODE_MCAST_ACCEPT_ALL, + (!!(accept_filter & QED_ACCEPT_MCAST_MATCHED) && + !!(accept_filter & QED_ACCEPT_MCAST_UNMATCHED))); + + SET_FIELD(state, ETH_VPORT_RX_MODE_BCAST_ACCEPT_ALL, + !!(accept_filter & QED_ACCEPT_BCAST)); + + p_ramrod->rx_mode.state = cpu_to_le16(state); + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "p_ramrod->rx_mode.state = 0x%x\n", state); + } + + /* Set Tx mode accept flags */ + if (p_ramrod->common.update_tx_mode_flg) { + u8 accept_filter = accept_flags.tx_accept_filter; + u16 state = 0; + + SET_FIELD(state, ETH_VPORT_TX_MODE_UCAST_DROP_ALL, + !!(accept_filter & QED_ACCEPT_NONE)); + + SET_FIELD(state, ETH_VPORT_TX_MODE_MCAST_DROP_ALL, + !!(accept_filter & QED_ACCEPT_NONE)); + + SET_FIELD(state, ETH_VPORT_TX_MODE_MCAST_ACCEPT_ALL, + (!!(accept_filter & QED_ACCEPT_MCAST_MATCHED) && + !!(accept_filter & QED_ACCEPT_MCAST_UNMATCHED))); + + SET_FIELD(state, ETH_VPORT_TX_MODE_BCAST_ACCEPT_ALL, + !!(accept_filter & QED_ACCEPT_BCAST)); + + p_ramrod->tx_mode.state = cpu_to_le16(state); + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "p_ramrod->tx_mode.state = 0x%x\n", state); + } +} + +static void +qed_sp_vport_update_sge_tpa(struct qed_hwfn *p_hwfn, + struct vport_update_ramrod_data *p_ramrod, + struct qed_sge_tpa_params *p_params) +{ + struct eth_vport_tpa_param *p_tpa; + + if (!p_params) { + p_ramrod->common.update_tpa_param_flg = 0; + p_ramrod->common.update_tpa_en_flg = 0; + p_ramrod->common.update_tpa_param_flg = 0; + return; + } + + p_ramrod->common.update_tpa_en_flg = p_params->update_tpa_en_flg; + p_tpa = &p_ramrod->tpa_param; + p_tpa->tpa_ipv4_en_flg = p_params->tpa_ipv4_en_flg; + p_tpa->tpa_ipv6_en_flg = p_params->tpa_ipv6_en_flg; + p_tpa->tpa_ipv4_tunn_en_flg = p_params->tpa_ipv4_tunn_en_flg; + p_tpa->tpa_ipv6_tunn_en_flg = p_params->tpa_ipv6_tunn_en_flg; + + p_ramrod->common.update_tpa_param_flg = p_params->update_tpa_param_flg; + p_tpa->max_buff_num = p_params->max_buffers_per_cqe; + p_tpa->tpa_pkt_split_flg = p_params->tpa_pkt_split_flg; + p_tpa->tpa_hdr_data_split_flg = p_params->tpa_hdr_data_split_flg; + p_tpa->tpa_gro_consistent_flg = p_params->tpa_gro_consistent_flg; + p_tpa->tpa_max_aggs_num = p_params->tpa_max_aggs_num; + p_tpa->tpa_max_size = p_params->tpa_max_size; + p_tpa->tpa_min_size_to_start = p_params->tpa_min_size_to_start; + p_tpa->tpa_min_size_to_cont = p_params->tpa_min_size_to_cont; +} + +static void +qed_sp_update_mcast_bin(struct qed_hwfn *p_hwfn, + struct vport_update_ramrod_data *p_ramrod, + struct qed_sp_vport_update_params *p_params) +{ + int i; + + memset(&p_ramrod->approx_mcast.bins, 0, + sizeof(p_ramrod->approx_mcast.bins)); + + if (!p_params->update_approx_mcast_flg) + return; + + p_ramrod->common.update_approx_mcast_flg = 1; + for (i = 0; i < ETH_MULTICAST_MAC_BINS_IN_REGS; i++) { + u32 *p_bins = (u32 *)p_params->bins; + + p_ramrod->approx_mcast.bins[i] = cpu_to_le32(p_bins[i]); + } +} + +int qed_sp_vport_update(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_params, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data) +{ + struct qed_rss_params *p_rss_params = p_params->rss_params; + struct vport_update_ramrod_data_cmn *p_cmn; + struct qed_sp_init_data init_data; + struct vport_update_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + u8 abs_vport_id = 0, val; + int rc = -EINVAL; + + if (IS_VF(p_hwfn->cdev)) { + rc = qed_vf_pf_vport_update(p_hwfn, p_params); + return rc; + } + + rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id); + if 
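The accept-mode code builds the 16-bit rx/tx state words bit by bit with SET_FIELD(). A generic sketch of that mask/shift idiom with made-up field definitions; qed's real SET_FIELD() macro and the ETH_VPORT_*_MASK/_SHIFT values come from its HSI headers:

#include <stdio.h>
#include <stdint.h>

#define UCAST_DROP_ALL_MASK	0x1
#define UCAST_DROP_ALL_SHIFT	0
#define MCAST_DROP_ALL_MASK	0x1
#define MCAST_DROP_ALL_SHIFT	1
#define BCAST_ACCEPT_ALL_MASK	0x1
#define BCAST_ACCEPT_ALL_SHIFT	2

#define SET_FIELD(value, name, flag)					\
	do {								\
		(value) &= ~((name##_MASK) << (name##_SHIFT));		\
		(value) |= ((flag) & (name##_MASK)) << (name##_SHIFT);	\
	} while (0)

int main(void)
{
	uint16_t state = 0;

	SET_FIELD(state, UCAST_DROP_ALL, 1);
	SET_FIELD(state, MCAST_DROP_ALL, 0);
	SET_FIELD(state, BCAST_ACCEPT_ALL, 1);

	printf("state = 0x%x\n", state);	/* 0x5 */
	return 0;
}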
(rc) + return rc; + + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_params->opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_data; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_VPORT_UPDATE, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + /* Copy input params to ramrod according to FW struct */ + p_ramrod = &p_ent->ramrod.vport_update; + p_cmn = &p_ramrod->common; + + p_cmn->vport_id = abs_vport_id; + p_cmn->rx_active_flg = p_params->vport_active_rx_flg; + p_cmn->update_rx_active_flg = p_params->update_vport_active_rx_flg; + p_cmn->tx_active_flg = p_params->vport_active_tx_flg; + p_cmn->update_tx_active_flg = p_params->update_vport_active_tx_flg; + p_cmn->accept_any_vlan = p_params->accept_any_vlan; + val = p_params->update_accept_any_vlan_flg; + p_cmn->update_accept_any_vlan_flg = val; + + p_cmn->inner_vlan_removal_en = p_params->inner_vlan_removal_flg; + val = p_params->update_inner_vlan_removal_flg; + p_cmn->update_inner_vlan_removal_en_flg = val; + + p_cmn->default_vlan_en = p_params->default_vlan_enable_flg; + val = p_params->update_default_vlan_enable_flg; + p_cmn->update_default_vlan_en_flg = val; + + p_cmn->default_vlan = cpu_to_le16(p_params->default_vlan); + p_cmn->update_default_vlan_flg = p_params->update_default_vlan_flg; + + p_cmn->silent_vlan_removal_en = p_params->silent_vlan_removal_flg; + + p_ramrod->common.tx_switching_en = p_params->tx_switching_flg; + p_cmn->update_tx_switching_en_flg = p_params->update_tx_switching_flg; + + p_cmn->anti_spoofing_en = p_params->anti_spoofing_en; + val = p_params->update_anti_spoofing_en_flg; + p_ramrod->common.update_anti_spoofing_en_flg = val; + + rc = qed_sp_vport_update_rss(p_hwfn, p_ramrod, p_rss_params); + if (rc) { + /* Return spq entry which is taken in qed_sp_init_request()*/ + qed_spq_return_entry(p_hwfn, p_ent); + return rc; + } + + /* Update mcast bins for VFs, PF doesn't use this functionality */ + qed_sp_update_mcast_bin(p_hwfn, p_ramrod, p_params); + + qed_sp_update_accept_mode(p_hwfn, p_ramrod, p_params->accept_flags); + qed_sp_vport_update_sge_tpa(p_hwfn, p_ramrod, p_params->sge_tpa_params); + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +int qed_sp_vport_stop(struct qed_hwfn *p_hwfn, u16 opaque_fid, u8 vport_id) +{ + struct vport_stop_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + u8 abs_vport_id = 0; + int rc; + + if (IS_VF(p_hwfn->cdev)) + return qed_vf_pf_vport_stop(p_hwfn); + + rc = qed_fw_vport(p_hwfn, vport_id, &abs_vport_id); + if (rc) + return rc; + + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_VPORT_STOP, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.vport_stop; + p_ramrod->vport_id = abs_vport_id; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int +qed_vf_pf_accept_flags(struct qed_hwfn *p_hwfn, + struct qed_filter_accept_flags *p_accept_flags) +{ + struct qed_sp_vport_update_params s_params; + + memset(&s_params, 0, sizeof(s_params)); + memcpy(&s_params.accept_flags, p_accept_flags, + sizeof(struct qed_filter_accept_flags)); + + return qed_vf_pf_vport_update(p_hwfn, &s_params); +} + +static int qed_filter_accept_cmd(struct qed_dev *cdev, + u8 vport, + struct qed_filter_accept_flags accept_flags, + u8 
update_accept_any_vlan, + u8 accept_any_vlan, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data) +{ + struct qed_sp_vport_update_params vport_update_params; + int i, rc; + + /* Prepare and send the vport rx_mode change */ + memset(&vport_update_params, 0, sizeof(vport_update_params)); + vport_update_params.vport_id = vport; + vport_update_params.accept_flags = accept_flags; + vport_update_params.update_accept_any_vlan_flg = update_accept_any_vlan; + vport_update_params.accept_any_vlan = accept_any_vlan; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + vport_update_params.opaque_fid = p_hwfn->hw_info.opaque_fid; + + if (IS_VF(cdev)) { + rc = qed_vf_pf_accept_flags(p_hwfn, &accept_flags); + if (rc) + return rc; + continue; + } + + rc = qed_sp_vport_update(p_hwfn, &vport_update_params, + comp_mode, p_comp_data); + if (rc) { + DP_ERR(cdev, "Update rx_mode failed %d\n", rc); + return rc; + } + + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "Accept filter configured, flags = [Rx]%x [Tx]%x\n", + accept_flags.rx_accept_filter, + accept_flags.tx_accept_filter); + if (update_accept_any_vlan) + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "accept_any_vlan=%d configured\n", + accept_any_vlan); + } + + return 0; +} + +int qed_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + u16 bd_max_bytes, + dma_addr_t bd_chain_phys_addr, + dma_addr_t cqe_pbl_addr, u16 cqe_pbl_size) +{ + struct rx_queue_start_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "opaque_fid=0x%x, cid=0x%x, rx_qzone=0x%x, vport_id=0x%x, sb_id=0x%x\n", + p_cid->opaque_fid, p_cid->cid, + p_cid->abs.queue_id, p_cid->abs.vport_id, p_cid->sb_igu_id); + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_cid->cid; + init_data.opaque_fid = p_cid->opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_RX_QUEUE_START, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.rx_queue_start; + + p_ramrod->sb_id = cpu_to_le16(p_cid->sb_igu_id); + p_ramrod->sb_index = p_cid->sb_idx; + p_ramrod->vport_id = p_cid->abs.vport_id; + p_ramrod->stats_counter_id = p_cid->abs.stats_id; + p_ramrod->rx_queue_id = cpu_to_le16(p_cid->abs.queue_id); + p_ramrod->complete_cqe_flg = 0; + p_ramrod->complete_event_flg = 1; + + p_ramrod->bd_max_bytes = cpu_to_le16(bd_max_bytes); + DMA_REGPAIR_LE(p_ramrod->bd_base, bd_chain_phys_addr); + + p_ramrod->num_of_pbl_pages = cpu_to_le16(cqe_pbl_size); + DMA_REGPAIR_LE(p_ramrod->cqe_pbl_addr, cqe_pbl_addr); + + if (p_cid->vfid != QED_QUEUE_CID_SELF) { + bool b_legacy_vf = !!(p_cid->vf_legacy & + QED_QCID_LEGACY_VF_RX_PROD); + + p_ramrod->vf_rx_prod_index = p_cid->vf_qid; + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "Queue%s is meant for VF rxq[%02x]\n", + b_legacy_vf ? 
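DMA_REGPAIR_LE() above hands the 64-bit bd and CQE-PBL bus addresses to firmware as two 32-bit halves. A sketch of that split (the struct and helper are illustrative, and the little-endian byte swapping done by the real macro is omitted):

#include <stdio.h>
#include <stdint.h>

struct fw_addr_pair {	/* illustrative stand-in for the firmware regpair */
	uint32_t lo;
	uint32_t hi;
};

static void set_regpair(struct fw_addr_pair *rp, uint64_t dma_addr)
{
	rp->lo = (uint32_t)(dma_addr & 0xffffffffu);	/* lower 32 bits */
	rp->hi = (uint32_t)(dma_addr >> 32);		/* upper 32 bits */
}

int main(void)
{
	struct fw_addr_pair bd_base;
	uint64_t addr = 0x0000001234abcd00ull;

	set_regpair(&bd_base, addr);
	printf("hi=0x%08x lo=0x%08x\n", bd_base.hi, bd_base.lo);
	return 0;
}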
" [legacy]" : "", p_cid->vf_qid); + p_ramrod->vf_rx_prod_use_zone_a = b_legacy_vf; + } + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int +qed_eth_pf_rx_queue_start(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + u16 bd_max_bytes, + dma_addr_t bd_chain_phys_addr, + dma_addr_t cqe_pbl_addr, + u16 cqe_pbl_size, void __iomem **pp_prod) +{ + u32 init_prod_val = 0; + + *pp_prod = p_hwfn->regview + + GTT_BAR0_MAP_REG_MSDM_RAM + + MSTORM_ETH_PF_PRODS_OFFSET(p_cid->abs.queue_id); + + /* Init the rcq, rx bd and rx sge (if valid) producers to 0 */ + __internal_ram_wr(p_hwfn, *pp_prod, sizeof(u32), + (u32 *)(&init_prod_val)); + + return qed_eth_rxq_start_ramrod(p_hwfn, p_cid, + bd_max_bytes, + bd_chain_phys_addr, + cqe_pbl_addr, cqe_pbl_size); +} + +static int +qed_eth_rx_queue_start(struct qed_hwfn *p_hwfn, + u16 opaque_fid, + struct qed_queue_start_common_params *p_params, + u16 bd_max_bytes, + dma_addr_t bd_chain_phys_addr, + dma_addr_t cqe_pbl_addr, + u16 cqe_pbl_size, + struct qed_rxq_start_ret_params *p_ret_params) +{ + struct qed_queue_cid *p_cid; + int rc; + + /* Allocate a CID for the queue */ + p_cid = qed_eth_queue_to_cid_pf(p_hwfn, opaque_fid, true, p_params); + if (!p_cid) + return -ENOMEM; + + if (IS_PF(p_hwfn->cdev)) { + rc = qed_eth_pf_rx_queue_start(p_hwfn, p_cid, + bd_max_bytes, + bd_chain_phys_addr, + cqe_pbl_addr, cqe_pbl_size, + &p_ret_params->p_prod); + } else { + rc = qed_vf_pf_rxq_start(p_hwfn, p_cid, + bd_max_bytes, + bd_chain_phys_addr, + cqe_pbl_addr, + cqe_pbl_size, &p_ret_params->p_prod); + } + + /* Provide the caller with a reference to as handler */ + if (rc) + qed_eth_queue_cid_release(p_hwfn, p_cid); + else + p_ret_params->p_handle = (void *)p_cid; + + return rc; +} + +int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn, + void **pp_rxq_handles, + u8 num_rxqs, + u8 complete_cqe_flg, + u8 complete_event_flg, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data) +{ + struct rx_queue_update_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + struct qed_queue_cid *p_cid; + int rc = -EINVAL; + u8 i; + + memset(&init_data, 0, sizeof(init_data)); + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_data; + + for (i = 0; i < num_rxqs; i++) { + p_cid = ((struct qed_queue_cid **)pp_rxq_handles)[i]; + + /* Get SPQ entry */ + init_data.cid = p_cid->cid; + init_data.opaque_fid = p_cid->opaque_fid; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_RX_QUEUE_UPDATE, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.rx_queue_update; + p_ramrod->vport_id = p_cid->abs.vport_id; + + p_ramrod->rx_queue_id = cpu_to_le16(p_cid->abs.queue_id); + p_ramrod->complete_cqe_flg = complete_cqe_flg; + p_ramrod->complete_event_flg = complete_event_flg; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + return rc; + } + + return rc; +} + +static int +qed_eth_pf_rx_queue_stop(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + bool b_eq_completion_only, bool b_cqe_completion) +{ + struct rx_queue_stop_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc; + + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_cid->cid; + init_data.opaque_fid = p_cid->opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_RX_QUEUE_STOP, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.rx_queue_stop; 
+ p_ramrod->vport_id = p_cid->abs.vport_id; + p_ramrod->rx_queue_id = cpu_to_le16(p_cid->abs.queue_id); + + /* Cleaning the queue requires the completion to arrive there. + * In addition, VFs require the answer to come as eqe to PF. + */ + p_ramrod->complete_cqe_flg = ((p_cid->vfid == QED_QUEUE_CID_SELF) && + !b_eq_completion_only) || + b_cqe_completion; + p_ramrod->complete_event_flg = (p_cid->vfid != QED_QUEUE_CID_SELF) || + b_eq_completion_only; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +int qed_eth_rx_queue_stop(struct qed_hwfn *p_hwfn, + void *p_rxq, + bool eq_completion_only, bool cqe_completion) +{ + struct qed_queue_cid *p_cid = (struct qed_queue_cid *)p_rxq; + int rc = -EINVAL; + + if (IS_PF(p_hwfn->cdev)) + rc = qed_eth_pf_rx_queue_stop(p_hwfn, p_cid, + eq_completion_only, + cqe_completion); + else + rc = qed_vf_pf_rxq_stop(p_hwfn, p_cid, cqe_completion); + + if (!rc) + qed_eth_queue_cid_release(p_hwfn, p_cid); + return rc; +} + +int +qed_eth_txq_start_ramrod(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + dma_addr_t pbl_addr, u16 pbl_size, u16 pq_id) +{ + struct tx_queue_start_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_cid->cid; + init_data.opaque_fid = p_cid->opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_TX_QUEUE_START, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.tx_queue_start; + p_ramrod->vport_id = p_cid->abs.vport_id; + + p_ramrod->sb_id = cpu_to_le16(p_cid->sb_igu_id); + p_ramrod->sb_index = p_cid->sb_idx; + p_ramrod->stats_counter_id = p_cid->abs.stats_id; + + p_ramrod->queue_zone_id = cpu_to_le16(p_cid->abs.queue_id); + p_ramrod->same_as_last_id = cpu_to_le16(p_cid->abs.queue_id); + + p_ramrod->pbl_size = cpu_to_le16(pbl_size); + DMA_REGPAIR_LE(p_ramrod->pbl_base_addr, pbl_addr); + + p_ramrod->qm_pq_id = cpu_to_le16(pq_id); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int +qed_eth_pf_tx_queue_start(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + u8 tc, + dma_addr_t pbl_addr, + u16 pbl_size, void __iomem **pp_doorbell) +{ + int rc; + + + rc = qed_eth_txq_start_ramrod(p_hwfn, p_cid, + pbl_addr, pbl_size, + qed_get_cm_pq_idx_mcos(p_hwfn, tc)); + if (rc) + return rc; + + /* Provide the caller with the necessary return values */ + *pp_doorbell = p_hwfn->doorbells + + qed_db_addr(p_cid->cid, DQ_DEMS_LEGACY); + + return 0; +} + +static int +qed_eth_tx_queue_start(struct qed_hwfn *p_hwfn, + u16 opaque_fid, + struct qed_queue_start_common_params *p_params, + u8 tc, + dma_addr_t pbl_addr, + u16 pbl_size, + struct qed_txq_start_ret_params *p_ret_params) +{ + struct qed_queue_cid *p_cid; + int rc; + + p_cid = qed_eth_queue_to_cid_pf(p_hwfn, opaque_fid, false, p_params); + if (!p_cid) + return -EINVAL; + + if (IS_PF(p_hwfn->cdev)) + rc = qed_eth_pf_tx_queue_start(p_hwfn, p_cid, tc, + pbl_addr, pbl_size, + &p_ret_params->p_doorbell); + else + rc = qed_vf_pf_txq_start(p_hwfn, p_cid, + pbl_addr, pbl_size, + &p_ret_params->p_doorbell); + + if (rc) + qed_eth_queue_cid_release(p_hwfn, p_cid); + else + p_ret_params->p_handle = (void *)p_cid; + + return rc; +} + +static int +qed_eth_pf_tx_queue_stop(struct qed_hwfn *p_hwfn, struct qed_queue_cid *p_cid) +{ + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc; + + memset(&init_data, 0, 
sizeof(init_data)); + init_data.cid = p_cid->cid; + init_data.opaque_fid = p_cid->opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_TX_QUEUE_STOP, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +int qed_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, void *p_handle) +{ + struct qed_queue_cid *p_cid = (struct qed_queue_cid *)p_handle; + int rc; + + if (IS_PF(p_hwfn->cdev)) + rc = qed_eth_pf_tx_queue_stop(p_hwfn, p_cid); + else + rc = qed_vf_pf_txq_stop(p_hwfn, p_cid); + + if (!rc) + qed_eth_queue_cid_release(p_hwfn, p_cid); + return rc; +} + +static enum eth_filter_action qed_filter_action(enum qed_filter_opcode opcode) +{ + enum eth_filter_action action = MAX_ETH_FILTER_ACTION; + + switch (opcode) { + case QED_FILTER_ADD: + action = ETH_FILTER_ACTION_ADD; + break; + case QED_FILTER_REMOVE: + action = ETH_FILTER_ACTION_REMOVE; + break; + case QED_FILTER_FLUSH: + action = ETH_FILTER_ACTION_REMOVE_ALL; + break; + default: + action = MAX_ETH_FILTER_ACTION; + } + + return action; +} + +static int +qed_filter_ucast_common(struct qed_hwfn *p_hwfn, + u16 opaque_fid, + struct qed_filter_ucast *p_filter_cmd, + struct vport_filter_update_ramrod_data **pp_ramrod, + struct qed_spq_entry **pp_ent, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data) +{ + u8 vport_to_add_to = 0, vport_to_remove_from = 0; + struct vport_filter_update_ramrod_data *p_ramrod; + struct eth_filter_cmd *p_first_filter; + struct eth_filter_cmd *p_second_filter; + struct qed_sp_init_data init_data; + enum eth_filter_action action; + int rc; + + rc = qed_fw_vport(p_hwfn, p_filter_cmd->vport_to_remove_from, + &vport_to_remove_from); + if (rc) + return rc; + + rc = qed_fw_vport(p_hwfn, p_filter_cmd->vport_to_add_to, + &vport_to_add_to); + if (rc) + return rc; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_data; + + rc = qed_sp_init_request(p_hwfn, pp_ent, + ETH_RAMROD_FILTERS_UPDATE, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + *pp_ramrod = &(*pp_ent)->ramrod.vport_filter_update; + p_ramrod = *pp_ramrod; + p_ramrod->filter_cmd_hdr.rx = p_filter_cmd->is_rx_filter ? 1 : 0; + p_ramrod->filter_cmd_hdr.tx = p_filter_cmd->is_tx_filter ? 
1 : 0; + + switch (p_filter_cmd->opcode) { + case QED_FILTER_REPLACE: + case QED_FILTER_MOVE: + p_ramrod->filter_cmd_hdr.cmd_cnt = 2; break; + default: + p_ramrod->filter_cmd_hdr.cmd_cnt = 1; break; + } + + p_first_filter = &p_ramrod->filter_cmds[0]; + p_second_filter = &p_ramrod->filter_cmds[1]; + + switch (p_filter_cmd->type) { + case QED_FILTER_MAC: + p_first_filter->type = ETH_FILTER_TYPE_MAC; break; + case QED_FILTER_VLAN: + p_first_filter->type = ETH_FILTER_TYPE_VLAN; break; + case QED_FILTER_MAC_VLAN: + p_first_filter->type = ETH_FILTER_TYPE_PAIR; break; + case QED_FILTER_INNER_MAC: + p_first_filter->type = ETH_FILTER_TYPE_INNER_MAC; break; + case QED_FILTER_INNER_VLAN: + p_first_filter->type = ETH_FILTER_TYPE_INNER_VLAN; break; + case QED_FILTER_INNER_PAIR: + p_first_filter->type = ETH_FILTER_TYPE_INNER_PAIR; break; + case QED_FILTER_INNER_MAC_VNI_PAIR: + p_first_filter->type = ETH_FILTER_TYPE_INNER_MAC_VNI_PAIR; + break; + case QED_FILTER_MAC_VNI_PAIR: + p_first_filter->type = ETH_FILTER_TYPE_MAC_VNI_PAIR; break; + case QED_FILTER_VNI: + p_first_filter->type = ETH_FILTER_TYPE_VNI; break; + } + + if ((p_first_filter->type == ETH_FILTER_TYPE_MAC) || + (p_first_filter->type == ETH_FILTER_TYPE_PAIR) || + (p_first_filter->type == ETH_FILTER_TYPE_INNER_MAC) || + (p_first_filter->type == ETH_FILTER_TYPE_INNER_PAIR) || + (p_first_filter->type == ETH_FILTER_TYPE_INNER_MAC_VNI_PAIR) || + (p_first_filter->type == ETH_FILTER_TYPE_MAC_VNI_PAIR)) { + qed_set_fw_mac_addr(&p_first_filter->mac_msb, + &p_first_filter->mac_mid, + &p_first_filter->mac_lsb, + (u8 *)p_filter_cmd->mac); + } + + if ((p_first_filter->type == ETH_FILTER_TYPE_VLAN) || + (p_first_filter->type == ETH_FILTER_TYPE_PAIR) || + (p_first_filter->type == ETH_FILTER_TYPE_INNER_VLAN) || + (p_first_filter->type == ETH_FILTER_TYPE_INNER_PAIR)) + p_first_filter->vlan_id = cpu_to_le16(p_filter_cmd->vlan); + + if ((p_first_filter->type == ETH_FILTER_TYPE_INNER_MAC_VNI_PAIR) || + (p_first_filter->type == ETH_FILTER_TYPE_MAC_VNI_PAIR) || + (p_first_filter->type == ETH_FILTER_TYPE_VNI)) + p_first_filter->vni = cpu_to_le32(p_filter_cmd->vni); + + if (p_filter_cmd->opcode == QED_FILTER_MOVE) { + p_second_filter->type = p_first_filter->type; + p_second_filter->mac_msb = p_first_filter->mac_msb; + p_second_filter->mac_mid = p_first_filter->mac_mid; + p_second_filter->mac_lsb = p_first_filter->mac_lsb; + p_second_filter->vlan_id = p_first_filter->vlan_id; + p_second_filter->vni = p_first_filter->vni; + + p_first_filter->action = ETH_FILTER_ACTION_REMOVE; + + p_first_filter->vport_id = vport_to_remove_from; + + p_second_filter->action = ETH_FILTER_ACTION_ADD; + p_second_filter->vport_id = vport_to_add_to; + } else if (p_filter_cmd->opcode == QED_FILTER_REPLACE) { + p_first_filter->vport_id = vport_to_add_to; + memcpy(p_second_filter, p_first_filter, + sizeof(*p_second_filter)); + p_first_filter->action = ETH_FILTER_ACTION_REMOVE_ALL; + p_second_filter->action = ETH_FILTER_ACTION_ADD; + } else { + action = qed_filter_action(p_filter_cmd->opcode); + + if (action == MAX_ETH_FILTER_ACTION) { + DP_NOTICE(p_hwfn, + "%d is not supported yet\n", + p_filter_cmd->opcode); + return -EINVAL; + } + + p_first_filter->action = action; + p_first_filter->vport_id = (p_filter_cmd->opcode == + QED_FILTER_REMOVE) ? 
+ vport_to_remove_from : + vport_to_add_to; + } + + return 0; +} + +int qed_sp_eth_filter_ucast(struct qed_hwfn *p_hwfn, + u16 opaque_fid, + struct qed_filter_ucast *p_filter_cmd, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data) +{ + struct vport_filter_update_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct eth_filter_cmd_header *p_header; + int rc; + + rc = qed_filter_ucast_common(p_hwfn, opaque_fid, p_filter_cmd, + &p_ramrod, &p_ent, + comp_mode, p_comp_data); + if (rc) { + DP_ERR(p_hwfn, "Uni. filter command failed %d\n", rc); + return rc; + } + p_header = &p_ramrod->filter_cmd_hdr; + p_header->assert_on_error = p_filter_cmd->assert_on_error; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) { + DP_ERR(p_hwfn, "Unicast filter ADD command failed %d\n", rc); + return rc; + } + + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "Unicast filter configured, opcode = %s, type = %s, cmd_cnt = %d, is_rx_filter = %d, is_tx_filter = %d\n", + (p_filter_cmd->opcode == QED_FILTER_ADD) ? "ADD" : + ((p_filter_cmd->opcode == QED_FILTER_REMOVE) ? + "REMOVE" : + ((p_filter_cmd->opcode == QED_FILTER_MOVE) ? + "MOVE" : "REPLACE")), + (p_filter_cmd->type == QED_FILTER_MAC) ? "MAC" : + ((p_filter_cmd->type == QED_FILTER_VLAN) ? + "VLAN" : "MAC & VLAN"), + p_ramrod->filter_cmd_hdr.cmd_cnt, + p_filter_cmd->is_rx_filter, + p_filter_cmd->is_tx_filter); + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "vport_to_add_to = %d, vport_to_remove_from = %d, mac = %2x:%2x:%2x:%2x:%2x:%2x, vlan = %d\n", + p_filter_cmd->vport_to_add_to, + p_filter_cmd->vport_to_remove_from, + p_filter_cmd->mac[0], + p_filter_cmd->mac[1], + p_filter_cmd->mac[2], + p_filter_cmd->mac[3], + p_filter_cmd->mac[4], + p_filter_cmd->mac[5], + p_filter_cmd->vlan); + + return 0; +} + +/******************************************************************************* + * Description: + * Calculates crc 32 on a buffer + * Note: crc32_length MUST be aligned to 8 + * Return: + ******************************************************************************/ +static u32 qed_calc_crc32c(u8 *crc32_packet, + u32 crc32_length, u32 crc32_seed, u8 complement) +{ + u32 byte = 0, bit = 0, crc32_result = crc32_seed; + u8 msb = 0, current_byte = 0; + + if ((!crc32_packet) || + (crc32_length == 0) || + ((crc32_length % 8) != 0)) + return crc32_result; + for (byte = 0; byte < crc32_length; byte++) { + current_byte = crc32_packet[byte]; + for (bit = 0; bit < 8; bit++) { + msb = (u8)(crc32_result >> 31); + crc32_result = crc32_result << 1; + if (msb != (0x1 & (current_byte >> bit))) { + crc32_result = crc32_result ^ CRC32_POLY; + crc32_result |= 1; /*crc32_result[0] = 1;*/ + } + } + } + return crc32_result; +} + +static u32 qed_crc32c_le(u32 seed, u8 *mac, u32 len) +{ + u32 packet_buf[2] = { 0 }; + + memcpy((u8 *)(&packet_buf[0]), &mac[0], 6); + return qed_calc_crc32c((u8 *)packet_buf, 8, seed, 0); +} + +u8 qed_mcast_bin_from_mac(u8 *mac) +{ + u32 crc = qed_crc32c_le(ETH_MULTICAST_BIN_FROM_MAC_SEED, + mac, ETH_ALEN); + + return crc & 0xff; +} + +static int +qed_sp_eth_filter_mcast(struct qed_hwfn *p_hwfn, + u16 opaque_fid, + struct qed_filter_mcast *p_filter_cmd, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data) +{ + unsigned long bins[ETH_MULTICAST_MAC_BINS_IN_REGS]; + struct vport_update_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + u8 abs_vport_id = 0; + int rc, i; + + if (p_filter_cmd->opcode == QED_FILTER_ADD) + rc = qed_fw_vport(p_hwfn, p_filter_cmd->vport_to_add_to, + 
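+	/* Background on the approximate-multicast scheme, with a small
+	 * hedged sketch: each multicast MAC is hashed by
+	 * qed_mcast_bin_from_mac() above - a CRC32c over the six address
+	 * bytes zero-padded to eight, seeded with
+	 * ETH_MULTICAST_BIN_FROM_MAC_SEED - and the low byte selects one of
+	 * 256 bins.  The ADD path below simply does, per address:
+	 *
+	 *	u32 bin = qed_mcast_bin_from_mac(p_filter_cmd->mac[i]);
+	 *	__set_bit(bin, bins);
+	 *
+	 * and the resulting 256-bit vector is copied little-endian into the
+	 * vport-update ramrod.
+	 */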
&abs_vport_id); + else + rc = qed_fw_vport(p_hwfn, p_filter_cmd->vport_to_remove_from, + &abs_vport_id); + if (rc) + return rc; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_data; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_VPORT_UPDATE, + PROTOCOLID_ETH, &init_data); + if (rc) { + DP_ERR(p_hwfn, "Multi-cast command failed %d\n", rc); + return rc; + } + + p_ramrod = &p_ent->ramrod.vport_update; + p_ramrod->common.update_approx_mcast_flg = 1; + + /* explicitly clear out the entire vector */ + memset(&p_ramrod->approx_mcast.bins, 0, + sizeof(p_ramrod->approx_mcast.bins)); + memset(bins, 0, sizeof(unsigned long) * + ETH_MULTICAST_MAC_BINS_IN_REGS); + /* filter ADD op is explicit set op and it removes + * any existing filters for the vport + */ + if (p_filter_cmd->opcode == QED_FILTER_ADD) { + for (i = 0; i < p_filter_cmd->num_mc_addrs; i++) { + u32 bit; + + bit = qed_mcast_bin_from_mac(p_filter_cmd->mac[i]); + __set_bit(bit, bins); + } + + /* Convert to correct endianity */ + for (i = 0; i < ETH_MULTICAST_MAC_BINS_IN_REGS; i++) { + struct vport_update_ramrod_mcast *p_ramrod_bins; + u32 *p_bins = (u32 *)bins; + + p_ramrod_bins = &p_ramrod->approx_mcast; + p_ramrod_bins->bins[i] = cpu_to_le32(p_bins[i]); + } + } + + p_ramrod->common.vport_id = abs_vport_id; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_filter_mcast_cmd(struct qed_dev *cdev, + struct qed_filter_mcast *p_filter_cmd, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data) +{ + int rc = 0; + int i; + + /* only ADD and REMOVE operations are supported for multi-cast */ + if ((p_filter_cmd->opcode != QED_FILTER_ADD && + (p_filter_cmd->opcode != QED_FILTER_REMOVE)) || + (p_filter_cmd->num_mc_addrs > QED_MAX_MC_ADDRS)) + return -EINVAL; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + u16 opaque_fid; + + if (IS_VF(cdev)) { + qed_vf_pf_filter_mcast(p_hwfn, p_filter_cmd); + continue; + } + + opaque_fid = p_hwfn->hw_info.opaque_fid; + + rc = qed_sp_eth_filter_mcast(p_hwfn, + opaque_fid, + p_filter_cmd, + comp_mode, p_comp_data); + } + return rc; +} + +static int qed_filter_ucast_cmd(struct qed_dev *cdev, + struct qed_filter_ucast *p_filter_cmd, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data) +{ + int rc = 0; + int i; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + u16 opaque_fid; + + if (IS_VF(cdev)) { + rc = qed_vf_pf_filter_ucast(p_hwfn, p_filter_cmd); + continue; + } + + opaque_fid = p_hwfn->hw_info.opaque_fid; + + rc = qed_sp_eth_filter_ucast(p_hwfn, + opaque_fid, + p_filter_cmd, + comp_mode, p_comp_data); + if (rc) + break; + } + + return rc; +} + +/* Statistics related code */ +static void __qed_get_vport_pstats_addrlen(struct qed_hwfn *p_hwfn, + u32 *p_addr, + u32 *p_len, u16 statistics_bin) +{ + if (IS_PF(p_hwfn->cdev)) { + *p_addr = BAR0_MAP_REG_PSDM_RAM + + PSTORM_QUEUE_STAT_OFFSET(statistics_bin); + *p_len = sizeof(struct eth_pstorm_per_queue_stat); + } else { + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_acquire_resp_tlv *p_resp = &p_iov->acquire_resp; + + *p_addr = p_resp->pfdev_info.stats_info.pstats.address; + *p_len = p_resp->pfdev_info.stats_info.pstats.len; + } +} + +static void __qed_get_vport_pstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_eth_stats *p_stats, + u16 statistics_bin) 
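+/* The storm statistics readers below share one pattern: resolve the RAM
+ * address and length of the relevant stat struct (fixed BAR0 offsets on a
+ * PF, the acquire-response values on a VF), copy it out with
+ * qed_memcpy_from() and accumulate the hi/lo register pairs into the
+ * matching qed_eth_stats counters.  This first one handles the PSTORM (Tx)
+ * counters; TSTORM (filter discards), USTORM (Rx) and MSTORM (drops/TPA)
+ * follow the same shape.
+ */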
+{ + struct eth_pstorm_per_queue_stat pstats; + u32 pstats_addr = 0, pstats_len = 0; + + __qed_get_vport_pstats_addrlen(p_hwfn, &pstats_addr, &pstats_len, + statistics_bin); + + memset(&pstats, 0, sizeof(pstats)); + qed_memcpy_from(p_hwfn, p_ptt, &pstats, pstats_addr, pstats_len); + + p_stats->common.tx_ucast_bytes += + HILO_64_REGPAIR(pstats.sent_ucast_bytes); + p_stats->common.tx_mcast_bytes += + HILO_64_REGPAIR(pstats.sent_mcast_bytes); + p_stats->common.tx_bcast_bytes += + HILO_64_REGPAIR(pstats.sent_bcast_bytes); + p_stats->common.tx_ucast_pkts += + HILO_64_REGPAIR(pstats.sent_ucast_pkts); + p_stats->common.tx_mcast_pkts += + HILO_64_REGPAIR(pstats.sent_mcast_pkts); + p_stats->common.tx_bcast_pkts += + HILO_64_REGPAIR(pstats.sent_bcast_pkts); + p_stats->common.tx_err_drop_pkts += + HILO_64_REGPAIR(pstats.error_drop_pkts); +} + +static void __qed_get_vport_tstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_eth_stats *p_stats, + u16 statistics_bin) +{ + struct tstorm_per_port_stat tstats; + u32 tstats_addr, tstats_len; + + if (IS_PF(p_hwfn->cdev)) { + tstats_addr = BAR0_MAP_REG_TSDM_RAM + + TSTORM_PORT_STAT_OFFSET(MFW_PORT(p_hwfn)); + tstats_len = sizeof(struct tstorm_per_port_stat); + } else { + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_acquire_resp_tlv *p_resp = &p_iov->acquire_resp; + + tstats_addr = p_resp->pfdev_info.stats_info.tstats.address; + tstats_len = p_resp->pfdev_info.stats_info.tstats.len; + } + + memset(&tstats, 0, sizeof(tstats)); + qed_memcpy_from(p_hwfn, p_ptt, &tstats, tstats_addr, tstats_len); + + p_stats->common.mftag_filter_discards += + HILO_64_REGPAIR(tstats.mftag_filter_discard); + p_stats->common.mac_filter_discards += + HILO_64_REGPAIR(tstats.eth_mac_filter_discard); +} + +static void __qed_get_vport_ustats_addrlen(struct qed_hwfn *p_hwfn, + u32 *p_addr, + u32 *p_len, u16 statistics_bin) +{ + if (IS_PF(p_hwfn->cdev)) { + *p_addr = BAR0_MAP_REG_USDM_RAM + + USTORM_QUEUE_STAT_OFFSET(statistics_bin); + *p_len = sizeof(struct eth_ustorm_per_queue_stat); + } else { + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_acquire_resp_tlv *p_resp = &p_iov->acquire_resp; + + *p_addr = p_resp->pfdev_info.stats_info.ustats.address; + *p_len = p_resp->pfdev_info.stats_info.ustats.len; + } +} + +static void __qed_get_vport_ustats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_eth_stats *p_stats, + u16 statistics_bin) +{ + struct eth_ustorm_per_queue_stat ustats; + u32 ustats_addr = 0, ustats_len = 0; + + __qed_get_vport_ustats_addrlen(p_hwfn, &ustats_addr, &ustats_len, + statistics_bin); + + memset(&ustats, 0, sizeof(ustats)); + qed_memcpy_from(p_hwfn, p_ptt, &ustats, ustats_addr, ustats_len); + + p_stats->common.rx_ucast_bytes += + HILO_64_REGPAIR(ustats.rcv_ucast_bytes); + p_stats->common.rx_mcast_bytes += + HILO_64_REGPAIR(ustats.rcv_mcast_bytes); + p_stats->common.rx_bcast_bytes += + HILO_64_REGPAIR(ustats.rcv_bcast_bytes); + p_stats->common.rx_ucast_pkts += HILO_64_REGPAIR(ustats.rcv_ucast_pkts); + p_stats->common.rx_mcast_pkts += HILO_64_REGPAIR(ustats.rcv_mcast_pkts); + p_stats->common.rx_bcast_pkts += HILO_64_REGPAIR(ustats.rcv_bcast_pkts); +} + +static void __qed_get_vport_mstats_addrlen(struct qed_hwfn *p_hwfn, + u32 *p_addr, + u32 *p_len, u16 statistics_bin) +{ + if (IS_PF(p_hwfn->cdev)) { + *p_addr = BAR0_MAP_REG_MSDM_RAM + + MSTORM_QUEUE_STAT_OFFSET(statistics_bin); + *p_len = sizeof(struct eth_mstorm_per_queue_stat); + } else { + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct 
pfvf_acquire_resp_tlv *p_resp = &p_iov->acquire_resp; + + *p_addr = p_resp->pfdev_info.stats_info.mstats.address; + *p_len = p_resp->pfdev_info.stats_info.mstats.len; + } +} + +static void __qed_get_vport_mstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_eth_stats *p_stats, + u16 statistics_bin) +{ + struct eth_mstorm_per_queue_stat mstats; + u32 mstats_addr = 0, mstats_len = 0; + + __qed_get_vport_mstats_addrlen(p_hwfn, &mstats_addr, &mstats_len, + statistics_bin); + + memset(&mstats, 0, sizeof(mstats)); + qed_memcpy_from(p_hwfn, p_ptt, &mstats, mstats_addr, mstats_len); + + p_stats->common.no_buff_discards += + HILO_64_REGPAIR(mstats.no_buff_discard); + p_stats->common.packet_too_big_discard += + HILO_64_REGPAIR(mstats.packet_too_big_discard); + p_stats->common.ttl0_discard += HILO_64_REGPAIR(mstats.ttl0_discard); + p_stats->common.tpa_coalesced_pkts += + HILO_64_REGPAIR(mstats.tpa_coalesced_pkts); + p_stats->common.tpa_coalesced_events += + HILO_64_REGPAIR(mstats.tpa_coalesced_events); + p_stats->common.tpa_aborts_num += + HILO_64_REGPAIR(mstats.tpa_aborts_num); + p_stats->common.tpa_coalesced_bytes += + HILO_64_REGPAIR(mstats.tpa_coalesced_bytes); +} + +static void __qed_get_vport_port_stats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_eth_stats *p_stats) +{ + struct qed_eth_stats_common *p_common = &p_stats->common; + struct port_stats port_stats; + int j; + + memset(&port_stats, 0, sizeof(port_stats)); + + qed_memcpy_from(p_hwfn, p_ptt, &port_stats, + p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, stats), + sizeof(port_stats)); + + p_common->rx_64_byte_packets += port_stats.eth.r64; + p_common->rx_65_to_127_byte_packets += port_stats.eth.r127; + p_common->rx_128_to_255_byte_packets += port_stats.eth.r255; + p_common->rx_256_to_511_byte_packets += port_stats.eth.r511; + p_common->rx_512_to_1023_byte_packets += port_stats.eth.r1023; + p_common->rx_1024_to_1518_byte_packets += port_stats.eth.r1518; + p_common->rx_crc_errors += port_stats.eth.rfcs; + p_common->rx_mac_crtl_frames += port_stats.eth.rxcf; + p_common->rx_pause_frames += port_stats.eth.rxpf; + p_common->rx_pfc_frames += port_stats.eth.rxpp; + p_common->rx_align_errors += port_stats.eth.raln; + p_common->rx_carrier_errors += port_stats.eth.rfcr; + p_common->rx_oversize_packets += port_stats.eth.rovr; + p_common->rx_jabbers += port_stats.eth.rjbr; + p_common->rx_undersize_packets += port_stats.eth.rund; + p_common->rx_fragments += port_stats.eth.rfrg; + p_common->tx_64_byte_packets += port_stats.eth.t64; + p_common->tx_65_to_127_byte_packets += port_stats.eth.t127; + p_common->tx_128_to_255_byte_packets += port_stats.eth.t255; + p_common->tx_256_to_511_byte_packets += port_stats.eth.t511; + p_common->tx_512_to_1023_byte_packets += port_stats.eth.t1023; + p_common->tx_1024_to_1518_byte_packets += port_stats.eth.t1518; + p_common->tx_pause_frames += port_stats.eth.txpf; + p_common->tx_pfc_frames += port_stats.eth.txpp; + p_common->rx_mac_bytes += port_stats.eth.rbyte; + p_common->rx_mac_uc_packets += port_stats.eth.rxuca; + p_common->rx_mac_mc_packets += port_stats.eth.rxmca; + p_common->rx_mac_bc_packets += port_stats.eth.rxbca; + p_common->rx_mac_frames_ok += port_stats.eth.rxpok; + p_common->tx_mac_bytes += port_stats.eth.tbyte; + p_common->tx_mac_uc_packets += port_stats.eth.txuca; + p_common->tx_mac_mc_packets += port_stats.eth.txmca; + p_common->tx_mac_bc_packets += port_stats.eth.txbca; + p_common->tx_mac_ctrl_frames += port_stats.eth.txcf; + for (j = 0; j < 8; j++) { + 
p_common->brb_truncates += port_stats.brb.brb_truncate[j]; + p_common->brb_discards += port_stats.brb.brb_discard[j]; + } + + if (QED_IS_BB(p_hwfn->cdev)) { + struct qed_eth_stats_bb *p_bb = &p_stats->bb; + + p_bb->rx_1519_to_1522_byte_packets += + port_stats.eth.u0.bb0.r1522; + p_bb->rx_1519_to_2047_byte_packets += + port_stats.eth.u0.bb0.r2047; + p_bb->rx_2048_to_4095_byte_packets += + port_stats.eth.u0.bb0.r4095; + p_bb->rx_4096_to_9216_byte_packets += + port_stats.eth.u0.bb0.r9216; + p_bb->rx_9217_to_16383_byte_packets += + port_stats.eth.u0.bb0.r16383; + p_bb->tx_1519_to_2047_byte_packets += + port_stats.eth.u1.bb1.t2047; + p_bb->tx_2048_to_4095_byte_packets += + port_stats.eth.u1.bb1.t4095; + p_bb->tx_4096_to_9216_byte_packets += + port_stats.eth.u1.bb1.t9216; + p_bb->tx_9217_to_16383_byte_packets += + port_stats.eth.u1.bb1.t16383; + p_bb->tx_lpi_entry_count += port_stats.eth.u2.bb2.tlpiec; + p_bb->tx_total_collisions += port_stats.eth.u2.bb2.tncl; + } else { + struct qed_eth_stats_ah *p_ah = &p_stats->ah; + + p_ah->rx_1519_to_max_byte_packets += + port_stats.eth.u0.ah0.r1519_to_max; + p_ah->tx_1519_to_max_byte_packets = + port_stats.eth.u1.ah1.t1519_to_max; + } +} + +static void __qed_get_vport_stats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_eth_stats *stats, + u16 statistics_bin, bool b_get_port_stats) +{ + __qed_get_vport_mstats(p_hwfn, p_ptt, stats, statistics_bin); + __qed_get_vport_ustats(p_hwfn, p_ptt, stats, statistics_bin); + __qed_get_vport_tstats(p_hwfn, p_ptt, stats, statistics_bin); + __qed_get_vport_pstats(p_hwfn, p_ptt, stats, statistics_bin); + + if (b_get_port_stats && p_hwfn->mcp_info) + __qed_get_vport_port_stats(p_hwfn, p_ptt, stats); +} + +static void _qed_get_vport_stats(struct qed_dev *cdev, + struct qed_eth_stats *stats) +{ + u8 fw_vport = 0; + int i; + + memset(stats, 0, sizeof(*stats)); + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + struct qed_ptt *p_ptt = IS_PF(cdev) ? qed_ptt_acquire(p_hwfn) + : NULL; + + if (IS_PF(cdev)) { + /* The main vport index is relative first */ + if (qed_fw_vport(p_hwfn, 0, &fw_vport)) { + DP_ERR(p_hwfn, "No vport available!\n"); + goto out; + } + } + + if (IS_PF(cdev) && !p_ptt) { + DP_ERR(p_hwfn, "Failed to acquire ptt\n"); + continue; + } + + __qed_get_vport_stats(p_hwfn, p_ptt, stats, fw_vport, + IS_PF(cdev) ? true : false); + +out: + if (IS_PF(cdev) && p_ptt) + qed_ptt_release(p_hwfn, p_ptt); + } +} + +void qed_get_vport_stats(struct qed_dev *cdev, struct qed_eth_stats *stats) +{ + u32 i; + + if (!cdev) { + memset(stats, 0, sizeof(*stats)); + return; + } + + _qed_get_vport_stats(cdev, stats); + + if (!cdev->reset_stats) + return; + + /* Reduce the statistics baseline */ + for (i = 0; i < sizeof(struct qed_eth_stats) / sizeof(u64); i++) + ((u64 *)stats)[i] -= ((u64 *)cdev->reset_stats)[i]; +} + +/* zeroes V-PORT specific portion of stats (Port stats remains untouched) */ +void qed_reset_vport_stats(struct qed_dev *cdev) +{ + int i; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + struct eth_mstorm_per_queue_stat mstats; + struct eth_ustorm_per_queue_stat ustats; + struct eth_pstorm_per_queue_stat pstats; + struct qed_ptt *p_ptt = IS_PF(cdev) ? 
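+	/* The memset/qed_memcpy_to calls below zero the per-queue storm
+	 * counters for this hwfn directly in chip RAM; the MAC/port counters
+	 * cannot be cleared that way, so a fresh snapshot is kept in
+	 * cdev->reset_stats instead and qed_get_vport_stats() above
+	 * subtracts it, treating struct qed_eth_stats as a flat array of
+	 * u64 counters.
+	 */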
qed_ptt_acquire(p_hwfn) + : NULL; + u32 addr = 0, len = 0; + + if (IS_PF(cdev) && !p_ptt) { + DP_ERR(p_hwfn, "Failed to acquire ptt\n"); + continue; + } + + memset(&mstats, 0, sizeof(mstats)); + __qed_get_vport_mstats_addrlen(p_hwfn, &addr, &len, 0); + qed_memcpy_to(p_hwfn, p_ptt, addr, &mstats, len); + + memset(&ustats, 0, sizeof(ustats)); + __qed_get_vport_ustats_addrlen(p_hwfn, &addr, &len, 0); + qed_memcpy_to(p_hwfn, p_ptt, addr, &ustats, len); + + memset(&pstats, 0, sizeof(pstats)); + __qed_get_vport_pstats_addrlen(p_hwfn, &addr, &len, 0); + qed_memcpy_to(p_hwfn, p_ptt, addr, &pstats, len); + + if (IS_PF(cdev)) + qed_ptt_release(p_hwfn, p_ptt); + } + + /* PORT statistics are not necessarily reset, so we need to + * read and create a baseline for future statistics. + */ + if (!cdev->reset_stats) + DP_INFO(cdev, "Reset stats not allocated\n"); + else + _qed_get_vport_stats(cdev, cdev->reset_stats); +} + +static enum gft_profile_type +qed_arfs_mode_to_hsi(enum qed_filter_config_mode mode) +{ + if (mode == QED_FILTER_CONFIG_MODE_5_TUPLE) + return GFT_PROFILE_TYPE_4_TUPLE; + if (mode == QED_FILTER_CONFIG_MODE_IP_DEST) + return GFT_PROFILE_TYPE_IP_DST_ADDR; + return GFT_PROFILE_TYPE_L4_DST_PORT; +} + +void qed_arfs_mode_configure(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_arfs_config_params *p_cfg_params) +{ + if (p_cfg_params->mode != QED_FILTER_CONFIG_MODE_DISABLE) { + qed_gft_config(p_hwfn, p_ptt, p_hwfn->rel_pf_id, + p_cfg_params->tcp, + p_cfg_params->udp, + p_cfg_params->ipv4, + p_cfg_params->ipv6, + qed_arfs_mode_to_hsi(p_cfg_params->mode)); + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "Configured Filtering: tcp = %s, udp = %s, ipv4 = %s, ipv6 =%s mode=%08x\n", + p_cfg_params->tcp ? "Enable" : "Disable", + p_cfg_params->udp ? "Enable" : "Disable", + p_cfg_params->ipv4 ? "Enable" : "Disable", + p_cfg_params->ipv6 ? "Enable" : "Disable", + (u32)p_cfg_params->mode); + } else { + DP_VERBOSE(p_hwfn, QED_MSG_SP, "Disabled Filtering\n"); + qed_gft_disable(p_hwfn, p_ptt, p_hwfn->rel_pf_id); + } +} + +int +qed_configure_rfs_ntuple_filter(struct qed_hwfn *p_hwfn, + struct qed_spq_comp_cb *p_cb, + struct qed_ntuple_filter_params *p_params) +{ + struct rx_update_gft_filter_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + u16 abs_rx_q_id = 0; + u8 abs_vport_id = 0; + int rc = -EINVAL; + + rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id); + if (rc) + return rc; + + if (p_params->qid != QED_RFS_NTUPLE_QID_RSS) { + rc = qed_fw_l2_queue(p_hwfn, p_params->qid, &abs_rx_q_id); + if (rc) + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + + if (p_cb) { + init_data.comp_mode = QED_SPQ_MODE_CB; + init_data.p_comp_data = p_cb; + } else { + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + } + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_GFT_UPDATE_FILTER, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.rx_update_gft; + + DMA_REGPAIR_LE(p_ramrod->pkt_hdr_addr, p_params->addr); + p_ramrod->pkt_hdr_length = cpu_to_le16(p_params->length); + + if (p_params->qid != QED_RFS_NTUPLE_QID_RSS) { + p_ramrod->rx_qid_valid = 1; + p_ramrod->rx_qid = cpu_to_le16(abs_rx_q_id); + } + + p_ramrod->flow_id_valid = 0; + p_ramrod->flow_id = 0; + + p_ramrod->vport_id = cpu_to_le16((u16)abs_vport_id); + p_ramrod->filter_action = p_params->b_is_add ? 
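+	/* Hedged usage sketch (the calling context is hypothetical; field
+	 * names are those of struct qed_ntuple_filter_params): an aRFS flow
+	 * is added by passing the DMA address and length of a buffer that
+	 * holds the flow's packet header, the vport and either a specific
+	 * rx queue or QED_RFS_NTUPLE_QID_RSS to let RSS spread the traffic:
+	 *
+	 *	struct qed_ntuple_filter_params params = {
+	 *		.addr = hdr_dma, .length = hdr_len, .vport_id = 0,
+	 *		.qid = rxq_id, .b_is_add = true,
+	 *	};
+	 *	rc = qed_configure_rfs_ntuple_filter(p_hwfn, NULL, &params);
+	 *
+	 * Passing NULL for the completion callback selects EBLOCK mode, as
+	 * handled above.
+	 */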
GFT_ADD_FILTER + : GFT_DELETE_FILTER; + + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "V[%0x], Q[%04x] - %s filter from 0x%llx [length %04xb]\n", + abs_vport_id, abs_rx_q_id, + p_params->b_is_add ? "Adding" : "Removing", + (u64)p_params->addr, p_params->length); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +int qed_get_rxq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_queue_cid *p_cid, u16 *p_rx_coal) +{ + u32 coalesce, address, is_valid; + struct cau_sb_entry sb_entry; + u8 timer_res; + int rc; + + rc = qed_dmae_grc2host(p_hwfn, p_ptt, CAU_REG_SB_VAR_MEMORY + + p_cid->sb_igu_id * sizeof(u64), + (u64)(uintptr_t)&sb_entry, 2, 0); + if (rc) { + DP_ERR(p_hwfn, "dmae_grc2host failed %d\n", rc); + return rc; + } + + timer_res = GET_FIELD(sb_entry.params, CAU_SB_ENTRY_TIMER_RES0); + + address = BAR0_MAP_REG_USDM_RAM + + USTORM_ETH_QUEUE_ZONE_OFFSET(p_cid->abs.queue_id); + coalesce = qed_rd(p_hwfn, p_ptt, address); + + is_valid = GET_FIELD(coalesce, COALESCING_TIMESET_VALID); + if (!is_valid) + return -EINVAL; + + coalesce = GET_FIELD(coalesce, COALESCING_TIMESET_TIMESET); + *p_rx_coal = (u16)(coalesce << timer_res); + + return 0; +} + +int qed_get_txq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_queue_cid *p_cid, u16 *p_tx_coal) +{ + u32 coalesce, address, is_valid; + struct cau_sb_entry sb_entry; + u8 timer_res; + int rc; + + rc = qed_dmae_grc2host(p_hwfn, p_ptt, CAU_REG_SB_VAR_MEMORY + + p_cid->sb_igu_id * sizeof(u64), + (u64)(uintptr_t)&sb_entry, 2, 0); + if (rc) { + DP_ERR(p_hwfn, "dmae_grc2host failed %d\n", rc); + return rc; + } + + timer_res = GET_FIELD(sb_entry.params, CAU_SB_ENTRY_TIMER_RES1); + + address = BAR0_MAP_REG_XSDM_RAM + + XSTORM_ETH_QUEUE_ZONE_OFFSET(p_cid->abs.queue_id); + coalesce = qed_rd(p_hwfn, p_ptt, address); + + is_valid = GET_FIELD(coalesce, COALESCING_TIMESET_VALID); + if (!is_valid) + return -EINVAL; + + coalesce = GET_FIELD(coalesce, COALESCING_TIMESET_TIMESET); + *p_tx_coal = (u16)(coalesce << timer_res); + + return 0; +} + +int qed_get_queue_coalesce(struct qed_hwfn *p_hwfn, u16 *p_coal, void *handle) +{ + struct qed_queue_cid *p_cid = handle; + struct qed_ptt *p_ptt; + int rc = 0; + + if (IS_VF(p_hwfn->cdev)) { + rc = qed_vf_pf_get_coalesce(p_hwfn, p_coal, p_cid); + if (rc) + DP_NOTICE(p_hwfn, "Unable to read queue coalescing\n"); + + return rc; + } + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EAGAIN; + + if (p_cid->b_is_rx) { + rc = qed_get_rxq_coalesce(p_hwfn, p_ptt, p_cid, p_coal); + if (rc) + goto out; + } else { + rc = qed_get_txq_coalesce(p_hwfn, p_ptt, p_cid, p_coal); + if (rc) + goto out; + } + +out: + qed_ptt_release(p_hwfn, p_ptt); + + return rc; +} + +static int qed_fill_eth_dev_info(struct qed_dev *cdev, + struct qed_dev_eth_info *info) +{ + int i; + + memset(info, 0, sizeof(*info)); + + info->num_tc = 1; + + if (IS_PF(cdev)) { + int max_vf_vlan_filters = 0; + int max_vf_mac_filters = 0; + + if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) { + u16 num_queues = 0; + + /* Since the feature controls only queue-zones, + * make sure we have the contexts [rx, tx, xdp] to + * match. + */ + for_each_hwfn(cdev, i) { + struct qed_hwfn *hwfn = &cdev->hwfns[i]; + u16 l2_queues = (u16)FEAT_NUM(hwfn, + QED_PF_L2_QUE); + u16 cids; + + cids = hwfn->pf_params.eth_pf_params.num_cons; + num_queues += min_t(u16, l2_queues, cids / 3); + } + + /* queues might theoretically be >256, but interrupts' + * upper-limit guarantes that it would fit in a u8. 
+ */ + if (cdev->int_params.fp_msix_cnt) { + u8 irqs = cdev->int_params.fp_msix_cnt; + + info->num_queues = (u8)min_t(u16, + num_queues, irqs); + } + } else { + info->num_queues = cdev->num_hwfns; + } + + if (IS_QED_SRIOV(cdev)) { + max_vf_vlan_filters = cdev->p_iov_info->total_vfs * + QED_ETH_VF_NUM_VLAN_FILTERS; + max_vf_mac_filters = cdev->p_iov_info->total_vfs * + QED_ETH_VF_NUM_MAC_FILTERS; + } + info->num_vlan_filters = RESC_NUM(QED_LEADING_HWFN(cdev), + QED_VLAN) - + max_vf_vlan_filters; + info->num_mac_filters = RESC_NUM(QED_LEADING_HWFN(cdev), + QED_MAC) - + max_vf_mac_filters; + + ether_addr_copy(info->port_mac, + cdev->hwfns[0].hw_info.hw_mac_addr); + + info->xdp_supported = true; + } else { + u16 total_cids = 0; + + /* Determine queues & XDP support */ + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + u8 queues, cids; + + qed_vf_get_num_cids(p_hwfn, &cids); + qed_vf_get_num_rxqs(p_hwfn, &queues); + info->num_queues += queues; + total_cids += cids; + } + + /* Enable VF XDP in case PF guarntees sufficient connections */ + if (total_cids >= info->num_queues * 3) + info->xdp_supported = true; + + qed_vf_get_num_vlan_filters(&cdev->hwfns[0], + (u8 *)&info->num_vlan_filters); + qed_vf_get_num_mac_filters(&cdev->hwfns[0], + (u8 *)&info->num_mac_filters); + qed_vf_get_port_mac(&cdev->hwfns[0], info->port_mac); + + info->is_legacy = !!cdev->hwfns[0].vf_iov_info->b_pre_fp_hsi; + } + + qed_fill_dev_info(cdev, &info->common); + + if (IS_VF(cdev)) + eth_zero_addr(info->common.hw_mac); + + return 0; +} + +static void qed_register_eth_ops(struct qed_dev *cdev, + struct qed_eth_cb_ops *ops, void *cookie) +{ + cdev->protocol_ops.eth = ops; + cdev->ops_cookie = cookie; + + /* For VF, we start bulletin reading */ + if (IS_VF(cdev)) + qed_vf_start_iov_wq(cdev); +} + +static bool qed_check_mac(struct qed_dev *cdev, u8 *mac) +{ + if (IS_PF(cdev)) + return true; + + return qed_vf_check_mac(&cdev->hwfns[0], mac); +} + +static int qed_start_vport(struct qed_dev *cdev, + struct qed_start_vport_params *params) +{ + int rc, i; + + for_each_hwfn(cdev, i) { + struct qed_sp_vport_start_params start = { 0 }; + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + start.tpa_mode = params->gro_enable ? 
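+		/* GRO support is expressed to the firmware as a TPA
+		 * (aggregation) mode; the remaining fields are copied from
+		 * qed_start_vport_params or taken from the hwfn, with
+		 * untagged-only reception and 16 buffers per CQE used as
+		 * fixed defaults here.
+		 */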
QED_TPA_MODE_GRO : + QED_TPA_MODE_NONE; + start.remove_inner_vlan = params->remove_inner_vlan; + start.only_untagged = true; /* untagged only */ + start.drop_ttl0 = params->drop_ttl0; + start.opaque_fid = p_hwfn->hw_info.opaque_fid; + start.concrete_fid = p_hwfn->hw_info.concrete_fid; + start.handle_ptp_pkts = params->handle_ptp_pkts; + start.vport_id = params->vport_id; + start.max_buffers_per_cqe = 16; + start.mtu = params->mtu; + + rc = qed_sp_vport_start(p_hwfn, &start); + if (rc) { + DP_ERR(cdev, "Failed to start VPORT\n"); + return rc; + } + + rc = qed_hw_start_fastpath(p_hwfn); + if (rc) { + DP_ERR(cdev, "Failed to start VPORT fastpath\n"); + return rc; + } + + DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP), + "Started V-PORT %d with MTU %d\n", + start.vport_id, start.mtu); + } + + if (params->clear_stats) + qed_reset_vport_stats(cdev); + + return 0; +} + +static int qed_stop_vport(struct qed_dev *cdev, u8 vport_id) +{ + int rc, i; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + rc = qed_sp_vport_stop(p_hwfn, + p_hwfn->hw_info.opaque_fid, vport_id); + + if (rc) { + DP_ERR(cdev, "Failed to stop VPORT\n"); + return rc; + } + } + return 0; +} + +static int qed_update_vport_rss(struct qed_dev *cdev, + struct qed_update_vport_rss_params *input, + struct qed_rss_params *rss) +{ + int i, fn; + + /* Update configuration with what's correct regardless of CMT */ + rss->update_rss_config = 1; + rss->rss_enable = 1; + rss->update_rss_capabilities = 1; + rss->update_rss_ind_table = 1; + rss->update_rss_key = 1; + rss->rss_caps = input->rss_caps; + memcpy(rss->rss_key, input->rss_key, QED_RSS_KEY_SIZE * sizeof(u32)); + + /* In regular scenario, we'd simply need to take input handlers. + * But in CMT, we'd have to split the handlers according to the + * engine they were configured on. We'd then have to understand + * whether RSS is really required, since 2-queues on CMT doesn't + * require RSS. 
+ */ + if (cdev->num_hwfns == 1) { + memcpy(rss->rss_ind_table, + input->rss_ind_table, + QED_RSS_IND_TABLE_SIZE * sizeof(void *)); + rss->rss_table_size_log = 7; + return 0; + } + + /* Start by copying the non-spcific information to the 2nd copy */ + memcpy(&rss[1], &rss[0], sizeof(struct qed_rss_params)); + + /* CMT should be round-robin */ + for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) { + struct qed_queue_cid *cid = input->rss_ind_table[i]; + struct qed_rss_params *t_rss; + + if (cid->p_owner == QED_LEADING_HWFN(cdev)) + t_rss = &rss[0]; + else + t_rss = &rss[1]; + + t_rss->rss_ind_table[i / cdev->num_hwfns] = cid; + } + + /* Make sure RSS is actually required */ + for_each_hwfn(cdev, fn) { + for (i = 1; i < QED_RSS_IND_TABLE_SIZE / cdev->num_hwfns; i++) { + if (rss[fn].rss_ind_table[i] != + rss[fn].rss_ind_table[0]) + break; + } + if (i == QED_RSS_IND_TABLE_SIZE / cdev->num_hwfns) { + DP_VERBOSE(cdev, NETIF_MSG_IFUP, + "CMT - 1 queue per-hwfn; Disabling RSS\n"); + return -EINVAL; + } + rss[fn].rss_table_size_log = 6; + } + + return 0; +} + +static int qed_update_vport(struct qed_dev *cdev, + struct qed_update_vport_params *params) +{ + struct qed_sp_vport_update_params sp_params; + struct qed_rss_params *rss; + int rc = 0, i; + + if (!cdev) + return -ENODEV; + + rss = vzalloc(sizeof(*rss) * cdev->num_hwfns); + if (!rss) + return -ENOMEM; + + memset(&sp_params, 0, sizeof(sp_params)); + + /* Translate protocol params into sp params */ + sp_params.vport_id = params->vport_id; + sp_params.update_vport_active_rx_flg = params->update_vport_active_flg; + sp_params.update_vport_active_tx_flg = params->update_vport_active_flg; + sp_params.vport_active_rx_flg = params->vport_active_flg; + sp_params.vport_active_tx_flg = params->vport_active_flg; + sp_params.update_tx_switching_flg = params->update_tx_switching_flg; + sp_params.tx_switching_flg = params->tx_switching_flg; + sp_params.accept_any_vlan = params->accept_any_vlan; + sp_params.update_accept_any_vlan_flg = + params->update_accept_any_vlan_flg; + + /* Prepare the RSS configuration */ + if (params->update_rss_flg) + if (qed_update_vport_rss(cdev, ¶ms->rss_params, rss)) + params->update_rss_flg = 0; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + if (params->update_rss_flg) + sp_params.rss_params = &rss[i]; + + sp_params.opaque_fid = p_hwfn->hw_info.opaque_fid; + rc = qed_sp_vport_update(p_hwfn, &sp_params, + QED_SPQ_MODE_EBLOCK, + NULL); + if (rc) { + DP_ERR(cdev, "Failed to update VPORT\n"); + goto out; + } + + DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP), + "Updated V-PORT %d: active_flag %d [update %d]\n", + params->vport_id, params->vport_active_flg, + params->update_vport_active_flg); + } + +out: + vfree(rss); + return rc; +} + +static int qed_start_rxq(struct qed_dev *cdev, + u8 rss_num, + struct qed_queue_start_common_params *p_params, + u16 bd_max_bytes, + dma_addr_t bd_chain_phys_addr, + dma_addr_t cqe_pbl_addr, + u16 cqe_pbl_size, + struct qed_rxq_start_ret_params *ret_params) +{ + struct qed_hwfn *p_hwfn; + int rc, hwfn_index; + + hwfn_index = rss_num % cdev->num_hwfns; + p_hwfn = &cdev->hwfns[hwfn_index]; + + p_params->queue_id = p_params->queue_id / cdev->num_hwfns; + p_params->stats_id = p_params->vport_id; + + rc = qed_eth_rx_queue_start(p_hwfn, + p_hwfn->hw_info.opaque_fid, + p_params, + bd_max_bytes, + bd_chain_phys_addr, + cqe_pbl_addr, cqe_pbl_size, ret_params); + if (rc) { + DP_ERR(cdev, "Failed to start RXQ#%d\n", p_params->queue_id); + return rc; + } + + DP_VERBOSE(cdev, (QED_MSG_SPQ 
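+	/* Queue placement note: on a two-engine (CMT) device the L2 queues
+	 * are spread round-robin over the hwfns, so the owning hwfn is
+	 * chosen as rss_num % num_hwfns and the global queue id is divided
+	 * down to an engine-local one - e.g. with two hwfns (and rss_num
+	 * equal to the queue id), global queue 5 would land on hwfn 1 as
+	 * local queue 2.  The same split is the reason the RSS indirection
+	 * table is divided per engine in qed_update_vport_rss() above.
+	 */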
| NETIF_MSG_IFUP), + "Started RX-Q %d [rss_num %d] on V-PORT %d and SB igu %d\n", + p_params->queue_id, rss_num, p_params->vport_id, + p_params->p_sb->igu_sb_id); + + return 0; +} + +static int qed_stop_rxq(struct qed_dev *cdev, u8 rss_id, void *handle) +{ + int rc, hwfn_index; + struct qed_hwfn *p_hwfn; + + hwfn_index = rss_id % cdev->num_hwfns; + p_hwfn = &cdev->hwfns[hwfn_index]; + + rc = qed_eth_rx_queue_stop(p_hwfn, handle, false, false); + if (rc) { + DP_ERR(cdev, "Failed to stop RXQ#%02x\n", rss_id); + return rc; + } + + return 0; +} + +static int qed_start_txq(struct qed_dev *cdev, + u8 rss_num, + struct qed_queue_start_common_params *p_params, + dma_addr_t pbl_addr, + u16 pbl_size, + struct qed_txq_start_ret_params *ret_params) +{ + struct qed_hwfn *p_hwfn; + int rc, hwfn_index; + + hwfn_index = rss_num % cdev->num_hwfns; + p_hwfn = &cdev->hwfns[hwfn_index]; + p_params->queue_id = p_params->queue_id / cdev->num_hwfns; + p_params->stats_id = p_params->vport_id; + + rc = qed_eth_tx_queue_start(p_hwfn, + p_hwfn->hw_info.opaque_fid, + p_params, 0, + pbl_addr, pbl_size, ret_params); + + if (rc) { + DP_ERR(cdev, "Failed to start TXQ#%d\n", p_params->queue_id); + return rc; + } + + DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP), + "Started TX-Q %d [rss_num %d] on V-PORT %d and SB igu %d\n", + p_params->queue_id, rss_num, p_params->vport_id, + p_params->p_sb->igu_sb_id); + + return 0; +} + +#define QED_HW_STOP_RETRY_LIMIT (10) +static int qed_fastpath_stop(struct qed_dev *cdev) +{ + int rc; + + rc = qed_hw_stop_fastpath(cdev); + if (rc) { + DP_ERR(cdev, "Failed to stop Fastpath\n"); + return rc; + } + + return 0; +} + +static int qed_stop_txq(struct qed_dev *cdev, u8 rss_id, void *handle) +{ + struct qed_hwfn *p_hwfn; + int rc, hwfn_index; + + hwfn_index = rss_id % cdev->num_hwfns; + p_hwfn = &cdev->hwfns[hwfn_index]; + + rc = qed_eth_tx_queue_stop(p_hwfn, handle); + if (rc) { + DP_ERR(cdev, "Failed to stop TXQ#%02x\n", rss_id); + return rc; + } + + return 0; +} + +static int qed_tunn_configure(struct qed_dev *cdev, + struct qed_tunn_params *tunn_params) +{ + struct qed_tunnel_info tunn_info; + int i, rc; + + memset(&tunn_info, 0, sizeof(tunn_info)); + if (tunn_params->update_vxlan_port) { + tunn_info.vxlan_port.b_update_port = true; + tunn_info.vxlan_port.port = tunn_params->vxlan_port; + } + + if (tunn_params->update_geneve_port) { + tunn_info.geneve_port.b_update_port = true; + tunn_info.geneve_port.port = tunn_params->geneve_port; + } + + for_each_hwfn(cdev, i) { + struct qed_hwfn *hwfn = &cdev->hwfns[i]; + struct qed_ptt *p_ptt; + struct qed_tunnel_info *tun; + + tun = &hwfn->cdev->tunnel; + if (IS_PF(cdev)) { + p_ptt = qed_ptt_acquire(hwfn); + if (!p_ptt) + return -EAGAIN; + } else { + p_ptt = NULL; + } + + rc = qed_sp_pf_update_tunn_cfg(hwfn, p_ptt, &tunn_info, + QED_SPQ_MODE_EBLOCK, NULL); + if (rc) { + if (IS_PF(cdev)) + qed_ptt_release(hwfn, p_ptt); + return rc; + } + + if (IS_PF_SRIOV(hwfn)) { + u16 vxlan_port, geneve_port; + int j; + + vxlan_port = tun->vxlan_port.port; + geneve_port = tun->geneve_port.port; + + qed_for_each_vf(hwfn, j) { + qed_iov_bulletin_set_udp_ports(hwfn, j, + vxlan_port, + geneve_port); + } + + qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG); + } + if (IS_PF(cdev)) + qed_ptt_release(hwfn, p_ptt); + } + + return 0; +} + +static int qed_configure_filter_rx_mode(struct qed_dev *cdev, + enum qed_filter_rx_mode_type type) +{ + struct qed_filter_accept_flags accept_flags; + + memset(&accept_flags, 0, sizeof(accept_flags)); + + 
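+	/* The rx-mode types map onto accept-filter bits as set up below:
+	 * regular mode accepts matched unicast, matched multicast and
+	 * broadcast on both rx and tx; multi-promisc additionally accepts
+	 * unmatched multicast; full promisc additionally accepts unmatched
+	 * unicast and multicast on rx and unmatched multicast on tx.
+	 */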
accept_flags.update_rx_mode_config = 1; + accept_flags.update_tx_mode_config = 1; + accept_flags.rx_accept_filter = QED_ACCEPT_UCAST_MATCHED | + QED_ACCEPT_MCAST_MATCHED | + QED_ACCEPT_BCAST; + accept_flags.tx_accept_filter = QED_ACCEPT_UCAST_MATCHED | + QED_ACCEPT_MCAST_MATCHED | + QED_ACCEPT_BCAST; + + if (type == QED_FILTER_RX_MODE_TYPE_PROMISC) { + accept_flags.rx_accept_filter |= QED_ACCEPT_UCAST_UNMATCHED | + QED_ACCEPT_MCAST_UNMATCHED; + accept_flags.tx_accept_filter |= QED_ACCEPT_MCAST_UNMATCHED; + } else if (type == QED_FILTER_RX_MODE_TYPE_MULTI_PROMISC) { + accept_flags.rx_accept_filter |= QED_ACCEPT_MCAST_UNMATCHED; + accept_flags.tx_accept_filter |= QED_ACCEPT_MCAST_UNMATCHED; + } + + return qed_filter_accept_cmd(cdev, 0, accept_flags, false, false, + QED_SPQ_MODE_CB, NULL); +} + +static int qed_configure_filter_ucast(struct qed_dev *cdev, + struct qed_filter_ucast_params *params) +{ + struct qed_filter_ucast ucast; + + if (!params->vlan_valid && !params->mac_valid) { + DP_NOTICE(cdev, + "Tried configuring a unicast filter, but both MAC and VLAN are not set\n"); + return -EINVAL; + } + + memset(&ucast, 0, sizeof(ucast)); + switch (params->type) { + case QED_FILTER_XCAST_TYPE_ADD: + ucast.opcode = QED_FILTER_ADD; + break; + case QED_FILTER_XCAST_TYPE_DEL: + ucast.opcode = QED_FILTER_REMOVE; + break; + case QED_FILTER_XCAST_TYPE_REPLACE: + ucast.opcode = QED_FILTER_REPLACE; + break; + default: + DP_NOTICE(cdev, "Unknown unicast filter type %d\n", + params->type); + } + + if (params->vlan_valid && params->mac_valid) { + ucast.type = QED_FILTER_MAC_VLAN; + ether_addr_copy(ucast.mac, params->mac); + ucast.vlan = params->vlan; + } else if (params->mac_valid) { + ucast.type = QED_FILTER_MAC; + ether_addr_copy(ucast.mac, params->mac); + } else { + ucast.type = QED_FILTER_VLAN; + ucast.vlan = params->vlan; + } + + ucast.is_rx_filter = true; + ucast.is_tx_filter = true; + + return qed_filter_ucast_cmd(cdev, &ucast, QED_SPQ_MODE_CB, NULL); +} + +static int qed_configure_filter_mcast(struct qed_dev *cdev, + struct qed_filter_mcast_params *params) +{ + struct qed_filter_mcast mcast; + int i; + + memset(&mcast, 0, sizeof(mcast)); + switch (params->type) { + case QED_FILTER_XCAST_TYPE_ADD: + mcast.opcode = QED_FILTER_ADD; + break; + case QED_FILTER_XCAST_TYPE_DEL: + mcast.opcode = QED_FILTER_REMOVE; + break; + default: + DP_NOTICE(cdev, "Unknown multicast filter type %d\n", + params->type); + } + + mcast.num_mc_addrs = params->num; + for (i = 0; i < mcast.num_mc_addrs; i++) + ether_addr_copy(mcast.mac[i], params->mac[i]); + + return qed_filter_mcast_cmd(cdev, &mcast, QED_SPQ_MODE_CB, NULL); +} + +static int qed_configure_filter(struct qed_dev *cdev, + struct qed_filter_params *params) +{ + enum qed_filter_rx_mode_type accept_flags; + + switch (params->type) { + case QED_FILTER_TYPE_UCAST: + return qed_configure_filter_ucast(cdev, ¶ms->filter.ucast); + case QED_FILTER_TYPE_MCAST: + return qed_configure_filter_mcast(cdev, ¶ms->filter.mcast); + case QED_FILTER_TYPE_RX_MODE: + accept_flags = params->filter.accept_flags; + return qed_configure_filter_rx_mode(cdev, accept_flags); + default: + DP_NOTICE(cdev, "Unknown filter type %d\n", (int)params->type); + return -EINVAL; + } +} + +static int qed_configure_arfs_searcher(struct qed_dev *cdev, + enum qed_filter_config_mode mode) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_arfs_config_params arfs_config_params; + + memset(&arfs_config_params, 0, sizeof(arfs_config_params)); + arfs_config_params.tcp = true; + 
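+	/* All protocol flags are enabled unconditionally; the mode alone
+	 * selects the GFT profile (5-tuple, destination-IP or L4
+	 * destination-port matching, see qed_arfs_mode_to_hsi()), while
+	 * QED_FILTER_CONFIG_MODE_DISABLE tears the searcher down instead.
+	 */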
arfs_config_params.udp = true; + arfs_config_params.ipv4 = true; + arfs_config_params.ipv6 = true; + arfs_config_params.mode = mode; + qed_arfs_mode_configure(p_hwfn, p_hwfn->p_arfs_ptt, + &arfs_config_params); + return 0; +} + +static void +qed_arfs_sp_response_handler(struct qed_hwfn *p_hwfn, + void *cookie, + union event_ring_data *data, u8 fw_return_code) +{ + struct qed_common_cb_ops *op = p_hwfn->cdev->protocol_ops.common; + void *dev = p_hwfn->cdev->ops_cookie; + + op->arfs_filter_op(dev, cookie, fw_return_code); +} + +static int +qed_ntuple_arfs_filter_config(struct qed_dev *cdev, + void *cookie, + struct qed_ntuple_filter_params *params) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_spq_comp_cb cb; + int rc = -EINVAL; + + cb.function = qed_arfs_sp_response_handler; + cb.cookie = cookie; + + if (params->b_is_vf) { + if (!qed_iov_is_valid_vfid(p_hwfn, params->vf_id, false, + false)) { + DP_INFO(p_hwfn, "vfid 0x%02x is out of bounds\n", + params->vf_id); + return rc; + } + + params->vport_id = params->vf_id + 1; + params->qid = QED_RFS_NTUPLE_QID_RSS; + } + + rc = qed_configure_rfs_ntuple_filter(p_hwfn, &cb, params); + if (rc) + DP_NOTICE(p_hwfn, + "Failed to issue a-RFS filter configuration\n"); + else + DP_VERBOSE(p_hwfn, NETIF_MSG_DRV, + "Successfully issued a-RFS filter configuration\n"); + + return rc; +} + +static int qed_get_coalesce(struct qed_dev *cdev, u16 *coal, void *handle) +{ + struct qed_queue_cid *p_cid = handle; + struct qed_hwfn *p_hwfn; + int rc; + + p_hwfn = p_cid->p_owner; + rc = qed_get_queue_coalesce(p_hwfn, coal, handle); + if (rc) + DP_NOTICE(p_hwfn, "Unable to read queue coalescing\n"); + + return rc; +} + +static int qed_fp_cqe_completion(struct qed_dev *dev, + u8 rss_id, struct eth_slow_path_rx_cqe *cqe) +{ + return qed_eth_cqe_completion(&dev->hwfns[rss_id % dev->num_hwfns], + cqe); +} + +#ifdef CONFIG_QED_SRIOV +extern const struct qed_iov_hv_ops qed_iov_ops_pass; +#endif + +#ifdef CONFIG_DCB +extern const struct qed_eth_dcbnl_ops qed_dcbnl_ops_pass; +#endif + +extern const struct qed_eth_ptp_ops qed_ptp_ops_pass; + +static const struct qed_eth_ops qed_eth_ops_pass = { + .common = &qed_common_ops_pass, +#ifdef CONFIG_QED_SRIOV + .iov = &qed_iov_ops_pass, +#endif +#ifdef CONFIG_DCB + .dcb = &qed_dcbnl_ops_pass, +#endif + .ptp = &qed_ptp_ops_pass, + .fill_dev_info = &qed_fill_eth_dev_info, + .register_ops = &qed_register_eth_ops, + .check_mac = &qed_check_mac, + .vport_start = &qed_start_vport, + .vport_stop = &qed_stop_vport, + .vport_update = &qed_update_vport, + .q_rx_start = &qed_start_rxq, + .q_rx_stop = &qed_stop_rxq, + .q_tx_start = &qed_start_txq, + .q_tx_stop = &qed_stop_txq, + .filter_config = &qed_configure_filter, + .fastpath_stop = &qed_fastpath_stop, + .eth_cqe_completion = &qed_fp_cqe_completion, + .get_vport_stats = &qed_get_vport_stats, + .tunn_config = &qed_tunn_configure, + .ntuple_filter_config = &qed_ntuple_arfs_filter_config, + .configure_arfs_searcher = &qed_configure_arfs_searcher, + .get_coalesce = &qed_get_coalesce, +}; + +const struct qed_eth_ops *qed_get_eth_ops(void) +{ + return &qed_eth_ops_pass; +} +EXPORT_SYMBOL(qed_get_eth_ops); + +void qed_put_eth_ops(void) +{ + /* TODO - reference count for module? 
*/ +} +EXPORT_SYMBOL(qed_put_eth_ops); diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h new file mode 100644 index 0000000..c4030e9 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h @@ -0,0 +1,450 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _QED_L2_H +#define _QED_L2_H +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_hw.h" +#include "qed_sp.h" +struct qed_rss_params { + u8 update_rss_config; + u8 rss_enable; + u8 rss_eng_id; + u8 update_rss_capabilities; + u8 update_rss_ind_table; + u8 update_rss_key; + u8 rss_caps; + u8 rss_table_size_log; + + /* Indirection table consist of rx queue handles */ + void *rss_ind_table[QED_RSS_IND_TABLE_SIZE]; + u32 rss_key[QED_RSS_KEY_SIZE]; +}; + +struct qed_sge_tpa_params { + u8 max_buffers_per_cqe; + + u8 update_tpa_en_flg; + u8 tpa_ipv4_en_flg; + u8 tpa_ipv6_en_flg; + u8 tpa_ipv4_tunn_en_flg; + u8 tpa_ipv6_tunn_en_flg; + + u8 update_tpa_param_flg; + u8 tpa_pkt_split_flg; + u8 tpa_hdr_data_split_flg; + u8 tpa_gro_consistent_flg; + u8 tpa_max_aggs_num; + u16 tpa_max_size; + u16 tpa_min_size_to_start; + u16 tpa_min_size_to_cont; +}; + +enum qed_filter_opcode { + QED_FILTER_ADD, + QED_FILTER_REMOVE, + QED_FILTER_MOVE, + QED_FILTER_REPLACE, /* Delete all MACs and add new one instead */ + QED_FILTER_FLUSH, /* Removes all filters */ +}; + +enum qed_filter_ucast_type { + QED_FILTER_MAC, + QED_FILTER_VLAN, + QED_FILTER_MAC_VLAN, + QED_FILTER_INNER_MAC, + QED_FILTER_INNER_VLAN, + QED_FILTER_INNER_PAIR, + QED_FILTER_INNER_MAC_VNI_PAIR, + QED_FILTER_MAC_VNI_PAIR, + QED_FILTER_VNI, +}; + +struct qed_filter_ucast { + enum qed_filter_opcode opcode; + enum qed_filter_ucast_type type; + u8 is_rx_filter; + u8 is_tx_filter; + u8 vport_to_add_to; + u8 vport_to_remove_from; + unsigned char mac[ETH_ALEN]; + u8 assert_on_error; + u16 vlan; + u32 vni; +}; + +struct qed_filter_mcast { + /* MOVE is not supported for multicast */ + enum qed_filter_opcode opcode; + u8 vport_to_add_to; + u8 vport_to_remove_from; + u8 num_mc_addrs; +#define QED_MAX_MC_ADDRS 64 + unsigned char mac[QED_MAX_MC_ADDRS][ETH_ALEN]; +}; + +/** + * @brief 
qed_eth_rx_queue_stop - This ramrod closes an Rx queue + * + * @param p_hwfn + * @param p_rxq Handler of queue to close + * @param eq_completion_only If True completion will be on + * EQe, if False completion will be + * on EQe if p_hwfn opaque + * different from the RXQ opaque + * otherwise on CQe. + * @param cqe_completion If True completion will be + * receive on CQe. + * @return int + */ +int +qed_eth_rx_queue_stop(struct qed_hwfn *p_hwfn, + void *p_rxq, + bool eq_completion_only, bool cqe_completion); + +/** + * @brief qed_eth_tx_queue_stop - closes a Tx queue + * + * @param p_hwfn + * @param p_txq - handle to Tx queue needed to be closed + * + * @return int + */ +int qed_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, void *p_txq); + +enum qed_tpa_mode { + QED_TPA_MODE_NONE, + QED_TPA_MODE_UNUSED, + QED_TPA_MODE_GRO, + QED_TPA_MODE_MAX +}; + +struct qed_sp_vport_start_params { + enum qed_tpa_mode tpa_mode; + bool remove_inner_vlan; + bool tx_switching; + bool handle_ptp_pkts; + bool only_untagged; + bool drop_ttl0; + u8 max_buffers_per_cqe; + u32 concrete_fid; + u16 opaque_fid; + u8 vport_id; + u16 mtu; + bool check_mac; + bool check_ethtype; +}; + +int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_start_params *p_params); + + +struct qed_filter_accept_flags { + u8 update_rx_mode_config; + u8 update_tx_mode_config; + u8 rx_accept_filter; + u8 tx_accept_filter; +#define QED_ACCEPT_NONE 0x01 +#define QED_ACCEPT_UCAST_MATCHED 0x02 +#define QED_ACCEPT_UCAST_UNMATCHED 0x04 +#define QED_ACCEPT_MCAST_MATCHED 0x08 +#define QED_ACCEPT_MCAST_UNMATCHED 0x10 +#define QED_ACCEPT_BCAST 0x20 +}; + +struct qed_arfs_config_params { + bool tcp; + bool udp; + bool ipv4; + bool ipv6; + enum qed_filter_config_mode mode; +}; + +struct qed_sp_vport_update_params { + u16 opaque_fid; + u8 vport_id; + u8 update_vport_active_rx_flg; + u8 vport_active_rx_flg; + u8 update_vport_active_tx_flg; + u8 vport_active_tx_flg; + u8 update_inner_vlan_removal_flg; + u8 inner_vlan_removal_flg; + u8 silent_vlan_removal_flg; + u8 update_default_vlan_enable_flg; + u8 default_vlan_enable_flg; + u8 update_default_vlan_flg; + u16 default_vlan; + u8 update_tx_switching_flg; + u8 tx_switching_flg; + u8 update_approx_mcast_flg; + u8 update_anti_spoofing_en_flg; + u8 anti_spoofing_en; + u8 update_accept_any_vlan_flg; + u8 accept_any_vlan; + unsigned long bins[8]; + struct qed_rss_params *rss_params; + struct qed_filter_accept_flags accept_flags; + struct qed_sge_tpa_params *sge_tpa_params; +}; + +int qed_sp_vport_update(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_params, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data); + +/** + * @brief qed_sp_vport_stop - + * + * This ramrod closes a VPort after all its RX and TX queues are terminated. + * An Assert is generated if any queues are left open. + * + * @param p_hwfn + * @param opaque_fid + * @param vport_id VPort ID + * + * @return int + */ +int qed_sp_vport_stop(struct qed_hwfn *p_hwfn, u16 opaque_fid, u8 vport_id); + +int qed_sp_eth_filter_ucast(struct qed_hwfn *p_hwfn, + u16 opaque_fid, + struct qed_filter_ucast *p_filter_cmd, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data); + +/** + * @brief qed_sp_rx_eth_queues_update - + * + * This ramrod updates an RX queue. It is used for setting the active state + * of the queue and updating the TPA and SGE parameters. + * + * @note At the moment - only used by non-linux VFs. + * + * @param p_hwfn + * @param pp_rxq_handlers An array of queue handlers to be updated. 
+ * @param num_rxqs number of queues to update. + * @param complete_cqe_flg Post completion to the CQE Ring if set + * @param complete_event_flg Post completion to the Event Ring if set + * @param comp_mode + * @param p_comp_data + * + * @return int + */ + +int +qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn, + void **pp_rxq_handlers, + u8 num_rxqs, + u8 complete_cqe_flg, + u8 complete_event_flg, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data); + +void qed_get_vport_stats(struct qed_dev *cdev, struct qed_eth_stats *stats); + +void qed_reset_vport_stats(struct qed_dev *cdev); + +/** + * *@brief qed_arfs_mode_configure - + * + **Enable or disable rfs mode. It must accept atleast one of tcp or udp true + **and atleast one of ipv4 or ipv6 true to enable rfs mode. + * + **@param p_hwfn + **@param p_ptt + **@param p_cfg_params - arfs mode configuration parameters. + * + */ +void qed_arfs_mode_configure(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_arfs_config_params *p_cfg_params); + +/** + * @brief - qed_configure_rfs_ntuple_filter + * + * This ramrod should be used to add or remove arfs hw filter + * + * @params p_hwfn + * @params p_cb - Used for QED_SPQ_MODE_CB,where client would initialize + * it with cookie and callback function address, if not + * using this mode then client must pass NULL. + * @params p_params + */ +int +qed_configure_rfs_ntuple_filter(struct qed_hwfn *p_hwfn, + struct qed_spq_comp_cb *p_cb, + struct qed_ntuple_filter_params *p_params); + +#define MAX_QUEUES_PER_QZONE (sizeof(unsigned long) * 8) +#define QED_QUEUE_CID_SELF (0xff) + +/* Almost identical to the qed_queue_start_common_params, + * but here we maintain the SB index in IGU CAM. + */ +struct qed_queue_cid_params { + u8 vport_id; + u16 queue_id; + u8 stats_id; +}; + +/* Additional parameters required for initialization of the queue_cid + * and are relevant only for a PF initializing one for its VFs. + */ +struct qed_queue_cid_vf_params { + /* Should match the VF's relative index */ + u8 vfid; + + /* 0-based queue index. Should reflect the relative qzone the + * VF thinks is associated with it [in its range]. + */ + u8 vf_qid; + + /* Indicates a VF is legacy, making it differ in several things: + * - Producers would be placed in a different place. + * - Makes assumptions regarding the CIDs. + */ + u8 vf_legacy; + + u8 qid_usage_idx; +}; + +struct qed_queue_cid { + /* For stats-id, the `rel' is actually absolute as well */ + struct qed_queue_cid_params rel; + struct qed_queue_cid_params abs; + + /* These have no 'relative' meaning */ + u16 sb_igu_id; + u8 sb_idx; + + u32 cid; + u16 opaque_fid; + + bool b_is_rx; + + /* VFs queues are mapped differently, so we need to know the + * relative queue associated with them [0-based]. + * Notice this is relevant on the *PF* queue-cid of its VF's queues, + * and not on the VF itself. + */ + u8 vfid; + u8 vf_qid; + + /* We need an additional index to differentiate between queues opened + * for same queue-zone, as VFs would have to communicate the info + * to the PF [otherwise PF has no way to differentiate]. 
+ */ + u8 qid_usage_idx; + + u8 vf_legacy; +#define QED_QCID_LEGACY_VF_RX_PROD (BIT(0)) +#define QED_QCID_LEGACY_VF_CID (BIT(1)) + + struct qed_hwfn *p_owner; +}; + +int qed_l2_alloc(struct qed_hwfn *p_hwfn); +void qed_l2_setup(struct qed_hwfn *p_hwfn); +void qed_l2_free(struct qed_hwfn *p_hwfn); + +void qed_eth_queue_cid_release(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid); + +struct qed_queue_cid * +qed_eth_queue_to_cid(struct qed_hwfn *p_hwfn, + u16 opaque_fid, + struct qed_queue_start_common_params *p_params, + bool b_is_rx, + struct qed_queue_cid_vf_params *p_vf_params); + +int +qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_start_params *p_params); + +/** + * @brief - Starts an Rx queue, when queue_cid is already prepared + * + * @param p_hwfn + * @param p_cid + * @param bd_max_bytes + * @param bd_chain_phys_addr + * @param cqe_pbl_addr + * @param cqe_pbl_size + * + * @return int + */ +int +qed_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + u16 bd_max_bytes, + dma_addr_t bd_chain_phys_addr, + dma_addr_t cqe_pbl_addr, u16 cqe_pbl_size); + +/** + * @brief - Starts a Tx queue, where queue_cid is already prepared + * + * @param p_hwfn + * @param p_cid + * @param pbl_addr + * @param pbl_size + * @param p_pq_params - parameters for choosing the PQ for this Tx queue + * + * @return int + */ +int +qed_eth_txq_start_ramrod(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + dma_addr_t pbl_addr, u16 pbl_size, u16 pq_id); + +u8 qed_mcast_bin_from_mac(u8 *mac); + +int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 coalesce, struct qed_queue_cid *p_cid); + +int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 coalesce, struct qed_queue_cid *p_cid); + +int qed_get_rxq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_queue_cid *p_cid, u16 *p_hw_coal); + +int qed_get_txq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_queue_cid *p_cid, u16 *p_hw_coal); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c new file mode 100644 index 0000000..468c59d --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -0,0 +1,2513 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_cxt.h" +#include "qed_dev_api.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_int.h" +#include "qed_ll2.h" +#include "qed_mcp.h" +#include "qed_ooo.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" +#include "qed_rdma.h" + +#define QED_LL2_RX_REGISTERED(ll2) ((ll2)->rx_queue.b_cb_registred) +#define QED_LL2_TX_REGISTERED(ll2) ((ll2)->tx_queue.b_cb_registred) + +#define QED_LL2_TX_SIZE (256) +#define QED_LL2_RX_SIZE (4096) + +struct qed_cb_ll2_info { + int rx_cnt; + u32 rx_size; + u8 handle; + + /* Lock protecting LL2 buffer lists in sleepless context */ + spinlock_t lock; + struct list_head list; + + const struct qed_ll2_cb_ops *cbs; + void *cb_cookie; +}; + +struct qed_ll2_buffer { + struct list_head list; + void *data; + dma_addr_t phys_addr; +}; + +static void qed_ll2b_complete_tx_packet(void *cxt, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, + bool b_last_packet) +{ + struct qed_hwfn *p_hwfn = cxt; + struct qed_dev *cdev = p_hwfn->cdev; + struct sk_buff *skb = cookie; + + /* All we need to do is release the mapping */ + dma_unmap_single(&p_hwfn->cdev->pdev->dev, first_frag_addr, + skb_headlen(skb), DMA_TO_DEVICE); + + if (cdev->ll2->cbs && cdev->ll2->cbs->tx_cb) + cdev->ll2->cbs->tx_cb(cdev->ll2->cb_cookie, skb, + b_last_fragment); + + dev_kfree_skb_any(skb); +} + +static int qed_ll2_alloc_buffer(struct qed_dev *cdev, + u8 **data, dma_addr_t *phys_addr) +{ + *data = kmalloc(cdev->ll2->rx_size, GFP_ATOMIC); + if (!(*data)) { + DP_INFO(cdev, "Failed to allocate LL2 buffer data\n"); + return -ENOMEM; + } + + *phys_addr = dma_map_single(&cdev->pdev->dev, + ((*data) + NET_SKB_PAD), + cdev->ll2->rx_size, DMA_FROM_DEVICE); + if (dma_mapping_error(&cdev->pdev->dev, *phys_addr)) { + DP_INFO(cdev, "Failed to map LL2 buffer data\n"); + kfree((*data)); + return -ENOMEM; + } + + return 0; +} + +static int qed_ll2_dealloc_buffer(struct qed_dev *cdev, + struct qed_ll2_buffer *buffer) +{ + spin_lock_bh(&cdev->ll2->lock); + + dma_unmap_single(&cdev->pdev->dev, buffer->phys_addr, + cdev->ll2->rx_size, DMA_FROM_DEVICE); + kfree(buffer->data); + list_del(&buffer->list); + + cdev->ll2->rx_cnt--; + if (!cdev->ll2->rx_cnt) + DP_INFO(cdev, "All LL2 entries were removed\n"); + + spin_unlock_bh(&cdev->ll2->lock); + + return 0; +} + +static void qed_ll2_kill_buffers(struct qed_dev *cdev) +{ + struct qed_ll2_buffer *buffer, *tmp_buffer; + + list_for_each_entry_safe(buffer, tmp_buffer, &cdev->ll2->list, list) + qed_ll2_dealloc_buffer(cdev, buffer); +} + +void qed_ll2b_complete_rx_packet(void *cxt, struct qed_ll2_comp_rx_data *data) +{ + struct qed_hwfn *p_hwfn = cxt; + struct qed_ll2_buffer *buffer = data->cookie; + struct qed_dev *cdev = p_hwfn->cdev; + dma_addr_t new_phys_addr; + struct sk_buff *skb; + bool reuse = false; + int rc = -EINVAL; + u8 *new_data; + + DP_VERBOSE(p_hwfn, + (NETIF_MSG_RX_STATUS | QED_MSG_STORAGE | NETIF_MSG_PKTDATA), + "Got an LL2 Rx completion: [Buffer at phys 0x%llx, offset 0x%02x] Length 0x%04x Parse_flags 0x%04x vlan 
0x%04x Opaque data [0x%08x:0x%08x]\n", + (u64)data->rx_buf_addr, + data->u.placement_offset, + data->length.packet_length, + data->parse_flags, + data->vlan, data->opaque_data_0, data->opaque_data_1); + + if ((cdev->dp_module & NETIF_MSG_PKTDATA) && buffer->data) { + print_hex_dump(KERN_INFO, "", + DUMP_PREFIX_OFFSET, 16, 1, + buffer->data, data->length.packet_length, false); + } + + /* Determine if data is valid */ + if (data->length.packet_length < ETH_HLEN) + reuse = true; + + /* Allocate a replacement for buffer; Reuse upon failure */ + if (!reuse) + rc = qed_ll2_alloc_buffer(p_hwfn->cdev, &new_data, + &new_phys_addr); + + /* If need to reuse or there's no replacement buffer, repost this */ + if (rc) + goto out_post; + dma_unmap_single(&cdev->pdev->dev, buffer->phys_addr, + cdev->ll2->rx_size, DMA_FROM_DEVICE); + + skb = build_skb(buffer->data, 0); + if (!skb) { + rc = -ENOMEM; + goto out_post; + } + + data->u.placement_offset += NET_SKB_PAD; + skb_reserve(skb, data->u.placement_offset); + skb_put(skb, data->length.packet_length); + skb_checksum_none_assert(skb); + + /* Get parital ethernet information instead of eth_type_trans(), + * Since we don't have an associated net_device. + */ + skb_reset_mac_header(skb); + skb->protocol = eth_hdr(skb)->h_proto; + + /* Pass SKB onward */ + if (cdev->ll2->cbs && cdev->ll2->cbs->rx_cb) { + if (data->vlan) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), + data->vlan); + cdev->ll2->cbs->rx_cb(cdev->ll2->cb_cookie, skb, + data->opaque_data_0, + data->opaque_data_1); + } + + /* Update Buffer information and update FW producer */ + buffer->data = new_data; + buffer->phys_addr = new_phys_addr; + +out_post: + rc = qed_ll2_post_rx_buffer(QED_LEADING_HWFN(cdev), cdev->ll2->handle, + buffer->phys_addr, 0, buffer, 1); + + if (rc) + qed_ll2_dealloc_buffer(cdev, buffer); +} + +static struct qed_ll2_info *__qed_ll2_handle_sanity(struct qed_hwfn *p_hwfn, + u8 connection_handle, + bool b_lock, + bool b_only_active) +{ + struct qed_ll2_info *p_ll2_conn, *p_ret = NULL; + + if (connection_handle >= QED_MAX_NUM_OF_LL2_CONNECTIONS) + return NULL; + + if (!p_hwfn->p_ll2_info) + return NULL; + + p_ll2_conn = &p_hwfn->p_ll2_info[connection_handle]; + + if (b_only_active) { + if (b_lock) + mutex_lock(&p_ll2_conn->mutex); + if (p_ll2_conn->b_active) + p_ret = p_ll2_conn; + if (b_lock) + mutex_unlock(&p_ll2_conn->mutex); + } else { + p_ret = p_ll2_conn; + } + + return p_ret; +} + +static struct qed_ll2_info *qed_ll2_handle_sanity(struct qed_hwfn *p_hwfn, + u8 connection_handle) +{ + return __qed_ll2_handle_sanity(p_hwfn, connection_handle, false, true); +} + +static struct qed_ll2_info *qed_ll2_handle_sanity_lock(struct qed_hwfn *p_hwfn, + u8 connection_handle) +{ + return __qed_ll2_handle_sanity(p_hwfn, connection_handle, true, true); +} + +static struct qed_ll2_info *qed_ll2_handle_sanity_inactive(struct qed_hwfn + *p_hwfn, + u8 connection_handle) +{ + return __qed_ll2_handle_sanity(p_hwfn, connection_handle, false, false); +} + +static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle) +{ + bool b_last_packet = false, b_last_frag = false; + struct qed_ll2_tx_packet *p_pkt = NULL; + struct qed_ll2_info *p_ll2_conn; + struct qed_ll2_tx_queue *p_tx; + unsigned long flags = 0; + dma_addr_t tx_frag; + + p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle); + if (!p_ll2_conn) + return; + + p_tx = &p_ll2_conn->tx_queue; + + spin_lock_irqsave(&p_tx->lock, flags); + while (!list_empty(&p_tx->active_descq)) { + p_pkt = 
list_first_entry(&p_tx->active_descq, + struct qed_ll2_tx_packet, list_entry); + if (!p_pkt) + break; + + list_del(&p_pkt->list_entry); + b_last_packet = list_empty(&p_tx->active_descq); + list_add_tail(&p_pkt->list_entry, &p_tx->free_descq); + spin_unlock_irqrestore(&p_tx->lock, flags); + if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO) { + struct qed_ooo_buffer *p_buffer; + + p_buffer = (struct qed_ooo_buffer *)p_pkt->cookie; + qed_ooo_put_free_buffer(p_hwfn, p_hwfn->p_ooo_info, + p_buffer); + } else { + p_tx->cur_completing_packet = *p_pkt; + p_tx->cur_completing_bd_idx = 1; + b_last_frag = + p_tx->cur_completing_bd_idx == p_pkt->bd_used; + tx_frag = p_pkt->bds_set[0].tx_frag; + p_ll2_conn->cbs.tx_release_cb(p_ll2_conn->cbs.cookie, + p_ll2_conn->my_id, + p_pkt->cookie, + tx_frag, + b_last_frag, + b_last_packet); + } + spin_lock_irqsave(&p_tx->lock, flags); + } + spin_unlock_irqrestore(&p_tx->lock, flags); +} + +static int qed_ll2_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) +{ + struct qed_ll2_info *p_ll2_conn = p_cookie; + struct qed_ll2_tx_queue *p_tx = &p_ll2_conn->tx_queue; + u16 new_idx = 0, num_bds = 0, num_bds_in_packet = 0; + struct qed_ll2_tx_packet *p_pkt; + bool b_last_frag = false; + unsigned long flags; + int rc = -EINVAL; + + spin_lock_irqsave(&p_tx->lock, flags); + if (p_tx->b_completing_packet) { + rc = -EBUSY; + goto out; + } + + new_idx = le16_to_cpu(*p_tx->p_fw_cons); + num_bds = ((s16)new_idx - (s16)p_tx->bds_idx); + while (num_bds) { + if (list_empty(&p_tx->active_descq)) + goto out; + + p_pkt = list_first_entry(&p_tx->active_descq, + struct qed_ll2_tx_packet, list_entry); + if (!p_pkt) + goto out; + + p_tx->b_completing_packet = true; + p_tx->cur_completing_packet = *p_pkt; + num_bds_in_packet = p_pkt->bd_used; + list_del(&p_pkt->list_entry); + + if (num_bds < num_bds_in_packet) { + DP_NOTICE(p_hwfn, + "Rest of BDs does not cover whole packet\n"); + goto out; + } + + num_bds -= num_bds_in_packet; + p_tx->bds_idx += num_bds_in_packet; + while (num_bds_in_packet--) + qed_chain_consume(&p_tx->txq_chain); + + p_tx->cur_completing_bd_idx = 1; + b_last_frag = p_tx->cur_completing_bd_idx == p_pkt->bd_used; + list_add_tail(&p_pkt->list_entry, &p_tx->free_descq); + + spin_unlock_irqrestore(&p_tx->lock, flags); + + p_ll2_conn->cbs.tx_comp_cb(p_ll2_conn->cbs.cookie, + p_ll2_conn->my_id, + p_pkt->cookie, + p_pkt->bds_set[0].tx_frag, + b_last_frag, !num_bds); + + spin_lock_irqsave(&p_tx->lock, flags); + } + + p_tx->b_completing_packet = false; + rc = 0; +out: + spin_unlock_irqrestore(&p_tx->lock, flags); + return rc; +} + +static void qed_ll2_rxq_parse_gsi(struct qed_hwfn *p_hwfn, + union core_rx_cqe_union *p_cqe, + struct qed_ll2_comp_rx_data *data) +{ + data->parse_flags = le16_to_cpu(p_cqe->rx_cqe_gsi.parse_flags.flags); + data->length.data_length = le16_to_cpu(p_cqe->rx_cqe_gsi.data_length); + data->vlan = le16_to_cpu(p_cqe->rx_cqe_gsi.vlan); + data->opaque_data_0 = le32_to_cpu(p_cqe->rx_cqe_gsi.src_mac_addrhi); + data->opaque_data_1 = le16_to_cpu(p_cqe->rx_cqe_gsi.src_mac_addrlo); + data->u.data_length_error = p_cqe->rx_cqe_gsi.data_length_error; + data->qp_id = le16_to_cpu(p_cqe->rx_cqe_gsi.qp_id); + + data->src_qp = le32_to_cpu(p_cqe->rx_cqe_gsi.src_qp); +} + +static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn, + union core_rx_cqe_union *p_cqe, + struct qed_ll2_comp_rx_data *data) +{ + data->parse_flags = le16_to_cpu(p_cqe->rx_cqe_fp.parse_flags.flags); + data->err_flags = le16_to_cpu(p_cqe->rx_cqe_fp.err_flags.flags); + data->length.packet_length = 
+ le16_to_cpu(p_cqe->rx_cqe_fp.packet_length); + data->vlan = le16_to_cpu(p_cqe->rx_cqe_fp.vlan); + data->opaque_data_0 = le32_to_cpu(p_cqe->rx_cqe_fp.opaque_data.data[0]); + data->opaque_data_1 = le32_to_cpu(p_cqe->rx_cqe_fp.opaque_data.data[1]); + data->u.placement_offset = p_cqe->rx_cqe_fp.placement_offset; +} + +static int +qed_ll2_handle_slowpath(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn, + union core_rx_cqe_union *p_cqe, + unsigned long *p_lock_flags) +{ + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + struct core_rx_slow_path_cqe *sp_cqe; + + sp_cqe = &p_cqe->rx_cqe_sp; + if (sp_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH) { + DP_NOTICE(p_hwfn, + "LL2 - unexpected Rx CQE slowpath ramrod_cmd_id:%d\n", + sp_cqe->ramrod_cmd_id); + return -EINVAL; + } + + if (!p_ll2_conn->cbs.slowpath_cb) { + DP_NOTICE(p_hwfn, + "LL2 - received RX_QUEUE_FLUSH but no callback was provided\n"); + return -EINVAL; + } + + spin_unlock_irqrestore(&p_rx->lock, *p_lock_flags); + + p_ll2_conn->cbs.slowpath_cb(p_ll2_conn->cbs.cookie, + p_ll2_conn->my_id, + le32_to_cpu(sp_cqe->opaque_data.data[0]), + le32_to_cpu(sp_cqe->opaque_data.data[1])); + + spin_lock_irqsave(&p_rx->lock, *p_lock_flags); + + return 0; +} + +static int +qed_ll2_rxq_handle_completion(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn, + union core_rx_cqe_union *p_cqe, + unsigned long *p_lock_flags, bool b_last_cqe) +{ + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + struct qed_ll2_rx_packet *p_pkt = NULL; + struct qed_ll2_comp_rx_data data; + + if (!list_empty(&p_rx->active_descq)) + p_pkt = list_first_entry(&p_rx->active_descq, + struct qed_ll2_rx_packet, list_entry); + if (!p_pkt) { + DP_NOTICE(p_hwfn, + "[%d] LL2 Rx completion but active_descq is empty\n", + p_ll2_conn->input.conn_type); + + return -EIO; + } + list_del(&p_pkt->list_entry); + + if (p_cqe->rx_cqe_sp.type == CORE_RX_CQE_TYPE_REGULAR) + qed_ll2_rxq_parse_reg(p_hwfn, p_cqe, &data); + else + qed_ll2_rxq_parse_gsi(p_hwfn, p_cqe, &data); + if (qed_chain_consume(&p_rx->rxq_chain) != p_pkt->rxq_bd) + DP_NOTICE(p_hwfn, + "Mismatch between active_descq and the LL2 Rx chain\n"); + + list_add_tail(&p_pkt->list_entry, &p_rx->free_descq); + + data.connection_handle = p_ll2_conn->my_id; + data.cookie = p_pkt->cookie; + data.rx_buf_addr = p_pkt->rx_buf_addr; + data.b_last_packet = b_last_cqe; + + spin_unlock_irqrestore(&p_rx->lock, *p_lock_flags); + p_ll2_conn->cbs.rx_comp_cb(p_ll2_conn->cbs.cookie, &data); + + spin_lock_irqsave(&p_rx->lock, *p_lock_flags); + + return 0; +} + +static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie) +{ + struct qed_ll2_info *p_ll2_conn = (struct qed_ll2_info *)cookie; + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + union core_rx_cqe_union *cqe = NULL; + u16 cq_new_idx = 0, cq_old_idx = 0; + unsigned long flags = 0; + int rc = 0; + + spin_lock_irqsave(&p_rx->lock, flags); + cq_new_idx = le16_to_cpu(*p_rx->p_fw_cons); + cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain); + + while (cq_new_idx != cq_old_idx) { + bool b_last_cqe = (cq_new_idx == cq_old_idx); + + cqe = + (union core_rx_cqe_union *) + qed_chain_consume(&p_rx->rcq_chain); + cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain); + + DP_VERBOSE(p_hwfn, + QED_MSG_LL2, + "LL2 [sw. cons %04x, fw. 
at %04x] - Got Packet of type %02x\n", + cq_old_idx, cq_new_idx, cqe->rx_cqe_sp.type); + + switch (cqe->rx_cqe_sp.type) { + case CORE_RX_CQE_TYPE_SLOW_PATH: + rc = qed_ll2_handle_slowpath(p_hwfn, p_ll2_conn, + cqe, &flags); + break; + case CORE_RX_CQE_TYPE_GSI_OFFLOAD: + case CORE_RX_CQE_TYPE_REGULAR: + rc = qed_ll2_rxq_handle_completion(p_hwfn, p_ll2_conn, + cqe, &flags, + b_last_cqe); + break; + default: + rc = -EIO; + } + } + + spin_unlock_irqrestore(&p_rx->lock, flags); + return rc; +} + +static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle) +{ + struct qed_ll2_info *p_ll2_conn = NULL; + struct qed_ll2_rx_packet *p_pkt = NULL; + struct qed_ll2_rx_queue *p_rx; + unsigned long flags = 0; + + p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle); + if (!p_ll2_conn) + return; + + p_rx = &p_ll2_conn->rx_queue; + + spin_lock_irqsave(&p_rx->lock, flags); + while (!list_empty(&p_rx->active_descq)) { + p_pkt = list_first_entry(&p_rx->active_descq, + struct qed_ll2_rx_packet, list_entry); + if (!p_pkt) + break; + list_move_tail(&p_pkt->list_entry, &p_rx->free_descq); + spin_unlock_irqrestore(&p_rx->lock, flags); + + if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO) { + struct qed_ooo_buffer *p_buffer; + + p_buffer = (struct qed_ooo_buffer *)p_pkt->cookie; + qed_ooo_put_free_buffer(p_hwfn, p_hwfn->p_ooo_info, + p_buffer); + } else { + dma_addr_t rx_buf_addr = p_pkt->rx_buf_addr; + void *cookie = p_pkt->cookie; + bool b_last; + + b_last = list_empty(&p_rx->active_descq); + p_ll2_conn->cbs.rx_release_cb(p_ll2_conn->cbs.cookie, + p_ll2_conn->my_id, + cookie, + rx_buf_addr, b_last); + } + spin_lock_irqsave(&p_rx->lock, flags); + } + spin_unlock_irqrestore(&p_rx->lock, flags); +} + +static bool +qed_ll2_lb_rxq_handler_slowpath(struct qed_hwfn *p_hwfn, + struct core_rx_slow_path_cqe *p_cqe) +{ + struct ooo_opaque *iscsi_ooo; + u32 cid; + + if (p_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH) + return false; + + iscsi_ooo = (struct ooo_opaque *)&p_cqe->opaque_data; + if (iscsi_ooo->ooo_opcode != TCP_EVENT_DELETE_ISLES) + return false; + + /* Need to make a flush */ + cid = le32_to_cpu(iscsi_ooo->cid); + qed_ooo_release_connection_isles(p_hwfn, p_hwfn->p_ooo_info, cid); + + return true; +} + +static int qed_ll2_lb_rxq_handler(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + u16 packet_length = 0, parse_flags = 0, vlan = 0; + struct qed_ll2_rx_packet *p_pkt = NULL; + u32 num_ooo_add_to_peninsula = 0, cid; + union core_rx_cqe_union *cqe = NULL; + u16 cq_new_idx = 0, cq_old_idx = 0; + struct qed_ooo_buffer *p_buffer; + struct ooo_opaque *iscsi_ooo; + u8 placement_offset = 0; + u8 cqe_type; + + cq_new_idx = le16_to_cpu(*p_rx->p_fw_cons); + cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain); + if (cq_new_idx == cq_old_idx) + return 0; + + while (cq_new_idx != cq_old_idx) { + struct core_rx_fast_path_cqe *p_cqe_fp; + + cqe = qed_chain_consume(&p_rx->rcq_chain); + cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain); + cqe_type = cqe->rx_cqe_sp.type; + + if (cqe_type == CORE_RX_CQE_TYPE_SLOW_PATH) + if (qed_ll2_lb_rxq_handler_slowpath(p_hwfn, + &cqe->rx_cqe_sp)) + continue; + + if (cqe_type != CORE_RX_CQE_TYPE_REGULAR) { + DP_NOTICE(p_hwfn, + "Got a non-regular LB LL2 completion [type 0x%02x]\n", + cqe_type); + return -EINVAL; + } + p_cqe_fp = &cqe->rx_cqe_fp; + + placement_offset = p_cqe_fp->placement_offset; + parse_flags = le16_to_cpu(p_cqe_fp->parse_flags.flags); + packet_length = 
le16_to_cpu(p_cqe_fp->packet_length); + vlan = le16_to_cpu(p_cqe_fp->vlan); + iscsi_ooo = (struct ooo_opaque *)&p_cqe_fp->opaque_data; + qed_ooo_save_history_entry(p_hwfn, p_hwfn->p_ooo_info, + iscsi_ooo); + cid = le32_to_cpu(iscsi_ooo->cid); + + /* Process delete isle first */ + if (iscsi_ooo->drop_size) + qed_ooo_delete_isles(p_hwfn, p_hwfn->p_ooo_info, cid, + iscsi_ooo->drop_isle, + iscsi_ooo->drop_size); + + if (iscsi_ooo->ooo_opcode == TCP_EVENT_NOP) + continue; + + /* Now process create/add/join isles */ + if (list_empty(&p_rx->active_descq)) { + DP_NOTICE(p_hwfn, + "LL2 OOO RX chain has no submitted buffers\n" + ); + return -EIO; + } + + p_pkt = list_first_entry(&p_rx->active_descq, + struct qed_ll2_rx_packet, list_entry); + + if ((iscsi_ooo->ooo_opcode == TCP_EVENT_ADD_NEW_ISLE) || + (iscsi_ooo->ooo_opcode == TCP_EVENT_ADD_ISLE_RIGHT) || + (iscsi_ooo->ooo_opcode == TCP_EVENT_ADD_ISLE_LEFT) || + (iscsi_ooo->ooo_opcode == TCP_EVENT_ADD_PEN) || + (iscsi_ooo->ooo_opcode == TCP_EVENT_JOIN)) { + if (!p_pkt) { + DP_NOTICE(p_hwfn, + "LL2 OOO RX packet is not valid\n"); + return -EIO; + } + list_del(&p_pkt->list_entry); + p_buffer = (struct qed_ooo_buffer *)p_pkt->cookie; + p_buffer->packet_length = packet_length; + p_buffer->parse_flags = parse_flags; + p_buffer->vlan = vlan; + p_buffer->placement_offset = placement_offset; + qed_chain_consume(&p_rx->rxq_chain); + list_add_tail(&p_pkt->list_entry, &p_rx->free_descq); + + switch (iscsi_ooo->ooo_opcode) { + case TCP_EVENT_ADD_NEW_ISLE: + qed_ooo_add_new_isle(p_hwfn, + p_hwfn->p_ooo_info, + cid, + iscsi_ooo->ooo_isle, + p_buffer); + break; + case TCP_EVENT_ADD_ISLE_RIGHT: + qed_ooo_add_new_buffer(p_hwfn, + p_hwfn->p_ooo_info, + cid, + iscsi_ooo->ooo_isle, + p_buffer, + QED_OOO_RIGHT_BUF); + break; + case TCP_EVENT_ADD_ISLE_LEFT: + qed_ooo_add_new_buffer(p_hwfn, + p_hwfn->p_ooo_info, + cid, + iscsi_ooo->ooo_isle, + p_buffer, + QED_OOO_LEFT_BUF); + break; + case TCP_EVENT_JOIN: + qed_ooo_add_new_buffer(p_hwfn, + p_hwfn->p_ooo_info, + cid, + iscsi_ooo->ooo_isle + + 1, + p_buffer, + QED_OOO_LEFT_BUF); + qed_ooo_join_isles(p_hwfn, + p_hwfn->p_ooo_info, + cid, iscsi_ooo->ooo_isle); + break; + case TCP_EVENT_ADD_PEN: + num_ooo_add_to_peninsula++; + qed_ooo_put_ready_buffer(p_hwfn, + p_hwfn->p_ooo_info, + p_buffer, true); + break; + } + } else { + DP_NOTICE(p_hwfn, + "Unexpected event (%d) TX OOO completion\n", + iscsi_ooo->ooo_opcode); + } + } + + return 0; +} + +static void +qed_ooo_submit_tx_buffers(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + struct qed_ll2_tx_pkt_info tx_pkt; + struct qed_ooo_buffer *p_buffer; + u16 l4_hdr_offset_w; + dma_addr_t first_frag; + u8 bd_flags; + int rc; + + /* Submit Tx buffers here */ + while ((p_buffer = qed_ooo_get_ready_buffer(p_hwfn, + p_hwfn->p_ooo_info))) { + l4_hdr_offset_w = 0; + bd_flags = 0; + + first_frag = p_buffer->rx_buffer_phys_addr + + p_buffer->placement_offset; + SET_FIELD(bd_flags, CORE_TX_BD_DATA_FORCE_VLAN_MODE, 1); + SET_FIELD(bd_flags, CORE_TX_BD_DATA_L4_PROTOCOL, 1); + + memset(&tx_pkt, 0, sizeof(tx_pkt)); + tx_pkt.num_of_bds = 1; + tx_pkt.vlan = p_buffer->vlan; + tx_pkt.bd_flags = bd_flags; + tx_pkt.l4_hdr_offset_w = l4_hdr_offset_w; + tx_pkt.tx_dest = p_ll2_conn->tx_dest; + tx_pkt.first_frag = first_frag; + tx_pkt.first_frag_len = p_buffer->packet_length; + tx_pkt.cookie = p_buffer; + + rc = qed_ll2_prepare_tx_packet(p_hwfn, p_ll2_conn->my_id, + &tx_pkt, true); + if (rc) { + qed_ooo_put_ready_buffer(p_hwfn, p_hwfn->p_ooo_info, + p_buffer, false); + break; + } + } +} + 
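+/* Repost every buffer currently on the OOO free list to the LL2 Rx queue.
+ * A buffer that fails to post is returned to the free list and posting stops.
+ */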
+static void +qed_ooo_submit_rx_buffers(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + struct qed_ooo_buffer *p_buffer; + int rc; + + while ((p_buffer = qed_ooo_get_free_buffer(p_hwfn, + p_hwfn->p_ooo_info))) { + rc = qed_ll2_post_rx_buffer(p_hwfn, + p_ll2_conn->my_id, + p_buffer->rx_buffer_phys_addr, + 0, p_buffer, true); + if (rc) { + qed_ooo_put_free_buffer(p_hwfn, + p_hwfn->p_ooo_info, p_buffer); + break; + } + } +} + +static int qed_ll2_lb_rxq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) +{ + struct qed_ll2_info *p_ll2_conn = (struct qed_ll2_info *)p_cookie; + int rc; + + if (!QED_LL2_RX_REGISTERED(p_ll2_conn)) + return 0; + + rc = qed_ll2_lb_rxq_handler(p_hwfn, p_ll2_conn); + if (rc) + return rc; + + qed_ooo_submit_rx_buffers(p_hwfn, p_ll2_conn); + qed_ooo_submit_tx_buffers(p_hwfn, p_ll2_conn); + + return 0; +} + +static int qed_ll2_lb_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) +{ + struct qed_ll2_info *p_ll2_conn = (struct qed_ll2_info *)p_cookie; + struct qed_ll2_tx_queue *p_tx = &p_ll2_conn->tx_queue; + struct qed_ll2_tx_packet *p_pkt = NULL; + struct qed_ooo_buffer *p_buffer; + bool b_dont_submit_rx = false; + u16 new_idx = 0, num_bds = 0; + int rc; + + if (!QED_LL2_TX_REGISTERED(p_ll2_conn)) + return 0; + + new_idx = le16_to_cpu(*p_tx->p_fw_cons); + num_bds = ((s16)new_idx - (s16)p_tx->bds_idx); + + if (!num_bds) + return 0; + + while (num_bds) { + if (list_empty(&p_tx->active_descq)) + return -EINVAL; + + p_pkt = list_first_entry(&p_tx->active_descq, + struct qed_ll2_tx_packet, list_entry); + if (!p_pkt) + return -EINVAL; + + if (p_pkt->bd_used != 1) { + DP_NOTICE(p_hwfn, + "Unexpectedly many BDs(%d) in TX OOO completion\n", + p_pkt->bd_used); + return -EINVAL; + } + + list_del(&p_pkt->list_entry); + + num_bds--; + p_tx->bds_idx++; + qed_chain_consume(&p_tx->txq_chain); + + p_buffer = (struct qed_ooo_buffer *)p_pkt->cookie; + list_add_tail(&p_pkt->list_entry, &p_tx->free_descq); + + if (b_dont_submit_rx) { + qed_ooo_put_free_buffer(p_hwfn, p_hwfn->p_ooo_info, + p_buffer); + continue; + } + + rc = qed_ll2_post_rx_buffer(p_hwfn, p_ll2_conn->my_id, + p_buffer->rx_buffer_phys_addr, 0, + p_buffer, true); + if (rc != 0) { + qed_ooo_put_free_buffer(p_hwfn, + p_hwfn->p_ooo_info, p_buffer); + b_dont_submit_rx = true; + } + } + + qed_ooo_submit_tx_buffers(p_hwfn, p_ll2_conn); + + return 0; +} + +static void qed_ll2_stop_ooo(struct qed_dev *cdev) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + u8 *handle = &hwfn->pf_params.iscsi_pf_params.ll2_ooo_queue_id; + + DP_VERBOSE(cdev, QED_MSG_STORAGE, "Stopping LL2 OOO queue [%02x]\n", + *handle); + + qed_ll2_terminate_connection(hwfn, *handle); + qed_ll2_release_connection(hwfn, *handle); + *handle = QED_LL2_UNUSED_HANDLE; +} + +static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn, + u8 action_on_error) +{ + enum qed_ll2_conn_type conn_type = p_ll2_conn->input.conn_type; + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + struct core_rx_start_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + u16 cqe_pbl_size; + int rc = 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_ll2_conn->cid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + CORE_RAMROD_RX_QUEUE_START, + PROTOCOLID_CORE, &init_data); + if (rc) + return rc; + + p_ramrod = 
&p_ent->ramrod.core_rx_queue_start; + + p_ramrod->sb_id = cpu_to_le16(qed_int_get_sp_sb_id(p_hwfn)); + p_ramrod->sb_index = p_rx->rx_sb_index; + p_ramrod->complete_event_flg = 1; + + p_ramrod->mtu = cpu_to_le16(p_ll2_conn->input.mtu); + DMA_REGPAIR_LE(p_ramrod->bd_base, p_rx->rxq_chain.p_phys_addr); + cqe_pbl_size = (u16)qed_chain_get_page_cnt(&p_rx->rcq_chain); + p_ramrod->num_of_pbl_pages = cpu_to_le16(cqe_pbl_size); + DMA_REGPAIR_LE(p_ramrod->cqe_pbl_addr, + qed_chain_get_pbl_phys(&p_rx->rcq_chain)); + + p_ramrod->drop_ttl0_flg = p_ll2_conn->input.rx_drop_ttl0_flg; + p_ramrod->inner_vlan_stripping_en = + p_ll2_conn->input.rx_vlan_removal_en; + p_ramrod->queue_id = p_ll2_conn->queue_id; + p_ramrod->main_func_queue = p_ll2_conn->main_func_queue ? 1 : 0; + + if ((IS_MF_DEFAULT(p_hwfn) || IS_MF_SI(p_hwfn)) && + p_ramrod->main_func_queue && (conn_type != QED_LL2_TYPE_ROCE) && + (conn_type != QED_LL2_TYPE_IWARP)) { + p_ramrod->mf_si_bcast_accept_all = 1; + p_ramrod->mf_si_mcast_accept_all = 1; + } else { + p_ramrod->mf_si_bcast_accept_all = 0; + p_ramrod->mf_si_mcast_accept_all = 0; + } + + p_ramrod->action_on_error.error_type = action_on_error; + p_ramrod->gsi_offload_flag = p_ll2_conn->input.gsi_enable; + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_ll2_tx_queue_start(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + enum qed_ll2_conn_type conn_type = p_ll2_conn->input.conn_type; + struct qed_ll2_tx_queue *p_tx = &p_ll2_conn->tx_queue; + struct core_tx_start_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + u16 pq_id = 0, pbl_size; + int rc = -EINVAL; + + if (!QED_LL2_TX_REGISTERED(p_ll2_conn)) + return 0; + + if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO) + p_ll2_conn->tx_stats_en = 0; + else + p_ll2_conn->tx_stats_en = 1; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_ll2_conn->cid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + CORE_RAMROD_TX_QUEUE_START, + PROTOCOLID_CORE, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.core_tx_queue_start; + + p_ramrod->sb_id = cpu_to_le16(qed_int_get_sp_sb_id(p_hwfn)); + p_ramrod->sb_index = p_tx->tx_sb_index; + p_ramrod->mtu = cpu_to_le16(p_ll2_conn->input.mtu); + p_ramrod->stats_en = p_ll2_conn->tx_stats_en; + p_ramrod->stats_id = p_ll2_conn->tx_stats_id; + + DMA_REGPAIR_LE(p_ramrod->pbl_base_addr, + qed_chain_get_pbl_phys(&p_tx->txq_chain)); + pbl_size = qed_chain_get_page_cnt(&p_tx->txq_chain); + p_ramrod->pbl_size = cpu_to_le16(pbl_size); + + switch (p_ll2_conn->input.tx_tc) { + case PURE_LB_TC: + pq_id = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_LB); + break; + case PKT_LB_TC: + pq_id = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_OOO); + break; + default: + pq_id = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_OFLD); + break; + } + + p_ramrod->qm_pq_id = cpu_to_le16(pq_id); + + switch (conn_type) { + case QED_LL2_TYPE_FCOE: + p_ramrod->conn_type = PROTOCOLID_FCOE; + break; + case QED_LL2_TYPE_ISCSI: + p_ramrod->conn_type = PROTOCOLID_ISCSI; + break; + case QED_LL2_TYPE_ROCE: + p_ramrod->conn_type = PROTOCOLID_ROCE; + break; + case QED_LL2_TYPE_IWARP: + p_ramrod->conn_type = PROTOCOLID_IWARP; + break; + case QED_LL2_TYPE_OOO: + if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) + p_ramrod->conn_type = PROTOCOLID_ISCSI; + else + p_ramrod->conn_type = PROTOCOLID_IWARP; + break; + default: + p_ramrod->conn_type = 
PROTOCOLID_ETH; + DP_NOTICE(p_hwfn, "Unknown connection type: %d\n", conn_type); + } + + p_ramrod->gsi_offload_flag = p_ll2_conn->input.gsi_enable; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_ll2_rx_queue_stop(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + struct core_rx_stop_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_ll2_conn->cid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + CORE_RAMROD_RX_QUEUE_STOP, + PROTOCOLID_CORE, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.core_rx_queue_stop; + + p_ramrod->complete_event_flg = 1; + p_ramrod->queue_id = p_ll2_conn->queue_id; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_ll2_tx_queue_stop(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = p_ll2_conn->cid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + CORE_RAMROD_TX_QUEUE_STOP, + PROTOCOLID_CORE, &init_data); + if (rc) + return rc; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int +qed_ll2_acquire_connection_rx(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_info) +{ + struct qed_ll2_rx_packet *p_descq; + u32 capacity; + int rc = 0; + + if (!p_ll2_info->input.rx_num_desc) + goto out; + + rc = qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_NEXT_PTR, + QED_CHAIN_CNT_TYPE_U16, + p_ll2_info->input.rx_num_desc, + sizeof(struct core_rx_bd), + &p_ll2_info->rx_queue.rxq_chain, NULL); + if (rc) { + DP_NOTICE(p_hwfn, "Failed to allocate ll2 rxq chain\n"); + goto out; + } + + capacity = qed_chain_get_capacity(&p_ll2_info->rx_queue.rxq_chain); + p_descq = kcalloc(capacity, sizeof(struct qed_ll2_rx_packet), + GFP_KERNEL); + if (!p_descq) { + rc = -ENOMEM; + DP_NOTICE(p_hwfn, "Failed to allocate ll2 Rx desc\n"); + goto out; + } + p_ll2_info->rx_queue.descq_array = p_descq; + + rc = qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + p_ll2_info->input.rx_num_desc, + sizeof(struct core_rx_fast_path_cqe), + &p_ll2_info->rx_queue.rcq_chain, NULL); + if (rc) { + DP_NOTICE(p_hwfn, "Failed to allocate ll2 rcq chain\n"); + goto out; + } + + DP_VERBOSE(p_hwfn, QED_MSG_LL2, + "Allocated LL2 Rxq [Type %08x] with 0x%08x buffers\n", + p_ll2_info->input.conn_type, p_ll2_info->input.rx_num_desc); + +out: + return rc; +} + +static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_info) +{ + struct qed_ll2_tx_packet *p_descq; + u32 desc_size; + u32 capacity; + int rc = 0; + + if (!p_ll2_info->input.tx_num_desc) + goto out; + + rc = qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + p_ll2_info->input.tx_num_desc, + sizeof(struct core_tx_bd), + &p_ll2_info->tx_queue.txq_chain, NULL); + if (rc) + goto out; + + capacity = qed_chain_get_capacity(&p_ll2_info->tx_queue.txq_chain); + /* First element is part of the packet, rest are flexibly added */ + desc_size = (sizeof(*p_descq) 
+ + (p_ll2_info->input.tx_max_bds_per_packet - 1) * + sizeof(p_descq->bds_set)); + + p_descq = kcalloc(capacity, desc_size, GFP_KERNEL); + if (!p_descq) { + rc = -ENOMEM; + goto out; + } + p_ll2_info->tx_queue.descq_mem = p_descq; + + DP_VERBOSE(p_hwfn, QED_MSG_LL2, + "Allocated LL2 Txq [Type %08x] with 0x%08x buffers\n", + p_ll2_info->input.conn_type, p_ll2_info->input.tx_num_desc); + +out: + if (rc) + DP_NOTICE(p_hwfn, + "Can't allocate memory for Tx LL2 with 0x%08x buffers\n", + p_ll2_info->input.tx_num_desc); + return rc; +} + +static int +qed_ll2_acquire_connection_ooo(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_info, u16 mtu) +{ + struct qed_ooo_buffer *p_buf = NULL; + void *p_virt; + u16 buf_idx; + int rc = 0; + + if (p_ll2_info->input.conn_type != QED_LL2_TYPE_OOO) + return rc; + + /* Correct number of requested OOO buffers if needed */ + if (!p_ll2_info->input.rx_num_ooo_buffers) { + u16 num_desc = p_ll2_info->input.rx_num_desc; + + if (!num_desc) + return -EINVAL; + p_ll2_info->input.rx_num_ooo_buffers = num_desc * 2; + } + + for (buf_idx = 0; buf_idx < p_ll2_info->input.rx_num_ooo_buffers; + buf_idx++) { + p_buf = kzalloc(sizeof(*p_buf), GFP_KERNEL); + if (!p_buf) { + rc = -ENOMEM; + goto out; + } + + p_buf->rx_buffer_size = mtu + 26 + ETH_CACHE_LINE_SIZE; + p_buf->rx_buffer_size = (p_buf->rx_buffer_size + + ETH_CACHE_LINE_SIZE - 1) & + ~(ETH_CACHE_LINE_SIZE - 1); + p_virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + p_buf->rx_buffer_size, + &p_buf->rx_buffer_phys_addr, + GFP_KERNEL); + if (!p_virt) { + kfree(p_buf); + rc = -ENOMEM; + goto out; + } + + p_buf->rx_buffer_virt_addr = p_virt; + qed_ooo_put_free_buffer(p_hwfn, p_hwfn->p_ooo_info, p_buf); + } + + DP_VERBOSE(p_hwfn, QED_MSG_LL2, + "Allocated [%04x] LL2 OOO buffers [each of size 0x%08x]\n", + p_ll2_info->input.rx_num_ooo_buffers, p_buf->rx_buffer_size); + +out: + return rc; +} + +static int +qed_ll2_set_cbs(struct qed_ll2_info *p_ll2_info, const struct qed_ll2_cbs *cbs) +{ + if (!cbs || (!cbs->rx_comp_cb || + !cbs->rx_release_cb || + !cbs->tx_comp_cb || !cbs->tx_release_cb || !cbs->cookie)) + return -EINVAL; + + p_ll2_info->cbs.rx_comp_cb = cbs->rx_comp_cb; + p_ll2_info->cbs.rx_release_cb = cbs->rx_release_cb; + p_ll2_info->cbs.tx_comp_cb = cbs->tx_comp_cb; + p_ll2_info->cbs.tx_release_cb = cbs->tx_release_cb; + p_ll2_info->cbs.slowpath_cb = cbs->slowpath_cb; + p_ll2_info->cbs.cookie = cbs->cookie; + + return 0; +} + +static enum core_error_handle +qed_ll2_get_error_choice(enum qed_ll2_error_handle err) +{ + switch (err) { + case QED_LL2_DROP_PACKET: + return LL2_DROP_PACKET; + case QED_LL2_DO_NOTHING: + return LL2_DO_NOTHING; + case QED_LL2_ASSERT: + return LL2_ASSERT; + default: + return LL2_DO_NOTHING; + } +} + +int qed_ll2_acquire_connection(void *cxt, struct qed_ll2_acquire_data *data) +{ + struct qed_hwfn *p_hwfn = cxt; + qed_int_comp_cb_t comp_rx_cb, comp_tx_cb; + struct qed_ll2_info *p_ll2_info = NULL; + u8 i, *p_tx_max; + int rc; + + if (!data->p_connection_handle || !p_hwfn->p_ll2_info) + return -EINVAL; + + /* Find a free connection to be used */ + for (i = 0; (i < QED_MAX_NUM_OF_LL2_CONNECTIONS); i++) { + mutex_lock(&p_hwfn->p_ll2_info[i].mutex); + if (p_hwfn->p_ll2_info[i].b_active) { + mutex_unlock(&p_hwfn->p_ll2_info[i].mutex); + continue; + } + + p_hwfn->p_ll2_info[i].b_active = true; + p_ll2_info = &p_hwfn->p_ll2_info[i]; + mutex_unlock(&p_hwfn->p_ll2_info[i].mutex); + break; + } + if (!p_ll2_info) + return -EBUSY; + + memcpy(&p_ll2_info->input, &data->input, sizeof(p_ll2_info->input)); + 
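+	/* Translate the requested Tx destination into its CORE_TX_DEST_* value;
+	 * an unrecognized destination fails the acquire with -EINVAL.
+	 */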
+ switch (data->input.tx_dest) { + case QED_LL2_TX_DEST_NW: + p_ll2_info->tx_dest = CORE_TX_DEST_NW; + break; + case QED_LL2_TX_DEST_LB: + p_ll2_info->tx_dest = CORE_TX_DEST_LB; + break; + case QED_LL2_TX_DEST_DROP: + p_ll2_info->tx_dest = CORE_TX_DEST_DROP; + break; + default: + return -EINVAL; + } + + if (data->input.conn_type == QED_LL2_TYPE_OOO || + data->input.secondary_queue) + p_ll2_info->main_func_queue = false; + else + p_ll2_info->main_func_queue = true; + + /* Correct maximum number of Tx BDs */ + p_tx_max = &p_ll2_info->input.tx_max_bds_per_packet; + if (*p_tx_max == 0) + *p_tx_max = CORE_LL2_TX_MAX_BDS_PER_PACKET; + else + *p_tx_max = min_t(u8, *p_tx_max, + CORE_LL2_TX_MAX_BDS_PER_PACKET); + + rc = qed_ll2_set_cbs(p_ll2_info, data->cbs); + if (rc) { + DP_NOTICE(p_hwfn, "Invalid callback functions\n"); + goto q_allocate_fail; + } + + rc = qed_ll2_acquire_connection_rx(p_hwfn, p_ll2_info); + if (rc) + goto q_allocate_fail; + + rc = qed_ll2_acquire_connection_tx(p_hwfn, p_ll2_info); + if (rc) + goto q_allocate_fail; + + rc = qed_ll2_acquire_connection_ooo(p_hwfn, p_ll2_info, + data->input.mtu); + if (rc) + goto q_allocate_fail; + + /* Register callbacks for the Rx/Tx queues */ + if (data->input.conn_type == QED_LL2_TYPE_OOO) { + comp_rx_cb = qed_ll2_lb_rxq_completion; + comp_tx_cb = qed_ll2_lb_txq_completion; + } else { + comp_rx_cb = qed_ll2_rxq_completion; + comp_tx_cb = qed_ll2_txq_completion; + } + + if (data->input.rx_num_desc) { + qed_int_register_cb(p_hwfn, comp_rx_cb, + &p_hwfn->p_ll2_info[i], + &p_ll2_info->rx_queue.rx_sb_index, + &p_ll2_info->rx_queue.p_fw_cons); + p_ll2_info->rx_queue.b_cb_registred = true; + } + + if (data->input.tx_num_desc) { + qed_int_register_cb(p_hwfn, + comp_tx_cb, + &p_hwfn->p_ll2_info[i], + &p_ll2_info->tx_queue.tx_sb_index, + &p_ll2_info->tx_queue.p_fw_cons); + p_ll2_info->tx_queue.b_cb_registred = true; + } + + *data->p_connection_handle = i; + return rc; + +q_allocate_fail: + qed_ll2_release_connection(p_hwfn, i); + return -ENOMEM; +} + +static int qed_ll2_establish_connection_rx(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + enum qed_ll2_error_handle error_input; + enum core_error_handle error_mode; + u8 action_on_error = 0; + + if (!QED_LL2_RX_REGISTERED(p_ll2_conn)) + return 0; + + DIRECT_REG_WR(p_ll2_conn->rx_queue.set_prod_addr, 0x0); + error_input = p_ll2_conn->input.ai_err_packet_too_big; + error_mode = qed_ll2_get_error_choice(error_input); + SET_FIELD(action_on_error, + CORE_RX_ACTION_ON_ERROR_PACKET_TOO_BIG, error_mode); + error_input = p_ll2_conn->input.ai_err_no_buf; + error_mode = qed_ll2_get_error_choice(error_input); + SET_FIELD(action_on_error, CORE_RX_ACTION_ON_ERROR_NO_BUFF, error_mode); + + return qed_sp_ll2_rx_queue_start(p_hwfn, p_ll2_conn, action_on_error); +} + +static void +qed_ll2_establish_connection_ooo(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + if (p_ll2_conn->input.conn_type != QED_LL2_TYPE_OOO) + return; + + qed_ooo_release_all_isles(p_hwfn, p_hwfn->p_ooo_info); + qed_ooo_submit_rx_buffers(p_hwfn, p_ll2_conn); +} + +int qed_ll2_establish_connection(void *cxt, u8 connection_handle) +{ + struct qed_hwfn *p_hwfn = cxt; + struct qed_ll2_info *p_ll2_conn; + struct qed_ll2_tx_packet *p_pkt; + struct qed_ll2_rx_queue *p_rx; + struct qed_ll2_tx_queue *p_tx; + struct qed_ptt *p_ptt; + int rc = -EINVAL; + u32 i, capacity; + u32 desc_size; + u8 qid; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EAGAIN; + + p_ll2_conn = qed_ll2_handle_sanity_lock(p_hwfn, 
connection_handle); + if (!p_ll2_conn) { + rc = -EINVAL; + goto out; + } + + p_rx = &p_ll2_conn->rx_queue; + p_tx = &p_ll2_conn->tx_queue; + + qed_chain_reset(&p_rx->rxq_chain); + qed_chain_reset(&p_rx->rcq_chain); + INIT_LIST_HEAD(&p_rx->active_descq); + INIT_LIST_HEAD(&p_rx->free_descq); + INIT_LIST_HEAD(&p_rx->posting_descq); + spin_lock_init(&p_rx->lock); + capacity = qed_chain_get_capacity(&p_rx->rxq_chain); + for (i = 0; i < capacity; i++) + list_add_tail(&p_rx->descq_array[i].list_entry, + &p_rx->free_descq); + *p_rx->p_fw_cons = 0; + + qed_chain_reset(&p_tx->txq_chain); + INIT_LIST_HEAD(&p_tx->active_descq); + INIT_LIST_HEAD(&p_tx->free_descq); + INIT_LIST_HEAD(&p_tx->sending_descq); + spin_lock_init(&p_tx->lock); + capacity = qed_chain_get_capacity(&p_tx->txq_chain); + /* First element is part of the packet, rest are flexibly added */ + desc_size = (sizeof(*p_pkt) + + (p_ll2_conn->input.tx_max_bds_per_packet - 1) * + sizeof(p_pkt->bds_set)); + + for (i = 0; i < capacity; i++) { + p_pkt = p_tx->descq_mem + desc_size * i; + list_add_tail(&p_pkt->list_entry, &p_tx->free_descq); + } + p_tx->cur_completing_bd_idx = 0; + p_tx->bds_idx = 0; + p_tx->b_completing_packet = false; + p_tx->cur_send_packet = NULL; + p_tx->cur_send_frag_num = 0; + p_tx->cur_completing_frag_num = 0; + *p_tx->p_fw_cons = 0; + + rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_CORE, &p_ll2_conn->cid); + if (rc) + goto out; + + qid = p_hwfn->hw_info.resc_start[QED_LL2_QUEUE] + connection_handle; + p_ll2_conn->queue_id = qid; + p_ll2_conn->tx_stats_id = qid; + p_rx->set_prod_addr = (u8 __iomem *)p_hwfn->regview + + GTT_BAR0_MAP_REG_TSDM_RAM + + TSTORM_LL2_RX_PRODS_OFFSET(qid); + p_tx->doorbell_addr = (u8 __iomem *)p_hwfn->doorbells + + qed_db_addr(p_ll2_conn->cid, + DQ_DEMS_LEGACY); + + rc = qed_ll2_establish_connection_rx(p_hwfn, p_ll2_conn); + if (rc) + goto out; + + rc = qed_sp_ll2_tx_queue_start(p_hwfn, p_ll2_conn); + if (rc) + goto out; + + if (!QED_IS_RDMA_PERSONALITY(p_hwfn)) + qed_wr(p_hwfn, p_ptt, PRS_REG_USE_LIGHT_L2, 1); + + qed_ll2_establish_connection_ooo(p_hwfn, p_ll2_conn); + + if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_FCOE) { + qed_llh_add_protocol_filter(p_hwfn, p_ptt, + 0x8906, 0, + QED_LLH_FILTER_ETHERTYPE); + qed_llh_add_protocol_filter(p_hwfn, p_ptt, + 0x8914, 0, + QED_LLH_FILTER_ETHERTYPE); + } + +out: + qed_ptt_release(p_hwfn, p_ptt); + return rc; +} + +static void qed_ll2_post_rx_buffer_notify_fw(struct qed_hwfn *p_hwfn, + struct qed_ll2_rx_queue *p_rx, + struct qed_ll2_rx_packet *p_curp) +{ + struct qed_ll2_rx_packet *p_posting_packet = NULL; + struct core_ll2_rx_prod rx_prod = { 0, 0, 0 }; + bool b_notify_fw = false; + u16 bd_prod, cq_prod; + + /* This handles the flushing of already posted buffers */ + while (!list_empty(&p_rx->posting_descq)) { + p_posting_packet = list_first_entry(&p_rx->posting_descq, + struct qed_ll2_rx_packet, + list_entry); + list_move_tail(&p_posting_packet->list_entry, + &p_rx->active_descq); + b_notify_fw = true; + } + + /* This handles the supplied packet [if there is one] */ + if (p_curp) { + list_add_tail(&p_curp->list_entry, &p_rx->active_descq); + b_notify_fw = true; + } + + if (!b_notify_fw) + return; + + bd_prod = qed_chain_get_prod_idx(&p_rx->rxq_chain); + cq_prod = qed_chain_get_prod_idx(&p_rx->rcq_chain); + rx_prod.bd_prod = cpu_to_le16(bd_prod); + rx_prod.cqe_prod = cpu_to_le16(cq_prod); + DIRECT_REG_WR(p_rx->set_prod_addr, *((u32 *)&rx_prod)); +} + +int qed_ll2_post_rx_buffer(void *cxt, + u8 connection_handle, + dma_addr_t addr, + u16 buf_len, void 
*cookie, u8 notify_fw) +{ + struct qed_hwfn *p_hwfn = cxt; + struct core_rx_bd_with_buff_len *p_curb = NULL; + struct qed_ll2_rx_packet *p_curp = NULL; + struct qed_ll2_info *p_ll2_conn; + struct qed_ll2_rx_queue *p_rx; + unsigned long flags; + void *p_data; + int rc = 0; + + p_ll2_conn = qed_ll2_handle_sanity(p_hwfn, connection_handle); + if (!p_ll2_conn) + return -EINVAL; + p_rx = &p_ll2_conn->rx_queue; + + spin_lock_irqsave(&p_rx->lock, flags); + if (!list_empty(&p_rx->free_descq)) + p_curp = list_first_entry(&p_rx->free_descq, + struct qed_ll2_rx_packet, list_entry); + if (p_curp) { + if (qed_chain_get_elem_left(&p_rx->rxq_chain) && + qed_chain_get_elem_left(&p_rx->rcq_chain)) { + p_data = qed_chain_produce(&p_rx->rxq_chain); + p_curb = (struct core_rx_bd_with_buff_len *)p_data; + qed_chain_produce(&p_rx->rcq_chain); + } + } + + /* If we're lacking entires, let's try to flush buffers to FW */ + if (!p_curp || !p_curb) { + rc = -EBUSY; + p_curp = NULL; + goto out_notify; + } + + /* We have an Rx packet we can fill */ + DMA_REGPAIR_LE(p_curb->addr, addr); + p_curb->buff_length = cpu_to_le16(buf_len); + p_curp->rx_buf_addr = addr; + p_curp->cookie = cookie; + p_curp->rxq_bd = p_curb; + p_curp->buf_length = buf_len; + list_del(&p_curp->list_entry); + + /* Check if we only want to enqueue this packet without informing FW */ + if (!notify_fw) { + list_add_tail(&p_curp->list_entry, &p_rx->posting_descq); + goto out; + } + +out_notify: + qed_ll2_post_rx_buffer_notify_fw(p_hwfn, p_rx, p_curp); +out: + spin_unlock_irqrestore(&p_rx->lock, flags); + return rc; +} + +static void qed_ll2_prepare_tx_packet_set(struct qed_hwfn *p_hwfn, + struct qed_ll2_tx_queue *p_tx, + struct qed_ll2_tx_packet *p_curp, + struct qed_ll2_tx_pkt_info *pkt, + u8 notify_fw) +{ + list_del(&p_curp->list_entry); + p_curp->cookie = pkt->cookie; + p_curp->bd_used = pkt->num_of_bds; + p_curp->notify_fw = notify_fw; + p_tx->cur_send_packet = p_curp; + p_tx->cur_send_frag_num = 0; + + p_curp->bds_set[p_tx->cur_send_frag_num].tx_frag = pkt->first_frag; + p_curp->bds_set[p_tx->cur_send_frag_num].frag_len = pkt->first_frag_len; + p_tx->cur_send_frag_num++; +} + +static void +qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2, + struct qed_ll2_tx_packet *p_curp, + struct qed_ll2_tx_pkt_info *pkt) +{ + struct qed_chain *p_tx_chain = &p_ll2->tx_queue.txq_chain; + u16 prod_idx = qed_chain_get_prod_idx(p_tx_chain); + struct core_tx_bd *start_bd = NULL; + enum core_roce_flavor_type roce_flavor; + enum core_tx_dest tx_dest; + u16 bd_data = 0, frag_idx; + + roce_flavor = (pkt->qed_roce_flavor == QED_LL2_ROCE) ? 
CORE_ROCE + : CORE_RROCE; + + switch (pkt->tx_dest) { + case QED_LL2_TX_DEST_NW: + tx_dest = CORE_TX_DEST_NW; + break; + case QED_LL2_TX_DEST_LB: + tx_dest = CORE_TX_DEST_LB; + break; + case QED_LL2_TX_DEST_DROP: + tx_dest = CORE_TX_DEST_DROP; + break; + default: + tx_dest = CORE_TX_DEST_LB; + break; + } + + start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain); + if (QED_IS_IWARP_PERSONALITY(p_hwfn) && + p_ll2->input.conn_type == QED_LL2_TYPE_OOO) + start_bd->nw_vlan_or_lb_echo = + cpu_to_le16(IWARP_LL2_IN_ORDER_TX_QUEUE); + else + start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan); + SET_FIELD(start_bd->bitfield1, CORE_TX_BD_L4_HDR_OFFSET_W, + cpu_to_le16(pkt->l4_hdr_offset_w)); + SET_FIELD(start_bd->bitfield1, CORE_TX_BD_TX_DST, tx_dest); + bd_data |= pkt->bd_flags; + SET_FIELD(bd_data, CORE_TX_BD_DATA_START_BD, 0x1); + SET_FIELD(bd_data, CORE_TX_BD_DATA_NBDS, pkt->num_of_bds); + SET_FIELD(bd_data, CORE_TX_BD_DATA_ROCE_FLAV, roce_flavor); + SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_CSUM, !!(pkt->enable_ip_cksum)); + SET_FIELD(bd_data, CORE_TX_BD_DATA_L4_CSUM, !!(pkt->enable_l4_cksum)); + SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_LEN, !!(pkt->calc_ip_len)); + start_bd->bd_data.as_bitfield = cpu_to_le16(bd_data); + DMA_REGPAIR_LE(start_bd->addr, pkt->first_frag); + start_bd->nbytes = cpu_to_le16(pkt->first_frag_len); + + DP_VERBOSE(p_hwfn, + (NETIF_MSG_TX_QUEUED | QED_MSG_LL2), + "LL2 [q 0x%02x cid 0x%08x type 0x%08x] Tx Producer at [0x%04x] - set with a %04x bytes %02x BDs buffer at %08x:%08x\n", + p_ll2->queue_id, + p_ll2->cid, + p_ll2->input.conn_type, + prod_idx, + pkt->first_frag_len, + pkt->num_of_bds, + le32_to_cpu(start_bd->addr.hi), + le32_to_cpu(start_bd->addr.lo)); + + if (p_ll2->tx_queue.cur_send_frag_num == pkt->num_of_bds) + return; + + /* Need to provide the packet with additional BDs for frags */ + for (frag_idx = p_ll2->tx_queue.cur_send_frag_num; + frag_idx < pkt->num_of_bds; frag_idx++) { + struct core_tx_bd **p_bd = &p_curp->bds_set[frag_idx].txq_bd; + + *p_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain); + (*p_bd)->bd_data.as_bitfield = 0; + (*p_bd)->bitfield1 = 0; + p_curp->bds_set[frag_idx].tx_frag = 0; + p_curp->bds_set[frag_idx].frag_len = 0; + } +} + +/* This should be called while the Txq spinlock is being held */ +static void qed_ll2_tx_packet_notify(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + bool b_notify = p_ll2_conn->tx_queue.cur_send_packet->notify_fw; + struct qed_ll2_tx_queue *p_tx = &p_ll2_conn->tx_queue; + struct qed_ll2_tx_packet *p_pkt = NULL; + struct core_db_data db_msg = { 0, 0, 0 }; + u16 bd_prod; + + /* If there are missing BDs, don't do anything now */ + if (p_ll2_conn->tx_queue.cur_send_frag_num != + p_ll2_conn->tx_queue.cur_send_packet->bd_used) + return; + + /* Push the current packet to the list and clean after it */ + list_add_tail(&p_ll2_conn->tx_queue.cur_send_packet->list_entry, + &p_ll2_conn->tx_queue.sending_descq); + p_ll2_conn->tx_queue.cur_send_packet = NULL; + p_ll2_conn->tx_queue.cur_send_frag_num = 0; + + /* Notify FW of packet only if requested to */ + if (!b_notify) + return; + + bd_prod = qed_chain_get_prod_idx(&p_ll2_conn->tx_queue.txq_chain); + + while (!list_empty(&p_tx->sending_descq)) { + p_pkt = list_first_entry(&p_tx->sending_descq, + struct qed_ll2_tx_packet, list_entry); + if (!p_pkt) + break; + + list_move_tail(&p_pkt->list_entry, &p_tx->active_descq); + } + + SET_FIELD(db_msg.params, CORE_DB_DATA_DEST, DB_DEST_XCM); + SET_FIELD(db_msg.params, CORE_DB_DATA_AGG_CMD, DB_AGG_CMD_SET); + 
SET_FIELD(db_msg.params, CORE_DB_DATA_AGG_VAL_SEL, + DQ_XCM_CORE_TX_BD_PROD_CMD); + db_msg.agg_flags = DQ_XCM_CORE_DQ_CF_CMD; + db_msg.spq_prod = cpu_to_le16(bd_prod); + + /* Make sure the BDs data is updated before ringing the doorbell */ + wmb(); + + DIRECT_REG_WR(p_tx->doorbell_addr, *((u32 *)&db_msg)); + + DP_VERBOSE(p_hwfn, + (NETIF_MSG_TX_QUEUED | QED_MSG_LL2), + "LL2 [q 0x%02x cid 0x%08x type 0x%08x] Doorbelled [producer 0x%04x]\n", + p_ll2_conn->queue_id, + p_ll2_conn->cid, + p_ll2_conn->input.conn_type, db_msg.spq_prod); +} + +int qed_ll2_prepare_tx_packet(void *cxt, + u8 connection_handle, + struct qed_ll2_tx_pkt_info *pkt, + bool notify_fw) +{ + struct qed_hwfn *p_hwfn = cxt; + struct qed_ll2_tx_packet *p_curp = NULL; + struct qed_ll2_info *p_ll2_conn = NULL; + struct qed_ll2_tx_queue *p_tx; + struct qed_chain *p_tx_chain; + unsigned long flags; + int rc = 0; + + p_ll2_conn = qed_ll2_handle_sanity(p_hwfn, connection_handle); + if (!p_ll2_conn) + return -EINVAL; + p_tx = &p_ll2_conn->tx_queue; + p_tx_chain = &p_tx->txq_chain; + + if (pkt->num_of_bds > p_ll2_conn->input.tx_max_bds_per_packet) + return -EIO; + + spin_lock_irqsave(&p_tx->lock, flags); + if (p_tx->cur_send_packet) { + rc = -EEXIST; + goto out; + } + + /* Get entry, but only if we have tx elements for it */ + if (!list_empty(&p_tx->free_descq)) + p_curp = list_first_entry(&p_tx->free_descq, + struct qed_ll2_tx_packet, list_entry); + if (p_curp && qed_chain_get_elem_left(p_tx_chain) < pkt->num_of_bds) + p_curp = NULL; + + if (!p_curp) { + rc = -EBUSY; + goto out; + } + + /* Prepare packet and BD, and perhaps send a doorbell to FW */ + qed_ll2_prepare_tx_packet_set(p_hwfn, p_tx, p_curp, pkt, notify_fw); + + qed_ll2_prepare_tx_packet_set_bd(p_hwfn, p_ll2_conn, p_curp, pkt); + + qed_ll2_tx_packet_notify(p_hwfn, p_ll2_conn); + +out: + spin_unlock_irqrestore(&p_tx->lock, flags); + return rc; +} + +int qed_ll2_set_fragment_of_tx_packet(void *cxt, + u8 connection_handle, + dma_addr_t addr, u16 nbytes) +{ + struct qed_ll2_tx_packet *p_cur_send_packet = NULL; + struct qed_hwfn *p_hwfn = cxt; + struct qed_ll2_info *p_ll2_conn = NULL; + u16 cur_send_frag_num = 0; + struct core_tx_bd *p_bd; + unsigned long flags; + + p_ll2_conn = qed_ll2_handle_sanity(p_hwfn, connection_handle); + if (!p_ll2_conn) + return -EINVAL; + + if (!p_ll2_conn->tx_queue.cur_send_packet) + return -EINVAL; + + p_cur_send_packet = p_ll2_conn->tx_queue.cur_send_packet; + cur_send_frag_num = p_ll2_conn->tx_queue.cur_send_frag_num; + + if (cur_send_frag_num >= p_cur_send_packet->bd_used) + return -EINVAL; + + /* Fill the BD information, and possibly notify FW */ + p_bd = p_cur_send_packet->bds_set[cur_send_frag_num].txq_bd; + DMA_REGPAIR_LE(p_bd->addr, addr); + p_bd->nbytes = cpu_to_le16(nbytes); + p_cur_send_packet->bds_set[cur_send_frag_num].tx_frag = addr; + p_cur_send_packet->bds_set[cur_send_frag_num].frag_len = nbytes; + + p_ll2_conn->tx_queue.cur_send_frag_num++; + + spin_lock_irqsave(&p_ll2_conn->tx_queue.lock, flags); + qed_ll2_tx_packet_notify(p_hwfn, p_ll2_conn); + spin_unlock_irqrestore(&p_ll2_conn->tx_queue.lock, flags); + + return 0; +} + +int qed_ll2_terminate_connection(void *cxt, u8 connection_handle) +{ + struct qed_hwfn *p_hwfn = cxt; + struct qed_ll2_info *p_ll2_conn = NULL; + int rc = -EINVAL; + struct qed_ptt *p_ptt; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EAGAIN; + + p_ll2_conn = qed_ll2_handle_sanity_lock(p_hwfn, connection_handle); + if (!p_ll2_conn) { + rc = -EINVAL; + goto out; + } + + /* Stop Tx & Rx of 
connection, if needed */ + if (QED_LL2_TX_REGISTERED(p_ll2_conn)) { + p_ll2_conn->tx_queue.b_cb_registred = false; + smp_wmb(); /* Make sure this is seen by ll2_lb_rxq_completion */ + rc = qed_sp_ll2_tx_queue_stop(p_hwfn, p_ll2_conn); + if (rc) + goto out; + + qed_ll2_txq_flush(p_hwfn, connection_handle); + qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index); + } + + if (QED_LL2_RX_REGISTERED(p_ll2_conn)) { + p_ll2_conn->rx_queue.b_cb_registred = false; + smp_wmb(); /* Make sure this is seen by ll2_lb_rxq_completion */ + rc = qed_sp_ll2_rx_queue_stop(p_hwfn, p_ll2_conn); + if (rc) + goto out; + + qed_ll2_rxq_flush(p_hwfn, connection_handle); + qed_int_unregister_cb(p_hwfn, p_ll2_conn->rx_queue.rx_sb_index); + } + + if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO) + qed_ooo_release_all_isles(p_hwfn, p_hwfn->p_ooo_info); + + if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_FCOE) { + qed_llh_remove_protocol_filter(p_hwfn, p_ptt, + 0x8906, 0, + QED_LLH_FILTER_ETHERTYPE); + qed_llh_remove_protocol_filter(p_hwfn, p_ptt, + 0x8914, 0, + QED_LLH_FILTER_ETHERTYPE); + } + +out: + qed_ptt_release(p_hwfn, p_ptt); + return rc; +} + +static void qed_ll2_release_connection_ooo(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn) +{ + struct qed_ooo_buffer *p_buffer; + + if (p_ll2_conn->input.conn_type != QED_LL2_TYPE_OOO) + return; + + qed_ooo_release_all_isles(p_hwfn, p_hwfn->p_ooo_info); + while ((p_buffer = qed_ooo_get_free_buffer(p_hwfn, + p_hwfn->p_ooo_info))) { + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + p_buffer->rx_buffer_size, + p_buffer->rx_buffer_virt_addr, + p_buffer->rx_buffer_phys_addr); + kfree(p_buffer); + } +} + +void qed_ll2_release_connection(void *cxt, u8 connection_handle) +{ + struct qed_hwfn *p_hwfn = cxt; + struct qed_ll2_info *p_ll2_conn = NULL; + + p_ll2_conn = qed_ll2_handle_sanity(p_hwfn, connection_handle); + if (!p_ll2_conn) + return; + + kfree(p_ll2_conn->tx_queue.descq_mem); + qed_chain_free(p_hwfn->cdev, &p_ll2_conn->tx_queue.txq_chain); + + kfree(p_ll2_conn->rx_queue.descq_array); + qed_chain_free(p_hwfn->cdev, &p_ll2_conn->rx_queue.rxq_chain); + qed_chain_free(p_hwfn->cdev, &p_ll2_conn->rx_queue.rcq_chain); + + qed_cxt_release_cid(p_hwfn, p_ll2_conn->cid); + + qed_ll2_release_connection_ooo(p_hwfn, p_ll2_conn); + + mutex_lock(&p_ll2_conn->mutex); + p_ll2_conn->b_active = false; + mutex_unlock(&p_ll2_conn->mutex); +} + +int qed_ll2_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_ll2_info *p_ll2_connections; + u8 i; + + /* Allocate LL2's set struct */ + p_ll2_connections = kcalloc(QED_MAX_NUM_OF_LL2_CONNECTIONS, + sizeof(struct qed_ll2_info), GFP_KERNEL); + if (!p_ll2_connections) { + DP_NOTICE(p_hwfn, "Failed to allocate `struct qed_ll2'\n"); + return -ENOMEM; + } + + for (i = 0; i < QED_MAX_NUM_OF_LL2_CONNECTIONS; i++) + p_ll2_connections[i].my_id = i; + + p_hwfn->p_ll2_info = p_ll2_connections; + return 0; +} + +void qed_ll2_setup(struct qed_hwfn *p_hwfn) +{ + int i; + + for (i = 0; i < QED_MAX_NUM_OF_LL2_CONNECTIONS; i++) + mutex_init(&p_hwfn->p_ll2_info[i].mutex); +} + +void qed_ll2_free(struct qed_hwfn *p_hwfn) +{ + if (!p_hwfn->p_ll2_info) + return; + + kfree(p_hwfn->p_ll2_info); + p_hwfn->p_ll2_info = NULL; +} + +static void _qed_ll2_get_port_stats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_ll2_stats *p_stats) +{ + struct core_ll2_port_stats port_stats; + + memset(&port_stats, 0, sizeof(port_stats)); + qed_memcpy_from(p_hwfn, p_ptt, &port_stats, + BAR0_MAP_REG_TSDM_RAM + + TSTORM_LL2_PORT_STAT_OFFSET(MFW_PORT(p_hwfn)), + 
sizeof(port_stats)); + + p_stats->gsi_invalid_hdr = HILO_64_REGPAIR(port_stats.gsi_invalid_hdr); + p_stats->gsi_invalid_pkt_length = + HILO_64_REGPAIR(port_stats.gsi_invalid_pkt_length); + p_stats->gsi_unsupported_pkt_typ = + HILO_64_REGPAIR(port_stats.gsi_unsupported_pkt_typ); + p_stats->gsi_crcchksm_error = + HILO_64_REGPAIR(port_stats.gsi_crcchksm_error); +} + +static void _qed_ll2_get_tstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_ll2_info *p_ll2_conn, + struct qed_ll2_stats *p_stats) +{ + struct core_ll2_tstorm_per_queue_stat tstats; + u8 qid = p_ll2_conn->queue_id; + u32 tstats_addr; + + memset(&tstats, 0, sizeof(tstats)); + tstats_addr = BAR0_MAP_REG_TSDM_RAM + + CORE_LL2_TSTORM_PER_QUEUE_STAT_OFFSET(qid); + qed_memcpy_from(p_hwfn, p_ptt, &tstats, tstats_addr, sizeof(tstats)); + + p_stats->packet_too_big_discard = + HILO_64_REGPAIR(tstats.packet_too_big_discard); + p_stats->no_buff_discard = HILO_64_REGPAIR(tstats.no_buff_discard); +} + +static void _qed_ll2_get_ustats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_ll2_info *p_ll2_conn, + struct qed_ll2_stats *p_stats) +{ + struct core_ll2_ustorm_per_queue_stat ustats; + u8 qid = p_ll2_conn->queue_id; + u32 ustats_addr; + + memset(&ustats, 0, sizeof(ustats)); + ustats_addr = BAR0_MAP_REG_USDM_RAM + + CORE_LL2_USTORM_PER_QUEUE_STAT_OFFSET(qid); + qed_memcpy_from(p_hwfn, p_ptt, &ustats, ustats_addr, sizeof(ustats)); + + p_stats->rcv_ucast_bytes = HILO_64_REGPAIR(ustats.rcv_ucast_bytes); + p_stats->rcv_mcast_bytes = HILO_64_REGPAIR(ustats.rcv_mcast_bytes); + p_stats->rcv_bcast_bytes = HILO_64_REGPAIR(ustats.rcv_bcast_bytes); + p_stats->rcv_ucast_pkts = HILO_64_REGPAIR(ustats.rcv_ucast_pkts); + p_stats->rcv_mcast_pkts = HILO_64_REGPAIR(ustats.rcv_mcast_pkts); + p_stats->rcv_bcast_pkts = HILO_64_REGPAIR(ustats.rcv_bcast_pkts); +} + +static void _qed_ll2_get_pstats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_ll2_info *p_ll2_conn, + struct qed_ll2_stats *p_stats) +{ + struct core_ll2_pstorm_per_queue_stat pstats; + u8 stats_id = p_ll2_conn->tx_stats_id; + u32 pstats_addr; + + memset(&pstats, 0, sizeof(pstats)); + pstats_addr = BAR0_MAP_REG_PSDM_RAM + + CORE_LL2_PSTORM_PER_QUEUE_STAT_OFFSET(stats_id); + qed_memcpy_from(p_hwfn, p_ptt, &pstats, pstats_addr, sizeof(pstats)); + + p_stats->sent_ucast_bytes = HILO_64_REGPAIR(pstats.sent_ucast_bytes); + p_stats->sent_mcast_bytes = HILO_64_REGPAIR(pstats.sent_mcast_bytes); + p_stats->sent_bcast_bytes = HILO_64_REGPAIR(pstats.sent_bcast_bytes); + p_stats->sent_ucast_pkts = HILO_64_REGPAIR(pstats.sent_ucast_pkts); + p_stats->sent_mcast_pkts = HILO_64_REGPAIR(pstats.sent_mcast_pkts); + p_stats->sent_bcast_pkts = HILO_64_REGPAIR(pstats.sent_bcast_pkts); +} + +int qed_ll2_get_stats(void *cxt, + u8 connection_handle, struct qed_ll2_stats *p_stats) +{ + struct qed_hwfn *p_hwfn = cxt; + struct qed_ll2_info *p_ll2_conn = NULL; + struct qed_ptt *p_ptt; + + memset(p_stats, 0, sizeof(*p_stats)); + + if ((connection_handle >= QED_MAX_NUM_OF_LL2_CONNECTIONS) || + !p_hwfn->p_ll2_info) + return -EINVAL; + + p_ll2_conn = &p_hwfn->p_ll2_info[connection_handle]; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_ERR(p_hwfn, "Failed to acquire ptt\n"); + return -EINVAL; + } + + if (p_ll2_conn->input.gsi_enable) + _qed_ll2_get_port_stats(p_hwfn, p_ptt, p_stats); + _qed_ll2_get_tstats(p_hwfn, p_ptt, p_ll2_conn, p_stats); + _qed_ll2_get_ustats(p_hwfn, p_ptt, p_ll2_conn, p_stats); + if (p_ll2_conn->tx_stats_en) + _qed_ll2_get_pstats(p_hwfn, p_ptt, 
p_ll2_conn, p_stats); + + qed_ptt_release(p_hwfn, p_ptt); + return 0; +} + +static void qed_ll2b_release_rx_packet(void *cxt, + u8 connection_handle, + void *cookie, + dma_addr_t rx_buf_addr, + bool b_last_packet) +{ + struct qed_hwfn *p_hwfn = cxt; + + qed_ll2_dealloc_buffer(p_hwfn->cdev, cookie); +} + +static void qed_ll2_register_cb_ops(struct qed_dev *cdev, + const struct qed_ll2_cb_ops *ops, + void *cookie) +{ + cdev->ll2->cbs = ops; + cdev->ll2->cb_cookie = cookie; +} + +struct qed_ll2_cbs ll2_cbs = { + .rx_comp_cb = &qed_ll2b_complete_rx_packet, + .rx_release_cb = &qed_ll2b_release_rx_packet, + .tx_comp_cb = &qed_ll2b_complete_tx_packet, + .tx_release_cb = &qed_ll2b_complete_tx_packet, +}; + +static void qed_ll2_set_conn_data(struct qed_dev *cdev, + struct qed_ll2_acquire_data *data, + struct qed_ll2_params *params, + enum qed_ll2_conn_type conn_type, + u8 *handle, bool lb) +{ + memset(data, 0, sizeof(*data)); + + data->input.conn_type = conn_type; + data->input.mtu = params->mtu; + data->input.rx_num_desc = QED_LL2_RX_SIZE; + data->input.rx_drop_ttl0_flg = params->drop_ttl0_packets; + data->input.rx_vlan_removal_en = params->rx_vlan_stripping; + data->input.tx_num_desc = QED_LL2_TX_SIZE; + data->p_connection_handle = handle; + data->cbs = &ll2_cbs; + ll2_cbs.cookie = QED_LEADING_HWFN(cdev); + + if (lb) { + data->input.tx_tc = PKT_LB_TC; + data->input.tx_dest = QED_LL2_TX_DEST_LB; + } else { + data->input.tx_tc = 0; + data->input.tx_dest = QED_LL2_TX_DEST_NW; + } +} + +static int qed_ll2_start_ooo(struct qed_dev *cdev, + struct qed_ll2_params *params) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + u8 *handle = &hwfn->pf_params.iscsi_pf_params.ll2_ooo_queue_id; + struct qed_ll2_acquire_data data; + int rc; + + qed_ll2_set_conn_data(cdev, &data, params, + QED_LL2_TYPE_OOO, handle, true); + + rc = qed_ll2_acquire_connection(hwfn, &data); + if (rc) { + DP_INFO(cdev, "Failed to acquire LL2 OOO connection\n"); + goto out; + } + + rc = qed_ll2_establish_connection(hwfn, *handle); + if (rc) { + DP_INFO(cdev, "Failed to establist LL2 OOO connection\n"); + goto fail; + } + + return 0; + +fail: + qed_ll2_release_connection(hwfn, *handle); +out: + *handle = QED_LL2_UNUSED_HANDLE; + return rc; +} + +static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params) +{ + struct qed_ll2_buffer *buffer, *tmp_buffer; + enum qed_ll2_conn_type conn_type; + struct qed_ll2_acquire_data data; + struct qed_ptt *p_ptt; + int rc, i; + + + /* Initialize LL2 locks & lists */ + INIT_LIST_HEAD(&cdev->ll2->list); + spin_lock_init(&cdev->ll2->lock); + cdev->ll2->rx_size = NET_SKB_PAD + ETH_HLEN + + L1_CACHE_BYTES + params->mtu; + + /*Allocate memory for LL2 */ + DP_INFO(cdev, "Allocating LL2 buffers of size %08x bytes\n", + cdev->ll2->rx_size); + for (i = 0; i < QED_LL2_RX_SIZE; i++) { + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); + if (!buffer) { + DP_INFO(cdev, "Failed to allocate LL2 buffers\n"); + goto fail; + } + + rc = qed_ll2_alloc_buffer(cdev, (u8 **)&buffer->data, + &buffer->phys_addr); + if (rc) { + kfree(buffer); + goto fail; + } + + list_add_tail(&buffer->list, &cdev->ll2->list); + } + + switch (QED_LEADING_HWFN(cdev)->hw_info.personality) { + case QED_PCI_FCOE: + conn_type = QED_LL2_TYPE_FCOE; + break; + case QED_PCI_ISCSI: + conn_type = QED_LL2_TYPE_ISCSI; + break; + case QED_PCI_ETH_ROCE: + conn_type = QED_LL2_TYPE_ROCE; + break; + default: + conn_type = QED_LL2_TYPE_TEST; + } + + qed_ll2_set_conn_data(cdev, &data, params, conn_type, + &cdev->ll2->handle, false); + + rc = 
qed_ll2_acquire_connection(QED_LEADING_HWFN(cdev), &data); + if (rc) { + DP_INFO(cdev, "Failed to acquire LL2 connection\n"); + goto fail; + } + + rc = qed_ll2_establish_connection(QED_LEADING_HWFN(cdev), + cdev->ll2->handle); + if (rc) { + DP_INFO(cdev, "Failed to establish LL2 connection\n"); + goto release_fail; + } + + /* Post all Rx buffers to FW */ + spin_lock_bh(&cdev->ll2->lock); + list_for_each_entry_safe(buffer, tmp_buffer, &cdev->ll2->list, list) { + rc = qed_ll2_post_rx_buffer(QED_LEADING_HWFN(cdev), + cdev->ll2->handle, + buffer->phys_addr, 0, buffer, 1); + if (rc) { + DP_INFO(cdev, + "Failed to post an Rx buffer; Deleting it\n"); + dma_unmap_single(&cdev->pdev->dev, buffer->phys_addr, + cdev->ll2->rx_size, DMA_FROM_DEVICE); + kfree(buffer->data); + list_del(&buffer->list); + kfree(buffer); + } else { + cdev->ll2->rx_cnt++; + } + } + spin_unlock_bh(&cdev->ll2->lock); + + if (!cdev->ll2->rx_cnt) { + DP_INFO(cdev, "Failed passing even a single Rx buffer\n"); + goto release_terminate; + } + + if (!is_valid_ether_addr(params->ll2_mac_address)) { + DP_INFO(cdev, "Invalid Ethernet address\n"); + goto release_terminate; + } + + if (QED_LEADING_HWFN(cdev)->hw_info.personality == QED_PCI_ISCSI) { + DP_VERBOSE(cdev, QED_MSG_STORAGE, "Starting OOO LL2 queue\n"); + rc = qed_ll2_start_ooo(cdev, params); + if (rc) { + DP_INFO(cdev, + "Failed to initialize the OOO LL2 queue\n"); + goto release_terminate; + } + } + + p_ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev)); + if (!p_ptt) { + DP_INFO(cdev, "Failed to acquire PTT\n"); + goto release_terminate; + } + + rc = qed_llh_add_mac_filter(QED_LEADING_HWFN(cdev), p_ptt, + params->ll2_mac_address); + qed_ptt_release(QED_LEADING_HWFN(cdev), p_ptt); + if (rc) { + DP_ERR(cdev, "Failed to allocate LLH filter\n"); + goto release_terminate_all; + } + + ether_addr_copy(cdev->ll2_mac_address, params->ll2_mac_address); + return 0; + +release_terminate_all: + +release_terminate: + qed_ll2_terminate_connection(QED_LEADING_HWFN(cdev), cdev->ll2->handle); +release_fail: + qed_ll2_release_connection(QED_LEADING_HWFN(cdev), cdev->ll2->handle); +fail: + qed_ll2_kill_buffers(cdev); + cdev->ll2->handle = QED_LL2_UNUSED_HANDLE; + return -EINVAL; +} + +static int qed_ll2_stop(struct qed_dev *cdev) +{ + struct qed_ptt *p_ptt; + int rc; + + if (cdev->ll2->handle == QED_LL2_UNUSED_HANDLE) + return 0; + + p_ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev)); + if (!p_ptt) { + DP_INFO(cdev, "Failed to acquire PTT\n"); + goto fail; + } + + qed_llh_remove_mac_filter(QED_LEADING_HWFN(cdev), p_ptt, + cdev->ll2_mac_address); + qed_ptt_release(QED_LEADING_HWFN(cdev), p_ptt); + eth_zero_addr(cdev->ll2_mac_address); + + if (QED_LEADING_HWFN(cdev)->hw_info.personality == QED_PCI_ISCSI) + qed_ll2_stop_ooo(cdev); + + rc = qed_ll2_terminate_connection(QED_LEADING_HWFN(cdev), + cdev->ll2->handle); + if (rc) + DP_INFO(cdev, "Failed to terminate LL2 connection\n"); + + qed_ll2_kill_buffers(cdev); + + qed_ll2_release_connection(QED_LEADING_HWFN(cdev), cdev->ll2->handle); + cdev->ll2->handle = QED_LL2_UNUSED_HANDLE; + + return rc; +fail: + return -EINVAL; +} + +static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb) +{ + struct qed_ll2_tx_pkt_info pkt; + const skb_frag_t *frag; + int rc = -EINVAL, i; + dma_addr_t mapping; + u16 vlan = 0; + u8 flags = 0; + + if (unlikely(skb->ip_summed != CHECKSUM_NONE)) { + DP_INFO(cdev, "Cannot transmit a checksummed packet\n"); + return -EINVAL; + } + + if (1 + skb_shinfo(skb)->nr_frags > CORE_LL2_TX_MAX_BDS_PER_PACKET) { + DP_ERR(cdev, 
"Cannot transmit a packet with %d fragments\n", + 1 + skb_shinfo(skb)->nr_frags); + return -EINVAL; + } + + mapping = dma_map_single(&cdev->pdev->dev, skb->data, + skb->len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(&cdev->pdev->dev, mapping))) { + DP_NOTICE(cdev, "SKB mapping failed\n"); + return -EINVAL; + } + + /* Request HW to calculate IP csum */ + if (!((vlan_get_protocol(skb) == htons(ETH_P_IPV6)) && + ipv6_hdr(skb)->nexthdr == NEXTHDR_IPV6)) + flags |= BIT(CORE_TX_BD_DATA_IP_CSUM_SHIFT); + + if (skb_vlan_tag_present(skb)) { + vlan = skb_vlan_tag_get(skb); + flags |= BIT(CORE_TX_BD_DATA_VLAN_INSERTION_SHIFT); + } + + memset(&pkt, 0, sizeof(pkt)); + pkt.num_of_bds = 1 + skb_shinfo(skb)->nr_frags; + pkt.vlan = vlan; + pkt.bd_flags = flags; + pkt.tx_dest = QED_LL2_TX_DEST_NW; + pkt.first_frag = mapping; + pkt.first_frag_len = skb->len; + pkt.cookie = skb; + + rc = qed_ll2_prepare_tx_packet(&cdev->hwfns[0], cdev->ll2->handle, + &pkt, 1); + if (rc) + goto err; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + frag = &skb_shinfo(skb)->frags[i]; + + mapping = skb_frag_dma_map(&cdev->pdev->dev, frag, 0, + skb_frag_size(frag), DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&cdev->pdev->dev, mapping))) { + DP_NOTICE(cdev, + "Unable to map frag - dropping packet\n"); + goto err; + } + + rc = qed_ll2_set_fragment_of_tx_packet(QED_LEADING_HWFN(cdev), + cdev->ll2->handle, + mapping, + skb_frag_size(frag)); + + /* if failed not much to do here, partial packet has been posted + * we can't free memory, will need to wait for completion. + */ + if (rc) + goto err2; + } + + return 0; + +err: + dma_unmap_single(&cdev->pdev->dev, mapping, skb->len, DMA_TO_DEVICE); + +err2: + return rc; +} + +static int qed_ll2_stats(struct qed_dev *cdev, struct qed_ll2_stats *stats) +{ + if (!cdev->ll2) + return -EINVAL; + + return qed_ll2_get_stats(QED_LEADING_HWFN(cdev), + cdev->ll2->handle, stats); +} + +const struct qed_ll2_ops qed_ll2_ops_pass = { + .start = &qed_ll2_start, + .stop = &qed_ll2_stop, + .start_xmit = &qed_ll2_start_xmit, + .register_cb_ops = &qed_ll2_register_cb_ops, + .get_stats = &qed_ll2_stats, +}; + +int qed_ll2_alloc_if(struct qed_dev *cdev) +{ + cdev->ll2 = kzalloc(sizeof(*cdev->ll2), GFP_KERNEL); + return cdev->ll2 ? 0 : -ENOMEM; +} + +void qed_ll2_dealloc_if(struct qed_dev *cdev) +{ + kfree(cdev->ll2); + cdev->ll2 = NULL; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h new file mode 100644 index 0000000..f658170 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h @@ -0,0 +1,268 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_LL2_H +#define _QED_LL2_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" +#include "qed_sp.h" + +#define QED_MAX_NUM_OF_LL2_CONNECTIONS (4) + +struct qed_ll2_rx_packet { + struct list_head list_entry; + struct core_rx_bd_with_buff_len *rxq_bd; + dma_addr_t rx_buf_addr; + u16 buf_length; + void *cookie; + u8 placement_offset; + u16 parse_flags; + u16 packet_length; + u16 vlan; + u32 opaque_data[2]; +}; + +struct qed_ll2_tx_packet { + struct list_head list_entry; + u16 bd_used; + bool notify_fw; + void *cookie; + /* Flexible Array of bds_set determined by max_bds_per_packet */ + struct { + struct core_tx_bd *txq_bd; + dma_addr_t tx_frag; + u16 frag_len; + } bds_set[1]; +}; + +struct qed_ll2_rx_queue { + /* Lock protecting the Rx queue manipulation */ + spinlock_t lock; + struct qed_chain rxq_chain; + struct qed_chain rcq_chain; + u8 rx_sb_index; + bool b_cb_registred; + __le16 *p_fw_cons; + struct list_head active_descq; + struct list_head free_descq; + struct list_head posting_descq; + struct qed_ll2_rx_packet *descq_array; + void __iomem *set_prod_addr; +}; + +struct qed_ll2_tx_queue { + /* Lock protecting the Tx queue manipulation */ + spinlock_t lock; + struct qed_chain txq_chain; + u8 tx_sb_index; + bool b_cb_registred; + __le16 *p_fw_cons; + struct list_head active_descq; + struct list_head free_descq; + struct list_head sending_descq; + void *descq_mem; /* memory for variable sized qed_ll2_tx_packet*/ + struct qed_ll2_tx_packet *cur_send_packet; + struct qed_ll2_tx_packet cur_completing_packet; + u16 cur_completing_bd_idx; + void __iomem *doorbell_addr; + u16 bds_idx; + u16 cur_send_frag_num; + u16 cur_completing_frag_num; + bool b_completing_packet; +}; + +struct qed_ll2_info { + /* Lock protecting the state of LL2 */ + struct mutex mutex; + + struct qed_ll2_acquire_data_inputs input; + u32 cid; + u8 my_id; + u8 queue_id; + u8 tx_stats_id; + bool b_active; + enum core_tx_dest tx_dest; + u8 tx_stats_en; + bool main_func_queue; + struct qed_ll2_rx_queue rx_queue; + struct qed_ll2_tx_queue tx_queue; + struct qed_ll2_cbs cbs; +}; + +/** + * @brief qed_ll2_acquire_connection - allocate resources, + * starts rx & tx (if relevant) queues pair. Provides + * connecion handler as output parameter. + * + * + * @param cxt - pointer to the hw-function [opaque to some] + * @param data - describes connection parameters + * @return int + */ +int qed_ll2_acquire_connection(void *cxt, struct qed_ll2_acquire_data *data); + +/** + * @brief qed_ll2_establish_connection - start previously + * allocated LL2 queues pair + * + * @param cxt - pointer to the hw-function [opaque to some] + * @param p_ptt + * @param connection_handle LL2 connection's handle obtained from + * qed_ll2_require_connection + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_establish_connection(void *cxt, u8 connection_handle); + +/** + * @brief qed_ll2_post_rx_buffers - submit buffers to LL2 Rx queue. 
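+
+/*
+ * A minimal caller-side sketch of the connection lifecycle exposed by this
+ * header: acquire, establish, feed one Rx buffer, then tear down in reverse
+ * order on error. The connection type, MTU, descriptor counts, callback
+ * table and buffer arguments are illustrative placeholders, not values
+ * taken from this driver.
+ */
+static int example_ll2_session(struct qed_hwfn *p_hwfn,
+                               struct qed_ll2_cbs *cbs,
+                               dma_addr_t rx_buf, u16 buf_len, void *cookie)
+{
+        struct qed_ll2_acquire_data data = {};
+        u8 handle;
+        int rc;
+
+        data.input.conn_type = QED_LL2_TYPE_TEST;
+        data.input.mtu = 1500;
+        data.input.rx_num_desc = 32;
+        data.input.tx_num_desc = 32;
+        data.p_connection_handle = &handle;
+        data.cbs = cbs;
+
+        rc = qed_ll2_acquire_connection(p_hwfn, &data);
+        if (rc)
+                return rc;
+
+        rc = qed_ll2_establish_connection(p_hwfn, handle);
+        if (rc)
+                goto release;
+
+        /* notify_fw = 1 produces the Rx BD for this buffer immediately */
+        rc = qed_ll2_post_rx_buffer(p_hwfn, handle, rx_buf, buf_len, cookie, 1);
+        if (rc)
+                goto terminate;
+
+        return 0;
+
+terminate:
+        qed_ll2_terminate_connection(p_hwfn, handle);
+release:
+        qed_ll2_release_connection(p_hwfn, handle);
+        return rc;
+}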
+ * + * @param cxt - pointer to the hw-function [opaque to some] + * @param connection_handle LL2 connection's handle obtained from + * qed_ll2_require_connection + * @param addr rx (physical address) buffers to submit + * @param cookie + * @param notify_fw produce corresponding Rx BD immediately + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_post_rx_buffer(void *cxt, + u8 connection_handle, + dma_addr_t addr, + u16 buf_len, void *cookie, u8 notify_fw); + +/** + * @brief qed_ll2_prepare_tx_packet - request for start Tx BD + * to prepare Tx packet submission to FW. + * + * @param cxt - pointer to the hw-function [opaque to some] + * @param connection_handle + * @param pkt - info regarding the tx packet + * @param notify_fw - issue doorbell to fw for this packet + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_prepare_tx_packet(void *cxt, + u8 connection_handle, + struct qed_ll2_tx_pkt_info *pkt, + bool notify_fw); + +/** + * @brief qed_ll2_release_connection - releases resources + * allocated for LL2 connection + * + * @param cxt - pointer to the hw-function [opaque to some] + * @param connection_handle LL2 connection's handle obtained from + * qed_ll2_require_connection + */ +void qed_ll2_release_connection(void *cxt, u8 connection_handle); + +/** + * @brief qed_ll2_set_fragment_of_tx_packet - provides fragments to fill + * Tx BD of BDs requested by + * qed_ll2_prepare_tx_packet + * + * @param cxt - pointer to the hw-function [opaque to some] + * @param connection_handle LL2 connection's handle + * obtained from + * qed_ll2_require_connection + * @param addr + * @param nbytes + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_set_fragment_of_tx_packet(void *cxt, + u8 connection_handle, + dma_addr_t addr, u16 nbytes); + +/** + * @brief qed_ll2_terminate_connection - stops Tx/Rx queues + * + * + * @param cxt - pointer to the hw-function [opaque to some] + * @param connection_handle LL2 connection's handle + * obtained from + * qed_ll2_require_connection + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_terminate_connection(void *cxt, u8 connection_handle); + +/** + * @brief qed_ll2_get_stats - get LL2 queue's statistics + * + * + * @param cxt - pointer to the hw-function [opaque to some] + * @param connection_handle LL2 connection's handle obtained from + * qed_ll2_require_connection + * @param p_stats + * + * @return 0 on success, failure otherwise + */ +int qed_ll2_get_stats(void *cxt, + u8 connection_handle, struct qed_ll2_stats *p_stats); + +/** + * @brief qed_ll2_alloc - Allocates LL2 connections set + * + * @param p_hwfn + * + * @return int + */ +int qed_ll2_alloc(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_ll2_setup - Inits LL2 connections set + * + * @param p_hwfn + * + */ +void qed_ll2_setup(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_ll2_free - Releases LL2 connections set + * + * @param p_hwfn + * + */ +void qed_ll2_free(struct qed_hwfn *p_hwfn); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c new file mode 100644 index 0000000..7870ae2 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -0,0 +1,2097 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
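+
+/*
+ * A small usage sketch for qed_ll2_get_stats() declared above; the handle
+ * and the counters chosen for printing are illustrative only.
+ */
+static void example_ll2_dump_stats(struct qed_hwfn *p_hwfn, u8 handle)
+{
+        struct qed_ll2_stats stats;
+
+        if (qed_ll2_get_stats(p_hwfn, handle, &stats))
+                return;
+
+        /* Unicast Rx/Tx counters are read from the USTORM/PSTORM queue RAM */
+        pr_info("ll2: rx_ucast %llu tx_ucast %llu\n",
+                (unsigned long long)stats.rcv_ucast_pkts,
+                (unsigned long long)stats.sent_ucast_pkts);
+}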
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qed.h" +#include "qed_sriov.h" +#include "qed_sp.h" +#include "qed_dev_api.h" +#include "qed_ll2.h" +#include "qed_fcoe.h" +#include "qed_iscsi.h" + +#include "qed_mcp.h" +#include "qed_hw.h" +#include "qed_selftest.h" +#include "qed_debug.h" + +#define QED_ROCE_QPS (8192) +#define QED_ROCE_DPIS (8) + +static char version[] = + "QLogic FastLinQ 4xxxx Core Module qed " DRV_MODULE_VERSION "\n"; + +MODULE_DESCRIPTION("QLogic FastLinQ 4xxxx Core Module"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_MODULE_VERSION); + +#define FW_FILE_VERSION \ + __stringify(FW_MAJOR_VERSION) "." \ + __stringify(FW_MINOR_VERSION) "." \ + __stringify(FW_REVISION_VERSION) "." \ + __stringify(FW_ENGINEERING_VERSION) + +#define QED_FW_FILE_NAME \ + "qed/qed_init_values_zipped-" FW_FILE_VERSION ".bin" + +MODULE_FIRMWARE(QED_FW_FILE_NAME); + +static int __init qed_init(void) +{ + pr_info("%s", version); + + return 0; +} + +static void __exit qed_cleanup(void) +{ + pr_notice("qed_cleanup called\n"); +} + +module_init(qed_init); +module_exit(qed_cleanup); + +/* Check if the DMA controller on the machine can properly handle the DMA + * addressing required by the device. +*/ +static int qed_set_coherency_mask(struct qed_dev *cdev) +{ + struct device *dev = &cdev->pdev->dev; + + if (dma_set_mask(dev, DMA_BIT_MASK(64)) == 0) { + if (dma_set_coherent_mask(dev, DMA_BIT_MASK(64)) != 0) { + DP_NOTICE(cdev, + "Can't request 64-bit consistent allocations\n"); + return -EIO; + } + } else if (dma_set_mask(dev, DMA_BIT_MASK(32)) != 0) { + DP_NOTICE(cdev, "Can't request 64b/32b DMA addresses\n"); + return -EIO; + } + + return 0; +} + +static void qed_free_pci(struct qed_dev *cdev) +{ + struct pci_dev *pdev = cdev->pdev; + + if (cdev->doorbells && cdev->db_size) + iounmap(cdev->doorbells); + if (cdev->regview) + iounmap(cdev->regview); + if (atomic_read(&pdev->enable_cnt) == 1) + pci_release_regions(pdev); + + pci_disable_device(pdev); +} + +#define PCI_REVISION_ID_ERROR_VAL 0xff + +/* Performs PCI initializations as well as initializing PCI-related parameters + * in the device structrue. 
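+
+/*
+ * The 64-bit/32-bit fallback in qed_set_coherency_mask() above can also be
+ * expressed with the combined helper; a hedged sketch of equivalent logic,
+ * not code used by this driver:
+ */
+static int example_set_dma_mask(struct pci_dev *pdev)
+{
+        /* Prefer 64-bit DMA/coherent masks, fall back to 32-bit */
+        if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)))
+                return 0;
+
+        return dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+}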
Returns 0 in case of success. + */ +static int qed_init_pci(struct qed_dev *cdev, struct pci_dev *pdev) +{ + u8 rev_id; + int rc; + + cdev->pdev = pdev; + + rc = pci_enable_device(pdev); + if (rc) { + DP_NOTICE(cdev, "Cannot enable PCI device\n"); + goto err0; + } + + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + DP_NOTICE(cdev, "No memory region found in bar #0\n"); + rc = -EIO; + goto err1; + } + + if (IS_PF(cdev) && !(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { + DP_NOTICE(cdev, "No memory region found in bar #2\n"); + rc = -EIO; + goto err1; + } + + if (atomic_read(&pdev->enable_cnt) == 1) { + rc = pci_request_regions(pdev, "qed"); + if (rc) { + DP_NOTICE(cdev, + "Failed to request PCI memory resources\n"); + goto err1; + } + pci_set_master(pdev); + pci_save_state(pdev); + } + + pci_read_config_byte(pdev, PCI_REVISION_ID, &rev_id); + if (rev_id == PCI_REVISION_ID_ERROR_VAL) { + DP_NOTICE(cdev, + "Detected PCI device error [rev_id 0x%x]. Probably due to prior indication. Aborting.\n", + rev_id); + rc = -ENODEV; + goto err2; + } + if (!pci_is_pcie(pdev)) { + DP_NOTICE(cdev, "The bus is not PCI Express\n"); + rc = -EIO; + goto err2; + } + + cdev->pci_params.pm_cap = pci_find_capability(pdev, PCI_CAP_ID_PM); + if (IS_PF(cdev) && !cdev->pci_params.pm_cap) + DP_NOTICE(cdev, "Cannot find power management capability\n"); + + rc = qed_set_coherency_mask(cdev); + if (rc) + goto err2; + + cdev->pci_params.mem_start = pci_resource_start(pdev, 0); + cdev->pci_params.mem_end = pci_resource_end(pdev, 0); + cdev->pci_params.irq = pdev->irq; + + cdev->regview = pci_ioremap_bar(pdev, 0); + if (!cdev->regview) { + DP_NOTICE(cdev, "Cannot map register space, aborting\n"); + rc = -ENOMEM; + goto err2; + } + + cdev->db_phys_addr = pci_resource_start(cdev->pdev, 2); + cdev->db_size = pci_resource_len(cdev->pdev, 2); + if (!cdev->db_size) { + if (IS_PF(cdev)) { + DP_NOTICE(cdev, "No Doorbell bar available\n"); + return -EINVAL; + } else { + return 0; + } + } + + cdev->doorbells = ioremap_wc(cdev->db_phys_addr, cdev->db_size); + + if (!cdev->doorbells) { + DP_NOTICE(cdev, "Cannot map doorbell space\n"); + return -ENOMEM; + } + + return 0; + +err2: + pci_release_regions(pdev); +err1: + pci_disable_device(pdev); +err0: + return rc; +} + +int qed_fill_dev_info(struct qed_dev *cdev, + struct qed_dev_info *dev_info) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_hw_info *hw_info = &p_hwfn->hw_info; + struct qed_tunnel_info *tun = &cdev->tunnel; + struct qed_ptt *ptt; + + memset(dev_info, 0, sizeof(struct qed_dev_info)); + + if (tun->vxlan.tun_cls == QED_TUNN_CLSS_MAC_VLAN && + tun->vxlan.b_mode_enabled) + dev_info->vxlan_enable = true; + + if (tun->l2_gre.b_mode_enabled && tun->ip_gre.b_mode_enabled && + tun->l2_gre.tun_cls == QED_TUNN_CLSS_MAC_VLAN && + tun->ip_gre.tun_cls == QED_TUNN_CLSS_MAC_VLAN) + dev_info->gre_enable = true; + + if (tun->l2_geneve.b_mode_enabled && tun->ip_geneve.b_mode_enabled && + tun->l2_geneve.tun_cls == QED_TUNN_CLSS_MAC_VLAN && + tun->ip_geneve.tun_cls == QED_TUNN_CLSS_MAC_VLAN) + dev_info->geneve_enable = true; + + dev_info->num_hwfns = cdev->num_hwfns; + dev_info->pci_mem_start = cdev->pci_params.mem_start; + dev_info->pci_mem_end = cdev->pci_params.mem_end; + dev_info->pci_irq = cdev->pci_params.irq; + dev_info->rdma_supported = QED_IS_RDMA_PERSONALITY(p_hwfn); + dev_info->is_mf_default = IS_MF_DEFAULT(&cdev->hwfns[0]); + dev_info->dev_type = cdev->type; + ether_addr_copy(dev_info->hw_mac, hw_info->hw_mac_addr); + + if (IS_PF(cdev)) { + 
dev_info->fw_major = FW_MAJOR_VERSION; + dev_info->fw_minor = FW_MINOR_VERSION; + dev_info->fw_rev = FW_REVISION_VERSION; + dev_info->fw_eng = FW_ENGINEERING_VERSION; + dev_info->mf_mode = cdev->mf_mode; + dev_info->tx_switching = true; + + if (hw_info->b_wol_support == QED_WOL_SUPPORT_PME) + dev_info->wol_support = true; + + dev_info->abs_pf_id = QED_LEADING_HWFN(cdev)->abs_pf_id; + } else { + qed_vf_get_fw_version(&cdev->hwfns[0], &dev_info->fw_major, + &dev_info->fw_minor, &dev_info->fw_rev, + &dev_info->fw_eng); + } + + if (IS_PF(cdev)) { + ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev)); + if (ptt) { + qed_mcp_get_mfw_ver(QED_LEADING_HWFN(cdev), ptt, + &dev_info->mfw_rev, NULL); + + qed_mcp_get_mbi_ver(QED_LEADING_HWFN(cdev), ptt, + &dev_info->mbi_version); + + qed_mcp_get_flash_size(QED_LEADING_HWFN(cdev), ptt, + &dev_info->flash_size); + + qed_ptt_release(QED_LEADING_HWFN(cdev), ptt); + } + } else { + qed_mcp_get_mfw_ver(QED_LEADING_HWFN(cdev), NULL, + &dev_info->mfw_rev, NULL); + } + + dev_info->mtu = hw_info->mtu; + + return 0; +} + +static void qed_free_cdev(struct qed_dev *cdev) +{ + kfree((void *)cdev); +} + +static struct qed_dev *qed_alloc_cdev(struct pci_dev *pdev) +{ + struct qed_dev *cdev; + + cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); + if (!cdev) + return cdev; + + qed_init_struct(cdev); + + return cdev; +} + +/* Sets the requested power state */ +static int qed_set_power_state(struct qed_dev *cdev, pci_power_t state) +{ + if (!cdev) + return -ENODEV; + + DP_VERBOSE(cdev, NETIF_MSG_DRV, "Omitting Power state change\n"); + return 0; +} + +/* probing */ +static struct qed_dev *qed_probe(struct pci_dev *pdev, + struct qed_probe_params *params) +{ + struct qed_dev *cdev; + int rc; + + cdev = qed_alloc_cdev(pdev); + if (!cdev) + goto err0; + + cdev->drv_type = DRV_ID_DRV_TYPE_LINUX; + cdev->protocol = params->protocol; + + if (params->is_vf) + cdev->b_is_vf = true; + + qed_init_dp(cdev, params->dp_module, params->dp_level); + + rc = qed_init_pci(cdev, pdev); + if (rc) { + DP_ERR(cdev, "init pci failed\n"); + goto err1; + } + DP_INFO(cdev, "PCI init completed successfully\n"); + + rc = qed_hw_prepare(cdev, QED_PCI_DEFAULT); + if (rc) { + DP_ERR(cdev, "hw prepare failed\n"); + goto err2; + } + + DP_INFO(cdev, "qed_probe completed successffuly\n"); + + return cdev; + +err2: + qed_free_pci(cdev); +err1: + qed_free_cdev(cdev); +err0: + return NULL; +} + +static void qed_remove(struct qed_dev *cdev) +{ + if (!cdev) + return; + + qed_hw_remove(cdev); + + qed_free_pci(cdev); + + qed_set_power_state(cdev, PCI_D3hot); + + qed_free_cdev(cdev); +} + +static void qed_disable_msix(struct qed_dev *cdev) +{ + if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) { + pci_disable_msix(cdev->pdev); + kfree(cdev->int_params.msix_table); + } else if (cdev->int_params.out.int_mode == QED_INT_MODE_MSI) { + pci_disable_msi(cdev->pdev); + } + + memset(&cdev->int_params.out, 0, sizeof(struct qed_int_param)); +} + +static int qed_enable_msix(struct qed_dev *cdev, + struct qed_int_params *int_params) +{ + int i, rc, cnt; + + cnt = int_params->in.num_vectors; + + for (i = 0; i < cnt; i++) + int_params->msix_table[i].entry = i; + + rc = pci_enable_msix_range(cdev->pdev, int_params->msix_table, + int_params->in.min_msix_cnt, cnt); + if (rc < cnt && rc >= int_params->in.min_msix_cnt && + (rc % cdev->num_hwfns)) { + pci_disable_msix(cdev->pdev); + + /* If fastpath is initialized, we need at least one interrupt + * per hwfn [and the slow path interrupts]. 
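+
+/*
+ * Rounding sketch for the MSI-X fallback below: when fewer vectors are
+ * granted than requested, the usable count is clamped to a whole multiple
+ * of the engine (hwfn) count so each engine keeps the same vector layout.
+ * Illustration only, e.g. 9 granted vectors on 2 engines -> 8 used.
+ */
+static int example_round_msix_vectors(int granted, int num_hwfns)
+{
+        return (granted / num_hwfns) * num_hwfns;
+}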
New requested number + * should be a multiple of the number of hwfns. + */ + cnt = (rc / cdev->num_hwfns) * cdev->num_hwfns; + DP_NOTICE(cdev, + "Trying to enable MSI-X with less vectors (%d out of %d)\n", + cnt, int_params->in.num_vectors); + rc = pci_enable_msix_exact(cdev->pdev, int_params->msix_table, + cnt); + if (!rc) + rc = cnt; + } + + if (rc > 0) { + /* MSI-x configuration was achieved */ + int_params->out.int_mode = QED_INT_MODE_MSIX; + int_params->out.num_vectors = rc; + rc = 0; + } else { + DP_NOTICE(cdev, + "Failed to enable MSI-X [Requested %d vectors][rc %d]\n", + cnt, rc); + } + + return rc; +} + +/* This function outputs the int mode and the number of enabled msix vector */ +static int qed_set_int_mode(struct qed_dev *cdev, bool force_mode) +{ + struct qed_int_params *int_params = &cdev->int_params; + struct msix_entry *tbl; + int rc = 0, cnt; + + switch (int_params->in.int_mode) { + case QED_INT_MODE_MSIX: + /* Allocate MSIX table */ + cnt = int_params->in.num_vectors; + int_params->msix_table = kcalloc(cnt, sizeof(*tbl), GFP_KERNEL); + if (!int_params->msix_table) { + rc = -ENOMEM; + goto out; + } + + /* Enable MSIX */ + rc = qed_enable_msix(cdev, int_params); + if (!rc) + goto out; + + DP_NOTICE(cdev, "Failed to enable MSI-X\n"); + kfree(int_params->msix_table); + if (force_mode) + goto out; + /* Fallthrough */ + + case QED_INT_MODE_MSI: + if (cdev->num_hwfns == 1) { + rc = pci_enable_msi(cdev->pdev); + if (!rc) { + int_params->out.int_mode = QED_INT_MODE_MSI; + goto out; + } + + DP_NOTICE(cdev, "Failed to enable MSI\n"); + if (force_mode) + goto out; + } + /* Fallthrough */ + + case QED_INT_MODE_INTA: + int_params->out.int_mode = QED_INT_MODE_INTA; + rc = 0; + goto out; + default: + DP_NOTICE(cdev, "Unknown int_mode value %d\n", + int_params->in.int_mode); + rc = -EINVAL; + } + +out: + if (!rc) + DP_INFO(cdev, "Using %s interrupts\n", + int_params->out.int_mode == QED_INT_MODE_INTA ? + "INTa" : int_params->out.int_mode == QED_INT_MODE_MSI ? 
+ "MSI" : "MSIX"); + cdev->int_coalescing_mode = QED_COAL_MODE_ENABLE; + + return rc; +} + +static void qed_simd_handler_config(struct qed_dev *cdev, void *token, + int index, void(*handler)(void *)) +{ + struct qed_hwfn *hwfn = &cdev->hwfns[index % cdev->num_hwfns]; + int relative_idx = index / cdev->num_hwfns; + + hwfn->simd_proto_handler[relative_idx].func = handler; + hwfn->simd_proto_handler[relative_idx].token = token; +} + +static void qed_simd_handler_clean(struct qed_dev *cdev, int index) +{ + struct qed_hwfn *hwfn = &cdev->hwfns[index % cdev->num_hwfns]; + int relative_idx = index / cdev->num_hwfns; + + memset(&hwfn->simd_proto_handler[relative_idx], 0, + sizeof(struct qed_simd_fp_handler)); +} + +static irqreturn_t qed_msix_sp_int(int irq, void *tasklet) +{ + tasklet_schedule((struct tasklet_struct *)tasklet); + return IRQ_HANDLED; +} + +static irqreturn_t qed_single_int(int irq, void *dev_instance) +{ + struct qed_dev *cdev = (struct qed_dev *)dev_instance; + struct qed_hwfn *hwfn; + irqreturn_t rc = IRQ_NONE; + u64 status; + int i, j; + + for (i = 0; i < cdev->num_hwfns; i++) { + status = qed_int_igu_read_sisr_reg(&cdev->hwfns[i]); + + if (!status) + continue; + + hwfn = &cdev->hwfns[i]; + + /* Slowpath interrupt */ + if (unlikely(status & 0x1)) { + tasklet_schedule(hwfn->sp_dpc); + status &= ~0x1; + rc = IRQ_HANDLED; + } + + /* Fastpath interrupts */ + for (j = 0; j < 64; j++) { + if ((0x2ULL << j) & status) { + hwfn->simd_proto_handler[j].func( + hwfn->simd_proto_handler[j].token); + status &= ~(0x2ULL << j); + rc = IRQ_HANDLED; + } + } + + if (unlikely(status)) + DP_VERBOSE(hwfn, NETIF_MSG_INTR, + "got an unknown interrupt status 0x%llx\n", + status); + } + + return rc; +} + +int qed_slowpath_irq_req(struct qed_hwfn *hwfn) +{ + struct qed_dev *cdev = hwfn->cdev; + u32 int_mode; + int rc = 0; + u8 id; + + int_mode = cdev->int_params.out.int_mode; + if (int_mode == QED_INT_MODE_MSIX) { + id = hwfn->my_id; + snprintf(hwfn->name, NAME_SIZE, "sp-%d-%02x:%02x.%02x", + id, cdev->pdev->bus->number, + PCI_SLOT(cdev->pdev->devfn), hwfn->abs_pf_id); + rc = request_irq(cdev->int_params.msix_table[id].vector, + qed_msix_sp_int, 0, hwfn->name, hwfn->sp_dpc); + } else { + unsigned long flags = 0; + + snprintf(cdev->name, NAME_SIZE, "%02x:%02x.%02x", + cdev->pdev->bus->number, PCI_SLOT(cdev->pdev->devfn), + PCI_FUNC(cdev->pdev->devfn)); + + if (cdev->int_params.out.int_mode == QED_INT_MODE_INTA) + flags |= IRQF_SHARED; + + rc = request_irq(cdev->pdev->irq, qed_single_int, + flags, cdev->name, cdev); + } + + if (rc) + DP_NOTICE(cdev, "request_irq failed, rc = %d\n", rc); + else + DP_VERBOSE(hwfn, (NETIF_MSG_INTR | QED_MSG_SP), + "Requested slowpath %s\n", + (int_mode == QED_INT_MODE_MSIX) ? "MSI-X" : "IRQ"); + + return rc; +} + +static void qed_slowpath_tasklet_flush(struct qed_hwfn *p_hwfn) +{ + /* Calling the disable function will make sure that any + * currently-running function is completed. The following call to the + * enable function makes this sequence a flush-like operation. 
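+
+/*
+ * The same disable/enable idiom works as a flush for any tasklet:
+ * tasklet_disable() waits for a running instance to complete before
+ * returning, so the pair acts as a barrier. Sketch only; the tasklet
+ * pointer is caller-supplied.
+ */
+static void example_tasklet_flush(struct tasklet_struct *t)
+{
+        tasklet_disable(t);     /* waits for a running handler to finish */
+        tasklet_enable(t);      /* re-allow scheduling */
+}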
+ */ + if (p_hwfn->b_sp_dpc_enabled) { + tasklet_disable(p_hwfn->sp_dpc); + tasklet_enable(p_hwfn->sp_dpc); + } +} + +void qed_slowpath_irq_sync(struct qed_hwfn *p_hwfn) +{ + struct qed_dev *cdev = p_hwfn->cdev; + u8 id = p_hwfn->my_id; + u32 int_mode; + + int_mode = cdev->int_params.out.int_mode; + if (int_mode == QED_INT_MODE_MSIX) + synchronize_irq(cdev->int_params.msix_table[id].vector); + else + synchronize_irq(cdev->pdev->irq); + + qed_slowpath_tasklet_flush(p_hwfn); +} + +static void qed_slowpath_irq_free(struct qed_dev *cdev) +{ + int i; + + if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) { + for_each_hwfn(cdev, i) { + if (!cdev->hwfns[i].b_int_requested) + break; + synchronize_irq(cdev->int_params.msix_table[i].vector); + free_irq(cdev->int_params.msix_table[i].vector, + cdev->hwfns[i].sp_dpc); + } + } else { + if (QED_LEADING_HWFN(cdev)->b_int_requested) + free_irq(cdev->pdev->irq, cdev); + } + qed_int_disable_post_isr_release(cdev); +} + +static int qed_nic_stop(struct qed_dev *cdev) +{ + int i, rc; + + rc = qed_hw_stop(cdev); + + for (i = 0; i < cdev->num_hwfns; i++) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + if (p_hwfn->b_sp_dpc_enabled) { + tasklet_disable(p_hwfn->sp_dpc); + p_hwfn->b_sp_dpc_enabled = false; + DP_VERBOSE(cdev, NETIF_MSG_IFDOWN, + "Disabled sp tasklet [hwfn %d] at %p\n", + i, p_hwfn->sp_dpc); + } + } + + qed_dbg_pf_exit(cdev); + + return rc; +} + +static int qed_nic_setup(struct qed_dev *cdev) +{ + int rc, i; + + /* Determine if interface is going to require LL2 */ + if (QED_LEADING_HWFN(cdev)->hw_info.personality != QED_PCI_ETH) { + for (i = 0; i < cdev->num_hwfns; i++) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + p_hwfn->using_ll2 = true; + } + } + + rc = qed_resc_alloc(cdev); + if (rc) + return rc; + + DP_INFO(cdev, "Allocated qed resources\n"); + + qed_resc_setup(cdev); + + return rc; +} + +static int qed_set_int_fp(struct qed_dev *cdev, u16 cnt) +{ + int limit = 0; + + /* Mark the fastpath as free/used */ + cdev->int_params.fp_initialized = cnt ? true : false; + + if (cdev->int_params.out.int_mode != QED_INT_MODE_MSIX) + limit = cdev->num_hwfns * 63; + else if (cdev->int_params.fp_msix_cnt) + limit = cdev->int_params.fp_msix_cnt; + + if (!limit) + return -ENOMEM; + + return min_t(int, cnt, limit); +} + +static int qed_get_int_fp(struct qed_dev *cdev, struct qed_int_info *info) +{ + memset(info, 0, sizeof(struct qed_int_info)); + + if (!cdev->int_params.fp_initialized) { + DP_INFO(cdev, + "Protocol driver requested interrupt information, but its support is not yet configured\n"); + return -EINVAL; + } + + /* Need to expose only MSI-X information; Single IRQ is handled solely + * by qed. 
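+
+/*
+ * Consumer-side sketch: a protocol driver queries the fastpath MSI-X table
+ * exposed here through the common ops and attaches its queue handlers to
+ * the returned vectors. The handler, name and context are placeholders.
+ */
+static int example_request_fp_irqs(const struct qed_common_ops *ops,
+                                   struct qed_dev *cdev,
+                                   irq_handler_t handler, void *ctx)
+{
+        struct qed_int_info info;
+        int i, rc;
+
+        rc = ops->get_fp_int(cdev, &info);
+        if (rc || !info.msix_cnt)
+                return rc ? rc : -EINVAL;
+
+        for (i = 0; i < info.msix_cnt; i++) {
+                rc = request_irq(info.msix[i].vector, handler, 0,
+                                 "example-fp", ctx);
+                if (rc)
+                        return rc;
+        }
+
+        return 0;
+}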
+ */ + if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) { + int msix_base = cdev->int_params.fp_msix_base; + + info->msix_cnt = cdev->int_params.fp_msix_cnt; + info->msix = &cdev->int_params.msix_table[msix_base]; + } + + return 0; +} + +static int qed_slowpath_setup_int(struct qed_dev *cdev, + enum qed_int_mode int_mode) +{ + struct qed_sb_cnt_info sb_cnt_info; + int num_l2_queues = 0; + int rc; + int i; + + if ((int_mode == QED_INT_MODE_MSI) && (cdev->num_hwfns > 1)) { + DP_NOTICE(cdev, "MSI mode is not supported for CMT devices\n"); + return -EINVAL; + } + + memset(&cdev->int_params, 0, sizeof(struct qed_int_params)); + cdev->int_params.in.int_mode = int_mode; + for_each_hwfn(cdev, i) { + memset(&sb_cnt_info, 0, sizeof(sb_cnt_info)); + qed_int_get_num_sbs(&cdev->hwfns[i], &sb_cnt_info); + cdev->int_params.in.num_vectors += sb_cnt_info.cnt; + cdev->int_params.in.num_vectors++; /* slowpath */ + } + + /* We want a minimum of one slowpath and one fastpath vector per hwfn */ + cdev->int_params.in.min_msix_cnt = cdev->num_hwfns * 2; + + rc = qed_set_int_mode(cdev, false); + if (rc) { + DP_ERR(cdev, "qed_slowpath_setup_int ERR\n"); + return rc; + } + + cdev->int_params.fp_msix_base = cdev->num_hwfns; + cdev->int_params.fp_msix_cnt = cdev->int_params.out.num_vectors - + cdev->num_hwfns; + + if (!IS_ENABLED(CONFIG_QED_RDMA) || + !QED_IS_RDMA_PERSONALITY(QED_LEADING_HWFN(cdev))) + return 0; + + for_each_hwfn(cdev, i) + num_l2_queues += FEAT_NUM(&cdev->hwfns[i], QED_PF_L2_QUE); + + DP_VERBOSE(cdev, QED_MSG_RDMA, + "cdev->int_params.fp_msix_cnt=%d num_l2_queues=%d\n", + cdev->int_params.fp_msix_cnt, num_l2_queues); + + if (cdev->int_params.fp_msix_cnt > num_l2_queues) { + cdev->int_params.rdma_msix_cnt = + (cdev->int_params.fp_msix_cnt - num_l2_queues) + / cdev->num_hwfns; + cdev->int_params.rdma_msix_base = + cdev->int_params.fp_msix_base + num_l2_queues; + cdev->int_params.fp_msix_cnt = num_l2_queues; + } else { + cdev->int_params.rdma_msix_cnt = 0; + } + + DP_VERBOSE(cdev, QED_MSG_RDMA, "roce_msix_cnt=%d roce_msix_base=%d\n", + cdev->int_params.rdma_msix_cnt, + cdev->int_params.rdma_msix_base); + + return 0; +} + +static int qed_slowpath_vf_setup_int(struct qed_dev *cdev) +{ + int rc; + + memset(&cdev->int_params, 0, sizeof(struct qed_int_params)); + cdev->int_params.in.int_mode = QED_INT_MODE_MSIX; + + qed_vf_get_num_rxqs(QED_LEADING_HWFN(cdev), + &cdev->int_params.in.num_vectors); + if (cdev->num_hwfns > 1) { + u8 vectors = 0; + + qed_vf_get_num_rxqs(&cdev->hwfns[1], &vectors); + cdev->int_params.in.num_vectors += vectors; + } + + /* We want a minimum of one fastpath vector per vf hwfn */ + cdev->int_params.in.min_msix_cnt = cdev->num_hwfns; + + rc = qed_set_int_mode(cdev, true); + if (rc) + return rc; + + cdev->int_params.fp_msix_base = 0; + cdev->int_params.fp_msix_cnt = cdev->int_params.out.num_vectors; + + return 0; +} + +u32 qed_unzip_data(struct qed_hwfn *p_hwfn, u32 input_len, + u8 *input_buf, u32 max_size, u8 *unzip_buf) +{ + int rc; + + p_hwfn->stream->next_in = input_buf; + p_hwfn->stream->avail_in = input_len; + p_hwfn->stream->next_out = unzip_buf; + p_hwfn->stream->avail_out = max_size; + + rc = zlib_inflateInit2(p_hwfn->stream, MAX_WBITS); + + if (rc != Z_OK) { + DP_VERBOSE(p_hwfn, NETIF_MSG_DRV, "zlib init failed, rc = %d\n", + rc); + return 0; + } + + rc = zlib_inflate(p_hwfn->stream, Z_FINISH); + zlib_inflateEnd(p_hwfn->stream); + + if (rc != Z_OK && rc != Z_STREAM_END) { + DP_VERBOSE(p_hwfn, NETIF_MSG_DRV, "FW unzip error: %s, rc=%d\n", + p_hwfn->stream->msg, rc); + 
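+
+/*
+ * Budgeting sketch for the fastpath-vector split done in
+ * qed_slowpath_setup_int() above: vectors left over beyond the L2 queues
+ * are handed to RDMA, divided evenly between engines. Illustrative helper,
+ * not driver code.
+ */
+static void example_split_fp_vectors(int fp_msix_cnt, int num_l2_queues,
+                                     int num_hwfns,
+                                     int *l2_cnt, int *rdma_cnt)
+{
+        if (fp_msix_cnt > num_l2_queues) {
+                *rdma_cnt = (fp_msix_cnt - num_l2_queues) / num_hwfns;
+                *l2_cnt = num_l2_queues;
+        } else {
+                *rdma_cnt = 0;
+                *l2_cnt = fp_msix_cnt;
+        }
+}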
return 0; + } + + return p_hwfn->stream->total_out / 4; +} + +static int qed_alloc_stream_mem(struct qed_dev *cdev) +{ + int i; + void *workspace; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + p_hwfn->stream = kzalloc(sizeof(*p_hwfn->stream), GFP_KERNEL); + if (!p_hwfn->stream) + return -ENOMEM; + + workspace = vzalloc(zlib_inflate_workspacesize()); + if (!workspace) + return -ENOMEM; + p_hwfn->stream->workspace = workspace; + } + + return 0; +} + +static void qed_free_stream_mem(struct qed_dev *cdev) +{ + int i; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + if (!p_hwfn->stream) + return; + + vfree(p_hwfn->stream->workspace); + kfree(p_hwfn->stream); + } +} + +static void qed_update_pf_params(struct qed_dev *cdev, + struct qed_pf_params *params) +{ + int i; + + if (IS_ENABLED(CONFIG_QED_RDMA)) { + params->rdma_pf_params.num_qps = QED_ROCE_QPS; + params->rdma_pf_params.min_dpis = QED_ROCE_DPIS; + /* divide by 3 the MRs to avoid MF ILT overflow */ + params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX; + } + + if (cdev->num_hwfns > 1 || IS_VF(cdev)) + params->eth_pf_params.num_arfs_filters = 0; + + /* In case we might support RDMA, don't allow qede to be greedy + * with the L2 contexts. Allow for 64 queues [rx, tx, xdp] per hwfn. + */ + if (QED_IS_RDMA_PERSONALITY(QED_LEADING_HWFN(cdev))) { + u16 *num_cons; + + num_cons = ¶ms->eth_pf_params.num_cons; + *num_cons = min_t(u16, *num_cons, 192); + } + + for (i = 0; i < cdev->num_hwfns; i++) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + p_hwfn->pf_params = *params; + } +} + +static int qed_slowpath_start(struct qed_dev *cdev, + struct qed_slowpath_params *params) +{ + struct qed_drv_load_params drv_load_params; + struct qed_hw_init_params hw_init_params; + struct qed_mcp_drv_version drv_version; + struct qed_tunnel_info tunn_info; + const u8 *data = NULL; + struct qed_hwfn *hwfn; + struct qed_ptt *p_ptt; + int rc = -EINVAL; + + if (qed_iov_wq_start(cdev)) + goto err; + + if (IS_PF(cdev)) { + rc = request_firmware(&cdev->firmware, QED_FW_FILE_NAME, + &cdev->pdev->dev); + if (rc) { + DP_NOTICE(cdev, + "Failed to find fw file - /lib/firmware/%s\n", + QED_FW_FILE_NAME); + goto err; + } + + if (cdev->num_hwfns == 1) { + p_ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev)); + if (p_ptt) { + QED_LEADING_HWFN(cdev)->p_arfs_ptt = p_ptt; + } else { + DP_NOTICE(cdev, + "Failed to acquire PTT for aRFS\n"); + goto err; + } + } + } + + cdev->rx_coalesce_usecs = QED_DEFAULT_RX_USECS; + rc = qed_nic_setup(cdev); + if (rc) + goto err; + + if (IS_PF(cdev)) + rc = qed_slowpath_setup_int(cdev, params->int_mode); + else + rc = qed_slowpath_vf_setup_int(cdev); + if (rc) + goto err1; + + if (IS_PF(cdev)) { + /* Allocate stream for unzipping */ + rc = qed_alloc_stream_mem(cdev); + if (rc) + goto err2; + + /* First Dword used to differentiate between various sources */ + data = cdev->firmware->data + sizeof(u32); + + qed_dbg_pf_init(cdev); + } + + /* Start the slowpath */ + memset(&hw_init_params, 0, sizeof(hw_init_params)); + memset(&tunn_info, 0, sizeof(tunn_info)); + tunn_info.vxlan.b_mode_enabled = true; + tunn_info.l2_gre.b_mode_enabled = true; + tunn_info.ip_gre.b_mode_enabled = true; + tunn_info.l2_geneve.b_mode_enabled = true; + tunn_info.ip_geneve.b_mode_enabled = true; + tunn_info.vxlan.tun_cls = QED_TUNN_CLSS_MAC_VLAN; + tunn_info.l2_gre.tun_cls = QED_TUNN_CLSS_MAC_VLAN; + tunn_info.ip_gre.tun_cls = QED_TUNN_CLSS_MAC_VLAN; + tunn_info.l2_geneve.tun_cls = QED_TUNN_CLSS_MAC_VLAN; + 
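+
+/*
+ * Caller-side sketch of how a protocol driver invokes this slowpath start
+ * through the common ops table after probe. The version numbers and name
+ * are placeholders.
+ */
+static int example_start_slowpath(const struct qed_common_ops *common,
+                                  struct qed_dev *cdev)
+{
+        struct qed_slowpath_params sp_params = {};
+
+        sp_params.int_mode = QED_INT_MODE_MSIX;
+        sp_params.drv_major = 1;
+        sp_params.drv_minor = 0;
+        sp_params.drv_rev = 0;
+        sp_params.drv_eng = 0;
+        strlcpy(sp_params.name, "example", sizeof(sp_params.name));
+
+        return common->slowpath_start(cdev, &sp_params);
+}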
tunn_info.ip_geneve.tun_cls = QED_TUNN_CLSS_MAC_VLAN; + hw_init_params.p_tunn = &tunn_info; + hw_init_params.b_hw_start = true; + hw_init_params.int_mode = cdev->int_params.out.int_mode; + hw_init_params.allow_npar_tx_switch = true; + hw_init_params.bin_fw_data = data; + + memset(&drv_load_params, 0, sizeof(drv_load_params)); + drv_load_params.is_crash_kernel = is_kdump_kernel(); + drv_load_params.mfw_timeout_val = QED_LOAD_REQ_LOCK_TO_DEFAULT; + drv_load_params.avoid_eng_reset = false; + drv_load_params.override_force_load = QED_OVERRIDE_FORCE_LOAD_NONE; + hw_init_params.p_drv_load_params = &drv_load_params; + + rc = qed_hw_init(cdev, &hw_init_params); + if (rc) + goto err2; + + DP_INFO(cdev, + "HW initialization and function start completed successfully\n"); + + if (IS_PF(cdev)) { + cdev->tunn_feature_mask = (BIT(QED_MODE_VXLAN_TUNN) | + BIT(QED_MODE_L2GENEVE_TUNN) | + BIT(QED_MODE_IPGENEVE_TUNN) | + BIT(QED_MODE_L2GRE_TUNN) | + BIT(QED_MODE_IPGRE_TUNN)); + } + + /* Allocate LL2 interface if needed */ + if (QED_LEADING_HWFN(cdev)->using_ll2) { + rc = qed_ll2_alloc_if(cdev); + if (rc) + goto err3; + } + if (IS_PF(cdev)) { + hwfn = QED_LEADING_HWFN(cdev); + drv_version.version = (params->drv_major << 24) | + (params->drv_minor << 16) | + (params->drv_rev << 8) | + (params->drv_eng); + strlcpy(drv_version.name, params->name, + MCP_DRV_VER_STR_SIZE - 4); + rc = qed_mcp_send_drv_version(hwfn, hwfn->p_main_ptt, + &drv_version); + if (rc) { + DP_NOTICE(cdev, "Failed sending drv version command\n"); + return rc; + } + } + + qed_reset_vport_stats(cdev); + + return 0; + +err3: + qed_hw_stop(cdev); +err2: + qed_hw_timers_stop_all(cdev); + if (IS_PF(cdev)) + qed_slowpath_irq_free(cdev); + qed_free_stream_mem(cdev); + qed_disable_msix(cdev); +err1: + qed_resc_free(cdev); +err: + if (IS_PF(cdev)) + release_firmware(cdev->firmware); + + if (IS_PF(cdev) && (cdev->num_hwfns == 1) && + QED_LEADING_HWFN(cdev)->p_arfs_ptt) + qed_ptt_release(QED_LEADING_HWFN(cdev), + QED_LEADING_HWFN(cdev)->p_arfs_ptt); + + qed_iov_wq_stop(cdev, false); + + return rc; +} + +static int qed_slowpath_stop(struct qed_dev *cdev) +{ + if (!cdev) + return -ENODEV; + + qed_ll2_dealloc_if(cdev); + + if (IS_PF(cdev)) { + if (cdev->num_hwfns == 1) + qed_ptt_release(QED_LEADING_HWFN(cdev), + QED_LEADING_HWFN(cdev)->p_arfs_ptt); + qed_free_stream_mem(cdev); + if (IS_QED_ETH_IF(cdev)) + qed_sriov_disable(cdev, true); + } + + qed_nic_stop(cdev); + + if (IS_PF(cdev)) + qed_slowpath_irq_free(cdev); + + qed_disable_msix(cdev); + + qed_resc_free(cdev); + + qed_iov_wq_stop(cdev, true); + + if (IS_PF(cdev)) + release_firmware(cdev->firmware); + + return 0; +} + +static void qed_set_name(struct qed_dev *cdev, char name[NAME_SIZE]) +{ + int i; + + memcpy(cdev->name, name, NAME_SIZE); + for_each_hwfn(cdev, i) + snprintf(cdev->hwfns[i].name, NAME_SIZE, "%s-%d", name, i); +} + +static u32 qed_sb_init(struct qed_dev *cdev, + struct qed_sb_info *sb_info, + void *sb_virt_addr, + dma_addr_t sb_phy_addr, u16 sb_id, + enum qed_sb_type type) +{ + struct qed_hwfn *p_hwfn; + struct qed_ptt *p_ptt; + int hwfn_index; + u16 rel_sb_id; + u8 n_hwfns; + u32 rc; + + /* RoCE uses single engine and CMT uses two engines. When using both + * we force only a single engine. Storage uses only engine 0 too. 
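+
+/*
+ * Mapping sketch for the arithmetic below: L2 status blocks are interleaved
+ * across engines, so on a two-engine (CMT) device sb_id 5 lands on hwfn 1
+ * as relative SB 2. Illustration only.
+ */
+static void example_map_sb(u16 sb_id, u8 n_hwfns,
+                           int *hwfn_index, u16 *rel_sb_id)
+{
+        *hwfn_index = sb_id % n_hwfns;  /* engine that owns the SB */
+        *rel_sb_id = sb_id / n_hwfns;   /* SB index within that engine */
+}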
+ */ + if (type == QED_SB_TYPE_L2_QUEUE) + n_hwfns = cdev->num_hwfns; + else + n_hwfns = 1; + + hwfn_index = sb_id % n_hwfns; + p_hwfn = &cdev->hwfns[hwfn_index]; + rel_sb_id = sb_id / n_hwfns; + + DP_VERBOSE(cdev, NETIF_MSG_INTR, + "hwfn [%d] <--[init]-- SB %04x [0x%04x upper]\n", + hwfn_index, rel_sb_id, sb_id); + + if (IS_PF(p_hwfn->cdev)) { + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EBUSY; + + rc = qed_int_sb_init(p_hwfn, p_ptt, sb_info, sb_virt_addr, + sb_phy_addr, rel_sb_id); + qed_ptt_release(p_hwfn, p_ptt); + } else { + rc = qed_int_sb_init(p_hwfn, NULL, sb_info, sb_virt_addr, + sb_phy_addr, rel_sb_id); + } + + return rc; +} + +static u32 qed_sb_release(struct qed_dev *cdev, + struct qed_sb_info *sb_info, u16 sb_id) +{ + struct qed_hwfn *p_hwfn; + int hwfn_index; + u16 rel_sb_id; + u32 rc; + + hwfn_index = sb_id % cdev->num_hwfns; + p_hwfn = &cdev->hwfns[hwfn_index]; + rel_sb_id = sb_id / cdev->num_hwfns; + + DP_VERBOSE(cdev, NETIF_MSG_INTR, + "hwfn [%d] <--[init]-- SB %04x [0x%04x upper]\n", + hwfn_index, rel_sb_id, sb_id); + + rc = qed_int_sb_release(p_hwfn, sb_info, rel_sb_id); + + return rc; +} + +static bool qed_can_link_change(struct qed_dev *cdev) +{ + return true; +} + +static int qed_set_link(struct qed_dev *cdev, struct qed_link_params *params) +{ + struct qed_hwfn *hwfn; + struct qed_mcp_link_params *link_params; + struct qed_ptt *ptt; + int rc; + + if (!cdev) + return -ENODEV; + + /* The link should be set only once per PF */ + hwfn = &cdev->hwfns[0]; + + /* When VF wants to set link, force it to read the bulletin instead. + * This mimics the PF behavior, where a noitification [both immediate + * and possible later] would be generated when changing properties. + */ + if (IS_VF(cdev)) { + qed_schedule_iov(hwfn, QED_IOV_WQ_VF_FORCE_LINK_QUERY_FLAG); + return 0; + } + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EBUSY; + + link_params = qed_mcp_get_link_params(hwfn); + if (params->override_flags & QED_LINK_OVERRIDE_SPEED_AUTONEG) + link_params->speed.autoneg = params->autoneg; + if (params->override_flags & QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS) { + link_params->speed.advertised_speeds = 0; + if ((params->adv_speeds & QED_LM_1000baseT_Half_BIT) || + (params->adv_speeds & QED_LM_1000baseT_Full_BIT)) + link_params->speed.advertised_speeds |= + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G; + if (params->adv_speeds & QED_LM_10000baseKR_Full_BIT) + link_params->speed.advertised_speeds |= + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G; + if (params->adv_speeds & QED_LM_25000baseKR_Full_BIT) + link_params->speed.advertised_speeds |= + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G; + if (params->adv_speeds & QED_LM_40000baseLR4_Full_BIT) + link_params->speed.advertised_speeds |= + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G; + if (params->adv_speeds & QED_LM_50000baseKR2_Full_BIT) + link_params->speed.advertised_speeds |= + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G; + if (params->adv_speeds & QED_LM_100000baseKR4_Full_BIT) + link_params->speed.advertised_speeds |= + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G; + } + if (params->override_flags & QED_LINK_OVERRIDE_SPEED_FORCED_SPEED) + link_params->speed.forced_speed = params->forced_speed; + if (params->override_flags & QED_LINK_OVERRIDE_PAUSE_CONFIG) { + if (params->pause_config & QED_LINK_PAUSE_AUTONEG_ENABLE) + link_params->pause.autoneg = true; + else + link_params->pause.autoneg = false; + if (params->pause_config & QED_LINK_PAUSE_RX_ENABLE) + link_params->pause.forced_rx = true; + else + 
link_params->pause.forced_rx = false; + if (params->pause_config & QED_LINK_PAUSE_TX_ENABLE) + link_params->pause.forced_tx = true; + else + link_params->pause.forced_tx = false; + } + if (params->override_flags & QED_LINK_OVERRIDE_LOOPBACK_MODE) { + switch (params->loopback_mode) { + case QED_LINK_LOOPBACK_INT_PHY: + link_params->loopback_mode = ETH_LOOPBACK_INT_PHY; + break; + case QED_LINK_LOOPBACK_EXT_PHY: + link_params->loopback_mode = ETH_LOOPBACK_EXT_PHY; + break; + case QED_LINK_LOOPBACK_EXT: + link_params->loopback_mode = ETH_LOOPBACK_EXT; + break; + case QED_LINK_LOOPBACK_MAC: + link_params->loopback_mode = ETH_LOOPBACK_MAC; + break; + default: + link_params->loopback_mode = ETH_LOOPBACK_NONE; + break; + } + } + + if (params->override_flags & QED_LINK_OVERRIDE_EEE_CONFIG) + memcpy(&link_params->eee, ¶ms->eee, + sizeof(link_params->eee)); + + rc = qed_mcp_set_link(hwfn, ptt, params->link_up); + + qed_ptt_release(hwfn, ptt); + + return rc; +} + +static int qed_get_port_type(u32 media_type) +{ + int port_type; + + switch (media_type) { + case MEDIA_SFPP_10G_FIBER: + case MEDIA_SFP_1G_FIBER: + case MEDIA_XFP_FIBER: + case MEDIA_MODULE_FIBER: + case MEDIA_KR: + port_type = PORT_FIBRE; + break; + case MEDIA_DA_TWINAX: + port_type = PORT_DA; + break; + case MEDIA_BASE_T: + port_type = PORT_TP; + break; + case MEDIA_NOT_PRESENT: + port_type = PORT_NONE; + break; + case MEDIA_UNSPECIFIED: + default: + port_type = PORT_OTHER; + break; + } + return port_type; +} + +static int qed_get_link_data(struct qed_hwfn *hwfn, + struct qed_mcp_link_params *params, + struct qed_mcp_link_state *link, + struct qed_mcp_link_capabilities *link_caps) +{ + void *p; + + if (!IS_PF(hwfn->cdev)) { + qed_vf_get_link_params(hwfn, params); + qed_vf_get_link_state(hwfn, link); + qed_vf_get_link_caps(hwfn, link_caps); + + return 0; + } + + p = qed_mcp_get_link_params(hwfn); + if (!p) + return -ENXIO; + memcpy(params, p, sizeof(*params)); + + p = qed_mcp_get_link_state(hwfn); + if (!p) + return -ENXIO; + memcpy(link, p, sizeof(*link)); + + p = qed_mcp_get_link_capabilities(hwfn); + if (!p) + return -ENXIO; + memcpy(link_caps, p, sizeof(*link_caps)); + + return 0; +} + +static void qed_fill_link(struct qed_hwfn *hwfn, + struct qed_link_output *if_link) +{ + struct qed_mcp_link_params params; + struct qed_mcp_link_state link; + struct qed_mcp_link_capabilities link_caps; + u32 media_type; + + memset(if_link, 0, sizeof(*if_link)); + + /* Prepare source inputs */ + if (qed_get_link_data(hwfn, ¶ms, &link, &link_caps)) { + dev_warn(&hwfn->cdev->pdev->dev, "no link data available\n"); + return; + } + + /* Set the link parameters to pass to protocol driver */ + if (link.link_up) + if_link->link_up = true; + + /* TODO - at the moment assume supported and advertised speed equal */ + if_link->supported_caps = QED_LM_FIBRE_BIT; + if (link_caps.default_speed_autoneg) + if_link->supported_caps |= QED_LM_Autoneg_BIT; + if (params.pause.autoneg || + (params.pause.forced_rx && params.pause.forced_tx)) + if_link->supported_caps |= QED_LM_Asym_Pause_BIT; + if (params.pause.autoneg || params.pause.forced_rx || + params.pause.forced_tx) + if_link->supported_caps |= QED_LM_Pause_BIT; + + if_link->advertised_caps = if_link->supported_caps; + if (params.speed.autoneg) + if_link->advertised_caps |= QED_LM_Autoneg_BIT; + else + if_link->advertised_caps &= ~QED_LM_Autoneg_BIT; + if (params.speed.advertised_speeds & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G) + if_link->advertised_caps |= QED_LM_1000baseT_Half_BIT | + 
QED_LM_1000baseT_Full_BIT; + if (params.speed.advertised_speeds & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G) + if_link->advertised_caps |= QED_LM_10000baseKR_Full_BIT; + if (params.speed.advertised_speeds & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G) + if_link->advertised_caps |= QED_LM_25000baseKR_Full_BIT; + if (params.speed.advertised_speeds & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G) + if_link->advertised_caps |= QED_LM_40000baseLR4_Full_BIT; + if (params.speed.advertised_speeds & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G) + if_link->advertised_caps |= QED_LM_50000baseKR2_Full_BIT; + if (params.speed.advertised_speeds & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G) + if_link->advertised_caps |= QED_LM_100000baseKR4_Full_BIT; + + if (link_caps.speed_capabilities & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G) + if_link->supported_caps |= QED_LM_1000baseT_Half_BIT | + QED_LM_1000baseT_Full_BIT; + if (link_caps.speed_capabilities & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G) + if_link->supported_caps |= QED_LM_10000baseKR_Full_BIT; + if (link_caps.speed_capabilities & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G) + if_link->supported_caps |= QED_LM_25000baseKR_Full_BIT; + if (link_caps.speed_capabilities & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_40G) + if_link->supported_caps |= QED_LM_40000baseLR4_Full_BIT; + if (link_caps.speed_capabilities & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G) + if_link->supported_caps |= QED_LM_50000baseKR2_Full_BIT; + if (link_caps.speed_capabilities & + NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G) + if_link->supported_caps |= QED_LM_100000baseKR4_Full_BIT; + + if (link.link_up) + if_link->speed = link.speed; + + /* TODO - fill duplex properly */ + if_link->duplex = DUPLEX_FULL; + qed_mcp_get_media_type(hwfn->cdev, &media_type); + if_link->port = qed_get_port_type(media_type); + + if_link->autoneg = params.speed.autoneg; + + if (params.pause.autoneg) + if_link->pause_config |= QED_LINK_PAUSE_AUTONEG_ENABLE; + if (params.pause.forced_rx) + if_link->pause_config |= QED_LINK_PAUSE_RX_ENABLE; + if (params.pause.forced_tx) + if_link->pause_config |= QED_LINK_PAUSE_TX_ENABLE; + + /* Link partner capabilities */ + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_1G_HD) + if_link->lp_caps |= QED_LM_1000baseT_Half_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_1G_FD) + if_link->lp_caps |= QED_LM_1000baseT_Full_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_10G) + if_link->lp_caps |= QED_LM_10000baseKR_Full_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_25G) + if_link->lp_caps |= QED_LM_25000baseKR_Full_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_40G) + if_link->lp_caps |= QED_LM_40000baseLR4_Full_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_50G) + if_link->lp_caps |= QED_LM_50000baseKR2_Full_BIT; + if (link.partner_adv_speed & QED_LINK_PARTNER_SPEED_100G) + if_link->lp_caps |= QED_LM_100000baseKR4_Full_BIT; + + if (link.an_complete) + if_link->lp_caps |= QED_LM_Autoneg_BIT; + + if (link.partner_adv_pause) + if_link->lp_caps |= QED_LM_Pause_BIT; + if (link.partner_adv_pause == QED_LINK_PARTNER_ASYMMETRIC_PAUSE || + link.partner_adv_pause == QED_LINK_PARTNER_BOTH_PAUSE) + if_link->lp_caps |= QED_LM_Asym_Pause_BIT; + + if (link_caps.default_eee == QED_MCP_EEE_UNSUPPORTED) { + if_link->eee_supported = false; + } else { + if_link->eee_supported = true; + if_link->eee_active = link.eee_active; + if_link->sup_caps = link_caps.eee_speed_caps; + /* MFW clears adv_caps on eee 
disable; use configured value */ + if_link->eee.adv_caps = link.eee_adv_caps ? link.eee_adv_caps : + params.eee.adv_caps; + if_link->eee.lp_adv_caps = link.eee_lp_adv_caps; + if_link->eee.enable = params.eee.enable; + if_link->eee.tx_lpi_enable = params.eee.tx_lpi_enable; + if_link->eee.tx_lpi_timer = params.eee.tx_lpi_timer; + } +} + +static void qed_get_current_link(struct qed_dev *cdev, + struct qed_link_output *if_link) +{ + int i; + + qed_fill_link(&cdev->hwfns[0], if_link); + + for_each_hwfn(cdev, i) + qed_inform_vf_link_state(&cdev->hwfns[i]); +} + +void qed_link_update(struct qed_hwfn *hwfn) +{ + void *cookie = hwfn->cdev->ops_cookie; + struct qed_common_cb_ops *op = hwfn->cdev->protocol_ops.common; + struct qed_link_output if_link; + + qed_fill_link(hwfn, &if_link); + qed_inform_vf_link_state(hwfn); + + if (IS_LEAD_HWFN(hwfn) && cookie) + op->link_update(cookie, &if_link); +} + +static int qed_drain(struct qed_dev *cdev) +{ + struct qed_hwfn *hwfn; + struct qed_ptt *ptt; + int i, rc; + + if (IS_VF(cdev)) + return 0; + + for_each_hwfn(cdev, i) { + hwfn = &cdev->hwfns[i]; + ptt = qed_ptt_acquire(hwfn); + if (!ptt) { + DP_NOTICE(hwfn, "Failed to drain NIG; No PTT\n"); + return -EBUSY; + } + rc = qed_mcp_drain(hwfn, ptt); + if (rc) + return rc; + qed_ptt_release(hwfn, ptt); + } + + return 0; +} + +static u32 qed_nvm_flash_image_access_crc(struct qed_dev *cdev, + struct qed_nvm_image_att *nvm_image, + u32 *crc) +{ + u8 *buf = NULL; + int rc, j; + u32 val; + + /* Allocate a buffer for holding the nvram image */ + buf = kzalloc(nvm_image->length, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Read image into buffer */ + rc = qed_mcp_nvm_read(cdev, nvm_image->start_addr, + buf, nvm_image->length); + if (rc) { + DP_ERR(cdev, "Failed reading image from nvm\n"); + goto out; + } + + /* Convert the buffer into big-endian format (excluding the + * closing 4 bytes of CRC). + */ + for (j = 0; j < nvm_image->length - 4; j += 4) { + val = cpu_to_be32(*(u32 *)&buf[j]); + *(u32 *)&buf[j] = val; + } + + /* Calc CRC for the "actual" image buffer, i.e. not including + * the last 4 CRC bytes. + */ + *crc = (~cpu_to_be32(crc32(0xffffffff, buf, nvm_image->length - 4))); + +out: + kfree(buf); + + return rc; +} + +/* Binary file format - + * /----------------------------------------------------------------------\ + * 0B | 0x4 [command index] | + * 4B | image_type | Options | Number of register settings | + * 8B | Value | + * 12B | Mask | + * 16B | Offset | + * \----------------------------------------------------------------------/ + * There can be several Value-Mask-Offset sets as specified by 'Number of...'. 
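+
+/*
+ * Read-modify-write sketch for one Value/Mask/Offset tuple: only the bits
+ * selected by the mask are taken from the new value, the remaining bits
+ * keep the current NVM contents (mirrors the update applied further below).
+ */
+static u32 example_apply_nvm_mask(u32 cur_value, u32 value, u32 mask)
+{
+        return (value & mask) | (cur_value & ~mask);
+}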
+ * Options - 0'b - Calculate & Update CRC for image + */ +static int qed_nvm_flash_image_access(struct qed_dev *cdev, const u8 **data, + bool *check_resp) +{ + struct qed_nvm_image_att nvm_image; + struct qed_hwfn *p_hwfn; + bool is_crc = false; + u32 image_type; + int rc = 0, i; + u16 len; + + *data += 4; + image_type = **data; + p_hwfn = QED_LEADING_HWFN(cdev); + for (i = 0; i < p_hwfn->nvm_info.num_images; i++) + if (image_type == p_hwfn->nvm_info.image_att[i].image_type) + break; + if (i == p_hwfn->nvm_info.num_images) { + DP_ERR(cdev, "Failed to find nvram image of type %08x\n", + image_type); + return -ENOENT; + } + + nvm_image.start_addr = p_hwfn->nvm_info.image_att[i].nvm_start_addr; + nvm_image.length = p_hwfn->nvm_info.image_att[i].len; + + DP_VERBOSE(cdev, NETIF_MSG_DRV, + "Read image %02x; type = %08x; NVM [%08x,...,%08x]\n", + **data, image_type, nvm_image.start_addr, + nvm_image.start_addr + nvm_image.length - 1); + (*data)++; + is_crc = !!(**data & BIT(0)); + (*data)++; + len = *((u16 *)*data); + *data += 2; + if (is_crc) { + u32 crc = 0; + + rc = qed_nvm_flash_image_access_crc(cdev, &nvm_image, &crc); + if (rc) { + DP_ERR(cdev, "Failed calculating CRC, rc = %d\n", rc); + goto exit; + } + + rc = qed_mcp_nvm_write(cdev, QED_NVM_WRITE_NVRAM, + (nvm_image.start_addr + + nvm_image.length - 4), (u8 *)&crc, 4); + if (rc) + DP_ERR(cdev, "Failed writing to %08x, rc = %d\n", + nvm_image.start_addr + nvm_image.length - 4, rc); + goto exit; + } + + /* Iterate over the values for setting */ + while (len) { + u32 offset, mask, value, cur_value; + u8 buf[4]; + + value = *((u32 *)*data); + *data += 4; + mask = *((u32 *)*data); + *data += 4; + offset = *((u32 *)*data); + *data += 4; + + rc = qed_mcp_nvm_read(cdev, nvm_image.start_addr + offset, buf, + 4); + if (rc) { + DP_ERR(cdev, "Failed reading from %08x\n", + nvm_image.start_addr + offset); + goto exit; + } + + cur_value = le32_to_cpu(*((__le32 *)buf)); + DP_VERBOSE(cdev, NETIF_MSG_DRV, + "NVM %08x: %08x -> %08x [Value %08x Mask %08x]\n", + nvm_image.start_addr + offset, cur_value, + (cur_value & ~mask) | (value & mask), value, mask); + value = (value & mask) | (cur_value & ~mask); + rc = qed_mcp_nvm_write(cdev, QED_NVM_WRITE_NVRAM, + nvm_image.start_addr + offset, + (u8 *)&value, 4); + if (rc) { + DP_ERR(cdev, "Failed writing to %08x\n", + nvm_image.start_addr + offset); + goto exit; + } + + len--; + } +exit: + return rc; +} + +/* Binary file format - + * /----------------------------------------------------------------------\ + * 0B | 0x3 [command index] | + * 4B | b'0: check_response? | b'1-31 reserved | + * 8B | File-type | reserved | + * \----------------------------------------------------------------------/ + * Start a new file of the provided type + */ +static int qed_nvm_flash_image_file_start(struct qed_dev *cdev, + const u8 **data, bool *check_resp) +{ + int rc; + + *data += 4; + *check_resp = !!(**data & BIT(0)); + *data += 4; + + DP_VERBOSE(cdev, NETIF_MSG_DRV, + "About to start a new file of type %02x\n", **data); + rc = qed_mcp_nvm_put_file_begin(cdev, **data); + *data += 4; + + return rc; +} + +/* Binary file format - + * /----------------------------------------------------------------------\ + * 0B | 0x2 [command index] | + * 4B | Length in bytes | + * 8B | b'0: check_response? | b'1-31 reserved | + * 12B | Offset in bytes | + * 16B | Data ... | + * \----------------------------------------------------------------------/ + * Write data as part of a file that was previously started. 
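+
+/*
+ * Cursor-walk sketch matching the batch-file parsers in this block: every
+ * field is a 32-bit word read at the cursor, after which the cursor
+ * advances by four bytes. Helper is illustrative, not driver code.
+ */
+static u32 example_pull_u32(const u8 **data)
+{
+        u32 val = *(const u32 *)*data;
+
+        *data += 4;
+        return val;
+}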
Data should be + * of length equal to that provided in the message + */ +static int qed_nvm_flash_image_file_data(struct qed_dev *cdev, + const u8 **data, bool *check_resp) +{ + u32 offset, len; + int rc; + + *data += 4; + len = *((u32 *)(*data)); + *data += 4; + *check_resp = !!(**data & BIT(0)); + *data += 4; + offset = *((u32 *)(*data)); + *data += 4; + + DP_VERBOSE(cdev, NETIF_MSG_DRV, + "About to write File-data: %08x bytes to offset %08x\n", + len, offset); + + rc = qed_mcp_nvm_write(cdev, QED_PUT_FILE_DATA, offset, + (char *)(*data), len); + *data += len; + + return rc; +} + +/* Binary file format [General header] - + * /----------------------------------------------------------------------\ + * 0B | QED_NVM_SIGNATURE | + * 4B | Length in bytes | + * 8B | Highest command in this batchfile | Reserved | + * \----------------------------------------------------------------------/ + */ +static int qed_nvm_flash_image_validate(struct qed_dev *cdev, + const struct firmware *image, + const u8 **data) +{ + u32 signature, len; + + /* Check minimum size */ + if (image->size < 12) { + DP_ERR(cdev, "Image is too short [%08x]\n", (u32)image->size); + return -EINVAL; + } + + /* Check signature */ + signature = *((u32 *)(*data)); + if (signature != QED_NVM_SIGNATURE) { + DP_ERR(cdev, "Wrong signature '%08x'\n", signature); + return -EINVAL; + } + + *data += 4; + /* Validate internal size equals the image-size */ + len = *((u32 *)(*data)); + if (len != image->size) { + DP_ERR(cdev, "Size mismatch: internal = %08x image = %08x\n", + len, (u32)image->size); + return -EINVAL; + } + + *data += 4; + /* Make sure driver familiar with all commands necessary for this */ + if (*((u16 *)(*data)) >= QED_NVM_FLASH_CMD_NVM_MAX) { + DP_ERR(cdev, "File contains unsupported commands [Need %04x]\n", + *((u16 *)(*data))); + return -EINVAL; + } + + *data += 4; + + return 0; +} + +static int qed_nvm_flash(struct qed_dev *cdev, const char *name) +{ + const struct firmware *image; + const u8 *data, *data_end; + u32 cmd_type; + int rc; + + rc = request_firmware(&image, name, &cdev->pdev->dev); + if (rc) { + DP_ERR(cdev, "Failed to find '%s'\n", name); + return rc; + } + + DP_VERBOSE(cdev, NETIF_MSG_DRV, + "Flashing '%s' - firmware's data at %p, size is %08x\n", + name, image->data, (u32)image->size); + data = image->data; + data_end = data + image->size; + + rc = qed_nvm_flash_image_validate(cdev, image, &data); + if (rc) + goto exit; + + while (data < data_end) { + bool check_resp = false; + + /* Parse the actual command */ + cmd_type = *((u32 *)data); + switch (cmd_type) { + case QED_NVM_FLASH_CMD_FILE_DATA: + rc = qed_nvm_flash_image_file_data(cdev, &data, + &check_resp); + break; + case QED_NVM_FLASH_CMD_FILE_START: + rc = qed_nvm_flash_image_file_start(cdev, &data, + &check_resp); + break; + case QED_NVM_FLASH_CMD_NVM_CHANGE: + rc = qed_nvm_flash_image_access(cdev, &data, + &check_resp); + break; + default: + DP_ERR(cdev, "Unknown command %08x\n", cmd_type); + rc = -EINVAL; + goto exit; + } + + if (rc) { + DP_ERR(cdev, "Command %08x failed\n", cmd_type); + goto exit; + } + + /* Check response if needed */ + if (check_resp) { + u32 mcp_response = 0; + + if (qed_mcp_nvm_resp(cdev, (u8 *)&mcp_response)) { + DP_ERR(cdev, "Failed getting MCP response\n"); + rc = -EINVAL; + goto exit; + } + + switch (mcp_response & FW_MSG_CODE_MASK) { + case FW_MSG_CODE_OK: + case FW_MSG_CODE_NVM_OK: + case FW_MSG_CODE_NVM_PUT_FILE_FINISH_OK: + case FW_MSG_CODE_PHY_OK: + break; + default: + DP_ERR(cdev, "MFW returns error: %08x\n", + 
mcp_response); + rc = -EINVAL; + goto exit; + } + } + } + +exit: + release_firmware(image); + + return rc; +} + +static int qed_nvm_get_image(struct qed_dev *cdev, enum qed_nvm_images type, + u8 *buf, u16 len) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *ptt = qed_ptt_acquire(hwfn); + int rc; + + if (!ptt) + return -EAGAIN; + + rc = qed_mcp_get_nvm_image(hwfn, ptt, type, buf, len); + qed_ptt_release(hwfn, ptt); + return rc; +} + +static int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, + void *handle) +{ + return qed_set_queue_coalesce(rx_coal, tx_coal, handle); +} + +static int qed_set_led(struct qed_dev *cdev, enum qed_led_mode mode) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *ptt; + int status = 0; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EAGAIN; + + status = qed_mcp_set_led(hwfn, ptt, mode); + + qed_ptt_release(hwfn, ptt); + + return status; +} + +static int qed_update_wol(struct qed_dev *cdev, bool enabled) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *ptt; + int rc = 0; + + if (IS_VF(cdev)) + return 0; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EAGAIN; + + rc = qed_mcp_ov_update_wol(hwfn, ptt, enabled ? QED_OV_WOL_ENABLED + : QED_OV_WOL_DISABLED); + if (rc) + goto out; + rc = qed_mcp_ov_update_current_config(hwfn, ptt, QED_OV_CLIENT_DRV); + +out: + qed_ptt_release(hwfn, ptt); + return rc; +} + +static int qed_update_drv_state(struct qed_dev *cdev, bool active) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *ptt; + int status = 0; + + if (IS_VF(cdev)) + return 0; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EAGAIN; + + status = qed_mcp_ov_update_driver_state(hwfn, ptt, active ? + QED_OV_DRIVER_STATE_ACTIVE : + QED_OV_DRIVER_STATE_DISABLED); + + qed_ptt_release(hwfn, ptt); + + return status; +} + +static int qed_update_mac(struct qed_dev *cdev, u8 *mac) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *ptt; + int status = 0; + + if (IS_VF(cdev)) + return 0; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EAGAIN; + + status = qed_mcp_ov_update_mac(hwfn, ptt, mac); + if (status) + goto out; + + status = qed_mcp_ov_update_current_config(hwfn, ptt, QED_OV_CLIENT_DRV); + +out: + qed_ptt_release(hwfn, ptt); + return status; +} + +static int qed_update_mtu(struct qed_dev *cdev, u16 mtu) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *ptt; + int status = 0; + + if (IS_VF(cdev)) + return 0; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EAGAIN; + + status = qed_mcp_ov_update_mtu(hwfn, ptt, mtu); + if (status) + goto out; + + status = qed_mcp_ov_update_current_config(hwfn, ptt, QED_OV_CLIENT_DRV); + +out: + qed_ptt_release(hwfn, ptt); + return status; +} + +static struct qed_selftest_ops qed_selftest_ops_pass = { + .selftest_memory = &qed_selftest_memory, + .selftest_interrupt = &qed_selftest_interrupt, + .selftest_register = &qed_selftest_register, + .selftest_clock = &qed_selftest_clock, + .selftest_nvram = &qed_selftest_nvram, +}; + +const struct qed_common_ops qed_common_ops_pass = { + .selftest = &qed_selftest_ops_pass, + .probe = &qed_probe, + .remove = &qed_remove, + .set_power_state = &qed_set_power_state, + .set_name = &qed_set_name, + .update_pf_params = &qed_update_pf_params, + .slowpath_start = &qed_slowpath_start, + .slowpath_stop = &qed_slowpath_stop, + .set_fp_int = &qed_set_int_fp, + .get_fp_int = &qed_get_int_fp, + .sb_init = &qed_sb_init, + .sb_release = 
&qed_sb_release, + .simd_handler_config = &qed_simd_handler_config, + .simd_handler_clean = &qed_simd_handler_clean, + .dbg_grc = &qed_dbg_grc, + .dbg_grc_size = &qed_dbg_grc_size, + .can_link_change = &qed_can_link_change, + .set_link = &qed_set_link, + .get_link = &qed_get_current_link, + .drain = &qed_drain, + .update_msglvl = &qed_init_dp, + .dbg_all_data = &qed_dbg_all_data, + .dbg_all_data_size = &qed_dbg_all_data_size, + .chain_alloc = &qed_chain_alloc, + .chain_free = &qed_chain_free, + .nvm_flash = &qed_nvm_flash, + .nvm_get_image = &qed_nvm_get_image, + .set_coalesce = &qed_set_coalesce, + .set_led = &qed_set_led, + .update_drv_state = &qed_update_drv_state, + .update_mac = &qed_update_mac, + .update_mtu = &qed_update_mtu, + .update_wol = &qed_update_wol, +}; + +void qed_get_protocol_stats(struct qed_dev *cdev, + enum qed_mcp_protocol_type type, + union qed_mcp_protocol_stats *stats) +{ + struct qed_eth_stats eth_stats; + + memset(stats, 0, sizeof(*stats)); + + switch (type) { + case QED_MCP_LAN_STATS: + qed_get_vport_stats(cdev, &eth_stats); + stats->lan_stats.ucast_rx_pkts = + eth_stats.common.rx_ucast_pkts; + stats->lan_stats.ucast_tx_pkts = + eth_stats.common.tx_ucast_pkts; + stats->lan_stats.fcs_err = -1; + break; + case QED_MCP_FCOE_STATS: + qed_get_protocol_stats_fcoe(cdev, &stats->fcoe_stats); + break; + case QED_MCP_ISCSI_STATS: + qed_get_protocol_stats_iscsi(cdev, &stats->iscsi_stats); + break; + default: + DP_VERBOSE(cdev, QED_MSG_SP, + "Invalid protocol type = %d\n", type); + return; + } +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c new file mode 100644 index 0000000..ec0d425 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -0,0 +1,3055 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_dcbx.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" +#include "qed_sriov.h" + +#define CHIP_MCP_RESP_ITER_US 10 + +#define QED_DRV_MB_MAX_RETRIES (500 * 1000) /* Account for 5 sec */ +#define QED_MCP_RESET_RETRIES (50 * 1000) /* Account for 500 msec */ + +#define DRV_INNER_WR(_p_hwfn, _p_ptt, _ptr, _offset, _val) \ + qed_wr(_p_hwfn, _p_ptt, (_p_hwfn->mcp_info->_ptr + _offset), \ + _val) + +#define DRV_INNER_RD(_p_hwfn, _p_ptt, _ptr, _offset) \ + qed_rd(_p_hwfn, _p_ptt, (_p_hwfn->mcp_info->_ptr + _offset)) + +#define DRV_MB_WR(_p_hwfn, _p_ptt, _field, _val) \ + DRV_INNER_WR(p_hwfn, _p_ptt, drv_mb_addr, \ + offsetof(struct public_drv_mb, _field), _val) + +#define DRV_MB_RD(_p_hwfn, _p_ptt, _field) \ + DRV_INNER_RD(_p_hwfn, _p_ptt, drv_mb_addr, \ + offsetof(struct public_drv_mb, _field)) + +#define PDA_COMP (((FW_MAJOR_VERSION) + (FW_MINOR_VERSION << 8)) << \ + DRV_ID_PDA_COMP_VER_SHIFT) + +#define MCP_BYTES_PER_MBIT_SHIFT 17 + +bool qed_mcp_is_init(struct qed_hwfn *p_hwfn) +{ + if (!p_hwfn->mcp_info || !p_hwfn->mcp_info->public_base) + return false; + return true; +} + +void qed_mcp_cmd_port_init(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base, + PUBLIC_PORT); + u32 mfw_mb_offsize = qed_rd(p_hwfn, p_ptt, addr); + + p_hwfn->mcp_info->port_addr = SECTION_ADDR(mfw_mb_offsize, + MFW_PORT(p_hwfn)); + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "port_addr = 0x%x, port_id 0x%02x\n", + p_hwfn->mcp_info->port_addr, MFW_PORT(p_hwfn)); +} + +void qed_mcp_read_mb(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 length = MFW_DRV_MSG_MAX_DWORDS(p_hwfn->mcp_info->mfw_mb_length); + u32 tmp, i; + + if (!p_hwfn->mcp_info->public_base) + return; + + for (i = 0; i < length; i++) { + tmp = qed_rd(p_hwfn, p_ptt, + p_hwfn->mcp_info->mfw_mb_addr + + (i << 2) + sizeof(u32)); + + /* The MB data is actually BE; Need to force it to cpu */ + ((u32 *)p_hwfn->mcp_info->mfw_mb_cur)[i] = + be32_to_cpu((__force __be32)tmp); + } +} + +struct qed_mcp_cmd_elem { + struct list_head list; + struct qed_mcp_mb_params *p_mb_params; + u16 expected_seq_num; + bool b_is_completed; +}; + +/* Must be called while cmd_lock is acquired */ +static struct qed_mcp_cmd_elem * +qed_mcp_cmd_add_elem(struct qed_hwfn *p_hwfn, + struct qed_mcp_mb_params *p_mb_params, + u16 expected_seq_num) +{ + struct qed_mcp_cmd_elem *p_cmd_elem = NULL; + + p_cmd_elem = kzalloc(sizeof(*p_cmd_elem), GFP_ATOMIC); + if (!p_cmd_elem) + goto out; + + p_cmd_elem->p_mb_params = p_mb_params; + p_cmd_elem->expected_seq_num = expected_seq_num; + list_add(&p_cmd_elem->list, &p_hwfn->mcp_info->cmd_list); +out: + return p_cmd_elem; +} + +/* Must be called while cmd_lock is acquired */ +static void qed_mcp_cmd_del_elem(struct qed_hwfn *p_hwfn, + struct qed_mcp_cmd_elem *p_cmd_elem) +{ + list_del(&p_cmd_elem->list); + kfree(p_cmd_elem); +} + +/* Must be called while cmd_lock is acquired */ +static struct qed_mcp_cmd_elem *qed_mcp_cmd_get_elem(struct qed_hwfn *p_hwfn, + u16 seq_num) +{ + struct qed_mcp_cmd_elem *p_cmd_elem = NULL; + + list_for_each_entry(p_cmd_elem, &p_hwfn->mcp_info->cmd_list, list) { + if (p_cmd_elem->expected_seq_num == seq_num) + return p_cmd_elem; + } + + return NULL; +} + +int qed_mcp_free(struct qed_hwfn *p_hwfn) +{ + if (p_hwfn->mcp_info) { + struct qed_mcp_cmd_elem *p_cmd_elem, *p_tmp; + + 
kfree(p_hwfn->mcp_info->mfw_mb_cur); + kfree(p_hwfn->mcp_info->mfw_mb_shadow); + + spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); + list_for_each_entry_safe(p_cmd_elem, + p_tmp, + &p_hwfn->mcp_info->cmd_list, list) { + qed_mcp_cmd_del_elem(p_hwfn, p_cmd_elem); + } + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + } + + kfree(p_hwfn->mcp_info); + p_hwfn->mcp_info = NULL; + + return 0; +} + +static int qed_load_mcp_offsets(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_mcp_info *p_info = p_hwfn->mcp_info; + u32 drv_mb_offsize, mfw_mb_offsize; + u32 mcp_pf_id = MCP_PF_ID(p_hwfn); + + p_info->public_base = qed_rd(p_hwfn, p_ptt, MISC_REG_SHARED_MEM_ADDR); + if (!p_info->public_base) + return 0; + + p_info->public_base |= GRCBASE_MCP; + + /* Calculate the driver and MFW mailbox address */ + drv_mb_offsize = qed_rd(p_hwfn, p_ptt, + SECTION_OFFSIZE_ADDR(p_info->public_base, + PUBLIC_DRV_MB)); + p_info->drv_mb_addr = SECTION_ADDR(drv_mb_offsize, mcp_pf_id); + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "drv_mb_offsiz = 0x%x, drv_mb_addr = 0x%x mcp_pf_id = 0x%x\n", + drv_mb_offsize, p_info->drv_mb_addr, mcp_pf_id); + + /* Set the MFW MB address */ + mfw_mb_offsize = qed_rd(p_hwfn, p_ptt, + SECTION_OFFSIZE_ADDR(p_info->public_base, + PUBLIC_MFW_MB)); + p_info->mfw_mb_addr = SECTION_ADDR(mfw_mb_offsize, mcp_pf_id); + p_info->mfw_mb_length = (u16)qed_rd(p_hwfn, p_ptt, p_info->mfw_mb_addr); + + /* Get the current driver mailbox sequence before sending + * the first command + */ + p_info->drv_mb_seq = DRV_MB_RD(p_hwfn, p_ptt, drv_mb_header) & + DRV_MSG_SEQ_NUMBER_MASK; + + /* Get current FW pulse sequence */ + p_info->drv_pulse_seq = DRV_MB_RD(p_hwfn, p_ptt, drv_pulse_mb) & + DRV_PULSE_SEQ_MASK; + + p_info->mcp_hist = qed_rd(p_hwfn, p_ptt, MISCS_REG_GENERIC_POR_0); + + return 0; +} + +int qed_mcp_cmd_init(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_mcp_info *p_info; + u32 size; + + /* Allocate mcp_info structure */ + p_hwfn->mcp_info = kzalloc(sizeof(*p_hwfn->mcp_info), GFP_KERNEL); + if (!p_hwfn->mcp_info) + goto err; + p_info = p_hwfn->mcp_info; + + /* Initialize the MFW spinlock */ + spin_lock_init(&p_info->cmd_lock); + spin_lock_init(&p_info->link_lock); + + INIT_LIST_HEAD(&p_info->cmd_list); + + if (qed_load_mcp_offsets(p_hwfn, p_ptt) != 0) { + DP_NOTICE(p_hwfn, "MCP is not initialized\n"); + /* Do not free mcp_info here, since public_base indicate that + * the MCP is not initialized + */ + return 0; + } + + size = MFW_DRV_MSG_MAX_DWORDS(p_info->mfw_mb_length) * sizeof(u32); + p_info->mfw_mb_cur = kzalloc(size, GFP_KERNEL); + p_info->mfw_mb_shadow = kzalloc(size, GFP_KERNEL); + if (!p_info->mfw_mb_cur || !p_info->mfw_mb_shadow) + goto err; + + return 0; + +err: + qed_mcp_free(p_hwfn); + return -ENOMEM; +} + +static void qed_mcp_reread_offsets(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 generic_por_0 = qed_rd(p_hwfn, p_ptt, MISCS_REG_GENERIC_POR_0); + + /* Use MCP history register to check if MCP reset occurred between init + * time and now. 
+ */ + if (p_hwfn->mcp_info->mcp_hist != generic_por_0) { + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "Rereading MCP offsets [mcp_hist 0x%08x, generic_por_0 0x%08x]\n", + p_hwfn->mcp_info->mcp_hist, generic_por_0); + + qed_load_mcp_offsets(p_hwfn, p_ptt); + qed_mcp_cmd_port_init(p_hwfn, p_ptt); + } +} + +int qed_mcp_reset(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 org_mcp_reset_seq, seq, delay = CHIP_MCP_RESP_ITER_US, cnt = 0; + int rc = 0; + + /* Ensure that only a single thread is accessing the mailbox */ + spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); + + org_mcp_reset_seq = qed_rd(p_hwfn, p_ptt, MISCS_REG_GENERIC_POR_0); + + /* Set drv command along with the updated sequence */ + qed_mcp_reread_offsets(p_hwfn, p_ptt); + seq = ++p_hwfn->mcp_info->drv_mb_seq; + DRV_MB_WR(p_hwfn, p_ptt, drv_mb_header, (DRV_MSG_CODE_MCP_RESET | seq)); + + do { + /* Wait for MFW response */ + udelay(delay); + /* Give the FW up to 500 second (50*1000*10usec) */ + } while ((org_mcp_reset_seq == qed_rd(p_hwfn, p_ptt, + MISCS_REG_GENERIC_POR_0)) && + (cnt++ < QED_MCP_RESET_RETRIES)); + + if (org_mcp_reset_seq != + qed_rd(p_hwfn, p_ptt, MISCS_REG_GENERIC_POR_0)) { + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "MCP was reset after %d usec\n", cnt * delay); + } else { + DP_ERR(p_hwfn, "Failed to reset MCP\n"); + rc = -EAGAIN; + } + + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + + return rc; +} + +/* Must be called while cmd_lock is acquired */ +static bool qed_mcp_has_pending_cmd(struct qed_hwfn *p_hwfn) +{ + struct qed_mcp_cmd_elem *p_cmd_elem; + + /* There is at most one pending command at a certain time, and if it + * exists - it is placed at the HEAD of the list. + */ + if (!list_empty(&p_hwfn->mcp_info->cmd_list)) { + p_cmd_elem = list_first_entry(&p_hwfn->mcp_info->cmd_list, + struct qed_mcp_cmd_elem, list); + return !p_cmd_elem->b_is_completed; + } + + return false; +} + +/* Must be called while cmd_lock is acquired */ +static int +qed_mcp_update_pending_cmd(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_mcp_mb_params *p_mb_params; + struct qed_mcp_cmd_elem *p_cmd_elem; + u32 mcp_resp; + u16 seq_num; + + mcp_resp = DRV_MB_RD(p_hwfn, p_ptt, fw_mb_header); + seq_num = (u16)(mcp_resp & FW_MSG_SEQ_NUMBER_MASK); + + /* Return if no new non-handled response has been received */ + if (seq_num != p_hwfn->mcp_info->drv_mb_seq) + return -EAGAIN; + + p_cmd_elem = qed_mcp_cmd_get_elem(p_hwfn, seq_num); + if (!p_cmd_elem) { + DP_ERR(p_hwfn, + "Failed to find a pending mailbox cmd that expects sequence number %d\n", + seq_num); + return -EINVAL; + } + + p_mb_params = p_cmd_elem->p_mb_params; + + /* Get the MFW response along with the sequence number */ + p_mb_params->mcp_resp = mcp_resp; + + /* Get the MFW param */ + p_mb_params->mcp_param = DRV_MB_RD(p_hwfn, p_ptt, fw_mb_param); + + /* Get the union data */ + if (p_mb_params->p_data_dst != NULL && p_mb_params->data_dst_size) { + u32 union_data_addr = p_hwfn->mcp_info->drv_mb_addr + + offsetof(struct public_drv_mb, + union_data); + qed_memcpy_from(p_hwfn, p_ptt, p_mb_params->p_data_dst, + union_data_addr, p_mb_params->data_dst_size); + } + + p_cmd_elem->b_is_completed = true; + + return 0; +} + +/* Must be called while cmd_lock is acquired */ +static void __qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_mb_params *p_mb_params, + u16 seq_num) +{ + union drv_union_data union_data; + u32 union_data_addr; + + /* Set the union data */ + union_data_addr = p_hwfn->mcp_info->drv_mb_addr + + offsetof(struct public_drv_mb, 
union_data); + memset(&union_data, 0, sizeof(union_data)); + if (p_mb_params->p_data_src != NULL && p_mb_params->data_src_size) + memcpy(&union_data, p_mb_params->p_data_src, + p_mb_params->data_src_size); + qed_memcpy_to(p_hwfn, p_ptt, union_data_addr, &union_data, + sizeof(union_data)); + + /* Set the drv param */ + DRV_MB_WR(p_hwfn, p_ptt, drv_mb_param, p_mb_params->param); + + /* Set the drv command along with the sequence number */ + DRV_MB_WR(p_hwfn, p_ptt, drv_mb_header, (p_mb_params->cmd | seq_num)); + + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "MFW mailbox: command 0x%08x param 0x%08x\n", + (p_mb_params->cmd | seq_num), p_mb_params->param); +} + +static int +_qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_mb_params *p_mb_params, + u32 max_retries, u32 delay) +{ + struct qed_mcp_cmd_elem *p_cmd_elem; + u32 cnt = 0; + u16 seq_num; + int rc = 0; + + /* Wait until the mailbox is non-occupied */ + do { + /* Exit the loop if there is no pending command, or if the + * pending command is completed during this iteration. + * The spinlock stays locked until the command is sent. + */ + + spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); + + if (!qed_mcp_has_pending_cmd(p_hwfn)) + break; + + rc = qed_mcp_update_pending_cmd(p_hwfn, p_ptt); + if (!rc) + break; + else if (rc != -EAGAIN) + goto err; + + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + udelay(delay); + } while (++cnt < max_retries); + + if (cnt >= max_retries) { + DP_NOTICE(p_hwfn, + "The MFW mailbox is occupied by an uncompleted command. Failed to send command 0x%08x [param 0x%08x].\n", + p_mb_params->cmd, p_mb_params->param); + return -EAGAIN; + } + + /* Send the mailbox command */ + qed_mcp_reread_offsets(p_hwfn, p_ptt); + seq_num = ++p_hwfn->mcp_info->drv_mb_seq; + p_cmd_elem = qed_mcp_cmd_add_elem(p_hwfn, p_mb_params, seq_num); + if (!p_cmd_elem) { + rc = -ENOMEM; + goto err; + } + + __qed_mcp_cmd_and_union(p_hwfn, p_ptt, p_mb_params, seq_num); + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + + /* Wait for the MFW response */ + do { + /* Exit the loop if the command is already completed, or if the + * command is completed during this iteration. + * The spinlock stays locked until the list element is removed. 
+ */ + + udelay(delay); + spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); + + if (p_cmd_elem->b_is_completed) + break; + + rc = qed_mcp_update_pending_cmd(p_hwfn, p_ptt); + if (!rc) + break; + else if (rc != -EAGAIN) + goto err; + + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + } while (++cnt < max_retries); + + if (cnt >= max_retries) { + DP_NOTICE(p_hwfn, + "The MFW failed to respond to command 0x%08x [param 0x%08x].\n", + p_mb_params->cmd, p_mb_params->param); + + spin_lock_bh(&p_hwfn->mcp_info->cmd_lock); + qed_mcp_cmd_del_elem(p_hwfn, p_cmd_elem); + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + + return -EAGAIN; + } + + qed_mcp_cmd_del_elem(p_hwfn, p_cmd_elem); + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "MFW mailbox: response 0x%08x param 0x%08x [after %d.%03d ms]\n", + p_mb_params->mcp_resp, + p_mb_params->mcp_param, + (cnt * delay) / 1000, (cnt * delay) % 1000); + + /* Clear the sequence number from the MFW response */ + p_mb_params->mcp_resp &= FW_MSG_CODE_MASK; + + return 0; + +err: + spin_unlock_bh(&p_hwfn->mcp_info->cmd_lock); + return rc; +} + +static int qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_mb_params *p_mb_params) +{ + size_t union_data_size = sizeof(union drv_union_data); + u32 max_retries = QED_DRV_MB_MAX_RETRIES; + u32 delay = CHIP_MCP_RESP_ITER_US; + + /* MCP not initialized */ + if (!qed_mcp_is_init(p_hwfn)) { + DP_NOTICE(p_hwfn, "MFW is not initialized!\n"); + return -EBUSY; + } + + if (p_mb_params->data_src_size > union_data_size || + p_mb_params->data_dst_size > union_data_size) { + DP_ERR(p_hwfn, + "The provided size is larger than the union data size [src_size %u, dst_size %u, union_data_size %zu]\n", + p_mb_params->data_src_size, + p_mb_params->data_dst_size, union_data_size); + return -EINVAL; + } + + return _qed_mcp_cmd_and_union(p_hwfn, p_ptt, p_mb_params, max_retries, + delay); +} + +int qed_mcp_cmd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 cmd, + u32 param, + u32 *o_mcp_resp, + u32 *o_mcp_param) +{ + struct qed_mcp_mb_params mb_params; + int rc; + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = cmd; + mb_params.param = param; + + rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); + if (rc) + return rc; + + *o_mcp_resp = mb_params.mcp_resp; + *o_mcp_param = mb_params.mcp_param; + + return 0; +} + +int qed_mcp_nvm_wr_cmd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 cmd, + u32 param, + u32 *o_mcp_resp, + u32 *o_mcp_param, u32 i_txn_size, u32 *i_buf) +{ + struct qed_mcp_mb_params mb_params; + int rc; + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = cmd; + mb_params.param = param; + mb_params.p_data_src = i_buf; + mb_params.data_src_size = (u8)i_txn_size; + rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); + if (rc) + return rc; + + *o_mcp_resp = mb_params.mcp_resp; + *o_mcp_param = mb_params.mcp_param; + + return 0; +} + +int qed_mcp_nvm_rd_cmd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 cmd, + u32 param, + u32 *o_mcp_resp, + u32 *o_mcp_param, u32 *o_txn_size, u32 *o_buf) +{ + struct qed_mcp_mb_params mb_params; + u8 raw_data[MCP_DRV_NVM_BUF_LEN]; + int rc; + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = cmd; + mb_params.param = param; + mb_params.p_data_dst = raw_data; + + /* Use the maximal value since the actual one is part of the response */ + mb_params.data_dst_size = MCP_DRV_NVM_BUF_LEN; + + rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); + if (rc) + return rc; + + 
*o_mcp_resp = mb_params.mcp_resp; + *o_mcp_param = mb_params.mcp_param; + + *o_txn_size = *o_mcp_param; + memcpy(o_buf, raw_data, *o_txn_size); + + return 0; +} + +static bool +qed_mcp_can_force_load(u8 drv_role, + u8 exist_drv_role, + enum qed_override_force_load override_force_load) +{ + bool can_force_load = false; + + switch (override_force_load) { + case QED_OVERRIDE_FORCE_LOAD_ALWAYS: + can_force_load = true; + break; + case QED_OVERRIDE_FORCE_LOAD_NEVER: + can_force_load = false; + break; + default: + can_force_load = (drv_role == DRV_ROLE_OS && + exist_drv_role == DRV_ROLE_PREBOOT) || + (drv_role == DRV_ROLE_KDUMP && + exist_drv_role == DRV_ROLE_OS); + break; + } + + return can_force_load; +} + +static int qed_mcp_cancel_load_req(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 resp = 0, param = 0; + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_CANCEL_LOAD_REQ, 0, + &resp, &param); + if (rc) + DP_NOTICE(p_hwfn, + "Failed to send cancel load request, rc = %d\n", rc); + + return rc; +} + +#define CONFIG_QEDE_BITMAP_IDX BIT(0) +#define CONFIG_QED_SRIOV_BITMAP_IDX BIT(1) +#define CONFIG_QEDR_BITMAP_IDX BIT(2) +#define CONFIG_QEDF_BITMAP_IDX BIT(4) +#define CONFIG_QEDI_BITMAP_IDX BIT(5) +#define CONFIG_QED_LL2_BITMAP_IDX BIT(6) + +static u32 qed_get_config_bitmap(void) +{ + u32 config_bitmap = 0x0; + + if (IS_ENABLED(CONFIG_QEDE)) + config_bitmap |= CONFIG_QEDE_BITMAP_IDX; + + if (IS_ENABLED(CONFIG_QED_SRIOV)) + config_bitmap |= CONFIG_QED_SRIOV_BITMAP_IDX; + + if (IS_ENABLED(CONFIG_QED_RDMA)) + config_bitmap |= CONFIG_QEDR_BITMAP_IDX; + + if (IS_ENABLED(CONFIG_QED_FCOE)) + config_bitmap |= CONFIG_QEDF_BITMAP_IDX; + + if (IS_ENABLED(CONFIG_QED_ISCSI)) + config_bitmap |= CONFIG_QEDI_BITMAP_IDX; + + if (IS_ENABLED(CONFIG_QED_LL2)) + config_bitmap |= CONFIG_QED_LL2_BITMAP_IDX; + + return config_bitmap; +} + +struct qed_load_req_in_params { + u8 hsi_ver; +#define QED_LOAD_REQ_HSI_VER_DEFAULT 0 +#define QED_LOAD_REQ_HSI_VER_1 1 + u32 drv_ver_0; + u32 drv_ver_1; + u32 fw_ver; + u8 drv_role; + u8 timeout_val; + u8 force_cmd; + bool avoid_eng_reset; +}; + +struct qed_load_req_out_params { + u32 load_code; + u32 exist_drv_ver_0; + u32 exist_drv_ver_1; + u32 exist_fw_ver; + u8 exist_drv_role; + u8 mfw_hsi_ver; + bool drv_exists; +}; + +static int +__qed_mcp_load_req(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_load_req_in_params *p_in_params, + struct qed_load_req_out_params *p_out_params) +{ + struct qed_mcp_mb_params mb_params; + struct load_req_stc load_req; + struct load_rsp_stc load_rsp; + u32 hsi_ver; + int rc; + + memset(&load_req, 0, sizeof(load_req)); + load_req.drv_ver_0 = p_in_params->drv_ver_0; + load_req.drv_ver_1 = p_in_params->drv_ver_1; + load_req.fw_ver = p_in_params->fw_ver; + QED_MFW_SET_FIELD(load_req.misc0, LOAD_REQ_ROLE, p_in_params->drv_role); + QED_MFW_SET_FIELD(load_req.misc0, LOAD_REQ_LOCK_TO, + p_in_params->timeout_val); + QED_MFW_SET_FIELD(load_req.misc0, LOAD_REQ_FORCE, + p_in_params->force_cmd); + QED_MFW_SET_FIELD(load_req.misc0, LOAD_REQ_FLAGS0, + p_in_params->avoid_eng_reset); + + hsi_ver = (p_in_params->hsi_ver == QED_LOAD_REQ_HSI_VER_DEFAULT) ?
+ DRV_ID_MCP_HSI_VER_CURRENT : + (p_in_params->hsi_ver << DRV_ID_MCP_HSI_VER_SHIFT); + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = DRV_MSG_CODE_LOAD_REQ; + mb_params.param = PDA_COMP | hsi_ver | p_hwfn->cdev->drv_type; + mb_params.p_data_src = &load_req; + mb_params.data_src_size = sizeof(load_req); + mb_params.p_data_dst = &load_rsp; + mb_params.data_dst_size = sizeof(load_rsp); + + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "Load Request: param 0x%08x [init_hw %d, drv_type %d, hsi_ver %d, pda 0x%04x]\n", + mb_params.param, + QED_MFW_GET_FIELD(mb_params.param, DRV_ID_DRV_INIT_HW), + QED_MFW_GET_FIELD(mb_params.param, DRV_ID_DRV_TYPE), + QED_MFW_GET_FIELD(mb_params.param, DRV_ID_MCP_HSI_VER), + QED_MFW_GET_FIELD(mb_params.param, DRV_ID_PDA_COMP_VER)); + + if (p_in_params->hsi_ver != QED_LOAD_REQ_HSI_VER_1) { + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "Load Request: drv_ver 0x%08x_0x%08x, fw_ver 0x%08x, misc0 0x%08x [role %d, timeout %d, force %d, flags0 0x%x]\n", + load_req.drv_ver_0, + load_req.drv_ver_1, + load_req.fw_ver, + load_req.misc0, + QED_MFW_GET_FIELD(load_req.misc0, LOAD_REQ_ROLE), + QED_MFW_GET_FIELD(load_req.misc0, + LOAD_REQ_LOCK_TO), + QED_MFW_GET_FIELD(load_req.misc0, LOAD_REQ_FORCE), + QED_MFW_GET_FIELD(load_req.misc0, LOAD_REQ_FLAGS0)); + } + + rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); + if (rc) { + DP_NOTICE(p_hwfn, "Failed to send load request, rc = %d\n", rc); + return rc; + } + + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "Load Response: resp 0x%08x\n", mb_params.mcp_resp); + p_out_params->load_code = mb_params.mcp_resp; + + if (p_in_params->hsi_ver != QED_LOAD_REQ_HSI_VER_1 && + p_out_params->load_code != FW_MSG_CODE_DRV_LOAD_REFUSED_HSI_1) { + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "Load Response: exist_drv_ver 0x%08x_0x%08x, exist_fw_ver 0x%08x, misc0 0x%08x [exist_role %d, mfw_hsi %d, flags0 0x%x]\n", + load_rsp.drv_ver_0, + load_rsp.drv_ver_1, + load_rsp.fw_ver, + load_rsp.misc0, + QED_MFW_GET_FIELD(load_rsp.misc0, LOAD_RSP_ROLE), + QED_MFW_GET_FIELD(load_rsp.misc0, LOAD_RSP_HSI), + QED_MFW_GET_FIELD(load_rsp.misc0, LOAD_RSP_FLAGS0)); + + p_out_params->exist_drv_ver_0 = load_rsp.drv_ver_0; + p_out_params->exist_drv_ver_1 = load_rsp.drv_ver_1; + p_out_params->exist_fw_ver = load_rsp.fw_ver; + p_out_params->exist_drv_role = + QED_MFW_GET_FIELD(load_rsp.misc0, LOAD_RSP_ROLE); + p_out_params->mfw_hsi_ver = + QED_MFW_GET_FIELD(load_rsp.misc0, LOAD_RSP_HSI); + p_out_params->drv_exists = + QED_MFW_GET_FIELD(load_rsp.misc0, LOAD_RSP_FLAGS0) & + LOAD_RSP_FLAGS0_DRV_EXISTS; + } + + return 0; +} + +static int eocre_get_mfw_drv_role(struct qed_hwfn *p_hwfn, + enum qed_drv_role drv_role, + u8 *p_mfw_drv_role) +{ + switch (drv_role) { + case QED_DRV_ROLE_OS: + *p_mfw_drv_role = DRV_ROLE_OS; + break; + case QED_DRV_ROLE_KDUMP: + *p_mfw_drv_role = DRV_ROLE_KDUMP; + break; + default: + DP_ERR(p_hwfn, "Unexpected driver role %d\n", drv_role); + return -EINVAL; + } + + return 0; +} + +enum qed_load_req_force { + QED_LOAD_REQ_FORCE_NONE, + QED_LOAD_REQ_FORCE_PF, + QED_LOAD_REQ_FORCE_ALL, +}; + +static void qed_get_mfw_force_cmd(struct qed_hwfn *p_hwfn, + + enum qed_load_req_force force_cmd, + u8 *p_mfw_force_cmd) +{ + switch (force_cmd) { + case QED_LOAD_REQ_FORCE_NONE: + *p_mfw_force_cmd = LOAD_REQ_FORCE_NONE; + break; + case QED_LOAD_REQ_FORCE_PF: + *p_mfw_force_cmd = LOAD_REQ_FORCE_PF; + break; + case QED_LOAD_REQ_FORCE_ALL: + *p_mfw_force_cmd = LOAD_REQ_FORCE_ALL; + break; + } +} + +int qed_mcp_load_req(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct 
qed_load_req_params *p_params) +{ + struct qed_load_req_out_params out_params; + struct qed_load_req_in_params in_params; + u8 mfw_drv_role, mfw_force_cmd; + int rc; + + memset(&in_params, 0, sizeof(in_params)); + in_params.hsi_ver = QED_LOAD_REQ_HSI_VER_DEFAULT; + in_params.drv_ver_0 = QED_VERSION; + in_params.drv_ver_1 = qed_get_config_bitmap(); + in_params.fw_ver = STORM_FW_VERSION; + rc = eocre_get_mfw_drv_role(p_hwfn, p_params->drv_role, &mfw_drv_role); + if (rc) + return rc; + + in_params.drv_role = mfw_drv_role; + in_params.timeout_val = p_params->timeout_val; + qed_get_mfw_force_cmd(p_hwfn, + QED_LOAD_REQ_FORCE_NONE, &mfw_force_cmd); + + in_params.force_cmd = mfw_force_cmd; + in_params.avoid_eng_reset = p_params->avoid_eng_reset; + + memset(&out_params, 0, sizeof(out_params)); + rc = __qed_mcp_load_req(p_hwfn, p_ptt, &in_params, &out_params); + if (rc) + return rc; + + /* First handle cases where another load request should/might be sent: + * - MFW expects the old interface [HSI version = 1] + * - MFW responds that a force load request is required + */ + if (out_params.load_code == FW_MSG_CODE_DRV_LOAD_REFUSED_HSI_1) { + DP_INFO(p_hwfn, + "MFW refused a load request due to HSI > 1. Resending with HSI = 1\n"); + + in_params.hsi_ver = QED_LOAD_REQ_HSI_VER_1; + memset(&out_params, 0, sizeof(out_params)); + rc = __qed_mcp_load_req(p_hwfn, p_ptt, &in_params, &out_params); + if (rc) + return rc; + } else if (out_params.load_code == + FW_MSG_CODE_DRV_LOAD_REFUSED_REQUIRES_FORCE) { + if (qed_mcp_can_force_load(in_params.drv_role, + out_params.exist_drv_role, + p_params->override_force_load)) { + DP_INFO(p_hwfn, + "A force load is required [{role, fw_ver, drv_ver}: loading={%d, 0x%08x, x%08x_0x%08x}, existing={%d, 0x%08x, 0x%08x_0x%08x}]\n", + in_params.drv_role, in_params.fw_ver, + in_params.drv_ver_0, in_params.drv_ver_1, + out_params.exist_drv_role, + out_params.exist_fw_ver, + out_params.exist_drv_ver_0, + out_params.exist_drv_ver_1); + + qed_get_mfw_force_cmd(p_hwfn, + QED_LOAD_REQ_FORCE_ALL, + &mfw_force_cmd); + + in_params.force_cmd = mfw_force_cmd; + memset(&out_params, 0, sizeof(out_params)); + rc = __qed_mcp_load_req(p_hwfn, p_ptt, &in_params, + &out_params); + if (rc) + return rc; + } else { + DP_NOTICE(p_hwfn, + "A force load is required [{role, fw_ver, drv_ver}: loading={%d, 0x%08x, x%08x_0x%08x}, existing={%d, 0x%08x, 0x%08x_0x%08x}] - Avoid\n", + in_params.drv_role, in_params.fw_ver, + in_params.drv_ver_0, in_params.drv_ver_1, + out_params.exist_drv_role, + out_params.exist_fw_ver, + out_params.exist_drv_ver_0, + out_params.exist_drv_ver_1); + DP_NOTICE(p_hwfn, + "Avoid sending a force load request to prevent disruption of active PFs\n"); + + qed_mcp_cancel_load_req(p_hwfn, p_ptt); + return -EBUSY; + } + } + + /* Now handle the other types of responses. + * The "REFUSED_HSI_1" and "REFUSED_REQUIRES_FORCE" responses are not + * expected here after the additional revised load requests were sent. + */ + switch (out_params.load_code) { + case FW_MSG_CODE_DRV_LOAD_ENGINE: + case FW_MSG_CODE_DRV_LOAD_PORT: + case FW_MSG_CODE_DRV_LOAD_FUNCTION: + if (out_params.mfw_hsi_ver != QED_LOAD_REQ_HSI_VER_1 && + out_params.drv_exists) { + /* The role and fw/driver version match, but the PF is + * already loaded and has not been unloaded gracefully. + */ + DP_NOTICE(p_hwfn, + "PF is already loaded\n"); + return -EINVAL; + } + break; + default: + DP_NOTICE(p_hwfn, + "Unexpected refusal to load request [resp 0x%08x]. 
Aborting.\n", + out_params.load_code); + return -EBUSY; + } + + p_params->load_code = out_params.load_code; + + return 0; +} + +int qed_mcp_unload_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 wol_param, mcp_resp, mcp_param; + + switch (p_hwfn->cdev->wol_config) { + case QED_OV_WOL_DISABLED: + wol_param = DRV_MB_PARAM_UNLOAD_WOL_DISABLED; + break; + case QED_OV_WOL_ENABLED: + wol_param = DRV_MB_PARAM_UNLOAD_WOL_ENABLED; + break; + default: + DP_NOTICE(p_hwfn, + "Unknown WoL configuration %02x\n", + p_hwfn->cdev->wol_config); + /* Fallthrough */ + case QED_OV_WOL_DEFAULT: + wol_param = DRV_MB_PARAM_UNLOAD_WOL_MCP; + } + + return qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_UNLOAD_REQ, wol_param, + &mcp_resp, &mcp_param); +} + +int qed_mcp_unload_done(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_mcp_mb_params mb_params; + struct mcp_mac wol_mac; + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = DRV_MSG_CODE_UNLOAD_DONE; + + /* Set the primary MAC if WoL is enabled */ + if (p_hwfn->cdev->wol_config == QED_OV_WOL_ENABLED) { + u8 *p_mac = p_hwfn->cdev->wol_mac; + + memset(&wol_mac, 0, sizeof(wol_mac)); + wol_mac.mac_upper = p_mac[0] << 8 | p_mac[1]; + wol_mac.mac_lower = p_mac[2] << 24 | p_mac[3] << 16 | + p_mac[4] << 8 | p_mac[5]; + + DP_VERBOSE(p_hwfn, + (QED_MSG_SP | NETIF_MSG_IFDOWN), + "Setting WoL MAC: %pM --> [%08x,%08x]\n", + p_mac, wol_mac.mac_upper, wol_mac.mac_lower); + + mb_params.p_data_src = &wol_mac; + mb_params.data_src_size = sizeof(wol_mac); + } + + return qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); +} + +static void qed_mcp_handle_vf_flr(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base, + PUBLIC_PATH); + u32 mfw_path_offsize = qed_rd(p_hwfn, p_ptt, addr); + u32 path_addr = SECTION_ADDR(mfw_path_offsize, + QED_PATH_ID(p_hwfn)); + u32 disabled_vfs[VF_MAX_STATIC / 32]; + int i; + + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "Reading Disabled VF information from [offset %08x], path_addr %08x\n", + mfw_path_offsize, path_addr); + + for (i = 0; i < (VF_MAX_STATIC / 32); i++) { + disabled_vfs[i] = qed_rd(p_hwfn, p_ptt, + path_addr + + offsetof(struct public_path, + mcp_vf_disabled) + + sizeof(u32) * i); + DP_VERBOSE(p_hwfn, (QED_MSG_SP | QED_MSG_IOV), + "FLR-ed VFs [%08x,...,%08x] - %08x\n", + i * 32, (i + 1) * 32 - 1, disabled_vfs[i]); + } + + if (qed_iov_mark_vf_flr(p_hwfn, disabled_vfs)) + qed_schedule_iov(p_hwfn, QED_IOV_WQ_FLR_FLAG); +} + +int qed_mcp_ack_vf_flr(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *vfs_to_ack) +{ + u32 addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base, + PUBLIC_FUNC); + u32 mfw_func_offsize = qed_rd(p_hwfn, p_ptt, addr); + u32 func_addr = SECTION_ADDR(mfw_func_offsize, + MCP_PF_ID(p_hwfn)); + struct qed_mcp_mb_params mb_params; + int rc; + int i; + + for (i = 0; i < (VF_MAX_STATIC / 32); i++) + DP_VERBOSE(p_hwfn, (QED_MSG_SP | QED_MSG_IOV), + "Acking VFs [%08x,...,%08x] - %08x\n", + i * 32, (i + 1) * 32 - 1, vfs_to_ack[i]); + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = DRV_MSG_CODE_VF_DISABLED_DONE; + mb_params.p_data_src = vfs_to_ack; + mb_params.data_src_size = VF_MAX_STATIC / 8; + rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); + if (rc) { + DP_NOTICE(p_hwfn, "Failed to pass ACK for VF flr to MFW\n"); + return -EBUSY; + } + + /* Clear the ACK bits */ + for (i = 0; i < (VF_MAX_STATIC / 32); i++) + qed_wr(p_hwfn, p_ptt, + func_addr + + offsetof(struct public_func, drv_ack_vf_disabled) + + i * 
sizeof(u32), 0); + + return rc; +} + +static void qed_mcp_handle_transceiver_change(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 transceiver_state; + + transceiver_state = qed_rd(p_hwfn, p_ptt, + p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, + transceiver_data)); + + DP_VERBOSE(p_hwfn, + (NETIF_MSG_HW | QED_MSG_SP), + "Received transceiver state update [0x%08x] from mfw [Addr 0x%x]\n", + transceiver_state, + (u32)(p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, transceiver_data))); + + transceiver_state = GET_FIELD(transceiver_state, + ETH_TRANSCEIVER_STATE); + + if (transceiver_state == ETH_TRANSCEIVER_STATE_PRESENT) + DP_NOTICE(p_hwfn, "Transceiver is present.\n"); + else + DP_NOTICE(p_hwfn, "Transceiver is unplugged.\n"); +} + +static void qed_mcp_read_eee_config(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_link_state *p_link) +{ + u32 eee_status, val; + + p_link->eee_adv_caps = 0; + p_link->eee_lp_adv_caps = 0; + eee_status = qed_rd(p_hwfn, + p_ptt, + p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, eee_status)); + p_link->eee_active = !!(eee_status & EEE_ACTIVE_BIT); + val = (eee_status & EEE_LD_ADV_STATUS_MASK) >> EEE_LD_ADV_STATUS_OFFSET; + if (val & EEE_1G_ADV) + p_link->eee_adv_caps |= QED_EEE_1G_ADV; + if (val & EEE_10G_ADV) + p_link->eee_adv_caps |= QED_EEE_10G_ADV; + val = (eee_status & EEE_LP_ADV_STATUS_MASK) >> EEE_LP_ADV_STATUS_OFFSET; + if (val & EEE_1G_ADV) + p_link->eee_lp_adv_caps |= QED_EEE_1G_ADV; + if (val & EEE_10G_ADV) + p_link->eee_lp_adv_caps |= QED_EEE_10G_ADV; +} + +static void qed_mcp_handle_link_change(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, bool b_reset) +{ + struct qed_mcp_link_state *p_link; + u8 max_bw, min_bw; + u32 status = 0; + + /* Prevent SW/attentions from doing this at the same time */ + spin_lock_bh(&p_hwfn->mcp_info->link_lock); + + p_link = &p_hwfn->mcp_info->link_output; + memset(p_link, 0, sizeof(*p_link)); + if (!b_reset) { + status = qed_rd(p_hwfn, p_ptt, + p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, link_status)); + DP_VERBOSE(p_hwfn, (NETIF_MSG_LINK | QED_MSG_SP), + "Received link update [0x%08x] from mfw [Addr 0x%x]\n", + status, + (u32)(p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, link_status))); + } else { + DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, + "Resetting link indications\n"); + goto out; + } + + if (p_hwfn->b_drv_link_init) + p_link->link_up = !!(status & LINK_STATUS_LINK_UP); + else + p_link->link_up = false; + + p_link->full_duplex = true; + switch ((status & LINK_STATUS_SPEED_AND_DUPLEX_MASK)) { + case LINK_STATUS_SPEED_AND_DUPLEX_100G: + p_link->speed = 100000; + break; + case LINK_STATUS_SPEED_AND_DUPLEX_50G: + p_link->speed = 50000; + break; + case LINK_STATUS_SPEED_AND_DUPLEX_40G: + p_link->speed = 40000; + break; + case LINK_STATUS_SPEED_AND_DUPLEX_25G: + p_link->speed = 25000; + break; + case LINK_STATUS_SPEED_AND_DUPLEX_20G: + p_link->speed = 20000; + break; + case LINK_STATUS_SPEED_AND_DUPLEX_10G: + p_link->speed = 10000; + break; + case LINK_STATUS_SPEED_AND_DUPLEX_1000THD: + p_link->full_duplex = false; + /* Fall-through */ + case LINK_STATUS_SPEED_AND_DUPLEX_1000TFD: + p_link->speed = 1000; + break; + default: + p_link->speed = 0; + } + + if (p_link->link_up && p_link->speed) + p_link->line_speed = p_link->speed; + else + p_link->line_speed = 0; + + max_bw = p_hwfn->mcp_info->func_info.bandwidth_max; + min_bw = p_hwfn->mcp_info->func_info.bandwidth_min; + + /* Max bandwidth configuration */ + 
__qed_configure_pf_max_bandwidth(p_hwfn, p_ptt, p_link, max_bw); + + /* Min bandwidth configuration */ + __qed_configure_pf_min_bandwidth(p_hwfn, p_ptt, p_link, min_bw); + qed_configure_vp_wfq_on_link_change(p_hwfn->cdev, p_ptt, + p_link->min_pf_rate); + + p_link->an = !!(status & LINK_STATUS_AUTO_NEGOTIATE_ENABLED); + p_link->an_complete = !!(status & + LINK_STATUS_AUTO_NEGOTIATE_COMPLETE); + p_link->parallel_detection = !!(status & + LINK_STATUS_PARALLEL_DETECTION_USED); + p_link->pfc_enabled = !!(status & LINK_STATUS_PFC_ENABLED); + + p_link->partner_adv_speed |= + (status & LINK_STATUS_LINK_PARTNER_1000TFD_CAPABLE) ? + QED_LINK_PARTNER_SPEED_1G_FD : 0; + p_link->partner_adv_speed |= + (status & LINK_STATUS_LINK_PARTNER_1000THD_CAPABLE) ? + QED_LINK_PARTNER_SPEED_1G_HD : 0; + p_link->partner_adv_speed |= + (status & LINK_STATUS_LINK_PARTNER_10G_CAPABLE) ? + QED_LINK_PARTNER_SPEED_10G : 0; + p_link->partner_adv_speed |= + (status & LINK_STATUS_LINK_PARTNER_20G_CAPABLE) ? + QED_LINK_PARTNER_SPEED_20G : 0; + p_link->partner_adv_speed |= + (status & LINK_STATUS_LINK_PARTNER_25G_CAPABLE) ? + QED_LINK_PARTNER_SPEED_25G : 0; + p_link->partner_adv_speed |= + (status & LINK_STATUS_LINK_PARTNER_40G_CAPABLE) ? + QED_LINK_PARTNER_SPEED_40G : 0; + p_link->partner_adv_speed |= + (status & LINK_STATUS_LINK_PARTNER_50G_CAPABLE) ? + QED_LINK_PARTNER_SPEED_50G : 0; + p_link->partner_adv_speed |= + (status & LINK_STATUS_LINK_PARTNER_100G_CAPABLE) ? + QED_LINK_PARTNER_SPEED_100G : 0; + + p_link->partner_tx_flow_ctrl_en = + !!(status & LINK_STATUS_TX_FLOW_CONTROL_ENABLED); + p_link->partner_rx_flow_ctrl_en = + !!(status & LINK_STATUS_RX_FLOW_CONTROL_ENABLED); + + switch (status & LINK_STATUS_LINK_PARTNER_FLOW_CONTROL_MASK) { + case LINK_STATUS_LINK_PARTNER_SYMMETRIC_PAUSE: + p_link->partner_adv_pause = QED_LINK_PARTNER_SYMMETRIC_PAUSE; + break; + case LINK_STATUS_LINK_PARTNER_ASYMMETRIC_PAUSE: + p_link->partner_adv_pause = QED_LINK_PARTNER_ASYMMETRIC_PAUSE; + break; + case LINK_STATUS_LINK_PARTNER_BOTH_PAUSE: + p_link->partner_adv_pause = QED_LINK_PARTNER_BOTH_PAUSE; + break; + default: + p_link->partner_adv_pause = 0; + } + + p_link->sfp_tx_fault = !!(status & LINK_STATUS_SFP_TX_FAULT); + + if (p_hwfn->mcp_info->capabilities & FW_MB_PARAM_FEATURE_SUPPORT_EEE) + qed_mcp_read_eee_config(p_hwfn, p_ptt, p_link); + + qed_link_update(p_hwfn); +out: + spin_unlock_bh(&p_hwfn->mcp_info->link_lock); +} + +int qed_mcp_set_link(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, bool b_up) +{ + struct qed_mcp_link_params *params = &p_hwfn->mcp_info->link_input; + struct qed_mcp_mb_params mb_params; + struct eth_phy_cfg phy_cfg; + int rc = 0; + u32 cmd; + + /* Set the shmem configuration according to params */ + memset(&phy_cfg, 0, sizeof(phy_cfg)); + cmd = b_up ? DRV_MSG_CODE_INIT_PHY : DRV_MSG_CODE_LINK_RESET; + if (!params->speed.autoneg) + phy_cfg.speed = params->speed.forced_speed; + phy_cfg.pause |= (params->pause.autoneg) ? ETH_PAUSE_AUTONEG : 0; + phy_cfg.pause |= (params->pause.forced_rx) ? ETH_PAUSE_RX : 0; + phy_cfg.pause |= (params->pause.forced_tx) ? 
ETH_PAUSE_TX : 0; + phy_cfg.adv_speed = params->speed.advertised_speeds; + phy_cfg.loopback_mode = params->loopback_mode; + if (p_hwfn->mcp_info->capabilities & FW_MB_PARAM_FEATURE_SUPPORT_EEE) { + if (params->eee.enable) + phy_cfg.eee_cfg |= EEE_CFG_EEE_ENABLED; + if (params->eee.tx_lpi_enable) + phy_cfg.eee_cfg |= EEE_CFG_TX_LPI; + if (params->eee.adv_caps & QED_EEE_1G_ADV) + phy_cfg.eee_cfg |= EEE_CFG_ADV_SPEED_1G; + if (params->eee.adv_caps & QED_EEE_10G_ADV) + phy_cfg.eee_cfg |= EEE_CFG_ADV_SPEED_10G; + phy_cfg.eee_cfg |= (params->eee.tx_lpi_timer << + EEE_TX_TIMER_USEC_OFFSET) & + EEE_TX_TIMER_USEC_MASK; + } + + p_hwfn->b_drv_link_init = b_up; + + if (b_up) { + DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, + "Configuring Link: Speed 0x%08x, Pause 0x%08x, adv_speed 0x%08x, loopback 0x%08x, features 0x%08x\n", + phy_cfg.speed, + phy_cfg.pause, + phy_cfg.adv_speed, + phy_cfg.loopback_mode, + phy_cfg.feature_config_flags); + } else { + DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, + "Resetting link\n"); + } + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = cmd; + mb_params.p_data_src = &phy_cfg; + mb_params.data_src_size = sizeof(phy_cfg); + rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); + + /* if mcp fails to respond we must abort */ + if (rc) { + DP_ERR(p_hwfn, "MCP response failure, aborting\n"); + return rc; + } + + /* Mimic link-change attention, done for several reasons: + * - On reset, there's no guarantee MFW would trigger + * an attention. + * - On initialization, older MFWs might not indicate link change + * during LFA, so we'll never get an UP indication. + */ + qed_mcp_handle_link_change(p_hwfn, p_ptt, !b_up); + + return 0; +} + +static void qed_mcp_send_protocol_stats(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum MFW_DRV_MSG_TYPE type) +{ + enum qed_mcp_protocol_type stats_type; + union qed_mcp_protocol_stats stats; + struct qed_mcp_mb_params mb_params; + u32 hsi_param; + + switch (type) { + case MFW_DRV_MSG_GET_LAN_STATS: + stats_type = QED_MCP_LAN_STATS; + hsi_param = DRV_MSG_CODE_STATS_TYPE_LAN; + break; + case MFW_DRV_MSG_GET_FCOE_STATS: + stats_type = QED_MCP_FCOE_STATS; + hsi_param = DRV_MSG_CODE_STATS_TYPE_FCOE; + break; + case MFW_DRV_MSG_GET_ISCSI_STATS: + stats_type = QED_MCP_ISCSI_STATS; + hsi_param = DRV_MSG_CODE_STATS_TYPE_ISCSI; + break; + case MFW_DRV_MSG_GET_RDMA_STATS: + stats_type = QED_MCP_RDMA_STATS; + hsi_param = DRV_MSG_CODE_STATS_TYPE_RDMA; + break; + default: + DP_NOTICE(p_hwfn, "Invalid protocol type %d\n", type); + return; + } + + qed_get_protocol_stats(p_hwfn->cdev, stats_type, &stats); + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = DRV_MSG_CODE_GET_STATS; + mb_params.param = hsi_param; + mb_params.p_data_src = &stats; + mb_params.data_src_size = sizeof(stats); + qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); +} + +static void qed_read_pf_bandwidth(struct qed_hwfn *p_hwfn, + struct public_func *p_shmem_info) +{ + struct qed_mcp_function_info *p_info; + + p_info = &p_hwfn->mcp_info->func_info; + + p_info->bandwidth_min = (p_shmem_info->config & + FUNC_MF_CFG_MIN_BW_MASK) >> + FUNC_MF_CFG_MIN_BW_SHIFT; + if (p_info->bandwidth_min < 1 || p_info->bandwidth_min > 100) { + DP_INFO(p_hwfn, + "bandwidth minimum out of bounds [%02x]. 
Set to 1\n", + p_info->bandwidth_min); + p_info->bandwidth_min = 1; + } + + p_info->bandwidth_max = (p_shmem_info->config & + FUNC_MF_CFG_MAX_BW_MASK) >> + FUNC_MF_CFG_MAX_BW_SHIFT; + if (p_info->bandwidth_max < 1 || p_info->bandwidth_max > 100) { + DP_INFO(p_hwfn, + "bandwidth maximum out of bounds [%02x]. Set to 100\n", + p_info->bandwidth_max); + p_info->bandwidth_max = 100; + } +} + +static u32 qed_mcp_get_shmem_func(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct public_func *p_data, int pfid) +{ + u32 addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base, + PUBLIC_FUNC); + u32 mfw_path_offsize = qed_rd(p_hwfn, p_ptt, addr); + u32 func_addr = SECTION_ADDR(mfw_path_offsize, pfid); + u32 i, size; + + memset(p_data, 0, sizeof(*p_data)); + + size = min_t(u32, sizeof(*p_data), QED_SECTION_SIZE(mfw_path_offsize)); + for (i = 0; i < size / sizeof(u32); i++) + ((u32 *)p_data)[i] = qed_rd(p_hwfn, p_ptt, + func_addr + (i << 2)); + return size; +} + +static void qed_mcp_update_bw(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_mcp_function_info *p_info; + struct public_func shmem_info; + u32 resp = 0, param = 0; + + qed_mcp_get_shmem_func(p_hwfn, p_ptt, &shmem_info, MCP_PF_ID(p_hwfn)); + + qed_read_pf_bandwidth(p_hwfn, &shmem_info); + + p_info = &p_hwfn->mcp_info->func_info; + + qed_configure_pf_min_bandwidth(p_hwfn->cdev, p_info->bandwidth_min); + qed_configure_pf_max_bandwidth(p_hwfn->cdev, p_info->bandwidth_max); + + /* Acknowledge the MFW */ + qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BW_UPDATE_ACK, 0, &resp, + ¶m); +} + +static void qed_mcp_update_stag(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct public_func shmem_info; + u32 resp = 0, param = 0; + + qed_mcp_get_shmem_func(p_hwfn, p_ptt, &shmem_info, MCP_PF_ID(p_hwfn)); + + p_hwfn->mcp_info->func_info.ovlan = (u16)shmem_info.ovlan_stag & + FUNC_MF_CFG_OV_STAG_MASK; + p_hwfn->hw_info.ovlan = p_hwfn->mcp_info->func_info.ovlan; + if ((p_hwfn->hw_info.hw_mode & BIT(MODE_MF_SD)) && + (p_hwfn->hw_info.ovlan != QED_MCP_VLAN_UNSET)) { + qed_wr(p_hwfn, p_ptt, + NIG_REG_LLH_FUNC_TAG_VALUE, p_hwfn->hw_info.ovlan); + qed_sp_pf_update_stag(p_hwfn); + } + + /* Acknowledge the MFW */ + qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_S_TAG_UPDATE_ACK, 0, + &resp, ¶m); +} + +int qed_mcp_handle_events(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct qed_mcp_info *info = p_hwfn->mcp_info; + int rc = 0; + bool found = false; + u16 i; + + DP_VERBOSE(p_hwfn, QED_MSG_SP, "Received message from MFW\n"); + + /* Read Messages from MFW */ + qed_mcp_read_mb(p_hwfn, p_ptt); + + /* Compare current messages to old ones */ + for (i = 0; i < info->mfw_mb_length; i++) { + if (info->mfw_mb_cur[i] == info->mfw_mb_shadow[i]) + continue; + + found = true; + + DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, + "Msg [%d] - old CMD 0x%02x, new CMD 0x%02x\n", + i, info->mfw_mb_shadow[i], info->mfw_mb_cur[i]); + + switch (i) { + case MFW_DRV_MSG_LINK_CHANGE: + qed_mcp_handle_link_change(p_hwfn, p_ptt, false); + break; + case MFW_DRV_MSG_VF_DISABLED: + qed_mcp_handle_vf_flr(p_hwfn, p_ptt); + break; + case MFW_DRV_MSG_LLDP_DATA_UPDATED: + qed_dcbx_mib_update_event(p_hwfn, p_ptt, + QED_DCBX_REMOTE_LLDP_MIB); + break; + case MFW_DRV_MSG_DCBX_REMOTE_MIB_UPDATED: + qed_dcbx_mib_update_event(p_hwfn, p_ptt, + QED_DCBX_REMOTE_MIB); + break; + case MFW_DRV_MSG_DCBX_OPERATIONAL_MIB_UPDATED: + qed_dcbx_mib_update_event(p_hwfn, p_ptt, + QED_DCBX_OPERATIONAL_MIB); + break; + case MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE: + qed_mcp_handle_transceiver_change(p_hwfn, 
p_ptt); + break; + case MFW_DRV_MSG_GET_LAN_STATS: + case MFW_DRV_MSG_GET_FCOE_STATS: + case MFW_DRV_MSG_GET_ISCSI_STATS: + case MFW_DRV_MSG_GET_RDMA_STATS: + qed_mcp_send_protocol_stats(p_hwfn, p_ptt, i); + break; + case MFW_DRV_MSG_BW_UPDATE: + qed_mcp_update_bw(p_hwfn, p_ptt); + break; + case MFW_DRV_MSG_S_TAG_UPDATE: + qed_mcp_update_stag(p_hwfn, p_ptt); + break; + default: + DP_INFO(p_hwfn, "Unimplemented MFW message %d\n", i); + rc = -EINVAL; + } + } + + /* ACK everything */ + for (i = 0; i < MFW_DRV_MSG_MAX_DWORDS(info->mfw_mb_length); i++) { + __be32 val = cpu_to_be32(((u32 *)info->mfw_mb_cur)[i]); + + /* MFW expects answer in BE, so we force write in that format */ + qed_wr(p_hwfn, p_ptt, + info->mfw_mb_addr + sizeof(u32) + + MFW_DRV_MSG_MAX_DWORDS(info->mfw_mb_length) * + sizeof(u32) + i * sizeof(u32), + (__force u32)val); + } + + if (!found) { + DP_NOTICE(p_hwfn, + "Received an MFW message indication but no new message!\n"); + rc = -EINVAL; + } + + /* Copy the new mfw messages into the shadow */ + memcpy(info->mfw_mb_shadow, info->mfw_mb_cur, info->mfw_mb_length); + + return rc; +} + +int qed_mcp_get_mfw_ver(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *p_mfw_ver, u32 *p_running_bundle_id) +{ + u32 global_offsize; + + if (IS_VF(p_hwfn->cdev)) { + if (p_hwfn->vf_iov_info) { + struct pfvf_acquire_resp_tlv *p_resp; + + p_resp = &p_hwfn->vf_iov_info->acquire_resp; + *p_mfw_ver = p_resp->pfdev_info.mfw_ver; + return 0; + } else { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF requested MFW version prior to ACQUIRE\n"); + return -EINVAL; + } + } + + global_offsize = qed_rd(p_hwfn, p_ptt, + SECTION_OFFSIZE_ADDR(p_hwfn-> + mcp_info->public_base, + PUBLIC_GLOBAL)); + *p_mfw_ver = + qed_rd(p_hwfn, p_ptt, + SECTION_ADDR(global_offsize, + 0) + offsetof(struct public_global, mfw_ver)); + + if (p_running_bundle_id != NULL) { + *p_running_bundle_id = qed_rd(p_hwfn, p_ptt, + SECTION_ADDR(global_offsize, 0) + + offsetof(struct public_global, + running_bundle_id)); + } + + return 0; +} + +int qed_mcp_get_mbi_ver(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *p_mbi_ver) +{ + u32 nvm_cfg_addr, nvm_cfg1_offset, mbi_ver_addr; + + if (IS_VF(p_hwfn->cdev)) + return -EINVAL; + + /* Read the address of the nvm_cfg */ + nvm_cfg_addr = qed_rd(p_hwfn, p_ptt, MISC_REG_GEN_PURP_CR0); + if (!nvm_cfg_addr) { + DP_NOTICE(p_hwfn, "Shared memory not initialized\n"); + return -EINVAL; + } + + /* Read the offset of nvm_cfg1 */ + nvm_cfg1_offset = qed_rd(p_hwfn, p_ptt, nvm_cfg_addr + 4); + + mbi_ver_addr = MCP_REG_SCRATCH + nvm_cfg1_offset + + offsetof(struct nvm_cfg1, glob) + + offsetof(struct nvm_cfg1_glob, mbi_version); + *p_mbi_ver = qed_rd(p_hwfn, p_ptt, + mbi_ver_addr) & + (NVM_CFG1_GLOB_MBI_VERSION_0_MASK | + NVM_CFG1_GLOB_MBI_VERSION_1_MASK | + NVM_CFG1_GLOB_MBI_VERSION_2_MASK); + + return 0; +} + +int qed_mcp_get_media_type(struct qed_dev *cdev, u32 *p_media_type) +{ + struct qed_hwfn *p_hwfn = &cdev->hwfns[0]; + struct qed_ptt *p_ptt; + + if (IS_VF(cdev)) + return -EINVAL; + + if (!qed_mcp_is_init(p_hwfn)) { + DP_NOTICE(p_hwfn, "MFW is not initialized!\n"); + return -EBUSY; + } + + *p_media_type = MEDIA_UNSPECIFIED; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EBUSY; + + *p_media_type = qed_rd(p_hwfn, p_ptt, p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, media_type)); + + qed_ptt_release(p_hwfn, p_ptt); + + return 0; +} + +/* Old MFW has a global configuration for all PFs regarding RDMA support */ +static void +qed_mcp_get_shmem_proto_legacy(struct
qed_hwfn *p_hwfn, + enum qed_pci_personality *p_proto) +{ + /* There wasn't ever a legacy MFW that published iwarp. + * So at this point, this is either plain l2 or RoCE. + */ + if (test_bit(QED_DEV_CAP_ROCE, &p_hwfn->hw_info.device_capabilities)) + *p_proto = QED_PCI_ETH_ROCE; + else + *p_proto = QED_PCI_ETH; + + DP_VERBOSE(p_hwfn, NETIF_MSG_IFUP, + "According to Legacy capabilities, L2 personality is %08x\n", + (u32) *p_proto); +} + +static int +qed_mcp_get_shmem_proto_mfw(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_pci_personality *p_proto) +{ + u32 resp = 0, param = 0; + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, + DRV_MSG_CODE_GET_PF_RDMA_PROTOCOL, 0, &resp, &param); + if (rc) + return rc; + if (resp != FW_MSG_CODE_OK) { + DP_VERBOSE(p_hwfn, NETIF_MSG_IFUP, + "MFW lacks support for command; Returns %08x\n", + resp); + return -EINVAL; + } + + switch (param) { + case FW_MB_PARAM_GET_PF_RDMA_NONE: + *p_proto = QED_PCI_ETH; + break; + case FW_MB_PARAM_GET_PF_RDMA_ROCE: + *p_proto = QED_PCI_ETH_ROCE; + break; + case FW_MB_PARAM_GET_PF_RDMA_IWARP: + *p_proto = QED_PCI_ETH_IWARP; + break; + case FW_MB_PARAM_GET_PF_RDMA_BOTH: + *p_proto = QED_PCI_ETH_RDMA; + break; + default: + DP_NOTICE(p_hwfn, + "MFW answers GET_PF_RDMA_PROTOCOL but param is %08x\n", + param); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, + NETIF_MSG_IFUP, + "According to capabilities, L2 personality is %08x [resp %08x param %08x]\n", + (u32) *p_proto, resp, param); + return 0; +} + +static int +qed_mcp_get_shmem_proto(struct qed_hwfn *p_hwfn, + struct public_func *p_info, + struct qed_ptt *p_ptt, + enum qed_pci_personality *p_proto) +{ + int rc = 0; + + switch (p_info->config & FUNC_MF_CFG_PROTOCOL_MASK) { + case FUNC_MF_CFG_PROTOCOL_ETHERNET: + if (!IS_ENABLED(CONFIG_QED_RDMA)) + *p_proto = QED_PCI_ETH; + else if (qed_mcp_get_shmem_proto_mfw(p_hwfn, p_ptt, p_proto)) + qed_mcp_get_shmem_proto_legacy(p_hwfn, p_proto); + break; + case FUNC_MF_CFG_PROTOCOL_ISCSI: + *p_proto = QED_PCI_ISCSI; + break; + case FUNC_MF_CFG_PROTOCOL_FCOE: + *p_proto = QED_PCI_FCOE; + break; + case FUNC_MF_CFG_PROTOCOL_ROCE: + DP_NOTICE(p_hwfn, "RoCE personality is not a valid value!\n"); + /* Fallthrough */ + default: + rc = -EINVAL; + } + + return rc; +} + +int qed_mcp_fill_shmem_func_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + struct qed_mcp_function_info *info; + struct public_func shmem_info; + + qed_mcp_get_shmem_func(p_hwfn, p_ptt, &shmem_info, MCP_PF_ID(p_hwfn)); + info = &p_hwfn->mcp_info->func_info; + + info->pause_on_host = (shmem_info.config & + FUNC_MF_CFG_PAUSE_ON_HOST_RING) ?
1 : 0; + + if (qed_mcp_get_shmem_proto(p_hwfn, &shmem_info, p_ptt, + &info->protocol)) { + DP_ERR(p_hwfn, "Unknown personality %08x\n", + (u32)(shmem_info.config & FUNC_MF_CFG_PROTOCOL_MASK)); + return -EINVAL; + } + + qed_read_pf_bandwidth(p_hwfn, &shmem_info); + + if (shmem_info.mac_upper || shmem_info.mac_lower) { + info->mac[0] = (u8)(shmem_info.mac_upper >> 8); + info->mac[1] = (u8)(shmem_info.mac_upper); + info->mac[2] = (u8)(shmem_info.mac_lower >> 24); + info->mac[3] = (u8)(shmem_info.mac_lower >> 16); + info->mac[4] = (u8)(shmem_info.mac_lower >> 8); + info->mac[5] = (u8)(shmem_info.mac_lower); + + /* Store primary MAC for later possible WoL */ + memcpy(&p_hwfn->cdev->wol_mac, info->mac, ETH_ALEN); + } else { + DP_NOTICE(p_hwfn, "MAC is 0 in shmem\n"); + } + + info->wwn_port = (u64)shmem_info.fcoe_wwn_port_name_lower | + (((u64)shmem_info.fcoe_wwn_port_name_upper) << 32); + info->wwn_node = (u64)shmem_info.fcoe_wwn_node_name_lower | + (((u64)shmem_info.fcoe_wwn_node_name_upper) << 32); + + info->ovlan = (u16)(shmem_info.ovlan_stag & FUNC_MF_CFG_OV_STAG_MASK); + + info->mtu = (u16)shmem_info.mtu_size; + + p_hwfn->hw_info.b_wol_support = QED_WOL_SUPPORT_NONE; + p_hwfn->cdev->wol_config = (u8)QED_OV_WOL_DEFAULT; + if (qed_mcp_is_init(p_hwfn)) { + u32 resp = 0, param = 0; + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, + DRV_MSG_CODE_OS_WOL, 0, &resp, ¶m); + if (rc) + return rc; + if (resp == FW_MSG_CODE_OS_WOL_SUPPORTED) + p_hwfn->hw_info.b_wol_support = QED_WOL_SUPPORT_PME; + } + + DP_VERBOSE(p_hwfn, (QED_MSG_SP | NETIF_MSG_IFUP), + "Read configuration from shmem: pause_on_host %02x protocol %02x BW [%02x - %02x] MAC %02x:%02x:%02x:%02x:%02x:%02x wwn port %llx node %llx ovlan %04x wol %02x\n", + info->pause_on_host, info->protocol, + info->bandwidth_min, info->bandwidth_max, + info->mac[0], info->mac[1], info->mac[2], + info->mac[3], info->mac[4], info->mac[5], + info->wwn_port, info->wwn_node, + info->ovlan, (u8)p_hwfn->hw_info.b_wol_support); + + return 0; +} + +struct qed_mcp_link_params +*qed_mcp_get_link_params(struct qed_hwfn *p_hwfn) +{ + if (!p_hwfn || !p_hwfn->mcp_info) + return NULL; + return &p_hwfn->mcp_info->link_input; +} + +struct qed_mcp_link_state +*qed_mcp_get_link_state(struct qed_hwfn *p_hwfn) +{ + if (!p_hwfn || !p_hwfn->mcp_info) + return NULL; + return &p_hwfn->mcp_info->link_output; +} + +struct qed_mcp_link_capabilities +*qed_mcp_get_link_capabilities(struct qed_hwfn *p_hwfn) +{ + if (!p_hwfn || !p_hwfn->mcp_info) + return NULL; + return &p_hwfn->mcp_info->link_capabilities; +} + +int qed_mcp_drain(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 resp = 0, param = 0; + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, + DRV_MSG_CODE_NIG_DRAIN, 1000, &resp, ¶m); + + /* Wait for the drain to complete before returning */ + msleep(1020); + + return rc; +} + +int qed_mcp_get_flash_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *p_flash_size) +{ + u32 flash_size; + + if (IS_VF(p_hwfn->cdev)) + return -EINVAL; + + flash_size = qed_rd(p_hwfn, p_ptt, MCP_REG_NVM_CFG4); + flash_size = (flash_size & MCP_REG_NVM_CFG4_FLASH_SIZE) >> + MCP_REG_NVM_CFG4_FLASH_SIZE_SHIFT; + flash_size = (1 << (flash_size + MCP_BYTES_PER_MBIT_SHIFT)); + + *p_flash_size = flash_size; + + return 0; +} + +static int +qed_mcp_config_vf_msix_bb(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 vf_id, u8 num) +{ + u32 resp = 0, param = 0, rc_param = 0; + int rc; + + /* Only Leader can configure MSIX, and need to take CMT into account */ + if (!IS_LEAD_HWFN(p_hwfn)) + return 0; + num 
*= p_hwfn->cdev->num_hwfns; + + param |= (vf_id << DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_SHIFT) & + DRV_MB_PARAM_CFG_VF_MSIX_VF_ID_MASK; + param |= (num << DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_SHIFT) & + DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_MASK; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_CFG_VF_MSIX, param, + &resp, &rc_param); + + if (resp != FW_MSG_CODE_DRV_CFG_VF_MSIX_DONE) { + DP_NOTICE(p_hwfn, "VF[%d]: MFW failed to set MSI-X\n", vf_id); + rc = -EINVAL; + } else { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Requested 0x%02x MSI-x interrupts from VF 0x%02x\n", + num, vf_id); + } + + return rc; +} + +static int +qed_mcp_config_vf_msix_ah(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 num) +{ + u32 resp = 0, param = num, rc_param = 0; + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_CFG_PF_VFS_MSIX, + param, &resp, &rc_param); + + if (resp != FW_MSG_CODE_DRV_CFG_PF_VFS_MSIX_DONE) { + DP_NOTICE(p_hwfn, "MFW failed to set MSI-X for VFs\n"); + rc = -EINVAL; + } else { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Requested 0x%02x MSI-x interrupts for VFs\n", num); + } + + return rc; +} + +int qed_mcp_config_vf_msix(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 vf_id, u8 num) +{ + if (QED_IS_BB(p_hwfn->cdev)) + return qed_mcp_config_vf_msix_bb(p_hwfn, p_ptt, vf_id, num); + else + return qed_mcp_config_vf_msix_ah(p_hwfn, p_ptt, num); +} + +int +qed_mcp_send_drv_version(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_drv_version *p_ver) +{ + struct qed_mcp_mb_params mb_params; + struct drv_version_stc drv_version; + __be32 val; + u32 i; + int rc; + + memset(&drv_version, 0, sizeof(drv_version)); + drv_version.version = p_ver->version; + for (i = 0; i < (MCP_DRV_VER_STR_SIZE - 4) / sizeof(u32); i++) { + val = cpu_to_be32(*((u32 *)&p_ver->name[i * sizeof(u32)])); + *(__be32 *)&drv_version.name[i * sizeof(u32)] = val; + } + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = DRV_MSG_CODE_SET_VERSION; + mb_params.p_data_src = &drv_version; + mb_params.data_src_size = sizeof(drv_version); + rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); + if (rc) + DP_ERR(p_hwfn, "MCP response failure, aborting\n"); + + return rc; +} + +int qed_mcp_halt(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 resp = 0, param = 0; + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_MCP_HALT, 0, &resp, + ¶m); + if (rc) + DP_ERR(p_hwfn, "MCP response failure, aborting\n"); + + return rc; +} + +int qed_mcp_resume(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 value, cpu_mode; + + qed_wr(p_hwfn, p_ptt, MCP_REG_CPU_STATE, 0xffffffff); + + value = qed_rd(p_hwfn, p_ptt, MCP_REG_CPU_MODE); + value &= ~MCP_REG_CPU_MODE_SOFT_HALT; + qed_wr(p_hwfn, p_ptt, MCP_REG_CPU_MODE, value); + cpu_mode = qed_rd(p_hwfn, p_ptt, MCP_REG_CPU_MODE); + + return (cpu_mode & MCP_REG_CPU_MODE_SOFT_HALT) ? 
-EAGAIN : 0; +} + +int qed_mcp_ov_update_current_config(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_ov_client client) +{ + u32 resp = 0, param = 0; + u32 drv_mb_param; + int rc; + + switch (client) { + case QED_OV_CLIENT_DRV: + drv_mb_param = DRV_MB_PARAM_OV_CURR_CFG_OS; + break; + case QED_OV_CLIENT_USER: + drv_mb_param = DRV_MB_PARAM_OV_CURR_CFG_OTHER; + break; + case QED_OV_CLIENT_VENDOR_SPEC: + drv_mb_param = DRV_MB_PARAM_OV_CURR_CFG_VENDOR_SPEC; + break; + default: + DP_NOTICE(p_hwfn, "Invalid client type %d\n", client); + return -EINVAL; + } + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_OV_UPDATE_CURR_CFG, + drv_mb_param, &resp, ¶m); + if (rc) + DP_ERR(p_hwfn, "MCP response failure, aborting\n"); + + return rc; +} + +int qed_mcp_ov_update_driver_state(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_ov_driver_state drv_state) +{ + u32 resp = 0, param = 0; + u32 drv_mb_param; + int rc; + + switch (drv_state) { + case QED_OV_DRIVER_STATE_NOT_LOADED: + drv_mb_param = DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_NOT_LOADED; + break; + case QED_OV_DRIVER_STATE_DISABLED: + drv_mb_param = DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_DISABLED; + break; + case QED_OV_DRIVER_STATE_ACTIVE: + drv_mb_param = DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_ACTIVE; + break; + default: + DP_NOTICE(p_hwfn, "Invalid driver state %d\n", drv_state); + return -EINVAL; + } + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE, + drv_mb_param, &resp, ¶m); + if (rc) + DP_ERR(p_hwfn, "Failed to send driver state\n"); + + return rc; +} + +int qed_mcp_ov_update_mtu(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 mtu) +{ + u32 resp = 0, param = 0; + u32 drv_mb_param; + int rc; + + drv_mb_param = (u32)mtu << DRV_MB_PARAM_OV_MTU_SIZE_SHIFT; + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_OV_UPDATE_MTU, + drv_mb_param, &resp, ¶m); + if (rc) + DP_ERR(p_hwfn, "Failed to send mtu value, rc = %d\n", rc); + + return rc; +} + +int qed_mcp_ov_update_mac(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 *mac) +{ + struct qed_mcp_mb_params mb_params; + u32 mfw_mac[2]; + int rc; + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = DRV_MSG_CODE_SET_VMAC; + mb_params.param = DRV_MSG_CODE_VMAC_TYPE_MAC << + DRV_MSG_CODE_VMAC_TYPE_SHIFT; + mb_params.param |= MCP_PF_ID(p_hwfn); + + /* MCP is BE, and on LE platforms PCI would swap access to SHMEM + * in 32-bit granularity. + * So the MAC has to be set in native order [and not byte order], + * otherwise it would be read incorrectly by MFW after swap. 
+ */ + mfw_mac[0] = mac[0] << 24 | mac[1] << 16 | mac[2] << 8 | mac[3]; + mfw_mac[1] = mac[4] << 24 | mac[5] << 16; + + mb_params.p_data_src = (u8 *)mfw_mac; + mb_params.data_src_size = 8; + rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); + if (rc) + DP_ERR(p_hwfn, "Failed to send mac address, rc = %d\n", rc); + + /* Store primary MAC for later possible WoL */ + memcpy(p_hwfn->cdev->wol_mac, mac, ETH_ALEN); + + return rc; +} + +int qed_mcp_ov_update_wol(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, enum qed_ov_wol wol) +{ + u32 resp = 0, param = 0; + u32 drv_mb_param; + int rc; + + if (p_hwfn->hw_info.b_wol_support == QED_WOL_SUPPORT_NONE) { + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "Can't change WoL configuration when WoL isn't supported\n"); + return -EINVAL; + } + + switch (wol) { + case QED_OV_WOL_DEFAULT: + drv_mb_param = DRV_MB_PARAM_WOL_DEFAULT; + break; + case QED_OV_WOL_DISABLED: + drv_mb_param = DRV_MB_PARAM_WOL_DISABLED; + break; + case QED_OV_WOL_ENABLED: + drv_mb_param = DRV_MB_PARAM_WOL_ENABLED; + break; + default: + DP_ERR(p_hwfn, "Invalid wol state %d\n", wol); + return -EINVAL; + } + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_OV_UPDATE_WOL, + drv_mb_param, &resp, ¶m); + if (rc) + DP_ERR(p_hwfn, "Failed to send wol mode, rc = %d\n", rc); + + /* Store the WoL update for a future unload */ + p_hwfn->cdev->wol_config = (u8)wol; + + return rc; +} + +int qed_mcp_ov_update_eswitch(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_ov_eswitch eswitch) +{ + u32 resp = 0, param = 0; + u32 drv_mb_param; + int rc; + + switch (eswitch) { + case QED_OV_ESWITCH_NONE: + drv_mb_param = DRV_MB_PARAM_ESWITCH_MODE_NONE; + break; + case QED_OV_ESWITCH_VEB: + drv_mb_param = DRV_MB_PARAM_ESWITCH_MODE_VEB; + break; + case QED_OV_ESWITCH_VEPA: + drv_mb_param = DRV_MB_PARAM_ESWITCH_MODE_VEPA; + break; + default: + DP_ERR(p_hwfn, "Invalid eswitch mode %d\n", eswitch); + return -EINVAL; + } + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_OV_UPDATE_ESWITCH_MODE, + drv_mb_param, &resp, ¶m); + if (rc) + DP_ERR(p_hwfn, "Failed to send eswitch mode, rc = %d\n", rc); + + return rc; +} + +int qed_mcp_set_led(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, enum qed_led_mode mode) +{ + u32 resp = 0, param = 0, drv_mb_param; + int rc; + + switch (mode) { + case QED_LED_MODE_ON: + drv_mb_param = DRV_MB_PARAM_SET_LED_MODE_ON; + break; + case QED_LED_MODE_OFF: + drv_mb_param = DRV_MB_PARAM_SET_LED_MODE_OFF; + break; + case QED_LED_MODE_RESTORE: + drv_mb_param = DRV_MB_PARAM_SET_LED_MODE_OPER; + break; + default: + DP_NOTICE(p_hwfn, "Invalid LED mode %d\n", mode); + return -EINVAL; + } + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_SET_LED_MODE, + drv_mb_param, &resp, ¶m); + + return rc; +} + +int qed_mcp_mask_parities(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 mask_parities) +{ + u32 resp = 0, param = 0; + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_MASK_PARITIES, + mask_parities, &resp, ¶m); + + if (rc) { + DP_ERR(p_hwfn, + "MCP response failure for mask parities, aborting\n"); + } else if (resp != FW_MSG_CODE_OK) { + DP_ERR(p_hwfn, + "MCP did not acknowledge mask parity request. 
Old MFW?\n"); + rc = -EINVAL; + } + + return rc; +} + +int qed_mcp_nvm_read(struct qed_dev *cdev, u32 addr, u8 *p_buf, u32 len) +{ + u32 bytes_left = len, offset = 0, bytes_to_copy, read_len = 0; + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + u32 resp = 0, resp_param = 0; + struct qed_ptt *p_ptt; + int rc = 0; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EBUSY; + + while (bytes_left > 0) { + bytes_to_copy = min_t(u32, bytes_left, MCP_DRV_NVM_BUF_LEN); + + rc = qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, + DRV_MSG_CODE_NVM_READ_NVRAM, + addr + offset + + (bytes_to_copy << + DRV_MB_PARAM_NVM_LEN_OFFSET), + &resp, &resp_param, + &read_len, + (u32 *)(p_buf + offset)); + + if (rc || (resp != FW_MSG_CODE_NVM_OK)) { + DP_NOTICE(cdev, "MCP command rc = %d\n", rc); + break; + } + + /* This can be a lengthy process, and it's possible scheduler + * isn't preemptable. Sleep a bit to prevent CPU hogging. + */ + if (bytes_left % 0x1000 < + (bytes_left - read_len) % 0x1000) + usleep_range(1000, 2000); + + offset += read_len; + bytes_left -= read_len; + } + + cdev->mcp_nvm_resp = resp; + qed_ptt_release(p_hwfn, p_ptt); + + return rc; +} + +int qed_mcp_nvm_resp(struct qed_dev *cdev, u8 *p_buf) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EBUSY; + + memcpy(p_buf, &cdev->mcp_nvm_resp, sizeof(cdev->mcp_nvm_resp)); + qed_ptt_release(p_hwfn, p_ptt); + + return 0; +} + +int qed_mcp_nvm_put_file_begin(struct qed_dev *cdev, u32 addr) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt; + u32 resp, param; + int rc; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EBUSY; + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_NVM_PUT_FILE_BEGIN, addr, + &resp, ¶m); + cdev->mcp_nvm_resp = resp; + qed_ptt_release(p_hwfn, p_ptt); + + return rc; +} + +int qed_mcp_nvm_write(struct qed_dev *cdev, + u32 cmd, u32 addr, u8 *p_buf, u32 len) +{ + u32 buf_idx = 0, buf_size, nvm_cmd, nvm_offset, resp = 0, param; + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt; + int rc = -EINVAL; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EBUSY; + + switch (cmd) { + case QED_PUT_FILE_DATA: + nvm_cmd = DRV_MSG_CODE_NVM_PUT_FILE_DATA; + break; + case QED_NVM_WRITE_NVRAM: + nvm_cmd = DRV_MSG_CODE_NVM_WRITE_NVRAM; + break; + default: + DP_NOTICE(p_hwfn, "Invalid nvm write command 0x%x\n", cmd); + rc = -EINVAL; + goto out; + } + + while (buf_idx < len) { + buf_size = min_t(u32, (len - buf_idx), MCP_DRV_NVM_BUF_LEN); + nvm_offset = ((buf_size << DRV_MB_PARAM_NVM_LEN_OFFSET) | + addr) + buf_idx; + rc = qed_mcp_nvm_wr_cmd(p_hwfn, p_ptt, nvm_cmd, nvm_offset, + &resp, ¶m, buf_size, + (u32 *)&p_buf[buf_idx]); + if (rc) { + DP_NOTICE(cdev, "nvm write failed, rc = %d\n", rc); + resp = FW_MSG_CODE_ERROR; + break; + } + + if (resp != FW_MSG_CODE_OK && + resp != FW_MSG_CODE_NVM_OK && + resp != FW_MSG_CODE_NVM_PUT_FILE_FINISH_OK) { + DP_NOTICE(cdev, + "nvm write failed, resp = 0x%08x\n", resp); + rc = -EINVAL; + break; + } + + /* This can be a lengthy process, and it's possible scheduler + * isn't pre-emptable. Sleep a bit to prevent CPU hogging. 
+ */ + if (buf_idx % 0x1000 > (buf_idx + buf_size) % 0x1000) + usleep_range(1000, 2000); + + buf_idx += buf_size; + } + + cdev->mcp_nvm_resp = resp; +out: + qed_ptt_release(p_hwfn, p_ptt); + + return rc; +} + +int qed_mcp_bist_register_test(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 drv_mb_param = 0, rsp, param; + int rc = 0; + + drv_mb_param = (DRV_MB_PARAM_BIST_REGISTER_TEST << + DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT); + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BIST_TEST, + drv_mb_param, &rsp, ¶m); + + if (rc) + return rc; + + if (((rsp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK) || + (param != DRV_MB_PARAM_BIST_RC_PASSED)) + rc = -EAGAIN; + + return rc; +} + +int qed_mcp_bist_clock_test(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 drv_mb_param, rsp, param; + int rc = 0; + + drv_mb_param = (DRV_MB_PARAM_BIST_CLOCK_TEST << + DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT); + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BIST_TEST, + drv_mb_param, &rsp, ¶m); + + if (rc) + return rc; + + if (((rsp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK) || + (param != DRV_MB_PARAM_BIST_RC_PASSED)) + rc = -EAGAIN; + + return rc; +} + +int qed_mcp_bist_nvm_get_num_images(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *num_images) +{ + u32 drv_mb_param = 0, rsp; + int rc = 0; + + drv_mb_param = (DRV_MB_PARAM_BIST_NVM_TEST_NUM_IMAGES << + DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT); + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BIST_TEST, + drv_mb_param, &rsp, num_images); + if (rc) + return rc; + + if (((rsp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK)) + rc = -EINVAL; + + return rc; +} + +int qed_mcp_bist_nvm_get_image_att(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct bist_nvm_image_att *p_image_att, + u32 image_index) +{ + u32 buf_size = 0, param, resp = 0, resp_param = 0; + int rc; + + param = DRV_MB_PARAM_BIST_NVM_TEST_IMAGE_BY_INDEX << + DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT; + param |= image_index << DRV_MB_PARAM_BIST_TEST_IMAGE_INDEX_SHIFT; + + rc = qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, + DRV_MSG_CODE_BIST_TEST, param, + &resp, &resp_param, + &buf_size, + (u32 *)p_image_att); + if (rc) + return rc; + + if (((resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK) || + (p_image_att->return_code != 1)) + rc = -EINVAL; + + return rc; +} + +int qed_mcp_nvm_info_populate(struct qed_hwfn *p_hwfn) +{ + struct qed_nvm_image_info *nvm_info = &p_hwfn->nvm_info; + struct qed_ptt *p_ptt; + int rc; + u32 i; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_ERR(p_hwfn, "failed to acquire ptt\n"); + return -EBUSY; + } + + /* Acquire from MFW the amount of available images */ + nvm_info->num_images = 0; + rc = qed_mcp_bist_nvm_get_num_images(p_hwfn, + p_ptt, &nvm_info->num_images); + if (rc == -EOPNOTSUPP) { + DP_INFO(p_hwfn, "DRV_MSG_CODE_BIST_TEST is not supported\n"); + goto out; + } else if (rc || !nvm_info->num_images) { + DP_ERR(p_hwfn, "Failed getting number of images\n"); + goto err0; + } + + nvm_info->image_att = kmalloc(nvm_info->num_images * + sizeof(struct bist_nvm_image_att), + GFP_KERNEL); + if (!nvm_info->image_att) { + rc = -ENOMEM; + goto err0; + } + + /* Iterate over images and get their attributes */ + for (i = 0; i < nvm_info->num_images; i++) { + rc = qed_mcp_bist_nvm_get_image_att(p_hwfn, p_ptt, + &nvm_info->image_att[i], i); + if (rc) { + DP_ERR(p_hwfn, + "Failed getting image index %d attributes\n", i); + goto err1; + } + + DP_VERBOSE(p_hwfn, QED_MSG_SP, "image index %d, size %x\n", i, + nvm_info->image_att[i].len); + } +out: + qed_ptt_release(p_hwfn, p_ptt); + return 0; + +err1: + 
kfree(nvm_info->image_att); +err0: + qed_ptt_release(p_hwfn, p_ptt); + return rc; +} + +static int +qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_nvm_images image_id, + struct qed_nvm_image_att *p_image_att) +{ + enum nvm_image_type type; + u32 i; + + /* Translate image_id into MFW definitions */ + switch (image_id) { + case QED_NVM_IMAGE_ISCSI_CFG: + type = NVM_TYPE_ISCSI_CFG; + break; + case QED_NVM_IMAGE_FCOE_CFG: + type = NVM_TYPE_FCOE_CFG; + break; + default: + DP_NOTICE(p_hwfn, "Unknown request of image_id %08x\n", + image_id); + return -EINVAL; + } + + for (i = 0; i < p_hwfn->nvm_info.num_images; i++) + if (type == p_hwfn->nvm_info.image_att[i].image_type) + break; + if (i == p_hwfn->nvm_info.num_images) { + DP_VERBOSE(p_hwfn, QED_MSG_STORAGE, + "Failed to find nvram image of type %08x\n", + image_id); + return -ENOENT; + } + + p_image_att->start_addr = p_hwfn->nvm_info.image_att[i].nvm_start_addr; + p_image_att->length = p_hwfn->nvm_info.image_att[i].len; + + return 0; +} + +int qed_mcp_get_nvm_image(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_nvm_images image_id, + u8 *p_buffer, u32 buffer_len) +{ + struct qed_nvm_image_att image_att; + int rc; + + memset(p_buffer, 0, buffer_len); + + rc = qed_mcp_get_nvm_image_att(p_hwfn, p_ptt, image_id, &image_att); + if (rc) + return rc; + + /* Validate sizes - both the image's and the supplied buffer's */ + if (image_att.length <= 4) { + DP_VERBOSE(p_hwfn, QED_MSG_STORAGE, + "Image [%d] is too small - only %d bytes\n", + image_id, image_att.length); + return -EINVAL; + } + + /* Each NVM image is suffixed by CRC; Upper-layer has no need for it */ + image_att.length -= 4; + + if (image_att.length > buffer_len) { + DP_VERBOSE(p_hwfn, + QED_MSG_STORAGE, + "Image [%d] is too big - %08x bytes where only %08x are available\n", + image_id, image_att.length, buffer_len); + return -ENOMEM; + } + + return qed_mcp_nvm_read(p_hwfn->cdev, image_att.start_addr, + p_buffer, image_att.length); +} + +static enum resource_id_enum qed_mcp_get_mfw_res_id(enum qed_resources res_id) +{ + enum resource_id_enum mfw_res_id = RESOURCE_NUM_INVALID; + + switch (res_id) { + case QED_SB: + mfw_res_id = RESOURCE_NUM_SB_E; + break; + case QED_L2_QUEUE: + mfw_res_id = RESOURCE_NUM_L2_QUEUE_E; + break; + case QED_VPORT: + mfw_res_id = RESOURCE_NUM_VPORT_E; + break; + case QED_RSS_ENG: + mfw_res_id = RESOURCE_NUM_RSS_ENGINES_E; + break; + case QED_PQ: + mfw_res_id = RESOURCE_NUM_PQ_E; + break; + case QED_RL: + mfw_res_id = RESOURCE_NUM_RL_E; + break; + case QED_MAC: + case QED_VLAN: + /* Each VFC resource can accommodate both a MAC and a VLAN */ + mfw_res_id = RESOURCE_VFC_FILTER_E; + break; + case QED_ILT: + mfw_res_id = RESOURCE_ILT_E; + break; + case QED_LL2_QUEUE: + mfw_res_id = RESOURCE_LL2_QUEUE_E; + break; + case QED_RDMA_CNQ_RAM: + case QED_CMDQS_CQS: + /* CNQ/CMDQS are the same resource */ + mfw_res_id = RESOURCE_CQS_E; + break; + case QED_RDMA_STATS_QUEUE: + mfw_res_id = RESOURCE_RDMA_STATS_QUEUE_E; + break; + case QED_BDQ: + mfw_res_id = RESOURCE_BDQ_E; + break; + default: + break; + } + + return mfw_res_id; +} + +#define QED_RESC_ALLOC_VERSION_MAJOR 2 +#define QED_RESC_ALLOC_VERSION_MINOR 0 +#define QED_RESC_ALLOC_VERSION \ + ((QED_RESC_ALLOC_VERSION_MAJOR << \ + DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_SHIFT) | \ + (QED_RESC_ALLOC_VERSION_MINOR << \ + DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MINOR_SHIFT)) + +struct qed_resc_alloc_in_params { + u32 cmd; + enum qed_resources res_id; + u32 resc_max_val; +}; + 
+struct qed_resc_alloc_out_params { + u32 mcp_resp; + u32 mcp_param; + u32 resc_num; + u32 resc_start; + u32 vf_resc_num; + u32 vf_resc_start; + u32 flags; +}; + +static int +qed_mcp_resc_allocation_msg(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_resc_alloc_in_params *p_in_params, + struct qed_resc_alloc_out_params *p_out_params) +{ + struct qed_mcp_mb_params mb_params; + struct resource_info mfw_resc_info; + int rc; + + memset(&mfw_resc_info, 0, sizeof(mfw_resc_info)); + + mfw_resc_info.res_id = qed_mcp_get_mfw_res_id(p_in_params->res_id); + if (mfw_resc_info.res_id == RESOURCE_NUM_INVALID) { + DP_ERR(p_hwfn, + "Failed to match resource %d [%s] with the MFW resources\n", + p_in_params->res_id, + qed_hw_get_resc_name(p_in_params->res_id)); + return -EINVAL; + } + + switch (p_in_params->cmd) { + case DRV_MSG_SET_RESOURCE_VALUE_MSG: + mfw_resc_info.size = p_in_params->resc_max_val; + /* Fallthrough */ + case DRV_MSG_GET_RESOURCE_ALLOC_MSG: + break; + default: + DP_ERR(p_hwfn, "Unexpected resource alloc command [0x%08x]\n", + p_in_params->cmd); + return -EINVAL; + } + + memset(&mb_params, 0, sizeof(mb_params)); + mb_params.cmd = p_in_params->cmd; + mb_params.param = QED_RESC_ALLOC_VERSION; + mb_params.p_data_src = &mfw_resc_info; + mb_params.data_src_size = sizeof(mfw_resc_info); + mb_params.p_data_dst = mb_params.p_data_src; + mb_params.data_dst_size = mb_params.data_src_size; + + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "Resource message request: cmd 0x%08x, res_id %d [%s], hsi_version %d.%d, val 0x%x\n", + p_in_params->cmd, + p_in_params->res_id, + qed_hw_get_resc_name(p_in_params->res_id), + QED_MFW_GET_FIELD(mb_params.param, + DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR), + QED_MFW_GET_FIELD(mb_params.param, + DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MINOR), + p_in_params->resc_max_val); + + rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params); + if (rc) + return rc; + + p_out_params->mcp_resp = mb_params.mcp_resp; + p_out_params->mcp_param = mb_params.mcp_param; + p_out_params->resc_num = mfw_resc_info.size; + p_out_params->resc_start = mfw_resc_info.offset; + p_out_params->vf_resc_num = mfw_resc_info.vf_size; + p_out_params->vf_resc_start = mfw_resc_info.vf_offset; + p_out_params->flags = mfw_resc_info.flags; + + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "Resource message response: mfw_hsi_version %d.%d, num 0x%x, start 0x%x, vf_num 0x%x, vf_start 0x%x, flags 0x%08x\n", + QED_MFW_GET_FIELD(p_out_params->mcp_param, + FW_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR), + QED_MFW_GET_FIELD(p_out_params->mcp_param, + FW_MB_PARAM_RESOURCE_ALLOC_VERSION_MINOR), + p_out_params->resc_num, + p_out_params->resc_start, + p_out_params->vf_resc_num, + p_out_params->vf_resc_start, p_out_params->flags); + + return 0; +} + +int +qed_mcp_set_resc_max_val(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_resources res_id, + u32 resc_max_val, u32 *p_mcp_resp) +{ + struct qed_resc_alloc_out_params out_params; + struct qed_resc_alloc_in_params in_params; + int rc; + + memset(&in_params, 0, sizeof(in_params)); + in_params.cmd = DRV_MSG_SET_RESOURCE_VALUE_MSG; + in_params.res_id = res_id; + in_params.resc_max_val = resc_max_val; + memset(&out_params, 0, sizeof(out_params)); + rc = qed_mcp_resc_allocation_msg(p_hwfn, p_ptt, &in_params, + &out_params); + if (rc) + return rc; + + *p_mcp_resp = out_params.mcp_resp; + + return 0; +} + +int +qed_mcp_get_resc_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_resources res_id, + u32 *p_mcp_resp, u32 *p_resc_num, u32 *p_resc_start) +{ + struct 
qed_resc_alloc_out_params out_params; + struct qed_resc_alloc_in_params in_params; + int rc; + + memset(&in_params, 0, sizeof(in_params)); + in_params.cmd = DRV_MSG_GET_RESOURCE_ALLOC_MSG; + in_params.res_id = res_id; + memset(&out_params, 0, sizeof(out_params)); + rc = qed_mcp_resc_allocation_msg(p_hwfn, p_ptt, &in_params, + &out_params); + if (rc) + return rc; + + *p_mcp_resp = out_params.mcp_resp; + + if (*p_mcp_resp == FW_MSG_CODE_RESOURCE_ALLOC_OK) { + *p_resc_num = out_params.resc_num; + *p_resc_start = out_params.resc_start; + } + + return 0; +} + +int qed_mcp_initiate_pf_flr(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 mcp_resp, mcp_param; + + return qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_INITIATE_PF_FLR, 0, + &mcp_resp, &mcp_param); +} + +static int qed_mcp_resource_cmd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 param, u32 *p_mcp_resp, u32 *p_mcp_param) +{ + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_RESOURCE_CMD, param, + p_mcp_resp, p_mcp_param); + if (rc) + return rc; + + if (*p_mcp_resp == FW_MSG_CODE_UNSUPPORTED) { + DP_INFO(p_hwfn, + "The resource command is unsupported by the MFW\n"); + return -EINVAL; + } + + if (*p_mcp_param == RESOURCE_OPCODE_UNKNOWN_CMD) { + u8 opcode = QED_MFW_GET_FIELD(param, RESOURCE_CMD_REQ_OPCODE); + + DP_NOTICE(p_hwfn, + "The resource command is unknown to the MFW [param 0x%08x, opcode %d]\n", + param, opcode); + return -EINVAL; + } + + return rc; +} + +int +__qed_mcp_resc_lock(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_resc_lock_params *p_params) +{ + u32 param = 0, mcp_resp, mcp_param; + u8 opcode; + int rc; + + switch (p_params->timeout) { + case QED_MCP_RESC_LOCK_TO_DEFAULT: + opcode = RESOURCE_OPCODE_REQ; + p_params->timeout = 0; + break; + case QED_MCP_RESC_LOCK_TO_NONE: + opcode = RESOURCE_OPCODE_REQ_WO_AGING; + p_params->timeout = 0; + break; + default: + opcode = RESOURCE_OPCODE_REQ_W_AGING; + break; + } + + QED_MFW_SET_FIELD(param, RESOURCE_CMD_REQ_RESC, p_params->resource); + QED_MFW_SET_FIELD(param, RESOURCE_CMD_REQ_OPCODE, opcode); + QED_MFW_SET_FIELD(param, RESOURCE_CMD_REQ_AGE, p_params->timeout); + + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "Resource lock request: param 0x%08x [age %d, opcode %d, resource %d]\n", + param, p_params->timeout, opcode, p_params->resource); + + /* Attempt to acquire the resource */ + rc = qed_mcp_resource_cmd(p_hwfn, p_ptt, param, &mcp_resp, &mcp_param); + if (rc) + return rc; + + /* Analyze the response */ + p_params->owner = QED_MFW_GET_FIELD(mcp_param, RESOURCE_CMD_RSP_OWNER); + opcode = QED_MFW_GET_FIELD(mcp_param, RESOURCE_CMD_RSP_OPCODE); + + DP_VERBOSE(p_hwfn, + QED_MSG_SP, + "Resource lock response: mcp_param 0x%08x [opcode %d, owner %d]\n", + mcp_param, opcode, p_params->owner); + + switch (opcode) { + case RESOURCE_OPCODE_GNT: + p_params->b_granted = true; + break; + case RESOURCE_OPCODE_BUSY: + p_params->b_granted = false; + break; + default: + DP_NOTICE(p_hwfn, + "Unexpected opcode in resource lock response [mcp_param 0x%08x, opcode %d]\n", + mcp_param, opcode); + return -EINVAL; + } + + return 0; +} + +int +qed_mcp_resc_lock(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, struct qed_resc_lock_params *p_params) +{ + u32 retry_cnt = 0; + int rc; + + do { + /* No need for an interval before the first iteration */ + if (retry_cnt) { + if (p_params->sleep_b4_retry) { + u16 retry_interval_in_ms = + DIV_ROUND_UP(p_params->retry_interval, + 1000); + + msleep(retry_interval_in_ms); + } else { + udelay(p_params->retry_interval); + } + } + + rc = 
__qed_mcp_resc_lock(p_hwfn, p_ptt, p_params); + if (rc) + return rc; + + if (p_params->b_granted) + break; + } while (retry_cnt++ < p_params->retry_num); + + return 0; +} + +int +qed_mcp_resc_unlock(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_resc_unlock_params *p_params) +{ + u32 param = 0, mcp_resp, mcp_param; + u8 opcode; + int rc; + + opcode = p_params->b_force ? RESOURCE_OPCODE_FORCE_RELEASE + : RESOURCE_OPCODE_RELEASE; + QED_MFW_SET_FIELD(param, RESOURCE_CMD_REQ_RESC, p_params->resource); + QED_MFW_SET_FIELD(param, RESOURCE_CMD_REQ_OPCODE, opcode); + + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "Resource unlock request: param 0x%08x [opcode %d, resource %d]\n", + param, opcode, p_params->resource); + + /* Attempt to release the resource */ + rc = qed_mcp_resource_cmd(p_hwfn, p_ptt, param, &mcp_resp, &mcp_param); + if (rc) + return rc; + + /* Analyze the response */ + opcode = QED_MFW_GET_FIELD(mcp_param, RESOURCE_CMD_RSP_OPCODE); + + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "Resource unlock response: mcp_param 0x%08x [opcode %d]\n", + mcp_param, opcode); + + switch (opcode) { + case RESOURCE_OPCODE_RELEASED_PREVIOUS: + DP_INFO(p_hwfn, + "Resource unlock request for an already released resource [%d]\n", + p_params->resource); + /* Fallthrough */ + case RESOURCE_OPCODE_RELEASED: + p_params->b_released = true; + break; + case RESOURCE_OPCODE_WRONG_OWNER: + p_params->b_released = false; + break; + default: + DP_NOTICE(p_hwfn, + "Unexpected opcode in resource unlock response [mcp_param 0x%08x, opcode %d]\n", + mcp_param, opcode); + return -EINVAL; + } + + return 0; +} + +void qed_mcp_resc_lock_default_init(struct qed_resc_lock_params *p_lock, + struct qed_resc_unlock_params *p_unlock, + enum qed_resc_lock + resource, bool b_is_permanent) +{ + if (p_lock) { + memset(p_lock, 0, sizeof(*p_lock)); + + /* Permanent resources don't require aging, and there's no + * point in trying to acquire them more than once since it's + * unexpected another entity would release them. + */ + if (b_is_permanent) { + p_lock->timeout = QED_MCP_RESC_LOCK_TO_NONE; + } else { + p_lock->retry_num = QED_MCP_RESC_LOCK_RETRY_CNT_DFLT; + p_lock->retry_interval = + QED_MCP_RESC_LOCK_RETRY_VAL_DFLT; + p_lock->sleep_b4_retry = true; + } + + p_lock->resource = resource; + } + + if (p_unlock) { + memset(p_unlock, 0, sizeof(*p_unlock)); + p_unlock->resource = resource; + } +} + +int qed_mcp_get_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 mcp_resp; + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_GET_MFW_FEATURE_SUPPORT, + 0, &mcp_resp, &p_hwfn->mcp_info->capabilities); + if (!rc) + DP_VERBOSE(p_hwfn, (QED_MSG_SP | NETIF_MSG_PROBE), + "MFW supported features: %08x\n", + p_hwfn->mcp_info->capabilities); + + return rc; +} + +int qed_mcp_set_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 mcp_resp, mcp_param, features; + + features = DRV_MB_PARAM_FEATURE_SUPPORT_PORT_EEE; + + return qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_FEATURE_SUPPORT, + features, &mcp_resp, &mcp_param); +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h new file mode 100644 index 0000000..8a5c988 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -0,0 +1,1002 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_MCP_H +#define _QED_MCP_H + +#include +#include +#include +#include +#include +#include "qed_hsi.h" +#include "qed_dev_api.h" + +struct qed_mcp_link_speed_params { + bool autoneg; + u32 advertised_speeds; /* bitmask of DRV_SPEED_CAPABILITY */ + u32 forced_speed; /* In Mb/s */ +}; + +struct qed_mcp_link_pause_params { + bool autoneg; + bool forced_rx; + bool forced_tx; +}; + +enum qed_mcp_eee_mode { + QED_MCP_EEE_DISABLED, + QED_MCP_EEE_ENABLED, + QED_MCP_EEE_UNSUPPORTED +}; + +struct qed_mcp_link_params { + struct qed_mcp_link_speed_params speed; + struct qed_mcp_link_pause_params pause; + u32 loopback_mode; + struct qed_link_eee_params eee; +}; + +struct qed_mcp_link_capabilities { + u32 speed_capabilities; + bool default_speed_autoneg; + enum qed_mcp_eee_mode default_eee; + u32 eee_lpi_timer; + u8 eee_speed_caps; +}; + +struct qed_mcp_link_state { + bool link_up; + + u32 min_pf_rate; + + /* Actual link speed in Mb/s */ + u32 line_speed; + + /* PF max speed in Mb/s, deduced from line_speed + * according to PF max bandwidth configuration. 
+ */ + u32 speed; + bool full_duplex; + + bool an; + bool an_complete; + bool parallel_detection; + bool pfc_enabled; + +#define QED_LINK_PARTNER_SPEED_1G_HD BIT(0) +#define QED_LINK_PARTNER_SPEED_1G_FD BIT(1) +#define QED_LINK_PARTNER_SPEED_10G BIT(2) +#define QED_LINK_PARTNER_SPEED_20G BIT(3) +#define QED_LINK_PARTNER_SPEED_25G BIT(4) +#define QED_LINK_PARTNER_SPEED_40G BIT(5) +#define QED_LINK_PARTNER_SPEED_50G BIT(6) +#define QED_LINK_PARTNER_SPEED_100G BIT(7) + u32 partner_adv_speed; + + bool partner_tx_flow_ctrl_en; + bool partner_rx_flow_ctrl_en; + +#define QED_LINK_PARTNER_SYMMETRIC_PAUSE (1) +#define QED_LINK_PARTNER_ASYMMETRIC_PAUSE (2) +#define QED_LINK_PARTNER_BOTH_PAUSE (3) + u8 partner_adv_pause; + + bool sfp_tx_fault; + bool eee_active; + u8 eee_adv_caps; + u8 eee_lp_adv_caps; +}; + +struct qed_mcp_function_info { + u8 pause_on_host; + + enum qed_pci_personality protocol; + + u8 bandwidth_min; + u8 bandwidth_max; + + u8 mac[ETH_ALEN]; + + u64 wwn_port; + u64 wwn_node; + +#define QED_MCP_VLAN_UNSET (0xffff) + u16 ovlan; + + u16 mtu; +}; + +struct qed_mcp_nvm_common { + u32 offset; + u32 param; + u32 resp; + u32 cmd; +}; + +struct qed_mcp_drv_version { + u32 version; + u8 name[MCP_DRV_VER_STR_SIZE - 4]; +}; + +struct qed_mcp_lan_stats { + u64 ucast_rx_pkts; + u64 ucast_tx_pkts; + u32 fcs_err; +}; + +struct qed_mcp_fcoe_stats { + u64 rx_pkts; + u64 tx_pkts; + u32 fcs_err; + u32 login_failure; +}; + +struct qed_mcp_iscsi_stats { + u64 rx_pdus; + u64 tx_pdus; + u64 rx_bytes; + u64 tx_bytes; +}; + +struct qed_mcp_rdma_stats { + u64 rx_pkts; + u64 tx_pkts; + u64 rx_bytes; + u64 tx_byts; +}; + +enum qed_mcp_protocol_type { + QED_MCP_LAN_STATS, + QED_MCP_FCOE_STATS, + QED_MCP_ISCSI_STATS, + QED_MCP_RDMA_STATS +}; + +union qed_mcp_protocol_stats { + struct qed_mcp_lan_stats lan_stats; + struct qed_mcp_fcoe_stats fcoe_stats; + struct qed_mcp_iscsi_stats iscsi_stats; + struct qed_mcp_rdma_stats rdma_stats; +}; + +enum qed_ov_eswitch { + QED_OV_ESWITCH_NONE, + QED_OV_ESWITCH_VEB, + QED_OV_ESWITCH_VEPA +}; + +enum qed_ov_client { + QED_OV_CLIENT_DRV, + QED_OV_CLIENT_USER, + QED_OV_CLIENT_VENDOR_SPEC +}; + +enum qed_ov_driver_state { + QED_OV_DRIVER_STATE_NOT_LOADED, + QED_OV_DRIVER_STATE_DISABLED, + QED_OV_DRIVER_STATE_ACTIVE +}; + +enum qed_ov_wol { + QED_OV_WOL_DEFAULT, + QED_OV_WOL_DISABLED, + QED_OV_WOL_ENABLED +}; + +/** + * @brief - returns the link params of the hw function + * + * @param p_hwfn + * + * @returns pointer to link params + */ +struct qed_mcp_link_params *qed_mcp_get_link_params(struct qed_hwfn *); + +/** + * @brief - return the link state of the hw function + * + * @param p_hwfn + * + * @returns pointer to link state + */ +struct qed_mcp_link_state *qed_mcp_get_link_state(struct qed_hwfn *); + +/** + * @brief - return the link capabilities of the hw function + * + * @param p_hwfn + * + * @returns pointer to link capabilities + */ +struct qed_mcp_link_capabilities + *qed_mcp_get_link_capabilities(struct qed_hwfn *p_hwfn); + +/** + * @brief Request the MFW to set the the link according to 'link_input'. + * + * @param p_hwfn + * @param p_ptt + * @param b_up - raise link if `true'. Reset link if `false'. + * + * @return int + */ +int qed_mcp_set_link(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + bool b_up); + +/** + * @brief Get the management firmware version value + * + * @param p_hwfn + * @param p_ptt + * @param p_mfw_ver - mfw version value + * @param p_running_bundle_id - image id in nvram; Optional. + * + * @return int - 0 - operation was successful. 
+ */ +int qed_mcp_get_mfw_ver(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *p_mfw_ver, u32 *p_running_bundle_id); + +/** + * @brief Get the MBI version value + * + * @param p_hwfn + * @param p_ptt + * @param p_mbi_ver - A pointer to a variable to be filled with the MBI version. + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_get_mbi_ver(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *p_mbi_ver); + +/** + * @brief Get media type value of the port. + * + * @param cdev - qed dev pointer + * @param mfw_ver - media type value + * + * @return int - + * 0 - Operation was successul. + * -EBUSY - Operation failed + */ +int qed_mcp_get_media_type(struct qed_dev *cdev, + u32 *media_type); + +/** + * @brief General function for sending commands to the MCP + * mailbox. It acquire mutex lock for the entire + * operation, from sending the request until the MCP + * response. Waiting for MCP response will be checked up + * to 5 seconds every 5ms. + * + * @param p_hwfn - hw function + * @param p_ptt - PTT required for register access + * @param cmd - command to be sent to the MCP. + * @param param - Optional param + * @param o_mcp_resp - The MCP response code (exclude sequence). + * @param o_mcp_param- Optional parameter provided by the MCP + * response + * @return int - 0 - operation + * was successul. + */ +int qed_mcp_cmd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 cmd, + u32 param, + u32 *o_mcp_resp, + u32 *o_mcp_param); + +/** + * @brief - drains the nig, allowing completion to pass in case of pauses. + * (Should be called only from sleepable context) + * + * @param p_hwfn + * @param p_ptt + */ +int qed_mcp_drain(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief Get the flash size value + * + * @param p_hwfn + * @param p_ptt + * @param p_flash_size - flash size in bytes to be filled. + * + * @return int - 0 - operation was successul. + */ +int qed_mcp_get_flash_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *p_flash_size); + +/** + * @brief Send driver version to MFW + * + * @param p_hwfn + * @param p_ptt + * @param version - Version value + * @param name - Protocol driver name + * + * @return int - 0 - operation was successul. + */ +int +qed_mcp_send_drv_version(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_drv_version *p_ver); + +/** + * @brief Notify MFW about the change in base device properties + * + * @param p_hwfn + * @param p_ptt + * @param client - qed client type + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_ov_update_current_config(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_ov_client client); + +/** + * @brief Notify MFW about the driver state + * + * @param p_hwfn + * @param p_ptt + * @param drv_state - Driver state + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_ov_update_driver_state(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_ov_driver_state drv_state); + +/** + * @brief Send MTU size to MFW + * + * @param p_hwfn + * @param p_ptt + * @param mtu - MTU size + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_ov_update_mtu(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 mtu); + +/** + * @brief Send MAC address to MFW + * + * @param p_hwfn + * @param p_ptt + * @param mac - MAC address + * + * @return int - 0 - operation was successful. 
+ */ +int qed_mcp_ov_update_mac(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 *mac); + +/** + * @brief Send WOL mode to MFW + * + * @param p_hwfn + * @param p_ptt + * @param wol - WOL mode + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_ov_update_wol(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_ov_wol wol); + +/** + * @brief Set LED status + * + * @param p_hwfn + * @param p_ptt + * @param mode - LED mode + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_set_led(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_led_mode mode); + +/** + * @brief Read from nvm + * + * @param cdev + * @param addr - nvm offset + * @param p_buf - nvm read buffer + * @param len - buffer len + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_nvm_read(struct qed_dev *cdev, u32 addr, u8 *p_buf, u32 len); + +/** + * @brief Write to nvm + * + * @param cdev + * @param addr - nvm offset + * @param cmd - nvm command + * @param p_buf - nvm write buffer + * @param len - buffer len + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_nvm_write(struct qed_dev *cdev, + u32 cmd, u32 addr, u8 *p_buf, u32 len); + +/** + * @brief Put file begin + * + * @param cdev + * @param addr - nvm offset + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_nvm_put_file_begin(struct qed_dev *cdev, u32 addr); + +/** + * @brief Check latest response + * + * @param cdev + * @param p_buf - nvm write buffer + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_nvm_resp(struct qed_dev *cdev, u8 *p_buf); + +struct qed_nvm_image_att { + u32 start_addr; + u32 length; +}; + +/** + * @brief Allows reading a whole nvram image + * + * @param p_hwfn + * @param p_ptt + * @param image_id - image requested for reading + * @param p_buffer - allocated buffer into which to fill data + * @param buffer_len - length of the allocated buffer. + * + * @return 0 iff p_buffer now contains the nvram image. + */ +int qed_mcp_get_nvm_image(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_nvm_images image_id, + u8 *p_buffer, u32 buffer_len); + +/** + * @brief Bist register test + * + * @param p_hwfn - hw function + * @param p_ptt - PTT required for register access + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_bist_register_test(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief Bist clock test + * + * @param p_hwfn - hw function + * @param p_ptt - PTT required for register access + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_bist_clock_test(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief Bist nvm test - get number of images + * + * @param p_hwfn - hw function + * @param p_ptt - PTT required for register access + * @param num_images - number of images if operation was + * successful. 0 if not. + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_bist_nvm_get_num_images(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *num_images); + +/** + * @brief Bist nvm test - get image attributes by index + * + * @param p_hwfn - hw function + * @param p_ptt - PTT required for register access + * @param p_image_att - Attributes of image + * @param image_index - Index of image to get information for + * + * @return int - 0 - operation was successful. 
+ */ +int qed_mcp_bist_nvm_get_image_att(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct bist_nvm_image_att *p_image_att, + u32 image_index); + +/* Using hwfn number (and not pf_num) is required since in CMT mode, + * same pf_num may be used by two different hwfn + * TODO - this shouldn't really be in .h file, but until all fields + * required during hw-init will be placed in their correct place in shmem + * we need it in qed_dev.c [for readin the nvram reflection in shmem]. + */ +#define MCP_PF_ID_BY_REL(p_hwfn, rel_pfid) (QED_IS_BB((p_hwfn)->cdev) ? \ + ((rel_pfid) | \ + ((p_hwfn)->abs_pf_id & 1) << 3) : \ + rel_pfid) +#define MCP_PF_ID(p_hwfn) MCP_PF_ID_BY_REL(p_hwfn, (p_hwfn)->rel_pf_id) + +#define MFW_PORT(_p_hwfn) ((_p_hwfn)->abs_pf_id % \ + ((_p_hwfn)->cdev->num_ports_in_engine * \ + qed_device_num_engines((_p_hwfn)->cdev))) + +struct qed_mcp_info { + /* List for mailbox commands which were sent and wait for a response */ + struct list_head cmd_list; + + /* Spinlock used for protecting the access to the mailbox commands list + * and the sending of the commands. + */ + spinlock_t cmd_lock; + + /* Spinlock used for syncing SW link-changes and link-changes + * originating from attention context. + */ + spinlock_t link_lock; + bool block_mb_sending; + u32 public_base; + u32 drv_mb_addr; + u32 mfw_mb_addr; + u32 port_addr; + u16 drv_mb_seq; + u16 drv_pulse_seq; + struct qed_mcp_link_params link_input; + struct qed_mcp_link_state link_output; + struct qed_mcp_link_capabilities link_capabilities; + struct qed_mcp_function_info func_info; + u8 *mfw_mb_cur; + u8 *mfw_mb_shadow; + u16 mfw_mb_length; + u32 mcp_hist; + + /* Capabilties negotiated with the MFW */ + u32 capabilities; +}; + +struct qed_mcp_mb_params { + u32 cmd; + u32 param; + void *p_data_src; + u8 data_src_size; + void *p_data_dst; + u8 data_dst_size; + u32 mcp_resp; + u32 mcp_param; +}; + +/** + * @brief Initialize the interface with the MCP + * + * @param p_hwfn - HW func + * @param p_ptt - PTT required for register access + * + * @return int + */ +int qed_mcp_cmd_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief Initialize the port interface with the MCP + * + * @param p_hwfn + * @param p_ptt + * Can only be called after `num_ports_in_engines' is set + */ +void qed_mcp_cmd_port_init(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); +/** + * @brief Releases resources allocated during the init process. + * + * @param p_hwfn - HW func + * @param p_ptt - PTT required for register access + * + * @return int + */ + +int qed_mcp_free(struct qed_hwfn *p_hwfn); + +/** + * @brief This function is called from the DPC context. After + * pointing PTT to the mfw mb, check for events sent by the MCP + * to the driver and ack them. In case a critical event + * detected, it will be handled here, otherwise the work will be + * queued to a sleepable work-queue. + * + * @param p_hwfn - HW function + * @param p_ptt - PTT required for register access + * @return int - 0 - operation + * was successul. 
+ */ +int qed_mcp_handle_events(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +enum qed_drv_role { + QED_DRV_ROLE_OS, + QED_DRV_ROLE_KDUMP, +}; + +struct qed_load_req_params { + /* Input params */ + enum qed_drv_role drv_role; + u8 timeout_val; + bool avoid_eng_reset; + enum qed_override_force_load override_force_load; + + /* Output params */ + u32 load_code; +}; + +/** + * @brief Sends a LOAD_REQ to the MFW, and in case the operation succeeds, + * returns whether this PF is the first on the engine/port or function. + * + * @param p_hwfn + * @param p_ptt + * @param p_params + * + * @return int - 0 - Operation was successful. + */ +int qed_mcp_load_req(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_load_req_params *p_params); + +/** + * @brief Sends a UNLOAD_REQ message to the MFW + * + * @param p_hwfn + * @param p_ptt + * + * @return int - 0 - Operation was successful. + */ +int qed_mcp_unload_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +/** + * @brief Sends a UNLOAD_DONE message to the MFW + * + * @param p_hwfn + * @param p_ptt + * + * @return int - 0 - Operation was successful. + */ +int qed_mcp_unload_done(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +/** + * @brief Read the MFW mailbox into Current buffer. + * + * @param p_hwfn + * @param p_ptt + */ +void qed_mcp_read_mb(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief Ack to mfw that driver finished FLR process for VFs + * + * @param p_hwfn + * @param p_ptt + * @param vfs_to_ack - bit mask of all engine VFs for which the PF acks. + * + * @param return int - 0 upon success. + */ +int qed_mcp_ack_vf_flr(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *vfs_to_ack); + +/** + * @brief - calls during init to read shmem of all function-related info. + * + * @param p_hwfn + * + * @param return 0 upon success. + */ +int qed_mcp_fill_shmem_func_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief - Reset the MCP using mailbox command. + * + * @param p_hwfn + * @param p_ptt + * + * @param return 0 upon success. + */ +int qed_mcp_reset(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt); + +/** + * @brief - Sends an NVM read command request to the MFW to get + * a buffer. + * + * @param p_hwfn + * @param p_ptt + * @param cmd - Command: DRV_MSG_CODE_NVM_GET_FILE_DATA or + * DRV_MSG_CODE_NVM_READ_NVRAM commands + * @param param - [0:23] - Offset [24:31] - Size + * @param o_mcp_resp - MCP response + * @param o_mcp_param - MCP response param + * @param o_txn_size - Buffer size output + * @param o_buf - Pointer to the buffer returned by the MFW. + * + * @param return 0 upon success. + */ +int qed_mcp_nvm_rd_cmd(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 cmd, + u32 param, + u32 *o_mcp_resp, + u32 *o_mcp_param, u32 *o_txn_size, u32 *o_buf); + +/** + * @brief indicates whether the MFW objects [under mcp_info] are accessible + * + * @param p_hwfn + * + * @return true iff MFW is running and mcp_info is initialized + */ +bool qed_mcp_is_init(struct qed_hwfn *p_hwfn); + +/** + * @brief request MFW to configure MSI-X for a VF + * + * @param p_hwfn + * @param p_ptt + * @param vf_id - absolute inside engine + * @param num_sbs - number of entries to request + * + * @return int + */ +int qed_mcp_config_vf_msix(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 vf_id, u8 num); + +/** + * @brief - Halt the MCP. + * + * @param p_hwfn + * @param p_ptt + * + * @param return 0 upon success. 
+ */ +int qed_mcp_halt(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +/** + * @brief - Wake up the MCP. + * + * @param p_hwfn + * @param p_ptt + * + * @param return 0 upon success. + */ +int qed_mcp_resume(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +int qed_configure_pf_min_bandwidth(struct qed_dev *cdev, u8 min_bw); +int qed_configure_pf_max_bandwidth(struct qed_dev *cdev, u8 max_bw); +int __qed_configure_pf_max_bandwidth(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_link_state *p_link, + u8 max_bw); +int __qed_configure_pf_min_bandwidth(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_link_state *p_link, + u8 min_bw); + +int qed_mcp_mask_parities(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 mask_parities); + +/** + * @brief - Sets the MFW's max value for the given resource + * + * @param p_hwfn + * @param p_ptt + * @param res_id + * @param resc_max_val + * @param p_mcp_resp + * + * @return int - 0 - operation was successful. + */ +int +qed_mcp_set_resc_max_val(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_resources res_id, + u32 resc_max_val, u32 *p_mcp_resp); + +/** + * @brief - Gets the MFW allocation info for the given resource + * + * @param p_hwfn + * @param p_ptt + * @param res_id + * @param p_mcp_resp + * @param p_resc_num + * @param p_resc_start + * + * @return int - 0 - operation was successful. + */ +int +qed_mcp_get_resc_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_resources res_id, + u32 *p_mcp_resp, u32 *p_resc_num, u32 *p_resc_start); + +/** + * @brief Send eswitch mode to MFW + * + * @param p_hwfn + * @param p_ptt + * @param eswitch - eswitch mode + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_ov_update_eswitch(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_ov_eswitch eswitch); + +#define QED_MCP_RESC_LOCK_MIN_VAL RESOURCE_DUMP +#define QED_MCP_RESC_LOCK_MAX_VAL 31 + +enum qed_resc_lock { + QED_RESC_LOCK_DBG_DUMP = QED_MCP_RESC_LOCK_MIN_VAL, + QED_RESC_LOCK_PTP_PORT0, + QED_RESC_LOCK_PTP_PORT1, + QED_RESC_LOCK_PTP_PORT2, + QED_RESC_LOCK_PTP_PORT3, + QED_RESC_LOCK_RESC_ALLOC = QED_MCP_RESC_LOCK_MAX_VAL, + QED_RESC_LOCK_RESC_INVALID +}; + +/** + * @brief - Initiates PF FLR + * + * @param p_hwfn + * @param p_ptt + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_initiate_pf_flr(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); +struct qed_resc_lock_params { + /* Resource number [valid values are 0..31] */ + u8 resource; + + /* Lock timeout value in seconds [default, none or 1..254] */ + u8 timeout; +#define QED_MCP_RESC_LOCK_TO_DEFAULT 0 +#define QED_MCP_RESC_LOCK_TO_NONE 255 + + /* Number of times to retry locking */ + u8 retry_num; +#define QED_MCP_RESC_LOCK_RETRY_CNT_DFLT 10 + + /* The interval in usec between retries */ + u16 retry_interval; +#define QED_MCP_RESC_LOCK_RETRY_VAL_DFLT 10000 + + /* Use sleep or delay between retries */ + bool sleep_b4_retry; + + /* Will be set as true if the resource is free and granted */ + bool b_granted; + + /* Will be filled with the resource owner. + * [0..15 = PF0-15, 16 = MFW] + */ + u8 owner; +}; + +/** + * @brief Acquires MFW generic resource lock + * + * @param p_hwfn + * @param p_ptt + * @param p_params + * + * @return int - 0 - operation was successful. 
+ */ +int +qed_mcp_resc_lock(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, struct qed_resc_lock_params *p_params); + +struct qed_resc_unlock_params { + /* Resource number [valid values are 0..31] */ + u8 resource; + + /* Allow to release a resource even if belongs to another PF */ + bool b_force; + + /* Will be set as true if the resource is released */ + bool b_released; +}; + +/** + * @brief Releases MFW generic resource lock + * + * @param p_hwfn + * @param p_ptt + * @param p_params + * + * @return int - 0 - operation was successful. + */ +int +qed_mcp_resc_unlock(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_resc_unlock_params *p_params); + +/** + * @brief - default initialization for lock/unlock resource structs + * + * @param p_lock - lock params struct to be initialized; Can be NULL + * @param p_unlock - unlock params struct to be initialized; Can be NULL + * @param resource - the requested resource + * @paral b_is_permanent - disable retries & aging when set + */ +void qed_mcp_resc_lock_default_init(struct qed_resc_lock_params *p_lock, + struct qed_resc_unlock_params *p_unlock, + enum qed_resc_lock + resource, bool b_is_permanent); +/** + * @brief Learn of supported MFW features; To be done during early init + * + * @param p_hwfn + * @param p_ptt + */ +int qed_mcp_get_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +/** + * @brief Inform MFW of set of features supported by driver. Should be done + * inside the content of the LOAD_REQ. + * + * @param p_hwfn + * @param p_ptt + */ +int qed_mcp_set_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); + +/** + * @brief Populate the nvm info shadow in the given hardware function + * + * @param p_hwfn + */ +int qed_mcp_nvm_info_populate(struct qed_hwfn *p_hwfn); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_ooo.c b/drivers/net/ethernet/qlogic/qed/qed_ooo.c new file mode 100644 index 0000000..6172354 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_ooo.c @@ -0,0 +1,497 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_iscsi.h" +#include "qed_ll2.h" +#include "qed_ooo.h" +#include "qed_cxt.h" + +static struct qed_ooo_archipelago +*qed_ooo_seek_archipelago(struct qed_hwfn *p_hwfn, + struct qed_ooo_info + *p_ooo_info, + u32 cid) +{ + u32 idx = (cid & 0xffff) - p_ooo_info->cid_base; + struct qed_ooo_archipelago *p_archipelago; + + if (idx >= p_ooo_info->max_num_archipelagos) + return NULL; + + p_archipelago = &p_ooo_info->p_archipelagos_mem[idx]; + + if (list_empty(&p_archipelago->isles_list)) + return NULL; + + return p_archipelago; +} + +static struct qed_ooo_isle *qed_ooo_seek_isle(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid, u8 isle) +{ + struct qed_ooo_archipelago *p_archipelago = NULL; + struct qed_ooo_isle *p_isle = NULL; + u8 the_num_of_isle = 1; + + p_archipelago = qed_ooo_seek_archipelago(p_hwfn, p_ooo_info, cid); + if (!p_archipelago) { + DP_NOTICE(p_hwfn, + "Connection %d is not found in OOO list\n", cid); + return NULL; + } + + list_for_each_entry(p_isle, &p_archipelago->isles_list, list_entry) { + if (the_num_of_isle == isle) + return p_isle; + the_num_of_isle++; + } + + return NULL; +} + +void qed_ooo_save_history_entry(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + struct ooo_opaque *p_cqe) +{ + struct qed_ooo_history *p_history = &p_ooo_info->ooo_history; + + if (p_history->head_idx == p_history->num_of_cqes) + p_history->head_idx = 0; + p_history->p_cqes[p_history->head_idx] = *p_cqe; + p_history->head_idx++; +} + +int qed_ooo_alloc(struct qed_hwfn *p_hwfn) +{ + u16 max_num_archipelagos = 0, cid_base; + struct qed_ooo_info *p_ooo_info; + enum protocol_type proto; + u16 max_num_isles = 0; + u32 i; + + switch (p_hwfn->hw_info.personality) { + case QED_PCI_ISCSI: + proto = PROTOCOLID_ISCSI; + break; + case QED_PCI_ETH_RDMA: + case QED_PCI_ETH_IWARP: + proto = PROTOCOLID_IWARP; + break; + default: + DP_NOTICE(p_hwfn, + "Failed to allocate qed_ooo_info: unknown personality\n"); + return -EINVAL; + } + + max_num_archipelagos = (u16)qed_cxt_get_proto_cid_count(p_hwfn, proto, + NULL); + max_num_isles = QED_MAX_NUM_ISLES + max_num_archipelagos; + cid_base = (u16)qed_cxt_get_proto_cid_start(p_hwfn, proto); + + if (!max_num_archipelagos) { + DP_NOTICE(p_hwfn, + "Failed to allocate qed_ooo_info: unknown amount of connections\n"); + return -EINVAL; + } + + p_ooo_info = kzalloc(sizeof(*p_ooo_info), GFP_KERNEL); + if (!p_ooo_info) + return -ENOMEM; + + p_ooo_info->cid_base = cid_base; + p_ooo_info->max_num_archipelagos = max_num_archipelagos; + + INIT_LIST_HEAD(&p_ooo_info->free_buffers_list); + INIT_LIST_HEAD(&p_ooo_info->ready_buffers_list); + INIT_LIST_HEAD(&p_ooo_info->free_isles_list); + + p_ooo_info->p_isles_mem = kcalloc(max_num_isles, + sizeof(struct qed_ooo_isle), + GFP_KERNEL); + if (!p_ooo_info->p_isles_mem) + goto no_isles_mem; + + for (i = 0; i < max_num_isles; i++) { + INIT_LIST_HEAD(&p_ooo_info->p_isles_mem[i].buffers_list); + list_add_tail(&p_ooo_info->p_isles_mem[i].list_entry, + &p_ooo_info->free_isles_list); + } + + p_ooo_info->p_archipelagos_mem = + kcalloc(max_num_archipelagos, + sizeof(struct qed_ooo_archipelago), + GFP_KERNEL); + if (!p_ooo_info->p_archipelagos_mem) + goto no_archipelagos_mem; + + for (i = 0; i < max_num_archipelagos; i++) + INIT_LIST_HEAD(&p_ooo_info->p_archipelagos_mem[i].isles_list); + + p_ooo_info->ooo_history.p_cqes = + kcalloc(QED_MAX_NUM_OOO_HISTORY_ENTRIES, + sizeof(struct ooo_opaque), + GFP_KERNEL); + if 
(!p_ooo_info->ooo_history.p_cqes) + goto no_history_mem; + + p_ooo_info->ooo_history.num_of_cqes = QED_MAX_NUM_OOO_HISTORY_ENTRIES; + + p_hwfn->p_ooo_info = p_ooo_info; + return 0; + +no_history_mem: + kfree(p_ooo_info->p_archipelagos_mem); +no_archipelagos_mem: + kfree(p_ooo_info->p_isles_mem); +no_isles_mem: + kfree(p_ooo_info); + return -ENOMEM; +} + +void qed_ooo_release_connection_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, u32 cid) +{ + struct qed_ooo_archipelago *p_archipelago; + struct qed_ooo_buffer *p_buffer; + struct qed_ooo_isle *p_isle; + + p_archipelago = qed_ooo_seek_archipelago(p_hwfn, p_ooo_info, cid); + if (!p_archipelago) + return; + + while (!list_empty(&p_archipelago->isles_list)) { + p_isle = list_first_entry(&p_archipelago->isles_list, + struct qed_ooo_isle, list_entry); + + list_del(&p_isle->list_entry); + + while (!list_empty(&p_isle->buffers_list)) { + p_buffer = list_first_entry(&p_isle->buffers_list, + struct qed_ooo_buffer, + list_entry); + + if (!p_buffer) + break; + + list_del(&p_buffer->list_entry); + list_add_tail(&p_buffer->list_entry, + &p_ooo_info->free_buffers_list); + } + list_add_tail(&p_isle->list_entry, + &p_ooo_info->free_isles_list); + } +} + +void qed_ooo_release_all_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info) +{ + struct qed_ooo_archipelago *p_archipelago; + struct qed_ooo_buffer *p_buffer; + struct qed_ooo_isle *p_isle; + u32 i; + + for (i = 0; i < p_ooo_info->max_num_archipelagos; i++) { + p_archipelago = &(p_ooo_info->p_archipelagos_mem[i]); + + while (!list_empty(&p_archipelago->isles_list)) { + p_isle = list_first_entry(&p_archipelago->isles_list, + struct qed_ooo_isle, + list_entry); + + list_del(&p_isle->list_entry); + + while (!list_empty(&p_isle->buffers_list)) { + p_buffer = + list_first_entry(&p_isle->buffers_list, + struct qed_ooo_buffer, + list_entry); + + if (!p_buffer) + break; + + list_del(&p_buffer->list_entry); + list_add_tail(&p_buffer->list_entry, + &p_ooo_info->free_buffers_list); + } + list_add_tail(&p_isle->list_entry, + &p_ooo_info->free_isles_list); + } + } + if (!list_empty(&p_ooo_info->ready_buffers_list)) + list_splice_tail_init(&p_ooo_info->ready_buffers_list, + &p_ooo_info->free_buffers_list); +} + +void qed_ooo_setup(struct qed_hwfn *p_hwfn) +{ + qed_ooo_release_all_isles(p_hwfn, p_hwfn->p_ooo_info); + memset(p_hwfn->p_ooo_info->ooo_history.p_cqes, 0, + p_hwfn->p_ooo_info->ooo_history.num_of_cqes * + sizeof(struct ooo_opaque)); + p_hwfn->p_ooo_info->ooo_history.head_idx = 0; +} + +void qed_ooo_free(struct qed_hwfn *p_hwfn) +{ + struct qed_ooo_info *p_ooo_info = p_hwfn->p_ooo_info; + struct qed_ooo_buffer *p_buffer; + + if (!p_ooo_info) + return; + + qed_ooo_release_all_isles(p_hwfn, p_ooo_info); + while (!list_empty(&p_ooo_info->free_buffers_list)) { + p_buffer = list_first_entry(&p_ooo_info->free_buffers_list, + struct qed_ooo_buffer, list_entry); + + if (!p_buffer) + break; + + list_del(&p_buffer->list_entry); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + p_buffer->rx_buffer_size, + p_buffer->rx_buffer_virt_addr, + p_buffer->rx_buffer_phys_addr); + kfree(p_buffer); + } + + kfree(p_ooo_info->p_isles_mem); + kfree(p_ooo_info->p_archipelagos_mem); + kfree(p_ooo_info->ooo_history.p_cqes); + kfree(p_ooo_info); + p_hwfn->p_ooo_info = NULL; +} + +void qed_ooo_put_free_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + struct qed_ooo_buffer *p_buffer) +{ + list_add_tail(&p_buffer->list_entry, &p_ooo_info->free_buffers_list); +} + +struct qed_ooo_buffer 
*qed_ooo_get_free_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info) +{ + struct qed_ooo_buffer *p_buffer = NULL; + + if (!list_empty(&p_ooo_info->free_buffers_list)) { + p_buffer = list_first_entry(&p_ooo_info->free_buffers_list, + struct qed_ooo_buffer, list_entry); + + list_del(&p_buffer->list_entry); + } + + return p_buffer; +} + +void qed_ooo_put_ready_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + struct qed_ooo_buffer *p_buffer, u8 on_tail) +{ + if (on_tail) + list_add_tail(&p_buffer->list_entry, + &p_ooo_info->ready_buffers_list); + else + list_add(&p_buffer->list_entry, + &p_ooo_info->ready_buffers_list); +} + +struct qed_ooo_buffer *qed_ooo_get_ready_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info) +{ + struct qed_ooo_buffer *p_buffer = NULL; + + if (!list_empty(&p_ooo_info->ready_buffers_list)) { + p_buffer = list_first_entry(&p_ooo_info->ready_buffers_list, + struct qed_ooo_buffer, list_entry); + + list_del(&p_buffer->list_entry); + } + + return p_buffer; +} + +void qed_ooo_delete_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid, u8 drop_isle, u8 drop_size) +{ + struct qed_ooo_archipelago *p_archipelago = NULL; + struct qed_ooo_isle *p_isle = NULL; + u8 isle_idx; + + p_archipelago = qed_ooo_seek_archipelago(p_hwfn, p_ooo_info, cid); + for (isle_idx = 0; isle_idx < drop_size; isle_idx++) { + p_isle = qed_ooo_seek_isle(p_hwfn, p_ooo_info, cid, drop_isle); + if (!p_isle) { + DP_NOTICE(p_hwfn, + "Isle %d is not found(cid %d)\n", + drop_isle, cid); + return; + } + if (list_empty(&p_isle->buffers_list)) + DP_NOTICE(p_hwfn, + "Isle %d is empty(cid %d)\n", drop_isle, cid); + else + list_splice_tail_init(&p_isle->buffers_list, + &p_ooo_info->free_buffers_list); + + list_del(&p_isle->list_entry); + p_ooo_info->cur_isles_number--; + list_add(&p_isle->list_entry, &p_ooo_info->free_isles_list); + } +} + +void qed_ooo_add_new_isle(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid, u8 ooo_isle, + struct qed_ooo_buffer *p_buffer) +{ + struct qed_ooo_archipelago *p_archipelago = NULL; + struct qed_ooo_isle *p_prev_isle = NULL; + struct qed_ooo_isle *p_isle = NULL; + + if (ooo_isle > 1) { + p_prev_isle = qed_ooo_seek_isle(p_hwfn, + p_ooo_info, cid, ooo_isle - 1); + if (!p_prev_isle) { + DP_NOTICE(p_hwfn, + "Isle %d is not found(cid %d)\n", + ooo_isle - 1, cid); + return; + } + } + p_archipelago = qed_ooo_seek_archipelago(p_hwfn, p_ooo_info, cid); + if (!p_archipelago && (ooo_isle != 1)) { + DP_NOTICE(p_hwfn, + "Connection %d is not found in OOO list\n", cid); + return; + } + + if (!list_empty(&p_ooo_info->free_isles_list)) { + p_isle = list_first_entry(&p_ooo_info->free_isles_list, + struct qed_ooo_isle, list_entry); + + list_del(&p_isle->list_entry); + if (!list_empty(&p_isle->buffers_list)) { + DP_NOTICE(p_hwfn, "Free isle is not empty\n"); + INIT_LIST_HEAD(&p_isle->buffers_list); + } + } else { + DP_NOTICE(p_hwfn, "No more free isles\n"); + return; + } + + if (!p_archipelago) { + u32 idx = (cid & 0xffff) - p_ooo_info->cid_base; + + p_archipelago = &p_ooo_info->p_archipelagos_mem[idx]; + } + + list_add(&p_buffer->list_entry, &p_isle->buffers_list); + p_ooo_info->cur_isles_number++; + p_ooo_info->gen_isles_number++; + + if (p_ooo_info->cur_isles_number > p_ooo_info->max_isles_number) + p_ooo_info->max_isles_number = p_ooo_info->cur_isles_number; + + if (!p_prev_isle) + list_add(&p_isle->list_entry, &p_archipelago->isles_list); + else + list_add(&p_isle->list_entry, &p_prev_isle->list_entry); 
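+
+	/* A first isle (no preceding isle) starts the archipelago's list;
+	 * any later isle is linked immediately after the isle that precedes
+	 * it, so the per-connection isle list stays ordered by isle number.
+	 */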
+} + +void qed_ooo_add_new_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid, + u8 ooo_isle, + struct qed_ooo_buffer *p_buffer, u8 buffer_side) +{ + struct qed_ooo_isle *p_isle = NULL; + + p_isle = qed_ooo_seek_isle(p_hwfn, p_ooo_info, cid, ooo_isle); + if (!p_isle) { + DP_NOTICE(p_hwfn, + "Isle %d is not found(cid %d)\n", ooo_isle, cid); + return; + } + + if (buffer_side == QED_OOO_LEFT_BUF) + list_add(&p_buffer->list_entry, &p_isle->buffers_list); + else + list_add_tail(&p_buffer->list_entry, &p_isle->buffers_list); +} + +void qed_ooo_join_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, u32 cid, u8 left_isle) +{ + struct qed_ooo_archipelago *p_archipelago = NULL; + struct qed_ooo_isle *p_right_isle = NULL; + struct qed_ooo_isle *p_left_isle = NULL; + + p_right_isle = qed_ooo_seek_isle(p_hwfn, p_ooo_info, cid, + left_isle + 1); + if (!p_right_isle) { + DP_NOTICE(p_hwfn, + "Right isle %d is not found(cid %d)\n", + left_isle + 1, cid); + return; + } + + p_archipelago = qed_ooo_seek_archipelago(p_hwfn, p_ooo_info, cid); + list_del(&p_right_isle->list_entry); + p_ooo_info->cur_isles_number--; + if (left_isle) { + p_left_isle = qed_ooo_seek_isle(p_hwfn, p_ooo_info, cid, + left_isle); + if (!p_left_isle) { + DP_NOTICE(p_hwfn, + "Left isle %d is not found(cid %d)\n", + left_isle, cid); + return; + } + list_splice_tail_init(&p_right_isle->buffers_list, + &p_left_isle->buffers_list); + } else { + list_splice_tail_init(&p_right_isle->buffers_list, + &p_ooo_info->ready_buffers_list); + } + list_add_tail(&p_right_isle->list_entry, &p_ooo_info->free_isles_list); +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_ooo.h b/drivers/net/ethernet/qlogic/qed/qed_ooo.h new file mode 100644 index 0000000..49c4e75 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_ooo.h @@ -0,0 +1,198 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _QED_OOO_H +#define _QED_OOO_H +#include +#include +#include +#include "qed.h" + +#define QED_MAX_NUM_ISLES 256 +#define QED_MAX_NUM_OOO_HISTORY_ENTRIES 512 + +#define QED_OOO_LEFT_BUF 0 +#define QED_OOO_RIGHT_BUF 1 + +struct qed_ooo_buffer { + struct list_head list_entry; + void *rx_buffer_virt_addr; + dma_addr_t rx_buffer_phys_addr; + u32 rx_buffer_size; + u16 packet_length; + u16 parse_flags; + u16 vlan; + u8 placement_offset; +}; + +struct qed_ooo_isle { + struct list_head list_entry; + struct list_head buffers_list; +}; + +struct qed_ooo_archipelago { + struct list_head isles_list; +}; + +struct qed_ooo_history { + struct ooo_opaque *p_cqes; + u32 head_idx; + u32 num_of_cqes; +}; + +struct qed_ooo_info { + struct list_head free_buffers_list; + struct list_head ready_buffers_list; + struct list_head free_isles_list; + struct qed_ooo_archipelago *p_archipelagos_mem; + struct qed_ooo_isle *p_isles_mem; + struct qed_ooo_history ooo_history; + u32 cur_isles_number; + u32 max_isles_number; + u32 gen_isles_number; + u16 max_num_archipelagos; + u16 cid_base; +}; + +#if IS_ENABLED(CONFIG_QED_OOO) +void qed_ooo_save_history_entry(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + struct ooo_opaque *p_cqe); + +int qed_ooo_alloc(struct qed_hwfn *p_hwfn); + +void qed_ooo_setup(struct qed_hwfn *p_hwfn); + +void qed_ooo_free(struct qed_hwfn *p_hwfn); + +void qed_ooo_release_connection_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid); + +void qed_ooo_release_all_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info); + +void qed_ooo_put_free_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + struct qed_ooo_buffer *p_buffer); + +struct qed_ooo_buffer * +qed_ooo_get_free_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info); + +void qed_ooo_put_ready_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + struct qed_ooo_buffer *p_buffer, u8 on_tail); + +struct qed_ooo_buffer * +qed_ooo_get_ready_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info); + +void qed_ooo_delete_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid, u8 drop_isle, u8 drop_size); + +void qed_ooo_add_new_isle(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid, + u8 ooo_isle, struct qed_ooo_buffer *p_buffer); + +void qed_ooo_add_new_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid, + u8 ooo_isle, + struct qed_ooo_buffer *p_buffer, u8 buffer_side); + +void qed_ooo_join_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, u32 cid, + u8 left_isle); +#else /* IS_ENABLED(CONFIG_QED_ISCSI) */ +static inline void qed_ooo_save_history_entry(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + struct ooo_opaque *p_cqe) {} + +static inline int qed_ooo_alloc(struct qed_hwfn *p_hwfn) +{ + return -EINVAL; +} + +static inline void qed_ooo_setup(struct qed_hwfn *p_hwfn) {} + +static inline void qed_ooo_free(struct qed_hwfn *p_hwfn) {} + +static inline void +qed_ooo_release_connection_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid) {} + +static inline void qed_ooo_release_all_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info) + {} + +static inline void qed_ooo_put_free_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + struct qed_ooo_buffer *p_buffer) {} + +static inline struct qed_ooo_buffer * +qed_ooo_get_free_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info 
*p_ooo_info) { return NULL; } + +static inline void qed_ooo_put_ready_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + struct qed_ooo_buffer *p_buffer, + u8 on_tail) {} + +static inline struct qed_ooo_buffer * +qed_ooo_get_ready_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info) { return NULL; } + +static inline void qed_ooo_delete_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid, u8 drop_isle, u8 drop_size) {} + +static inline void qed_ooo_add_new_isle(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid, u8 ooo_isle, + struct qed_ooo_buffer *p_buffer) {} + +static inline void qed_ooo_add_new_buffer(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, + u32 cid, u8 ooo_isle, + struct qed_ooo_buffer *p_buffer, + u8 buffer_side) {} + +static inline void qed_ooo_join_isles(struct qed_hwfn *p_hwfn, + struct qed_ooo_info *p_ooo_info, u32 cid, + u8 left_isle) {} +#endif /* IS_ENABLED(CONFIG_QED_ISCSI) */ + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_ptp.c b/drivers/net/ethernet/qlogic/qed/qed_ptp.c new file mode 100644 index 0000000..5a90d69 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_ptp.c @@ -0,0 +1,452 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include "qed.h" +#include "qed_dev_api.h" +#include "qed_hw.h" +#include "qed_l2.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" + +/* 16 nano second time quantas to wait before making a Drift adjustment */ +#define QED_DRIFT_CNTR_TIME_QUANTA_SHIFT 0 +/* Nano seconds to add/subtract when making a Drift adjustment */ +#define QED_DRIFT_CNTR_ADJUSTMENT_SHIFT 28 +/* Add/subtract the Adjustment_Value when making a Drift adjustment */ +#define QED_DRIFT_CNTR_DIRECTION_SHIFT 31 +#define QED_TIMESTAMP_MASK BIT(16) + +static enum qed_resc_lock qed_ptcdev_to_resc(struct qed_hwfn *p_hwfn) +{ + switch (qed_device_get_port_id(p_hwfn->cdev)) { + case 0: + return QED_RESC_LOCK_PTP_PORT0; + case 1: + return QED_RESC_LOCK_PTP_PORT1; + case 2: + return QED_RESC_LOCK_PTP_PORT2; + case 3: + return QED_RESC_LOCK_PTP_PORT3; + default: + return QED_RESC_LOCK_RESC_INVALID; + } +} + +static int qed_ptp_res_lock(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_resc_lock_params params; + enum qed_resc_lock resource; + int rc; + + resource = qed_ptcdev_to_resc(p_hwfn); + if (resource == QED_RESC_LOCK_RESC_INVALID) + return -EINVAL; + + qed_mcp_resc_lock_default_init(¶ms, NULL, resource, true); + + rc = qed_mcp_resc_lock(p_hwfn, p_ptt, ¶ms); + if (rc && rc != -EINVAL) { + return rc; + } else if (rc == -EINVAL) { + /* MFW doesn't support resource locking, first PF on the port + * has lock ownership. + */ + if (p_hwfn->abs_pf_id < p_hwfn->cdev->num_ports_in_engine) + return 0; + + DP_INFO(p_hwfn, "PF doesn't have lock ownership\n"); + return -EBUSY; + } else if (!rc && !params.b_granted) { + DP_INFO(p_hwfn, "Failed to acquire ptp resource lock\n"); + return -EBUSY; + } + + return rc; +} + +static int qed_ptp_res_unlock(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_resc_unlock_params params; + enum qed_resc_lock resource; + int rc; + + resource = qed_ptcdev_to_resc(p_hwfn); + if (resource == QED_RESC_LOCK_RESC_INVALID) + return -EINVAL; + + qed_mcp_resc_lock_default_init(NULL, ¶ms, resource, true); + + rc = qed_mcp_resc_unlock(p_hwfn, p_ptt, ¶ms); + if (rc == -EINVAL) { + /* MFW doesn't support locking, first PF has lock ownership */ + if (p_hwfn->abs_pf_id < p_hwfn->cdev->num_ports_in_engine) { + rc = 0; + } else { + DP_INFO(p_hwfn, "PF doesn't have lock ownership\n"); + return -EINVAL; + } + } else if (rc) { + DP_INFO(p_hwfn, "Failed to release the ptp resource lock\n"); + } + + return rc; +} + +/* Read Rx timestamp */ +static int qed_ptp_hw_read_rx_ts(struct qed_dev *cdev, u64 *timestamp) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt; + u32 val; + + *timestamp = 0; + val = qed_rd(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_SEQID); + if (!(val & QED_TIMESTAMP_MASK)) { + DP_INFO(p_hwfn, "Invalid Rx timestamp, buf_seqid = %d\n", val); + return -EINVAL; + } + + val = qed_rd(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_TS_LSB); + *timestamp = qed_rd(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_TS_MSB); + *timestamp <<= 32; + *timestamp |= val; + + /* Reset timestamp register to allow new timestamp */ + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_SEQID, + QED_TIMESTAMP_MASK); + + return 0; +} + +/* Read Tx timestamp */ +static int qed_ptp_hw_read_tx_ts(struct qed_dev *cdev, u64 *timestamp) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt; + u32 val; + + *timestamp = 0; + val = qed_rd(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_BUF_SEQID); + if (!(val & QED_TIMESTAMP_MASK)) { + 
DP_INFO(p_hwfn, "Invalid Tx timestamp, buf_seqid = %d\n", val); + return -EINVAL; + } + + val = qed_rd(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_BUF_TS_LSB); + *timestamp = qed_rd(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_BUF_TS_MSB); + *timestamp <<= 32; + *timestamp |= val; + + /* Reset timestamp register to allow new timestamp */ + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_BUF_SEQID, QED_TIMESTAMP_MASK); + + return 0; +} + +/* Read Phy Hardware Clock */ +static int qed_ptp_hw_read_cc(struct qed_dev *cdev, u64 *phc_cycles) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt; + u32 temp = 0; + + temp = qed_rd(p_hwfn, p_ptt, NIG_REG_TSGEN_SYNC_TIME_LSB); + *phc_cycles = qed_rd(p_hwfn, p_ptt, NIG_REG_TSGEN_SYNC_TIME_MSB); + *phc_cycles <<= 32; + *phc_cycles |= temp; + + return 0; +} + +/* Filter PTP protocol packets that need to be timestamped */ +static int qed_ptp_hw_cfg_filters(struct qed_dev *cdev, + enum qed_ptp_filter_type rx_type, + enum qed_ptp_hwtstamp_tx_type tx_type) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt; + u32 rule_mask, enable_cfg = 0x0; + + switch (rx_type) { + case QED_PTP_FILTER_NONE: + enable_cfg = 0x0; + rule_mask = 0x3FFF; + break; + case QED_PTP_FILTER_ALL: + enable_cfg = 0x7; + rule_mask = 0x3CAA; + break; + case QED_PTP_FILTER_V1_L4_EVENT: + enable_cfg = 0x3; + rule_mask = 0x3FFA; + break; + case QED_PTP_FILTER_V1_L4_GEN: + enable_cfg = 0x3; + rule_mask = 0x3FFE; + break; + case QED_PTP_FILTER_V2_L4_EVENT: + enable_cfg = 0x5; + rule_mask = 0x3FAA; + break; + case QED_PTP_FILTER_V2_L4_GEN: + enable_cfg = 0x5; + rule_mask = 0x3FEE; + break; + case QED_PTP_FILTER_V2_L2_EVENT: + enable_cfg = 0x5; + rule_mask = 0x3CFF; + break; + case QED_PTP_FILTER_V2_L2_GEN: + enable_cfg = 0x5; + rule_mask = 0x3EFF; + break; + case QED_PTP_FILTER_V2_EVENT: + enable_cfg = 0x5; + rule_mask = 0x3CAA; + break; + case QED_PTP_FILTER_V2_GEN: + enable_cfg = 0x5; + rule_mask = 0x3EEE; + break; + default: + DP_INFO(p_hwfn, "Invalid PTP filter type %d\n", rx_type); + return -EINVAL; + } + + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_PARAM_MASK, 0); + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_RULE_MASK, rule_mask); + qed_wr(p_hwfn, p_ptt, NIG_REG_RX_PTP_EN, enable_cfg); + + if (tx_type == QED_PTP_HWTSTAMP_TX_OFF) { + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_PTP_EN, 0x0); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_PARAM_MASK, 0x7FF); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_RULE_MASK, 0x3FFF); + } else { + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_PTP_EN, enable_cfg); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_PARAM_MASK, 0); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_RULE_MASK, rule_mask); + } + + /* Reset possibly old timestamps */ + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_SEQID, + QED_TIMESTAMP_MASK); + + return 0; +} + +/* Adjust the HW clock by a rate given in parts-per-billion (ppb) units. + * FW/HW accepts the adjustment value in terms of 3 parameters: + * Drift period - adjustment happens once in certain number of nano seconds. + * Drift value - time is adjusted by a certain value, for example by 5 ns. + * Drift direction - add or subtract the adjustment value. + * The routine translates ppb into the adjustment triplet in an optimal manner. 
+ */ +static int qed_ptp_hw_adjfreq(struct qed_dev *cdev, s32 ppb) +{ + s64 best_val = 0, val, best_period = 0, period, approx_dev, dif, dif2; + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt; + u32 drift_ctr_cfg = 0, drift_state; + int drift_dir = 1; + + if (ppb < 0) { + ppb = -ppb; + drift_dir = 0; + } + + if (ppb > 1) { + s64 best_dif = ppb, best_approx_dev = 1; + + /* Adjustment value is up to +/-7ns, find an optimal value in + * this range. + */ + for (val = 7; val > 0; val--) { + period = div_s64(val * 1000000000, ppb); + period -= 8; + period >>= 4; + if (period < 1) + period = 1; + if (period > 0xFFFFFFE) + period = 0xFFFFFFE; + + /* Check both rounding ends for approximate error */ + approx_dev = period * 16 + 8; + dif = ppb * approx_dev - val * 1000000000; + dif2 = dif + 16 * ppb; + + if (dif < 0) + dif = -dif; + if (dif2 < 0) + dif2 = -dif2; + + /* Determine which end gives better approximation */ + if (dif * (approx_dev + 16) > dif2 * approx_dev) { + period++; + approx_dev += 16; + dif = dif2; + } + + /* Track best approximation found so far */ + if (best_dif * approx_dev > dif * best_approx_dev) { + best_dif = dif; + best_val = val; + best_period = period; + best_approx_dev = approx_dev; + } + } + } else if (ppb == 1) { + /* This is a special case as its the only value which wouldn't + * fit in a s64 variable. In order to prevent castings simple + * handle it seperately. + */ + best_val = 4; + best_period = 0xee6b27f; + } else { + best_val = 0; + best_period = 0xFFFFFFF; + } + + drift_ctr_cfg = (best_period << QED_DRIFT_CNTR_TIME_QUANTA_SHIFT) | + (((int)best_val) << QED_DRIFT_CNTR_ADJUSTMENT_SHIFT) | + (((int)drift_dir) << QED_DRIFT_CNTR_DIRECTION_SHIFT); + + qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_RST_DRIFT_CNTR, 0x1); + + drift_state = qed_rd(p_hwfn, p_ptt, NIG_REG_TSGEN_RST_DRIFT_CNTR); + if (drift_state & 1) { + qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_DRIFT_CNTR_CONF, + drift_ctr_cfg); + } else { + DP_INFO(p_hwfn, "Drift counter is not reset\n"); + return -EINVAL; + } + + qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_RST_DRIFT_CNTR, 0x0); + + return 0; +} + +static int qed_ptp_hw_enable(struct qed_dev *cdev) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt; + int rc; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_NOTICE(p_hwfn, "Failed to acquire PTT for PTP\n"); + return -EBUSY; + } + + p_hwfn->p_ptp_ptt = p_ptt; + + rc = qed_ptp_res_lock(p_hwfn, p_ptt); + if (rc) { + DP_INFO(p_hwfn, + "Couldn't acquire the resource lock, skip ptp enable for this PF\n"); + qed_ptt_release(p_hwfn, p_ptt); + p_hwfn->p_ptp_ptt = NULL; + return rc; + } + + /* Reset PTP event detection rules - will be configured in the IOCTL */ + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_PARAM_MASK, 0x7FF); + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_RULE_MASK, 0x3FFF); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_PARAM_MASK, 0x7FF); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_RULE_MASK, 0x3FFF); + + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_PTP_EN, 7); + qed_wr(p_hwfn, p_ptt, NIG_REG_RX_PTP_EN, 7); + + qed_wr(p_hwfn, p_ptt, NIG_REG_TS_OUTPUT_ENABLE_PDA, 0x1); + + /* Pause free running counter */ + if (QED_IS_BB_B0(p_hwfn->cdev)) + qed_wr(p_hwfn, p_ptt, NIG_REG_TIMESYNC_GEN_REG_BB, 2); + if (QED_IS_AH(p_hwfn->cdev)) + qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_FREECNT_UPDATE_K2, 2); + + qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_FREE_CNT_VALUE_LSB, 0); + qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_FREE_CNT_VALUE_MSB, 0); + /* Resume free running counter */ + if 
(QED_IS_BB_B0(p_hwfn->cdev)) + qed_wr(p_hwfn, p_ptt, NIG_REG_TIMESYNC_GEN_REG_BB, 4); + if (QED_IS_AH(p_hwfn->cdev)) { + qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_FREECNT_UPDATE_K2, 4); + qed_wr(p_hwfn, p_ptt, NIG_REG_PTP_LATCH_OSTS_PKT_TIME, 1); + } + + /* Disable drift register */ + qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_DRIFT_CNTR_CONF, 0x0); + qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_RST_DRIFT_CNTR, 0x0); + + /* Reset possibly old timestamps */ + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_SEQID, + QED_TIMESTAMP_MASK); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_BUF_SEQID, QED_TIMESTAMP_MASK); + + return 0; +} + +static int qed_ptp_hw_disable(struct qed_dev *cdev) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt; + + qed_ptp_res_unlock(p_hwfn, p_ptt); + + /* Reset PTP event detection rules */ + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_PARAM_MASK, 0x7FF); + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_RULE_MASK, 0x3FFF); + + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_PARAM_MASK, 0x7FF); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_RULE_MASK, 0x3FFF); + + /* Disable the PTP feature */ + qed_wr(p_hwfn, p_ptt, NIG_REG_RX_PTP_EN, 0x0); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_PTP_EN, 0x0); + + qed_ptt_release(p_hwfn, p_ptt); + p_hwfn->p_ptp_ptt = NULL; + + return 0; +} + +const struct qed_eth_ptp_ops qed_ptp_ops_pass = { + .cfg_filters = qed_ptp_hw_cfg_filters, + .read_rx_ts = qed_ptp_hw_read_rx_ts, + .read_tx_ts = qed_ptp_hw_read_tx_ts, + .read_cc = qed_ptp_hw_read_cc, + .adjfreq = qed_ptp_hw_adjfreq, + .disable = qed_ptp_hw_disable, + .enable = qed_ptp_hw_enable, +}; diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c new file mode 100644 index 0000000..a411f9c --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -0,0 +1,1797 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_cxt.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_init_ops.h" +#include "qed_int.h" +#include "qed_ll2.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" +#include +#include "qed_rdma.h" +#include "qed_roce.h" +#include "qed_sp.h" + + +int qed_rdma_bmap_alloc(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 max_count, char *name) +{ + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "max_count = %08x\n", max_count); + + bmap->max_count = max_count; + + bmap->bitmap = kcalloc(BITS_TO_LONGS(max_count), sizeof(long), + GFP_KERNEL); + if (!bmap->bitmap) + return -ENOMEM; + + snprintf(bmap->name, QED_RDMA_MAX_BMAP_NAME, "%s", name); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "0\n"); + return 0; +} + +int qed_rdma_bmap_alloc_id(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 *id_num) +{ + *id_num = find_first_zero_bit(bmap->bitmap, bmap->max_count); + if (*id_num >= bmap->max_count) + return -EINVAL; + + __set_bit(*id_num, bmap->bitmap); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "%s bitmap: allocated id %d\n", + bmap->name, *id_num); + + return 0; +} + +void qed_bmap_set_id(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 id_num) +{ + if (id_num >= bmap->max_count) + return; + + __set_bit(id_num, bmap->bitmap); +} + +void qed_bmap_release_id(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 id_num) +{ + bool b_acquired; + + if (id_num >= bmap->max_count) + return; + + b_acquired = test_and_clear_bit(id_num, bmap->bitmap); + if (!b_acquired) { + DP_NOTICE(p_hwfn, "%s bitmap: id %d already released\n", + bmap->name, id_num); + return; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "%s bitmap: released id %d\n", + bmap->name, id_num); +} + +int qed_bmap_test_id(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 id_num) +{ + if (id_num >= bmap->max_count) + return -1; + + return test_bit(id_num, bmap->bitmap); +} + +static bool qed_bmap_is_empty(struct qed_bmap *bmap) +{ + return bmap->max_count == find_first_bit(bmap->bitmap, bmap->max_count); +} + +u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id) +{ + /* First sb id for RoCE is after all the l2 sb */ + return FEAT_NUM((struct qed_hwfn *)p_hwfn, QED_PF_L2_QUE) + rel_sb_id; +} + +static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_rdma_start_in_params *params) +{ + struct qed_rdma_info *p_rdma_info; + u32 num_cons, num_tasks; + int rc = -ENOMEM; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocating RDMA\n"); + + /* Allocate a struct with current pf rdma info */ + p_rdma_info = kzalloc(sizeof(*p_rdma_info), GFP_KERNEL); + if (!p_rdma_info) + return rc; + + p_hwfn->p_rdma_info = p_rdma_info; + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + p_rdma_info->proto = PROTOCOLID_IWARP; + else + p_rdma_info->proto = PROTOCOLID_ROCE; + + num_cons = qed_cxt_get_proto_cid_count(p_hwfn, p_rdma_info->proto, + NULL); + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + p_rdma_info->num_qps = num_cons; + else + p_rdma_info->num_qps = num_cons / 2; /* 2 cids per qp */ + + num_tasks = qed_cxt_get_proto_tid_count(p_hwfn, PROTOCOLID_ROCE); + + /* Each MR uses a single task */ + p_rdma_info->num_mrs = num_tasks; + + /* Queue zone lines are shared between RoCE and L2 in such a way that + * they can be used by each without obstructing the other. 
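+ * The RDMA queue-zone range therefore simply reuses the PF's L2 queue
+ * resource range (RESC_START/RESC_NUM of QED_L2_QUEUE); CNQ producer
+ * updates later index into this range, see qed_rdma_cnq_prod_update().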
+ */ + p_rdma_info->queue_zone_base = (u16)RESC_START(p_hwfn, QED_L2_QUEUE); + p_rdma_info->max_queue_zones = (u16)RESC_NUM(p_hwfn, QED_L2_QUEUE); + + /* Allocate a struct with device params and fill it */ + p_rdma_info->dev = kzalloc(sizeof(*p_rdma_info->dev), GFP_KERNEL); + if (!p_rdma_info->dev) + goto free_rdma_info; + + /* Allocate a struct with port params and fill it */ + p_rdma_info->port = kzalloc(sizeof(*p_rdma_info->port), GFP_KERNEL); + if (!p_rdma_info->port) + goto free_rdma_dev; + + /* Allocate bit map for pd's */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->pd_map, RDMA_MAX_PDS, + "PD"); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate pd_map, rc = %d\n", + rc); + goto free_rdma_port; + } + + /* Allocate DPI bitmap */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->dpi_map, + p_hwfn->dpi_count, "DPI"); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate DPI bitmap, rc = %d\n", rc); + goto free_pd_map; + } + + /* Allocate bitmap for cq's. The maximum number of CQs is bound to + * the number of connections we support. (num_qps in iWARP or + * num_qps/2 in RoCE). + */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->cq_map, num_cons, "CQ"); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate cq bitmap, rc = %d\n", rc); + goto free_dpi_map; + } + + /* Allocate bitmap for toggle bit for cq icids + * We toggle the bit every time we create or resize cq for a given icid. + * Size needs to equal the size of the cq bmap. + */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->toggle_bits, + num_cons, "Toggle"); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate toogle bits, rc = %d\n", rc); + goto free_cq_map; + } + + /* Allocate bitmap for itids */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->tid_map, + p_rdma_info->num_mrs, "MR"); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate itids bitmaps, rc = %d\n", rc); + goto free_toggle_map; + } + + /* Allocate bitmap for cids used for qps. */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->cid_map, num_cons, + "CID"); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate cid bitmap, rc = %d\n", rc); + goto free_tid_map; + } + + /* Allocate bitmap for cids used for responders/requesters. 
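+ * In RoCE each QP consumes two ICIDs (a responder one and a requester
+ * one), which is why num_qps was set to num_cons / 2 above and why this
+ * map is sized by num_cons rather than by num_qps.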
*/ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->real_cid_map, num_cons, + "REAL_CID"); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate real cid bitmap, rc = %d\n", rc); + goto free_cid_map; + } + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + rc = qed_iwarp_alloc(p_hwfn); + + if (rc) + goto free_cid_map; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocation successful\n"); + return 0; + +free_cid_map: + kfree(p_rdma_info->cid_map.bitmap); +free_tid_map: + kfree(p_rdma_info->tid_map.bitmap); +free_toggle_map: + kfree(p_rdma_info->toggle_bits.bitmap); +free_cq_map: + kfree(p_rdma_info->cq_map.bitmap); +free_dpi_map: + kfree(p_rdma_info->dpi_map.bitmap); +free_pd_map: + kfree(p_rdma_info->pd_map.bitmap); +free_rdma_port: + kfree(p_rdma_info->port); +free_rdma_dev: + kfree(p_rdma_info->dev); +free_rdma_info: + kfree(p_rdma_info); + + return rc; +} + +void qed_rdma_bmap_free(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, bool check) +{ + int weight = bitmap_weight(bmap->bitmap, bmap->max_count); + int last_line = bmap->max_count / (64 * 8); + int last_item = last_line * 8 + + DIV_ROUND_UP(bmap->max_count % (64 * 8), 64); + u64 *pmap = (u64 *)bmap->bitmap; + int line, item, offset; + u8 str_last_line[200] = { 0 }; + + if (!weight || !check) + goto end; + + DP_NOTICE(p_hwfn, + "%s bitmap not free - size=%d, weight=%d, 512 bits per line\n", + bmap->name, bmap->max_count, weight); + + /* print aligned non-zero lines, if any */ + for (item = 0, line = 0; line < last_line; line++, item += 8) + if (bitmap_weight((unsigned long *)&pmap[item], 64 * 8)) + DP_NOTICE(p_hwfn, + "line 0x%04x: 0x%016llx 0x%016llx 0x%016llx 0x%016llx 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n", + line, + pmap[item], + pmap[item + 1], + pmap[item + 2], + pmap[item + 3], + pmap[item + 4], + pmap[item + 5], + pmap[item + 6], pmap[item + 7]); + + /* print last unaligned non-zero line, if any */ + if ((bmap->max_count % (64 * 8)) && + (bitmap_weight((unsigned long *)&pmap[item], + bmap->max_count - item * 64))) { + offset = sprintf(str_last_line, "line 0x%04x: ", line); + for (; item < last_item; item++) + offset += sprintf(str_last_line + offset, + "0x%016llx ", pmap[item]); + DP_NOTICE(p_hwfn, "%s\n", str_last_line); + } + +end: + kfree(bmap->bitmap); + bmap->bitmap = NULL; +} + +static void qed_rdma_resc_free(struct qed_hwfn *p_hwfn) +{ + struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + qed_iwarp_resc_free(p_hwfn); + + qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->cid_map, 1); + qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->pd_map, 1); + qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->dpi_map, 1); + qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->cq_map, 1); + qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->toggle_bits, 0); + qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tid_map, 1); + + kfree(p_rdma_info->port); + kfree(p_rdma_info->dev); + + kfree(p_rdma_info); +} + +static void qed_rdma_free_tid(void *rdma_cxt, u32 itid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", itid); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->tid_map, itid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +static void qed_rdma_free_reserved_lkey(struct qed_hwfn *p_hwfn) +{ + qed_rdma_free_tid(p_hwfn, p_hwfn->p_rdma_info->dev->reserved_lkey); +} + +static void qed_rdma_free(struct qed_hwfn *p_hwfn) +{ + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Freeing 
RDMA\n"); + + qed_rdma_free_reserved_lkey(p_hwfn); + qed_cxt_free_proto_ilt(p_hwfn, p_hwfn->p_rdma_info->proto); + qed_rdma_resc_free(p_hwfn); +} + +static void qed_rdma_get_guid(struct qed_hwfn *p_hwfn, u8 *guid) +{ + guid[0] = p_hwfn->hw_info.hw_mac_addr[0] ^ 2; + guid[1] = p_hwfn->hw_info.hw_mac_addr[1]; + guid[2] = p_hwfn->hw_info.hw_mac_addr[2]; + guid[3] = 0xff; + guid[4] = 0xfe; + guid[5] = p_hwfn->hw_info.hw_mac_addr[3]; + guid[6] = p_hwfn->hw_info.hw_mac_addr[4]; + guid[7] = p_hwfn->hw_info.hw_mac_addr[5]; +} + +static void qed_rdma_init_events(struct qed_hwfn *p_hwfn, + struct qed_rdma_start_in_params *params) +{ + struct qed_rdma_events *events; + + events = &p_hwfn->p_rdma_info->events; + + events->unaffiliated_event = params->events->unaffiliated_event; + events->affiliated_event = params->events->affiliated_event; + events->context = params->events->context; +} + +static void qed_rdma_init_devinfo(struct qed_hwfn *p_hwfn, + struct qed_rdma_start_in_params *params) +{ + struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev; + struct qed_dev *cdev = p_hwfn->cdev; + u32 pci_status_control; + u32 num_qps; + + /* Vendor specific information */ + dev->vendor_id = cdev->vendor_id; + dev->vendor_part_id = cdev->device_id; + dev->hw_ver = 0; + dev->fw_ver = (FW_MAJOR_VERSION << 24) | (FW_MINOR_VERSION << 16) | + (FW_REVISION_VERSION << 8) | (FW_ENGINEERING_VERSION); + + qed_rdma_get_guid(p_hwfn, (u8 *)&dev->sys_image_guid); + dev->node_guid = dev->sys_image_guid; + + dev->max_sge = min_t(u32, RDMA_MAX_SGE_PER_SQ_WQE, + RDMA_MAX_SGE_PER_RQ_WQE); + + if (cdev->rdma_max_sge) + dev->max_sge = min_t(u32, cdev->rdma_max_sge, dev->max_sge); + + dev->max_inline = ROCE_REQ_MAX_INLINE_DATA_SIZE; + + dev->max_inline = (cdev->rdma_max_inline) ? + min_t(u32, cdev->rdma_max_inline, dev->max_inline) : + dev->max_inline; + + dev->max_wqe = QED_RDMA_MAX_WQE; + dev->max_cnq = (u8)FEAT_NUM(p_hwfn, QED_RDMA_CNQ); + + /* The number of QPs may be higher than QED_ROCE_MAX_QPS, because + * it is up-aligned to 16 and then to ILT page size within qed cxt. + * This is OK in terms of ILT but we don't want to configure the FW + * above its abilities + */ + num_qps = ROCE_MAX_QPS; + num_qps = min_t(u64, num_qps, p_hwfn->p_rdma_info->num_qps); + dev->max_qp = num_qps; + + /* CQs uses the same icids that QPs use hence they are limited by the + * number of icids. There are two icids per QP. + */ + dev->max_cq = num_qps * 2; + + /* The number of mrs is smaller by 1 since the first is reserved */ + dev->max_mr = p_hwfn->p_rdma_info->num_mrs - 1; + dev->max_mr_size = QED_RDMA_MAX_MR_SIZE; + + /* The maximum CQE capacity per CQ supported. 
+ * max number of cqes will be in two layer pbl, + * 8 is the pointer size in bytes + * 32 is the size of cq element in bytes + */ + if (params->cq_mode == QED_RDMA_CQ_MODE_32_BITS) + dev->max_cqe = QED_RDMA_MAX_CQE_32_BIT; + else + dev->max_cqe = QED_RDMA_MAX_CQE_16_BIT; + + dev->max_mw = 0; + dev->max_fmr = QED_RDMA_MAX_FMR; + dev->max_mr_mw_fmr_pbl = (PAGE_SIZE / 8) * (PAGE_SIZE / 8); + dev->max_mr_mw_fmr_size = dev->max_mr_mw_fmr_pbl * PAGE_SIZE; + dev->max_pkey = QED_RDMA_MAX_P_KEY; + + dev->max_qp_resp_rd_atomic_resc = RDMA_RING_PAGE_SIZE / + (RDMA_RESP_RD_ATOMIC_ELM_SIZE * 2); + dev->max_qp_req_rd_atomic_resc = RDMA_RING_PAGE_SIZE / + RDMA_REQ_RD_ATOMIC_ELM_SIZE; + dev->max_dev_resp_rd_atomic_resc = dev->max_qp_resp_rd_atomic_resc * + p_hwfn->p_rdma_info->num_qps; + dev->page_size_caps = QED_RDMA_PAGE_SIZE_CAPS; + dev->dev_ack_delay = QED_RDMA_ACK_DELAY; + dev->max_pd = RDMA_MAX_PDS; + dev->max_ah = p_hwfn->p_rdma_info->num_qps; + dev->max_stats_queues = (u8)RESC_NUM(p_hwfn, QED_RDMA_STATS_QUEUE); + + /* Set capablities */ + dev->dev_caps = 0; + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_RNR_NAK, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_PORT_ACTIVE_EVENT, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_PORT_CHANGE_EVENT, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_RESIZE_CQ, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_BASE_MEMORY_EXT, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_BASE_QUEUE_EXT, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_ZBVA, 1); + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_LOCAL_INV_FENCE, 1); + + /* Check atomic operations support in PCI configuration space. */ + pci_read_config_dword(cdev->pdev, + cdev->pdev->pcie_cap + PCI_EXP_DEVCTL2, + &pci_status_control); + + if (pci_status_control & PCI_EXP_DEVCTL2_LTR_EN) + SET_FIELD(dev->dev_caps, QED_RDMA_DEV_CAP_ATOMIC_OP, 1); + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + qed_iwarp_init_devinfo(p_hwfn); +} + +static void qed_rdma_init_port(struct qed_hwfn *p_hwfn) +{ + struct qed_rdma_port *port = p_hwfn->p_rdma_info->port; + struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev; + + port->port_state = p_hwfn->mcp_info->link_output.link_up ? 
+ QED_RDMA_PORT_UP : QED_RDMA_PORT_DOWN; + + port->max_msg_size = min_t(u64, + (dev->max_mr_mw_fmr_size * + p_hwfn->cdev->rdma_max_sge), + BIT(31)); + + port->pkey_bad_counter = 0; +} + +static int qed_rdma_init_hw(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + int rc = 0; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Initializing HW\n"); + p_hwfn->b_rdma_enabled_in_prs = false; + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + qed_iwarp_init_hw(p_hwfn, p_ptt); + else + rc = qed_roce_init_hw(p_hwfn, p_ptt); + + return rc; +} + +static int qed_rdma_start_fw(struct qed_hwfn *p_hwfn, + struct qed_rdma_start_in_params *params, + struct qed_ptt *p_ptt) +{ + struct rdma_init_func_ramrod_data *p_ramrod; + struct qed_rdma_cnq_params *p_cnq_pbl_list; + struct rdma_init_func_hdr *p_params_header; + struct rdma_cnq_params *p_cnq_params; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + u32 cnq_id, sb_id; + u16 igu_sb_id; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Starting FW\n"); + + /* Save the number of cnqs for the function close ramrod */ + p_hwfn->p_rdma_info->num_cnqs = params->desired_cnq; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, RDMA_RAMROD_FUNC_INIT, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) + return rc; + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) { + qed_iwarp_init_fw_ramrod(p_hwfn, + &p_ent->ramrod.iwarp_init_func); + p_ramrod = &p_ent->ramrod.iwarp_init_func.rdma; + } else { + p_ramrod = &p_ent->ramrod.roce_init_func.rdma; + } + + p_params_header = &p_ramrod->params_header; + p_params_header->cnq_start_offset = (u8)RESC_START(p_hwfn, + QED_RDMA_CNQ_RAM); + p_params_header->num_cnqs = params->desired_cnq; + + if (params->cq_mode == QED_RDMA_CQ_MODE_16_BITS) + p_params_header->cq_ring_mode = 1; + else + p_params_header->cq_ring_mode = 0; + + for (cnq_id = 0; cnq_id < params->desired_cnq; cnq_id++) { + sb_id = qed_rdma_get_sb_id(p_hwfn, cnq_id); + igu_sb_id = qed_get_igu_sb_id(p_hwfn, sb_id); + p_ramrod->cnq_params[cnq_id].sb_num = cpu_to_le16(igu_sb_id); + p_cnq_params = &p_ramrod->cnq_params[cnq_id]; + p_cnq_pbl_list = ¶ms->cnq_pbl_list[cnq_id]; + + p_cnq_params->sb_index = p_hwfn->pf_params.rdma_pf_params.gl_pi; + p_cnq_params->num_pbl_pages = p_cnq_pbl_list->num_pbl_pages; + + DMA_REGPAIR_LE(p_cnq_params->pbl_base_addr, + p_cnq_pbl_list->pbl_ptr); + + /* we assume here that cnq_id and qz_offset are the same */ + p_cnq_params->queue_zone_num = + cpu_to_le16(p_hwfn->p_rdma_info->queue_zone_base + + cnq_id); + } + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_rdma_alloc_tid(void *rdma_cxt, u32 *itid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID\n"); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, + &p_hwfn->p_rdma_info->tid_map, itid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + if (rc) + goto out; + + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_TASK, *itid); +out: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocate TID - done, rc = %d\n", rc); + return rc; +} + +static int qed_rdma_reserve_lkey(struct qed_hwfn *p_hwfn) +{ + struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev; + + /* Tid 0 will be used as the key for "reserved MR". + * The driver should allocate memory for it so it can be loaded but no + * ramrod should be passed on it. 
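+ * Since the TID bitmap is still empty at this point, the first
+ * qed_rdma_alloc_tid() call below is expected to hand out tid 0, and the
+ * check that follows verifies it indeed equals RDMA_RESERVED_LKEY.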
+ */ + qed_rdma_alloc_tid(p_hwfn, &dev->reserved_lkey); + if (dev->reserved_lkey != RDMA_RESERVED_LKEY) { + DP_NOTICE(p_hwfn, + "Reserved lkey should be equal to RDMA_RESERVED_LKEY\n"); + return -EINVAL; + } + + return 0; +} + +static int qed_rdma_setup(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_rdma_start_in_params *params) +{ + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "RDMA setup\n"); + + spin_lock_init(&p_hwfn->p_rdma_info->lock); + + qed_rdma_init_devinfo(p_hwfn, params); + qed_rdma_init_port(p_hwfn); + qed_rdma_init_events(p_hwfn, params); + + rc = qed_rdma_reserve_lkey(p_hwfn); + if (rc) + return rc; + + rc = qed_rdma_init_hw(p_hwfn, p_ptt); + if (rc) + return rc; + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) { + rc = qed_iwarp_setup(p_hwfn, p_ptt, params); + if (rc) + return rc; + } else { + rc = qed_roce_setup(p_hwfn); + if (rc) + return rc; + } + + return qed_rdma_start_fw(p_hwfn, params, p_ptt); +} + +int qed_rdma_stop(void *rdma_cxt) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct rdma_close_func_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + struct qed_ptt *p_ptt; + u32 ll2_ethertype_en; + int rc = -EBUSY; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "RDMA stop\n"); + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Failed to acquire PTT\n"); + return rc; + } + + /* Disable RoCE search */ + qed_wr(p_hwfn, p_ptt, p_hwfn->rdma_prs_search_reg, 0); + p_hwfn->b_rdma_enabled_in_prs = false; + + qed_wr(p_hwfn, p_ptt, PRS_REG_ROCE_DEST_QP_MAX_PF, 0); + + ll2_ethertype_en = qed_rd(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN); + + qed_wr(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN, + (ll2_ethertype_en & 0xFFFE)); + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) { + rc = qed_iwarp_stop(p_hwfn, p_ptt); + if (rc) { + qed_ptt_release(p_hwfn, p_ptt); + return rc; + } + } else { + qed_roce_stop(p_hwfn); + } + + qed_ptt_release(p_hwfn, p_ptt); + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + /* Stop RoCE */ + rc = qed_sp_init_request(p_hwfn, &p_ent, RDMA_RAMROD_FUNC_CLOSE, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) + goto out; + + p_ramrod = &p_ent->ramrod.rdma_close_func; + + p_ramrod->num_cnqs = p_hwfn->p_rdma_info->num_cnqs; + p_ramrod->cnq_start_offset = (u8)RESC_START(p_hwfn, QED_RDMA_CNQ_RAM); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + +out: + qed_rdma_free(p_hwfn); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "RDMA stop done, rc = %d\n", rc); + return rc; +} + +static int qed_rdma_add_user(void *rdma_cxt, + struct qed_rdma_add_user_out_params *out_params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + u32 dpi_start_offset; + u32 returned_id = 0; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Adding User\n"); + + /* Allocate DPI */ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, &p_hwfn->p_rdma_info->dpi_map, + &returned_id); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + + out_params->dpi = (u16)returned_id; + + /* Calculate the corresponding DPI address */ + dpi_start_offset = p_hwfn->dpi_start_offset; + + out_params->dpi_addr = (u64)((u8 __iomem *)p_hwfn->doorbells + + dpi_start_offset + + ((out_params->dpi) * p_hwfn->dpi_size)); + + out_params->dpi_phys_addr = p_hwfn->cdev->db_phys_addr + + dpi_start_offset + + ((out_params->dpi) * p_hwfn->dpi_size); + + out_params->dpi_size = p_hwfn->dpi_size; + 
out_params->wid_count = p_hwfn->wid_count; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Adding user - done, rc = %d\n", rc); + return rc; +} + +static struct qed_rdma_port *qed_rdma_query_port(void *rdma_cxt) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct qed_rdma_port *p_port = p_hwfn->p_rdma_info->port; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "RDMA Query port\n"); + + /* Link may have changed */ + p_port->port_state = p_hwfn->mcp_info->link_output.link_up ? + QED_RDMA_PORT_UP : QED_RDMA_PORT_DOWN; + + p_port->link_speed = p_hwfn->mcp_info->link_output.speed; + + p_port->max_msg_size = RDMA_MAX_DATA_SIZE_IN_WQE; + + return p_port; +} + +static struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Query device\n"); + + /* Return struct with device parameters */ + return p_hwfn->p_rdma_info->dev; +} + +static void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 qz_offset, u16 prod) +{ + struct qed_hwfn *p_hwfn; + u16 qz_num; + u32 addr; + + p_hwfn = (struct qed_hwfn *)rdma_cxt; + + if (qz_offset > p_hwfn->p_rdma_info->max_queue_zones) { + DP_NOTICE(p_hwfn, + "queue zone offset %d is too large (max is %d)\n", + qz_offset, p_hwfn->p_rdma_info->max_queue_zones); + return; + } + + qz_num = p_hwfn->p_rdma_info->queue_zone_base + qz_offset; + addr = GTT_BAR0_MAP_REG_USDM_RAM + + USTORM_COMMON_QUEUE_CONS_OFFSET(qz_num); + + REG_WR16(p_hwfn, addr, prod); + + /* keep prod updates ordered */ + wmb(); +} + +static int qed_fill_rdma_dev_info(struct qed_dev *cdev, + struct qed_dev_rdma_info *info) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + + memset(info, 0, sizeof(*info)); + + info->rdma_type = QED_IS_ROCE_PERSONALITY(p_hwfn) ? + QED_RDMA_TYPE_ROCE : QED_RDMA_TYPE_IWARP; + + info->user_dpm_enabled = (p_hwfn->db_bar_no_edpm == 0); + + qed_fill_dev_info(cdev, &info->common); + + return 0; +} + +static int qed_rdma_get_sb_start(struct qed_dev *cdev) +{ + int feat_num; + + if (cdev->num_hwfns > 1) + feat_num = FEAT_NUM(QED_LEADING_HWFN(cdev), QED_PF_L2_QUE); + else + feat_num = FEAT_NUM(QED_LEADING_HWFN(cdev), QED_PF_L2_QUE) * + cdev->num_hwfns; + + return feat_num; +} + +static int qed_rdma_get_min_cnq_msix(struct qed_dev *cdev) +{ + int n_cnq = FEAT_NUM(QED_LEADING_HWFN(cdev), QED_RDMA_CNQ); + int n_msix = cdev->int_params.rdma_msix_cnt; + + return min_t(int, n_cnq, n_msix); +} + +static int qed_rdma_set_int(struct qed_dev *cdev, u16 cnt) +{ + int limit = 0; + + /* Mark the fastpath as free/used */ + cdev->int_params.fp_initialized = cnt ? 
true : false; + + if (cdev->int_params.out.int_mode != QED_INT_MODE_MSIX) { + DP_ERR(cdev, + "qed roce supports only MSI-X interrupts (detected %d).\n", + cdev->int_params.out.int_mode); + return -EINVAL; + } else if (cdev->int_params.fp_msix_cnt) { + limit = cdev->int_params.rdma_msix_cnt; + } + + if (!limit) + return -ENOMEM; + + return min_t(int, cnt, limit); +} + +static int qed_rdma_get_int(struct qed_dev *cdev, struct qed_int_info *info) +{ + memset(info, 0, sizeof(*info)); + + if (!cdev->int_params.fp_initialized) { + DP_INFO(cdev, + "Protocol driver requested interrupt information, but its support is not yet configured\n"); + return -EINVAL; + } + + if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) { + int msix_base = cdev->int_params.rdma_msix_base; + + info->msix_cnt = cdev->int_params.rdma_msix_cnt; + info->msix = &cdev->int_params.msix_table[msix_base]; + + DP_VERBOSE(cdev, QED_MSG_RDMA, "msix_cnt = %d msix_base=%d\n", + info->msix_cnt, msix_base); + } + + return 0; +} + +static int qed_rdma_alloc_pd(void *rdma_cxt, u16 *pd) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + u32 returned_id; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Alloc PD\n"); + + /* Allocates an unused protection domain */ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, + &p_hwfn->p_rdma_info->pd_map, &returned_id); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + + *pd = (u16)returned_id; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Alloc PD - done, rc = %d\n", rc); + return rc; +} + +static void qed_rdma_free_pd(void *rdma_cxt, u16 pd) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "pd = %08x\n", pd); + + /* Returns a previously allocated protection domain for reuse */ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->pd_map, pd); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +static enum qed_rdma_toggle_bit +qed_rdma_toggle_bit_create_resize_cq(struct qed_hwfn *p_hwfn, u16 icid) +{ + struct qed_rdma_info *p_info = p_hwfn->p_rdma_info; + enum qed_rdma_toggle_bit toggle_bit; + u32 bmap_id; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", icid); + + /* the function toggle the bit that is related to a given icid + * and returns the new toggle bit's value + */ + bmap_id = icid - qed_cxt_get_proto_cid_start(p_hwfn, p_info->proto); + + spin_lock_bh(&p_info->lock); + toggle_bit = !test_and_change_bit(bmap_id, + p_info->toggle_bits.bitmap); + spin_unlock_bh(&p_info->lock); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "QED_RDMA_TOGGLE_BIT_= %d\n", + toggle_bit); + + return toggle_bit; +} + +static int qed_rdma_create_cq(void *rdma_cxt, + struct qed_rdma_create_cq_in_params *params, + u16 *icid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct qed_rdma_info *p_info = p_hwfn->p_rdma_info; + struct rdma_create_cq_ramrod_data *p_ramrod; + enum qed_rdma_toggle_bit toggle_bit; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + u32 returned_id, start_cid; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "cq_handle = %08x%08x\n", + params->cq_handle_hi, params->cq_handle_lo); + + /* Allocate icid */ + spin_lock_bh(&p_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, &p_info->cq_map, &returned_id); + spin_unlock_bh(&p_info->lock); + + if (rc) { + DP_NOTICE(p_hwfn, "Can't create CQ, rc = %d\n", rc); + return rc; + } + + start_cid = qed_cxt_get_proto_cid_start(p_hwfn, + p_info->proto); + *icid = returned_id + start_cid; + + /* Check if icid 
requires a page allocation */ + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_CXT, *icid); + if (rc) + goto err; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = *icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + /* Send create CQ ramrod */ + rc = qed_sp_init_request(p_hwfn, &p_ent, + RDMA_RAMROD_CREATE_CQ, + p_info->proto, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.rdma_create_cq; + + p_ramrod->cq_handle.hi = cpu_to_le32(params->cq_handle_hi); + p_ramrod->cq_handle.lo = cpu_to_le32(params->cq_handle_lo); + p_ramrod->dpi = cpu_to_le16(params->dpi); + p_ramrod->is_two_level_pbl = params->pbl_two_level; + p_ramrod->max_cqes = cpu_to_le32(params->cq_size); + DMA_REGPAIR_LE(p_ramrod->pbl_addr, params->pbl_ptr); + p_ramrod->pbl_num_pages = cpu_to_le16(params->pbl_num_pages); + p_ramrod->cnq_id = (u8)RESC_START(p_hwfn, QED_RDMA_CNQ_RAM) + + params->cnq_id; + p_ramrod->int_timeout = params->int_timeout; + + /* toggle the bit for every resize or create cq for a given icid */ + toggle_bit = qed_rdma_toggle_bit_create_resize_cq(p_hwfn, *icid); + + p_ramrod->toggle_bit = toggle_bit; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) { + /* restore toggle bit */ + qed_rdma_toggle_bit_create_resize_cq(p_hwfn, *icid); + goto err; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Created CQ, rc = %d\n", rc); + return rc; + +err: + /* release allocated icid */ + spin_lock_bh(&p_info->lock); + qed_bmap_release_id(p_hwfn, &p_info->cq_map, returned_id); + spin_unlock_bh(&p_info->lock); + DP_NOTICE(p_hwfn, "Create CQ failed, rc = %d\n", rc); + + return rc; +} + +static int +qed_rdma_destroy_cq(void *rdma_cxt, + struct qed_rdma_destroy_cq_in_params *in_params, + struct qed_rdma_destroy_cq_out_params *out_params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct rdma_destroy_cq_output_params *p_ramrod_res; + struct rdma_destroy_cq_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + dma_addr_t ramrod_res_phys; + enum protocol_type proto; + int rc = -ENOMEM; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", in_params->icid); + + p_ramrod_res = + (struct rdma_destroy_cq_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct rdma_destroy_cq_output_params), + &ramrod_res_phys, GFP_KERNEL); + if (!p_ramrod_res) { + DP_NOTICE(p_hwfn, + "qed destroy cq failed: cannot allocate memory (ramrod)\n"); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = in_params->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + proto = p_hwfn->p_rdma_info->proto; + /* Send destroy CQ ramrod */ + rc = qed_sp_init_request(p_hwfn, &p_ent, + RDMA_RAMROD_DESTROY_CQ, + proto, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.rdma_destroy_cq; + DMA_REGPAIR_LE(p_ramrod->output_params_addr, ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err; + + out_params->num_cq_notif = le16_to_cpu(p_ramrod_res->cnq_num); + + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct rdma_destroy_cq_output_params), + p_ramrod_res, ramrod_res_phys); + + /* Free icid */ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + + qed_bmap_release_id(p_hwfn, + &p_hwfn->p_rdma_info->cq_map, + (in_params->icid - + qed_cxt_get_proto_cid_start(p_hwfn, proto))); + + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + + DP_VERBOSE(p_hwfn, 
QED_MSG_RDMA, "Destroyed CQ, rc = %d\n", rc); + return rc; + +err: dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct rdma_destroy_cq_output_params), + p_ramrod_res, ramrod_res_phys); + + return rc; +} + +void qed_rdma_set_fw_mac(u16 *p_fw_mac, u8 *p_qed_mac) +{ + p_fw_mac[0] = cpu_to_le16((p_qed_mac[0] << 8) + p_qed_mac[1]); + p_fw_mac[1] = cpu_to_le16((p_qed_mac[2] << 8) + p_qed_mac[3]); + p_fw_mac[2] = cpu_to_le16((p_qed_mac[4] << 8) + p_qed_mac[5]); +} + +static int qed_rdma_query_qp(void *rdma_cxt, + struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *out_params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + int rc = 0; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + /* The following fields are filled in from qp and not FW as they can't + * be modified by FW + */ + out_params->mtu = qp->mtu; + out_params->dest_qp = qp->dest_qp; + out_params->incoming_atomic_en = qp->incoming_atomic_en; + out_params->e2e_flow_control_en = qp->e2e_flow_control_en; + out_params->incoming_rdma_read_en = qp->incoming_rdma_read_en; + out_params->incoming_rdma_write_en = qp->incoming_rdma_write_en; + out_params->dgid = qp->dgid; + out_params->flow_label = qp->flow_label; + out_params->hop_limit_ttl = qp->hop_limit_ttl; + out_params->traffic_class_tos = qp->traffic_class_tos; + out_params->timeout = qp->ack_timeout; + out_params->rnr_retry = qp->rnr_retry_cnt; + out_params->retry_cnt = qp->retry_cnt; + out_params->min_rnr_nak_timer = qp->min_rnr_nak_timer; + out_params->pkey_index = 0; + out_params->max_rd_atomic = qp->max_rd_atomic_req; + out_params->max_dest_rd_atomic = qp->max_rd_atomic_resp; + out_params->sqd_async = qp->sqd_async; + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + qed_iwarp_query_qp(qp, out_params); + else + rc = qed_roce_query_qp(p_hwfn, qp, out_params); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Query QP, rc = %d\n", rc); + return rc; +} + +static int qed_rdma_destroy_qp(void *rdma_cxt, struct qed_rdma_qp *qp) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + int rc = 0; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + rc = qed_iwarp_destroy_qp(p_hwfn, qp); + else + rc = qed_roce_destroy_qp(p_hwfn, qp); + + /* free qp params struct */ + kfree(qp); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "QP destroyed\n"); + return rc; +} + +static struct qed_rdma_qp * +qed_rdma_create_qp(void *rdma_cxt, + struct qed_rdma_create_qp_in_params *in_params, + struct qed_rdma_create_qp_out_params *out_params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct qed_rdma_qp *qp; + u8 max_stats_queues; + int rc; + + if (!rdma_cxt || !in_params || !out_params || !p_hwfn->p_rdma_info) { + DP_ERR(p_hwfn->cdev, + "qed roce create qp failed due to NULL entry (rdma_cxt=%p, in=%p, out=%p, roce_info=?\n", + rdma_cxt, in_params, out_params); + return NULL; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "qed rdma create qp called with qp_handle = %08x%08x\n", + in_params->qp_handle_hi, in_params->qp_handle_lo); + + /* Some sanity checks... */ + max_stats_queues = p_hwfn->p_rdma_info->dev->max_stats_queues; + if (in_params->stats_queue >= max_stats_queues) { + DP_ERR(p_hwfn->cdev, + "qed rdma create qp failed due to invalid statistics queue %d. 
maximum is %d\n", + in_params->stats_queue, max_stats_queues); + return NULL; + } + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) { + if (in_params->sq_num_pages * sizeof(struct regpair) > + IWARP_SHARED_QUEUE_PAGE_SQ_PBL_MAX_SIZE) { + DP_NOTICE(p_hwfn->cdev, + "Sq num pages: %d exceeds maximum\n", + in_params->sq_num_pages); + return NULL; + } + if (in_params->rq_num_pages * sizeof(struct regpair) > + IWARP_SHARED_QUEUE_PAGE_RQ_PBL_MAX_SIZE) { + DP_NOTICE(p_hwfn->cdev, + "Rq num pages: %d exceeds maximum\n", + in_params->rq_num_pages); + return NULL; + } + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return NULL; + + qp->cur_state = QED_ROCE_QP_STATE_RESET; + qp->qp_handle.hi = cpu_to_le32(in_params->qp_handle_hi); + qp->qp_handle.lo = cpu_to_le32(in_params->qp_handle_lo); + qp->qp_handle_async.hi = cpu_to_le32(in_params->qp_handle_async_hi); + qp->qp_handle_async.lo = cpu_to_le32(in_params->qp_handle_async_lo); + qp->use_srq = in_params->use_srq; + qp->signal_all = in_params->signal_all; + qp->fmr_and_reserved_lkey = in_params->fmr_and_reserved_lkey; + qp->pd = in_params->pd; + qp->dpi = in_params->dpi; + qp->sq_cq_id = in_params->sq_cq_id; + qp->sq_num_pages = in_params->sq_num_pages; + qp->sq_pbl_ptr = in_params->sq_pbl_ptr; + qp->rq_cq_id = in_params->rq_cq_id; + qp->rq_num_pages = in_params->rq_num_pages; + qp->rq_pbl_ptr = in_params->rq_pbl_ptr; + qp->srq_id = in_params->srq_id; + qp->req_offloaded = false; + qp->resp_offloaded = false; + qp->e2e_flow_control_en = qp->use_srq ? false : true; + qp->stats_queue = in_params->stats_queue; + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) { + rc = qed_iwarp_create_qp(p_hwfn, qp, out_params); + qp->qpid = qp->icid; + } else { + rc = qed_roce_alloc_cid(p_hwfn, &qp->icid); + qp->qpid = ((0xFF << 16) | qp->icid); + } + + if (rc) { + kfree(qp); + return NULL; + } + + out_params->icid = qp->icid; + out_params->qp_id = qp->qpid; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Create QP, rc = %d\n", rc); + return qp; +} + +static int qed_rdma_modify_qp(void *rdma_cxt, + struct qed_rdma_qp *qp, + struct qed_rdma_modify_qp_in_params *params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + enum qed_roce_qp_state prev_state; + int rc = 0; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x params->new_state=%d\n", + qp->icid, params->new_state); + + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + return rc; + } + + if (GET_FIELD(params->modify_flags, + QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN)) { + qp->incoming_rdma_read_en = params->incoming_rdma_read_en; + qp->incoming_rdma_write_en = params->incoming_rdma_write_en; + qp->incoming_atomic_en = params->incoming_atomic_en; + } + + /* Update QP structure with the updated values */ + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_ROCE_MODE)) + qp->roce_mode = params->roce_mode; + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_PKEY)) + qp->pkey = params->pkey; + if (GET_FIELD(params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_E2E_FLOW_CONTROL_EN)) + qp->e2e_flow_control_en = params->e2e_flow_control_en; + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_DEST_QP)) + qp->dest_qp = params->dest_qp; + if (GET_FIELD(params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR)) { + /* Indicates that the following parameters have changed: + * Traffic class, flow label, hop limit, source GID, + * destination GID, loopback indicator + */ + qp->traffic_class_tos = params->traffic_class_tos; + qp->flow_label = params->flow_label; + qp->hop_limit_ttl = 
params->hop_limit_ttl; + + qp->sgid = params->sgid; + qp->dgid = params->dgid; + qp->udp_src_port = 0; + qp->vlan_id = params->vlan_id; + qp->mtu = params->mtu; + qp->lb_indication = params->lb_indication; + memcpy((u8 *)&qp->remote_mac_addr[0], + (u8 *)&params->remote_mac_addr[0], ETH_ALEN); + if (params->use_local_mac) { + memcpy((u8 *)&qp->local_mac_addr[0], + (u8 *)&params->local_mac_addr[0], ETH_ALEN); + } else { + memcpy((u8 *)&qp->local_mac_addr[0], + (u8 *)&p_hwfn->hw_info.hw_mac_addr, ETH_ALEN); + } + } + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_RQ_PSN)) + qp->rq_psn = params->rq_psn; + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_SQ_PSN)) + qp->sq_psn = params->sq_psn; + if (GET_FIELD(params->modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ)) + qp->max_rd_atomic_req = params->max_rd_atomic_req; + if (GET_FIELD(params->modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP)) + qp->max_rd_atomic_resp = params->max_rd_atomic_resp; + if (GET_FIELD(params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT)) + qp->ack_timeout = params->ack_timeout; + if (GET_FIELD(params->modify_flags, QED_ROCE_MODIFY_QP_VALID_RETRY_CNT)) + qp->retry_cnt = params->retry_cnt; + if (GET_FIELD(params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT)) + qp->rnr_retry_cnt = params->rnr_retry_cnt; + if (GET_FIELD(params->modify_flags, + QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER)) + qp->min_rnr_nak_timer = params->min_rnr_nak_timer; + + qp->sqd_async = params->sqd_async; + + prev_state = qp->cur_state; + if (GET_FIELD(params->modify_flags, + QED_RDMA_MODIFY_QP_VALID_NEW_STATE)) { + qp->cur_state = params->new_state; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "qp->cur_state=%d\n", + qp->cur_state); + } + + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) { + enum qed_iwarp_qp_state new_state = + qed_roce2iwarp_state(qp->cur_state); + + rc = qed_iwarp_modify_qp(p_hwfn, qp, new_state, 0); + } else { + rc = qed_roce_modify_qp(p_hwfn, qp, prev_state, params); + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Modify QP, rc = %d\n", rc); + return rc; +} + +static int +qed_rdma_register_tid(void *rdma_cxt, + struct qed_rdma_register_tid_in_params *params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct rdma_register_tid_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + enum rdma_tid_type tid_type; + u8 fw_return_code; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", params->itid); + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, RDMA_RAMROD_REGISTER_MR, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + return rc; + } + + if (p_hwfn->p_rdma_info->last_tid < params->itid) + p_hwfn->p_rdma_info->last_tid = params->itid; + + p_ramrod = &p_ent->ramrod.rdma_register_tid; + + p_ramrod->flags = 0; + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_TWO_LEVEL_PBL, + params->pbl_two_level); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_ZERO_BASED, params->zbva); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_PHY_MR, params->phy_mr); + + /* Don't initialize D/C field, as it may override other bits.
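+ * The page-size log below is therefore only programmed for regular
+ * (non-FMR, non-DMA) memory regions.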
*/ + if (!(params->tid_type == QED_RDMA_TID_FMR) && !(params->dma_mr)) + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_PAGE_SIZE_LOG, + params->page_size_log - 12); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_READ, + params->remote_read); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_WRITE, + params->remote_write); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_ATOMIC, + params->remote_atomic); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_LOCAL_WRITE, + params->local_write); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_LOCAL_READ, params->local_read); + + SET_FIELD(p_ramrod->flags, + RDMA_REGISTER_TID_RAMROD_DATA_ENABLE_MW_BIND, + params->mw_bind); + + SET_FIELD(p_ramrod->flags1, + RDMA_REGISTER_TID_RAMROD_DATA_PBL_PAGE_SIZE_LOG, + params->pbl_page_size_log - 12); + + SET_FIELD(p_ramrod->flags2, + RDMA_REGISTER_TID_RAMROD_DATA_DMA_MR, params->dma_mr); + + switch (params->tid_type) { + case QED_RDMA_TID_REGISTERED_MR: + tid_type = RDMA_TID_REGISTERED_MR; + break; + case QED_RDMA_TID_FMR: + tid_type = RDMA_TID_FMR; + break; + case QED_RDMA_TID_MW_TYPE1: + tid_type = RDMA_TID_MW_TYPE1; + break; + case QED_RDMA_TID_MW_TYPE2A: + tid_type = RDMA_TID_MW_TYPE2A; + break; + default: + rc = -EINVAL; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + return rc; + } + SET_FIELD(p_ramrod->flags1, + RDMA_REGISTER_TID_RAMROD_DATA_TID_TYPE, tid_type); + + p_ramrod->itid = cpu_to_le32(params->itid); + p_ramrod->key = params->key; + p_ramrod->pd = cpu_to_le16(params->pd); + p_ramrod->length_hi = (u8)(params->length >> 32); + p_ramrod->length_lo = DMA_LO_LE(params->length); + if (params->zbva) { + /* Lower 32 bits of the registered MR address. 
+ * In case of zero based MR, will hold FBO + */ + p_ramrod->va.hi = 0; + p_ramrod->va.lo = cpu_to_le32(params->fbo); + } else { + DMA_REGPAIR_LE(p_ramrod->va, params->vaddr); + } + DMA_REGPAIR_LE(p_ramrod->pbl_base, params->pbl_ptr); + + /* DIF */ + if (params->dif_enabled) { + SET_FIELD(p_ramrod->flags2, + RDMA_REGISTER_TID_RAMROD_DATA_DIF_ON_HOST_FLG, 1); + DMA_REGPAIR_LE(p_ramrod->dif_error_addr, + params->dif_error_addr); + DMA_REGPAIR_LE(p_ramrod->dif_runt_addr, params->dif_runt_addr); + } + + rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code); + if (rc) + return rc; + + if (fw_return_code != RDMA_RETURN_OK) { + DP_NOTICE(p_hwfn, "fw_return_code = %d\n", fw_return_code); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Register TID, rc = %d\n", rc); + return rc; +} + +static int qed_rdma_deregister_tid(void *rdma_cxt, u32 itid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct rdma_deregister_tid_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + struct qed_ptt *p_ptt; + u8 fw_return_code; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", itid); + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, RDMA_RAMROD_DEREGISTER_MR, + p_hwfn->p_rdma_info->proto, &init_data); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + return rc; + } + + p_ramrod = &p_ent->ramrod.rdma_deregister_tid; + p_ramrod->itid = cpu_to_le32(itid); + + rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + return rc; + } + + if (fw_return_code == RDMA_RETURN_DEREGISTER_MR_BAD_STATE_ERR) { + DP_NOTICE(p_hwfn, "fw_return_code = %d\n", fw_return_code); + return -EINVAL; + } else if (fw_return_code == RDMA_RETURN_NIG_DRAIN_REQ) { + /* Bit indicating that the TID is in use and a nig drain is + * required before sending the ramrod again + */ + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + rc = -EBUSY; + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to acquire PTT\n"); + return rc; + } + + rc = qed_mcp_drain(p_hwfn, p_ptt); + if (rc) { + qed_ptt_release(p_hwfn, p_ptt); + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Drain failed\n"); + return rc; + } + + qed_ptt_release(p_hwfn, p_ptt); + + /* Resend the ramrod */ + rc = qed_sp_init_request(p_hwfn, &p_ent, + RDMA_RAMROD_DEREGISTER_MR, + p_hwfn->p_rdma_info->proto, + &init_data); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to init sp-element\n"); + return rc; + } + + rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Ramrod failed\n"); + return rc; + } + + if (fw_return_code != RDMA_RETURN_OK) { + DP_NOTICE(p_hwfn, "fw_return_code = %d\n", + fw_return_code); + return rc; + } + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "De-registered TID, rc = %d\n", rc); + return rc; +} + +static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev) +{ + return QED_LEADING_HWFN(cdev); +} + +bool qed_rdma_allocated_qps(struct qed_hwfn *p_hwfn) +{ + bool result; + + /* if rdma info has not been allocated, naturally there are no qps */ + if (!p_hwfn->p_rdma_info) + return false; + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + if (!p_hwfn->p_rdma_info->cid_map.bitmap) + result = false; + else + result = !qed_bmap_is_empty(&p_hwfn->p_rdma_info->cid_map); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); + return result; +} + +void 
qed_rdma_dpm_conf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 val; + + val = (p_hwfn->dcbx_no_edpm || p_hwfn->db_bar_no_edpm) ? 0 : 1; + + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_DPM_ENABLE, val); + DP_VERBOSE(p_hwfn, (QED_MSG_DCB | QED_MSG_RDMA), + "Changing DPM_EN state to %d (DCBX=%d, DB_BAR=%d)\n", + val, p_hwfn->dcbx_no_edpm, p_hwfn->db_bar_no_edpm); +} + + +void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + p_hwfn->db_bar_no_edpm = true; + + qed_rdma_dpm_conf(p_hwfn, p_ptt); +} + +static int qed_rdma_start(void *rdma_cxt, + struct qed_rdma_start_in_params *params) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + struct qed_ptt *p_ptt; + int rc = -EBUSY; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "desired_cnq = %08x\n", params->desired_cnq); + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + goto err; + + rc = qed_rdma_alloc(p_hwfn, p_ptt, params); + if (rc) + goto err1; + + rc = qed_rdma_setup(p_hwfn, p_ptt, params); + if (rc) + goto err2; + + qed_ptt_release(p_hwfn, p_ptt); + + return rc; + +err2: + qed_rdma_free(p_hwfn); +err1: + qed_ptt_release(p_hwfn, p_ptt); +err: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "RDMA start - error, rc = %d\n", rc); + return rc; +} + +static int qed_rdma_init(struct qed_dev *cdev, + struct qed_rdma_start_in_params *params) +{ + return qed_rdma_start(QED_LEADING_HWFN(cdev), params); +} + +static void qed_rdma_remove_user(void *rdma_cxt, u16 dpi) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "dpi = %08x\n", dpi); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->dpi_map, dpi); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +static int qed_roce_ll2_set_mac_filter(struct qed_dev *cdev, + u8 *old_mac_address, + u8 *new_mac_address) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt; + int rc = 0; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_ERR(cdev, + "qed roce ll2 mac filter set: failed to acquire PTT\n"); + return -EINVAL; + } + + if (old_mac_address) + qed_llh_remove_mac_filter(p_hwfn, p_ptt, old_mac_address); + if (new_mac_address) + rc = qed_llh_add_mac_filter(p_hwfn, p_ptt, new_mac_address); + + qed_ptt_release(p_hwfn, p_ptt); + + if (rc) + DP_ERR(cdev, + "qed roce ll2 mac filter set: failed to add MAC filter\n"); + + return rc; +} + +static const struct qed_rdma_ops qed_rdma_ops_pass = { + .common = &qed_common_ops_pass, + .fill_dev_info = &qed_fill_rdma_dev_info, + .rdma_get_rdma_ctx = &qed_rdma_get_rdma_ctx, + .rdma_init = &qed_rdma_init, + .rdma_add_user = &qed_rdma_add_user, + .rdma_remove_user = &qed_rdma_remove_user, + .rdma_stop = &qed_rdma_stop, + .rdma_query_port = &qed_rdma_query_port, + .rdma_query_device = &qed_rdma_query_device, + .rdma_get_start_sb = &qed_rdma_get_sb_start, + .rdma_get_rdma_int = &qed_rdma_get_int, + .rdma_set_rdma_int = &qed_rdma_set_int, + .rdma_get_min_cnq_msix = &qed_rdma_get_min_cnq_msix, + .rdma_cnq_prod_update = &qed_rdma_cnq_prod_update, + .rdma_alloc_pd = &qed_rdma_alloc_pd, + .rdma_dealloc_pd = &qed_rdma_free_pd, + .rdma_create_cq = &qed_rdma_create_cq, + .rdma_destroy_cq = &qed_rdma_destroy_cq, + .rdma_create_qp = &qed_rdma_create_qp, + .rdma_modify_qp = &qed_rdma_modify_qp, + .rdma_query_qp = &qed_rdma_query_qp, + .rdma_destroy_qp = &qed_rdma_destroy_qp, + .rdma_alloc_tid = &qed_rdma_alloc_tid, + .rdma_free_tid = &qed_rdma_free_tid, + .rdma_register_tid = &qed_rdma_register_tid, + .rdma_deregister_tid = &qed_rdma_deregister_tid, + 
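+ /* Light L2 (LL2) connection helpers re-exported to the upper RDMA driver */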
.ll2_acquire_connection = &qed_ll2_acquire_connection, + .ll2_establish_connection = &qed_ll2_establish_connection, + .ll2_terminate_connection = &qed_ll2_terminate_connection, + .ll2_release_connection = &qed_ll2_release_connection, + .ll2_post_rx_buffer = &qed_ll2_post_rx_buffer, + .ll2_prepare_tx_packet = &qed_ll2_prepare_tx_packet, + .ll2_set_fragment_of_tx_packet = &qed_ll2_set_fragment_of_tx_packet, + .ll2_set_mac_filter = &qed_roce_ll2_set_mac_filter, + .ll2_get_stats = &qed_ll2_get_stats, + .iwarp_connect = &qed_iwarp_connect, + .iwarp_create_listen = &qed_iwarp_create_listen, + .iwarp_destroy_listen = &qed_iwarp_destroy_listen, + .iwarp_accept = &qed_iwarp_accept, + .iwarp_reject = &qed_iwarp_reject, + .iwarp_send_rtr = &qed_iwarp_send_rtr, +}; + +const struct qed_rdma_ops *qed_get_rdma_ops(void) +{ + return &qed_rdma_ops_pass; +} +EXPORT_SYMBOL(qed_get_rdma_ops); diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.h b/drivers/net/ethernet/qlogic/qed/qed_rdma.h new file mode 100644 index 0000000..18ec9cb --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.h @@ -0,0 +1,206 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _QED_RDMA_H +#define _QED_RDMA_H +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_dev_api.h" +#include "qed_hsi.h" +#include "qed_iwarp.h" +#include "qed_roce.h" + +#define QED_RDMA_MAX_FMR (RDMA_MAX_TIDS) +#define QED_RDMA_MAX_P_KEY (1) +#define QED_RDMA_MAX_WQE (0x7FFF) +#define QED_RDMA_MAX_SRQ_WQE_ELEM (0x7FFF) +#define QED_RDMA_PAGE_SIZE_CAPS (0xFFFFF000) +#define QED_RDMA_ACK_DELAY (15) +#define QED_RDMA_MAX_MR_SIZE (0x10000000000ULL) +#define QED_RDMA_MAX_CQS (RDMA_MAX_CQS) +#define QED_RDMA_MAX_MRS (RDMA_MAX_TIDS) +/* Add 1 for header element */ +#define QED_RDMA_MAX_SRQ_ELEM_PER_WQE (RDMA_MAX_SGE_PER_RQ_WQE + 1) +#define QED_RDMA_MAX_SGE_PER_SRQ_WQE (RDMA_MAX_SGE_PER_RQ_WQE) +#define QED_RDMA_SRQ_WQE_ELEM_SIZE (16) +#define QED_RDMA_MAX_SRQS (32 * 1024) + +#define QED_RDMA_MAX_CQE_32_BIT (0x7FFFFFFF - 1) +#define QED_RDMA_MAX_CQE_16_BIT (0x7FFF - 1) + +enum qed_rdma_toggle_bit { + QED_RDMA_TOGGLE_BIT_CLEAR = 0, + QED_RDMA_TOGGLE_BIT_SET = 1 +}; + +#define QED_RDMA_MAX_BMAP_NAME (10) +struct qed_bmap { + unsigned long *bitmap; + u32 max_count; + char name[QED_RDMA_MAX_BMAP_NAME]; +}; + +struct qed_rdma_info { + /* spin lock to protect bitmaps */ + spinlock_t lock; + + struct qed_bmap cq_map; + struct qed_bmap pd_map; + struct qed_bmap tid_map; + struct qed_bmap qp_map; + struct qed_bmap srq_map; + struct qed_bmap cid_map; + struct qed_bmap tcp_cid_map; + struct qed_bmap real_cid_map; + struct qed_bmap dpi_map; + struct qed_bmap toggle_bits; + struct qed_rdma_events events; + struct qed_rdma_device *dev; + struct qed_rdma_port *port; + u32 last_tid; + u8 num_cnqs; + u32 num_qps; + u32 num_mrs; + u16 queue_zone_base; + u16 max_queue_zones; + enum protocol_type proto; + struct qed_iwarp_info iwarp; +}; + +struct qed_rdma_qp { + struct regpair qp_handle; + struct regpair qp_handle_async; + u32 qpid; + u16 icid; + enum qed_roce_qp_state cur_state; + enum qed_iwarp_qp_state iwarp_state; + bool use_srq; + bool signal_all; + bool fmr_and_reserved_lkey; + + bool incoming_rdma_read_en; + bool incoming_rdma_write_en; + bool incoming_atomic_en; + bool e2e_flow_control_en; + + u16 pd; + u16 pkey; + u32 dest_qp; + u16 mtu; + u16 srq_id; + u8 traffic_class_tos; + u8 hop_limit_ttl; + u16 dpi; + u32 flow_label; + bool lb_indication; + u16 vlan_id; + u32 ack_timeout; + u8 retry_cnt; + u8 rnr_retry_cnt; + u8 min_rnr_nak_timer; + bool sqd_async; + union qed_gid sgid; + union qed_gid dgid; + enum roce_mode roce_mode; + u16 udp_src_port; + u8 stats_queue; + + /* requeseter */ + u8 max_rd_atomic_req; + u32 sq_psn; + u16 sq_cq_id; + u16 sq_num_pages; + dma_addr_t sq_pbl_ptr; + void *orq; + dma_addr_t orq_phys_addr; + u8 orq_num_pages; + bool req_offloaded; + + /* responder */ + u8 max_rd_atomic_resp; + u32 rq_psn; + u16 rq_cq_id; + u16 rq_num_pages; + dma_addr_t rq_pbl_ptr; + void *irq; + dma_addr_t irq_phys_addr; + u8 irq_num_pages; + bool resp_offloaded; + u32 cq_prod; + + u8 remote_mac_addr[6]; + u8 local_mac_addr[6]; + + void *shared_queue; + dma_addr_t shared_queue_phys_addr; + struct qed_iwarp_ep *ep; +}; + +#if IS_ENABLED(CONFIG_QED_RDMA) +void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); +void qed_rdma_dpm_conf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); +#else +static inline void qed_rdma_dpm_conf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) {} +static inline void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) {} +#endif + +int +qed_rdma_bmap_alloc(struct qed_hwfn 
*p_hwfn, + struct qed_bmap *bmap, u32 max_count, char *name); + +void +qed_rdma_bmap_free(struct qed_hwfn *p_hwfn, struct qed_bmap *bmap, bool check); + +int +qed_rdma_bmap_alloc_id(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 *id_num); + +void +qed_bmap_set_id(struct qed_hwfn *p_hwfn, struct qed_bmap *bmap, u32 id_num); + +void +qed_bmap_release_id(struct qed_hwfn *p_hwfn, struct qed_bmap *bmap, u32 id_num); + +int +qed_bmap_test_id(struct qed_hwfn *p_hwfn, struct qed_bmap *bmap, u32 id_num); + +void qed_rdma_set_fw_mac(u16 *p_fw_mac, u8 *p_qed_mac); + +bool qed_rdma_allocated_qps(struct qed_hwfn *p_hwfn); +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h new file mode 100644 index 0000000..f712205 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h @@ -0,0 +1,1631 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef REG_ADDR_H +#define REG_ADDR_H + +#define CDU_REG_CID_ADDR_PARAMS_CONTEXT_SIZE_SHIFT \ + 0 + +#define CDU_REG_CID_ADDR_PARAMS_CONTEXT_SIZE ( \ + 0xfff << 0) + +#define CDU_REG_CID_ADDR_PARAMS_BLOCK_WASTE_SHIFT \ + 12 + +#define CDU_REG_CID_ADDR_PARAMS_BLOCK_WASTE ( \ + 0xfff << 12) + +#define CDU_REG_CID_ADDR_PARAMS_NCIB_SHIFT \ + 24 + +#define CDU_REG_CID_ADDR_PARAMS_NCIB ( \ + 0xff << 24) + +#define CDU_REG_SEGMENT0_PARAMS \ + 0x580904UL +#define CDU_REG_SEGMENT0_PARAMS_T0_NUM_TIDS_IN_BLOCK \ + (0xfff << 0) +#define CDU_REG_SEGMENT0_PARAMS_T0_NUM_TIDS_IN_BLOCK_SHIFT \ + 0 +#define CDU_REG_SEGMENT0_PARAMS_T0_TID_BLOCK_WASTE \ + (0xff << 16) +#define CDU_REG_SEGMENT0_PARAMS_T0_TID_BLOCK_WASTE_SHIFT \ + 16 +#define CDU_REG_SEGMENT0_PARAMS_T0_TID_SIZE \ + (0xff << 24) +#define CDU_REG_SEGMENT0_PARAMS_T0_TID_SIZE_SHIFT \ + 24 +#define CDU_REG_SEGMENT1_PARAMS \ + 0x580908UL +#define CDU_REG_SEGMENT1_PARAMS_T1_NUM_TIDS_IN_BLOCK \ + (0xfff << 0) +#define CDU_REG_SEGMENT1_PARAMS_T1_NUM_TIDS_IN_BLOCK_SHIFT \ + 0 +#define CDU_REG_SEGMENT1_PARAMS_T1_TID_BLOCK_WASTE \ + (0xff << 16) +#define CDU_REG_SEGMENT1_PARAMS_T1_TID_BLOCK_WASTE_SHIFT \ + 16 +#define CDU_REG_SEGMENT1_PARAMS_T1_TID_SIZE \ + (0xff << 24) +#define CDU_REG_SEGMENT1_PARAMS_T1_TID_SIZE_SHIFT \ + 24 + +#define XSDM_REG_OPERATION_GEN \ + 0xf80408UL +#define NIG_REG_RX_BRB_OUT_EN \ + 0x500e18UL +#define NIG_REG_STORM_OUT_EN \ + 0x500e08UL +#define PSWRQ2_REG_L2P_VALIDATE_VFID \ + 0x240c50UL +#define PGLUE_B_REG_USE_CLIENTID_IN_TAG \ + 0x2aae04UL +#define PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER \ + 0x2aa16cUL +#define PGLUE_B_REG_WAS_ERROR_VF_31_0_CLR \ + 0x2aa118UL +#define PSWHST_REG_ZONE_PERMISSION_TABLE \ + 0x2a0800UL +#define BAR0_MAP_REG_MSDM_RAM \ + 0x1d00000UL +#define BAR0_MAP_REG_USDM_RAM \ + 0x1d80000UL +#define BAR0_MAP_REG_PSDM_RAM \ + 0x1f00000UL +#define BAR0_MAP_REG_TSDM_RAM \ + 0x1c80000UL +#define BAR0_MAP_REG_XSDM_RAM \ + 0x1e00000UL +#define BAR0_MAP_REG_YSDM_RAM \ + 0x1e80000UL +#define NIG_REG_RX_LLH_BRB_GATE_DNTFWD_PERPF \ + 0x5011f4UL +#define PRS_REG_SEARCH_RESP_INITIATOR_TYPE \ + 0x1f0164UL +#define PRS_REG_SEARCH_TCP \ + 0x1f0400UL +#define PRS_REG_SEARCH_UDP \ + 0x1f0404UL +#define PRS_REG_SEARCH_FCOE \ + 0x1f0408UL +#define PRS_REG_SEARCH_ROCE \ + 0x1f040cUL +#define PRS_REG_SEARCH_OPENFLOW \ + 0x1f0434UL +#define PRS_REG_SEARCH_TAG1 \ + 0x1f0444UL +#define PRS_REG_SEARCH_TENANT_ID \ + 0x1f044cUL +#define PRS_REG_PKT_LEN_STAT_TAGS_NOT_COUNTED_FIRST \ + 0x1f0a0cUL +#define PRS_REG_SEARCH_TCP_FIRST_FRAG \ + 0x1f0410UL +#define TM_REG_PF_ENABLE_CONN \ + 0x2c043cUL +#define TM_REG_PF_ENABLE_TASK \ + 0x2c0444UL +#define TM_REG_PF_SCAN_ACTIVE_CONN \ + 0x2c04fcUL +#define TM_REG_PF_SCAN_ACTIVE_TASK \ + 0x2c0500UL +#define IGU_REG_LEADING_EDGE_LATCH \ + 0x18082cUL +#define IGU_REG_TRAILING_EDGE_LATCH \ + 0x180830UL +#define QM_REG_USG_CNT_PF_TX \ + 0x2f2eacUL +#define QM_REG_USG_CNT_PF_OTHER \ + 0x2f2eb0UL +#define DORQ_REG_PF_DB_ENABLE \ + 0x100508UL +#define DORQ_REG_VF_USAGE_CNT \ + 0x1009c4UL +#define QM_REG_PF_EN \ + 0x2f2ea4UL +#define TCFC_REG_WEAK_ENABLE_VF \ + 0x2d0704UL +#define TCFC_REG_STRONG_ENABLE_PF \ + 0x2d0708UL +#define TCFC_REG_STRONG_ENABLE_VF \ + 0x2d070cUL +#define CCFC_REG_WEAK_ENABLE_VF \ + 0x2e0704UL +#define CCFC_REG_STRONG_ENABLE_PF \ + 0x2e0708UL +#define PGLUE_B_REG_PGL_ADDR_88_F0_BB \ + 0x2aa404UL +#define PGLUE_B_REG_PGL_ADDR_8C_F0_BB \ + 0x2aa408UL +#define PGLUE_B_REG_PGL_ADDR_90_F0_BB \ + 0x2aa40cUL +#define PGLUE_B_REG_PGL_ADDR_94_F0_BB \ + 0x2aa410UL +#define 
PGLUE_B_REG_WAS_ERROR_PF_31_0_CLR \ + 0x2aa138UL +#define PGLUE_B_REG_INTERNAL_PFID_ENABLE_TARGET_READ \ + 0x2aa174UL +#define MISC_REG_GEN_PURP_CR0 \ + 0x008c80UL +#define MCP_REG_SCRATCH \ + 0xe20000UL +#define CNIG_REG_NW_PORT_MODE_BB_B0 \ + 0x218200UL +#define MISCS_REG_CHIP_NUM \ + 0x00976cUL +#define MISCS_REG_CHIP_REV \ + 0x009770UL +#define MISCS_REG_CMT_ENABLED_FOR_PAIR \ + 0x00971cUL +#define MISCS_REG_CHIP_TEST_REG \ + 0x009778UL +#define MISCS_REG_CHIP_METAL \ + 0x009774UL +#define MISCS_REG_FUNCTION_HIDE \ + 0x0096f0UL +#define BRB_REG_HEADER_SIZE \ + 0x340804UL +#define BTB_REG_HEADER_SIZE \ + 0xdb0804UL +#define CAU_REG_LONG_TIMEOUT_THRESHOLD \ + 0x1c0708UL +#define CCFC_REG_ACTIVITY_COUNTER \ + 0x2e8800UL +#define CCFC_REG_STRONG_ENABLE_VF \ + 0x2e070cUL +#define CDU_REG_CCFC_CTX_VALID0 \ + 0x580400UL +#define CDU_REG_CCFC_CTX_VALID1 \ + 0x580404UL +#define CDU_REG_TCFC_CTX_VALID0 \ + 0x580408UL +#define CDU_REG_CID_ADDR_PARAMS \ + 0x580900UL +#define DBG_REG_CLIENT_ENABLE \ + 0x010004UL +#define DMAE_REG_INIT \ + 0x00c000UL +#define DORQ_REG_IFEN \ + 0x100040UL +#define DORQ_REG_DB_DROP_REASON \ + 0x100a2cUL +#define DORQ_REG_DB_DROP_DETAILS \ + 0x100a24UL +#define DORQ_REG_DB_DROP_DETAILS_ADDRESS \ + 0x100a1cUL +#define GRC_REG_TIMEOUT_EN \ + 0x050404UL +#define GRC_REG_TIMEOUT_ATTN_ACCESS_VALID \ + 0x050054UL +#define GRC_REG_TIMEOUT_ATTN_ACCESS_DATA_0 \ + 0x05004cUL +#define GRC_REG_TIMEOUT_ATTN_ACCESS_DATA_1 \ + 0x050050UL +#define IGU_REG_BLOCK_CONFIGURATION \ + 0x180040UL +#define MCM_REG_INIT \ + 0x1200000UL +#define MCP2_REG_DBG_DWORD_ENABLE \ + 0x052404UL +#define MISC_REG_PORT_MODE \ + 0x008c00UL +#define MISCS_REG_CLK_100G_MODE \ + 0x009070UL +#define MSDM_REG_ENABLE_IN1 \ + 0xfc0004UL +#define MSEM_REG_ENABLE_IN \ + 0x1800004UL +#define NIG_REG_CM_HDR \ + 0x500840UL +#define NIG_REG_LLH_TAGMAC_DEF_PF_VECTOR \ + 0x50196cUL +#define NIG_REG_LLH_CLS_TYPE_DUALMODE \ + 0x501964UL +#define NIG_REG_LLH_FUNC_TAG_EN 0x5019b0UL +#define NIG_REG_LLH_FUNC_TAG_VALUE 0x5019d0UL +#define NIG_REG_LLH_FUNC_FILTER_VALUE \ + 0x501a00UL +#define NIG_REG_LLH_FUNC_FILTER_VALUE_SIZE \ + 32 +#define NIG_REG_LLH_FUNC_FILTER_EN \ + 0x501a80UL +#define NIG_REG_LLH_FUNC_FILTER_EN_SIZE \ + 16 +#define NIG_REG_LLH_FUNC_FILTER_MODE \ + 0x501ac0UL +#define NIG_REG_LLH_FUNC_FILTER_MODE_SIZE \ + 16 +#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE \ + 0x501b00UL +#define NIG_REG_LLH_FUNC_FILTER_PROTOCOL_TYPE_SIZE \ + 16 +#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL \ + 0x501b40UL +#define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_SIZE \ + 16 +#define NCSI_REG_CONFIG \ + 0x040200UL +#define PBF_REG_INIT \ + 0xd80000UL +#define PBF_REG_NUM_BLOCKS_ALLOCATED_PROD_VOQ0 \ + 0xd806c8UL +#define PBF_REG_NUM_BLOCKS_ALLOCATED_CONS_VOQ0 \ + 0xd806ccUL +#define PTU_REG_ATC_INIT_ARRAY \ + 0x560000UL +#define PCM_REG_INIT \ + 0x1100000UL +#define PGLUE_B_REG_ADMIN_PER_PF_REGION \ + 0x2a9000UL +#define PGLUE_B_REG_TX_ERR_WR_DETAILS2 \ + 0x2aa150UL +#define PGLUE_B_REG_TX_ERR_WR_ADD_31_0 \ + 0x2aa144UL +#define PGLUE_B_REG_TX_ERR_WR_ADD_63_32 \ + 0x2aa148UL +#define PGLUE_B_REG_TX_ERR_WR_DETAILS \ + 0x2aa14cUL +#define PGLUE_B_REG_TX_ERR_RD_ADD_31_0 \ + 0x2aa154UL +#define PGLUE_B_REG_TX_ERR_RD_ADD_63_32 \ + 0x2aa158UL +#define PGLUE_B_REG_TX_ERR_RD_DETAILS \ + 0x2aa15cUL +#define PGLUE_B_REG_TX_ERR_RD_DETAILS2 \ + 0x2aa160UL +#define PGLUE_B_REG_TX_ERR_WR_DETAILS_ICPL \ + 0x2aa164UL +#define PGLUE_B_REG_MASTER_ZLR_ERR_DETAILS \ + 0x2aa54cUL +#define PGLUE_B_REG_MASTER_ZLR_ERR_ADD_31_0 \ + 0x2aa544UL +#define 
PGLUE_B_REG_MASTER_ZLR_ERR_ADD_63_32 \ + 0x2aa548UL +#define PGLUE_B_REG_VF_ILT_ERR_ADD_31_0 \ + 0x2aae74UL +#define PGLUE_B_REG_VF_ILT_ERR_ADD_63_32 \ + 0x2aae78UL +#define PGLUE_B_REG_VF_ILT_ERR_DETAILS \ + 0x2aae7cUL +#define PGLUE_B_REG_VF_ILT_ERR_DETAILS2 \ + 0x2aae80UL +#define PGLUE_B_REG_LATCHED_ERRORS_CLR \ + 0x2aa3bcUL +#define PRM_REG_DISABLE_PRM \ + 0x230000UL +#define PRS_REG_SOFT_RST \ + 0x1f0000UL +#define PRS_REG_MSG_INFO \ + 0x1f0a1cUL +#define PRS_REG_ROCE_DEST_QP_MAX_PF \ + 0x1f0430UL +#define PRS_REG_USE_LIGHT_L2 \ + 0x1f096cUL +#define PSDM_REG_ENABLE_IN1 \ + 0xfa0004UL +#define PSEM_REG_ENABLE_IN \ + 0x1600004UL +#define PSWRQ_REG_DBG_SELECT \ + 0x280020UL +#define PSWRQ2_REG_CDUT_P_SIZE \ + 0x24000cUL +#define PSWRQ2_REG_ILT_MEMORY \ + 0x260000UL +#define PSWHST_REG_DISCARD_INTERNAL_WRITES \ + 0x2a0040UL +#define PSWHST2_REG_DBGSYN_ALMOST_FULL_THR \ + 0x29e050UL +#define PSWHST_REG_INCORRECT_ACCESS_VALID \ + 0x2a0070UL +#define PSWHST_REG_INCORRECT_ACCESS_ADDRESS \ + 0x2a0074UL +#define PSWHST_REG_INCORRECT_ACCESS_DATA \ + 0x2a0068UL +#define PSWHST_REG_INCORRECT_ACCESS_LENGTH \ + 0x2a006cUL +#define PSWRD_REG_DBG_SELECT \ + 0x29c040UL +#define PSWRD2_REG_CONF11 \ + 0x29d064UL +#define PSWWR_REG_USDM_FULL_TH \ + 0x29a040UL +#define PSWWR2_REG_CDU_FULL_TH2 \ + 0x29b040UL +#define QM_REG_MAXPQSIZE_0 \ + 0x2f0434UL +#define RSS_REG_RSS_INIT_EN \ + 0x238804UL +#define RDIF_REG_STOP_ON_ERROR \ + 0x300040UL +#define RDIF_REG_DEBUG_ERROR_INFO \ + 0x300400UL +#define RDIF_REG_DEBUG_ERROR_INFO_SIZE \ + 64 +#define SRC_REG_SOFT_RST \ + 0x23874cUL +#define TCFC_REG_ACTIVITY_COUNTER \ + 0x2d8800UL +#define TCM_REG_INIT \ + 0x1180000UL +#define TM_REG_PXP_READ_DATA_FIFO_INIT \ + 0x2c0014UL +#define TSDM_REG_ENABLE_IN1 \ + 0xfb0004UL +#define TSEM_REG_ENABLE_IN \ + 0x1700004UL +#define TDIF_REG_STOP_ON_ERROR \ + 0x310040UL +#define TDIF_REG_DEBUG_ERROR_INFO \ + 0x310400UL +#define TDIF_REG_DEBUG_ERROR_INFO_SIZE \ + 64 +#define UCM_REG_INIT \ + 0x1280000UL +#define UMAC_REG_IPG_HD_BKP_CNTL_BB_B0 \ + 0x051004UL +#define USDM_REG_ENABLE_IN1 \ + 0xfd0004UL +#define USEM_REG_ENABLE_IN \ + 0x1900004UL +#define XCM_REG_INIT \ + 0x1000000UL +#define XSDM_REG_ENABLE_IN1 \ + 0xf80004UL +#define XSEM_REG_ENABLE_IN \ + 0x1400004UL +#define YCM_REG_INIT \ + 0x1080000UL +#define YSDM_REG_ENABLE_IN1 \ + 0xf90004UL +#define YSEM_REG_ENABLE_IN \ + 0x1500004UL +#define XYLD_REG_SCBD_STRICT_PRIO \ + 0x4c0000UL +#define TMLD_REG_SCBD_STRICT_PRIO \ + 0x4d0000UL +#define MULD_REG_SCBD_STRICT_PRIO \ + 0x4e0000UL +#define YULD_REG_SCBD_STRICT_PRIO \ + 0x4c8000UL +#define MISC_REG_SHARED_MEM_ADDR \ + 0x008c20UL +#define DMAE_REG_GO_C0 \ + 0x00c048UL +#define DMAE_REG_GO_C1 \ + 0x00c04cUL +#define DMAE_REG_GO_C2 \ + 0x00c050UL +#define DMAE_REG_GO_C3 \ + 0x00c054UL +#define DMAE_REG_GO_C4 \ + 0x00c058UL +#define DMAE_REG_GO_C5 \ + 0x00c05cUL +#define DMAE_REG_GO_C6 \ + 0x00c060UL +#define DMAE_REG_GO_C7 \ + 0x00c064UL +#define DMAE_REG_GO_C8 \ + 0x00c068UL +#define DMAE_REG_GO_C9 \ + 0x00c06cUL +#define DMAE_REG_GO_C10 \ + 0x00c070UL +#define DMAE_REG_GO_C11 \ + 0x00c074UL +#define DMAE_REG_GO_C12 \ + 0x00c078UL +#define DMAE_REG_GO_C13 \ + 0x00c07cUL +#define DMAE_REG_GO_C14 \ + 0x00c080UL +#define DMAE_REG_GO_C15 \ + 0x00c084UL +#define DMAE_REG_GO_C16 \ + 0x00c088UL +#define DMAE_REG_GO_C17 \ + 0x00c08cUL +#define DMAE_REG_GO_C18 \ + 0x00c090UL +#define DMAE_REG_GO_C19 \ + 0x00c094UL +#define DMAE_REG_GO_C20 \ + 0x00c098UL +#define DMAE_REG_GO_C21 \ + 0x00c09cUL +#define DMAE_REG_GO_C22 \ + 0x00c0a0UL 
+#define DMAE_REG_GO_C23 \ + 0x00c0a4UL +#define DMAE_REG_GO_C24 \ + 0x00c0a8UL +#define DMAE_REG_GO_C25 \ + 0x00c0acUL +#define DMAE_REG_GO_C26 \ + 0x00c0b0UL +#define DMAE_REG_GO_C27 \ + 0x00c0b4UL +#define DMAE_REG_GO_C28 \ + 0x00c0b8UL +#define DMAE_REG_GO_C29 \ + 0x00c0bcUL +#define DMAE_REG_GO_C30 \ + 0x00c0c0UL +#define DMAE_REG_GO_C31 \ + 0x00c0c4UL +#define DMAE_REG_CMD_MEM \ + 0x00c800UL +#define QM_REG_MAXPQSIZETXSEL_0 \ + 0x2f0440UL +#define QM_REG_SDMCMDREADY \ + 0x2f1e10UL +#define QM_REG_SDMCMDADDR \ + 0x2f1e04UL +#define QM_REG_SDMCMDDATALSB \ + 0x2f1e08UL +#define QM_REG_SDMCMDDATAMSB \ + 0x2f1e0cUL +#define QM_REG_SDMCMDGO \ + 0x2f1e14UL +#define QM_REG_RLPFCRD \ + 0x2f4d80UL +#define QM_REG_RLPFINCVAL \ + 0x2f4c80UL +#define QM_REG_RLGLBLCRD \ + 0x2f4400UL +#define QM_REG_RLGLBLINCVAL \ + 0x2f3400UL +#define IGU_REG_ATTENTION_ENABLE \ + 0x18083cUL +#define IGU_REG_ATTN_MSG_ADDR_L \ + 0x180820UL +#define IGU_REG_ATTN_MSG_ADDR_H \ + 0x180824UL +#define MISC_REG_AEU_GENERAL_ATTN_0 \ + 0x008400UL +#define CAU_REG_SB_ADDR_MEMORY \ + 0x1c8000UL +#define CAU_REG_SB_VAR_MEMORY \ + 0x1c6000UL +#define CAU_REG_PI_MEMORY \ + 0x1d0000UL +#define IGU_REG_PF_CONFIGURATION \ + 0x180800UL +#define IGU_REG_VF_CONFIGURATION \ + 0x180804UL +#define MISC_REG_AEU_ENABLE1_IGU_OUT_0 \ + 0x00849cUL +#define MISC_REG_AEU_AFTER_INVERT_1_IGU \ + 0x0087b4UL +#define MISC_REG_AEU_MASK_ATTN_IGU \ + 0x008494UL +#define IGU_REG_CLEANUP_STATUS_0 \ + 0x180980UL +#define IGU_REG_CLEANUP_STATUS_1 \ + 0x180a00UL +#define IGU_REG_CLEANUP_STATUS_2 \ + 0x180a80UL +#define IGU_REG_CLEANUP_STATUS_3 \ + 0x180b00UL +#define IGU_REG_CLEANUP_STATUS_4 \ + 0x180b80UL +#define IGU_REG_COMMAND_REG_32LSB_DATA \ + 0x180840UL +#define IGU_REG_COMMAND_REG_CTRL \ + 0x180848UL +#define IGU_REG_BLOCK_CONFIGURATION_VF_CLEANUP_EN ( \ + 0x1 << 1) +#define IGU_REG_BLOCK_CONFIGURATION_PXP_TPH_INTERFACE_EN ( \ + 0x1 << 0) +#define IGU_REG_MAPPING_MEMORY \ + 0x184000UL +#define IGU_REG_STATISTIC_NUM_VF_MSG_SENT \ + 0x180408UL +#define IGU_REG_WRITE_DONE_PENDING \ + 0x180900UL +#define MISCS_REG_GENERIC_POR_0 \ + 0x0096d4UL +#define MCP_REG_NVM_CFG4 \ + 0xe0642cUL +#define MCP_REG_NVM_CFG4_FLASH_SIZE ( \ + 0x7 << 0) +#define MCP_REG_NVM_CFG4_FLASH_SIZE_SHIFT \ + 0 +#define MCP_REG_CPU_STATE \ + 0xe05004UL +#define MCP_REG_CPU_EVENT_MASK \ + 0xe05008UL +#define PGLUE_B_REG_PF_BAR0_SIZE \ + 0x2aae60UL +#define PGLUE_B_REG_PF_BAR1_SIZE \ + 0x2aae64UL +#define PGLUE_B_REG_VF_BAR1_SIZE 0x2aae68UL +#define PRS_REG_ENCAPSULATION_TYPE_EN 0x1f0730UL +#define PRS_REG_GRE_PROTOCOL 0x1f0734UL +#define PRS_REG_VXLAN_PORT 0x1f0738UL +#define PRS_REG_OUTPUT_FORMAT_4_0_BB_K2 0x1f099cUL +#define NIG_REG_ENC_TYPE_ENABLE 0x501058UL + +#define NIG_REG_ENC_TYPE_ENABLE_ETH_OVER_GRE_ENABLE (0x1 << 0) +#define NIG_REG_ENC_TYPE_ENABLE_ETH_OVER_GRE_ENABLE_SHIFT 0 +#define NIG_REG_ENC_TYPE_ENABLE_IP_OVER_GRE_ENABLE (0x1 << 1) +#define NIG_REG_ENC_TYPE_ENABLE_IP_OVER_GRE_ENABLE_SHIFT 1 +#define NIG_REG_ENC_TYPE_ENABLE_VXLAN_ENABLE (0x1 << 2) +#define NIG_REG_ENC_TYPE_ENABLE_VXLAN_ENABLE_SHIFT 2 + +#define NIG_REG_VXLAN_CTRL 0x50105cUL +#define PBF_REG_VXLAN_PORT 0xd80518UL +#define PBF_REG_NGE_PORT 0xd8051cUL +#define PRS_REG_NGE_PORT 0x1f086cUL +#define NIG_REG_NGE_PORT 0x508b38UL + +#define DORQ_REG_L2_EDPM_TUNNEL_GRE_ETH_EN 0x10090cUL +#define DORQ_REG_L2_EDPM_TUNNEL_GRE_IP_EN 0x100910UL +#define DORQ_REG_L2_EDPM_TUNNEL_VXLAN_EN 0x100914UL +#define DORQ_REG_L2_EDPM_TUNNEL_NGE_IP_EN_K2_E5 0x10092cUL +#define DORQ_REG_L2_EDPM_TUNNEL_NGE_ETH_EN_K2_E5 0x100930UL 
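+/* The offsets in this file are absolute GRC addresses and are normally
+ * accessed through the qed register helpers rather than dereferenced
+ * directly.  A minimal sketch of the read-modify-write pattern used by
+ * qed_rdma_stop() above (assuming a PTT window was already acquired via
+ * qed_ptt_acquire()):
+ *
+ *	u32 val = qed_rd(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN);
+ *	qed_wr(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN, val & 0xFFFE);
+ */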
+ +#define NIG_REG_NGE_IP_ENABLE 0x508b28UL +#define NIG_REG_NGE_ETH_ENABLE 0x508b2cUL +#define NIG_REG_NGE_COMP_VER 0x508b30UL +#define PBF_REG_NGE_COMP_VER 0xd80524UL +#define PRS_REG_NGE_COMP_VER 0x1f0878UL + +#define QM_REG_WFQPFWEIGHT 0x2f4e80UL +#define QM_REG_WFQVPWEIGHT 0x2fa000UL + +#define PGLCS_REG_DBG_SELECT_K2_E5 \ + 0x001d14UL +#define PGLCS_REG_DBG_DWORD_ENABLE_K2_E5 \ + 0x001d18UL +#define PGLCS_REG_DBG_SHIFT_K2_E5 \ + 0x001d1cUL +#define PGLCS_REG_DBG_FORCE_VALID_K2_E5 \ + 0x001d20UL +#define PGLCS_REG_DBG_FORCE_FRAME_K2_E5 \ + 0x001d24UL +#define MISC_REG_RESET_PL_PDA_VMAIN_1 \ + 0x008070UL +#define MISC_REG_RESET_PL_PDA_VMAIN_2 \ + 0x008080UL +#define MISC_REG_RESET_PL_PDA_VAUX \ + 0x008090UL +#define MISCS_REG_RESET_PL_UA \ + 0x009050UL +#define MISCS_REG_RESET_PL_HV \ + 0x009060UL +#define MISCS_REG_RESET_PL_HV_2_K2_E5 \ + 0x009150UL +#define DMAE_REG_DBG_SELECT \ + 0x00c510UL +#define DMAE_REG_DBG_DWORD_ENABLE \ + 0x00c514UL +#define DMAE_REG_DBG_SHIFT \ + 0x00c518UL +#define DMAE_REG_DBG_FORCE_VALID \ + 0x00c51cUL +#define DMAE_REG_DBG_FORCE_FRAME \ + 0x00c520UL +#define NCSI_REG_DBG_SELECT \ + 0x040474UL +#define NCSI_REG_DBG_DWORD_ENABLE \ + 0x040478UL +#define NCSI_REG_DBG_SHIFT \ + 0x04047cUL +#define NCSI_REG_DBG_FORCE_VALID \ + 0x040480UL +#define NCSI_REG_DBG_FORCE_FRAME \ + 0x040484UL +#define GRC_REG_DBG_SELECT \ + 0x0500a4UL +#define GRC_REG_DBG_DWORD_ENABLE \ + 0x0500a8UL +#define GRC_REG_DBG_SHIFT \ + 0x0500acUL +#define GRC_REG_DBG_FORCE_VALID \ + 0x0500b0UL +#define GRC_REG_DBG_FORCE_FRAME \ + 0x0500b4UL +#define UMAC_REG_DBG_SELECT_K2_E5 \ + 0x051094UL +#define UMAC_REG_DBG_DWORD_ENABLE_K2_E5 \ + 0x051098UL +#define UMAC_REG_DBG_SHIFT_K2_E5 \ + 0x05109cUL +#define UMAC_REG_DBG_FORCE_VALID_K2_E5 \ + 0x0510a0UL +#define UMAC_REG_DBG_FORCE_FRAME_K2_E5 \ + 0x0510a4UL +#define MCP2_REG_DBG_SELECT \ + 0x052400UL +#define MCP2_REG_DBG_DWORD_ENABLE \ + 0x052404UL +#define MCP2_REG_DBG_SHIFT \ + 0x052408UL +#define MCP2_REG_DBG_FORCE_VALID \ + 0x052440UL +#define MCP2_REG_DBG_FORCE_FRAME \ + 0x052444UL +#define PCIE_REG_DBG_SELECT \ + 0x0547e8UL +#define PCIE_REG_DBG_DWORD_ENABLE \ + 0x0547ecUL +#define PCIE_REG_DBG_SHIFT \ + 0x0547f0UL +#define PCIE_REG_DBG_FORCE_VALID \ + 0x0547f4UL +#define PCIE_REG_DBG_FORCE_FRAME \ + 0x0547f8UL +#define DORQ_REG_DBG_SELECT \ + 0x100ad0UL +#define DORQ_REG_DBG_DWORD_ENABLE \ + 0x100ad4UL +#define DORQ_REG_DBG_SHIFT \ + 0x100ad8UL +#define DORQ_REG_DBG_FORCE_VALID \ + 0x100adcUL +#define DORQ_REG_DBG_FORCE_FRAME \ + 0x100ae0UL +#define IGU_REG_DBG_SELECT \ + 0x181578UL +#define IGU_REG_DBG_DWORD_ENABLE \ + 0x18157cUL +#define IGU_REG_DBG_SHIFT \ + 0x181580UL +#define IGU_REG_DBG_FORCE_VALID \ + 0x181584UL +#define IGU_REG_DBG_FORCE_FRAME \ + 0x181588UL +#define CAU_REG_DBG_SELECT \ + 0x1c0ea8UL +#define CAU_REG_DBG_DWORD_ENABLE \ + 0x1c0eacUL +#define CAU_REG_DBG_SHIFT \ + 0x1c0eb0UL +#define CAU_REG_DBG_FORCE_VALID \ + 0x1c0eb4UL +#define CAU_REG_DBG_FORCE_FRAME \ + 0x1c0eb8UL +#define PRS_REG_DBG_SELECT \ + 0x1f0b6cUL +#define PRS_REG_DBG_DWORD_ENABLE \ + 0x1f0b70UL +#define PRS_REG_DBG_SHIFT \ + 0x1f0b74UL +#define PRS_REG_DBG_FORCE_VALID \ + 0x1f0ba0UL +#define PRS_REG_DBG_FORCE_FRAME \ + 0x1f0ba4UL +#define CNIG_REG_DBG_SELECT_K2_E5 \ + 0x218254UL +#define CNIG_REG_DBG_DWORD_ENABLE_K2_E5 \ + 0x218258UL +#define CNIG_REG_DBG_SHIFT_K2_E5 \ + 0x21825cUL +#define CNIG_REG_DBG_FORCE_VALID_K2_E5 \ + 0x218260UL +#define CNIG_REG_DBG_FORCE_FRAME_K2_E5 \ + 0x218264UL +#define PRM_REG_DBG_SELECT \ + 0x2306a8UL +#define 
PRM_REG_DBG_DWORD_ENABLE \ + 0x2306acUL +#define PRM_REG_DBG_SHIFT \ + 0x2306b0UL +#define PRM_REG_DBG_FORCE_VALID \ + 0x2306b4UL +#define PRM_REG_DBG_FORCE_FRAME \ + 0x2306b8UL +#define SRC_REG_DBG_SELECT \ + 0x238700UL +#define SRC_REG_DBG_DWORD_ENABLE \ + 0x238704UL +#define SRC_REG_DBG_SHIFT \ + 0x238708UL +#define SRC_REG_DBG_FORCE_VALID \ + 0x23870cUL +#define SRC_REG_DBG_FORCE_FRAME \ + 0x238710UL +#define RSS_REG_DBG_SELECT \ + 0x238c4cUL +#define RSS_REG_DBG_DWORD_ENABLE \ + 0x238c50UL +#define RSS_REG_DBG_SHIFT \ + 0x238c54UL +#define RSS_REG_DBG_FORCE_VALID \ + 0x238c58UL +#define RSS_REG_DBG_FORCE_FRAME \ + 0x238c5cUL +#define RPB_REG_DBG_SELECT \ + 0x23c728UL +#define RPB_REG_DBG_DWORD_ENABLE \ + 0x23c72cUL +#define RPB_REG_DBG_SHIFT \ + 0x23c730UL +#define RPB_REG_DBG_FORCE_VALID \ + 0x23c734UL +#define RPB_REG_DBG_FORCE_FRAME \ + 0x23c738UL +#define PSWRQ2_REG_DBG_SELECT \ + 0x240100UL +#define PSWRQ2_REG_DBG_DWORD_ENABLE \ + 0x240104UL +#define PSWRQ2_REG_DBG_SHIFT \ + 0x240108UL +#define PSWRQ2_REG_DBG_FORCE_VALID \ + 0x24010cUL +#define PSWRQ2_REG_DBG_FORCE_FRAME \ + 0x240110UL +#define PSWRQ_REG_DBG_SELECT \ + 0x280020UL +#define PSWRQ_REG_DBG_DWORD_ENABLE \ + 0x280024UL +#define PSWRQ_REG_DBG_SHIFT \ + 0x280028UL +#define PSWRQ_REG_DBG_FORCE_VALID \ + 0x28002cUL +#define PSWRQ_REG_DBG_FORCE_FRAME \ + 0x280030UL +#define PSWWR_REG_DBG_SELECT \ + 0x29a084UL +#define PSWWR_REG_DBG_DWORD_ENABLE \ + 0x29a088UL +#define PSWWR_REG_DBG_SHIFT \ + 0x29a08cUL +#define PSWWR_REG_DBG_FORCE_VALID \ + 0x29a090UL +#define PSWWR_REG_DBG_FORCE_FRAME \ + 0x29a094UL +#define PSWRD_REG_DBG_SELECT \ + 0x29c040UL +#define PSWRD_REG_DBG_DWORD_ENABLE \ + 0x29c044UL +#define PSWRD_REG_DBG_SHIFT \ + 0x29c048UL +#define PSWRD_REG_DBG_FORCE_VALID \ + 0x29c04cUL +#define PSWRD_REG_DBG_FORCE_FRAME \ + 0x29c050UL +#define PSWRD2_REG_DBG_SELECT \ + 0x29d400UL +#define PSWRD2_REG_DBG_DWORD_ENABLE \ + 0x29d404UL +#define PSWRD2_REG_DBG_SHIFT \ + 0x29d408UL +#define PSWRD2_REG_DBG_FORCE_VALID \ + 0x29d40cUL +#define PSWRD2_REG_DBG_FORCE_FRAME \ + 0x29d410UL +#define PSWHST2_REG_DBG_SELECT \ + 0x29e058UL +#define PSWHST2_REG_DBG_DWORD_ENABLE \ + 0x29e05cUL +#define PSWHST2_REG_DBG_SHIFT \ + 0x29e060UL +#define PSWHST2_REG_DBG_FORCE_VALID \ + 0x29e064UL +#define PSWHST2_REG_DBG_FORCE_FRAME \ + 0x29e068UL +#define PSWHST_REG_DBG_SELECT \ + 0x2a0100UL +#define PSWHST_REG_DBG_DWORD_ENABLE \ + 0x2a0104UL +#define PSWHST_REG_DBG_SHIFT \ + 0x2a0108UL +#define PSWHST_REG_DBG_FORCE_VALID \ + 0x2a010cUL +#define PSWHST_REG_DBG_FORCE_FRAME \ + 0x2a0110UL +#define PGLUE_B_REG_DBG_SELECT \ + 0x2a8400UL +#define PGLUE_B_REG_DBG_DWORD_ENABLE \ + 0x2a8404UL +#define PGLUE_B_REG_DBG_SHIFT \ + 0x2a8408UL +#define PGLUE_B_REG_DBG_FORCE_VALID \ + 0x2a840cUL +#define PGLUE_B_REG_DBG_FORCE_FRAME \ + 0x2a8410UL +#define TM_REG_DBG_SELECT \ + 0x2c07a8UL +#define TM_REG_DBG_DWORD_ENABLE \ + 0x2c07acUL +#define TM_REG_DBG_SHIFT \ + 0x2c07b0UL +#define TM_REG_DBG_FORCE_VALID \ + 0x2c07b4UL +#define TM_REG_DBG_FORCE_FRAME \ + 0x2c07b8UL +#define TCFC_REG_DBG_SELECT \ + 0x2d0500UL +#define TCFC_REG_DBG_DWORD_ENABLE \ + 0x2d0504UL +#define TCFC_REG_DBG_SHIFT \ + 0x2d0508UL +#define TCFC_REG_DBG_FORCE_VALID \ + 0x2d050cUL +#define TCFC_REG_DBG_FORCE_FRAME \ + 0x2d0510UL +#define CCFC_REG_DBG_SELECT \ + 0x2e0500UL +#define CCFC_REG_DBG_DWORD_ENABLE \ + 0x2e0504UL +#define CCFC_REG_DBG_SHIFT \ + 0x2e0508UL +#define CCFC_REG_DBG_FORCE_VALID \ + 0x2e050cUL +#define CCFC_REG_DBG_FORCE_FRAME \ + 0x2e0510UL +#define QM_REG_DBG_SELECT \ + 
0x2f2e74UL +#define QM_REG_DBG_DWORD_ENABLE \ + 0x2f2e78UL +#define QM_REG_DBG_SHIFT \ + 0x2f2e7cUL +#define QM_REG_DBG_FORCE_VALID \ + 0x2f2e80UL +#define QM_REG_DBG_FORCE_FRAME \ + 0x2f2e84UL +#define RDIF_REG_DBG_SELECT \ + 0x300500UL +#define RDIF_REG_DBG_DWORD_ENABLE \ + 0x300504UL +#define RDIF_REG_DBG_SHIFT \ + 0x300508UL +#define RDIF_REG_DBG_FORCE_VALID \ + 0x30050cUL +#define RDIF_REG_DBG_FORCE_FRAME \ + 0x300510UL +#define TDIF_REG_DBG_SELECT \ + 0x310500UL +#define TDIF_REG_DBG_DWORD_ENABLE \ + 0x310504UL +#define TDIF_REG_DBG_SHIFT \ + 0x310508UL +#define TDIF_REG_DBG_FORCE_VALID \ + 0x31050cUL +#define TDIF_REG_DBG_FORCE_FRAME \ + 0x310510UL +#define BRB_REG_DBG_SELECT \ + 0x340ed0UL +#define BRB_REG_DBG_DWORD_ENABLE \ + 0x340ed4UL +#define BRB_REG_DBG_SHIFT \ + 0x340ed8UL +#define BRB_REG_DBG_FORCE_VALID \ + 0x340edcUL +#define BRB_REG_DBG_FORCE_FRAME \ + 0x340ee0UL +#define XYLD_REG_DBG_SELECT \ + 0x4c1600UL +#define XYLD_REG_DBG_DWORD_ENABLE \ + 0x4c1604UL +#define XYLD_REG_DBG_SHIFT \ + 0x4c1608UL +#define XYLD_REG_DBG_FORCE_VALID \ + 0x4c160cUL +#define XYLD_REG_DBG_FORCE_FRAME \ + 0x4c1610UL +#define YULD_REG_DBG_SELECT_BB_K2 \ + 0x4c9600UL +#define YULD_REG_DBG_DWORD_ENABLE_BB_K2 \ + 0x4c9604UL +#define YULD_REG_DBG_SHIFT_BB_K2 \ + 0x4c9608UL +#define YULD_REG_DBG_FORCE_VALID_BB_K2 \ + 0x4c960cUL +#define YULD_REG_DBG_FORCE_FRAME_BB_K2 \ + 0x4c9610UL +#define TMLD_REG_DBG_SELECT \ + 0x4d1600UL +#define TMLD_REG_DBG_DWORD_ENABLE \ + 0x4d1604UL +#define TMLD_REG_DBG_SHIFT \ + 0x4d1608UL +#define TMLD_REG_DBG_FORCE_VALID \ + 0x4d160cUL +#define TMLD_REG_DBG_FORCE_FRAME \ + 0x4d1610UL +#define MULD_REG_DBG_SELECT \ + 0x4e1600UL +#define MULD_REG_DBG_DWORD_ENABLE \ + 0x4e1604UL +#define MULD_REG_DBG_SHIFT \ + 0x4e1608UL +#define MULD_REG_DBG_FORCE_VALID \ + 0x4e160cUL +#define MULD_REG_DBG_FORCE_FRAME \ + 0x4e1610UL +#define NIG_REG_DBG_SELECT \ + 0x502140UL +#define NIG_REG_DBG_DWORD_ENABLE \ + 0x502144UL +#define NIG_REG_DBG_SHIFT \ + 0x502148UL +#define NIG_REG_DBG_FORCE_VALID \ + 0x50214cUL +#define NIG_REG_DBG_FORCE_FRAME \ + 0x502150UL +#define BMB_REG_DBG_SELECT \ + 0x540a7cUL +#define BMB_REG_DBG_DWORD_ENABLE \ + 0x540a80UL +#define BMB_REG_DBG_SHIFT \ + 0x540a84UL +#define BMB_REG_DBG_FORCE_VALID \ + 0x540a88UL +#define BMB_REG_DBG_FORCE_FRAME \ + 0x540a8cUL +#define PTU_REG_DBG_SELECT \ + 0x560100UL +#define PTU_REG_DBG_DWORD_ENABLE \ + 0x560104UL +#define PTU_REG_DBG_SHIFT \ + 0x560108UL +#define PTU_REG_DBG_FORCE_VALID \ + 0x56010cUL +#define PTU_REG_DBG_FORCE_FRAME \ + 0x560110UL +#define CDU_REG_DBG_SELECT \ + 0x580704UL +#define CDU_REG_DBG_DWORD_ENABLE \ + 0x580708UL +#define CDU_REG_DBG_SHIFT \ + 0x58070cUL +#define CDU_REG_DBG_FORCE_VALID \ + 0x580710UL +#define CDU_REG_DBG_FORCE_FRAME \ + 0x580714UL +#define WOL_REG_DBG_SELECT_K2_E5 \ + 0x600140UL +#define WOL_REG_DBG_DWORD_ENABLE_K2_E5 \ + 0x600144UL +#define WOL_REG_DBG_SHIFT_K2_E5 \ + 0x600148UL +#define WOL_REG_DBG_FORCE_VALID_K2_E5 \ + 0x60014cUL +#define WOL_REG_DBG_FORCE_FRAME_K2_E5 \ + 0x600150UL +#define BMBN_REG_DBG_SELECT_K2_E5 \ + 0x610140UL +#define BMBN_REG_DBG_DWORD_ENABLE_K2_E5 \ + 0x610144UL +#define BMBN_REG_DBG_SHIFT_K2_E5 \ + 0x610148UL +#define BMBN_REG_DBG_FORCE_VALID_K2_E5 \ + 0x61014cUL +#define BMBN_REG_DBG_FORCE_FRAME_K2_E5 \ + 0x610150UL +#define NWM_REG_DBG_SELECT_K2_E5 \ + 0x8000ecUL +#define NWM_REG_DBG_DWORD_ENABLE_K2_E5 \ + 0x8000f0UL +#define NWM_REG_DBG_SHIFT_K2_E5 \ + 0x8000f4UL +#define NWM_REG_DBG_FORCE_VALID_K2_E5 \ + 0x8000f8UL +#define NWM_REG_DBG_FORCE_FRAME_K2_E5 
\ + 0x8000fcUL +#define PBF_REG_DBG_SELECT \ + 0xd80060UL +#define PBF_REG_DBG_DWORD_ENABLE \ + 0xd80064UL +#define PBF_REG_DBG_SHIFT \ + 0xd80068UL +#define PBF_REG_DBG_FORCE_VALID \ + 0xd8006cUL +#define PBF_REG_DBG_FORCE_FRAME \ + 0xd80070UL +#define PBF_PB1_REG_DBG_SELECT \ + 0xda0728UL +#define PBF_PB1_REG_DBG_DWORD_ENABLE \ + 0xda072cUL +#define PBF_PB1_REG_DBG_SHIFT \ + 0xda0730UL +#define PBF_PB1_REG_DBG_FORCE_VALID \ + 0xda0734UL +#define PBF_PB1_REG_DBG_FORCE_FRAME \ + 0xda0738UL +#define PBF_PB2_REG_DBG_SELECT \ + 0xda4728UL +#define PBF_PB2_REG_DBG_DWORD_ENABLE \ + 0xda472cUL +#define PBF_PB2_REG_DBG_SHIFT \ + 0xda4730UL +#define PBF_PB2_REG_DBG_FORCE_VALID \ + 0xda4734UL +#define PBF_PB2_REG_DBG_FORCE_FRAME \ + 0xda4738UL +#define BTB_REG_DBG_SELECT \ + 0xdb08c8UL +#define BTB_REG_DBG_DWORD_ENABLE \ + 0xdb08ccUL +#define BTB_REG_DBG_SHIFT \ + 0xdb08d0UL +#define BTB_REG_DBG_FORCE_VALID \ + 0xdb08d4UL +#define BTB_REG_DBG_FORCE_FRAME \ + 0xdb08d8UL +#define XSDM_REG_DBG_SELECT \ + 0xf80e28UL +#define XSDM_REG_DBG_DWORD_ENABLE \ + 0xf80e2cUL +#define XSDM_REG_DBG_SHIFT \ + 0xf80e30UL +#define XSDM_REG_DBG_FORCE_VALID \ + 0xf80e34UL +#define XSDM_REG_DBG_FORCE_FRAME \ + 0xf80e38UL +#define YSDM_REG_DBG_SELECT \ + 0xf90e28UL +#define YSDM_REG_DBG_DWORD_ENABLE \ + 0xf90e2cUL +#define YSDM_REG_DBG_SHIFT \ + 0xf90e30UL +#define YSDM_REG_DBG_FORCE_VALID \ + 0xf90e34UL +#define YSDM_REG_DBG_FORCE_FRAME \ + 0xf90e38UL +#define PSDM_REG_DBG_SELECT \ + 0xfa0e28UL +#define PSDM_REG_DBG_DWORD_ENABLE \ + 0xfa0e2cUL +#define PSDM_REG_DBG_SHIFT \ + 0xfa0e30UL +#define PSDM_REG_DBG_FORCE_VALID \ + 0xfa0e34UL +#define PSDM_REG_DBG_FORCE_FRAME \ + 0xfa0e38UL +#define TSDM_REG_DBG_SELECT \ + 0xfb0e28UL +#define TSDM_REG_DBG_DWORD_ENABLE \ + 0xfb0e2cUL +#define TSDM_REG_DBG_SHIFT \ + 0xfb0e30UL +#define TSDM_REG_DBG_FORCE_VALID \ + 0xfb0e34UL +#define TSDM_REG_DBG_FORCE_FRAME \ + 0xfb0e38UL +#define MSDM_REG_DBG_SELECT \ + 0xfc0e28UL +#define MSDM_REG_DBG_DWORD_ENABLE \ + 0xfc0e2cUL +#define MSDM_REG_DBG_SHIFT \ + 0xfc0e30UL +#define MSDM_REG_DBG_FORCE_VALID \ + 0xfc0e34UL +#define MSDM_REG_DBG_FORCE_FRAME \ + 0xfc0e38UL +#define USDM_REG_DBG_SELECT \ + 0xfd0e28UL +#define USDM_REG_DBG_DWORD_ENABLE \ + 0xfd0e2cUL +#define USDM_REG_DBG_SHIFT \ + 0xfd0e30UL +#define USDM_REG_DBG_FORCE_VALID \ + 0xfd0e34UL +#define USDM_REG_DBG_FORCE_FRAME \ + 0xfd0e38UL +#define XCM_REG_DBG_SELECT \ + 0x1000040UL +#define XCM_REG_DBG_DWORD_ENABLE \ + 0x1000044UL +#define XCM_REG_DBG_SHIFT \ + 0x1000048UL +#define XCM_REG_DBG_FORCE_VALID \ + 0x100004cUL +#define XCM_REG_DBG_FORCE_FRAME \ + 0x1000050UL +#define YCM_REG_DBG_SELECT \ + 0x1080040UL +#define YCM_REG_DBG_DWORD_ENABLE \ + 0x1080044UL +#define YCM_REG_DBG_SHIFT \ + 0x1080048UL +#define YCM_REG_DBG_FORCE_VALID \ + 0x108004cUL +#define YCM_REG_DBG_FORCE_FRAME \ + 0x1080050UL +#define PCM_REG_DBG_SELECT \ + 0x1100040UL +#define PCM_REG_DBG_DWORD_ENABLE \ + 0x1100044UL +#define PCM_REG_DBG_SHIFT \ + 0x1100048UL +#define PCM_REG_DBG_FORCE_VALID \ + 0x110004cUL +#define PCM_REG_DBG_FORCE_FRAME \ + 0x1100050UL +#define TCM_REG_DBG_SELECT \ + 0x1180040UL +#define TCM_REG_DBG_DWORD_ENABLE \ + 0x1180044UL +#define TCM_REG_DBG_SHIFT \ + 0x1180048UL +#define TCM_REG_DBG_FORCE_VALID \ + 0x118004cUL +#define TCM_REG_DBG_FORCE_FRAME \ + 0x1180050UL +#define MCM_REG_DBG_SELECT \ + 0x1200040UL +#define MCM_REG_DBG_DWORD_ENABLE \ + 0x1200044UL +#define MCM_REG_DBG_SHIFT \ + 0x1200048UL +#define MCM_REG_DBG_FORCE_VALID \ + 0x120004cUL +#define MCM_REG_DBG_FORCE_FRAME \ + 
0x1200050UL +#define UCM_REG_DBG_SELECT \ + 0x1280050UL +#define UCM_REG_DBG_DWORD_ENABLE \ + 0x1280054UL +#define UCM_REG_DBG_SHIFT \ + 0x1280058UL +#define UCM_REG_DBG_FORCE_VALID \ + 0x128005cUL +#define UCM_REG_DBG_FORCE_FRAME \ + 0x1280060UL +#define XSEM_REG_DBG_SELECT \ + 0x1401528UL +#define XSEM_REG_DBG_DWORD_ENABLE \ + 0x140152cUL +#define XSEM_REG_DBG_SHIFT \ + 0x1401530UL +#define XSEM_REG_DBG_FORCE_VALID \ + 0x1401534UL +#define XSEM_REG_DBG_FORCE_FRAME \ + 0x1401538UL +#define YSEM_REG_DBG_SELECT \ + 0x1501528UL +#define YSEM_REG_DBG_DWORD_ENABLE \ + 0x150152cUL +#define YSEM_REG_DBG_SHIFT \ + 0x1501530UL +#define YSEM_REG_DBG_FORCE_VALID \ + 0x1501534UL +#define YSEM_REG_DBG_FORCE_FRAME \ + 0x1501538UL +#define PSEM_REG_DBG_SELECT \ + 0x1601528UL +#define PSEM_REG_DBG_DWORD_ENABLE \ + 0x160152cUL +#define PSEM_REG_DBG_SHIFT \ + 0x1601530UL +#define PSEM_REG_DBG_FORCE_VALID \ + 0x1601534UL +#define PSEM_REG_DBG_FORCE_FRAME \ + 0x1601538UL +#define TSEM_REG_DBG_SELECT \ + 0x1701528UL +#define TSEM_REG_DBG_DWORD_ENABLE \ + 0x170152cUL +#define TSEM_REG_DBG_SHIFT \ + 0x1701530UL +#define TSEM_REG_DBG_FORCE_VALID \ + 0x1701534UL +#define TSEM_REG_DBG_FORCE_FRAME \ + 0x1701538UL +#define MSEM_REG_DBG_SELECT \ + 0x1801528UL +#define MSEM_REG_DBG_DWORD_ENABLE \ + 0x180152cUL +#define MSEM_REG_DBG_SHIFT \ + 0x1801530UL +#define MSEM_REG_DBG_FORCE_VALID \ + 0x1801534UL +#define MSEM_REG_DBG_FORCE_FRAME \ + 0x1801538UL +#define USEM_REG_DBG_SELECT \ + 0x1901528UL +#define USEM_REG_DBG_DWORD_ENABLE \ + 0x190152cUL +#define USEM_REG_DBG_SHIFT \ + 0x1901530UL +#define USEM_REG_DBG_FORCE_VALID \ + 0x1901534UL +#define USEM_REG_DBG_FORCE_FRAME \ + 0x1901538UL +#define NWS_REG_DBG_SELECT_K2_E5 \ + 0x700128UL +#define NWS_REG_DBG_DWORD_ENABLE_K2_E5 \ + 0x70012cUL +#define NWS_REG_DBG_SHIFT_K2_E5 \ + 0x700130UL +#define NWS_REG_DBG_FORCE_VALID_K2_E5 \ + 0x700134UL +#define NWS_REG_DBG_FORCE_FRAME_K2_E5 \ + 0x700138UL +#define MS_REG_DBG_SELECT_K2_E5 \ + 0x6a0228UL +#define MS_REG_DBG_DWORD_ENABLE_K2_E5 \ + 0x6a022cUL +#define MS_REG_DBG_SHIFT_K2_E5 \ + 0x6a0230UL +#define MS_REG_DBG_FORCE_VALID_K2_E5 \ + 0x6a0234UL +#define MS_REG_DBG_FORCE_FRAME_K2_E5 \ + 0x6a0238UL +#define PCIE_REG_DBG_COMMON_SELECT_K2_E5 \ + 0x054398UL +#define PCIE_REG_DBG_COMMON_DWORD_ENABLE_K2_E5 \ + 0x05439cUL +#define PCIE_REG_DBG_COMMON_SHIFT_K2_E5 \ + 0x0543a0UL +#define PCIE_REG_DBG_COMMON_FORCE_VALID_K2_E5 \ + 0x0543a4UL +#define PCIE_REG_DBG_COMMON_FORCE_FRAME_K2_E5 \ + 0x0543a8UL +#define PTLD_REG_DBG_SELECT_E5 \ + 0x5a1600UL +#define PTLD_REG_DBG_DWORD_ENABLE_E5 \ + 0x5a1604UL +#define PTLD_REG_DBG_SHIFT_E5 \ + 0x5a1608UL +#define PTLD_REG_DBG_FORCE_VALID_E5 \ + 0x5a160cUL +#define PTLD_REG_DBG_FORCE_FRAME_E5 \ + 0x5a1610UL +#define YPLD_REG_DBG_SELECT_E5 \ + 0x5c1600UL +#define YPLD_REG_DBG_DWORD_ENABLE_E5 \ + 0x5c1604UL +#define YPLD_REG_DBG_SHIFT_E5 \ + 0x5c1608UL +#define YPLD_REG_DBG_FORCE_VALID_E5 \ + 0x5c160cUL +#define YPLD_REG_DBG_FORCE_FRAME_E5 \ + 0x5c1610UL +#define RGSRC_REG_DBG_SELECT_E5 \ + 0x320040UL +#define RGSRC_REG_DBG_DWORD_ENABLE_E5 \ + 0x320044UL +#define RGSRC_REG_DBG_SHIFT_E5 \ + 0x320048UL +#define RGSRC_REG_DBG_FORCE_VALID_E5 \ + 0x32004cUL +#define RGSRC_REG_DBG_FORCE_FRAME_E5 \ + 0x320050UL +#define TGSRC_REG_DBG_SELECT_E5 \ + 0x322040UL +#define TGSRC_REG_DBG_DWORD_ENABLE_E5 \ + 0x322044UL +#define TGSRC_REG_DBG_SHIFT_E5 \ + 0x322048UL +#define TGSRC_REG_DBG_FORCE_VALID_E5 \ + 0x32204cUL +#define TGSRC_REG_DBG_FORCE_FRAME_E5 \ + 0x322050UL +#define MISC_REG_RESET_PL_UA \ + 
0x008050UL +#define MISC_REG_RESET_PL_HV \ + 0x008060UL +#define XCM_REG_CTX_RBC_ACCS \ + 0x1001800UL +#define XCM_REG_AGG_CON_CTX \ + 0x1001804UL +#define XCM_REG_SM_CON_CTX \ + 0x1001808UL +#define YCM_REG_CTX_RBC_ACCS \ + 0x1081800UL +#define YCM_REG_AGG_CON_CTX \ + 0x1081804UL +#define YCM_REG_AGG_TASK_CTX \ + 0x1081808UL +#define YCM_REG_SM_CON_CTX \ + 0x108180cUL +#define YCM_REG_SM_TASK_CTX \ + 0x1081810UL +#define PCM_REG_CTX_RBC_ACCS \ + 0x1101440UL +#define PCM_REG_SM_CON_CTX \ + 0x1101444UL +#define TCM_REG_CTX_RBC_ACCS \ + 0x11814c0UL +#define TCM_REG_AGG_CON_CTX \ + 0x11814c4UL +#define TCM_REG_AGG_TASK_CTX \ + 0x11814c8UL +#define TCM_REG_SM_CON_CTX \ + 0x11814ccUL +#define TCM_REG_SM_TASK_CTX \ + 0x11814d0UL +#define MCM_REG_CTX_RBC_ACCS \ + 0x1201800UL +#define MCM_REG_AGG_CON_CTX \ + 0x1201804UL +#define MCM_REG_AGG_TASK_CTX \ + 0x1201808UL +#define MCM_REG_SM_CON_CTX \ + 0x120180cUL +#define MCM_REG_SM_TASK_CTX \ + 0x1201810UL +#define UCM_REG_CTX_RBC_ACCS \ + 0x1281700UL +#define UCM_REG_AGG_CON_CTX \ + 0x1281704UL +#define UCM_REG_AGG_TASK_CTX \ + 0x1281708UL +#define UCM_REG_SM_CON_CTX \ + 0x128170cUL +#define UCM_REG_SM_TASK_CTX \ + 0x1281710UL +#define XSEM_REG_SLOW_DBG_EMPTY_BB_K2 \ + 0x1401140UL +#define XSEM_REG_SYNC_DBG_EMPTY \ + 0x1401160UL +#define XSEM_REG_SLOW_DBG_ACTIVE_BB_K2 \ + 0x1401400UL +#define XSEM_REG_SLOW_DBG_MODE_BB_K2 \ + 0x1401404UL +#define XSEM_REG_DBG_FRAME_MODE_BB_K2 \ + 0x1401408UL +#define XSEM_REG_DBG_MODE1_CFG_BB_K2 \ + 0x1401420UL +#define XSEM_REG_FAST_MEMORY \ + 0x1440000UL +#define YSEM_REG_SYNC_DBG_EMPTY \ + 0x1501160UL +#define YSEM_REG_SLOW_DBG_ACTIVE_BB_K2 \ + 0x1501400UL +#define YSEM_REG_SLOW_DBG_MODE_BB_K2 \ + 0x1501404UL +#define YSEM_REG_DBG_FRAME_MODE_BB_K2 \ + 0x1501408UL +#define YSEM_REG_DBG_MODE1_CFG_BB_K2 \ + 0x1501420UL +#define YSEM_REG_FAST_MEMORY \ + 0x1540000UL +#define PSEM_REG_SLOW_DBG_EMPTY_BB_K2 \ + 0x1601140UL +#define PSEM_REG_SYNC_DBG_EMPTY \ + 0x1601160UL +#define PSEM_REG_SLOW_DBG_ACTIVE_BB_K2 \ + 0x1601400UL +#define PSEM_REG_SLOW_DBG_MODE_BB_K2 \ + 0x1601404UL +#define PSEM_REG_DBG_FRAME_MODE_BB_K2 \ + 0x1601408UL +#define PSEM_REG_DBG_MODE1_CFG_BB_K2 \ + 0x1601420UL +#define PSEM_REG_FAST_MEMORY \ + 0x1640000UL +#define TSEM_REG_SLOW_DBG_EMPTY_BB_K2 \ + 0x1701140UL +#define TSEM_REG_SYNC_DBG_EMPTY \ + 0x1701160UL +#define TSEM_REG_SLOW_DBG_ACTIVE_BB_K2 \ + 0x1701400UL +#define TSEM_REG_SLOW_DBG_MODE_BB_K2 \ + 0x1701404UL +#define TSEM_REG_DBG_FRAME_MODE_BB_K2 \ + 0x1701408UL +#define TSEM_REG_DBG_MODE1_CFG_BB_K2 \ + 0x1701420UL +#define TSEM_REG_FAST_MEMORY \ + 0x1740000UL +#define MSEM_REG_SLOW_DBG_EMPTY_BB_K2 \ + 0x1801140UL +#define MSEM_REG_SYNC_DBG_EMPTY \ + 0x1801160UL +#define MSEM_REG_SLOW_DBG_ACTIVE_BB_K2 \ + 0x1801400UL +#define MSEM_REG_SLOW_DBG_MODE_BB_K2 \ + 0x1801404UL +#define MSEM_REG_DBG_FRAME_MODE_BB_K2 \ + 0x1801408UL +#define MSEM_REG_DBG_MODE1_CFG_BB_K2 \ + 0x1801420UL +#define MSEM_REG_FAST_MEMORY \ + 0x1840000UL +#define USEM_REG_SLOW_DBG_EMPTY_BB_K2 \ + 0x1901140UL +#define USEM_REG_SYNC_DBG_EMPTY \ + 0x1901160UL +#define USEM_REG_SLOW_DBG_ACTIVE_BB_K2 \ + 0x1901400UL +#define USEM_REG_SLOW_DBG_MODE_BB_K2 \ + 0x1901404UL +#define USEM_REG_DBG_FRAME_MODE_BB_K2 \ + 0x1901408UL +#define USEM_REG_DBG_MODE1_CFG_BB_K2 \ + 0x1901420UL +#define USEM_REG_FAST_MEMORY \ + 0x1940000UL +#define SEM_FAST_REG_INT_RAM \ + 0x020000UL +#define SEM_FAST_REG_INT_RAM_SIZE_BB_K2 \ + 20480 +#define GRC_REG_TRACE_FIFO_VALID_DATA \ + 0x050064UL +#define GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW \ + 0x05040cUL 
+#define GRC_REG_PROTECTION_OVERRIDE_WINDOW \ + 0x050500UL +#define IGU_REG_ERROR_HANDLING_MEMORY \ + 0x181520UL +#define MCP_REG_CPU_MODE \ + 0xe05000UL +#define MCP_REG_CPU_MODE_SOFT_HALT \ + (0x1 << 10) +#define BRB_REG_BIG_RAM_ADDRESS \ + 0x340800UL +#define BRB_REG_BIG_RAM_DATA \ + 0x341500UL +#define BRB_REG_BIG_RAM_DATA_SIZE \ + 64 +#define SEM_FAST_REG_STALL_0_BB_K2 \ + 0x000488UL +#define SEM_FAST_REG_STALLED \ + 0x000494UL +#define BTB_REG_BIG_RAM_ADDRESS \ + 0xdb0800UL +#define BTB_REG_BIG_RAM_DATA \ + 0xdb0c00UL +#define BMB_REG_BIG_RAM_ADDRESS \ + 0x540800UL +#define BMB_REG_BIG_RAM_DATA \ + 0x540f00UL +#define SEM_FAST_REG_STORM_REG_FILE \ + 0x008000UL +#define RSS_REG_RSS_RAM_ADDR \ + 0x238c30UL +#define MISCS_REG_BLOCK_256B_EN \ + 0x009074UL +#define MCP_REG_SCRATCH_SIZE_BB_K2 \ + 57344 +#define MCP_REG_CPU_REG_FILE \ + 0xe05200UL +#define MCP_REG_CPU_REG_FILE_SIZE \ + 32 +#define DBG_REG_DEBUG_TARGET \ + 0x01005cUL +#define DBG_REG_FULL_MODE \ + 0x010060UL +#define DBG_REG_CALENDAR_OUT_DATA \ + 0x010480UL +#define GRC_REG_TRACE_FIFO \ + 0x050068UL +#define IGU_REG_ERROR_HANDLING_DATA_VALID \ + 0x181530UL +#define DBG_REG_DBG_BLOCK_ON \ + 0x010454UL +#define DBG_REG_FRAMING_MODE \ + 0x010058UL +#define SEM_FAST_REG_VFC_DATA_WR \ + 0x000b40UL +#define SEM_FAST_REG_VFC_ADDR \ + 0x000b44UL +#define SEM_FAST_REG_VFC_DATA_RD \ + 0x000b48UL +#define RSS_REG_RSS_RAM_DATA \ + 0x238c20UL +#define RSS_REG_RSS_RAM_DATA_SIZE \ + 4 +#define MISC_REG_BLOCK_256B_EN \ + 0x008c14UL +#define NWS_REG_NWS_CMU_K2 \ + 0x720000UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_7_0_K2_E5 \ + 0x000680UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_ADDR_15_8_K2_E5 \ + 0x000684UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_7_0_K2_E5 \ + 0x0006c0UL +#define PHY_NW_IP_REG_PHY0_TOP_TBUS_DATA_11_8_K2_E5 \ + 0x0006c4UL +#define MS_REG_MS_CMU_K2_E5 \ + 0x6a4000UL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X130_K2_E5 \ + 0x000208UL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X131_K2_E5 \ + 0x00020cUL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X132_K2_E5 \ + 0x000210UL +#define PHY_SGMII_IP_REG_AHB_CMU_CSR_0_X133_K2_E5 \ + 0x000214UL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X130_K2_E5 \ + 0x000208UL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X131_K2_E5 \ + 0x00020cUL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X132_K2_E5 \ + 0x000210UL +#define PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X133_K2_E5 \ + 0x000214UL +#define PHY_PCIE_REG_PHY0_K2_E5 \ + 0x620000UL +#define PHY_PCIE_REG_PHY1_K2_E5 \ + 0x624000UL +#define NIG_REG_ROCE_DUPLICATE_TO_HOST 0x5088f0UL +#define PRS_REG_LIGHT_L2_ETHERTYPE_EN 0x1f0968UL +#define NIG_REG_LLH_ENG_CLS_ENG_ID_TBL 0x501b90UL +#define DORQ_REG_PF_DPM_ENABLE 0x100510UL +#define DORQ_REG_PF_ICID_BIT_SHIFT_NORM 0x100448UL +#define DORQ_REG_PF_MIN_ADDR_REG1 0x100400UL +#define DORQ_REG_PF_DPI_BIT_SHIFT 0x100450UL +#define NIG_REG_RX_PTP_EN 0x501900UL +#define NIG_REG_TX_PTP_EN 0x501904UL +#define NIG_REG_LLH_PTP_TO_HOST 0x501908UL +#define NIG_REG_LLH_PTP_TO_MCP 0x50190cUL +#define NIG_REG_PTP_SW_TXTSEN 0x501910UL +#define NIG_REG_LLH_PTP_ETHERTYPE_1 0x501914UL +#define NIG_REG_LLH_PTP_MAC_DA_2_LSB 0x501918UL +#define NIG_REG_LLH_PTP_MAC_DA_2_MSB 0x50191cUL +#define NIG_REG_LLH_PTP_PARAM_MASK 0x501920UL +#define NIG_REG_LLH_PTP_RULE_MASK 0x501924UL +#define NIG_REG_TX_LLH_PTP_PARAM_MASK 0x501928UL +#define NIG_REG_TX_LLH_PTP_RULE_MASK 0x50192cUL +#define NIG_REG_LLH_PTP_HOST_BUF_SEQID 0x501930UL +#define NIG_REG_LLH_PTP_HOST_BUF_TS_LSB 0x501934UL +#define NIG_REG_LLH_PTP_HOST_BUF_TS_MSB 0x501938UL +#define 
NIG_REG_LLH_PTP_MCP_BUF_SEQID 0x50193cUL +#define NIG_REG_LLH_PTP_MCP_BUF_TS_LSB 0x501940UL +#define NIG_REG_LLH_PTP_MCP_BUF_TS_MSB 0x501944UL +#define NIG_REG_TX_LLH_PTP_BUF_SEQID 0x501948UL +#define NIG_REG_TX_LLH_PTP_BUF_TS_LSB 0x50194cUL +#define NIG_REG_TX_LLH_PTP_BUF_TS_MSB 0x501950UL +#define NIG_REG_RX_PTP_TS_MSB_ERR 0x501954UL +#define NIG_REG_TX_PTP_TS_MSB_ERR 0x501958UL +#define NIG_REG_TSGEN_SYNC_TIME_LSB 0x5088c0UL +#define NIG_REG_TSGEN_SYNC_TIME_MSB 0x5088c4UL +#define NIG_REG_TSGEN_RST_DRIFT_CNTR 0x5088d8UL +#define NIG_REG_TSGEN_DRIFT_CNTR_CONF 0x5088dcUL +#define NIG_REG_TS_OUTPUT_ENABLE_PDA 0x508870UL +#define NIG_REG_TIMESYNC_GEN_REG_BB 0x500d00UL +#define NIG_REG_TSGEN_FREE_CNT_VALUE_LSB 0x5088a8UL +#define NIG_REG_TSGEN_FREE_CNT_VALUE_MSB 0x5088acUL +#define NIG_REG_PTP_LATCH_OSTS_PKT_TIME 0x509040UL +#define PSWRQ2_REG_WR_MBS0 0x240400UL + +#define PGLUE_B_REG_PGL_ADDR_E8_F0_K2 0x2aaf98UL +#define PGLUE_B_REG_PGL_ADDR_EC_F0_K2 0x2aaf9cUL +#define PGLUE_B_REG_PGL_ADDR_F0_F0_K2 0x2aafa0UL +#define PGLUE_B_REG_PGL_ADDR_F4_F0_K2 0x2aafa4UL +#define PGLUE_B_REG_MASTER_WRITE_PAD_ENABLE 0x2aae30UL +#define NIG_REG_TSGEN_FREECNT_UPDATE_K2 0x509008UL +#define CNIG_REG_NIG_PORT0_CONF_K2 0x218200UL + +#define NIG_REG_TX_EDPM_CTRL 0x501f0cUL +#define NIG_REG_TX_EDPM_CTRL_TX_EDPM_EN (0x1 << 0) +#define NIG_REG_TX_EDPM_CTRL_TX_EDPM_EN_SHIFT 0 +#define NIG_REG_TX_EDPM_CTRL_TX_EDPM_TC_EN (0xff << 1) +#define NIG_REG_TX_EDPM_CTRL_TX_EDPM_TC_EN_SHIFT 1 + +#define PRS_REG_SEARCH_GFT 0x1f11bcUL +#define PRS_REG_CM_HDR_GFT 0x1f11c8UL +#define PRS_REG_GFT_CAM 0x1f1100UL +#define PRS_REG_GFT_PROFILE_MASK_RAM 0x1f1000UL +#define PRS_REG_CM_HDR_GFT_EVENT_ID_SHIFT 0 +#define PRS_REG_CM_HDR_GFT_CM_HDR_SHIFT 8 +#define PRS_REG_LOAD_L2_FILTER 0x1f0198UL + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c new file mode 100644 index 0000000..6acfd43 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -0,0 +1,1171 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_cxt.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_init_ops.h" +#include "qed_int.h" +#include "qed_ll2.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" +#include +#include "qed_rdma.h" +#include "qed_roce.h" +#include "qed_sp.h" + +static void qed_roce_free_real_icid(struct qed_hwfn *p_hwfn, u16 icid); + +static int +qed_roce_async_event(struct qed_hwfn *p_hwfn, + u8 fw_event_code, + u16 echo, union event_ring_data *data, u8 fw_return_code) +{ + if (fw_event_code == ROCE_ASYNC_EVENT_DESTROY_QP_DONE) { + u16 icid = + (u16)le32_to_cpu(data->rdma_data.rdma_destroy_qp_data.cid); + + /* icid release in this async event can occur only if the icid + * was offloaded to the FW. In case it wasn't offloaded this is + * handled in qed_roce_sp_destroy_qp. + */ + qed_roce_free_real_icid(p_hwfn, icid); + } else { + struct qed_rdma_events *events = &p_hwfn->p_rdma_info->events; + + events->affiliated_event(p_hwfn->p_rdma_info->events.context, + fw_event_code, + (void *)&data->rdma_data.async_handle); + } + + return 0; +} + +void qed_roce_stop(struct qed_hwfn *p_hwfn) +{ + struct qed_bmap *rcid_map = &p_hwfn->p_rdma_info->real_cid_map; + int wait_count = 0; + + /* when destroying a_RoCE QP the control is returned to the user after + * the synchronous part. The asynchronous part may take a little longer. + * We delay for a short while if an async destroy QP is still expected. + * Beyond the added delay we clear the bitmap anyway. + */ + while (bitmap_weight(rcid_map->bitmap, rcid_map->max_count)) { + msleep(100); + if (wait_count++ > 20) { + DP_NOTICE(p_hwfn, "cid bitmap wait timed out\n"); + break; + } + } + qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_ROCE); +} + +static void qed_rdma_copy_gids(struct qed_rdma_qp *qp, __le32 *src_gid, + __le32 *dst_gid) +{ + u32 i; + + if (qp->roce_mode == ROCE_V2_IPV4) { + /* The IPv4 addresses shall be aligned to the highest word. + * The lower words must be zero. 
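+	 *
+	 * For example (as the assignments below implement): with
+	 * roce_mode == ROCE_V2_IPV4 the GID handed to the FW ends up as
+	 *   src_gid[0..2] = 0
+	 *   src_gid[3]    = cpu_to_le32(sgid.ipv4_addr)
+	 * and likewise for dst_gid.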
+ */ + memset(src_gid, 0, sizeof(union qed_gid)); + memset(dst_gid, 0, sizeof(union qed_gid)); + src_gid[3] = cpu_to_le32(qp->sgid.ipv4_addr); + dst_gid[3] = cpu_to_le32(qp->dgid.ipv4_addr); + } else { + /* GIDs and IPv6 addresses coincide in location and size */ + for (i = 0; i < ARRAY_SIZE(qp->sgid.dwords); i++) { + src_gid[i] = cpu_to_le32(qp->sgid.dwords[i]); + dst_gid[i] = cpu_to_le32(qp->dgid.dwords[i]); + } + } +} + +static enum roce_flavor qed_roce_mode_to_flavor(enum roce_mode roce_mode) +{ + enum roce_flavor flavor; + + switch (roce_mode) { + case ROCE_V1: + flavor = PLAIN_ROCE; + break; + case ROCE_V2_IPV4: + flavor = RROCE_IPV4; + break; + case ROCE_V2_IPV6: + flavor = ROCE_V2_IPV6; + break; + default: + flavor = MAX_ROCE_MODE; + break; + } + return flavor; +} + +void qed_roce_free_cid_pair(struct qed_hwfn *p_hwfn, u16 cid) +{ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->cid_map, cid); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->cid_map, cid + 1); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +int qed_roce_alloc_cid(struct qed_hwfn *p_hwfn, u16 *cid) +{ + struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; + u32 responder_icid; + u32 requester_icid; + int rc; + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + rc = qed_rdma_bmap_alloc_id(p_hwfn, &p_rdma_info->cid_map, + &responder_icid); + if (rc) { + spin_unlock_bh(&p_rdma_info->lock); + return rc; + } + + rc = qed_rdma_bmap_alloc_id(p_hwfn, &p_rdma_info->cid_map, + &requester_icid); + + spin_unlock_bh(&p_rdma_info->lock); + if (rc) + goto err; + + /* the two icid's should be adjacent */ + if ((requester_icid - responder_icid) != 1) { + DP_NOTICE(p_hwfn, "Failed to allocate two adjacent qp's'\n"); + rc = -EINVAL; + goto err; + } + + responder_icid += qed_cxt_get_proto_cid_start(p_hwfn, + p_rdma_info->proto); + requester_icid += qed_cxt_get_proto_cid_start(p_hwfn, + p_rdma_info->proto); + + /* If these icids require a new ILT line allocate DMA-able context for + * an ILT page + */ + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_CXT, responder_icid); + if (rc) + goto err; + + rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, QED_ELEM_CXT, requester_icid); + if (rc) + goto err; + + *cid = (u16)responder_icid; + return rc; + +err: + spin_lock_bh(&p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_rdma_info->cid_map, responder_icid); + qed_bmap_release_id(p_hwfn, &p_rdma_info->cid_map, requester_icid); + + spin_unlock_bh(&p_rdma_info->lock); + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Allocate CID - failed, rc = %d\n", rc); + return rc; +} + +static void qed_roce_set_real_cid(struct qed_hwfn *p_hwfn, u32 cid) +{ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_set_id(p_hwfn, &p_hwfn->p_rdma_info->real_cid_map, cid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +static int qed_roce_sp_create_responder(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp) +{ + struct roce_create_qp_resp_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + enum roce_flavor roce_flavor; + struct qed_spq_entry *p_ent; + u16 regular_latency_queue; + enum protocol_type proto; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + /* Allocate DMA-able memory for IRQ */ + qp->irq_num_pages = 1; + qp->irq = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + RDMA_RING_PAGE_SIZE, + &qp->irq_phys_addr, GFP_KERNEL); + if (!qp->irq) { + rc = -ENOMEM; + DP_NOTICE(p_hwfn, + "qed create responder failed: cannot allocate memory (irq). 
rc = %d\n", + rc); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, ROCE_RAMROD_CREATE_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.roce_create_qp_resp; + + p_ramrod->flags = 0; + + roce_flavor = qed_roce_mode_to_flavor(qp->roce_mode); + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_ROCE_FLAVOR, roce_flavor); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_RDMA_RD_EN, + qp->incoming_rdma_read_en); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_RDMA_WR_EN, + qp->incoming_rdma_write_en); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_ATOMIC_EN, + qp->incoming_atomic_en); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_E2E_FLOW_CONTROL_EN, + qp->e2e_flow_control_en); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_SRQ_FLG, qp->use_srq); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_RESERVED_KEY_EN, + qp->fmr_and_reserved_lkey); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER, + qp->min_rnr_nak_timer); + + p_ramrod->max_ird = qp->max_rd_atomic_resp; + p_ramrod->traffic_class = qp->traffic_class_tos; + p_ramrod->hop_limit = qp->hop_limit_ttl; + p_ramrod->irq_num_pages = qp->irq_num_pages; + p_ramrod->p_key = cpu_to_le16(qp->pkey); + p_ramrod->flow_label = cpu_to_le32(qp->flow_label); + p_ramrod->dst_qp_id = cpu_to_le32(qp->dest_qp); + p_ramrod->mtu = cpu_to_le16(qp->mtu); + p_ramrod->initial_psn = cpu_to_le32(qp->rq_psn); + p_ramrod->pd = cpu_to_le16(qp->pd); + p_ramrod->rq_num_pages = cpu_to_le16(qp->rq_num_pages); + DMA_REGPAIR_LE(p_ramrod->rq_pbl_addr, qp->rq_pbl_ptr); + DMA_REGPAIR_LE(p_ramrod->irq_pbl_addr, qp->irq_phys_addr); + qed_rdma_copy_gids(qp, p_ramrod->src_gid, p_ramrod->dst_gid); + p_ramrod->qp_handle_for_async.hi = cpu_to_le32(qp->qp_handle_async.hi); + p_ramrod->qp_handle_for_async.lo = cpu_to_le32(qp->qp_handle_async.lo); + p_ramrod->qp_handle_for_cqe.hi = cpu_to_le32(qp->qp_handle.hi); + p_ramrod->qp_handle_for_cqe.lo = cpu_to_le32(qp->qp_handle.lo); + p_ramrod->cq_cid = cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | + qp->rq_cq_id); + + regular_latency_queue = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_OFLD); + + p_ramrod->regular_latency_phy_queue = + cpu_to_le16(regular_latency_queue); + p_ramrod->low_latency_phy_queue = + cpu_to_le16(regular_latency_queue); + + p_ramrod->dpi = cpu_to_le16(qp->dpi); + + qed_rdma_set_fw_mac(p_ramrod->remote_mac_addr, qp->remote_mac_addr); + qed_rdma_set_fw_mac(p_ramrod->local_mac_addr, qp->local_mac_addr); + + p_ramrod->udp_src_port = qp->udp_src_port; + p_ramrod->vlan_id = cpu_to_le16(qp->vlan_id); + p_ramrod->srq_id.srq_idx = cpu_to_le16(qp->srq_id); + p_ramrod->srq_id.opaque_fid = cpu_to_le16(p_hwfn->hw_info.opaque_fid); + + p_ramrod->stats_counter_id = RESC_START(p_hwfn, QED_RDMA_STATS_QUEUE) + + qp->stats_queue; + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "rc = %d regular physical queue = 0x%x\n", rc, + regular_latency_queue); + + if (rc) + goto err; + + qp->resp_offloaded = true; + qp->cq_prod = 0; + + proto = p_hwfn->p_rdma_info->proto; + qed_roce_set_real_cid(p_hwfn, qp->icid - + qed_cxt_get_proto_cid_start(p_hwfn, proto)); + + return rc; + +err: + DP_NOTICE(p_hwfn, "create responder - failed, rc = 
%d\n", rc); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + qp->irq_num_pages * RDMA_RING_PAGE_SIZE, + qp->irq, qp->irq_phys_addr); + + return rc; +} + +static int qed_roce_sp_create_requester(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp) +{ + struct roce_create_qp_req_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + enum roce_flavor roce_flavor; + struct qed_spq_entry *p_ent; + u16 regular_latency_queue; + enum protocol_type proto; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + /* Allocate DMA-able memory for ORQ */ + qp->orq_num_pages = 1; + qp->orq = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + RDMA_RING_PAGE_SIZE, + &qp->orq_phys_addr, GFP_KERNEL); + if (!qp->orq) { + rc = -ENOMEM; + DP_NOTICE(p_hwfn, + "qed create requester failed: cannot allocate memory (orq). rc = %d\n", + rc); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid + 1; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ROCE_RAMROD_CREATE_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.roce_create_qp_req; + + p_ramrod->flags = 0; + + roce_flavor = qed_roce_mode_to_flavor(qp->roce_mode); + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_ROCE_FLAVOR, roce_flavor); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_FMR_AND_RESERVED_EN, + qp->fmr_and_reserved_lkey); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_SIGNALED_COMP, qp->signal_all); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT, qp->retry_cnt); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_REQ_RAMROD_DATA_RNR_NAK_CNT, + qp->rnr_retry_cnt); + + p_ramrod->max_ord = qp->max_rd_atomic_req; + p_ramrod->traffic_class = qp->traffic_class_tos; + p_ramrod->hop_limit = qp->hop_limit_ttl; + p_ramrod->orq_num_pages = qp->orq_num_pages; + p_ramrod->p_key = cpu_to_le16(qp->pkey); + p_ramrod->flow_label = cpu_to_le32(qp->flow_label); + p_ramrod->dst_qp_id = cpu_to_le32(qp->dest_qp); + p_ramrod->ack_timeout_val = cpu_to_le32(qp->ack_timeout); + p_ramrod->mtu = cpu_to_le16(qp->mtu); + p_ramrod->initial_psn = cpu_to_le32(qp->sq_psn); + p_ramrod->pd = cpu_to_le16(qp->pd); + p_ramrod->sq_num_pages = cpu_to_le16(qp->sq_num_pages); + DMA_REGPAIR_LE(p_ramrod->sq_pbl_addr, qp->sq_pbl_ptr); + DMA_REGPAIR_LE(p_ramrod->orq_pbl_addr, qp->orq_phys_addr); + qed_rdma_copy_gids(qp, p_ramrod->src_gid, p_ramrod->dst_gid); + p_ramrod->qp_handle_for_async.hi = cpu_to_le32(qp->qp_handle_async.hi); + p_ramrod->qp_handle_for_async.lo = cpu_to_le32(qp->qp_handle_async.lo); + p_ramrod->qp_handle_for_cqe.hi = cpu_to_le32(qp->qp_handle.hi); + p_ramrod->qp_handle_for_cqe.lo = cpu_to_le32(qp->qp_handle.lo); + p_ramrod->cq_cid = + cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | qp->sq_cq_id); + + regular_latency_queue = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_OFLD); + + p_ramrod->regular_latency_phy_queue = + cpu_to_le16(regular_latency_queue); + p_ramrod->low_latency_phy_queue = + cpu_to_le16(regular_latency_queue); + + p_ramrod->dpi = cpu_to_le16(qp->dpi); + + qed_rdma_set_fw_mac(p_ramrod->remote_mac_addr, qp->remote_mac_addr); + qed_rdma_set_fw_mac(p_ramrod->local_mac_addr, qp->local_mac_addr); + + p_ramrod->udp_src_port = qp->udp_src_port; + p_ramrod->vlan_id = cpu_to_le16(qp->vlan_id); + p_ramrod->stats_counter_id = RESC_START(p_hwfn, QED_RDMA_STATS_QUEUE) + + qp->stats_queue; + + rc = 
qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc); + + if (rc) + goto err; + + qp->req_offloaded = true; + proto = p_hwfn->p_rdma_info->proto; + qed_roce_set_real_cid(p_hwfn, + qp->icid + 1 - + qed_cxt_get_proto_cid_start(p_hwfn, proto)); + + return rc; + +err: + DP_NOTICE(p_hwfn, "Create requested - failed, rc = %d\n", rc); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + qp->orq_num_pages * RDMA_RING_PAGE_SIZE, + qp->orq, qp->orq_phys_addr); + return rc; +} + +static int qed_roce_sp_modify_responder(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + bool move_to_err, u32 modify_flags) +{ + struct roce_modify_qp_resp_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + if (move_to_err && !qp->resp_offloaded) + return 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ROCE_EVENT_MODIFY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) { + DP_NOTICE(p_hwfn, "rc = %d\n", rc); + return rc; + } + + p_ramrod = &p_ent->ramrod.roce_modify_qp_resp; + + p_ramrod->flags = 0; + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_MOVE_TO_ERR_FLG, move_to_err); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_RD_EN, + qp->incoming_rdma_read_en); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_WR_EN, + qp->incoming_rdma_write_en); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_ATOMIC_EN, + qp->incoming_atomic_en); + + SET_FIELD(p_ramrod->flags, + ROCE_CREATE_QP_RESP_RAMROD_DATA_E2E_FLOW_CONTROL_EN, + qp->e2e_flow_control_en); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_RDMA_OPS_EN_FLG, + GET_FIELD(modify_flags, + QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_P_KEY_FLG, + GET_FIELD(modify_flags, QED_ROCE_MODIFY_QP_VALID_PKEY)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_ADDRESS_VECTOR_FLG, + GET_FIELD(modify_flags, + QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_MAX_IRD_FLG, + GET_FIELD(modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER_FLG, + GET_FIELD(modify_flags, + QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER)); + + p_ramrod->fields = 0; + SET_FIELD(p_ramrod->fields, + ROCE_MODIFY_QP_RESP_RAMROD_DATA_MIN_RNR_NAK_TIMER, + qp->min_rnr_nak_timer); + + p_ramrod->max_ird = qp->max_rd_atomic_resp; + p_ramrod->traffic_class = qp->traffic_class_tos; + p_ramrod->hop_limit = qp->hop_limit_ttl; + p_ramrod->p_key = cpu_to_le16(qp->pkey); + p_ramrod->flow_label = cpu_to_le32(qp->flow_label); + p_ramrod->mtu = cpu_to_le16(qp->mtu); + qed_rdma_copy_gids(qp, p_ramrod->src_gid, p_ramrod->dst_gid); + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Modify responder, rc = %d\n", rc); + return rc; +} + +static int qed_roce_sp_modify_requester(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + bool move_to_sqd, + bool move_to_err, u32 modify_flags) +{ + struct roce_modify_qp_req_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = 
%08x\n", qp->icid); + + if (move_to_err && !(qp->req_offloaded)) + return 0; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid + 1; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ROCE_EVENT_MODIFY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) { + DP_NOTICE(p_hwfn, "rc = %d\n", rc); + return rc; + } + + p_ramrod = &p_ent->ramrod.roce_modify_qp_req; + + p_ramrod->flags = 0; + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_MOVE_TO_ERR_FLG, move_to_err); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_MOVE_TO_SQD_FLG, move_to_sqd); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_EN_SQD_ASYNC_NOTIFY, + qp->sqd_async); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_P_KEY_FLG, + GET_FIELD(modify_flags, QED_ROCE_MODIFY_QP_VALID_PKEY)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_ADDRESS_VECTOR_FLG, + GET_FIELD(modify_flags, + QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_MAX_ORD_FLG, + GET_FIELD(modify_flags, + QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_RNR_NAK_CNT_FLG, + GET_FIELD(modify_flags, + QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT_FLG, + GET_FIELD(modify_flags, QED_ROCE_MODIFY_QP_VALID_RETRY_CNT)); + + SET_FIELD(p_ramrod->flags, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_ACK_TIMEOUT_FLG, + GET_FIELD(modify_flags, + QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT)); + + p_ramrod->fields = 0; + SET_FIELD(p_ramrod->fields, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_ERR_RETRY_CNT, qp->retry_cnt); + + SET_FIELD(p_ramrod->fields, + ROCE_MODIFY_QP_REQ_RAMROD_DATA_RNR_NAK_CNT, + qp->rnr_retry_cnt); + + p_ramrod->max_ord = qp->max_rd_atomic_req; + p_ramrod->traffic_class = qp->traffic_class_tos; + p_ramrod->hop_limit = qp->hop_limit_ttl; + p_ramrod->p_key = cpu_to_le16(qp->pkey); + p_ramrod->flow_label = cpu_to_le32(qp->flow_label); + p_ramrod->ack_timeout_val = cpu_to_le32(qp->ack_timeout); + p_ramrod->mtu = cpu_to_le16(qp->mtu); + qed_rdma_copy_gids(qp, p_ramrod->src_gid, p_ramrod->dst_gid); + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Modify requester, rc = %d\n", rc); + return rc; +} + +static int qed_roce_sp_destroy_qp_responder(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + u32 *num_invalidated_mw, + u32 *cq_prod) +{ + struct roce_destroy_qp_resp_output_params *p_ramrod_res; + struct roce_destroy_qp_resp_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + dma_addr_t ramrod_res_phys; + int rc; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + *num_invalidated_mw = 0; + *cq_prod = qp->cq_prod; + + if (!qp->resp_offloaded) { + /* If a responder was never offload, we need to free the cids + * allocated in create_qp as a FW async event will never arrive + */ + u32 cid; + + cid = qp->icid - + qed_cxt_get_proto_cid_start(p_hwfn, + p_hwfn->p_rdma_info->proto); + qed_roce_free_cid_pair(p_hwfn, (u16)cid); + + return 0; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ROCE_RAMROD_DESTROY_QP, + PROTOCOLID_ROCE, 
&init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.roce_destroy_qp_resp; + + p_ramrod_res = (struct roce_destroy_qp_resp_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_ramrod_res), + &ramrod_res_phys, GFP_KERNEL); + + if (!p_ramrod_res) { + rc = -ENOMEM; + DP_NOTICE(p_hwfn, + "qed destroy responder failed: cannot allocate memory (ramrod). rc = %d\n", + rc); + return rc; + } + + DMA_REGPAIR_LE(p_ramrod->output_params_addr, ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err; + + *num_invalidated_mw = le32_to_cpu(p_ramrod_res->num_invalidated_mw); + *cq_prod = le32_to_cpu(p_ramrod_res->cq_prod); + qp->cq_prod = *cq_prod; + + /* Free IRQ - only if ramrod succeeded, in case FW is still using it */ + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + qp->irq_num_pages * RDMA_RING_PAGE_SIZE, + qp->irq, qp->irq_phys_addr); + + qp->resp_offloaded = false; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Destroy responder, rc = %d\n", rc); + +err: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct roce_destroy_qp_resp_output_params), + p_ramrod_res, ramrod_res_phys); + + return rc; +} + +static int qed_roce_sp_destroy_qp_requester(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + u32 *num_bound_mw) +{ + struct roce_destroy_qp_req_output_params *p_ramrod_res; + struct roce_destroy_qp_req_ramrod_data *p_ramrod; + struct qed_sp_init_data init_data; + struct qed_spq_entry *p_ent; + dma_addr_t ramrod_res_phys; + int rc = -ENOMEM; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); + + if (!qp->req_offloaded) + return 0; + + p_ramrod_res = (struct roce_destroy_qp_req_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*p_ramrod_res), + &ramrod_res_phys, GFP_KERNEL); + if (!p_ramrod_res) { + DP_NOTICE(p_hwfn, + "qed destroy requester failed: cannot allocate memory (ramrod)\n"); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid + 1; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, ROCE_RAMROD_DESTROY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + goto err; + + p_ramrod = &p_ent->ramrod.roce_destroy_qp_req; + DMA_REGPAIR_LE(p_ramrod->output_params_addr, ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err; + + *num_bound_mw = le32_to_cpu(p_ramrod_res->num_bound_mw); + + /* Free ORQ - only if ramrod succeeded, in case FW is still using it */ + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + qp->orq_num_pages * RDMA_RING_PAGE_SIZE, + qp->orq, qp->orq_phys_addr); + + qp->req_offloaded = false; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Destroy requester, rc = %d\n", rc); + +err: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_ramrod_res), + p_ramrod_res, ramrod_res_phys); + + return rc; +} + +int qed_roce_query_qp(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *out_params) +{ + struct roce_query_qp_resp_output_params *p_resp_ramrod_res; + struct roce_query_qp_req_output_params *p_req_ramrod_res; + struct roce_query_qp_resp_ramrod_data *p_resp_ramrod; + struct roce_query_qp_req_ramrod_data *p_req_ramrod; + struct qed_sp_init_data init_data; + dma_addr_t resp_ramrod_res_phys; + dma_addr_t req_ramrod_res_phys; + struct qed_spq_entry *p_ent; + bool rq_err_state; + bool sq_err_state; + bool sq_draining; + int rc = -ENOMEM; + + if ((!(qp->resp_offloaded)) && (!(qp->req_offloaded))) { + 
/* We can't send ramrod to the fw since this qp wasn't offloaded + * to the fw yet + */ + out_params->draining = false; + out_params->rq_psn = qp->rq_psn; + out_params->sq_psn = qp->sq_psn; + out_params->state = qp->cur_state; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "No QPs as no offload\n"); + return 0; + } + + if (!(qp->resp_offloaded)) { + DP_NOTICE(p_hwfn, + "The responder's qp should be offloaded before requester's\n"); + return -EINVAL; + } + + /* Send a query responder ramrod to FW to get RQ-PSN and state */ + p_resp_ramrod_res = (struct roce_query_qp_resp_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*p_resp_ramrod_res), + &resp_ramrod_res_phys, GFP_KERNEL); + if (!p_resp_ramrod_res) { + DP_NOTICE(p_hwfn, + "qed query qp failed: cannot allocate memory (ramrod)\n"); + return rc; + } + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qp->icid; + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + rc = qed_sp_init_request(p_hwfn, &p_ent, ROCE_RAMROD_QUERY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + goto err_resp; + + p_resp_ramrod = &p_ent->ramrod.roce_query_qp_resp; + DMA_REGPAIR_LE(p_resp_ramrod->output_params_addr, resp_ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err_resp; + + out_params->rq_psn = le32_to_cpu(p_resp_ramrod_res->psn); + rq_err_state = GET_FIELD(le32_to_cpu(p_resp_ramrod_res->err_flag), + ROCE_QUERY_QP_RESP_OUTPUT_PARAMS_ERROR_FLG); + + dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_resp_ramrod_res), + p_resp_ramrod_res, resp_ramrod_res_phys); + + if (!(qp->req_offloaded)) { + /* Don't send query qp for the requester */ + out_params->sq_psn = qp->sq_psn; + out_params->draining = false; + + if (rq_err_state) + qp->cur_state = QED_ROCE_QP_STATE_ERR; + + out_params->state = qp->cur_state; + + return 0; + } + + /* Send a query requester ramrod to FW to get SQ-PSN and state */ + p_req_ramrod_res = (struct roce_query_qp_req_output_params *) + dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(*p_req_ramrod_res), + &req_ramrod_res_phys, + GFP_KERNEL); + if (!p_req_ramrod_res) { + rc = -ENOMEM; + DP_NOTICE(p_hwfn, + "qed query qp failed: cannot allocate memory (ramrod)\n"); + return rc; + } + + /* Get SPQ entry */ + init_data.cid = qp->icid + 1; + rc = qed_sp_init_request(p_hwfn, &p_ent, ROCE_RAMROD_QUERY_QP, + PROTOCOLID_ROCE, &init_data); + if (rc) + goto err_req; + + p_req_ramrod = &p_ent->ramrod.roce_query_qp_req; + DMA_REGPAIR_LE(p_req_ramrod->output_params_addr, req_ramrod_res_phys); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + goto err_req; + + out_params->sq_psn = le32_to_cpu(p_req_ramrod_res->psn); + sq_err_state = GET_FIELD(le32_to_cpu(p_req_ramrod_res->flags), + ROCE_QUERY_QP_REQ_OUTPUT_PARAMS_ERR_FLG); + sq_draining = + GET_FIELD(le32_to_cpu(p_req_ramrod_res->flags), + ROCE_QUERY_QP_REQ_OUTPUT_PARAMS_SQ_DRAINING_FLG); + + dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_req_ramrod_res), + p_req_ramrod_res, req_ramrod_res_phys); + + out_params->draining = false; + + if (rq_err_state || sq_err_state) + qp->cur_state = QED_ROCE_QP_STATE_ERR; + else if (sq_draining) + out_params->draining = true; + out_params->state = qp->cur_state; + + return 0; + +err_req: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_req_ramrod_res), + p_req_ramrod_res, req_ramrod_res_phys); + return rc; +err_resp: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, sizeof(*p_resp_ramrod_res), + p_resp_ramrod_res, 
resp_ramrod_res_phys); + return rc; +} + +int qed_roce_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) +{ + u32 num_invalidated_mw = 0; + u32 num_bound_mw = 0; + u32 cq_prod; + int rc; + + /* Destroys the specified QP */ + if ((qp->cur_state != QED_ROCE_QP_STATE_RESET) && + (qp->cur_state != QED_ROCE_QP_STATE_ERR) && + (qp->cur_state != QED_ROCE_QP_STATE_INIT)) { + DP_NOTICE(p_hwfn, + "QP must be in error, reset or init state before destroying it\n"); + return -EINVAL; + } + + if (qp->cur_state != QED_ROCE_QP_STATE_RESET) { + rc = qed_roce_sp_destroy_qp_responder(p_hwfn, qp, + &num_invalidated_mw, + &cq_prod); + if (rc) + return rc; + + /* Send destroy requester ramrod */ + rc = qed_roce_sp_destroy_qp_requester(p_hwfn, qp, + &num_bound_mw); + if (rc) + return rc; + + if (num_invalidated_mw != num_bound_mw) { + DP_NOTICE(p_hwfn, + "number of invalidate memory windows is different from bounded ones\n"); + return -EINVAL; + } + } + + return 0; +} + +int qed_roce_modify_qp(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + enum qed_roce_qp_state prev_state, + struct qed_rdma_modify_qp_in_params *params) +{ + u32 num_invalidated_mw = 0, num_bound_mw = 0; + int rc = 0; + + /* Perform additional operations according to the current state and the + * next state + */ + if (((prev_state == QED_ROCE_QP_STATE_INIT) || + (prev_state == QED_ROCE_QP_STATE_RESET)) && + (qp->cur_state == QED_ROCE_QP_STATE_RTR)) { + /* Init->RTR or Reset->RTR */ + rc = qed_roce_sp_create_responder(p_hwfn, qp); + return rc; + } else if ((prev_state == QED_ROCE_QP_STATE_RTR) && + (qp->cur_state == QED_ROCE_QP_STATE_RTS)) { + /* RTR-> RTS */ + rc = qed_roce_sp_create_requester(p_hwfn, qp); + if (rc) + return rc; + + /* Send modify responder ramrod */ + rc = qed_roce_sp_modify_responder(p_hwfn, qp, false, + params->modify_flags); + return rc; + } else if ((prev_state == QED_ROCE_QP_STATE_RTS) && + (qp->cur_state == QED_ROCE_QP_STATE_RTS)) { + /* RTS->RTS */ + rc = qed_roce_sp_modify_responder(p_hwfn, qp, false, + params->modify_flags); + if (rc) + return rc; + + rc = qed_roce_sp_modify_requester(p_hwfn, qp, false, false, + params->modify_flags); + return rc; + } else if ((prev_state == QED_ROCE_QP_STATE_RTS) && + (qp->cur_state == QED_ROCE_QP_STATE_SQD)) { + /* RTS->SQD */ + rc = qed_roce_sp_modify_requester(p_hwfn, qp, true, false, + params->modify_flags); + return rc; + } else if ((prev_state == QED_ROCE_QP_STATE_SQD) && + (qp->cur_state == QED_ROCE_QP_STATE_SQD)) { + /* SQD->SQD */ + rc = qed_roce_sp_modify_responder(p_hwfn, qp, false, + params->modify_flags); + if (rc) + return rc; + + rc = qed_roce_sp_modify_requester(p_hwfn, qp, false, false, + params->modify_flags); + return rc; + } else if ((prev_state == QED_ROCE_QP_STATE_SQD) && + (qp->cur_state == QED_ROCE_QP_STATE_RTS)) { + /* SQD->RTS */ + rc = qed_roce_sp_modify_responder(p_hwfn, qp, false, + params->modify_flags); + if (rc) + return rc; + + rc = qed_roce_sp_modify_requester(p_hwfn, qp, false, false, + params->modify_flags); + + return rc; + } else if (qp->cur_state == QED_ROCE_QP_STATE_ERR) { + /* ->ERR */ + rc = qed_roce_sp_modify_responder(p_hwfn, qp, true, + params->modify_flags); + if (rc) + return rc; + + rc = qed_roce_sp_modify_requester(p_hwfn, qp, false, true, + params->modify_flags); + return rc; + } else if (qp->cur_state == QED_ROCE_QP_STATE_RESET) { + /* Any state -> RESET */ + u32 cq_prod; + + /* Send destroy responder ramrod */ + rc = qed_roce_sp_destroy_qp_responder(p_hwfn, + qp, + &num_invalidated_mw, + &cq_prod); + + if (rc) + return 
rc; + + qp->cq_prod = cq_prod; + + rc = qed_roce_sp_destroy_qp_requester(p_hwfn, qp, + &num_bound_mw); + + if (num_invalidated_mw != num_bound_mw) { + DP_NOTICE(p_hwfn, + "number of invalidate memory windows is different from bounded ones\n"); + return -EINVAL; + } + } else { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "0\n"); + } + + return rc; +} + +static void qed_roce_free_real_icid(struct qed_hwfn *p_hwfn, u16 icid) +{ + struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; + u32 start_cid, cid, xcid; + + /* an even icid belongs to a responder while an odd icid belongs to a + * requester. The 'cid' received as an input can be either. We calculate + * the "partner" icid and call it xcid. Only if both are free then the + * "cid" map can be cleared. + */ + start_cid = qed_cxt_get_proto_cid_start(p_hwfn, p_rdma_info->proto); + cid = icid - start_cid; + xcid = cid ^ 1; + + spin_lock_bh(&p_rdma_info->lock); + + qed_bmap_release_id(p_hwfn, &p_rdma_info->real_cid_map, cid); + if (qed_bmap_test_id(p_hwfn, &p_rdma_info->real_cid_map, xcid) == 0) { + qed_bmap_release_id(p_hwfn, &p_rdma_info->cid_map, cid); + qed_bmap_release_id(p_hwfn, &p_rdma_info->cid_map, xcid); + } + + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +void qed_roce_dpm_dcbx(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u8 val; + + /* if any QPs are already active, we want to disable DPM, since their + * context information contains information from before the latest DCBx + * update. Otherwise enable it. + */ + val = qed_rdma_allocated_qps(p_hwfn) ? true : false; + p_hwfn->dcbx_no_edpm = (u8)val; + + qed_rdma_dpm_conf(p_hwfn, p_ptt); +} + +int qed_roce_setup(struct qed_hwfn *p_hwfn) +{ + return qed_spq_register_async_cb(p_hwfn, PROTOCOLID_ROCE, + qed_roce_async_event); +} + +int qed_roce_init_hw(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 ll2_ethertype_en; + + qed_wr(p_hwfn, p_ptt, PRS_REG_ROCE_DEST_QP_MAX_PF, 0); + + p_hwfn->rdma_prs_search_reg = PRS_REG_SEARCH_ROCE; + + ll2_ethertype_en = qed_rd(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN); + qed_wr(p_hwfn, p_ptt, PRS_REG_LIGHT_L2_ETHERTYPE_EN, + (ll2_ethertype_en | 0x01)); + + if (qed_cxt_get_proto_cid_start(p_hwfn, PROTOCOLID_ROCE) % 2) { + DP_NOTICE(p_hwfn, "The first RoCE's cid should be even\n"); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Initializing HW - Done\n"); + return 0; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.h b/drivers/net/ethernet/qlogic/qed/qed_roce.h new file mode 100644 index 0000000..f801f39 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.h @@ -0,0 +1,59 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _QED_ROCE_H +#define _QED_ROCE_H +#include +#include + +#if IS_ENABLED(CONFIG_QED_RDMA) +void qed_roce_dpm_dcbx(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); +#else +static inline void qed_roce_dpm_dcbx(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) {} +#endif + +int qed_roce_setup(struct qed_hwfn *p_hwfn); +void qed_roce_stop(struct qed_hwfn *p_hwfn); +int qed_roce_init_hw(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); +int qed_roce_alloc_cid(struct qed_hwfn *p_hwfn, u16 *cid); +int qed_roce_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp); + +int qed_roce_query_qp(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *out_params); + +int qed_roce_modify_qp(struct qed_hwfn *p_hwfn, + struct qed_rdma_qp *qp, + enum qed_roce_qp_state prev_state, + struct qed_rdma_modify_qp_in_params *params); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_selftest.c b/drivers/net/ethernet/qlogic/qed/qed_selftest.c new file mode 100644 index 0000000..cf1d447 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_selftest.c @@ -0,0 +1,211 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "qed.h" +#include "qed_dev_api.h" +#include "qed_mcp.h" +#include "qed_sp.h" +#include "qed_selftest.h" + +int qed_selftest_memory(struct qed_dev *cdev) +{ + int rc = 0, i; + + for_each_hwfn(cdev, i) { + rc = qed_sp_heartbeat_ramrod(&cdev->hwfns[i]); + if (rc) + return rc; + } + + return rc; +} + +int qed_selftest_interrupt(struct qed_dev *cdev) +{ + int rc = 0, i; + + for_each_hwfn(cdev, i) { + rc = qed_sp_heartbeat_ramrod(&cdev->hwfns[i]); + if (rc) + return rc; + } + + return rc; +} + +int qed_selftest_register(struct qed_dev *cdev) +{ + struct qed_hwfn *p_hwfn; + struct qed_ptt *p_ptt; + int rc = 0, i; + + /* although performed by MCP, this test is per engine */ + for_each_hwfn(cdev, i) { + p_hwfn = &cdev->hwfns[i]; + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_ERR(p_hwfn, "failed to acquire ptt\n"); + return -EBUSY; + } + rc = qed_mcp_bist_register_test(p_hwfn, p_ptt); + qed_ptt_release(p_hwfn, p_ptt); + if (rc) + break; + } + + return rc; +} + +int qed_selftest_clock(struct qed_dev *cdev) +{ + struct qed_hwfn *p_hwfn; + struct qed_ptt *p_ptt; + int rc = 0, i; + + /* although performed by MCP, this test is per engine */ + for_each_hwfn(cdev, i) { + p_hwfn = &cdev->hwfns[i]; + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_ERR(p_hwfn, "failed to acquire ptt\n"); + return -EBUSY; + } + rc = qed_mcp_bist_clock_test(p_hwfn, p_ptt); + qed_ptt_release(p_hwfn, p_ptt); + if (rc) + break; + } + + return rc; +} + +int qed_selftest_nvram(struct qed_dev *cdev) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *p_ptt = qed_ptt_acquire(p_hwfn); + u32 num_images, i, j, nvm_crc, calc_crc; + struct bist_nvm_image_att image_att; + u8 *buf = NULL; + __be32 val; + int rc; + + if (!p_ptt) { + DP_ERR(p_hwfn, "failed to acquire ptt\n"); + return -EBUSY; + } + + /* Acquire from MFW the amount of available images */ + rc = qed_mcp_bist_nvm_get_num_images(p_hwfn, p_ptt, &num_images); + if (rc || !num_images) { + DP_ERR(p_hwfn, "Failed getting number of images\n"); + rc = -EINVAL; + goto err0; + } + + /* Iterate over images and validate CRC */ + for (i = 0; i < num_images; i++) { + /* This mailbox returns information about the image required for + * reading it. + */ + rc = qed_mcp_bist_nvm_get_image_att(p_hwfn, p_ptt, + &image_att, i); + if (rc) { + DP_ERR(p_hwfn, + "Failed getting image index %d attributes\n", + i); + goto err0; + } + + /* After MFW crash dump is collected - the image's CRC stops + * being valid. + */ + if (image_att.image_type == NVM_TYPE_MDUMP) + continue; + + DP_VERBOSE(p_hwfn, QED_MSG_SP, "image index %d, size %x\n", + i, image_att.len); + + /* Allocate a buffer for holding the nvram image */ + buf = kzalloc(image_att.len, GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; + goto err0; + } + + /* Read image into buffer */ + rc = qed_mcp_nvm_read(p_hwfn->cdev, image_att.nvm_start_addr, + buf, image_att.len); + if (rc) { + DP_ERR(p_hwfn, + "Failed reading image index %d from nvm.\n", i); + goto err1; + } + + /* Convert the buffer into big-endian format (excluding the + * closing 4 bytes of CRC). + */ + for (j = 0; j < image_att.len - 4; j += 4) { + val = cpu_to_be32(*(u32 *)&buf[j]); + *(u32 *)&buf[j] = (__force u32)val; + } + + /* Calc CRC for the "actual" image buffer, i.e. not including + * the last 4 CRC bytes. 
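+	 *
+	 * Concretely (as the code below implements): nvm_crc is read
+	 * verbatim from the last 4 bytes of the image, while calc_crc is
+	 * crc32() (seed 0xffffffff) over the big-endian-converted payload,
+	 * then passed through cpu_to_be32() and bit-inverted before the
+	 * comparison.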
+ */ + nvm_crc = *(u32 *)(buf + image_att.len - 4); + calc_crc = crc32(0xffffffff, buf, image_att.len - 4); + calc_crc = (__force u32)~cpu_to_be32(calc_crc); + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "nvm crc 0x%x, calc_crc 0x%x\n", nvm_crc, calc_crc); + + if (calc_crc != nvm_crc) { + rc = -EINVAL; + goto err1; + } + + /* Done with this image; Free to prevent double release + * on subsequent failure. + */ + kfree(buf); + buf = NULL; + } + + qed_ptt_release(p_hwfn, p_ptt); + return 0; + +err1: + kfree(buf); +err0: + qed_ptt_release(p_hwfn, p_ptt); + return rc; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_selftest.h b/drivers/net/ethernet/qlogic/qed/qed_selftest.h new file mode 100644 index 0000000..ad00d08 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_selftest.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _QED_SELFTEST_API_H +#define _QED_SELFTEST_API_H +#include + +/** + * @brief qed_selftest_memory - Perform memory test + * + * @param cdev + * + * @return int + */ +int qed_selftest_memory(struct qed_dev *cdev); + +/** + * @brief qed_selftest_interrupt - Perform interrupt test + * + * @param cdev + * + * @return int + */ +int qed_selftest_interrupt(struct qed_dev *cdev); + +/** + * @brief qed_selftest_register - Perform register test + * + * @param cdev + * + * @return int + */ +int qed_selftest_register(struct qed_dev *cdev); + +/** + * @brief qed_selftest_clock - Perform clock test + * + * @param cdev + * + * @return int + */ +int qed_selftest_clock(struct qed_dev *cdev); + +/** + * @brief qed_selftest_nvram - Perform nvram test + * + * @param cdev + * + * @return int + */ +int qed_selftest_nvram(struct qed_dev *cdev); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h new file mode 100644 index 0000000..ab4ad8a --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h @@ -0,0 +1,483 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_SP_H +#define _QED_SP_H + +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_hsi.h" + +enum spq_mode { + QED_SPQ_MODE_BLOCK, /* Client will poll a designated mem. 
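/*
 * Aside on the CRC convention used by qed_selftest_nvram() above: each image
 * dword is byte-swapped to big-endian, CRC-32 runs over everything except the
 * trailing four bytes, and the bit-inverted, byte-swapped result must equal
 * the stored trailing word.  Below is a stand-alone user-space sketch of that
 * check, not part of the patch: it assumes a little-endian host, uses the
 * GCC/Clang __builtin_bswap32() in place of cpu_to_be32(), and a hand-rolled
 * reflected CRC-32 (poly 0xEDB88320, caller-supplied seed, no final
 * inversion), assumed here to match what the kernel crc32() helper computes.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32_le_seed(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1u));
	}
	return crc;
}

static bool nvm_image_crc_ok(uint8_t *img, size_t len)
{
	uint32_t stored, calc;

	if (len < 8 || len % 4)
		return false;

	/* Swap each payload dword to big-endian, as the driver loop does */
	for (size_t j = 0; j + 4 <= len - 4; j += 4) {
		uint32_t v;

		memcpy(&v, img + j, 4);
		v = __builtin_bswap32(v);
		memcpy(img + j, &v, 4);
	}

	memcpy(&stored, img + len - 4, 4);
	calc = crc32_le_seed(0xffffffff, img, len - 4);
	calc = ~__builtin_bswap32(calc);	/* mirrors ~cpu_to_be32() on LE */

	return calc == stored;
}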
address */ + QED_SPQ_MODE_CB, /* Client supplies a callback */ + QED_SPQ_MODE_EBLOCK, /* QED should block until completion */ +}; + +struct qed_spq_comp_cb { + void (*function)(struct qed_hwfn *, + void *, + union event_ring_data *, + u8 fw_return_code); + void *cookie; +}; + +/** + * @brief qed_eth_cqe_completion - handles the completion of a + * ramrod on the cqe ring + * + * @param p_hwfn + * @param cqe + * + * @return int + */ +int qed_eth_cqe_completion(struct qed_hwfn *p_hwfn, + struct eth_slow_path_rx_cqe *cqe); + +/** + * @file + * + * QED Slow-hwfn queue interface + */ + +union ramrod_data { + struct pf_start_ramrod_data pf_start; + struct pf_update_ramrod_data pf_update; + struct rx_queue_start_ramrod_data rx_queue_start; + struct rx_queue_update_ramrod_data rx_queue_update; + struct rx_queue_stop_ramrod_data rx_queue_stop; + struct tx_queue_start_ramrod_data tx_queue_start; + struct tx_queue_stop_ramrod_data tx_queue_stop; + struct vport_start_ramrod_data vport_start; + struct vport_stop_ramrod_data vport_stop; + struct rx_update_gft_filter_data rx_update_gft; + struct vport_update_ramrod_data vport_update; + struct core_rx_start_ramrod_data core_rx_queue_start; + struct core_rx_stop_ramrod_data core_rx_queue_stop; + struct core_tx_start_ramrod_data core_tx_queue_start; + struct core_tx_stop_ramrod_data core_tx_queue_stop; + struct vport_filter_update_ramrod_data vport_filter_update; + + struct rdma_init_func_ramrod_data rdma_init_func; + struct rdma_close_func_ramrod_data rdma_close_func; + struct rdma_register_tid_ramrod_data rdma_register_tid; + struct rdma_deregister_tid_ramrod_data rdma_deregister_tid; + struct roce_create_qp_resp_ramrod_data roce_create_qp_resp; + struct roce_create_qp_req_ramrod_data roce_create_qp_req; + struct roce_modify_qp_resp_ramrod_data roce_modify_qp_resp; + struct roce_modify_qp_req_ramrod_data roce_modify_qp_req; + struct roce_query_qp_resp_ramrod_data roce_query_qp_resp; + struct roce_query_qp_req_ramrod_data roce_query_qp_req; + struct roce_destroy_qp_resp_ramrod_data roce_destroy_qp_resp; + struct roce_destroy_qp_req_ramrod_data roce_destroy_qp_req; + struct roce_init_func_ramrod_data roce_init_func; + struct rdma_create_cq_ramrod_data rdma_create_cq; + struct rdma_destroy_cq_ramrod_data rdma_destroy_cq; + struct rdma_srq_create_ramrod_data rdma_create_srq; + struct rdma_srq_destroy_ramrod_data rdma_destroy_srq; + struct rdma_srq_modify_ramrod_data rdma_modify_srq; + struct iwarp_create_qp_ramrod_data iwarp_create_qp; + struct iwarp_tcp_offload_ramrod_data iwarp_tcp_offload; + struct iwarp_mpa_offload_ramrod_data iwarp_mpa_offload; + struct iwarp_modify_qp_ramrod_data iwarp_modify_qp; + struct iwarp_init_func_ramrod_data iwarp_init_func; + struct fcoe_init_ramrod_params fcoe_init; + struct fcoe_conn_offload_ramrod_params fcoe_conn_ofld; + struct fcoe_conn_terminate_ramrod_params fcoe_conn_terminate; + struct fcoe_stat_ramrod_params fcoe_stat; + + struct iscsi_slow_path_hdr iscsi_empty; + struct iscsi_init_ramrod_params iscsi_init; + struct iscsi_spe_func_dstry iscsi_destroy; + struct iscsi_spe_conn_offload iscsi_conn_offload; + struct iscsi_conn_update_ramrod_params iscsi_conn_update; + struct iscsi_spe_conn_mac_update iscsi_conn_mac_update; + struct iscsi_spe_conn_termination iscsi_conn_terminate; + + struct vf_start_ramrod_data vf_start; + struct vf_stop_ramrod_data vf_stop; +}; + +#define EQ_MAX_CREDIT 0xffffffff + +enum spq_priority { + QED_SPQ_PRIORITY_NORMAL, + QED_SPQ_PRIORITY_HIGH, +}; + +union qed_spq_req_comp { + struct 
qed_spq_comp_cb cb; + u64 *done_addr; +}; + +struct qed_spq_comp_done { + unsigned int done; + u8 fw_return_code; +}; + +struct qed_spq_entry { + struct list_head list; + + u8 flags; + + /* HSI slow path element */ + struct slow_path_element elem; + + union ramrod_data ramrod; + + enum spq_priority priority; + + /* pending queue for this entry */ + struct list_head *queue; + + enum spq_mode comp_mode; + struct qed_spq_comp_cb comp_cb; + struct qed_spq_comp_done comp_done; /* SPQ_MODE_EBLOCK */ +}; + +struct qed_eq { + struct qed_chain chain; + u8 eq_sb_index; /* index within the SB */ + __le16 *p_fw_cons; /* ptr to index value */ +}; + +struct qed_consq { + struct qed_chain chain; +}; + +typedef int +(*qed_spq_async_comp_cb)(struct qed_hwfn *p_hwfn, + u8 opcode, + u16 echo, + union event_ring_data *data, + u8 fw_return_code); + +int +qed_spq_register_async_cb(struct qed_hwfn *p_hwfn, + enum protocol_type protocol_id, + qed_spq_async_comp_cb cb); + +void +qed_spq_unregister_async_cb(struct qed_hwfn *p_hwfn, + enum protocol_type protocol_id); + +struct qed_spq { + spinlock_t lock; /* SPQ lock */ + + struct list_head unlimited_pending; + struct list_head pending; + struct list_head completion_pending; + struct list_head free_pool; + + struct qed_chain chain; + + /* allocated dma-able memory for spq entries (+ramrod data) */ + dma_addr_t p_phys; + struct qed_spq_entry *p_virt; + +#define SPQ_RING_SIZE \ + (CORE_SPQE_PAGE_SIZE_BYTES / sizeof(struct slow_path_element)) + + /* Bitmap for handling out-of-order completions */ + DECLARE_BITMAP(p_comp_bitmap, SPQ_RING_SIZE); + u8 comp_bitmap_idx; + + /* Statistics */ + u32 unlimited_pending_count; + u32 normal_count; + u32 high_count; + u32 comp_sent_count; + u32 comp_count; + + u32 cid; + qed_spq_async_comp_cb async_comp_cb[MAX_PROTOCOL_TYPE]; +}; + +/** + * @brief qed_spq_post - Posts a Slow hwfn request to FW, or lacking that + * Pends it to the future list. + * + * @param p_hwfn + * @param p_req + * + * @return int + */ +int qed_spq_post(struct qed_hwfn *p_hwfn, + struct qed_spq_entry *p_ent, + u8 *fw_return_code); + +/** + * @brief qed_spq_allocate - Alloocates & initializes the SPQ and EQ. + * + * @param p_hwfn + * + * @return int + */ +int qed_spq_alloc(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_spq_setup - Reset the SPQ to its start state. + * + * @param p_hwfn + */ +void qed_spq_setup(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_spq_deallocate - Deallocates the given SPQ struct. + * + * @param p_hwfn + */ +void qed_spq_free(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_spq_get_entry - Obtain an entrry from the spq + * free pool list. + * + * + * + * @param p_hwfn + * @param pp_ent + * + * @return int + */ +int +qed_spq_get_entry(struct qed_hwfn *p_hwfn, + struct qed_spq_entry **pp_ent); + +/** + * @brief qed_spq_return_entry - Return an entry to spq free + * pool list + * + * @param p_hwfn + * @param p_ent + */ +void qed_spq_return_entry(struct qed_hwfn *p_hwfn, + struct qed_spq_entry *p_ent); +/** + * @brief qed_eq_allocate - Allocates & initializes an EQ struct + * + * @param p_hwfn + * @param num_elem number of elements in the eq + * + * @return int + */ +int qed_eq_alloc(struct qed_hwfn *p_hwfn, u16 num_elem); + +/** + * @brief qed_eq_setup - Reset the EQ to its start state. + * + * @param p_hwfn + */ +void qed_eq_setup(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_eq_free - deallocates the given EQ struct. 
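/*
 * Usage note for the asynchronous-completion hooks declared above: EQ entries
 * flagged as asynchronous are dispatched per protocol ID, and protocol code
 * (SR-IOV later in this patch, for example) supplies a handler with the
 * qed_spq_async_comp_cb signature.  A minimal sketch of the pattern follows;
 * the function names are hypothetical and PROTOCOLID_COMMON is used purely as
 * an example ID.
 */
#include "qed.h"
#include "qed_sp.h"

static int example_async_eqe_handler(struct qed_hwfn *p_hwfn,
				     u8 opcode,
				     u16 echo,
				     union event_ring_data *data,
				     u8 fw_return_code)
{
	DP_VERBOSE(p_hwfn, QED_MSG_SPQ,
		   "async EQE: op %02x echo %04x fw_ret %02x\n",
		   opcode, echo, fw_return_code);
	return 0;
}

static int example_hook_async_events(struct qed_hwfn *p_hwfn)
{
	/* One callback slot exists per protocol ID in the SPQ */
	return qed_spq_register_async_cb(p_hwfn, PROTOCOLID_COMMON,
					 example_async_eqe_handler);
}

static void example_unhook_async_events(struct qed_hwfn *p_hwfn)
{
	qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_COMMON);
}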
+ * + * @param p_hwfn + */ +void qed_eq_free(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_eq_prod_update - update the FW with default EQ producer + * + * @param p_hwfn + * @param prod + */ +void qed_eq_prod_update(struct qed_hwfn *p_hwfn, + u16 prod); + +/** + * @brief qed_eq_completion - Completes currently pending EQ elements + * + * @param p_hwfn + * @param cookie + * + * @return int + */ +int qed_eq_completion(struct qed_hwfn *p_hwfn, + void *cookie); + +/** + * @brief qed_spq_completion - Completes a single event + * + * @param p_hwfn + * @param echo - echo value from cookie (used for determining completion) + * @param p_data - data from cookie (used in callback function if applicable) + * + * @return int + */ +int qed_spq_completion(struct qed_hwfn *p_hwfn, + __le16 echo, + u8 fw_return_code, + union event_ring_data *p_data); + +/** + * @brief qed_spq_get_cid - Given p_hwfn, return cid for the hwfn's SPQ + * + * @param p_hwfn + * + * @return u32 - SPQ CID + */ +u32 qed_spq_get_cid(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_consq_alloc - Allocates & initializes an ConsQ + * struct + * + * @param p_hwfn + * + * @return int + */ +int qed_consq_alloc(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_consq_setup - Reset the ConsQ to its start state. + * + * @param p_hwfn + */ +void qed_consq_setup(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_consq_free - deallocates the given ConsQ struct. + * + * @param p_hwfn + */ +void qed_consq_free(struct qed_hwfn *p_hwfn); + +/** + * @file + * + * @brief Slow-hwfn low-level commands (Ramrods) function definitions. + */ + +#define QED_SP_EQ_COMPLETION 0x01 +#define QED_SP_CQE_COMPLETION 0x02 + +struct qed_sp_init_data { + u32 cid; + u16 opaque_fid; + + /* Information regarding operation upon sending & completion */ + enum spq_mode comp_mode; + struct qed_spq_comp_cb *p_comp_data; +}; + +int qed_sp_init_request(struct qed_hwfn *p_hwfn, + struct qed_spq_entry **pp_ent, + u8 cmd, + u8 protocol, + struct qed_sp_init_data *p_data); + +/** + * @brief qed_sp_pf_start - PF Function Start Ramrod + * + * This ramrod is sent to initialize a physical function (PF). It will + * configure the function related parameters and write its completion to the + * event ring specified in the parameters. + * + * Ramrods complete on the common event ring for the PF. This ring is + * allocated by the driver on host memory and its parameters are written + * to the internal RAM of the UStorm by the Function Start Ramrod. + * + * @param p_hwfn + * @param p_ptt + * @param p_tunn + * @param mode + * @param allow_npar_tx_switch + * + * @return int + */ + +int qed_sp_pf_start(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_tunnel_info *p_tunn, + enum qed_mf_mode mode, bool allow_npar_tx_switch); + +/** + * @brief qed_sp_pf_update - PF Function Update Ramrod + * + * This ramrod updates function-related parameters. Every parameter can be + * updated independently, according to configuration flags. + * + * @param p_hwfn + * + * @return int + */ + +int qed_sp_pf_update(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_sp_pf_update_stag - Update firmware of new outer tag + * + * @param p_hwfn + * + * @return int + */ +int qed_sp_pf_update_stag(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_sp_pf_stop - PF Function Stop Ramrod + * + * This ramrod is sent to close a Physical Function (PF). It is the last ramrod + * sent and the last completion written to the PFs Event Ring. This ramrod also + * deletes the context for the Slowhwfn connection on this PF. 
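/*
 * Usage illustration for the request API declared above (not part of the
 * patch; function names are hypothetical): every slow-path command follows
 * the same sequence of qed_spq_get_cid(), qed_sp_init_request() and
 * qed_spq_post().  The sketch below posts a payload-free ramrod, borrowing
 * COMMON_RAMROD_EMPTY from the heartbeat test, and asks for a
 * QED_SPQ_MODE_CB completion callback instead of blocking.
 */
#include <linux/string.h>
#include "qed.h"
#include "qed_sp.h"

static void example_ramrod_done(struct qed_hwfn *p_hwfn, void *cookie,
				union event_ring_data *data, u8 fw_return_code)
{
	DP_VERBOSE(p_hwfn, QED_MSG_SPQ,
		   "example ramrod completed, fw_ret %02x\n", fw_return_code);
}

static int example_post_cb_ramrod(struct qed_hwfn *p_hwfn)
{
	struct qed_spq_comp_cb comp_cb = {
		.function = example_ramrod_done,
		.cookie = p_hwfn,
	};
	struct qed_spq_entry *p_ent = NULL;
	struct qed_sp_init_data init_data;
	int rc;

	memset(&init_data, 0, sizeof(init_data));
	init_data.cid = qed_spq_get_cid(p_hwfn);
	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
	init_data.comp_mode = QED_SPQ_MODE_CB;
	init_data.p_comp_data = &comp_cb;	/* copied by qed_sp_init_request() */

	rc = qed_sp_init_request(p_hwfn, &p_ent, COMMON_RAMROD_EMPTY,
				 PROTOCOLID_COMMON, &init_data);
	if (rc)
		return rc;

	/* For ramrods that carry data, p_ent->ramrod.* would be filled here */
	return qed_spq_post(p_hwfn, p_ent, NULL);
}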
+ * + * @note Not required for first packet. + * + * @param p_hwfn + * + * @return int + */ + +int qed_sp_pf_stop(struct qed_hwfn *p_hwfn); + +int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_tunnel_info *p_tunn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data); +/** + * @brief qed_sp_heartbeat_ramrod - Send empty Ramrod + * + * @param p_hwfn + * + * @return int + */ + +int qed_sp_heartbeat_ramrod(struct qed_hwfn *p_hwfn); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c new file mode 100644 index 0000000..5e927b6 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c @@ -0,0 +1,543 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include +#include "qed_cxt.h" +#include "qed_dcbx.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_int.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" +#include "qed_sriov.h" + +int qed_sp_init_request(struct qed_hwfn *p_hwfn, + struct qed_spq_entry **pp_ent, + u8 cmd, u8 protocol, struct qed_sp_init_data *p_data) +{ + u32 opaque_cid = p_data->opaque_fid << 16 | p_data->cid; + struct qed_spq_entry *p_ent = NULL; + int rc; + + if (!pp_ent) + return -ENOMEM; + + rc = qed_spq_get_entry(p_hwfn, pp_ent); + + if (rc) + return rc; + + p_ent = *pp_ent; + + p_ent->elem.hdr.cid = cpu_to_le32(opaque_cid); + p_ent->elem.hdr.cmd_id = cmd; + p_ent->elem.hdr.protocol_id = protocol; + + p_ent->priority = QED_SPQ_PRIORITY_NORMAL; + p_ent->comp_mode = p_data->comp_mode; + p_ent->comp_done.done = 0; + + switch (p_ent->comp_mode) { + case QED_SPQ_MODE_EBLOCK: + p_ent->comp_cb.cookie = &p_ent->comp_done; + break; + + case QED_SPQ_MODE_BLOCK: + if (!p_data->p_comp_data) + return -EINVAL; + + p_ent->comp_cb.cookie = p_data->p_comp_data->cookie; + break; + + case QED_SPQ_MODE_CB: + if (!p_data->p_comp_data) + p_ent->comp_cb.function = NULL; + else + p_ent->comp_cb = *p_data->p_comp_data; + break; + + default: + DP_NOTICE(p_hwfn, "Unknown SPQE completion mode %d\n", + p_ent->comp_mode); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, QED_MSG_SPQ, + "Initialized: CID %08x cmd %02x protocol %02x data_addr %lu comp_mode [%s]\n", + opaque_cid, cmd, protocol, + (unsigned long)&p_ent->ramrod, + D_TRINE(p_ent->comp_mode, QED_SPQ_MODE_EBLOCK, + QED_SPQ_MODE_BLOCK, "MODE_EBLOCK", "MODE_BLOCK", + "MODE_CB")); + + memset(&p_ent->ramrod, 0, sizeof(p_ent->ramrod)); + + return 0; +} + +static enum tunnel_clss qed_tunn_clss_to_fw_clss(u8 type) +{ + switch (type) { + case QED_TUNN_CLSS_MAC_VLAN: + return TUNNEL_CLSS_MAC_VLAN; + case QED_TUNN_CLSS_MAC_VNI: + return TUNNEL_CLSS_MAC_VNI; + case QED_TUNN_CLSS_INNER_MAC_VLAN: + return TUNNEL_CLSS_INNER_MAC_VLAN; + case QED_TUNN_CLSS_INNER_MAC_VNI: + return TUNNEL_CLSS_INNER_MAC_VNI; + case QED_TUNN_CLSS_MAC_VLAN_DUAL_STAGE: + return TUNNEL_CLSS_MAC_VLAN_DUAL_STAGE; + default: + return TUNNEL_CLSS_MAC_VLAN; + } +} + +static void +qed_set_pf_update_tunn_mode(struct qed_tunnel_info *p_tun, + struct qed_tunnel_info *p_src, bool b_pf_start) +{ + if (p_src->vxlan.b_update_mode || b_pf_start) + p_tun->vxlan.b_mode_enabled = p_src->vxlan.b_mode_enabled; + + if (p_src->l2_gre.b_update_mode || b_pf_start) + p_tun->l2_gre.b_mode_enabled = p_src->l2_gre.b_mode_enabled; + + if (p_src->ip_gre.b_update_mode || b_pf_start) + p_tun->ip_gre.b_mode_enabled = p_src->ip_gre.b_mode_enabled; + + if (p_src->l2_geneve.b_update_mode || b_pf_start) + p_tun->l2_geneve.b_mode_enabled = + p_src->l2_geneve.b_mode_enabled; + + if (p_src->ip_geneve.b_update_mode || b_pf_start) + p_tun->ip_geneve.b_mode_enabled = + p_src->ip_geneve.b_mode_enabled; +} + +static void qed_set_tunn_cls_info(struct qed_tunnel_info *p_tun, + struct qed_tunnel_info *p_src) +{ + enum tunnel_clss type; + + p_tun->b_update_rx_cls = p_src->b_update_rx_cls; + p_tun->b_update_tx_cls = p_src->b_update_tx_cls; + + type = qed_tunn_clss_to_fw_clss(p_src->vxlan.tun_cls); + p_tun->vxlan.tun_cls = type; + type = qed_tunn_clss_to_fw_clss(p_src->l2_gre.tun_cls); + p_tun->l2_gre.tun_cls = type; + type = qed_tunn_clss_to_fw_clss(p_src->ip_gre.tun_cls); + p_tun->ip_gre.tun_cls = type; + type = qed_tunn_clss_to_fw_clss(p_src->l2_geneve.tun_cls); + 
p_tun->l2_geneve.tun_cls = type; + type = qed_tunn_clss_to_fw_clss(p_src->ip_geneve.tun_cls); + p_tun->ip_geneve.tun_cls = type; +} + +static void qed_set_tunn_ports(struct qed_tunnel_info *p_tun, + struct qed_tunnel_info *p_src) +{ + p_tun->geneve_port.b_update_port = p_src->geneve_port.b_update_port; + p_tun->vxlan_port.b_update_port = p_src->vxlan_port.b_update_port; + + if (p_src->geneve_port.b_update_port) + p_tun->geneve_port.port = p_src->geneve_port.port; + + if (p_src->vxlan_port.b_update_port) + p_tun->vxlan_port.port = p_src->vxlan_port.port; +} + +static void +__qed_set_ramrod_tunnel_param(u8 *p_tunn_cls, + struct qed_tunn_update_type *tun_type) +{ + *p_tunn_cls = tun_type->tun_cls; +} + +static void +qed_set_ramrod_tunnel_param(u8 *p_tunn_cls, + struct qed_tunn_update_type *tun_type, + u8 *p_update_port, + __le16 *p_port, + struct qed_tunn_update_udp_port *p_udp_port) +{ + __qed_set_ramrod_tunnel_param(p_tunn_cls, tun_type); + if (p_udp_port->b_update_port) { + *p_update_port = 1; + *p_port = cpu_to_le16(p_udp_port->port); + } +} + +static void +qed_tunn_set_pf_update_params(struct qed_hwfn *p_hwfn, + struct qed_tunnel_info *p_src, + struct pf_update_tunnel_config *p_tunn_cfg) +{ + struct qed_tunnel_info *p_tun = &p_hwfn->cdev->tunnel; + + qed_set_pf_update_tunn_mode(p_tun, p_src, false); + qed_set_tunn_cls_info(p_tun, p_src); + qed_set_tunn_ports(p_tun, p_src); + + qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_vxlan, + &p_tun->vxlan, + &p_tunn_cfg->set_vxlan_udp_port_flg, + &p_tunn_cfg->vxlan_udp_port, + &p_tun->vxlan_port); + + qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_l2geneve, + &p_tun->l2_geneve, + &p_tunn_cfg->set_geneve_udp_port_flg, + &p_tunn_cfg->geneve_udp_port, + &p_tun->geneve_port); + + __qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_ipgeneve, + &p_tun->ip_geneve); + + __qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_l2gre, + &p_tun->l2_gre); + + __qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_ipgre, + &p_tun->ip_gre); + + p_tunn_cfg->update_rx_pf_clss = p_tun->b_update_rx_cls; +} + +static void qed_set_hw_tunn_mode(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_tunnel_info *p_tun) +{ + qed_set_gre_enable(p_hwfn, p_ptt, p_tun->l2_gre.b_mode_enabled, + p_tun->ip_gre.b_mode_enabled); + qed_set_vxlan_enable(p_hwfn, p_ptt, p_tun->vxlan.b_mode_enabled); + + qed_set_geneve_enable(p_hwfn, p_ptt, p_tun->l2_geneve.b_mode_enabled, + p_tun->ip_geneve.b_mode_enabled); +} + +static void qed_set_hw_tunn_mode_port(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_tunnel_info *p_tunn) +{ + if (p_tunn->vxlan_port.b_update_port) + qed_set_vxlan_dest_port(p_hwfn, p_ptt, + p_tunn->vxlan_port.port); + + if (p_tunn->geneve_port.b_update_port) + qed_set_geneve_dest_port(p_hwfn, p_ptt, + p_tunn->geneve_port.port); + + qed_set_hw_tunn_mode(p_hwfn, p_ptt, p_tunn); +} + +static void +qed_tunn_set_pf_start_params(struct qed_hwfn *p_hwfn, + struct qed_tunnel_info *p_src, + struct pf_start_tunnel_config *p_tunn_cfg) +{ + struct qed_tunnel_info *p_tun = &p_hwfn->cdev->tunnel; + + if (!p_src) + return; + + qed_set_pf_update_tunn_mode(p_tun, p_src, true); + qed_set_tunn_cls_info(p_tun, p_src); + qed_set_tunn_ports(p_tun, p_src); + + qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_vxlan, + &p_tun->vxlan, + &p_tunn_cfg->set_vxlan_udp_port_flg, + &p_tunn_cfg->vxlan_udp_port, + &p_tun->vxlan_port); + + qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_l2geneve, + &p_tun->l2_geneve, + &p_tunn_cfg->set_geneve_udp_port_flg, + 
&p_tunn_cfg->geneve_udp_port, + &p_tun->geneve_port); + + __qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_ipgeneve, + &p_tun->ip_geneve); + + __qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_l2gre, + &p_tun->l2_gre); + + __qed_set_ramrod_tunnel_param(&p_tunn_cfg->tunnel_clss_ipgre, + &p_tun->ip_gre); +} + +int qed_sp_pf_start(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_tunnel_info *p_tunn, + enum qed_mf_mode mode, bool allow_npar_tx_switch) +{ + struct pf_start_ramrod_data *p_ramrod = NULL; + u16 sb = qed_int_get_sp_sb_id(p_hwfn); + u8 sb_index = p_hwfn->p_eq->eq_sb_index; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + u8 page_cnt; + + /* update initial eq producer */ + qed_eq_prod_update(p_hwfn, + qed_chain_get_prod_idx(&p_hwfn->p_eq->chain)); + + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + COMMON_RAMROD_PF_START, + PROTOCOLID_COMMON, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.pf_start; + + p_ramrod->event_ring_sb_id = cpu_to_le16(sb); + p_ramrod->event_ring_sb_index = sb_index; + p_ramrod->path_id = QED_PATH_ID(p_hwfn); + p_ramrod->dont_log_ramrods = 0; + p_ramrod->log_type_mask = cpu_to_le16(0xf); + + switch (mode) { + case QED_MF_DEFAULT: + case QED_MF_NPAR: + p_ramrod->mf_mode = MF_NPAR; + break; + case QED_MF_OVLAN: + p_ramrod->mf_mode = MF_OVLAN; + break; + default: + DP_NOTICE(p_hwfn, "Unsupported MF mode, init as DEFAULT\n"); + p_ramrod->mf_mode = MF_NPAR; + } + + p_ramrod->outer_tag_config.outer_tag.tci = + cpu_to_le16(p_hwfn->hw_info.ovlan); + + /* Place EQ address in RAMROD */ + DMA_REGPAIR_LE(p_ramrod->event_ring_pbl_addr, + p_hwfn->p_eq->chain.pbl_sp.p_phys_table); + page_cnt = (u8)qed_chain_get_page_cnt(&p_hwfn->p_eq->chain); + p_ramrod->event_ring_num_pages = page_cnt; + DMA_REGPAIR_LE(p_ramrod->consolid_q_pbl_addr, + p_hwfn->p_consq->chain.pbl_sp.p_phys_table); + + qed_tunn_set_pf_start_params(p_hwfn, p_tunn, &p_ramrod->tunnel_config); + + if (IS_MF_SI(p_hwfn)) + p_ramrod->allow_npar_tx_switching = allow_npar_tx_switch; + + switch (p_hwfn->hw_info.personality) { + case QED_PCI_ETH: + p_ramrod->personality = PERSONALITY_ETH; + break; + case QED_PCI_FCOE: + p_ramrod->personality = PERSONALITY_FCOE; + break; + case QED_PCI_ISCSI: + p_ramrod->personality = PERSONALITY_ISCSI; + break; + case QED_PCI_ETH_ROCE: + case QED_PCI_ETH_IWARP: + p_ramrod->personality = PERSONALITY_RDMA_AND_ETH; + break; + default: + DP_NOTICE(p_hwfn, "Unknown personality %d\n", + p_hwfn->hw_info.personality); + p_ramrod->personality = PERSONALITY_ETH; + } + + if (p_hwfn->cdev->p_iov_info) { + struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info; + + p_ramrod->base_vf_id = (u8) p_iov->first_vf_in_pf; + p_ramrod->num_vfs = (u8) p_iov->total_vfs; + } + p_ramrod->hsi_fp_ver.major_ver_arr[ETH_VER_KEY] = ETH_HSI_VER_MAJOR; + p_ramrod->hsi_fp_ver.minor_ver_arr[ETH_VER_KEY] = ETH_HSI_VER_MINOR; + + DP_VERBOSE(p_hwfn, QED_MSG_SPQ, + "Setting event_ring_sb [id %04x index %02x], outer_tag.tci [%d]\n", + sb, sb_index, p_ramrod->outer_tag_config.outer_tag.tci); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + + if (p_tunn) + qed_set_hw_tunn_mode_port(p_hwfn, p_ptt, + &p_hwfn->cdev->tunnel); + + return rc; +} + +int qed_sp_pf_update(struct qed_hwfn *p_hwfn) +{ + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data 
init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_CB; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + COMMON_RAMROD_PF_UPDATE, PROTOCOLID_COMMON, + &init_data); + if (rc) + return rc; + + qed_dcbx_set_pf_update_params(&p_hwfn->p_dcbx_info->results, + &p_ent->ramrod.pf_update); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +/* Set pf update ramrod command params */ +int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_tunnel_info *p_tunn, + enum spq_mode comp_mode, + struct qed_spq_comp_cb *p_comp_data) +{ + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + if (IS_VF(p_hwfn->cdev)) + return qed_vf_pf_tunnel_param_update(p_hwfn, p_tunn); + + if (!p_tunn) + return -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = comp_mode; + init_data.p_comp_data = p_comp_data; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + COMMON_RAMROD_PF_UPDATE, PROTOCOLID_COMMON, + &init_data); + if (rc) + return rc; + + qed_tunn_set_pf_update_params(p_hwfn, p_tunn, + &p_ent->ramrod.pf_update.tunnel_config); + + rc = qed_spq_post(p_hwfn, p_ent, NULL); + if (rc) + return rc; + + qed_set_hw_tunn_mode_port(p_hwfn, p_ptt, &p_hwfn->cdev->tunnel); + + return rc; +} + +int qed_sp_pf_stop(struct qed_hwfn *p_hwfn) +{ + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + COMMON_RAMROD_PF_STOP, PROTOCOLID_COMMON, + &init_data); + if (rc) + return rc; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +int qed_sp_heartbeat_ramrod(struct qed_hwfn *p_hwfn) +{ + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + COMMON_RAMROD_EMPTY, PROTOCOLID_COMMON, + &init_data); + if (rc) + return rc; + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +int qed_sp_pf_update_stag(struct qed_hwfn *p_hwfn) +{ + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_CB; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + COMMON_RAMROD_PF_UPDATE, PROTOCOLID_COMMON, + &init_data); + if (rc) + return rc; + + p_ent->ramrod.pf_update.update_mf_vlan_flag = true; + p_ent->ramrod.pf_update.mf_vlan = cpu_to_le16(p_hwfn->hw_info.ovlan); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c new file mode 100644 index 0000000..1673fc9 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -0,0 +1,991 @@ +/* QLogic qed NIC Driver + * Copyright (c) 
2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qed.h" +#include "qed_cxt.h" +#include "qed_dev_api.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_int.h" +#include "qed_iscsi.h" +#include "qed_mcp.h" +#include "qed_ooo.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" +#include "qed_sriov.h" +#include "qed_rdma.h" + +/*************************************************************************** +* Structures & Definitions +***************************************************************************/ + +#define SPQ_HIGH_PRI_RESERVE_DEFAULT (1) + +#define SPQ_BLOCK_DELAY_MAX_ITER (10) +#define SPQ_BLOCK_DELAY_US (10) +#define SPQ_BLOCK_SLEEP_MAX_ITER (1000) +#define SPQ_BLOCK_SLEEP_MS (5) + +/*************************************************************************** +* Blocking Imp. (BLOCK/EBLOCK mode) +***************************************************************************/ +static void qed_spq_blocking_cb(struct qed_hwfn *p_hwfn, + void *cookie, + union event_ring_data *data, u8 fw_return_code) +{ + struct qed_spq_comp_done *comp_done; + + comp_done = (struct qed_spq_comp_done *)cookie; + + comp_done->fw_return_code = fw_return_code; + + /* Make sure completion done is visible on waiting thread */ + smp_store_release(&comp_done->done, 0x1); +} + +static int __qed_spq_block(struct qed_hwfn *p_hwfn, + struct qed_spq_entry *p_ent, + u8 *p_fw_ret, bool sleep_between_iter) +{ + struct qed_spq_comp_done *comp_done; + u32 iter_cnt; + + comp_done = (struct qed_spq_comp_done *)p_ent->comp_cb.cookie; + iter_cnt = sleep_between_iter ? 
SPQ_BLOCK_SLEEP_MAX_ITER + : SPQ_BLOCK_DELAY_MAX_ITER; + + while (iter_cnt--) { + /* Validate we receive completion update */ + if (smp_load_acquire(&comp_done->done) == 1) { /* ^^^ */ + if (p_fw_ret) + *p_fw_ret = comp_done->fw_return_code; + return 0; + } + + if (sleep_between_iter) + msleep(SPQ_BLOCK_SLEEP_MS); + else + udelay(SPQ_BLOCK_DELAY_US); + } + + return -EBUSY; +} + +static int qed_spq_block(struct qed_hwfn *p_hwfn, + struct qed_spq_entry *p_ent, + u8 *p_fw_ret, bool skip_quick_poll) +{ + struct qed_spq_comp_done *comp_done; + struct qed_ptt *p_ptt; + int rc; + + /* A relatively short polling period w/o sleeping, to allow the FW to + * complete the ramrod and thus possibly to avoid the following sleeps. + */ + if (!skip_quick_poll) { + rc = __qed_spq_block(p_hwfn, p_ent, p_fw_ret, false); + if (!rc) + return 0; + } + + /* Move to polling with a sleeping period between iterations */ + rc = __qed_spq_block(p_hwfn, p_ent, p_fw_ret, true); + if (!rc) + return 0; + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) { + DP_NOTICE(p_hwfn, "ptt, failed to acquire\n"); + return -EAGAIN; + } + + DP_INFO(p_hwfn, "Ramrod is stuck, requesting MCP drain\n"); + rc = qed_mcp_drain(p_hwfn, p_ptt); + if (rc) { + DP_NOTICE(p_hwfn, "MCP drain failed\n"); + goto err; + } + + /* Retry after drain */ + rc = __qed_spq_block(p_hwfn, p_ent, p_fw_ret, true); + if (!rc) + goto out; + + comp_done = (struct qed_spq_comp_done *)p_ent->comp_cb.cookie; + if (comp_done->done == 1) + if (p_fw_ret) + *p_fw_ret = comp_done->fw_return_code; +out: + qed_ptt_release(p_hwfn, p_ptt); + return 0; + +err: + qed_ptt_release(p_hwfn, p_ptt); + DP_NOTICE(p_hwfn, + "Ramrod is stuck [CID %08x cmd %02x protocol %02x echo %04x]\n", + le32_to_cpu(p_ent->elem.hdr.cid), + p_ent->elem.hdr.cmd_id, + p_ent->elem.hdr.protocol_id, + le16_to_cpu(p_ent->elem.hdr.echo)); + + return -EBUSY; +} + +/*************************************************************************** +* SPQ entries inner API +***************************************************************************/ +static int qed_spq_fill_entry(struct qed_hwfn *p_hwfn, + struct qed_spq_entry *p_ent) +{ + p_ent->flags = 0; + + switch (p_ent->comp_mode) { + case QED_SPQ_MODE_EBLOCK: + case QED_SPQ_MODE_BLOCK: + p_ent->comp_cb.function = qed_spq_blocking_cb; + break; + case QED_SPQ_MODE_CB: + break; + default: + DP_NOTICE(p_hwfn, "Unknown SPQE completion mode %d\n", + p_ent->comp_mode); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, QED_MSG_SPQ, + "Ramrod header: [CID 0x%08x CMD 0x%02x protocol 0x%02x] Data pointer: [%08x:%08x] Completion Mode: %s\n", + p_ent->elem.hdr.cid, + p_ent->elem.hdr.cmd_id, + p_ent->elem.hdr.protocol_id, + p_ent->elem.data_ptr.hi, + p_ent->elem.data_ptr.lo, + D_TRINE(p_ent->comp_mode, QED_SPQ_MODE_EBLOCK, + QED_SPQ_MODE_BLOCK, "MODE_EBLOCK", "MODE_BLOCK", + "MODE_CB")); + + return 0; +} + +/*************************************************************************** +* HSI access +***************************************************************************/ +static void qed_spq_hw_initialize(struct qed_hwfn *p_hwfn, + struct qed_spq *p_spq) +{ + struct e4_core_conn_context *p_cxt; + struct qed_cxt_info cxt_info; + u16 physical_q; + int rc; + + cxt_info.iid = p_spq->cid; + + rc = qed_cxt_get_cid_info(p_hwfn, &cxt_info); + + if (rc < 0) { + DP_NOTICE(p_hwfn, "Cannot find context info for cid=%d\n", + p_spq->cid); + return; + } + + p_cxt = cxt_info.p_cxt; + + SET_FIELD(p_cxt->xstorm_ag_context.flags10, + E4_XSTORM_CORE_CONN_AG_CTX_DQ_CF_EN, 1); + 
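/*
 * For a sense of scale, the SPQ_BLOCK_* constants above bound how long an
 * EBLOCK-mode ramrod is waited on: the quick phase busy-polls
 * SPQ_BLOCK_DELAY_MAX_ITER * SPQ_BLOCK_DELAY_US = 10 * 10 us = 100 us, each
 * sleeping phase polls up to SPQ_BLOCK_SLEEP_MAX_ITER * SPQ_BLOCK_SLEEP_MS =
 * 1000 * 5 ms = 5 s, and qed_spq_block() runs one more sleeping phase after
 * requesting an MCP drain, so a ramrod is only reported as stuck after
 * roughly ten seconds of waiting.
 */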
SET_FIELD(p_cxt->xstorm_ag_context.flags1, + E4_XSTORM_CORE_CONN_AG_CTX_DQ_CF_ACTIVE, 1); + SET_FIELD(p_cxt->xstorm_ag_context.flags9, + E4_XSTORM_CORE_CONN_AG_CTX_CONSOLID_PROD_CF_EN, 1); + + /* QM physical queue */ + physical_q = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_LB); + p_cxt->xstorm_ag_context.physical_q0 = cpu_to_le16(physical_q); + + p_cxt->xstorm_st_context.spq_base_lo = + DMA_LO_LE(p_spq->chain.p_phys_addr); + p_cxt->xstorm_st_context.spq_base_hi = + DMA_HI_LE(p_spq->chain.p_phys_addr); + + DMA_REGPAIR_LE(p_cxt->xstorm_st_context.consolid_base_addr, + p_hwfn->p_consq->chain.p_phys_addr); +} + +static int qed_spq_hw_post(struct qed_hwfn *p_hwfn, + struct qed_spq *p_spq, struct qed_spq_entry *p_ent) +{ + struct qed_chain *p_chain = &p_hwfn->p_spq->chain; + u16 echo = qed_chain_get_prod_idx(p_chain); + struct slow_path_element *elem; + struct core_db_data db; + + p_ent->elem.hdr.echo = cpu_to_le16(echo); + elem = qed_chain_produce(p_chain); + if (!elem) { + DP_NOTICE(p_hwfn, "Failed to produce from SPQ chain\n"); + return -EINVAL; + } + + *elem = p_ent->elem; /* struct assignment */ + + /* send a doorbell on the slow hwfn session */ + memset(&db, 0, sizeof(db)); + SET_FIELD(db.params, CORE_DB_DATA_DEST, DB_DEST_XCM); + SET_FIELD(db.params, CORE_DB_DATA_AGG_CMD, DB_AGG_CMD_SET); + SET_FIELD(db.params, CORE_DB_DATA_AGG_VAL_SEL, + DQ_XCM_CORE_SPQ_PROD_CMD); + db.agg_flags = DQ_XCM_CORE_DQ_CF_CMD; + db.spq_prod = cpu_to_le16(qed_chain_get_prod_idx(p_chain)); + + /* make sure the SPQE is updated before the doorbell */ + wmb(); + + DOORBELL(p_hwfn, qed_db_addr(p_spq->cid, DQ_DEMS_LEGACY), *(u32 *)&db); + + /* make sure doorbell is rang */ + wmb(); + + DP_VERBOSE(p_hwfn, QED_MSG_SPQ, + "Doorbelled [0x%08x, CID 0x%08x] with Flags: %02x agg_params: %02x, prod: %04x\n", + qed_db_addr(p_spq->cid, DQ_DEMS_LEGACY), + p_spq->cid, db.params, db.agg_flags, + qed_chain_get_prod_idx(p_chain)); + + return 0; +} + +/*************************************************************************** +* Asynchronous events +***************************************************************************/ +static int +qed_async_event_completion(struct qed_hwfn *p_hwfn, + struct event_ring_entry *p_eqe) +{ + qed_spq_async_comp_cb cb; + + if (!p_hwfn->p_spq || (p_eqe->protocol_id >= MAX_PROTOCOL_TYPE)) + return -EINVAL; + + cb = p_hwfn->p_spq->async_comp_cb[p_eqe->protocol_id]; + if (cb) { + return cb(p_hwfn, p_eqe->opcode, p_eqe->echo, + &p_eqe->data, p_eqe->fw_return_code); + } else { + DP_NOTICE(p_hwfn, + "Unknown Async completion for protocol: %d\n", + p_eqe->protocol_id); + return -EINVAL; + } +} + +int +qed_spq_register_async_cb(struct qed_hwfn *p_hwfn, + enum protocol_type protocol_id, + qed_spq_async_comp_cb cb) +{ + if (!p_hwfn->p_spq || (protocol_id >= MAX_PROTOCOL_TYPE)) + return -EINVAL; + + p_hwfn->p_spq->async_comp_cb[protocol_id] = cb; + return 0; +} + +void +qed_spq_unregister_async_cb(struct qed_hwfn *p_hwfn, + enum protocol_type protocol_id) +{ + if (!p_hwfn->p_spq || (protocol_id >= MAX_PROTOCOL_TYPE)) + return; + + p_hwfn->p_spq->async_comp_cb[protocol_id] = NULL; +} + +/*************************************************************************** +* EQ API +***************************************************************************/ +void qed_eq_prod_update(struct qed_hwfn *p_hwfn, u16 prod) +{ + u32 addr = GTT_BAR0_MAP_REG_USDM_RAM + + USTORM_EQE_CONS_OFFSET(p_hwfn->rel_pf_id); + + REG_WR16(p_hwfn, addr, prod); + + /* keep prod updates ordered */ + mmiowb(); +} + +int qed_eq_completion(struct qed_hwfn 
*p_hwfn, void *cookie) +{ + struct qed_eq *p_eq = cookie; + struct qed_chain *p_chain = &p_eq->chain; + int rc = 0; + + /* take a snapshot of the FW consumer */ + u16 fw_cons_idx = le16_to_cpu(*p_eq->p_fw_cons); + + DP_VERBOSE(p_hwfn, QED_MSG_SPQ, "fw_cons_idx %x\n", fw_cons_idx); + + /* Need to guarantee the fw_cons index we use points to a usuable + * element (to comply with our chain), so our macros would comply + */ + if ((fw_cons_idx & qed_chain_get_usable_per_page(p_chain)) == + qed_chain_get_usable_per_page(p_chain)) + fw_cons_idx += qed_chain_get_unusable_per_page(p_chain); + + /* Complete current segment of eq entries */ + while (fw_cons_idx != qed_chain_get_cons_idx(p_chain)) { + struct event_ring_entry *p_eqe = qed_chain_consume(p_chain); + + if (!p_eqe) { + rc = -EINVAL; + break; + } + + DP_VERBOSE(p_hwfn, QED_MSG_SPQ, + "op %x prot %x res0 %x echo %x fwret %x flags %x\n", + p_eqe->opcode, + p_eqe->protocol_id, + p_eqe->reserved0, + le16_to_cpu(p_eqe->echo), + p_eqe->fw_return_code, + p_eqe->flags); + + if (GET_FIELD(p_eqe->flags, EVENT_RING_ENTRY_ASYNC)) { + if (qed_async_event_completion(p_hwfn, p_eqe)) + rc = -EINVAL; + } else if (qed_spq_completion(p_hwfn, + p_eqe->echo, + p_eqe->fw_return_code, + &p_eqe->data)) { + rc = -EINVAL; + } + + qed_chain_recycle_consumed(p_chain); + } + + qed_eq_prod_update(p_hwfn, qed_chain_get_prod_idx(p_chain)); + + return rc; +} + +int qed_eq_alloc(struct qed_hwfn *p_hwfn, u16 num_elem) +{ + struct qed_eq *p_eq; + + /* Allocate EQ struct */ + p_eq = kzalloc(sizeof(*p_eq), GFP_KERNEL); + if (!p_eq) + return -ENOMEM; + + /* Allocate and initialize EQ chain*/ + if (qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + num_elem, + sizeof(union event_ring_element), + &p_eq->chain, NULL)) + goto eq_allocate_fail; + + /* register EQ completion on the SP SB */ + qed_int_register_cb(p_hwfn, qed_eq_completion, + p_eq, &p_eq->eq_sb_index, &p_eq->p_fw_cons); + + p_hwfn->p_eq = p_eq; + return 0; + +eq_allocate_fail: + kfree(p_eq); + return -ENOMEM; +} + +void qed_eq_setup(struct qed_hwfn *p_hwfn) +{ + qed_chain_reset(&p_hwfn->p_eq->chain); +} + +void qed_eq_free(struct qed_hwfn *p_hwfn) +{ + if (!p_hwfn->p_eq) + return; + + qed_chain_free(p_hwfn->cdev, &p_hwfn->p_eq->chain); + + kfree(p_hwfn->p_eq); + p_hwfn->p_eq = NULL; +} + +/*************************************************************************** +* CQE API - manipulate EQ functionality +***************************************************************************/ +static int qed_cqe_completion(struct qed_hwfn *p_hwfn, + struct eth_slow_path_rx_cqe *cqe, + enum protocol_type protocol) +{ + if (IS_VF(p_hwfn->cdev)) + return 0; + + /* @@@tmp - it's possible we'll eventually want to handle some + * actual commands that can arrive here, but for now this is only + * used to complete the ramrod using the echo value on the cqe + */ + return qed_spq_completion(p_hwfn, cqe->echo, 0, NULL); +} + +int qed_eth_cqe_completion(struct qed_hwfn *p_hwfn, + struct eth_slow_path_rx_cqe *cqe) +{ + int rc; + + rc = qed_cqe_completion(p_hwfn, cqe, PROTOCOLID_ETH); + if (rc) + DP_NOTICE(p_hwfn, + "Failed to handle RXQ CQE [cmd 0x%02x]\n", + cqe->ramrod_cmd_id); + + return rc; +} + +/*************************************************************************** +* Slow hwfn Queue (spq) +***************************************************************************/ +void qed_spq_setup(struct qed_hwfn *p_hwfn) +{ + struct qed_spq *p_spq = p_hwfn->p_spq; + struct qed_spq_entry 
*p_virt = NULL; + dma_addr_t p_phys = 0; + u32 i, capacity; + + INIT_LIST_HEAD(&p_spq->pending); + INIT_LIST_HEAD(&p_spq->completion_pending); + INIT_LIST_HEAD(&p_spq->free_pool); + INIT_LIST_HEAD(&p_spq->unlimited_pending); + spin_lock_init(&p_spq->lock); + + /* SPQ empty pool */ + p_phys = p_spq->p_phys + offsetof(struct qed_spq_entry, ramrod); + p_virt = p_spq->p_virt; + + capacity = qed_chain_get_capacity(&p_spq->chain); + for (i = 0; i < capacity; i++) { + DMA_REGPAIR_LE(p_virt->elem.data_ptr, p_phys); + + list_add_tail(&p_virt->list, &p_spq->free_pool); + + p_virt++; + p_phys += sizeof(struct qed_spq_entry); + } + + /* Statistics */ + p_spq->normal_count = 0; + p_spq->comp_count = 0; + p_spq->comp_sent_count = 0; + p_spq->unlimited_pending_count = 0; + + bitmap_zero(p_spq->p_comp_bitmap, SPQ_RING_SIZE); + p_spq->comp_bitmap_idx = 0; + + /* SPQ cid, cannot fail */ + qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_CORE, &p_spq->cid); + qed_spq_hw_initialize(p_hwfn, p_spq); + + /* reset the chain itself */ + qed_chain_reset(&p_spq->chain); +} + +int qed_spq_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_spq_entry *p_virt = NULL; + struct qed_spq *p_spq = NULL; + dma_addr_t p_phys = 0; + u32 capacity; + + /* SPQ struct */ + p_spq = kzalloc(sizeof(struct qed_spq), GFP_KERNEL); + if (!p_spq) + return -ENOMEM; + + /* SPQ ring */ + if (qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_PRODUCE, + QED_CHAIN_MODE_SINGLE, + QED_CHAIN_CNT_TYPE_U16, + 0, /* N/A when the mode is SINGLE */ + sizeof(struct slow_path_element), + &p_spq->chain, NULL)) + goto spq_allocate_fail; + + /* allocate and fill the SPQ elements (incl. ramrod data list) */ + capacity = qed_chain_get_capacity(&p_spq->chain); + p_virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + capacity * sizeof(struct qed_spq_entry), + &p_phys, GFP_KERNEL); + if (!p_virt) + goto spq_allocate_fail; + + p_spq->p_virt = p_virt; + p_spq->p_phys = p_phys; + p_hwfn->p_spq = p_spq; + + return 0; + +spq_allocate_fail: + qed_chain_free(p_hwfn->cdev, &p_spq->chain); + kfree(p_spq); + return -ENOMEM; +} + +void qed_spq_free(struct qed_hwfn *p_hwfn) +{ + struct qed_spq *p_spq = p_hwfn->p_spq; + u32 capacity; + + if (!p_spq) + return; + + if (p_spq->p_virt) { + capacity = qed_chain_get_capacity(&p_spq->chain); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + capacity * + sizeof(struct qed_spq_entry), + p_spq->p_virt, p_spq->p_phys); + } + + qed_chain_free(p_hwfn->cdev, &p_spq->chain); + kfree(p_spq); + p_hwfn->p_spq = NULL; +} + +int qed_spq_get_entry(struct qed_hwfn *p_hwfn, struct qed_spq_entry **pp_ent) +{ + struct qed_spq *p_spq = p_hwfn->p_spq; + struct qed_spq_entry *p_ent = NULL; + int rc = 0; + + spin_lock_bh(&p_spq->lock); + + if (list_empty(&p_spq->free_pool)) { + p_ent = kzalloc(sizeof(*p_ent), GFP_ATOMIC); + if (!p_ent) { + DP_NOTICE(p_hwfn, + "Failed to allocate an SPQ entry for a pending ramrod\n"); + rc = -ENOMEM; + goto out_unlock; + } + p_ent->queue = &p_spq->unlimited_pending; + } else { + p_ent = list_first_entry(&p_spq->free_pool, + struct qed_spq_entry, list); + list_del(&p_ent->list); + p_ent->queue = &p_spq->pending; + } + + *pp_ent = p_ent; + +out_unlock: + spin_unlock_bh(&p_spq->lock); + return rc; +} + +/* Locked variant; Should be called while the SPQ lock is taken */ +static void __qed_spq_return_entry(struct qed_hwfn *p_hwfn, + struct qed_spq_entry *p_ent) +{ + list_add_tail(&p_ent->list, &p_hwfn->p_spq->free_pool); +} + +void qed_spq_return_entry(struct qed_hwfn *p_hwfn, struct qed_spq_entry *p_ent) +{ + 
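/*
 * The pool setup above works because a single dma_alloc_coherent() call backs
 * every SPQ entry, so the DMA address of any entry's ramrod payload is just
 * the base physical address plus that entry's byte offset.  A generic
 * stand-alone sketch of the same addressing follows; the structure and field
 * names are hypothetical and the payload layout is chosen only for
 * illustration.
 */
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/types.h>

struct example_entry {
	struct list_head list;	/* driver-only bookkeeping, never seen by HW */
	u64 payload[8];		/* the part the device actually reads */
};

struct example_pool {
	struct example_entry *virt;	/* from dma_alloc_coherent() */
	dma_addr_t phys;		/* matching bus address */
};

/* DMA address of entry @i's payload within the one coherent allocation */
static dma_addr_t example_payload_dma(const struct example_pool *pool, u32 i)
{
	return pool->phys + i * sizeof(struct example_entry) +
	       offsetof(struct example_entry, payload);
}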
spin_lock_bh(&p_hwfn->p_spq->lock); + __qed_spq_return_entry(p_hwfn, p_ent); + spin_unlock_bh(&p_hwfn->p_spq->lock); +} + +/** + * @brief qed_spq_add_entry - adds a new entry to the pending + * list. Should be used while lock is being held. + * + * Addes an entry to the pending list is there is room (en empty + * element is available in the free_pool), or else places the + * entry in the unlimited_pending pool. + * + * @param p_hwfn + * @param p_ent + * @param priority + * + * @return int + */ +static int qed_spq_add_entry(struct qed_hwfn *p_hwfn, + struct qed_spq_entry *p_ent, + enum spq_priority priority) +{ + struct qed_spq *p_spq = p_hwfn->p_spq; + + if (p_ent->queue == &p_spq->unlimited_pending) { + + if (list_empty(&p_spq->free_pool)) { + list_add_tail(&p_ent->list, &p_spq->unlimited_pending); + p_spq->unlimited_pending_count++; + + return 0; + } else { + struct qed_spq_entry *p_en2; + + p_en2 = list_first_entry(&p_spq->free_pool, + struct qed_spq_entry, list); + list_del(&p_en2->list); + + /* Copy the ring element physical pointer to the new + * entry, since we are about to override the entire ring + * entry and don't want to lose the pointer. + */ + p_ent->elem.data_ptr = p_en2->elem.data_ptr; + + *p_en2 = *p_ent; + + /* EBLOCK responsible to free the allocated p_ent */ + if (p_ent->comp_mode != QED_SPQ_MODE_EBLOCK) + kfree(p_ent); + + p_ent = p_en2; + } + } + + /* entry is to be placed in 'pending' queue */ + switch (priority) { + case QED_SPQ_PRIORITY_NORMAL: + list_add_tail(&p_ent->list, &p_spq->pending); + p_spq->normal_count++; + break; + case QED_SPQ_PRIORITY_HIGH: + list_add(&p_ent->list, &p_spq->pending); + p_spq->high_count++; + break; + default: + return -EINVAL; + } + + return 0; +} + +/*************************************************************************** +* Accessor +***************************************************************************/ +u32 qed_spq_get_cid(struct qed_hwfn *p_hwfn) +{ + if (!p_hwfn->p_spq) + return 0xffffffff; /* illegal */ + return p_hwfn->p_spq->cid; +} + +/*************************************************************************** +* Posting new Ramrods +***************************************************************************/ +static int qed_spq_post_list(struct qed_hwfn *p_hwfn, + struct list_head *head, u32 keep_reserve) +{ + struct qed_spq *p_spq = p_hwfn->p_spq; + int rc; + + while (qed_chain_get_elem_left(&p_spq->chain) > keep_reserve && + !list_empty(head)) { + struct qed_spq_entry *p_ent = + list_first_entry(head, struct qed_spq_entry, list); + list_del(&p_ent->list); + list_add_tail(&p_ent->list, &p_spq->completion_pending); + p_spq->comp_sent_count++; + + rc = qed_spq_hw_post(p_hwfn, p_spq, p_ent); + if (rc) { + list_del(&p_ent->list); + __qed_spq_return_entry(p_hwfn, p_ent); + return rc; + } + } + + return 0; +} + +static int qed_spq_pend_post(struct qed_hwfn *p_hwfn) +{ + struct qed_spq *p_spq = p_hwfn->p_spq; + struct qed_spq_entry *p_ent = NULL; + + while (!list_empty(&p_spq->free_pool)) { + if (list_empty(&p_spq->unlimited_pending)) + break; + + p_ent = list_first_entry(&p_spq->unlimited_pending, + struct qed_spq_entry, list); + if (!p_ent) + return -EINVAL; + + list_del(&p_ent->list); + + qed_spq_add_entry(p_hwfn, p_ent, p_ent->priority); + } + + return qed_spq_post_list(p_hwfn, &p_spq->pending, + SPQ_HIGH_PRI_RESERVE_DEFAULT); +} + +int qed_spq_post(struct qed_hwfn *p_hwfn, + struct qed_spq_entry *p_ent, u8 *fw_return_code) +{ + int rc = 0; + struct qed_spq *p_spq = p_hwfn ? 
p_hwfn->p_spq : NULL; + bool b_ret_ent = true; + bool eblock; + + if (!p_hwfn) + return -EINVAL; + + if (!p_ent) { + DP_NOTICE(p_hwfn, "Got a NULL pointer\n"); + return -EINVAL; + } + + /* Complete the entry */ + rc = qed_spq_fill_entry(p_hwfn, p_ent); + + spin_lock_bh(&p_spq->lock); + + /* Check return value after LOCK is taken for cleaner error flow */ + if (rc) + goto spq_post_fail; + + /* Check if entry is in block mode before qed_spq_add_entry, + * which might kfree p_ent. + */ + eblock = (p_ent->comp_mode == QED_SPQ_MODE_EBLOCK); + + /* Add the request to the pending queue */ + rc = qed_spq_add_entry(p_hwfn, p_ent, p_ent->priority); + if (rc) + goto spq_post_fail; + + rc = qed_spq_pend_post(p_hwfn); + if (rc) { + /* Since it's possible that pending failed for a different + * entry [although unlikely], the failed entry was already + * dealt with; No need to return it here. + */ + b_ret_ent = false; + goto spq_post_fail; + } + + spin_unlock_bh(&p_spq->lock); + + if (eblock) { + /* For entries in QED BLOCK mode, the completion code cannot + * perform the necessary cleanup - if it did, we couldn't + * access p_ent here to see whether it's successful or not. + * Thus, after gaining the answer perform the cleanup here. + */ + rc = qed_spq_block(p_hwfn, p_ent, fw_return_code, + p_ent->queue == &p_spq->unlimited_pending); + + if (p_ent->queue == &p_spq->unlimited_pending) { + /* This is an allocated p_ent which does not need to + * return to pool. + */ + kfree(p_ent); + return rc; + } + + if (rc) + goto spq_post_fail2; + + /* return to pool */ + qed_spq_return_entry(p_hwfn, p_ent); + } + return rc; + +spq_post_fail2: + spin_lock_bh(&p_spq->lock); + list_del(&p_ent->list); + qed_chain_return_produced(&p_spq->chain); + +spq_post_fail: + /* return to the free pool */ + if (b_ret_ent) + __qed_spq_return_entry(p_hwfn, p_ent); + spin_unlock_bh(&p_spq->lock); + + return rc; +} + +int qed_spq_completion(struct qed_hwfn *p_hwfn, + __le16 echo, + u8 fw_return_code, + union event_ring_data *p_data) +{ + struct qed_spq *p_spq; + struct qed_spq_entry *p_ent = NULL; + struct qed_spq_entry *tmp; + struct qed_spq_entry *found = NULL; + int rc; + + if (!p_hwfn) + return -EINVAL; + + p_spq = p_hwfn->p_spq; + if (!p_spq) + return -EINVAL; + + spin_lock_bh(&p_spq->lock); + list_for_each_entry_safe(p_ent, tmp, &p_spq->completion_pending, list) { + if (p_ent->elem.hdr.echo == echo) { + u16 pos = le16_to_cpu(echo) % SPQ_RING_SIZE; + + list_del(&p_ent->list); + + /* Avoid overriding of SPQ entries when getting + * out-of-order completions, by marking the completions + * in a bitmap and increasing the chain consumer only + * for the first successive completed entries. + */ + __set_bit(pos, p_spq->p_comp_bitmap); + + while (test_bit(p_spq->comp_bitmap_idx, + p_spq->p_comp_bitmap)) { + __clear_bit(p_spq->comp_bitmap_idx, + p_spq->p_comp_bitmap); + p_spq->comp_bitmap_idx++; + qed_chain_return_produced(&p_spq->chain); + } + + p_spq->comp_count++; + found = p_ent; + break; + } + + /* This is relatively uncommon - depends on scenarios + * which have mutliple per-PF sent ramrods. + */ + DP_VERBOSE(p_hwfn, QED_MSG_SPQ, + "Got completion for echo %04x - doesn't match echo %04x in completion pending list\n", + le16_to_cpu(echo), + le16_to_cpu(p_ent->elem.hdr.echo)); + } + + /* Release lock before callback, as callback may post + * an additional ramrod. 
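/*
 * The bitmap handling in qed_spq_completion() above is a general technique
 * for releasing ring credits in order even when completions arrive out of
 * order: each completed slot is marked, and the consumer side only advances
 * across the contiguous run of marked slots.  A stand-alone sketch of the
 * idea follows; the names and ring size are hypothetical (the SPQ itself uses
 * SPQ_RING_SIZE and comp_bitmap_idx), and the structure is assumed to start
 * zeroed, e.g. from kzalloc().
 */
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/types.h>

#define EXAMPLE_RING_SIZE 64

struct example_ring {
	DECLARE_BITMAP(done, EXAMPLE_RING_SIZE);
	u16 next_to_release;	/* analogous to comp_bitmap_idx */
};

/*
 * Mark slot @idx complete and return how many ring credits may be released:
 * only the contiguous run of completed slots starting at next_to_release is
 * handed back, so an out-of-order completion never moves the consumer past
 * an entry the firmware has not finished yet.
 */
static unsigned int example_complete(struct example_ring *r, u16 idx)
{
	unsigned int released = 0;

	__set_bit(idx % EXAMPLE_RING_SIZE, r->done);

	while (test_bit(r->next_to_release % EXAMPLE_RING_SIZE, r->done)) {
		__clear_bit(r->next_to_release % EXAMPLE_RING_SIZE, r->done);
		r->next_to_release++;
		released++;	/* caller returns one produced chain element */
	}

	return released;
}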
+ */ + spin_unlock_bh(&p_spq->lock); + + if (!found) { + DP_NOTICE(p_hwfn, + "Failed to find an entry this EQE [echo %04x] completes\n", + le16_to_cpu(echo)); + return -EEXIST; + } + + DP_VERBOSE(p_hwfn, QED_MSG_SPQ, + "Complete EQE [echo %04x]: func %p cookie %p)\n", + le16_to_cpu(echo), + p_ent->comp_cb.function, p_ent->comp_cb.cookie); + if (found->comp_cb.function) + found->comp_cb.function(p_hwfn, found->comp_cb.cookie, p_data, + fw_return_code); + else + DP_VERBOSE(p_hwfn, + QED_MSG_SPQ, + "Got a completion without a callback function\n"); + + if ((found->comp_mode != QED_SPQ_MODE_EBLOCK) || + (found->queue == &p_spq->unlimited_pending)) + /* EBLOCK is responsible for returning its own entry into the + * free list, unless it originally added the entry into the + * unlimited pending list. + */ + qed_spq_return_entry(p_hwfn, found); + + /* Attempt to post pending requests */ + spin_lock_bh(&p_spq->lock); + rc = qed_spq_pend_post(p_hwfn); + spin_unlock_bh(&p_spq->lock); + + return rc; +} + +int qed_consq_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_consq *p_consq; + + /* Allocate ConsQ struct */ + p_consq = kzalloc(sizeof(*p_consq), GFP_KERNEL); + if (!p_consq) + return -ENOMEM; + + /* Allocate and initialize EQ chain*/ + if (qed_chain_alloc(p_hwfn->cdev, + QED_CHAIN_USE_TO_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + QED_CHAIN_PAGE_SIZE / 0x80, + 0x80, &p_consq->chain, NULL)) + goto consq_allocate_fail; + + p_hwfn->p_consq = p_consq; + return 0; + +consq_allocate_fail: + kfree(p_consq); + return -ENOMEM; +} + +void qed_consq_setup(struct qed_hwfn *p_hwfn) +{ + qed_chain_reset(&p_hwfn->p_consq->chain); +} + +void qed_consq_free(struct qed_hwfn *p_hwfn) +{ + if (!p_hwfn->p_consq) + return; + + qed_chain_free(p_hwfn->cdev, &p_hwfn->p_consq->chain); + + kfree(p_hwfn->p_consq); + p_hwfn->p_consq = NULL; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c new file mode 100644 index 0000000..5acb91b --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -0,0 +1,5026 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/etherdevice.h> +#include <linux/crc32.h> +#include <linux/vmalloc.h> +#include <linux/qed/qed_iov_if.h> +#include "qed_cxt.h" +#include "qed_hsi.h" +#include "qed_hw.h" +#include "qed_init_ops.h" +#include "qed_int.h" +#include "qed_mcp.h" +#include "qed_reg_addr.h" +#include "qed_sp.h" +#include "qed_sriov.h" +#include "qed_vf.h" +static int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn, + u8 opcode, + __le16 echo, + union event_ring_data *data, u8 fw_return_code); + + +static u8 qed_vf_calculate_legacy(struct qed_vf_info *p_vf) +{ + u8 legacy = 0; + + if (p_vf->acquire.vfdev_info.eth_fp_hsi_minor == + ETH_HSI_VER_NO_PKT_LEN_TUNN) + legacy |= QED_QCID_LEGACY_VF_RX_PROD; + + if (!(p_vf->acquire.vfdev_info.capabilities & + VFPF_ACQUIRE_CAP_QUEUE_QIDS)) + legacy |= QED_QCID_LEGACY_VF_CID; + + return legacy; +} + +/* IOV ramrods */ +static int qed_sp_vf_start(struct qed_hwfn *p_hwfn, struct qed_vf_info *p_vf) +{ + struct vf_start_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + u8 fp_minor; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = p_vf->opaque_fid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + COMMON_RAMROD_VF_START, + PROTOCOLID_COMMON, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.vf_start; + + p_ramrod->vf_id = GET_FIELD(p_vf->concrete_fid, PXP_CONCRETE_FID_VFID); + p_ramrod->opaque_fid = cpu_to_le16(p_vf->opaque_fid); + + switch (p_hwfn->hw_info.personality) { + case QED_PCI_ETH: + p_ramrod->personality = PERSONALITY_ETH; + break; + case QED_PCI_ETH_ROCE: + p_ramrod->personality = PERSONALITY_RDMA_AND_ETH; + break; + default: + DP_NOTICE(p_hwfn, "Unknown VF personality %d\n", + p_hwfn->hw_info.personality); + return -EINVAL; + } + + fp_minor = p_vf->acquire.vfdev_info.eth_fp_hsi_minor; + if (fp_minor > ETH_HSI_VER_MINOR && + fp_minor != ETH_HSI_VER_NO_PKT_LEN_TUNN) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF [%d] - Requested fp hsi %02x.%02x which is slightly newer than PF's %02x.%02x; Configuring PFs version\n", + p_vf->abs_vf_id, + ETH_HSI_VER_MAJOR, + fp_minor, ETH_HSI_VER_MAJOR, ETH_HSI_VER_MINOR); + fp_minor = ETH_HSI_VER_MINOR; + } + + p_ramrod->hsi_fp_ver.major_ver_arr[ETH_VER_KEY] = ETH_HSI_VER_MAJOR; + p_ramrod->hsi_fp_ver.minor_ver_arr[ETH_VER_KEY] = fp_minor; + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d] - Starting using HSI %02x.%02x\n", + p_vf->abs_vf_id, ETH_HSI_VER_MAJOR, fp_minor); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +static int qed_sp_vf_stop(struct qed_hwfn *p_hwfn, + u32 concrete_vfid, u16 opaque_vfid) +{ + struct vf_stop_ramrod_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + int rc = -EINVAL; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + init_data.opaque_fid = opaque_vfid; + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + + rc = qed_sp_init_request(p_hwfn, &p_ent, + COMMON_RAMROD_VF_STOP, + PROTOCOLID_COMMON, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.vf_stop; + + p_ramrod->vf_id = GET_FIELD(concrete_vfid, PXP_CONCRETE_FID_VFID); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + +bool qed_iov_is_valid_vfid(struct qed_hwfn *p_hwfn, + int rel_vf_id, + bool b_enabled_only, bool b_non_malicious) +{ + if (!p_hwfn->pf_iov_info) { + DP_NOTICE(p_hwfn->cdev, "No iov info\n"); + return false; + } + + if ((rel_vf_id >=
p_hwfn->cdev->p_iov_info->total_vfs) || + (rel_vf_id < 0)) + return false; + + if ((!p_hwfn->pf_iov_info->vfs_array[rel_vf_id].b_init) && + b_enabled_only) + return false; + + if ((p_hwfn->pf_iov_info->vfs_array[rel_vf_id].b_malicious) && + b_non_malicious) + return false; + + return true; +} + +static struct qed_vf_info *qed_iov_get_vf_info(struct qed_hwfn *p_hwfn, + u16 relative_vf_id, + bool b_enabled_only) +{ + struct qed_vf_info *vf = NULL; + + if (!p_hwfn->pf_iov_info) { + DP_NOTICE(p_hwfn->cdev, "No iov info\n"); + return NULL; + } + + if (qed_iov_is_valid_vfid(p_hwfn, relative_vf_id, + b_enabled_only, false)) + vf = &p_hwfn->pf_iov_info->vfs_array[relative_vf_id]; + else + DP_ERR(p_hwfn, "qed_iov_get_vf_info: VF[%d] is not enabled\n", + relative_vf_id); + + return vf; +} + +static struct qed_queue_cid * +qed_iov_get_vf_rx_queue_cid(struct qed_vf_queue *p_queue) +{ + int i; + + for (i = 0; i < MAX_QUEUES_PER_QZONE; i++) { + if (p_queue->cids[i].p_cid && !p_queue->cids[i].b_is_tx) + return p_queue->cids[i].p_cid; + } + + return NULL; +} + +enum qed_iov_validate_q_mode { + QED_IOV_VALIDATE_Q_NA, + QED_IOV_VALIDATE_Q_ENABLE, + QED_IOV_VALIDATE_Q_DISABLE, +}; + +static bool qed_iov_validate_queue_mode(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, + u16 qid, + enum qed_iov_validate_q_mode mode, + bool b_is_tx) +{ + int i; + + if (mode == QED_IOV_VALIDATE_Q_NA) + return true; + + for (i = 0; i < MAX_QUEUES_PER_QZONE; i++) { + struct qed_vf_queue_cid *p_qcid; + + p_qcid = &p_vf->vf_queues[qid].cids[i]; + + if (!p_qcid->p_cid) + continue; + + if (p_qcid->b_is_tx != b_is_tx) + continue; + + return mode == QED_IOV_VALIDATE_Q_ENABLE; + } + + /* In case we haven't found any valid cid, then its disabled */ + return mode == QED_IOV_VALIDATE_Q_DISABLE; +} + +static bool qed_iov_validate_rxq(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, + u16 rx_qid, + enum qed_iov_validate_q_mode mode) +{ + if (rx_qid >= p_vf->num_rxqs) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[0x%02x] - can't touch Rx queue[%04x]; Only 0x%04x are allocated\n", + p_vf->abs_vf_id, rx_qid, p_vf->num_rxqs); + return false; + } + + return qed_iov_validate_queue_mode(p_hwfn, p_vf, rx_qid, mode, false); +} + +static bool qed_iov_validate_txq(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, + u16 tx_qid, + enum qed_iov_validate_q_mode mode) +{ + if (tx_qid >= p_vf->num_txqs) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[0x%02x] - can't touch Tx queue[%04x]; Only 0x%04x are allocated\n", + p_vf->abs_vf_id, tx_qid, p_vf->num_txqs); + return false; + } + + return qed_iov_validate_queue_mode(p_hwfn, p_vf, tx_qid, mode, true); +} + +static bool qed_iov_validate_sb(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, u16 sb_idx) +{ + int i; + + for (i = 0; i < p_vf->num_sbs; i++) + if (p_vf->igu_sbs[i] == sb_idx) + return true; + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[0%02x] - tried using sb_idx %04x which doesn't exist as one of its 0x%02x SBs\n", + p_vf->abs_vf_id, sb_idx, p_vf->num_sbs); + + return false; +} + +static bool qed_iov_validate_active_rxq(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf) +{ + u8 i; + + for (i = 0; i < p_vf->num_rxqs; i++) + if (qed_iov_validate_queue_mode(p_hwfn, p_vf, i, + QED_IOV_VALIDATE_Q_ENABLE, + false)) + return true; + + return false; +} + +static bool qed_iov_validate_active_txq(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf) +{ + u8 i; + + for (i = 0; i < p_vf->num_txqs; i++) + if (qed_iov_validate_queue_mode(p_hwfn, p_vf, i, + QED_IOV_VALIDATE_Q_ENABLE, + true)) + return 
true; + + return false; +} + +static int qed_iov_post_vf_bulletin(struct qed_hwfn *p_hwfn, + int vfid, struct qed_ptt *p_ptt) +{ + struct qed_bulletin_content *p_bulletin; + int crc_size = sizeof(p_bulletin->crc); + struct qed_dmae_params params; + struct qed_vf_info *p_vf; + + p_vf = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true); + if (!p_vf) + return -EINVAL; + + if (!p_vf->vf_bulletin) + return -EINVAL; + + p_bulletin = p_vf->bulletin.p_virt; + + /* Increment bulletin board version and compute crc */ + p_bulletin->version++; + p_bulletin->crc = crc32(0, (u8 *)p_bulletin + crc_size, + p_vf->bulletin.size - crc_size); + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Posting Bulletin 0x%08x to VF[%d] (CRC 0x%08x)\n", + p_bulletin->version, p_vf->relative_vf_id, p_bulletin->crc); + + /* propagate bulletin board via dmae to vm memory */ + memset(¶ms, 0, sizeof(params)); + params.flags = QED_DMAE_FLAG_VF_DST; + params.dst_vfid = p_vf->abs_vf_id; + return qed_dmae_host2host(p_hwfn, p_ptt, p_vf->bulletin.phys, + p_vf->vf_bulletin, p_vf->bulletin.size / 4, + ¶ms); +} + +static int qed_iov_pci_cfg_info(struct qed_dev *cdev) +{ + struct qed_hw_sriov_info *iov = cdev->p_iov_info; + int pos = iov->pos; + + DP_VERBOSE(cdev, QED_MSG_IOV, "sriov ext pos %d\n", pos); + pci_read_config_word(cdev->pdev, pos + PCI_SRIOV_CTRL, &iov->ctrl); + + pci_read_config_word(cdev->pdev, + pos + PCI_SRIOV_TOTAL_VF, &iov->total_vfs); + pci_read_config_word(cdev->pdev, + pos + PCI_SRIOV_INITIAL_VF, &iov->initial_vfs); + + pci_read_config_word(cdev->pdev, pos + PCI_SRIOV_NUM_VF, &iov->num_vfs); + if (iov->num_vfs) { + DP_VERBOSE(cdev, + QED_MSG_IOV, + "Number of VFs are already set to non-zero value. Ignoring PCI configuration value\n"); + iov->num_vfs = 0; + } + + pci_read_config_word(cdev->pdev, + pos + PCI_SRIOV_VF_OFFSET, &iov->offset); + + pci_read_config_word(cdev->pdev, + pos + PCI_SRIOV_VF_STRIDE, &iov->stride); + + pci_read_config_word(cdev->pdev, + pos + PCI_SRIOV_VF_DID, &iov->vf_device_id); + + pci_read_config_dword(cdev->pdev, + pos + PCI_SRIOV_SUP_PGSIZE, &iov->pgsz); + + pci_read_config_dword(cdev->pdev, pos + PCI_SRIOV_CAP, &iov->cap); + + pci_read_config_byte(cdev->pdev, pos + PCI_SRIOV_FUNC_LINK, &iov->link); + + DP_VERBOSE(cdev, + QED_MSG_IOV, + "IOV info: nres %d, cap 0x%x, ctrl 0x%x, total %d, initial %d, num vfs %d, offset %d, stride %d, page size 0x%x\n", + iov->nres, + iov->cap, + iov->ctrl, + iov->total_vfs, + iov->initial_vfs, + iov->nr_virtfn, iov->offset, iov->stride, iov->pgsz); + + /* Some sanity checks */ + if (iov->num_vfs > NUM_OF_VFS(cdev) || + iov->total_vfs > NUM_OF_VFS(cdev)) { + /* This can happen only due to a bug. 
In this case we set + * num_vfs to zero to avoid memory corruption in the code that + * assumes max number of vfs + */ + DP_NOTICE(cdev, + "IOV: Unexpected number of vfs set: %d setting num_vf to zero\n", + iov->num_vfs); + + iov->num_vfs = 0; + iov->total_vfs = 0; + } + + return 0; +} + +static void qed_iov_setup_vfdb(struct qed_hwfn *p_hwfn) +{ + struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info; + struct qed_pf_iov *p_iov_info = p_hwfn->pf_iov_info; + struct qed_bulletin_content *p_bulletin_virt; + dma_addr_t req_p, rply_p, bulletin_p; + union pfvf_tlvs *p_reply_virt_addr; + union vfpf_tlvs *p_req_virt_addr; + u8 idx = 0; + + memset(p_iov_info->vfs_array, 0, sizeof(p_iov_info->vfs_array)); + + p_req_virt_addr = p_iov_info->mbx_msg_virt_addr; + req_p = p_iov_info->mbx_msg_phys_addr; + p_reply_virt_addr = p_iov_info->mbx_reply_virt_addr; + rply_p = p_iov_info->mbx_reply_phys_addr; + p_bulletin_virt = p_iov_info->p_bulletins; + bulletin_p = p_iov_info->bulletins_phys; + if (!p_req_virt_addr || !p_reply_virt_addr || !p_bulletin_virt) { + DP_ERR(p_hwfn, + "qed_iov_setup_vfdb called without allocating mem first\n"); + return; + } + + for (idx = 0; idx < p_iov->total_vfs; idx++) { + struct qed_vf_info *vf = &p_iov_info->vfs_array[idx]; + u32 concrete; + + vf->vf_mbx.req_virt = p_req_virt_addr + idx; + vf->vf_mbx.req_phys = req_p + idx * sizeof(union vfpf_tlvs); + vf->vf_mbx.reply_virt = p_reply_virt_addr + idx; + vf->vf_mbx.reply_phys = rply_p + idx * sizeof(union pfvf_tlvs); + + vf->state = VF_STOPPED; + vf->b_init = false; + + vf->bulletin.phys = idx * + sizeof(struct qed_bulletin_content) + + bulletin_p; + vf->bulletin.p_virt = p_bulletin_virt + idx; + vf->bulletin.size = sizeof(struct qed_bulletin_content); + + vf->relative_vf_id = idx; + vf->abs_vf_id = idx + p_iov->first_vf_in_pf; + concrete = qed_vfid_to_concrete(p_hwfn, vf->abs_vf_id); + vf->concrete_fid = concrete; + vf->opaque_fid = (p_hwfn->hw_info.opaque_fid & 0xff) | + (vf->abs_vf_id << 8); + vf->vport_id = idx + 1; + + vf->num_mac_filters = QED_ETH_VF_NUM_MAC_FILTERS; + vf->num_vlan_filters = QED_ETH_VF_NUM_VLAN_FILTERS; + } +} + +static int qed_iov_allocate_vfdb(struct qed_hwfn *p_hwfn) +{ + struct qed_pf_iov *p_iov_info = p_hwfn->pf_iov_info; + void **p_v_addr; + u16 num_vfs = 0; + + num_vfs = p_hwfn->cdev->p_iov_info->total_vfs; + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "qed_iov_allocate_vfdb for %d VFs\n", num_vfs); + + /* Allocate PF Mailbox buffer (per-VF) */ + p_iov_info->mbx_msg_size = sizeof(union vfpf_tlvs) * num_vfs; + p_v_addr = &p_iov_info->mbx_msg_virt_addr; + *p_v_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + p_iov_info->mbx_msg_size, + &p_iov_info->mbx_msg_phys_addr, + GFP_KERNEL); + if (!*p_v_addr) + return -ENOMEM; + + /* Allocate PF Mailbox Reply buffer (per-VF) */ + p_iov_info->mbx_reply_size = sizeof(union pfvf_tlvs) * num_vfs; + p_v_addr = &p_iov_info->mbx_reply_virt_addr; + *p_v_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + p_iov_info->mbx_reply_size, + &p_iov_info->mbx_reply_phys_addr, + GFP_KERNEL); + if (!*p_v_addr) + return -ENOMEM; + + p_iov_info->bulletins_size = sizeof(struct qed_bulletin_content) * + num_vfs; + p_v_addr = &p_iov_info->p_bulletins; + *p_v_addr = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + p_iov_info->bulletins_size, + &p_iov_info->bulletins_phys, + GFP_KERNEL); + if (!*p_v_addr) + return -ENOMEM; + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "PF's Requests mailbox [%p virt 0x%llx phys], Response mailbox [%p virt 0x%llx phys] Bulletins [%p virt 0x%llx phys]\n", + 
p_iov_info->mbx_msg_virt_addr, + (u64) p_iov_info->mbx_msg_phys_addr, + p_iov_info->mbx_reply_virt_addr, + (u64) p_iov_info->mbx_reply_phys_addr, + p_iov_info->p_bulletins, (u64) p_iov_info->bulletins_phys); + + return 0; +} + +static void qed_iov_free_vfdb(struct qed_hwfn *p_hwfn) +{ + struct qed_pf_iov *p_iov_info = p_hwfn->pf_iov_info; + + if (p_hwfn->pf_iov_info->mbx_msg_virt_addr) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + p_iov_info->mbx_msg_size, + p_iov_info->mbx_msg_virt_addr, + p_iov_info->mbx_msg_phys_addr); + + if (p_hwfn->pf_iov_info->mbx_reply_virt_addr) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + p_iov_info->mbx_reply_size, + p_iov_info->mbx_reply_virt_addr, + p_iov_info->mbx_reply_phys_addr); + + if (p_iov_info->p_bulletins) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + p_iov_info->bulletins_size, + p_iov_info->p_bulletins, + p_iov_info->bulletins_phys); +} + +int qed_iov_alloc(struct qed_hwfn *p_hwfn) +{ + struct qed_pf_iov *p_sriov; + + if (!IS_PF_SRIOV(p_hwfn)) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "No SR-IOV - no need for IOV db\n"); + return 0; + } + + p_sriov = kzalloc(sizeof(*p_sriov), GFP_KERNEL); + if (!p_sriov) + return -ENOMEM; + + p_hwfn->pf_iov_info = p_sriov; + + qed_spq_register_async_cb(p_hwfn, PROTOCOLID_COMMON, + qed_sriov_eqe_event); + + return qed_iov_allocate_vfdb(p_hwfn); +} + +void qed_iov_setup(struct qed_hwfn *p_hwfn) +{ + if (!IS_PF_SRIOV(p_hwfn) || !IS_PF_SRIOV_ALLOC(p_hwfn)) + return; + + qed_iov_setup_vfdb(p_hwfn); +} + +void qed_iov_free(struct qed_hwfn *p_hwfn) +{ + qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_COMMON); + + if (IS_PF_SRIOV_ALLOC(p_hwfn)) { + qed_iov_free_vfdb(p_hwfn); + kfree(p_hwfn->pf_iov_info); + } +} + +void qed_iov_free_hw_info(struct qed_dev *cdev) +{ + kfree(cdev->p_iov_info); + cdev->p_iov_info = NULL; +} + +int qed_iov_hw_info(struct qed_hwfn *p_hwfn) +{ + struct qed_dev *cdev = p_hwfn->cdev; + int pos; + int rc; + + if (IS_VF(p_hwfn->cdev)) + return 0; + + /* Learn the PCI configuration */ + pos = pci_find_ext_capability(p_hwfn->cdev->pdev, + PCI_EXT_CAP_ID_SRIOV); + if (!pos) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, "No PCIe IOV support\n"); + return 0; + } + + /* Allocate a new struct for IOV information */ + cdev->p_iov_info = kzalloc(sizeof(*cdev->p_iov_info), GFP_KERNEL); + if (!cdev->p_iov_info) + return -ENOMEM; + + cdev->p_iov_info->pos = pos; + + rc = qed_iov_pci_cfg_info(cdev); + if (rc) + return rc; + + /* We want PF IOV to be synonemous with the existance of p_iov_info; + * In case the capability is published but there are no VFs, simply + * de-allocate the struct. + */ + if (!cdev->p_iov_info->total_vfs) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "IOV capabilities, but no VFs are published\n"); + kfree(cdev->p_iov_info); + cdev->p_iov_info = NULL; + return 0; + } + + /* First VF index based on offset is tricky: + * - If ARI is supported [likely], offset - (16 - pf_id) would + * provide the number for eng0. 2nd engine Vfs would begin + * after the first engine's VFs. + * - If !ARI, VFs would start on next device. + * so offset - (256 - pf_id) would provide the number. + * Utilize the fact that (256 - pf_id) is achieved only by later + * to differentiate between the two. 
+ */ + + if (p_hwfn->cdev->p_iov_info->offset < (256 - p_hwfn->abs_pf_id)) { + u32 first = p_hwfn->cdev->p_iov_info->offset + + p_hwfn->abs_pf_id - 16; + + cdev->p_iov_info->first_vf_in_pf = first; + + if (QED_PATH_ID(p_hwfn)) + cdev->p_iov_info->first_vf_in_pf -= MAX_NUM_VFS_BB; + } else { + u32 first = p_hwfn->cdev->p_iov_info->offset + + p_hwfn->abs_pf_id - 256; + + cdev->p_iov_info->first_vf_in_pf = first; + } + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "First VF in hwfn 0x%08x\n", + cdev->p_iov_info->first_vf_in_pf); + + return 0; +} + +bool _qed_iov_pf_sanity_check(struct qed_hwfn *p_hwfn, + int vfid, bool b_fail_malicious) +{ + /* Check PF supports sriov */ + if (IS_VF(p_hwfn->cdev) || !IS_QED_SRIOV(p_hwfn->cdev) || + !IS_PF_SRIOV_ALLOC(p_hwfn)) + return false; + + /* Check VF validity */ + if (!qed_iov_is_valid_vfid(p_hwfn, vfid, true, b_fail_malicious)) + return false; + + return true; +} + +bool qed_iov_pf_sanity_check(struct qed_hwfn *p_hwfn, int vfid) +{ + return _qed_iov_pf_sanity_check(p_hwfn, vfid, true); +} + +static void qed_iov_set_vf_to_disable(struct qed_dev *cdev, + u16 rel_vf_id, u8 to_disable) +{ + struct qed_vf_info *vf; + int i; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, false); + if (!vf) + continue; + + vf->to_disable = to_disable; + } +} + +static void qed_iov_set_vfs_to_disable(struct qed_dev *cdev, u8 to_disable) +{ + u16 i; + + if (!IS_QED_SRIOV(cdev)) + return; + + for (i = 0; i < cdev->p_iov_info->total_vfs; i++) + qed_iov_set_vf_to_disable(cdev, i, to_disable); +} + +static void qed_iov_vf_pglue_clear_err(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 abs_vfid) +{ + qed_wr(p_hwfn, p_ptt, + PGLUE_B_REG_WAS_ERROR_VF_31_0_CLR + (abs_vfid >> 5) * 4, + 1 << (abs_vfid & 0x1f)); +} + +static void qed_iov_vf_igu_reset(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, struct qed_vf_info *vf) +{ + int i; + + /* Set VF masks and configuration - pretend */ + qed_fid_pretend(p_hwfn, p_ptt, (u16) vf->concrete_fid); + + qed_wr(p_hwfn, p_ptt, IGU_REG_STATISTIC_NUM_VF_MSG_SENT, 0); + + /* unpretend */ + qed_fid_pretend(p_hwfn, p_ptt, (u16) p_hwfn->hw_info.concrete_fid); + + /* iterate over all queues, clear sb consumer */ + for (i = 0; i < vf->num_sbs; i++) + qed_int_igu_init_pure_rt_single(p_hwfn, p_ptt, + vf->igu_sbs[i], + vf->opaque_fid, true); +} + +static void qed_iov_vf_igu_set_int(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf, bool enable) +{ + u32 igu_vf_conf; + + qed_fid_pretend(p_hwfn, p_ptt, (u16) vf->concrete_fid); + + igu_vf_conf = qed_rd(p_hwfn, p_ptt, IGU_REG_VF_CONFIGURATION); + + if (enable) + igu_vf_conf |= IGU_VF_CONF_MSI_MSIX_EN; + else + igu_vf_conf &= ~IGU_VF_CONF_MSI_MSIX_EN; + + qed_wr(p_hwfn, p_ptt, IGU_REG_VF_CONFIGURATION, igu_vf_conf); + + /* unpretend */ + qed_fid_pretend(p_hwfn, p_ptt, (u16) p_hwfn->hw_info.concrete_fid); +} + +static int +qed_iov_enable_vf_access_msix(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u8 abs_vf_id, u8 num_sbs) +{ + u8 current_max = 0; + int i; + + /* For AH onward, configuration is per-PF. Find maximum of all + * the currently enabled child VFs, and set the number to be that. 
+ */ + if (!QED_IS_BB(p_hwfn->cdev)) { + qed_for_each_vf(p_hwfn, i) { + struct qed_vf_info *p_vf; + + p_vf = qed_iov_get_vf_info(p_hwfn, (u16)i, true); + if (!p_vf) + continue; + + current_max = max_t(u8, current_max, p_vf->num_sbs); + } + } + + if (num_sbs > current_max) + return qed_mcp_config_vf_msix(p_hwfn, p_ptt, + abs_vf_id, num_sbs); + + return 0; +} + +static int qed_iov_enable_vf_access(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + u32 igu_vf_conf = IGU_VF_CONF_FUNC_EN; + int rc; + + /* It's possible VF was previously considered malicious - + * clear the indication even if we're only going to disable VF. + */ + vf->b_malicious = false; + + if (vf->to_disable) + return 0; + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "Enable internal access for vf %x [abs %x]\n", + vf->abs_vf_id, QED_VF_ABS_ID(p_hwfn, vf)); + + qed_iov_vf_pglue_clear_err(p_hwfn, p_ptt, QED_VF_ABS_ID(p_hwfn, vf)); + + qed_iov_vf_igu_reset(p_hwfn, p_ptt, vf); + + rc = qed_iov_enable_vf_access_msix(p_hwfn, p_ptt, + vf->abs_vf_id, vf->num_sbs); + if (rc) + return rc; + + qed_fid_pretend(p_hwfn, p_ptt, (u16) vf->concrete_fid); + + SET_FIELD(igu_vf_conf, IGU_VF_CONF_PARENT, p_hwfn->rel_pf_id); + STORE_RT_REG(p_hwfn, IGU_REG_VF_CONFIGURATION_RT_OFFSET, igu_vf_conf); + + qed_init_run(p_hwfn, p_ptt, PHASE_VF, vf->abs_vf_id, + p_hwfn->hw_info.hw_mode); + + /* unpretend */ + qed_fid_pretend(p_hwfn, p_ptt, (u16) p_hwfn->hw_info.concrete_fid); + + vf->state = VF_FREE; + + return rc; +} + +/** + * @brief qed_iov_config_perm_table - configure the permission + * zone table. + * In E4, queue zone permission table size is 320x9. There + * are 320 VF queues for single engine device (256 for dual + * engine device), and each entry has the following format: + * {Valid, VF[7:0]} + * @param p_hwfn + * @param p_ptt + * @param vf + * @param enable + */ +static void qed_iov_config_perm_table(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf, u8 enable) +{ + u32 reg_addr, val; + u16 qzone_id = 0; + int qid; + + for (qid = 0; qid < vf->num_rxqs; qid++) { + qed_fw_l2_queue(p_hwfn, vf->vf_queues[qid].fw_rx_qid, + &qzone_id); + + reg_addr = PSWHST_REG_ZONE_PERMISSION_TABLE + qzone_id * 4; + val = enable ? 
(vf->abs_vf_id | BIT(8)) : 0; + qed_wr(p_hwfn, p_ptt, reg_addr, val); + } +} + +static void qed_iov_enable_vf_traffic(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + /* Reset vf in IGU - interrupts are still disabled */ + qed_iov_vf_igu_reset(p_hwfn, p_ptt, vf); + + qed_iov_vf_igu_set_int(p_hwfn, p_ptt, vf, 1); + + /* Permission Table */ + qed_iov_config_perm_table(p_hwfn, p_ptt, vf, true); +} + +static u8 qed_iov_alloc_vf_igu_sbs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf, u16 num_rx_queues) +{ + struct qed_igu_block *p_block; + struct cau_sb_entry sb_entry; + int qid = 0; + u32 val = 0; + + if (num_rx_queues > p_hwfn->hw_info.p_igu_info->usage.free_cnt_iov) + num_rx_queues = p_hwfn->hw_info.p_igu_info->usage.free_cnt_iov; + p_hwfn->hw_info.p_igu_info->usage.free_cnt_iov -= num_rx_queues; + + SET_FIELD(val, IGU_MAPPING_LINE_FUNCTION_NUMBER, vf->abs_vf_id); + SET_FIELD(val, IGU_MAPPING_LINE_VALID, 1); + SET_FIELD(val, IGU_MAPPING_LINE_PF_VALID, 0); + + for (qid = 0; qid < num_rx_queues; qid++) { + p_block = qed_get_igu_free_sb(p_hwfn, false); + vf->igu_sbs[qid] = p_block->igu_sb_id; + p_block->status &= ~QED_IGU_STATUS_FREE; + SET_FIELD(val, IGU_MAPPING_LINE_VECTOR_NUMBER, qid); + + qed_wr(p_hwfn, p_ptt, + IGU_REG_MAPPING_MEMORY + + sizeof(u32) * p_block->igu_sb_id, val); + + /* Configure igu sb in CAU which were marked valid */ + qed_init_cau_sb_entry(p_hwfn, &sb_entry, + p_hwfn->rel_pf_id, vf->abs_vf_id, 1); + qed_dmae_host2grc(p_hwfn, p_ptt, + (u64)(uintptr_t)&sb_entry, + CAU_REG_SB_VAR_MEMORY + + p_block->igu_sb_id * sizeof(u64), 2, 0); + } + + vf->num_sbs = (u8) num_rx_queues; + + return vf->num_sbs; +} + +static void qed_iov_free_vf_igu_sbs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + struct qed_igu_info *p_info = p_hwfn->hw_info.p_igu_info; + int idx, igu_id; + u32 addr, val; + + /* Invalidate igu CAM lines and mark them as free */ + for (idx = 0; idx < vf->num_sbs; idx++) { + igu_id = vf->igu_sbs[idx]; + addr = IGU_REG_MAPPING_MEMORY + sizeof(u32) * igu_id; + + val = qed_rd(p_hwfn, p_ptt, addr); + SET_FIELD(val, IGU_MAPPING_LINE_VALID, 0); + qed_wr(p_hwfn, p_ptt, addr, val); + + p_info->entry[igu_id].status |= QED_IGU_STATUS_FREE; + p_hwfn->hw_info.p_igu_info->usage.free_cnt_iov++; + } + + vf->num_sbs = 0; +} + +static void qed_iov_set_link(struct qed_hwfn *p_hwfn, + u16 vfid, + struct qed_mcp_link_params *params, + struct qed_mcp_link_state *link, + struct qed_mcp_link_capabilities *p_caps) +{ + struct qed_vf_info *p_vf = qed_iov_get_vf_info(p_hwfn, + vfid, + false); + struct qed_bulletin_content *p_bulletin; + + if (!p_vf) + return; + + p_bulletin = p_vf->bulletin.p_virt; + p_bulletin->req_autoneg = params->speed.autoneg; + p_bulletin->req_adv_speed = params->speed.advertised_speeds; + p_bulletin->req_forced_speed = params->speed.forced_speed; + p_bulletin->req_autoneg_pause = params->pause.autoneg; + p_bulletin->req_forced_rx = params->pause.forced_rx; + p_bulletin->req_forced_tx = params->pause.forced_tx; + p_bulletin->req_loopback = params->loopback_mode; + + p_bulletin->link_up = link->link_up; + p_bulletin->speed = link->speed; + p_bulletin->full_duplex = link->full_duplex; + p_bulletin->autoneg = link->an; + p_bulletin->autoneg_complete = link->an_complete; + p_bulletin->parallel_detection = link->parallel_detection; + p_bulletin->pfc_enabled = link->pfc_enabled; + p_bulletin->partner_adv_speed = link->partner_adv_speed; + p_bulletin->partner_tx_flow_ctrl_en = 
link->partner_tx_flow_ctrl_en; + p_bulletin->partner_rx_flow_ctrl_en = link->partner_rx_flow_ctrl_en; + p_bulletin->partner_adv_pause = link->partner_adv_pause; + p_bulletin->sfp_tx_fault = link->sfp_tx_fault; + + p_bulletin->capability_speed = p_caps->speed_capabilities; +} + +static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_iov_vf_init_params *p_params) +{ + struct qed_mcp_link_capabilities link_caps; + struct qed_mcp_link_params link_params; + struct qed_mcp_link_state link_state; + u8 num_of_vf_avaiable_chains = 0; + struct qed_vf_info *vf = NULL; + u16 qid, num_irqs; + int rc = 0; + u32 cids; + u8 i; + + vf = qed_iov_get_vf_info(p_hwfn, p_params->rel_vf_id, false); + if (!vf) { + DP_ERR(p_hwfn, "qed_iov_init_hw_for_vf : vf is NULL\n"); + return -EINVAL; + } + + if (vf->b_init) { + DP_NOTICE(p_hwfn, "VF[%d] is already active.\n", + p_params->rel_vf_id); + return -EINVAL; + } + + /* Perform sanity checking on the requested queue_id */ + for (i = 0; i < p_params->num_queues; i++) { + u16 min_vf_qzone = FEAT_NUM(p_hwfn, QED_PF_L2_QUE); + u16 max_vf_qzone = min_vf_qzone + + FEAT_NUM(p_hwfn, QED_VF_L2_QUE) - 1; + + qid = p_params->req_rx_queue[i]; + if (qid < min_vf_qzone || qid > max_vf_qzone) { + DP_NOTICE(p_hwfn, + "Can't enable Rx qid [%04x] for VF[%d]: qids [0x%04x,...,0x%04x] available\n", + qid, + p_params->rel_vf_id, + min_vf_qzone, max_vf_qzone); + return -EINVAL; + } + + qid = p_params->req_tx_queue[i]; + if (qid > max_vf_qzone) { + DP_NOTICE(p_hwfn, + "Can't enable Tx qid [%04x] for VF[%d]: max qid 0x%04x\n", + qid, p_params->rel_vf_id, max_vf_qzone); + return -EINVAL; + } + + /* If client *really* wants, Tx qid can be shared with PF */ + if (qid < min_vf_qzone) + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d] is using PF qid [0x%04x] for Txq[0x%02x]\n", + p_params->rel_vf_id, qid, i); + } + + /* Limit number of queues according to number of CIDs */ + qed_cxt_get_proto_cid_count(p_hwfn, PROTOCOLID_ETH, &cids); + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d] - requesting to initialize for 0x%04x queues [0x%04x CIDs available]\n", + vf->relative_vf_id, p_params->num_queues, (u16)cids); + num_irqs = min_t(u16, p_params->num_queues, ((u16)cids)); + + num_of_vf_avaiable_chains = qed_iov_alloc_vf_igu_sbs(p_hwfn, + p_ptt, + vf, num_irqs); + if (!num_of_vf_avaiable_chains) { + DP_ERR(p_hwfn, "no available igu sbs\n"); + return -ENOMEM; + } + + /* Choose queue number and index ranges */ + vf->num_rxqs = num_of_vf_avaiable_chains; + vf->num_txqs = num_of_vf_avaiable_chains; + + for (i = 0; i < vf->num_rxqs; i++) { + struct qed_vf_queue *p_queue = &vf->vf_queues[i]; + + p_queue->fw_rx_qid = p_params->req_rx_queue[i]; + p_queue->fw_tx_qid = p_params->req_tx_queue[i]; + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d] - Q[%d] SB %04x, qid [Rx %04x Tx %04x]\n", + vf->relative_vf_id, i, vf->igu_sbs[i], + p_queue->fw_rx_qid, p_queue->fw_tx_qid); + } + + /* Update the link configuration in bulletin */ + memcpy(&link_params, qed_mcp_get_link_params(p_hwfn), + sizeof(link_params)); + memcpy(&link_state, qed_mcp_get_link_state(p_hwfn), sizeof(link_state)); + memcpy(&link_caps, qed_mcp_get_link_capabilities(p_hwfn), + sizeof(link_caps)); + qed_iov_set_link(p_hwfn, p_params->rel_vf_id, + &link_params, &link_state, &link_caps); + + rc = qed_iov_enable_vf_access(p_hwfn, p_ptt, vf); + if (!rc) { + vf->b_init = true; + + if (IS_LEAD_HWFN(p_hwfn)) + p_hwfn->cdev->p_iov_info->num_vfs++; + } + + return rc; +} + +static int qed_iov_release_hw_for_vf(struct qed_hwfn 
*p_hwfn, + struct qed_ptt *p_ptt, u16 rel_vf_id) +{ + struct qed_mcp_link_capabilities caps; + struct qed_mcp_link_params params; + struct qed_mcp_link_state link; + struct qed_vf_info *vf = NULL; + + vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, true); + if (!vf) { + DP_ERR(p_hwfn, "qed_iov_release_hw_for_vf : vf is NULL\n"); + return -EINVAL; + } + + if (vf->bulletin.p_virt) + memset(vf->bulletin.p_virt, 0, sizeof(*vf->bulletin.p_virt)); + + memset(&vf->p_vf_info, 0, sizeof(vf->p_vf_info)); + + /* Get the link configuration back in bulletin so + * that when VFs are re-enabled they get the actual + * link configuration. + */ + memcpy(¶ms, qed_mcp_get_link_params(p_hwfn), sizeof(params)); + memcpy(&link, qed_mcp_get_link_state(p_hwfn), sizeof(link)); + memcpy(&caps, qed_mcp_get_link_capabilities(p_hwfn), sizeof(caps)); + qed_iov_set_link(p_hwfn, rel_vf_id, ¶ms, &link, &caps); + + /* Forget the VF's acquisition message */ + memset(&vf->acquire, 0, sizeof(vf->acquire)); + + /* disablng interrupts and resetting permission table was done during + * vf-close, however, we could get here without going through vf_close + */ + /* Disable Interrupts for VF */ + qed_iov_vf_igu_set_int(p_hwfn, p_ptt, vf, 0); + + /* Reset Permission table */ + qed_iov_config_perm_table(p_hwfn, p_ptt, vf, 0); + + vf->num_rxqs = 0; + vf->num_txqs = 0; + qed_iov_free_vf_igu_sbs(p_hwfn, p_ptt, vf); + + if (vf->b_init) { + vf->b_init = false; + + if (IS_LEAD_HWFN(p_hwfn)) + p_hwfn->cdev->p_iov_info->num_vfs--; + } + + return 0; +} + +static bool qed_iov_tlv_supported(u16 tlvtype) +{ + return CHANNEL_TLV_NONE < tlvtype && tlvtype < CHANNEL_TLV_MAX; +} + +/* place a given tlv on the tlv buffer, continuing current tlv list */ +void *qed_add_tlv(struct qed_hwfn *p_hwfn, u8 **offset, u16 type, u16 length) +{ + struct channel_tlv *tl = (struct channel_tlv *)*offset; + + tl->type = type; + tl->length = length; + + /* Offset should keep pointing to next TLV (the end of the last) */ + *offset += length; + + /* Return a pointer to the start of the added tlv */ + return *offset - length; +} + +/* list the types and lengths of the tlvs on the buffer */ +void qed_dp_tlv_list(struct qed_hwfn *p_hwfn, void *tlvs_list) +{ + u16 i = 1, total_length = 0; + struct channel_tlv *tlv; + + do { + tlv = (struct channel_tlv *)((u8 *)tlvs_list + total_length); + + /* output tlv */ + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "TLV number %d: type %d, length %d\n", + i, tlv->type, tlv->length); + + if (tlv->type == CHANNEL_TLV_LIST_END) + return; + + /* Validate entry - protect against malicious VFs */ + if (!tlv->length) { + DP_NOTICE(p_hwfn, "TLV of length 0 found\n"); + return; + } + + total_length += tlv->length; + + if (total_length >= sizeof(struct tlv_buffer_size)) { + DP_NOTICE(p_hwfn, "TLV ==> Buffer overflow\n"); + return; + } + + i++; + } while (1); +} + +static void qed_iov_send_response(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *p_vf, + u16 length, u8 status) +{ + struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx; + struct qed_dmae_params params; + u8 eng_vf_id; + + mbx->reply_virt->default_resp.hdr.status = status; + + qed_dp_tlv_list(p_hwfn, mbx->reply_virt); + + eng_vf_id = p_vf->abs_vf_id; + + memset(¶ms, 0, sizeof(struct qed_dmae_params)); + params.flags = QED_DMAE_FLAG_VF_DST; + params.dst_vfid = eng_vf_id; + + qed_dmae_host2host(p_hwfn, p_ptt, mbx->reply_phys + sizeof(u64), + mbx->req_virt->first_tlv.reply_address + + sizeof(u64), + (sizeof(union pfvf_tlvs) - sizeof(u64)) / 4, + ¶ms); + + /* Once PF copies the rc to the 
VF, the latter can continue + * and send an additional message. So we have to make sure the + * channel would be re-set to ready prior to that. + */ + REG_WR(p_hwfn, + GTT_BAR0_MAP_REG_USDM_RAM + + USTORM_VF_PF_CHANNEL_READY_OFFSET(eng_vf_id), 1); + + qed_dmae_host2host(p_hwfn, p_ptt, mbx->reply_phys, + mbx->req_virt->first_tlv.reply_address, + sizeof(u64) / 4, ¶ms); +} + +static u16 qed_iov_vport_to_tlv(struct qed_hwfn *p_hwfn, + enum qed_iov_vport_update_flag flag) +{ + switch (flag) { + case QED_IOV_VP_UPDATE_ACTIVATE: + return CHANNEL_TLV_VPORT_UPDATE_ACTIVATE; + case QED_IOV_VP_UPDATE_VLAN_STRIP: + return CHANNEL_TLV_VPORT_UPDATE_VLAN_STRIP; + case QED_IOV_VP_UPDATE_TX_SWITCH: + return CHANNEL_TLV_VPORT_UPDATE_TX_SWITCH; + case QED_IOV_VP_UPDATE_MCAST: + return CHANNEL_TLV_VPORT_UPDATE_MCAST; + case QED_IOV_VP_UPDATE_ACCEPT_PARAM: + return CHANNEL_TLV_VPORT_UPDATE_ACCEPT_PARAM; + case QED_IOV_VP_UPDATE_RSS: + return CHANNEL_TLV_VPORT_UPDATE_RSS; + case QED_IOV_VP_UPDATE_ACCEPT_ANY_VLAN: + return CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN; + case QED_IOV_VP_UPDATE_SGE_TPA: + return CHANNEL_TLV_VPORT_UPDATE_SGE_TPA; + default: + return 0; + } +} + +static u16 qed_iov_prep_vp_update_resp_tlvs(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, + struct qed_iov_vf_mbx *p_mbx, + u8 status, + u16 tlvs_mask, u16 tlvs_accepted) +{ + struct pfvf_def_resp_tlv *resp; + u16 size, total_len, i; + + memset(p_mbx->reply_virt, 0, sizeof(union pfvf_tlvs)); + p_mbx->offset = (u8 *)p_mbx->reply_virt; + size = sizeof(struct pfvf_def_resp_tlv); + total_len = size; + + qed_add_tlv(p_hwfn, &p_mbx->offset, CHANNEL_TLV_VPORT_UPDATE, size); + + /* Prepare response for all extended tlvs if they are found by PF */ + for (i = 0; i < QED_IOV_VP_UPDATE_MAX; i++) { + if (!(tlvs_mask & BIT(i))) + continue; + + resp = qed_add_tlv(p_hwfn, &p_mbx->offset, + qed_iov_vport_to_tlv(p_hwfn, i), size); + + if (tlvs_accepted & BIT(i)) + resp->hdr.status = status; + else + resp->hdr.status = PFVF_STATUS_NOT_SUPPORTED; + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d] - vport_update response: TLV %d, status %02x\n", + p_vf->relative_vf_id, + qed_iov_vport_to_tlv(p_hwfn, i), resp->hdr.status); + + total_len += size; + } + + qed_add_tlv(p_hwfn, &p_mbx->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + return total_len; +} + +static void qed_iov_prepare_resp(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf_info, + u16 type, u16 length, u8 status) +{ + struct qed_iov_vf_mbx *mbx = &vf_info->vf_mbx; + + mbx->offset = (u8 *)mbx->reply_virt; + + qed_add_tlv(p_hwfn, &mbx->offset, type, length); + qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + qed_iov_send_response(p_hwfn, p_ptt, vf_info, length, status); +} + +static struct +qed_public_vf_info *qed_iov_get_public_vf_info(struct qed_hwfn *p_hwfn, + u16 relative_vf_id, + bool b_enabled_only) +{ + struct qed_vf_info *vf = NULL; + + vf = qed_iov_get_vf_info(p_hwfn, relative_vf_id, b_enabled_only); + if (!vf) + return NULL; + + return &vf->p_vf_info; +} + +static void qed_iov_clean_vf(struct qed_hwfn *p_hwfn, u8 vfid) +{ + struct qed_public_vf_info *vf_info; + + vf_info = qed_iov_get_public_vf_info(p_hwfn, vfid, false); + + if (!vf_info) + return; + + /* Clear the VF mac */ + eth_zero_addr(vf_info->mac); + + vf_info->rx_accept_mode = 0; + vf_info->tx_accept_mode = 0; +} + +static void qed_iov_vf_cleanup(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf) +{ + u32 i, j; + + p_vf->vf_bulletin = 
0; + p_vf->vport_instance = 0; + p_vf->configured_features = 0; + + /* If VF previously requested less resources, go back to default */ + p_vf->num_rxqs = p_vf->num_sbs; + p_vf->num_txqs = p_vf->num_sbs; + + p_vf->num_active_rxqs = 0; + + for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++) { + struct qed_vf_queue *p_queue = &p_vf->vf_queues[i]; + + for (j = 0; j < MAX_QUEUES_PER_QZONE; j++) { + if (!p_queue->cids[j].p_cid) + continue; + + qed_eth_queue_cid_release(p_hwfn, + p_queue->cids[j].p_cid); + p_queue->cids[j].p_cid = NULL; + } + } + + memset(&p_vf->shadow_config, 0, sizeof(p_vf->shadow_config)); + memset(&p_vf->acquire, 0, sizeof(p_vf->acquire)); + qed_iov_clean_vf(p_hwfn, p_vf->relative_vf_id); +} + +/* Returns either 0, or log(size) */ +static u32 qed_iov_vf_db_bar_size(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 val = qed_rd(p_hwfn, p_ptt, PGLUE_B_REG_VF_BAR1_SIZE); + + if (val) + return val + 11; + return 0; +} + +static void +qed_iov_vf_mbx_acquire_resc_cids(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *p_vf, + struct vf_pf_resc_request *p_req, + struct pf_vf_resc *p_resp) +{ + u8 num_vf_cons = p_hwfn->pf_params.eth_pf_params.num_vf_cons; + u8 db_size = qed_db_addr_vf(1, DQ_DEMS_LEGACY) - + qed_db_addr_vf(0, DQ_DEMS_LEGACY); + u32 bar_size; + + p_resp->num_cids = min_t(u8, p_req->num_cids, num_vf_cons); + + /* If VF didn't bother asking for QIDs than don't bother limiting + * number of CIDs. The VF doesn't care about the number, and this + * has the likely result of causing an additional acquisition. + */ + if (!(p_vf->acquire.vfdev_info.capabilities & + VFPF_ACQUIRE_CAP_QUEUE_QIDS)) + return; + + /* If doorbell bar was mapped by VF, limit the VF CIDs to an amount + * that would make sure doorbells for all CIDs fall within the bar. + * If it doesn't, make sure regview window is sufficient. + */ + if (p_vf->acquire.vfdev_info.capabilities & + VFPF_ACQUIRE_CAP_PHYSICAL_BAR) { + bar_size = qed_iov_vf_db_bar_size(p_hwfn, p_ptt); + if (bar_size) + bar_size = 1 << bar_size; + + if (p_hwfn->cdev->num_hwfns > 1) + bar_size /= 2; + } else { + bar_size = PXP_VF_BAR0_DQ_LENGTH; + } + + if (bar_size / db_size < 256) + p_resp->num_cids = min_t(u8, p_resp->num_cids, + (u8)(bar_size / db_size)); +} + +static u8 qed_iov_vf_mbx_acquire_resc(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *p_vf, + struct vf_pf_resc_request *p_req, + struct pf_vf_resc *p_resp) +{ + u8 i; + + /* Queue related information */ + p_resp->num_rxqs = p_vf->num_rxqs; + p_resp->num_txqs = p_vf->num_txqs; + p_resp->num_sbs = p_vf->num_sbs; + + for (i = 0; i < p_resp->num_sbs; i++) { + p_resp->hw_sbs[i].hw_sb_id = p_vf->igu_sbs[i]; + p_resp->hw_sbs[i].sb_qid = 0; + } + + /* These fields are filled for backward compatibility. + * Unused by modern vfs. + */ + for (i = 0; i < p_resp->num_rxqs; i++) { + qed_fw_l2_queue(p_hwfn, p_vf->vf_queues[i].fw_rx_qid, + (u16 *)&p_resp->hw_qid[i]); + p_resp->cid[i] = i; + } + + /* Filter related information */ + p_resp->num_mac_filters = min_t(u8, p_vf->num_mac_filters, + p_req->num_mac_filters); + p_resp->num_vlan_filters = min_t(u8, p_vf->num_vlan_filters, + p_req->num_vlan_filters); + + qed_iov_vf_mbx_acquire_resc_cids(p_hwfn, p_ptt, p_vf, p_req, p_resp); + + /* This isn't really needed/enforced, but some legacy VFs might depend + * on the correct filling of this field. 
+ */ + p_resp->num_mc_filters = QED_MAX_MC_ADDRS; + + /* Validate sufficient resources for VF */ + if (p_resp->num_rxqs < p_req->num_rxqs || + p_resp->num_txqs < p_req->num_txqs || + p_resp->num_sbs < p_req->num_sbs || + p_resp->num_mac_filters < p_req->num_mac_filters || + p_resp->num_vlan_filters < p_req->num_vlan_filters || + p_resp->num_mc_filters < p_req->num_mc_filters || + p_resp->num_cids < p_req->num_cids) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d] - Insufficient resources: rxq [%02x/%02x] txq [%02x/%02x] sbs [%02x/%02x] mac [%02x/%02x] vlan [%02x/%02x] mc [%02x/%02x] cids [%02x/%02x]\n", + p_vf->abs_vf_id, + p_req->num_rxqs, + p_resp->num_rxqs, + p_req->num_rxqs, + p_resp->num_txqs, + p_req->num_sbs, + p_resp->num_sbs, + p_req->num_mac_filters, + p_resp->num_mac_filters, + p_req->num_vlan_filters, + p_resp->num_vlan_filters, + p_req->num_mc_filters, + p_resp->num_mc_filters, + p_req->num_cids, p_resp->num_cids); + + /* Some legacy OSes are incapable of correctly handling this + * failure. + */ + if ((p_vf->acquire.vfdev_info.eth_fp_hsi_minor == + ETH_HSI_VER_NO_PKT_LEN_TUNN) && + (p_vf->acquire.vfdev_info.os_type == + VFPF_ACQUIRE_OS_WINDOWS)) + return PFVF_STATUS_SUCCESS; + + return PFVF_STATUS_NO_RESOURCE; + } + + return PFVF_STATUS_SUCCESS; +} + +static void qed_iov_vf_mbx_acquire_stats(struct qed_hwfn *p_hwfn, + struct pfvf_stats_info *p_stats) +{ + p_stats->mstats.address = PXP_VF_BAR0_START_MSDM_ZONE_B + + offsetof(struct mstorm_vf_zone, + non_trigger.eth_queue_stat); + p_stats->mstats.len = sizeof(struct eth_mstorm_per_queue_stat); + p_stats->ustats.address = PXP_VF_BAR0_START_USDM_ZONE_B + + offsetof(struct ustorm_vf_zone, + non_trigger.eth_queue_stat); + p_stats->ustats.len = sizeof(struct eth_ustorm_per_queue_stat); + p_stats->pstats.address = PXP_VF_BAR0_START_PSDM_ZONE_B + + offsetof(struct pstorm_vf_zone, + non_trigger.eth_queue_stat); + p_stats->pstats.len = sizeof(struct eth_pstorm_per_queue_stat); + p_stats->tstats.address = 0; + p_stats->tstats.len = 0; +} + +static void qed_iov_vf_mbx_acquire(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + struct pfvf_acquire_resp_tlv *resp = &mbx->reply_virt->acquire_resp; + struct pf_vf_pfdev_info *pfdev_info = &resp->pfdev_info; + struct vfpf_acquire_tlv *req = &mbx->req_virt->acquire; + u8 vfpf_status = PFVF_STATUS_NOT_SUPPORTED; + struct pf_vf_resc *resc = &resp->resc; + int rc; + + memset(resp, 0, sizeof(*resp)); + + /* Write the PF version so that VF would know which version + * is supported - might be later overriden. This guarantees that + * VF could recognize legacy PF based on lack of versions in reply. 
+ */ + pfdev_info->major_fp_hsi = ETH_HSI_VER_MAJOR; + pfdev_info->minor_fp_hsi = ETH_HSI_VER_MINOR; + + if (vf->state != VF_FREE && vf->state != VF_STOPPED) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d] sent ACQUIRE but is already in state %d - fail request\n", + vf->abs_vf_id, vf->state); + goto out; + } + + /* Validate FW compatibility */ + if (req->vfdev_info.eth_fp_hsi_major != ETH_HSI_VER_MAJOR) { + if (req->vfdev_info.capabilities & + VFPF_ACQUIRE_CAP_PRE_FP_HSI) { + struct vf_pf_vfdev_info *p_vfdev = &req->vfdev_info; + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d] is pre-fastpath HSI\n", + vf->abs_vf_id); + p_vfdev->eth_fp_hsi_major = ETH_HSI_VER_MAJOR; + p_vfdev->eth_fp_hsi_minor = ETH_HSI_VER_NO_PKT_LEN_TUNN; + } else { + DP_INFO(p_hwfn, + "VF[%d] needs fastpath HSI %02x.%02x, which is incompatible with loaded FW's faspath HSI %02x.%02x\n", + vf->abs_vf_id, + req->vfdev_info.eth_fp_hsi_major, + req->vfdev_info.eth_fp_hsi_minor, + ETH_HSI_VER_MAJOR, ETH_HSI_VER_MINOR); + + goto out; + } + } + + /* On 100g PFs, prevent old VFs from loading */ + if ((p_hwfn->cdev->num_hwfns > 1) && + !(req->vfdev_info.capabilities & VFPF_ACQUIRE_CAP_100G)) { + DP_INFO(p_hwfn, + "VF[%d] is running an old driver that doesn't support 100g\n", + vf->abs_vf_id); + goto out; + } + + /* Store the acquire message */ + memcpy(&vf->acquire, req, sizeof(vf->acquire)); + + vf->opaque_fid = req->vfdev_info.opaque_fid; + + vf->vf_bulletin = req->bulletin_addr; + vf->bulletin.size = (vf->bulletin.size < req->bulletin_size) ? + vf->bulletin.size : req->bulletin_size; + + /* fill in pfdev info */ + pfdev_info->chip_num = p_hwfn->cdev->chip_num; + pfdev_info->db_size = 0; + pfdev_info->indices_per_sb = PIS_PER_SB_E4; + + pfdev_info->capabilities = PFVF_ACQUIRE_CAP_DEFAULT_UNTAGGED | + PFVF_ACQUIRE_CAP_POST_FW_OVERRIDE; + if (p_hwfn->cdev->num_hwfns > 1) + pfdev_info->capabilities |= PFVF_ACQUIRE_CAP_100G; + + /* Share our ability to use multiple queue-ids only with VFs + * that request it. + */ + if (req->vfdev_info.capabilities & VFPF_ACQUIRE_CAP_QUEUE_QIDS) + pfdev_info->capabilities |= PFVF_ACQUIRE_CAP_QUEUE_QIDS; + + /* Share the sizes of the bars with VF */ + resp->pfdev_info.bar_size = qed_iov_vf_db_bar_size(p_hwfn, p_ptt); + + qed_iov_vf_mbx_acquire_stats(p_hwfn, &pfdev_info->stats_info); + + memcpy(pfdev_info->port_mac, p_hwfn->hw_info.hw_mac_addr, ETH_ALEN); + + pfdev_info->fw_major = FW_MAJOR_VERSION; + pfdev_info->fw_minor = FW_MINOR_VERSION; + pfdev_info->fw_rev = FW_REVISION_VERSION; + pfdev_info->fw_eng = FW_ENGINEERING_VERSION; + + /* Incorrect when legacy, but doesn't matter as legacy isn't reading + * this field. + */ + pfdev_info->minor_fp_hsi = min_t(u8, ETH_HSI_VER_MINOR, + req->vfdev_info.eth_fp_hsi_minor); + pfdev_info->os_type = VFPF_ACQUIRE_OS_LINUX; + qed_mcp_get_mfw_ver(p_hwfn, p_ptt, &pfdev_info->mfw_ver, NULL); + + pfdev_info->dev_type = p_hwfn->cdev->type; + pfdev_info->chip_rev = p_hwfn->cdev->chip_rev; + + /* Fill resources available to VF; Make sure there are enough to + * satisfy the VF's request. 
+ */ + vfpf_status = qed_iov_vf_mbx_acquire_resc(p_hwfn, p_ptt, vf, + &req->resc_request, resc); + if (vfpf_status != PFVF_STATUS_SUCCESS) + goto out; + + /* Start the VF in FW */ + rc = qed_sp_vf_start(p_hwfn, vf); + if (rc) { + DP_NOTICE(p_hwfn, "Failed to start VF[%02x]\n", vf->abs_vf_id); + vfpf_status = PFVF_STATUS_FAILURE; + goto out; + } + + /* Fill agreed size of bulletin board in response */ + resp->bulletin_size = vf->bulletin.size; + qed_iov_post_vf_bulletin(p_hwfn, vf->relative_vf_id, p_ptt); + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d] ACQUIRE_RESPONSE: pfdev_info- chip_num=0x%x, db_size=%d, idx_per_sb=%d, pf_cap=0x%llx\n" + "resources- n_rxq-%d, n_txq-%d, n_sbs-%d, n_macs-%d, n_vlans-%d\n", + vf->abs_vf_id, + resp->pfdev_info.chip_num, + resp->pfdev_info.db_size, + resp->pfdev_info.indices_per_sb, + resp->pfdev_info.capabilities, + resc->num_rxqs, + resc->num_txqs, + resc->num_sbs, + resc->num_mac_filters, + resc->num_vlan_filters); + vf->state = VF_ACQUIRED; + + /* Prepare Response */ +out: + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_ACQUIRE, + sizeof(struct pfvf_acquire_resp_tlv), vfpf_status); +} + +static int __qed_iov_spoofchk_set(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, bool val) +{ + struct qed_sp_vport_update_params params; + int rc; + + if (val == p_vf->spoof_chk) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Spoofchk value[%d] is already configured\n", val); + return 0; + } + + memset(¶ms, 0, sizeof(struct qed_sp_vport_update_params)); + params.opaque_fid = p_vf->opaque_fid; + params.vport_id = p_vf->vport_id; + params.update_anti_spoofing_en_flg = 1; + params.anti_spoofing_en = val; + + rc = qed_sp_vport_update(p_hwfn, ¶ms, QED_SPQ_MODE_EBLOCK, NULL); + if (!rc) { + p_vf->spoof_chk = val; + p_vf->req_spoofchk_val = p_vf->spoof_chk; + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Spoofchk val[%d] configured\n", val); + } else { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Spoofchk configuration[val:%d] failed for VF[%d]\n", + val, p_vf->relative_vf_id); + } + + return rc; +} + +static int qed_iov_reconfigure_unicast_vlan(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf) +{ + struct qed_filter_ucast filter; + int rc = 0; + int i; + + memset(&filter, 0, sizeof(filter)); + filter.is_rx_filter = 1; + filter.is_tx_filter = 1; + filter.vport_to_add_to = p_vf->vport_id; + filter.opcode = QED_FILTER_ADD; + + /* Reconfigure vlans */ + for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++) { + if (!p_vf->shadow_config.vlans[i].used) + continue; + + filter.type = QED_FILTER_VLAN; + filter.vlan = p_vf->shadow_config.vlans[i].vid; + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Reconfiguring VLAN [0x%04x] for VF [%04x]\n", + filter.vlan, p_vf->relative_vf_id); + rc = qed_sp_eth_filter_ucast(p_hwfn, p_vf->opaque_fid, + &filter, QED_SPQ_MODE_CB, NULL); + if (rc) { + DP_NOTICE(p_hwfn, + "Failed to configure VLAN [%04x] to VF [%04x]\n", + filter.vlan, p_vf->relative_vf_id); + break; + } + } + + return rc; +} + +static int +qed_iov_reconfigure_unicast_shadow(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, u64 events) +{ + int rc = 0; + + if ((events & BIT(VLAN_ADDR_FORCED)) && + !(p_vf->configured_features & (1 << VLAN_ADDR_FORCED))) + rc = qed_iov_reconfigure_unicast_vlan(p_hwfn, p_vf); + + return rc; +} + +static int qed_iov_configure_vport_forced(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, u64 events) +{ + int rc = 0; + struct qed_filter_ucast filter; + + if (!p_vf->vport_instance) + return -EINVAL; + + if (events & BIT(MAC_ADDR_FORCED)) { + /* Since there's no way [currently] 
of removing the MAC, + * we can always assume this means we need to force it. + */ + memset(&filter, 0, sizeof(filter)); + filter.type = QED_FILTER_MAC; + filter.opcode = QED_FILTER_REPLACE; + filter.is_rx_filter = 1; + filter.is_tx_filter = 1; + filter.vport_to_add_to = p_vf->vport_id; + ether_addr_copy(filter.mac, p_vf->bulletin.p_virt->mac); + + rc = qed_sp_eth_filter_ucast(p_hwfn, p_vf->opaque_fid, + &filter, QED_SPQ_MODE_CB, NULL); + if (rc) { + DP_NOTICE(p_hwfn, + "PF failed to configure MAC for VF\n"); + return rc; + } + + p_vf->configured_features |= 1 << MAC_ADDR_FORCED; + } + + if (events & BIT(VLAN_ADDR_FORCED)) { + struct qed_sp_vport_update_params vport_update; + u8 removal; + int i; + + memset(&filter, 0, sizeof(filter)); + filter.type = QED_FILTER_VLAN; + filter.is_rx_filter = 1; + filter.is_tx_filter = 1; + filter.vport_to_add_to = p_vf->vport_id; + filter.vlan = p_vf->bulletin.p_virt->pvid; + filter.opcode = filter.vlan ? QED_FILTER_REPLACE : + QED_FILTER_FLUSH; + + /* Send the ramrod */ + rc = qed_sp_eth_filter_ucast(p_hwfn, p_vf->opaque_fid, + &filter, QED_SPQ_MODE_CB, NULL); + if (rc) { + DP_NOTICE(p_hwfn, + "PF failed to configure VLAN for VF\n"); + return rc; + } + + /* Update the default-vlan & silent vlan stripping */ + memset(&vport_update, 0, sizeof(vport_update)); + vport_update.opaque_fid = p_vf->opaque_fid; + vport_update.vport_id = p_vf->vport_id; + vport_update.update_default_vlan_enable_flg = 1; + vport_update.default_vlan_enable_flg = filter.vlan ? 1 : 0; + vport_update.update_default_vlan_flg = 1; + vport_update.default_vlan = filter.vlan; + + vport_update.update_inner_vlan_removal_flg = 1; + removal = filter.vlan ? 1 + : p_vf->shadow_config.inner_vlan_removal; + vport_update.inner_vlan_removal_flg = removal; + vport_update.silent_vlan_removal_flg = filter.vlan ? 1 : 0; + rc = qed_sp_vport_update(p_hwfn, + &vport_update, + QED_SPQ_MODE_EBLOCK, NULL); + if (rc) { + DP_NOTICE(p_hwfn, + "PF failed to configure VF vport for vlan\n"); + return rc; + } + + /* Update all the Rx queues */ + for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++) { + struct qed_vf_queue *p_queue = &p_vf->vf_queues[i]; + struct qed_queue_cid *p_cid = NULL; + + /* There can be at most 1 Rx queue on qzone. Find it */ + p_cid = qed_iov_get_vf_rx_queue_cid(p_queue); + if (!p_cid) + continue; + + rc = qed_sp_eth_rx_queues_update(p_hwfn, + (void **)&p_cid, + 1, 0, 1, + QED_SPQ_MODE_EBLOCK, + NULL); + if (rc) { + DP_NOTICE(p_hwfn, + "Failed to send Rx update fo queue[0x%04x]\n", + p_cid->rel.queue_id); + return rc; + } + } + + if (filter.vlan) + p_vf->configured_features |= 1 << VLAN_ADDR_FORCED; + else + p_vf->configured_features &= ~BIT(VLAN_ADDR_FORCED); + } + + /* If forced features are terminated, we need to configure the shadow + * configuration back again. 
+ */ + if (events) + qed_iov_reconfigure_unicast_shadow(p_hwfn, p_vf, events); + + return rc; +} + +static void qed_iov_vf_mbx_start_vport(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + struct qed_sp_vport_start_params params = { 0 }; + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + struct vfpf_vport_start_tlv *start; + u8 status = PFVF_STATUS_SUCCESS; + struct qed_vf_info *vf_info; + u64 *p_bitmap; + int sb_id; + int rc; + + vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vf->relative_vf_id, true); + if (!vf_info) { + DP_NOTICE(p_hwfn->cdev, + "Failed to get VF info, invalid vfid [%d]\n", + vf->relative_vf_id); + return; + } + + vf->state = VF_ENABLED; + start = &mbx->req_virt->start_vport; + + qed_iov_enable_vf_traffic(p_hwfn, p_ptt, vf); + + /* Initialize Status block in CAU */ + for (sb_id = 0; sb_id < vf->num_sbs; sb_id++) { + if (!start->sb_addr[sb_id]) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d] did not fill the address of SB %d\n", + vf->relative_vf_id, sb_id); + break; + } + + qed_int_cau_conf_sb(p_hwfn, p_ptt, + start->sb_addr[sb_id], + vf->igu_sbs[sb_id], vf->abs_vf_id, 1); + } + + vf->mtu = start->mtu; + vf->shadow_config.inner_vlan_removal = start->inner_vlan_removal; + + /* Take into consideration configuration forced by hypervisor; + * If none is configured, use the supplied VF values [for old + * vfs that would still be fine, since they passed '0' as padding]. + */ + p_bitmap = &vf_info->bulletin.p_virt->valid_bitmap; + if (!(*p_bitmap & BIT(VFPF_BULLETIN_UNTAGGED_DEFAULT_FORCED))) { + u8 vf_req = start->only_untagged; + + vf_info->bulletin.p_virt->default_only_untagged = vf_req; + *p_bitmap |= 1 << VFPF_BULLETIN_UNTAGGED_DEFAULT; + } + + params.tpa_mode = start->tpa_mode; + params.remove_inner_vlan = start->inner_vlan_removal; + params.tx_switching = true; + + params.only_untagged = vf_info->bulletin.p_virt->default_only_untagged; + params.drop_ttl0 = false; + params.concrete_fid = vf->concrete_fid; + params.opaque_fid = vf->opaque_fid; + params.vport_id = vf->vport_id; + params.max_buffers_per_cqe = start->max_buffers_per_cqe; + params.mtu = vf->mtu; + params.check_mac = true; + + rc = qed_sp_eth_vport_start(p_hwfn, ¶ms); + if (rc) { + DP_ERR(p_hwfn, + "qed_iov_vf_mbx_start_vport returned error %d\n", rc); + status = PFVF_STATUS_FAILURE; + } else { + vf->vport_instance++; + + /* Force configuration if needed on the newly opened vport */ + qed_iov_configure_vport_forced(p_hwfn, vf, *p_bitmap); + + __qed_iov_spoofchk_set(p_hwfn, vf, vf->req_spoofchk_val); + } + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_VPORT_START, + sizeof(struct pfvf_def_resp_tlv), status); +} + +static void qed_iov_vf_mbx_stop_vport(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + u8 status = PFVF_STATUS_SUCCESS; + int rc; + + vf->vport_instance--; + vf->spoof_chk = false; + + if ((qed_iov_validate_active_rxq(p_hwfn, vf)) || + (qed_iov_validate_active_txq(p_hwfn, vf))) { + vf->b_malicious = true; + DP_NOTICE(p_hwfn, + "VF [%02x] - considered malicious; Unable to stop RX/TX queuess\n", + vf->abs_vf_id); + status = PFVF_STATUS_MALICIOUS; + goto out; + } + + rc = qed_sp_vport_stop(p_hwfn, vf->opaque_fid, vf->vport_id); + if (rc) { + DP_ERR(p_hwfn, "qed_iov_vf_mbx_stop_vport returned error %d\n", + rc); + status = PFVF_STATUS_FAILURE; + } + + /* Forget the configuration on the vport */ + vf->configured_features = 0; + memset(&vf->shadow_config, 0, sizeof(vf->shadow_config)); + +out: + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, 
CHANNEL_TLV_VPORT_TEARDOWN, + sizeof(struct pfvf_def_resp_tlv), status); +} + +static void qed_iov_vf_mbx_start_rxq_resp(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf, + u8 status, bool b_legacy) +{ + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + struct pfvf_start_queue_resp_tlv *p_tlv; + struct vfpf_start_rxq_tlv *req; + u16 length; + + mbx->offset = (u8 *)mbx->reply_virt; + + /* Taking a bigger struct instead of adding a TLV to list was a + * mistake, but one which we're now stuck with, as some older + * clients assume the size of the previous response. + */ + if (!b_legacy) + length = sizeof(*p_tlv); + else + length = sizeof(struct pfvf_def_resp_tlv); + + p_tlv = qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_START_RXQ, + length); + qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + /* Update the TLV with the response */ + if ((status == PFVF_STATUS_SUCCESS) && !b_legacy) { + req = &mbx->req_virt->start_rxq; + p_tlv->offset = PXP_VF_BAR0_START_MSDM_ZONE_B + + offsetof(struct mstorm_vf_zone, + non_trigger.eth_rx_queue_producers) + + sizeof(struct eth_rx_prod_data) * req->rx_qid; + } + + qed_iov_send_response(p_hwfn, p_ptt, vf, length, status); +} + +static u8 qed_iov_vf_mbx_qid(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, bool b_is_tx) +{ + struct qed_iov_vf_mbx *p_mbx = &p_vf->vf_mbx; + struct vfpf_qid_tlv *p_qid_tlv; + + /* Search for the qid if the VF published its going to provide it */ + if (!(p_vf->acquire.vfdev_info.capabilities & + VFPF_ACQUIRE_CAP_QUEUE_QIDS)) { + if (b_is_tx) + return QED_IOV_LEGACY_QID_TX; + else + return QED_IOV_LEGACY_QID_RX; + } + + p_qid_tlv = (struct vfpf_qid_tlv *) + qed_iov_search_list_tlvs(p_hwfn, p_mbx->req_virt, + CHANNEL_TLV_QID); + if (!p_qid_tlv) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%2x]: Failed to provide qid\n", + p_vf->relative_vf_id); + + return QED_IOV_QID_INVALID; + } + + if (p_qid_tlv->qid >= MAX_QUEUES_PER_QZONE) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%02x]: Provided qid out-of-bounds %02x\n", + p_vf->relative_vf_id, p_qid_tlv->qid); + return QED_IOV_QID_INVALID; + } + + return p_qid_tlv->qid; +} + +static void qed_iov_vf_mbx_start_rxq(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + struct qed_queue_start_common_params params; + struct qed_queue_cid_vf_params vf_params; + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + u8 status = PFVF_STATUS_NO_RESOURCE; + u8 qid_usage_idx, vf_legacy = 0; + struct vfpf_start_rxq_tlv *req; + struct qed_vf_queue *p_queue; + struct qed_queue_cid *p_cid; + struct qed_sb_info sb_dummy; + int rc; + + req = &mbx->req_virt->start_rxq; + + if (!qed_iov_validate_rxq(p_hwfn, vf, req->rx_qid, + QED_IOV_VALIDATE_Q_DISABLE) || + !qed_iov_validate_sb(p_hwfn, vf, req->hw_sb)) + goto out; + + qid_usage_idx = qed_iov_vf_mbx_qid(p_hwfn, vf, false); + if (qid_usage_idx == QED_IOV_QID_INVALID) + goto out; + + p_queue = &vf->vf_queues[req->rx_qid]; + if (p_queue->cids[qid_usage_idx].p_cid) + goto out; + + vf_legacy = qed_vf_calculate_legacy(vf); + + /* Acquire a new queue-cid */ + memset(¶ms, 0, sizeof(params)); + params.queue_id = p_queue->fw_rx_qid; + params.vport_id = vf->vport_id; + params.stats_id = vf->abs_vf_id + 0x10; + /* Since IGU index is passed via sb_info, construct a dummy one */ + memset(&sb_dummy, 0, sizeof(sb_dummy)); + sb_dummy.igu_sb_id = req->hw_sb; + params.p_sb = &sb_dummy; + params.sb_idx = req->sb_index; + + memset(&vf_params, 0, sizeof(vf_params)); + vf_params.vfid = 
vf->relative_vf_id; + vf_params.vf_qid = (u8)req->rx_qid; + vf_params.vf_legacy = vf_legacy; + vf_params.qid_usage_idx = qid_usage_idx; + p_cid = qed_eth_queue_to_cid(p_hwfn, vf->opaque_fid, + ¶ms, true, &vf_params); + if (!p_cid) + goto out; + + /* Legacy VFs have their Producers in a different location, which they + * calculate on their own and clean the producer prior to this. + */ + if (!(vf_legacy & QED_QCID_LEGACY_VF_RX_PROD)) + REG_WR(p_hwfn, + GTT_BAR0_MAP_REG_MSDM_RAM + + MSTORM_ETH_VF_PRODS_OFFSET(vf->abs_vf_id, req->rx_qid), + 0); + + rc = qed_eth_rxq_start_ramrod(p_hwfn, p_cid, + req->bd_max_bytes, + req->rxq_addr, + req->cqe_pbl_addr, req->cqe_pbl_size); + if (rc) { + status = PFVF_STATUS_FAILURE; + qed_eth_queue_cid_release(p_hwfn, p_cid); + } else { + p_queue->cids[qid_usage_idx].p_cid = p_cid; + p_queue->cids[qid_usage_idx].b_is_tx = false; + status = PFVF_STATUS_SUCCESS; + vf->num_active_rxqs++; + } + +out: + qed_iov_vf_mbx_start_rxq_resp(p_hwfn, p_ptt, vf, status, + !!(vf_legacy & + QED_QCID_LEGACY_VF_RX_PROD)); +} + +static void +qed_iov_pf_update_tun_response(struct pfvf_update_tunn_param_tlv *p_resp, + struct qed_tunnel_info *p_tun, + u16 tunn_feature_mask) +{ + p_resp->tunn_feature_mask = tunn_feature_mask; + p_resp->vxlan_mode = p_tun->vxlan.b_mode_enabled; + p_resp->l2geneve_mode = p_tun->l2_geneve.b_mode_enabled; + p_resp->ipgeneve_mode = p_tun->ip_geneve.b_mode_enabled; + p_resp->l2gre_mode = p_tun->l2_gre.b_mode_enabled; + p_resp->ipgre_mode = p_tun->l2_gre.b_mode_enabled; + p_resp->vxlan_clss = p_tun->vxlan.tun_cls; + p_resp->l2gre_clss = p_tun->l2_gre.tun_cls; + p_resp->ipgre_clss = p_tun->ip_gre.tun_cls; + p_resp->l2geneve_clss = p_tun->l2_geneve.tun_cls; + p_resp->ipgeneve_clss = p_tun->ip_geneve.tun_cls; + p_resp->geneve_udp_port = p_tun->geneve_port.port; + p_resp->vxlan_udp_port = p_tun->vxlan_port.port; +} + +static void +__qed_iov_pf_update_tun_param(struct vfpf_update_tunn_param_tlv *p_req, + struct qed_tunn_update_type *p_tun, + enum qed_tunn_mode mask, u8 tun_cls) +{ + if (p_req->tun_mode_update_mask & BIT(mask)) { + p_tun->b_update_mode = true; + + if (p_req->tunn_mode & BIT(mask)) + p_tun->b_mode_enabled = true; + } + + p_tun->tun_cls = tun_cls; +} + +static void +qed_iov_pf_update_tun_param(struct vfpf_update_tunn_param_tlv *p_req, + struct qed_tunn_update_type *p_tun, + struct qed_tunn_update_udp_port *p_port, + enum qed_tunn_mode mask, + u8 tun_cls, u8 update_port, u16 port) +{ + if (update_port) { + p_port->b_update_port = true; + p_port->port = port; + } + + __qed_iov_pf_update_tun_param(p_req, p_tun, mask, tun_cls); +} + +static bool +qed_iov_pf_validate_tunn_param(struct vfpf_update_tunn_param_tlv *p_req) +{ + bool b_update_requested = false; + + if (p_req->tun_mode_update_mask || p_req->update_tun_cls || + p_req->update_geneve_port || p_req->update_vxlan_port) + b_update_requested = true; + + return b_update_requested; +} + +static void qed_pf_validate_tunn_mode(struct qed_tunn_update_type *tun, int *rc) +{ + if (tun->b_update_mode && !tun->b_mode_enabled) { + tun->b_update_mode = false; + *rc = -EINVAL; + } +} + +static int +qed_pf_validate_modify_tunn_config(struct qed_hwfn *p_hwfn, + u16 *tun_features, bool *update, + struct qed_tunnel_info *tun_src) +{ + struct qed_eth_cb_ops *ops = p_hwfn->cdev->protocol_ops.eth; + struct qed_tunnel_info *tun = &p_hwfn->cdev->tunnel; + u16 bultn_vxlan_port, bultn_geneve_port; + void *cookie = p_hwfn->cdev->ops_cookie; + int i, rc = 0; + + *tun_features = p_hwfn->cdev->tunn_feature_mask; + 
bultn_vxlan_port = tun->vxlan_port.port; + bultn_geneve_port = tun->geneve_port.port; + qed_pf_validate_tunn_mode(&tun_src->vxlan, &rc); + qed_pf_validate_tunn_mode(&tun_src->l2_geneve, &rc); + qed_pf_validate_tunn_mode(&tun_src->ip_geneve, &rc); + qed_pf_validate_tunn_mode(&tun_src->l2_gre, &rc); + qed_pf_validate_tunn_mode(&tun_src->ip_gre, &rc); + + if ((tun_src->b_update_rx_cls || tun_src->b_update_tx_cls) && + (tun_src->vxlan.tun_cls != QED_TUNN_CLSS_MAC_VLAN || + tun_src->l2_geneve.tun_cls != QED_TUNN_CLSS_MAC_VLAN || + tun_src->ip_geneve.tun_cls != QED_TUNN_CLSS_MAC_VLAN || + tun_src->l2_gre.tun_cls != QED_TUNN_CLSS_MAC_VLAN || + tun_src->ip_gre.tun_cls != QED_TUNN_CLSS_MAC_VLAN)) { + tun_src->b_update_rx_cls = false; + tun_src->b_update_tx_cls = false; + rc = -EINVAL; + } + + if (tun_src->vxlan_port.b_update_port) { + if (tun_src->vxlan_port.port == tun->vxlan_port.port) { + tun_src->vxlan_port.b_update_port = false; + } else { + *update = true; + bultn_vxlan_port = tun_src->vxlan_port.port; + } + } + + if (tun_src->geneve_port.b_update_port) { + if (tun_src->geneve_port.port == tun->geneve_port.port) { + tun_src->geneve_port.b_update_port = false; + } else { + *update = true; + bultn_geneve_port = tun_src->geneve_port.port; + } + } + + qed_for_each_vf(p_hwfn, i) { + qed_iov_bulletin_set_udp_ports(p_hwfn, i, bultn_vxlan_port, + bultn_geneve_port); + } + + qed_schedule_iov(p_hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG); + ops->ports_update(cookie, bultn_vxlan_port, bultn_geneve_port); + + return rc; +} + +static void qed_iov_vf_mbx_update_tunn_param(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *p_vf) +{ + struct qed_tunnel_info *p_tun = &p_hwfn->cdev->tunnel; + struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx; + struct pfvf_update_tunn_param_tlv *p_resp; + struct vfpf_update_tunn_param_tlv *p_req; + u8 status = PFVF_STATUS_SUCCESS; + bool b_update_required = false; + struct qed_tunnel_info tunn; + u16 tunn_feature_mask = 0; + int i, rc = 0; + + mbx->offset = (u8 *)mbx->reply_virt; + + memset(&tunn, 0, sizeof(tunn)); + p_req = &mbx->req_virt->tunn_param_update; + + if (!qed_iov_pf_validate_tunn_param(p_req)) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "No tunnel update requested by VF\n"); + status = PFVF_STATUS_FAILURE; + goto send_resp; + } + + tunn.b_update_rx_cls = p_req->update_tun_cls; + tunn.b_update_tx_cls = p_req->update_tun_cls; + + qed_iov_pf_update_tun_param(p_req, &tunn.vxlan, &tunn.vxlan_port, + QED_MODE_VXLAN_TUNN, p_req->vxlan_clss, + p_req->update_vxlan_port, + p_req->vxlan_port); + qed_iov_pf_update_tun_param(p_req, &tunn.l2_geneve, &tunn.geneve_port, + QED_MODE_L2GENEVE_TUNN, + p_req->l2geneve_clss, + p_req->update_geneve_port, + p_req->geneve_port); + __qed_iov_pf_update_tun_param(p_req, &tunn.ip_geneve, + QED_MODE_IPGENEVE_TUNN, + p_req->ipgeneve_clss); + __qed_iov_pf_update_tun_param(p_req, &tunn.l2_gre, + QED_MODE_L2GRE_TUNN, p_req->l2gre_clss); + __qed_iov_pf_update_tun_param(p_req, &tunn.ip_gre, + QED_MODE_IPGRE_TUNN, p_req->ipgre_clss); + + /* If PF modifies VF's req then it should + * still return an error in case of partial configuration + * or modified configuration as opposed to requested one. + */ + rc = qed_pf_validate_modify_tunn_config(p_hwfn, &tunn_feature_mask, + &b_update_required, &tunn); + + if (rc) + status = PFVF_STATUS_FAILURE; + + /* If QED client is willing to update anything ? 
*/ + if (b_update_required) { + u16 geneve_port; + + rc = qed_sp_pf_update_tunn_cfg(p_hwfn, p_ptt, &tunn, + QED_SPQ_MODE_EBLOCK, NULL); + if (rc) + status = PFVF_STATUS_FAILURE; + + geneve_port = p_tun->geneve_port.port; + qed_for_each_vf(p_hwfn, i) { + qed_iov_bulletin_set_udp_ports(p_hwfn, i, + p_tun->vxlan_port.port, + geneve_port); + } + } + +send_resp: + p_resp = qed_add_tlv(p_hwfn, &mbx->offset, + CHANNEL_TLV_UPDATE_TUNN_PARAM, sizeof(*p_resp)); + + qed_iov_pf_update_tun_response(p_resp, p_tun, tunn_feature_mask); + qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + qed_iov_send_response(p_hwfn, p_ptt, p_vf, sizeof(*p_resp), status); +} + +static void qed_iov_vf_mbx_start_txq_resp(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *p_vf, + u32 cid, u8 status) +{ + struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx; + struct pfvf_start_queue_resp_tlv *p_tlv; + bool b_legacy = false; + u16 length; + + mbx->offset = (u8 *)mbx->reply_virt; + + /* Taking a bigger struct instead of adding a TLV to list was a + * mistake, but one which we're now stuck with, as some older + * clients assume the size of the previous response. + */ + if (p_vf->acquire.vfdev_info.eth_fp_hsi_minor == + ETH_HSI_VER_NO_PKT_LEN_TUNN) + b_legacy = true; + + if (!b_legacy) + length = sizeof(*p_tlv); + else + length = sizeof(struct pfvf_def_resp_tlv); + + p_tlv = qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_START_TXQ, + length); + qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + /* Update the TLV with the response */ + if ((status == PFVF_STATUS_SUCCESS) && !b_legacy) + p_tlv->offset = qed_db_addr_vf(cid, DQ_DEMS_LEGACY); + + qed_iov_send_response(p_hwfn, p_ptt, p_vf, length, status); +} + +static void qed_iov_vf_mbx_start_txq(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + struct qed_queue_start_common_params params; + struct qed_queue_cid_vf_params vf_params; + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + u8 status = PFVF_STATUS_NO_RESOURCE; + struct vfpf_start_txq_tlv *req; + struct qed_vf_queue *p_queue; + struct qed_queue_cid *p_cid; + struct qed_sb_info sb_dummy; + u8 qid_usage_idx, vf_legacy; + u32 cid = 0; + int rc; + u16 pq; + + memset(¶ms, 0, sizeof(params)); + req = &mbx->req_virt->start_txq; + + if (!qed_iov_validate_txq(p_hwfn, vf, req->tx_qid, + QED_IOV_VALIDATE_Q_NA) || + !qed_iov_validate_sb(p_hwfn, vf, req->hw_sb)) + goto out; + + qid_usage_idx = qed_iov_vf_mbx_qid(p_hwfn, vf, true); + if (qid_usage_idx == QED_IOV_QID_INVALID) + goto out; + + p_queue = &vf->vf_queues[req->tx_qid]; + if (p_queue->cids[qid_usage_idx].p_cid) + goto out; + + vf_legacy = qed_vf_calculate_legacy(vf); + + /* Acquire a new queue-cid */ + params.queue_id = p_queue->fw_tx_qid; + params.vport_id = vf->vport_id; + params.stats_id = vf->abs_vf_id + 0x10; + + /* Since IGU index is passed via sb_info, construct a dummy one */ + memset(&sb_dummy, 0, sizeof(sb_dummy)); + sb_dummy.igu_sb_id = req->hw_sb; + params.p_sb = &sb_dummy; + params.sb_idx = req->sb_index; + + memset(&vf_params, 0, sizeof(vf_params)); + vf_params.vfid = vf->relative_vf_id; + vf_params.vf_qid = (u8)req->tx_qid; + vf_params.vf_legacy = vf_legacy; + vf_params.qid_usage_idx = qid_usage_idx; + + p_cid = qed_eth_queue_to_cid(p_hwfn, vf->opaque_fid, + ¶ms, false, &vf_params); + if (!p_cid) + goto out; + + pq = qed_get_cm_pq_idx_vf(p_hwfn, vf->relative_vf_id); + rc = qed_eth_txq_start_ramrod(p_hwfn, p_cid, + req->pbl_addr, 
req->pbl_size, pq); + if (rc) { + status = PFVF_STATUS_FAILURE; + qed_eth_queue_cid_release(p_hwfn, p_cid); + } else { + status = PFVF_STATUS_SUCCESS; + p_queue->cids[qid_usage_idx].p_cid = p_cid; + p_queue->cids[qid_usage_idx].b_is_tx = true; + cid = p_cid->cid; + } + +out: + qed_iov_vf_mbx_start_txq_resp(p_hwfn, p_ptt, vf, cid, status); +} + +static int qed_iov_vf_stop_rxqs(struct qed_hwfn *p_hwfn, + struct qed_vf_info *vf, + u16 rxq_id, + u8 qid_usage_idx, bool cqe_completion) +{ + struct qed_vf_queue *p_queue; + int rc = 0; + + if (!qed_iov_validate_rxq(p_hwfn, vf, rxq_id, QED_IOV_VALIDATE_Q_NA)) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d] Tried Closing Rx 0x%04x.%02x which is inactive\n", + vf->relative_vf_id, rxq_id, qid_usage_idx); + return -EINVAL; + } + + p_queue = &vf->vf_queues[rxq_id]; + + /* We've validated the index and the existence of the active RXQ - + * now we need to make sure that it's using the correct qid. + */ + if (!p_queue->cids[qid_usage_idx].p_cid || + p_queue->cids[qid_usage_idx].b_is_tx) { + struct qed_queue_cid *p_cid; + + p_cid = qed_iov_get_vf_rx_queue_cid(p_queue); + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d] - Tried Closing Rx 0x%04x.%02x, but Rx is at %04x.%02x\n", + vf->relative_vf_id, + rxq_id, qid_usage_idx, rxq_id, p_cid->qid_usage_idx); + return -EINVAL; + } + + /* Now that we know we have a valid Rx-queue - close it */ + rc = qed_eth_rx_queue_stop(p_hwfn, + p_queue->cids[qid_usage_idx].p_cid, + false, cqe_completion); + if (rc) + return rc; + + p_queue->cids[qid_usage_idx].p_cid = NULL; + vf->num_active_rxqs--; + + return 0; +} + +static int qed_iov_vf_stop_txqs(struct qed_hwfn *p_hwfn, + struct qed_vf_info *vf, + u16 txq_id, u8 qid_usage_idx) +{ + struct qed_vf_queue *p_queue; + int rc = 0; + + if (!qed_iov_validate_txq(p_hwfn, vf, txq_id, QED_IOV_VALIDATE_Q_NA)) + return -EINVAL; + + p_queue = &vf->vf_queues[txq_id]; + if (!p_queue->cids[qid_usage_idx].p_cid || + !p_queue->cids[qid_usage_idx].b_is_tx) + return -EINVAL; + + rc = qed_eth_tx_queue_stop(p_hwfn, p_queue->cids[qid_usage_idx].p_cid); + if (rc) + return rc; + + p_queue->cids[qid_usage_idx].p_cid = NULL; + return 0; +} + +static void qed_iov_vf_mbx_stop_rxqs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + u16 length = sizeof(struct pfvf_def_resp_tlv); + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + u8 status = PFVF_STATUS_FAILURE; + struct vfpf_stop_rxqs_tlv *req; + u8 qid_usage_idx; + int rc; + + /* There has never been an official driver that used this interface + * for stopping multiple queues, and it is now considered deprecated. + * Validate this isn't used here. 
+ */ + req = &mbx->req_virt->stop_rxqs; + if (req->num_rxqs != 1) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Odd; VF[%d] tried stopping multiple Rx queues\n", + vf->relative_vf_id); + status = PFVF_STATUS_NOT_SUPPORTED; + goto out; + } + + /* Find which qid-index is associated with the queue */ + qid_usage_idx = qed_iov_vf_mbx_qid(p_hwfn, vf, false); + if (qid_usage_idx == QED_IOV_QID_INVALID) + goto out; + + rc = qed_iov_vf_stop_rxqs(p_hwfn, vf, req->rx_qid, + qid_usage_idx, req->cqe_completion); + if (!rc) + status = PFVF_STATUS_SUCCESS; +out: + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_STOP_RXQS, + length, status); +} + +static void qed_iov_vf_mbx_stop_txqs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + u16 length = sizeof(struct pfvf_def_resp_tlv); + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + u8 status = PFVF_STATUS_FAILURE; + struct vfpf_stop_txqs_tlv *req; + u8 qid_usage_idx; + int rc; + + /* There has never been an official driver that used this interface + * for stopping multiple queues, and it is now considered deprecated. + * Validate this isn't used here. + */ + req = &mbx->req_virt->stop_txqs; + if (req->num_txqs != 1) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Odd; VF[%d] tried stopping multiple Tx queues\n", + vf->relative_vf_id); + status = PFVF_STATUS_NOT_SUPPORTED; + goto out; + } + + /* Find which qid-index is associated with the queue */ + qid_usage_idx = qed_iov_vf_mbx_qid(p_hwfn, vf, true); + if (qid_usage_idx == QED_IOV_QID_INVALID) + goto out; + + rc = qed_iov_vf_stop_txqs(p_hwfn, vf, req->tx_qid, qid_usage_idx); + if (!rc) + status = PFVF_STATUS_SUCCESS; + +out: + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_STOP_TXQS, + length, status); +} + +static void qed_iov_vf_mbx_update_rxqs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + struct qed_queue_cid *handlers[QED_MAX_VF_CHAINS_PER_PF]; + u16 length = sizeof(struct pfvf_def_resp_tlv); + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + struct vfpf_update_rxq_tlv *req; + u8 status = PFVF_STATUS_FAILURE; + u8 complete_event_flg; + u8 complete_cqe_flg; + u8 qid_usage_idx; + int rc; + u8 i; + + req = &mbx->req_virt->update_rxq; + complete_cqe_flg = !!(req->flags & VFPF_RXQ_UPD_COMPLETE_CQE_FLAG); + complete_event_flg = !!(req->flags & VFPF_RXQ_UPD_COMPLETE_EVENT_FLAG); + + qid_usage_idx = qed_iov_vf_mbx_qid(p_hwfn, vf, false); + if (qid_usage_idx == QED_IOV_QID_INVALID) + goto out; + + /* There shouldn't exist a VF that uses queue-qids yet uses this + * API with multiple Rx queues. Validate this. + */ + if ((vf->acquire.vfdev_info.capabilities & + VFPF_ACQUIRE_CAP_QUEUE_QIDS) && req->num_rxqs != 1) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d] supports QIDs but sends multiple queues\n", + vf->relative_vf_id); + goto out; + } + + /* Validate inputs - for the legacy case this is still true since + * qid_usage_idx for each Rx queue would be LEGACY_QID_RX. 
+ */ + for (i = req->rx_qid; i < req->rx_qid + req->num_rxqs; i++) { + if (!qed_iov_validate_rxq(p_hwfn, vf, i, + QED_IOV_VALIDATE_Q_NA) || + !vf->vf_queues[i].cids[qid_usage_idx].p_cid || + vf->vf_queues[i].cids[qid_usage_idx].b_is_tx) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d]: Incorrect Rxqs [%04x, %02x]\n", + vf->relative_vf_id, req->rx_qid, + req->num_rxqs); + goto out; + } + } + + /* Prepare the handlers */ + for (i = 0; i < req->num_rxqs; i++) { + u16 qid = req->rx_qid + i; + + handlers[i] = vf->vf_queues[qid].cids[qid_usage_idx].p_cid; + } + + rc = qed_sp_eth_rx_queues_update(p_hwfn, (void **)&handlers, + req->num_rxqs, + complete_cqe_flg, + complete_event_flg, + QED_SPQ_MODE_EBLOCK, NULL); + if (rc) + goto out; + + status = PFVF_STATUS_SUCCESS; +out: + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_UPDATE_RXQ, + length, status); +} + +void *qed_iov_search_list_tlvs(struct qed_hwfn *p_hwfn, + void *p_tlvs_list, u16 req_type) +{ + struct channel_tlv *p_tlv = (struct channel_tlv *)p_tlvs_list; + int len = 0; + + do { + if (!p_tlv->length) { + DP_NOTICE(p_hwfn, "Zero length TLV found\n"); + return NULL; + } + + if (p_tlv->type == req_type) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Extended tlv type %d, length %d found\n", + p_tlv->type, p_tlv->length); + return p_tlv; + } + + len += p_tlv->length; + p_tlv = (struct channel_tlv *)((u8 *)p_tlv + p_tlv->length); + + if ((len + p_tlv->length) > TLV_BUFFER_SIZE) { + DP_NOTICE(p_hwfn, "TLVs has overrun the buffer size\n"); + return NULL; + } + } while (p_tlv->type != CHANNEL_TLV_LIST_END); + + return NULL; +} + +static void +qed_iov_vp_update_act_param(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_data, + struct qed_iov_vf_mbx *p_mbx, u16 *tlvs_mask) +{ + struct vfpf_vport_update_activate_tlv *p_act_tlv; + u16 tlv = CHANNEL_TLV_VPORT_UPDATE_ACTIVATE; + + p_act_tlv = (struct vfpf_vport_update_activate_tlv *) + qed_iov_search_list_tlvs(p_hwfn, p_mbx->req_virt, tlv); + if (!p_act_tlv) + return; + + p_data->update_vport_active_rx_flg = p_act_tlv->update_rx; + p_data->vport_active_rx_flg = p_act_tlv->active_rx; + p_data->update_vport_active_tx_flg = p_act_tlv->update_tx; + p_data->vport_active_tx_flg = p_act_tlv->active_tx; + *tlvs_mask |= 1 << QED_IOV_VP_UPDATE_ACTIVATE; +} + +static void +qed_iov_vp_update_vlan_param(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_data, + struct qed_vf_info *p_vf, + struct qed_iov_vf_mbx *p_mbx, u16 *tlvs_mask) +{ + struct vfpf_vport_update_vlan_strip_tlv *p_vlan_tlv; + u16 tlv = CHANNEL_TLV_VPORT_UPDATE_VLAN_STRIP; + + p_vlan_tlv = (struct vfpf_vport_update_vlan_strip_tlv *) + qed_iov_search_list_tlvs(p_hwfn, p_mbx->req_virt, tlv); + if (!p_vlan_tlv) + return; + + p_vf->shadow_config.inner_vlan_removal = p_vlan_tlv->remove_vlan; + + /* Ignore the VF request if we're forcing a vlan */ + if (!(p_vf->configured_features & BIT(VLAN_ADDR_FORCED))) { + p_data->update_inner_vlan_removal_flg = 1; + p_data->inner_vlan_removal_flg = p_vlan_tlv->remove_vlan; + } + + *tlvs_mask |= 1 << QED_IOV_VP_UPDATE_VLAN_STRIP; +} + +static void +qed_iov_vp_update_tx_switch(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_data, + struct qed_iov_vf_mbx *p_mbx, u16 *tlvs_mask) +{ + struct vfpf_vport_update_tx_switch_tlv *p_tx_switch_tlv; + u16 tlv = CHANNEL_TLV_VPORT_UPDATE_TX_SWITCH; + + p_tx_switch_tlv = (struct vfpf_vport_update_tx_switch_tlv *) + qed_iov_search_list_tlvs(p_hwfn, p_mbx->req_virt, + tlv); + if (!p_tx_switch_tlv) + return; + + p_data->update_tx_switching_flg = 1; + 
p_data->tx_switching_flg = p_tx_switch_tlv->tx_switching; + *tlvs_mask |= 1 << QED_IOV_VP_UPDATE_TX_SWITCH; +} + +static void +qed_iov_vp_update_mcast_bin_param(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_data, + struct qed_iov_vf_mbx *p_mbx, u16 *tlvs_mask) +{ + struct vfpf_vport_update_mcast_bin_tlv *p_mcast_tlv; + u16 tlv = CHANNEL_TLV_VPORT_UPDATE_MCAST; + + p_mcast_tlv = (struct vfpf_vport_update_mcast_bin_tlv *) + qed_iov_search_list_tlvs(p_hwfn, p_mbx->req_virt, tlv); + if (!p_mcast_tlv) + return; + + p_data->update_approx_mcast_flg = 1; + memcpy(p_data->bins, p_mcast_tlv->bins, + sizeof(unsigned long) * ETH_MULTICAST_MAC_BINS_IN_REGS); + *tlvs_mask |= 1 << QED_IOV_VP_UPDATE_MCAST; +} + +static void +qed_iov_vp_update_accept_flag(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_data, + struct qed_iov_vf_mbx *p_mbx, u16 *tlvs_mask) +{ + struct qed_filter_accept_flags *p_flags = &p_data->accept_flags; + struct vfpf_vport_update_accept_param_tlv *p_accept_tlv; + u16 tlv = CHANNEL_TLV_VPORT_UPDATE_ACCEPT_PARAM; + + p_accept_tlv = (struct vfpf_vport_update_accept_param_tlv *) + qed_iov_search_list_tlvs(p_hwfn, p_mbx->req_virt, tlv); + if (!p_accept_tlv) + return; + + p_flags->update_rx_mode_config = p_accept_tlv->update_rx_mode; + p_flags->rx_accept_filter = p_accept_tlv->rx_accept_filter; + p_flags->update_tx_mode_config = p_accept_tlv->update_tx_mode; + p_flags->tx_accept_filter = p_accept_tlv->tx_accept_filter; + *tlvs_mask |= 1 << QED_IOV_VP_UPDATE_ACCEPT_PARAM; +} + +static void +qed_iov_vp_update_accept_any_vlan(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_data, + struct qed_iov_vf_mbx *p_mbx, u16 *tlvs_mask) +{ + struct vfpf_vport_update_accept_any_vlan_tlv *p_accept_any_vlan; + u16 tlv = CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN; + + p_accept_any_vlan = (struct vfpf_vport_update_accept_any_vlan_tlv *) + qed_iov_search_list_tlvs(p_hwfn, p_mbx->req_virt, + tlv); + if (!p_accept_any_vlan) + return; + + p_data->accept_any_vlan = p_accept_any_vlan->accept_any_vlan; + p_data->update_accept_any_vlan_flg = + p_accept_any_vlan->update_accept_any_vlan_flg; + *tlvs_mask |= 1 << QED_IOV_VP_UPDATE_ACCEPT_ANY_VLAN; +} + +static void +qed_iov_vp_update_rss_param(struct qed_hwfn *p_hwfn, + struct qed_vf_info *vf, + struct qed_sp_vport_update_params *p_data, + struct qed_rss_params *p_rss, + struct qed_iov_vf_mbx *p_mbx, + u16 *tlvs_mask, u16 *tlvs_accepted) +{ + struct vfpf_vport_update_rss_tlv *p_rss_tlv; + u16 tlv = CHANNEL_TLV_VPORT_UPDATE_RSS; + bool b_reject = false; + u16 table_size; + u16 i, q_idx; + + p_rss_tlv = (struct vfpf_vport_update_rss_tlv *) + qed_iov_search_list_tlvs(p_hwfn, p_mbx->req_virt, tlv); + if (!p_rss_tlv) { + p_data->rss_params = NULL; + return; + } + + memset(p_rss, 0, sizeof(struct qed_rss_params)); + + p_rss->update_rss_config = !!(p_rss_tlv->update_rss_flags & + VFPF_UPDATE_RSS_CONFIG_FLAG); + p_rss->update_rss_capabilities = !!(p_rss_tlv->update_rss_flags & + VFPF_UPDATE_RSS_CAPS_FLAG); + p_rss->update_rss_ind_table = !!(p_rss_tlv->update_rss_flags & + VFPF_UPDATE_RSS_IND_TABLE_FLAG); + p_rss->update_rss_key = !!(p_rss_tlv->update_rss_flags & + VFPF_UPDATE_RSS_KEY_FLAG); + + p_rss->rss_enable = p_rss_tlv->rss_enable; + p_rss->rss_eng_id = vf->relative_vf_id + 1; + p_rss->rss_caps = p_rss_tlv->rss_caps; + p_rss->rss_table_size_log = p_rss_tlv->rss_table_size_log; + memcpy(p_rss->rss_key, p_rss_tlv->rss_key, sizeof(p_rss->rss_key)); + + table_size = min_t(u16, ARRAY_SIZE(p_rss->rss_ind_table), + (1 << 
p_rss_tlv->rss_table_size_log)); + + for (i = 0; i < table_size; i++) { + struct qed_queue_cid *p_cid; + + q_idx = p_rss_tlv->rss_ind_table[i]; + if (!qed_iov_validate_rxq(p_hwfn, vf, q_idx, + QED_IOV_VALIDATE_Q_ENABLE)) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d]: Omitting RSS due to wrong queue %04x\n", + vf->relative_vf_id, q_idx); + b_reject = true; + goto out; + } + + p_cid = qed_iov_get_vf_rx_queue_cid(&vf->vf_queues[q_idx]); + p_rss->rss_ind_table[i] = p_cid; + } + + p_data->rss_params = p_rss; +out: + *tlvs_mask |= 1 << QED_IOV_VP_UPDATE_RSS; + if (!b_reject) + *tlvs_accepted |= 1 << QED_IOV_VP_UPDATE_RSS; +} + +static void +qed_iov_vp_update_sge_tpa_param(struct qed_hwfn *p_hwfn, + struct qed_vf_info *vf, + struct qed_sp_vport_update_params *p_data, + struct qed_sge_tpa_params *p_sge_tpa, + struct qed_iov_vf_mbx *p_mbx, u16 *tlvs_mask) +{ + struct vfpf_vport_update_sge_tpa_tlv *p_sge_tpa_tlv; + u16 tlv = CHANNEL_TLV_VPORT_UPDATE_SGE_TPA; + + p_sge_tpa_tlv = (struct vfpf_vport_update_sge_tpa_tlv *) + qed_iov_search_list_tlvs(p_hwfn, p_mbx->req_virt, tlv); + + if (!p_sge_tpa_tlv) { + p_data->sge_tpa_params = NULL; + return; + } + + memset(p_sge_tpa, 0, sizeof(struct qed_sge_tpa_params)); + + p_sge_tpa->update_tpa_en_flg = + !!(p_sge_tpa_tlv->update_sge_tpa_flags & VFPF_UPDATE_TPA_EN_FLAG); + p_sge_tpa->update_tpa_param_flg = + !!(p_sge_tpa_tlv->update_sge_tpa_flags & + VFPF_UPDATE_TPA_PARAM_FLAG); + + p_sge_tpa->tpa_ipv4_en_flg = + !!(p_sge_tpa_tlv->sge_tpa_flags & VFPF_TPA_IPV4_EN_FLAG); + p_sge_tpa->tpa_ipv6_en_flg = + !!(p_sge_tpa_tlv->sge_tpa_flags & VFPF_TPA_IPV6_EN_FLAG); + p_sge_tpa->tpa_pkt_split_flg = + !!(p_sge_tpa_tlv->sge_tpa_flags & VFPF_TPA_PKT_SPLIT_FLAG); + p_sge_tpa->tpa_hdr_data_split_flg = + !!(p_sge_tpa_tlv->sge_tpa_flags & VFPF_TPA_HDR_DATA_SPLIT_FLAG); + p_sge_tpa->tpa_gro_consistent_flg = + !!(p_sge_tpa_tlv->sge_tpa_flags & VFPF_TPA_GRO_CONSIST_FLAG); + + p_sge_tpa->tpa_max_aggs_num = p_sge_tpa_tlv->tpa_max_aggs_num; + p_sge_tpa->tpa_max_size = p_sge_tpa_tlv->tpa_max_size; + p_sge_tpa->tpa_min_size_to_start = p_sge_tpa_tlv->tpa_min_size_to_start; + p_sge_tpa->tpa_min_size_to_cont = p_sge_tpa_tlv->tpa_min_size_to_cont; + p_sge_tpa->max_buffers_per_cqe = p_sge_tpa_tlv->max_buffers_per_cqe; + + p_data->sge_tpa_params = p_sge_tpa; + + *tlvs_mask |= 1 << QED_IOV_VP_UPDATE_SGE_TPA; +} + +static int qed_iov_pre_update_vport(struct qed_hwfn *hwfn, + u8 vfid, + struct qed_sp_vport_update_params *params, + u16 *tlvs) +{ + u8 mask = QED_ACCEPT_UCAST_UNMATCHED | QED_ACCEPT_MCAST_UNMATCHED; + struct qed_filter_accept_flags *flags = ¶ms->accept_flags; + struct qed_public_vf_info *vf_info; + + /* Untrusted VFs can't even be trusted to know that fact. + * Simply indicate everything is configured fine, and trace + * configuration 'behind their back'. 
+ */ + if (!(*tlvs & BIT(QED_IOV_VP_UPDATE_ACCEPT_PARAM))) + return 0; + + vf_info = qed_iov_get_public_vf_info(hwfn, vfid, true); + + if (flags->update_rx_mode_config) { + vf_info->rx_accept_mode = flags->rx_accept_filter; + if (!vf_info->is_trusted_configured) + flags->rx_accept_filter &= ~mask; + } + + if (flags->update_tx_mode_config) { + vf_info->tx_accept_mode = flags->tx_accept_filter; + if (!vf_info->is_trusted_configured) + flags->tx_accept_filter &= ~mask; + } + + return 0; +} + +static void qed_iov_vf_mbx_vport_update(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + struct qed_rss_params *p_rss_params = NULL; + struct qed_sp_vport_update_params params; + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + struct qed_sge_tpa_params sge_tpa_params; + u16 tlvs_mask = 0, tlvs_accepted = 0; + u8 status = PFVF_STATUS_SUCCESS; + u16 length; + int rc; + + /* Valiate PF can send such a request */ + if (!vf->vport_instance) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "No VPORT instance available for VF[%d], failing vport update\n", + vf->abs_vf_id); + status = PFVF_STATUS_FAILURE; + goto out; + } + p_rss_params = vzalloc(sizeof(*p_rss_params)); + if (p_rss_params == NULL) { + status = PFVF_STATUS_FAILURE; + goto out; + } + + memset(¶ms, 0, sizeof(params)); + params.opaque_fid = vf->opaque_fid; + params.vport_id = vf->vport_id; + params.rss_params = NULL; + + /* Search for extended tlvs list and update values + * from VF in struct qed_sp_vport_update_params. + */ + qed_iov_vp_update_act_param(p_hwfn, ¶ms, mbx, &tlvs_mask); + qed_iov_vp_update_vlan_param(p_hwfn, ¶ms, vf, mbx, &tlvs_mask); + qed_iov_vp_update_tx_switch(p_hwfn, ¶ms, mbx, &tlvs_mask); + qed_iov_vp_update_mcast_bin_param(p_hwfn, ¶ms, mbx, &tlvs_mask); + qed_iov_vp_update_accept_flag(p_hwfn, ¶ms, mbx, &tlvs_mask); + qed_iov_vp_update_accept_any_vlan(p_hwfn, ¶ms, mbx, &tlvs_mask); + qed_iov_vp_update_sge_tpa_param(p_hwfn, vf, ¶ms, + &sge_tpa_params, mbx, &tlvs_mask); + + tlvs_accepted = tlvs_mask; + + /* Some of the extended TLVs need to be validated first; In that case, + * they can update the mask without updating the accepted [so that + * PF could communicate to VF it has rejected request]. 
+ */ + qed_iov_vp_update_rss_param(p_hwfn, vf, ¶ms, p_rss_params, + mbx, &tlvs_mask, &tlvs_accepted); + + if (qed_iov_pre_update_vport(p_hwfn, vf->relative_vf_id, + ¶ms, &tlvs_accepted)) { + tlvs_accepted = 0; + status = PFVF_STATUS_NOT_SUPPORTED; + goto out; + } + + if (!tlvs_accepted) { + if (tlvs_mask) + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Upper-layer prevents VF vport configuration\n"); + else + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "No feature tlvs found for vport update\n"); + status = PFVF_STATUS_NOT_SUPPORTED; + goto out; + } + + rc = qed_sp_vport_update(p_hwfn, ¶ms, QED_SPQ_MODE_EBLOCK, NULL); + + if (rc) + status = PFVF_STATUS_FAILURE; + +out: + vfree(p_rss_params); + length = qed_iov_prep_vp_update_resp_tlvs(p_hwfn, vf, mbx, status, + tlvs_mask, tlvs_accepted); + qed_iov_send_response(p_hwfn, p_ptt, vf, length, status); +} + +static int qed_iov_vf_update_vlan_shadow(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, + struct qed_filter_ucast *p_params) +{ + int i; + + /* First remove entries and then add new ones */ + if (p_params->opcode == QED_FILTER_REMOVE) { + for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++) + if (p_vf->shadow_config.vlans[i].used && + p_vf->shadow_config.vlans[i].vid == + p_params->vlan) { + p_vf->shadow_config.vlans[i].used = false; + break; + } + if (i == QED_ETH_VF_NUM_VLAN_FILTERS + 1) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF [%d] - Tries to remove a non-existing vlan\n", + p_vf->relative_vf_id); + return -EINVAL; + } + } else if (p_params->opcode == QED_FILTER_REPLACE || + p_params->opcode == QED_FILTER_FLUSH) { + for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++) + p_vf->shadow_config.vlans[i].used = false; + } + + /* In forced mode, we're willing to remove entries - but we don't add + * new ones. + */ + if (p_vf->bulletin.p_virt->valid_bitmap & BIT(VLAN_ADDR_FORCED)) + return 0; + + if (p_params->opcode == QED_FILTER_ADD || + p_params->opcode == QED_FILTER_REPLACE) { + for (i = 0; i < QED_ETH_VF_NUM_VLAN_FILTERS + 1; i++) { + if (p_vf->shadow_config.vlans[i].used) + continue; + + p_vf->shadow_config.vlans[i].used = true; + p_vf->shadow_config.vlans[i].vid = p_params->vlan; + break; + } + + if (i == QED_ETH_VF_NUM_VLAN_FILTERS + 1) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF [%d] - Tries to configure more than %d vlan filters\n", + p_vf->relative_vf_id, + QED_ETH_VF_NUM_VLAN_FILTERS + 1); + return -EINVAL; + } + } + + return 0; +} + +static int qed_iov_vf_update_mac_shadow(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, + struct qed_filter_ucast *p_params) +{ + int i; + + /* If we're in forced-mode, we don't allow any change */ + if (p_vf->bulletin.p_virt->valid_bitmap & BIT(MAC_ADDR_FORCED)) + return 0; + + /* First remove entries and then add new ones */ + if (p_params->opcode == QED_FILTER_REMOVE) { + for (i = 0; i < QED_ETH_VF_NUM_MAC_FILTERS; i++) { + if (ether_addr_equal(p_vf->shadow_config.macs[i], + p_params->mac)) { + eth_zero_addr(p_vf->shadow_config.macs[i]); + break; + } + } + + if (i == QED_ETH_VF_NUM_MAC_FILTERS) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "MAC isn't configured\n"); + return -EINVAL; + } + } else if (p_params->opcode == QED_FILTER_REPLACE || + p_params->opcode == QED_FILTER_FLUSH) { + for (i = 0; i < QED_ETH_VF_NUM_MAC_FILTERS; i++) + eth_zero_addr(p_vf->shadow_config.macs[i]); + } + + /* List the new MAC address */ + if (p_params->opcode != QED_FILTER_ADD && + p_params->opcode != QED_FILTER_REPLACE) + return 0; + + for (i = 0; i < QED_ETH_VF_NUM_MAC_FILTERS; i++) { + if 
(is_zero_ether_addr(p_vf->shadow_config.macs[i])) { + ether_addr_copy(p_vf->shadow_config.macs[i], + p_params->mac); + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Added MAC at %d entry in shadow\n", i); + break; + } + } + + if (i == QED_ETH_VF_NUM_MAC_FILTERS) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, "No available place for MAC\n"); + return -EINVAL; + } + + return 0; +} + +static int +qed_iov_vf_update_unicast_shadow(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, + struct qed_filter_ucast *p_params) +{ + int rc = 0; + + if (p_params->type == QED_FILTER_MAC) { + rc = qed_iov_vf_update_mac_shadow(p_hwfn, p_vf, p_params); + if (rc) + return rc; + } + + if (p_params->type == QED_FILTER_VLAN) + rc = qed_iov_vf_update_vlan_shadow(p_hwfn, p_vf, p_params); + + return rc; +} + +static int qed_iov_chk_ucast(struct qed_hwfn *hwfn, + int vfid, struct qed_filter_ucast *params) +{ + struct qed_public_vf_info *vf; + + vf = qed_iov_get_public_vf_info(hwfn, vfid, true); + if (!vf) + return -EINVAL; + + /* No real decision to make; Store the configured MAC */ + if (params->type == QED_FILTER_MAC || + params->type == QED_FILTER_MAC_VLAN) + ether_addr_copy(vf->mac, params->mac); + + return 0; +} + +static void qed_iov_vf_mbx_ucast_filter(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + struct qed_bulletin_content *p_bulletin = vf->bulletin.p_virt; + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + struct vfpf_ucast_filter_tlv *req; + u8 status = PFVF_STATUS_SUCCESS; + struct qed_filter_ucast params; + int rc; + + /* Prepare the unicast filter params */ + memset(¶ms, 0, sizeof(struct qed_filter_ucast)); + req = &mbx->req_virt->ucast_filter; + params.opcode = (enum qed_filter_opcode)req->opcode; + params.type = (enum qed_filter_ucast_type)req->type; + + params.is_rx_filter = 1; + params.is_tx_filter = 1; + params.vport_to_remove_from = vf->vport_id; + params.vport_to_add_to = vf->vport_id; + memcpy(params.mac, req->mac, ETH_ALEN); + params.vlan = req->vlan; + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d]: opcode 0x%02x type 0x%02x [%s %s] [vport 0x%02x] MAC %02x:%02x:%02x:%02x:%02x:%02x, vlan 0x%04x\n", + vf->abs_vf_id, params.opcode, params.type, + params.is_rx_filter ? "RX" : "", + params.is_tx_filter ? "TX" : "", + params.vport_to_add_to, + params.mac[0], params.mac[1], + params.mac[2], params.mac[3], + params.mac[4], params.mac[5], params.vlan); + + if (!vf->vport_instance) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "No VPORT instance available for VF[%d], failing ucast MAC configuration\n", + vf->abs_vf_id); + status = PFVF_STATUS_FAILURE; + goto out; + } + + /* Update shadow copy of the VF configuration */ + if (qed_iov_vf_update_unicast_shadow(p_hwfn, vf, ¶ms)) { + status = PFVF_STATUS_FAILURE; + goto out; + } + + /* Determine if the unicast filtering is acceptible by PF */ + if ((p_bulletin->valid_bitmap & BIT(VLAN_ADDR_FORCED)) && + (params.type == QED_FILTER_VLAN || + params.type == QED_FILTER_MAC_VLAN)) { + /* Once VLAN is forced or PVID is set, do not allow + * to add/replace any further VLANs. 
+ */ + if (params.opcode == QED_FILTER_ADD || + params.opcode == QED_FILTER_REPLACE) + status = PFVF_STATUS_FORCED; + goto out; + } + + if ((p_bulletin->valid_bitmap & BIT(MAC_ADDR_FORCED)) && + (params.type == QED_FILTER_MAC || + params.type == QED_FILTER_MAC_VLAN)) { + if (!ether_addr_equal(p_bulletin->mac, params.mac) || + (params.opcode != QED_FILTER_ADD && + params.opcode != QED_FILTER_REPLACE)) + status = PFVF_STATUS_FORCED; + goto out; + } + + rc = qed_iov_chk_ucast(p_hwfn, vf->relative_vf_id, ¶ms); + if (rc) { + status = PFVF_STATUS_FAILURE; + goto out; + } + + rc = qed_sp_eth_filter_ucast(p_hwfn, vf->opaque_fid, ¶ms, + QED_SPQ_MODE_CB, NULL); + if (rc) + status = PFVF_STATUS_FAILURE; + +out: + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_UCAST_FILTER, + sizeof(struct pfvf_def_resp_tlv), status); +} + +static void qed_iov_vf_mbx_int_cleanup(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + int i; + + /* Reset the SBs */ + for (i = 0; i < vf->num_sbs; i++) + qed_int_igu_init_pure_rt_single(p_hwfn, p_ptt, + vf->igu_sbs[i], + vf->opaque_fid, false); + + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_INT_CLEANUP, + sizeof(struct pfvf_def_resp_tlv), + PFVF_STATUS_SUCCESS); +} + +static void qed_iov_vf_mbx_close(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, struct qed_vf_info *vf) +{ + u16 length = sizeof(struct pfvf_def_resp_tlv); + u8 status = PFVF_STATUS_SUCCESS; + + /* Disable Interrupts for VF */ + qed_iov_vf_igu_set_int(p_hwfn, p_ptt, vf, 0); + + /* Reset Permission table */ + qed_iov_config_perm_table(p_hwfn, p_ptt, vf, 0); + + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_CLOSE, + length, status); +} + +static void qed_iov_vf_mbx_release(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *p_vf) +{ + u16 length = sizeof(struct pfvf_def_resp_tlv); + u8 status = PFVF_STATUS_SUCCESS; + int rc = 0; + + qed_iov_vf_cleanup(p_hwfn, p_vf); + + if (p_vf->state != VF_STOPPED && p_vf->state != VF_FREE) { + /* Stopping the VF */ + rc = qed_sp_vf_stop(p_hwfn, p_vf->concrete_fid, + p_vf->opaque_fid); + + if (rc) { + DP_ERR(p_hwfn, "qed_sp_vf_stop returned error %d\n", + rc); + status = PFVF_STATUS_FAILURE; + } + + p_vf->state = VF_STOPPED; + } + + qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, CHANNEL_TLV_RELEASE, + length, status); +} + +static void qed_iov_vf_pf_get_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *p_vf) +{ + struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx; + struct pfvf_read_coal_resp_tlv *p_resp; + struct vfpf_read_coal_req_tlv *req; + u8 status = PFVF_STATUS_FAILURE; + struct qed_vf_queue *p_queue; + struct qed_queue_cid *p_cid; + u16 coal = 0, qid, i; + bool b_is_rx; + int rc = 0; + + mbx->offset = (u8 *)mbx->reply_virt; + req = &mbx->req_virt->read_coal_req; + + qid = req->qid; + b_is_rx = req->is_rx ? 
true : false; + + if (b_is_rx) { + if (!qed_iov_validate_rxq(p_hwfn, p_vf, qid, + QED_IOV_VALIDATE_Q_ENABLE)) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d]: Invalid Rx queue_id = %d\n", + p_vf->abs_vf_id, qid); + goto send_resp; + } + + p_cid = qed_iov_get_vf_rx_queue_cid(&p_vf->vf_queues[qid]); + rc = qed_get_rxq_coalesce(p_hwfn, p_ptt, p_cid, &coal); + if (rc) + goto send_resp; + } else { + if (!qed_iov_validate_txq(p_hwfn, p_vf, qid, + QED_IOV_VALIDATE_Q_ENABLE)) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d]: Invalid Tx queue_id = %d\n", + p_vf->abs_vf_id, qid); + goto send_resp; + } + for (i = 0; i < MAX_QUEUES_PER_QZONE; i++) { + p_queue = &p_vf->vf_queues[qid]; + if ((!p_queue->cids[i].p_cid) || + (!p_queue->cids[i].b_is_tx)) + continue; + + p_cid = p_queue->cids[i].p_cid; + + rc = qed_get_txq_coalesce(p_hwfn, p_ptt, p_cid, &coal); + if (rc) + goto send_resp; + break; + } + } + + status = PFVF_STATUS_SUCCESS; + +send_resp: + p_resp = qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_COALESCE_READ, + sizeof(*p_resp)); + p_resp->coal = coal; + + qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + qed_iov_send_response(p_hwfn, p_ptt, p_vf, sizeof(*p_resp), status); +} + +static void qed_iov_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + struct vfpf_update_coalesce *req; + u8 status = PFVF_STATUS_FAILURE; + struct qed_queue_cid *p_cid; + u16 rx_coal, tx_coal; + int rc = 0, i; + u16 qid; + + req = &mbx->req_virt->update_coalesce; + + rx_coal = req->rx_coal; + tx_coal = req->tx_coal; + qid = req->qid; + + if (!qed_iov_validate_rxq(p_hwfn, vf, qid, + QED_IOV_VALIDATE_Q_ENABLE) && rx_coal) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d]: Invalid Rx queue_id = %d\n", + vf->abs_vf_id, qid); + goto out; + } + + if (!qed_iov_validate_txq(p_hwfn, vf, qid, + QED_IOV_VALIDATE_Q_ENABLE) && tx_coal) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d]: Invalid Tx queue_id = %d\n", + vf->abs_vf_id, qid); + goto out; + } + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d]: Setting coalesce for VF rx_coal = %d, tx_coal = %d at queue = %d\n", + vf->abs_vf_id, rx_coal, tx_coal, qid); + + if (rx_coal) { + p_cid = qed_iov_get_vf_rx_queue_cid(&vf->vf_queues[qid]); + + rc = qed_set_rxq_coalesce(p_hwfn, p_ptt, rx_coal, p_cid); + if (rc) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d]: Unable to set rx queue = %d coalesce\n", + vf->abs_vf_id, vf->vf_queues[qid].fw_rx_qid); + goto out; + } + vf->rx_coal = rx_coal; + } + + if (tx_coal) { + struct qed_vf_queue *p_queue = &vf->vf_queues[qid]; + + for (i = 0; i < MAX_QUEUES_PER_QZONE; i++) { + if (!p_queue->cids[i].p_cid) + continue; + + if (!p_queue->cids[i].b_is_tx) + continue; + + rc = qed_set_txq_coalesce(p_hwfn, p_ptt, tx_coal, + p_queue->cids[i].p_cid); + + if (rc) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d]: Unable to set tx queue coalesce\n", + vf->abs_vf_id); + goto out; + } + } + vf->tx_coal = tx_coal; + } + + status = PFVF_STATUS_SUCCESS; +out: + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_COALESCE_UPDATE, + sizeof(struct pfvf_def_resp_tlv), status); +} +static int +qed_iov_vf_flr_poll_dorq(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, struct qed_ptt *p_ptt) +{ + int cnt; + u32 val; + + qed_fid_pretend(p_hwfn, p_ptt, (u16) p_vf->concrete_fid); + + for (cnt = 0; cnt < 50; cnt++) { + val = qed_rd(p_hwfn, p_ptt, DORQ_REG_VF_USAGE_CNT); + if (!val) + break; + msleep(20); + } + qed_fid_pretend(p_hwfn, p_ptt, (u16) 
p_hwfn->hw_info.concrete_fid); + + if (cnt == 50) { + DP_ERR(p_hwfn, + "VF[%d] - dorq failed to cleanup [usage 0x%08x]\n", + p_vf->abs_vf_id, val); + return -EBUSY; + } + + return 0; +} + +static int +qed_iov_vf_flr_poll_pbf(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, struct qed_ptt *p_ptt) +{ + u32 cons[MAX_NUM_VOQS_E4], distance[MAX_NUM_VOQS_E4]; + int i, cnt; + + /* Read initial consumers & producers */ + for (i = 0; i < MAX_NUM_VOQS_E4; i++) { + u32 prod; + + cons[i] = qed_rd(p_hwfn, p_ptt, + PBF_REG_NUM_BLOCKS_ALLOCATED_CONS_VOQ0 + + i * 0x40); + prod = qed_rd(p_hwfn, p_ptt, + PBF_REG_NUM_BLOCKS_ALLOCATED_PROD_VOQ0 + + i * 0x40); + distance[i] = prod - cons[i]; + } + + /* Wait for consumers to pass the producers */ + i = 0; + for (cnt = 0; cnt < 50; cnt++) { + for (; i < MAX_NUM_VOQS_E4; i++) { + u32 tmp; + + tmp = qed_rd(p_hwfn, p_ptt, + PBF_REG_NUM_BLOCKS_ALLOCATED_CONS_VOQ0 + + i * 0x40); + if (distance[i] > tmp - cons[i]) + break; + } + + if (i == MAX_NUM_VOQS_E4) + break; + + msleep(20); + } + + if (cnt == 50) { + DP_ERR(p_hwfn, "VF[%d] - pbf polling failed on VOQ %d\n", + p_vf->abs_vf_id, i); + return -EBUSY; + } + + return 0; +} + +static int qed_iov_vf_flr_poll(struct qed_hwfn *p_hwfn, + struct qed_vf_info *p_vf, struct qed_ptt *p_ptt) +{ + int rc; + + rc = qed_iov_vf_flr_poll_dorq(p_hwfn, p_vf, p_ptt); + if (rc) + return rc; + + rc = qed_iov_vf_flr_poll_pbf(p_hwfn, p_vf, p_ptt); + if (rc) + return rc; + + return 0; +} + +static int +qed_iov_execute_vf_flr_cleanup(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 rel_vf_id, u32 *ack_vfs) +{ + struct qed_vf_info *p_vf; + int rc = 0; + + p_vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, false); + if (!p_vf) + return 0; + + if (p_hwfn->pf_iov_info->pending_flr[rel_vf_id / 64] & + (1ULL << (rel_vf_id % 64))) { + u16 vfid = p_vf->abs_vf_id; + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d] - Handling FLR\n", vfid); + + qed_iov_vf_cleanup(p_hwfn, p_vf); + + /* If VF isn't active, no need for anything but SW */ + if (!p_vf->b_init) + goto cleanup; + + rc = qed_iov_vf_flr_poll(p_hwfn, p_vf, p_ptt); + if (rc) + goto cleanup; + + rc = qed_final_cleanup(p_hwfn, p_ptt, vfid, true); + if (rc) { + DP_ERR(p_hwfn, "Failed handle FLR of VF[%d]\n", vfid); + return rc; + } + + /* Workaround to make VF-PF channel ready, as FW + * doesn't do that as a part of FLR. + */ + REG_WR(p_hwfn, + GTT_BAR0_MAP_REG_USDM_RAM + + USTORM_VF_PF_CHANNEL_READY_OFFSET(vfid), 1); + + /* VF_STOPPED has to be set only after final cleanup + * but prior to re-enabling the VF. + */ + p_vf->state = VF_STOPPED; + + rc = qed_iov_enable_vf_access(p_hwfn, p_ptt, p_vf); + if (rc) { + DP_ERR(p_hwfn, "Failed to re-enable VF[%d] acces\n", + vfid); + return rc; + } +cleanup: + /* Mark VF for ack and clean pending state */ + if (p_vf->state == VF_RESET) + p_vf->state = VF_STOPPED; + ack_vfs[vfid / 32] |= BIT((vfid % 32)); + p_hwfn->pf_iov_info->pending_flr[rel_vf_id / 64] &= + ~(1ULL << (rel_vf_id % 64)); + p_vf->vf_mbx.b_pending_msg = false; + } + + return rc; +} + +static int +qed_iov_vf_flr_cleanup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 ack_vfs[VF_MAX_STATIC / 32]; + int rc = 0; + u16 i; + + memset(ack_vfs, 0, sizeof(u32) * (VF_MAX_STATIC / 32)); + + /* Since BRB <-> PRS interface can't be tested as part of the flr + * polling due to HW limitations, simply sleep a bit. And since + * there's no need to wait per-vf, do it before looping. 
+ */ + msleep(100); + + for (i = 0; i < p_hwfn->cdev->p_iov_info->total_vfs; i++) + qed_iov_execute_vf_flr_cleanup(p_hwfn, p_ptt, i, ack_vfs); + + rc = qed_mcp_ack_vf_flr(p_hwfn, p_ptt, ack_vfs); + return rc; +} + +bool qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn, u32 *p_disabled_vfs) +{ + bool found = false; + u16 i; + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, "Marking FLR-ed VFs\n"); + for (i = 0; i < (VF_MAX_STATIC / 32); i++) + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "[%08x,...,%08x]: %08x\n", + i * 32, (i + 1) * 32 - 1, p_disabled_vfs[i]); + + if (!p_hwfn->cdev->p_iov_info) { + DP_NOTICE(p_hwfn, "VF flr but no IOV\n"); + return false; + } + + /* Mark VFs */ + for (i = 0; i < p_hwfn->cdev->p_iov_info->total_vfs; i++) { + struct qed_vf_info *p_vf; + u8 vfid; + + p_vf = qed_iov_get_vf_info(p_hwfn, i, false); + if (!p_vf) + continue; + + vfid = p_vf->abs_vf_id; + if (BIT((vfid % 32)) & p_disabled_vfs[vfid / 32]) { + u64 *p_flr = p_hwfn->pf_iov_info->pending_flr; + u16 rel_vf_id = p_vf->relative_vf_id; + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d] [rel %d] got FLR-ed\n", + vfid, rel_vf_id); + + p_vf->state = VF_RESET; + + /* No need to lock here, since pending_flr should + * only change here and before ACKing MFw. Since + * MFW will not trigger an additional attention for + * VF flr until ACKs, we're safe. + */ + p_flr[rel_vf_id / 64] |= 1ULL << (rel_vf_id % 64); + found = true; + } + } + + return found; +} + +static void qed_iov_get_link(struct qed_hwfn *p_hwfn, + u16 vfid, + struct qed_mcp_link_params *p_params, + struct qed_mcp_link_state *p_link, + struct qed_mcp_link_capabilities *p_caps) +{ + struct qed_vf_info *p_vf = qed_iov_get_vf_info(p_hwfn, + vfid, + false); + struct qed_bulletin_content *p_bulletin; + + if (!p_vf) + return; + + p_bulletin = p_vf->bulletin.p_virt; + + if (p_params) + __qed_vf_get_link_params(p_hwfn, p_params, p_bulletin); + if (p_link) + __qed_vf_get_link_state(p_hwfn, p_link, p_bulletin); + if (p_caps) + __qed_vf_get_link_caps(p_hwfn, p_caps, p_bulletin); +} + +static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, int vfid) +{ + struct qed_iov_vf_mbx *mbx; + struct qed_vf_info *p_vf; + + p_vf = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true); + if (!p_vf) + return; + + mbx = &p_vf->vf_mbx; + + /* qed_iov_process_mbx_request */ + if (!mbx->b_pending_msg) { + DP_NOTICE(p_hwfn, + "VF[%02x]: Trying to process mailbox message when none is pending\n", + p_vf->abs_vf_id); + return; + } + mbx->b_pending_msg = false; + + mbx->first_tlv = mbx->req_virt->first_tlv; + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%02x]: Processing mailbox message [type %04x]\n", + p_vf->abs_vf_id, mbx->first_tlv.tl.type); + + /* check if tlv type is known */ + if (qed_iov_tlv_supported(mbx->first_tlv.tl.type) && + !p_vf->b_malicious) { + switch (mbx->first_tlv.tl.type) { + case CHANNEL_TLV_ACQUIRE: + qed_iov_vf_mbx_acquire(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_VPORT_START: + qed_iov_vf_mbx_start_vport(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_VPORT_TEARDOWN: + qed_iov_vf_mbx_stop_vport(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_START_RXQ: + qed_iov_vf_mbx_start_rxq(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_START_TXQ: + qed_iov_vf_mbx_start_txq(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_STOP_RXQS: + qed_iov_vf_mbx_stop_rxqs(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_STOP_TXQS: + qed_iov_vf_mbx_stop_txqs(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_UPDATE_RXQ: + qed_iov_vf_mbx_update_rxqs(p_hwfn, p_ptt, p_vf); + break; + case 
CHANNEL_TLV_VPORT_UPDATE: + qed_iov_vf_mbx_vport_update(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_UCAST_FILTER: + qed_iov_vf_mbx_ucast_filter(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_CLOSE: + qed_iov_vf_mbx_close(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_INT_CLEANUP: + qed_iov_vf_mbx_int_cleanup(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_RELEASE: + qed_iov_vf_mbx_release(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_UPDATE_TUNN_PARAM: + qed_iov_vf_mbx_update_tunn_param(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_COALESCE_UPDATE: + qed_iov_vf_pf_set_coalesce(p_hwfn, p_ptt, p_vf); + break; + case CHANNEL_TLV_COALESCE_READ: + qed_iov_vf_pf_get_coalesce(p_hwfn, p_ptt, p_vf); + break; + } + } else if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF [%02x] - considered malicious; Ignoring TLV [%04x]\n", + p_vf->abs_vf_id, mbx->first_tlv.tl.type); + + qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, + mbx->first_tlv.tl.type, + sizeof(struct pfvf_def_resp_tlv), + PFVF_STATUS_MALICIOUS); + } else { + /* unknown TLV - this may belong to a VF driver from the future + * - a version written after this PF driver was written, which + * supports features unknown as of yet. Too bad since we don't + * support them. Or this may be because someone wrote a crappy + * VF driver and is sending garbage over the channel. + */ + DP_NOTICE(p_hwfn, + "VF[%02x]: unknown TLV. type %04x length %04x padding %08x reply address %llu\n", + p_vf->abs_vf_id, + mbx->first_tlv.tl.type, + mbx->first_tlv.tl.length, + mbx->first_tlv.padding, mbx->first_tlv.reply_address); + + /* Try replying in case reply address matches the acquisition's + * posted address. + */ + if (p_vf->acquire.first_tlv.reply_address && + (mbx->first_tlv.reply_address == + p_vf->acquire.first_tlv.reply_address)) { + qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf, + mbx->first_tlv.tl.type, + sizeof(struct pfvf_def_resp_tlv), + PFVF_STATUS_NOT_SUPPORTED); + } else { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%02x]: Can't respond to TLV - no valid reply address\n", + p_vf->abs_vf_id); + } + } +} + +void qed_iov_pf_get_pending_events(struct qed_hwfn *p_hwfn, u64 *events) +{ + int i; + + memset(events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH); + + qed_for_each_vf(p_hwfn, i) { + struct qed_vf_info *p_vf; + + p_vf = &p_hwfn->pf_iov_info->vfs_array[i]; + if (p_vf->vf_mbx.b_pending_msg) + events[i / 64] |= 1ULL << (i % 64); + } +} + +static struct qed_vf_info *qed_sriov_get_vf_from_absid(struct qed_hwfn *p_hwfn, + u16 abs_vfid) +{ + u8 min = (u8) p_hwfn->cdev->p_iov_info->first_vf_in_pf; + + if (!_qed_iov_pf_sanity_check(p_hwfn, (int)abs_vfid - min, false)) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "Got indication for VF [abs 0x%08x] that cannot be handled by PF\n", + abs_vfid); + return NULL; + } + + return &p_hwfn->pf_iov_info->vfs_array[(u8) abs_vfid - min]; +} + +static int qed_sriov_vfpf_msg(struct qed_hwfn *p_hwfn, + u16 abs_vfid, struct regpair *vf_msg) +{ + struct qed_vf_info *p_vf = qed_sriov_get_vf_from_absid(p_hwfn, + abs_vfid); + + if (!p_vf) + return 0; + + /* List the physical address of the request so that handler + * could later on copy the message from it. 
+ */ + p_vf->vf_mbx.pending_req = (((u64)vf_msg->hi) << 32) | vf_msg->lo; + + /* Mark the event and schedule the workqueue */ + p_vf->vf_mbx.b_pending_msg = true; + qed_schedule_iov(p_hwfn, QED_IOV_WQ_MSG_FLAG); + + return 0; +} + +static void qed_sriov_vfpf_malicious(struct qed_hwfn *p_hwfn, + struct malicious_vf_eqe_data *p_data) +{ + struct qed_vf_info *p_vf; + + p_vf = qed_sriov_get_vf_from_absid(p_hwfn, p_data->vf_id); + + if (!p_vf) + return; + + if (!p_vf->b_malicious) { + DP_NOTICE(p_hwfn, + "VF [%d] - Malicious behavior [%02x]\n", + p_vf->abs_vf_id, p_data->err_id); + + p_vf->b_malicious = true; + } else { + DP_INFO(p_hwfn, + "VF [%d] - Malicious behavior [%02x]\n", + p_vf->abs_vf_id, p_data->err_id); + } +} + +static int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn, + u8 opcode, + __le16 echo, + union event_ring_data *data, u8 fw_return_code) +{ + switch (opcode) { + case COMMON_EVENT_VF_PF_CHANNEL: + return qed_sriov_vfpf_msg(p_hwfn, le16_to_cpu(echo), + &data->vf_pf_channel.msg_addr); + case COMMON_EVENT_MALICIOUS_VF: + qed_sriov_vfpf_malicious(p_hwfn, &data->malicious_vf); + return 0; + default: + DP_INFO(p_hwfn->cdev, "Unknown sriov eqe event 0x%02x\n", + opcode); + return -EINVAL; + } +} + +u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, u16 rel_vf_id) +{ + struct qed_hw_sriov_info *p_iov = p_hwfn->cdev->p_iov_info; + u16 i; + + if (!p_iov) + goto out; + + for (i = rel_vf_id; i < p_iov->total_vfs; i++) + if (qed_iov_is_valid_vfid(p_hwfn, rel_vf_id, true, false)) + return i; + +out: + return MAX_NUM_VFS; +} + +static int qed_iov_copy_vf_msg(struct qed_hwfn *p_hwfn, struct qed_ptt *ptt, + int vfid) +{ + struct qed_dmae_params params; + struct qed_vf_info *vf_info; + + vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true); + if (!vf_info) + return -EINVAL; + + memset(¶ms, 0, sizeof(struct qed_dmae_params)); + params.flags = QED_DMAE_FLAG_VF_SRC | QED_DMAE_FLAG_COMPLETION_DST; + params.src_vfid = vf_info->abs_vf_id; + + if (qed_dmae_host2host(p_hwfn, ptt, + vf_info->vf_mbx.pending_req, + vf_info->vf_mbx.req_phys, + sizeof(union vfpf_tlvs) / 4, ¶ms)) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Failed to copy message from VF 0x%02x\n", vfid); + + return -EIO; + } + + return 0; +} + +static void qed_iov_bulletin_set_forced_mac(struct qed_hwfn *p_hwfn, + u8 *mac, int vfid) +{ + struct qed_vf_info *vf_info; + u64 feature; + + vf_info = qed_iov_get_vf_info(p_hwfn, (u16)vfid, true); + if (!vf_info) { + DP_NOTICE(p_hwfn->cdev, + "Can not set forced MAC, invalid vfid [%d]\n", vfid); + return; + } + + if (vf_info->b_malicious) { + DP_NOTICE(p_hwfn->cdev, + "Can't set forced MAC to malicious VF [%d]\n", vfid); + return; + } + + feature = 1 << MAC_ADDR_FORCED; + memcpy(vf_info->bulletin.p_virt->mac, mac, ETH_ALEN); + + vf_info->bulletin.p_virt->valid_bitmap |= feature; + /* Forced MAC will disable MAC_ADDR */ + vf_info->bulletin.p_virt->valid_bitmap &= ~BIT(VFPF_BULLETIN_MAC_ADDR); + + qed_iov_configure_vport_forced(p_hwfn, vf_info, feature); +} + +static void qed_iov_bulletin_set_forced_vlan(struct qed_hwfn *p_hwfn, + u16 pvid, int vfid) +{ + struct qed_vf_info *vf_info; + u64 feature; + + vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true); + if (!vf_info) { + DP_NOTICE(p_hwfn->cdev, + "Can not set forced MAC, invalid vfid [%d]\n", vfid); + return; + } + + if (vf_info->b_malicious) { + DP_NOTICE(p_hwfn->cdev, + "Can't set forced vlan to malicious VF [%d]\n", vfid); + return; + } + + feature = 1 << VLAN_ADDR_FORCED; + vf_info->bulletin.p_virt->pvid = pvid; + if (pvid) + 
vf_info->bulletin.p_virt->valid_bitmap |= feature; + else + vf_info->bulletin.p_virt->valid_bitmap &= ~feature; + + qed_iov_configure_vport_forced(p_hwfn, vf_info, feature); +} + +void qed_iov_bulletin_set_udp_ports(struct qed_hwfn *p_hwfn, + int vfid, u16 vxlan_port, u16 geneve_port) +{ + struct qed_vf_info *vf_info; + + vf_info = qed_iov_get_vf_info(p_hwfn, (u16)vfid, true); + if (!vf_info) { + DP_NOTICE(p_hwfn->cdev, + "Can not set udp ports, invalid vfid [%d]\n", vfid); + return; + } + + if (vf_info->b_malicious) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Can not set udp ports to malicious VF [%d]\n", + vfid); + return; + } + + vf_info->bulletin.p_virt->vxlan_udp_port = vxlan_port; + vf_info->bulletin.p_virt->geneve_udp_port = geneve_port; +} + +static bool qed_iov_vf_has_vport_instance(struct qed_hwfn *p_hwfn, int vfid) +{ + struct qed_vf_info *p_vf_info; + + p_vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true); + if (!p_vf_info) + return false; + + return !!p_vf_info->vport_instance; +} + +static bool qed_iov_is_vf_stopped(struct qed_hwfn *p_hwfn, int vfid) +{ + struct qed_vf_info *p_vf_info; + + p_vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true); + if (!p_vf_info) + return true; + + return p_vf_info->state == VF_STOPPED; +} + +static bool qed_iov_spoofchk_get(struct qed_hwfn *p_hwfn, int vfid) +{ + struct qed_vf_info *vf_info; + + vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true); + if (!vf_info) + return false; + + return vf_info->spoof_chk; +} + +static int qed_iov_spoofchk_set(struct qed_hwfn *p_hwfn, int vfid, bool val) +{ + struct qed_vf_info *vf; + int rc = -EINVAL; + + if (!qed_iov_pf_sanity_check(p_hwfn, vfid)) { + DP_NOTICE(p_hwfn, + "SR-IOV sanity check failed, can't set spoofchk\n"); + goto out; + } + + vf = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true); + if (!vf) + goto out; + + if (!qed_iov_vf_has_vport_instance(p_hwfn, vfid)) { + /* After VF VPORT start PF will configure spoof check */ + vf->req_spoofchk_val = val; + rc = 0; + goto out; + } + + rc = __qed_iov_spoofchk_set(p_hwfn, vf, val); + +out: + return rc; +} + +static u8 *qed_iov_bulletin_get_forced_mac(struct qed_hwfn *p_hwfn, + u16 rel_vf_id) +{ + struct qed_vf_info *p_vf; + + p_vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, true); + if (!p_vf || !p_vf->bulletin.p_virt) + return NULL; + + if (!(p_vf->bulletin.p_virt->valid_bitmap & BIT(MAC_ADDR_FORCED))) + return NULL; + + return p_vf->bulletin.p_virt->mac; +} + +static u16 +qed_iov_bulletin_get_forced_vlan(struct qed_hwfn *p_hwfn, u16 rel_vf_id) +{ + struct qed_vf_info *p_vf; + + p_vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, true); + if (!p_vf || !p_vf->bulletin.p_virt) + return 0; + + if (!(p_vf->bulletin.p_virt->valid_bitmap & BIT(VLAN_ADDR_FORCED))) + return 0; + + return p_vf->bulletin.p_virt->pvid; +} + +static int qed_iov_configure_tx_rate(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, int vfid, int val) +{ + struct qed_mcp_link_state *p_link; + struct qed_vf_info *vf; + u8 abs_vp_id = 0; + int rc; + + vf = qed_iov_get_vf_info(p_hwfn, (u16)vfid, true); + if (!vf) + return -EINVAL; + + rc = qed_fw_vport(p_hwfn, vf->vport_id, &abs_vp_id); + if (rc) + return rc; + + p_link = &QED_LEADING_HWFN(p_hwfn->cdev)->mcp_info->link_output; + + return qed_init_vport_rl(p_hwfn, p_ptt, abs_vp_id, (u32)val, + p_link->speed); +} + +static int +qed_iov_configure_min_tx_rate(struct qed_dev *cdev, int vfid, u32 rate) +{ + struct qed_vf_info *vf; + u8 vport_id; + int i; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + if 
(!qed_iov_pf_sanity_check(p_hwfn, vfid)) { + DP_NOTICE(p_hwfn, + "SR-IOV sanity check failed, can't set min rate\n"); + return -EINVAL; + } + } + + vf = qed_iov_get_vf_info(QED_LEADING_HWFN(cdev), (u16)vfid, true); + vport_id = vf->vport_id; + + return qed_configure_vport_wfq(cdev, vport_id, rate); +} + +static int qed_iov_get_vf_min_rate(struct qed_hwfn *p_hwfn, int vfid) +{ + struct qed_wfq_data *vf_vp_wfq; + struct qed_vf_info *vf_info; + + vf_info = qed_iov_get_vf_info(p_hwfn, (u16) vfid, true); + if (!vf_info) + return 0; + + vf_vp_wfq = &p_hwfn->qm_info.wfq_data[vf_info->vport_id]; + + if (vf_vp_wfq->configured) + return vf_vp_wfq->min_speed; + else + return 0; +} + +/** + * qed_schedule_iov - schedules IOV task for VF and PF + * @hwfn: hardware function pointer + * @flag: IOV flag for VF/PF + */ +void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag) +{ + smp_mb__before_atomic(); + set_bit(flag, &hwfn->iov_task_flags); + smp_mb__after_atomic(); + DP_VERBOSE(hwfn, QED_MSG_IOV, "Scheduling iov task [Flag: %d]\n", flag); + queue_delayed_work(hwfn->iov_wq, &hwfn->iov_task, 0); +} + +void qed_vf_start_iov_wq(struct qed_dev *cdev) +{ + int i; + + for_each_hwfn(cdev, i) + queue_delayed_work(cdev->hwfns[i].iov_wq, + &cdev->hwfns[i].iov_task, 0); +} + +int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled) +{ + int i, j; + + for_each_hwfn(cdev, i) + if (cdev->hwfns[i].iov_wq) + flush_workqueue(cdev->hwfns[i].iov_wq); + + /* Mark VFs for disablement */ + qed_iov_set_vfs_to_disable(cdev, true); + + if (cdev->p_iov_info && cdev->p_iov_info->num_vfs && pci_enabled) + pci_disable_sriov(cdev->pdev); + + for_each_hwfn(cdev, i) { + struct qed_hwfn *hwfn = &cdev->hwfns[i]; + struct qed_ptt *ptt = qed_ptt_acquire(hwfn); + + /* Failure to acquire the ptt in 100g creates an odd error + * where the first engine has already relased IOV. + */ + if (!ptt) { + DP_ERR(hwfn, "Failed to acquire ptt\n"); + return -EBUSY; + } + + /* Clean WFQ db and configure equal weight for all vports */ + qed_clean_wfq_db(hwfn, ptt); + + qed_for_each_vf(hwfn, j) { + int k; + + if (!qed_iov_is_valid_vfid(hwfn, j, true, false)) + continue; + + /* Wait until VF is disabled before releasing */ + for (k = 0; k < 100; k++) { + if (!qed_iov_is_vf_stopped(hwfn, j)) + msleep(20); + else + break; + } + + if (k < 100) + qed_iov_release_hw_for_vf(&cdev->hwfns[i], + ptt, j); + else + DP_ERR(hwfn, + "Timeout waiting for VF's FLR to end\n"); + } + + qed_ptt_release(hwfn, ptt); + } + + qed_iov_set_vfs_to_disable(cdev, false); + + return 0; +} + +static void qed_sriov_enable_qid_config(struct qed_hwfn *hwfn, + u16 vfid, + struct qed_iov_vf_init_params *params) +{ + u16 base, i; + + /* Since we have an equal resource distribution per-VF, and we assume + * PF has acquired the QED_PF_L2_QUE first queues, we start setting + * sequentially from there. 
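+ * For example, with 64 PF L2 queues and 4 queues per VF (illustrative
+ * numbers only), VF2 would be assigned queue-zones 72-75.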
+ */ + base = FEAT_NUM(hwfn, QED_PF_L2_QUE) + vfid * params->num_queues; + + params->rel_vf_id = vfid; + for (i = 0; i < params->num_queues; i++) { + params->req_rx_queue[i] = base + i; + params->req_tx_queue[i] = base + i; + } +} + +static int qed_sriov_enable(struct qed_dev *cdev, int num) +{ + struct qed_iov_vf_init_params params; + int i, j, rc; + + if (num >= RESC_NUM(&cdev->hwfns[0], QED_VPORT)) { + DP_NOTICE(cdev, "Can start at most %d VFs\n", + RESC_NUM(&cdev->hwfns[0], QED_VPORT) - 1); + return -EINVAL; + } + + memset(¶ms, 0, sizeof(params)); + + /* Initialize HW for VF access */ + for_each_hwfn(cdev, j) { + struct qed_hwfn *hwfn = &cdev->hwfns[j]; + struct qed_ptt *ptt = qed_ptt_acquire(hwfn); + + /* Make sure not to use more than 16 queues per VF */ + params.num_queues = min_t(int, + FEAT_NUM(hwfn, QED_VF_L2_QUE) / num, + 16); + + if (!ptt) { + DP_ERR(hwfn, "Failed to acquire ptt\n"); + rc = -EBUSY; + goto err; + } + + for (i = 0; i < num; i++) { + if (!qed_iov_is_valid_vfid(hwfn, i, false, true)) + continue; + + qed_sriov_enable_qid_config(hwfn, i, ¶ms); + rc = qed_iov_init_hw_for_vf(hwfn, ptt, ¶ms); + if (rc) { + DP_ERR(cdev, "Failed to enable VF[%d]\n", i); + qed_ptt_release(hwfn, ptt); + goto err; + } + } + + qed_ptt_release(hwfn, ptt); + } + + /* Enable SRIOV PCIe functions */ + rc = pci_enable_sriov(cdev->pdev, num); + if (rc) { + DP_ERR(cdev, "Failed to enable sriov [%d]\n", rc); + goto err; + } + + return num; + +err: + qed_sriov_disable(cdev, false); + return rc; +} + +static int qed_sriov_configure(struct qed_dev *cdev, int num_vfs_param) +{ + if (!IS_QED_SRIOV(cdev)) { + DP_VERBOSE(cdev, QED_MSG_IOV, "SR-IOV is not supported\n"); + return -EOPNOTSUPP; + } + + if (num_vfs_param) + return qed_sriov_enable(cdev, num_vfs_param); + else + return qed_sriov_disable(cdev, true); +} + +static int qed_sriov_pf_set_mac(struct qed_dev *cdev, u8 *mac, int vfid) +{ + int i; + + if (!IS_QED_SRIOV(cdev) || !IS_PF_SRIOV_ALLOC(&cdev->hwfns[0])) { + DP_VERBOSE(cdev, QED_MSG_IOV, + "Cannot set a VF MAC; Sriov is not enabled\n"); + return -EINVAL; + } + + if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vfid, true, true)) { + DP_VERBOSE(cdev, QED_MSG_IOV, + "Cannot set VF[%d] MAC (VF is not active)\n", vfid); + return -EINVAL; + } + + for_each_hwfn(cdev, i) { + struct qed_hwfn *hwfn = &cdev->hwfns[i]; + struct qed_public_vf_info *vf_info; + + vf_info = qed_iov_get_public_vf_info(hwfn, vfid, true); + if (!vf_info) + continue; + + /* Set the forced MAC, and schedule the IOV task */ + ether_addr_copy(vf_info->forced_mac, mac); + qed_schedule_iov(hwfn, QED_IOV_WQ_SET_UNICAST_FILTER_FLAG); + } + + return 0; +} + +static int qed_sriov_pf_set_vlan(struct qed_dev *cdev, u16 vid, int vfid) +{ + int i; + + if (!IS_QED_SRIOV(cdev) || !IS_PF_SRIOV_ALLOC(&cdev->hwfns[0])) { + DP_VERBOSE(cdev, QED_MSG_IOV, + "Cannot set a VF MAC; Sriov is not enabled\n"); + return -EINVAL; + } + + if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vfid, true, true)) { + DP_VERBOSE(cdev, QED_MSG_IOV, + "Cannot set VF[%d] MAC (VF is not active)\n", vfid); + return -EINVAL; + } + + for_each_hwfn(cdev, i) { + struct qed_hwfn *hwfn = &cdev->hwfns[i]; + struct qed_public_vf_info *vf_info; + + vf_info = qed_iov_get_public_vf_info(hwfn, vfid, true); + if (!vf_info) + continue; + + /* Set the forced vlan, and schedule the IOV task */ + vf_info->forced_vlan = vid; + qed_schedule_iov(hwfn, QED_IOV_WQ_SET_UNICAST_FILTER_FLAG); + } + + return 0; +} + +static int qed_get_vf_config(struct qed_dev *cdev, + int vf_id, struct ifla_vf_info *ivi) +{ 
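+ /* Gather the VF's current configuration (MAC, pvid, spoofchk, link
+ * state and Tx rates) into the ifla_vf_info structure for the caller.
+ */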
+ struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_public_vf_info *vf_info; + struct qed_mcp_link_state link; + u32 tx_rate; + + /* Sanitize request */ + if (IS_VF(cdev)) + return -EINVAL; + + if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vf_id, true, false)) { + DP_VERBOSE(cdev, QED_MSG_IOV, + "VF index [%d] isn't active\n", vf_id); + return -EINVAL; + } + + vf_info = qed_iov_get_public_vf_info(hwfn, vf_id, true); + + qed_iov_get_link(hwfn, vf_id, NULL, &link, NULL); + + /* Fill information about VF */ + ivi->vf = vf_id; + + if (is_valid_ether_addr(vf_info->forced_mac)) + ether_addr_copy(ivi->mac, vf_info->forced_mac); + else + ether_addr_copy(ivi->mac, vf_info->mac); + + ivi->vlan = vf_info->forced_vlan; + ivi->spoofchk = qed_iov_spoofchk_get(hwfn, vf_id); + ivi->linkstate = vf_info->link_state; + tx_rate = vf_info->tx_rate; + ivi->max_tx_rate = tx_rate ? tx_rate : link.speed; + ivi->min_tx_rate = qed_iov_get_vf_min_rate(hwfn, vf_id); + + return 0; +} + +void qed_inform_vf_link_state(struct qed_hwfn *hwfn) +{ + struct qed_hwfn *lead_hwfn = QED_LEADING_HWFN(hwfn->cdev); + struct qed_mcp_link_capabilities caps; + struct qed_mcp_link_params params; + struct qed_mcp_link_state link; + int i; + + if (!hwfn->pf_iov_info) + return; + + /* Update bulletin of all future possible VFs with link configuration */ + for (i = 0; i < hwfn->cdev->p_iov_info->total_vfs; i++) { + struct qed_public_vf_info *vf_info; + + vf_info = qed_iov_get_public_vf_info(hwfn, i, false); + if (!vf_info) + continue; + + /* Only hwfn0 is actually interested in the link speed. + * But since only it would receive an MFW indication of link, + * need to take configuration from it - otherwise things like + * rate limiting for hwfn1 VF would not work. + */ + memcpy(¶ms, qed_mcp_get_link_params(lead_hwfn), + sizeof(params)); + memcpy(&link, qed_mcp_get_link_state(lead_hwfn), sizeof(link)); + memcpy(&caps, qed_mcp_get_link_capabilities(lead_hwfn), + sizeof(caps)); + + /* Modify link according to the VF's configured link state */ + switch (vf_info->link_state) { + case IFLA_VF_LINK_STATE_DISABLE: + link.link_up = false; + break; + case IFLA_VF_LINK_STATE_ENABLE: + link.link_up = true; + /* Set speed according to maximum supported by HW. + * that is 40G for regular devices and 100G for CMT + * mode devices. + */ + link.speed = (hwfn->cdev->num_hwfns > 1) ? 
+ 100000 : 40000; + default: + /* In auto mode pass PF link image to VF */ + break; + } + + if (link.link_up && vf_info->tx_rate) { + struct qed_ptt *ptt; + int rate; + + rate = min_t(int, vf_info->tx_rate, link.speed); + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) { + DP_NOTICE(hwfn, "Failed to acquire PTT\n"); + return; + } + + if (!qed_iov_configure_tx_rate(hwfn, ptt, i, rate)) { + vf_info->tx_rate = rate; + link.speed = rate; + } + + qed_ptt_release(hwfn, ptt); + } + + qed_iov_set_link(hwfn, i, ¶ms, &link, &caps); + } + + qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG); +} + +static int qed_set_vf_link_state(struct qed_dev *cdev, + int vf_id, int link_state) +{ + int i; + + /* Sanitize request */ + if (IS_VF(cdev)) + return -EINVAL; + + if (!qed_iov_is_valid_vfid(&cdev->hwfns[0], vf_id, true, true)) { + DP_VERBOSE(cdev, QED_MSG_IOV, + "VF index [%d] isn't active\n", vf_id); + return -EINVAL; + } + + /* Handle configuration of link state */ + for_each_hwfn(cdev, i) { + struct qed_hwfn *hwfn = &cdev->hwfns[i]; + struct qed_public_vf_info *vf; + + vf = qed_iov_get_public_vf_info(hwfn, vf_id, true); + if (!vf) + continue; + + if (vf->link_state == link_state) + continue; + + vf->link_state = link_state; + qed_inform_vf_link_state(&cdev->hwfns[i]); + } + + return 0; +} + +static int qed_spoof_configure(struct qed_dev *cdev, int vfid, bool val) +{ + int i, rc = -EINVAL; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + rc = qed_iov_spoofchk_set(p_hwfn, vfid, val); + if (rc) + break; + } + + return rc; +} + +static int qed_configure_max_vf_rate(struct qed_dev *cdev, int vfid, int rate) +{ + int i; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + struct qed_public_vf_info *vf; + + if (!qed_iov_pf_sanity_check(p_hwfn, vfid)) { + DP_NOTICE(p_hwfn, + "SR-IOV sanity check failed, can't set tx rate\n"); + return -EINVAL; + } + + vf = qed_iov_get_public_vf_info(p_hwfn, vfid, true); + + vf->tx_rate = rate; + + qed_inform_vf_link_state(p_hwfn); + } + + return 0; +} + +static int qed_set_vf_rate(struct qed_dev *cdev, + int vfid, u32 min_rate, u32 max_rate) +{ + int rc_min = 0, rc_max = 0; + + if (max_rate) + rc_max = qed_configure_max_vf_rate(cdev, vfid, max_rate); + + if (min_rate) + rc_min = qed_iov_configure_min_tx_rate(cdev, vfid, min_rate); + + if (rc_max | rc_min) + return -EINVAL; + + return 0; +} + +static int qed_set_vf_trust(struct qed_dev *cdev, int vfid, bool trust) +{ + int i; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *hwfn = &cdev->hwfns[i]; + struct qed_public_vf_info *vf; + + if (!qed_iov_pf_sanity_check(hwfn, vfid)) { + DP_NOTICE(hwfn, + "SR-IOV sanity check failed, can't set trust\n"); + return -EINVAL; + } + + vf = qed_iov_get_public_vf_info(hwfn, vfid, true); + + if (vf->is_trusted_request == trust) + return 0; + vf->is_trusted_request = trust; + + qed_schedule_iov(hwfn, QED_IOV_WQ_TRUST_FLAG); + } + + return 0; +} + +static void qed_handle_vf_msg(struct qed_hwfn *hwfn) +{ + u64 events[QED_VF_ARRAY_LENGTH]; + struct qed_ptt *ptt; + int i; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) { + DP_VERBOSE(hwfn, QED_MSG_IOV, + "Can't acquire PTT; re-scheduling\n"); + qed_schedule_iov(hwfn, QED_IOV_WQ_MSG_FLAG); + return; + } + + qed_iov_pf_get_pending_events(hwfn, events); + + DP_VERBOSE(hwfn, QED_MSG_IOV, + "Event mask of VF events: 0x%llx 0x%llx 0x%llx\n", + events[0], events[1], events[2]); + + qed_for_each_vf(hwfn, i) { + /* Skip VFs with no pending messages */ + if (!(events[i / 64] & (1ULL << (i % 64)))) + continue; + + 
DP_VERBOSE(hwfn, QED_MSG_IOV, + "Handling VF message from VF 0x%02x [Abs 0x%02x]\n", + i, hwfn->cdev->p_iov_info->first_vf_in_pf + i); + + /* Copy VF's message to PF's request buffer for that VF */ + if (qed_iov_copy_vf_msg(hwfn, ptt, i)) + continue; + + qed_iov_process_mbx_req(hwfn, ptt, i); + } + + qed_ptt_release(hwfn, ptt); +} + +static void qed_handle_pf_set_vf_unicast(struct qed_hwfn *hwfn) +{ + int i; + + qed_for_each_vf(hwfn, i) { + struct qed_public_vf_info *info; + bool update = false; + u8 *mac; + + info = qed_iov_get_public_vf_info(hwfn, i, true); + if (!info) + continue; + + /* Update data on bulletin board */ + mac = qed_iov_bulletin_get_forced_mac(hwfn, i); + if (is_valid_ether_addr(info->forced_mac) && + (!mac || !ether_addr_equal(mac, info->forced_mac))) { + DP_VERBOSE(hwfn, + QED_MSG_IOV, + "Handling PF setting of VF MAC to VF 0x%02x [Abs 0x%02x]\n", + i, + hwfn->cdev->p_iov_info->first_vf_in_pf + i); + + /* Update bulletin board with forced MAC */ + qed_iov_bulletin_set_forced_mac(hwfn, + info->forced_mac, i); + update = true; + } + + if (qed_iov_bulletin_get_forced_vlan(hwfn, i) ^ + info->forced_vlan) { + DP_VERBOSE(hwfn, + QED_MSG_IOV, + "Handling PF setting of pvid [0x%04x] to VF 0x%02x [Abs 0x%02x]\n", + info->forced_vlan, + i, + hwfn->cdev->p_iov_info->first_vf_in_pf + i); + qed_iov_bulletin_set_forced_vlan(hwfn, + info->forced_vlan, i); + update = true; + } + + if (update) + qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG); + } +} + +static void qed_handle_bulletin_post(struct qed_hwfn *hwfn) +{ + struct qed_ptt *ptt; + int i; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) { + DP_NOTICE(hwfn, "Failed allocating a ptt entry\n"); + qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG); + return; + } + + qed_for_each_vf(hwfn, i) + qed_iov_post_vf_bulletin(hwfn, i, ptt); + + qed_ptt_release(hwfn, ptt); +} + +static void qed_iov_handle_trust_change(struct qed_hwfn *hwfn) +{ + struct qed_sp_vport_update_params params; + struct qed_filter_accept_flags *flags; + struct qed_public_vf_info *vf_info; + struct qed_vf_info *vf; + u8 mask; + int i; + + mask = QED_ACCEPT_UCAST_UNMATCHED | QED_ACCEPT_MCAST_UNMATCHED; + flags = ¶ms.accept_flags; + + qed_for_each_vf(hwfn, i) { + /* Need to make sure current requested configuration didn't + * flip so that we'll end up configuring something that's not + * needed. 
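+ * In other words, skip any VF whose requested trust state already
+ * matches the state that was last configured.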
+ */ + vf_info = qed_iov_get_public_vf_info(hwfn, i, true); + if (vf_info->is_trusted_configured == + vf_info->is_trusted_request) + continue; + vf_info->is_trusted_configured = vf_info->is_trusted_request; + + /* Validate that the VF has a configured vport */ + vf = qed_iov_get_vf_info(hwfn, i, true); + if (!vf->vport_instance) + continue; + + memset(¶ms, 0, sizeof(params)); + params.opaque_fid = vf->opaque_fid; + params.vport_id = vf->vport_id; + + if (vf_info->rx_accept_mode & mask) { + flags->update_rx_mode_config = 1; + flags->rx_accept_filter = vf_info->rx_accept_mode; + } + + if (vf_info->tx_accept_mode & mask) { + flags->update_tx_mode_config = 1; + flags->tx_accept_filter = vf_info->tx_accept_mode; + } + + /* Remove if needed; Otherwise this would set the mask */ + if (!vf_info->is_trusted_configured) { + flags->rx_accept_filter &= ~mask; + flags->tx_accept_filter &= ~mask; + } + + if (flags->update_rx_mode_config || + flags->update_tx_mode_config) + qed_sp_vport_update(hwfn, ¶ms, + QED_SPQ_MODE_EBLOCK, NULL); + } +} + +static void qed_iov_pf_task(struct work_struct *work) + +{ + struct qed_hwfn *hwfn = container_of(work, struct qed_hwfn, + iov_task.work); + int rc; + + if (test_and_clear_bit(QED_IOV_WQ_STOP_WQ_FLAG, &hwfn->iov_task_flags)) + return; + + if (test_and_clear_bit(QED_IOV_WQ_FLR_FLAG, &hwfn->iov_task_flags)) { + struct qed_ptt *ptt = qed_ptt_acquire(hwfn); + + if (!ptt) { + qed_schedule_iov(hwfn, QED_IOV_WQ_FLR_FLAG); + return; + } + + rc = qed_iov_vf_flr_cleanup(hwfn, ptt); + if (rc) + qed_schedule_iov(hwfn, QED_IOV_WQ_FLR_FLAG); + + qed_ptt_release(hwfn, ptt); + } + + if (test_and_clear_bit(QED_IOV_WQ_MSG_FLAG, &hwfn->iov_task_flags)) + qed_handle_vf_msg(hwfn); + + if (test_and_clear_bit(QED_IOV_WQ_SET_UNICAST_FILTER_FLAG, + &hwfn->iov_task_flags)) + qed_handle_pf_set_vf_unicast(hwfn); + + if (test_and_clear_bit(QED_IOV_WQ_BULLETIN_UPDATE_FLAG, + &hwfn->iov_task_flags)) + qed_handle_bulletin_post(hwfn); + + if (test_and_clear_bit(QED_IOV_WQ_TRUST_FLAG, &hwfn->iov_task_flags)) + qed_iov_handle_trust_change(hwfn); +} + +void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first) +{ + int i; + + for_each_hwfn(cdev, i) { + if (!cdev->hwfns[i].iov_wq) + continue; + + if (schedule_first) { + qed_schedule_iov(&cdev->hwfns[i], + QED_IOV_WQ_STOP_WQ_FLAG); + cancel_delayed_work_sync(&cdev->hwfns[i].iov_task); + } + + flush_workqueue(cdev->hwfns[i].iov_wq); + destroy_workqueue(cdev->hwfns[i].iov_wq); + } +} + +int qed_iov_wq_start(struct qed_dev *cdev) +{ + char name[NAME_SIZE]; + int i; + + for_each_hwfn(cdev, i) { + struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; + + /* PFs needs a dedicated workqueue only if they support IOV. + * VFs always require one. 
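+ * The queue runs qed_iov_pf_task() on a PF and qed_iov_vf_task() on a VF.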
+ */ + if (IS_PF(p_hwfn->cdev) && !IS_PF_SRIOV(p_hwfn)) + continue; + + snprintf(name, NAME_SIZE, "iov-%02x:%02x.%02x", + cdev->pdev->bus->number, + PCI_SLOT(cdev->pdev->devfn), p_hwfn->abs_pf_id); + + p_hwfn->iov_wq = create_singlethread_workqueue(name); + if (!p_hwfn->iov_wq) { + DP_NOTICE(p_hwfn, "Cannot create iov workqueue\n"); + return -ENOMEM; + } + + if (IS_PF(cdev)) + INIT_DELAYED_WORK(&p_hwfn->iov_task, qed_iov_pf_task); + else + INIT_DELAYED_WORK(&p_hwfn->iov_task, qed_iov_vf_task); + } + + return 0; +} + +const struct qed_iov_hv_ops qed_iov_ops_pass = { + .configure = &qed_sriov_configure, + .set_mac = &qed_sriov_pf_set_mac, + .set_vlan = &qed_sriov_pf_set_vlan, + .get_config = &qed_get_vf_config, + .set_link_state = &qed_set_vf_link_state, + .set_spoof = &qed_spoof_configure, + .set_rate = &qed_set_vf_rate, + .set_trust = &qed_set_vf_trust, +}; diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h new file mode 100644 index 0000000..9a8fd79 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h @@ -0,0 +1,476 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _QED_SRIOV_H +#define _QED_SRIOV_H +#include +#include "qed_vf.h" + +#define QED_ETH_VF_NUM_MAC_FILTERS 1 +#define QED_ETH_VF_NUM_VLAN_FILTERS 2 +#define QED_VF_ARRAY_LENGTH (3) + +#ifdef CONFIG_QED_SRIOV +#define IS_VF(cdev) ((cdev)->b_is_vf) +#define IS_PF(cdev) (!((cdev)->b_is_vf)) +#define IS_PF_SRIOV(p_hwfn) (!!((p_hwfn)->cdev->p_iov_info)) +#else +#define IS_VF(cdev) (0) +#define IS_PF(cdev) (1) +#define IS_PF_SRIOV(p_hwfn) (0) +#endif +#define IS_PF_SRIOV_ALLOC(p_hwfn) (!!((p_hwfn)->pf_iov_info)) + +#define QED_MAX_VF_CHAINS_PER_PF 16 + +#define QED_ETH_MAX_VF_NUM_VLAN_FILTERS \ + (MAX_NUM_VFS * QED_ETH_VF_NUM_VLAN_FILTERS) + +enum qed_iov_vport_update_flag { + QED_IOV_VP_UPDATE_ACTIVATE, + QED_IOV_VP_UPDATE_VLAN_STRIP, + QED_IOV_VP_UPDATE_TX_SWITCH, + QED_IOV_VP_UPDATE_MCAST, + QED_IOV_VP_UPDATE_ACCEPT_PARAM, + QED_IOV_VP_UPDATE_RSS, + QED_IOV_VP_UPDATE_ACCEPT_ANY_VLAN, + QED_IOV_VP_UPDATE_SGE_TPA, + QED_IOV_VP_UPDATE_MAX, +}; + +struct qed_public_vf_info { + /* These copies will later be reflected in the bulletin board, + * but this copy should be newer. + */ + u8 forced_mac[ETH_ALEN]; + u16 forced_vlan; + u8 mac[ETH_ALEN]; + + /* IFLA_VF_LINK_STATE_ */ + int link_state; + + /* Currently configured Tx rate in MB/sec. 0 if unconfigured */ + int tx_rate; + + /* Trusted VFs can configure promiscuous mode. + * Also store shadow promisc configuration if needed. + */ + bool is_trusted_configured; + bool is_trusted_request; + u8 rx_accept_mode; + u8 tx_accept_mode; +}; + +struct qed_iov_vf_init_params { + u16 rel_vf_id; + + /* Number of requested Queues; Currently, don't support different + * number of Rx/Tx queues. + */ + + u16 num_queues; + + /* Allow the client to choose which qzones to use for Rx/Tx, + * and which queue_base to use for Tx queues on a per-queue basis. + * Notice values should be relative to the PF resources. + */ + u16 req_rx_queue[QED_MAX_VF_CHAINS_PER_PF]; + u16 req_tx_queue[QED_MAX_VF_CHAINS_PER_PF]; +}; + +/* This struct is part of qed_dev and contains data relevant to all hwfns; + * Initialized only if SR-IOV cpabability is exposed in PCIe config space. + */ +struct qed_hw_sriov_info { + int pos; /* capability position */ + int nres; /* number of resources */ + u32 cap; /* SR-IOV Capabilities */ + u16 ctrl; /* SR-IOV Control */ + u16 total_vfs; /* total VFs associated with the PF */ + u16 num_vfs; /* number of vfs that have been started */ + u16 initial_vfs; /* initial VFs associated with the PF */ + u16 nr_virtfn; /* number of VFs available */ + u16 offset; /* first VF Routing ID offset */ + u16 stride; /* following VF stride */ + u16 vf_device_id; /* VF device id */ + u32 pgsz; /* page size for BAR alignment */ + u8 link; /* Function Dependency Link */ + + u32 first_vf_in_pf; +}; + +/* This mailbox is maintained per VF in its PF contains all information + * required for sending / receiving a message. 
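+ * It holds the request/reply DMA buffers as well as the address of the
+ * VF's pending request.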
+ */ +struct qed_iov_vf_mbx { + union vfpf_tlvs *req_virt; + dma_addr_t req_phys; + union pfvf_tlvs *reply_virt; + dma_addr_t reply_phys; + + /* Address in VF where a pending message is located */ + dma_addr_t pending_req; + + /* Message from VF awaits handling */ + bool b_pending_msg; + + u8 *offset; + + /* saved VF request header */ + struct vfpf_first_tlv first_tlv; +}; + +#define QED_IOV_LEGACY_QID_RX (0) +#define QED_IOV_LEGACY_QID_TX (1) +#define QED_IOV_QID_INVALID (0xFE) + +struct qed_vf_queue_cid { + bool b_is_tx; + struct qed_queue_cid *p_cid; +}; + +/* Describes a qzone associated with the VF */ +struct qed_vf_queue { + u16 fw_rx_qid; + u16 fw_tx_qid; + + struct qed_vf_queue_cid cids[MAX_QUEUES_PER_QZONE]; +}; + +enum vf_state { + VF_FREE = 0, /* VF ready to be acquired holds no resc */ + VF_ACQUIRED, /* VF, acquired, but not initalized */ + VF_ENABLED, /* VF, Enabled */ + VF_RESET, /* VF, FLR'd, pending cleanup */ + VF_STOPPED /* VF, Stopped */ +}; + +struct qed_vf_vlan_shadow { + bool used; + u16 vid; +}; + +struct qed_vf_shadow_config { + /* Shadow copy of all guest vlans */ + struct qed_vf_vlan_shadow vlans[QED_ETH_VF_NUM_VLAN_FILTERS + 1]; + + /* Shadow copy of all configured MACs; Empty if forcing MACs */ + u8 macs[QED_ETH_VF_NUM_MAC_FILTERS][ETH_ALEN]; + u8 inner_vlan_removal; +}; + +/* PFs maintain an array of this structure, per VF */ +struct qed_vf_info { + struct qed_iov_vf_mbx vf_mbx; + enum vf_state state; + bool b_init; + bool b_malicious; + u8 to_disable; + + struct qed_bulletin bulletin; + dma_addr_t vf_bulletin; + + /* PF saves a copy of the last VF acquire message */ + struct vfpf_acquire_tlv acquire; + + u32 concrete_fid; + u16 opaque_fid; + u16 mtu; + + u8 vport_id; + u8 relative_vf_id; + u8 abs_vf_id; +#define QED_VF_ABS_ID(p_hwfn, p_vf) (QED_PATH_ID(p_hwfn) ? \ + (p_vf)->abs_vf_id + MAX_NUM_VFS_BB : \ + (p_vf)->abs_vf_id) + + u8 vport_instance; + u8 num_rxqs; + u8 num_txqs; + + u16 rx_coal; + u16 tx_coal; + + u8 num_sbs; + + u8 num_mac_filters; + u8 num_vlan_filters; + + struct qed_vf_queue vf_queues[QED_MAX_VF_CHAINS_PER_PF]; + u16 igu_sbs[QED_MAX_VF_CHAINS_PER_PF]; + u8 num_active_rxqs; + struct qed_public_vf_info p_vf_info; + bool spoof_chk; + bool req_spoofchk_val; + + /* Stores the configuration requested by VF */ + struct qed_vf_shadow_config shadow_config; + + /* A bitfield using bulletin's valid-map bits, used to indicate + * which of the bulletin board features have been configured. + */ + u64 configured_features; +#define QED_IOV_CONFIGURED_FEATURES_MASK ((1 << MAC_ADDR_FORCED) | \ + (1 << VLAN_ADDR_FORCED)) +}; + +/* This structure is part of qed_hwfn and used only for PFs that have sriov + * capability enabled. + */ +struct qed_pf_iov { + struct qed_vf_info vfs_array[MAX_NUM_VFS]; + u64 pending_flr[QED_VF_ARRAY_LENGTH]; + + /* Allocate message address continuosuly and split to each VF */ + void *mbx_msg_virt_addr; + dma_addr_t mbx_msg_phys_addr; + u32 mbx_msg_size; + void *mbx_reply_virt_addr; + dma_addr_t mbx_reply_phys_addr; + u32 mbx_reply_size; + void *p_bulletins; + dma_addr_t bulletins_phys; + u32 bulletins_size; +}; + +enum qed_iov_wq_flag { + QED_IOV_WQ_MSG_FLAG, + QED_IOV_WQ_SET_UNICAST_FILTER_FLAG, + QED_IOV_WQ_BULLETIN_UPDATE_FLAG, + QED_IOV_WQ_STOP_WQ_FLAG, + QED_IOV_WQ_FLR_FLAG, + QED_IOV_WQ_TRUST_FLAG, + QED_IOV_WQ_VF_FORCE_LINK_QUERY_FLAG, +}; + +#ifdef CONFIG_QED_SRIOV +/** + * @brief Check if given VF ID @vfid is valid + * w.r.t. 
@b_enabled_only value + * if b_enabled_only = true - only enabled VF id is valid + * else any VF id less than max_vfs is valid + * + * @param p_hwfn + * @param rel_vf_id - Relative VF ID + * @param b_enabled_only - consider only enabled VF + * @param b_non_malicious - true iff we want to validate vf isn't malicious. + * + * @return bool - true for valid VF ID + */ +bool qed_iov_is_valid_vfid(struct qed_hwfn *p_hwfn, + int rel_vf_id, + bool b_enabled_only, bool b_non_malicious); + +/** + * @brief - Given a VF index, return index of next [including that] active VF. + * + * @param p_hwfn + * @param rel_vf_id + * + * @return MAX_NUM_VFS in case no further active VFs, otherwise index. + */ +u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, u16 rel_vf_id); + +void qed_iov_bulletin_set_udp_ports(struct qed_hwfn *p_hwfn, + int vfid, u16 vxlan_port, u16 geneve_port); + +/** + * @brief Read sriov related information and allocated resources + * reads from configuraiton space, shmem, etc. + * + * @param p_hwfn + * + * @return int + */ +int qed_iov_hw_info(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_add_tlv - place a given tlv on the tlv buffer at next offset + * + * @param p_hwfn + * @param p_iov + * @param type + * @param length + * + * @return pointer to the newly placed tlv + */ +void *qed_add_tlv(struct qed_hwfn *p_hwfn, u8 **offset, u16 type, u16 length); + +/** + * @brief list the types and lengths of the tlvs on the buffer + * + * @param p_hwfn + * @param tlvs_list + */ +void qed_dp_tlv_list(struct qed_hwfn *p_hwfn, void *tlvs_list); + +/** + * @brief qed_iov_alloc - allocate sriov related resources + * + * @param p_hwfn + * + * @return int + */ +int qed_iov_alloc(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_iov_setup - setup sriov related resources + * + * @param p_hwfn + */ +void qed_iov_setup(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_iov_free - free sriov related resources + * + * @param p_hwfn + */ +void qed_iov_free(struct qed_hwfn *p_hwfn); + +/** + * @brief free sriov related memory that was allocated during hw_prepare + * + * @param cdev + */ +void qed_iov_free_hw_info(struct qed_dev *cdev); + +/** + * @brief Mark structs of vfs that have been FLR-ed. + * + * @param p_hwfn + * @param disabled_vfs - bitmask of all VFs on path that were FLRed + * + * @return true iff one of the PF's vfs got FLRed. false otherwise. + */ +bool qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn, u32 *disabled_vfs); + +/** + * @brief Search extended TLVs in request/reply buffer. + * + * @param p_hwfn + * @param p_tlvs_list - Pointer to tlvs list + * @param req_type - Type of TLV + * + * @return pointer to tlv type if found, otherwise returns NULL. 
+ */ +void *qed_iov_search_list_tlvs(struct qed_hwfn *p_hwfn, + void *p_tlvs_list, u16 req_type); + +void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first); +int qed_iov_wq_start(struct qed_dev *cdev); + +void qed_schedule_iov(struct qed_hwfn *hwfn, enum qed_iov_wq_flag flag); +void qed_vf_start_iov_wq(struct qed_dev *cdev); +int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled); +void qed_inform_vf_link_state(struct qed_hwfn *hwfn); +#else +static inline bool +qed_iov_is_valid_vfid(struct qed_hwfn *p_hwfn, + int rel_vf_id, bool b_enabled_only, bool b_non_malicious) +{ + return false; +} + +static inline u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, + u16 rel_vf_id) +{ + return MAX_NUM_VFS; +} + +static inline void +qed_iov_bulletin_set_udp_ports(struct qed_hwfn *p_hwfn, int vfid, + u16 vxlan_port, u16 geneve_port) +{ +} + +static inline int qed_iov_hw_info(struct qed_hwfn *p_hwfn) +{ + return 0; +} + +static inline int qed_iov_alloc(struct qed_hwfn *p_hwfn) +{ + return 0; +} + +static inline void qed_iov_setup(struct qed_hwfn *p_hwfn) +{ +} + +static inline void qed_iov_free(struct qed_hwfn *p_hwfn) +{ +} + +static inline void qed_iov_free_hw_info(struct qed_dev *cdev) +{ +} + +static inline bool qed_iov_mark_vf_flr(struct qed_hwfn *p_hwfn, + u32 *disabled_vfs) +{ + return false; +} + +static inline void qed_iov_wq_stop(struct qed_dev *cdev, bool schedule_first) +{ +} + +static inline int qed_iov_wq_start(struct qed_dev *cdev) +{ + return 0; +} + +static inline void qed_schedule_iov(struct qed_hwfn *hwfn, + enum qed_iov_wq_flag flag) +{ +} + +static inline void qed_vf_start_iov_wq(struct qed_dev *cdev) +{ +} + +static inline int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled) +{ + return 0; +} + +static inline void qed_inform_vf_link_state(struct qed_hwfn *hwfn) +{ +} +#endif + +#define qed_for_each_vf(_p_hwfn, _i) \ + for (_i = qed_iov_get_next_active_vf(_p_hwfn, 0); \ + _i < MAX_NUM_VFS; \ + _i = qed_iov_get_next_active_vf(_p_hwfn, _i + 1)) + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c new file mode 100644 index 0000000..91b5e9f --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -0,0 +1,1685 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "qed.h" +#include "qed_sriov.h" +#include "qed_vf.h" + +static void *qed_vf_pf_prep(struct qed_hwfn *p_hwfn, u16 type, u16 length) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + void *p_tlv; + + /* This lock is released when we receive PF's response + * in qed_send_msg2pf(). + * So, qed_vf_pf_prep() and qed_send_msg2pf() + * must come in sequence. + */ + mutex_lock(&(p_iov->mutex)); + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "preparing to send 0x%04x tlv over vf pf channel\n", + type); + + /* Reset Requst offset */ + p_iov->offset = (u8 *)p_iov->vf2pf_request; + + /* Clear mailbox - both request and reply */ + memset(p_iov->vf2pf_request, 0, sizeof(union vfpf_tlvs)); + memset(p_iov->pf2vf_reply, 0, sizeof(union pfvf_tlvs)); + + /* Init type and length */ + p_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, type, length); + + /* Init first tlv header */ + ((struct vfpf_first_tlv *)p_tlv)->reply_address = + (u64)p_iov->pf2vf_reply_phys; + + return p_tlv; +} + +static void qed_vf_pf_req_end(struct qed_hwfn *p_hwfn, int req_status) +{ + union pfvf_tlvs *resp = p_hwfn->vf_iov_info->pf2vf_reply; + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF request status = 0x%x, PF reply status = 0x%x\n", + req_status, resp->default_resp.hdr.status); + + mutex_unlock(&(p_hwfn->vf_iov_info->mutex)); +} + +static int qed_send_msg2pf(struct qed_hwfn *p_hwfn, u8 *done, u32 resp_size) +{ + union vfpf_tlvs *p_req = p_hwfn->vf_iov_info->vf2pf_request; + struct ustorm_trigger_vf_zone trigger; + struct ustorm_vf_zone *zone_data; + int rc = 0, time = 100; + + zone_data = (struct ustorm_vf_zone *)PXP_VF_BAR0_START_USDM_ZONE_B; + + /* output tlvs list */ + qed_dp_tlv_list(p_hwfn, p_req); + + /* need to add the END TLV to the message size */ + resp_size += sizeof(struct channel_list_end_tlv); + + /* Send TLVs over HW channel */ + memset(&trigger, 0, sizeof(struct ustorm_trigger_vf_zone)); + trigger.vf_pf_msg_valid = 1; + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF -> PF [%02x] message: [%08x, %08x] --> %p, %08x --> %p\n", + GET_FIELD(p_hwfn->hw_info.concrete_fid, + PXP_CONCRETE_FID_PFID), + upper_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys), + lower_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys), + &zone_data->non_trigger.vf_pf_msg_addr, + *((u32 *)&trigger), &zone_data->trigger); + + REG_WR(p_hwfn, + (uintptr_t)&zone_data->non_trigger.vf_pf_msg_addr.lo, + lower_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys)); + + REG_WR(p_hwfn, + (uintptr_t)&zone_data->non_trigger.vf_pf_msg_addr.hi, + upper_32_bits(p_hwfn->vf_iov_info->vf2pf_request_phys)); + + /* The message data must be written first, to prevent trigger before + * data is written. + */ + wmb(); + + REG_WR(p_hwfn, (uintptr_t)&zone_data->trigger, *((u32 *)&trigger)); + + /* When PF would be done with the response, it would write back to the + * `done' address. Poll until then. 
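+ * With 100 iterations of msleep(25) below, this waits up to ~2.5 seconds.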
+ */ + while ((!*done) && time) { + msleep(25); + time--; + } + + if (!*done) { + DP_NOTICE(p_hwfn, + "VF <-- PF Timeout [Type %d]\n", + p_req->first_tlv.tl.type); + rc = -EBUSY; + } else { + if ((*done != PFVF_STATUS_SUCCESS) && + (*done != PFVF_STATUS_NO_RESOURCE)) + DP_NOTICE(p_hwfn, + "PF response: %d [Type %d]\n", + *done, p_req->first_tlv.tl.type); + else + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "PF response: %d [Type %d]\n", + *done, p_req->first_tlv.tl.type); + } + + return rc; +} + +static void qed_vf_pf_add_qid(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct vfpf_qid_tlv *p_qid_tlv; + + /* Only add QIDs for the queue if it was negotiated with PF */ + if (!(p_iov->acquire_resp.pfdev_info.capabilities & + PFVF_ACQUIRE_CAP_QUEUE_QIDS)) + return; + + p_qid_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_QID, sizeof(*p_qid_tlv)); + p_qid_tlv->qid = p_cid->qid_usage_idx; +} + +int _qed_vf_pf_release(struct qed_hwfn *p_hwfn, bool b_final) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_def_resp_tlv *resp; + struct vfpf_first_tlv *req; + u32 size; + int rc; + + /* clear mailbox and prep first tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_RELEASE, sizeof(*req)); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + resp = &p_iov->pf2vf_reply->default_resp; + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + + if (!rc && resp->hdr.status != PFVF_STATUS_SUCCESS) + rc = -EAGAIN; + + qed_vf_pf_req_end(p_hwfn, rc); + if (!b_final) + return rc; + + p_hwfn->b_int_enabled = 0; + + if (p_iov->vf2pf_request) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(union vfpf_tlvs), + p_iov->vf2pf_request, + p_iov->vf2pf_request_phys); + if (p_iov->pf2vf_reply) + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(union pfvf_tlvs), + p_iov->pf2vf_reply, p_iov->pf2vf_reply_phys); + + if (p_iov->bulletin.p_virt) { + size = sizeof(struct qed_bulletin_content); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + size, + p_iov->bulletin.p_virt, p_iov->bulletin.phys); + } + + kfree(p_hwfn->vf_iov_info); + p_hwfn->vf_iov_info = NULL; + + return rc; +} + +int qed_vf_pf_release(struct qed_hwfn *p_hwfn) +{ + return _qed_vf_pf_release(p_hwfn, true); +} + +#define VF_ACQUIRE_THRESH 3 +static void qed_vf_pf_acquire_reduce_resc(struct qed_hwfn *p_hwfn, + struct vf_pf_resc_request *p_req, + struct pf_vf_resc *p_resp) +{ + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "PF unwilling to fullill resource request: rxq [%02x/%02x] txq [%02x/%02x] sbs [%02x/%02x] mac [%02x/%02x] vlan [%02x/%02x] mc [%02x/%02x] cids [%02x/%02x]. 
Try PF recommended amount\n", + p_req->num_rxqs, + p_resp->num_rxqs, + p_req->num_rxqs, + p_resp->num_txqs, + p_req->num_sbs, + p_resp->num_sbs, + p_req->num_mac_filters, + p_resp->num_mac_filters, + p_req->num_vlan_filters, + p_resp->num_vlan_filters, + p_req->num_mc_filters, + p_resp->num_mc_filters, p_req->num_cids, p_resp->num_cids); + + /* humble our request */ + p_req->num_txqs = p_resp->num_txqs; + p_req->num_rxqs = p_resp->num_rxqs; + p_req->num_sbs = p_resp->num_sbs; + p_req->num_mac_filters = p_resp->num_mac_filters; + p_req->num_vlan_filters = p_resp->num_vlan_filters; + p_req->num_mc_filters = p_resp->num_mc_filters; + p_req->num_cids = p_resp->num_cids; +} + +static int qed_vf_pf_acquire(struct qed_hwfn *p_hwfn) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_acquire_resp_tlv *resp = &p_iov->pf2vf_reply->acquire_resp; + struct pf_vf_pfdev_info *pfdev_info = &resp->pfdev_info; + struct vf_pf_resc_request *p_resc; + bool resources_acquired = false; + struct vfpf_acquire_tlv *req; + int rc = 0, attempts = 0; + + /* clear mailbox and prep first tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_ACQUIRE, sizeof(*req)); + p_resc = &req->resc_request; + + /* starting filling the request */ + req->vfdev_info.opaque_fid = p_hwfn->hw_info.opaque_fid; + + p_resc->num_rxqs = QED_MAX_VF_CHAINS_PER_PF; + p_resc->num_txqs = QED_MAX_VF_CHAINS_PER_PF; + p_resc->num_sbs = QED_MAX_VF_CHAINS_PER_PF; + p_resc->num_mac_filters = QED_ETH_VF_NUM_MAC_FILTERS; + p_resc->num_vlan_filters = QED_ETH_VF_NUM_VLAN_FILTERS; + p_resc->num_cids = QED_ETH_VF_DEFAULT_NUM_CIDS; + + req->vfdev_info.os_type = VFPF_ACQUIRE_OS_LINUX; + req->vfdev_info.fw_major = FW_MAJOR_VERSION; + req->vfdev_info.fw_minor = FW_MINOR_VERSION; + req->vfdev_info.fw_revision = FW_REVISION_VERSION; + req->vfdev_info.fw_engineering = FW_ENGINEERING_VERSION; + req->vfdev_info.eth_fp_hsi_major = ETH_HSI_VER_MAJOR; + req->vfdev_info.eth_fp_hsi_minor = ETH_HSI_VER_MINOR; + + /* Fill capability field with any non-deprecated config we support */ + req->vfdev_info.capabilities |= VFPF_ACQUIRE_CAP_100G; + + /* If we've mapped the doorbell bar, try using queue qids */ + if (p_iov->b_doorbell_bar) { + req->vfdev_info.capabilities |= VFPF_ACQUIRE_CAP_PHYSICAL_BAR | + VFPF_ACQUIRE_CAP_QUEUE_QIDS; + p_resc->num_cids = QED_ETH_VF_MAX_NUM_CIDS; + } + + /* pf 2 vf bulletin board address */ + req->bulletin_addr = p_iov->bulletin.phys; + req->bulletin_size = p_iov->bulletin.size; + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + while (!resources_acquired) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, "attempting to acquire resources\n"); + + /* Clear response buffer, as this might be a re-send */ + memset(p_iov->pf2vf_reply, 0, sizeof(union pfvf_tlvs)); + + /* send acquire request */ + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + /* copy acquire response from buffer to p_hwfn */ + memcpy(&p_iov->acquire_resp, resp, sizeof(p_iov->acquire_resp)); + + attempts++; + + if (resp->hdr.status == PFVF_STATUS_SUCCESS) { + /* PF agrees to allocate our resources */ + if (!(resp->pfdev_info.capabilities & + PFVF_ACQUIRE_CAP_POST_FW_OVERRIDE)) { + /* It's possible legacy PF mistakenly accepted; + * but we don't care - simply mark it as + * legacy and continue. 
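+ * The VFPF_ACQUIRE_CAP_PRE_FP_HSI bit set below is what marks this
+ * channel as legacy.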
+ */ + req->vfdev_info.capabilities |= + VFPF_ACQUIRE_CAP_PRE_FP_HSI; + } + DP_VERBOSE(p_hwfn, QED_MSG_IOV, "resources acquired\n"); + resources_acquired = true; + } else if (resp->hdr.status == PFVF_STATUS_NO_RESOURCE && + attempts < VF_ACQUIRE_THRESH) { + qed_vf_pf_acquire_reduce_resc(p_hwfn, p_resc, + &resp->resc); + } else if (resp->hdr.status == PFVF_STATUS_NOT_SUPPORTED) { + if (pfdev_info->major_fp_hsi && + (pfdev_info->major_fp_hsi != ETH_HSI_VER_MAJOR)) { + DP_NOTICE(p_hwfn, + "PF uses an incompatible fastpath HSI %02x.%02x [VF requires %02x.%02x]. Please change to a VF driver using %02x.xx.\n", + pfdev_info->major_fp_hsi, + pfdev_info->minor_fp_hsi, + ETH_HSI_VER_MAJOR, + ETH_HSI_VER_MINOR, + pfdev_info->major_fp_hsi); + rc = -EINVAL; + goto exit; + } + + if (!pfdev_info->major_fp_hsi) { + if (req->vfdev_info.capabilities & + VFPF_ACQUIRE_CAP_PRE_FP_HSI) { + DP_NOTICE(p_hwfn, + "PF uses very old drivers. Please change to a VF driver using no later than 8.8.x.x.\n"); + rc = -EINVAL; + goto exit; + } else { + DP_INFO(p_hwfn, + "PF is old - try re-acquire to see if it supports FW-version override\n"); + req->vfdev_info.capabilities |= + VFPF_ACQUIRE_CAP_PRE_FP_HSI; + continue; + } + } + + /* If PF/VF are using same Major, PF must have had + * it's reasons. Simply fail. + */ + DP_NOTICE(p_hwfn, "PF rejected acquisition by VF\n"); + rc = -EINVAL; + goto exit; + } else { + DP_ERR(p_hwfn, + "PF returned error %d to VF acquisition request\n", + resp->hdr.status); + rc = -EAGAIN; + goto exit; + } + } + + /* Mark the PF as legacy, if needed */ + if (req->vfdev_info.capabilities & VFPF_ACQUIRE_CAP_PRE_FP_HSI) + p_iov->b_pre_fp_hsi = true; + + /* In case PF doesn't support multi-queue Tx, update the number of + * CIDs to reflect the number of queues [older PFs didn't fill that + * field]. + */ + if (!(resp->pfdev_info.capabilities & PFVF_ACQUIRE_CAP_QUEUE_QIDS)) + resp->resc.num_cids = resp->resc.num_rxqs + resp->resc.num_txqs; + + /* Update bulletin board size with response from PF */ + p_iov->bulletin.size = resp->bulletin_size; + + /* get HW info */ + p_hwfn->cdev->type = resp->pfdev_info.dev_type; + p_hwfn->cdev->chip_rev = resp->pfdev_info.chip_rev; + + p_hwfn->cdev->chip_num = pfdev_info->chip_num & 0xffff; + + /* Learn of the possibility of CMT */ + if (IS_LEAD_HWFN(p_hwfn)) { + if (resp->pfdev_info.capabilities & PFVF_ACQUIRE_CAP_100G) { + DP_NOTICE(p_hwfn, "100g VF\n"); + p_hwfn->cdev->num_hwfns = 2; + } + } + + if (!p_iov->b_pre_fp_hsi && + ETH_HSI_VER_MINOR && + (resp->pfdev_info.minor_fp_hsi < ETH_HSI_VER_MINOR)) { + DP_INFO(p_hwfn, + "PF is using older fastpath HSI; %02x.%02x is configured\n", + ETH_HSI_VER_MAJOR, resp->pfdev_info.minor_fp_hsi); + } + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +u32 qed_vf_hw_bar_size(struct qed_hwfn *p_hwfn, enum BAR_ID bar_id) +{ + u32 bar_size; + + /* Regview size is fixed */ + if (bar_id == BAR_ID_0) + return 1 << 17; + + /* Doorbell is received from PF */ + bar_size = p_hwfn->vf_iov_info->acquire_resp.pfdev_info.bar_size; + if (bar_size) + return 1 << bar_size; + return 0; +} + +int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn) +{ + struct qed_hwfn *p_lead = QED_LEADING_HWFN(p_hwfn->cdev); + struct qed_vf_iov *p_iov; + u32 reg; + int rc; + + /* Set number of hwfns - might be overriden once leading hwfn learns + * actual configuration from PF. 
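+ * (a CMT/100G PF reports PFVF_ACQUIRE_CAP_100G, which bumps num_hwfns to 2).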
+ */ + if (IS_LEAD_HWFN(p_hwfn)) + p_hwfn->cdev->num_hwfns = 1; + + reg = PXP_VF_BAR0_ME_OPAQUE_ADDRESS; + p_hwfn->hw_info.opaque_fid = (u16)REG_RD(p_hwfn, reg); + + reg = PXP_VF_BAR0_ME_CONCRETE_ADDRESS; + p_hwfn->hw_info.concrete_fid = REG_RD(p_hwfn, reg); + + /* Allocate vf sriov info */ + p_iov = kzalloc(sizeof(*p_iov), GFP_KERNEL); + if (!p_iov) + return -ENOMEM; + + /* Doorbells are tricky; Upper-layer has alreday set the hwfn doorbell + * value, but there are several incompatibily scenarios where that + * would be incorrect and we'd need to override it. + */ + if (!p_hwfn->doorbells) { + p_hwfn->doorbells = (u8 __iomem *)p_hwfn->regview + + PXP_VF_BAR0_START_DQ; + } else if (p_hwfn == p_lead) { + /* For leading hw-function, value is always correct, but need + * to handle scenario where legacy PF would not support 100g + * mapped bars later. + */ + p_iov->b_doorbell_bar = true; + } else { + /* here, value would be correct ONLY if the leading hwfn + * received indication that mapped-bars are supported. + */ + if (p_lead->vf_iov_info->b_doorbell_bar) + p_iov->b_doorbell_bar = true; + else + p_hwfn->doorbells = (u8 __iomem *) + p_hwfn->regview + PXP_VF_BAR0_START_DQ; + } + + /* Allocate vf2pf msg */ + p_iov->vf2pf_request = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(union vfpf_tlvs), + &p_iov->vf2pf_request_phys, + GFP_KERNEL); + if (!p_iov->vf2pf_request) + goto free_p_iov; + + p_iov->pf2vf_reply = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(union pfvf_tlvs), + &p_iov->pf2vf_reply_phys, + GFP_KERNEL); + if (!p_iov->pf2vf_reply) + goto free_vf2pf_request; + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF's Request mailbox [%p virt 0x%llx phys], Response mailbox [%p virt 0x%llx phys]\n", + p_iov->vf2pf_request, + (u64) p_iov->vf2pf_request_phys, + p_iov->pf2vf_reply, (u64)p_iov->pf2vf_reply_phys); + + /* Allocate Bulletin board */ + p_iov->bulletin.size = sizeof(struct qed_bulletin_content); + p_iov->bulletin.p_virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev, + p_iov->bulletin.size, + &p_iov->bulletin.phys, + GFP_KERNEL); + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF's bulletin Board [%p virt 0x%llx phys 0x%08x bytes]\n", + p_iov->bulletin.p_virt, + (u64)p_iov->bulletin.phys, p_iov->bulletin.size); + + mutex_init(&p_iov->mutex); + + p_hwfn->vf_iov_info = p_iov; + + p_hwfn->hw_info.personality = QED_PCI_ETH; + + rc = qed_vf_pf_acquire(p_hwfn); + + /* If VF is 100g using a mapped bar and PF is too old to support that, + * acquisition would succeed - but the VF would have no way knowing + * the size of the doorbell bar configured in HW and thus will not + * know how to split it for 2nd hw-function. + * In this case we re-try without the indication of the mapped + * doorbell. 
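+ * (i.e. release the channel and call qed_vf_pf_acquire() a second time).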
+ */ + if (!rc && p_iov->b_doorbell_bar && + !qed_vf_hw_bar_size(p_hwfn, BAR_ID_1) && + (p_hwfn->cdev->num_hwfns > 1)) { + rc = _qed_vf_pf_release(p_hwfn, false); + if (rc) + return rc; + + p_iov->b_doorbell_bar = false; + p_hwfn->doorbells = (u8 __iomem *)p_hwfn->regview + + PXP_VF_BAR0_START_DQ; + rc = qed_vf_pf_acquire(p_hwfn); + } + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Regview [%p], Doorbell [%p], Device-doorbell [%p]\n", + p_hwfn->regview, p_hwfn->doorbells, p_hwfn->cdev->doorbells); + + return rc; + +free_vf2pf_request: + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(union vfpf_tlvs), + p_iov->vf2pf_request, p_iov->vf2pf_request_phys); +free_p_iov: + kfree(p_iov); + + return -ENOMEM; +} +#define TSTORM_QZONE_START PXP_VF_BAR0_START_SDM_ZONE_A +#define MSTORM_QZONE_START(dev) (TSTORM_QZONE_START + \ + (TSTORM_QZONE_SIZE * NUM_OF_L2_QUEUES(dev))) + +static void +__qed_vf_prep_tunn_req_tlv(struct vfpf_update_tunn_param_tlv *p_req, + struct qed_tunn_update_type *p_src, + enum qed_tunn_clss mask, u8 *p_cls) +{ + if (p_src->b_update_mode) { + p_req->tun_mode_update_mask |= BIT(mask); + + if (p_src->b_mode_enabled) + p_req->tunn_mode |= BIT(mask); + } + + *p_cls = p_src->tun_cls; +} + +static void +qed_vf_prep_tunn_req_tlv(struct vfpf_update_tunn_param_tlv *p_req, + struct qed_tunn_update_type *p_src, + enum qed_tunn_clss mask, + u8 *p_cls, struct qed_tunn_update_udp_port *p_port, + u8 *p_update_port, u16 *p_udp_port) +{ + if (p_port->b_update_port) { + *p_update_port = 1; + *p_udp_port = p_port->port; + } + + __qed_vf_prep_tunn_req_tlv(p_req, p_src, mask, p_cls); +} + +void qed_vf_set_vf_start_tunn_update_param(struct qed_tunnel_info *p_tun) +{ + if (p_tun->vxlan.b_mode_enabled) + p_tun->vxlan.b_update_mode = true; + if (p_tun->l2_geneve.b_mode_enabled) + p_tun->l2_geneve.b_update_mode = true; + if (p_tun->ip_geneve.b_mode_enabled) + p_tun->ip_geneve.b_update_mode = true; + if (p_tun->l2_gre.b_mode_enabled) + p_tun->l2_gre.b_update_mode = true; + if (p_tun->ip_gre.b_mode_enabled) + p_tun->ip_gre.b_update_mode = true; + + p_tun->b_update_rx_cls = true; + p_tun->b_update_tx_cls = true; +} + +static void +__qed_vf_update_tunn_param(struct qed_tunn_update_type *p_tun, + u16 feature_mask, u8 tunn_mode, + u8 tunn_cls, enum qed_tunn_mode val) +{ + if (feature_mask & BIT(val)) { + p_tun->b_mode_enabled = tunn_mode; + p_tun->tun_cls = tunn_cls; + } else { + p_tun->b_mode_enabled = false; + } +} + +static void qed_vf_update_tunn_param(struct qed_hwfn *p_hwfn, + struct qed_tunnel_info *p_tun, + struct pfvf_update_tunn_param_tlv *p_resp) +{ + /* Update mode and classes provided by PF */ + u16 feat_mask = p_resp->tunn_feature_mask; + + __qed_vf_update_tunn_param(&p_tun->vxlan, feat_mask, + p_resp->vxlan_mode, p_resp->vxlan_clss, + QED_MODE_VXLAN_TUNN); + __qed_vf_update_tunn_param(&p_tun->l2_geneve, feat_mask, + p_resp->l2geneve_mode, + p_resp->l2geneve_clss, + QED_MODE_L2GENEVE_TUNN); + __qed_vf_update_tunn_param(&p_tun->ip_geneve, feat_mask, + p_resp->ipgeneve_mode, + p_resp->ipgeneve_clss, + QED_MODE_IPGENEVE_TUNN); + __qed_vf_update_tunn_param(&p_tun->l2_gre, feat_mask, + p_resp->l2gre_mode, p_resp->l2gre_clss, + QED_MODE_L2GRE_TUNN); + __qed_vf_update_tunn_param(&p_tun->ip_gre, feat_mask, + p_resp->ipgre_mode, p_resp->ipgre_clss, + QED_MODE_IPGRE_TUNN); + p_tun->geneve_port.port = p_resp->geneve_udp_port; + p_tun->vxlan_port.port = p_resp->vxlan_udp_port; + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "tunn mode: vxlan=0x%x, l2geneve=0x%x, ipgeneve=0x%x, l2gre=0x%x, ipgre=0x%x", + 
p_tun->vxlan.b_mode_enabled, p_tun->l2_geneve.b_mode_enabled, + p_tun->ip_geneve.b_mode_enabled, + p_tun->l2_gre.b_mode_enabled, p_tun->ip_gre.b_mode_enabled); +} + +int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn, + struct qed_tunnel_info *p_src) +{ + struct qed_tunnel_info *p_tun = &p_hwfn->cdev->tunnel; + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_update_tunn_param_tlv *p_resp; + struct vfpf_update_tunn_param_tlv *p_req; + int rc; + + p_req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_UPDATE_TUNN_PARAM, + sizeof(*p_req)); + + if (p_src->b_update_rx_cls && p_src->b_update_tx_cls) + p_req->update_tun_cls = 1; + + qed_vf_prep_tunn_req_tlv(p_req, &p_src->vxlan, QED_MODE_VXLAN_TUNN, + &p_req->vxlan_clss, &p_src->vxlan_port, + &p_req->update_vxlan_port, + &p_req->vxlan_port); + qed_vf_prep_tunn_req_tlv(p_req, &p_src->l2_geneve, + QED_MODE_L2GENEVE_TUNN, + &p_req->l2geneve_clss, &p_src->geneve_port, + &p_req->update_geneve_port, + &p_req->geneve_port); + __qed_vf_prep_tunn_req_tlv(p_req, &p_src->ip_geneve, + QED_MODE_IPGENEVE_TUNN, + &p_req->ipgeneve_clss); + __qed_vf_prep_tunn_req_tlv(p_req, &p_src->l2_gre, + QED_MODE_L2GRE_TUNN, &p_req->l2gre_clss); + __qed_vf_prep_tunn_req_tlv(p_req, &p_src->ip_gre, + QED_MODE_IPGRE_TUNN, &p_req->ipgre_clss); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + p_resp = &p_iov->pf2vf_reply->tunn_param_resp; + rc = qed_send_msg2pf(p_hwfn, &p_resp->hdr.status, sizeof(*p_resp)); + + if (rc) + goto exit; + + if (p_resp->hdr.status != PFVF_STATUS_SUCCESS) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Failed to update tunnel parameters\n"); + rc = -EINVAL; + } + + qed_vf_update_tunn_param(p_hwfn, p_tun, p_resp); +exit: + qed_vf_pf_req_end(p_hwfn, rc); + return rc; +} + +int +qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + u16 bd_max_bytes, + dma_addr_t bd_chain_phys_addr, + dma_addr_t cqe_pbl_addr, + u16 cqe_pbl_size, void __iomem **pp_prod) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_start_queue_resp_tlv *resp; + struct vfpf_start_rxq_tlv *req; + u8 rx_qid = p_cid->rel.queue_id; + int rc; + + /* clear mailbox and prep first tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_START_RXQ, sizeof(*req)); + + req->rx_qid = rx_qid; + req->cqe_pbl_addr = cqe_pbl_addr; + req->cqe_pbl_size = cqe_pbl_size; + req->rxq_addr = bd_chain_phys_addr; + req->hw_sb = p_cid->sb_igu_id; + req->sb_index = p_cid->sb_idx; + req->bd_max_bytes = bd_max_bytes; + req->stat_id = -1; + + /* If PF is legacy, we'll need to calculate producers ourselves + * as well as clean them. 
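+ * For such PFs the Rx producer lives in the MSTORM queue zone of BAR0,
+ * at a fixed offset derived from the hardware queue id.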
+ */ + if (p_iov->b_pre_fp_hsi) { + u8 hw_qid = p_iov->acquire_resp.resc.hw_qid[rx_qid]; + u32 init_prod_val = 0; + + *pp_prod = (u8 __iomem *) + p_hwfn->regview + + MSTORM_QZONE_START(p_hwfn->cdev) + + hw_qid * MSTORM_QZONE_SIZE; + + /* Init the rcq, rx bd and rx sge (if valid) producers to 0 */ + __internal_ram_wr(p_hwfn, *pp_prod, sizeof(u32), + (u32 *)(&init_prod_val)); + } + + qed_vf_pf_add_qid(p_hwfn, p_cid); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + resp = &p_iov->pf2vf_reply->queue_start; + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) { + rc = -EINVAL; + goto exit; + } + + /* Learn the address of the producer from the response */ + if (!p_iov->b_pre_fp_hsi) { + u32 init_prod_val = 0; + + *pp_prod = (u8 __iomem *)p_hwfn->regview + resp->offset; + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Rxq[0x%02x]: producer at %p [offset 0x%08x]\n", + rx_qid, *pp_prod, resp->offset); + + /* Init the rcq, rx bd and rx sge (if valid) producers to 0 */ + __internal_ram_wr(p_hwfn, *pp_prod, sizeof(u32), + (u32 *)&init_prod_val); + } +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, bool cqe_completion) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct vfpf_stop_rxqs_tlv *req; + struct pfvf_def_resp_tlv *resp; + int rc; + + /* clear mailbox and prep first tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_STOP_RXQS, sizeof(*req)); + + req->rx_qid = p_cid->rel.queue_id; + req->num_rxqs = 1; + req->cqe_completion = cqe_completion; + + qed_vf_pf_add_qid(p_hwfn, p_cid); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + resp = &p_iov->pf2vf_reply->default_resp; + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) { + rc = -EINVAL; + goto exit; + } + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +int +qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + dma_addr_t pbl_addr, + u16 pbl_size, void __iomem **pp_doorbell) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_start_queue_resp_tlv *resp; + struct vfpf_start_txq_tlv *req; + u16 qid = p_cid->rel.queue_id; + int rc; + + /* clear mailbox and prep first tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_START_TXQ, sizeof(*req)); + + req->tx_qid = qid; + + /* Tx */ + req->pbl_addr = pbl_addr; + req->pbl_size = pbl_size; + req->hw_sb = p_cid->sb_igu_id; + req->sb_index = p_cid->sb_idx; + + qed_vf_pf_add_qid(p_hwfn, p_cid); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + resp = &p_iov->pf2vf_reply->queue_start; + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) { + rc = -EINVAL; + goto exit; + } + + /* Modern PFs provide the actual offsets, while legacy + * provided only the queue id. 
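+ * In the legacy case the doorbell address is derived locally from the
+ * queue CID via qed_db_addr_vf().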
+ */ + if (!p_iov->b_pre_fp_hsi) { + *pp_doorbell = (u8 __iomem *)p_hwfn->doorbells + resp->offset; + } else { + u8 cid = p_iov->acquire_resp.resc.cid[qid]; + + *pp_doorbell = (u8 __iomem *)p_hwfn->doorbells + + qed_db_addr_vf(cid, + DQ_DEMS_LEGACY); + } + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Txq[0x%02x.%02x]: doorbell at %p [offset 0x%08x]\n", + qid, p_cid->qid_usage_idx, *pp_doorbell, resp->offset); +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, struct qed_queue_cid *p_cid) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct vfpf_stop_txqs_tlv *req; + struct pfvf_def_resp_tlv *resp; + int rc; + + /* clear mailbox and prep first tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_STOP_TXQS, sizeof(*req)); + + req->tx_qid = p_cid->rel.queue_id; + req->num_txqs = 1; + + qed_vf_pf_add_qid(p_hwfn, p_cid); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + resp = &p_iov->pf2vf_reply->default_resp; + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) { + rc = -EINVAL; + goto exit; + } + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn, + u8 vport_id, + u16 mtu, + u8 inner_vlan_removal, + enum qed_tpa_mode tpa_mode, + u8 max_buffers_per_cqe, u8 only_untagged) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct vfpf_vport_start_tlv *req; + struct pfvf_def_resp_tlv *resp; + int rc, i; + + /* clear mailbox and prep first tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_VPORT_START, sizeof(*req)); + + req->mtu = mtu; + req->vport_id = vport_id; + req->inner_vlan_removal = inner_vlan_removal; + req->tpa_mode = tpa_mode; + req->max_buffers_per_cqe = max_buffers_per_cqe; + req->only_untagged = only_untagged; + + /* status blocks */ + for (i = 0; i < p_hwfn->vf_iov_info->acquire_resp.resc.num_sbs; i++) { + struct qed_sb_info *p_sb = p_hwfn->vf_iov_info->sbs_info[i]; + + if (p_sb) + req->sb_addr[i] = p_sb->sb_phys; + } + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + resp = &p_iov->pf2vf_reply->default_resp; + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) { + rc = -EINVAL; + goto exit; + } + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +int qed_vf_pf_vport_stop(struct qed_hwfn *p_hwfn) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_def_resp_tlv *resp = &p_iov->pf2vf_reply->default_resp; + int rc; + + /* clear mailbox and prep first tlv */ + qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_VPORT_TEARDOWN, + sizeof(struct vfpf_first_tlv)); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) { + rc = -EINVAL; + goto exit; + } + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +static bool +qed_vf_handle_vp_update_is_needed(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_data, + u16 tlv) +{ + switch (tlv) { + case CHANNEL_TLV_VPORT_UPDATE_ACTIVATE: + return !!(p_data->update_vport_active_rx_flg || + p_data->update_vport_active_tx_flg); + case 
CHANNEL_TLV_VPORT_UPDATE_TX_SWITCH: + return !!p_data->update_tx_switching_flg; + case CHANNEL_TLV_VPORT_UPDATE_VLAN_STRIP: + return !!p_data->update_inner_vlan_removal_flg; + case CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN: + return !!p_data->update_accept_any_vlan_flg; + case CHANNEL_TLV_VPORT_UPDATE_MCAST: + return !!p_data->update_approx_mcast_flg; + case CHANNEL_TLV_VPORT_UPDATE_ACCEPT_PARAM: + return !!(p_data->accept_flags.update_rx_mode_config || + p_data->accept_flags.update_tx_mode_config); + case CHANNEL_TLV_VPORT_UPDATE_RSS: + return !!p_data->rss_params; + case CHANNEL_TLV_VPORT_UPDATE_SGE_TPA: + return !!p_data->sge_tpa_params; + default: + DP_INFO(p_hwfn, "Unexpected vport-update TLV[%d]\n", + tlv); + return false; + } +} + +static void +qed_vf_handle_vp_update_tlvs_resp(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_data) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_def_resp_tlv *p_resp; + u16 tlv; + + for (tlv = CHANNEL_TLV_VPORT_UPDATE_ACTIVATE; + tlv < CHANNEL_TLV_VPORT_UPDATE_MAX; tlv++) { + if (!qed_vf_handle_vp_update_is_needed(p_hwfn, p_data, tlv)) + continue; + + p_resp = (struct pfvf_def_resp_tlv *) + qed_iov_search_list_tlvs(p_hwfn, p_iov->pf2vf_reply, + tlv); + if (p_resp && p_resp->hdr.status) + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "TLV[%d] Configuration %s\n", + tlv, + (p_resp && p_resp->hdr.status) ? "succeeded" + : "failed"); + } +} + +int qed_vf_pf_vport_update(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_params) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct vfpf_vport_update_tlv *req; + struct pfvf_def_resp_tlv *resp; + u8 update_rx, update_tx; + u32 resp_size = 0; + u16 size, tlv; + int rc; + + resp = &p_iov->pf2vf_reply->default_resp; + resp_size = sizeof(*resp); + + update_rx = p_params->update_vport_active_rx_flg; + update_tx = p_params->update_vport_active_tx_flg; + + /* clear mailbox and prep header tlv */ + qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_VPORT_UPDATE, sizeof(*req)); + + /* Prepare extended tlvs */ + if (update_rx || update_tx) { + struct vfpf_vport_update_activate_tlv *p_act_tlv; + + size = sizeof(struct vfpf_vport_update_activate_tlv); + p_act_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_VPORT_UPDATE_ACTIVATE, + size); + resp_size += sizeof(struct pfvf_def_resp_tlv); + + if (update_rx) { + p_act_tlv->update_rx = update_rx; + p_act_tlv->active_rx = p_params->vport_active_rx_flg; + } + + if (update_tx) { + p_act_tlv->update_tx = update_tx; + p_act_tlv->active_tx = p_params->vport_active_tx_flg; + } + } + + if (p_params->update_tx_switching_flg) { + struct vfpf_vport_update_tx_switch_tlv *p_tx_switch_tlv; + + size = sizeof(struct vfpf_vport_update_tx_switch_tlv); + tlv = CHANNEL_TLV_VPORT_UPDATE_TX_SWITCH; + p_tx_switch_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, + tlv, size); + resp_size += sizeof(struct pfvf_def_resp_tlv); + + p_tx_switch_tlv->tx_switching = p_params->tx_switching_flg; + } + + if (p_params->update_approx_mcast_flg) { + struct vfpf_vport_update_mcast_bin_tlv *p_mcast_tlv; + + size = sizeof(struct vfpf_vport_update_mcast_bin_tlv); + p_mcast_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_VPORT_UPDATE_MCAST, size); + resp_size += sizeof(struct pfvf_def_resp_tlv); + + memcpy(p_mcast_tlv->bins, p_params->bins, + sizeof(unsigned long) * ETH_MULTICAST_MAC_BINS_IN_REGS); + } + + update_rx = p_params->accept_flags.update_rx_mode_config; + update_tx = p_params->accept_flags.update_tx_mode_config; + + if (update_rx || update_tx) { + struct 
vfpf_vport_update_accept_param_tlv *p_accept_tlv; + + tlv = CHANNEL_TLV_VPORT_UPDATE_ACCEPT_PARAM; + size = sizeof(struct vfpf_vport_update_accept_param_tlv); + p_accept_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, tlv, size); + resp_size += sizeof(struct pfvf_def_resp_tlv); + + if (update_rx) { + p_accept_tlv->update_rx_mode = update_rx; + p_accept_tlv->rx_accept_filter = + p_params->accept_flags.rx_accept_filter; + } + + if (update_tx) { + p_accept_tlv->update_tx_mode = update_tx; + p_accept_tlv->tx_accept_filter = + p_params->accept_flags.tx_accept_filter; + } + } + + if (p_params->rss_params) { + struct qed_rss_params *rss_params = p_params->rss_params; + struct vfpf_vport_update_rss_tlv *p_rss_tlv; + int i, table_size; + + size = sizeof(struct vfpf_vport_update_rss_tlv); + p_rss_tlv = qed_add_tlv(p_hwfn, + &p_iov->offset, + CHANNEL_TLV_VPORT_UPDATE_RSS, size); + resp_size += sizeof(struct pfvf_def_resp_tlv); + + if (rss_params->update_rss_config) + p_rss_tlv->update_rss_flags |= + VFPF_UPDATE_RSS_CONFIG_FLAG; + if (rss_params->update_rss_capabilities) + p_rss_tlv->update_rss_flags |= + VFPF_UPDATE_RSS_CAPS_FLAG; + if (rss_params->update_rss_ind_table) + p_rss_tlv->update_rss_flags |= + VFPF_UPDATE_RSS_IND_TABLE_FLAG; + if (rss_params->update_rss_key) + p_rss_tlv->update_rss_flags |= VFPF_UPDATE_RSS_KEY_FLAG; + + p_rss_tlv->rss_enable = rss_params->rss_enable; + p_rss_tlv->rss_caps = rss_params->rss_caps; + p_rss_tlv->rss_table_size_log = rss_params->rss_table_size_log; + + table_size = min_t(int, T_ETH_INDIRECTION_TABLE_SIZE, + 1 << p_rss_tlv->rss_table_size_log); + for (i = 0; i < table_size; i++) { + struct qed_queue_cid *p_queue; + + p_queue = rss_params->rss_ind_table[i]; + p_rss_tlv->rss_ind_table[i] = p_queue->rel.queue_id; + } + memcpy(p_rss_tlv->rss_key, rss_params->rss_key, + sizeof(rss_params->rss_key)); + } + + if (p_params->update_accept_any_vlan_flg) { + struct vfpf_vport_update_accept_any_vlan_tlv *p_any_vlan_tlv; + + size = sizeof(struct vfpf_vport_update_accept_any_vlan_tlv); + tlv = CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN; + p_any_vlan_tlv = qed_add_tlv(p_hwfn, &p_iov->offset, tlv, size); + + resp_size += sizeof(struct pfvf_def_resp_tlv); + p_any_vlan_tlv->accept_any_vlan = p_params->accept_any_vlan; + p_any_vlan_tlv->update_accept_any_vlan_flg = + p_params->update_accept_any_vlan_flg; + } + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, resp_size); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) { + rc = -EINVAL; + goto exit; + } + + qed_vf_handle_vp_update_tlvs_resp(p_hwfn, p_params); + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +int qed_vf_pf_reset(struct qed_hwfn *p_hwfn) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_def_resp_tlv *resp; + struct vfpf_first_tlv *req; + int rc; + + /* clear mailbox and prep first tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_CLOSE, sizeof(*req)); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + resp = &p_iov->pf2vf_reply->default_resp; + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) { + rc = -EAGAIN; + goto exit; + } + + p_hwfn->b_int_enabled = 0; + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +void qed_vf_pf_filter_mcast(struct qed_hwfn 
*p_hwfn, + struct qed_filter_mcast *p_filter_cmd) +{ + struct qed_sp_vport_update_params sp_params; + int i; + + memset(&sp_params, 0, sizeof(sp_params)); + sp_params.update_approx_mcast_flg = 1; + + if (p_filter_cmd->opcode == QED_FILTER_ADD) { + for (i = 0; i < p_filter_cmd->num_mc_addrs; i++) { + u32 bit; + + bit = qed_mcast_bin_from_mac(p_filter_cmd->mac[i]); + __set_bit(bit, sp_params.bins); + } + } + + qed_vf_pf_vport_update(p_hwfn, &sp_params); +} + +int qed_vf_pf_filter_ucast(struct qed_hwfn *p_hwfn, + struct qed_filter_ucast *p_ucast) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct vfpf_ucast_filter_tlv *req; + struct pfvf_def_resp_tlv *resp; + int rc; + + /* clear mailbox and prep first tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_UCAST_FILTER, sizeof(*req)); + req->opcode = (u8) p_ucast->opcode; + req->type = (u8) p_ucast->type; + memcpy(req->mac, p_ucast->mac, ETH_ALEN); + req->vlan = p_ucast->vlan; + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + resp = &p_iov->pf2vf_reply->default_resp; + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) { + rc = -EAGAIN; + goto exit; + } + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +int qed_vf_pf_int_cleanup(struct qed_hwfn *p_hwfn) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_def_resp_tlv *resp = &p_iov->pf2vf_reply->default_resp; + int rc; + + /* clear mailbox and prep first tlv */ + qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_INT_CLEANUP, + sizeof(struct vfpf_first_tlv)); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, + CHANNEL_TLV_LIST_END, sizeof(struct channel_list_end_tlv)); + + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) { + rc = -EINVAL; + goto exit; + } + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +int qed_vf_pf_get_coalesce(struct qed_hwfn *p_hwfn, + u16 *p_coal, struct qed_queue_cid *p_cid) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_read_coal_resp_tlv *resp; + struct vfpf_read_coal_req_tlv *req; + int rc; + + /* clear mailbox and prep header tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_COALESCE_READ, sizeof(*req)); + req->qid = p_cid->rel.queue_id; + req->is_rx = p_cid->b_is_rx ? 
1 : 0; + + qed_add_tlv(p_hwfn, &p_iov->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + resp = &p_iov->pf2vf_reply->read_coal_resp; + + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) + goto exit; + + *p_coal = resp->coal; +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + +int +qed_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, + u16 rx_coal, u16 tx_coal, struct qed_queue_cid *p_cid) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct vfpf_update_coalesce *req; + struct pfvf_def_resp_tlv *resp; + int rc; + + /* clear mailbox and prep header tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_COALESCE_UPDATE, sizeof(*req)); + + req->rx_coal = rx_coal; + req->tx_coal = tx_coal; + req->qid = p_cid->rel.queue_id; + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "Setting coalesce rx_coal = %d, tx_coal = %d at queue = %d\n", + rx_coal, tx_coal, req->qid); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + resp = &p_iov->pf2vf_reply->default_resp; + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) + goto exit; + + if (rx_coal) + p_hwfn->cdev->rx_coalesce_usecs = rx_coal; + + if (tx_coal) + p_hwfn->cdev->tx_coalesce_usecs = tx_coal; + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + return rc; +} + +u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + + if (!p_iov) { + DP_NOTICE(p_hwfn, "vf_sriov_info isn't initialized\n"); + return 0; + } + + return p_iov->acquire_resp.resc.hw_sbs[sb_id].hw_sb_id; +} + +void qed_vf_set_sb_info(struct qed_hwfn *p_hwfn, + u16 sb_id, struct qed_sb_info *p_sb) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + + if (!p_iov) { + DP_NOTICE(p_hwfn, "vf_sriov_info isn't initialized\n"); + return; + } + + if (sb_id >= PFVF_MAX_SBS_PER_VF) { + DP_NOTICE(p_hwfn, "Can't configure SB %04x\n", sb_id); + return; + } + + p_iov->sbs_info[sb_id] = p_sb; +} + +int qed_vf_read_bulletin(struct qed_hwfn *p_hwfn, u8 *p_change) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct qed_bulletin_content shadow; + u32 crc, crc_size; + + crc_size = sizeof(p_iov->bulletin.p_virt->crc); + *p_change = 0; + + /* Need to guarantee PF is not in the middle of writing it */ + memcpy(&shadow, p_iov->bulletin.p_virt, p_iov->bulletin.size); + + /* If version did not update, no need to do anything */ + if (shadow.version == p_iov->bulletin_shadow.version) + return 0; + + /* Verify the bulletin we see is valid */ + crc = crc32(0, (u8 *)&shadow + crc_size, + p_iov->bulletin.size - crc_size); + if (crc != shadow.crc) + return -EAGAIN; + + /* Set the shadow bulletin and process it */ + memcpy(&p_iov->bulletin_shadow, &shadow, p_iov->bulletin.size); + + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "Read a bulletin update %08x\n", shadow.version); + + *p_change = 1; + + return 0; +} + +void __qed_vf_get_link_params(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_params *p_params, + struct qed_bulletin_content *p_bulletin) +{ + memset(p_params, 0, sizeof(*p_params)); + + p_params->speed.autoneg = p_bulletin->req_autoneg; + p_params->speed.advertised_speeds = p_bulletin->req_adv_speed; + p_params->speed.forced_speed = p_bulletin->req_forced_speed; + p_params->pause.autoneg = p_bulletin->req_autoneg_pause; + p_params->pause.forced_rx = p_bulletin->req_forced_rx; + 
p_params->pause.forced_tx = p_bulletin->req_forced_tx; + p_params->loopback_mode = p_bulletin->req_loopback; +} + +void qed_vf_get_link_params(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_params *params) +{ + __qed_vf_get_link_params(p_hwfn, params, + &(p_hwfn->vf_iov_info->bulletin_shadow)); +} + +void __qed_vf_get_link_state(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_state *p_link, + struct qed_bulletin_content *p_bulletin) +{ + memset(p_link, 0, sizeof(*p_link)); + + p_link->link_up = p_bulletin->link_up; + p_link->speed = p_bulletin->speed; + p_link->full_duplex = p_bulletin->full_duplex; + p_link->an = p_bulletin->autoneg; + p_link->an_complete = p_bulletin->autoneg_complete; + p_link->parallel_detection = p_bulletin->parallel_detection; + p_link->pfc_enabled = p_bulletin->pfc_enabled; + p_link->partner_adv_speed = p_bulletin->partner_adv_speed; + p_link->partner_tx_flow_ctrl_en = p_bulletin->partner_tx_flow_ctrl_en; + p_link->partner_rx_flow_ctrl_en = p_bulletin->partner_rx_flow_ctrl_en; + p_link->partner_adv_pause = p_bulletin->partner_adv_pause; + p_link->sfp_tx_fault = p_bulletin->sfp_tx_fault; +} + +void qed_vf_get_link_state(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_state *link) +{ + __qed_vf_get_link_state(p_hwfn, link, + &(p_hwfn->vf_iov_info->bulletin_shadow)); +} + +void __qed_vf_get_link_caps(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_capabilities *p_link_caps, + struct qed_bulletin_content *p_bulletin) +{ + memset(p_link_caps, 0, sizeof(*p_link_caps)); + p_link_caps->speed_capabilities = p_bulletin->capability_speed; +} + +void qed_vf_get_link_caps(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_capabilities *p_link_caps) +{ + __qed_vf_get_link_caps(p_hwfn, p_link_caps, + &(p_hwfn->vf_iov_info->bulletin_shadow)); +} + +void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs) +{ + *num_rxqs = p_hwfn->vf_iov_info->acquire_resp.resc.num_rxqs; +} + +void qed_vf_get_num_txqs(struct qed_hwfn *p_hwfn, u8 *num_txqs) +{ + *num_txqs = p_hwfn->vf_iov_info->acquire_resp.resc.num_txqs; +} + +void qed_vf_get_num_cids(struct qed_hwfn *p_hwfn, u8 *num_cids) +{ + *num_cids = p_hwfn->vf_iov_info->acquire_resp.resc.num_cids; +} + +void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac) +{ + memcpy(port_mac, + p_hwfn->vf_iov_info->acquire_resp.pfdev_info.port_mac, ETH_ALEN); +} + +void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn, u8 *num_vlan_filters) +{ + struct qed_vf_iov *p_vf; + + p_vf = p_hwfn->vf_iov_info; + *num_vlan_filters = p_vf->acquire_resp.resc.num_vlan_filters; +} + +void qed_vf_get_num_mac_filters(struct qed_hwfn *p_hwfn, u8 *num_mac_filters) +{ + struct qed_vf_iov *p_vf = p_hwfn->vf_iov_info; + + *num_mac_filters = p_vf->acquire_resp.resc.num_mac_filters; +} + +bool qed_vf_check_mac(struct qed_hwfn *p_hwfn, u8 *mac) +{ + struct qed_bulletin_content *bulletin; + + bulletin = &p_hwfn->vf_iov_info->bulletin_shadow; + if (!(bulletin->valid_bitmap & (1 << MAC_ADDR_FORCED))) + return true; + + /* Forbid VF from changing a MAC enforced by PF */ + if (ether_addr_equal(bulletin->mac, mac)) + return false; + + return false; +} + +static bool qed_vf_bulletin_get_forced_mac(struct qed_hwfn *hwfn, + u8 *dst_mac, u8 *p_is_forced) +{ + struct qed_bulletin_content *bulletin; + + bulletin = &hwfn->vf_iov_info->bulletin_shadow; + + if (bulletin->valid_bitmap & (1 << MAC_ADDR_FORCED)) { + if (p_is_forced) + *p_is_forced = 1; + } else if (bulletin->valid_bitmap & (1 << VFPF_BULLETIN_MAC_ADDR)) { + if (p_is_forced) + *p_is_forced = 0; + } else { + return false; 
+ } + + ether_addr_copy(dst_mac, bulletin->mac); + + return true; +} + +static void +qed_vf_bulletin_get_udp_ports(struct qed_hwfn *p_hwfn, + u16 *p_vxlan_port, u16 *p_geneve_port) +{ + struct qed_bulletin_content *p_bulletin; + + p_bulletin = &p_hwfn->vf_iov_info->bulletin_shadow; + + *p_vxlan_port = p_bulletin->vxlan_udp_port; + *p_geneve_port = p_bulletin->geneve_udp_port; +} + +void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn, + u16 *fw_major, u16 *fw_minor, + u16 *fw_rev, u16 *fw_eng) +{ + struct pf_vf_pfdev_info *info; + + info = &p_hwfn->vf_iov_info->acquire_resp.pfdev_info; + + *fw_major = info->fw_major; + *fw_minor = info->fw_minor; + *fw_rev = info->fw_rev; + *fw_eng = info->fw_eng; +} + +static void qed_handle_bulletin_change(struct qed_hwfn *hwfn) +{ + struct qed_eth_cb_ops *ops = hwfn->cdev->protocol_ops.eth; + u8 mac[ETH_ALEN], is_mac_exist, is_mac_forced; + void *cookie = hwfn->cdev->ops_cookie; + u16 vxlan_port, geneve_port; + + qed_vf_bulletin_get_udp_ports(hwfn, &vxlan_port, &geneve_port); + is_mac_exist = qed_vf_bulletin_get_forced_mac(hwfn, mac, + &is_mac_forced); + if (is_mac_exist && cookie) + ops->force_mac(cookie, mac, !!is_mac_forced); + + ops->ports_update(cookie, vxlan_port, geneve_port); + + /* Always update link configuration according to bulletin */ + qed_link_update(hwfn); +} + +void qed_iov_vf_task(struct work_struct *work) +{ + struct qed_hwfn *hwfn = container_of(work, struct qed_hwfn, + iov_task.work); + u8 change = 0; + + if (test_and_clear_bit(QED_IOV_WQ_STOP_WQ_FLAG, &hwfn->iov_task_flags)) + return; + + /* Handle bulletin board changes */ + qed_vf_read_bulletin(hwfn, &change); + if (test_and_clear_bit(QED_IOV_WQ_VF_FORCE_LINK_QUERY_FLAG, + &hwfn->iov_task_flags)) + change = 1; + if (change) + qed_handle_bulletin_change(hwfn); + + /* As VF is polling bulletin board, need to constantly re-schedule */ + queue_delayed_work(hwfn->iov_wq, &hwfn->iov_task, HZ); +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h new file mode 100644 index 0000000..97d44df --- /dev/null +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h @@ -0,0 +1,1239 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_VF_H +#define _QED_VF_H + +#include "qed_l2.h" +#include "qed_mcp.h" + +#define T_ETH_INDIRECTION_TABLE_SIZE 128 +#define T_ETH_RSS_KEY_SIZE 10 + +struct vf_pf_resc_request { + u8 num_rxqs; + u8 num_txqs; + u8 num_sbs; + u8 num_mac_filters; + u8 num_vlan_filters; + u8 num_mc_filters; + u8 num_cids; + u8 padding; +}; + +struct hw_sb_info { + u16 hw_sb_id; + u8 sb_qid; + u8 padding[5]; +}; + +#define TLV_BUFFER_SIZE 1024 + +enum { + PFVF_STATUS_WAITING, + PFVF_STATUS_SUCCESS, + PFVF_STATUS_FAILURE, + PFVF_STATUS_NOT_SUPPORTED, + PFVF_STATUS_NO_RESOURCE, + PFVF_STATUS_FORCED, + PFVF_STATUS_MALICIOUS, +}; + +/* vf pf channel tlvs */ +/* general tlv header (used for both vf->pf request and pf->vf response) */ +struct channel_tlv { + u16 type; + u16 length; +}; + +/* header of first vf->pf tlv carries the offset used to calculate reponse + * buffer address + */ +struct vfpf_first_tlv { + struct channel_tlv tl; + u32 padding; + u64 reply_address; +}; + +/* header of pf->vf tlvs, carries the status of handling the request */ +struct pfvf_tlv { + struct channel_tlv tl; + u8 status; + u8 padding[3]; +}; + +/* response tlv used for most tlvs */ +struct pfvf_def_resp_tlv { + struct pfvf_tlv hdr; +}; + +/* used to terminate and pad a tlv list */ +struct channel_list_end_tlv { + struct channel_tlv tl; + u8 padding[4]; +}; + +#define VFPF_ACQUIRE_OS_LINUX (0) +#define VFPF_ACQUIRE_OS_WINDOWS (1) +#define VFPF_ACQUIRE_OS_ESX (2) +#define VFPF_ACQUIRE_OS_SOLARIS (3) +#define VFPF_ACQUIRE_OS_LINUX_USERSPACE (4) + +struct vfpf_acquire_tlv { + struct vfpf_first_tlv first_tlv; + + struct vf_pf_vfdev_info { +#define VFPF_ACQUIRE_CAP_PRE_FP_HSI (1 << 0) /* VF pre-FP hsi version */ +#define VFPF_ACQUIRE_CAP_100G (1 << 1) /* VF can support 100g */ + /* A requirement for supporting multi-Tx queues on a single queue-zone, + * VF would pass qids as additional information whenever passing queue + * references. + */ +#define VFPF_ACQUIRE_CAP_QUEUE_QIDS BIT(2) + + /* The VF is using the physical bar. While this is mostly internal + * to the VF, might affect the number of CIDs supported assuming + * QUEUE_QIDS is set. 
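+ * (Cf. the b_doorbell_bar flag in struct qed_vf_iov further down, which
+ * records whether the VF doorbells through the limited register bar or
+ * the dedicated doorbell bar.)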
+ */ +#define VFPF_ACQUIRE_CAP_PHYSICAL_BAR BIT(3) + u64 capabilities; + u8 fw_major; + u8 fw_minor; + u8 fw_revision; + u8 fw_engineering; + u32 driver_version; + u16 opaque_fid; /* ME register value */ + u8 os_type; /* VFPF_ACQUIRE_OS_* value */ + u8 eth_fp_hsi_major; + u8 eth_fp_hsi_minor; + u8 padding[3]; + } vfdev_info; + + struct vf_pf_resc_request resc_request; + + u64 bulletin_addr; + u32 bulletin_size; + u32 padding; +}; + +/* receive side scaling tlv */ +struct vfpf_vport_update_rss_tlv { + struct channel_tlv tl; + + u8 update_rss_flags; +#define VFPF_UPDATE_RSS_CONFIG_FLAG BIT(0) +#define VFPF_UPDATE_RSS_CAPS_FLAG BIT(1) +#define VFPF_UPDATE_RSS_IND_TABLE_FLAG BIT(2) +#define VFPF_UPDATE_RSS_KEY_FLAG BIT(3) + + u8 rss_enable; + u8 rss_caps; + u8 rss_table_size_log; /* The table size is 2 ^ rss_table_size_log */ + u16 rss_ind_table[T_ETH_INDIRECTION_TABLE_SIZE]; + u32 rss_key[T_ETH_RSS_KEY_SIZE]; +}; + +struct pfvf_storm_stats { + u32 address; + u32 len; +}; + +struct pfvf_stats_info { + struct pfvf_storm_stats mstats; + struct pfvf_storm_stats pstats; + struct pfvf_storm_stats tstats; + struct pfvf_storm_stats ustats; +}; + +struct pfvf_acquire_resp_tlv { + struct pfvf_tlv hdr; + + struct pf_vf_pfdev_info { + u32 chip_num; + u32 mfw_ver; + + u16 fw_major; + u16 fw_minor; + u16 fw_rev; + u16 fw_eng; + + u64 capabilities; +#define PFVF_ACQUIRE_CAP_DEFAULT_UNTAGGED BIT(0) +#define PFVF_ACQUIRE_CAP_100G BIT(1) /* If set, 100g PF */ +/* There are old PF versions where the PF might mistakenly override the sanity + * mechanism [version-based] and allow a VF that can't be supported to pass + * the acquisition phase. + * To overcome this, PFs now indicate that they're past that point and the new + * VFs would fail probe on the older PFs that fail to do so. + */ +#define PFVF_ACQUIRE_CAP_POST_FW_OVERRIDE BIT(2) + + /* PF expects queues to be received with additional qids */ +#define PFVF_ACQUIRE_CAP_QUEUE_QIDS BIT(3) + + u16 db_size; + u8 indices_per_sb; + u8 os_type; + + /* These should match the PF's qed_dev values */ + u16 chip_rev; + u8 dev_type; + + /* Doorbell bar size configured in HW: log(size) or 0 */ + u8 bar_size; + + struct pfvf_stats_info stats_info; + + u8 port_mac[ETH_ALEN]; + + /* It's possible PF had to configure an older fastpath HSI + * [in case VF is newer than PF]. This is communicated back + * to the VF. It can also be used in case of error due to + * non-matching versions to shed light in VF about failure. + */ + u8 major_fp_hsi; + u8 minor_fp_hsi; + } pfdev_info; + + struct pf_vf_resc { +#define PFVF_MAX_QUEUES_PER_VF 16 +#define PFVF_MAX_SBS_PER_VF 16 + struct hw_sb_info hw_sbs[PFVF_MAX_SBS_PER_VF]; + u8 hw_qid[PFVF_MAX_QUEUES_PER_VF]; + u8 cid[PFVF_MAX_QUEUES_PER_VF]; + + u8 num_rxqs; + u8 num_txqs; + u8 num_sbs; + u8 num_mac_filters; + u8 num_vlan_filters; + u8 num_mc_filters; + u8 num_cids; + u8 padding; + } resc; + + u32 bulletin_size; + u32 padding; +}; + +struct pfvf_start_queue_resp_tlv { + struct pfvf_tlv hdr; + u32 offset; /* offset to consumer/producer of queue */ + u8 padding[4]; +}; + +/* Extended queue information - additional index for reference inside qzone. + * If commmunicated between VF/PF, each TLV relating to queues should be + * extended by one such [or have a future base TLV that already contains info]. 
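+ * (This is the CHANNEL_TLV_QID entry that qed_vf_pf_add_qid() in qed_vf.c
+ * appends to the queue start/stop requests once the PF has advertised
+ * PFVF_ACQUIRE_CAP_QUEUE_QIDS.)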
+ */ +struct vfpf_qid_tlv { + struct channel_tlv tl; + u8 qid; + u8 padding[3]; +}; + +/* Setup Queue */ +struct vfpf_start_rxq_tlv { + struct vfpf_first_tlv first_tlv; + + /* physical addresses */ + u64 rxq_addr; + u64 deprecated_sge_addr; + u64 cqe_pbl_addr; + + u16 cqe_pbl_size; + u16 hw_sb; + u16 rx_qid; + u16 hc_rate; /* desired interrupts per sec. */ + + u16 bd_max_bytes; + u16 stat_id; + u8 sb_index; + u8 padding[3]; +}; + +struct vfpf_start_txq_tlv { + struct vfpf_first_tlv first_tlv; + + /* physical addresses */ + u64 pbl_addr; + u16 pbl_size; + u16 stat_id; + u16 tx_qid; + u16 hw_sb; + + u32 flags; /* VFPF_QUEUE_FLG_X flags */ + u16 hc_rate; /* desired interrupts per sec. */ + u8 sb_index; + u8 padding[3]; +}; + +/* Stop RX Queue */ +struct vfpf_stop_rxqs_tlv { + struct vfpf_first_tlv first_tlv; + + u16 rx_qid; + + /* this field is deprecated and should *always* be set to '1' */ + u8 num_rxqs; + u8 cqe_completion; + u8 padding[4]; +}; + +/* Stop TX Queues */ +struct vfpf_stop_txqs_tlv { + struct vfpf_first_tlv first_tlv; + + u16 tx_qid; + + /* this field is deprecated and should *always* be set to '1' */ + u8 num_txqs; + u8 padding[5]; +}; + +struct vfpf_update_rxq_tlv { + struct vfpf_first_tlv first_tlv; + + u64 deprecated_sge_addr[PFVF_MAX_QUEUES_PER_VF]; + + u16 rx_qid; + u8 num_rxqs; + u8 flags; +#define VFPF_RXQ_UPD_INIT_SGE_DEPRECATE_FLAG BIT(0) +#define VFPF_RXQ_UPD_COMPLETE_CQE_FLAG BIT(1) +#define VFPF_RXQ_UPD_COMPLETE_EVENT_FLAG BIT(2) + + u8 padding[4]; +}; + +/* Set Queue Filters */ +struct vfpf_q_mac_vlan_filter { + u32 flags; +#define VFPF_Q_FILTER_DEST_MAC_VALID 0x01 +#define VFPF_Q_FILTER_VLAN_TAG_VALID 0x02 +#define VFPF_Q_FILTER_SET_MAC 0x100 /* set/clear */ + + u8 mac[ETH_ALEN]; + u16 vlan_tag; + + u8 padding[4]; +}; + +/* Start a vport */ +struct vfpf_vport_start_tlv { + struct vfpf_first_tlv first_tlv; + + u64 sb_addr[PFVF_MAX_SBS_PER_VF]; + + u32 tpa_mode; + u16 dep1; + u16 mtu; + + u8 vport_id; + u8 inner_vlan_removal; + + u8 only_untagged; + u8 max_buffers_per_cqe; + + u8 padding[4]; +}; + +/* Extended tlvs - need to add rss, mcast, accept mode tlvs */ +struct vfpf_vport_update_activate_tlv { + struct channel_tlv tl; + u8 update_rx; + u8 update_tx; + u8 active_rx; + u8 active_tx; +}; + +struct vfpf_vport_update_tx_switch_tlv { + struct channel_tlv tl; + u8 tx_switching; + u8 padding[3]; +}; + +struct vfpf_vport_update_vlan_strip_tlv { + struct channel_tlv tl; + u8 remove_vlan; + u8 padding[3]; +}; + +struct vfpf_vport_update_mcast_bin_tlv { + struct channel_tlv tl; + u8 padding[4]; + + u64 bins[8]; +}; + +struct vfpf_vport_update_accept_param_tlv { + struct channel_tlv tl; + u8 update_rx_mode; + u8 update_tx_mode; + u8 rx_accept_filter; + u8 tx_accept_filter; +}; + +struct vfpf_vport_update_accept_any_vlan_tlv { + struct channel_tlv tl; + u8 update_accept_any_vlan_flg; + u8 accept_any_vlan; + + u8 padding[2]; +}; + +struct vfpf_vport_update_sge_tpa_tlv { + struct channel_tlv tl; + + u16 sge_tpa_flags; +#define VFPF_TPA_IPV4_EN_FLAG BIT(0) +#define VFPF_TPA_IPV6_EN_FLAG BIT(1) +#define VFPF_TPA_PKT_SPLIT_FLAG BIT(2) +#define VFPF_TPA_HDR_DATA_SPLIT_FLAG BIT(3) +#define VFPF_TPA_GRO_CONSIST_FLAG BIT(4) + + u8 update_sge_tpa_flags; +#define VFPF_UPDATE_SGE_DEPRECATED_FLAG BIT(0) +#define VFPF_UPDATE_TPA_EN_FLAG BIT(1) +#define VFPF_UPDATE_TPA_PARAM_FLAG BIT(2) + + u8 max_buffers_per_cqe; + + u16 deprecated_sge_buff_size; + u16 tpa_max_size; + u16 tpa_min_size_to_start; + u16 tpa_min_size_to_cont; + + u8 tpa_max_aggs_num; + u8 padding[7]; +}; + +/* Primary tlv 
as a header for various extended tlvs for + * various functionalities in vport update ramrod. + */ +struct vfpf_vport_update_tlv { + struct vfpf_first_tlv first_tlv; +}; + +struct vfpf_ucast_filter_tlv { + struct vfpf_first_tlv first_tlv; + + u8 opcode; + u8 type; + + u8 mac[ETH_ALEN]; + + u16 vlan; + u16 padding[3]; +}; + +/* tunnel update param tlv */ +struct vfpf_update_tunn_param_tlv { + struct vfpf_first_tlv first_tlv; + + u8 tun_mode_update_mask; + u8 tunn_mode; + u8 update_tun_cls; + u8 vxlan_clss; + u8 l2gre_clss; + u8 ipgre_clss; + u8 l2geneve_clss; + u8 ipgeneve_clss; + u8 update_geneve_port; + u8 update_vxlan_port; + u16 geneve_port; + u16 vxlan_port; + u8 padding[2]; +}; + +struct pfvf_update_tunn_param_tlv { + struct pfvf_tlv hdr; + + u16 tunn_feature_mask; + u8 vxlan_mode; + u8 l2geneve_mode; + u8 ipgeneve_mode; + u8 l2gre_mode; + u8 ipgre_mode; + u8 vxlan_clss; + u8 l2gre_clss; + u8 ipgre_clss; + u8 l2geneve_clss; + u8 ipgeneve_clss; + u16 vxlan_udp_port; + u16 geneve_udp_port; +}; + +struct tlv_buffer_size { + u8 tlv_buffer[TLV_BUFFER_SIZE]; +}; + +struct vfpf_update_coalesce { + struct vfpf_first_tlv first_tlv; + u16 rx_coal; + u16 tx_coal; + u16 qid; + u8 padding[2]; +}; + +struct vfpf_read_coal_req_tlv { + struct vfpf_first_tlv first_tlv; + u16 qid; + u8 is_rx; + u8 padding[5]; +}; + +struct pfvf_read_coal_resp_tlv { + struct pfvf_tlv hdr; + u16 coal; + u8 padding[6]; +}; + +union vfpf_tlvs { + struct vfpf_first_tlv first_tlv; + struct vfpf_acquire_tlv acquire; + struct vfpf_start_rxq_tlv start_rxq; + struct vfpf_start_txq_tlv start_txq; + struct vfpf_stop_rxqs_tlv stop_rxqs; + struct vfpf_stop_txqs_tlv stop_txqs; + struct vfpf_update_rxq_tlv update_rxq; + struct vfpf_vport_start_tlv start_vport; + struct vfpf_vport_update_tlv vport_update; + struct vfpf_ucast_filter_tlv ucast_filter; + struct vfpf_update_tunn_param_tlv tunn_param_update; + struct vfpf_update_coalesce update_coalesce; + struct vfpf_read_coal_req_tlv read_coal_req; + struct tlv_buffer_size tlv_buf_size; +}; + +union pfvf_tlvs { + struct pfvf_def_resp_tlv default_resp; + struct pfvf_acquire_resp_tlv acquire_resp; + struct tlv_buffer_size tlv_buf_size; + struct pfvf_start_queue_resp_tlv queue_start; + struct pfvf_update_tunn_param_tlv tunn_param_resp; + struct pfvf_read_coal_resp_tlv read_coal_resp; +}; + +enum qed_bulletin_bit { + /* Alert the VF that a forced MAC was set by the PF */ + MAC_ADDR_FORCED = 0, + /* Alert the VF that a forced VLAN was set by the PF */ + VLAN_ADDR_FORCED = 2, + + /* Indicate that `default_only_untagged' contains actual data */ + VFPF_BULLETIN_UNTAGGED_DEFAULT = 3, + VFPF_BULLETIN_UNTAGGED_DEFAULT_FORCED = 4, + + /* Alert the VF that suggested mac was sent by the PF. + * MAC_ADDR will be disabled in case MAC_ADDR_FORCED is set. + */ + VFPF_BULLETIN_MAC_ADDR = 5 +}; + +struct qed_bulletin_content { + /* crc of structure to ensure is not in mid-update */ + u32 crc; + + u32 version; + + /* bitmap indicating which fields hold valid values */ + u64 valid_bitmap; + + /* used for MAC_ADDR or MAC_ADDR_FORCED */ + u8 mac[ETH_ALEN]; + + /* If valid, 1 => only untagged Rx if no vlan is configured */ + u8 default_only_untagged; + u8 padding; + + /* The following is a 'copy' of qed_mcp_link_state, + * qed_mcp_link_params and qed_mcp_link_capabilities. Since it's + * possible the structs will increase further along the road we cannot + * have it here; Instead we need to have all of its fields. 
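+ * __qed_vf_get_link_params(), __qed_vf_get_link_state() and
+ * __qed_vf_get_link_caps() in qed_vf.c translate these flat fields back
+ * into the corresponding qed_mcp_* structures.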
+ */ + u8 req_autoneg; + u8 req_autoneg_pause; + u8 req_forced_rx; + u8 req_forced_tx; + u8 padding2[4]; + + u32 req_adv_speed; + u32 req_forced_speed; + u32 req_loopback; + u32 padding3; + + u8 link_up; + u8 full_duplex; + u8 autoneg; + u8 autoneg_complete; + u8 parallel_detection; + u8 pfc_enabled; + u8 partner_tx_flow_ctrl_en; + u8 partner_rx_flow_ctrl_en; + u8 partner_adv_pause; + u8 sfp_tx_fault; + u16 vxlan_udp_port; + u16 geneve_udp_port; + u8 padding4[2]; + + u32 speed; + u32 partner_adv_speed; + + u32 capability_speed; + + /* Forced vlan */ + u16 pvid; + u16 padding5; +}; + +struct qed_bulletin { + dma_addr_t phys; + struct qed_bulletin_content *p_virt; + u32 size; +}; + +enum { + CHANNEL_TLV_NONE, /* ends tlv sequence */ + CHANNEL_TLV_ACQUIRE, + CHANNEL_TLV_VPORT_START, + CHANNEL_TLV_VPORT_UPDATE, + CHANNEL_TLV_VPORT_TEARDOWN, + CHANNEL_TLV_START_RXQ, + CHANNEL_TLV_START_TXQ, + CHANNEL_TLV_STOP_RXQS, + CHANNEL_TLV_STOP_TXQS, + CHANNEL_TLV_UPDATE_RXQ, + CHANNEL_TLV_INT_CLEANUP, + CHANNEL_TLV_CLOSE, + CHANNEL_TLV_RELEASE, + CHANNEL_TLV_LIST_END, + CHANNEL_TLV_UCAST_FILTER, + CHANNEL_TLV_VPORT_UPDATE_ACTIVATE, + CHANNEL_TLV_VPORT_UPDATE_TX_SWITCH, + CHANNEL_TLV_VPORT_UPDATE_VLAN_STRIP, + CHANNEL_TLV_VPORT_UPDATE_MCAST, + CHANNEL_TLV_VPORT_UPDATE_ACCEPT_PARAM, + CHANNEL_TLV_VPORT_UPDATE_RSS, + CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN, + CHANNEL_TLV_VPORT_UPDATE_SGE_TPA, + CHANNEL_TLV_UPDATE_TUNN_PARAM, + CHANNEL_TLV_COALESCE_UPDATE, + CHANNEL_TLV_QID, + CHANNEL_TLV_COALESCE_READ, + CHANNEL_TLV_MAX, + + /* Required for iterating over vport-update tlvs. + * Will break in case non-sequential vport-update tlvs. + */ + CHANNEL_TLV_VPORT_UPDATE_MAX = CHANNEL_TLV_VPORT_UPDATE_SGE_TPA + 1, +}; + +/* Default number of CIDs [total of both Rx and Tx] to be requested + * by default, and maximum possible number. + */ +#define QED_ETH_VF_DEFAULT_NUM_CIDS (32) +#define QED_ETH_VF_MAX_NUM_CIDS (250) + +/* This data is held in the qed_hwfn structure for VFs only. */ +struct qed_vf_iov { + union vfpf_tlvs *vf2pf_request; + dma_addr_t vf2pf_request_phys; + union pfvf_tlvs *pf2vf_reply; + dma_addr_t pf2vf_reply_phys; + + /* Should be taken whenever the mailbox buffers are accessed */ + struct mutex mutex; + u8 *offset; + + /* Bulletin Board */ + struct qed_bulletin bulletin; + struct qed_bulletin_content bulletin_shadow; + + /* we set aside a copy of the acquire response */ + struct pfvf_acquire_resp_tlv acquire_resp; + + /* In case PF originates prior to the fp-hsi version comparison, + * this has to be propagated as it affects the fastpath. + */ + bool b_pre_fp_hsi; + + /* Current day VFs are passing the SBs physical address on vport + * start, and as they lack an IGU mapping they need to store the + * addresses of previously registered SBs. + * Even if we were to change configuration flow, due to backward + * compatibility [with older PFs] we'd still need to store these. + */ + struct qed_sb_info *sbs_info[PFVF_MAX_SBS_PER_VF]; + + /* Determines whether VF utilizes doorbells via limited register + * bar or via the doorbell bar. + */ + bool b_doorbell_bar; +}; + +/** + * @brief VF - Set Rx/Tx coalesce per VF's relative queue. + * Coalesce value '0' will omit the configuration. 
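+ * For example, qed_vf_pf_set_coalesce(p_hwfn, 64, 0, p_cid) requests 64us
+ * Rx coalescing for the queue while leaving the Tx setting untouched.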
+ * + * @param p_hwfn + * @param rx_coal - coalesce value in micro second for rx queue + * @param tx_coal - coalesce value in micro second for tx queue + * @param p_cid - queue cid + * + **/ +int qed_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, + u16 rx_coal, + u16 tx_coal, struct qed_queue_cid *p_cid); + +/** + * @brief VF - Get coalesce per VF's relative queue. + * + * @param p_hwfn + * @param p_coal - coalesce value in micro second for VF queues. + * @param p_cid - queue cid + * + **/ +int qed_vf_pf_get_coalesce(struct qed_hwfn *p_hwfn, + u16 *p_coal, struct qed_queue_cid *p_cid); + +#ifdef CONFIG_QED_SRIOV +/** + * @brief Read the VF bulletin and act on it if needed + * + * @param p_hwfn + * @param p_change - qed fills 1 iff bulletin board has changed, 0 otherwise. + * + * @return enum _qed_status + */ +int qed_vf_read_bulletin(struct qed_hwfn *p_hwfn, u8 *p_change); + +/** + * @brief Get link paramters for VF from qed + * + * @param p_hwfn + * @param params - the link params structure to be filled for the VF + */ +void qed_vf_get_link_params(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_params *params); + +/** + * @brief Get link state for VF from qed + * + * @param p_hwfn + * @param link - the link state structure to be filled for the VF + */ +void qed_vf_get_link_state(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_state *link); + +/** + * @brief Get link capabilities for VF from qed + * + * @param p_hwfn + * @param p_link_caps - the link capabilities structure to be filled for the VF + */ +void qed_vf_get_link_caps(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_capabilities *p_link_caps); + +/** + * @brief Get number of Rx queues allocated for VF by qed + * + * @param p_hwfn + * @param num_rxqs - allocated RX queues + */ +void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs); + +/** + * @brief Get number of Rx queues allocated for VF by qed + * + * @param p_hwfn + * @param num_txqs - allocated RX queues + */ +void qed_vf_get_num_txqs(struct qed_hwfn *p_hwfn, u8 *num_txqs); + +/** + * @brief Get number of available connections [both Rx and Tx] for VF + * + * @param p_hwfn + * @param num_cids - allocated number of connections + */ +void qed_vf_get_num_cids(struct qed_hwfn *p_hwfn, u8 *num_cids); + +/** + * @brief Get port mac address for VF + * + * @param p_hwfn + * @param port_mac - destination location for port mac + */ +void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac); + +/** + * @brief Get number of VLAN filters allocated for VF by qed + * + * @param p_hwfn + * @param num_rxqs - allocated VLAN filters + */ +void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn, + u8 *num_vlan_filters); + +/** + * @brief Get number of MAC filters allocated for VF by qed + * + * @param p_hwfn + * @param num_rxqs - allocated MAC filters + */ +void qed_vf_get_num_mac_filters(struct qed_hwfn *p_hwfn, u8 *num_mac_filters); + +/** + * @brief Check if VF can set a MAC address + * + * @param p_hwfn + * @param mac + * + * @return bool + */ +bool qed_vf_check_mac(struct qed_hwfn *p_hwfn, u8 *mac); + +/** + * @brief Set firmware version information in dev_info from VFs acquire response tlv + * + * @param p_hwfn + * @param fw_major + * @param fw_minor + * @param fw_rev + * @param fw_eng + */ +void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn, + u16 *fw_major, u16 *fw_minor, + u16 *fw_rev, u16 *fw_eng); + +/** + * @brief hw preparation for VF + * sends ACQUIRE message + * + * @param p_hwfn + * + * @return int + */ +int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn); + +/** + * @brief 
VF - start the RX Queue by sending a message to the PF + * @param p_hwfn + * @param p_cid - Only relative fields are relevant + * @param bd_max_bytes - maximum number of bytes per bd + * @param bd_chain_phys_addr - physical address of bd chain + * @param cqe_pbl_addr - physical address of pbl + * @param cqe_pbl_size - pbl size + * @param pp_prod - pointer to the producer to be + * used in fastpath + * + * @return int + */ +int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + u16 bd_max_bytes, + dma_addr_t bd_chain_phys_addr, + dma_addr_t cqe_pbl_addr, + u16 cqe_pbl_size, void __iomem **pp_prod); + +/** + * @brief VF - start the TX queue by sending a message to the + * PF. + * + * @param p_hwfn + * @param tx_queue_id - zero based within the VF + * @param sb - status block for this queue + * @param sb_index - index within the status block + * @param bd_chain_phys_addr - physical address of tx chain + * @param pp_doorbell - pointer to address to which to + * write the doorbell too.. + * + * @return int + */ +int +qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + dma_addr_t pbl_addr, + u16 pbl_size, void __iomem **pp_doorbell); + +/** + * @brief VF - stop the RX queue by sending a message to the PF + * + * @param p_hwfn + * @param p_cid + * @param cqe_completion + * + * @return int + */ +int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, bool cqe_completion); + +/** + * @brief VF - stop the TX queue by sending a message to the PF + * + * @param p_hwfn + * @param tx_qid + * + * @return int + */ +int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, struct qed_queue_cid *p_cid); + +/** + * @brief VF - send a vport update command + * + * @param p_hwfn + * @param params + * + * @return int + */ +int qed_vf_pf_vport_update(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_params); + +/** + * + * @brief VF - send a close message to PF + * + * @param p_hwfn + * + * @return enum _qed_status + */ +int qed_vf_pf_reset(struct qed_hwfn *p_hwfn); + +/** + * @brief VF - free vf`s memories + * + * @param p_hwfn + * + * @return enum _qed_status + */ +int qed_vf_pf_release(struct qed_hwfn *p_hwfn); + +/** + * @brief qed_vf_get_igu_sb_id - Get the IGU SB ID for a given + * sb_id. For VFs igu sbs don't have to be contiguous + * + * @param p_hwfn + * @param sb_id + * + * @return INLINE u16 + */ +u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id); + +/** + * @brief Stores [or removes] a configured sb_info. + * + * @param p_hwfn + * @param sb_id - zero-based SB index [for fastpath] + * @param sb_info - may be NULL [during removal]. + */ +void qed_vf_set_sb_info(struct qed_hwfn *p_hwfn, + u16 sb_id, struct qed_sb_info *p_sb); + +/** + * @brief qed_vf_pf_vport_start - perform vport start for VF. 
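+ * The status-block physical addresses previously registered through
+ * qed_vf_set_sb_info() are forwarded to the PF in this request's
+ * sb_addr[] array.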
+ * + * @param p_hwfn + * @param vport_id + * @param mtu + * @param inner_vlan_removal + * @param tpa_mode + * @param max_buffers_per_cqe, + * @param only_untagged - default behavior regarding vlan acceptance + * + * @return enum _qed_status + */ +int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn, + u8 vport_id, + u16 mtu, + u8 inner_vlan_removal, + enum qed_tpa_mode tpa_mode, + u8 max_buffers_per_cqe, u8 only_untagged); + +/** + * @brief qed_vf_pf_vport_stop - stop the VF's vport + * + * @param p_hwfn + * + * @return enum _qed_status + */ +int qed_vf_pf_vport_stop(struct qed_hwfn *p_hwfn); + +int qed_vf_pf_filter_ucast(struct qed_hwfn *p_hwfn, + struct qed_filter_ucast *p_param); + +void qed_vf_pf_filter_mcast(struct qed_hwfn *p_hwfn, + struct qed_filter_mcast *p_filter_cmd); + +/** + * @brief qed_vf_pf_int_cleanup - clean the SB of the VF + * + * @param p_hwfn + * + * @return enum _qed_status + */ +int qed_vf_pf_int_cleanup(struct qed_hwfn *p_hwfn); + +/** + * @brief - return the link params in a given bulletin board + * + * @param p_hwfn + * @param p_params - pointer to a struct to fill with link params + * @param p_bulletin + */ +void __qed_vf_get_link_params(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_params *p_params, + struct qed_bulletin_content *p_bulletin); + +/** + * @brief - return the link state in a given bulletin board + * + * @param p_hwfn + * @param p_link - pointer to a struct to fill with link state + * @param p_bulletin + */ +void __qed_vf_get_link_state(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_state *p_link, + struct qed_bulletin_content *p_bulletin); + +/** + * @brief - return the link capabilities in a given bulletin board + * + * @param p_hwfn + * @param p_link - pointer to a struct to fill with link capabilities + * @param p_bulletin + */ +void __qed_vf_get_link_caps(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_capabilities *p_link_caps, + struct qed_bulletin_content *p_bulletin); + +void qed_iov_vf_task(struct work_struct *work); +void qed_vf_set_vf_start_tunn_update_param(struct qed_tunnel_info *p_tun); +int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn, + struct qed_tunnel_info *p_tunn); + +u32 qed_vf_hw_bar_size(struct qed_hwfn *p_hwfn, enum BAR_ID bar_id); +#else +static inline void qed_vf_get_link_params(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_params *params) +{ +} + +static inline void qed_vf_get_link_state(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_state *link) +{ +} + +static inline void +qed_vf_get_link_caps(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_capabilities *p_link_caps) +{ +} + +static inline void qed_vf_get_num_rxqs(struct qed_hwfn *p_hwfn, u8 *num_rxqs) +{ +} + +static inline void qed_vf_get_num_txqs(struct qed_hwfn *p_hwfn, u8 *num_txqs) +{ +} + +static inline void qed_vf_get_num_cids(struct qed_hwfn *p_hwfn, u8 *num_cids) +{ +} + +static inline void qed_vf_get_port_mac(struct qed_hwfn *p_hwfn, u8 *port_mac) +{ +} + +static inline void qed_vf_get_num_vlan_filters(struct qed_hwfn *p_hwfn, + u8 *num_vlan_filters) +{ +} + +static inline void qed_vf_get_num_mac_filters(struct qed_hwfn *p_hwfn, + u8 *num_mac_filters) +{ +} + +static inline bool qed_vf_check_mac(struct qed_hwfn *p_hwfn, u8 *mac) +{ + return false; +} + +static inline void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn, + u16 *fw_major, u16 *fw_minor, + u16 *fw_rev, u16 *fw_eng) +{ +} + +static inline int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn) +{ + return -EINVAL; +} + +static inline int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn, + struct 
qed_queue_cid *p_cid, + u16 bd_max_bytes, + dma_addr_t bd_chain_phys_adr, + dma_addr_t cqe_pbl_addr, + u16 cqe_pbl_size, void __iomem **pp_prod) +{ + return -EINVAL; +} + +static inline int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + dma_addr_t pbl_addr, + u16 pbl_size, void __iomem **pp_doorbell) +{ + return -EINVAL; +} + +static inline int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid, + bool cqe_completion) +{ + return -EINVAL; +} + +static inline int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, + struct qed_queue_cid *p_cid) +{ + return -EINVAL; +} + +static inline int +qed_vf_pf_vport_update(struct qed_hwfn *p_hwfn, + struct qed_sp_vport_update_params *p_params) +{ + return -EINVAL; +} + +static inline int qed_vf_pf_reset(struct qed_hwfn *p_hwfn) +{ + return -EINVAL; +} + +static inline int qed_vf_pf_release(struct qed_hwfn *p_hwfn) +{ + return -EINVAL; +} + +static inline u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id) +{ + return 0; +} + +static inline void qed_vf_set_sb_info(struct qed_hwfn *p_hwfn, u16 sb_id, + struct qed_sb_info *p_sb) +{ +} + +static inline int qed_vf_pf_vport_start(struct qed_hwfn *p_hwfn, + u8 vport_id, + u16 mtu, + u8 inner_vlan_removal, + enum qed_tpa_mode tpa_mode, + u8 max_buffers_per_cqe, + u8 only_untagged) +{ + return -EINVAL; +} + +static inline int qed_vf_pf_vport_stop(struct qed_hwfn *p_hwfn) +{ + return -EINVAL; +} + +static inline int qed_vf_pf_filter_ucast(struct qed_hwfn *p_hwfn, + struct qed_filter_ucast *p_param) +{ + return -EINVAL; +} + +static inline void qed_vf_pf_filter_mcast(struct qed_hwfn *p_hwfn, + struct qed_filter_mcast *p_filter_cmd) +{ +} + +static inline int qed_vf_pf_int_cleanup(struct qed_hwfn *p_hwfn) +{ + return -EINVAL; +} + +static inline void __qed_vf_get_link_params(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_params + *p_params, + struct qed_bulletin_content + *p_bulletin) +{ +} + +static inline void __qed_vf_get_link_state(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_state *p_link, + struct qed_bulletin_content + *p_bulletin) +{ +} + +static inline void +__qed_vf_get_link_caps(struct qed_hwfn *p_hwfn, + struct qed_mcp_link_capabilities *p_link_caps, + struct qed_bulletin_content *p_bulletin) +{ +} + +static inline void qed_iov_vf_task(struct work_struct *work) +{ +} + +static inline void +qed_vf_set_vf_start_tunn_update_param(struct qed_tunnel_info *p_tun) +{ +} + +static inline int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn, + struct qed_tunnel_info *p_tunn) +{ + return -EINVAL; +} + +static inline u32 +qed_vf_hw_bar_size(struct qed_hwfn *p_hwfn, + enum BAR_ID bar_id) +{ + return 0; +} +#endif + +#endif diff --git a/drivers/net/ethernet/qlogic/qede/Makefile b/drivers/net/ethernet/qlogic/qede/Makefile new file mode 100644 index 0000000..75408fb --- /dev/null +++ b/drivers/net/ethernet/qlogic/qede/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_QEDE) := qede.o + +qede-y := qede_main.o qede_fp.o qede_filter.o qede_ethtool.o qede_ptp.o +qede-$(CONFIG_DCB) += qede_dcbnl.o +qede-$(CONFIG_QED_RDMA) += qede_rdma.o diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h new file mode 100644 index 0000000..9935978 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -0,0 +1,546 @@ +/* QLogic qede NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _QEDE_H_ +#define _QEDE_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_RFS_ACCEL +#include +#endif +#include +#include +#include +#include +#include + +#define QEDE_MAJOR_VERSION 8 +#define QEDE_MINOR_VERSION 33 +#define QEDE_REVISION_VERSION 0 +#define QEDE_ENGINEERING_VERSION 20 +#define DRV_MODULE_VERSION __stringify(QEDE_MAJOR_VERSION) "." \ + __stringify(QEDE_MINOR_VERSION) "." \ + __stringify(QEDE_REVISION_VERSION) "." 
\ + __stringify(QEDE_ENGINEERING_VERSION) + +#define DRV_MODULE_SYM qede + +struct qede_stats_common { + u64 no_buff_discards; + u64 packet_too_big_discard; + u64 ttl0_discard; + u64 rx_ucast_bytes; + u64 rx_mcast_bytes; + u64 rx_bcast_bytes; + u64 rx_ucast_pkts; + u64 rx_mcast_pkts; + u64 rx_bcast_pkts; + u64 mftag_filter_discards; + u64 mac_filter_discards; + u64 tx_ucast_bytes; + u64 tx_mcast_bytes; + u64 tx_bcast_bytes; + u64 tx_ucast_pkts; + u64 tx_mcast_pkts; + u64 tx_bcast_pkts; + u64 tx_err_drop_pkts; + u64 coalesced_pkts; + u64 coalesced_events; + u64 coalesced_aborts_num; + u64 non_coalesced_pkts; + u64 coalesced_bytes; + + /* port */ + u64 rx_64_byte_packets; + u64 rx_65_to_127_byte_packets; + u64 rx_128_to_255_byte_packets; + u64 rx_256_to_511_byte_packets; + u64 rx_512_to_1023_byte_packets; + u64 rx_1024_to_1518_byte_packets; + u64 rx_crc_errors; + u64 rx_mac_crtl_frames; + u64 rx_pause_frames; + u64 rx_pfc_frames; + u64 rx_align_errors; + u64 rx_carrier_errors; + u64 rx_oversize_packets; + u64 rx_jabbers; + u64 rx_undersize_packets; + u64 rx_fragments; + u64 tx_64_byte_packets; + u64 tx_65_to_127_byte_packets; + u64 tx_128_to_255_byte_packets; + u64 tx_256_to_511_byte_packets; + u64 tx_512_to_1023_byte_packets; + u64 tx_1024_to_1518_byte_packets; + u64 tx_pause_frames; + u64 tx_pfc_frames; + u64 brb_truncates; + u64 brb_discards; + u64 tx_mac_ctrl_frames; +}; + +struct qede_stats_bb { + u64 rx_1519_to_1522_byte_packets; + u64 rx_1519_to_2047_byte_packets; + u64 rx_2048_to_4095_byte_packets; + u64 rx_4096_to_9216_byte_packets; + u64 rx_9217_to_16383_byte_packets; + u64 tx_1519_to_2047_byte_packets; + u64 tx_2048_to_4095_byte_packets; + u64 tx_4096_to_9216_byte_packets; + u64 tx_9217_to_16383_byte_packets; + u64 tx_lpi_entry_count; + u64 tx_total_collisions; +}; + +struct qede_stats_ah { + u64 rx_1519_to_max_byte_packets; + u64 tx_1519_to_max_byte_packets; +}; + +struct qede_stats { + struct qede_stats_common common; + + union { + struct qede_stats_bb bb; + struct qede_stats_ah ah; + }; +}; + +struct qede_vlan { + struct list_head list; + u16 vid; + bool configured; +}; + +struct qede_rdma_dev { + struct qedr_dev *qedr_dev; + struct list_head entry; + struct list_head rdma_event_list; + struct workqueue_struct *rdma_wq; +}; + +struct qede_ptp; + +#define QEDE_RFS_MAX_FLTR 256 + +struct qede_dev { + struct qed_dev *cdev; + struct net_device *ndev; + struct pci_dev *pdev; + + u32 dp_module; + u8 dp_level; + + unsigned long flags; +#define QEDE_FLAG_IS_VF BIT(0) +#define IS_VF(edev) (!!((edev)->flags & QEDE_FLAG_IS_VF)) +#define QEDE_TX_TIMESTAMPING_EN BIT(1) +#define QEDE_FLAGS_PTP_TX_IN_PRORGESS BIT(2) + + const struct qed_eth_ops *ops; + struct qede_ptp *ptp; + + struct qed_dev_eth_info dev_info; +#define QEDE_MAX_RSS_CNT(edev) ((edev)->dev_info.num_queues) +#define QEDE_MAX_TSS_CNT(edev) ((edev)->dev_info.num_queues) +#define QEDE_IS_BB(edev) \ + ((edev)->dev_info.common.dev_type == QED_DEV_TYPE_BB) +#define QEDE_IS_AH(edev) \ + ((edev)->dev_info.common.dev_type == QED_DEV_TYPE_AH) + + struct qede_fastpath *fp_array; + u8 req_num_tx; + u8 fp_num_tx; + u8 req_num_rx; + u8 fp_num_rx; + u16 req_queues; + u16 num_queues; +#define QEDE_QUEUE_CNT(edev) ((edev)->num_queues) +#define QEDE_RSS_COUNT(edev) ((edev)->num_queues - (edev)->fp_num_tx) +#define QEDE_RX_QUEUE_IDX(edev, i) (i) +#define QEDE_TSS_COUNT(edev) ((edev)->num_queues - (edev)->fp_num_rx) + + struct qed_int_info int_info; + + /* Smaller private varaiant of the RTNL lock */ + struct mutex qede_lock; + u32 state; /* 
Protected by qede_lock */ + u16 rx_buf_size; + u32 rx_copybreak; + + /* L2 header size + 2*VLANs (8 bytes) + LLC SNAP (8 bytes) */ +#define ETH_OVERHEAD (ETH_HLEN + 8 + 8) + /* Max supported alignment is 256 (8 shift) + * minimal alignment shift 6 is optimal for 57xxx HW performance + */ +#define QEDE_RX_ALIGN_SHIFT max(6, min(8, L1_CACHE_SHIFT)) + /* We assume skb_build() uses sizeof(struct skb_shared_info) bytes + * at the end of skb->data, to avoid wasting a full cache line. + * This reduces memory use (skb->truesize). + */ +#define QEDE_FW_RX_ALIGN_END \ + max_t(u64, 1UL << QEDE_RX_ALIGN_SHIFT, \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + + struct qede_stats stats; +#define QEDE_RSS_INDIR_INITED BIT(0) +#define QEDE_RSS_KEY_INITED BIT(1) +#define QEDE_RSS_CAPS_INITED BIT(2) + u32 rss_params_inited; /* bit-field to track initialized rss params */ + u16 rss_ind_table[128]; + u32 rss_key[10]; + u8 rss_caps; + + u16 q_num_rx_buffers; /* Must be a power of two */ + u16 q_num_tx_buffers; /* Must be a power of two */ + + bool gro_disable; + struct list_head vlan_list; + u16 configured_vlans; + u16 non_configured_vlans; + bool accept_any_vlan; + struct delayed_work sp_task; + unsigned long sp_flags; + u16 vxlan_dst_port; + u16 geneve_dst_port; + + struct qede_arfs *arfs; + bool wol_enabled; + + struct qede_rdma_dev rdma_info; + + struct bpf_prog *xdp_prog; +}; + +enum QEDE_STATE { + QEDE_STATE_CLOSED, + QEDE_STATE_OPEN, +}; + +#define HILO_U64(hi, lo) ((((u64)(hi)) << 32) + (lo)) + +#define MAX_NUM_TC 8 +#define MAX_NUM_PRI 8 + +/* The driver supports the new build_skb() API: + * RX ring buffer contains pointer to kmalloc() data only, + * skb are built only after the frame was DMA-ed. + */ +struct sw_rx_data { + struct page *data; + dma_addr_t mapping; + unsigned int page_offset; +}; + +enum qede_agg_state { + QEDE_AGG_STATE_NONE = 0, + QEDE_AGG_STATE_START = 1, + QEDE_AGG_STATE_ERROR = 2 +}; + +struct qede_agg_info { + /* rx_buf is a data buffer that can be placed / consumed from rx bd + * chain. It has two purposes: We will preallocate the data buffer + * for each aggregation when we open the interface and will place this + * buffer on the rx-bd-ring when we receive TPA_START. We don't want + * to be in a state where allocation fails, as we can't reuse the + * consumer buffer in the rx-chain since FW may still be writing to it + * (since header needs to be modified for TPA). + * The second purpose is to keep a pointer to the bd buffer during + * aggregation. 
+ */ + struct sw_rx_data buffer; + dma_addr_t buffer_mapping; + + struct sk_buff *skb; + + /* We need some structs from the start cookie until termination */ + u16 vlan_tag; + u16 start_cqe_bd_len; + u8 start_cqe_placement_offset; + + u8 state; + u8 frag_id; + + u8 tunnel_type; +}; + +struct qede_rx_queue { + __le16 *hw_cons_ptr; + void __iomem *hw_rxq_prod_addr; + + /* Required for the allocation of replacement buffers */ + struct device *dev; + + struct bpf_prog *xdp_prog; + + u16 sw_rx_cons; + u16 sw_rx_prod; + + u16 filled_buffers; + u8 data_direction; + u8 rxq_id; + + /* Used once per each NAPI run */ + u16 num_rx_buffers; + + u16 rx_headroom; + + u32 rx_buf_size; + u32 rx_buf_seg_size; + + struct sw_rx_data *sw_rx_ring; + struct qed_chain rx_bd_ring; + struct qed_chain rx_comp_ring ____cacheline_aligned; + + /* GRO */ + struct qede_agg_info tpa_info[ETH_TPA_MAX_AGGS_NUM]; + + /* Used once per each NAPI run */ + u64 rcv_pkts; + + u64 rx_hw_errors; + u64 rx_alloc_errors; + u64 rx_ip_frags; + + u64 xdp_no_pass; + + void *handle; + struct xdp_rxq_info xdp_rxq; +}; + +union db_prod { + struct eth_db_data data; + u32 raw; +}; + +struct sw_tx_bd { + struct sk_buff *skb; + u8 flags; +/* Set on the first BD descriptor when there is a split BD */ +#define QEDE_TSO_SPLIT_BD BIT(0) +}; + +struct sw_tx_xdp { + struct page *page; + dma_addr_t mapping; +}; + +struct qede_tx_queue { + u8 is_xdp; + bool is_legacy; + u16 sw_tx_cons; + u16 sw_tx_prod; + u16 num_tx_buffers; /* Slowpath only */ + + u64 xmit_pkts; + u64 stopped_cnt; + + __le16 *hw_cons_ptr; + + /* Needed for the mapping of packets */ + struct device *dev; + + void __iomem *doorbell_addr; + union db_prod tx_db; + int index; /* Slowpath only */ +#define QEDE_TXQ_XDP_TO_IDX(edev, txq) ((txq)->index - \ + QEDE_MAX_TSS_CNT(edev)) +#define QEDE_TXQ_IDX_TO_XDP(edev, idx) ((idx) + QEDE_MAX_TSS_CNT(edev)) + + /* Regular Tx requires skb + metadata for release purpose, + * while XDP requires the pages and the mapped address. 
+ */ + union { + struct sw_tx_bd *skbs; + struct sw_tx_xdp *xdp; + } sw_tx_ring; + + struct qed_chain tx_pbl; + + /* Slowpath; Should be kept in end [unless missing padding] */ + void *handle; +}; + +#define BD_UNMAP_ADDR(bd) HILO_U64(le32_to_cpu((bd)->addr.hi), \ + le32_to_cpu((bd)->addr.lo)) +#define BD_SET_UNMAP_ADDR_LEN(bd, maddr, len) \ + do { \ + (bd)->addr.hi = cpu_to_le32(upper_32_bits(maddr)); \ + (bd)->addr.lo = cpu_to_le32(lower_32_bits(maddr)); \ + (bd)->nbytes = cpu_to_le16(len); \ + } while (0) +#define BD_UNMAP_LEN(bd) (le16_to_cpu((bd)->nbytes)) + +struct qede_fastpath { + struct qede_dev *edev; +#define QEDE_FASTPATH_TX BIT(0) +#define QEDE_FASTPATH_RX BIT(1) +#define QEDE_FASTPATH_XDP BIT(2) +#define QEDE_FASTPATH_COMBINED (QEDE_FASTPATH_TX | QEDE_FASTPATH_RX) + u8 type; + u8 id; + u8 xdp_xmit; + struct napi_struct napi; + struct qed_sb_info *sb_info; + struct qede_rx_queue *rxq; + struct qede_tx_queue *txq; + struct qede_tx_queue *xdp_tx; + +#define VEC_NAME_SIZE (sizeof(((struct net_device *)0)->name) + 8) + char name[VEC_NAME_SIZE]; +}; + +/* Debug print definitions */ +#define DP_NAME(edev) ((edev)->ndev->name) + +#define XMIT_PLAIN 0 +#define XMIT_L4_CSUM BIT(0) +#define XMIT_LSO BIT(1) +#define XMIT_ENC BIT(2) +#define XMIT_ENC_GSO_L4_CSUM BIT(3) + +#define QEDE_CSUM_ERROR BIT(0) +#define QEDE_CSUM_UNNECESSARY BIT(1) +#define QEDE_TUNN_CSUM_UNNECESSARY BIT(2) + +#define QEDE_SP_RX_MODE 1 + +#ifdef CONFIG_RFS_ACCEL +int qede_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, + u16 rxq_index, u32 flow_id); +#define QEDE_SP_ARFS_CONFIG 4 +#define QEDE_SP_TASK_POLL_DELAY (5 * HZ) +#endif + +void qede_process_arfs_filters(struct qede_dev *edev, bool free_fltr); +void qede_poll_for_freeing_arfs_filters(struct qede_dev *edev); +void qede_arfs_filter_op(void *dev, void *filter, u8 fw_rc); +void qede_free_arfs(struct qede_dev *edev); +int qede_alloc_arfs(struct qede_dev *edev); +int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info); +int qede_del_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info); +int qede_get_cls_rule_entry(struct qede_dev *edev, struct ethtool_rxnfc *cmd); +int qede_get_cls_rule_all(struct qede_dev *edev, struct ethtool_rxnfc *info, + u32 *rule_locs); +int qede_get_arfs_filter_count(struct qede_dev *edev); + +struct qede_reload_args { + void (*func)(struct qede_dev *edev, struct qede_reload_args *args); + union { + netdev_features_t features; + struct bpf_prog *new_prog; + u16 mtu; + } u; +}; + +/* Datapath functions definition */ +netdev_tx_t qede_start_xmit(struct sk_buff *skb, struct net_device *ndev); +netdev_features_t qede_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features); +void qede_tx_log_print(struct qede_dev *edev, struct qede_fastpath *fp); +int qede_alloc_rx_buffer(struct qede_rx_queue *rxq, bool allow_lazy); +int qede_free_tx_pkt(struct qede_dev *edev, + struct qede_tx_queue *txq, int *len); +int qede_poll(struct napi_struct *napi, int budget); +irqreturn_t qede_msix_fp_int(int irq, void *fp_cookie); + +/* Filtering function definitions */ +void qede_force_mac(void *dev, u8 *mac, bool forced); +void qede_udp_ports_update(void *dev, u16 vxlan_port, u16 geneve_port); +int qede_set_mac_addr(struct net_device *ndev, void *p); + +int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid); +int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid); +void qede_vlan_mark_nonconfigured(struct qede_dev *edev); +int 
qede_configure_vlan_filters(struct qede_dev *edev);
+
+netdev_features_t qede_fix_features(struct net_device *dev,
+				    netdev_features_t features);
+int qede_set_features(struct net_device *dev, netdev_features_t features);
+void qede_set_rx_mode(struct net_device *ndev);
+void qede_config_rx_mode(struct net_device *ndev);
+void qede_fill_rss_params(struct qede_dev *edev,
+			  struct qed_update_vport_rss_params *rss, u8 *update);
+
+void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti);
+void qede_udp_tunnel_del(struct net_device *dev, struct udp_tunnel_info *ti);
+
+int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp);
+
+#ifdef CONFIG_DCB
+void qede_set_dcbnl_ops(struct net_device *ndev);
+#endif
+
+void qede_config_debug(uint debug, u32 *p_dp_module, u8 *p_dp_level);
+void qede_set_ethtool_ops(struct net_device *netdev);
+void qede_reload(struct qede_dev *edev,
+		 struct qede_reload_args *args, bool is_locked);
+int qede_change_mtu(struct net_device *dev, int new_mtu);
+void qede_fill_by_demand_stats(struct qede_dev *edev);
+void __qede_lock(struct qede_dev *edev);
+void __qede_unlock(struct qede_dev *edev);
+bool qede_has_rx_work(struct qede_rx_queue *rxq);
+int qede_txq_has_work(struct qede_tx_queue *txq);
+void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq, u8 count);
+void qede_update_rx_prod(struct qede_dev *edev, struct qede_rx_queue *rxq);
+
+#define RX_RING_SIZE_POW 13
+#define RX_RING_SIZE ((u16)BIT(RX_RING_SIZE_POW))
+#define NUM_RX_BDS_MAX (RX_RING_SIZE - 1)
+#define NUM_RX_BDS_MIN 128
+#define NUM_RX_BDS_DEF ((u16)BIT(10) - 1)
+
+#define TX_RING_SIZE_POW 13
+#define TX_RING_SIZE ((u16)BIT(TX_RING_SIZE_POW))
+#define NUM_TX_BDS_MAX (TX_RING_SIZE - 1)
+#define NUM_TX_BDS_MIN 128
+#define NUM_TX_BDS_DEF NUM_TX_BDS_MAX
+
+#define QEDE_MIN_PKT_LEN 64
+#define QEDE_RX_HDR_SIZE 256
+#define QEDE_MAX_JUMBO_PACKET_SIZE 9600
+#define for_each_queue(i) for (i = 0; i < edev->num_queues; i++)
+
+#endif /* _QEDE_H_ */
diff --git a/drivers/net/ethernet/qlogic/qede/qede_dcbnl.c b/drivers/net/ethernet/qlogic/qede/qede_dcbnl.c
new file mode 100644
index 0000000..6e7747b
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qede/qede_dcbnl.c
@@ -0,0 +1,352 @@
+/* QLogic qede NIC Driver
+* Copyright (c) 2015 QLogic Corporation
+*
+* This software is available under the terms of the GNU General Public License
+* (GPL) Version 2, available from the file COPYING in the main directory of
+* this source tree.
+*/
+
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/dcbnl.h>
+#include "qede.h"
+
+static u8 qede_dcbnl_getstate(struct net_device *netdev)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	return edev->ops->dcb->getstate(edev->cdev);
+}
+
+static u8 qede_dcbnl_setstate(struct net_device *netdev, u8 state)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	return edev->ops->dcb->setstate(edev->cdev, state);
+}
+
+static void qede_dcbnl_getpermhwaddr(struct net_device *netdev,
+				     u8 *perm_addr)
+{
+	memcpy(perm_addr, netdev->dev_addr, netdev->addr_len);
+}
+
+static void qede_dcbnl_getpgtccfgtx(struct net_device *netdev, int prio,
+				    u8 *prio_type, u8 *pgid, u8 *bw_pct,
+				    u8 *up_map)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	edev->ops->dcb->getpgtccfgtx(edev->cdev, prio, prio_type,
+				     pgid, bw_pct, up_map);
+}
+
+static void qede_dcbnl_getpgbwgcfgtx(struct net_device *netdev,
+				     int pgid, u8 *bw_pct)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	edev->ops->dcb->getpgbwgcfgtx(edev->cdev, pgid, bw_pct);
+}
+
+static void qede_dcbnl_getpgtccfgrx(struct net_device *netdev, int prio,
+				    u8 *prio_type, u8 *pgid, u8 *bw_pct,
+				    u8 *up_map)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	edev->ops->dcb->getpgtccfgrx(edev->cdev, prio, prio_type, pgid, bw_pct,
+				     up_map);
+}
+
+static void qede_dcbnl_getpgbwgcfgrx(struct net_device *netdev,
+				     int pgid, u8 *bw_pct)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	edev->ops->dcb->getpgbwgcfgrx(edev->cdev, pgid, bw_pct);
+}
+
+static void qede_dcbnl_getpfccfg(struct net_device *netdev, int prio,
+				 u8 *setting)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	edev->ops->dcb->getpfccfg(edev->cdev, prio, setting);
+}
+
+static void qede_dcbnl_setpfccfg(struct net_device *netdev, int prio,
+				 u8 setting)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	edev->ops->dcb->setpfccfg(edev->cdev, prio, setting);
+}
+
+static u8 qede_dcbnl_getcap(struct net_device *netdev, int capid, u8 *cap)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	return edev->ops->dcb->getcap(edev->cdev, capid, cap);
+}
+
+static int qede_dcbnl_getnumtcs(struct net_device *netdev, int tcid, u8 *num)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	return edev->ops->dcb->getnumtcs(edev->cdev, tcid, num);
+}
+
+static u8 qede_dcbnl_getpfcstate(struct net_device *netdev)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	return edev->ops->dcb->getpfcstate(edev->cdev);
+}
+
+static int qede_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	return edev->ops->dcb->getapp(edev->cdev, idtype, id);
+}
+
+static u8 qede_dcbnl_getdcbx(struct net_device *netdev)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	return edev->ops->dcb->getdcbx(edev->cdev);
+}
+
+static void qede_dcbnl_setpgtccfgtx(struct net_device *netdev, int prio,
+				    u8 pri_type, u8 pgid, u8 bw_pct, u8 up_map)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	return edev->ops->dcb->setpgtccfgtx(edev->cdev, prio, pri_type, pgid,
+					    bw_pct, up_map);
+}
+
+static void qede_dcbnl_setpgtccfgrx(struct net_device *netdev, int prio,
+				    u8 pri_type, u8 pgid, u8 bw_pct, u8 up_map)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	return edev->ops->dcb->setpgtccfgrx(edev->cdev, prio, pri_type, pgid,
+					    bw_pct, up_map);
+}
+
+static void qede_dcbnl_setpgbwgcfgtx(struct net_device *netdev, int pgid,
+				     u8 bw_pct)
+{
+	struct qede_dev *edev = netdev_priv(netdev);
+
+	return edev->ops->dcb->setpgbwgcfgtx(edev->cdev, pgid, bw_pct);
+}
+
+static void qede_dcbnl_setpgbwgcfgrx(struct net_device *netdev, int pgid, + u8 bw_pct) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->setpgbwgcfgrx(edev->cdev, pgid, bw_pct); +} + +static u8 qede_dcbnl_setall(struct net_device *netdev) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->setall(edev->cdev); +} + +static int qede_dcbnl_setnumtcs(struct net_device *netdev, int tcid, u8 num) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->setnumtcs(edev->cdev, tcid, num); +} + +static void qede_dcbnl_setpfcstate(struct net_device *netdev, u8 state) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->setpfcstate(edev->cdev, state); +} + +static int qede_dcbnl_setapp(struct net_device *netdev, u8 idtype, u16 idval, + u8 up) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->setapp(edev->cdev, idtype, idval, up); +} + +static u8 qede_dcbnl_setdcbx(struct net_device *netdev, u8 state) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->setdcbx(edev->cdev, state); +} + +static u8 qede_dcbnl_getfeatcfg(struct net_device *netdev, int featid, + u8 *flags) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->getfeatcfg(edev->cdev, featid, flags); +} + +static u8 qede_dcbnl_setfeatcfg(struct net_device *netdev, int featid, u8 flags) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->setfeatcfg(edev->cdev, featid, flags); +} + +static int qede_dcbnl_peer_getappinfo(struct net_device *netdev, + struct dcb_peer_app_info *info, + u16 *count) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->peer_getappinfo(edev->cdev, info, count); +} + +static int qede_dcbnl_peer_getapptable(struct net_device *netdev, + struct dcb_app *app) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->peer_getapptable(edev->cdev, app); +} + +static int qede_dcbnl_cee_peer_getpfc(struct net_device *netdev, + struct cee_pfc *pfc) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->cee_peer_getpfc(edev->cdev, pfc); +} + +static int qede_dcbnl_cee_peer_getpg(struct net_device *netdev, + struct cee_pg *pg) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->cee_peer_getpg(edev->cdev, pg); +} + +static int qede_dcbnl_ieee_getpfc(struct net_device *netdev, + struct ieee_pfc *pfc) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->ieee_getpfc(edev->cdev, pfc); +} + +static int qede_dcbnl_ieee_setpfc(struct net_device *netdev, + struct ieee_pfc *pfc) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->ieee_setpfc(edev->cdev, pfc); +} + +static int qede_dcbnl_ieee_getets(struct net_device *netdev, + struct ieee_ets *ets) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->ieee_getets(edev->cdev, ets); +} + +static int qede_dcbnl_ieee_setets(struct net_device *netdev, + struct ieee_ets *ets) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->ieee_setets(edev->cdev, ets); +} + +static int qede_dcbnl_ieee_getapp(struct net_device *netdev, + struct dcb_app *app) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->ieee_getapp(edev->cdev, app); +} + +static int qede_dcbnl_ieee_setapp(struct net_device *netdev, + struct dcb_app *app) +{ + struct qede_dev *edev = netdev_priv(netdev); + int err; + + err = 
dcb_ieee_setapp(netdev, app); + if (err) + return err; + + return edev->ops->dcb->ieee_setapp(edev->cdev, app); +} + +static int qede_dcbnl_ieee_peer_getpfc(struct net_device *netdev, + struct ieee_pfc *pfc) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->ieee_peer_getpfc(edev->cdev, pfc); +} + +static int qede_dcbnl_ieee_peer_getets(struct net_device *netdev, + struct ieee_ets *ets) +{ + struct qede_dev *edev = netdev_priv(netdev); + + return edev->ops->dcb->ieee_peer_getets(edev->cdev, ets); +} + +static const struct dcbnl_rtnl_ops qede_dcbnl_ops = { + .ieee_getpfc = qede_dcbnl_ieee_getpfc, + .ieee_setpfc = qede_dcbnl_ieee_setpfc, + .ieee_getets = qede_dcbnl_ieee_getets, + .ieee_setets = qede_dcbnl_ieee_setets, + .ieee_getapp = qede_dcbnl_ieee_getapp, + .ieee_setapp = qede_dcbnl_ieee_setapp, + .ieee_peer_getpfc = qede_dcbnl_ieee_peer_getpfc, + .ieee_peer_getets = qede_dcbnl_ieee_peer_getets, + .getstate = qede_dcbnl_getstate, + .setstate = qede_dcbnl_setstate, + .getpermhwaddr = qede_dcbnl_getpermhwaddr, + .getpgtccfgtx = qede_dcbnl_getpgtccfgtx, + .getpgbwgcfgtx = qede_dcbnl_getpgbwgcfgtx, + .getpgtccfgrx = qede_dcbnl_getpgtccfgrx, + .getpgbwgcfgrx = qede_dcbnl_getpgbwgcfgrx, + .getpfccfg = qede_dcbnl_getpfccfg, + .setpfccfg = qede_dcbnl_setpfccfg, + .getcap = qede_dcbnl_getcap, + .getnumtcs = qede_dcbnl_getnumtcs, + .getpfcstate = qede_dcbnl_getpfcstate, + .getapp = qede_dcbnl_getapp, + .getdcbx = qede_dcbnl_getdcbx, + .setpgtccfgtx = qede_dcbnl_setpgtccfgtx, + .setpgtccfgrx = qede_dcbnl_setpgtccfgrx, + .setpgbwgcfgtx = qede_dcbnl_setpgbwgcfgtx, + .setpgbwgcfgrx = qede_dcbnl_setpgbwgcfgrx, + .setall = qede_dcbnl_setall, + .setnumtcs = qede_dcbnl_setnumtcs, + .setpfcstate = qede_dcbnl_setpfcstate, + .setapp = qede_dcbnl_setapp, + .setdcbx = qede_dcbnl_setdcbx, + .setfeatcfg = qede_dcbnl_setfeatcfg, + .getfeatcfg = qede_dcbnl_getfeatcfg, + .peer_getappinfo = qede_dcbnl_peer_getappinfo, + .peer_getapptable = qede_dcbnl_peer_getapptable, + .cee_peer_getpfc = qede_dcbnl_cee_peer_getpfc, + .cee_peer_getpg = qede_dcbnl_cee_peer_getpg, +}; + +void qede_set_dcbnl_ops(struct net_device *dev) +{ + dev->dcbnl_ops = &qede_dcbnl_ops; +} diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c new file mode 100644 index 0000000..ecbf1de --- /dev/null +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -0,0 +1,1854 @@ +/* QLogic qede NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qede.h" +#include "qede_ptp.h" + +#define QEDE_RQSTAT_OFFSET(stat_name) \ + (offsetof(struct qede_rx_queue, stat_name)) +#define QEDE_RQSTAT_STRING(stat_name) (#stat_name) +#define QEDE_RQSTAT(stat_name) \ + {QEDE_RQSTAT_OFFSET(stat_name), QEDE_RQSTAT_STRING(stat_name)} + +#define QEDE_SELFTEST_POLL_COUNT 100 + +static const struct { + u64 offset; + char string[ETH_GSTRING_LEN]; +} qede_rqstats_arr[] = { + QEDE_RQSTAT(rcv_pkts), + QEDE_RQSTAT(rx_hw_errors), + QEDE_RQSTAT(rx_alloc_errors), + QEDE_RQSTAT(rx_ip_frags), + QEDE_RQSTAT(xdp_no_pass), +}; + +#define QEDE_NUM_RQSTATS ARRAY_SIZE(qede_rqstats_arr) +#define QEDE_TQSTAT_OFFSET(stat_name) \ + (offsetof(struct qede_tx_queue, stat_name)) +#define QEDE_TQSTAT_STRING(stat_name) (#stat_name) +#define QEDE_TQSTAT(stat_name) \ + {QEDE_TQSTAT_OFFSET(stat_name), QEDE_TQSTAT_STRING(stat_name)} +#define QEDE_NUM_TQSTATS ARRAY_SIZE(qede_tqstats_arr) +static const struct { + u64 offset; + char string[ETH_GSTRING_LEN]; +} qede_tqstats_arr[] = { + QEDE_TQSTAT(xmit_pkts), + QEDE_TQSTAT(stopped_cnt), +}; + +#define QEDE_STAT_OFFSET(stat_name, type, base) \ + (offsetof(type, stat_name) + (base)) +#define QEDE_STAT_STRING(stat_name) (#stat_name) +#define _QEDE_STAT(stat_name, type, base, attr) \ + {QEDE_STAT_OFFSET(stat_name, type, base), \ + QEDE_STAT_STRING(stat_name), \ + attr} +#define QEDE_STAT(stat_name) \ + _QEDE_STAT(stat_name, struct qede_stats_common, 0, 0x0) +#define QEDE_PF_STAT(stat_name) \ + _QEDE_STAT(stat_name, struct qede_stats_common, 0, \ + BIT(QEDE_STAT_PF_ONLY)) +#define QEDE_PF_BB_STAT(stat_name) \ + _QEDE_STAT(stat_name, struct qede_stats_bb, \ + offsetof(struct qede_stats, bb), \ + BIT(QEDE_STAT_PF_ONLY) | BIT(QEDE_STAT_BB_ONLY)) +#define QEDE_PF_AH_STAT(stat_name) \ + _QEDE_STAT(stat_name, struct qede_stats_ah, \ + offsetof(struct qede_stats, ah), \ + BIT(QEDE_STAT_PF_ONLY) | BIT(QEDE_STAT_AH_ONLY)) +static const struct { + u64 offset; + char string[ETH_GSTRING_LEN]; + unsigned long attr; +#define QEDE_STAT_PF_ONLY 0 +#define QEDE_STAT_BB_ONLY 1 +#define QEDE_STAT_AH_ONLY 2 +} qede_stats_arr[] = { + QEDE_STAT(rx_ucast_bytes), + QEDE_STAT(rx_mcast_bytes), + QEDE_STAT(rx_bcast_bytes), + QEDE_STAT(rx_ucast_pkts), + QEDE_STAT(rx_mcast_pkts), + QEDE_STAT(rx_bcast_pkts), + + QEDE_STAT(tx_ucast_bytes), + QEDE_STAT(tx_mcast_bytes), + QEDE_STAT(tx_bcast_bytes), + QEDE_STAT(tx_ucast_pkts), + QEDE_STAT(tx_mcast_pkts), + QEDE_STAT(tx_bcast_pkts), + + QEDE_PF_STAT(rx_64_byte_packets), + QEDE_PF_STAT(rx_65_to_127_byte_packets), + QEDE_PF_STAT(rx_128_to_255_byte_packets), + QEDE_PF_STAT(rx_256_to_511_byte_packets), + QEDE_PF_STAT(rx_512_to_1023_byte_packets), + QEDE_PF_STAT(rx_1024_to_1518_byte_packets), + QEDE_PF_BB_STAT(rx_1519_to_1522_byte_packets), + QEDE_PF_BB_STAT(rx_1519_to_2047_byte_packets), + QEDE_PF_BB_STAT(rx_2048_to_4095_byte_packets), + QEDE_PF_BB_STAT(rx_4096_to_9216_byte_packets), + QEDE_PF_BB_STAT(rx_9217_to_16383_byte_packets), + 
QEDE_PF_AH_STAT(rx_1519_to_max_byte_packets), + QEDE_PF_STAT(tx_64_byte_packets), + QEDE_PF_STAT(tx_65_to_127_byte_packets), + QEDE_PF_STAT(tx_128_to_255_byte_packets), + QEDE_PF_STAT(tx_256_to_511_byte_packets), + QEDE_PF_STAT(tx_512_to_1023_byte_packets), + QEDE_PF_STAT(tx_1024_to_1518_byte_packets), + QEDE_PF_BB_STAT(tx_1519_to_2047_byte_packets), + QEDE_PF_BB_STAT(tx_2048_to_4095_byte_packets), + QEDE_PF_BB_STAT(tx_4096_to_9216_byte_packets), + QEDE_PF_BB_STAT(tx_9217_to_16383_byte_packets), + QEDE_PF_AH_STAT(tx_1519_to_max_byte_packets), + QEDE_PF_STAT(rx_mac_crtl_frames), + QEDE_PF_STAT(tx_mac_ctrl_frames), + QEDE_PF_STAT(rx_pause_frames), + QEDE_PF_STAT(tx_pause_frames), + QEDE_PF_STAT(rx_pfc_frames), + QEDE_PF_STAT(tx_pfc_frames), + + QEDE_PF_STAT(rx_crc_errors), + QEDE_PF_STAT(rx_align_errors), + QEDE_PF_STAT(rx_carrier_errors), + QEDE_PF_STAT(rx_oversize_packets), + QEDE_PF_STAT(rx_jabbers), + QEDE_PF_STAT(rx_undersize_packets), + QEDE_PF_STAT(rx_fragments), + QEDE_PF_BB_STAT(tx_lpi_entry_count), + QEDE_PF_BB_STAT(tx_total_collisions), + QEDE_PF_STAT(brb_truncates), + QEDE_PF_STAT(brb_discards), + QEDE_STAT(no_buff_discards), + QEDE_PF_STAT(mftag_filter_discards), + QEDE_PF_STAT(mac_filter_discards), + QEDE_STAT(tx_err_drop_pkts), + QEDE_STAT(ttl0_discard), + QEDE_STAT(packet_too_big_discard), + + QEDE_STAT(coalesced_pkts), + QEDE_STAT(coalesced_events), + QEDE_STAT(coalesced_aborts_num), + QEDE_STAT(non_coalesced_pkts), + QEDE_STAT(coalesced_bytes), +}; + +#define QEDE_NUM_STATS ARRAY_SIZE(qede_stats_arr) +#define QEDE_STAT_IS_PF_ONLY(i) \ + test_bit(QEDE_STAT_PF_ONLY, &qede_stats_arr[i].attr) +#define QEDE_STAT_IS_BB_ONLY(i) \ + test_bit(QEDE_STAT_BB_ONLY, &qede_stats_arr[i].attr) +#define QEDE_STAT_IS_AH_ONLY(i) \ + test_bit(QEDE_STAT_AH_ONLY, &qede_stats_arr[i].attr) + +enum { + QEDE_PRI_FLAG_CMT, + QEDE_PRI_FLAG_LEN, +}; + +static const char qede_private_arr[QEDE_PRI_FLAG_LEN][ETH_GSTRING_LEN] = { + "Coupled-Function", +}; + +enum qede_ethtool_tests { + QEDE_ETHTOOL_INT_LOOPBACK, + QEDE_ETHTOOL_INTERRUPT_TEST, + QEDE_ETHTOOL_MEMORY_TEST, + QEDE_ETHTOOL_REGISTER_TEST, + QEDE_ETHTOOL_CLOCK_TEST, + QEDE_ETHTOOL_NVRAM_TEST, + QEDE_ETHTOOL_TEST_MAX +}; + +static const char qede_tests_str_arr[QEDE_ETHTOOL_TEST_MAX][ETH_GSTRING_LEN] = { + "Internal loopback (offline)", + "Interrupt (online)\t", + "Memory (online)\t\t", + "Register (online)\t", + "Clock (online)\t\t", + "Nvram (online)\t\t", +}; + +static void qede_get_strings_stats_txq(struct qede_dev *edev, + struct qede_tx_queue *txq, u8 **buf) +{ + int i; + + for (i = 0; i < QEDE_NUM_TQSTATS; i++) { + if (txq->is_xdp) + sprintf(*buf, "%d [XDP]: %s", + QEDE_TXQ_XDP_TO_IDX(edev, txq), + qede_tqstats_arr[i].string); + else + sprintf(*buf, "%d: %s", txq->index, + qede_tqstats_arr[i].string); + *buf += ETH_GSTRING_LEN; + } +} + +static void qede_get_strings_stats_rxq(struct qede_dev *edev, + struct qede_rx_queue *rxq, u8 **buf) +{ + int i; + + for (i = 0; i < QEDE_NUM_RQSTATS; i++) { + sprintf(*buf, "%d: %s", rxq->rxq_id, + qede_rqstats_arr[i].string); + *buf += ETH_GSTRING_LEN; + } +} + +static bool qede_is_irrelevant_stat(struct qede_dev *edev, int stat_index) +{ + return (IS_VF(edev) && QEDE_STAT_IS_PF_ONLY(stat_index)) || + (QEDE_IS_BB(edev) && QEDE_STAT_IS_AH_ONLY(stat_index)) || + (QEDE_IS_AH(edev) && QEDE_STAT_IS_BB_ONLY(stat_index)); +} + +static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf) +{ + struct qede_fastpath *fp; + int i; + + /* Account for queue statistics */ + for (i = 0; i < QEDE_QUEUE_CNT(edev); 
i++) { + fp = &edev->fp_array[i]; + + if (fp->type & QEDE_FASTPATH_RX) + qede_get_strings_stats_rxq(edev, fp->rxq, &buf); + + if (fp->type & QEDE_FASTPATH_XDP) + qede_get_strings_stats_txq(edev, fp->xdp_tx, &buf); + + if (fp->type & QEDE_FASTPATH_TX) + qede_get_strings_stats_txq(edev, fp->txq, &buf); + } + + /* Account for non-queue statistics */ + for (i = 0; i < QEDE_NUM_STATS; i++) { + if (qede_is_irrelevant_stat(edev, i)) + continue; + strcpy(buf, qede_stats_arr[i].string); + buf += ETH_GSTRING_LEN; + } +} + +static void qede_get_strings(struct net_device *dev, u32 stringset, u8 *buf) +{ + struct qede_dev *edev = netdev_priv(dev); + + switch (stringset) { + case ETH_SS_STATS: + qede_get_strings_stats(edev, buf); + break; + case ETH_SS_PRIV_FLAGS: + memcpy(buf, qede_private_arr, + ETH_GSTRING_LEN * QEDE_PRI_FLAG_LEN); + break; + case ETH_SS_TEST: + memcpy(buf, qede_tests_str_arr, + ETH_GSTRING_LEN * QEDE_ETHTOOL_TEST_MAX); + break; + default: + DP_VERBOSE(edev, QED_MSG_DEBUG, + "Unsupported stringset 0x%08x\n", stringset); + } +} + +static void qede_get_ethtool_stats_txq(struct qede_tx_queue *txq, u64 **buf) +{ + int i; + + for (i = 0; i < QEDE_NUM_TQSTATS; i++) { + **buf = *((u64 *)(((void *)txq) + qede_tqstats_arr[i].offset)); + (*buf)++; + } +} + +static void qede_get_ethtool_stats_rxq(struct qede_rx_queue *rxq, u64 **buf) +{ + int i; + + for (i = 0; i < QEDE_NUM_RQSTATS; i++) { + **buf = *((u64 *)(((void *)rxq) + qede_rqstats_arr[i].offset)); + (*buf)++; + } +} + +static void qede_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *buf) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qede_fastpath *fp; + int i; + + qede_fill_by_demand_stats(edev); + + /* Need to protect the access to the fastpath array */ + __qede_lock(edev); + + for (i = 0; i < QEDE_QUEUE_CNT(edev); i++) { + fp = &edev->fp_array[i]; + + if (fp->type & QEDE_FASTPATH_RX) + qede_get_ethtool_stats_rxq(fp->rxq, &buf); + + if (fp->type & QEDE_FASTPATH_XDP) + qede_get_ethtool_stats_txq(fp->xdp_tx, &buf); + + if (fp->type & QEDE_FASTPATH_TX) + qede_get_ethtool_stats_txq(fp->txq, &buf); + } + + for (i = 0; i < QEDE_NUM_STATS; i++) { + if (qede_is_irrelevant_stat(edev, i)) + continue; + *buf = *((u64 *)(((void *)&edev->stats) + + qede_stats_arr[i].offset)); + + buf++; + } + + __qede_unlock(edev); +} + +static int qede_get_sset_count(struct net_device *dev, int stringset) +{ + struct qede_dev *edev = netdev_priv(dev); + int num_stats = QEDE_NUM_STATS, i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < QEDE_NUM_STATS; i++) + if (qede_is_irrelevant_stat(edev, i)) + num_stats--; + + /* Account for the Regular Tx statistics */ + num_stats += QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS; + + /* Account for the Regular Rx statistics */ + num_stats += QEDE_RSS_COUNT(edev) * QEDE_NUM_RQSTATS; + + /* Account for XDP statistics [if needed] */ + if (edev->xdp_prog) + num_stats += QEDE_RSS_COUNT(edev) * QEDE_NUM_TQSTATS; + return num_stats; + + case ETH_SS_PRIV_FLAGS: + return QEDE_PRI_FLAG_LEN; + case ETH_SS_TEST: + if (!IS_VF(edev)) + return QEDE_ETHTOOL_TEST_MAX; + else + return 0; + default: + DP_VERBOSE(edev, QED_MSG_DEBUG, + "Unsupported stringset 0x%08x\n", stringset); + return -EINVAL; + } +} + +static u32 qede_get_priv_flags(struct net_device *dev) +{ + struct qede_dev *edev = netdev_priv(dev); + + return (!!(edev->dev_info.common.num_hwfns > 1)) << QEDE_PRI_FLAG_CMT; +} + +struct qede_link_mode_mapping { + u32 qed_link_mode; + u32 ethtool_link_mode; +}; + +static const struct 
qede_link_mode_mapping qed_lm_map[] = { + {QED_LM_FIBRE_BIT, ETHTOOL_LINK_MODE_FIBRE_BIT}, + {QED_LM_Autoneg_BIT, ETHTOOL_LINK_MODE_Autoneg_BIT}, + {QED_LM_Asym_Pause_BIT, ETHTOOL_LINK_MODE_Asym_Pause_BIT}, + {QED_LM_Pause_BIT, ETHTOOL_LINK_MODE_Pause_BIT}, + {QED_LM_1000baseT_Half_BIT, ETHTOOL_LINK_MODE_1000baseT_Half_BIT}, + {QED_LM_1000baseT_Full_BIT, ETHTOOL_LINK_MODE_1000baseT_Full_BIT}, + {QED_LM_10000baseKR_Full_BIT, ETHTOOL_LINK_MODE_10000baseKR_Full_BIT}, + {QED_LM_25000baseKR_Full_BIT, ETHTOOL_LINK_MODE_25000baseKR_Full_BIT}, + {QED_LM_40000baseLR4_Full_BIT, ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT}, + {QED_LM_50000baseKR2_Full_BIT, ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT}, + {QED_LM_100000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT}, +}; + +#define QEDE_DRV_TO_ETHTOOL_CAPS(caps, lk_ksettings, name) \ +{ \ + int i; \ + \ + for (i = 0; i < ARRAY_SIZE(qed_lm_map); i++) { \ + if ((caps) & (qed_lm_map[i].qed_link_mode)) \ + __set_bit(qed_lm_map[i].ethtool_link_mode,\ + lk_ksettings->link_modes.name); \ + } \ +} + +#define QEDE_ETHTOOL_TO_DRV_CAPS(caps, lk_ksettings, name) \ +{ \ + int i; \ + \ + for (i = 0; i < ARRAY_SIZE(qed_lm_map); i++) { \ + if (test_bit(qed_lm_map[i].ethtool_link_mode, \ + lk_ksettings->link_modes.name)) \ + caps |= qed_lm_map[i].qed_link_mode; \ + } \ +} + +static int qede_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct ethtool_link_settings *base = &cmd->base; + struct qede_dev *edev = netdev_priv(dev); + struct qed_link_output current_link; + + __qede_lock(edev); + + memset(¤t_link, 0, sizeof(current_link)); + edev->ops->common->get_link(edev->cdev, ¤t_link); + + ethtool_link_ksettings_zero_link_mode(cmd, supported); + QEDE_DRV_TO_ETHTOOL_CAPS(current_link.supported_caps, cmd, supported) + + ethtool_link_ksettings_zero_link_mode(cmd, advertising); + QEDE_DRV_TO_ETHTOOL_CAPS(current_link.advertised_caps, cmd, advertising) + + ethtool_link_ksettings_zero_link_mode(cmd, lp_advertising); + QEDE_DRV_TO_ETHTOOL_CAPS(current_link.lp_caps, cmd, lp_advertising) + + if ((edev->state == QEDE_STATE_OPEN) && (current_link.link_up)) { + base->speed = current_link.speed; + base->duplex = current_link.duplex; + } else { + base->speed = SPEED_UNKNOWN; + base->duplex = DUPLEX_UNKNOWN; + } + + __qede_unlock(edev); + + base->port = current_link.port; + base->autoneg = (current_link.autoneg) ? 
AUTONEG_ENABLE : + AUTONEG_DISABLE; + + return 0; +} + +static int qede_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) +{ + const struct ethtool_link_settings *base = &cmd->base; + struct qede_dev *edev = netdev_priv(dev); + struct qed_link_output current_link; + struct qed_link_params params; + + if (!edev->ops || !edev->ops->common->can_link_change(edev->cdev)) { + DP_INFO(edev, "Link settings are not allowed to be changed\n"); + return -EOPNOTSUPP; + } + memset(¤t_link, 0, sizeof(current_link)); + memset(¶ms, 0, sizeof(params)); + edev->ops->common->get_link(edev->cdev, ¤t_link); + + params.override_flags |= QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS; + params.override_flags |= QED_LINK_OVERRIDE_SPEED_AUTONEG; + if (base->autoneg == AUTONEG_ENABLE) { + if (!(current_link.supported_caps & QED_LM_Autoneg_BIT)) { + DP_INFO(edev, "Auto negotiation is not supported\n"); + return -EOPNOTSUPP; + } + + params.autoneg = true; + params.forced_speed = 0; + QEDE_ETHTOOL_TO_DRV_CAPS(params.adv_speeds, cmd, advertising) + } else { /* forced speed */ + params.override_flags |= QED_LINK_OVERRIDE_SPEED_FORCED_SPEED; + params.autoneg = false; + params.forced_speed = base->speed; + switch (base->speed) { + case SPEED_1000: + if (!(current_link.supported_caps & + QED_LM_1000baseT_Full_BIT)) { + DP_INFO(edev, "1G speed not supported\n"); + return -EINVAL; + } + params.adv_speeds = QED_LM_1000baseT_Full_BIT; + break; + case SPEED_10000: + if (!(current_link.supported_caps & + QED_LM_10000baseKR_Full_BIT)) { + DP_INFO(edev, "10G speed not supported\n"); + return -EINVAL; + } + params.adv_speeds = QED_LM_10000baseKR_Full_BIT; + break; + case SPEED_25000: + if (!(current_link.supported_caps & + QED_LM_25000baseKR_Full_BIT)) { + DP_INFO(edev, "25G speed not supported\n"); + return -EINVAL; + } + params.adv_speeds = QED_LM_25000baseKR_Full_BIT; + break; + case SPEED_40000: + if (!(current_link.supported_caps & + QED_LM_40000baseLR4_Full_BIT)) { + DP_INFO(edev, "40G speed not supported\n"); + return -EINVAL; + } + params.adv_speeds = QED_LM_40000baseLR4_Full_BIT; + break; + case SPEED_50000: + if (!(current_link.supported_caps & + QED_LM_50000baseKR2_Full_BIT)) { + DP_INFO(edev, "50G speed not supported\n"); + return -EINVAL; + } + params.adv_speeds = QED_LM_50000baseKR2_Full_BIT; + break; + case SPEED_100000: + if (!(current_link.supported_caps & + QED_LM_100000baseKR4_Full_BIT)) { + DP_INFO(edev, "100G speed not supported\n"); + return -EINVAL; + } + params.adv_speeds = QED_LM_100000baseKR4_Full_BIT; + break; + default: + DP_INFO(edev, "Unsupported speed %u\n", base->speed); + return -EINVAL; + } + } + + params.link_up = true; + edev->ops->common->set_link(edev->cdev, ¶ms); + + return 0; +} + +static void qede_get_drvinfo(struct net_device *ndev, + struct ethtool_drvinfo *info) +{ + char mfw[ETHTOOL_FWVERS_LEN], storm[ETHTOOL_FWVERS_LEN]; + struct qede_dev *edev = netdev_priv(ndev); + + strlcpy(info->driver, "qede", sizeof(info->driver)); + strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version)); + + snprintf(storm, ETHTOOL_FWVERS_LEN, "%d.%d.%d.%d", + edev->dev_info.common.fw_major, + edev->dev_info.common.fw_minor, + edev->dev_info.common.fw_rev, + edev->dev_info.common.fw_eng); + + snprintf(mfw, ETHTOOL_FWVERS_LEN, "%d.%d.%d.%d", + (edev->dev_info.common.mfw_rev >> 24) & 0xFF, + (edev->dev_info.common.mfw_rev >> 16) & 0xFF, + (edev->dev_info.common.mfw_rev >> 8) & 0xFF, + edev->dev_info.common.mfw_rev & 0xFF); + + if ((strlen(storm) + strlen(mfw) + strlen("mfw storm 
")) < + sizeof(info->fw_version)) { + snprintf(info->fw_version, sizeof(info->fw_version), + "mfw %s storm %s", mfw, storm); + } else { + snprintf(info->fw_version, sizeof(info->fw_version), + "%s %s", mfw, storm); + } + + strlcpy(info->bus_info, pci_name(edev->pdev), sizeof(info->bus_info)); +} + +static void qede_get_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) +{ + struct qede_dev *edev = netdev_priv(ndev); + + if (edev->dev_info.common.wol_support) { + wol->supported = WAKE_MAGIC; + wol->wolopts = edev->wol_enabled ? WAKE_MAGIC : 0; + } +} + +static int qede_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) +{ + struct qede_dev *edev = netdev_priv(ndev); + bool wol_requested; + int rc; + + if (wol->wolopts & ~WAKE_MAGIC) { + DP_INFO(edev, + "Can't support WoL options other than magic-packet\n"); + return -EINVAL; + } + + wol_requested = !!(wol->wolopts & WAKE_MAGIC); + if (wol_requested == edev->wol_enabled) + return 0; + + /* Need to actually change configuration */ + if (!edev->dev_info.common.wol_support) { + DP_INFO(edev, "Device doesn't support WoL\n"); + return -EINVAL; + } + + rc = edev->ops->common->update_wol(edev->cdev, wol_requested); + if (!rc) + edev->wol_enabled = wol_requested; + + return rc; +} + +static u32 qede_get_msglevel(struct net_device *ndev) +{ + struct qede_dev *edev = netdev_priv(ndev); + + return ((u32)edev->dp_level << QED_LOG_LEVEL_SHIFT) | edev->dp_module; +} + +static void qede_set_msglevel(struct net_device *ndev, u32 level) +{ + struct qede_dev *edev = netdev_priv(ndev); + u32 dp_module = 0; + u8 dp_level = 0; + + qede_config_debug(level, &dp_module, &dp_level); + + edev->dp_level = dp_level; + edev->dp_module = dp_module; + edev->ops->common->update_msglvl(edev->cdev, + dp_module, dp_level); +} + +static int qede_nway_reset(struct net_device *dev) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qed_link_output current_link; + struct qed_link_params link_params; + + if (!edev->ops || !edev->ops->common->can_link_change(edev->cdev)) { + DP_INFO(edev, "Link settings are not allowed to be changed\n"); + return -EOPNOTSUPP; + } + + if (!netif_running(dev)) + return 0; + + memset(¤t_link, 0, sizeof(current_link)); + edev->ops->common->get_link(edev->cdev, ¤t_link); + if (!current_link.link_up) + return 0; + + /* Toggle the link */ + memset(&link_params, 0, sizeof(link_params)); + link_params.link_up = false; + edev->ops->common->set_link(edev->cdev, &link_params); + link_params.link_up = true; + edev->ops->common->set_link(edev->cdev, &link_params); + + return 0; +} + +static u32 qede_get_link(struct net_device *dev) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qed_link_output current_link; + + memset(¤t_link, 0, sizeof(current_link)); + edev->ops->common->get_link(edev->cdev, ¤t_link); + + return current_link.link_up; +} + +static int qede_flash_device(struct net_device *dev, + struct ethtool_flash *flash) +{ + struct qede_dev *edev = netdev_priv(dev); + + return edev->ops->common->nvm_flash(edev->cdev, flash->data); +} + +static int qede_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + void *rx_handle = NULL, *tx_handle = NULL; + struct qede_dev *edev = netdev_priv(dev); + u16 rx_coal, tx_coal, i, rc = 0; + struct qede_fastpath *fp; + + rx_coal = QED_DEFAULT_RX_USECS; + tx_coal = QED_DEFAULT_TX_USECS; + + memset(coal, 0, sizeof(struct ethtool_coalesce)); + + __qede_lock(edev); + if (edev->state == QEDE_STATE_OPEN) { + for_each_queue(i) { + fp = &edev->fp_array[i]; + + if (fp->type & 
QEDE_FASTPATH_RX) { + rx_handle = fp->rxq->handle; + break; + } + } + + rc = edev->ops->get_coalesce(edev->cdev, &rx_coal, rx_handle); + if (rc) { + DP_INFO(edev, "Read Rx coalesce error\n"); + goto out; + } + + for_each_queue(i) { + fp = &edev->fp_array[i]; + if (fp->type & QEDE_FASTPATH_TX) { + tx_handle = fp->txq->handle; + break; + } + } + + rc = edev->ops->get_coalesce(edev->cdev, &tx_coal, tx_handle); + if (rc) + DP_INFO(edev, "Read Tx coalesce error\n"); + } + +out: + __qede_unlock(edev); + + coal->rx_coalesce_usecs = rx_coal; + coal->tx_coalesce_usecs = tx_coal; + + return rc; +} + +static int qede_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qede_fastpath *fp; + int i, rc = 0; + u16 rxc, txc; + + if (!netif_running(dev)) { + DP_INFO(edev, "Interface is down\n"); + return -EINVAL; + } + + if (coal->rx_coalesce_usecs > QED_COALESCE_MAX || + coal->tx_coalesce_usecs > QED_COALESCE_MAX) { + DP_INFO(edev, + "Can't support requested %s coalesce value [max supported value %d]\n", + coal->rx_coalesce_usecs > QED_COALESCE_MAX ? "rx" : + "tx", QED_COALESCE_MAX); + return -EINVAL; + } + + rxc = (u16)coal->rx_coalesce_usecs; + txc = (u16)coal->tx_coalesce_usecs; + for_each_queue(i) { + fp = &edev->fp_array[i]; + + if (edev->fp_array[i].type & QEDE_FASTPATH_RX) { + rc = edev->ops->common->set_coalesce(edev->cdev, + rxc, 0, + fp->rxq->handle); + if (rc) { + DP_INFO(edev, + "Set RX coalesce error, rc = %d\n", rc); + return rc; + } + } + + if (edev->fp_array[i].type & QEDE_FASTPATH_TX) { + rc = edev->ops->common->set_coalesce(edev->cdev, + 0, txc, + fp->txq->handle); + if (rc) { + DP_INFO(edev, + "Set TX coalesce error, rc = %d\n", rc); + return rc; + } + } + } + + return rc; +} + +static void qede_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering) +{ + struct qede_dev *edev = netdev_priv(dev); + + ering->rx_max_pending = NUM_RX_BDS_MAX; + ering->rx_pending = edev->q_num_rx_buffers; + ering->tx_max_pending = NUM_TX_BDS_MAX; + ering->tx_pending = edev->q_num_tx_buffers; +} + +static int qede_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering) +{ + struct qede_dev *edev = netdev_priv(dev); + + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "Set ring params command parameters: rx_pending = %d, tx_pending = %d\n", + ering->rx_pending, ering->tx_pending); + + /* Validate legality of configuration */ + if (ering->rx_pending > NUM_RX_BDS_MAX || + ering->rx_pending < NUM_RX_BDS_MIN || + ering->tx_pending > NUM_TX_BDS_MAX || + ering->tx_pending < NUM_TX_BDS_MIN) { + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "Can only support Rx Buffer size [0%08x,...,0x%08x] and Tx Buffer size [0x%08x,...,0x%08x]\n", + NUM_RX_BDS_MIN, NUM_RX_BDS_MAX, + NUM_TX_BDS_MIN, NUM_TX_BDS_MAX); + return -EINVAL; + } + + /* Change ring size and re-load */ + edev->q_num_rx_buffers = ering->rx_pending; + edev->q_num_tx_buffers = ering->tx_pending; + + qede_reload(edev, NULL, false); + + return 0; +} + +static void qede_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qed_link_output current_link; + + memset(¤t_link, 0, sizeof(current_link)); + edev->ops->common->get_link(edev->cdev, ¤t_link); + + if (current_link.pause_config & QED_LINK_PAUSE_AUTONEG_ENABLE) + epause->autoneg = true; + if (current_link.pause_config & QED_LINK_PAUSE_RX_ENABLE) + epause->rx_pause = true; + if (current_link.pause_config & 
QED_LINK_PAUSE_TX_ENABLE) + epause->tx_pause = true; + + DP_VERBOSE(edev, QED_MSG_DEBUG, + "ethtool_pauseparam: cmd %d autoneg %d rx_pause %d tx_pause %d\n", + epause->cmd, epause->autoneg, epause->rx_pause, + epause->tx_pause); +} + +static int qede_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *epause) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qed_link_params params; + struct qed_link_output current_link; + + if (!edev->ops || !edev->ops->common->can_link_change(edev->cdev)) { + DP_INFO(edev, + "Pause settings are not allowed to be changed\n"); + return -EOPNOTSUPP; + } + + memset(¤t_link, 0, sizeof(current_link)); + edev->ops->common->get_link(edev->cdev, ¤t_link); + + memset(¶ms, 0, sizeof(params)); + params.override_flags |= QED_LINK_OVERRIDE_PAUSE_CONFIG; + if (epause->autoneg) { + if (!(current_link.supported_caps & QED_LM_Autoneg_BIT)) { + DP_INFO(edev, "autoneg not supported\n"); + return -EINVAL; + } + params.pause_config |= QED_LINK_PAUSE_AUTONEG_ENABLE; + } + if (epause->rx_pause) + params.pause_config |= QED_LINK_PAUSE_RX_ENABLE; + if (epause->tx_pause) + params.pause_config |= QED_LINK_PAUSE_TX_ENABLE; + + params.link_up = true; + edev->ops->common->set_link(edev->cdev, ¶ms); + + return 0; +} + +static void qede_get_regs(struct net_device *ndev, + struct ethtool_regs *regs, void *buffer) +{ + struct qede_dev *edev = netdev_priv(ndev); + + regs->version = 0; + memset(buffer, 0, regs->len); + + if (edev->ops && edev->ops->common) + edev->ops->common->dbg_all_data(edev->cdev, buffer); +} + +static int qede_get_regs_len(struct net_device *ndev) +{ + struct qede_dev *edev = netdev_priv(ndev); + + if (edev->ops && edev->ops->common) + return edev->ops->common->dbg_all_data_size(edev->cdev); + else + return -EINVAL; +} + +static void qede_update_mtu(struct qede_dev *edev, + struct qede_reload_args *args) +{ + edev->ndev->mtu = args->u.mtu; +} + +/* Netdevice NDOs */ +int qede_change_mtu(struct net_device *ndev, int new_mtu) +{ + struct qede_dev *edev = netdev_priv(ndev); + struct qede_reload_args args; + + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "Configuring MTU size of %d\n", new_mtu); + + if (new_mtu > PAGE_SIZE) + ndev->features &= ~NETIF_F_GRO_HW; + + /* Set the mtu field and re-start the interface if needed */ + args.u.mtu = new_mtu; + args.func = &qede_update_mtu; + qede_reload(edev, &args, false); + + edev->ops->common->update_mtu(edev->cdev, new_mtu); + + return 0; +} + +static void qede_get_channels(struct net_device *dev, + struct ethtool_channels *channels) +{ + struct qede_dev *edev = netdev_priv(dev); + + channels->max_combined = QEDE_MAX_RSS_CNT(edev); + channels->max_rx = QEDE_MAX_RSS_CNT(edev); + channels->max_tx = QEDE_MAX_RSS_CNT(edev); + channels->combined_count = QEDE_QUEUE_CNT(edev) - edev->fp_num_tx - + edev->fp_num_rx; + channels->tx_count = edev->fp_num_tx; + channels->rx_count = edev->fp_num_rx; +} + +static int qede_set_channels(struct net_device *dev, + struct ethtool_channels *channels) +{ + struct qede_dev *edev = netdev_priv(dev); + u32 count; + + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "set-channels command parameters: rx = %d, tx = %d, other = %d, combined = %d\n", + channels->rx_count, channels->tx_count, + channels->other_count, channels->combined_count); + + count = channels->rx_count + channels->tx_count + + channels->combined_count; + + /* We don't support `other' channels */ + if (channels->other_count) { + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "command 
parameters not supported\n"); + return -EINVAL; + } + + if (!(channels->combined_count || (channels->rx_count && + channels->tx_count))) { + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "need to request at least one transmit and one receive channel\n"); + return -EINVAL; + } + + if (count > QEDE_MAX_RSS_CNT(edev)) { + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "requested channels = %d max supported channels = %d\n", + count, QEDE_MAX_RSS_CNT(edev)); + return -EINVAL; + } + + /* Check if there was a change in the active parameters */ + if ((count == QEDE_QUEUE_CNT(edev)) && + (channels->tx_count == edev->fp_num_tx) && + (channels->rx_count == edev->fp_num_rx)) { + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "No change in active parameters\n"); + return 0; + } + + /* We need the number of queues to be divisible between the hwfns */ + if ((count % edev->dev_info.common.num_hwfns) || + (channels->tx_count % edev->dev_info.common.num_hwfns) || + (channels->rx_count % edev->dev_info.common.num_hwfns)) { + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "Number of channels must be divisible by %04x\n", + edev->dev_info.common.num_hwfns); + return -EINVAL; + } + + /* Set number of queues and reload if necessary */ + edev->req_queues = count; + edev->req_num_tx = channels->tx_count; + edev->req_num_rx = channels->rx_count; + /* Reset the indirection table if rx queue count is updated */ + if ((edev->req_queues - edev->req_num_tx) != QEDE_RSS_COUNT(edev)) { + edev->rss_params_inited &= ~QEDE_RSS_INDIR_INITED; + memset(edev->rss_ind_table, 0, sizeof(edev->rss_ind_table)); + } + + qede_reload(edev, NULL, false); + + return 0; +} + +static int qede_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *info) +{ + struct qede_dev *edev = netdev_priv(dev); + + return qede_ptp_get_ts_info(edev, info); +} + +static int qede_set_phys_id(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + struct qede_dev *edev = netdev_priv(dev); + u8 led_state = 0; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + return 1; /* cycle on/off once per second */ + + case ETHTOOL_ID_ON: + led_state = QED_LED_MODE_ON; + break; + + case ETHTOOL_ID_OFF: + led_state = QED_LED_MODE_OFF; + break; + + case ETHTOOL_ID_INACTIVE: + led_state = QED_LED_MODE_RESTORE; + break; + } + + edev->ops->common->set_led(edev->cdev, led_state); + + return 0; +} + +static int qede_get_rss_flags(struct qede_dev *edev, struct ethtool_rxnfc *info) +{ + info->data = RXH_IP_SRC | RXH_IP_DST; + + switch (info->flow_type) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + case UDP_V4_FLOW: + if (edev->rss_caps & QED_RSS_IPV4_UDP) + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + case UDP_V6_FLOW: + if (edev->rss_caps & QED_RSS_IPV6_UDP) + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + break; + case IPV4_FLOW: + case IPV6_FLOW: + break; + default: + info->data = 0; + break; + } + + return 0; +} + +static int qede_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, + u32 *rule_locs) +{ + struct qede_dev *edev = netdev_priv(dev); + int rc = 0; + + switch (info->cmd) { + case ETHTOOL_GRXRINGS: + info->data = QEDE_RSS_COUNT(edev); + break; + case ETHTOOL_GRXFH: + rc = qede_get_rss_flags(edev, info); + break; + case ETHTOOL_GRXCLSRLCNT: + info->rule_cnt = qede_get_arfs_filter_count(edev); + info->data = QEDE_RFS_MAX_FLTR; + break; + case ETHTOOL_GRXCLSRULE: + rc = qede_get_cls_rule_entry(edev, info); + break; + case ETHTOOL_GRXCLSRLALL: + rc = 
qede_get_cls_rule_all(edev, info, rule_locs); + break; + default: + DP_ERR(edev, "Command parameters not supported\n"); + rc = -EOPNOTSUPP; + } + + return rc; +} + +static int qede_set_rss_flags(struct qede_dev *edev, struct ethtool_rxnfc *info) +{ + struct qed_update_vport_params *vport_update_params; + u8 set_caps = 0, clr_caps = 0; + int rc = 0; + + DP_VERBOSE(edev, QED_MSG_DEBUG, + "Set rss flags command parameters: flow type = %d, data = %llu\n", + info->flow_type, info->data); + + switch (info->flow_type) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + /* For TCP only 4-tuple hash is supported */ + if (info->data ^ (RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)) { + DP_INFO(edev, "Command parameters not supported\n"); + return -EINVAL; + } + return 0; + case UDP_V4_FLOW: + /* For UDP either 2-tuple hash or 4-tuple hash is supported */ + if (info->data == (RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)) { + set_caps = QED_RSS_IPV4_UDP; + DP_VERBOSE(edev, QED_MSG_DEBUG, + "UDP 4-tuple enabled\n"); + } else if (info->data == (RXH_IP_SRC | RXH_IP_DST)) { + clr_caps = QED_RSS_IPV4_UDP; + DP_VERBOSE(edev, QED_MSG_DEBUG, + "UDP 4-tuple disabled\n"); + } else { + return -EINVAL; + } + break; + case UDP_V6_FLOW: + /* For UDP either 2-tuple hash or 4-tuple hash is supported */ + if (info->data == (RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)) { + set_caps = QED_RSS_IPV6_UDP; + DP_VERBOSE(edev, QED_MSG_DEBUG, + "UDP 4-tuple enabled\n"); + } else if (info->data == (RXH_IP_SRC | RXH_IP_DST)) { + clr_caps = QED_RSS_IPV6_UDP; + DP_VERBOSE(edev, QED_MSG_DEBUG, + "UDP 4-tuple disabled\n"); + } else { + return -EINVAL; + } + break; + case IPV4_FLOW: + case IPV6_FLOW: + /* For IP only 2-tuple hash is supported */ + if (info->data ^ (RXH_IP_SRC | RXH_IP_DST)) { + DP_INFO(edev, "Command parameters not supported\n"); + return -EINVAL; + } + return 0; + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + case AH_V4_FLOW: + case ESP_V4_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IP_USER_FLOW: + case ETHER_FLOW: + /* RSS is not supported for these protocols */ + if (info->data) { + DP_INFO(edev, "Command parameters not supported\n"); + return -EINVAL; + } + return 0; + default: + return -EINVAL; + } + + /* No action is needed if there is no change in the rss capability */ + if (edev->rss_caps == ((edev->rss_caps & ~clr_caps) | set_caps)) + return 0; + + /* Update internal configuration */ + edev->rss_caps = ((edev->rss_caps & ~clr_caps) | set_caps); + edev->rss_params_inited |= QEDE_RSS_CAPS_INITED; + + /* Re-configure if possible */ + __qede_lock(edev); + if (edev->state == QEDE_STATE_OPEN) { + vport_update_params = vzalloc(sizeof(*vport_update_params)); + if (!vport_update_params) { + __qede_unlock(edev); + return -ENOMEM; + } + qede_fill_rss_params(edev, &vport_update_params->rss_params, + &vport_update_params->update_rss_flg); + rc = edev->ops->vport_update(edev->cdev, vport_update_params); + vfree(vport_update_params); + } + __qede_unlock(edev); + + return rc; +} + +static int qede_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info) +{ + struct qede_dev *edev = netdev_priv(dev); + int rc; + + switch (info->cmd) { + case ETHTOOL_SRXFH: + rc = qede_set_rss_flags(edev, info); + break; + case ETHTOOL_SRXCLSRLINS: + rc = qede_add_cls_rule(edev, info); + break; + case ETHTOOL_SRXCLSRLDEL: + rc = qede_del_cls_rule(edev, info); + break; + default: + DP_INFO(edev, "Command parameters not supported\n"); + rc = -EOPNOTSUPP; + } + + 
return rc; +} + +static u32 qede_get_rxfh_indir_size(struct net_device *dev) +{ + return QED_RSS_IND_TABLE_SIZE; +} + +static u32 qede_get_rxfh_key_size(struct net_device *dev) +{ + struct qede_dev *edev = netdev_priv(dev); + + return sizeof(edev->rss_key); +} + +static int qede_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc) +{ + struct qede_dev *edev = netdev_priv(dev); + int i; + + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + + if (!indir) + return 0; + + for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) + indir[i] = edev->rss_ind_table[i]; + + if (key) + memcpy(key, edev->rss_key, qede_get_rxfh_key_size(dev)); + + return 0; +} + +static int qede_set_rxfh(struct net_device *dev, const u32 *indir, + const u8 *key, const u8 hfunc) +{ + struct qed_update_vport_params *vport_update_params; + struct qede_dev *edev = netdev_priv(dev); + int i, rc = 0; + + if (edev->dev_info.common.num_hwfns > 1) { + DP_INFO(edev, + "RSS configuration is not supported for 100G devices\n"); + return -EOPNOTSUPP; + } + + if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + return -EOPNOTSUPP; + + if (!indir && !key) + return 0; + + if (indir) { + for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) + edev->rss_ind_table[i] = indir[i]; + edev->rss_params_inited |= QEDE_RSS_INDIR_INITED; + } + + if (key) { + memcpy(&edev->rss_key, key, qede_get_rxfh_key_size(dev)); + edev->rss_params_inited |= QEDE_RSS_KEY_INITED; + } + + __qede_lock(edev); + if (edev->state == QEDE_STATE_OPEN) { + vport_update_params = vzalloc(sizeof(*vport_update_params)); + if (!vport_update_params) { + __qede_unlock(edev); + return -ENOMEM; + } + qede_fill_rss_params(edev, &vport_update_params->rss_params, + &vport_update_params->update_rss_flg); + rc = edev->ops->vport_update(edev->cdev, vport_update_params); + vfree(vport_update_params); + } + __qede_unlock(edev); + + return rc; +} + +/* This function enables the interrupt generation and the NAPI on the device */ +static void qede_netif_start(struct qede_dev *edev) +{ + int i; + + if (!netif_running(edev->ndev)) + return; + + for_each_queue(i) { + /* Update and reenable interrupts */ + qed_sb_ack(edev->fp_array[i].sb_info, IGU_INT_ENABLE, 1); + napi_enable(&edev->fp_array[i].napi); + } +} + +/* This function disables the NAPI and the interrupt generation on the device */ +static void qede_netif_stop(struct qede_dev *edev) +{ + int i; + + for_each_queue(i) { + napi_disable(&edev->fp_array[i].napi); + /* Disable interrupts */ + qed_sb_ack(edev->fp_array[i].sb_info, IGU_INT_DISABLE, 0); + } +} + +static int qede_selftest_transmit_traffic(struct qede_dev *edev, + struct sk_buff *skb) +{ + struct qede_tx_queue *txq = NULL; + struct eth_tx_1st_bd *first_bd; + dma_addr_t mapping; + int i, idx; + u16 val; + + for_each_queue(i) { + if (edev->fp_array[i].type & QEDE_FASTPATH_TX) { + txq = edev->fp_array[i].txq; + break; + } + } + + if (!txq) { + DP_NOTICE(edev, "Tx path is not available\n"); + return -1; + } + + /* Fill the entry in the SW ring and the BDs in the FW ring */ + idx = txq->sw_tx_prod; + txq->sw_tx_ring.skbs[idx].skb = skb; + first_bd = qed_chain_produce(&txq->tx_pbl); + memset(first_bd, 0, sizeof(*first_bd)); + val = 1 << ETH_TX_1ST_BD_FLAGS_START_BD_SHIFT; + first_bd->data.bd_flags.bitfields = val; + val = skb->len & ETH_TX_DATA_1ST_BD_PKT_LEN_MASK; + val = val << ETH_TX_DATA_1ST_BD_PKT_LEN_SHIFT; + first_bd->data.bitfields |= cpu_to_le16(val); + + /* Map skb linear data for DMA and set in the first BD */ + mapping = dma_map_single(&edev->pdev->dev, skb->data, + 
skb_headlen(skb), DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(&edev->pdev->dev, mapping))) { + DP_NOTICE(edev, "SKB mapping failed\n"); + return -ENOMEM; + } + BD_SET_UNMAP_ADDR_LEN(first_bd, mapping, skb_headlen(skb)); + + /* update the first BD with the actual num BDs */ + first_bd->data.nbds = 1; + txq->sw_tx_prod = (txq->sw_tx_prod + 1) % txq->num_tx_buffers; + /* 'next page' entries are counted in the producer value */ + val = qed_chain_get_prod_idx(&txq->tx_pbl); + txq->tx_db.data.bd_prod = cpu_to_le16(val); + + /* wmb makes sure that the BDs data is updated before updating the + * producer, otherwise FW may read old data from the BDs. + */ + wmb(); + barrier(); + writel(txq->tx_db.raw, txq->doorbell_addr); + + /* mmiowb is needed to synchronize doorbell writes from more than one + * processor. It guarantees that the write arrives to the device before + * the queue lock is released and another start_xmit is called (possibly + * on another CPU). Without this barrier, the next doorbell can bypass + * this doorbell. This is applicable to IA64/Altix systems. + */ + mmiowb(); + + for (i = 0; i < QEDE_SELFTEST_POLL_COUNT; i++) { + if (qede_txq_has_work(txq)) + break; + usleep_range(100, 200); + } + + if (!qede_txq_has_work(txq)) { + DP_NOTICE(edev, "Tx completion didn't happen\n"); + return -1; + } + + first_bd = (struct eth_tx_1st_bd *)qed_chain_consume(&txq->tx_pbl); + dma_unmap_single(&edev->pdev->dev, BD_UNMAP_ADDR(first_bd), + BD_UNMAP_LEN(first_bd), DMA_TO_DEVICE); + txq->sw_tx_cons = (txq->sw_tx_cons + 1) % txq->num_tx_buffers; + txq->sw_tx_ring.skbs[idx].skb = NULL; + + return 0; +} + +static int qede_selftest_receive_traffic(struct qede_dev *edev) +{ + u16 hw_comp_cons, sw_comp_cons, sw_rx_index, len; + struct eth_fast_path_rx_reg_cqe *fp_cqe; + struct qede_rx_queue *rxq = NULL; + struct sw_rx_data *sw_rx_data; + union eth_rx_cqe *cqe; + int i, iter, rc = 0; + u8 *data_ptr; + + for_each_queue(i) { + if (edev->fp_array[i].type & QEDE_FASTPATH_RX) { + rxq = edev->fp_array[i].rxq; + break; + } + } + + if (!rxq) { + DP_NOTICE(edev, "Rx path is not available\n"); + return -1; + } + + /* The packet is expected to receive on rx-queue 0 even though RSS is + * enabled. This is because the queue 0 is configured as the default + * queue and that the loopback traffic is not IP. + */ + for (iter = 0; iter < QEDE_SELFTEST_POLL_COUNT; iter++) { + if (!qede_has_rx_work(rxq)) { + usleep_range(100, 200); + continue; + } + + hw_comp_cons = le16_to_cpu(*rxq->hw_cons_ptr); + sw_comp_cons = qed_chain_get_cons_idx(&rxq->rx_comp_ring); + + /* Memory barrier to prevent the CPU from doing speculative + * reads of CQE/BD before reading hw_comp_cons. If the CQE is + * read before it is written by FW, then FW writes CQE and SB, + * and then the CPU reads the hw_comp_cons, it will use an old + * CQE. 
+ */ + rmb(); + + /* Get the CQE from the completion ring */ + cqe = (union eth_rx_cqe *)qed_chain_consume(&rxq->rx_comp_ring); + + /* Get the data from the SW ring */ + sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS_MAX; + sw_rx_data = &rxq->sw_rx_ring[sw_rx_index]; + fp_cqe = &cqe->fast_path_regular; + len = le16_to_cpu(fp_cqe->len_on_first_bd); + data_ptr = (u8 *)(page_address(sw_rx_data->data) + + fp_cqe->placement_offset + + sw_rx_data->page_offset); + if (ether_addr_equal(data_ptr, edev->ndev->dev_addr) && + ether_addr_equal(data_ptr + ETH_ALEN, + edev->ndev->dev_addr)) { + for (i = ETH_HLEN; i < len; i++) + if (data_ptr[i] != (unsigned char)(i & 0xff)) { + rc = -1; + break; + } + + qede_recycle_rx_bd_ring(rxq, 1); + qed_chain_recycle_consumed(&rxq->rx_comp_ring); + break; + } + + DP_INFO(edev, "Not the transmitted packet\n"); + qede_recycle_rx_bd_ring(rxq, 1); + qed_chain_recycle_consumed(&rxq->rx_comp_ring); + } + + if (iter == QEDE_SELFTEST_POLL_COUNT) { + DP_NOTICE(edev, "Failed to receive the traffic\n"); + return -1; + } + + qede_update_rx_prod(edev, rxq); + + return rc; +} + +static int qede_selftest_run_loopback(struct qede_dev *edev, u32 loopback_mode) +{ + struct qed_link_params link_params; + struct sk_buff *skb = NULL; + int rc = 0, i; + u32 pkt_size; + u8 *packet; + + if (!netif_running(edev->ndev)) { + DP_NOTICE(edev, "Interface is down\n"); + return -EINVAL; + } + + qede_netif_stop(edev); + + /* Bring up the link in Loopback mode */ + memset(&link_params, 0, sizeof(link_params)); + link_params.link_up = true; + link_params.override_flags = QED_LINK_OVERRIDE_LOOPBACK_MODE; + link_params.loopback_mode = loopback_mode; + edev->ops->common->set_link(edev->cdev, &link_params); + + /* Wait for loopback configuration to apply */ + msleep_interruptible(500); + + /* prepare the loopback packet */ + pkt_size = edev->ndev->mtu + ETH_HLEN; + + skb = netdev_alloc_skb(edev->ndev, pkt_size); + if (!skb) { + DP_INFO(edev, "Can't allocate skb\n"); + rc = -ENOMEM; + goto test_loopback_exit; + } + packet = skb_put(skb, pkt_size); + ether_addr_copy(packet, edev->ndev->dev_addr); + ether_addr_copy(packet + ETH_ALEN, edev->ndev->dev_addr); + memset(packet + (2 * ETH_ALEN), 0x77, (ETH_HLEN - (2 * ETH_ALEN))); + for (i = ETH_HLEN; i < pkt_size; i++) + packet[i] = (unsigned char)(i & 0xff); + + rc = qede_selftest_transmit_traffic(edev, skb); + if (rc) + goto test_loopback_exit; + + rc = qede_selftest_receive_traffic(edev); + if (rc) + goto test_loopback_exit; + + DP_VERBOSE(edev, NETIF_MSG_RX_STATUS, "Loopback test successful\n"); + +test_loopback_exit: + dev_kfree_skb(skb); + + /* Bring up the link in Normal mode */ + memset(&link_params, 0, sizeof(link_params)); + link_params.link_up = true; + link_params.override_flags = QED_LINK_OVERRIDE_LOOPBACK_MODE; + link_params.loopback_mode = QED_LINK_LOOPBACK_NONE; + edev->ops->common->set_link(edev->cdev, &link_params); + + /* Wait for loopback configuration to apply */ + msleep_interruptible(500); + + qede_netif_start(edev); + + return rc; +} + +static void qede_self_test(struct net_device *dev, + struct ethtool_test *etest, u64 *buf) +{ + struct qede_dev *edev = netdev_priv(dev); + + DP_VERBOSE(edev, QED_MSG_DEBUG, + "Self-test command parameters: offline = %d, external_lb = %d\n", + (etest->flags & ETH_TEST_FL_OFFLINE), + (etest->flags & ETH_TEST_FL_EXTERNAL_LB) >> 2); + + memset(buf, 0, sizeof(u64) * QEDE_ETHTOOL_TEST_MAX); + + if (etest->flags & ETH_TEST_FL_OFFLINE) { + if (qede_selftest_run_loopback(edev, + QED_LINK_LOOPBACK_INT_PHY)) { + 
buf[QEDE_ETHTOOL_INT_LOOPBACK] = 1; + etest->flags |= ETH_TEST_FL_FAILED; + } + } + + if (edev->ops->common->selftest->selftest_interrupt(edev->cdev)) { + buf[QEDE_ETHTOOL_INTERRUPT_TEST] = 1; + etest->flags |= ETH_TEST_FL_FAILED; + } + + if (edev->ops->common->selftest->selftest_memory(edev->cdev)) { + buf[QEDE_ETHTOOL_MEMORY_TEST] = 1; + etest->flags |= ETH_TEST_FL_FAILED; + } + + if (edev->ops->common->selftest->selftest_register(edev->cdev)) { + buf[QEDE_ETHTOOL_REGISTER_TEST] = 1; + etest->flags |= ETH_TEST_FL_FAILED; + } + + if (edev->ops->common->selftest->selftest_clock(edev->cdev)) { + buf[QEDE_ETHTOOL_CLOCK_TEST] = 1; + etest->flags |= ETH_TEST_FL_FAILED; + } + + if (edev->ops->common->selftest->selftest_nvram(edev->cdev)) { + buf[QEDE_ETHTOOL_NVRAM_TEST] = 1; + etest->flags |= ETH_TEST_FL_FAILED; + } +} + +static int qede_set_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct qede_dev *edev = netdev_priv(dev); + u32 val; + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + val = *(u32 *)data; + if (val < QEDE_MIN_PKT_LEN || val > QEDE_RX_HDR_SIZE) { + DP_VERBOSE(edev, QED_MSG_DEBUG, + "Invalid rx copy break value, range is [%u, %u]", + QEDE_MIN_PKT_LEN, QEDE_RX_HDR_SIZE); + return -EINVAL; + } + + edev->rx_copybreak = *(u32 *)data; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int qede_get_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, void *data) +{ + struct qede_dev *edev = netdev_priv(dev); + + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + *(u32 *)data = edev->rx_copybreak; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int qede_get_eee(struct net_device *dev, struct ethtool_eee *edata) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qed_link_output current_link; + + memset(&current_link, 0, sizeof(current_link)); + edev->ops->common->get_link(edev->cdev, &current_link); + + if (!current_link.eee_supported) { + DP_INFO(edev, "EEE is not supported\n"); + return -EOPNOTSUPP; + } + + if (current_link.eee.adv_caps & QED_EEE_1G_ADV) + edata->advertised = ADVERTISED_1000baseT_Full; + if (current_link.eee.adv_caps & QED_EEE_10G_ADV) + edata->advertised |= ADVERTISED_10000baseT_Full; + if (current_link.sup_caps & QED_EEE_1G_ADV) + edata->supported = ADVERTISED_1000baseT_Full; + if (current_link.sup_caps & QED_EEE_10G_ADV) + edata->supported |= ADVERTISED_10000baseT_Full; + if (current_link.eee.lp_adv_caps & QED_EEE_1G_ADV) + edata->lp_advertised = ADVERTISED_1000baseT_Full; + if (current_link.eee.lp_adv_caps & QED_EEE_10G_ADV) + edata->lp_advertised |= ADVERTISED_10000baseT_Full; + + edata->tx_lpi_timer = current_link.eee.tx_lpi_timer; + edata->eee_enabled = current_link.eee.enable; + edata->tx_lpi_enabled = current_link.eee.tx_lpi_enable; + edata->eee_active = current_link.eee_active; + + return 0; +} + +static int qede_set_eee(struct net_device *dev, struct ethtool_eee *edata) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qed_link_output current_link; + struct qed_link_params params; + + if (!edev->ops->common->can_link_change(edev->cdev)) { + DP_INFO(edev, "Link settings are not allowed to be changed\n"); + return -EOPNOTSUPP; + } + + memset(&current_link, 0, sizeof(current_link)); + edev->ops->common->get_link(edev->cdev, &current_link); + + if (!current_link.eee_supported) { + DP_INFO(edev, "EEE is not supported\n"); + return -EOPNOTSUPP; + } + + memset(&params, 0, sizeof(params)); + params.override_flags |= QED_LINK_OVERRIDE_EEE_CONFIG; + + if
(!(edata->advertised & (ADVERTISED_1000baseT_Full | + ADVERTISED_10000baseT_Full)) || + ((edata->advertised & (ADVERTISED_1000baseT_Full | + ADVERTISED_10000baseT_Full)) != + edata->advertised)) { + DP_VERBOSE(edev, QED_MSG_DEBUG, + "Invalid advertised capabilities %d\n", + edata->advertised); + return -EINVAL; + } + + if (edata->advertised & ADVERTISED_1000baseT_Full) + params.eee.adv_caps = QED_EEE_1G_ADV; + if (edata->advertised & ADVERTISED_10000baseT_Full) + params.eee.adv_caps |= QED_EEE_10G_ADV; + params.eee.enable = edata->eee_enabled; + params.eee.tx_lpi_enable = edata->tx_lpi_enabled; + params.eee.tx_lpi_timer = edata->tx_lpi_timer; + + params.link_up = true; + edev->ops->common->set_link(edev->cdev, &params); + + return 0; +} + +static const struct ethtool_ops qede_ethtool_ops = { + .get_link_ksettings = qede_get_link_ksettings, + .set_link_ksettings = qede_set_link_ksettings, + .get_drvinfo = qede_get_drvinfo, + .get_regs_len = qede_get_regs_len, + .get_regs = qede_get_regs, + .get_wol = qede_get_wol, + .set_wol = qede_set_wol, + .get_msglevel = qede_get_msglevel, + .set_msglevel = qede_set_msglevel, + .nway_reset = qede_nway_reset, + .get_link = qede_get_link, + .get_coalesce = qede_get_coalesce, + .set_coalesce = qede_set_coalesce, + .get_ringparam = qede_get_ringparam, + .set_ringparam = qede_set_ringparam, + .get_pauseparam = qede_get_pauseparam, + .set_pauseparam = qede_set_pauseparam, + .get_strings = qede_get_strings, + .set_phys_id = qede_set_phys_id, + .get_ethtool_stats = qede_get_ethtool_stats, + .get_priv_flags = qede_get_priv_flags, + .get_sset_count = qede_get_sset_count, + .get_rxnfc = qede_get_rxnfc, + .set_rxnfc = qede_set_rxnfc, + .get_rxfh_indir_size = qede_get_rxfh_indir_size, + .get_rxfh_key_size = qede_get_rxfh_key_size, + .get_rxfh = qede_get_rxfh, + .set_rxfh = qede_set_rxfh, + .get_ts_info = qede_get_ts_info, + .get_channels = qede_get_channels, + .set_channels = qede_set_channels, + .self_test = qede_self_test, + .get_eee = qede_get_eee, + .set_eee = qede_set_eee, + + .get_tunable = qede_get_tunable, + .set_tunable = qede_set_tunable, + .flash_device = qede_flash_device, +}; + +static const struct ethtool_ops qede_vf_ethtool_ops = { + .get_link_ksettings = qede_get_link_ksettings, + .get_drvinfo = qede_get_drvinfo, + .get_msglevel = qede_get_msglevel, + .set_msglevel = qede_set_msglevel, + .get_link = qede_get_link, + .get_coalesce = qede_get_coalesce, + .set_coalesce = qede_set_coalesce, + .get_ringparam = qede_get_ringparam, + .set_ringparam = qede_set_ringparam, + .get_strings = qede_get_strings, + .get_ethtool_stats = qede_get_ethtool_stats, + .get_priv_flags = qede_get_priv_flags, + .get_sset_count = qede_get_sset_count, + .get_rxnfc = qede_get_rxnfc, + .set_rxnfc = qede_set_rxnfc, + .get_rxfh_indir_size = qede_get_rxfh_indir_size, + .get_rxfh_key_size = qede_get_rxfh_key_size, + .get_rxfh = qede_get_rxfh, + .set_rxfh = qede_set_rxfh, + .get_channels = qede_get_channels, + .set_channels = qede_set_channels, + .get_tunable = qede_get_tunable, + .set_tunable = qede_set_tunable, +}; + +void qede_set_ethtool_ops(struct net_device *dev) +{ + struct qede_dev *edev = netdev_priv(dev); + + if (IS_VF(edev)) + dev->ethtool_ops = &qede_vf_ethtool_ops; + else + dev->ethtool_ops = &qede_ethtool_ops; +} diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c new file mode 100644 index 0000000..6687e04 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -0,0 +1,1693 @@ +/* QLogic qede NIC Driver
+ * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include + +#include +#include "qede.h" + +struct qede_arfs_tuple { + union { + __be32 src_ipv4; + struct in6_addr src_ipv6; + }; + union { + __be32 dst_ipv4; + struct in6_addr dst_ipv6; + }; + __be16 src_port; + __be16 dst_port; + __be16 eth_proto; + u8 ip_proto; +}; + +struct qede_arfs_fltr_node { +#define QEDE_FLTR_VALID 0 + unsigned long state; + + /* pointer to aRFS packet buffer */ + void *data; + + /* dma map address of aRFS packet buffer */ + dma_addr_t mapping; + + /* length of aRFS packet buffer */ + int buf_len; + + /* tuples to hold from aRFS packet buffer */ + struct qede_arfs_tuple tuple; + + u32 flow_id; + u16 sw_id; + u16 rxq_id; + u16 next_rxq_id; + bool filter_op; + bool used; + u8 fw_rc; + struct hlist_node node; +}; + +struct qede_arfs { +#define QEDE_ARFS_BUCKET_HEAD(edev, idx) (&(edev)->arfs->arfs_hl_head[idx]) +#define QEDE_ARFS_POLL_COUNT 100 +#define QEDE_RFS_FLW_BITSHIFT (4) +#define QEDE_RFS_FLW_MASK ((1 << QEDE_RFS_FLW_BITSHIFT) - 1) + struct hlist_head arfs_hl_head[1 << QEDE_RFS_FLW_BITSHIFT]; + + /* lock for filter list access */ + spinlock_t arfs_list_lock; + unsigned long *arfs_fltr_bmap; + int filter_count; + bool enable; +}; + +static void qede_configure_arfs_fltr(struct qede_dev *edev, + struct qede_arfs_fltr_node *n, + u16 rxq_id, bool add_fltr) +{ + const struct qed_eth_ops *op = edev->ops; + struct qed_ntuple_filter_params params; + + if (n->used) + return; + + memset(&params, 0, sizeof(params)); + + params.addr = n->mapping; + params.length = n->buf_len; + params.qid = rxq_id; + params.b_is_add = add_fltr; + + DP_VERBOSE(edev, NETIF_MSG_RX_STATUS, + "%s arfs filter flow_id=%d, sw_id=%d, src_port=%d, dst_port=%d, rxq=%d\n", + add_fltr ?
"Adding" : "Deleting", + n->flow_id, n->sw_id, ntohs(n->tuple.src_port), + ntohs(n->tuple.dst_port), rxq_id); + + n->used = true; + n->filter_op = add_fltr; + op->ntuple_filter_config(edev->cdev, n, ¶ms); +} + +static void +qede_free_arfs_filter(struct qede_dev *edev, struct qede_arfs_fltr_node *fltr) +{ + kfree(fltr->data); + clear_bit(fltr->sw_id, edev->arfs->arfs_fltr_bmap); + kfree(fltr); +} + +static int +qede_enqueue_fltr_and_config_searcher(struct qede_dev *edev, + struct qede_arfs_fltr_node *fltr, + u16 bucket_idx) +{ + fltr->mapping = dma_map_single(&edev->pdev->dev, fltr->data, + fltr->buf_len, DMA_TO_DEVICE); + if (dma_mapping_error(&edev->pdev->dev, fltr->mapping)) { + DP_NOTICE(edev, "Failed to map DMA memory for rule\n"); + qede_free_arfs_filter(edev, fltr); + return -ENOMEM; + } + + INIT_HLIST_NODE(&fltr->node); + hlist_add_head(&fltr->node, + QEDE_ARFS_BUCKET_HEAD(edev, bucket_idx)); + edev->arfs->filter_count++; + + if (edev->arfs->filter_count == 1 && !edev->arfs->enable) { + enum qed_filter_config_mode mode; + + mode = QED_FILTER_CONFIG_MODE_5_TUPLE; + edev->ops->configure_arfs_searcher(edev->cdev, mode); + edev->arfs->enable = true; + } + + return 0; +} + +static void +qede_dequeue_fltr_and_config_searcher(struct qede_dev *edev, + struct qede_arfs_fltr_node *fltr) +{ + hlist_del(&fltr->node); + dma_unmap_single(&edev->pdev->dev, fltr->mapping, + fltr->buf_len, DMA_TO_DEVICE); + + qede_free_arfs_filter(edev, fltr); + edev->arfs->filter_count--; + + if (!edev->arfs->filter_count && edev->arfs->enable) { + enum qed_filter_config_mode mode; + + mode = QED_FILTER_CONFIG_MODE_DISABLE; + edev->arfs->enable = false; + edev->ops->configure_arfs_searcher(edev->cdev, mode); + } +} + +void qede_arfs_filter_op(void *dev, void *filter, u8 fw_rc) +{ + struct qede_arfs_fltr_node *fltr = filter; + struct qede_dev *edev = dev; + + fltr->fw_rc = fw_rc; + + if (fw_rc) { + DP_NOTICE(edev, + "Failed arfs filter configuration fw_rc=%d, flow_id=%d, sw_id=%d, src_port=%d, dst_port=%d, rxq=%d\n", + fw_rc, fltr->flow_id, fltr->sw_id, + ntohs(fltr->tuple.src_port), + ntohs(fltr->tuple.dst_port), fltr->rxq_id); + + spin_lock_bh(&edev->arfs->arfs_list_lock); + + fltr->used = false; + clear_bit(QEDE_FLTR_VALID, &fltr->state); + + spin_unlock_bh(&edev->arfs->arfs_list_lock); + return; + } + + spin_lock_bh(&edev->arfs->arfs_list_lock); + + fltr->used = false; + + if (fltr->filter_op) { + set_bit(QEDE_FLTR_VALID, &fltr->state); + if (fltr->rxq_id != fltr->next_rxq_id) + qede_configure_arfs_fltr(edev, fltr, fltr->rxq_id, + false); + } else { + clear_bit(QEDE_FLTR_VALID, &fltr->state); + if (fltr->rxq_id != fltr->next_rxq_id) { + fltr->rxq_id = fltr->next_rxq_id; + qede_configure_arfs_fltr(edev, fltr, + fltr->rxq_id, true); + } + } + + spin_unlock_bh(&edev->arfs->arfs_list_lock); +} + +/* Should be called while qede_lock is held */ +void qede_process_arfs_filters(struct qede_dev *edev, bool free_fltr) +{ + int i; + + for (i = 0; i <= QEDE_RFS_FLW_MASK; i++) { + struct hlist_node *temp; + struct hlist_head *head; + struct qede_arfs_fltr_node *fltr; + + head = &edev->arfs->arfs_hl_head[i]; + + hlist_for_each_entry_safe(fltr, temp, head, node) { + bool del = false; + + if (edev->state != QEDE_STATE_OPEN) + del = true; + + spin_lock_bh(&edev->arfs->arfs_list_lock); + + if ((!test_bit(QEDE_FLTR_VALID, &fltr->state) && + !fltr->used) || free_fltr) { + qede_dequeue_fltr_and_config_searcher(edev, + fltr); + } else { + bool flow_exp = false; +#ifdef CONFIG_RFS_ACCEL + flow_exp = rps_may_expire_flow(edev->ndev, + 
fltr->rxq_id, + fltr->flow_id, + fltr->sw_id); +#endif + if ((flow_exp || del) && !free_fltr) + qede_configure_arfs_fltr(edev, fltr, + fltr->rxq_id, + false); + } + + spin_unlock_bh(&edev->arfs->arfs_list_lock); + } + } + + spin_lock_bh(&edev->arfs->arfs_list_lock); + + if (!edev->arfs->filter_count) { + if (edev->arfs->enable) { + enum qed_filter_config_mode mode; + + mode = QED_FILTER_CONFIG_MODE_DISABLE; + edev->arfs->enable = false; + edev->ops->configure_arfs_searcher(edev->cdev, mode); + } +#ifdef CONFIG_RFS_ACCEL + } else { + set_bit(QEDE_SP_ARFS_CONFIG, &edev->sp_flags); + schedule_delayed_work(&edev->sp_task, + QEDE_SP_TASK_POLL_DELAY); +#endif + } + + spin_unlock_bh(&edev->arfs->arfs_list_lock); +} + +/* This function waits until all aRFS filters get deleted and freed. + * On timeout it frees all filters forcefully. + */ +void qede_poll_for_freeing_arfs_filters(struct qede_dev *edev) +{ + int count = QEDE_ARFS_POLL_COUNT; + + while (count) { + qede_process_arfs_filters(edev, false); + + if (!edev->arfs->filter_count) + break; + + msleep(100); + count--; + } + + if (!count) { + DP_NOTICE(edev, "Timeout in polling for arfs filter free\n"); + + /* Something is terribly wrong, free forcefully */ + qede_process_arfs_filters(edev, true); + } +} + +int qede_alloc_arfs(struct qede_dev *edev) +{ + int i; + + edev->arfs = vzalloc(sizeof(*edev->arfs)); + if (!edev->arfs) + return -ENOMEM; + + spin_lock_init(&edev->arfs->arfs_list_lock); + + for (i = 0; i <= QEDE_RFS_FLW_MASK; i++) + INIT_HLIST_HEAD(QEDE_ARFS_BUCKET_HEAD(edev, i)); + + edev->arfs->arfs_fltr_bmap = vzalloc(BITS_TO_LONGS(QEDE_RFS_MAX_FLTR) * + sizeof(long)); + if (!edev->arfs->arfs_fltr_bmap) { + vfree(edev->arfs); + edev->arfs = NULL; + return -ENOMEM; + } + +#ifdef CONFIG_RFS_ACCEL + edev->ndev->rx_cpu_rmap = alloc_irq_cpu_rmap(QEDE_RSS_COUNT(edev)); + if (!edev->ndev->rx_cpu_rmap) { + vfree(edev->arfs->arfs_fltr_bmap); + edev->arfs->arfs_fltr_bmap = NULL; + vfree(edev->arfs); + edev->arfs = NULL; + return -ENOMEM; + } +#endif + return 0; +} + +void qede_free_arfs(struct qede_dev *edev) +{ + if (!edev->arfs) + return; + +#ifdef CONFIG_RFS_ACCEL + if (edev->ndev->rx_cpu_rmap) + free_irq_cpu_rmap(edev->ndev->rx_cpu_rmap); + + edev->ndev->rx_cpu_rmap = NULL; +#endif + vfree(edev->arfs->arfs_fltr_bmap); + edev->arfs->arfs_fltr_bmap = NULL; + vfree(edev->arfs); + edev->arfs = NULL; +} + +#ifdef CONFIG_RFS_ACCEL +static bool qede_compare_ip_addr(struct qede_arfs_fltr_node *tpos, + const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) { + if (tpos->tuple.src_ipv4 == ip_hdr(skb)->saddr && + tpos->tuple.dst_ipv4 == ip_hdr(skb)->daddr) + return true; + else + return false; + } else { + struct in6_addr *src = &tpos->tuple.src_ipv6; + u8 size = sizeof(struct in6_addr); + + if (!memcmp(src, &ipv6_hdr(skb)->saddr, size) && + !memcmp(&tpos->tuple.dst_ipv6, &ipv6_hdr(skb)->daddr, size)) + return true; + else + return false; + } +} + +static struct qede_arfs_fltr_node * +qede_arfs_htbl_key_search(struct hlist_head *h, const struct sk_buff *skb, + __be16 src_port, __be16 dst_port, u8 ip_proto) +{ + struct qede_arfs_fltr_node *tpos; + + hlist_for_each_entry(tpos, h, node) + if (tpos->tuple.ip_proto == ip_proto && + tpos->tuple.eth_proto == skb->protocol && + qede_compare_ip_addr(tpos, skb) && + tpos->tuple.src_port == src_port && + tpos->tuple.dst_port == dst_port) + return tpos; + + return NULL; +} + +static struct qede_arfs_fltr_node * +qede_alloc_filter(struct qede_dev *edev, int min_hlen) +{ + struct qede_arfs_fltr_node *n; 
+ int bit_id; + + bit_id = find_first_zero_bit(edev->arfs->arfs_fltr_bmap, + QEDE_RFS_MAX_FLTR); + + if (bit_id >= QEDE_RFS_MAX_FLTR) + return NULL; + + n = kzalloc(sizeof(*n), GFP_ATOMIC); + if (!n) + return NULL; + + n->data = kzalloc(min_hlen, GFP_ATOMIC); + if (!n->data) { + kfree(n); + return NULL; + } + + n->sw_id = (u16)bit_id; + set_bit(bit_id, edev->arfs->arfs_fltr_bmap); + return n; +} + +int qede_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, + u16 rxq_index, u32 flow_id) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qede_arfs_fltr_node *n; + int min_hlen, rc, tp_offset; + struct ethhdr *eth; + __be16 *ports; + u16 tbl_idx; + u8 ip_proto; + + if (skb->encapsulation) + return -EPROTONOSUPPORT; + + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -EPROTONOSUPPORT; + + if (skb->protocol == htons(ETH_P_IP)) { + ip_proto = ip_hdr(skb)->protocol; + tp_offset = sizeof(struct iphdr); + } else { + ip_proto = ipv6_hdr(skb)->nexthdr; + tp_offset = sizeof(struct ipv6hdr); + } + + if (ip_proto != IPPROTO_TCP && ip_proto != IPPROTO_UDP) + return -EPROTONOSUPPORT; + + ports = (__be16 *)(skb->data + tp_offset); + tbl_idx = skb_get_hash_raw(skb) & QEDE_RFS_FLW_MASK; + + spin_lock_bh(&edev->arfs->arfs_list_lock); + + n = qede_arfs_htbl_key_search(QEDE_ARFS_BUCKET_HEAD(edev, tbl_idx), + skb, ports[0], ports[1], ip_proto); + if (n) { + /* Filter match */ + n->next_rxq_id = rxq_index; + + if (test_bit(QEDE_FLTR_VALID, &n->state)) { + if (n->rxq_id != rxq_index) + qede_configure_arfs_fltr(edev, n, n->rxq_id, + false); + } else { + if (!n->used) { + n->rxq_id = rxq_index; + qede_configure_arfs_fltr(edev, n, n->rxq_id, + true); + } + } + + rc = n->sw_id; + goto ret_unlock; + } + + min_hlen = ETH_HLEN + skb_headlen(skb); + + n = qede_alloc_filter(edev, min_hlen); + if (!n) { + rc = -ENOMEM; + goto ret_unlock; + } + + n->buf_len = min_hlen; + n->rxq_id = rxq_index; + n->next_rxq_id = rxq_index; + n->tuple.src_port = ports[0]; + n->tuple.dst_port = ports[1]; + n->flow_id = flow_id; + + if (skb->protocol == htons(ETH_P_IP)) { + n->tuple.src_ipv4 = ip_hdr(skb)->saddr; + n->tuple.dst_ipv4 = ip_hdr(skb)->daddr; + } else { + memcpy(&n->tuple.src_ipv6, &ipv6_hdr(skb)->saddr, + sizeof(struct in6_addr)); + memcpy(&n->tuple.dst_ipv6, &ipv6_hdr(skb)->daddr, + sizeof(struct in6_addr)); + } + + eth = (struct ethhdr *)n->data; + eth->h_proto = skb->protocol; + n->tuple.eth_proto = skb->protocol; + n->tuple.ip_proto = ip_proto; + memcpy(n->data + ETH_HLEN, skb->data, skb_headlen(skb)); + + rc = qede_enqueue_fltr_and_config_searcher(edev, n, tbl_idx); + if (rc) + goto ret_unlock; + + qede_configure_arfs_fltr(edev, n, n->rxq_id, true); + + spin_unlock_bh(&edev->arfs->arfs_list_lock); + + set_bit(QEDE_SP_ARFS_CONFIG, &edev->sp_flags); + schedule_delayed_work(&edev->sp_task, 0); + + return n->sw_id; + +ret_unlock: + spin_unlock_bh(&edev->arfs->arfs_list_lock); + return rc; +} +#endif + +void qede_udp_ports_update(void *dev, u16 vxlan_port, u16 geneve_port) +{ + struct qede_dev *edev = dev; + + if (edev->vxlan_dst_port != vxlan_port) + edev->vxlan_dst_port = 0; + + if (edev->geneve_dst_port != geneve_port) + edev->geneve_dst_port = 0; +} + +void qede_force_mac(void *dev, u8 *mac, bool forced) +{ + struct qede_dev *edev = dev; + + __qede_lock(edev); + + /* MAC hints take effect only if we haven't set one already */ + if (is_valid_ether_addr(edev->ndev->dev_addr) && !forced) { + __qede_unlock(edev); + return; + } + + ether_addr_copy(edev->ndev->dev_addr, mac); 
+ __qede_unlock(edev); +} + +void qede_fill_rss_params(struct qede_dev *edev, + struct qed_update_vport_rss_params *rss, u8 *update) +{ + bool need_reset = false; + int i; + + if (QEDE_RSS_COUNT(edev) <= 1) { + memset(rss, 0, sizeof(*rss)); + *update = 0; + return; + } + + /* Need to validate current RSS config uses valid entries */ + for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) { + if (edev->rss_ind_table[i] >= QEDE_RSS_COUNT(edev)) { + need_reset = true; + break; + } + } + + if (!(edev->rss_params_inited & QEDE_RSS_INDIR_INITED) || need_reset) { + for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) { + u16 indir_val, val; + + val = QEDE_RSS_COUNT(edev); + indir_val = ethtool_rxfh_indir_default(i, val); + edev->rss_ind_table[i] = indir_val; + } + edev->rss_params_inited |= QEDE_RSS_INDIR_INITED; + } + + /* Now that we have the queue-indirection, prepare the handles */ + for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++) { + u16 idx = QEDE_RX_QUEUE_IDX(edev, edev->rss_ind_table[i]); + + rss->rss_ind_table[i] = edev->fp_array[idx].rxq->handle; + } + + if (!(edev->rss_params_inited & QEDE_RSS_KEY_INITED)) { + netdev_rss_key_fill(edev->rss_key, sizeof(edev->rss_key)); + edev->rss_params_inited |= QEDE_RSS_KEY_INITED; + } + memcpy(rss->rss_key, edev->rss_key, sizeof(rss->rss_key)); + + if (!(edev->rss_params_inited & QEDE_RSS_CAPS_INITED)) { + edev->rss_caps = QED_RSS_IPV4 | QED_RSS_IPV6 | + QED_RSS_IPV4_TCP | QED_RSS_IPV6_TCP; + edev->rss_params_inited |= QEDE_RSS_CAPS_INITED; + } + rss->rss_caps = edev->rss_caps; + + *update = 1; +} + +static int qede_set_ucast_rx_mac(struct qede_dev *edev, + enum qed_filter_xcast_params_type opcode, + unsigned char mac[ETH_ALEN]) +{ + struct qed_filter_params filter_cmd; + + memset(&filter_cmd, 0, sizeof(filter_cmd)); + filter_cmd.type = QED_FILTER_TYPE_UCAST; + filter_cmd.filter.ucast.type = opcode; + filter_cmd.filter.ucast.mac_valid = 1; + ether_addr_copy(filter_cmd.filter.ucast.mac, mac); + + return edev->ops->filter_config(edev->cdev, &filter_cmd); +} + +static int qede_set_ucast_rx_vlan(struct qede_dev *edev, + enum qed_filter_xcast_params_type opcode, + u16 vid) +{ + struct qed_filter_params filter_cmd; + + memset(&filter_cmd, 0, sizeof(filter_cmd)); + filter_cmd.type = QED_FILTER_TYPE_UCAST; + filter_cmd.filter.ucast.type = opcode; + filter_cmd.filter.ucast.vlan_valid = 1; + filter_cmd.filter.ucast.vlan = vid; + + return edev->ops->filter_config(edev->cdev, &filter_cmd); +} + +static int qede_config_accept_any_vlan(struct qede_dev *edev, bool action) +{ + struct qed_update_vport_params *params; + int rc; + + /* Proceed only if action actually needs to be performed */ + if (edev->accept_any_vlan == action) + return 0; + + params = vzalloc(sizeof(*params)); + if (!params) + return -ENOMEM; + + params->vport_id = 0; + params->accept_any_vlan = action; + params->update_accept_any_vlan_flg = 1; + + rc = edev->ops->vport_update(edev->cdev, params); + if (rc) { + DP_ERR(edev, "Failed to %s accept-any-vlan\n", + action ? "enable" : "disable"); + } else { + DP_INFO(edev, "%s accept-any-vlan\n", + action ? 
"enabled" : "disabled"); + edev->accept_any_vlan = action; + } + + vfree(params); + return 0; +} + +int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qede_vlan *vlan, *tmp; + int rc = 0; + + DP_VERBOSE(edev, NETIF_MSG_IFUP, "Adding vlan 0x%04x\n", vid); + + vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); + if (!vlan) { + DP_INFO(edev, "Failed to allocate struct for vlan\n"); + return -ENOMEM; + } + INIT_LIST_HEAD(&vlan->list); + vlan->vid = vid; + vlan->configured = false; + + /* Verify vlan isn't already configured */ + list_for_each_entry(tmp, &edev->vlan_list, list) { + if (tmp->vid == vlan->vid) { + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "vlan already configured\n"); + kfree(vlan); + return -EEXIST; + } + } + + /* If interface is down, cache this VLAN ID and return */ + __qede_lock(edev); + if (edev->state != QEDE_STATE_OPEN) { + DP_VERBOSE(edev, NETIF_MSG_IFDOWN, + "Interface is down, VLAN %d will be configured when interface is up\n", + vid); + if (vid != 0) + edev->non_configured_vlans++; + list_add(&vlan->list, &edev->vlan_list); + goto out; + } + + /* Check for the filter limit. + * Note - vlan0 has a reserved filter and can be added without + * worrying about quota + */ + if ((edev->configured_vlans < edev->dev_info.num_vlan_filters) || + (vlan->vid == 0)) { + rc = qede_set_ucast_rx_vlan(edev, + QED_FILTER_XCAST_TYPE_ADD, + vlan->vid); + if (rc) { + DP_ERR(edev, "Failed to configure VLAN %d\n", + vlan->vid); + kfree(vlan); + goto out; + } + vlan->configured = true; + + /* vlan0 filter isn't consuming out of our quota */ + if (vlan->vid != 0) + edev->configured_vlans++; + } else { + /* Out of quota; Activate accept-any-VLAN mode */ + if (!edev->non_configured_vlans) { + rc = qede_config_accept_any_vlan(edev, true); + if (rc) { + kfree(vlan); + goto out; + } + } + + edev->non_configured_vlans++; + } + + list_add(&vlan->list, &edev->vlan_list); + +out: + __qede_unlock(edev); + return rc; +} + +static void qede_del_vlan_from_list(struct qede_dev *edev, + struct qede_vlan *vlan) +{ + /* vlan0 filter isn't consuming out of our quota */ + if (vlan->vid != 0) { + if (vlan->configured) + edev->configured_vlans--; + else + edev->non_configured_vlans--; + } + + list_del(&vlan->list); + kfree(vlan); +} + +int qede_configure_vlan_filters(struct qede_dev *edev) +{ + int rc = 0, real_rc = 0, accept_any_vlan = 0; + struct qed_dev_eth_info *dev_info; + struct qede_vlan *vlan = NULL; + + if (list_empty(&edev->vlan_list)) + return 0; + + dev_info = &edev->dev_info; + + /* Configure non-configured vlans */ + list_for_each_entry(vlan, &edev->vlan_list, list) { + if (vlan->configured) + continue; + + /* We have used all our credits, now enable accept_any_vlan */ + if ((vlan->vid != 0) && + (edev->configured_vlans == dev_info->num_vlan_filters)) { + accept_any_vlan = 1; + continue; + } + + DP_VERBOSE(edev, NETIF_MSG_IFUP, "Adding vlan %d\n", vlan->vid); + + rc = qede_set_ucast_rx_vlan(edev, QED_FILTER_XCAST_TYPE_ADD, + vlan->vid); + if (rc) { + DP_ERR(edev, "Failed to configure VLAN %u\n", + vlan->vid); + real_rc = rc; + continue; + } + + vlan->configured = true; + /* vlan0 filter doesn't consume our VLAN filter's quota */ + if (vlan->vid != 0) { + edev->non_configured_vlans--; + edev->configured_vlans++; + } + } + + /* enable accept_any_vlan mode if we have more VLANs than credits, + * or remove accept_any_vlan mode if we've actually removed + * a non-configured vlan, and all remaining vlans are truly configured. 
+ */ + + if (accept_any_vlan) + rc = qede_config_accept_any_vlan(edev, true); + else if (!edev->non_configured_vlans) + rc = qede_config_accept_any_vlan(edev, false); + + if (rc && !real_rc) + real_rc = rc; + + return real_rc; +} + +int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qede_vlan *vlan = NULL; + int rc = 0; + + DP_VERBOSE(edev, NETIF_MSG_IFDOWN, "Removing vlan 0x%04x\n", vid); + + /* Find whether entry exists */ + __qede_lock(edev); + list_for_each_entry(vlan, &edev->vlan_list, list) + if (vlan->vid == vid) + break; + + if (!vlan || (vlan->vid != vid)) { + DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN), + "Vlan isn't configured\n"); + goto out; + } + + if (edev->state != QEDE_STATE_OPEN) { + /* As interface is already down, we don't have a VPORT + * instance to remove vlan filter. So just update vlan list + */ + DP_VERBOSE(edev, NETIF_MSG_IFDOWN, + "Interface is down, removing VLAN from list only\n"); + qede_del_vlan_from_list(edev, vlan); + goto out; + } + + /* Remove vlan */ + if (vlan->configured) { + rc = qede_set_ucast_rx_vlan(edev, QED_FILTER_XCAST_TYPE_DEL, + vid); + if (rc) { + DP_ERR(edev, "Failed to remove VLAN %d\n", vid); + goto out; + } + } + + qede_del_vlan_from_list(edev, vlan); + + /* We have removed a VLAN - try to see if we can + * configure non-configured VLAN from the list. + */ + rc = qede_configure_vlan_filters(edev); + +out: + __qede_unlock(edev); + return rc; +} + +void qede_vlan_mark_nonconfigured(struct qede_dev *edev) +{ + struct qede_vlan *vlan = NULL; + + if (list_empty(&edev->vlan_list)) + return; + + list_for_each_entry(vlan, &edev->vlan_list, list) { + if (!vlan->configured) + continue; + + vlan->configured = false; + + /* vlan0 filter isn't consuming out of our quota */ + if (vlan->vid != 0) { + edev->non_configured_vlans++; + edev->configured_vlans--; + } + + DP_VERBOSE(edev, NETIF_MSG_IFDOWN, + "marked vlan %d as non-configured\n", vlan->vid); + } + + edev->accept_any_vlan = false; +} + +static void qede_set_features_reload(struct qede_dev *edev, + struct qede_reload_args *args) +{ + edev->ndev->features = args->u.features; +} + +netdev_features_t qede_fix_features(struct net_device *dev, + netdev_features_t features) +{ + struct qede_dev *edev = netdev_priv(dev); + + if (edev->xdp_prog || edev->ndev->mtu > PAGE_SIZE || + !(features & NETIF_F_GRO)) + features &= ~NETIF_F_GRO_HW; + + return features; +} + +int qede_set_features(struct net_device *dev, netdev_features_t features) +{ + struct qede_dev *edev = netdev_priv(dev); + netdev_features_t changes = features ^ dev->features; + bool need_reload = false; + + if (changes & NETIF_F_GRO_HW) + need_reload = true; + + if (need_reload) { + struct qede_reload_args args; + + args.u.features = features; + args.func = &qede_set_features_reload; + + /* Make sure that we definitely need to reload. + * In case of an eBPF attached program, there will be no FW + * aggregations, so no need to actually reload. 
+ */ + __qede_lock(edev); + if (edev->xdp_prog) + args.func(edev, &args); + else + qede_reload(edev, &args, true); + __qede_unlock(edev); + + return 1; + } + + return 0; +} + +void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qed_tunn_params tunn_params; + u16 t_port = ntohs(ti->port); + int rc; + + memset(&tunn_params, 0, sizeof(tunn_params)); + + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + if (!edev->dev_info.common.vxlan_enable) + return; + + if (edev->vxlan_dst_port) + return; + + tunn_params.update_vxlan_port = 1; + tunn_params.vxlan_port = t_port; + + __qede_lock(edev); + rc = edev->ops->tunn_config(edev->cdev, &tunn_params); + __qede_unlock(edev); + + if (!rc) { + edev->vxlan_dst_port = t_port; + DP_VERBOSE(edev, QED_MSG_DEBUG, "Added vxlan port=%d\n", + t_port); + } else { + DP_NOTICE(edev, "Failed to add vxlan UDP port=%d\n", + t_port); + } + + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (!edev->dev_info.common.geneve_enable) + return; + + if (edev->geneve_dst_port) + return; + + tunn_params.update_geneve_port = 1; + tunn_params.geneve_port = t_port; + + __qede_lock(edev); + rc = edev->ops->tunn_config(edev->cdev, &tunn_params); + __qede_unlock(edev); + + if (!rc) { + edev->geneve_dst_port = t_port; + DP_VERBOSE(edev, QED_MSG_DEBUG, + "Added geneve port=%d\n", t_port); + } else { + DP_NOTICE(edev, "Failed to add geneve UDP port=%d\n", + t_port); + } + + break; + default: + return; + } +} + +void qede_udp_tunnel_del(struct net_device *dev, + struct udp_tunnel_info *ti) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qed_tunn_params tunn_params; + u16 t_port = ntohs(ti->port); + + memset(&tunn_params, 0, sizeof(tunn_params)); + + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + if (t_port != edev->vxlan_dst_port) + return; + + tunn_params.update_vxlan_port = 1; + tunn_params.vxlan_port = 0; + + __qede_lock(edev); + edev->ops->tunn_config(edev->cdev, &tunn_params); + __qede_unlock(edev); + + edev->vxlan_dst_port = 0; + + DP_VERBOSE(edev, QED_MSG_DEBUG, "Deleted vxlan port=%d\n", + t_port); + + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (t_port != edev->geneve_dst_port) + return; + + tunn_params.update_geneve_port = 1; + tunn_params.geneve_port = 0; + + __qede_lock(edev); + edev->ops->tunn_config(edev->cdev, &tunn_params); + __qede_unlock(edev); + + edev->geneve_dst_port = 0; + + DP_VERBOSE(edev, QED_MSG_DEBUG, "Deleted geneve port=%d\n", + t_port); + break; + default: + return; + } +} + +static void qede_xdp_reload_func(struct qede_dev *edev, + struct qede_reload_args *args) +{ + struct bpf_prog *old; + + old = xchg(&edev->xdp_prog, args->u.new_prog); + if (old) + bpf_prog_put(old); +} + +static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) +{ + struct qede_reload_args args; + + /* If we're called, there was already a bpf reference increment */ + args.func = &qede_xdp_reload_func; + args.u.new_prog = prog; + qede_reload(edev, &args, false); + + return 0; +} + +int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) +{ + struct qede_dev *edev = netdev_priv(dev); + + switch (xdp->command) { + case XDP_SETUP_PROG: + return qede_xdp_set(edev, xdp->prog); + case XDP_QUERY_PROG: + xdp->prog_attached = !!edev->xdp_prog; + xdp->prog_id = edev->xdp_prog ? 
edev->xdp_prog->aux->id : 0; + return 0; + default: + return -EINVAL; + } +} + +static int qede_set_mcast_rx_mac(struct qede_dev *edev, + enum qed_filter_xcast_params_type opcode, + unsigned char *mac, int num_macs) +{ + struct qed_filter_params filter_cmd; + int i; + + memset(&filter_cmd, 0, sizeof(filter_cmd)); + filter_cmd.type = QED_FILTER_TYPE_MCAST; + filter_cmd.filter.mcast.type = opcode; + filter_cmd.filter.mcast.num = num_macs; + + for (i = 0; i < num_macs; i++, mac += ETH_ALEN) + ether_addr_copy(filter_cmd.filter.mcast.mac[i], mac); + + return edev->ops->filter_config(edev->cdev, &filter_cmd); +} + +int qede_set_mac_addr(struct net_device *ndev, void *p) +{ + struct qede_dev *edev = netdev_priv(ndev); + struct sockaddr *addr = p; + int rc = 0; + + /* Make sure the state doesn't transition while changing the MAC. + * Also, all flows accessing the dev_addr field are doing that under + * this lock. + */ + __qede_lock(edev); + + if (!is_valid_ether_addr(addr->sa_data)) { + DP_NOTICE(edev, "The MAC address is not valid\n"); + rc = -EFAULT; + goto out; + } + + if (!edev->ops->check_mac(edev->cdev, addr->sa_data)) { + DP_NOTICE(edev, "qed prevents setting MAC %pM\n", + addr->sa_data); + rc = -EINVAL; + goto out; + } + + if (edev->state == QEDE_STATE_OPEN) { + /* Remove the previous primary mac */ + rc = qede_set_ucast_rx_mac(edev, QED_FILTER_XCAST_TYPE_DEL, + ndev->dev_addr); + if (rc) + goto out; + } + + ether_addr_copy(ndev->dev_addr, addr->sa_data); + DP_INFO(edev, "Setting device MAC to %pM\n", addr->sa_data); + + if (edev->state != QEDE_STATE_OPEN) { + DP_VERBOSE(edev, NETIF_MSG_IFDOWN, + "The device is currently down\n"); + goto out; + } + + edev->ops->common->update_mac(edev->cdev, ndev->dev_addr); + + rc = qede_set_ucast_rx_mac(edev, QED_FILTER_XCAST_TYPE_ADD, + ndev->dev_addr); +out: + __qede_unlock(edev); + return rc; +} + +static int +qede_configure_mcast_filtering(struct net_device *ndev, + enum qed_filter_rx_mode_type *accept_flags) +{ + struct qede_dev *edev = netdev_priv(ndev); + unsigned char *mc_macs, *temp; + struct netdev_hw_addr *ha; + int rc = 0, mc_count; + size_t size; + + size = 64 * ETH_ALEN; + + mc_macs = kzalloc(size, GFP_KERNEL); + if (!mc_macs) { + DP_NOTICE(edev, + "Failed to allocate memory for multicast MACs\n"); + rc = -ENOMEM; + goto exit; + } + + temp = mc_macs; + + /* Remove all previously configured MAC filters */ + rc = qede_set_mcast_rx_mac(edev, QED_FILTER_XCAST_TYPE_DEL, + mc_macs, 1); + if (rc) + goto exit; + + netif_addr_lock_bh(ndev); + + mc_count = netdev_mc_count(ndev); + if (mc_count < 64) { + netdev_for_each_mc_addr(ha, ndev) { + ether_addr_copy(temp, ha->addr); + temp += ETH_ALEN; + } + } + + netif_addr_unlock_bh(ndev); + + /* Check for all multicast @@@TBD resource allocation */ + if ((ndev->flags & IFF_ALLMULTI) || (mc_count > 64)) { + if (*accept_flags == QED_FILTER_RX_MODE_TYPE_REGULAR) + *accept_flags = QED_FILTER_RX_MODE_TYPE_MULTI_PROMISC; + } else { + /* Add all multicast MAC filters */ + rc = qede_set_mcast_rx_mac(edev, QED_FILTER_XCAST_TYPE_ADD, + mc_macs, mc_count); + } + +exit: + kfree(mc_macs); + return rc; +} + +void qede_set_rx_mode(struct net_device *ndev) +{ + struct qede_dev *edev = netdev_priv(ndev); + + set_bit(QEDE_SP_RX_MODE, &edev->sp_flags); + schedule_delayed_work(&edev->sp_task, 0); +} + +/* Must be called with qede_lock held */ +void qede_config_rx_mode(struct net_device *ndev) +{ + enum qed_filter_rx_mode_type accept_flags; + struct qede_dev *edev = netdev_priv(ndev); + struct qed_filter_params rx_mode; + 
unsigned char *uc_macs, *temp; + struct netdev_hw_addr *ha; + int rc, uc_count; + size_t size; + + netif_addr_lock_bh(ndev); + + uc_count = netdev_uc_count(ndev); + size = uc_count * ETH_ALEN; + + uc_macs = kzalloc(size, GFP_ATOMIC); + if (!uc_macs) { + DP_NOTICE(edev, "Failed to allocate memory for unicast MACs\n"); + netif_addr_unlock_bh(ndev); + return; + } + + temp = uc_macs; + netdev_for_each_uc_addr(ha, ndev) { + ether_addr_copy(temp, ha->addr); + temp += ETH_ALEN; + } + + netif_addr_unlock_bh(ndev); + + /* Configure the struct for the Rx mode */ + memset(&rx_mode, 0, sizeof(struct qed_filter_params)); + rx_mode.type = QED_FILTER_TYPE_RX_MODE; + + /* Remove all previous unicast secondary macs and multicast macs + * (configure / leave the primary mac) + */ + rc = qede_set_ucast_rx_mac(edev, QED_FILTER_XCAST_TYPE_REPLACE, + edev->ndev->dev_addr); + if (rc) + goto out; + + /* Check for promiscuous */ + if (ndev->flags & IFF_PROMISC) + accept_flags = QED_FILTER_RX_MODE_TYPE_PROMISC; + else + accept_flags = QED_FILTER_RX_MODE_TYPE_REGULAR; + + /* Configure all filters regardless, in case promisc is rejected */ + if (uc_count < edev->dev_info.num_mac_filters) { + int i; + + temp = uc_macs; + for (i = 0; i < uc_count; i++) { + rc = qede_set_ucast_rx_mac(edev, + QED_FILTER_XCAST_TYPE_ADD, + temp); + if (rc) + goto out; + + temp += ETH_ALEN; + } + } else { + accept_flags = QED_FILTER_RX_MODE_TYPE_PROMISC; + } + + rc = qede_configure_mcast_filtering(ndev, &accept_flags); + if (rc) + goto out; + + /* take care of VLAN mode */ + if (ndev->flags & IFF_PROMISC) { + qede_config_accept_any_vlan(edev, true); + } else if (!edev->non_configured_vlans) { + /* It's possible that accept_any_vlan mode is set due to a + * previous setting of IFF_PROMISC. If vlan credits are + * sufficient, disable accept_any_vlan.
+ */ + qede_config_accept_any_vlan(edev, false); + } + + rx_mode.filter.accept_flags = accept_flags; + edev->ops->filter_config(edev->cdev, &rx_mode); +out: + kfree(uc_macs); +} + +static struct qede_arfs_fltr_node * +qede_get_arfs_fltr_by_loc(struct hlist_head *head, u32 location) +{ + struct qede_arfs_fltr_node *fltr; + + hlist_for_each_entry(fltr, head, node) + if (location == fltr->sw_id) + return fltr; + + return NULL; +} + +static bool +qede_compare_user_flow_ips(struct qede_arfs_fltr_node *tpos, + struct ethtool_rx_flow_spec *fsp, + __be16 proto) +{ + if (proto == htons(ETH_P_IP)) { + struct ethtool_tcpip4_spec *ip; + + ip = &fsp->h_u.tcp_ip4_spec; + + if (tpos->tuple.src_ipv4 == ip->ip4src && + tpos->tuple.dst_ipv4 == ip->ip4dst) + return true; + else + return false; + } else { + struct ethtool_tcpip6_spec *ip6; + struct in6_addr *src; + + ip6 = &fsp->h_u.tcp_ip6_spec; + src = &tpos->tuple.src_ipv6; + + if (!memcmp(src, &ip6->ip6src, sizeof(struct in6_addr)) && + !memcmp(&tpos->tuple.dst_ipv6, &ip6->ip6dst, + sizeof(struct in6_addr))) + return true; + else + return false; + } + return false; +} + +int qede_get_cls_rule_all(struct qede_dev *edev, struct ethtool_rxnfc *info, + u32 *rule_locs) +{ + struct qede_arfs_fltr_node *fltr; + struct hlist_head *head; + int cnt = 0, rc = 0; + + info->data = QEDE_RFS_MAX_FLTR; + + __qede_lock(edev); + + if (!edev->arfs) { + rc = -EPERM; + goto unlock; + } + + head = QEDE_ARFS_BUCKET_HEAD(edev, 0); + + hlist_for_each_entry(fltr, head, node) { + if (cnt == info->rule_cnt) { + rc = -EMSGSIZE; + goto unlock; + } + + rule_locs[cnt] = fltr->sw_id; + cnt++; + } + + info->rule_cnt = cnt; + +unlock: + __qede_unlock(edev); + return rc; +} + +int qede_get_cls_rule_entry(struct qede_dev *edev, struct ethtool_rxnfc *cmd) +{ + struct ethtool_rx_flow_spec *fsp = &cmd->fs; + struct qede_arfs_fltr_node *fltr = NULL; + int rc = 0; + + cmd->data = QEDE_RFS_MAX_FLTR; + + __qede_lock(edev); + + if (!edev->arfs) { + rc = -EPERM; + goto unlock; + } + + fltr = qede_get_arfs_fltr_by_loc(QEDE_ARFS_BUCKET_HEAD(edev, 0), + fsp->location); + if (!fltr) { + DP_NOTICE(edev, "Rule not found - location=0x%x\n", + fsp->location); + rc = -EINVAL; + goto unlock; + } + + if (fltr->tuple.eth_proto == htons(ETH_P_IP)) { + if (fltr->tuple.ip_proto == IPPROTO_TCP) + fsp->flow_type = TCP_V4_FLOW; + else + fsp->flow_type = UDP_V4_FLOW; + + fsp->h_u.tcp_ip4_spec.psrc = fltr->tuple.src_port; + fsp->h_u.tcp_ip4_spec.pdst = fltr->tuple.dst_port; + fsp->h_u.tcp_ip4_spec.ip4src = fltr->tuple.src_ipv4; + fsp->h_u.tcp_ip4_spec.ip4dst = fltr->tuple.dst_ipv4; + } else { + if (fltr->tuple.ip_proto == IPPROTO_TCP) + fsp->flow_type = TCP_V6_FLOW; + else + fsp->flow_type = UDP_V6_FLOW; + fsp->h_u.tcp_ip6_spec.psrc = fltr->tuple.src_port; + fsp->h_u.tcp_ip6_spec.pdst = fltr->tuple.dst_port; + memcpy(&fsp->h_u.tcp_ip6_spec.ip6src, + &fltr->tuple.src_ipv6, sizeof(struct in6_addr)); + memcpy(&fsp->h_u.tcp_ip6_spec.ip6dst, + &fltr->tuple.dst_ipv6, sizeof(struct in6_addr)); + } + + fsp->ring_cookie = fltr->rxq_id; + +unlock: + __qede_unlock(edev); + return rc; +} + +static int +qede_validate_and_check_flow_exist(struct qede_dev *edev, + struct ethtool_rx_flow_spec *fsp, + int *min_hlen) +{ + __be16 src_port = 0x0, dst_port = 0x0; + struct qede_arfs_fltr_node *fltr; + struct hlist_node *temp; + struct hlist_head *head; + __be16 eth_proto; + u8 ip_proto; + + if (fsp->location >= QEDE_RFS_MAX_FLTR || + fsp->ring_cookie >= QEDE_RSS_COUNT(edev)) + return -EINVAL; + + if (fsp->flow_type == TCP_V4_FLOW) { + 
*min_hlen += sizeof(struct iphdr) + + sizeof(struct tcphdr); + eth_proto = htons(ETH_P_IP); + ip_proto = IPPROTO_TCP; + } else if (fsp->flow_type == UDP_V4_FLOW) { + *min_hlen += sizeof(struct iphdr) + + sizeof(struct udphdr); + eth_proto = htons(ETH_P_IP); + ip_proto = IPPROTO_UDP; + } else if (fsp->flow_type == TCP_V6_FLOW) { + *min_hlen += sizeof(struct ipv6hdr) + + sizeof(struct tcphdr); + eth_proto = htons(ETH_P_IPV6); + ip_proto = IPPROTO_TCP; + } else if (fsp->flow_type == UDP_V6_FLOW) { + *min_hlen += sizeof(struct ipv6hdr) + + sizeof(struct udphdr); + eth_proto = htons(ETH_P_IPV6); + ip_proto = IPPROTO_UDP; + } else { + DP_NOTICE(edev, "Unsupported flow type = 0x%x\n", + fsp->flow_type); + return -EPROTONOSUPPORT; + } + + if (eth_proto == htons(ETH_P_IP)) { + src_port = fsp->h_u.tcp_ip4_spec.psrc; + dst_port = fsp->h_u.tcp_ip4_spec.pdst; + } else { + src_port = fsp->h_u.tcp_ip6_spec.psrc; + dst_port = fsp->h_u.tcp_ip6_spec.pdst; + } + + head = QEDE_ARFS_BUCKET_HEAD(edev, 0); + hlist_for_each_entry_safe(fltr, temp, head, node) { + if ((fltr->tuple.ip_proto == ip_proto && + fltr->tuple.eth_proto == eth_proto && + qede_compare_user_flow_ips(fltr, fsp, eth_proto) && + fltr->tuple.src_port == src_port && + fltr->tuple.dst_port == dst_port) || + fltr->sw_id == fsp->location) + return -EEXIST; + } + + return 0; +} + +static int +qede_poll_arfs_filter_config(struct qede_dev *edev, + struct qede_arfs_fltr_node *fltr) +{ + int count = QEDE_ARFS_POLL_COUNT; + + while (fltr->used && count) { + msleep(20); + count--; + } + + if (count == 0 || fltr->fw_rc) { + qede_dequeue_fltr_and_config_searcher(edev, fltr); + return -EIO; + } + + return fltr->fw_rc; +} + +int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info) +{ + struct ethtool_rx_flow_spec *fsp = &info->fs; + struct qede_arfs_fltr_node *n; + int min_hlen = ETH_HLEN, rc; + struct ethhdr *eth; + struct iphdr *ip; + __be16 *ports; + + __qede_lock(edev); + + if (!edev->arfs) { + rc = -EPERM; + goto unlock; + } + + rc = qede_validate_and_check_flow_exist(edev, fsp, &min_hlen); + if (rc) + goto unlock; + + n = kzalloc(sizeof(*n), GFP_KERNEL); + if (!n) { + rc = -ENOMEM; + goto unlock; + } + + n->data = kzalloc(min_hlen, GFP_KERNEL); + if (!n->data) { + kfree(n); + rc = -ENOMEM; + goto unlock; + } + + n->sw_id = fsp->location; + set_bit(n->sw_id, edev->arfs->arfs_fltr_bmap); + n->buf_len = min_hlen; + n->rxq_id = fsp->ring_cookie; + n->next_rxq_id = n->rxq_id; + eth = (struct ethhdr *)n->data; + + if (info->fs.flow_type == TCP_V4_FLOW || + info->fs.flow_type == UDP_V4_FLOW) { + ports = (__be16 *)(n->data + ETH_HLEN + + sizeof(struct iphdr)); + eth->h_proto = htons(ETH_P_IP); + n->tuple.eth_proto = htons(ETH_P_IP); + n->tuple.src_ipv4 = info->fs.h_u.tcp_ip4_spec.ip4src; + n->tuple.dst_ipv4 = info->fs.h_u.tcp_ip4_spec.ip4dst; + n->tuple.src_port = info->fs.h_u.tcp_ip4_spec.psrc; + n->tuple.dst_port = info->fs.h_u.tcp_ip4_spec.pdst; + ports[0] = n->tuple.src_port; + ports[1] = n->tuple.dst_port; + ip = (struct iphdr *)(n->data + ETH_HLEN); + ip->saddr = info->fs.h_u.tcp_ip4_spec.ip4src; + ip->daddr = info->fs.h_u.tcp_ip4_spec.ip4dst; + ip->version = 0x4; + ip->ihl = 0x5; + + if (info->fs.flow_type == TCP_V4_FLOW) { + n->tuple.ip_proto = IPPROTO_TCP; + ip->protocol = IPPROTO_TCP; + } else { + n->tuple.ip_proto = IPPROTO_UDP; + ip->protocol = IPPROTO_UDP; + } + ip->tot_len = cpu_to_be16(min_hlen - ETH_HLEN); + } else { + struct ipv6hdr *ip6; + + ip6 = (struct ipv6hdr *)(n->data + ETH_HLEN); + ports = (__be16 *)(n->data + ETH_HLEN + + 
sizeof(struct ipv6hdr)); + eth->h_proto = htons(ETH_P_IPV6); + n->tuple.eth_proto = htons(ETH_P_IPV6); + memcpy(&n->tuple.src_ipv6, &info->fs.h_u.tcp_ip6_spec.ip6src, + sizeof(struct in6_addr)); + memcpy(&n->tuple.dst_ipv6, &info->fs.h_u.tcp_ip6_spec.ip6dst, + sizeof(struct in6_addr)); + n->tuple.src_port = info->fs.h_u.tcp_ip6_spec.psrc; + n->tuple.dst_port = info->fs.h_u.tcp_ip6_spec.pdst; + ports[0] = n->tuple.src_port; + ports[1] = n->tuple.dst_port; + memcpy(&ip6->saddr, &n->tuple.src_ipv6, + sizeof(struct in6_addr)); + memcpy(&ip6->daddr, &n->tuple.dst_ipv6, + sizeof(struct in6_addr)); + ip6->version = 0x6; + + if (info->fs.flow_type == TCP_V6_FLOW) { + n->tuple.ip_proto = IPPROTO_TCP; + ip6->nexthdr = NEXTHDR_TCP; + ip6->payload_len = cpu_to_be16(sizeof(struct tcphdr)); + } else { + n->tuple.ip_proto = IPPROTO_UDP; + ip6->nexthdr = NEXTHDR_UDP; + ip6->payload_len = cpu_to_be16(sizeof(struct udphdr)); + } + } + + rc = qede_enqueue_fltr_and_config_searcher(edev, n, 0); + if (rc) + goto unlock; + + qede_configure_arfs_fltr(edev, n, n->rxq_id, true); + rc = qede_poll_arfs_filter_config(edev, n); +unlock: + __qede_unlock(edev); + return rc; +} + +int qede_del_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info) +{ + struct ethtool_rx_flow_spec *fsp = &info->fs; + struct qede_arfs_fltr_node *fltr = NULL; + int rc = -EPERM; + + __qede_lock(edev); + if (!edev->arfs) + goto unlock; + + fltr = qede_get_arfs_fltr_by_loc(QEDE_ARFS_BUCKET_HEAD(edev, 0), + fsp->location); + if (!fltr) + goto unlock; + + qede_configure_arfs_fltr(edev, fltr, fltr->rxq_id, false); + + rc = qede_poll_arfs_filter_config(edev, fltr); + if (rc == 0) + qede_dequeue_fltr_and_config_searcher(edev, fltr); + +unlock: + __qede_unlock(edev); + return rc; +} + +int qede_get_arfs_filter_count(struct qede_dev *edev) +{ + int count = 0; + + __qede_lock(edev); + + if (!edev->arfs) + goto unlock; + + count = edev->arfs->filter_count; + +unlock: + __qede_unlock(edev); + return count; +} diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c new file mode 100644 index 0000000..1494130 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -0,0 +1,1717 @@ +/* QLogic qede NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qede_ptp.h" + +#include +#include "qede.h" +/********************************* + * Content also used by slowpath * + *********************************/ + +int qede_alloc_rx_buffer(struct qede_rx_queue *rxq, bool allow_lazy) +{ + struct sw_rx_data *sw_rx_data; + struct eth_rx_bd *rx_bd; + dma_addr_t mapping; + struct page *data; + + /* In case lazy-allocation is allowed, postpone allocation until the + * end of the NAPI run. We'd still need to make sure the Rx ring has + * sufficient buffers to guarantee an additional Rx interrupt. + */ + if (allow_lazy && likely(rxq->filled_buffers > 12)) { + rxq->filled_buffers--; + return 0; + } + + data = alloc_pages(GFP_ATOMIC, 0); + if (unlikely(!data)) + return -ENOMEM; + + /* Map the entire page as it would be used + * for multiple RX buffer segment size mapping. + */ + mapping = dma_map_page(rxq->dev, data, 0, + PAGE_SIZE, rxq->data_direction); + if (unlikely(dma_mapping_error(rxq->dev, mapping))) { + __free_page(data); + return -ENOMEM; + } + + sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_prod & NUM_RX_BDS_MAX]; + sw_rx_data->page_offset = 0; + sw_rx_data->data = data; + sw_rx_data->mapping = mapping; + + /* Advance PROD and get BD pointer */ + rx_bd = (struct eth_rx_bd *)qed_chain_produce(&rxq->rx_bd_ring); + WARN_ON(!rx_bd); + rx_bd->addr.hi = cpu_to_le32(upper_32_bits(mapping)); + rx_bd->addr.lo = cpu_to_le32(lower_32_bits(mapping) + + rxq->rx_headroom); + + rxq->sw_rx_prod++; + rxq->filled_buffers++; + + return 0; +} + +/* Unmap the data and free skb */ +int qede_free_tx_pkt(struct qede_dev *edev, struct qede_tx_queue *txq, int *len) +{ + u16 idx = txq->sw_tx_cons; + struct sk_buff *skb = txq->sw_tx_ring.skbs[idx].skb; + struct eth_tx_1st_bd *first_bd; + struct eth_tx_bd *tx_data_bd; + int bds_consumed = 0; + int nbds; + bool data_split = txq->sw_tx_ring.skbs[idx].flags & QEDE_TSO_SPLIT_BD; + int i, split_bd_len = 0; + + if (unlikely(!skb)) { + DP_ERR(edev, + "skb is null for txq idx=%d txq->sw_tx_cons=%d txq->sw_tx_prod=%d\n", + idx, txq->sw_tx_cons, txq->sw_tx_prod); + return -1; + } + + *len = skb->len; + + first_bd = (struct eth_tx_1st_bd *)qed_chain_consume(&txq->tx_pbl); + + bds_consumed++; + + nbds = first_bd->data.nbds; + + if (data_split) { + struct eth_tx_bd *split = (struct eth_tx_bd *) + qed_chain_consume(&txq->tx_pbl); + split_bd_len = BD_UNMAP_LEN(split); + bds_consumed++; + } + dma_unmap_single(&edev->pdev->dev, BD_UNMAP_ADDR(first_bd), + BD_UNMAP_LEN(first_bd) + split_bd_len, DMA_TO_DEVICE); + + /* Unmap the data of the skb frags */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, bds_consumed++) { + tx_data_bd = (struct eth_tx_bd *) + qed_chain_consume(&txq->tx_pbl); + dma_unmap_page(&edev->pdev->dev, BD_UNMAP_ADDR(tx_data_bd), + BD_UNMAP_LEN(tx_data_bd), DMA_TO_DEVICE); + } + + while (bds_consumed++ < nbds) + qed_chain_consume(&txq->tx_pbl); + + /* Free skb */ + dev_kfree_skb_any(skb); + txq->sw_tx_ring.skbs[idx].skb = NULL; + txq->sw_tx_ring.skbs[idx].flags = 0; + + return 0; +} + +/* Unmap the data and free skb when mapping failed during start_xmit */ +static void qede_free_failed_tx_pkt(struct qede_tx_queue *txq, + struct 
eth_tx_1st_bd *first_bd, + int nbd, bool data_split) +{ + u16 idx = txq->sw_tx_prod; + struct sk_buff *skb = txq->sw_tx_ring.skbs[idx].skb; + struct eth_tx_bd *tx_data_bd; + int i, split_bd_len = 0; + + /* Return prod to its position before this skb was handled */ + qed_chain_set_prod(&txq->tx_pbl, + le16_to_cpu(txq->tx_db.data.bd_prod), first_bd); + + first_bd = (struct eth_tx_1st_bd *)qed_chain_produce(&txq->tx_pbl); + + if (data_split) { + struct eth_tx_bd *split = (struct eth_tx_bd *) + qed_chain_produce(&txq->tx_pbl); + split_bd_len = BD_UNMAP_LEN(split); + nbd--; + } + + dma_unmap_single(txq->dev, BD_UNMAP_ADDR(first_bd), + BD_UNMAP_LEN(first_bd) + split_bd_len, DMA_TO_DEVICE); + + /* Unmap the data of the skb frags */ + for (i = 0; i < nbd; i++) { + tx_data_bd = (struct eth_tx_bd *) + qed_chain_produce(&txq->tx_pbl); + if (tx_data_bd->nbytes) + dma_unmap_page(txq->dev, + BD_UNMAP_ADDR(tx_data_bd), + BD_UNMAP_LEN(tx_data_bd), DMA_TO_DEVICE); + } + + /* Return again prod to its position before this skb was handled */ + qed_chain_set_prod(&txq->tx_pbl, + le16_to_cpu(txq->tx_db.data.bd_prod), first_bd); + + /* Free skb */ + dev_kfree_skb_any(skb); + txq->sw_tx_ring.skbs[idx].skb = NULL; + txq->sw_tx_ring.skbs[idx].flags = 0; +} + +static u32 qede_xmit_type(struct sk_buff *skb, int *ipv6_ext) +{ + u32 rc = XMIT_L4_CSUM; + __be16 l3_proto; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + return XMIT_PLAIN; + + l3_proto = vlan_get_protocol(skb); + if (l3_proto == htons(ETH_P_IPV6) && + (ipv6_hdr(skb)->nexthdr == NEXTHDR_IPV6)) + *ipv6_ext = 1; + + if (skb->encapsulation) { + rc |= XMIT_ENC; + if (skb_is_gso(skb)) { + unsigned short gso_type = skb_shinfo(skb)->gso_type; + + if ((gso_type & SKB_GSO_UDP_TUNNEL_CSUM) || + (gso_type & SKB_GSO_GRE_CSUM)) + rc |= XMIT_ENC_GSO_L4_CSUM; + + rc |= XMIT_LSO; + return rc; + } + } + + if (skb_is_gso(skb)) + rc |= XMIT_LSO; + + return rc; +} + +static void qede_set_params_for_ipv6_ext(struct sk_buff *skb, + struct eth_tx_2nd_bd *second_bd, + struct eth_tx_3rd_bd *third_bd) +{ + u8 l4_proto; + u16 bd2_bits1 = 0, bd2_bits2 = 0; + + bd2_bits1 |= (1 << ETH_TX_DATA_2ND_BD_IPV6_EXT_SHIFT); + + bd2_bits2 |= ((((u8 *)skb_transport_header(skb) - skb->data) >> 1) & + ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_MASK) + << ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_SHIFT; + + bd2_bits1 |= (ETH_L4_PSEUDO_CSUM_CORRECT_LENGTH << + ETH_TX_DATA_2ND_BD_L4_PSEUDO_CSUM_MODE_SHIFT); + + if (vlan_get_protocol(skb) == htons(ETH_P_IPV6)) + l4_proto = ipv6_hdr(skb)->nexthdr; + else + l4_proto = ip_hdr(skb)->protocol; + + if (l4_proto == IPPROTO_UDP) + bd2_bits1 |= 1 << ETH_TX_DATA_2ND_BD_L4_UDP_SHIFT; + + if (third_bd) + third_bd->data.bitfields |= + cpu_to_le16(((tcp_hdrlen(skb) / 4) & + ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_MASK) << + ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_SHIFT); + + second_bd->data.bitfields1 = cpu_to_le16(bd2_bits1); + second_bd->data.bitfields2 = cpu_to_le16(bd2_bits2); +} + +static int map_frag_to_bd(struct qede_tx_queue *txq, + skb_frag_t *frag, struct eth_tx_bd *bd) +{ + dma_addr_t mapping; + + /* Map skb non-linear frag data for DMA */ + mapping = skb_frag_dma_map(txq->dev, frag, 0, + skb_frag_size(frag), DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(txq->dev, mapping))) + return -ENOMEM; + + /* Setup the data pointer of the frag data */ + BD_SET_UNMAP_ADDR_LEN(bd, mapping, skb_frag_size(frag)); + + return 0; +} + +static u16 qede_get_skb_hlen(struct sk_buff *skb, bool is_encap_pkt) +{ + if (is_encap_pkt) + return (skb_inner_transport_header(skb) + + 
inner_tcp_hdrlen(skb) - skb->data); + else + return (skb_transport_header(skb) + + tcp_hdrlen(skb) - skb->data); +} + +/* +2 for 1st BD for headers and 2nd BD for headlen (if required) */ +#if ((MAX_SKB_FRAGS + 2) > ETH_TX_MAX_BDS_PER_NON_LSO_PACKET) +static bool qede_pkt_req_lin(struct sk_buff *skb, u8 xmit_type) +{ + int allowed_frags = ETH_TX_MAX_BDS_PER_NON_LSO_PACKET - 1; + + if (xmit_type & XMIT_LSO) { + int hlen; + + hlen = qede_get_skb_hlen(skb, xmit_type & XMIT_ENC); + + /* linear payload would require its own BD */ + if (skb_headlen(skb) > hlen) + allowed_frags--; + } + + return (skb_shinfo(skb)->nr_frags > allowed_frags); +} +#endif + +static inline void qede_update_tx_producer(struct qede_tx_queue *txq) +{ + /* wmb makes sure that the BDs data is updated before updating the + * producer, otherwise FW may read old data from the BDs. + */ + wmb(); + barrier(); + writel(txq->tx_db.raw, txq->doorbell_addr); + + /* Fence required to flush the write combined buffer, since another + * CPU may write to the same doorbell address and data may be lost + * due to relaxed order nature of write combined bar. + */ + wmb(); +} + +static int qede_xdp_xmit(struct qede_dev *edev, struct qede_fastpath *fp, + struct sw_rx_data *metadata, u16 padding, u16 length) +{ + struct qede_tx_queue *txq = fp->xdp_tx; + struct eth_tx_1st_bd *first_bd; + u16 idx = txq->sw_tx_prod; + u16 val; + + if (!qed_chain_get_elem_left(&txq->tx_pbl)) { + txq->stopped_cnt++; + return -ENOMEM; + } + + first_bd = (struct eth_tx_1st_bd *)qed_chain_produce(&txq->tx_pbl); + + memset(first_bd, 0, sizeof(*first_bd)); + first_bd->data.bd_flags.bitfields = + BIT(ETH_TX_1ST_BD_FLAGS_START_BD_SHIFT); + + val = (length & ETH_TX_DATA_1ST_BD_PKT_LEN_MASK) << + ETH_TX_DATA_1ST_BD_PKT_LEN_SHIFT; + + first_bd->data.bitfields |= cpu_to_le16(val); + first_bd->data.nbds = 1; + + /* We can safely ignore the offset, as it's 0 for XDP */ + BD_SET_UNMAP_ADDR_LEN(first_bd, metadata->mapping + padding, length); + + /* Synchronize the buffer back to device, as program [probably] + * has changed it. 
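+ * Ownership of the page and its DMA mapping passes to the XDP Tx
+ * ring below; they are only unmapped and freed in qede_xdp_tx_int()
+ * once the completion for this BD arrives.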
+ */ + dma_sync_single_for_device(&edev->pdev->dev, + metadata->mapping + padding, + length, PCI_DMA_TODEVICE); + + txq->sw_tx_ring.xdp[idx].page = metadata->data; + txq->sw_tx_ring.xdp[idx].mapping = metadata->mapping; + txq->sw_tx_prod = (txq->sw_tx_prod + 1) % txq->num_tx_buffers; + + /* Mark the fastpath for future XDP doorbell */ + fp->xdp_xmit = 1; + + return 0; +} + +int qede_txq_has_work(struct qede_tx_queue *txq) +{ + u16 hw_bd_cons; + + /* Tell compiler that consumer and producer can change */ + barrier(); + hw_bd_cons = le16_to_cpu(*txq->hw_cons_ptr); + if (qed_chain_get_cons_idx(&txq->tx_pbl) == hw_bd_cons + 1) + return 0; + + return hw_bd_cons != qed_chain_get_cons_idx(&txq->tx_pbl); +} + +static void qede_xdp_tx_int(struct qede_dev *edev, struct qede_tx_queue *txq) +{ + u16 hw_bd_cons, idx; + + hw_bd_cons = le16_to_cpu(*txq->hw_cons_ptr); + barrier(); + + while (hw_bd_cons != qed_chain_get_cons_idx(&txq->tx_pbl)) { + qed_chain_consume(&txq->tx_pbl); + idx = txq->sw_tx_cons; + + dma_unmap_page(&edev->pdev->dev, + txq->sw_tx_ring.xdp[idx].mapping, + PAGE_SIZE, DMA_BIDIRECTIONAL); + __free_page(txq->sw_tx_ring.xdp[idx].page); + + txq->sw_tx_cons = (txq->sw_tx_cons + 1) % txq->num_tx_buffers; + txq->xmit_pkts++; + } +} + +static int qede_tx_int(struct qede_dev *edev, struct qede_tx_queue *txq) +{ + struct netdev_queue *netdev_txq; + u16 hw_bd_cons; + unsigned int pkts_compl = 0, bytes_compl = 0; + int rc; + + netdev_txq = netdev_get_tx_queue(edev->ndev, txq->index); + + hw_bd_cons = le16_to_cpu(*txq->hw_cons_ptr); + barrier(); + + while (hw_bd_cons != qed_chain_get_cons_idx(&txq->tx_pbl)) { + int len = 0; + + rc = qede_free_tx_pkt(edev, txq, &len); + if (rc) { + DP_NOTICE(edev, "hw_bd_cons = %d, chain_cons=%d\n", + hw_bd_cons, + qed_chain_get_cons_idx(&txq->tx_pbl)); + break; + } + + bytes_compl += len; + pkts_compl++; + txq->sw_tx_cons = (txq->sw_tx_cons + 1) % txq->num_tx_buffers; + txq->xmit_pkts++; + } + + netdev_tx_completed_queue(netdev_txq, pkts_compl, bytes_compl); + + /* Need to make the tx_bd_cons update visible to start_xmit() + * before checking for netif_tx_queue_stopped(). Without the + * memory barrier, there is a small possibility that + * start_xmit() will miss it and cause the queue to be stopped + * forever. + * On the other hand we need an rmb() here to ensure the proper + * ordering of bit testing in the following + * netif_tx_queue_stopped(txq) call. + */ + smp_mb(); + + if (unlikely(netif_tx_queue_stopped(netdev_txq))) { + /* Taking tx_lock is needed to prevent reenabling the queue + * while it's empty. 
This could have happen if rx_action() gets + * suspended in qede_tx_int() after the condition before + * netif_tx_wake_queue(), while tx_action (qede_start_xmit()): + * + * stops the queue->sees fresh tx_bd_cons->releases the queue-> + * sends some packets consuming the whole queue again-> + * stops the queue + */ + + __netif_tx_lock(netdev_txq, smp_processor_id()); + + if ((netif_tx_queue_stopped(netdev_txq)) && + (edev->state == QEDE_STATE_OPEN) && + (qed_chain_get_elem_left(&txq->tx_pbl) + >= (MAX_SKB_FRAGS + 1))) { + netif_tx_wake_queue(netdev_txq); + DP_VERBOSE(edev, NETIF_MSG_TX_DONE, + "Wake queue was called\n"); + } + + __netif_tx_unlock(netdev_txq); + } + + return 0; +} + +bool qede_has_rx_work(struct qede_rx_queue *rxq) +{ + u16 hw_comp_cons, sw_comp_cons; + + /* Tell compiler that status block fields can change */ + barrier(); + + hw_comp_cons = le16_to_cpu(*rxq->hw_cons_ptr); + sw_comp_cons = qed_chain_get_cons_idx(&rxq->rx_comp_ring); + + return hw_comp_cons != sw_comp_cons; +} + +static inline void qede_rx_bd_ring_consume(struct qede_rx_queue *rxq) +{ + qed_chain_consume(&rxq->rx_bd_ring); + rxq->sw_rx_cons++; +} + +/* This function reuses the buffer(from an offset) from + * consumer index to producer index in the bd ring + */ +static inline void qede_reuse_page(struct qede_rx_queue *rxq, + struct sw_rx_data *curr_cons) +{ + struct eth_rx_bd *rx_bd_prod = qed_chain_produce(&rxq->rx_bd_ring); + struct sw_rx_data *curr_prod; + dma_addr_t new_mapping; + + curr_prod = &rxq->sw_rx_ring[rxq->sw_rx_prod & NUM_RX_BDS_MAX]; + *curr_prod = *curr_cons; + + new_mapping = curr_prod->mapping + curr_prod->page_offset; + + rx_bd_prod->addr.hi = cpu_to_le32(upper_32_bits(new_mapping)); + rx_bd_prod->addr.lo = cpu_to_le32(lower_32_bits(new_mapping) + + rxq->rx_headroom); + + rxq->sw_rx_prod++; + curr_cons->data = NULL; +} + +/* In case of allocation failures reuse buffers + * from consumer index to produce buffers for firmware + */ +void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq, u8 count) +{ + struct sw_rx_data *curr_cons; + + for (; count > 0; count--) { + curr_cons = &rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS_MAX]; + qede_reuse_page(rxq, curr_cons); + qede_rx_bd_ring_consume(rxq); + } +} + +static inline int qede_realloc_rx_buffer(struct qede_rx_queue *rxq, + struct sw_rx_data *curr_cons) +{ + /* Move to the next segment in the page */ + curr_cons->page_offset += rxq->rx_buf_seg_size; + + if (curr_cons->page_offset == PAGE_SIZE) { + if (unlikely(qede_alloc_rx_buffer(rxq, true))) { + /* Since we failed to allocate new buffer + * current buffer can be used again. + */ + curr_cons->page_offset -= rxq->rx_buf_seg_size; + + return -ENOMEM; + } + + dma_unmap_page(rxq->dev, curr_cons->mapping, + PAGE_SIZE, rxq->data_direction); + } else { + /* Increment refcount of the page as we don't want + * network stack to take the ownership of the page + * which can be recycled multiple times by the driver. 
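+ * Each page is carved into rx_buf_seg_size segments; only when
+ * page_offset reaches PAGE_SIZE (the branch above) is a replacement
+ * allocated and the page unmapped.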
+ */ + page_ref_inc(curr_cons->data); + qede_reuse_page(rxq, curr_cons); + } + + return 0; +} + +void qede_update_rx_prod(struct qede_dev *edev, struct qede_rx_queue *rxq) +{ + u16 bd_prod = qed_chain_get_prod_idx(&rxq->rx_bd_ring); + u16 cqe_prod = qed_chain_get_prod_idx(&rxq->rx_comp_ring); + struct eth_rx_prod_data rx_prods = {0}; + + /* Update producers */ + rx_prods.bd_prod = cpu_to_le16(bd_prod); + rx_prods.cqe_prod = cpu_to_le16(cqe_prod); + + /* Make sure that the BD and SGE data is updated before updating the + * producers since FW might read the BD/SGE right after the producer + * is updated. + */ + wmb(); + + internal_ram_wr(rxq->hw_rxq_prod_addr, sizeof(rx_prods), + (u32 *)&rx_prods); + + /* mmiowb is needed to synchronize doorbell writes from more than one + * processor. It guarantees that the write arrives to the device before + * the napi lock is released and another qede_poll is called (possibly + * on another CPU). Without this barrier, the next doorbell can bypass + * this doorbell. This is applicable to IA64/Altix systems. + */ + mmiowb(); +} + +static void qede_get_rxhash(struct sk_buff *skb, u8 bitfields, __le32 rss_hash) +{ + enum pkt_hash_types hash_type = PKT_HASH_TYPE_NONE; + enum rss_hash_type htype; + u32 hash = 0; + + htype = GET_FIELD(bitfields, ETH_FAST_PATH_RX_REG_CQE_RSS_HASH_TYPE); + if (htype) { + hash_type = ((htype == RSS_HASH_TYPE_IPV4) || + (htype == RSS_HASH_TYPE_IPV6)) ? + PKT_HASH_TYPE_L3 : PKT_HASH_TYPE_L4; + hash = le32_to_cpu(rss_hash); + } + skb_set_hash(skb, hash, hash_type); +} + +static void qede_set_skb_csum(struct sk_buff *skb, u8 csum_flag) +{ + skb_checksum_none_assert(skb); + + if (csum_flag & QEDE_CSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (csum_flag & QEDE_TUNN_CSUM_UNNECESSARY) { + skb->csum_level = 1; + skb->encapsulation = 1; + } +} + +static inline void qede_skb_receive(struct qede_dev *edev, + struct qede_fastpath *fp, + struct qede_rx_queue *rxq, + struct sk_buff *skb, u16 vlan_tag) +{ + if (vlan_tag) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + + napi_gro_receive(&fp->napi, skb); +} + +static void qede_set_gro_params(struct qede_dev *edev, + struct sk_buff *skb, + struct eth_fast_path_rx_tpa_start_cqe *cqe) +{ + u16 parsing_flags = le16_to_cpu(cqe->pars_flags.flags); + + if (((parsing_flags >> PARSING_AND_ERR_FLAGS_L3TYPE_SHIFT) & + PARSING_AND_ERR_FLAGS_L3TYPE_MASK) == 2) + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + else + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + + skb_shinfo(skb)->gso_size = __le16_to_cpu(cqe->len_on_first_bd) - + cqe->header_len; +} + +static int qede_fill_frag_skb(struct qede_dev *edev, + struct qede_rx_queue *rxq, + u8 tpa_agg_index, u16 len_on_bd) +{ + struct sw_rx_data *current_bd = &rxq->sw_rx_ring[rxq->sw_rx_cons & + NUM_RX_BDS_MAX]; + struct qede_agg_info *tpa_info = &rxq->tpa_info[tpa_agg_index]; + struct sk_buff *skb = tpa_info->skb; + + if (unlikely(tpa_info->state != QEDE_AGG_STATE_START)) + goto out; + + /* Add one frag and update the appropriate fields in the skb */ + skb_fill_page_desc(skb, tpa_info->frag_id++, + current_bd->data, current_bd->page_offset, + len_on_bd); + + if (unlikely(qede_realloc_rx_buffer(rxq, current_bd))) { + /* Incr page ref count to reuse on allocation failure + * so that it doesn't get freed while freeing SKB. 
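+ * The aggregation is then marked QEDE_AGG_STATE_ERROR at the 'out'
+ * label below and the BD is recycled back to the producer ring.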
+ */ + page_ref_inc(current_bd->data); + goto out; + } + + qed_chain_consume(&rxq->rx_bd_ring); + rxq->sw_rx_cons++; + + skb->data_len += len_on_bd; + skb->truesize += rxq->rx_buf_seg_size; + skb->len += len_on_bd; + + return 0; + +out: + tpa_info->state = QEDE_AGG_STATE_ERROR; + qede_recycle_rx_bd_ring(rxq, 1); + + return -ENOMEM; +} + +static bool qede_tunn_exist(u16 flag) +{ + return !!(flag & (PARSING_AND_ERR_FLAGS_TUNNELEXIST_MASK << + PARSING_AND_ERR_FLAGS_TUNNELEXIST_SHIFT)); +} + +static u8 qede_check_tunn_csum(u16 flag) +{ + u16 csum_flag = 0; + u8 tcsum = 0; + + if (flag & (PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMWASCALCULATED_MASK << + PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMWASCALCULATED_SHIFT)) + csum_flag |= PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMERROR_MASK << + PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMERROR_SHIFT; + + if (flag & (PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED_MASK << + PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED_SHIFT)) { + csum_flag |= PARSING_AND_ERR_FLAGS_L4CHKSMERROR_MASK << + PARSING_AND_ERR_FLAGS_L4CHKSMERROR_SHIFT; + tcsum = QEDE_TUNN_CSUM_UNNECESSARY; + } + + csum_flag |= PARSING_AND_ERR_FLAGS_TUNNELIPHDRERROR_MASK << + PARSING_AND_ERR_FLAGS_TUNNELIPHDRERROR_SHIFT | + PARSING_AND_ERR_FLAGS_IPHDRERROR_MASK << + PARSING_AND_ERR_FLAGS_IPHDRERROR_SHIFT; + + if (csum_flag & flag) + return QEDE_CSUM_ERROR; + + return QEDE_CSUM_UNNECESSARY | tcsum; +} + +static void qede_tpa_start(struct qede_dev *edev, + struct qede_rx_queue *rxq, + struct eth_fast_path_rx_tpa_start_cqe *cqe) +{ + struct qede_agg_info *tpa_info = &rxq->tpa_info[cqe->tpa_agg_index]; + struct eth_rx_bd *rx_bd_cons = qed_chain_consume(&rxq->rx_bd_ring); + struct eth_rx_bd *rx_bd_prod = qed_chain_produce(&rxq->rx_bd_ring); + struct sw_rx_data *replace_buf = &tpa_info->buffer; + dma_addr_t mapping = tpa_info->buffer_mapping; + struct sw_rx_data *sw_rx_data_cons; + struct sw_rx_data *sw_rx_data_prod; + + sw_rx_data_cons = &rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS_MAX]; + sw_rx_data_prod = &rxq->sw_rx_ring[rxq->sw_rx_prod & NUM_RX_BDS_MAX]; + + /* Use pre-allocated replacement buffer - we can't release the agg. + * start until its over and we don't want to risk allocation failing + * here, so re-allocate when aggregation will be over. + */ + sw_rx_data_prod->mapping = replace_buf->mapping; + + sw_rx_data_prod->data = replace_buf->data; + rx_bd_prod->addr.hi = cpu_to_le32(upper_32_bits(mapping)); + rx_bd_prod->addr.lo = cpu_to_le32(lower_32_bits(mapping)); + sw_rx_data_prod->page_offset = replace_buf->page_offset; + + rxq->sw_rx_prod++; + + /* move partial skb from cons to pool (don't unmap yet) + * save mapping, incase we drop the packet later on. 
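+ * The consumer buffer is stashed in tpa_info->buffer while the
+ * pre-allocated replacement takes its slot on the producer side
+ * above, so completing the aggregation never depends on a new
+ * allocation here.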
+ */ + tpa_info->buffer = *sw_rx_data_cons; + mapping = HILO_U64(le32_to_cpu(rx_bd_cons->addr.hi), + le32_to_cpu(rx_bd_cons->addr.lo)); + + tpa_info->buffer_mapping = mapping; + rxq->sw_rx_cons++; + + /* set tpa state to start only if we are able to allocate skb + * for this aggregation, otherwise mark as error and aggregation will + * be dropped + */ + tpa_info->skb = netdev_alloc_skb(edev->ndev, + le16_to_cpu(cqe->len_on_first_bd)); + if (unlikely(!tpa_info->skb)) { + DP_NOTICE(edev, "Failed to allocate SKB for gro\n"); + tpa_info->state = QEDE_AGG_STATE_ERROR; + goto cons_buf; + } + + /* Start filling in the aggregation info */ + skb_put(tpa_info->skb, le16_to_cpu(cqe->len_on_first_bd)); + tpa_info->frag_id = 0; + tpa_info->state = QEDE_AGG_STATE_START; + + /* Store some information from first CQE */ + tpa_info->start_cqe_placement_offset = cqe->placement_offset; + tpa_info->start_cqe_bd_len = le16_to_cpu(cqe->len_on_first_bd); + if ((le16_to_cpu(cqe->pars_flags.flags) >> + PARSING_AND_ERR_FLAGS_TAG8021QEXIST_SHIFT) & + PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK) + tpa_info->vlan_tag = le16_to_cpu(cqe->vlan_tag); + else + tpa_info->vlan_tag = 0; + + qede_get_rxhash(tpa_info->skb, cqe->bitfields, cqe->rss_hash); + + /* This is needed in order to enable forwarding support */ + qede_set_gro_params(edev, tpa_info->skb, cqe); + +cons_buf: /* We still need to handle bd_len_list to consume buffers */ + if (likely(cqe->ext_bd_len_list[0])) + qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index, + le16_to_cpu(cqe->ext_bd_len_list[0])); + + if (unlikely(cqe->ext_bd_len_list[1])) { + DP_ERR(edev, + "Unlikely - got a TPA aggregation with more than one ext_bd_len_list entry in the TPA start\n"); + tpa_info->state = QEDE_AGG_STATE_ERROR; + } +} + +#ifdef CONFIG_INET +static void qede_gro_ip_csum(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr *th; + + skb_set_transport_header(skb, sizeof(struct iphdr)); + th = tcp_hdr(skb); + + th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), + iph->saddr, iph->daddr, 0); + + tcp_gro_complete(skb); +} + +static void qede_gro_ipv6_csum(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr *th; + + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + th = tcp_hdr(skb); + + th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb), + &iph->saddr, &iph->daddr, 0); + tcp_gro_complete(skb); +} +#endif + +static void qede_gro_receive(struct qede_dev *edev, + struct qede_fastpath *fp, + struct sk_buff *skb, + u16 vlan_tag) +{ + /* FW can send a single MTU sized packet from gro flow + * due to aggregation timeout/last segment etc. which + * is not expected to be a gro packet. If a skb has zero + * frags then simply push it in the stack as non gso skb. 
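+ * Clearing gso_type/gso_size below makes the stack treat such an
+ * skb as an ordinary, non-GSO packet.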
+ */ + if (unlikely(!skb->data_len)) { + skb_shinfo(skb)->gso_type = 0; + skb_shinfo(skb)->gso_size = 0; + goto send_skb; + } + +#ifdef CONFIG_INET + if (skb_shinfo(skb)->gso_size) { + skb_reset_network_header(skb); + + switch (skb->protocol) { + case htons(ETH_P_IP): + qede_gro_ip_csum(skb); + break; + case htons(ETH_P_IPV6): + qede_gro_ipv6_csum(skb); + break; + default: + DP_ERR(edev, + "Error: FW GRO supports only IPv4/IPv6, not 0x%04x\n", + ntohs(skb->protocol)); + } + } +#endif + +send_skb: + skb_record_rx_queue(skb, fp->rxq->rxq_id); + qede_skb_receive(edev, fp, fp->rxq, skb, vlan_tag); +} + +static inline void qede_tpa_cont(struct qede_dev *edev, + struct qede_rx_queue *rxq, + struct eth_fast_path_rx_tpa_cont_cqe *cqe) +{ + int i; + + for (i = 0; cqe->len_list[i]; i++) + qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index, + le16_to_cpu(cqe->len_list[i])); + + if (unlikely(i > 1)) + DP_ERR(edev, + "Strange - TPA cont with more than a single len_list entry\n"); +} + +static int qede_tpa_end(struct qede_dev *edev, + struct qede_fastpath *fp, + struct eth_fast_path_rx_tpa_end_cqe *cqe) +{ + struct qede_rx_queue *rxq = fp->rxq; + struct qede_agg_info *tpa_info; + struct sk_buff *skb; + int i; + + tpa_info = &rxq->tpa_info[cqe->tpa_agg_index]; + skb = tpa_info->skb; + + for (i = 0; cqe->len_list[i]; i++) + qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index, + le16_to_cpu(cqe->len_list[i])); + if (unlikely(i > 1)) + DP_ERR(edev, + "Strange - TPA emd with more than a single len_list entry\n"); + + if (unlikely(tpa_info->state != QEDE_AGG_STATE_START)) + goto err; + + /* Sanity */ + if (unlikely(cqe->num_of_bds != tpa_info->frag_id + 1)) + DP_ERR(edev, + "Strange - TPA had %02x BDs, but SKB has only %d frags\n", + cqe->num_of_bds, tpa_info->frag_id); + if (unlikely(skb->len != le16_to_cpu(cqe->total_packet_len))) + DP_ERR(edev, + "Strange - total packet len [cqe] is %4x but SKB has len %04x\n", + le16_to_cpu(cqe->total_packet_len), skb->len); + + memcpy(skb->data, + page_address(tpa_info->buffer.data) + + tpa_info->start_cqe_placement_offset + + tpa_info->buffer.page_offset, tpa_info->start_cqe_bd_len); + + /* Finalize the SKB */ + skb->protocol = eth_type_trans(skb, edev->ndev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* tcp_gro_complete() will copy NAPI_GRO_CB(skb)->count + * to skb_shinfo(skb)->gso_segs + */ + NAPI_GRO_CB(skb)->count = le16_to_cpu(cqe->num_of_coalesced_segs); + + qede_gro_receive(edev, fp, skb, tpa_info->vlan_tag); + + tpa_info->state = QEDE_AGG_STATE_NONE; + + return 1; +err: + tpa_info->state = QEDE_AGG_STATE_NONE; + dev_kfree_skb_any(tpa_info->skb); + tpa_info->skb = NULL; + return 0; +} + +static u8 qede_check_notunn_csum(u16 flag) +{ + u16 csum_flag = 0; + u8 csum = 0; + + if (flag & (PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED_MASK << + PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED_SHIFT)) { + csum_flag |= PARSING_AND_ERR_FLAGS_L4CHKSMERROR_MASK << + PARSING_AND_ERR_FLAGS_L4CHKSMERROR_SHIFT; + csum = QEDE_CSUM_UNNECESSARY; + } + + csum_flag |= PARSING_AND_ERR_FLAGS_IPHDRERROR_MASK << + PARSING_AND_ERR_FLAGS_IPHDRERROR_SHIFT; + + if (csum_flag & flag) + return QEDE_CSUM_ERROR; + + return csum; +} + +static u8 qede_check_csum(u16 flag) +{ + if (!qede_tunn_exist(flag)) + return qede_check_notunn_csum(flag); + else + return qede_check_tunn_csum(flag); +} + +static bool qede_pkt_is_ip_fragmented(struct eth_fast_path_rx_reg_cqe *cqe, + u16 flag) +{ + u8 tun_pars_flg = cqe->tunnel_pars_flags.flags; + + if ((tun_pars_flg & (ETH_TUNNEL_PARSING_FLAGS_IPV4_FRAGMENT_MASK << + 
ETH_TUNNEL_PARSING_FLAGS_IPV4_FRAGMENT_SHIFT)) || + (flag & (PARSING_AND_ERR_FLAGS_IPV4FRAG_MASK << + PARSING_AND_ERR_FLAGS_IPV4FRAG_SHIFT))) + return true; + + return false; +} + +/* Return true iff packet is to be passed to stack */ +static bool qede_rx_xdp(struct qede_dev *edev, + struct qede_fastpath *fp, + struct qede_rx_queue *rxq, + struct bpf_prog *prog, + struct sw_rx_data *bd, + struct eth_fast_path_rx_reg_cqe *cqe, + u16 *data_offset, u16 *len) +{ + struct xdp_buff xdp; + enum xdp_action act; + + xdp.data_hard_start = page_address(bd->data); + xdp.data = xdp.data_hard_start + *data_offset; + xdp_set_data_meta_invalid(&xdp); + xdp.data_end = xdp.data + *len; + xdp.rxq = &rxq->xdp_rxq; + + /* Queues always have a full reset currently, so for the time + * being until there's atomic program replace just mark read + * side for map helpers. + */ + rcu_read_lock(); + act = bpf_prog_run_xdp(prog, &xdp); + rcu_read_unlock(); + + /* Recalculate, as XDP might have changed the headers */ + *data_offset = xdp.data - xdp.data_hard_start; + *len = xdp.data_end - xdp.data; + + if (act == XDP_PASS) + return true; + + /* Count number of packets not to be passed to stack */ + rxq->xdp_no_pass++; + + switch (act) { + case XDP_TX: + /* We need the replacement buffer before transmit. */ + if (qede_alloc_rx_buffer(rxq, true)) { + qede_recycle_rx_bd_ring(rxq, 1); + trace_xdp_exception(edev->ndev, prog, act); + return false; + } + + /* Now if there's a transmission problem, we'd still have to + * throw current buffer, as replacement was already allocated. + */ + if (qede_xdp_xmit(edev, fp, bd, *data_offset, *len)) { + dma_unmap_page(rxq->dev, bd->mapping, + PAGE_SIZE, DMA_BIDIRECTIONAL); + __free_page(bd->data); + trace_xdp_exception(edev->ndev, prog, act); + } + + /* Regardless, we've consumed an Rx BD */ + qede_rx_bd_ring_consume(rxq); + return false; + + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(edev->ndev, prog, act); + case XDP_DROP: + qede_recycle_rx_bd_ring(rxq, cqe->bd_num); + } + + return false; +} + +static struct sk_buff *qede_rx_allocate_skb(struct qede_dev *edev, + struct qede_rx_queue *rxq, + struct sw_rx_data *bd, u16 len, + u16 pad) +{ + unsigned int offset = bd->page_offset + pad; + struct skb_frag_struct *frag; + struct page *page = bd->data; + unsigned int pull_len; + struct sk_buff *skb; + unsigned char *va; + + /* Allocate a new SKB with a sufficient large header len */ + skb = netdev_alloc_skb(edev->ndev, QEDE_RX_HDR_SIZE); + if (unlikely(!skb)) + return NULL; + + /* Copy data into SKB - if it's small, we can simply copy it and + * re-use the already allcoated & mapped memory. + */ + if (len + pad <= edev->rx_copybreak) { + skb_put_data(skb, page_address(page) + offset, len); + qede_reuse_page(rxq, bd); + goto out; + } + + frag = &skb_shinfo(skb)->frags[0]; + + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + page, offset, len, rxq->rx_buf_seg_size); + + va = skb_frag_address(frag); + pull_len = eth_get_headlen(va, QEDE_RX_HDR_SIZE); + + /* Align the pull_len to optimize memcpy */ + memcpy(skb->data, va, ALIGN(pull_len, sizeof(long))); + + /* Correct the skb & frag sizes offset after the pull */ + skb_frag_size_sub(frag, pull_len); + frag->page_offset += pull_len; + skb->data_len -= pull_len; + skb->tail += pull_len; + + if (unlikely(qede_realloc_rx_buffer(rxq, bd))) { + /* Incr page ref count to reuse on allocation failure so + * that it doesn't get freed while freeing SKB [as its + * already mapped there]. 
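+ * dev_kfree_skb_any() below drops the reference held by the frag,
+ * so the extra reference keeps the page alive for the Rx ring to
+ * reuse.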
+ */ + page_ref_inc(page); + dev_kfree_skb_any(skb); + return NULL; + } + +out: + /* We've consumed the first BD and prepared an SKB */ + qede_rx_bd_ring_consume(rxq); + return skb; +} + +static int qede_rx_build_jumbo(struct qede_dev *edev, + struct qede_rx_queue *rxq, + struct sk_buff *skb, + struct eth_fast_path_rx_reg_cqe *cqe, + u16 first_bd_len) +{ + u16 pkt_len = le16_to_cpu(cqe->pkt_len); + struct sw_rx_data *bd; + u16 bd_cons_idx; + u8 num_frags; + + pkt_len -= first_bd_len; + + /* We've already used one BD for the SKB. Now take care of the rest */ + for (num_frags = cqe->bd_num - 1; num_frags > 0; num_frags--) { + u16 cur_size = pkt_len > rxq->rx_buf_size ? rxq->rx_buf_size : + pkt_len; + + if (unlikely(!cur_size)) { + DP_ERR(edev, + "Still got %d BDs for mapping jumbo, but length became 0\n", + num_frags); + goto out; + } + + /* We need a replacement buffer for each BD */ + if (unlikely(qede_alloc_rx_buffer(rxq, true))) + goto out; + + /* Now that we've allocated the replacement buffer, + * we can safely consume the next BD and map it to the SKB. + */ + bd_cons_idx = rxq->sw_rx_cons & NUM_RX_BDS_MAX; + bd = &rxq->sw_rx_ring[bd_cons_idx]; + qede_rx_bd_ring_consume(rxq); + + dma_unmap_page(rxq->dev, bd->mapping, + PAGE_SIZE, DMA_FROM_DEVICE); + + skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags++, + bd->data, 0, cur_size); + + skb->truesize += PAGE_SIZE; + skb->data_len += cur_size; + skb->len += cur_size; + pkt_len -= cur_size; + } + + if (unlikely(pkt_len)) + DP_ERR(edev, + "Mapped all BDs of jumbo, but still have %d bytes\n", + pkt_len); + +out: + return num_frags; +} + +static int qede_rx_process_tpa_cqe(struct qede_dev *edev, + struct qede_fastpath *fp, + struct qede_rx_queue *rxq, + union eth_rx_cqe *cqe, + enum eth_rx_cqe_type type) +{ + switch (type) { + case ETH_RX_CQE_TYPE_TPA_START: + qede_tpa_start(edev, rxq, &cqe->fast_path_tpa_start); + return 0; + case ETH_RX_CQE_TYPE_TPA_CONT: + qede_tpa_cont(edev, rxq, &cqe->fast_path_tpa_cont); + return 0; + case ETH_RX_CQE_TYPE_TPA_END: + return qede_tpa_end(edev, fp, &cqe->fast_path_tpa_end); + default: + return 0; + } +} + +static int qede_rx_process_cqe(struct qede_dev *edev, + struct qede_fastpath *fp, + struct qede_rx_queue *rxq) +{ + struct bpf_prog *xdp_prog = READ_ONCE(rxq->xdp_prog); + struct eth_fast_path_rx_reg_cqe *fp_cqe; + u16 len, pad, bd_cons_idx, parse_flag; + enum eth_rx_cqe_type cqe_type; + union eth_rx_cqe *cqe; + struct sw_rx_data *bd; + struct sk_buff *skb; + __le16 flags; + u8 csum_flag; + + /* Get the CQE from the completion ring */ + cqe = (union eth_rx_cqe *)qed_chain_consume(&rxq->rx_comp_ring); + cqe_type = cqe->fast_path_regular.type; + + /* Process an unlikely slowpath event */ + if (unlikely(cqe_type == ETH_RX_CQE_TYPE_SLOW_PATH)) { + struct eth_slow_path_rx_cqe *sp_cqe; + + sp_cqe = (struct eth_slow_path_rx_cqe *)cqe; + edev->ops->eth_cqe_completion(edev->cdev, fp->id, sp_cqe); + return 0; + } + + /* Handle TPA cqes */ + if (cqe_type != ETH_RX_CQE_TYPE_REGULAR) + return qede_rx_process_tpa_cqe(edev, fp, rxq, cqe, cqe_type); + + /* Get the data from the SW ring; Consume it only after it's evident + * we wouldn't recycle it. 
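+ * Error and XDP drop paths recycle the BD via
+ * qede_recycle_rx_bd_ring() instead of consuming it.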
+ */ + bd_cons_idx = rxq->sw_rx_cons & NUM_RX_BDS_MAX; + bd = &rxq->sw_rx_ring[bd_cons_idx]; + + fp_cqe = &cqe->fast_path_regular; + len = le16_to_cpu(fp_cqe->len_on_first_bd); + pad = fp_cqe->placement_offset + rxq->rx_headroom; + + /* Run eBPF program if one is attached */ + if (xdp_prog) + if (!qede_rx_xdp(edev, fp, rxq, xdp_prog, bd, fp_cqe, + &pad, &len)) + return 0; + + /* If this is an error packet then drop it */ + flags = cqe->fast_path_regular.pars_flags.flags; + parse_flag = le16_to_cpu(flags); + + csum_flag = qede_check_csum(parse_flag); + if (unlikely(csum_flag == QEDE_CSUM_ERROR)) { + if (qede_pkt_is_ip_fragmented(fp_cqe, parse_flag)) + rxq->rx_ip_frags++; + else + rxq->rx_hw_errors++; + } + + /* Basic validation passed; Need to prepare an SKB. This would also + * guarantee to finally consume the first BD upon success. + */ + skb = qede_rx_allocate_skb(edev, rxq, bd, len, pad); + if (!skb) { + rxq->rx_alloc_errors++; + qede_recycle_rx_bd_ring(rxq, fp_cqe->bd_num); + return 0; + } + + /* In case of Jumbo packet, several PAGE_SIZEd buffers will be pointed + * by a single cqe. + */ + if (fp_cqe->bd_num > 1) { + u16 unmapped_frags = qede_rx_build_jumbo(edev, rxq, skb, + fp_cqe, len); + + if (unlikely(unmapped_frags > 0)) { + qede_recycle_rx_bd_ring(rxq, unmapped_frags); + dev_kfree_skb_any(skb); + return 0; + } + } + + /* The SKB contains all the data. Now prepare meta-magic */ + skb->protocol = eth_type_trans(skb, edev->ndev); + qede_get_rxhash(skb, fp_cqe->bitfields, fp_cqe->rss_hash); + qede_set_skb_csum(skb, csum_flag); + skb_record_rx_queue(skb, rxq->rxq_id); + qede_ptp_record_rx_ts(edev, cqe, skb); + + /* SKB is prepared - pass it to stack */ + qede_skb_receive(edev, fp, rxq, skb, le16_to_cpu(fp_cqe->vlan_tag)); + + return 1; +} + +static int qede_rx_int(struct qede_fastpath *fp, int budget) +{ + struct qede_rx_queue *rxq = fp->rxq; + struct qede_dev *edev = fp->edev; + int work_done = 0, rcv_pkts = 0; + u16 hw_comp_cons, sw_comp_cons; + + hw_comp_cons = le16_to_cpu(*rxq->hw_cons_ptr); + sw_comp_cons = qed_chain_get_cons_idx(&rxq->rx_comp_ring); + + /* Memory barrier to prevent the CPU from doing speculative reads of CQE + * / BD in the while-loop before reading hw_comp_cons. If the CQE is + * read before it is written by FW, then FW writes CQE and SB, and then + * the CPU reads the hw_comp_cons, it will use an old CQE. + */ + rmb(); + + /* Loop to complete all indicated BDs */ + while ((sw_comp_cons != hw_comp_cons) && (work_done < budget)) { + rcv_pkts += qede_rx_process_cqe(edev, fp, rxq); + qed_chain_recycle_consumed(&rxq->rx_comp_ring); + sw_comp_cons = qed_chain_get_cons_idx(&rxq->rx_comp_ring); + work_done++; + } + + rxq->rcv_pkts += rcv_pkts; + + /* Allocate replacement buffers */ + while (rxq->num_rx_buffers - rxq->filled_buffers) + if (qede_alloc_rx_buffer(rxq, false)) + break; + + /* Update producers */ + qede_update_rx_prod(edev, rxq); + + return work_done; +} + +static bool qede_poll_is_more_work(struct qede_fastpath *fp) +{ + qed_sb_update_sb_idx(fp->sb_info); + + /* *_has_*_work() reads the status block, thus we need to ensure that + * status block indices have been actually read (qed_sb_update_sb_idx) + * prior to this check (*_has_*_work) so that we won't write the + * "newer" value of the status block to HW (if there was a DMA right + * after qede_has_rx_work and if there is no rmb, the memory reading + * (qed_sb_update_sb_idx) may be postponed to right before *_ack_sb). 
+ * In this case there will never be another interrupt until there is + * another update of the status block, while there is still unhandled + * work. + */ + rmb(); + + if (likely(fp->type & QEDE_FASTPATH_RX)) + if (qede_has_rx_work(fp->rxq)) + return true; + + if (fp->type & QEDE_FASTPATH_XDP) + if (qede_txq_has_work(fp->xdp_tx)) + return true; + + if (likely(fp->type & QEDE_FASTPATH_TX)) + if (qede_txq_has_work(fp->txq)) + return true; + + return false; +} + +/********************* + * NDO & API related * + *********************/ +int qede_poll(struct napi_struct *napi, int budget) +{ + struct qede_fastpath *fp = container_of(napi, struct qede_fastpath, + napi); + struct qede_dev *edev = fp->edev; + int rx_work_done = 0; + + if (likely(fp->type & QEDE_FASTPATH_TX) && qede_txq_has_work(fp->txq)) + qede_tx_int(edev, fp->txq); + + if ((fp->type & QEDE_FASTPATH_XDP) && qede_txq_has_work(fp->xdp_tx)) + qede_xdp_tx_int(edev, fp->xdp_tx); + + rx_work_done = (likely(fp->type & QEDE_FASTPATH_RX) && + qede_has_rx_work(fp->rxq)) ? + qede_rx_int(fp, budget) : 0; + if (rx_work_done < budget) { + if (!qede_poll_is_more_work(fp)) { + napi_complete_done(napi, rx_work_done); + + /* Update and reenable interrupts */ + qed_sb_ack(fp->sb_info, IGU_INT_ENABLE, 1); + } else { + rx_work_done = budget; + } + } + + if (fp->xdp_xmit) { + u16 xdp_prod = qed_chain_get_prod_idx(&fp->xdp_tx->tx_pbl); + + fp->xdp_xmit = 0; + fp->xdp_tx->tx_db.data.bd_prod = cpu_to_le16(xdp_prod); + qede_update_tx_producer(fp->xdp_tx); + } + + return rx_work_done; +} + +irqreturn_t qede_msix_fp_int(int irq, void *fp_cookie) +{ + struct qede_fastpath *fp = fp_cookie; + + qed_sb_ack(fp->sb_info, IGU_INT_DISABLE, 0 /*do not update*/); + + napi_schedule_irqoff(&fp->napi); + return IRQ_HANDLED; +} + +/* Main transmit function */ +netdev_tx_t qede_start_xmit(struct sk_buff *skb, struct net_device *ndev) +{ + struct qede_dev *edev = netdev_priv(ndev); + struct netdev_queue *netdev_txq; + struct qede_tx_queue *txq; + struct eth_tx_1st_bd *first_bd; + struct eth_tx_2nd_bd *second_bd = NULL; + struct eth_tx_3rd_bd *third_bd = NULL; + struct eth_tx_bd *tx_data_bd = NULL; + u16 txq_index, val = 0; + u8 nbd = 0; + dma_addr_t mapping; + int rc, frag_idx = 0, ipv6_ext = 0; + u8 xmit_type; + u16 idx; + u16 hlen; + bool data_split = false; + + /* Get tx-queue context and netdev index */ + txq_index = skb_get_queue_mapping(skb); + WARN_ON(txq_index >= QEDE_TSS_COUNT(edev)); + txq = edev->fp_array[edev->fp_num_rx + txq_index].txq; + netdev_txq = netdev_get_tx_queue(ndev, txq_index); + + WARN_ON(qed_chain_get_elem_left(&txq->tx_pbl) < (MAX_SKB_FRAGS + 1)); + + xmit_type = qede_xmit_type(skb, &ipv6_ext); + +#if ((MAX_SKB_FRAGS + 2) > ETH_TX_MAX_BDS_PER_NON_LSO_PACKET) + if (qede_pkt_req_lin(skb, xmit_type)) { + if (skb_linearize(skb)) { + DP_NOTICE(edev, + "SKB linearization failed - silently dropping this SKB\n"); + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + } +#endif + + /* Fill the entry in the SW ring and the BDs in the FW ring */ + idx = txq->sw_tx_prod; + txq->sw_tx_ring.skbs[idx].skb = skb; + first_bd = (struct eth_tx_1st_bd *) + qed_chain_produce(&txq->tx_pbl); + memset(first_bd, 0, sizeof(*first_bd)); + first_bd->data.bd_flags.bitfields = + 1 << ETH_TX_1ST_BD_FLAGS_START_BD_SHIFT; + + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + qede_ptp_tx_ts(edev, skb); + + /* Map skb linear data for DMA and set in the first BD */ + mapping = dma_map_single(txq->dev, skb->data, + skb_headlen(skb), DMA_TO_DEVICE); + if 
(unlikely(dma_mapping_error(txq->dev, mapping))) { + DP_NOTICE(edev, "SKB mapping failed\n"); + qede_free_failed_tx_pkt(txq, first_bd, 0, false); + qede_update_tx_producer(txq); + return NETDEV_TX_OK; + } + nbd++; + BD_SET_UNMAP_ADDR_LEN(first_bd, mapping, skb_headlen(skb)); + + /* In case there is IPv6 with extension headers or LSO we need 2nd and + * 3rd BDs. + */ + if (unlikely((xmit_type & XMIT_LSO) | ipv6_ext)) { + second_bd = (struct eth_tx_2nd_bd *) + qed_chain_produce(&txq->tx_pbl); + memset(second_bd, 0, sizeof(*second_bd)); + + nbd++; + third_bd = (struct eth_tx_3rd_bd *) + qed_chain_produce(&txq->tx_pbl); + memset(third_bd, 0, sizeof(*third_bd)); + + nbd++; + /* We need to fill in additional data in second_bd... */ + tx_data_bd = (struct eth_tx_bd *)second_bd; + } + + if (skb_vlan_tag_present(skb)) { + first_bd->data.vlan = cpu_to_le16(skb_vlan_tag_get(skb)); + first_bd->data.bd_flags.bitfields |= + 1 << ETH_TX_1ST_BD_FLAGS_VLAN_INSERTION_SHIFT; + } + + /* Fill the parsing flags & params according to the requested offload */ + if (xmit_type & XMIT_L4_CSUM) { + /* We don't re-calculate IP checksum as it is already done by + * the upper stack + */ + first_bd->data.bd_flags.bitfields |= + 1 << ETH_TX_1ST_BD_FLAGS_L4_CSUM_SHIFT; + + if (xmit_type & XMIT_ENC) { + first_bd->data.bd_flags.bitfields |= + 1 << ETH_TX_1ST_BD_FLAGS_IP_CSUM_SHIFT; + + val |= (1 << ETH_TX_DATA_1ST_BD_TUNN_FLAG_SHIFT); + } + + /* Legacy FW had flipped behavior in regard to this bit - + * I.e., needed to set to prevent FW from touching encapsulated + * packets when it didn't need to. + */ + if (unlikely(txq->is_legacy)) + val ^= (1 << ETH_TX_DATA_1ST_BD_TUNN_FLAG_SHIFT); + + /* If the packet is IPv6 with extension header, indicate that + * to FW and pass few params, since the device cracker doesn't + * support parsing IPv6 with extension header/s. + */ + if (unlikely(ipv6_ext)) + qede_set_params_for_ipv6_ext(skb, second_bd, third_bd); + } + + if (xmit_type & XMIT_LSO) { + first_bd->data.bd_flags.bitfields |= + (1 << ETH_TX_1ST_BD_FLAGS_LSO_SHIFT); + third_bd->data.lso_mss = + cpu_to_le16(skb_shinfo(skb)->gso_size); + + if (unlikely(xmit_type & XMIT_ENC)) { + first_bd->data.bd_flags.bitfields |= + 1 << ETH_TX_1ST_BD_FLAGS_TUNN_IP_CSUM_SHIFT; + + if (xmit_type & XMIT_ENC_GSO_L4_CSUM) { + u8 tmp = ETH_TX_1ST_BD_FLAGS_TUNN_L4_CSUM_SHIFT; + + first_bd->data.bd_flags.bitfields |= 1 << tmp; + } + hlen = qede_get_skb_hlen(skb, true); + } else { + first_bd->data.bd_flags.bitfields |= + 1 << ETH_TX_1ST_BD_FLAGS_IP_CSUM_SHIFT; + hlen = qede_get_skb_hlen(skb, false); + } + + /* @@@TBD - if will not be removed need to check */ + third_bd->data.bitfields |= + cpu_to_le16(1 << ETH_TX_DATA_3RD_BD_HDR_NBD_SHIFT); + + /* Make life easier for FW guys who can't deal with header and + * data on same BD. If we need to split, use the second bd... 
+ */ + if (unlikely(skb_headlen(skb) > hlen)) { + DP_VERBOSE(edev, NETIF_MSG_TX_QUEUED, + "TSO split header size is %d (%x:%x)\n", + first_bd->nbytes, first_bd->addr.hi, + first_bd->addr.lo); + + mapping = HILO_U64(le32_to_cpu(first_bd->addr.hi), + le32_to_cpu(first_bd->addr.lo)) + + hlen; + + BD_SET_UNMAP_ADDR_LEN(tx_data_bd, mapping, + le16_to_cpu(first_bd->nbytes) - + hlen); + + /* this marks the BD as one that has no + * individual mapping + */ + txq->sw_tx_ring.skbs[idx].flags |= QEDE_TSO_SPLIT_BD; + + first_bd->nbytes = cpu_to_le16(hlen); + + tx_data_bd = (struct eth_tx_bd *)third_bd; + data_split = true; + } + } else { + val |= ((skb->len & ETH_TX_DATA_1ST_BD_PKT_LEN_MASK) << + ETH_TX_DATA_1ST_BD_PKT_LEN_SHIFT); + } + + first_bd->data.bitfields = cpu_to_le16(val); + + /* Handle fragmented skb */ + /* special handle for frags inside 2nd and 3rd bds.. */ + while (tx_data_bd && frag_idx < skb_shinfo(skb)->nr_frags) { + rc = map_frag_to_bd(txq, + &skb_shinfo(skb)->frags[frag_idx], + tx_data_bd); + if (rc) { + qede_free_failed_tx_pkt(txq, first_bd, nbd, data_split); + qede_update_tx_producer(txq); + return NETDEV_TX_OK; + } + + if (tx_data_bd == (struct eth_tx_bd *)second_bd) + tx_data_bd = (struct eth_tx_bd *)third_bd; + else + tx_data_bd = NULL; + + frag_idx++; + } + + /* map last frags into 4th, 5th .... */ + for (; frag_idx < skb_shinfo(skb)->nr_frags; frag_idx++, nbd++) { + tx_data_bd = (struct eth_tx_bd *) + qed_chain_produce(&txq->tx_pbl); + + memset(tx_data_bd, 0, sizeof(*tx_data_bd)); + + rc = map_frag_to_bd(txq, + &skb_shinfo(skb)->frags[frag_idx], + tx_data_bd); + if (rc) { + qede_free_failed_tx_pkt(txq, first_bd, nbd, data_split); + qede_update_tx_producer(txq); + return NETDEV_TX_OK; + } + } + + /* update the first BD with the actual num BDs */ + first_bd->data.nbds = nbd; + + netdev_tx_sent_queue(netdev_txq, skb->len); + + skb_tx_timestamp(skb); + + /* Advance packet producer only before sending the packet since mapping + * of pages may fail. 
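+ * If any mapping above had failed, qede_free_failed_tx_pkt() would
+ * have rewound the chain producer, so sw_tx_prod is only advanced
+ * once all BDs are successfully set up.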
+ */ + txq->sw_tx_prod = (txq->sw_tx_prod + 1) % txq->num_tx_buffers; + + /* 'next page' entries are counted in the producer value */ + txq->tx_db.data.bd_prod = + cpu_to_le16(qed_chain_get_prod_idx(&txq->tx_pbl)); + + if (!skb->xmit_more || netif_xmit_stopped(netdev_txq)) + qede_update_tx_producer(txq); + + if (unlikely(qed_chain_get_elem_left(&txq->tx_pbl) + < (MAX_SKB_FRAGS + 1))) { + if (skb->xmit_more) + qede_update_tx_producer(txq); + + netif_tx_stop_queue(netdev_txq); + txq->stopped_cnt++; + DP_VERBOSE(edev, NETIF_MSG_TX_QUEUED, + "Stop queue was called\n"); + /* paired memory barrier is in qede_tx_int(), we have to keep + * ordering of set_bit() in netif_tx_stop_queue() and read of + * fp->bd_tx_cons + */ + smp_mb(); + + if ((qed_chain_get_elem_left(&txq->tx_pbl) >= + (MAX_SKB_FRAGS + 1)) && + (edev->state == QEDE_STATE_OPEN)) { + netif_tx_wake_queue(netdev_txq); + DP_VERBOSE(edev, NETIF_MSG_TX_QUEUED, + "Wake queue was called\n"); + } + } + + return NETDEV_TX_OK; +} + +/* 8B udp header + 8B base tunnel header + 32B option length */ +#define QEDE_MAX_TUN_HDR_LEN 48 + +netdev_features_t qede_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + if (skb->encapsulation) { + u8 l4_proto = 0; + + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IP): + l4_proto = ip_hdr(skb)->protocol; + break; + case htons(ETH_P_IPV6): + l4_proto = ipv6_hdr(skb)->nexthdr; + break; + default: + return features; + } + + /* Disable offloads for geneve tunnels, as HW can't parse + * the geneve header which has option length greater than 32b + * and disable offloads for the ports which are not offloaded. + */ + if (l4_proto == IPPROTO_UDP) { + struct qede_dev *edev = netdev_priv(dev); + u16 hdrlen, vxln_port, gnv_port; + + hdrlen = QEDE_MAX_TUN_HDR_LEN; + vxln_port = edev->vxlan_dst_port; + gnv_port = edev->geneve_dst_port; + + if ((skb_inner_mac_header(skb) - + skb_transport_header(skb)) > hdrlen || + (ntohs(udp_hdr(skb)->dest) != vxln_port && + ntohs(udp_hdr(skb)->dest) != gnv_port)) + return features & ~(NETIF_F_CSUM_MASK | + NETIF_F_GSO_MASK); + } + } + + return features; +} diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c new file mode 100644 index 0000000..f6655e2 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -0,0 +1,2179 @@ +/* QLogic qede NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qede.h" +#include "qede_ptp.h" + +static char version[] = + "QLogic FastLinQ 4xxxx Ethernet Driver qede " DRV_MODULE_VERSION "\n"; + +MODULE_DESCRIPTION("QLogic FastLinQ 4xxxx Ethernet Driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_MODULE_VERSION); + +static uint debug; +module_param(debug, uint, 0); +MODULE_PARM_DESC(debug, " Default debug msglevel"); + +static const struct qed_eth_ops *qed_ops; + +#define CHIP_NUM_57980S_40 0x1634 +#define CHIP_NUM_57980S_10 0x1666 +#define CHIP_NUM_57980S_MF 0x1636 +#define CHIP_NUM_57980S_100 0x1644 +#define CHIP_NUM_57980S_50 0x1654 +#define CHIP_NUM_57980S_25 0x1656 +#define CHIP_NUM_57980S_IOV 0x1664 +#define CHIP_NUM_AH 0x8070 +#define CHIP_NUM_AH_IOV 0x8090 + +#ifndef PCI_DEVICE_ID_NX2_57980E +#define PCI_DEVICE_ID_57980S_40 CHIP_NUM_57980S_40 +#define PCI_DEVICE_ID_57980S_10 CHIP_NUM_57980S_10 +#define PCI_DEVICE_ID_57980S_MF CHIP_NUM_57980S_MF +#define PCI_DEVICE_ID_57980S_100 CHIP_NUM_57980S_100 +#define PCI_DEVICE_ID_57980S_50 CHIP_NUM_57980S_50 +#define PCI_DEVICE_ID_57980S_25 CHIP_NUM_57980S_25 +#define PCI_DEVICE_ID_57980S_IOV CHIP_NUM_57980S_IOV +#define PCI_DEVICE_ID_AH CHIP_NUM_AH +#define PCI_DEVICE_ID_AH_IOV CHIP_NUM_AH_IOV + +#endif + +enum qede_pci_private { + QEDE_PRIVATE_PF, + QEDE_PRIVATE_VF +}; + +static const struct pci_device_id qede_pci_tbl[] = { + {PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_57980S_40), QEDE_PRIVATE_PF}, + {PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_57980S_10), QEDE_PRIVATE_PF}, + {PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_57980S_MF), QEDE_PRIVATE_PF}, + {PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_57980S_100), QEDE_PRIVATE_PF}, + {PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_57980S_50), QEDE_PRIVATE_PF}, + {PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_57980S_25), QEDE_PRIVATE_PF}, +#ifdef CONFIG_QED_SRIOV + {PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_57980S_IOV), QEDE_PRIVATE_VF}, +#endif + {PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_AH), QEDE_PRIVATE_PF}, +#ifdef CONFIG_QED_SRIOV + {PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_AH_IOV), QEDE_PRIVATE_VF}, +#endif + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, qede_pci_tbl); + +static int qede_probe(struct pci_dev *pdev, const struct pci_device_id *id); + +#define TX_TIMEOUT (5 * HZ) + +/* Utilize last protocol index for XDP */ +#define XDP_PI 11 + +static void qede_remove(struct pci_dev *pdev); +static void qede_shutdown(struct pci_dev *pdev); +static void qede_link_update(void *dev, struct qed_link_output *link); + +/* The qede lock is used to protect driver state change and driver flows that + * are not reentrant. 
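+ * The flow-steering helpers earlier in this patch, for example,
+ * take it via __qede_lock()/__qede_unlock() around filter changes.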
+ */ +void __qede_lock(struct qede_dev *edev) +{ + mutex_lock(&edev->qede_lock); +} + +void __qede_unlock(struct qede_dev *edev) +{ + mutex_unlock(&edev->qede_lock); +} + +#ifdef CONFIG_QED_SRIOV +static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) +{ + struct qede_dev *edev = netdev_priv(ndev); + + if (vlan > 4095) { + DP_NOTICE(edev, "Illegal vlan value %d\n", vlan); + return -EINVAL; + } + + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + + DP_VERBOSE(edev, QED_MSG_IOV, "Setting Vlan 0x%04x to VF [%d]\n", + vlan, vf); + + return edev->ops->iov->set_vlan(edev->cdev, vlan, vf); +} + +static int qede_set_vf_mac(struct net_device *ndev, int vfidx, u8 *mac) +{ + struct qede_dev *edev = netdev_priv(ndev); + + DP_VERBOSE(edev, QED_MSG_IOV, + "Setting MAC %02x:%02x:%02x:%02x:%02x:%02x to VF [%d]\n", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5], vfidx); + + if (!is_valid_ether_addr(mac)) { + DP_VERBOSE(edev, QED_MSG_IOV, "MAC address isn't valid\n"); + return -EINVAL; + } + + return edev->ops->iov->set_mac(edev->cdev, mac, vfidx); +} + +static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param) +{ + struct qede_dev *edev = netdev_priv(pci_get_drvdata(pdev)); + struct qed_dev_info *qed_info = &edev->dev_info.common; + struct qed_update_vport_params *vport_params; + int rc; + + vport_params = vzalloc(sizeof(*vport_params)); + if (!vport_params) + return -ENOMEM; + DP_VERBOSE(edev, QED_MSG_IOV, "Requested %d VFs\n", num_vfs_param); + + rc = edev->ops->iov->configure(edev->cdev, num_vfs_param); + + /* Enable/Disable Tx switching for PF */ + if ((rc == num_vfs_param) && netif_running(edev->ndev) && + qed_info->mf_mode != QED_MF_NPAR && qed_info->tx_switching) { + vport_params->vport_id = 0; + vport_params->update_tx_switching_flg = 1; + vport_params->tx_switching_flg = num_vfs_param ? 
1 : 0; + edev->ops->vport_update(edev->cdev, vport_params); + } + + vfree(vport_params); + return rc; +} +#endif + +static struct pci_driver qede_pci_driver = { + .name = "qede", + .id_table = qede_pci_tbl, + .probe = qede_probe, + .remove = qede_remove, + .shutdown = qede_shutdown, +#ifdef CONFIG_QED_SRIOV + .sriov_configure = qede_sriov_configure, +#endif +}; + +static struct qed_eth_cb_ops qede_ll_ops = { + { +#ifdef CONFIG_RFS_ACCEL + .arfs_filter_op = qede_arfs_filter_op, +#endif + .link_update = qede_link_update, + }, + .force_mac = qede_force_mac, + .ports_update = qede_udp_ports_update, +}; + +static int qede_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct ethtool_drvinfo drvinfo; + struct qede_dev *edev; + + if (event != NETDEV_CHANGENAME && event != NETDEV_CHANGEADDR) + goto done; + + /* Check whether this is a qede device */ + if (!ndev || !ndev->ethtool_ops || !ndev->ethtool_ops->get_drvinfo) + goto done; + + memset(&drvinfo, 0, sizeof(drvinfo)); + ndev->ethtool_ops->get_drvinfo(ndev, &drvinfo); + if (strcmp(drvinfo.driver, "qede")) + goto done; + edev = netdev_priv(ndev); + + switch (event) { + case NETDEV_CHANGENAME: + /* Notify qed of the name change */ + if (!edev->ops || !edev->ops->common) + goto done; + edev->ops->common->set_name(edev->cdev, edev->ndev->name); + break; + case NETDEV_CHANGEADDR: + edev = netdev_priv(ndev); + qede_rdma_event_changeaddr(edev); + break; + } + +done: + return NOTIFY_DONE; +} + +static struct notifier_block qede_netdev_notifier = { + .notifier_call = qede_netdev_event, +}; + +static +int __init qede_init(void) +{ + int ret; + + pr_info("qede_init: %s\n", version); + + qed_ops = qed_get_eth_ops(); + if (!qed_ops) { + pr_notice("Failed to get qed ethtool operations\n"); + return -EINVAL; + } + + /* Must register notifier before pci ops, since we might miss + * interface rename after pci probe and netdev registration. 
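+ * On failure the already-registered pieces are unwound before
+ * returning, and qede_cleanup() releases all three on module unload.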
+ */ + ret = register_netdevice_notifier(&qede_netdev_notifier); + if (ret) { + pr_notice("Failed to register netdevice_notifier\n"); + qed_put_eth_ops(); + return -EINVAL; + } + + ret = pci_register_driver(&qede_pci_driver); + if (ret) { + pr_notice("Failed to register driver\n"); + unregister_netdevice_notifier(&qede_netdev_notifier); + qed_put_eth_ops(); + return -EINVAL; + } + + return 0; +} + +static void __exit qede_cleanup(void) +{ + if (debug & QED_LOG_INFO_MASK) + pr_info("qede_cleanup called\n"); + + unregister_netdevice_notifier(&qede_netdev_notifier); + pci_unregister_driver(&qede_pci_driver); + qed_put_eth_ops(); +} + +module_init(qede_init); +module_exit(qede_cleanup); + +static int qede_open(struct net_device *ndev); +static int qede_close(struct net_device *ndev); + +void qede_fill_by_demand_stats(struct qede_dev *edev) +{ + struct qede_stats_common *p_common = &edev->stats.common; + struct qed_eth_stats stats; + + edev->ops->get_vport_stats(edev->cdev, &stats); + + p_common->no_buff_discards = stats.common.no_buff_discards; + p_common->packet_too_big_discard = stats.common.packet_too_big_discard; + p_common->ttl0_discard = stats.common.ttl0_discard; + p_common->rx_ucast_bytes = stats.common.rx_ucast_bytes; + p_common->rx_mcast_bytes = stats.common.rx_mcast_bytes; + p_common->rx_bcast_bytes = stats.common.rx_bcast_bytes; + p_common->rx_ucast_pkts = stats.common.rx_ucast_pkts; + p_common->rx_mcast_pkts = stats.common.rx_mcast_pkts; + p_common->rx_bcast_pkts = stats.common.rx_bcast_pkts; + p_common->mftag_filter_discards = stats.common.mftag_filter_discards; + p_common->mac_filter_discards = stats.common.mac_filter_discards; + + p_common->tx_ucast_bytes = stats.common.tx_ucast_bytes; + p_common->tx_mcast_bytes = stats.common.tx_mcast_bytes; + p_common->tx_bcast_bytes = stats.common.tx_bcast_bytes; + p_common->tx_ucast_pkts = stats.common.tx_ucast_pkts; + p_common->tx_mcast_pkts = stats.common.tx_mcast_pkts; + p_common->tx_bcast_pkts = stats.common.tx_bcast_pkts; + p_common->tx_err_drop_pkts = stats.common.tx_err_drop_pkts; + p_common->coalesced_pkts = stats.common.tpa_coalesced_pkts; + p_common->coalesced_events = stats.common.tpa_coalesced_events; + p_common->coalesced_aborts_num = stats.common.tpa_aborts_num; + p_common->non_coalesced_pkts = stats.common.tpa_not_coalesced_pkts; + p_common->coalesced_bytes = stats.common.tpa_coalesced_bytes; + + p_common->rx_64_byte_packets = stats.common.rx_64_byte_packets; + p_common->rx_65_to_127_byte_packets = + stats.common.rx_65_to_127_byte_packets; + p_common->rx_128_to_255_byte_packets = + stats.common.rx_128_to_255_byte_packets; + p_common->rx_256_to_511_byte_packets = + stats.common.rx_256_to_511_byte_packets; + p_common->rx_512_to_1023_byte_packets = + stats.common.rx_512_to_1023_byte_packets; + p_common->rx_1024_to_1518_byte_packets = + stats.common.rx_1024_to_1518_byte_packets; + p_common->rx_crc_errors = stats.common.rx_crc_errors; + p_common->rx_mac_crtl_frames = stats.common.rx_mac_crtl_frames; + p_common->rx_pause_frames = stats.common.rx_pause_frames; + p_common->rx_pfc_frames = stats.common.rx_pfc_frames; + p_common->rx_align_errors = stats.common.rx_align_errors; + p_common->rx_carrier_errors = stats.common.rx_carrier_errors; + p_common->rx_oversize_packets = stats.common.rx_oversize_packets; + p_common->rx_jabbers = stats.common.rx_jabbers; + p_common->rx_undersize_packets = stats.common.rx_undersize_packets; + p_common->rx_fragments = stats.common.rx_fragments; + p_common->tx_64_byte_packets = 
stats.common.tx_64_byte_packets; + p_common->tx_65_to_127_byte_packets = + stats.common.tx_65_to_127_byte_packets; + p_common->tx_128_to_255_byte_packets = + stats.common.tx_128_to_255_byte_packets; + p_common->tx_256_to_511_byte_packets = + stats.common.tx_256_to_511_byte_packets; + p_common->tx_512_to_1023_byte_packets = + stats.common.tx_512_to_1023_byte_packets; + p_common->tx_1024_to_1518_byte_packets = + stats.common.tx_1024_to_1518_byte_packets; + p_common->tx_pause_frames = stats.common.tx_pause_frames; + p_common->tx_pfc_frames = stats.common.tx_pfc_frames; + p_common->brb_truncates = stats.common.brb_truncates; + p_common->brb_discards = stats.common.brb_discards; + p_common->tx_mac_ctrl_frames = stats.common.tx_mac_ctrl_frames; + + if (QEDE_IS_BB(edev)) { + struct qede_stats_bb *p_bb = &edev->stats.bb; + + p_bb->rx_1519_to_1522_byte_packets = + stats.bb.rx_1519_to_1522_byte_packets; + p_bb->rx_1519_to_2047_byte_packets = + stats.bb.rx_1519_to_2047_byte_packets; + p_bb->rx_2048_to_4095_byte_packets = + stats.bb.rx_2048_to_4095_byte_packets; + p_bb->rx_4096_to_9216_byte_packets = + stats.bb.rx_4096_to_9216_byte_packets; + p_bb->rx_9217_to_16383_byte_packets = + stats.bb.rx_9217_to_16383_byte_packets; + p_bb->tx_1519_to_2047_byte_packets = + stats.bb.tx_1519_to_2047_byte_packets; + p_bb->tx_2048_to_4095_byte_packets = + stats.bb.tx_2048_to_4095_byte_packets; + p_bb->tx_4096_to_9216_byte_packets = + stats.bb.tx_4096_to_9216_byte_packets; + p_bb->tx_9217_to_16383_byte_packets = + stats.bb.tx_9217_to_16383_byte_packets; + p_bb->tx_lpi_entry_count = stats.bb.tx_lpi_entry_count; + p_bb->tx_total_collisions = stats.bb.tx_total_collisions; + } else { + struct qede_stats_ah *p_ah = &edev->stats.ah; + + p_ah->rx_1519_to_max_byte_packets = + stats.ah.rx_1519_to_max_byte_packets; + p_ah->tx_1519_to_max_byte_packets = + stats.ah.tx_1519_to_max_byte_packets; + } +} + +static void qede_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct qede_dev *edev = netdev_priv(dev); + struct qede_stats_common *p_common; + + qede_fill_by_demand_stats(edev); + p_common = &edev->stats.common; + + stats->rx_packets = p_common->rx_ucast_pkts + p_common->rx_mcast_pkts + + p_common->rx_bcast_pkts; + stats->tx_packets = p_common->tx_ucast_pkts + p_common->tx_mcast_pkts + + p_common->tx_bcast_pkts; + + stats->rx_bytes = p_common->rx_ucast_bytes + p_common->rx_mcast_bytes + + p_common->rx_bcast_bytes; + stats->tx_bytes = p_common->tx_ucast_bytes + p_common->tx_mcast_bytes + + p_common->tx_bcast_bytes; + + stats->tx_errors = p_common->tx_err_drop_pkts; + stats->multicast = p_common->rx_mcast_pkts + p_common->rx_bcast_pkts; + + stats->rx_fifo_errors = p_common->no_buff_discards; + + if (QEDE_IS_BB(edev)) + stats->collisions = edev->stats.bb.tx_total_collisions; + stats->rx_crc_errors = p_common->rx_crc_errors; + stats->rx_frame_errors = p_common->rx_align_errors; +} + +#ifdef CONFIG_QED_SRIOV +static int qede_get_vf_config(struct net_device *dev, int vfidx, + struct ifla_vf_info *ivi) +{ + struct qede_dev *edev = netdev_priv(dev); + + if (!edev->ops) + return -EINVAL; + + return edev->ops->iov->get_config(edev->cdev, vfidx, ivi); +} + +static int qede_set_vf_rate(struct net_device *dev, int vfidx, + int min_tx_rate, int max_tx_rate) +{ + struct qede_dev *edev = netdev_priv(dev); + + return edev->ops->iov->set_rate(edev->cdev, vfidx, min_tx_rate, + max_tx_rate); +} + +static int qede_set_vf_spoofchk(struct net_device *dev, int vfidx, bool val) +{ + struct qede_dev *edev = netdev_priv(dev); 
+ + if (!edev->ops) + return -EINVAL; + + return edev->ops->iov->set_spoof(edev->cdev, vfidx, val); +} + +static int qede_set_vf_link_state(struct net_device *dev, int vfidx, + int link_state) +{ + struct qede_dev *edev = netdev_priv(dev); + + if (!edev->ops) + return -EINVAL; + + return edev->ops->iov->set_link_state(edev->cdev, vfidx, link_state); +} + +static int qede_set_vf_trust(struct net_device *dev, int vfidx, bool setting) +{ + struct qede_dev *edev = netdev_priv(dev); + + if (!edev->ops) + return -EINVAL; + + return edev->ops->iov->set_trust(edev->cdev, vfidx, setting); +} +#endif + +static int qede_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct qede_dev *edev = netdev_priv(dev); + + if (!netif_running(dev)) + return -EAGAIN; + + switch (cmd) { + case SIOCSHWTSTAMP: + return qede_ptp_hw_ts(edev, ifr); + default: + DP_VERBOSE(edev, QED_MSG_DEBUG, + "default IOCTL cmd 0x%x\n", cmd); + return -EOPNOTSUPP; + } + + return 0; +} + +static const struct net_device_ops qede_netdev_ops = { + .ndo_open = qede_open, + .ndo_stop = qede_close, + .ndo_start_xmit = qede_start_xmit, + .ndo_set_rx_mode = qede_set_rx_mode, + .ndo_set_mac_address = qede_set_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = qede_change_mtu, + .ndo_do_ioctl = qede_ioctl, +#ifdef CONFIG_QED_SRIOV + .ndo_set_vf_mac = qede_set_vf_mac, + .ndo_set_vf_vlan = qede_set_vf_vlan, + .ndo_set_vf_trust = qede_set_vf_trust, +#endif + .ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = qede_vlan_rx_kill_vid, + .ndo_fix_features = qede_fix_features, + .ndo_set_features = qede_set_features, + .ndo_get_stats64 = qede_get_stats64, +#ifdef CONFIG_QED_SRIOV + .ndo_set_vf_link_state = qede_set_vf_link_state, + .ndo_set_vf_spoofchk = qede_set_vf_spoofchk, + .ndo_get_vf_config = qede_get_vf_config, + .ndo_set_vf_rate = qede_set_vf_rate, +#endif + .ndo_udp_tunnel_add = qede_udp_tunnel_add, + .ndo_udp_tunnel_del = qede_udp_tunnel_del, + .ndo_features_check = qede_features_check, + .ndo_bpf = qede_xdp, +#ifdef CONFIG_RFS_ACCEL + .ndo_rx_flow_steer = qede_rx_flow_steer, +#endif +}; + +static const struct net_device_ops qede_netdev_vf_ops = { + .ndo_open = qede_open, + .ndo_stop = qede_close, + .ndo_start_xmit = qede_start_xmit, + .ndo_set_rx_mode = qede_set_rx_mode, + .ndo_set_mac_address = qede_set_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = qede_change_mtu, + .ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = qede_vlan_rx_kill_vid, + .ndo_fix_features = qede_fix_features, + .ndo_set_features = qede_set_features, + .ndo_get_stats64 = qede_get_stats64, + .ndo_udp_tunnel_add = qede_udp_tunnel_add, + .ndo_udp_tunnel_del = qede_udp_tunnel_del, + .ndo_features_check = qede_features_check, +}; + +static const struct net_device_ops qede_netdev_vf_xdp_ops = { + .ndo_open = qede_open, + .ndo_stop = qede_close, + .ndo_start_xmit = qede_start_xmit, + .ndo_set_rx_mode = qede_set_rx_mode, + .ndo_set_mac_address = qede_set_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = qede_change_mtu, + .ndo_vlan_rx_add_vid = qede_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = qede_vlan_rx_kill_vid, + .ndo_fix_features = qede_fix_features, + .ndo_set_features = qede_set_features, + .ndo_get_stats64 = qede_get_stats64, + .ndo_udp_tunnel_add = qede_udp_tunnel_add, + .ndo_udp_tunnel_del = qede_udp_tunnel_del, + .ndo_features_check = qede_features_check, + .ndo_bpf = qede_xdp, +}; + +/* 
------------------------------------------------------------------------- + * START OF PROBE / REMOVE + * ------------------------------------------------------------------------- + */ + +static struct qede_dev *qede_alloc_etherdev(struct qed_dev *cdev, + struct pci_dev *pdev, + struct qed_dev_eth_info *info, + u32 dp_module, u8 dp_level) +{ + struct net_device *ndev; + struct qede_dev *edev; + + ndev = alloc_etherdev_mqs(sizeof(*edev), + info->num_queues, info->num_queues); + if (!ndev) { + pr_err("etherdev allocation failed\n"); + return NULL; + } + + edev = netdev_priv(ndev); + edev->ndev = ndev; + edev->cdev = cdev; + edev->pdev = pdev; + edev->dp_module = dp_module; + edev->dp_level = dp_level; + edev->ops = qed_ops; + edev->q_num_rx_buffers = NUM_RX_BDS_DEF; + edev->q_num_tx_buffers = NUM_TX_BDS_DEF; + + DP_INFO(edev, "Allocated netdev with %d tx queues and %d rx queues\n", + info->num_queues, info->num_queues); + + SET_NETDEV_DEV(ndev, &pdev->dev); + + memset(&edev->stats, 0, sizeof(edev->stats)); + memcpy(&edev->dev_info, info, sizeof(*info)); + + /* As ethtool doesn't have the ability to show WoL behavior as + * 'default', if device supports it declare it's enabled. + */ + if (edev->dev_info.common.wol_support) + edev->wol_enabled = true; + + INIT_LIST_HEAD(&edev->vlan_list); + + return edev; +} + +static void qede_init_ndev(struct qede_dev *edev) +{ + struct net_device *ndev = edev->ndev; + struct pci_dev *pdev = edev->pdev; + bool udp_tunnel_enable = false; + netdev_features_t hw_features; + + pci_set_drvdata(pdev, ndev); + + ndev->mem_start = edev->dev_info.common.pci_mem_start; + ndev->base_addr = ndev->mem_start; + ndev->mem_end = edev->dev_info.common.pci_mem_end; + ndev->irq = edev->dev_info.common.pci_irq; + + ndev->watchdog_timeo = TX_TIMEOUT; + + if (IS_VF(edev)) { + if (edev->dev_info.xdp_supported) + ndev->netdev_ops = &qede_netdev_vf_xdp_ops; + else + ndev->netdev_ops = &qede_netdev_vf_ops; + } else { + ndev->netdev_ops = &qede_netdev_ops; + } + + qede_set_ethtool_ops(ndev); + + ndev->priv_flags |= IFF_UNICAST_FLT; + + /* user-changeble features */ + hw_features = NETIF_F_GRO | NETIF_F_GRO_HW | NETIF_F_SG | + NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | + NETIF_F_TSO | NETIF_F_TSO6; + + if (!IS_VF(edev) && edev->dev_info.common.num_hwfns == 1) + hw_features |= NETIF_F_NTUPLE; + + if (edev->dev_info.common.vxlan_enable || + edev->dev_info.common.geneve_enable) + udp_tunnel_enable = true; + + if (udp_tunnel_enable || edev->dev_info.common.gre_enable) { + hw_features |= NETIF_F_TSO_ECN; + ndev->hw_enc_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | + NETIF_F_SG | NETIF_F_TSO | + NETIF_F_TSO_ECN | NETIF_F_TSO6 | + NETIF_F_RXCSUM; + } + + if (udp_tunnel_enable) { + hw_features |= (NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM); + ndev->hw_enc_features |= (NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM); + } + + if (edev->dev_info.common.gre_enable) { + hw_features |= (NETIF_F_GSO_GRE | NETIF_F_GSO_GRE_CSUM); + ndev->hw_enc_features |= (NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM); + } + + ndev->vlan_features = hw_features | NETIF_F_RXHASH | NETIF_F_RXCSUM | + NETIF_F_HIGHDMA; + ndev->features = hw_features | NETIF_F_RXHASH | NETIF_F_RXCSUM | + NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HIGHDMA | + NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_TX; + + ndev->hw_features = hw_features; + + /* MTU range: 46 - 9600 */ + ndev->min_mtu = ETH_ZLEN - ETH_HLEN; + ndev->max_mtu = QEDE_MAX_JUMBO_PACKET_SIZE; + + /* Set network device HW mac */ + 
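One detail worth calling out from qede_init_ndev() above is how the advertised offloads are built up: a base set of flags is extended only when the device reports VXLAN/GENEVE or GRE support. The fragment below is a hedged, self-contained sketch of that flag-composition pattern using made-up bit values; the real NETIF_F_* constants and netdev fields come from the kernel and are not reproduced here.

/* Stand-alone sketch of conditional feature-flag composition, loosely
 * modelled on qede_init_ndev(). All bit values are invented for the demo.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_F_SG		(1u << 0)
#define DEMO_F_CSUM		(1u << 1)
#define DEMO_F_TSO		(1u << 2)
#define DEMO_F_TSO_ECN		(1u << 3)
#define DEMO_F_UDP_TUNNEL	(1u << 4)
#define DEMO_F_GRE		(1u << 5)

static uint32_t demo_build_features(bool vxlan, bool geneve, bool gre)
{
	uint32_t features = DEMO_F_SG | DEMO_F_CSUM | DEMO_F_TSO;
	bool udp_tunnel = vxlan || geneve;

	/* Tunnel-aware TSO only if some tunnel type is available at all */
	if (udp_tunnel || gre)
		features |= DEMO_F_TSO_ECN;
	if (udp_tunnel)
		features |= DEMO_F_UDP_TUNNEL;
	if (gre)
		features |= DEMO_F_GRE;

	return features;
}

int main(void)
{
	printf("features=0x%x\n", (unsigned)demo_build_features(true, false, false));
	return 0;
}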
ether_addr_copy(edev->ndev->dev_addr, edev->dev_info.common.hw_mac); + + ndev->mtu = edev->dev_info.common.mtu; +} + +/* This function converts from 32b param to two params of level and module + * Input 32b decoding: + * b31 - enable all NOTICE prints. NOTICE prints are for deviation from the + * 'happy' flow, e.g. memory allocation failed. + * b30 - enable all INFO prints. INFO prints are for major steps in the flow + * and provide important parameters. + * b29-b0 - per-module bitmap, where each bit enables VERBOSE prints of that + * module. VERBOSE prints are for tracking the specific flow in low level. + * + * Notice that the level should be that of the lowest required logs. + */ +void qede_config_debug(uint debug, u32 *p_dp_module, u8 *p_dp_level) +{ + *p_dp_level = QED_LEVEL_NOTICE; + *p_dp_module = 0; + + if (debug & QED_LOG_VERBOSE_MASK) { + *p_dp_level = QED_LEVEL_VERBOSE; + *p_dp_module = (debug & 0x3FFFFFFF); + } else if (debug & QED_LOG_INFO_MASK) { + *p_dp_level = QED_LEVEL_INFO; + } else if (debug & QED_LOG_NOTICE_MASK) { + *p_dp_level = QED_LEVEL_NOTICE; + } +} + +static void qede_free_fp_array(struct qede_dev *edev) +{ + if (edev->fp_array) { + struct qede_fastpath *fp; + int i; + + for_each_queue(i) { + fp = &edev->fp_array[i]; + + kfree(fp->sb_info); + /* Handle mem alloc failure case where qede_init_fp + * didn't register xdp_rxq_info yet. + * Implicit only (fp->type & QEDE_FASTPATH_RX) + */ + if (fp->rxq && xdp_rxq_info_is_reg(&fp->rxq->xdp_rxq)) + xdp_rxq_info_unreg(&fp->rxq->xdp_rxq); + kfree(fp->rxq); + kfree(fp->xdp_tx); + kfree(fp->txq); + } + kfree(edev->fp_array); + } + + edev->num_queues = 0; + edev->fp_num_tx = 0; + edev->fp_num_rx = 0; +} + +static int qede_alloc_fp_array(struct qede_dev *edev) +{ + u8 fp_combined, fp_rx = edev->fp_num_rx; + struct qede_fastpath *fp; + int i; + + edev->fp_array = kcalloc(QEDE_QUEUE_CNT(edev), + sizeof(*edev->fp_array), GFP_KERNEL); + if (!edev->fp_array) { + DP_NOTICE(edev, "fp array allocation failed\n"); + goto err; + } + + fp_combined = QEDE_QUEUE_CNT(edev) - fp_rx - edev->fp_num_tx; + + /* Allocate the FP elements for Rx queues followed by combined and then + * the Tx. This ordering should be maintained so that the respective + * queues (Rx or Tx) will be together in the fastpath array and the + * associated ids will be sequential. 
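The comment above qede_config_debug() spells out how the 32-bit 'debug' module parameter is decoded (b31 enables NOTICE, b30 enables INFO, the low 30 bits are a per-module VERBOSE bitmap). Below is a minimal user-space re-implementation of that decode so the precedence is easy to see; the mask values are assumed for the demo and stand in for the QED_LOG_*_MASK constants.

/* Minimal sketch of the debug-parameter decode described above.
 * Mask layout assumed from the comment: b31 = NOTICE, b30 = INFO,
 * b29..b0 = per-module VERBOSE bitmap.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_LOG_NOTICE_MASK	0x80000000u
#define DEMO_LOG_INFO_MASK	0x40000000u
#define DEMO_LOG_VERBOSE_MASK	0x3FFFFFFFu

enum demo_level { DEMO_NOTICE, DEMO_INFO, DEMO_VERBOSE };

static void demo_config_debug(uint32_t debug, uint32_t *module,
			      enum demo_level *level)
{
	*level = DEMO_NOTICE;	/* default: only NOTICE prints */
	*module = 0;

	/* VERBOSE wins over INFO, which wins over NOTICE, mirroring the
	 * if/else-if chain in qede_config_debug().
	 */
	if (debug & DEMO_LOG_VERBOSE_MASK) {
		*level = DEMO_VERBOSE;
		*module = debug & DEMO_LOG_VERBOSE_MASK;
	} else if (debug & DEMO_LOG_INFO_MASK) {
		*level = DEMO_INFO;
	}
}

int main(void)
{
	uint32_t module;
	enum demo_level level;

	/* b30 set plus module bit 3: verbose for module 3 takes precedence */
	demo_config_debug(0x40000008u, &module, &level);
	printf("level=%d module=0x%x\n", (int)level, (unsigned)module);
	return 0;
}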
+ */ + for_each_queue(i) { + fp = &edev->fp_array[i]; + + fp->sb_info = kzalloc(sizeof(*fp->sb_info), GFP_KERNEL); + if (!fp->sb_info) { + DP_NOTICE(edev, "sb info struct allocation failed\n"); + goto err; + } + + if (fp_rx) { + fp->type = QEDE_FASTPATH_RX; + fp_rx--; + } else if (fp_combined) { + fp->type = QEDE_FASTPATH_COMBINED; + fp_combined--; + } else { + fp->type = QEDE_FASTPATH_TX; + } + + if (fp->type & QEDE_FASTPATH_TX) { + fp->txq = kzalloc(sizeof(*fp->txq), GFP_KERNEL); + if (!fp->txq) + goto err; + } + + if (fp->type & QEDE_FASTPATH_RX) { + fp->rxq = kzalloc(sizeof(*fp->rxq), GFP_KERNEL); + if (!fp->rxq) + goto err; + + if (edev->xdp_prog) { + fp->xdp_tx = kzalloc(sizeof(*fp->xdp_tx), + GFP_KERNEL); + if (!fp->xdp_tx) + goto err; + fp->type |= QEDE_FASTPATH_XDP; + } + } + } + + return 0; +err: + qede_free_fp_array(edev); + return -ENOMEM; +} + +static void qede_sp_task(struct work_struct *work) +{ + struct qede_dev *edev = container_of(work, struct qede_dev, + sp_task.work); + + __qede_lock(edev); + + if (test_and_clear_bit(QEDE_SP_RX_MODE, &edev->sp_flags)) + if (edev->state == QEDE_STATE_OPEN) + qede_config_rx_mode(edev->ndev); + +#ifdef CONFIG_RFS_ACCEL + if (test_and_clear_bit(QEDE_SP_ARFS_CONFIG, &edev->sp_flags)) { + if (edev->state == QEDE_STATE_OPEN) + qede_process_arfs_filters(edev, false); + } +#endif + __qede_unlock(edev); +} + +static void qede_update_pf_params(struct qed_dev *cdev) +{ + struct qed_pf_params pf_params; + + /* 64 rx + 64 tx + 64 XDP */ + memset(&pf_params, 0, sizeof(struct qed_pf_params)); + pf_params.eth_pf_params.num_cons = (MAX_SB_PER_PF_MIMD - 1) * 3; + + /* Same for VFs - make sure they'll have sufficient connections + * to support XDP Tx queues. + */ + pf_params.eth_pf_params.num_vf_cons = 48; + + pf_params.eth_pf_params.num_arfs_filters = QEDE_RFS_MAX_FLTR; + qed_ops->common->update_pf_params(cdev, &pf_params); +} + +#define QEDE_FW_VER_STR_SIZE 80 + +static void qede_log_probe(struct qede_dev *edev) +{ + struct qed_dev_info *p_dev_info = &edev->dev_info.common; + u8 buf[QEDE_FW_VER_STR_SIZE]; + size_t left_size; + + snprintf(buf, QEDE_FW_VER_STR_SIZE, + "Storm FW %d.%d.%d.%d, Management FW %d.%d.%d.%d", + p_dev_info->fw_major, p_dev_info->fw_minor, p_dev_info->fw_rev, + p_dev_info->fw_eng, + (p_dev_info->mfw_rev & QED_MFW_VERSION_3_MASK) >> + QED_MFW_VERSION_3_OFFSET, + (p_dev_info->mfw_rev & QED_MFW_VERSION_2_MASK) >> + QED_MFW_VERSION_2_OFFSET, + (p_dev_info->mfw_rev & QED_MFW_VERSION_1_MASK) >> + QED_MFW_VERSION_1_OFFSET, + (p_dev_info->mfw_rev & QED_MFW_VERSION_0_MASK) >> + QED_MFW_VERSION_0_OFFSET); + + left_size = QEDE_FW_VER_STR_SIZE - strlen(buf); + if (p_dev_info->mbi_version && left_size) + snprintf(buf + strlen(buf), left_size, + " [MBI %d.%d.%d]", + (p_dev_info->mbi_version & QED_MBI_VERSION_2_MASK) >> + QED_MBI_VERSION_2_OFFSET, + (p_dev_info->mbi_version & QED_MBI_VERSION_1_MASK) >> + QED_MBI_VERSION_1_OFFSET, + (p_dev_info->mbi_version & QED_MBI_VERSION_0_MASK) >> + QED_MBI_VERSION_0_OFFSET); + + pr_info("qede %02x:%02x.%02x: %s [%s]\n", edev->pdev->bus->number, + PCI_SLOT(edev->pdev->devfn), PCI_FUNC(edev->pdev->devfn), + buf, edev->ndev->name); +} + +enum qede_probe_mode { + QEDE_PROBE_NORMAL, +}; + +static int __qede_probe(struct pci_dev *pdev, u32 dp_module, u8 dp_level, + bool is_vf, enum qede_probe_mode mode) +{ + struct qed_probe_params probe_params; + struct qed_slowpath_params sp_params; + struct qed_dev_eth_info dev_info; + struct qede_dev *edev; + struct qed_dev *cdev; + int rc; + + if (unlikely(dp_level & 
QED_LEVEL_INFO)) + pr_notice("Starting qede probe\n"); + + memset(&probe_params, 0, sizeof(probe_params)); + probe_params.protocol = QED_PROTOCOL_ETH; + probe_params.dp_module = dp_module; + probe_params.dp_level = dp_level; + probe_params.is_vf = is_vf; + cdev = qed_ops->common->probe(pdev, &probe_params); + if (!cdev) { + rc = -ENODEV; + goto err0; + } + + qede_update_pf_params(cdev); + + /* Start the Slowpath-process */ + memset(&sp_params, 0, sizeof(sp_params)); + sp_params.int_mode = QED_INT_MODE_MSIX; + sp_params.drv_major = QEDE_MAJOR_VERSION; + sp_params.drv_minor = QEDE_MINOR_VERSION; + sp_params.drv_rev = QEDE_REVISION_VERSION; + sp_params.drv_eng = QEDE_ENGINEERING_VERSION; + strlcpy(sp_params.name, "qede LAN", QED_DRV_VER_STR_SIZE); + rc = qed_ops->common->slowpath_start(cdev, &sp_params); + if (rc) { + pr_notice("Cannot start slowpath\n"); + goto err1; + } + + /* Learn information crucial for qede to progress */ + rc = qed_ops->fill_dev_info(cdev, &dev_info); + if (rc) + goto err2; + + edev = qede_alloc_etherdev(cdev, pdev, &dev_info, dp_module, + dp_level); + if (!edev) { + rc = -ENOMEM; + goto err2; + } + + if (is_vf) + edev->flags |= QEDE_FLAG_IS_VF; + + qede_init_ndev(edev); + + rc = qede_rdma_dev_add(edev); + if (rc) + goto err3; + + /* Prepare the lock prior to the registration of the netdev, + * as once it's registered we might reach flows requiring it + * [it's even possible to reach a flow needing it directly + * from there, although it's unlikely]. + */ + INIT_DELAYED_WORK(&edev->sp_task, qede_sp_task); + mutex_init(&edev->qede_lock); + rc = register_netdev(edev->ndev); + if (rc) { + DP_NOTICE(edev, "Cannot register net-device\n"); + goto err4; + } + + edev->ops->common->set_name(cdev, edev->ndev->name); + + /* PTP not supported on VFs */ + if (!is_vf) + qede_ptp_enable(edev, true); + + edev->ops->register_ops(cdev, &qede_ll_ops, edev); + +#ifdef CONFIG_DCB + if (!IS_VF(edev)) + qede_set_dcbnl_ops(edev->ndev); +#endif + + edev->rx_copybreak = QEDE_RX_HDR_SIZE; + + qede_log_probe(edev); + return 0; + +err4: + qede_rdma_dev_remove(edev); +err3: + free_netdev(edev->ndev); +err2: + qed_ops->common->slowpath_stop(cdev); +err1: + qed_ops->common->remove(cdev); +err0: + return rc; +} + +static int qede_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + bool is_vf = false; + u32 dp_module = 0; + u8 dp_level = 0; + + switch ((enum qede_pci_private)id->driver_data) { + case QEDE_PRIVATE_VF: + if (debug & QED_LOG_VERBOSE_MASK) + dev_err(&pdev->dev, "Probing a VF\n"); + is_vf = true; + break; + default: + if (debug & QED_LOG_VERBOSE_MASK) + dev_err(&pdev->dev, "Probing a PF\n"); + } + + qede_config_debug(debug, &dp_module, &dp_level); + + return __qede_probe(pdev, dp_module, dp_level, is_vf, + QEDE_PROBE_NORMAL); +} + +enum qede_remove_mode { + QEDE_REMOVE_NORMAL, +}; + +static void __qede_remove(struct pci_dev *pdev, enum qede_remove_mode mode) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct qede_dev *edev = netdev_priv(ndev); + struct qed_dev *cdev = edev->cdev; + + DP_INFO(edev, "Starting qede_remove\n"); + + qede_rdma_dev_remove(edev); + unregister_netdev(ndev); + cancel_delayed_work_sync(&edev->sp_task); + + qede_ptp_disable(edev); + + edev->ops->common->set_power_state(cdev, PCI_D0); + + pci_set_drvdata(pdev, NULL); + + /* Use global ops since we've freed edev */ + qed_ops->common->slowpath_stop(cdev); + if (system_state == SYSTEM_POWER_OFF) + return; + qed_ops->common->remove(cdev); + + /* Since this can happen out-of-sync with other flows, + 
* don't release the netdevice until after slowpath stop + * has been called to guarantee various other contexts + * [e.g., QED register callbacks] won't break anything when + * accessing the netdevice. + */ + free_netdev(ndev); + + dev_info(&pdev->dev, "Ending qede_remove successfully\n"); +} + +static void qede_remove(struct pci_dev *pdev) +{ + __qede_remove(pdev, QEDE_REMOVE_NORMAL); +} + +static void qede_shutdown(struct pci_dev *pdev) +{ + __qede_remove(pdev, QEDE_REMOVE_NORMAL); +} + +/* ------------------------------------------------------------------------- + * START OF LOAD / UNLOAD + * ------------------------------------------------------------------------- + */ + +static int qede_set_num_queues(struct qede_dev *edev) +{ + int rc; + u16 rss_num; + + /* Setup queues according to possible resources*/ + if (edev->req_queues) + rss_num = edev->req_queues; + else + rss_num = netif_get_num_default_rss_queues() * + edev->dev_info.common.num_hwfns; + + rss_num = min_t(u16, QEDE_MAX_RSS_CNT(edev), rss_num); + + rc = edev->ops->common->set_fp_int(edev->cdev, rss_num); + if (rc > 0) { + /* Managed to request interrupts for our queues */ + edev->num_queues = rc; + DP_INFO(edev, "Managed %d [of %d] RSS queues\n", + QEDE_QUEUE_CNT(edev), rss_num); + rc = 0; + } + + edev->fp_num_tx = edev->req_num_tx; + edev->fp_num_rx = edev->req_num_rx; + + return rc; +} + +static void qede_free_mem_sb(struct qede_dev *edev, struct qed_sb_info *sb_info, + u16 sb_id) +{ + if (sb_info->sb_virt) { + edev->ops->common->sb_release(edev->cdev, sb_info, sb_id); + dma_free_coherent(&edev->pdev->dev, sizeof(*sb_info->sb_virt), + (void *)sb_info->sb_virt, sb_info->sb_phys); + memset(sb_info, 0, sizeof(*sb_info)); + } +} + +/* This function allocates fast-path status block memory */ +static int qede_alloc_mem_sb(struct qede_dev *edev, + struct qed_sb_info *sb_info, u16 sb_id) +{ + struct status_block_e4 *sb_virt; + dma_addr_t sb_phys; + int rc; + + sb_virt = dma_alloc_coherent(&edev->pdev->dev, + sizeof(*sb_virt), &sb_phys, GFP_KERNEL); + if (!sb_virt) { + DP_ERR(edev, "Status block allocation failed\n"); + return -ENOMEM; + } + + rc = edev->ops->common->sb_init(edev->cdev, sb_info, + sb_virt, sb_phys, sb_id, + QED_SB_TYPE_L2_QUEUE); + if (rc) { + DP_ERR(edev, "Status block initialization failed\n"); + dma_free_coherent(&edev->pdev->dev, sizeof(*sb_virt), + sb_virt, sb_phys); + return rc; + } + + return 0; +} + +static void qede_free_rx_buffers(struct qede_dev *edev, + struct qede_rx_queue *rxq) +{ + u16 i; + + for (i = rxq->sw_rx_cons; i != rxq->sw_rx_prod; i++) { + struct sw_rx_data *rx_buf; + struct page *data; + + rx_buf = &rxq->sw_rx_ring[i & NUM_RX_BDS_MAX]; + data = rx_buf->data; + + dma_unmap_page(&edev->pdev->dev, + rx_buf->mapping, PAGE_SIZE, rxq->data_direction); + + rx_buf->data = NULL; + __free_page(data); + } +} + +static void qede_free_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq) +{ + int i; + + if (edev->gro_disable) + return; + + for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) { + struct qede_agg_info *tpa_info = &rxq->tpa_info[i]; + struct sw_rx_data *replace_buf = &tpa_info->buffer; + + if (replace_buf->data) { + dma_unmap_page(&edev->pdev->dev, + replace_buf->mapping, + PAGE_SIZE, DMA_FROM_DEVICE); + __free_page(replace_buf->data); + } + } +} + +static void qede_free_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq) +{ + qede_free_sge_mem(edev, rxq); + + /* Free rx buffers */ + qede_free_rx_buffers(edev, rxq); + + /* Free the parallel SW ring */ + kfree(rxq->sw_rx_ring); + + /* Free 
the real RQ ring used by FW */ + edev->ops->common->chain_free(edev->cdev, &rxq->rx_bd_ring); + edev->ops->common->chain_free(edev->cdev, &rxq->rx_comp_ring); +} + +static int qede_alloc_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq) +{ + dma_addr_t mapping; + int i; + + if (edev->gro_disable) + return 0; + + for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) { + struct qede_agg_info *tpa_info = &rxq->tpa_info[i]; + struct sw_rx_data *replace_buf = &tpa_info->buffer; + + replace_buf->data = alloc_pages(GFP_ATOMIC, 0); + if (unlikely(!replace_buf->data)) { + DP_NOTICE(edev, + "Failed to allocate TPA skb pool [replacement buffer]\n"); + goto err; + } + + mapping = dma_map_page(&edev->pdev->dev, replace_buf->data, 0, + PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(&edev->pdev->dev, mapping))) { + DP_NOTICE(edev, + "Failed to map TPA replacement buffer\n"); + goto err; + } + + replace_buf->mapping = mapping; + tpa_info->buffer.page_offset = 0; + tpa_info->buffer_mapping = mapping; + tpa_info->state = QEDE_AGG_STATE_NONE; + } + + return 0; +err: + qede_free_sge_mem(edev, rxq); + edev->gro_disable = 1; + edev->ndev->features &= ~NETIF_F_GRO_HW; + return -ENOMEM; +} + +/* This function allocates all memory needed per Rx queue */ +static int qede_alloc_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq) +{ + int i, rc, size; + + rxq->num_rx_buffers = edev->q_num_rx_buffers; + + rxq->rx_buf_size = NET_IP_ALIGN + ETH_OVERHEAD + edev->ndev->mtu; + rxq->rx_headroom = edev->xdp_prog ? XDP_PACKET_HEADROOM : 0; + + /* Make sure that the headroom and payload fit in a single page */ + if (rxq->rx_buf_size + rxq->rx_headroom > PAGE_SIZE) + rxq->rx_buf_size = PAGE_SIZE - rxq->rx_headroom; + + /* Segment size to spilt a page in multiple equal parts, + * unless XDP is used in which case we'd use the entire page. 
+ */ + if (!edev->xdp_prog) + rxq->rx_buf_seg_size = roundup_pow_of_two(rxq->rx_buf_size); + else + rxq->rx_buf_seg_size = PAGE_SIZE; + + /* Allocate the parallel driver ring for Rx buffers */ + size = sizeof(*rxq->sw_rx_ring) * RX_RING_SIZE; + rxq->sw_rx_ring = kzalloc(size, GFP_KERNEL); + if (!rxq->sw_rx_ring) { + DP_ERR(edev, "Rx buffers ring allocation failed\n"); + rc = -ENOMEM; + goto err; + } + + /* Allocate FW Rx ring */ + rc = edev->ops->common->chain_alloc(edev->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_NEXT_PTR, + QED_CHAIN_CNT_TYPE_U16, + RX_RING_SIZE, + sizeof(struct eth_rx_bd), + &rxq->rx_bd_ring, NULL); + if (rc) + goto err; + + /* Allocate FW completion ring */ + rc = edev->ops->common->chain_alloc(edev->cdev, + QED_CHAIN_USE_TO_CONSUME, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + RX_RING_SIZE, + sizeof(union eth_rx_cqe), + &rxq->rx_comp_ring, NULL); + if (rc) + goto err; + + /* Allocate buffers for the Rx ring */ + rxq->filled_buffers = 0; + for (i = 0; i < rxq->num_rx_buffers; i++) { + rc = qede_alloc_rx_buffer(rxq, false); + if (rc) { + DP_ERR(edev, + "Rx buffers allocation failed at index %d\n", i); + goto err; + } + } + + rc = qede_alloc_sge_mem(edev, rxq); +err: + return rc; +} + +static void qede_free_mem_txq(struct qede_dev *edev, struct qede_tx_queue *txq) +{ + /* Free the parallel SW ring */ + if (txq->is_xdp) + kfree(txq->sw_tx_ring.xdp); + else + kfree(txq->sw_tx_ring.skbs); + + /* Free the real RQ ring used by FW */ + edev->ops->common->chain_free(edev->cdev, &txq->tx_pbl); +} + +/* This function allocates all memory needed per Tx queue */ +static int qede_alloc_mem_txq(struct qede_dev *edev, struct qede_tx_queue *txq) +{ + union eth_tx_bd_types *p_virt; + int size, rc; + + txq->num_tx_buffers = edev->q_num_tx_buffers; + + /* Allocate the parallel driver ring for Tx buffers */ + if (txq->is_xdp) { + size = sizeof(*txq->sw_tx_ring.xdp) * txq->num_tx_buffers; + txq->sw_tx_ring.xdp = kzalloc(size, GFP_KERNEL); + if (!txq->sw_tx_ring.xdp) + goto err; + } else { + size = sizeof(*txq->sw_tx_ring.skbs) * txq->num_tx_buffers; + txq->sw_tx_ring.skbs = kzalloc(size, GFP_KERNEL); + if (!txq->sw_tx_ring.skbs) + goto err; + } + + rc = edev->ops->common->chain_alloc(edev->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U16, + txq->num_tx_buffers, + sizeof(*p_virt), + &txq->tx_pbl, NULL); + if (rc) + goto err; + + return 0; + +err: + qede_free_mem_txq(edev, txq); + return -ENOMEM; +} + +/* This function frees all memory of a single fp */ +static void qede_free_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp) +{ + qede_free_mem_sb(edev, fp->sb_info, fp->id); + + if (fp->type & QEDE_FASTPATH_RX) + qede_free_mem_rxq(edev, fp->rxq); + + if (fp->type & QEDE_FASTPATH_XDP) + qede_free_mem_txq(edev, fp->xdp_tx); + + if (fp->type & QEDE_FASTPATH_TX) + qede_free_mem_txq(edev, fp->txq); +} + +/* This function allocates all memory needed for a single fp (i.e. an entity + * which contains status block, one rx queue and/or multiple per-TC tx queues. 
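The Rx queue setup above sizes buffers in two steps: the raw buffer size is alignment pad + Ethernet overhead + MTU (clamped so headroom plus payload still fits in one page), and the per-segment size is then rounded up to a power of two so several buffers can share a page, unless XDP owns the whole page. A small stand-alone sketch of that arithmetic follows; the page size, overhead constant and helper are assumed values, not the kernel's.

/* Sketch of the Rx buffer sizing arithmetic used by qede_alloc_mem_rxq().
 * Constants are assumed for the demo (4K pages, 14+4+4 bytes of overhead).
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE		4096u
#define DEMO_ETH_OVERHEAD	(14u + 4u + 4u)	/* header + VLAN + FCS (assumed) */
#define DEMO_XDP_HEADROOM	256u

static unsigned int roundup_pow_of_two_demo(unsigned int v)
{
	unsigned int p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

static void demo_rx_buf_sizes(unsigned int mtu, bool xdp,
			      unsigned int *buf_size, unsigned int *seg_size)
{
	unsigned int headroom = xdp ? DEMO_XDP_HEADROOM : 0;

	/* Raw buffer: alignment pad + L2 overhead + MTU */
	*buf_size = 2 /* IP alignment, assumed */ + DEMO_ETH_OVERHEAD + mtu;

	/* Headroom and payload must fit in a single page */
	if (*buf_size + headroom > DEMO_PAGE_SIZE)
		*buf_size = DEMO_PAGE_SIZE - headroom;

	/* Without XDP a page is split into power-of-two segments; with XDP
	 * each buffer owns the entire page.
	 */
	*seg_size = xdp ? DEMO_PAGE_SIZE : roundup_pow_of_two_demo(*buf_size);
}

int main(void)
{
	unsigned int buf, seg;

	demo_rx_buf_sizes(1500, false, &buf, &seg);
	printf("mtu=1500 no-xdp: buf=%u seg=%u\n", buf, seg);	/* 1524 -> 2048 */
	demo_rx_buf_sizes(1500, true, &buf, &seg);
	printf("mtu=1500 xdp:    buf=%u seg=%u\n", buf, seg);
	return 0;
}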
+ */ +static int qede_alloc_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp) +{ + int rc = 0; + + rc = qede_alloc_mem_sb(edev, fp->sb_info, fp->id); + if (rc) + goto out; + + if (fp->type & QEDE_FASTPATH_RX) { + rc = qede_alloc_mem_rxq(edev, fp->rxq); + if (rc) + goto out; + } + + if (fp->type & QEDE_FASTPATH_XDP) { + rc = qede_alloc_mem_txq(edev, fp->xdp_tx); + if (rc) + goto out; + } + + if (fp->type & QEDE_FASTPATH_TX) { + rc = qede_alloc_mem_txq(edev, fp->txq); + if (rc) + goto out; + } + +out: + return rc; +} + +static void qede_free_mem_load(struct qede_dev *edev) +{ + int i; + + for_each_queue(i) { + struct qede_fastpath *fp = &edev->fp_array[i]; + + qede_free_mem_fp(edev, fp); + } +} + +/* This function allocates all qede memory at NIC load. */ +static int qede_alloc_mem_load(struct qede_dev *edev) +{ + int rc = 0, queue_id; + + for (queue_id = 0; queue_id < QEDE_QUEUE_CNT(edev); queue_id++) { + struct qede_fastpath *fp = &edev->fp_array[queue_id]; + + rc = qede_alloc_mem_fp(edev, fp); + if (rc) { + DP_ERR(edev, + "Failed to allocate memory for fastpath - rss id = %d\n", + queue_id); + qede_free_mem_load(edev); + return rc; + } + } + + return 0; +} + +/* This function inits fp content and resets the SB, RXQ and TXQ structures */ +static void qede_init_fp(struct qede_dev *edev) +{ + int queue_id, rxq_index = 0, txq_index = 0; + struct qede_fastpath *fp; + + for_each_queue(queue_id) { + fp = &edev->fp_array[queue_id]; + + fp->edev = edev; + fp->id = queue_id; + + if (fp->type & QEDE_FASTPATH_XDP) { + fp->xdp_tx->index = QEDE_TXQ_IDX_TO_XDP(edev, + rxq_index); + fp->xdp_tx->is_xdp = 1; + } + + if (fp->type & QEDE_FASTPATH_RX) { + fp->rxq->rxq_id = rxq_index++; + + /* Determine how to map buffers for this queue */ + if (fp->type & QEDE_FASTPATH_XDP) + fp->rxq->data_direction = DMA_BIDIRECTIONAL; + else + fp->rxq->data_direction = DMA_FROM_DEVICE; + fp->rxq->dev = &edev->pdev->dev; + + /* Driver have no error path from here */ + WARN_ON(xdp_rxq_info_reg(&fp->rxq->xdp_rxq, edev->ndev, + fp->rxq->rxq_id) < 0); + } + + if (fp->type & QEDE_FASTPATH_TX) { + fp->txq->index = txq_index++; + if (edev->dev_info.is_legacy) + fp->txq->is_legacy = 1; + fp->txq->dev = &edev->pdev->dev; + } + + snprintf(fp->name, sizeof(fp->name), "%s-fp-%d", + edev->ndev->name, queue_id); + } + + edev->gro_disable = !(edev->ndev->features & NETIF_F_GRO_HW); +} + +static int qede_set_real_num_queues(struct qede_dev *edev) +{ + int rc = 0; + + rc = netif_set_real_num_tx_queues(edev->ndev, QEDE_TSS_COUNT(edev)); + if (rc) { + DP_NOTICE(edev, "Failed to set real number of Tx queues\n"); + return rc; + } + + rc = netif_set_real_num_rx_queues(edev->ndev, QEDE_RSS_COUNT(edev)); + if (rc) { + DP_NOTICE(edev, "Failed to set real number of Rx queues\n"); + return rc; + } + + return 0; +} + +static void qede_napi_disable_remove(struct qede_dev *edev) +{ + int i; + + for_each_queue(i) { + napi_disable(&edev->fp_array[i].napi); + + netif_napi_del(&edev->fp_array[i].napi); + } +} + +static void qede_napi_add_enable(struct qede_dev *edev) +{ + int i; + + /* Add NAPI objects */ + for_each_queue(i) { + netif_napi_add(edev->ndev, &edev->fp_array[i].napi, + qede_poll, NAPI_POLL_WEIGHT); + napi_enable(&edev->fp_array[i].napi); + } +} + +static void qede_sync_free_irqs(struct qede_dev *edev) +{ + int i; + + for (i = 0; i < edev->int_info.used_cnt; i++) { + if (edev->int_info.msix_cnt) { + synchronize_irq(edev->int_info.msix[i].vector); + free_irq(edev->int_info.msix[i].vector, + &edev->fp_array[i]); + } else { + 
edev->ops->common->simd_handler_clean(edev->cdev, i); + } + } + + edev->int_info.used_cnt = 0; +} + +static int qede_req_msix_irqs(struct qede_dev *edev) +{ + int i, rc; + + /* Sanitize number of interrupts == number of prepared RSS queues */ + if (QEDE_QUEUE_CNT(edev) > edev->int_info.msix_cnt) { + DP_ERR(edev, + "Interrupt mismatch: %d RSS queues > %d MSI-x vectors\n", + QEDE_QUEUE_CNT(edev), edev->int_info.msix_cnt); + return -EINVAL; + } + + for (i = 0; i < QEDE_QUEUE_CNT(edev); i++) { +#ifdef CONFIG_RFS_ACCEL + struct qede_fastpath *fp = &edev->fp_array[i]; + + if (edev->ndev->rx_cpu_rmap && (fp->type & QEDE_FASTPATH_RX)) { + rc = irq_cpu_rmap_add(edev->ndev->rx_cpu_rmap, + edev->int_info.msix[i].vector); + if (rc) { + DP_ERR(edev, "Failed to add CPU rmap\n"); + qede_free_arfs(edev); + } + } +#endif + rc = request_irq(edev->int_info.msix[i].vector, + qede_msix_fp_int, 0, edev->fp_array[i].name, + &edev->fp_array[i]); + if (rc) { + DP_ERR(edev, "Request fp %d irq failed\n", i); + qede_sync_free_irqs(edev); + return rc; + } + DP_VERBOSE(edev, NETIF_MSG_INTR, + "Requested fp irq for %s [entry %d]. Cookie is at %p\n", + edev->fp_array[i].name, i, + &edev->fp_array[i]); + edev->int_info.used_cnt++; + } + + return 0; +} + +static void qede_simd_fp_handler(void *cookie) +{ + struct qede_fastpath *fp = (struct qede_fastpath *)cookie; + + napi_schedule_irqoff(&fp->napi); +} + +static int qede_setup_irqs(struct qede_dev *edev) +{ + int i, rc = 0; + + /* Learn Interrupt configuration */ + rc = edev->ops->common->get_fp_int(edev->cdev, &edev->int_info); + if (rc) + return rc; + + if (edev->int_info.msix_cnt) { + rc = qede_req_msix_irqs(edev); + if (rc) + return rc; + edev->ndev->irq = edev->int_info.msix[0].vector; + } else { + const struct qed_common_ops *ops; + + /* qed should learn receive the RSS ids and callbacks */ + ops = edev->ops->common; + for (i = 0; i < QEDE_QUEUE_CNT(edev); i++) + ops->simd_handler_config(edev->cdev, + &edev->fp_array[i], i, + qede_simd_fp_handler); + edev->int_info.used_cnt = QEDE_QUEUE_CNT(edev); + } + return 0; +} + +static int qede_drain_txq(struct qede_dev *edev, + struct qede_tx_queue *txq, bool allow_drain) +{ + int rc, cnt = 1000; + + while (txq->sw_tx_cons != txq->sw_tx_prod) { + if (!cnt) { + if (allow_drain) { + DP_NOTICE(edev, + "Tx queue[%d] is stuck, requesting MCP to drain\n", + txq->index); + rc = edev->ops->common->drain(edev->cdev); + if (rc) + return rc; + return qede_drain_txq(edev, txq, false); + } + DP_NOTICE(edev, + "Timeout waiting for tx queue[%d]: PROD=%d, CONS=%d\n", + txq->index, txq->sw_tx_prod, + txq->sw_tx_cons); + return -ENODEV; + } + cnt--; + usleep_range(1000, 2000); + barrier(); + } + + /* FW finished processing, wait for HW to transmit all tx packets */ + usleep_range(1000, 2000); + + return 0; +} + +static int qede_stop_txq(struct qede_dev *edev, + struct qede_tx_queue *txq, int rss_id) +{ + return edev->ops->q_tx_stop(edev->cdev, rss_id, txq->handle); +} + +static int qede_stop_queues(struct qede_dev *edev) +{ + struct qed_update_vport_params *vport_update_params; + struct qed_dev *cdev = edev->cdev; + struct qede_fastpath *fp; + int rc, i; + + /* Disable the vport */ + vport_update_params = vzalloc(sizeof(*vport_update_params)); + if (!vport_update_params) + return -ENOMEM; + + vport_update_params->vport_id = 0; + vport_update_params->update_vport_active_flg = 1; + vport_update_params->vport_active_flg = 0; + vport_update_params->update_rss_flg = 0; + + rc = edev->ops->vport_update(cdev, vport_update_params); + 
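qede_drain_txq() above is a bounded-poll pattern: wait up to roughly a thousand iterations for the consumer index to catch up with the producer, and only then ask the management firmware to drain the queue, retrying once without that fallback available. The sketch below captures the same control flow against a dummy queue; the types and the drain callback are invented for illustration.

/* Generic sketch of the drain-with-one-fallback pattern used by
 * qede_drain_txq(). Types and the drain callback are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_txq {
	unsigned int cons;
	unsigned int prod;
};

/* Stand-in for the MCP drain request; here it simply completes the queue. */
static int demo_request_drain(struct demo_txq *txq)
{
	txq->cons = txq->prod;
	return 0;
}

static int demo_drain_txq(struct demo_txq *txq, bool allow_drain)
{
	int cnt = 1000;

	while (txq->cons != txq->prod) {
		if (!cnt) {
			if (allow_drain) {
				/* Ask firmware to drain, then retry once
				 * without the fallback available.
				 */
				if (demo_request_drain(txq))
					return -1;
				return demo_drain_txq(txq, false);
			}
			return -1;	/* timed out for good */
		}
		cnt--;
		/* a real driver would usleep_range() here */
		txq->cons++;	/* simulate hardware progress for the demo */
	}
	return 0;
}

int main(void)
{
	struct demo_txq txq = { .cons = 0, .prod = 500 };

	printf("drain rc=%d (cons=%u prod=%u)\n",
	       demo_drain_txq(&txq, true), txq.cons, txq.prod);
	return 0;
}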
vfree(vport_update_params); + + if (rc) { + DP_ERR(edev, "Failed to update vport\n"); + return rc; + } + + /* Flush Tx queues. If needed, request drain from MCP */ + for_each_queue(i) { + fp = &edev->fp_array[i]; + + if (fp->type & QEDE_FASTPATH_TX) { + rc = qede_drain_txq(edev, fp->txq, true); + if (rc) + return rc; + } + + if (fp->type & QEDE_FASTPATH_XDP) { + rc = qede_drain_txq(edev, fp->xdp_tx, true); + if (rc) + return rc; + } + } + + /* Stop all Queues in reverse order */ + for (i = QEDE_QUEUE_CNT(edev) - 1; i >= 0; i--) { + fp = &edev->fp_array[i]; + + /* Stop the Tx Queue(s) */ + if (fp->type & QEDE_FASTPATH_TX) { + rc = qede_stop_txq(edev, fp->txq, i); + if (rc) + return rc; + } + + /* Stop the Rx Queue */ + if (fp->type & QEDE_FASTPATH_RX) { + rc = edev->ops->q_rx_stop(cdev, i, fp->rxq->handle); + if (rc) { + DP_ERR(edev, "Failed to stop RXQ #%d\n", i); + return rc; + } + } + + /* Stop the XDP forwarding queue */ + if (fp->type & QEDE_FASTPATH_XDP) { + rc = qede_stop_txq(edev, fp->xdp_tx, i); + if (rc) + return rc; + + bpf_prog_put(fp->rxq->xdp_prog); + } + } + + /* Stop the vport */ + rc = edev->ops->vport_stop(cdev, 0); + if (rc) + DP_ERR(edev, "Failed to stop VPORT\n"); + + return rc; +} + +static int qede_start_txq(struct qede_dev *edev, + struct qede_fastpath *fp, + struct qede_tx_queue *txq, u8 rss_id, u16 sb_idx) +{ + dma_addr_t phys_table = qed_chain_get_pbl_phys(&txq->tx_pbl); + u32 page_cnt = qed_chain_get_page_cnt(&txq->tx_pbl); + struct qed_queue_start_common_params params; + struct qed_txq_start_ret_params ret_params; + int rc; + + memset(¶ms, 0, sizeof(params)); + memset(&ret_params, 0, sizeof(ret_params)); + + /* Let the XDP queue share the queue-zone with one of the regular txq. + * We don't really care about its coalescing. 
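qede_stop_queues() above tears things down in the opposite order from qede_start_queues(): first every Tx/XDP queue is drained, then the queues are stopped from the highest index down, and only then is the vport itself stopped. The toy sketch below shows that reverse-order teardown with early error return; the queue count and the stop callback are invented.

/* Toy sketch of reverse-order queue teardown, as done in qede_stop_queues().
 * The stop callback and queue count are illustrative stand-ins.
 */
#include <stdio.h>

#define DEMO_NUM_QUEUES 4

static int demo_stop_one(int idx)
{
	printf("stopping queue %d\n", idx);
	return 0;	/* a real stop could fail and abort the loop */
}

static int demo_stop_queues(void)
{
	int i, rc;

	/* Stop all queues in reverse order of how they were started */
	for (i = DEMO_NUM_QUEUES - 1; i >= 0; i--) {
		rc = demo_stop_one(i);
		if (rc)
			return rc;	/* propagate the first failure */
	}

	/* Only after every queue is down is the vport itself stopped */
	printf("stopping vport\n");
	return 0;
}

int main(void)
{
	return demo_stop_queues();
}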
+ */ + if (txq->is_xdp) + params.queue_id = QEDE_TXQ_XDP_TO_IDX(edev, txq); + else + params.queue_id = txq->index; + + params.p_sb = fp->sb_info; + params.sb_idx = sb_idx; + + rc = edev->ops->q_tx_start(edev->cdev, rss_id, ¶ms, phys_table, + page_cnt, &ret_params); + if (rc) { + DP_ERR(edev, "Start TXQ #%d failed %d\n", txq->index, rc); + return rc; + } + + txq->doorbell_addr = ret_params.p_doorbell; + txq->handle = ret_params.p_handle; + + /* Determine the FW consumer address associated */ + txq->hw_cons_ptr = &fp->sb_info->sb_virt->pi_array[sb_idx]; + + /* Prepare the doorbell parameters */ + SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_DEST, DB_DEST_XCM); + SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_CMD, DB_AGG_CMD_SET); + SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_VAL_SEL, + DQ_XCM_ETH_TX_BD_PROD_CMD); + txq->tx_db.data.agg_flags = DQ_XCM_ETH_DQ_CF_CMD; + + return rc; +} + +static int qede_start_queues(struct qede_dev *edev, bool clear_stats) +{ + int vlan_removal_en = 1; + struct qed_dev *cdev = edev->cdev; + struct qed_dev_info *qed_info = &edev->dev_info.common; + struct qed_update_vport_params *vport_update_params; + struct qed_queue_start_common_params q_params; + struct qed_start_vport_params start = {0}; + int rc, i; + + if (!edev->num_queues) { + DP_ERR(edev, + "Cannot update V-VPORT as active as there are no Rx queues\n"); + return -EINVAL; + } + + vport_update_params = vzalloc(sizeof(*vport_update_params)); + if (!vport_update_params) + return -ENOMEM; + + start.handle_ptp_pkts = !!(edev->ptp); + start.gro_enable = !edev->gro_disable; + start.mtu = edev->ndev->mtu; + start.vport_id = 0; + start.drop_ttl0 = true; + start.remove_inner_vlan = vlan_removal_en; + start.clear_stats = clear_stats; + + rc = edev->ops->vport_start(cdev, &start); + + if (rc) { + DP_ERR(edev, "Start V-PORT failed %d\n", rc); + goto out; + } + + DP_VERBOSE(edev, NETIF_MSG_IFUP, + "Start vport ramrod passed, vport_id = %d, MTU = %d, vlan_removal_en = %d\n", + start.vport_id, edev->ndev->mtu + 0xe, vlan_removal_en); + + for_each_queue(i) { + struct qede_fastpath *fp = &edev->fp_array[i]; + dma_addr_t p_phys_table; + u32 page_cnt; + + if (fp->type & QEDE_FASTPATH_RX) { + struct qed_rxq_start_ret_params ret_params; + struct qede_rx_queue *rxq = fp->rxq; + __le16 *val; + + memset(&ret_params, 0, sizeof(ret_params)); + memset(&q_params, 0, sizeof(q_params)); + q_params.queue_id = rxq->rxq_id; + q_params.vport_id = 0; + q_params.p_sb = fp->sb_info; + q_params.sb_idx = RX_PI; + + p_phys_table = + qed_chain_get_pbl_phys(&rxq->rx_comp_ring); + page_cnt = qed_chain_get_page_cnt(&rxq->rx_comp_ring); + + rc = edev->ops->q_rx_start(cdev, i, &q_params, + rxq->rx_buf_size, + rxq->rx_bd_ring.p_phys_addr, + p_phys_table, + page_cnt, &ret_params); + if (rc) { + DP_ERR(edev, "Start RXQ #%d failed %d\n", i, + rc); + goto out; + } + + /* Use the return parameters */ + rxq->hw_rxq_prod_addr = ret_params.p_prod; + rxq->handle = ret_params.p_handle; + + val = &fp->sb_info->sb_virt->pi_array[RX_PI]; + rxq->hw_cons_ptr = val; + + qede_update_rx_prod(edev, rxq); + } + + if (fp->type & QEDE_FASTPATH_XDP) { + rc = qede_start_txq(edev, fp, fp->xdp_tx, i, XDP_PI); + if (rc) + goto out; + + fp->rxq->xdp_prog = bpf_prog_add(edev->xdp_prog, 1); + if (IS_ERR(fp->rxq->xdp_prog)) { + rc = PTR_ERR(fp->rxq->xdp_prog); + fp->rxq->xdp_prog = NULL; + goto out; + } + } + + if (fp->type & QEDE_FASTPATH_TX) { + rc = qede_start_txq(edev, fp, fp->txq, i, TX_PI(0)); + if (rc) + goto out; + } + } + + /* Prepare and send the vport enable 
*/ + vport_update_params->vport_id = start.vport_id; + vport_update_params->update_vport_active_flg = 1; + vport_update_params->vport_active_flg = 1; + + if ((qed_info->mf_mode == QED_MF_NPAR || pci_num_vf(edev->pdev)) && + qed_info->tx_switching) { + vport_update_params->update_tx_switching_flg = 1; + vport_update_params->tx_switching_flg = 1; + } + + qede_fill_rss_params(edev, &vport_update_params->rss_params, + &vport_update_params->update_rss_flg); + + rc = edev->ops->vport_update(cdev, vport_update_params); + if (rc) + DP_ERR(edev, "Update V-PORT failed %d\n", rc); + +out: + vfree(vport_update_params); + return rc; +} + +enum qede_unload_mode { + QEDE_UNLOAD_NORMAL, +}; + +static void qede_unload(struct qede_dev *edev, enum qede_unload_mode mode, + bool is_locked) +{ + struct qed_link_params link_params; + int rc; + + DP_INFO(edev, "Starting qede unload\n"); + + if (!is_locked) + __qede_lock(edev); + + edev->state = QEDE_STATE_CLOSED; + + qede_rdma_dev_event_close(edev); + + /* Close OS Tx */ + netif_tx_disable(edev->ndev); + netif_carrier_off(edev->ndev); + + /* Reset the link */ + memset(&link_params, 0, sizeof(link_params)); + link_params.link_up = false; + edev->ops->common->set_link(edev->cdev, &link_params); + rc = qede_stop_queues(edev); + if (rc) { + qede_sync_free_irqs(edev); + goto out; + } + + DP_INFO(edev, "Stopped Queues\n"); + + qede_vlan_mark_nonconfigured(edev); + edev->ops->fastpath_stop(edev->cdev); + + if (!IS_VF(edev) && edev->dev_info.common.num_hwfns == 1) { + qede_poll_for_freeing_arfs_filters(edev); + qede_free_arfs(edev); + } + + /* Release the interrupts */ + qede_sync_free_irqs(edev); + edev->ops->common->set_fp_int(edev->cdev, 0); + + qede_napi_disable_remove(edev); + + qede_free_mem_load(edev); + qede_free_fp_array(edev); + +out: + if (!is_locked) + __qede_unlock(edev); + DP_INFO(edev, "Ending qede unload\n"); +} + +enum qede_load_mode { + QEDE_LOAD_NORMAL, + QEDE_LOAD_RELOAD, +}; + +static int qede_load(struct qede_dev *edev, enum qede_load_mode mode, + bool is_locked) +{ + struct qed_link_params link_params; + int rc; + + DP_INFO(edev, "Starting qede load\n"); + + if (!is_locked) + __qede_lock(edev); + + rc = qede_set_num_queues(edev); + if (rc) + goto out; + + rc = qede_alloc_fp_array(edev); + if (rc) + goto out; + + qede_init_fp(edev); + + rc = qede_alloc_mem_load(edev); + if (rc) + goto err1; + DP_INFO(edev, "Allocated %d Rx, %d Tx queues\n", + QEDE_RSS_COUNT(edev), QEDE_TSS_COUNT(edev)); + + rc = qede_set_real_num_queues(edev); + if (rc) + goto err2; + + if (!IS_VF(edev) && edev->dev_info.common.num_hwfns == 1) { + rc = qede_alloc_arfs(edev); + if (rc) + DP_NOTICE(edev, "aRFS memory allocation failed\n"); + } + + qede_napi_add_enable(edev); + DP_INFO(edev, "Napi added and enabled\n"); + + rc = qede_setup_irqs(edev); + if (rc) + goto err3; + DP_INFO(edev, "Setup IRQs succeeded\n"); + + rc = qede_start_queues(edev, mode != QEDE_LOAD_RELOAD); + if (rc) + goto err4; + DP_INFO(edev, "Start VPORT, RXQ and TXQ succeeded\n"); + + /* Program un-configured VLANs */ + qede_configure_vlan_filters(edev); + + /* Ask for link-up using current configuration */ + memset(&link_params, 0, sizeof(link_params)); + link_params.link_up = true; + edev->ops->common->set_link(edev->cdev, &link_params); + + edev->state = QEDE_STATE_OPEN; + + DP_INFO(edev, "Ending successfully qede load\n"); + + goto out; +err4: + qede_sync_free_irqs(edev); + memset(&edev->int_info.msix_cnt, 0, sizeof(struct qed_int_info)); +err3: + qede_napi_disable_remove(edev); +err2: + 
qede_free_mem_load(edev); +err1: + edev->ops->common->set_fp_int(edev->cdev, 0); + qede_free_fp_array(edev); + edev->num_queues = 0; + edev->fp_num_tx = 0; + edev->fp_num_rx = 0; +out: + if (!is_locked) + __qede_unlock(edev); + + return rc; +} + +/* 'func' should be able to run between unload and reload assuming interface + * is actually running, or afterwards in case it's currently DOWN. + */ +void qede_reload(struct qede_dev *edev, + struct qede_reload_args *args, bool is_locked) +{ + if (!is_locked) + __qede_lock(edev); + + /* Since qede_lock is held, internal state wouldn't change even + * if netdev state would start transitioning. Check whether current + * internal configuration indicates device is up, then reload. + */ + if (edev->state == QEDE_STATE_OPEN) { + qede_unload(edev, QEDE_UNLOAD_NORMAL, true); + if (args) + args->func(edev, args); + qede_load(edev, QEDE_LOAD_RELOAD, true); + + /* Since no one is going to do it for us, re-configure */ + qede_config_rx_mode(edev->ndev); + } else if (args) { + args->func(edev, args); + } + + if (!is_locked) + __qede_unlock(edev); +} + +/* called with rtnl_lock */ +static int qede_open(struct net_device *ndev) +{ + struct qede_dev *edev = netdev_priv(ndev); + int rc; + + netif_carrier_off(ndev); + + edev->ops->common->set_power_state(edev->cdev, PCI_D0); + + rc = qede_load(edev, QEDE_LOAD_NORMAL, false); + if (rc) + return rc; + + udp_tunnel_get_rx_info(ndev); + + edev->ops->common->update_drv_state(edev->cdev, true); + + return 0; +} + +static int qede_close(struct net_device *ndev) +{ + struct qede_dev *edev = netdev_priv(ndev); + + qede_unload(edev, QEDE_UNLOAD_NORMAL, false); + + edev->ops->common->update_drv_state(edev->cdev, false); + + return 0; +} + +static void qede_link_update(void *dev, struct qed_link_output *link) +{ + struct qede_dev *edev = dev; + + if (!netif_running(edev->ndev)) { + DP_VERBOSE(edev, NETIF_MSG_LINK, "Interface is not running\n"); + return; + } + + if (link->link_up) { + if (!netif_carrier_ok(edev->ndev)) { + DP_NOTICE(edev, "Link is up\n"); + netif_tx_start_all_queues(edev->ndev); + netif_carrier_on(edev->ndev); + qede_rdma_dev_event_open(edev); + } + } else { + if (netif_carrier_ok(edev->ndev)) { + DP_NOTICE(edev, "Link is down\n"); + netif_tx_disable(edev->ndev); + netif_carrier_off(edev->ndev); + qede_rdma_dev_event_close(edev); + } + } +} diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.c b/drivers/net/ethernet/qlogic/qede/qede_ptp.c new file mode 100644 index 0000000..02adb51 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.c @@ -0,0 +1,553 @@ +/* QLogic qede NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "qede_ptp.h" + +struct qede_ptp { + const struct qed_eth_ptp_ops *ops; + struct ptp_clock_info clock_info; + struct cyclecounter cc; + struct timecounter tc; + struct ptp_clock *clock; + struct work_struct work; + struct qede_dev *edev; + struct sk_buff *tx_skb; + + /* ptp spinlock is used for protecting the cycle/time counter fields + * and, also for serializing the qed PTP API invocations. + */ + spinlock_t lock; + bool hw_ts_ioctl_called; + u16 tx_type; + u16 rx_filter; +}; + +/** + * qede_ptp_adjfreq + * @ptp: the ptp clock structure + * @ppb: parts per billion adjustment from base + * + * Adjust the frequency of the ptp cycle counter by the + * indicated ppb from the base frequency. + */ +static int qede_ptp_adjfreq(struct ptp_clock_info *info, s32 ppb) +{ + struct qede_ptp *ptp = container_of(info, struct qede_ptp, clock_info); + struct qede_dev *edev = ptp->edev; + int rc; + + __qede_lock(edev); + if (edev->state == QEDE_STATE_OPEN) { + spin_lock_bh(&ptp->lock); + rc = ptp->ops->adjfreq(edev->cdev, ppb); + spin_unlock_bh(&ptp->lock); + } else { + DP_ERR(edev, "PTP adjfreq called while interface is down\n"); + rc = -EFAULT; + } + __qede_unlock(edev); + + return rc; +} + +static int qede_ptp_adjtime(struct ptp_clock_info *info, s64 delta) +{ + struct qede_dev *edev; + struct qede_ptp *ptp; + + ptp = container_of(info, struct qede_ptp, clock_info); + edev = ptp->edev; + + DP_VERBOSE(edev, QED_MSG_DEBUG, "PTP adjtime called, delta = %llx\n", + delta); + + spin_lock_bh(&ptp->lock); + timecounter_adjtime(&ptp->tc, delta); + spin_unlock_bh(&ptp->lock); + + return 0; +} + +static int qede_ptp_gettime(struct ptp_clock_info *info, struct timespec64 *ts) +{ + struct qede_dev *edev; + struct qede_ptp *ptp; + u64 ns; + + ptp = container_of(info, struct qede_ptp, clock_info); + edev = ptp->edev; + + spin_lock_bh(&ptp->lock); + ns = timecounter_read(&ptp->tc); + spin_unlock_bh(&ptp->lock); + + DP_VERBOSE(edev, QED_MSG_DEBUG, "PTP gettime called, ns = %llu\n", ns); + + *ts = ns_to_timespec64(ns); + + return 0; +} + +static int qede_ptp_settime(struct ptp_clock_info *info, + const struct timespec64 *ts) +{ + struct qede_dev *edev; + struct qede_ptp *ptp; + u64 ns; + + ptp = container_of(info, struct qede_ptp, clock_info); + edev = ptp->edev; + + ns = timespec64_to_ns(ts); + + DP_VERBOSE(edev, QED_MSG_DEBUG, "PTP settime called, ns = %llu\n", ns); + + /* Re-init the timecounter */ + spin_lock_bh(&ptp->lock); + timecounter_init(&ptp->tc, &ptp->cc, ns); + spin_unlock_bh(&ptp->lock); + + return 0; +} + +/* Enable (or disable) ancillary features of the phc subsystem */ +static int qede_ptp_ancillary_feature_enable(struct ptp_clock_info *info, + struct ptp_clock_request *rq, + int on) +{ + struct qede_dev *edev; + struct qede_ptp *ptp; + + ptp = container_of(info, struct qede_ptp, clock_info); + edev = ptp->edev; + + DP_ERR(edev, "PHC ancillary features are not supported\n"); + + return -ENOTSUPP; +} + +static void qede_ptp_task(struct work_struct *work) +{ + struct skb_shared_hwtstamps shhwtstamps; + 
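The PHC callbacks above (adjtime/gettime/settime) never report hardware time directly; they operate on a software timecounter layered over the device's free-running cycle counter, with ptp->lock serializing access. The sketch below models that layering in plain C with a fake cycle source; the one-cycle-equals-one-nanosecond arithmetic matches the mult = 1, shift = 0 setup the driver uses when it initializes its cyclecounter, and everything else is an invented stand-in.

/* Simplified software timecounter, mirroring how qede layers gettime/
 * settime/adjtime on top of a free-running cycle counter (mult=1, shift=0).
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t fake_cycles;	/* stand-in for the PHC's cycle register */

static uint64_t demo_read_cc(void)
{
	return fake_cycles;
}

struct demo_timecounter {
	uint64_t cycle_last;	/* counter value at the last read */
	uint64_t nsec;		/* accumulated nanoseconds */
};

/* settime: re-initialize the software clock at an absolute time */
static void demo_tc_init(struct demo_timecounter *tc, uint64_t start_ns)
{
	tc->cycle_last = demo_read_cc();
	tc->nsec = start_ns;
}

/* gettime: fold the cycles elapsed since the last read into nsec */
static uint64_t demo_tc_read(struct demo_timecounter *tc)
{
	uint64_t now = demo_read_cc();

	tc->nsec += now - tc->cycle_last;	/* 1 cycle == 1 ns here */
	tc->cycle_last = now;
	return tc->nsec;
}

/* adjtime: shift the software clock without touching the counter */
static void demo_tc_adjtime(struct demo_timecounter *tc, int64_t delta)
{
	tc->nsec += (uint64_t)delta;
}

int main(void)
{
	struct demo_timecounter tc;

	demo_tc_init(&tc, 1000000000ull);	/* "settime" to 1 second */
	fake_cycles += 500;			/* 500 ns of cycles pass */
	printf("t=%llu ns\n", (unsigned long long)demo_tc_read(&tc));
	demo_tc_adjtime(&tc, -100);		/* step back 100 ns */
	printf("t=%llu ns\n", (unsigned long long)demo_tc_read(&tc));
	return 0;
}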
struct qede_dev *edev; + struct qede_ptp *ptp; + u64 timestamp, ns; + int rc; + + ptp = container_of(work, struct qede_ptp, work); + edev = ptp->edev; + + /* Read Tx timestamp registers */ + spin_lock_bh(&ptp->lock); + rc = ptp->ops->read_tx_ts(edev->cdev, ×tamp); + spin_unlock_bh(&ptp->lock); + if (rc) { + /* Reschedule to keep checking for a valid timestamp value */ + schedule_work(&ptp->work); + return; + } + + ns = timecounter_cyc2time(&ptp->tc, timestamp); + memset(&shhwtstamps, 0, sizeof(shhwtstamps)); + shhwtstamps.hwtstamp = ns_to_ktime(ns); + skb_tstamp_tx(ptp->tx_skb, &shhwtstamps); + dev_kfree_skb_any(ptp->tx_skb); + ptp->tx_skb = NULL; + clear_bit_unlock(QEDE_FLAGS_PTP_TX_IN_PRORGESS, &edev->flags); + + DP_VERBOSE(edev, QED_MSG_DEBUG, + "Tx timestamp, timestamp cycles = %llu, ns = %llu\n", + timestamp, ns); +} + +/* Read the PHC. This API is invoked with ptp_lock held. */ +static u64 qede_ptp_read_cc(const struct cyclecounter *cc) +{ + struct qede_dev *edev; + struct qede_ptp *ptp; + u64 phc_cycles; + int rc; + + ptp = container_of(cc, struct qede_ptp, cc); + edev = ptp->edev; + rc = ptp->ops->read_cc(edev->cdev, &phc_cycles); + if (rc) + WARN_ONCE(1, "PHC read err %d\n", rc); + + DP_VERBOSE(edev, QED_MSG_DEBUG, "PHC read cycles = %llu\n", phc_cycles); + + return phc_cycles; +} + +static int qede_ptp_cfg_filters(struct qede_dev *edev) +{ + enum qed_ptp_hwtstamp_tx_type tx_type = QED_PTP_HWTSTAMP_TX_ON; + enum qed_ptp_filter_type rx_filter = QED_PTP_FILTER_NONE; + struct qede_ptp *ptp = edev->ptp; + + if (!ptp) + return -EIO; + + if (!ptp->hw_ts_ioctl_called) { + DP_INFO(edev, "TS IOCTL not called\n"); + return 0; + } + + switch (ptp->tx_type) { + case HWTSTAMP_TX_ON: + edev->flags |= QEDE_TX_TIMESTAMPING_EN; + tx_type = QED_PTP_HWTSTAMP_TX_ON; + break; + + case HWTSTAMP_TX_OFF: + edev->flags &= ~QEDE_TX_TIMESTAMPING_EN; + tx_type = QED_PTP_HWTSTAMP_TX_OFF; + break; + + case HWTSTAMP_TX_ONESTEP_SYNC: + DP_ERR(edev, "One-step timestamping is not supported\n"); + return -ERANGE; + } + + spin_lock_bh(&ptp->lock); + switch (ptp->rx_filter) { + case HWTSTAMP_FILTER_NONE: + rx_filter = QED_PTP_FILTER_NONE; + break; + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_NTP_ALL: + ptp->rx_filter = HWTSTAMP_FILTER_NONE; + rx_filter = QED_PTP_FILTER_ALL; + break; + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + ptp->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT; + rx_filter = QED_PTP_FILTER_V1_L4_EVENT; + break; + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + ptp->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT; + /* Initialize PTP detection for UDP/IPv4 events */ + rx_filter = QED_PTP_FILTER_V1_L4_GEN; + break; + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + ptp->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT; + rx_filter = QED_PTP_FILTER_V2_L4_EVENT; + break; + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + ptp->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT; + /* Initialize PTP detection for UDP/IPv4 or UDP/IPv6 events */ + rx_filter = QED_PTP_FILTER_V2_L4_GEN; + break; + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + ptp->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT; + rx_filter = QED_PTP_FILTER_V2_L2_EVENT; + break; + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + ptp->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT; + /* Initialize PTP detection L2 events */ + rx_filter = QED_PTP_FILTER_V2_L2_GEN; + break; + case HWTSTAMP_FILTER_PTP_V2_EVENT: + ptp->rx_filter = 
HWTSTAMP_FILTER_PTP_V2_EVENT; + rx_filter = QED_PTP_FILTER_V2_EVENT; + break; + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + ptp->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; + /* Initialize PTP detection L2, UDP/IPv4 or UDP/IPv6 events */ + rx_filter = QED_PTP_FILTER_V2_GEN; + break; + } + + ptp->ops->cfg_filters(edev->cdev, rx_filter, tx_type); + + spin_unlock_bh(&ptp->lock); + + return 0; +} + +int qede_ptp_hw_ts(struct qede_dev *edev, struct ifreq *ifr) +{ + struct hwtstamp_config config; + struct qede_ptp *ptp; + int rc; + + ptp = edev->ptp; + if (!ptp) + return -EIO; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + DP_VERBOSE(edev, QED_MSG_DEBUG, + "HWTSTAMP IOCTL: Requested tx_type = %d, requested rx_filters = %d\n", + config.tx_type, config.rx_filter); + + if (config.flags) { + DP_ERR(edev, "config.flags is reserved for future use\n"); + return -EINVAL; + } + + ptp->hw_ts_ioctl_called = 1; + ptp->tx_type = config.tx_type; + ptp->rx_filter = config.rx_filter; + + rc = qede_ptp_cfg_filters(edev); + if (rc) + return rc; + + config.rx_filter = ptp->rx_filter; + + return copy_to_user(ifr->ifr_data, &config, + sizeof(config)) ? -EFAULT : 0; +} + +int qede_ptp_get_ts_info(struct qede_dev *edev, struct ethtool_ts_info *info) +{ + struct qede_ptp *ptp = edev->ptp; + + if (!ptp) + return -EIO; + + info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + if (ptp->clock) + info->phc_index = ptp_clock_index(ptp->clock); + else + info->phc_index = -1; + + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | + BIT(HWTSTAMP_FILTER_PTP_V1_L4_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V1_L4_SYNC) | + BIT(HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) | + BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_L4_SYNC) | + BIT(HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ) | + BIT(HWTSTAMP_FILTER_PTP_V2_L2_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_L2_SYNC) | + BIT(HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ) | + BIT(HWTSTAMP_FILTER_PTP_V2_EVENT) | + BIT(HWTSTAMP_FILTER_PTP_V2_SYNC) | + BIT(HWTSTAMP_FILTER_PTP_V2_DELAY_REQ); + + info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON); + + return 0; +} + +void qede_ptp_disable(struct qede_dev *edev) +{ + struct qede_ptp *ptp; + + ptp = edev->ptp; + if (!ptp) + return; + + if (ptp->clock) { + ptp_clock_unregister(ptp->clock); + ptp->clock = NULL; + } + + /* Cancel PTP work queue. Should be done after the Tx queues are + * drained to prevent additional scheduling. + */ + cancel_work_sync(&ptp->work); + if (ptp->tx_skb) { + dev_kfree_skb_any(ptp->tx_skb); + ptp->tx_skb = NULL; + } + + /* Disable PTP in HW */ + spin_lock_bh(&ptp->lock); + ptp->ops->disable(edev->cdev); + spin_unlock_bh(&ptp->lock); + + kfree(ptp); + edev->ptp = NULL; +} + +static int qede_ptp_init(struct qede_dev *edev, bool init_tc) +{ + struct qede_ptp *ptp; + int rc; + + ptp = edev->ptp; + if (!ptp) + return -EINVAL; + + spin_lock_init(&ptp->lock); + + /* Configure PTP in HW */ + rc = ptp->ops->enable(edev->cdev); + if (rc) { + DP_INFO(edev, "PTP HW enable failed\n"); + return rc; + } + + /* Init work queue for Tx timestamping */ + INIT_WORK(&ptp->work, qede_ptp_task); + + /* Init cyclecounter and timecounter. This is done only in the first + * load. If done in every load, PTP application will fail when doing + * unload / load (e.g. MTU change) while it is running. 
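qede_ptp_hw_ts() above follows the usual SIOCSHWTSTAMP contract: copy the requested config in, reject reserved flags, map the requested Rx filter onto what the hardware can actually do (possibly widening it, so SYNC-only requests are reported back as the broader EVENT filter), and copy the adjusted config back so user space learns what it really got. The fragment below sketches just that clamp-and-report-back step; the enum values are invented stand-ins for the HWTSTAMP_FILTER_* constants.

/* Sketch of the "clamp requested filter, report back what was applied"
 * step of a SIOCSHWTSTAMP handler. Enum values are demo stand-ins.
 */
#include <stdio.h>

enum demo_rx_filter {
	DEMO_FILTER_NONE,
	DEMO_FILTER_V1_L4_SYNC,		/* narrower than hardware supports */
	DEMO_FILTER_V1_L4_EVENT,	/* what the hardware actually offers */
	DEMO_FILTER_ALL,
};

/* Map the user's request onto a supported filter, widening if needed */
static enum demo_rx_filter demo_clamp_filter(enum demo_rx_filter req)
{
	switch (req) {
	case DEMO_FILTER_NONE:
		return DEMO_FILTER_NONE;
	case DEMO_FILTER_V1_L4_SYNC:
	case DEMO_FILTER_V1_L4_EVENT:
		/* SYNC-only isn't supported; widen to all V1 L4 events */
		return DEMO_FILTER_V1_L4_EVENT;
	default:
		return DEMO_FILTER_ALL;
	}
}

int main(void)
{
	enum demo_rx_filter requested = DEMO_FILTER_V1_L4_SYNC;
	enum demo_rx_filter applied = demo_clamp_filter(requested);

	/* The driver writes 'applied' back into the ioctl buffer so user
	 * space sees the filter that is actually in effect.
	 */
	printf("requested=%d applied=%d\n", (int)requested, (int)applied);
	return 0;
}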
+ */ + if (init_tc) { + memset(&ptp->cc, 0, sizeof(ptp->cc)); + ptp->cc.read = qede_ptp_read_cc; + ptp->cc.mask = CYCLECOUNTER_MASK(64); + ptp->cc.shift = 0; + ptp->cc.mult = 1; + + timecounter_init(&ptp->tc, &ptp->cc, + ktime_to_ns(ktime_get_real())); + } + + return rc; +} + +int qede_ptp_enable(struct qede_dev *edev, bool init_tc) +{ + struct qede_ptp *ptp; + int rc; + + ptp = kzalloc(sizeof(*ptp), GFP_KERNEL); + if (!ptp) { + DP_INFO(edev, "Failed to allocate struct for PTP\n"); + return -ENOMEM; + } + + ptp->edev = edev; + ptp->ops = edev->ops->ptp; + if (!ptp->ops) { + DP_INFO(edev, "PTP enable failed\n"); + rc = -EIO; + goto err1; + } + + edev->ptp = ptp; + + rc = qede_ptp_init(edev, init_tc); + if (rc) + goto err1; + + qede_ptp_cfg_filters(edev); + + /* Fill the ptp_clock_info struct and register PTP clock */ + ptp->clock_info.owner = THIS_MODULE; + snprintf(ptp->clock_info.name, 16, "%s", edev->ndev->name); + ptp->clock_info.max_adj = QED_MAX_PHC_DRIFT_PPB; + ptp->clock_info.n_alarm = 0; + ptp->clock_info.n_ext_ts = 0; + ptp->clock_info.n_per_out = 0; + ptp->clock_info.pps = 0; + ptp->clock_info.adjfreq = qede_ptp_adjfreq; + ptp->clock_info.adjtime = qede_ptp_adjtime; + ptp->clock_info.gettime64 = qede_ptp_gettime; + ptp->clock_info.settime64 = qede_ptp_settime; + ptp->clock_info.enable = qede_ptp_ancillary_feature_enable; + + ptp->clock = ptp_clock_register(&ptp->clock_info, &edev->pdev->dev); + if (IS_ERR(ptp->clock)) { + rc = -EINVAL; + DP_ERR(edev, "PTP clock registration failed\n"); + goto err2; + } + + return 0; + +err2: + qede_ptp_disable(edev); + ptp->clock = NULL; +err1: + kfree(ptp); + edev->ptp = NULL; + + return rc; +} + +void qede_ptp_tx_ts(struct qede_dev *edev, struct sk_buff *skb) +{ + struct qede_ptp *ptp; + + ptp = edev->ptp; + if (!ptp) + return; + + if (test_and_set_bit_lock(QEDE_FLAGS_PTP_TX_IN_PRORGESS, &edev->flags)) + return; + + if (unlikely(!(edev->flags & QEDE_TX_TIMESTAMPING_EN))) { + DP_NOTICE(edev, + "Tx timestamping was not enabled, this packet will not be timestamped\n"); + } else if (unlikely(ptp->tx_skb)) { + DP_NOTICE(edev, + "The device supports only a single outstanding packet to timestamp, this packet will not be timestamped\n"); + } else { + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + /* schedule check for Tx timestamp */ + ptp->tx_skb = skb_get(skb); + schedule_work(&ptp->work); + } +} + +void qede_ptp_rx_ts(struct qede_dev *edev, struct sk_buff *skb) +{ + struct qede_ptp *ptp; + u64 timestamp, ns; + int rc; + + ptp = edev->ptp; + if (!ptp) + return; + + spin_lock_bh(&ptp->lock); + rc = ptp->ops->read_rx_ts(edev->cdev, ×tamp); + if (rc) { + spin_unlock_bh(&ptp->lock); + DP_INFO(edev, "Invalid Rx timestamp\n"); + return; + } + + ns = timecounter_cyc2time(&ptp->tc, timestamp); + spin_unlock_bh(&ptp->lock); + skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(ns); + DP_VERBOSE(edev, QED_MSG_DEBUG, + "Rx timestamp, timestamp cycles = %llu, ns = %llu\n", + timestamp, ns); +} diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.h b/drivers/net/ethernet/qlogic/qede/qede_ptp.h new file mode 100644 index 0000000..691a14c --- /dev/null +++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.h @@ -0,0 +1,63 @@ +/* QLogic qede NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
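qede_ptp_tx_ts() above enforces a "one Tx timestamp in flight" rule: a flag is atomically test-and-set before a packet is handed to the timestamping worker, and cleared again in qede_ptp_task() once the hardware timestamp has been read. The sketch below reproduces that guard with C11 atomics in user space; the function names are invented, and the real driver clears the bit from its workqueue handler rather than inline.

/* User-space sketch of the "single outstanding Tx timestamp" guard used
 * by qede_ptp_tx_ts()/qede_ptp_task(), using a C11 atomic flag.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag ts_in_progress = ATOMIC_FLAG_INIT;

/* Try to claim the single timestamping slot for this packet */
static bool demo_ptp_tx_ts(int pkt_id)
{
	if (atomic_flag_test_and_set(&ts_in_progress)) {
		printf("pkt %d: slot busy, not timestamped\n", pkt_id);
		return false;
	}
	printf("pkt %d: timestamping scheduled\n", pkt_id);
	return true;
}

/* Called once the hardware timestamp was read (the driver does this
 * from its workqueue handler).
 */
static void demo_ptp_task_done(void)
{
	atomic_flag_clear(&ts_in_progress);
}

int main(void)
{
	demo_ptp_tx_ts(1);	/* claims the slot */
	demo_ptp_tx_ts(2);	/* rejected: slot still busy */
	demo_ptp_task_done();	/* worker finished, slot free again */
	demo_ptp_tx_ts(3);	/* claims the slot again */
	return 0;
}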
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _QEDE_PTP_H_ +#define _QEDE_PTP_H_ + +#include <linux/ptp_clock_kernel.h> +#include <linux/net_tstamp.h> +#include <linux/timecounter.h> +#include "qede.h" + +void qede_ptp_rx_ts(struct qede_dev *edev, struct sk_buff *skb); +void qede_ptp_tx_ts(struct qede_dev *edev, struct sk_buff *skb); +int qede_ptp_hw_ts(struct qede_dev *edev, struct ifreq *req); +void qede_ptp_disable(struct qede_dev *edev); +int qede_ptp_enable(struct qede_dev *edev, bool init_tc); +int qede_ptp_get_ts_info(struct qede_dev *edev, struct ethtool_ts_info *ts); + +static inline void qede_ptp_record_rx_ts(struct qede_dev *edev, + union eth_rx_cqe *cqe, + struct sk_buff *skb) +{ + /* Check if this packet was timestamped */ + if (unlikely(le16_to_cpu(cqe->fast_path_regular.pars_flags.flags) & + (1 << PARSING_AND_ERR_FLAGS_TIMESTAMPRECORDED_SHIFT))) { + if (likely(le16_to_cpu(cqe->fast_path_regular.pars_flags.flags) + & (1 << PARSING_AND_ERR_FLAGS_TIMESYNCPKT_SHIFT))) { + qede_ptp_rx_ts(edev, skb); + } else { + DP_INFO(edev, + "Timestamp recorded for non PTP packets\n"); + } + } +} +#endif /* _QEDE_PTP_H_ */ diff --git a/drivers/net/ethernet/qlogic/qede/qede_rdma.c b/drivers/net/ethernet/qlogic/qede/qede_rdma.c new file mode 100644 index 0000000..1900bf7 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qede/qede_rdma.c @@ -0,0 +1,314 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include "qede.h" + +static struct qedr_driver *qedr_drv; +static LIST_HEAD(qedr_dev_list); +static DEFINE_MUTEX(qedr_dev_list_lock); + +bool qede_rdma_supported(struct qede_dev *dev) +{ + return dev->dev_info.common.rdma_supported; +} + +static void _qede_rdma_dev_add(struct qede_dev *edev) +{ + if (!qedr_drv) + return; + + edev->rdma_info.qedr_dev = qedr_drv->add(edev->cdev, edev->pdev, + edev->ndev); +} + +static int qede_rdma_create_wq(struct qede_dev *edev) +{ + INIT_LIST_HEAD(&edev->rdma_info.rdma_event_list); + edev->rdma_info.rdma_wq = create_singlethread_workqueue("rdma_wq"); + if (!edev->rdma_info.rdma_wq) { + DP_NOTICE(edev, "qedr: Could not create workqueue\n"); + return -ENOMEM; + } + + return 0; +} + +static void qede_rdma_cleanup_event(struct qede_dev *edev) +{ + struct list_head *head = &edev->rdma_info.rdma_event_list; + struct qede_rdma_event_work *event_node; + + flush_workqueue(edev->rdma_info.rdma_wq); + while (!list_empty(head)) { + event_node = list_entry(head->next, struct qede_rdma_event_work, + list); + cancel_work_sync(&event_node->work); + list_del(&event_node->list); + kfree(event_node); + } +} + +static void qede_rdma_destroy_wq(struct qede_dev *edev) +{ + qede_rdma_cleanup_event(edev); + destroy_workqueue(edev->rdma_info.rdma_wq); +} + +int qede_rdma_dev_add(struct qede_dev *edev) +{ + int rc = 0; + + if (qede_rdma_supported(edev)) { + rc = qede_rdma_create_wq(edev); + if (rc) + return rc; + + INIT_LIST_HEAD(&edev->rdma_info.entry); + mutex_lock(&qedr_dev_list_lock); + list_add_tail(&edev->rdma_info.entry, &qedr_dev_list); + _qede_rdma_dev_add(edev); + mutex_unlock(&qedr_dev_list_lock); + } + + return rc; +} + +static void _qede_rdma_dev_remove(struct qede_dev *edev) +{ + if (qedr_drv && qedr_drv->remove && edev->rdma_info.qedr_dev) + qedr_drv->remove(edev->rdma_info.qedr_dev); + edev->rdma_info.qedr_dev = NULL; +} + +void qede_rdma_dev_remove(struct qede_dev *edev) +{ + if (!qede_rdma_supported(edev)) + return; + + qede_rdma_destroy_wq(edev); + mutex_lock(&qedr_dev_list_lock); + _qede_rdma_dev_remove(edev); + list_del(&edev->rdma_info.entry); + mutex_unlock(&qedr_dev_list_lock); +} + +static void _qede_rdma_dev_open(struct qede_dev *edev) +{ + if (qedr_drv && edev->rdma_info.qedr_dev && qedr_drv->notify) + qedr_drv->notify(edev->rdma_info.qedr_dev, QEDE_UP); +} + +static void qede_rdma_dev_open(struct qede_dev *edev) +{ + if (!qede_rdma_supported(edev)) + return; + + mutex_lock(&qedr_dev_list_lock); + _qede_rdma_dev_open(edev); + mutex_unlock(&qedr_dev_list_lock); +} + +static void _qede_rdma_dev_close(struct qede_dev *edev) +{ + if (qedr_drv && edev->rdma_info.qedr_dev && qedr_drv->notify) + qedr_drv->notify(edev->rdma_info.qedr_dev, QEDE_DOWN); +} + +static void qede_rdma_dev_close(struct qede_dev *edev) +{ + if (!qede_rdma_supported(edev)) + return; + + mutex_lock(&qedr_dev_list_lock); + _qede_rdma_dev_close(edev); + mutex_unlock(&qedr_dev_list_lock); +} + +static void qede_rdma_dev_shutdown(struct qede_dev 
*edev) +{ + if (!qede_rdma_supported(edev)) + return; + + mutex_lock(&qedr_dev_list_lock); + if (qedr_drv && edev->rdma_info.qedr_dev && qedr_drv->notify) + qedr_drv->notify(edev->rdma_info.qedr_dev, QEDE_CLOSE); + mutex_unlock(&qedr_dev_list_lock); +} + +int qede_rdma_register_driver(struct qedr_driver *drv) +{ + struct qede_dev *edev; + u8 qedr_counter = 0; + + mutex_lock(&qedr_dev_list_lock); + if (qedr_drv) { + mutex_unlock(&qedr_dev_list_lock); + return -EINVAL; + } + qedr_drv = drv; + + list_for_each_entry(edev, &qedr_dev_list, rdma_info.entry) { + struct net_device *ndev; + + qedr_counter++; + _qede_rdma_dev_add(edev); + ndev = edev->ndev; + if (netif_running(ndev) && netif_oper_up(ndev)) + _qede_rdma_dev_open(edev); + } + mutex_unlock(&qedr_dev_list_lock); + + pr_notice("qedr: discovered and registered %d RDMA funcs\n", + qedr_counter); + + return 0; +} +EXPORT_SYMBOL(qede_rdma_register_driver); + +void qede_rdma_unregister_driver(struct qedr_driver *drv) +{ + struct qede_dev *edev; + + mutex_lock(&qedr_dev_list_lock); + list_for_each_entry(edev, &qedr_dev_list, rdma_info.entry) { + if (edev->rdma_info.qedr_dev) + _qede_rdma_dev_remove(edev); + } + qedr_drv = NULL; + mutex_unlock(&qedr_dev_list_lock); +} +EXPORT_SYMBOL(qede_rdma_unregister_driver); + +static void qede_rdma_changeaddr(struct qede_dev *edev) +{ + if (!qede_rdma_supported(edev)) + return; + + if (qedr_drv && edev->rdma_info.qedr_dev && qedr_drv->notify) + qedr_drv->notify(edev->rdma_info.qedr_dev, QEDE_CHANGE_ADDR); +} + +static struct qede_rdma_event_work * +qede_rdma_get_free_event_node(struct qede_dev *edev) +{ + struct qede_rdma_event_work *event_node = NULL; + struct list_head *list_node = NULL; + bool found = false; + + list_for_each(list_node, &edev->rdma_info.rdma_event_list) { + event_node = list_entry(list_node, struct qede_rdma_event_work, + list); + if (!work_pending(&event_node->work)) { + found = true; + break; + } + } + + if (!found) { + event_node = kzalloc(sizeof(*event_node), GFP_ATOMIC); + if (!event_node) { + DP_NOTICE(edev, + "qedr: Could not allocate memory for rdma work\n"); + return NULL; + } + list_add_tail(&event_node->list, + &edev->rdma_info.rdma_event_list); + } + + return event_node; +} + +static void qede_rdma_handle_event(struct work_struct *work) +{ + struct qede_rdma_event_work *event_node; + enum qede_rdma_event event; + struct qede_dev *edev; + + event_node = container_of(work, struct qede_rdma_event_work, work); + event = event_node->event; + edev = event_node->ptr; + + switch (event) { + case QEDE_UP: + qede_rdma_dev_open(edev); + break; + case QEDE_DOWN: + qede_rdma_dev_close(edev); + break; + case QEDE_CLOSE: + qede_rdma_dev_shutdown(edev); + break; + case QEDE_CHANGE_ADDR: + qede_rdma_changeaddr(edev); + break; + default: + DP_NOTICE(edev, "Invalid rdma event %d", event); + } +} + +static void qede_rdma_add_event(struct qede_dev *edev, + enum qede_rdma_event event) +{ + struct qede_rdma_event_work *event_node; + + if (!edev->rdma_info.qedr_dev) + return; + + event_node = qede_rdma_get_free_event_node(edev); + if (!event_node) + return; + + event_node->event = event; + event_node->ptr = edev; + + INIT_WORK(&event_node->work, qede_rdma_handle_event); + queue_work(edev->rdma_info.rdma_wq, &event_node->work); +} + +void qede_rdma_dev_event_open(struct qede_dev *edev) +{ + qede_rdma_add_event(edev, QEDE_UP); +} + +void qede_rdma_dev_event_close(struct qede_dev *edev) +{ + qede_rdma_add_event(edev, QEDE_DOWN); +} + +void qede_rdma_event_changeaddr(struct qede_dev *edev) +{ + 
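/* Queued on the rdma workqueue; qede_rdma_handle_event() forwards QEDE_CHANGE_ADDR to the registered qedr driver. */ +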
qede_rdma_add_event(edev, QEDE_CHANGE_ADDR); +} diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c new file mode 100644 index 0000000..b48f761 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -0,0 +1,3949 @@ +/* + * QLogic QLA3xxx NIC HBA Driver + * Copyright (c) 2003-2006 QLogic Corporation + * + * See LICENSE.qla3xxx for copyright and licensing details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qla3xxx.h" + +#define DRV_NAME "qla3xxx" +#define DRV_STRING "QLogic ISP3XXX Network Driver" +#define DRV_VERSION "v2.03.00-k5" + +static const char ql3xxx_driver_name[] = DRV_NAME; +static const char ql3xxx_driver_version[] = DRV_VERSION; + +#define TIMED_OUT_MSG \ +"Timed out waiting for management port to get free before issuing command\n" + +MODULE_AUTHOR("QLogic Corporation"); +MODULE_DESCRIPTION("QLogic ISP3XXX Network Driver " DRV_VERSION " "); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); + +static const u32 default_msg + = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK + | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN; + +static int debug = -1; /* defaults above */ +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); + +static int msi; +module_param(msi, int, 0); +MODULE_PARM_DESC(msi, "Turn on Message Signaled Interrupts."); + +static const struct pci_device_id ql3xxx_pci_tbl[] = { + {PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, QL3022_DEVICE_ID)}, + {PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, QL3032_DEVICE_ID)}, + /* required last entry */ + {0,} +}; + +MODULE_DEVICE_TABLE(pci, ql3xxx_pci_tbl); + +/* + * These are the known PHY's which are used + */ +enum PHY_DEVICE_TYPE { + PHY_TYPE_UNKNOWN = 0, + PHY_VITESSE_VSC8211, + PHY_AGERE_ET1011C, + MAX_PHY_DEV_TYPES +}; + +struct PHY_DEVICE_INFO { + const enum PHY_DEVICE_TYPE phyDevice; + const u32 phyIdOUI; + const u16 phyIdModel; + const char *name; +}; + +static const struct PHY_DEVICE_INFO PHY_DEVICES[] = { + {PHY_TYPE_UNKNOWN, 0x000000, 0x0, "PHY_TYPE_UNKNOWN"}, + {PHY_VITESSE_VSC8211, 0x0003f1, 0xb, "PHY_VITESSE_VSC8211"}, + {PHY_AGERE_ET1011C, 0x00a0bc, 0x1, "PHY_AGERE_ET1011C"}, +}; + + +/* + * Caller must take hw_lock. 
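+ * The semaphore register carries a write-enable mask in its upper 16 bits; + * writing (sem_mask | sem_bits) and reading the register back shows whether + * our ownership bits took effect, i.e. whether the semaphore was acquired.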
+ */ +static int ql_sem_spinlock(struct ql3_adapter *qdev, + u32 sem_mask, u32 sem_bits) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 value; + unsigned int seconds = 3; + + do { + writel((sem_mask | sem_bits), + &port_regs->CommonRegs.semaphoreReg); + value = readl(&port_regs->CommonRegs.semaphoreReg); + if ((value & (sem_mask >> 16)) == sem_bits) + return 0; + ssleep(1); + } while (--seconds); + return -1; +} + +static void ql_sem_unlock(struct ql3_adapter *qdev, u32 sem_mask) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + writel(sem_mask, &port_regs->CommonRegs.semaphoreReg); + readl(&port_regs->CommonRegs.semaphoreReg); +} + +static int ql_sem_lock(struct ql3_adapter *qdev, u32 sem_mask, u32 sem_bits) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 value; + + writel((sem_mask | sem_bits), &port_regs->CommonRegs.semaphoreReg); + value = readl(&port_regs->CommonRegs.semaphoreReg); + return ((value & (sem_mask >> 16)) == sem_bits); +} + +/* + * Caller holds hw_lock. + */ +static int ql_wait_for_drvr_lock(struct ql3_adapter *qdev) +{ + int i = 0; + + do { + if (ql_sem_lock(qdev, + QL_DRVR_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | (qdev->mac_index) + * 2) << 1)) { + netdev_printk(KERN_DEBUG, qdev->ndev, + "driver lock acquired\n"); + return 1; + } + ssleep(1); + } while (++i < 10); + + netdev_err(qdev->ndev, "Timed out waiting for driver lock...\n"); + return 0; +} + +static void ql_set_register_page(struct ql3_adapter *qdev, u32 page) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + writel(((ISP_CONTROL_NP_MASK << 16) | page), + &port_regs->CommonRegs.ispControlStatus); + readl(&port_regs->CommonRegs.ispControlStatus); + qdev->current_page = page; +} + +static u32 ql_read_common_reg_l(struct ql3_adapter *qdev, u32 __iomem *reg) +{ + u32 value; + unsigned long hw_flags; + + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + value = readl(reg); + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + + return value; +} + +static u32 ql_read_common_reg(struct ql3_adapter *qdev, u32 __iomem *reg) +{ + return readl(reg); +} + +static u32 ql_read_page0_reg_l(struct ql3_adapter *qdev, u32 __iomem *reg) +{ + u32 value; + unsigned long hw_flags; + + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + + if (qdev->current_page != 0) + ql_set_register_page(qdev, 0); + value = readl(reg); + + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return value; +} + +static u32 ql_read_page0_reg(struct ql3_adapter *qdev, u32 __iomem *reg) +{ + if (qdev->current_page != 0) + ql_set_register_page(qdev, 0); + return readl(reg); +} + +static void ql_write_common_reg_l(struct ql3_adapter *qdev, + u32 __iomem *reg, u32 value) +{ + unsigned long hw_flags; + + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + writel(value, reg); + readl(reg); + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); +} + +static void ql_write_common_reg(struct ql3_adapter *qdev, + u32 __iomem *reg, u32 value) +{ + writel(value, reg); + readl(reg); +} + +static void ql_write_nvram_reg(struct ql3_adapter *qdev, + u32 __iomem *reg, u32 value) +{ + writel(value, reg); + readl(reg); + udelay(1); +} + +static void ql_write_page0_reg(struct ql3_adapter *qdev, + u32 __iomem *reg, u32 value) +{ + if (qdev->current_page != 0) + ql_set_register_page(qdev, 0); + writel(value, reg); + readl(reg); +} + +/* + * Caller holds hw_lock. Only called during init. 
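+ * The ISP exposes several register pages behind one mapping; the paged + * accessors switch pages through ql_set_register_page(), which is why they + * must be serialized under hw_lock.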
+ */ +static void ql_write_page1_reg(struct ql3_adapter *qdev, + u32 __iomem *reg, u32 value) +{ + if (qdev->current_page != 1) + ql_set_register_page(qdev, 1); + writel(value, reg); + readl(reg); +} + +/* + * Caller holds hw_lock. Only called during init. + */ +static void ql_write_page2_reg(struct ql3_adapter *qdev, + u32 __iomem *reg, u32 value) +{ + if (qdev->current_page != 2) + ql_set_register_page(qdev, 2); + writel(value, reg); + readl(reg); +} + +static void ql_disable_interrupts(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + ql_write_common_reg_l(qdev, &port_regs->CommonRegs.ispInterruptMaskReg, + (ISP_IMR_ENABLE_INT << 16)); + +} + +static void ql_enable_interrupts(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + ql_write_common_reg_l(qdev, &port_regs->CommonRegs.ispInterruptMaskReg, + ((0xff << 16) | ISP_IMR_ENABLE_INT)); + +} + +static void ql_release_to_lrg_buf_free_list(struct ql3_adapter *qdev, + struct ql_rcv_buf_cb *lrg_buf_cb) +{ + dma_addr_t map; + int err; + lrg_buf_cb->next = NULL; + + if (qdev->lrg_buf_free_tail == NULL) { /* The list is empty */ + qdev->lrg_buf_free_head = qdev->lrg_buf_free_tail = lrg_buf_cb; + } else { + qdev->lrg_buf_free_tail->next = lrg_buf_cb; + qdev->lrg_buf_free_tail = lrg_buf_cb; + } + + if (!lrg_buf_cb->skb) { + lrg_buf_cb->skb = netdev_alloc_skb(qdev->ndev, + qdev->lrg_buffer_len); + if (unlikely(!lrg_buf_cb->skb)) { + qdev->lrg_buf_skb_check++; + } else { + /* + * We save some space to copy the ethhdr from first + * buffer + */ + skb_reserve(lrg_buf_cb->skb, QL_HEADER_SPACE); + map = pci_map_single(qdev->pdev, + lrg_buf_cb->skb->data, + qdev->lrg_buffer_len - + QL_HEADER_SPACE, + PCI_DMA_FROMDEVICE); + err = pci_dma_mapping_error(qdev->pdev, map); + if (err) { + netdev_err(qdev->ndev, + "PCI mapping failed with error: %d\n", + err); + dev_kfree_skb(lrg_buf_cb->skb); + lrg_buf_cb->skb = NULL; + + qdev->lrg_buf_skb_check++; + return; + } + + lrg_buf_cb->buf_phy_addr_low = + cpu_to_le32(LS_64BITS(map)); + lrg_buf_cb->buf_phy_addr_high = + cpu_to_le32(MS_64BITS(map)); + dma_unmap_addr_set(lrg_buf_cb, mapaddr, map); + dma_unmap_len_set(lrg_buf_cb, maplen, + qdev->lrg_buffer_len - + QL_HEADER_SPACE); + } + } + + qdev->lrg_buf_free_count++; +} + +static struct ql_rcv_buf_cb *ql_get_from_lrg_buf_free_list(struct ql3_adapter + *qdev) +{ + struct ql_rcv_buf_cb *lrg_buf_cb = qdev->lrg_buf_free_head; + + if (lrg_buf_cb != NULL) { + qdev->lrg_buf_free_head = lrg_buf_cb->next; + if (qdev->lrg_buf_free_head == NULL) + qdev->lrg_buf_free_tail = NULL; + qdev->lrg_buf_free_count--; + } + + return lrg_buf_cb; +} + +static u32 addrBits = EEPROM_NO_ADDR_BITS; +static u32 dataBits = EEPROM_NO_DATA_BITS; + +static void fm93c56a_deselect(struct ql3_adapter *qdev); +static void eeprom_readword(struct ql3_adapter *qdev, u32 eepromAddr, + unsigned short *value); + +/* + * Caller holds hw_lock. + */ +static void fm93c56a_select(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + __iomem u32 *spir = &port_regs->CommonRegs.serialPortInterfaceReg; + + qdev->eeprom_cmd_data = AUBURN_EEPROM_CS_1; + ql_write_nvram_reg(qdev, spir, ISP_NVRAM_MASK | qdev->eeprom_cmd_data); + ql_write_nvram_reg(qdev, spir, + ((ISP_NVRAM_MASK << 16) | qdev->eeprom_cmd_data)); +} + +/* + * Caller holds hw_lock. 
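+ * fm93c56a_cmd() bit-bangs a start bit, the FM93C56A opcode and the word + * address onto the serial port interface register, most significant bit + * first, toggling the EEPROM clock for every bit.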
+ */ +static void fm93c56a_cmd(struct ql3_adapter *qdev, u32 cmd, u32 eepromAddr) +{ + int i; + u32 mask; + u32 dataBit; + u32 previousBit; + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + __iomem u32 *spir = &port_regs->CommonRegs.serialPortInterfaceReg; + + /* Clock in a zero, then do the start bit */ + ql_write_nvram_reg(qdev, spir, + (ISP_NVRAM_MASK | qdev->eeprom_cmd_data | + AUBURN_EEPROM_DO_1)); + ql_write_nvram_reg(qdev, spir, + (ISP_NVRAM_MASK | qdev->eeprom_cmd_data | + AUBURN_EEPROM_DO_1 | AUBURN_EEPROM_CLK_RISE)); + ql_write_nvram_reg(qdev, spir, + (ISP_NVRAM_MASK | qdev->eeprom_cmd_data | + AUBURN_EEPROM_DO_1 | AUBURN_EEPROM_CLK_FALL)); + + mask = 1 << (FM93C56A_CMD_BITS - 1); + /* Force the previous data bit to be different */ + previousBit = 0xffff; + for (i = 0; i < FM93C56A_CMD_BITS; i++) { + dataBit = (cmd & mask) + ? AUBURN_EEPROM_DO_1 + : AUBURN_EEPROM_DO_0; + if (previousBit != dataBit) { + /* If the bit changed, change the DO state to match */ + ql_write_nvram_reg(qdev, spir, + (ISP_NVRAM_MASK | + qdev->eeprom_cmd_data | dataBit)); + previousBit = dataBit; + } + ql_write_nvram_reg(qdev, spir, + (ISP_NVRAM_MASK | qdev->eeprom_cmd_data | + dataBit | AUBURN_EEPROM_CLK_RISE)); + ql_write_nvram_reg(qdev, spir, + (ISP_NVRAM_MASK | qdev->eeprom_cmd_data | + dataBit | AUBURN_EEPROM_CLK_FALL)); + cmd = cmd << 1; + } + + mask = 1 << (addrBits - 1); + /* Force the previous data bit to be different */ + previousBit = 0xffff; + for (i = 0; i < addrBits; i++) { + dataBit = (eepromAddr & mask) ? AUBURN_EEPROM_DO_1 + : AUBURN_EEPROM_DO_0; + if (previousBit != dataBit) { + /* + * If the bit changed, then change the DO state to + * match + */ + ql_write_nvram_reg(qdev, spir, + (ISP_NVRAM_MASK | + qdev->eeprom_cmd_data | dataBit)); + previousBit = dataBit; + } + ql_write_nvram_reg(qdev, spir, + (ISP_NVRAM_MASK | qdev->eeprom_cmd_data | + dataBit | AUBURN_EEPROM_CLK_RISE)); + ql_write_nvram_reg(qdev, spir, + (ISP_NVRAM_MASK | qdev->eeprom_cmd_data | + dataBit | AUBURN_EEPROM_CLK_FALL)); + eepromAddr = eepromAddr << 1; + } +} + +/* + * Caller holds hw_lock. + */ +static void fm93c56a_deselect(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + __iomem u32 *spir = &port_regs->CommonRegs.serialPortInterfaceReg; + + qdev->eeprom_cmd_data = AUBURN_EEPROM_CS_0; + ql_write_nvram_reg(qdev, spir, ISP_NVRAM_MASK | qdev->eeprom_cmd_data); +} + +/* + * Caller holds hw_lock. + */ +static void fm93c56a_datain(struct ql3_adapter *qdev, unsigned short *value) +{ + int i; + u32 data = 0; + u32 dataBit; + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + __iomem u32 *spir = &port_regs->CommonRegs.serialPortInterfaceReg; + + /* Read the data bits */ + /* The first bit is a dummy. Clock right over it. */ + for (i = 0; i < dataBits; i++) { + ql_write_nvram_reg(qdev, spir, + ISP_NVRAM_MASK | qdev->eeprom_cmd_data | + AUBURN_EEPROM_CLK_RISE); + ql_write_nvram_reg(qdev, spir, + ISP_NVRAM_MASK | qdev->eeprom_cmd_data | + AUBURN_EEPROM_CLK_FALL); + dataBit = (ql_read_common_reg(qdev, spir) & + AUBURN_EEPROM_DI_1) ? 1 : 0; + data = (data << 1) | dataBit; + } + *value = (u16)data; +} + +/* + * Caller holds hw_lock. 
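+ * eeprom_readword() chains select, READ command, data-in and deselect to + * fetch one 16-bit word from the serial EEPROM.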
+ */ +static void eeprom_readword(struct ql3_adapter *qdev, + u32 eepromAddr, unsigned short *value) +{ + fm93c56a_select(qdev); + fm93c56a_cmd(qdev, (int)FM93C56A_READ, eepromAddr); + fm93c56a_datain(qdev, value); + fm93c56a_deselect(qdev); +} + +static void ql_set_mac_addr(struct net_device *ndev, u16 *addr) +{ + __le16 *p = (__le16 *)ndev->dev_addr; + p[0] = cpu_to_le16(addr[0]); + p[1] = cpu_to_le16(addr[1]); + p[2] = cpu_to_le16(addr[2]); +} + +static int ql_get_nvram_params(struct ql3_adapter *qdev) +{ + u16 *pEEPROMData; + u16 checksum = 0; + u32 index; + unsigned long hw_flags; + + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + + pEEPROMData = (u16 *)&qdev->nvram_data; + qdev->eeprom_cmd_data = 0; + if (ql_sem_spinlock(qdev, QL_NVRAM_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | (qdev->mac_index) * + 2) << 10)) { + pr_err("%s: Failed ql_sem_spinlock()\n", __func__); + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return -1; + } + + for (index = 0; index < EEPROM_SIZE; index++) { + eeprom_readword(qdev, index, pEEPROMData); + checksum += *pEEPROMData; + pEEPROMData++; + } + ql_sem_unlock(qdev, QL_NVRAM_SEM_MASK); + + if (checksum != 0) { + netdev_err(qdev->ndev, "checksum should be zero, is %x!!\n", + checksum); + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return -1; + } + + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return checksum; +} + +static const u32 PHYAddr[2] = { + PORT0_PHY_ADDRESS, PORT1_PHY_ADDRESS +}; + +static int ql_wait_for_mii_ready(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 temp; + int count = 1000; + + while (count) { + temp = ql_read_page0_reg(qdev, &port_regs->macMIIStatusReg); + if (!(temp & MAC_MII_STATUS_BSY)) + return 0; + udelay(10); + count--; + } + return -1; +} + +static void ql_mii_enable_scan_mode(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 scanControl; + + if (qdev->numPorts > 1) { + /* Auto scan will cycle through multiple ports */ + scanControl = MAC_MII_CONTROL_AS | MAC_MII_CONTROL_SC; + } else { + scanControl = MAC_MII_CONTROL_SC; + } + + /* + * Scan register 1 of PHY/PETBI, + * Set up to scan both devices + * The autoscan starts from the first register, completes + * the last one before rolling over to the first + */ + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtAddrReg, + PHYAddr[0] | MII_SCAN_REGISTER); + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtControlReg, + (scanControl) | + ((MAC_MII_CONTROL_SC | MAC_MII_CONTROL_AS) << 16)); +} + +static u8 ql_mii_disable_scan_mode(struct ql3_adapter *qdev) +{ + u8 ret; + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + /* See if scan mode is enabled before we turn it off */ + if (ql_read_page0_reg(qdev, &port_regs->macMIIMgmtControlReg) & + (MAC_MII_CONTROL_AS | MAC_MII_CONTROL_SC)) { + /* Scan is enabled */ + ret = 1; + } else { + /* Scan is disabled */ + ret = 0; + } + + /* + * When disabling scan mode you must first change the MII register + * address + */ + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtAddrReg, + PHYAddr[0] | MII_SCAN_REGISTER); + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtControlReg, + ((MAC_MII_CONTROL_SC | MAC_MII_CONTROL_AS | + MAC_MII_CONTROL_RC) << 16)); + + return ret; +} + +static int ql_mii_write_reg_ex(struct ql3_adapter *qdev, + u16 regAddr, u16 value, u32 phyAddr) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u8 scanWasEnabled; + + 
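/* Pause MII auto-scan so the management interface is free for this access; it is re-enabled below if it was running. */ +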
scanWasEnabled = ql_mii_disable_scan_mode(qdev); + + if (ql_wait_for_mii_ready(qdev)) { + netif_warn(qdev, link, qdev->ndev, TIMED_OUT_MSG); + return -1; + } + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtAddrReg, + phyAddr | regAddr); + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtDataReg, value); + + /* Wait for write to complete 9/10/04 SJP */ + if (ql_wait_for_mii_ready(qdev)) { + netif_warn(qdev, link, qdev->ndev, TIMED_OUT_MSG); + return -1; + } + + if (scanWasEnabled) + ql_mii_enable_scan_mode(qdev); + + return 0; +} + +static int ql_mii_read_reg_ex(struct ql3_adapter *qdev, u16 regAddr, + u16 *value, u32 phyAddr) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u8 scanWasEnabled; + u32 temp; + + scanWasEnabled = ql_mii_disable_scan_mode(qdev); + + if (ql_wait_for_mii_ready(qdev)) { + netif_warn(qdev, link, qdev->ndev, TIMED_OUT_MSG); + return -1; + } + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtAddrReg, + phyAddr | regAddr); + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtControlReg, + (MAC_MII_CONTROL_RC << 16)); + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtControlReg, + (MAC_MII_CONTROL_RC << 16) | MAC_MII_CONTROL_RC); + + /* Wait for the read to complete */ + if (ql_wait_for_mii_ready(qdev)) { + netif_warn(qdev, link, qdev->ndev, TIMED_OUT_MSG); + return -1; + } + + temp = ql_read_page0_reg(qdev, &port_regs->macMIIMgmtDataReg); + *value = (u16) temp; + + if (scanWasEnabled) + ql_mii_enable_scan_mode(qdev); + + return 0; +} + +static int ql_mii_write_reg(struct ql3_adapter *qdev, u16 regAddr, u16 value) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + ql_mii_disable_scan_mode(qdev); + + if (ql_wait_for_mii_ready(qdev)) { + netif_warn(qdev, link, qdev->ndev, TIMED_OUT_MSG); + return -1; + } + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtAddrReg, + qdev->PHYAddr | regAddr); + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtDataReg, value); + + /* Wait for write to complete. 
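+ * The management interface keeps MAC_MII_STATUS_BSY asserted until the + * write cycle has finished.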
*/ + if (ql_wait_for_mii_ready(qdev)) { + netif_warn(qdev, link, qdev->ndev, TIMED_OUT_MSG); + return -1; + } + + ql_mii_enable_scan_mode(qdev); + + return 0; +} + +static int ql_mii_read_reg(struct ql3_adapter *qdev, u16 regAddr, u16 *value) +{ + u32 temp; + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + ql_mii_disable_scan_mode(qdev); + + if (ql_wait_for_mii_ready(qdev)) { + netif_warn(qdev, link, qdev->ndev, TIMED_OUT_MSG); + return -1; + } + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtAddrReg, + qdev->PHYAddr | regAddr); + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtControlReg, + (MAC_MII_CONTROL_RC << 16)); + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtControlReg, + (MAC_MII_CONTROL_RC << 16) | MAC_MII_CONTROL_RC); + + /* Wait for the read to complete */ + if (ql_wait_for_mii_ready(qdev)) { + netif_warn(qdev, link, qdev->ndev, TIMED_OUT_MSG); + return -1; + } + + temp = ql_read_page0_reg(qdev, &port_regs->macMIIMgmtDataReg); + *value = (u16) temp; + + ql_mii_enable_scan_mode(qdev); + + return 0; +} + +static void ql_petbi_reset(struct ql3_adapter *qdev) +{ + ql_mii_write_reg(qdev, PETBI_CONTROL_REG, PETBI_CTRL_SOFT_RESET); +} + +static void ql_petbi_start_neg(struct ql3_adapter *qdev) +{ + u16 reg; + + /* Enable Auto-negotiation sense */ + ql_mii_read_reg(qdev, PETBI_TBI_CTRL, &reg); + reg |= PETBI_TBI_AUTO_SENSE; + ql_mii_write_reg(qdev, PETBI_TBI_CTRL, reg); + + ql_mii_write_reg(qdev, PETBI_NEG_ADVER, + PETBI_NEG_PAUSE | PETBI_NEG_DUPLEX); + + ql_mii_write_reg(qdev, PETBI_CONTROL_REG, + PETBI_CTRL_AUTO_NEG | PETBI_CTRL_RESTART_NEG | + PETBI_CTRL_FULL_DUPLEX | PETBI_CTRL_SPEED_1000); + +} + +static void ql_petbi_reset_ex(struct ql3_adapter *qdev) +{ + ql_mii_write_reg_ex(qdev, PETBI_CONTROL_REG, PETBI_CTRL_SOFT_RESET, + PHYAddr[qdev->mac_index]); +} + +static void ql_petbi_start_neg_ex(struct ql3_adapter *qdev) +{ + u16 reg; + + /* Enable Auto-negotiation sense */ + ql_mii_read_reg_ex(qdev, PETBI_TBI_CTRL, &reg, + PHYAddr[qdev->mac_index]); + reg |= PETBI_TBI_AUTO_SENSE; + ql_mii_write_reg_ex(qdev, PETBI_TBI_CTRL, reg, + PHYAddr[qdev->mac_index]); + + ql_mii_write_reg_ex(qdev, PETBI_NEG_ADVER, + PETBI_NEG_PAUSE | PETBI_NEG_DUPLEX, + PHYAddr[qdev->mac_index]); + + ql_mii_write_reg_ex(qdev, PETBI_CONTROL_REG, + PETBI_CTRL_AUTO_NEG | PETBI_CTRL_RESTART_NEG | + PETBI_CTRL_FULL_DUPLEX | PETBI_CTRL_SPEED_1000, + PHYAddr[qdev->mac_index]); +} + +static void ql_petbi_init(struct ql3_adapter *qdev) +{ + ql_petbi_reset(qdev); + ql_petbi_start_neg(qdev); +} + +static void ql_petbi_init_ex(struct ql3_adapter *qdev) +{ + ql_petbi_reset_ex(qdev); + ql_petbi_start_neg_ex(qdev); +} + +static int ql_is_petbi_neg_pause(struct ql3_adapter *qdev) +{ + u16 reg; + + if (ql_mii_read_reg(qdev, PETBI_NEG_PARTNER, &reg) < 0) + return 0; + + return (reg & PETBI_NEG_PAUSE_MASK) == PETBI_NEG_PAUSE; +} + +static void phyAgereSpecificInit(struct ql3_adapter *qdev, u32 miiAddr) +{ + netdev_info(qdev->ndev, "enabling Agere specific PHY\n"); + /* power down device bit 11 = 1 */ + ql_mii_write_reg_ex(qdev, 0x00, 0x1940, miiAddr); + /* enable diagnostic mode bit 2 = 1 */ + ql_mii_write_reg_ex(qdev, 0x12, 0x840e, miiAddr); + /* 1000MB amplitude adjust (see Agere errata) */ + ql_mii_write_reg_ex(qdev, 0x10, 0x8805, miiAddr); + /* 1000MB amplitude adjust (see Agere errata) */ + ql_mii_write_reg_ex(qdev, 0x11, 0xf03e, miiAddr); + /* 100MB amplitude adjust (see Agere errata) */ + ql_mii_write_reg_ex(qdev, 0x10, 0x8806, miiAddr); + /* 100MB amplitude adjust (see Agere errata) */ +
ql_mii_write_reg_ex(qdev, 0x11, 0x003e, miiAddr); + /* 10MB amplitude adjust (see Agere errata) */ + ql_mii_write_reg_ex(qdev, 0x10, 0x8807, miiAddr); + /* 10MB amplitude adjust (see Agere errata) */ + ql_mii_write_reg_ex(qdev, 0x11, 0x1f00, miiAddr); + /* point to hidden reg 0x2806 */ + ql_mii_write_reg_ex(qdev, 0x10, 0x2806, miiAddr); + /* Write new PHYAD w/bit 5 set */ + ql_mii_write_reg_ex(qdev, 0x11, + 0x0020 | (PHYAddr[qdev->mac_index] >> 8), miiAddr); + /* + * Disable diagnostic mode bit 2 = 0 + * Power up device bit 11 = 0 + * Link up (on) and activity (blink) + */ + ql_mii_write_reg(qdev, 0x12, 0x840a); + ql_mii_write_reg(qdev, 0x00, 0x1140); + ql_mii_write_reg(qdev, 0x1c, 0xfaf0); +} + +static enum PHY_DEVICE_TYPE getPhyType(struct ql3_adapter *qdev, + u16 phyIdReg0, u16 phyIdReg1) +{ + enum PHY_DEVICE_TYPE result = PHY_TYPE_UNKNOWN; + u32 oui; + u16 model; + int i; + + if (phyIdReg0 == 0xffff) + return result; + + if (phyIdReg1 == 0xffff) + return result; + + /* oui is split between two registers */ + oui = (phyIdReg0 << 6) | ((phyIdReg1 & PHY_OUI_1_MASK) >> 10); + + model = (phyIdReg1 & PHY_MODEL_MASK) >> 4; + + /* Scan table for this PHY */ + for (i = 0; i < MAX_PHY_DEV_TYPES; i++) { + if ((oui == PHY_DEVICES[i].phyIdOUI) && + (model == PHY_DEVICES[i].phyIdModel)) { + netdev_info(qdev->ndev, "Phy: %s\n", + PHY_DEVICES[i].name); + result = PHY_DEVICES[i].phyDevice; + break; + } + } + + return result; +} + +static int ql_phy_get_speed(struct ql3_adapter *qdev) +{ + u16 reg; + + switch (qdev->phyType) { + case PHY_AGERE_ET1011C: { + if (ql_mii_read_reg(qdev, 0x1A, &reg) < 0) + return 0; + + reg = (reg >> 8) & 3; + break; + } + default: + if (ql_mii_read_reg(qdev, AUX_CONTROL_STATUS, &reg) < 0) + return 0; + + reg = (((reg & 0x18) >> 3) & 3); + } + + switch (reg) { + case 2: + return SPEED_1000; + case 1: + return SPEED_100; + case 0: + return SPEED_10; + default: + return -1; + } +} + +static int ql_is_full_dup(struct ql3_adapter *qdev) +{ + u16 reg; + + switch (qdev->phyType) { + case PHY_AGERE_ET1011C: { + if (ql_mii_read_reg(qdev, 0x1A, &reg)) + return 0; + + return ((reg & 0x0080) && (reg & 0x1000)) != 0; + } + case PHY_VITESSE_VSC8211: + default: { + if (ql_mii_read_reg(qdev, AUX_CONTROL_STATUS, &reg) < 0) + return 0; + return (reg & PHY_AUX_DUPLEX_STAT) != 0; + } + } +} + +static int ql_is_phy_neg_pause(struct ql3_adapter *qdev) +{ + u16 reg; + + if (ql_mii_read_reg(qdev, PHY_NEG_PARTNER, &reg) < 0) + return 0; + + return (reg & PHY_NEG_PAUSE) != 0; +} + +static int PHY_Setup(struct ql3_adapter *qdev) +{ + u16 reg1; + u16 reg2; + bool agereAddrChangeNeeded = false; + u32 miiAddr = 0; + int err; + + /* Determine the PHY we are using by reading the ID's */ + err = ql_mii_read_reg(qdev, PHY_ID_0_REG, &reg1); + if (err != 0) { + netdev_err(qdev->ndev, "Could not read from reg PHY_ID_0_REG\n"); + return err; + } + + err = ql_mii_read_reg(qdev, PHY_ID_1_REG, &reg2); + if (err != 0) { + netdev_err(qdev->ndev, "Could not read from reg PHY_ID_1_REG\n"); + return err; + } + + /* Check if we have a Agere PHY */ + if ((reg1 == 0xffff) || (reg2 == 0xffff)) { + + /* Determine which MII address we should be using + determined by the index of the card */ + if (qdev->mac_index == 0) + miiAddr = MII_AGERE_ADDR_1; + else + miiAddr = MII_AGERE_ADDR_2; + + err = ql_mii_read_reg_ex(qdev, PHY_ID_0_REG, &reg1, miiAddr); + if (err != 0) { + netdev_err(qdev->ndev, + "Could not read from reg PHY_ID_0_REG after Agere detected\n"); + return err; + } + + err = ql_mii_read_reg_ex(qdev, PHY_ID_1_REG, &reg2, miiAddr); + if (err !=
0) { + netdev_err(qdev->ndev, "Could not read from reg PHY_ID_1_REG after Agere detected\n"); + return err; + } + + /* We need to remember to initialize the Agere PHY */ + agereAddrChangeNeeded = true; + } + + /* Determine the particular PHY we have on board to apply + PHY specific initializations */ + qdev->phyType = getPhyType(qdev, reg1, reg2); + + if ((qdev->phyType == PHY_AGERE_ET1011C) && agereAddrChangeNeeded) { + /* need this here so address gets changed */ + phyAgereSpecificInit(qdev, miiAddr); + } else if (qdev->phyType == PHY_TYPE_UNKNOWN) { + netdev_err(qdev->ndev, "PHY is unknown\n"); + return -EIO; + } + + return 0; +} + +/* + * Caller holds hw_lock. + */ +static void ql_mac_enable(struct ql3_adapter *qdev, u32 enable) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 value; + + if (enable) + value = (MAC_CONFIG_REG_PE | (MAC_CONFIG_REG_PE << 16)); + else + value = (MAC_CONFIG_REG_PE << 16); + + if (qdev->mac_index) + ql_write_page0_reg(qdev, &port_regs->mac1ConfigReg, value); + else + ql_write_page0_reg(qdev, &port_regs->mac0ConfigReg, value); +} + +/* + * Caller holds hw_lock. + */ +static void ql_mac_cfg_soft_reset(struct ql3_adapter *qdev, u32 enable) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 value; + + if (enable) + value = (MAC_CONFIG_REG_SR | (MAC_CONFIG_REG_SR << 16)); + else + value = (MAC_CONFIG_REG_SR << 16); + + if (qdev->mac_index) + ql_write_page0_reg(qdev, &port_regs->mac1ConfigReg, value); + else + ql_write_page0_reg(qdev, &port_regs->mac0ConfigReg, value); +} + +/* + * Caller holds hw_lock. + */ +static void ql_mac_cfg_gig(struct ql3_adapter *qdev, u32 enable) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 value; + + if (enable) + value = (MAC_CONFIG_REG_GM | (MAC_CONFIG_REG_GM << 16)); + else + value = (MAC_CONFIG_REG_GM << 16); + + if (qdev->mac_index) + ql_write_page0_reg(qdev, &port_regs->mac1ConfigReg, value); + else + ql_write_page0_reg(qdev, &port_regs->mac0ConfigReg, value); +} + +/* + * Caller holds hw_lock. + */ +static void ql_mac_cfg_full_dup(struct ql3_adapter *qdev, u32 enable) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 value; + + if (enable) + value = (MAC_CONFIG_REG_FD | (MAC_CONFIG_REG_FD << 16)); + else + value = (MAC_CONFIG_REG_FD << 16); + + if (qdev->mac_index) + ql_write_page0_reg(qdev, &port_regs->mac1ConfigReg, value); + else + ql_write_page0_reg(qdev, &port_regs->mac0ConfigReg, value); +} + +/* + * Caller holds hw_lock. + */ +static void ql_mac_cfg_pause(struct ql3_adapter *qdev, u32 enable) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 value; + + if (enable) + value = + ((MAC_CONFIG_REG_TF | MAC_CONFIG_REG_RF) | + ((MAC_CONFIG_REG_TF | MAC_CONFIG_REG_RF) << 16)); + else + value = ((MAC_CONFIG_REG_TF | MAC_CONFIG_REG_RF) << 16); + + if (qdev->mac_index) + ql_write_page0_reg(qdev, &port_regs->mac1ConfigReg, value); + else + ql_write_page0_reg(qdev, &port_regs->mac0ConfigReg, value); +} + +/* + * Caller holds hw_lock. 
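+ * ql_is_fiber() below tests the PORT_STATUS_SMx media bit for this + * function; the fiber (PETBI) and copper (PHY) code paths diverge on it.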
+ */ +static int ql_is_fiber(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 bitToCheck = 0; + u32 temp; + + switch (qdev->mac_index) { + case 0: + bitToCheck = PORT_STATUS_SM0; + break; + case 1: + bitToCheck = PORT_STATUS_SM1; + break; + } + + temp = ql_read_page0_reg(qdev, &port_regs->portStatus); + return (temp & bitToCheck) != 0; +} + +static int ql_is_auto_cfg(struct ql3_adapter *qdev) +{ + u16 reg; + ql_mii_read_reg(qdev, 0x00, &reg); + return (reg & 0x1000) != 0; +} + +/* + * Caller holds hw_lock. + */ +static int ql_is_auto_neg_complete(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 bitToCheck = 0; + u32 temp; + + switch (qdev->mac_index) { + case 0: + bitToCheck = PORT_STATUS_AC0; + break; + case 1: + bitToCheck = PORT_STATUS_AC1; + break; + } + + temp = ql_read_page0_reg(qdev, &port_regs->portStatus); + if (temp & bitToCheck) { + netif_info(qdev, link, qdev->ndev, "Auto-Negotiate complete\n"); + return 1; + } + netif_info(qdev, link, qdev->ndev, "Auto-Negotiate incomplete\n"); + return 0; +} + +/* + * ql_is_neg_pause() returns 1 if pause was negotiated to be on + */ +static int ql_is_neg_pause(struct ql3_adapter *qdev) +{ + if (ql_is_fiber(qdev)) + return ql_is_petbi_neg_pause(qdev); + else + return ql_is_phy_neg_pause(qdev); +} + +static int ql_auto_neg_error(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 bitToCheck = 0; + u32 temp; + + switch (qdev->mac_index) { + case 0: + bitToCheck = PORT_STATUS_AE0; + break; + case 1: + bitToCheck = PORT_STATUS_AE1; + break; + } + temp = ql_read_page0_reg(qdev, &port_regs->portStatus); + return (temp & bitToCheck) != 0; +} + +static u32 ql_get_link_speed(struct ql3_adapter *qdev) +{ + if (ql_is_fiber(qdev)) + return SPEED_1000; + else + return ql_phy_get_speed(qdev); +} + +static int ql_is_link_full_dup(struct ql3_adapter *qdev) +{ + if (ql_is_fiber(qdev)) + return 1; + else + return ql_is_full_dup(qdev); +} + +/* + * Caller holds hw_lock. + */ +static int ql_link_down_detect(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 bitToCheck = 0; + u32 temp; + + switch (qdev->mac_index) { + case 0: + bitToCheck = ISP_CONTROL_LINK_DN_0; + break; + case 1: + bitToCheck = ISP_CONTROL_LINK_DN_1; + break; + } + + temp = + ql_read_common_reg(qdev, &port_regs->CommonRegs.ispControlStatus); + return (temp & bitToCheck) != 0; +} + +/* + * Caller holds hw_lock. + */ +static int ql_link_down_detect_clear(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + switch (qdev->mac_index) { + case 0: + ql_write_common_reg(qdev, + &port_regs->CommonRegs.ispControlStatus, + (ISP_CONTROL_LINK_DN_0) | + (ISP_CONTROL_LINK_DN_0 << 16)); + break; + + case 1: + ql_write_common_reg(qdev, + &port_regs->CommonRegs.ispControlStatus, + (ISP_CONTROL_LINK_DN_1) | + (ISP_CONTROL_LINK_DN_1 << 16)); + break; + + default: + return 1; + } + + return 0; +} + +/* + * Caller holds hw_lock.
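+ * ql_this_adapter_controls_port() reports whether this function is the + * link master (PORT_STATUS_F1/F3_ENABLED); ql_get_phy_owner() caches the + * result in the QL_LINK_MASTER flag.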
+ */ +static int ql_this_adapter_controls_port(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 bitToCheck = 0; + u32 temp; + + switch (qdev->mac_index) { + case 0: + bitToCheck = PORT_STATUS_F1_ENABLED; + break; + case 1: + bitToCheck = PORT_STATUS_F3_ENABLED; + break; + default: + break; + } + + temp = ql_read_page0_reg(qdev, &port_regs->portStatus); + if (temp & bitToCheck) { + netif_printk(qdev, link, KERN_DEBUG, qdev->ndev, + "not link master\n"); + return 0; + } + + netif_printk(qdev, link, KERN_DEBUG, qdev->ndev, "link master\n"); + return 1; +} + +static void ql_phy_reset_ex(struct ql3_adapter *qdev) +{ + ql_mii_write_reg_ex(qdev, CONTROL_REG, PHY_CTRL_SOFT_RESET, + PHYAddr[qdev->mac_index]); +} + +static void ql_phy_start_neg_ex(struct ql3_adapter *qdev) +{ + u16 reg; + u16 portConfiguration; + + if (qdev->phyType == PHY_AGERE_ET1011C) + ql_mii_write_reg(qdev, 0x13, 0x0000); + /* turn off external loopback */ + + if (qdev->mac_index == 0) + portConfiguration = + qdev->nvram_data.macCfg_port0.portConfiguration; + else + portConfiguration = + qdev->nvram_data.macCfg_port1.portConfiguration; + + /* Some HBA's in the field are set to 0 and they need to + be reinterpreted with a default value */ + if (portConfiguration == 0) + portConfiguration = PORT_CONFIG_DEFAULT; + + /* Set the 1000 advertisements */ + ql_mii_read_reg_ex(qdev, PHY_GIG_CONTROL, &reg, + PHYAddr[qdev->mac_index]); + reg &= ~PHY_GIG_ALL_PARAMS; + + if (portConfiguration & PORT_CONFIG_1000MB_SPEED) { + if (portConfiguration & PORT_CONFIG_FULL_DUPLEX_ENABLED) + reg |= PHY_GIG_ADV_1000F; + else + reg |= PHY_GIG_ADV_1000H; + } + + ql_mii_write_reg_ex(qdev, PHY_GIG_CONTROL, reg, + PHYAddr[qdev->mac_index]); + + /* Set the 10/100 & pause negotiation advertisements */ + ql_mii_read_reg_ex(qdev, PHY_NEG_ADVER, &reg, + PHYAddr[qdev->mac_index]); + reg &= ~PHY_NEG_ALL_PARAMS; + + if (portConfiguration & PORT_CONFIG_SYM_PAUSE_ENABLED) + reg |= PHY_NEG_ASY_PAUSE | PHY_NEG_SYM_PAUSE; + + if (portConfiguration & PORT_CONFIG_FULL_DUPLEX_ENABLED) { + if (portConfiguration & PORT_CONFIG_100MB_SPEED) + reg |= PHY_NEG_ADV_100F; + + if (portConfiguration & PORT_CONFIG_10MB_SPEED) + reg |= PHY_NEG_ADV_10F; + } + + if (portConfiguration & PORT_CONFIG_HALF_DUPLEX_ENABLED) { + if (portConfiguration & PORT_CONFIG_100MB_SPEED) + reg |= PHY_NEG_ADV_100H; + + if (portConfiguration & PORT_CONFIG_10MB_SPEED) + reg |= PHY_NEG_ADV_10H; + } + + if (portConfiguration & PORT_CONFIG_1000MB_SPEED) + reg |= 1; + + ql_mii_write_reg_ex(qdev, PHY_NEG_ADVER, reg, + PHYAddr[qdev->mac_index]); + + ql_mii_read_reg_ex(qdev, CONTROL_REG, &reg, PHYAddr[qdev->mac_index]); + + ql_mii_write_reg_ex(qdev, CONTROL_REG, + reg | PHY_CTRL_RESTART_NEG | PHY_CTRL_AUTO_NEG, + PHYAddr[qdev->mac_index]); +} + +static void ql_phy_init_ex(struct ql3_adapter *qdev) +{ + ql_phy_reset_ex(qdev); + PHY_Setup(qdev); + ql_phy_start_neg_ex(qdev); +} + +/* + * Caller holds hw_lock.
+ */ +static u32 ql_get_link_state(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 bitToCheck = 0; + u32 temp, linkState; + + switch (qdev->mac_index) { + case 0: + bitToCheck = PORT_STATUS_UP0; + break; + case 1: + bitToCheck = PORT_STATUS_UP1; + break; + } + + temp = ql_read_page0_reg(qdev, &port_regs->portStatus); + if (temp & bitToCheck) + linkState = LS_UP; + else + linkState = LS_DOWN; + + return linkState; +} + +static int ql_port_start(struct ql3_adapter *qdev) +{ + if (ql_sem_spinlock(qdev, QL_PHY_GIO_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | (qdev->mac_index) * + 2) << 7)) { + netdev_err(qdev->ndev, "Could not get hw lock for GIO\n"); + return -1; + } + + if (ql_is_fiber(qdev)) { + ql_petbi_init(qdev); + } else { + /* Copper port */ + ql_phy_init_ex(qdev); + } + + ql_sem_unlock(qdev, QL_PHY_GIO_SEM_MASK); + return 0; +} + +static int ql_finish_auto_neg(struct ql3_adapter *qdev) +{ + + if (ql_sem_spinlock(qdev, QL_PHY_GIO_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | (qdev->mac_index) * + 2) << 7)) + return -1; + + if (!ql_auto_neg_error(qdev)) { + if (test_bit(QL_LINK_MASTER, &qdev->flags)) { + /* configure the MAC */ + netif_printk(qdev, link, KERN_DEBUG, qdev->ndev, + "Configuring link\n"); + ql_mac_cfg_soft_reset(qdev, 1); + ql_mac_cfg_gig(qdev, + (ql_get_link_speed + (qdev) == + SPEED_1000)); + ql_mac_cfg_full_dup(qdev, + ql_is_link_full_dup + (qdev)); + ql_mac_cfg_pause(qdev, + ql_is_neg_pause + (qdev)); + ql_mac_cfg_soft_reset(qdev, 0); + + /* enable the MAC */ + netif_printk(qdev, link, KERN_DEBUG, qdev->ndev, + "Enabling mac\n"); + ql_mac_enable(qdev, 1); + } + + qdev->port_link_state = LS_UP; + netif_start_queue(qdev->ndev); + netif_carrier_on(qdev->ndev); + netif_info(qdev, link, qdev->ndev, + "Link is up at %d Mbps, %s duplex\n", + ql_get_link_speed(qdev), + ql_is_link_full_dup(qdev) ? "full" : "half"); + + } else { /* Remote error detected */ + + if (test_bit(QL_LINK_MASTER, &qdev->flags)) { + netif_printk(qdev, link, KERN_DEBUG, qdev->ndev, + "Remote error detected. Calling ql_port_start()\n"); + /* + * ql_port_start() is shared code and needs + * to lock the PHY on it's own. + */ + ql_sem_unlock(qdev, QL_PHY_GIO_SEM_MASK); + if (ql_port_start(qdev)) /* Restart port */ + return -1; + return 0; + } + } + ql_sem_unlock(qdev, QL_PHY_GIO_SEM_MASK); + return 0; +} + +static void ql_link_state_machine_work(struct work_struct *work) +{ + struct ql3_adapter *qdev = + container_of(work, struct ql3_adapter, link_state_work.work); + + u32 curr_link_state; + unsigned long hw_flags; + + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + + curr_link_state = ql_get_link_state(qdev); + + if (test_bit(QL_RESET_ACTIVE, &qdev->flags)) { + netif_info(qdev, link, qdev->ndev, + "Reset in progress, skip processing link state\n"); + + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + + /* Restart timer on 2 second interval. 
*/ + mod_timer(&qdev->adapter_timer, jiffies + HZ * 1); + + return; + } + + switch (qdev->port_link_state) { + default: + if (test_bit(QL_LINK_MASTER, &qdev->flags)) + ql_port_start(qdev); + qdev->port_link_state = LS_DOWN; + /* Fall Through */ + + case LS_DOWN: + if (curr_link_state == LS_UP) { + netif_info(qdev, link, qdev->ndev, "Link is up\n"); + if (ql_is_auto_neg_complete(qdev)) + ql_finish_auto_neg(qdev); + + if (qdev->port_link_state == LS_UP) + ql_link_down_detect_clear(qdev); + + qdev->port_link_state = LS_UP; + } + break; + + case LS_UP: + /* + * See if the link is currently down or went down and came + * back up + */ + if (curr_link_state == LS_DOWN) { + netif_info(qdev, link, qdev->ndev, "Link is down\n"); + qdev->port_link_state = LS_DOWN; + } + if (ql_link_down_detect(qdev)) + qdev->port_link_state = LS_DOWN; + break; + } + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + + /* Restart timer on 2 second interval. */ + mod_timer(&qdev->adapter_timer, jiffies + HZ * 1); +} + +/* + * Caller must take hw_lock and QL_PHY_GIO_SEM. + */ +static void ql_get_phy_owner(struct ql3_adapter *qdev) +{ + if (ql_this_adapter_controls_port(qdev)) + set_bit(QL_LINK_MASTER, &qdev->flags); + else + clear_bit(QL_LINK_MASTER, &qdev->flags); +} + +/* + * Caller must take hw_lock and QL_PHY_GIO_SEM. + */ +static void ql_init_scan_mode(struct ql3_adapter *qdev) +{ + ql_mii_enable_scan_mode(qdev); + + if (test_bit(QL_LINK_OPTICAL, &qdev->flags)) { + if (ql_this_adapter_controls_port(qdev)) + ql_petbi_init_ex(qdev); + } else { + if (ql_this_adapter_controls_port(qdev)) + ql_phy_init_ex(qdev); + } +} + +/* + * MII_Setup needs to be called before taking the PHY out of reset + * so that the management interface clock speed can be set properly. + * It would be better if we had a way to disable MDC until after the + * PHY is out of reset, but we don't have that capability. 
+ */ +static int ql_mii_setup(struct ql3_adapter *qdev) +{ + u32 reg; + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + if (ql_sem_spinlock(qdev, QL_PHY_GIO_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | (qdev->mac_index) * + 2) << 7)) + return -1; + + if (qdev->device_id == QL3032_DEVICE_ID) + ql_write_page0_reg(qdev, + &port_regs->macMIIMgmtControlReg, 0x0f00000); + + /* Divide 125MHz clock by 28 to meet PHY timing requirements */ + reg = MAC_MII_CONTROL_CLK_SEL_DIV28; + + ql_write_page0_reg(qdev, &port_regs->macMIIMgmtControlReg, + reg | ((MAC_MII_CONTROL_CLK_SEL_MASK) << 16)); + + ql_sem_unlock(qdev, QL_PHY_GIO_SEM_MASK); + return 0; +} + +#define SUPPORTED_OPTICAL_MODES (SUPPORTED_1000baseT_Full | \ + SUPPORTED_FIBRE | \ + SUPPORTED_Autoneg) +#define SUPPORTED_TP_MODES (SUPPORTED_10baseT_Half | \ + SUPPORTED_10baseT_Full | \ + SUPPORTED_100baseT_Half | \ + SUPPORTED_100baseT_Full | \ + SUPPORTED_1000baseT_Half | \ + SUPPORTED_1000baseT_Full | \ + SUPPORTED_Autoneg | \ + SUPPORTED_TP) \ + +static u32 ql_supported_modes(struct ql3_adapter *qdev) +{ + if (test_bit(QL_LINK_OPTICAL, &qdev->flags)) + return SUPPORTED_OPTICAL_MODES; + + return SUPPORTED_TP_MODES; +} + +static int ql_get_auto_cfg_status(struct ql3_adapter *qdev) +{ + int status; + unsigned long hw_flags; + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + if (ql_sem_spinlock(qdev, QL_PHY_GIO_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | + (qdev->mac_index) * 2) << 7)) { + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return 0; + } + status = ql_is_auto_cfg(qdev); + ql_sem_unlock(qdev, QL_PHY_GIO_SEM_MASK); + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return status; +} + +static u32 ql_get_speed(struct ql3_adapter *qdev) +{ + u32 status; + unsigned long hw_flags; + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + if (ql_sem_spinlock(qdev, QL_PHY_GIO_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | + (qdev->mac_index) * 2) << 7)) { + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return 0; + } + status = ql_get_link_speed(qdev); + ql_sem_unlock(qdev, QL_PHY_GIO_SEM_MASK); + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return status; +} + +static int ql_get_full_dup(struct ql3_adapter *qdev) +{ + int status; + unsigned long hw_flags; + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + if (ql_sem_spinlock(qdev, QL_PHY_GIO_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | + (qdev->mac_index) * 2) << 7)) { + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return 0; + } + status = ql_is_link_full_dup(qdev); + ql_sem_unlock(qdev, QL_PHY_GIO_SEM_MASK); + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return status; +} + +static int ql_get_link_ksettings(struct net_device *ndev, + struct ethtool_link_ksettings *cmd) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + u32 supported, advertising; + + supported = ql_supported_modes(qdev); + + if (test_bit(QL_LINK_OPTICAL, &qdev->flags)) { + cmd->base.port = PORT_FIBRE; + } else { + cmd->base.port = PORT_TP; + cmd->base.phy_address = qdev->PHYAddr; + } + advertising = ql_supported_modes(qdev); + cmd->base.autoneg = ql_get_auto_cfg_status(qdev); + cmd->base.speed = ql_get_speed(qdev); + cmd->base.duplex = ql_get_full_dup(qdev); + + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, + supported); + ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, + advertising); + + return 0; +} + +static void ql_get_drvinfo(struct net_device *ndev, + struct ethtool_drvinfo *drvinfo) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + 
strlcpy(drvinfo->driver, ql3xxx_driver_name, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, ql3xxx_driver_version, + sizeof(drvinfo->version)); + strlcpy(drvinfo->bus_info, pci_name(qdev->pdev), + sizeof(drvinfo->bus_info)); +} + +static u32 ql_get_msglevel(struct net_device *ndev) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + return qdev->msg_enable; +} + +static void ql_set_msglevel(struct net_device *ndev, u32 value) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + qdev->msg_enable = value; +} + +static void ql_get_pauseparam(struct net_device *ndev, + struct ethtool_pauseparam *pause) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + u32 reg; + if (qdev->mac_index == 0) + reg = ql_read_page0_reg(qdev, &port_regs->mac0ConfigReg); + else + reg = ql_read_page0_reg(qdev, &port_regs->mac1ConfigReg); + + pause->autoneg = ql_get_auto_cfg_status(qdev); + pause->rx_pause = (reg & MAC_CONFIG_REG_RF) >> 2; + pause->tx_pause = (reg & MAC_CONFIG_REG_TF) >> 1; +} + +static const struct ethtool_ops ql3xxx_ethtool_ops = { + .get_drvinfo = ql_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_msglevel = ql_get_msglevel, + .set_msglevel = ql_set_msglevel, + .get_pauseparam = ql_get_pauseparam, + .get_link_ksettings = ql_get_link_ksettings, +}; + +static int ql_populate_free_queue(struct ql3_adapter *qdev) +{ + struct ql_rcv_buf_cb *lrg_buf_cb = qdev->lrg_buf_free_head; + dma_addr_t map; + int err; + + while (lrg_buf_cb) { + if (!lrg_buf_cb->skb) { + lrg_buf_cb->skb = + netdev_alloc_skb(qdev->ndev, + qdev->lrg_buffer_len); + if (unlikely(!lrg_buf_cb->skb)) { + netdev_printk(KERN_DEBUG, qdev->ndev, + "Failed netdev_alloc_skb()\n"); + break; + } else { + /* + * We save some space to copy the ethhdr from + * first buffer + */ + skb_reserve(lrg_buf_cb->skb, QL_HEADER_SPACE); + map = pci_map_single(qdev->pdev, + lrg_buf_cb->skb->data, + qdev->lrg_buffer_len - + QL_HEADER_SPACE, + PCI_DMA_FROMDEVICE); + + err = pci_dma_mapping_error(qdev->pdev, map); + if (err) { + netdev_err(qdev->ndev, + "PCI mapping failed with error: %d\n", + err); + dev_kfree_skb(lrg_buf_cb->skb); + lrg_buf_cb->skb = NULL; + break; + } + + + lrg_buf_cb->buf_phy_addr_low = + cpu_to_le32(LS_64BITS(map)); + lrg_buf_cb->buf_phy_addr_high = + cpu_to_le32(MS_64BITS(map)); + dma_unmap_addr_set(lrg_buf_cb, mapaddr, map); + dma_unmap_len_set(lrg_buf_cb, maplen, + qdev->lrg_buffer_len - + QL_HEADER_SPACE); + --qdev->lrg_buf_skb_check; + if (!qdev->lrg_buf_skb_check) + return 1; + } + } + lrg_buf_cb = lrg_buf_cb->next; + } + return 0; +} + +/* + * Caller holds hw_lock. + */ +static void ql_update_small_bufq_prod_index(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + if (qdev->small_buf_release_cnt >= 16) { + while (qdev->small_buf_release_cnt >= 16) { + qdev->small_buf_q_producer_index++; + + if (qdev->small_buf_q_producer_index == + NUM_SBUFQ_ENTRIES) + qdev->small_buf_q_producer_index = 0; + qdev->small_buf_release_cnt -= 8; + } + wmb(); + writel_relaxed(qdev->small_buf_q_producer_index, + &port_regs->CommonRegs.rxSmallQProducerIndex); + mmiowb(); + } +} + +/* + * Caller holds hw_lock. 
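+ * Large receive buffers are returned to the chip eight bufq entries at a + * time; the producer index advances once per group and is only published + * to rxLargeQProducerIndex after the address elements are written (note + * the wmb()).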
+ */ +static void ql_update_lrg_bufq_prod_index(struct ql3_adapter *qdev) +{ + struct bufq_addr_element *lrg_buf_q_ele; + int i; + struct ql_rcv_buf_cb *lrg_buf_cb; + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + + if ((qdev->lrg_buf_free_count >= 8) && + (qdev->lrg_buf_release_cnt >= 16)) { + + if (qdev->lrg_buf_skb_check) + if (!ql_populate_free_queue(qdev)) + return; + + lrg_buf_q_ele = qdev->lrg_buf_next_free; + + while ((qdev->lrg_buf_release_cnt >= 16) && + (qdev->lrg_buf_free_count >= 8)) { + + for (i = 0; i < 8; i++) { + lrg_buf_cb = + ql_get_from_lrg_buf_free_list(qdev); + lrg_buf_q_ele->addr_high = + lrg_buf_cb->buf_phy_addr_high; + lrg_buf_q_ele->addr_low = + lrg_buf_cb->buf_phy_addr_low; + lrg_buf_q_ele++; + + qdev->lrg_buf_release_cnt--; + } + + qdev->lrg_buf_q_producer_index++; + + if (qdev->lrg_buf_q_producer_index == + qdev->num_lbufq_entries) + qdev->lrg_buf_q_producer_index = 0; + + if (qdev->lrg_buf_q_producer_index == + (qdev->num_lbufq_entries - 1)) { + lrg_buf_q_ele = qdev->lrg_buf_q_virt_addr; + } + } + wmb(); + qdev->lrg_buf_next_free = lrg_buf_q_ele; + writel(qdev->lrg_buf_q_producer_index, + &port_regs->CommonRegs.rxLargeQProducerIndex); + } +} + +static void ql_process_mac_tx_intr(struct ql3_adapter *qdev, + struct ob_mac_iocb_rsp *mac_rsp) +{ + struct ql_tx_buf_cb *tx_cb; + int i; + + if (mac_rsp->flags & OB_MAC_IOCB_RSP_S) { + netdev_warn(qdev->ndev, + "Frame too short but it was padded and sent\n"); + } + + tx_cb = &qdev->tx_buf[mac_rsp->transaction_id]; + + /* Check the transmit response flags for any errors */ + if (mac_rsp->flags & OB_MAC_IOCB_RSP_S) { + netdev_err(qdev->ndev, + "Frame too short to be legal, frame not sent\n"); + + qdev->ndev->stats.tx_errors++; + goto frame_not_sent; + } + + if (tx_cb->seg_count == 0) { + netdev_err(qdev->ndev, "tx_cb->seg_count == 0: %d\n", + mac_rsp->transaction_id); + + qdev->ndev->stats.tx_errors++; + goto invalid_seg_count; + } + + pci_unmap_single(qdev->pdev, + dma_unmap_addr(&tx_cb->map[0], mapaddr), + dma_unmap_len(&tx_cb->map[0], maplen), + PCI_DMA_TODEVICE); + tx_cb->seg_count--; + if (tx_cb->seg_count) { + for (i = 1; i < tx_cb->seg_count; i++) { + pci_unmap_page(qdev->pdev, + dma_unmap_addr(&tx_cb->map[i], + mapaddr), + dma_unmap_len(&tx_cb->map[i], maplen), + PCI_DMA_TODEVICE); + } + } + qdev->ndev->stats.tx_packets++; + qdev->ndev->stats.tx_bytes += tx_cb->skb->len; + +frame_not_sent: + dev_kfree_skb_irq(tx_cb->skb); + tx_cb->skb = NULL; + +invalid_seg_count: + atomic_inc(&qdev->tx_count); +} + +static void ql_get_sbuf(struct ql3_adapter *qdev) +{ + if (++qdev->small_buf_index == NUM_SMALL_BUFFERS) + qdev->small_buf_index = 0; + qdev->small_buf_release_cnt++; +} + +static struct ql_rcv_buf_cb *ql_get_lbuf(struct ql3_adapter *qdev) +{ + struct ql_rcv_buf_cb *lrg_buf_cb = NULL; + lrg_buf_cb = &qdev->lrg_buf[qdev->lrg_buf_index]; + qdev->lrg_buf_release_cnt++; + if (++qdev->lrg_buf_index == qdev->num_large_buffers) + qdev->lrg_buf_index = 0; + return lrg_buf_cb; +} + +/* + * The difference between 3022 and 3032 for inbound completions: + * 3022 uses two buffers per completion. The first buffer contains + * (some) header info, the second the remainder of the headers plus + * the data. For this chip we reserve some space at the top of the + * receive buffer so that the header info in buffer one can be + * prepended to the buffer two. Buffer two is the sent up while + * buffer one is returned to the hardware to be reused. 
+ * 3032 receives all of it's data and headers in one buffer for a + * simpler process. 3032 also supports checksum verification as + * can be seen in ql_process_macip_rx_intr(). + */ +static void ql_process_mac_rx_intr(struct ql3_adapter *qdev, + struct ib_mac_iocb_rsp *ib_mac_rsp_ptr) +{ + struct ql_rcv_buf_cb *lrg_buf_cb1 = NULL; + struct ql_rcv_buf_cb *lrg_buf_cb2 = NULL; + struct sk_buff *skb; + u16 length = le16_to_cpu(ib_mac_rsp_ptr->length); + + /* + * Get the inbound address list (small buffer). + */ + ql_get_sbuf(qdev); + + if (qdev->device_id == QL3022_DEVICE_ID) + lrg_buf_cb1 = ql_get_lbuf(qdev); + + /* start of second buffer */ + lrg_buf_cb2 = ql_get_lbuf(qdev); + skb = lrg_buf_cb2->skb; + + qdev->ndev->stats.rx_packets++; + qdev->ndev->stats.rx_bytes += length; + + skb_put(skb, length); + pci_unmap_single(qdev->pdev, + dma_unmap_addr(lrg_buf_cb2, mapaddr), + dma_unmap_len(lrg_buf_cb2, maplen), + PCI_DMA_FROMDEVICE); + prefetch(skb->data); + skb_checksum_none_assert(skb); + skb->protocol = eth_type_trans(skb, qdev->ndev); + + napi_gro_receive(&qdev->napi, skb); + lrg_buf_cb2->skb = NULL; + + if (qdev->device_id == QL3022_DEVICE_ID) + ql_release_to_lrg_buf_free_list(qdev, lrg_buf_cb1); + ql_release_to_lrg_buf_free_list(qdev, lrg_buf_cb2); +} + +static void ql_process_macip_rx_intr(struct ql3_adapter *qdev, + struct ib_ip_iocb_rsp *ib_ip_rsp_ptr) +{ + struct ql_rcv_buf_cb *lrg_buf_cb1 = NULL; + struct ql_rcv_buf_cb *lrg_buf_cb2 = NULL; + struct sk_buff *skb1 = NULL, *skb2; + struct net_device *ndev = qdev->ndev; + u16 length = le16_to_cpu(ib_ip_rsp_ptr->length); + u16 size = 0; + + /* + * Get the inbound address list (small buffer). + */ + + ql_get_sbuf(qdev); + + if (qdev->device_id == QL3022_DEVICE_ID) { + /* start of first buffer on 3022 */ + lrg_buf_cb1 = ql_get_lbuf(qdev); + skb1 = lrg_buf_cb1->skb; + size = ETH_HLEN; + if (*((u16 *) skb1->data) != 0xFFFF) + size += VLAN_ETH_HLEN - ETH_HLEN; + } + + /* start of second buffer */ + lrg_buf_cb2 = ql_get_lbuf(qdev); + skb2 = lrg_buf_cb2->skb; + + skb_put(skb2, length); /* Just the second buffer length here. */ + pci_unmap_single(qdev->pdev, + dma_unmap_addr(lrg_buf_cb2, mapaddr), + dma_unmap_len(lrg_buf_cb2, maplen), + PCI_DMA_FROMDEVICE); + prefetch(skb2->data); + + skb_checksum_none_assert(skb2); + if (qdev->device_id == QL3022_DEVICE_ID) { + /* + * Copy the ethhdr from first buffer to second. This + * is necessary for 3022 IP completions. + */ + skb_copy_from_linear_data_offset(skb1, VLAN_ID_LEN, + skb_push(skb2, size), size); + } else { + u16 checksum = le16_to_cpu(ib_ip_rsp_ptr->checksum); + if (checksum & + (IB_IP_IOCB_RSP_3032_ICE | + IB_IP_IOCB_RSP_3032_CE)) { + netdev_err(ndev, + "%s: Bad checksum for this %s packet, checksum = %x\n", + __func__, + ((checksum & IB_IP_IOCB_RSP_3032_TCP) ? 
+ "TCP" : "UDP"), checksum); + } else if ((checksum & IB_IP_IOCB_RSP_3032_TCP) || + (checksum & IB_IP_IOCB_RSP_3032_UDP && + !(checksum & IB_IP_IOCB_RSP_3032_NUC))) { + skb2->ip_summed = CHECKSUM_UNNECESSARY; + } + } + skb2->protocol = eth_type_trans(skb2, qdev->ndev); + + napi_gro_receive(&qdev->napi, skb2); + ndev->stats.rx_packets++; + ndev->stats.rx_bytes += length; + lrg_buf_cb2->skb = NULL; + + if (qdev->device_id == QL3022_DEVICE_ID) + ql_release_to_lrg_buf_free_list(qdev, lrg_buf_cb1); + ql_release_to_lrg_buf_free_list(qdev, lrg_buf_cb2); +} + +static int ql_tx_rx_clean(struct ql3_adapter *qdev, int budget) +{ + struct net_rsp_iocb *net_rsp; + struct net_device *ndev = qdev->ndev; + int work_done = 0; + + /* While there are entries in the completion queue. */ + while ((le32_to_cpu(*(qdev->prsp_producer_index)) != + qdev->rsp_consumer_index) && (work_done < budget)) { + + net_rsp = qdev->rsp_current; + rmb(); + /* + * Fix 4032 chip's undocumented "feature" where bit-8 is set + * if the inbound completion is for a VLAN. + */ + if (qdev->device_id == QL3032_DEVICE_ID) + net_rsp->opcode &= 0x7f; + switch (net_rsp->opcode) { + + case OPCODE_OB_MAC_IOCB_FN0: + case OPCODE_OB_MAC_IOCB_FN2: + ql_process_mac_tx_intr(qdev, (struct ob_mac_iocb_rsp *) + net_rsp); + break; + + case OPCODE_IB_MAC_IOCB: + case OPCODE_IB_3032_MAC_IOCB: + ql_process_mac_rx_intr(qdev, (struct ib_mac_iocb_rsp *) + net_rsp); + work_done++; + break; + + case OPCODE_IB_IP_IOCB: + case OPCODE_IB_3032_IP_IOCB: + ql_process_macip_rx_intr(qdev, (struct ib_ip_iocb_rsp *) + net_rsp); + work_done++; + break; + default: { + u32 *tmp = (u32 *)net_rsp; + netdev_err(ndev, + "Hit default case, not handled!\n" + " dropping the packet, opcode = %x\n" + "0x%08lx 0x%08lx 0x%08lx 0x%08lx\n", + net_rsp->opcode, + (unsigned long int)tmp[0], + (unsigned long int)tmp[1], + (unsigned long int)tmp[2], + (unsigned long int)tmp[3]); + } + } + + qdev->rsp_consumer_index++; + + if (qdev->rsp_consumer_index == NUM_RSP_Q_ENTRIES) { + qdev->rsp_consumer_index = 0; + qdev->rsp_current = qdev->rsp_q_virt_addr; + } else { + qdev->rsp_current++; + } + + } + + return work_done; +} + +static int ql_poll(struct napi_struct *napi, int budget) +{ + struct ql3_adapter *qdev = container_of(napi, struct ql3_adapter, napi); + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + int work_done; + + work_done = ql_tx_rx_clean(qdev, budget); + + if (work_done < budget && napi_complete_done(napi, work_done)) { + unsigned long flags; + + spin_lock_irqsave(&qdev->hw_lock, flags); + ql_update_small_bufq_prod_index(qdev); + ql_update_lrg_bufq_prod_index(qdev); + writel(qdev->rsp_consumer_index, + &port_regs->CommonRegs.rspQConsumerIndex); + spin_unlock_irqrestore(&qdev->hw_lock, flags); + + ql_enable_interrupts(qdev); + } + return work_done; +} + +static irqreturn_t ql3xxx_isr(int irq, void *dev_id) +{ + + struct net_device *ndev = dev_id; + struct ql3_adapter *qdev = netdev_priv(ndev); + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 value; + int handled = 1; + u32 var; + + value = ql_read_common_reg_l(qdev, + &port_regs->CommonRegs.ispControlStatus); + + if (value & (ISP_CONTROL_FE | ISP_CONTROL_RI)) { + spin_lock(&qdev->adapter_lock); + netif_stop_queue(qdev->ndev); + netif_carrier_off(qdev->ndev); + ql_disable_interrupts(qdev); + qdev->port_link_state = LS_DOWN; + set_bit(QL_RESET_ACTIVE, &qdev->flags) ; + + if (value & ISP_CONTROL_FE) { + /* + * Chip Fatal Error. 
+ */ + var = + ql_read_page0_reg_l(qdev, + &port_regs->PortFatalErrStatus); + netdev_warn(ndev, + "Resetting chip. PortFatalErrStatus register = 0x%x\n", + var); + set_bit(QL_RESET_START, &qdev->flags) ; + } else { + /* + * Soft Reset Requested. + */ + set_bit(QL_RESET_PER_SCSI, &qdev->flags) ; + netdev_err(ndev, + "Another function issued a reset to the chip. ISR value = %x\n", + value); + } + queue_delayed_work(qdev->workqueue, &qdev->reset_work, 0); + spin_unlock(&qdev->adapter_lock); + } else if (value & ISP_IMR_DISABLE_CMPL_INT) { + ql_disable_interrupts(qdev); + if (likely(napi_schedule_prep(&qdev->napi))) + __napi_schedule(&qdev->napi); + } else + return IRQ_NONE; + + return IRQ_RETVAL(handled); +} + +/* + * Get the total number of segments needed for the given number of fragments. + * This is necessary because outbound address lists (OAL) will be used when + * more than two frags are given. Each address list has 5 addr/len pairs. + * The 5th pair in each OAL is used to point to the next OAL if more frags + * are coming. That is why the frags:segment count ratio is not linear. + */ +static int ql_get_seg_count(struct ql3_adapter *qdev, unsigned short frags) +{ + if (qdev->device_id == QL3022_DEVICE_ID) + return 1; + + if (frags <= 2) + return frags + 1; + else if (frags <= 6) + return frags + 2; + else if (frags <= 10) + return frags + 3; + else if (frags <= 14) + return frags + 4; + else if (frags <= 18) + return frags + 5; + return -1; +} + +static void ql_hw_csum_setup(const struct sk_buff *skb, + struct ob_mac_iocb_req *mac_iocb_ptr) +{ + const struct iphdr *ip = ip_hdr(skb); + + mac_iocb_ptr->ip_hdr_off = skb_network_offset(skb); + mac_iocb_ptr->ip_hdr_len = ip->ihl; + + if (ip->protocol == IPPROTO_TCP) { + mac_iocb_ptr->flags1 |= OB_3032MAC_IOCB_REQ_TC | + OB_3032MAC_IOCB_REQ_IC; + } else { + mac_iocb_ptr->flags1 |= OB_3032MAC_IOCB_REQ_UC | + OB_3032MAC_IOCB_REQ_IC; + } + +} + +/* + * Map the buffers for this transmit. + * This will return NETDEV_TX_BUSY or NETDEV_TX_OK based on success. + */ +static int ql_send_map(struct ql3_adapter *qdev, + struct ob_mac_iocb_req *mac_iocb_ptr, + struct ql_tx_buf_cb *tx_cb, + struct sk_buff *skb) +{ + struct oal *oal; + struct oal_entry *oal_entry; + int len = skb_headlen(skb); + dma_addr_t map; + int err; + int completed_segs, i; + int seg_cnt, seg = 0; + int frag_cnt = (int)skb_shinfo(skb)->nr_frags; + + seg_cnt = tx_cb->seg_count; + /* + * Map the skb buffer first. + */ + map = pci_map_single(qdev->pdev, skb->data, len, PCI_DMA_TODEVICE); + + err = pci_dma_mapping_error(qdev->pdev, map); + if (err) { + netdev_err(qdev->ndev, "PCI mapping failed with error: %d\n", + err); + + return NETDEV_TX_BUSY; + } + + oal_entry = (struct oal_entry *)&mac_iocb_ptr->buf_addr0_low; + oal_entry->dma_lo = cpu_to_le32(LS_64BITS(map)); + oal_entry->dma_hi = cpu_to_le32(MS_64BITS(map)); + oal_entry->len = cpu_to_le32(len); + dma_unmap_addr_set(&tx_cb->map[seg], mapaddr, map); + dma_unmap_len_set(&tx_cb->map[seg], maplen, len); + seg++; + + if (seg_cnt == 1) { + /* Terminate the last segment. */ + oal_entry->len |= cpu_to_le32(OAL_LAST_ENTRY); + return NETDEV_TX_OK; + } + oal = tx_cb->oal; + for (completed_segs = 0; + completed_segs < frag_cnt; + completed_segs++, seg++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[completed_segs]; + oal_entry++; + /* + * Check for continuation requirements. + * It's strange but necessary. + * Continuation entry points to outbound address list. 
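+ * The third addr/len pair of the IOCB and the fifth pair of each
+ * OAL are consumed as pointers to the next OAL, which is why the
+ * chain moves to a new OAL at segments 2, 7, 12 and 17.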
+ */ + if ((seg == 2 && seg_cnt > 3) || + (seg == 7 && seg_cnt > 8) || + (seg == 12 && seg_cnt > 13) || + (seg == 17 && seg_cnt > 18)) { + map = pci_map_single(qdev->pdev, oal, + sizeof(struct oal), + PCI_DMA_TODEVICE); + + err = pci_dma_mapping_error(qdev->pdev, map); + if (err) { + netdev_err(qdev->ndev, + "PCI mapping outbound address list with error: %d\n", + err); + goto map_error; + } + + oal_entry->dma_lo = cpu_to_le32(LS_64BITS(map)); + oal_entry->dma_hi = cpu_to_le32(MS_64BITS(map)); + oal_entry->len = cpu_to_le32(sizeof(struct oal) | + OAL_CONT_ENTRY); + dma_unmap_addr_set(&tx_cb->map[seg], mapaddr, map); + dma_unmap_len_set(&tx_cb->map[seg], maplen, + sizeof(struct oal)); + oal_entry = (struct oal_entry *)oal; + oal++; + seg++; + } + + map = skb_frag_dma_map(&qdev->pdev->dev, frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + + err = dma_mapping_error(&qdev->pdev->dev, map); + if (err) { + netdev_err(qdev->ndev, + "PCI mapping frags failed with error: %d\n", + err); + goto map_error; + } + + oal_entry->dma_lo = cpu_to_le32(LS_64BITS(map)); + oal_entry->dma_hi = cpu_to_le32(MS_64BITS(map)); + oal_entry->len = cpu_to_le32(skb_frag_size(frag)); + dma_unmap_addr_set(&tx_cb->map[seg], mapaddr, map); + dma_unmap_len_set(&tx_cb->map[seg], maplen, skb_frag_size(frag)); + } + /* Terminate the last segment. */ + oal_entry->len |= cpu_to_le32(OAL_LAST_ENTRY); + return NETDEV_TX_OK; + +map_error: + /* A PCI mapping failed and now we will need to back out + * We need to traverse through the oal's and associated pages which + * have been mapped and now we must unmap them to clean up properly + */ + + seg = 1; + oal_entry = (struct oal_entry *)&mac_iocb_ptr->buf_addr0_low; + oal = tx_cb->oal; + for (i = 0; i < completed_segs; i++, seg++) { + oal_entry++; + + /* + * Check for continuation requirements. + * It's strange but necessary. + */ + + if ((seg == 2 && seg_cnt > 3) || + (seg == 7 && seg_cnt > 8) || + (seg == 12 && seg_cnt > 13) || + (seg == 17 && seg_cnt > 18)) { + pci_unmap_single(qdev->pdev, + dma_unmap_addr(&tx_cb->map[seg], mapaddr), + dma_unmap_len(&tx_cb->map[seg], maplen), + PCI_DMA_TODEVICE); + oal++; + seg++; + } + + pci_unmap_page(qdev->pdev, + dma_unmap_addr(&tx_cb->map[seg], mapaddr), + dma_unmap_len(&tx_cb->map[seg], maplen), + PCI_DMA_TODEVICE); + } + + pci_unmap_single(qdev->pdev, + dma_unmap_addr(&tx_cb->map[0], mapaddr), + dma_unmap_addr(&tx_cb->map[0], maplen), + PCI_DMA_TODEVICE); + + return NETDEV_TX_BUSY; + +} + +/* + * The difference between 3022 and 3032 sends: + * 3022 only supports a simple single segment transmission. + * 3032 supports checksumming and scatter/gather lists (fragments). + * The 3032 supports sglists by using the 3 addr/len pairs (ALP) + * in the IOCB plus a chain of outbound address lists (OAL) that + * each contain 5 ALPs. The last ALP of the IOCB (3rd) or OAL (5th) + * will be used to point to an OAL when more ALP entries are required. + * The IOCB is always the top of the chain followed by one or more + * OALs (when necessary). 
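+ * ql3xxx_send() below builds the IOCB, has ql_send_map() map the
+ * linear data and any fragments (chaining OALs as needed) and then
+ * advances the request queue producer index to hand the frame to
+ * the chip.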
+ */ +static netdev_tx_t ql3xxx_send(struct sk_buff *skb, + struct net_device *ndev) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + struct ql_tx_buf_cb *tx_cb; + u32 tot_len = skb->len; + struct ob_mac_iocb_req *mac_iocb_ptr; + + if (unlikely(atomic_read(&qdev->tx_count) < 2)) + return NETDEV_TX_BUSY; + + tx_cb = &qdev->tx_buf[qdev->req_producer_index]; + tx_cb->seg_count = ql_get_seg_count(qdev, + skb_shinfo(skb)->nr_frags); + if (tx_cb->seg_count == -1) { + netdev_err(ndev, "%s: invalid segment count!\n", __func__); + return NETDEV_TX_OK; + } + + mac_iocb_ptr = tx_cb->queue_entry; + memset((void *)mac_iocb_ptr, 0, sizeof(struct ob_mac_iocb_req)); + mac_iocb_ptr->opcode = qdev->mac_ob_opcode; + mac_iocb_ptr->flags = OB_MAC_IOCB_REQ_X; + mac_iocb_ptr->flags |= qdev->mb_bit_mask; + mac_iocb_ptr->transaction_id = qdev->req_producer_index; + mac_iocb_ptr->data_len = cpu_to_le16((u16) tot_len); + tx_cb->skb = skb; + if (qdev->device_id == QL3032_DEVICE_ID && + skb->ip_summed == CHECKSUM_PARTIAL) + ql_hw_csum_setup(skb, mac_iocb_ptr); + + if (ql_send_map(qdev, mac_iocb_ptr, tx_cb, skb) != NETDEV_TX_OK) { + netdev_err(ndev, "%s: Could not map the segments!\n", __func__); + return NETDEV_TX_BUSY; + } + + wmb(); + qdev->req_producer_index++; + if (qdev->req_producer_index == NUM_REQ_Q_ENTRIES) + qdev->req_producer_index = 0; + wmb(); + ql_write_common_reg_l(qdev, + &port_regs->CommonRegs.reqQProducerIndex, + qdev->req_producer_index); + + netif_printk(qdev, tx_queued, KERN_DEBUG, ndev, + "tx queued, slot %d, len %d\n", + qdev->req_producer_index, skb->len); + + atomic_dec(&qdev->tx_count); + return NETDEV_TX_OK; +} + +static int ql_alloc_net_req_rsp_queues(struct ql3_adapter *qdev) +{ + qdev->req_q_size = + (u32) (NUM_REQ_Q_ENTRIES * sizeof(struct ob_mac_iocb_req)); + + qdev->rsp_q_size = NUM_RSP_Q_ENTRIES * sizeof(struct net_rsp_iocb); + + /* The barrier is required to ensure request and response queue + * addr writes to the registers. 
+ */ + wmb(); + + qdev->req_q_virt_addr = + pci_alloc_consistent(qdev->pdev, + (size_t) qdev->req_q_size, + &qdev->req_q_phy_addr); + + if ((qdev->req_q_virt_addr == NULL) || + LS_64BITS(qdev->req_q_phy_addr) & (qdev->req_q_size - 1)) { + netdev_err(qdev->ndev, "reqQ failed\n"); + return -ENOMEM; + } + + qdev->rsp_q_virt_addr = + pci_alloc_consistent(qdev->pdev, + (size_t) qdev->rsp_q_size, + &qdev->rsp_q_phy_addr); + + if ((qdev->rsp_q_virt_addr == NULL) || + LS_64BITS(qdev->rsp_q_phy_addr) & (qdev->rsp_q_size - 1)) { + netdev_err(qdev->ndev, "rspQ allocation failed\n"); + pci_free_consistent(qdev->pdev, (size_t) qdev->req_q_size, + qdev->req_q_virt_addr, + qdev->req_q_phy_addr); + return -ENOMEM; + } + + set_bit(QL_ALLOC_REQ_RSP_Q_DONE, &qdev->flags); + + return 0; +} + +static void ql_free_net_req_rsp_queues(struct ql3_adapter *qdev) +{ + if (!test_bit(QL_ALLOC_REQ_RSP_Q_DONE, &qdev->flags)) { + netdev_info(qdev->ndev, "Already done\n"); + return; + } + + pci_free_consistent(qdev->pdev, + qdev->req_q_size, + qdev->req_q_virt_addr, qdev->req_q_phy_addr); + + qdev->req_q_virt_addr = NULL; + + pci_free_consistent(qdev->pdev, + qdev->rsp_q_size, + qdev->rsp_q_virt_addr, qdev->rsp_q_phy_addr); + + qdev->rsp_q_virt_addr = NULL; + + clear_bit(QL_ALLOC_REQ_RSP_Q_DONE, &qdev->flags); +} + +static int ql_alloc_buffer_queues(struct ql3_adapter *qdev) +{ + /* Create Large Buffer Queue */ + qdev->lrg_buf_q_size = + qdev->num_lbufq_entries * sizeof(struct lrg_buf_q_entry); + if (qdev->lrg_buf_q_size < PAGE_SIZE) + qdev->lrg_buf_q_alloc_size = PAGE_SIZE; + else + qdev->lrg_buf_q_alloc_size = qdev->lrg_buf_q_size * 2; + + qdev->lrg_buf = kmalloc_array(qdev->num_large_buffers, + sizeof(struct ql_rcv_buf_cb), + GFP_KERNEL); + if (qdev->lrg_buf == NULL) + return -ENOMEM; + + qdev->lrg_buf_q_alloc_virt_addr = + pci_alloc_consistent(qdev->pdev, + qdev->lrg_buf_q_alloc_size, + &qdev->lrg_buf_q_alloc_phy_addr); + + if (qdev->lrg_buf_q_alloc_virt_addr == NULL) { + netdev_err(qdev->ndev, "lBufQ failed\n"); + return -ENOMEM; + } + qdev->lrg_buf_q_virt_addr = qdev->lrg_buf_q_alloc_virt_addr; + qdev->lrg_buf_q_phy_addr = qdev->lrg_buf_q_alloc_phy_addr; + + /* Create Small Buffer Queue */ + qdev->small_buf_q_size = + NUM_SBUFQ_ENTRIES * sizeof(struct lrg_buf_q_entry); + if (qdev->small_buf_q_size < PAGE_SIZE) + qdev->small_buf_q_alloc_size = PAGE_SIZE; + else + qdev->small_buf_q_alloc_size = qdev->small_buf_q_size * 2; + + qdev->small_buf_q_alloc_virt_addr = + pci_alloc_consistent(qdev->pdev, + qdev->small_buf_q_alloc_size, + &qdev->small_buf_q_alloc_phy_addr); + + if (qdev->small_buf_q_alloc_virt_addr == NULL) { + netdev_err(qdev->ndev, "Small Buffer Queue allocation failed\n"); + pci_free_consistent(qdev->pdev, qdev->lrg_buf_q_alloc_size, + qdev->lrg_buf_q_alloc_virt_addr, + qdev->lrg_buf_q_alloc_phy_addr); + return -ENOMEM; + } + + qdev->small_buf_q_virt_addr = qdev->small_buf_q_alloc_virt_addr; + qdev->small_buf_q_phy_addr = qdev->small_buf_q_alloc_phy_addr; + set_bit(QL_ALLOC_BUFQS_DONE, &qdev->flags); + return 0; +} + +static void ql_free_buffer_queues(struct ql3_adapter *qdev) +{ + if (!test_bit(QL_ALLOC_BUFQS_DONE, &qdev->flags)) { + netdev_info(qdev->ndev, "Already done\n"); + return; + } + kfree(qdev->lrg_buf); + pci_free_consistent(qdev->pdev, + qdev->lrg_buf_q_alloc_size, + qdev->lrg_buf_q_alloc_virt_addr, + qdev->lrg_buf_q_alloc_phy_addr); + + qdev->lrg_buf_q_virt_addr = NULL; + + pci_free_consistent(qdev->pdev, + qdev->small_buf_q_alloc_size, + qdev->small_buf_q_alloc_virt_addr, + 
qdev->small_buf_q_alloc_phy_addr); + + qdev->small_buf_q_virt_addr = NULL; + + clear_bit(QL_ALLOC_BUFQS_DONE, &qdev->flags); +} + +static int ql_alloc_small_buffers(struct ql3_adapter *qdev) +{ + int i; + struct bufq_addr_element *small_buf_q_entry; + + /* Currently we allocate on one of memory and use it for smallbuffers */ + qdev->small_buf_total_size = + (QL_ADDR_ELE_PER_BUFQ_ENTRY * NUM_SBUFQ_ENTRIES * + QL_SMALL_BUFFER_SIZE); + + qdev->small_buf_virt_addr = + pci_alloc_consistent(qdev->pdev, + qdev->small_buf_total_size, + &qdev->small_buf_phy_addr); + + if (qdev->small_buf_virt_addr == NULL) { + netdev_err(qdev->ndev, "Failed to get small buffer memory\n"); + return -ENOMEM; + } + + qdev->small_buf_phy_addr_low = LS_64BITS(qdev->small_buf_phy_addr); + qdev->small_buf_phy_addr_high = MS_64BITS(qdev->small_buf_phy_addr); + + small_buf_q_entry = qdev->small_buf_q_virt_addr; + + /* Initialize the small buffer queue. */ + for (i = 0; i < (QL_ADDR_ELE_PER_BUFQ_ENTRY * NUM_SBUFQ_ENTRIES); i++) { + small_buf_q_entry->addr_high = + cpu_to_le32(qdev->small_buf_phy_addr_high); + small_buf_q_entry->addr_low = + cpu_to_le32(qdev->small_buf_phy_addr_low + + (i * QL_SMALL_BUFFER_SIZE)); + small_buf_q_entry++; + } + qdev->small_buf_index = 0; + set_bit(QL_ALLOC_SMALL_BUF_DONE, &qdev->flags); + return 0; +} + +static void ql_free_small_buffers(struct ql3_adapter *qdev) +{ + if (!test_bit(QL_ALLOC_SMALL_BUF_DONE, &qdev->flags)) { + netdev_info(qdev->ndev, "Already done\n"); + return; + } + if (qdev->small_buf_virt_addr != NULL) { + pci_free_consistent(qdev->pdev, + qdev->small_buf_total_size, + qdev->small_buf_virt_addr, + qdev->small_buf_phy_addr); + + qdev->small_buf_virt_addr = NULL; + } +} + +static void ql_free_large_buffers(struct ql3_adapter *qdev) +{ + int i = 0; + struct ql_rcv_buf_cb *lrg_buf_cb; + + for (i = 0; i < qdev->num_large_buffers; i++) { + lrg_buf_cb = &qdev->lrg_buf[i]; + if (lrg_buf_cb->skb) { + dev_kfree_skb(lrg_buf_cb->skb); + pci_unmap_single(qdev->pdev, + dma_unmap_addr(lrg_buf_cb, mapaddr), + dma_unmap_len(lrg_buf_cb, maplen), + PCI_DMA_FROMDEVICE); + memset(lrg_buf_cb, 0, sizeof(struct ql_rcv_buf_cb)); + } else { + break; + } + } +} + +static void ql_init_large_buffers(struct ql3_adapter *qdev) +{ + int i; + struct ql_rcv_buf_cb *lrg_buf_cb; + struct bufq_addr_element *buf_addr_ele = qdev->lrg_buf_q_virt_addr; + + for (i = 0; i < qdev->num_large_buffers; i++) { + lrg_buf_cb = &qdev->lrg_buf[i]; + buf_addr_ele->addr_high = lrg_buf_cb->buf_phy_addr_high; + buf_addr_ele->addr_low = lrg_buf_cb->buf_phy_addr_low; + buf_addr_ele++; + } + qdev->lrg_buf_index = 0; + qdev->lrg_buf_skb_check = 0; +} + +static int ql_alloc_large_buffers(struct ql3_adapter *qdev) +{ + int i; + struct ql_rcv_buf_cb *lrg_buf_cb; + struct sk_buff *skb; + dma_addr_t map; + int err; + + for (i = 0; i < qdev->num_large_buffers; i++) { + skb = netdev_alloc_skb(qdev->ndev, + qdev->lrg_buffer_len); + if (unlikely(!skb)) { + /* Better luck next round */ + netdev_err(qdev->ndev, + "large buff alloc failed for %d bytes at index %d\n", + qdev->lrg_buffer_len * 2, i); + ql_free_large_buffers(qdev); + return -ENOMEM; + } else { + + lrg_buf_cb = &qdev->lrg_buf[i]; + memset(lrg_buf_cb, 0, sizeof(struct ql_rcv_buf_cb)); + lrg_buf_cb->index = i; + lrg_buf_cb->skb = skb; + /* + * We save some space to copy the ethhdr from first + * buffer + */ + skb_reserve(skb, QL_HEADER_SPACE); + map = pci_map_single(qdev->pdev, + skb->data, + qdev->lrg_buffer_len - + QL_HEADER_SPACE, + PCI_DMA_FROMDEVICE); + + err = 
pci_dma_mapping_error(qdev->pdev, map); + if (err) { + netdev_err(qdev->ndev, + "PCI mapping failed with error: %d\n", + err); + ql_free_large_buffers(qdev); + return -ENOMEM; + } + + dma_unmap_addr_set(lrg_buf_cb, mapaddr, map); + dma_unmap_len_set(lrg_buf_cb, maplen, + qdev->lrg_buffer_len - + QL_HEADER_SPACE); + lrg_buf_cb->buf_phy_addr_low = + cpu_to_le32(LS_64BITS(map)); + lrg_buf_cb->buf_phy_addr_high = + cpu_to_le32(MS_64BITS(map)); + } + } + return 0; +} + +static void ql_free_send_free_list(struct ql3_adapter *qdev) +{ + struct ql_tx_buf_cb *tx_cb; + int i; + + tx_cb = &qdev->tx_buf[0]; + for (i = 0; i < NUM_REQ_Q_ENTRIES; i++) { + kfree(tx_cb->oal); + tx_cb->oal = NULL; + tx_cb++; + } +} + +static int ql_create_send_free_list(struct ql3_adapter *qdev) +{ + struct ql_tx_buf_cb *tx_cb; + int i; + struct ob_mac_iocb_req *req_q_curr = qdev->req_q_virt_addr; + + /* Create free list of transmit buffers */ + for (i = 0; i < NUM_REQ_Q_ENTRIES; i++) { + + tx_cb = &qdev->tx_buf[i]; + tx_cb->skb = NULL; + tx_cb->queue_entry = req_q_curr; + req_q_curr++; + tx_cb->oal = kmalloc(512, GFP_KERNEL); + if (tx_cb->oal == NULL) + return -ENOMEM; + } + return 0; +} + +static int ql_alloc_mem_resources(struct ql3_adapter *qdev) +{ + if (qdev->ndev->mtu == NORMAL_MTU_SIZE) { + qdev->num_lbufq_entries = NUM_LBUFQ_ENTRIES; + qdev->lrg_buffer_len = NORMAL_MTU_SIZE; + } else if (qdev->ndev->mtu == JUMBO_MTU_SIZE) { + /* + * Bigger buffers, so less of them. + */ + qdev->num_lbufq_entries = JUMBO_NUM_LBUFQ_ENTRIES; + qdev->lrg_buffer_len = JUMBO_MTU_SIZE; + } else { + netdev_err(qdev->ndev, "Invalid mtu size: %d. Only %d and %d are accepted.\n", + qdev->ndev->mtu, NORMAL_MTU_SIZE, JUMBO_MTU_SIZE); + return -ENOMEM; + } + qdev->num_large_buffers = + qdev->num_lbufq_entries * QL_ADDR_ELE_PER_BUFQ_ENTRY; + qdev->lrg_buffer_len += VLAN_ETH_HLEN + VLAN_ID_LEN + QL_HEADER_SPACE; + qdev->max_frame_size = + (qdev->lrg_buffer_len - QL_HEADER_SPACE) + ETHERNET_CRC_SIZE; + + /* + * First allocate a page of shared memory and use it for shadow + * locations of Network Request Queue Consumer Address Register and + * Network Completion Queue Producer Index Register + */ + qdev->shadow_reg_virt_addr = + pci_alloc_consistent(qdev->pdev, + PAGE_SIZE, &qdev->shadow_reg_phy_addr); + + if (qdev->shadow_reg_virt_addr != NULL) { + qdev->preq_consumer_index = qdev->shadow_reg_virt_addr; + qdev->req_consumer_index_phy_addr_high = + MS_64BITS(qdev->shadow_reg_phy_addr); + qdev->req_consumer_index_phy_addr_low = + LS_64BITS(qdev->shadow_reg_phy_addr); + + qdev->prsp_producer_index = + (__le32 *) (((u8 *) qdev->preq_consumer_index) + 8); + qdev->rsp_producer_index_phy_addr_high = + qdev->req_consumer_index_phy_addr_high; + qdev->rsp_producer_index_phy_addr_low = + qdev->req_consumer_index_phy_addr_low + 8; + } else { + netdev_err(qdev->ndev, "shadowReg Alloc failed\n"); + return -ENOMEM; + } + + if (ql_alloc_net_req_rsp_queues(qdev) != 0) { + netdev_err(qdev->ndev, "ql_alloc_net_req_rsp_queues failed\n"); + goto err_req_rsp; + } + + if (ql_alloc_buffer_queues(qdev) != 0) { + netdev_err(qdev->ndev, "ql_alloc_buffer_queues failed\n"); + goto err_buffer_queues; + } + + if (ql_alloc_small_buffers(qdev) != 0) { + netdev_err(qdev->ndev, "ql_alloc_small_buffers failed\n"); + goto err_small_buffers; + } + + if (ql_alloc_large_buffers(qdev) != 0) { + netdev_err(qdev->ndev, "ql_alloc_large_buffers failed\n"); + goto err_small_buffers; + } + + /* Initialize the large buffer queue. 
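+ * (copy the DMA address of every receive buffer into the queue
+ * that is handed to the chip)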
*/ + ql_init_large_buffers(qdev); + if (ql_create_send_free_list(qdev)) + goto err_free_list; + + qdev->rsp_current = qdev->rsp_q_virt_addr; + + return 0; +err_free_list: + ql_free_send_free_list(qdev); +err_small_buffers: + ql_free_buffer_queues(qdev); +err_buffer_queues: + ql_free_net_req_rsp_queues(qdev); +err_req_rsp: + pci_free_consistent(qdev->pdev, + PAGE_SIZE, + qdev->shadow_reg_virt_addr, + qdev->shadow_reg_phy_addr); + + return -ENOMEM; +} + +static void ql_free_mem_resources(struct ql3_adapter *qdev) +{ + ql_free_send_free_list(qdev); + ql_free_large_buffers(qdev); + ql_free_small_buffers(qdev); + ql_free_buffer_queues(qdev); + ql_free_net_req_rsp_queues(qdev); + if (qdev->shadow_reg_virt_addr != NULL) { + pci_free_consistent(qdev->pdev, + PAGE_SIZE, + qdev->shadow_reg_virt_addr, + qdev->shadow_reg_phy_addr); + qdev->shadow_reg_virt_addr = NULL; + } +} + +static int ql_init_misc_registers(struct ql3_adapter *qdev) +{ + struct ql3xxx_local_ram_registers __iomem *local_ram = + (void __iomem *)qdev->mem_map_registers; + + if (ql_sem_spinlock(qdev, QL_DDR_RAM_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | (qdev->mac_index) * + 2) << 4)) + return -1; + + ql_write_page2_reg(qdev, + &local_ram->bufletSize, qdev->nvram_data.bufletSize); + + ql_write_page2_reg(qdev, + &local_ram->maxBufletCount, + qdev->nvram_data.bufletCount); + + ql_write_page2_reg(qdev, + &local_ram->freeBufletThresholdLow, + (qdev->nvram_data.tcpWindowThreshold25 << 16) | + (qdev->nvram_data.tcpWindowThreshold0)); + + ql_write_page2_reg(qdev, + &local_ram->freeBufletThresholdHigh, + qdev->nvram_data.tcpWindowThreshold50); + + ql_write_page2_reg(qdev, + &local_ram->ipHashTableBase, + (qdev->nvram_data.ipHashTableBaseHi << 16) | + qdev->nvram_data.ipHashTableBaseLo); + ql_write_page2_reg(qdev, + &local_ram->ipHashTableCount, + qdev->nvram_data.ipHashTableSize); + ql_write_page2_reg(qdev, + &local_ram->tcpHashTableBase, + (qdev->nvram_data.tcpHashTableBaseHi << 16) | + qdev->nvram_data.tcpHashTableBaseLo); + ql_write_page2_reg(qdev, + &local_ram->tcpHashTableCount, + qdev->nvram_data.tcpHashTableSize); + ql_write_page2_reg(qdev, + &local_ram->ncbBase, + (qdev->nvram_data.ncbTableBaseHi << 16) | + qdev->nvram_data.ncbTableBaseLo); + ql_write_page2_reg(qdev, + &local_ram->maxNcbCount, + qdev->nvram_data.ncbTableSize); + ql_write_page2_reg(qdev, + &local_ram->drbBase, + (qdev->nvram_data.drbTableBaseHi << 16) | + qdev->nvram_data.drbTableBaseLo); + ql_write_page2_reg(qdev, + &local_ram->maxDrbCount, + qdev->nvram_data.drbTableSize); + ql_sem_unlock(qdev, QL_DDR_RAM_SEM_MASK); + return 0; +} + +static int ql_adapter_initialize(struct ql3_adapter *qdev) +{ + u32 value; + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + __iomem u32 *spir = &port_regs->CommonRegs.serialPortInterfaceReg; + struct ql3xxx_host_memory_registers __iomem *hmem_regs = + (void __iomem *)port_regs; + u32 delay = 10; + int status = 0; + + if (ql_mii_setup(qdev)) + return -1; + + /* Bring out PHY out of reset */ + ql_write_common_reg(qdev, spir, + (ISP_SERIAL_PORT_IF_WE | + (ISP_SERIAL_PORT_IF_WE << 16))); + /* Give the PHY time to come out of reset. */ + mdelay(100); + qdev->port_link_state = LS_DOWN; + netif_carrier_off(qdev->ndev); + + /* V2 chip fix for ARS-39168. 
*/ + ql_write_common_reg(qdev, spir, + (ISP_SERIAL_PORT_IF_SDE | + (ISP_SERIAL_PORT_IF_SDE << 16))); + + /* Request Queue Registers */ + *((u32 *)(qdev->preq_consumer_index)) = 0; + atomic_set(&qdev->tx_count, NUM_REQ_Q_ENTRIES); + qdev->req_producer_index = 0; + + ql_write_page1_reg(qdev, + &hmem_regs->reqConsumerIndexAddrHigh, + qdev->req_consumer_index_phy_addr_high); + ql_write_page1_reg(qdev, + &hmem_regs->reqConsumerIndexAddrLow, + qdev->req_consumer_index_phy_addr_low); + + ql_write_page1_reg(qdev, + &hmem_regs->reqBaseAddrHigh, + MS_64BITS(qdev->req_q_phy_addr)); + ql_write_page1_reg(qdev, + &hmem_regs->reqBaseAddrLow, + LS_64BITS(qdev->req_q_phy_addr)); + ql_write_page1_reg(qdev, &hmem_regs->reqLength, NUM_REQ_Q_ENTRIES); + + /* Response Queue Registers */ + *((__le16 *) (qdev->prsp_producer_index)) = 0; + qdev->rsp_consumer_index = 0; + qdev->rsp_current = qdev->rsp_q_virt_addr; + + ql_write_page1_reg(qdev, + &hmem_regs->rspProducerIndexAddrHigh, + qdev->rsp_producer_index_phy_addr_high); + + ql_write_page1_reg(qdev, + &hmem_regs->rspProducerIndexAddrLow, + qdev->rsp_producer_index_phy_addr_low); + + ql_write_page1_reg(qdev, + &hmem_regs->rspBaseAddrHigh, + MS_64BITS(qdev->rsp_q_phy_addr)); + + ql_write_page1_reg(qdev, + &hmem_regs->rspBaseAddrLow, + LS_64BITS(qdev->rsp_q_phy_addr)); + + ql_write_page1_reg(qdev, &hmem_regs->rspLength, NUM_RSP_Q_ENTRIES); + + /* Large Buffer Queue */ + ql_write_page1_reg(qdev, + &hmem_regs->rxLargeQBaseAddrHigh, + MS_64BITS(qdev->lrg_buf_q_phy_addr)); + + ql_write_page1_reg(qdev, + &hmem_regs->rxLargeQBaseAddrLow, + LS_64BITS(qdev->lrg_buf_q_phy_addr)); + + ql_write_page1_reg(qdev, + &hmem_regs->rxLargeQLength, + qdev->num_lbufq_entries); + + ql_write_page1_reg(qdev, + &hmem_regs->rxLargeBufferLength, + qdev->lrg_buffer_len); + + /* Small Buffer Queue */ + ql_write_page1_reg(qdev, + &hmem_regs->rxSmallQBaseAddrHigh, + MS_64BITS(qdev->small_buf_q_phy_addr)); + + ql_write_page1_reg(qdev, + &hmem_regs->rxSmallQBaseAddrLow, + LS_64BITS(qdev->small_buf_q_phy_addr)); + + ql_write_page1_reg(qdev, &hmem_regs->rxSmallQLength, NUM_SBUFQ_ENTRIES); + ql_write_page1_reg(qdev, + &hmem_regs->rxSmallBufferLength, + QL_SMALL_BUFFER_SIZE); + + qdev->small_buf_q_producer_index = NUM_SBUFQ_ENTRIES - 1; + qdev->small_buf_release_cnt = 8; + qdev->lrg_buf_q_producer_index = qdev->num_lbufq_entries - 1; + qdev->lrg_buf_release_cnt = 8; + qdev->lrg_buf_next_free = qdev->lrg_buf_q_virt_addr; + qdev->small_buf_index = 0; + qdev->lrg_buf_index = 0; + qdev->lrg_buf_free_count = 0; + qdev->lrg_buf_free_head = NULL; + qdev->lrg_buf_free_tail = NULL; + + ql_write_common_reg(qdev, + &port_regs->CommonRegs. + rxSmallQProducerIndex, + qdev->small_buf_q_producer_index); + ql_write_common_reg(qdev, + &port_regs->CommonRegs. + rxLargeQProducerIndex, + qdev->lrg_buf_q_producer_index); + + /* + * Find out if the chip has already been initialized. If it has, then + * we skip some of the initialization. + */ + clear_bit(QL_LINK_MASTER, &qdev->flags); + value = ql_read_page0_reg(qdev, &port_regs->portStatus); + if ((value & PORT_STATUS_IC) == 0) { + + /* Chip has not been configured yet, so let it rip. 
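+ * Program the local RAM layout, the TCP window size and the
+ * external hardware configuration from the NVRAM image.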
*/ + if (ql_init_misc_registers(qdev)) { + status = -1; + goto out; + } + + value = qdev->nvram_data.tcpMaxWindowSize; + ql_write_page0_reg(qdev, &port_regs->tcpMaxWindow, value); + + value = (0xFFFF << 16) | qdev->nvram_data.extHwConfig; + + if (ql_sem_spinlock(qdev, QL_FLASH_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | (qdev->mac_index) + * 2) << 13)) { + status = -1; + goto out; + } + ql_write_page0_reg(qdev, &port_regs->ExternalHWConfig, value); + ql_write_page0_reg(qdev, &port_regs->InternalChipConfig, + (((INTERNAL_CHIP_SD | INTERNAL_CHIP_WE) << + 16) | (INTERNAL_CHIP_SD | + INTERNAL_CHIP_WE))); + ql_sem_unlock(qdev, QL_FLASH_SEM_MASK); + } + + if (qdev->mac_index) + ql_write_page0_reg(qdev, + &port_regs->mac1MaxFrameLengthReg, + qdev->max_frame_size); + else + ql_write_page0_reg(qdev, + &port_regs->mac0MaxFrameLengthReg, + qdev->max_frame_size); + + if (ql_sem_spinlock(qdev, QL_PHY_GIO_SEM_MASK, + (QL_RESOURCE_BITS_BASE_CODE | (qdev->mac_index) * + 2) << 7)) { + status = -1; + goto out; + } + + PHY_Setup(qdev); + ql_init_scan_mode(qdev); + ql_get_phy_owner(qdev); + + /* Load the MAC Configuration */ + + /* Program lower 32 bits of the MAC address */ + ql_write_page0_reg(qdev, &port_regs->macAddrIndirectPtrReg, + (MAC_ADDR_INDIRECT_PTR_REG_RP_MASK << 16)); + ql_write_page0_reg(qdev, &port_regs->macAddrDataReg, + ((qdev->ndev->dev_addr[2] << 24) + | (qdev->ndev->dev_addr[3] << 16) + | (qdev->ndev->dev_addr[4] << 8) + | qdev->ndev->dev_addr[5])); + + /* Program top 16 bits of the MAC address */ + ql_write_page0_reg(qdev, &port_regs->macAddrIndirectPtrReg, + ((MAC_ADDR_INDIRECT_PTR_REG_RP_MASK << 16) | 1)); + ql_write_page0_reg(qdev, &port_regs->macAddrDataReg, + ((qdev->ndev->dev_addr[0] << 8) + | qdev->ndev->dev_addr[1])); + + /* Enable Primary MAC */ + ql_write_page0_reg(qdev, &port_regs->macAddrIndirectPtrReg, + ((MAC_ADDR_INDIRECT_PTR_REG_PE << 16) | + MAC_ADDR_INDIRECT_PTR_REG_PE)); + + /* Clear Primary and Secondary IP addresses */ + ql_write_page0_reg(qdev, &port_regs->ipAddrIndexReg, + ((IP_ADDR_INDEX_REG_MASK << 16) | + (qdev->mac_index << 2))); + ql_write_page0_reg(qdev, &port_regs->ipAddrDataReg, 0); + + ql_write_page0_reg(qdev, &port_regs->ipAddrIndexReg, + ((IP_ADDR_INDEX_REG_MASK << 16) | + ((qdev->mac_index << 2) + 1))); + ql_write_page0_reg(qdev, &port_regs->ipAddrDataReg, 0); + + ql_sem_unlock(qdev, QL_PHY_GIO_SEM_MASK); + + /* Indicate Configuration Complete */ + ql_write_page0_reg(qdev, + &port_regs->portControl, + ((PORT_CONTROL_CC << 16) | PORT_CONTROL_CC)); + + do { + value = ql_read_page0_reg(qdev, &port_regs->portStatus); + if (value & PORT_STATUS_IC) + break; + spin_unlock_irq(&qdev->hw_lock); + msleep(500); + spin_lock_irq(&qdev->hw_lock); + } while (--delay); + + if (delay == 0) { + netdev_err(qdev->ndev, "Hw Initialization timeout\n"); + status = -1; + goto out; + } + + /* Enable Ethernet Function */ + if (qdev->device_id == QL3032_DEVICE_ID) { + value = + (QL3032_PORT_CONTROL_EF | QL3032_PORT_CONTROL_KIE | + QL3032_PORT_CONTROL_EIv6 | QL3032_PORT_CONTROL_EIv4 | + QL3032_PORT_CONTROL_ET); + ql_write_page0_reg(qdev, &port_regs->functionControl, + ((value << 16) | value)); + } else { + value = + (PORT_CONTROL_EF | PORT_CONTROL_ET | PORT_CONTROL_EI | + PORT_CONTROL_HH); + ql_write_page0_reg(qdev, &port_regs->portControl, + ((value << 16) | value)); + } + + +out: + return status; +} + +/* + * Caller holds hw_lock. 
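+ * Issues a soft reset and polls up to five seconds for it to
+ * complete, falling back to a force soft reset if the chip does
+ * not respond.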
+ */ +static int ql_adapter_reset(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + int status = 0; + u16 value; + int max_wait_time; + + set_bit(QL_RESET_ACTIVE, &qdev->flags); + clear_bit(QL_RESET_DONE, &qdev->flags); + + /* + * Issue soft reset to chip. + */ + netdev_printk(KERN_DEBUG, qdev->ndev, "Issue soft reset to chip\n"); + ql_write_common_reg(qdev, + &port_regs->CommonRegs.ispControlStatus, + ((ISP_CONTROL_SR << 16) | ISP_CONTROL_SR)); + + /* Wait 3 seconds for reset to complete. */ + netdev_printk(KERN_DEBUG, qdev->ndev, + "Wait 10 milliseconds for reset to complete\n"); + + /* Wait until the firmware tells us the Soft Reset is done */ + max_wait_time = 5; + do { + value = + ql_read_common_reg(qdev, + &port_regs->CommonRegs.ispControlStatus); + if ((value & ISP_CONTROL_SR) == 0) + break; + + ssleep(1); + } while ((--max_wait_time)); + + /* + * Also, make sure that the Network Reset Interrupt bit has been + * cleared after the soft reset has taken place. + */ + value = + ql_read_common_reg(qdev, &port_regs->CommonRegs.ispControlStatus); + if (value & ISP_CONTROL_RI) { + netdev_printk(KERN_DEBUG, qdev->ndev, + "clearing RI after reset\n"); + ql_write_common_reg(qdev, + &port_regs->CommonRegs. + ispControlStatus, + ((ISP_CONTROL_RI << 16) | ISP_CONTROL_RI)); + } + + if (max_wait_time == 0) { + /* Issue Force Soft Reset */ + ql_write_common_reg(qdev, + &port_regs->CommonRegs. + ispControlStatus, + ((ISP_CONTROL_FSR << 16) | + ISP_CONTROL_FSR)); + /* + * Wait until the firmware tells us the Force Soft Reset is + * done + */ + max_wait_time = 5; + do { + value = ql_read_common_reg(qdev, + &port_regs->CommonRegs. + ispControlStatus); + if ((value & ISP_CONTROL_FSR) == 0) + break; + ssleep(1); + } while ((--max_wait_time)); + } + if (max_wait_time == 0) + status = 1; + + clear_bit(QL_RESET_ACTIVE, &qdev->flags); + set_bit(QL_RESET_DONE, &qdev->flags); + return status; +} + +static void ql_set_mac_info(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 value, port_status; + u8 func_number; + + /* Get the function number */ + value = + ql_read_common_reg_l(qdev, &port_regs->CommonRegs.ispControlStatus); + func_number = (u8) ((value >> 4) & OPCODE_FUNC_ID_MASK); + port_status = ql_read_page0_reg(qdev, &port_regs->portStatus); + switch (value & ISP_CONTROL_FN_MASK) { + case ISP_CONTROL_FN0_NET: + qdev->mac_index = 0; + qdev->mac_ob_opcode = OUTBOUND_MAC_IOCB | func_number; + qdev->mb_bit_mask = FN0_MA_BITS_MASK; + qdev->PHYAddr = PORT0_PHY_ADDRESS; + if (port_status & PORT_STATUS_SM0) + set_bit(QL_LINK_OPTICAL, &qdev->flags); + else + clear_bit(QL_LINK_OPTICAL, &qdev->flags); + break; + + case ISP_CONTROL_FN1_NET: + qdev->mac_index = 1; + qdev->mac_ob_opcode = OUTBOUND_MAC_IOCB | func_number; + qdev->mb_bit_mask = FN1_MA_BITS_MASK; + qdev->PHYAddr = PORT1_PHY_ADDRESS; + if (port_status & PORT_STATUS_SM1) + set_bit(QL_LINK_OPTICAL, &qdev->flags); + else + clear_bit(QL_LINK_OPTICAL, &qdev->flags); + break; + + case ISP_CONTROL_FN0_SCSI: + case ISP_CONTROL_FN1_SCSI: + default: + netdev_printk(KERN_DEBUG, qdev->ndev, + "Invalid function number, ispControlStatus = 0x%x\n", + value); + break; + } + qdev->numPorts = qdev->nvram_data.version_and_numPorts >> 8; +} + +static void ql_display_dev_info(struct net_device *ndev) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + struct pci_dev *pdev = qdev->pdev; + + netdev_info(ndev, + "%s Adapter %d RevisionID %d found %s on PCI 
slot %d\n", + DRV_NAME, qdev->index, qdev->chip_rev_id, + qdev->device_id == QL3032_DEVICE_ID ? "QLA3032" : "QLA3022", + qdev->pci_slot); + netdev_info(ndev, "%s Interface\n", + test_bit(QL_LINK_OPTICAL, &qdev->flags) ? "OPTICAL" : "COPPER"); + + /* + * Print PCI bus width/type. + */ + netdev_info(ndev, "Bus interface is %s %s\n", + ((qdev->pci_width == 64) ? "64-bit" : "32-bit"), + ((qdev->pci_x) ? "PCI-X" : "PCI")); + + netdev_info(ndev, "mem IO base address adjusted = 0x%p\n", + qdev->mem_map_registers); + netdev_info(ndev, "Interrupt number = %d\n", pdev->irq); + + netif_info(qdev, probe, ndev, "MAC address %pM\n", ndev->dev_addr); +} + +static int ql_adapter_down(struct ql3_adapter *qdev, int do_reset) +{ + struct net_device *ndev = qdev->ndev; + int retval = 0; + + netif_stop_queue(ndev); + netif_carrier_off(ndev); + + clear_bit(QL_ADAPTER_UP, &qdev->flags); + clear_bit(QL_LINK_MASTER, &qdev->flags); + + ql_disable_interrupts(qdev); + + free_irq(qdev->pdev->irq, ndev); + + if (qdev->msi && test_bit(QL_MSI_ENABLED, &qdev->flags)) { + netdev_info(qdev->ndev, "calling pci_disable_msi()\n"); + clear_bit(QL_MSI_ENABLED, &qdev->flags); + pci_disable_msi(qdev->pdev); + } + + del_timer_sync(&qdev->adapter_timer); + + napi_disable(&qdev->napi); + + if (do_reset) { + int soft_reset; + unsigned long hw_flags; + + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + if (ql_wait_for_drvr_lock(qdev)) { + soft_reset = ql_adapter_reset(qdev); + if (soft_reset) { + netdev_err(ndev, "ql_adapter_reset(%d) FAILED!\n", + qdev->index); + } + netdev_err(ndev, + "Releasing driver lock via chip reset\n"); + } else { + netdev_err(ndev, + "Could not acquire driver lock to do reset!\n"); + retval = -1; + } + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + } + ql_free_mem_resources(qdev); + return retval; +} + +static int ql_adapter_up(struct ql3_adapter *qdev) +{ + struct net_device *ndev = qdev->ndev; + int err; + unsigned long irq_flags = IRQF_SHARED; + unsigned long hw_flags; + + if (ql_alloc_mem_resources(qdev)) { + netdev_err(ndev, "Unable to allocate buffers\n"); + return -ENOMEM; + } + + if (qdev->msi) { + if (pci_enable_msi(qdev->pdev)) { + netdev_err(ndev, + "User requested MSI, but MSI failed to initialize. 
Continuing without MSI.\n"); + qdev->msi = 0; + } else { + netdev_info(ndev, "MSI Enabled...\n"); + set_bit(QL_MSI_ENABLED, &qdev->flags); + irq_flags &= ~IRQF_SHARED; + } + } + + err = request_irq(qdev->pdev->irq, ql3xxx_isr, + irq_flags, ndev->name, ndev); + if (err) { + netdev_err(ndev, + "Failed to reserve interrupt %d - already in use\n", + qdev->pdev->irq); + goto err_irq; + } + + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + + err = ql_wait_for_drvr_lock(qdev); + if (err) { + err = ql_adapter_initialize(qdev); + if (err) { + netdev_err(ndev, "Unable to initialize adapter\n"); + goto err_init; + } + netdev_err(ndev, "Releasing driver lock\n"); + ql_sem_unlock(qdev, QL_DRVR_SEM_MASK); + } else { + netdev_err(ndev, "Could not acquire driver lock\n"); + goto err_lock; + } + + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + + set_bit(QL_ADAPTER_UP, &qdev->flags); + + mod_timer(&qdev->adapter_timer, jiffies + HZ * 1); + + napi_enable(&qdev->napi); + ql_enable_interrupts(qdev); + return 0; + +err_init: + ql_sem_unlock(qdev, QL_DRVR_SEM_MASK); +err_lock: + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + free_irq(qdev->pdev->irq, ndev); +err_irq: + if (qdev->msi && test_bit(QL_MSI_ENABLED, &qdev->flags)) { + netdev_info(ndev, "calling pci_disable_msi()\n"); + clear_bit(QL_MSI_ENABLED, &qdev->flags); + pci_disable_msi(qdev->pdev); + } + return err; +} + +static int ql_cycle_adapter(struct ql3_adapter *qdev, int reset) +{ + if (ql_adapter_down(qdev, reset) || ql_adapter_up(qdev)) { + netdev_err(qdev->ndev, + "Driver up/down cycle failed, closing device\n"); + rtnl_lock(); + dev_close(qdev->ndev); + rtnl_unlock(); + return -1; + } + return 0; +} + +static int ql3xxx_close(struct net_device *ndev) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + + /* + * Wait for device to recover from a reset. + * (Rarely happens, but possible.) + */ + while (!test_bit(QL_ADAPTER_UP, &qdev->flags)) + msleep(50); + + ql_adapter_down(qdev, QL_DO_RESET); + return 0; +} + +static int ql3xxx_open(struct net_device *ndev) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + return ql_adapter_up(qdev); +} + +static int ql3xxx_set_mac_address(struct net_device *ndev, void *p) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + struct sockaddr *addr = p; + unsigned long hw_flags; + + if (netif_running(ndev)) + return -EBUSY; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(ndev->dev_addr, addr->sa_data, ndev->addr_len); + + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + /* Program lower 32 bits of the MAC address */ + ql_write_page0_reg(qdev, &port_regs->macAddrIndirectPtrReg, + (MAC_ADDR_INDIRECT_PTR_REG_RP_MASK << 16)); + ql_write_page0_reg(qdev, &port_regs->macAddrDataReg, + ((ndev->dev_addr[2] << 24) | (ndev-> + dev_addr[3] << 16) | + (ndev->dev_addr[4] << 8) | ndev->dev_addr[5])); + + /* Program top 16 bits of the MAC address */ + ql_write_page0_reg(qdev, &port_regs->macAddrIndirectPtrReg, + ((MAC_ADDR_INDIRECT_PTR_REG_RP_MASK << 16) | 1)); + ql_write_page0_reg(qdev, &port_regs->macAddrDataReg, + ((ndev->dev_addr[0] << 8) | ndev->dev_addr[1])); + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + + return 0; +} + +static void ql3xxx_tx_timeout(struct net_device *ndev) +{ + struct ql3_adapter *qdev = netdev_priv(ndev); + + netdev_err(ndev, "Resetting...\n"); + /* + * Stop the queues, we've got a problem. + */ + netif_stop_queue(ndev); + + /* + * Wake up the worker to process this event. 
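+ * (ql_tx_timeout_work() cycles the adapter through a full reset)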
+ */ + queue_delayed_work(qdev->workqueue, &qdev->tx_timeout_work, 0); +} + +static void ql_reset_work(struct work_struct *work) +{ + struct ql3_adapter *qdev = + container_of(work, struct ql3_adapter, reset_work.work); + struct net_device *ndev = qdev->ndev; + u32 value; + struct ql_tx_buf_cb *tx_cb; + int max_wait_time, i; + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + unsigned long hw_flags; + + if (test_bit((QL_RESET_PER_SCSI | QL_RESET_START), &qdev->flags)) { + clear_bit(QL_LINK_MASTER, &qdev->flags); + + /* + * Loop through the active list and return the skb. + */ + for (i = 0; i < NUM_REQ_Q_ENTRIES; i++) { + int j; + tx_cb = &qdev->tx_buf[i]; + if (tx_cb->skb) { + netdev_printk(KERN_DEBUG, ndev, + "Freeing lost SKB\n"); + pci_unmap_single(qdev->pdev, + dma_unmap_addr(&tx_cb->map[0], + mapaddr), + dma_unmap_len(&tx_cb->map[0], maplen), + PCI_DMA_TODEVICE); + for (j = 1; j < tx_cb->seg_count; j++) { + pci_unmap_page(qdev->pdev, + dma_unmap_addr(&tx_cb->map[j], + mapaddr), + dma_unmap_len(&tx_cb->map[j], + maplen), + PCI_DMA_TODEVICE); + } + dev_kfree_skb(tx_cb->skb); + tx_cb->skb = NULL; + } + } + + netdev_err(ndev, "Clearing NRI after reset\n"); + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + ql_write_common_reg(qdev, + &port_regs->CommonRegs. + ispControlStatus, + ((ISP_CONTROL_RI << 16) | ISP_CONTROL_RI)); + /* + * Wait the for Soft Reset to Complete. + */ + max_wait_time = 10; + do { + value = ql_read_common_reg(qdev, + &port_regs->CommonRegs. + + ispControlStatus); + if ((value & ISP_CONTROL_SR) == 0) { + netdev_printk(KERN_DEBUG, ndev, + "reset completed\n"); + break; + } + + if (value & ISP_CONTROL_RI) { + netdev_printk(KERN_DEBUG, ndev, + "clearing NRI after reset\n"); + ql_write_common_reg(qdev, + &port_regs-> + CommonRegs. + ispControlStatus, + ((ISP_CONTROL_RI << + 16) | ISP_CONTROL_RI)); + } + + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + ssleep(1); + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + } while (--max_wait_time); + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + + if (value & ISP_CONTROL_SR) { + + /* + * Set the reset flags and clear the board again. + * Nothing else to do... 
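+ * (ql_cycle_adapter() below takes the interface down and back up)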
+ */ + netdev_err(ndev, + "Timed out waiting for reset to complete\n"); + netdev_err(ndev, "Do a reset\n"); + clear_bit(QL_RESET_PER_SCSI, &qdev->flags); + clear_bit(QL_RESET_START, &qdev->flags); + ql_cycle_adapter(qdev, QL_DO_RESET); + return; + } + + clear_bit(QL_RESET_ACTIVE, &qdev->flags); + clear_bit(QL_RESET_PER_SCSI, &qdev->flags); + clear_bit(QL_RESET_START, &qdev->flags); + ql_cycle_adapter(qdev, QL_NO_RESET); + } +} + +static void ql_tx_timeout_work(struct work_struct *work) +{ + struct ql3_adapter *qdev = + container_of(work, struct ql3_adapter, tx_timeout_work.work); + + ql_cycle_adapter(qdev, QL_DO_RESET); +} + +static void ql_get_board_info(struct ql3_adapter *qdev) +{ + struct ql3xxx_port_registers __iomem *port_regs = + qdev->mem_map_registers; + u32 value; + + value = ql_read_page0_reg_l(qdev, &port_regs->portStatus); + + qdev->chip_rev_id = ((value & PORT_STATUS_REV_ID_MASK) >> 12); + if (value & PORT_STATUS_64) + qdev->pci_width = 64; + else + qdev->pci_width = 32; + if (value & PORT_STATUS_X) + qdev->pci_x = 1; + else + qdev->pci_x = 0; + qdev->pci_slot = (u8) PCI_SLOT(qdev->pdev->devfn); +} + +static void ql3xxx_timer(struct timer_list *t) +{ + struct ql3_adapter *qdev = from_timer(qdev, t, adapter_timer); + queue_delayed_work(qdev->workqueue, &qdev->link_state_work, 0); +} + +static const struct net_device_ops ql3xxx_netdev_ops = { + .ndo_open = ql3xxx_open, + .ndo_start_xmit = ql3xxx_send, + .ndo_stop = ql3xxx_close, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_mac_address = ql3xxx_set_mac_address, + .ndo_tx_timeout = ql3xxx_tx_timeout, +}; + +static int ql3xxx_probe(struct pci_dev *pdev, + const struct pci_device_id *pci_entry) +{ + struct net_device *ndev = NULL; + struct ql3_adapter *qdev = NULL; + static int cards_found; + int uninitialized_var(pci_using_dac), err; + + err = pci_enable_device(pdev); + if (err) { + pr_err("%s cannot enable PCI device\n", pci_name(pdev)); + goto err_out; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + pr_err("%s cannot obtain PCI resources\n", pci_name(pdev)); + goto err_out_disable_pdev; + } + + pci_set_master(pdev); + + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + pci_using_dac = 1; + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + } else if (!(err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)))) { + pci_using_dac = 0; + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + } + + if (err) { + pr_err("%s no usable DMA configuration\n", pci_name(pdev)); + goto err_out_free_regions; + } + + ndev = alloc_etherdev(sizeof(struct ql3_adapter)); + if (!ndev) { + err = -ENOMEM; + goto err_out_free_regions; + } + + SET_NETDEV_DEV(ndev, &pdev->dev); + + pci_set_drvdata(pdev, ndev); + + qdev = netdev_priv(ndev); + qdev->index = cards_found; + qdev->ndev = ndev; + qdev->pdev = pdev; + qdev->device_id = pci_entry->device; + qdev->port_link_state = LS_DOWN; + if (msi) + qdev->msi = 1; + + qdev->msg_enable = netif_msg_init(debug, default_msg); + + if (pci_using_dac) + ndev->features |= NETIF_F_HIGHDMA; + if (qdev->device_id == QL3032_DEVICE_ID) + ndev->features |= NETIF_F_IP_CSUM | NETIF_F_SG; + + qdev->mem_map_registers = pci_ioremap_bar(pdev, 1); + if (!qdev->mem_map_registers) { + pr_err("%s: cannot map device registers\n", pci_name(pdev)); + err = -EIO; + goto err_out_free_ndev; + } + + spin_lock_init(&qdev->adapter_lock); + spin_lock_init(&qdev->hw_lock); + + /* Set driver entry points */ + ndev->netdev_ops = &ql3xxx_netdev_ops; + ndev->ethtool_ops = &ql3xxx_ethtool_ops; + 
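+ /* Give a transmit five seconds to complete before ql3xxx_tx_timeout() runs. */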
ndev->watchdog_timeo = 5 * HZ; + + netif_napi_add(ndev, &qdev->napi, ql_poll, 64); + + ndev->irq = pdev->irq; + + /* make sure the EEPROM is good */ + if (ql_get_nvram_params(qdev)) { + pr_alert("%s: Adapter #%d, Invalid NVRAM parameters\n", + __func__, qdev->index); + err = -EIO; + goto err_out_iounmap; + } + + ql_set_mac_info(qdev); + + /* Validate and set parameters */ + if (qdev->mac_index) { + ndev->mtu = qdev->nvram_data.macCfg_port1.etherMtu_mac ; + ql_set_mac_addr(ndev, qdev->nvram_data.funcCfg_fn2.macAddress); + } else { + ndev->mtu = qdev->nvram_data.macCfg_port0.etherMtu_mac ; + ql_set_mac_addr(ndev, qdev->nvram_data.funcCfg_fn0.macAddress); + } + + ndev->tx_queue_len = NUM_REQ_Q_ENTRIES; + + /* Record PCI bus information. */ + ql_get_board_info(qdev); + + /* + * Set the Maximum Memory Read Byte Count value. We do this to handle + * jumbo frames. + */ + if (qdev->pci_x) + pci_write_config_word(pdev, (int)0x4e, (u16) 0x0036); + + err = register_netdev(ndev); + if (err) { + pr_err("%s: cannot register net device\n", pci_name(pdev)); + goto err_out_iounmap; + } + + /* we're going to reset, so assume we have no link for now */ + + netif_carrier_off(ndev); + netif_stop_queue(ndev); + + qdev->workqueue = create_singlethread_workqueue(ndev->name); + INIT_DELAYED_WORK(&qdev->reset_work, ql_reset_work); + INIT_DELAYED_WORK(&qdev->tx_timeout_work, ql_tx_timeout_work); + INIT_DELAYED_WORK(&qdev->link_state_work, ql_link_state_machine_work); + + timer_setup(&qdev->adapter_timer, ql3xxx_timer, 0); + qdev->adapter_timer.expires = jiffies + HZ * 2; /* two second delay */ + + if (!cards_found) { + pr_alert("%s\n", DRV_STRING); + pr_alert("Driver name: %s, Version: %s\n", + DRV_NAME, DRV_VERSION); + } + ql_display_dev_info(ndev); + + cards_found++; + return 0; + +err_out_iounmap: + iounmap(qdev->mem_map_registers); +err_out_free_ndev: + free_netdev(ndev); +err_out_free_regions: + pci_release_regions(pdev); +err_out_disable_pdev: + pci_disable_device(pdev); +err_out: + return err; +} + +static void ql3xxx_remove(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql3_adapter *qdev = netdev_priv(ndev); + + unregister_netdev(ndev); + + ql_disable_interrupts(qdev); + + if (qdev->workqueue) { + cancel_delayed_work(&qdev->reset_work); + cancel_delayed_work(&qdev->tx_timeout_work); + destroy_workqueue(qdev->workqueue); + qdev->workqueue = NULL; + } + + iounmap(qdev->mem_map_registers); + pci_release_regions(pdev); + free_netdev(ndev); +} + +static struct pci_driver ql3xxx_driver = { + + .name = DRV_NAME, + .id_table = ql3xxx_pci_tbl, + .probe = ql3xxx_probe, + .remove = ql3xxx_remove, +}; + +module_pci_driver(ql3xxx_driver); diff --git a/drivers/net/ethernet/qlogic/qla3xxx.h b/drivers/net/ethernet/qlogic/qla3xxx.h new file mode 100644 index 0000000..73e2343 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qla3xxx.h @@ -0,0 +1,1189 @@ +/* + * QLogic QLA3xxx NIC HBA Driver + * Copyright (c) 2003-2006 QLogic Corporation + * + * See LICENSE.qla3xxx for copyright and licensing details. + */ +#ifndef _QLA3XXX_H_ +#define _QLA3XXX_H_ + +/* + * IOCB Definitions... 
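+ *
+ * These are the control blocks exchanged with the chip through the
+ * request and response queues; the layouts must match the hardware
+ * exactly, hence the #pragma pack(1) below.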
+ */ +#pragma pack(1) + +#define OPCODE_OB_MAC_IOCB_FN0 0x01 +#define OPCODE_OB_MAC_IOCB_FN2 0x21 + +#define OPCODE_IB_MAC_IOCB 0xF9 +#define OPCODE_IB_3032_MAC_IOCB 0x09 +#define OPCODE_IB_IP_IOCB 0xFA +#define OPCODE_IB_3032_IP_IOCB 0x0A + +#define OPCODE_FUNC_ID_MASK 0x30 +#define OUTBOUND_MAC_IOCB 0x01 /* plus function bits */ + +#define FN0_MA_BITS_MASK 0x00 +#define FN1_MA_BITS_MASK 0x80 + +struct ob_mac_iocb_req { + u8 opcode; + u8 flags; +#define OB_MAC_IOCB_REQ_MA 0xe0 +#define OB_MAC_IOCB_REQ_F 0x10 +#define OB_MAC_IOCB_REQ_X 0x08 +#define OB_MAC_IOCB_REQ_D 0x02 +#define OB_MAC_IOCB_REQ_I 0x01 + u8 flags1; +#define OB_3032MAC_IOCB_REQ_IC 0x04 +#define OB_3032MAC_IOCB_REQ_TC 0x02 +#define OB_3032MAC_IOCB_REQ_UC 0x01 + u8 reserved0; + + u32 transaction_id; /* opaque for hardware */ + __le16 data_len; + u8 ip_hdr_off; + u8 ip_hdr_len; + __le32 reserved1; + __le32 reserved2; + __le32 buf_addr0_low; + __le32 buf_addr0_high; + __le32 buf_0_len; + __le32 buf_addr1_low; + __le32 buf_addr1_high; + __le32 buf_1_len; + __le32 buf_addr2_low; + __le32 buf_addr2_high; + __le32 buf_2_len; + __le32 reserved3; + __le32 reserved4; +}; +/* + * The following constants define control bits for buffer + * length fields for all IOCB's. + */ +#define OB_MAC_IOCB_REQ_E 0x80000000 /* Last valid buffer in list. */ +#define OB_MAC_IOCB_REQ_C 0x40000000 /* points to an OAL. (continuation) */ +#define OB_MAC_IOCB_REQ_L 0x20000000 /* Auburn local address pointer. */ +#define OB_MAC_IOCB_REQ_R 0x10000000 /* 32-bit address pointer. */ + +struct ob_mac_iocb_rsp { + u8 opcode; + u8 flags; +#define OB_MAC_IOCB_RSP_P 0x08 +#define OB_MAC_IOCB_RSP_L 0x04 +#define OB_MAC_IOCB_RSP_S 0x02 +#define OB_MAC_IOCB_RSP_I 0x01 + + __le16 reserved0; + u32 transaction_id; /* opaque for hardware */ + __le32 reserved1; + __le32 reserved2; +}; + +struct ib_mac_iocb_rsp { + u8 opcode; +#define IB_MAC_IOCB_RSP_V 0x80 + u8 flags; +#define IB_MAC_IOCB_RSP_S 0x80 +#define IB_MAC_IOCB_RSP_H1 0x40 +#define IB_MAC_IOCB_RSP_H0 0x20 +#define IB_MAC_IOCB_RSP_B 0x10 +#define IB_MAC_IOCB_RSP_M 0x08 +#define IB_MAC_IOCB_RSP_MA 0x07 + + __le16 length; + __le32 reserved; + __le32 ial_low; + __le32 ial_high; + +}; + +struct ob_ip_iocb_req { + u8 opcode; + __le16 flags; +#define OB_IP_IOCB_REQ_O 0x100 +#define OB_IP_IOCB_REQ_H 0x008 +#define OB_IP_IOCB_REQ_U 0x004 +#define OB_IP_IOCB_REQ_D 0x002 +#define OB_IP_IOCB_REQ_I 0x001 + + u8 reserved0; + + __le32 transaction_id; + __le16 data_len; + __le16 reserved1; + __le32 hncb_ptr_low; + __le32 hncb_ptr_high; + __le32 buf_addr0_low; + __le32 buf_addr0_high; + __le32 buf_0_len; + __le32 buf_addr1_low; + __le32 buf_addr1_high; + __le32 buf_1_len; + __le32 buf_addr2_low; + __le32 buf_addr2_high; + __le32 buf_2_len; + __le32 reserved2; + __le32 reserved3; +}; + +/* defines for BufferLength fields above */ +#define OB_IP_IOCB_REQ_E 0x80000000 +#define OB_IP_IOCB_REQ_C 0x40000000 +#define OB_IP_IOCB_REQ_L 0x20000000 +#define OB_IP_IOCB_REQ_R 0x10000000 + +struct ob_ip_iocb_rsp { + u8 opcode; + u8 flags; +#define OB_MAC_IOCB_RSP_H 0x10 +#define OB_MAC_IOCB_RSP_E 0x08 +#define OB_MAC_IOCB_RSP_L 0x04 +#define OB_MAC_IOCB_RSP_S 0x02 +#define OB_MAC_IOCB_RSP_I 0x01 + + __le16 reserved0; + __le32 transaction_id; + __le32 reserved1; + __le32 reserved2; +}; + +struct ib_ip_iocb_rsp { + u8 opcode; +#define IB_IP_IOCB_RSP_3032_V 0x80 +#define IB_IP_IOCB_RSP_3032_O 0x40 +#define IB_IP_IOCB_RSP_3032_I 0x20 +#define IB_IP_IOCB_RSP_3032_R 0x10 + u8 flags; +#define IB_IP_IOCB_RSP_S 0x80 +#define IB_IP_IOCB_RSP_H1 0x40 
+#define IB_IP_IOCB_RSP_H0 0x20 +#define IB_IP_IOCB_RSP_B 0x10 +#define IB_IP_IOCB_RSP_M 0x08 +#define IB_IP_IOCB_RSP_MA 0x07 + + __le16 length; + __le16 checksum; +#define IB_IP_IOCB_RSP_3032_ICE 0x01 +#define IB_IP_IOCB_RSP_3032_CE 0x02 +#define IB_IP_IOCB_RSP_3032_NUC 0x04 +#define IB_IP_IOCB_RSP_3032_UDP 0x08 +#define IB_IP_IOCB_RSP_3032_TCP 0x10 +#define IB_IP_IOCB_RSP_3032_IPE 0x20 + __le16 reserved; +#define IB_IP_IOCB_RSP_R 0x01 + __le32 ial_low; + __le32 ial_high; +}; + +struct net_rsp_iocb { + u8 opcode; + u8 flags; + __le16 reserved0; + __le32 reserved[3]; +}; +#pragma pack() + +/* + * Register Definitions... + */ +#define PORT0_PHY_ADDRESS 0x1e00 +#define PORT1_PHY_ADDRESS 0x1f00 + +#define ETHERNET_CRC_SIZE 4 + +#define MII_SCAN_REGISTER 0x00000001 + +#define PHY_ID_0_REG 2 +#define PHY_ID_1_REG 3 + +#define PHY_OUI_1_MASK 0xfc00 +#define PHY_MODEL_MASK 0x03f0 + +/* Address for the Agere Phy */ +#define MII_AGERE_ADDR_1 0x00001000 +#define MII_AGERE_ADDR_2 0x00001100 + +/* 32-bit ispControlStatus */ +enum { + ISP_CONTROL_NP_MASK = 0x0003, + ISP_CONTROL_NP_PCSR = 0x0000, + ISP_CONTROL_NP_HMCR = 0x0001, + ISP_CONTROL_NP_LRAMCR = 0x0002, + ISP_CONTROL_NP_PSR = 0x0003, + ISP_CONTROL_RI = 0x0008, + ISP_CONTROL_CI = 0x0010, + ISP_CONTROL_PI = 0x0020, + ISP_CONTROL_IN = 0x0040, + ISP_CONTROL_BE = 0x0080, + ISP_CONTROL_FN_MASK = 0x0700, + ISP_CONTROL_FN0_NET = 0x0400, + ISP_CONTROL_FN0_SCSI = 0x0500, + ISP_CONTROL_FN1_NET = 0x0600, + ISP_CONTROL_FN1_SCSI = 0x0700, + ISP_CONTROL_LINK_DN_0 = 0x0800, + ISP_CONTROL_LINK_DN_1 = 0x1000, + ISP_CONTROL_FSR = 0x2000, + ISP_CONTROL_FE = 0x4000, + ISP_CONTROL_SR = 0x8000, +}; + +/* 32-bit ispInterruptMaskReg */ +enum { + ISP_IMR_ENABLE_INT = 0x0004, + ISP_IMR_DISABLE_RESET_INT = 0x0008, + ISP_IMR_DISABLE_CMPL_INT = 0x0010, + ISP_IMR_DISABLE_PROC_INT = 0x0020, +}; + +/* 32-bit serialPortInterfaceReg */ +enum { + ISP_SERIAL_PORT_IF_CLK = 0x0001, + ISP_SERIAL_PORT_IF_CS = 0x0002, + ISP_SERIAL_PORT_IF_D0 = 0x0004, + ISP_SERIAL_PORT_IF_DI = 0x0008, + ISP_NVRAM_MASK = (0x000F << 16), + ISP_SERIAL_PORT_IF_WE = 0x0010, + ISP_SERIAL_PORT_IF_NVR_MASK = 0x001F, + ISP_SERIAL_PORT_IF_SCI = 0x0400, + ISP_SERIAL_PORT_IF_SC0 = 0x0800, + ISP_SERIAL_PORT_IF_SCE = 0x1000, + ISP_SERIAL_PORT_IF_SDI = 0x2000, + ISP_SERIAL_PORT_IF_SDO = 0x4000, + ISP_SERIAL_PORT_IF_SDE = 0x8000, + ISP_SERIAL_PORT_IF_I2C_MASK = 0xFC00, +}; + +/* semaphoreReg */ +enum { + QL_RESOURCE_MASK_BASE_CODE = 0x7, + QL_RESOURCE_BITS_BASE_CODE = 0x4, + QL_DRVR_SEM_BITS = (QL_RESOURCE_BITS_BASE_CODE << 1), + QL_DDR_RAM_SEM_BITS = (QL_RESOURCE_BITS_BASE_CODE << 4), + QL_PHY_GIO_SEM_BITS = (QL_RESOURCE_BITS_BASE_CODE << 7), + QL_NVRAM_SEM_BITS = (QL_RESOURCE_BITS_BASE_CODE << 10), + QL_FLASH_SEM_BITS = (QL_RESOURCE_BITS_BASE_CODE << 13), + QL_DRVR_SEM_MASK = (QL_RESOURCE_MASK_BASE_CODE << (1 + 16)), + QL_DDR_RAM_SEM_MASK = (QL_RESOURCE_MASK_BASE_CODE << (4 + 16)), + QL_PHY_GIO_SEM_MASK = (QL_RESOURCE_MASK_BASE_CODE << (7 + 16)), + QL_NVRAM_SEM_MASK = (QL_RESOURCE_MASK_BASE_CODE << (10 + 16)), + QL_FLASH_SEM_MASK = (QL_RESOURCE_MASK_BASE_CODE << (13 + 16)), +}; + + /* + * QL3XXX memory-mapped registers + * QL3XXX has 4 "pages" of registers, each page occupying + * 256 bytes. Each page has a "common" area at the start and then + * page-specific registers after that. 
+ */ +struct ql3xxx_common_registers { + u32 MB0; /* Offset 0x00 */ + u32 MB1; /* Offset 0x04 */ + u32 MB2; /* Offset 0x08 */ + u32 MB3; /* Offset 0x0c */ + u32 MB4; /* Offset 0x10 */ + u32 MB5; /* Offset 0x14 */ + u32 MB6; /* Offset 0x18 */ + u32 MB7; /* Offset 0x1c */ + u32 flashBiosAddr; + u32 flashBiosData; + u32 ispControlStatus; + u32 ispInterruptMaskReg; + u32 serialPortInterfaceReg; + u32 semaphoreReg; + u32 reqQProducerIndex; + u32 rspQConsumerIndex; + + u32 rxLargeQProducerIndex; + u32 rxSmallQProducerIndex; + u32 arcMadiCommand; + u32 arcMadiData; +}; + +enum { + EXT_HW_CONFIG_SP_MASK = 0x0006, + EXT_HW_CONFIG_SP_NONE = 0x0000, + EXT_HW_CONFIG_SP_BYTE_PARITY = 0x0002, + EXT_HW_CONFIG_SP_ECC = 0x0004, + EXT_HW_CONFIG_SP_ECCx = 0x0006, + EXT_HW_CONFIG_SIZE_MASK = 0x0060, + EXT_HW_CONFIG_SIZE_128M = 0x0000, + EXT_HW_CONFIG_SIZE_256M = 0x0020, + EXT_HW_CONFIG_SIZE_512M = 0x0040, + EXT_HW_CONFIG_SIZE_INVALID = 0x0060, + EXT_HW_CONFIG_PD = 0x0080, + EXT_HW_CONFIG_FW = 0x0200, + EXT_HW_CONFIG_US = 0x0400, + EXT_HW_CONFIG_DCS_MASK = 0x1800, + EXT_HW_CONFIG_DCS_9MA = 0x0000, + EXT_HW_CONFIG_DCS_15MA = 0x0800, + EXT_HW_CONFIG_DCS_18MA = 0x1000, + EXT_HW_CONFIG_DCS_24MA = 0x1800, + EXT_HW_CONFIG_DDS_MASK = 0x6000, + EXT_HW_CONFIG_DDS_9MA = 0x0000, + EXT_HW_CONFIG_DDS_15MA = 0x2000, + EXT_HW_CONFIG_DDS_18MA = 0x4000, + EXT_HW_CONFIG_DDS_24MA = 0x6000, +}; + +/* InternalChipConfig */ +enum { + INTERNAL_CHIP_DM = 0x0001, + INTERNAL_CHIP_SD = 0x0002, + INTERNAL_CHIP_RAP_MASK = 0x000C, + INTERNAL_CHIP_RAP_RR = 0x0000, + INTERNAL_CHIP_RAP_NRM = 0x0004, + INTERNAL_CHIP_RAP_ERM = 0x0008, + INTERNAL_CHIP_RAP_ERMx = 0x000C, + INTERNAL_CHIP_WE = 0x0010, + INTERNAL_CHIP_EF = 0x0020, + INTERNAL_CHIP_FR = 0x0040, + INTERNAL_CHIP_FW = 0x0080, + INTERNAL_CHIP_FI = 0x0100, + INTERNAL_CHIP_FT = 0x0200, +}; + +/* portControl */ +enum { + PORT_CONTROL_DS = 0x0001, + PORT_CONTROL_HH = 0x0002, + PORT_CONTROL_EI = 0x0004, + PORT_CONTROL_ET = 0x0008, + PORT_CONTROL_EF = 0x0010, + PORT_CONTROL_DRM = 0x0020, + PORT_CONTROL_RLB = 0x0040, + PORT_CONTROL_RCB = 0x0080, + PORT_CONTROL_MAC = 0x0100, + PORT_CONTROL_IPV = 0x0200, + PORT_CONTROL_IFP = 0x0400, + PORT_CONTROL_ITP = 0x0800, + PORT_CONTROL_FI = 0x1000, + PORT_CONTROL_DFP = 0x2000, + PORT_CONTROL_OI = 0x4000, + PORT_CONTROL_CC = 0x8000, +}; + +/* portStatus */ +enum { + PORT_STATUS_SM0 = 0x0001, + PORT_STATUS_SM1 = 0x0002, + PORT_STATUS_X = 0x0008, + PORT_STATUS_DL = 0x0080, + PORT_STATUS_IC = 0x0200, + PORT_STATUS_MRC = 0x0400, + PORT_STATUS_NL = 0x0800, + PORT_STATUS_REV_ID_MASK = 0x7000, + PORT_STATUS_REV_ID_1 = 0x1000, + PORT_STATUS_REV_ID_2 = 0x2000, + PORT_STATUS_REV_ID_3 = 0x3000, + PORT_STATUS_64 = 0x8000, + PORT_STATUS_UP0 = 0x10000, + PORT_STATUS_AC0 = 0x20000, + PORT_STATUS_AE0 = 0x40000, + PORT_STATUS_UP1 = 0x100000, + PORT_STATUS_AC1 = 0x200000, + PORT_STATUS_AE1 = 0x400000, + PORT_STATUS_F0_ENABLED = 0x1000000, + PORT_STATUS_F1_ENABLED = 0x2000000, + PORT_STATUS_F2_ENABLED = 0x4000000, + PORT_STATUS_F3_ENABLED = 0x8000000, +}; + +/* macMIIMgmtControlReg */ +enum { + MAC_ADDR_INDIRECT_PTR_REG_RP_MASK = 0x0003, + MAC_ADDR_INDIRECT_PTR_REG_RP_PRI_LWR = 0x0000, + MAC_ADDR_INDIRECT_PTR_REG_RP_PRI_UPR = 0x0001, + MAC_ADDR_INDIRECT_PTR_REG_RP_SEC_LWR = 0x0002, + MAC_ADDR_INDIRECT_PTR_REG_RP_SEC_UPR = 0x0003, + MAC_ADDR_INDIRECT_PTR_REG_PR = 0x0008, + MAC_ADDR_INDIRECT_PTR_REG_SS = 0x0010, + MAC_ADDR_INDIRECT_PTR_REG_SE = 0x0020, + MAC_ADDR_INDIRECT_PTR_REG_SP = 0x0040, + MAC_ADDR_INDIRECT_PTR_REG_PE = 0x0080, +}; + +/* macMIIMgmtControlReg */ +enum { + 
MAC_MII_CONTROL_RC = 0x0001, + MAC_MII_CONTROL_SC = 0x0002, + MAC_MII_CONTROL_AS = 0x0004, + MAC_MII_CONTROL_NP = 0x0008, + MAC_MII_CONTROL_CLK_SEL_MASK = 0x0070, + MAC_MII_CONTROL_CLK_SEL_DIV2 = 0x0000, + MAC_MII_CONTROL_CLK_SEL_DIV4 = 0x0010, + MAC_MII_CONTROL_CLK_SEL_DIV6 = 0x0020, + MAC_MII_CONTROL_CLK_SEL_DIV8 = 0x0030, + MAC_MII_CONTROL_CLK_SEL_DIV10 = 0x0040, + MAC_MII_CONTROL_CLK_SEL_DIV14 = 0x0050, + MAC_MII_CONTROL_CLK_SEL_DIV20 = 0x0060, + MAC_MII_CONTROL_CLK_SEL_DIV28 = 0x0070, + MAC_MII_CONTROL_RM = 0x8000, +}; + +/* macMIIStatusReg */ +enum { + MAC_MII_STATUS_BSY = 0x0001, + MAC_MII_STATUS_SC = 0x0002, + MAC_MII_STATUS_NV = 0x0004, +}; + +enum { + MAC_CONFIG_REG_PE = 0x0001, + MAC_CONFIG_REG_TF = 0x0002, + MAC_CONFIG_REG_RF = 0x0004, + MAC_CONFIG_REG_FD = 0x0008, + MAC_CONFIG_REG_GM = 0x0010, + MAC_CONFIG_REG_LB = 0x0020, + MAC_CONFIG_REG_SR = 0x8000, +}; + +enum { + MAC_HALF_DUPLEX_REG_ED = 0x10000, + MAC_HALF_DUPLEX_REG_NB = 0x20000, + MAC_HALF_DUPLEX_REG_BNB = 0x40000, + MAC_HALF_DUPLEX_REG_ALT = 0x80000, +}; + +enum { + IP_ADDR_INDEX_REG_MASK = 0x000f, + IP_ADDR_INDEX_REG_FUNC_0_PRI = 0x0000, + IP_ADDR_INDEX_REG_FUNC_0_SEC = 0x0001, + IP_ADDR_INDEX_REG_FUNC_1_PRI = 0x0002, + IP_ADDR_INDEX_REG_FUNC_1_SEC = 0x0003, + IP_ADDR_INDEX_REG_FUNC_2_PRI = 0x0004, + IP_ADDR_INDEX_REG_FUNC_2_SEC = 0x0005, + IP_ADDR_INDEX_REG_FUNC_3_PRI = 0x0006, + IP_ADDR_INDEX_REG_FUNC_3_SEC = 0x0007, + IP_ADDR_INDEX_REG_6 = 0x0008, + IP_ADDR_INDEX_REG_OFFSET_MASK = 0x0030, + IP_ADDR_INDEX_REG_E = 0x0040, +}; +enum { + QL3032_PORT_CONTROL_DS = 0x0001, + QL3032_PORT_CONTROL_HH = 0x0002, + QL3032_PORT_CONTROL_EIv6 = 0x0004, + QL3032_PORT_CONTROL_EIv4 = 0x0008, + QL3032_PORT_CONTROL_ET = 0x0010, + QL3032_PORT_CONTROL_EF = 0x0020, + QL3032_PORT_CONTROL_DRM = 0x0040, + QL3032_PORT_CONTROL_RLB = 0x0080, + QL3032_PORT_CONTROL_RCB = 0x0100, + QL3032_PORT_CONTROL_KIE = 0x0200, +}; + +enum { + PROBE_MUX_ADDR_REG_MUX_SEL_MASK = 0x003f, + PROBE_MUX_ADDR_REG_SYSCLK = 0x0000, + PROBE_MUX_ADDR_REG_PCICLK = 0x0040, + PROBE_MUX_ADDR_REG_NRXCLK = 0x0080, + PROBE_MUX_ADDR_REG_CPUCLK = 0x00C0, + PROBE_MUX_ADDR_REG_MODULE_SEL_MASK = 0x3f00, + PROBE_MUX_ADDR_REG_UP = 0x4000, + PROBE_MUX_ADDR_REG_RE = 0x8000, +}; + +enum { + STATISTICS_INDEX_REG_MASK = 0x01ff, + STATISTICS_INDEX_REG_MAC0_TX_FRAME = 0x0000, + STATISTICS_INDEX_REG_MAC0_TX_BYTES = 0x0001, + STATISTICS_INDEX_REG_MAC0_TX_STAT1 = 0x0002, + STATISTICS_INDEX_REG_MAC0_TX_STAT2 = 0x0003, + STATISTICS_INDEX_REG_MAC0_TX_STAT3 = 0x0004, + STATISTICS_INDEX_REG_MAC0_TX_STAT4 = 0x0005, + STATISTICS_INDEX_REG_MAC0_TX_STAT5 = 0x0006, + STATISTICS_INDEX_REG_MAC0_RX_FRAME = 0x0007, + STATISTICS_INDEX_REG_MAC0_RX_BYTES = 0x0008, + STATISTICS_INDEX_REG_MAC0_RX_STAT1 = 0x0009, + STATISTICS_INDEX_REG_MAC0_RX_STAT2 = 0x000a, + STATISTICS_INDEX_REG_MAC0_RX_STAT3 = 0x000b, + STATISTICS_INDEX_REG_MAC0_RX_ERR_CRC = 0x000c, + STATISTICS_INDEX_REG_MAC0_RX_ERR_ENC = 0x000d, + STATISTICS_INDEX_REG_MAC0_RX_ERR_LEN = 0x000e, + STATISTICS_INDEX_REG_MAC0_RX_STAT4 = 0x000f, + STATISTICS_INDEX_REG_MAC1_TX_FRAME = 0x0010, + STATISTICS_INDEX_REG_MAC1_TX_BYTES = 0x0011, + STATISTICS_INDEX_REG_MAC1_TX_STAT1 = 0x0012, + STATISTICS_INDEX_REG_MAC1_TX_STAT2 = 0x0013, + STATISTICS_INDEX_REG_MAC1_TX_STAT3 = 0x0014, + STATISTICS_INDEX_REG_MAC1_TX_STAT4 = 0x0015, + STATISTICS_INDEX_REG_MAC1_TX_STAT5 = 0x0016, + STATISTICS_INDEX_REG_MAC1_RX_FRAME = 0x0017, + STATISTICS_INDEX_REG_MAC1_RX_BYTES = 0x0018, + STATISTICS_INDEX_REG_MAC1_RX_STAT1 = 0x0019, + STATISTICS_INDEX_REG_MAC1_RX_STAT2 = 0x001a, + 
STATISTICS_INDEX_REG_MAC1_RX_STAT3 = 0x001b, + STATISTICS_INDEX_REG_MAC1_RX_ERR_CRC = 0x001c, + STATISTICS_INDEX_REG_MAC1_RX_ERR_ENC = 0x001d, + STATISTICS_INDEX_REG_MAC1_RX_ERR_LEN = 0x001e, + STATISTICS_INDEX_REG_MAC1_RX_STAT4 = 0x001f, + STATISTICS_INDEX_REG_IP_TX_PKTS = 0x0020, + STATISTICS_INDEX_REG_IP_TX_BYTES = 0x0021, + STATISTICS_INDEX_REG_IP_TX_FRAG = 0x0022, + STATISTICS_INDEX_REG_IP_RX_PKTS = 0x0023, + STATISTICS_INDEX_REG_IP_RX_BYTES = 0x0024, + STATISTICS_INDEX_REG_IP_RX_FRAG = 0x0025, + STATISTICS_INDEX_REG_IP_DGRM_REASSEMBLY = 0x0026, + STATISTICS_INDEX_REG_IP_V6_RX_PKTS = 0x0027, + STATISTICS_INDEX_REG_IP_RX_PKTERR = 0x0028, + STATISTICS_INDEX_REG_IP_REASSEMBLY_ERR = 0x0029, + STATISTICS_INDEX_REG_TCP_TX_SEG = 0x0030, + STATISTICS_INDEX_REG_TCP_TX_BYTES = 0x0031, + STATISTICS_INDEX_REG_TCP_RX_SEG = 0x0032, + STATISTICS_INDEX_REG_TCP_RX_BYTES = 0x0033, + STATISTICS_INDEX_REG_TCP_TIMER_EXP = 0x0034, + STATISTICS_INDEX_REG_TCP_RX_ACK = 0x0035, + STATISTICS_INDEX_REG_TCP_TX_ACK = 0x0036, + STATISTICS_INDEX_REG_TCP_RX_ERR = 0x0037, + STATISTICS_INDEX_REG_TCP_RX_WIN_PROBE = 0x0038, + STATISTICS_INDEX_REG_TCP_ECC_ERR_CORR = 0x003f, +}; + +enum { + PORT_FATAL_ERROR_STATUS_OFB_RE_MAC0 = 0x00000001, + PORT_FATAL_ERROR_STATUS_OFB_RE_MAC1 = 0x00000002, + PORT_FATAL_ERROR_STATUS_OFB_WE = 0x00000004, + PORT_FATAL_ERROR_STATUS_IFB_RE = 0x00000008, + PORT_FATAL_ERROR_STATUS_IFB_WE_MAC0 = 0x00000010, + PORT_FATAL_ERROR_STATUS_IFB_WE_MAC1 = 0x00000020, + PORT_FATAL_ERROR_STATUS_ODE_RE = 0x00000040, + PORT_FATAL_ERROR_STATUS_ODE_WE = 0x00000080, + PORT_FATAL_ERROR_STATUS_IDE_RE = 0x00000100, + PORT_FATAL_ERROR_STATUS_IDE_WE = 0x00000200, + PORT_FATAL_ERROR_STATUS_SDE_RE = 0x00000400, + PORT_FATAL_ERROR_STATUS_SDE_WE = 0x00000800, + PORT_FATAL_ERROR_STATUS_BLE = 0x00001000, + PORT_FATAL_ERROR_STATUS_SPE = 0x00002000, + PORT_FATAL_ERROR_STATUS_EP0 = 0x00004000, + PORT_FATAL_ERROR_STATUS_EP1 = 0x00008000, + PORT_FATAL_ERROR_STATUS_ICE = 0x00010000, + PORT_FATAL_ERROR_STATUS_ILE = 0x00020000, + PORT_FATAL_ERROR_STATUS_OPE = 0x00040000, + PORT_FATAL_ERROR_STATUS_TA = 0x00080000, + PORT_FATAL_ERROR_STATUS_MA = 0x00100000, + PORT_FATAL_ERROR_STATUS_SCE = 0x00200000, + PORT_FATAL_ERROR_STATUS_RPE = 0x00400000, + PORT_FATAL_ERROR_STATUS_MPE = 0x00800000, + PORT_FATAL_ERROR_STATUS_OCE = 0x01000000, +}; + +/* + * port control and status page - page 0 + */ + +struct ql3xxx_port_registers { + struct ql3xxx_common_registers CommonRegs; + + u32 ExternalHWConfig; + u32 InternalChipConfig; + u32 portControl; + u32 portStatus; + u32 macAddrIndirectPtrReg; + u32 macAddrDataReg; + u32 macMIIMgmtControlReg; + u32 macMIIMgmtAddrReg; + u32 macMIIMgmtDataReg; + u32 macMIIStatusReg; + u32 mac0ConfigReg; + u32 mac0IpgIfgReg; + u32 mac0HalfDuplexReg; + u32 mac0MaxFrameLengthReg; + u32 mac0PauseThresholdReg; + u32 mac1ConfigReg; + u32 mac1IpgIfgReg; + u32 mac1HalfDuplexReg; + u32 mac1MaxFrameLengthReg; + u32 mac1PauseThresholdReg; + u32 ipAddrIndexReg; + u32 ipAddrDataReg; + u32 ipReassemblyTimeout; + u32 tcpMaxWindow; + u32 currentTcpTimestamp[2]; + u32 internalRamRWAddrReg; + u32 internalRamWDataReg; + u32 reclaimedBufferAddrRegLow; + u32 reclaimedBufferAddrRegHigh; + u32 tcpConfiguration; + u32 functionControl; + u32 fpgaRevID; + u32 localRamAddr; + u32 localRamDataAutoIncr; + u32 localRamDataNonIncr; + u32 gpOutput; + u32 gpInput; + u32 probeMuxAddr; + u32 probeMuxData; + u32 statisticsIndexReg; + u32 statisticsReadDataRegAutoIncr; + u32 statisticsReadDataRegNoIncr; + u32 PortFatalErrStatus; +}; + +/* + * port host 
memory config page - page 1 + */ +struct ql3xxx_host_memory_registers { + struct ql3xxx_common_registers CommonRegs; + + u32 reserved[12]; + + /* Network Request Queue */ + u32 reqConsumerIndex; + u32 reqConsumerIndexAddrLow; + u32 reqConsumerIndexAddrHigh; + u32 reqBaseAddrLow; + u32 reqBaseAddrHigh; + u32 reqLength; + + /* Network Completion Queue */ + u32 rspProducerIndex; + u32 rspProducerIndexAddrLow; + u32 rspProducerIndexAddrHigh; + u32 rspBaseAddrLow; + u32 rspBaseAddrHigh; + u32 rspLength; + + /* RX Large Buffer Queue */ + u32 rxLargeQConsumerIndex; + u32 rxLargeQBaseAddrLow; + u32 rxLargeQBaseAddrHigh; + u32 rxLargeQLength; + u32 rxLargeBufferLength; + + /* RX Small Buffer Queue */ + u32 rxSmallQConsumerIndex; + u32 rxSmallQBaseAddrLow; + u32 rxSmallQBaseAddrHigh; + u32 rxSmallQLength; + u32 rxSmallBufferLength; + +}; + +/* + * port local RAM page - page 2 + */ +struct ql3xxx_local_ram_registers { + struct ql3xxx_common_registers CommonRegs; + u32 bufletSize; + u32 maxBufletCount; + u32 currentBufletCount; + u32 reserved; + u32 freeBufletThresholdLow; + u32 freeBufletThresholdHigh; + u32 ipHashTableBase; + u32 ipHashTableCount; + u32 tcpHashTableBase; + u32 tcpHashTableCount; + u32 ncbBase; + u32 maxNcbCount; + u32 currentNcbCount; + u32 drbBase; + u32 maxDrbCount; + u32 currentDrbCount; +}; + +/* + * definitions for Semaphore bits in Semaphore/Serial NVRAM interface register + */ + +#define LS_64BITS(x) (u32)(0xffffffff & ((u64)x)) +#define MS_64BITS(x) (u32)(0xffffffff & (((u64)x)>>16>>16) ) + +/* + * I/O register + */ + +enum { + CONTROL_REG = 0, + STATUS_REG = 1, + PHY_STAT_LINK_UP = 0x0004, + PHY_CTRL_LOOPBACK = 0x4000, + + PETBI_CONTROL_REG = 0x00, + PETBI_CTRL_ALL_PARAMS = 0x7140, + PETBI_CTRL_SOFT_RESET = 0x8000, + PETBI_CTRL_AUTO_NEG = 0x1000, + PETBI_CTRL_RESTART_NEG = 0x0200, + PETBI_CTRL_FULL_DUPLEX = 0x0100, + PETBI_CTRL_SPEED_1000 = 0x0040, + + PETBI_STATUS_REG = 0x01, + PETBI_STAT_NEG_DONE = 0x0020, + PETBI_STAT_LINK_UP = 0x0004, + + PETBI_NEG_ADVER = 0x04, + PETBI_NEG_PAUSE = 0x0080, + PETBI_NEG_PAUSE_MASK = 0x0180, + PETBI_NEG_DUPLEX = 0x0020, + PETBI_NEG_DUPLEX_MASK = 0x0060, + + PETBI_NEG_PARTNER = 0x05, + PETBI_NEG_ERROR_MASK = 0x3000, + + PETBI_EXPANSION_REG = 0x06, + PETBI_EXP_PAGE_RX = 0x0002, + + PHY_GIG_CONTROL = 9, + PHY_GIG_ENABLE_MAN = 0x1000, /* Enable Master/Slave Manual Config*/ + PHY_GIG_SET_MASTER = 0x0800, /* Set Master (slave if clear)*/ + PHY_GIG_ALL_PARAMS = 0x0300, + PHY_GIG_ADV_1000F = 0x0200, + PHY_GIG_ADV_1000H = 0x0100, + + PHY_NEG_ADVER = 4, + PHY_NEG_ALL_PARAMS = 0x0fe0, + PHY_NEG_ASY_PAUSE = 0x0800, + PHY_NEG_SYM_PAUSE = 0x0400, + PHY_NEG_ADV_SPEED = 0x01e0, + PHY_NEG_ADV_100F = 0x0100, + PHY_NEG_ADV_100H = 0x0080, + PHY_NEG_ADV_10F = 0x0040, + PHY_NEG_ADV_10H = 0x0020, + + PETBI_TBI_CTRL = 0x11, + PETBI_TBI_RESET = 0x8000, + PETBI_TBI_AUTO_SENSE = 0x0100, + PETBI_TBI_SERDES_MODE = 0x0010, + PETBI_TBI_SERDES_WRAP = 0x0002, + + AUX_CONTROL_STATUS = 0x1c, + PHY_AUX_NEG_DONE = 0x8000, + PHY_NEG_PARTNER = 5, + PHY_AUX_DUPLEX_STAT = 0x0020, + PHY_AUX_SPEED_STAT = 0x0018, + PHY_AUX_NO_HW_STRAP = 0x0004, + PHY_AUX_RESET_STICK = 0x0002, + PHY_NEG_PAUSE = 0x0400, + PHY_CTRL_SOFT_RESET = 0x8000, + PHY_CTRL_AUTO_NEG = 0x1000, + PHY_CTRL_RESTART_NEG = 0x0200, +}; +enum { +/* AM29LV Flash definitions */ + FM93C56A_START = 0x1, +/* Commands */ + FM93C56A_READ = 0x2, + FM93C56A_WEN = 0x0, + FM93C56A_WRITE = 0x1, + FM93C56A_WRITE_ALL = 0x0, + FM93C56A_WDS = 0x0, + FM93C56A_ERASE = 0x3, + FM93C56A_ERASE_ALL = 0x0, +/* Command Extensions */ + 
FM93C56A_WEN_EXT = 0x3, + FM93C56A_WRITE_ALL_EXT = 0x1, + FM93C56A_WDS_EXT = 0x0, + FM93C56A_ERASE_ALL_EXT = 0x2, +/* Special Bits */ + FM93C56A_READ_DUMMY_BITS = 1, + FM93C56A_READY = 0, + FM93C56A_BUSY = 1, + FM93C56A_CMD_BITS = 2, +/* AM29LV Flash definitions */ + FM93C56A_SIZE_8 = 0x100, + FM93C56A_SIZE_16 = 0x80, + FM93C66A_SIZE_8 = 0x200, + FM93C66A_SIZE_16 = 0x100, + FM93C86A_SIZE_16 = 0x400, +/* Address Bits */ + FM93C56A_NO_ADDR_BITS_16 = 8, + FM93C56A_NO_ADDR_BITS_8 = 9, + FM93C86A_NO_ADDR_BITS_16 = 10, +/* Data Bits */ + FM93C56A_DATA_BITS_16 = 16, + FM93C56A_DATA_BITS_8 = 8, +}; +enum { +/* Auburn Bits */ + AUBURN_EEPROM_DI = 0x8, + AUBURN_EEPROM_DI_0 = 0x0, + AUBURN_EEPROM_DI_1 = 0x8, + AUBURN_EEPROM_DO = 0x4, + AUBURN_EEPROM_DO_0 = 0x0, + AUBURN_EEPROM_DO_1 = 0x4, + AUBURN_EEPROM_CS = 0x2, + AUBURN_EEPROM_CS_0 = 0x0, + AUBURN_EEPROM_CS_1 = 0x2, + AUBURN_EEPROM_CLK_RISE = 0x1, + AUBURN_EEPROM_CLK_FALL = 0x0, +}; +enum {EEPROM_SIZE = FM93C86A_SIZE_16, + EEPROM_NO_ADDR_BITS = FM93C86A_NO_ADDR_BITS_16, + EEPROM_NO_DATA_BITS = FM93C56A_DATA_BITS_16, +}; + +/* + * MAC Config data structure + */ + struct eeprom_port_cfg { + u16 etherMtu_mac; + u16 pauseThreshold_mac; + u16 resumeThreshold_mac; + u16 portConfiguration; +#define PORT_CONFIG_DEFAULT 0xf700 +#define PORT_CONFIG_AUTO_NEG_ENABLED 0x8000 +#define PORT_CONFIG_SYM_PAUSE_ENABLED 0x4000 +#define PORT_CONFIG_FULL_DUPLEX_ENABLED 0x2000 +#define PORT_CONFIG_HALF_DUPLEX_ENABLED 0x1000 +#define PORT_CONFIG_1000MB_SPEED 0x0400 +#define PORT_CONFIG_100MB_SPEED 0x0200 +#define PORT_CONFIG_10MB_SPEED 0x0100 +#define PORT_CONFIG_LINK_SPEED_MASK 0x0F00 + u16 reserved[12]; + +}; + +/* + * BIOS data structure + */ +struct eeprom_bios_cfg { + u16 SpinDlyEn:1, disBios:1, EnMemMap:1, EnSelectBoot:1, Reserved:12; + + u8 bootID0:7, boodID0Valid:1; + u8 bootLun0[8]; + + u8 bootID1:7, boodID1Valid:1; + u8 bootLun1[8]; + + u16 MaxLunsTrgt; + u8 reserved[10]; +}; + +/* + * Function Specific Data structure + */ +struct eeprom_function_cfg { + u8 reserved[30]; + u16 macAddress[3]; + u16 macAddressSecondary[3]; + + u16 subsysVendorId; + u16 subsysDeviceId; +}; + +/* + * EEPROM format + */ +struct eeprom_data { + u8 asicId[4]; + u16 version_and_numPorts; /* together to avoid endianness crap */ + u16 boardId; + +#define EEPROM_BOARDID_STR_SIZE 16 +#define EEPROM_SERIAL_NUM_SIZE 16 + + u8 boardIdStr[16]; + u8 serialNumber[16]; + u16 extHwConfig; + struct eeprom_port_cfg macCfg_port0; + struct eeprom_port_cfg macCfg_port1; + u16 bufletSize; + u16 bufletCount; + u16 tcpWindowThreshold50; + u16 tcpWindowThreshold25; + u16 tcpWindowThreshold0; + u16 ipHashTableBaseHi; + u16 ipHashTableBaseLo; + u16 ipHashTableSize; + u16 tcpHashTableBaseHi; + u16 tcpHashTableBaseLo; + u16 tcpHashTableSize; + u16 ncbTableBaseHi; + u16 ncbTableBaseLo; + u16 ncbTableSize; + u16 drbTableBaseHi; + u16 drbTableBaseLo; + u16 drbTableSize; + u16 reserved_142[4]; + u16 ipReassemblyTimeout; + u16 tcpMaxWindowSize; + u16 ipSecurity; +#define IPSEC_CONFIG_PRESENT 0x0001 + u8 reserved_156[294]; + u16 qDebug[8]; + struct eeprom_function_cfg funcCfg_fn0; + u16 reserved_510; + u8 oemSpace[432]; + struct eeprom_bios_cfg biosCfg_fn1; + struct eeprom_function_cfg funcCfg_fn1; + u16 reserved_1022; + u8 reserved_1024[464]; + struct eeprom_function_cfg funcCfg_fn2; + u16 reserved_1534; + u8 reserved_1536[432]; + struct eeprom_bios_cfg biosCfg_fn3; + struct eeprom_function_cfg funcCfg_fn3; + u16 checksum; +}; + +/* + * General definitions... 
+ */ + +/* + * Below are a number compiler switches for controlling driver behavior. + * Some are not supported under certain conditions and are notated as such. + */ + +#define QL3XXX_VENDOR_ID 0x1077 +#define QL3022_DEVICE_ID 0x3022 +#define QL3032_DEVICE_ID 0x3032 + +/* MTU & Frame Size stuff */ +#define NORMAL_MTU_SIZE ETH_DATA_LEN +#define JUMBO_MTU_SIZE 9000 +#define VLAN_ID_LEN 2 + +/* Request Queue Related Definitions */ +#define NUM_REQ_Q_ENTRIES 256 /* so that 64 * 64 = 4096 (1 page) */ + +/* Response Queue Related Definitions */ +#define NUM_RSP_Q_ENTRIES 256 /* so that 256 * 16 = 4096 (1 page) */ + +/* Transmit and Receive Buffers */ +#define NUM_LBUFQ_ENTRIES 128 +#define JUMBO_NUM_LBUFQ_ENTRIES 32 +#define NUM_SBUFQ_ENTRIES 64 +#define QL_SMALL_BUFFER_SIZE 32 +#define QL_ADDR_ELE_PER_BUFQ_ENTRY \ +(sizeof(struct lrg_buf_q_entry) / sizeof(struct bufq_addr_element)) + /* Each send has at least control block. This is how many we keep. */ +#define NUM_SMALL_BUFFERS NUM_SBUFQ_ENTRIES * QL_ADDR_ELE_PER_BUFQ_ENTRY + +#define QL_HEADER_SPACE 32 /* make header space at top of skb. */ +/* + * Large & Small Buffers for Receives + */ +struct lrg_buf_q_entry { + + __le32 addr0_lower; +#define IAL_LAST_ENTRY 0x00000001 +#define IAL_CONT_ENTRY 0x00000002 +#define IAL_FLAG_MASK 0x00000003 + __le32 addr0_upper; + __le32 addr1_lower; + __le32 addr1_upper; + __le32 addr2_lower; + __le32 addr2_upper; + __le32 addr3_lower; + __le32 addr3_upper; + __le32 addr4_lower; + __le32 addr4_upper; + __le32 addr5_lower; + __le32 addr5_upper; + __le32 addr6_lower; + __le32 addr6_upper; + __le32 addr7_lower; + __le32 addr7_upper; + +}; + +struct bufq_addr_element { + __le32 addr_low; + __le32 addr_high; +}; + +#define QL_NO_RESET 0 +#define QL_DO_RESET 1 + +enum link_state_t { + LS_UNKNOWN = 0, + LS_DOWN, + LS_DEGRADE, + LS_RECOVER, + LS_UP, +}; + +struct ql_rcv_buf_cb { + struct ql_rcv_buf_cb *next; + struct sk_buff *skb; + DEFINE_DMA_UNMAP_ADDR(mapaddr); + DEFINE_DMA_UNMAP_LEN(maplen); + __le32 buf_phy_addr_low; + __le32 buf_phy_addr_high; + int index; +}; + +/* + * Original IOCB has 3 sg entries: + * first points to skb-data area + * second points to first frag + * third points to next oal. + * OAL has 5 entries: + * 1 thru 4 point to frags + * fifth points to next oal. + */ +#define MAX_OAL_CNT ((MAX_SKB_FRAGS-1)/4 + 1) + +struct oal_entry { + __le32 dma_lo; + __le32 dma_hi; + __le32 len; +#define OAL_LAST_ENTRY 0x80000000 /* Last valid buffer in list. */ +#define OAL_CONT_ENTRY 0x40000000 /* points to an OAL. (continuation) */ +}; + +struct oal { + struct oal_entry oal_entry[5]; +}; + +struct map_list { + DEFINE_DMA_UNMAP_ADDR(mapaddr); + DEFINE_DMA_UNMAP_LEN(maplen); +}; + +struct ql_tx_buf_cb { + struct sk_buff *skb; + struct ob_mac_iocb_req *queue_entry ; + int seg_count; + struct oal *oal; + struct map_list map[MAX_SKB_FRAGS+1]; +}; + +/* definitions for type field */ +#define QL_BUF_TYPE_MACIOCB 0x01 +#define QL_BUF_TYPE_IPIOCB 0x02 +#define QL_BUF_TYPE_TCPIOCB 0x03 + +/* qdev->flags definitions. */ +enum { QL_RESET_DONE = 1, /* Reset finished. */ + QL_RESET_ACTIVE = 2, /* Waiting for reset to finish. */ + QL_RESET_START = 3, /* Please reset the chip. */ + QL_RESET_PER_SCSI = 4, /* SCSI driver requests reset. */ + QL_TX_TIMEOUT = 5, /* Timeout in progress. */ + QL_LINK_MASTER = 6, /* This driver controls the link. */ + QL_ADAPTER_UP = 7, /* Adapter has been brought up. */ + QL_THREAD_UP = 8, /* This flag is available. */ + QL_LINK_UP = 9, /* Link Status. 
*/ + QL_ALLOC_REQ_RSP_Q_DONE = 10, + QL_ALLOC_BUFQS_DONE = 11, + QL_ALLOC_SMALL_BUF_DONE = 12, + QL_LINK_OPTICAL = 13, + QL_MSI_ENABLED = 14, +}; + +/* + * ql3_adapter - The main Adapter structure definition. + * This structure has all fields relevant to the hardware. + */ + +struct ql3_adapter { + u32 reserved_00; + unsigned long flags; + + /* PCI Configuration information for this device */ + struct pci_dev *pdev; + struct net_device *ndev; /* Parent NET device */ + + struct napi_struct napi; + + /* Hardware information */ + u8 chip_rev_id; + u8 pci_slot; + u8 pci_width; + u8 pci_x; + u32 msi; + int index; + struct timer_list adapter_timer; /* timer used for various functions */ + + spinlock_t adapter_lock; + spinlock_t hw_lock; + + /* PCI Bus Relative Register Addresses */ + u8 __iomem *mmap_virt_base; /* stores return value from ioremap() */ + struct ql3xxx_port_registers __iomem *mem_map_registers; + u32 current_page; /* tracks current register page */ + + u32 msg_enable; + u8 reserved_01[2]; + u8 reserved_02[2]; + + /* Page for Shadow Registers */ + void *shadow_reg_virt_addr; + dma_addr_t shadow_reg_phy_addr; + + /* Net Request Queue */ + u32 req_q_size; + u32 reserved_03; + struct ob_mac_iocb_req *req_q_virt_addr; + dma_addr_t req_q_phy_addr; + u16 req_producer_index; + u16 reserved_04; + u16 *preq_consumer_index; + u32 req_consumer_index_phy_addr_high; + u32 req_consumer_index_phy_addr_low; + atomic_t tx_count; + struct ql_tx_buf_cb tx_buf[NUM_REQ_Q_ENTRIES]; + + /* Net Response Queue */ + u32 rsp_q_size; + u32 eeprom_cmd_data; + struct net_rsp_iocb *rsp_q_virt_addr; + dma_addr_t rsp_q_phy_addr; + struct net_rsp_iocb *rsp_current; + u16 rsp_consumer_index; + u16 reserved_06; + volatile __le32 *prsp_producer_index; + u32 rsp_producer_index_phy_addr_high; + u32 rsp_producer_index_phy_addr_low; + + /* Large Buffer Queue */ + u32 lrg_buf_q_alloc_size; + u32 lrg_buf_q_size; + void *lrg_buf_q_alloc_virt_addr; + void *lrg_buf_q_virt_addr; + dma_addr_t lrg_buf_q_alloc_phy_addr; + dma_addr_t lrg_buf_q_phy_addr; + u32 lrg_buf_q_producer_index; + u32 lrg_buf_release_cnt; + struct bufq_addr_element *lrg_buf_next_free; + u32 num_large_buffers; + u32 num_lbufq_entries; + + /* Large (Receive) Buffers */ + struct ql_rcv_buf_cb *lrg_buf; + struct ql_rcv_buf_cb *lrg_buf_free_head; + struct ql_rcv_buf_cb *lrg_buf_free_tail; + u32 lrg_buf_free_count; + u32 lrg_buffer_len; + u32 lrg_buf_index; + u32 lrg_buf_skb_check; + + /* Small Buffer Queue */ + u32 small_buf_q_alloc_size; + u32 small_buf_q_size; + u32 small_buf_q_producer_index; + void *small_buf_q_alloc_virt_addr; + void *small_buf_q_virt_addr; + dma_addr_t small_buf_q_alloc_phy_addr; + dma_addr_t small_buf_q_phy_addr; + u32 small_buf_index; + + /* Small (Receive) Buffers */ + void *small_buf_virt_addr; + dma_addr_t small_buf_phy_addr; + u32 small_buf_phy_addr_low; + u32 small_buf_phy_addr_high; + u32 small_buf_release_cnt; + u32 small_buf_total_size; + + struct eeprom_data nvram_data; + u32 port_link_state; + + /* 4022 specific */ + u32 mac_index; /* Driver's MAC number can be 0 or 1 for first and second networking functions respectively */ + u32 PHYAddr; /* Address of PHY 0x1e00 Port 0 and 0x1f00 Port 1 */ + u32 mac_ob_opcode; /* Opcode to use on mac transmission */ + u32 mb_bit_mask; /* MA Bits mask to use on transmission */ + u32 numPorts; + struct workqueue_struct *workqueue; + struct delayed_work reset_work; + struct delayed_work tx_timeout_work; + struct delayed_work link_state_work; + u32 max_frame_size; + u32 device_id; + u16 phyType; 
+}; + +#endif /* _QLA3XXX_H_ */ diff --git a/drivers/net/ethernet/qlogic/qlcnic/Makefile b/drivers/net/ethernet/qlogic/qlcnic/Makefile new file mode 100644 index 0000000..dbaeab3 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Qlogic 1G/10G Ethernet Driver for CNA devices +# + +obj-$(CONFIG_QLCNIC) := qlcnic.o + +qlcnic-y := qlcnic_hw.o qlcnic_main.o qlcnic_init.o \ + qlcnic_ethtool.o qlcnic_ctx.o qlcnic_io.o \ + qlcnic_sysfs.o qlcnic_minidump.o qlcnic_83xx_hw.o \ + qlcnic_83xx_init.o qlcnic_83xx_vnic.o \ + qlcnic_sriov_common.o + +qlcnic-$(CONFIG_QLCNIC_SRIOV) += qlcnic_sriov_pf.o + +qlcnic-$(CONFIG_QLCNIC_DCB) += qlcnic_dcb.o diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h new file mode 100644 index 0000000..8131292 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h @@ -0,0 +1,2426 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#ifndef _QLCNIC_H_ +#define _QLCNIC_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qlcnic_hdr.h" +#include "qlcnic_hw.h" +#include "qlcnic_83xx_hw.h" +#include "qlcnic_dcb.h" + +#define _QLCNIC_LINUX_MAJOR 5 +#define _QLCNIC_LINUX_MINOR 3 +#define _QLCNIC_LINUX_SUBVERSION 66 +#define QLCNIC_LINUX_VERSIONID "5.3.66" +#define QLCNIC_DRV_IDC_VER 0x01 +#define QLCNIC_DRIVER_VERSION ((_QLCNIC_LINUX_MAJOR << 16) |\ + (_QLCNIC_LINUX_MINOR << 8) | (_QLCNIC_LINUX_SUBVERSION)) + +#define QLCNIC_VERSION_CODE(a, b, c) (((a) << 24) + ((b) << 16) + (c)) +#define _major(v) (((v) >> 24) & 0xff) +#define _minor(v) (((v) >> 16) & 0xff) +#define _build(v) ((v) & 0xffff) + +/* version in image has weird encoding: + * 7:0 - major + * 15:8 - minor + * 31:16 - build (little endian) + */ +#define QLCNIC_DECODE_VERSION(v) \ + QLCNIC_VERSION_CODE(((v) & 0xff), (((v) >> 8) & 0xff), ((v) >> 16)) + +#define QLCNIC_MIN_FW_VERSION QLCNIC_VERSION_CODE(4, 4, 2) +#define QLCNIC_NUM_FLASH_SECTORS (64) +#define QLCNIC_FLASH_SECTOR_SIZE (64 * 1024) +#define QLCNIC_FLASH_TOTAL_SIZE (QLCNIC_NUM_FLASH_SECTORS \ + * QLCNIC_FLASH_SECTOR_SIZE) + +#define RCV_DESC_RINGSIZE(rds_ring) \ + (sizeof(struct rcv_desc) * (rds_ring)->num_desc) +#define RCV_BUFF_RINGSIZE(rds_ring) \ + (sizeof(struct qlcnic_rx_buffer) * rds_ring->num_desc) +#define STATUS_DESC_RINGSIZE(sds_ring) \ + (sizeof(struct status_desc) * (sds_ring)->num_desc) +#define TX_BUFF_RINGSIZE(tx_ring) \ + (sizeof(struct qlcnic_cmd_buffer) * tx_ring->num_desc) +#define TX_DESC_RINGSIZE(tx_ring) \ + (sizeof(struct cmd_desc_type0) * tx_ring->num_desc) + +#define QLCNIC_P3P_A0 0x50 +#define QLCNIC_P3P_C0 0x58 + +#define QLCNIC_IS_REVISION_P3P(REVISION) (REVISION >= QLCNIC_P3P_A0) + +#define FIRST_PAGE_GROUP_START 0 +#define FIRST_PAGE_GROUP_END 0x100000 + +#define P3P_MAX_MTU (9600) +#define P3P_MIN_MTU (68) +#define QLCNIC_MAX_ETHERHDR 32 /* This contains some padding */ + +#define QLCNIC_P3P_RX_BUF_MAX_LEN (QLCNIC_MAX_ETHERHDR + ETH_DATA_LEN) +#define QLCNIC_P3P_RX_JUMBO_BUF_MAX_LEN (QLCNIC_MAX_ETHERHDR + P3P_MAX_MTU) +#define QLCNIC_CT_DEFAULT_RX_BUF_LEN 2048 +#define QLCNIC_LRO_BUFFER_EXTRA 2048 + +/* Tx defines */ +#define QLCNIC_MAX_FRAGS_PER_TX 14 +#define MAX_TSO_HEADER_DESC 2 +#define MGMT_CMD_DESC_RESV 4 
+#define TX_STOP_THRESH ((MAX_SKB_FRAGS >> 2) + MAX_TSO_HEADER_DESC \ + + MGMT_CMD_DESC_RESV) +#define QLCNIC_MAX_TX_TIMEOUTS 2 + +/* Driver will use 1 Tx ring in INT-x/MSI/SRIOV mode. */ +#define QLCNIC_SINGLE_RING 1 +#define QLCNIC_DEF_SDS_RINGS 4 +#define QLCNIC_DEF_TX_RINGS 4 +#define QLCNIC_MAX_VNIC_TX_RINGS 4 +#define QLCNIC_MAX_VNIC_SDS_RINGS 4 +#define QLCNIC_83XX_MINIMUM_VECTOR 3 +#define QLCNIC_82XX_MINIMUM_VECTOR 2 + +enum qlcnic_queue_type { + QLCNIC_TX_QUEUE = 1, + QLCNIC_RX_QUEUE, +}; + +/* Operational mode for driver */ +#define QLCNIC_VNIC_MODE 0xFF +#define QLCNIC_DEFAULT_MODE 0x0 + +/* Virtual NIC function count */ +#define QLC_DEFAULT_VNIC_COUNT 8 +#define QLC_84XX_VNIC_COUNT 16 + +/* + * Following are the states of the Phantom. Phantom will set them and + * Host will read to check if the fields are correct. + */ +#define PHAN_INITIALIZE_FAILED 0xffff +#define PHAN_INITIALIZE_COMPLETE 0xff01 + +/* Host writes the following to notify that it has done the init-handshake */ +#define PHAN_INITIALIZE_ACK 0xf00f +#define PHAN_PEG_RCV_INITIALIZED 0xff01 + +#define NUM_RCV_DESC_RINGS 3 + +#define RCV_RING_NORMAL 0 +#define RCV_RING_JUMBO 1 + +#define MIN_CMD_DESCRIPTORS 64 +#define MIN_RCV_DESCRIPTORS 64 +#define MIN_JUMBO_DESCRIPTORS 32 + +#define MAX_CMD_DESCRIPTORS 1024 +#define MAX_RCV_DESCRIPTORS_1G 4096 +#define MAX_RCV_DESCRIPTORS_10G 8192 +#define MAX_RCV_DESCRIPTORS_VF 2048 +#define MAX_JUMBO_RCV_DESCRIPTORS_1G 512 +#define MAX_JUMBO_RCV_DESCRIPTORS_10G 1024 + +#define DEFAULT_RCV_DESCRIPTORS_1G 2048 +#define DEFAULT_RCV_DESCRIPTORS_10G 4096 +#define DEFAULT_RCV_DESCRIPTORS_VF 1024 +#define MAX_RDS_RINGS 2 + +#define get_next_index(index, length) \ + (((index) + 1) & ((length) - 1)) + +/* + * Following data structures describe the descriptors that will be used. + * Added fileds of tcpHdrSize and ipHdrSize, The driver needs to do it only when + * we are doing LSO (above the 1500 size packet) only. 
+ */ +struct cmd_desc_type0 { + u8 tcp_hdr_offset; /* For LSO only */ + u8 ip_hdr_offset; /* For LSO only */ + __le16 flags_opcode; /* 15:13 unused, 12:7 opcode, 6:0 flags */ + __le32 nfrags__length; /* 31:8 total len, 7:0 frag count */ + + __le64 addr_buffer2; + + __le16 encap_descr; /* 15:10 offset of outer L3 header, + * 9:6 number of 32bit words in outer L3 header, + * 5 offload outer L4 checksum, + * 4 offload outer L3 checksum, + * 3 Inner L4 type, TCP=0, UDP=1, + * 2 Inner L3 type, IPv4=0, IPv6=1, + * 1 Outer L3 type,IPv4=0, IPv6=1, + * 0 type of encapsulation, GRE=0, VXLAN=1 + */ + __le16 mss; + u8 port_ctxid; /* 7:4 ctxid 3:0 port */ + u8 hdr_length; /* LSO only : MAC+IP+TCP Hdr size */ + u8 outer_hdr_length; /* Encapsulation only */ + u8 rsvd1; + + __le64 addr_buffer3; + __le64 addr_buffer1; + + __le16 buffer_length[4]; + + __le64 addr_buffer4; + + u8 eth_addr[ETH_ALEN]; + __le16 vlan_TCI; /* In case of encapsulation, + * this is for outer VLAN + */ + +} __attribute__ ((aligned(64))); + +/* Note: sizeof(rcv_desc) should always be a mutliple of 2 */ +struct rcv_desc { + __le16 reference_handle; + __le16 reserved; + __le32 buffer_length; /* allocated buffer length (usually 2K) */ + __le64 addr_buffer; +} __packed; + +struct status_desc { + __le64 status_desc_data[2]; +} __attribute__ ((aligned(16))); + +/* UNIFIED ROMIMAGE */ +#define QLCNIC_UNI_FW_MIN_SIZE 0xc8000 +#define QLCNIC_UNI_DIR_SECT_PRODUCT_TBL 0x0 +#define QLCNIC_UNI_DIR_SECT_BOOTLD 0x6 +#define QLCNIC_UNI_DIR_SECT_FW 0x7 + +/*Offsets */ +#define QLCNIC_UNI_CHIP_REV_OFF 10 +#define QLCNIC_UNI_FLAGS_OFF 11 +#define QLCNIC_UNI_BIOS_VERSION_OFF 12 +#define QLCNIC_UNI_BOOTLD_IDX_OFF 27 +#define QLCNIC_UNI_FIRMWARE_IDX_OFF 29 + +struct uni_table_desc{ + __le32 findex; + __le32 num_entries; + __le32 entry_size; + __le32 reserved[5]; +}; + +struct uni_data_desc{ + __le32 findex; + __le32 size; + __le32 reserved[5]; +}; + +/* Flash Defines and Structures */ +#define QLCNIC_FLT_LOCATION 0x3F1000 +#define QLCNIC_FDT_LOCATION 0x3F0000 +#define QLCNIC_B0_FW_IMAGE_REGION 0x74 +#define QLCNIC_C0_FW_IMAGE_REGION 0x97 +#define QLCNIC_BOOTLD_REGION 0X72 +struct qlcnic_flt_header { + u16 version; + u16 len; + u16 checksum; + u16 reserved; +}; + +struct qlcnic_flt_entry { + u8 region; + u8 reserved0; + u8 attrib; + u8 reserved1; + u32 size; + u32 start_addr; + u32 end_addr; +}; + +/* Flash Descriptor Table */ +struct qlcnic_fdt { + u32 valid; + u16 ver; + u16 len; + u16 cksum; + u16 unused; + u8 model[16]; + u8 mfg_id; + u16 id; + u8 flag; + u8 erase_cmd; + u8 alt_erase_cmd; + u8 write_enable_cmd; + u8 write_enable_bits; + u8 write_statusreg_cmd; + u8 unprotected_sec_cmd; + u8 read_manuf_cmd; + u32 block_size; + u32 alt_block_size; + u32 flash_size; + u32 write_enable_data; + u8 readid_addr_len; + u8 write_disable_bits; + u8 read_dev_id_len; + u8 chip_erase_cmd; + u16 read_timeo; + u8 protected_sec_cmd; + u8 resvd[65]; +}; +/* Magic number to let user know flash is programmed */ +#define QLCNIC_BDINFO_MAGIC 0x12345678 + +#define QLCNIC_BRDTYPE_P3P_REF_QG 0x0021 +#define QLCNIC_BRDTYPE_P3P_HMEZ 0x0022 +#define QLCNIC_BRDTYPE_P3P_10G_CX4_LP 0x0023 +#define QLCNIC_BRDTYPE_P3P_4_GB 0x0024 +#define QLCNIC_BRDTYPE_P3P_IMEZ 0x0025 +#define QLCNIC_BRDTYPE_P3P_10G_SFP_PLUS 0x0026 +#define QLCNIC_BRDTYPE_P3P_10000_BASE_T 0x0027 +#define QLCNIC_BRDTYPE_P3P_XG_LOM 0x0028 +#define QLCNIC_BRDTYPE_P3P_4_GB_MM 0x0029 +#define QLCNIC_BRDTYPE_P3P_10G_SFP_CT 0x002a +#define QLCNIC_BRDTYPE_P3P_10G_SFP_QT 0x002b +#define QLCNIC_BRDTYPE_P3P_10G_CX4 0x0031 
+#define QLCNIC_BRDTYPE_P3P_10G_XFP 0x0032 +#define QLCNIC_BRDTYPE_P3P_10G_TP 0x0080 + +#define QLCNIC_MSIX_TABLE_OFFSET 0x44 + +/* Flash memory map */ +#define QLCNIC_BRDCFG_START 0x4000 /* board config */ +#define QLCNIC_BOOTLD_START 0x10000 /* bootld */ +#define QLCNIC_IMAGE_START 0x43000 /* compressed image */ +#define QLCNIC_USER_START 0x3E8000 /* Firmware info */ + +#define QLCNIC_FW_VERSION_OFFSET (QLCNIC_USER_START+0x408) +#define QLCNIC_FW_SIZE_OFFSET (QLCNIC_USER_START+0x40c) +#define QLCNIC_FW_SERIAL_NUM_OFFSET (QLCNIC_USER_START+0x81c) +#define QLCNIC_BIOS_VERSION_OFFSET (QLCNIC_USER_START+0x83c) + +#define QLCNIC_BRDTYPE_OFFSET (QLCNIC_BRDCFG_START+0x8) +#define QLCNIC_FW_MAGIC_OFFSET (QLCNIC_BRDCFG_START+0x128) + +#define QLCNIC_FW_MIN_SIZE (0x3fffff) +#define QLCNIC_UNIFIED_ROMIMAGE 0 +#define QLCNIC_FLASH_ROMIMAGE 1 +#define QLCNIC_UNKNOWN_ROMIMAGE 0xff + +#define QLCNIC_UNIFIED_ROMIMAGE_NAME "phanfw.bin" +#define QLCNIC_FLASH_ROMIMAGE_NAME "flash" + +extern char qlcnic_driver_name[]; + +extern int qlcnic_use_msi; +extern int qlcnic_use_msi_x; +extern int qlcnic_auto_fw_reset; +extern int qlcnic_load_fw_file; + +/* Number of status descriptors to handle per interrupt */ +#define MAX_STATUS_HANDLE (64) + +/* + * qlcnic_skb_frag{} is to contain mapping info for each SG list. This + * has to be freed when DMA is complete. This is part of qlcnic_tx_buffer{}. + */ +struct qlcnic_skb_frag { + u64 dma; + u64 length; +}; + +/* Following defines are for the state of the buffers */ +#define QLCNIC_BUFFER_FREE 0 +#define QLCNIC_BUFFER_BUSY 1 + +/* + * There will be one qlcnic_buffer per skb packet. These will be + * used to save the dma info for pci_unmap_page() + */ +struct qlcnic_cmd_buffer { + struct sk_buff *skb; + struct qlcnic_skb_frag frag_array[MAX_SKB_FRAGS + 1]; + u32 frag_count; +}; + +/* In rx_buffer, we do not need multiple fragments as is a single buffer */ +struct qlcnic_rx_buffer { + u16 ref_handle; + struct sk_buff *skb; + struct list_head list; + u64 dma; +}; + +/* Board types */ +#define QLCNIC_GBE 0x01 +#define QLCNIC_XGBE 0x02 + +/* + * Interrupt coalescing defaults. The defaults are for 1500 MTU. It is + * adjusted based on configured MTU. 
+ */ +#define QLCNIC_INTR_COAL_TYPE_RX 1 +#define QLCNIC_INTR_COAL_TYPE_TX 2 +#define QLCNIC_INTR_COAL_TYPE_RX_TX 3 + +#define QLCNIC_DEF_INTR_COALESCE_RX_TIME_US 3 +#define QLCNIC_DEF_INTR_COALESCE_RX_PACKETS 256 + +#define QLCNIC_DEF_INTR_COALESCE_TX_TIME_US 64 +#define QLCNIC_DEF_INTR_COALESCE_TX_PACKETS 64 + +#define QLCNIC_INTR_DEFAULT 0x04 +#define QLCNIC_CONFIG_INTR_COALESCE 3 +#define QLCNIC_DEV_INFO_SIZE 2 + +struct qlcnic_nic_intr_coalesce { + u8 type; + u8 sts_ring_mask; + u16 rx_packets; + u16 rx_time_us; + u16 tx_packets; + u16 tx_time_us; + u16 flag; + u32 timer_out; +}; + +struct qlcnic_83xx_dump_template_hdr { + u32 type; + u32 offset; + u32 size; + u32 cap_mask; + u32 num_entries; + u32 version; + u32 timestamp; + u32 checksum; + u32 drv_cap_mask; + u32 sys_info[3]; + u32 saved_state[16]; + u32 cap_sizes[8]; + u32 ocm_wnd_reg[16]; + u32 rsvd[0]; +}; + +struct qlcnic_82xx_dump_template_hdr { + u32 type; + u32 offset; + u32 size; + u32 cap_mask; + u32 num_entries; + u32 version; + u32 timestamp; + u32 checksum; + u32 drv_cap_mask; + u32 sys_info[3]; + u32 saved_state[16]; + u32 cap_sizes[8]; + u32 rsvd[7]; + u32 capabilities; + u32 rsvd1[0]; +}; + +#define QLC_PEX_DMA_READ_SIZE (PAGE_SIZE * 16) + +struct qlcnic_fw_dump { + u8 clr; /* flag to indicate if dump is cleared */ + bool enable; /* enable/disable dump */ + u32 size; /* total size of the dump */ + u32 cap_mask; /* Current capture mask */ + void *data; /* dump data area */ + void *tmpl_hdr; + dma_addr_t phys_addr; + void *dma_buffer; + bool use_pex_dma; + /* Read only elements which are common between 82xx and 83xx + * template header. Update these values immediately after we read + * template header from Firmware + */ + u32 tmpl_hdr_size; + u32 version; + u32 num_entries; + u32 offset; +}; + +/* + * One hardware_context{} per adapter + * contains interrupt info as well shared hardware info. 
+ */ +struct qlcnic_hardware_context { + void __iomem *pci_base0; + void __iomem *ocm_win_crb; + + unsigned long pci_len0; + + rwlock_t crb_lock; + struct mutex mem_lock; + + u8 revision_id; + u8 pci_func; + u8 linkup; + u8 loopback_state; + u8 beacon_state; + u8 has_link_events; + u8 fw_type; + u8 physical_port; + u8 reset_context; + u8 msix_supported; + u8 max_mac_filters; + u8 mc_enabled; + u8 max_mc_count; + u8 diag_test; + u8 num_msix; + u8 nic_mode; + int diag_cnt; + + u16 max_uc_count; + u16 port_type; + u16 board_type; + u16 supported_type; + + u16 link_speed; + u16 link_duplex; + u16 link_autoneg; + u16 module_type; + + u16 op_mode; + u16 switch_mode; + u16 max_tx_ques; + u16 max_rx_ques; + u16 max_mtu; + u32 msg_enable; + u16 total_nic_func; + u16 max_pci_func; + u32 max_vnic_func; + u32 total_pci_func; + + u32 capabilities; + u32 extra_capability[3]; + u32 temp; + u32 int_vec_bit; + u32 fw_hal_version; + u32 port_config; + struct qlcnic_hardware_ops *hw_ops; + struct qlcnic_nic_intr_coalesce coal; + struct qlcnic_fw_dump fw_dump; + struct qlcnic_fdt fdt; + struct qlc_83xx_reset reset; + struct qlc_83xx_idc idc; + struct qlc_83xx_fw_info *fw_info; + struct qlcnic_intrpt_config *intr_tbl; + struct qlcnic_sriov *sriov; + u32 *reg_tbl; + u32 *ext_reg_tbl; + u32 mbox_aen[QLC_83XX_MBX_AEN_CNT]; + u32 mbox_reg[4]; + struct qlcnic_mailbox *mailbox; + u8 extend_lb_time; + u8 phys_port_id[ETH_ALEN]; + u8 lb_mode; + u8 vxlan_port_count; + u16 vxlan_port; + struct device *hwmon_dev; + u32 post_mode; + bool run_post; +}; + +struct qlcnic_adapter_stats { + u64 xmitcalled; + u64 xmitfinished; + u64 rxdropped; + u64 txdropped; + u64 csummed; + u64 rx_pkts; + u64 lro_pkts; + u64 rxbytes; + u64 txbytes; + u64 lrobytes; + u64 lso_frames; + u64 encap_lso_frames; + u64 encap_tx_csummed; + u64 encap_rx_csummed; + u64 xmit_on; + u64 xmit_off; + u64 skb_alloc_failure; + u64 null_rxbuf; + u64 rx_dma_map_error; + u64 tx_dma_map_error; + u64 spurious_intr; + u64 mac_filter_limit_overrun; + u64 mbx_spurious_intr; +}; + +/* + * Rcv Descriptor Context. One such per Rcv Descriptor. There may + * be one Rcv Descriptor for normal packets, one for jumbo and may be others. 
+ */ +struct qlcnic_host_rds_ring { + void __iomem *crb_rcv_producer; + struct rcv_desc *desc_head; + struct qlcnic_rx_buffer *rx_buf_arr; + u32 num_desc; + u32 producer; + u32 dma_size; + u32 skb_size; + u32 flags; + struct list_head free_list; + spinlock_t lock; + dma_addr_t phys_addr; +} ____cacheline_internodealigned_in_smp; + +struct qlcnic_host_sds_ring { + u32 consumer; + u32 num_desc; + void __iomem *crb_sts_consumer; + + struct qlcnic_host_tx_ring *tx_ring; + struct status_desc *desc_head; + struct qlcnic_adapter *adapter; + struct napi_struct napi; + struct list_head free_list[NUM_RCV_DESC_RINGS]; + + void __iomem *crb_intr_mask; + int irq; + + dma_addr_t phys_addr; + char name[IFNAMSIZ + 12]; +} ____cacheline_internodealigned_in_smp; + +struct qlcnic_tx_queue_stats { + u64 xmit_on; + u64 xmit_off; + u64 xmit_called; + u64 xmit_finished; + u64 tx_bytes; +}; + +struct qlcnic_host_tx_ring { + int irq; + void __iomem *crb_intr_mask; + char name[IFNAMSIZ + 12]; + u16 ctx_id; + + u32 state; + u32 producer; + u32 sw_consumer; + u32 num_desc; + + struct qlcnic_tx_queue_stats tx_stats; + + void __iomem *crb_cmd_producer; + struct cmd_desc_type0 *desc_head; + struct qlcnic_adapter *adapter; + struct napi_struct napi; + struct qlcnic_cmd_buffer *cmd_buf_arr; + __le32 *hw_consumer; + + dma_addr_t phys_addr; + dma_addr_t hw_cons_phys_addr; + struct netdev_queue *txq; + /* Lock to protect Tx descriptors cleanup */ + spinlock_t tx_clean_lock; +} ____cacheline_internodealigned_in_smp; + +/* + * Receive context. There is one such structure per instance of the + * receive processing. Any state information that is relevant to + * the receive, and is must be in this structure. The global data may be + * present elsewhere. + */ +struct qlcnic_recv_context { + struct qlcnic_host_rds_ring *rds_rings; + struct qlcnic_host_sds_ring *sds_rings; + u32 state; + u16 context_id; + u16 virt_port; +}; + +/* HW context creation */ + +#define QLCNIC_OS_CRB_RETRY_COUNT 4000 + +#define QLCNIC_CDRP_CMD_BIT 0x80000000 + +/* + * All responses must have the QLCNIC_CDRP_CMD_BIT cleared + * in the crb QLCNIC_CDRP_CRB_OFFSET. + */ +#define QLCNIC_CDRP_FORM_RSP(rsp) (rsp) +#define QLCNIC_CDRP_IS_RSP(rsp) (((rsp) & QLCNIC_CDRP_CMD_BIT) == 0) + +#define QLCNIC_CDRP_RSP_OK 0x00000001 +#define QLCNIC_CDRP_RSP_FAIL 0x00000002 +#define QLCNIC_CDRP_RSP_TIMEOUT 0x00000003 + +/* + * All commands must have the QLCNIC_CDRP_CMD_BIT set in + * the crb QLCNIC_CDRP_CRB_OFFSET. 
+ */ +#define QLCNIC_CDRP_FORM_CMD(cmd) (QLCNIC_CDRP_CMD_BIT | (cmd)) + +#define QLCNIC_RCODE_SUCCESS 0 +#define QLCNIC_RCODE_INVALID_ARGS 6 +#define QLCNIC_RCODE_NOT_SUPPORTED 9 +#define QLCNIC_RCODE_NOT_PERMITTED 10 +#define QLCNIC_RCODE_NOT_IMPL 15 +#define QLCNIC_RCODE_INVALID 16 +#define QLCNIC_RCODE_TIMEOUT 17 +#define QLCNIC_DESTROY_CTX_RESET 0 + +/* + * Capabilities Announced + */ +#define QLCNIC_CAP0_LEGACY_CONTEXT (1) +#define QLCNIC_CAP0_LEGACY_MN (1 << 2) +#define QLCNIC_CAP0_LSO (1 << 6) +#define QLCNIC_CAP0_JUMBO_CONTIGUOUS (1 << 7) +#define QLCNIC_CAP0_LRO_CONTIGUOUS (1 << 8) +#define QLCNIC_CAP0_VALIDOFF (1 << 11) +#define QLCNIC_CAP0_LRO_MSS (1 << 21) +#define QLCNIC_CAP0_TX_MULTI (1 << 22) + +/* + * Context state + */ +#define QLCNIC_HOST_CTX_STATE_FREED 0 +#define QLCNIC_HOST_CTX_STATE_ACTIVE 2 + +/* + * Rx context + */ + +struct qlcnic_hostrq_sds_ring { + __le64 host_phys_addr; /* Ring base addr */ + __le32 ring_size; /* Ring entries */ + __le16 msi_index; + __le16 rsvd; /* Padding */ +} __packed; + +struct qlcnic_hostrq_rds_ring { + __le64 host_phys_addr; /* Ring base addr */ + __le64 buff_size; /* Packet buffer size */ + __le32 ring_size; /* Ring entries */ + __le32 ring_kind; /* Class of ring */ +} __packed; + +struct qlcnic_hostrq_rx_ctx { + __le64 host_rsp_dma_addr; /* Response dma'd here */ + __le32 capabilities[4]; /* Flag bit vector */ + __le32 host_int_crb_mode; /* Interrupt crb usage */ + __le32 host_rds_crb_mode; /* RDS crb usage */ + /* These ring offsets are relative to data[0] below */ + __le32 rds_ring_offset; /* Offset to RDS config */ + __le32 sds_ring_offset; /* Offset to SDS config */ + __le16 num_rds_rings; /* Count of RDS rings */ + __le16 num_sds_rings; /* Count of SDS rings */ + __le16 valid_field_offset; + u8 txrx_sds_binding; + u8 msix_handler; + u8 reserved[128]; /* reserve space for future expansion*/ + /* MUST BE 64-bit aligned. + The following is packed: + - N hostrq_rds_rings + - N hostrq_sds_rings */ + char data[0]; +} __packed; + +struct qlcnic_cardrsp_rds_ring{ + __le32 host_producer_crb; /* Crb to use */ + __le32 rsvd1; /* Padding */ +} __packed; + +struct qlcnic_cardrsp_sds_ring { + __le32 host_consumer_crb; /* Crb to use */ + __le32 interrupt_crb; /* Crb to use */ +} __packed; + +struct qlcnic_cardrsp_rx_ctx { + /* These ring offsets are relative to data[0] below */ + __le32 rds_ring_offset; /* Offset to RDS config */ + __le32 sds_ring_offset; /* Offset to SDS config */ + __le32 host_ctx_state; /* Starting State */ + __le32 num_fn_per_port; /* How many PCI fn share the port */ + __le16 num_rds_rings; /* Count of RDS rings */ + __le16 num_sds_rings; /* Count of SDS rings */ + __le16 context_id; /* Handle for context */ + u8 phys_port; /* Physical id of port */ + u8 virt_port; /* Virtual/Logical id of port */ + u8 reserved[128]; /* save space for future expansion */ + /* MUST BE 64-bit aligned. 
+ The following is packed: + - N cardrsp_rds_rings + - N cardrs_sds_rings */ + char data[0]; +} __packed; + +#define SIZEOF_HOSTRQ_RX(HOSTRQ_RX, rds_rings, sds_rings) \ + (sizeof(HOSTRQ_RX) + \ + (rds_rings)*(sizeof(struct qlcnic_hostrq_rds_ring)) + \ + (sds_rings)*(sizeof(struct qlcnic_hostrq_sds_ring))) + +#define SIZEOF_CARDRSP_RX(CARDRSP_RX, rds_rings, sds_rings) \ + (sizeof(CARDRSP_RX) + \ + (rds_rings)*(sizeof(struct qlcnic_cardrsp_rds_ring)) + \ + (sds_rings)*(sizeof(struct qlcnic_cardrsp_sds_ring))) + +/* + * Tx context + */ + +struct qlcnic_hostrq_cds_ring { + __le64 host_phys_addr; /* Ring base addr */ + __le32 ring_size; /* Ring entries */ + __le32 rsvd; /* Padding */ +} __packed; + +struct qlcnic_hostrq_tx_ctx { + __le64 host_rsp_dma_addr; /* Response dma'd here */ + __le64 cmd_cons_dma_addr; /* */ + __le64 dummy_dma_addr; /* */ + __le32 capabilities[4]; /* Flag bit vector */ + __le32 host_int_crb_mode; /* Interrupt crb usage */ + __le32 rsvd1; /* Padding */ + __le16 rsvd2; /* Padding */ + __le16 interrupt_ctl; + __le16 msi_index; + __le16 rsvd3; /* Padding */ + struct qlcnic_hostrq_cds_ring cds_ring; /* Desc of cds ring */ + u8 reserved[128]; /* future expansion */ +} __packed; + +struct qlcnic_cardrsp_cds_ring { + __le32 host_producer_crb; /* Crb to use */ + __le32 interrupt_crb; /* Crb to use */ +} __packed; + +struct qlcnic_cardrsp_tx_ctx { + __le32 host_ctx_state; /* Starting state */ + __le16 context_id; /* Handle for context */ + u8 phys_port; /* Physical id of port */ + u8 virt_port; /* Virtual/Logical id of port */ + struct qlcnic_cardrsp_cds_ring cds_ring; /* Card cds settings */ + u8 reserved[128]; /* future expansion */ +} __packed; + +#define SIZEOF_HOSTRQ_TX(HOSTRQ_TX) (sizeof(HOSTRQ_TX)) +#define SIZEOF_CARDRSP_TX(CARDRSP_TX) (sizeof(CARDRSP_TX)) + +/* CRB */ + +#define QLCNIC_HOST_RDS_CRB_MODE_UNIQUE 0 +#define QLCNIC_HOST_RDS_CRB_MODE_SHARED 1 +#define QLCNIC_HOST_RDS_CRB_MODE_CUSTOM 2 +#define QLCNIC_HOST_RDS_CRB_MODE_MAX 3 + +#define QLCNIC_HOST_INT_CRB_MODE_UNIQUE 0 +#define QLCNIC_HOST_INT_CRB_MODE_SHARED 1 +#define QLCNIC_HOST_INT_CRB_MODE_NORX 2 +#define QLCNIC_HOST_INT_CRB_MODE_NOTX 3 +#define QLCNIC_HOST_INT_CRB_MODE_NORXTX 4 + + +/* MAC */ + +#define MC_COUNT_P3P 38 + +#define QLCNIC_MAC_NOOP 0 +#define QLCNIC_MAC_ADD 1 +#define QLCNIC_MAC_DEL 2 +#define QLCNIC_MAC_VLAN_ADD 3 +#define QLCNIC_MAC_VLAN_DEL 4 + +enum qlcnic_mac_type { + QLCNIC_UNICAST_MAC, + QLCNIC_MULTICAST_MAC, + QLCNIC_BROADCAST_MAC, +}; + +struct qlcnic_mac_vlan_list { + struct list_head list; + uint8_t mac_addr[ETH_ALEN+2]; + u16 vlan_id; + enum qlcnic_mac_type mac_type; +}; + +/* MAC Learn */ +#define NO_MAC_LEARN 0 +#define DRV_MAC_LEARN 1 +#define FDB_MAC_LEARN 2 + +#define QLCNIC_HOST_REQUEST 0x13 +#define QLCNIC_REQUEST 0x14 + +#define QLCNIC_MAC_EVENT 0x1 + +#define QLCNIC_IP_UP 2 +#define QLCNIC_IP_DOWN 3 + +#define QLCNIC_ILB_MODE 0x1 +#define QLCNIC_ELB_MODE 0x2 +#define QLCNIC_LB_MODE_MASK 0x3 + +#define QLCNIC_LINKEVENT 0x1 +#define QLCNIC_LB_RESPONSE 0x2 +#define QLCNIC_IS_LB_CONFIGURED(VAL) \ + (VAL == (QLCNIC_LINKEVENT | QLCNIC_LB_RESPONSE)) + +/* + * Driver --> Firmware + */ +#define QLCNIC_H2C_OPCODE_CONFIG_RSS 0x1 +#define QLCNIC_H2C_OPCODE_CONFIG_INTR_COALESCE 0x3 +#define QLCNIC_H2C_OPCODE_CONFIG_LED 0x4 +#define QLCNIC_H2C_OPCODE_LRO_REQUEST 0x7 +#define QLCNIC_H2C_OPCODE_SET_MAC_RECEIVE_MODE 0xc +#define QLCNIC_H2C_OPCODE_CONFIG_IPADDR 0x12 + +#define QLCNIC_H2C_OPCODE_GET_LINKEVENT 0x15 +#define QLCNIC_H2C_OPCODE_CONFIG_BRIDGING 0x17 +#define 
QLCNIC_H2C_OPCODE_CONFIG_HW_LRO 0x18 +#define QLCNIC_H2C_OPCODE_CONFIG_LOOPBACK 0x13 + +/* + * Firmware --> Driver + */ + +#define QLCNIC_C2H_OPCODE_CONFIG_LOOPBACK 0x8f +#define QLCNIC_C2H_OPCODE_GET_LINKEVENT_RESPONSE 0x8D +#define QLCNIC_C2H_OPCODE_GET_DCB_AEN 0x90 + +#define VPORT_MISS_MODE_DROP 0 /* drop all unmatched */ +#define VPORT_MISS_MODE_ACCEPT_ALL 1 /* accept all packets */ +#define VPORT_MISS_MODE_ACCEPT_MULTI 2 /* accept unmatched multicast */ + +#define QLCNIC_LRO_REQUEST_CLEANUP 4 + +/* Capabilites received */ +#define QLCNIC_FW_CAPABILITY_TSO BIT_1 +#define QLCNIC_FW_CAPABILITY_BDG BIT_8 +#define QLCNIC_FW_CAPABILITY_FVLANTX BIT_9 +#define QLCNIC_FW_CAPABILITY_HW_LRO BIT_10 +#define QLCNIC_FW_CAPABILITY_2_MULTI_TX BIT_4 +#define QLCNIC_FW_CAPABILITY_MULTI_LOOPBACK BIT_27 +#define QLCNIC_FW_CAPABILITY_MORE_CAPS BIT_31 + +#define QLCNIC_FW_CAPABILITY_2_LRO_MAX_TCP_SEG BIT_2 +#define QLCNIC_FW_CAP2_HW_LRO_IPV6 BIT_3 +#define QLCNIC_FW_CAPABILITY_SET_DRV_VER BIT_5 +#define QLCNIC_FW_CAPABILITY_2_BEACON BIT_7 +#define QLCNIC_FW_CAPABILITY_2_PER_PORT_ESWITCH_CFG BIT_9 +#define QLCNIC_FW_CAPABILITY_2_EXT_ISCSI_DUMP BIT_13 + +#define QLCNIC_83XX_FW_CAPAB_ENCAP_RX_OFFLOAD BIT_0 +#define QLCNIC_83XX_FW_CAPAB_ENCAP_TX_OFFLOAD BIT_1 +#define QLCNIC_83XX_FW_CAPAB_ENCAP_CKO_OFFLOAD BIT_4 + +/* module types */ +#define LINKEVENT_MODULE_NOT_PRESENT 1 +#define LINKEVENT_MODULE_OPTICAL_UNKNOWN 2 +#define LINKEVENT_MODULE_OPTICAL_SRLR 3 +#define LINKEVENT_MODULE_OPTICAL_LRM 4 +#define LINKEVENT_MODULE_OPTICAL_SFP_1G 5 +#define LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLE 6 +#define LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLELEN 7 +#define LINKEVENT_MODULE_TWINAX 8 + +#define LINKSPEED_10GBPS 10000 +#define LINKSPEED_1GBPS 1000 +#define LINKSPEED_100MBPS 100 +#define LINKSPEED_10MBPS 10 + +#define LINKSPEED_ENCODED_10MBPS 0 +#define LINKSPEED_ENCODED_100MBPS 1 +#define LINKSPEED_ENCODED_1GBPS 2 + +#define LINKEVENT_AUTONEG_DISABLED 0 +#define LINKEVENT_AUTONEG_ENABLED 1 + +#define LINKEVENT_HALF_DUPLEX 0 +#define LINKEVENT_FULL_DUPLEX 1 + +#define LINKEVENT_LINKSPEED_MBPS 0 +#define LINKEVENT_LINKSPEED_ENCODED 1 + +/* firmware response header: + * 63:58 - message type + * 57:56 - owner + * 55:53 - desc count + * 52:48 - reserved + * 47:40 - completion id + * 39:32 - opcode + * 31:16 - error code + * 15:00 - reserved + */ +#define qlcnic_get_nic_msg_opcode(msg_hdr) \ + ((msg_hdr >> 32) & 0xFF) + +struct qlcnic_fw_msg { + union { + struct { + u64 hdr; + u64 body[7]; + }; + u64 words[8]; + }; +}; + +struct qlcnic_nic_req { + __le64 qhdr; + __le64 req_hdr; + __le64 words[6]; +} __packed; + +struct qlcnic_mac_req { + u8 op; + u8 tag; + u8 mac_addr[6]; +}; + +struct qlcnic_vlan_req { + __le16 vlan_id; + __le16 rsvd[3]; +} __packed; + +struct qlcnic_ipaddr { + __be32 ipv4; + __be32 ipv6[4]; +}; + +#define QLCNIC_MSI_ENABLED 0x02 +#define QLCNIC_MSIX_ENABLED 0x04 +#define QLCNIC_LRO_ENABLED 0x01 +#define QLCNIC_LRO_DISABLED 0x00 +#define QLCNIC_BRIDGE_ENABLED 0X10 +#define QLCNIC_DIAG_ENABLED 0x20 +#define QLCNIC_ESWITCH_ENABLED 0x40 +#define QLCNIC_ADAPTER_INITIALIZED 0x80 +#define QLCNIC_TAGGING_ENABLED 0x100 +#define QLCNIC_MACSPOOF 0x200 +#define QLCNIC_MAC_OVERRIDE_DISABLED 0x400 +#define QLCNIC_PROMISC_DISABLED 0x800 +#define QLCNIC_NEED_FLR 0x1000 +#define QLCNIC_FW_RESET_OWNER 0x2000 +#define QLCNIC_FW_HANG 0x4000 +#define QLCNIC_FW_LRO_MSS_CAP 0x8000 +#define QLCNIC_TX_INTR_SHARED 0x10000 +#define QLCNIC_APP_CHANGED_FLAGS 0x20000 +#define QLCNIC_HAS_PHYS_PORT_ID 0x40000 +#define QLCNIC_TSS_RSS 
0x80000 + +#define QLCNIC_ADD_VXLAN_PORT 0x100000 +#define QLCNIC_DEL_VXLAN_PORT 0x200000 + +#define QLCNIC_VLAN_FILTERING 0x800000 + +#define QLCNIC_IS_MSI_FAMILY(adapter) \ + ((adapter)->flags & (QLCNIC_MSI_ENABLED | QLCNIC_MSIX_ENABLED)) +#define QLCNIC_IS_TSO_CAPABLE(adapter) \ + ((adapter)->ahw->capabilities & QLCNIC_FW_CAPABILITY_TSO) + +#define QLCNIC_BEACON_EANBLE 0xC +#define QLCNIC_BEACON_DISABLE 0xD + +#define QLCNIC_BEACON_ON 2 +#define QLCNIC_BEACON_OFF 0 + +#define QLCNIC_MSIX_TBL_SPACE 8192 +#define QLCNIC_PCI_REG_MSIX_TBL 0x44 +#define QLCNIC_MSIX_TBL_PGSIZE 4096 + +#define QLCNIC_ADAPTER_UP_MAGIC 777 + +#define __QLCNIC_FW_ATTACHED 0 +#define __QLCNIC_DEV_UP 1 +#define __QLCNIC_RESETTING 2 +#define __QLCNIC_START_FW 4 +#define __QLCNIC_AER 5 +#define __QLCNIC_DIAG_RES_ALLOC 6 +#define __QLCNIC_LED_ENABLE 7 +#define __QLCNIC_ELB_INPROGRESS 8 +#define __QLCNIC_MULTI_TX_UNIQUE 9 +#define __QLCNIC_SRIOV_ENABLE 10 +#define __QLCNIC_SRIOV_CAPABLE 11 +#define __QLCNIC_MBX_POLL_ENABLE 12 +#define __QLCNIC_DIAG_MODE 13 +#define __QLCNIC_MAINTENANCE_MODE 16 + +#define QLCNIC_INTERRUPT_TEST 1 +#define QLCNIC_LOOPBACK_TEST 2 +#define QLCNIC_LED_TEST 3 + +#define QLCNIC_FILTER_AGE 80 +#define QLCNIC_READD_AGE 20 +#define QLCNIC_LB_MAX_FILTERS 64 +#define QLCNIC_LB_BUCKET_SIZE 32 +#define QLCNIC_ILB_MAX_RCV_LOOP 10 + +struct qlcnic_filter { + struct hlist_node fnode; + u8 faddr[ETH_ALEN]; + u16 vlan_id; + unsigned long ftime; +}; + +struct qlcnic_filter_hash { + struct hlist_head *fhead; + u8 fnum; + u16 fmax; + u16 fbucket_size; +}; + +/* Mailbox specific data structures */ +struct qlcnic_mailbox { + struct workqueue_struct *work_q; + struct qlcnic_adapter *adapter; + const struct qlcnic_mbx_ops *ops; + struct work_struct work; + struct completion completion; + struct list_head cmd_q; + unsigned long status; + spinlock_t queue_lock; /* Mailbox queue lock */ + spinlock_t aen_lock; /* Mailbox response/AEN lock */ + u32 rsp_status; + u32 num_cmds; +}; + +struct qlcnic_adapter { + struct qlcnic_hardware_context *ahw; + struct qlcnic_recv_context *recv_ctx; + struct qlcnic_host_tx_ring *tx_ring; + struct net_device *netdev; + struct pci_dev *pdev; + + unsigned long state; + u32 flags; + + u16 num_txd; + u16 num_rxd; + u16 num_jumbo_rxd; + u16 max_rxd; + u16 max_jumbo_rxd; + + u8 max_rds_rings; + + u8 max_sds_rings; /* max sds rings supported by adapter */ + u8 max_tx_rings; /* max tx rings supported by adapter */ + + u8 drv_tx_rings; /* max tx rings supported by driver */ + u8 drv_sds_rings; /* max sds rings supported by driver */ + + u8 drv_tss_rings; /* tss ring input */ + u8 drv_rss_rings; /* rss ring input */ + + u8 rx_csum; + u8 portnum; + + u8 fw_wait_cnt; + u8 fw_fail_cnt; + u8 tx_timeo_cnt; + u8 need_fw_reset; + u8 reset_ctx_cnt; + + u16 is_up; + u16 rx_pvid; + u16 tx_pvid; + + u32 irq; + u32 heartbeat; + + u8 dev_state; + u8 reset_ack_timeo; + u8 dev_init_timeo; + + u8 mac_addr[ETH_ALEN]; + + u64 dev_rst_time; + bool drv_mac_learn; + bool fdb_mac_learn; + bool rx_mac_learn; + unsigned long vlans[BITS_TO_LONGS(VLAN_N_VID)]; + u8 flash_mfg_id; + struct qlcnic_npar_info *npars; + struct qlcnic_eswitch *eswitch; + struct qlcnic_nic_template *nic_ops; + + struct qlcnic_adapter_stats stats; + struct list_head mac_list; + + void __iomem *tgt_mask_reg; + void __iomem *tgt_status_reg; + void __iomem *crb_int_state_reg; + void __iomem *isr_int_vec; + + struct msix_entry *msix_entries; + struct workqueue_struct *qlcnic_wq; + struct delayed_work fw_work; + struct delayed_work idc_aen_work; + 
struct delayed_work mbx_poll_work; + struct qlcnic_dcb *dcb; + + struct qlcnic_filter_hash fhash; + struct qlcnic_filter_hash rx_fhash; + struct list_head vf_mc_list; + + spinlock_t mac_learn_lock; + /* spinlock for catching rcv filters for eswitch traffic */ + spinlock_t rx_mac_learn_lock; + u32 file_prd_off; /* File fw product offset */ + u32 fw_version; + u32 offload_flags; + const struct firmware *fw; +}; + +struct qlcnic_info_le { + __le16 pci_func; + __le16 op_mode; /* 1 = Priv, 2 = NP, 3 = NP passthru */ + __le16 phys_port; + __le16 switch_mode; /* 0 = disabled, 1 = int, 2 = ext */ + + __le32 capabilities; + u8 max_mac_filters; + u8 reserved1; + __le16 max_mtu; + + __le16 max_tx_ques; + __le16 max_rx_ques; + __le16 min_tx_bw; + __le16 max_tx_bw; + __le32 op_type; + __le16 max_bw_reg_offset; + __le16 max_linkspeed_reg_offset; + __le32 capability1; + __le32 capability2; + __le32 capability3; + __le16 max_tx_mac_filters; + __le16 max_rx_mcast_mac_filters; + __le16 max_rx_ucast_mac_filters; + __le16 max_rx_ip_addr; + __le16 max_rx_lro_flow; + __le16 max_rx_status_rings; + __le16 max_rx_buf_rings; + __le16 max_tx_vlan_keys; + u8 total_pf; + u8 total_rss_engines; + __le16 max_vports; + __le16 linkstate_reg_offset; + __le16 bit_offsets; + __le16 max_local_ipv6_addrs; + __le16 max_remote_ipv6_addrs; + u8 reserved2[56]; +} __packed; + +struct qlcnic_info { + u16 pci_func; + u16 op_mode; + u16 phys_port; + u16 switch_mode; + u32 capabilities; + u8 max_mac_filters; + u16 max_mtu; + u16 max_tx_ques; + u16 max_rx_ques; + u16 min_tx_bw; + u16 max_tx_bw; + u32 op_type; + u16 max_bw_reg_offset; + u16 max_linkspeed_reg_offset; + u32 capability1; + u32 capability2; + u32 capability3; + u16 max_tx_mac_filters; + u16 max_rx_mcast_mac_filters; + u16 max_rx_ucast_mac_filters; + u16 max_rx_ip_addr; + u16 max_rx_lro_flow; + u16 max_rx_status_rings; + u16 max_rx_buf_rings; + u16 max_tx_vlan_keys; + u8 total_pf; + u8 total_rss_engines; + u16 max_vports; + u16 linkstate_reg_offset; + u16 bit_offsets; + u16 max_local_ipv6_addrs; + u16 max_remote_ipv6_addrs; +}; + +struct qlcnic_pci_info_le { + __le16 id; /* pci function id */ + __le16 active; /* 1 = Enabled */ + __le16 type; /* 1 = NIC, 2 = FCoE, 3 = iSCSI */ + __le16 default_port; /* default port number */ + + __le16 tx_min_bw; /* Multiple of 100mbps */ + __le16 tx_max_bw; + __le16 reserved1[2]; + + u8 mac[ETH_ALEN]; + __le16 func_count; + u8 reserved2[104]; + +} __packed; + +struct qlcnic_pci_info { + u16 id; + u16 active; + u16 type; + u16 default_port; + u16 tx_min_bw; + u16 tx_max_bw; + u8 mac[ETH_ALEN]; + u16 func_count; +}; + +struct qlcnic_npar_info { + bool eswitch_status; + u16 pvid; + u16 min_bw; + u16 max_bw; + u8 phy_port; + u8 type; + u8 active; + u8 enable_pm; + u8 dest_npar; + u8 discard_tagged; + u8 mac_override; + u8 mac_anti_spoof; + u8 promisc_mode; + u8 offload_flags; + u8 pci_func; + u8 mac[ETH_ALEN]; +}; + +struct qlcnic_eswitch { + u8 port; + u8 active_vports; + u8 active_vlans; + u8 active_ucast_filters; + u8 max_ucast_filters; + u8 max_active_vlans; + + u32 flags; +#define QLCNIC_SWITCH_ENABLE BIT_1 +#define QLCNIC_SWITCH_VLAN_FILTERING BIT_2 +#define QLCNIC_SWITCH_PROMISC_MODE BIT_3 +#define QLCNIC_SWITCH_PORT_MIRRORING BIT_4 +}; + + +#define MAX_BW 100 /* % of link speed */ +#define MIN_BW 1 /* % of link speed */ +#define MAX_VLAN_ID 4095 +#define MIN_VLAN_ID 2 +#define DEFAULT_MAC_LEARN 1 + +#define IS_VALID_VLAN(vlan) (vlan >= MIN_VLAN_ID && vlan < MAX_VLAN_ID) +#define IS_VALID_BW(bw) (bw <= MAX_BW) + +struct qlcnic_pci_func_cfg { 
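+	/* per-PCI-function settings: function type/state, bandwidth limits, port number and default MAC (descriptive comment, not upstream text) */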
+ u16 func_type; + u16 min_bw; + u16 max_bw; + u16 port_num; + u8 pci_func; + u8 func_state; + u8 def_mac_addr[ETH_ALEN]; +}; + +struct qlcnic_npar_func_cfg { + u32 fw_capab; + u16 port_num; + u16 min_bw; + u16 max_bw; + u16 max_tx_queues; + u16 max_rx_queues; + u8 pci_func; + u8 op_mode; +}; + +struct qlcnic_pm_func_cfg { + u8 pci_func; + u8 action; + u8 dest_npar; + u8 reserved[5]; +}; + +struct qlcnic_esw_func_cfg { + u16 vlan_id; + u8 op_mode; + u8 op_type; + u8 pci_func; + u8 host_vlan_tag; + u8 promisc_mode; + u8 discard_tagged; + u8 mac_override; + u8 mac_anti_spoof; + u8 offload_flags; + u8 reserved[5]; +}; + +#define QLCNIC_STATS_VERSION 1 +#define QLCNIC_STATS_PORT 1 +#define QLCNIC_STATS_ESWITCH 2 +#define QLCNIC_QUERY_RX_COUNTER 0 +#define QLCNIC_QUERY_TX_COUNTER 1 +#define QLCNIC_STATS_NOT_AVAIL 0xffffffffffffffffULL +#define QLCNIC_FILL_STATS(VAL1) \ + (((VAL1) == QLCNIC_STATS_NOT_AVAIL) ? 0 : VAL1) +#define QLCNIC_MAC_STATS 1 +#define QLCNIC_ESW_STATS 2 + +#define QLCNIC_ADD_ESW_STATS(VAL1, VAL2)\ +do { \ + if (((VAL1) == QLCNIC_STATS_NOT_AVAIL) && \ + ((VAL2) != QLCNIC_STATS_NOT_AVAIL)) \ + (VAL1) = (VAL2); \ + else if (((VAL1) != QLCNIC_STATS_NOT_AVAIL) && \ + ((VAL2) != QLCNIC_STATS_NOT_AVAIL)) \ + (VAL1) += (VAL2); \ +} while (0) + +struct qlcnic_mac_statistics_le { + __le64 mac_tx_frames; + __le64 mac_tx_bytes; + __le64 mac_tx_mcast_pkts; + __le64 mac_tx_bcast_pkts; + __le64 mac_tx_pause_cnt; + __le64 mac_tx_ctrl_pkt; + __le64 mac_tx_lt_64b_pkts; + __le64 mac_tx_lt_127b_pkts; + __le64 mac_tx_lt_255b_pkts; + __le64 mac_tx_lt_511b_pkts; + __le64 mac_tx_lt_1023b_pkts; + __le64 mac_tx_lt_1518b_pkts; + __le64 mac_tx_gt_1518b_pkts; + __le64 rsvd1[3]; + + __le64 mac_rx_frames; + __le64 mac_rx_bytes; + __le64 mac_rx_mcast_pkts; + __le64 mac_rx_bcast_pkts; + __le64 mac_rx_pause_cnt; + __le64 mac_rx_ctrl_pkt; + __le64 mac_rx_lt_64b_pkts; + __le64 mac_rx_lt_127b_pkts; + __le64 mac_rx_lt_255b_pkts; + __le64 mac_rx_lt_511b_pkts; + __le64 mac_rx_lt_1023b_pkts; + __le64 mac_rx_lt_1518b_pkts; + __le64 mac_rx_gt_1518b_pkts; + __le64 rsvd2[3]; + + __le64 mac_rx_length_error; + __le64 mac_rx_length_small; + __le64 mac_rx_length_large; + __le64 mac_rx_jabber; + __le64 mac_rx_dropped; + __le64 mac_rx_crc_error; + __le64 mac_align_error; +} __packed; + +struct qlcnic_mac_statistics { + u64 mac_tx_frames; + u64 mac_tx_bytes; + u64 mac_tx_mcast_pkts; + u64 mac_tx_bcast_pkts; + u64 mac_tx_pause_cnt; + u64 mac_tx_ctrl_pkt; + u64 mac_tx_lt_64b_pkts; + u64 mac_tx_lt_127b_pkts; + u64 mac_tx_lt_255b_pkts; + u64 mac_tx_lt_511b_pkts; + u64 mac_tx_lt_1023b_pkts; + u64 mac_tx_lt_1518b_pkts; + u64 mac_tx_gt_1518b_pkts; + u64 rsvd1[3]; + u64 mac_rx_frames; + u64 mac_rx_bytes; + u64 mac_rx_mcast_pkts; + u64 mac_rx_bcast_pkts; + u64 mac_rx_pause_cnt; + u64 mac_rx_ctrl_pkt; + u64 mac_rx_lt_64b_pkts; + u64 mac_rx_lt_127b_pkts; + u64 mac_rx_lt_255b_pkts; + u64 mac_rx_lt_511b_pkts; + u64 mac_rx_lt_1023b_pkts; + u64 mac_rx_lt_1518b_pkts; + u64 mac_rx_gt_1518b_pkts; + u64 rsvd2[3]; + u64 mac_rx_length_error; + u64 mac_rx_length_small; + u64 mac_rx_length_large; + u64 mac_rx_jabber; + u64 mac_rx_dropped; + u64 mac_rx_crc_error; + u64 mac_align_error; +}; + +struct qlcnic_esw_stats_le { + __le16 context_id; + __le16 version; + __le16 size; + __le16 unused; + __le64 unicast_frames; + __le64 multicast_frames; + __le64 broadcast_frames; + __le64 dropped_frames; + __le64 errors; + __le64 local_frames; + __le64 numbytes; + __le64 rsvd[3]; +} __packed; + +struct __qlcnic_esw_statistics { + u16 context_id; + u16 version; 
+ u16 size; + u16 unused; + u64 unicast_frames; + u64 multicast_frames; + u64 broadcast_frames; + u64 dropped_frames; + u64 errors; + u64 local_frames; + u64 numbytes; + u64 rsvd[3]; +}; + +struct qlcnic_esw_statistics { + struct __qlcnic_esw_statistics rx; + struct __qlcnic_esw_statistics tx; +}; + +#define QLCNIC_FORCE_FW_DUMP_KEY 0xdeadfeed +#define QLCNIC_ENABLE_FW_DUMP 0xaddfeed +#define QLCNIC_DISABLE_FW_DUMP 0xbadfeed +#define QLCNIC_FORCE_FW_RESET 0xdeaddead +#define QLCNIC_SET_QUIESCENT 0xadd00010 +#define QLCNIC_RESET_QUIESCENT 0xadd00020 + +struct _cdrp_cmd { + u32 num; + u32 *arg; +}; + +struct qlcnic_cmd_args { + struct completion completion; + struct list_head list; + struct _cdrp_cmd req; + struct _cdrp_cmd rsp; + atomic_t rsp_status; + int pay_size; + u32 rsp_opcode; + u32 total_cmds; + u32 op_type; + u32 type; + u32 cmd_op; + u32 *hdr; /* Back channel message header */ + u32 *pay; /* Back channel message payload */ + u8 func_num; +}; + +int qlcnic_fw_cmd_get_minidump_temp(struct qlcnic_adapter *adapter); +int qlcnic_fw_cmd_set_port(struct qlcnic_adapter *adapter, u32 config); +int qlcnic_pci_mem_write_2M(struct qlcnic_adapter *, u64 off, u64 data); +int qlcnic_pci_mem_read_2M(struct qlcnic_adapter *, u64 off, u64 *data); + +#define ADDR_IN_RANGE(addr, low, high) \ + (((addr) < (high)) && ((addr) >= (low))) + +#define QLCRD32(adapter, off, err) \ + (adapter->ahw->hw_ops->read_reg)(adapter, off, err) + +#define QLCWR32(adapter, off, val) \ + adapter->ahw->hw_ops->write_reg(adapter, off, val) + +int qlcnic_pcie_sem_lock(struct qlcnic_adapter *, int, u32); +void qlcnic_pcie_sem_unlock(struct qlcnic_adapter *, int); + +#define qlcnic_rom_lock(a) \ + qlcnic_pcie_sem_lock((a), 2, QLCNIC_ROM_LOCK_ID) +#define qlcnic_rom_unlock(a) \ + qlcnic_pcie_sem_unlock((a), 2) +#define qlcnic_phy_lock(a) \ + qlcnic_pcie_sem_lock((a), 3, QLCNIC_PHY_LOCK_ID) +#define qlcnic_phy_unlock(a) \ + qlcnic_pcie_sem_unlock((a), 3) +#define qlcnic_sw_lock(a) \ + qlcnic_pcie_sem_lock((a), 6, 0) +#define qlcnic_sw_unlock(a) \ + qlcnic_pcie_sem_unlock((a), 6) +#define crb_win_lock(a) \ + qlcnic_pcie_sem_lock((a), 7, QLCNIC_CRB_WIN_LOCK_ID) +#define crb_win_unlock(a) \ + qlcnic_pcie_sem_unlock((a), 7) + +#define __QLCNIC_MAX_LED_RATE 0xf +#define __QLCNIC_MAX_LED_STATE 0x2 + +#define MAX_CTL_CHECK 1000 + +void qlcnic_prune_lb_filters(struct qlcnic_adapter *adapter); +void qlcnic_delete_lb_filters(struct qlcnic_adapter *adapter); +int qlcnic_dump_fw(struct qlcnic_adapter *); +int qlcnic_enable_fw_dump_state(struct qlcnic_adapter *); +bool qlcnic_check_fw_dump_state(struct qlcnic_adapter *); + +/* Functions from qlcnic_init.c */ +void qlcnic_schedule_work(struct qlcnic_adapter *, work_func_t, int); +int qlcnic_load_firmware(struct qlcnic_adapter *adapter); +int qlcnic_need_fw_reset(struct qlcnic_adapter *adapter); +void qlcnic_request_firmware(struct qlcnic_adapter *adapter); +void qlcnic_release_firmware(struct qlcnic_adapter *adapter); +int qlcnic_pinit_from_rom(struct qlcnic_adapter *adapter); +int qlcnic_setup_idc_param(struct qlcnic_adapter *adapter); +int qlcnic_check_flash_fw_ver(struct qlcnic_adapter *adapter); + +int qlcnic_rom_fast_read(struct qlcnic_adapter *adapter, u32 addr, u32 *valp); +int qlcnic_rom_fast_read_words(struct qlcnic_adapter *adapter, int addr, + u8 *bytes, size_t size); +int qlcnic_alloc_sw_resources(struct qlcnic_adapter *adapter); +void qlcnic_free_sw_resources(struct qlcnic_adapter *adapter); + +void __iomem *qlcnic_get_ioaddr(struct qlcnic_hardware_context *, u32); + +int 
qlcnic_alloc_hw_resources(struct qlcnic_adapter *adapter); +void qlcnic_free_hw_resources(struct qlcnic_adapter *adapter); + +int qlcnic_fw_create_ctx(struct qlcnic_adapter *adapter); +void qlcnic_fw_destroy_ctx(struct qlcnic_adapter *adapter); + +void qlcnic_reset_rx_buffers_list(struct qlcnic_adapter *adapter); +void qlcnic_release_rx_buffers(struct qlcnic_adapter *adapter); +void qlcnic_release_tx_buffers(struct qlcnic_adapter *, + struct qlcnic_host_tx_ring *); + +int qlcnic_check_fw_status(struct qlcnic_adapter *adapter); +void qlcnic_watchdog_task(struct work_struct *work); +void qlcnic_post_rx_buffers(struct qlcnic_adapter *adapter, + struct qlcnic_host_rds_ring *rds_ring, u8 ring_id); +void qlcnic_set_multi(struct net_device *netdev); +void qlcnic_flush_mcast_mac(struct qlcnic_adapter *); +int qlcnic_nic_add_mac(struct qlcnic_adapter *, const u8 *, u16, + enum qlcnic_mac_type); +int qlcnic_nic_del_mac(struct qlcnic_adapter *, const u8 *); +void qlcnic_82xx_free_mac_list(struct qlcnic_adapter *adapter); +int qlcnic_82xx_read_phys_port_id(struct qlcnic_adapter *); + +int qlcnic_fw_cmd_set_mtu(struct qlcnic_adapter *adapter, int mtu); +int qlcnic_fw_cmd_set_drv_version(struct qlcnic_adapter *, u32); +int qlcnic_change_mtu(struct net_device *netdev, int new_mtu); +netdev_features_t qlcnic_fix_features(struct net_device *netdev, + netdev_features_t features); +int qlcnic_set_features(struct net_device *netdev, netdev_features_t features); +int qlcnic_config_bridged_mode(struct qlcnic_adapter *adapter, u32 enable); +void qlcnic_update_cmd_producer(struct qlcnic_host_tx_ring *); + +/* Functions from qlcnic_ethtool.c */ +int qlcnic_check_loopback_buff(unsigned char *, u8 []); +int qlcnic_do_lb_test(struct qlcnic_adapter *, u8); + +/* Functions from qlcnic_main.c */ +int qlcnic_reset_context(struct qlcnic_adapter *); +void qlcnic_diag_free_res(struct net_device *netdev, int); +int qlcnic_diag_alloc_res(struct net_device *netdev, int); +netdev_tx_t qlcnic_xmit_frame(struct sk_buff *, struct net_device *); +void qlcnic_set_tx_ring_count(struct qlcnic_adapter *, u8); +void qlcnic_set_sds_ring_count(struct qlcnic_adapter *, u8); +int qlcnic_setup_rings(struct qlcnic_adapter *); +int qlcnic_validate_rings(struct qlcnic_adapter *, __u32, int); +void qlcnic_alloc_lb_filters_mem(struct qlcnic_adapter *adapter); +int qlcnic_enable_msix(struct qlcnic_adapter *, u32); +void qlcnic_set_drv_version(struct qlcnic_adapter *); + +/* eSwitch management functions */ +int qlcnic_config_switch_port(struct qlcnic_adapter *, + struct qlcnic_esw_func_cfg *); + +int qlcnic_get_eswitch_port_config(struct qlcnic_adapter *, + struct qlcnic_esw_func_cfg *); +int qlcnic_config_port_mirroring(struct qlcnic_adapter *, u8, u8, u8); +int qlcnic_get_port_stats(struct qlcnic_adapter *, const u8, const u8, + struct __qlcnic_esw_statistics *); +int qlcnic_get_eswitch_stats(struct qlcnic_adapter *, const u8, u8, + struct __qlcnic_esw_statistics *); +int qlcnic_clear_esw_stats(struct qlcnic_adapter *adapter, u8, u8, u8); +int qlcnic_get_mac_stats(struct qlcnic_adapter *, struct qlcnic_mac_statistics *); + +void qlcnic_free_mbx_args(struct qlcnic_cmd_args *cmd); + +int qlcnic_alloc_sds_rings(struct qlcnic_recv_context *, int); +void qlcnic_free_sds_rings(struct qlcnic_recv_context *); +void qlcnic_advert_link_change(struct qlcnic_adapter *, int); +void qlcnic_free_tx_rings(struct qlcnic_adapter *); +int qlcnic_alloc_tx_rings(struct qlcnic_adapter *, struct net_device *); +void qlcnic_dump_mbx(struct qlcnic_adapter *, struct 
qlcnic_cmd_args *); + +void qlcnic_create_sysfs_entries(struct qlcnic_adapter *adapter); +void qlcnic_remove_sysfs_entries(struct qlcnic_adapter *adapter); +void qlcnic_82xx_add_sysfs(struct qlcnic_adapter *adapter); +void qlcnic_82xx_remove_sysfs(struct qlcnic_adapter *adapter); + +int qlcnicvf_config_bridged_mode(struct qlcnic_adapter *, u32); +int qlcnicvf_config_led(struct qlcnic_adapter *, u32, u32); +void qlcnic_set_vlan_config(struct qlcnic_adapter *, + struct qlcnic_esw_func_cfg *); +void qlcnic_set_eswitch_port_features(struct qlcnic_adapter *, + struct qlcnic_esw_func_cfg *); +int qlcnic_setup_tss_rss_intr(struct qlcnic_adapter *); +void qlcnic_down(struct qlcnic_adapter *, struct net_device *); +int qlcnic_up(struct qlcnic_adapter *, struct net_device *); +void __qlcnic_down(struct qlcnic_adapter *, struct net_device *); +void qlcnic_detach(struct qlcnic_adapter *); +void qlcnic_teardown_intr(struct qlcnic_adapter *); +int qlcnic_attach(struct qlcnic_adapter *); +int __qlcnic_up(struct qlcnic_adapter *, struct net_device *); +void qlcnic_restore_indev_addr(struct net_device *, unsigned long); + +int qlcnic_check_temp(struct qlcnic_adapter *); +int qlcnic_init_pci_info(struct qlcnic_adapter *); +int qlcnic_set_default_offload_settings(struct qlcnic_adapter *); +int qlcnic_reset_npar_config(struct qlcnic_adapter *); +int qlcnic_set_eswitch_port_config(struct qlcnic_adapter *); +int qlcnic_83xx_configure_opmode(struct qlcnic_adapter *adapter); +int qlcnic_read_mac_addr(struct qlcnic_adapter *); +int qlcnic_setup_netdev(struct qlcnic_adapter *, struct net_device *, int); +void qlcnic_set_netdev_features(struct qlcnic_adapter *, + struct qlcnic_esw_func_cfg *); +void qlcnic_sriov_vf_set_multi(struct net_device *); +int qlcnic_is_valid_nic_func(struct qlcnic_adapter *, u8); +int qlcnic_get_pci_func_type(struct qlcnic_adapter *, u16, u16 *, u16 *, + u16 *); + +/* + * QLOGIC Board information + */ + +#define QLCNIC_MAX_BOARD_NAME_LEN 100 +struct qlcnic_board_info { + unsigned short vendor; + unsigned short device; + unsigned short sub_vendor; + unsigned short sub_device; + char short_name[QLCNIC_MAX_BOARD_NAME_LEN]; +}; + +static inline u32 qlcnic_tx_avail(struct qlcnic_host_tx_ring *tx_ring) +{ + if (likely(tx_ring->producer < tx_ring->sw_consumer)) + return tx_ring->sw_consumer - tx_ring->producer; + else + return tx_ring->sw_consumer + tx_ring->num_desc - + tx_ring->producer; +} + +struct qlcnic_nic_template { + int (*config_bridged_mode) (struct qlcnic_adapter *, u32); + int (*config_led) (struct qlcnic_adapter *, u32, u32); + int (*start_firmware) (struct qlcnic_adapter *); + int (*init_driver) (struct qlcnic_adapter *); + void (*request_reset) (struct qlcnic_adapter *, u32); + void (*cancel_idc_work) (struct qlcnic_adapter *); + int (*napi_add)(struct qlcnic_adapter *, struct net_device *); + void (*napi_del)(struct qlcnic_adapter *); + void (*config_ipaddr)(struct qlcnic_adapter *, __be32, int); + irqreturn_t (*clear_legacy_intr)(struct qlcnic_adapter *); + int (*shutdown)(struct pci_dev *); + int (*resume)(struct qlcnic_adapter *); +}; + +struct qlcnic_mbx_ops { + int (*enqueue_cmd) (struct qlcnic_adapter *, + struct qlcnic_cmd_args *, unsigned long *); + void (*dequeue_cmd) (struct qlcnic_adapter *, struct qlcnic_cmd_args *); + void (*decode_resp) (struct qlcnic_adapter *, struct qlcnic_cmd_args *); + void (*encode_cmd) (struct qlcnic_adapter *, struct qlcnic_cmd_args *); + void (*nofity_fw) (struct qlcnic_adapter *, u8); +}; + +int qlcnic_83xx_init_mailbox_work(struct 
qlcnic_adapter *); +void qlcnic_83xx_detach_mailbox_work(struct qlcnic_adapter *); +void qlcnic_83xx_reinit_mbx_work(struct qlcnic_mailbox *mbx); +void qlcnic_83xx_free_mailbox(struct qlcnic_mailbox *mbx); +void qlcnic_update_stats(struct qlcnic_adapter *); + +/* Adapter hardware abstraction */ +struct qlcnic_hardware_ops { + void (*read_crb) (struct qlcnic_adapter *, char *, loff_t, size_t); + void (*write_crb) (struct qlcnic_adapter *, char *, loff_t, size_t); + int (*read_reg) (struct qlcnic_adapter *, ulong, int *); + int (*write_reg) (struct qlcnic_adapter *, ulong, u32); + void (*get_ocm_win) (struct qlcnic_hardware_context *); + int (*get_mac_address) (struct qlcnic_adapter *, u8 *, u8); + int (*setup_intr) (struct qlcnic_adapter *); + int (*alloc_mbx_args)(struct qlcnic_cmd_args *, + struct qlcnic_adapter *, u32); + int (*mbx_cmd) (struct qlcnic_adapter *, struct qlcnic_cmd_args *); + void (*get_func_no) (struct qlcnic_adapter *); + int (*api_lock) (struct qlcnic_adapter *); + void (*api_unlock) (struct qlcnic_adapter *); + void (*add_sysfs) (struct qlcnic_adapter *); + void (*remove_sysfs) (struct qlcnic_adapter *); + void (*process_lb_rcv_ring_diag) (struct qlcnic_host_sds_ring *); + int (*create_rx_ctx) (struct qlcnic_adapter *); + int (*create_tx_ctx) (struct qlcnic_adapter *, + struct qlcnic_host_tx_ring *, int); + void (*del_rx_ctx) (struct qlcnic_adapter *); + void (*del_tx_ctx) (struct qlcnic_adapter *, + struct qlcnic_host_tx_ring *); + int (*setup_link_event) (struct qlcnic_adapter *, int); + int (*get_nic_info) (struct qlcnic_adapter *, struct qlcnic_info *, u8); + int (*get_pci_info) (struct qlcnic_adapter *, struct qlcnic_pci_info *); + int (*set_nic_info) (struct qlcnic_adapter *, struct qlcnic_info *); + int (*change_macvlan) (struct qlcnic_adapter *, u8*, u16, u8); + void (*napi_enable) (struct qlcnic_adapter *); + void (*napi_disable) (struct qlcnic_adapter *); + int (*config_intr_coal) (struct qlcnic_adapter *, + struct ethtool_coalesce *); + int (*config_rss) (struct qlcnic_adapter *, int); + int (*config_hw_lro) (struct qlcnic_adapter *, int); + int (*config_loopback) (struct qlcnic_adapter *, u8); + int (*clear_loopback) (struct qlcnic_adapter *, u8); + int (*config_promisc_mode) (struct qlcnic_adapter *, u32); + void (*change_l2_filter) (struct qlcnic_adapter *, u64 *, u16); + int (*get_board_info) (struct qlcnic_adapter *); + void (*set_mac_filter_count) (struct qlcnic_adapter *); + void (*free_mac_list) (struct qlcnic_adapter *); + int (*read_phys_port_id) (struct qlcnic_adapter *); + pci_ers_result_t (*io_error_detected) (struct pci_dev *, + pci_channel_state_t); + pci_ers_result_t (*io_slot_reset) (struct pci_dev *); + void (*io_resume) (struct pci_dev *); + void (*get_beacon_state)(struct qlcnic_adapter *); + void (*enable_sds_intr) (struct qlcnic_adapter *, + struct qlcnic_host_sds_ring *); + void (*disable_sds_intr) (struct qlcnic_adapter *, + struct qlcnic_host_sds_ring *); + void (*enable_tx_intr) (struct qlcnic_adapter *, + struct qlcnic_host_tx_ring *); + void (*disable_tx_intr) (struct qlcnic_adapter *, + struct qlcnic_host_tx_ring *); + u32 (*get_saved_state)(void *, u32); + void (*set_saved_state)(void *, u32, u32); + void (*cache_tmpl_hdr_values)(struct qlcnic_fw_dump *); + u32 (*get_cap_size)(void *, int); + void (*set_sys_info)(void *, int, u32); + void (*store_cap_mask)(void *, u32); + bool (*encap_rx_offload) (struct qlcnic_adapter *adapter); + bool (*encap_tx_offload) (struct qlcnic_adapter *adapter); +}; + +extern struct 
qlcnic_nic_template qlcnic_vf_ops; + +static inline bool qlcnic_83xx_encap_tx_offload(struct qlcnic_adapter *adapter) +{ + return adapter->ahw->extra_capability[0] & + QLCNIC_83XX_FW_CAPAB_ENCAP_TX_OFFLOAD; +} + +static inline bool qlcnic_83xx_encap_rx_offload(struct qlcnic_adapter *adapter) +{ + return adapter->ahw->extra_capability[0] & + QLCNIC_83XX_FW_CAPAB_ENCAP_RX_OFFLOAD; +} + +static inline bool qlcnic_82xx_encap_tx_offload(struct qlcnic_adapter *adapter) +{ + return false; +} + +static inline bool qlcnic_82xx_encap_rx_offload(struct qlcnic_adapter *adapter) +{ + return false; +} + +static inline bool qlcnic_encap_rx_offload(struct qlcnic_adapter *adapter) +{ + return adapter->ahw->hw_ops->encap_rx_offload(adapter); +} + +static inline bool qlcnic_encap_tx_offload(struct qlcnic_adapter *adapter) +{ + return adapter->ahw->hw_ops->encap_tx_offload(adapter); +} + +static inline int qlcnic_start_firmware(struct qlcnic_adapter *adapter) +{ + return adapter->nic_ops->start_firmware(adapter); +} + +static inline void qlcnic_read_crb(struct qlcnic_adapter *adapter, char *buf, + loff_t offset, size_t size) +{ + adapter->ahw->hw_ops->read_crb(adapter, buf, offset, size); +} + +static inline void qlcnic_write_crb(struct qlcnic_adapter *adapter, char *buf, + loff_t offset, size_t size) +{ + adapter->ahw->hw_ops->write_crb(adapter, buf, offset, size); +} + +static inline int qlcnic_hw_write_wx_2M(struct qlcnic_adapter *adapter, + ulong off, u32 data) +{ + return adapter->ahw->hw_ops->write_reg(adapter, off, data); +} + +static inline int qlcnic_get_mac_address(struct qlcnic_adapter *adapter, + u8 *mac, u8 function) +{ + return adapter->ahw->hw_ops->get_mac_address(adapter, mac, function); +} + +static inline int qlcnic_setup_intr(struct qlcnic_adapter *adapter) +{ + return adapter->ahw->hw_ops->setup_intr(adapter); +} + +static inline int qlcnic_alloc_mbx_args(struct qlcnic_cmd_args *mbx, + struct qlcnic_adapter *adapter, u32 arg) +{ + return adapter->ahw->hw_ops->alloc_mbx_args(mbx, adapter, arg); +} + +static inline int qlcnic_issue_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + if (adapter->ahw->hw_ops->mbx_cmd) + return adapter->ahw->hw_ops->mbx_cmd(adapter, cmd); + + return -EIO; +} + +static inline void qlcnic_get_func_no(struct qlcnic_adapter *adapter) +{ + adapter->ahw->hw_ops->get_func_no(adapter); +} + +static inline int qlcnic_api_lock(struct qlcnic_adapter *adapter) +{ + return adapter->ahw->hw_ops->api_lock(adapter); +} + +static inline void qlcnic_api_unlock(struct qlcnic_adapter *adapter) +{ + adapter->ahw->hw_ops->api_unlock(adapter); +} + +static inline void qlcnic_add_sysfs(struct qlcnic_adapter *adapter) +{ + if (adapter->ahw->hw_ops->add_sysfs) + adapter->ahw->hw_ops->add_sysfs(adapter); +} + +static inline void qlcnic_remove_sysfs(struct qlcnic_adapter *adapter) +{ + if (adapter->ahw->hw_ops->remove_sysfs) + adapter->ahw->hw_ops->remove_sysfs(adapter); +} + +static inline void +qlcnic_process_rcv_ring_diag(struct qlcnic_host_sds_ring *sds_ring) +{ + sds_ring->adapter->ahw->hw_ops->process_lb_rcv_ring_diag(sds_ring); +} + +static inline int qlcnic_fw_cmd_create_rx_ctx(struct qlcnic_adapter *adapter) +{ + return adapter->ahw->hw_ops->create_rx_ctx(adapter); +} + +static inline int qlcnic_fw_cmd_create_tx_ctx(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *ptr, + int ring) +{ + return adapter->ahw->hw_ops->create_tx_ctx(adapter, ptr, ring); +} + +static inline void qlcnic_fw_cmd_del_rx_ctx(struct qlcnic_adapter *adapter) +{ + return 
adapter->ahw->hw_ops->del_rx_ctx(adapter); +} + +static inline void qlcnic_fw_cmd_del_tx_ctx(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *ptr) +{ + return adapter->ahw->hw_ops->del_tx_ctx(adapter, ptr); +} + +static inline int qlcnic_linkevent_request(struct qlcnic_adapter *adapter, + int enable) +{ + return adapter->ahw->hw_ops->setup_link_event(adapter, enable); +} + +static inline int qlcnic_get_nic_info(struct qlcnic_adapter *adapter, + struct qlcnic_info *info, u8 id) +{ + return adapter->ahw->hw_ops->get_nic_info(adapter, info, id); +} + +static inline int qlcnic_get_pci_info(struct qlcnic_adapter *adapter, + struct qlcnic_pci_info *info) +{ + return adapter->ahw->hw_ops->get_pci_info(adapter, info); +} + +static inline int qlcnic_set_nic_info(struct qlcnic_adapter *adapter, + struct qlcnic_info *info) +{ + return adapter->ahw->hw_ops->set_nic_info(adapter, info); +} + +static inline int qlcnic_sre_macaddr_change(struct qlcnic_adapter *adapter, + u8 *addr, u16 id, u8 cmd) +{ + return adapter->ahw->hw_ops->change_macvlan(adapter, addr, id, cmd); +} + +static inline int qlcnic_napi_add(struct qlcnic_adapter *adapter, + struct net_device *netdev) +{ + return adapter->nic_ops->napi_add(adapter, netdev); +} + +static inline void qlcnic_napi_del(struct qlcnic_adapter *adapter) +{ + adapter->nic_ops->napi_del(adapter); +} + +static inline void qlcnic_napi_enable(struct qlcnic_adapter *adapter) +{ + adapter->ahw->hw_ops->napi_enable(adapter); +} + +static inline int __qlcnic_shutdown(struct pci_dev *pdev) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + + return adapter->nic_ops->shutdown(pdev); +} + +static inline int __qlcnic_resume(struct qlcnic_adapter *adapter) +{ + return adapter->nic_ops->resume(adapter); +} + +static inline void qlcnic_napi_disable(struct qlcnic_adapter *adapter) +{ + adapter->ahw->hw_ops->napi_disable(adapter); +} + +static inline int qlcnic_config_intr_coalesce(struct qlcnic_adapter *adapter, + struct ethtool_coalesce *ethcoal) +{ + return adapter->ahw->hw_ops->config_intr_coal(adapter, ethcoal); +} + +static inline int qlcnic_config_rss(struct qlcnic_adapter *adapter, int enable) +{ + return adapter->ahw->hw_ops->config_rss(adapter, enable); +} + +static inline int qlcnic_config_hw_lro(struct qlcnic_adapter *adapter, + int enable) +{ + return adapter->ahw->hw_ops->config_hw_lro(adapter, enable); +} + +static inline int qlcnic_set_lb_mode(struct qlcnic_adapter *adapter, u8 mode) +{ + return adapter->ahw->hw_ops->config_loopback(adapter, mode); +} + +static inline int qlcnic_clear_lb_mode(struct qlcnic_adapter *adapter, u8 mode) +{ + return adapter->ahw->hw_ops->clear_loopback(adapter, mode); +} + +static inline int qlcnic_nic_set_promisc(struct qlcnic_adapter *adapter, + u32 mode) +{ + return adapter->ahw->hw_ops->config_promisc_mode(adapter, mode); +} + +static inline void qlcnic_change_filter(struct qlcnic_adapter *adapter, + u64 *addr, u16 id) +{ + adapter->ahw->hw_ops->change_l2_filter(adapter, addr, id); +} + +static inline int qlcnic_get_board_info(struct qlcnic_adapter *adapter) +{ + return adapter->ahw->hw_ops->get_board_info(adapter); +} + +static inline void qlcnic_free_mac_list(struct qlcnic_adapter *adapter) +{ + return adapter->ahw->hw_ops->free_mac_list(adapter); +} + +static inline void qlcnic_set_mac_filter_count(struct qlcnic_adapter *adapter) +{ + if (adapter->ahw->hw_ops->set_mac_filter_count) + adapter->ahw->hw_ops->set_mac_filter_count(adapter); +} + +static inline void qlcnic_get_beacon_state(struct 
qlcnic_adapter *adapter) +{ + adapter->ahw->hw_ops->get_beacon_state(adapter); +} + +static inline void qlcnic_read_phys_port_id(struct qlcnic_adapter *adapter) +{ + if (adapter->ahw->hw_ops->read_phys_port_id) + adapter->ahw->hw_ops->read_phys_port_id(adapter); +} + +static inline u32 qlcnic_get_saved_state(struct qlcnic_adapter *adapter, + void *t_hdr, u32 index) +{ + return adapter->ahw->hw_ops->get_saved_state(t_hdr, index); +} + +static inline void qlcnic_set_saved_state(struct qlcnic_adapter *adapter, + void *t_hdr, u32 index, u32 value) +{ + adapter->ahw->hw_ops->set_saved_state(t_hdr, index, value); +} + +static inline void qlcnic_cache_tmpl_hdr_values(struct qlcnic_adapter *adapter, + struct qlcnic_fw_dump *fw_dump) +{ + adapter->ahw->hw_ops->cache_tmpl_hdr_values(fw_dump); +} + +static inline u32 qlcnic_get_cap_size(struct qlcnic_adapter *adapter, + void *tmpl_hdr, int index) +{ + return adapter->ahw->hw_ops->get_cap_size(tmpl_hdr, index); +} + +static inline void qlcnic_set_sys_info(struct qlcnic_adapter *adapter, + void *tmpl_hdr, int idx, u32 value) +{ + adapter->ahw->hw_ops->set_sys_info(tmpl_hdr, idx, value); +} + +static inline void qlcnic_store_cap_mask(struct qlcnic_adapter *adapter, + void *tmpl_hdr, u32 mask) +{ + adapter->ahw->hw_ops->store_cap_mask(tmpl_hdr, mask); +} + +static inline void qlcnic_dev_request_reset(struct qlcnic_adapter *adapter, + u32 key) +{ + if (adapter->nic_ops->request_reset) + adapter->nic_ops->request_reset(adapter, key); +} + +static inline void qlcnic_cancel_idc_work(struct qlcnic_adapter *adapter) +{ + if (adapter->nic_ops->cancel_idc_work) + adapter->nic_ops->cancel_idc_work(adapter); +} + +static inline irqreturn_t +qlcnic_clear_legacy_intr(struct qlcnic_adapter *adapter) +{ + return adapter->nic_ops->clear_legacy_intr(adapter); +} + +static inline int qlcnic_config_led(struct qlcnic_adapter *adapter, u32 state, + u32 rate) +{ + return adapter->nic_ops->config_led(adapter, state, rate); +} + +static inline void qlcnic_config_ipaddr(struct qlcnic_adapter *adapter, + __be32 ip, int cmd) +{ + adapter->nic_ops->config_ipaddr(adapter, ip, cmd); +} + +static inline bool qlcnic_check_multi_tx(struct qlcnic_adapter *adapter) +{ + return test_bit(__QLCNIC_MULTI_TX_UNIQUE, &adapter->state); +} + +static inline void +qlcnic_82xx_enable_tx_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) + writel(0x0, tx_ring->crb_intr_mask); +} + +static inline void +qlcnic_82xx_disable_tx_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) + writel(1, tx_ring->crb_intr_mask); +} + +static inline void +qlcnic_83xx_enable_tx_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + writel(0, tx_ring->crb_intr_mask); +} + +static inline void +qlcnic_83xx_disable_tx_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + writel(1, tx_ring->crb_intr_mask); +} + +/* Enable MSI-x and INT-x interrupts */ +static inline void +qlcnic_83xx_enable_sds_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_sds_ring *sds_ring) +{ + writel(0, sds_ring->crb_intr_mask); +} + +/* Disable MSI-x and INT-x interrupts */ +static inline void +qlcnic_83xx_disable_sds_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_sds_ring *sds_ring) +{ + writel(1, sds_ring->crb_intr_mask); +} + +static inline void qlcnic_disable_multi_tx(struct 
qlcnic_adapter *adapter) +{ + test_and_clear_bit(__QLCNIC_MULTI_TX_UNIQUE, &adapter->state); + adapter->drv_tx_rings = QLCNIC_SINGLE_RING; +} + +/* When operating in multi Tx mode, the driver needs to write 0x1 + * to the src register, instead of 0x0, to disable receiving interrupts. + */ +static inline void +qlcnic_82xx_disable_sds_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_sds_ring *sds_ring) +{ + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test && + (adapter->flags & QLCNIC_MSIX_ENABLED)) + writel(0x1, sds_ring->crb_intr_mask); + else + writel(0, sds_ring->crb_intr_mask); +} + +static inline void qlcnic_enable_sds_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_sds_ring *sds_ring) +{ + if (adapter->ahw->hw_ops->enable_sds_intr) + adapter->ahw->hw_ops->enable_sds_intr(adapter, sds_ring); +} + +static inline void +qlcnic_disable_sds_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_sds_ring *sds_ring) +{ + if (adapter->ahw->hw_ops->disable_sds_intr) + adapter->ahw->hw_ops->disable_sds_intr(adapter, sds_ring); +} + +static inline void qlcnic_enable_tx_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + if (adapter->ahw->hw_ops->enable_tx_intr) + adapter->ahw->hw_ops->enable_tx_intr(adapter, tx_ring); +} + +static inline void qlcnic_disable_tx_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + if (adapter->ahw->hw_ops->disable_tx_intr) + adapter->ahw->hw_ops->disable_tx_intr(adapter, tx_ring); +} + +/* When operating in multi Tx mode, the driver needs to write 0x0 + * to the src register, instead of 0x1, to enable receiving interrupts. + */ +static inline void +qlcnic_82xx_enable_sds_intr(struct qlcnic_adapter *adapter, + struct qlcnic_host_sds_ring *sds_ring) +{ + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test && + (adapter->flags & QLCNIC_MSIX_ENABLED)) + writel(0, sds_ring->crb_intr_mask); + else + writel(0x1, sds_ring->crb_intr_mask); + + if (!QLCNIC_IS_MSI_FAMILY(adapter)) + writel(0xfbff, adapter->tgt_mask_reg); +} + +static inline int qlcnic_get_diag_lock(struct qlcnic_adapter *adapter) +{ + return test_and_set_bit(__QLCNIC_DIAG_MODE, &adapter->state); +} + +static inline void qlcnic_release_diag_lock(struct qlcnic_adapter *adapter) +{ + clear_bit(__QLCNIC_DIAG_MODE, &adapter->state); +} + +static inline int qlcnic_check_diag_status(struct qlcnic_adapter *adapter) +{ + return test_bit(__QLCNIC_DIAG_MODE, &adapter->state); +} + +extern const struct ethtool_ops qlcnic_sriov_vf_ethtool_ops; +extern const struct ethtool_ops qlcnic_ethtool_ops; +extern const struct ethtool_ops qlcnic_ethtool_failed_ops; + +#define QLCDB(adapter, lvl, _fmt, _args...) do { \ + if (NETIF_MSG_##lvl & adapter->ahw->msg_enable) \ + printk(KERN_INFO "%s: %s: " _fmt, \ + dev_name(&adapter->pdev->dev), \ + __func__, ##_args); \ + } while (0) + +#define PCI_DEVICE_ID_QLOGIC_QLE824X 0x8020 +#define PCI_DEVICE_ID_QLOGIC_QLE834X 0x8030 +#define PCI_DEVICE_ID_QLOGIC_VF_QLE834X 0x8430 +#define PCI_DEVICE_ID_QLOGIC_QLE8830 0x8830 +#define PCI_DEVICE_ID_QLOGIC_VF_QLE8C30 0x8C30 +#define PCI_DEVICE_ID_QLOGIC_QLE844X 0x8040 +#define PCI_DEVICE_ID_QLOGIC_VF_QLE844X 0x8440 + +static inline bool qlcnic_82xx_check(struct qlcnic_adapter *adapter) +{ + unsigned short device = adapter->pdev->device; + return (device == PCI_DEVICE_ID_QLOGIC_QLE824X) ? 
true : false; +} + +static inline bool qlcnic_84xx_check(struct qlcnic_adapter *adapter) +{ + unsigned short device = adapter->pdev->device; + + return ((device == PCI_DEVICE_ID_QLOGIC_QLE844X) || + (device == PCI_DEVICE_ID_QLOGIC_VF_QLE844X)) ? true : false; +} + +static inline bool qlcnic_83xx_check(struct qlcnic_adapter *adapter) +{ + unsigned short device = adapter->pdev->device; + bool status; + + status = ((device == PCI_DEVICE_ID_QLOGIC_QLE834X) || + (device == PCI_DEVICE_ID_QLOGIC_QLE8830) || + (device == PCI_DEVICE_ID_QLOGIC_QLE844X) || + (device == PCI_DEVICE_ID_QLOGIC_VF_QLE844X) || + (device == PCI_DEVICE_ID_QLOGIC_VF_QLE834X) || + (device == PCI_DEVICE_ID_QLOGIC_VF_QLE8C30)) ? true : false; + + return status; +} + +static inline bool qlcnic_sriov_pf_check(struct qlcnic_adapter *adapter) +{ + return (adapter->ahw->op_mode == QLCNIC_SRIOV_PF_FUNC) ? true : false; +} + +static inline bool qlcnic_sriov_vf_check(struct qlcnic_adapter *adapter) +{ + unsigned short device = adapter->pdev->device; + bool status; + + status = ((device == PCI_DEVICE_ID_QLOGIC_VF_QLE834X) || + (device == PCI_DEVICE_ID_QLOGIC_VF_QLE844X) || + (device == PCI_DEVICE_ID_QLOGIC_VF_QLE8C30)) ? true : false; + + return status; +} + +static inline bool qlcnic_83xx_pf_check(struct qlcnic_adapter *adapter) +{ + unsigned short device = adapter->pdev->device; + + return (device == PCI_DEVICE_ID_QLOGIC_QLE834X) ? true : false; +} + +static inline bool qlcnic_83xx_vf_check(struct qlcnic_adapter *adapter) +{ + unsigned short device = adapter->pdev->device; + + return ((device == PCI_DEVICE_ID_QLOGIC_VF_QLE834X) || + (device == PCI_DEVICE_ID_QLOGIC_VF_QLE8C30)) ? true : false; +} + +static inline bool qlcnic_sriov_check(struct qlcnic_adapter *adapter) +{ + bool status; + + status = (qlcnic_sriov_pf_check(adapter) || + qlcnic_sriov_vf_check(adapter)) ? true : false; + + return status; +} + +static inline u32 qlcnic_get_vnic_func_count(struct qlcnic_adapter *adapter) +{ + if (qlcnic_84xx_check(adapter)) + return QLC_84XX_VNIC_COUNT; + else + return QLC_DEFAULT_VNIC_COUNT; +} + +static inline void qlcnic_swap32_buffer(u32 *buffer, int count) +{ +#if defined(__BIG_ENDIAN) + u32 *tmp = buffer; + int i; + + for (i = 0; i < count; i++) { + *tmp = swab32(*tmp); + tmp++; + } +#endif +} + +#ifdef CONFIG_QLCNIC_HWMON +void qlcnic_register_hwmon_dev(struct qlcnic_adapter *); +void qlcnic_unregister_hwmon_dev(struct qlcnic_adapter *); +#else +static inline void qlcnic_register_hwmon_dev(struct qlcnic_adapter *adapter) +{ + return; +} +static inline void qlcnic_unregister_hwmon_dev(struct qlcnic_adapter *adapter) +{ + return; +} +#endif +#endif /* __QLCNIC_H_ */ diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c new file mode 100644 index 0000000..97c146e --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c @@ -0,0 +1,4237 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. 
+ */ + +#include <linux/if_vlan.h> +#include <linux/ipv6.h> +#include <linux/ethtool.h> +#include <linux/interrupt.h> +#include <linux/aer.h> + +#include "qlcnic.h" +#include "qlcnic_sriov.h" + +static void __qlcnic_83xx_process_aen(struct qlcnic_adapter *); +static int qlcnic_83xx_clear_lb_mode(struct qlcnic_adapter *, u8); +static void qlcnic_83xx_configure_mac(struct qlcnic_adapter *, u8 *, u8, + struct qlcnic_cmd_args *); +static int qlcnic_83xx_get_port_config(struct qlcnic_adapter *); +static irqreturn_t qlcnic_83xx_handle_aen(int, void *); +static pci_ers_result_t qlcnic_83xx_io_error_detected(struct pci_dev *, + pci_channel_state_t); +static int qlcnic_83xx_set_port_config(struct qlcnic_adapter *); +static pci_ers_result_t qlcnic_83xx_io_slot_reset(struct pci_dev *); +static void qlcnic_83xx_io_resume(struct pci_dev *); +static int qlcnic_83xx_set_lb_mode(struct qlcnic_adapter *, u8); +static void qlcnic_83xx_set_mac_filter_count(struct qlcnic_adapter *); +static int qlcnic_83xx_resume(struct qlcnic_adapter *); +static int qlcnic_83xx_shutdown(struct pci_dev *); +static void qlcnic_83xx_get_beacon_state(struct qlcnic_adapter *); + +#define RSS_HASHTYPE_IP_TCP 0x3 +#define QLC_83XX_FW_MBX_CMD 0 +#define QLC_SKIP_INACTIVE_PCI_REGS 7 +#define QLC_MAX_LEGACY_FUNC_SUPP 8 + +/* 83xx Module type */ +#define QLC_83XX_MODULE_FIBRE_10GBASE_LRM 0x1 /* 10GBase-LRM */ +#define QLC_83XX_MODULE_FIBRE_10GBASE_LR 0x2 /* 10GBase-LR */ +#define QLC_83XX_MODULE_FIBRE_10GBASE_SR 0x3 /* 10GBase-SR */ +#define QLC_83XX_MODULE_DA_10GE_PASSIVE_CP 0x4 /* 10GE passive + * copper(compliant) + */ +#define QLC_83XX_MODULE_DA_10GE_ACTIVE_CP 0x5 /* 10GE active limiting + * copper(compliant) + */ +#define QLC_83XX_MODULE_DA_10GE_LEGACY_CP 0x6 /* 10GE passive copper + * (legacy, best effort) + */ +#define QLC_83XX_MODULE_FIBRE_1000BASE_SX 0x7 /* 1000Base-SX */ +#define QLC_83XX_MODULE_FIBRE_1000BASE_LX 0x8 /* 1000Base-LX */ +#define QLC_83XX_MODULE_FIBRE_1000BASE_CX 0x9 /* 1000Base-CX */ +#define QLC_83XX_MODULE_TP_1000BASE_T 0xa /* 1000Base-T */ +#define QLC_83XX_MODULE_DA_1GE_PASSIVE_CP 0xb /* 1GE passive copper + * (legacy, best effort) + */ +#define QLC_83XX_MODULE_UNKNOWN 0xf /* Unknown module type */ + +/* Port types */ +#define QLC_83XX_10_CAPABLE BIT_8 +#define QLC_83XX_100_CAPABLE BIT_9 +#define QLC_83XX_1G_CAPABLE BIT_10 +#define QLC_83XX_10G_CAPABLE BIT_11 +#define QLC_83XX_AUTONEG_ENABLE BIT_15 + +static const struct qlcnic_mailbox_metadata qlcnic_83xx_mbx_tbl[] = { + {QLCNIC_CMD_CONFIGURE_IP_ADDR, 6, 1}, + {QLCNIC_CMD_CONFIG_INTRPT, 18, 34}, + {QLCNIC_CMD_CREATE_RX_CTX, 136, 27}, + {QLCNIC_CMD_DESTROY_RX_CTX, 2, 1}, + {QLCNIC_CMD_CREATE_TX_CTX, 54, 18}, + {QLCNIC_CMD_DESTROY_TX_CTX, 2, 1}, + {QLCNIC_CMD_CONFIGURE_MAC_LEARNING, 2, 1}, + {QLCNIC_CMD_INTRPT_TEST, 22, 12}, + {QLCNIC_CMD_SET_MTU, 3, 1}, + {QLCNIC_CMD_READ_PHY, 4, 2}, + {QLCNIC_CMD_WRITE_PHY, 5, 1}, + {QLCNIC_CMD_READ_HW_REG, 4, 1}, + {QLCNIC_CMD_GET_FLOW_CTL, 4, 2}, + {QLCNIC_CMD_SET_FLOW_CTL, 4, 1}, + {QLCNIC_CMD_READ_MAX_MTU, 4, 2}, + {QLCNIC_CMD_READ_MAX_LRO, 4, 2}, + {QLCNIC_CMD_MAC_ADDRESS, 4, 3}, + {QLCNIC_CMD_GET_PCI_INFO, 1, 129}, + {QLCNIC_CMD_GET_NIC_INFO, 2, 19}, + {QLCNIC_CMD_SET_NIC_INFO, 32, 1}, + {QLCNIC_CMD_GET_ESWITCH_CAPABILITY, 4, 3}, + {QLCNIC_CMD_TOGGLE_ESWITCH, 4, 1}, + {QLCNIC_CMD_GET_ESWITCH_STATUS, 4, 3}, + {QLCNIC_CMD_SET_PORTMIRRORING, 4, 1}, + {QLCNIC_CMD_CONFIGURE_ESWITCH, 4, 1}, + {QLCNIC_CMD_GET_ESWITCH_PORT_CONFIG, 4, 3}, + {QLCNIC_CMD_GET_ESWITCH_STATS, 5, 1}, + {QLCNIC_CMD_CONFIG_PORT, 4, 1}, + {QLCNIC_CMD_TEMP_SIZE, 1, 4}, + {QLCNIC_CMD_GET_TEMP_HDR, 5, 5}, + {QLCNIC_CMD_GET_LINK_EVENT, 2, 
1}, + {QLCNIC_CMD_CONFIG_MAC_VLAN, 4, 3}, + {QLCNIC_CMD_CONFIG_INTR_COAL, 6, 1}, + {QLCNIC_CMD_CONFIGURE_RSS, 14, 1}, + {QLCNIC_CMD_CONFIGURE_LED, 2, 1}, + {QLCNIC_CMD_CONFIGURE_MAC_RX_MODE, 2, 1}, + {QLCNIC_CMD_CONFIGURE_HW_LRO, 2, 1}, + {QLCNIC_CMD_GET_STATISTICS, 2, 80}, + {QLCNIC_CMD_SET_PORT_CONFIG, 2, 1}, + {QLCNIC_CMD_GET_PORT_CONFIG, 2, 2}, + {QLCNIC_CMD_GET_LINK_STATUS, 2, 4}, + {QLCNIC_CMD_IDC_ACK, 5, 1}, + {QLCNIC_CMD_INIT_NIC_FUNC, 3, 1}, + {QLCNIC_CMD_STOP_NIC_FUNC, 2, 1}, + {QLCNIC_CMD_SET_LED_CONFIG, 5, 1}, + {QLCNIC_CMD_GET_LED_CONFIG, 1, 5}, + {QLCNIC_CMD_83XX_SET_DRV_VER, 4, 1}, + {QLCNIC_CMD_ADD_RCV_RINGS, 130, 26}, + {QLCNIC_CMD_CONFIG_VPORT, 4, 4}, + {QLCNIC_CMD_BC_EVENT_SETUP, 2, 1}, + {QLCNIC_CMD_DCB_QUERY_CAP, 1, 2}, + {QLCNIC_CMD_DCB_QUERY_PARAM, 1, 50}, + {QLCNIC_CMD_SET_INGRESS_ENCAP, 2, 1}, + {QLCNIC_CMD_83XX_EXTEND_ISCSI_DUMP_CAP, 4, 1}, +}; + +const u32 qlcnic_83xx_ext_reg_tbl[] = { + 0x38CC, /* Global Reset */ + 0x38F0, /* Wildcard */ + 0x38FC, /* Informant */ + 0x3038, /* Host MBX ctrl */ + 0x303C, /* FW MBX ctrl */ + 0x355C, /* BOOT LOADER ADDRESS REG */ + 0x3560, /* BOOT LOADER SIZE REG */ + 0x3564, /* FW IMAGE ADDR REG */ + 0x1000, /* MBX intr enable */ + 0x1200, /* Default Intr mask */ + 0x1204, /* Default Interrupt ID */ + 0x3780, /* QLC_83XX_IDC_MAJ_VERSION */ + 0x3784, /* QLC_83XX_IDC_DEV_STATE */ + 0x3788, /* QLC_83XX_IDC_DRV_PRESENCE */ + 0x378C, /* QLC_83XX_IDC_DRV_ACK */ + 0x3790, /* QLC_83XX_IDC_CTRL */ + 0x3794, /* QLC_83XX_IDC_DRV_AUDIT */ + 0x3798, /* QLC_83XX_IDC_MIN_VERSION */ + 0x379C, /* QLC_83XX_RECOVER_DRV_LOCK */ + 0x37A0, /* QLC_83XX_IDC_PF_0 */ + 0x37A4, /* QLC_83XX_IDC_PF_1 */ + 0x37A8, /* QLC_83XX_IDC_PF_2 */ + 0x37AC, /* QLC_83XX_IDC_PF_3 */ + 0x37B0, /* QLC_83XX_IDC_PF_4 */ + 0x37B4, /* QLC_83XX_IDC_PF_5 */ + 0x37B8, /* QLC_83XX_IDC_PF_6 */ + 0x37BC, /* QLC_83XX_IDC_PF_7 */ + 0x37C0, /* QLC_83XX_IDC_PF_8 */ + 0x37C4, /* QLC_83XX_IDC_PF_9 */ + 0x37C8, /* QLC_83XX_IDC_PF_10 */ + 0x37CC, /* QLC_83XX_IDC_PF_11 */ + 0x37D0, /* QLC_83XX_IDC_PF_12 */ + 0x37D4, /* QLC_83XX_IDC_PF_13 */ + 0x37D8, /* QLC_83XX_IDC_PF_14 */ + 0x37DC, /* QLC_83XX_IDC_PF_15 */ + 0x37E0, /* QLC_83XX_IDC_DEV_PARTITION_INFO_1 */ + 0x37E4, /* QLC_83XX_IDC_DEV_PARTITION_INFO_2 */ + 0x37F0, /* QLC_83XX_DRV_OP_MODE */ + 0x37F4, /* QLC_83XX_VNIC_STATE */ + 0x3868, /* QLC_83XX_DRV_LOCK */ + 0x386C, /* QLC_83XX_DRV_UNLOCK */ + 0x3504, /* QLC_83XX_DRV_LOCK_ID */ + 0x34A4, /* QLC_83XX_ASIC_TEMP */ +}; + +const u32 qlcnic_83xx_reg_tbl[] = { + 0x34A8, /* PEG_HALT_STAT1 */ + 0x34AC, /* PEG_HALT_STAT2 */ + 0x34B0, /* FW_HEARTBEAT */ + 0x3500, /* FLASH LOCK_ID */ + 0x3528, /* FW_CAPABILITIES */ + 0x3538, /* Driver active, DRV_REG0 */ + 0x3540, /* Device state, DRV_REG1 */ + 0x3544, /* Driver state, DRV_REG2 */ + 0x3548, /* Driver scratch, DRV_REG3 */ + 0x354C, /* Device partition info, DRV_REG4 */ + 0x3524, /* Driver IDC ver, DRV_REG5 */ + 0x3550, /* FW_VER_MAJOR */ + 0x3554, /* FW_VER_MINOR */ + 0x3558, /* FW_VER_SUB */ + 0x359C, /* NPAR STATE */ + 0x35FC, /* FW_IMG_VALID */ + 0x3650, /* CMD_PEG_STATE */ + 0x373C, /* RCV_PEG_STATE */ + 0x37B4, /* ASIC TEMP */ + 0x356C, /* FW API */ + 0x3570, /* DRV OP MODE */ + 0x3850, /* FLASH LOCK */ + 0x3854, /* FLASH UNLOCK */ +}; + +static struct qlcnic_hardware_ops qlcnic_83xx_hw_ops = { + .read_crb = qlcnic_83xx_read_crb, + .write_crb = qlcnic_83xx_write_crb, + .read_reg = qlcnic_83xx_rd_reg_indirect, + .write_reg = qlcnic_83xx_wrt_reg_indirect, + .get_mac_address = qlcnic_83xx_get_mac_address, + .setup_intr = qlcnic_83xx_setup_intr, + 
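/* mailbox command path: argument allocation and command submission (qlcnic_83xx_issue_cmd is defined later in this file); descriptive note, not upstream text */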
.alloc_mbx_args = qlcnic_83xx_alloc_mbx_args, + .mbx_cmd = qlcnic_83xx_issue_cmd, + .get_func_no = qlcnic_83xx_get_func_no, + .api_lock = qlcnic_83xx_cam_lock, + .api_unlock = qlcnic_83xx_cam_unlock, + .add_sysfs = qlcnic_83xx_add_sysfs, + .remove_sysfs = qlcnic_83xx_remove_sysfs, + .process_lb_rcv_ring_diag = qlcnic_83xx_process_rcv_ring_diag, + .create_rx_ctx = qlcnic_83xx_create_rx_ctx, + .create_tx_ctx = qlcnic_83xx_create_tx_ctx, + .del_rx_ctx = qlcnic_83xx_del_rx_ctx, + .del_tx_ctx = qlcnic_83xx_del_tx_ctx, + .setup_link_event = qlcnic_83xx_setup_link_event, + .get_nic_info = qlcnic_83xx_get_nic_info, + .get_pci_info = qlcnic_83xx_get_pci_info, + .set_nic_info = qlcnic_83xx_set_nic_info, + .change_macvlan = qlcnic_83xx_sre_macaddr_change, + .napi_enable = qlcnic_83xx_napi_enable, + .napi_disable = qlcnic_83xx_napi_disable, + .config_intr_coal = qlcnic_83xx_config_intr_coal, + .config_rss = qlcnic_83xx_config_rss, + .config_hw_lro = qlcnic_83xx_config_hw_lro, + .config_promisc_mode = qlcnic_83xx_nic_set_promisc, + .change_l2_filter = qlcnic_83xx_change_l2_filter, + .get_board_info = qlcnic_83xx_get_port_info, + .set_mac_filter_count = qlcnic_83xx_set_mac_filter_count, + .free_mac_list = qlcnic_82xx_free_mac_list, + .io_error_detected = qlcnic_83xx_io_error_detected, + .io_slot_reset = qlcnic_83xx_io_slot_reset, + .io_resume = qlcnic_83xx_io_resume, + .get_beacon_state = qlcnic_83xx_get_beacon_state, + .enable_sds_intr = qlcnic_83xx_enable_sds_intr, + .disable_sds_intr = qlcnic_83xx_disable_sds_intr, + .enable_tx_intr = qlcnic_83xx_enable_tx_intr, + .disable_tx_intr = qlcnic_83xx_disable_tx_intr, + .get_saved_state = qlcnic_83xx_get_saved_state, + .set_saved_state = qlcnic_83xx_set_saved_state, + .cache_tmpl_hdr_values = qlcnic_83xx_cache_tmpl_hdr_values, + .get_cap_size = qlcnic_83xx_get_cap_size, + .set_sys_info = qlcnic_83xx_set_sys_info, + .store_cap_mask = qlcnic_83xx_store_cap_mask, + .encap_rx_offload = qlcnic_83xx_encap_rx_offload, + .encap_tx_offload = qlcnic_83xx_encap_tx_offload, +}; + +static struct qlcnic_nic_template qlcnic_83xx_ops = { + .config_bridged_mode = qlcnic_config_bridged_mode, + .config_led = qlcnic_config_led, + .request_reset = qlcnic_83xx_idc_request_reset, + .cancel_idc_work = qlcnic_83xx_idc_exit, + .napi_add = qlcnic_83xx_napi_add, + .napi_del = qlcnic_83xx_napi_del, + .config_ipaddr = qlcnic_83xx_config_ipaddr, + .clear_legacy_intr = qlcnic_83xx_clear_legacy_intr, + .shutdown = qlcnic_83xx_shutdown, + .resume = qlcnic_83xx_resume, +}; + +void qlcnic_83xx_register_map(struct qlcnic_hardware_context *ahw) +{ + ahw->hw_ops = &qlcnic_83xx_hw_ops; + ahw->reg_tbl = (u32 *)qlcnic_83xx_reg_tbl; + ahw->ext_reg_tbl = (u32 *)qlcnic_83xx_ext_reg_tbl; +} + +int qlcnic_83xx_get_fw_version(struct qlcnic_adapter *adapter) +{ + u32 fw_major, fw_minor, fw_build; + struct pci_dev *pdev = adapter->pdev; + + fw_major = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_MAJOR); + fw_minor = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_MINOR); + fw_build = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_SUB); + adapter->fw_version = QLCNIC_VERSION_CODE(fw_major, fw_minor, fw_build); + + dev_info(&pdev->dev, "Driver v%s, firmware version %d.%d.%d\n", + QLCNIC_LINUX_VERSIONID, fw_major, fw_minor, fw_build); + + return adapter->fw_version; +} + +static int __qlcnic_set_win_base(struct qlcnic_adapter *adapter, u32 addr) +{ + void __iomem *base; + u32 val; + + base = adapter->ahw->pci_base0 + + QLC_83XX_CRB_WIN_FUNC(adapter->ahw->pci_func); + writel(addr, base); + val = readl(base); 
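+	/* read the window register back; a mismatch below means the CRB window remap did not take effect */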
+ if (val != addr) + return -EIO; + + return 0; +} + +int qlcnic_83xx_rd_reg_indirect(struct qlcnic_adapter *adapter, ulong addr, + int *err) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + + *err = __qlcnic_set_win_base(adapter, (u32) addr); + if (!*err) { + return QLCRDX(ahw, QLCNIC_WILDCARD); + } else { + dev_err(&adapter->pdev->dev, + "%s failed, addr = 0x%lx\n", __func__, addr); + return -EIO; + } +} + +int qlcnic_83xx_wrt_reg_indirect(struct qlcnic_adapter *adapter, ulong addr, + u32 data) +{ + int err; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + err = __qlcnic_set_win_base(adapter, (u32) addr); + if (!err) { + QLCWRX(ahw, QLCNIC_WILDCARD, data); + return 0; + } else { + dev_err(&adapter->pdev->dev, + "%s failed, addr = 0x%x data = 0x%x\n", + __func__, (int)addr, data); + return err; + } +} + +static void qlcnic_83xx_enable_legacy(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + + /* MSI-X enablement failed, use legacy interrupt */ + adapter->tgt_status_reg = ahw->pci_base0 + QLC_83XX_INTX_PTR; + adapter->tgt_mask_reg = ahw->pci_base0 + QLC_83XX_INTX_MASK; + adapter->isr_int_vec = ahw->pci_base0 + QLC_83XX_INTX_TRGR; + adapter->msix_entries[0].vector = adapter->pdev->irq; + dev_info(&adapter->pdev->dev, "using legacy interrupt\n"); +} + +static int qlcnic_83xx_calculate_msix_vector(struct qlcnic_adapter *adapter) +{ + int num_msix; + + num_msix = adapter->drv_sds_rings; + + /* account for AEN interrupt MSI-X based interrupts */ + num_msix += 1; + + if (!(adapter->flags & QLCNIC_TX_INTR_SHARED)) + num_msix += adapter->drv_tx_rings; + + return num_msix; +} + +int qlcnic_83xx_setup_intr(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int err, i, num_msix; + + if (adapter->flags & QLCNIC_TSS_RSS) { + err = qlcnic_setup_tss_rss_intr(adapter); + if (err < 0) + return err; + num_msix = ahw->num_msix; + } else { + num_msix = qlcnic_83xx_calculate_msix_vector(adapter); + + err = qlcnic_enable_msix(adapter, num_msix); + if (err == -ENOMEM) + return err; + + if (adapter->flags & QLCNIC_MSIX_ENABLED) { + num_msix = ahw->num_msix; + } else { + if (qlcnic_sriov_vf_check(adapter)) + return -EINVAL; + num_msix = 1; + adapter->drv_sds_rings = QLCNIC_SINGLE_RING; + adapter->drv_tx_rings = QLCNIC_SINGLE_RING; + } + } + + /* setup interrupt mapping table for fw */ + ahw->intr_tbl = vzalloc(num_msix * + sizeof(struct qlcnic_intrpt_config)); + if (!ahw->intr_tbl) + return -ENOMEM; + + if (!(adapter->flags & QLCNIC_MSIX_ENABLED)) { + if (adapter->ahw->pci_func >= QLC_MAX_LEGACY_FUNC_SUPP) { + dev_err(&adapter->pdev->dev, "PCI function number 8 and higher are not supported with legacy interrupt, func 0x%x\n", + ahw->pci_func); + return -EOPNOTSUPP; + } + + qlcnic_83xx_enable_legacy(adapter); + } + + for (i = 0; i < num_msix; i++) { + if (adapter->flags & QLCNIC_MSIX_ENABLED) + ahw->intr_tbl[i].type = QLCNIC_INTRPT_MSIX; + else + ahw->intr_tbl[i].type = QLCNIC_INTRPT_INTX; + ahw->intr_tbl[i].id = i; + ahw->intr_tbl[i].src = 0; + } + + return 0; +} + +static inline void qlcnic_83xx_clear_legacy_intr_mask(struct qlcnic_adapter *adapter) +{ + writel(0, adapter->tgt_mask_reg); +} + +static inline void qlcnic_83xx_set_legacy_intr_mask(struct qlcnic_adapter *adapter) +{ + if (adapter->tgt_mask_reg) + writel(1, adapter->tgt_mask_reg); +} + +static inline void qlcnic_83xx_enable_legacy_msix_mbx_intr(struct qlcnic_adapter + *adapter) +{ + u32 mask; + + /* Mailbox in MSI-x mode and Legacy Interrupt share the 
same + * source register. We could be here before contexts are created + * and sds_ring->crb_intr_mask has not been initialized, calculate + * BAR offset for Interrupt Source Register + */ + mask = QLCRDX(adapter->ahw, QLCNIC_DEF_INT_MASK); + writel(0, adapter->ahw->pci_base0 + mask); +} + +void qlcnic_83xx_disable_mbx_intr(struct qlcnic_adapter *adapter) +{ + u32 mask; + + mask = QLCRDX(adapter->ahw, QLCNIC_DEF_INT_MASK); + writel(1, adapter->ahw->pci_base0 + mask); + QLCWRX(adapter->ahw, QLCNIC_MBX_INTR_ENBL, 0); +} + +static inline void qlcnic_83xx_get_mbx_data(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + int i; + + if (cmd->op_type == QLC_83XX_MBX_POST_BC_OP) + return; + + for (i = 0; i < cmd->rsp.num; i++) + cmd->rsp.arg[i] = readl(QLCNIC_MBX_FW(adapter->ahw, i)); +} + +irqreturn_t qlcnic_83xx_clear_legacy_intr(struct qlcnic_adapter *adapter) +{ + u32 intr_val; + struct qlcnic_hardware_context *ahw = adapter->ahw; + int retries = 0; + + intr_val = readl(adapter->tgt_status_reg); + + if (!QLC_83XX_VALID_INTX_BIT31(intr_val)) + return IRQ_NONE; + + if (QLC_83XX_INTX_FUNC(intr_val) != adapter->ahw->pci_func) { + adapter->stats.spurious_intr++; + return IRQ_NONE; + } + /* The barrier is required to ensure writes to the registers */ + wmb(); + + /* clear the interrupt trigger control register */ + writel_relaxed(0, adapter->isr_int_vec); + intr_val = readl(adapter->isr_int_vec); + do { + intr_val = readl(adapter->tgt_status_reg); + if (QLC_83XX_INTX_FUNC(intr_val) != ahw->pci_func) + break; + retries++; + } while (QLC_83XX_VALID_INTX_BIT30(intr_val) && + (retries < QLC_83XX_LEGACY_INTX_MAX_RETRY)); + + return IRQ_HANDLED; +} + +static inline void qlcnic_83xx_notify_mbx_response(struct qlcnic_mailbox *mbx) +{ + mbx->rsp_status = QLC_83XX_MBX_RESPONSE_ARRIVED; + complete(&mbx->completion); +} + +static void qlcnic_83xx_poll_process_aen(struct qlcnic_adapter *adapter) +{ + u32 resp, event, rsp_status = QLC_83XX_MBX_RESPONSE_ARRIVED; + struct qlcnic_mailbox *mbx = adapter->ahw->mailbox; + unsigned long flags; + + spin_lock_irqsave(&mbx->aen_lock, flags); + resp = QLCRDX(adapter->ahw, QLCNIC_FW_MBX_CTRL); + if (!(resp & QLCNIC_SET_OWNER)) + goto out; + + event = readl(QLCNIC_MBX_FW(adapter->ahw, 0)); + if (event & QLCNIC_MBX_ASYNC_EVENT) { + __qlcnic_83xx_process_aen(adapter); + } else { + if (mbx->rsp_status != rsp_status) + qlcnic_83xx_notify_mbx_response(mbx); + } +out: + qlcnic_83xx_enable_legacy_msix_mbx_intr(adapter); + spin_unlock_irqrestore(&mbx->aen_lock, flags); +} + +irqreturn_t qlcnic_83xx_intr(int irq, void *data) +{ + struct qlcnic_adapter *adapter = data; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (qlcnic_83xx_clear_legacy_intr(adapter) == IRQ_NONE) + return IRQ_NONE; + + qlcnic_83xx_poll_process_aen(adapter); + + if (ahw->diag_test) { + if (ahw->diag_test == QLCNIC_INTERRUPT_TEST) + ahw->diag_cnt++; + qlcnic_83xx_enable_legacy_msix_mbx_intr(adapter); + return IRQ_HANDLED; + } + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) { + qlcnic_83xx_enable_legacy_msix_mbx_intr(adapter); + } else { + sds_ring = &adapter->recv_ctx->sds_rings[0]; + napi_schedule(&sds_ring->napi); + } + + return IRQ_HANDLED; +} + +irqreturn_t qlcnic_83xx_tmp_intr(int irq, void *data) +{ + struct qlcnic_host_sds_ring *sds_ring = data; + struct qlcnic_adapter *adapter = sds_ring->adapter; + + if (adapter->flags & QLCNIC_MSIX_ENABLED) + goto done; + + if (adapter->nic_ops->clear_legacy_intr(adapter) == IRQ_NONE) + return 
IRQ_NONE; + +done: + adapter->ahw->diag_cnt++; + qlcnic_enable_sds_intr(adapter, sds_ring); + + return IRQ_HANDLED; +} + +void qlcnic_83xx_free_mbx_intr(struct qlcnic_adapter *adapter) +{ + u32 num_msix; + + if (!(adapter->flags & QLCNIC_MSIX_ENABLED)) + qlcnic_83xx_set_legacy_intr_mask(adapter); + + qlcnic_83xx_disable_mbx_intr(adapter); + + if (adapter->flags & QLCNIC_MSIX_ENABLED) + num_msix = adapter->ahw->num_msix - 1; + else + num_msix = 0; + + msleep(20); + + if (adapter->msix_entries) { + synchronize_irq(adapter->msix_entries[num_msix].vector); + free_irq(adapter->msix_entries[num_msix].vector, adapter); + } +} + +int qlcnic_83xx_setup_mbx_intr(struct qlcnic_adapter *adapter) +{ + irq_handler_t handler; + u32 val; + int err = 0; + unsigned long flags = 0; + + if (!(adapter->flags & QLCNIC_MSI_ENABLED) && + !(adapter->flags & QLCNIC_MSIX_ENABLED)) + flags |= IRQF_SHARED; + + if (adapter->flags & QLCNIC_MSIX_ENABLED) { + handler = qlcnic_83xx_handle_aen; + val = adapter->msix_entries[adapter->ahw->num_msix - 1].vector; + err = request_irq(val, handler, flags, "qlcnic-MB", adapter); + if (err) { + dev_err(&adapter->pdev->dev, + "failed to register MBX interrupt\n"); + return err; + } + } else { + handler = qlcnic_83xx_intr; + val = adapter->msix_entries[0].vector; + err = request_irq(val, handler, flags, "qlcnic", adapter); + if (err) { + dev_err(&adapter->pdev->dev, + "failed to register INTx interrupt\n"); + return err; + } + qlcnic_83xx_clear_legacy_intr_mask(adapter); + } + + /* Enable mailbox interrupt */ + qlcnic_83xx_enable_mbx_interrupt(adapter); + + return err; +} + +void qlcnic_83xx_get_func_no(struct qlcnic_adapter *adapter) +{ + u32 val = QLCRDX(adapter->ahw, QLCNIC_INFORMANT); + adapter->ahw->pci_func = (val >> 24) & 0xff; +} + +int qlcnic_83xx_cam_lock(struct qlcnic_adapter *adapter) +{ + void __iomem *addr; + u32 val, limit = 0; + + struct qlcnic_hardware_context *ahw = adapter->ahw; + + addr = ahw->pci_base0 + QLC_83XX_SEM_LOCK_FUNC(ahw->pci_func); + do { + val = readl(addr); + if (val) { + /* write the function number to register */ + QLC_SHARED_REG_WR32(adapter, QLCNIC_FLASH_LOCK_OWNER, + ahw->pci_func); + return 0; + } + usleep_range(1000, 2000); + } while (++limit <= QLCNIC_PCIE_SEM_TIMEOUT); + + return -EIO; +} + +void qlcnic_83xx_cam_unlock(struct qlcnic_adapter *adapter) +{ + void __iomem *addr; + u32 val; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + addr = ahw->pci_base0 + QLC_83XX_SEM_UNLOCK_FUNC(ahw->pci_func); + val = readl(addr); +} + +void qlcnic_83xx_read_crb(struct qlcnic_adapter *adapter, char *buf, + loff_t offset, size_t size) +{ + int ret = 0; + u32 data; + + if (qlcnic_api_lock(adapter)) { + dev_err(&adapter->pdev->dev, + "%s: failed to acquire lock. addr offset 0x%x\n", + __func__, (u32)offset); + return; + } + + data = QLCRD32(adapter, (u32) offset, &ret); + qlcnic_api_unlock(adapter); + + if (ret == -EIO) { + dev_err(&adapter->pdev->dev, + "%s: failed. 
addr offset 0x%x\n", + __func__, (u32)offset); + return; + } + memcpy(buf, &data, size); +} + +void qlcnic_83xx_write_crb(struct qlcnic_adapter *adapter, char *buf, + loff_t offset, size_t size) +{ + u32 data; + + memcpy(&data, buf, size); + qlcnic_83xx_wrt_reg_indirect(adapter, (u32) offset, data); +} + +int qlcnic_83xx_get_port_info(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int status; + + status = qlcnic_83xx_get_port_config(adapter); + if (status) { + dev_err(&adapter->pdev->dev, + "Get Port Info failed\n"); + } else { + + if (ahw->port_config & QLC_83XX_10G_CAPABLE) { + ahw->port_type = QLCNIC_XGBE; + } else if (ahw->port_config & QLC_83XX_10_CAPABLE || + ahw->port_config & QLC_83XX_100_CAPABLE || + ahw->port_config & QLC_83XX_1G_CAPABLE) { + ahw->port_type = QLCNIC_GBE; + } else { + ahw->port_type = QLCNIC_XGBE; + } + + if (QLC_83XX_AUTONEG(ahw->port_config)) + ahw->link_autoneg = AUTONEG_ENABLE; + + } + return status; +} + +static void qlcnic_83xx_set_mac_filter_count(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u16 act_pci_fn = ahw->total_nic_func; + u16 count; + + ahw->max_mc_count = QLC_83XX_MAX_MC_COUNT; + if (act_pci_fn <= 2) + count = (QLC_83XX_MAX_UC_COUNT - QLC_83XX_MAX_MC_COUNT) / + act_pci_fn; + else + count = (QLC_83XX_LB_MAX_FILTERS - QLC_83XX_MAX_MC_COUNT) / + act_pci_fn; + ahw->max_uc_count = count; +} + +void qlcnic_83xx_enable_mbx_interrupt(struct qlcnic_adapter *adapter) +{ + u32 val; + + if (adapter->flags & QLCNIC_MSIX_ENABLED) + val = BIT_2 | ((adapter->ahw->num_msix - 1) << 8); + else + val = BIT_2; + + QLCWRX(adapter->ahw, QLCNIC_MBX_INTR_ENBL, val); + qlcnic_83xx_enable_legacy_msix_mbx_intr(adapter); +} + +void qlcnic_83xx_check_vf(struct qlcnic_adapter *adapter, + const struct pci_device_id *ent) +{ + u32 op_mode, priv_level; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + ahw->fw_hal_version = 2; + qlcnic_get_func_no(adapter); + + if (qlcnic_sriov_vf_check(adapter)) { + qlcnic_sriov_vf_set_ops(adapter); + return; + } + + /* Determine function privilege level */ + op_mode = QLCRDX(adapter->ahw, QLC_83XX_DRV_OP_MODE); + if (op_mode == QLC_83XX_DEFAULT_OPMODE) + priv_level = QLCNIC_MGMT_FUNC; + else + priv_level = QLC_83XX_GET_FUNC_PRIVILEGE(op_mode, + ahw->pci_func); + + if (priv_level == QLCNIC_NON_PRIV_FUNC) { + ahw->op_mode = QLCNIC_NON_PRIV_FUNC; + dev_info(&adapter->pdev->dev, + "HAL Version: %d Non Privileged function\n", + ahw->fw_hal_version); + adapter->nic_ops = &qlcnic_vf_ops; + } else { + if (pci_find_ext_capability(adapter->pdev, + PCI_EXT_CAP_ID_SRIOV)) + set_bit(__QLCNIC_SRIOV_CAPABLE, &adapter->state); + adapter->nic_ops = &qlcnic_83xx_ops; + } +} + +static void qlcnic_83xx_handle_link_aen(struct qlcnic_adapter *adapter, + u32 data[]); +static void qlcnic_83xx_handle_idc_comp_aen(struct qlcnic_adapter *adapter, + u32 data[]); + +void qlcnic_dump_mbx(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + int i; + + if (cmd->op_type == QLC_83XX_MBX_POST_BC_OP) + return; + + dev_info(&adapter->pdev->dev, + "Host MBX regs(%d)\n", cmd->req.num); + for (i = 0; i < cmd->req.num; i++) { + if (i && !(i % 8)) + pr_info("\n"); + pr_info("%08x ", cmd->req.arg[i]); + } + pr_info("\n"); + dev_info(&adapter->pdev->dev, + "FW MBX regs(%d)\n", cmd->rsp.num); + for (i = 0; i < cmd->rsp.num; i++) { + if (i && !(i % 8)) + pr_info("\n"); + pr_info("%08x ", cmd->rsp.arg[i]); + } + pr_info("\n"); +} + +static void 
qlcnic_83xx_poll_for_mbx_completion(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int opcode = LSW(cmd->req.arg[0]); + unsigned long max_loops; + + max_loops = cmd->total_cmds * QLC_83XX_MBX_CMD_LOOP; + + for (; max_loops; max_loops--) { + if (atomic_read(&cmd->rsp_status) == + QLC_83XX_MBX_RESPONSE_ARRIVED) + return; + + udelay(1); + } + + dev_err(&adapter->pdev->dev, + "%s: Mailbox command timed out, cmd_op=0x%x, cmd_type=0x%x, pci_func=0x%x, op_mode=0x%x\n", + __func__, opcode, cmd->type, ahw->pci_func, ahw->op_mode); + flush_workqueue(ahw->mailbox->work_q); + return; +} + +int qlcnic_83xx_issue_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_mailbox *mbx = adapter->ahw->mailbox; + struct qlcnic_hardware_context *ahw = adapter->ahw; + int cmd_type, err, opcode; + unsigned long timeout; + + if (!mbx) + return -EIO; + + opcode = LSW(cmd->req.arg[0]); + cmd_type = cmd->type; + err = mbx->ops->enqueue_cmd(adapter, cmd, &timeout); + if (err) { + dev_err(&adapter->pdev->dev, + "%s: Mailbox not available, cmd_op=0x%x, cmd_context=0x%x, pci_func=0x%x, op_mode=0x%x\n", + __func__, opcode, cmd->type, ahw->pci_func, + ahw->op_mode); + return err; + } + + switch (cmd_type) { + case QLC_83XX_MBX_CMD_WAIT: + if (!wait_for_completion_timeout(&cmd->completion, timeout)) { + dev_err(&adapter->pdev->dev, + "%s: Mailbox command timed out, cmd_op=0x%x, cmd_type=0x%x, pci_func=0x%x, op_mode=0x%x\n", + __func__, opcode, cmd_type, ahw->pci_func, + ahw->op_mode); + flush_workqueue(mbx->work_q); + } + break; + case QLC_83XX_MBX_CMD_NO_WAIT: + return 0; + case QLC_83XX_MBX_CMD_BUSY_WAIT: + qlcnic_83xx_poll_for_mbx_completion(adapter, cmd); + break; + default: + dev_err(&adapter->pdev->dev, + "%s: Invalid mailbox command, cmd_op=0x%x, cmd_type=0x%x, pci_func=0x%x, op_mode=0x%x\n", + __func__, opcode, cmd_type, ahw->pci_func, + ahw->op_mode); + qlcnic_83xx_detach_mailbox_work(adapter); + } + + return cmd->rsp_opcode; +} + +int qlcnic_83xx_alloc_mbx_args(struct qlcnic_cmd_args *mbx, + struct qlcnic_adapter *adapter, u32 type) +{ + int i, size; + u32 temp; + const struct qlcnic_mailbox_metadata *mbx_tbl; + + memset(mbx, 0, sizeof(struct qlcnic_cmd_args)); + mbx_tbl = qlcnic_83xx_mbx_tbl; + size = ARRAY_SIZE(qlcnic_83xx_mbx_tbl); + for (i = 0; i < size; i++) { + if (type == mbx_tbl[i].cmd) { + mbx->op_type = QLC_83XX_FW_MBX_CMD; + mbx->req.num = mbx_tbl[i].in_args; + mbx->rsp.num = mbx_tbl[i].out_args; + mbx->req.arg = kcalloc(mbx->req.num, sizeof(u32), + GFP_ATOMIC); + if (!mbx->req.arg) + return -ENOMEM; + mbx->rsp.arg = kcalloc(mbx->rsp.num, sizeof(u32), + GFP_ATOMIC); + if (!mbx->rsp.arg) { + kfree(mbx->req.arg); + mbx->req.arg = NULL; + return -ENOMEM; + } + temp = adapter->ahw->fw_hal_version << 29; + mbx->req.arg[0] = (type | (mbx->req.num << 16) | temp); + mbx->cmd_op = type; + return 0; + } + } + + dev_err(&adapter->pdev->dev, "%s: Invalid mailbox command opcode 0x%x\n", + __func__, type); + return -EINVAL; +} + +void qlcnic_83xx_idc_aen_work(struct work_struct *work) +{ + struct qlcnic_adapter *adapter; + struct qlcnic_cmd_args cmd; + int i, err = 0; + + adapter = container_of(work, struct qlcnic_adapter, idc_aen_work.work); + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_IDC_ACK); + if (err) + return; + + for (i = 1; i < QLC_83XX_MBX_AEN_CNT; i++) + cmd.req.arg[i] = adapter->ahw->mbox_aen[i]; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_info(&adapter->pdev->dev, + "%s: 
Mailbox IDC ACK failed.\n", __func__); + qlcnic_free_mbx_args(&cmd); +} + +static void qlcnic_83xx_handle_idc_comp_aen(struct qlcnic_adapter *adapter, + u32 data[]) +{ + dev_dbg(&adapter->pdev->dev, "Completion AEN:0x%x.\n", + QLCNIC_MBX_RSP(data[0])); + clear_bit(QLC_83XX_IDC_COMP_AEN, &adapter->ahw->idc.status); + return; +} + +static void __qlcnic_83xx_process_aen(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u32 event[QLC_83XX_MBX_AEN_CNT]; + int i; + + for (i = 0; i < QLC_83XX_MBX_AEN_CNT; i++) + event[i] = readl(QLCNIC_MBX_FW(ahw, i)); + + switch (QLCNIC_MBX_RSP(event[0])) { + + case QLCNIC_MBX_LINK_EVENT: + qlcnic_83xx_handle_link_aen(adapter, event); + break; + case QLCNIC_MBX_COMP_EVENT: + qlcnic_83xx_handle_idc_comp_aen(adapter, event); + break; + case QLCNIC_MBX_REQUEST_EVENT: + for (i = 0; i < QLC_83XX_MBX_AEN_CNT; i++) + adapter->ahw->mbox_aen[i] = QLCNIC_MBX_RSP(event[i]); + queue_delayed_work(adapter->qlcnic_wq, + &adapter->idc_aen_work, 0); + break; + case QLCNIC_MBX_TIME_EXTEND_EVENT: + ahw->extend_lb_time = event[1] >> 8 & 0xf; + break; + case QLCNIC_MBX_BC_EVENT: + qlcnic_sriov_handle_bc_event(adapter, event[1]); + break; + case QLCNIC_MBX_SFP_INSERT_EVENT: + dev_info(&adapter->pdev->dev, "SFP+ Insert AEN:0x%x.\n", + QLCNIC_MBX_RSP(event[0])); + break; + case QLCNIC_MBX_SFP_REMOVE_EVENT: + dev_info(&adapter->pdev->dev, "SFP Removed AEN:0x%x.\n", + QLCNIC_MBX_RSP(event[0])); + break; + case QLCNIC_MBX_DCBX_CONFIG_CHANGE_EVENT: + qlcnic_dcb_aen_handler(adapter->dcb, (void *)&event[1]); + break; + default: + dev_dbg(&adapter->pdev->dev, "Unsupported AEN:0x%x.\n", + QLCNIC_MBX_RSP(event[0])); + break; + } + + QLCWRX(ahw, QLCNIC_FW_MBX_CTRL, QLCNIC_CLR_OWNER); +} + +static void qlcnic_83xx_process_aen(struct qlcnic_adapter *adapter) +{ + u32 resp, event, rsp_status = QLC_83XX_MBX_RESPONSE_ARRIVED; + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_mailbox *mbx = ahw->mailbox; + unsigned long flags; + + spin_lock_irqsave(&mbx->aen_lock, flags); + resp = QLCRDX(ahw, QLCNIC_FW_MBX_CTRL); + if (resp & QLCNIC_SET_OWNER) { + event = readl(QLCNIC_MBX_FW(ahw, 0)); + if (event & QLCNIC_MBX_ASYNC_EVENT) { + __qlcnic_83xx_process_aen(adapter); + } else { + if (mbx->rsp_status != rsp_status) + qlcnic_83xx_notify_mbx_response(mbx); + } + } + spin_unlock_irqrestore(&mbx->aen_lock, flags); +} + +static void qlcnic_83xx_mbx_poll_work(struct work_struct *work) +{ + struct qlcnic_adapter *adapter; + + adapter = container_of(work, struct qlcnic_adapter, mbx_poll_work.work); + + if (!test_bit(__QLCNIC_MBX_POLL_ENABLE, &adapter->state)) + return; + + qlcnic_83xx_process_aen(adapter); + queue_delayed_work(adapter->qlcnic_wq, &adapter->mbx_poll_work, + (HZ / 10)); +} + +void qlcnic_83xx_enable_mbx_poll(struct qlcnic_adapter *adapter) +{ + if (test_and_set_bit(__QLCNIC_MBX_POLL_ENABLE, &adapter->state)) + return; + + INIT_DELAYED_WORK(&adapter->mbx_poll_work, qlcnic_83xx_mbx_poll_work); + queue_delayed_work(adapter->qlcnic_wq, &adapter->mbx_poll_work, 0); +} + +void qlcnic_83xx_disable_mbx_poll(struct qlcnic_adapter *adapter) +{ + if (!test_and_clear_bit(__QLCNIC_MBX_POLL_ENABLE, &adapter->state)) + return; + cancel_delayed_work_sync(&adapter->mbx_poll_work); +} + +static int qlcnic_83xx_add_rings(struct qlcnic_adapter *adapter) +{ + int index, i, err, sds_mbx_size; + u32 *buf, intrpt_id, intr_mask; + u16 context_id; + u8 num_sds; + struct qlcnic_cmd_args cmd; + struct qlcnic_host_sds_ring *sds; + struct qlcnic_sds_mbx sds_mbx; + struct 
qlcnic_add_rings_mbx_out *mbx_out; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + sds_mbx_size = sizeof(struct qlcnic_sds_mbx); + context_id = recv_ctx->context_id; + num_sds = adapter->drv_sds_rings - QLCNIC_MAX_SDS_RINGS; + ahw->hw_ops->alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_ADD_RCV_RINGS); + cmd.req.arg[1] = 0 | (num_sds << 8) | (context_id << 16); + + /* set up status rings, mbx 2-81 */ + index = 2; + for (i = 8; i < adapter->drv_sds_rings; i++) { + memset(&sds_mbx, 0, sds_mbx_size); + sds = &recv_ctx->sds_rings[i]; + sds->consumer = 0; + memset(sds->desc_head, 0, STATUS_DESC_RINGSIZE(sds)); + sds_mbx.phy_addr_low = LSD(sds->phys_addr); + sds_mbx.phy_addr_high = MSD(sds->phys_addr); + sds_mbx.sds_ring_size = sds->num_desc; + + if (adapter->flags & QLCNIC_MSIX_ENABLED) + intrpt_id = ahw->intr_tbl[i].id; + else + intrpt_id = QLCRDX(ahw, QLCNIC_DEF_INT_ID); + + if (adapter->ahw->diag_test != QLCNIC_LOOPBACK_TEST) + sds_mbx.intrpt_id = intrpt_id; + else + sds_mbx.intrpt_id = 0xffff; + sds_mbx.intrpt_val = 0; + buf = &cmd.req.arg[index]; + memcpy(buf, &sds_mbx, sds_mbx_size); + index += sds_mbx_size / sizeof(u32); + } + + /* send the mailbox command */ + err = ahw->hw_ops->mbx_cmd(adapter, &cmd); + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to add rings %d\n", err); + goto out; + } + + mbx_out = (struct qlcnic_add_rings_mbx_out *)&cmd.rsp.arg[1]; + index = 0; + /* status descriptor ring */ + for (i = 8; i < adapter->drv_sds_rings; i++) { + sds = &recv_ctx->sds_rings[i]; + sds->crb_sts_consumer = ahw->pci_base0 + + mbx_out->host_csmr[index]; + if (adapter->flags & QLCNIC_MSIX_ENABLED) + intr_mask = ahw->intr_tbl[i].src; + else + intr_mask = QLCRDX(ahw, QLCNIC_DEF_INT_MASK); + + sds->crb_intr_mask = ahw->pci_base0 + intr_mask; + index++; + } +out: + qlcnic_free_mbx_args(&cmd); + return err; +} + +void qlcnic_83xx_del_rx_ctx(struct qlcnic_adapter *adapter) +{ + int err; + u32 temp = 0; + struct qlcnic_cmd_args cmd; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DESTROY_RX_CTX)) + return; + + if (qlcnic_sriov_pf_check(adapter) || qlcnic_sriov_vf_check(adapter)) + cmd.req.arg[0] |= (0x3 << 29); + + if (qlcnic_sriov_pf_check(adapter)) + qlcnic_pf_set_interface_id_del_rx_ctx(adapter, &temp); + + cmd.req.arg[1] = recv_ctx->context_id | temp; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_err(&adapter->pdev->dev, + "Failed to destroy rx ctx in firmware\n"); + + recv_ctx->state = QLCNIC_HOST_CTX_STATE_FREED; + qlcnic_free_mbx_args(&cmd); +} + +int qlcnic_83xx_create_rx_ctx(struct qlcnic_adapter *adapter) +{ + int i, err, index, sds_mbx_size, rds_mbx_size; + u8 num_sds, num_rds; + u32 *buf, intrpt_id, intr_mask, cap = 0; + struct qlcnic_host_sds_ring *sds; + struct qlcnic_host_rds_ring *rds; + struct qlcnic_sds_mbx sds_mbx; + struct qlcnic_rds_mbx rds_mbx; + struct qlcnic_cmd_args cmd; + struct qlcnic_rcv_mbx_out *mbx_out; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_hardware_context *ahw = adapter->ahw; + num_rds = adapter->max_rds_rings; + + if (adapter->drv_sds_rings <= QLCNIC_MAX_SDS_RINGS) + num_sds = adapter->drv_sds_rings; + else + num_sds = QLCNIC_MAX_SDS_RINGS; + + sds_mbx_size = sizeof(struct qlcnic_sds_mbx); + rds_mbx_size = sizeof(struct qlcnic_rds_mbx); + cap = QLCNIC_CAP0_LEGACY_CONTEXT; + + if (adapter->flags & QLCNIC_FW_LRO_MSS_CAP) + cap |= QLC_83XX_FW_CAP_LRO_MSS; + + /* set mailbox hdr and 
capabilities */ + err = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_CREATE_RX_CTX); + if (err) + return err; + + if (qlcnic_sriov_pf_check(adapter) || qlcnic_sriov_vf_check(adapter)) + cmd.req.arg[0] |= (0x3 << 29); + + cmd.req.arg[1] = cap; + cmd.req.arg[5] = 1 | (num_rds << 5) | (num_sds << 8) | + (QLC_83XX_HOST_RDS_MODE_UNIQUE << 16); + + if (qlcnic_sriov_pf_check(adapter)) + qlcnic_pf_set_interface_id_create_rx_ctx(adapter, + &cmd.req.arg[6]); + /* set up status rings, mbx 8-57/87 */ + index = QLC_83XX_HOST_SDS_MBX_IDX; + for (i = 0; i < num_sds; i++) { + memset(&sds_mbx, 0, sds_mbx_size); + sds = &recv_ctx->sds_rings[i]; + sds->consumer = 0; + memset(sds->desc_head, 0, STATUS_DESC_RINGSIZE(sds)); + sds_mbx.phy_addr_low = LSD(sds->phys_addr); + sds_mbx.phy_addr_high = MSD(sds->phys_addr); + sds_mbx.sds_ring_size = sds->num_desc; + if (adapter->flags & QLCNIC_MSIX_ENABLED) + intrpt_id = ahw->intr_tbl[i].id; + else + intrpt_id = QLCRDX(ahw, QLCNIC_DEF_INT_ID); + if (adapter->ahw->diag_test != QLCNIC_LOOPBACK_TEST) + sds_mbx.intrpt_id = intrpt_id; + else + sds_mbx.intrpt_id = 0xffff; + sds_mbx.intrpt_val = 0; + buf = &cmd.req.arg[index]; + memcpy(buf, &sds_mbx, sds_mbx_size); + index += sds_mbx_size / sizeof(u32); + } + /* set up receive rings, mbx 88-111/135 */ + index = QLCNIC_HOST_RDS_MBX_IDX; + rds = &recv_ctx->rds_rings[0]; + rds->producer = 0; + memset(&rds_mbx, 0, rds_mbx_size); + rds_mbx.phy_addr_reg_low = LSD(rds->phys_addr); + rds_mbx.phy_addr_reg_high = MSD(rds->phys_addr); + rds_mbx.reg_ring_sz = rds->dma_size; + rds_mbx.reg_ring_len = rds->num_desc; + /* Jumbo ring */ + rds = &recv_ctx->rds_rings[1]; + rds->producer = 0; + rds_mbx.phy_addr_jmb_low = LSD(rds->phys_addr); + rds_mbx.phy_addr_jmb_high = MSD(rds->phys_addr); + rds_mbx.jmb_ring_sz = rds->dma_size; + rds_mbx.jmb_ring_len = rds->num_desc; + buf = &cmd.req.arg[index]; + memcpy(buf, &rds_mbx, rds_mbx_size); + + /* send the mailbox command */ + err = ahw->hw_ops->mbx_cmd(adapter, &cmd); + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to create Rx ctx in firmware%d\n", err); + goto out; + } + mbx_out = (struct qlcnic_rcv_mbx_out *)&cmd.rsp.arg[1]; + recv_ctx->context_id = mbx_out->ctx_id; + recv_ctx->state = mbx_out->state; + recv_ctx->virt_port = mbx_out->vport_id; + dev_info(&adapter->pdev->dev, "Rx Context[%d] Created, state:0x%x\n", + recv_ctx->context_id, recv_ctx->state); + /* Receive descriptor ring */ + /* Standard ring */ + rds = &recv_ctx->rds_rings[0]; + rds->crb_rcv_producer = ahw->pci_base0 + + mbx_out->host_prod[0].reg_buf; + /* Jumbo ring */ + rds = &recv_ctx->rds_rings[1]; + rds->crb_rcv_producer = ahw->pci_base0 + + mbx_out->host_prod[0].jmb_buf; + /* status descriptor ring */ + for (i = 0; i < num_sds; i++) { + sds = &recv_ctx->sds_rings[i]; + sds->crb_sts_consumer = ahw->pci_base0 + + mbx_out->host_csmr[i]; + if (adapter->flags & QLCNIC_MSIX_ENABLED) + intr_mask = ahw->intr_tbl[i].src; + else + intr_mask = QLCRDX(ahw, QLCNIC_DEF_INT_MASK); + sds->crb_intr_mask = ahw->pci_base0 + intr_mask; + } + + if (adapter->drv_sds_rings > QLCNIC_MAX_SDS_RINGS) + err = qlcnic_83xx_add_rings(adapter); +out: + qlcnic_free_mbx_args(&cmd); + return err; +} + +void qlcnic_83xx_del_tx_ctx(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + struct qlcnic_cmd_args cmd; + u32 temp = 0; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DESTROY_TX_CTX)) + return; + + if (qlcnic_sriov_pf_check(adapter) || qlcnic_sriov_vf_check(adapter)) + cmd.req.arg[0] |= (0x3 << 29); + + if 
(qlcnic_sriov_pf_check(adapter)) + qlcnic_pf_set_interface_id_del_tx_ctx(adapter, &temp); + + cmd.req.arg[1] = tx_ring->ctx_id | temp; + if (qlcnic_issue_cmd(adapter, &cmd)) + dev_err(&adapter->pdev->dev, + "Failed to destroy tx ctx in firmware\n"); + qlcnic_free_mbx_args(&cmd); +} + +int qlcnic_83xx_create_tx_ctx(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx, int ring) +{ + int err; + u16 msix_id; + u32 *buf, intr_mask, temp = 0; + struct qlcnic_cmd_args cmd; + struct qlcnic_tx_mbx mbx; + struct qlcnic_tx_mbx_out *mbx_out; + struct qlcnic_hardware_context *ahw = adapter->ahw; + u32 msix_vector; + + /* Reset host resources */ + tx->producer = 0; + tx->sw_consumer = 0; + *(tx->hw_consumer) = 0; + + memset(&mbx, 0, sizeof(struct qlcnic_tx_mbx)); + + /* setup mailbox inbox registerss */ + mbx.phys_addr_low = LSD(tx->phys_addr); + mbx.phys_addr_high = MSD(tx->phys_addr); + mbx.cnsmr_index_low = LSD(tx->hw_cons_phys_addr); + mbx.cnsmr_index_high = MSD(tx->hw_cons_phys_addr); + mbx.size = tx->num_desc; + if (adapter->flags & QLCNIC_MSIX_ENABLED) { + if (!(adapter->flags & QLCNIC_TX_INTR_SHARED)) + msix_vector = adapter->drv_sds_rings + ring; + else + msix_vector = adapter->drv_sds_rings - 1; + msix_id = ahw->intr_tbl[msix_vector].id; + } else { + msix_id = QLCRDX(ahw, QLCNIC_DEF_INT_ID); + } + + if (adapter->ahw->diag_test != QLCNIC_LOOPBACK_TEST) + mbx.intr_id = msix_id; + else + mbx.intr_id = 0xffff; + mbx.src = 0; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_CREATE_TX_CTX); + if (err) + return err; + + if (qlcnic_sriov_pf_check(adapter) || qlcnic_sriov_vf_check(adapter)) + cmd.req.arg[0] |= (0x3 << 29); + + if (qlcnic_sriov_pf_check(adapter)) + qlcnic_pf_set_interface_id_create_tx_ctx(adapter, &temp); + + cmd.req.arg[1] = QLCNIC_CAP0_LEGACY_CONTEXT; + cmd.req.arg[5] = QLCNIC_SINGLE_RING | temp; + + buf = &cmd.req.arg[6]; + memcpy(buf, &mbx, sizeof(struct qlcnic_tx_mbx)); + /* send the mailbox command*/ + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + netdev_err(adapter->netdev, + "Failed to create Tx ctx in firmware 0x%x\n", err); + goto out; + } + mbx_out = (struct qlcnic_tx_mbx_out *)&cmd.rsp.arg[2]; + tx->crb_cmd_producer = ahw->pci_base0 + mbx_out->host_prod; + tx->ctx_id = mbx_out->ctx_id; + if ((adapter->flags & QLCNIC_MSIX_ENABLED) && + !(adapter->flags & QLCNIC_TX_INTR_SHARED)) { + intr_mask = ahw->intr_tbl[adapter->drv_sds_rings + ring].src; + tx->crb_intr_mask = ahw->pci_base0 + intr_mask; + } + netdev_info(adapter->netdev, + "Tx Context[0x%x] Created, state:0x%x\n", + tx->ctx_id, mbx_out->state); +out: + qlcnic_free_mbx_args(&cmd); + return err; +} + +static int qlcnic_83xx_diag_alloc_res(struct net_device *netdev, int test, + u8 num_sds_ring) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_rds_ring *rds_ring; + u16 adapter_state = adapter->is_up; + u8 ring; + int ret; + + netif_device_detach(netdev); + + if (netif_running(netdev)) + __qlcnic_down(adapter, netdev); + + qlcnic_detach(adapter); + + adapter->drv_sds_rings = QLCNIC_SINGLE_RING; + adapter->ahw->diag_test = test; + adapter->ahw->linkup = 0; + + ret = qlcnic_attach(adapter); + if (ret) { + netif_device_attach(netdev); + return ret; + } + + ret = qlcnic_fw_create_ctx(adapter); + if (ret) { + qlcnic_detach(adapter); + if (adapter_state == QLCNIC_ADAPTER_UP_MAGIC) { + adapter->drv_sds_rings = num_sds_ring; + qlcnic_attach(adapter); + } + netif_device_attach(netdev); + return ret; + } + + for (ring = 0; ring < 
adapter->max_rds_rings; ring++) { + rds_ring = &adapter->recv_ctx->rds_rings[ring]; + qlcnic_post_rx_buffers(adapter, rds_ring, ring); + } + + if (adapter->ahw->diag_test == QLCNIC_INTERRUPT_TEST) { + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &adapter->recv_ctx->sds_rings[ring]; + qlcnic_enable_sds_intr(adapter, sds_ring); + } + } + + if (adapter->ahw->diag_test == QLCNIC_LOOPBACK_TEST) { + adapter->ahw->loopback_state = 0; + adapter->ahw->hw_ops->setup_link_event(adapter, 1); + } + + set_bit(__QLCNIC_DEV_UP, &adapter->state); + return 0; +} + +static void qlcnic_83xx_diag_free_res(struct net_device *netdev, + u8 drv_sds_rings) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_host_sds_ring *sds_ring; + int ring; + + clear_bit(__QLCNIC_DEV_UP, &adapter->state); + if (adapter->ahw->diag_test == QLCNIC_INTERRUPT_TEST) { + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &adapter->recv_ctx->sds_rings[ring]; + if (adapter->flags & QLCNIC_MSIX_ENABLED) + qlcnic_disable_sds_intr(adapter, sds_ring); + } + } + + qlcnic_fw_destroy_ctx(adapter); + qlcnic_detach(adapter); + + adapter->ahw->diag_test = 0; + adapter->drv_sds_rings = drv_sds_rings; + + if (qlcnic_attach(adapter)) + goto out; + + if (netif_running(netdev)) + __qlcnic_up(adapter, netdev); + +out: + netif_device_attach(netdev); +} + +static void qlcnic_83xx_get_beacon_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_cmd_args cmd; + u8 beacon_state; + int err = 0; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_LED_CONFIG); + if (!err) { + err = qlcnic_issue_cmd(adapter, &cmd); + if (!err) { + beacon_state = cmd.rsp.arg[4]; + if (beacon_state == QLCNIC_BEACON_DISABLE) + ahw->beacon_state = QLC_83XX_BEACON_OFF; + else if (beacon_state == QLC_83XX_ENABLE_BEACON) + ahw->beacon_state = QLC_83XX_BEACON_ON; + } + } else { + netdev_err(adapter->netdev, "Get beacon state failed, err=%d\n", + err); + } + + qlcnic_free_mbx_args(&cmd); + + return; +} + +int qlcnic_83xx_config_led(struct qlcnic_adapter *adapter, u32 state, + u32 beacon) +{ + struct qlcnic_cmd_args cmd; + u32 mbx_in; + int i, status = 0; + + if (state) { + /* Get LED configuration */ + status = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_GET_LED_CONFIG); + if (status) + return status; + + status = qlcnic_issue_cmd(adapter, &cmd); + if (status) { + dev_err(&adapter->pdev->dev, + "Get led config failed.\n"); + goto mbx_err; + } else { + for (i = 0; i < 4; i++) + adapter->ahw->mbox_reg[i] = cmd.rsp.arg[i+1]; + } + qlcnic_free_mbx_args(&cmd); + /* Set LED Configuration */ + mbx_in = (LSW(QLC_83XX_LED_CONFIG) << 16) | + LSW(QLC_83XX_LED_CONFIG); + status = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_SET_LED_CONFIG); + if (status) + return status; + + cmd.req.arg[1] = mbx_in; + cmd.req.arg[2] = mbx_in; + cmd.req.arg[3] = mbx_in; + if (beacon) + cmd.req.arg[4] = QLC_83XX_ENABLE_BEACON; + status = qlcnic_issue_cmd(adapter, &cmd); + if (status) { + dev_err(&adapter->pdev->dev, + "Set led config failed.\n"); + } +mbx_err: + qlcnic_free_mbx_args(&cmd); + return status; + + } else { + /* Restoring default LED configuration */ + status = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_SET_LED_CONFIG); + if (status) + return status; + + cmd.req.arg[1] = adapter->ahw->mbox_reg[0]; + cmd.req.arg[2] = adapter->ahw->mbox_reg[1]; + cmd.req.arg[3] = adapter->ahw->mbox_reg[2]; + if (beacon) + cmd.req.arg[4] = adapter->ahw->mbox_reg[3]; + status = 
qlcnic_issue_cmd(adapter, &cmd); + if (status) + dev_err(&adapter->pdev->dev, + "Restoring led config failed.\n"); + qlcnic_free_mbx_args(&cmd); + return status; + } +} + +int qlcnic_83xx_set_led(struct net_device *netdev, + enum ethtool_phys_id_state state) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int err = -EIO, active = 1; + + if (adapter->ahw->op_mode == QLCNIC_NON_PRIV_FUNC) { + netdev_warn(netdev, + "LED test is not supported in non-privileged mode\n"); + return -EOPNOTSUPP; + } + + switch (state) { + case ETHTOOL_ID_ACTIVE: + if (test_and_set_bit(__QLCNIC_LED_ENABLE, &adapter->state)) + return -EBUSY; + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) + break; + + err = qlcnic_83xx_config_led(adapter, active, 0); + if (err) + netdev_err(netdev, "Failed to set LED blink state\n"); + break; + case ETHTOOL_ID_INACTIVE: + active = 0; + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) + break; + + err = qlcnic_83xx_config_led(adapter, active, 0); + if (err) + netdev_err(netdev, "Failed to reset LED blink state\n"); + break; + + default: + return -EINVAL; + } + + if (!active || err) + clear_bit(__QLCNIC_LED_ENABLE, &adapter->state); + + return err; +} + +void qlcnic_83xx_initialize_nic(struct qlcnic_adapter *adapter, int enable) +{ + struct qlcnic_cmd_args cmd; + int status; + + if (qlcnic_sriov_vf_check(adapter)) + return; + + if (enable) + status = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_INIT_NIC_FUNC); + else + status = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_STOP_NIC_FUNC); + + if (status) + return; + + cmd.req.arg[1] = QLC_REGISTER_LB_IDC | QLC_INIT_FW_RESOURCES; + + if (adapter->dcb) + cmd.req.arg[1] |= QLC_REGISTER_DCB_AEN; + + status = qlcnic_issue_cmd(adapter, &cmd); + if (status) + dev_err(&adapter->pdev->dev, + "Failed to %s in NIC IDC function event.\n", + (enable ? "register" : "unregister")); + + qlcnic_free_mbx_args(&cmd); +} + +static int qlcnic_83xx_set_port_config(struct qlcnic_adapter *adapter) +{ + struct qlcnic_cmd_args cmd; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_SET_PORT_CONFIG); + if (err) + return err; + + cmd.req.arg[1] = adapter->ahw->port_config; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_info(&adapter->pdev->dev, "Set Port Config failed.\n"); + qlcnic_free_mbx_args(&cmd); + return err; +} + +static int qlcnic_83xx_get_port_config(struct qlcnic_adapter *adapter) +{ + struct qlcnic_cmd_args cmd; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_PORT_CONFIG); + if (err) + return err; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_info(&adapter->pdev->dev, "Get Port config failed\n"); + else + adapter->ahw->port_config = cmd.rsp.arg[1]; + qlcnic_free_mbx_args(&cmd); + return err; +} + +int qlcnic_83xx_setup_link_event(struct qlcnic_adapter *adapter, int enable) +{ + int err; + u32 temp; + struct qlcnic_cmd_args cmd; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_LINK_EVENT); + if (err) + return err; + + temp = adapter->recv_ctx->context_id << 16; + cmd.req.arg[1] = (enable ? 
1 : 0) | BIT_8 | temp; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_info(&adapter->pdev->dev, + "Setup linkevent mailbox failed\n"); + qlcnic_free_mbx_args(&cmd); + return err; +} + +static void qlcnic_83xx_set_interface_id_promisc(struct qlcnic_adapter *adapter, + u32 *interface_id) +{ + if (qlcnic_sriov_pf_check(adapter)) { + qlcnic_alloc_lb_filters_mem(adapter); + qlcnic_pf_set_interface_id_promisc(adapter, interface_id); + adapter->rx_mac_learn = true; + } else { + if (!qlcnic_sriov_vf_check(adapter)) + *interface_id = adapter->recv_ctx->context_id << 16; + } +} + +int qlcnic_83xx_nic_set_promisc(struct qlcnic_adapter *adapter, u32 mode) +{ + struct qlcnic_cmd_args *cmd = NULL; + u32 temp = 0; + int err; + + if (adapter->recv_ctx->state == QLCNIC_HOST_CTX_STATE_FREED) + return -EIO; + + cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC); + if (!cmd) + return -ENOMEM; + + err = qlcnic_alloc_mbx_args(cmd, adapter, + QLCNIC_CMD_CONFIGURE_MAC_RX_MODE); + if (err) + goto out; + + cmd->type = QLC_83XX_MBX_CMD_NO_WAIT; + qlcnic_83xx_set_interface_id_promisc(adapter, &temp); + + if (qlcnic_84xx_check(adapter) && qlcnic_sriov_pf_check(adapter)) + mode = VPORT_MISS_MODE_ACCEPT_ALL; + + cmd->req.arg[1] = mode | temp; + err = qlcnic_issue_cmd(adapter, cmd); + if (!err) + return err; + + qlcnic_free_mbx_args(cmd); + +out: + kfree(cmd); + return err; +} + +int qlcnic_83xx_loopback_test(struct net_device *netdev, u8 mode) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_hardware_context *ahw = adapter->ahw; + u8 drv_sds_rings = adapter->drv_sds_rings; + u8 drv_tx_rings = adapter->drv_tx_rings; + int ret = 0, loop = 0; + + if (ahw->op_mode == QLCNIC_NON_PRIV_FUNC) { + netdev_warn(netdev, + "Loopback test not supported in non privileged mode\n"); + return -ENOTSUPP; + } + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) { + netdev_info(netdev, "Device is resetting\n"); + return -EBUSY; + } + + if (qlcnic_get_diag_lock(adapter)) { + netdev_info(netdev, "Device is in diagnostics mode\n"); + return -EBUSY; + } + + netdev_info(netdev, "%s loopback test in progress\n", + mode == QLCNIC_ILB_MODE ? 
"internal" : "external"); + + ret = qlcnic_83xx_diag_alloc_res(netdev, QLCNIC_LOOPBACK_TEST, + drv_sds_rings); + if (ret) + goto fail_diag_alloc; + + ret = qlcnic_83xx_set_lb_mode(adapter, mode); + if (ret) + goto free_diag_res; + + /* Poll for link up event before running traffic */ + do { + msleep(QLC_83XX_LB_MSLEEP_COUNT); + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) { + netdev_info(netdev, + "Device is resetting, free LB test resources\n"); + ret = -EBUSY; + goto free_diag_res; + } + if (loop++ > QLC_83XX_LB_WAIT_COUNT) { + netdev_info(netdev, + "Firmware didn't sent link up event to loopback request\n"); + ret = -ETIMEDOUT; + qlcnic_83xx_clear_lb_mode(adapter, mode); + goto free_diag_res; + } + } while ((adapter->ahw->linkup && ahw->has_link_events) != 1); + + ret = qlcnic_do_lb_test(adapter, mode); + + qlcnic_83xx_clear_lb_mode(adapter, mode); + +free_diag_res: + qlcnic_83xx_diag_free_res(netdev, drv_sds_rings); + +fail_diag_alloc: + adapter->drv_sds_rings = drv_sds_rings; + adapter->drv_tx_rings = drv_tx_rings; + qlcnic_release_diag_lock(adapter); + return ret; +} + +static void qlcnic_extend_lb_idc_cmpltn_wait(struct qlcnic_adapter *adapter, + u32 *max_wait_count) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int temp; + + netdev_info(adapter->netdev, "Received loopback IDC time extend event for 0x%x seconds\n", + ahw->extend_lb_time); + temp = ahw->extend_lb_time * 1000; + *max_wait_count += temp / QLC_83XX_LB_MSLEEP_COUNT; + ahw->extend_lb_time = 0; +} + +static int qlcnic_83xx_set_lb_mode(struct qlcnic_adapter *adapter, u8 mode) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct net_device *netdev = adapter->netdev; + u32 config, max_wait_count; + int status = 0, loop = 0; + + ahw->extend_lb_time = 0; + max_wait_count = QLC_83XX_LB_WAIT_COUNT; + status = qlcnic_83xx_get_port_config(adapter); + if (status) + return status; + + config = ahw->port_config; + + /* Check if port is already in loopback mode */ + if ((config & QLC_83XX_CFG_LOOPBACK_HSS) || + (config & QLC_83XX_CFG_LOOPBACK_EXT)) { + netdev_err(netdev, + "Port already in Loopback mode.\n"); + return -EINPROGRESS; + } + + set_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); + + if (mode == QLCNIC_ILB_MODE) + ahw->port_config |= QLC_83XX_CFG_LOOPBACK_HSS; + if (mode == QLCNIC_ELB_MODE) + ahw->port_config |= QLC_83XX_CFG_LOOPBACK_EXT; + + status = qlcnic_83xx_set_port_config(adapter); + if (status) { + netdev_err(netdev, + "Failed to Set Loopback Mode = 0x%x.\n", + ahw->port_config); + ahw->port_config = config; + clear_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); + return status; + } + + /* Wait for Link and IDC Completion AEN */ + do { + msleep(QLC_83XX_LB_MSLEEP_COUNT); + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) { + netdev_info(netdev, + "Device is resetting, free LB test resources\n"); + clear_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); + return -EBUSY; + } + + if (ahw->extend_lb_time) + qlcnic_extend_lb_idc_cmpltn_wait(adapter, + &max_wait_count); + + if (loop++ > max_wait_count) { + netdev_err(netdev, "%s: Did not receive loopback IDC completion AEN\n", + __func__); + clear_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); + qlcnic_83xx_clear_lb_mode(adapter, mode); + return -ETIMEDOUT; + } + } while (test_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status)); + + qlcnic_sre_macaddr_change(adapter, adapter->mac_addr, 0, + QLCNIC_MAC_ADD); + return status; +} + +static int qlcnic_83xx_clear_lb_mode(struct qlcnic_adapter *adapter, u8 mode) +{ + struct qlcnic_hardware_context *ahw = 
adapter->ahw; + u32 config = ahw->port_config, max_wait_count; + struct net_device *netdev = adapter->netdev; + int status = 0, loop = 0; + + ahw->extend_lb_time = 0; + max_wait_count = QLC_83XX_LB_WAIT_COUNT; + set_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); + if (mode == QLCNIC_ILB_MODE) + ahw->port_config &= ~QLC_83XX_CFG_LOOPBACK_HSS; + if (mode == QLCNIC_ELB_MODE) + ahw->port_config &= ~QLC_83XX_CFG_LOOPBACK_EXT; + + status = qlcnic_83xx_set_port_config(adapter); + if (status) { + netdev_err(netdev, + "Failed to Clear Loopback Mode = 0x%x.\n", + ahw->port_config); + ahw->port_config = config; + clear_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); + return status; + } + + /* Wait for Link and IDC Completion AEN */ + do { + msleep(QLC_83XX_LB_MSLEEP_COUNT); + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) { + netdev_info(netdev, + "Device is resetting, free LB test resources\n"); + clear_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); + return -EBUSY; + } + + if (ahw->extend_lb_time) + qlcnic_extend_lb_idc_cmpltn_wait(adapter, + &max_wait_count); + + if (loop++ > max_wait_count) { + netdev_err(netdev, "%s: Did not receive loopback IDC completion AEN\n", + __func__); + clear_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status); + return -ETIMEDOUT; + } + } while (test_bit(QLC_83XX_IDC_COMP_AEN, &ahw->idc.status)); + + qlcnic_sre_macaddr_change(adapter, adapter->mac_addr, 0, + QLCNIC_MAC_DEL); + return status; +} + +static void qlcnic_83xx_set_interface_id_ipaddr(struct qlcnic_adapter *adapter, + u32 *interface_id) +{ + if (qlcnic_sriov_pf_check(adapter)) { + qlcnic_pf_set_interface_id_ipaddr(adapter, interface_id); + } else { + if (!qlcnic_sriov_vf_check(adapter)) + *interface_id = adapter->recv_ctx->context_id << 16; + } +} + +void qlcnic_83xx_config_ipaddr(struct qlcnic_adapter *adapter, __be32 ip, + int mode) +{ + int err; + u32 temp = 0, temp_ip; + struct qlcnic_cmd_args cmd; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_CONFIGURE_IP_ADDR); + if (err) + return; + + qlcnic_83xx_set_interface_id_ipaddr(adapter, &temp); + + if (mode == QLCNIC_IP_UP) + cmd.req.arg[1] = 1 | temp; + else + cmd.req.arg[1] = 2 | temp; + + /* + * Adapter needs IP address in network byte order. + * But hardware mailbox registers go through writel(), hence IP address + * gets swapped on big endian architecture. + * To negate swapping of writel() on big endian architecture + * use swab32(value). + */ + + temp_ip = swab32(ntohl(ip)); + memcpy(&cmd.req.arg[2], &temp_ip, sizeof(u32)); + err = qlcnic_issue_cmd(adapter, &cmd); + if (err != QLCNIC_RCODE_SUCCESS) + dev_err(&adapter->netdev->dev, + "could not notify %s IP 0x%x request\n", + (mode == QLCNIC_IP_UP) ? "Add" : "Remove", ip); + + qlcnic_free_mbx_args(&cmd); +} + +int qlcnic_83xx_config_hw_lro(struct qlcnic_adapter *adapter, int mode) +{ + int err; + u32 temp, arg1; + struct qlcnic_cmd_args cmd; + int lro_bit_mask; + + lro_bit_mask = (mode ? 
(BIT_0 | BIT_1 | BIT_2 | BIT_3) : 0); + + if (adapter->recv_ctx->state == QLCNIC_HOST_CTX_STATE_FREED) + return 0; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_CONFIGURE_HW_LRO); + if (err) + return err; + + temp = adapter->recv_ctx->context_id << 16; + arg1 = lro_bit_mask | temp; + cmd.req.arg[1] = arg1; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_info(&adapter->pdev->dev, "LRO config failed\n"); + qlcnic_free_mbx_args(&cmd); + + return err; +} + +int qlcnic_83xx_config_rss(struct qlcnic_adapter *adapter, int enable) +{ + int err; + u32 word; + struct qlcnic_cmd_args cmd; + const u64 key[] = { 0xbeac01fa6a42b73bULL, 0x8030f20c77cb2da3ULL, + 0xae7b30b4d0ca2bcbULL, 0x43a38fb04167253dULL, + 0x255b0ec26d5a56daULL }; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_CONFIGURE_RSS); + if (err) + return err; + /* + * RSS request: + * bits 3-0: Rsvd + * 5-4: hash_type_ipv4 + * 7-6: hash_type_ipv6 + * 8: enable + * 9: use indirection table + * 16-31: indirection table mask + */ + word = ((u32)(RSS_HASHTYPE_IP_TCP & 0x3) << 4) | + ((u32)(RSS_HASHTYPE_IP_TCP & 0x3) << 6) | + ((u32)(enable & 0x1) << 8) | + ((0x7ULL) << 16); + cmd.req.arg[1] = (adapter->recv_ctx->context_id); + cmd.req.arg[2] = word; + memcpy(&cmd.req.arg[4], key, sizeof(key)); + + err = qlcnic_issue_cmd(adapter, &cmd); + + if (err) + dev_info(&adapter->pdev->dev, "RSS config failed\n"); + qlcnic_free_mbx_args(&cmd); + + return err; + +} + +static void qlcnic_83xx_set_interface_id_macaddr(struct qlcnic_adapter *adapter, + u32 *interface_id) +{ + if (qlcnic_sriov_pf_check(adapter)) { + qlcnic_pf_set_interface_id_macaddr(adapter, interface_id); + } else { + if (!qlcnic_sriov_vf_check(adapter)) + *interface_id = adapter->recv_ctx->context_id << 16; + } +} + +int qlcnic_83xx_sre_macaddr_change(struct qlcnic_adapter *adapter, u8 *addr, + u16 vlan_id, u8 op) +{ + struct qlcnic_cmd_args *cmd = NULL; + struct qlcnic_macvlan_mbx mv; + u32 *buf, temp = 0; + int err; + + if (adapter->recv_ctx->state == QLCNIC_HOST_CTX_STATE_FREED) + return -EIO; + + cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC); + if (!cmd) + return -ENOMEM; + + err = qlcnic_alloc_mbx_args(cmd, adapter, QLCNIC_CMD_CONFIG_MAC_VLAN); + if (err) + goto out; + + cmd->type = QLC_83XX_MBX_CMD_NO_WAIT; + + if (vlan_id) + op = (op == QLCNIC_MAC_ADD || op == QLCNIC_MAC_VLAN_ADD) ? 
+ QLCNIC_MAC_VLAN_ADD : QLCNIC_MAC_VLAN_DEL; + + cmd->req.arg[1] = op | (1 << 8); + qlcnic_83xx_set_interface_id_macaddr(adapter, &temp); + cmd->req.arg[1] |= temp; + mv.vlan = vlan_id; + mv.mac_addr0 = addr[0]; + mv.mac_addr1 = addr[1]; + mv.mac_addr2 = addr[2]; + mv.mac_addr3 = addr[3]; + mv.mac_addr4 = addr[4]; + mv.mac_addr5 = addr[5]; + buf = &cmd->req.arg[2]; + memcpy(buf, &mv, sizeof(struct qlcnic_macvlan_mbx)); + err = qlcnic_issue_cmd(adapter, cmd); + if (!err) + return err; + + qlcnic_free_mbx_args(cmd); +out: + kfree(cmd); + return err; +} + +void qlcnic_83xx_change_l2_filter(struct qlcnic_adapter *adapter, u64 *addr, + u16 vlan_id) +{ + u8 mac[ETH_ALEN]; + memcpy(&mac, addr, ETH_ALEN); + qlcnic_83xx_sre_macaddr_change(adapter, mac, vlan_id, QLCNIC_MAC_ADD); +} + +static void qlcnic_83xx_configure_mac(struct qlcnic_adapter *adapter, u8 *mac, + u8 type, struct qlcnic_cmd_args *cmd) +{ + switch (type) { + case QLCNIC_SET_STATION_MAC: + case QLCNIC_SET_FAC_DEF_MAC: + memcpy(&cmd->req.arg[2], mac, sizeof(u32)); + memcpy(&cmd->req.arg[3], &mac[4], sizeof(u16)); + break; + } + cmd->req.arg[1] = type; +} + +int qlcnic_83xx_get_mac_address(struct qlcnic_adapter *adapter, u8 *mac, + u8 function) +{ + int err, i; + struct qlcnic_cmd_args cmd; + u32 mac_low, mac_high; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_MAC_ADDRESS); + if (err) + return err; + + qlcnic_83xx_configure_mac(adapter, mac, QLCNIC_GET_CURRENT_MAC, &cmd); + err = qlcnic_issue_cmd(adapter, &cmd); + + if (err == QLCNIC_RCODE_SUCCESS) { + mac_low = cmd.rsp.arg[1]; + mac_high = cmd.rsp.arg[2]; + + for (i = 0; i < 2; i++) + mac[i] = (u8) (mac_high >> ((1 - i) * 8)); + for (i = 2; i < 6; i++) + mac[i] = (u8) (mac_low >> ((5 - i) * 8)); + } else { + dev_err(&adapter->pdev->dev, "Failed to get mac address%d\n", + err); + err = -EIO; + } + qlcnic_free_mbx_args(&cmd); + return err; +} + +static int qlcnic_83xx_set_rx_intr_coal(struct qlcnic_adapter *adapter) +{ + struct qlcnic_nic_intr_coalesce *coal = &adapter->ahw->coal; + struct qlcnic_cmd_args cmd; + u16 temp; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_CONFIG_INTR_COAL); + if (err) + return err; + + temp = adapter->recv_ctx->context_id; + cmd.req.arg[1] = QLCNIC_INTR_COAL_TYPE_RX | temp << 16; + temp = coal->rx_time_us; + cmd.req.arg[2] = coal->rx_packets | temp << 16; + cmd.req.arg[3] = coal->flag; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err != QLCNIC_RCODE_SUCCESS) + netdev_err(adapter->netdev, + "failed to set interrupt coalescing parameters\n"); + + qlcnic_free_mbx_args(&cmd); + + return err; +} + +static int qlcnic_83xx_set_tx_intr_coal(struct qlcnic_adapter *adapter) +{ + struct qlcnic_nic_intr_coalesce *coal = &adapter->ahw->coal; + struct qlcnic_cmd_args cmd; + u16 temp; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_CONFIG_INTR_COAL); + if (err) + return err; + + temp = adapter->tx_ring->ctx_id; + cmd.req.arg[1] = QLCNIC_INTR_COAL_TYPE_TX | temp << 16; + temp = coal->tx_time_us; + cmd.req.arg[2] = coal->tx_packets | temp << 16; + cmd.req.arg[3] = coal->flag; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err != QLCNIC_RCODE_SUCCESS) + netdev_err(adapter->netdev, + "failed to set interrupt coalescing parameters\n"); + + qlcnic_free_mbx_args(&cmd); + + return err; +} + +int qlcnic_83xx_set_rx_tx_intr_coal(struct qlcnic_adapter *adapter) +{ + int err = 0; + + err = qlcnic_83xx_set_rx_intr_coal(adapter); + if (err) + netdev_err(adapter->netdev, + "failed to set Rx coalescing parameters\n"); + + err = 
qlcnic_83xx_set_tx_intr_coal(adapter); + if (err) + netdev_err(adapter->netdev, + "failed to set Tx coalescing parameters\n"); + + return err; +} + +int qlcnic_83xx_config_intr_coal(struct qlcnic_adapter *adapter, + struct ethtool_coalesce *ethcoal) +{ + struct qlcnic_nic_intr_coalesce *coal = &adapter->ahw->coal; + u32 rx_coalesce_usecs, rx_max_frames; + u32 tx_coalesce_usecs, tx_max_frames; + int err; + + if (adapter->recv_ctx->state == QLCNIC_HOST_CTX_STATE_FREED) + return -EIO; + + tx_coalesce_usecs = ethcoal->tx_coalesce_usecs; + tx_max_frames = ethcoal->tx_max_coalesced_frames; + rx_coalesce_usecs = ethcoal->rx_coalesce_usecs; + rx_max_frames = ethcoal->rx_max_coalesced_frames; + coal->flag = QLCNIC_INTR_DEFAULT; + + if ((coal->rx_time_us == rx_coalesce_usecs) && + (coal->rx_packets == rx_max_frames)) { + coal->type = QLCNIC_INTR_COAL_TYPE_TX; + coal->tx_time_us = tx_coalesce_usecs; + coal->tx_packets = tx_max_frames; + } else if ((coal->tx_time_us == tx_coalesce_usecs) && + (coal->tx_packets == tx_max_frames)) { + coal->type = QLCNIC_INTR_COAL_TYPE_RX; + coal->rx_time_us = rx_coalesce_usecs; + coal->rx_packets = rx_max_frames; + } else { + coal->type = QLCNIC_INTR_COAL_TYPE_RX_TX; + coal->rx_time_us = rx_coalesce_usecs; + coal->rx_packets = rx_max_frames; + coal->tx_time_us = tx_coalesce_usecs; + coal->tx_packets = tx_max_frames; + } + + switch (coal->type) { + case QLCNIC_INTR_COAL_TYPE_RX: + err = qlcnic_83xx_set_rx_intr_coal(adapter); + break; + case QLCNIC_INTR_COAL_TYPE_TX: + err = qlcnic_83xx_set_tx_intr_coal(adapter); + break; + case QLCNIC_INTR_COAL_TYPE_RX_TX: + err = qlcnic_83xx_set_rx_tx_intr_coal(adapter); + break; + default: + err = -EINVAL; + netdev_err(adapter->netdev, + "Invalid Interrupt coalescing type\n"); + break; + } + + return err; +} + +static void qlcnic_83xx_handle_link_aen(struct qlcnic_adapter *adapter, + u32 data[]) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u8 link_status, duplex; + /* link speed */ + link_status = LSB(data[3]) & 1; + if (link_status) { + ahw->link_speed = MSW(data[2]); + duplex = LSB(MSW(data[3])); + if (duplex) + ahw->link_duplex = DUPLEX_FULL; + else + ahw->link_duplex = DUPLEX_HALF; + } else { + ahw->link_speed = SPEED_UNKNOWN; + ahw->link_duplex = DUPLEX_UNKNOWN; + } + + ahw->link_autoneg = MSB(MSW(data[3])); + ahw->module_type = MSB(LSW(data[3])); + ahw->has_link_events = 1; + ahw->lb_mode = data[4] & QLCNIC_LB_MODE_MASK; + qlcnic_advert_link_change(adapter, link_status); +} + +static irqreturn_t qlcnic_83xx_handle_aen(int irq, void *data) +{ + u32 mask, resp, event, rsp_status = QLC_83XX_MBX_RESPONSE_ARRIVED; + struct qlcnic_adapter *adapter = data; + struct qlcnic_mailbox *mbx; + unsigned long flags; + + mbx = adapter->ahw->mailbox; + spin_lock_irqsave(&mbx->aen_lock, flags); + resp = QLCRDX(adapter->ahw, QLCNIC_FW_MBX_CTRL); + if (!(resp & QLCNIC_SET_OWNER)) + goto out; + + event = readl(QLCNIC_MBX_FW(adapter->ahw, 0)); + if (event & QLCNIC_MBX_ASYNC_EVENT) { + __qlcnic_83xx_process_aen(adapter); + } else { + if (mbx->rsp_status != rsp_status) + qlcnic_83xx_notify_mbx_response(mbx); + else + adapter->stats.mbx_spurious_intr++; + } + +out: + mask = QLCRDX(adapter->ahw, QLCNIC_DEF_INT_MASK); + writel(0, adapter->ahw->pci_base0 + mask); + spin_unlock_irqrestore(&mbx->aen_lock, flags); + return IRQ_HANDLED; +} + +int qlcnic_83xx_set_nic_info(struct qlcnic_adapter *adapter, + struct qlcnic_info *nic) +{ + int i, err = -EIO; + struct qlcnic_cmd_args cmd; + + if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) { + 
dev_err(&adapter->pdev->dev, + "%s: Error, invoked by non management func\n", + __func__); + return err; + } + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_SET_NIC_INFO); + if (err) + return err; + + cmd.req.arg[1] = (nic->pci_func << 16); + cmd.req.arg[2] = 0x1 << 16; + cmd.req.arg[3] = nic->phys_port | (nic->switch_mode << 16); + cmd.req.arg[4] = nic->capabilities; + cmd.req.arg[5] = (nic->max_mac_filters & 0xFF) | ((nic->max_mtu) << 16); + cmd.req.arg[6] = (nic->max_tx_ques) | ((nic->max_rx_ques) << 16); + cmd.req.arg[7] = (nic->min_tx_bw) | ((nic->max_tx_bw) << 16); + for (i = 8; i < 32; i++) + cmd.req.arg[i] = 0; + + err = qlcnic_issue_cmd(adapter, &cmd); + + if (err != QLCNIC_RCODE_SUCCESS) { + dev_err(&adapter->pdev->dev, "Failed to set nic info%d\n", + err); + err = -EIO; + } + + qlcnic_free_mbx_args(&cmd); + + return err; +} + +int qlcnic_83xx_get_nic_info(struct qlcnic_adapter *adapter, + struct qlcnic_info *npar_info, u8 func_id) +{ + int err; + u32 temp; + u8 op = 0; + struct qlcnic_cmd_args cmd; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_NIC_INFO); + if (err) + return err; + + if (func_id != ahw->pci_func) { + temp = func_id << 16; + cmd.req.arg[1] = op | BIT_31 | temp; + } else { + cmd.req.arg[1] = ahw->pci_func << 16; + } + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_info(&adapter->pdev->dev, + "Failed to get nic info %d\n", err); + goto out; + } + + npar_info->op_type = cmd.rsp.arg[1]; + npar_info->pci_func = cmd.rsp.arg[2] & 0xFFFF; + npar_info->op_mode = (cmd.rsp.arg[2] & 0xFFFF0000) >> 16; + npar_info->phys_port = cmd.rsp.arg[3] & 0xFFFF; + npar_info->switch_mode = (cmd.rsp.arg[3] & 0xFFFF0000) >> 16; + npar_info->capabilities = cmd.rsp.arg[4]; + npar_info->max_mac_filters = cmd.rsp.arg[5] & 0xFF; + npar_info->max_mtu = (cmd.rsp.arg[5] & 0xFFFF0000) >> 16; + npar_info->max_tx_ques = cmd.rsp.arg[6] & 0xFFFF; + npar_info->max_rx_ques = (cmd.rsp.arg[6] & 0xFFFF0000) >> 16; + npar_info->min_tx_bw = cmd.rsp.arg[7] & 0xFFFF; + npar_info->max_tx_bw = (cmd.rsp.arg[7] & 0xFFFF0000) >> 16; + if (cmd.rsp.arg[8] & 0x1) + npar_info->max_bw_reg_offset = (cmd.rsp.arg[8] & 0x7FFE) >> 1; + if (cmd.rsp.arg[8] & 0x10000) { + temp = (cmd.rsp.arg[8] & 0x7FFE0000) >> 17; + npar_info->max_linkspeed_reg_offset = temp; + } + + memcpy(ahw->extra_capability, &cmd.rsp.arg[16], + sizeof(ahw->extra_capability)); + +out: + qlcnic_free_mbx_args(&cmd); + return err; +} + +int qlcnic_get_pci_func_type(struct qlcnic_adapter *adapter, u16 type, + u16 *nic, u16 *fcoe, u16 *iscsi) +{ + struct device *dev = &adapter->pdev->dev; + int err = 0; + + switch (type) { + case QLCNIC_TYPE_NIC: + (*nic)++; + break; + case QLCNIC_TYPE_FCOE: + (*fcoe)++; + break; + case QLCNIC_TYPE_ISCSI: + (*iscsi)++; + break; + default: + dev_err(dev, "%s: Unknown PCI type[%x]\n", + __func__, type); + err = -EIO; + } + + return err; +} + +int qlcnic_83xx_get_pci_info(struct qlcnic_adapter *adapter, + struct qlcnic_pci_info *pci_info) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct device *dev = &adapter->pdev->dev; + u16 nic = 0, fcoe = 0, iscsi = 0; + struct qlcnic_cmd_args cmd; + int i, err = 0, j = 0; + u32 temp; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_PCI_INFO); + if (err) + return err; + + err = qlcnic_issue_cmd(adapter, &cmd); + + ahw->total_nic_func = 0; + if (err == QLCNIC_RCODE_SUCCESS) { + ahw->max_pci_func = cmd.rsp.arg[1] & 0xFF; + for (i = 2, j = 0; j < ahw->max_vnic_func; j++, 
pci_info++) { + pci_info->id = cmd.rsp.arg[i] & 0xFFFF; + pci_info->active = (cmd.rsp.arg[i] & 0xFFFF0000) >> 16; + i++; + if (!pci_info->active) { + i += QLC_SKIP_INACTIVE_PCI_REGS; + continue; + } + pci_info->type = cmd.rsp.arg[i] & 0xFFFF; + err = qlcnic_get_pci_func_type(adapter, pci_info->type, + &nic, &fcoe, &iscsi); + temp = (cmd.rsp.arg[i] & 0xFFFF0000) >> 16; + pci_info->default_port = temp; + i++; + pci_info->tx_min_bw = cmd.rsp.arg[i] & 0xFFFF; + temp = (cmd.rsp.arg[i] & 0xFFFF0000) >> 16; + pci_info->tx_max_bw = temp; + i = i + 2; + memcpy(pci_info->mac, &cmd.rsp.arg[i], ETH_ALEN - 2); + i++; + memcpy(pci_info->mac + sizeof(u32), &cmd.rsp.arg[i], 2); + i = i + 3; + } + } else { + dev_err(dev, "Failed to get PCI Info, error = %d\n", err); + err = -EIO; + } + + ahw->total_nic_func = nic; + ahw->total_pci_func = nic + fcoe + iscsi; + if (ahw->total_nic_func == 0 || ahw->total_pci_func == 0) { + dev_err(dev, "%s: Invalid function count: total nic func[%x], total pci func[%x]\n", + __func__, ahw->total_nic_func, ahw->total_pci_func); + err = -EIO; + } + qlcnic_free_mbx_args(&cmd); + + return err; +} + +int qlcnic_83xx_config_intrpt(struct qlcnic_adapter *adapter, bool op_type) +{ + int i, index, err; + u8 max_ints; + u32 val, temp, type; + struct qlcnic_cmd_args cmd; + + max_ints = adapter->ahw->num_msix - 1; + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_CONFIG_INTRPT); + if (err) + return err; + + cmd.req.arg[1] = max_ints; + + if (qlcnic_sriov_vf_check(adapter)) + cmd.req.arg[1] |= (adapter->ahw->pci_func << 8) | BIT_16; + + for (i = 0, index = 2; i < max_ints; i++) { + type = op_type ? QLCNIC_INTRPT_ADD : QLCNIC_INTRPT_DEL; + val = type | (adapter->ahw->intr_tbl[i].type << 4); + if (adapter->ahw->intr_tbl[i].type == QLCNIC_INTRPT_MSIX) + val |= (adapter->ahw->intr_tbl[i].id << 16); + cmd.req.arg[index++] = val; + } + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to configure interrupts 0x%x\n", err); + goto out; + } + + max_ints = cmd.rsp.arg[1]; + for (i = 0, index = 2; i < max_ints; i++, index += 2) { + val = cmd.rsp.arg[index]; + if (LSB(val)) { + dev_info(&adapter->pdev->dev, + "Can't configure interrupt %d\n", + adapter->ahw->intr_tbl[i].id); + continue; + } + if (op_type) { + adapter->ahw->intr_tbl[i].id = MSW(val); + adapter->ahw->intr_tbl[i].enabled = 1; + temp = cmd.rsp.arg[index + 1]; + adapter->ahw->intr_tbl[i].src = temp; + } else { + adapter->ahw->intr_tbl[i].id = i; + adapter->ahw->intr_tbl[i].enabled = 0; + adapter->ahw->intr_tbl[i].src = 0; + } + } +out: + qlcnic_free_mbx_args(&cmd); + return err; +} + +int qlcnic_83xx_lock_flash(struct qlcnic_adapter *adapter) +{ + int id, timeout = 0; + u32 status = 0; + + while (status == 0) { + status = QLC_SHARED_REG_RD32(adapter, QLCNIC_FLASH_LOCK); + if (status) + break; + + if (++timeout >= QLC_83XX_FLASH_LOCK_TIMEOUT) { + id = QLC_SHARED_REG_RD32(adapter, + QLCNIC_FLASH_LOCK_OWNER); + dev_err(&adapter->pdev->dev, + "%s: failed, lock held by %d\n", __func__, id); + return -EIO; + } + usleep_range(1000, 2000); + } + + QLC_SHARED_REG_WR32(adapter, QLCNIC_FLASH_LOCK_OWNER, adapter->portnum); + return 0; +} + +void qlcnic_83xx_unlock_flash(struct qlcnic_adapter *adapter) +{ + QLC_SHARED_REG_RD32(adapter, QLCNIC_FLASH_UNLOCK); + QLC_SHARED_REG_WR32(adapter, QLCNIC_FLASH_LOCK_OWNER, 0xFF); +} + +int qlcnic_83xx_lockless_flash_read32(struct qlcnic_adapter *adapter, + u32 flash_addr, u8 *p_data, + int count) +{ + u32 word, range, flash_offset, addr = flash_addr, ret; + ulong 
indirect_add, direct_window; + int i, err = 0; + + flash_offset = addr & (QLCNIC_FLASH_SECTOR_SIZE - 1); + if (addr & 0x3) { + dev_err(&adapter->pdev->dev, "Illegal addr = 0x%x\n", addr); + return -EIO; + } + + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_DIRECT_WINDOW, + (addr & 0xFFFF0000)); + + range = flash_offset + (count * sizeof(u32)); + /* Check if data is spread across multiple sectors */ + if (range > (QLCNIC_FLASH_SECTOR_SIZE - 1)) { + + /* Multi sector read */ + for (i = 0; i < count; i++) { + indirect_add = QLC_83XX_FLASH_DIRECT_DATA(addr); + ret = QLCRD32(adapter, indirect_add, &err); + if (err == -EIO) + return err; + + word = ret; + *(u32 *)p_data = word; + p_data = p_data + 4; + addr = addr + 4; + flash_offset = flash_offset + 4; + + if (flash_offset > (QLCNIC_FLASH_SECTOR_SIZE - 1)) { + direct_window = QLC_83XX_FLASH_DIRECT_WINDOW; + /* This write is needed once for each sector */ + qlcnic_83xx_wrt_reg_indirect(adapter, + direct_window, + (addr)); + flash_offset = 0; + } + } + } else { + /* Single sector read */ + for (i = 0; i < count; i++) { + indirect_add = QLC_83XX_FLASH_DIRECT_DATA(addr); + ret = QLCRD32(adapter, indirect_add, &err); + if (err == -EIO) + return err; + + word = ret; + *(u32 *)p_data = word; + p_data = p_data + 4; + addr = addr + 4; + } + } + + return 0; +} + +static int qlcnic_83xx_poll_flash_status_reg(struct qlcnic_adapter *adapter) +{ + u32 status; + int retries = QLC_83XX_FLASH_READ_RETRY_COUNT; + int err = 0; + + do { + status = QLCRD32(adapter, QLC_83XX_FLASH_STATUS, &err); + if (err == -EIO) + return err; + + if ((status & QLC_83XX_FLASH_STATUS_READY) == + QLC_83XX_FLASH_STATUS_READY) + break; + + usleep_range(1000, 1100); + } while (--retries); + + if (!retries) + return -EIO; + + return 0; +} + +int qlcnic_83xx_enable_flash_write(struct qlcnic_adapter *adapter) +{ + int ret; + u32 cmd; + cmd = adapter->ahw->fdt.write_statusreg_cmd; + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_ADDR, + (QLC_83XX_FLASH_FDT_WRITE_DEF_SIG | cmd)); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_WRDATA, + adapter->ahw->fdt.write_enable_bits); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_CONTROL, + QLC_83XX_FLASH_SECOND_ERASE_MS_VAL); + ret = qlcnic_83xx_poll_flash_status_reg(adapter); + if (ret) + return -EIO; + + return 0; +} + +int qlcnic_83xx_disable_flash_write(struct qlcnic_adapter *adapter) +{ + int ret; + + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_ADDR, + (QLC_83XX_FLASH_FDT_WRITE_DEF_SIG | + adapter->ahw->fdt.write_statusreg_cmd)); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_WRDATA, + adapter->ahw->fdt.write_disable_bits); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_CONTROL, + QLC_83XX_FLASH_SECOND_ERASE_MS_VAL); + ret = qlcnic_83xx_poll_flash_status_reg(adapter); + if (ret) + return -EIO; + + return 0; +} + +int qlcnic_83xx_read_flash_mfg_id(struct qlcnic_adapter *adapter) +{ + int ret, err = 0; + u32 mfg_id; + + if (qlcnic_83xx_lock_flash(adapter)) + return -EIO; + + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_ADDR, + QLC_83XX_FLASH_FDT_READ_MFG_ID_VAL); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_CONTROL, + QLC_83XX_FLASH_READ_CTRL); + ret = qlcnic_83xx_poll_flash_status_reg(adapter); + if (ret) { + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + + mfg_id = QLCRD32(adapter, QLC_83XX_FLASH_RDDATA, &err); + if (err == -EIO) { + qlcnic_83xx_unlock_flash(adapter); + return err; + } + + adapter->flash_mfg_id = (mfg_id & 0xFF); + qlcnic_83xx_unlock_flash(adapter); 
+ + return 0; +} + +int qlcnic_83xx_read_flash_descriptor_table(struct qlcnic_adapter *adapter) +{ + int count, fdt_size, ret = 0; + + fdt_size = sizeof(struct qlcnic_fdt); + count = fdt_size / sizeof(u32); + + if (qlcnic_83xx_lock_flash(adapter)) + return -EIO; + + memset(&adapter->ahw->fdt, 0, fdt_size); + ret = qlcnic_83xx_lockless_flash_read32(adapter, QLCNIC_FDT_LOCATION, + (u8 *)&adapter->ahw->fdt, + count); + qlcnic_swap32_buffer((u32 *)&adapter->ahw->fdt, count); + qlcnic_83xx_unlock_flash(adapter); + return ret; +} + +int qlcnic_83xx_erase_flash_sector(struct qlcnic_adapter *adapter, + u32 sector_start_addr) +{ + u32 reversed_addr, addr1, addr2, cmd; + int ret = -EIO; + + if (qlcnic_83xx_lock_flash(adapter) != 0) + return -EIO; + + if (adapter->ahw->fdt.mfg_id == adapter->flash_mfg_id) { + ret = qlcnic_83xx_enable_flash_write(adapter); + if (ret) { + qlcnic_83xx_unlock_flash(adapter); + dev_err(&adapter->pdev->dev, + "%s failed at %d\n", + __func__, __LINE__); + return ret; + } + } + + ret = qlcnic_83xx_poll_flash_status_reg(adapter); + if (ret) { + qlcnic_83xx_unlock_flash(adapter); + dev_err(&adapter->pdev->dev, + "%s: failed at %d\n", __func__, __LINE__); + return -EIO; + } + + addr1 = (sector_start_addr & 0xFF) << 16; + addr2 = (sector_start_addr & 0xFF0000) >> 16; + reversed_addr = addr1 | addr2 | (sector_start_addr & 0xFF00); + + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_WRDATA, + reversed_addr); + cmd = QLC_83XX_FLASH_FDT_ERASE_DEF_SIG | adapter->ahw->fdt.erase_cmd; + if (adapter->ahw->fdt.mfg_id == adapter->flash_mfg_id) + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_ADDR, cmd); + else + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_ADDR, + QLC_83XX_FLASH_OEM_ERASE_SIG); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_CONTROL, + QLC_83XX_FLASH_LAST_ERASE_MS_VAL); + + ret = qlcnic_83xx_poll_flash_status_reg(adapter); + if (ret) { + qlcnic_83xx_unlock_flash(adapter); + dev_err(&adapter->pdev->dev, + "%s: failed at %d\n", __func__, __LINE__); + return -EIO; + } + + if (adapter->ahw->fdt.mfg_id == adapter->flash_mfg_id) { + ret = qlcnic_83xx_disable_flash_write(adapter); + if (ret) { + qlcnic_83xx_unlock_flash(adapter); + dev_err(&adapter->pdev->dev, + "%s: failed at %d\n", __func__, __LINE__); + return ret; + } + } + + qlcnic_83xx_unlock_flash(adapter); + + return 0; +} + +int qlcnic_83xx_flash_write32(struct qlcnic_adapter *adapter, u32 addr, + u32 *p_data) +{ + int ret = -EIO; + u32 addr1 = 0x00800000 | (addr >> 2); + + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_ADDR, addr1); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_WRDATA, *p_data); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_CONTROL, + QLC_83XX_FLASH_LAST_ERASE_MS_VAL); + ret = qlcnic_83xx_poll_flash_status_reg(adapter); + if (ret) { + dev_err(&adapter->pdev->dev, + "%s: failed at %d\n", __func__, __LINE__); + return -EIO; + } + + return 0; +} + +int qlcnic_83xx_flash_bulk_write(struct qlcnic_adapter *adapter, u32 addr, + u32 *p_data, int count) +{ + u32 temp; + int ret = -EIO, err = 0; + + if ((count < QLC_83XX_FLASH_WRITE_MIN) || + (count > QLC_83XX_FLASH_WRITE_MAX)) { + dev_err(&adapter->pdev->dev, + "%s: Invalid word count\n", __func__); + return -EIO; + } + + temp = QLCRD32(adapter, QLC_83XX_FLASH_SPI_CONTROL, &err); + if (err == -EIO) + return err; + + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_SPI_CONTROL, + (temp | QLC_83XX_FLASH_SPI_CTRL)); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_ADDR, + QLC_83XX_FLASH_ADDR_TEMP_VAL); + + 
/* First DWORD write */ + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_WRDATA, *p_data++); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_CONTROL, + QLC_83XX_FLASH_FIRST_MS_PATTERN); + ret = qlcnic_83xx_poll_flash_status_reg(adapter); + if (ret) { + dev_err(&adapter->pdev->dev, + "%s: failed at %d\n", __func__, __LINE__); + return -EIO; + } + + count--; + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_ADDR, + QLC_83XX_FLASH_ADDR_SECOND_TEMP_VAL); + /* Second to N-1 DWORD writes */ + while (count != 1) { + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_WRDATA, + *p_data++); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_CONTROL, + QLC_83XX_FLASH_SECOND_MS_PATTERN); + ret = qlcnic_83xx_poll_flash_status_reg(adapter); + if (ret) { + dev_err(&adapter->pdev->dev, + "%s: failed at %d\n", __func__, __LINE__); + return -EIO; + } + count--; + } + + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_ADDR, + QLC_83XX_FLASH_ADDR_TEMP_VAL | + (addr >> 2)); + /* Last DWORD write */ + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_WRDATA, *p_data++); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_CONTROL, + QLC_83XX_FLASH_LAST_MS_PATTERN); + ret = qlcnic_83xx_poll_flash_status_reg(adapter); + if (ret) { + dev_err(&adapter->pdev->dev, + "%s: failed at %d\n", __func__, __LINE__); + return -EIO; + } + + ret = QLCRD32(adapter, QLC_83XX_FLASH_SPI_STATUS, &err); + if (err == -EIO) + return err; + + if ((ret & QLC_83XX_FLASH_SPI_CTRL) == QLC_83XX_FLASH_SPI_CTRL) { + dev_err(&adapter->pdev->dev, "%s: failed at %d\n", + __func__, __LINE__); + /* Operation failed, clear error bit */ + temp = QLCRD32(adapter, QLC_83XX_FLASH_SPI_CONTROL, &err); + if (err == -EIO) + return err; + + qlcnic_83xx_wrt_reg_indirect(adapter, + QLC_83XX_FLASH_SPI_CONTROL, + (temp | QLC_83XX_FLASH_SPI_CTRL)); + } + + return 0; +} + +static void qlcnic_83xx_recover_driver_lock(struct qlcnic_adapter *adapter) +{ + u32 val, id; + + val = QLCRDX(adapter->ahw, QLC_83XX_RECOVER_DRV_LOCK); + + /* Check if recovery need to be performed by the calling function */ + if ((val & QLC_83XX_DRV_LOCK_RECOVERY_STATUS_MASK) == 0) { + val = val & ~0x3F; + val = val | ((adapter->portnum << 2) | + QLC_83XX_NEED_DRV_LOCK_RECOVERY); + QLCWRX(adapter->ahw, QLC_83XX_RECOVER_DRV_LOCK, val); + dev_info(&adapter->pdev->dev, + "%s: lock recovery initiated\n", __func__); + msleep(QLC_83XX_DRV_LOCK_RECOVERY_DELAY); + val = QLCRDX(adapter->ahw, QLC_83XX_RECOVER_DRV_LOCK); + id = ((val >> 2) & 0xF); + if (id == adapter->portnum) { + val = val & ~QLC_83XX_DRV_LOCK_RECOVERY_STATUS_MASK; + val = val | QLC_83XX_DRV_LOCK_RECOVERY_IN_PROGRESS; + QLCWRX(adapter->ahw, QLC_83XX_RECOVER_DRV_LOCK, val); + /* Force release the lock */ + QLCRDX(adapter->ahw, QLC_83XX_DRV_UNLOCK); + /* Clear recovery bits */ + val = val & ~0x3F; + QLCWRX(adapter->ahw, QLC_83XX_RECOVER_DRV_LOCK, val); + dev_info(&adapter->pdev->dev, + "%s: lock recovery completed\n", __func__); + } else { + dev_info(&adapter->pdev->dev, + "%s: func %d to resume lock recovery process\n", + __func__, id); + } + } else { + dev_info(&adapter->pdev->dev, + "%s: lock recovery initiated by other functions\n", + __func__); + } +} + +int qlcnic_83xx_lock_driver(struct qlcnic_adapter *adapter) +{ + u32 lock_alive_counter, val, id, i = 0, status = 0, temp = 0; + int max_attempt = 0; + + while (status == 0) { + status = QLCRDX(adapter->ahw, QLC_83XX_DRV_LOCK); + if (status) + break; + + msleep(QLC_83XX_DRV_LOCK_WAIT_DELAY); + i++; + + if (i == 1) + temp = QLCRDX(adapter->ahw, 
QLC_83XX_DRV_LOCK_ID); + + if (i == QLC_83XX_DRV_LOCK_WAIT_COUNTER) { + val = QLCRDX(adapter->ahw, QLC_83XX_DRV_LOCK_ID); + if (val == temp) { + id = val & 0xFF; + dev_info(&adapter->pdev->dev, + "%s: lock to be recovered from %d\n", + __func__, id); + qlcnic_83xx_recover_driver_lock(adapter); + i = 0; + max_attempt++; + } else { + dev_err(&adapter->pdev->dev, + "%s: failed to get lock\n", __func__); + return -EIO; + } + } + + /* Force exit from while loop after few attempts */ + if (max_attempt == QLC_83XX_MAX_DRV_LOCK_RECOVERY_ATTEMPT) { + dev_err(&adapter->pdev->dev, + "%s: failed to get lock\n", __func__); + return -EIO; + } + } + + val = QLCRDX(adapter->ahw, QLC_83XX_DRV_LOCK_ID); + lock_alive_counter = val >> 8; + lock_alive_counter++; + val = lock_alive_counter << 8 | adapter->portnum; + QLCWRX(adapter->ahw, QLC_83XX_DRV_LOCK_ID, val); + + return 0; +} + +void qlcnic_83xx_unlock_driver(struct qlcnic_adapter *adapter) +{ + u32 val, lock_alive_counter, id; + + val = QLCRDX(adapter->ahw, QLC_83XX_DRV_LOCK_ID); + id = val & 0xFF; + lock_alive_counter = val >> 8; + + if (id != adapter->portnum) + dev_err(&adapter->pdev->dev, + "%s:Warning func %d is unlocking lock owned by %d\n", + __func__, adapter->portnum, id); + + val = (lock_alive_counter << 8) | 0xFF; + QLCWRX(adapter->ahw, QLC_83XX_DRV_LOCK_ID, val); + QLCRDX(adapter->ahw, QLC_83XX_DRV_UNLOCK); +} + +int qlcnic_ms_mem_write128(struct qlcnic_adapter *adapter, u64 addr, + u32 *data, u32 count) +{ + int i, j, ret = 0; + u32 temp; + + /* Check alignment */ + if (addr & 0xF) + return -EIO; + + mutex_lock(&adapter->ahw->mem_lock); + qlcnic_ind_wr(adapter, QLCNIC_MS_ADDR_HI, 0); + + for (i = 0; i < count; i++, addr += 16) { + if (!((ADDR_IN_RANGE(addr, QLCNIC_ADDR_QDR_NET, + QLCNIC_ADDR_QDR_NET_MAX)) || + (ADDR_IN_RANGE(addr, QLCNIC_ADDR_DDR_NET, + QLCNIC_ADDR_DDR_NET_MAX)))) { + mutex_unlock(&adapter->ahw->mem_lock); + return -EIO; + } + + qlcnic_ind_wr(adapter, QLCNIC_MS_ADDR_LO, addr); + qlcnic_ind_wr(adapter, QLCNIC_MS_WRTDATA_LO, *data++); + qlcnic_ind_wr(adapter, QLCNIC_MS_WRTDATA_HI, *data++); + qlcnic_ind_wr(adapter, QLCNIC_MS_WRTDATA_ULO, *data++); + qlcnic_ind_wr(adapter, QLCNIC_MS_WRTDATA_UHI, *data++); + qlcnic_ind_wr(adapter, QLCNIC_MS_CTRL, QLCNIC_TA_WRITE_ENABLE); + qlcnic_ind_wr(adapter, QLCNIC_MS_CTRL, QLCNIC_TA_WRITE_START); + + for (j = 0; j < MAX_CTL_CHECK; j++) { + temp = qlcnic_ind_rd(adapter, QLCNIC_MS_CTRL); + + if ((temp & TA_CTL_BUSY) == 0) + break; + } + + /* Status check failure */ + if (j >= MAX_CTL_CHECK) { + printk_ratelimited(KERN_WARNING + "MS memory write failed\n"); + mutex_unlock(&adapter->ahw->mem_lock); + return -EIO; + } + } + + mutex_unlock(&adapter->ahw->mem_lock); + + return ret; +} + +int qlcnic_83xx_flash_read32(struct qlcnic_adapter *adapter, u32 flash_addr, + u8 *p_data, int count) +{ + u32 word, addr = flash_addr, ret; + ulong indirect_addr; + int i, err = 0; + + if (qlcnic_83xx_lock_flash(adapter) != 0) + return -EIO; + + if (addr & 0x3) { + dev_err(&adapter->pdev->dev, "Illegal addr = 0x%x\n", addr); + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + + for (i = 0; i < count; i++) { + if (qlcnic_83xx_wrt_reg_indirect(adapter, + QLC_83XX_FLASH_DIRECT_WINDOW, + (addr))) { + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + + indirect_addr = QLC_83XX_FLASH_DIRECT_DATA(addr); + ret = QLCRD32(adapter, indirect_addr, &err); + if (err == -EIO) + return err; + + word = ret; + *(u32 *)p_data = word; + p_data = p_data + 4; + addr = addr + 4; + } + + qlcnic_83xx_unlock_flash(adapter); + + 
return 0; +} + +void qlcnic_83xx_get_port_type(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_cmd_args cmd; + u32 config; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_LINK_STATUS); + if (err) + return; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_info(&adapter->pdev->dev, + "Get Link Status Command failed: 0x%x\n", err); + goto out; + } else { + config = cmd.rsp.arg[3]; + + switch (QLC_83XX_SFP_MODULE_TYPE(config)) { + case QLC_83XX_MODULE_FIBRE_1000BASE_SX: + case QLC_83XX_MODULE_FIBRE_1000BASE_LX: + case QLC_83XX_MODULE_FIBRE_1000BASE_CX: + case QLC_83XX_MODULE_TP_1000BASE_T: + ahw->port_type = QLCNIC_GBE; + break; + default: + ahw->port_type = QLCNIC_XGBE; + } + } +out: + qlcnic_free_mbx_args(&cmd); +} + +int qlcnic_83xx_test_link(struct qlcnic_adapter *adapter) +{ + u8 pci_func; + int err; + u32 config = 0, state; + struct qlcnic_cmd_args cmd; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (qlcnic_sriov_vf_check(adapter)) + pci_func = adapter->portnum; + else + pci_func = ahw->pci_func; + + state = readl(ahw->pci_base0 + QLC_83XX_LINK_STATE(pci_func)); + if (!QLC_83xx_FUNC_VAL(state, pci_func)) { + dev_info(&adapter->pdev->dev, "link state down\n"); + return config; + } + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_LINK_STATUS); + if (err) + return err; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_info(&adapter->pdev->dev, + "Get Link Status Command failed: 0x%x\n", err); + goto out; + } else { + config = cmd.rsp.arg[1]; + switch (QLC_83XX_CURRENT_LINK_SPEED(config)) { + case QLC_83XX_10M_LINK: + ahw->link_speed = SPEED_10; + break; + case QLC_83XX_100M_LINK: + ahw->link_speed = SPEED_100; + break; + case QLC_83XX_1G_LINK: + ahw->link_speed = SPEED_1000; + break; + case QLC_83XX_10G_LINK: + ahw->link_speed = SPEED_10000; + break; + default: + ahw->link_speed = 0; + break; + } + config = cmd.rsp.arg[3]; + switch (QLC_83XX_SFP_MODULE_TYPE(config)) { + case QLC_83XX_MODULE_FIBRE_10GBASE_LRM: + case QLC_83XX_MODULE_FIBRE_10GBASE_LR: + case QLC_83XX_MODULE_FIBRE_10GBASE_SR: + ahw->supported_type = PORT_FIBRE; + ahw->port_type = QLCNIC_XGBE; + break; + case QLC_83XX_MODULE_FIBRE_1000BASE_SX: + case QLC_83XX_MODULE_FIBRE_1000BASE_LX: + case QLC_83XX_MODULE_FIBRE_1000BASE_CX: + ahw->supported_type = PORT_FIBRE; + ahw->port_type = QLCNIC_GBE; + break; + case QLC_83XX_MODULE_TP_1000BASE_T: + ahw->supported_type = PORT_TP; + ahw->port_type = QLCNIC_GBE; + break; + case QLC_83XX_MODULE_DA_10GE_PASSIVE_CP: + case QLC_83XX_MODULE_DA_10GE_ACTIVE_CP: + case QLC_83XX_MODULE_DA_10GE_LEGACY_CP: + case QLC_83XX_MODULE_DA_1GE_PASSIVE_CP: + ahw->supported_type = PORT_DA; + ahw->port_type = QLCNIC_XGBE; + break; + default: + ahw->supported_type = PORT_OTHER; + ahw->port_type = QLCNIC_XGBE; + } + if (config & 1) + err = 1; + } +out: + qlcnic_free_mbx_args(&cmd); + return config; +} + +int qlcnic_83xx_get_link_ksettings(struct qlcnic_adapter *adapter, + struct ethtool_link_ksettings *ecmd) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u32 config = 0; + int status = 0; + u32 supported, advertising; + + if (!test_bit(__QLCNIC_MAINTENANCE_MODE, &adapter->state)) { + /* Get port configuration info */ + status = qlcnic_83xx_get_port_info(adapter); + /* Get Link Status related info */ + config = qlcnic_83xx_test_link(adapter); + ahw->module_type = QLC_83XX_SFP_MODULE_TYPE(config); + } + + /* hard code until there is a way to get it from flash */ + 
ahw->board_type = QLCNIC_BRDTYPE_83XX_10G; + + if (netif_running(adapter->netdev) && ahw->has_link_events) { + ecmd->base.speed = ahw->link_speed; + ecmd->base.duplex = ahw->link_duplex; + ecmd->base.autoneg = ahw->link_autoneg; + } else { + ecmd->base.speed = SPEED_UNKNOWN; + ecmd->base.duplex = DUPLEX_UNKNOWN; + ecmd->base.autoneg = AUTONEG_DISABLE; + } + + supported = (SUPPORTED_10baseT_Full | + SUPPORTED_100baseT_Full | + SUPPORTED_1000baseT_Full | + SUPPORTED_10000baseT_Full | + SUPPORTED_Autoneg); + + ethtool_convert_link_mode_to_legacy_u32(&advertising, + ecmd->link_modes.advertising); + + if (ecmd->base.autoneg == AUTONEG_ENABLE) { + if (ahw->port_config & QLC_83XX_10_CAPABLE) + advertising |= SUPPORTED_10baseT_Full; + if (ahw->port_config & QLC_83XX_100_CAPABLE) + advertising |= SUPPORTED_100baseT_Full; + if (ahw->port_config & QLC_83XX_1G_CAPABLE) + advertising |= SUPPORTED_1000baseT_Full; + if (ahw->port_config & QLC_83XX_10G_CAPABLE) + advertising |= SUPPORTED_10000baseT_Full; + if (ahw->port_config & QLC_83XX_AUTONEG_ENABLE) + advertising |= ADVERTISED_Autoneg; + } else { + switch (ahw->link_speed) { + case SPEED_10: + advertising = SUPPORTED_10baseT_Full; + break; + case SPEED_100: + advertising = SUPPORTED_100baseT_Full; + break; + case SPEED_1000: + advertising = SUPPORTED_1000baseT_Full; + break; + case SPEED_10000: + advertising = SUPPORTED_10000baseT_Full; + break; + default: + break; + } + + } + + switch (ahw->supported_type) { + case PORT_FIBRE: + supported |= SUPPORTED_FIBRE; + advertising |= ADVERTISED_FIBRE; + ecmd->base.port = PORT_FIBRE; + break; + case PORT_TP: + supported |= SUPPORTED_TP; + advertising |= ADVERTISED_TP; + ecmd->base.port = PORT_TP; + break; + case PORT_DA: + supported |= SUPPORTED_FIBRE; + advertising |= ADVERTISED_FIBRE; + ecmd->base.port = PORT_DA; + break; + default: + supported |= SUPPORTED_FIBRE; + advertising |= ADVERTISED_FIBRE; + ecmd->base.port = PORT_OTHER; + break; + } + ecmd->base.phy_address = ahw->physical_port; + + ethtool_convert_legacy_u32_to_link_mode(ecmd->link_modes.supported, + supported); + ethtool_convert_legacy_u32_to_link_mode(ecmd->link_modes.advertising, + advertising); + + return status; +} + +int qlcnic_83xx_set_link_ksettings(struct qlcnic_adapter *adapter, + const struct ethtool_link_ksettings *ecmd) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u32 config = adapter->ahw->port_config; + int status = 0; + + /* 83xx devices do not support Half duplex */ + if (ecmd->base.duplex == DUPLEX_HALF) { + netdev_info(adapter->netdev, + "Half duplex mode not supported\n"); + return -EINVAL; + } + + if (ecmd->base.autoneg) { + ahw->port_config |= QLC_83XX_AUTONEG_ENABLE; + ahw->port_config |= (QLC_83XX_100_CAPABLE | + QLC_83XX_1G_CAPABLE | + QLC_83XX_10G_CAPABLE); + } else { /* force speed */ + ahw->port_config &= ~QLC_83XX_AUTONEG_ENABLE; + switch (ecmd->base.speed) { + case SPEED_10: + ahw->port_config &= ~(QLC_83XX_100_CAPABLE | + QLC_83XX_1G_CAPABLE | + QLC_83XX_10G_CAPABLE); + ahw->port_config |= QLC_83XX_10_CAPABLE; + break; + case SPEED_100: + ahw->port_config &= ~(QLC_83XX_10_CAPABLE | + QLC_83XX_1G_CAPABLE | + QLC_83XX_10G_CAPABLE); + ahw->port_config |= QLC_83XX_100_CAPABLE; + break; + case SPEED_1000: + ahw->port_config &= ~(QLC_83XX_10_CAPABLE | + QLC_83XX_100_CAPABLE | + QLC_83XX_10G_CAPABLE); + ahw->port_config |= QLC_83XX_1G_CAPABLE; + break; + case SPEED_10000: + ahw->port_config &= ~(QLC_83XX_10_CAPABLE | + QLC_83XX_100_CAPABLE | + QLC_83XX_1G_CAPABLE); + ahw->port_config |= QLC_83XX_10G_CAPABLE; + 
break; + default: + return -EINVAL; + } + } + status = qlcnic_83xx_set_port_config(adapter); + if (status) { + netdev_info(adapter->netdev, + "Failed to Set Link Speed and autoneg.\n"); + ahw->port_config = config; + } + + return status; +} + +static inline u64 *qlcnic_83xx_copy_stats(struct qlcnic_cmd_args *cmd, + u64 *data, int index) +{ + u32 low, hi; + u64 val; + + low = cmd->rsp.arg[index]; + hi = cmd->rsp.arg[index + 1]; + val = (((u64) low) | (((u64) hi) << 32)); + *data++ = val; + return data; +} + +static u64 *qlcnic_83xx_fill_stats(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd, u64 *data, + int type, int *ret) +{ + int err, k, total_regs; + + *ret = 0; + err = qlcnic_issue_cmd(adapter, cmd); + if (err != QLCNIC_RCODE_SUCCESS) { + dev_info(&adapter->pdev->dev, + "Error in get statistics mailbox command\n"); + *ret = -EIO; + return data; + } + total_regs = cmd->rsp.num; + switch (type) { + case QLC_83XX_STAT_MAC: + /* fill in MAC tx counters */ + for (k = 2; k < 28; k += 2) + data = qlcnic_83xx_copy_stats(cmd, data, k); + /* skip 24 bytes of reserved area */ + /* fill in MAC rx counters */ + for (k += 6; k < 60; k += 2) + data = qlcnic_83xx_copy_stats(cmd, data, k); + /* skip 24 bytes of reserved area */ + /* fill in MAC rx frame stats */ + for (k += 6; k < 80; k += 2) + data = qlcnic_83xx_copy_stats(cmd, data, k); + /* fill in eSwitch stats */ + for (; k < total_regs; k += 2) + data = qlcnic_83xx_copy_stats(cmd, data, k); + break; + case QLC_83XX_STAT_RX: + for (k = 2; k < 8; k += 2) + data = qlcnic_83xx_copy_stats(cmd, data, k); + /* skip 8 bytes of reserved data */ + for (k += 2; k < 24; k += 2) + data = qlcnic_83xx_copy_stats(cmd, data, k); + /* skip 8 bytes containing RE1FBQ error data */ + for (k += 2; k < total_regs; k += 2) + data = qlcnic_83xx_copy_stats(cmd, data, k); + break; + case QLC_83XX_STAT_TX: + for (k = 2; k < 10; k += 2) + data = qlcnic_83xx_copy_stats(cmd, data, k); + /* skip 8 bytes of reserved data */ + for (k += 2; k < total_regs; k += 2) + data = qlcnic_83xx_copy_stats(cmd, data, k); + break; + default: + dev_warn(&adapter->pdev->dev, "Unknown get statistics mode\n"); + *ret = -EIO; + } + return data; +} + +void qlcnic_83xx_get_stats(struct qlcnic_adapter *adapter, u64 *data) +{ + struct qlcnic_cmd_args cmd; + struct net_device *netdev = adapter->netdev; + int ret = 0; + + ret = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_STATISTICS); + if (ret) + return; + /* Get Tx stats */ + cmd.req.arg[1] = BIT_1 | (adapter->tx_ring->ctx_id << 16); + cmd.rsp.num = QLC_83XX_TX_STAT_REGS; + data = qlcnic_83xx_fill_stats(adapter, &cmd, data, + QLC_83XX_STAT_TX, &ret); + if (ret) { + netdev_err(netdev, "Error getting Tx stats\n"); + goto out; + } + /* Get MAC stats */ + cmd.req.arg[1] = BIT_2 | (adapter->portnum << 16); + cmd.rsp.num = QLC_83XX_MAC_STAT_REGS; + memset(cmd.rsp.arg, 0, sizeof(u32) * cmd.rsp.num); + data = qlcnic_83xx_fill_stats(adapter, &cmd, data, + QLC_83XX_STAT_MAC, &ret); + if (ret) { + netdev_err(netdev, "Error getting MAC stats\n"); + goto out; + } + /* Get Rx stats */ + cmd.req.arg[1] = adapter->recv_ctx->context_id << 16; + cmd.rsp.num = QLC_83XX_RX_STAT_REGS; + memset(cmd.rsp.arg, 0, sizeof(u32) * cmd.rsp.num); + data = qlcnic_83xx_fill_stats(adapter, &cmd, data, + QLC_83XX_STAT_RX, &ret); + if (ret) + netdev_err(netdev, "Error getting Rx stats\n"); +out: + qlcnic_free_mbx_args(&cmd); +} + +#define QLCNIC_83XX_ADD_PORT0 BIT_0 +#define QLCNIC_83XX_ADD_PORT1 BIT_1 +#define QLCNIC_83XX_EXTENDED_MEM_SIZE 13 /* In MB */ +int 
qlcnic_83xx_extend_md_capab(struct qlcnic_adapter *adapter) +{ + struct qlcnic_cmd_args cmd; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_83XX_EXTEND_ISCSI_DUMP_CAP); + if (err) + return err; + + cmd.req.arg[1] = (QLCNIC_83XX_ADD_PORT0 | QLCNIC_83XX_ADD_PORT1); + cmd.req.arg[2] = QLCNIC_83XX_EXTENDED_MEM_SIZE; + cmd.req.arg[3] = QLCNIC_83XX_EXTENDED_MEM_SIZE; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_err(&adapter->pdev->dev, + "failed to issue extend iSCSI minidump capability\n"); + + return err; +} + +int qlcnic_83xx_reg_test(struct qlcnic_adapter *adapter) +{ + u32 major, minor, sub; + + major = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_MAJOR); + minor = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_MINOR); + sub = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_SUB); + + if (adapter->fw_version != QLCNIC_VERSION_CODE(major, minor, sub)) { + dev_info(&adapter->pdev->dev, "%s: Reg test failed\n", + __func__); + return 1; + } + return 0; +} + +inline int qlcnic_83xx_get_regs_len(struct qlcnic_adapter *adapter) +{ + return (ARRAY_SIZE(qlcnic_83xx_ext_reg_tbl) * + sizeof(*adapter->ahw->ext_reg_tbl)) + + (ARRAY_SIZE(qlcnic_83xx_reg_tbl) * + sizeof(*adapter->ahw->reg_tbl)); +} + +int qlcnic_83xx_get_registers(struct qlcnic_adapter *adapter, u32 *regs_buff) +{ + int i, j = 0; + + for (i = QLCNIC_DEV_INFO_SIZE + 1; + j < ARRAY_SIZE(qlcnic_83xx_reg_tbl); i++, j++) + regs_buff[i] = QLC_SHARED_REG_RD32(adapter, j); + + for (j = 0; j < ARRAY_SIZE(qlcnic_83xx_ext_reg_tbl); j++) + regs_buff[i++] = QLCRDX(adapter->ahw, j); + return i; +} + +int qlcnic_83xx_interrupt_test(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_cmd_args cmd; + u8 val, drv_sds_rings = adapter->drv_sds_rings; + u8 drv_tx_rings = adapter->drv_tx_rings; + u32 data; + u16 intrpt_id, id; + int ret; + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) { + netdev_info(netdev, "Device is resetting\n"); + return -EBUSY; + } + + if (qlcnic_get_diag_lock(adapter)) { + netdev_info(netdev, "Device in diagnostics mode\n"); + return -EBUSY; + } + + ret = qlcnic_83xx_diag_alloc_res(netdev, QLCNIC_INTERRUPT_TEST, + drv_sds_rings); + if (ret) + goto fail_diag_irq; + + ahw->diag_cnt = 0; + ret = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_INTRPT_TEST); + if (ret) + goto fail_diag_irq; + + if (adapter->flags & QLCNIC_MSIX_ENABLED) + intrpt_id = ahw->intr_tbl[0].id; + else + intrpt_id = QLCRDX(ahw, QLCNIC_DEF_INT_ID); + + cmd.req.arg[1] = 1; + cmd.req.arg[2] = intrpt_id; + cmd.req.arg[3] = BIT_0; + + ret = qlcnic_issue_cmd(adapter, &cmd); + data = cmd.rsp.arg[2]; + id = LSW(data); + val = LSB(MSW(data)); + if (id != intrpt_id) + dev_info(&adapter->pdev->dev, + "Interrupt generated: 0x%x, requested:0x%x\n", + id, intrpt_id); + if (val) + dev_err(&adapter->pdev->dev, + "Interrupt test error: 0x%x\n", val); + if (ret) + goto done; + + msleep(20); + ret = !ahw->diag_cnt; + +done: + qlcnic_free_mbx_args(&cmd); + qlcnic_83xx_diag_free_res(netdev, drv_sds_rings); + +fail_diag_irq: + adapter->drv_sds_rings = drv_sds_rings; + adapter->drv_tx_rings = drv_tx_rings; + qlcnic_release_diag_lock(adapter); + return ret; +} + +void qlcnic_83xx_get_pauseparam(struct qlcnic_adapter *adapter, + struct ethtool_pauseparam *pause) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int status = 0; + u32 config; + + status = qlcnic_83xx_get_port_config(adapter); + if (status) { + 
dev_err(&adapter->pdev->dev, + "%s: Get Pause Config failed\n", __func__); + return; + } + config = ahw->port_config; + if (config & QLC_83XX_CFG_STD_PAUSE) { + switch (MSW(config)) { + case QLC_83XX_TX_PAUSE: + pause->tx_pause = 1; + break; + case QLC_83XX_RX_PAUSE: + pause->rx_pause = 1; + break; + case QLC_83XX_TX_RX_PAUSE: + default: + /* Backward compatibility for existing + * flash definitions + */ + pause->tx_pause = 1; + pause->rx_pause = 1; + } + } + + if (QLC_83XX_AUTONEG(config)) + pause->autoneg = 1; +} + +int qlcnic_83xx_set_pauseparam(struct qlcnic_adapter *adapter, + struct ethtool_pauseparam *pause) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int status = 0; + u32 config; + + status = qlcnic_83xx_get_port_config(adapter); + if (status) { + dev_err(&adapter->pdev->dev, + "%s: Get Pause Config failed.\n", __func__); + return status; + } + config = ahw->port_config; + + if (ahw->port_type == QLCNIC_GBE) { + if (pause->autoneg) + ahw->port_config |= QLC_83XX_ENABLE_AUTONEG; + if (!pause->autoneg) + ahw->port_config &= ~QLC_83XX_ENABLE_AUTONEG; + } else if ((ahw->port_type == QLCNIC_XGBE) && (pause->autoneg)) { + return -EOPNOTSUPP; + } + + if (!(config & QLC_83XX_CFG_STD_PAUSE)) + ahw->port_config |= QLC_83XX_CFG_STD_PAUSE; + + if (pause->rx_pause && pause->tx_pause) { + ahw->port_config |= QLC_83XX_CFG_STD_TX_RX_PAUSE; + } else if (pause->rx_pause && !pause->tx_pause) { + ahw->port_config &= ~QLC_83XX_CFG_STD_TX_PAUSE; + ahw->port_config |= QLC_83XX_CFG_STD_RX_PAUSE; + } else if (pause->tx_pause && !pause->rx_pause) { + ahw->port_config &= ~QLC_83XX_CFG_STD_RX_PAUSE; + ahw->port_config |= QLC_83XX_CFG_STD_TX_PAUSE; + } else if (!pause->rx_pause && !pause->tx_pause) { + ahw->port_config &= ~(QLC_83XX_CFG_STD_TX_RX_PAUSE | + QLC_83XX_CFG_STD_PAUSE); + } + status = qlcnic_83xx_set_port_config(adapter); + if (status) { + dev_err(&adapter->pdev->dev, + "%s: Set Pause Config failed.\n", __func__); + ahw->port_config = config; + } + return status; +} + +static int qlcnic_83xx_read_flash_status_reg(struct qlcnic_adapter *adapter) +{ + int ret, err = 0; + u32 temp; + + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_ADDR, + QLC_83XX_FLASH_OEM_READ_SIG); + qlcnic_83xx_wrt_reg_indirect(adapter, QLC_83XX_FLASH_CONTROL, + QLC_83XX_FLASH_READ_CTRL); + ret = qlcnic_83xx_poll_flash_status_reg(adapter); + if (ret) + return -EIO; + + temp = QLCRD32(adapter, QLC_83XX_FLASH_RDDATA, &err); + if (err == -EIO) + return err; + + return temp & 0xFF; +} + +int qlcnic_83xx_flash_test(struct qlcnic_adapter *adapter) +{ + int status; + + status = qlcnic_83xx_read_flash_status_reg(adapter); + if (status == -EIO) { + dev_info(&adapter->pdev->dev, "%s: EEPROM test failed.\n", + __func__); + return 1; + } + return 0; +} + +static int qlcnic_83xx_shutdown(struct pci_dev *pdev) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + struct net_device *netdev = adapter->netdev; + int retval; + + netif_device_detach(netdev); + qlcnic_cancel_idc_work(adapter); + + if (netif_running(netdev)) + qlcnic_down(adapter, netdev); + + qlcnic_83xx_disable_mbx_intr(adapter); + cancel_delayed_work_sync(&adapter->idc_aen_work); + + retval = pci_save_state(pdev); + if (retval) + return retval; + + return 0; +} + +static int qlcnic_83xx_resume(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlc_83xx_idc *idc = &ahw->idc; + int err = 0; + + err = qlcnic_83xx_idc_init(adapter); + if (err) + return err; + + if (ahw->nic_mode == QLCNIC_VNIC_MODE) { + if 
(ahw->op_mode == QLCNIC_MGMT_FUNC) { + qlcnic_83xx_set_vnic_opmode(adapter); + } else { + err = qlcnic_83xx_check_vnic_state(adapter); + if (err) + return err; + } + } + + err = qlcnic_83xx_idc_reattach_driver(adapter); + if (err) + return err; + + qlcnic_schedule_work(adapter, qlcnic_83xx_idc_poll_dev_state, + idc->delay); + return err; +} + +void qlcnic_83xx_reinit_mbx_work(struct qlcnic_mailbox *mbx) +{ + reinit_completion(&mbx->completion); + set_bit(QLC_83XX_MBX_READY, &mbx->status); +} + +void qlcnic_83xx_free_mailbox(struct qlcnic_mailbox *mbx) +{ + if (!mbx) + return; + + destroy_workqueue(mbx->work_q); + kfree(mbx); +} + +static inline void +qlcnic_83xx_notify_cmd_completion(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + atomic_set(&cmd->rsp_status, QLC_83XX_MBX_RESPONSE_ARRIVED); + + if (cmd->type == QLC_83XX_MBX_CMD_NO_WAIT) { + qlcnic_free_mbx_args(cmd); + kfree(cmd); + return; + } + complete(&cmd->completion); +} + +static void qlcnic_83xx_flush_mbx_queue(struct qlcnic_adapter *adapter) +{ + struct qlcnic_mailbox *mbx = adapter->ahw->mailbox; + struct list_head *head = &mbx->cmd_q; + struct qlcnic_cmd_args *cmd = NULL; + + spin_lock_bh(&mbx->queue_lock); + + while (!list_empty(head)) { + cmd = list_entry(head->next, struct qlcnic_cmd_args, list); + dev_info(&adapter->pdev->dev, "%s: Mailbox command 0x%x\n", + __func__, cmd->cmd_op); + list_del(&cmd->list); + mbx->num_cmds--; + qlcnic_83xx_notify_cmd_completion(adapter, cmd); + } + + spin_unlock_bh(&mbx->queue_lock); +} + +static int qlcnic_83xx_check_mbx_status(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_mailbox *mbx = ahw->mailbox; + u32 host_mbx_ctrl; + + if (!test_bit(QLC_83XX_MBX_READY, &mbx->status)) + return -EBUSY; + + host_mbx_ctrl = QLCRDX(ahw, QLCNIC_HOST_MBX_CTRL); + if (host_mbx_ctrl) { + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + ahw->idc.collect_dump = 1; + return -EIO; + } + + return 0; +} + +static inline void qlcnic_83xx_signal_mbx_cmd(struct qlcnic_adapter *adapter, + u8 issue_cmd) +{ + if (issue_cmd) + QLCWRX(adapter->ahw, QLCNIC_HOST_MBX_CTRL, QLCNIC_SET_OWNER); + else + QLCWRX(adapter->ahw, QLCNIC_FW_MBX_CTRL, QLCNIC_CLR_OWNER); +} + +static void qlcnic_83xx_dequeue_mbx_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_mailbox *mbx = adapter->ahw->mailbox; + + spin_lock_bh(&mbx->queue_lock); + + list_del(&cmd->list); + mbx->num_cmds--; + + spin_unlock_bh(&mbx->queue_lock); + + qlcnic_83xx_notify_cmd_completion(adapter, cmd); +} + +static void qlcnic_83xx_encode_mbx_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + u32 mbx_cmd, fw_hal_version, hdr_size, total_size, tmp; + struct qlcnic_hardware_context *ahw = adapter->ahw; + int i, j; + + if (cmd->op_type != QLC_83XX_MBX_POST_BC_OP) { + mbx_cmd = cmd->req.arg[0]; + writel(mbx_cmd, QLCNIC_MBX_HOST(ahw, 0)); + for (i = 1; i < cmd->req.num; i++) + writel(cmd->req.arg[i], QLCNIC_MBX_HOST(ahw, i)); + } else { + fw_hal_version = ahw->fw_hal_version; + hdr_size = sizeof(struct qlcnic_bc_hdr) / sizeof(u32); + total_size = cmd->pay_size + hdr_size; + tmp = QLCNIC_CMD_BC_EVENT_SETUP | total_size << 16; + mbx_cmd = tmp | fw_hal_version << 29; + writel(mbx_cmd, QLCNIC_MBX_HOST(ahw, 0)); + + /* Back channel specific operations bits */ + mbx_cmd = 0x1 | 1 << 4; + + if (qlcnic_sriov_pf_check(adapter)) + mbx_cmd |= cmd->func_num << 5; + + writel(mbx_cmd, QLCNIC_MBX_HOST(ahw, 1)); + + for (i = 2, j = 0; j < hdr_size; i++, j++) + 
writel(*(cmd->hdr++), QLCNIC_MBX_HOST(ahw, i)); + for (j = 0; j < cmd->pay_size; j++, i++) + writel(*(cmd->pay++), QLCNIC_MBX_HOST(ahw, i)); + } +} + +void qlcnic_83xx_detach_mailbox_work(struct qlcnic_adapter *adapter) +{ + struct qlcnic_mailbox *mbx = adapter->ahw->mailbox; + + if (!mbx) + return; + + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + complete(&mbx->completion); + cancel_work_sync(&mbx->work); + flush_workqueue(mbx->work_q); + qlcnic_83xx_flush_mbx_queue(adapter); +} + +static int qlcnic_83xx_enqueue_mbx_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd, + unsigned long *timeout) +{ + struct qlcnic_mailbox *mbx = adapter->ahw->mailbox; + + if (test_bit(QLC_83XX_MBX_READY, &mbx->status)) { + atomic_set(&cmd->rsp_status, QLC_83XX_MBX_RESPONSE_WAIT); + init_completion(&cmd->completion); + cmd->rsp_opcode = QLC_83XX_MBX_RESPONSE_UNKNOWN; + + spin_lock_bh(&mbx->queue_lock); + + list_add_tail(&cmd->list, &mbx->cmd_q); + mbx->num_cmds++; + cmd->total_cmds = mbx->num_cmds; + *timeout = cmd->total_cmds * QLC_83XX_MBX_TIMEOUT; + queue_work(mbx->work_q, &mbx->work); + + spin_unlock_bh(&mbx->queue_lock); + + return 0; + } + + return -EBUSY; +} + +static int qlcnic_83xx_check_mac_rcode(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + u8 mac_cmd_rcode; + u32 fw_data; + + if (cmd->cmd_op == QLCNIC_CMD_CONFIG_MAC_VLAN) { + fw_data = readl(QLCNIC_MBX_FW(adapter->ahw, 2)); + mac_cmd_rcode = (u8)fw_data; + if (mac_cmd_rcode == QLC_83XX_NO_NIC_RESOURCE || + mac_cmd_rcode == QLC_83XX_MAC_PRESENT || + mac_cmd_rcode == QLC_83XX_MAC_ABSENT) { + cmd->rsp_opcode = QLCNIC_RCODE_SUCCESS; + return QLCNIC_RCODE_SUCCESS; + } + } + + return -EINVAL; +} + +static void qlcnic_83xx_decode_mbx_rsp(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct device *dev = &adapter->pdev->dev; + u8 mbx_err_code; + u32 fw_data; + + fw_data = readl(QLCNIC_MBX_FW(ahw, 0)); + mbx_err_code = QLCNIC_MBX_STATUS(fw_data); + qlcnic_83xx_get_mbx_data(adapter, cmd); + + switch (mbx_err_code) { + case QLCNIC_MBX_RSP_OK: + case QLCNIC_MBX_PORT_RSP_OK: + cmd->rsp_opcode = QLCNIC_RCODE_SUCCESS; + break; + default: + if (!qlcnic_83xx_check_mac_rcode(adapter, cmd)) + break; + + dev_err(dev, "%s: Mailbox command failed, opcode=0x%x, cmd_type=0x%x, func=0x%x, op_mode=0x%x, error=0x%x\n", + __func__, cmd->cmd_op, cmd->type, ahw->pci_func, + ahw->op_mode, mbx_err_code); + cmd->rsp_opcode = QLC_83XX_MBX_RESPONSE_FAILED; + qlcnic_dump_mbx(adapter, cmd); + } + + return; +} + +static inline void qlcnic_dump_mailbox_registers(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u32 offset; + + offset = QLCRDX(ahw, QLCNIC_DEF_INT_MASK); + dev_info(&adapter->pdev->dev, "Mbx interrupt mask=0x%x, Mbx interrupt enable=0x%x, Host mbx control=0x%x, Fw mbx control=0x%x", + readl(ahw->pci_base0 + offset), + QLCRDX(ahw, QLCNIC_MBX_INTR_ENBL), + QLCRDX(ahw, QLCNIC_HOST_MBX_CTRL), + QLCRDX(ahw, QLCNIC_FW_MBX_CTRL)); +} + +static void qlcnic_83xx_mailbox_worker(struct work_struct *work) +{ + struct qlcnic_mailbox *mbx = container_of(work, struct qlcnic_mailbox, + work); + struct qlcnic_adapter *adapter = mbx->adapter; + const struct qlcnic_mbx_ops *mbx_ops = mbx->ops; + struct device *dev = &adapter->pdev->dev; + struct list_head *head = &mbx->cmd_q; + struct qlcnic_hardware_context *ahw; + struct qlcnic_cmd_args *cmd = NULL; + unsigned long flags; + + ahw = adapter->ahw; + + while (true) { + if 
(qlcnic_83xx_check_mbx_status(adapter)) { + qlcnic_83xx_flush_mbx_queue(adapter); + return; + } + + spin_lock_irqsave(&mbx->aen_lock, flags); + mbx->rsp_status = QLC_83XX_MBX_RESPONSE_WAIT; + spin_unlock_irqrestore(&mbx->aen_lock, flags); + + spin_lock_bh(&mbx->queue_lock); + + if (list_empty(head)) { + spin_unlock_bh(&mbx->queue_lock); + return; + } + cmd = list_entry(head->next, struct qlcnic_cmd_args, list); + + spin_unlock_bh(&mbx->queue_lock); + + mbx_ops->encode_cmd(adapter, cmd); + mbx_ops->nofity_fw(adapter, QLC_83XX_MBX_REQUEST); + + if (wait_for_completion_timeout(&mbx->completion, + QLC_83XX_MBX_TIMEOUT)) { + mbx_ops->decode_resp(adapter, cmd); + mbx_ops->nofity_fw(adapter, QLC_83XX_MBX_COMPLETION); + } else { + dev_err(dev, "%s: Mailbox command timeout, opcode=0x%x, cmd_type=0x%x, func=0x%x, op_mode=0x%x\n", + __func__, cmd->cmd_op, cmd->type, ahw->pci_func, + ahw->op_mode); + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + qlcnic_dump_mailbox_registers(adapter); + qlcnic_83xx_get_mbx_data(adapter, cmd); + qlcnic_dump_mbx(adapter, cmd); + qlcnic_83xx_idc_request_reset(adapter, + QLCNIC_FORCE_FW_DUMP_KEY); + cmd->rsp_opcode = QLCNIC_RCODE_TIMEOUT; + } + mbx_ops->dequeue_cmd(adapter, cmd); + } +} + +static const struct qlcnic_mbx_ops qlcnic_83xx_mbx_ops = { + .enqueue_cmd = qlcnic_83xx_enqueue_mbx_cmd, + .dequeue_cmd = qlcnic_83xx_dequeue_mbx_cmd, + .decode_resp = qlcnic_83xx_decode_mbx_rsp, + .encode_cmd = qlcnic_83xx_encode_mbx_cmd, + .nofity_fw = qlcnic_83xx_signal_mbx_cmd, +}; + +int qlcnic_83xx_init_mailbox_work(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_mailbox *mbx; + + ahw->mailbox = kzalloc(sizeof(*mbx), GFP_KERNEL); + if (!ahw->mailbox) + return -ENOMEM; + + mbx = ahw->mailbox; + mbx->ops = &qlcnic_83xx_mbx_ops; + mbx->adapter = adapter; + + spin_lock_init(&mbx->queue_lock); + spin_lock_init(&mbx->aen_lock); + INIT_LIST_HEAD(&mbx->cmd_q); + init_completion(&mbx->completion); + + mbx->work_q = create_singlethread_workqueue("qlcnic_mailbox"); + if (mbx->work_q == NULL) { + kfree(mbx); + return -ENOMEM; + } + + INIT_WORK(&mbx->work, qlcnic_83xx_mailbox_worker); + set_bit(QLC_83XX_MBX_READY, &mbx->status); + return 0; +} + +static pci_ers_result_t qlcnic_83xx_io_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + + if (state == pci_channel_io_perm_failure) + return PCI_ERS_RESULT_DISCONNECT; + + if (state == pci_channel_io_normal) + return PCI_ERS_RESULT_RECOVERED; + + set_bit(__QLCNIC_AER, &adapter->state); + set_bit(__QLCNIC_RESETTING, &adapter->state); + + qlcnic_83xx_aer_stop_poll_work(adapter); + + pci_save_state(pdev); + pci_disable_device(pdev); + + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t qlcnic_83xx_io_slot_reset(struct pci_dev *pdev) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + int err = 0; + + pdev->error_state = pci_channel_io_normal; + err = pci_enable_device(pdev); + if (err) + goto disconnect; + + pci_set_power_state(pdev, PCI_D0); + pci_set_master(pdev); + pci_restore_state(pdev); + + err = qlcnic_83xx_aer_reset(adapter); + if (err == 0) + return PCI_ERS_RESULT_RECOVERED; +disconnect: + clear_bit(__QLCNIC_AER, &adapter->state); + clear_bit(__QLCNIC_RESETTING, &adapter->state); + return PCI_ERS_RESULT_DISCONNECT; +} + +static void qlcnic_83xx_io_resume(struct pci_dev *pdev) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + + pci_cleanup_aer_uncorrect_error_status(pdev); 
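+	/* Restart the firmware-state poll work only for the function that
+	 * was placed into AER recovery by the error_detected handler.
+	 */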
+ if (test_and_clear_bit(__QLCNIC_AER, &adapter->state)) + qlcnic_83xx_aer_start_poll_work(adapter); +} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h new file mode 100644 index 0000000..b75a812 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h @@ -0,0 +1,666 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#ifndef __QLCNIC_83XX_HW_H +#define __QLCNIC_83XX_HW_H + +#include +#include + +#include "qlcnic_hw.h" + +#define QLCNIC_83XX_BAR0_LENGTH 0x4000 + +/* Directly mapped registers */ +#define QLC_83XX_CRB_WIN_BASE 0x3800 +#define QLC_83XX_CRB_WIN_FUNC(f) (QLC_83XX_CRB_WIN_BASE+((f)*4)) +#define QLC_83XX_SEM_LOCK_BASE 0x3840 +#define QLC_83XX_SEM_UNLOCK_BASE 0x3844 +#define QLC_83XX_SEM_LOCK_FUNC(f) (QLC_83XX_SEM_LOCK_BASE+((f)*8)) +#define QLC_83XX_SEM_UNLOCK_FUNC(f) (QLC_83XX_SEM_UNLOCK_BASE+((f)*8)) +#define QLC_83XX_LINK_STATE(f) (0x3698+((f) > 7 ? 4 : 0)) +#define QLC_83XX_LINK_SPEED(f) (0x36E0+(((f) >> 2) * 4)) +#define QLC_83XX_LINK_SPEED_FACTOR 10 +#define QLC_83xx_FUNC_VAL(v, f) ((v) & (1 << (f * 4))) +#define QLC_83XX_INTX_PTR 0x38C0 +#define QLC_83XX_INTX_TRGR 0x38C4 +#define QLC_83XX_INTX_MASK 0x38C8 + +#define QLC_83XX_DRV_LOCK_WAIT_COUNTER 100 +#define QLC_83XX_DRV_LOCK_WAIT_DELAY 20 +#define QLC_83XX_NEED_DRV_LOCK_RECOVERY 1 +#define QLC_83XX_DRV_LOCK_RECOVERY_IN_PROGRESS 2 +#define QLC_83XX_MAX_DRV_LOCK_RECOVERY_ATTEMPT 3 +#define QLC_83XX_DRV_LOCK_RECOVERY_DELAY 200 +#define QLC_83XX_DRV_LOCK_RECOVERY_STATUS_MASK 0x3 +#define QLC_83XX_LB_WAIT_COUNT 250 +#define QLC_83XX_LB_MSLEEP_COUNT 20 +#define QLC_83XX_NO_NIC_RESOURCE 0x5 +#define QLC_83XX_MAC_PRESENT 0xC +#define QLC_83XX_MAC_ABSENT 0xD + + +#define QLC_83XX_FLASH_SECTOR_SIZE (64 * 1024) + +/* PEG status definitions */ +#define QLC_83XX_CMDPEG_COMPLETE 0xff01 +#define QLC_83XX_VALID_INTX_BIT30(val) ((val) & BIT_30) +#define QLC_83XX_VALID_INTX_BIT31(val) ((val) & BIT_31) +#define QLC_83XX_INTX_FUNC(val) ((val) & 0xFF) +#define QLC_83XX_LEGACY_INTX_MAX_RETRY 100 +#define QLC_83XX_LEGACY_INTX_DELAY 4 +#define QLC_83XX_REG_DESC 1 +#define QLC_83XX_LRO_DESC 2 +#define QLC_83XX_CTRL_DESC 3 +#define QLC_83XX_FW_CAPABILITY_TSO BIT_6 +#define QLC_83XX_FW_CAP_LRO_MSS BIT_17 +#define QLC_83XX_HOST_RDS_MODE_UNIQUE 0 +#define QLC_83XX_HOST_SDS_MBX_IDX 8 + +#define QLCNIC_HOST_RDS_MBX_IDX 88 + +/* Pause control registers */ +#define QLC_83XX_SRE_SHIM_REG 0x0D200284 +#define QLC_83XX_PORT0_THRESHOLD 0x0B2003A4 +#define QLC_83XX_PORT1_THRESHOLD 0x0B2013A4 +#define QLC_83XX_PORT0_TC_MC_REG 0x0B200388 +#define QLC_83XX_PORT1_TC_MC_REG 0x0B201388 +#define QLC_83XX_PORT0_TC_STATS 0x0B20039C +#define QLC_83XX_PORT1_TC_STATS 0x0B20139C +#define QLC_83XX_PORT2_IFB_THRESHOLD 0x0B200704 +#define QLC_83XX_PORT3_IFB_THRESHOLD 0x0B201704 + +/* Peg PC status registers */ +#define QLC_83XX_CRB_PEG_NET_0 0x3400003c +#define QLC_83XX_CRB_PEG_NET_1 0x3410003c +#define QLC_83XX_CRB_PEG_NET_2 0x3420003c +#define QLC_83XX_CRB_PEG_NET_3 0x3430003c +#define QLC_83XX_CRB_PEG_NET_4 0x34b0003c + +/* Firmware image definitions */ +#define QLC_83XX_BOOTLOADER_FLASH_ADDR 0x10000 +#define QLC_83XX_FW_FILE_NAME "83xx_fw.bin" +#define QLC_83XX_POST_FW_FILE_NAME "83xx_post_fw.bin" +#define QLC_84XX_FW_FILE_NAME "84xx_fw.bin" +#define QLC_83XX_BOOT_FROM_FLASH 0 +#define QLC_83XX_BOOT_FROM_FILE 0x12345678 + +#define QLC_FW_FILE_NAME_LEN 20 +#define 
QLC_83XX_MAX_RESET_SEQ_ENTRIES 16 + +#define QLC_83XX_MBX_POST_BC_OP 0x1 +#define QLC_83XX_MBX_COMPLETION 0x0 +#define QLC_83XX_MBX_REQUEST 0x1 + +#define QLC_83XX_MBX_TIMEOUT (5 * HZ) +#define QLC_83XX_MBX_CMD_LOOP 5000000 + +/* status descriptor mailbox data + * @phy_addr_{low|high}: physical address of buffer + * @sds_ring_size: buffer size + * @intrpt_id: interrupt id + * @intrpt_val: source of interrupt + */ +struct qlcnic_sds_mbx { + u32 phy_addr_low; + u32 phy_addr_high; + u32 rsvd1[4]; +#if defined(__LITTLE_ENDIAN) + u16 sds_ring_size; + u16 rsvd2; + u16 rsvd3[2]; + u16 intrpt_id; + u8 intrpt_val; + u8 rsvd4; +#elif defined(__BIG_ENDIAN) + u16 rsvd2; + u16 sds_ring_size; + u16 rsvd3[2]; + u8 rsvd4; + u8 intrpt_val; + u16 intrpt_id; +#endif + u32 rsvd5; +} __packed; + +/* receive descriptor buffer data + * phy_addr_reg_{low|high}: physical address of regular buffer + * phy_addr_jmb_{low|high}: physical address of jumbo buffer + * reg_ring_sz: size of regular buffer + * reg_ring_len: no. of entries in regular buffer + * jmb_ring_len: no. of entries in jumbo buffer + * jmb_ring_sz: size of jumbo buffer + */ +struct qlcnic_rds_mbx { + u32 phy_addr_reg_low; + u32 phy_addr_reg_high; + u32 phy_addr_jmb_low; + u32 phy_addr_jmb_high; +#if defined(__LITTLE_ENDIAN) + u16 reg_ring_sz; + u16 reg_ring_len; + u16 jmb_ring_sz; + u16 jmb_ring_len; +#elif defined(__BIG_ENDIAN) + u16 reg_ring_len; + u16 reg_ring_sz; + u16 jmb_ring_len; + u16 jmb_ring_sz; +#endif +} __packed; + +/* host producers for regular and jumbo rings */ +struct __host_producer_mbx { + u32 reg_buf; + u32 jmb_buf; +} __packed; + +/* Receive context mailbox data outbox registers + * @state: state of the context + * @vport_id: virtual port id + * @context_id: receive context id + * @num_pci_func: number of pci functions of the port + * @phy_port: physical port id + */ +struct qlcnic_rcv_mbx_out { +#if defined(__LITTLE_ENDIAN) + u8 rcv_num; + u8 sts_num; + u16 ctx_id; + u8 state; + u8 num_pci_func; + u8 phy_port; + u8 vport_id; +#elif defined(__BIG_ENDIAN) + u16 ctx_id; + u8 sts_num; + u8 rcv_num; + u8 vport_id; + u8 phy_port; + u8 num_pci_func; + u8 state; +#endif + u32 host_csmr[QLCNIC_MAX_SDS_RINGS]; + struct __host_producer_mbx host_prod[QLCNIC_MAX_SDS_RINGS]; +} __packed; + +struct qlcnic_add_rings_mbx_out { +#if defined(__LITTLE_ENDIAN) + u8 rcv_num; + u8 sts_num; + u16 ctx_id; +#elif defined(__BIG_ENDIAN) + u16 ctx_id; + u8 sts_num; + u8 rcv_num; +#endif + u32 host_csmr[QLCNIC_MAX_SDS_RINGS]; + struct __host_producer_mbx host_prod[QLCNIC_MAX_SDS_RINGS]; +} __packed; + +/* Transmit context mailbox inbox registers + * @phys_addr_{low|high}: DMA address of the transmit buffer + * @cnsmr_index_{low|high}: host consumer index + * @size: legth of transmit buffer ring + * @intr_id: interrupt id + * @src: src of interrupt + */ +struct qlcnic_tx_mbx { + u32 phys_addr_low; + u32 phys_addr_high; + u32 cnsmr_index_low; + u32 cnsmr_index_high; +#if defined(__LITTLE_ENDIAN) + u16 size; + u16 intr_id; + u8 src; + u8 rsvd[3]; +#elif defined(__BIG_ENDIAN) + u16 intr_id; + u16 size; + u8 rsvd[3]; + u8 src; +#endif +} __packed; + +/* Transmit context mailbox outbox registers + * @host_prod: host producer index + * @ctx_id: transmit context id + * @state: state of the transmit context + */ + +struct qlcnic_tx_mbx_out { + u32 host_prod; +#if defined(__LITTLE_ENDIAN) + u16 ctx_id; + u8 state; + u8 rsvd; +#elif defined(__BIG_ENDIAN) + u8 rsvd; + u8 state; + u16 ctx_id; +#endif +} __packed; + +struct qlcnic_intrpt_config { + u8 type; + u8 enabled; + 
u16 id; + u32 src; +}; + +struct qlcnic_macvlan_mbx { +#if defined(__LITTLE_ENDIAN) + u8 mac_addr0; + u8 mac_addr1; + u8 mac_addr2; + u8 mac_addr3; + u8 mac_addr4; + u8 mac_addr5; + u16 vlan; +#elif defined(__BIG_ENDIAN) + u8 mac_addr3; + u8 mac_addr2; + u8 mac_addr1; + u8 mac_addr0; + u16 vlan; + u8 mac_addr5; + u8 mac_addr4; +#endif +}; + +struct qlc_83xx_fw_info { + const struct firmware *fw; + char fw_file_name[QLC_FW_FILE_NAME_LEN]; +}; + +struct qlc_83xx_reset { + struct qlc_83xx_reset_hdr *hdr; + int seq_index; + int seq_error; + int array_index; + u32 array[QLC_83XX_MAX_RESET_SEQ_ENTRIES]; + u8 *buff; + u8 *stop_offset; + u8 *start_offset; + u8 *init_offset; + u8 seq_end; + u8 template_end; +}; + +#define QLC_83XX_IDC_DISABLE_FW_RESET_RECOVERY 0x1 +#define QLC_83XX_IDC_GRACEFULL_RESET 0x2 +#define QLC_83XX_IDC_DISABLE_FW_DUMP 0x4 +#define QLC_83XX_IDC_TIMESTAMP 0 +#define QLC_83XX_IDC_DURATION 1 +#define QLC_83XX_IDC_INIT_TIMEOUT_SECS 30 +#define QLC_83XX_IDC_RESET_ACK_TIMEOUT_SECS 10 +#define QLC_83XX_IDC_RESET_TIMEOUT_SECS 10 +#define QLC_83XX_IDC_QUIESCE_ACK_TIMEOUT_SECS 20 +#define QLC_83XX_IDC_FW_POLL_DELAY (1 * HZ) +#define QLC_83XX_IDC_FW_FAIL_THRESH 2 +#define QLC_83XX_IDC_MAX_FUNC_PER_PARTITION_INFO 8 +#define QLC_83XX_IDC_MAX_CNA_FUNCTIONS 16 +#define QLC_83XX_IDC_MAJOR_VERSION 1 +#define QLC_83XX_IDC_MINOR_VERSION 0 +#define QLC_83XX_IDC_FLASH_PARAM_ADDR 0x3e8020 + +struct qlcnic_adapter; +struct qlcnic_fw_dump; + +struct qlc_83xx_idc { + int (*state_entry) (struct qlcnic_adapter *); + u64 sec_counter; + u64 delay; + unsigned long status; + int err_code; + int collect_dump; + u8 curr_state; + u8 prev_state; + u8 vnic_state; + u8 vnic_wait_limit; + u8 quiesce_req; + u8 delay_reset; + char **name; +}; + +enum qlcnic_vlan_operations { + QLC_VLAN_ADD = 0, + QLC_VLAN_DELETE +}; + +/* Device States */ +enum qlcnic_83xx_states { + QLC_83XX_IDC_DEV_UNKNOWN, + QLC_83XX_IDC_DEV_COLD, + QLC_83XX_IDC_DEV_INIT, + QLC_83XX_IDC_DEV_READY, + QLC_83XX_IDC_DEV_NEED_RESET, + QLC_83XX_IDC_DEV_NEED_QUISCENT, + QLC_83XX_IDC_DEV_FAILED, + QLC_83XX_IDC_DEV_QUISCENT +}; + +#define QLCNIC_MBX_RSP(reg) LSW(reg) +#define QLCNIC_MBX_NUM_REGS(reg) (MSW(reg) & 0x1FF) +#define QLCNIC_MBX_STATUS(reg) (((reg) >> 25) & 0x7F) +#define QLCNIC_MBX_HOST(ahw, i) ((ahw)->pci_base0 + ((i) * 4)) +#define QLCNIC_MBX_FW(ahw, i) ((ahw)->pci_base0 + 0x800 + ((i) * 4)) + +/* Mailbox process AEN count */ +#define QLC_83XX_IDC_COMP_AEN 3 +#define QLC_83XX_MBX_AEN_CNT 5 +#define QLC_83XX_MODULE_LOADED 1 +#define QLC_83XX_MBX_READY 2 +#define QLC_83XX_MBX_AEN_ACK 3 +#define QLC_83XX_SFP_PRESENT(data) ((data) & 3) +#define QLC_83XX_SFP_ERR(data) (((data) >> 2) & 3) +#define QLC_83XX_SFP_MODULE_TYPE(data) (((data) >> 4) & 0x1F) +#define QLC_83XX_SFP_CU_LENGTH(data) (LSB((data) >> 16)) +#define QLC_83XX_SFP_TX_FAULT(data) ((data) & BIT_10) +#define QLC_83XX_LINK_STATS(data) ((data) & BIT_0) +#define QLC_83XX_CURRENT_LINK_SPEED(data) (((data) >> 3) & 7) +#define QLC_83XX_LINK_PAUSE(data) (((data) >> 6) & 3) +#define QLC_83XX_LINK_LB(data) (((data) >> 8) & 7) +#define QLC_83XX_LINK_FEC(data) ((data) & BIT_12) +#define QLC_83XX_LINK_EEE(data) ((data) & BIT_13) +#define QLC_83XX_DCBX(data) (((data) >> 28) & 7) +#define QLC_83XX_AUTONEG(data) ((data) & BIT_15) +#define QLC_83XX_TX_PAUSE 0x10 +#define QLC_83XX_RX_PAUSE 0x20 +#define QLC_83XX_TX_RX_PAUSE 0x30 +#define QLC_83XX_CFG_STD_PAUSE (1 << 5) +#define QLC_83XX_CFG_STD_TX_PAUSE (1 << 20) +#define QLC_83XX_CFG_STD_RX_PAUSE (2 << 20) +#define QLC_83XX_CFG_STD_TX_RX_PAUSE (3 << 
20) +#define QLC_83XX_ENABLE_AUTONEG (1 << 15) +#define QLC_83XX_CFG_LOOPBACK_HSS (2 << 1) +#define QLC_83XX_CFG_LOOPBACK_PHY (3 << 1) +#define QLC_83XX_CFG_LOOPBACK_EXT (4 << 1) + +/* LED configuration settings */ +#define QLC_83XX_ENABLE_BEACON 0xe +#define QLC_83XX_BEACON_ON 1 +#define QLC_83XX_BEACON_OFF 0 +#define QLC_83XX_LED_RATE 0xff +#define QLC_83XX_LED_ACT (1 << 10) +#define QLC_83XX_LED_MOD (0 << 13) +#define QLC_83XX_LED_CONFIG (QLC_83XX_LED_RATE | QLC_83XX_LED_ACT | \ + QLC_83XX_LED_MOD) + +#define QLC_83XX_10M_LINK 1 +#define QLC_83XX_100M_LINK 2 +#define QLC_83XX_1G_LINK 3 +#define QLC_83XX_10G_LINK 4 +#define QLC_83XX_STAT_TX 3 +#define QLC_83XX_STAT_RX 2 +#define QLC_83XX_STAT_MAC 1 +#define QLC_83XX_TX_STAT_REGS 14 +#define QLC_83XX_RX_STAT_REGS 40 +#define QLC_83XX_MAC_STAT_REGS 94 + +#define QLC_83XX_GET_FUNC_PRIVILEGE(VAL, FN) (0x3 & ((VAL) >> (FN * 2))) +#define QLC_83XX_SET_FUNC_OPMODE(VAL, FN) ((VAL) << (FN * 2)) +#define QLC_83XX_DEFAULT_OPMODE 0x55555555 +#define QLC_83XX_PRIVLEGED_FUNC 0x1 +#define QLC_83XX_VIRTUAL_FUNC 0x2 + +#define QLC_83XX_LB_MAX_FILTERS 2048 +#define QLC_83XX_LB_BUCKET_SIZE 256 +#define QLC_83XX_MINIMUM_VECTOR 3 +#define QLC_83XX_MAX_MC_COUNT 38 +#define QLC_83XX_MAX_UC_COUNT 4096 + +#define QLC_83XX_PVID_STRIP_CAPABILITY BIT_22 +#define QLC_83XX_GET_FUNC_MODE_FROM_NPAR_INFO(val) (val & 0x80000000) +#define QLC_83XX_GET_LRO_CAPABILITY(val) (val & 0x20) +#define QLC_83XX_GET_LSO_CAPABILITY(val) (val & 0x40) +#define QLC_83XX_GET_HW_LRO_CAPABILITY(val) (val & 0x400) +#define QLC_83XX_GET_VLAN_ALIGN_CAPABILITY(val) (val & 0x4000) +#define QLC_83XX_GET_FW_LRO_MSS_CAPABILITY(val) (val & 0x20000) +#define QLC_83XX_ESWITCH_CAPABILITY BIT_23 +#define QLC_83XX_SRIOV_MODE 0x1 +#define QLCNIC_BRDTYPE_83XX_10G 0x0083 + +#define QLC_83XX_FLASH_SPI_STATUS 0x2808E010 +#define QLC_83XX_FLASH_SPI_CONTROL 0x2808E014 +#define QLC_83XX_FLASH_STATUS 0x42100004 +#define QLC_83XX_FLASH_CONTROL 0x42110004 +#define QLC_83XX_FLASH_ADDR 0x42110008 +#define QLC_83XX_FLASH_WRDATA 0x4211000C +#define QLC_83XX_FLASH_RDDATA 0x42110018 +#define QLC_83XX_FLASH_DIRECT_WINDOW 0x42110030 +#define QLC_83XX_FLASH_DIRECT_DATA(DATA) (0x42150000 | (0x0000FFFF&DATA)) +#define QLC_83XX_FLASH_SECTOR_ERASE_CMD 0xdeadbeef +#define QLC_83XX_FLASH_WRITE_CMD 0xdacdacda +#define QLC_83XX_FLASH_BULK_WRITE_CMD 0xcadcadca +#define QLC_83XX_FLASH_READ_RETRY_COUNT 5000 +#define QLC_83XX_FLASH_STATUS_READY 0x6 +#define QLC_83XX_FLASH_WRITE_MIN 2 +#define QLC_83XX_FLASH_WRITE_MAX 64 +#define QLC_83XX_FLASH_STATUS_REG_POLL_DELAY 1 +#define QLC_83XX_ERASE_MODE 1 +#define QLC_83XX_WRITE_MODE 2 +#define QLC_83XX_BULK_WRITE_MODE 3 +#define QLC_83XX_FLASH_FDT_WRITE_DEF_SIG 0xFD0100 +#define QLC_83XX_FLASH_FDT_ERASE_DEF_SIG 0xFD0300 +#define QLC_83XX_FLASH_FDT_READ_MFG_ID_VAL 0xFD009F +#define QLC_83XX_FLASH_OEM_ERASE_SIG 0xFD03D8 +#define QLC_83XX_FLASH_OEM_WRITE_SIG 0xFD0101 +#define QLC_83XX_FLASH_OEM_READ_SIG 0xFD0005 +#define QLC_83XX_FLASH_ADDR_TEMP_VAL 0x00800000 +#define QLC_83XX_FLASH_ADDR_SECOND_TEMP_VAL 0x00800001 +#define QLC_83XX_FLASH_WRDATA_DEF 0x0 +#define QLC_83XX_FLASH_READ_CTRL 0x3F +#define QLC_83XX_FLASH_SPI_CTRL 0x4 +#define QLC_83XX_FLASH_FIRST_ERASE_MS_VAL 0x2 +#define QLC_83XX_FLASH_SECOND_ERASE_MS_VAL 0x5 +#define QLC_83XX_FLASH_LAST_ERASE_MS_VAL 0x3D +#define QLC_83XX_FLASH_FIRST_MS_PATTERN 0x43 +#define QLC_83XX_FLASH_SECOND_MS_PATTERN 0x7F +#define QLC_83XX_FLASH_LAST_MS_PATTERN 0x7D +#define QLC_83xx_FLASH_MAX_WAIT_USEC 100 +#define QLC_83XX_FLASH_LOCK_TIMEOUT 10000 + +enum 
qlc_83xx_mbx_cmd_type { + QLC_83XX_MBX_CMD_WAIT = 0, + QLC_83XX_MBX_CMD_NO_WAIT, + QLC_83XX_MBX_CMD_BUSY_WAIT, +}; + +enum qlc_83xx_mbx_response_states { + QLC_83XX_MBX_RESPONSE_WAIT = 0, + QLC_83XX_MBX_RESPONSE_ARRIVED, +}; + +#define QLC_83XX_MBX_RESPONSE_FAILED 0x2 +#define QLC_83XX_MBX_RESPONSE_UNKNOWN 0x3 + +/* Additional registers in 83xx */ +enum qlc_83xx_ext_regs { + QLCNIC_GLOBAL_RESET = 0, + QLCNIC_WILDCARD, + QLCNIC_INFORMANT, + QLCNIC_HOST_MBX_CTRL, + QLCNIC_FW_MBX_CTRL, + QLCNIC_BOOTLOADER_ADDR, + QLCNIC_BOOTLOADER_SIZE, + QLCNIC_FW_IMAGE_ADDR, + QLCNIC_MBX_INTR_ENBL, + QLCNIC_DEF_INT_MASK, + QLCNIC_DEF_INT_ID, + QLC_83XX_IDC_MAJ_VERSION, + QLC_83XX_IDC_DEV_STATE, + QLC_83XX_IDC_DRV_PRESENCE, + QLC_83XX_IDC_DRV_ACK, + QLC_83XX_IDC_CTRL, + QLC_83XX_IDC_DRV_AUDIT, + QLC_83XX_IDC_MIN_VERSION, + QLC_83XX_RECOVER_DRV_LOCK, + QLC_83XX_IDC_PF_0, + QLC_83XX_IDC_PF_1, + QLC_83XX_IDC_PF_2, + QLC_83XX_IDC_PF_3, + QLC_83XX_IDC_PF_4, + QLC_83XX_IDC_PF_5, + QLC_83XX_IDC_PF_6, + QLC_83XX_IDC_PF_7, + QLC_83XX_IDC_PF_8, + QLC_83XX_IDC_PF_9, + QLC_83XX_IDC_PF_10, + QLC_83XX_IDC_PF_11, + QLC_83XX_IDC_PF_12, + QLC_83XX_IDC_PF_13, + QLC_83XX_IDC_PF_14, + QLC_83XX_IDC_PF_15, + QLC_83XX_IDC_DEV_PARTITION_INFO_1, + QLC_83XX_IDC_DEV_PARTITION_INFO_2, + QLC_83XX_DRV_OP_MODE, + QLC_83XX_VNIC_STATE, + QLC_83XX_DRV_LOCK, + QLC_83XX_DRV_UNLOCK, + QLC_83XX_DRV_LOCK_ID, + QLC_83XX_ASIC_TEMP, +}; + +/* Initialize/Stop NIC command bit definitions */ +#define QLC_REGISTER_LB_IDC BIT_0 +#define QLC_REGISTER_DCB_AEN BIT_1 +#define QLC_83XX_MULTI_TENANCY_INFO BIT_29 +#define QLC_INIT_FW_RESOURCES BIT_31 + +/* 83xx funcitons */ +int qlcnic_83xx_get_fw_version(struct qlcnic_adapter *); +int qlcnic_83xx_issue_cmd(struct qlcnic_adapter *, struct qlcnic_cmd_args *); +int qlcnic_83xx_setup_intr(struct qlcnic_adapter *); +void qlcnic_83xx_get_func_no(struct qlcnic_adapter *); +int qlcnic_83xx_cam_lock(struct qlcnic_adapter *); +void qlcnic_83xx_cam_unlock(struct qlcnic_adapter *); +int qlcnic_send_ctrl_op(struct qlcnic_adapter *, struct qlcnic_cmd_args *, u32); +void qlcnic_83xx_add_sysfs(struct qlcnic_adapter *); +void qlcnic_83xx_remove_sysfs(struct qlcnic_adapter *); +void qlcnic_83xx_write_crb(struct qlcnic_adapter *, char *, loff_t, size_t); +void qlcnic_83xx_read_crb(struct qlcnic_adapter *, char *, loff_t, size_t); +int qlcnic_83xx_rd_reg_indirect(struct qlcnic_adapter *, ulong, int *); +int qlcnic_83xx_wrt_reg_indirect(struct qlcnic_adapter *, ulong, u32); +int qlcnic_83xx_nic_set_promisc(struct qlcnic_adapter *, u32); +int qlcnic_83xx_config_hw_lro(struct qlcnic_adapter *, int); +int qlcnic_83xx_config_rss(struct qlcnic_adapter *, int); +void qlcnic_83xx_change_l2_filter(struct qlcnic_adapter *, u64 *, u16); +int qlcnic_83xx_get_pci_info(struct qlcnic_adapter *, struct qlcnic_pci_info *); +int qlcnic_83xx_set_nic_info(struct qlcnic_adapter *, struct qlcnic_info *); +void qlcnic_83xx_initialize_nic(struct qlcnic_adapter *, int); + +int qlcnic_83xx_napi_add(struct qlcnic_adapter *, struct net_device *); +void qlcnic_83xx_napi_del(struct qlcnic_adapter *); +void qlcnic_83xx_napi_enable(struct qlcnic_adapter *); +void qlcnic_83xx_napi_disable(struct qlcnic_adapter *); +int qlcnic_83xx_config_led(struct qlcnic_adapter *, u32, u32); +int qlcnic_ind_wr(struct qlcnic_adapter *, u32, u32); +int qlcnic_ind_rd(struct qlcnic_adapter *, u32); +int qlcnic_83xx_create_rx_ctx(struct qlcnic_adapter *); +int qlcnic_83xx_create_tx_ctx(struct qlcnic_adapter *, + struct qlcnic_host_tx_ring *, int); +void 
qlcnic_83xx_del_rx_ctx(struct qlcnic_adapter *); +void qlcnic_83xx_del_tx_ctx(struct qlcnic_adapter *, + struct qlcnic_host_tx_ring *); +int qlcnic_83xx_get_nic_info(struct qlcnic_adapter *, struct qlcnic_info *, u8); +int qlcnic_83xx_setup_link_event(struct qlcnic_adapter *, int); +void qlcnic_83xx_process_rcv_ring_diag(struct qlcnic_host_sds_ring *); +int qlcnic_83xx_config_intrpt(struct qlcnic_adapter *, bool); +int qlcnic_83xx_sre_macaddr_change(struct qlcnic_adapter *, u8 *, u16, u8); +int qlcnic_83xx_get_mac_address(struct qlcnic_adapter *, u8 *, u8); +int qlcnic_83xx_alloc_mbx_args(struct qlcnic_cmd_args *, + struct qlcnic_adapter *, u32); +void qlcnic_free_mbx_args(struct qlcnic_cmd_args *); +void qlcnic_set_npar_data(struct qlcnic_adapter *, const struct qlcnic_info *, + struct qlcnic_info *); +int qlcnic_83xx_config_intr_coal(struct qlcnic_adapter *, + struct ethtool_coalesce *); +int qlcnic_83xx_set_rx_tx_intr_coal(struct qlcnic_adapter *); +int qlcnic_83xx_get_port_info(struct qlcnic_adapter *); +void qlcnic_83xx_enable_mbx_interrupt(struct qlcnic_adapter *); +void qlcnic_83xx_disable_mbx_intr(struct qlcnic_adapter *); +irqreturn_t qlcnic_83xx_clear_legacy_intr(struct qlcnic_adapter *); +irqreturn_t qlcnic_83xx_intr(int, void *); +irqreturn_t qlcnic_83xx_tmp_intr(int, void *); +void qlcnic_83xx_check_vf(struct qlcnic_adapter *, + const struct pci_device_id *); +int qlcnic_83xx_config_default_opmode(struct qlcnic_adapter *); +int qlcnic_83xx_setup_mbx_intr(struct qlcnic_adapter *); +void qlcnic_83xx_free_mbx_intr(struct qlcnic_adapter *); +void qlcnic_83xx_register_map(struct qlcnic_hardware_context *); +void qlcnic_83xx_idc_aen_work(struct work_struct *); +void qlcnic_83xx_config_ipaddr(struct qlcnic_adapter *, __be32, int); + +int qlcnic_83xx_erase_flash_sector(struct qlcnic_adapter *, u32); +int qlcnic_83xx_flash_bulk_write(struct qlcnic_adapter *, u32, u32 *, int); +int qlcnic_83xx_flash_write32(struct qlcnic_adapter *, u32, u32 *); +int qlcnic_83xx_lock_flash(struct qlcnic_adapter *); +void qlcnic_83xx_unlock_flash(struct qlcnic_adapter *); +int qlcnic_83xx_save_flash_status(struct qlcnic_adapter *); +int qlcnic_83xx_restore_flash_status(struct qlcnic_adapter *, int); +int qlcnic_83xx_read_flash_mfg_id(struct qlcnic_adapter *); +int qlcnic_83xx_read_flash_descriptor_table(struct qlcnic_adapter *); +int qlcnic_83xx_flash_read32(struct qlcnic_adapter *, u32, u8 *, int); +int qlcnic_83xx_lockless_flash_read32(struct qlcnic_adapter *, + u32, u8 *, int); +int qlcnic_83xx_init(struct qlcnic_adapter *, int); +int qlcnic_83xx_idc_ready_state_entry(struct qlcnic_adapter *); +void qlcnic_83xx_idc_poll_dev_state(struct work_struct *); +void qlcnic_83xx_idc_exit(struct qlcnic_adapter *); +void qlcnic_83xx_idc_request_reset(struct qlcnic_adapter *, u32); +int qlcnic_83xx_lock_driver(struct qlcnic_adapter *); +void qlcnic_83xx_unlock_driver(struct qlcnic_adapter *); +int qlcnic_83xx_set_default_offload_settings(struct qlcnic_adapter *); +int qlcnic_83xx_idc_vnic_pf_entry(struct qlcnic_adapter *); +int qlcnic_83xx_disable_vnic_mode(struct qlcnic_adapter *, int); +int qlcnic_83xx_config_vnic_opmode(struct qlcnic_adapter *); +int qlcnic_83xx_get_vnic_vport_info(struct qlcnic_adapter *, + struct qlcnic_info *, u8); +int qlcnic_83xx_get_vnic_pf_info(struct qlcnic_adapter *, struct qlcnic_info *); +int qlcnic_83xx_set_port_eswitch_status(struct qlcnic_adapter *, int, int *); + +void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *); +void qlcnic_83xx_get_stats(struct qlcnic_adapter 
*adapter, u64 *data); +int qlcnic_83xx_extend_md_capab(struct qlcnic_adapter *); +int qlcnic_83xx_get_link_ksettings(struct qlcnic_adapter *adapter, + struct ethtool_link_ksettings *ecmd); +int qlcnic_83xx_set_link_ksettings(struct qlcnic_adapter *adapter, + const struct ethtool_link_ksettings *ecmd); +void qlcnic_83xx_get_pauseparam(struct qlcnic_adapter *, + struct ethtool_pauseparam *); +int qlcnic_83xx_set_pauseparam(struct qlcnic_adapter *, + struct ethtool_pauseparam *); +int qlcnic_83xx_test_link(struct qlcnic_adapter *); +void qlcnic_83xx_get_port_type(struct qlcnic_adapter *adapter); +int qlcnic_83xx_reg_test(struct qlcnic_adapter *); +int qlcnic_83xx_get_regs_len(struct qlcnic_adapter *); +int qlcnic_83xx_get_registers(struct qlcnic_adapter *, u32 *); +int qlcnic_83xx_loopback_test(struct net_device *, u8); +int qlcnic_83xx_interrupt_test(struct net_device *); +int qlcnic_83xx_set_led(struct net_device *, enum ethtool_phys_id_state); +int qlcnic_83xx_flash_test(struct qlcnic_adapter *); +int qlcnic_83xx_enable_flash_write(struct qlcnic_adapter *); +int qlcnic_83xx_disable_flash_write(struct qlcnic_adapter *); +void qlcnic_83xx_enable_mbx_poll(struct qlcnic_adapter *); +void qlcnic_83xx_disable_mbx_poll(struct qlcnic_adapter *); +int qlcnic_83xx_idc_init(struct qlcnic_adapter *); +int qlcnic_83xx_idc_reattach_driver(struct qlcnic_adapter *); +int qlcnic_83xx_set_vnic_opmode(struct qlcnic_adapter *); +int qlcnic_83xx_check_vnic_state(struct qlcnic_adapter *); +void qlcnic_83xx_aer_stop_poll_work(struct qlcnic_adapter *); +int qlcnic_83xx_aer_reset(struct qlcnic_adapter *); +void qlcnic_83xx_aer_start_poll_work(struct qlcnic_adapter *); +u32 qlcnic_83xx_get_saved_state(void *, u32); +void qlcnic_83xx_set_saved_state(void *, u32, u32); +void qlcnic_83xx_cache_tmpl_hdr_values(struct qlcnic_fw_dump *); +u32 qlcnic_83xx_get_cap_size(void *, int); +void qlcnic_83xx_set_sys_info(void *, int, u32); +void qlcnic_83xx_store_cap_mask(void *, u32); +int qlcnic_ms_mem_write128(struct qlcnic_adapter *, u64, u32 *, u32); +#endif diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c new file mode 100644 index 0000000..a496390 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c @@ -0,0 +1,2612 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. 
+ */ + +#include "qlcnic_sriov.h" +#include "qlcnic.h" +#include "qlcnic_hw.h" + +/* Reset template definitions */ +#define QLC_83XX_RESTART_TEMPLATE_SIZE 0x2000 +#define QLC_83XX_RESET_TEMPLATE_ADDR 0x4F0000 +#define QLC_83XX_RESET_SEQ_VERSION 0x0101 + +#define QLC_83XX_OPCODE_NOP 0x0000 +#define QLC_83XX_OPCODE_WRITE_LIST 0x0001 +#define QLC_83XX_OPCODE_READ_WRITE_LIST 0x0002 +#define QLC_83XX_OPCODE_POLL_LIST 0x0004 +#define QLC_83XX_OPCODE_POLL_WRITE_LIST 0x0008 +#define QLC_83XX_OPCODE_READ_MODIFY_WRITE 0x0010 +#define QLC_83XX_OPCODE_SEQ_PAUSE 0x0020 +#define QLC_83XX_OPCODE_SEQ_END 0x0040 +#define QLC_83XX_OPCODE_TMPL_END 0x0080 +#define QLC_83XX_OPCODE_POLL_READ_LIST 0x0100 + +/* EPORT control registers */ +#define QLC_83XX_RESET_CONTROL 0x28084E50 +#define QLC_83XX_RESET_REG 0x28084E60 +#define QLC_83XX_RESET_PORT0 0x28084E70 +#define QLC_83XX_RESET_PORT1 0x28084E80 +#define QLC_83XX_RESET_PORT2 0x28084E90 +#define QLC_83XX_RESET_PORT3 0x28084EA0 +#define QLC_83XX_RESET_SRESHIM 0x28084EB0 +#define QLC_83XX_RESET_EPGSHIM 0x28084EC0 +#define QLC_83XX_RESET_ETHERPCS 0x28084ED0 + +static int qlcnic_83xx_init_default_driver(struct qlcnic_adapter *adapter); +static int qlcnic_83xx_check_heartbeat(struct qlcnic_adapter *p_dev); +static int qlcnic_83xx_restart_hw(struct qlcnic_adapter *adapter); +static int qlcnic_83xx_check_hw_status(struct qlcnic_adapter *p_dev); +static int qlcnic_83xx_get_reset_instruction_template(struct qlcnic_adapter *); +static void qlcnic_83xx_stop_hw(struct qlcnic_adapter *); + +/* Template header */ +struct qlc_83xx_reset_hdr { +#if defined(__LITTLE_ENDIAN) + u16 version; + u16 signature; + u16 size; + u16 entries; + u16 hdr_size; + u16 checksum; + u16 init_offset; + u16 start_offset; +#elif defined(__BIG_ENDIAN) + u16 signature; + u16 version; + u16 entries; + u16 size; + u16 checksum; + u16 hdr_size; + u16 start_offset; + u16 init_offset; +#endif +} __packed; + +/* Command entry header. 
*/ +struct qlc_83xx_entry_hdr { +#if defined(__LITTLE_ENDIAN) + u16 cmd; + u16 size; + u16 count; + u16 delay; +#elif defined(__BIG_ENDIAN) + u16 size; + u16 cmd; + u16 delay; + u16 count; +#endif +} __packed; + +/* Generic poll command */ +struct qlc_83xx_poll { + u32 mask; + u32 status; +} __packed; + +/* Read modify write command */ +struct qlc_83xx_rmw { + u32 mask; + u32 xor_value; + u32 or_value; +#if defined(__LITTLE_ENDIAN) + u8 shl; + u8 shr; + u8 index_a; + u8 rsvd; +#elif defined(__BIG_ENDIAN) + u8 rsvd; + u8 index_a; + u8 shr; + u8 shl; +#endif +} __packed; + +/* Generic command with 2 DWORD */ +struct qlc_83xx_entry { + u32 arg1; + u32 arg2; +} __packed; + +/* Generic command with 4 DWORD */ +struct qlc_83xx_quad_entry { + u32 dr_addr; + u32 dr_value; + u32 ar_addr; + u32 ar_value; +} __packed; +static const char *const qlc_83xx_idc_states[] = { + "Unknown", + "Cold", + "Init", + "Ready", + "Need Reset", + "Need Quiesce", + "Failed", + "Quiesce" +}; + +static int +qlcnic_83xx_idc_check_driver_presence_reg(struct qlcnic_adapter *adapter) +{ + u32 val; + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_DRV_PRESENCE); + if ((val & 0xFFFF)) + return 1; + else + return 0; +} + +static void qlcnic_83xx_idc_log_state_history(struct qlcnic_adapter *adapter) +{ + u32 cur, prev; + cur = adapter->ahw->idc.curr_state; + prev = adapter->ahw->idc.prev_state; + + dev_info(&adapter->pdev->dev, + "current state = %s, prev state = %s\n", + adapter->ahw->idc.name[cur], + adapter->ahw->idc.name[prev]); +} + +static int qlcnic_83xx_idc_update_audit_reg(struct qlcnic_adapter *adapter, + u8 mode, int lock) +{ + u32 val; + int seconds; + + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_DRV_AUDIT); + val |= (adapter->portnum & 0xf); + val |= mode << 7; + if (mode) + seconds = jiffies / HZ - adapter->ahw->idc.sec_counter; + else + seconds = jiffies / HZ; + + val |= seconds << 8; + QLCWRX(adapter->ahw, QLC_83XX_IDC_DRV_AUDIT, val); + adapter->ahw->idc.sec_counter = jiffies / HZ; + + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +static void qlcnic_83xx_idc_update_minor_version(struct qlcnic_adapter *adapter) +{ + u32 val; + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_MIN_VERSION); + val = val & ~(0x3 << (adapter->portnum * 2)); + val = val | (QLC_83XX_IDC_MINOR_VERSION << (adapter->portnum * 2)); + QLCWRX(adapter->ahw, QLC_83XX_IDC_MIN_VERSION, val); +} + +static int qlcnic_83xx_idc_update_major_version(struct qlcnic_adapter *adapter, + int lock) +{ + u32 val; + + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_MAJ_VERSION); + val = val & ~0xFF; + val = val | QLC_83XX_IDC_MAJOR_VERSION; + QLCWRX(adapter->ahw, QLC_83XX_IDC_MAJ_VERSION, val); + + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +static int +qlcnic_83xx_idc_update_drv_presence_reg(struct qlcnic_adapter *adapter, + int status, int lock) +{ + u32 val; + + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_DRV_PRESENCE); + + if (status) + val = val | (1 << adapter->portnum); + else + val = val & ~(1 << adapter->portnum); + + QLCWRX(adapter->ahw, QLC_83XX_IDC_DRV_PRESENCE, val); + qlcnic_83xx_idc_update_minor_version(adapter); + + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +static int qlcnic_83xx_idc_check_major_version(struct qlcnic_adapter *adapter) +{ + u32 val; + u8 version; + + val = 
QLCRDX(adapter->ahw, QLC_83XX_IDC_MAJ_VERSION); + version = val & 0xFF; + + if (version != QLC_83XX_IDC_MAJOR_VERSION) { + dev_info(&adapter->pdev->dev, + "%s:mismatch. version 0x%x, expected version 0x%x\n", + __func__, version, QLC_83XX_IDC_MAJOR_VERSION); + return -EIO; + } + + return 0; +} + +static int qlcnic_83xx_idc_clear_registers(struct qlcnic_adapter *adapter, + int lock) +{ + u32 val; + + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + QLCWRX(adapter->ahw, QLC_83XX_IDC_DRV_ACK, 0); + /* Clear graceful reset bit */ + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + val &= ~QLC_83XX_IDC_GRACEFULL_RESET; + QLCWRX(adapter->ahw, QLC_83XX_IDC_CTRL, val); + + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +static int qlcnic_83xx_idc_update_drv_ack_reg(struct qlcnic_adapter *adapter, + int flag, int lock) +{ + u32 val; + + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_DRV_ACK); + if (flag) + val = val | (1 << adapter->portnum); + else + val = val & ~(1 << adapter->portnum); + QLCWRX(adapter->ahw, QLC_83XX_IDC_DRV_ACK, val); + + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +static int qlcnic_83xx_idc_check_timeout(struct qlcnic_adapter *adapter, + int time_limit) +{ + u64 seconds; + + seconds = jiffies / HZ - adapter->ahw->idc.sec_counter; + if (seconds <= time_limit) + return 0; + else + return -EBUSY; +} + +/** + * qlcnic_83xx_idc_check_reset_ack_reg + * + * @adapter: adapter structure + * + * Check ACK wait limit and clear the functions which failed to ACK + * + * Return 0 if all functions have acknowledged the reset request. + **/ +static int qlcnic_83xx_idc_check_reset_ack_reg(struct qlcnic_adapter *adapter) +{ + int timeout; + u32 ack, presence, val; + + timeout = QLC_83XX_IDC_RESET_TIMEOUT_SECS; + ack = QLCRDX(adapter->ahw, QLC_83XX_IDC_DRV_ACK); + presence = QLCRDX(adapter->ahw, QLC_83XX_IDC_DRV_PRESENCE); + dev_info(&adapter->pdev->dev, + "%s: ack = 0x%x, presence = 0x%x\n", __func__, ack, presence); + if (!((ack & presence) == presence)) { + if (qlcnic_83xx_idc_check_timeout(adapter, timeout)) { + /* Clear functions which failed to ACK */ + dev_info(&adapter->pdev->dev, + "%s: ACK wait exceeds time limit\n", __func__); + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_DRV_PRESENCE); + val = val & ~(ack ^ presence); + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + QLCWRX(adapter->ahw, QLC_83XX_IDC_DRV_PRESENCE, val); + dev_info(&adapter->pdev->dev, + "%s: updated drv presence reg = 0x%x\n", + __func__, val); + qlcnic_83xx_unlock_driver(adapter); + return 0; + + } else { + return 1; + } + } else { + dev_info(&adapter->pdev->dev, + "%s: Reset ACK received from all functions\n", + __func__); + return 0; + } +} + +/** + * qlcnic_83xx_idc_tx_soft_reset + * + * @adapter: adapter structure + * + * Handle context deletion and recreation request from transmit routine + * + * Returns -EBUSY or Success (0) + * + **/ +static int qlcnic_83xx_idc_tx_soft_reset(struct qlcnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + + if (test_and_set_bit(__QLCNIC_RESETTING, &adapter->state)) + return -EBUSY; + + netif_device_detach(netdev); + qlcnic_down(adapter, netdev); + qlcnic_up(adapter, netdev); + netif_device_attach(netdev); + clear_bit(__QLCNIC_RESETTING, &adapter->state); + netdev_info(adapter->netdev, "%s: soft reset complete.\n", __func__); + + return 0; +} + +/** + * qlcnic_83xx_idc_detach_driver + * + * @adapter: adapter 
structure + * Detach net interface, stop TX and cleanup resources before the HW reset. + * Returns: None + * + **/ +static void qlcnic_83xx_idc_detach_driver(struct qlcnic_adapter *adapter) +{ + int i; + struct net_device *netdev = adapter->netdev; + + netif_device_detach(netdev); + qlcnic_83xx_detach_mailbox_work(adapter); + + /* Disable mailbox interrupt */ + qlcnic_83xx_disable_mbx_intr(adapter); + qlcnic_down(adapter, netdev); + for (i = 0; i < adapter->ahw->num_msix; i++) { + adapter->ahw->intr_tbl[i].id = i; + adapter->ahw->intr_tbl[i].enabled = 0; + adapter->ahw->intr_tbl[i].src = 0; + } + + if (qlcnic_sriov_pf_check(adapter)) + qlcnic_sriov_pf_reset(adapter); +} + +/** + * qlcnic_83xx_idc_attach_driver + * + * @adapter: adapter structure + * + * Re-attach and re-enable net interface + * Returns: None + * + **/ +static void qlcnic_83xx_idc_attach_driver(struct qlcnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + + if (netif_running(netdev)) { + if (qlcnic_up(adapter, netdev)) + goto done; + qlcnic_restore_indev_addr(netdev, NETDEV_UP); + } +done: + netif_device_attach(netdev); +} + +static int qlcnic_83xx_idc_enter_failed_state(struct qlcnic_adapter *adapter, + int lock) +{ + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + qlcnic_83xx_idc_clear_registers(adapter, 0); + QLCWRX(adapter->ahw, QLC_83XX_IDC_DEV_STATE, QLC_83XX_IDC_DEV_FAILED); + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + qlcnic_83xx_idc_log_state_history(adapter); + dev_info(&adapter->pdev->dev, "Device will enter failed state\n"); + + return 0; +} + +static int qlcnic_83xx_idc_enter_init_state(struct qlcnic_adapter *adapter, + int lock) +{ + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + QLCWRX(adapter->ahw, QLC_83XX_IDC_DEV_STATE, QLC_83XX_IDC_DEV_INIT); + + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +static int qlcnic_83xx_idc_enter_need_quiesce(struct qlcnic_adapter *adapter, + int lock) +{ + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + QLCWRX(adapter->ahw, QLC_83XX_IDC_DEV_STATE, + QLC_83XX_IDC_DEV_NEED_QUISCENT); + + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +static int +qlcnic_83xx_idc_enter_need_reset_state(struct qlcnic_adapter *adapter, int lock) +{ + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + QLCWRX(adapter->ahw, QLC_83XX_IDC_DEV_STATE, + QLC_83XX_IDC_DEV_NEED_RESET); + + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +static int qlcnic_83xx_idc_enter_ready_state(struct qlcnic_adapter *adapter, + int lock) +{ + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + QLCWRX(adapter->ahw, QLC_83XX_IDC_DEV_STATE, QLC_83XX_IDC_DEV_READY); + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +/** + * qlcnic_83xx_idc_find_reset_owner_id + * + * @adapter: adapter structure + * + * NIC gets precedence over ISCSI and ISCSI has precedence over FCOE. 
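+ * For example, if one NIC function and one FCoE function are present,
+ * the NIC function becomes the reset owner.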
+ * Within the same class, function with lowest PCI ID assumes ownership + * + * Returns: reset owner id or failure indication (-EIO) + * + **/ +static int qlcnic_83xx_idc_find_reset_owner_id(struct qlcnic_adapter *adapter) +{ + u32 reg, reg1, reg2, i, j, owner, class; + + reg1 = QLCRDX(adapter->ahw, QLC_83XX_IDC_DEV_PARTITION_INFO_1); + reg2 = QLCRDX(adapter->ahw, QLC_83XX_IDC_DEV_PARTITION_INFO_2); + owner = QLCNIC_TYPE_NIC; + i = 0; + j = 0; + reg = reg1; + + do { + class = (((reg & (0xF << j * 4)) >> j * 4) & 0x3); + if (class == owner) + break; + if (i == (QLC_83XX_IDC_MAX_FUNC_PER_PARTITION_INFO - 1)) { + reg = reg2; + j = 0; + } else { + j++; + } + + if (i == (QLC_83XX_IDC_MAX_CNA_FUNCTIONS - 1)) { + if (owner == QLCNIC_TYPE_NIC) + owner = QLCNIC_TYPE_ISCSI; + else if (owner == QLCNIC_TYPE_ISCSI) + owner = QLCNIC_TYPE_FCOE; + else if (owner == QLCNIC_TYPE_FCOE) + return -EIO; + reg = reg1; + j = 0; + i = 0; + } + } while (i++ < QLC_83XX_IDC_MAX_CNA_FUNCTIONS); + + return i; +} + +static int qlcnic_83xx_idc_restart_hw(struct qlcnic_adapter *adapter, int lock) +{ + int ret = 0; + + ret = qlcnic_83xx_restart_hw(adapter); + + if (ret) { + qlcnic_83xx_idc_enter_failed_state(adapter, lock); + } else { + qlcnic_83xx_idc_clear_registers(adapter, lock); + ret = qlcnic_83xx_idc_enter_ready_state(adapter, lock); + } + + return ret; +} + +static int qlcnic_83xx_idc_check_fan_failure(struct qlcnic_adapter *adapter) +{ + u32 status; + + status = QLC_SHARED_REG_RD32(adapter, QLCNIC_PEG_HALT_STATUS1); + + if (status & QLCNIC_RCODE_FATAL_ERROR) { + dev_err(&adapter->pdev->dev, + "peg halt status1=0x%x\n", status); + if (QLCNIC_FWERROR_CODE(status) == QLCNIC_FWERROR_FAN_FAILURE) { + dev_err(&adapter->pdev->dev, + "On board active cooling fan failed. " + "Device has been halted.\n"); + dev_err(&adapter->pdev->dev, + "Replace the adapter.\n"); + return -EIO; + } + } + + return 0; +} + +int qlcnic_83xx_idc_reattach_driver(struct qlcnic_adapter *adapter) +{ + int err; + + qlcnic_83xx_reinit_mbx_work(adapter->ahw->mailbox); + qlcnic_83xx_enable_mbx_interrupt(adapter); + + qlcnic_83xx_initialize_nic(adapter, 1); + + err = qlcnic_sriov_pf_reinit(adapter); + if (err) + return err; + + qlcnic_83xx_enable_mbx_interrupt(adapter); + + if (qlcnic_83xx_configure_opmode(adapter)) { + qlcnic_83xx_idc_enter_failed_state(adapter, 1); + return -EIO; + } + + if (adapter->nic_ops->init_driver(adapter)) { + qlcnic_83xx_idc_enter_failed_state(adapter, 1); + return -EIO; + } + + if (adapter->portnum == 0) + qlcnic_set_drv_version(adapter); + + qlcnic_dcb_get_info(adapter->dcb); + qlcnic_83xx_idc_attach_driver(adapter); + + return 0; +} + +static void qlcnic_83xx_idc_update_idc_params(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + + qlcnic_83xx_idc_update_drv_presence_reg(adapter, 1, 1); + qlcnic_83xx_idc_update_audit_reg(adapter, 0, 1); + set_bit(QLC_83XX_MODULE_LOADED, &adapter->ahw->idc.status); + + ahw->idc.quiesce_req = 0; + ahw->idc.delay = QLC_83XX_IDC_FW_POLL_DELAY; + ahw->idc.err_code = 0; + ahw->idc.collect_dump = 0; + ahw->reset_context = 0; + adapter->tx_timeo_cnt = 0; + ahw->idc.delay_reset = 0; + + clear_bit(__QLCNIC_RESETTING, &adapter->state); +} + +/** + * qlcnic_83xx_idc_ready_state_entry + * + * @adapter: adapter structure + * + * Perform ready state initialization, this routine will get invoked only + * once from READY state. 
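+ * (The driver re-attach below happens only when the previous IDC state was
+ * NEED_RESET or INIT; a READY to READY poll is a no-op here.)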
+ * + * Returns: Error code or Success(0) + * + **/ +int qlcnic_83xx_idc_ready_state_entry(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (ahw->idc.prev_state != QLC_83XX_IDC_DEV_READY) { + qlcnic_83xx_idc_update_idc_params(adapter); + /* Re-attach the device if required */ + if ((ahw->idc.prev_state == QLC_83XX_IDC_DEV_NEED_RESET) || + (ahw->idc.prev_state == QLC_83XX_IDC_DEV_INIT)) { + if (qlcnic_83xx_idc_reattach_driver(adapter)) + return -EIO; + } + } + + return 0; +} + +/** + * qlcnic_83xx_idc_vnic_pf_entry + * + * @adapter: adapter structure + * + * Ensure vNIC mode privileged function starts only after vNIC mode is + * enabled by management function. + * If vNIC mode is ready, start initialization. + * + * Returns: -EIO or 0 + * + **/ +int qlcnic_83xx_idc_vnic_pf_entry(struct qlcnic_adapter *adapter) +{ + u32 state; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + /* Privileged function waits till mgmt function enables VNIC mode */ + state = QLCRDX(adapter->ahw, QLC_83XX_VNIC_STATE); + if (state != QLCNIC_DEV_NPAR_OPER) { + if (!ahw->idc.vnic_wait_limit--) { + qlcnic_83xx_idc_enter_failed_state(adapter, 1); + return -EIO; + } + dev_info(&adapter->pdev->dev, "vNIC mode disabled\n"); + return -EIO; + + } else { + /* Perform one time initialization from ready state */ + if (ahw->idc.vnic_state != QLCNIC_DEV_NPAR_OPER) { + qlcnic_83xx_idc_update_idc_params(adapter); + + /* If the previous state is UNKNOWN, device will be + already attached properly by Init routine*/ + if (ahw->idc.prev_state != QLC_83XX_IDC_DEV_UNKNOWN) { + if (qlcnic_83xx_idc_reattach_driver(adapter)) + return -EIO; + } + adapter->ahw->idc.vnic_state = QLCNIC_DEV_NPAR_OPER; + dev_info(&adapter->pdev->dev, "vNIC mode enabled\n"); + } + } + + return 0; +} + +static int qlcnic_83xx_idc_unknown_state(struct qlcnic_adapter *adapter) +{ + adapter->ahw->idc.err_code = -EIO; + dev_err(&adapter->pdev->dev, + "%s: Device in unknown state\n", __func__); + clear_bit(__QLCNIC_RESETTING, &adapter->state); + return 0; +} + +/** + * qlcnic_83xx_idc_cold_state + * + * @adapter: adapter structure + * + * If HW is up and running device will enter READY state. + * If firmware image from host needs to be loaded, device is + * forced to start with the file firmware image. + * + * Returns: Error code or Success(0) + * + **/ +static int qlcnic_83xx_idc_cold_state_handler(struct qlcnic_adapter *adapter) +{ + qlcnic_83xx_idc_update_drv_presence_reg(adapter, 1, 0); + qlcnic_83xx_idc_update_audit_reg(adapter, 1, 0); + + if (qlcnic_load_fw_file) { + qlcnic_83xx_idc_restart_hw(adapter, 0); + } else { + if (qlcnic_83xx_check_hw_status(adapter)) { + qlcnic_83xx_idc_enter_failed_state(adapter, 0); + return -EIO; + } else { + qlcnic_83xx_idc_enter_ready_state(adapter, 0); + } + } + return 0; +} + +/** + * qlcnic_83xx_idc_init_state + * + * @adapter: adapter structure + * + * Reset owner will restart the device from this state. + * Device will enter failed state if it remains + * in this state for more than DEV_INIT time limit. 
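+ * Only the reset owner restarts the hardware here; non-owner functions just
+ * poll the DEV_INIT timeout and wait for the owner to bring the device up.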
+ * + * Returns: Error code or Success(0) + * + **/ +static int qlcnic_83xx_idc_init_state(struct qlcnic_adapter *adapter) +{ + int timeout, ret = 0; + u32 owner; + + timeout = QLC_83XX_IDC_INIT_TIMEOUT_SECS; + if (adapter->ahw->idc.prev_state == QLC_83XX_IDC_DEV_NEED_RESET) { + owner = qlcnic_83xx_idc_find_reset_owner_id(adapter); + if (adapter->ahw->pci_func == owner) + ret = qlcnic_83xx_idc_restart_hw(adapter, 1); + } else { + ret = qlcnic_83xx_idc_check_timeout(adapter, timeout); + } + + return ret; +} + +/** + * qlcnic_83xx_idc_ready_state + * + * @adapter: adapter structure + * + * Perform IDC protocol specified actions after monitoring device state and + * events. + * + * Returns: Error code or Success(0) + * + **/ +static int qlcnic_83xx_idc_ready_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_mailbox *mbx = ahw->mailbox; + int ret = 0; + u32 val; + + /* Perform NIC configuration based ready state entry actions */ + if (ahw->idc.state_entry(adapter)) + return -EIO; + + if (qlcnic_check_temp(adapter)) { + if (ahw->temp == QLCNIC_TEMP_PANIC) { + qlcnic_83xx_idc_check_fan_failure(adapter); + dev_err(&adapter->pdev->dev, + "Error: device temperature %d above limits\n", + adapter->ahw->temp); + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + set_bit(__QLCNIC_RESETTING, &adapter->state); + qlcnic_83xx_idc_detach_driver(adapter); + qlcnic_83xx_idc_enter_failed_state(adapter, 1); + return -EIO; + } + } + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + ret = qlcnic_83xx_check_heartbeat(adapter); + if (ret) { + adapter->flags |= QLCNIC_FW_HANG; + if (!(val & QLC_83XX_IDC_DISABLE_FW_RESET_RECOVERY)) { + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + set_bit(__QLCNIC_RESETTING, &adapter->state); + qlcnic_83xx_idc_enter_need_reset_state(adapter, 1); + } else { + netdev_info(adapter->netdev, "%s: Auto firmware recovery is disabled\n", + __func__); + qlcnic_83xx_idc_enter_failed_state(adapter, 1); + } + return -EIO; + } + + if ((val & QLC_83XX_IDC_GRACEFULL_RESET) || ahw->idc.collect_dump) { + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + + /* Move to need reset state and prepare for reset */ + qlcnic_83xx_idc_enter_need_reset_state(adapter, 1); + return ret; + } + + /* Check for soft reset request */ + if (ahw->reset_context && + !(val & QLC_83XX_IDC_DISABLE_FW_RESET_RECOVERY)) { + adapter->ahw->reset_context = 0; + qlcnic_83xx_idc_tx_soft_reset(adapter); + return ret; + } + + /* Move to need quiesce state if requested */ + if (adapter->ahw->idc.quiesce_req) { + qlcnic_83xx_idc_enter_need_quiesce(adapter, 1); + qlcnic_83xx_idc_update_audit_reg(adapter, 0, 1); + return ret; + } + + return ret; +} + +/** + * qlcnic_83xx_idc_need_reset_state + * + * @adapter: adapter structure + * + * Device will remain in this state until: + * Reset request ACKs are received from all the functions + * Wait time exceeds max time limit + * + * Returns: Error code or Success(0) + * + **/ +static int qlcnic_83xx_idc_need_reset_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_mailbox *mbx = adapter->ahw->mailbox; + int ret = 0; + + if (adapter->ahw->idc.prev_state != QLC_83XX_IDC_DEV_NEED_RESET) { + qlcnic_83xx_idc_update_audit_reg(adapter, 0, 1); + set_bit(__QLCNIC_RESETTING, &adapter->state); + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + if (adapter->ahw->nic_mode == QLCNIC_VNIC_MODE) + qlcnic_83xx_disable_vnic_mode(adapter, 1); + + if (qlcnic_check_diag_status(adapter)) { + dev_info(&adapter->pdev->dev, + "%s: Wait for diag 
completion\n", __func__); + adapter->ahw->idc.delay_reset = 1; + return 0; + } else { + qlcnic_83xx_idc_update_drv_ack_reg(adapter, 1, 1); + qlcnic_83xx_idc_detach_driver(adapter); + } + } + + if (qlcnic_check_diag_status(adapter)) { + dev_info(&adapter->pdev->dev, + "%s: Wait for diag completion\n", __func__); + return -1; + } else { + if (adapter->ahw->idc.delay_reset) { + qlcnic_83xx_idc_update_drv_ack_reg(adapter, 1, 1); + qlcnic_83xx_idc_detach_driver(adapter); + adapter->ahw->idc.delay_reset = 0; + } + + /* Check for ACK from other functions */ + ret = qlcnic_83xx_idc_check_reset_ack_reg(adapter); + if (ret) { + dev_info(&adapter->pdev->dev, + "%s: Waiting for reset ACK\n", __func__); + return -1; + } + } + + /* Transit to INIT state and restart the HW */ + qlcnic_83xx_idc_enter_init_state(adapter, 1); + + return ret; +} + +static int qlcnic_83xx_idc_need_quiesce_state(struct qlcnic_adapter *adapter) +{ + dev_err(&adapter->pdev->dev, "%s: TBD\n", __func__); + return 0; +} + +static void qlcnic_83xx_idc_failed_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u32 val, owner; + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + if (val & QLC_83XX_IDC_DISABLE_FW_RESET_RECOVERY) { + owner = qlcnic_83xx_idc_find_reset_owner_id(adapter); + if (ahw->pci_func == owner) { + qlcnic_83xx_stop_hw(adapter); + qlcnic_dump_fw(adapter); + } + } + + netdev_warn(adapter->netdev, "%s: Reboot will be required to recover the adapter!!\n", + __func__); + clear_bit(__QLCNIC_RESETTING, &adapter->state); + ahw->idc.err_code = -EIO; + + return; +} + +static int qlcnic_83xx_idc_quiesce_state(struct qlcnic_adapter *adapter) +{ + dev_info(&adapter->pdev->dev, "%s: TBD\n", __func__); + return 0; +} + +static int qlcnic_83xx_idc_check_state_validity(struct qlcnic_adapter *adapter, + u32 state) +{ + u32 cur, prev, next; + + cur = adapter->ahw->idc.curr_state; + prev = adapter->ahw->idc.prev_state; + next = state; + + if ((next < QLC_83XX_IDC_DEV_COLD) || + (next > QLC_83XX_IDC_DEV_QUISCENT)) { + dev_err(&adapter->pdev->dev, + "%s: curr %d, prev %d, next state %d is invalid\n", + __func__, cur, prev, state); + return 1; + } + + if ((cur == QLC_83XX_IDC_DEV_UNKNOWN) && + (prev == QLC_83XX_IDC_DEV_UNKNOWN)) { + if ((next != QLC_83XX_IDC_DEV_COLD) && + (next != QLC_83XX_IDC_DEV_READY)) { + dev_err(&adapter->pdev->dev, + "%s: failed, cur %d prev %d next %d\n", + __func__, cur, prev, next); + return 1; + } + } + + if (next == QLC_83XX_IDC_DEV_INIT) { + if ((prev != QLC_83XX_IDC_DEV_INIT) && + (prev != QLC_83XX_IDC_DEV_COLD) && + (prev != QLC_83XX_IDC_DEV_NEED_RESET)) { + dev_err(&adapter->pdev->dev, + "%s: failed, cur %d prev %d next %d\n", + __func__, cur, prev, next); + return 1; + } + } + + return 0; +} + +#define QLC_83XX_ENCAP_TYPE_VXLAN BIT_1 +#define QLC_83XX_MATCH_ENCAP_ID BIT_2 +#define QLC_83XX_SET_VXLAN_UDP_DPORT BIT_3 +#define QLC_83XX_VXLAN_UDP_DPORT(PORT) ((PORT & 0xffff) << 16) + +#define QLCNIC_ENABLE_INGRESS_ENCAP_PARSING 1 +#define QLCNIC_DISABLE_INGRESS_ENCAP_PARSING 0 + +static int qlcnic_set_vxlan_port(struct qlcnic_adapter *adapter) +{ + u16 port = adapter->ahw->vxlan_port; + struct qlcnic_cmd_args cmd; + int ret = 0; + + memset(&cmd, 0, sizeof(cmd)); + + ret = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_INIT_NIC_FUNC); + if (ret) + return ret; + + cmd.req.arg[1] = QLC_83XX_MULTI_TENANCY_INFO; + cmd.req.arg[2] = QLC_83XX_ENCAP_TYPE_VXLAN | + QLC_83XX_SET_VXLAN_UDP_DPORT | + QLC_83XX_VXLAN_UDP_DPORT(port); + + ret = qlcnic_issue_cmd(adapter, &cmd); 
+ if (ret) + netdev_err(adapter->netdev, + "Failed to set VXLAN port %d in adapter\n", + port); + + qlcnic_free_mbx_args(&cmd); + + return ret; +} + +static int qlcnic_set_vxlan_parsing(struct qlcnic_adapter *adapter, + bool state) +{ + u16 vxlan_port = adapter->ahw->vxlan_port; + struct qlcnic_cmd_args cmd; + int ret = 0; + + memset(&cmd, 0, sizeof(cmd)); + + ret = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_SET_INGRESS_ENCAP); + if (ret) + return ret; + + cmd.req.arg[1] = state ? QLCNIC_ENABLE_INGRESS_ENCAP_PARSING : + QLCNIC_DISABLE_INGRESS_ENCAP_PARSING; + + ret = qlcnic_issue_cmd(adapter, &cmd); + if (ret) + netdev_err(adapter->netdev, + "Failed to %s VXLAN parsing for port %d\n", + state ? "enable" : "disable", vxlan_port); + else + netdev_info(adapter->netdev, + "%s VXLAN parsing for port %d\n", + state ? "Enabled" : "Disabled", vxlan_port); + + qlcnic_free_mbx_args(&cmd); + + return ret; +} + +static void qlcnic_83xx_periodic_tasks(struct qlcnic_adapter *adapter) +{ + if (adapter->fhash.fnum) + qlcnic_prune_lb_filters(adapter); + + if (adapter->flags & QLCNIC_ADD_VXLAN_PORT) { + if (qlcnic_set_vxlan_port(adapter)) + return; + + if (qlcnic_set_vxlan_parsing(adapter, true)) + return; + + adapter->flags &= ~QLCNIC_ADD_VXLAN_PORT; + } else if (adapter->flags & QLCNIC_DEL_VXLAN_PORT) { + if (qlcnic_set_vxlan_parsing(adapter, false)) + return; + + adapter->ahw->vxlan_port = 0; + adapter->flags &= ~QLCNIC_DEL_VXLAN_PORT; + } +} + +/** + * qlcnic_83xx_idc_poll_dev_state + * + * @work: kernel work queue structure used to schedule the function + * + * Poll device state periodically and perform state specific + * actions defined by Inter Driver Communication (IDC) protocol. + * + * Returns: None + * + **/ +void qlcnic_83xx_idc_poll_dev_state(struct work_struct *work) +{ + struct qlcnic_adapter *adapter; + u32 state; + + adapter = container_of(work, struct qlcnic_adapter, fw_work.work); + state = QLCRDX(adapter->ahw, QLC_83XX_IDC_DEV_STATE); + + if (qlcnic_83xx_idc_check_state_validity(adapter, state)) { + qlcnic_83xx_idc_log_state_history(adapter); + adapter->ahw->idc.curr_state = QLC_83XX_IDC_DEV_UNKNOWN; + } else { + adapter->ahw->idc.curr_state = state; + } + + switch (adapter->ahw->idc.curr_state) { + case QLC_83XX_IDC_DEV_READY: + qlcnic_83xx_idc_ready_state(adapter); + break; + case QLC_83XX_IDC_DEV_NEED_RESET: + qlcnic_83xx_idc_need_reset_state(adapter); + break; + case QLC_83XX_IDC_DEV_NEED_QUISCENT: + qlcnic_83xx_idc_need_quiesce_state(adapter); + break; + case QLC_83XX_IDC_DEV_FAILED: + qlcnic_83xx_idc_failed_state(adapter); + return; + case QLC_83XX_IDC_DEV_INIT: + qlcnic_83xx_idc_init_state(adapter); + break; + case QLC_83XX_IDC_DEV_QUISCENT: + qlcnic_83xx_idc_quiesce_state(adapter); + break; + default: + qlcnic_83xx_idc_unknown_state(adapter); + return; + } + adapter->ahw->idc.prev_state = adapter->ahw->idc.curr_state; + qlcnic_83xx_periodic_tasks(adapter); + + /* Re-schedule the function */ + if (test_bit(QLC_83XX_MODULE_LOADED, &adapter->ahw->idc.status)) + qlcnic_schedule_work(adapter, qlcnic_83xx_idc_poll_dev_state, + adapter->ahw->idc.delay); +} + +static void qlcnic_83xx_setup_idc_parameters(struct qlcnic_adapter *adapter) +{ + u32 idc_params, val; + + if (qlcnic_83xx_flash_read32(adapter, QLC_83XX_IDC_FLASH_PARAM_ADDR, + (u8 *)&idc_params, 1)) { + dev_info(&adapter->pdev->dev, + "%s:failed to get IDC params from flash\n", __func__); + adapter->dev_init_timeo = QLC_83XX_IDC_INIT_TIMEOUT_SECS; + adapter->reset_ack_timeo = QLC_83XX_IDC_RESET_TIMEOUT_SECS; + } else { + 
adapter->dev_init_timeo = idc_params & 0xFFFF; + adapter->reset_ack_timeo = ((idc_params >> 16) & 0xFFFF); + } + + adapter->ahw->idc.curr_state = QLC_83XX_IDC_DEV_UNKNOWN; + adapter->ahw->idc.prev_state = QLC_83XX_IDC_DEV_UNKNOWN; + adapter->ahw->idc.delay = QLC_83XX_IDC_FW_POLL_DELAY; + adapter->ahw->idc.err_code = 0; + adapter->ahw->idc.collect_dump = 0; + adapter->ahw->idc.name = (char **)qlc_83xx_idc_states; + + clear_bit(__QLCNIC_RESETTING, &adapter->state); + set_bit(QLC_83XX_MODULE_LOADED, &adapter->ahw->idc.status); + + /* Check if reset recovery is disabled */ + if (!qlcnic_auto_fw_reset) { + /* Propagate do not reset request to other functions */ + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + val = val | QLC_83XX_IDC_DISABLE_FW_RESET_RECOVERY; + QLCWRX(adapter->ahw, QLC_83XX_IDC_CTRL, val); + } +} + +static int +qlcnic_83xx_idc_first_to_load_function_handler(struct qlcnic_adapter *adapter) +{ + u32 state, val; + + if (qlcnic_83xx_lock_driver(adapter)) + return -EIO; + + /* Clear driver lock register */ + QLCWRX(adapter->ahw, QLC_83XX_RECOVER_DRV_LOCK, 0); + if (qlcnic_83xx_idc_update_major_version(adapter, 0)) { + qlcnic_83xx_unlock_driver(adapter); + return -EIO; + } + + state = QLCRDX(adapter->ahw, QLC_83XX_IDC_DEV_STATE); + if (qlcnic_83xx_idc_check_state_validity(adapter, state)) { + qlcnic_83xx_unlock_driver(adapter); + return -EIO; + } + + if (state != QLC_83XX_IDC_DEV_COLD && qlcnic_load_fw_file) { + QLCWRX(adapter->ahw, QLC_83XX_IDC_DEV_STATE, + QLC_83XX_IDC_DEV_COLD); + state = QLC_83XX_IDC_DEV_COLD; + } + + adapter->ahw->idc.curr_state = state; + /* First to load function should cold boot the device */ + if (state == QLC_83XX_IDC_DEV_COLD) + qlcnic_83xx_idc_cold_state_handler(adapter); + + /* Check if reset recovery is enabled */ + if (qlcnic_auto_fw_reset) { + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + val = val & ~QLC_83XX_IDC_DISABLE_FW_RESET_RECOVERY; + QLCWRX(adapter->ahw, QLC_83XX_IDC_CTRL, val); + } + + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +int qlcnic_83xx_idc_init(struct qlcnic_adapter *adapter) +{ + int ret = -EIO; + + qlcnic_83xx_setup_idc_parameters(adapter); + + if (qlcnic_83xx_get_reset_instruction_template(adapter)) + return ret; + + if (!qlcnic_83xx_idc_check_driver_presence_reg(adapter)) { + if (qlcnic_83xx_idc_first_to_load_function_handler(adapter)) + return -EIO; + } else { + if (qlcnic_83xx_idc_check_major_version(adapter)) + return -EIO; + } + + qlcnic_83xx_idc_update_audit_reg(adapter, 0, 1); + + return 0; +} + +void qlcnic_83xx_idc_exit(struct qlcnic_adapter *adapter) +{ + int id; + u32 val; + + while (test_and_set_bit(__QLCNIC_RESETTING, &adapter->state)) + usleep_range(10000, 11000); + + id = QLCRDX(adapter->ahw, QLC_83XX_DRV_LOCK_ID); + id = id & 0xFF; + + if (id == adapter->portnum) { + dev_err(&adapter->pdev->dev, + "%s: wait for lock recovery.. 
%d\n", __func__, id); + msleep(20); + id = QLCRDX(adapter->ahw, QLC_83XX_DRV_LOCK_ID); + id = id & 0xFF; + } + + /* Clear driver presence bit */ + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_DRV_PRESENCE); + val = val & ~(1 << adapter->portnum); + QLCWRX(adapter->ahw, QLC_83XX_IDC_DRV_PRESENCE, val); + clear_bit(QLC_83XX_MODULE_LOADED, &adapter->ahw->idc.status); + clear_bit(__QLCNIC_RESETTING, &adapter->state); + + cancel_delayed_work_sync(&adapter->fw_work); +} + +void qlcnic_83xx_idc_request_reset(struct qlcnic_adapter *adapter, u32 key) +{ + u32 val; + + if (qlcnic_sriov_vf_check(adapter)) + return; + + if (qlcnic_83xx_lock_driver(adapter)) { + dev_err(&adapter->pdev->dev, + "%s:failed, please retry\n", __func__); + return; + } + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + if (val & QLC_83XX_IDC_DISABLE_FW_RESET_RECOVERY) { + netdev_info(adapter->netdev, "%s: Auto firmware recovery is disabled\n", + __func__); + qlcnic_83xx_idc_enter_failed_state(adapter, 0); + qlcnic_83xx_unlock_driver(adapter); + return; + } + + if (key == QLCNIC_FORCE_FW_RESET) { + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + val = val | QLC_83XX_IDC_GRACEFULL_RESET; + QLCWRX(adapter->ahw, QLC_83XX_IDC_CTRL, val); + } else if (key == QLCNIC_FORCE_FW_DUMP_KEY) { + adapter->ahw->idc.collect_dump = 1; + } + + qlcnic_83xx_unlock_driver(adapter); + return; +} + +static int qlcnic_83xx_copy_bootloader(struct qlcnic_adapter *adapter) +{ + u8 *p_cache; + u32 src, size; + u64 dest; + int ret = -EIO; + + src = QLC_83XX_BOOTLOADER_FLASH_ADDR; + dest = QLCRDX(adapter->ahw, QLCNIC_BOOTLOADER_ADDR); + size = QLCRDX(adapter->ahw, QLCNIC_BOOTLOADER_SIZE); + + /* alignment check */ + if (size & 0xF) + size = (size + 16) & ~0xF; + + p_cache = vzalloc(size); + if (p_cache == NULL) + return -ENOMEM; + + ret = qlcnic_83xx_lockless_flash_read32(adapter, src, p_cache, + size / sizeof(u32)); + if (ret) { + vfree(p_cache); + return ret; + } + /* 16 byte write to MS memory */ + ret = qlcnic_ms_mem_write128(adapter, dest, (u32 *)p_cache, + size / 16); + if (ret) { + vfree(p_cache); + return ret; + } + vfree(p_cache); + + return ret; +} + +static int qlcnic_83xx_copy_fw_file(struct qlcnic_adapter *adapter) +{ + struct qlc_83xx_fw_info *fw_info = adapter->ahw->fw_info; + const struct firmware *fw = fw_info->fw; + u32 dest, *p_cache, *temp; + int i, ret = -EIO; + __le32 *temp_le; + u8 data[16]; + size_t size; + u64 addr; + + temp = vzalloc(fw->size); + if (!temp) { + release_firmware(fw); + fw_info->fw = NULL; + return -ENOMEM; + } + + temp_le = (__le32 *)fw->data; + + /* FW image in file is in little endian, swap the data to nullify + * the effect of writel() operation on big endian platform. 
+ */ + for (i = 0; i < fw->size / sizeof(u32); i++) + temp[i] = __le32_to_cpu(temp_le[i]); + + dest = QLCRDX(adapter->ahw, QLCNIC_FW_IMAGE_ADDR); + size = (fw->size & ~0xF); + p_cache = temp; + addr = (u64)dest; + + ret = qlcnic_ms_mem_write128(adapter, addr, + p_cache, size / 16); + if (ret) { + dev_err(&adapter->pdev->dev, "MS memory write failed\n"); + goto exit; + } + + /* alignment check */ + if (fw->size & 0xF) { + addr = dest + size; + for (i = 0; i < (fw->size & 0xF); i++) + data[i] = ((u8 *)temp)[size + i]; + for (; i < 16; i++) + data[i] = 0; + ret = qlcnic_ms_mem_write128(adapter, addr, + (u32 *)data, 1); + if (ret) { + dev_err(&adapter->pdev->dev, + "MS memory write failed\n"); + goto exit; + } + } + +exit: + release_firmware(fw); + fw_info->fw = NULL; + vfree(temp); + + return ret; +} + +static void qlcnic_83xx_dump_pause_control_regs(struct qlcnic_adapter *adapter) +{ + int i, j; + u32 val = 0, val1 = 0, reg = 0; + int err = 0; + + val = QLCRD32(adapter, QLC_83XX_SRE_SHIM_REG, &err); + if (err == -EIO) + return; + dev_info(&adapter->pdev->dev, "SRE-Shim Ctrl:0x%x\n", val); + + for (j = 0; j < 2; j++) { + if (j == 0) { + dev_info(&adapter->pdev->dev, + "Port 0 RxB Pause Threshold Regs[TC7..TC0]:"); + reg = QLC_83XX_PORT0_THRESHOLD; + } else if (j == 1) { + dev_info(&adapter->pdev->dev, + "Port 1 RxB Pause Threshold Regs[TC7..TC0]:"); + reg = QLC_83XX_PORT1_THRESHOLD; + } + for (i = 0; i < 8; i++) { + val = QLCRD32(adapter, reg + (i * 0x4), &err); + if (err == -EIO) + return; + dev_info(&adapter->pdev->dev, "0x%x ", val); + } + dev_info(&adapter->pdev->dev, "\n"); + } + + for (j = 0; j < 2; j++) { + if (j == 0) { + dev_info(&adapter->pdev->dev, + "Port 0 RxB TC Max Cell Registers[4..1]:"); + reg = QLC_83XX_PORT0_TC_MC_REG; + } else if (j == 1) { + dev_info(&adapter->pdev->dev, + "Port 1 RxB TC Max Cell Registers[4..1]:"); + reg = QLC_83XX_PORT1_TC_MC_REG; + } + for (i = 0; i < 4; i++) { + val = QLCRD32(adapter, reg + (i * 0x4), &err); + if (err == -EIO) + return; + dev_info(&adapter->pdev->dev, "0x%x ", val); + } + dev_info(&adapter->pdev->dev, "\n"); + } + + for (j = 0; j < 2; j++) { + if (j == 0) { + dev_info(&adapter->pdev->dev, + "Port 0 RxB Rx TC Stats[TC7..TC0]:"); + reg = QLC_83XX_PORT0_TC_STATS; + } else if (j == 1) { + dev_info(&adapter->pdev->dev, + "Port 1 RxB Rx TC Stats[TC7..TC0]:"); + reg = QLC_83XX_PORT1_TC_STATS; + } + for (i = 7; i >= 0; i--) { + val = QLCRD32(adapter, reg, &err); + if (err == -EIO) + return; + val &= ~(0x7 << 29); /* Reset bits 29 to 31 */ + QLCWR32(adapter, reg, (val | (i << 29))); + val = QLCRD32(adapter, reg, &err); + if (err == -EIO) + return; + dev_info(&adapter->pdev->dev, "0x%x ", val); + } + dev_info(&adapter->pdev->dev, "\n"); + } + + val = QLCRD32(adapter, QLC_83XX_PORT2_IFB_THRESHOLD, &err); + if (err == -EIO) + return; + val1 = QLCRD32(adapter, QLC_83XX_PORT3_IFB_THRESHOLD, &err); + if (err == -EIO) + return; + dev_info(&adapter->pdev->dev, + "IFB-Pause Thresholds: Port 2:0x%x, Port 3:0x%x\n", + val, val1); +} + + +static void qlcnic_83xx_disable_pause_frames(struct qlcnic_adapter *adapter) +{ + u32 reg = 0, i, j; + + if (qlcnic_83xx_lock_driver(adapter)) { + dev_err(&adapter->pdev->dev, + "%s:failed to acquire driver lock\n", __func__); + return; + } + + qlcnic_83xx_dump_pause_control_regs(adapter); + QLCWR32(adapter, QLC_83XX_SRE_SHIM_REG, 0x0); + + for (j = 0; j < 2; j++) { + if (j == 0) + reg = QLC_83XX_PORT0_THRESHOLD; + else if (j == 1) + reg = QLC_83XX_PORT1_THRESHOLD; + + for (i = 0; i < 8; i++) + QLCWR32(adapter, reg + (i 
* 0x4), 0x0); + } + + for (j = 0; j < 2; j++) { + if (j == 0) + reg = QLC_83XX_PORT0_TC_MC_REG; + else if (j == 1) + reg = QLC_83XX_PORT1_TC_MC_REG; + + for (i = 0; i < 4; i++) + QLCWR32(adapter, reg + (i * 0x4), 0x03FF03FF); + } + + QLCWR32(adapter, QLC_83XX_PORT2_IFB_THRESHOLD, 0); + QLCWR32(adapter, QLC_83XX_PORT3_IFB_THRESHOLD, 0); + dev_info(&adapter->pdev->dev, + "Disabled pause frames successfully on all ports\n"); + qlcnic_83xx_unlock_driver(adapter); +} + +static void qlcnic_83xx_take_eport_out_of_reset(struct qlcnic_adapter *adapter) +{ + QLCWR32(adapter, QLC_83XX_RESET_REG, 0); + QLCWR32(adapter, QLC_83XX_RESET_PORT0, 0); + QLCWR32(adapter, QLC_83XX_RESET_PORT1, 0); + QLCWR32(adapter, QLC_83XX_RESET_PORT2, 0); + QLCWR32(adapter, QLC_83XX_RESET_PORT3, 0); + QLCWR32(adapter, QLC_83XX_RESET_SRESHIM, 0); + QLCWR32(adapter, QLC_83XX_RESET_EPGSHIM, 0); + QLCWR32(adapter, QLC_83XX_RESET_ETHERPCS, 0); + QLCWR32(adapter, QLC_83XX_RESET_CONTROL, 1); +} + +static int qlcnic_83xx_check_heartbeat(struct qlcnic_adapter *p_dev) +{ + u32 heartbeat, peg_status; + int retries, ret = -EIO, err = 0; + + retries = QLCNIC_HEARTBEAT_CHECK_RETRY_COUNT; + p_dev->heartbeat = QLC_SHARED_REG_RD32(p_dev, + QLCNIC_PEG_ALIVE_COUNTER); + + do { + msleep(QLCNIC_HEARTBEAT_PERIOD_MSECS); + heartbeat = QLC_SHARED_REG_RD32(p_dev, + QLCNIC_PEG_ALIVE_COUNTER); + if (heartbeat != p_dev->heartbeat) { + ret = QLCNIC_RCODE_SUCCESS; + break; + } + } while (--retries); + + if (ret) { + dev_err(&p_dev->pdev->dev, "firmware hang detected\n"); + qlcnic_83xx_take_eport_out_of_reset(p_dev); + qlcnic_83xx_disable_pause_frames(p_dev); + peg_status = QLC_SHARED_REG_RD32(p_dev, + QLCNIC_PEG_HALT_STATUS1); + dev_info(&p_dev->pdev->dev, "Dumping HW/FW registers\n" + "PEG_HALT_STATUS1: 0x%x, PEG_HALT_STATUS2: 0x%x,\n" + "PEG_NET_0_PC: 0x%x, PEG_NET_1_PC: 0x%x,\n" + "PEG_NET_2_PC: 0x%x, PEG_NET_3_PC: 0x%x,\n" + "PEG_NET_4_PC: 0x%x\n", peg_status, + QLC_SHARED_REG_RD32(p_dev, QLCNIC_PEG_HALT_STATUS2), + QLCRD32(p_dev, QLC_83XX_CRB_PEG_NET_0, &err), + QLCRD32(p_dev, QLC_83XX_CRB_PEG_NET_1, &err), + QLCRD32(p_dev, QLC_83XX_CRB_PEG_NET_2, &err), + QLCRD32(p_dev, QLC_83XX_CRB_PEG_NET_3, &err), + QLCRD32(p_dev, QLC_83XX_CRB_PEG_NET_4, &err)); + + if (QLCNIC_FWERROR_CODE(peg_status) == 0x67) + dev_err(&p_dev->pdev->dev, + "Device is being reset err code 0x00006700.\n"); + } + + return ret; +} + +static int qlcnic_83xx_check_cmd_peg_status(struct qlcnic_adapter *p_dev) +{ + int retries = QLCNIC_CMDPEG_CHECK_RETRY_COUNT; + u32 val; + + do { + val = QLC_SHARED_REG_RD32(p_dev, QLCNIC_CMDPEG_STATE); + if (val == QLC_83XX_CMDPEG_COMPLETE) + return 0; + msleep(QLCNIC_CMDPEG_CHECK_DELAY); + } while (--retries); + + dev_err(&p_dev->pdev->dev, "%s: failed, state = 0x%x\n", __func__, val); + return -EIO; +} + +static int qlcnic_83xx_check_hw_status(struct qlcnic_adapter *p_dev) +{ + int err; + + err = qlcnic_83xx_check_cmd_peg_status(p_dev); + if (err) + return err; + + err = qlcnic_83xx_check_heartbeat(p_dev); + if (err) + return err; + + return err; +} + +static int qlcnic_83xx_poll_reg(struct qlcnic_adapter *p_dev, u32 addr, + int duration, u32 mask, u32 status) +{ + int timeout_error, err = 0; + u32 value; + u8 retries; + + value = QLCRD32(p_dev, addr, &err); + if (err == -EIO) + return err; + retries = duration / 10; + + do { + if ((value & mask) != status) { + timeout_error = 1; + msleep(duration / 10); + value = QLCRD32(p_dev, addr, &err); + if (err == -EIO) + return err; + } else { + timeout_error = 0; + break; + } + } while (retries--); + + if 
(timeout_error) { + p_dev->ahw->reset.seq_error++; + dev_err(&p_dev->pdev->dev, + "%s: Timeout Err, entry_num = %d\n", + __func__, p_dev->ahw->reset.seq_index); + dev_err(&p_dev->pdev->dev, + "0x%08x 0x%08x 0x%08x\n", + value, mask, status); + } + + return timeout_error; +} + +static int qlcnic_83xx_reset_template_checksum(struct qlcnic_adapter *p_dev) +{ + u32 sum = 0; + u16 *buff = (u16 *)p_dev->ahw->reset.buff; + int count = p_dev->ahw->reset.hdr->size / sizeof(u16); + + while (count-- > 0) + sum += *buff++; + + while (sum >> 16) + sum = (sum & 0xFFFF) + (sum >> 16); + + if (~sum) { + return 0; + } else { + dev_err(&p_dev->pdev->dev, "%s: failed\n", __func__); + return -1; + } +} + +static int qlcnic_83xx_get_reset_instruction_template(struct qlcnic_adapter *p_dev) +{ + struct qlcnic_hardware_context *ahw = p_dev->ahw; + u32 addr, count, prev_ver, curr_ver; + u8 *p_buff; + + if (ahw->reset.buff != NULL) { + prev_ver = p_dev->fw_version; + curr_ver = qlcnic_83xx_get_fw_version(p_dev); + if (curr_ver > prev_ver) + kfree(ahw->reset.buff); + else + return 0; + } + + ahw->reset.seq_error = 0; + ahw->reset.buff = kzalloc(QLC_83XX_RESTART_TEMPLATE_SIZE, GFP_KERNEL); + if (p_dev->ahw->reset.buff == NULL) + return -ENOMEM; + + p_buff = p_dev->ahw->reset.buff; + addr = QLC_83XX_RESET_TEMPLATE_ADDR; + count = sizeof(struct qlc_83xx_reset_hdr) / sizeof(u32); + + /* Copy template header from flash */ + if (qlcnic_83xx_flash_read32(p_dev, addr, p_buff, count)) { + dev_err(&p_dev->pdev->dev, "%s: flash read failed\n", __func__); + return -EIO; + } + ahw->reset.hdr = (struct qlc_83xx_reset_hdr *)ahw->reset.buff; + addr = QLC_83XX_RESET_TEMPLATE_ADDR + ahw->reset.hdr->hdr_size; + p_buff = ahw->reset.buff + ahw->reset.hdr->hdr_size; + count = (ahw->reset.hdr->size - ahw->reset.hdr->hdr_size) / sizeof(u32); + + /* Copy rest of the template */ + if (qlcnic_83xx_flash_read32(p_dev, addr, p_buff, count)) { + dev_err(&p_dev->pdev->dev, "%s: flash read failed\n", __func__); + return -EIO; + } + + if (qlcnic_83xx_reset_template_checksum(p_dev)) + return -EIO; + /* Get Stop, Start and Init command offsets */ + ahw->reset.init_offset = ahw->reset.buff + ahw->reset.hdr->init_offset; + ahw->reset.start_offset = ahw->reset.buff + + ahw->reset.hdr->start_offset; + ahw->reset.stop_offset = ahw->reset.buff + ahw->reset.hdr->hdr_size; + return 0; +} + +/* Read Write HW register command */ +static void qlcnic_83xx_read_write_crb_reg(struct qlcnic_adapter *p_dev, + u32 raddr, u32 waddr) +{ + int err = 0; + u32 value; + + value = QLCRD32(p_dev, raddr, &err); + if (err == -EIO) + return; + qlcnic_83xx_wrt_reg_indirect(p_dev, waddr, value); +} + +/* Read Modify Write HW register command */ +static void qlcnic_83xx_rmw_crb_reg(struct qlcnic_adapter *p_dev, + u32 raddr, u32 waddr, + struct qlc_83xx_rmw *p_rmw_hdr) +{ + int err = 0; + u32 value; + + if (p_rmw_hdr->index_a) { + value = p_dev->ahw->reset.array[p_rmw_hdr->index_a]; + } else { + value = QLCRD32(p_dev, raddr, &err); + if (err == -EIO) + return; + } + + value &= p_rmw_hdr->mask; + value <<= p_rmw_hdr->shl; + value >>= p_rmw_hdr->shr; + value |= p_rmw_hdr->or_value; + value ^= p_rmw_hdr->xor_value; + qlcnic_83xx_wrt_reg_indirect(p_dev, waddr, value); +} + +/* Write HW register command */ +static void qlcnic_83xx_write_list(struct qlcnic_adapter *p_dev, + struct qlc_83xx_entry_hdr *p_hdr) +{ + int i; + struct qlc_83xx_entry *entry; + + entry = (struct qlc_83xx_entry *)((char *)p_hdr + + sizeof(struct qlc_83xx_entry_hdr)); + + for (i = 0; i < p_hdr->count; i++, entry++) 
{ + qlcnic_83xx_wrt_reg_indirect(p_dev, entry->arg1, + entry->arg2); + if (p_hdr->delay) + udelay((u32)(p_hdr->delay)); + } +} + +/* Read and Write instruction */ +static void qlcnic_83xx_read_write_list(struct qlcnic_adapter *p_dev, + struct qlc_83xx_entry_hdr *p_hdr) +{ + int i; + struct qlc_83xx_entry *entry; + + entry = (struct qlc_83xx_entry *)((char *)p_hdr + + sizeof(struct qlc_83xx_entry_hdr)); + + for (i = 0; i < p_hdr->count; i++, entry++) { + qlcnic_83xx_read_write_crb_reg(p_dev, entry->arg1, + entry->arg2); + if (p_hdr->delay) + udelay((u32)(p_hdr->delay)); + } +} + +/* Poll HW register command */ +static void qlcnic_83xx_poll_list(struct qlcnic_adapter *p_dev, + struct qlc_83xx_entry_hdr *p_hdr) +{ + long delay; + struct qlc_83xx_entry *entry; + struct qlc_83xx_poll *poll; + int i, err = 0; + unsigned long arg1, arg2; + + poll = (struct qlc_83xx_poll *)((char *)p_hdr + + sizeof(struct qlc_83xx_entry_hdr)); + + entry = (struct qlc_83xx_entry *)((char *)poll + + sizeof(struct qlc_83xx_poll)); + delay = (long)p_hdr->delay; + + if (!delay) { + for (i = 0; i < p_hdr->count; i++, entry++) + qlcnic_83xx_poll_reg(p_dev, entry->arg1, + delay, poll->mask, + poll->status); + } else { + for (i = 0; i < p_hdr->count; i++, entry++) { + arg1 = entry->arg1; + arg2 = entry->arg2; + if (delay) { + if (qlcnic_83xx_poll_reg(p_dev, + arg1, delay, + poll->mask, + poll->status)){ + QLCRD32(p_dev, arg1, &err); + if (err == -EIO) + return; + QLCRD32(p_dev, arg2, &err); + if (err == -EIO) + return; + } + } + } + } +} + +/* Poll and write HW register command */ +static void qlcnic_83xx_poll_write_list(struct qlcnic_adapter *p_dev, + struct qlc_83xx_entry_hdr *p_hdr) +{ + int i; + long delay; + struct qlc_83xx_quad_entry *entry; + struct qlc_83xx_poll *poll; + + poll = (struct qlc_83xx_poll *)((char *)p_hdr + + sizeof(struct qlc_83xx_entry_hdr)); + entry = (struct qlc_83xx_quad_entry *)((char *)poll + + sizeof(struct qlc_83xx_poll)); + delay = (long)p_hdr->delay; + + for (i = 0; i < p_hdr->count; i++, entry++) { + qlcnic_83xx_wrt_reg_indirect(p_dev, entry->dr_addr, + entry->dr_value); + qlcnic_83xx_wrt_reg_indirect(p_dev, entry->ar_addr, + entry->ar_value); + if (delay) + qlcnic_83xx_poll_reg(p_dev, entry->ar_addr, delay, + poll->mask, poll->status); + } +} + +/* Read Modify Write register command */ +static void qlcnic_83xx_read_modify_write(struct qlcnic_adapter *p_dev, + struct qlc_83xx_entry_hdr *p_hdr) +{ + int i; + struct qlc_83xx_entry *entry; + struct qlc_83xx_rmw *rmw_hdr; + + rmw_hdr = (struct qlc_83xx_rmw *)((char *)p_hdr + + sizeof(struct qlc_83xx_entry_hdr)); + + entry = (struct qlc_83xx_entry *)((char *)rmw_hdr + + sizeof(struct qlc_83xx_rmw)); + + for (i = 0; i < p_hdr->count; i++, entry++) { + qlcnic_83xx_rmw_crb_reg(p_dev, entry->arg1, + entry->arg2, rmw_hdr); + if (p_hdr->delay) + udelay((u32)(p_hdr->delay)); + } +} + +static void qlcnic_83xx_pause(struct qlc_83xx_entry_hdr *p_hdr) +{ + if (p_hdr->delay) + mdelay((u32)((long)p_hdr->delay)); +} + +/* Read and poll register command */ +static void qlcnic_83xx_poll_read_list(struct qlcnic_adapter *p_dev, + struct qlc_83xx_entry_hdr *p_hdr) +{ + long delay; + int index, i, j, err; + struct qlc_83xx_quad_entry *entry; + struct qlc_83xx_poll *poll; + unsigned long addr; + + poll = (struct qlc_83xx_poll *)((char *)p_hdr + + sizeof(struct qlc_83xx_entry_hdr)); + + entry = (struct qlc_83xx_quad_entry *)((char *)poll + + sizeof(struct qlc_83xx_poll)); + delay = (long)p_hdr->delay; + + for (i = 0; i < p_hdr->count; i++, entry++) { + 
qlcnic_83xx_wrt_reg_indirect(p_dev, entry->ar_addr, + entry->ar_value); + if (delay) { + if (!qlcnic_83xx_poll_reg(p_dev, entry->ar_addr, delay, + poll->mask, poll->status)){ + index = p_dev->ahw->reset.array_index; + addr = entry->dr_addr; + j = QLCRD32(p_dev, addr, &err); + if (err == -EIO) + return; + + p_dev->ahw->reset.array[index++] = j; + + if (index == QLC_83XX_MAX_RESET_SEQ_ENTRIES) + p_dev->ahw->reset.array_index = 1; + } + } + } +} + +static inline void qlcnic_83xx_seq_end(struct qlcnic_adapter *p_dev) +{ + p_dev->ahw->reset.seq_end = 1; +} + +static void qlcnic_83xx_template_end(struct qlcnic_adapter *p_dev) +{ + p_dev->ahw->reset.template_end = 1; + if (p_dev->ahw->reset.seq_error == 0) + dev_err(&p_dev->pdev->dev, + "HW restart process completed successfully.\n"); + else + dev_err(&p_dev->pdev->dev, + "HW restart completed with timeout errors.\n"); +} + +/** +* qlcnic_83xx_exec_template_cmd +* +* @p_dev: adapter structure +* @p_buff: Pointer to instruction template +* +* Template provides instructions to stop, restart and initialize firmware. +* These instructions are abstracted as a series of read, write and +* poll operations on hardware registers. Register information and operation +* specifics are not exposed to the driver. Driver reads the template from +* flash and executes the instructions located at pre-defined offsets. +* +* Returns: None +* */ +static void qlcnic_83xx_exec_template_cmd(struct qlcnic_adapter *p_dev, + char *p_buff) +{ + int index, entries; + struct qlc_83xx_entry_hdr *p_hdr; + char *entry = p_buff; + + p_dev->ahw->reset.seq_end = 0; + p_dev->ahw->reset.template_end = 0; + entries = p_dev->ahw->reset.hdr->entries; + index = p_dev->ahw->reset.seq_index; + + for (; (!p_dev->ahw->reset.seq_end) && (index < entries); index++) { + p_hdr = (struct qlc_83xx_entry_hdr *)entry; + + switch (p_hdr->cmd) { + case QLC_83XX_OPCODE_NOP: + break; + case QLC_83XX_OPCODE_WRITE_LIST: + qlcnic_83xx_write_list(p_dev, p_hdr); + break; + case QLC_83XX_OPCODE_READ_WRITE_LIST: + qlcnic_83xx_read_write_list(p_dev, p_hdr); + break; + case QLC_83XX_OPCODE_POLL_LIST: + qlcnic_83xx_poll_list(p_dev, p_hdr); + break; + case QLC_83XX_OPCODE_POLL_WRITE_LIST: + qlcnic_83xx_poll_write_list(p_dev, p_hdr); + break; + case QLC_83XX_OPCODE_READ_MODIFY_WRITE: + qlcnic_83xx_read_modify_write(p_dev, p_hdr); + break; + case QLC_83XX_OPCODE_SEQ_PAUSE: + qlcnic_83xx_pause(p_hdr); + break; + case QLC_83XX_OPCODE_SEQ_END: + qlcnic_83xx_seq_end(p_dev); + break; + case QLC_83XX_OPCODE_TMPL_END: + qlcnic_83xx_template_end(p_dev); + break; + case QLC_83XX_OPCODE_POLL_READ_LIST: + qlcnic_83xx_poll_read_list(p_dev, p_hdr); + break; + default: + dev_err(&p_dev->pdev->dev, + "%s: Unknown opcode 0x%04x in template %d\n", + __func__, p_hdr->cmd, index); + break; + } + entry += p_hdr->size; + } + p_dev->ahw->reset.seq_index = index; +} + +static void qlcnic_83xx_stop_hw(struct qlcnic_adapter *p_dev) +{ + p_dev->ahw->reset.seq_index = 0; + + qlcnic_83xx_exec_template_cmd(p_dev, p_dev->ahw->reset.stop_offset); + if (p_dev->ahw->reset.seq_end != 1) + dev_err(&p_dev->pdev->dev, "%s: failed\n", __func__); +} + +static void qlcnic_83xx_start_hw(struct qlcnic_adapter *p_dev) +{ + qlcnic_83xx_exec_template_cmd(p_dev, p_dev->ahw->reset.start_offset); + if (p_dev->ahw->reset.template_end != 1) + dev_err(&p_dev->pdev->dev, "%s: failed\n", __func__); +} + +static void qlcnic_83xx_init_hw(struct qlcnic_adapter *p_dev) +{ + qlcnic_83xx_exec_template_cmd(p_dev, p_dev->ahw->reset.init_offset); + if (p_dev->ahw->reset.seq_end 
!= 1) + dev_err(&p_dev->pdev->dev, "%s: failed\n", __func__); +} + +/* POST FW related definitions */ +#define QLC_83XX_POST_SIGNATURE_REG 0x41602014 +#define QLC_83XX_POST_MODE_REG 0x41602018 +#define QLC_83XX_POST_FAST_MODE 0 +#define QLC_83XX_POST_MEDIUM_MODE 1 +#define QLC_83XX_POST_SLOW_MODE 2 + +/* POST Timeout values in milliseconds */ +#define QLC_83XX_POST_FAST_MODE_TIMEOUT 690 +#define QLC_83XX_POST_MED_MODE_TIMEOUT 2930 +#define QLC_83XX_POST_SLOW_MODE_TIMEOUT 7500 + +/* POST result values */ +#define QLC_83XX_POST_PASS 0xfffffff0 +#define QLC_83XX_POST_ASIC_STRESS_TEST_FAIL 0xffffffff +#define QLC_83XX_POST_DDR_TEST_FAIL 0xfffffffe +#define QLC_83XX_POST_ASIC_MEMORY_TEST_FAIL 0xfffffffc +#define QLC_83XX_POST_FLASH_TEST_FAIL 0xfffffff8 + +static int qlcnic_83xx_run_post(struct qlcnic_adapter *adapter) +{ + struct qlc_83xx_fw_info *fw_info = adapter->ahw->fw_info; + struct device *dev = &adapter->pdev->dev; + int timeout, count, ret = 0; + u32 signature; + + /* Set timeout values with extra 2 seconds of buffer */ + switch (adapter->ahw->post_mode) { + case QLC_83XX_POST_FAST_MODE: + timeout = QLC_83XX_POST_FAST_MODE_TIMEOUT + 2000; + break; + case QLC_83XX_POST_MEDIUM_MODE: + timeout = QLC_83XX_POST_MED_MODE_TIMEOUT + 2000; + break; + case QLC_83XX_POST_SLOW_MODE: + timeout = QLC_83XX_POST_SLOW_MODE_TIMEOUT + 2000; + break; + default: + return -EINVAL; + } + + strncpy(fw_info->fw_file_name, QLC_83XX_POST_FW_FILE_NAME, + QLC_FW_FILE_NAME_LEN); + + ret = request_firmware(&fw_info->fw, fw_info->fw_file_name, dev); + if (ret) { + dev_err(dev, "POST firmware can not be loaded, skipping POST\n"); + return 0; + } + + ret = qlcnic_83xx_copy_fw_file(adapter); + if (ret) + return ret; + + /* clear QLC_83XX_POST_SIGNATURE_REG register */ + qlcnic_ind_wr(adapter, QLC_83XX_POST_SIGNATURE_REG, 0); + + /* Set POST mode */ + qlcnic_ind_wr(adapter, QLC_83XX_POST_MODE_REG, + adapter->ahw->post_mode); + + QLC_SHARED_REG_WR32(adapter, QLCNIC_FW_IMG_VALID, + QLC_83XX_BOOT_FROM_FILE); + + qlcnic_83xx_start_hw(adapter); + + count = 0; + do { + msleep(100); + count += 100; + + signature = qlcnic_ind_rd(adapter, QLC_83XX_POST_SIGNATURE_REG); + if (signature == QLC_83XX_POST_PASS) + break; + } while (timeout > count); + + if (timeout <= count) { + dev_err(dev, "POST timed out, signature = 0x%08x\n", signature); + return -EIO; + } + + switch (signature) { + case QLC_83XX_POST_PASS: + dev_info(dev, "POST passed, Signature = 0x%08x\n", signature); + break; + case QLC_83XX_POST_ASIC_STRESS_TEST_FAIL: + dev_err(dev, "POST failed, Test case : ASIC STRESS TEST, Signature = 0x%08x\n", + signature); + ret = -EIO; + break; + case QLC_83XX_POST_DDR_TEST_FAIL: + dev_err(dev, "POST failed, Test case : DDR TEST, Signature = 0x%08x\n", + signature); + ret = -EIO; + break; + case QLC_83XX_POST_ASIC_MEMORY_TEST_FAIL: + dev_err(dev, "POST failed, Test case : ASIC MEMORY TEST, Signature = 0x%08x\n", + signature); + ret = -EIO; + break; + case QLC_83XX_POST_FLASH_TEST_FAIL: + dev_err(dev, "POST failed, Test case : FLASH TEST, Signature = 0x%08x\n", + signature); + ret = -EIO; + break; + default: + dev_err(dev, "POST failed, Test case : INVALID, Signature = 0x%08x\n", + signature); + ret = -EIO; + break; + } + + return ret; +} + +static int qlcnic_83xx_load_fw_image_from_host(struct qlcnic_adapter *adapter) +{ + struct qlc_83xx_fw_info *fw_info = adapter->ahw->fw_info; + int err = -EIO; + + if (request_firmware(&fw_info->fw, fw_info->fw_file_name, + &(adapter->pdev->dev))) { + dev_err(&adapter->pdev->dev, + "No file FW image, 
loading flash FW image.\n"); + QLC_SHARED_REG_WR32(adapter, QLCNIC_FW_IMG_VALID, + QLC_83XX_BOOT_FROM_FLASH); + } else { + if (qlcnic_83xx_copy_fw_file(adapter)) + return err; + QLC_SHARED_REG_WR32(adapter, QLCNIC_FW_IMG_VALID, + QLC_83XX_BOOT_FROM_FILE); + } + + return 0; +} + +static int qlcnic_83xx_restart_hw(struct qlcnic_adapter *adapter) +{ + u32 val; + int err = -EIO; + + qlcnic_83xx_stop_hw(adapter); + + /* Collect FW register dump if required */ + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + if (!(val & QLC_83XX_IDC_GRACEFULL_RESET)) + qlcnic_dump_fw(adapter); + + if (val & QLC_83XX_IDC_DISABLE_FW_RESET_RECOVERY) { + netdev_info(adapter->netdev, "%s: Auto firmware recovery is disabled\n", + __func__); + qlcnic_83xx_idc_enter_failed_state(adapter, 1); + return err; + } + + qlcnic_83xx_init_hw(adapter); + + if (qlcnic_83xx_copy_bootloader(adapter)) + return err; + + /* Check if POST needs to be run */ + if (adapter->ahw->run_post) { + err = qlcnic_83xx_run_post(adapter); + if (err) + return err; + + /* No need to run POST in next reset sequence */ + adapter->ahw->run_post = false; + + /* Again reset the adapter to load regular firmware */ + qlcnic_83xx_stop_hw(adapter); + qlcnic_83xx_init_hw(adapter); + + err = qlcnic_83xx_copy_bootloader(adapter); + if (err) + return err; + } + + /* Boot either flash image or firmware image from host file system */ + if (qlcnic_load_fw_file == 1) { + if (qlcnic_83xx_load_fw_image_from_host(adapter)) + return err; + } else { + QLC_SHARED_REG_WR32(adapter, QLCNIC_FW_IMG_VALID, + QLC_83XX_BOOT_FROM_FLASH); + } + + qlcnic_83xx_start_hw(adapter); + if (qlcnic_83xx_check_hw_status(adapter)) + return -EIO; + + return 0; +} + +static int qlcnic_83xx_get_nic_configuration(struct qlcnic_adapter *adapter) +{ + int err; + struct qlcnic_info nic_info; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + memset(&nic_info, 0, sizeof(struct qlcnic_info)); + err = qlcnic_get_nic_info(adapter, &nic_info, ahw->pci_func); + if (err) + return -EIO; + + ahw->physical_port = (u8) nic_info.phys_port; + ahw->switch_mode = nic_info.switch_mode; + ahw->max_tx_ques = nic_info.max_tx_ques; + ahw->max_rx_ques = nic_info.max_rx_ques; + ahw->capabilities = nic_info.capabilities; + ahw->max_mac_filters = nic_info.max_mac_filters; + ahw->max_mtu = nic_info.max_mtu; + + /* eSwitch capability indicates vNIC mode. + * vNIC and SRIOV are mutually exclusive operational modes. + * If SR-IOV capability is detected, SR-IOV physical function + * will get initialized in default mode. + * SR-IOV virtual function initialization follows a + * different code path and opmode. + * SRIOV mode has precedence over vNIC mode. 
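+ * (Hence the __QLCNIC_SRIOV_CAPABLE check below returns the default opmode
+ * even when the eSwitch capability bit is set.)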
+ */ + if (test_bit(__QLCNIC_SRIOV_CAPABLE, &adapter->state)) + return QLC_83XX_DEFAULT_OPMODE; + + if (ahw->capabilities & QLC_83XX_ESWITCH_CAPABILITY) + return QLCNIC_VNIC_MODE; + + return QLC_83XX_DEFAULT_OPMODE; +} + +int qlcnic_83xx_configure_opmode(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u16 max_sds_rings, max_tx_rings; + int ret; + + ret = qlcnic_83xx_get_nic_configuration(adapter); + if (ret == -EIO) + return -EIO; + + if (ret == QLCNIC_VNIC_MODE) { + ahw->nic_mode = QLCNIC_VNIC_MODE; + + if (qlcnic_83xx_config_vnic_opmode(adapter)) + return -EIO; + + max_sds_rings = QLCNIC_MAX_VNIC_SDS_RINGS; + max_tx_rings = QLCNIC_MAX_VNIC_TX_RINGS; + } else if (ret == QLC_83XX_DEFAULT_OPMODE) { + ahw->nic_mode = QLCNIC_DEFAULT_MODE; + adapter->nic_ops->init_driver = qlcnic_83xx_init_default_driver; + ahw->idc.state_entry = qlcnic_83xx_idc_ready_state_entry; + max_sds_rings = QLCNIC_MAX_SDS_RINGS; + max_tx_rings = QLCNIC_MAX_TX_RINGS; + } else { + dev_err(&adapter->pdev->dev, "%s: Invalid opmode %d\n", + __func__, ret); + return -EIO; + } + + adapter->max_sds_rings = min(ahw->max_rx_ques, max_sds_rings); + adapter->max_tx_rings = min(ahw->max_tx_ques, max_tx_rings); + + return 0; +} + +static void qlcnic_83xx_config_buff_descriptors(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (ahw->port_type == QLCNIC_XGBE) { + adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_10G; + adapter->max_rxd = MAX_RCV_DESCRIPTORS_10G; + adapter->num_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_10G; + adapter->max_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_10G; + + } else if (ahw->port_type == QLCNIC_GBE) { + adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_1G; + adapter->num_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_1G; + adapter->max_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_1G; + adapter->max_rxd = MAX_RCV_DESCRIPTORS_1G; + } + adapter->num_txd = MAX_CMD_DESCRIPTORS; + adapter->max_rds_rings = MAX_RDS_RINGS; +} + +static int qlcnic_83xx_init_default_driver(struct qlcnic_adapter *adapter) +{ + int err = -EIO; + + qlcnic_83xx_get_minidump_template(adapter); + if (qlcnic_83xx_get_port_info(adapter)) + return err; + + qlcnic_83xx_config_buff_descriptors(adapter); + adapter->ahw->msix_supported = !!qlcnic_use_msi_x; + adapter->flags |= QLCNIC_ADAPTER_INITIALIZED; + + dev_info(&adapter->pdev->dev, "HAL Version: %d\n", + adapter->ahw->fw_hal_version); + + return 0; +} + +#define IS_QLC_83XX_USED(a, b, c) (((1 << a->portnum) & b) || ((c >> 6) & 0x1)) +static void qlcnic_83xx_clear_function_resources(struct qlcnic_adapter *adapter) +{ + struct qlcnic_cmd_args cmd; + u32 presence_mask, audit_mask; + int status; + + presence_mask = QLCRDX(adapter->ahw, QLC_83XX_IDC_DRV_PRESENCE); + audit_mask = QLCRDX(adapter->ahw, QLC_83XX_IDC_DRV_AUDIT); + + if (IS_QLC_83XX_USED(adapter, presence_mask, audit_mask)) { + status = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_STOP_NIC_FUNC); + if (status) + return; + + cmd.req.arg[1] = BIT_31; + status = qlcnic_issue_cmd(adapter, &cmd); + if (status) + dev_err(&adapter->pdev->dev, + "Failed to clean up the function resources\n"); + qlcnic_free_mbx_args(&cmd); + } +} + +static int qlcnic_83xx_get_fw_info(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct pci_dev *pdev = adapter->pdev; + struct qlc_83xx_fw_info *fw_info; + int err = 0; + + ahw->fw_info = kzalloc(sizeof(*fw_info), GFP_KERNEL); + if (!ahw->fw_info) { + err = -ENOMEM; + } else { + fw_info = ahw->fw_info; + switch 
(pdev->device) { + case PCI_DEVICE_ID_QLOGIC_QLE834X: + case PCI_DEVICE_ID_QLOGIC_QLE8830: + strncpy(fw_info->fw_file_name, QLC_83XX_FW_FILE_NAME, + QLC_FW_FILE_NAME_LEN); + break; + case PCI_DEVICE_ID_QLOGIC_QLE844X: + strncpy(fw_info->fw_file_name, QLC_84XX_FW_FILE_NAME, + QLC_FW_FILE_NAME_LEN); + break; + default: + dev_err(&pdev->dev, "%s: Invalid device id\n", + __func__); + err = -EINVAL; + break; + } + } + + return err; +} + +static void qlcnic_83xx_init_rings(struct qlcnic_adapter *adapter) +{ + u8 rx_cnt = QLCNIC_DEF_SDS_RINGS; + u8 tx_cnt = QLCNIC_DEF_TX_RINGS; + + adapter->max_tx_rings = QLCNIC_MAX_TX_RINGS; + adapter->max_sds_rings = QLCNIC_MAX_SDS_RINGS; + + if (!adapter->ahw->msix_supported) { + rx_cnt = QLCNIC_SINGLE_RING; + tx_cnt = QLCNIC_SINGLE_RING; + } + + /* compute and set drv sds rings */ + qlcnic_set_tx_ring_count(adapter, tx_cnt); + qlcnic_set_sds_ring_count(adapter, rx_cnt); +} + +int qlcnic_83xx_init(struct qlcnic_adapter *adapter, int pci_using_dac) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int err = 0; + + adapter->rx_mac_learn = false; + ahw->msix_supported = !!qlcnic_use_msi_x; + + /* Check if POST needs to be run */ + switch (qlcnic_load_fw_file) { + case 2: + ahw->post_mode = QLC_83XX_POST_FAST_MODE; + ahw->run_post = true; + break; + case 3: + ahw->post_mode = QLC_83XX_POST_MEDIUM_MODE; + ahw->run_post = true; + break; + case 4: + ahw->post_mode = QLC_83XX_POST_SLOW_MODE; + ahw->run_post = true; + break; + default: + ahw->run_post = false; + break; + } + + qlcnic_83xx_init_rings(adapter); + + err = qlcnic_83xx_init_mailbox_work(adapter); + if (err) + goto exit; + + if (qlcnic_sriov_vf_check(adapter)) { + err = qlcnic_sriov_vf_init(adapter, pci_using_dac); + if (err) + goto detach_mbx; + else + return err; + } + + if (qlcnic_83xx_read_flash_descriptor_table(adapter) || + qlcnic_83xx_read_flash_mfg_id(adapter)) { + dev_err(&adapter->pdev->dev, "Failed reading flash mfg id\n"); + err = -ENOTRECOVERABLE; + goto detach_mbx; + } + + err = qlcnic_83xx_check_hw_status(adapter); + if (err) + goto detach_mbx; + + err = qlcnic_83xx_get_fw_info(adapter); + if (err) + goto detach_mbx; + + err = qlcnic_83xx_idc_init(adapter); + if (err) + goto detach_mbx; + + err = qlcnic_setup_intr(adapter); + if (err) { + dev_err(&adapter->pdev->dev, "Failed to setup interrupt\n"); + goto disable_intr; + } + + INIT_DELAYED_WORK(&adapter->idc_aen_work, qlcnic_83xx_idc_aen_work); + + err = qlcnic_83xx_setup_mbx_intr(adapter); + if (err) + goto disable_mbx_intr; + + qlcnic_83xx_clear_function_resources(adapter); + qlcnic_dcb_enable(adapter->dcb); + qlcnic_83xx_initialize_nic(adapter, 1); + qlcnic_dcb_get_info(adapter->dcb); + + /* Configure default, SR-IOV or Virtual NIC mode of operation */ + err = qlcnic_83xx_configure_opmode(adapter); + if (err) + goto disable_mbx_intr; + + + /* Perform operating mode specific initialization */ + err = adapter->nic_ops->init_driver(adapter); + if (err) + goto disable_mbx_intr; + + /* Periodically monitor device status */ + qlcnic_83xx_idc_poll_dev_state(&adapter->fw_work.work); + return 0; + +disable_mbx_intr: + qlcnic_83xx_free_mbx_intr(adapter); + +disable_intr: + qlcnic_teardown_intr(adapter); + +detach_mbx: + qlcnic_83xx_detach_mailbox_work(adapter); + qlcnic_83xx_free_mailbox(ahw->mailbox); + ahw->mailbox = NULL; +exit: + return err; +} + +void qlcnic_83xx_aer_stop_poll_work(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlc_83xx_idc *idc = &ahw->idc; + + 
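+	/* AER recovery: stop IDC polling and AEN work and detach from the
+	 * device; qlcnic_83xx_aer_reset() below may re-initialize the
+	 * hardware once the slot reset completes.
+	 */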
clear_bit(QLC_83XX_MBX_READY, &idc->status); + cancel_delayed_work_sync(&adapter->fw_work); + + if (ahw->nic_mode == QLCNIC_VNIC_MODE) + qlcnic_83xx_disable_vnic_mode(adapter, 1); + + qlcnic_83xx_idc_detach_driver(adapter); + qlcnic_83xx_initialize_nic(adapter, 0); + + cancel_delayed_work_sync(&adapter->idc_aen_work); +} + +int qlcnic_83xx_aer_reset(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlc_83xx_idc *idc = &ahw->idc; + int ret = 0; + u32 owner; + + /* Mark the previous IDC state as NEED_RESET so + * that state_entry() will perform the reattachment + * and bringup the device + */ + idc->prev_state = QLC_83XX_IDC_DEV_NEED_RESET; + owner = qlcnic_83xx_idc_find_reset_owner_id(adapter); + if (ahw->pci_func == owner) { + ret = qlcnic_83xx_restart_hw(adapter); + if (ret < 0) + return ret; + qlcnic_83xx_idc_clear_registers(adapter, 0); + } + + ret = idc->state_entry(adapter); + return ret; +} + +void qlcnic_83xx_aer_start_poll_work(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlc_83xx_idc *idc = &ahw->idc; + u32 owner; + + idc->prev_state = QLC_83XX_IDC_DEV_READY; + owner = qlcnic_83xx_idc_find_reset_owner_id(adapter); + if (ahw->pci_func == owner) + qlcnic_83xx_idc_enter_ready_state(adapter, 0); + + qlcnic_schedule_work(adapter, qlcnic_83xx_idc_poll_dev_state, 0); +} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c new file mode 100644 index 0000000..3490675 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c @@ -0,0 +1,285 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. 
+ */ + +#include "qlcnic.h" +#include "qlcnic_hw.h" + +static int qlcnic_83xx_enable_vnic_mode(struct qlcnic_adapter *adapter, int lock) +{ + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + QLCWRX(adapter->ahw, QLC_83XX_VNIC_STATE, QLCNIC_DEV_NPAR_OPER); + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +int qlcnic_83xx_disable_vnic_mode(struct qlcnic_adapter *adapter, int lock) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (lock) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + } + + QLCWRX(adapter->ahw, QLC_83XX_VNIC_STATE, QLCNIC_DEV_NPAR_NON_OPER); + ahw->idc.vnic_state = QLCNIC_DEV_NPAR_NON_OPER; + + if (lock) + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +int qlcnic_83xx_set_vnic_opmode(struct qlcnic_adapter *adapter) +{ + u8 id; + int ret = -EBUSY; + u32 data = QLCNIC_MGMT_FUNC; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (qlcnic_83xx_lock_driver(adapter)) + return ret; + + id = ahw->pci_func; + data = QLCRDX(adapter->ahw, QLC_83XX_DRV_OP_MODE); + data = (data & ~QLC_83XX_SET_FUNC_OPMODE(0x3, id)) | + QLC_83XX_SET_FUNC_OPMODE(QLCNIC_MGMT_FUNC, id); + + QLCWRX(adapter->ahw, QLC_83XX_DRV_OP_MODE, data); + + qlcnic_83xx_unlock_driver(adapter); + + return 0; +} + +static void +qlcnic_83xx_config_vnic_buff_descriptors(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (ahw->port_type == QLCNIC_XGBE) { + adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_VF; + adapter->max_rxd = MAX_RCV_DESCRIPTORS_VF; + adapter->num_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_10G; + adapter->max_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_10G; + + } else if (ahw->port_type == QLCNIC_GBE) { + adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_1G; + adapter->num_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_1G; + adapter->max_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_1G; + adapter->max_rxd = MAX_RCV_DESCRIPTORS_1G; + } + adapter->num_txd = MAX_CMD_DESCRIPTORS; + adapter->max_rds_rings = MAX_RDS_RINGS; +} + + +/** + * qlcnic_83xx_init_mgmt_vnic + * + * @adapter: adapter structure + * Management virtual NIC sets the operational mode of other vNIC's and + * configures embedded switch (ESWITCH). + * Returns: Success(0) or error code. + * + **/ +static int qlcnic_83xx_init_mgmt_vnic(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct device *dev = &adapter->pdev->dev; + struct qlcnic_npar_info *npar; + int i, err = -EIO; + + qlcnic_83xx_get_minidump_template(adapter); + + if (!(adapter->flags & QLCNIC_ADAPTER_INITIALIZED)) { + if (qlcnic_init_pci_info(adapter)) + return err; + + npar = adapter->npars; + + for (i = 0; i < ahw->total_nic_func; i++, npar++) { + dev_info(dev, "id:%d active:%d type:%d port:%d min_bw:%d max_bw:%d mac_addr:%pM\n", + npar->pci_func, npar->active, npar->type, + npar->phy_port, npar->min_bw, npar->max_bw, + npar->mac); + } + + dev_info(dev, "Max functions = %d, active functions = %d\n", + ahw->max_pci_func, ahw->total_nic_func); + + if (qlcnic_83xx_set_vnic_opmode(adapter)) + return err; + + if (qlcnic_set_default_offload_settings(adapter)) + return err; + } else { + if (qlcnic_reset_npar_config(adapter)) + return err; + } + + if (qlcnic_83xx_get_port_info(adapter)) + return err; + + qlcnic_83xx_config_vnic_buff_descriptors(adapter); + ahw->msix_supported = qlcnic_use_msi_x ? 
1 : 0; + adapter->flags |= QLCNIC_ADAPTER_INITIALIZED; + qlcnic_83xx_enable_vnic_mode(adapter, 1); + + dev_info(dev, "HAL Version: %d, Management function\n", + ahw->fw_hal_version); + + return 0; +} + +static int qlcnic_83xx_init_privileged_vnic(struct qlcnic_adapter *adapter) +{ + int err = -EIO; + + qlcnic_83xx_get_minidump_template(adapter); + if (qlcnic_83xx_get_port_info(adapter)) + return err; + + qlcnic_83xx_config_vnic_buff_descriptors(adapter); + adapter->ahw->msix_supported = !!qlcnic_use_msi_x; + adapter->flags |= QLCNIC_ADAPTER_INITIALIZED; + + dev_info(&adapter->pdev->dev, + "HAL Version: %d, Privileged function\n", + adapter->ahw->fw_hal_version); + return 0; +} + +static int qlcnic_83xx_init_non_privileged_vnic(struct qlcnic_adapter *adapter) +{ + int err = -EIO; + + qlcnic_83xx_get_fw_version(adapter); + if (qlcnic_set_eswitch_port_config(adapter)) + return err; + + if (qlcnic_83xx_get_port_info(adapter)) + return err; + + qlcnic_83xx_config_vnic_buff_descriptors(adapter); + adapter->ahw->msix_supported = !!qlcnic_use_msi_x; + adapter->flags |= QLCNIC_ADAPTER_INITIALIZED; + + dev_info(&adapter->pdev->dev, "HAL Version: %d, Virtual function\n", + adapter->ahw->fw_hal_version); + + return 0; +} + +/** + * qlcnic_83xx_vnic_opmode + * + * @adapter: adapter structure + * Identify virtual NIC operational modes. + * + * Returns: Success(0) or error code. + * + **/ +int qlcnic_83xx_config_vnic_opmode(struct qlcnic_adapter *adapter) +{ + u32 op_mode, priv_level; + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_nic_template *nic_ops = adapter->nic_ops; + + qlcnic_get_func_no(adapter); + op_mode = QLCRDX(adapter->ahw, QLC_83XX_DRV_OP_MODE); + + if (op_mode == QLC_83XX_DEFAULT_OPMODE) + priv_level = QLCNIC_MGMT_FUNC; + else + priv_level = QLC_83XX_GET_FUNC_PRIVILEGE(op_mode, + ahw->pci_func); + switch (priv_level) { + case QLCNIC_NON_PRIV_FUNC: + ahw->op_mode = QLCNIC_NON_PRIV_FUNC; + ahw->idc.state_entry = qlcnic_83xx_idc_ready_state_entry; + nic_ops->init_driver = qlcnic_83xx_init_non_privileged_vnic; + break; + case QLCNIC_PRIV_FUNC: + ahw->op_mode = QLCNIC_PRIV_FUNC; + ahw->idc.state_entry = qlcnic_83xx_idc_vnic_pf_entry; + nic_ops->init_driver = qlcnic_83xx_init_privileged_vnic; + break; + case QLCNIC_MGMT_FUNC: + ahw->op_mode = QLCNIC_MGMT_FUNC; + ahw->idc.state_entry = qlcnic_83xx_idc_ready_state_entry; + nic_ops->init_driver = qlcnic_83xx_init_mgmt_vnic; + break; + default: + dev_err(&adapter->pdev->dev, "Invalid Virtual NIC opmode\n"); + return -EIO; + } + + if (ahw->capabilities & QLC_83XX_ESWITCH_CAPABILITY) { + adapter->flags |= QLCNIC_ESWITCH_ENABLED; + if (adapter->drv_mac_learn) + adapter->rx_mac_learn = true; + } else { + adapter->flags &= ~QLCNIC_ESWITCH_ENABLED; + adapter->rx_mac_learn = false; + } + + ahw->idc.vnic_state = QLCNIC_DEV_NPAR_NON_OPER; + ahw->idc.vnic_wait_limit = QLCNIC_DEV_NPAR_OPER_TIMEO; + + return 0; +} + +int qlcnic_83xx_check_vnic_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlc_83xx_idc *idc = &ahw->idc; + u32 state; + + state = QLCRDX(ahw, QLC_83XX_VNIC_STATE); + while (state != QLCNIC_DEV_NPAR_OPER && idc->vnic_wait_limit) { + idc->vnic_wait_limit--; + msleep(1000); + state = QLCRDX(ahw, QLC_83XX_VNIC_STATE); + } + + if (state != QLCNIC_DEV_NPAR_OPER) { + dev_err(&adapter->pdev->dev, + "vNIC mode not operational, state check timed out.\n"); + return -EIO; + } + + return 0; +} + +int qlcnic_83xx_set_port_eswitch_status(struct qlcnic_adapter *adapter, + int func, int 
*port_id) +{ + struct qlcnic_info nic_info; + int err = 0; + + memset(&nic_info, 0, sizeof(struct qlcnic_info)); + + err = qlcnic_get_nic_info(adapter, &nic_info, func); + if (err) + return err; + + if (nic_info.capabilities & QLC_83XX_ESWITCH_CAPABILITY) + *port_id = nic_info.phys_port; + else + err = -EIO; + + if (!err) + adapter->eswitch[*port_id].flags |= QLCNIC_SWITCH_ENABLE; + + return err; +} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c new file mode 100644 index 0000000..d344e9d --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c @@ -0,0 +1,1430 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#include "qlcnic.h" + +static const struct qlcnic_mailbox_metadata qlcnic_mbx_tbl[] = { + {QLCNIC_CMD_CREATE_RX_CTX, 4, 1}, + {QLCNIC_CMD_DESTROY_RX_CTX, 2, 1}, + {QLCNIC_CMD_CREATE_TX_CTX, 4, 1}, + {QLCNIC_CMD_DESTROY_TX_CTX, 3, 1}, + {QLCNIC_CMD_INTRPT_TEST, 4, 1}, + {QLCNIC_CMD_SET_MTU, 4, 1}, + {QLCNIC_CMD_READ_PHY, 4, 2}, + {QLCNIC_CMD_WRITE_PHY, 5, 1}, + {QLCNIC_CMD_READ_HW_REG, 4, 1}, + {QLCNIC_CMD_GET_FLOW_CTL, 4, 2}, + {QLCNIC_CMD_SET_FLOW_CTL, 4, 1}, + {QLCNIC_CMD_READ_MAX_MTU, 4, 2}, + {QLCNIC_CMD_READ_MAX_LRO, 4, 2}, + {QLCNIC_CMD_MAC_ADDRESS, 4, 3}, + {QLCNIC_CMD_GET_PCI_INFO, 4, 1}, + {QLCNIC_CMD_GET_NIC_INFO, 4, 1}, + {QLCNIC_CMD_SET_NIC_INFO, 4, 1}, + {QLCNIC_CMD_GET_ESWITCH_CAPABILITY, 4, 3}, + {QLCNIC_CMD_TOGGLE_ESWITCH, 4, 1}, + {QLCNIC_CMD_GET_ESWITCH_STATUS, 4, 3}, + {QLCNIC_CMD_SET_PORTMIRRORING, 4, 1}, + {QLCNIC_CMD_CONFIGURE_ESWITCH, 4, 1}, + {QLCNIC_CMD_GET_MAC_STATS, 4, 1}, + {QLCNIC_CMD_GET_ESWITCH_PORT_CONFIG, 4, 3}, + {QLCNIC_CMD_GET_ESWITCH_STATS, 4, 1}, + {QLCNIC_CMD_CONFIG_PORT, 4, 1}, + {QLCNIC_CMD_TEMP_SIZE, 4, 4}, + {QLCNIC_CMD_GET_TEMP_HDR, 4, 1}, + {QLCNIC_CMD_82XX_SET_DRV_VER, 4, 1}, + {QLCNIC_CMD_GET_LED_STATUS, 4, 2}, + {QLCNIC_CMD_MQ_TX_CONFIG_INTR, 2, 3}, + {QLCNIC_CMD_DCB_QUERY_CAP, 1, 2}, + {QLCNIC_CMD_DCB_QUERY_PARAM, 4, 1}, +}; + +static inline u32 qlcnic_get_cmd_signature(struct qlcnic_hardware_context *ahw) +{ + return (ahw->pci_func & 0xff) | ((ahw->fw_hal_version & 0xff) << 8) | + (0xcafe << 16); +} + +/* Allocate mailbox registers */ +int qlcnic_82xx_alloc_mbx_args(struct qlcnic_cmd_args *mbx, + struct qlcnic_adapter *adapter, u32 type) +{ + int i, size; + const struct qlcnic_mailbox_metadata *mbx_tbl; + + mbx_tbl = qlcnic_mbx_tbl; + size = ARRAY_SIZE(qlcnic_mbx_tbl); + for (i = 0; i < size; i++) { + if (type == mbx_tbl[i].cmd) { + mbx->req.num = mbx_tbl[i].in_args; + mbx->rsp.num = mbx_tbl[i].out_args; + mbx->req.arg = kcalloc(mbx->req.num, + sizeof(u32), GFP_ATOMIC); + if (!mbx->req.arg) + return -ENOMEM; + mbx->rsp.arg = kcalloc(mbx->rsp.num, + sizeof(u32), GFP_ATOMIC); + if (!mbx->rsp.arg) { + kfree(mbx->req.arg); + mbx->req.arg = NULL; + return -ENOMEM; + } + mbx->req.arg[0] = type; + break; + } + } + return 0; +} + +/* Free up mailbox registers */ +void qlcnic_free_mbx_args(struct qlcnic_cmd_args *cmd) +{ + kfree(cmd->req.arg); + cmd->req.arg = NULL; + kfree(cmd->rsp.arg); + cmd->rsp.arg = NULL; +} + +static u32 +qlcnic_poll_rsp(struct qlcnic_adapter *adapter) +{ + u32 rsp; + int timeout = 0, err = 0; + + do { + /* give atleast 1ms for firmware to respond */ + mdelay(1); + + if (++timeout > QLCNIC_OS_CRB_RETRY_COUNT) + return QLCNIC_CDRP_RSP_TIMEOUT; + + rsp = QLCRD32(adapter, QLCNIC_CDRP_CRB_OFFSET, &err); + } while (!QLCNIC_CDRP_IS_RSP(rsp)); + + 
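+	/* The firmware posted a CDRP response code; hand it back for
+	 * qlcnic_82xx_issue_cmd() to decode.
+	 */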
return rsp; +} + +int qlcnic_82xx_issue_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + int i, err = 0; + u32 rsp; + u32 signature; + struct pci_dev *pdev = adapter->pdev; + struct qlcnic_hardware_context *ahw = adapter->ahw; + const char *fmt; + + signature = qlcnic_get_cmd_signature(ahw); + + /* Acquire semaphore before accessing CRB */ + if (qlcnic_api_lock(adapter)) { + cmd->rsp.arg[0] = QLCNIC_RCODE_TIMEOUT; + return cmd->rsp.arg[0]; + } + + QLCWR32(adapter, QLCNIC_SIGN_CRB_OFFSET, signature); + for (i = 1; i < cmd->req.num; i++) + QLCWR32(adapter, QLCNIC_CDRP_ARG(i), cmd->req.arg[i]); + QLCWR32(adapter, QLCNIC_CDRP_CRB_OFFSET, + QLCNIC_CDRP_FORM_CMD(cmd->req.arg[0])); + rsp = qlcnic_poll_rsp(adapter); + + if (rsp == QLCNIC_CDRP_RSP_TIMEOUT) { + dev_err(&pdev->dev, "command timeout, response = 0x%x\n", rsp); + cmd->rsp.arg[0] = QLCNIC_RCODE_TIMEOUT; + } else if (rsp == QLCNIC_CDRP_RSP_FAIL) { + cmd->rsp.arg[0] = QLCRD32(adapter, QLCNIC_CDRP_ARG(1), &err); + switch (cmd->rsp.arg[0]) { + case QLCNIC_RCODE_INVALID_ARGS: + fmt = "CDRP invalid args: [%d]\n"; + break; + case QLCNIC_RCODE_NOT_SUPPORTED: + case QLCNIC_RCODE_NOT_IMPL: + fmt = "CDRP command not supported: [%d]\n"; + break; + case QLCNIC_RCODE_NOT_PERMITTED: + fmt = "CDRP requested action not permitted: [%d]\n"; + break; + case QLCNIC_RCODE_INVALID: + fmt = "CDRP invalid or unknown cmd received: [%d]\n"; + break; + case QLCNIC_RCODE_TIMEOUT: + fmt = "CDRP command timeout: [%d]\n"; + break; + default: + fmt = "CDRP command failed: [%d]\n"; + break; + } + dev_err(&pdev->dev, fmt, cmd->rsp.arg[0]); + qlcnic_dump_mbx(adapter, cmd); + } else if (rsp == QLCNIC_CDRP_RSP_OK) + cmd->rsp.arg[0] = QLCNIC_RCODE_SUCCESS; + + for (i = 1; i < cmd->rsp.num; i++) + cmd->rsp.arg[i] = QLCRD32(adapter, QLCNIC_CDRP_ARG(i), &err); + + /* Release semaphore */ + qlcnic_api_unlock(adapter); + return cmd->rsp.arg[0]; +} + +int qlcnic_fw_cmd_set_drv_version(struct qlcnic_adapter *adapter, u32 fw_cmd) +{ + struct qlcnic_cmd_args cmd; + u32 arg1, arg2, arg3; + char drv_string[12]; + int err = 0; + + memset(drv_string, 0, sizeof(drv_string)); + snprintf(drv_string, sizeof(drv_string), "%d"".""%d"".""%d", + _QLCNIC_LINUX_MAJOR, _QLCNIC_LINUX_MINOR, + _QLCNIC_LINUX_SUBVERSION); + + err = qlcnic_alloc_mbx_args(&cmd, adapter, fw_cmd); + if (err) + return err; + + memcpy(&arg1, drv_string, sizeof(u32)); + memcpy(&arg2, drv_string + 4, sizeof(u32)); + memcpy(&arg3, drv_string + 8, sizeof(u32)); + + cmd.req.arg[1] = arg1; + cmd.req.arg[2] = arg2; + cmd.req.arg[3] = arg3; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_info(&adapter->pdev->dev, + "Failed to set driver version in firmware\n"); + err = -EIO; + } + qlcnic_free_mbx_args(&cmd); + return err; +} + +int +qlcnic_fw_cmd_set_mtu(struct qlcnic_adapter *adapter, int mtu) +{ + int err = 0; + struct qlcnic_cmd_args cmd; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + + if (recv_ctx->state != QLCNIC_HOST_CTX_STATE_ACTIVE) + return err; + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_SET_MTU); + if (err) + return err; + + cmd.req.arg[1] = recv_ctx->context_id; + cmd.req.arg[2] = mtu; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_err(&adapter->pdev->dev, "Failed to set mtu\n"); + err = -EIO; + } + qlcnic_free_mbx_args(&cmd); + return err; +} + +int qlcnic_82xx_fw_cmd_create_rx_ctx(struct qlcnic_adapter *adapter) +{ + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_hardware_context *ahw = adapter->ahw; + dma_addr_t 
hostrq_phys_addr, cardrsp_phys_addr; + struct net_device *netdev = adapter->netdev; + u32 temp_intr_crb_mode, temp_rds_crb_mode; + struct qlcnic_cardrsp_rds_ring *prsp_rds; + struct qlcnic_cardrsp_sds_ring *prsp_sds; + struct qlcnic_hostrq_rds_ring *prq_rds; + struct qlcnic_hostrq_sds_ring *prq_sds; + struct qlcnic_host_rds_ring *rds_ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_cardrsp_rx_ctx *prsp; + struct qlcnic_hostrq_rx_ctx *prq; + u8 i, nrds_rings, nsds_rings; + struct qlcnic_cmd_args cmd; + size_t rq_size, rsp_size; + u32 cap, reg, val, reg2; + u64 phys_addr; + u16 temp_u16; + void *addr; + int err; + + nrds_rings = adapter->max_rds_rings; + nsds_rings = adapter->drv_sds_rings; + + rq_size = SIZEOF_HOSTRQ_RX(struct qlcnic_hostrq_rx_ctx, nrds_rings, + nsds_rings); + rsp_size = SIZEOF_CARDRSP_RX(struct qlcnic_cardrsp_rx_ctx, nrds_rings, + nsds_rings); + + addr = dma_alloc_coherent(&adapter->pdev->dev, rq_size, + &hostrq_phys_addr, GFP_KERNEL); + if (addr == NULL) + return -ENOMEM; + prq = addr; + + addr = dma_alloc_coherent(&adapter->pdev->dev, rsp_size, + &cardrsp_phys_addr, GFP_KERNEL); + if (addr == NULL) { + err = -ENOMEM; + goto out_free_rq; + } + prsp = addr; + + prq->host_rsp_dma_addr = cpu_to_le64(cardrsp_phys_addr); + + cap = (QLCNIC_CAP0_LEGACY_CONTEXT | QLCNIC_CAP0_LEGACY_MN + | QLCNIC_CAP0_VALIDOFF); + cap |= (QLCNIC_CAP0_JUMBO_CONTIGUOUS | QLCNIC_CAP0_LRO_CONTIGUOUS); + + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) { + cap |= QLCNIC_CAP0_TX_MULTI; + } else { + temp_u16 = offsetof(struct qlcnic_hostrq_rx_ctx, msix_handler); + prq->valid_field_offset = cpu_to_le16(temp_u16); + prq->txrx_sds_binding = nsds_rings - 1; + temp_intr_crb_mode = QLCNIC_HOST_INT_CRB_MODE_SHARED; + prq->host_int_crb_mode = cpu_to_le32(temp_intr_crb_mode); + temp_rds_crb_mode = QLCNIC_HOST_RDS_CRB_MODE_UNIQUE; + prq->host_rds_crb_mode = cpu_to_le32(temp_rds_crb_mode); + } + + prq->capabilities[0] = cpu_to_le32(cap); + + prq->num_rds_rings = cpu_to_le16(nrds_rings); + prq->num_sds_rings = cpu_to_le16(nsds_rings); + prq->rds_ring_offset = 0; + + val = le32_to_cpu(prq->rds_ring_offset) + + (sizeof(struct qlcnic_hostrq_rds_ring) * nrds_rings); + prq->sds_ring_offset = cpu_to_le32(val); + + prq_rds = (struct qlcnic_hostrq_rds_ring *)(prq->data + + le32_to_cpu(prq->rds_ring_offset)); + + for (i = 0; i < nrds_rings; i++) { + rds_ring = &recv_ctx->rds_rings[i]; + rds_ring->producer = 0; + prq_rds[i].host_phys_addr = cpu_to_le64(rds_ring->phys_addr); + prq_rds[i].ring_size = cpu_to_le32(rds_ring->num_desc); + prq_rds[i].ring_kind = cpu_to_le32(i); + prq_rds[i].buff_size = cpu_to_le64(rds_ring->dma_size); + } + + prq_sds = (struct qlcnic_hostrq_sds_ring *)(prq->data + + le32_to_cpu(prq->sds_ring_offset)); + + for (i = 0; i < nsds_rings; i++) { + sds_ring = &recv_ctx->sds_rings[i]; + sds_ring->consumer = 0; + memset(sds_ring->desc_head, 0, STATUS_DESC_RINGSIZE(sds_ring)); + prq_sds[i].host_phys_addr = cpu_to_le64(sds_ring->phys_addr); + prq_sds[i].ring_size = cpu_to_le32(sds_ring->num_desc); + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) + prq_sds[i].msi_index = cpu_to_le16(ahw->intr_tbl[i].id); + else + prq_sds[i].msi_index = cpu_to_le16(i); + } + + phys_addr = hostrq_phys_addr; + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_CREATE_RX_CTX); + if (err) + goto out_free_rsp; + + cmd.req.arg[1] = MSD(phys_addr); + cmd.req.arg[2] = LSD(phys_addr); + cmd.req.arg[3] = rq_size; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + 
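+		/* Firmware rejected the RX context request; the labels below
+		 * free both the request and response DMA buffers.
+		 */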
dev_err(&adapter->pdev->dev, + "Failed to create rx ctx in firmware%d\n", err); + goto out_free_rsp; + } + + prsp_rds = ((struct qlcnic_cardrsp_rds_ring *) + &prsp->data[le32_to_cpu(prsp->rds_ring_offset)]); + + for (i = 0; i < le16_to_cpu(prsp->num_rds_rings); i++) { + rds_ring = &recv_ctx->rds_rings[i]; + reg = le32_to_cpu(prsp_rds[i].host_producer_crb); + rds_ring->crb_rcv_producer = ahw->pci_base0 + reg; + } + + prsp_sds = ((struct qlcnic_cardrsp_sds_ring *) + &prsp->data[le32_to_cpu(prsp->sds_ring_offset)]); + + for (i = 0; i < le16_to_cpu(prsp->num_sds_rings); i++) { + sds_ring = &recv_ctx->sds_rings[i]; + reg = le32_to_cpu(prsp_sds[i].host_consumer_crb); + if (qlcnic_check_multi_tx(adapter) && !adapter->ahw->diag_test) + reg2 = ahw->intr_tbl[i].src; + else + reg2 = le32_to_cpu(prsp_sds[i].interrupt_crb); + + sds_ring->crb_intr_mask = ahw->pci_base0 + reg2; + sds_ring->crb_sts_consumer = ahw->pci_base0 + reg; + } + + recv_ctx->state = le32_to_cpu(prsp->host_ctx_state); + recv_ctx->context_id = le16_to_cpu(prsp->context_id); + recv_ctx->virt_port = prsp->virt_port; + + netdev_info(netdev, "Rx Context[%d] Created, state 0x%x\n", + recv_ctx->context_id, recv_ctx->state); + qlcnic_free_mbx_args(&cmd); + +out_free_rsp: + dma_free_coherent(&adapter->pdev->dev, rsp_size, prsp, + cardrsp_phys_addr); +out_free_rq: + dma_free_coherent(&adapter->pdev->dev, rq_size, prq, hostrq_phys_addr); + + return err; +} + +void qlcnic_82xx_fw_cmd_del_rx_ctx(struct qlcnic_adapter *adapter) +{ + int err; + struct qlcnic_cmd_args cmd; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DESTROY_RX_CTX); + if (err) + return; + + cmd.req.arg[1] = recv_ctx->context_id; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_err(&adapter->pdev->dev, + "Failed to destroy rx ctx in firmware\n"); + + recv_ctx->state = QLCNIC_HOST_CTX_STATE_FREED; + qlcnic_free_mbx_args(&cmd); +} + +int qlcnic_82xx_fw_cmd_create_tx_ctx(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring, + int ring) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct net_device *netdev = adapter->netdev; + struct qlcnic_hostrq_tx_ctx *prq; + struct qlcnic_hostrq_cds_ring *prq_cds; + struct qlcnic_cardrsp_tx_ctx *prsp; + struct qlcnic_cmd_args cmd; + u32 temp, intr_mask, temp_int_crb_mode; + dma_addr_t rq_phys_addr, rsp_phys_addr; + int temp_nsds_rings, index, err; + void *rq_addr, *rsp_addr; + size_t rq_size, rsp_size; + u64 phys_addr; + u16 msix_id; + + /* reset host resources */ + tx_ring->producer = 0; + tx_ring->sw_consumer = 0; + *(tx_ring->hw_consumer) = 0; + + rq_size = SIZEOF_HOSTRQ_TX(struct qlcnic_hostrq_tx_ctx); + rq_addr = dma_zalloc_coherent(&adapter->pdev->dev, rq_size, + &rq_phys_addr, GFP_KERNEL); + if (!rq_addr) + return -ENOMEM; + + rsp_size = SIZEOF_CARDRSP_TX(struct qlcnic_cardrsp_tx_ctx); + rsp_addr = dma_zalloc_coherent(&adapter->pdev->dev, rsp_size, + &rsp_phys_addr, GFP_KERNEL); + if (!rsp_addr) { + err = -ENOMEM; + goto out_free_rq; + } + + prq = rq_addr; + prsp = rsp_addr; + + prq->host_rsp_dma_addr = cpu_to_le64(rsp_phys_addr); + + temp = (QLCNIC_CAP0_LEGACY_CONTEXT | QLCNIC_CAP0_LEGACY_MN | + QLCNIC_CAP0_LSO); + if (qlcnic_check_multi_tx(adapter) && !adapter->ahw->diag_test) + temp |= QLCNIC_CAP0_TX_MULTI; + + prq->capabilities[0] = cpu_to_le32(temp); + + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) { + temp_nsds_rings = adapter->drv_sds_rings; + index = temp_nsds_rings + ring; + msix_id = 
ahw->intr_tbl[index].id; + prq->msi_index = cpu_to_le16(msix_id); + } else { + temp_int_crb_mode = QLCNIC_HOST_INT_CRB_MODE_SHARED; + prq->host_int_crb_mode = cpu_to_le32(temp_int_crb_mode); + prq->msi_index = 0; + } + + prq->interrupt_ctl = 0; + prq->cmd_cons_dma_addr = cpu_to_le64(tx_ring->hw_cons_phys_addr); + + prq_cds = &prq->cds_ring; + + prq_cds->host_phys_addr = cpu_to_le64(tx_ring->phys_addr); + prq_cds->ring_size = cpu_to_le32(tx_ring->num_desc); + + phys_addr = rq_phys_addr; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_CREATE_TX_CTX); + if (err) + goto out_free_rsp; + + cmd.req.arg[1] = MSD(phys_addr); + cmd.req.arg[2] = LSD(phys_addr); + cmd.req.arg[3] = rq_size; + err = qlcnic_issue_cmd(adapter, &cmd); + + if (err == QLCNIC_RCODE_SUCCESS) { + tx_ring->state = le32_to_cpu(prsp->host_ctx_state); + temp = le32_to_cpu(prsp->cds_ring.host_producer_crb); + tx_ring->crb_cmd_producer = adapter->ahw->pci_base0 + temp; + tx_ring->ctx_id = le16_to_cpu(prsp->context_id); + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test && + (adapter->flags & QLCNIC_MSIX_ENABLED)) { + index = adapter->drv_sds_rings + ring; + intr_mask = ahw->intr_tbl[index].src; + tx_ring->crb_intr_mask = ahw->pci_base0 + intr_mask; + } + + netdev_info(netdev, "Tx Context[0x%x] Created, state 0x%x\n", + tx_ring->ctx_id, tx_ring->state); + } else { + netdev_err(netdev, "Failed to create tx ctx in firmware%d\n", + err); + err = -EIO; + } + qlcnic_free_mbx_args(&cmd); + +out_free_rsp: + dma_free_coherent(&adapter->pdev->dev, rsp_size, rsp_addr, + rsp_phys_addr); +out_free_rq: + dma_free_coherent(&adapter->pdev->dev, rq_size, rq_addr, rq_phys_addr); + + return err; +} + +void qlcnic_82xx_fw_cmd_del_tx_ctx(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + struct qlcnic_cmd_args cmd; + int ret; + + ret = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DESTROY_TX_CTX); + if (ret) + return; + + cmd.req.arg[1] = tx_ring->ctx_id; + if (qlcnic_issue_cmd(adapter, &cmd)) + dev_err(&adapter->pdev->dev, + "Failed to destroy tx ctx in firmware\n"); + qlcnic_free_mbx_args(&cmd); +} + +int +qlcnic_fw_cmd_set_port(struct qlcnic_adapter *adapter, u32 config) +{ + int err; + struct qlcnic_cmd_args cmd; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_CONFIG_PORT); + if (err) + return err; + + cmd.req.arg[1] = config; + err = qlcnic_issue_cmd(adapter, &cmd); + qlcnic_free_mbx_args(&cmd); + return err; +} + +int qlcnic_alloc_hw_resources(struct qlcnic_adapter *adapter) +{ + void *addr; + int err, ring; + struct qlcnic_recv_context *recv_ctx; + struct qlcnic_host_rds_ring *rds_ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; + __le32 *ptr; + + struct pci_dev *pdev = adapter->pdev; + + recv_ctx = adapter->recv_ctx; + + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + ptr = (__le32 *)dma_alloc_coherent(&pdev->dev, sizeof(u32), + &tx_ring->hw_cons_phys_addr, + GFP_KERNEL); + if (ptr == NULL) { + err = -ENOMEM; + goto err_out_free; + } + + tx_ring->hw_consumer = ptr; + /* cmd desc ring */ + addr = dma_alloc_coherent(&pdev->dev, TX_DESC_RINGSIZE(tx_ring), + &tx_ring->phys_addr, + GFP_KERNEL); + if (addr == NULL) { + err = -ENOMEM; + goto err_out_free; + } + + tx_ring->desc_head = addr; + } + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + addr = dma_alloc_coherent(&adapter->pdev->dev, + RCV_DESC_RINGSIZE(rds_ring), + &rds_ring->phys_addr, GFP_KERNEL); + 
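+		/* Out of coherent DMA memory: release everything allocated so
+		 * far via qlcnic_free_hw_resources().
+		 */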
if (addr == NULL) { + err = -ENOMEM; + goto err_out_free; + } + rds_ring->desc_head = addr; + + } + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + + addr = dma_alloc_coherent(&adapter->pdev->dev, + STATUS_DESC_RINGSIZE(sds_ring), + &sds_ring->phys_addr, GFP_KERNEL); + if (addr == NULL) { + err = -ENOMEM; + goto err_out_free; + } + sds_ring->desc_head = addr; + } + + return 0; + +err_out_free: + qlcnic_free_hw_resources(adapter); + return err; +} + +int qlcnic_fw_create_ctx(struct qlcnic_adapter *dev) +{ + int i, err, ring; + + if (dev->flags & QLCNIC_NEED_FLR) { + pci_reset_function(dev->pdev); + dev->flags &= ~QLCNIC_NEED_FLR; + } + + if (qlcnic_83xx_check(dev) && (dev->flags & QLCNIC_MSIX_ENABLED)) { + if (dev->ahw->diag_test != QLCNIC_LOOPBACK_TEST) { + err = qlcnic_83xx_config_intrpt(dev, 1); + if (err) + return err; + } + } + + if (qlcnic_82xx_check(dev) && (dev->flags & QLCNIC_MSIX_ENABLED) && + qlcnic_check_multi_tx(dev) && !dev->ahw->diag_test) { + err = qlcnic_82xx_mq_intrpt(dev, 1); + if (err) + return err; + } + + err = qlcnic_fw_cmd_create_rx_ctx(dev); + if (err) + goto err_out; + + for (ring = 0; ring < dev->drv_tx_rings; ring++) { + err = qlcnic_fw_cmd_create_tx_ctx(dev, + &dev->tx_ring[ring], + ring); + if (err) { + qlcnic_fw_cmd_del_rx_ctx(dev); + if (ring == 0) + goto err_out; + + for (i = 0; i < ring; i++) + qlcnic_fw_cmd_del_tx_ctx(dev, &dev->tx_ring[i]); + + goto err_out; + } + } + + set_bit(__QLCNIC_FW_ATTACHED, &dev->state); + + return 0; + +err_out: + if (qlcnic_82xx_check(dev) && (dev->flags & QLCNIC_MSIX_ENABLED) && + qlcnic_check_multi_tx(dev) && !dev->ahw->diag_test) + qlcnic_82xx_config_intrpt(dev, 0); + + if (qlcnic_83xx_check(dev) && (dev->flags & QLCNIC_MSIX_ENABLED)) { + if (dev->ahw->diag_test != QLCNIC_LOOPBACK_TEST) + qlcnic_83xx_config_intrpt(dev, 0); + } + + return err; +} + +void qlcnic_fw_destroy_ctx(struct qlcnic_adapter *adapter) +{ + int ring; + + if (test_and_clear_bit(__QLCNIC_FW_ATTACHED, &adapter->state)) { + qlcnic_fw_cmd_del_rx_ctx(adapter); + for (ring = 0; ring < adapter->drv_tx_rings; ring++) + qlcnic_fw_cmd_del_tx_ctx(adapter, + &adapter->tx_ring[ring]); + + if (qlcnic_82xx_check(adapter) && + (adapter->flags & QLCNIC_MSIX_ENABLED) && + qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) + qlcnic_82xx_config_intrpt(adapter, 0); + + if (qlcnic_83xx_check(adapter) && + (adapter->flags & QLCNIC_MSIX_ENABLED)) { + if (adapter->ahw->diag_test != QLCNIC_LOOPBACK_TEST) + qlcnic_83xx_config_intrpt(adapter, 0); + } + /* Allow dma queues to drain after context reset */ + mdelay(20); + } +} + +void qlcnic_free_hw_resources(struct qlcnic_adapter *adapter) +{ + struct qlcnic_recv_context *recv_ctx; + struct qlcnic_host_rds_ring *rds_ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; + int ring; + + recv_ctx = adapter->recv_ctx; + + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + if (tx_ring->hw_consumer != NULL) { + dma_free_coherent(&adapter->pdev->dev, sizeof(u32), + tx_ring->hw_consumer, + tx_ring->hw_cons_phys_addr); + + tx_ring->hw_consumer = NULL; + } + + if (tx_ring->desc_head != NULL) { + dma_free_coherent(&adapter->pdev->dev, + TX_DESC_RINGSIZE(tx_ring), + tx_ring->desc_head, + tx_ring->phys_addr); + tx_ring->desc_head = NULL; + } + } + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + + if (rds_ring->desc_head != NULL) { + 
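+			/* Return the descriptor ring to the DMA pool and clear
+			 * the pointer so a repeated teardown is a no-op.
+			 */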
dma_free_coherent(&adapter->pdev->dev, + RCV_DESC_RINGSIZE(rds_ring), + rds_ring->desc_head, + rds_ring->phys_addr); + rds_ring->desc_head = NULL; + } + } + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + + if (sds_ring->desc_head != NULL) { + dma_free_coherent(&adapter->pdev->dev, + STATUS_DESC_RINGSIZE(sds_ring), + sds_ring->desc_head, + sds_ring->phys_addr); + sds_ring->desc_head = NULL; + } + } +} + +int qlcnic_82xx_config_intrpt(struct qlcnic_adapter *adapter, u8 op_type) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct net_device *netdev = adapter->netdev; + struct qlcnic_cmd_args cmd; + u32 type, val; + int i, err = 0; + + for (i = 0; i < ahw->num_msix; i++) { + err = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_MQ_TX_CONFIG_INTR); + if (err) + return err; + type = op_type ? QLCNIC_INTRPT_ADD : QLCNIC_INTRPT_DEL; + val = type | (ahw->intr_tbl[i].type << 4); + if (ahw->intr_tbl[i].type == QLCNIC_INTRPT_MSIX) + val |= (ahw->intr_tbl[i].id << 16); + cmd.req.arg[1] = val; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + netdev_err(netdev, "Failed to %s interrupts %d\n", + op_type == QLCNIC_INTRPT_ADD ? "Add" : + "Delete", err); + qlcnic_free_mbx_args(&cmd); + return err; + } + val = cmd.rsp.arg[1]; + if (LSB(val)) { + netdev_info(netdev, + "failed to configure interrupt for %d\n", + ahw->intr_tbl[i].id); + continue; + } + if (op_type) { + ahw->intr_tbl[i].id = MSW(val); + ahw->intr_tbl[i].enabled = 1; + ahw->intr_tbl[i].src = cmd.rsp.arg[2]; + } else { + ahw->intr_tbl[i].id = i; + ahw->intr_tbl[i].enabled = 0; + ahw->intr_tbl[i].src = 0; + } + qlcnic_free_mbx_args(&cmd); + } + + return err; +} + +int qlcnic_82xx_get_mac_address(struct qlcnic_adapter *adapter, u8 *mac, + u8 function) +{ + int err, i; + struct qlcnic_cmd_args cmd; + u32 mac_low, mac_high; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_MAC_ADDRESS); + if (err) + return err; + + cmd.req.arg[1] = function | BIT_8; + err = qlcnic_issue_cmd(adapter, &cmd); + + if (err == QLCNIC_RCODE_SUCCESS) { + mac_low = cmd.rsp.arg[1]; + mac_high = cmd.rsp.arg[2]; + + for (i = 0; i < 2; i++) + mac[i] = (u8) (mac_high >> ((1 - i) * 8)); + for (i = 2; i < 6; i++) + mac[i] = (u8) (mac_low >> ((5 - i) * 8)); + } else { + dev_err(&adapter->pdev->dev, + "Failed to get mac address%d\n", err); + err = -EIO; + } + qlcnic_free_mbx_args(&cmd); + return err; +} + +/* Get info of a NIC partition */ +int qlcnic_82xx_get_nic_info(struct qlcnic_adapter *adapter, + struct qlcnic_info *npar_info, u8 func_id) +{ + int err; + dma_addr_t nic_dma_t; + const struct qlcnic_info_le *nic_info; + void *nic_info_addr; + struct qlcnic_cmd_args cmd; + size_t nic_size = sizeof(struct qlcnic_info_le); + + nic_info_addr = dma_zalloc_coherent(&adapter->pdev->dev, nic_size, + &nic_dma_t, GFP_KERNEL); + if (!nic_info_addr) + return -ENOMEM; + + nic_info = nic_info_addr; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_NIC_INFO); + if (err) + goto out_free_dma; + + cmd.req.arg[1] = MSD(nic_dma_t); + cmd.req.arg[2] = LSD(nic_dma_t); + cmd.req.arg[3] = (func_id << 16 | nic_size); + err = qlcnic_issue_cmd(adapter, &cmd); + if (err != QLCNIC_RCODE_SUCCESS) { + dev_err(&adapter->pdev->dev, + "Failed to get nic info%d\n", err); + err = -EIO; + } else { + npar_info->pci_func = le16_to_cpu(nic_info->pci_func); + npar_info->op_mode = le16_to_cpu(nic_info->op_mode); + npar_info->min_tx_bw = le16_to_cpu(nic_info->min_tx_bw); + npar_info->max_tx_bw = le16_to_cpu(nic_info->max_tx_bw); 
+ npar_info->phys_port = le16_to_cpu(nic_info->phys_port); + npar_info->switch_mode = le16_to_cpu(nic_info->switch_mode); + npar_info->max_tx_ques = le16_to_cpu(nic_info->max_tx_ques); + npar_info->max_rx_ques = le16_to_cpu(nic_info->max_rx_ques); + npar_info->capabilities = le32_to_cpu(nic_info->capabilities); + npar_info->max_mtu = le16_to_cpu(nic_info->max_mtu); + } + + qlcnic_free_mbx_args(&cmd); +out_free_dma: + dma_free_coherent(&adapter->pdev->dev, nic_size, nic_info_addr, + nic_dma_t); + + return err; +} + +/* Configure a NIC partition */ +int qlcnic_82xx_set_nic_info(struct qlcnic_adapter *adapter, + struct qlcnic_info *nic) +{ + int err = -EIO; + dma_addr_t nic_dma_t; + void *nic_info_addr; + struct qlcnic_cmd_args cmd; + struct qlcnic_info_le *nic_info; + size_t nic_size = sizeof(struct qlcnic_info_le); + + if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) + return err; + + nic_info_addr = dma_zalloc_coherent(&adapter->pdev->dev, nic_size, + &nic_dma_t, GFP_KERNEL); + if (!nic_info_addr) + return -ENOMEM; + + nic_info = nic_info_addr; + + nic_info->pci_func = cpu_to_le16(nic->pci_func); + nic_info->op_mode = cpu_to_le16(nic->op_mode); + nic_info->phys_port = cpu_to_le16(nic->phys_port); + nic_info->switch_mode = cpu_to_le16(nic->switch_mode); + nic_info->capabilities = cpu_to_le32(nic->capabilities); + nic_info->max_mac_filters = nic->max_mac_filters; + nic_info->max_tx_ques = cpu_to_le16(nic->max_tx_ques); + nic_info->max_rx_ques = cpu_to_le16(nic->max_rx_ques); + nic_info->min_tx_bw = cpu_to_le16(nic->min_tx_bw); + nic_info->max_tx_bw = cpu_to_le16(nic->max_tx_bw); + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_SET_NIC_INFO); + if (err) + goto out_free_dma; + + cmd.req.arg[1] = MSD(nic_dma_t); + cmd.req.arg[2] = LSD(nic_dma_t); + cmd.req.arg[3] = ((nic->pci_func << 16) | nic_size); + err = qlcnic_issue_cmd(adapter, &cmd); + + if (err != QLCNIC_RCODE_SUCCESS) { + dev_err(&adapter->pdev->dev, + "Failed to set nic info%d\n", err); + err = -EIO; + } + + qlcnic_free_mbx_args(&cmd); +out_free_dma: + dma_free_coherent(&adapter->pdev->dev, nic_size, nic_info_addr, + nic_dma_t); + + return err; +} + +/* Get PCI Info of a partition */ +int qlcnic_82xx_get_pci_info(struct qlcnic_adapter *adapter, + struct qlcnic_pci_info *pci_info) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + size_t npar_size = sizeof(struct qlcnic_pci_info_le); + size_t pci_size = npar_size * ahw->max_vnic_func; + u16 nic = 0, fcoe = 0, iscsi = 0; + struct qlcnic_pci_info_le *npar; + struct qlcnic_cmd_args cmd; + dma_addr_t pci_info_dma_t; + void *pci_info_addr; + int err = 0, i; + + pci_info_addr = dma_zalloc_coherent(&adapter->pdev->dev, pci_size, + &pci_info_dma_t, GFP_KERNEL); + if (!pci_info_addr) + return -ENOMEM; + + npar = pci_info_addr; + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_PCI_INFO); + if (err) + goto out_free_dma; + + cmd.req.arg[1] = MSD(pci_info_dma_t); + cmd.req.arg[2] = LSD(pci_info_dma_t); + cmd.req.arg[3] = pci_size; + err = qlcnic_issue_cmd(adapter, &cmd); + + ahw->total_nic_func = 0; + if (err == QLCNIC_RCODE_SUCCESS) { + for (i = 0; i < ahw->max_vnic_func; i++, npar++, pci_info++) { + pci_info->id = le16_to_cpu(npar->id); + pci_info->active = le16_to_cpu(npar->active); + if (!pci_info->active) + continue; + pci_info->type = le16_to_cpu(npar->type); + err = qlcnic_get_pci_func_type(adapter, pci_info->type, + &nic, &fcoe, &iscsi); + pci_info->default_port = + le16_to_cpu(npar->default_port); + pci_info->tx_min_bw = + le16_to_cpu(npar->tx_min_bw); + 
pci_info->tx_max_bw = + le16_to_cpu(npar->tx_max_bw); + memcpy(pci_info->mac, npar->mac, ETH_ALEN); + } + } else { + dev_err(&adapter->pdev->dev, + "Failed to get PCI Info%d\n", err); + err = -EIO; + } + + ahw->total_nic_func = nic; + ahw->total_pci_func = nic + fcoe + iscsi; + if (ahw->total_nic_func == 0 || ahw->total_pci_func == 0) { + dev_err(&adapter->pdev->dev, + "%s: Invalid function count: total nic func[%x], total pci func[%x]\n", + __func__, ahw->total_nic_func, ahw->total_pci_func); + err = -EIO; + } + qlcnic_free_mbx_args(&cmd); +out_free_dma: + dma_free_coherent(&adapter->pdev->dev, pci_size, pci_info_addr, + pci_info_dma_t); + + return err; +} + +/* Configure eSwitch for port mirroring */ +int qlcnic_config_port_mirroring(struct qlcnic_adapter *adapter, u8 id, + u8 enable_mirroring, u8 pci_func) +{ + struct device *dev = &adapter->pdev->dev; + struct qlcnic_cmd_args cmd; + int err = -EIO; + u32 arg1; + + if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC || + !(adapter->eswitch[id].flags & QLCNIC_SWITCH_ENABLE)) { + dev_err(&adapter->pdev->dev, "%s: Not a management function\n", + __func__); + return err; + } + + arg1 = id | (enable_mirroring ? BIT_4 : 0); + arg1 |= pci_func << 8; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_SET_PORTMIRRORING); + if (err) + return err; + + cmd.req.arg[1] = arg1; + err = qlcnic_issue_cmd(adapter, &cmd); + + if (err != QLCNIC_RCODE_SUCCESS) + dev_err(dev, "Failed to configure port mirroring for vNIC function %d on eSwitch %d\n", + pci_func, id); + else + dev_info(dev, "Configured port mirroring for vNIC function %d on eSwitch %d\n", + pci_func, id); + qlcnic_free_mbx_args(&cmd); + + return err; +} + +int qlcnic_get_port_stats(struct qlcnic_adapter *adapter, const u8 func, + const u8 rx_tx, struct __qlcnic_esw_statistics *esw_stats) { + + size_t stats_size = sizeof(struct qlcnic_esw_stats_le); + struct qlcnic_esw_stats_le *stats; + dma_addr_t stats_dma_t; + void *stats_addr; + u32 arg1; + struct qlcnic_cmd_args cmd; + int err; + + if (esw_stats == NULL) + return -ENOMEM; + + if ((adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) && + (func != adapter->ahw->pci_func)) { + dev_err(&adapter->pdev->dev, + "Not privilege to query stats for func=%d", func); + return -EIO; + } + + stats_addr = dma_zalloc_coherent(&adapter->pdev->dev, stats_size, + &stats_dma_t, GFP_KERNEL); + if (!stats_addr) + return -ENOMEM; + + arg1 = func | QLCNIC_STATS_VERSION << 8 | QLCNIC_STATS_PORT << 12; + arg1 |= rx_tx << 15 | stats_size << 16; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_GET_ESWITCH_STATS); + if (err) + goto out_free_dma; + + cmd.req.arg[1] = arg1; + cmd.req.arg[2] = MSD(stats_dma_t); + cmd.req.arg[3] = LSD(stats_dma_t); + err = qlcnic_issue_cmd(adapter, &cmd); + + if (!err) { + stats = stats_addr; + esw_stats->context_id = le16_to_cpu(stats->context_id); + esw_stats->version = le16_to_cpu(stats->version); + esw_stats->size = le16_to_cpu(stats->size); + esw_stats->multicast_frames = + le64_to_cpu(stats->multicast_frames); + esw_stats->broadcast_frames = + le64_to_cpu(stats->broadcast_frames); + esw_stats->unicast_frames = le64_to_cpu(stats->unicast_frames); + esw_stats->dropped_frames = le64_to_cpu(stats->dropped_frames); + esw_stats->local_frames = le64_to_cpu(stats->local_frames); + esw_stats->errors = le64_to_cpu(stats->errors); + esw_stats->numbytes = le64_to_cpu(stats->numbytes); + } + + qlcnic_free_mbx_args(&cmd); +out_free_dma: + dma_free_coherent(&adapter->pdev->dev, stats_size, stats_addr, + stats_dma_t); + + return err; +} + +/* 
This routine will retrieve the MAC statistics from firmware */ +int qlcnic_get_mac_stats(struct qlcnic_adapter *adapter, + struct qlcnic_mac_statistics *mac_stats) +{ + struct qlcnic_mac_statistics_le *stats; + struct qlcnic_cmd_args cmd; + size_t stats_size = sizeof(struct qlcnic_mac_statistics_le); + dma_addr_t stats_dma_t; + void *stats_addr; + int err; + + if (mac_stats == NULL) + return -ENOMEM; + + stats_addr = dma_zalloc_coherent(&adapter->pdev->dev, stats_size, + &stats_dma_t, GFP_KERNEL); + if (!stats_addr) + return -ENOMEM; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_MAC_STATS); + if (err) + goto out_free_dma; + + cmd.req.arg[1] = stats_size << 16; + cmd.req.arg[2] = MSD(stats_dma_t); + cmd.req.arg[3] = LSD(stats_dma_t); + err = qlcnic_issue_cmd(adapter, &cmd); + if (!err) { + stats = stats_addr; + mac_stats->mac_tx_frames = le64_to_cpu(stats->mac_tx_frames); + mac_stats->mac_tx_bytes = le64_to_cpu(stats->mac_tx_bytes); + mac_stats->mac_tx_mcast_pkts = + le64_to_cpu(stats->mac_tx_mcast_pkts); + mac_stats->mac_tx_bcast_pkts = + le64_to_cpu(stats->mac_tx_bcast_pkts); + mac_stats->mac_rx_frames = le64_to_cpu(stats->mac_rx_frames); + mac_stats->mac_rx_bytes = le64_to_cpu(stats->mac_rx_bytes); + mac_stats->mac_rx_mcast_pkts = + le64_to_cpu(stats->mac_rx_mcast_pkts); + mac_stats->mac_rx_length_error = + le64_to_cpu(stats->mac_rx_length_error); + mac_stats->mac_rx_length_small = + le64_to_cpu(stats->mac_rx_length_small); + mac_stats->mac_rx_length_large = + le64_to_cpu(stats->mac_rx_length_large); + mac_stats->mac_rx_jabber = le64_to_cpu(stats->mac_rx_jabber); + mac_stats->mac_rx_dropped = le64_to_cpu(stats->mac_rx_dropped); + mac_stats->mac_rx_crc_error = le64_to_cpu(stats->mac_rx_crc_error); + } else { + dev_err(&adapter->pdev->dev, + "%s: Get mac stats failed, err=%d.\n", __func__, err); + } + + qlcnic_free_mbx_args(&cmd); + +out_free_dma: + dma_free_coherent(&adapter->pdev->dev, stats_size, stats_addr, + stats_dma_t); + + return err; +} + +int qlcnic_get_eswitch_stats(struct qlcnic_adapter *adapter, const u8 eswitch, + const u8 rx_tx, struct __qlcnic_esw_statistics *esw_stats) { + + struct __qlcnic_esw_statistics port_stats; + u8 i; + int ret = -EIO; + + if (esw_stats == NULL) + return -ENOMEM; + if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) + return -EIO; + if (adapter->npars == NULL) + return -EIO; + + memset(esw_stats, 0, sizeof(u64)); + esw_stats->unicast_frames = QLCNIC_STATS_NOT_AVAIL; + esw_stats->multicast_frames = QLCNIC_STATS_NOT_AVAIL; + esw_stats->broadcast_frames = QLCNIC_STATS_NOT_AVAIL; + esw_stats->dropped_frames = QLCNIC_STATS_NOT_AVAIL; + esw_stats->errors = QLCNIC_STATS_NOT_AVAIL; + esw_stats->local_frames = QLCNIC_STATS_NOT_AVAIL; + esw_stats->numbytes = QLCNIC_STATS_NOT_AVAIL; + esw_stats->context_id = eswitch; + + for (i = 0; i < adapter->ahw->total_nic_func; i++) { + if (adapter->npars[i].phy_port != eswitch) + continue; + + memset(&port_stats, 0, sizeof(struct __qlcnic_esw_statistics)); + if (qlcnic_get_port_stats(adapter, adapter->npars[i].pci_func, + rx_tx, &port_stats)) + continue; + + esw_stats->size = port_stats.size; + esw_stats->version = port_stats.version; + QLCNIC_ADD_ESW_STATS(esw_stats->unicast_frames, + port_stats.unicast_frames); + QLCNIC_ADD_ESW_STATS(esw_stats->multicast_frames, + port_stats.multicast_frames); + QLCNIC_ADD_ESW_STATS(esw_stats->broadcast_frames, + port_stats.broadcast_frames); + QLCNIC_ADD_ESW_STATS(esw_stats->dropped_frames, + port_stats.dropped_frames); + QLCNIC_ADD_ESW_STATS(esw_stats->errors, + 
port_stats.errors); + QLCNIC_ADD_ESW_STATS(esw_stats->local_frames, + port_stats.local_frames); + QLCNIC_ADD_ESW_STATS(esw_stats->numbytes, + port_stats.numbytes); + ret = 0; + } + return ret; +} + +int qlcnic_clear_esw_stats(struct qlcnic_adapter *adapter, const u8 func_esw, + const u8 port, const u8 rx_tx) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_cmd_args cmd; + int err; + u32 arg1; + + if (ahw->op_mode != QLCNIC_MGMT_FUNC) + return -EIO; + + if (func_esw == QLCNIC_STATS_PORT) { + if (port >= ahw->max_vnic_func) + goto err_ret; + } else if (func_esw == QLCNIC_STATS_ESWITCH) { + if (port >= QLCNIC_NIU_MAX_XG_PORTS) + goto err_ret; + } else { + goto err_ret; + } + + if (rx_tx > QLCNIC_QUERY_TX_COUNTER) + goto err_ret; + + arg1 = port | QLCNIC_STATS_VERSION << 8 | func_esw << 12; + arg1 |= BIT_14 | rx_tx << 15; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_GET_ESWITCH_STATS); + if (err) + return err; + + cmd.req.arg[1] = arg1; + err = qlcnic_issue_cmd(adapter, &cmd); + qlcnic_free_mbx_args(&cmd); + return err; + +err_ret: + dev_err(&adapter->pdev->dev, + "Invalid args func_esw %d port %d rx_ctx %d\n", + func_esw, port, rx_tx); + return -EIO; +} + +static int __qlcnic_get_eswitch_port_config(struct qlcnic_adapter *adapter, + u32 *arg1, u32 *arg2) +{ + struct device *dev = &adapter->pdev->dev; + struct qlcnic_cmd_args cmd; + u8 pci_func = *arg1 >> 8; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_GET_ESWITCH_PORT_CONFIG); + if (err) + return err; + + cmd.req.arg[1] = *arg1; + err = qlcnic_issue_cmd(adapter, &cmd); + *arg1 = cmd.rsp.arg[1]; + *arg2 = cmd.rsp.arg[2]; + qlcnic_free_mbx_args(&cmd); + + if (err == QLCNIC_RCODE_SUCCESS) + dev_info(dev, "Get eSwitch port config for vNIC function %d\n", + pci_func); + else + dev_err(dev, "Failed to get eswitch port config for vNIC function %d\n", + pci_func); + return err; +} +/* Configure eSwitch port +op_mode = 0 for setting default port behavior +op_mode = 1 for setting vlan id +op_mode = 2 for deleting vlan id +op_type = 0 for vlan_id +op_type = 1 for port vlan_id +*/ +int qlcnic_config_switch_port(struct qlcnic_adapter *adapter, + struct qlcnic_esw_func_cfg *esw_cfg) +{ + struct device *dev = &adapter->pdev->dev; + struct qlcnic_cmd_args cmd; + int err = -EIO, index; + u32 arg1, arg2 = 0; + u8 pci_func; + + if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) { + dev_err(&adapter->pdev->dev, "%s: Not a management function\n", + __func__); + return err; + } + + pci_func = esw_cfg->pci_func; + index = qlcnic_is_valid_nic_func(adapter, pci_func); + if (index < 0) + return err; + arg1 = (adapter->npars[index].phy_port & BIT_0); + arg1 |= (pci_func << 8); + + if (__qlcnic_get_eswitch_port_config(adapter, &arg1, &arg2)) + return err; + arg1 &= ~(0x0ff << 8); + arg1 |= (pci_func << 8); + arg1 &= ~(BIT_2 | BIT_3); + switch (esw_cfg->op_mode) { + case QLCNIC_PORT_DEFAULTS: + arg1 |= (BIT_4 | BIT_6 | BIT_7); + arg2 |= (BIT_0 | BIT_1); + if (adapter->ahw->capabilities & QLCNIC_FW_CAPABILITY_TSO) + arg2 |= (BIT_2 | BIT_3); + if (!(esw_cfg->discard_tagged)) + arg1 &= ~BIT_4; + if (!(esw_cfg->promisc_mode)) + arg1 &= ~BIT_6; + if (!(esw_cfg->mac_override)) + arg1 &= ~BIT_7; + if (!(esw_cfg->mac_anti_spoof)) + arg2 &= ~BIT_0; + if (!(esw_cfg->offload_flags & BIT_0)) + arg2 &= ~(BIT_1 | BIT_2 | BIT_3); + if (!(esw_cfg->offload_flags & BIT_1)) + arg2 &= ~BIT_2; + if (!(esw_cfg->offload_flags & BIT_2)) + arg2 &= ~BIT_3; + break; + case QLCNIC_ADD_VLAN: + arg1 &= ~(0x0ffff << 16); + arg1 |= (BIT_2 | 
BIT_5); + arg1 |= (esw_cfg->vlan_id << 16); + break; + case QLCNIC_DEL_VLAN: + arg1 |= (BIT_3 | BIT_5); + arg1 &= ~(0x0ffff << 16); + break; + default: + dev_err(&adapter->pdev->dev, "%s: Invalid opmode 0x%x\n", + __func__, esw_cfg->op_mode); + return err; + } + + err = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_CONFIGURE_ESWITCH); + if (err) + return err; + + cmd.req.arg[1] = arg1; + cmd.req.arg[2] = arg2; + err = qlcnic_issue_cmd(adapter, &cmd); + qlcnic_free_mbx_args(&cmd); + + if (err != QLCNIC_RCODE_SUCCESS) + dev_err(dev, "Failed to configure eswitch for vNIC function %d\n", + pci_func); + else + dev_info(dev, "Configured eSwitch for vNIC function %d\n", + pci_func); + + return err; +} + +int +qlcnic_get_eswitch_port_config(struct qlcnic_adapter *adapter, + struct qlcnic_esw_func_cfg *esw_cfg) +{ + u32 arg1, arg2; + int index; + u8 phy_port; + + if (adapter->ahw->op_mode == QLCNIC_MGMT_FUNC) { + index = qlcnic_is_valid_nic_func(adapter, esw_cfg->pci_func); + if (index < 0) + return -EIO; + phy_port = adapter->npars[index].phy_port; + } else { + phy_port = adapter->ahw->physical_port; + } + arg1 = phy_port; + arg1 |= (esw_cfg->pci_func << 8); + if (__qlcnic_get_eswitch_port_config(adapter, &arg1, &arg2)) + return -EIO; + + esw_cfg->discard_tagged = !!(arg1 & BIT_4); + esw_cfg->host_vlan_tag = !!(arg1 & BIT_5); + esw_cfg->promisc_mode = !!(arg1 & BIT_6); + esw_cfg->mac_override = !!(arg1 & BIT_7); + esw_cfg->vlan_id = LSW(arg1 >> 16); + esw_cfg->mac_anti_spoof = (arg2 & 0x1); + esw_cfg->offload_flags = ((arg2 >> 1) & 0x7); + + return 0; +} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c new file mode 100644 index 0000000..4b76c69 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c @@ -0,0 +1,1147 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. 
+ */ + +#include +#include "qlcnic.h" + +#define QLC_DCB_NUM_PARAM 3 +#define QLC_DCB_LOCAL_IDX 0 +#define QLC_DCB_OPER_IDX 1 +#define QLC_DCB_PEER_IDX 2 + +#define QLC_DCB_GET_MAP(V) (1 << V) + +#define QLC_DCB_FW_VER 0x2 +#define QLC_DCB_MAX_TC 0x8 +#define QLC_DCB_MAX_APP 0x8 +#define QLC_DCB_MAX_PRIO QLC_DCB_MAX_TC +#define QLC_DCB_MAX_PG QLC_DCB_MAX_TC + +#define QLC_DCB_TSA_SUPPORT(V) (V & 0x1) +#define QLC_DCB_ETS_SUPPORT(V) ((V >> 1) & 0x1) +#define QLC_DCB_VERSION_SUPPORT(V) ((V >> 2) & 0xf) +#define QLC_DCB_MAX_NUM_TC(V) ((V >> 20) & 0xf) +#define QLC_DCB_MAX_NUM_ETS_TC(V) ((V >> 24) & 0xf) +#define QLC_DCB_MAX_NUM_PFC_TC(V) ((V >> 28) & 0xf) +#define QLC_DCB_GET_TC_PRIO(X, P) ((X >> (P * 3)) & 0x7) +#define QLC_DCB_GET_PGID_PRIO(X, P) ((X >> (P * 8)) & 0xff) +#define QLC_DCB_GET_BWPER_PG(X, P) ((X >> (P * 8)) & 0xff) +#define QLC_DCB_GET_TSA_PG(X, P) ((X >> (P * 8)) & 0xff) +#define QLC_DCB_GET_PFC_PRIO(X, P) (((X >> 24) >> P) & 0x1) +#define QLC_DCB_GET_PROTO_ID_APP(X) ((X >> 8) & 0xffff) +#define QLC_DCB_GET_SELECTOR_APP(X) (X & 0xff) + +#define QLC_DCB_LOCAL_PARAM_FWID 0x3 +#define QLC_DCB_OPER_PARAM_FWID 0x1 +#define QLC_DCB_PEER_PARAM_FWID 0x2 + +#define QLC_83XX_DCB_GET_NUMAPP(X) ((X >> 2) & 0xf) +#define QLC_83XX_DCB_TSA_VALID(X) (X & 0x1) +#define QLC_83XX_DCB_PFC_VALID(X) ((X >> 1) & 0x1) +#define QLC_83XX_DCB_GET_PRIOMAP_APP(X) (X >> 24) + +#define QLC_82XX_DCB_GET_NUMAPP(X) ((X >> 12) & 0xf) +#define QLC_82XX_DCB_TSA_VALID(X) ((X >> 4) & 0x1) +#define QLC_82XX_DCB_PFC_VALID(X) ((X >> 5) & 0x1) +#define QLC_82XX_DCB_GET_PRIOVAL_APP(X) ((X >> 24) & 0x7) +#define QLC_82XX_DCB_GET_PRIOMAP_APP(X) (1 << X) +#define QLC_82XX_DCB_PRIO_TC_MAP (0x76543210) + +static const struct dcbnl_rtnl_ops qlcnic_dcbnl_ops; + +static void qlcnic_dcb_aen_work(struct work_struct *); +static void qlcnic_dcb_data_cee_param_map(struct qlcnic_adapter *); + +static inline void __qlcnic_init_dcbnl_ops(struct qlcnic_dcb *); +static void __qlcnic_dcb_free(struct qlcnic_dcb *); +static int __qlcnic_dcb_attach(struct qlcnic_dcb *); +static int __qlcnic_dcb_query_hw_capability(struct qlcnic_dcb *, char *); +static void __qlcnic_dcb_get_info(struct qlcnic_dcb *); + +static int qlcnic_82xx_dcb_get_hw_capability(struct qlcnic_dcb *); +static int qlcnic_82xx_dcb_query_cee_param(struct qlcnic_dcb *, char *, u8); +static int qlcnic_82xx_dcb_get_cee_cfg(struct qlcnic_dcb *); +static void qlcnic_82xx_dcb_aen_handler(struct qlcnic_dcb *, void *); + +static int qlcnic_83xx_dcb_get_hw_capability(struct qlcnic_dcb *); +static int qlcnic_83xx_dcb_query_cee_param(struct qlcnic_dcb *, char *, u8); +static int qlcnic_83xx_dcb_get_cee_cfg(struct qlcnic_dcb *); +static void qlcnic_83xx_dcb_aen_handler(struct qlcnic_dcb *, void *); + +struct qlcnic_dcb_capability { + bool tsa_capability; + bool ets_capability; + u8 max_num_tc; + u8 max_ets_tc; + u8 max_pfc_tc; + u8 dcb_capability; +}; + +struct qlcnic_dcb_param { + u32 hdr_prio_pfc_map[2]; + u32 prio_pg_map[2]; + u32 pg_bw_map[2]; + u32 pg_tsa_map[2]; + u32 app[QLC_DCB_MAX_APP]; +}; + +struct qlcnic_dcb_mbx_params { + /* 1st local, 2nd operational 3rd remote */ + struct qlcnic_dcb_param type[3]; + u32 prio_tc_map; +}; + +struct qlcnic_82xx_dcb_param_mbx_le { + __le32 hdr_prio_pfc_map[2]; + __le32 prio_pg_map[2]; + __le32 pg_bw_map[2]; + __le32 pg_tsa_map[2]; + __le32 app[QLC_DCB_MAX_APP]; +}; + +enum qlcnic_dcb_selector { + QLC_SELECTOR_DEF = 0x0, + QLC_SELECTOR_ETHER, + QLC_SELECTOR_TCP, + QLC_SELECTOR_UDP, +}; + +enum qlcnic_dcb_prio_type { + QLC_PRIO_NONE = 0, + 
QLC_PRIO_GROUP, + QLC_PRIO_LINK, +}; + +enum qlcnic_dcb_pfc_type { + QLC_PFC_DISABLED = 0, + QLC_PFC_FULL, + QLC_PFC_TX, + QLC_PFC_RX +}; + +struct qlcnic_dcb_prio_cfg { + bool valid; + enum qlcnic_dcb_pfc_type pfc_type; +}; + +struct qlcnic_dcb_pg_cfg { + bool valid; + u8 total_bw_percent; /* of Link/ port BW */ + u8 prio_count; + u8 tsa_type; +}; + +struct qlcnic_dcb_tc_cfg { + bool valid; + struct qlcnic_dcb_prio_cfg prio_cfg[QLC_DCB_MAX_PRIO]; + enum qlcnic_dcb_prio_type prio_type; /* always prio_link */ + u8 link_percent; /* % of link bandwidth */ + u8 bwg_percent; /* % of BWG's bandwidth */ + u8 up_tc_map; + u8 pgid; +}; + +struct qlcnic_dcb_app { + bool valid; + enum qlcnic_dcb_selector selector; + u16 protocol; + u8 priority; +}; + +struct qlcnic_dcb_cee { + struct qlcnic_dcb_tc_cfg tc_cfg[QLC_DCB_MAX_TC]; + struct qlcnic_dcb_pg_cfg pg_cfg[QLC_DCB_MAX_PG]; + struct qlcnic_dcb_app app[QLC_DCB_MAX_APP]; + bool tc_param_valid; + bool pfc_mode_enable; +}; + +struct qlcnic_dcb_cfg { + /* 0 - local, 1 - operational, 2 - remote */ + struct qlcnic_dcb_cee type[QLC_DCB_NUM_PARAM]; + struct qlcnic_dcb_capability capability; + u32 version; +}; + +static const struct qlcnic_dcb_ops qlcnic_83xx_dcb_ops = { + .init_dcbnl_ops = __qlcnic_init_dcbnl_ops, + .free = __qlcnic_dcb_free, + .attach = __qlcnic_dcb_attach, + .query_hw_capability = __qlcnic_dcb_query_hw_capability, + .get_info = __qlcnic_dcb_get_info, + + .get_hw_capability = qlcnic_83xx_dcb_get_hw_capability, + .query_cee_param = qlcnic_83xx_dcb_query_cee_param, + .get_cee_cfg = qlcnic_83xx_dcb_get_cee_cfg, + .aen_handler = qlcnic_83xx_dcb_aen_handler, +}; + +static const struct qlcnic_dcb_ops qlcnic_82xx_dcb_ops = { + .init_dcbnl_ops = __qlcnic_init_dcbnl_ops, + .free = __qlcnic_dcb_free, + .attach = __qlcnic_dcb_attach, + .query_hw_capability = __qlcnic_dcb_query_hw_capability, + .get_info = __qlcnic_dcb_get_info, + + .get_hw_capability = qlcnic_82xx_dcb_get_hw_capability, + .query_cee_param = qlcnic_82xx_dcb_query_cee_param, + .get_cee_cfg = qlcnic_82xx_dcb_get_cee_cfg, + .aen_handler = qlcnic_82xx_dcb_aen_handler, +}; + +static u8 qlcnic_dcb_get_num_app(struct qlcnic_adapter *adapter, u32 val) +{ + if (qlcnic_82xx_check(adapter)) + return QLC_82XX_DCB_GET_NUMAPP(val); + else + return QLC_83XX_DCB_GET_NUMAPP(val); +} + +static inline u8 qlcnic_dcb_pfc_hdr_valid(struct qlcnic_adapter *adapter, + u32 val) +{ + if (qlcnic_82xx_check(adapter)) + return QLC_82XX_DCB_PFC_VALID(val); + else + return QLC_83XX_DCB_PFC_VALID(val); +} + +static inline u8 qlcnic_dcb_tsa_hdr_valid(struct qlcnic_adapter *adapter, + u32 val) +{ + if (qlcnic_82xx_check(adapter)) + return QLC_82XX_DCB_TSA_VALID(val); + else + return QLC_83XX_DCB_TSA_VALID(val); +} + +static inline u8 qlcnic_dcb_get_prio_map_app(struct qlcnic_adapter *adapter, + u32 val) +{ + if (qlcnic_82xx_check(adapter)) + return QLC_82XX_DCB_GET_PRIOMAP_APP(val); + else + return QLC_83XX_DCB_GET_PRIOMAP_APP(val); +} + +static int qlcnic_dcb_prio_count(u8 up_tc_map) +{ + int j; + + for (j = 0; j < QLC_DCB_MAX_TC; j++) + if (up_tc_map & QLC_DCB_GET_MAP(j)) + break; + + return j; +} + +static inline void __qlcnic_init_dcbnl_ops(struct qlcnic_dcb *dcb) +{ + if (test_bit(QLCNIC_DCB_STATE, &dcb->state)) + dcb->adapter->netdev->dcbnl_ops = &qlcnic_dcbnl_ops; +} + +static void qlcnic_set_dcb_ops(struct qlcnic_adapter *adapter) +{ + if (qlcnic_82xx_check(adapter)) + adapter->dcb->ops = &qlcnic_82xx_dcb_ops; + else if (qlcnic_83xx_check(adapter)) + adapter->dcb->ops = &qlcnic_83xx_dcb_ops; +} + +int 
qlcnic_register_dcb(struct qlcnic_adapter *adapter) +{ + struct qlcnic_dcb *dcb; + + if (qlcnic_sriov_vf_check(adapter)) + return 0; + + dcb = kzalloc(sizeof(struct qlcnic_dcb), GFP_ATOMIC); + if (!dcb) + return -ENOMEM; + + adapter->dcb = dcb; + dcb->adapter = adapter; + qlcnic_set_dcb_ops(adapter); + dcb->state = 0; + + return 0; +} + +static void __qlcnic_dcb_free(struct qlcnic_dcb *dcb) +{ + struct qlcnic_adapter *adapter; + + if (!dcb) + return; + + adapter = dcb->adapter; + + while (test_bit(QLCNIC_DCB_AEN_MODE, &dcb->state)) + usleep_range(10000, 11000); + + cancel_delayed_work_sync(&dcb->aen_work); + + if (dcb->wq) { + destroy_workqueue(dcb->wq); + dcb->wq = NULL; + } + + kfree(dcb->cfg); + dcb->cfg = NULL; + kfree(dcb->param); + dcb->param = NULL; + kfree(dcb); + adapter->dcb = NULL; +} + +static void __qlcnic_dcb_get_info(struct qlcnic_dcb *dcb) +{ + qlcnic_dcb_get_hw_capability(dcb); + qlcnic_dcb_get_cee_cfg(dcb); +} + +static int __qlcnic_dcb_attach(struct qlcnic_dcb *dcb) +{ + int err = 0; + + INIT_DELAYED_WORK(&dcb->aen_work, qlcnic_dcb_aen_work); + + dcb->wq = create_singlethread_workqueue("qlcnic-dcb"); + if (!dcb->wq) { + dev_err(&dcb->adapter->pdev->dev, + "DCB workqueue allocation failed. DCB will be disabled\n"); + return -1; + } + + dcb->cfg = kzalloc(sizeof(struct qlcnic_dcb_cfg), GFP_ATOMIC); + if (!dcb->cfg) { + err = -ENOMEM; + goto out_free_wq; + } + + dcb->param = kzalloc(sizeof(struct qlcnic_dcb_mbx_params), GFP_ATOMIC); + if (!dcb->param) { + err = -ENOMEM; + goto out_free_cfg; + } + + return 0; +out_free_cfg: + kfree(dcb->cfg); + dcb->cfg = NULL; + +out_free_wq: + destroy_workqueue(dcb->wq); + dcb->wq = NULL; + + return err; +} + +static int __qlcnic_dcb_query_hw_capability(struct qlcnic_dcb *dcb, char *buf) +{ + struct qlcnic_adapter *adapter = dcb->adapter; + struct qlcnic_cmd_args cmd; + u32 mbx_out; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DCB_QUERY_CAP); + if (err) + return err; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to query DCBX capability, err %d\n", err); + } else { + mbx_out = cmd.rsp.arg[1]; + if (buf) + memcpy(buf, &mbx_out, sizeof(u32)); + } + + qlcnic_free_mbx_args(&cmd); + + return err; +} + +static int __qlcnic_dcb_get_capability(struct qlcnic_dcb *dcb, u32 *val) +{ + struct qlcnic_dcb_capability *cap = &dcb->cfg->capability; + u32 mbx_out; + int err; + + memset(cap, 0, sizeof(struct qlcnic_dcb_capability)); + + err = qlcnic_dcb_query_hw_capability(dcb, (char *)val); + if (err) + return err; + + mbx_out = *val; + if (QLC_DCB_TSA_SUPPORT(mbx_out)) + cap->tsa_capability = true; + + if (QLC_DCB_ETS_SUPPORT(mbx_out)) + cap->ets_capability = true; + + cap->max_num_tc = QLC_DCB_MAX_NUM_TC(mbx_out); + cap->max_ets_tc = QLC_DCB_MAX_NUM_ETS_TC(mbx_out); + cap->max_pfc_tc = QLC_DCB_MAX_NUM_PFC_TC(mbx_out); + + if (cap->max_num_tc > QLC_DCB_MAX_TC || + cap->max_ets_tc > cap->max_num_tc || + cap->max_pfc_tc > cap->max_num_tc) { + dev_err(&dcb->adapter->pdev->dev, "Invalid DCB configuration\n"); + return -EINVAL; + } + + return err; +} + +static int qlcnic_82xx_dcb_get_hw_capability(struct qlcnic_dcb *dcb) +{ + struct qlcnic_dcb_cfg *cfg = dcb->cfg; + struct qlcnic_dcb_capability *cap; + u32 mbx_out; + int err; + + err = __qlcnic_dcb_get_capability(dcb, &mbx_out); + if (err) + return err; + + cap = &cfg->capability; + cap->dcb_capability = DCB_CAP_DCBX_VER_CEE | DCB_CAP_DCBX_LLD_MANAGED; + + if (cap->dcb_capability && cap->tsa_capability && cap->ets_capability) + 
set_bit(QLCNIC_DCB_STATE, &dcb->state); + + return err; +} + +static int qlcnic_82xx_dcb_query_cee_param(struct qlcnic_dcb *dcb, + char *buf, u8 type) +{ + u16 size = sizeof(struct qlcnic_82xx_dcb_param_mbx_le); + struct qlcnic_adapter *adapter = dcb->adapter; + struct qlcnic_82xx_dcb_param_mbx_le *prsp_le; + struct device *dev = &adapter->pdev->dev; + dma_addr_t cardrsp_phys_addr; + struct qlcnic_dcb_param rsp; + struct qlcnic_cmd_args cmd; + u64 phys_addr; + void *addr; + int err, i; + + switch (type) { + case QLC_DCB_LOCAL_PARAM_FWID: + case QLC_DCB_OPER_PARAM_FWID: + case QLC_DCB_PEER_PARAM_FWID: + break; + default: + dev_err(dev, "Invalid parameter type %d\n", type); + return -EINVAL; + } + + addr = dma_alloc_coherent(dev, size, &cardrsp_phys_addr, GFP_KERNEL); + if (addr == NULL) + return -ENOMEM; + + prsp_le = addr; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DCB_QUERY_PARAM); + if (err) + goto out_free_rsp; + + phys_addr = cardrsp_phys_addr; + cmd.req.arg[1] = size | (type << 16); + cmd.req.arg[2] = MSD(phys_addr); + cmd.req.arg[3] = LSD(phys_addr); + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_err(dev, "Failed to query DCBX parameter, err %d\n", err); + goto out; + } + + memset(&rsp, 0, sizeof(struct qlcnic_dcb_param)); + rsp.hdr_prio_pfc_map[0] = le32_to_cpu(prsp_le->hdr_prio_pfc_map[0]); + rsp.hdr_prio_pfc_map[1] = le32_to_cpu(prsp_le->hdr_prio_pfc_map[1]); + rsp.prio_pg_map[0] = le32_to_cpu(prsp_le->prio_pg_map[0]); + rsp.prio_pg_map[1] = le32_to_cpu(prsp_le->prio_pg_map[1]); + rsp.pg_bw_map[0] = le32_to_cpu(prsp_le->pg_bw_map[0]); + rsp.pg_bw_map[1] = le32_to_cpu(prsp_le->pg_bw_map[1]); + rsp.pg_tsa_map[0] = le32_to_cpu(prsp_le->pg_tsa_map[0]); + rsp.pg_tsa_map[1] = le32_to_cpu(prsp_le->pg_tsa_map[1]); + + for (i = 0; i < QLC_DCB_MAX_APP; i++) + rsp.app[i] = le32_to_cpu(prsp_le->app[i]); + + if (buf) + memcpy(buf, &rsp, size); +out: + qlcnic_free_mbx_args(&cmd); + +out_free_rsp: + dma_free_coherent(dev, size, addr, cardrsp_phys_addr); + + return err; +} + +static int qlcnic_82xx_dcb_get_cee_cfg(struct qlcnic_dcb *dcb) +{ + struct qlcnic_dcb_mbx_params *mbx; + int err; + + mbx = dcb->param; + if (!mbx) + return 0; + + err = qlcnic_dcb_query_cee_param(dcb, (char *)&mbx->type[0], + QLC_DCB_LOCAL_PARAM_FWID); + if (err) + return err; + + err = qlcnic_dcb_query_cee_param(dcb, (char *)&mbx->type[1], + QLC_DCB_OPER_PARAM_FWID); + if (err) + return err; + + err = qlcnic_dcb_query_cee_param(dcb, (char *)&mbx->type[2], + QLC_DCB_PEER_PARAM_FWID); + if (err) + return err; + + mbx->prio_tc_map = QLC_82XX_DCB_PRIO_TC_MAP; + + qlcnic_dcb_data_cee_param_map(dcb->adapter); + + return err; +} + +static void qlcnic_dcb_aen_work(struct work_struct *work) +{ + struct qlcnic_dcb *dcb; + + dcb = container_of(work, struct qlcnic_dcb, aen_work.work); + + qlcnic_dcb_get_cee_cfg(dcb); + clear_bit(QLCNIC_DCB_AEN_MODE, &dcb->state); +} + +static void qlcnic_82xx_dcb_aen_handler(struct qlcnic_dcb *dcb, void *data) +{ + if (test_and_set_bit(QLCNIC_DCB_AEN_MODE, &dcb->state)) + return; + + queue_delayed_work(dcb->wq, &dcb->aen_work, 0); +} + +static int qlcnic_83xx_dcb_get_hw_capability(struct qlcnic_dcb *dcb) +{ + struct qlcnic_dcb_capability *cap = &dcb->cfg->capability; + u32 mbx_out; + int err; + + err = __qlcnic_dcb_get_capability(dcb, &mbx_out); + if (err) + return err; + + if (mbx_out & BIT_2) + cap->dcb_capability = DCB_CAP_DCBX_VER_CEE; + if (mbx_out & BIT_3) + cap->dcb_capability |= DCB_CAP_DCBX_VER_IEEE; + if (cap->dcb_capability) + cap->dcb_capability |= 
DCB_CAP_DCBX_LLD_MANAGED; + + if (cap->dcb_capability && cap->tsa_capability && cap->ets_capability) + set_bit(QLCNIC_DCB_STATE, &dcb->state); + + return err; +} + +static int qlcnic_83xx_dcb_query_cee_param(struct qlcnic_dcb *dcb, + char *buf, u8 idx) +{ + struct qlcnic_adapter *adapter = dcb->adapter; + struct qlcnic_dcb_mbx_params mbx_out; + int err, i, j, k, max_app, size; + struct qlcnic_dcb_param *each; + struct qlcnic_cmd_args cmd; + u32 val; + char *p; + + size = 0; + memset(&mbx_out, 0, sizeof(struct qlcnic_dcb_mbx_params)); + memset(buf, 0, sizeof(struct qlcnic_dcb_mbx_params)); + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DCB_QUERY_PARAM); + if (err) + return err; + + cmd.req.arg[0] |= QLC_DCB_FW_VER << 29; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to query DCBX param, err %d\n", err); + goto out; + } + + mbx_out.prio_tc_map = cmd.rsp.arg[1]; + p = memcpy(buf, &mbx_out, sizeof(u32)); + k = 2; + p += sizeof(u32); + + for (j = 0; j < QLC_DCB_NUM_PARAM; j++) { + each = &mbx_out.type[j]; + + each->hdr_prio_pfc_map[0] = cmd.rsp.arg[k++]; + each->hdr_prio_pfc_map[1] = cmd.rsp.arg[k++]; + each->prio_pg_map[0] = cmd.rsp.arg[k++]; + each->prio_pg_map[1] = cmd.rsp.arg[k++]; + each->pg_bw_map[0] = cmd.rsp.arg[k++]; + each->pg_bw_map[1] = cmd.rsp.arg[k++]; + each->pg_tsa_map[0] = cmd.rsp.arg[k++]; + each->pg_tsa_map[1] = cmd.rsp.arg[k++]; + val = each->hdr_prio_pfc_map[0]; + + max_app = qlcnic_dcb_get_num_app(adapter, val); + for (i = 0; i < max_app; i++) + each->app[i] = cmd.rsp.arg[i + k]; + + size = 16 * sizeof(u32); + memcpy(p, &each->hdr_prio_pfc_map[0], size); + p += size; + if (j == 0) + k = 18; + else + k = 34; + } +out: + qlcnic_free_mbx_args(&cmd); + + return err; +} + +static int qlcnic_83xx_dcb_get_cee_cfg(struct qlcnic_dcb *dcb) +{ + int err; + + err = qlcnic_dcb_query_cee_param(dcb, (char *)dcb->param, 0); + if (err) + return err; + + qlcnic_dcb_data_cee_param_map(dcb->adapter); + + return err; +} + +static void qlcnic_83xx_dcb_aen_handler(struct qlcnic_dcb *dcb, void *data) +{ + u32 *val = data; + + if (test_and_set_bit(QLCNIC_DCB_AEN_MODE, &dcb->state)) + return; + + if (*val & BIT_8) + set_bit(QLCNIC_DCB_STATE, &dcb->state); + else + clear_bit(QLCNIC_DCB_STATE, &dcb->state); + + queue_delayed_work(dcb->wq, &dcb->aen_work, 0); +} + +static void qlcnic_dcb_fill_cee_tc_params(struct qlcnic_dcb_mbx_params *mbx, + struct qlcnic_dcb_param *each, + struct qlcnic_dcb_cee *type) +{ + struct qlcnic_dcb_tc_cfg *tc_cfg; + u8 i, tc, pgid; + + for (i = 0; i < QLC_DCB_MAX_PRIO; i++) { + tc = QLC_DCB_GET_TC_PRIO(mbx->prio_tc_map, i); + tc_cfg = &type->tc_cfg[tc]; + tc_cfg->valid = true; + tc_cfg->up_tc_map |= QLC_DCB_GET_MAP(i); + + if (QLC_DCB_GET_PFC_PRIO(each->hdr_prio_pfc_map[1], i) && + type->pfc_mode_enable) { + tc_cfg->prio_cfg[i].valid = true; + tc_cfg->prio_cfg[i].pfc_type = QLC_PFC_FULL; + } + + if (i < 4) + pgid = QLC_DCB_GET_PGID_PRIO(each->prio_pg_map[0], i); + else + pgid = QLC_DCB_GET_PGID_PRIO(each->prio_pg_map[1], i); + + tc_cfg->pgid = pgid; + + tc_cfg->prio_type = QLC_PRIO_LINK; + type->pg_cfg[tc_cfg->pgid].prio_count++; + } +} + +static void qlcnic_dcb_fill_cee_pg_params(struct qlcnic_dcb_param *each, + struct qlcnic_dcb_cee *type) +{ + struct qlcnic_dcb_pg_cfg *pg_cfg; + u8 i, tsa, bw_per; + + for (i = 0; i < QLC_DCB_MAX_PG; i++) { + pg_cfg = &type->pg_cfg[i]; + pg_cfg->valid = true; + + if (i < 4) { + bw_per = QLC_DCB_GET_BWPER_PG(each->pg_bw_map[0], i); + tsa = QLC_DCB_GET_TSA_PG(each->pg_tsa_map[0], 
i); + } else { + bw_per = QLC_DCB_GET_BWPER_PG(each->pg_bw_map[1], i); + tsa = QLC_DCB_GET_TSA_PG(each->pg_tsa_map[1], i); + } + + pg_cfg->total_bw_percent = bw_per; + pg_cfg->tsa_type = tsa; + } +} + +static void +qlcnic_dcb_fill_cee_app_params(struct qlcnic_adapter *adapter, u8 idx, + struct qlcnic_dcb_param *each, + struct qlcnic_dcb_cee *type) +{ + struct qlcnic_dcb_app *app; + u8 i, num_app, map, cnt; + struct dcb_app new_app; + + num_app = qlcnic_dcb_get_num_app(adapter, each->hdr_prio_pfc_map[0]); + for (i = 0; i < num_app; i++) { + app = &type->app[i]; + app->valid = true; + + /* Only for CEE (-1) */ + app->selector = QLC_DCB_GET_SELECTOR_APP(each->app[i]) - 1; + new_app.selector = app->selector; + app->protocol = QLC_DCB_GET_PROTO_ID_APP(each->app[i]); + new_app.protocol = app->protocol; + map = qlcnic_dcb_get_prio_map_app(adapter, each->app[i]); + cnt = qlcnic_dcb_prio_count(map); + + if (cnt >= QLC_DCB_MAX_TC) + cnt = 0; + + app->priority = cnt; + new_app.priority = cnt; + + if (idx == QLC_DCB_OPER_IDX && adapter->netdev->dcbnl_ops) + dcb_setapp(adapter->netdev, &new_app); + } +} + +static void qlcnic_dcb_map_cee_params(struct qlcnic_adapter *adapter, u8 idx) +{ + struct qlcnic_dcb_mbx_params *mbx = adapter->dcb->param; + struct qlcnic_dcb_param *each = &mbx->type[idx]; + struct qlcnic_dcb_cfg *cfg = adapter->dcb->cfg; + struct qlcnic_dcb_cee *type = &cfg->type[idx]; + + type->tc_param_valid = false; + type->pfc_mode_enable = false; + memset(type->tc_cfg, 0, + sizeof(struct qlcnic_dcb_tc_cfg) * QLC_DCB_MAX_TC); + memset(type->pg_cfg, 0, + sizeof(struct qlcnic_dcb_pg_cfg) * QLC_DCB_MAX_TC); + + if (qlcnic_dcb_pfc_hdr_valid(adapter, each->hdr_prio_pfc_map[0]) && + cfg->capability.max_pfc_tc) + type->pfc_mode_enable = true; + + if (qlcnic_dcb_tsa_hdr_valid(adapter, each->hdr_prio_pfc_map[0]) && + cfg->capability.max_ets_tc) + type->tc_param_valid = true; + + qlcnic_dcb_fill_cee_tc_params(mbx, each, type); + qlcnic_dcb_fill_cee_pg_params(each, type); + qlcnic_dcb_fill_cee_app_params(adapter, idx, each, type); +} + +static void qlcnic_dcb_data_cee_param_map(struct qlcnic_adapter *adapter) +{ + int i; + + for (i = 0; i < QLC_DCB_NUM_PARAM; i++) + qlcnic_dcb_map_cee_params(adapter, i); + + dcbnl_cee_notify(adapter->netdev, RTM_GETDCB, DCB_CMD_CEE_GET, 0, 0); +} + +static u8 qlcnic_dcb_get_state(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + + return test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state); +} + +static void qlcnic_dcb_get_perm_hw_addr(struct net_device *netdev, u8 *addr) +{ + memcpy(addr, netdev->perm_addr, netdev->addr_len); +} + +static void +qlcnic_dcb_get_pg_tc_cfg_tx(struct net_device *netdev, int tc, u8 *prio, + u8 *pgid, u8 *bw_per, u8 *up_tc_map) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_tc_cfg *tc_cfg, *temp; + struct qlcnic_dcb_cee *type; + u8 i, cnt, pg; + + type = &adapter->dcb->cfg->type[QLC_DCB_OPER_IDX]; + *prio = *pgid = *bw_per = *up_tc_map = 0; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state) || + !type->tc_param_valid) + return; + + if (tc < 0 || (tc >= QLC_DCB_MAX_TC)) + return; + + tc_cfg = &type->tc_cfg[tc]; + if (!tc_cfg->valid) + return; + + *pgid = tc_cfg->pgid; + *prio = tc_cfg->prio_type; + *up_tc_map = tc_cfg->up_tc_map; + pg = *pgid; + + for (i = 0, cnt = 0; i < QLC_DCB_MAX_TC; i++) { + temp = &type->tc_cfg[i]; + if (temp->valid && (pg == temp->pgid)) + cnt++; + } + + tc_cfg->bwg_percent = (100 / cnt); + *bw_per = tc_cfg->bwg_percent; +} + +static void 
qlcnic_dcb_get_pg_bwg_cfg_tx(struct net_device *netdev, int pgid, + u8 *bw_pct) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_pg_cfg *pgcfg; + struct qlcnic_dcb_cee *type; + + *bw_pct = 0; + type = &adapter->dcb->cfg->type[QLC_DCB_OPER_IDX]; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state) || + !type->tc_param_valid) + return; + + if (pgid < 0 || pgid >= QLC_DCB_MAX_PG) + return; + + pgcfg = &type->pg_cfg[pgid]; + if (!pgcfg->valid) + return; + + *bw_pct = pgcfg->total_bw_percent; +} + +static void qlcnic_dcb_get_pfc_cfg(struct net_device *netdev, int prio, + u8 *setting) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_tc_cfg *tc_cfg; + u8 val = QLC_DCB_GET_MAP(prio); + struct qlcnic_dcb_cee *type; + u8 i; + + *setting = 0; + type = &adapter->dcb->cfg->type[QLC_DCB_OPER_IDX]; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state) || + !type->pfc_mode_enable) + return; + + for (i = 0; i < QLC_DCB_MAX_TC; i++) { + tc_cfg = &type->tc_cfg[i]; + if (!tc_cfg->valid) + continue; + + if ((val & tc_cfg->up_tc_map) && (tc_cfg->prio_cfg[prio].valid)) + *setting = tc_cfg->prio_cfg[prio].pfc_type; + } +} + +static u8 qlcnic_dcb_get_capability(struct net_device *netdev, int capid, + u8 *cap) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state)) + return 0; + + switch (capid) { + case DCB_CAP_ATTR_PG: + case DCB_CAP_ATTR_UP2TC: + case DCB_CAP_ATTR_PFC: + case DCB_CAP_ATTR_GSP: + *cap = true; + break; + case DCB_CAP_ATTR_PG_TCS: + case DCB_CAP_ATTR_PFC_TCS: + *cap = 0x80; /* 8 priorities for PGs */ + break; + case DCB_CAP_ATTR_DCBX: + *cap = adapter->dcb->cfg->capability.dcb_capability; + break; + default: + *cap = false; + } + + return 0; +} + +static int qlcnic_dcb_get_num_tcs(struct net_device *netdev, int attr, u8 *num) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cfg *cfg = adapter->dcb->cfg; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state)) + return -EINVAL; + + switch (attr) { + case DCB_NUMTCS_ATTR_PG: + *num = cfg->capability.max_ets_tc; + return 0; + case DCB_NUMTCS_ATTR_PFC: + *num = cfg->capability.max_pfc_tc; + return 0; + default: + return -EINVAL; + } +} + +static int qlcnic_dcb_get_app(struct net_device *netdev, u8 idtype, u16 id) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct dcb_app app = { + .selector = idtype, + .protocol = id, + }; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state)) + return -EINVAL; + + return dcb_getapp(netdev, &app); +} + +static u8 qlcnic_dcb_get_pfc_state(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb *dcb = adapter->dcb; + + if (!test_bit(QLCNIC_DCB_STATE, &dcb->state)) + return 0; + + return dcb->cfg->type[QLC_DCB_OPER_IDX].pfc_mode_enable; +} + +static u8 qlcnic_dcb_get_dcbx(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cfg *cfg = adapter->dcb->cfg; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state)) + return 0; + + return cfg->capability.dcb_capability; +} + +static u8 qlcnic_dcb_get_feat_cfg(struct net_device *netdev, int fid, u8 *flag) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cee *type; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state)) + return 1; + + type = &adapter->dcb->cfg->type[QLC_DCB_OPER_IDX]; + *flag = 0; + + switch (fid) { + case DCB_FEATCFG_ATTR_PG: + if 
(type->tc_param_valid) + *flag |= DCB_FEATCFG_ENABLE; + else + *flag |= DCB_FEATCFG_ERROR; + break; + case DCB_FEATCFG_ATTR_PFC: + if (type->pfc_mode_enable) { + if (type->tc_cfg[0].prio_cfg[0].pfc_type) + *flag |= DCB_FEATCFG_ENABLE; + } else { + *flag |= DCB_FEATCFG_ERROR; + } + break; + case DCB_FEATCFG_ATTR_APP: + *flag |= DCB_FEATCFG_ENABLE; + break; + default: + netdev_err(netdev, "Invalid Feature ID %d\n", fid); + return 1; + } + + return 0; +} + +static inline void +qlcnic_dcb_get_pg_tc_cfg_rx(struct net_device *netdev, int prio, u8 *prio_type, + u8 *pgid, u8 *bw_pct, u8 *up_map) +{ + *prio_type = *pgid = *bw_pct = *up_map = 0; +} + +static inline void +qlcnic_dcb_get_pg_bwg_cfg_rx(struct net_device *netdev, int pgid, u8 *bw_pct) +{ + *bw_pct = 0; +} + +static int qlcnic_dcb_peer_app_info(struct net_device *netdev, + struct dcb_peer_app_info *info, + u16 *app_count) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cee *peer; + int i; + + memset(info, 0, sizeof(*info)); + *app_count = 0; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state)) + return 0; + + peer = &adapter->dcb->cfg->type[QLC_DCB_PEER_IDX]; + + for (i = 0; i < QLC_DCB_MAX_APP; i++) { + if (peer->app[i].valid) + (*app_count)++; + } + + return 0; +} + +static int qlcnic_dcb_peer_app_table(struct net_device *netdev, + struct dcb_app *table) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cee *peer; + struct qlcnic_dcb_app *app; + int i, j; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state)) + return 0; + + peer = &adapter->dcb->cfg->type[QLC_DCB_PEER_IDX]; + + for (i = 0, j = 0; i < QLC_DCB_MAX_APP; i++) { + app = &peer->app[i]; + if (!app->valid) + continue; + + table[j].selector = app->selector; + table[j].priority = app->priority; + table[j++].protocol = app->protocol; + } + + return 0; +} + +static int qlcnic_dcb_cee_peer_get_pg(struct net_device *netdev, + struct cee_pg *pg) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cee *peer; + u8 i, j, k, map; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state)) + return 0; + + peer = &adapter->dcb->cfg->type[QLC_DCB_PEER_IDX]; + + for (i = 0, j = 0; i < QLC_DCB_MAX_PG; i++) { + if (!peer->pg_cfg[i].valid) + continue; + + pg->pg_bw[j] = peer->pg_cfg[i].total_bw_percent; + + for (k = 0; k < QLC_DCB_MAX_TC; k++) { + if (peer->tc_cfg[i].valid && + (peer->tc_cfg[i].pgid == i)) { + map = peer->tc_cfg[i].up_tc_map; + pg->prio_pg[j++] = map; + break; + } + } + } + + return 0; +} + +static int qlcnic_dcb_cee_peer_get_pfc(struct net_device *netdev, + struct cee_pfc *pfc) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_dcb_cfg *cfg = adapter->dcb->cfg; + struct qlcnic_dcb_tc_cfg *tc; + struct qlcnic_dcb_cee *peer; + u8 i, setting, prio; + + pfc->pfc_en = 0; + + if (!test_bit(QLCNIC_DCB_STATE, &adapter->dcb->state)) + return 0; + + peer = &cfg->type[QLC_DCB_PEER_IDX]; + + for (i = 0; i < QLC_DCB_MAX_TC; i++) { + tc = &peer->tc_cfg[i]; + prio = qlcnic_dcb_prio_count(tc->up_tc_map); + + setting = 0; + qlcnic_dcb_get_pfc_cfg(netdev, prio, &setting); + if (setting) + pfc->pfc_en |= QLC_DCB_GET_MAP(i); + } + + pfc->tcs_supported = cfg->capability.max_pfc_tc; + + return 0; +} + +static const struct dcbnl_rtnl_ops qlcnic_dcbnl_ops = { + .getstate = qlcnic_dcb_get_state, + .getpermhwaddr = qlcnic_dcb_get_perm_hw_addr, + .getpgtccfgtx = qlcnic_dcb_get_pg_tc_cfg_tx, + .getpgbwgcfgtx = qlcnic_dcb_get_pg_bwg_cfg_tx, + .getpfccfg = qlcnic_dcb_get_pfc_cfg, + 
.getcap = qlcnic_dcb_get_capability, + .getnumtcs = qlcnic_dcb_get_num_tcs, + .getapp = qlcnic_dcb_get_app, + .getpfcstate = qlcnic_dcb_get_pfc_state, + .getdcbx = qlcnic_dcb_get_dcbx, + .getfeatcfg = qlcnic_dcb_get_feat_cfg, + + .getpgtccfgrx = qlcnic_dcb_get_pg_tc_cfg_rx, + .getpgbwgcfgrx = qlcnic_dcb_get_pg_bwg_cfg_rx, + + .peer_getappinfo = qlcnic_dcb_peer_app_info, + .peer_getapptable = qlcnic_dcb_peer_app_table, + .cee_peer_getpg = qlcnic_dcb_cee_peer_get_pg, + .cee_peer_getpfc = qlcnic_dcb_cee_peer_get_pfc, +}; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h new file mode 100644 index 0000000..f4aa633 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h @@ -0,0 +1,121 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#ifndef __QLCNIC_DCBX_H +#define __QLCNIC_DCBX_H + +#define QLCNIC_DCB_STATE 0 +#define QLCNIC_DCB_AEN_MODE 1 + +#ifdef CONFIG_QLCNIC_DCB +int qlcnic_register_dcb(struct qlcnic_adapter *); +#else +static inline int qlcnic_register_dcb(struct qlcnic_adapter *adapter) +{ return 0; } +#endif + +struct qlcnic_dcb; + +struct qlcnic_dcb_ops { + int (*query_hw_capability) (struct qlcnic_dcb *, char *); + int (*get_hw_capability) (struct qlcnic_dcb *); + int (*query_cee_param) (struct qlcnic_dcb *, char *, u8); + void (*init_dcbnl_ops) (struct qlcnic_dcb *); + void (*aen_handler) (struct qlcnic_dcb *, void *); + int (*get_cee_cfg) (struct qlcnic_dcb *); + void (*get_info) (struct qlcnic_dcb *); + int (*attach) (struct qlcnic_dcb *); + void (*free) (struct qlcnic_dcb *); +}; + +struct qlcnic_dcb { + struct qlcnic_dcb_mbx_params *param; + struct qlcnic_adapter *adapter; + struct delayed_work aen_work; + struct workqueue_struct *wq; + const struct qlcnic_dcb_ops *ops; + struct qlcnic_dcb_cfg *cfg; + unsigned long state; +}; + +static inline void qlcnic_clear_dcb_ops(struct qlcnic_dcb *dcb) +{ + kfree(dcb); +} + +static inline int qlcnic_dcb_get_hw_capability(struct qlcnic_dcb *dcb) +{ + if (dcb && dcb->ops->get_hw_capability) + return dcb->ops->get_hw_capability(dcb); + + return 0; +} + +static inline void qlcnic_dcb_free(struct qlcnic_dcb *dcb) +{ + if (dcb && dcb->ops->free) + dcb->ops->free(dcb); +} + +static inline int qlcnic_dcb_attach(struct qlcnic_dcb *dcb) +{ + if (dcb && dcb->ops->attach) + return dcb->ops->attach(dcb); + + return 0; +} + +static inline int +qlcnic_dcb_query_hw_capability(struct qlcnic_dcb *dcb, char *buf) +{ + if (dcb && dcb->ops->query_hw_capability) + return dcb->ops->query_hw_capability(dcb, buf); + + return 0; +} + +static inline void qlcnic_dcb_get_info(struct qlcnic_dcb *dcb) +{ + if (dcb && dcb->ops->get_info) + dcb->ops->get_info(dcb); +} + +static inline int +qlcnic_dcb_query_cee_param(struct qlcnic_dcb *dcb, char *buf, u8 type) +{ + if (dcb && dcb->ops->query_cee_param) + return dcb->ops->query_cee_param(dcb, buf, type); + + return 0; +} + +static inline int qlcnic_dcb_get_cee_cfg(struct qlcnic_dcb *dcb) +{ + if (dcb && dcb->ops->get_cee_cfg) + return dcb->ops->get_cee_cfg(dcb); + + return 0; +} + +static inline void qlcnic_dcb_aen_handler(struct qlcnic_dcb *dcb, void *msg) +{ + if (dcb && dcb->ops->aen_handler) + dcb->ops->aen_handler(dcb, msg); +} + +static inline void qlcnic_dcb_init_dcbnl_ops(struct qlcnic_dcb *dcb) +{ + if (dcb && dcb->ops->init_dcbnl_ops) + dcb->ops->init_dcbnl_ops(dcb); +} + +static inline void qlcnic_dcb_enable(struct qlcnic_dcb *dcb) +{ + 
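+ /*
+  * All of the qlcnic_dcb_*() inline wrappers in this header check both
+  * the dcb pointer and the individual ops hook before dispatching, so
+  * callers never need their own guards: on an SR-IOV VF, for example,
+  * qlcnic_register_dcb() returns without allocating adapter->dcb and
+  * every wrapper then degrades to a no-op.  A minimal illustrative
+  * calling sequence (hypothetical; the real call sites live elsewhere
+  * in the driver) would be:
+  *
+  *	qlcnic_register_dcb(adapter);      (no-op on a VF)
+  *	qlcnic_dcb_enable(adapter->dcb);   (NULL-safe)
+  *	qlcnic_dcb_get_info(adapter->dcb); (NULL-safe)
+  */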
if (dcb && qlcnic_dcb_attach(dcb)) + qlcnic_clear_dcb_ops(dcb); +} +#endif diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c new file mode 100644 index 0000000..7f7deea --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c @@ -0,0 +1,1892 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#include +#include +#include +#include +#include +#include + +#include "qlcnic.h" + +struct qlcnic_stats { + char stat_string[ETH_GSTRING_LEN]; + int sizeof_stat; + int stat_offset; +}; + +#define QLC_SIZEOF(m) FIELD_SIZEOF(struct qlcnic_adapter, m) +#define QLC_OFF(m) offsetof(struct qlcnic_adapter, m) +static const u32 qlcnic_fw_dump_level[] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff +}; + +static const struct qlcnic_stats qlcnic_gstrings_stats[] = { + {"xmit_on", QLC_SIZEOF(stats.xmit_on), QLC_OFF(stats.xmit_on)}, + {"xmit_off", QLC_SIZEOF(stats.xmit_off), QLC_OFF(stats.xmit_off)}, + {"xmit_called", QLC_SIZEOF(stats.xmitcalled), + QLC_OFF(stats.xmitcalled)}, + {"xmit_finished", QLC_SIZEOF(stats.xmitfinished), + QLC_OFF(stats.xmitfinished)}, + {"tx dma map error", QLC_SIZEOF(stats.tx_dma_map_error), + QLC_OFF(stats.tx_dma_map_error)}, + {"tx_bytes", QLC_SIZEOF(stats.txbytes), QLC_OFF(stats.txbytes)}, + {"tx_dropped", QLC_SIZEOF(stats.txdropped), QLC_OFF(stats.txdropped)}, + {"rx dma map error", QLC_SIZEOF(stats.rx_dma_map_error), + QLC_OFF(stats.rx_dma_map_error)}, + {"rx_pkts", QLC_SIZEOF(stats.rx_pkts), QLC_OFF(stats.rx_pkts)}, + {"rx_bytes", QLC_SIZEOF(stats.rxbytes), QLC_OFF(stats.rxbytes)}, + {"rx_dropped", QLC_SIZEOF(stats.rxdropped), QLC_OFF(stats.rxdropped)}, + {"null rxbuf", QLC_SIZEOF(stats.null_rxbuf), QLC_OFF(stats.null_rxbuf)}, + {"csummed", QLC_SIZEOF(stats.csummed), QLC_OFF(stats.csummed)}, + {"lro_pkts", QLC_SIZEOF(stats.lro_pkts), QLC_OFF(stats.lro_pkts)}, + {"lrobytes", QLC_SIZEOF(stats.lrobytes), QLC_OFF(stats.lrobytes)}, + {"lso_frames", QLC_SIZEOF(stats.lso_frames), QLC_OFF(stats.lso_frames)}, + {"encap_lso_frames", QLC_SIZEOF(stats.encap_lso_frames), + QLC_OFF(stats.encap_lso_frames)}, + {"encap_tx_csummed", QLC_SIZEOF(stats.encap_tx_csummed), + QLC_OFF(stats.encap_tx_csummed)}, + {"encap_rx_csummed", QLC_SIZEOF(stats.encap_rx_csummed), + QLC_OFF(stats.encap_rx_csummed)}, + {"skb_alloc_failure", QLC_SIZEOF(stats.skb_alloc_failure), + QLC_OFF(stats.skb_alloc_failure)}, + {"mac_filter_limit_overrun", QLC_SIZEOF(stats.mac_filter_limit_overrun), + QLC_OFF(stats.mac_filter_limit_overrun)}, + {"spurious intr", QLC_SIZEOF(stats.spurious_intr), + QLC_OFF(stats.spurious_intr)}, + {"mbx spurious intr", QLC_SIZEOF(stats.mbx_spurious_intr), + QLC_OFF(stats.mbx_spurious_intr)}, +}; + +static const char qlcnic_device_gstrings_stats[][ETH_GSTRING_LEN] = { + "tx unicast frames", + "tx multicast frames", + "tx broadcast frames", + "tx dropped frames", + "tx errors", + "tx local frames", + "tx numbytes", + "rx unicast frames", + "rx multicast frames", + "rx broadcast frames", + "rx dropped frames", + "rx errors", + "rx local frames", + "rx numbytes", +}; + +static const char qlcnic_83xx_tx_stats_strings[][ETH_GSTRING_LEN] = { + "ctx_tx_bytes", + "ctx_tx_pkts", + "ctx_tx_errors", + "ctx_tx_dropped_pkts", + "ctx_tx_num_buffers", +}; + +static const char qlcnic_83xx_mac_stats_strings[][ETH_GSTRING_LEN] = { + "mac_tx_frames", + "mac_tx_bytes", + "mac_tx_mcast_pkts", + "mac_tx_bcast_pkts", + "mac_tx_pause_cnt", + 
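+ /*
+  * Note on the stats tables in this file: qlcnic_gstrings_stats above
+  * records each ethtool counter's size and byte offset inside struct
+  * qlcnic_adapter via QLC_SIZEOF()/QLC_OFF(), so the dump path can walk
+  * the table generically instead of naming every field.  A minimal
+  * sketch of that pattern (illustrative only, not the exact fill
+  * routine used later in this file):
+  *
+  *	char *base = (char *)adapter;
+  *
+  *	for (i = 0; i < QLCNIC_STATS_LEN; i++) {
+  *		int off = qlcnic_gstrings_stats[i].stat_offset;
+  *		int len = qlcnic_gstrings_stats[i].sizeof_stat;
+  *
+  *		*data++ = (len == sizeof(u64)) ? *(u64 *)(base + off)
+  *					       : *(u32 *)(base + off);
+  *	}
+  *
+  * QLCNIC_STATS_LEN itself is defined a little further down.
+  */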
"mac_tx_ctrl_pkt", + "mac_tx_lt_64b_pkts", + "mac_tx_lt_127b_pkts", + "mac_tx_lt_255b_pkts", + "mac_tx_lt_511b_pkts", + "mac_tx_lt_1023b_pkts", + "mac_tx_lt_1518b_pkts", + "mac_tx_gt_1518b_pkts", + "mac_rx_frames", + "mac_rx_bytes", + "mac_rx_mcast_pkts", + "mac_rx_bcast_pkts", + "mac_rx_pause_cnt", + "mac_rx_ctrl_pkt", + "mac_rx_lt_64b_pkts", + "mac_rx_lt_127b_pkts", + "mac_rx_lt_255b_pkts", + "mac_rx_lt_511b_pkts", + "mac_rx_lt_1023b_pkts", + "mac_rx_lt_1518b_pkts", + "mac_rx_gt_1518b_pkts", + "mac_rx_length_error", + "mac_rx_length_small", + "mac_rx_length_large", + "mac_rx_jabber", + "mac_rx_dropped", + "mac_crc_error", + "mac_align_error", + "eswitch_frames", + "eswitch_bytes", + "eswitch_multicast_frames", + "eswitch_broadcast_frames", + "eswitch_unicast_frames", + "eswitch_error_free_frames", + "eswitch_error_free_bytes", +}; + +#define QLCNIC_STATS_LEN ARRAY_SIZE(qlcnic_gstrings_stats) + +static const char qlcnic_tx_queue_stats_strings[][ETH_GSTRING_LEN] = { + "xmit_on", + "xmit_off", + "xmit_called", + "xmit_finished", + "tx_bytes", +}; + +#define QLCNIC_TX_STATS_LEN ARRAY_SIZE(qlcnic_tx_queue_stats_strings) + +static const char qlcnic_83xx_rx_stats_strings[][ETH_GSTRING_LEN] = { + "ctx_rx_bytes", + "ctx_rx_pkts", + "ctx_lro_pkt_cnt", + "ctx_ip_csum_error", + "ctx_rx_pkts_wo_ctx", + "ctx_rx_pkts_drop_wo_sds_on_card", + "ctx_rx_pkts_drop_wo_sds_on_host", + "ctx_rx_osized_pkts", + "ctx_rx_pkts_dropped_wo_rds", + "ctx_rx_unexpected_mcast_pkts", + "ctx_invalid_mac_address", + "ctx_rx_rds_ring_prim_attempted", + "ctx_rx_rds_ring_prim_success", + "ctx_num_lro_flows_added", + "ctx_num_lro_flows_removed", + "ctx_num_lro_flows_active", + "ctx_pkts_dropped_unknown", +}; + +static const char qlcnic_gstrings_test[][ETH_GSTRING_LEN] = { + "Register_Test_on_offline", + "Link_Test_on_offline", + "Interrupt_Test_offline", + "Internal_Loopback_offline", + "External_Loopback_offline", + "EEPROM_Test_offline" +}; + +#define QLCNIC_TEST_LEN ARRAY_SIZE(qlcnic_gstrings_test) + +static inline int qlcnic_82xx_statistics(struct qlcnic_adapter *adapter) +{ + return ARRAY_SIZE(qlcnic_gstrings_stats) + + ARRAY_SIZE(qlcnic_83xx_mac_stats_strings) + + QLCNIC_TX_STATS_LEN * adapter->drv_tx_rings; +} + +static inline int qlcnic_83xx_statistics(struct qlcnic_adapter *adapter) +{ + return ARRAY_SIZE(qlcnic_gstrings_stats) + + ARRAY_SIZE(qlcnic_83xx_tx_stats_strings) + + ARRAY_SIZE(qlcnic_83xx_mac_stats_strings) + + ARRAY_SIZE(qlcnic_83xx_rx_stats_strings) + + QLCNIC_TX_STATS_LEN * adapter->drv_tx_rings; +} + +static int qlcnic_dev_statistics_len(struct qlcnic_adapter *adapter) +{ + int len = -1; + + if (qlcnic_82xx_check(adapter)) { + len = qlcnic_82xx_statistics(adapter); + if (adapter->flags & QLCNIC_ESWITCH_ENABLED) + len += ARRAY_SIZE(qlcnic_device_gstrings_stats); + } else if (qlcnic_83xx_check(adapter)) { + len = qlcnic_83xx_statistics(adapter); + } + + return len; +} + +#define QLCNIC_TX_INTR_NOT_CONFIGURED 0X78563412 + +#define QLCNIC_MAX_EEPROM_LEN 1024 + +static const u32 diag_registers[] = { + QLCNIC_CMDPEG_STATE, + QLCNIC_RCVPEG_STATE, + QLCNIC_FW_CAPABILITIES, + QLCNIC_CRB_DRV_ACTIVE, + QLCNIC_CRB_DEV_STATE, + QLCNIC_CRB_DRV_STATE, + QLCNIC_CRB_DRV_SCRATCH, + QLCNIC_CRB_DEV_PARTITION_INFO, + QLCNIC_CRB_DRV_IDC_VER, + QLCNIC_PEG_ALIVE_COUNTER, + QLCNIC_PEG_HALT_STATUS1, + QLCNIC_PEG_HALT_STATUS2, + -1 +}; + + +static const u32 ext_diag_registers[] = { + CRB_XG_STATE_P3P, + ISR_INT_STATE_REG, + QLCNIC_CRB_PEG_NET_0+0x3c, + QLCNIC_CRB_PEG_NET_1+0x3c, + QLCNIC_CRB_PEG_NET_2+0x3c, + 
QLCNIC_CRB_PEG_NET_4+0x3c, + -1 +}; + +#define QLCNIC_MGMT_API_VERSION 3 +#define QLCNIC_ETHTOOL_REGS_VER 4 + +static inline int qlcnic_get_ring_regs_len(struct qlcnic_adapter *adapter) +{ + int ring_regs_cnt = (adapter->drv_tx_rings * 5) + + (adapter->max_rds_rings * 2) + + (adapter->drv_sds_rings * 3) + 5; + return ring_regs_cnt * sizeof(u32); +} + +static int qlcnic_get_regs_len(struct net_device *dev) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + u32 len; + + if (qlcnic_83xx_check(adapter)) + len = qlcnic_83xx_get_regs_len(adapter); + else + len = sizeof(ext_diag_registers) + sizeof(diag_registers); + + len += ((QLCNIC_DEV_INFO_SIZE + 2) * sizeof(u32)); + len += qlcnic_get_ring_regs_len(adapter); + return len; +} + +static int qlcnic_get_eeprom_len(struct net_device *dev) +{ + return QLCNIC_FLASH_TOTAL_SIZE; +} + +static void +qlcnic_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + u32 fw_major, fw_minor, fw_build; + fw_major = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_MAJOR); + fw_minor = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_MINOR); + fw_build = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_SUB); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", fw_major, fw_minor, fw_build); + + strlcpy(drvinfo->bus_info, pci_name(adapter->pdev), + sizeof(drvinfo->bus_info)); + strlcpy(drvinfo->driver, qlcnic_driver_name, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, QLCNIC_LINUX_VERSIONID, + sizeof(drvinfo->version)); +} + +static int qlcnic_82xx_get_link_ksettings(struct qlcnic_adapter *adapter, + struct ethtool_link_ksettings *ecmd) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u32 speed, reg; + int check_sfp_module = 0, err = 0; + u16 pcifn = ahw->pci_func; + u32 supported, advertising; + + /* read which mode */ + if (adapter->ahw->port_type == QLCNIC_GBE) { + supported = (SUPPORTED_10baseT_Half | + SUPPORTED_10baseT_Full | + SUPPORTED_100baseT_Half | + SUPPORTED_100baseT_Full | + SUPPORTED_1000baseT_Half | + SUPPORTED_1000baseT_Full); + + advertising = (ADVERTISED_100baseT_Half | + ADVERTISED_100baseT_Full | + ADVERTISED_1000baseT_Half | + ADVERTISED_1000baseT_Full); + + ecmd->base.speed = adapter->ahw->link_speed; + ecmd->base.duplex = adapter->ahw->link_duplex; + ecmd->base.autoneg = adapter->ahw->link_autoneg; + + } else if (adapter->ahw->port_type == QLCNIC_XGBE) { + u32 val = 0; + val = QLCRD32(adapter, QLCNIC_PORT_MODE_ADDR, &err); + + if (val == QLCNIC_PORT_MODE_802_3_AP) { + supported = SUPPORTED_1000baseT_Full; + advertising = ADVERTISED_1000baseT_Full; + } else { + supported = SUPPORTED_10000baseT_Full; + advertising = ADVERTISED_10000baseT_Full; + } + + if (netif_running(adapter->netdev) && ahw->has_link_events) { + if (ahw->linkup) { + reg = QLCRD32(adapter, + P3P_LINK_SPEED_REG(pcifn), &err); + speed = P3P_LINK_SPEED_VAL(pcifn, reg); + ahw->link_speed = speed * P3P_LINK_SPEED_MHZ; + } + + ecmd->base.speed = ahw->link_speed; + ecmd->base.autoneg = ahw->link_autoneg; + ecmd->base.duplex = ahw->link_duplex; + goto skip; + } + + ecmd->base.speed = SPEED_UNKNOWN; + ecmd->base.duplex = DUPLEX_UNKNOWN; + ecmd->base.autoneg = AUTONEG_DISABLE; + } else + return -EIO; + +skip: + ecmd->base.phy_address = adapter->ahw->physical_port; + + switch (adapter->ahw->board_type) { + case QLCNIC_BRDTYPE_P3P_REF_QG: + case QLCNIC_BRDTYPE_P3P_4_GB: + case QLCNIC_BRDTYPE_P3P_4_GB_MM: + + supported |= SUPPORTED_Autoneg; + advertising |= ADVERTISED_Autoneg; + 
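+		/*
+		 * Deliberate fall through: the reference/4GB boards add
+		 * autoneg above and then share the twisted-pair setup of
+		 * the CX4/10GBASE-T cases below.
+		 */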
case QLCNIC_BRDTYPE_P3P_10G_CX4: + case QLCNIC_BRDTYPE_P3P_10G_CX4_LP: + case QLCNIC_BRDTYPE_P3P_10000_BASE_T: + supported |= SUPPORTED_TP; + advertising |= ADVERTISED_TP; + ecmd->base.port = PORT_TP; + ecmd->base.autoneg = adapter->ahw->link_autoneg; + break; + case QLCNIC_BRDTYPE_P3P_IMEZ: + case QLCNIC_BRDTYPE_P3P_XG_LOM: + case QLCNIC_BRDTYPE_P3P_HMEZ: + supported |= SUPPORTED_MII; + advertising |= ADVERTISED_MII; + ecmd->base.port = PORT_MII; + ecmd->base.autoneg = AUTONEG_DISABLE; + break; + case QLCNIC_BRDTYPE_P3P_10G_SFP_PLUS: + case QLCNIC_BRDTYPE_P3P_10G_SFP_CT: + case QLCNIC_BRDTYPE_P3P_10G_SFP_QT: + advertising |= ADVERTISED_TP; + supported |= SUPPORTED_TP; + check_sfp_module = netif_running(adapter->netdev) && + ahw->has_link_events; + case QLCNIC_BRDTYPE_P3P_10G_XFP: + supported |= SUPPORTED_FIBRE; + advertising |= ADVERTISED_FIBRE; + ecmd->base.port = PORT_FIBRE; + ecmd->base.autoneg = AUTONEG_DISABLE; + break; + case QLCNIC_BRDTYPE_P3P_10G_TP: + if (adapter->ahw->port_type == QLCNIC_XGBE) { + ecmd->base.autoneg = AUTONEG_DISABLE; + supported |= (SUPPORTED_FIBRE | SUPPORTED_TP); + advertising |= + (ADVERTISED_FIBRE | ADVERTISED_TP); + ecmd->base.port = PORT_FIBRE; + check_sfp_module = netif_running(adapter->netdev) && + ahw->has_link_events; + } else { + ecmd->base.autoneg = AUTONEG_ENABLE; + supported |= (SUPPORTED_TP | SUPPORTED_Autoneg); + advertising |= + (ADVERTISED_TP | ADVERTISED_Autoneg); + ecmd->base.port = PORT_TP; + } + break; + default: + dev_err(&adapter->pdev->dev, "Unsupported board model %d\n", + adapter->ahw->board_type); + return -EIO; + } + + if (check_sfp_module) { + switch (adapter->ahw->module_type) { + case LINKEVENT_MODULE_OPTICAL_UNKNOWN: + case LINKEVENT_MODULE_OPTICAL_SRLR: + case LINKEVENT_MODULE_OPTICAL_LRM: + case LINKEVENT_MODULE_OPTICAL_SFP_1G: + ecmd->base.port = PORT_FIBRE; + break; + case LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLE: + case LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLELEN: + case LINKEVENT_MODULE_TWINAX: + ecmd->base.port = PORT_TP; + break; + default: + ecmd->base.port = PORT_OTHER; + } + } + + ethtool_convert_legacy_u32_to_link_mode(ecmd->link_modes.supported, + supported); + ethtool_convert_legacy_u32_to_link_mode(ecmd->link_modes.advertising, + advertising); + + return 0; +} + +static int qlcnic_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *ecmd) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + + if (qlcnic_82xx_check(adapter)) + return qlcnic_82xx_get_link_ksettings(adapter, ecmd); + else if (qlcnic_83xx_check(adapter)) + return qlcnic_83xx_get_link_ksettings(adapter, ecmd); + + return -EIO; +} + + +static int qlcnic_set_port_config(struct qlcnic_adapter *adapter, + const struct ethtool_link_ksettings *ecmd) +{ + u32 ret = 0, config = 0; + /* read which mode */ + if (ecmd->base.duplex) + config |= 0x1; + + if (ecmd->base.autoneg) + config |= 0x2; + + switch (ecmd->base.speed) { + case SPEED_10: + config |= (0 << 8); + break; + case SPEED_100: + config |= (1 << 8); + break; + case SPEED_1000: + config |= (10 << 8); + break; + default: + return -EIO; + } + + ret = qlcnic_fw_cmd_set_port(adapter, config); + + if (ret == QLCNIC_RCODE_NOT_SUPPORTED) + return -EOPNOTSUPP; + else if (ret) + return -EIO; + return ret; +} + +static int qlcnic_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *ecmd) +{ + u32 ret = 0; + struct qlcnic_adapter *adapter = netdev_priv(dev); + + if (qlcnic_83xx_check(adapter)) + qlcnic_83xx_get_port_type(adapter); + + if 
(adapter->ahw->port_type != QLCNIC_GBE) + return -EOPNOTSUPP; + + if (qlcnic_83xx_check(adapter)) + ret = qlcnic_83xx_set_link_ksettings(adapter, ecmd); + else + ret = qlcnic_set_port_config(adapter, ecmd); + + if (!ret) + return ret; + + adapter->ahw->link_speed = ecmd->base.speed; + adapter->ahw->link_duplex = ecmd->base.duplex; + adapter->ahw->link_autoneg = ecmd->base.autoneg; + + if (!netif_running(dev)) + return 0; + + dev->netdev_ops->ndo_stop(dev); + return dev->netdev_ops->ndo_open(dev); +} + +static int qlcnic_82xx_get_registers(struct qlcnic_adapter *adapter, + u32 *regs_buff) +{ + int i, j = 0, err = 0; + + for (i = QLCNIC_DEV_INFO_SIZE + 1; diag_registers[j] != -1; j++, i++) + regs_buff[i] = QLC_SHARED_REG_RD32(adapter, diag_registers[j]); + j = 0; + while (ext_diag_registers[j] != -1) + regs_buff[i++] = QLCRD32(adapter, ext_diag_registers[j++], + &err); + return i; +} + +static void +qlcnic_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *p) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_rds_ring *rds_rings; + struct qlcnic_host_tx_ring *tx_ring; + u32 *regs_buff = p; + int ring, i = 0; + + memset(p, 0, qlcnic_get_regs_len(dev)); + + regs->version = (QLCNIC_ETHTOOL_REGS_VER << 24) | + (adapter->ahw->revision_id << 16) | (adapter->pdev)->device; + + regs_buff[0] = (0xcafe0000 | (QLCNIC_DEV_INFO_SIZE & 0xffff)); + regs_buff[1] = QLCNIC_MGMT_API_VERSION; + + if (adapter->ahw->capabilities & QLC_83XX_ESWITCH_CAPABILITY) + regs_buff[2] = adapter->ahw->max_vnic_func; + + if (qlcnic_82xx_check(adapter)) + i = qlcnic_82xx_get_registers(adapter, regs_buff); + else + i = qlcnic_83xx_get_registers(adapter, regs_buff); + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) + return; + + /* Marker btw regs and TX ring count */ + regs_buff[i++] = 0xFFEFCDAB; + + regs_buff[i++] = adapter->drv_tx_rings; /* No. of TX ring */ + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + regs_buff[i++] = le32_to_cpu(*(tx_ring->hw_consumer)); + regs_buff[i++] = tx_ring->sw_consumer; + regs_buff[i++] = readl(tx_ring->crb_cmd_producer); + regs_buff[i++] = tx_ring->producer; + if (tx_ring->crb_intr_mask) + regs_buff[i++] = readl(tx_ring->crb_intr_mask); + else + regs_buff[i++] = QLCNIC_TX_INTR_NOT_CONFIGURED; + } + + regs_buff[i++] = adapter->max_rds_rings; /* No. of RX ring */ + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_rings = &recv_ctx->rds_rings[ring]; + regs_buff[i++] = readl(rds_rings->crb_rcv_producer); + regs_buff[i++] = rds_rings->producer; + } + + regs_buff[i++] = adapter->drv_sds_rings; /* No. of SDS ring */ + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &(recv_ctx->sds_rings[ring]); + regs_buff[i++] = readl(sds_ring->crb_sts_consumer); + regs_buff[i++] = sds_ring->consumer; + regs_buff[i++] = readl(sds_ring->crb_intr_mask); + } +} + +static u32 qlcnic_test_link(struct net_device *dev) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + int err = 0; + u32 val; + + if (qlcnic_83xx_check(adapter)) { + val = qlcnic_83xx_test_link(adapter); + return (val & 1) ? 0 : 1; + } + val = QLCRD32(adapter, CRB_XG_STATE_P3P, &err); + if (err == -EIO) + return err; + val = XG_LINK_STATE_P3P(adapter->ahw->pci_func, val); + return (val == XG_LINK_UP_P3P) ? 
0 : 1; +} + +static int +qlcnic_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, + u8 *bytes) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + int offset; + int ret = -1; + + if (qlcnic_83xx_check(adapter)) + return 0; + if (eeprom->len == 0) + return -EINVAL; + + eeprom->magic = (adapter->pdev)->vendor | + ((adapter->pdev)->device << 16); + offset = eeprom->offset; + + if (qlcnic_82xx_check(adapter)) + ret = qlcnic_rom_fast_read_words(adapter, offset, bytes, + eeprom->len); + if (ret < 0) + return ret; + + return 0; +} + +static void +qlcnic_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ring) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + + ring->rx_pending = adapter->num_rxd; + ring->rx_jumbo_pending = adapter->num_jumbo_rxd; + ring->tx_pending = adapter->num_txd; + + ring->rx_max_pending = adapter->max_rxd; + ring->rx_jumbo_max_pending = adapter->max_jumbo_rxd; + ring->tx_max_pending = MAX_CMD_DESCRIPTORS; +} + +static u32 +qlcnic_validate_ringparam(u32 val, u32 min, u32 max, char *r_name) +{ + u32 num_desc; + num_desc = max(val, min); + num_desc = min(num_desc, max); + num_desc = roundup_pow_of_two(num_desc); + + if (val != num_desc) { + printk(KERN_INFO "%s: setting %s ring size %d instead of %d\n", + qlcnic_driver_name, r_name, num_desc, val); + } + + return num_desc; +} + +static int +qlcnic_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ring) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + u16 num_rxd, num_jumbo_rxd, num_txd; + + if (ring->rx_mini_pending) + return -EOPNOTSUPP; + + num_rxd = qlcnic_validate_ringparam(ring->rx_pending, + MIN_RCV_DESCRIPTORS, adapter->max_rxd, "rx"); + + num_jumbo_rxd = qlcnic_validate_ringparam(ring->rx_jumbo_pending, + MIN_JUMBO_DESCRIPTORS, adapter->max_jumbo_rxd, + "rx jumbo"); + + num_txd = qlcnic_validate_ringparam(ring->tx_pending, + MIN_CMD_DESCRIPTORS, MAX_CMD_DESCRIPTORS, "tx"); + + if (num_rxd == adapter->num_rxd && num_txd == adapter->num_txd && + num_jumbo_rxd == adapter->num_jumbo_rxd) + return 0; + + adapter->num_rxd = num_rxd; + adapter->num_jumbo_rxd = num_jumbo_rxd; + adapter->num_txd = num_txd; + + return qlcnic_reset_context(adapter); +} + +static int qlcnic_validate_ring_count(struct qlcnic_adapter *adapter, + u8 rx_ring, u8 tx_ring) +{ + if (rx_ring == 0 || tx_ring == 0) + return -EINVAL; + + if (rx_ring != 0) { + if (rx_ring > adapter->max_sds_rings) { + netdev_err(adapter->netdev, + "Invalid ring count, SDS ring count %d should not be greater than max %d driver sds rings.\n", + rx_ring, adapter->max_sds_rings); + return -EINVAL; + } + } + + if (tx_ring != 0) { + if (tx_ring > adapter->max_tx_rings) { + netdev_err(adapter->netdev, + "Invalid ring count, Tx ring count %d should not be greater than max %d driver Tx rings.\n", + tx_ring, adapter->max_tx_rings); + return -EINVAL; + } + } + + return 0; +} + +static void qlcnic_get_channels(struct net_device *dev, + struct ethtool_channels *channel) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + + channel->max_rx = adapter->max_sds_rings; + channel->max_tx = adapter->max_tx_rings; + channel->rx_count = adapter->drv_sds_rings; + channel->tx_count = adapter->drv_tx_rings; +} + +static int qlcnic_set_channels(struct net_device *dev, + struct ethtool_channels *channel) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + int err; + + if (!(adapter->flags & QLCNIC_MSIX_ENABLED)) { + netdev_err(dev, "No RSS/TSS support in non MSI-X mode\n"); + return -EINVAL; + } + + if 
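+ /*
+  * (Aside on qlcnic_validate_ringparam() above: the requested descriptor
+  * count is clamped to [min, max] and then rounded up to a power of two,
+  * and an informational message is printed whenever the value changes.
+  * For example, a request of 300 rx descriptors with min 64 and max 1024
+  * ends up as 512.)
+  */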
(channel->other_count || channel->combined_count) + return -EINVAL; + + err = qlcnic_validate_ring_count(adapter, channel->rx_count, + channel->tx_count); + if (err) + return err; + + if (adapter->drv_sds_rings != channel->rx_count) { + err = qlcnic_validate_rings(adapter, channel->rx_count, + QLCNIC_RX_QUEUE); + if (err) { + netdev_err(dev, "Unable to configure %u SDS rings\n", + channel->rx_count); + return err; + } + adapter->drv_rss_rings = channel->rx_count; + } + + if (adapter->drv_tx_rings != channel->tx_count) { + err = qlcnic_validate_rings(adapter, channel->tx_count, + QLCNIC_TX_QUEUE); + if (err) { + netdev_err(dev, "Unable to configure %u Tx rings\n", + channel->tx_count); + return err; + } + adapter->drv_tss_rings = channel->tx_count; + } + + adapter->flags |= QLCNIC_TSS_RSS; + + err = qlcnic_setup_rings(adapter); + netdev_info(dev, "Allocated %d SDS rings and %d Tx rings\n", + adapter->drv_sds_rings, adapter->drv_tx_rings); + + return err; +} + +static void +qlcnic_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int port = adapter->ahw->physical_port; + int err = 0; + __u32 val; + + if (qlcnic_83xx_check(adapter)) { + qlcnic_83xx_get_pauseparam(adapter, pause); + return; + } + if (adapter->ahw->port_type == QLCNIC_GBE) { + if ((port < 0) || (port > QLCNIC_NIU_MAX_GBE_PORTS)) + return; + /* get flow control settings */ + val = QLCRD32(adapter, QLCNIC_NIU_GB_MAC_CONFIG_0(port), &err); + if (err == -EIO) + return; + pause->rx_pause = qlcnic_gb_get_rx_flowctl(val); + val = QLCRD32(adapter, QLCNIC_NIU_GB_PAUSE_CTL, &err); + if (err == -EIO) + return; + switch (port) { + case 0: + pause->tx_pause = !(qlcnic_gb_get_gb0_mask(val)); + break; + case 1: + pause->tx_pause = !(qlcnic_gb_get_gb1_mask(val)); + break; + case 2: + pause->tx_pause = !(qlcnic_gb_get_gb2_mask(val)); + break; + case 3: + default: + pause->tx_pause = !(qlcnic_gb_get_gb3_mask(val)); + break; + } + } else if (adapter->ahw->port_type == QLCNIC_XGBE) { + if ((port < 0) || (port > QLCNIC_NIU_MAX_XG_PORTS)) + return; + pause->rx_pause = 1; + val = QLCRD32(adapter, QLCNIC_NIU_XG_PAUSE_CTL, &err); + if (err == -EIO) + return; + if (port == 0) + pause->tx_pause = !(qlcnic_xg_get_xg0_mask(val)); + else + pause->tx_pause = !(qlcnic_xg_get_xg1_mask(val)); + } else { + dev_err(&netdev->dev, "Unknown board type: %x\n", + adapter->ahw->port_type); + } +} + +static int +qlcnic_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int port = adapter->ahw->physical_port; + int err = 0; + __u32 val; + + if (qlcnic_83xx_check(adapter)) + return qlcnic_83xx_set_pauseparam(adapter, pause); + + /* read mode */ + if (adapter->ahw->port_type == QLCNIC_GBE) { + if ((port < 0) || (port > QLCNIC_NIU_MAX_GBE_PORTS)) + return -EIO; + /* set flow control */ + val = QLCRD32(adapter, QLCNIC_NIU_GB_MAC_CONFIG_0(port), &err); + if (err == -EIO) + return err; + + if (pause->rx_pause) + qlcnic_gb_rx_flowctl(val); + else + qlcnic_gb_unset_rx_flowctl(val); + + QLCWR32(adapter, QLCNIC_NIU_GB_MAC_CONFIG_0(port), + val); + QLCWR32(adapter, QLCNIC_NIU_GB_MAC_CONFIG_0(port), val); + /* set autoneg */ + val = QLCRD32(adapter, QLCNIC_NIU_GB_PAUSE_CTL, &err); + if (err == -EIO) + return err; + switch (port) { + case 0: + if (pause->tx_pause) + qlcnic_gb_unset_gb0_mask(val); + else + qlcnic_gb_set_gb0_mask(val); + break; + case 1: + if (pause->tx_pause) + qlcnic_gb_unset_gb1_mask(val); + 
else + qlcnic_gb_set_gb1_mask(val); + break; + case 2: + if (pause->tx_pause) + qlcnic_gb_unset_gb2_mask(val); + else + qlcnic_gb_set_gb2_mask(val); + break; + case 3: + default: + if (pause->tx_pause) + qlcnic_gb_unset_gb3_mask(val); + else + qlcnic_gb_set_gb3_mask(val); + break; + } + QLCWR32(adapter, QLCNIC_NIU_GB_PAUSE_CTL, val); + } else if (adapter->ahw->port_type == QLCNIC_XGBE) { + if (!pause->rx_pause || pause->autoneg) + return -EOPNOTSUPP; + + if ((port < 0) || (port > QLCNIC_NIU_MAX_XG_PORTS)) + return -EIO; + + val = QLCRD32(adapter, QLCNIC_NIU_XG_PAUSE_CTL, &err); + if (err == -EIO) + return err; + if (port == 0) { + if (pause->tx_pause) + qlcnic_xg_unset_xg0_mask(val); + else + qlcnic_xg_set_xg0_mask(val); + } else { + if (pause->tx_pause) + qlcnic_xg_unset_xg1_mask(val); + else + qlcnic_xg_set_xg1_mask(val); + } + QLCWR32(adapter, QLCNIC_NIU_XG_PAUSE_CTL, val); + } else { + dev_err(&netdev->dev, "Unknown board type: %x\n", + adapter->ahw->port_type); + } + return 0; +} + +static int qlcnic_reg_test(struct net_device *dev) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + u32 data_read; + int err = 0; + + if (qlcnic_83xx_check(adapter)) + return qlcnic_83xx_reg_test(adapter); + + data_read = QLCRD32(adapter, QLCNIC_PCIX_PH_REG(0), &err); + if (err == -EIO) + return err; + if ((data_read & 0xffff) != adapter->pdev->vendor) + return 1; + + return 0; +} + +static int qlcnic_eeprom_test(struct net_device *dev) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + + if (qlcnic_82xx_check(adapter)) + return 0; + + return qlcnic_83xx_flash_test(adapter); +} + +static int qlcnic_get_sset_count(struct net_device *dev, int sset) +{ + + struct qlcnic_adapter *adapter = netdev_priv(dev); + switch (sset) { + case ETH_SS_TEST: + return QLCNIC_TEST_LEN; + case ETH_SS_STATS: + return qlcnic_dev_statistics_len(adapter); + default: + return -EOPNOTSUPP; + } +} + +static int qlcnic_irq_test(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_cmd_args cmd; + int ret, drv_sds_rings = adapter->drv_sds_rings; + int drv_tx_rings = adapter->drv_tx_rings; + + if (qlcnic_83xx_check(adapter)) + return qlcnic_83xx_interrupt_test(netdev); + + if (test_and_set_bit(__QLCNIC_RESETTING, &adapter->state)) + return -EIO; + + ret = qlcnic_diag_alloc_res(netdev, QLCNIC_INTERRUPT_TEST); + if (ret) + goto clear_diag_irq; + + ahw->diag_cnt = 0; + ret = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_INTRPT_TEST); + if (ret) + goto free_diag_res; + + cmd.req.arg[1] = ahw->pci_func; + ret = qlcnic_issue_cmd(adapter, &cmd); + if (ret) + goto done; + + usleep_range(1000, 12000); + ret = !ahw->diag_cnt; + +done: + qlcnic_free_mbx_args(&cmd); + +free_diag_res: + qlcnic_diag_free_res(netdev, drv_sds_rings); + +clear_diag_irq: + adapter->drv_sds_rings = drv_sds_rings; + adapter->drv_tx_rings = drv_tx_rings; + clear_bit(__QLCNIC_RESETTING, &adapter->state); + + return ret; +} + +#define QLCNIC_ILB_PKT_SIZE 64 +#define QLCNIC_NUM_ILB_PKT 16 +#define QLCNIC_ILB_MAX_RCV_LOOP 10 +#define QLCNIC_LB_PKT_POLL_DELAY_MSEC 1 +#define QLCNIC_LB_PKT_POLL_COUNT 20 + +static void qlcnic_create_loopback_buff(unsigned char *data, u8 mac[]) +{ + unsigned char random_data[] = {0xa8, 0x06, 0x45, 0x00}; + + memset(data, 0x4e, QLCNIC_ILB_PKT_SIZE); + + memcpy(data, mac, ETH_ALEN); + memcpy(data + ETH_ALEN, mac, ETH_ALEN); + + memcpy(data + 2 * ETH_ALEN, random_data, sizeof(random_data)); +} + +int 
qlcnic_check_loopback_buff(unsigned char *data, u8 mac[]) +{ + unsigned char buff[QLCNIC_ILB_PKT_SIZE]; + qlcnic_create_loopback_buff(buff, mac); + return memcmp(data, buff, QLCNIC_ILB_PKT_SIZE); +} + +int qlcnic_do_lb_test(struct qlcnic_adapter *adapter, u8 mode) +{ + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_host_sds_ring *sds_ring = &recv_ctx->sds_rings[0]; + struct sk_buff *skb; + int i, loop, cnt = 0; + + for (i = 0; i < QLCNIC_NUM_ILB_PKT; i++) { + skb = netdev_alloc_skb(adapter->netdev, QLCNIC_ILB_PKT_SIZE); + qlcnic_create_loopback_buff(skb->data, adapter->mac_addr); + skb_put(skb, QLCNIC_ILB_PKT_SIZE); + adapter->ahw->diag_cnt = 0; + qlcnic_xmit_frame(skb, adapter->netdev); + loop = 0; + + do { + msleep(QLCNIC_LB_PKT_POLL_DELAY_MSEC); + qlcnic_process_rcv_ring_diag(sds_ring); + if (loop++ > QLCNIC_LB_PKT_POLL_COUNT) + break; + } while (!adapter->ahw->diag_cnt); + + dev_kfree_skb_any(skb); + + if (!adapter->ahw->diag_cnt) + dev_warn(&adapter->pdev->dev, + "LB Test: packet #%d was not received\n", + i + 1); + else + cnt++; + } + if (cnt != i) { + dev_err(&adapter->pdev->dev, + "LB Test: failed, TX[%d], RX[%d]\n", i, cnt); + if (mode != QLCNIC_ILB_MODE) + dev_warn(&adapter->pdev->dev, + "WARNING: Please check loopback cable\n"); + return -1; + } + return 0; +} + +static int qlcnic_loopback_test(struct net_device *netdev, u8 mode) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int drv_tx_rings = adapter->drv_tx_rings; + int drv_sds_rings = adapter->drv_sds_rings; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_hardware_context *ahw = adapter->ahw; + int loop = 0; + int ret; + + if (qlcnic_83xx_check(adapter)) + return qlcnic_83xx_loopback_test(netdev, mode); + + if (!(ahw->capabilities & QLCNIC_FW_CAPABILITY_MULTI_LOOPBACK)) { + dev_info(&adapter->pdev->dev, + "Firmware do not support loopback test\n"); + return -EOPNOTSUPP; + } + + dev_warn(&adapter->pdev->dev, "%s loopback test in progress\n", + mode == QLCNIC_ILB_MODE ? 
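+ /*
+  * Loopback test flow, for reference: qlcnic_diag_alloc_res() sets up
+  * diagnostic rings, the requested loopback mode is configured, and
+  * qlcnic_do_lb_test() above transmits QLCNIC_NUM_ILB_PKT (16)
+  * self-addressed 64-byte frames.  Each frame is built by
+  * qlcnic_create_loopback_buff() as
+  *
+  *	dest MAC | src MAC | a8 06 45 00 | 0x4e padding
+  *
+  * with both MACs set to the adapter's own address; the receive path
+  * polls the first SDS ring and validates returned payloads (see
+  * qlcnic_check_loopback_buff()).  A missing frame only triggers a
+  * warning, and the test fails when the received count does not match
+  * the number transmitted.
+  */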
"internal" : "external"); + if (ahw->op_mode == QLCNIC_NON_PRIV_FUNC) { + dev_warn(&adapter->pdev->dev, + "Loopback test not supported in nonprivileged mode\n"); + return 0; + } + + if (test_and_set_bit(__QLCNIC_RESETTING, &adapter->state)) + return -EBUSY; + + ret = qlcnic_diag_alloc_res(netdev, QLCNIC_LOOPBACK_TEST); + if (ret) + goto clear_it; + + sds_ring = &adapter->recv_ctx->sds_rings[0]; + ret = qlcnic_set_lb_mode(adapter, mode); + if (ret) + goto free_res; + + ahw->diag_cnt = 0; + do { + msleep(500); + qlcnic_process_rcv_ring_diag(sds_ring); + if (loop++ > QLCNIC_ILB_MAX_RCV_LOOP) { + netdev_info(netdev, + "Firmware didn't sent link up event to loopback request\n"); + ret = -ETIMEDOUT; + goto free_res; + } else if (adapter->ahw->diag_cnt) { + ret = adapter->ahw->diag_cnt; + goto free_res; + } + } while (!QLCNIC_IS_LB_CONFIGURED(ahw->loopback_state)); + + ret = qlcnic_do_lb_test(adapter, mode); + + qlcnic_clear_lb_mode(adapter, mode); + + free_res: + qlcnic_diag_free_res(netdev, drv_sds_rings); + + clear_it: + adapter->drv_sds_rings = drv_sds_rings; + adapter->drv_tx_rings = drv_tx_rings; + clear_bit(__QLCNIC_RESETTING, &adapter->state); + return ret; +} + +static void +qlcnic_diag_test(struct net_device *dev, struct ethtool_test *eth_test, + u64 *data) +{ + memset(data, 0, sizeof(u64) * QLCNIC_TEST_LEN); + + data[0] = qlcnic_reg_test(dev); + if (data[0]) + eth_test->flags |= ETH_TEST_FL_FAILED; + + data[1] = (u64) qlcnic_test_link(dev); + if (data[1]) + eth_test->flags |= ETH_TEST_FL_FAILED; + + if (eth_test->flags & ETH_TEST_FL_OFFLINE) { + data[2] = qlcnic_irq_test(dev); + if (data[2]) + eth_test->flags |= ETH_TEST_FL_FAILED; + + data[3] = qlcnic_loopback_test(dev, QLCNIC_ILB_MODE); + if (data[3]) + eth_test->flags |= ETH_TEST_FL_FAILED; + + if (eth_test->flags & ETH_TEST_FL_EXTERNAL_LB) { + data[4] = qlcnic_loopback_test(dev, QLCNIC_ELB_MODE); + if (data[4]) + eth_test->flags |= ETH_TEST_FL_FAILED; + eth_test->flags |= ETH_TEST_FL_EXTERNAL_LB_DONE; + } + + data[5] = qlcnic_eeprom_test(dev); + if (data[5]) + eth_test->flags |= ETH_TEST_FL_FAILED; + } +} + +static void +qlcnic_get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + int index, i, num_stats; + + switch (stringset) { + case ETH_SS_TEST: + memcpy(data, *qlcnic_gstrings_test, + QLCNIC_TEST_LEN * ETH_GSTRING_LEN); + break; + case ETH_SS_STATS: + num_stats = ARRAY_SIZE(qlcnic_tx_queue_stats_strings); + for (i = 0; i < adapter->drv_tx_rings; i++) { + for (index = 0; index < num_stats; index++) { + sprintf(data, "tx_queue_%d %s", i, + qlcnic_tx_queue_stats_strings[index]); + data += ETH_GSTRING_LEN; + } + } + + for (index = 0; index < QLCNIC_STATS_LEN; index++) { + memcpy(data + index * ETH_GSTRING_LEN, + qlcnic_gstrings_stats[index].stat_string, + ETH_GSTRING_LEN); + } + + if (qlcnic_83xx_check(adapter)) { + num_stats = ARRAY_SIZE(qlcnic_83xx_tx_stats_strings); + for (i = 0; i < num_stats; i++, index++) + memcpy(data + index * ETH_GSTRING_LEN, + qlcnic_83xx_tx_stats_strings[i], + ETH_GSTRING_LEN); + num_stats = ARRAY_SIZE(qlcnic_83xx_mac_stats_strings); + for (i = 0; i < num_stats; i++, index++) + memcpy(data + index * ETH_GSTRING_LEN, + qlcnic_83xx_mac_stats_strings[i], + ETH_GSTRING_LEN); + num_stats = ARRAY_SIZE(qlcnic_83xx_rx_stats_strings); + for (i = 0; i < num_stats; i++, index++) + memcpy(data + index * ETH_GSTRING_LEN, + qlcnic_83xx_rx_stats_strings[i], + ETH_GSTRING_LEN); + return; + } else { + num_stats = ARRAY_SIZE(qlcnic_83xx_mac_stats_strings); + 
for (i = 0; i < num_stats; i++, index++) + memcpy(data + index * ETH_GSTRING_LEN, + qlcnic_83xx_mac_stats_strings[i], + ETH_GSTRING_LEN); + } + if (!(adapter->flags & QLCNIC_ESWITCH_ENABLED)) + return; + num_stats = ARRAY_SIZE(qlcnic_device_gstrings_stats); + for (i = 0; i < num_stats; index++, i++) { + memcpy(data + index * ETH_GSTRING_LEN, + qlcnic_device_gstrings_stats[i], + ETH_GSTRING_LEN); + } + } +} + +static u64 *qlcnic_fill_stats(u64 *data, void *stats, int type) +{ + if (type == QLCNIC_MAC_STATS) { + struct qlcnic_mac_statistics *mac_stats = + (struct qlcnic_mac_statistics *)stats; + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_frames); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_bytes); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_mcast_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_bcast_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_pause_cnt); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_ctrl_pkt); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_lt_64b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_lt_127b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_lt_255b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_lt_511b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_lt_1023b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_lt_1518b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_tx_gt_1518b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_frames); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_bytes); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_mcast_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_bcast_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_pause_cnt); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_ctrl_pkt); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_lt_64b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_lt_127b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_lt_255b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_lt_511b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_lt_1023b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_lt_1518b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_gt_1518b_pkts); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_length_error); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_length_small); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_length_large); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_jabber); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_dropped); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_rx_crc_error); + *data++ = QLCNIC_FILL_STATS(mac_stats->mac_align_error); + } else if (type == QLCNIC_ESW_STATS) { + struct __qlcnic_esw_statistics *esw_stats = + (struct __qlcnic_esw_statistics *)stats; + *data++ = QLCNIC_FILL_STATS(esw_stats->unicast_frames); + *data++ = QLCNIC_FILL_STATS(esw_stats->multicast_frames); + *data++ = QLCNIC_FILL_STATS(esw_stats->broadcast_frames); + *data++ = QLCNIC_FILL_STATS(esw_stats->dropped_frames); + *data++ = QLCNIC_FILL_STATS(esw_stats->errors); + *data++ = QLCNIC_FILL_STATS(esw_stats->local_frames); + *data++ = QLCNIC_FILL_STATS(esw_stats->numbytes); + } + return data; +} + +void qlcnic_update_stats(struct qlcnic_adapter *adapter) +{ + struct qlcnic_tx_queue_stats tx_stats; + struct qlcnic_host_tx_ring *tx_ring; + int ring; + + memset(&tx_stats, 0, sizeof(tx_stats)); + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + tx_stats.xmit_on += tx_ring->tx_stats.xmit_on; + tx_stats.xmit_off += tx_ring->tx_stats.xmit_off; 
+ tx_stats.xmit_called += tx_ring->tx_stats.xmit_called; + tx_stats.xmit_finished += tx_ring->tx_stats.xmit_finished; + tx_stats.tx_bytes += tx_ring->tx_stats.tx_bytes; + } + + adapter->stats.xmit_on = tx_stats.xmit_on; + adapter->stats.xmit_off = tx_stats.xmit_off; + adapter->stats.xmitcalled = tx_stats.xmit_called; + adapter->stats.xmitfinished = tx_stats.xmit_finished; + adapter->stats.txbytes = tx_stats.tx_bytes; +} + +static u64 *qlcnic_fill_tx_queue_stats(u64 *data, void *stats) +{ + struct qlcnic_host_tx_ring *tx_ring; + + tx_ring = (struct qlcnic_host_tx_ring *)stats; + + *data++ = QLCNIC_FILL_STATS(tx_ring->tx_stats.xmit_on); + *data++ = QLCNIC_FILL_STATS(tx_ring->tx_stats.xmit_off); + *data++ = QLCNIC_FILL_STATS(tx_ring->tx_stats.xmit_called); + *data++ = QLCNIC_FILL_STATS(tx_ring->tx_stats.xmit_finished); + *data++ = QLCNIC_FILL_STATS(tx_ring->tx_stats.tx_bytes); + + return data; +} + +static void qlcnic_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + struct qlcnic_host_tx_ring *tx_ring; + struct qlcnic_esw_statistics port_stats; + struct qlcnic_mac_statistics mac_stats; + int index, ret, length, size, ring; + char *p; + + memset(data, 0, stats->n_stats * sizeof(u64)); + + for (ring = 0, index = 0; ring < adapter->drv_tx_rings; ring++) { + if (adapter->is_up == QLCNIC_ADAPTER_UP_MAGIC) { + tx_ring = &adapter->tx_ring[ring]; + data = qlcnic_fill_tx_queue_stats(data, tx_ring); + qlcnic_update_stats(adapter); + } else { + data += QLCNIC_TX_STATS_LEN; + } + } + + length = QLCNIC_STATS_LEN; + for (index = 0; index < length; index++) { + p = (char *)adapter + qlcnic_gstrings_stats[index].stat_offset; + size = qlcnic_gstrings_stats[index].sizeof_stat; + *data++ = (size == sizeof(u64)) ? 
(*(u64 *)p) : ((*(u32 *)p)); + } + + if (qlcnic_83xx_check(adapter)) { + if (adapter->ahw->linkup) + qlcnic_83xx_get_stats(adapter, data); + return; + } else { + /* Retrieve MAC statistics from firmware */ + memset(&mac_stats, 0, sizeof(struct qlcnic_mac_statistics)); + qlcnic_get_mac_stats(adapter, &mac_stats); + data = qlcnic_fill_stats(data, &mac_stats, QLCNIC_MAC_STATS); + } + + if (!(adapter->flags & QLCNIC_ESWITCH_ENABLED)) + return; + + memset(&port_stats, 0, sizeof(struct qlcnic_esw_statistics)); + ret = qlcnic_get_port_stats(adapter, adapter->ahw->pci_func, + QLCNIC_QUERY_RX_COUNTER, &port_stats.rx); + if (ret) + return; + + data = qlcnic_fill_stats(data, &port_stats.rx, QLCNIC_ESW_STATS); + ret = qlcnic_get_port_stats(adapter, adapter->ahw->pci_func, + QLCNIC_QUERY_TX_COUNTER, &port_stats.tx); + if (ret) + return; + + qlcnic_fill_stats(data, &port_stats.tx, QLCNIC_ESW_STATS); +} + +static int qlcnic_set_led(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + int drv_sds_rings = adapter->drv_sds_rings; + int err = -EIO, active = 1; + + if (qlcnic_83xx_check(adapter)) + return qlcnic_83xx_set_led(dev, state); + + if (adapter->ahw->op_mode == QLCNIC_NON_PRIV_FUNC) { + netdev_warn(dev, "LED test not supported for non " + "privilege function\n"); + return -EOPNOTSUPP; + } + + switch (state) { + case ETHTOOL_ID_ACTIVE: + if (test_and_set_bit(__QLCNIC_LED_ENABLE, &adapter->state)) + return -EBUSY; + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) + break; + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) { + if (qlcnic_diag_alloc_res(dev, QLCNIC_LED_TEST)) + break; + set_bit(__QLCNIC_DIAG_RES_ALLOC, &adapter->state); + } + + if (adapter->nic_ops->config_led(adapter, 1, 0xf) == 0) { + err = 0; + break; + } + + dev_err(&adapter->pdev->dev, + "Failed to set LED blink state.\n"); + break; + + case ETHTOOL_ID_INACTIVE: + active = 0; + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) + break; + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) { + if (qlcnic_diag_alloc_res(dev, QLCNIC_LED_TEST)) + break; + set_bit(__QLCNIC_DIAG_RES_ALLOC, &adapter->state); + } + + if (adapter->nic_ops->config_led(adapter, 0, 0xf)) + dev_err(&adapter->pdev->dev, + "Failed to reset LED blink state.\n"); + + break; + + default: + return -EINVAL; + } + + if (test_and_clear_bit(__QLCNIC_DIAG_RES_ALLOC, &adapter->state)) + qlcnic_diag_free_res(dev, drv_sds_rings); + + if (!active || err) + clear_bit(__QLCNIC_LED_ENABLE, &adapter->state); + + return err; +} + +static void +qlcnic_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + u32 wol_cfg; + int err = 0; + + if (qlcnic_83xx_check(adapter)) + return; + wol->supported = 0; + wol->wolopts = 0; + + wol_cfg = QLCRD32(adapter, QLCNIC_WOL_CONFIG_NV, &err); + if (err == -EIO) + return; + if (wol_cfg & (1UL << adapter->portnum)) + wol->supported |= WAKE_MAGIC; + + wol_cfg = QLCRD32(adapter, QLCNIC_WOL_CONFIG, &err); + if (wol_cfg & (1UL << adapter->portnum)) + wol->wolopts |= WAKE_MAGIC; +} + +static int +qlcnic_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + struct qlcnic_adapter *adapter = netdev_priv(dev); + u32 wol_cfg; + int err = 0; + + if (qlcnic_83xx_check(adapter)) + return -EOPNOTSUPP; + if (wol->wolopts & ~WAKE_MAGIC) + return -EINVAL; + + wol_cfg = QLCRD32(adapter, QLCNIC_WOL_CONFIG_NV, &err); + if (err == -EIO) + return err; + if (!(wol_cfg & (1 << adapter->portnum))) + return -EOPNOTSUPP; + + 
wol_cfg = QLCRD32(adapter, QLCNIC_WOL_CONFIG, &err); + if (err == -EIO) + return err; + if (wol->wolopts & WAKE_MAGIC) + wol_cfg |= 1UL << adapter->portnum; + else + wol_cfg &= ~(1UL << adapter->portnum); + + QLCWR32(adapter, QLCNIC_WOL_CONFIG, wol_cfg); + + return 0; +} + +/* + * Set the coalescing parameters. Currently only normal is supported. + * If rx_coalesce_usecs == 0 or rx_max_coalesced_frames == 0 then set the + * firmware coalescing to default. + */ +static int qlcnic_set_intr_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ethcoal) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int err; + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) + return -EINVAL; + + /* + * Return Error if unsupported values or + * unsupported parameters are set. + */ + if (ethcoal->rx_coalesce_usecs > 0xffff || + ethcoal->rx_max_coalesced_frames > 0xffff || + ethcoal->tx_coalesce_usecs > 0xffff || + ethcoal->tx_max_coalesced_frames > 0xffff || + ethcoal->rx_coalesce_usecs_irq || + ethcoal->rx_max_coalesced_frames_irq || + ethcoal->tx_coalesce_usecs_irq || + ethcoal->tx_max_coalesced_frames_irq || + ethcoal->stats_block_coalesce_usecs || + ethcoal->use_adaptive_rx_coalesce || + ethcoal->use_adaptive_tx_coalesce || + ethcoal->pkt_rate_low || + ethcoal->rx_coalesce_usecs_low || + ethcoal->rx_max_coalesced_frames_low || + ethcoal->tx_coalesce_usecs_low || + ethcoal->tx_max_coalesced_frames_low || + ethcoal->pkt_rate_high || + ethcoal->rx_coalesce_usecs_high || + ethcoal->rx_max_coalesced_frames_high || + ethcoal->tx_coalesce_usecs_high || + ethcoal->tx_max_coalesced_frames_high) + return -EINVAL; + + err = qlcnic_config_intr_coalesce(adapter, ethcoal); + + return err; +} + +static int qlcnic_get_intr_coalesce(struct net_device *netdev, + struct ethtool_coalesce *ethcoal) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + + if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) + return -EINVAL; + + ethcoal->rx_coalesce_usecs = adapter->ahw->coal.rx_time_us; + ethcoal->rx_max_coalesced_frames = adapter->ahw->coal.rx_packets; + ethcoal->tx_coalesce_usecs = adapter->ahw->coal.tx_time_us; + ethcoal->tx_max_coalesced_frames = adapter->ahw->coal.tx_packets; + + return 0; +} + +static u32 qlcnic_get_msglevel(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + + return adapter->ahw->msg_enable; +} + +static void qlcnic_set_msglevel(struct net_device *netdev, u32 msglvl) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + + adapter->ahw->msg_enable = msglvl; +} + +int qlcnic_enable_fw_dump_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + u32 val; + + if (qlcnic_84xx_check(adapter)) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + val &= ~QLC_83XX_IDC_DISABLE_FW_DUMP; + QLCWRX(adapter->ahw, QLC_83XX_IDC_CTRL, val); + + qlcnic_83xx_unlock_driver(adapter); + } else { + fw_dump->enable = true; + } + + dev_info(&adapter->pdev->dev, "FW dump enabled\n"); + + return 0; +} + +static int qlcnic_disable_fw_dump_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + u32 val; + + if (qlcnic_84xx_check(adapter)) { + if (qlcnic_83xx_lock_driver(adapter)) + return -EBUSY; + + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + val |= QLC_83XX_IDC_DISABLE_FW_DUMP; + QLCWRX(adapter->ahw, QLC_83XX_IDC_CTRL, val); + + qlcnic_83xx_unlock_driver(adapter); + } else { + fw_dump->enable = false; + } + + 
dev_info(&adapter->pdev->dev, "FW dump disabled\n"); + + return 0; +} + +bool qlcnic_check_fw_dump_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + bool state; + u32 val; + + if (qlcnic_84xx_check(adapter)) { + val = QLCRDX(adapter->ahw, QLC_83XX_IDC_CTRL); + state = (val & QLC_83XX_IDC_DISABLE_FW_DUMP) ? false : true; + } else { + state = fw_dump->enable; + } + + return state; +} + +static int +qlcnic_get_dump_flag(struct net_device *netdev, struct ethtool_dump *dump) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + + if (!fw_dump->tmpl_hdr) { + netdev_err(adapter->netdev, "FW Dump not supported\n"); + return -ENOTSUPP; + } + + if (fw_dump->clr) + dump->len = fw_dump->tmpl_hdr_size + fw_dump->size; + else + dump->len = 0; + + if (!qlcnic_check_fw_dump_state(adapter)) + dump->flag = ETH_FW_DUMP_DISABLE; + else + dump->flag = fw_dump->cap_mask; + + dump->version = adapter->fw_version; + return 0; +} + +static int +qlcnic_get_dump_data(struct net_device *netdev, struct ethtool_dump *dump, + void *buffer) +{ + int i, copy_sz; + u32 *hdr_ptr; + __le32 *data; + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + + if (!fw_dump->tmpl_hdr) { + netdev_err(netdev, "FW Dump not supported\n"); + return -ENOTSUPP; + } + + if (!fw_dump->clr) { + netdev_info(netdev, "Dump not available\n"); + return -EINVAL; + } + + /* Copy template header first */ + copy_sz = fw_dump->tmpl_hdr_size; + hdr_ptr = (u32 *)fw_dump->tmpl_hdr; + data = buffer; + for (i = 0; i < copy_sz/sizeof(u32); i++) + *data++ = cpu_to_le32(*hdr_ptr++); + + /* Copy captured dump data */ + memcpy(buffer + copy_sz, fw_dump->data, fw_dump->size); + dump->len = copy_sz + fw_dump->size; + dump->flag = fw_dump->cap_mask; + + /* Free dump area once data has been captured */ + vfree(fw_dump->data); + fw_dump->data = NULL; + fw_dump->clr = 0; + netdev_info(netdev, "extracted the FW dump Successfully\n"); + return 0; +} + +static int qlcnic_set_dump_mask(struct qlcnic_adapter *adapter, u32 mask) +{ + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + struct net_device *netdev = adapter->netdev; + + if (!qlcnic_check_fw_dump_state(adapter)) { + netdev_info(netdev, + "Can not change driver mask to 0x%x. 
FW dump not enabled\n", + mask); + return -EOPNOTSUPP; + } + + fw_dump->cap_mask = mask; + + /* Store new capture mask in template header as well*/ + qlcnic_store_cap_mask(adapter, fw_dump->tmpl_hdr, mask); + + netdev_info(netdev, "Driver mask changed to: 0x%x\n", mask); + return 0; +} + +static int +qlcnic_set_dump(struct net_device *netdev, struct ethtool_dump *val) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + bool valid_mask = false; + int i, ret = 0; + + switch (val->flag) { + case QLCNIC_FORCE_FW_DUMP_KEY: + if (!fw_dump->tmpl_hdr) { + netdev_err(netdev, "FW dump not supported\n"); + ret = -EOPNOTSUPP; + break; + } + + if (!qlcnic_check_fw_dump_state(adapter)) { + netdev_info(netdev, "FW dump not enabled\n"); + ret = -EOPNOTSUPP; + break; + } + + if (fw_dump->clr) { + netdev_info(netdev, + "Previous dump not cleared, not forcing dump\n"); + break; + } + + netdev_info(netdev, "Forcing a FW dump\n"); + qlcnic_dev_request_reset(adapter, val->flag); + break; + case QLCNIC_DISABLE_FW_DUMP: + if (!fw_dump->tmpl_hdr) { + netdev_err(netdev, "FW dump not supported\n"); + ret = -EOPNOTSUPP; + break; + } + + ret = qlcnic_disable_fw_dump_state(adapter); + break; + + case QLCNIC_ENABLE_FW_DUMP: + if (!fw_dump->tmpl_hdr) { + netdev_err(netdev, "FW dump not supported\n"); + ret = -EOPNOTSUPP; + break; + } + + ret = qlcnic_enable_fw_dump_state(adapter); + break; + + case QLCNIC_FORCE_FW_RESET: + netdev_info(netdev, "Forcing a FW reset\n"); + qlcnic_dev_request_reset(adapter, val->flag); + adapter->flags &= ~QLCNIC_FW_RESET_OWNER; + break; + + case QLCNIC_SET_QUIESCENT: + case QLCNIC_RESET_QUIESCENT: + if (test_bit(__QLCNIC_MAINTENANCE_MODE, &adapter->state)) + netdev_info(netdev, "Device is in non-operational state\n"); + break; + + default: + if (!fw_dump->tmpl_hdr) { + netdev_err(netdev, "FW dump not supported\n"); + ret = -EOPNOTSUPP; + break; + } + + for (i = 0; i < ARRAY_SIZE(qlcnic_fw_dump_level); i++) { + if (val->flag == qlcnic_fw_dump_level[i]) { + valid_mask = true; + break; + } + } + + if (valid_mask) { + ret = qlcnic_set_dump_mask(adapter, val->flag); + } else { + netdev_info(netdev, "Invalid dump level: 0x%x\n", + val->flag); + ret = -EINVAL; + } + } + return ret; +} + +const struct ethtool_ops qlcnic_ethtool_ops = { + .get_drvinfo = qlcnic_get_drvinfo, + .get_regs_len = qlcnic_get_regs_len, + .get_regs = qlcnic_get_regs, + .get_link = ethtool_op_get_link, + .get_eeprom_len = qlcnic_get_eeprom_len, + .get_eeprom = qlcnic_get_eeprom, + .get_ringparam = qlcnic_get_ringparam, + .set_ringparam = qlcnic_set_ringparam, + .get_channels = qlcnic_get_channels, + .set_channels = qlcnic_set_channels, + .get_pauseparam = qlcnic_get_pauseparam, + .set_pauseparam = qlcnic_set_pauseparam, + .get_wol = qlcnic_get_wol, + .set_wol = qlcnic_set_wol, + .self_test = qlcnic_diag_test, + .get_strings = qlcnic_get_strings, + .get_ethtool_stats = qlcnic_get_ethtool_stats, + .get_sset_count = qlcnic_get_sset_count, + .get_coalesce = qlcnic_get_intr_coalesce, + .set_coalesce = qlcnic_set_intr_coalesce, + .set_phys_id = qlcnic_set_led, + .set_msglevel = qlcnic_set_msglevel, + .get_msglevel = qlcnic_get_msglevel, + .get_dump_flag = qlcnic_get_dump_flag, + .get_dump_data = qlcnic_get_dump_data, + .set_dump = qlcnic_set_dump, + .get_link_ksettings = qlcnic_get_link_ksettings, + .set_link_ksettings = qlcnic_set_link_ksettings, +}; + +const struct ethtool_ops qlcnic_sriov_vf_ethtool_ops = { + .get_drvinfo = qlcnic_get_drvinfo, + .get_regs_len = 
qlcnic_get_regs_len, + .get_regs = qlcnic_get_regs, + .get_link = ethtool_op_get_link, + .get_eeprom_len = qlcnic_get_eeprom_len, + .get_eeprom = qlcnic_get_eeprom, + .get_ringparam = qlcnic_get_ringparam, + .set_ringparam = qlcnic_set_ringparam, + .get_channels = qlcnic_get_channels, + .get_pauseparam = qlcnic_get_pauseparam, + .get_wol = qlcnic_get_wol, + .get_strings = qlcnic_get_strings, + .get_ethtool_stats = qlcnic_get_ethtool_stats, + .get_sset_count = qlcnic_get_sset_count, + .get_coalesce = qlcnic_get_intr_coalesce, + .set_coalesce = qlcnic_set_intr_coalesce, + .set_msglevel = qlcnic_set_msglevel, + .get_msglevel = qlcnic_get_msglevel, + .get_link_ksettings = qlcnic_get_link_ksettings, +}; + +const struct ethtool_ops qlcnic_ethtool_failed_ops = { + .get_drvinfo = qlcnic_get_drvinfo, + .set_msglevel = qlcnic_set_msglevel, + .get_msglevel = qlcnic_get_msglevel, + .set_dump = qlcnic_set_dump, + .get_link_ksettings = qlcnic_get_link_ksettings, +}; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hdr.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hdr.h new file mode 100644 index 0000000..34e467b --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hdr.h @@ -0,0 +1,948 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#ifndef __QLCNIC_HDR_H_ +#define __QLCNIC_HDR_H_ + +#include +#include + +#include "qlcnic_hw.h" + +/* + * The basic unit of access when reading/writing control registers. + */ + +enum { + QLCNIC_HW_H0_CH_HUB_ADR = 0x05, + QLCNIC_HW_H1_CH_HUB_ADR = 0x0E, + QLCNIC_HW_H2_CH_HUB_ADR = 0x03, + QLCNIC_HW_H3_CH_HUB_ADR = 0x01, + QLCNIC_HW_H4_CH_HUB_ADR = 0x06, + QLCNIC_HW_H5_CH_HUB_ADR = 0x07, + QLCNIC_HW_H6_CH_HUB_ADR = 0x08 +}; + +/* Hub 0 */ +enum { + QLCNIC_HW_MN_CRB_AGT_ADR = 0x15, + QLCNIC_HW_MS_CRB_AGT_ADR = 0x25 +}; + +/* Hub 1 */ +enum { + QLCNIC_HW_PS_CRB_AGT_ADR = 0x73, + QLCNIC_HW_SS_CRB_AGT_ADR = 0x20, + QLCNIC_HW_RPMX3_CRB_AGT_ADR = 0x0b, + QLCNIC_HW_QMS_CRB_AGT_ADR = 0x00, + QLCNIC_HW_SQGS0_CRB_AGT_ADR = 0x01, + QLCNIC_HW_SQGS1_CRB_AGT_ADR = 0x02, + QLCNIC_HW_SQGS2_CRB_AGT_ADR = 0x03, + QLCNIC_HW_SQGS3_CRB_AGT_ADR = 0x04, + QLCNIC_HW_C2C0_CRB_AGT_ADR = 0x58, + QLCNIC_HW_C2C1_CRB_AGT_ADR = 0x59, + QLCNIC_HW_C2C2_CRB_AGT_ADR = 0x5a, + QLCNIC_HW_RPMX2_CRB_AGT_ADR = 0x0a, + QLCNIC_HW_RPMX4_CRB_AGT_ADR = 0x0c, + QLCNIC_HW_RPMX7_CRB_AGT_ADR = 0x0f, + QLCNIC_HW_RPMX9_CRB_AGT_ADR = 0x12, + QLCNIC_HW_SMB_CRB_AGT_ADR = 0x18 +}; + +/* Hub 2 */ +enum { + QLCNIC_HW_NIU_CRB_AGT_ADR = 0x31, + QLCNIC_HW_I2C0_CRB_AGT_ADR = 0x19, + QLCNIC_HW_I2C1_CRB_AGT_ADR = 0x29, + + QLCNIC_HW_SN_CRB_AGT_ADR = 0x10, + QLCNIC_HW_I2Q_CRB_AGT_ADR = 0x20, + QLCNIC_HW_LPC_CRB_AGT_ADR = 0x22, + QLCNIC_HW_ROMUSB_CRB_AGT_ADR = 0x21, + QLCNIC_HW_QM_CRB_AGT_ADR = 0x66, + QLCNIC_HW_SQG0_CRB_AGT_ADR = 0x60, + QLCNIC_HW_SQG1_CRB_AGT_ADR = 0x61, + QLCNIC_HW_SQG2_CRB_AGT_ADR = 0x62, + QLCNIC_HW_SQG3_CRB_AGT_ADR = 0x63, + QLCNIC_HW_RPMX1_CRB_AGT_ADR = 0x09, + QLCNIC_HW_RPMX5_CRB_AGT_ADR = 0x0d, + QLCNIC_HW_RPMX6_CRB_AGT_ADR = 0x0e, + QLCNIC_HW_RPMX8_CRB_AGT_ADR = 0x11 +}; + +/* Hub 3 */ +enum { + QLCNIC_HW_PH_CRB_AGT_ADR = 0x1A, + QLCNIC_HW_SRE_CRB_AGT_ADR = 0x50, + QLCNIC_HW_EG_CRB_AGT_ADR = 0x51, + QLCNIC_HW_RPMX0_CRB_AGT_ADR = 0x08 +}; + +/* Hub 4 */ +enum { + QLCNIC_HW_PEGN0_CRB_AGT_ADR = 0x40, + QLCNIC_HW_PEGN1_CRB_AGT_ADR, + QLCNIC_HW_PEGN2_CRB_AGT_ADR, + QLCNIC_HW_PEGN3_CRB_AGT_ADR, + QLCNIC_HW_PEGNI_CRB_AGT_ADR, + QLCNIC_HW_PEGND_CRB_AGT_ADR, + QLCNIC_HW_PEGNC_CRB_AGT_ADR, + 
QLCNIC_HW_PEGR0_CRB_AGT_ADR, + QLCNIC_HW_PEGR1_CRB_AGT_ADR, + QLCNIC_HW_PEGR2_CRB_AGT_ADR, + QLCNIC_HW_PEGR3_CRB_AGT_ADR, + QLCNIC_HW_PEGN4_CRB_AGT_ADR +}; + +/* Hub 5 */ +enum { + QLCNIC_HW_PEGS0_CRB_AGT_ADR = 0x40, + QLCNIC_HW_PEGS1_CRB_AGT_ADR, + QLCNIC_HW_PEGS2_CRB_AGT_ADR, + QLCNIC_HW_PEGS3_CRB_AGT_ADR, + QLCNIC_HW_PEGSI_CRB_AGT_ADR, + QLCNIC_HW_PEGSD_CRB_AGT_ADR, + QLCNIC_HW_PEGSC_CRB_AGT_ADR +}; + +/* Hub 6 */ +enum { + QLCNIC_HW_CAS0_CRB_AGT_ADR = 0x46, + QLCNIC_HW_CAS1_CRB_AGT_ADR = 0x47, + QLCNIC_HW_CAS2_CRB_AGT_ADR = 0x48, + QLCNIC_HW_CAS3_CRB_AGT_ADR = 0x49, + QLCNIC_HW_NCM_CRB_AGT_ADR = 0x16, + QLCNIC_HW_TMR_CRB_AGT_ADR = 0x17, + QLCNIC_HW_XDMA_CRB_AGT_ADR = 0x05, + QLCNIC_HW_OCM0_CRB_AGT_ADR = 0x06, + QLCNIC_HW_OCM1_CRB_AGT_ADR = 0x07 +}; + +/* Floaters - non existent modules */ +#define QLCNIC_HW_EFC_RPMX0_CRB_AGT_ADR 0x67 + +/* This field defines PCI/X adr [25:20] of agents on the CRB */ +enum { + QLCNIC_HW_PX_MAP_CRB_PH = 0, + QLCNIC_HW_PX_MAP_CRB_PS, + QLCNIC_HW_PX_MAP_CRB_MN, + QLCNIC_HW_PX_MAP_CRB_MS, + QLCNIC_HW_PX_MAP_CRB_PGR1, + QLCNIC_HW_PX_MAP_CRB_SRE, + QLCNIC_HW_PX_MAP_CRB_NIU, + QLCNIC_HW_PX_MAP_CRB_QMN, + QLCNIC_HW_PX_MAP_CRB_SQN0, + QLCNIC_HW_PX_MAP_CRB_SQN1, + QLCNIC_HW_PX_MAP_CRB_SQN2, + QLCNIC_HW_PX_MAP_CRB_SQN3, + QLCNIC_HW_PX_MAP_CRB_QMS, + QLCNIC_HW_PX_MAP_CRB_SQS0, + QLCNIC_HW_PX_MAP_CRB_SQS1, + QLCNIC_HW_PX_MAP_CRB_SQS2, + QLCNIC_HW_PX_MAP_CRB_SQS3, + QLCNIC_HW_PX_MAP_CRB_PGN0, + QLCNIC_HW_PX_MAP_CRB_PGN1, + QLCNIC_HW_PX_MAP_CRB_PGN2, + QLCNIC_HW_PX_MAP_CRB_PGN3, + QLCNIC_HW_PX_MAP_CRB_PGND, + QLCNIC_HW_PX_MAP_CRB_PGNI, + QLCNIC_HW_PX_MAP_CRB_PGS0, + QLCNIC_HW_PX_MAP_CRB_PGS1, + QLCNIC_HW_PX_MAP_CRB_PGS2, + QLCNIC_HW_PX_MAP_CRB_PGS3, + QLCNIC_HW_PX_MAP_CRB_PGSD, + QLCNIC_HW_PX_MAP_CRB_PGSI, + QLCNIC_HW_PX_MAP_CRB_SN, + QLCNIC_HW_PX_MAP_CRB_PGR2, + QLCNIC_HW_PX_MAP_CRB_EG, + QLCNIC_HW_PX_MAP_CRB_PH2, + QLCNIC_HW_PX_MAP_CRB_PS2, + QLCNIC_HW_PX_MAP_CRB_CAM, + QLCNIC_HW_PX_MAP_CRB_CAS0, + QLCNIC_HW_PX_MAP_CRB_CAS1, + QLCNIC_HW_PX_MAP_CRB_CAS2, + QLCNIC_HW_PX_MAP_CRB_C2C0, + QLCNIC_HW_PX_MAP_CRB_C2C1, + QLCNIC_HW_PX_MAP_CRB_TIMR, + QLCNIC_HW_PX_MAP_CRB_PGR3, + QLCNIC_HW_PX_MAP_CRB_RPMX1, + QLCNIC_HW_PX_MAP_CRB_RPMX2, + QLCNIC_HW_PX_MAP_CRB_RPMX3, + QLCNIC_HW_PX_MAP_CRB_RPMX4, + QLCNIC_HW_PX_MAP_CRB_RPMX5, + QLCNIC_HW_PX_MAP_CRB_RPMX6, + QLCNIC_HW_PX_MAP_CRB_RPMX7, + QLCNIC_HW_PX_MAP_CRB_XDMA, + QLCNIC_HW_PX_MAP_CRB_I2Q, + QLCNIC_HW_PX_MAP_CRB_ROMUSB, + QLCNIC_HW_PX_MAP_CRB_CAS3, + QLCNIC_HW_PX_MAP_CRB_RPMX0, + QLCNIC_HW_PX_MAP_CRB_RPMX8, + QLCNIC_HW_PX_MAP_CRB_RPMX9, + QLCNIC_HW_PX_MAP_CRB_OCM0, + QLCNIC_HW_PX_MAP_CRB_OCM1, + QLCNIC_HW_PX_MAP_CRB_SMB, + QLCNIC_HW_PX_MAP_CRB_I2C0, + QLCNIC_HW_PX_MAP_CRB_I2C1, + QLCNIC_HW_PX_MAP_CRB_LPC, + QLCNIC_HW_PX_MAP_CRB_PGNC, + QLCNIC_HW_PX_MAP_CRB_PGR0 +}; + +#define BIT_0 0x1 +#define BIT_1 0x2 +#define BIT_2 0x4 +#define BIT_3 0x8 +#define BIT_4 0x10 +#define BIT_5 0x20 +#define BIT_6 0x40 +#define BIT_7 0x80 +#define BIT_8 0x100 +#define BIT_9 0x200 +#define BIT_10 0x400 +#define BIT_11 0x800 +#define BIT_12 0x1000 +#define BIT_13 0x2000 +#define BIT_14 0x4000 +#define BIT_15 0x8000 +#define BIT_16 0x10000 +#define BIT_17 0x20000 +#define BIT_18 0x40000 +#define BIT_19 0x80000 +#define BIT_20 0x100000 +#define BIT_21 0x200000 +#define BIT_22 0x400000 +#define BIT_23 0x800000 +#define BIT_24 0x1000000 +#define BIT_25 0x2000000 +#define BIT_26 0x4000000 +#define BIT_27 0x8000000 +#define BIT_28 0x10000000 +#define BIT_29 0x20000000 +#define BIT_30 0x40000000 +#define BIT_31 0x80000000 + +/* This field defines CRB adr 
[31:20] of the agents */ + +#define QLCNIC_HW_CRB_HUB_AGT_ADR_MN \ + ((QLCNIC_HW_H0_CH_HUB_ADR << 7) | QLCNIC_HW_MN_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PH \ + ((QLCNIC_HW_H0_CH_HUB_ADR << 7) | QLCNIC_HW_PH_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_MS \ + ((QLCNIC_HW_H0_CH_HUB_ADR << 7) | QLCNIC_HW_MS_CRB_AGT_ADR) + +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PS \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_PS_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SS \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_SS_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX3 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_RPMX3_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_QMS \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_QMS_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SQS0 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_SQGS0_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SQS1 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_SQGS1_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SQS2 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_SQGS2_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SQS3 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_SQGS3_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_C2C0 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_C2C0_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_C2C1 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_C2C1_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX2 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_RPMX2_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX4 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_RPMX4_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX7 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_RPMX7_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX9 \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_RPMX9_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SMB \ + ((QLCNIC_HW_H1_CH_HUB_ADR << 7) | QLCNIC_HW_SMB_CRB_AGT_ADR) + +#define QLCNIC_HW_CRB_HUB_AGT_ADR_NIU \ + ((QLCNIC_HW_H2_CH_HUB_ADR << 7) | QLCNIC_HW_NIU_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_I2C0 \ + ((QLCNIC_HW_H2_CH_HUB_ADR << 7) | QLCNIC_HW_I2C0_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_I2C1 \ + ((QLCNIC_HW_H2_CH_HUB_ADR << 7) | QLCNIC_HW_I2C1_CRB_AGT_ADR) + +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SRE \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_SRE_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_EG \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_EG_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX0 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_RPMX0_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_QMN \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_QM_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SQN0 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_SQG0_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SQN1 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_SQG1_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SQN2 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_SQG2_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SQN3 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_SQG3_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX1 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_RPMX1_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX5 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_RPMX5_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX6 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_RPMX6_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX8 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_RPMX8_CRB_AGT_ADR) +#define 
QLCNIC_HW_CRB_HUB_AGT_ADR_CAS0 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_CAS0_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_CAS1 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_CAS1_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_CAS2 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_CAS2_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_CAS3 \ + ((QLCNIC_HW_H3_CH_HUB_ADR << 7) | QLCNIC_HW_CAS3_CRB_AGT_ADR) + +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGNI \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGNI_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGND \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGND_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGN0 \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGN0_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGN1 \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGN1_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGN2 \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGN2_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGN3 \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGN3_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGN4 \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGN4_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGNC \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGNC_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGR0 \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGR0_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGR1 \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGR1_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGR2 \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGR2_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGR3 \ + ((QLCNIC_HW_H4_CH_HUB_ADR << 7) | QLCNIC_HW_PEGR3_CRB_AGT_ADR) + +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGSI \ + ((QLCNIC_HW_H5_CH_HUB_ADR << 7) | QLCNIC_HW_PEGSI_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGSD \ + ((QLCNIC_HW_H5_CH_HUB_ADR << 7) | QLCNIC_HW_PEGSD_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGS0 \ + ((QLCNIC_HW_H5_CH_HUB_ADR << 7) | QLCNIC_HW_PEGS0_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGS1 \ + ((QLCNIC_HW_H5_CH_HUB_ADR << 7) | QLCNIC_HW_PEGS1_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGS2 \ + ((QLCNIC_HW_H5_CH_HUB_ADR << 7) | QLCNIC_HW_PEGS2_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGS3 \ + ((QLCNIC_HW_H5_CH_HUB_ADR << 7) | QLCNIC_HW_PEGS3_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_PGSC \ + ((QLCNIC_HW_H5_CH_HUB_ADR << 7) | QLCNIC_HW_PEGSC_CRB_AGT_ADR) + +#define QLCNIC_HW_CRB_HUB_AGT_ADR_CAM \ + ((QLCNIC_HW_H6_CH_HUB_ADR << 7) | QLCNIC_HW_NCM_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_TIMR \ + ((QLCNIC_HW_H6_CH_HUB_ADR << 7) | QLCNIC_HW_TMR_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_XDMA \ + ((QLCNIC_HW_H6_CH_HUB_ADR << 7) | QLCNIC_HW_XDMA_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_SN \ + ((QLCNIC_HW_H6_CH_HUB_ADR << 7) | QLCNIC_HW_SN_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_I2Q \ + ((QLCNIC_HW_H6_CH_HUB_ADR << 7) | QLCNIC_HW_I2Q_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_ROMUSB \ + ((QLCNIC_HW_H6_CH_HUB_ADR << 7) | QLCNIC_HW_ROMUSB_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_OCM0 \ + ((QLCNIC_HW_H6_CH_HUB_ADR << 7) | QLCNIC_HW_OCM0_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_OCM1 \ + ((QLCNIC_HW_H6_CH_HUB_ADR << 7) | QLCNIC_HW_OCM1_CRB_AGT_ADR) +#define QLCNIC_HW_CRB_HUB_AGT_ADR_LPC \ + ((QLCNIC_HW_H6_CH_HUB_ADR << 7) | QLCNIC_HW_LPC_CRB_AGT_ADR) + +#define QLCNIC_SRE_MISC (QLCNIC_CRB_SRE + 0x0002c) + +#define QLCNIC_I2Q_CLR_PCI_HI (QLCNIC_CRB_I2Q + 0x00034) + +#define ROMUSB_GLB 
(QLCNIC_CRB_ROMUSB + 0x00000) +#define ROMUSB_ROM (QLCNIC_CRB_ROMUSB + 0x10000) + +#define QLCNIC_ROMUSB_GLB_STATUS (ROMUSB_GLB + 0x0004) +#define QLCNIC_ROMUSB_GLB_SW_RESET (ROMUSB_GLB + 0x0008) +#define QLCNIC_ROMUSB_GLB_PAD_GPIO_I (ROMUSB_GLB + 0x000c) +#define QLCNIC_ROMUSB_GLB_CAS_RST (ROMUSB_GLB + 0x0038) +#define QLCNIC_ROMUSB_GLB_TEST_MUX_SEL (ROMUSB_GLB + 0x0044) +#define QLCNIC_ROMUSB_GLB_PEGTUNE_DONE (ROMUSB_GLB + 0x005c) +#define QLCNIC_ROMUSB_GLB_CHIP_CLK_CTRL (ROMUSB_GLB + 0x00A8) + +#define QLCNIC_ROMUSB_GPIO(n) (ROMUSB_GLB + 0x60 + (4 * (n))) + +#define QLCNIC_ROMUSB_ROM_INSTR_OPCODE (ROMUSB_ROM + 0x0004) +#define QLCNIC_ROMUSB_ROM_ADDRESS (ROMUSB_ROM + 0x0008) +#define QLCNIC_ROMUSB_ROM_WDATA (ROMUSB_ROM + 0x000c) +#define QLCNIC_ROMUSB_ROM_ABYTE_CNT (ROMUSB_ROM + 0x0010) +#define QLCNIC_ROMUSB_ROM_DUMMY_BYTE_CNT (ROMUSB_ROM + 0x0014) +#define QLCNIC_ROMUSB_ROM_RDATA (ROMUSB_ROM + 0x0018) + +/****************************************************************************** +* +* Definitions specific to M25P flash +* +******************************************************************************* +*/ + +/* all are 1MB windows */ + +#define QLCNIC_PCI_CRB_WINDOWSIZE 0x00100000 +#define QLCNIC_PCI_CRB_WINDOW(A) \ + (QLCNIC_PCI_CRBSPACE + (A)*QLCNIC_PCI_CRB_WINDOWSIZE) + +#define QLCNIC_CRB_NIU QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_NIU) +#define QLCNIC_CRB_SRE QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_SRE) +#define QLCNIC_CRB_ROMUSB \ + QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_ROMUSB) +#define QLCNIC_CRB_EPG QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_EG) +#define QLCNIC_CRB_I2Q QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_I2Q) +#define QLCNIC_CRB_TIMER QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_TIMR) +#define QLCNIC_CRB_I2C0 QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_I2C0) +#define QLCNIC_CRB_SMB QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_SMB) +#define QLCNIC_CRB_MAX QLCNIC_PCI_CRB_WINDOW(64) + +#define QLCNIC_CRB_PCIX_HOST QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_PH) +#define QLCNIC_CRB_PCIX_HOST2 QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_PH2) +#define QLCNIC_CRB_PEG_NET_0 QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_PGN0) +#define QLCNIC_CRB_PEG_NET_1 QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_PGN1) +#define QLCNIC_CRB_PEG_NET_2 QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_PGN2) +#define QLCNIC_CRB_PEG_NET_3 QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_PGN3) +#define QLCNIC_CRB_PEG_NET_4 QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_SQS2) +#define QLCNIC_CRB_PEG_NET_D QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_PGND) +#define QLCNIC_CRB_PEG_NET_I QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_PGNI) +#define QLCNIC_CRB_DDR_NET QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_MN) +#define QLCNIC_CRB_QDR_NET QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_SN) + +#define QLCNIC_CRB_PCIX_MD QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_PS) +#define QLCNIC_CRB_PCIE QLCNIC_CRB_PCIX_MD + +#define ISR_INT_VECTOR (QLCNIC_PCIX_PS_REG(PCIX_INT_VECTOR)) +#define ISR_INT_MASK (QLCNIC_PCIX_PS_REG(PCIX_INT_MASK)) +#define ISR_INT_MASK_SLOW (QLCNIC_PCIX_PS_REG(PCIX_INT_MASK)) +#define ISR_INT_TARGET_STATUS (QLCNIC_PCIX_PS_REG(PCIX_TARGET_STATUS)) +#define ISR_INT_TARGET_MASK (QLCNIC_PCIX_PS_REG(PCIX_TARGET_MASK)) +#define ISR_INT_TARGET_STATUS_F1 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_STATUS_F1)) +#define ISR_INT_TARGET_MASK_F1 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_MASK_F1)) +#define ISR_INT_TARGET_STATUS_F2 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_STATUS_F2)) +#define ISR_INT_TARGET_MASK_F2 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_MASK_F2)) 
+#define ISR_INT_TARGET_STATUS_F3 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_STATUS_F3)) +#define ISR_INT_TARGET_MASK_F3 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_MASK_F3)) +#define ISR_INT_TARGET_STATUS_F4 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_STATUS_F4)) +#define ISR_INT_TARGET_MASK_F4 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_MASK_F4)) +#define ISR_INT_TARGET_STATUS_F5 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_STATUS_F5)) +#define ISR_INT_TARGET_MASK_F5 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_MASK_F5)) +#define ISR_INT_TARGET_STATUS_F6 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_STATUS_F6)) +#define ISR_INT_TARGET_MASK_F6 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_MASK_F6)) +#define ISR_INT_TARGET_STATUS_F7 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_STATUS_F7)) +#define ISR_INT_TARGET_MASK_F7 (QLCNIC_PCIX_PS_REG(PCIX_TARGET_MASK_F7)) + +#define QLCNIC_PCI_OCM0_2M (0x000c0000UL) +#define QLCNIC_PCI_CRBSPACE (0x06000000UL) +#define QLCNIC_PCI_CAMQM (0x04800000UL) +#define QLCNIC_PCI_CAMQM_END (0x04800800UL) +#define QLCNIC_PCI_CAMQM_2M_BASE (0x000ff800UL) + +#define QLCNIC_CRB_CAM QLCNIC_PCI_CRB_WINDOW(QLCNIC_HW_PX_MAP_CRB_CAM) + +#define QLCNIC_ADDR_DDR_NET (0x0000000000000000ULL) +#define QLCNIC_ADDR_DDR_NET_MAX (0x000000000fffffffULL) +#define QLCNIC_ADDR_OCM0 (0x0000000200000000ULL) +#define QLCNIC_ADDR_OCM0_MAX (0x00000002000fffffULL) +#define QLCNIC_ADDR_OCM1 (0x0000000200400000ULL) +#define QLCNIC_ADDR_OCM1_MAX (0x00000002004fffffULL) +#define QLCNIC_ADDR_QDR_NET (0x0000000300000000ULL) +#define QLCNIC_ADDR_QDR_NET_MAX (0x0000000307ffffffULL) + +/* + * Register offsets for MN + */ +#define QLCNIC_MIU_CONTROL (0x000) +#define QLCNIC_MIU_MN_CONTROL (QLCNIC_CRB_DDR_NET+QLCNIC_MIU_CONTROL) + +/* 200ms delay in each loop */ +#define QLCNIC_NIU_PHY_WAITLEN 200000 +/* 10 seconds before we give up */ +#define QLCNIC_NIU_PHY_WAITMAX 50 +#define QLCNIC_NIU_MAX_GBE_PORTS 4 +#define QLCNIC_NIU_MAX_XG_PORTS 2 + +#define QLCNIC_NIU_MODE (QLCNIC_CRB_NIU + 0x00000) +#define QLCNIC_NIU_GB_PAUSE_CTL (QLCNIC_CRB_NIU + 0x0030c) +#define QLCNIC_NIU_XG_PAUSE_CTL (QLCNIC_CRB_NIU + 0x00098) + +#define QLCNIC_NIU_GB_MAC_CONFIG_0(I) \ + (QLCNIC_CRB_NIU + 0x30000 + (I)*0x10000) +#define QLCNIC_NIU_GB_MAC_CONFIG_1(I) \ + (QLCNIC_CRB_NIU + 0x30004 + (I)*0x10000) + +#define MAX_CTL_CHECK 1000 +#define TEST_AGT_CTRL (0x00) + +#define TA_CTL_START BIT_0 +#define TA_CTL_ENABLE BIT_1 +#define TA_CTL_WRITE BIT_2 +#define TA_CTL_BUSY BIT_3 + +/* XG Link status */ +#define XG_LINK_UP 0x10 +#define XG_LINK_DOWN 0x20 + +#define XG_LINK_UP_P3P 0x01 +#define XG_LINK_DOWN_P3P 0x02 +#define XG_LINK_STATE_P3P_MASK 0xf +#define XG_LINK_STATE_P3P(pcifn, val) \ + (((val) >> ((pcifn) * 4)) & XG_LINK_STATE_P3P_MASK) + +#define P3P_LINK_SPEED_MHZ 100 +#define P3P_LINK_SPEED_MASK 0xff +#define P3P_LINK_SPEED_REG(pcifn) \ + (CRB_PF_LINK_SPEED_1 + (((pcifn) / 4) * 4)) +#define P3P_LINK_SPEED_VAL(pcifn, reg) \ + (((reg) >> (8 * ((pcifn) & 0x3))) & P3P_LINK_SPEED_MASK) + +#define QLCNIC_CAM_RAM_BASE (QLCNIC_CRB_CAM + 0x02000) +#define QLCNIC_CAM_RAM(reg) (QLCNIC_CAM_RAM_BASE + (reg)) +#define QLCNIC_ROM_LOCK_ID (QLCNIC_CAM_RAM(0x100)) +#define QLCNIC_PHY_LOCK_ID (QLCNIC_CAM_RAM(0x120)) +#define QLCNIC_CRB_WIN_LOCK_ID (QLCNIC_CAM_RAM(0x124)) + +#define NIC_CRB_BASE (QLCNIC_CAM_RAM(0x200)) +#define NIC_CRB_BASE_2 (QLCNIC_CAM_RAM(0x700)) +#define QLCNIC_REG(X) (NIC_CRB_BASE+(X)) +#define QLCNIC_REG_2(X) (NIC_CRB_BASE_2+(X)) + +#define QLCNIC_CDRP_MAX_ARGS 4 +#define QLCNIC_CDRP_ARG(i) (QLCNIC_REG(0x18 + ((i) * 4))) + +#define QLCNIC_CDRP_CRB_OFFSET (QLCNIC_REG(0x18)) +#define QLCNIC_SIGN_CRB_OFFSET (QLCNIC_REG(0x28)) + 
+#define CRB_XG_STATE_P3P (QLCNIC_REG(0x98)) +#define CRB_PF_LINK_SPEED_1 (QLCNIC_REG(0xe8)) +#define CRB_DRIVER_VERSION (QLCNIC_REG(0x2a0)) + +#define CRB_FW_CAPABILITIES_2 (QLCNIC_CAM_RAM(0x12c)) + +/* + * CrbPortPhanCntrHi/Lo is used to pass the address of HostPhantomIndex address + * which can be read by the Phantom host to get producer/consumer indexes from + * Phantom/Casper. If it is not HOST_SHARED_MEMORY, then the following + * registers will be used for the addresses of the ring's shared memory + * on the Phantom. + */ + +#define qlcnic_get_temp_val(x) ((x) >> 16) +#define qlcnic_get_temp_state(x) ((x) & 0xffff) +#define qlcnic_encode_temp(val, state) (((val) << 16) | (state)) + +/* + * Temperature control. + */ +enum { + QLCNIC_TEMP_NORMAL = 0x1, /* Normal operating range */ + QLCNIC_TEMP_WARN, /* Sound alert, temperature getting high */ + QLCNIC_TEMP_PANIC /* Fatal error, hardware has shut down. */ +}; + + +/* Lock IDs for PHY lock */ +#define PHY_LOCK_DRIVER 0x44524956 + +#define PCIX_INT_VECTOR (0x10100) +#define PCIX_INT_MASK (0x10104) + +#define PCIX_OCM_WINDOW (0x10800) +#define PCIX_OCM_WINDOW_REG(func) (PCIX_OCM_WINDOW + 0x4 * (func)) + +#define PCIX_TARGET_STATUS (0x10118) +#define PCIX_TARGET_STATUS_F1 (0x10160) +#define PCIX_TARGET_STATUS_F2 (0x10164) +#define PCIX_TARGET_STATUS_F3 (0x10168) +#define PCIX_TARGET_STATUS_F4 (0x10360) +#define PCIX_TARGET_STATUS_F5 (0x10364) +#define PCIX_TARGET_STATUS_F6 (0x10368) +#define PCIX_TARGET_STATUS_F7 (0x1036c) + +#define PCIX_TARGET_MASK (0x10128) +#define PCIX_TARGET_MASK_F1 (0x10170) +#define PCIX_TARGET_MASK_F2 (0x10174) +#define PCIX_TARGET_MASK_F3 (0x10178) +#define PCIX_TARGET_MASK_F4 (0x10370) +#define PCIX_TARGET_MASK_F5 (0x10374) +#define PCIX_TARGET_MASK_F6 (0x10378) +#define PCIX_TARGET_MASK_F7 (0x1037c) + +#define PCIX_MSI_F(i) (0x13000+((i)*4)) + +#define QLCNIC_PCIX_PH_REG(reg) (QLCNIC_CRB_PCIE + (reg)) +#define QLCNIC_PCIX_PS_REG(reg) (QLCNIC_CRB_PCIX_MD + (reg)) +#define QLCNIC_PCIE_REG(reg) (QLCNIC_CRB_PCIE + (reg)) + +#define PCIE_SEM0_LOCK (0x1c000) +#define PCIE_SEM0_UNLOCK (0x1c004) +#define PCIE_SEM_LOCK(N) (PCIE_SEM0_LOCK + 8*(N)) +#define PCIE_SEM_UNLOCK(N) (PCIE_SEM0_UNLOCK + 8*(N)) + +#define PCIE_SETUP_FUNCTION (0x12040) +#define PCIE_SETUP_FUNCTION2 (0x12048) +#define PCIE_MISCCFG_RC (0x1206c) +#define PCIE_TGT_SPLIT_CHICKEN (0x12080) +#define PCIE_CHICKEN3 (0x120c8) + +#define ISR_INT_STATE_REG (QLCNIC_PCIX_PS_REG(PCIE_MISCCFG_RC)) +#define PCIE_MAX_MASTER_SPLIT (0x14048) + +#define QLCNIC_PORT_MODE_NONE 0 +#define QLCNIC_PORT_MODE_XG 1 +#define QLCNIC_PORT_MODE_GB 2 +#define QLCNIC_PORT_MODE_802_3_AP 3 +#define QLCNIC_PORT_MODE_AUTO_NEG 4 +#define QLCNIC_PORT_MODE_AUTO_NEG_1G 5 +#define QLCNIC_PORT_MODE_AUTO_NEG_XG 6 +#define QLCNIC_PORT_MODE_ADDR (QLCNIC_CAM_RAM(0x24)) +#define QLCNIC_WOL_PORT_MODE (QLCNIC_CAM_RAM(0x198)) + +#define QLCNIC_WOL_CONFIG_NV (QLCNIC_CAM_RAM(0x184)) +#define QLCNIC_WOL_CONFIG (QLCNIC_CAM_RAM(0x188)) + +#define QLCNIC_PEG_TUNE_MN_PRESENT 0x1 +#define QLCNIC_PEG_TUNE_CAPABILITY (QLCNIC_CAM_RAM(0x02c)) + +#define QLCNIC_DMA_WATCHDOG_CTRL (QLCNIC_CAM_RAM(0x14)) +#define QLCNIC_ROM_DEV_INIT_TIMEOUT (0x3e885c) +#define QLCNIC_ROM_DRV_RESET_TIMEOUT (0x3e8860) + +/* Device State */ +#define QLCNIC_DEV_COLD 0x1 +#define QLCNIC_DEV_INITIALIZING 0x2 +#define QLCNIC_DEV_READY 0x3 +#define QLCNIC_DEV_NEED_RESET 0x4 +#define QLCNIC_DEV_NEED_QUISCENT 0x5 +#define QLCNIC_DEV_FAILED 0x6 +#define QLCNIC_DEV_QUISCENT 0x7 + +#define QLCNIC_DEV_BADBAD 0xbad0bad0 + +#define 
QLCNIC_DEV_NPAR_NON_OPER 0 /* NON Operational */ +#define QLCNIC_DEV_NPAR_OPER 1 /* NPAR Operational */ +#define QLCNIC_DEV_NPAR_OPER_TIMEO 30 /* Operational time out */ + +#define QLC_DEV_SET_REF_CNT(VAL, FN) ((VAL) |= (1 << (FN * 4))) +#define QLC_DEV_CLR_REF_CNT(VAL, FN) ((VAL) &= ~(1 << (FN * 4))) +#define QLC_DEV_SET_RST_RDY(VAL, FN) ((VAL) |= (1 << (FN * 4))) +#define QLC_DEV_SET_QSCNT_RDY(VAL, FN) ((VAL) |= (2 << (FN * 4))) +#define QLC_DEV_CLR_RST_QSCNT(VAL, FN) ((VAL) &= ~(3 << (FN * 4))) + +#define QLC_DEV_GET_DRV(VAL, FN) (0xf & ((VAL) >> (FN * 4))) +#define QLC_DEV_SET_DRV(VAL, FN) ((VAL) << (FN * 4)) + +#define QLCNIC_TYPE_NIC 1 +#define QLCNIC_TYPE_FCOE 2 +#define QLCNIC_TYPE_ISCSI 3 + +#define QLCNIC_RCODE_DRIVER_INFO 0x20000000 +#define QLCNIC_RCODE_DRIVER_CAN_RELOAD BIT_30 +#define QLCNIC_RCODE_FATAL_ERROR BIT_31 +#define QLCNIC_FWERROR_PEGNUM(code) ((code) & 0xff) +#define QLCNIC_FWERROR_CODE(code) ((code >> 8) & 0x1fffff) +#define QLCNIC_FWERROR_FAN_FAILURE 0x16 + +#define FW_POLL_DELAY (1 * HZ) +#define FW_FAIL_THRESH 2 + +#define QLCNIC_RESET_TIMEOUT_SECS 10 +#define QLCNIC_INIT_TIMEOUT_SECS 30 +#define QLCNIC_RCVPEG_CHECK_RETRY_COUNT 2000 +#define QLCNIC_RCVPEG_CHECK_DELAY 10 +#define QLCNIC_CMDPEG_CHECK_RETRY_COUNT 60 +#define QLCNIC_CMDPEG_CHECK_DELAY 500 +#define QLCNIC_HEARTBEAT_PERIOD_MSECS 200 +#define QLCNIC_HEARTBEAT_CHECK_RETRY_COUNT 10 + +#define QLCNIC_MAX_MC_COUNT 38 +#define QLCNIC_MAX_UC_COUNT 512 +#define QLCNIC_WATCHDOG_TIMEOUTVALUE 5 + +#define ISR_MSI_INT_TRIGGER(FUNC) (QLCNIC_PCIX_PS_REG(PCIX_MSI_F(FUNC))) +#define ISR_LEGACY_INT_TRIGGERED(VAL) (((VAL) & 0x300) == 0x200) + +/* + * PCI Interrupt Vector Values. + */ +#define PCIX_INT_VECTOR_BIT_F0 0x0080 +#define PCIX_INT_VECTOR_BIT_F1 0x0100 +#define PCIX_INT_VECTOR_BIT_F2 0x0200 +#define PCIX_INT_VECTOR_BIT_F3 0x0400 +#define PCIX_INT_VECTOR_BIT_F4 0x0800 +#define PCIX_INT_VECTOR_BIT_F5 0x1000 +#define PCIX_INT_VECTOR_BIT_F6 0x2000 +#define PCIX_INT_VECTOR_BIT_F7 0x4000 + +struct qlcnic_legacy_intr_set { + u32 int_vec_bit; + u32 tgt_status_reg; + u32 tgt_mask_reg; + u32 pci_int_reg; +}; + +#define QLCNIC_MSIX_BASE 0x132110 +#define QLCNIC_MAX_VLAN_FILTERS 64 + +#define FLASH_ROM_WINDOW 0x42110030 +#define FLASH_ROM_DATA 0x42150000 + +#define QLCNIC_FW_DUMP_REG1 0x00130060 +#define QLCNIC_FW_DUMP_REG2 0x001e0000 +#define QLCNIC_FLASH_SEM2_LK 0x0013C010 +#define QLCNIC_FLASH_SEM2_ULK 0x0013C014 +#define QLCNIC_FLASH_LOCK_ID 0x001B2100 + +/* PCI function operational mode */ +enum { + QLCNIC_MGMT_FUNC = 0, + QLCNIC_PRIV_FUNC = 1, + QLCNIC_NON_PRIV_FUNC = 2, + QLCNIC_SRIOV_PF_FUNC = 3, + QLCNIC_SRIOV_VF_FUNC = 4, + QLCNIC_UNKNOWN_FUNC_MODE = 5 +}; + +enum { + QLCNIC_PORT_DEFAULTS = 0, + QLCNIC_ADD_VLAN = 1, + QLCNIC_DEL_VLAN = 2 +}; + +#define QLC_DEV_DRV_DEFAULT 0x11111111 + +#define LSB(x) ((uint8_t)(x)) +#define MSB(x) ((uint8_t)((uint16_t)(x) >> 8)) + +#define LSW(x) ((uint16_t)((uint32_t)(x))) +#define MSW(x) ((uint16_t)((uint32_t)(x) >> 16)) + +#define LSD(x) ((uint32_t)((uint64_t)(x))) +#define MSD(x) ((uint32_t)((((uint64_t)(x)) >> 16) >> 16)) + +#define QLCNIC_MS_CTRL 0x41000090 +#define QLCNIC_MS_ADDR_LO 0x41000094 +#define QLCNIC_MS_ADDR_HI 0x41000098 +#define QLCNIC_MS_WRTDATA_LO 0x410000A0 +#define QLCNIC_MS_WRTDATA_HI 0x410000A4 +#define QLCNIC_MS_WRTDATA_ULO 0x410000B0 +#define QLCNIC_MS_WRTDATA_UHI 0x410000B4 +#define QLCNIC_MS_RDDATA_LO 0x410000A8 +#define QLCNIC_MS_RDDATA_HI 0x410000AC +#define QLCNIC_MS_RDDATA_ULO 0x410000B8 +#define QLCNIC_MS_RDDATA_UHI 0x410000BC + +#define 
QLCNIC_TA_WRITE_ENABLE (TA_CTL_ENABLE | TA_CTL_WRITE) +#define QLCNIC_TA_WRITE_START (TA_CTL_START | TA_CTL_ENABLE | TA_CTL_WRITE) +#define QLCNIC_TA_START_ENABLE (TA_CTL_START | TA_CTL_ENABLE) + +#define QLCNIC_LEGACY_INTR_CONFIG \ +{ \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F0, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK, }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F1, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F1, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F1, }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F2, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F2, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F2, }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F3, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F3, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F3, }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F4, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F4, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F4, }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F5, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F5, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F5, }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F6, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F6, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F6, }, \ + \ + { \ + .int_vec_bit = PCIX_INT_VECTOR_BIT_F7, \ + .tgt_status_reg = ISR_INT_TARGET_STATUS_F7, \ + .tgt_mask_reg = ISR_INT_TARGET_MASK_F7, }, \ +} + +/* NIU REGS */ + +#define _qlcnic_crb_get_bit(var, bit) ((var >> bit) & 0x1) + +/* + * NIU GB MAC Config Register 0 (applies to GB0, GB1, GB2, GB3) + * + * Bit 0 : enable_tx => 1:enable frame xmit, 0:disable + * Bit 1 : tx_synced => R/O: xmit enable synched to xmit stream + * Bit 2 : enable_rx => 1:enable frame recv, 0:disable + * Bit 3 : rx_synced => R/O: recv enable synched to recv stream + * Bit 4 : tx_flowctl => 1:enable pause frame generation, 0:disable + * Bit 5 : rx_flowctl => 1:act on recv'd pause frames, 0:ignore + * Bit 8 : loopback => 1:loop MAC xmits to MAC recvs, 0:normal + * Bit 16: tx_reset_pb => 1:reset frame xmit protocol blk, 0:no-op + * Bit 17: rx_reset_pb => 1:reset frame recv protocol blk, 0:no-op + * Bit 18: tx_reset_mac => 1:reset data/ctl multiplexer blk, 0:no-op + * Bit 19: rx_reset_mac => 1:reset ctl frames & timers blk, 0:no-op + * Bit 31: soft_reset => 1:reset the MAC and the SERDES, 0:no-op + */ +#define qlcnic_gb_rx_flowctl(config_word) \ + ((config_word) |= 1 << 5) +#define qlcnic_gb_get_rx_flowctl(config_word) \ + _qlcnic_crb_get_bit((config_word), 5) +#define qlcnic_gb_unset_rx_flowctl(config_word) \ + ((config_word) &= ~(1 << 5)) + +/* + * NIU GB Pause Ctl Register + */ + +#define qlcnic_gb_set_gb0_mask(config_word) \ + ((config_word) |= 1 << 0) +#define qlcnic_gb_set_gb1_mask(config_word) \ + ((config_word) |= 1 << 2) +#define qlcnic_gb_set_gb2_mask(config_word) \ + ((config_word) |= 1 << 4) +#define qlcnic_gb_set_gb3_mask(config_word) \ + ((config_word) |= 1 << 6) + +#define qlcnic_gb_get_gb0_mask(config_word) \ + _qlcnic_crb_get_bit((config_word), 0) +#define qlcnic_gb_get_gb1_mask(config_word) \ + _qlcnic_crb_get_bit((config_word), 2) +#define qlcnic_gb_get_gb2_mask(config_word) \ + _qlcnic_crb_get_bit((config_word), 4) +#define qlcnic_gb_get_gb3_mask(config_word) \ + _qlcnic_crb_get_bit((config_word), 6) + +#define qlcnic_gb_unset_gb0_mask(config_word) \ + ((config_word) &= ~(1 << 0)) +#define qlcnic_gb_unset_gb1_mask(config_word) \ + ((config_word) &= ~(1 << 2)) +#define qlcnic_gb_unset_gb2_mask(config_word) \ + ((config_word) &= ~(1 << 4)) +#define 
qlcnic_gb_unset_gb3_mask(config_word) \ + ((config_word) &= ~(1 << 6)) + +/* + * NIU XG Pause Ctl Register + * + * Bit 0 : xg0_mask => 1:disable tx pause frames + * Bit 1 : xg0_request => 1:request single pause frame + * Bit 2 : xg0_on_off => 1:request is pause on, 0:off + * Bit 3 : xg1_mask => 1:disable tx pause frames + * Bit 4 : xg1_request => 1:request single pause frame + * Bit 5 : xg1_on_off => 1:request is pause on, 0:off + */ + +#define qlcnic_xg_set_xg0_mask(config_word) \ + ((config_word) |= 1 << 0) +#define qlcnic_xg_set_xg1_mask(config_word) \ + ((config_word) |= 1 << 3) + +#define qlcnic_xg_get_xg0_mask(config_word) \ + _qlcnic_crb_get_bit((config_word), 0) +#define qlcnic_xg_get_xg1_mask(config_word) \ + _qlcnic_crb_get_bit((config_word), 3) + +#define qlcnic_xg_unset_xg0_mask(config_word) \ + ((config_word) &= ~(1 << 0)) +#define qlcnic_xg_unset_xg1_mask(config_word) \ + ((config_word) &= ~(1 << 3)) + +/* + * NIU XG Pause Ctl Register + * + * Bit 0 : xg0_mask => 1:disable tx pause frames + * Bit 1 : xg0_request => 1:request single pause frame + * Bit 2 : xg0_on_off => 1:request is pause on, 0:off + * Bit 3 : xg1_mask => 1:disable tx pause frames + * Bit 4 : xg1_request => 1:request single pause frame + * Bit 5 : xg1_on_off => 1:request is pause on, 0:off + */ + +/* + * PHY-Specific MII control/status registers. + */ +#define QLCNIC_NIU_GB_MII_MGMT_ADDR_AUTONEG 4 +#define QLCNIC_NIU_GB_MII_MGMT_ADDR_PHY_STATUS 17 + +/* + * PHY-Specific Status Register (reg 17). + * + * Bit 0 : jabber => 1:jabber detected, 0:not + * Bit 1 : polarity => 1:polarity reversed, 0:normal + * Bit 2 : recvpause => 1:receive pause enabled, 0:disabled + * Bit 3 : xmitpause => 1:transmit pause enabled, 0:disabled + * Bit 4 : energydetect => 1:sleep, 0:active + * Bit 5 : downshift => 1:downshift, 0:no downshift + * Bit 6 : crossover => 1:MDIX (crossover), 0:MDI (no crossover) + * Bits 7-9 : cablelen => not valid in 10Mb/s mode + * 0:<50m, 1:50-80m, 2:80-110m, 3:110-140m, 4:>140m + * Bit 10 : link => 1:link up, 0:link down + * Bit 11 : resolved => 1:speed and duplex resolved, 0:not yet + * Bit 12 : pagercvd => 1:page received, 0:page not received + * Bit 13 : duplex => 1:full duplex, 0:half duplex + * Bits 14-15 : speed => 0:10Mb/s, 1:100Mb/s, 2:1000Mb/s, 3:rsvd + */ + +#define qlcnic_get_phy_speed(config_word) (((config_word) >> 14) & 0x03) + +#define qlcnic_set_phy_speed(config_word, val) \ + ((config_word) |= ((val & 0x03) << 14)) +#define qlcnic_set_phy_duplex(config_word) \ + ((config_word) |= 1 << 13) +#define qlcnic_clear_phy_duplex(config_word) \ + ((config_word) &= ~(1 << 13)) + +#define qlcnic_get_phy_link(config_word) \ + _qlcnic_crb_get_bit(config_word, 10) +#define qlcnic_get_phy_duplex(config_word) \ + _qlcnic_crb_get_bit(config_word, 13) + +#define QLCNIC_NIU_NON_PROMISC_MODE 0 +#define QLCNIC_NIU_PROMISC_MODE 1 +#define QLCNIC_NIU_ALLMULTI_MODE 2 + +#define QLCNIC_PCIE_SEM_TIMEOUT 10000 + +struct crb_128M_2M_sub_block_map { + unsigned valid; + unsigned start_128M; + unsigned end_128M; + unsigned start_2M; +}; + +struct crb_128M_2M_block_map{ + struct crb_128M_2M_sub_block_map sub_block[16]; +}; +#endif /* __QLCNIC_HDR_H_ */ diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c new file mode 100644 index 0000000..7848cf0 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c @@ -0,0 +1,1704 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing 
details. + */ + +#include +#include +#include + +#include "qlcnic.h" +#include "qlcnic_hdr.h" + +#define MASK(n) ((1ULL<<(n))-1) +#define OCM_WIN_P3P(addr) (addr & 0xffc0000) + +#define GET_MEM_OFFS_2M(addr) (addr & MASK(18)) + +#define CRB_BLK(off) ((off >> 20) & 0x3f) +#define CRB_SUBBLK(off) ((off >> 16) & 0xf) +#define CRB_WINDOW_2M (0x130060) +#define CRB_HI(off) ((crb_hub_agt[CRB_BLK(off)] << 20) | ((off) & 0xf0000)) +#define CRB_INDIRECT_2M (0x1e0000UL) + +struct qlcnic_ms_reg_ctrl { + u32 ocm_window; + u32 control; + u32 hi; + u32 low; + u32 rd[4]; + u32 wd[4]; + u64 off; +}; + +#ifndef readq +static inline u64 readq(void __iomem *addr) +{ + return readl(addr) | (((u64) readl(addr + 4)) << 32LL); +} +#endif + +#ifndef writeq +static inline void writeq(u64 val, void __iomem *addr) +{ + writel(((u32) (val)), (addr)); + writel(((u32) (val >> 32)), (addr + 4)); +} +#endif + +static struct crb_128M_2M_block_map +crb_128M_2M_map[64] __cacheline_aligned_in_smp = { + {{{0, 0, 0, 0} } }, /* 0: PCI */ + {{{1, 0x0100000, 0x0102000, 0x120000}, /* 1: PCIE */ + {1, 0x0110000, 0x0120000, 0x130000}, + {1, 0x0120000, 0x0122000, 0x124000}, + {1, 0x0130000, 0x0132000, 0x126000}, + {1, 0x0140000, 0x0142000, 0x128000}, + {1, 0x0150000, 0x0152000, 0x12a000}, + {1, 0x0160000, 0x0170000, 0x110000}, + {1, 0x0170000, 0x0172000, 0x12e000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {1, 0x01e0000, 0x01e0800, 0x122000}, + {0, 0x0000000, 0x0000000, 0x000000} } }, + {{{1, 0x0200000, 0x0210000, 0x180000} } },/* 2: MN */ + {{{0, 0, 0, 0} } }, /* 3: */ + {{{1, 0x0400000, 0x0401000, 0x169000} } },/* 4: P2NR1 */ + {{{1, 0x0500000, 0x0510000, 0x140000} } },/* 5: SRE */ + {{{1, 0x0600000, 0x0610000, 0x1c0000} } },/* 6: NIU */ + {{{1, 0x0700000, 0x0704000, 0x1b8000} } },/* 7: QM */ + {{{1, 0x0800000, 0x0802000, 0x170000}, /* 8: SQM0 */ + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {1, 0x08f0000, 0x08f2000, 0x172000} } }, + {{{1, 0x0900000, 0x0902000, 0x174000}, /* 9: SQM1*/ + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {1, 0x09f0000, 0x09f2000, 0x176000} } }, + {{{0, 0x0a00000, 0x0a02000, 0x178000}, /* 10: SQM2*/ + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + 
{0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {1, 0x0af0000, 0x0af2000, 0x17a000} } }, + {{{0, 0x0b00000, 0x0b02000, 0x17c000}, /* 11: SQM3*/ + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {1, 0x0bf0000, 0x0bf2000, 0x17e000} } }, + {{{1, 0x0c00000, 0x0c04000, 0x1d4000} } },/* 12: I2Q */ + {{{1, 0x0d00000, 0x0d04000, 0x1a4000} } },/* 13: TMR */ + {{{1, 0x0e00000, 0x0e04000, 0x1a0000} } },/* 14: ROMUSB */ + {{{1, 0x0f00000, 0x0f01000, 0x164000} } },/* 15: PEG4 */ + {{{0, 0x1000000, 0x1004000, 0x1a8000} } },/* 16: XDMA */ + {{{1, 0x1100000, 0x1101000, 0x160000} } },/* 17: PEG0 */ + {{{1, 0x1200000, 0x1201000, 0x161000} } },/* 18: PEG1 */ + {{{1, 0x1300000, 0x1301000, 0x162000} } },/* 19: PEG2 */ + {{{1, 0x1400000, 0x1401000, 0x163000} } },/* 20: PEG3 */ + {{{1, 0x1500000, 0x1501000, 0x165000} } },/* 21: P2ND */ + {{{1, 0x1600000, 0x1601000, 0x166000} } },/* 22: P2NI */ + {{{0, 0, 0, 0} } }, /* 23: */ + {{{0, 0, 0, 0} } }, /* 24: */ + {{{0, 0, 0, 0} } }, /* 25: */ + {{{0, 0, 0, 0} } }, /* 26: */ + {{{0, 0, 0, 0} } }, /* 27: */ + {{{0, 0, 0, 0} } }, /* 28: */ + {{{1, 0x1d00000, 0x1d10000, 0x190000} } },/* 29: MS */ + {{{1, 0x1e00000, 0x1e01000, 0x16a000} } },/* 30: P2NR2 */ + {{{1, 0x1f00000, 0x1f10000, 0x150000} } },/* 31: EPG */ + {{{0} } }, /* 32: PCI */ + {{{1, 0x2100000, 0x2102000, 0x120000}, /* 33: PCIE */ + {1, 0x2110000, 0x2120000, 0x130000}, + {1, 0x2120000, 0x2122000, 0x124000}, + {1, 0x2130000, 0x2132000, 0x126000}, + {1, 0x2140000, 0x2142000, 0x128000}, + {1, 0x2150000, 0x2152000, 0x12a000}, + {1, 0x2160000, 0x2170000, 0x110000}, + {1, 0x2170000, 0x2172000, 0x12e000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000}, + {0, 0x0000000, 0x0000000, 0x000000} } }, + {{{1, 0x2200000, 0x2204000, 0x1b0000} } },/* 34: CAM */ + {{{0} } }, /* 35: */ + {{{0} } }, /* 36: */ + {{{0} } }, /* 37: */ + {{{0} } }, /* 38: */ + {{{0} } }, /* 39: */ + {{{1, 0x2800000, 0x2804000, 0x1a4000} } },/* 40: TMR */ + {{{1, 0x2900000, 0x2901000, 0x16b000} } },/* 41: P2NR3 */ + {{{1, 0x2a00000, 0x2a00400, 0x1ac400} } },/* 42: RPMX1 */ + {{{1, 0x2b00000, 0x2b00400, 0x1ac800} } },/* 43: RPMX2 */ + {{{1, 0x2c00000, 0x2c00400, 0x1acc00} } },/* 44: RPMX3 */ + {{{1, 0x2d00000, 0x2d00400, 0x1ad000} } },/* 45: RPMX4 */ + {{{1, 0x2e00000, 0x2e00400, 0x1ad400} } },/* 46: RPMX5 */ + {{{1, 0x2f00000, 0x2f00400, 0x1ad800} } },/* 47: RPMX6 */ + {{{1, 0x3000000, 0x3000400, 0x1adc00} } },/* 48: RPMX7 */ + {{{0, 0x3100000, 0x3104000, 0x1a8000} } },/* 49: XDMA */ + {{{1, 0x3200000, 0x3204000, 0x1d4000} } },/* 50: I2Q */ + {{{1, 0x3300000, 0x3304000, 0x1a0000} } 
},/* 51: ROMUSB */ + {{{0} } }, /* 52: */ + {{{1, 0x3500000, 0x3500400, 0x1ac000} } },/* 53: RPMX0 */ + {{{1, 0x3600000, 0x3600400, 0x1ae000} } },/* 54: RPMX8 */ + {{{1, 0x3700000, 0x3700400, 0x1ae400} } },/* 55: RPMX9 */ + {{{1, 0x3800000, 0x3804000, 0x1d0000} } },/* 56: OCM0 */ + {{{1, 0x3900000, 0x3904000, 0x1b4000} } },/* 57: CRYPTO */ + {{{1, 0x3a00000, 0x3a04000, 0x1d8000} } },/* 58: SMB */ + {{{0} } }, /* 59: I2C0 */ + {{{0} } }, /* 60: I2C1 */ + {{{1, 0x3d00000, 0x3d04000, 0x1d8000} } },/* 61: LPC */ + {{{1, 0x3e00000, 0x3e01000, 0x167000} } },/* 62: P2NC */ + {{{1, 0x3f00000, 0x3f01000, 0x168000} } } /* 63: P2NR0 */ +}; + +/* + * top 12 bits of crb internal address (hub, agent) + */ +static const unsigned crb_hub_agt[64] = { + 0, + QLCNIC_HW_CRB_HUB_AGT_ADR_PS, + QLCNIC_HW_CRB_HUB_AGT_ADR_MN, + QLCNIC_HW_CRB_HUB_AGT_ADR_MS, + 0, + QLCNIC_HW_CRB_HUB_AGT_ADR_SRE, + QLCNIC_HW_CRB_HUB_AGT_ADR_NIU, + QLCNIC_HW_CRB_HUB_AGT_ADR_QMN, + QLCNIC_HW_CRB_HUB_AGT_ADR_SQN0, + QLCNIC_HW_CRB_HUB_AGT_ADR_SQN1, + QLCNIC_HW_CRB_HUB_AGT_ADR_SQN2, + QLCNIC_HW_CRB_HUB_AGT_ADR_SQN3, + QLCNIC_HW_CRB_HUB_AGT_ADR_I2Q, + QLCNIC_HW_CRB_HUB_AGT_ADR_TIMR, + QLCNIC_HW_CRB_HUB_AGT_ADR_ROMUSB, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGN4, + QLCNIC_HW_CRB_HUB_AGT_ADR_XDMA, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGN0, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGN1, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGN2, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGN3, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGND, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGNI, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGS0, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGS1, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGS2, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGS3, + 0, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGSI, + QLCNIC_HW_CRB_HUB_AGT_ADR_SN, + 0, + QLCNIC_HW_CRB_HUB_AGT_ADR_EG, + 0, + QLCNIC_HW_CRB_HUB_AGT_ADR_PS, + QLCNIC_HW_CRB_HUB_AGT_ADR_CAM, + 0, + 0, + 0, + 0, + 0, + QLCNIC_HW_CRB_HUB_AGT_ADR_TIMR, + 0, + QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX1, + QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX2, + QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX3, + QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX4, + QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX5, + QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX6, + QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX7, + QLCNIC_HW_CRB_HUB_AGT_ADR_XDMA, + QLCNIC_HW_CRB_HUB_AGT_ADR_I2Q, + QLCNIC_HW_CRB_HUB_AGT_ADR_ROMUSB, + 0, + QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX0, + QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX8, + QLCNIC_HW_CRB_HUB_AGT_ADR_RPMX9, + QLCNIC_HW_CRB_HUB_AGT_ADR_OCM0, + 0, + QLCNIC_HW_CRB_HUB_AGT_ADR_SMB, + QLCNIC_HW_CRB_HUB_AGT_ADR_I2C0, + QLCNIC_HW_CRB_HUB_AGT_ADR_I2C1, + 0, + QLCNIC_HW_CRB_HUB_AGT_ADR_PGNC, + 0, +}; + +static const u32 msi_tgt_status[8] = { + ISR_INT_TARGET_STATUS, ISR_INT_TARGET_STATUS_F1, + ISR_INT_TARGET_STATUS_F2, ISR_INT_TARGET_STATUS_F3, + ISR_INT_TARGET_STATUS_F4, ISR_INT_TARGET_STATUS_F5, + ISR_INT_TARGET_STATUS_F6, ISR_INT_TARGET_STATUS_F7 +}; + +/* PCI Windowing for DDR regions. 
*/ + +#define QLCNIC_PCIE_SEM_TIMEOUT 10000 + +static void qlcnic_read_window_reg(u32 addr, void __iomem *bar0, u32 *data) +{ + u32 dest; + void __iomem *val; + + dest = addr & 0xFFFF0000; + val = bar0 + QLCNIC_FW_DUMP_REG1; + writel(dest, val); + readl(val); + val = bar0 + QLCNIC_FW_DUMP_REG2 + LSW(addr); + *data = readl(val); +} + +static void qlcnic_write_window_reg(u32 addr, void __iomem *bar0, u32 data) +{ + u32 dest; + void __iomem *val; + + dest = addr & 0xFFFF0000; + val = bar0 + QLCNIC_FW_DUMP_REG1; + writel(dest, val); + readl(val); + val = bar0 + QLCNIC_FW_DUMP_REG2 + LSW(addr); + writel(data, val); + readl(val); +} + +int +qlcnic_pcie_sem_lock(struct qlcnic_adapter *adapter, int sem, u32 id_reg) +{ + int timeout = 0, err = 0, done = 0; + + while (!done) { + done = QLCRD32(adapter, QLCNIC_PCIE_REG(PCIE_SEM_LOCK(sem)), + &err); + if (done == 1) + break; + if (++timeout >= QLCNIC_PCIE_SEM_TIMEOUT) { + if (id_reg) { + done = QLCRD32(adapter, id_reg, &err); + if (done != -1) + dev_err(&adapter->pdev->dev, + "Failed to acquire sem=%d lock held by=%d\n", + sem, done); + else + dev_err(&adapter->pdev->dev, + "Failed to acquire sem=%d lock", + sem); + } else { + dev_err(&adapter->pdev->dev, + "Failed to acquire sem=%d lock", sem); + } + return -EIO; + } + udelay(1200); + } + + if (id_reg) + QLCWR32(adapter, id_reg, adapter->portnum); + + return 0; +} + +void +qlcnic_pcie_sem_unlock(struct qlcnic_adapter *adapter, int sem) +{ + int err = 0; + + QLCRD32(adapter, QLCNIC_PCIE_REG(PCIE_SEM_UNLOCK(sem)), &err); +} + +int qlcnic_ind_rd(struct qlcnic_adapter *adapter, u32 addr) +{ + int err = 0; + u32 data; + + if (qlcnic_82xx_check(adapter)) + qlcnic_read_window_reg(addr, adapter->ahw->pci_base0, &data); + else { + data = QLCRD32(adapter, addr, &err); + if (err == -EIO) + return err; + } + return data; +} + +int qlcnic_ind_wr(struct qlcnic_adapter *adapter, u32 addr, u32 data) +{ + int ret = 0; + + if (qlcnic_82xx_check(adapter)) + qlcnic_write_window_reg(addr, adapter->ahw->pci_base0, data); + else + ret = qlcnic_83xx_wrt_reg_indirect(adapter, addr, data); + + return ret; +} + +static int +qlcnic_send_cmd_descs(struct qlcnic_adapter *adapter, + struct cmd_desc_type0 *cmd_desc_arr, int nr_desc) +{ + u32 i, producer; + struct qlcnic_cmd_buffer *pbuf; + struct cmd_desc_type0 *cmd_desc; + struct qlcnic_host_tx_ring *tx_ring; + + i = 0; + + if (!test_bit(__QLCNIC_FW_ATTACHED, &adapter->state)) + return -EIO; + + tx_ring = &adapter->tx_ring[0]; + __netif_tx_lock_bh(tx_ring->txq); + + producer = tx_ring->producer; + + if (nr_desc >= qlcnic_tx_avail(tx_ring)) { + netif_tx_stop_queue(tx_ring->txq); + smp_mb(); + if (qlcnic_tx_avail(tx_ring) > nr_desc) { + if (qlcnic_tx_avail(tx_ring) > TX_STOP_THRESH) + netif_tx_wake_queue(tx_ring->txq); + } else { + adapter->stats.xmit_off++; + __netif_tx_unlock_bh(tx_ring->txq); + return -EBUSY; + } + } + + do { + cmd_desc = &cmd_desc_arr[i]; + + pbuf = &tx_ring->cmd_buf_arr[producer]; + pbuf->skb = NULL; + pbuf->frag_count = 0; + + memcpy(&tx_ring->desc_head[producer], + cmd_desc, sizeof(struct cmd_desc_type0)); + + producer = get_next_index(producer, tx_ring->num_desc); + i++; + + } while (i != nr_desc); + + tx_ring->producer = producer; + + qlcnic_update_cmd_producer(tx_ring); + + __netif_tx_unlock_bh(tx_ring->txq); + + return 0; +} + +int qlcnic_82xx_sre_macaddr_change(struct qlcnic_adapter *adapter, u8 *addr, + u16 vlan_id, u8 op) +{ + struct qlcnic_nic_req req; + struct qlcnic_mac_req *mac_req; + struct qlcnic_vlan_req *vlan_req; + u64 word; + + memset(&req, 
0, sizeof(struct qlcnic_nic_req)); + req.qhdr = cpu_to_le64(QLCNIC_REQUEST << 23); + + word = QLCNIC_MAC_EVENT | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + mac_req = (struct qlcnic_mac_req *)&req.words[0]; + mac_req->op = op; + memcpy(mac_req->mac_addr, addr, ETH_ALEN); + + vlan_req = (struct qlcnic_vlan_req *)&req.words[1]; + vlan_req->vlan_id = cpu_to_le16(vlan_id); + + return qlcnic_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); +} + +int qlcnic_nic_del_mac(struct qlcnic_adapter *adapter, const u8 *addr) +{ + struct qlcnic_mac_vlan_list *cur; + struct list_head *head; + int err = -EINVAL; + + /* Delete MAC from the existing list */ + list_for_each(head, &adapter->mac_list) { + cur = list_entry(head, struct qlcnic_mac_vlan_list, list); + if (ether_addr_equal(addr, cur->mac_addr)) { + err = qlcnic_sre_macaddr_change(adapter, cur->mac_addr, + 0, QLCNIC_MAC_DEL); + if (err) + return err; + list_del(&cur->list); + kfree(cur); + return err; + } + } + return err; +} + +int qlcnic_nic_add_mac(struct qlcnic_adapter *adapter, const u8 *addr, u16 vlan, + enum qlcnic_mac_type mac_type) +{ + struct qlcnic_mac_vlan_list *cur; + struct list_head *head; + + /* look up if already exists */ + list_for_each(head, &adapter->mac_list) { + cur = list_entry(head, struct qlcnic_mac_vlan_list, list); + if (ether_addr_equal(addr, cur->mac_addr) && + cur->vlan_id == vlan) + return 0; + } + + cur = kzalloc(sizeof(*cur), GFP_ATOMIC); + if (cur == NULL) + return -ENOMEM; + + memcpy(cur->mac_addr, addr, ETH_ALEN); + + if (qlcnic_sre_macaddr_change(adapter, + cur->mac_addr, vlan, QLCNIC_MAC_ADD)) { + kfree(cur); + return -EIO; + } + + cur->vlan_id = vlan; + cur->mac_type = mac_type; + + list_add_tail(&cur->list, &adapter->mac_list); + return 0; +} + +void qlcnic_flush_mcast_mac(struct qlcnic_adapter *adapter) +{ + struct qlcnic_mac_vlan_list *cur; + struct list_head *head, *tmp; + + list_for_each_safe(head, tmp, &adapter->mac_list) { + cur = list_entry(head, struct qlcnic_mac_vlan_list, list); + if (cur->mac_type != QLCNIC_MULTICAST_MAC) + continue; + + qlcnic_sre_macaddr_change(adapter, cur->mac_addr, + cur->vlan_id, QLCNIC_MAC_DEL); + list_del(&cur->list); + kfree(cur); + } +} + +static void __qlcnic_set_multi(struct net_device *netdev, u16 vlan) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct netdev_hw_addr *ha; + static const u8 bcast_addr[ETH_ALEN] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }; + u32 mode = VPORT_MISS_MODE_DROP; + + if (!test_bit(__QLCNIC_FW_ATTACHED, &adapter->state)) + return; + + qlcnic_nic_add_mac(adapter, adapter->mac_addr, vlan, + QLCNIC_UNICAST_MAC); + qlcnic_nic_add_mac(adapter, bcast_addr, vlan, QLCNIC_BROADCAST_MAC); + + if (netdev->flags & IFF_PROMISC) { + if (!(adapter->flags & QLCNIC_PROMISC_DISABLED)) + mode = VPORT_MISS_MODE_ACCEPT_ALL; + } else if ((netdev->flags & IFF_ALLMULTI) || + (netdev_mc_count(netdev) > ahw->max_mc_count)) { + mode = VPORT_MISS_MODE_ACCEPT_MULTI; + } else if (!netdev_mc_empty(netdev)) { + qlcnic_flush_mcast_mac(adapter); + netdev_for_each_mc_addr(ha, netdev) + qlcnic_nic_add_mac(adapter, ha->addr, vlan, + QLCNIC_MULTICAST_MAC); + } + + /* configure unicast MAC address, if there is not sufficient space + * to store all the unicast addresses then enable promiscuous mode + */ + if (netdev_uc_count(netdev) > ahw->max_uc_count) { + mode = VPORT_MISS_MODE_ACCEPT_ALL; + } else if (!netdev_uc_empty(netdev)) { + netdev_for_each_uc_addr(ha, netdev) + 
qlcnic_nic_add_mac(adapter, ha->addr, vlan, + QLCNIC_UNICAST_MAC); + } + + if (mode == VPORT_MISS_MODE_ACCEPT_ALL && + !adapter->fdb_mac_learn) { + qlcnic_alloc_lb_filters_mem(adapter); + adapter->drv_mac_learn = 1; + if (adapter->flags & QLCNIC_ESWITCH_ENABLED) + adapter->rx_mac_learn = true; + } else { + adapter->drv_mac_learn = 0; + adapter->rx_mac_learn = false; + } + + qlcnic_nic_set_promisc(adapter, mode); +} + +void qlcnic_set_multi(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + + if (!test_bit(__QLCNIC_FW_ATTACHED, &adapter->state)) + return; + + if (qlcnic_sriov_vf_check(adapter)) + qlcnic_sriov_vf_set_multi(netdev); + else + __qlcnic_set_multi(netdev, 0); +} + +int qlcnic_82xx_nic_set_promisc(struct qlcnic_adapter *adapter, u32 mode) +{ + struct qlcnic_nic_req req; + u64 word; + + memset(&req, 0, sizeof(struct qlcnic_nic_req)); + + req.qhdr = cpu_to_le64(QLCNIC_HOST_REQUEST << 23); + + word = QLCNIC_H2C_OPCODE_SET_MAC_RECEIVE_MODE | + ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + req.words[0] = cpu_to_le64(mode); + + return qlcnic_send_cmd_descs(adapter, + (struct cmd_desc_type0 *)&req, 1); +} + +void qlcnic_82xx_free_mac_list(struct qlcnic_adapter *adapter) +{ + struct list_head *head = &adapter->mac_list; + struct qlcnic_mac_vlan_list *cur; + + while (!list_empty(head)) { + cur = list_entry(head->next, struct qlcnic_mac_vlan_list, list); + qlcnic_sre_macaddr_change(adapter, + cur->mac_addr, 0, QLCNIC_MAC_DEL); + list_del(&cur->list); + kfree(cur); + } +} + +void qlcnic_prune_lb_filters(struct qlcnic_adapter *adapter) +{ + struct qlcnic_filter *tmp_fil; + struct hlist_node *n; + struct hlist_head *head; + int i; + unsigned long expires; + u8 cmd; + + for (i = 0; i < adapter->fhash.fbucket_size; i++) { + head = &(adapter->fhash.fhead[i]); + hlist_for_each_entry_safe(tmp_fil, n, head, fnode) { + cmd = tmp_fil->vlan_id ? QLCNIC_MAC_VLAN_DEL : + QLCNIC_MAC_DEL; + expires = tmp_fil->ftime + QLCNIC_FILTER_AGE * HZ; + if (time_before(expires, jiffies)) { + qlcnic_sre_macaddr_change(adapter, + tmp_fil->faddr, + tmp_fil->vlan_id, + cmd); + spin_lock_bh(&adapter->mac_learn_lock); + adapter->fhash.fnum--; + hlist_del(&tmp_fil->fnode); + spin_unlock_bh(&adapter->mac_learn_lock); + kfree(tmp_fil); + } + } + } + for (i = 0; i < adapter->rx_fhash.fbucket_size; i++) { + head = &(adapter->rx_fhash.fhead[i]); + + hlist_for_each_entry_safe(tmp_fil, n, head, fnode) + { + expires = tmp_fil->ftime + QLCNIC_FILTER_AGE * HZ; + if (time_before(expires, jiffies)) { + spin_lock_bh(&adapter->rx_mac_learn_lock); + adapter->rx_fhash.fnum--; + hlist_del(&tmp_fil->fnode); + spin_unlock_bh(&adapter->rx_mac_learn_lock); + kfree(tmp_fil); + } + } + } +} + +void qlcnic_delete_lb_filters(struct qlcnic_adapter *adapter) +{ + struct qlcnic_filter *tmp_fil; + struct hlist_node *n; + struct hlist_head *head; + int i; + u8 cmd; + + for (i = 0; i < adapter->fhash.fbucket_size; i++) { + head = &(adapter->fhash.fhead[i]); + hlist_for_each_entry_safe(tmp_fil, n, head, fnode) { + cmd = tmp_fil->vlan_id ? 
QLCNIC_MAC_VLAN_DEL : + QLCNIC_MAC_DEL; + qlcnic_sre_macaddr_change(adapter, + tmp_fil->faddr, + tmp_fil->vlan_id, + cmd); + spin_lock_bh(&adapter->mac_learn_lock); + adapter->fhash.fnum--; + hlist_del(&tmp_fil->fnode); + spin_unlock_bh(&adapter->mac_learn_lock); + kfree(tmp_fil); + } + } +} + +static int qlcnic_set_fw_loopback(struct qlcnic_adapter *adapter, u8 flag) +{ + struct qlcnic_nic_req req; + int rv; + + memset(&req, 0, sizeof(struct qlcnic_nic_req)); + + req.qhdr = cpu_to_le64(QLCNIC_HOST_REQUEST << 23); + req.req_hdr = cpu_to_le64(QLCNIC_H2C_OPCODE_CONFIG_LOOPBACK | + ((u64) adapter->portnum << 16) | ((u64) 0x1 << 32)); + + req.words[0] = cpu_to_le64(flag); + + rv = qlcnic_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) + dev_err(&adapter->pdev->dev, "%sting loopback mode failed\n", + flag ? "Set" : "Reset"); + return rv; +} + +int qlcnic_82xx_set_lb_mode(struct qlcnic_adapter *adapter, u8 mode) +{ + if (qlcnic_set_fw_loopback(adapter, mode)) + return -EIO; + + if (qlcnic_nic_set_promisc(adapter, + VPORT_MISS_MODE_ACCEPT_ALL)) { + qlcnic_set_fw_loopback(adapter, 0); + return -EIO; + } + + msleep(1000); + return 0; +} + +int qlcnic_82xx_clear_lb_mode(struct qlcnic_adapter *adapter, u8 mode) +{ + struct net_device *netdev = adapter->netdev; + + mode = VPORT_MISS_MODE_DROP; + qlcnic_set_fw_loopback(adapter, 0); + + if (netdev->flags & IFF_PROMISC) + mode = VPORT_MISS_MODE_ACCEPT_ALL; + else if (netdev->flags & IFF_ALLMULTI) + mode = VPORT_MISS_MODE_ACCEPT_MULTI; + + qlcnic_nic_set_promisc(adapter, mode); + msleep(1000); + return 0; +} + +int qlcnic_82xx_read_phys_port_id(struct qlcnic_adapter *adapter) +{ + u8 mac[ETH_ALEN]; + int ret; + + ret = qlcnic_get_mac_address(adapter, mac, + adapter->ahw->physical_port); + if (ret) + return ret; + + memcpy(adapter->ahw->phys_port_id, mac, ETH_ALEN); + adapter->flags |= QLCNIC_HAS_PHYS_PORT_ID; + + return 0; +} + +int qlcnic_82xx_set_rx_coalesce(struct qlcnic_adapter *adapter) +{ + struct qlcnic_nic_req req; + int rv; + + memset(&req, 0, sizeof(struct qlcnic_nic_req)); + + req.qhdr = cpu_to_le64(QLCNIC_HOST_REQUEST << 23); + + req.req_hdr = cpu_to_le64(QLCNIC_CONFIG_INTR_COALESCE | + ((u64) adapter->portnum << 16)); + + req.words[0] = cpu_to_le64(((u64) adapter->ahw->coal.flag) << 32); + req.words[2] = cpu_to_le64(adapter->ahw->coal.rx_packets | + ((u64) adapter->ahw->coal.rx_time_us) << 16); + req.words[5] = cpu_to_le64(adapter->ahw->coal.timer_out | + ((u64) adapter->ahw->coal.type) << 32 | + ((u64) adapter->ahw->coal.sts_ring_mask) << 40); + rv = qlcnic_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) + dev_err(&adapter->netdev->dev, + "Could not send interrupt coalescing parameters\n"); + + return rv; +} + +/* Send the interrupt coalescing parameter set by ethtool to the card. 
*/ +int qlcnic_82xx_config_intr_coalesce(struct qlcnic_adapter *adapter, + struct ethtool_coalesce *ethcoal) +{ + struct qlcnic_nic_intr_coalesce *coal = &adapter->ahw->coal; + int rv; + + coal->flag = QLCNIC_INTR_DEFAULT; + coal->rx_time_us = ethcoal->rx_coalesce_usecs; + coal->rx_packets = ethcoal->rx_max_coalesced_frames; + + rv = qlcnic_82xx_set_rx_coalesce(adapter); + + if (rv) + netdev_err(adapter->netdev, + "Failed to set Rx coalescing parameters\n"); + + return rv; +} + +#define QLCNIC_ENABLE_IPV4_LRO BIT_0 +#define QLCNIC_ENABLE_IPV6_LRO (BIT_1 | BIT_9) + +int qlcnic_82xx_config_hw_lro(struct qlcnic_adapter *adapter, int enable) +{ + struct qlcnic_nic_req req; + u64 word; + int rv; + + if (!test_bit(__QLCNIC_FW_ATTACHED, &adapter->state)) + return 0; + + memset(&req, 0, sizeof(struct qlcnic_nic_req)); + + req.qhdr = cpu_to_le64(QLCNIC_HOST_REQUEST << 23); + + word = QLCNIC_H2C_OPCODE_CONFIG_HW_LRO | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + word = 0; + if (enable) { + word = QLCNIC_ENABLE_IPV4_LRO; + if (adapter->ahw->extra_capability[0] & + QLCNIC_FW_CAP2_HW_LRO_IPV6) + word |= QLCNIC_ENABLE_IPV6_LRO; + } + + req.words[0] = cpu_to_le64(word); + + rv = qlcnic_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) + dev_err(&adapter->netdev->dev, + "Could not send configure hw lro request\n"); + + return rv; +} + +int qlcnic_config_bridged_mode(struct qlcnic_adapter *adapter, u32 enable) +{ + struct qlcnic_nic_req req; + u64 word; + int rv; + + if (!!(adapter->flags & QLCNIC_BRIDGE_ENABLED) == enable) + return 0; + + memset(&req, 0, sizeof(struct qlcnic_nic_req)); + + req.qhdr = cpu_to_le64(QLCNIC_HOST_REQUEST << 23); + + word = QLCNIC_H2C_OPCODE_CONFIG_BRIDGING | + ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + req.words[0] = cpu_to_le64(enable); + + rv = qlcnic_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) + dev_err(&adapter->netdev->dev, + "Could not send configure bridge mode request\n"); + + adapter->flags ^= QLCNIC_BRIDGE_ENABLED; + + return rv; +} + + +#define QLCNIC_RSS_HASHTYPE_IP_TCP 0x3 +#define QLCNIC_ENABLE_TYPE_C_RSS BIT_10 +#define QLCNIC_RSS_FEATURE_FLAG (1ULL << 63) +#define QLCNIC_RSS_IND_TABLE_MASK 0x7ULL + +int qlcnic_82xx_config_rss(struct qlcnic_adapter *adapter, int enable) +{ + struct qlcnic_nic_req req; + u64 word; + int i, rv; + + static const u64 key[] = { + 0xbeac01fa6a42b73bULL, 0x8030f20c77cb2da3ULL, + 0xae7b30b4d0ca2bcbULL, 0x43a38fb04167253dULL, + 0x255b0ec26d5a56daULL + }; + + memset(&req, 0, sizeof(struct qlcnic_nic_req)); + req.qhdr = cpu_to_le64(QLCNIC_HOST_REQUEST << 23); + + word = QLCNIC_H2C_OPCODE_CONFIG_RSS | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + /* + * RSS request: + * bits 3-0: hash_method + * 5-4: hash_type_ipv4 + * 7-6: hash_type_ipv6 + * 8: enable + * 9: use indirection table + * 10: type-c rss + * 11: udp rss + * 47-12: reserved + * 62-48: indirection table mask + * 63: feature flag + */ + word = ((u64)(QLCNIC_RSS_HASHTYPE_IP_TCP & 0x3) << 4) | + ((u64)(QLCNIC_RSS_HASHTYPE_IP_TCP & 0x3) << 6) | + ((u64)(enable & 0x1) << 8) | + ((u64)QLCNIC_RSS_IND_TABLE_MASK << 48) | + (u64)QLCNIC_ENABLE_TYPE_C_RSS | + (u64)QLCNIC_RSS_FEATURE_FLAG; + + req.words[0] = cpu_to_le64(word); + for (i = 0; i < 5; i++) + req.words[i+1] = cpu_to_le64(key[i]); + + rv = qlcnic_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) + dev_err(&adapter->netdev->dev, "could not configure RSS\n"); + + return rv; +} + 
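/*
 * Illustration only, not driver code: the 64-bit RSS request word built by
 * qlcnic_82xx_config_rss() above follows the bit layout documented in its
 * comment.  The small host-side sketch below packs the same word so the
 * layout can be checked by hand; the RSS_* constants merely mirror the
 * QLCNIC_RSS_* / QLCNIC_ENABLE_TYPE_C_RSS defines above, and the
 * pack_rss_word() helper name is made up for this sketch.
 */
#include <stdio.h>
#include <stdint.h>

#define RSS_HASHTYPE_IP_TCP	0x3ULL		/* mirrors QLCNIC_RSS_HASHTYPE_IP_TCP */
#define RSS_TYPE_C_RSS		(1ULL << 10)	/* mirrors QLCNIC_ENABLE_TYPE_C_RSS (BIT_10) */
#define RSS_FEATURE_FLAG	(1ULL << 63)	/* mirrors QLCNIC_RSS_FEATURE_FLAG */
#define RSS_IND_TABLE_MASK	0x7ULL		/* mirrors QLCNIC_RSS_IND_TABLE_MASK */

static uint64_t pack_rss_word(int enable)
{
	return ((RSS_HASHTYPE_IP_TCP & 0x3) << 4) |	/* bits 5-4: hash_type_ipv4 */
	       ((RSS_HASHTYPE_IP_TCP & 0x3) << 6) |	/* bits 7-6: hash_type_ipv6 */
	       ((uint64_t)(enable & 0x1) << 8) |	/* bit 8: enable */
	       (RSS_IND_TABLE_MASK << 48) |		/* bits 62-48: indirection table mask */
	       RSS_TYPE_C_RSS |				/* bit 10: type-c rss */
	       RSS_FEATURE_FLAG;			/* bit 63: feature flag */
}

int main(void)
{
	/* For enable = 1 this prints 0x80070000000005f0. */
	printf("rss request word = 0x%016llx\n",
	       (unsigned long long)pack_rss_word(1));
	return 0;
}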
+void qlcnic_82xx_config_ipaddr(struct qlcnic_adapter *adapter, + __be32 ip, int cmd) +{ + struct qlcnic_nic_req req; + struct qlcnic_ipaddr *ipa; + u64 word; + int rv; + + memset(&req, 0, sizeof(struct qlcnic_nic_req)); + req.qhdr = cpu_to_le64(QLCNIC_HOST_REQUEST << 23); + + word = QLCNIC_H2C_OPCODE_CONFIG_IPADDR | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + req.words[0] = cpu_to_le64(cmd); + ipa = (struct qlcnic_ipaddr *)&req.words[1]; + ipa->ipv4 = ip; + + rv = qlcnic_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) + dev_err(&adapter->netdev->dev, + "could not notify %s IP 0x%x request\n", + (cmd == QLCNIC_IP_UP) ? "Add" : "Remove", ip); +} + +int qlcnic_82xx_linkevent_request(struct qlcnic_adapter *adapter, int enable) +{ + struct qlcnic_nic_req req; + u64 word; + int rv; + memset(&req, 0, sizeof(struct qlcnic_nic_req)); + req.qhdr = cpu_to_le64(QLCNIC_HOST_REQUEST << 23); + + word = QLCNIC_H2C_OPCODE_GET_LINKEVENT | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + req.words[0] = cpu_to_le64(enable | (enable << 8)); + rv = qlcnic_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) + dev_err(&adapter->netdev->dev, + "could not configure link notification\n"); + + return rv; +} + +static int qlcnic_send_lro_cleanup(struct qlcnic_adapter *adapter) +{ + struct qlcnic_nic_req req; + u64 word; + int rv; + + if (!test_bit(__QLCNIC_FW_ATTACHED, &adapter->state)) + return 0; + + memset(&req, 0, sizeof(struct qlcnic_nic_req)); + req.qhdr = cpu_to_le64(QLCNIC_HOST_REQUEST << 23); + + word = QLCNIC_H2C_OPCODE_LRO_REQUEST | + ((u64)adapter->portnum << 16) | + ((u64)QLCNIC_LRO_REQUEST_CLEANUP << 56) ; + + req.req_hdr = cpu_to_le64(word); + + rv = qlcnic_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv != 0) + dev_err(&adapter->netdev->dev, + "could not cleanup lro flows\n"); + + return rv; +} + +/* + * qlcnic_change_mtu - Change the Maximum Transfer Unit + * @returns 0 on success, negative on failure + */ + +int qlcnic_change_mtu(struct net_device *netdev, int mtu) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int rc = 0; + + rc = qlcnic_fw_cmd_set_mtu(adapter, mtu); + + if (!rc) + netdev->mtu = mtu; + + return rc; +} + +static netdev_features_t qlcnic_process_flags(struct qlcnic_adapter *adapter, + netdev_features_t features) +{ + u32 offload_flags = adapter->offload_flags; + + if (offload_flags & BIT_0) { + features |= NETIF_F_RXCSUM | NETIF_F_IP_CSUM | + NETIF_F_IPV6_CSUM; + adapter->rx_csum = 1; + if (QLCNIC_IS_TSO_CAPABLE(adapter)) { + if (!(offload_flags & BIT_1)) + features &= ~NETIF_F_TSO; + else + features |= NETIF_F_TSO; + + if (!(offload_flags & BIT_2)) + features &= ~NETIF_F_TSO6; + else + features |= NETIF_F_TSO6; + } + } else { + features &= ~(NETIF_F_RXCSUM | + NETIF_F_IP_CSUM | + NETIF_F_IPV6_CSUM); + + if (QLCNIC_IS_TSO_CAPABLE(adapter)) + features &= ~(NETIF_F_TSO | NETIF_F_TSO6); + adapter->rx_csum = 0; + } + + return features; +} + +netdev_features_t qlcnic_fix_features(struct net_device *netdev, + netdev_features_t features) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + netdev_features_t changed; + + if (qlcnic_82xx_check(adapter) && + (adapter->flags & QLCNIC_ESWITCH_ENABLED)) { + if (adapter->flags & QLCNIC_APP_CHANGED_FLAGS) { + features = qlcnic_process_flags(adapter, features); + } else { + changed = features ^ netdev->features; + features ^= changed & (NETIF_F_RXCSUM | + NETIF_F_IP_CSUM | + NETIF_F_IPV6_CSUM | + NETIF_F_TSO | + 
NETIF_F_TSO6); + } + } + + if (!(features & NETIF_F_RXCSUM)) + features &= ~NETIF_F_LRO; + + return features; +} + + +int qlcnic_set_features(struct net_device *netdev, netdev_features_t features) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + netdev_features_t changed = netdev->features ^ features; + int hw_lro = (features & NETIF_F_LRO) ? QLCNIC_LRO_ENABLED : 0; + + if (!(changed & NETIF_F_LRO)) + return 0; + + netdev->features ^= NETIF_F_LRO; + + if (qlcnic_config_hw_lro(adapter, hw_lro)) + return -EIO; + + if (!hw_lro && qlcnic_82xx_check(adapter)) { + if (qlcnic_send_lro_cleanup(adapter)) + return -EIO; + } + + return 0; +} + +/* + * Changes the CRB window to the specified window. + */ + /* Returns < 0 if off is not valid, + * 1 if window access is needed. 'off' is set to offset from + * CRB space in 128M pci map + * 0 if no window access is needed. 'off' is set to 2M addr + * In: 'off' is offset from base in 128M pci map + */ +static int qlcnic_pci_get_crb_addr_2M(struct qlcnic_hardware_context *ahw, + ulong off, void __iomem **addr) +{ + const struct crb_128M_2M_sub_block_map *m; + + if ((off >= QLCNIC_CRB_MAX) || (off < QLCNIC_PCI_CRBSPACE)) + return -EINVAL; + + off -= QLCNIC_PCI_CRBSPACE; + + /* + * Try direct map + */ + m = &crb_128M_2M_map[CRB_BLK(off)].sub_block[CRB_SUBBLK(off)]; + + if (m->valid && (m->start_128M <= off) && (m->end_128M > off)) { + *addr = ahw->pci_base0 + m->start_2M + + (off - m->start_128M); + return 0; + } + + /* + * Not in direct map, use crb window + */ + *addr = ahw->pci_base0 + CRB_INDIRECT_2M + (off & MASK(16)); + return 1; +} + +/* + * In: 'off' is offset from CRB space in 128M pci map + * Out: 'off' is 2M pci map addr + * side effect: lock crb window + */ +static int +qlcnic_pci_set_crbwindow_2M(struct qlcnic_adapter *adapter, ulong off) +{ + u32 window; + void __iomem *addr = adapter->ahw->pci_base0 + CRB_WINDOW_2M; + + off -= QLCNIC_PCI_CRBSPACE; + + window = CRB_HI(off); + if (window == 0) { + dev_err(&adapter->pdev->dev, "Invalid offset 0x%lx\n", off); + return -EIO; + } + + writel(window, addr); + if (readl(addr) != window) { + if (printk_ratelimit()) + dev_warn(&adapter->pdev->dev, + "failed to set CRB window to %d off 0x%lx\n", + window, off); + return -EIO; + } + return 0; +} + +int qlcnic_82xx_hw_write_wx_2M(struct qlcnic_adapter *adapter, ulong off, + u32 data) +{ + unsigned long flags; + int rv; + void __iomem *addr = NULL; + + rv = qlcnic_pci_get_crb_addr_2M(adapter->ahw, off, &addr); + + if (rv == 0) { + writel(data, addr); + return 0; + } + + if (rv > 0) { + /* indirect access */ + write_lock_irqsave(&adapter->ahw->crb_lock, flags); + crb_win_lock(adapter); + rv = qlcnic_pci_set_crbwindow_2M(adapter, off); + if (!rv) + writel(data, addr); + crb_win_unlock(adapter); + write_unlock_irqrestore(&adapter->ahw->crb_lock, flags); + return rv; + } + + dev_err(&adapter->pdev->dev, + "%s: invalid offset: 0x%016lx\n", __func__, off); + dump_stack(); + return -EIO; +} + +int qlcnic_82xx_hw_read_wx_2M(struct qlcnic_adapter *adapter, ulong off, + int *err) +{ + unsigned long flags; + int rv; + u32 data = -1; + void __iomem *addr = NULL; + + rv = qlcnic_pci_get_crb_addr_2M(adapter->ahw, off, &addr); + + if (rv == 0) + return readl(addr); + + if (rv > 0) { + /* indirect access */ + write_lock_irqsave(&adapter->ahw->crb_lock, flags); + crb_win_lock(adapter); + if (!qlcnic_pci_set_crbwindow_2M(adapter, off)) + data = readl(addr); + crb_win_unlock(adapter); + write_unlock_irqrestore(&adapter->ahw->crb_lock, flags); + return data; + } + + 
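	/*
	 * Reaching this point means qlcnic_pci_get_crb_addr_2M() rejected
	 * the offset (it lies outside CRB space), so neither the direct 2M
	 * map nor the CRB window applies; log the offset and return -1 as
	 * the read value.
	 */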
dev_err(&adapter->pdev->dev, + "%s: invalid offset: 0x%016lx\n", __func__, off); + dump_stack(); + return -1; +} + +void __iomem *qlcnic_get_ioaddr(struct qlcnic_hardware_context *ahw, + u32 offset) +{ + void __iomem *addr = NULL; + + WARN_ON(qlcnic_pci_get_crb_addr_2M(ahw, offset, &addr)); + + return addr; +} + +static int qlcnic_pci_mem_access_direct(struct qlcnic_adapter *adapter, + u32 window, u64 off, u64 *data, int op) +{ + void __iomem *addr; + u32 start; + + mutex_lock(&adapter->ahw->mem_lock); + + writel(window, adapter->ahw->ocm_win_crb); + /* read back to flush */ + readl(adapter->ahw->ocm_win_crb); + start = QLCNIC_PCI_OCM0_2M + off; + + addr = adapter->ahw->pci_base0 + start; + + if (op == 0) /* read */ + *data = readq(addr); + else /* write */ + writeq(*data, addr); + + /* Set window to 0 */ + writel(0, adapter->ahw->ocm_win_crb); + readl(adapter->ahw->ocm_win_crb); + + mutex_unlock(&adapter->ahw->mem_lock); + return 0; +} + +static void +qlcnic_pci_camqm_read_2M(struct qlcnic_adapter *adapter, u64 off, u64 *data) +{ + void __iomem *addr = adapter->ahw->pci_base0 + + QLCNIC_PCI_CAMQM_2M_BASE + (off - QLCNIC_PCI_CAMQM); + + mutex_lock(&adapter->ahw->mem_lock); + *data = readq(addr); + mutex_unlock(&adapter->ahw->mem_lock); +} + +static void +qlcnic_pci_camqm_write_2M(struct qlcnic_adapter *adapter, u64 off, u64 data) +{ + void __iomem *addr = adapter->ahw->pci_base0 + + QLCNIC_PCI_CAMQM_2M_BASE + (off - QLCNIC_PCI_CAMQM); + + mutex_lock(&adapter->ahw->mem_lock); + writeq(data, addr); + mutex_unlock(&adapter->ahw->mem_lock); +} + + + +/* Set MS memory control data for different adapters */ +static void qlcnic_set_ms_controls(struct qlcnic_adapter *adapter, u64 off, + struct qlcnic_ms_reg_ctrl *ms) +{ + ms->control = QLCNIC_MS_CTRL; + ms->low = QLCNIC_MS_ADDR_LO; + ms->hi = QLCNIC_MS_ADDR_HI; + if (off & 0xf) { + ms->wd[0] = QLCNIC_MS_WRTDATA_LO; + ms->rd[0] = QLCNIC_MS_RDDATA_LO; + ms->wd[1] = QLCNIC_MS_WRTDATA_HI; + ms->rd[1] = QLCNIC_MS_RDDATA_HI; + ms->wd[2] = QLCNIC_MS_WRTDATA_ULO; + ms->wd[3] = QLCNIC_MS_WRTDATA_UHI; + ms->rd[2] = QLCNIC_MS_RDDATA_ULO; + ms->rd[3] = QLCNIC_MS_RDDATA_UHI; + } else { + ms->wd[0] = QLCNIC_MS_WRTDATA_ULO; + ms->rd[0] = QLCNIC_MS_RDDATA_ULO; + ms->wd[1] = QLCNIC_MS_WRTDATA_UHI; + ms->rd[1] = QLCNIC_MS_RDDATA_UHI; + ms->wd[2] = QLCNIC_MS_WRTDATA_LO; + ms->wd[3] = QLCNIC_MS_WRTDATA_HI; + ms->rd[2] = QLCNIC_MS_RDDATA_LO; + ms->rd[3] = QLCNIC_MS_RDDATA_HI; + } + + ms->ocm_window = OCM_WIN_P3P(off); + ms->off = GET_MEM_OFFS_2M(off); +} + +int qlcnic_pci_mem_write_2M(struct qlcnic_adapter *adapter, u64 off, u64 data) +{ + int j, ret = 0; + u32 temp, off8; + struct qlcnic_ms_reg_ctrl ms; + + /* Only 64-bit aligned access */ + if (off & 7) + return -EIO; + + memset(&ms, 0, sizeof(struct qlcnic_ms_reg_ctrl)); + if (!(ADDR_IN_RANGE(off, QLCNIC_ADDR_QDR_NET, + QLCNIC_ADDR_QDR_NET_MAX) || + ADDR_IN_RANGE(off, QLCNIC_ADDR_DDR_NET, + QLCNIC_ADDR_DDR_NET_MAX))) + return -EIO; + + qlcnic_set_ms_controls(adapter, off, &ms); + + if (ADDR_IN_RANGE(off, QLCNIC_ADDR_OCM0, QLCNIC_ADDR_OCM0_MAX)) + return qlcnic_pci_mem_access_direct(adapter, ms.ocm_window, + ms.off, &data, 1); + + off8 = off & ~0xf; + + mutex_lock(&adapter->ahw->mem_lock); + + qlcnic_ind_wr(adapter, ms.low, off8); + qlcnic_ind_wr(adapter, ms.hi, 0); + + qlcnic_ind_wr(adapter, ms.control, TA_CTL_ENABLE); + qlcnic_ind_wr(adapter, ms.control, QLCNIC_TA_START_ENABLE); + + for (j = 0; j < MAX_CTL_CHECK; j++) { + temp = qlcnic_ind_rd(adapter, ms.control); + if ((temp & TA_CTL_BUSY) == 0) + break; + } + + 
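	/*
	 * The loop above polls the MS control register until the test agent
	 * clears TA_CTL_BUSY; if it is still busy after MAX_CTL_CHECK reads,
	 * the check below bails out with -EIO before attempting the
	 * read-modify-write sequence.
	 */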
if (j >= MAX_CTL_CHECK) { + ret = -EIO; + goto done; + } + + /* This is the modify part of read-modify-write */ + qlcnic_ind_wr(adapter, ms.wd[0], qlcnic_ind_rd(adapter, ms.rd[0])); + qlcnic_ind_wr(adapter, ms.wd[1], qlcnic_ind_rd(adapter, ms.rd[1])); + /* This is the write part of read-modify-write */ + qlcnic_ind_wr(adapter, ms.wd[2], data & 0xffffffff); + qlcnic_ind_wr(adapter, ms.wd[3], (data >> 32) & 0xffffffff); + + qlcnic_ind_wr(adapter, ms.control, QLCNIC_TA_WRITE_ENABLE); + qlcnic_ind_wr(adapter, ms.control, QLCNIC_TA_WRITE_START); + + for (j = 0; j < MAX_CTL_CHECK; j++) { + temp = qlcnic_ind_rd(adapter, ms.control); + if ((temp & TA_CTL_BUSY) == 0) + break; + } + + if (j >= MAX_CTL_CHECK) { + if (printk_ratelimit()) + dev_err(&adapter->pdev->dev, + "failed to write through agent\n"); + ret = -EIO; + } else + ret = 0; + +done: + mutex_unlock(&adapter->ahw->mem_lock); + + return ret; +} + +int qlcnic_pci_mem_read_2M(struct qlcnic_adapter *adapter, u64 off, u64 *data) +{ + int j, ret; + u32 temp, off8; + u64 val; + struct qlcnic_ms_reg_ctrl ms; + + /* Only 64-bit aligned access */ + if (off & 7) + return -EIO; + if (!(ADDR_IN_RANGE(off, QLCNIC_ADDR_QDR_NET, + QLCNIC_ADDR_QDR_NET_MAX) || + ADDR_IN_RANGE(off, QLCNIC_ADDR_DDR_NET, + QLCNIC_ADDR_DDR_NET_MAX))) + return -EIO; + + memset(&ms, 0, sizeof(struct qlcnic_ms_reg_ctrl)); + qlcnic_set_ms_controls(adapter, off, &ms); + + if (ADDR_IN_RANGE(off, QLCNIC_ADDR_OCM0, QLCNIC_ADDR_OCM0_MAX)) + return qlcnic_pci_mem_access_direct(adapter, ms.ocm_window, + ms.off, data, 0); + + mutex_lock(&adapter->ahw->mem_lock); + + off8 = off & ~0xf; + + qlcnic_ind_wr(adapter, ms.low, off8); + qlcnic_ind_wr(adapter, ms.hi, 0); + + qlcnic_ind_wr(adapter, ms.control, TA_CTL_ENABLE); + qlcnic_ind_wr(adapter, ms.control, QLCNIC_TA_START_ENABLE); + + for (j = 0; j < MAX_CTL_CHECK; j++) { + temp = qlcnic_ind_rd(adapter, ms.control); + if ((temp & TA_CTL_BUSY) == 0) + break; + } + + if (j >= MAX_CTL_CHECK) { + if (printk_ratelimit()) + dev_err(&adapter->pdev->dev, + "failed to read through agent\n"); + ret = -EIO; + } else { + + temp = qlcnic_ind_rd(adapter, ms.rd[3]); + val = (u64)temp << 32; + val |= qlcnic_ind_rd(adapter, ms.rd[2]); + *data = val; + ret = 0; + } + + mutex_unlock(&adapter->ahw->mem_lock); + + return ret; +} + +int qlcnic_82xx_get_board_info(struct qlcnic_adapter *adapter) +{ + int offset, board_type, magic, err = 0; + struct pci_dev *pdev = adapter->pdev; + + offset = QLCNIC_FW_MAGIC_OFFSET; + if (qlcnic_rom_fast_read(adapter, offset, &magic)) + return -EIO; + + if (magic != QLCNIC_BDINFO_MAGIC) { + dev_err(&pdev->dev, "invalid board config, magic=%08x\n", + magic); + return -EIO; + } + + offset = QLCNIC_BRDTYPE_OFFSET; + if (qlcnic_rom_fast_read(adapter, offset, &board_type)) + return -EIO; + + adapter->ahw->board_type = board_type; + + if (board_type == QLCNIC_BRDTYPE_P3P_4_GB_MM) { + u32 gpio = QLCRD32(adapter, QLCNIC_ROMUSB_GLB_PAD_GPIO_I, &err); + if (err == -EIO) + return err; + if ((gpio & 0x8000) == 0) + board_type = QLCNIC_BRDTYPE_P3P_10G_TP; + } + + switch (board_type) { + case QLCNIC_BRDTYPE_P3P_HMEZ: + case QLCNIC_BRDTYPE_P3P_XG_LOM: + case QLCNIC_BRDTYPE_P3P_10G_CX4: + case QLCNIC_BRDTYPE_P3P_10G_CX4_LP: + case QLCNIC_BRDTYPE_P3P_IMEZ: + case QLCNIC_BRDTYPE_P3P_10G_SFP_PLUS: + case QLCNIC_BRDTYPE_P3P_10G_SFP_CT: + case QLCNIC_BRDTYPE_P3P_10G_SFP_QT: + case QLCNIC_BRDTYPE_P3P_10G_XFP: + case QLCNIC_BRDTYPE_P3P_10000_BASE_T: + adapter->ahw->port_type = QLCNIC_XGBE; + break; + case QLCNIC_BRDTYPE_P3P_REF_QG: + case 
QLCNIC_BRDTYPE_P3P_4_GB: + case QLCNIC_BRDTYPE_P3P_4_GB_MM: + adapter->ahw->port_type = QLCNIC_GBE; + break; + case QLCNIC_BRDTYPE_P3P_10G_TP: + adapter->ahw->port_type = (adapter->portnum < 2) ? + QLCNIC_XGBE : QLCNIC_GBE; + break; + default: + dev_err(&pdev->dev, "unknown board type %x\n", board_type); + adapter->ahw->port_type = QLCNIC_XGBE; + break; + } + + return 0; +} + +static int +qlcnic_wol_supported(struct qlcnic_adapter *adapter) +{ + u32 wol_cfg; + int err = 0; + + wol_cfg = QLCRD32(adapter, QLCNIC_WOL_CONFIG_NV, &err); + if (wol_cfg & (1UL << adapter->portnum)) { + wol_cfg = QLCRD32(adapter, QLCNIC_WOL_CONFIG, &err); + if (err == -EIO) + return err; + if (wol_cfg & (1 << adapter->portnum)) + return 1; + } + + return 0; +} + +int qlcnic_82xx_config_led(struct qlcnic_adapter *adapter, u32 state, u32 rate) +{ + struct qlcnic_nic_req req; + int rv; + u64 word; + + memset(&req, 0, sizeof(struct qlcnic_nic_req)); + req.qhdr = cpu_to_le64(QLCNIC_HOST_REQUEST << 23); + + word = QLCNIC_H2C_OPCODE_CONFIG_LED | ((u64)adapter->portnum << 16); + req.req_hdr = cpu_to_le64(word); + + req.words[0] = cpu_to_le64(((u64)rate << 32) | adapter->portnum); + req.words[1] = cpu_to_le64(state); + + rv = qlcnic_send_cmd_descs(adapter, (struct cmd_desc_type0 *)&req, 1); + if (rv) + dev_err(&adapter->pdev->dev, "LED configuration failed.\n"); + + return rv; +} + +void qlcnic_82xx_get_beacon_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_cmd_args cmd; + u8 beacon_state; + int err = 0; + + if (ahw->extra_capability[0] & QLCNIC_FW_CAPABILITY_2_BEACON) { + err = qlcnic_alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_GET_LED_STATUS); + if (!err) { + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + netdev_err(adapter->netdev, + "Failed to get current beacon state, err=%d\n", + err); + } else { + beacon_state = cmd.rsp.arg[1]; + if (beacon_state == QLCNIC_BEACON_DISABLE) + ahw->beacon_state = QLCNIC_BEACON_OFF; + else if (beacon_state == QLCNIC_BEACON_EANBLE) + ahw->beacon_state = QLCNIC_BEACON_ON; + } + } + qlcnic_free_mbx_args(&cmd); + } + + return; +} + +void qlcnic_82xx_get_func_no(struct qlcnic_adapter *adapter) +{ + void __iomem *msix_base_addr; + u32 func; + u32 msix_base; + + pci_read_config_dword(adapter->pdev, QLCNIC_MSIX_TABLE_OFFSET, &func); + msix_base_addr = adapter->ahw->pci_base0 + QLCNIC_MSIX_BASE; + msix_base = readl(msix_base_addr); + func = (func - msix_base) / QLCNIC_MSIX_TBL_PGSIZE; + adapter->ahw->pci_func = func; +} + +void qlcnic_82xx_read_crb(struct qlcnic_adapter *adapter, char *buf, + loff_t offset, size_t size) +{ + int err = 0; + u32 data; + u64 qmdata; + + if (ADDR_IN_RANGE(offset, QLCNIC_PCI_CAMQM, QLCNIC_PCI_CAMQM_END)) { + qlcnic_pci_camqm_read_2M(adapter, offset, &qmdata); + memcpy(buf, &qmdata, size); + } else { + data = QLCRD32(adapter, offset, &err); + memcpy(buf, &data, size); + } +} + +void qlcnic_82xx_write_crb(struct qlcnic_adapter *adapter, char *buf, + loff_t offset, size_t size) +{ + u32 data; + u64 qmdata; + + if (ADDR_IN_RANGE(offset, QLCNIC_PCI_CAMQM, QLCNIC_PCI_CAMQM_END)) { + memcpy(&qmdata, buf, size); + qlcnic_pci_camqm_write_2M(adapter, offset, qmdata); + } else { + memcpy(&data, buf, size); + QLCWR32(adapter, offset, data); + } +} + +int qlcnic_82xx_api_lock(struct qlcnic_adapter *adapter) +{ + return qlcnic_pcie_sem_lock(adapter, 5, 0); +} + +void qlcnic_82xx_api_unlock(struct qlcnic_adapter *adapter) +{ + qlcnic_pcie_sem_unlock(adapter, 5); +} + +int qlcnic_82xx_shutdown(struct pci_dev *pdev) 
+{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + struct net_device *netdev = adapter->netdev; + int retval; + + netif_device_detach(netdev); + + qlcnic_cancel_idc_work(adapter); + + if (netif_running(netdev)) + qlcnic_down(adapter, netdev); + + qlcnic_clr_all_drv_state(adapter, 0); + + clear_bit(__QLCNIC_RESETTING, &adapter->state); + + retval = pci_save_state(pdev); + if (retval) + return retval; + + if (qlcnic_wol_supported(adapter)) { + pci_enable_wake(pdev, PCI_D3cold, 1); + pci_enable_wake(pdev, PCI_D3hot, 1); + } + + return 0; +} + +int qlcnic_82xx_resume(struct qlcnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int err; + + err = qlcnic_start_firmware(adapter); + if (err) { + dev_err(&adapter->pdev->dev, "failed to start firmware\n"); + return err; + } + + if (netif_running(netdev)) { + err = qlcnic_up(adapter, netdev); + if (!err) + qlcnic_restore_indev_addr(netdev, NETDEV_UP); + } + + netif_device_attach(netdev); + qlcnic_schedule_work(adapter, qlcnic_fw_poll_work, FW_POLL_DELAY); + return err; +} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h new file mode 100644 index 0000000..4bb33af --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h @@ -0,0 +1,226 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#ifndef __QLCNIC_HW_H +#define __QLCNIC_HW_H + +/* Common registers in 83xx and 82xx */ +enum qlcnic_regs { + QLCNIC_PEG_HALT_STATUS1 = 0, + QLCNIC_PEG_HALT_STATUS2, + QLCNIC_PEG_ALIVE_COUNTER, + QLCNIC_FLASH_LOCK_OWNER, + QLCNIC_FW_CAPABILITIES, + QLCNIC_CRB_DRV_ACTIVE, + QLCNIC_CRB_DEV_STATE, + QLCNIC_CRB_DRV_STATE, + QLCNIC_CRB_DRV_SCRATCH, + QLCNIC_CRB_DEV_PARTITION_INFO, + QLCNIC_CRB_DRV_IDC_VER, + QLCNIC_FW_VERSION_MAJOR, + QLCNIC_FW_VERSION_MINOR, + QLCNIC_FW_VERSION_SUB, + QLCNIC_CRB_DEV_NPAR_STATE, + QLCNIC_FW_IMG_VALID, + QLCNIC_CMDPEG_STATE, + QLCNIC_RCVPEG_STATE, + QLCNIC_ASIC_TEMP, + QLCNIC_FW_API, + QLCNIC_DRV_OP_MODE, + QLCNIC_FLASH_LOCK, + QLCNIC_FLASH_UNLOCK, +}; + +/* Read from an address offset from BAR0, existing registers */ +#define QLC_SHARED_REG_RD32(a, addr) \ + readl(((a)->ahw->pci_base0) + ((a)->ahw->reg_tbl[addr])) + +/* Write to an address offset from BAR0, existing registers */ +#define QLC_SHARED_REG_WR32(a, addr, value) \ + writel(value, ((a)->ahw->pci_base0) + ((a)->ahw->reg_tbl[addr])) + +/* Read from a direct address offset from BAR0, additional registers */ +#define QLCRDX(ahw, addr) \ + readl(((ahw)->pci_base0) + ((ahw)->ext_reg_tbl[addr])) + +/* Write to a direct address offset from BAR0, additional registers */ +#define QLCWRX(ahw, addr, value) \ + writel(value, (((ahw)->pci_base0) + ((ahw)->ext_reg_tbl[addr]))) + +#define QLCNIC_CMD_CONFIGURE_IP_ADDR 0x1 +#define QLCNIC_CMD_CONFIG_INTRPT 0x2 +#define QLCNIC_CMD_CREATE_RX_CTX 0x7 +#define QLCNIC_CMD_DESTROY_RX_CTX 0x8 +#define QLCNIC_CMD_CREATE_TX_CTX 0x9 +#define QLCNIC_CMD_DESTROY_TX_CTX 0xa +#define QLCNIC_CMD_CONFIGURE_LRO 0xC +#define QLCNIC_CMD_CONFIGURE_MAC_LEARNING 0xD +#define QLCNIC_CMD_GET_STATISTICS 0xF +#define QLCNIC_CMD_INTRPT_TEST 0x11 +#define QLCNIC_CMD_SET_MTU 0x12 +#define QLCNIC_CMD_READ_PHY 0x13 +#define QLCNIC_CMD_WRITE_PHY 0x14 +#define QLCNIC_CMD_READ_HW_REG 0x15 +#define QLCNIC_CMD_GET_FLOW_CTL 0x16 +#define QLCNIC_CMD_SET_FLOW_CTL 0x17 +#define QLCNIC_CMD_READ_MAX_MTU 0x18 +#define QLCNIC_CMD_READ_MAX_LRO 0x19 +#define QLCNIC_CMD_MAC_ADDRESS 0x1f 
+#define QLCNIC_CMD_GET_PCI_INFO 0x20 +#define QLCNIC_CMD_GET_NIC_INFO 0x21 +#define QLCNIC_CMD_SET_NIC_INFO 0x22 +#define QLCNIC_CMD_GET_ESWITCH_CAPABILITY 0x24 +#define QLCNIC_CMD_TOGGLE_ESWITCH 0x25 +#define QLCNIC_CMD_GET_ESWITCH_STATUS 0x26 +#define QLCNIC_CMD_SET_PORTMIRRORING 0x27 +#define QLCNIC_CMD_CONFIGURE_ESWITCH 0x28 +#define QLCNIC_CMD_GET_ESWITCH_PORT_CONFIG 0x29 +#define QLCNIC_CMD_GET_ESWITCH_STATS 0x2a +#define QLCNIC_CMD_CONFIG_PORT 0x2e +#define QLCNIC_CMD_TEMP_SIZE 0x2f +#define QLCNIC_CMD_GET_TEMP_HDR 0x30 +#define QLCNIC_CMD_BC_EVENT_SETUP 0x31 +#define QLCNIC_CMD_CONFIG_VPORT 0x32 +#define QLCNIC_CMD_DCB_QUERY_CAP 0x34 +#define QLCNIC_CMD_DCB_QUERY_PARAM 0x35 +#define QLCNIC_CMD_GET_MAC_STATS 0x37 +#define QLCNIC_CMD_82XX_SET_DRV_VER 0x38 +#define QLCNIC_CMD_MQ_TX_CONFIG_INTR 0x39 +#define QLCNIC_CMD_GET_LED_STATUS 0x3C +#define QLCNIC_CMD_CONFIGURE_RSS 0x41 +#define QLCNIC_CMD_CONFIG_INTR_COAL 0x43 +#define QLCNIC_CMD_CONFIGURE_LED 0x44 +#define QLCNIC_CMD_CONFIG_MAC_VLAN 0x45 +#define QLCNIC_CMD_GET_LINK_EVENT 0x48 +#define QLCNIC_CMD_CONFIGURE_MAC_RX_MODE 0x49 +#define QLCNIC_CMD_CONFIGURE_HW_LRO 0x4A +#define QLCNIC_CMD_SET_INGRESS_ENCAP 0x4E +#define QLCNIC_CMD_INIT_NIC_FUNC 0x60 +#define QLCNIC_CMD_STOP_NIC_FUNC 0x61 +#define QLCNIC_CMD_IDC_ACK 0x63 +#define QLCNIC_CMD_SET_PORT_CONFIG 0x66 +#define QLCNIC_CMD_GET_PORT_CONFIG 0x67 +#define QLCNIC_CMD_GET_LINK_STATUS 0x68 +#define QLCNIC_CMD_SET_LED_CONFIG 0x69 +#define QLCNIC_CMD_GET_LED_CONFIG 0x6A +#define QLCNIC_CMD_83XX_SET_DRV_VER 0x6F +#define QLCNIC_CMD_ADD_RCV_RINGS 0x0B +#define QLCNIC_CMD_83XX_EXTEND_ISCSI_DUMP_CAP 0x37 + +#define QLCNIC_INTRPT_INTX 1 +#define QLCNIC_INTRPT_MSIX 3 +#define QLCNIC_INTRPT_ADD 1 +#define QLCNIC_INTRPT_DEL 2 + +#define QLCNIC_GET_CURRENT_MAC 1 +#define QLCNIC_SET_STATION_MAC 2 +#define QLCNIC_GET_DEFAULT_MAC 3 +#define QLCNIC_GET_FAC_DEF_MAC 4 +#define QLCNIC_SET_FAC_DEF_MAC 5 + +#define QLCNIC_MBX_LINK_EVENT 0x8001 +#define QLCNIC_MBX_BC_EVENT 0x8002 +#define QLCNIC_MBX_COMP_EVENT 0x8100 +#define QLCNIC_MBX_REQUEST_EVENT 0x8101 +#define QLCNIC_MBX_TIME_EXTEND_EVENT 0x8102 +#define QLCNIC_MBX_DCBX_CONFIG_CHANGE_EVENT 0x8110 +#define QLCNIC_MBX_SFP_INSERT_EVENT 0x8130 +#define QLCNIC_MBX_SFP_REMOVE_EVENT 0x8131 + +struct qlcnic_mailbox_metadata { + u32 cmd; + u32 in_args; + u32 out_args; +}; + +/* Mailbox ownership */ +#define QLCNIC_GET_OWNER(val) ((val) & (BIT_0 | BIT_1)) + +#define QLCNIC_SET_OWNER 1 +#define QLCNIC_CLR_OWNER 0 +#define QLCNIC_MBX_TIMEOUT 5000 + +#define QLCNIC_MBX_RSP_OK 1 +#define QLCNIC_MBX_PORT_RSP_OK 0x1a +#define QLCNIC_MBX_ASYNC_EVENT BIT_15 + +/* Set HW Tx ring limit for 82xx adapter. 
*/ +#define QLCNIC_MAX_HW_TX_RINGS 8 +#define QLCNIC_MAX_HW_VNIC_TX_RINGS 4 +#define QLCNIC_MAX_TX_RINGS 8 +#define QLCNIC_MAX_SDS_RINGS 8 + +struct qlcnic_pci_info; +struct qlcnic_info; +struct qlcnic_cmd_args; +struct ethtool_stats; +struct pci_device_id; +struct qlcnic_host_sds_ring; +struct qlcnic_host_tx_ring; +struct qlcnic_hardware_context; +struct qlcnic_adapter; +struct qlcnic_fw_dump; + +int qlcnic_82xx_hw_read_wx_2M(struct qlcnic_adapter *adapter, ulong, int *); +int qlcnic_82xx_hw_write_wx_2M(struct qlcnic_adapter *, ulong, u32); +int qlcnic_82xx_config_hw_lro(struct qlcnic_adapter *adapter, int); +int qlcnic_82xx_nic_set_promisc(struct qlcnic_adapter *adapter, u32); +int qlcnic_82xx_napi_add(struct qlcnic_adapter *adapter, + struct net_device *netdev); +void qlcnic_82xx_get_beacon_state(struct qlcnic_adapter *); +void qlcnic_82xx_change_filter(struct qlcnic_adapter *adapter, + u64 *uaddr, u16 vlan_id); +int qlcnic_82xx_config_intr_coalesce(struct qlcnic_adapter *, + struct ethtool_coalesce *); +int qlcnic_82xx_set_rx_coalesce(struct qlcnic_adapter *); +int qlcnic_82xx_config_rss(struct qlcnic_adapter *adapter, int); +void qlcnic_82xx_config_ipaddr(struct qlcnic_adapter *adapter, + __be32, int); +int qlcnic_82xx_linkevent_request(struct qlcnic_adapter *adapter, int); +void qlcnic_82xx_process_rcv_ring_diag(struct qlcnic_host_sds_ring *sds_ring); +int qlcnic_82xx_clear_lb_mode(struct qlcnic_adapter *adapter, u8); +int qlcnic_82xx_set_lb_mode(struct qlcnic_adapter *, u8); +void qlcnic_82xx_write_crb(struct qlcnic_adapter *, char *, loff_t, size_t); +void qlcnic_82xx_read_crb(struct qlcnic_adapter *, char *, loff_t, size_t); +int qlcnic_82xx_issue_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *); +int qlcnic_82xx_mq_intrpt(struct qlcnic_adapter *, int); +int qlcnic_82xx_config_intrpt(struct qlcnic_adapter *, u8); +int qlcnic_82xx_fw_cmd_create_rx_ctx(struct qlcnic_adapter *); +int qlcnic_82xx_fw_cmd_create_tx_ctx(struct qlcnic_adapter *, + struct qlcnic_host_tx_ring *tx_ring, int); +void qlcnic_82xx_fw_cmd_del_rx_ctx(struct qlcnic_adapter *); +void qlcnic_82xx_fw_cmd_del_tx_ctx(struct qlcnic_adapter *, + struct qlcnic_host_tx_ring *); +int qlcnic_82xx_sre_macaddr_change(struct qlcnic_adapter *, u8 *, u16, u8); +int qlcnic_82xx_get_mac_address(struct qlcnic_adapter *, u8*, u8); +int qlcnic_82xx_get_nic_info(struct qlcnic_adapter *, struct qlcnic_info *, u8); +int qlcnic_82xx_set_nic_info(struct qlcnic_adapter *, struct qlcnic_info *); +int qlcnic_82xx_get_pci_info(struct qlcnic_adapter *, struct qlcnic_pci_info*); +int qlcnic_82xx_alloc_mbx_args(struct qlcnic_cmd_args *, + struct qlcnic_adapter *, u32); +int qlcnic_82xx_hw_write_wx_2M(struct qlcnic_adapter *, ulong, u32); +int qlcnic_82xx_get_board_info(struct qlcnic_adapter *); +int qlcnic_82xx_config_led(struct qlcnic_adapter *, u32, u32); +void qlcnic_82xx_get_func_no(struct qlcnic_adapter *); +int qlcnic_82xx_api_lock(struct qlcnic_adapter *); +void qlcnic_82xx_api_unlock(struct qlcnic_adapter *); +void qlcnic_82xx_napi_enable(struct qlcnic_adapter *); +void qlcnic_82xx_napi_disable(struct qlcnic_adapter *); +void qlcnic_82xx_napi_del(struct qlcnic_adapter *); +int qlcnic_82xx_shutdown(struct pci_dev *); +int qlcnic_82xx_resume(struct qlcnic_adapter *); +void qlcnic_clr_all_drv_state(struct qlcnic_adapter *adapter, u8 failed); +void qlcnic_fw_poll_work(struct work_struct *work); + +u32 qlcnic_82xx_get_saved_state(void *, u32); +void qlcnic_82xx_set_saved_state(void *, u32, u32); +void 
qlcnic_82xx_cache_tmpl_hdr_values(struct qlcnic_fw_dump *); +u32 qlcnic_82xx_get_cap_size(void *, int); +void qlcnic_82xx_set_sys_info(void *, int, u32); +void qlcnic_82xx_store_cap_mask(void *, u32); +#endif /* __QLCNIC_HW_H_ */ diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c new file mode 100644 index 0000000..c48a0e2 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c @@ -0,0 +1,1306 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#include "qlcnic.h" +#include "qlcnic_hw.h" + +struct crb_addr_pair { + u32 addr; + u32 data; +}; + +#define QLCNIC_MAX_CRB_XFORM 60 +static unsigned int crb_addr_xform[QLCNIC_MAX_CRB_XFORM]; + +#define crb_addr_transform(name) \ + (crb_addr_xform[QLCNIC_HW_PX_MAP_CRB_##name] = \ + QLCNIC_HW_CRB_HUB_AGT_ADR_##name << 20) + +#define QLCNIC_ADDR_ERROR (0xffffffff) + +static int +qlcnic_check_fw_hearbeat(struct qlcnic_adapter *adapter); + +static void crb_addr_transform_setup(void) +{ + crb_addr_transform(XDMA); + crb_addr_transform(TIMR); + crb_addr_transform(SRE); + crb_addr_transform(SQN3); + crb_addr_transform(SQN2); + crb_addr_transform(SQN1); + crb_addr_transform(SQN0); + crb_addr_transform(SQS3); + crb_addr_transform(SQS2); + crb_addr_transform(SQS1); + crb_addr_transform(SQS0); + crb_addr_transform(RPMX7); + crb_addr_transform(RPMX6); + crb_addr_transform(RPMX5); + crb_addr_transform(RPMX4); + crb_addr_transform(RPMX3); + crb_addr_transform(RPMX2); + crb_addr_transform(RPMX1); + crb_addr_transform(RPMX0); + crb_addr_transform(ROMUSB); + crb_addr_transform(SN); + crb_addr_transform(QMN); + crb_addr_transform(QMS); + crb_addr_transform(PGNI); + crb_addr_transform(PGND); + crb_addr_transform(PGN3); + crb_addr_transform(PGN2); + crb_addr_transform(PGN1); + crb_addr_transform(PGN0); + crb_addr_transform(PGSI); + crb_addr_transform(PGSD); + crb_addr_transform(PGS3); + crb_addr_transform(PGS2); + crb_addr_transform(PGS1); + crb_addr_transform(PGS0); + crb_addr_transform(PS); + crb_addr_transform(PH); + crb_addr_transform(NIU); + crb_addr_transform(I2Q); + crb_addr_transform(EG); + crb_addr_transform(MN); + crb_addr_transform(MS); + crb_addr_transform(CAS2); + crb_addr_transform(CAS1); + crb_addr_transform(CAS0); + crb_addr_transform(CAM); + crb_addr_transform(C2C1); + crb_addr_transform(C2C0); + crb_addr_transform(SMB); + crb_addr_transform(OCM0); + crb_addr_transform(I2C0); +} + +void qlcnic_release_rx_buffers(struct qlcnic_adapter *adapter) +{ + struct qlcnic_recv_context *recv_ctx; + struct qlcnic_host_rds_ring *rds_ring; + struct qlcnic_rx_buffer *rx_buf; + int i, ring; + + recv_ctx = adapter->recv_ctx; + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + for (i = 0; i < rds_ring->num_desc; ++i) { + rx_buf = &(rds_ring->rx_buf_arr[i]); + if (rx_buf->skb == NULL) + continue; + + pci_unmap_single(adapter->pdev, + rx_buf->dma, + rds_ring->dma_size, + PCI_DMA_FROMDEVICE); + + dev_kfree_skb_any(rx_buf->skb); + } + } +} + +void qlcnic_reset_rx_buffers_list(struct qlcnic_adapter *adapter) +{ + struct qlcnic_recv_context *recv_ctx; + struct qlcnic_host_rds_ring *rds_ring; + struct qlcnic_rx_buffer *rx_buf; + int i, ring; + + recv_ctx = adapter->recv_ctx; + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + + INIT_LIST_HEAD(&rds_ring->free_list); + + rx_buf = rds_ring->rx_buf_arr; + for 
(i = 0; i < rds_ring->num_desc; i++) { + list_add_tail(&rx_buf->list, + &rds_ring->free_list); + rx_buf++; + } + } +} + +void qlcnic_release_tx_buffers(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring) +{ + struct qlcnic_cmd_buffer *cmd_buf; + struct qlcnic_skb_frag *buffrag; + int i, j; + + spin_lock(&tx_ring->tx_clean_lock); + + cmd_buf = tx_ring->cmd_buf_arr; + for (i = 0; i < tx_ring->num_desc; i++) { + buffrag = cmd_buf->frag_array; + if (buffrag->dma) { + pci_unmap_single(adapter->pdev, buffrag->dma, + buffrag->length, PCI_DMA_TODEVICE); + buffrag->dma = 0ULL; + } + for (j = 1; j < cmd_buf->frag_count; j++) { + buffrag++; + if (buffrag->dma) { + pci_unmap_page(adapter->pdev, buffrag->dma, + buffrag->length, + PCI_DMA_TODEVICE); + buffrag->dma = 0ULL; + } + } + if (cmd_buf->skb) { + dev_kfree_skb_any(cmd_buf->skb); + cmd_buf->skb = NULL; + } + cmd_buf++; + } + + spin_unlock(&tx_ring->tx_clean_lock); +} + +void qlcnic_free_sw_resources(struct qlcnic_adapter *adapter) +{ + struct qlcnic_recv_context *recv_ctx; + struct qlcnic_host_rds_ring *rds_ring; + int ring; + + recv_ctx = adapter->recv_ctx; + + if (recv_ctx->rds_rings == NULL) + return; + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + vfree(rds_ring->rx_buf_arr); + rds_ring->rx_buf_arr = NULL; + } + kfree(recv_ctx->rds_rings); +} + +int qlcnic_alloc_sw_resources(struct qlcnic_adapter *adapter) +{ + struct qlcnic_recv_context *recv_ctx; + struct qlcnic_host_rds_ring *rds_ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_rx_buffer *rx_buf; + int ring, i; + + recv_ctx = adapter->recv_ctx; + + rds_ring = kcalloc(adapter->max_rds_rings, + sizeof(struct qlcnic_host_rds_ring), GFP_KERNEL); + if (rds_ring == NULL) + goto err_out; + + recv_ctx->rds_rings = rds_ring; + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + switch (ring) { + case RCV_RING_NORMAL: + rds_ring->num_desc = adapter->num_rxd; + rds_ring->dma_size = QLCNIC_P3P_RX_BUF_MAX_LEN; + rds_ring->skb_size = rds_ring->dma_size + NET_IP_ALIGN; + break; + + case RCV_RING_JUMBO: + rds_ring->num_desc = adapter->num_jumbo_rxd; + rds_ring->dma_size = + QLCNIC_P3P_RX_JUMBO_BUF_MAX_LEN; + + if (adapter->ahw->capabilities & + QLCNIC_FW_CAPABILITY_HW_LRO) + rds_ring->dma_size += QLCNIC_LRO_BUFFER_EXTRA; + + rds_ring->skb_size = + rds_ring->dma_size + NET_IP_ALIGN; + break; + } + rds_ring->rx_buf_arr = vzalloc(RCV_BUFF_RINGSIZE(rds_ring)); + if (rds_ring->rx_buf_arr == NULL) + goto err_out; + + INIT_LIST_HEAD(&rds_ring->free_list); + /* + * Now go through all of them, set reference handles + * and put them in the queues. 
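+	 * The reference handle is simply the buffer's index within
+	 * rx_buf_arr; the receive status descriptor hands this handle
+	 * back so the driver can locate the buffer on completion.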
+ */ + rx_buf = rds_ring->rx_buf_arr; + for (i = 0; i < rds_ring->num_desc; i++) { + list_add_tail(&rx_buf->list, + &rds_ring->free_list); + rx_buf->ref_handle = i; + rx_buf++; + } + spin_lock_init(&rds_ring->lock); + } + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + sds_ring->irq = adapter->msix_entries[ring].vector; + sds_ring->adapter = adapter; + sds_ring->num_desc = adapter->num_rxd; + if (qlcnic_82xx_check(adapter)) { + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) + sds_ring->tx_ring = &adapter->tx_ring[ring]; + else + sds_ring->tx_ring = &adapter->tx_ring[0]; + } + for (i = 0; i < NUM_RCV_DESC_RINGS; i++) + INIT_LIST_HEAD(&sds_ring->free_list[i]); + } + + return 0; + +err_out: + qlcnic_free_sw_resources(adapter); + return -ENOMEM; +} + +/* + * Utility to translate from internal Phantom CRB address + * to external PCI CRB address. + */ +static u32 qlcnic_decode_crb_addr(u32 addr) +{ + int i; + u32 base_addr, offset, pci_base; + + crb_addr_transform_setup(); + + pci_base = QLCNIC_ADDR_ERROR; + base_addr = addr & 0xfff00000; + offset = addr & 0x000fffff; + + for (i = 0; i < QLCNIC_MAX_CRB_XFORM; i++) { + if (crb_addr_xform[i] == base_addr) { + pci_base = i << 20; + break; + } + } + if (pci_base == QLCNIC_ADDR_ERROR) + return pci_base; + else + return pci_base + offset; +} + +#define QLCNIC_MAX_ROM_WAIT_USEC 100 + +static int qlcnic_wait_rom_done(struct qlcnic_adapter *adapter) +{ + long timeout = 0; + long done = 0; + int err = 0; + + cond_resched(); + while (done == 0) { + done = QLCRD32(adapter, QLCNIC_ROMUSB_GLB_STATUS, &err); + done &= 2; + if (++timeout >= QLCNIC_MAX_ROM_WAIT_USEC) { + dev_err(&adapter->pdev->dev, + "Timeout reached waiting for rom done"); + return -EIO; + } + udelay(1); + } + return 0; +} + +static int do_rom_fast_read(struct qlcnic_adapter *adapter, + u32 addr, u32 *valp) +{ + int err = 0; + + QLCWR32(adapter, QLCNIC_ROMUSB_ROM_ADDRESS, addr); + QLCWR32(adapter, QLCNIC_ROMUSB_ROM_DUMMY_BYTE_CNT, 0); + QLCWR32(adapter, QLCNIC_ROMUSB_ROM_ABYTE_CNT, 3); + QLCWR32(adapter, QLCNIC_ROMUSB_ROM_INSTR_OPCODE, 0xb); + if (qlcnic_wait_rom_done(adapter)) { + dev_err(&adapter->pdev->dev, "Error waiting for rom done\n"); + return -EIO; + } + /* reset abyte_cnt and dummy_byte_cnt */ + QLCWR32(adapter, QLCNIC_ROMUSB_ROM_ABYTE_CNT, 0); + udelay(10); + QLCWR32(adapter, QLCNIC_ROMUSB_ROM_DUMMY_BYTE_CNT, 0); + + *valp = QLCRD32(adapter, QLCNIC_ROMUSB_ROM_RDATA, &err); + if (err == -EIO) + return err; + return 0; +} + +static int do_rom_fast_read_words(struct qlcnic_adapter *adapter, int addr, + u8 *bytes, size_t size) +{ + int addridx; + int ret = 0; + + for (addridx = addr; addridx < (addr + size); addridx += 4) { + int v; + ret = do_rom_fast_read(adapter, addridx, &v); + if (ret != 0) + break; + *(__le32 *)bytes = cpu_to_le32(v); + bytes += 4; + } + + return ret; +} + +int +qlcnic_rom_fast_read_words(struct qlcnic_adapter *adapter, int addr, + u8 *bytes, size_t size) +{ + int ret; + + ret = qlcnic_rom_lock(adapter); + if (ret < 0) + return ret; + + ret = do_rom_fast_read_words(adapter, addr, bytes, size); + + qlcnic_rom_unlock(adapter); + return ret; +} + +int qlcnic_rom_fast_read(struct qlcnic_adapter *adapter, u32 addr, u32 *valp) +{ + int ret; + + if (qlcnic_rom_lock(adapter) != 0) + return -EIO; + + ret = do_rom_fast_read(adapter, addr, valp); + qlcnic_rom_unlock(adapter); + return ret; +} + +int qlcnic_pinit_from_rom(struct qlcnic_adapter *adapter) +{ + int addr, err = 0; + int i, n, init_delay; + 
struct crb_addr_pair *buf; + unsigned offset; + u32 off, val; + struct pci_dev *pdev = adapter->pdev; + + QLC_SHARED_REG_WR32(adapter, QLCNIC_CMDPEG_STATE, 0); + QLC_SHARED_REG_WR32(adapter, QLCNIC_RCVPEG_STATE, 0); + + /* Halt all the indiviual PEGs and other blocks */ + /* disable all I2Q */ + QLCWR32(adapter, QLCNIC_CRB_I2Q + 0x10, 0x0); + QLCWR32(adapter, QLCNIC_CRB_I2Q + 0x14, 0x0); + QLCWR32(adapter, QLCNIC_CRB_I2Q + 0x18, 0x0); + QLCWR32(adapter, QLCNIC_CRB_I2Q + 0x1c, 0x0); + QLCWR32(adapter, QLCNIC_CRB_I2Q + 0x20, 0x0); + QLCWR32(adapter, QLCNIC_CRB_I2Q + 0x24, 0x0); + + /* disable all niu interrupts */ + QLCWR32(adapter, QLCNIC_CRB_NIU + 0x40, 0xff); + /* disable xge rx/tx */ + QLCWR32(adapter, QLCNIC_CRB_NIU + 0x70000, 0x00); + /* disable xg1 rx/tx */ + QLCWR32(adapter, QLCNIC_CRB_NIU + 0x80000, 0x00); + /* disable sideband mac */ + QLCWR32(adapter, QLCNIC_CRB_NIU + 0x90000, 0x00); + /* disable ap0 mac */ + QLCWR32(adapter, QLCNIC_CRB_NIU + 0xa0000, 0x00); + /* disable ap1 mac */ + QLCWR32(adapter, QLCNIC_CRB_NIU + 0xb0000, 0x00); + + /* halt sre */ + val = QLCRD32(adapter, QLCNIC_CRB_SRE + 0x1000, &err); + if (err == -EIO) + return err; + QLCWR32(adapter, QLCNIC_CRB_SRE + 0x1000, val & (~(0x1))); + + /* halt epg */ + QLCWR32(adapter, QLCNIC_CRB_EPG + 0x1300, 0x1); + + /* halt timers */ + QLCWR32(adapter, QLCNIC_CRB_TIMER + 0x0, 0x0); + QLCWR32(adapter, QLCNIC_CRB_TIMER + 0x8, 0x0); + QLCWR32(adapter, QLCNIC_CRB_TIMER + 0x10, 0x0); + QLCWR32(adapter, QLCNIC_CRB_TIMER + 0x18, 0x0); + QLCWR32(adapter, QLCNIC_CRB_TIMER + 0x100, 0x0); + QLCWR32(adapter, QLCNIC_CRB_TIMER + 0x200, 0x0); + /* halt pegs */ + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_0 + 0x3c, 1); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_1 + 0x3c, 1); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_2 + 0x3c, 1); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_3 + 0x3c, 1); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_4 + 0x3c, 1); + msleep(20); + + qlcnic_rom_unlock(adapter); + /* big hammer don't reset CAM block on reset */ + QLCWR32(adapter, QLCNIC_ROMUSB_GLB_SW_RESET, 0xfeffffff); + + /* Init HW CRB block */ + if (qlcnic_rom_fast_read(adapter, 0, &n) != 0 || (n != 0xcafecafe) || + qlcnic_rom_fast_read(adapter, 4, &n) != 0) { + dev_err(&pdev->dev, "ERROR Reading crb_init area: val:%x\n", n); + return -EIO; + } + offset = n & 0xffffU; + n = (n >> 16) & 0xffffU; + + if (n >= 1024) { + dev_err(&pdev->dev, "QLOGIC card flash not initialized.\n"); + return -EIO; + } + + buf = kcalloc(n, sizeof(struct crb_addr_pair), GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + for (i = 0; i < n; i++) { + if (qlcnic_rom_fast_read(adapter, 8*i + 4*offset, &val) != 0 || + qlcnic_rom_fast_read(adapter, 8*i + 4*offset + 4, &addr) != 0) { + kfree(buf); + return -EIO; + } + + buf[i].addr = addr; + buf[i].data = val; + } + + for (i = 0; i < n; i++) { + + off = qlcnic_decode_crb_addr(buf[i].addr); + if (off == QLCNIC_ADDR_ERROR) { + dev_err(&pdev->dev, "CRB init value out of range %x\n", + buf[i].addr); + continue; + } + off += QLCNIC_PCI_CRBSPACE; + + if (off & 1) + continue; + + /* skipping cold reboot MAGIC */ + if (off == QLCNIC_CAM_RAM(0x1fc)) + continue; + if (off == (QLCNIC_CRB_I2C0 + 0x1c)) + continue; + if (off == (ROMUSB_GLB + 0xbc)) /* do not reset PCI */ + continue; + if (off == (ROMUSB_GLB + 0xa8)) + continue; + if (off == (ROMUSB_GLB + 0xc8)) /* core clock */ + continue; + if (off == (ROMUSB_GLB + 0x24)) /* MN clock */ + continue; + if (off == (ROMUSB_GLB + 0x1c)) /* MS clock */ + continue; + if ((off & 0x0ff00000) == QLCNIC_CRB_DDR_NET) + continue; + /* skip 
the function enable register */ + if (off == QLCNIC_PCIE_REG(PCIE_SETUP_FUNCTION)) + continue; + if (off == QLCNIC_PCIE_REG(PCIE_SETUP_FUNCTION2)) + continue; + if ((off & 0x0ff00000) == QLCNIC_CRB_SMB) + continue; + + init_delay = 1; + /* After writing this register, HW needs time for CRB */ + /* to quiet down (else crb_window returns 0xffffffff) */ + if (off == QLCNIC_ROMUSB_GLB_SW_RESET) + init_delay = 1000; + + QLCWR32(adapter, off, buf[i].data); + + msleep(init_delay); + } + kfree(buf); + + /* Initialize protocol process engine */ + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_D + 0xec, 0x1e); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_D + 0x4c, 8); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_I + 0x4c, 8); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_0 + 0x8, 0); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_0 + 0xc, 0); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_1 + 0x8, 0); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_1 + 0xc, 0); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_2 + 0x8, 0); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_2 + 0xc, 0); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_3 + 0x8, 0); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_3 + 0xc, 0); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_4 + 0x8, 0); + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_4 + 0xc, 0); + usleep_range(1000, 1500); + + QLC_SHARED_REG_WR32(adapter, QLCNIC_PEG_HALT_STATUS1, 0); + QLC_SHARED_REG_WR32(adapter, QLCNIC_PEG_HALT_STATUS2, 0); + + return 0; +} + +static int qlcnic_cmd_peg_ready(struct qlcnic_adapter *adapter) +{ + u32 val; + int retries = QLCNIC_CMDPEG_CHECK_RETRY_COUNT; + + do { + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CMDPEG_STATE); + + switch (val) { + case PHAN_INITIALIZE_COMPLETE: + case PHAN_INITIALIZE_ACK: + return 0; + case PHAN_INITIALIZE_FAILED: + goto out_err; + default: + break; + } + + msleep(QLCNIC_CMDPEG_CHECK_DELAY); + + } while (--retries); + + QLC_SHARED_REG_WR32(adapter, QLCNIC_CMDPEG_STATE, + PHAN_INITIALIZE_FAILED); + +out_err: + dev_err(&adapter->pdev->dev, "Command Peg initialization not " + "complete, state: 0x%x.\n", val); + return -EIO; +} + +static int +qlcnic_receive_peg_ready(struct qlcnic_adapter *adapter) +{ + u32 val; + int retries = QLCNIC_RCVPEG_CHECK_RETRY_COUNT; + + do { + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_RCVPEG_STATE); + + if (val == PHAN_PEG_RCV_INITIALIZED) + return 0; + + msleep(QLCNIC_RCVPEG_CHECK_DELAY); + + } while (--retries); + + dev_err(&adapter->pdev->dev, "Receive Peg initialization not complete, state: 0x%x.\n", + val); + return -EIO; +} + +int +qlcnic_check_fw_status(struct qlcnic_adapter *adapter) +{ + int err; + + err = qlcnic_cmd_peg_ready(adapter); + if (err) + return err; + + err = qlcnic_receive_peg_ready(adapter); + if (err) + return err; + + QLC_SHARED_REG_WR32(adapter, QLCNIC_CMDPEG_STATE, PHAN_INITIALIZE_ACK); + + return err; +} + +int +qlcnic_setup_idc_param(struct qlcnic_adapter *adapter) { + + int timeo; + u32 val; + + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DEV_PARTITION_INFO); + val = QLC_DEV_GET_DRV(val, adapter->portnum); + if ((val & 0x3) != QLCNIC_TYPE_NIC) { + dev_err(&adapter->pdev->dev, + "Not an Ethernet NIC func=%u\n", val); + return -EIO; + } + adapter->ahw->physical_port = (val >> 2); + if (qlcnic_rom_fast_read(adapter, QLCNIC_ROM_DEV_INIT_TIMEOUT, &timeo)) + timeo = QLCNIC_INIT_TIMEOUT_SECS; + + adapter->dev_init_timeo = timeo; + + if (qlcnic_rom_fast_read(adapter, QLCNIC_ROM_DRV_RESET_TIMEOUT, &timeo)) + timeo = QLCNIC_RESET_TIMEOUT_SECS; + + adapter->reset_ack_timeo = timeo; + + return 0; +} + +static int qlcnic_get_flt_entry(struct qlcnic_adapter *adapter, u8 region, + struct 
qlcnic_flt_entry *region_entry) +{ + struct qlcnic_flt_header flt_hdr; + struct qlcnic_flt_entry *flt_entry; + int i = 0, ret; + u32 entry_size; + + memset(region_entry, 0, sizeof(struct qlcnic_flt_entry)); + ret = qlcnic_rom_fast_read_words(adapter, QLCNIC_FLT_LOCATION, + (u8 *)&flt_hdr, + sizeof(struct qlcnic_flt_header)); + if (ret) { + dev_warn(&adapter->pdev->dev, + "error reading flash layout header\n"); + return -EIO; + } + + entry_size = flt_hdr.len - sizeof(struct qlcnic_flt_header); + flt_entry = vzalloc(entry_size); + if (flt_entry == NULL) + return -EIO; + + ret = qlcnic_rom_fast_read_words(adapter, QLCNIC_FLT_LOCATION + + sizeof(struct qlcnic_flt_header), + (u8 *)flt_entry, entry_size); + if (ret) { + dev_warn(&adapter->pdev->dev, + "error reading flash layout entries\n"); + goto err_out; + } + + while (i < (entry_size/sizeof(struct qlcnic_flt_entry))) { + if (flt_entry[i].region == region) + break; + i++; + } + if (i >= (entry_size/sizeof(struct qlcnic_flt_entry))) { + dev_warn(&adapter->pdev->dev, + "region=%x not found in %d regions\n", region, i); + ret = -EIO; + goto err_out; + } + memcpy(region_entry, &flt_entry[i], sizeof(struct qlcnic_flt_entry)); + +err_out: + vfree(flt_entry); + return ret; +} + +int +qlcnic_check_flash_fw_ver(struct qlcnic_adapter *adapter) +{ + struct qlcnic_flt_entry fw_entry; + u32 ver = -1, min_ver; + int ret; + + if (adapter->ahw->revision_id == QLCNIC_P3P_C0) + ret = qlcnic_get_flt_entry(adapter, QLCNIC_C0_FW_IMAGE_REGION, + &fw_entry); + else + ret = qlcnic_get_flt_entry(adapter, QLCNIC_B0_FW_IMAGE_REGION, + &fw_entry); + + if (!ret) + /* 0-4:-signature, 4-8:-fw version */ + qlcnic_rom_fast_read(adapter, fw_entry.start_addr + 4, + (int *)&ver); + else + qlcnic_rom_fast_read(adapter, QLCNIC_FW_VERSION_OFFSET, + (int *)&ver); + + ver = QLCNIC_DECODE_VERSION(ver); + min_ver = QLCNIC_MIN_FW_VERSION; + + if (ver < min_ver) { + dev_err(&adapter->pdev->dev, + "firmware version %d.%d.%d unsupported." 
+ "Min supported version %d.%d.%d\n", + _major(ver), _minor(ver), _build(ver), + _major(min_ver), _minor(min_ver), _build(min_ver)); + return -EINVAL; + } + + return 0; +} + +static int +qlcnic_has_mn(struct qlcnic_adapter *adapter) +{ + u32 capability = 0; + int err = 0; + + capability = QLCRD32(adapter, QLCNIC_PEG_TUNE_CAPABILITY, &err); + if (err == -EIO) + return err; + if (capability & QLCNIC_PEG_TUNE_MN_PRESENT) + return 1; + + return 0; +} + +static +struct uni_table_desc *qlcnic_get_table_desc(const u8 *unirom, int section) +{ + u32 i, entries; + struct uni_table_desc *directory = (struct uni_table_desc *) &unirom[0]; + entries = le32_to_cpu(directory->num_entries); + + for (i = 0; i < entries; i++) { + + u32 offs = le32_to_cpu(directory->findex) + + i * le32_to_cpu(directory->entry_size); + u32 tab_type = le32_to_cpu(*((__le32 *)&unirom[offs] + 8)); + + if (tab_type == section) + return (struct uni_table_desc *) &unirom[offs]; + } + + return NULL; +} + +#define FILEHEADER_SIZE (14 * 4) + +static int +qlcnic_validate_header(struct qlcnic_adapter *adapter) +{ + const u8 *unirom = adapter->fw->data; + struct uni_table_desc *directory = (struct uni_table_desc *) &unirom[0]; + u32 entries, entry_size, tab_size, fw_file_size; + + fw_file_size = adapter->fw->size; + + if (fw_file_size < FILEHEADER_SIZE) + return -EINVAL; + + entries = le32_to_cpu(directory->num_entries); + entry_size = le32_to_cpu(directory->entry_size); + tab_size = le32_to_cpu(directory->findex) + (entries * entry_size); + + if (fw_file_size < tab_size) + return -EINVAL; + + return 0; +} + +static int +qlcnic_validate_bootld(struct qlcnic_adapter *adapter) +{ + struct uni_table_desc *tab_desc; + struct uni_data_desc *descr; + u32 offs, tab_size, data_size, idx; + const u8 *unirom = adapter->fw->data; + __le32 temp; + + temp = *((__le32 *)&unirom[adapter->file_prd_off] + + QLCNIC_UNI_BOOTLD_IDX_OFF); + idx = le32_to_cpu(temp); + tab_desc = qlcnic_get_table_desc(unirom, QLCNIC_UNI_DIR_SECT_BOOTLD); + + if (!tab_desc) + return -EINVAL; + + tab_size = le32_to_cpu(tab_desc->findex) + + le32_to_cpu(tab_desc->entry_size) * (idx + 1); + + if (adapter->fw->size < tab_size) + return -EINVAL; + + offs = le32_to_cpu(tab_desc->findex) + + le32_to_cpu(tab_desc->entry_size) * idx; + descr = (struct uni_data_desc *)&unirom[offs]; + + data_size = le32_to_cpu(descr->findex) + le32_to_cpu(descr->size); + + if (adapter->fw->size < data_size) + return -EINVAL; + + return 0; +} + +static int +qlcnic_validate_fw(struct qlcnic_adapter *adapter) +{ + struct uni_table_desc *tab_desc; + struct uni_data_desc *descr; + const u8 *unirom = adapter->fw->data; + u32 offs, tab_size, data_size, idx; + __le32 temp; + + temp = *((__le32 *)&unirom[adapter->file_prd_off] + + QLCNIC_UNI_FIRMWARE_IDX_OFF); + idx = le32_to_cpu(temp); + tab_desc = qlcnic_get_table_desc(unirom, QLCNIC_UNI_DIR_SECT_FW); + + if (!tab_desc) + return -EINVAL; + + tab_size = le32_to_cpu(tab_desc->findex) + + le32_to_cpu(tab_desc->entry_size) * (idx + 1); + + if (adapter->fw->size < tab_size) + return -EINVAL; + + offs = le32_to_cpu(tab_desc->findex) + + le32_to_cpu(tab_desc->entry_size) * idx; + descr = (struct uni_data_desc *)&unirom[offs]; + data_size = le32_to_cpu(descr->findex) + le32_to_cpu(descr->size); + + if (adapter->fw->size < data_size) + return -EINVAL; + + return 0; +} + +static int +qlcnic_validate_product_offs(struct qlcnic_adapter *adapter) +{ + struct uni_table_desc *ptab_descr; + const u8 *unirom = adapter->fw->data; + int mn_present = qlcnic_has_mn(adapter); + u32 
entries, entry_size, tab_size, i; + __le32 temp; + + ptab_descr = qlcnic_get_table_desc(unirom, + QLCNIC_UNI_DIR_SECT_PRODUCT_TBL); + if (!ptab_descr) + return -EINVAL; + + entries = le32_to_cpu(ptab_descr->num_entries); + entry_size = le32_to_cpu(ptab_descr->entry_size); + tab_size = le32_to_cpu(ptab_descr->findex) + (entries * entry_size); + + if (adapter->fw->size < tab_size) + return -EINVAL; + +nomn: + for (i = 0; i < entries; i++) { + + u32 flags, file_chiprev, offs; + u8 chiprev = adapter->ahw->revision_id; + u32 flagbit; + + offs = le32_to_cpu(ptab_descr->findex) + + i * le32_to_cpu(ptab_descr->entry_size); + temp = *((__le32 *)&unirom[offs] + QLCNIC_UNI_FLAGS_OFF); + flags = le32_to_cpu(temp); + temp = *((__le32 *)&unirom[offs] + QLCNIC_UNI_CHIP_REV_OFF); + file_chiprev = le32_to_cpu(temp); + + flagbit = mn_present ? 1 : 2; + + if ((chiprev == file_chiprev) && + ((1ULL << flagbit) & flags)) { + adapter->file_prd_off = offs; + return 0; + } + } + if (mn_present) { + mn_present = 0; + goto nomn; + } + return -EINVAL; +} + +static int +qlcnic_validate_unified_romimage(struct qlcnic_adapter *adapter) +{ + if (qlcnic_validate_header(adapter)) { + dev_err(&adapter->pdev->dev, + "unified image: header validation failed\n"); + return -EINVAL; + } + + if (qlcnic_validate_product_offs(adapter)) { + dev_err(&adapter->pdev->dev, + "unified image: product validation failed\n"); + return -EINVAL; + } + + if (qlcnic_validate_bootld(adapter)) { + dev_err(&adapter->pdev->dev, + "unified image: bootld validation failed\n"); + return -EINVAL; + } + + if (qlcnic_validate_fw(adapter)) { + dev_err(&adapter->pdev->dev, + "unified image: firmware validation failed\n"); + return -EINVAL; + } + + return 0; +} + +static +struct uni_data_desc *qlcnic_get_data_desc(struct qlcnic_adapter *adapter, + u32 section, u32 idx_offset) +{ + const u8 *unirom = adapter->fw->data; + struct uni_table_desc *tab_desc; + u32 offs, idx; + __le32 temp; + + temp = *((__le32 *)&unirom[adapter->file_prd_off] + idx_offset); + idx = le32_to_cpu(temp); + + tab_desc = qlcnic_get_table_desc(unirom, section); + + if (tab_desc == NULL) + return NULL; + + offs = le32_to_cpu(tab_desc->findex) + + le32_to_cpu(tab_desc->entry_size) * idx; + + return (struct uni_data_desc *)&unirom[offs]; +} + +static u8 * +qlcnic_get_bootld_offs(struct qlcnic_adapter *adapter) +{ + u32 offs = QLCNIC_BOOTLD_START; + struct uni_data_desc *data_desc; + + data_desc = qlcnic_get_data_desc(adapter, QLCNIC_UNI_DIR_SECT_BOOTLD, + QLCNIC_UNI_BOOTLD_IDX_OFF); + + if (adapter->ahw->fw_type == QLCNIC_UNIFIED_ROMIMAGE) + offs = le32_to_cpu(data_desc->findex); + + return (u8 *)&adapter->fw->data[offs]; +} + +static u8 * +qlcnic_get_fw_offs(struct qlcnic_adapter *adapter) +{ + u32 offs = QLCNIC_IMAGE_START; + struct uni_data_desc *data_desc; + + data_desc = qlcnic_get_data_desc(adapter, QLCNIC_UNI_DIR_SECT_FW, + QLCNIC_UNI_FIRMWARE_IDX_OFF); + if (adapter->ahw->fw_type == QLCNIC_UNIFIED_ROMIMAGE) + offs = le32_to_cpu(data_desc->findex); + + return (u8 *)&adapter->fw->data[offs]; +} + +static u32 qlcnic_get_fw_size(struct qlcnic_adapter *adapter) +{ + struct uni_data_desc *data_desc; + const u8 *unirom = adapter->fw->data; + + data_desc = qlcnic_get_data_desc(adapter, QLCNIC_UNI_DIR_SECT_FW, + QLCNIC_UNI_FIRMWARE_IDX_OFF); + + if (adapter->ahw->fw_type == QLCNIC_UNIFIED_ROMIMAGE) + return le32_to_cpu(data_desc->size); + else + return le32_to_cpu(*(__le32 *)&unirom[QLCNIC_FW_SIZE_OFFSET]); +} + +static u32 qlcnic_get_fw_version(struct qlcnic_adapter *adapter) +{ + struct 
uni_data_desc *fw_data_desc; + const struct firmware *fw = adapter->fw; + u32 major, minor, sub; + __le32 version_offset; + const u8 *ver_str; + int i, ret; + + if (adapter->ahw->fw_type != QLCNIC_UNIFIED_ROMIMAGE) { + version_offset = *(__le32 *)&fw->data[QLCNIC_FW_VERSION_OFFSET]; + return le32_to_cpu(version_offset); + } + + fw_data_desc = qlcnic_get_data_desc(adapter, QLCNIC_UNI_DIR_SECT_FW, + QLCNIC_UNI_FIRMWARE_IDX_OFF); + ver_str = fw->data + le32_to_cpu(fw_data_desc->findex) + + le32_to_cpu(fw_data_desc->size) - 17; + + for (i = 0; i < 12; i++) { + if (!strncmp(&ver_str[i], "REV=", 4)) { + ret = sscanf(&ver_str[i+4], "%u.%u.%u ", + &major, &minor, &sub); + if (ret != 3) + return 0; + else + return major + (minor << 8) + (sub << 16); + } + } + + return 0; +} + +static u32 qlcnic_get_bios_version(struct qlcnic_adapter *adapter) +{ + const struct firmware *fw = adapter->fw; + u32 bios_ver, prd_off = adapter->file_prd_off; + u8 *version_offset; + __le32 temp; + + if (adapter->ahw->fw_type != QLCNIC_UNIFIED_ROMIMAGE) { + version_offset = (u8 *)&fw->data[QLCNIC_BIOS_VERSION_OFFSET]; + return le32_to_cpu(*(__le32 *)version_offset); + } + + temp = *((__le32 *)(&fw->data[prd_off]) + QLCNIC_UNI_BIOS_VERSION_OFF); + bios_ver = le32_to_cpu(temp); + + return (bios_ver << 16) + ((bios_ver >> 8) & 0xff00) + (bios_ver >> 24); +} + +static void qlcnic_rom_lock_recovery(struct qlcnic_adapter *adapter) +{ + if (qlcnic_pcie_sem_lock(adapter, 2, QLCNIC_ROM_LOCK_ID)) + dev_info(&adapter->pdev->dev, "Resetting rom_lock\n"); + + qlcnic_pcie_sem_unlock(adapter, 2); +} + +static int +qlcnic_check_fw_hearbeat(struct qlcnic_adapter *adapter) +{ + u32 heartbeat, ret = -EIO; + int retries = QLCNIC_HEARTBEAT_CHECK_RETRY_COUNT; + + adapter->heartbeat = QLC_SHARED_REG_RD32(adapter, + QLCNIC_PEG_ALIVE_COUNTER); + + do { + msleep(QLCNIC_HEARTBEAT_PERIOD_MSECS); + heartbeat = QLC_SHARED_REG_RD32(adapter, + QLCNIC_PEG_ALIVE_COUNTER); + if (heartbeat != adapter->heartbeat) { + ret = QLCNIC_RCODE_SUCCESS; + break; + } + } while (--retries); + + return ret; +} + +int +qlcnic_need_fw_reset(struct qlcnic_adapter *adapter) +{ + if ((adapter->flags & QLCNIC_FW_HANG) || + qlcnic_check_fw_hearbeat(adapter)) { + qlcnic_rom_lock_recovery(adapter); + return 1; + } + + if (adapter->need_fw_reset) + return 1; + + if (adapter->fw) + return 1; + + return 0; +} + +static const char *fw_name[] = { + QLCNIC_UNIFIED_ROMIMAGE_NAME, + QLCNIC_FLASH_ROMIMAGE_NAME, +}; + +int +qlcnic_load_firmware(struct qlcnic_adapter *adapter) +{ + __le64 *ptr64; + u32 i, flashaddr, size; + const struct firmware *fw = adapter->fw; + struct pci_dev *pdev = adapter->pdev; + + dev_info(&pdev->dev, "loading firmware from %s\n", + fw_name[adapter->ahw->fw_type]); + + if (fw) { + u64 data; + + size = (QLCNIC_IMAGE_START - QLCNIC_BOOTLD_START) / 8; + + ptr64 = (__le64 *)qlcnic_get_bootld_offs(adapter); + flashaddr = QLCNIC_BOOTLD_START; + + for (i = 0; i < size; i++) { + data = le64_to_cpu(ptr64[i]); + + if (qlcnic_pci_mem_write_2M(adapter, flashaddr, data)) + return -EIO; + + flashaddr += 8; + } + + size = qlcnic_get_fw_size(adapter) / 8; + + ptr64 = (__le64 *)qlcnic_get_fw_offs(adapter); + flashaddr = QLCNIC_IMAGE_START; + + for (i = 0; i < size; i++) { + data = le64_to_cpu(ptr64[i]); + + if (qlcnic_pci_mem_write_2M(adapter, + flashaddr, data)) + return -EIO; + + flashaddr += 8; + } + + size = qlcnic_get_fw_size(adapter) % 8; + if (size) { + data = le64_to_cpu(ptr64[i]); + + if (qlcnic_pci_mem_write_2M(adapter, + flashaddr, data)) + return -EIO; + } + + } else { 
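+		/* No firmware file available; copy the boot loader image
+		 * from flash into adapter memory using ROM fast reads.
+		 */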
+ u64 data; + u32 hi, lo; + int ret; + struct qlcnic_flt_entry bootld_entry; + + ret = qlcnic_get_flt_entry(adapter, QLCNIC_BOOTLD_REGION, + &bootld_entry); + if (!ret) { + size = bootld_entry.size / 8; + flashaddr = bootld_entry.start_addr; + } else { + size = (QLCNIC_IMAGE_START - QLCNIC_BOOTLD_START) / 8; + flashaddr = QLCNIC_BOOTLD_START; + dev_info(&pdev->dev, + "using legacy method to get flash fw region"); + } + + for (i = 0; i < size; i++) { + if (qlcnic_rom_fast_read(adapter, + flashaddr, (int *)&lo) != 0) + return -EIO; + if (qlcnic_rom_fast_read(adapter, + flashaddr + 4, (int *)&hi) != 0) + return -EIO; + + data = (((u64)hi << 32) | lo); + + if (qlcnic_pci_mem_write_2M(adapter, + flashaddr, data)) + return -EIO; + + flashaddr += 8; + } + } + usleep_range(1000, 1500); + + QLCWR32(adapter, QLCNIC_CRB_PEG_NET_0 + 0x18, 0x1020); + QLCWR32(adapter, QLCNIC_ROMUSB_GLB_SW_RESET, 0x80001e); + return 0; +} + +static int +qlcnic_validate_firmware(struct qlcnic_adapter *adapter) +{ + u32 val; + u32 ver, bios, min_size; + struct pci_dev *pdev = adapter->pdev; + const struct firmware *fw = adapter->fw; + u8 fw_type = adapter->ahw->fw_type; + + if (fw_type == QLCNIC_UNIFIED_ROMIMAGE) { + if (qlcnic_validate_unified_romimage(adapter)) + return -EINVAL; + + min_size = QLCNIC_UNI_FW_MIN_SIZE; + } else { + val = le32_to_cpu(*(__le32 *)&fw->data[QLCNIC_FW_MAGIC_OFFSET]); + if (val != QLCNIC_BDINFO_MAGIC) + return -EINVAL; + + min_size = QLCNIC_FW_MIN_SIZE; + } + + if (fw->size < min_size) + return -EINVAL; + + val = qlcnic_get_fw_version(adapter); + ver = QLCNIC_DECODE_VERSION(val); + + if (ver < QLCNIC_MIN_FW_VERSION) { + dev_err(&pdev->dev, + "%s: firmware version %d.%d.%d unsupported\n", + fw_name[fw_type], _major(ver), _minor(ver), _build(ver)); + return -EINVAL; + } + + val = qlcnic_get_bios_version(adapter); + qlcnic_rom_fast_read(adapter, QLCNIC_BIOS_VERSION_OFFSET, (int *)&bios); + if (val != bios) { + dev_err(&pdev->dev, "%s: firmware bios is incompatible\n", + fw_name[fw_type]); + return -EINVAL; + } + + QLC_SHARED_REG_WR32(adapter, QLCNIC_FW_IMG_VALID, QLCNIC_BDINFO_MAGIC); + return 0; +} + +static void +qlcnic_get_next_fwtype(struct qlcnic_adapter *adapter) +{ + u8 fw_type; + + switch (adapter->ahw->fw_type) { + case QLCNIC_UNKNOWN_ROMIMAGE: + fw_type = QLCNIC_UNIFIED_ROMIMAGE; + break; + + case QLCNIC_UNIFIED_ROMIMAGE: + default: + fw_type = QLCNIC_FLASH_ROMIMAGE; + break; + } + + adapter->ahw->fw_type = fw_type; +} + + + +void qlcnic_request_firmware(struct qlcnic_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + int rc; + + adapter->ahw->fw_type = QLCNIC_UNKNOWN_ROMIMAGE; + +next: + qlcnic_get_next_fwtype(adapter); + + if (adapter->ahw->fw_type == QLCNIC_FLASH_ROMIMAGE) { + adapter->fw = NULL; + } else { + rc = request_firmware(&adapter->fw, + fw_name[adapter->ahw->fw_type], + &pdev->dev); + if (rc != 0) + goto next; + + rc = qlcnic_validate_firmware(adapter); + if (rc != 0) { + release_firmware(adapter->fw); + usleep_range(1000, 1500); + goto next; + } + } +} + + +void +qlcnic_release_firmware(struct qlcnic_adapter *adapter) +{ + release_firmware(adapter->fw); + adapter->fw = NULL; +} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c new file mode 100644 index 0000000..84dd830 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c @@ -0,0 +1,2230 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "qlcnic.h" + +#define QLCNIC_TX_ETHER_PKT 0x01 +#define QLCNIC_TX_TCP_PKT 0x02 +#define QLCNIC_TX_UDP_PKT 0x03 +#define QLCNIC_TX_IP_PKT 0x04 +#define QLCNIC_TX_TCP_LSO 0x05 +#define QLCNIC_TX_TCP_LSO6 0x06 +#define QLCNIC_TX_ENCAP_PKT 0x07 +#define QLCNIC_TX_ENCAP_LSO 0x08 +#define QLCNIC_TX_TCPV6_PKT 0x0b +#define QLCNIC_TX_UDPV6_PKT 0x0c + +#define QLCNIC_FLAGS_VLAN_TAGGED 0x10 +#define QLCNIC_FLAGS_VLAN_OOB 0x40 + +#define qlcnic_set_tx_vlan_tci(cmd_desc, v) \ + (cmd_desc)->vlan_TCI = cpu_to_le16(v); +#define qlcnic_set_cmd_desc_port(cmd_desc, var) \ + ((cmd_desc)->port_ctxid |= ((var) & 0x0F)) +#define qlcnic_set_cmd_desc_ctxid(cmd_desc, var) \ + ((cmd_desc)->port_ctxid |= ((var) << 4 & 0xF0)) + +#define qlcnic_set_tx_port(_desc, _port) \ + ((_desc)->port_ctxid = ((_port) & 0xf) | (((_port) << 4) & 0xf0)) + +#define qlcnic_set_tx_flags_opcode(_desc, _flags, _opcode) \ + ((_desc)->flags_opcode |= \ + cpu_to_le16(((_flags) & 0x7f) | (((_opcode) & 0x3f) << 7))) + +#define qlcnic_set_tx_frags_len(_desc, _frags, _len) \ + ((_desc)->nfrags__length = \ + cpu_to_le32(((_frags) & 0xff) | (((_len) & 0xffffff) << 8))) + +/* owner bits of status_desc */ +#define STATUS_OWNER_HOST (0x1ULL << 56) +#define STATUS_OWNER_PHANTOM (0x2ULL << 56) + +/* Status descriptor: + 0-3 port, 4-7 status, 8-11 type, 12-27 total_length + 28-43 reference_handle, 44-47 protocol, 48-52 pkt_offset + 53-55 desc_cnt, 56-57 owner, 58-63 opcode + */ +#define qlcnic_get_sts_port(sts_data) \ + ((sts_data) & 0x0F) +#define qlcnic_get_sts_status(sts_data) \ + (((sts_data) >> 4) & 0x0F) +#define qlcnic_get_sts_type(sts_data) \ + (((sts_data) >> 8) & 0x0F) +#define qlcnic_get_sts_totallength(sts_data) \ + (((sts_data) >> 12) & 0xFFFF) +#define qlcnic_get_sts_refhandle(sts_data) \ + (((sts_data) >> 28) & 0xFFFF) +#define qlcnic_get_sts_prot(sts_data) \ + (((sts_data) >> 44) & 0x0F) +#define qlcnic_get_sts_pkt_offset(sts_data) \ + (((sts_data) >> 48) & 0x1F) +#define qlcnic_get_sts_desc_cnt(sts_data) \ + (((sts_data) >> 53) & 0x7) +#define qlcnic_get_sts_opcode(sts_data) \ + (((sts_data) >> 58) & 0x03F) + +#define qlcnic_get_lro_sts_refhandle(sts_data) \ + ((sts_data) & 0x07FFF) +#define qlcnic_get_lro_sts_length(sts_data) \ + (((sts_data) >> 16) & 0x0FFFF) +#define qlcnic_get_lro_sts_l2_hdr_offset(sts_data) \ + (((sts_data) >> 32) & 0x0FF) +#define qlcnic_get_lro_sts_l4_hdr_offset(sts_data) \ + (((sts_data) >> 40) & 0x0FF) +#define qlcnic_get_lro_sts_timestamp(sts_data) \ + (((sts_data) >> 48) & 0x1) +#define qlcnic_get_lro_sts_type(sts_data) \ + (((sts_data) >> 49) & 0x7) +#define qlcnic_get_lro_sts_push_flag(sts_data) \ + (((sts_data) >> 52) & 0x1) +#define qlcnic_get_lro_sts_seq_number(sts_data) \ + ((sts_data) & 0x0FFFFFFFF) +#define qlcnic_get_lro_sts_mss(sts_data1) \ + ((sts_data1 >> 32) & 0x0FFFF) + +#define qlcnic_83xx_get_lro_sts_mss(sts) ((sts) & 0xffff) + +/* opcode field in status_desc */ +#define QLCNIC_SYN_OFFLOAD 0x03 +#define QLCNIC_RXPKT_DESC 0x04 +#define QLCNIC_OLD_RXPKT_DESC 0x3f +#define QLCNIC_RESPONSE_DESC 0x05 +#define QLCNIC_LRO_DESC 0x12 + +#define QLCNIC_TCP_HDR_SIZE 20 +#define QLCNIC_TCP_TS_OPTION_SIZE 12 +#define QLCNIC_FETCH_RING_ID(handle) ((handle) >> 63) +#define QLCNIC_DESC_OWNER_FW cpu_to_le64(STATUS_OWNER_PHANTOM) + +#define QLCNIC_TCP_TS_HDR_SIZE (QLCNIC_TCP_HDR_SIZE + QLCNIC_TCP_TS_OPTION_SIZE) + +/* for status field in status_desc */ +#define STATUS_CKSUM_LOOP 0 +#define STATUS_CKSUM_OK 2 + +#define 
qlcnic_83xx_pktln(sts) ((sts >> 32) & 0x3FFF) +#define qlcnic_83xx_hndl(sts) ((sts >> 48) & 0x7FFF) +#define qlcnic_83xx_csum_status(sts) ((sts >> 39) & 7) +#define qlcnic_83xx_opcode(sts) ((sts >> 42) & 0xF) +#define qlcnic_83xx_vlan_tag(sts) (((sts) >> 48) & 0xFFFF) +#define qlcnic_83xx_lro_pktln(sts) (((sts) >> 32) & 0x3FFF) +#define qlcnic_83xx_l2_hdr_off(sts) (((sts) >> 16) & 0xFF) +#define qlcnic_83xx_l4_hdr_off(sts) (((sts) >> 24) & 0xFF) +#define qlcnic_83xx_pkt_cnt(sts) (((sts) >> 16) & 0x7) +#define qlcnic_83xx_is_tstamp(sts) (((sts) >> 40) & 1) +#define qlcnic_83xx_is_psh_bit(sts) (((sts) >> 41) & 1) +#define qlcnic_83xx_is_ip_align(sts) (((sts) >> 46) & 1) +#define qlcnic_83xx_has_vlan_tag(sts) (((sts) >> 47) & 1) + +static int qlcnic_process_rcv_ring(struct qlcnic_host_sds_ring *sds_ring, + int max); + +static struct sk_buff *qlcnic_process_rxbuf(struct qlcnic_adapter *, + struct qlcnic_host_rds_ring *, + u16, u16); + +static inline u8 qlcnic_mac_hash(u64 mac, u16 vlan) +{ + return (u8)((mac & 0xff) ^ ((mac >> 40) & 0xff) ^ (vlan & 0xff)); +} + +static inline u32 qlcnic_get_ref_handle(struct qlcnic_adapter *adapter, + u16 handle, u8 ring_id) +{ + if (qlcnic_83xx_check(adapter)) + return handle | (ring_id << 15); + else + return handle; +} + +static inline int qlcnic_82xx_is_lb_pkt(u64 sts_data) +{ + return (qlcnic_get_sts_status(sts_data) == STATUS_CKSUM_LOOP) ? 1 : 0; +} + +static void qlcnic_delete_rx_list_mac(struct qlcnic_adapter *adapter, + struct qlcnic_filter *fil, + void *addr, u16 vlan_id) +{ + int ret; + u8 op; + + op = vlan_id ? QLCNIC_MAC_VLAN_ADD : QLCNIC_MAC_ADD; + ret = qlcnic_sre_macaddr_change(adapter, addr, vlan_id, op); + if (ret) + return; + + op = vlan_id ? QLCNIC_MAC_VLAN_DEL : QLCNIC_MAC_DEL; + ret = qlcnic_sre_macaddr_change(adapter, addr, vlan_id, op); + if (!ret) { + hlist_del(&fil->fnode); + adapter->rx_fhash.fnum--; + } +} + +static struct qlcnic_filter *qlcnic_find_mac_filter(struct hlist_head *head, + void *addr, u16 vlan_id) +{ + struct qlcnic_filter *tmp_fil = NULL; + struct hlist_node *n; + + hlist_for_each_entry_safe(tmp_fil, n, head, fnode) { + if (ether_addr_equal(tmp_fil->faddr, addr) && + tmp_fil->vlan_id == vlan_id) + return tmp_fil; + } + + return NULL; +} + +static void qlcnic_add_lb_filter(struct qlcnic_adapter *adapter, + struct sk_buff *skb, int loopback_pkt, u16 vlan_id) +{ + struct ethhdr *phdr = (struct ethhdr *)(skb->data); + struct qlcnic_filter *fil, *tmp_fil; + struct hlist_head *head; + unsigned long time; + u64 src_addr = 0; + u8 hindex, op; + int ret; + + if (!qlcnic_sriov_pf_check(adapter) || (vlan_id == 0xffff)) + vlan_id = 0; + + memcpy(&src_addr, phdr->h_source, ETH_ALEN); + hindex = qlcnic_mac_hash(src_addr, vlan_id) & + (adapter->fhash.fbucket_size - 1); + + if (loopback_pkt) { + if (adapter->rx_fhash.fnum >= adapter->rx_fhash.fmax) + return; + + head = &(adapter->rx_fhash.fhead[hindex]); + + tmp_fil = qlcnic_find_mac_filter(head, &src_addr, vlan_id); + if (tmp_fil) { + time = tmp_fil->ftime; + if (time_after(jiffies, QLCNIC_READD_AGE * HZ + time)) + tmp_fil->ftime = jiffies; + return; + } + + fil = kzalloc(sizeof(struct qlcnic_filter), GFP_ATOMIC); + if (!fil) + return; + + fil->ftime = jiffies; + memcpy(fil->faddr, &src_addr, ETH_ALEN); + fil->vlan_id = vlan_id; + spin_lock(&adapter->rx_mac_learn_lock); + hlist_add_head(&(fil->fnode), head); + adapter->rx_fhash.fnum++; + spin_unlock(&adapter->rx_mac_learn_lock); + } else { + head = &adapter->fhash.fhead[hindex]; + + spin_lock(&adapter->mac_learn_lock); + + tmp_fil = 
qlcnic_find_mac_filter(head, &src_addr, vlan_id); + if (tmp_fil) { + op = vlan_id ? QLCNIC_MAC_VLAN_DEL : QLCNIC_MAC_DEL; + ret = qlcnic_sre_macaddr_change(adapter, + (u8 *)&src_addr, + vlan_id, op); + if (!ret) { + hlist_del(&tmp_fil->fnode); + adapter->fhash.fnum--; + } + + spin_unlock(&adapter->mac_learn_lock); + + return; + } + + spin_unlock(&adapter->mac_learn_lock); + + head = &adapter->rx_fhash.fhead[hindex]; + + spin_lock(&adapter->rx_mac_learn_lock); + + tmp_fil = qlcnic_find_mac_filter(head, &src_addr, vlan_id); + if (tmp_fil) + qlcnic_delete_rx_list_mac(adapter, tmp_fil, &src_addr, + vlan_id); + + spin_unlock(&adapter->rx_mac_learn_lock); + } +} + +void qlcnic_82xx_change_filter(struct qlcnic_adapter *adapter, u64 *uaddr, + u16 vlan_id) +{ + struct cmd_desc_type0 *hwdesc; + struct qlcnic_nic_req *req; + struct qlcnic_mac_req *mac_req; + struct qlcnic_vlan_req *vlan_req; + struct qlcnic_host_tx_ring *tx_ring = adapter->tx_ring; + u32 producer; + u64 word; + + producer = tx_ring->producer; + hwdesc = &tx_ring->desc_head[tx_ring->producer]; + + req = (struct qlcnic_nic_req *)hwdesc; + memset(req, 0, sizeof(struct qlcnic_nic_req)); + req->qhdr = cpu_to_le64(QLCNIC_REQUEST << 23); + + word = QLCNIC_MAC_EVENT | ((u64)(adapter->portnum) << 16); + req->req_hdr = cpu_to_le64(word); + + mac_req = (struct qlcnic_mac_req *)&(req->words[0]); + mac_req->op = vlan_id ? QLCNIC_MAC_VLAN_ADD : QLCNIC_MAC_ADD; + memcpy(mac_req->mac_addr, uaddr, ETH_ALEN); + + vlan_req = (struct qlcnic_vlan_req *)&req->words[1]; + vlan_req->vlan_id = cpu_to_le16(vlan_id); + + tx_ring->producer = get_next_index(producer, tx_ring->num_desc); + smp_mb(); +} + +static void qlcnic_send_filter(struct qlcnic_adapter *adapter, + struct cmd_desc_type0 *first_desc, + struct sk_buff *skb) +{ + struct vlan_ethhdr *vh = (struct vlan_ethhdr *)(skb->data); + struct ethhdr *phdr = (struct ethhdr *)(skb->data); + u16 protocol = ntohs(skb->protocol); + struct qlcnic_filter *fil, *tmp_fil; + struct hlist_head *head; + struct hlist_node *n; + u64 src_addr = 0; + u16 vlan_id = 0; + u8 hindex, hval; + + if (ether_addr_equal(phdr->h_source, adapter->mac_addr)) + return; + + if (adapter->flags & QLCNIC_VLAN_FILTERING) { + if (protocol == ETH_P_8021Q) { + vh = (struct vlan_ethhdr *)skb->data; + vlan_id = ntohs(vh->h_vlan_TCI); + } else if (skb_vlan_tag_present(skb)) { + vlan_id = skb_vlan_tag_get(skb); + } + } + + memcpy(&src_addr, phdr->h_source, ETH_ALEN); + hval = qlcnic_mac_hash(src_addr, vlan_id); + hindex = hval & (adapter->fhash.fbucket_size - 1); + head = &(adapter->fhash.fhead[hindex]); + + hlist_for_each_entry_safe(tmp_fil, n, head, fnode) { + if (ether_addr_equal(tmp_fil->faddr, (u8 *)&src_addr) && + tmp_fil->vlan_id == vlan_id) { + if (jiffies > (QLCNIC_READD_AGE * HZ + tmp_fil->ftime)) + qlcnic_change_filter(adapter, &src_addr, + vlan_id); + tmp_fil->ftime = jiffies; + return; + } + } + + if (unlikely(adapter->fhash.fnum >= adapter->fhash.fmax)) { + adapter->stats.mac_filter_limit_overrun++; + return; + } + + fil = kzalloc(sizeof(struct qlcnic_filter), GFP_ATOMIC); + if (!fil) + return; + + qlcnic_change_filter(adapter, &src_addr, vlan_id); + fil->ftime = jiffies; + fil->vlan_id = vlan_id; + memcpy(fil->faddr, &src_addr, ETH_ALEN); + spin_lock(&adapter->mac_learn_lock); + hlist_add_head(&(fil->fnode), head); + adapter->fhash.fnum++; + spin_unlock(&adapter->mac_learn_lock); +} + +#define QLCNIC_ENCAP_VXLAN_PKT BIT_0 +#define QLCNIC_ENCAP_OUTER_L3_IP6 BIT_1 +#define QLCNIC_ENCAP_INNER_L3_IP6 BIT_2 +#define 
QLCNIC_ENCAP_INNER_L4_UDP BIT_3 +#define QLCNIC_ENCAP_DO_L3_CSUM BIT_4 +#define QLCNIC_ENCAP_DO_L4_CSUM BIT_5 + +static int qlcnic_tx_encap_pkt(struct qlcnic_adapter *adapter, + struct cmd_desc_type0 *first_desc, + struct sk_buff *skb, + struct qlcnic_host_tx_ring *tx_ring) +{ + u8 opcode = 0, inner_hdr_len = 0, outer_hdr_len = 0, total_hdr_len = 0; + int copied, copy_len, descr_size; + u32 producer = tx_ring->producer; + struct cmd_desc_type0 *hwdesc; + u16 flags = 0, encap_descr = 0; + + opcode = QLCNIC_TX_ETHER_PKT; + encap_descr = QLCNIC_ENCAP_VXLAN_PKT; + + if (skb_is_gso(skb)) { + inner_hdr_len = skb_inner_transport_header(skb) + + inner_tcp_hdrlen(skb) - + skb_inner_mac_header(skb); + + /* VXLAN header size = 8 */ + outer_hdr_len = skb_transport_offset(skb) + 8 + + sizeof(struct udphdr); + first_desc->outer_hdr_length = outer_hdr_len; + total_hdr_len = inner_hdr_len + outer_hdr_len; + encap_descr |= QLCNIC_ENCAP_DO_L3_CSUM | + QLCNIC_ENCAP_DO_L4_CSUM; + first_desc->mss = cpu_to_le16(skb_shinfo(skb)->gso_size); + first_desc->hdr_length = inner_hdr_len; + + /* Copy inner and outer headers in Tx descriptor(s) + * If total_hdr_len > cmd_desc_type0, use multiple + * descriptors + */ + copied = 0; + descr_size = (int)sizeof(struct cmd_desc_type0); + while (copied < total_hdr_len) { + copy_len = min(descr_size, (total_hdr_len - copied)); + hwdesc = &tx_ring->desc_head[producer]; + tx_ring->cmd_buf_arr[producer].skb = NULL; + skb_copy_from_linear_data_offset(skb, copied, + (char *)hwdesc, + copy_len); + copied += copy_len; + producer = get_next_index(producer, tx_ring->num_desc); + } + + tx_ring->producer = producer; + + /* Make sure updated tx_ring->producer is visible + * for qlcnic_tx_avail() + */ + smp_mb(); + adapter->stats.encap_lso_frames++; + + opcode = QLCNIC_TX_ENCAP_LSO; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (inner_ip_hdr(skb)->version == 6) { + if (inner_ipv6_hdr(skb)->nexthdr == IPPROTO_UDP) + encap_descr |= QLCNIC_ENCAP_INNER_L4_UDP; + } else { + if (inner_ip_hdr(skb)->protocol == IPPROTO_UDP) + encap_descr |= QLCNIC_ENCAP_INNER_L4_UDP; + } + + adapter->stats.encap_tx_csummed++; + opcode = QLCNIC_TX_ENCAP_PKT; + } + + /* Prepare first 16 bits of byte offset 16 of Tx descriptor */ + if (ip_hdr(skb)->version == 6) + encap_descr |= QLCNIC_ENCAP_OUTER_L3_IP6; + + /* outer IP header's size in 32bit words size*/ + encap_descr |= (skb_network_header_len(skb) >> 2) << 6; + + /* outer IP header offset */ + encap_descr |= skb_network_offset(skb) << 10; + first_desc->encap_descr = cpu_to_le16(encap_descr); + + first_desc->tcp_hdr_offset = skb_inner_transport_header(skb) - + skb->data; + first_desc->ip_hdr_offset = skb_inner_network_offset(skb); + + qlcnic_set_tx_flags_opcode(first_desc, flags, opcode); + + return 0; +} + +static int qlcnic_tx_pkt(struct qlcnic_adapter *adapter, + struct cmd_desc_type0 *first_desc, struct sk_buff *skb, + struct qlcnic_host_tx_ring *tx_ring) +{ + u8 l4proto, opcode = 0, hdr_len = 0; + u16 flags = 0, vlan_tci = 0; + int copied, offset, copy_len, size; + struct cmd_desc_type0 *hwdesc; + struct vlan_ethhdr *vh; + u16 protocol = ntohs(skb->protocol); + u32 producer = tx_ring->producer; + + if (protocol == ETH_P_8021Q) { + vh = (struct vlan_ethhdr *)skb->data; + flags = QLCNIC_FLAGS_VLAN_TAGGED; + vlan_tci = ntohs(vh->h_vlan_TCI); + protocol = ntohs(vh->h_vlan_encapsulated_proto); + } else if (skb_vlan_tag_present(skb)) { + flags = QLCNIC_FLAGS_VLAN_OOB; + vlan_tci = skb_vlan_tag_get(skb); + } + if (unlikely(adapter->tx_pvid)) { + if (vlan_tci 
&& !(adapter->flags & QLCNIC_TAGGING_ENABLED)) + return -EIO; + if (vlan_tci && (adapter->flags & QLCNIC_TAGGING_ENABLED)) + goto set_flags; + + flags = QLCNIC_FLAGS_VLAN_OOB; + vlan_tci = adapter->tx_pvid; + } +set_flags: + qlcnic_set_tx_vlan_tci(first_desc, vlan_tci); + qlcnic_set_tx_flags_opcode(first_desc, flags, opcode); + + if (*(skb->data) & BIT_0) { + flags |= BIT_0; + memcpy(&first_desc->eth_addr, skb->data, ETH_ALEN); + } + opcode = QLCNIC_TX_ETHER_PKT; + if (skb_is_gso(skb)) { + hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + first_desc->mss = cpu_to_le16(skb_shinfo(skb)->gso_size); + first_desc->hdr_length = hdr_len; + opcode = (protocol == ETH_P_IPV6) ? QLCNIC_TX_TCP_LSO6 : + QLCNIC_TX_TCP_LSO; + + /* For LSO, we need to copy the MAC/IP/TCP headers into + * the descriptor ring */ + copied = 0; + offset = 2; + + if (flags & QLCNIC_FLAGS_VLAN_OOB) { + first_desc->hdr_length += VLAN_HLEN; + first_desc->tcp_hdr_offset = VLAN_HLEN; + first_desc->ip_hdr_offset = VLAN_HLEN; + + /* Only in case of TSO on vlan device */ + flags |= QLCNIC_FLAGS_VLAN_TAGGED; + + /* Create a TSO vlan header template for firmware */ + hwdesc = &tx_ring->desc_head[producer]; + tx_ring->cmd_buf_arr[producer].skb = NULL; + + copy_len = min((int)sizeof(struct cmd_desc_type0) - + offset, hdr_len + VLAN_HLEN); + + vh = (struct vlan_ethhdr *)((char *) hwdesc + 2); + skb_copy_from_linear_data(skb, vh, 12); + vh->h_vlan_proto = htons(ETH_P_8021Q); + vh->h_vlan_TCI = htons(vlan_tci); + + skb_copy_from_linear_data_offset(skb, 12, + (char *)vh + 16, + copy_len - 16); + copied = copy_len - VLAN_HLEN; + offset = 0; + producer = get_next_index(producer, tx_ring->num_desc); + } + + while (copied < hdr_len) { + size = (int)sizeof(struct cmd_desc_type0) - offset; + copy_len = min(size, (hdr_len - copied)); + hwdesc = &tx_ring->desc_head[producer]; + tx_ring->cmd_buf_arr[producer].skb = NULL; + skb_copy_from_linear_data_offset(skb, copied, + (char *)hwdesc + + offset, copy_len); + copied += copy_len; + offset = 0; + producer = get_next_index(producer, tx_ring->num_desc); + } + + tx_ring->producer = producer; + smp_mb(); + adapter->stats.lso_frames++; + + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (protocol == ETH_P_IP) { + l4proto = ip_hdr(skb)->protocol; + + if (l4proto == IPPROTO_TCP) + opcode = QLCNIC_TX_TCP_PKT; + else if (l4proto == IPPROTO_UDP) + opcode = QLCNIC_TX_UDP_PKT; + } else if (protocol == ETH_P_IPV6) { + l4proto = ipv6_hdr(skb)->nexthdr; + + if (l4proto == IPPROTO_TCP) + opcode = QLCNIC_TX_TCPV6_PKT; + else if (l4proto == IPPROTO_UDP) + opcode = QLCNIC_TX_UDPV6_PKT; + } + } + first_desc->tcp_hdr_offset += skb_transport_offset(skb); + first_desc->ip_hdr_offset += skb_network_offset(skb); + qlcnic_set_tx_flags_opcode(first_desc, flags, opcode); + + return 0; +} + +static int qlcnic_map_tx_skb(struct pci_dev *pdev, struct sk_buff *skb, + struct qlcnic_cmd_buffer *pbuf) +{ + struct qlcnic_skb_frag *nf; + struct skb_frag_struct *frag; + int i, nr_frags; + dma_addr_t map; + + nr_frags = skb_shinfo(skb)->nr_frags; + nf = &pbuf->frag_array[0]; + + map = pci_map_single(pdev, skb->data, skb_headlen(skb), + PCI_DMA_TODEVICE); + if (pci_dma_mapping_error(pdev, map)) + goto out_err; + + nf->dma = map; + nf->length = skb_headlen(skb); + + for (i = 0; i < nr_frags; i++) { + frag = &skb_shinfo(skb)->frags[i]; + nf = &pbuf->frag_array[i+1]; + map = skb_frag_dma_map(&pdev->dev, frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + if (dma_mapping_error(&pdev->dev, map)) + goto unwind; + + nf->dma = map; + 
nf->length = skb_frag_size(frag); + } + + return 0; + +unwind: + while (--i >= 0) { + nf = &pbuf->frag_array[i+1]; + pci_unmap_page(pdev, nf->dma, nf->length, PCI_DMA_TODEVICE); + } + + nf = &pbuf->frag_array[0]; + pci_unmap_single(pdev, nf->dma, skb_headlen(skb), PCI_DMA_TODEVICE); + +out_err: + return -ENOMEM; +} + +static void qlcnic_unmap_buffers(struct pci_dev *pdev, struct sk_buff *skb, + struct qlcnic_cmd_buffer *pbuf) +{ + struct qlcnic_skb_frag *nf = &pbuf->frag_array[0]; + int i, nr_frags = skb_shinfo(skb)->nr_frags; + + for (i = 0; i < nr_frags; i++) { + nf = &pbuf->frag_array[i+1]; + pci_unmap_page(pdev, nf->dma, nf->length, PCI_DMA_TODEVICE); + } + + nf = &pbuf->frag_array[0]; + pci_unmap_single(pdev, nf->dma, skb_headlen(skb), PCI_DMA_TODEVICE); + pbuf->skb = NULL; +} + +static inline void qlcnic_clear_cmddesc(u64 *desc) +{ + desc[0] = 0ULL; + desc[2] = 0ULL; + desc[7] = 0ULL; +} + +netdev_tx_t qlcnic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_host_tx_ring *tx_ring; + struct qlcnic_cmd_buffer *pbuf; + struct qlcnic_skb_frag *buffrag; + struct cmd_desc_type0 *hwdesc, *first_desc; + struct pci_dev *pdev; + struct ethhdr *phdr; + int i, k, frag_count, delta = 0; + u32 producer, num_txd; + u16 protocol; + bool l4_is_udp = false; + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) { + netif_tx_stop_all_queues(netdev); + return NETDEV_TX_BUSY; + } + + if (adapter->flags & QLCNIC_MACSPOOF) { + phdr = (struct ethhdr *)skb->data; + if (!ether_addr_equal(phdr->h_source, adapter->mac_addr)) + goto drop_packet; + } + + tx_ring = &adapter->tx_ring[skb_get_queue_mapping(skb)]; + num_txd = tx_ring->num_desc; + + frag_count = skb_shinfo(skb)->nr_frags + 1; + + /* 14 frags supported for normal packet and + * 32 frags supported for TSO packet + */ + if (!skb_is_gso(skb) && frag_count > QLCNIC_MAX_FRAGS_PER_TX) { + for (i = 0; i < (frag_count - QLCNIC_MAX_FRAGS_PER_TX); i++) + delta += skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (!__pskb_pull_tail(skb, delta)) + goto drop_packet; + + frag_count = 1 + skb_shinfo(skb)->nr_frags; + } + + if (unlikely(qlcnic_tx_avail(tx_ring) <= TX_STOP_THRESH)) { + netif_tx_stop_queue(tx_ring->txq); + if (qlcnic_tx_avail(tx_ring) > TX_STOP_THRESH) { + netif_tx_start_queue(tx_ring->txq); + } else { + tx_ring->tx_stats.xmit_off++; + return NETDEV_TX_BUSY; + } + } + + producer = tx_ring->producer; + pbuf = &tx_ring->cmd_buf_arr[producer]; + pdev = adapter->pdev; + first_desc = &tx_ring->desc_head[producer]; + hwdesc = &tx_ring->desc_head[producer]; + qlcnic_clear_cmddesc((u64 *)hwdesc); + + if (qlcnic_map_tx_skb(pdev, skb, pbuf)) { + adapter->stats.tx_dma_map_error++; + goto drop_packet; + } + + pbuf->skb = skb; + pbuf->frag_count = frag_count; + + qlcnic_set_tx_frags_len(first_desc, frag_count, skb->len); + qlcnic_set_tx_port(first_desc, adapter->portnum); + + for (i = 0; i < frag_count; i++) { + k = i % 4; + + if ((k == 0) && (i > 0)) { + /* move to next desc.*/ + producer = get_next_index(producer, num_txd); + hwdesc = &tx_ring->desc_head[producer]; + qlcnic_clear_cmddesc((u64 *)hwdesc); + tx_ring->cmd_buf_arr[producer].skb = NULL; + } + + buffrag = &pbuf->frag_array[i]; + hwdesc->buffer_length[k] = cpu_to_le16(buffrag->length); + switch (k) { + case 0: + hwdesc->addr_buffer1 = cpu_to_le64(buffrag->dma); + break; + case 1: + hwdesc->addr_buffer2 = cpu_to_le64(buffrag->dma); + break; + case 2: + hwdesc->addr_buffer3 = cpu_to_le64(buffrag->dma); + break; + case 3: + 
hwdesc->addr_buffer4 = cpu_to_le64(buffrag->dma); + break; + } + } + + tx_ring->producer = get_next_index(producer, num_txd); + smp_mb(); + + protocol = ntohs(skb->protocol); + if (protocol == ETH_P_IP) + l4_is_udp = ip_hdr(skb)->protocol == IPPROTO_UDP; + else if (protocol == ETH_P_IPV6) + l4_is_udp = ipv6_hdr(skb)->nexthdr == IPPROTO_UDP; + + /* Check if it is a VXLAN packet */ + if (!skb->encapsulation || !l4_is_udp || + !qlcnic_encap_tx_offload(adapter)) { + if (unlikely(qlcnic_tx_pkt(adapter, first_desc, skb, + tx_ring))) + goto unwind_buff; + } else { + if (unlikely(qlcnic_tx_encap_pkt(adapter, first_desc, + skb, tx_ring))) + goto unwind_buff; + } + + if (adapter->drv_mac_learn) + qlcnic_send_filter(adapter, first_desc, skb); + + tx_ring->tx_stats.tx_bytes += skb->len; + tx_ring->tx_stats.xmit_called++; + + /* Ensure writes are complete before HW fetches Tx descriptors */ + wmb(); + qlcnic_update_cmd_producer(tx_ring); + + return NETDEV_TX_OK; + +unwind_buff: + qlcnic_unmap_buffers(pdev, skb, pbuf); +drop_packet: + adapter->stats.txdropped++; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; +} + +void qlcnic_advert_link_change(struct qlcnic_adapter *adapter, int linkup) +{ + struct net_device *netdev = adapter->netdev; + + if (adapter->ahw->linkup && !linkup) { + netdev_info(netdev, "NIC Link is down\n"); + adapter->ahw->linkup = 0; + netif_carrier_off(netdev); + } else if (!adapter->ahw->linkup && linkup) { + adapter->ahw->linkup = 1; + + /* Do not advertise Link up to the stack if device + * is in loopback mode + */ + if (qlcnic_83xx_check(adapter) && adapter->ahw->lb_mode) { + netdev_info(netdev, "NIC Link is up for loopback test\n"); + return; + } + + netdev_info(netdev, "NIC Link is up\n"); + netif_carrier_on(netdev); + } +} + +static int qlcnic_alloc_rx_skb(struct qlcnic_adapter *adapter, + struct qlcnic_host_rds_ring *rds_ring, + struct qlcnic_rx_buffer *buffer) +{ + struct sk_buff *skb; + dma_addr_t dma; + struct pci_dev *pdev = adapter->pdev; + + skb = netdev_alloc_skb(adapter->netdev, rds_ring->skb_size); + if (!skb) { + adapter->stats.skb_alloc_failure++; + return -ENOMEM; + } + + skb_reserve(skb, NET_IP_ALIGN); + dma = pci_map_single(pdev, skb->data, + rds_ring->dma_size, PCI_DMA_FROMDEVICE); + + if (pci_dma_mapping_error(pdev, dma)) { + adapter->stats.rx_dma_map_error++; + dev_kfree_skb_any(skb); + return -ENOMEM; + } + + buffer->skb = skb; + buffer->dma = dma; + + return 0; +} + +static void qlcnic_post_rx_buffers_nodb(struct qlcnic_adapter *adapter, + struct qlcnic_host_rds_ring *rds_ring, + u8 ring_id) +{ + struct rcv_desc *pdesc; + struct qlcnic_rx_buffer *buffer; + int count = 0; + uint32_t producer, handle; + struct list_head *head; + + if (!spin_trylock(&rds_ring->lock)) + return; + + producer = rds_ring->producer; + head = &rds_ring->free_list; + while (!list_empty(head)) { + buffer = list_entry(head->next, struct qlcnic_rx_buffer, list); + + if (!buffer->skb) { + if (qlcnic_alloc_rx_skb(adapter, rds_ring, buffer)) + break; + } + count++; + list_del(&buffer->list); + + /* make a rcv descriptor */ + pdesc = &rds_ring->desc_head[producer]; + handle = qlcnic_get_ref_handle(adapter, + buffer->ref_handle, ring_id); + pdesc->reference_handle = cpu_to_le16(handle); + pdesc->buffer_length = cpu_to_le32(rds_ring->dma_size); + pdesc->addr_buffer = cpu_to_le64(buffer->dma); + producer = get_next_index(producer, rds_ring->num_desc); + } + if (count) { + rds_ring->producer = producer; + writel((producer - 1) & (rds_ring->num_desc - 1), + rds_ring->crb_rcv_producer); + } + 
spin_unlock(&rds_ring->lock); +} + +static int qlcnic_process_cmd_ring(struct qlcnic_adapter *adapter, + struct qlcnic_host_tx_ring *tx_ring, + int budget) +{ + u32 sw_consumer, hw_consumer; + int i, done, count = 0; + struct qlcnic_cmd_buffer *buffer; + struct pci_dev *pdev = adapter->pdev; + struct net_device *netdev = adapter->netdev; + struct qlcnic_skb_frag *frag; + + if (!spin_trylock(&tx_ring->tx_clean_lock)) + return 1; + + sw_consumer = tx_ring->sw_consumer; + hw_consumer = le32_to_cpu(*(tx_ring->hw_consumer)); + + while (sw_consumer != hw_consumer) { + buffer = &tx_ring->cmd_buf_arr[sw_consumer]; + if (buffer->skb) { + frag = &buffer->frag_array[0]; + pci_unmap_single(pdev, frag->dma, frag->length, + PCI_DMA_TODEVICE); + frag->dma = 0ULL; + for (i = 1; i < buffer->frag_count; i++) { + frag++; + pci_unmap_page(pdev, frag->dma, frag->length, + PCI_DMA_TODEVICE); + frag->dma = 0ULL; + } + tx_ring->tx_stats.xmit_finished++; + dev_kfree_skb_any(buffer->skb); + buffer->skb = NULL; + } + + sw_consumer = get_next_index(sw_consumer, tx_ring->num_desc); + if (++count >= budget) + break; + } + + tx_ring->sw_consumer = sw_consumer; + + if (count && netif_running(netdev)) { + smp_mb(); + if (netif_tx_queue_stopped(tx_ring->txq) && + netif_carrier_ok(netdev)) { + if (qlcnic_tx_avail(tx_ring) > TX_STOP_THRESH) { + netif_tx_wake_queue(tx_ring->txq); + tx_ring->tx_stats.xmit_on++; + } + } + adapter->tx_timeo_cnt = 0; + } + /* + * If everything is freed up to consumer then check if the ring is full + * If the ring is full then check if more needs to be freed and + * schedule the call back again. + * + * This happens when there are 2 CPUs. One could be freeing and the + * other filling it. If the ring is full when we get out of here and + * the card has already interrupted the host then the host can miss the + * interrupt. + * + * There is still a possible race condition and the host could miss an + * interrupt. The card has to take care of this. 
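+	 *
+	 * To narrow that window, hw_consumer is re-read below after the
+	 * freeing loop and 'done' is reported only once sw_consumer has
+	 * caught up, so the NAPI poller reschedules itself rather than
+	 * depending on a further interrupt.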
+ */ + hw_consumer = le32_to_cpu(*(tx_ring->hw_consumer)); + done = (sw_consumer == hw_consumer); + + spin_unlock(&tx_ring->tx_clean_lock); + + return done; +} + +static int qlcnic_poll(struct napi_struct *napi, int budget) +{ + int tx_complete, work_done; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_adapter *adapter; + struct qlcnic_host_tx_ring *tx_ring; + + sds_ring = container_of(napi, struct qlcnic_host_sds_ring, napi); + adapter = sds_ring->adapter; + tx_ring = sds_ring->tx_ring; + + tx_complete = qlcnic_process_cmd_ring(adapter, tx_ring, + budget); + work_done = qlcnic_process_rcv_ring(sds_ring, budget); + + /* Check if we need a repoll */ + if (!tx_complete) + work_done = budget; + + if (work_done < budget) { + napi_complete_done(&sds_ring->napi, work_done); + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) { + qlcnic_enable_sds_intr(adapter, sds_ring); + qlcnic_enable_tx_intr(adapter, tx_ring); + } + } + + return work_done; +} + +static int qlcnic_tx_poll(struct napi_struct *napi, int budget) +{ + struct qlcnic_host_tx_ring *tx_ring; + struct qlcnic_adapter *adapter; + int work_done; + + tx_ring = container_of(napi, struct qlcnic_host_tx_ring, napi); + adapter = tx_ring->adapter; + + work_done = qlcnic_process_cmd_ring(adapter, tx_ring, budget); + if (work_done) { + napi_complete(&tx_ring->napi); + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) + qlcnic_enable_tx_intr(adapter, tx_ring); + } else { + /* As qlcnic_process_cmd_ring() returned 0, we need a repoll*/ + work_done = budget; + } + + return work_done; +} + +static int qlcnic_rx_poll(struct napi_struct *napi, int budget) +{ + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_adapter *adapter; + int work_done; + + sds_ring = container_of(napi, struct qlcnic_host_sds_ring, napi); + adapter = sds_ring->adapter; + + work_done = qlcnic_process_rcv_ring(sds_ring, budget); + + if (work_done < budget) { + napi_complete_done(&sds_ring->napi, work_done); + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) + qlcnic_enable_sds_intr(adapter, sds_ring); + } + + return work_done; +} + +static void qlcnic_handle_linkevent(struct qlcnic_adapter *adapter, + struct qlcnic_fw_msg *msg) +{ + u32 cable_OUI; + u16 cable_len, link_speed; + u8 link_status, module, duplex, autoneg, lb_status = 0; + struct net_device *netdev = adapter->netdev; + + adapter->ahw->has_link_events = 1; + + cable_OUI = msg->body[1] & 0xffffffff; + cable_len = (msg->body[1] >> 32) & 0xffff; + link_speed = (msg->body[1] >> 48) & 0xffff; + + link_status = msg->body[2] & 0xff; + duplex = (msg->body[2] >> 16) & 0xff; + autoneg = (msg->body[2] >> 24) & 0xff; + lb_status = (msg->body[2] >> 32) & 0x3; + + module = (msg->body[2] >> 8) & 0xff; + if (module == LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLE) + dev_info(&netdev->dev, + "unsupported cable: OUI 0x%x, length %d\n", + cable_OUI, cable_len); + else if (module == LINKEVENT_MODULE_TWINAX_UNSUPPORTED_CABLELEN) + dev_info(&netdev->dev, "unsupported cable length %d\n", + cable_len); + + if (!link_status && (lb_status == QLCNIC_ILB_MODE || + lb_status == QLCNIC_ELB_MODE)) + adapter->ahw->loopback_state |= QLCNIC_LINKEVENT; + + qlcnic_advert_link_change(adapter, link_status); + + if (duplex == LINKEVENT_FULL_DUPLEX) + adapter->ahw->link_duplex = DUPLEX_FULL; + else + adapter->ahw->link_duplex = DUPLEX_HALF; + + adapter->ahw->module_type = module; + adapter->ahw->link_autoneg = autoneg; + + if (link_status) { + adapter->ahw->link_speed = link_speed; + } else { + adapter->ahw->link_speed = SPEED_UNKNOWN; + 
adapter->ahw->link_duplex = DUPLEX_UNKNOWN; + } +} + +static void qlcnic_handle_fw_message(int desc_cnt, int index, + struct qlcnic_host_sds_ring *sds_ring) +{ + struct qlcnic_fw_msg msg; + struct status_desc *desc; + struct qlcnic_adapter *adapter; + struct device *dev; + int i = 0, opcode, ret; + + while (desc_cnt > 0 && i < 8) { + desc = &sds_ring->desc_head[index]; + msg.words[i++] = le64_to_cpu(desc->status_desc_data[0]); + msg.words[i++] = le64_to_cpu(desc->status_desc_data[1]); + + index = get_next_index(index, sds_ring->num_desc); + desc_cnt--; + } + + adapter = sds_ring->adapter; + dev = &adapter->pdev->dev; + opcode = qlcnic_get_nic_msg_opcode(msg.body[0]); + + switch (opcode) { + case QLCNIC_C2H_OPCODE_GET_LINKEVENT_RESPONSE: + qlcnic_handle_linkevent(adapter, &msg); + break; + case QLCNIC_C2H_OPCODE_CONFIG_LOOPBACK: + ret = (u32)(msg.body[1]); + switch (ret) { + case 0: + adapter->ahw->loopback_state |= QLCNIC_LB_RESPONSE; + break; + case 1: + dev_info(dev, "loopback already in progress\n"); + adapter->ahw->diag_cnt = -EINPROGRESS; + break; + case 2: + dev_info(dev, "loopback cable is not connected\n"); + adapter->ahw->diag_cnt = -ENODEV; + break; + default: + dev_info(dev, + "loopback configure request failed, err %x\n", + ret); + adapter->ahw->diag_cnt = -EIO; + break; + } + break; + case QLCNIC_C2H_OPCODE_GET_DCB_AEN: + qlcnic_dcb_aen_handler(adapter->dcb, (void *)&msg); + break; + default: + break; + } +} + +static struct sk_buff *qlcnic_process_rxbuf(struct qlcnic_adapter *adapter, + struct qlcnic_host_rds_ring *ring, + u16 index, u16 cksum) +{ + struct qlcnic_rx_buffer *buffer; + struct sk_buff *skb; + + buffer = &ring->rx_buf_arr[index]; + if (unlikely(buffer->skb == NULL)) { + WARN_ON(1); + return NULL; + } + + pci_unmap_single(adapter->pdev, buffer->dma, ring->dma_size, + PCI_DMA_FROMDEVICE); + + skb = buffer->skb; + if (likely((adapter->netdev->features & NETIF_F_RXCSUM) && + (cksum == STATUS_CKSUM_OK || cksum == STATUS_CKSUM_LOOP))) { + adapter->stats.csummed++; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + skb_checksum_none_assert(skb); + } + + + buffer->skb = NULL; + + return skb; +} + +static inline int qlcnic_check_rx_tagging(struct qlcnic_adapter *adapter, + struct sk_buff *skb, u16 *vlan_tag) +{ + struct ethhdr *eth_hdr; + + if (!__vlan_get_tag(skb, vlan_tag)) { + eth_hdr = (struct ethhdr *)skb->data; + memmove(skb->data + VLAN_HLEN, eth_hdr, ETH_ALEN * 2); + skb_pull(skb, VLAN_HLEN); + } + if (!adapter->rx_pvid) + return 0; + + if (*vlan_tag == adapter->rx_pvid) { + /* Outer vlan tag. 
Packet should follow non-vlan path */ + *vlan_tag = 0xffff; + return 0; + } + if (adapter->flags & QLCNIC_TAGGING_ENABLED) + return 0; + + return -EINVAL; +} + +static struct qlcnic_rx_buffer * +qlcnic_process_rcv(struct qlcnic_adapter *adapter, + struct qlcnic_host_sds_ring *sds_ring, int ring, + u64 sts_data0) +{ + struct net_device *netdev = adapter->netdev; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_rx_buffer *buffer; + struct sk_buff *skb; + struct qlcnic_host_rds_ring *rds_ring; + int index, length, cksum, pkt_offset, is_lb_pkt; + u16 vid = 0xffff, t_vid; + + if (unlikely(ring >= adapter->max_rds_rings)) + return NULL; + + rds_ring = &recv_ctx->rds_rings[ring]; + + index = qlcnic_get_sts_refhandle(sts_data0); + if (unlikely(index >= rds_ring->num_desc)) + return NULL; + + buffer = &rds_ring->rx_buf_arr[index]; + length = qlcnic_get_sts_totallength(sts_data0); + cksum = qlcnic_get_sts_status(sts_data0); + pkt_offset = qlcnic_get_sts_pkt_offset(sts_data0); + + skb = qlcnic_process_rxbuf(adapter, rds_ring, index, cksum); + if (!skb) + return buffer; + + if (adapter->rx_mac_learn) { + t_vid = 0; + is_lb_pkt = qlcnic_82xx_is_lb_pkt(sts_data0); + qlcnic_add_lb_filter(adapter, skb, is_lb_pkt, t_vid); + } + + if (length > rds_ring->skb_size) + skb_put(skb, rds_ring->skb_size); + else + skb_put(skb, length); + + if (pkt_offset) + skb_pull(skb, pkt_offset); + + if (unlikely(qlcnic_check_rx_tagging(adapter, skb, &vid))) { + adapter->stats.rxdropped++; + dev_kfree_skb(skb); + return buffer; + } + + skb->protocol = eth_type_trans(skb, netdev); + + if (vid != 0xffff) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vid); + + napi_gro_receive(&sds_ring->napi, skb); + + adapter->stats.rx_pkts++; + adapter->stats.rxbytes += length; + + return buffer; +} + +#define QLC_TCP_HDR_SIZE 20 +#define QLC_TCP_TS_OPTION_SIZE 12 +#define QLC_TCP_TS_HDR_SIZE (QLC_TCP_HDR_SIZE + QLC_TCP_TS_OPTION_SIZE) + +static struct qlcnic_rx_buffer * +qlcnic_process_lro(struct qlcnic_adapter *adapter, + int ring, u64 sts_data0, u64 sts_data1) +{ + struct net_device *netdev = adapter->netdev; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_rx_buffer *buffer; + struct sk_buff *skb; + struct qlcnic_host_rds_ring *rds_ring; + struct iphdr *iph; + struct ipv6hdr *ipv6h; + struct tcphdr *th; + bool push, timestamp; + int index, l2_hdr_offset, l4_hdr_offset, is_lb_pkt; + u16 lro_length, length, data_offset, t_vid, vid = 0xffff; + u32 seq_number; + + if (unlikely(ring >= adapter->max_rds_rings)) + return NULL; + + rds_ring = &recv_ctx->rds_rings[ring]; + + index = qlcnic_get_lro_sts_refhandle(sts_data0); + if (unlikely(index >= rds_ring->num_desc)) + return NULL; + + buffer = &rds_ring->rx_buf_arr[index]; + + timestamp = qlcnic_get_lro_sts_timestamp(sts_data0); + lro_length = qlcnic_get_lro_sts_length(sts_data0); + l2_hdr_offset = qlcnic_get_lro_sts_l2_hdr_offset(sts_data0); + l4_hdr_offset = qlcnic_get_lro_sts_l4_hdr_offset(sts_data0); + push = qlcnic_get_lro_sts_push_flag(sts_data0); + seq_number = qlcnic_get_lro_sts_seq_number(sts_data1); + + skb = qlcnic_process_rxbuf(adapter, rds_ring, index, STATUS_CKSUM_OK); + if (!skb) + return buffer; + + if (adapter->rx_mac_learn) { + t_vid = 0; + is_lb_pkt = qlcnic_82xx_is_lb_pkt(sts_data0); + qlcnic_add_lb_filter(adapter, skb, is_lb_pkt, t_vid); + } + + if (timestamp) + data_offset = l4_hdr_offset + QLC_TCP_TS_HDR_SIZE; + else + data_offset = l4_hdr_offset + QLC_TCP_HDR_SIZE; + + skb_put(skb, lro_length + data_offset); + 
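/* lro_length is the firmware-reported aggregated TCP payload size and
 * data_offset the byte count up to the end of the TCP header (including the
 * 12-byte timestamp option when present), so the skb_put() above sizes the
 * skb to headers plus payload; the skb_pull() below then drops whatever
 * precedes the Ethernet header before the VLAN/IP/TCP fixups that follow. */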
skb_pull(skb, l2_hdr_offset); + + if (unlikely(qlcnic_check_rx_tagging(adapter, skb, &vid))) { + adapter->stats.rxdropped++; + dev_kfree_skb(skb); + return buffer; + } + + skb->protocol = eth_type_trans(skb, netdev); + + if (ntohs(skb->protocol) == ETH_P_IPV6) { + ipv6h = (struct ipv6hdr *)skb->data; + th = (struct tcphdr *)(skb->data + sizeof(struct ipv6hdr)); + length = (th->doff << 2) + lro_length; + ipv6h->payload_len = htons(length); + } else { + iph = (struct iphdr *)skb->data; + th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); + length = (iph->ihl << 2) + (th->doff << 2) + lro_length; + csum_replace2(&iph->check, iph->tot_len, htons(length)); + iph->tot_len = htons(length); + } + + th->psh = push; + th->seq = htonl(seq_number); + length = skb->len; + + if (adapter->flags & QLCNIC_FW_LRO_MSS_CAP) { + skb_shinfo(skb)->gso_size = qlcnic_get_lro_sts_mss(sts_data1); + if (skb->protocol == htons(ETH_P_IPV6)) + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + else + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + } + + if (vid != 0xffff) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vid); + netif_receive_skb(skb); + + adapter->stats.lro_pkts++; + adapter->stats.lrobytes += length; + + return buffer; +} + +static int qlcnic_process_rcv_ring(struct qlcnic_host_sds_ring *sds_ring, int max) +{ + struct qlcnic_host_rds_ring *rds_ring; + struct qlcnic_adapter *adapter = sds_ring->adapter; + struct list_head *cur; + struct status_desc *desc; + struct qlcnic_rx_buffer *rxbuf; + int opcode, desc_cnt, count = 0; + u64 sts_data0, sts_data1; + u8 ring; + u32 consumer = sds_ring->consumer; + + while (count < max) { + desc = &sds_ring->desc_head[consumer]; + sts_data0 = le64_to_cpu(desc->status_desc_data[0]); + + if (!(sts_data0 & STATUS_OWNER_HOST)) + break; + + desc_cnt = qlcnic_get_sts_desc_cnt(sts_data0); + opcode = qlcnic_get_sts_opcode(sts_data0); + switch (opcode) { + case QLCNIC_RXPKT_DESC: + case QLCNIC_OLD_RXPKT_DESC: + case QLCNIC_SYN_OFFLOAD: + ring = qlcnic_get_sts_type(sts_data0); + rxbuf = qlcnic_process_rcv(adapter, sds_ring, ring, + sts_data0); + break; + case QLCNIC_LRO_DESC: + ring = qlcnic_get_lro_sts_type(sts_data0); + sts_data1 = le64_to_cpu(desc->status_desc_data[1]); + rxbuf = qlcnic_process_lro(adapter, ring, sts_data0, + sts_data1); + break; + case QLCNIC_RESPONSE_DESC: + qlcnic_handle_fw_message(desc_cnt, consumer, sds_ring); + default: + goto skip; + } + WARN_ON(desc_cnt > 1); + + if (likely(rxbuf)) + list_add_tail(&rxbuf->list, &sds_ring->free_list[ring]); + else + adapter->stats.null_rxbuf++; +skip: + for (; desc_cnt > 0; desc_cnt--) { + desc = &sds_ring->desc_head[consumer]; + desc->status_desc_data[0] = QLCNIC_DESC_OWNER_FW; + consumer = get_next_index(consumer, sds_ring->num_desc); + } + count++; + } + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &adapter->recv_ctx->rds_rings[ring]; + if (!list_empty(&sds_ring->free_list[ring])) { + list_for_each(cur, &sds_ring->free_list[ring]) { + rxbuf = list_entry(cur, struct qlcnic_rx_buffer, + list); + qlcnic_alloc_rx_skb(adapter, rds_ring, rxbuf); + } + spin_lock(&rds_ring->lock); + list_splice_tail_init(&sds_ring->free_list[ring], + &rds_ring->free_list); + spin_unlock(&rds_ring->lock); + } + + qlcnic_post_rx_buffers_nodb(adapter, rds_ring, ring); + } + + if (count) { + sds_ring->consumer = consumer; + writel(consumer, sds_ring->crb_sts_consumer); + } + + return count; +} + +void qlcnic_post_rx_buffers(struct qlcnic_adapter *adapter, + struct qlcnic_host_rds_ring *rds_ring, u8 ring_id) +{ + struct 
rcv_desc *pdesc; + struct qlcnic_rx_buffer *buffer; + int count = 0; + u32 producer, handle; + struct list_head *head; + + producer = rds_ring->producer; + head = &rds_ring->free_list; + + while (!list_empty(head)) { + + buffer = list_entry(head->next, struct qlcnic_rx_buffer, list); + + if (!buffer->skb) { + if (qlcnic_alloc_rx_skb(adapter, rds_ring, buffer)) + break; + } + + count++; + list_del(&buffer->list); + + /* make a rcv descriptor */ + pdesc = &rds_ring->desc_head[producer]; + pdesc->addr_buffer = cpu_to_le64(buffer->dma); + handle = qlcnic_get_ref_handle(adapter, buffer->ref_handle, + ring_id); + pdesc->reference_handle = cpu_to_le16(handle); + pdesc->buffer_length = cpu_to_le32(rds_ring->dma_size); + producer = get_next_index(producer, rds_ring->num_desc); + } + + if (count) { + rds_ring->producer = producer; + writel((producer-1) & (rds_ring->num_desc-1), + rds_ring->crb_rcv_producer); + } +} + +static void dump_skb(struct sk_buff *skb, struct qlcnic_adapter *adapter) +{ + if (adapter->ahw->msg_enable & NETIF_MSG_DRV) { + char prefix[30]; + + scnprintf(prefix, sizeof(prefix), "%s: %s: ", + dev_name(&adapter->pdev->dev), __func__); + + print_hex_dump_debug(prefix, DUMP_PREFIX_NONE, 16, 1, + skb->data, skb->len, true); + } +} + +static void qlcnic_process_rcv_diag(struct qlcnic_adapter *adapter, int ring, + u64 sts_data0) +{ + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct sk_buff *skb; + struct qlcnic_host_rds_ring *rds_ring; + int index, length, cksum, pkt_offset; + + if (unlikely(ring >= adapter->max_rds_rings)) + return; + + rds_ring = &recv_ctx->rds_rings[ring]; + + index = qlcnic_get_sts_refhandle(sts_data0); + length = qlcnic_get_sts_totallength(sts_data0); + if (unlikely(index >= rds_ring->num_desc)) + return; + + cksum = qlcnic_get_sts_status(sts_data0); + pkt_offset = qlcnic_get_sts_pkt_offset(sts_data0); + + skb = qlcnic_process_rxbuf(adapter, rds_ring, index, cksum); + if (!skb) + return; + + if (length > rds_ring->skb_size) + skb_put(skb, rds_ring->skb_size); + else + skb_put(skb, length); + + if (pkt_offset) + skb_pull(skb, pkt_offset); + + if (!qlcnic_check_loopback_buff(skb->data, adapter->mac_addr)) + adapter->ahw->diag_cnt++; + else + dump_skb(skb, adapter); + + dev_kfree_skb_any(skb); + adapter->stats.rx_pkts++; + adapter->stats.rxbytes += length; + + return; +} + +void qlcnic_82xx_process_rcv_ring_diag(struct qlcnic_host_sds_ring *sds_ring) +{ + struct qlcnic_adapter *adapter = sds_ring->adapter; + struct status_desc *desc; + u64 sts_data0; + int ring, opcode, desc_cnt; + + u32 consumer = sds_ring->consumer; + + desc = &sds_ring->desc_head[consumer]; + sts_data0 = le64_to_cpu(desc->status_desc_data[0]); + + if (!(sts_data0 & STATUS_OWNER_HOST)) + return; + + desc_cnt = qlcnic_get_sts_desc_cnt(sts_data0); + opcode = qlcnic_get_sts_opcode(sts_data0); + switch (opcode) { + case QLCNIC_RESPONSE_DESC: + qlcnic_handle_fw_message(desc_cnt, consumer, sds_ring); + break; + default: + ring = qlcnic_get_sts_type(sts_data0); + qlcnic_process_rcv_diag(adapter, ring, sts_data0); + break; + } + + for (; desc_cnt > 0; desc_cnt--) { + desc = &sds_ring->desc_head[consumer]; + desc->status_desc_data[0] = cpu_to_le64(STATUS_OWNER_PHANTOM); + consumer = get_next_index(consumer, sds_ring->num_desc); + } + + sds_ring->consumer = consumer; + writel(consumer, sds_ring->crb_sts_consumer); +} + +int qlcnic_82xx_napi_add(struct qlcnic_adapter *adapter, + struct net_device *netdev) +{ + int ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_recv_context 
*recv_ctx = adapter->recv_ctx; + struct qlcnic_host_tx_ring *tx_ring; + + if (qlcnic_alloc_sds_rings(recv_ctx, adapter->drv_sds_rings)) + return -ENOMEM; + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + if (qlcnic_check_multi_tx(adapter) && + !adapter->ahw->diag_test) { + netif_napi_add(netdev, &sds_ring->napi, qlcnic_rx_poll, + NAPI_POLL_WEIGHT); + } else { + if (ring == (adapter->drv_sds_rings - 1)) + netif_napi_add(netdev, &sds_ring->napi, + qlcnic_poll, + NAPI_POLL_WEIGHT); + else + netif_napi_add(netdev, &sds_ring->napi, + qlcnic_rx_poll, + NAPI_POLL_WEIGHT); + } + } + + if (qlcnic_alloc_tx_rings(adapter, netdev)) { + qlcnic_free_sds_rings(recv_ctx); + return -ENOMEM; + } + + if (qlcnic_check_multi_tx(adapter) && !adapter->ahw->diag_test) { + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + netif_tx_napi_add(netdev, &tx_ring->napi, qlcnic_tx_poll, + NAPI_POLL_WEIGHT); + } + } + + return 0; +} + +void qlcnic_82xx_napi_del(struct qlcnic_adapter *adapter) +{ + int ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_host_tx_ring *tx_ring; + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + netif_napi_del(&sds_ring->napi); + } + + qlcnic_free_sds_rings(adapter->recv_ctx); + + if (qlcnic_check_multi_tx(adapter) && !adapter->ahw->diag_test) { + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + netif_napi_del(&tx_ring->napi); + } + } + + qlcnic_free_tx_rings(adapter); +} + +void qlcnic_82xx_napi_enable(struct qlcnic_adapter *adapter) +{ + int ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + + if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) + return; + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + napi_enable(&sds_ring->napi); + qlcnic_enable_sds_intr(adapter, sds_ring); + } + + if (qlcnic_check_multi_tx(adapter) && + (adapter->flags & QLCNIC_MSIX_ENABLED) && + !adapter->ahw->diag_test) { + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + napi_enable(&tx_ring->napi); + qlcnic_enable_tx_intr(adapter, tx_ring); + } + } +} + +void qlcnic_82xx_napi_disable(struct qlcnic_adapter *adapter) +{ + int ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + + if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) + return; + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + qlcnic_disable_sds_intr(adapter, sds_ring); + napi_synchronize(&sds_ring->napi); + napi_disable(&sds_ring->napi); + } + + if ((adapter->flags & QLCNIC_MSIX_ENABLED) && + !adapter->ahw->diag_test && + qlcnic_check_multi_tx(adapter)) { + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + qlcnic_disable_tx_intr(adapter, tx_ring); + napi_synchronize(&tx_ring->napi); + napi_disable(&tx_ring->napi); + } + } +} + +#define QLC_83XX_NORMAL_LB_PKT (1ULL << 36) +#define QLC_83XX_LRO_LB_PKT (1ULL << 46) + +static inline int qlcnic_83xx_is_lb_pkt(u64 sts_data, int lro_pkt) +{ + if (lro_pkt) + return (sts_data & QLC_83XX_LRO_LB_PKT) ? 1 : 0; + else + return (sts_data & QLC_83XX_NORMAL_LB_PKT) ? 
1 : 0; +} + +#define QLCNIC_ENCAP_LENGTH_MASK 0x7f + +static inline u8 qlcnic_encap_length(u64 sts_data) +{ + return sts_data & QLCNIC_ENCAP_LENGTH_MASK; +} + +static struct qlcnic_rx_buffer * +qlcnic_83xx_process_rcv(struct qlcnic_adapter *adapter, + struct qlcnic_host_sds_ring *sds_ring, + u8 ring, u64 sts_data[]) +{ + struct net_device *netdev = adapter->netdev; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_rx_buffer *buffer; + struct sk_buff *skb; + struct qlcnic_host_rds_ring *rds_ring; + int index, length, cksum, is_lb_pkt; + u16 vid = 0xffff; + int err; + + if (unlikely(ring >= adapter->max_rds_rings)) + return NULL; + + rds_ring = &recv_ctx->rds_rings[ring]; + + index = qlcnic_83xx_hndl(sts_data[0]); + if (unlikely(index >= rds_ring->num_desc)) + return NULL; + + buffer = &rds_ring->rx_buf_arr[index]; + length = qlcnic_83xx_pktln(sts_data[0]); + cksum = qlcnic_83xx_csum_status(sts_data[1]); + skb = qlcnic_process_rxbuf(adapter, rds_ring, index, cksum); + if (!skb) + return buffer; + + if (length > rds_ring->skb_size) + skb_put(skb, rds_ring->skb_size); + else + skb_put(skb, length); + + err = qlcnic_check_rx_tagging(adapter, skb, &vid); + + if (adapter->rx_mac_learn) { + is_lb_pkt = qlcnic_83xx_is_lb_pkt(sts_data[1], 0); + qlcnic_add_lb_filter(adapter, skb, is_lb_pkt, vid); + } + + if (unlikely(err)) { + adapter->stats.rxdropped++; + dev_kfree_skb(skb); + return buffer; + } + + skb->protocol = eth_type_trans(skb, netdev); + + if (qlcnic_encap_length(sts_data[1]) && + skb->ip_summed == CHECKSUM_UNNECESSARY) { + skb->csum_level = 1; + adapter->stats.encap_rx_csummed++; + } + + if (vid != 0xffff) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vid); + + napi_gro_receive(&sds_ring->napi, skb); + + adapter->stats.rx_pkts++; + adapter->stats.rxbytes += length; + + return buffer; +} + +static struct qlcnic_rx_buffer * +qlcnic_83xx_process_lro(struct qlcnic_adapter *adapter, + u8 ring, u64 sts_data[]) +{ + struct net_device *netdev = adapter->netdev; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_rx_buffer *buffer; + struct sk_buff *skb; + struct qlcnic_host_rds_ring *rds_ring; + struct iphdr *iph; + struct ipv6hdr *ipv6h; + struct tcphdr *th; + bool push; + int l2_hdr_offset, l4_hdr_offset; + int index, is_lb_pkt; + u16 lro_length, length, data_offset, gso_size; + u16 vid = 0xffff; + int err; + + if (unlikely(ring >= adapter->max_rds_rings)) + return NULL; + + rds_ring = &recv_ctx->rds_rings[ring]; + + index = qlcnic_83xx_hndl(sts_data[0]); + if (unlikely(index >= rds_ring->num_desc)) + return NULL; + + buffer = &rds_ring->rx_buf_arr[index]; + + lro_length = qlcnic_83xx_lro_pktln(sts_data[0]); + l2_hdr_offset = qlcnic_83xx_l2_hdr_off(sts_data[1]); + l4_hdr_offset = qlcnic_83xx_l4_hdr_off(sts_data[1]); + push = qlcnic_83xx_is_psh_bit(sts_data[1]); + + skb = qlcnic_process_rxbuf(adapter, rds_ring, index, STATUS_CKSUM_OK); + if (!skb) + return buffer; + + if (qlcnic_83xx_is_tstamp(sts_data[1])) + data_offset = l4_hdr_offset + QLCNIC_TCP_TS_HDR_SIZE; + else + data_offset = l4_hdr_offset + QLCNIC_TCP_HDR_SIZE; + + skb_put(skb, lro_length + data_offset); + skb_pull(skb, l2_hdr_offset); + + err = qlcnic_check_rx_tagging(adapter, skb, &vid); + + if (adapter->rx_mac_learn) { + is_lb_pkt = qlcnic_83xx_is_lb_pkt(sts_data[1], 1); + qlcnic_add_lb_filter(adapter, skb, is_lb_pkt, vid); + } + + if (unlikely(err)) { + adapter->stats.rxdropped++; + dev_kfree_skb(skb); + return buffer; + } + + skb->protocol = eth_type_trans(skb, netdev); + if 
(ntohs(skb->protocol) == ETH_P_IPV6) { + ipv6h = (struct ipv6hdr *)skb->data; + th = (struct tcphdr *)(skb->data + sizeof(struct ipv6hdr)); + + length = (th->doff << 2) + lro_length; + ipv6h->payload_len = htons(length); + } else { + iph = (struct iphdr *)skb->data; + th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); + length = (iph->ihl << 2) + (th->doff << 2) + lro_length; + csum_replace2(&iph->check, iph->tot_len, htons(length)); + iph->tot_len = htons(length); + } + + th->psh = push; + length = skb->len; + + if (adapter->flags & QLCNIC_FW_LRO_MSS_CAP) { + gso_size = qlcnic_83xx_get_lro_sts_mss(sts_data[0]); + skb_shinfo(skb)->gso_size = gso_size; + if (skb->protocol == htons(ETH_P_IPV6)) + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + else + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + } + + if (vid != 0xffff) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vid); + + netif_receive_skb(skb); + + adapter->stats.lro_pkts++; + adapter->stats.lrobytes += length; + return buffer; +} + +static int qlcnic_83xx_process_rcv_ring(struct qlcnic_host_sds_ring *sds_ring, + int max) +{ + struct qlcnic_host_rds_ring *rds_ring; + struct qlcnic_adapter *adapter = sds_ring->adapter; + struct list_head *cur; + struct status_desc *desc; + struct qlcnic_rx_buffer *rxbuf = NULL; + u8 ring; + u64 sts_data[2]; + int count = 0, opcode; + u32 consumer = sds_ring->consumer; + + while (count < max) { + desc = &sds_ring->desc_head[consumer]; + sts_data[1] = le64_to_cpu(desc->status_desc_data[1]); + opcode = qlcnic_83xx_opcode(sts_data[1]); + if (!opcode) + break; + sts_data[0] = le64_to_cpu(desc->status_desc_data[0]); + ring = QLCNIC_FETCH_RING_ID(sts_data[0]); + + switch (opcode) { + case QLC_83XX_REG_DESC: + rxbuf = qlcnic_83xx_process_rcv(adapter, sds_ring, + ring, sts_data); + break; + case QLC_83XX_LRO_DESC: + rxbuf = qlcnic_83xx_process_lro(adapter, ring, + sts_data); + break; + default: + dev_info(&adapter->pdev->dev, + "Unknown opcode: 0x%x\n", opcode); + goto skip; + } + + if (likely(rxbuf)) + list_add_tail(&rxbuf->list, &sds_ring->free_list[ring]); + else + adapter->stats.null_rxbuf++; +skip: + desc = &sds_ring->desc_head[consumer]; + /* Reset the descriptor */ + desc->status_desc_data[1] = 0; + consumer = get_next_index(consumer, sds_ring->num_desc); + count++; + } + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &adapter->recv_ctx->rds_rings[ring]; + if (!list_empty(&sds_ring->free_list[ring])) { + list_for_each(cur, &sds_ring->free_list[ring]) { + rxbuf = list_entry(cur, struct qlcnic_rx_buffer, + list); + qlcnic_alloc_rx_skb(adapter, rds_ring, rxbuf); + } + spin_lock(&rds_ring->lock); + list_splice_tail_init(&sds_ring->free_list[ring], + &rds_ring->free_list); + spin_unlock(&rds_ring->lock); + } + qlcnic_post_rx_buffers_nodb(adapter, rds_ring, ring); + } + if (count) { + sds_ring->consumer = consumer; + writel(consumer, sds_ring->crb_sts_consumer); + } + return count; +} + +static int qlcnic_83xx_msix_sriov_vf_poll(struct napi_struct *napi, int budget) +{ + int tx_complete; + int work_done; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_adapter *adapter; + struct qlcnic_host_tx_ring *tx_ring; + + sds_ring = container_of(napi, struct qlcnic_host_sds_ring, napi); + adapter = sds_ring->adapter; + /* tx ring count = 1 */ + tx_ring = adapter->tx_ring; + + tx_complete = qlcnic_process_cmd_ring(adapter, tx_ring, budget); + work_done = qlcnic_83xx_process_rcv_ring(sds_ring, budget); + + /* Check if we need a repoll */ + if (!tx_complete) + work_done = budget; + + if 
(work_done < budget) { + napi_complete_done(&sds_ring->napi, work_done); + qlcnic_enable_sds_intr(adapter, sds_ring); + } + + return work_done; +} + +static int qlcnic_83xx_poll(struct napi_struct *napi, int budget) +{ + int tx_complete; + int work_done; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_adapter *adapter; + struct qlcnic_host_tx_ring *tx_ring; + + sds_ring = container_of(napi, struct qlcnic_host_sds_ring, napi); + adapter = sds_ring->adapter; + /* tx ring count = 1 */ + tx_ring = adapter->tx_ring; + + tx_complete = qlcnic_process_cmd_ring(adapter, tx_ring, budget); + work_done = qlcnic_83xx_process_rcv_ring(sds_ring, budget); + + /* Check if we need a repoll */ + if (!tx_complete) + work_done = budget; + + if (work_done < budget) { + napi_complete_done(&sds_ring->napi, work_done); + qlcnic_enable_sds_intr(adapter, sds_ring); + } + + return work_done; +} + +static int qlcnic_83xx_msix_tx_poll(struct napi_struct *napi, int budget) +{ + int work_done; + struct qlcnic_host_tx_ring *tx_ring; + struct qlcnic_adapter *adapter; + + tx_ring = container_of(napi, struct qlcnic_host_tx_ring, napi); + adapter = tx_ring->adapter; + work_done = qlcnic_process_cmd_ring(adapter, tx_ring, budget); + if (work_done) { + napi_complete(&tx_ring->napi); + if (test_bit(__QLCNIC_DEV_UP , &adapter->state)) + qlcnic_enable_tx_intr(adapter, tx_ring); + } else { + /* need a repoll */ + work_done = budget; + } + + return work_done; +} + +static int qlcnic_83xx_rx_poll(struct napi_struct *napi, int budget) +{ + int work_done; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_adapter *adapter; + + sds_ring = container_of(napi, struct qlcnic_host_sds_ring, napi); + adapter = sds_ring->adapter; + work_done = qlcnic_83xx_process_rcv_ring(sds_ring, budget); + if (work_done < budget) { + napi_complete_done(&sds_ring->napi, work_done); + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) + qlcnic_enable_sds_intr(adapter, sds_ring); + } + + return work_done; +} + +void qlcnic_83xx_napi_enable(struct qlcnic_adapter *adapter) +{ + int ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + + if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) + return; + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + napi_enable(&sds_ring->napi); + if (adapter->flags & QLCNIC_MSIX_ENABLED) + qlcnic_enable_sds_intr(adapter, sds_ring); + } + + if ((adapter->flags & QLCNIC_MSIX_ENABLED) && + !(adapter->flags & QLCNIC_TX_INTR_SHARED)) { + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + napi_enable(&tx_ring->napi); + qlcnic_enable_tx_intr(adapter, tx_ring); + } + } +} + +void qlcnic_83xx_napi_disable(struct qlcnic_adapter *adapter) +{ + int ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_host_tx_ring *tx_ring; + + if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) + return; + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + if (adapter->flags & QLCNIC_MSIX_ENABLED) + qlcnic_disable_sds_intr(adapter, sds_ring); + napi_synchronize(&sds_ring->napi); + napi_disable(&sds_ring->napi); + } + + if ((adapter->flags & QLCNIC_MSIX_ENABLED) && + !(adapter->flags & QLCNIC_TX_INTR_SHARED)) { + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + qlcnic_disable_tx_intr(adapter, tx_ring); + 
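/* Note the teardown order: the ring interrupt is masked first so no new poll
 * can be scheduled, napi_synchronize() then waits for a poll that may still
 * be running on another CPU, and only afterwards is NAPI disabled. */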
napi_synchronize(&tx_ring->napi); + napi_disable(&tx_ring->napi); + } + } +} + +int qlcnic_83xx_napi_add(struct qlcnic_adapter *adapter, + struct net_device *netdev) +{ + int ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + + if (qlcnic_alloc_sds_rings(recv_ctx, adapter->drv_sds_rings)) + return -ENOMEM; + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + if (adapter->flags & QLCNIC_MSIX_ENABLED) { + if (!(adapter->flags & QLCNIC_TX_INTR_SHARED)) + netif_napi_add(netdev, &sds_ring->napi, + qlcnic_83xx_rx_poll, + NAPI_POLL_WEIGHT); + else + netif_napi_add(netdev, &sds_ring->napi, + qlcnic_83xx_msix_sriov_vf_poll, + NAPI_POLL_WEIGHT); + + } else { + netif_napi_add(netdev, &sds_ring->napi, + qlcnic_83xx_poll, + NAPI_POLL_WEIGHT); + } + } + + if (qlcnic_alloc_tx_rings(adapter, netdev)) { + qlcnic_free_sds_rings(recv_ctx); + return -ENOMEM; + } + + if ((adapter->flags & QLCNIC_MSIX_ENABLED) && + !(adapter->flags & QLCNIC_TX_INTR_SHARED)) { + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + netif_tx_napi_add(netdev, &tx_ring->napi, + qlcnic_83xx_msix_tx_poll, + NAPI_POLL_WEIGHT); + } + } + + return 0; +} + +void qlcnic_83xx_napi_del(struct qlcnic_adapter *adapter) +{ + int ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct qlcnic_host_tx_ring *tx_ring; + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + netif_napi_del(&sds_ring->napi); + } + + qlcnic_free_sds_rings(adapter->recv_ctx); + + if ((adapter->flags & QLCNIC_MSIX_ENABLED) && + !(adapter->flags & QLCNIC_TX_INTR_SHARED)) { + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + netif_napi_del(&tx_ring->napi); + } + } + + qlcnic_free_tx_rings(adapter); +} + +static void qlcnic_83xx_process_rcv_diag(struct qlcnic_adapter *adapter, + int ring, u64 sts_data[]) +{ + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct sk_buff *skb; + struct qlcnic_host_rds_ring *rds_ring; + int index, length; + + if (unlikely(ring >= adapter->max_rds_rings)) + return; + + rds_ring = &recv_ctx->rds_rings[ring]; + index = qlcnic_83xx_hndl(sts_data[0]); + if (unlikely(index >= rds_ring->num_desc)) + return; + + length = qlcnic_83xx_pktln(sts_data[0]); + + skb = qlcnic_process_rxbuf(adapter, rds_ring, index, STATUS_CKSUM_OK); + if (!skb) + return; + + if (length > rds_ring->skb_size) + skb_put(skb, rds_ring->skb_size); + else + skb_put(skb, length); + + if (!qlcnic_check_loopback_buff(skb->data, adapter->mac_addr)) + adapter->ahw->diag_cnt++; + else + dump_skb(skb, adapter); + + dev_kfree_skb_any(skb); + return; +} + +void qlcnic_83xx_process_rcv_ring_diag(struct qlcnic_host_sds_ring *sds_ring) +{ + struct qlcnic_adapter *adapter = sds_ring->adapter; + struct status_desc *desc; + u64 sts_data[2]; + int ring, opcode; + u32 consumer = sds_ring->consumer; + + desc = &sds_ring->desc_head[consumer]; + sts_data[0] = le64_to_cpu(desc->status_desc_data[0]); + sts_data[1] = le64_to_cpu(desc->status_desc_data[1]); + opcode = qlcnic_83xx_opcode(sts_data[1]); + if (!opcode) + return; + + ring = QLCNIC_FETCH_RING_ID(sts_data[0]); + qlcnic_83xx_process_rcv_diag(adapter, ring, sts_data); + desc = &sds_ring->desc_head[consumer]; + desc->status_desc_data[0] = cpu_to_le64(STATUS_OWNER_PHANTOM); + consumer = 
get_next_index(consumer, sds_ring->num_desc); + sds_ring->consumer = consumer; + writel(consumer, sds_ring->crb_sts_consumer); +} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c new file mode 100644 index 0000000..1b5f7d5 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -0,0 +1,4362 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qlcnic.h" +#include "qlcnic_sriov.h" +#include "qlcnic_hw.h" + +MODULE_DESCRIPTION("QLogic 1/10 GbE Converged/Intelligent Ethernet Driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(QLCNIC_LINUX_VERSIONID); +MODULE_FIRMWARE(QLCNIC_UNIFIED_ROMIMAGE_NAME); + +char qlcnic_driver_name[] = "qlcnic"; +static const char qlcnic_driver_string[] = "QLogic 1/10 GbE " + "Converged/Intelligent Ethernet Driver v" QLCNIC_LINUX_VERSIONID; + +static int qlcnic_mac_learn; +module_param(qlcnic_mac_learn, int, 0444); +MODULE_PARM_DESC(qlcnic_mac_learn, + "Mac Filter (0=learning is disabled, 1=Driver learning is enabled, 2=FDB learning is enabled)"); + +int qlcnic_use_msi = 1; +MODULE_PARM_DESC(use_msi, "MSI interrupt (0=disabled, 1=enabled)"); +module_param_named(use_msi, qlcnic_use_msi, int, 0444); + +int qlcnic_use_msi_x = 1; +MODULE_PARM_DESC(use_msi_x, "MSI-X interrupt (0=disabled, 1=enabled)"); +module_param_named(use_msi_x, qlcnic_use_msi_x, int, 0444); + +int qlcnic_auto_fw_reset = 1; +MODULE_PARM_DESC(auto_fw_reset, "Auto firmware reset (0=disabled, 1=enabled)"); +module_param_named(auto_fw_reset, qlcnic_auto_fw_reset, int, 0644); + +int qlcnic_load_fw_file; +MODULE_PARM_DESC(load_fw_file, "Load firmware from (0=flash, 1=file, 2=POST in fast mode, 3= POST in medium mode, 4=POST in slow mode)"); +module_param_named(load_fw_file, qlcnic_load_fw_file, int, 0444); + +static int qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent); +static void qlcnic_remove(struct pci_dev *pdev); +static int qlcnic_open(struct net_device *netdev); +static int qlcnic_close(struct net_device *netdev); +static void qlcnic_tx_timeout(struct net_device *netdev); +static void qlcnic_attach_work(struct work_struct *work); +static void qlcnic_fwinit_work(struct work_struct *work); +#ifdef CONFIG_NET_POLL_CONTROLLER +static void qlcnic_poll_controller(struct net_device *netdev); +#endif + +static void qlcnic_idc_debug_info(struct qlcnic_adapter *adapter, u8 encoding); +static int qlcnic_can_start_firmware(struct qlcnic_adapter *adapter); + +static irqreturn_t qlcnic_tmp_intr(int irq, void *data); +static irqreturn_t qlcnic_intr(int irq, void *data); +static irqreturn_t qlcnic_msi_intr(int irq, void *data); +static irqreturn_t qlcnic_msix_intr(int irq, void *data); +static irqreturn_t qlcnic_msix_tx_intr(int irq, void *data); + +static struct net_device_stats *qlcnic_get_stats(struct net_device *netdev); +static int qlcnic_start_firmware(struct qlcnic_adapter *); + +static void qlcnic_free_lb_filters_mem(struct qlcnic_adapter *adapter); +static void qlcnic_dev_set_npar_ready(struct qlcnic_adapter *); +static int qlcnicvf_start_firmware(struct qlcnic_adapter *); +static int qlcnic_vlan_rx_add(struct net_device *, __be16, u16); +static int qlcnic_vlan_rx_del(struct net_device *, __be16, u16); + +static int qlcnic_82xx_setup_intr(struct qlcnic_adapter *); +static void 
qlcnic_82xx_dev_request_reset(struct qlcnic_adapter *, u32); +static irqreturn_t qlcnic_82xx_clear_legacy_intr(struct qlcnic_adapter *); +static pci_ers_result_t qlcnic_82xx_io_slot_reset(struct pci_dev *); +static int qlcnic_82xx_start_firmware(struct qlcnic_adapter *); +static void qlcnic_82xx_io_resume(struct pci_dev *); +static void qlcnic_82xx_set_mac_filter_count(struct qlcnic_adapter *); +static pci_ers_result_t qlcnic_82xx_io_error_detected(struct pci_dev *, + pci_channel_state_t); +static u32 qlcnic_vlan_tx_check(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (adapter->pdev->device == PCI_DEVICE_ID_QLOGIC_QLE824X) + return ahw->capabilities & QLCNIC_FW_CAPABILITY_FVLANTX; + else + return 1; +} + +/* PCI Device ID Table */ +#define ENTRY(device) \ + {PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, (device)), \ + .class = PCI_CLASS_NETWORK_ETHERNET << 8, .class_mask = ~0} + +static const struct pci_device_id qlcnic_pci_tbl[] = { + ENTRY(PCI_DEVICE_ID_QLOGIC_QLE824X), + ENTRY(PCI_DEVICE_ID_QLOGIC_QLE834X), + ENTRY(PCI_DEVICE_ID_QLOGIC_VF_QLE834X), + ENTRY(PCI_DEVICE_ID_QLOGIC_QLE8830), + ENTRY(PCI_DEVICE_ID_QLOGIC_VF_QLE8C30), + ENTRY(PCI_DEVICE_ID_QLOGIC_QLE844X), + ENTRY(PCI_DEVICE_ID_QLOGIC_VF_QLE844X), + {0,} +}; + +MODULE_DEVICE_TABLE(pci, qlcnic_pci_tbl); + + +inline void qlcnic_update_cmd_producer(struct qlcnic_host_tx_ring *tx_ring) +{ + writel(tx_ring->producer, tx_ring->crb_cmd_producer); +} + +static const u32 msi_tgt_status[8] = { + ISR_INT_TARGET_STATUS, ISR_INT_TARGET_STATUS_F1, + ISR_INT_TARGET_STATUS_F2, ISR_INT_TARGET_STATUS_F3, + ISR_INT_TARGET_STATUS_F4, ISR_INT_TARGET_STATUS_F5, + ISR_INT_TARGET_STATUS_F6, ISR_INT_TARGET_STATUS_F7 +}; + +static const u32 qlcnic_reg_tbl[] = { + 0x1B20A8, /* PEG_HALT_STAT1 */ + 0x1B20AC, /* PEG_HALT_STAT2 */ + 0x1B20B0, /* FW_HEARTBEAT */ + 0x1B2100, /* LOCK ID */ + 0x1B2128, /* FW_CAPABILITIES */ + 0x1B2138, /* drv active */ + 0x1B2140, /* dev state */ + 0x1B2144, /* drv state */ + 0x1B2148, /* drv scratch */ + 0x1B214C, /* dev partition info */ + 0x1B2174, /* drv idc ver */ + 0x1B2150, /* fw version major */ + 0x1B2154, /* fw version minor */ + 0x1B2158, /* fw version sub */ + 0x1B219C, /* npar state */ + 0x1B21FC, /* FW_IMG_VALID */ + 0x1B2250, /* CMD_PEG_STATE */ + 0x1B233C, /* RCV_PEG_STATE */ + 0x1B23B4, /* ASIC TEMP */ + 0x1B216C, /* FW api */ + 0x1B2170, /* drv op mode */ + 0x13C010, /* flash lock */ + 0x13C014, /* flash unlock */ +}; + +static const struct qlcnic_board_info qlcnic_boards[] = { + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE844X, + 0x0, + 0x0, + "8400 series 10GbE Converged Network Adapter (TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + PCI_VENDOR_ID_QLOGIC, + 0x24e, + "8300 Series Dual Port 10GbE Converged Network Adapter " + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + PCI_VENDOR_ID_QLOGIC, + 0x243, + "8300 Series Single Port 10GbE Converged Network Adapter " + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + PCI_VENDOR_ID_QLOGIC, + 0x24a, + "8300 Series Dual Port 10GbE Converged Network Adapter " + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + PCI_VENDOR_ID_QLOGIC, + 0x246, + "8300 Series Dual Port 10GbE Converged Network Adapter " + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + PCI_VENDOR_ID_QLOGIC, + 0x252, + "8300 Series Dual Port 10GbE Converged Network Adapter " + "(TCP/IP 
Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + PCI_VENDOR_ID_QLOGIC, + 0x26e, + "8300 Series Dual Port 10GbE Converged Network Adapter " + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + PCI_VENDOR_ID_QLOGIC, + 0x260, + "8300 Series Dual Port 10GbE Converged Network Adapter " + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + PCI_VENDOR_ID_QLOGIC, + 0x266, + "8300 Series Single Port 10GbE Converged Network Adapter " + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + PCI_VENDOR_ID_QLOGIC, + 0x269, + "8300 Series Dual Port 10GbE Converged Network Adapter " + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + PCI_VENDOR_ID_QLOGIC, + 0x271, + "8300 Series Dual Port 10GbE Converged Network Adapter " + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE834X, + 0x0, 0x0, "8300 Series 1/10GbE Controller" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE8830, + 0x0, + 0x0, + "8830 Series 1/10GbE Controller" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE824X, + PCI_VENDOR_ID_QLOGIC, + 0x203, + "8200 Series Single Port 10GbE Converged Network Adapter" + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE824X, + PCI_VENDOR_ID_QLOGIC, + 0x207, + "8200 Series Dual Port 10GbE Converged Network Adapter" + "(TCP/IP Networking)" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE824X, + PCI_VENDOR_ID_QLOGIC, + 0x20b, + "3200 Series Dual Port 10Gb Intelligent Ethernet Adapter" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE824X, + PCI_VENDOR_ID_QLOGIC, + 0x20c, + "3200 Series Quad Port 1Gb Intelligent Ethernet Adapter" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE824X, + PCI_VENDOR_ID_QLOGIC, + 0x20f, + "3200 Series Single Port 10Gb Intelligent Ethernet Adapter" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE824X, + 0x103c, 0x3733, + "NC523SFP 10Gb 2-port Server Adapter" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE824X, + 0x103c, 0x3346, + "CN1000Q Dual Port Converged Network Adapter" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE824X, + PCI_VENDOR_ID_QLOGIC, + 0x210, + "QME8242-k 10GbE Dual Port Mezzanine Card" }, + { PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_QLE824X, + 0x0, 0x0, "cLOM8214 1/10GbE Controller" }, +}; + +#define NUM_SUPPORTED_BOARDS ARRAY_SIZE(qlcnic_boards) + +static const +struct qlcnic_legacy_intr_set legacy_intr[] = QLCNIC_LEGACY_INTR_CONFIG; + +int qlcnic_alloc_sds_rings(struct qlcnic_recv_context *recv_ctx, int count) +{ + int size = sizeof(struct qlcnic_host_sds_ring) * count; + + recv_ctx->sds_rings = kzalloc(size, GFP_KERNEL); + + return recv_ctx->sds_rings == NULL; +} + +void qlcnic_free_sds_rings(struct qlcnic_recv_context *recv_ctx) +{ + kfree(recv_ctx->sds_rings); + recv_ctx->sds_rings = NULL; +} + +int qlcnic_read_mac_addr(struct qlcnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; + u8 mac_addr[ETH_ALEN]; + int ret; + + ret = qlcnic_get_mac_address(adapter, mac_addr, + adapter->ahw->pci_func); + if (ret) + return ret; + + memcpy(netdev->dev_addr, mac_addr, ETH_ALEN); + memcpy(adapter->mac_addr, netdev->dev_addr, netdev->addr_len); + + /* set station address */ + + if (!is_valid_ether_addr(netdev->dev_addr)) + dev_warn(&pdev->dev, "Bad MAC address %pM.\n", + netdev->dev_addr); + + return 0; +} + +static void 
qlcnic_delete_adapter_mac(struct qlcnic_adapter *adapter) +{ + struct qlcnic_mac_vlan_list *cur; + struct list_head *head; + + list_for_each(head, &adapter->mac_list) { + cur = list_entry(head, struct qlcnic_mac_vlan_list, list); + if (ether_addr_equal_unaligned(adapter->mac_addr, cur->mac_addr)) { + qlcnic_sre_macaddr_change(adapter, cur->mac_addr, + 0, QLCNIC_MAC_DEL); + list_del(&cur->list); + kfree(cur); + return; + } + } +} + +static int qlcnic_set_mac(struct net_device *netdev, void *p) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct sockaddr *addr = p; + + if (qlcnic_sriov_vf_check(adapter)) + return -EINVAL; + + if ((adapter->flags & QLCNIC_MAC_OVERRIDE_DISABLED)) + return -EOPNOTSUPP; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EINVAL; + + if (ether_addr_equal_unaligned(adapter->mac_addr, addr->sa_data) && + ether_addr_equal_unaligned(netdev->dev_addr, addr->sa_data)) + return 0; + + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) { + netif_device_detach(netdev); + qlcnic_napi_disable(adapter); + } + + qlcnic_delete_adapter_mac(adapter); + memcpy(adapter->mac_addr, addr->sa_data, netdev->addr_len); + memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len); + qlcnic_set_multi(adapter->netdev); + + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) { + netif_device_attach(netdev); + qlcnic_napi_enable(adapter); + } + return 0; +} + +static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *netdev, + const unsigned char *addr, u16 vid) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int err = -EOPNOTSUPP; + + if (!adapter->fdb_mac_learn) + return ndo_dflt_fdb_del(ndm, tb, netdev, addr, vid); + + if ((adapter->flags & QLCNIC_ESWITCH_ENABLED) || + qlcnic_sriov_check(adapter)) { + if (is_unicast_ether_addr(addr)) { + err = dev_uc_del(netdev, addr); + if (!err) + err = qlcnic_nic_del_mac(adapter, addr); + } else if (is_multicast_ether_addr(addr)) { + err = dev_mc_del(netdev, addr); + } else { + err = -EINVAL; + } + } + return err; +} + +static int qlcnic_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *netdev, + const unsigned char *addr, u16 vid, u16 flags) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int err = 0; + + if (!adapter->fdb_mac_learn) + return ndo_dflt_fdb_add(ndm, tb, netdev, addr, vid, flags); + + if (!(adapter->flags & QLCNIC_ESWITCH_ENABLED) && + !qlcnic_sriov_check(adapter)) { + pr_info("%s: FDB e-switch is not enabled\n", __func__); + return -EOPNOTSUPP; + } + + if (ether_addr_equal(addr, adapter->mac_addr)) + return err; + + if (is_unicast_ether_addr(addr)) { + if (netdev_uc_count(netdev) < adapter->ahw->max_uc_count) + err = dev_uc_add_excl(netdev, addr); + else + err = -ENOMEM; + } else if (is_multicast_ether_addr(addr)) { + err = dev_mc_add_excl(netdev, addr); + } else { + err = -EINVAL; + } + + return err; +} + +static int qlcnic_fdb_dump(struct sk_buff *skb, struct netlink_callback *ncb, + struct net_device *netdev, + struct net_device *filter_dev, int *idx) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int err = 0; + + if (!adapter->fdb_mac_learn) + return ndo_dflt_fdb_dump(skb, ncb, netdev, filter_dev, idx); + + if ((adapter->flags & QLCNIC_ESWITCH_ENABLED) || + qlcnic_sriov_check(adapter)) + err = ndo_dflt_fdb_dump(skb, ncb, netdev, filter_dev, idx); + + return err; +} + +static void qlcnic_82xx_cancel_idc_work(struct qlcnic_adapter *adapter) +{ + while (test_and_set_bit(__QLCNIC_RESETTING, &adapter->state)) + usleep_range(10000, 11000); + 
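/* The loop above claims the __QLCNIC_RESETTING bit, sleeping roughly 10 ms
 * between attempts while another path holds it. The work.func test below
 * skips the cancel when the delayed work was never initialized; otherwise any
 * queued firmware/IDC work is cancelled synchronously. */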
+ if (!adapter->fw_work.work.func) + return; + + cancel_delayed_work_sync(&adapter->fw_work); +} + +static int qlcnic_get_phys_port_id(struct net_device *netdev, + struct netdev_phys_item_id *ppid) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (!(adapter->flags & QLCNIC_HAS_PHYS_PORT_ID)) + return -EOPNOTSUPP; + + ppid->id_len = sizeof(ahw->phys_port_id); + memcpy(ppid->id, ahw->phys_port_id, ppid->id_len); + + return 0; +} + +static void qlcnic_add_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (ti->type != UDP_TUNNEL_TYPE_VXLAN) + return; + + /* Adapter supports only one VXLAN port. Use very first port + * for enabling offload + */ + if (!qlcnic_encap_rx_offload(adapter)) + return; + if (!ahw->vxlan_port_count) { + ahw->vxlan_port_count = 1; + ahw->vxlan_port = ntohs(ti->port); + adapter->flags |= QLCNIC_ADD_VXLAN_PORT; + return; + } + if (ahw->vxlan_port == ntohs(ti->port)) + ahw->vxlan_port_count++; + +} + +static void qlcnic_del_vxlan_port(struct net_device *netdev, + struct udp_tunnel_info *ti) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (ti->type != UDP_TUNNEL_TYPE_VXLAN) + return; + + if (!qlcnic_encap_rx_offload(adapter) || !ahw->vxlan_port_count || + (ahw->vxlan_port != ntohs(ti->port))) + return; + + ahw->vxlan_port_count--; + if (!ahw->vxlan_port_count) + adapter->flags |= QLCNIC_DEL_VXLAN_PORT; +} + +static netdev_features_t qlcnic_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + features = vlan_features_check(skb, features); + return vxlan_features_check(skb, features); +} + +static const struct net_device_ops qlcnic_netdev_ops = { + .ndo_open = qlcnic_open, + .ndo_stop = qlcnic_close, + .ndo_start_xmit = qlcnic_xmit_frame, + .ndo_get_stats = qlcnic_get_stats, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_rx_mode = qlcnic_set_multi, + .ndo_set_mac_address = qlcnic_set_mac, + .ndo_change_mtu = qlcnic_change_mtu, + .ndo_fix_features = qlcnic_fix_features, + .ndo_set_features = qlcnic_set_features, + .ndo_tx_timeout = qlcnic_tx_timeout, + .ndo_vlan_rx_add_vid = qlcnic_vlan_rx_add, + .ndo_vlan_rx_kill_vid = qlcnic_vlan_rx_del, + .ndo_fdb_add = qlcnic_fdb_add, + .ndo_fdb_del = qlcnic_fdb_del, + .ndo_fdb_dump = qlcnic_fdb_dump, + .ndo_get_phys_port_id = qlcnic_get_phys_port_id, + .ndo_udp_tunnel_add = qlcnic_add_vxlan_port, + .ndo_udp_tunnel_del = qlcnic_del_vxlan_port, + .ndo_features_check = qlcnic_features_check, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_poll_controller = qlcnic_poll_controller, +#endif +#ifdef CONFIG_QLCNIC_SRIOV + .ndo_set_vf_mac = qlcnic_sriov_set_vf_mac, + .ndo_set_vf_rate = qlcnic_sriov_set_vf_tx_rate, + .ndo_get_vf_config = qlcnic_sriov_get_vf_config, + .ndo_set_vf_vlan = qlcnic_sriov_set_vf_vlan, + .ndo_set_vf_spoofchk = qlcnic_sriov_set_vf_spoofchk, +#endif +}; + +static const struct net_device_ops qlcnic_netdev_failed_ops = { + .ndo_open = qlcnic_open, +}; + +static struct qlcnic_nic_template qlcnic_ops = { + .config_bridged_mode = qlcnic_config_bridged_mode, + .config_led = qlcnic_82xx_config_led, + .start_firmware = qlcnic_82xx_start_firmware, + .request_reset = qlcnic_82xx_dev_request_reset, + .cancel_idc_work = qlcnic_82xx_cancel_idc_work, + .napi_add = qlcnic_82xx_napi_add, + .napi_del = qlcnic_82xx_napi_del, 
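/* The remaining callbacks cover 82xx-specific IP-address notification,
 * shutdown/resume and legacy interrupt clearing; 83xx adapters appear to
 * install their own qlcnic_nic_template elsewhere in the driver. */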
+ .config_ipaddr = qlcnic_82xx_config_ipaddr, + .shutdown = qlcnic_82xx_shutdown, + .resume = qlcnic_82xx_resume, + .clear_legacy_intr = qlcnic_82xx_clear_legacy_intr, +}; + +struct qlcnic_nic_template qlcnic_vf_ops = { + .config_bridged_mode = qlcnicvf_config_bridged_mode, + .config_led = qlcnicvf_config_led, + .start_firmware = qlcnicvf_start_firmware +}; + +static struct qlcnic_hardware_ops qlcnic_hw_ops = { + .read_crb = qlcnic_82xx_read_crb, + .write_crb = qlcnic_82xx_write_crb, + .read_reg = qlcnic_82xx_hw_read_wx_2M, + .write_reg = qlcnic_82xx_hw_write_wx_2M, + .get_mac_address = qlcnic_82xx_get_mac_address, + .setup_intr = qlcnic_82xx_setup_intr, + .alloc_mbx_args = qlcnic_82xx_alloc_mbx_args, + .mbx_cmd = qlcnic_82xx_issue_cmd, + .get_func_no = qlcnic_82xx_get_func_no, + .api_lock = qlcnic_82xx_api_lock, + .api_unlock = qlcnic_82xx_api_unlock, + .add_sysfs = qlcnic_82xx_add_sysfs, + .remove_sysfs = qlcnic_82xx_remove_sysfs, + .process_lb_rcv_ring_diag = qlcnic_82xx_process_rcv_ring_diag, + .create_rx_ctx = qlcnic_82xx_fw_cmd_create_rx_ctx, + .create_tx_ctx = qlcnic_82xx_fw_cmd_create_tx_ctx, + .del_rx_ctx = qlcnic_82xx_fw_cmd_del_rx_ctx, + .del_tx_ctx = qlcnic_82xx_fw_cmd_del_tx_ctx, + .setup_link_event = qlcnic_82xx_linkevent_request, + .get_nic_info = qlcnic_82xx_get_nic_info, + .get_pci_info = qlcnic_82xx_get_pci_info, + .set_nic_info = qlcnic_82xx_set_nic_info, + .change_macvlan = qlcnic_82xx_sre_macaddr_change, + .napi_enable = qlcnic_82xx_napi_enable, + .napi_disable = qlcnic_82xx_napi_disable, + .config_intr_coal = qlcnic_82xx_config_intr_coalesce, + .config_rss = qlcnic_82xx_config_rss, + .config_hw_lro = qlcnic_82xx_config_hw_lro, + .config_loopback = qlcnic_82xx_set_lb_mode, + .clear_loopback = qlcnic_82xx_clear_lb_mode, + .config_promisc_mode = qlcnic_82xx_nic_set_promisc, + .change_l2_filter = qlcnic_82xx_change_filter, + .get_board_info = qlcnic_82xx_get_board_info, + .set_mac_filter_count = qlcnic_82xx_set_mac_filter_count, + .free_mac_list = qlcnic_82xx_free_mac_list, + .read_phys_port_id = qlcnic_82xx_read_phys_port_id, + .io_error_detected = qlcnic_82xx_io_error_detected, + .io_slot_reset = qlcnic_82xx_io_slot_reset, + .io_resume = qlcnic_82xx_io_resume, + .get_beacon_state = qlcnic_82xx_get_beacon_state, + .enable_sds_intr = qlcnic_82xx_enable_sds_intr, + .disable_sds_intr = qlcnic_82xx_disable_sds_intr, + .enable_tx_intr = qlcnic_82xx_enable_tx_intr, + .disable_tx_intr = qlcnic_82xx_disable_tx_intr, + .get_saved_state = qlcnic_82xx_get_saved_state, + .set_saved_state = qlcnic_82xx_set_saved_state, + .cache_tmpl_hdr_values = qlcnic_82xx_cache_tmpl_hdr_values, + .get_cap_size = qlcnic_82xx_get_cap_size, + .set_sys_info = qlcnic_82xx_set_sys_info, + .store_cap_mask = qlcnic_82xx_store_cap_mask, + .encap_rx_offload = qlcnic_82xx_encap_rx_offload, + .encap_tx_offload = qlcnic_82xx_encap_tx_offload, +}; + +static int qlcnic_check_multi_tx_capability(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + + if (qlcnic_82xx_check(adapter) && + (ahw->extra_capability[0] & QLCNIC_FW_CAPABILITY_2_MULTI_TX)) { + test_and_set_bit(__QLCNIC_MULTI_TX_UNIQUE, &adapter->state); + return 0; + } else { + return 1; + } +} + +static int qlcnic_max_rings(struct qlcnic_adapter *adapter, u8 ring_cnt, + int queue_type) +{ + int num_rings, max_rings = QLCNIC_MAX_SDS_RINGS; + + if (queue_type == QLCNIC_RX_QUEUE) + max_rings = adapter->max_sds_rings; + else if (queue_type == QLCNIC_TX_QUEUE) + max_rings = adapter->max_tx_rings; + + num_rings = 
rounddown_pow_of_two(min_t(int, num_online_cpus(), + max_rings)); + + if (ring_cnt > num_rings) + return num_rings; + else + return ring_cnt; +} + +void qlcnic_set_tx_ring_count(struct qlcnic_adapter *adapter, u8 tx_cnt) +{ + /* 83xx adapter does not have max_tx_rings intialized in probe */ + if (adapter->max_tx_rings) + adapter->drv_tx_rings = qlcnic_max_rings(adapter, tx_cnt, + QLCNIC_TX_QUEUE); + else + adapter->drv_tx_rings = tx_cnt; +} + +void qlcnic_set_sds_ring_count(struct qlcnic_adapter *adapter, u8 rx_cnt) +{ + /* 83xx adapter does not have max_sds_rings intialized in probe */ + if (adapter->max_sds_rings) + adapter->drv_sds_rings = qlcnic_max_rings(adapter, rx_cnt, + QLCNIC_RX_QUEUE); + else + adapter->drv_sds_rings = rx_cnt; +} + +int qlcnic_setup_tss_rss_intr(struct qlcnic_adapter *adapter) +{ + struct pci_dev *pdev = adapter->pdev; + int num_msix = 0, err = 0, vector; + + adapter->flags &= ~QLCNIC_TSS_RSS; + + if (adapter->drv_tss_rings > 0) + num_msix += adapter->drv_tss_rings; + else + num_msix += adapter->drv_tx_rings; + + if (adapter->drv_rss_rings > 0) + num_msix += adapter->drv_rss_rings; + else + num_msix += adapter->drv_sds_rings; + + if (qlcnic_83xx_check(adapter)) + num_msix += 1; + + if (!adapter->msix_entries) { + adapter->msix_entries = kcalloc(num_msix, + sizeof(struct msix_entry), + GFP_KERNEL); + if (!adapter->msix_entries) + return -ENOMEM; + } + + for (vector = 0; vector < num_msix; vector++) + adapter->msix_entries[vector].entry = vector; + +restore: + err = pci_enable_msix_exact(pdev, adapter->msix_entries, num_msix); + if (err == -ENOSPC) { + if (!adapter->drv_tss_rings && !adapter->drv_rss_rings) + return err; + + netdev_info(adapter->netdev, + "Unable to allocate %d MSI-X vectors, Available vectors %d\n", + num_msix, err); + + num_msix = adapter->drv_tx_rings + adapter->drv_sds_rings; + + /* Set rings to 0 so we can restore original TSS/RSS count */ + adapter->drv_tss_rings = 0; + adapter->drv_rss_rings = 0; + + if (qlcnic_83xx_check(adapter)) + num_msix += 1; + + netdev_info(adapter->netdev, + "Restoring %d Tx, %d SDS rings for total %d vectors.\n", + adapter->drv_tx_rings, adapter->drv_sds_rings, + num_msix); + + goto restore; + } else if (err < 0) { + return err; + } + + adapter->ahw->num_msix = num_msix; + if (adapter->drv_tss_rings > 0) + adapter->drv_tx_rings = adapter->drv_tss_rings; + + if (adapter->drv_rss_rings > 0) + adapter->drv_sds_rings = adapter->drv_rss_rings; + + return 0; +} + +int qlcnic_enable_msix(struct qlcnic_adapter *adapter, u32 num_msix) +{ + struct pci_dev *pdev = adapter->pdev; + int err, vector; + + if (!adapter->msix_entries) { + adapter->msix_entries = kcalloc(num_msix, + sizeof(struct msix_entry), + GFP_KERNEL); + if (!adapter->msix_entries) + return -ENOMEM; + } + + adapter->flags &= ~(QLCNIC_MSI_ENABLED | QLCNIC_MSIX_ENABLED); + + if (adapter->ahw->msix_supported) { +enable_msix: + for (vector = 0; vector < num_msix; vector++) + adapter->msix_entries[vector].entry = vector; + + err = pci_enable_msix_range(pdev, + adapter->msix_entries, 1, num_msix); + + if (err == num_msix) { + adapter->flags |= QLCNIC_MSIX_ENABLED; + adapter->ahw->num_msix = num_msix; + dev_info(&pdev->dev, "using msi-x interrupts\n"); + return 0; + } else if (err > 0) { + pci_disable_msix(pdev); + + dev_info(&pdev->dev, + "Unable to allocate %d MSI-X vectors, Available vectors %d\n", + num_msix, err); + + if (qlcnic_82xx_check(adapter)) { + num_msix = rounddown_pow_of_two(err); + if (err < QLCNIC_82XX_MINIMUM_VECTOR) + return -ENOSPC; + } else { + 
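/* 83xx: one vector is kept aside on top of the ring vectors (the same "+ 1"
 * added in qlcnic_setup_tss_rss_intr() above), so only the remaining err - 1
 * vectors are rounded down to a power of two before that vector is added
 * back. */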
num_msix = rounddown_pow_of_two(err - 1); + num_msix += 1; + if (err < QLCNIC_83XX_MINIMUM_VECTOR) + return -ENOSPC; + } + + if (qlcnic_82xx_check(adapter) && + !qlcnic_check_multi_tx(adapter)) { + adapter->drv_sds_rings = num_msix; + adapter->drv_tx_rings = QLCNIC_SINGLE_RING; + } else { + /* Distribute vectors equally */ + adapter->drv_tx_rings = num_msix / 2; + adapter->drv_sds_rings = adapter->drv_tx_rings; + } + + if (num_msix) { + dev_info(&pdev->dev, + "Trying to allocate %d MSI-X interrupt vectors\n", + num_msix); + goto enable_msix; + } + } else { + dev_info(&pdev->dev, + "Unable to allocate %d MSI-X vectors, err=%d\n", + num_msix, err); + return err; + } + } + + return -EIO; +} + +static int qlcnic_82xx_calculate_msix_vector(struct qlcnic_adapter *adapter) +{ + int num_msix; + + num_msix = adapter->drv_sds_rings; + + if (qlcnic_check_multi_tx(adapter)) + num_msix += adapter->drv_tx_rings; + else + num_msix += QLCNIC_SINGLE_RING; + + return num_msix; +} + +static int qlcnic_enable_msi_legacy(struct qlcnic_adapter *adapter) +{ + int err = 0; + u32 offset, mask_reg; + const struct qlcnic_legacy_intr_set *legacy_intrp; + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct pci_dev *pdev = adapter->pdev; + + if (qlcnic_use_msi && !pci_enable_msi(pdev)) { + adapter->flags |= QLCNIC_MSI_ENABLED; + offset = msi_tgt_status[adapter->ahw->pci_func]; + adapter->tgt_status_reg = qlcnic_get_ioaddr(adapter->ahw, + offset); + dev_info(&pdev->dev, "using msi interrupts\n"); + adapter->msix_entries[0].vector = pdev->irq; + return err; + } + + if (qlcnic_use_msi || qlcnic_use_msi_x) + return -EOPNOTSUPP; + + legacy_intrp = &legacy_intr[adapter->ahw->pci_func]; + adapter->ahw->int_vec_bit = legacy_intrp->int_vec_bit; + offset = legacy_intrp->tgt_status_reg; + adapter->tgt_status_reg = qlcnic_get_ioaddr(ahw, offset); + mask_reg = legacy_intrp->tgt_mask_reg; + adapter->tgt_mask_reg = qlcnic_get_ioaddr(ahw, mask_reg); + adapter->isr_int_vec = qlcnic_get_ioaddr(ahw, ISR_INT_VECTOR); + adapter->crb_int_state_reg = qlcnic_get_ioaddr(ahw, ISR_INT_STATE_REG); + dev_info(&pdev->dev, "using legacy interrupts\n"); + adapter->msix_entries[0].vector = pdev->irq; + return err; +} + +static int qlcnic_82xx_setup_intr(struct qlcnic_adapter *adapter) +{ + int num_msix, err = 0; + + if (adapter->flags & QLCNIC_TSS_RSS) { + err = qlcnic_setup_tss_rss_intr(adapter); + if (err < 0) + return err; + num_msix = adapter->ahw->num_msix; + } else { + num_msix = qlcnic_82xx_calculate_msix_vector(adapter); + + err = qlcnic_enable_msix(adapter, num_msix); + if (err == -ENOMEM) + return err; + + if (!(adapter->flags & QLCNIC_MSIX_ENABLED)) { + qlcnic_disable_multi_tx(adapter); + adapter->drv_sds_rings = QLCNIC_SINGLE_RING; + + err = qlcnic_enable_msi_legacy(adapter); + if (err) + return err; + } + } + + return 0; +} + +int qlcnic_82xx_mq_intrpt(struct qlcnic_adapter *adapter, int op_type) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int err, i; + + if (qlcnic_check_multi_tx(adapter) && + !ahw->diag_test && + (adapter->flags & QLCNIC_MSIX_ENABLED)) { + ahw->intr_tbl = vzalloc(ahw->num_msix * + sizeof(struct qlcnic_intrpt_config)); + if (!ahw->intr_tbl) + return -ENOMEM; + + for (i = 0; i < ahw->num_msix; i++) { + ahw->intr_tbl[i].type = QLCNIC_INTRPT_MSIX; + ahw->intr_tbl[i].id = i; + ahw->intr_tbl[i].src = 0; + } + + err = qlcnic_82xx_config_intrpt(adapter, 1); + if (err) + dev_err(&adapter->pdev->dev, + "Failed to configure Interrupt for %d vector\n", + ahw->num_msix); + return err; + } + + return 0; +} + 
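The MSI-X fallback in qlcnic_enable_msix() above retries with a smaller request whenever the PCI core grants fewer vectors than asked for: 82xx rounds the grant straight down to a power of two, while 83xx rounds down everything except its one reserved vector. The fragment below is only a user-space sketch of that arithmetic, not driver code; rounddown_pow2() is a local stand-in for the kernel's rounddown_pow_of_two(), and the value 7 is an arbitrary example grant.

#include <stdio.h>

/* Local stand-in for the kernel's rounddown_pow_of_two(). */
static unsigned int rounddown_pow2(unsigned int n)
{
	unsigned int p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned int granted = 7;	/* example: vectors actually granted */
	unsigned int retry_82xx = rounddown_pow2(granted);
	unsigned int retry_83xx = rounddown_pow2(granted - 1) + 1;

	/* Prints: granted=7 -> retry 82xx=4, retry 83xx=5 */
	printf("granted=%u -> retry 82xx=%u, retry 83xx=%u\n",
	       granted, retry_82xx, retry_83xx);
	return 0;
}

Whatever count the retry settles on is then re-split between SDS and Tx rings (or given entirely to SDS with a single Tx ring on 82xx parts without multi-Tx) before jumping back to the enable_msix label.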
+void qlcnic_teardown_intr(struct qlcnic_adapter *adapter) +{ + if (adapter->flags & QLCNIC_MSIX_ENABLED) + pci_disable_msix(adapter->pdev); + if (adapter->flags & QLCNIC_MSI_ENABLED) + pci_disable_msi(adapter->pdev); + + kfree(adapter->msix_entries); + adapter->msix_entries = NULL; + + if (adapter->ahw->intr_tbl) { + vfree(adapter->ahw->intr_tbl); + adapter->ahw->intr_tbl = NULL; + } +} + +static void qlcnic_cleanup_pci_map(struct qlcnic_hardware_context *ahw) +{ + if (ahw->pci_base0 != NULL) + iounmap(ahw->pci_base0); +} + +static int qlcnic_get_act_pci_func(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_pci_info *pci_info; + int ret; + + if (!(adapter->flags & QLCNIC_ESWITCH_ENABLED)) { + switch (ahw->port_type) { + case QLCNIC_GBE: + ahw->total_nic_func = QLCNIC_NIU_MAX_GBE_PORTS; + break; + case QLCNIC_XGBE: + ahw->total_nic_func = QLCNIC_NIU_MAX_XG_PORTS; + break; + } + return 0; + } + + if (ahw->op_mode == QLCNIC_MGMT_FUNC) + return 0; + + pci_info = kcalloc(ahw->max_vnic_func, sizeof(*pci_info), GFP_KERNEL); + if (!pci_info) + return -ENOMEM; + + ret = qlcnic_get_pci_info(adapter, pci_info); + kfree(pci_info); + return ret; +} + +static bool qlcnic_port_eswitch_cfg_capability(struct qlcnic_adapter *adapter) +{ + bool ret = false; + + if (qlcnic_84xx_check(adapter)) { + ret = true; + } else if (qlcnic_83xx_check(adapter)) { + if (adapter->ahw->extra_capability[0] & + QLCNIC_FW_CAPABILITY_2_PER_PORT_ESWITCH_CFG) + ret = true; + else + ret = false; + } + + return ret; +} + +int qlcnic_init_pci_info(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_pci_info *pci_info; + int i, id = 0, ret = 0, j = 0; + u16 act_pci_func; + u8 pfn; + + pci_info = kcalloc(ahw->max_vnic_func, sizeof(*pci_info), GFP_KERNEL); + if (!pci_info) + return -ENOMEM; + + ret = qlcnic_get_pci_info(adapter, pci_info); + if (ret) + goto err_pci_info; + + act_pci_func = ahw->total_nic_func; + + adapter->npars = kzalloc(sizeof(struct qlcnic_npar_info) * + act_pci_func, GFP_KERNEL); + if (!adapter->npars) { + ret = -ENOMEM; + goto err_pci_info; + } + + adapter->eswitch = kzalloc(sizeof(struct qlcnic_eswitch) * + QLCNIC_NIU_MAX_XG_PORTS, GFP_KERNEL); + if (!adapter->eswitch) { + ret = -ENOMEM; + goto err_npars; + } + + for (i = 0; i < ahw->max_vnic_func; i++) { + pfn = pci_info[i].id; + + if (pfn >= ahw->max_vnic_func) { + ret = -EINVAL; + dev_err(&adapter->pdev->dev, "%s: Invalid function 0x%x, max 0x%x\n", + __func__, pfn, ahw->max_vnic_func); + goto err_eswitch; + } + + if (!pci_info[i].active || + (pci_info[i].type != QLCNIC_TYPE_NIC)) + continue; + + if (qlcnic_port_eswitch_cfg_capability(adapter)) { + if (!qlcnic_83xx_set_port_eswitch_status(adapter, pfn, + &id)) + adapter->npars[j].eswitch_status = true; + else + continue; + } else { + adapter->npars[j].eswitch_status = true; + } + + adapter->npars[j].pci_func = pfn; + adapter->npars[j].active = (u8)pci_info[i].active; + adapter->npars[j].type = (u8)pci_info[i].type; + adapter->npars[j].phy_port = (u8)pci_info[i].default_port; + adapter->npars[j].min_bw = pci_info[i].tx_min_bw; + adapter->npars[j].max_bw = pci_info[i].tx_max_bw; + + memcpy(&adapter->npars[j].mac, &pci_info[i].mac, ETH_ALEN); + j++; + } + + /* Update eSwitch status for adapters without per port eSwitch + * configuration capability + */ + if (!qlcnic_port_eswitch_cfg_capability(adapter)) { + for (i = 0; i < QLCNIC_NIU_MAX_XG_PORTS; i++) + adapter->eswitch[i].flags |= QLCNIC_SWITCH_ENABLE; 
+ } + + kfree(pci_info); + return 0; + +err_eswitch: + kfree(adapter->eswitch); + adapter->eswitch = NULL; +err_npars: + kfree(adapter->npars); + adapter->npars = NULL; +err_pci_info: + kfree(pci_info); + + return ret; +} + +static int +qlcnic_set_function_modes(struct qlcnic_adapter *adapter) +{ + u8 id; + int ret; + u32 data = QLCNIC_MGMT_FUNC; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + ret = qlcnic_api_lock(adapter); + if (ret) + goto err_lock; + + id = ahw->pci_func; + data = QLC_SHARED_REG_RD32(adapter, QLCNIC_DRV_OP_MODE); + data = (data & ~QLC_DEV_SET_DRV(0xf, id)) | + QLC_DEV_SET_DRV(QLCNIC_MGMT_FUNC, id); + QLC_SHARED_REG_WR32(adapter, QLCNIC_DRV_OP_MODE, data); + qlcnic_api_unlock(adapter); +err_lock: + return ret; +} + +static void qlcnic_check_vf(struct qlcnic_adapter *adapter, + const struct pci_device_id *ent) +{ + u32 op_mode, priv_level; + + /* Determine FW API version */ + adapter->ahw->fw_hal_version = QLC_SHARED_REG_RD32(adapter, + QLCNIC_FW_API); + + /* Find PCI function number */ + qlcnic_get_func_no(adapter); + + /* Determine function privilege level */ + op_mode = QLC_SHARED_REG_RD32(adapter, QLCNIC_DRV_OP_MODE); + if (op_mode == QLC_DEV_DRV_DEFAULT) + priv_level = QLCNIC_MGMT_FUNC; + else + priv_level = QLC_DEV_GET_DRV(op_mode, adapter->ahw->pci_func); + + if (priv_level == QLCNIC_NON_PRIV_FUNC) { + adapter->ahw->op_mode = QLCNIC_NON_PRIV_FUNC; + dev_info(&adapter->pdev->dev, + "HAL Version: %d Non Privileged function\n", + adapter->ahw->fw_hal_version); + adapter->nic_ops = &qlcnic_vf_ops; + } else + adapter->nic_ops = &qlcnic_ops; +} + +#define QLCNIC_82XX_BAR0_LENGTH 0x00200000UL +#define QLCNIC_83XX_BAR0_LENGTH 0x4000 +static void qlcnic_get_bar_length(u32 dev_id, ulong *bar) +{ + switch (dev_id) { + case PCI_DEVICE_ID_QLOGIC_QLE824X: + *bar = QLCNIC_82XX_BAR0_LENGTH; + break; + case PCI_DEVICE_ID_QLOGIC_QLE834X: + case PCI_DEVICE_ID_QLOGIC_QLE8830: + case PCI_DEVICE_ID_QLOGIC_QLE844X: + case PCI_DEVICE_ID_QLOGIC_VF_QLE834X: + case PCI_DEVICE_ID_QLOGIC_VF_QLE844X: + case PCI_DEVICE_ID_QLOGIC_VF_QLE8C30: + *bar = QLCNIC_83XX_BAR0_LENGTH; + break; + default: + *bar = 0; + } +} + +static int qlcnic_setup_pci_map(struct pci_dev *pdev, + struct qlcnic_hardware_context *ahw) +{ + u32 offset; + void __iomem *mem_ptr0 = NULL; + unsigned long mem_len, pci_len0 = 0, bar0_len; + + /* remap phys address */ + mem_len = pci_resource_len(pdev, 0); + + qlcnic_get_bar_length(pdev->device, &bar0_len); + if (mem_len >= bar0_len) { + + mem_ptr0 = pci_ioremap_bar(pdev, 0); + if (mem_ptr0 == NULL) { + dev_err(&pdev->dev, "failed to map PCI bar 0\n"); + return -EIO; + } + pci_len0 = mem_len; + } else { + return -EIO; + } + + dev_info(&pdev->dev, "%dKB memory map\n", (int)(mem_len >> 10)); + + ahw->pci_base0 = mem_ptr0; + ahw->pci_len0 = pci_len0; + offset = QLCNIC_PCIX_PS_REG(PCIX_OCM_WINDOW_REG(ahw->pci_func)); + qlcnic_get_ioaddr(ahw, offset); + + return 0; +} + +static bool qlcnic_validate_subsystem_id(struct qlcnic_adapter *adapter, + int index) +{ + struct pci_dev *pdev = adapter->pdev; + unsigned short subsystem_vendor; + bool ret = true; + + subsystem_vendor = pdev->subsystem_vendor; + + if (pdev->device == PCI_DEVICE_ID_QLOGIC_QLE824X || + pdev->device == PCI_DEVICE_ID_QLOGIC_QLE834X) { + if (qlcnic_boards[index].sub_vendor == subsystem_vendor && + qlcnic_boards[index].sub_device == pdev->subsystem_device) + ret = true; + else + ret = false; + } + + return ret; +} + +static void qlcnic_get_board_name(struct qlcnic_adapter *adapter, char *name) +{ + struct 
pci_dev *pdev = adapter->pdev; + int i, found = 0; + + for (i = 0; i < NUM_SUPPORTED_BOARDS; ++i) { + if (qlcnic_boards[i].vendor == pdev->vendor && + qlcnic_boards[i].device == pdev->device && + qlcnic_validate_subsystem_id(adapter, i)) { + found = 1; + break; + } + } + + if (!found) + sprintf(name, "%pM Gigabit Ethernet", adapter->mac_addr); + else + sprintf(name, "%pM: %s" , adapter->mac_addr, + qlcnic_boards[i].short_name); +} + +static void +qlcnic_check_options(struct qlcnic_adapter *adapter) +{ + int err; + u32 fw_major, fw_minor, fw_build, prev_fw_version; + struct pci_dev *pdev = adapter->pdev; + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_fw_dump *fw_dump = &ahw->fw_dump; + + prev_fw_version = adapter->fw_version; + + fw_major = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_MAJOR); + fw_minor = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_MINOR); + fw_build = QLC_SHARED_REG_RD32(adapter, QLCNIC_FW_VERSION_SUB); + + adapter->fw_version = QLCNIC_VERSION_CODE(fw_major, fw_minor, fw_build); + + err = qlcnic_get_board_info(adapter); + if (err) { + dev_err(&pdev->dev, "Error getting board config info.\n"); + return; + } + if (ahw->op_mode != QLCNIC_NON_PRIV_FUNC) { + if (fw_dump->tmpl_hdr == NULL || + adapter->fw_version > prev_fw_version) { + vfree(fw_dump->tmpl_hdr); + if (!qlcnic_fw_cmd_get_minidump_temp(adapter)) + dev_info(&pdev->dev, + "Supports FW dump capability\n"); + } + } + + dev_info(&pdev->dev, "Driver v%s, firmware v%d.%d.%d\n", + QLCNIC_LINUX_VERSIONID, fw_major, fw_minor, fw_build); + + if (adapter->ahw->port_type == QLCNIC_XGBE) { + if (adapter->flags & QLCNIC_ESWITCH_ENABLED) { + adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_VF; + adapter->max_rxd = MAX_RCV_DESCRIPTORS_VF; + } else { + adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_10G; + adapter->max_rxd = MAX_RCV_DESCRIPTORS_10G; + } + + adapter->num_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_10G; + adapter->max_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_10G; + + } else if (adapter->ahw->port_type == QLCNIC_GBE) { + adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_1G; + adapter->num_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_1G; + adapter->max_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_1G; + adapter->max_rxd = MAX_RCV_DESCRIPTORS_1G; + } + + adapter->ahw->msix_supported = !!qlcnic_use_msi_x; + + adapter->num_txd = MAX_CMD_DESCRIPTORS; + + adapter->max_rds_rings = MAX_RDS_RINGS; +} + +static int +qlcnic_initialize_nic(struct qlcnic_adapter *adapter) +{ + struct qlcnic_info nic_info; + int err = 0; + + memset(&nic_info, 0, sizeof(struct qlcnic_info)); + err = qlcnic_get_nic_info(adapter, &nic_info, adapter->ahw->pci_func); + if (err) + return err; + + adapter->ahw->physical_port = (u8)nic_info.phys_port; + adapter->ahw->switch_mode = nic_info.switch_mode; + adapter->ahw->max_tx_ques = nic_info.max_tx_ques; + adapter->ahw->max_rx_ques = nic_info.max_rx_ques; + adapter->ahw->capabilities = nic_info.capabilities; + + if (adapter->ahw->capabilities & QLCNIC_FW_CAPABILITY_MORE_CAPS) { + u32 temp; + temp = QLCRD32(adapter, CRB_FW_CAPABILITIES_2, &err); + if (err == -EIO) + return err; + adapter->ahw->extra_capability[0] = temp; + } else { + adapter->ahw->extra_capability[0] = 0; + } + + adapter->ahw->max_mac_filters = nic_info.max_mac_filters; + adapter->ahw->max_mtu = nic_info.max_mtu; + + if (adapter->ahw->capabilities & BIT_6) { + adapter->flags |= QLCNIC_ESWITCH_ENABLED; + adapter->ahw->nic_mode = QLCNIC_VNIC_MODE; + adapter->max_tx_rings = QLCNIC_MAX_HW_VNIC_TX_RINGS; + adapter->max_sds_rings = QLCNIC_MAX_VNIC_SDS_RINGS; + + 
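+		/* vNIC (eSwitch) mode uses its own per-function Tx/SDS ring
+		 * limits, distinct from the default-mode limits set in the
+		 * else branch below.
+		 */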
dev_info(&adapter->pdev->dev, "vNIC mode enabled.\n"); + } else { + adapter->ahw->nic_mode = QLCNIC_DEFAULT_MODE; + adapter->max_tx_rings = QLCNIC_MAX_HW_TX_RINGS; + adapter->max_sds_rings = QLCNIC_MAX_SDS_RINGS; + adapter->flags &= ~QLCNIC_ESWITCH_ENABLED; + } + + return err; +} + +void qlcnic_set_vlan_config(struct qlcnic_adapter *adapter, + struct qlcnic_esw_func_cfg *esw_cfg) +{ + if (esw_cfg->discard_tagged) + adapter->flags &= ~QLCNIC_TAGGING_ENABLED; + else + adapter->flags |= QLCNIC_TAGGING_ENABLED; + + if (esw_cfg->vlan_id) { + adapter->rx_pvid = esw_cfg->vlan_id; + adapter->tx_pvid = esw_cfg->vlan_id; + } else { + adapter->rx_pvid = 0; + adapter->tx_pvid = 0; + } +} + +static int +qlcnic_vlan_rx_add(struct net_device *netdev, __be16 proto, u16 vid) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int err; + + if (qlcnic_sriov_vf_check(adapter)) { + err = qlcnic_sriov_cfg_vf_guest_vlan(adapter, vid, 1); + if (err) { + netdev_err(netdev, + "Cannot add VLAN filter for VLAN id %d, err=%d", + vid, err); + return err; + } + } + + set_bit(vid, adapter->vlans); + return 0; +} + +static int +qlcnic_vlan_rx_del(struct net_device *netdev, __be16 proto, u16 vid) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int err; + + if (qlcnic_sriov_vf_check(adapter)) { + err = qlcnic_sriov_cfg_vf_guest_vlan(adapter, vid, 0); + if (err) { + netdev_err(netdev, + "Cannot delete VLAN filter for VLAN id %d, err=%d", + vid, err); + return err; + } + } + + qlcnic_restore_indev_addr(netdev, NETDEV_DOWN); + clear_bit(vid, adapter->vlans); + return 0; +} + +void qlcnic_set_eswitch_port_features(struct qlcnic_adapter *adapter, + struct qlcnic_esw_func_cfg *esw_cfg) +{ + adapter->flags &= ~(QLCNIC_MACSPOOF | QLCNIC_MAC_OVERRIDE_DISABLED | + QLCNIC_PROMISC_DISABLED); + + if (esw_cfg->mac_anti_spoof) + adapter->flags |= QLCNIC_MACSPOOF; + + if (!esw_cfg->mac_override) + adapter->flags |= QLCNIC_MAC_OVERRIDE_DISABLED; + + if (!esw_cfg->promisc_mode) + adapter->flags |= QLCNIC_PROMISC_DISABLED; +} + +int qlcnic_set_eswitch_port_config(struct qlcnic_adapter *adapter) +{ + struct qlcnic_esw_func_cfg esw_cfg; + + if (!(adapter->flags & QLCNIC_ESWITCH_ENABLED)) + return 0; + + esw_cfg.pci_func = adapter->ahw->pci_func; + if (qlcnic_get_eswitch_port_config(adapter, &esw_cfg)) + return -EIO; + qlcnic_set_vlan_config(adapter, &esw_cfg); + qlcnic_set_eswitch_port_features(adapter, &esw_cfg); + qlcnic_set_netdev_features(adapter, &esw_cfg); + + return 0; +} + +void qlcnic_set_netdev_features(struct qlcnic_adapter *adapter, + struct qlcnic_esw_func_cfg *esw_cfg) +{ + struct net_device *netdev = adapter->netdev; + + if (qlcnic_83xx_check(adapter)) + return; + + adapter->offload_flags = esw_cfg->offload_flags; + adapter->flags |= QLCNIC_APP_CHANGED_FLAGS; + netdev_update_features(netdev); + adapter->flags &= ~QLCNIC_APP_CHANGED_FLAGS; +} + +static int +qlcnic_check_eswitch_mode(struct qlcnic_adapter *adapter) +{ + u32 op_mode, priv_level; + int err = 0; + + err = qlcnic_initialize_nic(adapter); + if (err) + return err; + + if (adapter->flags & QLCNIC_ADAPTER_INITIALIZED) + return 0; + + op_mode = QLC_SHARED_REG_RD32(adapter, QLCNIC_DRV_OP_MODE); + priv_level = QLC_DEV_GET_DRV(op_mode, adapter->ahw->pci_func); + + if (op_mode == QLC_DEV_DRV_DEFAULT) + priv_level = QLCNIC_MGMT_FUNC; + else + priv_level = QLC_DEV_GET_DRV(op_mode, adapter->ahw->pci_func); + + if (adapter->flags & QLCNIC_ESWITCH_ENABLED) { + if (priv_level == QLCNIC_MGMT_FUNC) { + adapter->ahw->op_mode = QLCNIC_MGMT_FUNC; + err = 
qlcnic_init_pci_info(adapter); + if (err) + return err; + /* Set privilege level for other functions */ + qlcnic_set_function_modes(adapter); + dev_info(&adapter->pdev->dev, + "HAL Version: %d, Management function\n", + adapter->ahw->fw_hal_version); + } else if (priv_level == QLCNIC_PRIV_FUNC) { + adapter->ahw->op_mode = QLCNIC_PRIV_FUNC; + dev_info(&adapter->pdev->dev, + "HAL Version: %d, Privileged function\n", + adapter->ahw->fw_hal_version); + } + } else { + adapter->ahw->nic_mode = QLCNIC_DEFAULT_MODE; + } + + adapter->flags |= QLCNIC_ADAPTER_INITIALIZED; + + return err; +} + +int qlcnic_set_default_offload_settings(struct qlcnic_adapter *adapter) +{ + struct qlcnic_esw_func_cfg esw_cfg; + struct qlcnic_npar_info *npar; + u8 i; + + if (adapter->need_fw_reset) + return 0; + + for (i = 0; i < adapter->ahw->total_nic_func; i++) { + if (!adapter->npars[i].eswitch_status) + continue; + + memset(&esw_cfg, 0, sizeof(struct qlcnic_esw_func_cfg)); + esw_cfg.pci_func = adapter->npars[i].pci_func; + esw_cfg.mac_override = BIT_0; + esw_cfg.promisc_mode = BIT_0; + if (qlcnic_82xx_check(adapter)) { + esw_cfg.offload_flags = BIT_0; + if (QLCNIC_IS_TSO_CAPABLE(adapter)) + esw_cfg.offload_flags |= (BIT_1 | BIT_2); + } + if (qlcnic_config_switch_port(adapter, &esw_cfg)) + return -EIO; + npar = &adapter->npars[i]; + npar->pvid = esw_cfg.vlan_id; + npar->mac_override = esw_cfg.mac_override; + npar->mac_anti_spoof = esw_cfg.mac_anti_spoof; + npar->discard_tagged = esw_cfg.discard_tagged; + npar->promisc_mode = esw_cfg.promisc_mode; + npar->offload_flags = esw_cfg.offload_flags; + } + + return 0; +} + + +static int +qlcnic_reset_eswitch_config(struct qlcnic_adapter *adapter, + struct qlcnic_npar_info *npar, int pci_func) +{ + struct qlcnic_esw_func_cfg esw_cfg; + esw_cfg.op_mode = QLCNIC_PORT_DEFAULTS; + esw_cfg.pci_func = pci_func; + esw_cfg.vlan_id = npar->pvid; + esw_cfg.mac_override = npar->mac_override; + esw_cfg.discard_tagged = npar->discard_tagged; + esw_cfg.mac_anti_spoof = npar->mac_anti_spoof; + esw_cfg.offload_flags = npar->offload_flags; + esw_cfg.promisc_mode = npar->promisc_mode; + if (qlcnic_config_switch_port(adapter, &esw_cfg)) + return -EIO; + + esw_cfg.op_mode = QLCNIC_ADD_VLAN; + if (qlcnic_config_switch_port(adapter, &esw_cfg)) + return -EIO; + + return 0; +} + +int qlcnic_reset_npar_config(struct qlcnic_adapter *adapter) +{ + int i, err; + struct qlcnic_npar_info *npar; + struct qlcnic_info nic_info; + u8 pci_func; + + if (qlcnic_82xx_check(adapter)) + if (!adapter->need_fw_reset) + return 0; + + /* Set the NPAR config data after FW reset */ + for (i = 0; i < adapter->ahw->total_nic_func; i++) { + npar = &adapter->npars[i]; + pci_func = npar->pci_func; + if (!adapter->npars[i].eswitch_status) + continue; + + memset(&nic_info, 0, sizeof(struct qlcnic_info)); + err = qlcnic_get_nic_info(adapter, &nic_info, pci_func); + if (err) + return err; + nic_info.min_tx_bw = npar->min_bw; + nic_info.max_tx_bw = npar->max_bw; + err = qlcnic_set_nic_info(adapter, &nic_info); + if (err) + return err; + + if (npar->enable_pm) { + err = qlcnic_config_port_mirroring(adapter, + npar->dest_npar, 1, + pci_func); + if (err) + return err; + } + err = qlcnic_reset_eswitch_config(adapter, npar, pci_func); + if (err) + return err; + } + return 0; +} + +static int qlcnic_check_npar_opertional(struct qlcnic_adapter *adapter) +{ + u8 npar_opt_timeo = QLCNIC_DEV_NPAR_OPER_TIMEO; + u32 npar_state; + + if (adapter->ahw->op_mode == QLCNIC_MGMT_FUNC) + return 0; + + npar_state = QLC_SHARED_REG_RD32(adapter, + 
QLCNIC_CRB_DEV_NPAR_STATE); + while (npar_state != QLCNIC_DEV_NPAR_OPER && --npar_opt_timeo) { + msleep(1000); + npar_state = QLC_SHARED_REG_RD32(adapter, + QLCNIC_CRB_DEV_NPAR_STATE); + } + if (!npar_opt_timeo) { + dev_err(&adapter->pdev->dev, + "Waiting for NPAR state to operational timeout\n"); + return -EIO; + } + return 0; +} + +static int +qlcnic_set_mgmt_operations(struct qlcnic_adapter *adapter) +{ + int err; + + if (!(adapter->flags & QLCNIC_ESWITCH_ENABLED) || + adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) + return 0; + + err = qlcnic_set_default_offload_settings(adapter); + if (err) + return err; + + err = qlcnic_reset_npar_config(adapter); + if (err) + return err; + + qlcnic_dev_set_npar_ready(adapter); + + return err; +} + +static int qlcnic_82xx_start_firmware(struct qlcnic_adapter *adapter) +{ + int err; + + err = qlcnic_can_start_firmware(adapter); + if (err < 0) + return err; + else if (!err) + goto check_fw_status; + + if (qlcnic_load_fw_file) + qlcnic_request_firmware(adapter); + else { + err = qlcnic_check_flash_fw_ver(adapter); + if (err) + goto err_out; + + adapter->ahw->fw_type = QLCNIC_FLASH_ROMIMAGE; + } + + err = qlcnic_need_fw_reset(adapter); + if (err == 0) + goto check_fw_status; + + err = qlcnic_pinit_from_rom(adapter); + if (err) + goto err_out; + + err = qlcnic_load_firmware(adapter); + if (err) + goto err_out; + + qlcnic_release_firmware(adapter); + QLCWR32(adapter, CRB_DRIVER_VERSION, QLCNIC_DRIVER_VERSION); + +check_fw_status: + err = qlcnic_check_fw_status(adapter); + if (err) + goto err_out; + + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_STATE, QLCNIC_DEV_READY); + qlcnic_idc_debug_info(adapter, 1); + err = qlcnic_check_eswitch_mode(adapter); + if (err) { + dev_err(&adapter->pdev->dev, + "Memory allocation failed for eswitch\n"); + goto err_out; + } + err = qlcnic_set_mgmt_operations(adapter); + if (err) + goto err_out; + + qlcnic_check_options(adapter); + adapter->need_fw_reset = 0; + + qlcnic_release_firmware(adapter); + return 0; + +err_out: + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_STATE, QLCNIC_DEV_FAILED); + dev_err(&adapter->pdev->dev, "Device state set to failed\n"); + + qlcnic_release_firmware(adapter); + return err; +} + +static int +qlcnic_request_irq(struct qlcnic_adapter *adapter) +{ + irq_handler_t handler; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; + int err, ring, num_sds_rings; + + unsigned long flags = 0; + struct net_device *netdev = adapter->netdev; + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + + if (adapter->ahw->diag_test == QLCNIC_INTERRUPT_TEST) { + if (qlcnic_82xx_check(adapter)) + handler = qlcnic_tmp_intr; + else + handler = qlcnic_83xx_tmp_intr; + if (!QLCNIC_IS_MSI_FAMILY(adapter)) + flags |= IRQF_SHARED; + + } else { + if (adapter->flags & QLCNIC_MSIX_ENABLED) + handler = qlcnic_msix_intr; + else if (adapter->flags & QLCNIC_MSI_ENABLED) + handler = qlcnic_msi_intr; + else { + flags |= IRQF_SHARED; + if (qlcnic_82xx_check(adapter)) + handler = qlcnic_intr; + else + handler = qlcnic_83xx_intr; + } + } + adapter->irq = netdev->irq; + + if (adapter->ahw->diag_test != QLCNIC_LOOPBACK_TEST) { + if (qlcnic_82xx_check(adapter) || + (qlcnic_83xx_check(adapter) && + (adapter->flags & QLCNIC_MSIX_ENABLED))) { + num_sds_rings = adapter->drv_sds_rings; + for (ring = 0; ring < num_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + if (qlcnic_82xx_check(adapter) && + !qlcnic_check_multi_tx(adapter) && + (ring == (num_sds_rings - 1))) { + if (!(adapter->flags & + 
QLCNIC_MSIX_ENABLED)) + snprintf(sds_ring->name, + sizeof(sds_ring->name), + "qlcnic"); + else + snprintf(sds_ring->name, + sizeof(sds_ring->name), + "%s-tx-0-rx-%d", + netdev->name, ring); + } else { + snprintf(sds_ring->name, + sizeof(sds_ring->name), + "%s-rx-%d", + netdev->name, ring); + } + err = request_irq(sds_ring->irq, handler, flags, + sds_ring->name, sds_ring); + if (err) + return err; + } + } + if ((qlcnic_82xx_check(adapter) && + qlcnic_check_multi_tx(adapter)) || + (qlcnic_83xx_check(adapter) && + (adapter->flags & QLCNIC_MSIX_ENABLED) && + !(adapter->flags & QLCNIC_TX_INTR_SHARED))) { + handler = qlcnic_msix_tx_intr; + for (ring = 0; ring < adapter->drv_tx_rings; + ring++) { + tx_ring = &adapter->tx_ring[ring]; + snprintf(tx_ring->name, sizeof(tx_ring->name), + "%s-tx-%d", netdev->name, ring); + err = request_irq(tx_ring->irq, handler, flags, + tx_ring->name, tx_ring); + if (err) + return err; + } + } + } + return 0; +} + +static void +qlcnic_free_irq(struct qlcnic_adapter *adapter) +{ + int ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; + + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + + if (adapter->ahw->diag_test != QLCNIC_LOOPBACK_TEST) { + if (qlcnic_82xx_check(adapter) || + (qlcnic_83xx_check(adapter) && + (adapter->flags & QLCNIC_MSIX_ENABLED))) { + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + free_irq(sds_ring->irq, sds_ring); + } + } + if ((qlcnic_83xx_check(adapter) && + !(adapter->flags & QLCNIC_TX_INTR_SHARED)) || + (qlcnic_82xx_check(adapter) && + qlcnic_check_multi_tx(adapter))) { + for (ring = 0; ring < adapter->drv_tx_rings; + ring++) { + tx_ring = &adapter->tx_ring[ring]; + if (tx_ring->irq) + free_irq(tx_ring->irq, tx_ring); + } + } + } +} + +static void qlcnic_get_lro_mss_capability(struct qlcnic_adapter *adapter) +{ + u32 capab = 0; + + if (qlcnic_82xx_check(adapter)) { + if (adapter->ahw->extra_capability[0] & + QLCNIC_FW_CAPABILITY_2_LRO_MAX_TCP_SEG) + adapter->flags |= QLCNIC_FW_LRO_MSS_CAP; + } else { + capab = adapter->ahw->capabilities; + if (QLC_83XX_GET_FW_LRO_MSS_CAPABILITY(capab)) + adapter->flags |= QLCNIC_FW_LRO_MSS_CAP; + } +} + +static int qlcnic_config_def_intr_coalesce(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int err; + + /* Initialize interrupt coalesce parameters */ + ahw->coal.flag = QLCNIC_INTR_DEFAULT; + + if (qlcnic_83xx_check(adapter)) { + ahw->coal.type = QLCNIC_INTR_COAL_TYPE_RX_TX; + ahw->coal.tx_time_us = QLCNIC_DEF_INTR_COALESCE_TX_TIME_US; + ahw->coal.tx_packets = QLCNIC_DEF_INTR_COALESCE_TX_PACKETS; + ahw->coal.rx_time_us = QLCNIC_DEF_INTR_COALESCE_RX_TIME_US; + ahw->coal.rx_packets = QLCNIC_DEF_INTR_COALESCE_RX_PACKETS; + + err = qlcnic_83xx_set_rx_tx_intr_coal(adapter); + } else { + ahw->coal.type = QLCNIC_INTR_COAL_TYPE_RX; + ahw->coal.rx_time_us = QLCNIC_DEF_INTR_COALESCE_RX_TIME_US; + ahw->coal.rx_packets = QLCNIC_DEF_INTR_COALESCE_RX_PACKETS; + + err = qlcnic_82xx_set_rx_coalesce(adapter); + } + + return err; +} + +int __qlcnic_up(struct qlcnic_adapter *adapter, struct net_device *netdev) +{ + int ring; + struct qlcnic_host_rds_ring *rds_ring; + + if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) + return -EIO; + + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) + return 0; + + if (qlcnic_set_eswitch_port_config(adapter)) + return -EIO; + + qlcnic_get_lro_mss_capability(adapter); + + if (qlcnic_fw_create_ctx(adapter)) + return -EIO; + + for (ring = 0; ring < 
adapter->max_rds_rings; ring++) { + rds_ring = &adapter->recv_ctx->rds_rings[ring]; + qlcnic_post_rx_buffers(adapter, rds_ring, ring); + } + + qlcnic_set_multi(netdev); + qlcnic_fw_cmd_set_mtu(adapter, netdev->mtu); + + adapter->ahw->linkup = 0; + + if (adapter->drv_sds_rings > 1) + qlcnic_config_rss(adapter, 1); + + qlcnic_config_def_intr_coalesce(adapter); + + if (netdev->features & NETIF_F_LRO) + qlcnic_config_hw_lro(adapter, QLCNIC_LRO_ENABLED); + + set_bit(__QLCNIC_DEV_UP, &adapter->state); + qlcnic_napi_enable(adapter); + + qlcnic_linkevent_request(adapter, 1); + + adapter->ahw->reset_context = 0; + netif_tx_start_all_queues(netdev); + return 0; +} + +int qlcnic_up(struct qlcnic_adapter *adapter, struct net_device *netdev) +{ + int err = 0; + + rtnl_lock(); + if (netif_running(netdev)) + err = __qlcnic_up(adapter, netdev); + rtnl_unlock(); + + return err; +} + +void __qlcnic_down(struct qlcnic_adapter *adapter, struct net_device *netdev) +{ + int ring; + + if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) + return; + + if (!test_and_clear_bit(__QLCNIC_DEV_UP, &adapter->state)) + return; + + smp_mb(); + netif_carrier_off(netdev); + adapter->ahw->linkup = 0; + netif_tx_disable(netdev); + + qlcnic_free_mac_list(adapter); + + if (adapter->fhash.fnum) + qlcnic_delete_lb_filters(adapter); + + qlcnic_nic_set_promisc(adapter, QLCNIC_NIU_NON_PROMISC_MODE); + if (qlcnic_sriov_vf_check(adapter)) + qlcnic_sriov_cleanup_async_list(&adapter->ahw->sriov->bc); + + qlcnic_napi_disable(adapter); + + qlcnic_fw_destroy_ctx(adapter); + adapter->flags &= ~QLCNIC_FW_LRO_MSS_CAP; + + qlcnic_reset_rx_buffers_list(adapter); + + for (ring = 0; ring < adapter->drv_tx_rings; ring++) + qlcnic_release_tx_buffers(adapter, &adapter->tx_ring[ring]); +} + +/* Usage: During suspend and firmware recovery module */ + +void qlcnic_down(struct qlcnic_adapter *adapter, struct net_device *netdev) +{ + rtnl_lock(); + if (netif_running(netdev)) + __qlcnic_down(adapter, netdev); + rtnl_unlock(); + +} + +int +qlcnic_attach(struct qlcnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; + int err; + + if (adapter->is_up == QLCNIC_ADAPTER_UP_MAGIC) + return 0; + + err = qlcnic_napi_add(adapter, netdev); + if (err) + return err; + + err = qlcnic_alloc_sw_resources(adapter); + if (err) { + dev_err(&pdev->dev, "Error in setting sw resources\n"); + goto err_out_napi_del; + } + + err = qlcnic_alloc_hw_resources(adapter); + if (err) { + dev_err(&pdev->dev, "Error in setting hw resources\n"); + goto err_out_free_sw; + } + + err = qlcnic_request_irq(adapter); + if (err) { + dev_err(&pdev->dev, "failed to setup interrupt\n"); + goto err_out_free_hw; + } + + qlcnic_create_sysfs_entries(adapter); + + if (qlcnic_encap_rx_offload(adapter)) + udp_tunnel_get_rx_info(netdev); + + adapter->is_up = QLCNIC_ADAPTER_UP_MAGIC; + return 0; + +err_out_free_hw: + qlcnic_free_hw_resources(adapter); +err_out_free_sw: + qlcnic_free_sw_resources(adapter); +err_out_napi_del: + qlcnic_napi_del(adapter); + return err; +} + +void qlcnic_detach(struct qlcnic_adapter *adapter) +{ + if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC) + return; + + qlcnic_remove_sysfs_entries(adapter); + + qlcnic_free_hw_resources(adapter); + qlcnic_release_rx_buffers(adapter); + qlcnic_free_irq(adapter); + qlcnic_napi_del(adapter); + qlcnic_free_sw_resources(adapter); + + adapter->is_up = 0; +} + +void qlcnic_diag_free_res(struct net_device *netdev, int drv_sds_rings) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + 
struct qlcnic_host_sds_ring *sds_ring; + int drv_tx_rings = adapter->drv_tx_rings; + int ring; + + clear_bit(__QLCNIC_DEV_UP, &adapter->state); + if (adapter->ahw->diag_test == QLCNIC_INTERRUPT_TEST) { + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &adapter->recv_ctx->sds_rings[ring]; + qlcnic_disable_sds_intr(adapter, sds_ring); + } + } + + qlcnic_fw_destroy_ctx(adapter); + + qlcnic_detach(adapter); + + adapter->ahw->diag_test = 0; + adapter->drv_sds_rings = drv_sds_rings; + adapter->drv_tx_rings = drv_tx_rings; + + if (qlcnic_attach(adapter)) + goto out; + + if (netif_running(netdev)) + __qlcnic_up(adapter, netdev); +out: + netif_device_attach(netdev); +} + +static int qlcnic_alloc_adapter_resources(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int err = 0; + + adapter->recv_ctx = kzalloc(sizeof(struct qlcnic_recv_context), + GFP_KERNEL); + if (!adapter->recv_ctx) { + err = -ENOMEM; + goto err_out; + } + + if (qlcnic_83xx_check(adapter)) { + ahw->coal.type = QLCNIC_INTR_COAL_TYPE_RX_TX; + ahw->coal.tx_time_us = QLCNIC_DEF_INTR_COALESCE_TX_TIME_US; + ahw->coal.tx_packets = QLCNIC_DEF_INTR_COALESCE_TX_PACKETS; + ahw->coal.rx_time_us = QLCNIC_DEF_INTR_COALESCE_RX_TIME_US; + ahw->coal.rx_packets = QLCNIC_DEF_INTR_COALESCE_RX_PACKETS; + } else { + ahw->coal.type = QLCNIC_INTR_COAL_TYPE_RX; + ahw->coal.rx_time_us = QLCNIC_DEF_INTR_COALESCE_RX_TIME_US; + ahw->coal.rx_packets = QLCNIC_DEF_INTR_COALESCE_RX_PACKETS; + } + + /* clear stats */ + memset(&adapter->stats, 0, sizeof(adapter->stats)); +err_out: + return err; +} + +static void qlcnic_free_adapter_resources(struct qlcnic_adapter *adapter) +{ + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + + kfree(adapter->recv_ctx); + adapter->recv_ctx = NULL; + + if (fw_dump->tmpl_hdr) { + vfree(fw_dump->tmpl_hdr); + fw_dump->tmpl_hdr = NULL; + } + + if (fw_dump->dma_buffer) { + dma_free_coherent(&adapter->pdev->dev, QLC_PEX_DMA_READ_SIZE, + fw_dump->dma_buffer, fw_dump->phys_addr); + fw_dump->dma_buffer = NULL; + } + + kfree(adapter->ahw->reset.buff); + adapter->ahw->fw_dump.tmpl_hdr = NULL; +} + +int qlcnic_diag_alloc_res(struct net_device *netdev, int test) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_rds_ring *rds_ring; + int ring; + int ret; + + netif_device_detach(netdev); + + if (netif_running(netdev)) + __qlcnic_down(adapter, netdev); + + qlcnic_detach(adapter); + + adapter->drv_sds_rings = QLCNIC_SINGLE_RING; + adapter->ahw->diag_test = test; + adapter->ahw->linkup = 0; + + ret = qlcnic_attach(adapter); + if (ret) { + netif_device_attach(netdev); + return ret; + } + + ret = qlcnic_fw_create_ctx(adapter); + if (ret) { + qlcnic_detach(adapter); + netif_device_attach(netdev); + return ret; + } + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &adapter->recv_ctx->rds_rings[ring]; + qlcnic_post_rx_buffers(adapter, rds_ring, ring); + } + + if (adapter->ahw->diag_test == QLCNIC_INTERRUPT_TEST) { + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &adapter->recv_ctx->sds_rings[ring]; + qlcnic_enable_sds_intr(adapter, sds_ring); + } + } + + if (adapter->ahw->diag_test == QLCNIC_LOOPBACK_TEST) { + adapter->ahw->loopback_state = 0; + qlcnic_linkevent_request(adapter, 1); + } + + set_bit(__QLCNIC_DEV_UP, &adapter->state); + + return 0; +} + +/* Reset context in hardware only */ +static int +qlcnic_reset_hw_context(struct qlcnic_adapter *adapter) +{ + struct 
net_device *netdev = adapter->netdev; + + if (test_and_set_bit(__QLCNIC_RESETTING, &adapter->state)) + return -EBUSY; + + netif_device_detach(netdev); + + qlcnic_down(adapter, netdev); + + qlcnic_up(adapter, netdev); + + netif_device_attach(netdev); + + clear_bit(__QLCNIC_RESETTING, &adapter->state); + netdev_info(adapter->netdev, "%s: soft reset complete\n", __func__); + return 0; +} + +int +qlcnic_reset_context(struct qlcnic_adapter *adapter) +{ + int err = 0; + struct net_device *netdev = adapter->netdev; + + if (test_and_set_bit(__QLCNIC_RESETTING, &adapter->state)) + return -EBUSY; + + if (adapter->is_up == QLCNIC_ADAPTER_UP_MAGIC) { + + netif_device_detach(netdev); + + if (netif_running(netdev)) + __qlcnic_down(adapter, netdev); + + qlcnic_detach(adapter); + + if (netif_running(netdev)) { + err = qlcnic_attach(adapter); + if (!err) { + __qlcnic_up(adapter, netdev); + qlcnic_restore_indev_addr(netdev, NETDEV_UP); + } + } + + netif_device_attach(netdev); + } + + clear_bit(__QLCNIC_RESETTING, &adapter->state); + return err; +} + +static void qlcnic_82xx_set_mac_filter_count(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u16 act_pci_fn = ahw->total_nic_func; + u16 count; + + ahw->max_mc_count = QLCNIC_MAX_MC_COUNT; + if (act_pci_fn <= 2) + count = (QLCNIC_MAX_UC_COUNT - QLCNIC_MAX_MC_COUNT) / + act_pci_fn; + else + count = (QLCNIC_LB_MAX_FILTERS - QLCNIC_MAX_MC_COUNT) / + act_pci_fn; + ahw->max_uc_count = count; +} + +static int qlcnic_set_real_num_queues(struct qlcnic_adapter *adapter, + u8 tx_queues, u8 rx_queues) +{ + struct net_device *netdev = adapter->netdev; + int err = 0; + + if (tx_queues) { + err = netif_set_real_num_tx_queues(netdev, tx_queues); + if (err) { + netdev_err(netdev, "failed to set %d Tx queues\n", + tx_queues); + return err; + } + } + + if (rx_queues) { + err = netif_set_real_num_rx_queues(netdev, rx_queues); + if (err) + netdev_err(netdev, "failed to set %d Rx queues\n", + rx_queues); + } + + return err; +} + +int +qlcnic_setup_netdev(struct qlcnic_adapter *adapter, struct net_device *netdev, + int pci_using_dac) +{ + int err; + struct pci_dev *pdev = adapter->pdev; + + adapter->rx_csum = 1; + adapter->ahw->mc_enabled = 0; + qlcnic_set_mac_filter_count(adapter); + + netdev->netdev_ops = &qlcnic_netdev_ops; + netdev->watchdog_timeo = QLCNIC_WATCHDOG_TIMEOUTVALUE * HZ; + + qlcnic_change_mtu(netdev, netdev->mtu); + + netdev->ethtool_ops = (qlcnic_sriov_vf_check(adapter)) ? 
+ &qlcnic_sriov_vf_ethtool_ops : &qlcnic_ethtool_ops; + + netdev->features |= (NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | + NETIF_F_IPV6_CSUM | NETIF_F_GRO | + NETIF_F_HW_VLAN_CTAG_RX); + netdev->vlan_features |= (NETIF_F_SG | NETIF_F_IP_CSUM | + NETIF_F_IPV6_CSUM); + + if (QLCNIC_IS_TSO_CAPABLE(adapter)) { + netdev->features |= (NETIF_F_TSO | NETIF_F_TSO6); + netdev->vlan_features |= (NETIF_F_TSO | NETIF_F_TSO6); + } + + if (pci_using_dac) { + netdev->features |= NETIF_F_HIGHDMA; + netdev->vlan_features |= NETIF_F_HIGHDMA; + } + + if (qlcnic_vlan_tx_check(adapter)) + netdev->features |= (NETIF_F_HW_VLAN_CTAG_TX); + + if (qlcnic_sriov_vf_check(adapter)) + netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + + if (adapter->ahw->capabilities & QLCNIC_FW_CAPABILITY_HW_LRO) + netdev->features |= NETIF_F_LRO; + + if (qlcnic_encap_tx_offload(adapter)) { + netdev->features |= NETIF_F_GSO_UDP_TUNNEL; + + /* encapsulation Tx offload supported by Adapter */ + netdev->hw_enc_features = NETIF_F_IP_CSUM | + NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_TSO | + NETIF_F_TSO6; + } + + if (qlcnic_encap_rx_offload(adapter)) + netdev->hw_enc_features |= NETIF_F_RXCSUM; + + netdev->hw_features = netdev->features; + netdev->priv_flags |= IFF_UNICAST_FLT; + netdev->irq = adapter->msix_entries[0].vector; + + /* MTU range: 68 - 9600 */ + netdev->min_mtu = P3P_MIN_MTU; + netdev->max_mtu = P3P_MAX_MTU; + + err = qlcnic_set_real_num_queues(adapter, adapter->drv_tx_rings, + adapter->drv_sds_rings); + if (err) + return err; + + qlcnic_dcb_init_dcbnl_ops(adapter->dcb); + + err = register_netdev(netdev); + if (err) { + dev_err(&pdev->dev, "failed to register net device\n"); + return err; + } + + return 0; +} + +static int qlcnic_set_dma_mask(struct pci_dev *pdev, int *pci_using_dac) +{ + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) && + !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) + *pci_using_dac = 1; + else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) && + !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) + *pci_using_dac = 0; + else { + dev_err(&pdev->dev, "Unable to set DMA mask, aborting\n"); + return -EIO; + } + + return 0; +} + +void qlcnic_free_tx_rings(struct qlcnic_adapter *adapter) +{ + int ring; + struct qlcnic_host_tx_ring *tx_ring; + + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + if (tx_ring) { + vfree(tx_ring->cmd_buf_arr); + tx_ring->cmd_buf_arr = NULL; + } + } + kfree(adapter->tx_ring); +} + +int qlcnic_alloc_tx_rings(struct qlcnic_adapter *adapter, + struct net_device *netdev) +{ + int ring, vector, index; + struct qlcnic_host_tx_ring *tx_ring; + struct qlcnic_cmd_buffer *cmd_buf_arr; + + tx_ring = kcalloc(adapter->drv_tx_rings, + sizeof(struct qlcnic_host_tx_ring), GFP_KERNEL); + if (tx_ring == NULL) + return -ENOMEM; + + adapter->tx_ring = tx_ring; + + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + tx_ring->num_desc = adapter->num_txd; + tx_ring->txq = netdev_get_tx_queue(netdev, ring); + cmd_buf_arr = vzalloc(TX_BUFF_RINGSIZE(tx_ring)); + if (cmd_buf_arr == NULL) { + qlcnic_free_tx_rings(adapter); + return -ENOMEM; + } + tx_ring->cmd_buf_arr = cmd_buf_arr; + spin_lock_init(&tx_ring->tx_clean_lock); + } + + if (qlcnic_83xx_check(adapter) || + (qlcnic_82xx_check(adapter) && qlcnic_check_multi_tx(adapter))) { + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + tx_ring->adapter = adapter; + if (adapter->flags & QLCNIC_MSIX_ENABLED) { + index = 
adapter->drv_sds_rings + ring; + vector = adapter->msix_entries[index].vector; + tx_ring->irq = vector; + } + } + } + + return 0; +} + +void qlcnic_set_drv_version(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u32 fw_cmd = 0; + + if (qlcnic_82xx_check(adapter)) + fw_cmd = QLCNIC_CMD_82XX_SET_DRV_VER; + else if (qlcnic_83xx_check(adapter)) + fw_cmd = QLCNIC_CMD_83XX_SET_DRV_VER; + + if (ahw->extra_capability[0] & QLCNIC_FW_CAPABILITY_SET_DRV_VER) + qlcnic_fw_cmd_set_drv_version(adapter, fw_cmd); +} + +/* Reset firmware API lock */ +static void qlcnic_reset_api_lock(struct qlcnic_adapter *adapter) +{ + qlcnic_api_lock(adapter); + qlcnic_api_unlock(adapter); +} + + +static int +qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct net_device *netdev = NULL; + struct qlcnic_adapter *adapter = NULL; + struct qlcnic_hardware_context *ahw; + int err, pci_using_dac = -1; + char board_name[QLCNIC_MAX_BOARD_NAME_LEN + 19]; /* MAC + ": " + name */ + + err = pci_enable_device(pdev); + if (err) + return err; + + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + err = -ENODEV; + goto err_out_disable_pdev; + } + + err = qlcnic_set_dma_mask(pdev, &pci_using_dac); + if (err) + goto err_out_disable_pdev; + + err = pci_request_regions(pdev, qlcnic_driver_name); + if (err) + goto err_out_disable_pdev; + + pci_set_master(pdev); + pci_enable_pcie_error_reporting(pdev); + + ahw = kzalloc(sizeof(struct qlcnic_hardware_context), GFP_KERNEL); + if (!ahw) { + err = -ENOMEM; + goto err_out_free_res; + } + + switch (ent->device) { + case PCI_DEVICE_ID_QLOGIC_QLE824X: + ahw->hw_ops = &qlcnic_hw_ops; + ahw->reg_tbl = (u32 *) qlcnic_reg_tbl; + break; + case PCI_DEVICE_ID_QLOGIC_QLE834X: + case PCI_DEVICE_ID_QLOGIC_QLE8830: + case PCI_DEVICE_ID_QLOGIC_QLE844X: + qlcnic_83xx_register_map(ahw); + break; + case PCI_DEVICE_ID_QLOGIC_VF_QLE834X: + case PCI_DEVICE_ID_QLOGIC_VF_QLE8C30: + case PCI_DEVICE_ID_QLOGIC_VF_QLE844X: + qlcnic_sriov_vf_register_map(ahw); + break; + default: + goto err_out_free_hw_res; + } + + err = qlcnic_setup_pci_map(pdev, ahw); + if (err) + goto err_out_free_hw_res; + + netdev = alloc_etherdev_mq(sizeof(struct qlcnic_adapter), + QLCNIC_MAX_TX_RINGS); + if (!netdev) { + err = -ENOMEM; + goto err_out_iounmap; + } + + SET_NETDEV_DEV(netdev, &pdev->dev); + + adapter = netdev_priv(netdev); + adapter->netdev = netdev; + adapter->pdev = pdev; + adapter->ahw = ahw; + + adapter->qlcnic_wq = create_singlethread_workqueue("qlcnic"); + if (adapter->qlcnic_wq == NULL) { + err = -ENOMEM; + dev_err(&pdev->dev, "Failed to create workqueue\n"); + goto err_out_free_netdev; + } + + err = qlcnic_alloc_adapter_resources(adapter); + if (err) + goto err_out_free_wq; + + adapter->dev_rst_time = jiffies; + ahw->revision_id = pdev->revision; + ahw->max_vnic_func = qlcnic_get_vnic_func_count(adapter); + if (qlcnic_mac_learn == FDB_MAC_LEARN) + adapter->fdb_mac_learn = true; + else if (qlcnic_mac_learn == DRV_MAC_LEARN) + adapter->drv_mac_learn = true; + + rwlock_init(&adapter->ahw->crb_lock); + mutex_init(&adapter->ahw->mem_lock); + + INIT_LIST_HEAD(&adapter->mac_list); + + qlcnic_register_dcb(adapter); + + if (qlcnic_82xx_check(adapter)) { + qlcnic_check_vf(adapter, ent); + adapter->portnum = adapter->ahw->pci_func; + qlcnic_reset_api_lock(adapter); + err = qlcnic_start_firmware(adapter); + if (err) { + dev_err(&pdev->dev, "Loading fw failed.Please Reboot\n" + "\t\tIf reboot doesn't help, try flashing the card\n"); + goto err_out_maintenance_mode; + 
} + + /* compute and set default and max tx/sds rings */ + if (adapter->ahw->msix_supported) { + if (qlcnic_check_multi_tx_capability(adapter) == 1) + qlcnic_set_tx_ring_count(adapter, + QLCNIC_SINGLE_RING); + else + qlcnic_set_tx_ring_count(adapter, + QLCNIC_DEF_TX_RINGS); + qlcnic_set_sds_ring_count(adapter, + QLCNIC_DEF_SDS_RINGS); + } else { + qlcnic_set_tx_ring_count(adapter, QLCNIC_SINGLE_RING); + qlcnic_set_sds_ring_count(adapter, QLCNIC_SINGLE_RING); + } + + err = qlcnic_setup_idc_param(adapter); + if (err) + goto err_out_free_hw; + + adapter->flags |= QLCNIC_NEED_FLR; + + } else if (qlcnic_83xx_check(adapter)) { + qlcnic_83xx_check_vf(adapter, ent); + adapter->portnum = adapter->ahw->pci_func; + err = qlcnic_83xx_init(adapter, pci_using_dac); + if (err) { + switch (err) { + case -ENOTRECOVERABLE: + dev_err(&pdev->dev, "Adapter initialization failed due to a faulty hardware\n"); + dev_err(&pdev->dev, "Please replace the adapter with new one and return the faulty adapter for repair\n"); + goto err_out_free_hw; + case -ENOMEM: + dev_err(&pdev->dev, "Adapter initialization failed. Please reboot\n"); + goto err_out_free_hw; + case -EOPNOTSUPP: + dev_err(&pdev->dev, "Adapter initialization failed\n"); + goto err_out_free_hw; + default: + dev_err(&pdev->dev, "Adapter initialization failed. Driver will load in maintenance mode to recover the adapter using the application\n"); + goto err_out_maintenance_mode; + } + } + + if (qlcnic_sriov_vf_check(adapter)) + return 0; + } else { + dev_err(&pdev->dev, + "%s: failed. Please Reboot\n", __func__); + err = -ENODEV; + goto err_out_free_hw; + } + + if (qlcnic_read_mac_addr(adapter)) + dev_warn(&pdev->dev, "failed to read mac addr\n"); + + qlcnic_read_phys_port_id(adapter); + + if (adapter->portnum == 0) { + qlcnic_get_board_name(adapter, board_name); + + pr_info("%s: %s Board Chip rev 0x%x\n", + module_name(THIS_MODULE), + board_name, adapter->ahw->revision_id); + } + + if (qlcnic_83xx_check(adapter) && !qlcnic_use_msi_x && + !!qlcnic_use_msi) + dev_warn(&pdev->dev, + "Device does not support MSI interrupts\n"); + + if (qlcnic_82xx_check(adapter)) { + qlcnic_dcb_enable(adapter->dcb); + qlcnic_dcb_get_info(adapter->dcb); + err = qlcnic_setup_intr(adapter); + + if (err) { + dev_err(&pdev->dev, "Failed to setup interrupt\n"); + goto err_out_disable_msi; + } + } + + err = qlcnic_get_act_pci_func(adapter); + if (err) + goto err_out_disable_mbx_intr; + + if (adapter->portnum == 0) + qlcnic_set_drv_version(adapter); + + err = qlcnic_setup_netdev(adapter, netdev, pci_using_dac); + if (err) + goto err_out_disable_mbx_intr; + + pci_set_drvdata(pdev, adapter); + + if (qlcnic_82xx_check(adapter)) + qlcnic_schedule_work(adapter, qlcnic_fw_poll_work, + FW_POLL_DELAY); + + switch (adapter->ahw->port_type) { + case QLCNIC_GBE: + dev_info(&adapter->pdev->dev, "%s: GbE port initialized\n", + adapter->netdev->name); + break; + case QLCNIC_XGBE: + dev_info(&adapter->pdev->dev, "%s: XGbE port initialized\n", + adapter->netdev->name); + break; + } + + if (adapter->drv_mac_learn) + qlcnic_alloc_lb_filters_mem(adapter); + + qlcnic_add_sysfs(adapter); + qlcnic_register_hwmon_dev(adapter); + return 0; + +err_out_disable_mbx_intr: + if (qlcnic_83xx_check(adapter)) + qlcnic_83xx_free_mbx_intr(adapter); + +err_out_disable_msi: + qlcnic_teardown_intr(adapter); + qlcnic_cancel_idc_work(adapter); + qlcnic_clr_all_drv_state(adapter, 0); + +err_out_free_hw: + qlcnic_free_adapter_resources(adapter); + +err_out_free_wq: + destroy_workqueue(adapter->qlcnic_wq); + 
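+/* The remaining labels keep unwinding in reverse order of allocation:
+ * netdev, PCI BAR mapping, hardware context, PCI regions, PCI device.
+ */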
+err_out_free_netdev: + free_netdev(netdev); + +err_out_iounmap: + qlcnic_cleanup_pci_map(ahw); + +err_out_free_hw_res: + kfree(ahw); + +err_out_free_res: + pci_release_regions(pdev); + +err_out_disable_pdev: + pci_disable_device(pdev); + return err; + +err_out_maintenance_mode: + set_bit(__QLCNIC_MAINTENANCE_MODE, &adapter->state); + netdev->netdev_ops = &qlcnic_netdev_failed_ops; + netdev->ethtool_ops = &qlcnic_ethtool_failed_ops; + ahw->port_type = QLCNIC_XGBE; + + if (qlcnic_83xx_check(adapter)) + adapter->tgt_status_reg = NULL; + else + ahw->board_type = QLCNIC_BRDTYPE_P3P_10G_SFP_PLUS; + + err = register_netdev(netdev); + + if (err) { + dev_err(&pdev->dev, "Failed to register net device\n"); + qlcnic_clr_all_drv_state(adapter, 0); + goto err_out_free_hw; + } + + pci_set_drvdata(pdev, adapter); + qlcnic_add_sysfs(adapter); + + return 0; +} + +static void qlcnic_remove(struct pci_dev *pdev) +{ + struct qlcnic_adapter *adapter; + struct net_device *netdev; + struct qlcnic_hardware_context *ahw; + + adapter = pci_get_drvdata(pdev); + if (adapter == NULL) + return; + + netdev = adapter->netdev; + + qlcnic_cancel_idc_work(adapter); + qlcnic_sriov_pf_disable(adapter); + ahw = adapter->ahw; + + unregister_netdev(netdev); + qlcnic_sriov_cleanup(adapter); + + if (qlcnic_83xx_check(adapter)) { + qlcnic_83xx_initialize_nic(adapter, 0); + cancel_delayed_work_sync(&adapter->idc_aen_work); + qlcnic_83xx_free_mbx_intr(adapter); + qlcnic_83xx_detach_mailbox_work(adapter); + qlcnic_83xx_free_mailbox(ahw->mailbox); + kfree(ahw->fw_info); + } + + qlcnic_dcb_free(adapter->dcb); + qlcnic_detach(adapter); + kfree(adapter->npars); + kfree(adapter->eswitch); + + if (qlcnic_82xx_check(adapter)) + qlcnic_clr_all_drv_state(adapter, 0); + + clear_bit(__QLCNIC_RESETTING, &adapter->state); + + qlcnic_free_lb_filters_mem(adapter); + + qlcnic_teardown_intr(adapter); + + qlcnic_remove_sysfs(adapter); + + qlcnic_unregister_hwmon_dev(adapter); + + qlcnic_cleanup_pci_map(adapter->ahw); + + qlcnic_release_firmware(adapter); + + pci_disable_pcie_error_reporting(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); + + if (adapter->qlcnic_wq) { + destroy_workqueue(adapter->qlcnic_wq); + adapter->qlcnic_wq = NULL; + } + + qlcnic_free_adapter_resources(adapter); + kfree(ahw); + free_netdev(netdev); +} + +static void qlcnic_shutdown(struct pci_dev *pdev) +{ + if (__qlcnic_shutdown(pdev)) + return; + + pci_disable_device(pdev); +} + +#ifdef CONFIG_PM +static int qlcnic_suspend(struct pci_dev *pdev, pm_message_t state) +{ + int retval; + + retval = __qlcnic_shutdown(pdev); + if (retval) + return retval; + + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + return 0; +} + +static int qlcnic_resume(struct pci_dev *pdev) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + int err; + + err = pci_enable_device(pdev); + if (err) + return err; + + pci_set_power_state(pdev, PCI_D0); + pci_set_master(pdev); + pci_restore_state(pdev); + + return __qlcnic_resume(adapter); +} +#endif + +static int qlcnic_open(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + int err; + + if (test_bit(__QLCNIC_MAINTENANCE_MODE, &adapter->state)) { + netdev_err(netdev, "%s: Device is in non-operational state\n", + __func__); + + return -EIO; + } + + netif_carrier_off(netdev); + + err = qlcnic_attach(adapter); + if (err) + return err; + + err = __qlcnic_up(adapter, netdev); + if (err) + qlcnic_detach(adapter); + + return err; +} + +/* + * qlcnic_close - Disables a network interface entry 
point + */ +static int qlcnic_close(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + + __qlcnic_down(adapter, netdev); + + return 0; +} + +#define QLCNIC_VF_LB_BUCKET_SIZE 1 + +void qlcnic_alloc_lb_filters_mem(struct qlcnic_adapter *adapter) +{ + void *head; + int i; + struct net_device *netdev = adapter->netdev; + u32 filter_size = 0; + u16 act_pci_func = 0; + + if (adapter->fhash.fmax && adapter->fhash.fhead) + return; + + act_pci_func = adapter->ahw->total_nic_func; + spin_lock_init(&adapter->mac_learn_lock); + spin_lock_init(&adapter->rx_mac_learn_lock); + + if (qlcnic_sriov_vf_check(adapter)) { + filter_size = QLCNIC_83XX_SRIOV_VF_MAX_MAC - 1; + adapter->fhash.fbucket_size = QLCNIC_VF_LB_BUCKET_SIZE; + } else if (qlcnic_82xx_check(adapter)) { + filter_size = QLCNIC_LB_MAX_FILTERS; + adapter->fhash.fbucket_size = QLCNIC_LB_BUCKET_SIZE; + } else { + filter_size = QLC_83XX_LB_MAX_FILTERS; + adapter->fhash.fbucket_size = QLC_83XX_LB_BUCKET_SIZE; + } + + head = kcalloc(adapter->fhash.fbucket_size, + sizeof(struct hlist_head), GFP_ATOMIC); + + if (!head) + return; + + adapter->fhash.fmax = (filter_size / act_pci_func); + adapter->fhash.fhead = head; + + netdev_info(netdev, "active nic func = %d, mac filter size=%d\n", + act_pci_func, adapter->fhash.fmax); + + for (i = 0; i < adapter->fhash.fbucket_size; i++) + INIT_HLIST_HEAD(&adapter->fhash.fhead[i]); + + adapter->rx_fhash.fbucket_size = adapter->fhash.fbucket_size; + + head = kcalloc(adapter->rx_fhash.fbucket_size, + sizeof(struct hlist_head), GFP_ATOMIC); + + if (!head) + return; + + adapter->rx_fhash.fmax = (filter_size / act_pci_func); + adapter->rx_fhash.fhead = head; + + for (i = 0; i < adapter->rx_fhash.fbucket_size; i++) + INIT_HLIST_HEAD(&adapter->rx_fhash.fhead[i]); +} + +static void qlcnic_free_lb_filters_mem(struct qlcnic_adapter *adapter) +{ + if (adapter->fhash.fmax) + kfree(adapter->fhash.fhead); + + adapter->fhash.fhead = NULL; + adapter->fhash.fmax = 0; + + if (adapter->rx_fhash.fmax) + kfree(adapter->rx_fhash.fhead); + + adapter->rx_fhash.fmax = 0; + adapter->rx_fhash.fhead = NULL; +} + +int qlcnic_check_temp(struct qlcnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + u32 temp_state, temp_val, temp = 0; + int rv = 0; + + if (qlcnic_83xx_check(adapter)) + temp = QLCRDX(adapter->ahw, QLC_83XX_ASIC_TEMP); + + if (qlcnic_82xx_check(adapter)) + temp = QLC_SHARED_REG_RD32(adapter, QLCNIC_ASIC_TEMP); + + temp_state = qlcnic_get_temp_state(temp); + temp_val = qlcnic_get_temp_val(temp); + + if (temp_state == QLCNIC_TEMP_PANIC) { + dev_err(&netdev->dev, + "Device temperature %d degrees C exceeds" + " maximum allowed. Hardware has been shut down.\n", + temp_val); + rv = 1; + } else if (temp_state == QLCNIC_TEMP_WARN) { + if (adapter->ahw->temp == QLCNIC_TEMP_NORMAL) { + dev_err(&netdev->dev, + "Device temperature %d degrees C " + "exceeds operating range." 
+ " Immediate action needed.\n", + temp_val); + } + } else { + if (adapter->ahw->temp == QLCNIC_TEMP_WARN) { + dev_info(&netdev->dev, + "Device temperature is now %d degrees C" + " in normal range.\n", temp_val); + } + } + adapter->ahw->temp = temp_state; + return rv; +} + +static inline void dump_tx_ring_desc(struct qlcnic_host_tx_ring *tx_ring) +{ + int i; + struct cmd_desc_type0 *tx_desc_info; + + for (i = 0; i < tx_ring->num_desc; i++) { + tx_desc_info = &tx_ring->desc_head[i]; + pr_info("TX Desc: %d\n", i); + print_hex_dump(KERN_INFO, "TX: ", DUMP_PREFIX_OFFSET, 16, 1, + &tx_ring->desc_head[i], + sizeof(struct cmd_desc_type0), true); + } +} + +static void qlcnic_dump_rings(struct qlcnic_adapter *adapter) +{ + struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx; + struct net_device *netdev = adapter->netdev; + struct qlcnic_host_rds_ring *rds_ring; + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_host_tx_ring *tx_ring; + int ring; + + if (!netdev || !netif_running(netdev)) + return; + + for (ring = 0; ring < adapter->max_rds_rings; ring++) { + rds_ring = &recv_ctx->rds_rings[ring]; + if (!rds_ring) + continue; + netdev_info(netdev, + "rds_ring=%d crb_rcv_producer=%d producer=%u num_desc=%u\n", + ring, readl(rds_ring->crb_rcv_producer), + rds_ring->producer, rds_ring->num_desc); + } + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &(recv_ctx->sds_rings[ring]); + if (!sds_ring) + continue; + netdev_info(netdev, + "sds_ring=%d crb_sts_consumer=%d consumer=%u crb_intr_mask=%d num_desc=%u\n", + ring, readl(sds_ring->crb_sts_consumer), + sds_ring->consumer, readl(sds_ring->crb_intr_mask), + sds_ring->num_desc); + } + + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + if (!tx_ring) + continue; + netdev_info(netdev, "Tx ring=%d Context Id=0x%x\n", + ring, tx_ring->ctx_id); + netdev_info(netdev, + "xmit_finished=%llu, xmit_called=%llu, xmit_on=%llu, xmit_off=%llu\n", + tx_ring->tx_stats.xmit_finished, + tx_ring->tx_stats.xmit_called, + tx_ring->tx_stats.xmit_on, + tx_ring->tx_stats.xmit_off); + + if (tx_ring->crb_intr_mask) + netdev_info(netdev, "crb_intr_mask=%d\n", + readl(tx_ring->crb_intr_mask)); + + netdev_info(netdev, + "hw_producer=%d, sw_producer=%d sw_consumer=%d, hw_consumer=%d\n", + readl(tx_ring->crb_cmd_producer), + tx_ring->producer, tx_ring->sw_consumer, + le32_to_cpu(*(tx_ring->hw_consumer))); + + netdev_info(netdev, "Total desc=%d, Available desc=%d\n", + tx_ring->num_desc, qlcnic_tx_avail(tx_ring)); + + if (netif_msg_tx_err(adapter->ahw)) + dump_tx_ring_desc(tx_ring); + } + +} + +static void qlcnic_tx_timeout(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) + return; + + qlcnic_dump_rings(adapter); + + if (++adapter->tx_timeo_cnt >= QLCNIC_MAX_TX_TIMEOUTS || + netif_msg_tx_err(adapter->ahw)) { + netdev_err(netdev, "Tx timeout, reset the adapter.\n"); + if (qlcnic_82xx_check(adapter)) + adapter->need_fw_reset = 1; + else if (qlcnic_83xx_check(adapter)) + qlcnic_83xx_idc_request_reset(adapter, + QLCNIC_FORCE_FW_DUMP_KEY); + } else { + netdev_err(netdev, "Tx timeout, reset adapter context.\n"); + adapter->ahw->reset_context = 1; + } +} + +static struct net_device_stats *qlcnic_get_stats(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct net_device_stats *stats = &netdev->stats; + + if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) + qlcnic_update_stats(adapter); + + 
stats->rx_packets = adapter->stats.rx_pkts + adapter->stats.lro_pkts; + stats->tx_packets = adapter->stats.xmitfinished; + stats->rx_bytes = adapter->stats.rxbytes + adapter->stats.lrobytes; + stats->tx_bytes = adapter->stats.txbytes; + stats->rx_dropped = adapter->stats.rxdropped; + stats->tx_dropped = adapter->stats.txdropped; + + return stats; +} + +static irqreturn_t qlcnic_82xx_clear_legacy_intr(struct qlcnic_adapter *adapter) +{ + u32 status; + + status = readl(adapter->isr_int_vec); + + if (!(status & adapter->ahw->int_vec_bit)) + return IRQ_NONE; + + /* check interrupt state machine, to be sure */ + status = readl(adapter->crb_int_state_reg); + if (!ISR_LEGACY_INT_TRIGGERED(status)) + return IRQ_NONE; + + writel(0xffffffff, adapter->tgt_status_reg); + /* read twice to ensure write is flushed */ + readl(adapter->isr_int_vec); + readl(adapter->isr_int_vec); + + return IRQ_HANDLED; +} + +static irqreturn_t qlcnic_tmp_intr(int irq, void *data) +{ + struct qlcnic_host_sds_ring *sds_ring = data; + struct qlcnic_adapter *adapter = sds_ring->adapter; + + if (adapter->flags & QLCNIC_MSIX_ENABLED) + goto done; + else if (adapter->flags & QLCNIC_MSI_ENABLED) { + writel(0xffffffff, adapter->tgt_status_reg); + goto done; + } + + if (qlcnic_clear_legacy_intr(adapter) == IRQ_NONE) + return IRQ_NONE; + +done: + adapter->ahw->diag_cnt++; + qlcnic_enable_sds_intr(adapter, sds_ring); + return IRQ_HANDLED; +} + +static irqreturn_t qlcnic_intr(int irq, void *data) +{ + struct qlcnic_host_sds_ring *sds_ring = data; + struct qlcnic_adapter *adapter = sds_ring->adapter; + + if (qlcnic_clear_legacy_intr(adapter) == IRQ_NONE) + return IRQ_NONE; + + napi_schedule(&sds_ring->napi); + + return IRQ_HANDLED; +} + +static irqreturn_t qlcnic_msi_intr(int irq, void *data) +{ + struct qlcnic_host_sds_ring *sds_ring = data; + struct qlcnic_adapter *adapter = sds_ring->adapter; + + /* clear interrupt */ + writel(0xffffffff, adapter->tgt_status_reg); + + napi_schedule(&sds_ring->napi); + return IRQ_HANDLED; +} + +static irqreturn_t qlcnic_msix_intr(int irq, void *data) +{ + struct qlcnic_host_sds_ring *sds_ring = data; + + napi_schedule(&sds_ring->napi); + return IRQ_HANDLED; +} + +static irqreturn_t qlcnic_msix_tx_intr(int irq, void *data) +{ + struct qlcnic_host_tx_ring *tx_ring = data; + + napi_schedule(&tx_ring->napi); + return IRQ_HANDLED; +} + +#ifdef CONFIG_NET_POLL_CONTROLLER +static void qlcnic_poll_controller(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_host_sds_ring *sds_ring; + struct qlcnic_recv_context *recv_ctx; + struct qlcnic_host_tx_ring *tx_ring; + int ring; + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) + return; + + recv_ctx = adapter->recv_ctx; + + for (ring = 0; ring < adapter->drv_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + qlcnic_disable_sds_intr(adapter, sds_ring); + napi_schedule(&sds_ring->napi); + } + + if (adapter->flags & QLCNIC_MSIX_ENABLED) { + /* Only Multi-Tx queue capable devices need to + * schedule NAPI for TX rings + */ + if ((qlcnic_83xx_check(adapter) && + (adapter->flags & QLCNIC_TX_INTR_SHARED)) || + (qlcnic_82xx_check(adapter) && + !qlcnic_check_multi_tx(adapter))) + return; + + for (ring = 0; ring < adapter->drv_tx_rings; ring++) { + tx_ring = &adapter->tx_ring[ring]; + qlcnic_disable_tx_intr(adapter, tx_ring); + napi_schedule(&tx_ring->napi); + } + } +} +#endif + +static void +qlcnic_idc_debug_info(struct qlcnic_adapter *adapter, u8 encoding) +{ + u32 val; + + val = adapter->portnum & 0xf; + 
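+	/* Scratch register layout: bits 3:0 hold the port number, bit 7 the
+	 * encoding flag, and the upper bits the jiffies elapsed since the
+	 * last device reset.
+	 */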
val |= encoding << 7; + val |= (jiffies - adapter->dev_rst_time) << 8; + + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DRV_SCRATCH, val); + adapter->dev_rst_time = jiffies; +} + +static int +qlcnic_set_drv_state(struct qlcnic_adapter *adapter, u8 state) +{ + u32 val; + + WARN_ON(state != QLCNIC_DEV_NEED_RESET && + state != QLCNIC_DEV_NEED_QUISCENT); + + if (qlcnic_api_lock(adapter)) + return -EIO; + + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_STATE); + + if (state == QLCNIC_DEV_NEED_RESET) + QLC_DEV_SET_RST_RDY(val, adapter->portnum); + else if (state == QLCNIC_DEV_NEED_QUISCENT) + QLC_DEV_SET_QSCNT_RDY(val, adapter->portnum); + + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DRV_STATE, val); + + qlcnic_api_unlock(adapter); + + return 0; +} + +static int +qlcnic_clr_drv_state(struct qlcnic_adapter *adapter) +{ + u32 val; + + if (qlcnic_api_lock(adapter)) + return -EBUSY; + + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_STATE); + QLC_DEV_CLR_RST_QSCNT(val, adapter->portnum); + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DRV_STATE, val); + + qlcnic_api_unlock(adapter); + + return 0; +} + +void qlcnic_clr_all_drv_state(struct qlcnic_adapter *adapter, u8 failed) +{ + u32 val; + + if (qlcnic_api_lock(adapter)) + goto err; + + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_ACTIVE); + QLC_DEV_CLR_REF_CNT(val, adapter->portnum); + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DRV_ACTIVE, val); + + if (failed) { + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_STATE, + QLCNIC_DEV_FAILED); + dev_info(&adapter->pdev->dev, + "Device state set to Failed. Please Reboot\n"); + } else if (!(val & 0x11111111)) + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_STATE, + QLCNIC_DEV_COLD); + + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_STATE); + QLC_DEV_CLR_RST_QSCNT(val, adapter->portnum); + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DRV_STATE, val); + + qlcnic_api_unlock(adapter); +err: + adapter->fw_fail_cnt = 0; + adapter->flags &= ~QLCNIC_FW_HANG; + clear_bit(__QLCNIC_START_FW, &adapter->state); + clear_bit(__QLCNIC_RESETTING, &adapter->state); +} + +/* Grab api lock, before checking state */ +static int +qlcnic_check_drv_state(struct qlcnic_adapter *adapter) +{ + int act, state, active_mask; + struct qlcnic_hardware_context *ahw = adapter->ahw; + + state = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_STATE); + act = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_ACTIVE); + + if (adapter->flags & QLCNIC_FW_RESET_OWNER) { + active_mask = (~(1 << (ahw->pci_func * 4))); + act = act & active_mask; + } + + if (((state & 0x11111111) == (act & 0x11111111)) || + ((act & 0x11111111) == ((state >> 1) & 0x11111111))) + return 0; + else + return 1; +} + +static int qlcnic_check_idc_ver(struct qlcnic_adapter *adapter) +{ + u32 val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_IDC_VER); + + if (val != QLCNIC_DRV_IDC_VER) { + dev_warn(&adapter->pdev->dev, "IDC Version mismatch, driver's" + " idc ver = %x; reqd = %x\n", QLCNIC_DRV_IDC_VER, val); + } + + return 0; +} + +static int +qlcnic_can_start_firmware(struct qlcnic_adapter *adapter) +{ + u32 val, prev_state; + u8 dev_init_timeo = adapter->dev_init_timeo; + u8 portnum = adapter->portnum; + u8 ret; + + if (test_and_clear_bit(__QLCNIC_START_FW, &adapter->state)) + return 1; + + if (qlcnic_api_lock(adapter)) + return -1; + + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_ACTIVE); + if (!(val & (1 << (portnum * 4)))) { + QLC_DEV_SET_REF_CNT(val, portnum); + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DRV_ACTIVE, val); + } + + prev_state = QLC_SHARED_REG_RD32(adapter, 
QLCNIC_CRB_DEV_STATE); + QLCDB(adapter, HW, "Device state = %u\n", prev_state); + + switch (prev_state) { + case QLCNIC_DEV_COLD: + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_STATE, + QLCNIC_DEV_INITIALIZING); + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DRV_IDC_VER, + QLCNIC_DRV_IDC_VER); + qlcnic_idc_debug_info(adapter, 0); + qlcnic_api_unlock(adapter); + return 1; + + case QLCNIC_DEV_READY: + ret = qlcnic_check_idc_ver(adapter); + qlcnic_api_unlock(adapter); + return ret; + + case QLCNIC_DEV_NEED_RESET: + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_STATE); + QLC_DEV_SET_RST_RDY(val, portnum); + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DRV_STATE, val); + break; + + case QLCNIC_DEV_NEED_QUISCENT: + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_STATE); + QLC_DEV_SET_QSCNT_RDY(val, portnum); + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DRV_STATE, val); + break; + + case QLCNIC_DEV_FAILED: + dev_err(&adapter->pdev->dev, "Device in failed state.\n"); + qlcnic_api_unlock(adapter); + return -1; + + case QLCNIC_DEV_INITIALIZING: + case QLCNIC_DEV_QUISCENT: + break; + } + + qlcnic_api_unlock(adapter); + + do { + msleep(1000); + prev_state = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DEV_STATE); + + if (prev_state == QLCNIC_DEV_QUISCENT) + continue; + } while ((prev_state != QLCNIC_DEV_READY) && --dev_init_timeo); + + if (!dev_init_timeo) { + dev_err(&adapter->pdev->dev, + "Waiting for device to initialize timeout\n"); + return -1; + } + + if (qlcnic_api_lock(adapter)) + return -1; + + val = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DRV_STATE); + QLC_DEV_CLR_RST_QSCNT(val, portnum); + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DRV_STATE, val); + + ret = qlcnic_check_idc_ver(adapter); + qlcnic_api_unlock(adapter); + + return ret; +} + +static void +qlcnic_fwinit_work(struct work_struct *work) +{ + struct qlcnic_adapter *adapter = container_of(work, + struct qlcnic_adapter, fw_work.work); + u32 dev_state = 0xf; + u32 val; + + if (qlcnic_api_lock(adapter)) + goto err_ret; + + dev_state = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DEV_STATE); + if (dev_state == QLCNIC_DEV_QUISCENT || + dev_state == QLCNIC_DEV_NEED_QUISCENT) { + qlcnic_api_unlock(adapter); + qlcnic_schedule_work(adapter, qlcnic_fwinit_work, + FW_POLL_DELAY * 2); + return; + } + + if (adapter->ahw->op_mode == QLCNIC_NON_PRIV_FUNC) { + qlcnic_api_unlock(adapter); + goto wait_npar; + } + + if (dev_state == QLCNIC_DEV_INITIALIZING || + dev_state == QLCNIC_DEV_READY) { + dev_info(&adapter->pdev->dev, "Detected state change from " + "DEV_NEED_RESET, skipping ack check\n"); + goto skip_ack_check; + } + + if (adapter->fw_wait_cnt++ > adapter->reset_ack_timeo) { + dev_info(&adapter->pdev->dev, "Reset:Failed to get ack %d sec\n", + adapter->reset_ack_timeo); + goto skip_ack_check; + } + + if (!qlcnic_check_drv_state(adapter)) { +skip_ack_check: + dev_state = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DEV_STATE); + + if (dev_state == QLCNIC_DEV_NEED_RESET) { + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_STATE, + QLCNIC_DEV_INITIALIZING); + set_bit(__QLCNIC_START_FW, &adapter->state); + QLCDB(adapter, DRV, "Restarting fw\n"); + qlcnic_idc_debug_info(adapter, 0); + val = QLC_SHARED_REG_RD32(adapter, + QLCNIC_CRB_DRV_STATE); + QLC_DEV_SET_RST_RDY(val, adapter->portnum); + QLC_SHARED_REG_WR32(adapter, + QLCNIC_CRB_DRV_STATE, val); + } + + qlcnic_api_unlock(adapter); + + rtnl_lock(); + if (qlcnic_check_fw_dump_state(adapter) && + (adapter->flags & QLCNIC_FW_RESET_OWNER)) { + QLCDB(adapter, DRV, "Take FW dump\n"); + qlcnic_dump_fw(adapter); + adapter->flags |= 
QLCNIC_FW_HANG;
+		}
+		rtnl_unlock();
+
+		adapter->flags &= ~QLCNIC_FW_RESET_OWNER;
+		if (!adapter->nic_ops->start_firmware(adapter)) {
+			qlcnic_schedule_work(adapter, qlcnic_attach_work, 0);
+			adapter->fw_wait_cnt = 0;
+			return;
+		}
+		goto err_ret;
+	}
+
+	qlcnic_api_unlock(adapter);
+
+wait_npar:
+	dev_state = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DEV_STATE);
+	QLCDB(adapter, HW, "Func waiting: Device state=%u\n", dev_state);
+
+	switch (dev_state) {
+	case QLCNIC_DEV_READY:
+		if (!qlcnic_start_firmware(adapter)) {
+			qlcnic_schedule_work(adapter, qlcnic_attach_work, 0);
+			adapter->fw_wait_cnt = 0;
+			return;
+		}
+	case QLCNIC_DEV_FAILED:
+		break;
+	default:
+		qlcnic_schedule_work(adapter,
+			qlcnic_fwinit_work, FW_POLL_DELAY);
+		return;
+	}
+
+err_ret:
+	dev_err(&adapter->pdev->dev, "Fwinit work failed state=%u "
+		"fw_wait_cnt=%u\n", dev_state, adapter->fw_wait_cnt);
+	netif_device_attach(adapter->netdev);
+	qlcnic_clr_all_drv_state(adapter, 0);
+}
+
+static void
+qlcnic_detach_work(struct work_struct *work)
+{
+	struct qlcnic_adapter *adapter = container_of(work,
+			struct qlcnic_adapter, fw_work.work);
+	struct net_device *netdev = adapter->netdev;
+	u32 status;
+
+	netif_device_detach(netdev);
+
+	/* Don't grab the rtnl lock during quiescent mode */
+	if (adapter->dev_state == QLCNIC_DEV_NEED_QUISCENT) {
+		if (netif_running(netdev))
+			__qlcnic_down(adapter, netdev);
+	} else
+		qlcnic_down(adapter, netdev);
+
+	status = QLC_SHARED_REG_RD32(adapter, QLCNIC_PEG_HALT_STATUS1);
+
+	if (status & QLCNIC_RCODE_FATAL_ERROR) {
+		dev_err(&adapter->pdev->dev,
+			"Detaching the device: peg halt status1=0x%x\n",
+			status);
+
+		if (QLCNIC_FWERROR_CODE(status) == QLCNIC_FWERROR_FAN_FAILURE) {
+			dev_err(&adapter->pdev->dev,
+				"On board active cooling fan failed. "
+				"Device has been halted.\n");
+			dev_err(&adapter->pdev->dev,
+				"Replace the adapter.\n");
+		}
+
+		goto err_ret;
+	}
+
+	if (adapter->ahw->temp == QLCNIC_TEMP_PANIC) {
+		dev_err(&adapter->pdev->dev, "Detaching the device: temp=%d\n",
+			adapter->ahw->temp);
+		goto err_ret;
+	}
+
+	/* Don't ack if this instance is the reset owner */
+	if (!(adapter->flags & QLCNIC_FW_RESET_OWNER)) {
+		if (qlcnic_set_drv_state(adapter, adapter->dev_state)) {
+			dev_err(&adapter->pdev->dev,
+				"Failed to set driver state, "
+				"detaching the device.\n");
+			goto err_ret;
+		}
+	}
+
+	adapter->fw_wait_cnt = 0;
+
+	qlcnic_schedule_work(adapter, qlcnic_fwinit_work, FW_POLL_DELAY);
+
+	return;
+
+err_ret:
+	netif_device_attach(netdev);
+	qlcnic_clr_all_drv_state(adapter, 1);
+}
+
+/* Transition NPAR state to NON Operational */
+static void
+qlcnic_set_npar_non_operational(struct qlcnic_adapter *adapter)
+{
+	u32 state;
+
+	state = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DEV_NPAR_STATE);
+	if (state == QLCNIC_DEV_NPAR_NON_OPER)
+		return;
+
+	if (qlcnic_api_lock(adapter))
+		return;
+	QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_NPAR_STATE,
+			    QLCNIC_DEV_NPAR_NON_OPER);
+	qlcnic_api_unlock(adapter);
+}
+
+static void qlcnic_82xx_dev_request_reset(struct qlcnic_adapter *adapter,
+					  u32 key)
+{
+	u32 state, xg_val = 0, gb_val = 0;
+
+	qlcnic_xg_set_xg0_mask(xg_val);
+	qlcnic_xg_set_xg1_mask(xg_val);
+	QLCWR32(adapter, QLCNIC_NIU_XG_PAUSE_CTL, xg_val);
+	qlcnic_gb_set_gb0_mask(gb_val);
+	qlcnic_gb_set_gb1_mask(gb_val);
+	qlcnic_gb_set_gb2_mask(gb_val);
+	qlcnic_gb_set_gb3_mask(gb_val);
+	QLCWR32(adapter, QLCNIC_NIU_GB_PAUSE_CTL, gb_val);
+	dev_info(&adapter->pdev->dev, "Pause control frames disabled"
+		 " on all ports\n");
+	adapter->need_fw_reset = 1;
+
+	if (qlcnic_api_lock(adapter))
return; + + state = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DEV_STATE); + + if (test_bit(__QLCNIC_MAINTENANCE_MODE, &adapter->state)) { + netdev_err(adapter->netdev, "%s: Device is in non-operational state\n", + __func__); + qlcnic_api_unlock(adapter); + + return; + } + + if (state == QLCNIC_DEV_READY) { + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_STATE, + QLCNIC_DEV_NEED_RESET); + adapter->flags |= QLCNIC_FW_RESET_OWNER; + QLCDB(adapter, DRV, "NEED_RESET state set\n"); + qlcnic_idc_debug_info(adapter, 0); + } + + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_NPAR_STATE, + QLCNIC_DEV_NPAR_NON_OPER); + qlcnic_api_unlock(adapter); +} + +/* Transit to NPAR READY state from NPAR NOT READY state */ +static void +qlcnic_dev_set_npar_ready(struct qlcnic_adapter *adapter) +{ + if (qlcnic_api_lock(adapter)) + return; + + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_NPAR_STATE, + QLCNIC_DEV_NPAR_OPER); + QLCDB(adapter, DRV, "NPAR operational state set\n"); + + qlcnic_api_unlock(adapter); +} + +void qlcnic_schedule_work(struct qlcnic_adapter *adapter, + work_func_t func, int delay) +{ + if (test_bit(__QLCNIC_AER, &adapter->state)) + return; + + INIT_DELAYED_WORK(&adapter->fw_work, func); + queue_delayed_work(adapter->qlcnic_wq, &adapter->fw_work, + round_jiffies_relative(delay)); +} + +static void +qlcnic_attach_work(struct work_struct *work) +{ + struct qlcnic_adapter *adapter = container_of(work, + struct qlcnic_adapter, fw_work.work); + struct net_device *netdev = adapter->netdev; + u32 npar_state; + + if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) { + npar_state = QLC_SHARED_REG_RD32(adapter, + QLCNIC_CRB_DEV_NPAR_STATE); + if (adapter->fw_wait_cnt++ > QLCNIC_DEV_NPAR_OPER_TIMEO) + qlcnic_clr_all_drv_state(adapter, 0); + else if (npar_state != QLCNIC_DEV_NPAR_OPER) + qlcnic_schedule_work(adapter, qlcnic_attach_work, + FW_POLL_DELAY); + else + goto attach; + QLCDB(adapter, DRV, "Waiting for NPAR state to operational\n"); + return; + } +attach: + qlcnic_dcb_get_info(adapter->dcb); + + if (netif_running(netdev)) { + if (qlcnic_up(adapter, netdev)) + goto done; + + qlcnic_restore_indev_addr(netdev, NETDEV_UP); + } + +done: + netif_device_attach(netdev); + adapter->fw_fail_cnt = 0; + adapter->flags &= ~QLCNIC_FW_HANG; + clear_bit(__QLCNIC_RESETTING, &adapter->state); + if (adapter->portnum == 0) + qlcnic_set_drv_version(adapter); + + if (!qlcnic_clr_drv_state(adapter)) + qlcnic_schedule_work(adapter, qlcnic_fw_poll_work, + FW_POLL_DELAY); +} + +static int +qlcnic_check_health(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_fw_dump *fw_dump = &ahw->fw_dump; + u32 state = 0, heartbeat; + u32 peg_status; + int err = 0; + + if (qlcnic_check_temp(adapter)) + goto detach; + + if (adapter->need_fw_reset) + qlcnic_dev_request_reset(adapter, 0); + + state = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DEV_STATE); + if (state == QLCNIC_DEV_NEED_RESET) { + qlcnic_set_npar_non_operational(adapter); + adapter->need_fw_reset = 1; + } else if (state == QLCNIC_DEV_NEED_QUISCENT) + goto detach; + + heartbeat = QLC_SHARED_REG_RD32(adapter, QLCNIC_PEG_ALIVE_COUNTER); + if (heartbeat != adapter->heartbeat) { + adapter->heartbeat = heartbeat; + adapter->fw_fail_cnt = 0; + if (adapter->need_fw_reset) + goto detach; + + if (ahw->reset_context && qlcnic_auto_fw_reset) + qlcnic_reset_hw_context(adapter); + + return 0; + } + + if (++adapter->fw_fail_cnt < FW_FAIL_THRESH) + return 0; + + adapter->flags |= QLCNIC_FW_HANG; + + qlcnic_dev_request_reset(adapter, 0); + + if 
(qlcnic_auto_fw_reset) + clear_bit(__QLCNIC_FW_ATTACHED, &adapter->state); + + dev_err(&adapter->pdev->dev, "firmware hang detected\n"); + peg_status = QLC_SHARED_REG_RD32(adapter, QLCNIC_PEG_HALT_STATUS1); + dev_err(&adapter->pdev->dev, "Dumping hw/fw registers\n" + "PEG_HALT_STATUS1: 0x%x, PEG_HALT_STATUS2: 0x%x,\n" + "PEG_NET_0_PC: 0x%x, PEG_NET_1_PC: 0x%x,\n" + "PEG_NET_2_PC: 0x%x, PEG_NET_3_PC: 0x%x,\n" + "PEG_NET_4_PC: 0x%x\n", + peg_status, + QLC_SHARED_REG_RD32(adapter, QLCNIC_PEG_HALT_STATUS2), + QLCRD32(adapter, QLCNIC_CRB_PEG_NET_0 + 0x3c, &err), + QLCRD32(adapter, QLCNIC_CRB_PEG_NET_1 + 0x3c, &err), + QLCRD32(adapter, QLCNIC_CRB_PEG_NET_2 + 0x3c, &err), + QLCRD32(adapter, QLCNIC_CRB_PEG_NET_3 + 0x3c, &err), + QLCRD32(adapter, QLCNIC_CRB_PEG_NET_4 + 0x3c, &err)); + if (QLCNIC_FWERROR_CODE(peg_status) == 0x67) + dev_err(&adapter->pdev->dev, + "Firmware aborted with error code 0x00006700. " + "Device is being reset.\n"); +detach: + adapter->dev_state = (state == QLCNIC_DEV_NEED_QUISCENT) ? state : + QLCNIC_DEV_NEED_RESET; + + if (qlcnic_auto_fw_reset && !test_and_set_bit(__QLCNIC_RESETTING, + &adapter->state)) { + + qlcnic_schedule_work(adapter, qlcnic_detach_work, 0); + QLCDB(adapter, DRV, "fw recovery scheduled.\n"); + } else if (!qlcnic_auto_fw_reset && fw_dump->enable && + adapter->flags & QLCNIC_FW_RESET_OWNER) { + qlcnic_dump_fw(adapter); + } + + return 1; +} + +void qlcnic_fw_poll_work(struct work_struct *work) +{ + struct qlcnic_adapter *adapter = container_of(work, + struct qlcnic_adapter, fw_work.work); + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) + goto reschedule; + + + if (qlcnic_check_health(adapter)) + return; + + if (adapter->fhash.fnum) + qlcnic_prune_lb_filters(adapter); + +reschedule: + qlcnic_schedule_work(adapter, qlcnic_fw_poll_work, FW_POLL_DELAY); +} + +static int qlcnic_is_first_func(struct pci_dev *pdev) +{ + struct pci_dev *oth_pdev; + int val = pdev->devfn; + + while (val-- > 0) { + oth_pdev = pci_get_domain_bus_and_slot(pci_domain_nr + (pdev->bus), pdev->bus->number, + PCI_DEVFN(PCI_SLOT(pdev->devfn), val)); + if (!oth_pdev) + continue; + + if (oth_pdev->current_state != PCI_D3cold) { + pci_dev_put(oth_pdev); + return 0; + } + pci_dev_put(oth_pdev); + } + return 1; +} + +static int qlcnic_attach_func(struct pci_dev *pdev) +{ + int err, first_func; + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + struct net_device *netdev = adapter->netdev; + + pdev->error_state = pci_channel_io_normal; + + err = pci_enable_device(pdev); + if (err) + return err; + + pci_set_master(pdev); + pci_restore_state(pdev); + + first_func = qlcnic_is_first_func(pdev); + + if (qlcnic_api_lock(adapter)) + return -EINVAL; + + if (adapter->ahw->op_mode != QLCNIC_NON_PRIV_FUNC && first_func) { + adapter->need_fw_reset = 1; + set_bit(__QLCNIC_START_FW, &adapter->state); + QLC_SHARED_REG_WR32(adapter, QLCNIC_CRB_DEV_STATE, + QLCNIC_DEV_INITIALIZING); + QLCDB(adapter, DRV, "Restarting fw\n"); + } + qlcnic_api_unlock(adapter); + + err = qlcnic_start_firmware(adapter); + if (err) + return err; + + qlcnic_clr_drv_state(adapter); + kfree(adapter->msix_entries); + adapter->msix_entries = NULL; + err = qlcnic_setup_intr(adapter); + + if (err) { + kfree(adapter->msix_entries); + netdev_err(netdev, "failed to setup interrupt\n"); + return err; + } + + if (netif_running(netdev)) { + err = qlcnic_attach(adapter); + if (err) { + qlcnic_clr_all_drv_state(adapter, 1); + clear_bit(__QLCNIC_AER, &adapter->state); + netif_device_attach(netdev); + return err; + } + + err = 
qlcnic_up(adapter, netdev); + if (err) + goto done; + + qlcnic_restore_indev_addr(netdev, NETDEV_UP); + } + done: + netif_device_attach(netdev); + return err; +} + +static pci_ers_result_t qlcnic_82xx_io_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + struct net_device *netdev = adapter->netdev; + + if (state == pci_channel_io_perm_failure) + return PCI_ERS_RESULT_DISCONNECT; + + if (state == pci_channel_io_normal) + return PCI_ERS_RESULT_RECOVERED; + + set_bit(__QLCNIC_AER, &adapter->state); + netif_device_detach(netdev); + + cancel_delayed_work_sync(&adapter->fw_work); + + if (netif_running(netdev)) + qlcnic_down(adapter, netdev); + + qlcnic_detach(adapter); + qlcnic_teardown_intr(adapter); + + clear_bit(__QLCNIC_RESETTING, &adapter->state); + + pci_save_state(pdev); + pci_disable_device(pdev); + + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t qlcnic_82xx_io_slot_reset(struct pci_dev *pdev) +{ + pci_ers_result_t res; + + rtnl_lock(); + res = qlcnic_attach_func(pdev) ? PCI_ERS_RESULT_DISCONNECT : + PCI_ERS_RESULT_RECOVERED; + rtnl_unlock(); + + return res; +} + +static void qlcnic_82xx_io_resume(struct pci_dev *pdev) +{ + u32 state; + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + + pci_cleanup_aer_uncorrect_error_status(pdev); + state = QLC_SHARED_REG_RD32(adapter, QLCNIC_CRB_DEV_STATE); + if (state == QLCNIC_DEV_READY && test_and_clear_bit(__QLCNIC_AER, + &adapter->state)) + qlcnic_schedule_work(adapter, qlcnic_fw_poll_work, + FW_POLL_DELAY); +} + +static pci_ers_result_t qlcnic_io_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + struct qlcnic_hardware_ops *hw_ops = adapter->ahw->hw_ops; + + if (hw_ops->io_error_detected) { + return hw_ops->io_error_detected(pdev, state); + } else { + dev_err(&pdev->dev, "AER error_detected handler not registered.\n"); + return PCI_ERS_RESULT_DISCONNECT; + } +} + +static pci_ers_result_t qlcnic_io_slot_reset(struct pci_dev *pdev) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + struct qlcnic_hardware_ops *hw_ops = adapter->ahw->hw_ops; + + if (hw_ops->io_slot_reset) { + return hw_ops->io_slot_reset(pdev); + } else { + dev_err(&pdev->dev, "AER slot_reset handler not registered.\n"); + return PCI_ERS_RESULT_DISCONNECT; + } +} + +static void qlcnic_io_resume(struct pci_dev *pdev) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + struct qlcnic_hardware_ops *hw_ops = adapter->ahw->hw_ops; + + if (hw_ops->io_resume) + hw_ops->io_resume(pdev); + else + dev_err(&pdev->dev, "AER resume handler not registered.\n"); +} + + +static int +qlcnicvf_start_firmware(struct qlcnic_adapter *adapter) +{ + int err; + + err = qlcnic_can_start_firmware(adapter); + if (err) + return err; + + err = qlcnic_check_npar_opertional(adapter); + if (err) + return err; + + err = qlcnic_initialize_nic(adapter); + if (err) + return err; + + qlcnic_check_options(adapter); + + err = qlcnic_set_eswitch_port_config(adapter); + if (err) + return err; + + adapter->need_fw_reset = 0; + + return err; +} + +int qlcnic_validate_rings(struct qlcnic_adapter *adapter, __u32 ring_cnt, + int queue_type) +{ + struct net_device *netdev = adapter->netdev; + u8 max_hw_rings = 0; + char buf[8]; + int cur_rings; + + if (queue_type == QLCNIC_RX_QUEUE) { + max_hw_rings = adapter->max_sds_rings; + cur_rings = adapter->drv_sds_rings; + strcpy(buf, "SDS"); + } else if (queue_type == QLCNIC_TX_QUEUE) { 
+ max_hw_rings = adapter->max_tx_rings; + cur_rings = adapter->drv_tx_rings; + strcpy(buf, "Tx"); + } + + if (!is_power_of_2(ring_cnt)) { + netdev_err(netdev, "%s rings value should be a power of 2\n", + buf); + return -EINVAL; + } + + if (qlcnic_82xx_check(adapter) && (queue_type == QLCNIC_TX_QUEUE) && + !qlcnic_check_multi_tx(adapter)) { + netdev_err(netdev, "No Multi Tx queue support\n"); + return -EINVAL; + } + + if (ring_cnt > num_online_cpus()) { + netdev_err(netdev, + "%s value[%u] should not be higher than, number of online CPUs\n", + buf, num_online_cpus()); + return -EINVAL; + } + + return 0; +} + +int qlcnic_setup_rings(struct qlcnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + u8 tx_rings, rx_rings; + int err; + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) + return -EBUSY; + + tx_rings = adapter->drv_tss_rings; + rx_rings = adapter->drv_rss_rings; + + netif_device_detach(netdev); + + err = qlcnic_set_real_num_queues(adapter, tx_rings, rx_rings); + if (err) + goto done; + + if (netif_running(netdev)) + __qlcnic_down(adapter, netdev); + + qlcnic_detach(adapter); + + if (qlcnic_83xx_check(adapter)) { + qlcnic_83xx_free_mbx_intr(adapter); + qlcnic_83xx_enable_mbx_poll(adapter); + } + + qlcnic_teardown_intr(adapter); + + err = qlcnic_setup_intr(adapter); + if (err) { + kfree(adapter->msix_entries); + netdev_err(netdev, "failed to setup interrupt\n"); + return err; + } + + /* Check if we need to update real_num_{tx|rx}_queues because + * qlcnic_setup_intr() may change Tx/Rx rings size + */ + if ((tx_rings != adapter->drv_tx_rings) || + (rx_rings != adapter->drv_sds_rings)) { + err = qlcnic_set_real_num_queues(adapter, + adapter->drv_tx_rings, + adapter->drv_sds_rings); + if (err) + goto done; + } + + if (qlcnic_83xx_check(adapter)) { + qlcnic_83xx_initialize_nic(adapter, 1); + err = qlcnic_83xx_setup_mbx_intr(adapter); + qlcnic_83xx_disable_mbx_poll(adapter); + if (err) { + dev_err(&adapter->pdev->dev, + "failed to setup mbx interrupt\n"); + goto done; + } + } + + if (netif_running(netdev)) { + err = qlcnic_attach(adapter); + if (err) + goto done; + err = __qlcnic_up(adapter, netdev); + if (err) + goto done; + qlcnic_restore_indev_addr(netdev, NETDEV_UP); + } +done: + netif_device_attach(netdev); + clear_bit(__QLCNIC_RESETTING, &adapter->state); + return err; +} + +#ifdef CONFIG_INET + +#define is_qlcnic_netdev(dev) (dev->netdev_ops == &qlcnic_netdev_ops) + +static void +qlcnic_config_indev_addr(struct qlcnic_adapter *adapter, + struct net_device *dev, unsigned long event) +{ + struct in_device *indev; + + indev = in_dev_get(dev); + if (!indev) + return; + + for_ifa(indev) { + switch (event) { + case NETDEV_UP: + qlcnic_config_ipaddr(adapter, + ifa->ifa_address, QLCNIC_IP_UP); + break; + case NETDEV_DOWN: + qlcnic_config_ipaddr(adapter, + ifa->ifa_address, QLCNIC_IP_DOWN); + break; + default: + break; + } + } endfor_ifa(indev); + + in_dev_put(indev); +} + +void qlcnic_restore_indev_addr(struct net_device *netdev, unsigned long event) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct net_device *dev; + u16 vid; + + qlcnic_config_indev_addr(adapter, netdev, event); + + rcu_read_lock(); + for_each_set_bit(vid, adapter->vlans, VLAN_N_VID) { + dev = __vlan_find_dev_deep_rcu(netdev, htons(ETH_P_8021Q), vid); + if (!dev) + continue; + qlcnic_config_indev_addr(adapter, dev, event); + } + rcu_read_unlock(); +} + +static int qlcnic_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct qlcnic_adapter *adapter; 
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr); + +recheck: + if (dev == NULL) + goto done; + + if (is_vlan_dev(dev)) { + dev = vlan_dev_real_dev(dev); + goto recheck; + } + + if (!is_qlcnic_netdev(dev)) + goto done; + + adapter = netdev_priv(dev); + + if (!adapter) + goto done; + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) + goto done; + + qlcnic_config_indev_addr(adapter, dev, event); +done: + return NOTIFY_DONE; +} + +static int +qlcnic_inetaddr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct qlcnic_adapter *adapter; + struct net_device *dev; + + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + + dev = ifa->ifa_dev ? ifa->ifa_dev->dev : NULL; + +recheck: + if (dev == NULL) + goto done; + + if (is_vlan_dev(dev)) { + dev = vlan_dev_real_dev(dev); + goto recheck; + } + + if (!is_qlcnic_netdev(dev)) + goto done; + + adapter = netdev_priv(dev); + + if (!adapter) + goto done; + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) + goto done; + + switch (event) { + case NETDEV_UP: + qlcnic_config_ipaddr(adapter, ifa->ifa_address, QLCNIC_IP_UP); + + break; + case NETDEV_DOWN: + qlcnic_config_ipaddr(adapter, ifa->ifa_address, QLCNIC_IP_DOWN); + + break; + default: + break; + } + +done: + return NOTIFY_DONE; +} + +static struct notifier_block qlcnic_netdev_cb = { + .notifier_call = qlcnic_netdev_event, +}; + +static struct notifier_block qlcnic_inetaddr_cb = { + .notifier_call = qlcnic_inetaddr_event, +}; +#else +void qlcnic_restore_indev_addr(struct net_device *dev, unsigned long event) +{ } +#endif +static const struct pci_error_handlers qlcnic_err_handler = { + .error_detected = qlcnic_io_error_detected, + .slot_reset = qlcnic_io_slot_reset, + .resume = qlcnic_io_resume, +}; + +static struct pci_driver qlcnic_driver = { + .name = qlcnic_driver_name, + .id_table = qlcnic_pci_tbl, + .probe = qlcnic_probe, + .remove = qlcnic_remove, +#ifdef CONFIG_PM + .suspend = qlcnic_suspend, + .resume = qlcnic_resume, +#endif + .shutdown = qlcnic_shutdown, + .err_handler = &qlcnic_err_handler, +#ifdef CONFIG_QLCNIC_SRIOV + .sriov_configure = qlcnic_pci_sriov_configure, +#endif + +}; + +static int __init qlcnic_init_module(void) +{ + int ret; + + printk(KERN_INFO "%s\n", qlcnic_driver_string); + +#ifdef CONFIG_INET + register_netdevice_notifier(&qlcnic_netdev_cb); + register_inetaddr_notifier(&qlcnic_inetaddr_cb); +#endif + + ret = pci_register_driver(&qlcnic_driver); + if (ret) { +#ifdef CONFIG_INET + unregister_inetaddr_notifier(&qlcnic_inetaddr_cb); + unregister_netdevice_notifier(&qlcnic_netdev_cb); +#endif + } + + return ret; +} + +module_init(qlcnic_init_module); + +static void __exit qlcnic_exit_module(void) +{ + pci_unregister_driver(&qlcnic_driver); + +#ifdef CONFIG_INET + unregister_inetaddr_notifier(&qlcnic_inetaddr_cb); + unregister_netdevice_notifier(&qlcnic_netdev_cb); +#endif +} + +module_exit(qlcnic_exit_module); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c new file mode 100644 index 0000000..afa10a1 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c @@ -0,0 +1,1451 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. 
+ */
+
+#include <net/ip.h>
+
+#include "qlcnic.h"
+#include "qlcnic_hdr.h"
+#include "qlcnic_83xx_hw.h"
+#include "qlcnic_hw.h"
+
+#define QLC_83XX_MINIDUMP_FLASH		0x520000
+#define QLC_83XX_OCM_INDEX		3
+#define QLC_83XX_PCI_INDEX		0
+#define QLC_83XX_DMA_ENGINE_INDEX	8
+
+static const u32 qlcnic_ms_read_data[] = {
+	0x410000A8, 0x410000AC, 0x410000B8, 0x410000BC
+};
+
+#define QLCNIC_DUMP_WCRB	BIT_0
+#define QLCNIC_DUMP_RWCRB	BIT_1
+#define QLCNIC_DUMP_ANDCRB	BIT_2
+#define QLCNIC_DUMP_ORCRB	BIT_3
+#define QLCNIC_DUMP_POLLCRB	BIT_4
+#define QLCNIC_DUMP_RD_SAVE	BIT_5
+#define QLCNIC_DUMP_WRT_SAVED	BIT_6
+#define QLCNIC_DUMP_MOD_SAVE_ST	BIT_7
+#define QLCNIC_DUMP_SKIP	BIT_7
+
+#define QLCNIC_DUMP_MASK_MAX	0xff
+
+struct qlcnic_pex_dma_descriptor {
+	u32	read_data_size;
+	u32	dma_desc_cmd;
+	u32	src_addr_low;
+	u32	src_addr_high;
+	u32	dma_bus_addr_low;
+	u32	dma_bus_addr_high;
+	u32	rsvd[6];
+} __packed;
+
+struct qlcnic_common_entry_hdr {
+	u32	type;
+	u32	offset;
+	u32	cap_size;
+#if defined(__LITTLE_ENDIAN)
+	u8	mask;
+	u8	rsvd[2];
+	u8	flags;
+#else
+	u8	flags;
+	u8	rsvd[2];
+	u8	mask;
+#endif
+} __packed;
+
+struct __crb {
+	u32	addr;
+#if defined(__LITTLE_ENDIAN)
+	u8	stride;
+	u8	rsvd1[3];
+#else
+	u8	rsvd1[3];
+	u8	stride;
+#endif
+	u32	data_size;
+	u32	no_ops;
+	u32	rsvd2[4];
+} __packed;
+
+struct __ctrl {
+	u32	addr;
+#if defined(__LITTLE_ENDIAN)
+	u8	stride;
+	u8	index_a;
+	u16	timeout;
+#else
+	u16	timeout;
+	u8	index_a;
+	u8	stride;
+#endif
+	u32	data_size;
+	u32	no_ops;
+#if defined(__LITTLE_ENDIAN)
+	u8	opcode;
+	u8	index_v;
+	u8	shl_val;
+	u8	shr_val;
+#else
+	u8	shr_val;
+	u8	shl_val;
+	u8	index_v;
+	u8	opcode;
+#endif
+	u32	val1;
+	u32	val2;
+	u32	val3;
+} __packed;
+
+struct __cache {
+	u32	addr;
+#if defined(__LITTLE_ENDIAN)
+	u16	stride;
+	u16	init_tag_val;
+#else
+	u16	init_tag_val;
+	u16	stride;
+#endif
+	u32	size;
+	u32	no_ops;
+	u32	ctrl_addr;
+	u32	ctrl_val;
+	u32	read_addr;
+#if defined(__LITTLE_ENDIAN)
+	u8	read_addr_stride;
+	u8	read_addr_num;
+	u8	rsvd1[2];
+#else
+	u8	rsvd1[2];
+	u8	read_addr_num;
+	u8	read_addr_stride;
+#endif
+} __packed;
+
+struct __ocm {
+	u8	rsvd[8];
+	u32	size;
+	u32	no_ops;
+	u8	rsvd1[8];
+	u32	read_addr;
+	u32	read_addr_stride;
+} __packed;
+
+struct __mem {
+	u32	desc_card_addr;
+	u32	dma_desc_cmd;
+	u32	start_dma_cmd;
+	u32	rsvd[3];
+	u32	addr;
+	u32	size;
+} __packed;
+
+struct __mux {
+	u32	addr;
+	u8	rsvd[4];
+	u32	size;
+	u32	no_ops;
+	u32	val;
+	u32	val_stride;
+	u32	read_addr;
+	u8	rsvd2[4];
+} __packed;
+
+struct __queue {
+	u32	sel_addr;
+#if defined(__LITTLE_ENDIAN)
+	u16	stride;
+	u8	rsvd[2];
+#else
+	u8	rsvd[2];
+	u16	stride;
+#endif
+	u32	size;
+	u32	no_ops;
+	u8	rsvd2[8];
+	u32	read_addr;
+#if defined(__LITTLE_ENDIAN)
+	u8	read_addr_stride;
+	u8	read_addr_cnt;
+	u8	rsvd3[2];
+#else
+	u8	rsvd3[2];
+	u8	read_addr_cnt;
+	u8	read_addr_stride;
+#endif
+} __packed;
+
+struct __pollrd {
+	u32	sel_addr;
+	u32	read_addr;
+	u32	sel_val;
+#if defined(__LITTLE_ENDIAN)
+	u16	sel_val_stride;
+	u16	no_ops;
+#else
+	u16	no_ops;
+	u16	sel_val_stride;
+#endif
+	u32	poll_wait;
+	u32	poll_mask;
+	u32	data_size;
+	u8	rsvd[4];
+} __packed;
+
+struct __mux2 {
+	u32	sel_addr1;
+	u32	sel_addr2;
+	u32	sel_val1;
+	u32	sel_val2;
+	u32	no_ops;
+	u32	sel_val_mask;
+	u32	read_addr;
+#if defined(__LITTLE_ENDIAN)
+	u8	sel_val_stride;
+	u8	data_size;
+	u8	rsvd[2];
+#else
+	u8	rsvd[2];
+	u8	data_size;
+	u8	sel_val_stride;
+#endif
+} __packed;
+
+struct __pollrdmwr {
+	u32	addr1;
+	u32	addr2;
+	u32	val1;
+	u32	val2;
+	u32	poll_wait;
+	u32	poll_mask;
+	u32	mod_mask;
u32 data_size; +} __packed; + +struct qlcnic_dump_entry { + struct qlcnic_common_entry_hdr hdr; + union { + struct __crb crb; + struct __cache cache; + struct __ocm ocm; + struct __mem mem; + struct __mux mux; + struct __queue que; + struct __ctrl ctrl; + struct __pollrdmwr pollrdmwr; + struct __mux2 mux2; + struct __pollrd pollrd; + } region; +} __packed; + +enum qlcnic_minidump_opcode { + QLCNIC_DUMP_NOP = 0, + QLCNIC_DUMP_READ_CRB = 1, + QLCNIC_DUMP_READ_MUX = 2, + QLCNIC_DUMP_QUEUE = 3, + QLCNIC_DUMP_BRD_CONFIG = 4, + QLCNIC_DUMP_READ_OCM = 6, + QLCNIC_DUMP_PEG_REG = 7, + QLCNIC_DUMP_L1_DTAG = 8, + QLCNIC_DUMP_L1_ITAG = 9, + QLCNIC_DUMP_L1_DATA = 11, + QLCNIC_DUMP_L1_INST = 12, + QLCNIC_DUMP_L2_DTAG = 21, + QLCNIC_DUMP_L2_ITAG = 22, + QLCNIC_DUMP_L2_DATA = 23, + QLCNIC_DUMP_L2_INST = 24, + QLCNIC_DUMP_POLL_RD = 35, + QLCNIC_READ_MUX2 = 36, + QLCNIC_READ_POLLRDMWR = 37, + QLCNIC_DUMP_READ_ROM = 71, + QLCNIC_DUMP_READ_MEM = 72, + QLCNIC_DUMP_READ_CTRL = 98, + QLCNIC_DUMP_TLHDR = 99, + QLCNIC_DUMP_RDEND = 255 +}; + +inline u32 qlcnic_82xx_get_saved_state(void *t_hdr, u32 index) +{ + struct qlcnic_82xx_dump_template_hdr *hdr = t_hdr; + + return hdr->saved_state[index]; +} + +inline void qlcnic_82xx_set_saved_state(void *t_hdr, u32 index, + u32 value) +{ + struct qlcnic_82xx_dump_template_hdr *hdr = t_hdr; + + hdr->saved_state[index] = value; +} + +void qlcnic_82xx_cache_tmpl_hdr_values(struct qlcnic_fw_dump *fw_dump) +{ + struct qlcnic_82xx_dump_template_hdr *hdr; + + hdr = fw_dump->tmpl_hdr; + fw_dump->tmpl_hdr_size = hdr->size; + fw_dump->version = hdr->version; + fw_dump->num_entries = hdr->num_entries; + fw_dump->offset = hdr->offset; + + hdr->drv_cap_mask = hdr->cap_mask; + fw_dump->cap_mask = hdr->cap_mask; + + fw_dump->use_pex_dma = (hdr->capabilities & BIT_0) ? 
true : false; +} + +inline u32 qlcnic_82xx_get_cap_size(void *t_hdr, int index) +{ + struct qlcnic_82xx_dump_template_hdr *hdr = t_hdr; + + return hdr->cap_sizes[index]; +} + +void qlcnic_82xx_set_sys_info(void *t_hdr, int idx, u32 value) +{ + struct qlcnic_82xx_dump_template_hdr *hdr = t_hdr; + + hdr->sys_info[idx] = value; +} + +void qlcnic_82xx_store_cap_mask(void *tmpl_hdr, u32 mask) +{ + struct qlcnic_82xx_dump_template_hdr *hdr = tmpl_hdr; + + hdr->drv_cap_mask = mask; +} + +inline u32 qlcnic_83xx_get_saved_state(void *t_hdr, u32 index) +{ + struct qlcnic_83xx_dump_template_hdr *hdr = t_hdr; + + return hdr->saved_state[index]; +} + +inline void qlcnic_83xx_set_saved_state(void *t_hdr, u32 index, + u32 value) +{ + struct qlcnic_83xx_dump_template_hdr *hdr = t_hdr; + + hdr->saved_state[index] = value; +} + +#define QLCNIC_TEMPLATE_VERSION (0x20001) + +void qlcnic_83xx_cache_tmpl_hdr_values(struct qlcnic_fw_dump *fw_dump) +{ + struct qlcnic_83xx_dump_template_hdr *hdr; + + hdr = fw_dump->tmpl_hdr; + fw_dump->tmpl_hdr_size = hdr->size; + fw_dump->version = hdr->version; + fw_dump->num_entries = hdr->num_entries; + fw_dump->offset = hdr->offset; + + hdr->drv_cap_mask = hdr->cap_mask; + fw_dump->cap_mask = hdr->cap_mask; + + fw_dump->use_pex_dma = (fw_dump->version & 0xfffff) >= + QLCNIC_TEMPLATE_VERSION; +} + +inline u32 qlcnic_83xx_get_cap_size(void *t_hdr, int index) +{ + struct qlcnic_83xx_dump_template_hdr *hdr = t_hdr; + + return hdr->cap_sizes[index]; +} + +void qlcnic_83xx_set_sys_info(void *t_hdr, int idx, u32 value) +{ + struct qlcnic_83xx_dump_template_hdr *hdr = t_hdr; + + hdr->sys_info[idx] = value; +} + +void qlcnic_83xx_store_cap_mask(void *tmpl_hdr, u32 mask) +{ + struct qlcnic_83xx_dump_template_hdr *hdr; + + hdr = tmpl_hdr; + hdr->drv_cap_mask = mask; +} + +struct qlcnic_dump_operations { + enum qlcnic_minidump_opcode opcode; + u32 (*handler)(struct qlcnic_adapter *, struct qlcnic_dump_entry *, + __le32 *); +}; + +static u32 qlcnic_dump_crb(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + int i; + u32 addr, data; + struct __crb *crb = &entry->region.crb; + + addr = crb->addr; + + for (i = 0; i < crb->no_ops; i++) { + data = qlcnic_ind_rd(adapter, addr); + *buffer++ = cpu_to_le32(addr); + *buffer++ = cpu_to_le32(data); + addr += crb->stride; + } + return crb->no_ops * 2 * sizeof(u32); +} + +static u32 qlcnic_dump_ctrl(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + void *hdr = adapter->ahw->fw_dump.tmpl_hdr; + struct __ctrl *ctr = &entry->region.ctrl; + int i, k, timeout = 0; + u32 addr, data, temp; + u8 no_ops; + + addr = ctr->addr; + no_ops = ctr->no_ops; + + for (i = 0; i < no_ops; i++) { + k = 0; + for (k = 0; k < 8; k++) { + if (!(ctr->opcode & (1 << k))) + continue; + switch (1 << k) { + case QLCNIC_DUMP_WCRB: + qlcnic_ind_wr(adapter, addr, ctr->val1); + break; + case QLCNIC_DUMP_RWCRB: + data = qlcnic_ind_rd(adapter, addr); + qlcnic_ind_wr(adapter, addr, data); + break; + case QLCNIC_DUMP_ANDCRB: + data = qlcnic_ind_rd(adapter, addr); + qlcnic_ind_wr(adapter, addr, + (data & ctr->val2)); + break; + case QLCNIC_DUMP_ORCRB: + data = qlcnic_ind_rd(adapter, addr); + qlcnic_ind_wr(adapter, addr, + (data | ctr->val3)); + break; + case QLCNIC_DUMP_POLLCRB: + while (timeout <= ctr->timeout) { + data = qlcnic_ind_rd(adapter, addr); + if ((data & ctr->val2) == ctr->val1) + break; + usleep_range(1000, 2000); + timeout++; + } + if (timeout > ctr->timeout) { + dev_info(&adapter->pdev->dev, + "Timed 
out, aborting poll CRB\n"); + return -EINVAL; + } + break; + case QLCNIC_DUMP_RD_SAVE: + temp = ctr->index_a; + if (temp) + addr = qlcnic_get_saved_state(adapter, + hdr, + temp); + data = qlcnic_ind_rd(adapter, addr); + qlcnic_set_saved_state(adapter, hdr, + ctr->index_v, data); + break; + case QLCNIC_DUMP_WRT_SAVED: + temp = ctr->index_v; + if (temp) + data = qlcnic_get_saved_state(adapter, + hdr, + temp); + else + data = ctr->val1; + + temp = ctr->index_a; + if (temp) + addr = qlcnic_get_saved_state(adapter, + hdr, + temp); + qlcnic_ind_wr(adapter, addr, data); + break; + case QLCNIC_DUMP_MOD_SAVE_ST: + data = qlcnic_get_saved_state(adapter, hdr, + ctr->index_v); + data <<= ctr->shl_val; + data >>= ctr->shr_val; + if (ctr->val2) + data &= ctr->val2; + data |= ctr->val3; + data += ctr->val1; + qlcnic_set_saved_state(adapter, hdr, + ctr->index_v, data); + break; + default: + dev_info(&adapter->pdev->dev, + "Unknown opcode\n"); + break; + } + } + addr += ctr->stride; + } + return 0; +} + +static u32 qlcnic_dump_mux(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + int loop; + u32 val, data = 0; + struct __mux *mux = &entry->region.mux; + + val = mux->val; + for (loop = 0; loop < mux->no_ops; loop++) { + qlcnic_ind_wr(adapter, mux->addr, val); + data = qlcnic_ind_rd(adapter, mux->read_addr); + *buffer++ = cpu_to_le32(val); + *buffer++ = cpu_to_le32(data); + val += mux->val_stride; + } + return 2 * mux->no_ops * sizeof(u32); +} + +static u32 qlcnic_dump_que(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + int i, loop; + u32 cnt, addr, data, que_id = 0; + struct __queue *que = &entry->region.que; + + addr = que->read_addr; + cnt = que->read_addr_cnt; + + for (loop = 0; loop < que->no_ops; loop++) { + qlcnic_ind_wr(adapter, que->sel_addr, que_id); + addr = que->read_addr; + for (i = 0; i < cnt; i++) { + data = qlcnic_ind_rd(adapter, addr); + *buffer++ = cpu_to_le32(data); + addr += que->read_addr_stride; + } + que_id += que->stride; + } + return que->no_ops * cnt * sizeof(u32); +} + +static u32 qlcnic_dump_ocm(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + int i; + u32 data; + void __iomem *addr; + struct __ocm *ocm = &entry->region.ocm; + + addr = adapter->ahw->pci_base0 + ocm->read_addr; + for (i = 0; i < ocm->no_ops; i++) { + data = readl(addr); + *buffer++ = cpu_to_le32(data); + addr += ocm->read_addr_stride; + } + return ocm->no_ops * sizeof(u32); +} + +static u32 qlcnic_read_rom(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + int i, count = 0; + u32 fl_addr, size, val, lck_val, addr; + struct __mem *rom = &entry->region.mem; + + fl_addr = rom->addr; + size = rom->size / 4; +lock_try: + lck_val = QLC_SHARED_REG_RD32(adapter, QLCNIC_FLASH_LOCK); + if (!lck_val && count < MAX_CTL_CHECK) { + usleep_range(10000, 11000); + count++; + goto lock_try; + } + QLC_SHARED_REG_WR32(adapter, QLCNIC_FLASH_LOCK_OWNER, + adapter->ahw->pci_func); + for (i = 0; i < size; i++) { + addr = fl_addr & 0xFFFF0000; + qlcnic_ind_wr(adapter, FLASH_ROM_WINDOW, addr); + addr = LSW(fl_addr) + FLASH_ROM_DATA; + val = qlcnic_ind_rd(adapter, addr); + fl_addr += 4; + *buffer++ = cpu_to_le32(val); + } + QLC_SHARED_REG_RD32(adapter, QLCNIC_FLASH_UNLOCK); + return rom->size; +} + +static u32 qlcnic_dump_l1_cache(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + int i; + u32 cnt, val, data, addr; + struct __cache *l1 = 
&entry->region.cache; + + val = l1->init_tag_val; + + for (i = 0; i < l1->no_ops; i++) { + qlcnic_ind_wr(adapter, l1->addr, val); + qlcnic_ind_wr(adapter, l1->ctrl_addr, LSW(l1->ctrl_val)); + addr = l1->read_addr; + cnt = l1->read_addr_num; + while (cnt) { + data = qlcnic_ind_rd(adapter, addr); + *buffer++ = cpu_to_le32(data); + addr += l1->read_addr_stride; + cnt--; + } + val += l1->stride; + } + return l1->no_ops * l1->read_addr_num * sizeof(u32); +} + +static u32 qlcnic_dump_l2_cache(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + int i; + u32 cnt, val, data, addr; + u8 poll_mask, poll_to, time_out = 0; + struct __cache *l2 = &entry->region.cache; + + val = l2->init_tag_val; + poll_mask = LSB(MSW(l2->ctrl_val)); + poll_to = MSB(MSW(l2->ctrl_val)); + + for (i = 0; i < l2->no_ops; i++) { + qlcnic_ind_wr(adapter, l2->addr, val); + if (LSW(l2->ctrl_val)) + qlcnic_ind_wr(adapter, l2->ctrl_addr, + LSW(l2->ctrl_val)); + if (!poll_mask) + goto skip_poll; + do { + data = qlcnic_ind_rd(adapter, l2->ctrl_addr); + if (!(data & poll_mask)) + break; + usleep_range(1000, 2000); + time_out++; + } while (time_out <= poll_to); + + if (time_out > poll_to) { + dev_err(&adapter->pdev->dev, + "Timeout exceeded in %s, aborting dump\n", + __func__); + return -EINVAL; + } +skip_poll: + addr = l2->read_addr; + cnt = l2->read_addr_num; + while (cnt) { + data = qlcnic_ind_rd(adapter, addr); + *buffer++ = cpu_to_le32(data); + addr += l2->read_addr_stride; + cnt--; + } + val += l2->stride; + } + return l2->no_ops * l2->read_addr_num * sizeof(u32); +} + +static u32 qlcnic_read_memory_test_agent(struct qlcnic_adapter *adapter, + struct __mem *mem, __le32 *buffer, + int *ret) +{ + u32 addr, data, test; + int i, reg_read; + + reg_read = mem->size; + addr = mem->addr; + /* check for data size of multiple of 16 and 16 byte alignment */ + if ((addr & 0xf) || (reg_read%16)) { + dev_info(&adapter->pdev->dev, + "Unaligned memory addr:0x%x size:0x%x\n", + addr, reg_read); + *ret = -EINVAL; + return 0; + } + + mutex_lock(&adapter->ahw->mem_lock); + + while (reg_read != 0) { + qlcnic_ind_wr(adapter, QLCNIC_MS_ADDR_LO, addr); + qlcnic_ind_wr(adapter, QLCNIC_MS_ADDR_HI, 0); + qlcnic_ind_wr(adapter, QLCNIC_MS_CTRL, QLCNIC_TA_START_ENABLE); + + for (i = 0; i < MAX_CTL_CHECK; i++) { + test = qlcnic_ind_rd(adapter, QLCNIC_MS_CTRL); + if (!(test & TA_CTL_BUSY)) + break; + } + if (i == MAX_CTL_CHECK) { + if (printk_ratelimit()) { + dev_err(&adapter->pdev->dev, + "failed to read through agent\n"); + *ret = -EIO; + goto out; + } + } + for (i = 0; i < 4; i++) { + data = qlcnic_ind_rd(adapter, qlcnic_ms_read_data[i]); + *buffer++ = cpu_to_le32(data); + } + addr += 16; + reg_read -= 16; + ret += 16; + } +out: + mutex_unlock(&adapter->ahw->mem_lock); + return mem->size; +} + +/* DMA register base address */ +#define QLC_DMA_REG_BASE_ADDR(dma_no) (0x77320000 + (dma_no * 0x10000)) + +/* DMA register offsets w.r.t base address */ +#define QLC_DMA_CMD_BUFF_ADDR_LOW 0 +#define QLC_DMA_CMD_BUFF_ADDR_HI 4 +#define QLC_DMA_CMD_STATUS_CTRL 8 + +static int qlcnic_start_pex_dma(struct qlcnic_adapter *adapter, + struct __mem *mem) +{ + struct device *dev = &adapter->pdev->dev; + u32 dma_no, dma_base_addr, temp_addr; + int i, ret, dma_sts; + void *tmpl_hdr; + + tmpl_hdr = adapter->ahw->fw_dump.tmpl_hdr; + dma_no = qlcnic_get_saved_state(adapter, tmpl_hdr, + QLC_83XX_DMA_ENGINE_INDEX); + dma_base_addr = QLC_DMA_REG_BASE_ADDR(dma_no); + + temp_addr = dma_base_addr + QLC_DMA_CMD_BUFF_ADDR_LOW; + ret = 
qlcnic_ind_wr(adapter, temp_addr, mem->desc_card_addr); + if (ret) + return ret; + + temp_addr = dma_base_addr + QLC_DMA_CMD_BUFF_ADDR_HI; + ret = qlcnic_ind_wr(adapter, temp_addr, 0); + if (ret) + return ret; + + temp_addr = dma_base_addr + QLC_DMA_CMD_STATUS_CTRL; + ret = qlcnic_ind_wr(adapter, temp_addr, mem->start_dma_cmd); + if (ret) + return ret; + + /* Wait for DMA to complete */ + temp_addr = dma_base_addr + QLC_DMA_CMD_STATUS_CTRL; + for (i = 0; i < 400; i++) { + dma_sts = qlcnic_ind_rd(adapter, temp_addr); + + if (dma_sts & BIT_1) + usleep_range(250, 500); + else + break; + } + + if (i >= 400) { + dev_info(dev, "PEX DMA operation timed out"); + ret = -EIO; + } + + return ret; +} + +static u32 qlcnic_read_memory_pexdma(struct qlcnic_adapter *adapter, + struct __mem *mem, + __le32 *buffer, int *ret) +{ + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + u32 temp, dma_base_addr, size = 0, read_size = 0; + struct qlcnic_pex_dma_descriptor *dma_descr; + struct device *dev = &adapter->pdev->dev; + dma_addr_t dma_phys_addr; + void *dma_buffer; + void *tmpl_hdr; + + tmpl_hdr = fw_dump->tmpl_hdr; + + /* Check if DMA engine is available */ + temp = qlcnic_get_saved_state(adapter, tmpl_hdr, + QLC_83XX_DMA_ENGINE_INDEX); + dma_base_addr = QLC_DMA_REG_BASE_ADDR(temp); + temp = qlcnic_ind_rd(adapter, + dma_base_addr + QLC_DMA_CMD_STATUS_CTRL); + + if (!(temp & BIT_31)) { + dev_info(dev, "%s: DMA engine is not available\n", __func__); + *ret = -EIO; + return 0; + } + + /* Create DMA descriptor */ + dma_descr = kzalloc(sizeof(struct qlcnic_pex_dma_descriptor), + GFP_KERNEL); + if (!dma_descr) { + *ret = -ENOMEM; + return 0; + } + + /* dma_desc_cmd 0:15 = 0 + * dma_desc_cmd 16:19 = mem->dma_desc_cmd 0:3 + * dma_desc_cmd 20:23 = pci function number + * dma_desc_cmd 24:31 = mem->dma_desc_cmd 8:15 + */ + dma_phys_addr = fw_dump->phys_addr; + dma_buffer = fw_dump->dma_buffer; + temp = 0; + temp = mem->dma_desc_cmd & 0xff0f; + temp |= (adapter->ahw->pci_func & 0xf) << 4; + dma_descr->dma_desc_cmd = (temp << 16) & 0xffff0000; + dma_descr->dma_bus_addr_low = LSD(dma_phys_addr); + dma_descr->dma_bus_addr_high = MSD(dma_phys_addr); + dma_descr->src_addr_high = 0; + + /* Collect memory dump using multiple DMA operations if required */ + while (read_size < mem->size) { + if (mem->size - read_size >= QLC_PEX_DMA_READ_SIZE) + size = QLC_PEX_DMA_READ_SIZE; + else + size = mem->size - read_size; + + dma_descr->src_addr_low = mem->addr + read_size; + dma_descr->read_data_size = size; + + /* Write DMA descriptor to MS memory*/ + temp = sizeof(struct qlcnic_pex_dma_descriptor) / 16; + *ret = qlcnic_ms_mem_write128(adapter, mem->desc_card_addr, + (u32 *)dma_descr, temp); + if (*ret) { + dev_info(dev, "Failed to write DMA descriptor to MS memory at address 0x%x\n", + mem->desc_card_addr); + goto free_dma_descr; + } + + *ret = qlcnic_start_pex_dma(adapter, mem); + if (*ret) { + dev_info(dev, "Failed to start PEX DMA operation\n"); + goto free_dma_descr; + } + + memcpy(buffer, dma_buffer, size); + buffer += size / 4; + read_size += size; + } + +free_dma_descr: + kfree(dma_descr); + + return read_size; +} + +static u32 qlcnic_read_memory(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + struct device *dev = &adapter->pdev->dev; + struct __mem *mem = &entry->region.mem; + u32 data_size; + int ret = 0; + + if (fw_dump->use_pex_dma) { + data_size = qlcnic_read_memory_pexdma(adapter, mem, buffer, + &ret); + if (ret) + 
dev_info(dev, + "Failed to read memory dump using PEX DMA: mask[0x%x]\n", + entry->hdr.mask); + else + return data_size; + } + + data_size = qlcnic_read_memory_test_agent(adapter, mem, buffer, &ret); + if (ret) { + dev_info(dev, + "Failed to read memory dump using test agent method: mask[0x%x]\n", + entry->hdr.mask); + return 0; + } else { + return data_size; + } +} + +static u32 qlcnic_dump_nop(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + entry->hdr.flags |= QLCNIC_DUMP_SKIP; + return 0; +} + +static int qlcnic_valid_dump_entry(struct device *dev, + struct qlcnic_dump_entry *entry, u32 size) +{ + int ret = 1; + if (size != entry->hdr.cap_size) { + dev_err(dev, + "Invalid entry, Type:%d\tMask:%d\tSize:%dCap_size:%d\n", + entry->hdr.type, entry->hdr.mask, size, + entry->hdr.cap_size); + ret = 0; + } + return ret; +} + +static u32 qlcnic_read_pollrdmwr(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, + __le32 *buffer) +{ + struct __pollrdmwr *poll = &entry->region.pollrdmwr; + u32 data, wait_count, poll_wait, temp; + + poll_wait = poll->poll_wait; + + qlcnic_ind_wr(adapter, poll->addr1, poll->val1); + wait_count = 0; + + while (wait_count < poll_wait) { + data = qlcnic_ind_rd(adapter, poll->addr1); + if ((data & poll->poll_mask) != 0) + break; + wait_count++; + } + + if (wait_count == poll_wait) { + dev_err(&adapter->pdev->dev, + "Timeout exceeded in %s, aborting dump\n", + __func__); + return 0; + } + + data = qlcnic_ind_rd(adapter, poll->addr2) & poll->mod_mask; + qlcnic_ind_wr(adapter, poll->addr2, data); + qlcnic_ind_wr(adapter, poll->addr1, poll->val2); + wait_count = 0; + + while (wait_count < poll_wait) { + temp = qlcnic_ind_rd(adapter, poll->addr1); + if ((temp & poll->poll_mask) != 0) + break; + wait_count++; + } + + *buffer++ = cpu_to_le32(poll->addr2); + *buffer++ = cpu_to_le32(data); + + return 2 * sizeof(u32); + +} + +static u32 qlcnic_read_pollrd(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + struct __pollrd *pollrd = &entry->region.pollrd; + u32 data, wait_count, poll_wait, sel_val; + int i; + + poll_wait = pollrd->poll_wait; + sel_val = pollrd->sel_val; + + for (i = 0; i < pollrd->no_ops; i++) { + qlcnic_ind_wr(adapter, pollrd->sel_addr, sel_val); + wait_count = 0; + while (wait_count < poll_wait) { + data = qlcnic_ind_rd(adapter, pollrd->sel_addr); + if ((data & pollrd->poll_mask) != 0) + break; + wait_count++; + } + + if (wait_count == poll_wait) { + dev_err(&adapter->pdev->dev, + "Timeout exceeded in %s, aborting dump\n", + __func__); + return 0; + } + + data = qlcnic_ind_rd(adapter, pollrd->read_addr); + *buffer++ = cpu_to_le32(sel_val); + *buffer++ = cpu_to_le32(data); + sel_val += pollrd->sel_val_stride; + } + return pollrd->no_ops * (2 * sizeof(u32)); +} + +static u32 qlcnic_read_mux2(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + struct __mux2 *mux2 = &entry->region.mux2; + u32 data; + u32 t_sel_val, sel_val1, sel_val2; + int i; + + sel_val1 = mux2->sel_val1; + sel_val2 = mux2->sel_val2; + + for (i = 0; i < mux2->no_ops; i++) { + qlcnic_ind_wr(adapter, mux2->sel_addr1, sel_val1); + t_sel_val = sel_val1 & mux2->sel_val_mask; + qlcnic_ind_wr(adapter, mux2->sel_addr2, t_sel_val); + data = qlcnic_ind_rd(adapter, mux2->read_addr); + *buffer++ = cpu_to_le32(t_sel_val); + *buffer++ = cpu_to_le32(data); + qlcnic_ind_wr(adapter, mux2->sel_addr1, sel_val2); + t_sel_val = sel_val2 & mux2->sel_val_mask; + qlcnic_ind_wr(adapter, 
mux2->sel_addr2, t_sel_val); + data = qlcnic_ind_rd(adapter, mux2->read_addr); + *buffer++ = cpu_to_le32(t_sel_val); + *buffer++ = cpu_to_le32(data); + sel_val1 += mux2->sel_val_stride; + sel_val2 += mux2->sel_val_stride; + } + + return mux2->no_ops * (4 * sizeof(u32)); +} + +static u32 qlcnic_83xx_dump_rom(struct qlcnic_adapter *adapter, + struct qlcnic_dump_entry *entry, __le32 *buffer) +{ + u32 fl_addr, size; + struct __mem *rom = &entry->region.mem; + + fl_addr = rom->addr; + size = rom->size / 4; + + if (!qlcnic_83xx_lockless_flash_read32(adapter, fl_addr, + (u8 *)buffer, size)) + return rom->size; + + return 0; +} + +static const struct qlcnic_dump_operations qlcnic_fw_dump_ops[] = { + {QLCNIC_DUMP_NOP, qlcnic_dump_nop}, + {QLCNIC_DUMP_READ_CRB, qlcnic_dump_crb}, + {QLCNIC_DUMP_READ_MUX, qlcnic_dump_mux}, + {QLCNIC_DUMP_QUEUE, qlcnic_dump_que}, + {QLCNIC_DUMP_BRD_CONFIG, qlcnic_read_rom}, + {QLCNIC_DUMP_READ_OCM, qlcnic_dump_ocm}, + {QLCNIC_DUMP_PEG_REG, qlcnic_dump_ctrl}, + {QLCNIC_DUMP_L1_DTAG, qlcnic_dump_l1_cache}, + {QLCNIC_DUMP_L1_ITAG, qlcnic_dump_l1_cache}, + {QLCNIC_DUMP_L1_DATA, qlcnic_dump_l1_cache}, + {QLCNIC_DUMP_L1_INST, qlcnic_dump_l1_cache}, + {QLCNIC_DUMP_L2_DTAG, qlcnic_dump_l2_cache}, + {QLCNIC_DUMP_L2_ITAG, qlcnic_dump_l2_cache}, + {QLCNIC_DUMP_L2_DATA, qlcnic_dump_l2_cache}, + {QLCNIC_DUMP_L2_INST, qlcnic_dump_l2_cache}, + {QLCNIC_DUMP_READ_ROM, qlcnic_read_rom}, + {QLCNIC_DUMP_READ_MEM, qlcnic_read_memory}, + {QLCNIC_DUMP_READ_CTRL, qlcnic_dump_ctrl}, + {QLCNIC_DUMP_TLHDR, qlcnic_dump_nop}, + {QLCNIC_DUMP_RDEND, qlcnic_dump_nop}, +}; + +static const struct qlcnic_dump_operations qlcnic_83xx_fw_dump_ops[] = { + {QLCNIC_DUMP_NOP, qlcnic_dump_nop}, + {QLCNIC_DUMP_READ_CRB, qlcnic_dump_crb}, + {QLCNIC_DUMP_READ_MUX, qlcnic_dump_mux}, + {QLCNIC_DUMP_QUEUE, qlcnic_dump_que}, + {QLCNIC_DUMP_BRD_CONFIG, qlcnic_83xx_dump_rom}, + {QLCNIC_DUMP_READ_OCM, qlcnic_dump_ocm}, + {QLCNIC_DUMP_PEG_REG, qlcnic_dump_ctrl}, + {QLCNIC_DUMP_L1_DTAG, qlcnic_dump_l1_cache}, + {QLCNIC_DUMP_L1_ITAG, qlcnic_dump_l1_cache}, + {QLCNIC_DUMP_L1_DATA, qlcnic_dump_l1_cache}, + {QLCNIC_DUMP_L1_INST, qlcnic_dump_l1_cache}, + {QLCNIC_DUMP_L2_DTAG, qlcnic_dump_l2_cache}, + {QLCNIC_DUMP_L2_ITAG, qlcnic_dump_l2_cache}, + {QLCNIC_DUMP_L2_DATA, qlcnic_dump_l2_cache}, + {QLCNIC_DUMP_L2_INST, qlcnic_dump_l2_cache}, + {QLCNIC_DUMP_POLL_RD, qlcnic_read_pollrd}, + {QLCNIC_READ_MUX2, qlcnic_read_mux2}, + {QLCNIC_READ_POLLRDMWR, qlcnic_read_pollrdmwr}, + {QLCNIC_DUMP_READ_ROM, qlcnic_83xx_dump_rom}, + {QLCNIC_DUMP_READ_MEM, qlcnic_read_memory}, + {QLCNIC_DUMP_READ_CTRL, qlcnic_dump_ctrl}, + {QLCNIC_DUMP_TLHDR, qlcnic_dump_nop}, + {QLCNIC_DUMP_RDEND, qlcnic_dump_nop}, +}; + +static uint32_t qlcnic_temp_checksum(uint32_t *temp_buffer, u32 temp_size) +{ + uint64_t sum = 0; + int count = temp_size / sizeof(uint32_t); + while (count-- > 0) + sum += *temp_buffer++; + while (sum >> 32) + sum = (sum & 0xFFFFFFFF) + (sum >> 32); + return ~sum; +} + +static int qlcnic_fw_flash_get_minidump_temp(struct qlcnic_adapter *adapter, + u8 *buffer, u32 size) +{ + int ret = 0; + + if (qlcnic_82xx_check(adapter)) + return -EIO; + + if (qlcnic_83xx_lock_flash(adapter)) + return -EIO; + + ret = qlcnic_83xx_lockless_flash_read32(adapter, + QLC_83XX_MINIDUMP_FLASH, + buffer, size / sizeof(u32)); + + qlcnic_83xx_unlock_flash(adapter); + + return ret; +} + +static int +qlcnic_fw_flash_get_minidump_temp_size(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_83xx_dump_template_hdr tmp_hdr; + u32 size = 
sizeof(tmp_hdr) / sizeof(u32); + int ret = 0; + + if (qlcnic_82xx_check(adapter)) + return -EIO; + + if (qlcnic_83xx_lock_flash(adapter)) + return -EIO; + + ret = qlcnic_83xx_lockless_flash_read32(adapter, + QLC_83XX_MINIDUMP_FLASH, + (u8 *)&tmp_hdr, size); + + qlcnic_83xx_unlock_flash(adapter); + + cmd->rsp.arg[2] = tmp_hdr.size; + cmd->rsp.arg[3] = tmp_hdr.version; + + return ret; +} + +static int qlcnic_fw_get_minidump_temp_size(struct qlcnic_adapter *adapter, + u32 *version, u32 *temp_size, + u8 *use_flash_temp) +{ + int err = 0; + struct qlcnic_cmd_args cmd; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_TEMP_SIZE)) + return -ENOMEM; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err != QLCNIC_RCODE_SUCCESS) { + if (qlcnic_fw_flash_get_minidump_temp_size(adapter, &cmd)) { + qlcnic_free_mbx_args(&cmd); + return -EIO; + } + *use_flash_temp = 1; + } + + *temp_size = cmd.rsp.arg[2]; + *version = cmd.rsp.arg[3]; + qlcnic_free_mbx_args(&cmd); + + if (!(*temp_size)) + return -EIO; + + return 0; +} + +static int __qlcnic_fw_cmd_get_minidump_temp(struct qlcnic_adapter *adapter, + u32 *buffer, u32 temp_size) +{ + int err = 0, i; + void *tmp_addr; + __le32 *tmp_buf; + struct qlcnic_cmd_args cmd; + dma_addr_t tmp_addr_t = 0; + + tmp_addr = dma_alloc_coherent(&adapter->pdev->dev, temp_size, + &tmp_addr_t, GFP_KERNEL); + if (!tmp_addr) + return -ENOMEM; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_TEMP_HDR)) { + err = -ENOMEM; + goto free_mem; + } + + cmd.req.arg[1] = LSD(tmp_addr_t); + cmd.req.arg[2] = MSD(tmp_addr_t); + cmd.req.arg[3] = temp_size; + err = qlcnic_issue_cmd(adapter, &cmd); + + tmp_buf = tmp_addr; + if (err == QLCNIC_RCODE_SUCCESS) { + for (i = 0; i < temp_size / sizeof(u32); i++) + *buffer++ = __le32_to_cpu(*tmp_buf++); + } + + qlcnic_free_mbx_args(&cmd); + +free_mem: + dma_free_coherent(&adapter->pdev->dev, temp_size, tmp_addr, tmp_addr_t); + + return err; +} + +int qlcnic_fw_cmd_get_minidump_temp(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw; + struct qlcnic_fw_dump *fw_dump; + u32 version, csum, *tmp_buf; + u8 use_flash_temp = 0; + u32 temp_size = 0; + void *temp_buffer; + int err; + + ahw = adapter->ahw; + fw_dump = &ahw->fw_dump; + err = qlcnic_fw_get_minidump_temp_size(adapter, &version, &temp_size, + &use_flash_temp); + if (err) { + dev_err(&adapter->pdev->dev, + "Can't get template size %d\n", err); + return -EIO; + } + + fw_dump->tmpl_hdr = vzalloc(temp_size); + if (!fw_dump->tmpl_hdr) + return -ENOMEM; + + tmp_buf = (u32 *)fw_dump->tmpl_hdr; + if (use_flash_temp) + goto flash_temp; + + err = __qlcnic_fw_cmd_get_minidump_temp(adapter, tmp_buf, temp_size); + + if (err) { +flash_temp: + err = qlcnic_fw_flash_get_minidump_temp(adapter, (u8 *)tmp_buf, + temp_size); + + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to get minidump template header %d\n", + err); + vfree(fw_dump->tmpl_hdr); + fw_dump->tmpl_hdr = NULL; + return -EIO; + } + } + + csum = qlcnic_temp_checksum((uint32_t *)tmp_buf, temp_size); + + if (csum) { + dev_err(&adapter->pdev->dev, + "Template header checksum validation failed\n"); + vfree(fw_dump->tmpl_hdr); + fw_dump->tmpl_hdr = NULL; + return -EIO; + } + + qlcnic_cache_tmpl_hdr_values(adapter, fw_dump); + + if (fw_dump->use_pex_dma) { + fw_dump->dma_buffer = NULL; + temp_buffer = dma_alloc_coherent(&adapter->pdev->dev, + QLC_PEX_DMA_READ_SIZE, + &fw_dump->phys_addr, + GFP_KERNEL); + if (!temp_buffer) + fw_dump->use_pex_dma = false; + else + fw_dump->dma_buffer = temp_buffer; + } + + + 
dev_info(&adapter->pdev->dev, + "Default minidump capture mask 0x%x\n", + fw_dump->cap_mask); + + qlcnic_enable_fw_dump_state(adapter); + + return 0; +} + +int qlcnic_dump_fw(struct qlcnic_adapter *adapter) +{ + struct qlcnic_fw_dump *fw_dump = &adapter->ahw->fw_dump; + const struct qlcnic_dump_operations *fw_dump_ops; + struct qlcnic_83xx_dump_template_hdr *hdr_83xx; + u32 entry_offset, dump, no_entries, buf_offset = 0; + int i, k, ops_cnt, ops_index, dump_size = 0; + struct device *dev = &adapter->pdev->dev; + struct qlcnic_hardware_context *ahw; + struct qlcnic_dump_entry *entry; + void *tmpl_hdr; + u32 ocm_window; + __le32 *buffer; + char mesg[64]; + char *msg[] = {mesg, NULL}; + + ahw = adapter->ahw; + tmpl_hdr = fw_dump->tmpl_hdr; + + /* Return if we don't have firmware dump template header */ + if (!tmpl_hdr) + return -EIO; + + if (!qlcnic_check_fw_dump_state(adapter)) { + dev_info(&adapter->pdev->dev, "Dump not enabled\n"); + return -EIO; + } + + if (fw_dump->clr) { + dev_info(&adapter->pdev->dev, + "Previous dump not cleared, not capturing dump\n"); + return -EIO; + } + + netif_info(adapter->ahw, drv, adapter->netdev, "Take FW dump\n"); + /* Calculate the size for dump data area only */ + for (i = 2, k = 1; (i & QLCNIC_DUMP_MASK_MAX); i <<= 1, k++) + if (i & fw_dump->cap_mask) + dump_size += qlcnic_get_cap_size(adapter, tmpl_hdr, k); + + if (!dump_size) + return -EIO; + + fw_dump->data = vzalloc(dump_size); + if (!fw_dump->data) + return -ENOMEM; + + buffer = fw_dump->data; + fw_dump->size = dump_size; + no_entries = fw_dump->num_entries; + entry_offset = fw_dump->offset; + qlcnic_set_sys_info(adapter, tmpl_hdr, 0, QLCNIC_DRIVER_VERSION); + qlcnic_set_sys_info(adapter, tmpl_hdr, 1, adapter->fw_version); + + if (qlcnic_82xx_check(adapter)) { + ops_cnt = ARRAY_SIZE(qlcnic_fw_dump_ops); + fw_dump_ops = qlcnic_fw_dump_ops; + } else { + hdr_83xx = tmpl_hdr; + ops_cnt = ARRAY_SIZE(qlcnic_83xx_fw_dump_ops); + fw_dump_ops = qlcnic_83xx_fw_dump_ops; + ocm_window = hdr_83xx->ocm_wnd_reg[ahw->pci_func]; + hdr_83xx->saved_state[QLC_83XX_OCM_INDEX] = ocm_window; + hdr_83xx->saved_state[QLC_83XX_PCI_INDEX] = ahw->pci_func; + } + + for (i = 0; i < no_entries; i++) { + entry = tmpl_hdr + entry_offset; + if (!(entry->hdr.mask & fw_dump->cap_mask)) { + entry->hdr.flags |= QLCNIC_DUMP_SKIP; + entry_offset += entry->hdr.offset; + continue; + } + + /* Find the handler for this entry */ + ops_index = 0; + while (ops_index < ops_cnt) { + if (entry->hdr.type == fw_dump_ops[ops_index].opcode) + break; + ops_index++; + } + + if (ops_index == ops_cnt) { + dev_info(dev, "Skipping unknown entry opcode %d\n", + entry->hdr.type); + entry->hdr.flags |= QLCNIC_DUMP_SKIP; + entry_offset += entry->hdr.offset; + continue; + } + + /* Collect dump for this entry */ + dump = fw_dump_ops[ops_index].handler(adapter, entry, buffer); + if (!qlcnic_valid_dump_entry(dev, entry, dump)) { + entry->hdr.flags |= QLCNIC_DUMP_SKIP; + entry_offset += entry->hdr.offset; + continue; + } + + buf_offset += entry->hdr.cap_size; + entry_offset += entry->hdr.offset; + buffer = fw_dump->data + buf_offset; + } + + fw_dump->clr = 1; + snprintf(mesg, sizeof(mesg), "FW_DUMP=%s", adapter->netdev->name); + netdev_info(adapter->netdev, + "Dump data %d bytes captured, dump data address = %p, template header size %d bytes, template address = %p\n", + fw_dump->size, fw_dump->data, fw_dump->tmpl_hdr_size, + fw_dump->tmpl_hdr); + /* Send a udev event to notify availability of FW dump */ + kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, msg); + + return 0; 
+} + +static inline bool +qlcnic_83xx_md_check_extended_dump_capability(struct qlcnic_adapter *adapter) +{ + /* For special adapters (with 0x8830 device ID), where iSCSI firmware + * dump needs to be captured as part of regular firmware dump + * collection process, firmware exports it's capability through + * capability registers + */ + return ((adapter->pdev->device == PCI_DEVICE_ID_QLOGIC_QLE8830) && + (adapter->ahw->extra_capability[0] & + QLCNIC_FW_CAPABILITY_2_EXT_ISCSI_DUMP)); +} + +void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) +{ + u32 prev_version, current_version; + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_fw_dump *fw_dump = &ahw->fw_dump; + struct pci_dev *pdev = adapter->pdev; + bool extended = false; + int ret; + + prev_version = adapter->fw_version; + current_version = qlcnic_83xx_get_fw_version(adapter); + + if (fw_dump->tmpl_hdr == NULL || current_version > prev_version) { + vfree(fw_dump->tmpl_hdr); + + if (qlcnic_83xx_md_check_extended_dump_capability(adapter)) + extended = !qlcnic_83xx_extend_md_capab(adapter); + + ret = qlcnic_fw_cmd_get_minidump_temp(adapter); + if (ret) + return; + + dev_info(&pdev->dev, "Supports FW dump capability\n"); + + /* Once we have minidump template with extended iSCSI dump + * capability, update the minidump capture mask to 0x1f as + * per FW requirement + */ + if (extended) { + struct qlcnic_83xx_dump_template_hdr *hdr; + + hdr = fw_dump->tmpl_hdr; + hdr->drv_cap_mask = 0x1f; + fw_dump->cap_mask = 0x1f; + dev_info(&pdev->dev, + "Extended iSCSI dump capability and updated capture mask to 0x1f\n"); + } + } +} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h new file mode 100644 index 0000000..5f32765 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h @@ -0,0 +1,278 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. 
+ */ + +#ifndef _QLCNIC_83XX_SRIOV_H_ +#define _QLCNIC_83XX_SRIOV_H_ + +#include +#include + +#include "qlcnic.h" + +extern const u32 qlcnic_83xx_reg_tbl[]; +extern const u32 qlcnic_83xx_ext_reg_tbl[]; + +struct qlcnic_bc_payload { + u64 payload[126]; +}; + +struct qlcnic_bc_hdr { +#if defined(__LITTLE_ENDIAN) + u8 version; + u8 msg_type:4; + u8 rsvd1:3; + u8 op_type:1; + u8 num_cmds; + u8 num_frags; + u8 frag_num; + u8 cmd_op; + u16 seq_id; + u64 rsvd3; +#elif defined(__BIG_ENDIAN) + u8 num_frags; + u8 num_cmds; + u8 op_type:1; + u8 rsvd1:3; + u8 msg_type:4; + u8 version; + u16 seq_id; + u8 cmd_op; + u8 frag_num; + u64 rsvd3; +#endif +}; + +enum qlcnic_bc_commands { + QLCNIC_BC_CMD_CHANNEL_INIT = 0x0, + QLCNIC_BC_CMD_CHANNEL_TERM = 0x1, + QLCNIC_BC_CMD_GET_ACL = 0x2, + QLCNIC_BC_CMD_CFG_GUEST_VLAN = 0x3, +}; + +#define QLCNIC_83XX_SRIOV_VF_MAX_MAC 2 +#define QLC_BC_CMD 1 + +struct qlcnic_trans_list { + /* Lock for manipulating list */ + spinlock_t lock; + struct list_head wait_list; + int count; +}; + +enum qlcnic_trans_state { + QLC_INIT = 0, + QLC_WAIT_FOR_CHANNEL_FREE, + QLC_WAIT_FOR_RESP, + QLC_ABORT, + QLC_END, +}; + +struct qlcnic_bc_trans { + u8 func_id; + u8 active; + u8 curr_rsp_frag; + u8 curr_req_frag; + u16 cmd_id; + u16 req_pay_size; + u16 rsp_pay_size; + u32 trans_id; + enum qlcnic_trans_state trans_state; + struct list_head list; + struct qlcnic_bc_hdr *req_hdr; + struct qlcnic_bc_hdr *rsp_hdr; + struct qlcnic_bc_payload *req_pay; + struct qlcnic_bc_payload *rsp_pay; + struct completion resp_cmpl; + struct qlcnic_vf_info *vf; +}; + +enum qlcnic_vf_state { + QLC_BC_VF_SEND = 0, + QLC_BC_VF_RECV, + QLC_BC_VF_CHANNEL, + QLC_BC_VF_STATE, + QLC_BC_VF_FLR, + QLC_BC_VF_SOFT_FLR, +}; + +enum qlcnic_vlan_mode { + QLC_NO_VLAN_MODE = 0, + QLC_PVID_MODE, + QLC_GUEST_VLAN_MODE, +}; + +struct qlcnic_resources { + u16 num_tx_mac_filters; + u16 num_rx_ucast_mac_filters; + u16 num_rx_mcast_mac_filters; + + u16 num_txvlan_keys; + + u16 num_rx_queues; + u16 num_tx_queues; + + u16 num_rx_buf_rings; + u16 num_rx_status_rings; + + u16 num_destip; + u32 num_lro_flows_supported; + u16 max_local_ipv6_addrs; + u16 max_remote_ipv6_addrs; +}; + +struct qlcnic_vport { + u16 handle; + u16 max_tx_bw; + u16 min_tx_bw; + u16 pvid; + u8 vlan_mode; + u8 qos; + bool spoofchk; + u8 mac[6]; +}; + +struct qlcnic_vf_info { + u8 pci_func; + u16 rx_ctx_id; + u16 tx_ctx_id; + u16 *sriov_vlans; + int num_vlan; + unsigned long state; + struct completion ch_free_cmpl; + struct work_struct trans_work; + struct work_struct flr_work; + /* It synchronizes commands sent from VF */ + struct mutex send_cmd_lock; + struct qlcnic_bc_trans *send_cmd; + struct qlcnic_bc_trans *flr_trans; + struct qlcnic_trans_list rcv_act; + struct qlcnic_trans_list rcv_pend; + struct qlcnic_adapter *adapter; + struct qlcnic_vport *vp; + spinlock_t vlan_list_lock; /* Lock for VLAN list */ +}; + +struct qlcnic_async_cmd { + struct list_head list; + struct qlcnic_cmd_args *cmd; +}; + +struct qlcnic_back_channel { + u16 trans_counter; + struct workqueue_struct *bc_trans_wq; + struct workqueue_struct *bc_async_wq; + struct workqueue_struct *bc_flr_wq; + struct qlcnic_adapter *adapter; + struct list_head async_cmd_list; + struct work_struct vf_async_work; + spinlock_t queue_lock; /* async_cmd_list queue lock */ +}; + +struct qlcnic_sriov { + u16 vp_handle; + u8 num_vfs; + u8 any_vlan; + u8 vlan_mode; + u16 num_allowed_vlans; + u16 *allowed_vlans; + u16 vlan; + struct qlcnic_resources ff_max; + struct qlcnic_back_channel bc; + struct qlcnic_vf_info 
*vf_info; +}; + +int qlcnic_sriov_init(struct qlcnic_adapter *, int); +void qlcnic_sriov_cleanup(struct qlcnic_adapter *); +void __qlcnic_sriov_cleanup(struct qlcnic_adapter *); +void qlcnic_sriov_vf_register_map(struct qlcnic_hardware_context *); +int qlcnic_sriov_vf_init(struct qlcnic_adapter *, int); +void qlcnic_sriov_vf_set_ops(struct qlcnic_adapter *); +int qlcnic_sriov_func_to_index(struct qlcnic_adapter *, u8); +void qlcnic_sriov_handle_bc_event(struct qlcnic_adapter *, u32); +int qlcnic_sriov_cfg_bc_intr(struct qlcnic_adapter *, u8); +void qlcnic_sriov_cleanup_async_list(struct qlcnic_back_channel *); +void qlcnic_sriov_cleanup_list(struct qlcnic_trans_list *); +int __qlcnic_sriov_add_act_list(struct qlcnic_sriov *, struct qlcnic_vf_info *, + struct qlcnic_bc_trans *); +int qlcnic_sriov_get_vf_vport_info(struct qlcnic_adapter *, + struct qlcnic_info *, u16); +int qlcnic_sriov_cfg_vf_guest_vlan(struct qlcnic_adapter *, u16, u8); +void qlcnic_sriov_free_vlans(struct qlcnic_adapter *); +void qlcnic_sriov_alloc_vlans(struct qlcnic_adapter *); +bool qlcnic_sriov_check_any_vlan(struct qlcnic_vf_info *); +void qlcnic_sriov_del_vlan_id(struct qlcnic_sriov *, + struct qlcnic_vf_info *, u16); +void qlcnic_sriov_add_vlan_id(struct qlcnic_sriov *, + struct qlcnic_vf_info *, u16); + +static inline bool qlcnic_sriov_enable_check(struct qlcnic_adapter *adapter) +{ + return test_bit(__QLCNIC_SRIOV_ENABLE, &adapter->state) ? true : false; +} + +#ifdef CONFIG_QLCNIC_SRIOV +void qlcnic_sriov_pf_process_bc_cmd(struct qlcnic_adapter *, + struct qlcnic_bc_trans *, + struct qlcnic_cmd_args *); +void qlcnic_sriov_pf_disable(struct qlcnic_adapter *); +void qlcnic_sriov_pf_cleanup(struct qlcnic_adapter *); +int qlcnic_pci_sriov_configure(struct pci_dev *, int); +void qlcnic_pf_set_interface_id_create_rx_ctx(struct qlcnic_adapter *, u32 *); +void qlcnic_pf_set_interface_id_create_tx_ctx(struct qlcnic_adapter *, u32 *); +void qlcnic_pf_set_interface_id_del_rx_ctx(struct qlcnic_adapter *, u32 *); +void qlcnic_pf_set_interface_id_del_tx_ctx(struct qlcnic_adapter *, u32 *); +void qlcnic_pf_set_interface_id_promisc(struct qlcnic_adapter *, u32 *); +void qlcnic_pf_set_interface_id_ipaddr(struct qlcnic_adapter *, u32 *); +void qlcnic_pf_set_interface_id_macaddr(struct qlcnic_adapter *, u32 *); +void qlcnic_sriov_pf_handle_flr(struct qlcnic_sriov *, struct qlcnic_vf_info *); +bool qlcnic_sriov_soft_flr_check(struct qlcnic_adapter *, + struct qlcnic_bc_trans *, + struct qlcnic_vf_info *); +void qlcnic_sriov_pf_reset(struct qlcnic_adapter *); +int qlcnic_sriov_pf_reinit(struct qlcnic_adapter *); +int qlcnic_sriov_set_vf_mac(struct net_device *, int, u8 *); +int qlcnic_sriov_set_vf_tx_rate(struct net_device *, int, int, int); +int qlcnic_sriov_get_vf_config(struct net_device *, int , + struct ifla_vf_info *); +int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8, __be16); +int qlcnic_sriov_set_vf_spoofchk(struct net_device *, int, bool); +#else +static inline void qlcnic_sriov_pf_disable(struct qlcnic_adapter *adapter) {} +static inline void qlcnic_sriov_pf_cleanup(struct qlcnic_adapter *adapter) {} +static inline void +qlcnic_pf_set_interface_id_create_rx_ctx(struct qlcnic_adapter *adapter, + u32 *int_id) {} +static inline void +qlcnic_pf_set_interface_id_create_tx_ctx(struct qlcnic_adapter *adapter, + u32 *int_id) {} +static inline void +qlcnic_pf_set_interface_id_del_rx_ctx(struct qlcnic_adapter *adapter, + u32 *int_id) {} +static inline void +qlcnic_pf_set_interface_id_del_tx_ctx(struct qlcnic_adapter 
*adapter, + u32 *int_id) {} +static inline void +qlcnic_pf_set_interface_id_ipaddr(struct qlcnic_adapter *adapter, u32 *int_id) +{} +static inline void +qlcnic_pf_set_interface_id_macaddr(struct qlcnic_adapter *adapter, u32 *int_id) +{} +static inline void +qlcnic_pf_set_interface_id_promisc(struct qlcnic_adapter *adapter, u32 *int_id) +{} +static inline void qlcnic_sriov_pf_handle_flr(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf) {} +static inline bool qlcnic_sriov_soft_flr_check(struct qlcnic_adapter *adapter, + struct qlcnic_bc_trans *trans, + struct qlcnic_vf_info *vf) +{ return false; } +static inline void qlcnic_sriov_pf_reset(struct qlcnic_adapter *adapter) {} +static inline int qlcnic_sriov_pf_reinit(struct qlcnic_adapter *adapter) +{ return 0; } +#endif + +#endif diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c new file mode 100644 index 0000000..c58180f --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c @@ -0,0 +1,2232 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. + */ + +#include + +#include "qlcnic_sriov.h" +#include "qlcnic.h" +#include "qlcnic_83xx_hw.h" + +#define QLC_BC_COMMAND 0 +#define QLC_BC_RESPONSE 1 + +#define QLC_MBOX_RESP_TIMEOUT (10 * HZ) +#define QLC_MBOX_CH_FREE_TIMEOUT (10 * HZ) + +#define QLC_BC_MSG 0 +#define QLC_BC_CFREE 1 +#define QLC_BC_FLR 2 +#define QLC_BC_HDR_SZ 16 +#define QLC_BC_PAYLOAD_SZ (1024 - QLC_BC_HDR_SZ) + +#define QLC_DEFAULT_RCV_DESCRIPTORS_SRIOV_VF 2048 +#define QLC_DEFAULT_JUMBO_RCV_DESCRIPTORS_SRIOV_VF 512 + +#define QLC_83XX_VF_RESET_FAIL_THRESH 8 +#define QLC_BC_CMD_MAX_RETRY_CNT 5 + +static void qlcnic_sriov_handle_async_issue_cmd(struct work_struct *work); +static void qlcnic_sriov_vf_free_mac_list(struct qlcnic_adapter *); +static int qlcnic_sriov_alloc_bc_mbx_args(struct qlcnic_cmd_args *, u32); +static void qlcnic_sriov_vf_poll_dev_state(struct work_struct *); +static void qlcnic_sriov_vf_cancel_fw_work(struct qlcnic_adapter *); +static void qlcnic_sriov_cleanup_transaction(struct qlcnic_bc_trans *); +static int qlcnic_sriov_issue_cmd(struct qlcnic_adapter *, + struct qlcnic_cmd_args *); +static int qlcnic_sriov_channel_cfg_cmd(struct qlcnic_adapter *, u8); +static void qlcnic_sriov_process_bc_cmd(struct work_struct *); +static int qlcnic_sriov_vf_shutdown(struct pci_dev *); +static int qlcnic_sriov_vf_resume(struct qlcnic_adapter *); +static int qlcnic_sriov_async_issue_cmd(struct qlcnic_adapter *, + struct qlcnic_cmd_args *); + +static struct qlcnic_hardware_ops qlcnic_sriov_vf_hw_ops = { + .read_crb = qlcnic_83xx_read_crb, + .write_crb = qlcnic_83xx_write_crb, + .read_reg = qlcnic_83xx_rd_reg_indirect, + .write_reg = qlcnic_83xx_wrt_reg_indirect, + .get_mac_address = qlcnic_83xx_get_mac_address, + .setup_intr = qlcnic_83xx_setup_intr, + .alloc_mbx_args = qlcnic_83xx_alloc_mbx_args, + .mbx_cmd = qlcnic_sriov_issue_cmd, + .get_func_no = qlcnic_83xx_get_func_no, + .api_lock = qlcnic_83xx_cam_lock, + .api_unlock = qlcnic_83xx_cam_unlock, + .process_lb_rcv_ring_diag = qlcnic_83xx_process_rcv_ring_diag, + .create_rx_ctx = qlcnic_83xx_create_rx_ctx, + .create_tx_ctx = qlcnic_83xx_create_tx_ctx, + .del_rx_ctx = qlcnic_83xx_del_rx_ctx, + .del_tx_ctx = qlcnic_83xx_del_tx_ctx, + .setup_link_event = qlcnic_83xx_setup_link_event, + .get_nic_info = qlcnic_83xx_get_nic_info, + .get_pci_info = qlcnic_83xx_get_pci_info, + 
.set_nic_info = qlcnic_83xx_set_nic_info, + .change_macvlan = qlcnic_83xx_sre_macaddr_change, + .napi_enable = qlcnic_83xx_napi_enable, + .napi_disable = qlcnic_83xx_napi_disable, + .config_intr_coal = qlcnic_83xx_config_intr_coal, + .config_rss = qlcnic_83xx_config_rss, + .config_hw_lro = qlcnic_83xx_config_hw_lro, + .config_promisc_mode = qlcnic_83xx_nic_set_promisc, + .change_l2_filter = qlcnic_83xx_change_l2_filter, + .get_board_info = qlcnic_83xx_get_port_info, + .free_mac_list = qlcnic_sriov_vf_free_mac_list, + .enable_sds_intr = qlcnic_83xx_enable_sds_intr, + .disable_sds_intr = qlcnic_83xx_disable_sds_intr, + .encap_rx_offload = qlcnic_83xx_encap_rx_offload, + .encap_tx_offload = qlcnic_83xx_encap_tx_offload, +}; + +static struct qlcnic_nic_template qlcnic_sriov_vf_ops = { + .config_bridged_mode = qlcnic_config_bridged_mode, + .config_led = qlcnic_config_led, + .cancel_idc_work = qlcnic_sriov_vf_cancel_fw_work, + .napi_add = qlcnic_83xx_napi_add, + .napi_del = qlcnic_83xx_napi_del, + .shutdown = qlcnic_sriov_vf_shutdown, + .resume = qlcnic_sriov_vf_resume, + .config_ipaddr = qlcnic_83xx_config_ipaddr, + .clear_legacy_intr = qlcnic_83xx_clear_legacy_intr, +}; + +static const struct qlcnic_mailbox_metadata qlcnic_sriov_bc_mbx_tbl[] = { + {QLCNIC_BC_CMD_CHANNEL_INIT, 2, 2}, + {QLCNIC_BC_CMD_CHANNEL_TERM, 2, 2}, + {QLCNIC_BC_CMD_GET_ACL, 3, 14}, + {QLCNIC_BC_CMD_CFG_GUEST_VLAN, 2, 2}, +}; + +static inline bool qlcnic_sriov_bc_msg_check(u32 val) +{ + return (val & (1 << QLC_BC_MSG)) ? true : false; +} + +static inline bool qlcnic_sriov_channel_free_check(u32 val) +{ + return (val & (1 << QLC_BC_CFREE)) ? true : false; +} + +static inline bool qlcnic_sriov_flr_check(u32 val) +{ + return (val & (1 << QLC_BC_FLR)) ? true : false; +} + +static inline u8 qlcnic_sriov_target_func_id(u32 val) +{ + return (val >> 4) & 0xff; +} + +static int qlcnic_sriov_virtid_fn(struct qlcnic_adapter *adapter, int vf_id) +{ + struct pci_dev *dev = adapter->pdev; + int pos; + u16 stride, offset; + + if (qlcnic_sriov_vf_check(adapter)) + return 0; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV); + if (!pos) + return 0; + pci_read_config_word(dev, pos + PCI_SRIOV_VF_OFFSET, &offset); + pci_read_config_word(dev, pos + PCI_SRIOV_VF_STRIDE, &stride); + + return (dev->devfn + offset + stride * vf_id) & 0xff; +} + +int qlcnic_sriov_init(struct qlcnic_adapter *adapter, int num_vfs) +{ + struct qlcnic_sriov *sriov; + struct qlcnic_back_channel *bc; + struct workqueue_struct *wq; + struct qlcnic_vport *vp; + struct qlcnic_vf_info *vf; + int err, i; + + if (!qlcnic_sriov_enable_check(adapter)) + return -EIO; + + sriov = kzalloc(sizeof(struct qlcnic_sriov), GFP_KERNEL); + if (!sriov) + return -ENOMEM; + + adapter->ahw->sriov = sriov; + sriov->num_vfs = num_vfs; + bc = &sriov->bc; + sriov->vf_info = kzalloc(sizeof(struct qlcnic_vf_info) * + num_vfs, GFP_KERNEL); + if (!sriov->vf_info) { + err = -ENOMEM; + goto qlcnic_free_sriov; + } + + wq = create_singlethread_workqueue("bc-trans"); + if (wq == NULL) { + err = -ENOMEM; + dev_err(&adapter->pdev->dev, + "Cannot create bc-trans workqueue\n"); + goto qlcnic_free_vf_info; + } + + bc->bc_trans_wq = wq; + + wq = create_singlethread_workqueue("async"); + if (wq == NULL) { + err = -ENOMEM; + dev_err(&adapter->pdev->dev, "Cannot create async workqueue\n"); + goto qlcnic_destroy_trans_wq; + } + + bc->bc_async_wq = wq; + INIT_LIST_HEAD(&bc->async_cmd_list); + INIT_WORK(&bc->vf_async_work, qlcnic_sriov_handle_async_issue_cmd); + spin_lock_init(&bc->queue_lock); + 
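+ /* Back-channel transaction and async workqueues are up; initialize the
+  * per-VF state below (locks, pending/active transaction lists,
+  * completions) and, when running as PF, a vport with a random MAC for
+  * every VF.
+  */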
bc->adapter = adapter; + + for (i = 0; i < num_vfs; i++) { + vf = &sriov->vf_info[i]; + vf->adapter = adapter; + vf->pci_func = qlcnic_sriov_virtid_fn(adapter, i); + mutex_init(&vf->send_cmd_lock); + spin_lock_init(&vf->vlan_list_lock); + INIT_LIST_HEAD(&vf->rcv_act.wait_list); + INIT_LIST_HEAD(&vf->rcv_pend.wait_list); + spin_lock_init(&vf->rcv_act.lock); + spin_lock_init(&vf->rcv_pend.lock); + init_completion(&vf->ch_free_cmpl); + + INIT_WORK(&vf->trans_work, qlcnic_sriov_process_bc_cmd); + + if (qlcnic_sriov_pf_check(adapter)) { + vp = kzalloc(sizeof(struct qlcnic_vport), GFP_KERNEL); + if (!vp) { + err = -ENOMEM; + goto qlcnic_destroy_async_wq; + } + sriov->vf_info[i].vp = vp; + vp->vlan_mode = QLC_GUEST_VLAN_MODE; + vp->max_tx_bw = MAX_BW; + vp->min_tx_bw = MIN_BW; + vp->spoofchk = false; + random_ether_addr(vp->mac); + dev_info(&adapter->pdev->dev, + "MAC Address %pM is configured for VF %d\n", + vp->mac, i); + } + } + + return 0; + +qlcnic_destroy_async_wq: + destroy_workqueue(bc->bc_async_wq); + +qlcnic_destroy_trans_wq: + destroy_workqueue(bc->bc_trans_wq); + +qlcnic_free_vf_info: + kfree(sriov->vf_info); + +qlcnic_free_sriov: + kfree(adapter->ahw->sriov); + return err; +} + +void qlcnic_sriov_cleanup_list(struct qlcnic_trans_list *t_list) +{ + struct qlcnic_bc_trans *trans; + struct qlcnic_cmd_args cmd; + unsigned long flags; + + spin_lock_irqsave(&t_list->lock, flags); + + while (!list_empty(&t_list->wait_list)) { + trans = list_first_entry(&t_list->wait_list, + struct qlcnic_bc_trans, list); + list_del(&trans->list); + t_list->count--; + cmd.req.arg = (u32 *)trans->req_pay; + cmd.rsp.arg = (u32 *)trans->rsp_pay; + qlcnic_free_mbx_args(&cmd); + qlcnic_sriov_cleanup_transaction(trans); + } + + spin_unlock_irqrestore(&t_list->lock, flags); +} + +void __qlcnic_sriov_cleanup(struct qlcnic_adapter *adapter) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_back_channel *bc = &sriov->bc; + struct qlcnic_vf_info *vf; + int i; + + if (!qlcnic_sriov_enable_check(adapter)) + return; + + qlcnic_sriov_cleanup_async_list(bc); + destroy_workqueue(bc->bc_async_wq); + + for (i = 0; i < sriov->num_vfs; i++) { + vf = &sriov->vf_info[i]; + qlcnic_sriov_cleanup_list(&vf->rcv_pend); + cancel_work_sync(&vf->trans_work); + qlcnic_sriov_cleanup_list(&vf->rcv_act); + } + + destroy_workqueue(bc->bc_trans_wq); + + for (i = 0; i < sriov->num_vfs; i++) + kfree(sriov->vf_info[i].vp); + + kfree(sriov->vf_info); + kfree(adapter->ahw->sriov); +} + +static void qlcnic_sriov_vf_cleanup(struct qlcnic_adapter *adapter) +{ + qlcnic_sriov_channel_cfg_cmd(adapter, QLCNIC_BC_CMD_CHANNEL_TERM); + qlcnic_sriov_cfg_bc_intr(adapter, 0); + __qlcnic_sriov_cleanup(adapter); +} + +void qlcnic_sriov_cleanup(struct qlcnic_adapter *adapter) +{ + if (!test_bit(__QLCNIC_SRIOV_ENABLE, &adapter->state)) + return; + + qlcnic_sriov_free_vlans(adapter); + + if (qlcnic_sriov_pf_check(adapter)) + qlcnic_sriov_pf_cleanup(adapter); + + if (qlcnic_sriov_vf_check(adapter)) + qlcnic_sriov_vf_cleanup(adapter); +} + +static int qlcnic_sriov_post_bc_msg(struct qlcnic_adapter *adapter, u32 *hdr, + u32 *pay, u8 pci_func, u8 size) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_mailbox *mbx = ahw->mailbox; + struct qlcnic_cmd_args cmd; + unsigned long timeout; + int err; + + memset(&cmd, 0, sizeof(struct qlcnic_cmd_args)); + cmd.hdr = hdr; + cmd.pay = pay; + cmd.pay_size = size; + cmd.func_num = pci_func; + cmd.op_type = QLC_83XX_MBX_POST_BC_OP; + cmd.cmd_op = ((struct qlcnic_bc_hdr *)hdr)->cmd_op; + + 
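+ /* Queue the back-channel message on the 83xx mailbox and wait for the
+  * mailbox worker to complete it; a timeout flushes the mailbox workqueue
+  * before the response opcode is returned to the caller.
+  */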
err = mbx->ops->enqueue_cmd(adapter, &cmd, &timeout); + if (err) { + dev_err(&adapter->pdev->dev, + "%s: Mailbox not available, cmd_op=0x%x, cmd_type=0x%x, pci_func=0x%x, op_mode=0x%x\n", + __func__, cmd.cmd_op, cmd.type, ahw->pci_func, + ahw->op_mode); + return err; + } + + if (!wait_for_completion_timeout(&cmd.completion, timeout)) { + dev_err(&adapter->pdev->dev, + "%s: Mailbox command timed out, cmd_op=0x%x, cmd_type=0x%x, pci_func=0x%x, op_mode=0x%x\n", + __func__, cmd.cmd_op, cmd.type, ahw->pci_func, + ahw->op_mode); + flush_workqueue(mbx->work_q); + } + + return cmd.rsp_opcode; +} + +static void qlcnic_sriov_vf_cfg_buff_desc(struct qlcnic_adapter *adapter) +{ + adapter->num_rxd = QLC_DEFAULT_RCV_DESCRIPTORS_SRIOV_VF; + adapter->max_rxd = MAX_RCV_DESCRIPTORS_10G; + adapter->num_jumbo_rxd = QLC_DEFAULT_JUMBO_RCV_DESCRIPTORS_SRIOV_VF; + adapter->max_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_10G; + adapter->num_txd = MAX_CMD_DESCRIPTORS; + adapter->max_rds_rings = MAX_RDS_RINGS; +} + +int qlcnic_sriov_get_vf_vport_info(struct qlcnic_adapter *adapter, + struct qlcnic_info *npar_info, u16 vport_id) +{ + struct device *dev = &adapter->pdev->dev; + struct qlcnic_cmd_args cmd; + int err; + u32 status; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_NIC_INFO); + if (err) + return err; + + cmd.req.arg[1] = vport_id << 16 | 0x1; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to get vport info, err=%d\n", err); + qlcnic_free_mbx_args(&cmd); + return err; + } + + status = cmd.rsp.arg[2] & 0xffff; + if (status & BIT_0) + npar_info->min_tx_bw = MSW(cmd.rsp.arg[2]); + if (status & BIT_1) + npar_info->max_tx_bw = LSW(cmd.rsp.arg[3]); + if (status & BIT_2) + npar_info->max_tx_ques = MSW(cmd.rsp.arg[3]); + if (status & BIT_3) + npar_info->max_tx_mac_filters = LSW(cmd.rsp.arg[4]); + if (status & BIT_4) + npar_info->max_rx_mcast_mac_filters = MSW(cmd.rsp.arg[4]); + if (status & BIT_5) + npar_info->max_rx_ucast_mac_filters = LSW(cmd.rsp.arg[5]); + if (status & BIT_6) + npar_info->max_rx_ip_addr = MSW(cmd.rsp.arg[5]); + if (status & BIT_7) + npar_info->max_rx_lro_flow = LSW(cmd.rsp.arg[6]); + if (status & BIT_8) + npar_info->max_rx_status_rings = MSW(cmd.rsp.arg[6]); + if (status & BIT_9) + npar_info->max_rx_buf_rings = LSW(cmd.rsp.arg[7]); + + npar_info->max_rx_ques = MSW(cmd.rsp.arg[7]); + npar_info->max_tx_vlan_keys = LSW(cmd.rsp.arg[8]); + npar_info->max_local_ipv6_addrs = MSW(cmd.rsp.arg[8]); + npar_info->max_remote_ipv6_addrs = LSW(cmd.rsp.arg[9]); + + dev_info(dev, "\n\tmin_tx_bw: %d, max_tx_bw: %d max_tx_ques: %d,\n" + "\tmax_tx_mac_filters: %d max_rx_mcast_mac_filters: %d,\n" + "\tmax_rx_ucast_mac_filters: 0x%x, max_rx_ip_addr: %d,\n" + "\tmax_rx_lro_flow: %d max_rx_status_rings: %d,\n" + "\tmax_rx_buf_rings: %d, max_rx_ques: %d, max_tx_vlan_keys %d\n" + "\tlocal_ipv6_addr: %d, remote_ipv6_addr: %d\n", + npar_info->min_tx_bw, npar_info->max_tx_bw, + npar_info->max_tx_ques, npar_info->max_tx_mac_filters, + npar_info->max_rx_mcast_mac_filters, + npar_info->max_rx_ucast_mac_filters, npar_info->max_rx_ip_addr, + npar_info->max_rx_lro_flow, npar_info->max_rx_status_rings, + npar_info->max_rx_buf_rings, npar_info->max_rx_ques, + npar_info->max_tx_vlan_keys, npar_info->max_local_ipv6_addrs, + npar_info->max_remote_ipv6_addrs); + + qlcnic_free_mbx_args(&cmd); + return err; +} + +static int qlcnic_sriov_set_pvid_mode(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + adapter->rx_pvid = MSW(cmd->rsp.arg[1]) & 0xffff; + adapter->flags 
&= ~QLCNIC_TAGGING_ENABLED; + return 0; +} + +static int qlcnic_sriov_set_guest_vlan_mode(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + int i, num_vlans; + u16 *vlans; + + if (sriov->allowed_vlans) + return 0; + + sriov->any_vlan = cmd->rsp.arg[2] & 0xf; + sriov->num_allowed_vlans = cmd->rsp.arg[2] >> 16; + dev_info(&adapter->pdev->dev, "Number of allowed Guest VLANs = %d\n", + sriov->num_allowed_vlans); + + qlcnic_sriov_alloc_vlans(adapter); + + if (!sriov->any_vlan) + return 0; + + num_vlans = sriov->num_allowed_vlans; + sriov->allowed_vlans = kzalloc(sizeof(u16) * num_vlans, GFP_KERNEL); + if (!sriov->allowed_vlans) + return -ENOMEM; + + vlans = (u16 *)&cmd->rsp.arg[3]; + for (i = 0; i < num_vlans; i++) + sriov->allowed_vlans[i] = vlans[i]; + + return 0; +} + +static int qlcnic_sriov_get_vf_acl(struct qlcnic_adapter *adapter) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_cmd_args cmd; + int ret = 0; + + memset(&cmd, 0, sizeof(cmd)); + ret = qlcnic_sriov_alloc_bc_mbx_args(&cmd, QLCNIC_BC_CMD_GET_ACL); + if (ret) + return ret; + + ret = qlcnic_issue_cmd(adapter, &cmd); + if (ret) { + dev_err(&adapter->pdev->dev, "Failed to get ACL, err=%d\n", + ret); + } else { + sriov->vlan_mode = cmd.rsp.arg[1] & 0x3; + switch (sriov->vlan_mode) { + case QLC_GUEST_VLAN_MODE: + ret = qlcnic_sriov_set_guest_vlan_mode(adapter, &cmd); + break; + case QLC_PVID_MODE: + ret = qlcnic_sriov_set_pvid_mode(adapter, &cmd); + break; + } + } + + qlcnic_free_mbx_args(&cmd); + return ret; +} + +static int qlcnic_sriov_vf_init_driver(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_info nic_info; + int err; + + err = qlcnic_sriov_get_vf_vport_info(adapter, &nic_info, 0); + if (err) + return err; + + ahw->max_mc_count = nic_info.max_rx_mcast_mac_filters; + + err = qlcnic_get_nic_info(adapter, &nic_info, ahw->pci_func); + if (err) + return -EIO; + + if (qlcnic_83xx_get_port_info(adapter)) + return -EIO; + + qlcnic_sriov_vf_cfg_buff_desc(adapter); + adapter->flags |= QLCNIC_ADAPTER_INITIALIZED; + dev_info(&adapter->pdev->dev, "HAL Version: %d\n", + adapter->ahw->fw_hal_version); + + ahw->physical_port = (u8) nic_info.phys_port; + ahw->switch_mode = nic_info.switch_mode; + ahw->max_mtu = nic_info.max_mtu; + ahw->op_mode = nic_info.op_mode; + ahw->capabilities = nic_info.capabilities; + return 0; +} + +static int qlcnic_sriov_setup_vf(struct qlcnic_adapter *adapter, + int pci_using_dac) +{ + int err; + + adapter->flags |= QLCNIC_VLAN_FILTERING; + adapter->ahw->total_nic_func = 1; + INIT_LIST_HEAD(&adapter->vf_mc_list); + if (!qlcnic_use_msi_x && !!qlcnic_use_msi) + dev_warn(&adapter->pdev->dev, + "Device does not support MSI interrupts\n"); + + /* compute and set default and max tx/sds rings */ + qlcnic_set_tx_ring_count(adapter, QLCNIC_SINGLE_RING); + qlcnic_set_sds_ring_count(adapter, QLCNIC_SINGLE_RING); + + err = qlcnic_setup_intr(adapter); + if (err) { + dev_err(&adapter->pdev->dev, "Failed to setup interrupt\n"); + goto err_out_disable_msi; + } + + err = qlcnic_83xx_setup_mbx_intr(adapter); + if (err) + goto err_out_disable_msi; + + err = qlcnic_sriov_init(adapter, 1); + if (err) + goto err_out_disable_mbx_intr; + + err = qlcnic_sriov_cfg_bc_intr(adapter, 1); + if (err) + goto err_out_cleanup_sriov; + + err = qlcnic_sriov_channel_cfg_cmd(adapter, QLCNIC_BC_CMD_CHANNEL_INIT); + if (err) + goto err_out_disable_bc_intr; + + err = qlcnic_sriov_vf_init_driver(adapter); + 
if (err) + goto err_out_send_channel_term; + + err = qlcnic_sriov_get_vf_acl(adapter); + if (err) + goto err_out_send_channel_term; + + err = qlcnic_setup_netdev(adapter, adapter->netdev, pci_using_dac); + if (err) + goto err_out_send_channel_term; + + pci_set_drvdata(adapter->pdev, adapter); + dev_info(&adapter->pdev->dev, "%s: XGbE port initialized\n", + adapter->netdev->name); + + qlcnic_schedule_work(adapter, qlcnic_sriov_vf_poll_dev_state, + adapter->ahw->idc.delay); + return 0; + +err_out_send_channel_term: + qlcnic_sriov_channel_cfg_cmd(adapter, QLCNIC_BC_CMD_CHANNEL_TERM); + +err_out_disable_bc_intr: + qlcnic_sriov_cfg_bc_intr(adapter, 0); + +err_out_cleanup_sriov: + __qlcnic_sriov_cleanup(adapter); + +err_out_disable_mbx_intr: + qlcnic_83xx_free_mbx_intr(adapter); + +err_out_disable_msi: + qlcnic_teardown_intr(adapter); + return err; +} + +static int qlcnic_sriov_check_dev_ready(struct qlcnic_adapter *adapter) +{ + u32 state; + + do { + msleep(20); + if (++adapter->fw_fail_cnt > QLC_BC_CMD_MAX_RETRY_CNT) + return -EIO; + state = QLCRDX(adapter->ahw, QLC_83XX_IDC_DEV_STATE); + } while (state != QLC_83XX_IDC_DEV_READY); + + return 0; +} + +int qlcnic_sriov_vf_init(struct qlcnic_adapter *adapter, int pci_using_dac) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int err; + + set_bit(QLC_83XX_MODULE_LOADED, &ahw->idc.status); + ahw->idc.delay = QLC_83XX_IDC_FW_POLL_DELAY; + ahw->reset_context = 0; + adapter->fw_fail_cnt = 0; + ahw->msix_supported = 1; + adapter->need_fw_reset = 0; + adapter->flags |= QLCNIC_TX_INTR_SHARED; + + err = qlcnic_sriov_check_dev_ready(adapter); + if (err) + return err; + + err = qlcnic_sriov_setup_vf(adapter, pci_using_dac); + if (err) + return err; + + if (qlcnic_read_mac_addr(adapter)) + dev_warn(&adapter->pdev->dev, "failed to read mac addr\n"); + + INIT_DELAYED_WORK(&adapter->idc_aen_work, qlcnic_83xx_idc_aen_work); + + clear_bit(__QLCNIC_RESETTING, &adapter->state); + return 0; +} + +void qlcnic_sriov_vf_set_ops(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + + ahw->op_mode = QLCNIC_SRIOV_VF_FUNC; + dev_info(&adapter->pdev->dev, + "HAL Version: %d Non Privileged SRIOV function\n", + ahw->fw_hal_version); + adapter->nic_ops = &qlcnic_sriov_vf_ops; + set_bit(__QLCNIC_SRIOV_ENABLE, &adapter->state); + return; +} + +void qlcnic_sriov_vf_register_map(struct qlcnic_hardware_context *ahw) +{ + ahw->hw_ops = &qlcnic_sriov_vf_hw_ops; + ahw->reg_tbl = (u32 *)qlcnic_83xx_reg_tbl; + ahw->ext_reg_tbl = (u32 *)qlcnic_83xx_ext_reg_tbl; +} + +static u32 qlcnic_sriov_get_bc_paysize(u32 real_pay_size, u8 curr_frag) +{ + u32 pay_size; + + pay_size = real_pay_size / ((curr_frag + 1) * QLC_BC_PAYLOAD_SZ); + + if (pay_size) + pay_size = QLC_BC_PAYLOAD_SZ; + else + pay_size = real_pay_size % QLC_BC_PAYLOAD_SZ; + + return pay_size; +} + +int qlcnic_sriov_func_to_index(struct qlcnic_adapter *adapter, u8 pci_func) +{ + struct qlcnic_vf_info *vf_info = adapter->ahw->sriov->vf_info; + u8 i; + + if (qlcnic_sriov_vf_check(adapter)) + return 0; + + for (i = 0; i < adapter->ahw->sriov->num_vfs; i++) { + if (vf_info[i].pci_func == pci_func) + return i; + } + + return -EINVAL; +} + +static inline int qlcnic_sriov_alloc_bc_trans(struct qlcnic_bc_trans **trans) +{ + *trans = kzalloc(sizeof(struct qlcnic_bc_trans), GFP_ATOMIC); + if (!*trans) + return -ENOMEM; + + init_completion(&(*trans)->resp_cmpl); + return 0; +} + +static inline int qlcnic_sriov_alloc_bc_msg(struct qlcnic_bc_hdr **hdr, + u32 size) +{ + *hdr = kzalloc(sizeof(struct 
qlcnic_bc_hdr) * size, GFP_ATOMIC); + if (!*hdr) + return -ENOMEM; + + return 0; +} + +static int qlcnic_sriov_alloc_bc_mbx_args(struct qlcnic_cmd_args *mbx, u32 type) +{ + const struct qlcnic_mailbox_metadata *mbx_tbl; + int i, size; + + mbx_tbl = qlcnic_sriov_bc_mbx_tbl; + size = ARRAY_SIZE(qlcnic_sriov_bc_mbx_tbl); + + for (i = 0; i < size; i++) { + if (type == mbx_tbl[i].cmd) { + mbx->op_type = QLC_BC_CMD; + mbx->req.num = mbx_tbl[i].in_args; + mbx->rsp.num = mbx_tbl[i].out_args; + mbx->req.arg = kcalloc(mbx->req.num, sizeof(u32), + GFP_ATOMIC); + if (!mbx->req.arg) + return -ENOMEM; + mbx->rsp.arg = kcalloc(mbx->rsp.num, sizeof(u32), + GFP_ATOMIC); + if (!mbx->rsp.arg) { + kfree(mbx->req.arg); + mbx->req.arg = NULL; + return -ENOMEM; + } + mbx->req.arg[0] = (type | (mbx->req.num << 16) | + (3 << 29)); + mbx->rsp.arg[0] = (type & 0xffff) | mbx->rsp.num << 16; + return 0; + } + } + return -EINVAL; +} + +static int qlcnic_sriov_prepare_bc_hdr(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd, + u16 seq, u8 msg_type) +{ + struct qlcnic_bc_hdr *hdr; + int i; + u32 num_regs, bc_pay_sz; + u16 remainder; + u8 cmd_op, num_frags, t_num_frags; + + bc_pay_sz = QLC_BC_PAYLOAD_SZ; + if (msg_type == QLC_BC_COMMAND) { + trans->req_pay = (struct qlcnic_bc_payload *)cmd->req.arg; + trans->rsp_pay = (struct qlcnic_bc_payload *)cmd->rsp.arg; + num_regs = cmd->req.num; + trans->req_pay_size = (num_regs * 4); + num_regs = cmd->rsp.num; + trans->rsp_pay_size = (num_regs * 4); + cmd_op = cmd->req.arg[0] & 0xff; + remainder = (trans->req_pay_size) % (bc_pay_sz); + num_frags = (trans->req_pay_size) / (bc_pay_sz); + if (remainder) + num_frags++; + t_num_frags = num_frags; + if (qlcnic_sriov_alloc_bc_msg(&trans->req_hdr, num_frags)) + return -ENOMEM; + remainder = (trans->rsp_pay_size) % (bc_pay_sz); + num_frags = (trans->rsp_pay_size) / (bc_pay_sz); + if (remainder) + num_frags++; + if (qlcnic_sriov_alloc_bc_msg(&trans->rsp_hdr, num_frags)) + return -ENOMEM; + num_frags = t_num_frags; + hdr = trans->req_hdr; + } else { + cmd->req.arg = (u32 *)trans->req_pay; + cmd->rsp.arg = (u32 *)trans->rsp_pay; + cmd_op = cmd->req.arg[0] & 0xff; + cmd->cmd_op = cmd_op; + remainder = (trans->rsp_pay_size) % (bc_pay_sz); + num_frags = (trans->rsp_pay_size) / (bc_pay_sz); + if (remainder) + num_frags++; + cmd->req.num = trans->req_pay_size / 4; + cmd->rsp.num = trans->rsp_pay_size / 4; + hdr = trans->rsp_hdr; + cmd->op_type = trans->req_hdr->op_type; + } + + trans->trans_id = seq; + trans->cmd_id = cmd_op; + for (i = 0; i < num_frags; i++) { + hdr[i].version = 2; + hdr[i].msg_type = msg_type; + hdr[i].op_type = cmd->op_type; + hdr[i].num_cmds = 1; + hdr[i].num_frags = num_frags; + hdr[i].frag_num = i + 1; + hdr[i].cmd_op = cmd_op; + hdr[i].seq_id = seq; + } + return 0; +} + +static void qlcnic_sriov_cleanup_transaction(struct qlcnic_bc_trans *trans) +{ + if (!trans) + return; + kfree(trans->req_hdr); + kfree(trans->rsp_hdr); + kfree(trans); +} + +static int qlcnic_sriov_clear_trans(struct qlcnic_vf_info *vf, + struct qlcnic_bc_trans *trans, u8 type) +{ + struct qlcnic_trans_list *t_list; + unsigned long flags; + int ret = 0; + + if (type == QLC_BC_RESPONSE) { + t_list = &vf->rcv_act; + spin_lock_irqsave(&t_list->lock, flags); + t_list->count--; + list_del(&trans->list); + if (t_list->count > 0) + ret = 1; + spin_unlock_irqrestore(&t_list->lock, flags); + } + if (type == QLC_BC_COMMAND) { + while (test_and_set_bit(QLC_BC_VF_SEND, &vf->state)) + msleep(100); + vf->send_cmd = NULL; + clear_bit(QLC_BC_VF_SEND, 
&vf->state); + } + return ret; +} + +static void qlcnic_sriov_schedule_bc_cmd(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf, + work_func_t func) +{ + if (test_bit(QLC_BC_VF_FLR, &vf->state) || + vf->adapter->need_fw_reset) + return; + + queue_work(sriov->bc.bc_trans_wq, &vf->trans_work); +} + +static inline void qlcnic_sriov_wait_for_resp(struct qlcnic_bc_trans *trans) +{ + struct completion *cmpl = &trans->resp_cmpl; + + if (wait_for_completion_timeout(cmpl, QLC_MBOX_RESP_TIMEOUT)) + trans->trans_state = QLC_END; + else + trans->trans_state = QLC_ABORT; + + return; +} + +static void qlcnic_sriov_handle_multi_frags(struct qlcnic_bc_trans *trans, + u8 type) +{ + if (type == QLC_BC_RESPONSE) { + trans->curr_rsp_frag++; + if (trans->curr_rsp_frag < trans->rsp_hdr->num_frags) + trans->trans_state = QLC_INIT; + else + trans->trans_state = QLC_END; + } else { + trans->curr_req_frag++; + if (trans->curr_req_frag < trans->req_hdr->num_frags) + trans->trans_state = QLC_INIT; + else + trans->trans_state = QLC_WAIT_FOR_RESP; + } +} + +static void qlcnic_sriov_wait_for_channel_free(struct qlcnic_bc_trans *trans, + u8 type) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct completion *cmpl = &vf->ch_free_cmpl; + + if (!wait_for_completion_timeout(cmpl, QLC_MBOX_CH_FREE_TIMEOUT)) { + trans->trans_state = QLC_ABORT; + return; + } + + clear_bit(QLC_BC_VF_CHANNEL, &vf->state); + qlcnic_sriov_handle_multi_frags(trans, type); +} + +static void qlcnic_sriov_pull_bc_msg(struct qlcnic_adapter *adapter, + u32 *hdr, u32 *pay, u32 size) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + u32 fw_mbx; + u8 i, max = 2, hdr_size, j; + + hdr_size = (sizeof(struct qlcnic_bc_hdr) / sizeof(u32)); + max = (size / sizeof(u32)) + hdr_size; + + fw_mbx = readl(QLCNIC_MBX_FW(ahw, 0)); + for (i = 2, j = 0; j < hdr_size; i++, j++) + *(hdr++) = readl(QLCNIC_MBX_FW(ahw, i)); + for (; j < max; i++, j++) + *(pay++) = readl(QLCNIC_MBX_FW(ahw, i)); +} + +static int __qlcnic_sriov_issue_bc_post(struct qlcnic_vf_info *vf) +{ + int ret = -EBUSY; + u32 timeout = 10000; + + do { + if (!test_and_set_bit(QLC_BC_VF_CHANNEL, &vf->state)) { + ret = 0; + break; + } + mdelay(1); + } while (--timeout); + + return ret; +} + +static int qlcnic_sriov_issue_bc_post(struct qlcnic_bc_trans *trans, u8 type) +{ + struct qlcnic_vf_info *vf = trans->vf; + u32 pay_size, hdr_size; + u32 *hdr, *pay; + int ret; + u8 pci_func = trans->func_id; + + if (__qlcnic_sriov_issue_bc_post(vf)) + return -EBUSY; + + if (type == QLC_BC_COMMAND) { + hdr = (u32 *)(trans->req_hdr + trans->curr_req_frag); + pay = (u32 *)(trans->req_pay + trans->curr_req_frag); + hdr_size = (sizeof(struct qlcnic_bc_hdr) / sizeof(u32)); + pay_size = qlcnic_sriov_get_bc_paysize(trans->req_pay_size, + trans->curr_req_frag); + pay_size = (pay_size / sizeof(u32)); + } else { + hdr = (u32 *)(trans->rsp_hdr + trans->curr_rsp_frag); + pay = (u32 *)(trans->rsp_pay + trans->curr_rsp_frag); + hdr_size = (sizeof(struct qlcnic_bc_hdr) / sizeof(u32)); + pay_size = qlcnic_sriov_get_bc_paysize(trans->rsp_pay_size, + trans->curr_rsp_frag); + pay_size = (pay_size / sizeof(u32)); + } + + ret = qlcnic_sriov_post_bc_msg(vf->adapter, hdr, pay, + pci_func, pay_size); + return ret; +} + +static int __qlcnic_sriov_send_bc_msg(struct qlcnic_bc_trans *trans, + struct qlcnic_vf_info *vf, u8 type) +{ + bool flag = true; + int err = -EIO; + + while (flag) { + if (test_bit(QLC_BC_VF_FLR, &vf->state) || + vf->adapter->need_fw_reset) + trans->trans_state = QLC_ABORT; + + switch (trans->trans_state) { + case 
QLC_INIT: + trans->trans_state = QLC_WAIT_FOR_CHANNEL_FREE; + if (qlcnic_sriov_issue_bc_post(trans, type)) + trans->trans_state = QLC_ABORT; + break; + case QLC_WAIT_FOR_CHANNEL_FREE: + qlcnic_sriov_wait_for_channel_free(trans, type); + break; + case QLC_WAIT_FOR_RESP: + qlcnic_sriov_wait_for_resp(trans); + break; + case QLC_END: + err = 0; + flag = false; + break; + case QLC_ABORT: + err = -EIO; + flag = false; + clear_bit(QLC_BC_VF_CHANNEL, &vf->state); + break; + default: + err = -EIO; + flag = false; + } + } + return err; +} + +static int qlcnic_sriov_send_bc_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_bc_trans *trans, int pci_func) +{ + struct qlcnic_vf_info *vf; + int err, index = qlcnic_sriov_func_to_index(adapter, pci_func); + + if (index < 0) + return -EIO; + + vf = &adapter->ahw->sriov->vf_info[index]; + trans->vf = vf; + trans->func_id = pci_func; + + if (!test_bit(QLC_BC_VF_STATE, &vf->state)) { + if (qlcnic_sriov_pf_check(adapter)) + return -EIO; + if (qlcnic_sriov_vf_check(adapter) && + trans->cmd_id != QLCNIC_BC_CMD_CHANNEL_INIT) + return -EIO; + } + + mutex_lock(&vf->send_cmd_lock); + vf->send_cmd = trans; + err = __qlcnic_sriov_send_bc_msg(trans, vf, QLC_BC_COMMAND); + qlcnic_sriov_clear_trans(vf, trans, QLC_BC_COMMAND); + mutex_unlock(&vf->send_cmd_lock); + return err; +} + +static void __qlcnic_sriov_process_bc_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ +#ifdef CONFIG_QLCNIC_SRIOV + if (qlcnic_sriov_pf_check(adapter)) { + qlcnic_sriov_pf_process_bc_cmd(adapter, trans, cmd); + return; + } +#endif + cmd->rsp.arg[0] |= (0x9 << 25); + return; +} + +static void qlcnic_sriov_process_bc_cmd(struct work_struct *work) +{ + struct qlcnic_vf_info *vf = container_of(work, struct qlcnic_vf_info, + trans_work); + struct qlcnic_bc_trans *trans = NULL; + struct qlcnic_adapter *adapter = vf->adapter; + struct qlcnic_cmd_args cmd; + u8 req; + + if (adapter->need_fw_reset) + return; + + if (test_bit(QLC_BC_VF_FLR, &vf->state)) + return; + + memset(&cmd, 0, sizeof(struct qlcnic_cmd_args)); + trans = list_first_entry(&vf->rcv_act.wait_list, + struct qlcnic_bc_trans, list); + adapter = vf->adapter; + + if (qlcnic_sriov_prepare_bc_hdr(trans, &cmd, trans->req_hdr->seq_id, + QLC_BC_RESPONSE)) + goto cleanup_trans; + + __qlcnic_sriov_process_bc_cmd(adapter, trans, &cmd); + trans->trans_state = QLC_INIT; + __qlcnic_sriov_send_bc_msg(trans, vf, QLC_BC_RESPONSE); + +cleanup_trans: + qlcnic_free_mbx_args(&cmd); + req = qlcnic_sriov_clear_trans(vf, trans, QLC_BC_RESPONSE); + qlcnic_sriov_cleanup_transaction(trans); + if (req) + qlcnic_sriov_schedule_bc_cmd(adapter->ahw->sriov, vf, + qlcnic_sriov_process_bc_cmd); +} + +static void qlcnic_sriov_handle_bc_resp(struct qlcnic_bc_hdr *hdr, + struct qlcnic_vf_info *vf) +{ + struct qlcnic_bc_trans *trans; + u32 pay_size; + + if (test_and_set_bit(QLC_BC_VF_SEND, &vf->state)) + return; + + trans = vf->send_cmd; + + if (trans == NULL) + goto clear_send; + + if (trans->trans_id != hdr->seq_id) + goto clear_send; + + pay_size = qlcnic_sriov_get_bc_paysize(trans->rsp_pay_size, + trans->curr_rsp_frag); + qlcnic_sriov_pull_bc_msg(vf->adapter, + (u32 *)(trans->rsp_hdr + trans->curr_rsp_frag), + (u32 *)(trans->rsp_pay + trans->curr_rsp_frag), + pay_size); + if (++trans->curr_rsp_frag < trans->rsp_hdr->num_frags) + goto clear_send; + + complete(&trans->resp_cmpl); + +clear_send: + clear_bit(QLC_BC_VF_SEND, &vf->state); +} + +int __qlcnic_sriov_add_act_list(struct qlcnic_sriov *sriov, + struct 
qlcnic_vf_info *vf, + struct qlcnic_bc_trans *trans) +{ + struct qlcnic_trans_list *t_list = &vf->rcv_act; + + t_list->count++; + list_add_tail(&trans->list, &t_list->wait_list); + if (t_list->count == 1) + qlcnic_sriov_schedule_bc_cmd(sriov, vf, + qlcnic_sriov_process_bc_cmd); + return 0; +} + +static int qlcnic_sriov_add_act_list(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf, + struct qlcnic_bc_trans *trans) +{ + struct qlcnic_trans_list *t_list = &vf->rcv_act; + + spin_lock(&t_list->lock); + + __qlcnic_sriov_add_act_list(sriov, vf, trans); + + spin_unlock(&t_list->lock); + return 0; +} + +static void qlcnic_sriov_handle_pending_trans(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf, + struct qlcnic_bc_hdr *hdr) +{ + struct qlcnic_bc_trans *trans = NULL; + struct list_head *node; + u32 pay_size, curr_frag; + u8 found = 0, active = 0; + + spin_lock(&vf->rcv_pend.lock); + if (vf->rcv_pend.count > 0) { + list_for_each(node, &vf->rcv_pend.wait_list) { + trans = list_entry(node, struct qlcnic_bc_trans, list); + if (trans->trans_id == hdr->seq_id) { + found = 1; + break; + } + } + } + + if (found) { + curr_frag = trans->curr_req_frag; + pay_size = qlcnic_sriov_get_bc_paysize(trans->req_pay_size, + curr_frag); + qlcnic_sriov_pull_bc_msg(vf->adapter, + (u32 *)(trans->req_hdr + curr_frag), + (u32 *)(trans->req_pay + curr_frag), + pay_size); + trans->curr_req_frag++; + if (trans->curr_req_frag >= hdr->num_frags) { + vf->rcv_pend.count--; + list_del(&trans->list); + active = 1; + } + } + spin_unlock(&vf->rcv_pend.lock); + + if (active) + if (qlcnic_sriov_add_act_list(sriov, vf, trans)) + qlcnic_sriov_cleanup_transaction(trans); + + return; +} + +static void qlcnic_sriov_handle_bc_cmd(struct qlcnic_sriov *sriov, + struct qlcnic_bc_hdr *hdr, + struct qlcnic_vf_info *vf) +{ + struct qlcnic_bc_trans *trans; + struct qlcnic_adapter *adapter = vf->adapter; + struct qlcnic_cmd_args cmd; + u32 pay_size; + int err; + u8 cmd_op; + + if (adapter->need_fw_reset) + return; + + if (!test_bit(QLC_BC_VF_STATE, &vf->state) && + hdr->op_type != QLC_BC_CMD && + hdr->cmd_op != QLCNIC_BC_CMD_CHANNEL_INIT) + return; + + if (hdr->frag_num > 1) { + qlcnic_sriov_handle_pending_trans(sriov, vf, hdr); + return; + } + + memset(&cmd, 0, sizeof(struct qlcnic_cmd_args)); + cmd_op = hdr->cmd_op; + if (qlcnic_sriov_alloc_bc_trans(&trans)) + return; + + if (hdr->op_type == QLC_BC_CMD) + err = qlcnic_sriov_alloc_bc_mbx_args(&cmd, cmd_op); + else + err = qlcnic_alloc_mbx_args(&cmd, adapter, cmd_op); + + if (err) { + qlcnic_sriov_cleanup_transaction(trans); + return; + } + + cmd.op_type = hdr->op_type; + if (qlcnic_sriov_prepare_bc_hdr(trans, &cmd, hdr->seq_id, + QLC_BC_COMMAND)) { + qlcnic_free_mbx_args(&cmd); + qlcnic_sriov_cleanup_transaction(trans); + return; + } + + pay_size = qlcnic_sriov_get_bc_paysize(trans->req_pay_size, + trans->curr_req_frag); + qlcnic_sriov_pull_bc_msg(vf->adapter, + (u32 *)(trans->req_hdr + trans->curr_req_frag), + (u32 *)(trans->req_pay + trans->curr_req_frag), + pay_size); + trans->func_id = vf->pci_func; + trans->vf = vf; + trans->trans_id = hdr->seq_id; + trans->curr_req_frag++; + + if (qlcnic_sriov_soft_flr_check(adapter, trans, vf)) + return; + + if (trans->curr_req_frag == trans->req_hdr->num_frags) { + if (qlcnic_sriov_add_act_list(sriov, vf, trans)) { + qlcnic_free_mbx_args(&cmd); + qlcnic_sriov_cleanup_transaction(trans); + } + } else { + spin_lock(&vf->rcv_pend.lock); + list_add_tail(&trans->list, &vf->rcv_pend.wait_list); + vf->rcv_pend.count++; + 
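+ /* More fragments are expected; qlcnic_sriov_handle_pending_trans() will
+  * pull them in and move this transaction to the active list once the
+  * last fragment arrives.
+  */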
spin_unlock(&vf->rcv_pend.lock); + } +} + +static void qlcnic_sriov_handle_msg_event(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf) +{ + struct qlcnic_bc_hdr hdr; + u32 *ptr = (u32 *)&hdr; + u8 msg_type, i; + + for (i = 2; i < 6; i++) + ptr[i - 2] = readl(QLCNIC_MBX_FW(vf->adapter->ahw, i)); + msg_type = hdr.msg_type; + + switch (msg_type) { + case QLC_BC_COMMAND: + qlcnic_sriov_handle_bc_cmd(sriov, &hdr, vf); + break; + case QLC_BC_RESPONSE: + qlcnic_sriov_handle_bc_resp(&hdr, vf); + break; + } +} + +static void qlcnic_sriov_handle_flr_event(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf) +{ + struct qlcnic_adapter *adapter = vf->adapter; + + if (qlcnic_sriov_pf_check(adapter)) + qlcnic_sriov_pf_handle_flr(sriov, vf); + else + dev_err(&adapter->pdev->dev, + "Invalid event to VF. VF should not get FLR event\n"); +} + +void qlcnic_sriov_handle_bc_event(struct qlcnic_adapter *adapter, u32 event) +{ + struct qlcnic_vf_info *vf; + struct qlcnic_sriov *sriov; + int index; + u8 pci_func; + + sriov = adapter->ahw->sriov; + pci_func = qlcnic_sriov_target_func_id(event); + index = qlcnic_sriov_func_to_index(adapter, pci_func); + + if (index < 0) + return; + + vf = &sriov->vf_info[index]; + vf->pci_func = pci_func; + + if (qlcnic_sriov_channel_free_check(event)) + complete(&vf->ch_free_cmpl); + + if (qlcnic_sriov_flr_check(event)) { + qlcnic_sriov_handle_flr_event(sriov, vf); + return; + } + + if (qlcnic_sriov_bc_msg_check(event)) + qlcnic_sriov_handle_msg_event(sriov, vf); +} + +int qlcnic_sriov_cfg_bc_intr(struct qlcnic_adapter *adapter, u8 enable) +{ + struct qlcnic_cmd_args cmd; + int err; + + if (!test_bit(__QLCNIC_SRIOV_ENABLE, &adapter->state)) + return 0; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_BC_EVENT_SETUP)) + return -ENOMEM; + + if (enable) + cmd.req.arg[1] = (1 << 4) | (1 << 5) | (1 << 6) | (1 << 7); + + err = qlcnic_83xx_issue_cmd(adapter, &cmd); + + if (err != QLCNIC_RCODE_SUCCESS) { + dev_err(&adapter->pdev->dev, + "Failed to %s bc events, err=%d\n", + (enable ? 
"enable" : "disable"), err); + } + + qlcnic_free_mbx_args(&cmd); + return err; +} + +static int qlcnic_sriov_retry_bc_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_bc_trans *trans) +{ + u8 max = QLC_BC_CMD_MAX_RETRY_CNT; + u32 state; + + state = QLCRDX(adapter->ahw, QLC_83XX_IDC_DEV_STATE); + if (state == QLC_83XX_IDC_DEV_READY) { + msleep(20); + clear_bit(QLC_BC_VF_CHANNEL, &trans->vf->state); + trans->trans_state = QLC_INIT; + if (++adapter->fw_fail_cnt > max) + return -EIO; + else + return 0; + } + + return -EIO; +} + +static int __qlcnic_sriov_issue_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_mailbox *mbx = ahw->mailbox; + struct device *dev = &adapter->pdev->dev; + struct qlcnic_bc_trans *trans; + int err; + u32 rsp_data, opcode, mbx_err_code, rsp; + u16 seq = ++adapter->ahw->sriov->bc.trans_counter; + u8 func = ahw->pci_func; + + rsp = qlcnic_sriov_alloc_bc_trans(&trans); + if (rsp) + goto free_cmd; + + rsp = qlcnic_sriov_prepare_bc_hdr(trans, cmd, seq, QLC_BC_COMMAND); + if (rsp) + goto cleanup_transaction; + +retry: + if (!test_bit(QLC_83XX_MBX_READY, &mbx->status)) { + rsp = -EIO; + QLCDB(adapter, DRV, "MBX not Ready!(cmd 0x%x) for VF 0x%x\n", + QLCNIC_MBX_RSP(cmd->req.arg[0]), func); + goto err_out; + } + + err = qlcnic_sriov_send_bc_cmd(adapter, trans, func); + if (err) { + dev_err(dev, "MBX command 0x%x timed out for VF %d\n", + (cmd->req.arg[0] & 0xffff), func); + rsp = QLCNIC_RCODE_TIMEOUT; + + /* After adapter reset PF driver may take some time to + * respond to VF's request. Retry request till maximum retries. + */ + if ((trans->req_hdr->cmd_op == QLCNIC_BC_CMD_CHANNEL_INIT) && + !qlcnic_sriov_retry_bc_cmd(adapter, trans)) + goto retry; + + goto err_out; + } + + rsp_data = cmd->rsp.arg[0]; + mbx_err_code = QLCNIC_MBX_STATUS(rsp_data); + opcode = QLCNIC_MBX_RSP(cmd->req.arg[0]); + + if ((mbx_err_code == QLCNIC_MBX_RSP_OK) || + (mbx_err_code == QLCNIC_MBX_PORT_RSP_OK)) { + rsp = QLCNIC_RCODE_SUCCESS; + } else { + if (cmd->type == QLC_83XX_MBX_CMD_NO_WAIT) { + rsp = QLCNIC_RCODE_SUCCESS; + } else { + rsp = mbx_err_code; + if (!rsp) + rsp = 1; + + dev_err(dev, + "MBX command 0x%x failed with err:0x%x for VF %d\n", + opcode, mbx_err_code, func); + } + } + +err_out: + if (rsp == QLCNIC_RCODE_TIMEOUT) { + ahw->reset_context = 1; + adapter->need_fw_reset = 1; + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + } + +cleanup_transaction: + qlcnic_sriov_cleanup_transaction(trans); + +free_cmd: + if (cmd->type == QLC_83XX_MBX_CMD_NO_WAIT) { + qlcnic_free_mbx_args(cmd); + kfree(cmd); + } + + return rsp; +} + + +static int qlcnic_sriov_issue_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + if (cmd->type == QLC_83XX_MBX_CMD_NO_WAIT) + return qlcnic_sriov_async_issue_cmd(adapter, cmd); + else + return __qlcnic_sriov_issue_cmd(adapter, cmd); +} + +static int qlcnic_sriov_channel_cfg_cmd(struct qlcnic_adapter *adapter, u8 cmd_op) +{ + struct qlcnic_cmd_args cmd; + struct qlcnic_vf_info *vf = &adapter->ahw->sriov->vf_info[0]; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + if (qlcnic_sriov_alloc_bc_mbx_args(&cmd, cmd_op)) + return -ENOMEM; + + ret = qlcnic_issue_cmd(adapter, &cmd); + if (ret) { + dev_err(&adapter->pdev->dev, + "Failed bc channel %s %d\n", cmd_op ? 
"term" : "init", + ret); + goto out; + } + + cmd_op = (cmd.rsp.arg[0] & 0xff); + if (cmd.rsp.arg[0] >> 25 == 2) + return 2; + if (cmd_op == QLCNIC_BC_CMD_CHANNEL_INIT) + set_bit(QLC_BC_VF_STATE, &vf->state); + else + clear_bit(QLC_BC_VF_STATE, &vf->state); + +out: + qlcnic_free_mbx_args(&cmd); + return ret; +} + +static void qlcnic_vf_add_mc_list(struct net_device *netdev, const u8 *mac, + enum qlcnic_mac_type mac_type) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vf_info *vf; + u16 vlan_id; + int i; + + vf = &adapter->ahw->sriov->vf_info[0]; + + if (!qlcnic_sriov_check_any_vlan(vf)) { + qlcnic_nic_add_mac(adapter, mac, 0, mac_type); + } else { + spin_lock(&vf->vlan_list_lock); + for (i = 0; i < sriov->num_allowed_vlans; i++) { + vlan_id = vf->sriov_vlans[i]; + if (vlan_id) + qlcnic_nic_add_mac(adapter, mac, vlan_id, + mac_type); + } + spin_unlock(&vf->vlan_list_lock); + if (qlcnic_84xx_check(adapter)) + qlcnic_nic_add_mac(adapter, mac, 0, mac_type); + } +} + +void qlcnic_sriov_cleanup_async_list(struct qlcnic_back_channel *bc) +{ + struct list_head *head = &bc->async_cmd_list; + struct qlcnic_async_cmd *entry; + + flush_workqueue(bc->bc_async_wq); + cancel_work_sync(&bc->vf_async_work); + + spin_lock(&bc->queue_lock); + while (!list_empty(head)) { + entry = list_entry(head->next, struct qlcnic_async_cmd, + list); + list_del(&entry->list); + kfree(entry->cmd); + kfree(entry); + } + spin_unlock(&bc->queue_lock); +} + +void qlcnic_sriov_vf_set_multi(struct net_device *netdev) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_hardware_context *ahw = adapter->ahw; + static const u8 bcast_addr[ETH_ALEN] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }; + struct netdev_hw_addr *ha; + u32 mode = VPORT_MISS_MODE_DROP; + + if (!test_bit(__QLCNIC_FW_ATTACHED, &adapter->state)) + return; + + if (netdev->flags & IFF_PROMISC) { + if (!(adapter->flags & QLCNIC_PROMISC_DISABLED)) + mode = VPORT_MISS_MODE_ACCEPT_ALL; + } else if ((netdev->flags & IFF_ALLMULTI) || + (netdev_mc_count(netdev) > ahw->max_mc_count)) { + mode = VPORT_MISS_MODE_ACCEPT_MULTI; + } else { + qlcnic_vf_add_mc_list(netdev, bcast_addr, QLCNIC_BROADCAST_MAC); + if (!netdev_mc_empty(netdev)) { + qlcnic_flush_mcast_mac(adapter); + netdev_for_each_mc_addr(ha, netdev) + qlcnic_vf_add_mc_list(netdev, ha->addr, + QLCNIC_MULTICAST_MAC); + } + } + + /* configure unicast MAC address, if there is not sufficient space + * to store all the unicast addresses then enable promiscuous mode + */ + if (netdev_uc_count(netdev) > ahw->max_uc_count) { + mode = VPORT_MISS_MODE_ACCEPT_ALL; + } else if (!netdev_uc_empty(netdev)) { + netdev_for_each_uc_addr(ha, netdev) + qlcnic_vf_add_mc_list(netdev, ha->addr, + QLCNIC_UNICAST_MAC); + } + + if (adapter->pdev->is_virtfn) { + if (mode == VPORT_MISS_MODE_ACCEPT_ALL && + !adapter->fdb_mac_learn) { + qlcnic_alloc_lb_filters_mem(adapter); + adapter->drv_mac_learn = 1; + adapter->rx_mac_learn = true; + } else { + adapter->drv_mac_learn = 0; + adapter->rx_mac_learn = false; + } + } + + qlcnic_nic_set_promisc(adapter, mode); +} + +static void qlcnic_sriov_handle_async_issue_cmd(struct work_struct *work) +{ + struct qlcnic_async_cmd *entry, *tmp; + struct qlcnic_back_channel *bc; + struct qlcnic_cmd_args *cmd; + struct list_head *head; + LIST_HEAD(del_list); + + bc = container_of(work, struct qlcnic_back_channel, vf_async_work); + head = &bc->async_cmd_list; + + spin_lock(&bc->queue_lock); + list_splice_init(head, 
&del_list); + spin_unlock(&bc->queue_lock); + + list_for_each_entry_safe(entry, tmp, &del_list, list) { + list_del(&entry->list); + cmd = entry->cmd; + __qlcnic_sriov_issue_cmd(bc->adapter, cmd); + kfree(entry); + } + + if (!list_empty(head)) + queue_work(bc->bc_async_wq, &bc->vf_async_work); + + return; +} + +static struct qlcnic_async_cmd * +qlcnic_sriov_alloc_async_cmd(struct qlcnic_back_channel *bc, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_async_cmd *entry = NULL; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + return NULL; + + entry->cmd = cmd; + + spin_lock(&bc->queue_lock); + list_add_tail(&entry->list, &bc->async_cmd_list); + spin_unlock(&bc->queue_lock); + + return entry; +} + +static void qlcnic_sriov_schedule_async_cmd(struct qlcnic_back_channel *bc, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_async_cmd *entry = NULL; + + entry = qlcnic_sriov_alloc_async_cmd(bc, cmd); + if (!entry) { + qlcnic_free_mbx_args(cmd); + kfree(cmd); + return; + } + + queue_work(bc->bc_async_wq, &bc->vf_async_work); +} + +static int qlcnic_sriov_async_issue_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_cmd_args *cmd) +{ + + struct qlcnic_back_channel *bc = &adapter->ahw->sriov->bc; + + if (adapter->need_fw_reset) + return -EIO; + + qlcnic_sriov_schedule_async_cmd(bc, cmd); + + return 0; +} + +static int qlcnic_sriov_vf_reinit_driver(struct qlcnic_adapter *adapter) +{ + int err; + + adapter->need_fw_reset = 0; + qlcnic_83xx_reinit_mbx_work(adapter->ahw->mailbox); + qlcnic_83xx_enable_mbx_interrupt(adapter); + + err = qlcnic_sriov_cfg_bc_intr(adapter, 1); + if (err) + return err; + + err = qlcnic_sriov_channel_cfg_cmd(adapter, QLCNIC_BC_CMD_CHANNEL_INIT); + if (err) + goto err_out_cleanup_bc_intr; + + err = qlcnic_sriov_vf_init_driver(adapter); + if (err) + goto err_out_term_channel; + + return 0; + +err_out_term_channel: + qlcnic_sriov_channel_cfg_cmd(adapter, QLCNIC_BC_CMD_CHANNEL_TERM); + +err_out_cleanup_bc_intr: + qlcnic_sriov_cfg_bc_intr(adapter, 0); + return err; +} + +static void qlcnic_sriov_vf_attach(struct qlcnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + + if (netif_running(netdev)) { + if (!qlcnic_up(adapter, netdev)) + qlcnic_restore_indev_addr(netdev, NETDEV_UP); + } + + netif_device_attach(netdev); +} + +static void qlcnic_sriov_vf_detach(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_intrpt_config *intr_tbl = ahw->intr_tbl; + struct net_device *netdev = adapter->netdev; + u8 i, max_ints = ahw->num_msix - 1; + + netif_device_detach(netdev); + qlcnic_83xx_detach_mailbox_work(adapter); + qlcnic_83xx_disable_mbx_intr(adapter); + + if (netif_running(netdev)) + qlcnic_down(adapter, netdev); + + for (i = 0; i < max_ints; i++) { + intr_tbl[i].id = i; + intr_tbl[i].enabled = 0; + intr_tbl[i].src = 0; + } + ahw->reset_context = 0; +} + +static int qlcnic_sriov_vf_handle_dev_ready(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct device *dev = &adapter->pdev->dev; + struct qlc_83xx_idc *idc = &ahw->idc; + u8 func = ahw->pci_func; + u32 state; + + if ((idc->prev_state == QLC_83XX_IDC_DEV_NEED_RESET) || + (idc->prev_state == QLC_83XX_IDC_DEV_INIT)) { + if (!qlcnic_sriov_vf_reinit_driver(adapter)) { + qlcnic_sriov_vf_attach(adapter); + adapter->fw_fail_cnt = 0; + dev_info(dev, + "%s: Reinitialization of VF 0x%x done after FW reset\n", + __func__, func); + } else { + dev_err(dev, + "%s: Reinitialization of VF 0x%x failed after FW 
reset\n", + __func__, func); + state = QLCRDX(ahw, QLC_83XX_IDC_DEV_STATE); + dev_info(dev, "Current state 0x%x after FW reset\n", + state); + } + } + + return 0; +} + +static int qlcnic_sriov_vf_handle_context_reset(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_mailbox *mbx = ahw->mailbox; + struct device *dev = &adapter->pdev->dev; + struct qlc_83xx_idc *idc = &ahw->idc; + u8 func = ahw->pci_func; + u32 state; + + adapter->reset_ctx_cnt++; + + /* Skip the context reset and check if FW is hung */ + if (adapter->reset_ctx_cnt < 3) { + adapter->need_fw_reset = 1; + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + dev_info(dev, + "Resetting context, wait here to check if FW is in failed state\n"); + return 0; + } + + /* Check if number of resets exceed the threshold. + * If it exceeds the threshold just fail the VF. + */ + if (adapter->reset_ctx_cnt > QLC_83XX_VF_RESET_FAIL_THRESH) { + clear_bit(QLC_83XX_MODULE_LOADED, &idc->status); + adapter->tx_timeo_cnt = 0; + adapter->fw_fail_cnt = 0; + adapter->reset_ctx_cnt = 0; + qlcnic_sriov_vf_detach(adapter); + dev_err(dev, + "Device context resets have exceeded the threshold, device interface will be shutdown\n"); + return -EIO; + } + + dev_info(dev, "Resetting context of VF 0x%x\n", func); + dev_info(dev, "%s: Context reset count %d for VF 0x%x\n", + __func__, adapter->reset_ctx_cnt, func); + set_bit(__QLCNIC_RESETTING, &adapter->state); + adapter->need_fw_reset = 1; + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + qlcnic_sriov_vf_detach(adapter); + adapter->need_fw_reset = 0; + + if (!qlcnic_sriov_vf_reinit_driver(adapter)) { + qlcnic_sriov_vf_attach(adapter); + adapter->tx_timeo_cnt = 0; + adapter->reset_ctx_cnt = 0; + adapter->fw_fail_cnt = 0; + dev_info(dev, "Done resetting context for VF 0x%x\n", func); + } else { + dev_err(dev, "%s: Reinitialization of VF 0x%x failed\n", + __func__, func); + state = QLCRDX(ahw, QLC_83XX_IDC_DEV_STATE); + dev_info(dev, "%s: Current state 0x%x\n", __func__, state); + } + + return 0; +} + +static int qlcnic_sriov_vf_idc_ready_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int ret = 0; + + if (ahw->idc.prev_state != QLC_83XX_IDC_DEV_READY) + ret = qlcnic_sriov_vf_handle_dev_ready(adapter); + else if (ahw->reset_context) + ret = qlcnic_sriov_vf_handle_context_reset(adapter); + + clear_bit(__QLCNIC_RESETTING, &adapter->state); + return ret; +} + +static int qlcnic_sriov_vf_idc_failed_state(struct qlcnic_adapter *adapter) +{ + struct qlc_83xx_idc *idc = &adapter->ahw->idc; + + dev_err(&adapter->pdev->dev, "Device is in failed state\n"); + if (idc->prev_state == QLC_83XX_IDC_DEV_READY) + qlcnic_sriov_vf_detach(adapter); + + clear_bit(QLC_83XX_MODULE_LOADED, &idc->status); + clear_bit(__QLCNIC_RESETTING, &adapter->state); + return -EIO; +} + +static int +qlcnic_sriov_vf_idc_need_quiescent_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_mailbox *mbx = adapter->ahw->mailbox; + struct qlc_83xx_idc *idc = &adapter->ahw->idc; + + dev_info(&adapter->pdev->dev, "Device is in quiescent state\n"); + if (idc->prev_state == QLC_83XX_IDC_DEV_READY) { + set_bit(__QLCNIC_RESETTING, &adapter->state); + adapter->tx_timeo_cnt = 0; + adapter->reset_ctx_cnt = 0; + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + qlcnic_sriov_vf_detach(adapter); + } + + return 0; +} + +static int qlcnic_sriov_vf_idc_init_reset_state(struct qlcnic_adapter *adapter) +{ + struct qlcnic_mailbox *mbx = adapter->ahw->mailbox; + struct 
qlc_83xx_idc *idc = &adapter->ahw->idc; + u8 func = adapter->ahw->pci_func; + + if (idc->prev_state == QLC_83XX_IDC_DEV_READY) { + dev_err(&adapter->pdev->dev, + "Firmware hang detected by VF 0x%x\n", func); + set_bit(__QLCNIC_RESETTING, &adapter->state); + adapter->tx_timeo_cnt = 0; + adapter->reset_ctx_cnt = 0; + clear_bit(QLC_83XX_MBX_READY, &mbx->status); + qlcnic_sriov_vf_detach(adapter); + } + return 0; +} + +static int qlcnic_sriov_vf_idc_unknown_state(struct qlcnic_adapter *adapter) +{ + dev_err(&adapter->pdev->dev, "%s: Device in unknown state\n", __func__); + return 0; +} + +static void qlcnic_sriov_vf_periodic_tasks(struct qlcnic_adapter *adapter) +{ + if (adapter->fhash.fnum) + qlcnic_prune_lb_filters(adapter); +} + +static void qlcnic_sriov_vf_poll_dev_state(struct work_struct *work) +{ + struct qlcnic_adapter *adapter; + struct qlc_83xx_idc *idc; + int ret = 0; + + adapter = container_of(work, struct qlcnic_adapter, fw_work.work); + idc = &adapter->ahw->idc; + idc->curr_state = QLCRDX(adapter->ahw, QLC_83XX_IDC_DEV_STATE); + + switch (idc->curr_state) { + case QLC_83XX_IDC_DEV_READY: + ret = qlcnic_sriov_vf_idc_ready_state(adapter); + break; + case QLC_83XX_IDC_DEV_NEED_RESET: + case QLC_83XX_IDC_DEV_INIT: + ret = qlcnic_sriov_vf_idc_init_reset_state(adapter); + break; + case QLC_83XX_IDC_DEV_NEED_QUISCENT: + ret = qlcnic_sriov_vf_idc_need_quiescent_state(adapter); + break; + case QLC_83XX_IDC_DEV_FAILED: + ret = qlcnic_sriov_vf_idc_failed_state(adapter); + break; + case QLC_83XX_IDC_DEV_QUISCENT: + break; + default: + ret = qlcnic_sriov_vf_idc_unknown_state(adapter); + } + + idc->prev_state = idc->curr_state; + qlcnic_sriov_vf_periodic_tasks(adapter); + + if (!ret && test_bit(QLC_83XX_MODULE_LOADED, &idc->status)) + qlcnic_schedule_work(adapter, qlcnic_sriov_vf_poll_dev_state, + idc->delay); +} + +static void qlcnic_sriov_vf_cancel_fw_work(struct qlcnic_adapter *adapter) +{ + while (test_and_set_bit(__QLCNIC_RESETTING, &adapter->state)) + msleep(20); + + clear_bit(QLC_83XX_MODULE_LOADED, &adapter->ahw->idc.status); + clear_bit(__QLCNIC_RESETTING, &adapter->state); + cancel_delayed_work_sync(&adapter->fw_work); +} + +static int qlcnic_sriov_check_vlan_id(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf, u16 vlan_id) +{ + int i, err = -EINVAL; + + if (!vf->sriov_vlans) + return err; + + spin_lock_bh(&vf->vlan_list_lock); + + for (i = 0; i < sriov->num_allowed_vlans; i++) { + if (vf->sriov_vlans[i] == vlan_id) { + err = 0; + break; + } + } + + spin_unlock_bh(&vf->vlan_list_lock); + return err; +} + +static int qlcnic_sriov_validate_num_vlans(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf) +{ + int err = 0; + + spin_lock_bh(&vf->vlan_list_lock); + + if (vf->num_vlan >= sriov->num_allowed_vlans) + err = -EINVAL; + + spin_unlock_bh(&vf->vlan_list_lock); + return err; +} + +static int qlcnic_sriov_validate_vlan_cfg(struct qlcnic_adapter *adapter, + u16 vid, u8 enable) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vf_info *vf; + bool vlan_exist; + u8 allowed = 0; + int i; + + vf = &adapter->ahw->sriov->vf_info[0]; + vlan_exist = qlcnic_sriov_check_any_vlan(vf); + if (sriov->vlan_mode != QLC_GUEST_VLAN_MODE) + return -EINVAL; + + if (enable) { + if (qlcnic_83xx_vf_check(adapter) && vlan_exist) + return -EINVAL; + + if (qlcnic_sriov_validate_num_vlans(sriov, vf)) + return -EINVAL; + + if (sriov->any_vlan) { + for (i = 0; i < sriov->num_allowed_vlans; i++) { + if (sriov->allowed_vlans[i] == vid) + allowed = 1; + } + + if (!allowed) + return 
-EINVAL; + } + } else { + if (!vlan_exist || qlcnic_sriov_check_vlan_id(sriov, vf, vid)) + return -EINVAL; + } + + return 0; +} + +static void qlcnic_sriov_vlan_operation(struct qlcnic_vf_info *vf, u16 vlan_id, + enum qlcnic_vlan_operations opcode) +{ + struct qlcnic_adapter *adapter = vf->adapter; + struct qlcnic_sriov *sriov; + + sriov = adapter->ahw->sriov; + + if (!vf->sriov_vlans) + return; + + spin_lock_bh(&vf->vlan_list_lock); + + switch (opcode) { + case QLC_VLAN_ADD: + qlcnic_sriov_add_vlan_id(sriov, vf, vlan_id); + break; + case QLC_VLAN_DELETE: + qlcnic_sriov_del_vlan_id(sriov, vf, vlan_id); + break; + default: + netdev_err(adapter->netdev, "Invalid VLAN operation\n"); + } + + spin_unlock_bh(&vf->vlan_list_lock); + return; +} + +int qlcnic_sriov_cfg_vf_guest_vlan(struct qlcnic_adapter *adapter, + u16 vid, u8 enable) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct net_device *netdev = adapter->netdev; + struct qlcnic_vf_info *vf; + struct qlcnic_cmd_args cmd; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + if (vid == 0) + return 0; + + vf = &adapter->ahw->sriov->vf_info[0]; + ret = qlcnic_sriov_validate_vlan_cfg(adapter, vid, enable); + if (ret) + return ret; + + ret = qlcnic_sriov_alloc_bc_mbx_args(&cmd, + QLCNIC_BC_CMD_CFG_GUEST_VLAN); + if (ret) + return ret; + + cmd.req.arg[1] = (enable & 1) | vid << 16; + + qlcnic_sriov_cleanup_async_list(&sriov->bc); + ret = qlcnic_issue_cmd(adapter, &cmd); + if (ret) { + dev_err(&adapter->pdev->dev, + "Failed to configure guest VLAN, err=%d\n", ret); + } else { + netif_addr_lock_bh(netdev); + qlcnic_free_mac_list(adapter); + netif_addr_unlock_bh(netdev); + + if (enable) + qlcnic_sriov_vlan_operation(vf, vid, QLC_VLAN_ADD); + else + qlcnic_sriov_vlan_operation(vf, vid, QLC_VLAN_DELETE); + + netif_addr_lock_bh(netdev); + qlcnic_set_multi(netdev); + netif_addr_unlock_bh(netdev); + } + + qlcnic_free_mbx_args(&cmd); + return ret; +} + +static void qlcnic_sriov_vf_free_mac_list(struct qlcnic_adapter *adapter) +{ + struct list_head *head = &adapter->mac_list; + struct qlcnic_mac_vlan_list *cur; + + while (!list_empty(head)) { + cur = list_entry(head->next, struct qlcnic_mac_vlan_list, list); + qlcnic_sre_macaddr_change(adapter, cur->mac_addr, cur->vlan_id, + QLCNIC_MAC_DEL); + list_del(&cur->list); + kfree(cur); + } +} + + +static int qlcnic_sriov_vf_shutdown(struct pci_dev *pdev) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(pdev); + struct net_device *netdev = adapter->netdev; + int retval; + + netif_device_detach(netdev); + qlcnic_cancel_idc_work(adapter); + + if (netif_running(netdev)) + qlcnic_down(adapter, netdev); + + qlcnic_sriov_channel_cfg_cmd(adapter, QLCNIC_BC_CMD_CHANNEL_TERM); + qlcnic_sriov_cfg_bc_intr(adapter, 0); + qlcnic_83xx_disable_mbx_intr(adapter); + cancel_delayed_work_sync(&adapter->idc_aen_work); + + retval = pci_save_state(pdev); + if (retval) + return retval; + + return 0; +} + +static int qlcnic_sriov_vf_resume(struct qlcnic_adapter *adapter) +{ + struct qlc_83xx_idc *idc = &adapter->ahw->idc; + struct net_device *netdev = adapter->netdev; + int err; + + set_bit(QLC_83XX_MODULE_LOADED, &idc->status); + qlcnic_83xx_enable_mbx_interrupt(adapter); + err = qlcnic_sriov_cfg_bc_intr(adapter, 1); + if (err) + return err; + + err = qlcnic_sriov_channel_cfg_cmd(adapter, QLCNIC_BC_CMD_CHANNEL_INIT); + if (!err) { + if (netif_running(netdev)) { + err = qlcnic_up(adapter, netdev); + if (!err) + qlcnic_restore_indev_addr(netdev, NETDEV_UP); + } + } + + netif_device_attach(netdev); + 
qlcnic_schedule_work(adapter, qlcnic_sriov_vf_poll_dev_state, + idc->delay); + return err; +} + +void qlcnic_sriov_alloc_vlans(struct qlcnic_adapter *adapter) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vf_info *vf; + int i; + + for (i = 0; i < sriov->num_vfs; i++) { + vf = &sriov->vf_info[i]; + vf->sriov_vlans = kcalloc(sriov->num_allowed_vlans, + sizeof(*vf->sriov_vlans), GFP_KERNEL); + } +} + +void qlcnic_sriov_free_vlans(struct qlcnic_adapter *adapter) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vf_info *vf; + int i; + + for (i = 0; i < sriov->num_vfs; i++) { + vf = &sriov->vf_info[i]; + kfree(vf->sriov_vlans); + vf->sriov_vlans = NULL; + } +} + +void qlcnic_sriov_add_vlan_id(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf, u16 vlan_id) +{ + int i; + + for (i = 0; i < sriov->num_allowed_vlans; i++) { + if (!vf->sriov_vlans[i]) { + vf->sriov_vlans[i] = vlan_id; + vf->num_vlan++; + return; + } + } +} + +void qlcnic_sriov_del_vlan_id(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf, u16 vlan_id) +{ + int i; + + for (i = 0; i < sriov->num_allowed_vlans; i++) { + if (vf->sriov_vlans[i] == vlan_id) { + vf->sriov_vlans[i] = 0; + vf->num_vlan--; + return; + } + } +} + +bool qlcnic_sriov_check_any_vlan(struct qlcnic_vf_info *vf) +{ + bool err = false; + + spin_lock_bh(&vf->vlan_list_lock); + + if (vf->num_vlan) + err = true; + + spin_unlock_bh(&vf->vlan_list_lock); + return err; +} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c new file mode 100644 index 0000000..50eaafa --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c @@ -0,0 +1,2051 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. 
+ */ + +#include + +#include "qlcnic_sriov.h" +#include "qlcnic.h" + +#define QLCNIC_SRIOV_VF_MAX_MAC 7 +#define QLC_VF_MIN_TX_RATE 100 +#define QLC_VF_MAX_TX_RATE 9999 +#define QLC_MAC_OPCODE_MASK 0x7 +#define QLC_VF_FLOOD_BIT BIT_16 +#define QLC_FLOOD_MODE 0x5 +#define QLC_SRIOV_ALLOW_VLAN0 BIT_19 +#define QLC_INTR_COAL_TYPE_MASK 0x7 + +static int qlcnic_sriov_pf_get_vport_handle(struct qlcnic_adapter *, u8); + +struct qlcnic_sriov_cmd_handler { + int (*fn) (struct qlcnic_bc_trans *, struct qlcnic_cmd_args *); +}; + +struct qlcnic_sriov_fw_cmd_handler { + u32 cmd; + int (*fn) (struct qlcnic_bc_trans *, struct qlcnic_cmd_args *); +}; + +static int qlcnic_sriov_pf_set_vport_info(struct qlcnic_adapter *adapter, + struct qlcnic_info *npar_info, + u16 vport_id) +{ + struct qlcnic_cmd_args cmd; + int err; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_SET_NIC_INFO)) + return -ENOMEM; + + cmd.req.arg[1] = (vport_id << 16) | 0x1; + cmd.req.arg[2] = npar_info->bit_offsets; + cmd.req.arg[2] |= npar_info->min_tx_bw << 16; + cmd.req.arg[3] = npar_info->max_tx_bw | (npar_info->max_tx_ques << 16); + cmd.req.arg[4] = npar_info->max_tx_mac_filters; + cmd.req.arg[4] |= npar_info->max_rx_mcast_mac_filters << 16; + cmd.req.arg[5] = npar_info->max_rx_ucast_mac_filters | + (npar_info->max_rx_ip_addr << 16); + cmd.req.arg[6] = npar_info->max_rx_lro_flow | + (npar_info->max_rx_status_rings << 16); + cmd.req.arg[7] = npar_info->max_rx_buf_rings | + (npar_info->max_rx_ques << 16); + cmd.req.arg[8] = npar_info->max_tx_vlan_keys; + cmd.req.arg[8] |= npar_info->max_local_ipv6_addrs << 16; + cmd.req.arg[9] = npar_info->max_remote_ipv6_addrs; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_err(&adapter->pdev->dev, + "Failed to set vport info, err=%d\n", err); + + qlcnic_free_mbx_args(&cmd); + return err; +} + +static int qlcnic_sriov_pf_cal_res_limit(struct qlcnic_adapter *adapter, + struct qlcnic_info *info, u16 func) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_resources *res = &sriov->ff_max; + u16 num_macs = sriov->num_allowed_vlans + 1; + int ret = -EIO, vpid, id; + struct qlcnic_vport *vp; + u32 num_vfs, max, temp; + + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, func); + if (vpid < 0) + return -EINVAL; + + num_vfs = sriov->num_vfs; + max = num_vfs + 1; + info->bit_offsets = 0xffff; + info->max_tx_ques = res->num_tx_queues / max; + + if (qlcnic_83xx_pf_check(adapter)) + num_macs = QLCNIC_83XX_SRIOV_VF_MAX_MAC; + + info->max_rx_mcast_mac_filters = res->num_rx_mcast_mac_filters; + + if (adapter->ahw->pci_func == func) { + info->min_tx_bw = 0; + info->max_tx_bw = MAX_BW; + + temp = res->num_rx_ucast_mac_filters - num_macs * num_vfs; + info->max_rx_ucast_mac_filters = temp; + temp = res->num_tx_mac_filters - num_macs * num_vfs; + info->max_tx_mac_filters = temp; + temp = num_macs * num_vfs * QLCNIC_SRIOV_VF_MAX_MAC; + temp = res->num_rx_mcast_mac_filters - temp; + info->max_rx_mcast_mac_filters = temp; + + info->max_tx_ques = res->num_tx_queues - sriov->num_vfs; + } else { + id = qlcnic_sriov_func_to_index(adapter, func); + if (id < 0) + return id; + vp = sriov->vf_info[id].vp; + info->min_tx_bw = vp->min_tx_bw; + info->max_tx_bw = vp->max_tx_bw; + + info->max_rx_ucast_mac_filters = num_macs; + info->max_tx_mac_filters = num_macs; + temp = num_macs * QLCNIC_SRIOV_VF_MAX_MAC; + info->max_rx_mcast_mac_filters = temp; + + info->max_tx_ques = QLCNIC_SINGLE_RING; + } + + info->max_rx_ip_addr = res->num_destip / max; + info->max_rx_status_rings = res->num_rx_status_rings 
/ max; + info->max_rx_buf_rings = res->num_rx_buf_rings / max; + info->max_rx_ques = res->num_rx_queues / max; + info->max_rx_lro_flow = res->num_lro_flows_supported / max; + info->max_tx_vlan_keys = res->num_txvlan_keys; + info->max_local_ipv6_addrs = res->max_local_ipv6_addrs; + info->max_remote_ipv6_addrs = res->max_remote_ipv6_addrs; + + ret = qlcnic_sriov_pf_set_vport_info(adapter, info, vpid); + if (ret) + return ret; + + return 0; +} + +static void qlcnic_sriov_pf_set_ff_max_res(struct qlcnic_adapter *adapter, + struct qlcnic_info *info) +{ + struct qlcnic_resources *ff_max = &adapter->ahw->sriov->ff_max; + + ff_max->num_tx_mac_filters = info->max_tx_mac_filters; + ff_max->num_rx_ucast_mac_filters = info->max_rx_ucast_mac_filters; + ff_max->num_rx_mcast_mac_filters = info->max_rx_mcast_mac_filters; + ff_max->num_txvlan_keys = info->max_tx_vlan_keys; + ff_max->num_rx_queues = info->max_rx_ques; + ff_max->num_tx_queues = info->max_tx_ques; + ff_max->num_lro_flows_supported = info->max_rx_lro_flow; + ff_max->num_destip = info->max_rx_ip_addr; + ff_max->num_rx_buf_rings = info->max_rx_buf_rings; + ff_max->num_rx_status_rings = info->max_rx_status_rings; + ff_max->max_remote_ipv6_addrs = info->max_remote_ipv6_addrs; + ff_max->max_local_ipv6_addrs = info->max_local_ipv6_addrs; +} + +static void qlcnic_sriov_set_vf_max_vlan(struct qlcnic_adapter *adapter, + struct qlcnic_info *npar_info) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + int temp, total_fn; + + temp = npar_info->max_rx_mcast_mac_filters; + total_fn = sriov->num_vfs + 1; + + temp = temp / (QLCNIC_SRIOV_VF_MAX_MAC * total_fn); + sriov->num_allowed_vlans = temp - 1; + + if (qlcnic_83xx_pf_check(adapter)) + sriov->num_allowed_vlans = 1; + + netdev_info(adapter->netdev, "Max Guest VLANs supported per VF = %d\n", + sriov->num_allowed_vlans); +} + +static int qlcnic_sriov_get_pf_info(struct qlcnic_adapter *adapter, + struct qlcnic_info *npar_info) +{ + int err; + struct qlcnic_cmd_args cmd; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_GET_NIC_INFO)) + return -ENOMEM; + + cmd.req.arg[1] = 0x2; + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to get PF info, err=%d\n", err); + goto out; + } + + npar_info->total_pf = cmd.rsp.arg[2] & 0xff; + npar_info->total_rss_engines = (cmd.rsp.arg[2] >> 8) & 0xff; + npar_info->max_vports = MSW(cmd.rsp.arg[2]); + npar_info->max_tx_ques = LSW(cmd.rsp.arg[3]); + npar_info->max_tx_mac_filters = MSW(cmd.rsp.arg[3]); + npar_info->max_rx_mcast_mac_filters = LSW(cmd.rsp.arg[4]); + npar_info->max_rx_ucast_mac_filters = MSW(cmd.rsp.arg[4]); + npar_info->max_rx_ip_addr = LSW(cmd.rsp.arg[5]); + npar_info->max_rx_lro_flow = MSW(cmd.rsp.arg[5]); + npar_info->max_rx_status_rings = LSW(cmd.rsp.arg[6]); + npar_info->max_rx_buf_rings = MSW(cmd.rsp.arg[6]); + npar_info->max_rx_ques = LSW(cmd.rsp.arg[7]); + npar_info->max_tx_vlan_keys = MSW(cmd.rsp.arg[7]); + npar_info->max_local_ipv6_addrs = LSW(cmd.rsp.arg[8]); + npar_info->max_remote_ipv6_addrs = MSW(cmd.rsp.arg[8]); + + qlcnic_sriov_set_vf_max_vlan(adapter, npar_info); + qlcnic_sriov_pf_set_ff_max_res(adapter, npar_info); + dev_info(&adapter->pdev->dev, + "\n\ttotal_pf: %d,\n" + "\n\ttotal_rss_engines: %d max_vports: %d max_tx_ques %d,\n" + "\tmax_tx_mac_filters: %d max_rx_mcast_mac_filters: %d,\n" + "\tmax_rx_ucast_mac_filters: 0x%x, max_rx_ip_addr: %d,\n" + "\tmax_rx_lro_flow: %d max_rx_status_rings: %d,\n" + "\tmax_rx_buf_rings: %d, max_rx_ques: %d, max_tx_vlan_keys %d\n" + 
"\tmax_local_ipv6_addrs: %d, max_remote_ipv6_addrs: %d\n", + npar_info->total_pf, npar_info->total_rss_engines, + npar_info->max_vports, npar_info->max_tx_ques, + npar_info->max_tx_mac_filters, + npar_info->max_rx_mcast_mac_filters, + npar_info->max_rx_ucast_mac_filters, npar_info->max_rx_ip_addr, + npar_info->max_rx_lro_flow, npar_info->max_rx_status_rings, + npar_info->max_rx_buf_rings, npar_info->max_rx_ques, + npar_info->max_tx_vlan_keys, npar_info->max_local_ipv6_addrs, + npar_info->max_remote_ipv6_addrs); + +out: + qlcnic_free_mbx_args(&cmd); + return err; +} + +static void qlcnic_sriov_pf_reset_vport_handle(struct qlcnic_adapter *adapter, + u8 func) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vport *vp; + int index; + + if (adapter->ahw->pci_func == func) { + sriov->vp_handle = 0; + } else { + index = qlcnic_sriov_func_to_index(adapter, func); + if (index < 0) + return; + vp = sriov->vf_info[index].vp; + vp->handle = 0; + } +} + +static void qlcnic_sriov_pf_set_vport_handle(struct qlcnic_adapter *adapter, + u16 vport_handle, u8 func) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vport *vp; + int index; + + if (adapter->ahw->pci_func == func) { + sriov->vp_handle = vport_handle; + } else { + index = qlcnic_sriov_func_to_index(adapter, func); + if (index < 0) + return; + vp = sriov->vf_info[index].vp; + vp->handle = vport_handle; + } +} + +static int qlcnic_sriov_pf_get_vport_handle(struct qlcnic_adapter *adapter, + u8 func) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vf_info *vf_info; + int index; + + if (adapter->ahw->pci_func == func) { + return sriov->vp_handle; + } else { + index = qlcnic_sriov_func_to_index(adapter, func); + if (index >= 0) { + vf_info = &sriov->vf_info[index]; + return vf_info->vp->handle; + } + } + + return -EINVAL; +} + +static int qlcnic_sriov_pf_config_vport(struct qlcnic_adapter *adapter, + u8 flag, u16 func) +{ + struct qlcnic_cmd_args cmd; + int ret; + int vpid; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_CONFIG_VPORT)) + return -ENOMEM; + + if (flag) { + cmd.req.arg[3] = func << 8; + } else { + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, func); + if (vpid < 0) { + ret = -EINVAL; + goto out; + } + cmd.req.arg[3] = ((vpid & 0xffff) << 8) | 1; + } + + ret = qlcnic_issue_cmd(adapter, &cmd); + if (ret) { + dev_err(&adapter->pdev->dev, + "Failed %s vport, err %d for func 0x%x\n", + (flag ? 
"enable" : "disable"), ret, func); + goto out; + } + + if (flag) { + vpid = cmd.rsp.arg[2] & 0xffff; + qlcnic_sriov_pf_set_vport_handle(adapter, vpid, func); + } else { + qlcnic_sriov_pf_reset_vport_handle(adapter, func); + } + +out: + qlcnic_free_mbx_args(&cmd); + return ret; +} + +static int qlcnic_sriov_pf_cfg_vlan_filtering(struct qlcnic_adapter *adapter, + u8 enable) +{ + struct qlcnic_cmd_args cmd; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_SET_NIC_INFO); + if (err) + return err; + + cmd.req.arg[1] = 0x4; + if (enable) { + adapter->flags |= QLCNIC_VLAN_FILTERING; + cmd.req.arg[1] |= BIT_16; + if (qlcnic_84xx_check(adapter)) + cmd.req.arg[1] |= QLC_SRIOV_ALLOW_VLAN0; + } else { + adapter->flags &= ~QLCNIC_VLAN_FILTERING; + } + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_err(&adapter->pdev->dev, + "Failed to configure VLAN filtering, err=%d\n", err); + + qlcnic_free_mbx_args(&cmd); + return err; +} + +/* On configuring VF flood bit, PFD will receive traffic from all VFs */ +static int qlcnic_sriov_pf_cfg_flood(struct qlcnic_adapter *adapter) +{ + struct qlcnic_cmd_args cmd; + int err; + + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_SET_NIC_INFO); + if (err) + return err; + + cmd.req.arg[1] = QLC_FLOOD_MODE | QLC_VF_FLOOD_BIT; + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_err(&adapter->pdev->dev, + "Failed to configure VF Flood bit on PF, err=%d\n", + err); + + qlcnic_free_mbx_args(&cmd); + return err; +} + +static int qlcnic_sriov_pf_cfg_eswitch(struct qlcnic_adapter *adapter, + u8 func, u8 enable) +{ + struct qlcnic_cmd_args cmd; + int err = -EIO; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_TOGGLE_ESWITCH)) + return -ENOMEM; + + cmd.req.arg[0] |= (3 << 29); + cmd.req.arg[1] = ((func & 0xf) << 2) | BIT_6 | BIT_1; + if (enable) + cmd.req.arg[1] |= BIT_0; + + err = qlcnic_issue_cmd(adapter, &cmd); + + if (err != QLCNIC_RCODE_SUCCESS) { + dev_err(&adapter->pdev->dev, + "Failed to enable sriov eswitch%d\n", err); + err = -EIO; + } + + qlcnic_free_mbx_args(&cmd); + return err; +} + +static void qlcnic_sriov_pf_del_flr_queue(struct qlcnic_adapter *adapter) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_back_channel *bc = &sriov->bc; + int i; + + for (i = 0; i < sriov->num_vfs; i++) + cancel_work_sync(&sriov->vf_info[i].flr_work); + + destroy_workqueue(bc->bc_flr_wq); +} + +static int qlcnic_sriov_pf_create_flr_queue(struct qlcnic_adapter *adapter) +{ + struct qlcnic_back_channel *bc = &adapter->ahw->sriov->bc; + struct workqueue_struct *wq; + + wq = create_singlethread_workqueue("qlcnic-flr"); + if (wq == NULL) { + dev_err(&adapter->pdev->dev, "Cannot create FLR workqueue\n"); + return -ENOMEM; + } + + bc->bc_flr_wq = wq; + return 0; +} + +void qlcnic_sriov_pf_cleanup(struct qlcnic_adapter *adapter) +{ + u8 func = adapter->ahw->pci_func; + + if (!qlcnic_sriov_enable_check(adapter)) + return; + + qlcnic_sriov_pf_del_flr_queue(adapter); + qlcnic_sriov_cfg_bc_intr(adapter, 0); + qlcnic_sriov_pf_config_vport(adapter, 0, func); + qlcnic_sriov_pf_cfg_eswitch(adapter, func, 0); + qlcnic_sriov_pf_cfg_vlan_filtering(adapter, 0); + __qlcnic_sriov_cleanup(adapter); + adapter->ahw->op_mode = QLCNIC_MGMT_FUNC; + clear_bit(__QLCNIC_SRIOV_ENABLE, &adapter->state); +} + +void qlcnic_sriov_pf_disable(struct qlcnic_adapter *adapter) +{ + if (!qlcnic_sriov_pf_check(adapter)) + return; + + if (!qlcnic_sriov_enable_check(adapter)) + return; + + pci_disable_sriov(adapter->pdev); + netdev_info(adapter->netdev, + 
"SR-IOV is disabled successfully on port %d\n", + adapter->portnum); +} + +static int qlcnic_pci_sriov_disable(struct qlcnic_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + + if (pci_vfs_assigned(adapter->pdev)) { + netdev_err(adapter->netdev, + "SR-IOV VFs belonging to port %d are assigned to VMs. SR-IOV can not be disabled on this port\n", + adapter->portnum); + netdev_info(adapter->netdev, + "Please detach SR-IOV VFs belonging to port %d from VMs, and then try to disable SR-IOV on this port\n", + adapter->portnum); + return -EPERM; + } + + qlcnic_sriov_pf_disable(adapter); + + rtnl_lock(); + if (netif_running(netdev)) + __qlcnic_down(adapter, netdev); + + qlcnic_sriov_free_vlans(adapter); + + qlcnic_sriov_pf_cleanup(adapter); + + /* After disabling SRIOV re-init the driver in default mode + configure opmode based on op_mode of function + */ + if (qlcnic_83xx_configure_opmode(adapter)) { + rtnl_unlock(); + return -EIO; + } + + if (netif_running(netdev)) + __qlcnic_up(adapter, netdev); + + rtnl_unlock(); + return 0; +} + +static int qlcnic_sriov_pf_init(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_info nic_info, pf_info, vp_info; + int err; + u8 func = ahw->pci_func; + + if (!qlcnic_sriov_enable_check(adapter)) + return 0; + + err = qlcnic_sriov_pf_cfg_vlan_filtering(adapter, 1); + if (err) + return err; + + if (qlcnic_84xx_check(adapter)) { + err = qlcnic_sriov_pf_cfg_flood(adapter); + if (err) + goto disable_vlan_filtering; + } + + err = qlcnic_sriov_pf_cfg_eswitch(adapter, func, 1); + if (err) + goto disable_vlan_filtering; + + err = qlcnic_sriov_pf_config_vport(adapter, 1, func); + if (err) + goto disable_eswitch; + + err = qlcnic_sriov_get_pf_info(adapter, &pf_info); + if (err) + goto delete_vport; + + err = qlcnic_get_nic_info(adapter, &nic_info, func); + if (err) + goto delete_vport; + + err = qlcnic_sriov_pf_cal_res_limit(adapter, &vp_info, func); + if (err) + goto delete_vport; + + err = qlcnic_sriov_cfg_bc_intr(adapter, 1); + if (err) + goto delete_vport; + + ahw->physical_port = (u8) nic_info.phys_port; + ahw->switch_mode = nic_info.switch_mode; + ahw->max_mtu = nic_info.max_mtu; + ahw->capabilities = nic_info.capabilities; + ahw->nic_mode = QLC_83XX_SRIOV_MODE; + return err; + +delete_vport: + qlcnic_sriov_pf_config_vport(adapter, 0, func); + +disable_eswitch: + qlcnic_sriov_pf_cfg_eswitch(adapter, func, 0); + +disable_vlan_filtering: + qlcnic_sriov_pf_cfg_vlan_filtering(adapter, 0); + + return err; +} + +static int qlcnic_sriov_pf_enable(struct qlcnic_adapter *adapter, int num_vfs) +{ + int err; + + if (!qlcnic_sriov_enable_check(adapter)) + return 0; + + err = pci_enable_sriov(adapter->pdev, num_vfs); + if (err) + qlcnic_sriov_pf_cleanup(adapter); + + return err; +} + +static int __qlcnic_pci_sriov_enable(struct qlcnic_adapter *adapter, + int num_vfs) +{ + int err = 0; + + set_bit(__QLCNIC_SRIOV_ENABLE, &adapter->state); + adapter->ahw->op_mode = QLCNIC_SRIOV_PF_FUNC; + + err = qlcnic_sriov_init(adapter, num_vfs); + if (err) + goto clear_op_mode; + + err = qlcnic_sriov_pf_create_flr_queue(adapter); + if (err) + goto sriov_cleanup; + + err = qlcnic_sriov_pf_init(adapter); + if (err) + goto del_flr_queue; + + qlcnic_sriov_alloc_vlans(adapter); + + return err; + +del_flr_queue: + qlcnic_sriov_pf_del_flr_queue(adapter); + +sriov_cleanup: + __qlcnic_sriov_cleanup(adapter); + +clear_op_mode: + clear_bit(__QLCNIC_SRIOV_ENABLE, &adapter->state); + adapter->ahw->op_mode = QLCNIC_MGMT_FUNC; + return err; 
+} + +static int qlcnic_pci_sriov_enable(struct qlcnic_adapter *adapter, int num_vfs) +{ + struct net_device *netdev = adapter->netdev; + int err; + + if (!(adapter->flags & QLCNIC_MSIX_ENABLED)) { + netdev_err(netdev, + "SR-IOV cannot be enabled, when legacy interrupts are enabled\n"); + return -EIO; + } + + rtnl_lock(); + if (netif_running(netdev)) + __qlcnic_down(adapter, netdev); + + err = __qlcnic_pci_sriov_enable(adapter, num_vfs); + if (err) + goto error; + + if (netif_running(netdev)) + __qlcnic_up(adapter, netdev); + + rtnl_unlock(); + err = qlcnic_sriov_pf_enable(adapter, num_vfs); + if (!err) { + netdev_info(netdev, + "SR-IOV is enabled successfully on port %d\n", + adapter->portnum); + /* Return number of vfs enabled */ + return num_vfs; + } + + rtnl_lock(); + if (netif_running(netdev)) + __qlcnic_down(adapter, netdev); + +error: + if (!qlcnic_83xx_configure_opmode(adapter)) { + if (netif_running(netdev)) + __qlcnic_up(adapter, netdev); + } + + rtnl_unlock(); + netdev_info(netdev, "Failed to enable SR-IOV on port %d\n", + adapter->portnum); + + return err; +} + +int qlcnic_pci_sriov_configure(struct pci_dev *dev, int num_vfs) +{ + struct qlcnic_adapter *adapter = pci_get_drvdata(dev); + int err; + + if (test_and_set_bit(__QLCNIC_RESETTING, &adapter->state)) + return -EBUSY; + + if (num_vfs == 0) + err = qlcnic_pci_sriov_disable(adapter); + else + err = qlcnic_pci_sriov_enable(adapter, num_vfs); + + clear_bit(__QLCNIC_RESETTING, &adapter->state); + return err; +} + +static int qlcnic_sriov_set_vf_acl(struct qlcnic_adapter *adapter, u8 func) +{ + struct qlcnic_cmd_args cmd; + struct qlcnic_vport *vp; + int err, id; + u8 *mac; + + id = qlcnic_sriov_func_to_index(adapter, func); + if (id < 0) + return id; + + vp = adapter->ahw->sriov->vf_info[id].vp; + err = qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_SET_NIC_INFO); + if (err) + return err; + + cmd.req.arg[1] = 0x3 | func << 16; + if (vp->spoofchk == true) { + mac = vp->mac; + cmd.req.arg[2] |= BIT_1 | BIT_3 | BIT_8; + cmd.req.arg[4] = mac[5] | mac[4] << 8 | mac[3] << 16 | + mac[2] << 24; + cmd.req.arg[5] = mac[1] | mac[0] << 8; + } + + if (vp->vlan_mode == QLC_PVID_MODE) { + cmd.req.arg[2] |= BIT_6; + cmd.req.arg[3] |= vp->pvid << 8; + } + + err = qlcnic_issue_cmd(adapter, &cmd); + if (err) + dev_err(&adapter->pdev->dev, "Failed to set ACL, err=%d\n", + err); + + qlcnic_free_mbx_args(&cmd); + return err; +} + +static int qlcnic_sriov_set_vf_vport_info(struct qlcnic_adapter *adapter, + u16 func) +{ + struct qlcnic_info defvp_info; + int err; + + err = qlcnic_sriov_pf_cal_res_limit(adapter, &defvp_info, func); + if (err) + return -EIO; + + err = qlcnic_sriov_set_vf_acl(adapter, func); + if (err) + return err; + + return 0; +} + +static int qlcnic_sriov_pf_channel_cfg_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_vport *vp = vf->vp; + struct qlcnic_adapter *adapter; + struct qlcnic_sriov *sriov; + u16 func = vf->pci_func; + size_t size; + int err; + + adapter = vf->adapter; + sriov = adapter->ahw->sriov; + + if (trans->req_hdr->cmd_op == QLCNIC_BC_CMD_CHANNEL_INIT) { + err = qlcnic_sriov_pf_config_vport(adapter, 1, func); + if (!err) { + err = qlcnic_sriov_set_vf_vport_info(adapter, func); + if (err) + qlcnic_sriov_pf_config_vport(adapter, 0, func); + } + } else { + if (vp->vlan_mode == QLC_GUEST_VLAN_MODE) { + size = sizeof(*vf->sriov_vlans); + size = size * sriov->num_allowed_vlans; + memset(vf->sriov_vlans, 0, size); + } + + err = 
qlcnic_sriov_pf_config_vport(adapter, 0, func); + } + + if (err) + goto err_out; + + cmd->rsp.arg[0] |= (1 << 25); + + if (trans->req_hdr->cmd_op == QLCNIC_BC_CMD_CHANNEL_INIT) + set_bit(QLC_BC_VF_STATE, &vf->state); + else + clear_bit(QLC_BC_VF_STATE, &vf->state); + + return err; + +err_out: + cmd->rsp.arg[0] |= (2 << 25); + return err; +} + +static int qlcnic_sriov_cfg_vf_def_mac(struct qlcnic_adapter *adapter, + struct qlcnic_vf_info *vf, + u16 vlan, u8 op) +{ + struct qlcnic_cmd_args *cmd; + struct qlcnic_macvlan_mbx mv; + struct qlcnic_vport *vp; + u8 *addr; + int err; + u32 *buf; + int vpid; + + vp = vf->vp; + + cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC); + if (!cmd) + return -ENOMEM; + + err = qlcnic_alloc_mbx_args(cmd, adapter, QLCNIC_CMD_CONFIG_MAC_VLAN); + if (err) + goto free_cmd; + + cmd->type = QLC_83XX_MBX_CMD_NO_WAIT; + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, vf->pci_func); + if (vpid < 0) { + err = -EINVAL; + goto free_args; + } + + if (vlan) + op = ((op == QLCNIC_MAC_ADD || op == QLCNIC_MAC_VLAN_ADD) ? + QLCNIC_MAC_VLAN_ADD : QLCNIC_MAC_VLAN_DEL); + + cmd->req.arg[1] = op | (1 << 8) | (3 << 6); + cmd->req.arg[1] |= ((vpid & 0xffff) << 16) | BIT_31; + + addr = vp->mac; + mv.vlan = vlan; + mv.mac_addr0 = addr[0]; + mv.mac_addr1 = addr[1]; + mv.mac_addr2 = addr[2]; + mv.mac_addr3 = addr[3]; + mv.mac_addr4 = addr[4]; + mv.mac_addr5 = addr[5]; + buf = &cmd->req.arg[2]; + memcpy(buf, &mv, sizeof(struct qlcnic_macvlan_mbx)); + + err = qlcnic_issue_cmd(adapter, cmd); + + if (!err) + return err; + +free_args: + qlcnic_free_mbx_args(cmd); +free_cmd: + kfree(cmd); + return err; +} + +static int qlcnic_sriov_validate_create_rx_ctx(struct qlcnic_cmd_args *cmd) +{ + if ((cmd->req.arg[0] >> 29) != 0x3) + return -EINVAL; + + return 0; +} + +static void qlcnic_83xx_cfg_default_mac_vlan(struct qlcnic_adapter *adapter, + struct qlcnic_vf_info *vf, + int opcode) +{ + struct qlcnic_sriov *sriov; + u16 vlan; + int i; + + sriov = adapter->ahw->sriov; + + spin_lock_bh(&vf->vlan_list_lock); + if (vf->num_vlan) { + for (i = 0; i < sriov->num_allowed_vlans; i++) { + vlan = vf->sriov_vlans[i]; + if (vlan) + qlcnic_sriov_cfg_vf_def_mac(adapter, vf, vlan, + opcode); + } + } + spin_unlock_bh(&vf->vlan_list_lock); + + if (vf->vp->vlan_mode != QLC_PVID_MODE) { + if (qlcnic_83xx_pf_check(adapter) && + qlcnic_sriov_check_any_vlan(vf)) + return; + qlcnic_sriov_cfg_vf_def_mac(adapter, vf, 0, opcode); + } +} + +static int qlcnic_sriov_pf_create_rx_ctx_cmd(struct qlcnic_bc_trans *tran, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = tran->vf; + struct qlcnic_adapter *adapter = vf->adapter; + struct qlcnic_rcv_mbx_out *mbx_out; + int err; + + err = qlcnic_sriov_validate_create_rx_ctx(cmd); + if (err) { + cmd->rsp.arg[0] |= (0x6 << 25); + return err; + } + + cmd->req.arg[6] = vf->vp->handle; + err = qlcnic_issue_cmd(adapter, cmd); + + if (!err) { + mbx_out = (struct qlcnic_rcv_mbx_out *)&cmd->rsp.arg[1]; + vf->rx_ctx_id = mbx_out->ctx_id; + qlcnic_83xx_cfg_default_mac_vlan(adapter, vf, QLCNIC_MAC_ADD); + } else { + vf->rx_ctx_id = 0; + } + + return err; +} + +static int qlcnic_sriov_pf_mac_address_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + u8 type, *mac; + + type = cmd->req.arg[1]; + switch (type) { + case QLCNIC_SET_STATION_MAC: + case QLCNIC_SET_FAC_DEF_MAC: + cmd->rsp.arg[0] = (2 << 25); + break; + case QLCNIC_GET_CURRENT_MAC: + cmd->rsp.arg[0] = (1 << 25); + mac = vf->vp->mac; + cmd->rsp.arg[2] = mac[1] | ((mac[0] << 8) 
& 0xff00); + cmd->rsp.arg[1] = mac[5] | ((mac[4] << 8) & 0xff00) | + ((mac[3]) << 16 & 0xff0000) | + ((mac[2]) << 24 & 0xff000000); + } + + return 0; +} + +static int qlcnic_sriov_validate_create_tx_ctx(struct qlcnic_cmd_args *cmd) +{ + if ((cmd->req.arg[0] >> 29) != 0x3) + return -EINVAL; + + return 0; +} + +static int qlcnic_sriov_pf_create_tx_ctx_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + struct qlcnic_tx_mbx_out *mbx_out; + int err; + + err = qlcnic_sriov_validate_create_tx_ctx(cmd); + if (err) { + cmd->rsp.arg[0] |= (0x6 << 25); + return err; + } + + cmd->req.arg[5] |= vf->vp->handle << 16; + err = qlcnic_issue_cmd(adapter, cmd); + if (!err) { + mbx_out = (struct qlcnic_tx_mbx_out *)&cmd->rsp.arg[2]; + vf->tx_ctx_id = mbx_out->ctx_id; + } else { + vf->tx_ctx_id = 0; + } + + return err; +} + +static int qlcnic_sriov_validate_del_rx_ctx(struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + if ((cmd->req.arg[0] >> 29) != 0x3) + return -EINVAL; + + if ((cmd->req.arg[1] & 0xffff) != vf->rx_ctx_id) + return -EINVAL; + + return 0; +} + +static int qlcnic_sriov_pf_del_rx_ctx_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + err = qlcnic_sriov_validate_del_rx_ctx(vf, cmd); + if (err) { + cmd->rsp.arg[0] |= (0x6 << 25); + return err; + } + + qlcnic_83xx_cfg_default_mac_vlan(adapter, vf, QLCNIC_MAC_DEL); + cmd->req.arg[1] |= vf->vp->handle << 16; + err = qlcnic_issue_cmd(adapter, cmd); + + if (!err) + vf->rx_ctx_id = 0; + + return err; +} + +static int qlcnic_sriov_validate_del_tx_ctx(struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + if ((cmd->req.arg[0] >> 29) != 0x3) + return -EINVAL; + + if ((cmd->req.arg[1] & 0xffff) != vf->tx_ctx_id) + return -EINVAL; + + return 0; +} + +static int qlcnic_sriov_pf_del_tx_ctx_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + err = qlcnic_sriov_validate_del_tx_ctx(vf, cmd); + if (err) { + cmd->rsp.arg[0] |= (0x6 << 25); + return err; + } + + cmd->req.arg[1] |= vf->vp->handle << 16; + err = qlcnic_issue_cmd(adapter, cmd); + + if (!err) + vf->tx_ctx_id = 0; + + return err; +} + +static int qlcnic_sriov_validate_cfg_lro(struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + if ((cmd->req.arg[1] >> 16) != vf->rx_ctx_id) + return -EINVAL; + + return 0; +} + +static int qlcnic_sriov_pf_cfg_lro_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + err = qlcnic_sriov_validate_cfg_lro(vf, cmd); + if (err) { + cmd->rsp.arg[0] |= (0x6 << 25); + return err; + } + + err = qlcnic_issue_cmd(adapter, cmd); + return err; +} + +static int qlcnic_sriov_pf_cfg_ip_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err = -EIO; + u8 op; + + op = cmd->req.arg[1] & 0xff; + + cmd->req.arg[1] |= vf->vp->handle << 16; + cmd->req.arg[1] |= BIT_31; + + err = qlcnic_issue_cmd(adapter, cmd); + return err; +} + +static int qlcnic_sriov_validate_cfg_intrpt(struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + if (((cmd->req.arg[1] >> 8) & 0xff) != vf->pci_func) + return 
-EINVAL; + + if (!(cmd->req.arg[1] & BIT_16)) + return -EINVAL; + + if ((cmd->req.arg[1] & 0xff) != 0x1) + return -EINVAL; + + return 0; +} + +static int qlcnic_sriov_pf_cfg_intrpt_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + err = qlcnic_sriov_validate_cfg_intrpt(vf, cmd); + if (err) + cmd->rsp.arg[0] |= (0x6 << 25); + else + err = qlcnic_issue_cmd(adapter, cmd); + + return err; +} + +static int qlcnic_sriov_validate_mtu(struct qlcnic_adapter *adapter, + struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + if (cmd->req.arg[1] != vf->rx_ctx_id) + return -EINVAL; + + if (cmd->req.arg[2] > adapter->ahw->max_mtu) + return -EINVAL; + + return 0; +} + +static int qlcnic_sriov_pf_set_mtu_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + err = qlcnic_sriov_validate_mtu(adapter, vf, cmd); + if (err) + cmd->rsp.arg[0] |= (0x6 << 25); + else + err = qlcnic_issue_cmd(adapter, cmd); + + return err; +} + +static int qlcnic_sriov_validate_get_nic_info(struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + if (cmd->req.arg[1] & BIT_31) { + if (((cmd->req.arg[1] >> 16) & 0x7fff) != vf->pci_func) + return -EINVAL; + } else { + cmd->req.arg[1] |= vf->vp->handle << 16; + } + + return 0; +} + +static int qlcnic_sriov_pf_get_nic_info_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + err = qlcnic_sriov_validate_get_nic_info(vf, cmd); + if (err) { + cmd->rsp.arg[0] |= (0x6 << 25); + return err; + } + + err = qlcnic_issue_cmd(adapter, cmd); + return err; +} + +static int qlcnic_sriov_validate_cfg_rss(struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + if (cmd->req.arg[1] != vf->rx_ctx_id) + return -EINVAL; + + return 0; +} + +static int qlcnic_sriov_pf_cfg_rss_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + err = qlcnic_sriov_validate_cfg_rss(vf, cmd); + if (err) + cmd->rsp.arg[0] |= (0x6 << 25); + else + err = qlcnic_issue_cmd(adapter, cmd); + + return err; +} + +static int qlcnic_sriov_validate_cfg_intrcoal(struct qlcnic_adapter *adapter, + struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_nic_intr_coalesce *coal = &adapter->ahw->coal; + u16 ctx_id, pkts, time; + int err = -EINVAL; + u8 type; + + type = cmd->req.arg[1] & QLC_INTR_COAL_TYPE_MASK; + ctx_id = cmd->req.arg[1] >> 16; + pkts = cmd->req.arg[2] & 0xffff; + time = cmd->req.arg[2] >> 16; + + switch (type) { + case QLCNIC_INTR_COAL_TYPE_RX: + if (ctx_id != vf->rx_ctx_id || pkts > coal->rx_packets || + time < coal->rx_time_us) + goto err_label; + break; + case QLCNIC_INTR_COAL_TYPE_TX: + if (ctx_id != vf->tx_ctx_id || pkts > coal->tx_packets || + time < coal->tx_time_us) + goto err_label; + break; + default: + netdev_err(adapter->netdev, "Invalid coalescing type 0x%x received\n", + type); + return err; + } + + return 0; + +err_label: + netdev_err(adapter->netdev, "Expected: rx_ctx_id 0x%x rx_packets 0x%x rx_time_us 0x%x tx_ctx_id 0x%x tx_packets 0x%x tx_time_us 0x%x\n", + vf->rx_ctx_id, coal->rx_packets, coal->rx_time_us, + vf->tx_ctx_id, coal->tx_packets, coal->tx_time_us); + netdev_err(adapter->netdev, 
"Received: ctx_id 0x%x packets 0x%x time_us 0x%x type 0x%x\n", + ctx_id, pkts, time, type); + + return err; +} + +static int qlcnic_sriov_pf_cfg_intrcoal_cmd(struct qlcnic_bc_trans *tran, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = tran->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + err = qlcnic_sriov_validate_cfg_intrcoal(adapter, vf, cmd); + if (err) { + cmd->rsp.arg[0] |= (0x6 << 25); + return err; + } + + err = qlcnic_issue_cmd(adapter, cmd); + return err; +} + +static int qlcnic_sriov_validate_cfg_macvlan(struct qlcnic_adapter *adapter, + struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vport *vp = vf->vp; + u8 op, new_op; + + if (!(cmd->req.arg[1] & BIT_8)) + return -EINVAL; + + cmd->req.arg[1] |= (vf->vp->handle << 16); + cmd->req.arg[1] |= BIT_31; + + if (vp->vlan_mode == QLC_PVID_MODE) { + op = cmd->req.arg[1] & 0x7; + cmd->req.arg[1] &= ~0x7; + new_op = (op == QLCNIC_MAC_ADD || op == QLCNIC_MAC_VLAN_ADD) ? + QLCNIC_MAC_VLAN_ADD : QLCNIC_MAC_VLAN_DEL; + cmd->req.arg[3] |= vp->pvid << 16; + cmd->req.arg[1] |= new_op; + } + + return 0; +} + +static int qlcnic_sriov_pf_cfg_macvlan_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + err = qlcnic_sriov_validate_cfg_macvlan(adapter, vf, cmd); + if (err) { + cmd->rsp.arg[0] |= (0x6 << 25); + return err; + } + + err = qlcnic_issue_cmd(adapter, cmd); + return err; +} + +static int qlcnic_sriov_validate_linkevent(struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + if ((cmd->req.arg[1] >> 16) != vf->rx_ctx_id) + return -EINVAL; + + return 0; +} + +static int qlcnic_sriov_pf_linkevent_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + err = qlcnic_sriov_validate_linkevent(vf, cmd); + if (err) { + cmd->rsp.arg[0] |= (0x6 << 25); + return err; + } + + err = qlcnic_issue_cmd(adapter, cmd); + return err; +} + +static int qlcnic_sriov_pf_cfg_promisc_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_adapter *adapter = vf->adapter; + int err; + + cmd->req.arg[1] |= vf->vp->handle << 16; + cmd->req.arg[1] |= BIT_31; + err = qlcnic_issue_cmd(adapter, cmd); + return err; +} + +static int qlcnic_sriov_pf_get_acl_cmd(struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = trans->vf; + struct qlcnic_vport *vp = vf->vp; + u8 cmd_op, mode = vp->vlan_mode; + struct qlcnic_adapter *adapter; + struct qlcnic_sriov *sriov; + + adapter = vf->adapter; + sriov = adapter->ahw->sriov; + + cmd_op = trans->req_hdr->cmd_op; + cmd->rsp.arg[0] |= 1 << 25; + + /* For 84xx adapter in case of PVID , PFD should send vlan mode as + * QLC_NO_VLAN_MODE to VFD which is zero in mailbox response + */ + if (qlcnic_84xx_check(adapter) && mode == QLC_PVID_MODE) + return 0; + + switch (mode) { + case QLC_GUEST_VLAN_MODE: + cmd->rsp.arg[1] = mode | 1 << 8; + cmd->rsp.arg[2] = sriov->num_allowed_vlans << 16; + break; + case QLC_PVID_MODE: + cmd->rsp.arg[1] = mode | 1 << 8 | vp->pvid << 16; + break; + } + + return 0; +} + +static int qlcnic_sriov_pf_del_guest_vlan(struct qlcnic_adapter *adapter, + struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + u16 vlan; + + if 
(!qlcnic_sriov_check_any_vlan(vf)) + return -EINVAL; + + vlan = cmd->req.arg[1] >> 16; + if (!vf->rx_ctx_id) { + qlcnic_sriov_del_vlan_id(sriov, vf, vlan); + return 0; + } + + qlcnic_sriov_cfg_vf_def_mac(adapter, vf, vlan, QLCNIC_MAC_DEL); + qlcnic_sriov_del_vlan_id(sriov, vf, vlan); + + if (qlcnic_83xx_pf_check(adapter)) + qlcnic_sriov_cfg_vf_def_mac(adapter, vf, + 0, QLCNIC_MAC_ADD); + return 0; +} + +static int qlcnic_sriov_pf_add_guest_vlan(struct qlcnic_adapter *adapter, + struct qlcnic_vf_info *vf, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + int err = -EIO; + u16 vlan; + + if (qlcnic_83xx_pf_check(adapter) && qlcnic_sriov_check_any_vlan(vf)) + return err; + + vlan = cmd->req.arg[1] >> 16; + + if (!vf->rx_ctx_id) { + qlcnic_sriov_add_vlan_id(sriov, vf, vlan); + return 0; + } + + if (qlcnic_83xx_pf_check(adapter)) { + err = qlcnic_sriov_cfg_vf_def_mac(adapter, vf, 0, + QLCNIC_MAC_DEL); + if (err) + return err; + } + + err = qlcnic_sriov_cfg_vf_def_mac(adapter, vf, vlan, QLCNIC_MAC_ADD); + + if (err) { + if (qlcnic_83xx_pf_check(adapter)) + qlcnic_sriov_cfg_vf_def_mac(adapter, vf, 0, + QLCNIC_MAC_ADD); + return err; + } + + qlcnic_sriov_add_vlan_id(sriov, vf, vlan); + return err; +} + +static int qlcnic_sriov_pf_cfg_guest_vlan_cmd(struct qlcnic_bc_trans *tran, + struct qlcnic_cmd_args *cmd) +{ + struct qlcnic_vf_info *vf = tran->vf; + struct qlcnic_adapter *adapter = vf->adapter; + struct qlcnic_vport *vp = vf->vp; + int err = -EIO; + u8 op; + + if (vp->vlan_mode != QLC_GUEST_VLAN_MODE) { + cmd->rsp.arg[0] |= 2 << 25; + return err; + } + + op = cmd->req.arg[1] & 0xf; + + if (op) + err = qlcnic_sriov_pf_add_guest_vlan(adapter, vf, cmd); + else + err = qlcnic_sriov_pf_del_guest_vlan(adapter, vf, cmd); + + cmd->rsp.arg[0] |= err ? 
2 << 25 : 1 << 25; + return err; +} + +static const int qlcnic_pf_passthru_supp_cmds[] = { + QLCNIC_CMD_GET_STATISTICS, + QLCNIC_CMD_GET_PORT_CONFIG, + QLCNIC_CMD_GET_LINK_STATUS, + QLCNIC_CMD_INIT_NIC_FUNC, + QLCNIC_CMD_STOP_NIC_FUNC, +}; + +static const struct qlcnic_sriov_cmd_handler qlcnic_pf_bc_cmd_hdlr[] = { + [QLCNIC_BC_CMD_CHANNEL_INIT] = {&qlcnic_sriov_pf_channel_cfg_cmd}, + [QLCNIC_BC_CMD_CHANNEL_TERM] = {&qlcnic_sriov_pf_channel_cfg_cmd}, + [QLCNIC_BC_CMD_GET_ACL] = {&qlcnic_sriov_pf_get_acl_cmd}, + [QLCNIC_BC_CMD_CFG_GUEST_VLAN] = {&qlcnic_sriov_pf_cfg_guest_vlan_cmd}, +}; + +static const struct qlcnic_sriov_fw_cmd_handler qlcnic_pf_fw_cmd_hdlr[] = { + {QLCNIC_CMD_CREATE_RX_CTX, qlcnic_sriov_pf_create_rx_ctx_cmd}, + {QLCNIC_CMD_CREATE_TX_CTX, qlcnic_sriov_pf_create_tx_ctx_cmd}, + {QLCNIC_CMD_MAC_ADDRESS, qlcnic_sriov_pf_mac_address_cmd}, + {QLCNIC_CMD_DESTROY_RX_CTX, qlcnic_sriov_pf_del_rx_ctx_cmd}, + {QLCNIC_CMD_DESTROY_TX_CTX, qlcnic_sriov_pf_del_tx_ctx_cmd}, + {QLCNIC_CMD_CONFIGURE_HW_LRO, qlcnic_sriov_pf_cfg_lro_cmd}, + {QLCNIC_CMD_CONFIGURE_IP_ADDR, qlcnic_sriov_pf_cfg_ip_cmd}, + {QLCNIC_CMD_CONFIG_INTRPT, qlcnic_sriov_pf_cfg_intrpt_cmd}, + {QLCNIC_CMD_SET_MTU, qlcnic_sriov_pf_set_mtu_cmd}, + {QLCNIC_CMD_GET_NIC_INFO, qlcnic_sriov_pf_get_nic_info_cmd}, + {QLCNIC_CMD_CONFIGURE_RSS, qlcnic_sriov_pf_cfg_rss_cmd}, + {QLCNIC_CMD_CONFIG_INTR_COAL, qlcnic_sriov_pf_cfg_intrcoal_cmd}, + {QLCNIC_CMD_CONFIG_MAC_VLAN, qlcnic_sriov_pf_cfg_macvlan_cmd}, + {QLCNIC_CMD_GET_LINK_EVENT, qlcnic_sriov_pf_linkevent_cmd}, + {QLCNIC_CMD_CONFIGURE_MAC_RX_MODE, qlcnic_sriov_pf_cfg_promisc_cmd}, +}; + +void qlcnic_sriov_pf_process_bc_cmd(struct qlcnic_adapter *adapter, + struct qlcnic_bc_trans *trans, + struct qlcnic_cmd_args *cmd) +{ + u8 size, cmd_op; + + cmd_op = trans->req_hdr->cmd_op; + + if (trans->req_hdr->op_type == QLC_BC_CMD) { + size = ARRAY_SIZE(qlcnic_pf_bc_cmd_hdlr); + if (cmd_op < size) { + qlcnic_pf_bc_cmd_hdlr[cmd_op].fn(trans, cmd); + return; + } + } else { + int i; + size = ARRAY_SIZE(qlcnic_pf_fw_cmd_hdlr); + for (i = 0; i < size; i++) { + if (cmd_op == qlcnic_pf_fw_cmd_hdlr[i].cmd) { + qlcnic_pf_fw_cmd_hdlr[i].fn(trans, cmd); + return; + } + } + + size = ARRAY_SIZE(qlcnic_pf_passthru_supp_cmds); + for (i = 0; i < size; i++) { + if (cmd_op == qlcnic_pf_passthru_supp_cmds[i]) { + qlcnic_issue_cmd(adapter, cmd); + return; + } + } + } + + cmd->rsp.arg[0] |= (0x9 << 25); +} + +void qlcnic_pf_set_interface_id_create_rx_ctx(struct qlcnic_adapter *adapter, + u32 *int_id) +{ + u16 vpid; + + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, + adapter->ahw->pci_func); + *int_id |= vpid; +} + +void qlcnic_pf_set_interface_id_del_rx_ctx(struct qlcnic_adapter *adapter, + u32 *int_id) +{ + u16 vpid; + + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, + adapter->ahw->pci_func); + *int_id |= vpid << 16; +} + +void qlcnic_pf_set_interface_id_create_tx_ctx(struct qlcnic_adapter *adapter, + u32 *int_id) +{ + int vpid; + + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, + adapter->ahw->pci_func); + *int_id |= vpid << 16; +} + +void qlcnic_pf_set_interface_id_del_tx_ctx(struct qlcnic_adapter *adapter, + u32 *int_id) +{ + u16 vpid; + + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, + adapter->ahw->pci_func); + *int_id |= vpid << 16; +} + +void qlcnic_pf_set_interface_id_promisc(struct qlcnic_adapter *adapter, + u32 *int_id) +{ + u16 vpid; + + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, + adapter->ahw->pci_func); + *int_id |= (vpid << 16) | BIT_31; +} + +void 
qlcnic_pf_set_interface_id_ipaddr(struct qlcnic_adapter *adapter, + u32 *int_id) +{ + u16 vpid; + + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, + adapter->ahw->pci_func); + *int_id |= (vpid << 16) | BIT_31; +} + +void qlcnic_pf_set_interface_id_macaddr(struct qlcnic_adapter *adapter, + u32 *int_id) +{ + u16 vpid; + + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, + adapter->ahw->pci_func); + *int_id |= (vpid << 16) | BIT_31; +} + +static void qlcnic_sriov_del_rx_ctx(struct qlcnic_adapter *adapter, + struct qlcnic_vf_info *vf) +{ + struct qlcnic_cmd_args cmd; + int vpid; + + if (!vf->rx_ctx_id) + return; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DESTROY_RX_CTX)) + return; + + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, vf->pci_func); + if (vpid >= 0) { + cmd.req.arg[1] = vf->rx_ctx_id | (vpid & 0xffff) << 16; + if (qlcnic_issue_cmd(adapter, &cmd)) + dev_err(&adapter->pdev->dev, + "Failed to delete Tx ctx in firmware for func 0x%x\n", + vf->pci_func); + else + vf->rx_ctx_id = 0; + } + + qlcnic_free_mbx_args(&cmd); +} + +static void qlcnic_sriov_del_tx_ctx(struct qlcnic_adapter *adapter, + struct qlcnic_vf_info *vf) +{ + struct qlcnic_cmd_args cmd; + int vpid; + + if (!vf->tx_ctx_id) + return; + + if (qlcnic_alloc_mbx_args(&cmd, adapter, QLCNIC_CMD_DESTROY_TX_CTX)) + return; + + vpid = qlcnic_sriov_pf_get_vport_handle(adapter, vf->pci_func); + if (vpid >= 0) { + cmd.req.arg[1] |= vf->tx_ctx_id | (vpid & 0xffff) << 16; + if (qlcnic_issue_cmd(adapter, &cmd)) + dev_err(&adapter->pdev->dev, + "Failed to delete Tx ctx in firmware for func 0x%x\n", + vf->pci_func); + else + vf->tx_ctx_id = 0; + } + + qlcnic_free_mbx_args(&cmd); +} + +static int qlcnic_sriov_add_act_list_irqsave(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf, + struct qlcnic_bc_trans *trans) +{ + struct qlcnic_trans_list *t_list = &vf->rcv_act; + unsigned long flag; + + spin_lock_irqsave(&t_list->lock, flag); + + __qlcnic_sriov_add_act_list(sriov, vf, trans); + + spin_unlock_irqrestore(&t_list->lock, flag); + return 0; +} + +static void __qlcnic_sriov_process_flr(struct qlcnic_vf_info *vf) +{ + struct qlcnic_adapter *adapter = vf->adapter; + + qlcnic_sriov_cleanup_list(&vf->rcv_pend); + cancel_work_sync(&vf->trans_work); + qlcnic_sriov_cleanup_list(&vf->rcv_act); + + if (test_bit(QLC_BC_VF_SOFT_FLR, &vf->state)) { + qlcnic_sriov_del_tx_ctx(adapter, vf); + qlcnic_sriov_del_rx_ctx(adapter, vf); + } + + qlcnic_sriov_pf_config_vport(adapter, 0, vf->pci_func); + + clear_bit(QLC_BC_VF_FLR, &vf->state); + if (test_bit(QLC_BC_VF_SOFT_FLR, &vf->state)) { + qlcnic_sriov_add_act_list_irqsave(adapter->ahw->sriov, vf, + vf->flr_trans); + clear_bit(QLC_BC_VF_SOFT_FLR, &vf->state); + vf->flr_trans = NULL; + } +} + +static void qlcnic_sriov_pf_process_flr(struct work_struct *work) +{ + struct qlcnic_vf_info *vf; + + vf = container_of(work, struct qlcnic_vf_info, flr_work); + __qlcnic_sriov_process_flr(vf); + return; +} + +static void qlcnic_sriov_schedule_flr(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf, + work_func_t func) +{ + if (test_bit(__QLCNIC_RESETTING, &vf->adapter->state)) + return; + + INIT_WORK(&vf->flr_work, func); + queue_work(sriov->bc.bc_flr_wq, &vf->flr_work); +} + +static void qlcnic_sriov_handle_soft_flr(struct qlcnic_adapter *adapter, + struct qlcnic_bc_trans *trans, + struct qlcnic_vf_info *vf) +{ + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + + set_bit(QLC_BC_VF_FLR, &vf->state); + clear_bit(QLC_BC_VF_STATE, &vf->state); + set_bit(QLC_BC_VF_SOFT_FLR, &vf->state); + 
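/* Stash the triggering transaction; the FLR worker re-queues it to the VF's active list once the soft-FLR cleanup completes */ +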
vf->flr_trans = trans; + qlcnic_sriov_schedule_flr(sriov, vf, qlcnic_sriov_pf_process_flr); + netdev_info(adapter->netdev, "Software FLR for PCI func %d\n", + vf->pci_func); +} + +bool qlcnic_sriov_soft_flr_check(struct qlcnic_adapter *adapter, + struct qlcnic_bc_trans *trans, + struct qlcnic_vf_info *vf) +{ + struct qlcnic_bc_hdr *hdr = trans->req_hdr; + + if ((hdr->cmd_op == QLCNIC_BC_CMD_CHANNEL_INIT) && + (hdr->op_type == QLC_BC_CMD) && + test_bit(QLC_BC_VF_STATE, &vf->state)) { + qlcnic_sriov_handle_soft_flr(adapter, trans, vf); + return true; + } + + return false; +} + +void qlcnic_sriov_pf_handle_flr(struct qlcnic_sriov *sriov, + struct qlcnic_vf_info *vf) +{ + struct net_device *dev = vf->adapter->netdev; + struct qlcnic_vport *vp = vf->vp; + + if (!test_and_clear_bit(QLC_BC_VF_STATE, &vf->state)) { + clear_bit(QLC_BC_VF_FLR, &vf->state); + return; + } + + if (test_and_set_bit(QLC_BC_VF_FLR, &vf->state)) { + netdev_info(dev, "FLR for PCI func %d in progress\n", + vf->pci_func); + return; + } + + if (vp->vlan_mode == QLC_GUEST_VLAN_MODE) + memset(vf->sriov_vlans, 0, + sizeof(*vf->sriov_vlans) * sriov->num_allowed_vlans); + + qlcnic_sriov_schedule_flr(sriov, vf, qlcnic_sriov_pf_process_flr); + netdev_info(dev, "FLR received for PCI func %d\n", vf->pci_func); +} + +void qlcnic_sriov_pf_reset(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + struct qlcnic_sriov *sriov = ahw->sriov; + struct qlcnic_vf_info *vf; + u16 num_vfs = sriov->num_vfs; + int i; + + for (i = 0; i < num_vfs; i++) { + vf = &sriov->vf_info[i]; + vf->rx_ctx_id = 0; + vf->tx_ctx_id = 0; + cancel_work_sync(&vf->flr_work); + __qlcnic_sriov_process_flr(vf); + clear_bit(QLC_BC_VF_STATE, &vf->state); + } + + qlcnic_sriov_pf_reset_vport_handle(adapter, ahw->pci_func); + QLCWRX(ahw, QLCNIC_MBX_INTR_ENBL, (ahw->num_msix - 1) << 8); +} + +int qlcnic_sriov_pf_reinit(struct qlcnic_adapter *adapter) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int err; + + if (!qlcnic_sriov_enable_check(adapter)) + return 0; + + ahw->op_mode = QLCNIC_SRIOV_PF_FUNC; + + err = qlcnic_sriov_pf_init(adapter); + if (err) + return err; + + dev_info(&adapter->pdev->dev, "%s: op_mode %d\n", + __func__, ahw->op_mode); + return err; +} + +int qlcnic_sriov_set_vf_mac(struct net_device *netdev, int vf, u8 *mac) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + int i, num_vfs; + struct qlcnic_vf_info *vf_info; + u8 *curr_mac; + + if (!qlcnic_sriov_pf_check(adapter)) + return -EOPNOTSUPP; + + num_vfs = sriov->num_vfs; + + if (!is_valid_ether_addr(mac) || vf >= num_vfs) + return -EINVAL; + + if (ether_addr_equal(adapter->mac_addr, mac)) { + netdev_err(netdev, "MAC address is already in use by the PF\n"); + return -EINVAL; + } + + for (i = 0; i < num_vfs; i++) { + vf_info = &sriov->vf_info[i]; + if (ether_addr_equal(vf_info->vp->mac, mac)) { + netdev_err(netdev, + "MAC address is already in use by VF %d\n", + i); + return -EINVAL; + } + } + + vf_info = &sriov->vf_info[vf]; + curr_mac = vf_info->vp->mac; + + if (test_bit(QLC_BC_VF_STATE, &vf_info->state)) { + netdev_err(netdev, + "MAC address change failed for VF %d, as VF driver is loaded. 
Please unload VF driver and retry the operation\n", + vf); + return -EOPNOTSUPP; + } + + memcpy(curr_mac, mac, netdev->addr_len); + netdev_info(netdev, "MAC Address %pM is configured for VF %d\n", + mac, vf); + return 0; +} + +int qlcnic_sriov_set_vf_tx_rate(struct net_device *netdev, int vf, + int min_tx_rate, int max_tx_rate) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vf_info *vf_info; + struct qlcnic_info nic_info; + struct qlcnic_vport *vp; + u16 vpid; + + if (!qlcnic_sriov_pf_check(adapter)) + return -EOPNOTSUPP; + + if (vf >= sriov->num_vfs) + return -EINVAL; + + vf_info = &sriov->vf_info[vf]; + vp = vf_info->vp; + vpid = vp->handle; + + if (!min_tx_rate) + min_tx_rate = QLC_VF_MIN_TX_RATE; + + if (max_tx_rate && + (max_tx_rate >= 10000 || max_tx_rate < min_tx_rate)) { + netdev_err(netdev, + "Invalid max Tx rate, allowed range is [%d - %d]", + min_tx_rate, QLC_VF_MAX_TX_RATE); + return -EINVAL; + } + + if (!max_tx_rate) + max_tx_rate = 10000; + + if (min_tx_rate && + (min_tx_rate > max_tx_rate || min_tx_rate < QLC_VF_MIN_TX_RATE)) { + netdev_err(netdev, + "Invalid min Tx rate, allowed range is [%d - %d]", + QLC_VF_MIN_TX_RATE, max_tx_rate); + return -EINVAL; + } + + if (test_bit(QLC_BC_VF_STATE, &vf_info->state)) { + if (qlcnic_sriov_get_vf_vport_info(adapter, &nic_info, vpid)) + return -EIO; + + nic_info.max_tx_bw = max_tx_rate / 100; + nic_info.min_tx_bw = min_tx_rate / 100; + nic_info.bit_offsets = BIT_0; + + if (qlcnic_sriov_pf_set_vport_info(adapter, &nic_info, vpid)) + return -EIO; + } + + vp->max_tx_bw = max_tx_rate / 100; + netdev_info(netdev, + "Setting Max Tx rate %d (Mbps), %d %% of PF bandwidth, for VF %d\n", + max_tx_rate, vp->max_tx_bw, vf); + vp->min_tx_bw = min_tx_rate / 100; + netdev_info(netdev, + "Setting Min Tx rate %d (Mbps), %d %% of PF bandwidth, for VF %d\n", + min_tx_rate, vp->min_tx_bw, vf); + return 0; +} + +int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf, + u16 vlan, u8 qos, __be16 vlan_proto) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vf_info *vf_info; + struct qlcnic_vport *vp; + + if (!qlcnic_sriov_pf_check(adapter)) + return -EOPNOTSUPP; + + if (vf >= sriov->num_vfs || qos > 7) + return -EINVAL; + + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + + if (vlan > MAX_VLAN_ID) { + netdev_err(netdev, + "Invalid VLAN ID, allowed range is [0 - %d]\n", + MAX_VLAN_ID); + return -EINVAL; + } + + vf_info = &sriov->vf_info[vf]; + vp = vf_info->vp; + if (test_bit(QLC_BC_VF_STATE, &vf_info->state)) { + netdev_err(netdev, + "VLAN change failed for VF %d, as VF driver is loaded. 
Please unload VF driver and retry the operation\n", + vf); + return -EOPNOTSUPP; + } + + memset(vf_info->sriov_vlans, 0, + sizeof(*vf_info->sriov_vlans) * sriov->num_allowed_vlans); + + switch (vlan) { + case 4095: + vp->vlan_mode = QLC_GUEST_VLAN_MODE; + break; + case 0: + vp->vlan_mode = QLC_NO_VLAN_MODE; + vp->qos = 0; + break; + default: + vp->vlan_mode = QLC_PVID_MODE; + qlcnic_sriov_add_vlan_id(sriov, vf_info, vlan); + vp->qos = qos; + vp->pvid = vlan; + } + + netdev_info(netdev, "Setting VLAN %d, QoS %d, for VF %d\n", + vlan, qos, vf); + return 0; +} + +static __u32 qlcnic_sriov_get_vf_vlan(struct qlcnic_adapter *adapter, + struct qlcnic_vport *vp, int vf) +{ + __u32 vlan = 0; + + switch (vp->vlan_mode) { + case QLC_PVID_MODE: + vlan = vp->pvid; + break; + case QLC_GUEST_VLAN_MODE: + vlan = MAX_VLAN_ID; + break; + case QLC_NO_VLAN_MODE: + vlan = 0; + break; + default: + netdev_info(adapter->netdev, "Invalid VLAN mode = %d for VF %d\n", + vp->vlan_mode, vf); + } + + return vlan; +} + +int qlcnic_sriov_get_vf_config(struct net_device *netdev, + int vf, struct ifla_vf_info *ivi) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vport *vp; + + if (!qlcnic_sriov_pf_check(adapter)) + return -EOPNOTSUPP; + + if (vf >= sriov->num_vfs) + return -EINVAL; + + vp = sriov->vf_info[vf].vp; + memcpy(&ivi->mac, vp->mac, ETH_ALEN); + ivi->vlan = qlcnic_sriov_get_vf_vlan(adapter, vp, vf); + ivi->qos = vp->qos; + ivi->spoofchk = vp->spoofchk; + if (vp->max_tx_bw == MAX_BW) + ivi->max_tx_rate = 0; + else + ivi->max_tx_rate = vp->max_tx_bw * 100; + if (vp->min_tx_bw == MIN_BW) + ivi->min_tx_rate = 0; + else + ivi->min_tx_rate = vp->min_tx_bw * 100; + + ivi->vf = vf; + return 0; +} + +int qlcnic_sriov_set_vf_spoofchk(struct net_device *netdev, int vf, bool chk) +{ + struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_sriov *sriov = adapter->ahw->sriov; + struct qlcnic_vf_info *vf_info; + struct qlcnic_vport *vp; + + if (!qlcnic_sriov_pf_check(adapter)) + return -EOPNOTSUPP; + + if (vf >= sriov->num_vfs) + return -EINVAL; + + vf_info = &sriov->vf_info[vf]; + vp = vf_info->vp; + if (test_bit(QLC_BC_VF_STATE, &vf_info->state)) { + netdev_err(netdev, + "Spoof check change failed for VF %d, as VF driver is loaded. Please unload VF driver and retry the operation\n", + vf); + return -EOPNOTSUPP; + } + + vp->spoofchk = chk; + return 0; +} diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c new file mode 100644 index 0000000..891f03a --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c @@ -0,0 +1,1430 @@ +/* + * QLogic qlcnic NIC Driver + * Copyright (c) 2009-2013 QLogic Corporation + * + * See LICENSE.qlcnic for copyright and licensing details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_QLCNIC_HWMON +#include +#include +#endif + +#include "qlcnic.h" +#include "qlcnic_hw.h" + +int qlcnicvf_config_bridged_mode(struct qlcnic_adapter *adapter, u32 enable) +{ + return -EOPNOTSUPP; +} + +int qlcnicvf_config_led(struct qlcnic_adapter *adapter, u32 state, u32 rate) +{ + return -EOPNOTSUPP; +} + +static ssize_t qlcnic_store_bridged_mode(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + unsigned long new; + int ret = -EINVAL; + + if (!(adapter->ahw->capabilities & QLCNIC_FW_CAPABILITY_BDG)) + goto err_out; + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) + goto err_out; + + if (kstrtoul(buf, 2, &new)) + goto err_out; + + if (!qlcnic_config_bridged_mode(adapter, !!new)) + ret = len; + +err_out: + return ret; +} + +static ssize_t qlcnic_show_bridged_mode(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + int bridged_mode = 0; + + if (adapter->ahw->capabilities & QLCNIC_FW_CAPABILITY_BDG) + bridged_mode = !!(adapter->flags & QLCNIC_BRIDGE_ENABLED); + + return sprintf(buf, "%d\n", bridged_mode); +} + +static ssize_t qlcnic_store_diag_mode(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + unsigned long new; + + if (kstrtoul(buf, 2, &new)) + return -EINVAL; + + if (!!new != !!(adapter->flags & QLCNIC_DIAG_ENABLED)) + adapter->flags ^= QLCNIC_DIAG_ENABLED; + + return len; +} + +static ssize_t qlcnic_show_diag_mode(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + return sprintf(buf, "%d\n", !!(adapter->flags & QLCNIC_DIAG_ENABLED)); +} + +static int qlcnic_validate_beacon(struct qlcnic_adapter *adapter, u16 beacon, + u8 *state, u8 *rate) +{ + *rate = LSB(beacon); + *state = MSB(beacon); + + QLCDB(adapter, DRV, "rate %x state %x\n", *rate, *state); + + if (!*state) { + *rate = __QLCNIC_MAX_LED_RATE; + return 0; + } else if (*state > __QLCNIC_MAX_LED_STATE) { + return -EINVAL; + } + + if ((!*rate) || (*rate > __QLCNIC_MAX_LED_RATE)) + return -EINVAL; + + return 0; +} + +static int qlcnic_83xx_store_beacon(struct qlcnic_adapter *adapter, + const char *buf, size_t len) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + unsigned long h_beacon; + int err; + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) + return -EIO; + + if (kstrtoul(buf, 2, &h_beacon)) + return -EINVAL; + + qlcnic_get_beacon_state(adapter); + + if (ahw->beacon_state == h_beacon) + return len; + + rtnl_lock(); + if (!ahw->beacon_state) { + if (test_and_set_bit(__QLCNIC_LED_ENABLE, &adapter->state)) { + rtnl_unlock(); + return -EBUSY; + } + } + + if (h_beacon) + err = qlcnic_83xx_config_led(adapter, 1, h_beacon); + else + err = qlcnic_83xx_config_led(adapter, 0, !h_beacon); + if (!err) + ahw->beacon_state = h_beacon; + + if (!ahw->beacon_state) + clear_bit(__QLCNIC_LED_ENABLE, &adapter->state); + + rtnl_unlock(); + return len; +} + +static int qlcnic_82xx_store_beacon(struct qlcnic_adapter *adapter, + const char *buf, size_t len) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int err, drv_sds_rings = adapter->drv_sds_rings; + u16 beacon; + u8 b_state, b_rate; + + if (len != sizeof(u16)) + return -EINVAL; + + memcpy(&beacon, buf, 
sizeof(u16)); + err = qlcnic_validate_beacon(adapter, beacon, &b_state, &b_rate); + if (err) + return err; + + qlcnic_get_beacon_state(adapter); + + if (ahw->beacon_state == b_state) + return len; + + rtnl_lock(); + if (!ahw->beacon_state) { + if (test_and_set_bit(__QLCNIC_LED_ENABLE, &adapter->state)) { + rtnl_unlock(); + return -EBUSY; + } + } + + if (test_bit(__QLCNIC_RESETTING, &adapter->state)) { + err = -EIO; + goto out; + } + + if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) { + err = qlcnic_diag_alloc_res(adapter->netdev, QLCNIC_LED_TEST); + if (err) + goto out; + set_bit(__QLCNIC_DIAG_RES_ALLOC, &adapter->state); + } + + err = qlcnic_config_led(adapter, b_state, b_rate); + if (!err) { + err = len; + ahw->beacon_state = b_state; + } + + if (test_and_clear_bit(__QLCNIC_DIAG_RES_ALLOC, &adapter->state)) + qlcnic_diag_free_res(adapter->netdev, drv_sds_rings); + +out: + if (!ahw->beacon_state) + clear_bit(__QLCNIC_LED_ENABLE, &adapter->state); + rtnl_unlock(); + + return err; +} + +static ssize_t qlcnic_store_beacon(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + int err = 0; + + if (adapter->ahw->op_mode == QLCNIC_NON_PRIV_FUNC) { + dev_warn(dev, + "LED test not supported in non privileged mode\n"); + return -EOPNOTSUPP; + } + + if (qlcnic_82xx_check(adapter)) + err = qlcnic_82xx_store_beacon(adapter, buf, len); + else if (qlcnic_83xx_check(adapter)) + err = qlcnic_83xx_store_beacon(adapter, buf, len); + else + return -EIO; + + return err; +} + +static ssize_t qlcnic_show_beacon(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", adapter->ahw->beacon_state); +} + +static int qlcnic_sysfs_validate_crb(struct qlcnic_adapter *adapter, + loff_t offset, size_t size) +{ + size_t crb_size = 4; + + if (!(adapter->flags & QLCNIC_DIAG_ENABLED)) + return -EIO; + + if (offset < QLCNIC_PCI_CRBSPACE) { + if (ADDR_IN_RANGE(offset, QLCNIC_PCI_CAMQM, + QLCNIC_PCI_CAMQM_END)) + crb_size = 8; + else + return -EINVAL; + } + + if ((size != crb_size) || (offset & (crb_size-1))) + return -EINVAL; + + return 0; +} + +static ssize_t qlcnic_sysfs_read_crb(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t offset, size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + int ret; + + ret = qlcnic_sysfs_validate_crb(adapter, offset, size); + if (ret != 0) + return ret; + qlcnic_read_crb(adapter, buf, offset, size); + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + + return size; +} + +static ssize_t qlcnic_sysfs_write_crb(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t offset, size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + int ret; + + ret = qlcnic_sysfs_validate_crb(adapter, offset, size); + if (ret != 0) + return ret; + + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + qlcnic_write_crb(adapter, buf, offset, size); + return size; +} + +static int qlcnic_sysfs_validate_mem(struct qlcnic_adapter *adapter, + loff_t offset, size_t size) +{ + if (!(adapter->flags & QLCNIC_DIAG_ENABLED)) + return -EIO; + + if ((size != 8) || (offset & 0x7)) + return -EIO; + + return 0; +} + +static ssize_t qlcnic_sysfs_read_mem(struct file *filp, struct kobject *kobj, 
+ struct bin_attribute *attr, char *buf, + loff_t offset, size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + u64 data; + int ret; + + ret = qlcnic_sysfs_validate_mem(adapter, offset, size); + if (ret != 0) + return ret; + + if (qlcnic_pci_mem_read_2M(adapter, offset, &data)) + return -EIO; + + memcpy(buf, &data, size); + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + + return size; +} + +static ssize_t qlcnic_sysfs_write_mem(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t offset, size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + u64 data; + int ret; + + ret = qlcnic_sysfs_validate_mem(adapter, offset, size); + if (ret != 0) + return ret; + + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + memcpy(&data, buf, size); + + if (qlcnic_pci_mem_write_2M(adapter, offset, data)) + return -EIO; + + return size; +} + +int qlcnic_is_valid_nic_func(struct qlcnic_adapter *adapter, u8 pci_func) +{ + int i; + + for (i = 0; i < adapter->ahw->total_nic_func; i++) { + if (adapter->npars[i].pci_func == pci_func) + return i; + } + + dev_err(&adapter->pdev->dev, "%s: Invalid nic function\n", __func__); + return -EINVAL; +} + +static int validate_pm_config(struct qlcnic_adapter *adapter, + struct qlcnic_pm_func_cfg *pm_cfg, int count) +{ + u8 src_pci_func, s_esw_id, d_esw_id; + u8 dest_pci_func; + int i, src_index, dest_index; + + for (i = 0; i < count; i++) { + src_pci_func = pm_cfg[i].pci_func; + dest_pci_func = pm_cfg[i].dest_npar; + src_index = qlcnic_is_valid_nic_func(adapter, src_pci_func); + if (src_index < 0) + return -EINVAL; + + dest_index = qlcnic_is_valid_nic_func(adapter, dest_pci_func); + if (dest_index < 0) + return -EINVAL; + + s_esw_id = adapter->npars[src_index].phy_port; + d_esw_id = adapter->npars[dest_index].phy_port; + + if (s_esw_id != d_esw_id) + return -EINVAL; + } + + return 0; +} + +static ssize_t qlcnic_sysfs_write_pm_config(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + struct qlcnic_pm_func_cfg *pm_cfg; + u32 id, action, pci_func; + int count, rem, i, ret, index; + + count = size / sizeof(struct qlcnic_pm_func_cfg); + rem = size % sizeof(struct qlcnic_pm_func_cfg); + if (rem) + return -EINVAL; + + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + pm_cfg = (struct qlcnic_pm_func_cfg *)buf; + ret = validate_pm_config(adapter, pm_cfg, count); + + if (ret) + return ret; + for (i = 0; i < count; i++) { + pci_func = pm_cfg[i].pci_func; + action = !!pm_cfg[i].action; + index = qlcnic_is_valid_nic_func(adapter, pci_func); + if (index < 0) + return -EINVAL; + + id = adapter->npars[index].phy_port; + ret = qlcnic_config_port_mirroring(adapter, id, + action, pci_func); + if (ret) + return ret; + } + + for (i = 0; i < count; i++) { + pci_func = pm_cfg[i].pci_func; + index = qlcnic_is_valid_nic_func(adapter, pci_func); + if (index < 0) + return -EINVAL; + id = adapter->npars[index].phy_port; + adapter->npars[index].enable_pm = !!pm_cfg[i].action; + adapter->npars[index].dest_npar = id; + } + + return size; +} + +static ssize_t qlcnic_sysfs_read_pm_config(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t 
size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + struct qlcnic_pm_func_cfg *pm_cfg; + u8 pci_func; + u32 count; + int i; + + memset(buf, 0, size); + pm_cfg = (struct qlcnic_pm_func_cfg *)buf; + count = size / sizeof(struct qlcnic_pm_func_cfg); + for (i = 0; i < adapter->ahw->total_nic_func; i++) { + pci_func = adapter->npars[i].pci_func; + if (pci_func >= count) { + dev_dbg(dev, "%s: Total nic functions[%d], App sent function count[%d]\n", + __func__, adapter->ahw->total_nic_func, count); + continue; + } + if (!adapter->npars[i].eswitch_status) + continue; + + pm_cfg[pci_func].action = adapter->npars[i].enable_pm; + pm_cfg[pci_func].dest_npar = 0; + pm_cfg[pci_func].pci_func = i; + } + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + return size; +} + +static int validate_esw_config(struct qlcnic_adapter *adapter, + struct qlcnic_esw_func_cfg *esw_cfg, int count) +{ + struct qlcnic_hardware_context *ahw = adapter->ahw; + int i, ret; + u32 op_mode; + u8 pci_func; + + if (qlcnic_82xx_check(adapter)) + op_mode = readl(ahw->pci_base0 + QLCNIC_DRV_OP_MODE); + else + op_mode = QLCRDX(ahw, QLC_83XX_DRV_OP_MODE); + + for (i = 0; i < count; i++) { + pci_func = esw_cfg[i].pci_func; + if (pci_func >= ahw->max_vnic_func) + return -EINVAL; + + if (adapter->ahw->op_mode == QLCNIC_MGMT_FUNC) + if (qlcnic_is_valid_nic_func(adapter, pci_func) < 0) + return -EINVAL; + + switch (esw_cfg[i].op_mode) { + case QLCNIC_PORT_DEFAULTS: + if (qlcnic_82xx_check(adapter)) { + ret = QLC_DEV_GET_DRV(op_mode, pci_func); + } else { + ret = QLC_83XX_GET_FUNC_PRIVILEGE(op_mode, + pci_func); + esw_cfg[i].offload_flags = 0; + } + + if (ret != QLCNIC_NON_PRIV_FUNC) { + if (esw_cfg[i].mac_anti_spoof != 0) + return -EINVAL; + if (esw_cfg[i].mac_override != 1) + return -EINVAL; + if (esw_cfg[i].promisc_mode != 1) + return -EINVAL; + } + break; + case QLCNIC_ADD_VLAN: + if (!IS_VALID_VLAN(esw_cfg[i].vlan_id)) + return -EINVAL; + if (!esw_cfg[i].op_type) + return -EINVAL; + break; + case QLCNIC_DEL_VLAN: + if (!esw_cfg[i].op_type) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + return 0; +} + +static ssize_t qlcnic_sysfs_write_esw_config(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + struct qlcnic_esw_func_cfg *esw_cfg; + struct qlcnic_npar_info *npar; + int count, rem, i, ret; + int index; + u8 op_mode = 0, pci_func; + + count = size / sizeof(struct qlcnic_esw_func_cfg); + rem = size % sizeof(struct qlcnic_esw_func_cfg); + if (rem) + return -EINVAL; + + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + esw_cfg = (struct qlcnic_esw_func_cfg *)buf; + ret = validate_esw_config(adapter, esw_cfg, count); + if (ret) + return ret; + + for (i = 0; i < count; i++) { + if (adapter->ahw->op_mode == QLCNIC_MGMT_FUNC) + if (qlcnic_config_switch_port(adapter, &esw_cfg[i])) + return -EINVAL; + + if (adapter->ahw->pci_func != esw_cfg[i].pci_func) + continue; + + op_mode = esw_cfg[i].op_mode; + qlcnic_get_eswitch_port_config(adapter, &esw_cfg[i]); + esw_cfg[i].op_mode = op_mode; + esw_cfg[i].pci_func = adapter->ahw->pci_func; + + switch (esw_cfg[i].op_mode) { + case QLCNIC_PORT_DEFAULTS: + qlcnic_set_eswitch_port_features(adapter, &esw_cfg[i]); + rtnl_lock(); + qlcnic_set_netdev_features(adapter, &esw_cfg[i]); + rtnl_unlock(); + break; + case 
QLCNIC_ADD_VLAN: + qlcnic_set_vlan_config(adapter, &esw_cfg[i]); + break; + case QLCNIC_DEL_VLAN: + esw_cfg[i].vlan_id = 0; + qlcnic_set_vlan_config(adapter, &esw_cfg[i]); + break; + } + } + + if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) + goto out; + + for (i = 0; i < count; i++) { + pci_func = esw_cfg[i].pci_func; + index = qlcnic_is_valid_nic_func(adapter, pci_func); + if (index < 0) + return -EINVAL; + npar = &adapter->npars[index]; + switch (esw_cfg[i].op_mode) { + case QLCNIC_PORT_DEFAULTS: + npar->promisc_mode = esw_cfg[i].promisc_mode; + npar->mac_override = esw_cfg[i].mac_override; + npar->offload_flags = esw_cfg[i].offload_flags; + npar->mac_anti_spoof = esw_cfg[i].mac_anti_spoof; + npar->discard_tagged = esw_cfg[i].discard_tagged; + break; + case QLCNIC_ADD_VLAN: + npar->pvid = esw_cfg[i].vlan_id; + break; + case QLCNIC_DEL_VLAN: + npar->pvid = 0; + break; + } + } +out: + return size; +} + +static ssize_t qlcnic_sysfs_read_esw_config(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + struct qlcnic_esw_func_cfg *esw_cfg; + u8 pci_func; + u32 count; + int i; + + memset(buf, 0, size); + esw_cfg = (struct qlcnic_esw_func_cfg *)buf; + count = size / sizeof(struct qlcnic_esw_func_cfg); + for (i = 0; i < adapter->ahw->total_nic_func; i++) { + pci_func = adapter->npars[i].pci_func; + if (pci_func >= count) { + dev_dbg(dev, "%s: Total nic functions[%d], App sent function count[%d]\n", + __func__, adapter->ahw->total_nic_func, count); + continue; + } + if (!adapter->npars[i].eswitch_status) + continue; + + esw_cfg[pci_func].pci_func = pci_func; + if (qlcnic_get_eswitch_port_config(adapter, &esw_cfg[pci_func])) + return -EINVAL; + } + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + return size; +} + +static int validate_npar_config(struct qlcnic_adapter *adapter, + struct qlcnic_npar_func_cfg *np_cfg, + int count) +{ + u8 pci_func, i; + + for (i = 0; i < count; i++) { + pci_func = np_cfg[i].pci_func; + if (qlcnic_is_valid_nic_func(adapter, pci_func) < 0) + return -EINVAL; + + if (!IS_VALID_BW(np_cfg[i].min_bw) || + !IS_VALID_BW(np_cfg[i].max_bw)) + return -EINVAL; + } + return 0; +} + +static ssize_t qlcnic_sysfs_write_npar_config(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + struct qlcnic_info nic_info; + struct qlcnic_npar_func_cfg *np_cfg; + int i, count, rem, ret, index; + u8 pci_func; + + count = size / sizeof(struct qlcnic_npar_func_cfg); + rem = size % sizeof(struct qlcnic_npar_func_cfg); + if (rem) + return -EINVAL; + + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + np_cfg = (struct qlcnic_npar_func_cfg *)buf; + ret = validate_npar_config(adapter, np_cfg, count); + if (ret) + return ret; + + for (i = 0; i < count; i++) { + pci_func = np_cfg[i].pci_func; + + memset(&nic_info, 0, sizeof(struct qlcnic_info)); + ret = qlcnic_get_nic_info(adapter, &nic_info, pci_func); + if (ret) + return ret; + nic_info.pci_func = pci_func; + nic_info.min_tx_bw = np_cfg[i].min_bw; + nic_info.max_tx_bw = np_cfg[i].max_bw; + ret = qlcnic_set_nic_info(adapter, &nic_info); + if (ret) + return ret; + index = qlcnic_is_valid_nic_func(adapter, pci_func); + if (index < 0) + return -EINVAL; + 
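/*
 * Sketch (hypothetical my_* names) of the record-array convention shared by
 * the bin_attribute write handlers above: the payload must be a whole number
 * of fixed-size records, every record is validated first, and only then is
 * the batch applied, returning the full size on success.
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/sysfs.h>
#include <linux/types.h>

struct my_func_cfg {
	u8 pci_func;
	u8 min_bw;
	u8 max_bw;
	u8 rsvd;
};

static ssize_t my_write_cfg(struct file *file, struct kobject *kobj,
			    struct bin_attribute *attr, char *buf,
			    loff_t offset, size_t size)
{
	struct my_func_cfg *cfg = (struct my_func_cfg *)buf;
	int count = size / sizeof(*cfg);
	int i;

	if (size % sizeof(*cfg))	/* partial record: reject outright */
		return -EINVAL;

	for (i = 0; i < count; i++)	/* validate everything up front */
		if (cfg[i].min_bw > cfg[i].max_bw)
			return -EINVAL;

	/* ...apply cfg[0..count-1] to the adapter here... */
	return size;
}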
adapter->npars[index].min_bw = nic_info.min_tx_bw; + adapter->npars[index].max_bw = nic_info.max_tx_bw; + } + + return size; +} + +static ssize_t qlcnic_sysfs_read_npar_config(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + struct qlcnic_npar_func_cfg *np_cfg; + struct qlcnic_info nic_info; + u8 pci_func; + int i, ret; + u32 count; + + memset(&nic_info, 0, sizeof(struct qlcnic_info)); + memset(buf, 0, size); + np_cfg = (struct qlcnic_npar_func_cfg *)buf; + + count = size / sizeof(struct qlcnic_npar_func_cfg); + for (i = 0; i < adapter->ahw->total_nic_func; i++) { + if (adapter->npars[i].pci_func >= count) { + dev_dbg(dev, "%s: Total nic functions[%d], App sent function count[%d]\n", + __func__, adapter->ahw->total_nic_func, count); + continue; + } + if (!adapter->npars[i].eswitch_status) + continue; + pci_func = adapter->npars[i].pci_func; + if (qlcnic_is_valid_nic_func(adapter, pci_func) < 0) + continue; + ret = qlcnic_get_nic_info(adapter, &nic_info, pci_func); + if (ret) + return ret; + + np_cfg[pci_func].pci_func = pci_func; + np_cfg[pci_func].op_mode = (u8)nic_info.op_mode; + np_cfg[pci_func].port_num = nic_info.phys_port; + np_cfg[pci_func].fw_capab = nic_info.capabilities; + np_cfg[pci_func].min_bw = nic_info.min_tx_bw; + np_cfg[pci_func].max_bw = nic_info.max_tx_bw; + np_cfg[pci_func].max_tx_queues = nic_info.max_tx_ques; + np_cfg[pci_func].max_rx_queues = nic_info.max_rx_ques; + } + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + return size; +} + +static ssize_t qlcnic_sysfs_get_port_stats(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + struct qlcnic_esw_statistics port_stats; + int ret; + + if (qlcnic_83xx_check(adapter)) + return -EOPNOTSUPP; + + if (size != sizeof(struct qlcnic_esw_statistics)) + return -EINVAL; + + if (offset >= adapter->ahw->max_vnic_func) + return -EINVAL; + + memset(&port_stats, 0, size); + ret = qlcnic_get_port_stats(adapter, offset, QLCNIC_QUERY_RX_COUNTER, + &port_stats.rx); + if (ret) + return ret; + + ret = qlcnic_get_port_stats(adapter, offset, QLCNIC_QUERY_TX_COUNTER, + &port_stats.tx); + if (ret) + return ret; + + memcpy(buf, &port_stats, size); + return size; +} + +static ssize_t qlcnic_sysfs_get_esw_stats(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + struct qlcnic_esw_statistics esw_stats; + int ret; + + if (qlcnic_83xx_check(adapter)) + return -EOPNOTSUPP; + + if (size != sizeof(struct qlcnic_esw_statistics)) + return -EINVAL; + + if (offset >= QLCNIC_NIU_MAX_XG_PORTS) + return -EINVAL; + + memset(&esw_stats, 0, size); + ret = qlcnic_get_eswitch_stats(adapter, offset, QLCNIC_QUERY_RX_COUNTER, + &esw_stats.rx); + if (ret) + return ret; + + ret = qlcnic_get_eswitch_stats(adapter, offset, QLCNIC_QUERY_TX_COUNTER, + &esw_stats.tx); + if (ret) + return ret; + + memcpy(buf, &esw_stats, size); + return size; +} + +static ssize_t qlcnic_sysfs_clear_esw_stats(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) 
+{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + int ret; + + if (qlcnic_83xx_check(adapter)) + return -EOPNOTSUPP; + + if (offset >= QLCNIC_NIU_MAX_XG_PORTS) + return -EINVAL; + + ret = qlcnic_clear_esw_stats(adapter, QLCNIC_STATS_ESWITCH, offset, + QLCNIC_QUERY_RX_COUNTER); + if (ret) + return ret; + + ret = qlcnic_clear_esw_stats(adapter, QLCNIC_STATS_ESWITCH, offset, + QLCNIC_QUERY_TX_COUNTER); + if (ret) + return ret; + + return size; +} + +static ssize_t qlcnic_sysfs_clear_port_stats(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + int ret; + + if (qlcnic_83xx_check(adapter)) + return -EOPNOTSUPP; + + if (offset >= adapter->ahw->max_vnic_func) + return -EINVAL; + + ret = qlcnic_clear_esw_stats(adapter, QLCNIC_STATS_PORT, offset, + QLCNIC_QUERY_RX_COUNTER); + if (ret) + return ret; + + ret = qlcnic_clear_esw_stats(adapter, QLCNIC_STATS_PORT, offset, + QLCNIC_QUERY_TX_COUNTER); + if (ret) + return ret; + + return size; +} + +static ssize_t qlcnic_sysfs_read_pci_config(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + struct qlcnic_pci_func_cfg *pci_cfg; + struct qlcnic_pci_info *pci_info; + int i, ret; + u32 count; + + pci_info = kcalloc(size, sizeof(*pci_info), GFP_KERNEL); + if (!pci_info) + return -ENOMEM; + + ret = qlcnic_get_pci_info(adapter, pci_info); + if (ret) { + kfree(pci_info); + return ret; + } + + pci_cfg = (struct qlcnic_pci_func_cfg *)buf; + count = size / sizeof(struct qlcnic_pci_func_cfg); + qlcnic_swap32_buffer((u32 *)pci_info, size / sizeof(u32)); + for (i = 0; i < count; i++) { + pci_cfg[i].pci_func = pci_info[i].id; + pci_cfg[i].func_type = pci_info[i].type; + pci_cfg[i].func_state = 0; + pci_cfg[i].port_num = pci_info[i].default_port; + pci_cfg[i].min_bw = pci_info[i].tx_min_bw; + pci_cfg[i].max_bw = pci_info[i].tx_max_bw; + memcpy(&pci_cfg[i].def_mac_addr, &pci_info[i].mac, ETH_ALEN); + } + + kfree(pci_info); + return size; +} + +static ssize_t qlcnic_83xx_sysfs_flash_read_handler(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + unsigned char *p_read_buf; + int ret, count; + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + + if (!size) + return -EINVAL; + + count = size / sizeof(u32); + + if (size % sizeof(u32)) + count++; + + p_read_buf = kcalloc(size, sizeof(unsigned char), GFP_KERNEL); + if (!p_read_buf) + return -ENOMEM; + if (qlcnic_83xx_lock_flash(adapter) != 0) { + kfree(p_read_buf); + return -EIO; + } + + ret = qlcnic_83xx_lockless_flash_read32(adapter, offset, p_read_buf, + count); + + if (ret) { + qlcnic_83xx_unlock_flash(adapter); + kfree(p_read_buf); + return ret; + } + + qlcnic_83xx_unlock_flash(adapter); + qlcnic_swap32_buffer((u32 *)p_read_buf, count); + memcpy(buf, p_read_buf, size); + kfree(p_read_buf); + + return size; +} + +static int qlcnic_83xx_sysfs_flash_bulk_write(struct qlcnic_adapter *adapter, + char *buf, loff_t offset, + size_t size) +{ + int i, ret, count; + unsigned char *p_cache, *p_src; + + p_cache = kcalloc(size, sizeof(unsigned 
char), GFP_KERNEL); + if (!p_cache) + return -ENOMEM; + + count = size / sizeof(u32); + qlcnic_swap32_buffer((u32 *)buf, count); + memcpy(p_cache, buf, size); + p_src = p_cache; + + if (qlcnic_83xx_lock_flash(adapter) != 0) { + kfree(p_cache); + return -EIO; + } + + if (adapter->ahw->fdt.mfg_id == adapter->flash_mfg_id) { + ret = qlcnic_83xx_enable_flash_write(adapter); + if (ret) { + kfree(p_cache); + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + } + + for (i = 0; i < count / QLC_83XX_FLASH_WRITE_MAX; i++) { + ret = qlcnic_83xx_flash_bulk_write(adapter, offset, + (u32 *)p_src, + QLC_83XX_FLASH_WRITE_MAX); + + if (ret) { + if (adapter->ahw->fdt.mfg_id == adapter->flash_mfg_id) { + ret = qlcnic_83xx_disable_flash_write(adapter); + if (ret) { + kfree(p_cache); + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + } + + kfree(p_cache); + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + + p_src = p_src + sizeof(u32)*QLC_83XX_FLASH_WRITE_MAX; + offset = offset + sizeof(u32)*QLC_83XX_FLASH_WRITE_MAX; + } + + if (adapter->ahw->fdt.mfg_id == adapter->flash_mfg_id) { + ret = qlcnic_83xx_disable_flash_write(adapter); + if (ret) { + kfree(p_cache); + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + } + + kfree(p_cache); + qlcnic_83xx_unlock_flash(adapter); + + return 0; +} + +static int qlcnic_83xx_sysfs_flash_write(struct qlcnic_adapter *adapter, + char *buf, loff_t offset, size_t size) +{ + int i, ret, count; + unsigned char *p_cache, *p_src; + + p_cache = kcalloc(size, sizeof(unsigned char), GFP_KERNEL); + if (!p_cache) + return -ENOMEM; + + qlcnic_swap32_buffer((u32 *)buf, size / sizeof(u32)); + memcpy(p_cache, buf, size); + p_src = p_cache; + count = size / sizeof(u32); + + if (qlcnic_83xx_lock_flash(adapter) != 0) { + kfree(p_cache); + return -EIO; + } + + if (adapter->ahw->fdt.mfg_id == adapter->flash_mfg_id) { + ret = qlcnic_83xx_enable_flash_write(adapter); + if (ret) { + kfree(p_cache); + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + } + + for (i = 0; i < count; i++) { + ret = qlcnic_83xx_flash_write32(adapter, offset, (u32 *)p_src); + if (ret) { + if (adapter->ahw->fdt.mfg_id == adapter->flash_mfg_id) { + ret = qlcnic_83xx_disable_flash_write(adapter); + if (ret) { + kfree(p_cache); + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + } + kfree(p_cache); + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + + p_src = p_src + sizeof(u32); + offset = offset + sizeof(u32); + } + + if (adapter->ahw->fdt.mfg_id == adapter->flash_mfg_id) { + ret = qlcnic_83xx_disable_flash_write(adapter); + if (ret) { + kfree(p_cache); + qlcnic_83xx_unlock_flash(adapter); + return -EIO; + } + } + + kfree(p_cache); + qlcnic_83xx_unlock_flash(adapter); + + return 0; +} + +static ssize_t qlcnic_83xx_sysfs_flash_write_handler(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t offset, + size_t size) +{ + int ret; + static int flash_mode; + unsigned long data; + struct device *dev = container_of(kobj, struct device, kobj); + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + + ret = kstrtoul(buf, 16, &data); + + switch (data) { + case QLC_83XX_FLASH_SECTOR_ERASE_CMD: + flash_mode = QLC_83XX_ERASE_MODE; + ret = qlcnic_83xx_erase_flash_sector(adapter, offset); + if (ret) { + dev_err(&adapter->pdev->dev, + "%s failed at %d\n", __func__, __LINE__); + return -EIO; + } + break; + + case QLC_83XX_FLASH_BULK_WRITE_CMD: + flash_mode = QLC_83XX_BULK_WRITE_MODE; + break; + + case QLC_83XX_FLASH_WRITE_CMD: + flash_mode = 
QLC_83XX_WRITE_MODE; + break; + default: + if (flash_mode == QLC_83XX_BULK_WRITE_MODE) { + ret = qlcnic_83xx_sysfs_flash_bulk_write(adapter, buf, + offset, size); + if (ret) { + dev_err(&adapter->pdev->dev, + "%s failed at %d\n", + __func__, __LINE__); + return -EIO; + } + } + + if (flash_mode == QLC_83XX_WRITE_MODE) { + ret = qlcnic_83xx_sysfs_flash_write(adapter, buf, + offset, size); + if (ret) { + dev_err(&adapter->pdev->dev, + "%s failed at %d\n", __func__, + __LINE__); + return -EIO; + } + } + } + + return size; +} + +static const struct device_attribute dev_attr_bridged_mode = { + .attr = { .name = "bridged_mode", .mode = 0644 }, + .show = qlcnic_show_bridged_mode, + .store = qlcnic_store_bridged_mode, +}; + +static const struct device_attribute dev_attr_diag_mode = { + .attr = { .name = "diag_mode", .mode = 0644 }, + .show = qlcnic_show_diag_mode, + .store = qlcnic_store_diag_mode, +}; + +static const struct device_attribute dev_attr_beacon = { + .attr = { .name = "beacon", .mode = 0644 }, + .show = qlcnic_show_beacon, + .store = qlcnic_store_beacon, +}; + +static const struct bin_attribute bin_attr_crb = { + .attr = { .name = "crb", .mode = 0644 }, + .size = 0, + .read = qlcnic_sysfs_read_crb, + .write = qlcnic_sysfs_write_crb, +}; + +static const struct bin_attribute bin_attr_mem = { + .attr = { .name = "mem", .mode = 0644 }, + .size = 0, + .read = qlcnic_sysfs_read_mem, + .write = qlcnic_sysfs_write_mem, +}; + +static const struct bin_attribute bin_attr_npar_config = { + .attr = { .name = "npar_config", .mode = 0644 }, + .size = 0, + .read = qlcnic_sysfs_read_npar_config, + .write = qlcnic_sysfs_write_npar_config, +}; + +static const struct bin_attribute bin_attr_pci_config = { + .attr = { .name = "pci_config", .mode = 0644 }, + .size = 0, + .read = qlcnic_sysfs_read_pci_config, + .write = NULL, +}; + +static const struct bin_attribute bin_attr_port_stats = { + .attr = { .name = "port_stats", .mode = 0644 }, + .size = 0, + .read = qlcnic_sysfs_get_port_stats, + .write = qlcnic_sysfs_clear_port_stats, +}; + +static const struct bin_attribute bin_attr_esw_stats = { + .attr = { .name = "esw_stats", .mode = 0644 }, + .size = 0, + .read = qlcnic_sysfs_get_esw_stats, + .write = qlcnic_sysfs_clear_esw_stats, +}; + +static const struct bin_attribute bin_attr_esw_config = { + .attr = { .name = "esw_config", .mode = 0644 }, + .size = 0, + .read = qlcnic_sysfs_read_esw_config, + .write = qlcnic_sysfs_write_esw_config, +}; + +static const struct bin_attribute bin_attr_pm_config = { + .attr = { .name = "pm_config", .mode = 0644 }, + .size = 0, + .read = qlcnic_sysfs_read_pm_config, + .write = qlcnic_sysfs_write_pm_config, +}; + +static const struct bin_attribute bin_attr_flash = { + .attr = { .name = "flash", .mode = 0644 }, + .size = 0, + .read = qlcnic_83xx_sysfs_flash_read_handler, + .write = qlcnic_83xx_sysfs_flash_write_handler, +}; + +#ifdef CONFIG_QLCNIC_HWMON + +static ssize_t qlcnic_hwmon_show_temp(struct device *dev, + struct device_attribute *dev_attr, + char *buf) +{ + struct qlcnic_adapter *adapter = dev_get_drvdata(dev); + unsigned int temperature = 0, value = 0; + + if (qlcnic_83xx_check(adapter)) + value = QLCRDX(adapter->ahw, QLC_83XX_ASIC_TEMP); + else if (qlcnic_82xx_check(adapter)) + value = QLC_SHARED_REG_RD32(adapter, QLCNIC_ASIC_TEMP); + + temperature = qlcnic_get_temp_val(value); + /* display millidegree celcius */ + temperature *= 1000; + return sprintf(buf, "%u\n", temperature); +} + +/* hwmon-sysfs attributes */ +static SENSOR_DEVICE_ATTR(temp1_input, 0444, + 
qlcnic_hwmon_show_temp, NULL, 1); + +static struct attribute *qlcnic_hwmon_attrs[] = { + &sensor_dev_attr_temp1_input.dev_attr.attr, + NULL +}; + +ATTRIBUTE_GROUPS(qlcnic_hwmon); + +void qlcnic_register_hwmon_dev(struct qlcnic_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + struct device *hwmon_dev; + + /* Skip hwmon registration for a VF device */ + if (qlcnic_sriov_vf_check(adapter)) { + adapter->ahw->hwmon_dev = NULL; + return; + } + hwmon_dev = hwmon_device_register_with_groups(dev, qlcnic_driver_name, + adapter, + qlcnic_hwmon_groups); + if (IS_ERR(hwmon_dev)) { + dev_err(dev, "Cannot register with hwmon, err=%ld\n", + PTR_ERR(hwmon_dev)); + hwmon_dev = NULL; + } + adapter->ahw->hwmon_dev = hwmon_dev; +} + +void qlcnic_unregister_hwmon_dev(struct qlcnic_adapter *adapter) +{ + struct device *hwmon_dev = adapter->ahw->hwmon_dev; + if (hwmon_dev) { + hwmon_device_unregister(hwmon_dev); + adapter->ahw->hwmon_dev = NULL; + } +} +#endif + +void qlcnic_create_sysfs_entries(struct qlcnic_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + + if (adapter->ahw->capabilities & QLCNIC_FW_CAPABILITY_BDG) + if (device_create_file(dev, &dev_attr_bridged_mode)) + dev_warn(dev, + "failed to create bridged_mode sysfs entry\n"); +} + +void qlcnic_remove_sysfs_entries(struct qlcnic_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + + if (adapter->ahw->capabilities & QLCNIC_FW_CAPABILITY_BDG) + device_remove_file(dev, &dev_attr_bridged_mode); +} + +static void qlcnic_create_diag_entries(struct qlcnic_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + + if (device_create_bin_file(dev, &bin_attr_port_stats)) + dev_info(dev, "failed to create port stats sysfs entry"); + + if (adapter->ahw->op_mode == QLCNIC_NON_PRIV_FUNC) + return; + if (device_create_file(dev, &dev_attr_diag_mode)) + dev_info(dev, "failed to create diag_mode sysfs entry\n"); + if (device_create_bin_file(dev, &bin_attr_crb)) + dev_info(dev, "failed to create crb sysfs entry\n"); + if (device_create_bin_file(dev, &bin_attr_mem)) + dev_info(dev, "failed to create mem sysfs entry\n"); + + if (test_bit(__QLCNIC_MAINTENANCE_MODE, &adapter->state)) + return; + + if (device_create_bin_file(dev, &bin_attr_pci_config)) + dev_info(dev, "failed to create pci config sysfs entry"); + + if (device_create_file(dev, &dev_attr_beacon)) + dev_info(dev, "failed to create beacon sysfs entry"); + + if (!(adapter->flags & QLCNIC_ESWITCH_ENABLED)) + return; + if (device_create_bin_file(dev, &bin_attr_esw_config)) + dev_info(dev, "failed to create esw config sysfs entry"); + if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) + return; + if (device_create_bin_file(dev, &bin_attr_npar_config)) + dev_info(dev, "failed to create npar config sysfs entry"); + if (device_create_bin_file(dev, &bin_attr_pm_config)) + dev_info(dev, "failed to create pm config sysfs entry"); + if (device_create_bin_file(dev, &bin_attr_esw_stats)) + dev_info(dev, "failed to create eswitch stats sysfs entry"); +} + +static void qlcnic_remove_diag_entries(struct qlcnic_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + + device_remove_bin_file(dev, &bin_attr_port_stats); + + if (adapter->ahw->op_mode == QLCNIC_NON_PRIV_FUNC) + return; + device_remove_file(dev, &dev_attr_diag_mode); + device_remove_bin_file(dev, &bin_attr_crb); + device_remove_bin_file(dev, &bin_attr_mem); + + if (test_bit(__QLCNIC_MAINTENANCE_MODE, &adapter->state)) + return; + + device_remove_bin_file(dev, &bin_attr_pci_config); + 
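/*
 * Sketch of the hwmon hookup shown just above (my_* names are hypothetical):
 * one SENSOR_DEVICE_ATTR, an attribute group generated by ATTRIBUTE_GROUPS(),
 * and hwmon_device_register_with_groups() binding the group to driver data.
 */
#include <linux/hwmon.h>
#include <linux/hwmon-sysfs.h>
#include <linux/kernel.h>

static ssize_t my_temp_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	/* a real driver reads the sensor via dev_get_drvdata(dev) */
	return sprintf(buf, "%u\n", 42000);	/* millidegrees Celsius */
}

static SENSOR_DEVICE_ATTR(temp1_input, 0444, my_temp_show, NULL, 1);

static struct attribute *my_hwmon_attrs[] = {
	&sensor_dev_attr_temp1_input.dev_attr.attr,
	NULL
};
ATTRIBUTE_GROUPS(my_hwmon);

static struct device *my_register_hwmon(struct device *dev, void *drvdata)
{
	/* returns the hwmon class device, or an ERR_PTR() on failure */
	return hwmon_device_register_with_groups(dev, "my_nic", drvdata,
						 my_hwmon_groups);
}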
device_remove_file(dev, &dev_attr_beacon); + if (!(adapter->flags & QLCNIC_ESWITCH_ENABLED)) + return; + device_remove_bin_file(dev, &bin_attr_esw_config); + if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) + return; + device_remove_bin_file(dev, &bin_attr_npar_config); + device_remove_bin_file(dev, &bin_attr_pm_config); + device_remove_bin_file(dev, &bin_attr_esw_stats); +} + +void qlcnic_82xx_add_sysfs(struct qlcnic_adapter *adapter) +{ + qlcnic_create_diag_entries(adapter); +} + +void qlcnic_82xx_remove_sysfs(struct qlcnic_adapter *adapter) +{ + qlcnic_remove_diag_entries(adapter); +} + +void qlcnic_83xx_add_sysfs(struct qlcnic_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + + qlcnic_create_diag_entries(adapter); + + if (sysfs_create_bin_file(&dev->kobj, &bin_attr_flash)) + dev_info(dev, "failed to create flash sysfs entry\n"); +} + +void qlcnic_83xx_remove_sysfs(struct qlcnic_adapter *adapter) +{ + struct device *dev = &adapter->pdev->dev; + + qlcnic_remove_diag_entries(adapter); + sysfs_remove_bin_file(&dev->kobj, &bin_attr_flash); +} diff --git a/drivers/net/ethernet/qlogic/qlge/Makefile b/drivers/net/ethernet/qlogic/qlge/Makefile new file mode 100644 index 0000000..8a19765 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlge/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Qlogic 10GbE PCI Express ethernet driver +# + +obj-$(CONFIG_QLGE) += qlge.o + +qlge-objs := qlge_main.o qlge_dbg.o qlge_mpi.o qlge_ethtool.o diff --git a/drivers/net/ethernet/qlogic/qlge/qlge.h b/drivers/net/ethernet/qlogic/qlge/qlge.h new file mode 100644 index 0000000..3e71b65 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlge/qlge.h @@ -0,0 +1,2354 @@ +/* + * QLogic QLA41xx NIC HBA Driver + * Copyright (c) 2003-2006 QLogic Corporation + * + * See LICENSE.qlge for copyright and licensing details. + */ +#ifndef _QLGE_H_ +#define _QLGE_H_ + +#include +#include +#include +#include +#include + +/* + * General definitions... + */ +#define DRV_NAME "qlge" +#define DRV_STRING "QLogic 10 Gigabit PCI-E Ethernet Driver " +#define DRV_VERSION "1.00.00.35" + +#define WQ_ADDR_ALIGN 0x3 /* 4 byte alignment */ + +#define QLGE_VENDOR_ID 0x1077 +#define QLGE_DEVICE_ID_8012 0x8012 +#define QLGE_DEVICE_ID_8000 0x8000 +#define QLGE_MEZZ_SSYS_ID_068 0x0068 +#define QLGE_MEZZ_SSYS_ID_180 0x0180 +#define MAX_CPUS 8 +#define MAX_TX_RINGS MAX_CPUS +#define MAX_RX_RINGS ((MAX_CPUS * 2) + 1) + +#define NUM_TX_RING_ENTRIES 256 +#define NUM_RX_RING_ENTRIES 256 + +#define NUM_SMALL_BUFFERS 512 +#define NUM_LARGE_BUFFERS 512 +#define DB_PAGE_SIZE 4096 + +/* Calculate the number of (4k) pages required to + * contain a buffer queue of the given length. + */ +#define MAX_DB_PAGES_PER_BQ(x) \ + (((x * sizeof(u64)) / DB_PAGE_SIZE) + \ + (((x * sizeof(u64)) % DB_PAGE_SIZE) ? 
1 : 0)) + +#define RX_RING_SHADOW_SPACE (sizeof(u64) + \ + MAX_DB_PAGES_PER_BQ(NUM_SMALL_BUFFERS) * sizeof(u64) + \ + MAX_DB_PAGES_PER_BQ(NUM_LARGE_BUFFERS) * sizeof(u64)) +#define LARGE_BUFFER_MAX_SIZE 8192 +#define LARGE_BUFFER_MIN_SIZE 2048 + +#define MAX_CQ 128 +#define DFLT_COALESCE_WAIT 100 /* 100 usec wait for coalescing */ +#define MAX_INTER_FRAME_WAIT 10 /* 10 usec max interframe-wait for coalescing */ +#define DFLT_INTER_FRAME_WAIT (MAX_INTER_FRAME_WAIT/2) +#define UDELAY_COUNT 3 +#define UDELAY_DELAY 100 + + +#define TX_DESC_PER_IOCB 8 + +#if ((MAX_SKB_FRAGS - TX_DESC_PER_IOCB) + 2) > 0 +#define TX_DESC_PER_OAL ((MAX_SKB_FRAGS - TX_DESC_PER_IOCB) + 2) +#else /* all other page sizes */ +#define TX_DESC_PER_OAL 0 +#endif + +/* Word shifting for converting 64-bit + * address to a series of 16-bit words. + * This is used for some MPI firmware + * mailbox commands. + */ +#define LSW(x) ((u16)(x)) +#define MSW(x) ((u16)((u32)(x) >> 16)) +#define LSD(x) ((u32)((u64)(x))) +#define MSD(x) ((u32)((((u64)(x)) >> 32))) + +/* MPI test register definitions. This register + * is used for determining alternate NIC function's + * PCI->func number. + */ +enum { + MPI_TEST_FUNC_PORT_CFG = 0x1002, + MPI_TEST_FUNC_PRB_CTL = 0x100e, + MPI_TEST_FUNC_PRB_EN = 0x18a20000, + MPI_TEST_FUNC_RST_STS = 0x100a, + MPI_TEST_FUNC_RST_FRC = 0x00000003, + MPI_TEST_NIC_FUNC_MASK = 0x00000007, + MPI_TEST_NIC1_FUNCTION_ENABLE = (1 << 0), + MPI_TEST_NIC1_FUNCTION_MASK = 0x0000000e, + MPI_TEST_NIC1_FUNC_SHIFT = 1, + MPI_TEST_NIC2_FUNCTION_ENABLE = (1 << 4), + MPI_TEST_NIC2_FUNCTION_MASK = 0x000000e0, + MPI_TEST_NIC2_FUNC_SHIFT = 5, + MPI_TEST_FC1_FUNCTION_ENABLE = (1 << 8), + MPI_TEST_FC1_FUNCTION_MASK = 0x00000e00, + MPI_TEST_FC1_FUNCTION_SHIFT = 9, + MPI_TEST_FC2_FUNCTION_ENABLE = (1 << 12), + MPI_TEST_FC2_FUNCTION_MASK = 0x0000e000, + MPI_TEST_FC2_FUNCTION_SHIFT = 13, + + MPI_NIC_READ = 0x00000000, + MPI_NIC_REG_BLOCK = 0x00020000, + MPI_NIC_FUNCTION_SHIFT = 6, +}; + +/* + * Processor Address Register (PROC_ADDR) bit definitions. + */ +enum { + + /* Misc. stuff */ + MAILBOX_COUNT = 16, + MAILBOX_TIMEOUT = 5, + + PROC_ADDR_RDY = (1 << 31), + PROC_ADDR_R = (1 << 30), + PROC_ADDR_ERR = (1 << 29), + PROC_ADDR_DA = (1 << 28), + PROC_ADDR_FUNC0_MBI = 0x00001180, + PROC_ADDR_FUNC0_MBO = (PROC_ADDR_FUNC0_MBI + MAILBOX_COUNT), + PROC_ADDR_FUNC0_CTL = 0x000011a1, + PROC_ADDR_FUNC2_MBI = 0x00001280, + PROC_ADDR_FUNC2_MBO = (PROC_ADDR_FUNC2_MBI + MAILBOX_COUNT), + PROC_ADDR_FUNC2_CTL = 0x000012a1, + PROC_ADDR_MPI_RISC = 0x00000000, + PROC_ADDR_MDE = 0x00010000, + PROC_ADDR_REGBLOCK = 0x00020000, + PROC_ADDR_RISC_REG = 0x00030000, +}; + +/* + * System Register (SYS) bit definitions. + */ +enum { + SYS_EFE = (1 << 0), + SYS_FAE = (1 << 1), + SYS_MDC = (1 << 2), + SYS_DST = (1 << 3), + SYS_DWC = (1 << 4), + SYS_EVW = (1 << 5), + SYS_OMP_DLY_MASK = 0x3f000000, + /* + * There are no values defined as of edit #15. + */ + SYS_ODI = (1 << 14), +}; + +/* + * Reset/Failover Register (RST_FO) bit definitions. + */ +enum { + RST_FO_TFO = (1 << 0), + RST_FO_RR_MASK = 0x00060000, + RST_FO_RR_CQ_CAM = 0x00000000, + RST_FO_RR_DROP = 0x00000002, + RST_FO_RR_DQ = 0x00000004, + RST_FO_RR_RCV_FUNC_CQ = 0x00000006, + RST_FO_FRB = (1 << 12), + RST_FO_MOP = (1 << 13), + RST_FO_REG = (1 << 14), + RST_FO_FR = (1 << 15), +}; + +/* + * Function Specific Control Register (FSC) bit definitions. 
+ */ +enum { + FSC_DBRST_MASK = 0x00070000, + FSC_DBRST_256 = 0x00000000, + FSC_DBRST_512 = 0x00000001, + FSC_DBRST_768 = 0x00000002, + FSC_DBRST_1024 = 0x00000003, + FSC_DBL_MASK = 0x00180000, + FSC_DBL_DBRST = 0x00000000, + FSC_DBL_MAX_PLD = 0x00000008, + FSC_DBL_MAX_BRST = 0x00000010, + FSC_DBL_128_BYTES = 0x00000018, + FSC_EC = (1 << 5), + FSC_EPC_MASK = 0x00c00000, + FSC_EPC_INBOUND = (1 << 6), + FSC_EPC_OUTBOUND = (1 << 7), + FSC_VM_PAGESIZE_MASK = 0x07000000, + FSC_VM_PAGE_2K = 0x00000100, + FSC_VM_PAGE_4K = 0x00000200, + FSC_VM_PAGE_8K = 0x00000300, + FSC_VM_PAGE_64K = 0x00000600, + FSC_SH = (1 << 11), + FSC_DSB = (1 << 12), + FSC_STE = (1 << 13), + FSC_FE = (1 << 15), +}; + +/* + * Host Command Status Register (CSR) bit definitions. + */ +enum { + CSR_ERR_STS_MASK = 0x0000003f, + /* + * There are no valued defined as of edit #15. + */ + CSR_RR = (1 << 8), + CSR_HRI = (1 << 9), + CSR_RP = (1 << 10), + CSR_CMD_PARM_SHIFT = 22, + CSR_CMD_NOP = 0x00000000, + CSR_CMD_SET_RST = 0x10000000, + CSR_CMD_CLR_RST = 0x20000000, + CSR_CMD_SET_PAUSE = 0x30000000, + CSR_CMD_CLR_PAUSE = 0x40000000, + CSR_CMD_SET_H2R_INT = 0x50000000, + CSR_CMD_CLR_H2R_INT = 0x60000000, + CSR_CMD_PAR_EN = 0x70000000, + CSR_CMD_SET_BAD_PAR = 0x80000000, + CSR_CMD_CLR_BAD_PAR = 0x90000000, + CSR_CMD_CLR_R2PCI_INT = 0xa0000000, +}; + +/* + * Configuration Register (CFG) bit definitions. + */ +enum { + CFG_LRQ = (1 << 0), + CFG_DRQ = (1 << 1), + CFG_LR = (1 << 2), + CFG_DR = (1 << 3), + CFG_LE = (1 << 5), + CFG_LCQ = (1 << 6), + CFG_DCQ = (1 << 7), + CFG_Q_SHIFT = 8, + CFG_Q_MASK = 0x7f000000, +}; + +/* + * Status Register (STS) bit definitions. + */ +enum { + STS_FE = (1 << 0), + STS_PI = (1 << 1), + STS_PL0 = (1 << 2), + STS_PL1 = (1 << 3), + STS_PI0 = (1 << 4), + STS_PI1 = (1 << 5), + STS_FUNC_ID_MASK = 0x000000c0, + STS_FUNC_ID_SHIFT = 6, + STS_F0E = (1 << 8), + STS_F1E = (1 << 9), + STS_F2E = (1 << 10), + STS_F3E = (1 << 11), + STS_NFE = (1 << 12), +}; + +/* + * Interrupt Enable Register (INTR_EN) bit definitions. + */ +enum { + INTR_EN_INTR_MASK = 0x007f0000, + INTR_EN_TYPE_MASK = 0x03000000, + INTR_EN_TYPE_ENABLE = 0x00000100, + INTR_EN_TYPE_DISABLE = 0x00000200, + INTR_EN_TYPE_READ = 0x00000300, + INTR_EN_IHD = (1 << 13), + INTR_EN_IHD_MASK = (INTR_EN_IHD << 16), + INTR_EN_EI = (1 << 14), + INTR_EN_EN = (1 << 15), +}; + +/* + * Interrupt Mask Register (INTR_MASK) bit definitions. + */ +enum { + INTR_MASK_PI = (1 << 0), + INTR_MASK_HL0 = (1 << 1), + INTR_MASK_LH0 = (1 << 2), + INTR_MASK_HL1 = (1 << 3), + INTR_MASK_LH1 = (1 << 4), + INTR_MASK_SE = (1 << 5), + INTR_MASK_LSC = (1 << 6), + INTR_MASK_MC = (1 << 7), + INTR_MASK_LINK_IRQS = INTR_MASK_LSC | INTR_MASK_SE | INTR_MASK_MC, +}; + +/* + * Register (REV_ID) bit definitions. + */ +enum { + REV_ID_MASK = 0x0000000f, + REV_ID_NICROLL_SHIFT = 0, + REV_ID_NICREV_SHIFT = 4, + REV_ID_XGROLL_SHIFT = 8, + REV_ID_XGREV_SHIFT = 12, + REV_ID_CHIPREV_SHIFT = 28, +}; + +/* + * Force ECC Error Register (FRC_ECC_ERR) bit definitions. + */ +enum { + FRC_ECC_ERR_VW = (1 << 12), + FRC_ECC_ERR_VB = (1 << 13), + FRC_ECC_ERR_NI = (1 << 14), + FRC_ECC_ERR_NO = (1 << 15), + FRC_ECC_PFE_SHIFT = 16, + FRC_ECC_ERR_DO = (1 << 18), + FRC_ECC_P14 = (1 << 19), +}; + +/* + * Error Status Register (ERR_STS) bit definitions. 
+ */ +enum { + ERR_STS_NOF = (1 << 0), + ERR_STS_NIF = (1 << 1), + ERR_STS_DRP = (1 << 2), + ERR_STS_XGP = (1 << 3), + ERR_STS_FOU = (1 << 4), + ERR_STS_FOC = (1 << 5), + ERR_STS_FOF = (1 << 6), + ERR_STS_FIU = (1 << 7), + ERR_STS_FIC = (1 << 8), + ERR_STS_FIF = (1 << 9), + ERR_STS_MOF = (1 << 10), + ERR_STS_TA = (1 << 11), + ERR_STS_MA = (1 << 12), + ERR_STS_MPE = (1 << 13), + ERR_STS_SCE = (1 << 14), + ERR_STS_STE = (1 << 15), + ERR_STS_FOW = (1 << 16), + ERR_STS_UE = (1 << 17), + ERR_STS_MCH = (1 << 26), + ERR_STS_LOC_SHIFT = 27, +}; + +/* + * RAM Debug Address Register (RAM_DBG_ADDR) bit definitions. + */ +enum { + RAM_DBG_ADDR_FW = (1 << 30), + RAM_DBG_ADDR_FR = (1 << 31), +}; + +/* + * Semaphore Register (SEM) bit definitions. + */ +enum { + /* + * Example: + * reg = SEM_XGMAC0_MASK | (SEM_SET << SEM_XGMAC0_SHIFT) + */ + SEM_CLEAR = 0, + SEM_SET = 1, + SEM_FORCE = 3, + SEM_XGMAC0_SHIFT = 0, + SEM_XGMAC1_SHIFT = 2, + SEM_ICB_SHIFT = 4, + SEM_MAC_ADDR_SHIFT = 6, + SEM_FLASH_SHIFT = 8, + SEM_PROBE_SHIFT = 10, + SEM_RT_IDX_SHIFT = 12, + SEM_PROC_REG_SHIFT = 14, + SEM_XGMAC0_MASK = 0x00030000, + SEM_XGMAC1_MASK = 0x000c0000, + SEM_ICB_MASK = 0x00300000, + SEM_MAC_ADDR_MASK = 0x00c00000, + SEM_FLASH_MASK = 0x03000000, + SEM_PROBE_MASK = 0x0c000000, + SEM_RT_IDX_MASK = 0x30000000, + SEM_PROC_REG_MASK = 0xc0000000, +}; + +/* + * 10G MAC Address Register (XGMAC_ADDR) bit definitions. + */ +enum { + XGMAC_ADDR_RDY = (1 << 31), + XGMAC_ADDR_R = (1 << 30), + XGMAC_ADDR_XME = (1 << 29), + + /* XGMAC control registers */ + PAUSE_SRC_LO = 0x00000100, + PAUSE_SRC_HI = 0x00000104, + GLOBAL_CFG = 0x00000108, + GLOBAL_CFG_RESET = (1 << 0), + GLOBAL_CFG_JUMBO = (1 << 6), + GLOBAL_CFG_TX_STAT_EN = (1 << 10), + GLOBAL_CFG_RX_STAT_EN = (1 << 11), + TX_CFG = 0x0000010c, + TX_CFG_RESET = (1 << 0), + TX_CFG_EN = (1 << 1), + TX_CFG_PREAM = (1 << 2), + RX_CFG = 0x00000110, + RX_CFG_RESET = (1 << 0), + RX_CFG_EN = (1 << 1), + RX_CFG_PREAM = (1 << 2), + FLOW_CTL = 0x0000011c, + PAUSE_OPCODE = 0x00000120, + PAUSE_TIMER = 0x00000124, + PAUSE_FRM_DEST_LO = 0x00000128, + PAUSE_FRM_DEST_HI = 0x0000012c, + MAC_TX_PARAMS = 0x00000134, + MAC_TX_PARAMS_JUMBO = (1 << 31), + MAC_TX_PARAMS_SIZE_SHIFT = 16, + MAC_RX_PARAMS = 0x00000138, + MAC_SYS_INT = 0x00000144, + MAC_SYS_INT_MASK = 0x00000148, + MAC_MGMT_INT = 0x0000014c, + MAC_MGMT_IN_MASK = 0x00000150, + EXT_ARB_MODE = 0x000001fc, + + /* XGMAC TX statistics registers */ + TX_PKTS = 0x00000200, + TX_BYTES = 0x00000208, + TX_MCAST_PKTS = 0x00000210, + TX_BCAST_PKTS = 0x00000218, + TX_UCAST_PKTS = 0x00000220, + TX_CTL_PKTS = 0x00000228, + TX_PAUSE_PKTS = 0x00000230, + TX_64_PKT = 0x00000238, + TX_65_TO_127_PKT = 0x00000240, + TX_128_TO_255_PKT = 0x00000248, + TX_256_511_PKT = 0x00000250, + TX_512_TO_1023_PKT = 0x00000258, + TX_1024_TO_1518_PKT = 0x00000260, + TX_1519_TO_MAX_PKT = 0x00000268, + TX_UNDERSIZE_PKT = 0x00000270, + TX_OVERSIZE_PKT = 0x00000278, + + /* XGMAC statistics control registers */ + RX_HALF_FULL_DET = 0x000002a0, + TX_HALF_FULL_DET = 0x000002a4, + RX_OVERFLOW_DET = 0x000002a8, + TX_OVERFLOW_DET = 0x000002ac, + RX_HALF_FULL_MASK = 0x000002b0, + TX_HALF_FULL_MASK = 0x000002b4, + RX_OVERFLOW_MASK = 0x000002b8, + TX_OVERFLOW_MASK = 0x000002bc, + STAT_CNT_CTL = 0x000002c0, + STAT_CNT_CTL_CLEAR_TX = (1 << 0), + STAT_CNT_CTL_CLEAR_RX = (1 << 1), + AUX_RX_HALF_FULL_DET = 0x000002d0, + AUX_TX_HALF_FULL_DET = 0x000002d4, + AUX_RX_OVERFLOW_DET = 0x000002d8, + AUX_TX_OVERFLOW_DET = 0x000002dc, + AUX_RX_HALF_FULL_MASK = 0x000002f0, + AUX_TX_HALF_FULL_MASK = 
0x000002f4, + AUX_RX_OVERFLOW_MASK = 0x000002f8, + AUX_TX_OVERFLOW_MASK = 0x000002fc, + + /* XGMAC RX statistics registers */ + RX_BYTES = 0x00000300, + RX_BYTES_OK = 0x00000308, + RX_PKTS = 0x00000310, + RX_PKTS_OK = 0x00000318, + RX_BCAST_PKTS = 0x00000320, + RX_MCAST_PKTS = 0x00000328, + RX_UCAST_PKTS = 0x00000330, + RX_UNDERSIZE_PKTS = 0x00000338, + RX_OVERSIZE_PKTS = 0x00000340, + RX_JABBER_PKTS = 0x00000348, + RX_UNDERSIZE_FCERR_PKTS = 0x00000350, + RX_DROP_EVENTS = 0x00000358, + RX_FCERR_PKTS = 0x00000360, + RX_ALIGN_ERR = 0x00000368, + RX_SYMBOL_ERR = 0x00000370, + RX_MAC_ERR = 0x00000378, + RX_CTL_PKTS = 0x00000380, + RX_PAUSE_PKTS = 0x00000388, + RX_64_PKTS = 0x00000390, + RX_65_TO_127_PKTS = 0x00000398, + RX_128_255_PKTS = 0x000003a0, + RX_256_511_PKTS = 0x000003a8, + RX_512_TO_1023_PKTS = 0x000003b0, + RX_1024_TO_1518_PKTS = 0x000003b8, + RX_1519_TO_MAX_PKTS = 0x000003c0, + RX_LEN_ERR_PKTS = 0x000003c8, + + /* XGMAC MDIO control registers */ + MDIO_TX_DATA = 0x00000400, + MDIO_RX_DATA = 0x00000410, + MDIO_CMD = 0x00000420, + MDIO_PHY_ADDR = 0x00000430, + MDIO_PORT = 0x00000440, + MDIO_STATUS = 0x00000450, + + XGMAC_REGISTER_END = 0x00000740, +}; + +/* + * Enhanced Transmission Schedule Registers (NIC_ETS,CNA_ETS) bit definitions. + */ +enum { + ETS_QUEUE_SHIFT = 29, + ETS_REF = (1 << 26), + ETS_RS = (1 << 27), + ETS_P = (1 << 28), + ETS_FC_COS_SHIFT = 23, +}; + +/* + * Flash Address Register (FLASH_ADDR) bit definitions. + */ +enum { + FLASH_ADDR_RDY = (1 << 31), + FLASH_ADDR_R = (1 << 30), + FLASH_ADDR_ERR = (1 << 29), +}; + +/* + * Stop CQ Processing Register (CQ_STOP) bit definitions. + */ +enum { + CQ_STOP_QUEUE_MASK = (0x007f0000), + CQ_STOP_TYPE_MASK = (0x03000000), + CQ_STOP_TYPE_START = 0x00000100, + CQ_STOP_TYPE_STOP = 0x00000200, + CQ_STOP_TYPE_READ = 0x00000300, + CQ_STOP_EN = (1 << 15), +}; + +/* + * MAC Protocol Address Index Register (MAC_ADDR_IDX) bit definitions. + */ +enum { + MAC_ADDR_IDX_SHIFT = 4, + MAC_ADDR_TYPE_SHIFT = 16, + MAC_ADDR_TYPE_COUNT = 10, + MAC_ADDR_TYPE_MASK = 0x000f0000, + MAC_ADDR_TYPE_CAM_MAC = 0x00000000, + MAC_ADDR_TYPE_MULTI_MAC = 0x00010000, + MAC_ADDR_TYPE_VLAN = 0x00020000, + MAC_ADDR_TYPE_MULTI_FLTR = 0x00030000, + MAC_ADDR_TYPE_FC_MAC = 0x00040000, + MAC_ADDR_TYPE_MGMT_MAC = 0x00050000, + MAC_ADDR_TYPE_MGMT_VLAN = 0x00060000, + MAC_ADDR_TYPE_MGMT_V4 = 0x00070000, + MAC_ADDR_TYPE_MGMT_V6 = 0x00080000, + MAC_ADDR_TYPE_MGMT_TU_DP = 0x00090000, + MAC_ADDR_ADR = (1 << 25), + MAC_ADDR_RS = (1 << 26), + MAC_ADDR_E = (1 << 27), + MAC_ADDR_MR = (1 << 30), + MAC_ADDR_MW = (1 << 31), + MAX_MULTICAST_ENTRIES = 32, + + /* Entry count and words per entry + * for each address type in the filter. + */ + MAC_ADDR_MAX_CAM_ENTRIES = 512, + MAC_ADDR_MAX_CAM_WCOUNT = 3, + MAC_ADDR_MAX_MULTICAST_ENTRIES = 32, + MAC_ADDR_MAX_MULTICAST_WCOUNT = 2, + MAC_ADDR_MAX_VLAN_ENTRIES = 4096, + MAC_ADDR_MAX_VLAN_WCOUNT = 1, + MAC_ADDR_MAX_MCAST_FLTR_ENTRIES = 4096, + MAC_ADDR_MAX_MCAST_FLTR_WCOUNT = 1, + MAC_ADDR_MAX_FC_MAC_ENTRIES = 4, + MAC_ADDR_MAX_FC_MAC_WCOUNT = 2, + MAC_ADDR_MAX_MGMT_MAC_ENTRIES = 8, + MAC_ADDR_MAX_MGMT_MAC_WCOUNT = 2, + MAC_ADDR_MAX_MGMT_VLAN_ENTRIES = 16, + MAC_ADDR_MAX_MGMT_VLAN_WCOUNT = 1, + MAC_ADDR_MAX_MGMT_V4_ENTRIES = 4, + MAC_ADDR_MAX_MGMT_V4_WCOUNT = 1, + MAC_ADDR_MAX_MGMT_V6_ENTRIES = 4, + MAC_ADDR_MAX_MGMT_V6_WCOUNT = 4, + MAC_ADDR_MAX_MGMT_TU_DP_ENTRIES = 4, + MAC_ADDR_MAX_MGMT_TU_DP_WCOUNT = 1, +}; + +/* + * MAC Protocol Address Index Register (SPLT_HDR) bit definitions. 
+ */ +enum { + SPLT_HDR_EP = (1 << 31), +}; + +/* + * FCoE Receive Configuration Register (FC_RCV_CFG) bit definitions. + */ +enum { + FC_RCV_CFG_ECT = (1 << 15), + FC_RCV_CFG_DFH = (1 << 20), + FC_RCV_CFG_DVF = (1 << 21), + FC_RCV_CFG_RCE = (1 << 27), + FC_RCV_CFG_RFE = (1 << 28), + FC_RCV_CFG_TEE = (1 << 29), + FC_RCV_CFG_TCE = (1 << 30), + FC_RCV_CFG_TFE = (1 << 31), +}; + +/* + * NIC Receive Configuration Register (NIC_RCV_CFG) bit definitions. + */ +enum { + NIC_RCV_CFG_PPE = (1 << 0), + NIC_RCV_CFG_VLAN_MASK = 0x00060000, + NIC_RCV_CFG_VLAN_ALL = 0x00000000, + NIC_RCV_CFG_VLAN_MATCH_ONLY = 0x00000002, + NIC_RCV_CFG_VLAN_MATCH_AND_NON = 0x00000004, + NIC_RCV_CFG_VLAN_NONE_AND_NON = 0x00000006, + NIC_RCV_CFG_RV = (1 << 3), + NIC_RCV_CFG_DFQ_MASK = (0x7f000000), + NIC_RCV_CFG_DFQ_SHIFT = 8, + NIC_RCV_CFG_DFQ = 0, /* HARDCODE default queue to 0. */ +}; + +/* + * Mgmt Receive Configuration Register (MGMT_RCV_CFG) bit definitions. + */ +enum { + MGMT_RCV_CFG_ARP = (1 << 0), + MGMT_RCV_CFG_DHC = (1 << 1), + MGMT_RCV_CFG_DHS = (1 << 2), + MGMT_RCV_CFG_NP = (1 << 3), + MGMT_RCV_CFG_I6N = (1 << 4), + MGMT_RCV_CFG_I6R = (1 << 5), + MGMT_RCV_CFG_DH6 = (1 << 6), + MGMT_RCV_CFG_UD1 = (1 << 7), + MGMT_RCV_CFG_UD0 = (1 << 8), + MGMT_RCV_CFG_BCT = (1 << 9), + MGMT_RCV_CFG_MCT = (1 << 10), + MGMT_RCV_CFG_DM = (1 << 11), + MGMT_RCV_CFG_RM = (1 << 12), + MGMT_RCV_CFG_STL = (1 << 13), + MGMT_RCV_CFG_VLAN_MASK = 0xc0000000, + MGMT_RCV_CFG_VLAN_ALL = 0x00000000, + MGMT_RCV_CFG_VLAN_MATCH_ONLY = 0x00004000, + MGMT_RCV_CFG_VLAN_MATCH_AND_NON = 0x00008000, + MGMT_RCV_CFG_VLAN_NONE_AND_NON = 0x0000c000, +}; + +/* + * Routing Index Register (RT_IDX) bit definitions. + */ +enum { + RT_IDX_IDX_SHIFT = 8, + RT_IDX_TYPE_MASK = 0x000f0000, + RT_IDX_TYPE_SHIFT = 16, + RT_IDX_TYPE_RT = 0x00000000, + RT_IDX_TYPE_RT_INV = 0x00010000, + RT_IDX_TYPE_NICQ = 0x00020000, + RT_IDX_TYPE_NICQ_INV = 0x00030000, + RT_IDX_DST_MASK = 0x00700000, + RT_IDX_DST_RSS = 0x00000000, + RT_IDX_DST_CAM_Q = 0x00100000, + RT_IDX_DST_COS_Q = 0x00200000, + RT_IDX_DST_DFLT_Q = 0x00300000, + RT_IDX_DST_DEST_Q = 0x00400000, + RT_IDX_RS = (1 << 26), + RT_IDX_E = (1 << 27), + RT_IDX_MR = (1 << 30), + RT_IDX_MW = (1 << 31), + + /* Nic Queue format - type 2 bits */ + RT_IDX_BCAST = (1 << 0), + RT_IDX_MCAST = (1 << 1), + RT_IDX_MCAST_MATCH = (1 << 2), + RT_IDX_MCAST_REG_MATCH = (1 << 3), + RT_IDX_MCAST_HASH_MATCH = (1 << 4), + RT_IDX_FC_MACH = (1 << 5), + RT_IDX_ETH_FCOE = (1 << 6), + RT_IDX_CAM_HIT = (1 << 7), + RT_IDX_CAM_BIT0 = (1 << 8), + RT_IDX_CAM_BIT1 = (1 << 9), + RT_IDX_VLAN_TAG = (1 << 10), + RT_IDX_VLAN_MATCH = (1 << 11), + RT_IDX_VLAN_FILTER = (1 << 12), + RT_IDX_ETH_SKIP1 = (1 << 13), + RT_IDX_ETH_SKIP2 = (1 << 14), + RT_IDX_BCAST_MCAST_MATCH = (1 << 15), + RT_IDX_802_3 = (1 << 16), + RT_IDX_LLDP = (1 << 17), + RT_IDX_UNUSED018 = (1 << 18), + RT_IDX_UNUSED019 = (1 << 19), + RT_IDX_UNUSED20 = (1 << 20), + RT_IDX_UNUSED21 = (1 << 21), + RT_IDX_ERR = (1 << 22), + RT_IDX_VALID = (1 << 23), + RT_IDX_TU_CSUM_ERR = (1 << 24), + RT_IDX_IP_CSUM_ERR = (1 << 25), + RT_IDX_MAC_ERR = (1 << 26), + RT_IDX_RSS_TCP6 = (1 << 27), + RT_IDX_RSS_TCP4 = (1 << 28), + RT_IDX_RSS_IPV6 = (1 << 29), + RT_IDX_RSS_IPV4 = (1 << 30), + RT_IDX_RSS_MATCH = (1 << 31), + + /* Hierarchy for the NIC Queue Mask */ + RT_IDX_ALL_ERR_SLOT = 0, + RT_IDX_MAC_ERR_SLOT = 0, + RT_IDX_IP_CSUM_ERR_SLOT = 1, + RT_IDX_TCP_UDP_CSUM_ERR_SLOT = 2, + RT_IDX_BCAST_SLOT = 3, + RT_IDX_MCAST_MATCH_SLOT = 4, + RT_IDX_ALLMULTI_SLOT = 5, + RT_IDX_UNUSED6_SLOT = 6, + RT_IDX_UNUSED7_SLOT = 7, + 
RT_IDX_RSS_MATCH_SLOT = 8, + RT_IDX_RSS_IPV4_SLOT = 8, + RT_IDX_RSS_IPV6_SLOT = 9, + RT_IDX_RSS_TCP4_SLOT = 10, + RT_IDX_RSS_TCP6_SLOT = 11, + RT_IDX_CAM_HIT_SLOT = 12, + RT_IDX_UNUSED013 = 13, + RT_IDX_UNUSED014 = 14, + RT_IDX_PROMISCUOUS_SLOT = 15, + RT_IDX_MAX_RT_SLOTS = 8, + RT_IDX_MAX_NIC_SLOTS = 16, +}; + +/* + * Serdes Address Register (XG_SERDES_ADDR) bit definitions. + */ +enum { + XG_SERDES_ADDR_RDY = (1 << 31), + XG_SERDES_ADDR_R = (1 << 30), + + XG_SERDES_ADDR_STS = 0x00001E06, + XG_SERDES_ADDR_XFI1_PWR_UP = 0x00000005, + XG_SERDES_ADDR_XFI2_PWR_UP = 0x0000000a, + XG_SERDES_ADDR_XAUI_PWR_DOWN = 0x00000001, + + /* Serdes coredump definitions. */ + XG_SERDES_XAUI_AN_START = 0x00000000, + XG_SERDES_XAUI_AN_END = 0x00000034, + XG_SERDES_XAUI_HSS_PCS_START = 0x00000800, + XG_SERDES_XAUI_HSS_PCS_END = 0x0000880, + XG_SERDES_XFI_AN_START = 0x00001000, + XG_SERDES_XFI_AN_END = 0x00001034, + XG_SERDES_XFI_TRAIN_START = 0x10001050, + XG_SERDES_XFI_TRAIN_END = 0x1000107C, + XG_SERDES_XFI_HSS_PCS_START = 0x00001800, + XG_SERDES_XFI_HSS_PCS_END = 0x00001838, + XG_SERDES_XFI_HSS_TX_START = 0x00001c00, + XG_SERDES_XFI_HSS_TX_END = 0x00001c1f, + XG_SERDES_XFI_HSS_RX_START = 0x00001c40, + XG_SERDES_XFI_HSS_RX_END = 0x00001c5f, + XG_SERDES_XFI_HSS_PLL_START = 0x00001e00, + XG_SERDES_XFI_HSS_PLL_END = 0x00001e1f, +}; + +/* + * NIC Probe Mux Address Register (PRB_MX_ADDR) bit definitions. + */ +enum { + PRB_MX_ADDR_ARE = (1 << 16), + PRB_MX_ADDR_UP = (1 << 15), + PRB_MX_ADDR_SWP = (1 << 14), + + /* Module select values. */ + PRB_MX_ADDR_MAX_MODS = 21, + PRB_MX_ADDR_MOD_SEL_SHIFT = 9, + PRB_MX_ADDR_MOD_SEL_TBD = 0, + PRB_MX_ADDR_MOD_SEL_IDE1 = 1, + PRB_MX_ADDR_MOD_SEL_IDE2 = 2, + PRB_MX_ADDR_MOD_SEL_FRB = 3, + PRB_MX_ADDR_MOD_SEL_ODE1 = 4, + PRB_MX_ADDR_MOD_SEL_ODE2 = 5, + PRB_MX_ADDR_MOD_SEL_DA1 = 6, + PRB_MX_ADDR_MOD_SEL_DA2 = 7, + PRB_MX_ADDR_MOD_SEL_IMP1 = 8, + PRB_MX_ADDR_MOD_SEL_IMP2 = 9, + PRB_MX_ADDR_MOD_SEL_OMP1 = 10, + PRB_MX_ADDR_MOD_SEL_OMP2 = 11, + PRB_MX_ADDR_MOD_SEL_ORS1 = 12, + PRB_MX_ADDR_MOD_SEL_ORS2 = 13, + PRB_MX_ADDR_MOD_SEL_REG = 14, + PRB_MX_ADDR_MOD_SEL_MAC1 = 16, + PRB_MX_ADDR_MOD_SEL_MAC2 = 17, + PRB_MX_ADDR_MOD_SEL_VQM1 = 18, + PRB_MX_ADDR_MOD_SEL_VQM2 = 19, + PRB_MX_ADDR_MOD_SEL_MOP = 20, + /* Bit fields indicating which modules + * are valid for each clock domain. + */ + PRB_MX_ADDR_VALID_SYS_MOD = 0x000f7ff7, + PRB_MX_ADDR_VALID_PCI_MOD = 0x000040c1, + PRB_MX_ADDR_VALID_XGM_MOD = 0x00037309, + PRB_MX_ADDR_VALID_FC_MOD = 0x00003001, + PRB_MX_ADDR_VALID_TOTAL = 34, + + /* Clock domain values. 
*/ + PRB_MX_ADDR_CLOCK_SHIFT = 6, + PRB_MX_ADDR_SYS_CLOCK = 0, + PRB_MX_ADDR_PCI_CLOCK = 2, + PRB_MX_ADDR_FC_CLOCK = 5, + PRB_MX_ADDR_XGM_CLOCK = 6, + + PRB_MX_ADDR_MAX_MUX = 64, +}; + +/* + * Control Register Set Map + */ +enum { + PROC_ADDR = 0, /* Use semaphore */ + PROC_DATA = 0x04, /* Use semaphore */ + SYS = 0x08, + RST_FO = 0x0c, + FSC = 0x10, + CSR = 0x14, + LED = 0x18, + ICB_RID = 0x1c, /* Use semaphore */ + ICB_L = 0x20, /* Use semaphore */ + ICB_H = 0x24, /* Use semaphore */ + CFG = 0x28, + BIOS_ADDR = 0x2c, + STS = 0x30, + INTR_EN = 0x34, + INTR_MASK = 0x38, + ISR1 = 0x3c, + ISR2 = 0x40, + ISR3 = 0x44, + ISR4 = 0x48, + REV_ID = 0x4c, + FRC_ECC_ERR = 0x50, + ERR_STS = 0x54, + RAM_DBG_ADDR = 0x58, + RAM_DBG_DATA = 0x5c, + ECC_ERR_CNT = 0x60, + SEM = 0x64, + GPIO_1 = 0x68, /* Use semaphore */ + GPIO_2 = 0x6c, /* Use semaphore */ + GPIO_3 = 0x70, /* Use semaphore */ + RSVD2 = 0x74, + XGMAC_ADDR = 0x78, /* Use semaphore */ + XGMAC_DATA = 0x7c, /* Use semaphore */ + NIC_ETS = 0x80, + CNA_ETS = 0x84, + FLASH_ADDR = 0x88, /* Use semaphore */ + FLASH_DATA = 0x8c, /* Use semaphore */ + CQ_STOP = 0x90, + PAGE_TBL_RID = 0x94, + WQ_PAGE_TBL_LO = 0x98, + WQ_PAGE_TBL_HI = 0x9c, + CQ_PAGE_TBL_LO = 0xa0, + CQ_PAGE_TBL_HI = 0xa4, + MAC_ADDR_IDX = 0xa8, /* Use semaphore */ + MAC_ADDR_DATA = 0xac, /* Use semaphore */ + COS_DFLT_CQ1 = 0xb0, + COS_DFLT_CQ2 = 0xb4, + ETYPE_SKIP1 = 0xb8, + ETYPE_SKIP2 = 0xbc, + SPLT_HDR = 0xc0, + FC_PAUSE_THRES = 0xc4, + NIC_PAUSE_THRES = 0xc8, + FC_ETHERTYPE = 0xcc, + FC_RCV_CFG = 0xd0, + NIC_RCV_CFG = 0xd4, + FC_COS_TAGS = 0xd8, + NIC_COS_TAGS = 0xdc, + MGMT_RCV_CFG = 0xe0, + RT_IDX = 0xe4, + RT_DATA = 0xe8, + RSVD7 = 0xec, + XG_SERDES_ADDR = 0xf0, + XG_SERDES_DATA = 0xf4, + PRB_MX_ADDR = 0xf8, /* Use semaphore */ + PRB_MX_DATA = 0xfc, /* Use semaphore */ +}; + +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +#define SMALL_BUFFER_SIZE 256 +#define SMALL_BUF_MAP_SIZE SMALL_BUFFER_SIZE +#define SPLT_SETTING FSC_DBRST_1024 +#define SPLT_LEN 0 +#define QLGE_SB_PAD 0 +#else +#define SMALL_BUFFER_SIZE 512 +#define SMALL_BUF_MAP_SIZE (SMALL_BUFFER_SIZE / 2) +#define SPLT_SETTING FSC_SH +#define SPLT_LEN (SPLT_HDR_EP | \ + min(SMALL_BUF_MAP_SIZE, 1023)) +#define QLGE_SB_PAD 32 +#endif + +/* + * CAM output format. + */ +enum { + CAM_OUT_ROUTE_FC = 0, + CAM_OUT_ROUTE_NIC = 1, + CAM_OUT_FUNC_SHIFT = 2, + CAM_OUT_RV = (1 << 4), + CAM_OUT_SH = (1 << 15), + CAM_OUT_CQ_ID_SHIFT = 5, +}; + +/* + * Mailbox definitions + */ +enum { + /* Asynchronous Event Notifications */ + AEN_SYS_ERR = 0x00008002, + AEN_LINK_UP = 0x00008011, + AEN_LINK_DOWN = 0x00008012, + AEN_IDC_CMPLT = 0x00008100, + AEN_IDC_REQ = 0x00008101, + AEN_IDC_EXT = 0x00008102, + AEN_DCBX_CHG = 0x00008110, + AEN_AEN_LOST = 0x00008120, + AEN_AEN_SFP_IN = 0x00008130, + AEN_AEN_SFP_OUT = 0x00008131, + AEN_FW_INIT_DONE = 0x00008400, + AEN_FW_INIT_FAIL = 0x00008401, + + /* Mailbox Command Opcodes. 
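+	 * Roughly: the host loads an opcode and its parameters into
+	 * mbox_in[] of struct mbox_params (defined further down) and the
+	 * firmware returns an MB_CMD_STS_* completion status in mbox_out[].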
*/ + MB_CMD_NOP = 0x00000000, + MB_CMD_EX_FW = 0x00000002, + MB_CMD_MB_TEST = 0x00000006, + MB_CMD_CSUM_TEST = 0x00000007, /* Verify Checksum */ + MB_CMD_ABOUT_FW = 0x00000008, + MB_CMD_COPY_RISC_RAM = 0x0000000a, + MB_CMD_LOAD_RISC_RAM = 0x0000000b, + MB_CMD_DUMP_RISC_RAM = 0x0000000c, + MB_CMD_WRITE_RAM = 0x0000000d, + MB_CMD_INIT_RISC_RAM = 0x0000000e, + MB_CMD_READ_RAM = 0x0000000f, + MB_CMD_STOP_FW = 0x00000014, + MB_CMD_MAKE_SYS_ERR = 0x0000002a, + MB_CMD_WRITE_SFP = 0x00000030, + MB_CMD_READ_SFP = 0x00000031, + MB_CMD_INIT_FW = 0x00000060, + MB_CMD_GET_IFCB = 0x00000061, + MB_CMD_GET_FW_STATE = 0x00000069, + MB_CMD_IDC_REQ = 0x00000100, /* Inter-Driver Communication */ + MB_CMD_IDC_ACK = 0x00000101, /* Inter-Driver Communication */ + MB_CMD_SET_WOL_MODE = 0x00000110, /* Wake On Lan */ + MB_WOL_DISABLE = 0, + MB_WOL_MAGIC_PKT = (1 << 1), + MB_WOL_FLTR = (1 << 2), + MB_WOL_UCAST = (1 << 3), + MB_WOL_MCAST = (1 << 4), + MB_WOL_BCAST = (1 << 5), + MB_WOL_LINK_UP = (1 << 6), + MB_WOL_LINK_DOWN = (1 << 7), + MB_WOL_MODE_ON = (1 << 16), /* Wake on Lan Mode on */ + MB_CMD_SET_WOL_FLTR = 0x00000111, /* Wake On Lan Filter */ + MB_CMD_CLEAR_WOL_FLTR = 0x00000112, /* Wake On Lan Filter */ + MB_CMD_SET_WOL_MAGIC = 0x00000113, /* Wake On Lan Magic Packet */ + MB_CMD_CLEAR_WOL_MAGIC = 0x00000114,/* Wake On Lan Magic Packet */ + MB_CMD_SET_WOL_IMMED = 0x00000115, + MB_CMD_PORT_RESET = 0x00000120, + MB_CMD_SET_PORT_CFG = 0x00000122, + MB_CMD_GET_PORT_CFG = 0x00000123, + MB_CMD_GET_LINK_STS = 0x00000124, + MB_CMD_SET_LED_CFG = 0x00000125, /* Set LED Configuration Register */ + QL_LED_BLINK = 0x03e803e8, + MB_CMD_GET_LED_CFG = 0x00000126, /* Get LED Configuration Register */ + MB_CMD_SET_MGMNT_TFK_CTL = 0x00000160, /* Set Mgmnt Traffic Control */ + MB_SET_MPI_TFK_STOP = (1 << 0), + MB_SET_MPI_TFK_RESUME = (1 << 1), + MB_CMD_GET_MGMNT_TFK_CTL = 0x00000161, /* Get Mgmnt Traffic Control */ + MB_GET_MPI_TFK_STOPPED = (1 << 0), + MB_GET_MPI_TFK_FIFO_EMPTY = (1 << 1), + /* Sub-commands for IDC request. + * This describes the reason for the + * IDC request. + */ + MB_CMD_IOP_NONE = 0x0000, + MB_CMD_IOP_PREP_UPDATE_MPI = 0x0001, + MB_CMD_IOP_COMP_UPDATE_MPI = 0x0002, + MB_CMD_IOP_PREP_LINK_DOWN = 0x0010, + MB_CMD_IOP_DVR_START = 0x0100, + MB_CMD_IOP_FLASH_ACC = 0x0101, + MB_CMD_IOP_RESTART_MPI = 0x0102, + MB_CMD_IOP_CORE_DUMP_MPI = 0x0103, + + /* Mailbox Command Status. */ + MB_CMD_STS_GOOD = 0x00004000, /* Success. */ + MB_CMD_STS_INTRMDT = 0x00001000, /* Intermediate Complete. */ + MB_CMD_STS_INVLD_CMD = 0x00004001, /* Invalid. */ + MB_CMD_STS_XFC_ERR = 0x00004002, /* Interface Error. */ + MB_CMD_STS_CSUM_ERR = 0x00004003, /* Csum Error. */ + MB_CMD_STS_ERR = 0x00004005, /* System Error. */ + MB_CMD_STS_PARAM_ERR = 0x00004006, /* Parameter Error. */ +}; + +struct mbox_params { + u32 mbox_in[MAILBOX_COUNT]; + u32 mbox_out[MAILBOX_COUNT]; + int in_count; + int out_count; +}; + +struct flash_params_8012 { + u8 dev_id_str[4]; + __le16 size; + __le16 csum; + __le16 ver; + __le16 sub_dev_id; + u8 mac_addr[6]; + __le16 res; +}; + +/* 8000 device's flash is a different structure + * at a different offset in flash. + */ +#define FUNC0_FLASH_OFFSET 0x140200 +#define FUNC1_FLASH_OFFSET 0x140600 + +/* Flash related data structures. 
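+ * The offset used depends on the PCI function (FUNC0_FLASH_OFFSET vs
+ * FUNC1_FLASH_OFFSET).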
*/ +struct flash_params_8000 { + u8 dev_id_str[4]; /* "8000" */ + __le16 ver; + __le16 size; + __le16 csum; + __le16 reserved0; + __le16 total_size; + __le16 entry_count; + u8 data_type0; + u8 data_size0; + u8 mac_addr[6]; + u8 data_type1; + u8 data_size1; + u8 mac_addr1[6]; + u8 data_type2; + u8 data_size2; + __le16 vlan_id; + u8 data_type3; + u8 data_size3; + __le16 last; + u8 reserved1[464]; + __le16 subsys_ven_id; + __le16 subsys_dev_id; + u8 reserved2[4]; +}; + +union flash_params { + struct flash_params_8012 flash_params_8012; + struct flash_params_8000 flash_params_8000; +}; + +/* + * doorbell space for the rx ring context + */ +struct rx_doorbell_context { + u32 cnsmr_idx; /* 0x00 */ + u32 valid; /* 0x04 */ + u32 reserved[4]; /* 0x08-0x14 */ + u32 lbq_prod_idx; /* 0x18 */ + u32 sbq_prod_idx; /* 0x1c */ +}; + +/* + * doorbell space for the tx ring context + */ +struct tx_doorbell_context { + u32 prod_idx; /* 0x00 */ + u32 valid; /* 0x04 */ + u32 reserved[4]; /* 0x08-0x14 */ + u32 lbq_prod_idx; /* 0x18 */ + u32 sbq_prod_idx; /* 0x1c */ +}; + +/* DATA STRUCTURES SHARED WITH HARDWARE. */ +struct tx_buf_desc { + __le64 addr; + __le32 len; +#define TX_DESC_LEN_MASK 0x000fffff +#define TX_DESC_C 0x40000000 +#define TX_DESC_E 0x80000000 +} __packed; + +/* + * IOCB Definitions... + */ + +#define OPCODE_OB_MAC_IOCB 0x01 +#define OPCODE_OB_MAC_TSO_IOCB 0x02 +#define OPCODE_IB_MAC_IOCB 0x20 +#define OPCODE_IB_MPI_IOCB 0x21 +#define OPCODE_IB_AE_IOCB 0x3f + +struct ob_mac_iocb_req { + u8 opcode; + u8 flags1; +#define OB_MAC_IOCB_REQ_OI 0x01 +#define OB_MAC_IOCB_REQ_I 0x02 +#define OB_MAC_IOCB_REQ_D 0x08 +#define OB_MAC_IOCB_REQ_F 0x10 + u8 flags2; + u8 flags3; +#define OB_MAC_IOCB_DFP 0x02 +#define OB_MAC_IOCB_V 0x04 + __le32 reserved1[2]; + __le16 frame_len; +#define OB_MAC_IOCB_LEN_MASK 0x3ffff + __le16 reserved2; + u32 tid; + u32 txq_idx; + __le32 reserved3; + __le16 vlan_tci; + __le16 reserved4; + struct tx_buf_desc tbd[TX_DESC_PER_IOCB]; +} __packed; + +struct ob_mac_iocb_rsp { + u8 opcode; /* */ + u8 flags1; /* */ +#define OB_MAC_IOCB_RSP_OI 0x01 /* */ +#define OB_MAC_IOCB_RSP_I 0x02 /* */ +#define OB_MAC_IOCB_RSP_E 0x08 /* */ +#define OB_MAC_IOCB_RSP_S 0x10 /* too Short */ +#define OB_MAC_IOCB_RSP_L 0x20 /* too Large */ +#define OB_MAC_IOCB_RSP_P 0x40 /* Padded */ + u8 flags2; /* */ + u8 flags3; /* */ +#define OB_MAC_IOCB_RSP_B 0x80 /* */ + u32 tid; + u32 txq_idx; + __le32 reserved[13]; +} __packed; + +struct ob_mac_tso_iocb_req { + u8 opcode; + u8 flags1; +#define OB_MAC_TSO_IOCB_OI 0x01 +#define OB_MAC_TSO_IOCB_I 0x02 +#define OB_MAC_TSO_IOCB_D 0x08 +#define OB_MAC_TSO_IOCB_IP4 0x40 +#define OB_MAC_TSO_IOCB_IP6 0x80 + u8 flags2; +#define OB_MAC_TSO_IOCB_LSO 0x20 +#define OB_MAC_TSO_IOCB_UC 0x40 +#define OB_MAC_TSO_IOCB_TC 0x80 + u8 flags3; +#define OB_MAC_TSO_IOCB_IC 0x01 +#define OB_MAC_TSO_IOCB_DFP 0x02 +#define OB_MAC_TSO_IOCB_V 0x04 + __le32 reserved1[2]; + __le32 frame_len; + u32 tid; + u32 txq_idx; + __le16 total_hdrs_len; + __le16 net_trans_offset; +#define OB_MAC_TRANSPORT_HDR_SHIFT 6 + __le16 vlan_tci; + __le16 mss; + struct tx_buf_desc tbd[TX_DESC_PER_IOCB]; +} __packed; + +struct ob_mac_tso_iocb_rsp { + u8 opcode; + u8 flags1; +#define OB_MAC_TSO_IOCB_RSP_OI 0x01 +#define OB_MAC_TSO_IOCB_RSP_I 0x02 +#define OB_MAC_TSO_IOCB_RSP_E 0x08 +#define OB_MAC_TSO_IOCB_RSP_S 0x10 +#define OB_MAC_TSO_IOCB_RSP_L 0x20 +#define OB_MAC_TSO_IOCB_RSP_P 0x40 + u8 flags2; /* */ + u8 flags3; /* */ +#define OB_MAC_TSO_IOCB_RSP_B 0x8000 + u32 tid; + u32 txq_idx; + __le32 reserved2[13]; +} 
__packed; + +struct ib_mac_iocb_rsp { + u8 opcode; /* 0x20 */ + u8 flags1; +#define IB_MAC_IOCB_RSP_OI 0x01 /* Override intr delay */ +#define IB_MAC_IOCB_RSP_I 0x02 /* Disable Intr Generation */ +#define IB_MAC_CSUM_ERR_MASK 0x1c /* A mask to use for csum errs */ +#define IB_MAC_IOCB_RSP_TE 0x04 /* Checksum error */ +#define IB_MAC_IOCB_RSP_NU 0x08 /* No checksum rcvd */ +#define IB_MAC_IOCB_RSP_IE 0x10 /* IPv4 checksum error */ +#define IB_MAC_IOCB_RSP_M_MASK 0x60 /* Multicast info */ +#define IB_MAC_IOCB_RSP_M_NONE 0x00 /* Not mcast frame */ +#define IB_MAC_IOCB_RSP_M_HASH 0x20 /* HASH mcast frame */ +#define IB_MAC_IOCB_RSP_M_REG 0x40 /* Registered mcast frame */ +#define IB_MAC_IOCB_RSP_M_PROM 0x60 /* Promiscuous mcast frame */ +#define IB_MAC_IOCB_RSP_B 0x80 /* Broadcast frame */ + u8 flags2; +#define IB_MAC_IOCB_RSP_P 0x01 /* Promiscuous frame */ +#define IB_MAC_IOCB_RSP_V 0x02 /* Vlan tag present */ +#define IB_MAC_IOCB_RSP_ERR_MASK 0x1c /* */ +#define IB_MAC_IOCB_RSP_ERR_CODE_ERR 0x04 +#define IB_MAC_IOCB_RSP_ERR_OVERSIZE 0x08 +#define IB_MAC_IOCB_RSP_ERR_UNDERSIZE 0x10 +#define IB_MAC_IOCB_RSP_ERR_PREAMBLE 0x14 +#define IB_MAC_IOCB_RSP_ERR_FRAME_LEN 0x18 +#define IB_MAC_IOCB_RSP_ERR_CRC 0x1c +#define IB_MAC_IOCB_RSP_U 0x20 /* UDP packet */ +#define IB_MAC_IOCB_RSP_T 0x40 /* TCP packet */ +#define IB_MAC_IOCB_RSP_FO 0x80 /* Failover port */ + u8 flags3; +#define IB_MAC_IOCB_RSP_RSS_MASK 0x07 /* RSS mask */ +#define IB_MAC_IOCB_RSP_M_NONE 0x00 /* No RSS match */ +#define IB_MAC_IOCB_RSP_M_IPV4 0x04 /* IPv4 RSS match */ +#define IB_MAC_IOCB_RSP_M_IPV6 0x02 /* IPv6 RSS match */ +#define IB_MAC_IOCB_RSP_M_TCP_V4 0x05 /* TCP with IPv4 */ +#define IB_MAC_IOCB_RSP_M_TCP_V6 0x03 /* TCP with IPv6 */ +#define IB_MAC_IOCB_RSP_V4 0x08 /* IPV4 */ +#define IB_MAC_IOCB_RSP_V6 0x10 /* IPV6 */ +#define IB_MAC_IOCB_RSP_IH 0x20 /* Split after IP header */ +#define IB_MAC_IOCB_RSP_DS 0x40 /* data is in small buffer */ +#define IB_MAC_IOCB_RSP_DL 0x80 /* data is in large buffer */ + __le32 data_len; /* */ + __le64 data_addr; /* */ + __le32 rss; /* */ + __le16 vlan_id; /* 12 bits */ +#define IB_MAC_IOCB_RSP_C 0x1000 /* VLAN CFI bit */ +#define IB_MAC_IOCB_RSP_COS_SHIFT 12 /* class of service value */ +#define IB_MAC_IOCB_RSP_VLAN_MASK 0x0ffff + + __le16 reserved1; + __le32 reserved2[6]; + u8 reserved3[3]; + u8 flags4; +#define IB_MAC_IOCB_RSP_HV 0x20 +#define IB_MAC_IOCB_RSP_HS 0x40 +#define IB_MAC_IOCB_RSP_HL 0x80 + __le32 hdr_len; /* */ + __le64 hdr_addr; /* */ +} __packed; + +struct ib_ae_iocb_rsp { + u8 opcode; + u8 flags1; +#define IB_AE_IOCB_RSP_OI 0x01 +#define IB_AE_IOCB_RSP_I 0x02 + u8 event; +#define LINK_UP_EVENT 0x00 +#define LINK_DOWN_EVENT 0x01 +#define CAM_LOOKUP_ERR_EVENT 0x06 +#define SOFT_ECC_ERROR_EVENT 0x07 +#define MGMT_ERR_EVENT 0x08 +#define TEN_GIG_MAC_EVENT 0x09 +#define GPI0_H2L_EVENT 0x10 +#define GPI0_L2H_EVENT 0x20 +#define GPI1_H2L_EVENT 0x11 +#define GPI1_L2H_EVENT 0x21 +#define PCI_ERR_ANON_BUF_RD 0x40 + u8 q_id; + __le32 reserved[15]; +} __packed; + +/* + * These three structures are for generic + * handling of ib and ob iocbs. + */ +struct ql_net_rsp_iocb { + u8 opcode; + u8 flags0; + __le16 length; + __le32 tid; + __le32 reserved[14]; +} __packed; + +struct net_req_iocb { + u8 opcode; + u8 flags0; + __le16 flags1; + __le32 tid; + __le32 reserved1[30]; +} __packed; + +/* + * tx ring initialization control block for chip. 
+ * It is defined as: + * "Work Queue Initialization Control Block" + */ +struct wqicb { + __le16 len; +#define Q_LEN_V (1 << 4) +#define Q_LEN_CPP_CONT 0x0000 +#define Q_LEN_CPP_16 0x0001 +#define Q_LEN_CPP_32 0x0002 +#define Q_LEN_CPP_64 0x0003 +#define Q_LEN_CPP_512 0x0006 + __le16 flags; +#define Q_PRI_SHIFT 1 +#define Q_FLAGS_LC 0x1000 +#define Q_FLAGS_LB 0x2000 +#define Q_FLAGS_LI 0x4000 +#define Q_FLAGS_LO 0x8000 + __le16 cq_id_rss; +#define Q_CQ_ID_RSS_RV 0x8000 + __le16 rid; + __le64 addr; + __le64 cnsmr_idx_addr; +} __packed; + +/* + * rx ring initialization control block for chip. + * It is defined as: + * "Completion Queue Initialization Control Block" + */ +struct cqicb { + u8 msix_vect; + u8 reserved1; + u8 reserved2; + u8 flags; +#define FLAGS_LV 0x08 +#define FLAGS_LS 0x10 +#define FLAGS_LL 0x20 +#define FLAGS_LI 0x40 +#define FLAGS_LC 0x80 + __le16 len; +#define LEN_V (1 << 4) +#define LEN_CPP_CONT 0x0000 +#define LEN_CPP_32 0x0001 +#define LEN_CPP_64 0x0002 +#define LEN_CPP_128 0x0003 + __le16 rid; + __le64 addr; + __le64 prod_idx_addr; + __le16 pkt_delay; + __le16 irq_delay; + __le64 lbq_addr; + __le16 lbq_buf_size; + __le16 lbq_len; /* entry count */ + __le64 sbq_addr; + __le16 sbq_buf_size; + __le16 sbq_len; /* entry count */ +} __packed; + +struct ricb { + u8 base_cq; +#define RSS_L4K 0x80 + u8 flags; +#define RSS_L6K 0x01 +#define RSS_LI 0x02 +#define RSS_LB 0x04 +#define RSS_LM 0x08 +#define RSS_RI4 0x10 +#define RSS_RT4 0x20 +#define RSS_RI6 0x40 +#define RSS_RT6 0x80 + __le16 mask; + u8 hash_cq_id[1024]; + __le32 ipv6_hash_key[10]; + __le32 ipv4_hash_key[4]; +} __packed; + +/* SOFTWARE/DRIVER DATA STRUCTURES. */ + +struct oal { + struct tx_buf_desc oal[TX_DESC_PER_OAL]; +}; + +struct map_list { + DEFINE_DMA_UNMAP_ADDR(mapaddr); + DEFINE_DMA_UNMAP_LEN(maplen); +}; + +struct tx_ring_desc { + struct sk_buff *skb; + struct ob_mac_iocb_req *queue_entry; + u32 index; + struct oal oal; + struct map_list map[MAX_SKB_FRAGS + 2]; + int map_cnt; + struct tx_ring_desc *next; +}; + +struct page_chunk { + struct page *page; /* master page */ + char *va; /* virt addr for this chunk */ + u64 map; /* mapping for master */ + unsigned int offset; /* offset for this chunk */ + unsigned int last_flag; /* flag set for last chunk in page */ +}; + +struct bq_desc { + union { + struct page_chunk pg_chunk; + struct sk_buff *skb; + } p; + __le64 *addr; + u32 index; + DEFINE_DMA_UNMAP_ADDR(mapaddr); + DEFINE_DMA_UNMAP_LEN(maplen); +}; + +#define QL_TXQ_IDX(qdev, skb) (smp_processor_id()%(qdev->tx_ring_count)) + +struct tx_ring { + /* + * queue info. 
+ */ + struct wqicb wqicb; /* structure used to inform chip of new queue */ + void *wq_base; /* pci_alloc:virtual addr for tx */ + dma_addr_t wq_base_dma; /* pci_alloc:dma addr for tx */ + __le32 *cnsmr_idx_sh_reg; /* shadow copy of consumer idx */ + dma_addr_t cnsmr_idx_sh_reg_dma; /* dma-shadow copy of consumer */ + u32 wq_size; /* size in bytes of queue area */ + u32 wq_len; /* number of entries in queue */ + void __iomem *prod_idx_db_reg; /* doorbell area index reg at offset 0x00 */ + void __iomem *valid_db_reg; /* doorbell area valid reg at offset 0x04 */ + u16 prod_idx; /* current value for prod idx */ + u16 cq_id; /* completion (rx) queue for tx completions */ + u8 wq_id; /* queue id for this entry */ + u8 reserved1[3]; + struct tx_ring_desc *q; /* descriptor list for the queue */ + spinlock_t lock; + atomic_t tx_count; /* counts down for every outstanding IO */ + struct delayed_work tx_work; + struct ql_adapter *qdev; + u64 tx_packets; + u64 tx_bytes; + u64 tx_errors; +}; + +/* + * Type of inbound queue. + */ +enum { + DEFAULT_Q = 2, /* Handles slow queue and chip/MPI events. */ + TX_Q = 3, /* Handles outbound completions. */ + RX_Q = 4, /* Handles inbound completions. */ +}; + +struct rx_ring { + struct cqicb cqicb; /* The chip's completion queue init control block. */ + + /* Completion queue elements. */ + void *cq_base; + dma_addr_t cq_base_dma; + u32 cq_size; + u32 cq_len; + u16 cq_id; + __le32 *prod_idx_sh_reg; /* Shadowed producer register. */ + dma_addr_t prod_idx_sh_reg_dma; + void __iomem *cnsmr_idx_db_reg; /* PCI doorbell mem area + 0 */ + u32 cnsmr_idx; /* current sw idx */ + struct ql_net_rsp_iocb *curr_entry; /* next entry on queue */ + void __iomem *valid_db_reg; /* PCI doorbell mem area + 0x04 */ + + /* Large buffer queue elements. */ + u32 lbq_len; /* entry count */ + u32 lbq_size; /* size in bytes of queue */ + u32 lbq_buf_size; + void *lbq_base; + dma_addr_t lbq_base_dma; + void *lbq_base_indirect; + dma_addr_t lbq_base_indirect_dma; + struct page_chunk pg_chunk; /* current page for chunks */ + struct bq_desc *lbq; /* array of control blocks */ + void __iomem *lbq_prod_idx_db_reg; /* PCI doorbell mem area + 0x18 */ + u32 lbq_prod_idx; /* current sw prod idx */ + u32 lbq_curr_idx; /* next entry we expect */ + u32 lbq_clean_idx; /* beginning of new descs */ + u32 lbq_free_cnt; /* free buffer desc cnt */ + + /* Small buffer queue elements. */ + u32 sbq_len; /* entry count */ + u32 sbq_size; /* size in bytes of queue */ + u32 sbq_buf_size; + void *sbq_base; + dma_addr_t sbq_base_dma; + void *sbq_base_indirect; + dma_addr_t sbq_base_indirect_dma; + struct bq_desc *sbq; /* array of control blocks */ + void __iomem *sbq_prod_idx_db_reg; /* PCI doorbell mem area + 0x1c */ + u32 sbq_prod_idx; /* current sw prod idx */ + u32 sbq_curr_idx; /* next entry we expect */ + u32 sbq_clean_idx; /* beginning of new descs */ + u32 sbq_free_cnt; /* free buffer desc cnt */ + + /* Misc. handler elements. */ + u32 type; /* Type of queue, tx, rx. */ + u32 irq; /* Which vector this ring is assigned. */ + u32 cpu; /* Which CPU this should run on. */ + char name[IFNAMSIZ + 5]; + struct napi_struct napi; + u8 reserved; + struct ql_adapter *qdev; + u64 rx_packets; + u64 rx_multicast; + u64 rx_bytes; + u64 rx_dropped; + u64 rx_errors; +}; + +/* + * RSS Initialization Control Block + */ +struct hash_id { + u8 value[4]; +}; + +struct nic_stats { + /* + * These stats come from offset 200h to 278h + * in the XGMAC register. 
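+	 * (i.e. the TX_PKTS through TX_OVERSIZE_PKT block defined above).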
+ */ + u64 tx_pkts; + u64 tx_bytes; + u64 tx_mcast_pkts; + u64 tx_bcast_pkts; + u64 tx_ucast_pkts; + u64 tx_ctl_pkts; + u64 tx_pause_pkts; + u64 tx_64_pkt; + u64 tx_65_to_127_pkt; + u64 tx_128_to_255_pkt; + u64 tx_256_511_pkt; + u64 tx_512_to_1023_pkt; + u64 tx_1024_to_1518_pkt; + u64 tx_1519_to_max_pkt; + u64 tx_undersize_pkt; + u64 tx_oversize_pkt; + + /* + * These stats come from offset 300h to 3C8h + * in the XGMAC register. + */ + u64 rx_bytes; + u64 rx_bytes_ok; + u64 rx_pkts; + u64 rx_pkts_ok; + u64 rx_bcast_pkts; + u64 rx_mcast_pkts; + u64 rx_ucast_pkts; + u64 rx_undersize_pkts; + u64 rx_oversize_pkts; + u64 rx_jabber_pkts; + u64 rx_undersize_fcerr_pkts; + u64 rx_drop_events; + u64 rx_fcerr_pkts; + u64 rx_align_err; + u64 rx_symbol_err; + u64 rx_mac_err; + u64 rx_ctl_pkts; + u64 rx_pause_pkts; + u64 rx_64_pkts; + u64 rx_65_to_127_pkts; + u64 rx_128_255_pkts; + u64 rx_256_511_pkts; + u64 rx_512_to_1023_pkts; + u64 rx_1024_to_1518_pkts; + u64 rx_1519_to_max_pkts; + u64 rx_len_err_pkts; + /* Receive Mac Err stats */ + u64 rx_code_err; + u64 rx_oversize_err; + u64 rx_undersize_err; + u64 rx_preamble_err; + u64 rx_frame_len_err; + u64 rx_crc_err; + u64 rx_err_count; + /* + * These stats come from offset 500h to 5C8h + * in the XGMAC register. + */ + u64 tx_cbfc_pause_frames0; + u64 tx_cbfc_pause_frames1; + u64 tx_cbfc_pause_frames2; + u64 tx_cbfc_pause_frames3; + u64 tx_cbfc_pause_frames4; + u64 tx_cbfc_pause_frames5; + u64 tx_cbfc_pause_frames6; + u64 tx_cbfc_pause_frames7; + u64 rx_cbfc_pause_frames0; + u64 rx_cbfc_pause_frames1; + u64 rx_cbfc_pause_frames2; + u64 rx_cbfc_pause_frames3; + u64 rx_cbfc_pause_frames4; + u64 rx_cbfc_pause_frames5; + u64 rx_cbfc_pause_frames6; + u64 rx_cbfc_pause_frames7; + u64 rx_nic_fifo_drop; +}; + +/* Firmware coredump internal register address/length pairs. */ +enum { + MPI_CORE_REGS_ADDR = 0x00030000, + MPI_CORE_REGS_CNT = 127, + MPI_CORE_SH_REGS_CNT = 16, + TEST_REGS_ADDR = 0x00001000, + TEST_REGS_CNT = 23, + RMII_REGS_ADDR = 0x00001040, + RMII_REGS_CNT = 64, + FCMAC1_REGS_ADDR = 0x00001080, + FCMAC2_REGS_ADDR = 0x000010c0, + FCMAC_REGS_CNT = 64, + FC1_MBX_REGS_ADDR = 0x00001100, + FC2_MBX_REGS_ADDR = 0x00001240, + FC_MBX_REGS_CNT = 64, + IDE_REGS_ADDR = 0x00001140, + IDE_REGS_CNT = 64, + NIC1_MBX_REGS_ADDR = 0x00001180, + NIC2_MBX_REGS_ADDR = 0x00001280, + NIC_MBX_REGS_CNT = 64, + SMBUS_REGS_ADDR = 0x00001200, + SMBUS_REGS_CNT = 64, + I2C_REGS_ADDR = 0x00001fc0, + I2C_REGS_CNT = 64, + MEMC_REGS_ADDR = 0x00003000, + MEMC_REGS_CNT = 256, + PBUS_REGS_ADDR = 0x00007c00, + PBUS_REGS_CNT = 256, + MDE_REGS_ADDR = 0x00010000, + MDE_REGS_CNT = 6, + CODE_RAM_ADDR = 0x00020000, + CODE_RAM_CNT = 0x2000, + MEMC_RAM_ADDR = 0x00100000, + MEMC_RAM_CNT = 0x2000, +}; + +#define MPI_COREDUMP_COOKIE 0x5555aaaa +struct mpi_coredump_global_header { + u32 cookie; + u8 idString[16]; + u32 timeLo; + u32 timeHi; + u32 imageSize; + u32 headerSize; + u8 info[220]; +}; + +struct mpi_coredump_segment_header { + u32 cookie; + u32 segNum; + u32 segSize; + u32 extra; + u8 description[16]; +}; + +/* Firmware coredump header segment numbers. 
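+ * Each segment of the dump is a struct mpi_coredump_segment_header
+ * (cookie is MPI_COREDUMP_COOKIE, segNum is one of the values below)
+ * immediately followed by that segment's data words; see
+ * struct ql_mpi_coredump below for the full layout.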
*/ +enum { + CORE_SEG_NUM = 1, + TEST_LOGIC_SEG_NUM = 2, + RMII_SEG_NUM = 3, + FCMAC1_SEG_NUM = 4, + FCMAC2_SEG_NUM = 5, + FC1_MBOX_SEG_NUM = 6, + IDE_SEG_NUM = 7, + NIC1_MBOX_SEG_NUM = 8, + SMBUS_SEG_NUM = 9, + FC2_MBOX_SEG_NUM = 10, + NIC2_MBOX_SEG_NUM = 11, + I2C_SEG_NUM = 12, + MEMC_SEG_NUM = 13, + PBUS_SEG_NUM = 14, + MDE_SEG_NUM = 15, + NIC1_CONTROL_SEG_NUM = 16, + NIC2_CONTROL_SEG_NUM = 17, + NIC1_XGMAC_SEG_NUM = 18, + NIC2_XGMAC_SEG_NUM = 19, + WCS_RAM_SEG_NUM = 20, + MEMC_RAM_SEG_NUM = 21, + XAUI_AN_SEG_NUM = 22, + XAUI_HSS_PCS_SEG_NUM = 23, + XFI_AN_SEG_NUM = 24, + XFI_TRAIN_SEG_NUM = 25, + XFI_HSS_PCS_SEG_NUM = 26, + XFI_HSS_TX_SEG_NUM = 27, + XFI_HSS_RX_SEG_NUM = 28, + XFI_HSS_PLL_SEG_NUM = 29, + MISC_NIC_INFO_SEG_NUM = 30, + INTR_STATES_SEG_NUM = 31, + CAM_ENTRIES_SEG_NUM = 32, + ROUTING_WORDS_SEG_NUM = 33, + ETS_SEG_NUM = 34, + PROBE_DUMP_SEG_NUM = 35, + ROUTING_INDEX_SEG_NUM = 36, + MAC_PROTOCOL_SEG_NUM = 37, + XAUI2_AN_SEG_NUM = 38, + XAUI2_HSS_PCS_SEG_NUM = 39, + XFI2_AN_SEG_NUM = 40, + XFI2_TRAIN_SEG_NUM = 41, + XFI2_HSS_PCS_SEG_NUM = 42, + XFI2_HSS_TX_SEG_NUM = 43, + XFI2_HSS_RX_SEG_NUM = 44, + XFI2_HSS_PLL_SEG_NUM = 45, + SEM_REGS_SEG_NUM = 50 + +}; + +/* There are 64 generic NIC registers. */ +#define NIC_REGS_DUMP_WORD_COUNT 64 +/* XGMAC word count. */ +#define XGMAC_DUMP_WORD_COUNT (XGMAC_REGISTER_END / 4) +/* Word counts for the SERDES blocks. */ +#define XG_SERDES_XAUI_AN_COUNT 14 +#define XG_SERDES_XAUI_HSS_PCS_COUNT 33 +#define XG_SERDES_XFI_AN_COUNT 14 +#define XG_SERDES_XFI_TRAIN_COUNT 12 +#define XG_SERDES_XFI_HSS_PCS_COUNT 15 +#define XG_SERDES_XFI_HSS_TX_COUNT 32 +#define XG_SERDES_XFI_HSS_RX_COUNT 32 +#define XG_SERDES_XFI_HSS_PLL_COUNT 32 + +/* There are 2 CNA ETS and 8 NIC ETS registers. */ +#define ETS_REGS_DUMP_WORD_COUNT 10 + +/* Each probe mux entry stores the probe type plus 64 entries + * that are each each 64-bits in length. There are a total of + * 34 (PRB_MX_ADDR_VALID_TOTAL) valid probes. + */ +#define PRB_MX_ADDR_PRB_WORD_COUNT (1 + (PRB_MX_ADDR_MAX_MUX * 2)) +#define PRB_MX_DUMP_TOT_COUNT (PRB_MX_ADDR_PRB_WORD_COUNT * \ + PRB_MX_ADDR_VALID_TOTAL) +/* Each routing entry consists of 4 32-bit words. + * They are route type, index, index word, and result. + * There are 2 route blocks with 8 entries each and + * 2 NIC blocks with 16 entries each. + * The totol entries is 48 with 4 words each. + */ +#define RT_IDX_DUMP_ENTRIES 48 +#define RT_IDX_DUMP_WORDS_PER_ENTRY 4 +#define RT_IDX_DUMP_TOT_WORDS (RT_IDX_DUMP_ENTRIES * \ + RT_IDX_DUMP_WORDS_PER_ENTRY) +/* There are 10 address blocks in filter, each with + * different entry counts and different word-count-per-entry. 
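+ * With the MAC_ADDR_MAX_*_ENTRIES/_WCOUNT values defined earlier, the
+ * MAC_ADDR_DUMP_ENTRIES sum below works out to 9856, giving a
+ * MAC_ADDR_DUMP_TOT_WORDS of 19712 words for this segment.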
+ */ +#define MAC_ADDR_DUMP_ENTRIES \ + ((MAC_ADDR_MAX_CAM_ENTRIES * MAC_ADDR_MAX_CAM_WCOUNT) + \ + (MAC_ADDR_MAX_MULTICAST_ENTRIES * MAC_ADDR_MAX_MULTICAST_WCOUNT) + \ + (MAC_ADDR_MAX_VLAN_ENTRIES * MAC_ADDR_MAX_VLAN_WCOUNT) + \ + (MAC_ADDR_MAX_MCAST_FLTR_ENTRIES * MAC_ADDR_MAX_MCAST_FLTR_WCOUNT) + \ + (MAC_ADDR_MAX_FC_MAC_ENTRIES * MAC_ADDR_MAX_FC_MAC_WCOUNT) + \ + (MAC_ADDR_MAX_MGMT_MAC_ENTRIES * MAC_ADDR_MAX_MGMT_MAC_WCOUNT) + \ + (MAC_ADDR_MAX_MGMT_VLAN_ENTRIES * MAC_ADDR_MAX_MGMT_VLAN_WCOUNT) + \ + (MAC_ADDR_MAX_MGMT_V4_ENTRIES * MAC_ADDR_MAX_MGMT_V4_WCOUNT) + \ + (MAC_ADDR_MAX_MGMT_V6_ENTRIES * MAC_ADDR_MAX_MGMT_V6_WCOUNT) + \ + (MAC_ADDR_MAX_MGMT_TU_DP_ENTRIES * MAC_ADDR_MAX_MGMT_TU_DP_WCOUNT)) +#define MAC_ADDR_DUMP_WORDS_PER_ENTRY 2 +#define MAC_ADDR_DUMP_TOT_WORDS (MAC_ADDR_DUMP_ENTRIES * \ + MAC_ADDR_DUMP_WORDS_PER_ENTRY) +/* Maximum of 4 functions whose semaphore registeres are + * in the coredump. + */ +#define MAX_SEMAPHORE_FUNCTIONS 4 +/* Defines for access the MPI shadow registers. */ +#define RISC_124 0x0003007c +#define RISC_127 0x0003007f +#define SHADOW_OFFSET 0xb0000000 +#define SHADOW_REG_SHIFT 20 + +struct ql_nic_misc { + u32 rx_ring_count; + u32 tx_ring_count; + u32 intr_count; + u32 function; +}; + +struct ql_reg_dump { + + /* segment 0 */ + struct mpi_coredump_global_header mpi_global_header; + + /* segment 16 */ + struct mpi_coredump_segment_header nic_regs_seg_hdr; + u32 nic_regs[64]; + + /* segment 30 */ + struct mpi_coredump_segment_header misc_nic_seg_hdr; + struct ql_nic_misc misc_nic_info; + + /* segment 31 */ + /* one interrupt state for each CQ */ + struct mpi_coredump_segment_header intr_states_seg_hdr; + u32 intr_states[MAX_CPUS]; + + /* segment 32 */ + /* 3 cam words each for 16 unicast, + * 2 cam words for each of 32 multicast. 
+ */ + struct mpi_coredump_segment_header cam_entries_seg_hdr; + u32 cam_entries[(16 * 3) + (32 * 3)]; + + /* segment 33 */ + struct mpi_coredump_segment_header nic_routing_words_seg_hdr; + u32 nic_routing_words[16]; + + /* segment 34 */ + struct mpi_coredump_segment_header ets_seg_hdr; + u32 ets[8+2]; +}; + +struct ql_mpi_coredump { + /* segment 0 */ + struct mpi_coredump_global_header mpi_global_header; + + /* segment 1 */ + struct mpi_coredump_segment_header core_regs_seg_hdr; + u32 mpi_core_regs[MPI_CORE_REGS_CNT]; + u32 mpi_core_sh_regs[MPI_CORE_SH_REGS_CNT]; + + /* segment 2 */ + struct mpi_coredump_segment_header test_logic_regs_seg_hdr; + u32 test_logic_regs[TEST_REGS_CNT]; + + /* segment 3 */ + struct mpi_coredump_segment_header rmii_regs_seg_hdr; + u32 rmii_regs[RMII_REGS_CNT]; + + /* segment 4 */ + struct mpi_coredump_segment_header fcmac1_regs_seg_hdr; + u32 fcmac1_regs[FCMAC_REGS_CNT]; + + /* segment 5 */ + struct mpi_coredump_segment_header fcmac2_regs_seg_hdr; + u32 fcmac2_regs[FCMAC_REGS_CNT]; + + /* segment 6 */ + struct mpi_coredump_segment_header fc1_mbx_regs_seg_hdr; + u32 fc1_mbx_regs[FC_MBX_REGS_CNT]; + + /* segment 7 */ + struct mpi_coredump_segment_header ide_regs_seg_hdr; + u32 ide_regs[IDE_REGS_CNT]; + + /* segment 8 */ + struct mpi_coredump_segment_header nic1_mbx_regs_seg_hdr; + u32 nic1_mbx_regs[NIC_MBX_REGS_CNT]; + + /* segment 9 */ + struct mpi_coredump_segment_header smbus_regs_seg_hdr; + u32 smbus_regs[SMBUS_REGS_CNT]; + + /* segment 10 */ + struct mpi_coredump_segment_header fc2_mbx_regs_seg_hdr; + u32 fc2_mbx_regs[FC_MBX_REGS_CNT]; + + /* segment 11 */ + struct mpi_coredump_segment_header nic2_mbx_regs_seg_hdr; + u32 nic2_mbx_regs[NIC_MBX_REGS_CNT]; + + /* segment 12 */ + struct mpi_coredump_segment_header i2c_regs_seg_hdr; + u32 i2c_regs[I2C_REGS_CNT]; + /* segment 13 */ + struct mpi_coredump_segment_header memc_regs_seg_hdr; + u32 memc_regs[MEMC_REGS_CNT]; + + /* segment 14 */ + struct mpi_coredump_segment_header pbus_regs_seg_hdr; + u32 pbus_regs[PBUS_REGS_CNT]; + + /* segment 15 */ + struct mpi_coredump_segment_header mde_regs_seg_hdr; + u32 mde_regs[MDE_REGS_CNT]; + + /* segment 16 */ + struct mpi_coredump_segment_header nic_regs_seg_hdr; + u32 nic_regs[NIC_REGS_DUMP_WORD_COUNT]; + + /* segment 17 */ + struct mpi_coredump_segment_header nic2_regs_seg_hdr; + u32 nic2_regs[NIC_REGS_DUMP_WORD_COUNT]; + + /* segment 18 */ + struct mpi_coredump_segment_header xgmac1_seg_hdr; + u32 xgmac1[XGMAC_DUMP_WORD_COUNT]; + + /* segment 19 */ + struct mpi_coredump_segment_header xgmac2_seg_hdr; + u32 xgmac2[XGMAC_DUMP_WORD_COUNT]; + + /* segment 20 */ + struct mpi_coredump_segment_header code_ram_seg_hdr; + u32 code_ram[CODE_RAM_CNT]; + + /* segment 21 */ + struct mpi_coredump_segment_header memc_ram_seg_hdr; + u32 memc_ram[MEMC_RAM_CNT]; + + /* segment 22 */ + struct mpi_coredump_segment_header xaui_an_hdr; + u32 serdes_xaui_an[XG_SERDES_XAUI_AN_COUNT]; + + /* segment 23 */ + struct mpi_coredump_segment_header xaui_hss_pcs_hdr; + u32 serdes_xaui_hss_pcs[XG_SERDES_XAUI_HSS_PCS_COUNT]; + + /* segment 24 */ + struct mpi_coredump_segment_header xfi_an_hdr; + u32 serdes_xfi_an[XG_SERDES_XFI_AN_COUNT]; + + /* segment 25 */ + struct mpi_coredump_segment_header xfi_train_hdr; + u32 serdes_xfi_train[XG_SERDES_XFI_TRAIN_COUNT]; + + /* segment 26 */ + struct mpi_coredump_segment_header xfi_hss_pcs_hdr; + u32 serdes_xfi_hss_pcs[XG_SERDES_XFI_HSS_PCS_COUNT]; + + /* segment 27 */ + struct mpi_coredump_segment_header xfi_hss_tx_hdr; + u32 
serdes_xfi_hss_tx[XG_SERDES_XFI_HSS_TX_COUNT]; + + /* segment 28 */ + struct mpi_coredump_segment_header xfi_hss_rx_hdr; + u32 serdes_xfi_hss_rx[XG_SERDES_XFI_HSS_RX_COUNT]; + + /* segment 29 */ + struct mpi_coredump_segment_header xfi_hss_pll_hdr; + u32 serdes_xfi_hss_pll[XG_SERDES_XFI_HSS_PLL_COUNT]; + + /* segment 30 */ + struct mpi_coredump_segment_header misc_nic_seg_hdr; + struct ql_nic_misc misc_nic_info; + + /* segment 31 */ + /* one interrupt state for each CQ */ + struct mpi_coredump_segment_header intr_states_seg_hdr; + u32 intr_states[MAX_RX_RINGS]; + + /* segment 32 */ + /* 3 cam words each for 16 unicast, + * 2 cam words for each of 32 multicast. + */ + struct mpi_coredump_segment_header cam_entries_seg_hdr; + u32 cam_entries[(16 * 3) + (32 * 3)]; + + /* segment 33 */ + struct mpi_coredump_segment_header nic_routing_words_seg_hdr; + u32 nic_routing_words[16]; + /* segment 34 */ + struct mpi_coredump_segment_header ets_seg_hdr; + u32 ets[ETS_REGS_DUMP_WORD_COUNT]; + + /* segment 35 */ + struct mpi_coredump_segment_header probe_dump_seg_hdr; + u32 probe_dump[PRB_MX_DUMP_TOT_COUNT]; + + /* segment 36 */ + struct mpi_coredump_segment_header routing_reg_seg_hdr; + u32 routing_regs[RT_IDX_DUMP_TOT_WORDS]; + + /* segment 37 */ + struct mpi_coredump_segment_header mac_prot_reg_seg_hdr; + u32 mac_prot_regs[MAC_ADDR_DUMP_TOT_WORDS]; + + /* segment 38 */ + struct mpi_coredump_segment_header xaui2_an_hdr; + u32 serdes2_xaui_an[XG_SERDES_XAUI_AN_COUNT]; + + /* segment 39 */ + struct mpi_coredump_segment_header xaui2_hss_pcs_hdr; + u32 serdes2_xaui_hss_pcs[XG_SERDES_XAUI_HSS_PCS_COUNT]; + + /* segment 40 */ + struct mpi_coredump_segment_header xfi2_an_hdr; + u32 serdes2_xfi_an[XG_SERDES_XFI_AN_COUNT]; + + /* segment 41 */ + struct mpi_coredump_segment_header xfi2_train_hdr; + u32 serdes2_xfi_train[XG_SERDES_XFI_TRAIN_COUNT]; + + /* segment 42 */ + struct mpi_coredump_segment_header xfi2_hss_pcs_hdr; + u32 serdes2_xfi_hss_pcs[XG_SERDES_XFI_HSS_PCS_COUNT]; + + /* segment 43 */ + struct mpi_coredump_segment_header xfi2_hss_tx_hdr; + u32 serdes2_xfi_hss_tx[XG_SERDES_XFI_HSS_TX_COUNT]; + + /* segment 44 */ + struct mpi_coredump_segment_header xfi2_hss_rx_hdr; + u32 serdes2_xfi_hss_rx[XG_SERDES_XFI_HSS_RX_COUNT]; + + /* segment 45 */ + struct mpi_coredump_segment_header xfi2_hss_pll_hdr; + u32 serdes2_xfi_hss_pll[XG_SERDES_XFI_HSS_PLL_COUNT]; + + /* segment 50 */ + /* semaphore register for all 5 functions */ + struct mpi_coredump_segment_header sem_regs_seg_hdr; + u32 sem_regs[MAX_SEMAPHORE_FUNCTIONS]; +}; + +/* + * intr_context structure is used during initialization + * to hook the interrupts. It is also used in a single + * irq environment as a context to the ISR. + */ +struct intr_context { + struct ql_adapter *qdev; + u32 intr; + u32 irq_mask; /* Mask of which rings the vector services. */ + u32 hooked; + u32 intr_en_mask; /* value/mask used to enable this intr */ + u32 intr_dis_mask; /* value/mask used to disable this intr */ + u32 intr_read_mask; /* value/mask used to read this intr */ + char name[IFNAMSIZ * 2]; + atomic_t irq_cnt; /* irq_cnt is used in single vector + * environment. It's incremented for each + * irq handler that is scheduled. When each + * handler finishes it decrements irq_cnt and + * enables interrupts if it's zero. */ + irq_handler_t handler; +}; + +/* adapter flags definitions. */ +enum { + QL_ADAPTER_UP = 0, /* Adapter has been brought up. 
*/ + QL_LEGACY_ENABLED = 1, + QL_MSI_ENABLED = 2, + QL_MSIX_ENABLED = 3, + QL_DMA64 = 4, + QL_PROMISCUOUS = 5, + QL_ALLMULTI = 6, + QL_PORT_CFG = 7, + QL_CAM_RT_SET = 8, + QL_SELFTEST = 9, + QL_LB_LINK_UP = 10, + QL_FRC_COREDUMP = 11, + QL_EEH_FATAL = 12, + QL_ASIC_RECOVERY = 14, /* We are in ascic recovery. */ +}; + +/* link_status bit definitions */ +enum { + STS_LOOPBACK_MASK = 0x00000700, + STS_LOOPBACK_PCS = 0x00000100, + STS_LOOPBACK_HSS = 0x00000200, + STS_LOOPBACK_EXT = 0x00000300, + STS_PAUSE_MASK = 0x000000c0, + STS_PAUSE_STD = 0x00000040, + STS_PAUSE_PRI = 0x00000080, + STS_SPEED_MASK = 0x00000038, + STS_SPEED_100Mb = 0x00000000, + STS_SPEED_1Gb = 0x00000008, + STS_SPEED_10Gb = 0x00000010, + STS_LINK_TYPE_MASK = 0x00000007, + STS_LINK_TYPE_XFI = 0x00000001, + STS_LINK_TYPE_XAUI = 0x00000002, + STS_LINK_TYPE_XFI_BP = 0x00000003, + STS_LINK_TYPE_XAUI_BP = 0x00000004, + STS_LINK_TYPE_10GBASET = 0x00000005, +}; + +/* link_config bit definitions */ +enum { + CFG_JUMBO_FRAME_SIZE = 0x00010000, + CFG_PAUSE_MASK = 0x00000060, + CFG_PAUSE_STD = 0x00000020, + CFG_PAUSE_PRI = 0x00000040, + CFG_DCBX = 0x00000010, + CFG_LOOPBACK_MASK = 0x00000007, + CFG_LOOPBACK_PCS = 0x00000002, + CFG_LOOPBACK_HSS = 0x00000004, + CFG_LOOPBACK_EXT = 0x00000006, + CFG_DEFAULT_MAX_FRAME_SIZE = 0x00002580, +}; + +struct nic_operations { + + int (*get_flash) (struct ql_adapter *); + int (*port_initialize) (struct ql_adapter *); +}; + +/* + * The main Adapter structure definition. + * This structure has all fields relevant to the hardware. + */ +struct ql_adapter { + struct ricb ricb; + unsigned long flags; + u32 wol; + + struct nic_stats nic_stats; + + unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; + + /* PCI Configuration information for this device */ + struct pci_dev *pdev; + struct net_device *ndev; /* Parent NET device */ + + /* Hardware information */ + u32 chip_rev_id; + u32 fw_rev_id; + u32 func; /* PCI function for this adapter */ + u32 alt_func; /* PCI function for alternate adapter */ + u32 port; /* Port number this adapter */ + + spinlock_t adapter_lock; + spinlock_t hw_lock; + spinlock_t stats_lock; + + /* PCI Bus Relative Register Addresses */ + void __iomem *reg_base; + void __iomem *doorbell_area; + u32 doorbell_area_size; + + u32 msg_enable; + + /* Page for Shadow Registers */ + void *rx_ring_shadow_reg_area; + dma_addr_t rx_ring_shadow_reg_dma; + void *tx_ring_shadow_reg_area; + dma_addr_t tx_ring_shadow_reg_dma; + + u32 mailbox_in; + u32 mailbox_out; + struct mbox_params idc_mbc; + struct mutex mpi_mutex; + + int tx_ring_size; + int rx_ring_size; + u32 intr_count; + struct msix_entry *msi_x_entry; + struct intr_context intr_context[MAX_RX_RINGS]; + + int tx_ring_count; /* One per online CPU. */ + u32 rss_ring_count; /* One per irq vector. 
*/ + /* + * rx_ring_count = + * (CPU count * outbound completion rx_ring) + + * (irq_vector_cnt * inbound (RSS) completion rx_ring) + */ + int rx_ring_count; + int ring_mem_size; + void *ring_mem; + + struct rx_ring rx_ring[MAX_RX_RINGS]; + struct tx_ring tx_ring[MAX_TX_RINGS]; + unsigned int lbq_buf_order; + + int rx_csum; + u32 default_rx_queue; + + u16 rx_coalesce_usecs; /* cqicb->int_delay */ + u16 rx_max_coalesced_frames; /* cqicb->pkt_int_delay */ + u16 tx_coalesce_usecs; /* cqicb->int_delay */ + u16 tx_max_coalesced_frames; /* cqicb->pkt_int_delay */ + + u32 xg_sem_mask; + u32 port_link_up; + u32 port_init; + u32 link_status; + struct ql_mpi_coredump *mpi_coredump; + u32 core_is_dumped; + u32 link_config; + u32 led_config; + u32 max_frame_size; + + union flash_params flash; + + struct workqueue_struct *workqueue; + struct delayed_work asic_reset_work; + struct delayed_work mpi_reset_work; + struct delayed_work mpi_work; + struct delayed_work mpi_port_cfg_work; + struct delayed_work mpi_idc_work; + struct delayed_work mpi_core_to_log; + struct completion ide_completion; + const struct nic_operations *nic_ops; + u16 device_id; + struct timer_list timer; + atomic_t lb_count; + /* Keep local copy of current mac address. */ + char current_mac_addr[ETH_ALEN]; +}; + +/* + * Typical Register accessor for memory mapped device. + */ +static inline u32 ql_read32(const struct ql_adapter *qdev, int reg) +{ + return readl(qdev->reg_base + reg); +} + +/* + * Typical Register accessor for memory mapped device. + */ +static inline void ql_write32(const struct ql_adapter *qdev, int reg, u32 val) +{ + writel(val, qdev->reg_base + reg); +} + +/* + * Doorbell Registers: + * Doorbell registers are virtual registers in the PCI memory space. + * The space is allocated by the chip during PCI initialization. The + * device driver finds the doorbell address in BAR 3 in PCI config space. + * The registers are used to control outbound and inbound queues. For + * example, the producer index for an outbound queue. Each queue uses + * 1 4k chunk of memory. The lower half of the space is for outbound + * queues. The upper half is for inbound queues. + */ +static inline void ql_write_db_reg(u32 val, void __iomem *addr) +{ + writel(val, addr); + mmiowb(); +} + +/* + * Doorbell Registers: + * Doorbell registers are virtual registers in the PCI memory space. + * The space is allocated by the chip during PCI initialization. The + * device driver finds the doorbell address in BAR 3 in PCI config space. + * The registers are used to control outbound and inbound queues. For + * example, the producer index for an outbound queue. Each queue uses + * 1 4k chunk of memory. The lower half of the space is for outbound + * queues. The upper half is for inbound queues. + * Caller has to guarantee ordering. + */ +static inline void ql_write_db_reg_relaxed(u32 val, void __iomem *addr) +{ + writel_relaxed(val, addr); +} + +/* + * Shadow Registers: + * Outbound queues have a consumer index that is maintained by the chip. + * Inbound queues have a producer index that is maintained by the chip. + * For lower overhead, these registers are "shadowed" to host memory + * which allows the device driver to track the queue progress without + * PCI reads. When an entry is placed on an inbound queue, the chip will + * update the relevant index register and then copy the value to the + * shadow register in host memory. 
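+ * Example (reading a completion queue's shadowed producer index):
+ *	prod_idx = ql_read_sh_reg(rx_ring->prod_idx_sh_reg);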
+ */ +static inline u32 ql_read_sh_reg(__le32 *addr) +{ + u32 reg; + reg = le32_to_cpu(*addr); + rmb(); + return reg; +} + +extern char qlge_driver_name[]; +extern const char qlge_driver_version[]; +extern const struct ethtool_ops qlge_ethtool_ops; + +int ql_sem_spinlock(struct ql_adapter *qdev, u32 sem_mask); +void ql_sem_unlock(struct ql_adapter *qdev, u32 sem_mask); +int ql_read_xgmac_reg(struct ql_adapter *qdev, u32 reg, u32 *data); +int ql_get_mac_addr_reg(struct ql_adapter *qdev, u32 type, u16 index, + u32 *value); +int ql_get_routing_reg(struct ql_adapter *qdev, u32 index, u32 *value); +int ql_write_cfg(struct ql_adapter *qdev, void *ptr, int size, u32 bit, + u16 q_id); +void ql_queue_fw_error(struct ql_adapter *qdev); +void ql_mpi_work(struct work_struct *work); +void ql_mpi_reset_work(struct work_struct *work); +void ql_mpi_core_to_log(struct work_struct *work); +int ql_wait_reg_rdy(struct ql_adapter *qdev, u32 reg, u32 bit, u32 ebit); +void ql_queue_asic_error(struct ql_adapter *qdev); +u32 ql_enable_completion_interrupt(struct ql_adapter *qdev, u32 intr); +void ql_set_ethtool_ops(struct net_device *ndev); +int ql_read_xgmac_reg64(struct ql_adapter *qdev, u32 reg, u64 *data); +void ql_mpi_idc_work(struct work_struct *work); +void ql_mpi_port_cfg_work(struct work_struct *work); +int ql_mb_get_fw_state(struct ql_adapter *qdev); +int ql_cam_route_initialize(struct ql_adapter *qdev); +int ql_read_mpi_reg(struct ql_adapter *qdev, u32 reg, u32 *data); +int ql_write_mpi_reg(struct ql_adapter *qdev, u32 reg, u32 data); +int ql_unpause_mpi_risc(struct ql_adapter *qdev); +int ql_pause_mpi_risc(struct ql_adapter *qdev); +int ql_hard_reset_mpi_risc(struct ql_adapter *qdev); +int ql_soft_reset_mpi_risc(struct ql_adapter *qdev); +int ql_dump_risc_ram_area(struct ql_adapter *qdev, void *buf, u32 ram_addr, + int word_count); +int ql_core_dump(struct ql_adapter *qdev, struct ql_mpi_coredump *mpi_coredump); +int ql_mb_about_fw(struct ql_adapter *qdev); +int ql_mb_wol_set_magic(struct ql_adapter *qdev, u32 enable_wol); +int ql_mb_wol_mode(struct ql_adapter *qdev, u32 wol); +int ql_mb_set_led_cfg(struct ql_adapter *qdev, u32 led_config); +int ql_mb_get_led_cfg(struct ql_adapter *qdev); +void ql_link_on(struct ql_adapter *qdev); +void ql_link_off(struct ql_adapter *qdev); +int ql_mb_set_mgmnt_traffic_ctl(struct ql_adapter *qdev, u32 control); +int ql_mb_get_port_cfg(struct ql_adapter *qdev); +int ql_mb_set_port_cfg(struct ql_adapter *qdev); +int ql_wait_fifo_empty(struct ql_adapter *qdev); +void ql_get_dump(struct ql_adapter *qdev, void *buff); +netdev_tx_t ql_lb_send(struct sk_buff *skb, struct net_device *ndev); +void ql_check_lb_frame(struct ql_adapter *, struct sk_buff *); +int ql_own_firmware(struct ql_adapter *qdev); +int ql_clean_lb_rx_ring(struct rx_ring *rx_ring, int budget); + +/* #define QL_ALL_DUMP */ +/* #define QL_REG_DUMP */ +/* #define QL_DEV_DUMP */ +/* #define QL_CB_DUMP */ +/* #define QL_IB_DUMP */ +/* #define QL_OB_DUMP */ + +#ifdef QL_REG_DUMP +void ql_dump_xgmac_control_regs(struct ql_adapter *qdev); +void ql_dump_routing_entries(struct ql_adapter *qdev); +void ql_dump_regs(struct ql_adapter *qdev); +#define QL_DUMP_REGS(qdev) ql_dump_regs(qdev) +#define QL_DUMP_ROUTE(qdev) ql_dump_routing_entries(qdev) +#define QL_DUMP_XGMAC_CONTROL_REGS(qdev) ql_dump_xgmac_control_regs(qdev) +#else +#define QL_DUMP_REGS(qdev) +#define QL_DUMP_ROUTE(qdev) +#define QL_DUMP_XGMAC_CONTROL_REGS(qdev) +#endif + +#ifdef QL_STAT_DUMP +void ql_dump_stat(struct ql_adapter *qdev); +#define 
QL_DUMP_STAT(qdev) ql_dump_stat(qdev) +#else +#define QL_DUMP_STAT(qdev) +#endif + +#ifdef QL_DEV_DUMP +void ql_dump_qdev(struct ql_adapter *qdev); +#define QL_DUMP_QDEV(qdev) ql_dump_qdev(qdev) +#else +#define QL_DUMP_QDEV(qdev) +#endif + +#ifdef QL_CB_DUMP +void ql_dump_wqicb(struct wqicb *wqicb); +void ql_dump_tx_ring(struct tx_ring *tx_ring); +void ql_dump_ricb(struct ricb *ricb); +void ql_dump_cqicb(struct cqicb *cqicb); +void ql_dump_rx_ring(struct rx_ring *rx_ring); +void ql_dump_hw_cb(struct ql_adapter *qdev, int size, u32 bit, u16 q_id); +#define QL_DUMP_RICB(ricb) ql_dump_ricb(ricb) +#define QL_DUMP_WQICB(wqicb) ql_dump_wqicb(wqicb) +#define QL_DUMP_TX_RING(tx_ring) ql_dump_tx_ring(tx_ring) +#define QL_DUMP_CQICB(cqicb) ql_dump_cqicb(cqicb) +#define QL_DUMP_RX_RING(rx_ring) ql_dump_rx_ring(rx_ring) +#define QL_DUMP_HW_CB(qdev, size, bit, q_id) \ + ql_dump_hw_cb(qdev, size, bit, q_id) +#else +#define QL_DUMP_RICB(ricb) +#define QL_DUMP_WQICB(wqicb) +#define QL_DUMP_TX_RING(tx_ring) +#define QL_DUMP_CQICB(cqicb) +#define QL_DUMP_RX_RING(rx_ring) +#define QL_DUMP_HW_CB(qdev, size, bit, q_id) +#endif + +#ifdef QL_OB_DUMP +void ql_dump_tx_desc(struct tx_buf_desc *tbd); +void ql_dump_ob_mac_iocb(struct ob_mac_iocb_req *ob_mac_iocb); +void ql_dump_ob_mac_rsp(struct ob_mac_iocb_rsp *ob_mac_rsp); +#define QL_DUMP_OB_MAC_IOCB(ob_mac_iocb) ql_dump_ob_mac_iocb(ob_mac_iocb) +#define QL_DUMP_OB_MAC_RSP(ob_mac_rsp) ql_dump_ob_mac_rsp(ob_mac_rsp) +#else +#define QL_DUMP_OB_MAC_IOCB(ob_mac_iocb) +#define QL_DUMP_OB_MAC_RSP(ob_mac_rsp) +#endif + +#ifdef QL_IB_DUMP +void ql_dump_ib_mac_rsp(struct ib_mac_iocb_rsp *ib_mac_rsp); +#define QL_DUMP_IB_MAC_RSP(ib_mac_rsp) ql_dump_ib_mac_rsp(ib_mac_rsp) +#else +#define QL_DUMP_IB_MAC_RSP(ib_mac_rsp) +#endif + +#ifdef QL_ALL_DUMP +void ql_dump_all(struct ql_adapter *qdev); +#define QL_DUMP_ALL(qdev) ql_dump_all(qdev) +#else +#define QL_DUMP_ALL(qdev) +#endif + +#endif /* _QLGE_H_ */ diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c new file mode 100644 index 0000000..31389ab --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c @@ -0,0 +1,2024 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include + +#include "qlge.h" + +/* Read a NIC register from the alternate function. */ +static u32 ql_read_other_func_reg(struct ql_adapter *qdev, + u32 reg) +{ + u32 register_to_read; + u32 reg_val; + unsigned int status = 0; + + register_to_read = MPI_NIC_REG_BLOCK + | MPI_NIC_READ + | (qdev->alt_func << MPI_NIC_FUNCTION_SHIFT) + | reg; + status = ql_read_mpi_reg(qdev, register_to_read, ®_val); + if (status != 0) + return 0xffffffff; + + return reg_val; +} + +/* Write a NIC register from the alternate function. 
*/ +static int ql_write_other_func_reg(struct ql_adapter *qdev, + u32 reg, u32 reg_val) +{ + u32 register_to_read; + int status = 0; + + register_to_read = MPI_NIC_REG_BLOCK + | MPI_NIC_READ + | (qdev->alt_func << MPI_NIC_FUNCTION_SHIFT) + | reg; + status = ql_write_mpi_reg(qdev, register_to_read, reg_val); + + return status; +} + +static int ql_wait_other_func_reg_rdy(struct ql_adapter *qdev, u32 reg, + u32 bit, u32 err_bit) +{ + u32 temp; + int count = 10; + + while (count) { + temp = ql_read_other_func_reg(qdev, reg); + + /* check for errors */ + if (temp & err_bit) + return -1; + else if (temp & bit) + return 0; + mdelay(10); + count--; + } + return -1; +} + +static int ql_read_other_func_serdes_reg(struct ql_adapter *qdev, u32 reg, + u32 *data) +{ + int status; + + /* wait for reg to come ready */ + status = ql_wait_other_func_reg_rdy(qdev, XG_SERDES_ADDR / 4, + XG_SERDES_ADDR_RDY, 0); + if (status) + goto exit; + + /* set up for reg read */ + ql_write_other_func_reg(qdev, XG_SERDES_ADDR/4, reg | PROC_ADDR_R); + + /* wait for reg to come ready */ + status = ql_wait_other_func_reg_rdy(qdev, XG_SERDES_ADDR / 4, + XG_SERDES_ADDR_RDY, 0); + if (status) + goto exit; + + /* get the data */ + *data = ql_read_other_func_reg(qdev, (XG_SERDES_DATA / 4)); +exit: + return status; +} + +/* Read out the SERDES registers */ +static int ql_read_serdes_reg(struct ql_adapter *qdev, u32 reg, u32 *data) +{ + int status; + + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, XG_SERDES_ADDR, XG_SERDES_ADDR_RDY, 0); + if (status) + goto exit; + + /* set up for reg read */ + ql_write32(qdev, XG_SERDES_ADDR, reg | PROC_ADDR_R); + + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, XG_SERDES_ADDR, XG_SERDES_ADDR_RDY, 0); + if (status) + goto exit; + + /* get the data */ + *data = ql_read32(qdev, XG_SERDES_DATA); +exit: + return status; +} + +static void ql_get_both_serdes(struct ql_adapter *qdev, u32 addr, + u32 *direct_ptr, u32 *indirect_ptr, + unsigned int direct_valid, unsigned int indirect_valid) +{ + unsigned int status; + + status = 1; + if (direct_valid) + status = ql_read_serdes_reg(qdev, addr, direct_ptr); + /* Dead fill any failures or invalids. */ + if (status) + *direct_ptr = 0xDEADBEEF; + + status = 1; + if (indirect_valid) + status = ql_read_other_func_serdes_reg( + qdev, addr, indirect_ptr); + /* Dead fill any failures or invalids. */ + if (status) + *indirect_ptr = 0xDEADBEEF; +} + +static int ql_get_serdes_regs(struct ql_adapter *qdev, + struct ql_mpi_coredump *mpi_coredump) +{ + int status; + unsigned int xfi_direct_valid, xfi_indirect_valid, xaui_direct_valid; + unsigned int xaui_indirect_valid, i; + u32 *direct_ptr, temp; + u32 *indirect_ptr; + + xfi_direct_valid = xfi_indirect_valid = 0; + xaui_direct_valid = xaui_indirect_valid = 1; + + /* The XAUI needs to be read out per port */ + status = ql_read_other_func_serdes_reg(qdev, + XG_SERDES_XAUI_HSS_PCS_START, &temp); + if (status) + temp = XG_SERDES_ADDR_XAUI_PWR_DOWN; + + if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) == + XG_SERDES_ADDR_XAUI_PWR_DOWN) + xaui_indirect_valid = 0; + + status = ql_read_serdes_reg(qdev, XG_SERDES_XAUI_HSS_PCS_START, &temp); + + if (status) + temp = XG_SERDES_ADDR_XAUI_PWR_DOWN; + + if ((temp & XG_SERDES_ADDR_XAUI_PWR_DOWN) == + XG_SERDES_ADDR_XAUI_PWR_DOWN) + xaui_direct_valid = 0; + + /* + * XFI register is shared so only need to read one + * functions and then check the bits. 
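+	 * (XG_SERDES_ADDR_XFI1_PWR_UP and XG_SERDES_ADDR_XFI2_PWR_UP in the
+	 * XG_SERDES_ADDR_STS word tell which port's XFI lanes are powered up.)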
+ */ + status = ql_read_serdes_reg(qdev, XG_SERDES_ADDR_STS, &temp); + if (status) + temp = 0; + + if ((temp & XG_SERDES_ADDR_XFI1_PWR_UP) == + XG_SERDES_ADDR_XFI1_PWR_UP) { + /* now see if i'm NIC 1 or NIC 2 */ + if (qdev->func & 1) + /* I'm NIC 2, so the indirect (NIC1) xfi is up. */ + xfi_indirect_valid = 1; + else + xfi_direct_valid = 1; + } + if ((temp & XG_SERDES_ADDR_XFI2_PWR_UP) == + XG_SERDES_ADDR_XFI2_PWR_UP) { + /* now see if i'm NIC 1 or NIC 2 */ + if (qdev->func & 1) + /* I'm NIC 2, so the indirect (NIC1) xfi is up. */ + xfi_direct_valid = 1; + else + xfi_indirect_valid = 1; + } + + /* Get XAUI_AN register block. */ + if (qdev->func & 1) { + /* Function 2 is direct */ + direct_ptr = mpi_coredump->serdes2_xaui_an; + indirect_ptr = mpi_coredump->serdes_xaui_an; + } else { + /* Function 1 is direct */ + direct_ptr = mpi_coredump->serdes_xaui_an; + indirect_ptr = mpi_coredump->serdes2_xaui_an; + } + + for (i = 0; i <= 0x000000034; i += 4, direct_ptr++, indirect_ptr++) + ql_get_both_serdes(qdev, i, direct_ptr, indirect_ptr, + xaui_direct_valid, xaui_indirect_valid); + + /* Get XAUI_HSS_PCS register block. */ + if (qdev->func & 1) { + direct_ptr = + mpi_coredump->serdes2_xaui_hss_pcs; + indirect_ptr = + mpi_coredump->serdes_xaui_hss_pcs; + } else { + direct_ptr = + mpi_coredump->serdes_xaui_hss_pcs; + indirect_ptr = + mpi_coredump->serdes2_xaui_hss_pcs; + } + + for (i = 0x800; i <= 0x880; i += 4, direct_ptr++, indirect_ptr++) + ql_get_both_serdes(qdev, i, direct_ptr, indirect_ptr, + xaui_direct_valid, xaui_indirect_valid); + + /* Get XAUI_XFI_AN register block. */ + if (qdev->func & 1) { + direct_ptr = mpi_coredump->serdes2_xfi_an; + indirect_ptr = mpi_coredump->serdes_xfi_an; + } else { + direct_ptr = mpi_coredump->serdes_xfi_an; + indirect_ptr = mpi_coredump->serdes2_xfi_an; + } + + for (i = 0x1000; i <= 0x1034; i += 4, direct_ptr++, indirect_ptr++) + ql_get_both_serdes(qdev, i, direct_ptr, indirect_ptr, + xfi_direct_valid, xfi_indirect_valid); + + /* Get XAUI_XFI_TRAIN register block. */ + if (qdev->func & 1) { + direct_ptr = mpi_coredump->serdes2_xfi_train; + indirect_ptr = + mpi_coredump->serdes_xfi_train; + } else { + direct_ptr = mpi_coredump->serdes_xfi_train; + indirect_ptr = + mpi_coredump->serdes2_xfi_train; + } + + for (i = 0x1050; i <= 0x107c; i += 4, direct_ptr++, indirect_ptr++) + ql_get_both_serdes(qdev, i, direct_ptr, indirect_ptr, + xfi_direct_valid, xfi_indirect_valid); + + /* Get XAUI_XFI_HSS_PCS register block. */ + if (qdev->func & 1) { + direct_ptr = + mpi_coredump->serdes2_xfi_hss_pcs; + indirect_ptr = + mpi_coredump->serdes_xfi_hss_pcs; + } else { + direct_ptr = + mpi_coredump->serdes_xfi_hss_pcs; + indirect_ptr = + mpi_coredump->serdes2_xfi_hss_pcs; + } + + for (i = 0x1800; i <= 0x1838; i += 4, direct_ptr++, indirect_ptr++) + ql_get_both_serdes(qdev, i, direct_ptr, indirect_ptr, + xfi_direct_valid, xfi_indirect_valid); + + /* Get XAUI_XFI_HSS_TX register block. */ + if (qdev->func & 1) { + direct_ptr = + mpi_coredump->serdes2_xfi_hss_tx; + indirect_ptr = + mpi_coredump->serdes_xfi_hss_tx; + } else { + direct_ptr = mpi_coredump->serdes_xfi_hss_tx; + indirect_ptr = + mpi_coredump->serdes2_xfi_hss_tx; + } + for (i = 0x1c00; i <= 0x1c1f; i++, direct_ptr++, indirect_ptr++) + ql_get_both_serdes(qdev, i, direct_ptr, indirect_ptr, + xfi_direct_valid, xfi_indirect_valid); + + /* Get XAUI_XFI_HSS_RX register block. 
*/ + if (qdev->func & 1) { + direct_ptr = + mpi_coredump->serdes2_xfi_hss_rx; + indirect_ptr = + mpi_coredump->serdes_xfi_hss_rx; + } else { + direct_ptr = mpi_coredump->serdes_xfi_hss_rx; + indirect_ptr = + mpi_coredump->serdes2_xfi_hss_rx; + } + + for (i = 0x1c40; i <= 0x1c5f; i++, direct_ptr++, indirect_ptr++) + ql_get_both_serdes(qdev, i, direct_ptr, indirect_ptr, + xfi_direct_valid, xfi_indirect_valid); + + + /* Get XAUI_XFI_HSS_PLL register block. */ + if (qdev->func & 1) { + direct_ptr = + mpi_coredump->serdes2_xfi_hss_pll; + indirect_ptr = + mpi_coredump->serdes_xfi_hss_pll; + } else { + direct_ptr = + mpi_coredump->serdes_xfi_hss_pll; + indirect_ptr = + mpi_coredump->serdes2_xfi_hss_pll; + } + for (i = 0x1e00; i <= 0x1e1f; i++, direct_ptr++, indirect_ptr++) + ql_get_both_serdes(qdev, i, direct_ptr, indirect_ptr, + xfi_direct_valid, xfi_indirect_valid); + return 0; +} + +static int ql_read_other_func_xgmac_reg(struct ql_adapter *qdev, u32 reg, + u32 *data) +{ + int status = 0; + + /* wait for reg to come ready */ + status = ql_wait_other_func_reg_rdy(qdev, XGMAC_ADDR / 4, + XGMAC_ADDR_RDY, XGMAC_ADDR_XME); + if (status) + goto exit; + + /* set up for reg read */ + ql_write_other_func_reg(qdev, XGMAC_ADDR / 4, reg | XGMAC_ADDR_R); + + /* wait for reg to come ready */ + status = ql_wait_other_func_reg_rdy(qdev, XGMAC_ADDR / 4, + XGMAC_ADDR_RDY, XGMAC_ADDR_XME); + if (status) + goto exit; + + /* get the data */ + *data = ql_read_other_func_reg(qdev, XGMAC_DATA / 4); +exit: + return status; +} + +/* Read the 400 xgmac control/statistics registers + * skipping unused locations. + */ +static int ql_get_xgmac_regs(struct ql_adapter *qdev, u32 *buf, + unsigned int other_function) +{ + int status = 0; + int i; + + for (i = PAUSE_SRC_LO; i < XGMAC_REGISTER_END; i += 4, buf++) { + /* We're reading 400 xgmac registers, but we filter out + * serveral locations that are non-responsive to reads. 
+	 */
+		if ((i == 0x00000114) ||
+		    (i == 0x00000118) ||
+		    (i == 0x0000013c) ||
+		    (i == 0x00000140) ||
+		    (i > 0x00000150 && i < 0x000001fc) ||
+		    (i > 0x00000278 && i < 0x000002a0) ||
+		    (i > 0x000002c0 && i < 0x000002cf) ||
+		    (i > 0x000002dc && i < 0x000002f0) ||
+		    (i > 0x000003c8 && i < 0x00000400) ||
+		    (i > 0x00000400 && i < 0x00000410) ||
+		    (i > 0x00000410 && i < 0x00000420) ||
+		    (i > 0x00000420 && i < 0x00000430) ||
+		    (i > 0x00000430 && i < 0x00000440) ||
+		    (i > 0x00000440 && i < 0x00000450) ||
+		    (i > 0x00000450 && i < 0x00000500) ||
+		    (i > 0x0000054c && i < 0x00000568) ||
+		    (i > 0x000005c8 && i < 0x00000600))
+			continue;
+
+		if (other_function)
+			status = ql_read_other_func_xgmac_reg(qdev, i, buf);
+		else
+			status = ql_read_xgmac_reg(qdev, i, buf);
+
+		/* Dead-fill the word and stop dumping this block on a
+		 * failed read.
+		 */
+		if (status) {
+			*buf = 0xdeadbeef;
+			break;
+		}
+	}
+	return status;
+}
+
+static int ql_get_ets_regs(struct ql_adapter *qdev, u32 *buf)
+{
+	int status = 0;
+	int i;
+
+	for (i = 0; i < 8; i++, buf++) {
+		ql_write32(qdev, NIC_ETS, i << 29 | 0x08000000);
+		*buf = ql_read32(qdev, NIC_ETS);
+	}
+
+	for (i = 0; i < 2; i++, buf++) {
+		ql_write32(qdev, CNA_ETS, i << 29 | 0x08000000);
+		*buf = ql_read32(qdev, CNA_ETS);
+	}
+
+	return status;
+}
+
+static void ql_get_intr_states(struct ql_adapter *qdev, u32 *buf)
+{
+	int i;
+
+	for (i = 0; i < qdev->rx_ring_count; i++, buf++) {
+		ql_write32(qdev, INTR_EN,
+			   qdev->intr_context[i].intr_read_mask);
+		*buf = ql_read32(qdev, INTR_EN);
+	}
+}
+
+static int ql_get_cam_entries(struct ql_adapter *qdev, u32 *buf)
+{
+	int i, status;
+	u32 value[3];
+
+	status = ql_sem_spinlock(qdev, SEM_MAC_ADDR_MASK);
+	if (status)
+		return status;
+
+	for (i = 0; i < 16; i++) {
+		status = ql_get_mac_addr_reg(qdev,
+					     MAC_ADDR_TYPE_CAM_MAC, i, value);
+		if (status) {
+			netif_err(qdev, drv, qdev->ndev,
+				  "Failed read of mac index register\n");
+			goto err;
+		}
+		*buf++ = value[0];	/* lower MAC address */
+		*buf++ = value[1];	/* upper MAC address */
+		*buf++ = value[2];	/* output */
+	}
+	for (i = 0; i < 32; i++) {
+		status = ql_get_mac_addr_reg(qdev,
+					     MAC_ADDR_TYPE_MULTI_MAC, i, value);
+		if (status) {
+			netif_err(qdev, drv, qdev->ndev,
+				  "Failed read of mac index register\n");
+			goto err;
+		}
+		*buf++ = value[0];	/* lower Mcast address */
+		*buf++ = value[1];	/* upper Mcast address */
+	}
+err:
+	ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK);
+	return status;
+}
+
+static int ql_get_routing_entries(struct ql_adapter *qdev, u32 *buf)
+{
+	int status;
+	u32 value, i;
+
+	status = ql_sem_spinlock(qdev, SEM_RT_IDX_MASK);
+	if (status)
+		return status;
+
+	for (i = 0; i < 16; i++) {
+		status = ql_get_routing_reg(qdev, i, &value);
+		if (status) {
+			netif_err(qdev, drv, qdev->ndev,
+				  "Failed read of routing index register\n");
+			goto err;
+		} else {
+			*buf++ = value;
+		}
+	}
+err:
+	ql_sem_unlock(qdev, SEM_RT_IDX_MASK);
+	return status;
+}
+
+/* Read the MPI Processor shadow registers */
+static int ql_get_mpi_shadow_regs(struct ql_adapter *qdev, u32 *buf)
+{
+	u32 i;
+	int status;
+
+	for (i = 0; i < MPI_CORE_SH_REGS_CNT; i++, buf++) {
+		status = ql_write_mpi_reg(qdev, RISC_124,
+					  (SHADOW_OFFSET | i << SHADOW_REG_SHIFT));
+		if (status)
+			goto end;
+		status = ql_read_mpi_reg(qdev, RISC_127, buf);
+		if (status)
+			goto end;
+	}
+end:
+	return status;
+}
+
+/* Read the MPI Processor core registers */
+static int ql_get_mpi_regs(struct ql_adapter *qdev, u32 *buf,
+			   u32 offset, u32 count)
+{
+	int i, status = 0;
+
+	for (i = 0; i < count; i++, buf++) {
+		status = ql_read_mpi_reg(qdev, offset + i, buf);
+		if (status)
+			return status;
+	}
+	return status;
+} + +/* Read the ASIC probe dump */ +static unsigned int *ql_get_probe(struct ql_adapter *qdev, u32 clock, + u32 valid, u32 *buf) +{ + u32 module, mux_sel, probe, lo_val, hi_val; + + for (module = 0; module < PRB_MX_ADDR_MAX_MODS; module++) { + if (!((valid >> module) & 1)) + continue; + for (mux_sel = 0; mux_sel < PRB_MX_ADDR_MAX_MUX; mux_sel++) { + probe = clock + | PRB_MX_ADDR_ARE + | mux_sel + | (module << PRB_MX_ADDR_MOD_SEL_SHIFT); + ql_write32(qdev, PRB_MX_ADDR, probe); + lo_val = ql_read32(qdev, PRB_MX_DATA); + if (mux_sel == 0) { + *buf = probe; + buf++; + } + probe |= PRB_MX_ADDR_UP; + ql_write32(qdev, PRB_MX_ADDR, probe); + hi_val = ql_read32(qdev, PRB_MX_DATA); + *buf = lo_val; + buf++; + *buf = hi_val; + buf++; + } + } + return buf; +} + +static int ql_get_probe_dump(struct ql_adapter *qdev, unsigned int *buf) +{ + /* First we have to enable the probe mux */ + ql_write_mpi_reg(qdev, MPI_TEST_FUNC_PRB_CTL, MPI_TEST_FUNC_PRB_EN); + buf = ql_get_probe(qdev, PRB_MX_ADDR_SYS_CLOCK, + PRB_MX_ADDR_VALID_SYS_MOD, buf); + buf = ql_get_probe(qdev, PRB_MX_ADDR_PCI_CLOCK, + PRB_MX_ADDR_VALID_PCI_MOD, buf); + buf = ql_get_probe(qdev, PRB_MX_ADDR_XGM_CLOCK, + PRB_MX_ADDR_VALID_XGM_MOD, buf); + buf = ql_get_probe(qdev, PRB_MX_ADDR_FC_CLOCK, + PRB_MX_ADDR_VALID_FC_MOD, buf); + return 0; + +} + +/* Read out the routing index registers */ +static int ql_get_routing_index_registers(struct ql_adapter *qdev, u32 *buf) +{ + int status; + u32 type, index, index_max; + u32 result_index; + u32 result_data; + u32 val; + + status = ql_sem_spinlock(qdev, SEM_RT_IDX_MASK); + if (status) + return status; + + for (type = 0; type < 4; type++) { + if (type < 2) + index_max = 8; + else + index_max = 16; + for (index = 0; index < index_max; index++) { + val = RT_IDX_RS + | (type << RT_IDX_TYPE_SHIFT) + | (index << RT_IDX_IDX_SHIFT); + ql_write32(qdev, RT_IDX, val); + result_index = 0; + while ((result_index & RT_IDX_MR) == 0) + result_index = ql_read32(qdev, RT_IDX); + result_data = ql_read32(qdev, RT_DATA); + *buf = type; + buf++; + *buf = index; + buf++; + *buf = result_index; + buf++; + *buf = result_data; + buf++; + } + } + ql_sem_unlock(qdev, SEM_RT_IDX_MASK); + return status; +} + +/* Read out the MAC protocol registers */ +static void ql_get_mac_protocol_registers(struct ql_adapter *qdev, u32 *buf) +{ + u32 result_index, result_data; + u32 type; + u32 index; + u32 offset; + u32 val; + u32 initial_val = MAC_ADDR_RS; + u32 max_index; + u32 max_offset; + + for (type = 0; type < MAC_ADDR_TYPE_COUNT; type++) { + switch (type) { + + case 0: /* CAM */ + initial_val |= MAC_ADDR_ADR; + max_index = MAC_ADDR_MAX_CAM_ENTRIES; + max_offset = MAC_ADDR_MAX_CAM_WCOUNT; + break; + case 1: /* Multicast MAC Address */ + max_index = MAC_ADDR_MAX_CAM_WCOUNT; + max_offset = MAC_ADDR_MAX_CAM_WCOUNT; + break; + case 2: /* VLAN filter mask */ + case 3: /* MC filter mask */ + max_index = MAC_ADDR_MAX_CAM_WCOUNT; + max_offset = MAC_ADDR_MAX_CAM_WCOUNT; + break; + case 4: /* FC MAC addresses */ + max_index = MAC_ADDR_MAX_FC_MAC_ENTRIES; + max_offset = MAC_ADDR_MAX_FC_MAC_WCOUNT; + break; + case 5: /* Mgmt MAC addresses */ + max_index = MAC_ADDR_MAX_MGMT_MAC_ENTRIES; + max_offset = MAC_ADDR_MAX_MGMT_MAC_WCOUNT; + break; + case 6: /* Mgmt VLAN addresses */ + max_index = MAC_ADDR_MAX_MGMT_VLAN_ENTRIES; + max_offset = MAC_ADDR_MAX_MGMT_VLAN_WCOUNT; + break; + case 7: /* Mgmt IPv4 address */ + max_index = MAC_ADDR_MAX_MGMT_V4_ENTRIES; + max_offset = MAC_ADDR_MAX_MGMT_V4_WCOUNT; + break; + case 8: /* Mgmt IPv6 address */ + max_index = 
MAC_ADDR_MAX_MGMT_V6_ENTRIES; + max_offset = MAC_ADDR_MAX_MGMT_V6_WCOUNT; + break; + case 9: /* Mgmt TCP/UDP Dest port */ + max_index = MAC_ADDR_MAX_MGMT_TU_DP_ENTRIES; + max_offset = MAC_ADDR_MAX_MGMT_TU_DP_WCOUNT; + break; + default: + pr_err("Bad type!!! 0x%08x\n", type); + max_index = 0; + max_offset = 0; + break; + } + for (index = 0; index < max_index; index++) { + for (offset = 0; offset < max_offset; offset++) { + val = initial_val + | (type << MAC_ADDR_TYPE_SHIFT) + | (index << MAC_ADDR_IDX_SHIFT) + | (offset); + ql_write32(qdev, MAC_ADDR_IDX, val); + result_index = 0; + while ((result_index & MAC_ADDR_MR) == 0) { + result_index = ql_read32(qdev, + MAC_ADDR_IDX); + } + result_data = ql_read32(qdev, MAC_ADDR_DATA); + *buf = result_index; + buf++; + *buf = result_data; + buf++; + } + } + } +} + +static void ql_get_sem_registers(struct ql_adapter *qdev, u32 *buf) +{ + u32 func_num, reg, reg_val; + int status; + + for (func_num = 0; func_num < MAX_SEMAPHORE_FUNCTIONS ; func_num++) { + reg = MPI_NIC_REG_BLOCK + | (func_num << MPI_NIC_FUNCTION_SHIFT) + | (SEM / 4); + status = ql_read_mpi_reg(qdev, reg, ®_val); + *buf = reg_val; + /* if the read failed then dead fill the element. */ + if (!status) + *buf = 0xdeadbeef; + buf++; + } +} + +/* Create a coredump segment header */ +static void ql_build_coredump_seg_header( + struct mpi_coredump_segment_header *seg_hdr, + u32 seg_number, u32 seg_size, u8 *desc) +{ + memset(seg_hdr, 0, sizeof(struct mpi_coredump_segment_header)); + seg_hdr->cookie = MPI_COREDUMP_COOKIE; + seg_hdr->segNum = seg_number; + seg_hdr->segSize = seg_size; + strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); +} + +/* + * This function should be called when a coredump / probedump + * is to be extracted from the HBA. It is assumed there is a + * qdev structure that contains the base address of the register + * space for this function as well as a coredump structure that + * will contain the dump. + */ +int ql_core_dump(struct ql_adapter *qdev, struct ql_mpi_coredump *mpi_coredump) +{ + int status; + int i; + + if (!mpi_coredump) { + netif_err(qdev, drv, qdev->ndev, "No memory allocated\n"); + return -EINVAL; + } + + /* Try to get the spinlock, but dont worry if + * it isn't available. If the firmware died it + * might be holding the sem. + */ + ql_sem_spinlock(qdev, SEM_PROC_REG_MASK); + + status = ql_pause_mpi_risc(qdev); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Failed RISC pause. Status = 0x%.08x\n", status); + goto err; + } + + /* Insert the global header */ + memset(&(mpi_coredump->mpi_global_header), 0, + sizeof(struct mpi_coredump_global_header)); + mpi_coredump->mpi_global_header.cookie = MPI_COREDUMP_COOKIE; + mpi_coredump->mpi_global_header.headerSize = + sizeof(struct mpi_coredump_global_header); + mpi_coredump->mpi_global_header.imageSize = + sizeof(struct ql_mpi_coredump); + strncpy(mpi_coredump->mpi_global_header.idString, "MPI Coredump", + sizeof(mpi_coredump->mpi_global_header.idString)); + + /* Get generic NIC reg dump */ + ql_build_coredump_seg_header(&mpi_coredump->nic_regs_seg_hdr, + NIC1_CONTROL_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->nic_regs), "NIC1 Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->nic2_regs_seg_hdr, + NIC2_CONTROL_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->nic2_regs), "NIC2 Registers"); + + /* Get XGMac registers. (Segment 18, Rev C. 
step 21) */ + ql_build_coredump_seg_header(&mpi_coredump->xgmac1_seg_hdr, + NIC1_XGMAC_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->xgmac1), "NIC1 XGMac Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xgmac2_seg_hdr, + NIC2_XGMAC_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->xgmac2), "NIC2 XGMac Registers"); + + if (qdev->func & 1) { + /* Odd means our function is NIC 2 */ + for (i = 0; i < NIC_REGS_DUMP_WORD_COUNT; i++) + mpi_coredump->nic2_regs[i] = + ql_read32(qdev, i * sizeof(u32)); + + for (i = 0; i < NIC_REGS_DUMP_WORD_COUNT; i++) + mpi_coredump->nic_regs[i] = + ql_read_other_func_reg(qdev, (i * sizeof(u32)) / 4); + + ql_get_xgmac_regs(qdev, &mpi_coredump->xgmac2[0], 0); + ql_get_xgmac_regs(qdev, &mpi_coredump->xgmac1[0], 1); + } else { + /* Even means our function is NIC 1 */ + for (i = 0; i < NIC_REGS_DUMP_WORD_COUNT; i++) + mpi_coredump->nic_regs[i] = + ql_read32(qdev, i * sizeof(u32)); + for (i = 0; i < NIC_REGS_DUMP_WORD_COUNT; i++) + mpi_coredump->nic2_regs[i] = + ql_read_other_func_reg(qdev, (i * sizeof(u32)) / 4); + + ql_get_xgmac_regs(qdev, &mpi_coredump->xgmac1[0], 0); + ql_get_xgmac_regs(qdev, &mpi_coredump->xgmac2[0], 1); + } + + /* Rev C. Step 20a */ + ql_build_coredump_seg_header(&mpi_coredump->xaui_an_hdr, + XAUI_AN_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes_xaui_an), + "XAUI AN Registers"); + + /* Rev C. Step 20b */ + ql_build_coredump_seg_header(&mpi_coredump->xaui_hss_pcs_hdr, + XAUI_HSS_PCS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes_xaui_hss_pcs), + "XAUI HSS PCS Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi_an_hdr, XFI_AN_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes_xfi_an), + "XFI AN Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi_train_hdr, + XFI_TRAIN_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes_xfi_train), + "XFI TRAIN Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi_hss_pcs_hdr, + XFI_HSS_PCS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes_xfi_hss_pcs), + "XFI HSS PCS Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi_hss_tx_hdr, + XFI_HSS_TX_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes_xfi_hss_tx), + "XFI HSS TX Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi_hss_rx_hdr, + XFI_HSS_RX_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes_xfi_hss_rx), + "XFI HSS RX Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi_hss_pll_hdr, + XFI_HSS_PLL_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes_xfi_hss_pll), + "XFI HSS PLL Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xaui2_an_hdr, + XAUI2_AN_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes2_xaui_an), + "XAUI2 AN Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xaui2_hss_pcs_hdr, + XAUI2_HSS_PCS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes2_xaui_hss_pcs), + "XAUI2 HSS PCS Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi2_an_hdr, + XFI2_AN_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes2_xfi_an), + "XFI2 AN Registers"); + + 
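+	/* The remaining XFI2 headers describe the second NIC function's
+	 * serdes blocks; the data itself is gathered below by
+	 * ql_get_serdes_regs(), which dead-fills 0xDEADBEEF for any block
+	 * that is powered down or cannot be read.
+	 */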
ql_build_coredump_seg_header(&mpi_coredump->xfi2_train_hdr, + XFI2_TRAIN_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes2_xfi_train), + "XFI2 TRAIN Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi2_hss_pcs_hdr, + XFI2_HSS_PCS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes2_xfi_hss_pcs), + "XFI2 HSS PCS Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi2_hss_tx_hdr, + XFI2_HSS_TX_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes2_xfi_hss_tx), + "XFI2 HSS TX Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi2_hss_rx_hdr, + XFI2_HSS_RX_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes2_xfi_hss_rx), + "XFI2 HSS RX Registers"); + + ql_build_coredump_seg_header(&mpi_coredump->xfi2_hss_pll_hdr, + XFI2_HSS_PLL_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->serdes2_xfi_hss_pll), + "XFI2 HSS PLL Registers"); + + status = ql_get_serdes_regs(qdev, mpi_coredump); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Failed Dump of Serdes Registers. Status = 0x%.08x\n", + status); + goto err; + } + + ql_build_coredump_seg_header(&mpi_coredump->core_regs_seg_hdr, + CORE_SEG_NUM, + sizeof(mpi_coredump->core_regs_seg_hdr) + + sizeof(mpi_coredump->mpi_core_regs) + + sizeof(mpi_coredump->mpi_core_sh_regs), + "Core Registers"); + + /* Get the MPI Core Registers */ + status = ql_get_mpi_regs(qdev, &mpi_coredump->mpi_core_regs[0], + MPI_CORE_REGS_ADDR, MPI_CORE_REGS_CNT); + if (status) + goto err; + /* Get the 16 MPI shadow registers */ + status = ql_get_mpi_shadow_regs(qdev, + &mpi_coredump->mpi_core_sh_regs[0]); + if (status) + goto err; + + /* Get the Test Logic Registers */ + ql_build_coredump_seg_header(&mpi_coredump->test_logic_regs_seg_hdr, + TEST_LOGIC_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->test_logic_regs), + "Test Logic Regs"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->test_logic_regs[0], + TEST_REGS_ADDR, TEST_REGS_CNT); + if (status) + goto err; + + /* Get the RMII Registers */ + ql_build_coredump_seg_header(&mpi_coredump->rmii_regs_seg_hdr, + RMII_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->rmii_regs), + "RMII Registers"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->rmii_regs[0], + RMII_REGS_ADDR, RMII_REGS_CNT); + if (status) + goto err; + + /* Get the FCMAC1 Registers */ + ql_build_coredump_seg_header(&mpi_coredump->fcmac1_regs_seg_hdr, + FCMAC1_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->fcmac1_regs), + "FCMAC1 Registers"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->fcmac1_regs[0], + FCMAC1_REGS_ADDR, FCMAC_REGS_CNT); + if (status) + goto err; + + /* Get the FCMAC2 Registers */ + + ql_build_coredump_seg_header(&mpi_coredump->fcmac2_regs_seg_hdr, + FCMAC2_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->fcmac2_regs), + "FCMAC2 Registers"); + + status = ql_get_mpi_regs(qdev, &mpi_coredump->fcmac2_regs[0], + FCMAC2_REGS_ADDR, FCMAC_REGS_CNT); + if (status) + goto err; + + /* Get the FC1 MBX Registers */ + ql_build_coredump_seg_header(&mpi_coredump->fc1_mbx_regs_seg_hdr, + FC1_MBOX_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->fc1_mbx_regs), + "FC1 MBox Regs"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->fc1_mbx_regs[0], + FC1_MBX_REGS_ADDR, FC_MBX_REGS_CNT); + if (status) + goto err; + + 
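+	/* Each remaining MPI block (IDE, NIC and FC mailboxes, SMBus, I2C,
+	 * MEMC, PBus, MDE) is captured the same way: build its segment
+	 * header, then read the block with ql_get_mpi_regs().
+	 */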
/* Get the IDE Registers */ + ql_build_coredump_seg_header(&mpi_coredump->ide_regs_seg_hdr, + IDE_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->ide_regs), + "IDE Registers"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->ide_regs[0], + IDE_REGS_ADDR, IDE_REGS_CNT); + if (status) + goto err; + + /* Get the NIC1 MBX Registers */ + ql_build_coredump_seg_header(&mpi_coredump->nic1_mbx_regs_seg_hdr, + NIC1_MBOX_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->nic1_mbx_regs), + "NIC1 MBox Regs"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->nic1_mbx_regs[0], + NIC1_MBX_REGS_ADDR, NIC_MBX_REGS_CNT); + if (status) + goto err; + + /* Get the SMBus Registers */ + ql_build_coredump_seg_header(&mpi_coredump->smbus_regs_seg_hdr, + SMBUS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->smbus_regs), + "SMBus Registers"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->smbus_regs[0], + SMBUS_REGS_ADDR, SMBUS_REGS_CNT); + if (status) + goto err; + + /* Get the FC2 MBX Registers */ + ql_build_coredump_seg_header(&mpi_coredump->fc2_mbx_regs_seg_hdr, + FC2_MBOX_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->fc2_mbx_regs), + "FC2 MBox Regs"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->fc2_mbx_regs[0], + FC2_MBX_REGS_ADDR, FC_MBX_REGS_CNT); + if (status) + goto err; + + /* Get the NIC2 MBX Registers */ + ql_build_coredump_seg_header(&mpi_coredump->nic2_mbx_regs_seg_hdr, + NIC2_MBOX_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->nic2_mbx_regs), + "NIC2 MBox Regs"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->nic2_mbx_regs[0], + NIC2_MBX_REGS_ADDR, NIC_MBX_REGS_CNT); + if (status) + goto err; + + /* Get the I2C Registers */ + ql_build_coredump_seg_header(&mpi_coredump->i2c_regs_seg_hdr, + I2C_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->i2c_regs), + "I2C Registers"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->i2c_regs[0], + I2C_REGS_ADDR, I2C_REGS_CNT); + if (status) + goto err; + + /* Get the MEMC Registers */ + ql_build_coredump_seg_header(&mpi_coredump->memc_regs_seg_hdr, + MEMC_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->memc_regs), + "MEMC Registers"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->memc_regs[0], + MEMC_REGS_ADDR, MEMC_REGS_CNT); + if (status) + goto err; + + /* Get the PBus Registers */ + ql_build_coredump_seg_header(&mpi_coredump->pbus_regs_seg_hdr, + PBUS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->pbus_regs), + "PBUS Registers"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->pbus_regs[0], + PBUS_REGS_ADDR, PBUS_REGS_CNT); + if (status) + goto err; + + /* Get the MDE Registers */ + ql_build_coredump_seg_header(&mpi_coredump->mde_regs_seg_hdr, + MDE_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->mde_regs), + "MDE Registers"); + status = ql_get_mpi_regs(qdev, &mpi_coredump->mde_regs[0], + MDE_REGS_ADDR, MDE_REGS_CNT); + if (status) + goto err; + + ql_build_coredump_seg_header(&mpi_coredump->misc_nic_seg_hdr, + MISC_NIC_INFO_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->misc_nic_info), + "MISC NIC INFO"); + mpi_coredump->misc_nic_info.rx_ring_count = qdev->rx_ring_count; + mpi_coredump->misc_nic_info.tx_ring_count = qdev->tx_ring_count; + mpi_coredump->misc_nic_info.intr_count = qdev->intr_count; + mpi_coredump->misc_nic_info.function = qdev->func; + 
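+	/* The segments below capture indexed state (interrupt enables,
+	 * CAM and routing entries, ETS, probe mux, semaphores) while the
+	 * RISC is still paused; the WCS and MEMC RAM segments are dumped
+	 * last, after the RISC has been reset.
+	 */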
+ /* Segment 31 */ + /* Get indexed register values. */ + ql_build_coredump_seg_header(&mpi_coredump->intr_states_seg_hdr, + INTR_STATES_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->intr_states), + "INTR States"); + ql_get_intr_states(qdev, &mpi_coredump->intr_states[0]); + + ql_build_coredump_seg_header(&mpi_coredump->cam_entries_seg_hdr, + CAM_ENTRIES_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->cam_entries), + "CAM Entries"); + status = ql_get_cam_entries(qdev, &mpi_coredump->cam_entries[0]); + if (status) + goto err; + + ql_build_coredump_seg_header(&mpi_coredump->nic_routing_words_seg_hdr, + ROUTING_WORDS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->nic_routing_words), + "Routing Words"); + status = ql_get_routing_entries(qdev, + &mpi_coredump->nic_routing_words[0]); + if (status) + goto err; + + /* Segment 34 (Rev C. step 23) */ + ql_build_coredump_seg_header(&mpi_coredump->ets_seg_hdr, + ETS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->ets), + "ETS Registers"); + status = ql_get_ets_regs(qdev, &mpi_coredump->ets[0]); + if (status) + goto err; + + ql_build_coredump_seg_header(&mpi_coredump->probe_dump_seg_hdr, + PROBE_DUMP_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->probe_dump), + "Probe Dump"); + ql_get_probe_dump(qdev, &mpi_coredump->probe_dump[0]); + + ql_build_coredump_seg_header(&mpi_coredump->routing_reg_seg_hdr, + ROUTING_INDEX_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->routing_regs), + "Routing Regs"); + status = ql_get_routing_index_registers(qdev, + &mpi_coredump->routing_regs[0]); + if (status) + goto err; + + ql_build_coredump_seg_header(&mpi_coredump->mac_prot_reg_seg_hdr, + MAC_PROTOCOL_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->mac_prot_regs), + "MAC Prot Regs"); + ql_get_mac_protocol_registers(qdev, &mpi_coredump->mac_prot_regs[0]); + + /* Get the semaphore registers for all 5 functions */ + ql_build_coredump_seg_header(&mpi_coredump->sem_regs_seg_hdr, + SEM_REGS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->sem_regs), "Sem Registers"); + + ql_get_sem_registers(qdev, &mpi_coredump->sem_regs[0]); + + /* Prevent the mpi restarting while we dump the memory.*/ + ql_write_mpi_reg(qdev, MPI_TEST_FUNC_RST_STS, MPI_TEST_FUNC_RST_FRC); + + /* clear the pause */ + status = ql_unpause_mpi_risc(qdev); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Failed RISC unpause. Status = 0x%.08x\n", status); + goto err; + } + + /* Reset the RISC so we can dump RAM */ + status = ql_hard_reset_mpi_risc(qdev); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Failed RISC reset. Status = 0x%.08x\n", status); + goto err; + } + + ql_build_coredump_seg_header(&mpi_coredump->code_ram_seg_hdr, + WCS_RAM_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->code_ram), + "WCS RAM"); + status = ql_dump_risc_ram_area(qdev, &mpi_coredump->code_ram[0], + CODE_RAM_ADDR, CODE_RAM_CNT); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Failed Dump of CODE RAM. 
Status = 0x%.08x\n", + status); + goto err; + } + + /* Insert the segment header */ + ql_build_coredump_seg_header(&mpi_coredump->memc_ram_seg_hdr, + MEMC_RAM_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->memc_ram), + "MEMC RAM"); + status = ql_dump_risc_ram_area(qdev, &mpi_coredump->memc_ram[0], + MEMC_RAM_ADDR, MEMC_RAM_CNT); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Failed Dump of MEMC RAM. Status = 0x%.08x\n", + status); + goto err; + } +err: + ql_sem_unlock(qdev, SEM_PROC_REG_MASK); /* does flush too */ + return status; + +} + +static void ql_get_core_dump(struct ql_adapter *qdev) +{ + if (!ql_own_firmware(qdev)) { + netif_err(qdev, drv, qdev->ndev, "Don't own firmware!\n"); + return; + } + + if (!netif_running(qdev->ndev)) { + netif_err(qdev, ifup, qdev->ndev, + "Force Coredump can only be done from interface that is up\n"); + return; + } + ql_queue_fw_error(qdev); +} + +static void ql_gen_reg_dump(struct ql_adapter *qdev, + struct ql_reg_dump *mpi_coredump) +{ + int i, status; + + + memset(&(mpi_coredump->mpi_global_header), 0, + sizeof(struct mpi_coredump_global_header)); + mpi_coredump->mpi_global_header.cookie = MPI_COREDUMP_COOKIE; + mpi_coredump->mpi_global_header.headerSize = + sizeof(struct mpi_coredump_global_header); + mpi_coredump->mpi_global_header.imageSize = + sizeof(struct ql_reg_dump); + strncpy(mpi_coredump->mpi_global_header.idString, "MPI Coredump", + sizeof(mpi_coredump->mpi_global_header.idString)); + + + /* segment 16 */ + ql_build_coredump_seg_header(&mpi_coredump->misc_nic_seg_hdr, + MISC_NIC_INFO_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->misc_nic_info), + "MISC NIC INFO"); + mpi_coredump->misc_nic_info.rx_ring_count = qdev->rx_ring_count; + mpi_coredump->misc_nic_info.tx_ring_count = qdev->tx_ring_count; + mpi_coredump->misc_nic_info.intr_count = qdev->intr_count; + mpi_coredump->misc_nic_info.function = qdev->func; + + /* Segment 16, Rev C. Step 18 */ + ql_build_coredump_seg_header(&mpi_coredump->nic_regs_seg_hdr, + NIC1_CONTROL_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->nic_regs), + "NIC Registers"); + /* Get generic reg dump */ + for (i = 0; i < 64; i++) + mpi_coredump->nic_regs[i] = ql_read32(qdev, i * sizeof(u32)); + + /* Segment 31 */ + /* Get indexed register values. */ + ql_build_coredump_seg_header(&mpi_coredump->intr_states_seg_hdr, + INTR_STATES_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->intr_states), + "INTR States"); + ql_get_intr_states(qdev, &mpi_coredump->intr_states[0]); + + ql_build_coredump_seg_header(&mpi_coredump->cam_entries_seg_hdr, + CAM_ENTRIES_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->cam_entries), + "CAM Entries"); + status = ql_get_cam_entries(qdev, &mpi_coredump->cam_entries[0]); + if (status) + return; + + ql_build_coredump_seg_header(&mpi_coredump->nic_routing_words_seg_hdr, + ROUTING_WORDS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->nic_routing_words), + "Routing Words"); + status = ql_get_routing_entries(qdev, + &mpi_coredump->nic_routing_words[0]); + if (status) + return; + + /* Segment 34 (Rev C. 
step 23) */ + ql_build_coredump_seg_header(&mpi_coredump->ets_seg_hdr, + ETS_SEG_NUM, + sizeof(struct mpi_coredump_segment_header) + + sizeof(mpi_coredump->ets), + "ETS Registers"); + status = ql_get_ets_regs(qdev, &mpi_coredump->ets[0]); + if (status) + return; +} + +void ql_get_dump(struct ql_adapter *qdev, void *buff) +{ + /* + * If the dump has already been taken and is stored + * in our internal buffer and if force dump is set then + * just start the spool to dump it to the log file + * and also, take a snapshot of the general regs to + * to the user's buffer or else take complete dump + * to the user's buffer if force is not set. + */ + + if (!test_bit(QL_FRC_COREDUMP, &qdev->flags)) { + if (!ql_core_dump(qdev, buff)) + ql_soft_reset_mpi_risc(qdev); + else + netif_err(qdev, drv, qdev->ndev, "coredump failed!\n"); + } else { + ql_gen_reg_dump(qdev, buff); + ql_get_core_dump(qdev); + } +} + +/* Coredump to messages log file using separate worker thread */ +void ql_mpi_core_to_log(struct work_struct *work) +{ + struct ql_adapter *qdev = + container_of(work, struct ql_adapter, mpi_core_to_log.work); + u32 *tmp, count; + int i; + + count = sizeof(struct ql_mpi_coredump) / sizeof(u32); + tmp = (u32 *)qdev->mpi_coredump; + netif_printk(qdev, drv, KERN_DEBUG, qdev->ndev, + "Core is dumping to log file!\n"); + + for (i = 0; i < count; i += 8) { + pr_err("%.08x: %.08x %.08x %.08x %.08x %.08x " + "%.08x %.08x %.08x\n", i, + tmp[i + 0], + tmp[i + 1], + tmp[i + 2], + tmp[i + 3], + tmp[i + 4], + tmp[i + 5], + tmp[i + 6], + tmp[i + 7]); + msleep(5); + } +} + +#ifdef QL_REG_DUMP +static void ql_dump_intr_states(struct ql_adapter *qdev) +{ + int i; + u32 value; + for (i = 0; i < qdev->intr_count; i++) { + ql_write32(qdev, INTR_EN, qdev->intr_context[i].intr_read_mask); + value = ql_read32(qdev, INTR_EN); + pr_err("%s: Interrupt %d is %s\n", + qdev->ndev->name, i, + (value & INTR_EN_EN ? 
"enabled" : "disabled")); + } +} + +#define DUMP_XGMAC(qdev, reg) \ +do { \ + u32 data; \ + ql_read_xgmac_reg(qdev, reg, &data); \ + pr_err("%s: %s = 0x%.08x\n", qdev->ndev->name, #reg, data); \ +} while (0) + +void ql_dump_xgmac_control_regs(struct ql_adapter *qdev) +{ + if (ql_sem_spinlock(qdev, qdev->xg_sem_mask)) { + pr_err("%s: Couldn't get xgmac sem\n", __func__); + return; + } + DUMP_XGMAC(qdev, PAUSE_SRC_LO); + DUMP_XGMAC(qdev, PAUSE_SRC_HI); + DUMP_XGMAC(qdev, GLOBAL_CFG); + DUMP_XGMAC(qdev, TX_CFG); + DUMP_XGMAC(qdev, RX_CFG); + DUMP_XGMAC(qdev, FLOW_CTL); + DUMP_XGMAC(qdev, PAUSE_OPCODE); + DUMP_XGMAC(qdev, PAUSE_TIMER); + DUMP_XGMAC(qdev, PAUSE_FRM_DEST_LO); + DUMP_XGMAC(qdev, PAUSE_FRM_DEST_HI); + DUMP_XGMAC(qdev, MAC_TX_PARAMS); + DUMP_XGMAC(qdev, MAC_RX_PARAMS); + DUMP_XGMAC(qdev, MAC_SYS_INT); + DUMP_XGMAC(qdev, MAC_SYS_INT_MASK); + DUMP_XGMAC(qdev, MAC_MGMT_INT); + DUMP_XGMAC(qdev, MAC_MGMT_IN_MASK); + DUMP_XGMAC(qdev, EXT_ARB_MODE); + ql_sem_unlock(qdev, qdev->xg_sem_mask); +} + +static void ql_dump_ets_regs(struct ql_adapter *qdev) +{ +} + +static void ql_dump_cam_entries(struct ql_adapter *qdev) +{ + int i; + u32 value[3]; + + i = ql_sem_spinlock(qdev, SEM_MAC_ADDR_MASK); + if (i) + return; + for (i = 0; i < 4; i++) { + if (ql_get_mac_addr_reg(qdev, MAC_ADDR_TYPE_CAM_MAC, i, value)) { + pr_err("%s: Failed read of mac index register\n", + __func__); + return; + } else { + if (value[0]) + pr_err("%s: CAM index %d CAM Lookup Lower = 0x%.08x:%.08x, Output = 0x%.08x\n", + qdev->ndev->name, i, value[1], value[0], + value[2]); + } + } + for (i = 0; i < 32; i++) { + if (ql_get_mac_addr_reg + (qdev, MAC_ADDR_TYPE_MULTI_MAC, i, value)) { + pr_err("%s: Failed read of mac index register\n", + __func__); + return; + } else { + if (value[0]) + pr_err("%s: MCAST index %d CAM Lookup Lower = 0x%.08x:%.08x\n", + qdev->ndev->name, i, value[1], value[0]); + } + } + ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK); +} + +void ql_dump_routing_entries(struct ql_adapter *qdev) +{ + int i; + u32 value; + i = ql_sem_spinlock(qdev, SEM_RT_IDX_MASK); + if (i) + return; + for (i = 0; i < 16; i++) { + value = 0; + if (ql_get_routing_reg(qdev, i, &value)) { + pr_err("%s: Failed read of routing index register\n", + __func__); + return; + } else { + if (value) + pr_err("%s: Routing Mask %d = 0x%.08x\n", + qdev->ndev->name, i, value); + } + } + ql_sem_unlock(qdev, SEM_RT_IDX_MASK); +} + +#define DUMP_REG(qdev, reg) \ + pr_err("%-32s= 0x%x\n", #reg, ql_read32(qdev, reg)) + +void ql_dump_regs(struct ql_adapter *qdev) +{ + pr_err("reg dump for function #%d\n", qdev->func); + DUMP_REG(qdev, SYS); + DUMP_REG(qdev, RST_FO); + DUMP_REG(qdev, FSC); + DUMP_REG(qdev, CSR); + DUMP_REG(qdev, ICB_RID); + DUMP_REG(qdev, ICB_L); + DUMP_REG(qdev, ICB_H); + DUMP_REG(qdev, CFG); + DUMP_REG(qdev, BIOS_ADDR); + DUMP_REG(qdev, STS); + DUMP_REG(qdev, INTR_EN); + DUMP_REG(qdev, INTR_MASK); + DUMP_REG(qdev, ISR1); + DUMP_REG(qdev, ISR2); + DUMP_REG(qdev, ISR3); + DUMP_REG(qdev, ISR4); + DUMP_REG(qdev, REV_ID); + DUMP_REG(qdev, FRC_ECC_ERR); + DUMP_REG(qdev, ERR_STS); + DUMP_REG(qdev, RAM_DBG_ADDR); + DUMP_REG(qdev, RAM_DBG_DATA); + DUMP_REG(qdev, ECC_ERR_CNT); + DUMP_REG(qdev, SEM); + DUMP_REG(qdev, GPIO_1); + DUMP_REG(qdev, GPIO_2); + DUMP_REG(qdev, GPIO_3); + DUMP_REG(qdev, XGMAC_ADDR); + DUMP_REG(qdev, XGMAC_DATA); + DUMP_REG(qdev, NIC_ETS); + DUMP_REG(qdev, CNA_ETS); + DUMP_REG(qdev, FLASH_ADDR); + DUMP_REG(qdev, FLASH_DATA); + DUMP_REG(qdev, CQ_STOP); + DUMP_REG(qdev, PAGE_TBL_RID); + DUMP_REG(qdev, WQ_PAGE_TBL_LO); + 
DUMP_REG(qdev, WQ_PAGE_TBL_HI); + DUMP_REG(qdev, CQ_PAGE_TBL_LO); + DUMP_REG(qdev, CQ_PAGE_TBL_HI); + DUMP_REG(qdev, COS_DFLT_CQ1); + DUMP_REG(qdev, COS_DFLT_CQ2); + DUMP_REG(qdev, SPLT_HDR); + DUMP_REG(qdev, FC_PAUSE_THRES); + DUMP_REG(qdev, NIC_PAUSE_THRES); + DUMP_REG(qdev, FC_ETHERTYPE); + DUMP_REG(qdev, FC_RCV_CFG); + DUMP_REG(qdev, NIC_RCV_CFG); + DUMP_REG(qdev, FC_COS_TAGS); + DUMP_REG(qdev, NIC_COS_TAGS); + DUMP_REG(qdev, MGMT_RCV_CFG); + DUMP_REG(qdev, XG_SERDES_ADDR); + DUMP_REG(qdev, XG_SERDES_DATA); + DUMP_REG(qdev, PRB_MX_ADDR); + DUMP_REG(qdev, PRB_MX_DATA); + ql_dump_intr_states(qdev); + ql_dump_xgmac_control_regs(qdev); + ql_dump_ets_regs(qdev); + ql_dump_cam_entries(qdev); + ql_dump_routing_entries(qdev); +} +#endif + +#ifdef QL_STAT_DUMP + +#define DUMP_STAT(qdev, stat) \ + pr_err("%s = %ld\n", #stat, (unsigned long)qdev->nic_stats.stat) + +void ql_dump_stat(struct ql_adapter *qdev) +{ + pr_err("%s: Enter\n", __func__); + DUMP_STAT(qdev, tx_pkts); + DUMP_STAT(qdev, tx_bytes); + DUMP_STAT(qdev, tx_mcast_pkts); + DUMP_STAT(qdev, tx_bcast_pkts); + DUMP_STAT(qdev, tx_ucast_pkts); + DUMP_STAT(qdev, tx_ctl_pkts); + DUMP_STAT(qdev, tx_pause_pkts); + DUMP_STAT(qdev, tx_64_pkt); + DUMP_STAT(qdev, tx_65_to_127_pkt); + DUMP_STAT(qdev, tx_128_to_255_pkt); + DUMP_STAT(qdev, tx_256_511_pkt); + DUMP_STAT(qdev, tx_512_to_1023_pkt); + DUMP_STAT(qdev, tx_1024_to_1518_pkt); + DUMP_STAT(qdev, tx_1519_to_max_pkt); + DUMP_STAT(qdev, tx_undersize_pkt); + DUMP_STAT(qdev, tx_oversize_pkt); + DUMP_STAT(qdev, rx_bytes); + DUMP_STAT(qdev, rx_bytes_ok); + DUMP_STAT(qdev, rx_pkts); + DUMP_STAT(qdev, rx_pkts_ok); + DUMP_STAT(qdev, rx_bcast_pkts); + DUMP_STAT(qdev, rx_mcast_pkts); + DUMP_STAT(qdev, rx_ucast_pkts); + DUMP_STAT(qdev, rx_undersize_pkts); + DUMP_STAT(qdev, rx_oversize_pkts); + DUMP_STAT(qdev, rx_jabber_pkts); + DUMP_STAT(qdev, rx_undersize_fcerr_pkts); + DUMP_STAT(qdev, rx_drop_events); + DUMP_STAT(qdev, rx_fcerr_pkts); + DUMP_STAT(qdev, rx_align_err); + DUMP_STAT(qdev, rx_symbol_err); + DUMP_STAT(qdev, rx_mac_err); + DUMP_STAT(qdev, rx_ctl_pkts); + DUMP_STAT(qdev, rx_pause_pkts); + DUMP_STAT(qdev, rx_64_pkts); + DUMP_STAT(qdev, rx_65_to_127_pkts); + DUMP_STAT(qdev, rx_128_255_pkts); + DUMP_STAT(qdev, rx_256_511_pkts); + DUMP_STAT(qdev, rx_512_to_1023_pkts); + DUMP_STAT(qdev, rx_1024_to_1518_pkts); + DUMP_STAT(qdev, rx_1519_to_max_pkts); + DUMP_STAT(qdev, rx_len_err_pkts); +}; +#endif + +#ifdef QL_DEV_DUMP + +#define DUMP_QDEV_FIELD(qdev, type, field) \ + pr_err("qdev->%-24s = " type "\n", #field, qdev->field) +#define DUMP_QDEV_DMA_FIELD(qdev, field) \ + pr_err("qdev->%-24s = %llx\n", #field, (unsigned long long)qdev->field) +#define DUMP_QDEV_ARRAY(qdev, type, array, index, field) \ + pr_err("%s[%d].%s = " type "\n", \ + #array, index, #field, qdev->array[index].field); +void ql_dump_qdev(struct ql_adapter *qdev) +{ + int i; + DUMP_QDEV_FIELD(qdev, "%lx", flags); + DUMP_QDEV_FIELD(qdev, "%p", vlgrp); + DUMP_QDEV_FIELD(qdev, "%p", pdev); + DUMP_QDEV_FIELD(qdev, "%p", ndev); + DUMP_QDEV_FIELD(qdev, "%d", chip_rev_id); + DUMP_QDEV_FIELD(qdev, "%p", reg_base); + DUMP_QDEV_FIELD(qdev, "%p", doorbell_area); + DUMP_QDEV_FIELD(qdev, "%d", doorbell_area_size); + DUMP_QDEV_FIELD(qdev, "%x", msg_enable); + DUMP_QDEV_FIELD(qdev, "%p", rx_ring_shadow_reg_area); + DUMP_QDEV_DMA_FIELD(qdev, rx_ring_shadow_reg_dma); + DUMP_QDEV_FIELD(qdev, "%p", tx_ring_shadow_reg_area); + DUMP_QDEV_DMA_FIELD(qdev, tx_ring_shadow_reg_dma); + DUMP_QDEV_FIELD(qdev, "%d", intr_count); + if (qdev->msi_x_entry) + for (i = 0; 
i < qdev->intr_count; i++) { + DUMP_QDEV_ARRAY(qdev, "%d", msi_x_entry, i, vector); + DUMP_QDEV_ARRAY(qdev, "%d", msi_x_entry, i, entry); + } + for (i = 0; i < qdev->intr_count; i++) { + DUMP_QDEV_ARRAY(qdev, "%p", intr_context, i, qdev); + DUMP_QDEV_ARRAY(qdev, "%d", intr_context, i, intr); + DUMP_QDEV_ARRAY(qdev, "%d", intr_context, i, hooked); + DUMP_QDEV_ARRAY(qdev, "0x%08x", intr_context, i, intr_en_mask); + DUMP_QDEV_ARRAY(qdev, "0x%08x", intr_context, i, intr_dis_mask); + DUMP_QDEV_ARRAY(qdev, "0x%08x", intr_context, i, intr_read_mask); + } + DUMP_QDEV_FIELD(qdev, "%d", tx_ring_count); + DUMP_QDEV_FIELD(qdev, "%d", rx_ring_count); + DUMP_QDEV_FIELD(qdev, "%d", ring_mem_size); + DUMP_QDEV_FIELD(qdev, "%p", ring_mem); + DUMP_QDEV_FIELD(qdev, "%d", intr_count); + DUMP_QDEV_FIELD(qdev, "%p", tx_ring); + DUMP_QDEV_FIELD(qdev, "%d", rss_ring_count); + DUMP_QDEV_FIELD(qdev, "%p", rx_ring); + DUMP_QDEV_FIELD(qdev, "%d", default_rx_queue); + DUMP_QDEV_FIELD(qdev, "0x%08x", xg_sem_mask); + DUMP_QDEV_FIELD(qdev, "0x%08x", port_link_up); + DUMP_QDEV_FIELD(qdev, "0x%08x", port_init); +} +#endif + +#ifdef QL_CB_DUMP +void ql_dump_wqicb(struct wqicb *wqicb) +{ + pr_err("Dumping wqicb stuff...\n"); + pr_err("wqicb->len = 0x%x\n", le16_to_cpu(wqicb->len)); + pr_err("wqicb->flags = %x\n", le16_to_cpu(wqicb->flags)); + pr_err("wqicb->cq_id_rss = %d\n", + le16_to_cpu(wqicb->cq_id_rss)); + pr_err("wqicb->rid = 0x%x\n", le16_to_cpu(wqicb->rid)); + pr_err("wqicb->wq_addr = 0x%llx\n", + (unsigned long long) le64_to_cpu(wqicb->addr)); + pr_err("wqicb->wq_cnsmr_idx_addr = 0x%llx\n", + (unsigned long long) le64_to_cpu(wqicb->cnsmr_idx_addr)); +} + +void ql_dump_tx_ring(struct tx_ring *tx_ring) +{ + if (tx_ring == NULL) + return; + pr_err("===================== Dumping tx_ring %d ===============\n", + tx_ring->wq_id); + pr_err("tx_ring->base = %p\n", tx_ring->wq_base); + pr_err("tx_ring->base_dma = 0x%llx\n", + (unsigned long long) tx_ring->wq_base_dma); + pr_err("tx_ring->cnsmr_idx_sh_reg, addr = 0x%p, value = %d\n", + tx_ring->cnsmr_idx_sh_reg, + tx_ring->cnsmr_idx_sh_reg + ? ql_read_sh_reg(tx_ring->cnsmr_idx_sh_reg) : 0); + pr_err("tx_ring->size = %d\n", tx_ring->wq_size); + pr_err("tx_ring->len = %d\n", tx_ring->wq_len); + pr_err("tx_ring->prod_idx_db_reg = %p\n", tx_ring->prod_idx_db_reg); + pr_err("tx_ring->valid_db_reg = %p\n", tx_ring->valid_db_reg); + pr_err("tx_ring->prod_idx = %d\n", tx_ring->prod_idx); + pr_err("tx_ring->cq_id = %d\n", tx_ring->cq_id); + pr_err("tx_ring->wq_id = %d\n", tx_ring->wq_id); + pr_err("tx_ring->q = %p\n", tx_ring->q); + pr_err("tx_ring->tx_count = %d\n", atomic_read(&tx_ring->tx_count)); +} + +void ql_dump_ricb(struct ricb *ricb) +{ + int i; + pr_err("===================== Dumping ricb ===============\n"); + pr_err("Dumping ricb stuff...\n"); + + pr_err("ricb->base_cq = %d\n", ricb->base_cq & 0x1f); + pr_err("ricb->flags = %s%s%s%s%s%s%s%s%s\n", + ricb->base_cq & RSS_L4K ? "RSS_L4K " : "", + ricb->flags & RSS_L6K ? "RSS_L6K " : "", + ricb->flags & RSS_LI ? "RSS_LI " : "", + ricb->flags & RSS_LB ? "RSS_LB " : "", + ricb->flags & RSS_LM ? "RSS_LM " : "", + ricb->flags & RSS_RI4 ? "RSS_RI4 " : "", + ricb->flags & RSS_RT4 ? "RSS_RT4 " : "", + ricb->flags & RSS_RI6 ? "RSS_RI6 " : "", + ricb->flags & RSS_RT6 ? 
"RSS_RT6 " : ""); + pr_err("ricb->mask = 0x%.04x\n", le16_to_cpu(ricb->mask)); + for (i = 0; i < 16; i++) + pr_err("ricb->hash_cq_id[%d] = 0x%.08x\n", i, + le32_to_cpu(ricb->hash_cq_id[i])); + for (i = 0; i < 10; i++) + pr_err("ricb->ipv6_hash_key[%d] = 0x%.08x\n", i, + le32_to_cpu(ricb->ipv6_hash_key[i])); + for (i = 0; i < 4; i++) + pr_err("ricb->ipv4_hash_key[%d] = 0x%.08x\n", i, + le32_to_cpu(ricb->ipv4_hash_key[i])); +} + +void ql_dump_cqicb(struct cqicb *cqicb) +{ + pr_err("Dumping cqicb stuff...\n"); + + pr_err("cqicb->msix_vect = %d\n", cqicb->msix_vect); + pr_err("cqicb->flags = %x\n", cqicb->flags); + pr_err("cqicb->len = %d\n", le16_to_cpu(cqicb->len)); + pr_err("cqicb->addr = 0x%llx\n", + (unsigned long long) le64_to_cpu(cqicb->addr)); + pr_err("cqicb->prod_idx_addr = 0x%llx\n", + (unsigned long long) le64_to_cpu(cqicb->prod_idx_addr)); + pr_err("cqicb->pkt_delay = 0x%.04x\n", + le16_to_cpu(cqicb->pkt_delay)); + pr_err("cqicb->irq_delay = 0x%.04x\n", + le16_to_cpu(cqicb->irq_delay)); + pr_err("cqicb->lbq_addr = 0x%llx\n", + (unsigned long long) le64_to_cpu(cqicb->lbq_addr)); + pr_err("cqicb->lbq_buf_size = 0x%.04x\n", + le16_to_cpu(cqicb->lbq_buf_size)); + pr_err("cqicb->lbq_len = 0x%.04x\n", + le16_to_cpu(cqicb->lbq_len)); + pr_err("cqicb->sbq_addr = 0x%llx\n", + (unsigned long long) le64_to_cpu(cqicb->sbq_addr)); + pr_err("cqicb->sbq_buf_size = 0x%.04x\n", + le16_to_cpu(cqicb->sbq_buf_size)); + pr_err("cqicb->sbq_len = 0x%.04x\n", + le16_to_cpu(cqicb->sbq_len)); +} + +void ql_dump_rx_ring(struct rx_ring *rx_ring) +{ + if (rx_ring == NULL) + return; + pr_err("===================== Dumping rx_ring %d ===============\n", + rx_ring->cq_id); + pr_err("Dumping rx_ring %d, type = %s%s%s\n", + rx_ring->cq_id, rx_ring->type == DEFAULT_Q ? "DEFAULT" : "", + rx_ring->type == TX_Q ? "OUTBOUND COMPLETIONS" : "", + rx_ring->type == RX_Q ? "INBOUND_COMPLETIONS" : ""); + pr_err("rx_ring->cqicb = %p\n", &rx_ring->cqicb); + pr_err("rx_ring->cq_base = %p\n", rx_ring->cq_base); + pr_err("rx_ring->cq_base_dma = %llx\n", + (unsigned long long) rx_ring->cq_base_dma); + pr_err("rx_ring->cq_size = %d\n", rx_ring->cq_size); + pr_err("rx_ring->cq_len = %d\n", rx_ring->cq_len); + pr_err("rx_ring->prod_idx_sh_reg, addr = 0x%p, value = %d\n", + rx_ring->prod_idx_sh_reg, + rx_ring->prod_idx_sh_reg + ? 
ql_read_sh_reg(rx_ring->prod_idx_sh_reg) : 0); + pr_err("rx_ring->prod_idx_sh_reg_dma = %llx\n", + (unsigned long long) rx_ring->prod_idx_sh_reg_dma); + pr_err("rx_ring->cnsmr_idx_db_reg = %p\n", + rx_ring->cnsmr_idx_db_reg); + pr_err("rx_ring->cnsmr_idx = %d\n", rx_ring->cnsmr_idx); + pr_err("rx_ring->curr_entry = %p\n", rx_ring->curr_entry); + pr_err("rx_ring->valid_db_reg = %p\n", rx_ring->valid_db_reg); + + pr_err("rx_ring->lbq_base = %p\n", rx_ring->lbq_base); + pr_err("rx_ring->lbq_base_dma = %llx\n", + (unsigned long long) rx_ring->lbq_base_dma); + pr_err("rx_ring->lbq_base_indirect = %p\n", + rx_ring->lbq_base_indirect); + pr_err("rx_ring->lbq_base_indirect_dma = %llx\n", + (unsigned long long) rx_ring->lbq_base_indirect_dma); + pr_err("rx_ring->lbq = %p\n", rx_ring->lbq); + pr_err("rx_ring->lbq_len = %d\n", rx_ring->lbq_len); + pr_err("rx_ring->lbq_size = %d\n", rx_ring->lbq_size); + pr_err("rx_ring->lbq_prod_idx_db_reg = %p\n", + rx_ring->lbq_prod_idx_db_reg); + pr_err("rx_ring->lbq_prod_idx = %d\n", rx_ring->lbq_prod_idx); + pr_err("rx_ring->lbq_curr_idx = %d\n", rx_ring->lbq_curr_idx); + pr_err("rx_ring->lbq_clean_idx = %d\n", rx_ring->lbq_clean_idx); + pr_err("rx_ring->lbq_free_cnt = %d\n", rx_ring->lbq_free_cnt); + pr_err("rx_ring->lbq_buf_size = %d\n", rx_ring->lbq_buf_size); + + pr_err("rx_ring->sbq_base = %p\n", rx_ring->sbq_base); + pr_err("rx_ring->sbq_base_dma = %llx\n", + (unsigned long long) rx_ring->sbq_base_dma); + pr_err("rx_ring->sbq_base_indirect = %p\n", + rx_ring->sbq_base_indirect); + pr_err("rx_ring->sbq_base_indirect_dma = %llx\n", + (unsigned long long) rx_ring->sbq_base_indirect_dma); + pr_err("rx_ring->sbq = %p\n", rx_ring->sbq); + pr_err("rx_ring->sbq_len = %d\n", rx_ring->sbq_len); + pr_err("rx_ring->sbq_size = %d\n", rx_ring->sbq_size); + pr_err("rx_ring->sbq_prod_idx_db_reg addr = %p\n", + rx_ring->sbq_prod_idx_db_reg); + pr_err("rx_ring->sbq_prod_idx = %d\n", rx_ring->sbq_prod_idx); + pr_err("rx_ring->sbq_curr_idx = %d\n", rx_ring->sbq_curr_idx); + pr_err("rx_ring->sbq_clean_idx = %d\n", rx_ring->sbq_clean_idx); + pr_err("rx_ring->sbq_free_cnt = %d\n", rx_ring->sbq_free_cnt); + pr_err("rx_ring->sbq_buf_size = %d\n", rx_ring->sbq_buf_size); + pr_err("rx_ring->cq_id = %d\n", rx_ring->cq_id); + pr_err("rx_ring->irq = %d\n", rx_ring->irq); + pr_err("rx_ring->cpu = %d\n", rx_ring->cpu); + pr_err("rx_ring->qdev = %p\n", rx_ring->qdev); +} + +void ql_dump_hw_cb(struct ql_adapter *qdev, int size, u32 bit, u16 q_id) +{ + void *ptr; + + pr_err("%s: Enter\n", __func__); + + ptr = kmalloc(size, GFP_ATOMIC); + if (ptr == NULL) + return; + + if (ql_write_cfg(qdev, ptr, size, bit, q_id)) { + pr_err("%s: Failed to upload control block!\n", __func__); + goto fail_it; + } + switch (bit) { + case CFG_DRQ: + ql_dump_wqicb((struct wqicb *)ptr); + break; + case CFG_DCQ: + ql_dump_cqicb((struct cqicb *)ptr); + break; + case CFG_DR: + ql_dump_ricb((struct ricb *)ptr); + break; + default: + pr_err("%s: Invalid bit value = %x\n", __func__, bit); + break; + } +fail_it: + kfree(ptr); +} +#endif + +#ifdef QL_OB_DUMP +void ql_dump_tx_desc(struct tx_buf_desc *tbd) +{ + pr_err("tbd->addr = 0x%llx\n", + le64_to_cpu((u64) tbd->addr)); + pr_err("tbd->len = %d\n", + le32_to_cpu(tbd->len & TX_DESC_LEN_MASK)); + pr_err("tbd->flags = %s %s\n", + tbd->len & TX_DESC_C ? "C" : ".", + tbd->len & TX_DESC_E ? 
"E" : "."); + tbd++; + pr_err("tbd->addr = 0x%llx\n", + le64_to_cpu((u64) tbd->addr)); + pr_err("tbd->len = %d\n", + le32_to_cpu(tbd->len & TX_DESC_LEN_MASK)); + pr_err("tbd->flags = %s %s\n", + tbd->len & TX_DESC_C ? "C" : ".", + tbd->len & TX_DESC_E ? "E" : "."); + tbd++; + pr_err("tbd->addr = 0x%llx\n", + le64_to_cpu((u64) tbd->addr)); + pr_err("tbd->len = %d\n", + le32_to_cpu(tbd->len & TX_DESC_LEN_MASK)); + pr_err("tbd->flags = %s %s\n", + tbd->len & TX_DESC_C ? "C" : ".", + tbd->len & TX_DESC_E ? "E" : "."); + +} + +void ql_dump_ob_mac_iocb(struct ob_mac_iocb_req *ob_mac_iocb) +{ + struct ob_mac_tso_iocb_req *ob_mac_tso_iocb = + (struct ob_mac_tso_iocb_req *)ob_mac_iocb; + struct tx_buf_desc *tbd; + u16 frame_len; + + pr_err("%s\n", __func__); + pr_err("opcode = %s\n", + (ob_mac_iocb->opcode == OPCODE_OB_MAC_IOCB) ? "MAC" : "TSO"); + pr_err("flags1 = %s %s %s %s %s\n", + ob_mac_tso_iocb->flags1 & OB_MAC_TSO_IOCB_OI ? "OI" : "", + ob_mac_tso_iocb->flags1 & OB_MAC_TSO_IOCB_I ? "I" : "", + ob_mac_tso_iocb->flags1 & OB_MAC_TSO_IOCB_D ? "D" : "", + ob_mac_tso_iocb->flags1 & OB_MAC_TSO_IOCB_IP4 ? "IP4" : "", + ob_mac_tso_iocb->flags1 & OB_MAC_TSO_IOCB_IP6 ? "IP6" : ""); + pr_err("flags2 = %s %s %s\n", + ob_mac_tso_iocb->flags2 & OB_MAC_TSO_IOCB_LSO ? "LSO" : "", + ob_mac_tso_iocb->flags2 & OB_MAC_TSO_IOCB_UC ? "UC" : "", + ob_mac_tso_iocb->flags2 & OB_MAC_TSO_IOCB_TC ? "TC" : ""); + pr_err("flags3 = %s %s %s\n", + ob_mac_tso_iocb->flags3 & OB_MAC_TSO_IOCB_IC ? "IC" : "", + ob_mac_tso_iocb->flags3 & OB_MAC_TSO_IOCB_DFP ? "DFP" : "", + ob_mac_tso_iocb->flags3 & OB_MAC_TSO_IOCB_V ? "V" : ""); + pr_err("tid = %x\n", ob_mac_iocb->tid); + pr_err("txq_idx = %d\n", ob_mac_iocb->txq_idx); + pr_err("vlan_tci = %x\n", ob_mac_tso_iocb->vlan_tci); + if (ob_mac_iocb->opcode == OPCODE_OB_MAC_TSO_IOCB) { + pr_err("frame_len = %d\n", + le32_to_cpu(ob_mac_tso_iocb->frame_len)); + pr_err("mss = %d\n", + le16_to_cpu(ob_mac_tso_iocb->mss)); + pr_err("prot_hdr_len = %d\n", + le16_to_cpu(ob_mac_tso_iocb->total_hdrs_len)); + pr_err("hdr_offset = 0x%.04x\n", + le16_to_cpu(ob_mac_tso_iocb->net_trans_offset)); + frame_len = le32_to_cpu(ob_mac_tso_iocb->frame_len); + } else { + pr_err("frame_len = %d\n", + le16_to_cpu(ob_mac_iocb->frame_len)); + frame_len = le16_to_cpu(ob_mac_iocb->frame_len); + } + tbd = &ob_mac_iocb->tbd[0]; + ql_dump_tx_desc(tbd); +} + +void ql_dump_ob_mac_rsp(struct ob_mac_iocb_rsp *ob_mac_rsp) +{ + pr_err("%s\n", __func__); + pr_err("opcode = %d\n", ob_mac_rsp->opcode); + pr_err("flags = %s %s %s %s %s %s %s\n", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_OI ? "OI" : ".", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_I ? "I" : ".", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_E ? "E" : ".", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_S ? "S" : ".", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_L ? "L" : ".", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_P ? "P" : ".", + ob_mac_rsp->flags2 & OB_MAC_IOCB_RSP_B ? "B" : "."); + pr_err("tid = %x\n", ob_mac_rsp->tid); +} +#endif + +#ifdef QL_IB_DUMP +void ql_dump_ib_mac_rsp(struct ib_mac_iocb_rsp *ib_mac_rsp) +{ + pr_err("%s\n", __func__); + pr_err("opcode = 0x%x\n", ib_mac_rsp->opcode); + pr_err("flags1 = %s%s%s%s%s%s\n", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_OI ? "OI " : "", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_I ? "I " : "", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_TE ? "TE " : "", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_NU ? "NU " : "", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_IE ? "IE " : "", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_B ? 
"B " : ""); + + if (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) + pr_err("%s%s%s Multicast\n", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_HASH ? "Hash" : "", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_REG ? "Registered" : "", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_PROM ? "Promiscuous" : ""); + + pr_err("flags2 = %s%s%s%s%s\n", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_P) ? "P " : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_V) ? "V " : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_U) ? "U " : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_T) ? "T " : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_FO) ? "FO " : ""); + + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) + pr_err("%s%s%s%s%s error\n", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) == + IB_MAC_IOCB_RSP_ERR_OVERSIZE ? "oversize" : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) == + IB_MAC_IOCB_RSP_ERR_UNDERSIZE ? "undersize" : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) == + IB_MAC_IOCB_RSP_ERR_PREAMBLE ? "preamble" : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) == + IB_MAC_IOCB_RSP_ERR_FRAME_LEN ? "frame length" : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) == + IB_MAC_IOCB_RSP_ERR_CRC ? "CRC" : ""); + + pr_err("flags3 = %s%s\n", + ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DS ? "DS " : "", + ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DL ? "DL " : ""); + + if (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) + pr_err("RSS flags = %s%s%s%s\n", + ((ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) == + IB_MAC_IOCB_RSP_M_IPV4) ? "IPv4 RSS" : "", + ((ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) == + IB_MAC_IOCB_RSP_M_IPV6) ? "IPv6 RSS " : "", + ((ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) == + IB_MAC_IOCB_RSP_M_TCP_V4) ? "TCP/IPv4 RSS" : "", + ((ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) == + IB_MAC_IOCB_RSP_M_TCP_V6) ? "TCP/IPv6 RSS" : ""); + + pr_err("data_len = %d\n", + le32_to_cpu(ib_mac_rsp->data_len)); + pr_err("data_addr = 0x%llx\n", + (unsigned long long) le64_to_cpu(ib_mac_rsp->data_addr)); + if (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) + pr_err("rss = %x\n", + le32_to_cpu(ib_mac_rsp->rss)); + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_V) + pr_err("vlan_id = %x\n", + le16_to_cpu(ib_mac_rsp->vlan_id)); + + pr_err("flags4 = %s%s%s\n", + ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HV ? "HV " : "", + ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HS ? "HS " : "", + ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HL ? 
"HL " : ""); + + if (ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HV) { + pr_err("hdr length = %d\n", + le32_to_cpu(ib_mac_rsp->hdr_len)); + pr_err("hdr addr = 0x%llx\n", + (unsigned long long) le64_to_cpu(ib_mac_rsp->hdr_addr)); + } +} +#endif + +#ifdef QL_ALL_DUMP +void ql_dump_all(struct ql_adapter *qdev) +{ + int i; + + QL_DUMP_REGS(qdev); + QL_DUMP_QDEV(qdev); + for (i = 0; i < qdev->tx_ring_count; i++) { + QL_DUMP_TX_RING(&qdev->tx_ring[i]); + QL_DUMP_WQICB((struct wqicb *)&qdev->tx_ring[i]); + } + for (i = 0; i < qdev->rx_ring_count; i++) { + QL_DUMP_RX_RING(&qdev->rx_ring[i]); + QL_DUMP_CQICB((struct cqicb *)&qdev->rx_ring[i]); + } +} +#endif diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_ethtool.c b/drivers/net/ethernet/qlogic/qlge/qlge_ethtool.c new file mode 100644 index 0000000..5edbd53 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlge/qlge_ethtool.c @@ -0,0 +1,735 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "qlge.h" + +struct ql_stats { + char stat_string[ETH_GSTRING_LEN]; + int sizeof_stat; + int stat_offset; +}; + +#define QL_SIZEOF(m) FIELD_SIZEOF(struct ql_adapter, m) +#define QL_OFF(m) offsetof(struct ql_adapter, m) + +static const struct ql_stats ql_gstrings_stats[] = { + {"tx_pkts", QL_SIZEOF(nic_stats.tx_pkts), QL_OFF(nic_stats.tx_pkts)}, + {"tx_bytes", QL_SIZEOF(nic_stats.tx_bytes), QL_OFF(nic_stats.tx_bytes)}, + {"tx_mcast_pkts", QL_SIZEOF(nic_stats.tx_mcast_pkts), + QL_OFF(nic_stats.tx_mcast_pkts)}, + {"tx_bcast_pkts", QL_SIZEOF(nic_stats.tx_bcast_pkts), + QL_OFF(nic_stats.tx_bcast_pkts)}, + {"tx_ucast_pkts", QL_SIZEOF(nic_stats.tx_ucast_pkts), + QL_OFF(nic_stats.tx_ucast_pkts)}, + {"tx_ctl_pkts", QL_SIZEOF(nic_stats.tx_ctl_pkts), + QL_OFF(nic_stats.tx_ctl_pkts)}, + {"tx_pause_pkts", QL_SIZEOF(nic_stats.tx_pause_pkts), + QL_OFF(nic_stats.tx_pause_pkts)}, + {"tx_64_pkts", QL_SIZEOF(nic_stats.tx_64_pkt), + QL_OFF(nic_stats.tx_64_pkt)}, + {"tx_65_to_127_pkts", QL_SIZEOF(nic_stats.tx_65_to_127_pkt), + QL_OFF(nic_stats.tx_65_to_127_pkt)}, + {"tx_128_to_255_pkts", QL_SIZEOF(nic_stats.tx_128_to_255_pkt), + QL_OFF(nic_stats.tx_128_to_255_pkt)}, + {"tx_256_511_pkts", QL_SIZEOF(nic_stats.tx_256_511_pkt), + QL_OFF(nic_stats.tx_256_511_pkt)}, + {"tx_512_to_1023_pkts", QL_SIZEOF(nic_stats.tx_512_to_1023_pkt), + QL_OFF(nic_stats.tx_512_to_1023_pkt)}, + {"tx_1024_to_1518_pkts", QL_SIZEOF(nic_stats.tx_1024_to_1518_pkt), + QL_OFF(nic_stats.tx_1024_to_1518_pkt)}, + {"tx_1519_to_max_pkts", QL_SIZEOF(nic_stats.tx_1519_to_max_pkt), + QL_OFF(nic_stats.tx_1519_to_max_pkt)}, + {"tx_undersize_pkts", QL_SIZEOF(nic_stats.tx_undersize_pkt), + QL_OFF(nic_stats.tx_undersize_pkt)}, + {"tx_oversize_pkts", QL_SIZEOF(nic_stats.tx_oversize_pkt), + QL_OFF(nic_stats.tx_oversize_pkt)}, + {"rx_bytes", QL_SIZEOF(nic_stats.rx_bytes), QL_OFF(nic_stats.rx_bytes)}, + {"rx_bytes_ok", QL_SIZEOF(nic_stats.rx_bytes_ok), + QL_OFF(nic_stats.rx_bytes_ok)}, + {"rx_pkts", QL_SIZEOF(nic_stats.rx_pkts), QL_OFF(nic_stats.rx_pkts)}, + {"rx_pkts_ok", QL_SIZEOF(nic_stats.rx_pkts_ok), + QL_OFF(nic_stats.rx_pkts_ok)}, + {"rx_bcast_pkts", QL_SIZEOF(nic_stats.rx_bcast_pkts), + QL_OFF(nic_stats.rx_bcast_pkts)}, + {"rx_mcast_pkts", QL_SIZEOF(nic_stats.rx_mcast_pkts), + QL_OFF(nic_stats.rx_mcast_pkts)}, + 
{"rx_ucast_pkts", QL_SIZEOF(nic_stats.rx_ucast_pkts), + QL_OFF(nic_stats.rx_ucast_pkts)}, + {"rx_undersize_pkts", QL_SIZEOF(nic_stats.rx_undersize_pkts), + QL_OFF(nic_stats.rx_undersize_pkts)}, + {"rx_oversize_pkts", QL_SIZEOF(nic_stats.rx_oversize_pkts), + QL_OFF(nic_stats.rx_oversize_pkts)}, + {"rx_jabber_pkts", QL_SIZEOF(nic_stats.rx_jabber_pkts), + QL_OFF(nic_stats.rx_jabber_pkts)}, + {"rx_undersize_fcerr_pkts", + QL_SIZEOF(nic_stats.rx_undersize_fcerr_pkts), + QL_OFF(nic_stats.rx_undersize_fcerr_pkts)}, + {"rx_drop_events", QL_SIZEOF(nic_stats.rx_drop_events), + QL_OFF(nic_stats.rx_drop_events)}, + {"rx_fcerr_pkts", QL_SIZEOF(nic_stats.rx_fcerr_pkts), + QL_OFF(nic_stats.rx_fcerr_pkts)}, + {"rx_align_err", QL_SIZEOF(nic_stats.rx_align_err), + QL_OFF(nic_stats.rx_align_err)}, + {"rx_symbol_err", QL_SIZEOF(nic_stats.rx_symbol_err), + QL_OFF(nic_stats.rx_symbol_err)}, + {"rx_mac_err", QL_SIZEOF(nic_stats.rx_mac_err), + QL_OFF(nic_stats.rx_mac_err)}, + {"rx_ctl_pkts", QL_SIZEOF(nic_stats.rx_ctl_pkts), + QL_OFF(nic_stats.rx_ctl_pkts)}, + {"rx_pause_pkts", QL_SIZEOF(nic_stats.rx_pause_pkts), + QL_OFF(nic_stats.rx_pause_pkts)}, + {"rx_64_pkts", QL_SIZEOF(nic_stats.rx_64_pkts), + QL_OFF(nic_stats.rx_64_pkts)}, + {"rx_65_to_127_pkts", QL_SIZEOF(nic_stats.rx_65_to_127_pkts), + QL_OFF(nic_stats.rx_65_to_127_pkts)}, + {"rx_128_255_pkts", QL_SIZEOF(nic_stats.rx_128_255_pkts), + QL_OFF(nic_stats.rx_128_255_pkts)}, + {"rx_256_511_pkts", QL_SIZEOF(nic_stats.rx_256_511_pkts), + QL_OFF(nic_stats.rx_256_511_pkts)}, + {"rx_512_to_1023_pkts", QL_SIZEOF(nic_stats.rx_512_to_1023_pkts), + QL_OFF(nic_stats.rx_512_to_1023_pkts)}, + {"rx_1024_to_1518_pkts", QL_SIZEOF(nic_stats.rx_1024_to_1518_pkts), + QL_OFF(nic_stats.rx_1024_to_1518_pkts)}, + {"rx_1519_to_max_pkts", QL_SIZEOF(nic_stats.rx_1519_to_max_pkts), + QL_OFF(nic_stats.rx_1519_to_max_pkts)}, + {"rx_len_err_pkts", QL_SIZEOF(nic_stats.rx_len_err_pkts), + QL_OFF(nic_stats.rx_len_err_pkts)}, + {"rx_code_err", QL_SIZEOF(nic_stats.rx_code_err), + QL_OFF(nic_stats.rx_code_err)}, + {"rx_oversize_err", QL_SIZEOF(nic_stats.rx_oversize_err), + QL_OFF(nic_stats.rx_oversize_err)}, + {"rx_undersize_err", QL_SIZEOF(nic_stats.rx_undersize_err), + QL_OFF(nic_stats.rx_undersize_err)}, + {"rx_preamble_err", QL_SIZEOF(nic_stats.rx_preamble_err), + QL_OFF(nic_stats.rx_preamble_err)}, + {"rx_frame_len_err", QL_SIZEOF(nic_stats.rx_frame_len_err), + QL_OFF(nic_stats.rx_frame_len_err)}, + {"rx_crc_err", QL_SIZEOF(nic_stats.rx_crc_err), + QL_OFF(nic_stats.rx_crc_err)}, + {"rx_err_count", QL_SIZEOF(nic_stats.rx_err_count), + QL_OFF(nic_stats.rx_err_count)}, + {"tx_cbfc_pause_frames0", QL_SIZEOF(nic_stats.tx_cbfc_pause_frames0), + QL_OFF(nic_stats.tx_cbfc_pause_frames0)}, + {"tx_cbfc_pause_frames1", QL_SIZEOF(nic_stats.tx_cbfc_pause_frames1), + QL_OFF(nic_stats.tx_cbfc_pause_frames1)}, + {"tx_cbfc_pause_frames2", QL_SIZEOF(nic_stats.tx_cbfc_pause_frames2), + QL_OFF(nic_stats.tx_cbfc_pause_frames2)}, + {"tx_cbfc_pause_frames3", QL_SIZEOF(nic_stats.tx_cbfc_pause_frames3), + QL_OFF(nic_stats.tx_cbfc_pause_frames3)}, + {"tx_cbfc_pause_frames4", QL_SIZEOF(nic_stats.tx_cbfc_pause_frames4), + QL_OFF(nic_stats.tx_cbfc_pause_frames4)}, + {"tx_cbfc_pause_frames5", QL_SIZEOF(nic_stats.tx_cbfc_pause_frames5), + QL_OFF(nic_stats.tx_cbfc_pause_frames5)}, + {"tx_cbfc_pause_frames6", QL_SIZEOF(nic_stats.tx_cbfc_pause_frames6), + QL_OFF(nic_stats.tx_cbfc_pause_frames6)}, + {"tx_cbfc_pause_frames7", QL_SIZEOF(nic_stats.tx_cbfc_pause_frames7), + QL_OFF(nic_stats.tx_cbfc_pause_frames7)}, + 
{"rx_cbfc_pause_frames0", QL_SIZEOF(nic_stats.rx_cbfc_pause_frames0), + QL_OFF(nic_stats.rx_cbfc_pause_frames0)}, + {"rx_cbfc_pause_frames1", QL_SIZEOF(nic_stats.rx_cbfc_pause_frames1), + QL_OFF(nic_stats.rx_cbfc_pause_frames1)}, + {"rx_cbfc_pause_frames2", QL_SIZEOF(nic_stats.rx_cbfc_pause_frames2), + QL_OFF(nic_stats.rx_cbfc_pause_frames2)}, + {"rx_cbfc_pause_frames3", QL_SIZEOF(nic_stats.rx_cbfc_pause_frames3), + QL_OFF(nic_stats.rx_cbfc_pause_frames3)}, + {"rx_cbfc_pause_frames4", QL_SIZEOF(nic_stats.rx_cbfc_pause_frames4), + QL_OFF(nic_stats.rx_cbfc_pause_frames4)}, + {"rx_cbfc_pause_frames5", QL_SIZEOF(nic_stats.rx_cbfc_pause_frames5), + QL_OFF(nic_stats.rx_cbfc_pause_frames5)}, + {"rx_cbfc_pause_frames6", QL_SIZEOF(nic_stats.rx_cbfc_pause_frames6), + QL_OFF(nic_stats.rx_cbfc_pause_frames6)}, + {"rx_cbfc_pause_frames7", QL_SIZEOF(nic_stats.rx_cbfc_pause_frames7), + QL_OFF(nic_stats.rx_cbfc_pause_frames7)}, + {"rx_nic_fifo_drop", QL_SIZEOF(nic_stats.rx_nic_fifo_drop), + QL_OFF(nic_stats.rx_nic_fifo_drop)}, +}; + +static const char ql_gstrings_test[][ETH_GSTRING_LEN] = { + "Loopback test (offline)" +}; +#define QLGE_TEST_LEN (sizeof(ql_gstrings_test) / ETH_GSTRING_LEN) +#define QLGE_STATS_LEN ARRAY_SIZE(ql_gstrings_stats) +#define QLGE_RCV_MAC_ERR_STATS 7 + +static int ql_update_ring_coalescing(struct ql_adapter *qdev) +{ + int i, status = 0; + struct rx_ring *rx_ring; + struct cqicb *cqicb; + + if (!netif_running(qdev->ndev)) + return status; + + /* Skip the default queue, and update the outbound handler + * queues if they changed. + */ + cqicb = (struct cqicb *)&qdev->rx_ring[qdev->rss_ring_count]; + if (le16_to_cpu(cqicb->irq_delay) != qdev->tx_coalesce_usecs || + le16_to_cpu(cqicb->pkt_delay) != + qdev->tx_max_coalesced_frames) { + for (i = qdev->rss_ring_count; i < qdev->rx_ring_count; i++) { + rx_ring = &qdev->rx_ring[i]; + cqicb = (struct cqicb *)rx_ring; + cqicb->irq_delay = cpu_to_le16(qdev->tx_coalesce_usecs); + cqicb->pkt_delay = + cpu_to_le16(qdev->tx_max_coalesced_frames); + cqicb->flags = FLAGS_LI; + status = ql_write_cfg(qdev, cqicb, sizeof(*cqicb), + CFG_LCQ, rx_ring->cq_id); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to load CQICB.\n"); + goto exit; + } + } + } + + /* Update the inbound (RSS) handler queues if they changed. */ + cqicb = (struct cqicb *)&qdev->rx_ring[0]; + if (le16_to_cpu(cqicb->irq_delay) != qdev->rx_coalesce_usecs || + le16_to_cpu(cqicb->pkt_delay) != + qdev->rx_max_coalesced_frames) { + for (i = 0; i < qdev->rss_ring_count; i++, rx_ring++) { + rx_ring = &qdev->rx_ring[i]; + cqicb = (struct cqicb *)rx_ring; + cqicb->irq_delay = cpu_to_le16(qdev->rx_coalesce_usecs); + cqicb->pkt_delay = + cpu_to_le16(qdev->rx_max_coalesced_frames); + cqicb->flags = FLAGS_LI; + status = ql_write_cfg(qdev, cqicb, sizeof(*cqicb), + CFG_LCQ, rx_ring->cq_id); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to load CQICB.\n"); + goto exit; + } + } + } +exit: + return status; +} + +static void ql_update_stats(struct ql_adapter *qdev) +{ + u32 i; + u64 data; + u64 *iter = &qdev->nic_stats.tx_pkts; + + spin_lock(&qdev->stats_lock); + if (ql_sem_spinlock(qdev, qdev->xg_sem_mask)) { + netif_err(qdev, drv, qdev->ndev, + "Couldn't get xgmac sem.\n"); + goto quit; + } + /* + * Get TX statistics. + */ + for (i = 0x200; i < 0x280; i += 8) { + if (ql_read_xgmac_reg64(qdev, i, &data)) { + netif_err(qdev, drv, qdev->ndev, + "Error reading status register 0x%.04x.\n", + i); + goto end; + } else + *iter = data; + iter++; + } + + /* + * Get RX statistics. 
+ */ + for (i = 0x300; i < 0x3d0; i += 8) { + if (ql_read_xgmac_reg64(qdev, i, &data)) { + netif_err(qdev, drv, qdev->ndev, + "Error reading status register 0x%.04x.\n", + i); + goto end; + } else + *iter = data; + iter++; + } + + /* Update receive mac error statistics */ + iter += QLGE_RCV_MAC_ERR_STATS; + + /* + * Get Per-priority TX pause frame counter statistics. + */ + for (i = 0x500; i < 0x540; i += 8) { + if (ql_read_xgmac_reg64(qdev, i, &data)) { + netif_err(qdev, drv, qdev->ndev, + "Error reading status register 0x%.04x.\n", + i); + goto end; + } else + *iter = data; + iter++; + } + + /* + * Get Per-priority RX pause frame counter statistics. + */ + for (i = 0x568; i < 0x5a8; i += 8) { + if (ql_read_xgmac_reg64(qdev, i, &data)) { + netif_err(qdev, drv, qdev->ndev, + "Error reading status register 0x%.04x.\n", + i); + goto end; + } else + *iter = data; + iter++; + } + + /* + * Get RX NIC FIFO DROP statistics. + */ + if (ql_read_xgmac_reg64(qdev, 0x5b8, &data)) { + netif_err(qdev, drv, qdev->ndev, + "Error reading status register 0x%.04x.\n", i); + goto end; + } else + *iter = data; +end: + ql_sem_unlock(qdev, qdev->xg_sem_mask); +quit: + spin_unlock(&qdev->stats_lock); + + QL_DUMP_STAT(qdev); +} + +static void ql_get_strings(struct net_device *dev, u32 stringset, u8 *buf) +{ + int index; + switch (stringset) { + case ETH_SS_TEST: + memcpy(buf, *ql_gstrings_test, QLGE_TEST_LEN * ETH_GSTRING_LEN); + break; + case ETH_SS_STATS: + for (index = 0; index < QLGE_STATS_LEN; index++) { + memcpy(buf + index * ETH_GSTRING_LEN, + ql_gstrings_stats[index].stat_string, + ETH_GSTRING_LEN); + } + break; + } +} + +static int ql_get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_TEST: + return QLGE_TEST_LEN; + case ETH_SS_STATS: + return QLGE_STATS_LEN; + default: + return -EOPNOTSUPP; + } +} + +static void +ql_get_ethtool_stats(struct net_device *ndev, + struct ethtool_stats *stats, u64 *data) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + int index, length; + + length = QLGE_STATS_LEN; + ql_update_stats(qdev); + + for (index = 0; index < length; index++) { + char *p = (char *)qdev + + ql_gstrings_stats[index].stat_offset; + *data++ = (ql_gstrings_stats[index].sizeof_stat == + sizeof(u64)) ? 
*(u64 *)p : (*(u32 *)p); + } +} + +static int ql_get_link_ksettings(struct net_device *ndev, + struct ethtool_link_ksettings *ecmd) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + u32 supported, advertising; + + supported = SUPPORTED_10000baseT_Full; + advertising = ADVERTISED_10000baseT_Full; + + if ((qdev->link_status & STS_LINK_TYPE_MASK) == + STS_LINK_TYPE_10GBASET) { + supported |= (SUPPORTED_TP | SUPPORTED_Autoneg); + advertising |= (ADVERTISED_TP | ADVERTISED_Autoneg); + ecmd->base.port = PORT_TP; + ecmd->base.autoneg = AUTONEG_ENABLE; + } else { + supported |= SUPPORTED_FIBRE; + advertising |= ADVERTISED_FIBRE; + ecmd->base.port = PORT_FIBRE; + } + + ecmd->base.speed = SPEED_10000; + ecmd->base.duplex = DUPLEX_FULL; + + ethtool_convert_legacy_u32_to_link_mode(ecmd->link_modes.supported, + supported); + ethtool_convert_legacy_u32_to_link_mode(ecmd->link_modes.advertising, + advertising); + + return 0; +} + +static void ql_get_drvinfo(struct net_device *ndev, + struct ethtool_drvinfo *drvinfo) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + strlcpy(drvinfo->driver, qlge_driver_name, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, qlge_driver_version, + sizeof(drvinfo->version)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "v%d.%d.%d", + (qdev->fw_rev_id & 0x00ff0000) >> 16, + (qdev->fw_rev_id & 0x0000ff00) >> 8, + (qdev->fw_rev_id & 0x000000ff)); + strlcpy(drvinfo->bus_info, pci_name(qdev->pdev), + sizeof(drvinfo->bus_info)); +} + +static void ql_get_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + unsigned short ssys_dev = qdev->pdev->subsystem_device; + + /* WOL is only supported for mezz card. */ + if (ssys_dev == QLGE_MEZZ_SSYS_ID_068 || + ssys_dev == QLGE_MEZZ_SSYS_ID_180) { + wol->supported = WAKE_MAGIC; + wol->wolopts = qdev->wol; + } +} + +static int ql_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + unsigned short ssys_dev = qdev->pdev->subsystem_device; + + /* WOL is only supported for mezz card. 
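+	 * Reject the request for any other subsystem device ID.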
*/ + if (ssys_dev != QLGE_MEZZ_SSYS_ID_068 && + ssys_dev != QLGE_MEZZ_SSYS_ID_180) { + netif_info(qdev, drv, qdev->ndev, + "WOL is only supported for mezz card\n"); + return -EOPNOTSUPP; + } + if (wol->wolopts & ~WAKE_MAGIC) + return -EINVAL; + qdev->wol = wol->wolopts; + + netif_info(qdev, drv, qdev->ndev, "Set wol option 0x%x\n", qdev->wol); + return 0; +} + +static int ql_set_phys_id(struct net_device *ndev, + enum ethtool_phys_id_state state) + +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + switch (state) { + case ETHTOOL_ID_ACTIVE: + /* Save the current LED settings */ + if (ql_mb_get_led_cfg(qdev)) + return -EIO; + + /* Start blinking */ + ql_mb_set_led_cfg(qdev, QL_LED_BLINK); + return 0; + + case ETHTOOL_ID_INACTIVE: + /* Restore LED settings */ + if (ql_mb_set_led_cfg(qdev, qdev->led_config)) + return -EIO; + return 0; + + default: + return -EINVAL; + } +} + +static int ql_start_loopback(struct ql_adapter *qdev) +{ + if (netif_carrier_ok(qdev->ndev)) { + set_bit(QL_LB_LINK_UP, &qdev->flags); + netif_carrier_off(qdev->ndev); + } else + clear_bit(QL_LB_LINK_UP, &qdev->flags); + qdev->link_config |= CFG_LOOPBACK_PCS; + return ql_mb_set_port_cfg(qdev); +} + +static void ql_stop_loopback(struct ql_adapter *qdev) +{ + qdev->link_config &= ~CFG_LOOPBACK_PCS; + ql_mb_set_port_cfg(qdev); + if (test_bit(QL_LB_LINK_UP, &qdev->flags)) { + netif_carrier_on(qdev->ndev); + clear_bit(QL_LB_LINK_UP, &qdev->flags); + } +} + +static void ql_create_lb_frame(struct sk_buff *skb, + unsigned int frame_size) +{ + memset(skb->data, 0xFF, frame_size); + frame_size &= ~1; + memset(&skb->data[frame_size / 2], 0xAA, frame_size / 2 - 1); + memset(&skb->data[frame_size / 2 + 10], 0xBE, 1); + memset(&skb->data[frame_size / 2 + 12], 0xAF, 1); +} + +void ql_check_lb_frame(struct ql_adapter *qdev, + struct sk_buff *skb) +{ + unsigned int frame_size = skb->len; + + if ((*(skb->data + 3) == 0xFF) && + (*(skb->data + frame_size / 2 + 10) == 0xBE) && + (*(skb->data + frame_size / 2 + 12) == 0xAF)) { + atomic_dec(&qdev->lb_count); + return; + } +} + +static int ql_run_loopback_test(struct ql_adapter *qdev) +{ + int i; + netdev_tx_t rc; + struct sk_buff *skb; + unsigned int size = SMALL_BUF_MAP_SIZE; + + for (i = 0; i < 64; i++) { + skb = netdev_alloc_skb(qdev->ndev, size); + if (!skb) + return -ENOMEM; + + skb->queue_mapping = 0; + skb_put(skb, size); + ql_create_lb_frame(skb, size); + rc = ql_lb_send(skb, qdev->ndev); + if (rc != NETDEV_TX_OK) + return -EPIPE; + atomic_inc(&qdev->lb_count); + } + /* Give queue time to settle before testing results. */ + msleep(2); + ql_clean_lb_rx_ring(&qdev->rx_ring[0], 128); + return atomic_read(&qdev->lb_count) ? -EIO : 0; +} + +static int ql_loopback_test(struct ql_adapter *qdev, u64 *data) +{ + *data = ql_start_loopback(qdev); + if (*data) + goto out; + *data = ql_run_loopback_test(qdev); +out: + ql_stop_loopback(qdev); + return *data; +} + +static void ql_self_test(struct net_device *ndev, + struct ethtool_test *eth_test, u64 *data) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + memset(data, 0, sizeof(u64) * QLGE_TEST_LEN); + + if (netif_running(ndev)) { + set_bit(QL_SELFTEST, &qdev->flags); + if (eth_test->flags == ETH_TEST_FL_OFFLINE) { + /* Offline tests */ + if (ql_loopback_test(qdev, &data[0])) + eth_test->flags |= ETH_TEST_FL_FAILED; + + } else { + /* Online tests */ + data[0] = 0; + } + clear_bit(QL_SELFTEST, &qdev->flags); + /* Give link time to come up after + * port configuration changes. 
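+		 * (the loopback test changes the port configuration).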
+ */ + msleep_interruptible(4 * 1000); + } else { + netif_err(qdev, drv, qdev->ndev, + "is down, Loopback test will fail.\n"); + eth_test->flags |= ETH_TEST_FL_FAILED; + } +} + +static int ql_get_regs_len(struct net_device *ndev) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + if (!test_bit(QL_FRC_COREDUMP, &qdev->flags)) + return sizeof(struct ql_mpi_coredump); + else + return sizeof(struct ql_reg_dump); +} + +static void ql_get_regs(struct net_device *ndev, + struct ethtool_regs *regs, void *p) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + ql_get_dump(qdev, p); + qdev->core_is_dumped = 0; + if (!test_bit(QL_FRC_COREDUMP, &qdev->flags)) + regs->len = sizeof(struct ql_mpi_coredump); + else + regs->len = sizeof(struct ql_reg_dump); +} + +static int ql_get_coalesce(struct net_device *dev, struct ethtool_coalesce *c) +{ + struct ql_adapter *qdev = netdev_priv(dev); + + c->rx_coalesce_usecs = qdev->rx_coalesce_usecs; + c->tx_coalesce_usecs = qdev->tx_coalesce_usecs; + + /* This chip coalesces as follows: + * If a packet arrives, hold off interrupts until + * cqicb->int_delay expires, but if no other packets arrive don't + * wait longer than cqicb->pkt_int_delay. But ethtool doesn't use a + * timer to coalesce on a frame basis. So, we have to take ethtool's + * max_coalesced_frames value and convert it to a delay in microseconds. + * We do this by using a basic thoughput of 1,000,000 frames per + * second @ (1024 bytes). This means one frame per usec. So it's a + * simple one to one ratio. + */ + c->rx_max_coalesced_frames = qdev->rx_max_coalesced_frames; + c->tx_max_coalesced_frames = qdev->tx_max_coalesced_frames; + + return 0; +} + +static int ql_set_coalesce(struct net_device *ndev, struct ethtool_coalesce *c) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + /* Validate user parameters. */ + if (c->rx_coalesce_usecs > qdev->rx_ring_size / 2) + return -EINVAL; + /* Don't wait more than 10 usec. */ + if (c->rx_max_coalesced_frames > MAX_INTER_FRAME_WAIT) + return -EINVAL; + if (c->tx_coalesce_usecs > qdev->tx_ring_size / 2) + return -EINVAL; + if (c->tx_max_coalesced_frames > MAX_INTER_FRAME_WAIT) + return -EINVAL; + + /* Verify a change took place before updating the hardware. 
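+	 * If nothing changed there is no need to reload the CQICBs.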
*/ + if (qdev->rx_coalesce_usecs == c->rx_coalesce_usecs && + qdev->tx_coalesce_usecs == c->tx_coalesce_usecs && + qdev->rx_max_coalesced_frames == c->rx_max_coalesced_frames && + qdev->tx_max_coalesced_frames == c->tx_max_coalesced_frames) + return 0; + + qdev->rx_coalesce_usecs = c->rx_coalesce_usecs; + qdev->tx_coalesce_usecs = c->tx_coalesce_usecs; + qdev->rx_max_coalesced_frames = c->rx_max_coalesced_frames; + qdev->tx_max_coalesced_frames = c->tx_max_coalesced_frames; + + return ql_update_ring_coalescing(qdev); +} + +static void ql_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct ql_adapter *qdev = netdev_priv(netdev); + + ql_mb_get_port_cfg(qdev); + if (qdev->link_config & CFG_PAUSE_STD) { + pause->rx_pause = 1; + pause->tx_pause = 1; + } +} + +static int ql_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct ql_adapter *qdev = netdev_priv(netdev); + int status = 0; + + if ((pause->rx_pause) && (pause->tx_pause)) + qdev->link_config |= CFG_PAUSE_STD; + else if (!pause->rx_pause && !pause->tx_pause) + qdev->link_config &= ~CFG_PAUSE_STD; + else + return -EINVAL; + + status = ql_mb_set_port_cfg(qdev); + return status; +} + +static u32 ql_get_msglevel(struct net_device *ndev) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + return qdev->msg_enable; +} + +static void ql_set_msglevel(struct net_device *ndev, u32 value) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + qdev->msg_enable = value; +} + +const struct ethtool_ops qlge_ethtool_ops = { + .get_drvinfo = ql_get_drvinfo, + .get_wol = ql_get_wol, + .set_wol = ql_set_wol, + .get_regs_len = ql_get_regs_len, + .get_regs = ql_get_regs, + .get_msglevel = ql_get_msglevel, + .set_msglevel = ql_set_msglevel, + .get_link = ethtool_op_get_link, + .set_phys_id = ql_set_phys_id, + .self_test = ql_self_test, + .get_pauseparam = ql_get_pauseparam, + .set_pauseparam = ql_set_pauseparam, + .get_coalesce = ql_get_coalesce, + .set_coalesce = ql_set_coalesce, + .get_sset_count = ql_get_sset_count, + .get_strings = ql_get_strings, + .get_ethtool_stats = ql_get_ethtool_stats, + .get_link_ksettings = ql_get_link_ksettings, +}; + diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c new file mode 100644 index 0000000..8293c20 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -0,0 +1,5029 @@ +/* + * QLogic qlge NIC HBA Driver + * Copyright (c) 2003-2008 QLogic Corporation + * See LICENSE.qlge for copyright and licensing details. 
+ * Author: Linux qlge network device driver by + * Ron Mercer + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qlge.h" + +char qlge_driver_name[] = DRV_NAME; +const char qlge_driver_version[] = DRV_VERSION; + +MODULE_AUTHOR("Ron Mercer "); +MODULE_DESCRIPTION(DRV_STRING " "); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); + +static const u32 default_msg = + NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | +/* NETIF_MSG_TIMER | */ + NETIF_MSG_IFDOWN | + NETIF_MSG_IFUP | + NETIF_MSG_RX_ERR | + NETIF_MSG_TX_ERR | +/* NETIF_MSG_TX_QUEUED | */ +/* NETIF_MSG_INTR | NETIF_MSG_TX_DONE | NETIF_MSG_RX_STATUS | */ +/* NETIF_MSG_PKTDATA | */ + NETIF_MSG_HW | NETIF_MSG_WOL | 0; + +static int debug = -1; /* defaults above */ +module_param(debug, int, 0664); +MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); + +#define MSIX_IRQ 0 +#define MSI_IRQ 1 +#define LEG_IRQ 2 +static int qlge_irq_type = MSIX_IRQ; +module_param(qlge_irq_type, int, 0664); +MODULE_PARM_DESC(qlge_irq_type, "0 = MSI-X, 1 = MSI, 2 = Legacy."); + +static int qlge_mpi_coredump; +module_param(qlge_mpi_coredump, int, 0); +MODULE_PARM_DESC(qlge_mpi_coredump, + "Option to enable MPI firmware dump. " + "Default is OFF - Do Not allocate memory. "); + +static int qlge_force_coredump; +module_param(qlge_force_coredump, int, 0); +MODULE_PARM_DESC(qlge_force_coredump, + "Option to allow force of firmware core dump. " + "Default is OFF - Do not allow."); + +static const struct pci_device_id qlge_pci_tbl[] = { + {PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, QLGE_DEVICE_ID_8012)}, + {PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, QLGE_DEVICE_ID_8000)}, + /* required last entry */ + {0,} +}; + +MODULE_DEVICE_TABLE(pci, qlge_pci_tbl); + +static int ql_wol(struct ql_adapter *); +static void qlge_set_multicast_list(struct net_device *); +static int ql_adapter_down(struct ql_adapter *); +static int ql_adapter_up(struct ql_adapter *); + +/* This hardware semaphore causes exclusive access to + * resources shared between the NIC driver, MPI firmware, + * FCOE firmware and the FC driver. 
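+ * It is acquired with ql_sem_trylock()/ql_sem_spinlock() and released
+ * with ql_sem_unlock().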
+ */ +static int ql_sem_trylock(struct ql_adapter *qdev, u32 sem_mask) +{ + u32 sem_bits = 0; + + switch (sem_mask) { + case SEM_XGMAC0_MASK: + sem_bits = SEM_SET << SEM_XGMAC0_SHIFT; + break; + case SEM_XGMAC1_MASK: + sem_bits = SEM_SET << SEM_XGMAC1_SHIFT; + break; + case SEM_ICB_MASK: + sem_bits = SEM_SET << SEM_ICB_SHIFT; + break; + case SEM_MAC_ADDR_MASK: + sem_bits = SEM_SET << SEM_MAC_ADDR_SHIFT; + break; + case SEM_FLASH_MASK: + sem_bits = SEM_SET << SEM_FLASH_SHIFT; + break; + case SEM_PROBE_MASK: + sem_bits = SEM_SET << SEM_PROBE_SHIFT; + break; + case SEM_RT_IDX_MASK: + sem_bits = SEM_SET << SEM_RT_IDX_SHIFT; + break; + case SEM_PROC_REG_MASK: + sem_bits = SEM_SET << SEM_PROC_REG_SHIFT; + break; + default: + netif_alert(qdev, probe, qdev->ndev, "bad Semaphore mask!.\n"); + return -EINVAL; + } + + ql_write32(qdev, SEM, sem_bits | sem_mask); + return !(ql_read32(qdev, SEM) & sem_bits); +} + +int ql_sem_spinlock(struct ql_adapter *qdev, u32 sem_mask) +{ + unsigned int wait_count = 30; + do { + if (!ql_sem_trylock(qdev, sem_mask)) + return 0; + udelay(100); + } while (--wait_count); + return -ETIMEDOUT; +} + +void ql_sem_unlock(struct ql_adapter *qdev, u32 sem_mask) +{ + ql_write32(qdev, SEM, sem_mask); + ql_read32(qdev, SEM); /* flush */ +} + +/* This function waits for a specific bit to come ready + * in a given register. It is used mostly by the initialize + * process, but is also used in kernel thread API such as + * netdev->set_multi, netdev->set_mac_address, netdev->vlan_rx_add_vid. + */ +int ql_wait_reg_rdy(struct ql_adapter *qdev, u32 reg, u32 bit, u32 err_bit) +{ + u32 temp; + int count = UDELAY_COUNT; + + while (count) { + temp = ql_read32(qdev, reg); + + /* check for errors */ + if (temp & err_bit) { + netif_alert(qdev, probe, qdev->ndev, + "register 0x%.08x access error, value = 0x%.08x!.\n", + reg, temp); + return -EIO; + } else if (temp & bit) + return 0; + udelay(UDELAY_DELAY); + count--; + } + netif_alert(qdev, probe, qdev->ndev, + "Timed out waiting for reg %x to come ready.\n", reg); + return -ETIMEDOUT; +} + +/* The CFG register is used to download TX and RX control blocks + * to the chip. This function waits for an operation to complete. + */ +static int ql_wait_cfg(struct ql_adapter *qdev, u32 bit) +{ + int count = UDELAY_COUNT; + u32 temp; + + while (count) { + temp = ql_read32(qdev, CFG); + if (temp & CFG_LE) + return -EIO; + if (!(temp & bit)) + return 0; + udelay(UDELAY_DELAY); + count--; + } + return -ETIMEDOUT; +} + + +/* Used to issue init control blocks to hw. Maps control block, + * sets address, triggers download, waits for completion. + */ +int ql_write_cfg(struct ql_adapter *qdev, void *ptr, int size, u32 bit, + u16 q_id) +{ + u64 map; + int status = 0; + int direction; + u32 mask; + u32 value; + + direction = + (bit & (CFG_LRQ | CFG_LR | CFG_LCQ)) ? 
PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE; + + map = pci_map_single(qdev->pdev, ptr, size, direction); + if (pci_dma_mapping_error(qdev->pdev, map)) { + netif_err(qdev, ifup, qdev->ndev, "Couldn't map DMA area.\n"); + return -ENOMEM; + } + + status = ql_sem_spinlock(qdev, SEM_ICB_MASK); + if (status) + return status; + + status = ql_wait_cfg(qdev, bit); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Timed out waiting for CFG to come ready.\n"); + goto exit; + } + + ql_write32(qdev, ICB_L, (u32) map); + ql_write32(qdev, ICB_H, (u32) (map >> 32)); + + mask = CFG_Q_MASK | (bit << 16); + value = bit | (q_id << CFG_Q_SHIFT); + ql_write32(qdev, CFG, (mask | value)); + + /* + * Wait for the bit to clear after signaling hw. + */ + status = ql_wait_cfg(qdev, bit); +exit: + ql_sem_unlock(qdev, SEM_ICB_MASK); /* does flush too */ + pci_unmap_single(qdev->pdev, map, size, direction); + return status; +} + +/* Get a specific MAC address from the CAM. Used for debug and reg dump. */ +int ql_get_mac_addr_reg(struct ql_adapter *qdev, u32 type, u16 index, + u32 *value) +{ + u32 offset = 0; + int status; + + switch (type) { + case MAC_ADDR_TYPE_MULTI_MAC: + case MAC_ADDR_TYPE_CAM_MAC: + { + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, 0); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + MAC_ADDR_ADR | MAC_ADDR_RS | type); /* type */ + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MR, 0); + if (status) + goto exit; + *value++ = ql_read32(qdev, MAC_ADDR_DATA); + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, 0); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + MAC_ADDR_ADR | MAC_ADDR_RS | type); /* type */ + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MR, 0); + if (status) + goto exit; + *value++ = ql_read32(qdev, MAC_ADDR_DATA); + if (type == MAC_ADDR_TYPE_CAM_MAC) { + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, 0); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + MAC_ADDR_ADR | MAC_ADDR_RS | type); /* type */ + status = + ql_wait_reg_rdy(qdev, MAC_ADDR_IDX, + MAC_ADDR_MR, 0); + if (status) + goto exit; + *value++ = ql_read32(qdev, MAC_ADDR_DATA); + } + break; + } + case MAC_ADDR_TYPE_VLAN: + case MAC_ADDR_TYPE_MULTI_FLTR: + default: + netif_crit(qdev, ifup, qdev->ndev, + "Address type %d not yet supported.\n", type); + status = -EPERM; + } +exit: + return status; +} + +/* Set up a MAC, multicast or VLAN address for the + * inbound frame matching. 
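+ * The type argument selects CAM, multicast or VLAN handling below.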
+ */ +static int ql_set_mac_addr_reg(struct ql_adapter *qdev, u8 *addr, u32 type, + u16 index) +{ + u32 offset = 0; + int status = 0; + + switch (type) { + case MAC_ADDR_TYPE_MULTI_MAC: + { + u32 upper = (addr[0] << 8) | addr[1]; + u32 lower = (addr[2] << 24) | (addr[3] << 16) | + (addr[4] << 8) | (addr[5]); + + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, 0); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | + (index << MAC_ADDR_IDX_SHIFT) | + type | MAC_ADDR_E); + ql_write32(qdev, MAC_ADDR_DATA, lower); + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, 0); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | + (index << MAC_ADDR_IDX_SHIFT) | + type | MAC_ADDR_E); + + ql_write32(qdev, MAC_ADDR_DATA, upper); + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, 0); + if (status) + goto exit; + break; + } + case MAC_ADDR_TYPE_CAM_MAC: + { + u32 cam_output; + u32 upper = (addr[0] << 8) | addr[1]; + u32 lower = + (addr[2] << 24) | (addr[3] << 16) | (addr[4] << 8) | + (addr[5]); + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, 0); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + type); /* type */ + ql_write32(qdev, MAC_ADDR_DATA, lower); + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, 0); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + type); /* type */ + ql_write32(qdev, MAC_ADDR_DATA, upper); + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, 0); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + type); /* type */ + /* This field should also include the queue id + and possibly the function id. Right now we hardcode + the route field to NIC core. + */ + cam_output = (CAM_OUT_ROUTE_NIC | + (qdev-> + func << CAM_OUT_FUNC_SHIFT) | + (0 << CAM_OUT_CQ_ID_SHIFT)); + if (qdev->ndev->features & NETIF_F_HW_VLAN_CTAG_RX) + cam_output |= CAM_OUT_RV; + /* route to NIC core */ + ql_write32(qdev, MAC_ADDR_DATA, cam_output); + break; + } + case MAC_ADDR_TYPE_VLAN: + { + u32 enable_bit = *((u32 *) &addr[0]); + /* For VLAN, the addr actually holds a bit that + * either enables or disables the vlan id we are + * addressing. It's either MAC_ADDR_E on or off. + * That's bit-27 we're talking about. + */ + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, 0); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, offset | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + type | /* type */ + enable_bit); /* enable/disable */ + break; + } + case MAC_ADDR_TYPE_MULTI_FLTR: + default: + netif_crit(qdev, ifup, qdev->ndev, + "Address type %d not yet supported.\n", type); + status = -EPERM; + } +exit: + return status; +} + +/* Set or clear MAC address in hardware. We sometimes + * have to clear it to prevent wrong frame routing + * especially in a bonding environment. 
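+ * Clearing writes a zero MAC address into this function's CAM slot.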
+ */ +static int ql_set_mac_addr(struct ql_adapter *qdev, int set) +{ + int status; + char zero_mac_addr[ETH_ALEN]; + char *addr; + + if (set) { + addr = &qdev->current_mac_addr[0]; + netif_printk(qdev, ifup, KERN_DEBUG, qdev->ndev, + "Set Mac addr %pM\n", addr); + } else { + eth_zero_addr(zero_mac_addr); + addr = &zero_mac_addr[0]; + netif_printk(qdev, ifup, KERN_DEBUG, qdev->ndev, + "Clearing MAC address\n"); + } + status = ql_sem_spinlock(qdev, SEM_MAC_ADDR_MASK); + if (status) + return status; + status = ql_set_mac_addr_reg(qdev, (u8 *) addr, + MAC_ADDR_TYPE_CAM_MAC, qdev->func * MAX_CQ); + ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK); + if (status) + netif_err(qdev, ifup, qdev->ndev, + "Failed to init mac address.\n"); + return status; +} + +void ql_link_on(struct ql_adapter *qdev) +{ + netif_err(qdev, link, qdev->ndev, "Link is up.\n"); + netif_carrier_on(qdev->ndev); + ql_set_mac_addr(qdev, 1); +} + +void ql_link_off(struct ql_adapter *qdev) +{ + netif_err(qdev, link, qdev->ndev, "Link is down.\n"); + netif_carrier_off(qdev->ndev); + ql_set_mac_addr(qdev, 0); +} + +/* Get a specific frame routing value from the CAM. + * Used for debug and reg dump. + */ +int ql_get_routing_reg(struct ql_adapter *qdev, u32 index, u32 *value) +{ + int status = 0; + + status = ql_wait_reg_rdy(qdev, RT_IDX, RT_IDX_MW, 0); + if (status) + goto exit; + + ql_write32(qdev, RT_IDX, + RT_IDX_TYPE_NICQ | RT_IDX_RS | (index << RT_IDX_IDX_SHIFT)); + status = ql_wait_reg_rdy(qdev, RT_IDX, RT_IDX_MR, 0); + if (status) + goto exit; + *value = ql_read32(qdev, RT_DATA); +exit: + return status; +} + +/* The NIC function for this chip has 16 routing indexes. Each one can be used + * to route different frame types to various inbound queues. We send broadcast/ + * multicast/error frames to the default queue for slow handling, + * and CAM hit/RSS frames to the fast handling queues. + */ +static int ql_set_routing_reg(struct ql_adapter *qdev, u32 index, u32 mask, + int enable) +{ + int status = -EINVAL; /* Return error if no mask match. */ + u32 value = 0; + + switch (mask) { + case RT_IDX_CAM_HIT: + { + value = RT_IDX_DST_CAM_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_CAM_HIT_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_VALID: /* Promiscuous Mode frames. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_PROMISCUOUS_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_ERR: /* Pass up MAC,IP,TCP/UDP error frames. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_ALL_ERR_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_IP_CSUM_ERR: /* Pass up IP CSUM error frames. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_IP_CSUM_ERR_SLOT << + RT_IDX_IDX_SHIFT); /* index */ + break; + } + case RT_IDX_TU_CSUM_ERR: /* Pass up TCP/UDP CSUM error frames. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_TCP_UDP_CSUM_ERR_SLOT << + RT_IDX_IDX_SHIFT); /* index */ + break; + } + case RT_IDX_BCAST: /* Pass up Broadcast frames to default Q. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_BCAST_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_MCAST: /* Pass up All Multicast frames. 
*/ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_ALLMULTI_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_MCAST_MATCH: /* Pass up matched Multicast frames. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_MCAST_MATCH_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_RSS_MATCH: /* Pass up matched RSS frames. */ + { + value = RT_IDX_DST_RSS | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_RSS_MATCH_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case 0: /* Clear the E-bit on an entry. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (index << RT_IDX_IDX_SHIFT);/* index */ + break; + } + default: + netif_err(qdev, ifup, qdev->ndev, + "Mask type %d not yet supported.\n", mask); + status = -EPERM; + goto exit; + } + + if (value) { + status = ql_wait_reg_rdy(qdev, RT_IDX, RT_IDX_MW, 0); + if (status) + goto exit; + value |= (enable ? RT_IDX_E : 0); + ql_write32(qdev, RT_IDX, value); + ql_write32(qdev, RT_DATA, enable ? mask : 0); + } +exit: + return status; +} + +static void ql_enable_interrupts(struct ql_adapter *qdev) +{ + ql_write32(qdev, INTR_EN, (INTR_EN_EI << 16) | INTR_EN_EI); +} + +static void ql_disable_interrupts(struct ql_adapter *qdev) +{ + ql_write32(qdev, INTR_EN, (INTR_EN_EI << 16)); +} + +/* If we're running with multiple MSI-X vectors then we enable on the fly. + * Otherwise, we may have multiple outstanding workers and don't want to + * enable until the last one finishes. In this case, the irq_cnt gets + * incremented every time we queue a worker and decremented every time + * a worker finishes. Once it hits zero we enable the interrupt. + */ +u32 ql_enable_completion_interrupt(struct ql_adapter *qdev, u32 intr) +{ + u32 var = 0; + unsigned long hw_flags = 0; + struct intr_context *ctx = qdev->intr_context + intr; + + if (likely(test_bit(QL_MSIX_ENABLED, &qdev->flags) && intr)) { + /* Always enable if we're MSIX multi interrupts and + * it's not the default (zeroeth) interrupt. + */ + ql_write32(qdev, INTR_EN, + ctx->intr_en_mask); + var = ql_read32(qdev, STS); + return var; + } + + spin_lock_irqsave(&qdev->hw_lock, hw_flags); + if (atomic_dec_and_test(&ctx->irq_cnt)) { + ql_write32(qdev, INTR_EN, + ctx->intr_en_mask); + var = ql_read32(qdev, STS); + } + spin_unlock_irqrestore(&qdev->hw_lock, hw_flags); + return var; +} + +static u32 ql_disable_completion_interrupt(struct ql_adapter *qdev, u32 intr) +{ + u32 var = 0; + struct intr_context *ctx; + + /* HW disables for us if we're MSIX multi interrupts and + * it's not the default (zeroeth) interrupt. + */ + if (likely(test_bit(QL_MSIX_ENABLED, &qdev->flags) && intr)) + return 0; + + ctx = qdev->intr_context + intr; + spin_lock(&qdev->hw_lock); + if (!atomic_read(&ctx->irq_cnt)) { + ql_write32(qdev, INTR_EN, + ctx->intr_dis_mask); + var = ql_read32(qdev, STS); + } + atomic_inc(&ctx->irq_cnt); + spin_unlock(&qdev->hw_lock); + return var; +} + +static void ql_enable_all_completion_interrupts(struct ql_adapter *qdev) +{ + int i; + for (i = 0; i < qdev->intr_count; i++) { + /* The enable call does a atomic_dec_and_test + * and enables only if the result is zero. + * So we precharge it here. 
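+		 * This is only needed for legacy/MSI interrupts or the
+		 * default vector.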
+ */ + if (unlikely(!test_bit(QL_MSIX_ENABLED, &qdev->flags) || + i == 0)) + atomic_set(&qdev->intr_context[i].irq_cnt, 1); + ql_enable_completion_interrupt(qdev, i); + } + +} + +static int ql_validate_flash(struct ql_adapter *qdev, u32 size, const char *str) +{ + int status, i; + u16 csum = 0; + __le16 *flash = (__le16 *)&qdev->flash; + + status = strncmp((char *)&qdev->flash, str, 4); + if (status) { + netif_err(qdev, ifup, qdev->ndev, "Invalid flash signature.\n"); + return status; + } + + for (i = 0; i < size; i++) + csum += le16_to_cpu(*flash++); + + if (csum) + netif_err(qdev, ifup, qdev->ndev, + "Invalid flash checksum, csum = 0x%.04x.\n", csum); + + return csum; +} + +static int ql_read_flash_word(struct ql_adapter *qdev, int offset, __le32 *data) +{ + int status = 0; + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, + FLASH_ADDR, FLASH_ADDR_RDY, FLASH_ADDR_ERR); + if (status) + goto exit; + /* set up for reg read */ + ql_write32(qdev, FLASH_ADDR, FLASH_ADDR_R | offset); + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, + FLASH_ADDR, FLASH_ADDR_RDY, FLASH_ADDR_ERR); + if (status) + goto exit; + /* This data is stored on flash as an array of + * __le32. Since ql_read32() returns cpu endian + * we need to swap it back. + */ + *data = cpu_to_le32(ql_read32(qdev, FLASH_DATA)); +exit: + return status; +} + +static int ql_get_8000_flash_params(struct ql_adapter *qdev) +{ + u32 i, size; + int status; + __le32 *p = (__le32 *)&qdev->flash; + u32 offset; + u8 mac_addr[6]; + + /* Get flash offset for function and adjust + * for dword access. + */ + if (!qdev->port) + offset = FUNC0_FLASH_OFFSET / sizeof(u32); + else + offset = FUNC1_FLASH_OFFSET / sizeof(u32); + + if (ql_sem_spinlock(qdev, SEM_FLASH_MASK)) + return -ETIMEDOUT; + + size = sizeof(struct flash_params_8000) / sizeof(u32); + for (i = 0; i < size; i++, p++) { + status = ql_read_flash_word(qdev, i+offset, p); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Error reading flash.\n"); + goto exit; + } + } + + status = ql_validate_flash(qdev, + sizeof(struct flash_params_8000) / sizeof(u16), + "8000"); + if (status) { + netif_err(qdev, ifup, qdev->ndev, "Invalid flash.\n"); + status = -EINVAL; + goto exit; + } + + /* Extract either manufacturer or BOFM modified + * MAC address. + */ + if (qdev->flash.flash_params_8000.data_type1 == 2) + memcpy(mac_addr, + qdev->flash.flash_params_8000.mac_addr1, + qdev->ndev->addr_len); + else + memcpy(mac_addr, + qdev->flash.flash_params_8000.mac_addr, + qdev->ndev->addr_len); + + if (!is_valid_ether_addr(mac_addr)) { + netif_err(qdev, ifup, qdev->ndev, "Invalid MAC address.\n"); + status = -EINVAL; + goto exit; + } + + memcpy(qdev->ndev->dev_addr, + mac_addr, + qdev->ndev->addr_len); + +exit: + ql_sem_unlock(qdev, SEM_FLASH_MASK); + return status; +} + +static int ql_get_8012_flash_params(struct ql_adapter *qdev) +{ + int i; + int status; + __le32 *p = (__le32 *)&qdev->flash; + u32 offset = 0; + u32 size = sizeof(struct flash_params_8012) / sizeof(u32); + + /* Second function's parameters follow the first + * function's. 
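+	 * so port 1 reads at an offset of one flash_params_8012 block.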
+ */ + if (qdev->port) + offset = size; + + if (ql_sem_spinlock(qdev, SEM_FLASH_MASK)) + return -ETIMEDOUT; + + for (i = 0; i < size; i++, p++) { + status = ql_read_flash_word(qdev, i+offset, p); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Error reading flash.\n"); + goto exit; + } + + } + + status = ql_validate_flash(qdev, + sizeof(struct flash_params_8012) / sizeof(u16), + "8012"); + if (status) { + netif_err(qdev, ifup, qdev->ndev, "Invalid flash.\n"); + status = -EINVAL; + goto exit; + } + + if (!is_valid_ether_addr(qdev->flash.flash_params_8012.mac_addr)) { + status = -EINVAL; + goto exit; + } + + memcpy(qdev->ndev->dev_addr, + qdev->flash.flash_params_8012.mac_addr, + qdev->ndev->addr_len); + +exit: + ql_sem_unlock(qdev, SEM_FLASH_MASK); + return status; +} + +/* xgmac register are located behind the xgmac_addr and xgmac_data + * register pair. Each read/write requires us to wait for the ready + * bit before reading/writing the data. + */ +static int ql_write_xgmac_reg(struct ql_adapter *qdev, u32 reg, u32 data) +{ + int status; + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, + XGMAC_ADDR, XGMAC_ADDR_RDY, XGMAC_ADDR_XME); + if (status) + return status; + /* write the data to the data reg */ + ql_write32(qdev, XGMAC_DATA, data); + /* trigger the write */ + ql_write32(qdev, XGMAC_ADDR, reg); + return status; +} + +/* xgmac register are located behind the xgmac_addr and xgmac_data + * register pair. Each read/write requires us to wait for the ready + * bit before reading/writing the data. + */ +int ql_read_xgmac_reg(struct ql_adapter *qdev, u32 reg, u32 *data) +{ + int status = 0; + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, + XGMAC_ADDR, XGMAC_ADDR_RDY, XGMAC_ADDR_XME); + if (status) + goto exit; + /* set up for reg read */ + ql_write32(qdev, XGMAC_ADDR, reg | XGMAC_ADDR_R); + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, + XGMAC_ADDR, XGMAC_ADDR_RDY, XGMAC_ADDR_XME); + if (status) + goto exit; + /* get the data */ + *data = ql_read32(qdev, XGMAC_DATA); +exit: + return status; +} + +/* This is used for reading the 64-bit statistics regs. */ +int ql_read_xgmac_reg64(struct ql_adapter *qdev, u32 reg, u64 *data) +{ + int status = 0; + u32 hi = 0; + u32 lo = 0; + + status = ql_read_xgmac_reg(qdev, reg, &lo); + if (status) + goto exit; + + status = ql_read_xgmac_reg(qdev, reg + 4, &hi); + if (status) + goto exit; + + *data = (u64) lo | ((u64) hi << 32); + +exit: + return status; +} + +static int ql_8000_port_initialize(struct ql_adapter *qdev) +{ + int status; + /* + * Get MPI firmware version for driver banner + * and ethool info. + */ + status = ql_mb_about_fw(qdev); + if (status) + goto exit; + status = ql_mb_get_fw_state(qdev); + if (status) + goto exit; + /* Wake up a worker to get/set the TX/RX frame sizes. */ + queue_delayed_work(qdev->workqueue, &qdev->mpi_port_cfg_work, 0); +exit: + return status; +} + +/* Take the MAC Core out of reset. + * Enable statistics counting. + * Take the transmitter/receiver out of reset. + * This functionality may be done in the MPI firmware at a + * later date. + */ +static int ql_8012_port_initialize(struct ql_adapter *qdev) +{ + int status = 0; + u32 data; + + if (ql_sem_trylock(qdev, qdev->xg_sem_mask)) { + /* Another function has the semaphore, so + * wait for the port init bit to come ready. 
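+		 * The current holder completes the port initialization.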
+ */ + netif_info(qdev, link, qdev->ndev, + "Another function has the semaphore, so wait for the port init bit to come ready.\n"); + status = ql_wait_reg_rdy(qdev, STS, qdev->port_init, 0); + if (status) { + netif_crit(qdev, link, qdev->ndev, + "Port initialize timed out.\n"); + } + return status; + } + + netif_info(qdev, link, qdev->ndev, "Got xgmac semaphore!.\n"); + /* Set the core reset. */ + status = ql_read_xgmac_reg(qdev, GLOBAL_CFG, &data); + if (status) + goto end; + data |= GLOBAL_CFG_RESET; + status = ql_write_xgmac_reg(qdev, GLOBAL_CFG, data); + if (status) + goto end; + + /* Clear the core reset and turn on jumbo for receiver. */ + data &= ~GLOBAL_CFG_RESET; /* Clear core reset. */ + data |= GLOBAL_CFG_JUMBO; /* Turn on jumbo. */ + data |= GLOBAL_CFG_TX_STAT_EN; + data |= GLOBAL_CFG_RX_STAT_EN; + status = ql_write_xgmac_reg(qdev, GLOBAL_CFG, data); + if (status) + goto end; + + /* Enable transmitter, and clear it's reset. */ + status = ql_read_xgmac_reg(qdev, TX_CFG, &data); + if (status) + goto end; + data &= ~TX_CFG_RESET; /* Clear the TX MAC reset. */ + data |= TX_CFG_EN; /* Enable the transmitter. */ + status = ql_write_xgmac_reg(qdev, TX_CFG, data); + if (status) + goto end; + + /* Enable receiver and clear it's reset. */ + status = ql_read_xgmac_reg(qdev, RX_CFG, &data); + if (status) + goto end; + data &= ~RX_CFG_RESET; /* Clear the RX MAC reset. */ + data |= RX_CFG_EN; /* Enable the receiver. */ + status = ql_write_xgmac_reg(qdev, RX_CFG, data); + if (status) + goto end; + + /* Turn on jumbo. */ + status = + ql_write_xgmac_reg(qdev, MAC_TX_PARAMS, MAC_TX_PARAMS_JUMBO | (0x2580 << 16)); + if (status) + goto end; + status = + ql_write_xgmac_reg(qdev, MAC_RX_PARAMS, 0x2580); + if (status) + goto end; + + /* Signal to the world that the port is enabled. */ + ql_write32(qdev, STS, ((qdev->port_init << 16) | qdev->port_init)); +end: + ql_sem_unlock(qdev, qdev->xg_sem_mask); + return status; +} + +static inline unsigned int ql_lbq_block_size(struct ql_adapter *qdev) +{ + return PAGE_SIZE << qdev->lbq_buf_order; +} + +/* Get the next large buffer. */ +static struct bq_desc *ql_get_curr_lbuf(struct rx_ring *rx_ring) +{ + struct bq_desc *lbq_desc = &rx_ring->lbq[rx_ring->lbq_curr_idx]; + rx_ring->lbq_curr_idx++; + if (rx_ring->lbq_curr_idx == rx_ring->lbq_len) + rx_ring->lbq_curr_idx = 0; + rx_ring->lbq_free_cnt++; + return lbq_desc; +} + +static struct bq_desc *ql_get_curr_lchunk(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + struct bq_desc *lbq_desc = ql_get_curr_lbuf(rx_ring); + + pci_dma_sync_single_for_cpu(qdev->pdev, + dma_unmap_addr(lbq_desc, mapaddr), + rx_ring->lbq_buf_size, + PCI_DMA_FROMDEVICE); + + /* If it's the last chunk of our master page then + * we unmap it. + */ + if ((lbq_desc->p.pg_chunk.offset + rx_ring->lbq_buf_size) + == ql_lbq_block_size(qdev)) + pci_unmap_page(qdev->pdev, + lbq_desc->p.pg_chunk.map, + ql_lbq_block_size(qdev), + PCI_DMA_FROMDEVICE); + return lbq_desc; +} + +/* Get the next small buffer. */ +static struct bq_desc *ql_get_curr_sbuf(struct rx_ring *rx_ring) +{ + struct bq_desc *sbq_desc = &rx_ring->sbq[rx_ring->sbq_curr_idx]; + rx_ring->sbq_curr_idx++; + if (rx_ring->sbq_curr_idx == rx_ring->sbq_len) + rx_ring->sbq_curr_idx = 0; + rx_ring->sbq_free_cnt++; + return sbq_desc; +} + +/* Update an rx ring index. 
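+ * Advance the completion queue consumer index, wrapping at cq_len.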
*/ +static void ql_update_cq(struct rx_ring *rx_ring) +{ + rx_ring->cnsmr_idx++; + rx_ring->curr_entry++; + if (unlikely(rx_ring->cnsmr_idx == rx_ring->cq_len)) { + rx_ring->cnsmr_idx = 0; + rx_ring->curr_entry = rx_ring->cq_base; + } +} + +static void ql_write_cq_idx(struct rx_ring *rx_ring) +{ + ql_write_db_reg(rx_ring->cnsmr_idx, rx_ring->cnsmr_idx_db_reg); +} + +static int ql_get_next_chunk(struct ql_adapter *qdev, struct rx_ring *rx_ring, + struct bq_desc *lbq_desc) +{ + if (!rx_ring->pg_chunk.page) { + u64 map; + rx_ring->pg_chunk.page = alloc_pages(__GFP_COMP | GFP_ATOMIC, + qdev->lbq_buf_order); + if (unlikely(!rx_ring->pg_chunk.page)) { + netif_err(qdev, drv, qdev->ndev, + "page allocation failed.\n"); + return -ENOMEM; + } + rx_ring->pg_chunk.offset = 0; + map = pci_map_page(qdev->pdev, rx_ring->pg_chunk.page, + 0, ql_lbq_block_size(qdev), + PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(qdev->pdev, map)) { + __free_pages(rx_ring->pg_chunk.page, + qdev->lbq_buf_order); + rx_ring->pg_chunk.page = NULL; + netif_err(qdev, drv, qdev->ndev, + "PCI mapping failed.\n"); + return -ENOMEM; + } + rx_ring->pg_chunk.map = map; + rx_ring->pg_chunk.va = page_address(rx_ring->pg_chunk.page); + } + + /* Copy the current master pg_chunk info + * to the current descriptor. + */ + lbq_desc->p.pg_chunk = rx_ring->pg_chunk; + + /* Adjust the master page chunk for next + * buffer get. + */ + rx_ring->pg_chunk.offset += rx_ring->lbq_buf_size; + if (rx_ring->pg_chunk.offset == ql_lbq_block_size(qdev)) { + rx_ring->pg_chunk.page = NULL; + lbq_desc->p.pg_chunk.last_flag = 1; + } else { + rx_ring->pg_chunk.va += rx_ring->lbq_buf_size; + get_page(rx_ring->pg_chunk.page); + lbq_desc->p.pg_chunk.last_flag = 0; + } + return 0; +} +/* Process (refill) a large buffer queue. */ +static void ql_update_lbq(struct ql_adapter *qdev, struct rx_ring *rx_ring) +{ + u32 clean_idx = rx_ring->lbq_clean_idx; + u32 start_idx = clean_idx; + struct bq_desc *lbq_desc; + u64 map; + int i; + + while (rx_ring->lbq_free_cnt > 32) { + for (i = (rx_ring->lbq_clean_idx % 16); i < 16; i++) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "lbq: try cleaning clean_idx = %d.\n", + clean_idx); + lbq_desc = &rx_ring->lbq[clean_idx]; + if (ql_get_next_chunk(qdev, rx_ring, lbq_desc)) { + rx_ring->lbq_clean_idx = clean_idx; + netif_err(qdev, ifup, qdev->ndev, + "Could not get a page chunk, i=%d, clean_idx =%d .\n", + i, clean_idx); + return; + } + + map = lbq_desc->p.pg_chunk.map + + lbq_desc->p.pg_chunk.offset; + dma_unmap_addr_set(lbq_desc, mapaddr, map); + dma_unmap_len_set(lbq_desc, maplen, + rx_ring->lbq_buf_size); + *lbq_desc->addr = cpu_to_le64(map); + + pci_dma_sync_single_for_device(qdev->pdev, map, + rx_ring->lbq_buf_size, + PCI_DMA_FROMDEVICE); + clean_idx++; + if (clean_idx == rx_ring->lbq_len) + clean_idx = 0; + } + + rx_ring->lbq_clean_idx = clean_idx; + rx_ring->lbq_prod_idx += 16; + if (rx_ring->lbq_prod_idx == rx_ring->lbq_len) + rx_ring->lbq_prod_idx = 0; + rx_ring->lbq_free_cnt -= 16; + } + + if (start_idx != clean_idx) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "lbq: updating prod idx = %d.\n", + rx_ring->lbq_prod_idx); + ql_write_db_reg(rx_ring->lbq_prod_idx, + rx_ring->lbq_prod_idx_db_reg); + } +} + +/* Process (refill) a small buffer queue. 
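+ * Allocate skbs for empty slots and hand them back to the hardware.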
*/ +static void ql_update_sbq(struct ql_adapter *qdev, struct rx_ring *rx_ring) +{ + u32 clean_idx = rx_ring->sbq_clean_idx; + u32 start_idx = clean_idx; + struct bq_desc *sbq_desc; + u64 map; + int i; + + while (rx_ring->sbq_free_cnt > 16) { + for (i = (rx_ring->sbq_clean_idx % 16); i < 16; i++) { + sbq_desc = &rx_ring->sbq[clean_idx]; + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "sbq: try cleaning clean_idx = %d.\n", + clean_idx); + if (sbq_desc->p.skb == NULL) { + netif_printk(qdev, rx_status, KERN_DEBUG, + qdev->ndev, + "sbq: getting new skb for index %d.\n", + sbq_desc->index); + sbq_desc->p.skb = + netdev_alloc_skb(qdev->ndev, + SMALL_BUFFER_SIZE); + if (sbq_desc->p.skb == NULL) { + rx_ring->sbq_clean_idx = clean_idx; + return; + } + skb_reserve(sbq_desc->p.skb, QLGE_SB_PAD); + map = pci_map_single(qdev->pdev, + sbq_desc->p.skb->data, + rx_ring->sbq_buf_size, + PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(qdev->pdev, map)) { + netif_err(qdev, ifup, qdev->ndev, + "PCI mapping failed.\n"); + rx_ring->sbq_clean_idx = clean_idx; + dev_kfree_skb_any(sbq_desc->p.skb); + sbq_desc->p.skb = NULL; + return; + } + dma_unmap_addr_set(sbq_desc, mapaddr, map); + dma_unmap_len_set(sbq_desc, maplen, + rx_ring->sbq_buf_size); + *sbq_desc->addr = cpu_to_le64(map); + } + + clean_idx++; + if (clean_idx == rx_ring->sbq_len) + clean_idx = 0; + } + rx_ring->sbq_clean_idx = clean_idx; + rx_ring->sbq_prod_idx += 16; + if (rx_ring->sbq_prod_idx == rx_ring->sbq_len) + rx_ring->sbq_prod_idx = 0; + rx_ring->sbq_free_cnt -= 16; + } + + if (start_idx != clean_idx) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "sbq: updating prod idx = %d.\n", + rx_ring->sbq_prod_idx); + ql_write_db_reg(rx_ring->sbq_prod_idx, + rx_ring->sbq_prod_idx_db_reg); + } +} + +static void ql_update_buffer_queues(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + ql_update_sbq(qdev, rx_ring); + ql_update_lbq(qdev, rx_ring); +} + +/* Unmaps tx buffers. Can be called from send() if a pci mapping + * fails at some stage, or from the interrupt when a tx completes. + */ +static void ql_unmap_send(struct ql_adapter *qdev, + struct tx_ring_desc *tx_ring_desc, int mapped) +{ + int i; + for (i = 0; i < mapped; i++) { + if (i == 0 || (i == 7 && mapped > 7)) { + /* + * Unmap the skb->data area, or the + * external sglist (AKA the Outbound + * Address List (OAL)). + * If its the zeroeth element, then it's + * the skb->data area. If it's the 7th + * element and there is more than 6 frags, + * then its an OAL. + */ + if (i == 7) { + netif_printk(qdev, tx_done, KERN_DEBUG, + qdev->ndev, + "unmapping OAL area.\n"); + } + pci_unmap_single(qdev->pdev, + dma_unmap_addr(&tx_ring_desc->map[i], + mapaddr), + dma_unmap_len(&tx_ring_desc->map[i], + maplen), + PCI_DMA_TODEVICE); + } else { + netif_printk(qdev, tx_done, KERN_DEBUG, qdev->ndev, + "unmapping frag %d.\n", i); + pci_unmap_page(qdev->pdev, + dma_unmap_addr(&tx_ring_desc->map[i], + mapaddr), + dma_unmap_len(&tx_ring_desc->map[i], + maplen), PCI_DMA_TODEVICE); + } + } + +} + +/* Map the buffers for this transmit. This will return + * NETDEV_TX_BUSY or NETDEV_TX_OK based on success. 
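+ * Up to eight address descriptors fit in the IOCB; any further frags
+ * are described by an external list (OAL).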
+ */ +static int ql_map_send(struct ql_adapter *qdev, + struct ob_mac_iocb_req *mac_iocb_ptr, + struct sk_buff *skb, struct tx_ring_desc *tx_ring_desc) +{ + int len = skb_headlen(skb); + dma_addr_t map; + int frag_idx, err, map_idx = 0; + struct tx_buf_desc *tbd = mac_iocb_ptr->tbd; + int frag_cnt = skb_shinfo(skb)->nr_frags; + + if (frag_cnt) { + netif_printk(qdev, tx_queued, KERN_DEBUG, qdev->ndev, + "frag_cnt = %d.\n", frag_cnt); + } + /* + * Map the skb buffer first. + */ + map = pci_map_single(qdev->pdev, skb->data, len, PCI_DMA_TODEVICE); + + err = pci_dma_mapping_error(qdev->pdev, map); + if (err) { + netif_err(qdev, tx_queued, qdev->ndev, + "PCI mapping failed with error: %d\n", err); + + return NETDEV_TX_BUSY; + } + + tbd->len = cpu_to_le32(len); + tbd->addr = cpu_to_le64(map); + dma_unmap_addr_set(&tx_ring_desc->map[map_idx], mapaddr, map); + dma_unmap_len_set(&tx_ring_desc->map[map_idx], maplen, len); + map_idx++; + + /* + * This loop fills the remainder of the 8 address descriptors + * in the IOCB. If there are more than 7 fragments, then the + * eighth address desc will point to an external list (OAL). + * When this happens, the remainder of the frags will be stored + * in this list. + */ + for (frag_idx = 0; frag_idx < frag_cnt; frag_idx++, map_idx++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_idx]; + tbd++; + if (frag_idx == 6 && frag_cnt > 7) { + /* Let's tack on an sglist. + * Our control block will now + * look like this: + * iocb->seg[0] = skb->data + * iocb->seg[1] = frag[0] + * iocb->seg[2] = frag[1] + * iocb->seg[3] = frag[2] + * iocb->seg[4] = frag[3] + * iocb->seg[5] = frag[4] + * iocb->seg[6] = frag[5] + * iocb->seg[7] = ptr to OAL (external sglist) + * oal->seg[0] = frag[6] + * oal->seg[1] = frag[7] + * oal->seg[2] = frag[8] + * oal->seg[3] = frag[9] + * oal->seg[4] = frag[10] + * etc... + */ + /* Tack on the OAL in the eighth segment of IOCB. */ + map = pci_map_single(qdev->pdev, &tx_ring_desc->oal, + sizeof(struct oal), + PCI_DMA_TODEVICE); + err = pci_dma_mapping_error(qdev->pdev, map); + if (err) { + netif_err(qdev, tx_queued, qdev->ndev, + "PCI mapping outbound address list with error: %d\n", + err); + goto map_error; + } + + tbd->addr = cpu_to_le64(map); + /* + * The length is the number of fragments + * that remain to be mapped times the length + * of our sglist (OAL). + */ + tbd->len = + cpu_to_le32((sizeof(struct tx_buf_desc) * + (frag_cnt - frag_idx)) | TX_DESC_C); + dma_unmap_addr_set(&tx_ring_desc->map[map_idx], mapaddr, + map); + dma_unmap_len_set(&tx_ring_desc->map[map_idx], maplen, + sizeof(struct oal)); + tbd = (struct tx_buf_desc *)&tx_ring_desc->oal; + map_idx++; + } + + map = skb_frag_dma_map(&qdev->pdev->dev, frag, 0, skb_frag_size(frag), + DMA_TO_DEVICE); + + err = dma_mapping_error(&qdev->pdev->dev, map); + if (err) { + netif_err(qdev, tx_queued, qdev->ndev, + "PCI mapping frags failed with error: %d.\n", + err); + goto map_error; + } + + tbd->addr = cpu_to_le64(map); + tbd->len = cpu_to_le32(skb_frag_size(frag)); + dma_unmap_addr_set(&tx_ring_desc->map[map_idx], mapaddr, map); + dma_unmap_len_set(&tx_ring_desc->map[map_idx], maplen, + skb_frag_size(frag)); + + } + /* Save the number of segments we've mapped. */ + tx_ring_desc->map_cnt = map_idx; + /* Terminate the last segment. */ + tbd->len = cpu_to_le32(le32_to_cpu(tbd->len) | TX_DESC_E); + return NETDEV_TX_OK; + +map_error: + /* + * If the first frag mapping failed, then i will be zero. + * This causes the unmap of the skb->data area. 
Otherwise + * we pass in the number of frags that mapped successfully + * so they can be umapped. + */ + ql_unmap_send(qdev, tx_ring_desc, map_idx); + return NETDEV_TX_BUSY; +} + +/* Categorizing receive firmware frame errors */ +static void ql_categorize_rx_err(struct ql_adapter *qdev, u8 rx_err, + struct rx_ring *rx_ring) +{ + struct nic_stats *stats = &qdev->nic_stats; + + stats->rx_err_count++; + rx_ring->rx_errors++; + + switch (rx_err & IB_MAC_IOCB_RSP_ERR_MASK) { + case IB_MAC_IOCB_RSP_ERR_CODE_ERR: + stats->rx_code_err++; + break; + case IB_MAC_IOCB_RSP_ERR_OVERSIZE: + stats->rx_oversize_err++; + break; + case IB_MAC_IOCB_RSP_ERR_UNDERSIZE: + stats->rx_undersize_err++; + break; + case IB_MAC_IOCB_RSP_ERR_PREAMBLE: + stats->rx_preamble_err++; + break; + case IB_MAC_IOCB_RSP_ERR_FRAME_LEN: + stats->rx_frame_len_err++; + break; + case IB_MAC_IOCB_RSP_ERR_CRC: + stats->rx_crc_err++; + default: + break; + } +} + +/** + * ql_update_mac_hdr_len - helper routine to update the mac header length + * based on vlan tags if present + */ +static void ql_update_mac_hdr_len(struct ql_adapter *qdev, + struct ib_mac_iocb_rsp *ib_mac_rsp, + void *page, size_t *len) +{ + u16 *tags; + + if (qdev->ndev->features & NETIF_F_HW_VLAN_CTAG_RX) + return; + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_V) { + tags = (u16 *)page; + /* Look for stacked vlan tags in ethertype field */ + if (tags[6] == ETH_P_8021Q && + tags[8] == ETH_P_8021Q) + *len += 2 * VLAN_HLEN; + else + *len += VLAN_HLEN; + } +} + +/* Process an inbound completion from an rx ring. */ +static void ql_process_mac_rx_gro_page(struct ql_adapter *qdev, + struct rx_ring *rx_ring, + struct ib_mac_iocb_rsp *ib_mac_rsp, + u32 length, + u16 vlan_id) +{ + struct sk_buff *skb; + struct bq_desc *lbq_desc = ql_get_curr_lchunk(qdev, rx_ring); + struct napi_struct *napi = &rx_ring->napi; + + /* Frame error, so drop the packet. */ + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) { + ql_categorize_rx_err(qdev, ib_mac_rsp->flags2, rx_ring); + put_page(lbq_desc->p.pg_chunk.page); + return; + } + napi->dev = qdev->ndev; + + skb = napi_get_frags(napi); + if (!skb) { + netif_err(qdev, drv, qdev->ndev, + "Couldn't get an skb, exiting.\n"); + rx_ring->rx_dropped++; + put_page(lbq_desc->p.pg_chunk.page); + return; + } + prefetch(lbq_desc->p.pg_chunk.va); + __skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, + lbq_desc->p.pg_chunk.page, + lbq_desc->p.pg_chunk.offset, + length); + + skb->len += length; + skb->data_len += length; + skb->truesize += length; + skb_shinfo(skb)->nr_frags++; + + rx_ring->rx_packets++; + rx_ring->rx_bytes += length; + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_record_rx_queue(skb, rx_ring->cq_id); + if (vlan_id != 0xffff) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_id); + napi_gro_frags(napi); +} + +/* Process an inbound completion from an rx ring. */ +static void ql_process_mac_rx_page(struct ql_adapter *qdev, + struct rx_ring *rx_ring, + struct ib_mac_iocb_rsp *ib_mac_rsp, + u32 length, + u16 vlan_id) +{ + struct net_device *ndev = qdev->ndev; + struct sk_buff *skb = NULL; + void *addr; + struct bq_desc *lbq_desc = ql_get_curr_lchunk(qdev, rx_ring); + struct napi_struct *napi = &rx_ring->napi; + size_t hlen = ETH_HLEN; + + skb = netdev_alloc_skb(ndev, length); + if (!skb) { + rx_ring->rx_dropped++; + put_page(lbq_desc->p.pg_chunk.page); + return; + } + + addr = lbq_desc->p.pg_chunk.va; + prefetch(addr); + + /* Frame error, so drop the packet. 
*/ + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) { + ql_categorize_rx_err(qdev, ib_mac_rsp->flags2, rx_ring); + goto err_out; + } + + /* Update the MAC header length*/ + ql_update_mac_hdr_len(qdev, ib_mac_rsp, addr, &hlen); + + /* The max framesize filter on this chip is set higher than + * MTU since FCoE uses 2k frames. + */ + if (skb->len > ndev->mtu + hlen) { + netif_err(qdev, drv, qdev->ndev, + "Segment too small, dropping.\n"); + rx_ring->rx_dropped++; + goto err_out; + } + skb_put_data(skb, addr, hlen); + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "%d bytes of headers and data in large. Chain page to new skb and pull tail.\n", + length); + skb_fill_page_desc(skb, 0, lbq_desc->p.pg_chunk.page, + lbq_desc->p.pg_chunk.offset + hlen, + length - hlen); + skb->len += length - hlen; + skb->data_len += length - hlen; + skb->truesize += length - hlen; + + rx_ring->rx_packets++; + rx_ring->rx_bytes += skb->len; + skb->protocol = eth_type_trans(skb, ndev); + skb_checksum_none_assert(skb); + + if ((ndev->features & NETIF_F_RXCSUM) && + !(ib_mac_rsp->flags1 & IB_MAC_CSUM_ERR_MASK)) { + /* TCP frame. */ + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_T) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "TCP checksum done!\n"); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if ((ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_U) && + (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_V4)) { + /* Unfragmented ipv4 UDP frame. */ + struct iphdr *iph = + (struct iphdr *)((u8 *)addr + hlen); + if (!(iph->frag_off & + htons(IP_MF|IP_OFFSET))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + netif_printk(qdev, rx_status, KERN_DEBUG, + qdev->ndev, + "UDP checksum done!\n"); + } + } + } + + skb_record_rx_queue(skb, rx_ring->cq_id); + if (vlan_id != 0xffff) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_id); + if (skb->ip_summed == CHECKSUM_UNNECESSARY) + napi_gro_receive(napi, skb); + else + netif_receive_skb(skb); + return; +err_out: + dev_kfree_skb_any(skb); + put_page(lbq_desc->p.pg_chunk.page); +} + +/* Process an inbound completion from an rx ring. */ +static void ql_process_mac_rx_skb(struct ql_adapter *qdev, + struct rx_ring *rx_ring, + struct ib_mac_iocb_rsp *ib_mac_rsp, + u32 length, + u16 vlan_id) +{ + struct net_device *ndev = qdev->ndev; + struct sk_buff *skb = NULL; + struct sk_buff *new_skb = NULL; + struct bq_desc *sbq_desc = ql_get_curr_sbuf(rx_ring); + + skb = sbq_desc->p.skb; + /* Allocate new_skb and copy */ + new_skb = netdev_alloc_skb(qdev->ndev, length + NET_IP_ALIGN); + if (new_skb == NULL) { + rx_ring->rx_dropped++; + return; + } + skb_reserve(new_skb, NET_IP_ALIGN); + + pci_dma_sync_single_for_cpu(qdev->pdev, + dma_unmap_addr(sbq_desc, mapaddr), + dma_unmap_len(sbq_desc, maplen), + PCI_DMA_FROMDEVICE); + + skb_put_data(new_skb, skb->data, length); + + pci_dma_sync_single_for_device(qdev->pdev, + dma_unmap_addr(sbq_desc, mapaddr), + dma_unmap_len(sbq_desc, maplen), + PCI_DMA_FROMDEVICE); + skb = new_skb; + + /* Frame error, so drop the packet. */ + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) { + ql_categorize_rx_err(qdev, ib_mac_rsp->flags2, rx_ring); + dev_kfree_skb_any(skb); + return; + } + + /* loopback self test for ethtool */ + if (test_bit(QL_SELFTEST, &qdev->flags)) { + ql_check_lb_frame(qdev, skb); + dev_kfree_skb_any(skb); + return; + } + + /* The max framesize filter on this chip is set higher than + * MTU since FCoE uses 2k frames. 
+ */ + if (skb->len > ndev->mtu + ETH_HLEN) { + dev_kfree_skb_any(skb); + rx_ring->rx_dropped++; + return; + } + + prefetch(skb->data); + if (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "%s Multicast.\n", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_HASH ? "Hash" : + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_REG ? "Registered" : + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_PROM ? "Promiscuous" : ""); + } + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_P) + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "Promiscuous Packet.\n"); + + rx_ring->rx_packets++; + rx_ring->rx_bytes += skb->len; + skb->protocol = eth_type_trans(skb, ndev); + skb_checksum_none_assert(skb); + + /* If rx checksum is on, and there are no + * csum or frame errors. + */ + if ((ndev->features & NETIF_F_RXCSUM) && + !(ib_mac_rsp->flags1 & IB_MAC_CSUM_ERR_MASK)) { + /* TCP frame. */ + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_T) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "TCP checksum done!\n"); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if ((ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_U) && + (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_V4)) { + /* Unfragmented ipv4 UDP frame. */ + struct iphdr *iph = (struct iphdr *) skb->data; + if (!(iph->frag_off & + htons(IP_MF|IP_OFFSET))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + netif_printk(qdev, rx_status, KERN_DEBUG, + qdev->ndev, + "UDP checksum done!\n"); + } + } + } + + skb_record_rx_queue(skb, rx_ring->cq_id); + if (vlan_id != 0xffff) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_id); + if (skb->ip_summed == CHECKSUM_UNNECESSARY) + napi_gro_receive(&rx_ring->napi, skb); + else + netif_receive_skb(skb); +} + +static void ql_realign_skb(struct sk_buff *skb, int len) +{ + void *temp_addr = skb->data; + + /* Undo the skb_reserve(skb,32) we did before + * giving to hardware, and realign data on + * a 2-byte boundary. + */ + skb->data -= QLGE_SB_PAD - NET_IP_ALIGN; + skb->tail -= QLGE_SB_PAD - NET_IP_ALIGN; + memmove(skb->data, temp_addr, len); +} + +/* + * This function builds an skb for the given inbound + * completion. It will be rewritten for readability in the near + * future, but for now it works well. + */ +static struct sk_buff *ql_build_rx_skb(struct ql_adapter *qdev, + struct rx_ring *rx_ring, + struct ib_mac_iocb_rsp *ib_mac_rsp) +{ + struct bq_desc *lbq_desc; + struct bq_desc *sbq_desc; + struct sk_buff *skb = NULL; + u32 length = le32_to_cpu(ib_mac_rsp->data_len); + u32 hdr_len = le32_to_cpu(ib_mac_rsp->hdr_len); + size_t hlen = ETH_HLEN; + + /* + * Handle the header buffer if present. + */ + if (ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HV && + ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HS) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "Header of %d bytes in small buffer.\n", hdr_len); + /* + * Headers fit nicely into a small buffer. + */ + sbq_desc = ql_get_curr_sbuf(rx_ring); + pci_unmap_single(qdev->pdev, + dma_unmap_addr(sbq_desc, mapaddr), + dma_unmap_len(sbq_desc, maplen), + PCI_DMA_FROMDEVICE); + skb = sbq_desc->p.skb; + ql_realign_skb(skb, hdr_len); + skb_put(skb, hdr_len); + sbq_desc->p.skb = NULL; + } + + /* + * Handle the data buffer(s). + */ + if (unlikely(!length)) { /* Is there data too?
*/ + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "No Data buffer in this packet.\n"); + return skb; + } + + if (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DS) { + if (ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HS) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "Headers in small, data of %d bytes in small, combine them.\n", + length); + /* + * Data is less than small buffer size so it's + * stuffed in a small buffer. + * For this case we append the data + * from the "data" small buffer to the "header" small + * buffer. + */ + sbq_desc = ql_get_curr_sbuf(rx_ring); + pci_dma_sync_single_for_cpu(qdev->pdev, + dma_unmap_addr + (sbq_desc, mapaddr), + dma_unmap_len + (sbq_desc, maplen), + PCI_DMA_FROMDEVICE); + skb_put_data(skb, sbq_desc->p.skb->data, length); + pci_dma_sync_single_for_device(qdev->pdev, + dma_unmap_addr + (sbq_desc, + mapaddr), + dma_unmap_len + (sbq_desc, + maplen), + PCI_DMA_FROMDEVICE); + } else { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "%d bytes in a single small buffer.\n", + length); + sbq_desc = ql_get_curr_sbuf(rx_ring); + skb = sbq_desc->p.skb; + ql_realign_skb(skb, length); + skb_put(skb, length); + pci_unmap_single(qdev->pdev, + dma_unmap_addr(sbq_desc, + mapaddr), + dma_unmap_len(sbq_desc, + maplen), + PCI_DMA_FROMDEVICE); + sbq_desc->p.skb = NULL; + } + } else if (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DL) { + if (ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HS) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "Header in small, %d bytes in large. Chain large to small!\n", + length); + /* + * The data is in a single large buffer. We + * chain it to the header buffer's skb and let + * it rip. + */ + lbq_desc = ql_get_curr_lchunk(qdev, rx_ring); + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "Chaining page at offset = %d, for %d bytes to skb.\n", + lbq_desc->p.pg_chunk.offset, length); + skb_fill_page_desc(skb, 0, lbq_desc->p.pg_chunk.page, + lbq_desc->p.pg_chunk.offset, + length); + skb->len += length; + skb->data_len += length; + skb->truesize += length; + } else { + /* + * The headers and data are in a single large buffer. We + * copy it to a new skb and let it go. This can happen with + * jumbo mtu on a non-TCP/UDP frame. + */ + lbq_desc = ql_get_curr_lchunk(qdev, rx_ring); + skb = netdev_alloc_skb(qdev->ndev, length); + if (skb == NULL) { + netif_printk(qdev, probe, KERN_DEBUG, qdev->ndev, + "No skb available, drop the packet.\n"); + return NULL; + } + pci_unmap_page(qdev->pdev, + dma_unmap_addr(lbq_desc, + mapaddr), + dma_unmap_len(lbq_desc, maplen), + PCI_DMA_FROMDEVICE); + skb_reserve(skb, NET_IP_ALIGN); + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "%d bytes of headers and data in large. Chain page to new skb and pull tail.\n", + length); + skb_fill_page_desc(skb, 0, + lbq_desc->p.pg_chunk.page, + lbq_desc->p.pg_chunk.offset, + length); + skb->len += length; + skb->data_len += length; + skb->truesize += length; + ql_update_mac_hdr_len(qdev, ib_mac_rsp, + lbq_desc->p.pg_chunk.va, + &hlen); + __pskb_pull_tail(skb, hlen); + } + } else { + /* + * The data is in a chain of large buffers + * pointed to by a small buffer. We loop + * thru and chain them to the our small header + * buffer's skb. + * frags: There are 18 max frags and our small + * buffer will hold 32 of them. The thing is, + * we'll use 3 max for our 9000 byte jumbo + * frames. If the MTU goes up we could + * eventually be in trouble. 
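+ * (The loop below consumes at most lbq_buf_size bytes per page chunk, so a + * frame needs roughly length / lbq_buf_size chunks.)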
+ */ + int size, i = 0; + sbq_desc = ql_get_curr_sbuf(rx_ring); + pci_unmap_single(qdev->pdev, + dma_unmap_addr(sbq_desc, mapaddr), + dma_unmap_len(sbq_desc, maplen), + PCI_DMA_FROMDEVICE); + if (!(ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HS)) { + /* + * This is an non TCP/UDP IP frame, so + * the headers aren't split into a small + * buffer. We have to use the small buffer + * that contains our sg list as our skb to + * send upstairs. Copy the sg list here to + * a local buffer and use it to find the + * pages to chain. + */ + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "%d bytes of headers & data in chain of large.\n", + length); + skb = sbq_desc->p.skb; + sbq_desc->p.skb = NULL; + skb_reserve(skb, NET_IP_ALIGN); + } + do { + lbq_desc = ql_get_curr_lchunk(qdev, rx_ring); + size = (length < rx_ring->lbq_buf_size) ? length : + rx_ring->lbq_buf_size; + + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "Adding page %d to skb for %d bytes.\n", + i, size); + skb_fill_page_desc(skb, i, + lbq_desc->p.pg_chunk.page, + lbq_desc->p.pg_chunk.offset, + size); + skb->len += size; + skb->data_len += size; + skb->truesize += size; + length -= size; + i++; + } while (length > 0); + ql_update_mac_hdr_len(qdev, ib_mac_rsp, lbq_desc->p.pg_chunk.va, + &hlen); + __pskb_pull_tail(skb, hlen); + } + return skb; +} + +/* Process an inbound completion from an rx ring. */ +static void ql_process_mac_split_rx_intr(struct ql_adapter *qdev, + struct rx_ring *rx_ring, + struct ib_mac_iocb_rsp *ib_mac_rsp, + u16 vlan_id) +{ + struct net_device *ndev = qdev->ndev; + struct sk_buff *skb = NULL; + + QL_DUMP_IB_MAC_RSP(ib_mac_rsp); + + skb = ql_build_rx_skb(qdev, rx_ring, ib_mac_rsp); + if (unlikely(!skb)) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "No skb available, drop packet.\n"); + rx_ring->rx_dropped++; + return; + } + + /* Frame error, so drop the packet. */ + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) { + ql_categorize_rx_err(qdev, ib_mac_rsp->flags2, rx_ring); + dev_kfree_skb_any(skb); + return; + } + + /* The max framesize filter on this chip is set higher than + * MTU since FCoE uses 2k frames. + */ + if (skb->len > ndev->mtu + ETH_HLEN) { + dev_kfree_skb_any(skb); + rx_ring->rx_dropped++; + return; + } + + /* loopback self test for ethtool */ + if (test_bit(QL_SELFTEST, &qdev->flags)) { + ql_check_lb_frame(qdev, skb); + dev_kfree_skb_any(skb); + return; + } + + prefetch(skb->data); + if (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, "%s Multicast.\n", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_HASH ? "Hash" : + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_REG ? "Registered" : + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_PROM ? "Promiscuous" : ""); + rx_ring->rx_multicast++; + } + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_P) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "Promiscuous Packet.\n"); + } + + skb->protocol = eth_type_trans(skb, ndev); + skb_checksum_none_assert(skb); + + /* If rx checksum is on, and there are no + * csum or frame errors. + */ + if ((ndev->features & NETIF_F_RXCSUM) && + !(ib_mac_rsp->flags1 & IB_MAC_CSUM_ERR_MASK)) { + /* TCP frame. 
*/ + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_T) { + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "TCP checksum done!\n"); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if ((ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_U) && + (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_V4)) { + /* Unfragmented ipv4 UDP frame. */ + struct iphdr *iph = (struct iphdr *) skb->data; + if (!(iph->frag_off & + htons(IP_MF|IP_OFFSET))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "UDP checksum done!\n"); + } + } + } + + rx_ring->rx_packets++; + rx_ring->rx_bytes += skb->len; + skb_record_rx_queue(skb, rx_ring->cq_id); + if (vlan_id != 0xffff) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_id); + if (skb->ip_summed == CHECKSUM_UNNECESSARY) + napi_gro_receive(&rx_ring->napi, skb); + else + netif_receive_skb(skb); +} + +/* Process an inbound completion from an rx ring. */ +static unsigned long ql_process_mac_rx_intr(struct ql_adapter *qdev, + struct rx_ring *rx_ring, + struct ib_mac_iocb_rsp *ib_mac_rsp) +{ + u32 length = le32_to_cpu(ib_mac_rsp->data_len); + u16 vlan_id = ((ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_V) && + (qdev->ndev->features & NETIF_F_HW_VLAN_CTAG_RX)) ? + ((le16_to_cpu(ib_mac_rsp->vlan_id) & + IB_MAC_IOCB_RSP_VLAN_MASK)) : 0xffff; + + QL_DUMP_IB_MAC_RSP(ib_mac_rsp); + + if (ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HV) { + /* The data and headers are split into + * separate buffers. + */ + ql_process_mac_split_rx_intr(qdev, rx_ring, ib_mac_rsp, + vlan_id); + } else if (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DS) { + /* The data fit in a single small buffer. + * Allocate a new skb, copy the data and + * return the buffer to the free pool. + */ + ql_process_mac_rx_skb(qdev, rx_ring, ib_mac_rsp, + length, vlan_id); + } else if ((ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DL) && + !(ib_mac_rsp->flags1 & IB_MAC_CSUM_ERR_MASK) && + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_T)) { + /* TCP packet in a page chunk that's been checksummed. + * Tack it on to our GRO skb and let it go. + */ + ql_process_mac_rx_gro_page(qdev, rx_ring, ib_mac_rsp, + length, vlan_id); + } else if (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DL) { + /* Non-TCP packet in a page chunk. Allocate an + * skb, tack it on frags, and send it up. + */ + ql_process_mac_rx_page(qdev, rx_ring, ib_mac_rsp, + length, vlan_id); + } else { + /* Non-TCP/UDP large frames that span multiple buffers + * can be processed correctly by the split frame logic. + */ + ql_process_mac_split_rx_intr(qdev, rx_ring, ib_mac_rsp, + vlan_id); + } + + return (unsigned long)length; +} + +/* Process an outbound completion from an rx ring.
*/ +static void ql_process_mac_tx_intr(struct ql_adapter *qdev, + struct ob_mac_iocb_rsp *mac_rsp) +{ + struct tx_ring *tx_ring; + struct tx_ring_desc *tx_ring_desc; + + QL_DUMP_OB_MAC_RSP(mac_rsp); + tx_ring = &qdev->tx_ring[mac_rsp->txq_idx]; + tx_ring_desc = &tx_ring->q[mac_rsp->tid]; + ql_unmap_send(qdev, tx_ring_desc, tx_ring_desc->map_cnt); + tx_ring->tx_bytes += (tx_ring_desc->skb)->len; + tx_ring->tx_packets++; + dev_kfree_skb(tx_ring_desc->skb); + tx_ring_desc->skb = NULL; + + if (unlikely(mac_rsp->flags1 & (OB_MAC_IOCB_RSP_E | + OB_MAC_IOCB_RSP_S | + OB_MAC_IOCB_RSP_L | + OB_MAC_IOCB_RSP_P | OB_MAC_IOCB_RSP_B))) { + if (mac_rsp->flags1 & OB_MAC_IOCB_RSP_E) { + netif_warn(qdev, tx_done, qdev->ndev, + "Total descriptor length did not match transfer length.\n"); + } + if (mac_rsp->flags1 & OB_MAC_IOCB_RSP_S) { + netif_warn(qdev, tx_done, qdev->ndev, + "Frame too short to be valid, not sent.\n"); + } + if (mac_rsp->flags1 & OB_MAC_IOCB_RSP_L) { + netif_warn(qdev, tx_done, qdev->ndev, + "Frame too long, but sent anyway.\n"); + } + if (mac_rsp->flags1 & OB_MAC_IOCB_RSP_B) { + netif_warn(qdev, tx_done, qdev->ndev, + "PCI backplane error. Frame not sent.\n"); + } + } + atomic_inc(&tx_ring->tx_count); +} + +/* Fire up a handler to reset the MPI processor. */ +void ql_queue_fw_error(struct ql_adapter *qdev) +{ + ql_link_off(qdev); + queue_delayed_work(qdev->workqueue, &qdev->mpi_reset_work, 0); +} + +void ql_queue_asic_error(struct ql_adapter *qdev) +{ + ql_link_off(qdev); + ql_disable_interrupts(qdev); + /* Clear adapter up bit to signal the recovery + * process that it shouldn't kill the reset worker + * thread + */ + clear_bit(QL_ADAPTER_UP, &qdev->flags); + /* Set asic recovery bit to indicate reset process that we are + * in fatal error recovery process rather than normal close + */ + set_bit(QL_ASIC_RECOVERY, &qdev->flags); + queue_delayed_work(qdev->workqueue, &qdev->asic_reset_work, 0); +} + +static void ql_process_chip_ae_intr(struct ql_adapter *qdev, + struct ib_ae_iocb_rsp *ib_ae_rsp) +{ + switch (ib_ae_rsp->event) { + case MGMT_ERR_EVENT: + netif_err(qdev, rx_err, qdev->ndev, + "Management Processor Fatal Error.\n"); + ql_queue_fw_error(qdev); + return; + + case CAM_LOOKUP_ERR_EVENT: + netdev_err(qdev->ndev, "Multiple CAM hits lookup occurred.\n"); + netdev_err(qdev->ndev, "This event shouldn't occur.\n"); + ql_queue_asic_error(qdev); + return; + + case SOFT_ECC_ERROR_EVENT: + netdev_err(qdev->ndev, "Soft ECC error detected.\n"); + ql_queue_asic_error(qdev); + break; + + case PCI_ERR_ANON_BUF_RD: + netdev_err(qdev->ndev, "PCI error occurred when reading " + "anonymous buffers from rx_ring %d.\n", + ib_ae_rsp->q_id); + ql_queue_asic_error(qdev); + break; + + default: + netif_err(qdev, drv, qdev->ndev, "Unexpected event %d.\n", + ib_ae_rsp->event); + ql_queue_asic_error(qdev); + break; + } +} + +static int ql_clean_outbound_rx_ring(struct rx_ring *rx_ring) +{ + struct ql_adapter *qdev = rx_ring->qdev; + u32 prod = ql_read_sh_reg(rx_ring->prod_idx_sh_reg); + struct ob_mac_iocb_rsp *net_rsp = NULL; + int count = 0; + + struct tx_ring *tx_ring; + /* While there are entries in the completion queue. 
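+ * The chip advances the shared producer index (prod_idx_sh_reg); we chase + * it with cnsmr_idx and acknowledge the whole batch with a single + * ql_write_cq_idx() doorbell write once the loop is done.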
*/ + while (prod != rx_ring->cnsmr_idx) { + + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "cq_id = %d, prod = %d, cnsmr = %d.\n.", + rx_ring->cq_id, prod, rx_ring->cnsmr_idx); + + net_rsp = (struct ob_mac_iocb_rsp *)rx_ring->curr_entry; + rmb(); + switch (net_rsp->opcode) { + + case OPCODE_OB_MAC_TSO_IOCB: + case OPCODE_OB_MAC_IOCB: + ql_process_mac_tx_intr(qdev, net_rsp); + break; + default: + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "Hit default case, not handled! dropping the packet, opcode = %x.\n", + net_rsp->opcode); + } + count++; + ql_update_cq(rx_ring); + prod = ql_read_sh_reg(rx_ring->prod_idx_sh_reg); + } + if (!net_rsp) + return 0; + ql_write_cq_idx(rx_ring); + tx_ring = &qdev->tx_ring[net_rsp->txq_idx]; + if (__netif_subqueue_stopped(qdev->ndev, tx_ring->wq_id)) { + if ((atomic_read(&tx_ring->tx_count) > (tx_ring->wq_len / 4))) + /* + * The queue got stopped because the tx_ring was full. + * Wake it up, because it's now at least 25% empty. + */ + netif_wake_subqueue(qdev->ndev, tx_ring->wq_id); + } + + return count; +} + +static int ql_clean_inbound_rx_ring(struct rx_ring *rx_ring, int budget) +{ + struct ql_adapter *qdev = rx_ring->qdev; + u32 prod = ql_read_sh_reg(rx_ring->prod_idx_sh_reg); + struct ql_net_rsp_iocb *net_rsp; + int count = 0; + + /* While there are entries in the completion queue. */ + while (prod != rx_ring->cnsmr_idx) { + + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "cq_id = %d, prod = %d, cnsmr = %d.\n.", + rx_ring->cq_id, prod, rx_ring->cnsmr_idx); + + net_rsp = rx_ring->curr_entry; + rmb(); + switch (net_rsp->opcode) { + case OPCODE_IB_MAC_IOCB: + ql_process_mac_rx_intr(qdev, rx_ring, + (struct ib_mac_iocb_rsp *) + net_rsp); + break; + + case OPCODE_IB_AE_IOCB: + ql_process_chip_ae_intr(qdev, (struct ib_ae_iocb_rsp *) + net_rsp); + break; + default: + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "Hit default case, not handled! dropping the packet, opcode = %x.\n", + net_rsp->opcode); + break; + } + count++; + ql_update_cq(rx_ring); + prod = ql_read_sh_reg(rx_ring->prod_idx_sh_reg); + if (count == budget) + break; + } + ql_update_buffer_queues(qdev, rx_ring); + ql_write_cq_idx(rx_ring); + return count; +} + +static int ql_napi_poll_msix(struct napi_struct *napi, int budget) +{ + struct rx_ring *rx_ring = container_of(napi, struct rx_ring, napi); + struct ql_adapter *qdev = rx_ring->qdev; + struct rx_ring *trx_ring; + int i, work_done = 0; + struct intr_context *ctx = &qdev->intr_context[rx_ring->cq_id]; + + netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev, + "Enter, NAPI POLL cq_id = %d.\n", rx_ring->cq_id); + + /* Service the TX rings first. They start + * right after the RSS rings. */ + for (i = qdev->rss_ring_count; i < qdev->rx_ring_count; i++) { + trx_ring = &qdev->rx_ring[i]; + /* If this TX completion ring belongs to this vector and + * it's not empty then service it. + */ + if ((ctx->irq_mask & (1 << trx_ring->cq_id)) && + (ql_read_sh_reg(trx_ring->prod_idx_sh_reg) != + trx_ring->cnsmr_idx)) { + netif_printk(qdev, intr, KERN_DEBUG, qdev->ndev, + "%s: Servicing TX completion ring %d.\n", + __func__, trx_ring->cq_id); + ql_clean_outbound_rx_ring(trx_ring); + } + } + + /* + * Now service the RSS ring if it's active. 
+ */ + if (ql_read_sh_reg(rx_ring->prod_idx_sh_reg) != + rx_ring->cnsmr_idx) { + netif_printk(qdev, intr, KERN_DEBUG, qdev->ndev, + "%s: Servicing RX completion ring %d.\n", + __func__, rx_ring->cq_id); + work_done = ql_clean_inbound_rx_ring(rx_ring, budget); + } + + if (work_done < budget) { + napi_complete_done(napi, work_done); + ql_enable_completion_interrupt(qdev, rx_ring->irq); + } + return work_done; +} + +static void qlge_vlan_mode(struct net_device *ndev, netdev_features_t features) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + if (features & NETIF_F_HW_VLAN_CTAG_RX) { + ql_write32(qdev, NIC_RCV_CFG, NIC_RCV_CFG_VLAN_MASK | + NIC_RCV_CFG_VLAN_MATCH_AND_NON); + } else { + ql_write32(qdev, NIC_RCV_CFG, NIC_RCV_CFG_VLAN_MASK); + } +} + +/** + * qlge_update_hw_vlan_features - helper routine to reinitialize the adapter + * based on the features to enable/disable hardware vlan accel + */ +static int qlge_update_hw_vlan_features(struct net_device *ndev, + netdev_features_t features) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + int status = 0; + bool need_restart = netif_running(ndev); + + if (need_restart) { + status = ql_adapter_down(qdev); + if (status) { + netif_err(qdev, link, qdev->ndev, + "Failed to bring down the adapter\n"); + return status; + } + } + + /* update the features with resent change */ + ndev->features = features; + + if (need_restart) { + status = ql_adapter_up(qdev); + if (status) { + netif_err(qdev, link, qdev->ndev, + "Failed to bring up the adapter\n"); + return status; + } + } + + return status; +} + +static netdev_features_t qlge_fix_features(struct net_device *ndev, + netdev_features_t features) +{ + int err; + + /* Update the behavior of vlan accel in the adapter */ + err = qlge_update_hw_vlan_features(ndev, features); + if (err) + return err; + + return features; +} + +static int qlge_set_features(struct net_device *ndev, + netdev_features_t features) +{ + netdev_features_t changed = ndev->features ^ features; + + if (changed & NETIF_F_HW_VLAN_CTAG_RX) + qlge_vlan_mode(ndev, features); + + return 0; +} + +static int __qlge_vlan_rx_add_vid(struct ql_adapter *qdev, u16 vid) +{ + u32 enable_bit = MAC_ADDR_E; + int err; + + err = ql_set_mac_addr_reg(qdev, (u8 *) &enable_bit, + MAC_ADDR_TYPE_VLAN, vid); + if (err) + netif_err(qdev, ifup, qdev->ndev, + "Failed to init vlan address.\n"); + return err; +} + +static int qlge_vlan_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + int status; + int err; + + status = ql_sem_spinlock(qdev, SEM_MAC_ADDR_MASK); + if (status) + return status; + + err = __qlge_vlan_rx_add_vid(qdev, vid); + set_bit(vid, qdev->active_vlans); + + ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK); + + return err; +} + +static int __qlge_vlan_rx_kill_vid(struct ql_adapter *qdev, u16 vid) +{ + u32 enable_bit = 0; + int err; + + err = ql_set_mac_addr_reg(qdev, (u8 *) &enable_bit, + MAC_ADDR_TYPE_VLAN, vid); + if (err) + netif_err(qdev, ifup, qdev->ndev, + "Failed to clear vlan address.\n"); + return err; +} + +static int qlge_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + int status; + int err; + + status = ql_sem_spinlock(qdev, SEM_MAC_ADDR_MASK); + if (status) + return status; + + err = __qlge_vlan_rx_kill_vid(qdev, vid); + clear_bit(vid, qdev->active_vlans); + + ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK); + + return err; +} + +static void qlge_restore_vlan(struct ql_adapter *qdev) +{ + int status; + u16 vid; + + 
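/* Re-program the hardware filter for every VLAN ID tracked in + * qdev->active_vlans, holding the MAC address register semaphore + * across the writes. + */ +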
status = ql_sem_spinlock(qdev, SEM_MAC_ADDR_MASK); + if (status) + return; + + for_each_set_bit(vid, qdev->active_vlans, VLAN_N_VID) + __qlge_vlan_rx_add_vid(qdev, vid); + + ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK); +} + +/* MSI-X Multiple Vector Interrupt Handler for inbound completions. */ +static irqreturn_t qlge_msix_rx_isr(int irq, void *dev_id) +{ + struct rx_ring *rx_ring = dev_id; + napi_schedule(&rx_ring->napi); + return IRQ_HANDLED; +} + +/* This handles a fatal error, MPI activity, and the default + * rx_ring in an MSI-X multiple vector environment. + * In MSI/Legacy environment it also process the rest of + * the rx_rings. + */ +static irqreturn_t qlge_isr(int irq, void *dev_id) +{ + struct rx_ring *rx_ring = dev_id; + struct ql_adapter *qdev = rx_ring->qdev; + struct intr_context *intr_context = &qdev->intr_context[0]; + u32 var; + int work_done = 0; + + spin_lock(&qdev->hw_lock); + if (atomic_read(&qdev->intr_context[0].irq_cnt)) { + netif_printk(qdev, intr, KERN_DEBUG, qdev->ndev, + "Shared Interrupt, Not ours!\n"); + spin_unlock(&qdev->hw_lock); + return IRQ_NONE; + } + spin_unlock(&qdev->hw_lock); + + var = ql_disable_completion_interrupt(qdev, intr_context->intr); + + /* + * Check for fatal error. + */ + if (var & STS_FE) { + ql_queue_asic_error(qdev); + netdev_err(qdev->ndev, "Got fatal error, STS = %x.\n", var); + var = ql_read32(qdev, ERR_STS); + netdev_err(qdev->ndev, "Resetting chip. " + "Error Status Register = 0x%x\n", var); + return IRQ_HANDLED; + } + + /* + * Check MPI processor activity. + */ + if ((var & STS_PI) && + (ql_read32(qdev, INTR_MASK) & INTR_MASK_PI)) { + /* + * We've got an async event or mailbox completion. + * Handle it and clear the source of the interrupt. + */ + netif_err(qdev, intr, qdev->ndev, + "Got MPI processor interrupt.\n"); + ql_disable_completion_interrupt(qdev, intr_context->intr); + ql_write32(qdev, INTR_MASK, (INTR_MASK_PI << 16)); + queue_delayed_work_on(smp_processor_id(), + qdev->workqueue, &qdev->mpi_work, 0); + work_done++; + } + + /* + * Get the bit-mask that shows the active queues for this + * pass. Compare it to the queues that this irq services + * and call napi if there's a match. + */ + var = ql_read32(qdev, ISR1); + if (var & intr_context->irq_mask) { + netif_info(qdev, intr, qdev->ndev, + "Waking handler for rx_ring[0].\n"); + ql_disable_completion_interrupt(qdev, intr_context->intr); + napi_schedule(&rx_ring->napi); + work_done++; + } + ql_enable_completion_interrupt(qdev, intr_context->intr); + return work_done ? 
IRQ_HANDLED : IRQ_NONE; +} + +static int ql_tso(struct sk_buff *skb, struct ob_mac_tso_iocb_req *mac_iocb_ptr) +{ + + if (skb_is_gso(skb)) { + int err; + __be16 l3_proto = vlan_get_protocol(skb); + + err = skb_cow_head(skb, 0); + if (err < 0) + return err; + + mac_iocb_ptr->opcode = OPCODE_OB_MAC_TSO_IOCB; + mac_iocb_ptr->flags3 |= OB_MAC_TSO_IOCB_IC; + mac_iocb_ptr->frame_len = cpu_to_le32((u32) skb->len); + mac_iocb_ptr->total_hdrs_len = + cpu_to_le16(skb_transport_offset(skb) + tcp_hdrlen(skb)); + mac_iocb_ptr->net_trans_offset = + cpu_to_le16(skb_network_offset(skb) | + skb_transport_offset(skb) + << OB_MAC_TRANSPORT_HDR_SHIFT); + mac_iocb_ptr->mss = cpu_to_le16(skb_shinfo(skb)->gso_size); + mac_iocb_ptr->flags2 |= OB_MAC_TSO_IOCB_LSO; + if (likely(l3_proto == htons(ETH_P_IP))) { + struct iphdr *iph = ip_hdr(skb); + iph->check = 0; + mac_iocb_ptr->flags1 |= OB_MAC_TSO_IOCB_IP4; + tcp_hdr(skb)->check = ~csum_tcpudp_magic(iph->saddr, + iph->daddr, 0, + IPPROTO_TCP, + 0); + } else if (l3_proto == htons(ETH_P_IPV6)) { + mac_iocb_ptr->flags1 |= OB_MAC_TSO_IOCB_IP6; + tcp_hdr(skb)->check = + ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + 0, IPPROTO_TCP, 0); + } + return 1; + } + return 0; +} + +static void ql_hw_csum_setup(struct sk_buff *skb, + struct ob_mac_tso_iocb_req *mac_iocb_ptr) +{ + int len; + struct iphdr *iph = ip_hdr(skb); + __sum16 *check; + mac_iocb_ptr->opcode = OPCODE_OB_MAC_TSO_IOCB; + mac_iocb_ptr->frame_len = cpu_to_le32((u32) skb->len); + mac_iocb_ptr->net_trans_offset = + cpu_to_le16(skb_network_offset(skb) | + skb_transport_offset(skb) << OB_MAC_TRANSPORT_HDR_SHIFT); + + mac_iocb_ptr->flags1 |= OB_MAC_TSO_IOCB_IP4; + len = (ntohs(iph->tot_len) - (iph->ihl << 2)); + if (likely(iph->protocol == IPPROTO_TCP)) { + check = &(tcp_hdr(skb)->check); + mac_iocb_ptr->flags2 |= OB_MAC_TSO_IOCB_TC; + mac_iocb_ptr->total_hdrs_len = + cpu_to_le16(skb_transport_offset(skb) + + (tcp_hdr(skb)->doff << 2)); + } else { + check = &(udp_hdr(skb)->check); + mac_iocb_ptr->flags2 |= OB_MAC_TSO_IOCB_UC; + mac_iocb_ptr->total_hdrs_len = + cpu_to_le16(skb_transport_offset(skb) + + sizeof(struct udphdr)); + } + *check = ~csum_tcpudp_magic(iph->saddr, + iph->daddr, len, iph->protocol, 0); +} + +static netdev_tx_t qlge_send(struct sk_buff *skb, struct net_device *ndev) +{ + struct tx_ring_desc *tx_ring_desc; + struct ob_mac_iocb_req *mac_iocb_ptr; + struct ql_adapter *qdev = netdev_priv(ndev); + int tso; + struct tx_ring *tx_ring; + u32 tx_ring_idx = (u32) skb->queue_mapping; + + tx_ring = &qdev->tx_ring[tx_ring_idx]; + + if (skb_padto(skb, ETH_ZLEN)) + return NETDEV_TX_OK; + + if (unlikely(atomic_read(&tx_ring->tx_count) < 2)) { + netif_info(qdev, tx_queued, qdev->ndev, + "%s: BUG! shutting down tx queue %d due to lack of resources.\n", + __func__, tx_ring_idx); + netif_stop_subqueue(ndev, tx_ring->wq_id); + tx_ring->tx_errors++; + return NETDEV_TX_BUSY; + } + tx_ring_desc = &tx_ring->q[tx_ring->prod_idx]; + mac_iocb_ptr = tx_ring_desc->queue_entry; + memset((void *)mac_iocb_ptr, 0, sizeof(*mac_iocb_ptr)); + + mac_iocb_ptr->opcode = OPCODE_OB_MAC_IOCB; + mac_iocb_ptr->tid = tx_ring_desc->index; + /* We use the upper 32-bits to store the tx queue for this IO. + * When we get the completion we can use it to establish the context. 
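+ * (ql_process_mac_tx_intr() reads txq_idx back out of the completion to + * find the right tx_ring before unmapping and freeing the skb.)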
+ */ + mac_iocb_ptr->txq_idx = tx_ring_idx; + tx_ring_desc->skb = skb; + + mac_iocb_ptr->frame_len = cpu_to_le16((u16) skb->len); + + if (skb_vlan_tag_present(skb)) { + netif_printk(qdev, tx_queued, KERN_DEBUG, qdev->ndev, + "Adding a vlan tag %d.\n", skb_vlan_tag_get(skb)); + mac_iocb_ptr->flags3 |= OB_MAC_IOCB_V; + mac_iocb_ptr->vlan_tci = cpu_to_le16(skb_vlan_tag_get(skb)); + } + tso = ql_tso(skb, (struct ob_mac_tso_iocb_req *)mac_iocb_ptr); + if (tso < 0) { + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } else if (unlikely(!tso) && (skb->ip_summed == CHECKSUM_PARTIAL)) { + ql_hw_csum_setup(skb, + (struct ob_mac_tso_iocb_req *)mac_iocb_ptr); + } + if (ql_map_send(qdev, mac_iocb_ptr, skb, tx_ring_desc) != + NETDEV_TX_OK) { + netif_err(qdev, tx_queued, qdev->ndev, + "Could not map the segments.\n"); + tx_ring->tx_errors++; + return NETDEV_TX_BUSY; + } + QL_DUMP_OB_MAC_IOCB(mac_iocb_ptr); + tx_ring->prod_idx++; + if (tx_ring->prod_idx == tx_ring->wq_len) + tx_ring->prod_idx = 0; + wmb(); + + ql_write_db_reg_relaxed(tx_ring->prod_idx, tx_ring->prod_idx_db_reg); + mmiowb(); + netif_printk(qdev, tx_queued, KERN_DEBUG, qdev->ndev, + "tx queued, slot %d, len %d\n", + tx_ring->prod_idx, skb->len); + + atomic_dec(&tx_ring->tx_count); + + if (unlikely(atomic_read(&tx_ring->tx_count) < 2)) { + netif_stop_subqueue(ndev, tx_ring->wq_id); + if ((atomic_read(&tx_ring->tx_count) > (tx_ring->wq_len / 4))) + /* + * The queue got stopped because the tx_ring was full. + * Wake it up, because it's now at least 25% empty. + */ + netif_wake_subqueue(qdev->ndev, tx_ring->wq_id); + } + return NETDEV_TX_OK; +} + + +static void ql_free_shadow_space(struct ql_adapter *qdev) +{ + if (qdev->rx_ring_shadow_reg_area) { + pci_free_consistent(qdev->pdev, + PAGE_SIZE, + qdev->rx_ring_shadow_reg_area, + qdev->rx_ring_shadow_reg_dma); + qdev->rx_ring_shadow_reg_area = NULL; + } + if (qdev->tx_ring_shadow_reg_area) { + pci_free_consistent(qdev->pdev, + PAGE_SIZE, + qdev->tx_ring_shadow_reg_area, + qdev->tx_ring_shadow_reg_dma); + qdev->tx_ring_shadow_reg_area = NULL; + } +} + +static int ql_alloc_shadow_space(struct ql_adapter *qdev) +{ + qdev->rx_ring_shadow_reg_area = + pci_zalloc_consistent(qdev->pdev, PAGE_SIZE, + &qdev->rx_ring_shadow_reg_dma); + if (qdev->rx_ring_shadow_reg_area == NULL) { + netif_err(qdev, ifup, qdev->ndev, + "Allocation of RX shadow space failed.\n"); + return -ENOMEM; + } + + qdev->tx_ring_shadow_reg_area = + pci_zalloc_consistent(qdev->pdev, PAGE_SIZE, + &qdev->tx_ring_shadow_reg_dma); + if (qdev->tx_ring_shadow_reg_area == NULL) { + netif_err(qdev, ifup, qdev->ndev, + "Allocation of TX shadow space failed.\n"); + goto err_wqp_sh_area; + } + return 0; + +err_wqp_sh_area: + pci_free_consistent(qdev->pdev, + PAGE_SIZE, + qdev->rx_ring_shadow_reg_area, + qdev->rx_ring_shadow_reg_dma); + return -ENOMEM; +} + +static void ql_init_tx_ring(struct ql_adapter *qdev, struct tx_ring *tx_ring) +{ + struct tx_ring_desc *tx_ring_desc; + int i; + struct ob_mac_iocb_req *mac_iocb_ptr; + + mac_iocb_ptr = tx_ring->wq_base; + tx_ring_desc = tx_ring->q; + for (i = 0; i < tx_ring->wq_len; i++) { + tx_ring_desc->index = i; + tx_ring_desc->skb = NULL; + tx_ring_desc->queue_entry = mac_iocb_ptr; + mac_iocb_ptr++; + tx_ring_desc++; + } + atomic_set(&tx_ring->tx_count, tx_ring->wq_len); +} + +static void ql_free_tx_resources(struct ql_adapter *qdev, + struct tx_ring *tx_ring) +{ + if (tx_ring->wq_base) { + pci_free_consistent(qdev->pdev, tx_ring->wq_size, + tx_ring->wq_base, tx_ring->wq_base_dma); + tx_ring->wq_base = 
NULL; + } + kfree(tx_ring->q); + tx_ring->q = NULL; +} + +static int ql_alloc_tx_resources(struct ql_adapter *qdev, + struct tx_ring *tx_ring) +{ + tx_ring->wq_base = + pci_alloc_consistent(qdev->pdev, tx_ring->wq_size, + &tx_ring->wq_base_dma); + + if ((tx_ring->wq_base == NULL) || + tx_ring->wq_base_dma & WQ_ADDR_ALIGN) + goto pci_alloc_err; + + tx_ring->q = + kmalloc(tx_ring->wq_len * sizeof(struct tx_ring_desc), GFP_KERNEL); + if (tx_ring->q == NULL) + goto err; + + return 0; +err: + pci_free_consistent(qdev->pdev, tx_ring->wq_size, + tx_ring->wq_base, tx_ring->wq_base_dma); + tx_ring->wq_base = NULL; +pci_alloc_err: + netif_err(qdev, ifup, qdev->ndev, "tx_ring alloc failed.\n"); + return -ENOMEM; +} + +static void ql_free_lbq_buffers(struct ql_adapter *qdev, struct rx_ring *rx_ring) +{ + struct bq_desc *lbq_desc; + + uint32_t curr_idx, clean_idx; + + curr_idx = rx_ring->lbq_curr_idx; + clean_idx = rx_ring->lbq_clean_idx; + while (curr_idx != clean_idx) { + lbq_desc = &rx_ring->lbq[curr_idx]; + + if (lbq_desc->p.pg_chunk.last_flag) { + pci_unmap_page(qdev->pdev, + lbq_desc->p.pg_chunk.map, + ql_lbq_block_size(qdev), + PCI_DMA_FROMDEVICE); + lbq_desc->p.pg_chunk.last_flag = 0; + } + + put_page(lbq_desc->p.pg_chunk.page); + lbq_desc->p.pg_chunk.page = NULL; + + if (++curr_idx == rx_ring->lbq_len) + curr_idx = 0; + + } + if (rx_ring->pg_chunk.page) { + pci_unmap_page(qdev->pdev, rx_ring->pg_chunk.map, + ql_lbq_block_size(qdev), PCI_DMA_FROMDEVICE); + put_page(rx_ring->pg_chunk.page); + rx_ring->pg_chunk.page = NULL; + } +} + +static void ql_free_sbq_buffers(struct ql_adapter *qdev, struct rx_ring *rx_ring) +{ + int i; + struct bq_desc *sbq_desc; + + for (i = 0; i < rx_ring->sbq_len; i++) { + sbq_desc = &rx_ring->sbq[i]; + if (sbq_desc == NULL) { + netif_err(qdev, ifup, qdev->ndev, + "sbq_desc %d is NULL.\n", i); + return; + } + if (sbq_desc->p.skb) { + pci_unmap_single(qdev->pdev, + dma_unmap_addr(sbq_desc, mapaddr), + dma_unmap_len(sbq_desc, maplen), + PCI_DMA_FROMDEVICE); + dev_kfree_skb(sbq_desc->p.skb); + sbq_desc->p.skb = NULL; + } + } +} + +/* Free all large and small rx buffers associated + * with the completion queues for this device. 
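+ * ql_free_lbq_buffers() above also releases the partially consumed master + * pg_chunk, if one is still attached to the ring, so no page reference is + * leaked.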
+ */ +static void ql_free_rx_buffers(struct ql_adapter *qdev) +{ + int i; + struct rx_ring *rx_ring; + + for (i = 0; i < qdev->rx_ring_count; i++) { + rx_ring = &qdev->rx_ring[i]; + if (rx_ring->lbq) + ql_free_lbq_buffers(qdev, rx_ring); + if (rx_ring->sbq) + ql_free_sbq_buffers(qdev, rx_ring); + } +} + +static void ql_alloc_rx_buffers(struct ql_adapter *qdev) +{ + struct rx_ring *rx_ring; + int i; + + for (i = 0; i < qdev->rx_ring_count; i++) { + rx_ring = &qdev->rx_ring[i]; + if (rx_ring->type != TX_Q) + ql_update_buffer_queues(qdev, rx_ring); + } +} + +static void ql_init_lbq_ring(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + int i; + struct bq_desc *lbq_desc; + __le64 *bq = rx_ring->lbq_base; + + memset(rx_ring->lbq, 0, rx_ring->lbq_len * sizeof(struct bq_desc)); + for (i = 0; i < rx_ring->lbq_len; i++) { + lbq_desc = &rx_ring->lbq[i]; + memset(lbq_desc, 0, sizeof(*lbq_desc)); + lbq_desc->index = i; + lbq_desc->addr = bq; + bq++; + } +} + +static void ql_init_sbq_ring(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + int i; + struct bq_desc *sbq_desc; + __le64 *bq = rx_ring->sbq_base; + + memset(rx_ring->sbq, 0, rx_ring->sbq_len * sizeof(struct bq_desc)); + for (i = 0; i < rx_ring->sbq_len; i++) { + sbq_desc = &rx_ring->sbq[i]; + memset(sbq_desc, 0, sizeof(*sbq_desc)); + sbq_desc->index = i; + sbq_desc->addr = bq; + bq++; + } +} + +static void ql_free_rx_resources(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + /* Free the small buffer queue. */ + if (rx_ring->sbq_base) { + pci_free_consistent(qdev->pdev, + rx_ring->sbq_size, + rx_ring->sbq_base, rx_ring->sbq_base_dma); + rx_ring->sbq_base = NULL; + } + + /* Free the small buffer queue control blocks. */ + kfree(rx_ring->sbq); + rx_ring->sbq = NULL; + + /* Free the large buffer queue. */ + if (rx_ring->lbq_base) { + pci_free_consistent(qdev->pdev, + rx_ring->lbq_size, + rx_ring->lbq_base, rx_ring->lbq_base_dma); + rx_ring->lbq_base = NULL; + } + + /* Free the large buffer queue control blocks. */ + kfree(rx_ring->lbq); + rx_ring->lbq = NULL; + + /* Free the rx queue. */ + if (rx_ring->cq_base) { + pci_free_consistent(qdev->pdev, + rx_ring->cq_size, + rx_ring->cq_base, rx_ring->cq_base_dma); + rx_ring->cq_base = NULL; + } +} + +/* Allocate queues and buffers for this completions queue based + * on the values in the parameter structure. */ +static int ql_alloc_rx_resources(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + + /* + * Allocate the completion queue for this rx_ring. + */ + rx_ring->cq_base = + pci_alloc_consistent(qdev->pdev, rx_ring->cq_size, + &rx_ring->cq_base_dma); + + if (rx_ring->cq_base == NULL) { + netif_err(qdev, ifup, qdev->ndev, "rx_ring alloc failed.\n"); + return -ENOMEM; + } + + if (rx_ring->sbq_len) { + /* + * Allocate small buffer queue. + */ + rx_ring->sbq_base = + pci_alloc_consistent(qdev->pdev, rx_ring->sbq_size, + &rx_ring->sbq_base_dma); + + if (rx_ring->sbq_base == NULL) { + netif_err(qdev, ifup, qdev->ndev, + "Small buffer queue allocation failed.\n"); + goto err_mem; + } + + /* + * Allocate small buffer queue control blocks. + */ + rx_ring->sbq = kmalloc_array(rx_ring->sbq_len, + sizeof(struct bq_desc), + GFP_KERNEL); + if (rx_ring->sbq == NULL) + goto err_mem; + + ql_init_sbq_ring(qdev, rx_ring); + } + + if (rx_ring->lbq_len) { + /* + * Allocate large buffer queue. 
+ */ + rx_ring->lbq_base = + pci_alloc_consistent(qdev->pdev, rx_ring->lbq_size, + &rx_ring->lbq_base_dma); + + if (rx_ring->lbq_base == NULL) { + netif_err(qdev, ifup, qdev->ndev, + "Large buffer queue allocation failed.\n"); + goto err_mem; + } + /* + * Allocate large buffer queue control blocks. + */ + rx_ring->lbq = kmalloc_array(rx_ring->lbq_len, + sizeof(struct bq_desc), + GFP_KERNEL); + if (rx_ring->lbq == NULL) + goto err_mem; + + ql_init_lbq_ring(qdev, rx_ring); + } + + return 0; + +err_mem: + ql_free_rx_resources(qdev, rx_ring); + return -ENOMEM; +} + +static void ql_tx_ring_clean(struct ql_adapter *qdev) +{ + struct tx_ring *tx_ring; + struct tx_ring_desc *tx_ring_desc; + int i, j; + + /* + * Loop through all queues and free + * any resources. + */ + for (j = 0; j < qdev->tx_ring_count; j++) { + tx_ring = &qdev->tx_ring[j]; + for (i = 0; i < tx_ring->wq_len; i++) { + tx_ring_desc = &tx_ring->q[i]; + if (tx_ring_desc && tx_ring_desc->skb) { + netif_err(qdev, ifdown, qdev->ndev, + "Freeing lost SKB %p, from queue %d, index %d.\n", + tx_ring_desc->skb, j, + tx_ring_desc->index); + ql_unmap_send(qdev, tx_ring_desc, + tx_ring_desc->map_cnt); + dev_kfree_skb(tx_ring_desc->skb); + tx_ring_desc->skb = NULL; + } + } + } +} + +static void ql_free_mem_resources(struct ql_adapter *qdev) +{ + int i; + + for (i = 0; i < qdev->tx_ring_count; i++) + ql_free_tx_resources(qdev, &qdev->tx_ring[i]); + for (i = 0; i < qdev->rx_ring_count; i++) + ql_free_rx_resources(qdev, &qdev->rx_ring[i]); + ql_free_shadow_space(qdev); +} + +static int ql_alloc_mem_resources(struct ql_adapter *qdev) +{ + int i; + + /* Allocate space for our shadow registers and such. */ + if (ql_alloc_shadow_space(qdev)) + return -ENOMEM; + + for (i = 0; i < qdev->rx_ring_count; i++) { + if (ql_alloc_rx_resources(qdev, &qdev->rx_ring[i]) != 0) { + netif_err(qdev, ifup, qdev->ndev, + "RX resource allocation failed.\n"); + goto err_mem; + } + } + /* Allocate tx queue resources */ + for (i = 0; i < qdev->tx_ring_count; i++) { + if (ql_alloc_tx_resources(qdev, &qdev->tx_ring[i]) != 0) { + netif_err(qdev, ifup, qdev->ndev, + "TX resource allocation failed.\n"); + goto err_mem; + } + } + return 0; + +err_mem: + ql_free_mem_resources(qdev); + return -ENOMEM; +} + +/* Set up the rx ring control block and pass it to the chip. + * The control block is defined as + * "Completion Queue Initialization Control Block", or cqicb. + */ +static int ql_start_rx_ring(struct ql_adapter *qdev, struct rx_ring *rx_ring) +{ + struct cqicb *cqicb = &rx_ring->cqicb; + void *shadow_reg = qdev->rx_ring_shadow_reg_area + + (rx_ring->cq_id * RX_RING_SHADOW_SPACE); + u64 shadow_reg_dma = qdev->rx_ring_shadow_reg_dma + + (rx_ring->cq_id * RX_RING_SHADOW_SPACE); + void __iomem *doorbell_area = + qdev->doorbell_area + (DB_PAGE_SIZE * (128 + rx_ring->cq_id)); + int err = 0; + u16 bq_len; + u64 tmp; + __le64 *base_indirect_ptr; + int page_entries; + + /* Set up the shadow registers for this ring. 
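+ * The first 8 bytes of the per-ring shadow area receive the completion + * queue producer index (the chip writes it back via cqicb->prod_idx_addr); + * the lbq and sbq indirect page-address tables follow it in the same area.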
*/ + rx_ring->prod_idx_sh_reg = shadow_reg; + rx_ring->prod_idx_sh_reg_dma = shadow_reg_dma; + *rx_ring->prod_idx_sh_reg = 0; + shadow_reg += sizeof(u64); + shadow_reg_dma += sizeof(u64); + rx_ring->lbq_base_indirect = shadow_reg; + rx_ring->lbq_base_indirect_dma = shadow_reg_dma; + shadow_reg += (sizeof(u64) * MAX_DB_PAGES_PER_BQ(rx_ring->lbq_len)); + shadow_reg_dma += (sizeof(u64) * MAX_DB_PAGES_PER_BQ(rx_ring->lbq_len)); + rx_ring->sbq_base_indirect = shadow_reg; + rx_ring->sbq_base_indirect_dma = shadow_reg_dma; + + /* PCI doorbell mem area + 0x00 for consumer index register */ + rx_ring->cnsmr_idx_db_reg = (u32 __iomem *) doorbell_area; + rx_ring->cnsmr_idx = 0; + rx_ring->curr_entry = rx_ring->cq_base; + + /* PCI doorbell mem area + 0x04 for valid register */ + rx_ring->valid_db_reg = doorbell_area + 0x04; + + /* PCI doorbell mem area + 0x18 for large buffer consumer */ + rx_ring->lbq_prod_idx_db_reg = (u32 __iomem *) (doorbell_area + 0x18); + + /* PCI doorbell mem area + 0x1c */ + rx_ring->sbq_prod_idx_db_reg = (u32 __iomem *) (doorbell_area + 0x1c); + + memset((void *)cqicb, 0, sizeof(struct cqicb)); + cqicb->msix_vect = rx_ring->irq; + + bq_len = (rx_ring->cq_len == 65536) ? 0 : (u16) rx_ring->cq_len; + cqicb->len = cpu_to_le16(bq_len | LEN_V | LEN_CPP_CONT); + + cqicb->addr = cpu_to_le64(rx_ring->cq_base_dma); + + cqicb->prod_idx_addr = cpu_to_le64(rx_ring->prod_idx_sh_reg_dma); + + /* + * Set up the control block load flags. + */ + cqicb->flags = FLAGS_LC | /* Load queue base address */ + FLAGS_LV | /* Load MSI-X vector */ + FLAGS_LI; /* Load irq delay values */ + if (rx_ring->lbq_len) { + cqicb->flags |= FLAGS_LL; /* Load lbq values */ + tmp = (u64)rx_ring->lbq_base_dma; + base_indirect_ptr = rx_ring->lbq_base_indirect; + page_entries = 0; + do { + *base_indirect_ptr = cpu_to_le64(tmp); + tmp += DB_PAGE_SIZE; + base_indirect_ptr++; + page_entries++; + } while (page_entries < MAX_DB_PAGES_PER_BQ(rx_ring->lbq_len)); + cqicb->lbq_addr = + cpu_to_le64(rx_ring->lbq_base_indirect_dma); + bq_len = (rx_ring->lbq_buf_size == 65536) ? 0 : + (u16) rx_ring->lbq_buf_size; + cqicb->lbq_buf_size = cpu_to_le16(bq_len); + bq_len = (rx_ring->lbq_len == 65536) ? 0 : + (u16) rx_ring->lbq_len; + cqicb->lbq_len = cpu_to_le16(bq_len); + rx_ring->lbq_prod_idx = 0; + rx_ring->lbq_curr_idx = 0; + rx_ring->lbq_clean_idx = 0; + rx_ring->lbq_free_cnt = rx_ring->lbq_len; + } + if (rx_ring->sbq_len) { + cqicb->flags |= FLAGS_LS; /* Load sbq values */ + tmp = (u64)rx_ring->sbq_base_dma; + base_indirect_ptr = rx_ring->sbq_base_indirect; + page_entries = 0; + do { + *base_indirect_ptr = cpu_to_le64(tmp); + tmp += DB_PAGE_SIZE; + base_indirect_ptr++; + page_entries++; + } while (page_entries < MAX_DB_PAGES_PER_BQ(rx_ring->sbq_len)); + cqicb->sbq_addr = + cpu_to_le64(rx_ring->sbq_base_indirect_dma); + cqicb->sbq_buf_size = + cpu_to_le16((u16)(rx_ring->sbq_buf_size)); + bq_len = (rx_ring->sbq_len == 65536) ? 0 : + (u16) rx_ring->sbq_len; + cqicb->sbq_len = cpu_to_le16(bq_len); + rx_ring->sbq_prod_idx = 0; + rx_ring->sbq_curr_idx = 0; + rx_ring->sbq_clean_idx = 0; + rx_ring->sbq_free_cnt = rx_ring->sbq_len; + } + switch (rx_ring->type) { + case TX_Q: + cqicb->irq_delay = cpu_to_le16(qdev->tx_coalesce_usecs); + cqicb->pkt_delay = cpu_to_le16(qdev->tx_max_coalesced_frames); + break; + case RX_Q: + /* Inbound completion handling rx_rings run in + * separate NAPI contexts. 
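+ * Each RSS ring gets its own NAPI instance, registered below with the + * default 64-packet weight and polled by ql_napi_poll_msix().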
+ */ + netif_napi_add(qdev->ndev, &rx_ring->napi, ql_napi_poll_msix, + 64); + cqicb->irq_delay = cpu_to_le16(qdev->rx_coalesce_usecs); + cqicb->pkt_delay = cpu_to_le16(qdev->rx_max_coalesced_frames); + break; + default: + netif_printk(qdev, ifup, KERN_DEBUG, qdev->ndev, + "Invalid rx_ring->type = %d.\n", rx_ring->type); + } + err = ql_write_cfg(qdev, cqicb, sizeof(struct cqicb), + CFG_LCQ, rx_ring->cq_id); + if (err) { + netif_err(qdev, ifup, qdev->ndev, "Failed to load CQICB.\n"); + return err; + } + return err; +} + +static int ql_start_tx_ring(struct ql_adapter *qdev, struct tx_ring *tx_ring) +{ + struct wqicb *wqicb = (struct wqicb *)tx_ring; + void __iomem *doorbell_area = + qdev->doorbell_area + (DB_PAGE_SIZE * tx_ring->wq_id); + void *shadow_reg = qdev->tx_ring_shadow_reg_area + + (tx_ring->wq_id * sizeof(u64)); + u64 shadow_reg_dma = qdev->tx_ring_shadow_reg_dma + + (tx_ring->wq_id * sizeof(u64)); + int err = 0; + + /* + * Assign doorbell registers for this tx_ring. + */ + /* TX PCI doorbell mem area for tx producer index */ + tx_ring->prod_idx_db_reg = (u32 __iomem *) doorbell_area; + tx_ring->prod_idx = 0; + /* TX PCI doorbell mem area + 0x04 */ + tx_ring->valid_db_reg = doorbell_area + 0x04; + + /* + * Assign shadow registers for this tx_ring. + */ + tx_ring->cnsmr_idx_sh_reg = shadow_reg; + tx_ring->cnsmr_idx_sh_reg_dma = shadow_reg_dma; + + wqicb->len = cpu_to_le16(tx_ring->wq_len | Q_LEN_V | Q_LEN_CPP_CONT); + wqicb->flags = cpu_to_le16(Q_FLAGS_LC | + Q_FLAGS_LB | Q_FLAGS_LI | Q_FLAGS_LO); + wqicb->cq_id_rss = cpu_to_le16(tx_ring->cq_id); + wqicb->rid = 0; + wqicb->addr = cpu_to_le64(tx_ring->wq_base_dma); + + wqicb->cnsmr_idx_addr = cpu_to_le64(tx_ring->cnsmr_idx_sh_reg_dma); + + ql_init_tx_ring(qdev, tx_ring); + + err = ql_write_cfg(qdev, wqicb, sizeof(*wqicb), CFG_LRQ, + (u16) tx_ring->wq_id); + if (err) { + netif_err(qdev, ifup, qdev->ndev, "Failed to load tx_ring.\n"); + return err; + } + return err; +} + +static void ql_disable_msix(struct ql_adapter *qdev) +{ + if (test_bit(QL_MSIX_ENABLED, &qdev->flags)) { + pci_disable_msix(qdev->pdev); + clear_bit(QL_MSIX_ENABLED, &qdev->flags); + kfree(qdev->msi_x_entry); + qdev->msi_x_entry = NULL; + } else if (test_bit(QL_MSI_ENABLED, &qdev->flags)) { + pci_disable_msi(qdev->pdev); + clear_bit(QL_MSI_ENABLED, &qdev->flags); + } +} + +/* We start by trying to get the number of vectors + * stored in qdev->intr_count. If we don't get that + * many then we reduce the count and try again. + */ +static void ql_enable_msix(struct ql_adapter *qdev) +{ + int i, err; + + /* Get the MSIX vectors. */ + if (qlge_irq_type == MSIX_IRQ) { + /* Try to alloc space for the msix struct, + * if it fails then go to MSI/legacy. 
+ */ + qdev->msi_x_entry = kcalloc(qdev->intr_count, + sizeof(struct msix_entry), + GFP_KERNEL); + if (!qdev->msi_x_entry) { + qlge_irq_type = MSI_IRQ; + goto msi; + } + + for (i = 0; i < qdev->intr_count; i++) + qdev->msi_x_entry[i].entry = i; + + err = pci_enable_msix_range(qdev->pdev, qdev->msi_x_entry, + 1, qdev->intr_count); + if (err < 0) { + kfree(qdev->msi_x_entry); + qdev->msi_x_entry = NULL; + netif_warn(qdev, ifup, qdev->ndev, + "MSI-X Enable failed, trying MSI.\n"); + qlge_irq_type = MSI_IRQ; + } else { + qdev->intr_count = err; + set_bit(QL_MSIX_ENABLED, &qdev->flags); + netif_info(qdev, ifup, qdev->ndev, + "MSI-X Enabled, got %d vectors.\n", + qdev->intr_count); + return; + } + } +msi: + qdev->intr_count = 1; + if (qlge_irq_type == MSI_IRQ) { + if (!pci_enable_msi(qdev->pdev)) { + set_bit(QL_MSI_ENABLED, &qdev->flags); + netif_info(qdev, ifup, qdev->ndev, + "Running with MSI interrupts.\n"); + return; + } + } + qlge_irq_type = LEG_IRQ; + netif_printk(qdev, ifup, KERN_DEBUG, qdev->ndev, + "Running with legacy interrupts.\n"); +} + +/* Each vector services 1 RSS ring and and 1 or more + * TX completion rings. This function loops through + * the TX completion rings and assigns the vector that + * will service it. An example would be if there are + * 2 vectors (so 2 RSS rings) and 8 TX completion rings. + * This would mean that vector 0 would service RSS ring 0 + * and TX completion rings 0,1,2 and 3. Vector 1 would + * service RSS ring 1 and TX completion rings 4,5,6 and 7. + */ +static void ql_set_tx_vect(struct ql_adapter *qdev) +{ + int i, j, vect; + u32 tx_rings_per_vector = qdev->tx_ring_count / qdev->intr_count; + + if (likely(test_bit(QL_MSIX_ENABLED, &qdev->flags))) { + /* Assign irq vectors to TX rx_rings.*/ + for (vect = 0, j = 0, i = qdev->rss_ring_count; + i < qdev->rx_ring_count; i++) { + if (j == tx_rings_per_vector) { + vect++; + j = 0; + } + qdev->rx_ring[i].irq = vect; + j++; + } + } else { + /* For single vector all rings have an irq + * of zero. + */ + for (i = 0; i < qdev->rx_ring_count; i++) + qdev->rx_ring[i].irq = 0; + } +} + +/* Set the interrupt mask for this vector. Each vector + * will service 1 RSS ring and 1 or more TX completion + * rings. This function sets up a bit mask per vector + * that indicates which rings it services. + */ +static void ql_set_irq_mask(struct ql_adapter *qdev, struct intr_context *ctx) +{ + int j, vect = ctx->intr; + u32 tx_rings_per_vector = qdev->tx_ring_count / qdev->intr_count; + + if (likely(test_bit(QL_MSIX_ENABLED, &qdev->flags))) { + /* Add the RSS ring serviced by this vector + * to the mask. + */ + ctx->irq_mask = (1 << qdev->rx_ring[vect].cq_id); + /* Add the TX ring(s) serviced by this vector + * to the mask. */ + for (j = 0; j < tx_rings_per_vector; j++) { + ctx->irq_mask |= + (1 << qdev->rx_ring[qdev->rss_ring_count + + (vect * tx_rings_per_vector) + j].cq_id); + } + } else { + /* For single vector we just shift each queue's + * ID into the mask. + */ + for (j = 0; j < qdev->rx_ring_count; j++) + ctx->irq_mask |= (1 << qdev->rx_ring[j].cq_id); + } +} + +/* + * Here we build the intr_context structures based on + * our rx_ring count and intr vector count. + * The intr_context structure is used to hook each vector + * to possibly different handlers. 
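/*
 * Illustrative sketch, not from the patch itself: the vector assignment
 * and irq_mask construction above, reproduced for the example given in
 * the comment (2 vectors, 2 RSS rings, 8 TX completion rings).  Completion
 * queue IDs 0..rss_rings-1 belong to the RSS rings and the rest to the TX
 * completion rings.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const unsigned int vectors = 2, rss_rings = 2, tx_rings = 8;
        const unsigned int tx_per_vect = tx_rings / vectors;
        unsigned int vect, j;

        for (vect = 0; vect < vectors; vect++) {
                uint32_t mask = 1u << vect;     /* this vector's RSS cq_id */

                for (j = 0; j < tx_per_vect; j++)
                        mask |= 1u << (rss_rings + vect * tx_per_vect + j);

                printf("vector %u: irq_mask = %#x\n", vect, mask);
        }
        /* Prints 0x3d (cq 0,2,3,4,5) and 0x3c2 (cq 1,6,7,8,9). */
        return 0;
}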
+ */ +static void ql_resolve_queues_to_irqs(struct ql_adapter *qdev) +{ + int i = 0; + struct intr_context *intr_context = &qdev->intr_context[0]; + + if (likely(test_bit(QL_MSIX_ENABLED, &qdev->flags))) { + /* Each rx_ring has it's + * own intr_context since we have separate + * vectors for each queue. + */ + for (i = 0; i < qdev->intr_count; i++, intr_context++) { + qdev->rx_ring[i].irq = i; + intr_context->intr = i; + intr_context->qdev = qdev; + /* Set up this vector's bit-mask that indicates + * which queues it services. + */ + ql_set_irq_mask(qdev, intr_context); + /* + * We set up each vectors enable/disable/read bits so + * there's no bit/mask calculations in the critical path. + */ + intr_context->intr_en_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | + INTR_EN_TYPE_ENABLE | INTR_EN_IHD_MASK | INTR_EN_IHD + | i; + intr_context->intr_dis_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | + INTR_EN_TYPE_DISABLE | INTR_EN_IHD_MASK | + INTR_EN_IHD | i; + intr_context->intr_read_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | + INTR_EN_TYPE_READ | INTR_EN_IHD_MASK | INTR_EN_IHD | + i; + if (i == 0) { + /* The first vector/queue handles + * broadcast/multicast, fatal errors, + * and firmware events. This in addition + * to normal inbound NAPI processing. + */ + intr_context->handler = qlge_isr; + sprintf(intr_context->name, "%s-rx-%d", + qdev->ndev->name, i); + } else { + /* + * Inbound queues handle unicast frames only. + */ + intr_context->handler = qlge_msix_rx_isr; + sprintf(intr_context->name, "%s-rx-%d", + qdev->ndev->name, i); + } + } + } else { + /* + * All rx_rings use the same intr_context since + * there is only one vector. + */ + intr_context->intr = 0; + intr_context->qdev = qdev; + /* + * We set up each vectors enable/disable/read bits so + * there's no bit/mask calculations in the critical path. + */ + intr_context->intr_en_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | INTR_EN_TYPE_ENABLE; + intr_context->intr_dis_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | + INTR_EN_TYPE_DISABLE; + intr_context->intr_read_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | INTR_EN_TYPE_READ; + /* + * Single interrupt means one handler for all rings. + */ + intr_context->handler = qlge_isr; + sprintf(intr_context->name, "%s-single_irq", qdev->ndev->name); + /* Set up this vector's bit-mask that indicates + * which queues it services. In this case there is + * a single vector so it will service all RSS and + * TX completion rings. + */ + ql_set_irq_mask(qdev, intr_context); + } + /* Tell the TX completion rings which MSIx vector + * they will be using. 
+ */ + ql_set_tx_vect(qdev); +} + +static void ql_free_irq(struct ql_adapter *qdev) +{ + int i; + struct intr_context *intr_context = &qdev->intr_context[0]; + + for (i = 0; i < qdev->intr_count; i++, intr_context++) { + if (intr_context->hooked) { + if (test_bit(QL_MSIX_ENABLED, &qdev->flags)) { + free_irq(qdev->msi_x_entry[i].vector, + &qdev->rx_ring[i]); + } else { + free_irq(qdev->pdev->irq, &qdev->rx_ring[0]); + } + } + } + ql_disable_msix(qdev); +} + +static int ql_request_irq(struct ql_adapter *qdev) +{ + int i; + int status = 0; + struct pci_dev *pdev = qdev->pdev; + struct intr_context *intr_context = &qdev->intr_context[0]; + + ql_resolve_queues_to_irqs(qdev); + + for (i = 0; i < qdev->intr_count; i++, intr_context++) { + atomic_set(&intr_context->irq_cnt, 0); + if (test_bit(QL_MSIX_ENABLED, &qdev->flags)) { + status = request_irq(qdev->msi_x_entry[i].vector, + intr_context->handler, + 0, + intr_context->name, + &qdev->rx_ring[i]); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed request for MSIX interrupt %d.\n", + i); + goto err_irq; + } + } else { + netif_printk(qdev, ifup, KERN_DEBUG, qdev->ndev, + "trying msi or legacy interrupts.\n"); + netif_printk(qdev, ifup, KERN_DEBUG, qdev->ndev, + "%s: irq = %d.\n", __func__, pdev->irq); + netif_printk(qdev, ifup, KERN_DEBUG, qdev->ndev, + "%s: context->name = %s.\n", __func__, + intr_context->name); + netif_printk(qdev, ifup, KERN_DEBUG, qdev->ndev, + "%s: dev_id = 0x%p.\n", __func__, + &qdev->rx_ring[0]); + status = + request_irq(pdev->irq, qlge_isr, + test_bit(QL_MSI_ENABLED, + &qdev-> + flags) ? 0 : IRQF_SHARED, + intr_context->name, &qdev->rx_ring[0]); + if (status) + goto err_irq; + + netif_err(qdev, ifup, qdev->ndev, + "Hooked intr %d, queue type %s, with name %s.\n", + i, + qdev->rx_ring[0].type == DEFAULT_Q ? + "DEFAULT_Q" : + qdev->rx_ring[0].type == TX_Q ? "TX_Q" : + qdev->rx_ring[0].type == RX_Q ? "RX_Q" : "", + intr_context->name); + } + intr_context->hooked = 1; + } + return status; +err_irq: + netif_err(qdev, ifup, qdev->ndev, "Failed to get the interrupts!!!\n"); + ql_free_irq(qdev); + return status; +} + +static int ql_start_rss(struct ql_adapter *qdev) +{ + static const u8 init_hash_seed[] = { + 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa + }; + struct ricb *ricb = &qdev->ricb; + int status = 0; + int i; + u8 *hash_id = (u8 *) ricb->hash_cq_id; + + memset((void *)ricb, 0, sizeof(*ricb)); + + ricb->base_cq = RSS_L4K; + ricb->flags = + (RSS_L6K | RSS_LI | RSS_LB | RSS_LM | RSS_RT4 | RSS_RT6); + ricb->mask = cpu_to_le16((u16)(0x3ff)); + + /* + * Fill out the Indirection Table. + */ + for (i = 0; i < 1024; i++) + hash_id[i] = (i & (qdev->rss_ring_count - 1)); + + memcpy((void *)&ricb->ipv6_hash_key[0], init_hash_seed, 40); + memcpy((void *)&ricb->ipv4_hash_key[0], init_hash_seed, 16); + + status = ql_write_cfg(qdev, ricb, sizeof(*ricb), CFG_LR, 0); + if (status) { + netif_err(qdev, ifup, qdev->ndev, "Failed to load RICB.\n"); + return status; + } + return status; +} + +static int ql_clear_routing_entries(struct ql_adapter *qdev) +{ + int i, status = 0; + + status = ql_sem_spinlock(qdev, SEM_RT_IDX_MASK); + if (status) + return status; + /* Clear all the entries in the routing table. 
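/*
 * Illustrative sketch, not from the patch itself: ql_start_rss() above
 * spreads a 1024-entry indirection table across the inbound completion
 * queues with a bitwise AND, which relies on the RSS ring count being a
 * power of two.  A runnable rendering of that fill:
 */
#include <stdint.h>
#include <stdio.h>

#define EX_TABLE_SIZE 1024

int main(void)
{
        uint8_t table[EX_TABLE_SIZE];
        const unsigned int rss_ring_count = 4;  /* assumed, power of two */
        unsigned int counts[4] = { 0 };
        unsigned int i;

        for (i = 0; i < EX_TABLE_SIZE; i++) {
                table[i] = i & (rss_ring_count - 1);
                counts[table[i]]++;
        }

        for (i = 0; i < rss_ring_count; i++)
                printf("cq %u gets %u of %u slots\n", i, counts[i],
                       EX_TABLE_SIZE);
        return 0;
}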
*/ + for (i = 0; i < 16; i++) { + status = ql_set_routing_reg(qdev, i, 0, 0); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to init routing register for CAM packets.\n"); + break; + } + } + ql_sem_unlock(qdev, SEM_RT_IDX_MASK); + return status; +} + +/* Initialize the frame-to-queue routing. */ +static int ql_route_initialize(struct ql_adapter *qdev) +{ + int status = 0; + + /* Clear all the entries in the routing table. */ + status = ql_clear_routing_entries(qdev); + if (status) + return status; + + status = ql_sem_spinlock(qdev, SEM_RT_IDX_MASK); + if (status) + return status; + + status = ql_set_routing_reg(qdev, RT_IDX_IP_CSUM_ERR_SLOT, + RT_IDX_IP_CSUM_ERR, 1); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to init routing register " + "for IP CSUM error packets.\n"); + goto exit; + } + status = ql_set_routing_reg(qdev, RT_IDX_TCP_UDP_CSUM_ERR_SLOT, + RT_IDX_TU_CSUM_ERR, 1); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to init routing register " + "for TCP/UDP CSUM error packets.\n"); + goto exit; + } + status = ql_set_routing_reg(qdev, RT_IDX_BCAST_SLOT, RT_IDX_BCAST, 1); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to init routing register for broadcast packets.\n"); + goto exit; + } + /* If we have more than one inbound queue, then turn on RSS in the + * routing block. + */ + if (qdev->rss_ring_count > 1) { + status = ql_set_routing_reg(qdev, RT_IDX_RSS_MATCH_SLOT, + RT_IDX_RSS_MATCH, 1); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to init routing register for MATCH RSS packets.\n"); + goto exit; + } + } + + status = ql_set_routing_reg(qdev, RT_IDX_CAM_HIT_SLOT, + RT_IDX_CAM_HIT, 1); + if (status) + netif_err(qdev, ifup, qdev->ndev, + "Failed to init routing register for CAM packets.\n"); +exit: + ql_sem_unlock(qdev, SEM_RT_IDX_MASK); + return status; +} + +int ql_cam_route_initialize(struct ql_adapter *qdev) +{ + int status, set; + + /* If check if the link is up and use to + * determine if we are setting or clearing + * the MAC address in the CAM. + */ + set = ql_read32(qdev, STS); + set &= qdev->port_link_up; + status = ql_set_mac_addr(qdev, set); + if (status) { + netif_err(qdev, ifup, qdev->ndev, "Failed to init mac address.\n"); + return status; + } + + status = ql_route_initialize(qdev); + if (status) + netif_err(qdev, ifup, qdev->ndev, "Failed to init routing table.\n"); + + return status; +} + +static int ql_adapter_initialize(struct ql_adapter *qdev) +{ + u32 value, mask; + int i; + int status = 0; + + /* + * Set up the System register to halt on errors. + */ + value = SYS_EFE | SYS_FAE; + mask = value << 16; + ql_write32(qdev, SYS, mask | value); + + /* Set the default queue, and VLAN behavior. */ + value = NIC_RCV_CFG_DFQ; + mask = NIC_RCV_CFG_DFQ_MASK; + if (qdev->ndev->features & NETIF_F_HW_VLAN_CTAG_RX) { + value |= NIC_RCV_CFG_RV; + mask |= (NIC_RCV_CFG_RV << 16); + } + ql_write32(qdev, NIC_RCV_CFG, (mask | value)); + + /* Set the MPI interrupt to enabled. */ + ql_write32(qdev, INTR_MASK, (INTR_MASK_PI << 16) | INTR_MASK_PI); + + /* Enable the function, set pagesize, enable error checking. */ + value = FSC_FE | FSC_EPC_INBOUND | FSC_EPC_OUTBOUND | + FSC_EC | FSC_VM_PAGE_4K; + value |= SPLT_SETTING; + + /* Set/clear header splitting. 
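/*
 * Illustrative sketch, not from the patch itself: several writes above,
 * such as the SYS and NIC_RCV_CFG programming, pair the 16-bit value with
 * the same bits shifted into the upper half (mask = value << 16).  The
 * code suggests the upper half acts as a per-bit write-enable mask so
 * selected bits can be set or cleared without a read-modify-write; that
 * reading is an inference from the code, not from a datasheet.
 */
#include <stdint.h>
#include <stdio.h>

static inline uint32_t masked_reg_word(uint16_t bits_to_set,
                                       uint16_t bits_to_touch)
{
        return ((uint32_t)bits_to_touch << 16) | bits_to_set;
}

int main(void)
{
        /* Touch bits 0 and 1: set bit 0, clear bit 1, leave the rest. */
        printf("%#010x\n", masked_reg_word(0x0001, 0x0003));
        return 0;
}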
*/ + mask = FSC_VM_PAGESIZE_MASK | + FSC_DBL_MASK | FSC_DBRST_MASK | (value << 16); + ql_write32(qdev, FSC, mask | value); + + ql_write32(qdev, SPLT_HDR, SPLT_LEN); + + /* Set RX packet routing to use port/pci function on which the + * packet arrived on in addition to usual frame routing. + * This is helpful on bonding where both interfaces can have + * the same MAC address. + */ + ql_write32(qdev, RST_FO, RST_FO_RR_MASK | RST_FO_RR_RCV_FUNC_CQ); + /* Reroute all packets to our Interface. + * They may have been routed to MPI firmware + * due to WOL. + */ + value = ql_read32(qdev, MGMT_RCV_CFG); + value &= ~MGMT_RCV_CFG_RM; + mask = 0xffff0000; + + /* Sticky reg needs clearing due to WOL. */ + ql_write32(qdev, MGMT_RCV_CFG, mask); + ql_write32(qdev, MGMT_RCV_CFG, mask | value); + + /* Default WOL is enable on Mezz cards */ + if (qdev->pdev->subsystem_device == 0x0068 || + qdev->pdev->subsystem_device == 0x0180) + qdev->wol = WAKE_MAGIC; + + /* Start up the rx queues. */ + for (i = 0; i < qdev->rx_ring_count; i++) { + status = ql_start_rx_ring(qdev, &qdev->rx_ring[i]); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to start rx ring[%d].\n", i); + return status; + } + } + + /* If there is more than one inbound completion queue + * then download a RICB to configure RSS. + */ + if (qdev->rss_ring_count > 1) { + status = ql_start_rss(qdev); + if (status) { + netif_err(qdev, ifup, qdev->ndev, "Failed to start RSS.\n"); + return status; + } + } + + /* Start up the tx queues. */ + for (i = 0; i < qdev->tx_ring_count; i++) { + status = ql_start_tx_ring(qdev, &qdev->tx_ring[i]); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to start tx ring[%d].\n", i); + return status; + } + } + + /* Initialize the port and set the max framesize. */ + status = qdev->nic_ops->port_initialize(qdev); + if (status) + netif_err(qdev, ifup, qdev->ndev, "Failed to start port.\n"); + + /* Set up the MAC address and frame routing filter. */ + status = ql_cam_route_initialize(qdev); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to init CAM/Routing tables.\n"); + return status; + } + + /* Start NAPI for the RSS queues. */ + for (i = 0; i < qdev->rss_ring_count; i++) + napi_enable(&qdev->rx_ring[i].napi); + + return status; +} + +/* Issue soft reset to chip. */ +static int ql_adapter_reset(struct ql_adapter *qdev) +{ + u32 value; + int status = 0; + unsigned long end_jiffies; + + /* Clear all the entries in the routing table. */ + status = ql_clear_routing_entries(qdev); + if (status) { + netif_err(qdev, ifup, qdev->ndev, "Failed to clear routing bits.\n"); + return status; + } + + /* Check if bit is set then skip the mailbox command and + * clear the bit, else we are in normal reset process. + */ + if (!test_bit(QL_ASIC_RECOVERY, &qdev->flags)) { + /* Stop management traffic. */ + ql_mb_set_mgmnt_traffic_ctl(qdev, MB_SET_MPI_TFK_STOP); + + /* Wait for the NIC and MGMNT FIFOs to empty. */ + ql_wait_fifo_empty(qdev); + } else + clear_bit(QL_ASIC_RECOVERY, &qdev->flags); + + ql_write32(qdev, RST_FO, (RST_FO_FR << 16) | RST_FO_FR); + + end_jiffies = jiffies + usecs_to_jiffies(30); + do { + value = ql_read32(qdev, RST_FO); + if ((value & RST_FO_FR) == 0) + break; + cpu_relax(); + } while (time_before(jiffies, end_jiffies)); + + if (value & RST_FO_FR) { + netif_err(qdev, ifdown, qdev->ndev, + "ETIMEDOUT!!! errored out of resetting the chip!\n"); + status = -ETIMEDOUT; + } + + /* Resume management traffic. 
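/*
 * Illustrative sketch, not from the patch itself: the reset path above
 * polls RST_FO with a deadline (jiffies plus a timeout, cpu_relax()
 * between reads) and reports -ETIMEDOUT if the bit never clears.  A
 * userspace stand-in for that bounded-poll pattern; read_status()
 * simulates a register that clears on the third read.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint32_t read_status(void)
{
        static int reads;

        return (++reads < 3) ? 0x1 : 0x0;
}

static bool poll_bit_clear(uint32_t bit, long timeout_ms)
{
        struct timespec start, now;

        clock_gettime(CLOCK_MONOTONIC, &start);
        do {
                if (!(read_status() & bit))
                        return true;
                clock_gettime(CLOCK_MONOTONIC, &now);
        } while ((now.tv_sec - start.tv_sec) * 1000 +
                 (now.tv_nsec - start.tv_nsec) / 1000000 < timeout_ms);
        return false;
}

int main(void)
{
        printf("reset %s\n", poll_bit_clear(0x1, 30) ? "done" : "timed out");
        return 0;
}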
*/ + ql_mb_set_mgmnt_traffic_ctl(qdev, MB_SET_MPI_TFK_RESUME); + return status; +} + +static void ql_display_dev_info(struct net_device *ndev) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + netif_info(qdev, probe, qdev->ndev, + "Function #%d, Port %d, NIC Roll %d, NIC Rev = %d, " + "XG Roll = %d, XG Rev = %d.\n", + qdev->func, + qdev->port, + qdev->chip_rev_id & 0x0000000f, + qdev->chip_rev_id >> 4 & 0x0000000f, + qdev->chip_rev_id >> 8 & 0x0000000f, + qdev->chip_rev_id >> 12 & 0x0000000f); + netif_info(qdev, probe, qdev->ndev, + "MAC address %pM\n", ndev->dev_addr); +} + +static int ql_wol(struct ql_adapter *qdev) +{ + int status = 0; + u32 wol = MB_WOL_DISABLE; + + /* The CAM is still intact after a reset, but if we + * are doing WOL, then we may need to program the + * routing regs. We would also need to issue the mailbox + * commands to instruct the MPI what to do per the ethtool + * settings. + */ + + if (qdev->wol & (WAKE_ARP | WAKE_MAGICSECURE | WAKE_PHY | WAKE_UCAST | + WAKE_MCAST | WAKE_BCAST)) { + netif_err(qdev, ifdown, qdev->ndev, + "Unsupported WOL parameter. qdev->wol = 0x%x.\n", + qdev->wol); + return -EINVAL; + } + + if (qdev->wol & WAKE_MAGIC) { + status = ql_mb_wol_set_magic(qdev, 1); + if (status) { + netif_err(qdev, ifdown, qdev->ndev, + "Failed to set magic packet on %s.\n", + qdev->ndev->name); + return status; + } else + netif_info(qdev, drv, qdev->ndev, + "Enabled magic packet successfully on %s.\n", + qdev->ndev->name); + + wol |= MB_WOL_MAGIC_PKT; + } + + if (qdev->wol) { + wol |= MB_WOL_MODE_ON; + status = ql_mb_wol_mode(qdev, wol); + netif_err(qdev, drv, qdev->ndev, + "WOL %s (wol code 0x%x) on %s\n", + (status == 0) ? "Successfully set" : "Failed", + wol, qdev->ndev->name); + } + + return status; +} + +static void ql_cancel_all_work_sync(struct ql_adapter *qdev) +{ + + /* Don't kill the reset worker thread if we + * are in the process of recovery. + */ + if (test_bit(QL_ADAPTER_UP, &qdev->flags)) + cancel_delayed_work_sync(&qdev->asic_reset_work); + cancel_delayed_work_sync(&qdev->mpi_reset_work); + cancel_delayed_work_sync(&qdev->mpi_work); + cancel_delayed_work_sync(&qdev->mpi_idc_work); + cancel_delayed_work_sync(&qdev->mpi_core_to_log); + cancel_delayed_work_sync(&qdev->mpi_port_cfg_work); +} + +static int ql_adapter_down(struct ql_adapter *qdev) +{ + int i, status = 0; + + ql_link_off(qdev); + + ql_cancel_all_work_sync(qdev); + + for (i = 0; i < qdev->rss_ring_count; i++) + napi_disable(&qdev->rx_ring[i].napi); + + clear_bit(QL_ADAPTER_UP, &qdev->flags); + + ql_disable_interrupts(qdev); + + ql_tx_ring_clean(qdev); + + /* Call netif_napi_del() from common point. + */ + for (i = 0; i < qdev->rss_ring_count; i++) + netif_napi_del(&qdev->rx_ring[i].napi); + + status = ql_adapter_reset(qdev); + if (status) + netif_err(qdev, ifdown, qdev->ndev, "reset(func #%d) FAILED!\n", + qdev->func); + ql_free_rx_buffers(qdev); + + return status; +} + +static int ql_adapter_up(struct ql_adapter *qdev) +{ + int err = 0; + + err = ql_adapter_initialize(qdev); + if (err) { + netif_info(qdev, ifup, qdev->ndev, "Unable to initialize adapter.\n"); + goto err_init; + } + set_bit(QL_ADAPTER_UP, &qdev->flags); + ql_alloc_rx_buffers(qdev); + /* If the port is initialized and the + * link is up the turn on the carrier. + */ + if ((ql_read32(qdev, STS) & qdev->port_init) && + (ql_read32(qdev, STS) & qdev->port_link_up)) + ql_link_on(qdev); + /* Restore rx mode. 
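/*
 * Illustrative sketch, not from the patch itself: ql_wol() above rejects
 * every wake flag except magic packet before talking to the firmware.
 * The same filter expressed as a tiny standalone check using the uapi
 * wake flags from <linux/ethtool.h>.
 */
#include <linux/ethtool.h>
#include <stdio.h>

static int check_wol(unsigned int requested)
{
        const unsigned int supported = WAKE_MAGIC;

        if (requested & ~supported)
                return -1;              /* stands in for -EINVAL */
        return 0;
}

int main(void)
{
        printf("%d\n", check_wol(WAKE_MAGIC));                  /* 0  */
        printf("%d\n", check_wol(WAKE_MAGIC | WAKE_UCAST));     /* -1 */
        return 0;
}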
*/ + clear_bit(QL_ALLMULTI, &qdev->flags); + clear_bit(QL_PROMISCUOUS, &qdev->flags); + qlge_set_multicast_list(qdev->ndev); + + /* Restore vlan setting. */ + qlge_restore_vlan(qdev); + + ql_enable_interrupts(qdev); + ql_enable_all_completion_interrupts(qdev); + netif_tx_start_all_queues(qdev->ndev); + + return 0; +err_init: + ql_adapter_reset(qdev); + return err; +} + +static void ql_release_adapter_resources(struct ql_adapter *qdev) +{ + ql_free_mem_resources(qdev); + ql_free_irq(qdev); +} + +static int ql_get_adapter_resources(struct ql_adapter *qdev) +{ + int status = 0; + + if (ql_alloc_mem_resources(qdev)) { + netif_err(qdev, ifup, qdev->ndev, "Unable to allocate memory.\n"); + return -ENOMEM; + } + status = ql_request_irq(qdev); + return status; +} + +static int qlge_close(struct net_device *ndev) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + /* If we hit pci_channel_io_perm_failure + * failure condition, then we already + * brought the adapter down. + */ + if (test_bit(QL_EEH_FATAL, &qdev->flags)) { + netif_err(qdev, drv, qdev->ndev, "EEH fatal did unload.\n"); + clear_bit(QL_EEH_FATAL, &qdev->flags); + return 0; + } + + /* + * Wait for device to recover from a reset. + * (Rarely happens, but possible.) + */ + while (!test_bit(QL_ADAPTER_UP, &qdev->flags)) + msleep(1); + ql_adapter_down(qdev); + ql_release_adapter_resources(qdev); + return 0; +} + +static int ql_configure_rings(struct ql_adapter *qdev) +{ + int i; + struct rx_ring *rx_ring; + struct tx_ring *tx_ring; + int cpu_cnt = min(MAX_CPUS, (int)num_online_cpus()); + unsigned int lbq_buf_len = (qdev->ndev->mtu > 1500) ? + LARGE_BUFFER_MAX_SIZE : LARGE_BUFFER_MIN_SIZE; + + qdev->lbq_buf_order = get_order(lbq_buf_len); + + /* In a perfect world we have one RSS ring for each CPU + * and each has it's own vector. To do that we ask for + * cpu_cnt vectors. ql_enable_msix() will adjust the + * vector count to what we actually get. We then + * allocate an RSS ring for each. + * Essentially, we are doing min(cpu_count, msix_vector_count). + */ + qdev->intr_count = cpu_cnt; + ql_enable_msix(qdev); + /* Adjust the RSS ring count to the actual vector count. */ + qdev->rss_ring_count = qdev->intr_count; + qdev->tx_ring_count = cpu_cnt; + qdev->rx_ring_count = qdev->tx_ring_count + qdev->rss_ring_count; + + for (i = 0; i < qdev->tx_ring_count; i++) { + tx_ring = &qdev->tx_ring[i]; + memset((void *)tx_ring, 0, sizeof(*tx_ring)); + tx_ring->qdev = qdev; + tx_ring->wq_id = i; + tx_ring->wq_len = qdev->tx_ring_size; + tx_ring->wq_size = + tx_ring->wq_len * sizeof(struct ob_mac_iocb_req); + + /* + * The completion queue ID for the tx rings start + * immediately after the rss rings. + */ + tx_ring->cq_id = qdev->rss_ring_count + i; + } + + for (i = 0; i < qdev->rx_ring_count; i++) { + rx_ring = &qdev->rx_ring[i]; + memset((void *)rx_ring, 0, sizeof(*rx_ring)); + rx_ring->qdev = qdev; + rx_ring->cq_id = i; + rx_ring->cpu = i % cpu_cnt; /* CPU to run handler on. */ + if (i < qdev->rss_ring_count) { + /* + * Inbound (RSS) queues. + */ + rx_ring->cq_len = qdev->rx_ring_size; + rx_ring->cq_size = + rx_ring->cq_len * sizeof(struct ql_net_rsp_iocb); + rx_ring->lbq_len = NUM_LARGE_BUFFERS; + rx_ring->lbq_size = + rx_ring->lbq_len * sizeof(__le64); + rx_ring->lbq_buf_size = (u16)lbq_buf_len; + rx_ring->sbq_len = NUM_SMALL_BUFFERS; + rx_ring->sbq_size = + rx_ring->sbq_len * sizeof(__le64); + rx_ring->sbq_buf_size = SMALL_BUF_MAP_SIZE; + rx_ring->type = RX_Q; + } else { + /* + * Outbound queue handles outbound completions only. 
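/*
 * Illustrative sketch, not from the patch itself: the ring bookkeeping in
 * ql_configure_rings() above, assuming the MSI-X request was trimmed to
 * 'vectors': one RSS ring per vector, one TX ring per CPU, and TX
 * completion queues numbered after the RSS completion queues.
 */
#include <stdio.h>

int main(void)
{
        const int cpu_cnt = 8;
        const int vectors = 4;          /* what MSI-X allocation returned */
        const int rss_ring_count = vectors;
        const int tx_ring_count = cpu_cnt;
        const int rx_ring_count = rss_ring_count + tx_ring_count;
        int i;

        printf("rx_ring_count = %d\n", rx_ring_count);
        for (i = 0; i < tx_ring_count; i++)
                printf("tx_ring %d completes on cq_id %d\n",
                       i, rss_ring_count + i);
        return 0;
}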
+ */ + /* outbound cq is same size as tx_ring it services. */ + rx_ring->cq_len = qdev->tx_ring_size; + rx_ring->cq_size = + rx_ring->cq_len * sizeof(struct ql_net_rsp_iocb); + rx_ring->lbq_len = 0; + rx_ring->lbq_size = 0; + rx_ring->lbq_buf_size = 0; + rx_ring->sbq_len = 0; + rx_ring->sbq_size = 0; + rx_ring->sbq_buf_size = 0; + rx_ring->type = TX_Q; + } + } + return 0; +} + +static int qlge_open(struct net_device *ndev) +{ + int err = 0; + struct ql_adapter *qdev = netdev_priv(ndev); + + err = ql_adapter_reset(qdev); + if (err) + return err; + + err = ql_configure_rings(qdev); + if (err) + return err; + + err = ql_get_adapter_resources(qdev); + if (err) + goto error_up; + + err = ql_adapter_up(qdev); + if (err) + goto error_up; + + return err; + +error_up: + ql_release_adapter_resources(qdev); + return err; +} + +static int ql_change_rx_buffers(struct ql_adapter *qdev) +{ + struct rx_ring *rx_ring; + int i, status; + u32 lbq_buf_len; + + /* Wait for an outstanding reset to complete. */ + if (!test_bit(QL_ADAPTER_UP, &qdev->flags)) { + int i = 4; + + while (--i && !test_bit(QL_ADAPTER_UP, &qdev->flags)) { + netif_err(qdev, ifup, qdev->ndev, + "Waiting for adapter UP...\n"); + ssleep(1); + } + + if (!i) { + netif_err(qdev, ifup, qdev->ndev, + "Timed out waiting for adapter UP\n"); + return -ETIMEDOUT; + } + } + + status = ql_adapter_down(qdev); + if (status) + goto error; + + /* Get the new rx buffer size. */ + lbq_buf_len = (qdev->ndev->mtu > 1500) ? + LARGE_BUFFER_MAX_SIZE : LARGE_BUFFER_MIN_SIZE; + qdev->lbq_buf_order = get_order(lbq_buf_len); + + for (i = 0; i < qdev->rss_ring_count; i++) { + rx_ring = &qdev->rx_ring[i]; + /* Set the new size. */ + rx_ring->lbq_buf_size = lbq_buf_len; + } + + status = ql_adapter_up(qdev); + if (status) + goto error; + + return status; +error: + netif_alert(qdev, ifup, qdev->ndev, + "Driver up/down cycle failed, closing device.\n"); + set_bit(QL_ADAPTER_UP, &qdev->flags); + dev_close(qdev->ndev); + return status; +} + +static int qlge_change_mtu(struct net_device *ndev, int new_mtu) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + int status; + + if (ndev->mtu == 1500 && new_mtu == 9000) { + netif_err(qdev, ifup, qdev->ndev, "Changing to jumbo MTU.\n"); + } else if (ndev->mtu == 9000 && new_mtu == 1500) { + netif_err(qdev, ifup, qdev->ndev, "Changing to normal MTU.\n"); + } else + return -EINVAL; + + queue_delayed_work(qdev->workqueue, + &qdev->mpi_port_cfg_work, 3*HZ); + + ndev->mtu = new_mtu; + + if (!netif_running(qdev->ndev)) { + return 0; + } + + status = ql_change_rx_buffers(qdev); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Changing MTU failed.\n"); + } + + return status; +} + +static struct net_device_stats *qlge_get_stats(struct net_device + *ndev) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + struct rx_ring *rx_ring = &qdev->rx_ring[0]; + struct tx_ring *tx_ring = &qdev->tx_ring[0]; + unsigned long pkts, mcast, dropped, errors, bytes; + int i; + + /* Get RX stats. */ + pkts = mcast = dropped = errors = bytes = 0; + for (i = 0; i < qdev->rss_ring_count; i++, rx_ring++) { + pkts += rx_ring->rx_packets; + bytes += rx_ring->rx_bytes; + dropped += rx_ring->rx_dropped; + errors += rx_ring->rx_errors; + mcast += rx_ring->rx_multicast; + } + ndev->stats.rx_packets = pkts; + ndev->stats.rx_bytes = bytes; + ndev->stats.rx_dropped = dropped; + ndev->stats.rx_errors = errors; + ndev->stats.multicast = mcast; + + /* Get TX stats. 
*/ + pkts = errors = bytes = 0; + for (i = 0; i < qdev->tx_ring_count; i++, tx_ring++) { + pkts += tx_ring->tx_packets; + bytes += tx_ring->tx_bytes; + errors += tx_ring->tx_errors; + } + ndev->stats.tx_packets = pkts; + ndev->stats.tx_bytes = bytes; + ndev->stats.tx_errors = errors; + return &ndev->stats; +} + +static void qlge_set_multicast_list(struct net_device *ndev) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + struct netdev_hw_addr *ha; + int i, status; + + status = ql_sem_spinlock(qdev, SEM_RT_IDX_MASK); + if (status) + return; + /* + * Set or clear promiscuous mode if a + * transition is taking place. + */ + if (ndev->flags & IFF_PROMISC) { + if (!test_bit(QL_PROMISCUOUS, &qdev->flags)) { + if (ql_set_routing_reg + (qdev, RT_IDX_PROMISCUOUS_SLOT, RT_IDX_VALID, 1)) { + netif_err(qdev, hw, qdev->ndev, + "Failed to set promiscuous mode.\n"); + } else { + set_bit(QL_PROMISCUOUS, &qdev->flags); + } + } + } else { + if (test_bit(QL_PROMISCUOUS, &qdev->flags)) { + if (ql_set_routing_reg + (qdev, RT_IDX_PROMISCUOUS_SLOT, RT_IDX_VALID, 0)) { + netif_err(qdev, hw, qdev->ndev, + "Failed to clear promiscuous mode.\n"); + } else { + clear_bit(QL_PROMISCUOUS, &qdev->flags); + } + } + } + + /* + * Set or clear all multicast mode if a + * transition is taking place. + */ + if ((ndev->flags & IFF_ALLMULTI) || + (netdev_mc_count(ndev) > MAX_MULTICAST_ENTRIES)) { + if (!test_bit(QL_ALLMULTI, &qdev->flags)) { + if (ql_set_routing_reg + (qdev, RT_IDX_ALLMULTI_SLOT, RT_IDX_MCAST, 1)) { + netif_err(qdev, hw, qdev->ndev, + "Failed to set all-multi mode.\n"); + } else { + set_bit(QL_ALLMULTI, &qdev->flags); + } + } + } else { + if (test_bit(QL_ALLMULTI, &qdev->flags)) { + if (ql_set_routing_reg + (qdev, RT_IDX_ALLMULTI_SLOT, RT_IDX_MCAST, 0)) { + netif_err(qdev, hw, qdev->ndev, + "Failed to clear all-multi mode.\n"); + } else { + clear_bit(QL_ALLMULTI, &qdev->flags); + } + } + } + + if (!netdev_mc_empty(ndev)) { + status = ql_sem_spinlock(qdev, SEM_MAC_ADDR_MASK); + if (status) + goto exit; + i = 0; + netdev_for_each_mc_addr(ha, ndev) { + if (ql_set_mac_addr_reg(qdev, (u8 *) ha->addr, + MAC_ADDR_TYPE_MULTI_MAC, i)) { + netif_err(qdev, hw, qdev->ndev, + "Failed to loadmulticast address.\n"); + ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK); + goto exit; + } + i++; + } + ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK); + if (ql_set_routing_reg + (qdev, RT_IDX_MCAST_MATCH_SLOT, RT_IDX_MCAST_MATCH, 1)) { + netif_err(qdev, hw, qdev->ndev, + "Failed to set multicast match mode.\n"); + } else { + set_bit(QL_ALLMULTI, &qdev->flags); + } + } +exit: + ql_sem_unlock(qdev, SEM_RT_IDX_MASK); +} + +static int qlge_set_mac_address(struct net_device *ndev, void *p) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + struct sockaddr *addr = p; + int status; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + memcpy(ndev->dev_addr, addr->sa_data, ndev->addr_len); + /* Update local copy of current mac address. 
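/*
 * Illustrative sketch, not from the patch itself: qlge_get_stats() above
 * keeps per-ring counters and recomputes the netdev totals by summing
 * them on demand, shown here in miniature for the RX side.
 */
#include <stdio.h>

struct ex_ring_stats {
        unsigned long packets;
        unsigned long bytes;
};

int main(void)
{
        struct ex_ring_stats rx[4] = {
                { 10, 1000 }, { 20, 2000 }, { 30, 3000 }, { 40, 4000 }
        };
        unsigned long pkts = 0, bytes = 0;
        int i;

        for (i = 0; i < 4; i++) {
                pkts += rx[i].packets;
                bytes += rx[i].bytes;
        }
        printf("rx_packets=%lu rx_bytes=%lu\n", pkts, bytes);
        return 0;
}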
*/ + memcpy(qdev->current_mac_addr, ndev->dev_addr, ndev->addr_len); + + status = ql_sem_spinlock(qdev, SEM_MAC_ADDR_MASK); + if (status) + return status; + status = ql_set_mac_addr_reg(qdev, (u8 *) ndev->dev_addr, + MAC_ADDR_TYPE_CAM_MAC, qdev->func * MAX_CQ); + if (status) + netif_err(qdev, hw, qdev->ndev, "Failed to load MAC address.\n"); + ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK); + return status; +} + +static void qlge_tx_timeout(struct net_device *ndev) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + ql_queue_asic_error(qdev); +} + +static void ql_asic_reset_work(struct work_struct *work) +{ + struct ql_adapter *qdev = + container_of(work, struct ql_adapter, asic_reset_work.work); + int status; + rtnl_lock(); + status = ql_adapter_down(qdev); + if (status) + goto error; + + status = ql_adapter_up(qdev); + if (status) + goto error; + + /* Restore rx mode. */ + clear_bit(QL_ALLMULTI, &qdev->flags); + clear_bit(QL_PROMISCUOUS, &qdev->flags); + qlge_set_multicast_list(qdev->ndev); + + rtnl_unlock(); + return; +error: + netif_alert(qdev, ifup, qdev->ndev, + "Driver up/down cycle failed, closing device\n"); + + set_bit(QL_ADAPTER_UP, &qdev->flags); + dev_close(qdev->ndev); + rtnl_unlock(); +} + +static const struct nic_operations qla8012_nic_ops = { + .get_flash = ql_get_8012_flash_params, + .port_initialize = ql_8012_port_initialize, +}; + +static const struct nic_operations qla8000_nic_ops = { + .get_flash = ql_get_8000_flash_params, + .port_initialize = ql_8000_port_initialize, +}; + +/* Find the pcie function number for the other NIC + * on this chip. Since both NIC functions share a + * common firmware we have the lowest enabled function + * do any common work. Examples would be resetting + * after a fatal firmware error, or doing a firmware + * coredump. + */ +static int ql_get_alt_pcie_func(struct ql_adapter *qdev) +{ + int status = 0; + u32 temp; + u32 nic_func1, nic_func2; + + status = ql_read_mpi_reg(qdev, MPI_TEST_FUNC_PORT_CFG, + &temp); + if (status) + return status; + + nic_func1 = ((temp >> MPI_TEST_NIC1_FUNC_SHIFT) & + MPI_TEST_NIC_FUNC_MASK); + nic_func2 = ((temp >> MPI_TEST_NIC2_FUNC_SHIFT) & + MPI_TEST_NIC_FUNC_MASK); + + if (qdev->func == nic_func1) + qdev->alt_func = nic_func2; + else if (qdev->func == nic_func2) + qdev->alt_func = nic_func1; + else + status = -EIO; + + return status; +} + +static int ql_get_board_info(struct ql_adapter *qdev) +{ + int status; + qdev->func = + (ql_read32(qdev, STS) & STS_FUNC_ID_MASK) >> STS_FUNC_ID_SHIFT; + if (qdev->func > 3) + return -EIO; + + status = ql_get_alt_pcie_func(qdev); + if (status) + return status; + + qdev->port = (qdev->func < qdev->alt_func) ? 
0 : 1; + if (qdev->port) { + qdev->xg_sem_mask = SEM_XGMAC1_MASK; + qdev->port_link_up = STS_PL1; + qdev->port_init = STS_PI1; + qdev->mailbox_in = PROC_ADDR_MPI_RISC | PROC_ADDR_FUNC2_MBI; + qdev->mailbox_out = PROC_ADDR_MPI_RISC | PROC_ADDR_FUNC2_MBO; + } else { + qdev->xg_sem_mask = SEM_XGMAC0_MASK; + qdev->port_link_up = STS_PL0; + qdev->port_init = STS_PI0; + qdev->mailbox_in = PROC_ADDR_MPI_RISC | PROC_ADDR_FUNC0_MBI; + qdev->mailbox_out = PROC_ADDR_MPI_RISC | PROC_ADDR_FUNC0_MBO; + } + qdev->chip_rev_id = ql_read32(qdev, REV_ID); + qdev->device_id = qdev->pdev->device; + if (qdev->device_id == QLGE_DEVICE_ID_8012) + qdev->nic_ops = &qla8012_nic_ops; + else if (qdev->device_id == QLGE_DEVICE_ID_8000) + qdev->nic_ops = &qla8000_nic_ops; + return status; +} + +static void ql_release_all(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + + if (qdev->workqueue) { + destroy_workqueue(qdev->workqueue); + qdev->workqueue = NULL; + } + + if (qdev->reg_base) + iounmap(qdev->reg_base); + if (qdev->doorbell_area) + iounmap(qdev->doorbell_area); + vfree(qdev->mpi_coredump); + pci_release_regions(pdev); +} + +static int ql_init_device(struct pci_dev *pdev, struct net_device *ndev, + int cards_found) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + int err = 0; + + memset((void *)qdev, 0, sizeof(*qdev)); + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "PCI device enable failed.\n"); + return err; + } + + qdev->ndev = ndev; + qdev->pdev = pdev; + pci_set_drvdata(pdev, ndev); + + /* Set PCIe read request size */ + err = pcie_set_readrq(pdev, 4096); + if (err) { + dev_err(&pdev->dev, "Set readrq failed.\n"); + goto err_out1; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "PCI region request failed.\n"); + return err; + } + + pci_set_master(pdev); + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + set_bit(QL_DMA64, &qdev->flags); + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + } else { + err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (!err) + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + } + + if (err) { + dev_err(&pdev->dev, "No usable DMA configuration.\n"); + goto err_out2; + } + + /* Set PCIe reset type for EEH to fundamental. 
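/*
 * Illustrative sketch, not from the patch itself: the probe path above
 * tries a 64-bit DMA mask and falls back to 32-bit using the legacy
 * pci_set_dma_mask()/pci_set_consistent_dma_mask() pair.  The same
 * policy with the consolidated helper, shown only as an alternative
 * formulation; ex_set_dma_mask() and the flag pointer are made up for
 * the example.
 */
#include <linux/dma-mapping.h>
#include <linux/pci.h>

static int ex_set_dma_mask(struct pci_dev *pdev, bool *dma64)
{
        if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) {
                *dma64 = true;  /* caller can set its QL_DMA64-style flag */
                return 0;
        }
        *dma64 = false;
        return dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
}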
*/ + pdev->needs_freset = 1; + pci_save_state(pdev); + qdev->reg_base = + ioremap_nocache(pci_resource_start(pdev, 1), + pci_resource_len(pdev, 1)); + if (!qdev->reg_base) { + dev_err(&pdev->dev, "Register mapping failed.\n"); + err = -ENOMEM; + goto err_out2; + } + + qdev->doorbell_area_size = pci_resource_len(pdev, 3); + qdev->doorbell_area = + ioremap_nocache(pci_resource_start(pdev, 3), + pci_resource_len(pdev, 3)); + if (!qdev->doorbell_area) { + dev_err(&pdev->dev, "Doorbell register mapping failed.\n"); + err = -ENOMEM; + goto err_out2; + } + + err = ql_get_board_info(qdev); + if (err) { + dev_err(&pdev->dev, "Register access failed.\n"); + err = -EIO; + goto err_out2; + } + qdev->msg_enable = netif_msg_init(debug, default_msg); + spin_lock_init(&qdev->hw_lock); + spin_lock_init(&qdev->stats_lock); + + if (qlge_mpi_coredump) { + qdev->mpi_coredump = + vmalloc(sizeof(struct ql_mpi_coredump)); + if (qdev->mpi_coredump == NULL) { + err = -ENOMEM; + goto err_out2; + } + if (qlge_force_coredump) + set_bit(QL_FRC_COREDUMP, &qdev->flags); + } + /* make sure the EEPROM is good */ + err = qdev->nic_ops->get_flash(qdev); + if (err) { + dev_err(&pdev->dev, "Invalid FLASH.\n"); + goto err_out2; + } + + /* Keep local copy of current mac address. */ + memcpy(qdev->current_mac_addr, ndev->dev_addr, ndev->addr_len); + + /* Set up the default ring sizes. */ + qdev->tx_ring_size = NUM_TX_RING_ENTRIES; + qdev->rx_ring_size = NUM_RX_RING_ENTRIES; + + /* Set up the coalescing parameters. */ + qdev->rx_coalesce_usecs = DFLT_COALESCE_WAIT; + qdev->tx_coalesce_usecs = DFLT_COALESCE_WAIT; + qdev->rx_max_coalesced_frames = DFLT_INTER_FRAME_WAIT; + qdev->tx_max_coalesced_frames = DFLT_INTER_FRAME_WAIT; + + /* + * Set up the operating parameters. + */ + qdev->workqueue = alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, + ndev->name); + INIT_DELAYED_WORK(&qdev->asic_reset_work, ql_asic_reset_work); + INIT_DELAYED_WORK(&qdev->mpi_reset_work, ql_mpi_reset_work); + INIT_DELAYED_WORK(&qdev->mpi_work, ql_mpi_work); + INIT_DELAYED_WORK(&qdev->mpi_port_cfg_work, ql_mpi_port_cfg_work); + INIT_DELAYED_WORK(&qdev->mpi_idc_work, ql_mpi_idc_work); + INIT_DELAYED_WORK(&qdev->mpi_core_to_log, ql_mpi_core_to_log); + init_completion(&qdev->ide_completion); + mutex_init(&qdev->mpi_mutex); + + if (!cards_found) { + dev_info(&pdev->dev, "%s\n", DRV_STRING); + dev_info(&pdev->dev, "Driver name: %s, Version: %s.\n", + DRV_NAME, DRV_VERSION); + } + return 0; +err_out2: + ql_release_all(pdev); +err_out1: + pci_disable_device(pdev); + return err; +} + +static const struct net_device_ops qlge_netdev_ops = { + .ndo_open = qlge_open, + .ndo_stop = qlge_close, + .ndo_start_xmit = qlge_send, + .ndo_change_mtu = qlge_change_mtu, + .ndo_get_stats = qlge_get_stats, + .ndo_set_rx_mode = qlge_set_multicast_list, + .ndo_set_mac_address = qlge_set_mac_address, + .ndo_validate_addr = eth_validate_addr, + .ndo_tx_timeout = qlge_tx_timeout, + .ndo_fix_features = qlge_fix_features, + .ndo_set_features = qlge_set_features, + .ndo_vlan_rx_add_vid = qlge_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = qlge_vlan_rx_kill_vid, +}; + +static void ql_timer(struct timer_list *t) +{ + struct ql_adapter *qdev = from_timer(qdev, t, timer); + u32 var = 0; + + var = ql_read32(qdev, STS); + if (pci_channel_offline(qdev->pdev)) { + netif_err(qdev, ifup, qdev->ndev, "EEH STS = 0x%.08x.\n", var); + return; + } + + mod_timer(&qdev->timer, jiffies + (5*HZ)); +} + +static int qlge_probe(struct pci_dev *pdev, + const struct pci_device_id *pci_entry) +{ + struct net_device *ndev = 
NULL; + struct ql_adapter *qdev = NULL; + static int cards_found = 0; + int err = 0; + + ndev = alloc_etherdev_mq(sizeof(struct ql_adapter), + min(MAX_CPUS, netif_get_num_default_rss_queues())); + if (!ndev) + return -ENOMEM; + + err = ql_init_device(pdev, ndev, cards_found); + if (err < 0) { + free_netdev(ndev); + return err; + } + + qdev = netdev_priv(ndev); + SET_NETDEV_DEV(ndev, &pdev->dev); + ndev->hw_features = NETIF_F_SG | + NETIF_F_IP_CSUM | + NETIF_F_TSO | + NETIF_F_TSO_ECN | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_CTAG_FILTER | + NETIF_F_RXCSUM; + ndev->features = ndev->hw_features; + ndev->vlan_features = ndev->hw_features; + /* vlan gets same features (except vlan filter) */ + ndev->vlan_features &= ~(NETIF_F_HW_VLAN_CTAG_FILTER | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX); + + if (test_bit(QL_DMA64, &qdev->flags)) + ndev->features |= NETIF_F_HIGHDMA; + + /* + * Set up net_device structure. + */ + ndev->tx_queue_len = qdev->tx_ring_size; + ndev->irq = pdev->irq; + + ndev->netdev_ops = &qlge_netdev_ops; + ndev->ethtool_ops = &qlge_ethtool_ops; + ndev->watchdog_timeo = 10 * HZ; + + /* MTU range: this driver only supports 1500 or 9000, so this only + * filters out values above or below, and we'll rely on + * qlge_change_mtu to make sure only 1500 or 9000 are allowed + */ + ndev->min_mtu = ETH_DATA_LEN; + ndev->max_mtu = 9000; + + err = register_netdev(ndev); + if (err) { + dev_err(&pdev->dev, "net device registration failed.\n"); + ql_release_all(pdev); + pci_disable_device(pdev); + free_netdev(ndev); + return err; + } + /* Start up the timer to trigger EEH if + * the bus goes dead + */ + timer_setup(&qdev->timer, ql_timer, TIMER_DEFERRABLE); + mod_timer(&qdev->timer, jiffies + (5*HZ)); + ql_link_off(qdev); + ql_display_dev_info(ndev); + atomic_set(&qdev->lb_count, 0); + cards_found++; + return 0; +} + +netdev_tx_t ql_lb_send(struct sk_buff *skb, struct net_device *ndev) +{ + return qlge_send(skb, ndev); +} + +int ql_clean_lb_rx_ring(struct rx_ring *rx_ring, int budget) +{ + return ql_clean_inbound_rx_ring(rx_ring, budget); +} + +static void qlge_remove(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + del_timer_sync(&qdev->timer); + ql_cancel_all_work_sync(qdev); + unregister_netdev(ndev); + ql_release_all(pdev); + pci_disable_device(pdev); + free_netdev(ndev); +} + +/* Clean up resources without touching hardware. */ +static void ql_eeh_close(struct net_device *ndev) +{ + int i; + struct ql_adapter *qdev = netdev_priv(ndev); + + if (netif_carrier_ok(ndev)) { + netif_carrier_off(ndev); + netif_stop_queue(ndev); + } + + /* Disabling the timer */ + ql_cancel_all_work_sync(qdev); + + for (i = 0; i < qdev->rss_ring_count; i++) + netif_napi_del(&qdev->rx_ring[i].napi); + + clear_bit(QL_ADAPTER_UP, &qdev->flags); + ql_tx_ring_clean(qdev); + ql_free_rx_buffers(qdev); + ql_release_adapter_resources(qdev); +} + +/* + * This callback is called by the PCI subsystem whenever + * a PCI bus error is detected. 
+ */ +static pci_ers_result_t qlge_io_error_detected(struct pci_dev *pdev, + enum pci_channel_state state) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + netif_device_detach(ndev); + del_timer_sync(&qdev->timer); + if (netif_running(ndev)) + ql_eeh_close(ndev); + pci_disable_device(pdev); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_err(&pdev->dev, + "%s: pci_channel_io_perm_failure.\n", __func__); + del_timer_sync(&qdev->timer); + ql_eeh_close(ndev); + set_bit(QL_EEH_FATAL, &qdev->flags); + return PCI_ERS_RESULT_DISCONNECT; + } + + /* Request a slot reset. */ + return PCI_ERS_RESULT_NEED_RESET; +} + +/* + * This callback is called after the PCI buss has been reset. + * Basically, this tries to restart the card from scratch. + * This is a shortened version of the device probe/discovery code, + * it resembles the first-half of the () routine. + */ +static pci_ers_result_t qlge_io_slot_reset(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + + pdev->error_state = pci_channel_io_normal; + + pci_restore_state(pdev); + if (pci_enable_device(pdev)) { + netif_err(qdev, ifup, qdev->ndev, + "Cannot re-enable PCI device after reset.\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + pci_set_master(pdev); + + if (ql_adapter_reset(qdev)) { + netif_err(qdev, drv, qdev->ndev, "reset FAILED!\n"); + set_bit(QL_EEH_FATAL, &qdev->flags); + return PCI_ERS_RESULT_DISCONNECT; + } + + return PCI_ERS_RESULT_RECOVERED; +} + +static void qlge_io_resume(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + int err = 0; + + if (netif_running(ndev)) { + err = qlge_open(ndev); + if (err) { + netif_err(qdev, ifup, qdev->ndev, + "Device initialization failed after reset.\n"); + return; + } + } else { + netif_err(qdev, ifup, qdev->ndev, + "Device was not running prior to EEH.\n"); + } + mod_timer(&qdev->timer, jiffies + (5*HZ)); + netif_device_attach(ndev); +} + +static const struct pci_error_handlers qlge_err_handler = { + .error_detected = qlge_io_error_detected, + .slot_reset = qlge_io_slot_reset, + .resume = qlge_io_resume, +}; + +static int qlge_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + int err; + + netif_device_detach(ndev); + del_timer_sync(&qdev->timer); + + if (netif_running(ndev)) { + err = ql_adapter_down(qdev); + if (!err) + return err; + } + + ql_wol(qdev); + err = pci_save_state(pdev); + if (err) + return err; + + pci_disable_device(pdev); + + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + + return 0; +} + +#ifdef CONFIG_PM +static int qlge_resume(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + int err; + + pci_set_power_state(pdev, PCI_D0); + pci_restore_state(pdev); + err = pci_enable_device(pdev); + if (err) { + netif_err(qdev, ifup, qdev->ndev, "Cannot enable PCI device from suspend\n"); + return err; + } + pci_set_master(pdev); + + pci_enable_wake(pdev, PCI_D3hot, 0); + pci_enable_wake(pdev, PCI_D3cold, 0); + + if (netif_running(ndev)) { + err = ql_adapter_up(qdev); + if (err) + return err; + } + + mod_timer(&qdev->timer, jiffies + (5*HZ)); + 
netif_device_attach(ndev); + + return 0; +} +#endif /* CONFIG_PM */ + +static void qlge_shutdown(struct pci_dev *pdev) +{ + qlge_suspend(pdev, PMSG_SUSPEND); +} + +static struct pci_driver qlge_driver = { + .name = DRV_NAME, + .id_table = qlge_pci_tbl, + .probe = qlge_probe, + .remove = qlge_remove, +#ifdef CONFIG_PM + .suspend = qlge_suspend, + .resume = qlge_resume, +#endif + .shutdown = qlge_shutdown, + .err_handler = &qlge_err_handler +}; + +module_pci_driver(qlge_driver); diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_mpi.c b/drivers/net/ethernet/qlogic/qlge/qlge_mpi.c new file mode 100644 index 0000000..4be65d6 --- /dev/null +++ b/drivers/net/ethernet/qlogic/qlge/qlge_mpi.c @@ -0,0 +1,1284 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "qlge.h" + +int ql_unpause_mpi_risc(struct ql_adapter *qdev) +{ + u32 tmp; + + /* Un-pause the RISC */ + tmp = ql_read32(qdev, CSR); + if (!(tmp & CSR_RP)) + return -EIO; + + ql_write32(qdev, CSR, CSR_CMD_CLR_PAUSE); + return 0; +} + +int ql_pause_mpi_risc(struct ql_adapter *qdev) +{ + u32 tmp; + int count = UDELAY_COUNT; + + /* Pause the RISC */ + ql_write32(qdev, CSR, CSR_CMD_SET_PAUSE); + do { + tmp = ql_read32(qdev, CSR); + if (tmp & CSR_RP) + break; + mdelay(UDELAY_DELAY); + count--; + } while (count); + return (count == 0) ? -ETIMEDOUT : 0; +} + +int ql_hard_reset_mpi_risc(struct ql_adapter *qdev) +{ + u32 tmp; + int count = UDELAY_COUNT; + + /* Reset the RISC */ + ql_write32(qdev, CSR, CSR_CMD_SET_RST); + do { + tmp = ql_read32(qdev, CSR); + if (tmp & CSR_RR) { + ql_write32(qdev, CSR, CSR_CMD_CLR_RST); + break; + } + mdelay(UDELAY_DELAY); + count--; + } while (count); + return (count == 0) ? -ETIMEDOUT : 0; +} + +int ql_read_mpi_reg(struct ql_adapter *qdev, u32 reg, u32 *data) +{ + int status; + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, PROC_ADDR, PROC_ADDR_RDY, PROC_ADDR_ERR); + if (status) + goto exit; + /* set up for reg read */ + ql_write32(qdev, PROC_ADDR, reg | PROC_ADDR_R); + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, PROC_ADDR, PROC_ADDR_RDY, PROC_ADDR_ERR); + if (status) + goto exit; + /* get the data */ + *data = ql_read32(qdev, PROC_DATA); +exit: + return status; +} + +int ql_write_mpi_reg(struct ql_adapter *qdev, u32 reg, u32 data) +{ + int status = 0; + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, PROC_ADDR, PROC_ADDR_RDY, PROC_ADDR_ERR); + if (status) + goto exit; + /* write the data to the data reg */ + ql_write32(qdev, PROC_DATA, data); + /* trigger the write */ + ql_write32(qdev, PROC_ADDR, reg); + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, PROC_ADDR, PROC_ADDR_RDY, PROC_ADDR_ERR); + if (status) + goto exit; +exit: + return status; +} + +int ql_soft_reset_mpi_risc(struct ql_adapter *qdev) +{ + int status; + status = ql_write_mpi_reg(qdev, 0x00001010, 1); + return status; +} + +/* Determine if we are in charge of the firwmare. If + * we are the lower of the 2 NIC pcie functions, or if + * we are the higher function and the lower function + * is not enabled. + */ +int ql_own_firmware(struct ql_adapter *qdev) +{ + u32 temp; + + /* If we are the lower of the 2 NIC functions + * on the chip the we are responsible for + * core dump and firmware reset after an error. + */ + if (qdev->func < qdev->alt_func) + return 1; + + /* If we are the higher of the 2 NIC functions + * on the chip and the lower function is not + * enabled, then we are responsible for + * core dump and firmware reset after an error. 
+ */ + temp = ql_read32(qdev, STS); + if (!(temp & (1 << (8 + qdev->alt_func)))) + return 1; + + return 0; + +} + +static int ql_get_mb_sts(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int i, status; + + status = ql_sem_spinlock(qdev, SEM_PROC_REG_MASK); + if (status) + return -EBUSY; + for (i = 0; i < mbcp->out_count; i++) { + status = + ql_read_mpi_reg(qdev, qdev->mailbox_out + i, + &mbcp->mbox_out[i]); + if (status) { + netif_err(qdev, drv, qdev->ndev, "Failed mailbox read.\n"); + break; + } + } + ql_sem_unlock(qdev, SEM_PROC_REG_MASK); /* does flush too */ + return status; +} + +/* Wait for a single mailbox command to complete. + * Returns zero on success. + */ +static int ql_wait_mbx_cmd_cmplt(struct ql_adapter *qdev) +{ + int count = 100; + u32 value; + + do { + value = ql_read32(qdev, STS); + if (value & STS_PI) + return 0; + mdelay(UDELAY_DELAY); /* 100ms */ + } while (--count); + return -ETIMEDOUT; +} + +/* Execute a single mailbox command. + * Caller must hold PROC_ADDR semaphore. + */ +static int ql_exec_mb_cmd(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int i, status; + + /* + * Make sure there's nothing pending. + * This shouldn't happen. + */ + if (ql_read32(qdev, CSR) & CSR_HRI) + return -EIO; + + status = ql_sem_spinlock(qdev, SEM_PROC_REG_MASK); + if (status) + return status; + + /* + * Fill the outbound mailboxes. + */ + for (i = 0; i < mbcp->in_count; i++) { + status = ql_write_mpi_reg(qdev, qdev->mailbox_in + i, + mbcp->mbox_in[i]); + if (status) + goto end; + } + /* + * Wake up the MPI firmware. + */ + ql_write32(qdev, CSR, CSR_CMD_SET_H2R_INT); +end: + ql_sem_unlock(qdev, SEM_PROC_REG_MASK); + return status; +} + +/* We are being asked by firmware to accept + * a change to the port. This is only + * a change to max frame sizes (Tx/Rx), pause + * parameters, or loopback mode. We wake up a worker + * to handler processing this since a mailbox command + * will need to be sent to ACK the request. + */ +static int ql_idc_req_aen(struct ql_adapter *qdev) +{ + int status; + struct mbox_params *mbcp = &qdev->idc_mbc; + + netif_err(qdev, drv, qdev->ndev, "Enter!\n"); + /* Get the status data and start up a thread to + * handle the request. + */ + mbcp->out_count = 4; + status = ql_get_mb_sts(qdev, mbcp); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Could not read MPI, resetting ASIC!\n"); + ql_queue_asic_error(qdev); + } else { + /* Begin polled mode early so + * we don't get another interrupt + * when we leave mpi_worker. + */ + ql_write32(qdev, INTR_MASK, (INTR_MASK_PI << 16)); + queue_delayed_work(qdev->workqueue, &qdev->mpi_idc_work, 0); + } + return status; +} + +/* Process an inter-device event completion. + * If good, signal the caller's completion. + */ +static int ql_idc_cmplt_aen(struct ql_adapter *qdev) +{ + int status; + struct mbox_params *mbcp = &qdev->idc_mbc; + mbcp->out_count = 4; + status = ql_get_mb_sts(qdev, mbcp); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Could not read MPI, resetting RISC!\n"); + ql_queue_fw_error(qdev); + } else + /* Wake up the sleeping mpi_idc_work thread that is + * waiting for this event. 
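/*
 * Illustrative sketch, not from the patch itself: the ql_own_firmware()
 * rule above, restated standalone.  A function owns recovery either
 * because it is the lower of the two NIC functions or because the lower
 * one is not enabled; the enable bit position (8 + alt_func) is taken
 * from the code above.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool owns_firmware(unsigned int func, unsigned int alt_func,
                          uint32_t sts)
{
        if (func < alt_func)
                return true;
        return !(sts & (1u << (8 + alt_func)));
}

int main(void)
{
        printf("%d\n", owns_firmware(2, 0, 0x00000000)); /* peer disabled -> 1 */
        printf("%d\n", owns_firmware(2, 0, 0x00000100)); /* peer enabled  -> 0 */
        return 0;
}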
+ */ + complete(&qdev->ide_completion); + + return status; +} + +static void ql_link_up(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int status; + mbcp->out_count = 2; + + status = ql_get_mb_sts(qdev, mbcp); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "%s: Could not get mailbox status.\n", __func__); + return; + } + + qdev->link_status = mbcp->mbox_out[1]; + netif_err(qdev, drv, qdev->ndev, "Link Up.\n"); + + /* If we're coming back from an IDC event + * then set up the CAM and frame routing. + */ + if (test_bit(QL_CAM_RT_SET, &qdev->flags)) { + status = ql_cam_route_initialize(qdev); + if (status) { + netif_err(qdev, ifup, qdev->ndev, + "Failed to init CAM/Routing tables.\n"); + return; + } else + clear_bit(QL_CAM_RT_SET, &qdev->flags); + } + + /* Queue up a worker to check the frame + * size information, and fix it if it's not + * to our liking. + */ + if (!test_bit(QL_PORT_CFG, &qdev->flags)) { + netif_err(qdev, drv, qdev->ndev, "Queue Port Config Worker!\n"); + set_bit(QL_PORT_CFG, &qdev->flags); + /* Begin polled mode early so + * we don't get another interrupt + * when we leave mpi_worker dpc. + */ + ql_write32(qdev, INTR_MASK, (INTR_MASK_PI << 16)); + queue_delayed_work(qdev->workqueue, + &qdev->mpi_port_cfg_work, 0); + } + + ql_link_on(qdev); +} + +static void ql_link_down(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int status; + + mbcp->out_count = 3; + + status = ql_get_mb_sts(qdev, mbcp); + if (status) + netif_err(qdev, drv, qdev->ndev, "Link down AEN broken!\n"); + + ql_link_off(qdev); +} + +static int ql_sfp_in(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int status; + + mbcp->out_count = 5; + + status = ql_get_mb_sts(qdev, mbcp); + if (status) + netif_err(qdev, drv, qdev->ndev, "SFP in AEN broken!\n"); + else + netif_err(qdev, drv, qdev->ndev, "SFP insertion detected.\n"); + + return status; +} + +static int ql_sfp_out(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int status; + + mbcp->out_count = 1; + + status = ql_get_mb_sts(qdev, mbcp); + if (status) + netif_err(qdev, drv, qdev->ndev, "SFP out AEN broken!\n"); + else + netif_err(qdev, drv, qdev->ndev, "SFP removal detected.\n"); + + return status; +} + +static int ql_aen_lost(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int status; + + mbcp->out_count = 6; + + status = ql_get_mb_sts(qdev, mbcp); + if (status) + netif_err(qdev, drv, qdev->ndev, "Lost AEN broken!\n"); + else { + int i; + netif_err(qdev, drv, qdev->ndev, "Lost AEN detected.\n"); + for (i = 0; i < mbcp->out_count; i++) + netif_err(qdev, drv, qdev->ndev, "mbox_out[%d] = 0x%.08x.\n", + i, mbcp->mbox_out[i]); + + } + + return status; +} + +static void ql_init_fw_done(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int status; + + mbcp->out_count = 2; + + status = ql_get_mb_sts(qdev, mbcp); + if (status) { + netif_err(qdev, drv, qdev->ndev, "Firmware did not initialize!\n"); + } else { + netif_err(qdev, drv, qdev->ndev, "Firmware Revision = 0x%.08x.\n", + mbcp->mbox_out[1]); + qdev->fw_rev_id = mbcp->mbox_out[1]; + status = ql_cam_route_initialize(qdev); + if (status) + netif_err(qdev, ifup, qdev->ndev, + "Failed to init CAM/Routing tables.\n"); + } +} + +/* Process an async event and clear it unless it's an + * error condition. + * This can get called iteratively from the mpi_work thread + * when events arrive via an interrupt. + * It also gets called when a mailbox command is polling for + * it's completion. 
*/ +static int ql_mpi_handler(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int status; + int orig_count = mbcp->out_count; + + /* Just get mailbox zero for now. */ + mbcp->out_count = 1; + status = ql_get_mb_sts(qdev, mbcp); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Could not read MPI, resetting ASIC!\n"); + ql_queue_asic_error(qdev); + goto end; + } + + switch (mbcp->mbox_out[0]) { + + /* This case is only active when we arrive here + * as a result of issuing a mailbox command to + * the firmware. + */ + case MB_CMD_STS_INTRMDT: + case MB_CMD_STS_GOOD: + case MB_CMD_STS_INVLD_CMD: + case MB_CMD_STS_XFC_ERR: + case MB_CMD_STS_CSUM_ERR: + case MB_CMD_STS_ERR: + case MB_CMD_STS_PARAM_ERR: + /* We can only get mailbox status if we're polling from an + * unfinished command. Get the rest of the status data and + * return back to the caller. + * We only end up here when we're polling for a mailbox + * command completion. + */ + mbcp->out_count = orig_count; + status = ql_get_mb_sts(qdev, mbcp); + return status; + + /* We are being asked by firmware to accept + * a change to the port. This is only + * a change to max frame sizes (Tx/Rx), pause + * parameters, or loopback mode. + */ + case AEN_IDC_REQ: + status = ql_idc_req_aen(qdev); + break; + + /* Process and inbound IDC event. + * This will happen when we're trying to + * change tx/rx max frame size, change pause + * parameters or loopback mode. + */ + case AEN_IDC_CMPLT: + case AEN_IDC_EXT: + status = ql_idc_cmplt_aen(qdev); + break; + + case AEN_LINK_UP: + ql_link_up(qdev, mbcp); + break; + + case AEN_LINK_DOWN: + ql_link_down(qdev, mbcp); + break; + + case AEN_FW_INIT_DONE: + /* If we're in process on executing the firmware, + * then convert the status to normal mailbox status. + */ + if (mbcp->mbox_in[0] == MB_CMD_EX_FW) { + mbcp->out_count = orig_count; + status = ql_get_mb_sts(qdev, mbcp); + mbcp->mbox_out[0] = MB_CMD_STS_GOOD; + return status; + } + ql_init_fw_done(qdev, mbcp); + break; + + case AEN_AEN_SFP_IN: + ql_sfp_in(qdev, mbcp); + break; + + case AEN_AEN_SFP_OUT: + ql_sfp_out(qdev, mbcp); + break; + + /* This event can arrive at boot time or after an + * MPI reset if the firmware failed to initialize. + */ + case AEN_FW_INIT_FAIL: + /* If we're in process on executing the firmware, + * then convert the status to normal mailbox status. + */ + if (mbcp->mbox_in[0] == MB_CMD_EX_FW) { + mbcp->out_count = orig_count; + status = ql_get_mb_sts(qdev, mbcp); + mbcp->mbox_out[0] = MB_CMD_STS_ERR; + return status; + } + netif_err(qdev, drv, qdev->ndev, + "Firmware initialization failed.\n"); + status = -EIO; + ql_queue_fw_error(qdev); + break; + + case AEN_SYS_ERR: + netif_err(qdev, drv, qdev->ndev, "System Error.\n"); + ql_queue_fw_error(qdev); + status = -EIO; + break; + + case AEN_AEN_LOST: + ql_aen_lost(qdev, mbcp); + break; + + case AEN_DCBX_CHG: + /* Need to support AEN 8110 */ + break; + default: + netif_err(qdev, drv, qdev->ndev, + "Unsupported AE %.08x.\n", mbcp->mbox_out[0]); + /* Clear the MPI firmware status. */ + } +end: + ql_write32(qdev, CSR, CSR_CMD_CLR_R2PCI_INT); + /* Restore the original mailbox count to + * what the caller asked for. This can get + * changed when a mailbox command is waiting + * for a response and an AEN arrives and + * is handled. + * */ + mbcp->out_count = orig_count; + return status; +} + +/* Execute a single mailbox command. + * mbcp is a pointer to an array of u32. Each + * element in the array contains the value for it's + * respective mailbox register. 
+ */ +static int ql_mailbox_command(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int status; + unsigned long count; + + mutex_lock(&qdev->mpi_mutex); + + /* Begin polled mode for MPI */ + ql_write32(qdev, INTR_MASK, (INTR_MASK_PI << 16)); + + /* Load the mailbox registers and wake up MPI RISC. */ + status = ql_exec_mb_cmd(qdev, mbcp); + if (status) + goto end; + + + /* If we're generating a system error, then there's nothing + * to wait for. + */ + if (mbcp->mbox_in[0] == MB_CMD_MAKE_SYS_ERR) + goto end; + + /* Wait for the command to complete. We loop + * here because some AEN might arrive while + * we're waiting for the mailbox command to + * complete. If more than 5 seconds expire we can + * assume something is wrong. */ + count = jiffies + HZ * MAILBOX_TIMEOUT; + do { + /* Wait for the interrupt to come in. */ + status = ql_wait_mbx_cmd_cmplt(qdev); + if (status) + continue; + + /* Process the event. If it's an AEN, it + * will be handled in-line or a worker + * will be spawned. If it's our completion + * we will catch it below. + */ + status = ql_mpi_handler(qdev, mbcp); + if (status) + goto end; + + /* It's either the completion for our mailbox + * command complete or an AEN. If it's our + * completion then get out. + */ + if (((mbcp->mbox_out[0] & 0x0000f000) == + MB_CMD_STS_GOOD) || + ((mbcp->mbox_out[0] & 0x0000f000) == + MB_CMD_STS_INTRMDT)) + goto done; + } while (time_before(jiffies, count)); + + netif_err(qdev, drv, qdev->ndev, + "Timed out waiting for mailbox complete.\n"); + status = -ETIMEDOUT; + goto end; + +done: + + /* Now we can clear the interrupt condition + * and look at our status. + */ + ql_write32(qdev, CSR, CSR_CMD_CLR_R2PCI_INT); + + if (((mbcp->mbox_out[0] & 0x0000f000) != + MB_CMD_STS_GOOD) && + ((mbcp->mbox_out[0] & 0x0000f000) != + MB_CMD_STS_INTRMDT)) { + status = -EIO; + } +end: + /* End polled mode for MPI */ + ql_write32(qdev, INTR_MASK, (INTR_MASK_PI << 16) | INTR_MASK_PI); + mutex_unlock(&qdev->mpi_mutex); + return status; +} + +/* Get MPI firmware version. This will be used for + * driver banner and for ethtool info. + * Returns zero on success. + */ +int ql_mb_about_fw(struct ql_adapter *qdev) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status = 0; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 1; + mbcp->out_count = 3; + + mbcp->mbox_in[0] = MB_CMD_ABOUT_FW; + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] != MB_CMD_STS_GOOD) { + netif_err(qdev, drv, qdev->ndev, + "Failed about firmware command\n"); + status = -EIO; + } + + /* Store the firmware version */ + qdev->fw_rev_id = mbcp->mbox_out[1]; + + return status; +} + +/* Get functional state for MPI firmware. + * Returns zero on success. + */ +int ql_mb_get_fw_state(struct ql_adapter *qdev) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status = 0; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 1; + mbcp->out_count = 2; + + mbcp->mbox_in[0] = MB_CMD_GET_FW_STATE; + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] != MB_CMD_STS_GOOD) { + netif_err(qdev, drv, qdev->ndev, + "Failed Get Firmware State.\n"); + status = -EIO; + } + + /* If bit zero is set in mbx 1 then the firmware is + * running, but not initialized. This should never + * happen. 
+ */ + if (mbcp->mbox_out[1] & 1) { + netif_err(qdev, drv, qdev->ndev, + "Firmware waiting for initialization.\n"); + status = -EIO; + } + + return status; +} + +/* Send and ACK mailbox command to the firmware to + * let it continue with the change. + */ +static int ql_mb_idc_ack(struct ql_adapter *qdev) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status = 0; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 5; + mbcp->out_count = 1; + + mbcp->mbox_in[0] = MB_CMD_IDC_ACK; + mbcp->mbox_in[1] = qdev->idc_mbc.mbox_out[1]; + mbcp->mbox_in[2] = qdev->idc_mbc.mbox_out[2]; + mbcp->mbox_in[3] = qdev->idc_mbc.mbox_out[3]; + mbcp->mbox_in[4] = qdev->idc_mbc.mbox_out[4]; + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] != MB_CMD_STS_GOOD) { + netif_err(qdev, drv, qdev->ndev, "Failed IDC ACK send.\n"); + status = -EIO; + } + return status; +} + +/* Get link settings and maximum frame size settings + * for the current port. + * Most likely will block. + */ +int ql_mb_set_port_cfg(struct ql_adapter *qdev) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status = 0; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 3; + mbcp->out_count = 1; + + mbcp->mbox_in[0] = MB_CMD_SET_PORT_CFG; + mbcp->mbox_in[1] = qdev->link_config; + mbcp->mbox_in[2] = qdev->max_frame_size; + + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] == MB_CMD_STS_INTRMDT) { + netif_err(qdev, drv, qdev->ndev, + "Port Config sent, wait for IDC.\n"); + } else if (mbcp->mbox_out[0] != MB_CMD_STS_GOOD) { + netif_err(qdev, drv, qdev->ndev, + "Failed Set Port Configuration.\n"); + status = -EIO; + } + return status; +} + +static int ql_mb_dump_ram(struct ql_adapter *qdev, u64 req_dma, u32 addr, + u32 size) +{ + int status = 0; + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 9; + mbcp->out_count = 1; + + mbcp->mbox_in[0] = MB_CMD_DUMP_RISC_RAM; + mbcp->mbox_in[1] = LSW(addr); + mbcp->mbox_in[2] = MSW(req_dma); + mbcp->mbox_in[3] = LSW(req_dma); + mbcp->mbox_in[4] = MSW(size); + mbcp->mbox_in[5] = LSW(size); + mbcp->mbox_in[6] = MSW(MSD(req_dma)); + mbcp->mbox_in[7] = LSW(MSD(req_dma)); + mbcp->mbox_in[8] = MSW(addr); + + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] != MB_CMD_STS_GOOD) { + netif_err(qdev, drv, qdev->ndev, "Failed to dump risc RAM.\n"); + status = -EIO; + } + return status; +} + +/* Issue a mailbox command to dump RISC RAM. */ +int ql_dump_risc_ram_area(struct ql_adapter *qdev, void *buf, + u32 ram_addr, int word_count) +{ + int status; + char *my_buf; + dma_addr_t buf_dma; + + my_buf = pci_alloc_consistent(qdev->pdev, word_count * sizeof(u32), + &buf_dma); + if (!my_buf) + return -EIO; + + status = ql_mb_dump_ram(qdev, buf_dma, ram_addr, word_count); + if (!status) + memcpy(buf, my_buf, word_count * sizeof(u32)); + + pci_free_consistent(qdev->pdev, word_count * sizeof(u32), my_buf, + buf_dma); + return status; +} + +/* Get link settings and maximum frame size settings + * for the current port. + * Most likely will block. 
+ */ +int ql_mb_get_port_cfg(struct ql_adapter *qdev) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status = 0; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 1; + mbcp->out_count = 3; + + mbcp->mbox_in[0] = MB_CMD_GET_PORT_CFG; + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] != MB_CMD_STS_GOOD) { + netif_err(qdev, drv, qdev->ndev, + "Failed Get Port Configuration.\n"); + status = -EIO; + } else { + netif_printk(qdev, drv, KERN_DEBUG, qdev->ndev, + "Passed Get Port Configuration.\n"); + qdev->link_config = mbcp->mbox_out[1]; + qdev->max_frame_size = mbcp->mbox_out[2]; + } + return status; +} + +int ql_mb_wol_mode(struct ql_adapter *qdev, u32 wol) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 2; + mbcp->out_count = 1; + + mbcp->mbox_in[0] = MB_CMD_SET_WOL_MODE; + mbcp->mbox_in[1] = wol; + + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] != MB_CMD_STS_GOOD) { + netif_err(qdev, drv, qdev->ndev, "Failed to set WOL mode.\n"); + status = -EIO; + } + return status; +} + +int ql_mb_wol_set_magic(struct ql_adapter *qdev, u32 enable_wol) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status; + u8 *addr = qdev->ndev->dev_addr; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 8; + mbcp->out_count = 1; + + mbcp->mbox_in[0] = MB_CMD_SET_WOL_MAGIC; + if (enable_wol) { + mbcp->mbox_in[1] = (u32)addr[0]; + mbcp->mbox_in[2] = (u32)addr[1]; + mbcp->mbox_in[3] = (u32)addr[2]; + mbcp->mbox_in[4] = (u32)addr[3]; + mbcp->mbox_in[5] = (u32)addr[4]; + mbcp->mbox_in[6] = (u32)addr[5]; + mbcp->mbox_in[7] = 0; + } else { + mbcp->mbox_in[1] = 0; + mbcp->mbox_in[2] = 1; + mbcp->mbox_in[3] = 1; + mbcp->mbox_in[4] = 1; + mbcp->mbox_in[5] = 1; + mbcp->mbox_in[6] = 1; + mbcp->mbox_in[7] = 0; + } + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] != MB_CMD_STS_GOOD) { + netif_err(qdev, drv, qdev->ndev, "Failed to set WOL mode.\n"); + status = -EIO; + } + return status; +} + +/* IDC - Inter Device Communication... + * Some firmware commands require consent of adjacent FCOE + * function. This function waits for the OK, or a + * counter-request for a little more time.i + * The firmware will complete the request if the other + * function doesn't respond. + */ +static int ql_idc_wait(struct ql_adapter *qdev) +{ + int status = -ETIMEDOUT; + long wait_time = 1 * HZ; + struct mbox_params *mbcp = &qdev->idc_mbc; + do { + /* Wait here for the command to complete + * via the IDC process. + */ + wait_time = + wait_for_completion_timeout(&qdev->ide_completion, + wait_time); + if (!wait_time) { + netif_err(qdev, drv, qdev->ndev, "IDC Timeout.\n"); + break; + } + /* Now examine the response from the IDC process. + * We might have a good completion or a request for + * more wait time. 
+ */ + if (mbcp->mbox_out[0] == AEN_IDC_EXT) { + netif_err(qdev, drv, qdev->ndev, + "IDC Time Extension from function.\n"); + wait_time += (mbcp->mbox_out[1] >> 8) & 0x0000000f; + } else if (mbcp->mbox_out[0] == AEN_IDC_CMPLT) { + netif_err(qdev, drv, qdev->ndev, "IDC Success.\n"); + status = 0; + break; + } else { + netif_err(qdev, drv, qdev->ndev, + "IDC: Invalid State 0x%.04x.\n", + mbcp->mbox_out[0]); + status = -EIO; + break; + } + } while (wait_time); + + return status; +} + +int ql_mb_set_led_cfg(struct ql_adapter *qdev, u32 led_config) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 2; + mbcp->out_count = 1; + + mbcp->mbox_in[0] = MB_CMD_SET_LED_CFG; + mbcp->mbox_in[1] = led_config; + + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] != MB_CMD_STS_GOOD) { + netif_err(qdev, drv, qdev->ndev, + "Failed to set LED Configuration.\n"); + status = -EIO; + } + + return status; +} + +int ql_mb_get_led_cfg(struct ql_adapter *qdev) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 1; + mbcp->out_count = 2; + + mbcp->mbox_in[0] = MB_CMD_GET_LED_CFG; + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] != MB_CMD_STS_GOOD) { + netif_err(qdev, drv, qdev->ndev, + "Failed to get LED Configuration.\n"); + status = -EIO; + } else + qdev->led_config = mbcp->mbox_out[1]; + + return status; +} + +int ql_mb_set_mgmnt_traffic_ctl(struct ql_adapter *qdev, u32 control) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status; + + memset(mbcp, 0, sizeof(struct mbox_params)); + + mbcp->in_count = 1; + mbcp->out_count = 2; + + mbcp->mbox_in[0] = MB_CMD_SET_MGMNT_TFK_CTL; + mbcp->mbox_in[1] = control; + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] == MB_CMD_STS_GOOD) + return status; + + if (mbcp->mbox_out[0] == MB_CMD_STS_INVLD_CMD) { + netif_err(qdev, drv, qdev->ndev, + "Command not supported by firmware.\n"); + status = -EINVAL; + } else if (mbcp->mbox_out[0] == MB_CMD_STS_ERR) { + /* This indicates that the firmware is + * already in the state we are trying to + * change it to. + */ + netif_err(qdev, drv, qdev->ndev, + "Command parameters make no change.\n"); + } + return status; +} + +/* Returns a negative error code or the mailbox command status. 
*/ +static int ql_mb_get_mgmnt_traffic_ctl(struct ql_adapter *qdev, u32 *control) +{ + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int status; + + memset(mbcp, 0, sizeof(struct mbox_params)); + *control = 0; + + mbcp->in_count = 1; + mbcp->out_count = 1; + + mbcp->mbox_in[0] = MB_CMD_GET_MGMNT_TFK_CTL; + + status = ql_mailbox_command(qdev, mbcp); + if (status) + return status; + + if (mbcp->mbox_out[0] == MB_CMD_STS_GOOD) { + *control = mbcp->mbox_in[1]; + return status; + } + + if (mbcp->mbox_out[0] == MB_CMD_STS_INVLD_CMD) { + netif_err(qdev, drv, qdev->ndev, + "Command not supported by firmware.\n"); + status = -EINVAL; + } else if (mbcp->mbox_out[0] == MB_CMD_STS_ERR) { + netif_err(qdev, drv, qdev->ndev, + "Failed to get MPI traffic control.\n"); + status = -EIO; + } + return status; +} + +int ql_wait_fifo_empty(struct ql_adapter *qdev) +{ + int count = 5; + u32 mgmnt_fifo_empty; + u32 nic_fifo_empty; + + do { + nic_fifo_empty = ql_read32(qdev, STS) & STS_NFE; + ql_mb_get_mgmnt_traffic_ctl(qdev, &mgmnt_fifo_empty); + mgmnt_fifo_empty &= MB_GET_MPI_TFK_FIFO_EMPTY; + if (nic_fifo_empty && mgmnt_fifo_empty) + return 0; + msleep(100); + } while (count-- > 0); + return -ETIMEDOUT; +} + +/* API called in work thread context to set new TX/RX + * maximum frame size values to match MTU. + */ +static int ql_set_port_cfg(struct ql_adapter *qdev) +{ + int status; + status = ql_mb_set_port_cfg(qdev); + if (status) + return status; + status = ql_idc_wait(qdev); + return status; +} + +/* The following routines are worker threads that process + * events that may sleep waiting for completion. + */ + +/* This thread gets the maximum TX and RX frame size values + * from the firmware and, if necessary, changes them to match + * the MTU setting. + */ +void ql_mpi_port_cfg_work(struct work_struct *work) +{ + struct ql_adapter *qdev = + container_of(work, struct ql_adapter, mpi_port_cfg_work.work); + int status; + + status = ql_mb_get_port_cfg(qdev); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Bug: Failed to get port config data.\n"); + goto err; + } + + if (qdev->link_config & CFG_JUMBO_FRAME_SIZE && + qdev->max_frame_size == + CFG_DEFAULT_MAX_FRAME_SIZE) + goto end; + + qdev->link_config |= CFG_JUMBO_FRAME_SIZE; + qdev->max_frame_size = CFG_DEFAULT_MAX_FRAME_SIZE; + status = ql_set_port_cfg(qdev); + if (status) { + netif_err(qdev, drv, qdev->ndev, + "Bug: Failed to set port config data.\n"); + goto err; + } +end: + clear_bit(QL_PORT_CFG, &qdev->flags); + return; +err: + ql_queue_fw_error(qdev); + goto end; +} + +/* Process an inter-device request. This is issues by + * the firmware in response to another function requesting + * a change to the port. We set a flag to indicate a change + * has been made and then send a mailbox command ACKing + * the change request. + */ +void ql_mpi_idc_work(struct work_struct *work) +{ + struct ql_adapter *qdev = + container_of(work, struct ql_adapter, mpi_idc_work.work); + int status; + struct mbox_params *mbcp = &qdev->idc_mbc; + u32 aen; + int timeout; + + aen = mbcp->mbox_out[1] >> 16; + timeout = (mbcp->mbox_out[1] >> 8) & 0xf; + + switch (aen) { + default: + netif_err(qdev, drv, qdev->ndev, + "Bug: Unhandled IDC action.\n"); + break; + case MB_CMD_PORT_RESET: + case MB_CMD_STOP_FW: + ql_link_off(qdev); + case MB_CMD_SET_PORT_CFG: + /* Signal the resulting link up AEN + * that the frame routing and mac addr + * needs to be set. 
+ * */ + set_bit(QL_CAM_RT_SET, &qdev->flags); + /* Do ACK if required */ + if (timeout) { + status = ql_mb_idc_ack(qdev); + if (status) + netif_err(qdev, drv, qdev->ndev, + "Bug: No pending IDC!\n"); + } else { + netif_printk(qdev, drv, KERN_DEBUG, qdev->ndev, + "IDC ACK not required\n"); + status = 0; /* success */ + } + break; + + /* These sub-commands issued by another (FCoE) + * function are requesting to do an operation + * on the shared resource (MPI environment). + * We currently don't issue these so we just + * ACK the request. + */ + case MB_CMD_IOP_RESTART_MPI: + case MB_CMD_IOP_PREP_LINK_DOWN: + /* Drop the link, reload the routing + * table when link comes up. + */ + ql_link_off(qdev); + set_bit(QL_CAM_RT_SET, &qdev->flags); + /* Fall through. */ + case MB_CMD_IOP_DVR_START: + case MB_CMD_IOP_FLASH_ACC: + case MB_CMD_IOP_CORE_DUMP_MPI: + case MB_CMD_IOP_PREP_UPDATE_MPI: + case MB_CMD_IOP_COMP_UPDATE_MPI: + case MB_CMD_IOP_NONE: /* an IDC without params */ + /* Do ACK if required */ + if (timeout) { + status = ql_mb_idc_ack(qdev); + if (status) + netif_err(qdev, drv, qdev->ndev, + "Bug: No pending IDC!\n"); + } else { + netif_printk(qdev, drv, KERN_DEBUG, qdev->ndev, + "IDC ACK not required\n"); + status = 0; /* success */ + } + break; + } +} + +void ql_mpi_work(struct work_struct *work) +{ + struct ql_adapter *qdev = + container_of(work, struct ql_adapter, mpi_work.work); + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + int err = 0; + + mutex_lock(&qdev->mpi_mutex); + /* Begin polled mode for MPI */ + ql_write32(qdev, INTR_MASK, (INTR_MASK_PI << 16)); + + while (ql_read32(qdev, STS) & STS_PI) { + memset(mbcp, 0, sizeof(struct mbox_params)); + mbcp->out_count = 1; + /* Don't continue if an async event + * did not complete properly. + */ + err = ql_mpi_handler(qdev, mbcp); + if (err) + break; + } + + /* End polled mode for MPI */ + ql_write32(qdev, INTR_MASK, (INTR_MASK_PI << 16) | INTR_MASK_PI); + mutex_unlock(&qdev->mpi_mutex); + ql_enable_completion_interrupt(qdev, 0); +} + +void ql_mpi_reset_work(struct work_struct *work) +{ + struct ql_adapter *qdev = + container_of(work, struct ql_adapter, mpi_reset_work.work); + cancel_delayed_work_sync(&qdev->mpi_work); + cancel_delayed_work_sync(&qdev->mpi_port_cfg_work); + cancel_delayed_work_sync(&qdev->mpi_idc_work); + /* If we're not the dominant NIC function, + * then there is nothing to do. 
+ */ + if (!ql_own_firmware(qdev)) { + netif_err(qdev, drv, qdev->ndev, "Don't own firmware!\n"); + return; + } + + if (qdev->mpi_coredump && !ql_core_dump(qdev, qdev->mpi_coredump)) { + netif_err(qdev, drv, qdev->ndev, "Core is dumped!\n"); + qdev->core_is_dumped = 1; + queue_delayed_work(qdev->workqueue, + &qdev->mpi_core_to_log, 5 * HZ); + } + ql_soft_reset_mpi_risc(qdev); +} diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig new file mode 100644 index 0000000..04008e0 --- /dev/null +++ b/drivers/nvme/Kconfig @@ -0,0 +1,6 @@ +menu "NVME Support" + +source "drivers/nvme/host/Kconfig" +source "drivers/nvme/target/Kconfig" + +endmenu diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile new file mode 100644 index 0000000..0096a7f --- /dev/null +++ b/drivers/nvme/Makefile @@ -0,0 +1,3 @@ + +obj-y += host/ +obj-y += target/ diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig new file mode 100644 index 0000000..88a8b59 --- /dev/null +++ b/drivers/nvme/host/Kconfig @@ -0,0 +1,59 @@ +config NVME_CORE + tristate + +config BLK_DEV_NVME + tristate "NVM Express block device" + depends on PCI && BLOCK + select NVME_CORE + ---help--- + The NVM Express driver is for solid state drives directly + connected to the PCI or PCI Express bus. If you know you + don't have one of these, it is safe to answer N. + + To compile this driver as a module, choose M here: the + module will be called nvme. + +config NVME_MULTIPATH + bool "NVMe multipath support" + depends on NVME_CORE + ---help--- + This option enables support for multipath access to NVMe + subsystems. If this option is enabled only a single + /dev/nvmeXnY device will show up for each NVMe namespaces, + even if it is accessible through multiple controllers. + +config NVME_FABRICS + tristate + +config NVME_RDMA + tristate "NVM Express over Fabrics RDMA host driver" + depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK + select NVME_CORE + select NVME_FABRICS + select SG_POOL + help + This provides support for the NVMe over Fabrics protocol using + the RDMA (Infiniband, RoCE, iWarp) transport. This allows you + to use remote block devices exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. + +config NVME_FC + tristate "NVM Express over Fabrics FC host driver" + depends on BLOCK + depends on HAS_DMA + select NVME_CORE + select NVME_FABRICS + select SG_POOL + help + This provides support for the NVMe over Fabrics protocol using + the FC transport. This allows you to use remote block devices + exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. 
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile new file mode 100644 index 0000000..aea459c --- /dev/null +++ b/drivers/nvme/host/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y += -I$(src) + +obj-$(CONFIG_NVME_CORE) += nvme-core.o +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o +obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o +obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o +obj-$(CONFIG_NVME_FC) += nvme-fc.o + +nvme-core-y := core.o +nvme-core-$(CONFIG_TRACING) += trace.o +nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o +nvme-core-$(CONFIG_NVM) += lightnvm.o +nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o + +nvme-y += pci.o + +nvme-fabrics-y += fabrics.o + +nvme-rdma-y += rdma.o + +nvme-fc-y += fc.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c new file mode 100644 index 0000000..b9ca782 --- /dev/null +++ b/drivers/nvme/host/core.c @@ -0,0 +1,3651 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "trace.h" + +#include "nvme.h" +#include "fabrics.h" + +#define NVME_MINORS (1U << MINORBITS) + +unsigned int admin_timeout = 60; +module_param(admin_timeout, uint, 0644); +MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); +EXPORT_SYMBOL_GPL(admin_timeout); + +unsigned int nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, uint, 0644); +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); +EXPORT_SYMBOL_GPL(nvme_io_timeout); + +static unsigned char shutdown_timeout = 5; +module_param(shutdown_timeout, byte, 0644); +MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); + +static u8 nvme_max_retries = 5; +module_param_named(max_retries, nvme_max_retries, byte, 0644); +MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); + +static unsigned long default_ps_max_latency_us = 100000; +module_param(default_ps_max_latency_us, ulong, 0644); +MODULE_PARM_DESC(default_ps_max_latency_us, + "max power saving latency for new devices; use PM QOS to change per device"); + +static bool force_apst; +module_param(force_apst, bool, 0644); +MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off"); + +static bool streams; +module_param(streams, bool, 0644); +MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); + +/* + * nvme_wq - hosts nvme related works that are not reset or delete + * nvme_reset_wq - hosts nvme reset works + * nvme_delete_wq - hosts nvme delete works + * + * nvme_wq will host works such are scan, aen handling, fw activation, + * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq + * runs reset works which also flush works hosted on nvme_wq for + * serialization purposes. nvme_delete_wq host controller deletion + * works which flush reset works for serialization. 
+ */ +struct workqueue_struct *nvme_wq; +EXPORT_SYMBOL_GPL(nvme_wq); + +struct workqueue_struct *nvme_reset_wq; +EXPORT_SYMBOL_GPL(nvme_reset_wq); + +struct workqueue_struct *nvme_delete_wq; +EXPORT_SYMBOL_GPL(nvme_delete_wq); + +static DEFINE_IDA(nvme_subsystems_ida); +static LIST_HEAD(nvme_subsystems); +static DEFINE_MUTEX(nvme_subsystems_lock); + +static DEFINE_IDA(nvme_instance_ida); +static dev_t nvme_chr_devt; +static struct class *nvme_class; +static struct class *nvme_subsys_class; + +static void nvme_ns_remove(struct nvme_ns *ns); +static int nvme_revalidate_disk(struct gendisk *disk); +static void nvme_put_subsystem(struct nvme_subsystem *subsys); + +int nvme_reset_ctrl(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_reset_ctrl); + +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = nvme_reset_ctrl(ctrl); + if (!ret) { + flush_work(&ctrl->reset_work); + if (ctrl->state != NVME_CTRL_LIVE && + ctrl->state != NVME_CTRL_ADMIN_ONLY) + ret = -ENETRESET; + } + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); + +static void nvme_delete_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, delete_work); + + dev_info(ctrl->device, + "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn); + + flush_work(&ctrl->reset_work); + nvme_stop_ctrl(ctrl); + nvme_remove_namespaces(ctrl); + ctrl->ops->delete_ctrl(ctrl); + nvme_uninit_ctrl(ctrl); + nvme_put_ctrl(ctrl); +} + +int nvme_delete_ctrl(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + if (!queue_work(nvme_delete_wq, &ctrl->delete_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_delete_ctrl); + +int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +{ + int ret = 0; + + /* + * Keep a reference until the work is flushed since ->delete_ctrl + * can free the controller. 
+ */ + nvme_get_ctrl(ctrl); + ret = nvme_delete_ctrl(ctrl); + if (!ret) + flush_work(&ctrl->delete_work); + nvme_put_ctrl(ctrl); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); + +static inline bool nvme_ns_has_pi(struct nvme_ns *ns) +{ + return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); +} + +static blk_status_t nvme_error_status(struct request *req) +{ + switch (nvme_req(req)->status & 0x7ff) { + case NVME_SC_SUCCESS: + return BLK_STS_OK; + case NVME_SC_CAP_EXCEEDED: + return BLK_STS_NOSPC; + case NVME_SC_LBA_RANGE: + return BLK_STS_TARGET; + case NVME_SC_BAD_ATTRIBUTES: + case NVME_SC_ONCS_NOT_SUPPORTED: + case NVME_SC_INVALID_OPCODE: + case NVME_SC_INVALID_FIELD: + case NVME_SC_INVALID_NS: + return BLK_STS_NOTSUPP; + case NVME_SC_WRITE_FAULT: + case NVME_SC_READ_ERROR: + case NVME_SC_UNWRITTEN_BLOCK: + case NVME_SC_ACCESS_DENIED: + case NVME_SC_READ_ONLY: + case NVME_SC_COMPARE_FAILED: + return BLK_STS_MEDIUM; + case NVME_SC_GUARD_CHECK: + case NVME_SC_APPTAG_CHECK: + case NVME_SC_REFTAG_CHECK: + case NVME_SC_INVALID_PI: + return BLK_STS_PROTECTION; + case NVME_SC_RESERVATION_CONFLICT: + return BLK_STS_NEXUS; + default: + return BLK_STS_IOERR; + } +} + +static inline bool nvme_req_needs_retry(struct request *req) +{ + if (blk_noretry_request(req)) + return false; + if (nvme_req(req)->status & NVME_SC_DNR) + return false; + if (nvme_req(req)->retries >= nvme_max_retries) + return false; + return true; +} + +void nvme_complete_rq(struct request *req) +{ + blk_status_t status = nvme_error_status(req); + + trace_nvme_complete_rq(req); + + if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { + if (nvme_req_needs_failover(req, status)) { + nvme_failover_req(req); + return; + } + + if (!blk_queue_dying(req->q)) { + nvme_req(req)->retries++; + blk_mq_requeue_request(req, true); + return; + } + } + blk_mq_end_request(req, status); +} +EXPORT_SYMBOL_GPL(nvme_complete_rq); + +void nvme_cancel_request(struct request *req, void *data, bool reserved) +{ + if (!blk_mq_request_started(req)) + return; + + dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, + "Cancelling I/O %d", req->tag); + + nvme_req(req)->status = NVME_SC_ABORT_REQ; + blk_mq_complete_request(req); + +} +EXPORT_SYMBOL_GPL(nvme_cancel_request); + +bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state) +{ + enum nvme_ctrl_state old_state; + unsigned long flags; + bool changed = false; + + spin_lock_irqsave(&ctrl->lock, flags); + + old_state = ctrl->state; + switch (new_state) { + case NVME_CTRL_ADMIN_ONLY: + switch (old_state) { + case NVME_CTRL_CONNECTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_LIVE: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_RESETTING: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + case NVME_CTRL_ADMIN_ONLY: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_CONNECTING: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_RESETTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_DELETING: + switch (old_state) { + case NVME_CTRL_LIVE: + case NVME_CTRL_ADMIN_ONLY: + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_DEAD: + switch (old_state) { + case NVME_CTRL_DELETING: 
+ changed = true; + /* FALLTHRU */ + default: + break; + } + break; + default: + break; + } + + if (changed) + ctrl->state = new_state; + + spin_unlock_irqrestore(&ctrl->lock, flags); + if (changed && ctrl->state == NVME_CTRL_LIVE) + nvme_kick_requeue_lists(ctrl); + return changed; +} +EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); + +static void nvme_free_ns_head(struct kref *ref) +{ + struct nvme_ns_head *head = + container_of(ref, struct nvme_ns_head, ref); + + nvme_mpath_remove_disk(head); + ida_simple_remove(&head->subsys->ns_ida, head->instance); + list_del_init(&head->entry); + cleanup_srcu_struct(&head->srcu); + nvme_put_subsystem(head->subsys); + kfree(head); +} + +static void nvme_put_ns_head(struct nvme_ns_head *head) +{ + kref_put(&head->ref, nvme_free_ns_head); +} + +static void nvme_free_ns(struct kref *kref) +{ + struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); + + if (ns->ndev) + nvme_nvm_unregister(ns); + + put_disk(ns->disk); + nvme_put_ns_head(ns->head); + nvme_put_ctrl(ns->ctrl); + kfree(ns); +} + +static void nvme_put_ns(struct nvme_ns *ns) +{ + kref_put(&ns->kref, nvme_free_ns); +} + +static inline void nvme_clear_nvme_request(struct request *req) +{ + if (!(req->rq_flags & RQF_DONTPREP)) { + nvme_req(req)->retries = 0; + nvme_req(req)->flags = 0; + req->rq_flags |= RQF_DONTPREP; + } +} + +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) +{ + unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; + struct request *req; + + if (qid == NVME_QID_ANY) { + req = blk_mq_alloc_request(q, op, flags); + } else { + req = blk_mq_alloc_request_hctx(q, op, flags, + qid ? qid - 1 : 0); + } + if (IS_ERR(req)) + return req; + + req->cmd_flags |= REQ_FAILFAST_DRIVER; + nvme_clear_nvme_request(req); + nvme_req(req)->cmd = cmd; + + return req; +} +EXPORT_SYMBOL_GPL(nvme_alloc_request); + +static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + + c.directive.opcode = nvme_admin_directive_send; + c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); + c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; + c.directive.dtype = NVME_DIR_IDENTIFY; + c.directive.tdtype = NVME_DIR_STREAMS; + c.directive.endir = enable ? 
NVME_DIR_ENDIR : 0; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0); +} + +static int nvme_disable_streams(struct nvme_ctrl *ctrl) +{ + return nvme_toggle_streams(ctrl, false); +} + +static int nvme_enable_streams(struct nvme_ctrl *ctrl) +{ + return nvme_toggle_streams(ctrl, true); +} + +static int nvme_get_stream_params(struct nvme_ctrl *ctrl, + struct streams_directive_params *s, u32 nsid) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + memset(s, 0, sizeof(*s)); + + c.directive.opcode = nvme_admin_directive_recv; + c.directive.nsid = cpu_to_le32(nsid); + c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1); + c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; + c.directive.dtype = NVME_DIR_STREAMS; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s)); +} + +static int nvme_configure_directives(struct nvme_ctrl *ctrl) +{ + struct streams_directive_params s; + int ret; + + if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) + return 0; + if (!streams) + return 0; + + ret = nvme_enable_streams(ctrl); + if (ret) + return ret; + + ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); + if (ret) + return ret; + + ctrl->nssa = le16_to_cpu(s.nssa); + if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { + dev_info(ctrl->device, "too few streams (%u) available\n", + ctrl->nssa); + nvme_disable_streams(ctrl); + return 0; + } + + ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); + return 0; +} + +/* + * Check if 'req' has a write hint associated with it. If it does, assign + * a valid namespace stream to the write. + */ +static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, + struct request *req, u16 *control, + u32 *dsmgmt) +{ + enum rw_hint streamid = req->write_hint; + + if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE) + streamid = 0; + else { + streamid--; + if (WARN_ON_ONCE(streamid > ctrl->nr_streams)) + return; + + *control |= NVME_RW_DTYPE_STREAMS; + *dsmgmt |= streamid << 16; + } + + if (streamid < ARRAY_SIZE(req->q->write_hints)) + req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; +} + +static inline void nvme_setup_flush(struct nvme_ns *ns, + struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); +} + +static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; + struct nvme_dsm_range *range; + struct bio *bio; + + range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); + if (!range) + return BLK_STS_RESOURCE; + + __rq_for_each_bio(bio, req) { + u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); + u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; + + if (n < segments) { + range[n].cattr = cpu_to_le32(0); + range[n].nlb = cpu_to_le32(nlb); + range[n].slba = cpu_to_le64(slba); + } + n++; + } + + if (WARN_ON_ONCE(n != segments)) { + kfree(range); + return BLK_STS_IOERR; + } + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); + cmnd->dsm.nr = cpu_to_le32(segments - 1); + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + req->special_vec.bv_page = virt_to_page(range); + req->special_vec.bv_offset = offset_in_page(range); + req->special_vec.bv_len = sizeof(*range) * segments; + req->rq_flags |= RQF_SPECIAL_PAYLOAD; + + return BLK_STS_OK; +} + +static inline 
blk_status_t nvme_setup_rw(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + u16 control = 0; + u32 dsmgmt = 0; + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + if (req->cmd_flags & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) + nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); + + if (ns->ms) { + /* + * If formated with metadata, the block layer always provides a + * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else + * we enable the PRACT bit for protection information or set the + * namespace capacity to zero to prevent any I/O. + */ + if (!blk_integrity_rq(req)) { + if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) + return BLK_STS_NOTSUPP; + control |= NVME_RW_PRINFO_PRACT; + } + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + } + + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + return 0; +} + +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmd) +{ + blk_status_t ret = BLK_STS_OK; + + nvme_clear_nvme_request(req); + + switch (req_op(req)) { + case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: + memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); + break; + case REQ_OP_FLUSH: + nvme_setup_flush(ns, cmd); + break; + case REQ_OP_WRITE_ZEROES: + /* currently only aliased to deallocate for a few ctrls: */ + case REQ_OP_DISCARD: + ret = nvme_setup_discard(ns, req, cmd); + break; + case REQ_OP_READ: + case REQ_OP_WRITE: + ret = nvme_setup_rw(ns, req, cmd); + break; + default: + WARN_ON_ONCE(1); + return BLK_STS_IOERR; + } + + cmd->common.command_id = req->tag; + if (ns) + trace_nvme_setup_nvm_cmd(req->q->id, cmd); + else + trace_nvme_setup_admin_cmd(cmd); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_setup_cmd); + +/* + * Returns 0 on success. If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + union nvme_result *result, void *buffer, unsigned bufflen, + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags) +{ + struct request *req; + int ret; + + req = nvme_alloc_request(q, cmd, flags, qid); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; + + if (buffer && bufflen) { + ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); + if (ret) + goto out; + } + + blk_execute_rq(req->q, NULL, req, at_head); + if (result) + *result = nvme_req(req)->result; + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + else + ret = nvme_req(req)->status; + out: + blk_mq_free_request(req); + return ret; +} +EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); + +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, + NVME_QID_ANY, 0, 0); +} +EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); + +static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, + unsigned len, u32 seed, bool write) +{ + struct bio_integrity_payload *bip; + int ret = -ENOMEM; + void *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + goto out; + + ret = -EFAULT; + if (write && copy_from_user(buf, ubuf, len)) + goto out_free_meta; + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = seed; + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret == len) + return buf; + ret = -ENOMEM; +out_free_meta: + kfree(buf); +out: + return ERR_PTR(ret); +} + +static int nvme_submit_user_cmd(struct request_queue *q, + struct nvme_command *cmd, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, u32 *result, unsigned timeout) +{ + bool write = nvme_is_write(cmd); + struct nvme_ns *ns = q->queuedata; + struct gendisk *disk = ns ? ns->disk : NULL; + struct request *req; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + + req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; + nvme_req(req)->flags |= NVME_REQ_USERCMD; + + if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + bio = req->bio; + bio->bi_disk = disk; + if (disk && meta_buffer && meta_len) { + meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, + meta_seed, write); + if (IS_ERR(meta)) { + ret = PTR_ERR(meta); + goto out_unmap; + } + req->cmd_flags |= REQ_INTEGRITY; + } + } + + blk_execute_rq(req->q, disk, req, 0); + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + else + ret = nvme_req(req)->status; + if (result) + *result = le32_to_cpu(nvme_req(req)->result.u32); + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + kfree(meta); + out_unmap: + if (bio) + blk_rq_unmap_user(bio); + out: + blk_mq_free_request(req); + return ret; +} + +static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) +{ + struct nvme_ctrl *ctrl = rq->end_io_data; + + blk_mq_free_request(rq); + + if (status) { + dev_err(ctrl->device, + "failed nvme_keep_alive_end_io error=%d\n", + status); + return; + } + + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +static int nvme_keep_alive(struct nvme_ctrl *ctrl) +{ + struct request *rq; + + rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED, + NVME_QID_ANY); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + rq->timeout = ctrl->kato * HZ; + rq->end_io_data = ctrl; + + blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); + + return 0; +} + +static void nvme_keep_alive_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_ctrl, ka_work); + + if (nvme_keep_alive(ctrl)) { + /* allocation failure, reset the controller */ + dev_err(ctrl->device, "keep-alive failed\n"); + nvme_reset_ctrl(ctrl); + return; + } +} + +static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); + ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + cancel_delayed_work_sync(&ctrl->ka_work); +} +EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); + +static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CTRL; + + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ctrl)); + if (error) + kfree(*id); + return error; +} + +static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, + struct nvme_ns_ids *ids) +{ + struct nvme_command c = { }; + int status; + void *data; + int pos; + int len; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.cns = NVME_ID_CNS_NS_DESC_LIST; + + data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; + + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, + NVME_IDENTIFY_DATA_SIZE); + if (status) + goto free_data; + + for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { + struct 
nvme_ns_id_desc *cur = data + pos; + + if (cur->nidl == 0) + break; + + switch (cur->nidt) { + case NVME_NIDT_EUI64: + if (cur->nidl != NVME_NIDT_EUI64_LEN) { + dev_warn(ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_EUI64_LEN; + memcpy(ids->eui64, data + pos + sizeof(*cur), len); + break; + case NVME_NIDT_NGUID: + if (cur->nidl != NVME_NIDT_NGUID_LEN) { + dev_warn(ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_NGUID_LEN; + memcpy(ids->nguid, data + pos + sizeof(*cur), len); + break; + case NVME_NIDT_UUID: + if (cur->nidl != NVME_NIDT_UUID_LEN) { + dev_warn(ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_UUID\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_UUID_LEN; + uuid_copy(&ids->uuid, data + pos + sizeof(*cur)); + break; + default: + /* Skip unnkown types */ + len = cur->nidl; + break; + } + + len += sizeof(*cur); + } +free_data: + kfree(data); + return status; +} + +static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) +{ + struct nvme_command c = { }; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST; + c.identify.nsid = cpu_to_le32(nsid); + return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, + NVME_IDENTIFY_DATA_SIZE); +} + +static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, + unsigned nsid) +{ + struct nvme_id_ns *id; + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.cns = NVME_ID_CNS_NS; + + id = kmalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NULL; + + error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (error) { + dev_warn(ctrl->device, "Identify namespace failed\n"); + kfree(id); + return NULL; + } + + return id; +} + +static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, + void *buffer, size_t buflen, u32 *result) +{ + struct nvme_command c; + union nvme_result res; + int ret; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, + buffer, buflen, 0, NVME_QID_ANY, 0, 0); + if (ret >= 0 && result) + *result = le32_to_cpu(res.u32); + return ret; +} + +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) +{ + u32 q_count = (*count - 1) | ((*count - 1) << 16); + u32 result; + int status, nr_io_queues; + + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, + &result); + if (status < 0) + return status; + + /* + * Degraded controllers might return an error when setting the queue + * count. We still want to be able to bring them online and offer + * access to the admin queue, as that might be only way to fix them up. 
+ */ + if (status > 0) { + dev_err(ctrl->device, "Could not set queue count (%d)\n", status); + *count = 0; + } else { + nr_io_queues = min(result & 0xffff, result >> 16) + 1; + *count = min(*count, nr_io_queues); + } + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_set_queue_count); + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_user_io io; + struct nvme_command c; + unsigned length, meta_len; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + if (io.flags) + return -EINVAL; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + break; + default: + return -EINVAL; + } + + length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + metadata = (void __user *)(uintptr_t)io.metadata; + + if (ns->ext) { + length += meta_len; + meta_len = 0; + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->head->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + return nvme_submit_user_cmd(ns->queue, &c, + (void __user *)(uintptr_t)io.addr, length, + metadata, meta_len, io.slba, NULL, 0); +} + +static u32 nvme_known_admin_effects(u8 opcode) +{ + switch (opcode) { + case nvme_admin_format_nvm: + return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK; + case nvme_admin_sanitize_nvm: + return NVME_CMD_EFFECTS_CSE_MASK; + default: + break; + } + return 0; +} + +static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode) +{ + u32 effects = 0; + + if (ns) { + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + if (effects & ~NVME_CMD_EFFECTS_CSUPP) + dev_warn(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); + return 0; + } + + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->acs[opcode]); + else + effects = nvme_known_admin_effects(opcode); + + /* + * For simplicity, IO to all namespaces is quiesced even if the command + * effects say only one namespace is affected. + */ + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + } + return effects; +} + +static void nvme_update_formats(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns, *next; + LIST_HEAD(rm_list); + + down_write(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->disk && nvme_revalidate_disk(ns->disk)) { + list_move_tail(&ns->list, &rm_list); + } + } + up_write(&ctrl->namespaces_rwsem); + + list_for_each_entry_safe(ns, next, &rm_list, list) + nvme_ns_remove(ns); +} + +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +{ + /* + * Revalidate LBA changes prior to unfreezing. This is necessary to + * prevent memory corruption if a logical block size was changed by + * this command. 
+ */ + if (effects & NVME_CMD_EFFECTS_LBCC) + nvme_update_formats(ctrl); + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) + nvme_unfreeze(ctrl); + if (effects & NVME_CMD_EFFECTS_CCC) + nvme_init_identify(ctrl); + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) + nvme_queue_scan(ctrl); +} + +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; + u32 effects; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + effects = nvme_passthru_start(ctrl, ns, cmd.opcode); + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, + (void __user *)(uintptr_t)cmd.metadata, cmd.metadata, + 0, &cmd.result, timeout); + nvme_passthru_end(ctrl, effects); + + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +/* + * Issue ioctl requests on the first available path. Note that unlike normal + * block layer requests we will not retry failed request on another controller. 
+ */ +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, + struct nvme_ns_head **head, int *srcu_idx) +{ +#ifdef CONFIG_NVME_MULTIPATH + if (disk->fops == &nvme_ns_head_ops) { + *head = disk->private_data; + *srcu_idx = srcu_read_lock(&(*head)->srcu); + return nvme_find_path(*head); + } +#endif + *head = NULL; + *srcu_idx = -1; + return disk->private_data; +} + +static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) +{ + if (head) + srcu_read_unlock(&head->srcu, idx); +} + +static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg) +{ + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->head->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); + default: +#ifdef CONFIG_NVM + if (ns->ndev) + return nvme_nvm_ioctl(ns, cmd, arg); +#endif + if (is_sed_ioctl(cmd)) + return sed_ioctl(ns->ctrl->opal_dev, cmd, + (void __user *) arg); + return -ENOTTY; + } +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + ret = -EWOULDBLOCK; + else + ret = nvme_ns_ioctl(ns, cmd, arg); + nvme_put_ns_from_disk(head, srcu_idx); + return ret; +} + +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + +#ifdef CONFIG_NVME_MULTIPATH + /* should never be called due to GENHD_FL_HIDDEN */ + if (WARN_ON_ONCE(ns->head->disk)) + goto fail; +#endif + if (!kref_get_unless_zero(&ns->kref)) + goto fail; + if (!try_module_get(ns->ctrl->ops->module)) + goto fail_put_ns; + + return 0; + +fail_put_ns: + nvme_put_ns(ns); +fail: + return -ENXIO; +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + struct nvme_ns *ns = disk->private_data; + + module_put(ns->ctrl->ops->module); + nvme_put_ns(ns); +} + +static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + return 0; +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) +{ + struct blk_integrity integrity; + + memset(&integrity, 0, sizeof(integrity)); + switch (pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity.profile = &t10_pi_type3_crc; + integrity.tag_size = sizeof(u16) + sizeof(u32); + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity.profile = &t10_pi_type1_crc; + integrity.tag_size = sizeof(u16); + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + break; + default: + integrity.profile = NULL; + break; + } + integrity.tuple_size = ms; + blk_integrity_register(disk, &integrity); + blk_queue_max_integrity_segments(disk->queue, 1); +} +#else +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) +{ +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + +static void nvme_set_chunk_size(struct nvme_ns *ns) +{ + u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9)); + blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); +} + +static void nvme_config_discard(struct nvme_ctrl 
*ctrl, + unsigned stream_alignment, struct request_queue *queue) +{ + u32 size = queue_logical_block_size(queue); + + if (stream_alignment) + size *= stream_alignment; + + BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < + NVME_DSM_MAX_RANGES); + + queue->limits.discard_alignment = 0; + queue->limits.discard_granularity = size; + + blk_queue_max_discard_sectors(queue, UINT_MAX); + blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, queue); + + if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) + blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); +} + +static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, + struct nvme_id_ns *id, struct nvme_ns_ids *ids) +{ + memset(ids, 0, sizeof(*ids)); + + if (ctrl->vs >= NVME_VS(1, 1, 0)) + memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); + if (ctrl->vs >= NVME_VS(1, 2, 0)) + memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); + if (ctrl->vs >= NVME_VS(1, 3, 0)) { + /* Don't treat error as fatal we potentially + * already have a NGUID or EUI-64 + */ + if (nvme_identify_ns_descs(ctrl, nsid, ids)) + dev_warn(ctrl->device, + "%s: Identify Descriptors failed\n", __func__); + } +} + +static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) +{ + return !uuid_is_null(&ids->uuid) || + memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || + memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); +} + +static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) +{ + return uuid_equal(&a->uuid, &b->uuid) && + memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; +} + +static void nvme_update_disk_info(struct gendisk *disk, + struct nvme_ns *ns, struct nvme_id_ns *id) +{ + sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9); + unsigned short bs = 1 << ns->lba_shift; + unsigned stream_alignment = 0; + + if (ns->ctrl->nr_streams && ns->sws && ns->sgs) + stream_alignment = ns->sws * ns->sgs; + + blk_mq_freeze_queue(disk->queue); + blk_integrity_unregister(disk); + + blk_queue_logical_block_size(disk->queue, bs); + blk_queue_physical_block_size(disk->queue, bs); + blk_queue_io_min(disk->queue, bs); + + if (ns->ms && !ns->ext && + (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) + nvme_init_integrity(disk, ns->ms, ns->pi_type); + if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) + capacity = 0; + set_capacity(disk, capacity); + + if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns->ctrl, stream_alignment, disk->queue); + blk_mq_unfreeze_queue(disk->queue); +} + +static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) +{ + struct nvme_ns *ns = disk->private_data; + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. 
+ */ + ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; + if (ns->lba_shift == 0) + ns->lba_shift = 9; + ns->noiob = le16_to_cpu(id->noiob); + ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + /* the PI implementation requires metadata equal t10 pi tuple size */ + if (ns->ms == sizeof(struct t10_pi_tuple)) + ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; + else + ns->pi_type = 0; + + if (ns->noiob) + nvme_set_chunk_size(ns); + nvme_update_disk_info(disk, ns, id); + if (ns->ndev) + nvme_nvm_update_nvm_info(ns); +#ifdef CONFIG_NVME_MULTIPATH + if (ns->head->disk) + nvme_update_disk_info(ns->head->disk, ns, id); +#endif +} + +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_id_ns *id; + struct nvme_ns_ids ids; + int ret = 0; + + if (test_bit(NVME_NS_DEAD, &ns->flags)) { + set_capacity(disk, 0); + return -ENODEV; + } + + id = nvme_identify_ns(ctrl, ns->head->ns_id); + if (!id) + return -ENODEV; + + if (id->ncap == 0) { + ret = -ENODEV; + goto out; + } + + __nvme_revalidate_disk(disk, id); + nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { + dev_err(ctrl->device, + "identifiers changed for nsid %d\n", ns->head->ns_id); + ret = -ENODEV; + } + +out: + kfree(id); + return ret; +} + +static char nvme_pr_type(enum pr_type type) +{ + switch (type) { + case PR_WRITE_EXCLUSIVE: + return 1; + case PR_EXCLUSIVE_ACCESS: + return 2; + case PR_WRITE_EXCLUSIVE_REG_ONLY: + return 3; + case PR_EXCLUSIVE_ACCESS_REG_ONLY: + return 4; + case PR_WRITE_EXCLUSIVE_ALL_REGS: + return 5; + case PR_EXCLUSIVE_ACCESS_ALL_REGS: + return 6; + default: + return 0; + } +}; + +static int nvme_pr_command(struct block_device *bdev, u32 cdw10, + u64 key, u64 sa_key, u8 op) +{ + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; + struct nvme_command c; + int srcu_idx, ret; + u8 data[16] = { 0, }; + + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + return -EWOULDBLOCK; + + put_unaligned_le64(key, &data[0]); + put_unaligned_le64(sa_key, &data[8]); + + memset(&c, 0, sizeof(c)); + c.common.opcode = op; + c.common.nsid = cpu_to_le32(ns->head->ns_id); + c.common.cdw10[0] = cpu_to_le32(cdw10); + + ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); + nvme_put_ns_from_disk(head, srcu_idx); + return ret; +} + +static int nvme_pr_register(struct block_device *bdev, u64 old, + u64 new, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = old ? 2 : 0; + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; + cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); +} + +static int nvme_pr_reserve(struct block_device *bdev, u64 key, + enum pr_type type, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = nvme_pr_type(type) << 8; + cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); +} + +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, + enum pr_type type, bool abort) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); +} + +static int nvme_pr_clear(struct block_device *bdev, u64 key) +{ + u32 cdw10 = 1 | (key ? 
1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); +} + +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0; + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); +} + +static const struct pr_ops nvme_pr_ops = { + .pr_register = nvme_pr_register, + .pr_reserve = nvme_pr_reserve, + .pr_release = nvme_pr_release, + .pr_preempt = nvme_pr_preempt, + .pr_clear = nvme_pr_clear, +}; + +#ifdef CONFIG_BLK_SED_OPAL +int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send) +{ + struct nvme_ctrl *ctrl = data; + struct nvme_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + if (send) + cmd.common.opcode = nvme_admin_security_send; + else + cmd.common.opcode = nvme_admin_security_recv; + cmd.common.nsid = 0; + cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); + cmd.common.cdw10[1] = cpu_to_le32(len); + + return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, + ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0); +} +EXPORT_SYMBOL_GPL(nvme_sec_submit); +#endif /* CONFIG_BLK_SED_OPAL */ + +static const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_ioctl, + .open = nvme_open, + .release = nvme_release, + .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, + .pr_ops = &nvme_pr_ops, +}; + +#ifdef CONFIG_NVME_MULTIPATH +static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +{ + struct nvme_ns_head *head = bdev->bd_disk->private_data; + + if (!kref_get_unless_zero(&head->ref)) + return -ENXIO; + return 0; +} + +static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns_head(disk->private_data); +} + +const struct block_device_operations nvme_ns_head_ops = { + .owner = THIS_MODULE, + .open = nvme_ns_head_open, + .release = nvme_ns_head_release, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_ioctl, + .getgeo = nvme_getgeo, + .pr_ops = &nvme_pr_ops, +}; +#endif /* CONFIG_NVME_MULTIPATH */ + +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +{ + unsigned long timeout = + ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; + int ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if (csts == ~0) + return -ENODEV; + if ((csts & NVME_CSTS_RDY) == bit) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->device, + "Device not ready; aborting %s\n", enabled ? + "initialisation" : "reset"); + return -ENODEV; + } + } + + return ret; +} + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at! 
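+ * After clearing CC.EN we simply poll CSTS.RDY in nvme_wait_ready()
+ * until the controller reports that it has become disabled.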
+ */ +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config &= ~NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) + msleep(NVME_QUIRK_DELAY_AMOUNT); + + return nvme_wait_ready(ctrl, cap, false); +} +EXPORT_SYMBOL_GPL(nvme_disable_ctrl); + +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + /* + * Default to a 4K page size, with the intention to update this + * path in the future to accomodate architectures with differing + * kernel and IO page sizes. + */ + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + int ret; + + if (page_shift < dev_page_min) { + dev_err(ctrl->device, + "Minimum device page size %u too large for host (%u)\n", + 1 << dev_page_min, 1 << page_shift); + return -ENODEV; + } + + ctrl->page_size = 1 << page_shift; + + ctrl->ctrl_config = NVME_CC_CSS_NVM; + ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + ctrl->ctrl_config |= NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, true); +} +EXPORT_SYMBOL_GPL(nvme_enable_ctrl); + +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) +{ + unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); + u32 csts; + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->device, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); + +static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, + struct request_queue *q) +{ + bool vwc = false; + + if (ctrl->max_hw_sectors) { + u32 max_segments = + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); + blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); + } + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && + is_power_of_2(ctrl->max_hw_sectors)) + blk_queue_chunk_sectors(q, ctrl->max_hw_sectors); + blk_queue_virt_boundary(q, ctrl->page_size - 1); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(q, vwc, vwc); +} + +static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) +{ + __le64 ts; + int ret; + + if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP)) + return 0; + + ts = cpu_to_le64(ktime_to_ms(ktime_get_real())); + ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts), + NULL); + if (ret) + dev_warn_once(ctrl->device, + "could not set timestamp (%d)\n", ret); + return ret; +} + +static int nvme_configure_apst(struct nvme_ctrl *ctrl) +{ + /* + * APST (Autonomous Power State Transition) lets us program a + * table of power state transitions that the controller will + * perform automatically. We configure it with a simple + * heuristic: we are willing to spend at most 2% of the time + * transitioning between power states. 
Therefore, when running + * in any given state, we will enter the next lower-power + * non-operational state after waiting 50 * (enlat + exlat) + * microseconds, as long as that state's exit latency is under + * the requested maximum latency. + * + * We will not autonomously enter any non-operational state for + * which the total latency exceeds ps_max_latency_us. Users + * can set ps_max_latency_us to zero to turn off APST. + */ + + unsigned apste; + struct nvme_feat_auto_pst *table; + u64 max_lat_us = 0; + int max_ps = -1; + int ret; + + /* + * If APST isn't supported or if we haven't been initialized yet, + * then don't do anything. + */ + if (!ctrl->apsta) + return 0; + + if (ctrl->npss > 31) { + dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); + return 0; + } + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return 0; + + if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { + /* Turn off APST. */ + apste = 0; + dev_dbg(ctrl->device, "APST disabled\n"); + } else { + __le64 target = cpu_to_le64(0); + int state; + + /* + * Walk through all states from lowest- to highest-power. + * According to the spec, lower-numbered states use more + * power. NPSS, despite the name, is the index of the + * lowest-power state, not the number of states. + */ + for (state = (int)ctrl->npss; state >= 0; state--) { + u64 total_latency_us, exit_latency_us, transition_ms; + + if (target) + table->entries[state] = target; + + /* + * Don't allow transitions to the deepest state + * if it's quirked off. + */ + if (state == ctrl->npss && + (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) + continue; + + /* + * Is this state a useful non-operational state for + * higher-power states to autonomously transition to? + */ + if (!(ctrl->psd[state].flags & + NVME_PS_FLAGS_NON_OP_STATE)) + continue; + + exit_latency_us = + (u64)le32_to_cpu(ctrl->psd[state].exit_lat); + if (exit_latency_us > ctrl->ps_max_latency_us) + continue; + + total_latency_us = + exit_latency_us + + le32_to_cpu(ctrl->psd[state].entry_lat); + + /* + * This state is good. Use it as the APST idle + * target for higher power states. + */ + transition_ms = total_latency_us + 19; + do_div(transition_ms, 20); + if (transition_ms > (1 << 24) - 1) + transition_ms = (1 << 24) - 1; + + target = cpu_to_le64((state << 3) | + (transition_ms << 8)); + + if (max_ps == -1) + max_ps = state; + + if (total_latency_us > max_lat_us) + max_lat_us = total_latency_us; + } + + apste = 1; + + if (max_ps == -1) { + dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); + } else { + dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", + max_ps, max_lat_us, (int)sizeof(*table), table); + } + } + + ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, + table, sizeof(*table), NULL); + if (ret) + dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); + + kfree(table); + return ret; +} + +static void nvme_set_latency_tolerance(struct device *dev, s32 val) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + u64 latency; + + switch (val) { + case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT: + case PM_QOS_LATENCY_ANY: + latency = U64_MAX; + break; + + default: + latency = val; + } + + if (ctrl->ps_max_latency_us != latency) { + ctrl->ps_max_latency_us = latency; + nvme_configure_apst(ctrl); + } +} + +struct nvme_core_quirk_entry { + /* + * NVMe model and firmware strings are padded with spaces. 
For + * simplicity, strings in the quirk table are padded with NULLs + * instead. + */ + u16 vid; + const char *mn; + const char *fr; + unsigned long quirks; +}; + +static const struct nvme_core_quirk_entry core_quirks[] = { + { + /* + * This Toshiba device seems to die using any APST states. See: + * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11 + */ + .vid = 0x1179, + .mn = "THNSF5256GPUK TOSHIBA", + .quirks = NVME_QUIRK_NO_APST, + } +}; + +/* match is null-terminated but idstr is space-padded. */ +static bool string_matches(const char *idstr, const char *match, size_t len) +{ + size_t matchlen; + + if (!match) + return true; + + matchlen = strlen(match); + WARN_ON_ONCE(matchlen > len); + + if (memcmp(idstr, match, matchlen)) + return false; + + for (; matchlen < len; matchlen++) + if (idstr[matchlen] != ' ') + return false; + + return true; +} + +static bool quirk_matches(const struct nvme_id_ctrl *id, + const struct nvme_core_quirk_entry *q) +{ + return q->vid == le16_to_cpu(id->vid) && + string_matches(id->mn, q->mn, sizeof(id->mn)) && + string_matches(id->fr, q->fr, sizeof(id->fr)); +} + +static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, + struct nvme_id_ctrl *id) +{ + size_t nqnlen; + int off; + + nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); + if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { + strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); + return; + } + + if (ctrl->vs >= NVME_VS(1, 2, 1)) + dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); + + /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ + off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, + "nqn.2014.08.org.nvmexpress:%4x%4x", + le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); + memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); + off += sizeof(id->sn); + memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); + off += sizeof(id->mn); + memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); +} + +static void __nvme_release_subsystem(struct nvme_subsystem *subsys) +{ + ida_simple_remove(&nvme_subsystems_ida, subsys->instance); + kfree(subsys); +} + +static void nvme_release_subsystem(struct device *dev) +{ + __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev)); +} + +static void nvme_destroy_subsystem(struct kref *ref) +{ + struct nvme_subsystem *subsys = + container_of(ref, struct nvme_subsystem, ref); + + mutex_lock(&nvme_subsystems_lock); + list_del(&subsys->entry); + mutex_unlock(&nvme_subsystems_lock); + + ida_destroy(&subsys->ns_ida); + device_del(&subsys->dev); + put_device(&subsys->dev); +} + +static void nvme_put_subsystem(struct nvme_subsystem *subsys) +{ + kref_put(&subsys->ref, nvme_destroy_subsystem); +} + +static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) +{ + struct nvme_subsystem *subsys; + + lockdep_assert_held(&nvme_subsystems_lock); + + list_for_each_entry(subsys, &nvme_subsystems, entry) { + if (strcmp(subsys->subnqn, subsysnqn)) + continue; + if (!kref_get_unless_zero(&subsys->ref)) + continue; + return subsys; + } + + return NULL; +} + +#define SUBSYS_ATTR_RO(_name, _mode, _show) \ + struct device_attribute subsys_attr_##_name = \ + __ATTR(_name, _mode, _show, NULL) + +static ssize_t nvme_subsys_show_nqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn); +} +static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, 
nvme_subsys_show_nqn); + +#define nvme_subsys_show_str_function(field) \ +static ssize_t subsys_##field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_subsystem *subsys = \ + container_of(dev, struct nvme_subsystem, dev); \ + return sprintf(buf, "%.*s\n", \ + (int)sizeof(subsys->field), subsys->field); \ +} \ +static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); + +nvme_subsys_show_str_function(model); +nvme_subsys_show_str_function(serial); +nvme_subsys_show_str_function(firmware_rev); + +static struct attribute *nvme_subsys_attrs[] = { + &subsys_attr_model.attr, + &subsys_attr_serial.attr, + &subsys_attr_firmware_rev.attr, + &subsys_attr_subsysnqn.attr, + NULL, +}; + +static struct attribute_group nvme_subsys_attrs_group = { + .attrs = nvme_subsys_attrs, +}; + +static const struct attribute_group *nvme_subsys_attrs_groups[] = { + &nvme_subsys_attrs_group, + NULL, +}; + +static int nvme_active_ctrls(struct nvme_subsystem *subsys) +{ + int count = 0; + struct nvme_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DEAD) + count++; + } + mutex_unlock(&subsys->lock); + + return count; +} + +static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + struct nvme_subsystem *subsys, *found; + int ret; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return -ENOMEM; + ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL); + if (ret < 0) { + kfree(subsys); + return ret; + } + subsys->instance = ret; + mutex_init(&subsys->lock); + kref_init(&subsys->ref); + INIT_LIST_HEAD(&subsys->ctrls); + INIT_LIST_HEAD(&subsys->nsheads); + nvme_init_subnqn(subsys, ctrl, id); + memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); + memcpy(subsys->model, id->mn, sizeof(subsys->model)); + memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); + subsys->vendor_id = le16_to_cpu(id->vid); + subsys->cmic = id->cmic; + + subsys->dev.class = nvme_subsys_class; + subsys->dev.release = nvme_release_subsystem; + subsys->dev.groups = nvme_subsys_attrs_groups; + dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); + device_initialize(&subsys->dev); + + mutex_lock(&nvme_subsystems_lock); + found = __nvme_find_get_subsystem(subsys->subnqn); + if (found) { + /* + * Verify that the subsystem actually supports multiple + * controllers, else bail out. 
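+ * (CMIC bit 1 advertises that the NVM subsystem may contain two or
+ * more controllers, which is what the (1 << 1) check below tests.)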
+ */ + if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { + dev_err(ctrl->device, + "ignoring ctrl due to duplicate subnqn (%s).\n", + found->subnqn); + nvme_put_subsystem(found); + ret = -EINVAL; + goto out_unlock; + } + + __nvme_release_subsystem(subsys); + subsys = found; + } else { + ret = device_add(&subsys->dev); + if (ret) { + dev_err(ctrl->device, + "failed to register subsystem device.\n"); + goto out_unlock; + } + ida_init(&subsys->ns_ida); + list_add_tail(&subsys->entry, &nvme_subsystems); + } + + ctrl->subsys = subsys; + mutex_unlock(&nvme_subsystems_lock); + + if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, + dev_name(ctrl->device))) { + dev_err(ctrl->device, + "failed to create sysfs link from subsystem.\n"); + /* the transport driver will eventually put the subsystem */ + return -EINVAL; + } + + mutex_lock(&subsys->lock); + list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + mutex_unlock(&subsys->lock); + + return 0; + +out_unlock: + mutex_unlock(&nvme_subsystems_lock); + put_device(&subsys->dev); + return ret; +} + +int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 log_page, void *log, + size_t size, u64 offset) +{ + struct nvme_command c = { }; + unsigned long dwlen = size / 4 - 1; + + c.get_log_page.opcode = nvme_admin_get_log_page; + + if (ns) + c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id); + else + c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL); + + c.get_log_page.lid = log_page; + c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); + c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); + c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); + c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); +} + +static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, + size_t size) +{ + return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0); +} + +static int nvme_get_effects_log(struct nvme_ctrl *ctrl) +{ + int ret; + + if (!ctrl->effects) + ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + + if (!ctrl->effects) + return 0; + + ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, + sizeof(*ctrl->effects)); + if (ret) { + kfree(ctrl->effects); + ctrl->effects = NULL; + } + return ret; +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure. This should be called as soon as + * the admin queue is fully up and running. + */ +int nvme_init_identify(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u64 cap; + int ret, page_shift; + u32 max_hw_sectors; + bool prev_apst_enabled; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); + return ret; + } + + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); + if (ret) { + dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); + return ret; + } + page_shift = NVME_CAP_MPSMIN(cap) + 12; + + if (ctrl->vs >= NVME_VS(1, 1, 0)) + ctrl->subsystem = NVME_CAP_NSSRC(cap); + + ret = nvme_identify_ctrl(ctrl, &id); + if (ret) { + dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); + return -EIO; + } + + if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { + ret = nvme_get_effects_log(ctrl); + if (ret < 0) + return ret; + } + + if (!ctrl->identified) { + int i; + + ret = nvme_init_subsystem(ctrl, id); + if (ret) + goto out_free; + + /* + * Check for quirks. 
Quirk can depend on firmware version, + * so, in principle, the set of quirks present can change + * across a reset. As a possible future enhancement, we + * could re-scan for quirks every time we reinitialize + * the device, but we'd have to make sure that the driver + * behaves intelligently if the quirks change. + */ + for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { + if (quirk_matches(id, &core_quirks[i])) + ctrl->quirks |= core_quirks[i].quirks; + } + } + + if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { + dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); + ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; + } + + ctrl->oacs = le16_to_cpu(id->oacs); + ctrl->oncs = le16_to_cpup(&id->oncs); + atomic_set(&ctrl->abort_limit, id->acl + 1); + ctrl->vwc = id->vwc; + ctrl->cntlid = le16_to_cpup(&id->cntlid); + if (id->mdts) + max_hw_sectors = 1 << (id->mdts + page_shift - 9); + else + max_hw_sectors = UINT_MAX; + ctrl->max_hw_sectors = + min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); + + nvme_set_queue_limits(ctrl, ctrl->admin_q); + ctrl->sgls = le32_to_cpu(id->sgls); + ctrl->kas = le16_to_cpu(id->kas); + + if (id->rtd3e) { + /* us -> s */ + u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; + + ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, + shutdown_timeout, 60); + + if (ctrl->shutdown_timeout != shutdown_timeout) + dev_info(ctrl->device, + "Shutdown timeout set to %u seconds\n", + ctrl->shutdown_timeout); + } else + ctrl->shutdown_timeout = shutdown_timeout; + + ctrl->npss = id->npss; + ctrl->apsta = id->apsta; + prev_apst_enabled = ctrl->apst_enabled; + if (ctrl->quirks & NVME_QUIRK_NO_APST) { + if (force_apst && id->apsta) { + dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); + ctrl->apst_enabled = true; + } else { + ctrl->apst_enabled = false; + } + } else { + ctrl->apst_enabled = id->apsta; + } + memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); + + if (ctrl->ops->flags & NVME_F_FABRICS) { + ctrl->icdoff = le16_to_cpu(id->icdoff); + ctrl->ioccsz = le32_to_cpu(id->ioccsz); + ctrl->iorcsz = le32_to_cpu(id->iorcsz); + ctrl->maxcmd = le16_to_cpu(id->maxcmd); + + /* + * In fabrics we need to verify the cntlid matches the + * admin connect + */ + if (ctrl->cntlid != le16_to_cpu(id->cntlid)) { + ret = -EINVAL; + goto out_free; + } + + if (!ctrl->opts->discovery_nqn && !ctrl->kas) { + dev_err(ctrl->device, + "keep-alive support is mandatory for fabrics\n"); + ret = -EINVAL; + goto out_free; + } + } else { + ctrl->cntlid = le16_to_cpu(id->cntlid); + ctrl->hmpre = le32_to_cpu(id->hmpre); + ctrl->hmmin = le32_to_cpu(id->hmmin); + ctrl->hmminds = le32_to_cpu(id->hmminds); + ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); + } + + kfree(id); + + if (ctrl->apst_enabled && !prev_apst_enabled) + dev_pm_qos_expose_latency_tolerance(ctrl->device); + else if (!ctrl->apst_enabled && prev_apst_enabled) + dev_pm_qos_hide_latency_tolerance(ctrl->device); + + ret = nvme_configure_apst(ctrl); + if (ret < 0) + return ret; + + ret = nvme_configure_timestamp(ctrl); + if (ret < 0) + return ret; + + ret = nvme_configure_directives(ctrl); + if (ret < 0) + return ret; + + ctrl->identified = true; + + return 0; + +out_free: + kfree(id); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_init_identify); + +static int nvme_dev_open(struct inode *inode, struct file *file) +{ + struct nvme_ctrl *ctrl = + container_of(inode->i_cdev, struct nvme_ctrl, cdev); + + switch (ctrl->state) { + case 
NVME_CTRL_LIVE: + case NVME_CTRL_ADMIN_ONLY: + break; + default: + return -EWOULDBLOCK; + } + + file->private_data = ctrl; + return 0; +} + +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +{ + struct nvme_ns *ns; + int ret; + + down_read(&ctrl->namespaces_rwsem); + if (list_empty(&ctrl->namespaces)) { + ret = -ENOTTY; + goto out_unlock; + } + + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { + dev_warn(ctrl->device, + "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); + ret = -EINVAL; + goto out_unlock; + } + + dev_warn(ctrl->device, + "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); + kref_get(&ns->kref); + up_read(&ctrl->namespaces_rwsem); + + ret = nvme_user_cmd(ctrl, ns, argp); + nvme_put_ns(ns); + return ret; + +out_unlock: + up_read(&ctrl->namespaces_rwsem); + return ret; +} + +static long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ctrl *ctrl = file->private_data; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: + dev_warn(ctrl->device, "resetting controller\n"); + return nvme_reset_ctrl_sync(ctrl); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_reset_subsystem(ctrl); + case NVME_IOCTL_RESCAN: + nvme_queue_scan(ctrl); + return 0; + default: + return -ENOTTY; + } +} + +static const struct file_operations nvme_dev_fops = { + .owner = THIS_MODULE, + .open = nvme_dev_open, + .unlocked_ioctl = nvme_dev_ioctl, + .compat_ioctl = nvme_dev_ioctl, +}; + +static ssize_t nvme_sysfs_reset(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + int ret; + + ret = nvme_reset_ctrl_sync(ctrl); + if (ret < 0) + return ret; + return count; +} +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + +static ssize_t nvme_sysfs_rescan(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + nvme_queue_scan(ctrl); + return count; +} +static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); + +static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (disk->fops == &nvme_fops) + return nvme_get_ns_from_dev(dev)->head; + else + return disk->private_data; +} + +static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns_head *head = dev_to_ns_head(dev); + struct nvme_ns_ids *ids = &head->ids; + struct nvme_subsystem *subsys = head->subsys; + int serial_len = sizeof(subsys->serial); + int model_len = sizeof(subsys->model); + + if (!uuid_is_null(&ids->uuid)) + return sprintf(buf, "uuid.%pU\n", &ids->uuid); + + if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return sprintf(buf, "eui.%16phN\n", ids->nguid); + + if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + return sprintf(buf, "eui.%8phN\n", ids->eui64); + + while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || + subsys->serial[serial_len - 1] == '\0')) + serial_len--; + while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || + subsys->model[model_len - 1] == '\0')) + model_len--; + + return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, + 
serial_len, subsys->serial, model_len, subsys->model, + head->ns_id); +} +static DEVICE_ATTR_RO(wwid); + +static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); +} +static DEVICE_ATTR_RO(nguid); + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; + + /* For backward compatibility expose the NGUID to userspace if + * we have no UUID set + */ + if (uuid_is_null(&ids->uuid)) { + printk_ratelimited(KERN_WARNING + "No UUID available providing old NGUID\n"); + return sprintf(buf, "%pU\n", ids->nguid); + } + return sprintf(buf, "%pU\n", &ids->uuid); +} +static DEVICE_ATTR_RO(uuid); + +static ssize_t eui_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); +} +static DEVICE_ATTR_RO(eui); + +static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); +} +static DEVICE_ATTR_RO(nsid); + +static struct attribute *nvme_ns_id_attrs[] = { + &dev_attr_wwid.attr, + &dev_attr_uuid.attr, + &dev_attr_nguid.attr, + &dev_attr_eui.attr, + &dev_attr_nsid.attr, + NULL, +}; + +static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; + + if (a == &dev_attr_uuid.attr) { + if (uuid_is_null(&ids->uuid) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return 0; + } + if (a == &dev_attr_nguid.attr) { + if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return 0; + } + if (a == &dev_attr_eui.attr) { + if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + return 0; + } + return a->mode; +} + +const struct attribute_group nvme_ns_id_attr_group = { + .attrs = nvme_ns_id_attrs, + .is_visible = nvme_ns_id_attrs_are_visible, +}; + +#define nvme_show_str_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sprintf(buf, "%.*s\n", \ + (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_str_function(model); +nvme_show_str_function(serial); +nvme_show_str_function(firmware_rev); + +#define nvme_show_int_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sprintf(buf, "%d\n", ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_int_function(cntlid); + +static ssize_t nvme_sysfs_delete(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (device_remove_file_self(dev, attr)) + nvme_delete_ctrl_sync(ctrl); + return count; +} +static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); + +static ssize_t nvme_sysfs_show_transport(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name); +} +static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); + +static ssize_t nvme_sysfs_show_state(struct device 
*dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + static const char *const state_name[] = { + [NVME_CTRL_NEW] = "new", + [NVME_CTRL_LIVE] = "live", + [NVME_CTRL_ADMIN_ONLY] = "only-admin", + [NVME_CTRL_RESETTING] = "resetting", + [NVME_CTRL_CONNECTING] = "connecting", + [NVME_CTRL_DELETING] = "deleting", + [NVME_CTRL_DEAD] = "dead", + }; + + if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && + state_name[ctrl->state]) + return sprintf(buf, "%s\n", state_name[ctrl->state]); + + return sprintf(buf, "unknown state\n"); +} + +static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); + +static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn); +} +static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); + +static ssize_t nvme_sysfs_show_address(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); +} +static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); + +static struct attribute *nvme_dev_attrs[] = { + &dev_attr_reset_controller.attr, + &dev_attr_rescan_controller.attr, + &dev_attr_model.attr, + &dev_attr_serial.attr, + &dev_attr_firmware_rev.attr, + &dev_attr_cntlid.attr, + &dev_attr_delete_controller.attr, + &dev_attr_transport.attr, + &dev_attr_subsysnqn.attr, + &dev_attr_address.attr, + &dev_attr_state.attr, + NULL +}; + +static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) + return 0; + if (a == &dev_attr_address.attr && !ctrl->ops->get_address) + return 0; + + return a->mode; +} + +static struct attribute_group nvme_dev_attrs_group = { + .attrs = nvme_dev_attrs, + .is_visible = nvme_dev_attrs_are_visible, +}; + +static const struct attribute_group *nvme_dev_attr_groups[] = { + &nvme_dev_attrs_group, + NULL, +}; + +static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys, + unsigned nsid) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(h, &subsys->nsheads, entry) { + if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) + return h; + } + + return NULL; +} + +static int __nvme_check_ids(struct nvme_subsystem *subsys, + struct nvme_ns_head *new) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(h, &subsys->nsheads, entry) { + if (nvme_ns_ids_valid(&new->ids) && + !list_empty(&h->list) && + nvme_ns_ids_equal(&new->ids, &h->ids)) + return -EINVAL; + } + + return 0; +} + +static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_id_ns *id) +{ + struct nvme_ns_head *head; + int ret = -ENOMEM; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto out; + ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto out_free_head; + head->instance = ret; + INIT_LIST_HEAD(&head->list); + ret = init_srcu_struct(&head->srcu); + if (ret) + goto out_ida_remove; + head->subsys = ctrl->subsys; + head->ns_id = nsid; + kref_init(&head->ref); + + nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + + ret = 
__nvme_check_ids(ctrl->subsys, head); + if (ret) { + dev_err(ctrl->device, + "duplicate IDs for nsid %d\n", nsid); + goto out_cleanup_srcu; + } + + ret = nvme_mpath_alloc_disk(ctrl, head); + if (ret) + goto out_cleanup_srcu; + + list_add_tail(&head->entry, &ctrl->subsys->nsheads); + + kref_get(&ctrl->subsys->ref); + + return head; +out_cleanup_srcu: + cleanup_srcu_struct(&head->srcu); +out_ida_remove: + ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); +out_free_head: + kfree(head); +out: + return ERR_PTR(ret); +} + +static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, + struct nvme_id_ns *id) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + bool is_shared = id->nmic & (1 << 0); + struct nvme_ns_head *head = NULL; + int ret = 0; + + mutex_lock(&ctrl->subsys->lock); + if (is_shared) + head = __nvme_find_ns_head(ctrl->subsys, nsid); + if (!head) { + head = nvme_alloc_ns_head(ctrl, nsid, id); + if (IS_ERR(head)) { + ret = PTR_ERR(head); + goto out_unlock; + } + } else { + struct nvme_ns_ids ids; + + nvme_report_ns_ids(ctrl, nsid, id, &ids); + if (!nvme_ns_ids_equal(&head->ids, &ids)) { + dev_err(ctrl->device, + "IDs don't match for shared namespace %d\n", + nsid); + ret = -EINVAL; + goto out_unlock; + } + } + + list_add_tail(&ns->siblings, &head->list); + ns->head = head; + +out_unlock: + mutex_unlock(&ctrl->subsys->lock); + return ret; +} + +static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); + struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + + return nsa->head->ns_id - nsb->head->ns_id; +} + +static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns, *ret = NULL; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->head->ns_id == nsid) { + if (!kref_get_unless_zero(&ns->kref)) + continue; + ret = ns; + break; + } + if (ns->head->ns_id > nsid) + break; + } + up_read(&ctrl->namespaces_rwsem); + return ret; +} + +static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) +{ + struct streams_directive_params s; + int ret; + + if (!ctrl->nr_streams) + return 0; + + ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); + if (ret) + return ret; + + ns->sws = le32_to_cpu(s.sws); + ns->sgs = le16_to_cpu(s.sgs); + + if (ns->sws) { + unsigned int bs = 1 << ns->lba_shift; + + blk_queue_io_min(ns->queue, bs * ns->sws); + if (ns->sgs) + blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs); + } + + return 0; +} + +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + struct gendisk *disk; + struct nvme_id_ns *id; + char disk_name[DISK_NAME_LEN]; + int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; + + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); + if (!ns) + return; + + ns->queue = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ns->queue)) + goto out_free_ns; + blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); + ns->queue->queuedata = ns; + ns->ctrl = ctrl; + + kref_init(&ns->kref); + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + nvme_set_queue_limits(ctrl, ns->queue); + + id = nvme_identify_ns(ctrl, nsid); + if (!id) + goto out_free_queue; + + if (id->ncap == 0) + goto out_free_id; + + if (nvme_init_ns_head(ns, nsid, id)) + goto out_free_id; + nvme_setup_streams_ns(ctrl, ns); + nvme_set_disk_name(disk_name, ns, ctrl, &flags); + + 
if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { + if (nvme_nvm_register(ns, disk_name, node)) { + dev_warn(ctrl->device, "LightNVM init failure\n"); + goto out_unlink_ns; + } + } + + disk = alloc_disk_node(0, node); + if (!disk) + goto out_unlink_ns; + + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->flags = flags; + memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); + ns->disk = disk; + + __nvme_revalidate_disk(disk, id); + + down_write(&ctrl->namespaces_rwsem); + list_add_tail(&ns->list, &ctrl->namespaces); + up_write(&ctrl->namespaces_rwsem); + + nvme_get_ctrl(ctrl); + + kfree(id); + + device_add_disk(ctrl->device, ns->disk); + if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_id_attr_group)) + pr_warn("%s: failed to create sysfs group for identification\n", + ns->disk->disk_name); + if (ns->ndev && nvme_nvm_register_sysfs(ns)) + pr_warn("%s: failed to register lightnvm sysfs group for identification\n", + ns->disk->disk_name); + + nvme_mpath_add_disk(ns->head); + nvme_fault_inject_init(ns); + return; + out_unlink_ns: + mutex_lock(&ctrl->subsys->lock); + list_del_rcu(&ns->siblings); + mutex_unlock(&ctrl->subsys->lock); + out_free_id: + kfree(id); + out_free_queue: + blk_cleanup_queue(ns->queue); + out_free_ns: + kfree(ns); +} + +static void nvme_ns_remove(struct nvme_ns *ns) +{ + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) + return; + + nvme_fault_inject_fini(ns); + if (ns->disk && ns->disk->flags & GENHD_FL_UP) { + sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_id_attr_group); + if (ns->ndev) + nvme_nvm_unregister_sysfs(ns); + del_gendisk(ns->disk); + blk_cleanup_queue(ns->queue); + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); + } + + mutex_lock(&ns->ctrl->subsys->lock); + nvme_mpath_clear_current_path(ns); + list_del_rcu(&ns->siblings); + mutex_unlock(&ns->ctrl->subsys->lock); + + down_write(&ns->ctrl->namespaces_rwsem); + list_del_init(&ns->list); + up_write(&ns->ctrl->namespaces_rwsem); + + synchronize_srcu(&ns->head->srcu); + nvme_mpath_check_last_path(ns); + nvme_put_ns(ns); +} + +static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + + ns = nvme_find_get_ns(ctrl, nsid); + if (ns) { + if (ns->disk && revalidate_disk(ns->disk)) + nvme_ns_remove(ns); + nvme_put_ns(ns); + } else + nvme_alloc_ns(ctrl, nsid); +} + +static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, + unsigned nsid) +{ + struct nvme_ns *ns, *next; + LIST_HEAD(rm_list); + + down_write(&ctrl->namespaces_rwsem); + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { + if (ns->head->ns_id > nsid) + list_move_tail(&ns->list, &rm_list); + } + up_write(&ctrl->namespaces_rwsem); + + list_for_each_entry_safe(ns, next, &rm_list, list) + nvme_ns_remove(ns); + +} + +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) +{ + struct nvme_ns *ns; + __le32 *ns_list; + unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); + int ret = 0; + + ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); + if (!ns_list) + return -ENOMEM; + + for (i = 0; i < num_lists; i++) { + ret = nvme_identify_ns_list(ctrl, prev, ns_list); + if (ret) + goto free; + + for (j = 0; j < min(nn, 1024U); j++) { + nsid = le32_to_cpu(ns_list[j]); + if (!nsid) + goto out; + + nvme_validate_ns(ctrl, nsid); + + while (++prev < nsid) { + ns = nvme_find_get_ns(ctrl, prev); + if (ns) { + nvme_ns_remove(ns); + nvme_put_ns(ns); + } + } + } + nn -= j; + } + out: + 
nvme_remove_invalid_namespaces(ctrl, prev); + free: + kfree(ns_list); + return ret; +} + +static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) +{ + unsigned i; + + for (i = 1; i <= nn; i++) + nvme_validate_ns(ctrl, i); + + nvme_remove_invalid_namespaces(ctrl, nn); +} + +static void nvme_scan_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, scan_work); + struct nvme_id_ctrl *id; + unsigned nn; + + if (ctrl->state != NVME_CTRL_LIVE) + return; + + WARN_ON_ONCE(!ctrl->tagset); + + if (nvme_identify_ctrl(ctrl, &id)) + return; + + nn = le32_to_cpu(id->nn); + if (ctrl->vs >= NVME_VS(1, 1, 0) && + !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + if (!nvme_scan_ns_list(ctrl, nn)) + goto done; + } + nvme_scan_ns_sequential(ctrl, nn); + done: + down_write(&ctrl->namespaces_rwsem); + list_sort(NULL, &ctrl->namespaces, ns_cmp); + up_write(&ctrl->namespaces_rwsem); + kfree(id); +} + +void nvme_queue_scan(struct nvme_ctrl *ctrl) +{ + /* + * Only new queue scan work when admin and IO queues are both alive + */ + if (ctrl->state == NVME_CTRL_LIVE) + queue_work(nvme_wq, &ctrl->scan_work); +} +EXPORT_SYMBOL_GPL(nvme_queue_scan); + +/* + * This function iterates the namespace list unlocked to allow recovery from + * controller failure. It is up to the caller to ensure the namespace list is + * not modified by scan work while this function is executing. + */ +void nvme_remove_namespaces(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns, *next; + LIST_HEAD(ns_list); + + /* + * The dead states indicates the controller was not gracefully + * disconnected. In that case, we won't be able to flush any data while + * removing the namespaces' disks; fail all the queues now to avoid + * potentially having to clean up the failed sync later. 
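+ * nvme_kill_queues() marks every namespace queue as dying and
+ * unquiesces it, so outstanding I/O fails instead of blocking removal.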
+ */ + if (ctrl->state == NVME_CTRL_DEAD) + nvme_kill_queues(ctrl); + + down_write(&ctrl->namespaces_rwsem); + list_splice_init(&ctrl->namespaces, &ns_list); + up_write(&ctrl->namespaces_rwsem); + + list_for_each_entry_safe(ns, next, &ns_list, list) + nvme_ns_remove(ns); +} +EXPORT_SYMBOL_GPL(nvme_remove_namespaces); + +static void nvme_aen_uevent(struct nvme_ctrl *ctrl) +{ + char *envp[2] = { NULL, NULL }; + u32 aen_result = ctrl->aen_result; + + ctrl->aen_result = 0; + if (!aen_result) + return; + + envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result); + if (!envp[0]) + return; + kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); + kfree(envp[0]); +} + +static void nvme_async_event_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, async_event_work); + + nvme_aen_uevent(ctrl); + ctrl->ops->submit_async_event(ctrl); +} + +static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) +{ + + u32 csts; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) + return false; + + if (csts == ~0) + return false; + + return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP)); +} + +static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) +{ + struct nvme_fw_slot_info_log *log; + + log = kmalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return; + + if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) + dev_warn(ctrl->device, + "Get FW SLOT INFO log error\n"); + kfree(log); +} + +static void nvme_fw_act_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(work, + struct nvme_ctrl, fw_act_work); + unsigned long fw_act_timeout; + + if (ctrl->mtfa) + fw_act_timeout = jiffies + + msecs_to_jiffies(ctrl->mtfa * 100); + else + fw_act_timeout = jiffies + + msecs_to_jiffies(admin_timeout * 1000); + + nvme_stop_queues(ctrl); + while (nvme_ctrl_pp_status(ctrl)) { + if (time_after(jiffies, fw_act_timeout)) { + dev_warn(ctrl->device, + "Fw activation timeout, reset controller\n"); + nvme_reset_ctrl(ctrl); + break; + } + msleep(100); + } + + if (ctrl->state != NVME_CTRL_LIVE) + return; + + nvme_start_queues(ctrl); + /* read FW slot information to clear the AER */ + nvme_get_fw_slot_info(ctrl); +} + +void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, + union nvme_result *res) +{ + u32 result = le32_to_cpu(res->u32); + + if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) + return; + + switch (result & 0x7) { + case NVME_AER_ERROR: + case NVME_AER_SMART: + case NVME_AER_CSS: + case NVME_AER_VS: + ctrl->aen_result = result; + break; + default: + break; + } + + switch (result & 0xff07) { + case NVME_AER_NOTICE_NS_CHANGED: + dev_info(ctrl->device, "rescanning\n"); + nvme_queue_scan(ctrl); + break; + case NVME_AER_NOTICE_FW_ACT_STARTING: + queue_work(nvme_wq, &ctrl->fw_act_work); + break; + default: + dev_warn(ctrl->device, "async event result %08x\n", result); + } + queue_work(nvme_wq, &ctrl->async_event_work); +} +EXPORT_SYMBOL_GPL(nvme_complete_async_event); + +void nvme_stop_ctrl(struct nvme_ctrl *ctrl) +{ + nvme_stop_keep_alive(ctrl); + flush_work(&ctrl->async_event_work); + flush_work(&ctrl->scan_work); + cancel_work_sync(&ctrl->fw_act_work); + if (ctrl->ops->stop_ctrl) + ctrl->ops->stop_ctrl(ctrl); +} +EXPORT_SYMBOL_GPL(nvme_stop_ctrl); + +void nvme_start_ctrl(struct nvme_ctrl *ctrl) +{ + if (ctrl->kato) + nvme_start_keep_alive(ctrl); + + if (ctrl->queue_count > 1) { + nvme_queue_scan(ctrl); + queue_work(nvme_wq, &ctrl->async_event_work); + nvme_start_queues(ctrl); + } +} 
+EXPORT_SYMBOL_GPL(nvme_start_ctrl); + +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) +{ + cdev_device_del(&ctrl->cdev, ctrl->device); +} +EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); + +static void nvme_free_ctrl(struct device *dev) +{ + struct nvme_ctrl *ctrl = + container_of(dev, struct nvme_ctrl, ctrl_device); + struct nvme_subsystem *subsys = ctrl->subsys; + + ida_simple_remove(&nvme_instance_ida, ctrl->instance); + kfree(ctrl->effects); + + if (subsys) { + mutex_lock(&subsys->lock); + list_del(&ctrl->subsys_entry); + mutex_unlock(&subsys->lock); + sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); + } + + ctrl->ops->free_ctrl(ctrl); + + if (subsys) + nvme_put_subsystem(subsys); +} + +/* + * Initialize a NVMe controller structures. This needs to be called during + * earliest initialization so that we have the initialized structured around + * during probing. + */ +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks) +{ + int ret; + + ctrl->state = NVME_CTRL_NEW; + spin_lock_init(&ctrl->lock); + INIT_LIST_HEAD(&ctrl->namespaces); + init_rwsem(&ctrl->namespaces_rwsem); + ctrl->dev = dev; + ctrl->ops = ops; + ctrl->quirks = quirks; + INIT_WORK(&ctrl->scan_work, nvme_scan_work); + INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); + INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); + INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); + + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); + if (ret < 0) + goto out; + ctrl->instance = ret; + + device_initialize(&ctrl->ctrl_device); + ctrl->device = &ctrl->ctrl_device; + ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); + ctrl->device->class = nvme_class; + ctrl->device->parent = ctrl->dev; + ctrl->device->groups = nvme_dev_attr_groups; + ctrl->device->release = nvme_free_ctrl; + dev_set_drvdata(ctrl->device, ctrl); + ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); + if (ret) + goto out_release_instance; + + cdev_init(&ctrl->cdev, &nvme_dev_fops); + ctrl->cdev.owner = ops->module; + ret = cdev_device_add(&ctrl->cdev, ctrl->device); + if (ret) + goto out_free_name; + + /* + * Initialize latency tolerance controls. The sysfs files won't + * be visible to userspace unless the device actually supports APST. + */ + ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance; + dev_pm_qos_update_user_latency_tolerance(ctrl->device, + min(default_ps_max_latency_us, (unsigned long)S32_MAX)); + + return 0; +out_free_name: + kfree_const(dev->kobj.name); +out_release_instance: + ida_simple_remove(&nvme_instance_ida, ctrl->instance); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nvme_init_ctrl); + +/** + * nvme_kill_queues(): Ends all namespace queues + * @ctrl: the dead controller that needs to end + * + * Call this function when the driver determines it is unable to get the + * controller in a state capable of servicing IO. + */ +void nvme_kill_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + + /* Forcibly unquiesce queues to avoid blocking dispatch */ + if (ctrl->admin_q) + blk_mq_unquiesce_queue(ctrl->admin_q); + + list_for_each_entry(ns, &ctrl->namespaces, list) { + /* + * Revalidating a dead namespace sets capacity to 0. This will + * end buffered writers dirtying pages that can't be synced. 
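+ * The queue is also marked dying and forcibly unquiesced below, so
+ * requests already queued complete with an error instead of hanging.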
+ */ + if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + continue; + revalidate_disk(ns->disk); + blk_set_queue_dying(ns->queue); + + /* Forcibly unquiesce queues to avoid blocking dispatch */ + blk_mq_unquiesce_queue(ns->queue); + } + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_kill_queues); + +void nvme_unfreeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_unfreeze_queue(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_unfreeze); + +void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); + if (timeout <= 0) + break; + } + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); + +void nvme_wait_freeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_freeze_queue_wait(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_wait_freeze); + +void nvme_start_freeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_freeze_queue_start(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_start_freeze); + +void nvme_stop_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_quiesce_queue(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_stop_queues); + +void nvme_start_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_unquiesce_queue(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_start_queues); + +int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set) +{ + if (!ctrl->ops->reinit_request) + return 0; + + return blk_mq_tagset_iter(set, set->driver_data, + ctrl->ops->reinit_request); +} +EXPORT_SYMBOL_GPL(nvme_reinit_tagset); + +int __init nvme_core_init(void) +{ + int result = -ENOMEM; + + nvme_wq = alloc_workqueue("nvme-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_wq) + goto out; + + nvme_reset_wq = alloc_workqueue("nvme-reset-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_reset_wq) + goto destroy_wq; + + nvme_delete_wq = alloc_workqueue("nvme-delete-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_delete_wq) + goto destroy_reset_wq; + + result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); + if (result < 0) + goto destroy_delete_wq; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (IS_ERR(nvme_class)) { + result = PTR_ERR(nvme_class); + goto unregister_chrdev; + } + + nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); + if (IS_ERR(nvme_subsys_class)) { + result = PTR_ERR(nvme_subsys_class); + goto destroy_class; + } + return 0; + +destroy_class: + class_destroy(nvme_class); +unregister_chrdev: + unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); +destroy_delete_wq: + destroy_workqueue(nvme_delete_wq); +destroy_reset_wq: + destroy_workqueue(nvme_reset_wq); +destroy_wq: + destroy_workqueue(nvme_wq); +out: + return result; +} + 
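+/*
+ * Module unload: release everything in the reverse order of
+ * nvme_core_init(), plus the subsystem instance IDA that gets
+ * populated while controllers are registered.
+ */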
+void nvme_core_exit(void) +{ + ida_destroy(&nvme_subsystems_ida); + class_destroy(nvme_subsys_class); + class_destroy(nvme_class); + unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); + destroy_workqueue(nvme_delete_wq); + destroy_workqueue(nvme_reset_wq); + destroy_workqueue(nvme_wq); +} + +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); +module_init(nvme_core_init); +module_exit(nvme_core_exit); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c new file mode 100644 index 0000000..7ae732a --- /dev/null +++ b/drivers/nvme/host/fabrics.c @@ -0,0 +1,1158 @@ +/* + * NVMe over Fabrics common host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include "nvme.h" +#include "fabrics.h" + +static LIST_HEAD(nvmf_transports); +static DECLARE_RWSEM(nvmf_transports_rwsem); + +static LIST_HEAD(nvmf_hosts); +static DEFINE_MUTEX(nvmf_hosts_mutex); + +static struct nvmf_host *nvmf_default_host; + +static struct nvmf_host *__nvmf_host_find(const char *hostnqn) +{ + struct nvmf_host *host; + + list_for_each_entry(host, &nvmf_hosts, list) { + if (!strcmp(host->nqn, hostnqn)) + return host; + } + + return NULL; +} + +static struct nvmf_host *nvmf_host_add(const char *hostnqn) +{ + struct nvmf_host *host; + + mutex_lock(&nvmf_hosts_mutex); + host = __nvmf_host_find(hostnqn); + if (host) { + kref_get(&host->ref); + goto out_unlock; + } + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + goto out_unlock; + + kref_init(&host->ref); + memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); + + list_add_tail(&host->list, &nvmf_hosts); +out_unlock: + mutex_unlock(&nvmf_hosts_mutex); + return host; +} + +static struct nvmf_host *nvmf_host_default(void) +{ + struct nvmf_host *host; + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return NULL; + + kref_init(&host->ref); + uuid_gen(&host->id); + snprintf(host->nqn, NVMF_NQN_SIZE, + "nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id); + + mutex_lock(&nvmf_hosts_mutex); + list_add_tail(&host->list, &nvmf_hosts); + mutex_unlock(&nvmf_hosts_mutex); + + return host; +} + +static void nvmf_host_destroy(struct kref *ref) +{ + struct nvmf_host *host = container_of(ref, struct nvmf_host, ref); + + mutex_lock(&nvmf_hosts_mutex); + list_del(&host->list); + mutex_unlock(&nvmf_hosts_mutex); + + kfree(host); +} + +static void nvmf_host_put(struct nvmf_host *host) +{ + if (host) + kref_put(&host->ref, nvmf_host_destroy); +} + +/** + * nvmf_get_address() - Get address/port + * @ctrl: Host NVMe controller instance which we got the address + * @buf: OUTPUT parameter that will contain the address/port + * @size: buffer size + */ +int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) +{ + int len = 0; + + if (ctrl->opts->mask & NVMF_OPT_TRADDR) + len += snprintf(buf, size, "traddr=%s", ctrl->opts->traddr); + if (ctrl->opts->mask & NVMF_OPT_TRSVCID) + len += snprintf(buf + len, size - len, "%strsvcid=%s", + (len) ? 
"," : "", ctrl->opts->trsvcid); + if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) + len += snprintf(buf + len, size - len, "%shost_traddr=%s", + (len) ? "," : "", ctrl->opts->host_traddr); + len += snprintf(buf + len, size - len, "\n"); + + return len; +} +EXPORT_SYMBOL_GPL(nvmf_get_address); + +/** + * nvmf_reg_read32() - NVMe Fabrics "Property Get" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated NVMe controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: OUTPUT parameter that will contain the value of + * the property after a successful read. + * + * Used by the host system to retrieve a 32-bit capsule property value + * from an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful read + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) +{ + struct nvme_command cmd; + union nvme_result res; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_get.opcode = nvme_fabrics_command; + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.offset = cpu_to_le32(off); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + + if (ret >= 0) + *val = le64_to_cpu(res.u64); + if (unlikely(ret != 0)) + dev_err(ctrl->device, + "Property Get error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_read32); + +/** + * nvmf_reg_read64() - NVMe Fabrics "Property Get" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: OUTPUT parameter that will contain the value of + * the property after a successful read. + * + * Used by the host system to retrieve a 64-bit capsule property value + * from an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful read + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + struct nvme_command cmd; + union nvme_result res; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_get.opcode = nvme_fabrics_command; + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.attrib = 1; + cmd.prop_get.offset = cpu_to_le32(off); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + + if (ret >= 0) + *val = le64_to_cpu(res.u64); + if (unlikely(ret != 0)) + dev_err(ctrl->device, + "Property Get error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_read64); + +/** + * nvmf_reg_write32() - NVMe Fabrics "Property Write" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated NVMe controller resource on the target system. 
+ * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: Input parameter that contains the value to be + * written to the property. + * + * Used by the NVMe host system to write a 32-bit capsule property value + * to an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful write + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) +{ + struct nvme_command cmd; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_set.opcode = nvme_fabrics_command; + cmd.prop_set.fctype = nvme_fabrics_type_property_set; + cmd.prop_set.attrib = 0; + cmd.prop_set.offset = cpu_to_le32(off); + cmd.prop_set.value = cpu_to_le64(val); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + if (unlikely(ret)) + dev_err(ctrl->device, + "Property Set error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_write32); + +/** + * nvmf_log_connect_error() - Error-parsing-diagnostic print + * out function for connect() errors. + * + * @ctrl: the specific /dev/nvmeX device that had the error. + * + * @errval: Error code to be decoded in a more human-friendly + * printout. + * + * @offset: For use with the NVMe error code NVME_SC_CONNECT_INVALID_PARAM. + * + * @cmd: This is the SQE portion of a submission capsule. + * + * @data: This is the "Data" portion of a submission capsule. + */ +static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, + int errval, int offset, struct nvme_command *cmd, + struct nvmf_connect_data *data) +{ + int err_sctype = errval & (~NVME_SC_DNR); + + switch (err_sctype) { + + case (NVME_SC_CONNECT_INVALID_PARAM): + if (offset >> 16) { + char *inv_data = "Connect Invalid Data Parameter"; + + switch (offset & 0xffff) { + case (offsetof(struct nvmf_connect_data, cntlid)): + dev_err(ctrl->device, + "%s, cntlid: %d\n", + inv_data, data->cntlid); + break; + case (offsetof(struct nvmf_connect_data, hostnqn)): + dev_err(ctrl->device, + "%s, hostnqn \"%s\"\n", + inv_data, data->hostnqn); + break; + case (offsetof(struct nvmf_connect_data, subsysnqn)): + dev_err(ctrl->device, + "%s, subsysnqn \"%s\"\n", + inv_data, data->subsysnqn); + break; + default: + dev_err(ctrl->device, + "%s, starting byte offset: %d\n", + inv_data, offset & 0xffff); + break; + } + } else { + char *inv_sqe = "Connect Invalid SQE Parameter"; + + switch (offset) { + case (offsetof(struct nvmf_connect_command, qid)): + dev_err(ctrl->device, + "%s, qid %d\n", + inv_sqe, cmd->connect.qid); + break; + default: + dev_err(ctrl->device, + "%s, starting byte offset: %d\n", + inv_sqe, offset); + } + } + break; + + case NVME_SC_CONNECT_INVALID_HOST: + dev_err(ctrl->device, + "Connect for subsystem %s is not allowed, hostnqn: %s\n", + data->subsysnqn, data->hostnqn); + break; + + case NVME_SC_CONNECT_CTRL_BUSY: + dev_err(ctrl->device, + "Connect command failed: controller is busy or not available\n"); + break; + + case NVME_SC_CONNECT_FORMAT: + dev_err(ctrl->device, + "Connect incompatible format: %d", + cmd->connect.recfmt); + break; + + default: + dev_err(ctrl->device, + "Connect command failed, error wo/DNR bit: %d\n", + err_sctype); + break; + } /* switch (err_sctype) */ +} + +/** + * nvmf_connect_admin_queue() - NVMe Fabrics Admin Queue "Connect" + * API function. 
+ * @ctrl: Host nvme controller instance used to request + * a new NVMe controller allocation on the target + * system and establish an NVMe Admin connection to + * that controller. + * + * This function enables an NVMe host device to request a new allocation of + * an NVMe controller resource on a target system as well establish a + * fabrics-protocol connection of the NVMe Admin queue between the + * host system device and the allocated NVMe controller on the + * target system via a NVMe Fabrics "Connect" command. + * + * Return: + * 0: success + * > 0: NVMe error status code + * < 0: Linux errno error code + * + */ +int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) +{ + struct nvme_command cmd; + union nvme_result res; + struct nvmf_connect_data *data; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.connect.opcode = nvme_fabrics_command; + cmd.connect.fctype = nvme_fabrics_type_connect; + cmd.connect.qid = 0; + cmd.connect.sqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); + + /* + * Set keep-alive timeout in seconds granularity (ms * 1000) + * and add a grace period for controller kato enforcement + */ + cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 : + cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + uuid_copy(&data->hostid, &ctrl->opts->host->id); + data->cntlid = cpu_to_le16(0xffff); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, + data, sizeof(*data), 0, NVME_QID_ANY, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), + &cmd, data); + goto out_free_data; + } + + ctrl->cntlid = le16_to_cpu(res.u16); + +out_free_data: + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue); + +/** + * nvmf_connect_io_queue() - NVMe Fabrics I/O Queue "Connect" + * API function. + * @ctrl: Host nvme controller instance used to establish an + * NVMe I/O queue connection to the already allocated NVMe + * controller on the target system. + * @qid: NVMe I/O queue number for the new I/O connection between + * host and target (note qid == 0 is illegal as this is + * the Admin queue, per NVMe standard). + * + * This function issues a fabrics-protocol connection + * of a NVMe I/O queue (via NVMe Fabrics "Connect" command) + * between the host system device and the allocated NVMe controller + * on the target system. 
+ * + * Return: + * 0: success + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) +{ + struct nvme_command cmd; + struct nvmf_connect_data *data; + union nvme_result res; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.connect.opcode = nvme_fabrics_command; + cmd.connect.fctype = nvme_fabrics_type_connect; + cmd.connect.qid = cpu_to_le16(qid); + cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + uuid_copy(&data->hostid, &ctrl->opts->host->id); + data->cntlid = cpu_to_le16(ctrl->cntlid); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res, + data, sizeof(*data), 0, qid, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), + &cmd, data); + } + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); + +bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) +{ + if (ctrl->opts->max_reconnects != -1 && + ctrl->nr_reconnects < ctrl->opts->max_reconnects) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(nvmf_should_reconnect); + +/** + * nvmf_register_transport() - NVMe Fabrics Library registration function. + * @ops: Transport ops instance to be registered to the + * common fabrics library. + * + * API function that registers the type of specific transport fabric + * being implemented to the common NVMe fabrics library. Part of + * the overall init sequence of starting up a fabrics driver. + */ +int nvmf_register_transport(struct nvmf_transport_ops *ops) +{ + if (!ops->create_ctrl) + return -EINVAL; + + down_write(&nvmf_transports_rwsem); + list_add_tail(&ops->entry, &nvmf_transports); + up_write(&nvmf_transports_rwsem); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmf_register_transport); + +/** + * nvmf_unregister_transport() - NVMe Fabrics Library unregistration function. + * @ops: Transport ops instance to be unregistered from the + * common fabrics library. + * + * Fabrics API function that unregisters the type of specific transport + * fabric being implemented from the common NVMe fabrics library. + * Part of the overall exit sequence of unloading the implemented driver. + */ +void nvmf_unregister_transport(struct nvmf_transport_ops *ops) +{ + down_write(&nvmf_transports_rwsem); + list_del(&ops->entry); + up_write(&nvmf_transports_rwsem); +} +EXPORT_SYMBOL_GPL(nvmf_unregister_transport); + +static struct nvmf_transport_ops *nvmf_lookup_transport( + struct nvmf_ctrl_options *opts) +{ + struct nvmf_transport_ops *ops; + + lockdep_assert_held(&nvmf_transports_rwsem); + + list_for_each_entry(ops, &nvmf_transports, entry) { + if (strcmp(ops->name, opts->transport) == 0) + return ops; + } + + return NULL; +} + +blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live, bool is_connected) +{ + struct nvme_command *cmd = nvme_req(rq)->cmd; + + if (likely(ctrl->state == NVME_CTRL_LIVE && is_connected)) + return BLK_STS_OK; + + switch (ctrl->state) { + case NVME_CTRL_DELETING: + goto reject_io; + + case NVME_CTRL_NEW: + case NVME_CTRL_CONNECTING: + if (!is_connected) + /* + * This is the case of starting a new + * association but connectivity was lost + * before it was fully created. 
We need to + * error the commands used to initialize the + * controller so the reconnect can go into a + * retry attempt. The commands should all be + * marked REQ_FAILFAST_DRIVER, which will hit + * the reject path below. Anything else will + * be queued while the state settles. + */ + goto reject_or_queue_io; + + if ((queue_live && + !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) || + (!queue_live && blk_rq_is_passthrough(rq) && + cmd->common.opcode == nvme_fabrics_command && + cmd->fabrics.fctype == nvme_fabrics_type_connect)) + /* + * If queue is live, allow only commands that + * are internally generated pass through. These + * are commands on the admin queue to initialize + * the controller. This will reject any ioctl + * admin cmds received while initializing. + * + * If the queue is not live, allow only a + * connect command. This will reject any ioctl + * admin cmd as well as initialization commands + * if the controller reverted the queue to non-live. + */ + return BLK_STS_OK; + + /* + * fall-thru to the reject_or_queue_io clause + */ + break; + + /* these cases fall-thru + * case NVME_CTRL_LIVE: + * case NVME_CTRL_RESETTING: + */ + default: + break; + } + +reject_or_queue_io: + /* + * Any other new io is something we're not in a state to send + * to the device. Default action is to busy it and retry it + * after the controller state is recovered. However, anything + * marked for failfast or nvme multipath is immediately failed. + * Note: commands used to initialize the controller will be + * marked for failfast. + * Note: nvme cli/ioctl commands are marked for failfast. + */ + if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + return BLK_STS_RESOURCE; + +reject_io: + nvme_req(rq)->status = NVME_SC_ABORT_REQ; + return BLK_STS_IOERR; +} +EXPORT_SYMBOL_GPL(nvmf_check_if_ready); + +static const match_table_t opt_tokens = { + { NVMF_OPT_TRANSPORT, "transport=%s" }, + { NVMF_OPT_TRADDR, "traddr=%s" }, + { NVMF_OPT_TRSVCID, "trsvcid=%s" }, + { NVMF_OPT_NQN, "nqn=%s" }, + { NVMF_OPT_QUEUE_SIZE, "queue_size=%d" }, + { NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" }, + { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" }, + { NVMF_OPT_CTRL_LOSS_TMO, "ctrl_loss_tmo=%d" }, + { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, + { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, + { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, + { NVMF_OPT_HOST_ID, "hostid=%s" }, + { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, + { NVMF_OPT_ERR, NULL } +}; + +static int nvmf_parse_options(struct nvmf_ctrl_options *opts, + const char *buf) +{ + substring_t args[MAX_OPT_ARGS]; + char *options, *o, *p; + int token, ret = 0; + size_t nqnlen = 0; + int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO; + uuid_t hostid; + + /* Set defaults */ + opts->queue_size = NVMF_DEF_QUEUE_SIZE; + opts->nr_io_queues = num_online_cpus(); + opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; + opts->kato = NVME_DEFAULT_KATO; + opts->duplicate_connect = false; + + options = o = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + uuid_gen(&hostid); + + while ((p = strsep(&o, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, opt_tokens, args); + opts->mask |= token; + switch (token) { + case NVMF_OPT_TRANSPORT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->transport); + opts->transport = p; + break; + case NVMF_OPT_NQN: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->subsysnqn); + opts->subsysnqn = p; + nqnlen = strlen(opts->subsysnqn); + if (nqnlen >= 
NVMF_NQN_SIZE) { + pr_err("%s needs to be < %d bytes\n", + opts->subsysnqn, NVMF_NQN_SIZE); + ret = -EINVAL; + goto out; + } + opts->discovery_nqn = + !(strcmp(opts->subsysnqn, + NVME_DISC_SUBSYS_NAME)); + if (opts->discovery_nqn) { + opts->kato = 0; + opts->nr_io_queues = 0; + } + break; + case NVMF_OPT_TRADDR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->traddr); + opts->traddr = p; + break; + case NVMF_OPT_TRSVCID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->trsvcid); + opts->trsvcid = p; + break; + case NVMF_OPT_QUEUE_SIZE: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token < NVMF_MIN_QUEUE_SIZE || + token > NVMF_MAX_QUEUE_SIZE) { + pr_err("Invalid queue_size %d\n", token); + ret = -EINVAL; + goto out; + } + opts->queue_size = token; + break; + case NVMF_OPT_NR_IO_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid number of IOQs %d\n", token); + ret = -EINVAL; + goto out; + } + if (opts->discovery_nqn) { + pr_debug("Ignoring nr_io_queues value for discovery controller\n"); + break; + } + + opts->nr_io_queues = min_t(unsigned int, + num_online_cpus(), token); + break; + case NVMF_OPT_KATO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (token < 0) { + pr_err("Invalid keep_alive_tmo %d\n", token); + ret = -EINVAL; + goto out; + } else if (token == 0 && !opts->discovery_nqn) { + /* Allowed for debug */ + pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); + } + opts->kato = token; + + if (opts->discovery_nqn && opts->kato) { + pr_err("Discovery controllers cannot accept KATO != 0\n"); + ret = -EINVAL; + goto out; + } + + break; + case NVMF_OPT_CTRL_LOSS_TMO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (token < 0) + pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n"); + ctrl_loss_tmo = token; + break; + case NVMF_OPT_HOSTNQN: + if (opts->host) { + pr_err("hostnqn already user-assigned: %s\n", + opts->host->nqn); + ret = -EADDRINUSE; + goto out; + } + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + nqnlen = strlen(p); + if (nqnlen >= NVMF_NQN_SIZE) { + pr_err("%s needs to be < %d bytes\n", + p, NVMF_NQN_SIZE); + kfree(p); + ret = -EINVAL; + goto out; + } + nvmf_host_put(opts->host); + opts->host = nvmf_host_add(p); + kfree(p); + if (!opts->host) { + ret = -ENOMEM; + goto out; + } + break; + case NVMF_OPT_RECONNECT_DELAY: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid reconnect_delay %d\n", token); + ret = -EINVAL; + goto out; + } + opts->reconnect_delay = token; + break; + case NVMF_OPT_HOST_TRADDR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->host_traddr); + opts->host_traddr = p; + break; + case NVMF_OPT_HOST_ID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = uuid_parse(p, &hostid); + if (ret) { + pr_err("Invalid hostid %s\n", p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + case NVMF_OPT_DUP_CONNECT: + opts->duplicate_connect = true; + break; + default: + pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", + p); + ret = -EINVAL; + goto out; + } + } + + if (ctrl_loss_tmo < 0) + opts->max_reconnects = -1; + else + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + + if (!opts->host) { + kref_get(&nvmf_default_host->ref); + opts->host = 
nvmf_default_host; + } + + uuid_copy(&opts->host->id, &hostid); + +out: + kfree(options); + return ret; +} + +static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, + unsigned int required_opts) +{ + if ((opts->mask & required_opts) != required_opts) { + int i; + + for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { + if ((opt_tokens[i].token & required_opts) && + !(opt_tokens[i].token & opts->mask)) { + pr_warn("missing parameter '%s'\n", + opt_tokens[i].pattern); + } + } + + return -EINVAL; + } + + return 0; +} + +static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts, + unsigned int allowed_opts) +{ + if (opts->mask & ~allowed_opts) { + int i; + + for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { + if ((opt_tokens[i].token & opts->mask) && + (opt_tokens[i].token & ~allowed_opts)) { + pr_warn("invalid parameter '%s'\n", + opt_tokens[i].pattern); + } + } + + return -EINVAL; + } + + return 0; +} + +void nvmf_free_options(struct nvmf_ctrl_options *opts) +{ + nvmf_host_put(opts->host); + kfree(opts->transport); + kfree(opts->traddr); + kfree(opts->trsvcid); + kfree(opts->subsysnqn); + kfree(opts->host_traddr); + kfree(opts); +} +EXPORT_SYMBOL_GPL(nvmf_free_options); + +#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) +#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ + NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ + NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT) + +static struct nvme_ctrl * +nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) +{ + struct nvmf_ctrl_options *opts; + struct nvmf_transport_ops *ops; + struct nvme_ctrl *ctrl; + int ret; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return ERR_PTR(-ENOMEM); + + ret = nvmf_parse_options(opts, buf); + if (ret) + goto out_free_opts; + + + request_module("nvme-%s", opts->transport); + + /* + * Check the generic options first as we need a valid transport for + * the lookup below. Then clear the generic flags so that transport + * drivers don't have to care about them. 
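+ *
+ * For reference, the option string parsed above is the comma separated
+ * list written to /dev/nvme-fabrics; an illustrative example (addresses
+ * and NQNs are placeholders) is:
+ *
+ *	transport=rdma,traddr=192.168.1.10,trsvcid=4420,
+ *	nqn=nqn.2016-06.io.example:subsys1,hostnqn=nqn.2016-06.io.example:host1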
+ */ + ret = nvmf_check_required_opts(opts, NVMF_REQUIRED_OPTS); + if (ret) + goto out_free_opts; + opts->mask &= ~NVMF_REQUIRED_OPTS; + + down_read(&nvmf_transports_rwsem); + ops = nvmf_lookup_transport(opts); + if (!ops) { + pr_info("no handler found for transport %s.\n", + opts->transport); + ret = -EINVAL; + goto out_unlock; + } + + if (!try_module_get(ops->module)) { + ret = -EBUSY; + goto out_unlock; + } + + ret = nvmf_check_required_opts(opts, ops->required_opts); + if (ret) + goto out_module_put; + ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | + ops->allowed_opts | ops->required_opts); + if (ret) + goto out_module_put; + + ctrl = ops->create_ctrl(dev, opts); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + goto out_module_put; + } + + if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { + dev_warn(ctrl->device, + "controller returned incorrect NQN: \"%s\".\n", + ctrl->subsys->subnqn); + module_put(ops->module); + up_read(&nvmf_transports_rwsem); + nvme_delete_ctrl_sync(ctrl); + return ERR_PTR(-EINVAL); + } + + module_put(ops->module); + up_read(&nvmf_transports_rwsem); + return ctrl; + +out_module_put: + module_put(ops->module); +out_unlock: + up_read(&nvmf_transports_rwsem); +out_free_opts: + nvmf_free_options(opts); + return ERR_PTR(ret); +} + +static struct class *nvmf_class; +static struct device *nvmf_device; +static DEFINE_MUTEX(nvmf_dev_mutex); + +static ssize_t nvmf_dev_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *pos) +{ + struct seq_file *seq_file = file->private_data; + struct nvme_ctrl *ctrl; + const char *buf; + int ret = 0; + + if (count > PAGE_SIZE) + return -ENOMEM; + + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + mutex_lock(&nvmf_dev_mutex); + if (seq_file->private) { + ret = -EINVAL; + goto out_unlock; + } + + ctrl = nvmf_create_ctrl(nvmf_device, buf, count); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + goto out_unlock; + } + + seq_file->private = ctrl; + +out_unlock: + mutex_unlock(&nvmf_dev_mutex); + kfree(buf); + return ret ? ret : count; +} + +static int nvmf_dev_show(struct seq_file *seq_file, void *private) +{ + struct nvme_ctrl *ctrl; + int ret = 0; + + mutex_lock(&nvmf_dev_mutex); + ctrl = seq_file->private; + if (!ctrl) { + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(seq_file, "instance=%d,cntlid=%d\n", + ctrl->instance, ctrl->cntlid); + +out_unlock: + mutex_unlock(&nvmf_dev_mutex); + return ret; +} + +static int nvmf_dev_open(struct inode *inode, struct file *file) +{ + /* + * The miscdevice code initializes file->private_data, but doesn't + * make use of it later. 
+ */ + file->private_data = NULL; + return single_open(file, nvmf_dev_show, NULL); +} + +static int nvmf_dev_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq_file = file->private_data; + struct nvme_ctrl *ctrl = seq_file->private; + + if (ctrl) + nvme_put_ctrl(ctrl); + return single_release(inode, file); +} + +static const struct file_operations nvmf_dev_fops = { + .owner = THIS_MODULE, + .write = nvmf_dev_write, + .read = seq_read, + .open = nvmf_dev_open, + .release = nvmf_dev_release, +}; + +static struct miscdevice nvmf_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "nvme-fabrics", + .fops = &nvmf_dev_fops, +}; + +static int __init nvmf_init(void) +{ + int ret; + + nvmf_default_host = nvmf_host_default(); + if (!nvmf_default_host) + return -ENOMEM; + + nvmf_class = class_create(THIS_MODULE, "nvme-fabrics"); + if (IS_ERR(nvmf_class)) { + pr_err("couldn't register class nvme-fabrics\n"); + ret = PTR_ERR(nvmf_class); + goto out_free_host; + } + + nvmf_device = + device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); + if (IS_ERR(nvmf_device)) { + pr_err("couldn't create nvme-fabris device!\n"); + ret = PTR_ERR(nvmf_device); + goto out_destroy_class; + } + + ret = misc_register(&nvmf_misc); + if (ret) { + pr_err("couldn't register misc device: %d\n", ret); + goto out_destroy_device; + } + + return 0; + +out_destroy_device: + device_destroy(nvmf_class, MKDEV(0, 0)); +out_destroy_class: + class_destroy(nvmf_class); +out_free_host: + nvmf_host_put(nvmf_default_host); + return ret; +} + +static void __exit nvmf_exit(void) +{ + misc_deregister(&nvmf_misc); + device_destroy(nvmf_class, MKDEV(0, 0)); + class_destroy(nvmf_class); + nvmf_host_put(nvmf_default_host); + + BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024); +} + +MODULE_LICENSE("GPL v2"); + +module_init(nvmf_init); +module_exit(nvmf_exit); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h new file mode 100644 index 0000000..ef46c91 --- /dev/null +++ b/drivers/nvme/host/fabrics.h @@ -0,0 +1,163 @@ +/* + * NVMe over Fabrics common host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#ifndef _NVME_FABRICS_H +#define _NVME_FABRICS_H 1 + +#include +#include + +#define NVMF_MIN_QUEUE_SIZE 16 +#define NVMF_MAX_QUEUE_SIZE 1024 +#define NVMF_DEF_QUEUE_SIZE 128 +#define NVMF_DEF_RECONNECT_DELAY 10 +/* default to 600 seconds of reconnect attempts before giving up */ +#define NVMF_DEF_CTRL_LOSS_TMO 600 + +/* + * Define a host as seen by the target. We allocate one at boot, but also + * allow the override it when creating controllers. This is both to provide + * persistence of the Host NQN over multiple boots, and to allow using + * multiple ones, for example in a container scenario. 
Because we must not + * use different Host NQNs with the same Host ID we generate a Host ID and + * use this structure to keep track of the relation between the two. + */ +struct nvmf_host { + struct kref ref; + struct list_head list; + char nqn[NVMF_NQN_SIZE]; + uuid_t id; +}; + +/** + * enum nvmf_parsing_opts - used to define the sysfs parsing options used. + */ +enum { + NVMF_OPT_ERR = 0, + NVMF_OPT_TRANSPORT = 1 << 0, + NVMF_OPT_NQN = 1 << 1, + NVMF_OPT_TRADDR = 1 << 2, + NVMF_OPT_TRSVCID = 1 << 3, + NVMF_OPT_QUEUE_SIZE = 1 << 4, + NVMF_OPT_NR_IO_QUEUES = 1 << 5, + NVMF_OPT_TL_RETRY_COUNT = 1 << 6, + NVMF_OPT_KATO = 1 << 7, + NVMF_OPT_HOSTNQN = 1 << 8, + NVMF_OPT_RECONNECT_DELAY = 1 << 9, + NVMF_OPT_HOST_TRADDR = 1 << 10, + NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, + NVMF_OPT_HOST_ID = 1 << 12, + NVMF_OPT_DUP_CONNECT = 1 << 13, +}; + +/** + * struct nvmf_ctrl_options - Used to hold the options specified + * with the parsing opts enum. + * @mask: Used by the fabrics library to parse through sysfs options + * on adding a NVMe controller. + * @transport: Holds the fabric transport "technology name" (for a lack of + * better description) that will be used by an NVMe controller + * being added. + * @subsysnqn: Hold the fully qualified NQN subystem name (format defined + * in the NVMe specification, "NVMe Qualified Names"). + * @traddr: The transport-specific TRADDR field for a port on the + * subsystem which is adding a controller. + * @trsvcid: The transport-specific TRSVCID field for a port on the + * subsystem which is adding a controller. + * @host_traddr: A transport-specific field identifying the NVME host port + * to use for the connection to the controller. + * @queue_size: Number of IO queue elements. + * @nr_io_queues: Number of controller IO queues that will be established. + * @reconnect_delay: Time between two consecutive reconnect attempts. + * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. + * @kato: Keep-alive timeout. + * @host: Virtual NVMe host, contains the NQN and Host ID. + * @max_reconnects: maximum number of allowed reconnect attempts before removing + * the controller, (-1) means reconnect forever, zero means remove + * immediately; + */ +struct nvmf_ctrl_options { + unsigned mask; + char *transport; + char *subsysnqn; + char *traddr; + char *trsvcid; + char *host_traddr; + size_t queue_size; + unsigned int nr_io_queues; + unsigned int reconnect_delay; + bool discovery_nqn; + bool duplicate_connect; + unsigned int kato; + struct nvmf_host *host; + int max_reconnects; +}; + +/* + * struct nvmf_transport_ops - used to register a specific + * fabric implementation of NVMe fabrics. + * @entry: Used by the fabrics library to add the new + * registration entry to its linked-list internal tree. + * @module: Transport module reference + * @name: Name of the NVMe fabric driver implementation. + * @required_opts: sysfs command-line options that must be specified + * when adding a new NVMe controller. + * @allowed_opts: sysfs command-line options that can be specified + * when adding a new NVMe controller. + * @create_ctrl(): function pointer that points to a non-NVMe + * implementation-specific fabric technology + * that would go into starting up that fabric + * for the purpose of conneciton to an NVMe controller + * using that fabric technology. + * + * Notes: + * 1. At minimum, 'required_opts' and 'allowed_opts' should + * be set to the same enum parsing options defined earlier. + * 2. 
create_ctrl() must be defined (even if it does nothing) + */ +struct nvmf_transport_ops { + struct list_head entry; + struct module *module; + const char *name; + int required_opts; + int allowed_opts; + struct nvme_ctrl *(*create_ctrl)(struct device *dev, + struct nvmf_ctrl_options *opts); +}; + +static inline bool +nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + if (strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || + strcmp(opts->host->nqn, ctrl->opts->host->nqn) || + memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t))) + return false; + + return true; +} + +int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); +int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); +int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); +int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); +int nvmf_register_transport(struct nvmf_transport_ops *ops); +void nvmf_unregister_transport(struct nvmf_transport_ops *ops); +void nvmf_free_options(struct nvmf_ctrl_options *opts); +int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); +bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); +blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, + struct request *rq, bool queue_live, bool is_connected); + +#endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c new file mode 100644 index 0000000..0263226 --- /dev/null +++ b/drivers/nvme/host/fault_inject.c @@ -0,0 +1,79 @@ +/* + * fault injection support for nvme. + * + * Copyright (c) 2018, Oracle and/or its affiliates + * + */ + +#include +#include "nvme.h" + +static DECLARE_FAULT_ATTR(fail_default_attr); +/* optional fault injection attributes boot time option: + * nvme_core.fail_request=,,, + */ +static char *fail_request; +module_param(fail_request, charp, 0000); + +void nvme_fault_inject_init(struct nvme_ns *ns) +{ + struct dentry *dir, *parent; + char *name = ns->disk->disk_name; + struct nvme_fault_inject *fault_inj = &ns->fault_inject; + struct fault_attr *attr = &fault_inj->attr; + + /* set default fault injection attribute */ + if (fail_request) + setup_fault_attr(&fail_default_attr, fail_request); + + /* create debugfs directory and attribute */ + parent = debugfs_create_dir(name, NULL); + if (!parent) { + pr_warn("%s: failed to create debugfs directory\n", name); + return; + } + + *attr = fail_default_attr; + dir = fault_create_debugfs_attr("fault_inject", parent, attr); + if (IS_ERR(dir)) { + pr_warn("%s: failed to create debugfs attr\n", name); + debugfs_remove_recursive(parent); + return; + } + ns->fault_inject.parent = parent; + + /* create debugfs for status code and dont_retry */ + fault_inj->status = NVME_SC_INVALID_OPCODE; + fault_inj->dont_retry = true; + debugfs_create_x16("status", 0600, dir, &fault_inj->status); + debugfs_create_bool("dont_retry", 0600, dir, &fault_inj->dont_retry); +} + +void nvme_fault_inject_fini(struct nvme_ns *ns) +{ + /* remove debugfs directories */ + debugfs_remove_recursive(ns->fault_inject.parent); +} + +void nvme_should_fail(struct request *req) +{ + struct gendisk *disk = req->rq_disk; + struct nvme_ns *ns = NULL; + u16 status; + + /* + * make sure this request is coming from a valid namespace + */ + if (!disk) + return; + + ns = disk->private_data; + if (ns && should_fail(&ns->fault_inject.attr, 1)) { + /* inject status code and DNR bit */ + status = ns->fault_inject.status; + if 
(ns->fault_inject.dont_retry) + status |= NVME_SC_DNR; + nvme_req(req)->status = status; + } +} +EXPORT_SYMBOL_GPL(nvme_should_fail); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c new file mode 100644 index 0000000..6cb26bc --- /dev/null +++ b/drivers/nvme/host/fc.c @@ -0,0 +1,3365 @@ +/* + * Copyright (c) 2016 Avago Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful. + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, + * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO + * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID. + * See the GNU General Public License for more details, a copy of which + * can be found in the file COPYING included with this package + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include + +#include "nvme.h" +#include "fabrics.h" +#include +#include + + +/* *************************** Data Structures/Defines ****************** */ + + +enum nvme_fc_queue_flags { + NVME_FC_Q_CONNECTED = 0, + NVME_FC_Q_LIVE, +}; + +#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */ + +struct nvme_fc_queue { + struct nvme_fc_ctrl *ctrl; + struct device *dev; + struct blk_mq_hw_ctx *hctx; + void *lldd_handle; + size_t cmnd_capsule_len; + u32 qnum; + u32 rqcnt; + u32 seqno; + + u64 connection_id; + atomic_t csn; + + unsigned long flags; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + +enum nvme_fcop_flags { + FCOP_FLAGS_TERMIO = (1 << 0), + FCOP_FLAGS_AEN = (1 << 1), +}; + +struct nvmefc_ls_req_op { + struct nvmefc_ls_req ls_req; + + struct nvme_fc_rport *rport; + struct nvme_fc_queue *queue; + struct request *rq; + u32 flags; + + int ls_error; + struct completion ls_done; + struct list_head lsreq_list; /* rport->ls_req_list */ + bool req_queued; +}; + +enum nvme_fcpop_state { + FCPOP_STATE_UNINIT = 0, + FCPOP_STATE_IDLE = 1, + FCPOP_STATE_ACTIVE = 2, + FCPOP_STATE_ABORTED = 3, + FCPOP_STATE_COMPLETE = 4, +}; + +struct nvme_fc_fcp_op { + struct nvme_request nreq; /* + * nvme/host/core.c + * requires this to be + * the 1st element in the + * private structure + * associated with the + * request. 
+ */ + struct nvmefc_fcp_req fcp_req; + + struct nvme_fc_ctrl *ctrl; + struct nvme_fc_queue *queue; + struct request *rq; + + atomic_t state; + u32 flags; + u32 rqno; + u32 nents; + + struct nvme_fc_cmd_iu cmd_iu; + struct nvme_fc_ersp_iu rsp_iu; +}; + +struct nvme_fc_lport { + struct nvme_fc_local_port localport; + + struct ida endp_cnt; + struct list_head port_list; /* nvme_fc_port_list */ + struct list_head endp_list; + struct device *dev; /* physical device for dma */ + struct nvme_fc_port_template *ops; + struct kref ref; + atomic_t act_rport_cnt; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + +struct nvme_fc_rport { + struct nvme_fc_remote_port remoteport; + + struct list_head endp_list; /* for lport->endp_list */ + struct list_head ctrl_list; + struct list_head ls_req_list; + struct device *dev; /* physical device for dma */ + struct nvme_fc_lport *lport; + spinlock_t lock; + struct kref ref; + atomic_t act_ctrl_cnt; + unsigned long dev_loss_end; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + +enum nvme_fcctrl_flags { + FCCTRL_TERMIO = (1 << 0), +}; + +struct nvme_fc_ctrl { + spinlock_t lock; + struct nvme_fc_queue *queues; + struct device *dev; + struct nvme_fc_lport *lport; + struct nvme_fc_rport *rport; + u32 cnum; + + bool assoc_active; + u64 association_id; + + struct list_head ctrl_list; /* rport->ctrl_list */ + + struct blk_mq_tag_set admin_tag_set; + struct blk_mq_tag_set tag_set; + + struct delayed_work connect_work; + + struct kref ref; + u32 flags; + u32 iocnt; + wait_queue_head_t ioabort_wait; + + struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS]; + + struct nvme_ctrl ctrl; +}; + +static inline struct nvme_fc_ctrl * +to_fc_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_fc_ctrl, ctrl); +} + +static inline struct nvme_fc_lport * +localport_to_lport(struct nvme_fc_local_port *portptr) +{ + return container_of(portptr, struct nvme_fc_lport, localport); +} + +static inline struct nvme_fc_rport * +remoteport_to_rport(struct nvme_fc_remote_port *portptr) +{ + return container_of(portptr, struct nvme_fc_rport, remoteport); +} + +static inline struct nvmefc_ls_req_op * +ls_req_to_lsop(struct nvmefc_ls_req *lsreq) +{ + return container_of(lsreq, struct nvmefc_ls_req_op, ls_req); +} + +static inline struct nvme_fc_fcp_op * +fcp_req_to_fcp_op(struct nvmefc_fcp_req *fcpreq) +{ + return container_of(fcpreq, struct nvme_fc_fcp_op, fcp_req); +} + + + +/* *************************** Globals **************************** */ + + +static DEFINE_SPINLOCK(nvme_fc_lock); + +static LIST_HEAD(nvme_fc_lport_list); +static DEFINE_IDA(nvme_fc_local_port_cnt); +static DEFINE_IDA(nvme_fc_ctrl_cnt); + + + +/* + * These items are short-term. They will eventually be moved into + * a generic FC class. See comments in module init. 
+ */ +static struct class *fc_class; +static struct device *fc_udev_device; + + +/* *********************** FC-NVME Port Management ************************ */ + +static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, + struct nvme_fc_queue *, unsigned int); + +static void +nvme_fc_free_lport(struct kref *ref) +{ + struct nvme_fc_lport *lport = + container_of(ref, struct nvme_fc_lport, ref); + unsigned long flags; + + WARN_ON(lport->localport.port_state != FC_OBJSTATE_DELETED); + WARN_ON(!list_empty(&lport->endp_list)); + + /* remove from transport list */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_del(&lport->port_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); + ida_destroy(&lport->endp_cnt); + + put_device(lport->dev); + + kfree(lport); +} + +static void +nvme_fc_lport_put(struct nvme_fc_lport *lport) +{ + kref_put(&lport->ref, nvme_fc_free_lport); +} + +static int +nvme_fc_lport_get(struct nvme_fc_lport *lport) +{ + return kref_get_unless_zero(&lport->ref); +} + + +static struct nvme_fc_lport * +nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo, + struct nvme_fc_port_template *ops, + struct device *dev) +{ + struct nvme_fc_lport *lport; + unsigned long flags; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { + if (lport->localport.node_name != pinfo->node_name || + lport->localport.port_name != pinfo->port_name) + continue; + + if (lport->dev != dev) { + lport = ERR_PTR(-EXDEV); + goto out_done; + } + + if (lport->localport.port_state != FC_OBJSTATE_DELETED) { + lport = ERR_PTR(-EEXIST); + goto out_done; + } + + if (!nvme_fc_lport_get(lport)) { + /* + * fails if ref cnt already 0. If so, + * act as if lport already deleted + */ + lport = NULL; + goto out_done; + } + + /* resume the lport */ + + lport->ops = ops; + lport->localport.port_role = pinfo->port_role; + lport->localport.port_id = pinfo->port_id; + lport->localport.port_state = FC_OBJSTATE_ONLINE; + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return lport; + } + + lport = NULL; + +out_done: + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return lport; +} + +/** + * nvme_fc_register_localport - transport entry point called by an + * LLDD to register the existence of a NVME + * host FC port. + * @pinfo: pointer to information about the port to be registered + * @template: LLDD entrypoints and operational parameters for the port + * @dev: physical hardware device node port corresponds to. Will be + * used for DMA mappings + * @lport_p: pointer to a local port pointer. Upon success, the routine + * will allocate a nvme_fc_local_port structure and place its + * address in the local port pointer. Upon failure, local port + * pointer will be set to 0. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. 
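+ *
+ * A registration call from an LLDD might look roughly like the following
+ * (sketch only; the WWNs, port id, template and DMA device pointer are
+ * placeholders owned by the LLDD):
+ *
+ *	struct nvme_fc_port_info pinfo = {
+ *		.node_name = lldd_wwnn,
+ *		.port_name = lldd_wwpn,
+ *		.port_role = FC_PORT_ROLE_NVME_INITIATOR,
+ *		.port_id   = lldd_port_id,
+ *	};
+ *	struct nvme_fc_local_port *localport;
+ *	int ret;
+ *
+ *	ret = nvme_fc_register_localport(&pinfo, &lldd_nvme_fc_template,
+ *					 lldd_dma_dev, &localport);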
+ */ +int +nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, + struct nvme_fc_port_template *template, + struct device *dev, + struct nvme_fc_local_port **portptr) +{ + struct nvme_fc_lport *newrec; + unsigned long flags; + int ret, idx; + + if (!template->localport_delete || !template->remoteport_delete || + !template->ls_req || !template->fcp_io || + !template->ls_abort || !template->fcp_abort || + !template->max_hw_queues || !template->max_sgl_segments || + !template->max_dif_sgl_segments || !template->dma_boundary) { + ret = -EINVAL; + goto out_reghost_failed; + } + + /* + * look to see if there is already a localport that had been + * deregistered and in the process of waiting for all the + * references to fully be removed. If the references haven't + * expired, we can simply re-enable the localport. Remoteports + * and controller reconnections should resume naturally. + */ + newrec = nvme_fc_attach_to_unreg_lport(pinfo, template, dev); + + /* found an lport, but something about its state is bad */ + if (IS_ERR(newrec)) { + ret = PTR_ERR(newrec); + goto out_reghost_failed; + + /* found existing lport, which was resumed */ + } else if (newrec) { + *portptr = &newrec->localport; + return 0; + } + + /* nothing found - allocate a new localport struct */ + + newrec = kmalloc((sizeof(*newrec) + template->local_priv_sz), + GFP_KERNEL); + if (!newrec) { + ret = -ENOMEM; + goto out_reghost_failed; + } + + idx = ida_simple_get(&nvme_fc_local_port_cnt, 0, 0, GFP_KERNEL); + if (idx < 0) { + ret = -ENOSPC; + goto out_fail_kfree; + } + + if (!get_device(dev) && dev) { + ret = -ENODEV; + goto out_ida_put; + } + + INIT_LIST_HEAD(&newrec->port_list); + INIT_LIST_HEAD(&newrec->endp_list); + kref_init(&newrec->ref); + atomic_set(&newrec->act_rport_cnt, 0); + newrec->ops = template; + newrec->dev = dev; + ida_init(&newrec->endp_cnt); + newrec->localport.private = &newrec[1]; + newrec->localport.node_name = pinfo->node_name; + newrec->localport.port_name = pinfo->port_name; + newrec->localport.port_role = pinfo->port_role; + newrec->localport.port_id = pinfo->port_id; + newrec->localport.port_state = FC_OBJSTATE_ONLINE; + newrec->localport.port_num = idx; + + spin_lock_irqsave(&nvme_fc_lock, flags); + list_add_tail(&newrec->port_list, &nvme_fc_lport_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + if (dev) + dma_set_seg_boundary(dev, template->dma_boundary); + + *portptr = &newrec->localport; + return 0; + +out_ida_put: + ida_simple_remove(&nvme_fc_local_port_cnt, idx); +out_fail_kfree: + kfree(newrec); +out_reghost_failed: + *portptr = NULL; + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_fc_register_localport); + +/** + * nvme_fc_unregister_localport - transport entry point called by an + * LLDD to deregister/remove a previously + * registered a NVME host FC port. + * @localport: pointer to the (registered) local port that is to be + * deregistered. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. 
+ */ +int +nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr) +{ + struct nvme_fc_lport *lport = localport_to_lport(portptr); + unsigned long flags; + + if (!portptr) + return -EINVAL; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + if (portptr->port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&nvme_fc_lock, flags); + return -EINVAL; + } + portptr->port_state = FC_OBJSTATE_DELETED; + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + if (atomic_read(&lport->act_rport_cnt) == 0) + lport->ops->localport_delete(&lport->localport); + + nvme_fc_lport_put(lport); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport); + +/* + * TRADDR strings, per FC-NVME are fixed format: + * "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters + * udev event will only differ by prefix of what field is + * being specified: + * "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters + * 19 + 43 + null_fudge = 64 characters + */ +#define FCNVME_TRADDR_LENGTH 64 + +static void +nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport, + struct nvme_fc_rport *rport) +{ + char hostaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_HOST_TRADDR=...*/ + char tgtaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_TRADDR=...*/ + char *envp[4] = { "FC_EVENT=nvmediscovery", hostaddr, tgtaddr, NULL }; + + if (!(rport->remoteport.port_role & FC_PORT_ROLE_NVME_DISCOVERY)) + return; + + snprintf(hostaddr, sizeof(hostaddr), + "NVMEFC_HOST_TRADDR=nn-0x%016llx:pn-0x%016llx", + lport->localport.node_name, lport->localport.port_name); + snprintf(tgtaddr, sizeof(tgtaddr), + "NVMEFC_TRADDR=nn-0x%016llx:pn-0x%016llx", + rport->remoteport.node_name, rport->remoteport.port_name); + kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp); +} + +static void +nvme_fc_free_rport(struct kref *ref) +{ + struct nvme_fc_rport *rport = + container_of(ref, struct nvme_fc_rport, ref); + struct nvme_fc_lport *lport = + localport_to_lport(rport->remoteport.localport); + unsigned long flags; + + WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED); + WARN_ON(!list_empty(&rport->ctrl_list)); + + /* remove from lport list */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_del(&rport->endp_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); + + kfree(rport); + + nvme_fc_lport_put(lport); +} + +static void +nvme_fc_rport_put(struct nvme_fc_rport *rport) +{ + kref_put(&rport->ref, nvme_fc_free_rport); +} + +static int +nvme_fc_rport_get(struct nvme_fc_rport *rport) +{ + return kref_get_unless_zero(&rport->ref); +} + +static void +nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl) +{ + switch (ctrl->ctrl.state) { + case NVME_CTRL_NEW: + case NVME_CTRL_CONNECTING: + /* + * As all reconnects were suppressed, schedule a + * connect. + */ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: connectivity re-established. " + "Attempting reconnect\n", ctrl->cnum); + + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0); + break; + + case NVME_CTRL_RESETTING: + /* + * Controller is already in the process of terminating the + * association. No need to do anything further. The reconnect + * step will naturally occur after the reset completes. 
+ */ + break; + + default: + /* no action to take - let it delete */ + break; + } +} + +static struct nvme_fc_rport * +nvme_fc_attach_to_suspended_rport(struct nvme_fc_lport *lport, + struct nvme_fc_port_info *pinfo) +{ + struct nvme_fc_rport *rport; + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + list_for_each_entry(rport, &lport->endp_list, endp_list) { + if (rport->remoteport.node_name != pinfo->node_name || + rport->remoteport.port_name != pinfo->port_name) + continue; + + if (!nvme_fc_rport_get(rport)) { + rport = ERR_PTR(-ENOLCK); + goto out_done; + } + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + spin_lock_irqsave(&rport->lock, flags); + + /* has it been unregistered */ + if (rport->remoteport.port_state != FC_OBJSTATE_DELETED) { + /* means lldd called us twice */ + spin_unlock_irqrestore(&rport->lock, flags); + nvme_fc_rport_put(rport); + return ERR_PTR(-ESTALE); + } + + rport->remoteport.port_role = pinfo->port_role; + rport->remoteport.port_id = pinfo->port_id; + rport->remoteport.port_state = FC_OBJSTATE_ONLINE; + rport->dev_loss_end = 0; + + /* + * kick off a reconnect attempt on all associations to the + * remote port. A successful reconnects will resume i/o. + */ + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) + nvme_fc_resume_controller(ctrl); + + spin_unlock_irqrestore(&rport->lock, flags); + + return rport; + } + + rport = NULL; + +out_done: + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return rport; +} + +static inline void +__nvme_fc_set_dev_loss_tmo(struct nvme_fc_rport *rport, + struct nvme_fc_port_info *pinfo) +{ + if (pinfo->dev_loss_tmo) + rport->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; + else + rport->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; +} + +/** + * nvme_fc_register_remoteport - transport entry point called by an + * LLDD to register the existence of a NVME + * subsystem FC port on its fabric. + * @localport: pointer to the (registered) local port that the remote + * subsystem port is connected to. + * @pinfo: pointer to information about the port to be registered + * @rport_p: pointer to a remote port pointer. Upon success, the routine + * will allocate a nvme_fc_remote_port structure and place its + * address in the remote port pointer. Upon failure, remote port + * pointer will be set to 0. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. + */ +int +nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, + struct nvme_fc_port_info *pinfo, + struct nvme_fc_remote_port **portptr) +{ + struct nvme_fc_lport *lport = localport_to_lport(localport); + struct nvme_fc_rport *newrec; + unsigned long flags; + int ret, idx; + + if (!nvme_fc_lport_get(lport)) { + ret = -ESHUTDOWN; + goto out_reghost_failed; + } + + /* + * look to see if there is already a remoteport that is waiting + * for a reconnect (within dev_loss_tmo) with the same WWN's. + * If so, transition to it and reconnect. 
+ */ + newrec = nvme_fc_attach_to_suspended_rport(lport, pinfo); + + /* found an rport, but something about its state is bad */ + if (IS_ERR(newrec)) { + ret = PTR_ERR(newrec); + goto out_lport_put; + + /* found existing rport, which was resumed */ + } else if (newrec) { + nvme_fc_lport_put(lport); + __nvme_fc_set_dev_loss_tmo(newrec, pinfo); + nvme_fc_signal_discovery_scan(lport, newrec); + *portptr = &newrec->remoteport; + return 0; + } + + /* nothing found - allocate a new remoteport struct */ + + newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz), + GFP_KERNEL); + if (!newrec) { + ret = -ENOMEM; + goto out_lport_put; + } + + idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL); + if (idx < 0) { + ret = -ENOSPC; + goto out_kfree_rport; + } + + INIT_LIST_HEAD(&newrec->endp_list); + INIT_LIST_HEAD(&newrec->ctrl_list); + INIT_LIST_HEAD(&newrec->ls_req_list); + kref_init(&newrec->ref); + atomic_set(&newrec->act_ctrl_cnt, 0); + spin_lock_init(&newrec->lock); + newrec->remoteport.localport = &lport->localport; + newrec->dev = lport->dev; + newrec->lport = lport; + newrec->remoteport.private = &newrec[1]; + newrec->remoteport.port_role = pinfo->port_role; + newrec->remoteport.node_name = pinfo->node_name; + newrec->remoteport.port_name = pinfo->port_name; + newrec->remoteport.port_id = pinfo->port_id; + newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; + newrec->remoteport.port_num = idx; + __nvme_fc_set_dev_loss_tmo(newrec, pinfo); + + spin_lock_irqsave(&nvme_fc_lock, flags); + list_add_tail(&newrec->endp_list, &lport->endp_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + nvme_fc_signal_discovery_scan(lport, newrec); + + *portptr = &newrec->remoteport; + return 0; + +out_kfree_rport: + kfree(newrec); +out_lport_put: + nvme_fc_lport_put(lport); +out_reghost_failed: + *portptr = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport); + +static int +nvme_fc_abort_lsops(struct nvme_fc_rport *rport) +{ + struct nvmefc_ls_req_op *lsop; + unsigned long flags; + +restart: + spin_lock_irqsave(&rport->lock, flags); + + list_for_each_entry(lsop, &rport->ls_req_list, lsreq_list) { + if (!(lsop->flags & FCOP_FLAGS_TERMIO)) { + lsop->flags |= FCOP_FLAGS_TERMIO; + spin_unlock_irqrestore(&rport->lock, flags); + rport->lport->ops->ls_abort(&rport->lport->localport, + &rport->remoteport, + &lsop->ls_req); + goto restart; + } + } + spin_unlock_irqrestore(&rport->lock, flags); + + return 0; +} + +static void +nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) +{ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller connectivity lost. Awaiting " + "Reconnect", ctrl->cnum); + + switch (ctrl->ctrl.state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + /* + * Schedule a controller reset. The reset will terminate the + * association and schedule the reconnect timer. Reconnects + * will be attempted until either the ctlr_loss_tmo + * (max_retries * connect_delay) expires or the remoteport's + * dev_loss_tmo expires. + */ + if (nvme_reset_ctrl(&ctrl->ctrl)) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: Couldn't schedule reset.\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } + break; + + case NVME_CTRL_CONNECTING: + /* + * The association has already been terminated and the + * controller is attempting reconnects. No need to do anything + * futher. Reconnects will be attempted until either the + * ctlr_loss_tmo (max_retries * connect_delay) expires or the + * remoteport's dev_loss_tmo expires. 
+ */ + break; + + case NVME_CTRL_RESETTING: + /* + * Controller is already in the process of terminating the + * association. No need to do anything further. The reconnect + * step will kick in naturally after the association is + * terminated. + */ + break; + + case NVME_CTRL_DELETING: + default: + /* no action to take - let it delete */ + break; + } +} + +/** + * nvme_fc_unregister_remoteport - transport entry point called by an + * LLDD to deregister/remove a previously + * registered a NVME subsystem FC port. + * @remoteport: pointer to the (registered) remote port that is to be + * deregistered. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. + */ +int +nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + if (!portptr) + return -EINVAL; + + spin_lock_irqsave(&rport->lock, flags); + + if (portptr->port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + return -EINVAL; + } + portptr->port_state = FC_OBJSTATE_DELETED; + + rport->dev_loss_end = jiffies + (portptr->dev_loss_tmo * HZ); + + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + /* if dev_loss_tmo==0, dev loss is immediate */ + if (!portptr->dev_loss_tmo) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: controller connectivity lost.\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } else + nvme_fc_ctrl_connectivity_loss(ctrl); + } + + spin_unlock_irqrestore(&rport->lock, flags); + + nvme_fc_abort_lsops(rport); + + if (atomic_read(&rport->act_ctrl_cnt) == 0) + rport->lport->ops->remoteport_delete(portptr); + + /* + * release the reference, which will allow, if all controllers + * go away, which should only occur after dev_loss_tmo occurs, + * for the rport to be torn down. + */ + nvme_fc_rport_put(rport); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); + +/** + * nvme_fc_rescan_remoteport - transport entry point called by an + * LLDD to request a nvme device rescan. + * @remoteport: pointer to the (registered) remote port that is to be + * rescanned. + * + * Returns: N/A + */ +void +nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(remoteport); + + nvme_fc_signal_discovery_scan(rport->lport, rport); +} +EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); + +int +nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr, + u32 dev_loss_tmo) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + + if (portptr->port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + return -EINVAL; + } + + /* a dev_loss_tmo of 0 (immediate) is allowed to be set */ + rport->remoteport.dev_loss_tmo = dev_loss_tmo; + + spin_unlock_irqrestore(&rport->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss); + + +/* *********************** FC-NVME DMA Handling **************************** */ + +/* + * The fcloop device passes in a NULL device pointer. Real LLD's will + * pass in a valid device pointer. If NULL is passed to the dma mapping + * routines, depending on the platform, it may or may not succeed, and + * may crash. + * + * As such: + * Wrapper all the dma routines and check the dev pointer. 
+ * + * If simple mappings (return just a dma address, we'll noop them, + * returning a dma address of 0. + * + * On more complex mappings (dma_map_sg), a pseudo routine fills + * in the scatter list, setting all dma addresses to 0. + */ + +static inline dma_addr_t +fc_dma_map_single(struct device *dev, void *ptr, size_t size, + enum dma_data_direction dir) +{ + return dev ? dma_map_single(dev, ptr, size, dir) : (dma_addr_t)0L; +} + +static inline int +fc_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dev ? dma_mapping_error(dev, dma_addr) : 0; +} + +static inline void +fc_dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_unmap_single(dev, addr, size, dir); +} + +static inline void +fc_dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_sync_single_for_cpu(dev, addr, size, dir); +} + +static inline void +fc_dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_sync_single_for_device(dev, addr, size, dir); +} + +/* pseudo dma_map_sg call */ +static int +fc_map_sg(struct scatterlist *sg, int nents) +{ + struct scatterlist *s; + int i; + + WARN_ON(nents == 0 || sg[0].length == 0); + + for_each_sg(sg, s, nents, i) { + s->dma_address = 0L; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + s->dma_length = s->length; +#endif + } + return nents; +} + +static inline int +fc_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + return dev ? dma_map_sg(dev, sg, nents, dir) : fc_map_sg(sg, nents); +} + +static inline void +fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + if (dev) + dma_unmap_sg(dev, sg, nents, dir); +} + +/* *********************** FC-NVME LS Handling **************************** */ + +static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *); +static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *); + + +static void +__nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop) +{ + struct nvme_fc_rport *rport = lsop->rport; + struct nvmefc_ls_req *lsreq = &lsop->ls_req; + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + + if (!lsop->req_queued) { + spin_unlock_irqrestore(&rport->lock, flags); + return; + } + + list_del(&lsop->lsreq_list); + + lsop->req_queued = false; + + spin_unlock_irqrestore(&rport->lock, flags); + + fc_dma_unmap_single(rport->dev, lsreq->rqstdma, + (lsreq->rqstlen + lsreq->rsplen), + DMA_BIDIRECTIONAL); + + nvme_fc_rport_put(rport); +} + +static int +__nvme_fc_send_ls_req(struct nvme_fc_rport *rport, + struct nvmefc_ls_req_op *lsop, + void (*done)(struct nvmefc_ls_req *req, int status)) +{ + struct nvmefc_ls_req *lsreq = &lsop->ls_req; + unsigned long flags; + int ret = 0; + + if (rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return -ECONNREFUSED; + + if (!nvme_fc_rport_get(rport)) + return -ESHUTDOWN; + + lsreq->done = done; + lsop->rport = rport; + lsop->req_queued = false; + INIT_LIST_HEAD(&lsop->lsreq_list); + init_completion(&lsop->ls_done); + + lsreq->rqstdma = fc_dma_map_single(rport->dev, lsreq->rqstaddr, + lsreq->rqstlen + lsreq->rsplen, + DMA_BIDIRECTIONAL); + if (fc_dma_mapping_error(rport->dev, lsreq->rqstdma)) { + ret = -EFAULT; + goto out_putrport; + } + lsreq->rspdma = lsreq->rqstdma + lsreq->rqstlen; + + spin_lock_irqsave(&rport->lock, flags); + + list_add_tail(&lsop->lsreq_list, &rport->ls_req_list); + + 
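A minimal standalone sketch of the NULL-device guard described in the comment above and implemented by the fc_dma_*() wrappers used just above this point: when no struct device was supplied (as with the fcloop test transport), the wrapper no-ops and hands back a zero address instead of calling into the DMA API. Everything below is illustrative user-space C; fake_device, fake_dma_addr_t and wrapped_map_single are invented names, not kernel interfaces.

#include <stdint.h>
#include <stdio.h>

struct fake_device { const char *name; };	/* stands in for struct device */
typedef uint64_t fake_dma_addr_t;		/* stands in for dma_addr_t    */

/* plays the part of the real dma_map_single() */
static fake_dma_addr_t real_map_single(struct fake_device *dev, void *ptr)
{
	(void)dev;
	return (fake_dma_addr_t)(uintptr_t)ptr;
}

/* the wrapper pattern: forward when a device exists, no-op (0) when not */
static fake_dma_addr_t wrapped_map_single(struct fake_device *dev, void *ptr)
{
	return dev ? real_map_single(dev, ptr) : (fake_dma_addr_t)0;
}

int main(void)
{
	struct fake_device hba = { "real-hba" };
	int payload = 42;

	printf("real dev -> %#llx\n",
	       (unsigned long long)wrapped_map_single(&hba, &payload));
	printf("NULL dev -> %#llx\n",
	       (unsigned long long)wrapped_map_single(NULL, &payload));
	return 0;
}

fc_map_sg() above plays the same role for scatterlists, filling each entry with a zero dma_address so callers can proceed without a real mapping.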
lsop->req_queued = true; + + spin_unlock_irqrestore(&rport->lock, flags); + + ret = rport->lport->ops->ls_req(&rport->lport->localport, + &rport->remoteport, lsreq); + if (ret) + goto out_unlink; + + return 0; + +out_unlink: + lsop->ls_error = ret; + spin_lock_irqsave(&rport->lock, flags); + lsop->req_queued = false; + list_del(&lsop->lsreq_list); + spin_unlock_irqrestore(&rport->lock, flags); + fc_dma_unmap_single(rport->dev, lsreq->rqstdma, + (lsreq->rqstlen + lsreq->rsplen), + DMA_BIDIRECTIONAL); +out_putrport: + nvme_fc_rport_put(rport); + + return ret; +} + +static void +nvme_fc_send_ls_req_done(struct nvmefc_ls_req *lsreq, int status) +{ + struct nvmefc_ls_req_op *lsop = ls_req_to_lsop(lsreq); + + lsop->ls_error = status; + complete(&lsop->ls_done); +} + +static int +nvme_fc_send_ls_req(struct nvme_fc_rport *rport, struct nvmefc_ls_req_op *lsop) +{ + struct nvmefc_ls_req *lsreq = &lsop->ls_req; + struct fcnvme_ls_rjt *rjt = lsreq->rspaddr; + int ret; + + ret = __nvme_fc_send_ls_req(rport, lsop, nvme_fc_send_ls_req_done); + + if (!ret) { + /* + * No timeout/not interruptible as we need the struct + * to exist until the lldd calls us back. Thus mandate + * wait until driver calls back. lldd responsible for + * the timeout action + */ + wait_for_completion(&lsop->ls_done); + + __nvme_fc_finish_ls_req(lsop); + + ret = lsop->ls_error; + } + + if (ret) + return ret; + + /* ACC or RJT payload ? */ + if (rjt->w0.ls_cmd == FCNVME_LS_RJT) + return -ENXIO; + + return 0; +} + +static int +nvme_fc_send_ls_req_async(struct nvme_fc_rport *rport, + struct nvmefc_ls_req_op *lsop, + void (*done)(struct nvmefc_ls_req *req, int status)) +{ + /* don't wait for completion */ + + return __nvme_fc_send_ls_req(rport, lsop, done); +} + +/* Validation Error indexes into the string table below */ +enum { + VERR_NO_ERROR = 0, + VERR_LSACC = 1, + VERR_LSDESC_RQST = 2, + VERR_LSDESC_RQST_LEN = 3, + VERR_ASSOC_ID = 4, + VERR_ASSOC_ID_LEN = 5, + VERR_CONN_ID = 6, + VERR_CONN_ID_LEN = 7, + VERR_CR_ASSOC = 8, + VERR_CR_ASSOC_ACC_LEN = 9, + VERR_CR_CONN = 10, + VERR_CR_CONN_ACC_LEN = 11, + VERR_DISCONN = 12, + VERR_DISCONN_ACC_LEN = 13, +}; + +static char *validation_errors[] = { + "OK", + "Not LS_ACC", + "Not LSDESC_RQST", + "Bad LSDESC_RQST Length", + "Not Association ID", + "Bad Association ID Length", + "Not Connection ID", + "Bad Connection ID Length", + "Not CR_ASSOC Rqst", + "Bad CR_ASSOC ACC Length", + "Not CR_CONN Rqst", + "Bad CR_CONN ACC Length", + "Not Disconnect Rqst", + "Bad Disconnect ACC Length", +}; + +static int +nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_queue *queue, u16 qsize, u16 ersp_ratio) +{ + struct nvmefc_ls_req_op *lsop; + struct nvmefc_ls_req *lsreq; + struct fcnvme_ls_cr_assoc_rqst *assoc_rqst; + struct fcnvme_ls_cr_assoc_acc *assoc_acc; + int ret, fcret = 0; + + lsop = kzalloc((sizeof(*lsop) + + ctrl->lport->ops->lsrqst_priv_sz + + sizeof(*assoc_rqst) + sizeof(*assoc_acc)), GFP_KERNEL); + if (!lsop) { + ret = -ENOMEM; + goto out_no_memory; + } + lsreq = &lsop->ls_req; + + lsreq->private = (void *)&lsop[1]; + assoc_rqst = (struct fcnvme_ls_cr_assoc_rqst *) + (lsreq->private + ctrl->lport->ops->lsrqst_priv_sz); + assoc_acc = (struct fcnvme_ls_cr_assoc_acc *)&assoc_rqst[1]; + + assoc_rqst->w0.ls_cmd = FCNVME_LS_CREATE_ASSOCIATION; + assoc_rqst->desc_list_len = + cpu_to_be32(sizeof(struct fcnvme_lsdesc_cr_assoc_cmd)); + + assoc_rqst->assoc_cmd.desc_tag = + cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD); + assoc_rqst->assoc_cmd.desc_len = + fcnvme_lsdesc_len( + 
sizeof(struct fcnvme_lsdesc_cr_assoc_cmd)); + + assoc_rqst->assoc_cmd.ersp_ratio = cpu_to_be16(ersp_ratio); + assoc_rqst->assoc_cmd.sqsize = cpu_to_be16(qsize - 1); + /* Linux supports only Dynamic controllers */ + assoc_rqst->assoc_cmd.cntlid = cpu_to_be16(0xffff); + uuid_copy(&assoc_rqst->assoc_cmd.hostid, &ctrl->ctrl.opts->host->id); + strncpy(assoc_rqst->assoc_cmd.hostnqn, ctrl->ctrl.opts->host->nqn, + min(FCNVME_ASSOC_HOSTNQN_LEN, NVMF_NQN_SIZE)); + strncpy(assoc_rqst->assoc_cmd.subnqn, ctrl->ctrl.opts->subsysnqn, + min(FCNVME_ASSOC_SUBNQN_LEN, NVMF_NQN_SIZE)); + + lsop->queue = queue; + lsreq->rqstaddr = assoc_rqst; + lsreq->rqstlen = sizeof(*assoc_rqst); + lsreq->rspaddr = assoc_acc; + lsreq->rsplen = sizeof(*assoc_acc); + lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC; + + ret = nvme_fc_send_ls_req(ctrl->rport, lsop); + if (ret) + goto out_free_buffer; + + /* process connect LS completion */ + + /* validate the ACC response */ + if (assoc_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC) + fcret = VERR_LSACC; + else if (assoc_acc->hdr.desc_list_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_cr_assoc_acc))) + fcret = VERR_CR_ASSOC_ACC_LEN; + else if (assoc_acc->hdr.rqst.desc_tag != + cpu_to_be32(FCNVME_LSDESC_RQST)) + fcret = VERR_LSDESC_RQST; + else if (assoc_acc->hdr.rqst.desc_len != + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst))) + fcret = VERR_LSDESC_RQST_LEN; + else if (assoc_acc->hdr.rqst.w0.ls_cmd != FCNVME_LS_CREATE_ASSOCIATION) + fcret = VERR_CR_ASSOC; + else if (assoc_acc->associd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) + fcret = VERR_ASSOC_ID; + else if (assoc_acc->associd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id))) + fcret = VERR_ASSOC_ID_LEN; + else if (assoc_acc->connectid.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CONN_ID)) + fcret = VERR_CONN_ID; + else if (assoc_acc->connectid.desc_len != + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_conn_id))) + fcret = VERR_CONN_ID_LEN; + + if (fcret) { + ret = -EBADF; + dev_err(ctrl->dev, + "q %d connect failed: %s\n", + queue->qnum, validation_errors[fcret]); + } else { + ctrl->association_id = + be64_to_cpu(assoc_acc->associd.association_id); + queue->connection_id = + be64_to_cpu(assoc_acc->connectid.connection_id); + set_bit(NVME_FC_Q_CONNECTED, &queue->flags); + } + +out_free_buffer: + kfree(lsop); +out_no_memory: + if (ret) + dev_err(ctrl->dev, + "queue %d connect admin queue failed (%d).\n", + queue->qnum, ret); + return ret; +} + +static int +nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, + u16 qsize, u16 ersp_ratio) +{ + struct nvmefc_ls_req_op *lsop; + struct nvmefc_ls_req *lsreq; + struct fcnvme_ls_cr_conn_rqst *conn_rqst; + struct fcnvme_ls_cr_conn_acc *conn_acc; + int ret, fcret = 0; + + lsop = kzalloc((sizeof(*lsop) + + ctrl->lport->ops->lsrqst_priv_sz + + sizeof(*conn_rqst) + sizeof(*conn_acc)), GFP_KERNEL); + if (!lsop) { + ret = -ENOMEM; + goto out_no_memory; + } + lsreq = &lsop->ls_req; + + lsreq->private = (void *)&lsop[1]; + conn_rqst = (struct fcnvme_ls_cr_conn_rqst *) + (lsreq->private + ctrl->lport->ops->lsrqst_priv_sz); + conn_acc = (struct fcnvme_ls_cr_conn_acc *)&conn_rqst[1]; + + conn_rqst->w0.ls_cmd = FCNVME_LS_CREATE_CONNECTION; + conn_rqst->desc_list_len = cpu_to_be32( + sizeof(struct fcnvme_lsdesc_assoc_id) + + sizeof(struct fcnvme_lsdesc_cr_conn_cmd)); + + conn_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID); + conn_rqst->associd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id)); + 
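Both connect routines above rely on a single-allocation carve-up: one kzalloc() holds the ls_req_op, the LLDD's private area, the request payload and the expected response, and the individual pointers are derived with &lsop[1]-style arithmetic. The standalone sketch below models just that layout with plain calloc(); demo_op, demo_rqst, demo_acc and priv_sz are invented stand-ins, so treat it as an illustration of the pattern rather than the driver's actual structures.

#include <stdio.h>
#include <stdlib.h>

struct demo_op   { int dummy; };	/* stands in for struct nvmefc_ls_req_op */
struct demo_rqst { char payload[32]; };	/* stands in for the LS request payload  */
struct demo_acc  { char payload[64]; };	/* stands in for the LS accept payload   */

int main(void)
{
	size_t priv_sz = 24;		/* plays the role of lsrqst_priv_sz */
	struct demo_op *op;
	void *priv;
	struct demo_rqst *rqst;
	struct demo_acc *acc;

	/* one zeroed allocation holds op + private area + request + response */
	op = calloc(1, sizeof(*op) + priv_sz + sizeof(*rqst) + sizeof(*acc));
	if (!op)
		return 1;

	priv = &op[1];					/* bytes right after op     */
	rqst = (struct demo_rqst *)((char *)priv + priv_sz);
	acc  = (struct demo_acc *)&rqst[1];		/* response follows request */

	printf("op=%p priv=%p rqst=%p acc=%p\n",
	       (void *)op, priv, (void *)rqst, (void *)acc);

	free(op);					/* one free releases it all */
	return 0;
}

Keeping the request and response adjacent in one buffer is also what lets __nvme_fc_send_ls_req() cover both with a single bidirectional DMA mapping and simply set rspdma = rqstdma + rqstlen.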
conn_rqst->associd.association_id = cpu_to_be64(ctrl->association_id); + conn_rqst->connect_cmd.desc_tag = + cpu_to_be32(FCNVME_LSDESC_CREATE_CONN_CMD); + conn_rqst->connect_cmd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_cr_conn_cmd)); + conn_rqst->connect_cmd.ersp_ratio = cpu_to_be16(ersp_ratio); + conn_rqst->connect_cmd.qid = cpu_to_be16(queue->qnum); + conn_rqst->connect_cmd.sqsize = cpu_to_be16(qsize - 1); + + lsop->queue = queue; + lsreq->rqstaddr = conn_rqst; + lsreq->rqstlen = sizeof(*conn_rqst); + lsreq->rspaddr = conn_acc; + lsreq->rsplen = sizeof(*conn_acc); + lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC; + + ret = nvme_fc_send_ls_req(ctrl->rport, lsop); + if (ret) + goto out_free_buffer; + + /* process connect LS completion */ + + /* validate the ACC response */ + if (conn_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC) + fcret = VERR_LSACC; + else if (conn_acc->hdr.desc_list_len != + fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc))) + fcret = VERR_CR_CONN_ACC_LEN; + else if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST)) + fcret = VERR_LSDESC_RQST; + else if (conn_acc->hdr.rqst.desc_len != + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst))) + fcret = VERR_LSDESC_RQST_LEN; + else if (conn_acc->hdr.rqst.w0.ls_cmd != FCNVME_LS_CREATE_CONNECTION) + fcret = VERR_CR_CONN; + else if (conn_acc->connectid.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CONN_ID)) + fcret = VERR_CONN_ID; + else if (conn_acc->connectid.desc_len != + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_conn_id))) + fcret = VERR_CONN_ID_LEN; + + if (fcret) { + ret = -EBADF; + dev_err(ctrl->dev, + "q %d connect failed: %s\n", + queue->qnum, validation_errors[fcret]); + } else { + queue->connection_id = + be64_to_cpu(conn_acc->connectid.connection_id); + set_bit(NVME_FC_Q_CONNECTED, &queue->flags); + } + +out_free_buffer: + kfree(lsop); +out_no_memory: + if (ret) + dev_err(ctrl->dev, + "queue %d connect command failed (%d).\n", + queue->qnum, ret); + return ret; +} + +static void +nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status) +{ + struct nvmefc_ls_req_op *lsop = ls_req_to_lsop(lsreq); + + __nvme_fc_finish_ls_req(lsop); + + /* fc-nvme iniator doesn't care about success or failure of cmd */ + + kfree(lsop); +} + +/* + * This routine sends a FC-NVME LS to disconnect (aka terminate) + * the FC-NVME Association. Terminating the association also + * terminates the FC-NVME connections (per queue, both admin and io + * queues) that are part of the association. E.g. things are torn + * down, and the related FC-NVME Association ID and Connection IDs + * become invalid. + * + * The behavior of the fc-nvme initiator is such that it's + * understanding of the association and connections will implicitly + * be torn down. The action is implicit as it may be due to a loss of + * connectivity with the fc-nvme target, so you may never get a + * response even if you tried. As such, the action of this routine + * is to asynchronously send the LS, ignore any results of the LS, and + * continue on with terminating the association. If the fc-nvme target + * is present and receives the LS, it too can tear down. 
+ */ +static void +nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) +{ + struct fcnvme_ls_disconnect_rqst *discon_rqst; + struct fcnvme_ls_disconnect_acc *discon_acc; + struct nvmefc_ls_req_op *lsop; + struct nvmefc_ls_req *lsreq; + int ret; + + lsop = kzalloc((sizeof(*lsop) + + ctrl->lport->ops->lsrqst_priv_sz + + sizeof(*discon_rqst) + sizeof(*discon_acc)), + GFP_KERNEL); + if (!lsop) + /* couldn't sent it... too bad */ + return; + + lsreq = &lsop->ls_req; + + lsreq->private = (void *)&lsop[1]; + discon_rqst = (struct fcnvme_ls_disconnect_rqst *) + (lsreq->private + ctrl->lport->ops->lsrqst_priv_sz); + discon_acc = (struct fcnvme_ls_disconnect_acc *)&discon_rqst[1]; + + discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT; + discon_rqst->desc_list_len = cpu_to_be32( + sizeof(struct fcnvme_lsdesc_assoc_id) + + sizeof(struct fcnvme_lsdesc_disconn_cmd)); + + discon_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID); + discon_rqst->associd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id)); + + discon_rqst->associd.association_id = cpu_to_be64(ctrl->association_id); + + discon_rqst->discon_cmd.desc_tag = cpu_to_be32( + FCNVME_LSDESC_DISCONN_CMD); + discon_rqst->discon_cmd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_disconn_cmd)); + discon_rqst->discon_cmd.scope = FCNVME_DISCONN_ASSOCIATION; + discon_rqst->discon_cmd.id = cpu_to_be64(ctrl->association_id); + + lsreq->rqstaddr = discon_rqst; + lsreq->rqstlen = sizeof(*discon_rqst); + lsreq->rspaddr = discon_acc; + lsreq->rsplen = sizeof(*discon_acc); + lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC; + + ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop, + nvme_fc_disconnect_assoc_done); + if (ret) + kfree(lsop); + + /* only meaningful part to terminating the association */ + ctrl->association_id = 0; +} + + +/* *********************** NVME Ctrl Routines **************************** */ + +static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg); + +static int +nvme_fc_reinit_request(void *data, struct request *rq) +{ + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; + + memset(cmdiu, 0, sizeof(*cmdiu)); + cmdiu->scsi_id = NVME_CMD_SCSI_ID; + cmdiu->fc_id = NVME_CMD_FC_ID; + cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32)); + memset(&op->rsp_iu, 0, sizeof(op->rsp_iu)); + + return 0; +} + +static void +__nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_fcp_op *op) +{ + fc_dma_unmap_single(ctrl->lport->dev, op->fcp_req.rspdma, + sizeof(op->rsp_iu), DMA_FROM_DEVICE); + fc_dma_unmap_single(ctrl->lport->dev, op->fcp_req.cmddma, + sizeof(op->cmd_iu), DMA_TO_DEVICE); + + atomic_set(&op->state, FCPOP_STATE_UNINIT); +} + +static void +nvme_fc_exit_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx) +{ + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + + return __nvme_fc_exit_request(set->driver_data, op); +} + +static int +__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op) +{ + unsigned long flags; + int opstate; + + spin_lock_irqsave(&ctrl->lock, flags); + opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED); + if (opstate != FCPOP_STATE_ACTIVE) + atomic_set(&op->state, opstate); + else if (ctrl->flags & FCCTRL_TERMIO) + ctrl->iocnt++; + spin_unlock_irqrestore(&ctrl->lock, flags); + + if (opstate != FCPOP_STATE_ACTIVE) + return -ECANCELED; + + ctrl->lport->ops->fcp_abort(&ctrl->lport->localport, + &ctrl->rport->remoteport, + op->queue->lldd_handle, + 
&op->fcp_req); + + return 0; +} + +static void +nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops; + int i; + + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) + __nvme_fc_abort_op(ctrl, aen_op); +} + +static inline void +__nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_fcp_op *op, int opstate) +{ + unsigned long flags; + + if (opstate == FCPOP_STATE_ABORTED) { + spin_lock_irqsave(&ctrl->lock, flags); + if (ctrl->flags & FCCTRL_TERMIO) { + if (!--ctrl->iocnt) + wake_up(&ctrl->ioabort_wait); + } + spin_unlock_irqrestore(&ctrl->lock, flags); + } +} + +static void +nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) +{ + struct nvme_fc_fcp_op *op = fcp_req_to_fcp_op(req); + struct request *rq = op->rq; + struct nvmefc_fcp_req *freq = &op->fcp_req; + struct nvme_fc_ctrl *ctrl = op->ctrl; + struct nvme_fc_queue *queue = op->queue; + struct nvme_completion *cqe = &op->rsp_iu.cqe; + struct nvme_command *sqe = &op->cmd_iu.sqe; + __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); + union nvme_result result; + bool terminate_assoc = true; + int opstate; + + /* + * WARNING: + * The current linux implementation of a nvme controller + * allocates a single tag set for all io queues and sizes + * the io queues to fully hold all possible tags. Thus, the + * implementation does not reference or care about the sqhd + * value as it never needs to use the sqhd/sqtail pointers + * for submission pacing. + * + * This affects the FC-NVME implementation in two ways: + * 1) As the value doesn't matter, we don't need to waste + * cycles extracting it from ERSPs and stamping it in the + * cases where the transport fabricates CQEs on successful + * completions. + * 2) The FC-NVME implementation requires that delivery of + * ERSP completions are to go back to the nvme layer in order + * relative to the rsn, such that the sqhd value will always + * be "in order" for the nvme layer. As the nvme layer in + * linux doesn't care about sqhd, there's no need to return + * them in order. + * + * Additionally: + * As the core nvme layer in linux currently does not look at + * every field in the cqe - in cases where the FC transport must + * fabricate a CQE, the following fields will not be set as they + * are not referenced: + * cqe.sqid, cqe.sqhd, cqe.command_id + * + * Failure or error of an individual i/o, in a transport + * detected fashion unrelated to the nvme completion status, + * potentially cause the initiator and target sides to get out + * of sync on SQ head/tail (aka outstanding io count allowed). + * Per FC-NVME spec, failure of an individual command requires + * the connection to be terminated, which in turn requires the + * association to be terminated. + */ + + opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE); + + fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, + sizeof(op->rsp_iu), DMA_FROM_DEVICE); + + if (opstate == FCPOP_STATE_ABORTED) + status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); + else if (freq->status) + status = cpu_to_le16(NVME_SC_INTERNAL << 1); + + /* + * For the linux implementation, if we have an unsuccesful + * status, they blk-mq layer can typically be called with the + * non-zero status and the content of the cqe isn't important. + */ + if (status) + goto done; + + /* + * command completed successfully relative to the wire + * protocol. However, validate anything received and + * extract the status and result from the cqe (create it + * where necessary). 
+ */ + + switch (freq->rcv_rsplen) { + + case 0: + case NVME_FC_SIZEOF_ZEROS_RSP: + /* + * No response payload or 12 bytes of payload (which + * should all be zeros) are considered successful and + * no payload in the CQE by the transport. + */ + if (freq->transferred_length != + be32_to_cpu(op->cmd_iu.data_len)) { + status = cpu_to_le16(NVME_SC_INTERNAL << 1); + goto done; + } + result.u64 = 0; + break; + + case sizeof(struct nvme_fc_ersp_iu): + /* + * The ERSP IU contains a full completion with CQE. + * Validate ERSP IU and look at cqe. + */ + if (unlikely(be16_to_cpu(op->rsp_iu.iu_len) != + (freq->rcv_rsplen / 4) || + be32_to_cpu(op->rsp_iu.xfrd_len) != + freq->transferred_length || + op->rsp_iu.status_code || + sqe->common.command_id != cqe->command_id)) { + status = cpu_to_le16(NVME_SC_INTERNAL << 1); + goto done; + } + result = cqe->result; + status = cqe->status; + break; + + default: + status = cpu_to_le16(NVME_SC_INTERNAL << 1); + goto done; + } + + terminate_assoc = false; + +done: + if (op->flags & FCOP_FLAGS_AEN) { + nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); + __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); + atomic_set(&op->state, FCPOP_STATE_IDLE); + op->flags = FCOP_FLAGS_AEN; /* clear other flags */ + nvme_fc_ctrl_put(ctrl); + goto check_error; + } + + /* + * Force failures of commands if we're killing the controller + * or have an error on a command used to create an new association + */ + if (status && + (blk_queue_dying(rq->q) || + ctrl->ctrl.state == NVME_CTRL_NEW || + ctrl->ctrl.state == NVME_CTRL_CONNECTING)) + status |= cpu_to_le16(NVME_SC_DNR << 1); + + __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); + nvme_end_request(rq, status, result); + +check_error: + if (terminate_assoc) + nvme_fc_error_recovery(ctrl, "transport detected io error"); +} + +static int +__nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_queue *queue, struct nvme_fc_fcp_op *op, + struct request *rq, u32 rqno) +{ + struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; + int ret = 0; + + memset(op, 0, sizeof(*op)); + op->fcp_req.cmdaddr = &op->cmd_iu; + op->fcp_req.cmdlen = sizeof(op->cmd_iu); + op->fcp_req.rspaddr = &op->rsp_iu; + op->fcp_req.rsplen = sizeof(op->rsp_iu); + op->fcp_req.done = nvme_fc_fcpio_done; + op->fcp_req.first_sgl = (struct scatterlist *)&op[1]; + op->fcp_req.private = &op->fcp_req.first_sgl[SG_CHUNK_SIZE]; + op->ctrl = ctrl; + op->queue = queue; + op->rq = rq; + op->rqno = rqno; + + cmdiu->scsi_id = NVME_CMD_SCSI_ID; + cmdiu->fc_id = NVME_CMD_FC_ID; + cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32)); + + op->fcp_req.cmddma = fc_dma_map_single(ctrl->lport->dev, + &op->cmd_iu, sizeof(op->cmd_iu), DMA_TO_DEVICE); + if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) { + dev_err(ctrl->dev, + "FCP Op failed - cmdiu dma mapping failed.\n"); + ret = EFAULT; + goto out_on_error; + } + + op->fcp_req.rspdma = fc_dma_map_single(ctrl->lport->dev, + &op->rsp_iu, sizeof(op->rsp_iu), + DMA_FROM_DEVICE); + if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) { + dev_err(ctrl->dev, + "FCP Op failed - rspiu dma mapping failed.\n"); + ret = EFAULT; + } + + atomic_set(&op->state, FCPOP_STATE_IDLE); +out_on_error: + return ret; +} + +static int +nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct nvme_fc_ctrl *ctrl = set->driver_data; + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0; + struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; + + return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); +} + +static int +nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_fcp_op *aen_op; + struct nvme_fc_cmd_iu *cmdiu; + struct nvme_command *sqe; + void *private; + int i, ret; + + aen_op = ctrl->aen_ops; + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { + private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, + GFP_KERNEL); + if (!private) + return -ENOMEM; + + cmdiu = &aen_op->cmd_iu; + sqe = &cmdiu->sqe; + ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0], + aen_op, (struct request *)NULL, + (NVME_AQ_BLK_MQ_DEPTH + i)); + if (ret) { + kfree(private); + return ret; + } + + aen_op->flags = FCOP_FLAGS_AEN; + aen_op->fcp_req.first_sgl = NULL; /* no sg list */ + aen_op->fcp_req.private = private; + + memset(sqe, 0, sizeof(*sqe)); + sqe->common.opcode = nvme_admin_async_event; + /* Note: core layer may overwrite the sqe.command_id value */ + sqe->common.command_id = NVME_AQ_BLK_MQ_DEPTH + i; + } + return 0; +} + +static void +nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_fcp_op *aen_op; + int i; + + aen_op = ctrl->aen_ops; + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { + if (!aen_op->fcp_req.private) + continue; + + __nvme_fc_exit_request(ctrl, aen_op); + + kfree(aen_op->fcp_req.private); + aen_op->fcp_req.private = NULL; + } +} + +static inline void +__nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl, + unsigned int qidx) +{ + struct nvme_fc_queue *queue = &ctrl->queues[qidx]; + + hctx->driver_data = queue; + queue->hctx = hctx; +} + +static int +nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_fc_ctrl *ctrl = data; + + __nvme_fc_init_hctx(hctx, ctrl, hctx_idx + 1); + + return 0; +} + +static int +nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_fc_ctrl *ctrl = data; + + __nvme_fc_init_hctx(hctx, ctrl, hctx_idx); + + return 0; +} + +static void +nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx) +{ + struct nvme_fc_queue *queue; + + queue = &ctrl->queues[idx]; + memset(queue, 0, sizeof(*queue)); + queue->ctrl = ctrl; + queue->qnum = idx; + atomic_set(&queue->csn, 1); + queue->dev = ctrl->dev; + + if (idx > 0) + queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command); + + /* + * Considered whether we should allocate buffers for all SQEs + * and CQEs and dma map them - mapping their respective entries + * into the request structures (kernel vm addr and dma address) + * thus the driver could use the buffers/mappings directly. + * It only makes sense if the LLDD would use them for its + * messaging api. It's very unlikely most adapter api's would use + * a native NVME sqe/cqe. More reasonable if FC-NVME IU payload + * structures were used instead. + */ +} + +/* + * This routine terminates a queue at the transport level. + * The transport has already ensured that all outstanding ios on + * the queue have been terminated. + * The transport will send a Disconnect LS request to terminate + * the queue's connection. Termination of the admin queue will also + * terminate the association at the target. 
+ */ +static void +nvme_fc_free_queue(struct nvme_fc_queue *queue) +{ + if (!test_and_clear_bit(NVME_FC_Q_CONNECTED, &queue->flags)) + return; + + clear_bit(NVME_FC_Q_LIVE, &queue->flags); + /* + * Current implementation never disconnects a single queue. + * It always terminates a whole association. So there is never + * a disconnect(queue) LS sent to the target. + */ + + queue->connection_id = 0; +} + +static void +__nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_queue *queue, unsigned int qidx) +{ + if (ctrl->lport->ops->delete_queue) + ctrl->lport->ops->delete_queue(&ctrl->lport->localport, qidx, + queue->lldd_handle); + queue->lldd_handle = NULL; +} + +static void +nvme_fc_free_io_queues(struct nvme_fc_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) + nvme_fc_free_queue(&ctrl->queues[i]); +} + +static int +__nvme_fc_create_hw_queue(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_queue *queue, unsigned int qidx, u16 qsize) +{ + int ret = 0; + + queue->lldd_handle = NULL; + if (ctrl->lport->ops->create_queue) + ret = ctrl->lport->ops->create_queue(&ctrl->lport->localport, + qidx, qsize, &queue->lldd_handle); + + return ret; +} + +static void +nvme_fc_delete_hw_io_queues(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_queue *queue = &ctrl->queues[ctrl->ctrl.queue_count - 1]; + int i; + + for (i = ctrl->ctrl.queue_count - 1; i >= 1; i--, queue--) + __nvme_fc_delete_hw_queue(ctrl, queue, i); +} + +static int +nvme_fc_create_hw_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) +{ + struct nvme_fc_queue *queue = &ctrl->queues[1]; + int i, ret; + + for (i = 1; i < ctrl->ctrl.queue_count; i++, queue++) { + ret = __nvme_fc_create_hw_queue(ctrl, queue, i, qsize); + if (ret) + goto delete_queues; + } + + return 0; + +delete_queues: + for (; i >= 0; i--) + __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[i], i); + return ret; +} + +static int +nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + ret = nvme_fc_connect_queue(ctrl, &ctrl->queues[i], qsize, + (qsize / 5)); + if (ret) + break; + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + break; + + set_bit(NVME_FC_Q_LIVE, &ctrl->queues[i].flags); + } + + return ret; +} + +static void +nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) + nvme_fc_init_queue(ctrl, i); +} + +static void +nvme_fc_ctrl_free(struct kref *ref) +{ + struct nvme_fc_ctrl *ctrl = + container_of(ref, struct nvme_fc_ctrl, ref); + unsigned long flags; + + if (ctrl->ctrl.tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + } + + /* remove from rport list */ + spin_lock_irqsave(&ctrl->rport->lock, flags); + list_del(&ctrl->ctrl_list); + spin_unlock_irqrestore(&ctrl->rport->lock, flags); + + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + + kfree(ctrl->queues); + + put_device(ctrl->dev); + nvme_fc_rport_put(ctrl->rport); + + ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); + if (ctrl->ctrl.opts) + nvmf_free_options(ctrl->ctrl.opts); + kfree(ctrl); +} + +static void +nvme_fc_ctrl_put(struct nvme_fc_ctrl *ctrl) +{ + kref_put(&ctrl->ref, nvme_fc_ctrl_free); +} + +static int +nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl) +{ + return kref_get_unless_zero(&ctrl->ref); +} + +/* + * All accesses from nvme core layer done - can now free the + * controller. 
Called after last nvme_put_ctrl() call + */ +static void +nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl) +{ + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); + + WARN_ON(nctrl != &ctrl->ctrl); + + nvme_fc_ctrl_put(ctrl); +} + +static void +nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) +{ + /* only proceed if in LIVE state - e.g. on first error */ + if (ctrl->ctrl.state != NVME_CTRL_LIVE) + return; + + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: transport association error detected: %s\n", + ctrl->cnum, errmsg); + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: resetting controller\n", ctrl->cnum); + + nvme_reset_ctrl(&ctrl->ctrl); +} + +static enum blk_eh_timer_return +nvme_fc_timeout(struct request *rq, bool reserved) +{ + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + struct nvme_fc_ctrl *ctrl = op->ctrl; + + /* + * we can't individually ABTS an io without affecting the queue, + * thus killing the queue, and thus the association. + * So resolve by performing a controller reset, which will stop + * the host/io stack, terminate the association on the link, + * and recreate an association on the link. + */ + nvme_fc_error_recovery(ctrl, "io timeout error"); + + /* + * the io abort has been initiated. Have the reset timer + * restarted and the abort completion will complete the io + * shortly. Avoids a synchronous wait while the abort finishes. + */ + return BLK_EH_RESET_TIMER; +} + +static int +nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, + struct nvme_fc_fcp_op *op) +{ + struct nvmefc_fcp_req *freq = &op->fcp_req; + enum dma_data_direction dir; + int ret; + + freq->sg_cnt = 0; + + if (!blk_rq_payload_bytes(rq)) + return 0; + + freq->sg_table.sgl = freq->first_sgl; + ret = sg_alloc_table_chained(&freq->sg_table, + blk_rq_nr_phys_segments(rq), freq->sg_table.sgl); + if (ret) + return -ENOMEM; + + op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl); + WARN_ON(op->nents > blk_rq_nr_phys_segments(rq)); + dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, + op->nents, dir); + if (unlikely(freq->sg_cnt <= 0)) { + sg_free_table_chained(&freq->sg_table, true); + freq->sg_cnt = 0; + return -EFAULT; + } + + /* + * TODO: blk_integrity_rq(rq) for DIF + */ + return 0; +} + +static void +nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, + struct nvme_fc_fcp_op *op) +{ + struct nvmefc_fcp_req *freq = &op->fcp_req; + + if (!freq->sg_cnt) + return; + + fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, + ((rq_data_dir(rq) == WRITE) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE)); + + nvme_cleanup_cmd(rq); + + sg_free_table_chained(&freq->sg_table, true); + + freq->sg_cnt = 0; +} + +/* + * In FC, the queue is a logical thing. At transport connect, the target + * creates its "queue" and returns a handle that is to be given to the + * target whenever it posts something to the corresponding SQ. When an + * SQE is sent on a SQ, FC effectively considers the SQE, or rather the + * command contained within the SQE, an io, and assigns a FC exchange + * to it. The SQE and the associated SQ handle are sent in the initial + * CMD IU sents on the exchange. All transfers relative to the io occur + * as part of the exchange. The CQE is the last thing for the io, + * which is transferred (explicitly or implicitly) with the RSP IU + * sent on the exchange. After the CQE is received, the FC exchange is + * terminaed and the Exchange may be used on a different io. 
+ * + * The transport to LLDD api has the transport making a request for a + * new fcp io request to the LLDD. The LLDD then allocates a FC exchange + * resource and transfers the command. The LLDD will then process all + * steps to complete the io. Upon completion, the transport done routine + * is called. + * + * So - while the operation is outstanding to the LLDD, there is a link + * level FC exchange resource that is also outstanding. This must be + * considered in all cleanup operations. + */ +static blk_status_t +nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, + struct nvme_fc_fcp_op *op, u32 data_len, + enum nvmefc_fcp_datadir io_dir) +{ + struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; + struct nvme_command *sqe = &cmdiu->sqe; + u32 csn; + int ret, opstate; + + /* + * before attempting to send the io, check to see if we believe + * the target device is present + */ + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return BLK_STS_RESOURCE; + + if (!nvme_fc_ctrl_get(ctrl)) + return BLK_STS_IOERR; + + /* format the FC-NVME CMD IU and fcp_req */ + cmdiu->connection_id = cpu_to_be64(queue->connection_id); + csn = atomic_inc_return(&queue->csn); + cmdiu->csn = cpu_to_be32(csn); + cmdiu->data_len = cpu_to_be32(data_len); + switch (io_dir) { + case NVMEFC_FCP_WRITE: + cmdiu->flags = FCNVME_CMD_FLAGS_WRITE; + break; + case NVMEFC_FCP_READ: + cmdiu->flags = FCNVME_CMD_FLAGS_READ; + break; + case NVMEFC_FCP_NODATA: + cmdiu->flags = 0; + break; + } + op->fcp_req.payload_length = data_len; + op->fcp_req.io_dir = io_dir; + op->fcp_req.transferred_length = 0; + op->fcp_req.rcv_rsplen = 0; + op->fcp_req.status = NVME_SC_SUCCESS; + op->fcp_req.sqid = cpu_to_le16(queue->qnum); + + /* + * validate per fabric rules, set fields mandated by fabric spec + * as well as those by FC-NVME spec. 
+ */ + WARN_ON_ONCE(sqe->common.metadata); + sqe->common.flags |= NVME_CMD_SGL_METABUF; + + /* + * format SQE DPTR field per FC-NVME rules: + * type=0x5 Transport SGL Data Block Descriptor + * subtype=0xA Transport-specific value + * address=0 + * length=length of the data series + */ + sqe->rw.dptr.sgl.type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; + sqe->rw.dptr.sgl.length = cpu_to_le32(data_len); + sqe->rw.dptr.sgl.addr = 0; + + if (!(op->flags & FCOP_FLAGS_AEN)) { + ret = nvme_fc_map_data(ctrl, op->rq, op); + if (ret < 0) { + nvme_cleanup_cmd(op->rq); + nvme_fc_ctrl_put(ctrl); + if (ret == -ENOMEM || ret == -EAGAIN) + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; + } + } + + fc_dma_sync_single_for_device(ctrl->lport->dev, op->fcp_req.cmddma, + sizeof(op->cmd_iu), DMA_TO_DEVICE); + + atomic_set(&op->state, FCPOP_STATE_ACTIVE); + + if (!(op->flags & FCOP_FLAGS_AEN)) + blk_mq_start_request(op->rq); + + ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport, + &ctrl->rport->remoteport, + queue->lldd_handle, &op->fcp_req); + + if (ret) { + opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE); + __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); + + if (!(op->flags & FCOP_FLAGS_AEN)) + nvme_fc_unmap_data(ctrl, op->rq, op); + + nvme_fc_ctrl_put(ctrl); + + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE && + ret != -EBUSY) + return BLK_STS_IOERR; + + return BLK_STS_RESOURCE; + } + + return BLK_STS_OK; +} + +static blk_status_t +nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_fc_queue *queue = hctx->driver_data; + struct nvme_fc_ctrl *ctrl = queue->ctrl; + struct request *rq = bd->rq; + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; + struct nvme_command *sqe = &cmdiu->sqe; + enum nvmefc_fcp_datadir io_dir; + u32 data_len; + blk_status_t ret; + + ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, + test_bit(NVME_FC_Q_LIVE, &queue->flags), + ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE); + if (unlikely(ret)) + return ret; + + ret = nvme_setup_cmd(ns, rq, sqe); + if (ret) + return ret; + + data_len = blk_rq_payload_bytes(rq); + if (data_len) + io_dir = ((rq_data_dir(rq) == WRITE) ? 
+ NVMEFC_FCP_WRITE : NVMEFC_FCP_READ); + else + io_dir = NVMEFC_FCP_NODATA; + + return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir); +} + +static struct blk_mq_tags * +nvme_fc_tagset(struct nvme_fc_queue *queue) +{ + if (queue->qnum == 0) + return queue->ctrl->admin_tag_set.tags[queue->qnum]; + + return queue->ctrl->tag_set.tags[queue->qnum - 1]; +} + +static int +nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) + +{ + struct nvme_fc_queue *queue = hctx->driver_data; + struct nvme_fc_ctrl *ctrl = queue->ctrl; + struct request *req; + struct nvme_fc_fcp_op *op; + + req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag); + if (!req) + return 0; + + op = blk_mq_rq_to_pdu(req); + + if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) && + (ctrl->lport->ops->poll_queue)) + ctrl->lport->ops->poll_queue(&ctrl->lport->localport, + queue->lldd_handle); + + return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE)); +} + +static void +nvme_fc_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg); + struct nvme_fc_fcp_op *aen_op; + unsigned long flags; + bool terminating = false; + blk_status_t ret; + + spin_lock_irqsave(&ctrl->lock, flags); + if (ctrl->flags & FCCTRL_TERMIO) + terminating = true; + spin_unlock_irqrestore(&ctrl->lock, flags); + + if (terminating) + return; + + aen_op = &ctrl->aen_ops[0]; + + ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0, + NVMEFC_FCP_NODATA); + if (ret) + dev_err(ctrl->ctrl.device, + "failed async event work\n"); +} + +static void +nvme_fc_complete_rq(struct request *rq) +{ + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + struct nvme_fc_ctrl *ctrl = op->ctrl; + + atomic_set(&op->state, FCPOP_STATE_IDLE); + + nvme_fc_unmap_data(ctrl, rq, op); + nvme_complete_rq(rq); + nvme_fc_ctrl_put(ctrl); +} + +/* + * This routine is used by the transport when it needs to find active + * io on a queue that is to be terminated. The transport uses + * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke + * this routine to kill them on a 1 by 1 basis. + * + * As FC allocates FC exchange for each io, the transport must contact + * the LLDD to terminate the exchange, thus releasing the FC exchange. + * After terminating the exchange the LLDD will call the transport's + * normal io done path for the request, but it will have an aborted + * status. The done path will return the io request back to the block + * layer with an error status. 
+ */ +static void +nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) +{ + struct nvme_ctrl *nctrl = data; + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); + + if (!blk_mq_request_started(req)) + return; + + __nvme_fc_abort_op(ctrl, op); +} + + +static const struct blk_mq_ops nvme_fc_mq_ops = { + .queue_rq = nvme_fc_queue_rq, + .complete = nvme_fc_complete_rq, + .init_request = nvme_fc_init_request, + .exit_request = nvme_fc_exit_request, + .init_hctx = nvme_fc_init_hctx, + .poll = nvme_fc_poll, + .timeout = nvme_fc_timeout, +}; + +static int +nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; + int ret; + + nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + ctrl->lport->ops->max_hw_queues); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); + if (ret) { + dev_info(ctrl->ctrl.device, + "set_queue_count failed: %d\n", ret); + return ret; + } + + ctrl->ctrl.queue_count = nr_io_queues + 1; + if (!nr_io_queues) + return 0; + + nvme_fc_init_io_queues(ctrl); + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_fc_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; + ctrl->tag_set.reserved_tags = 1; /* fabric connect */ + ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) + + (SG_CHUNK_SIZE * + sizeof(struct scatterlist)) + + ctrl->lport->ops->fcprqst_priv_sz; + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + return ret; + + ctrl->ctrl.tagset = &ctrl->tag_set; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tag_set; + } + + ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_cleanup_blk_queue; + + ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_delete_hw_queues; + + return 0; + +out_delete_hw_queues: + nvme_fc_delete_hw_io_queues(ctrl); +out_cleanup_blk_queue: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tag_set: + blk_mq_free_tag_set(&ctrl->tag_set); + nvme_fc_free_io_queues(ctrl); + + /* force put free routine to ignore io queues */ + ctrl->ctrl.tagset = NULL; + + return ret; +} + +static int +nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; + int ret; + + nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + ctrl->lport->ops->max_hw_queues); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); + if (ret) { + dev_info(ctrl->ctrl.device, + "set_queue_count failed: %d\n", ret); + return ret; + } + + ctrl->ctrl.queue_count = nr_io_queues + 1; + /* check for io queues existing */ + if (ctrl->ctrl.queue_count == 1) + return 0; + + nvme_fc_init_io_queues(ctrl); + + ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); + if (ret) + goto out_free_io_queues; + + ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_free_io_queues; + + ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_delete_hw_queues; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); + + return 0; + 
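nvme_fc_create_io_queues() and nvme_fc_reinit_io_queues() above size the IO queue set the same way: the requested count is clamped by the number of online CPUs and by the LLDD's max_hw_queues, nvme_set_queue_count() may lower it further to what the controller actually grants, and the admin queue always adds one to ctrl->ctrl.queue_count. A standalone sketch of that arithmetic follows; the numeric values are invented for illustration.

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int requested   = 32;	/* opts->nr_io_queues (invented)  */
	unsigned int online_cpus = 8;	/* num_online_cpus() (invented)   */
	unsigned int max_hw      = 16;	/* lport->ops->max_hw_queues      */
	unsigned int nr_io_queues, queue_count;

	/* clamp by CPUs and by what the LLDD can provide */
	nr_io_queues = min_u(min_u(requested, online_cpus), max_hw);

	/* the driver then lets the controller lower it further via
	 * nvme_set_queue_count() before committing to this value */
	queue_count = nr_io_queues + 1;	/* +1 for the admin queue */

	printf("io queues=%u, total with admin=%u\n", nr_io_queues, queue_count);
	return 0;
}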
+out_delete_hw_queues: + nvme_fc_delete_hw_io_queues(ctrl); +out_free_io_queues: + nvme_fc_free_io_queues(ctrl); + return ret; +} + +static void +nvme_fc_rport_active_on_lport(struct nvme_fc_rport *rport) +{ + struct nvme_fc_lport *lport = rport->lport; + + atomic_inc(&lport->act_rport_cnt); +} + +static void +nvme_fc_rport_inactive_on_lport(struct nvme_fc_rport *rport) +{ + struct nvme_fc_lport *lport = rport->lport; + u32 cnt; + + cnt = atomic_dec_return(&lport->act_rport_cnt); + if (cnt == 0 && lport->localport.port_state == FC_OBJSTATE_DELETED) + lport->ops->localport_delete(&lport->localport); +} + +static int +nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_rport *rport = ctrl->rport; + u32 cnt; + + if (ctrl->assoc_active) + return 1; + + ctrl->assoc_active = true; + cnt = atomic_inc_return(&rport->act_ctrl_cnt); + if (cnt == 1) + nvme_fc_rport_active_on_lport(rport); + + return 0; +} + +static int +nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_rport *rport = ctrl->rport; + struct nvme_fc_lport *lport = rport->lport; + u32 cnt; + + /* ctrl->assoc_active=false will be set independently */ + + cnt = atomic_dec_return(&rport->act_ctrl_cnt); + if (cnt == 0) { + if (rport->remoteport.port_state == FC_OBJSTATE_DELETED) + lport->ops->remoteport_delete(&rport->remoteport); + nvme_fc_rport_inactive_on_lport(rport); + } + + return 0; +} + +/* + * This routine restarts the controller on the host side, and + * on the link side, recreates the controller association. + */ +static int +nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + int ret; + bool changed; + + ++ctrl->ctrl.nr_reconnects; + + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return -ENODEV; + + if (nvme_fc_ctlr_active_on_rport(ctrl)) + return -ENOTUNIQ; + + /* + * Create the admin queue + */ + + nvme_fc_init_queue(ctrl, 0); + + ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, + NVME_AQ_DEPTH); + if (ret) + goto out_free_queue; + + ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0], + NVME_AQ_DEPTH, (NVME_AQ_DEPTH / 4)); + if (ret) + goto out_delete_hw_queue; + + if (ctrl->ctrl.state != NVME_CTRL_NEW) + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + + ret = nvmf_connect_admin_queue(&ctrl->ctrl); + if (ret) + goto out_disconnect_admin_queue; + + set_bit(NVME_FC_Q_LIVE, &ctrl->queues[0].flags); + + /* + * Check controller capabilities + * + * todo:- add code to check if ctrl attributes changed from + * prior connection values + */ + + ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); + if (ret) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_disconnect_admin_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); + + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + if (ret) + goto out_disconnect_admin_queue; + + ctrl->ctrl.max_hw_sectors = + (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); + + ret = nvme_init_identify(&ctrl->ctrl); + if (ret) + goto out_disconnect_admin_queue; + + /* sanity checks */ + + /* FC-NVME does not have other data in the capsule */ + if (ctrl->ctrl.icdoff) { + dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n", + ctrl->ctrl.icdoff); + goto out_disconnect_admin_queue; + } + + /* FC-NVME supports normal SGL Data Block Descriptors */ + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + 
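The checks at this point cap the requested fabrics queue_size by what the controller reported during association setup: MAXCMD from Identify and the zero's-based sqsize derived from CAP.MQES (hence the sqsize + 1 comparison). The standalone sketch below walks that clamping with invented numbers; it is an illustration of the logic here, not driver code.

#include <stdio.h>

int main(void)
{
	unsigned int queue_size = 128;	/* opts->queue_size (invented)      */
	unsigned int maxcmd     = 64;	/* ctrl->ctrl.maxcmd from Identify  */
	unsigned int sqsize     = 31;	/* ctrl->ctrl.sqsize, zero's-based  */

	if (queue_size > maxcmd) {
		printf("queue_size %u > maxcmd %u, reducing\n",
		       queue_size, maxcmd);
		queue_size = maxcmd;
	}

	if (queue_size > sqsize + 1) {
		printf("queue_size %u > sqsize+1 %u, clamping\n",
		       queue_size, sqsize + 1);
		queue_size = sqsize + 1;
	}

	printf("effective queue_size = %u\n", queue_size);
	return 0;
}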
dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, reducing " + "to queue_size\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->queue_size > ctrl->ctrl.sqsize + 1) { + /* warn if sqsize is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl sqsize %u, clamping down\n", + opts->queue_size, ctrl->ctrl.sqsize + 1); + opts->queue_size = ctrl->ctrl.sqsize + 1; + } + + ret = nvme_fc_init_aen_ops(ctrl); + if (ret) + goto out_term_aen_ops; + + /* + * Create the io queues + */ + + if (ctrl->ctrl.queue_count > 1) { + if (ctrl->ctrl.state == NVME_CTRL_NEW) + ret = nvme_fc_create_io_queues(ctrl); + else + ret = nvme_fc_reinit_io_queues(ctrl); + if (ret) + goto out_term_aen_ops; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + + ctrl->ctrl.nr_reconnects = 0; + + if (changed) + nvme_start_ctrl(&ctrl->ctrl); + + return 0; /* Success */ + +out_term_aen_ops: + nvme_fc_term_aen_ops(ctrl); +out_disconnect_admin_queue: + /* send a Disconnect(association) LS to fc-nvme target */ + nvme_fc_xmt_disconnect_assoc(ctrl); +out_delete_hw_queue: + __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); +out_free_queue: + nvme_fc_free_queue(&ctrl->queues[0]); + ctrl->assoc_active = false; + nvme_fc_ctlr_inactive_on_rport(ctrl); + + return ret; +} + +/* + * This routine stops operation of the controller on the host side. + * On the host os stack side: Admin and IO queues are stopped, + * outstanding ios on them terminated via FC ABTS. + * On the link side: the association is terminated. + */ +static void +nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) +{ + unsigned long flags; + + if (!ctrl->assoc_active) + return; + ctrl->assoc_active = false; + + spin_lock_irqsave(&ctrl->lock, flags); + ctrl->flags |= FCCTRL_TERMIO; + ctrl->iocnt = 0; + spin_unlock_irqrestore(&ctrl->lock, flags); + + /* + * If io queues are present, stop them and terminate all outstanding + * ios on them. As FC allocates FC exchange for each io, the + * transport must contact the LLDD to terminate the exchange, + * thus releasing the FC exchange. We use blk_mq_tagset_busy_itr() + * to tell us what io's are busy and invoke a transport routine + * to kill them with the LLDD. After terminating the exchange + * the LLDD will call the transport's normal io done path, but it + * will have an aborted status. The done path will return the + * io requests back to the block layer as part of normal completions + * (but with error status). + */ + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_fc_terminate_exchange, &ctrl->ctrl); + } + + /* + * Other transports, which don't have link-level contexts bound + * to sqe's, would try to gracefully shutdown the controller by + * writing the registers for shutdown and polling (call + * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially + * just aborted and we will wait on those contexts, and given + * there was no indication of how live the controlelr is on the + * link, don't send more io to create more contexts for the + * shutdown. Let the controller fail via keepalive failure if + * its still present. + */ + + /* + * clean up the admin queue. Same thing as above. + * use blk_mq_tagset_busy_itr() and the transport routine to + * terminate the exchanges. 
+ */ + if (ctrl->ctrl.state != NVME_CTRL_NEW) + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_fc_terminate_exchange, &ctrl->ctrl); + + /* kill the aens as they are a separate path */ + nvme_fc_abort_aen_ops(ctrl); + + /* wait for all io that had to be aborted */ + spin_lock_irq(&ctrl->lock); + wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock); + ctrl->flags &= ~FCCTRL_TERMIO; + spin_unlock_irq(&ctrl->lock); + + nvme_fc_term_aen_ops(ctrl); + + /* + * send a Disconnect(association) LS to fc-nvme target + * Note: could have been sent at top of process, but + * cleaner on link traffic if after the aborts complete. + * Note: if association doesn't exist, association_id will be 0 + */ + if (ctrl->association_id) + nvme_fc_xmt_disconnect_assoc(ctrl); + + if (ctrl->ctrl.tagset) { + nvme_fc_delete_hw_io_queues(ctrl); + nvme_fc_free_io_queues(ctrl); + } + + __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); + nvme_fc_free_queue(&ctrl->queues[0]); + + /* re-enable the admin_q so anything new can fast fail */ + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + + nvme_fc_ctlr_inactive_on_rport(ctrl); +} + +static void +nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); + + cancel_delayed_work_sync(&ctrl->connect_work); + /* + * kill the association on the link side. this will block + * waiting for io to terminate + */ + nvme_fc_delete_association(ctrl); + + /* resume the io queues so that things will fast fail */ + nvme_start_queues(nctrl); +} + +static void +nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) +{ + struct nvme_fc_rport *rport = ctrl->rport; + struct nvme_fc_remote_port *portptr = &rport->remoteport; + unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ; + bool recon = true; + + if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) + return; + + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", + ctrl->cnum, status); + else if (time_after_eq(jiffies, rport->dev_loss_end)) + recon = false; + + if (recon && nvmf_should_reconnect(&ctrl->ctrl)) { + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: Reconnect attempt in %ld " + "seconds\n", + ctrl->cnum, recon_delay / HZ); + else if (time_after(jiffies + recon_delay, rport->dev_loss_end)) + recon_delay = rport->dev_loss_end - jiffies; + + queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay); + } else { + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: Max reconnect attempts (%d) " + "reached.\n", + ctrl->cnum, ctrl->ctrl.nr_reconnects); + else + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: dev_loss_tmo (%d) expired " + "while waiting for remoteport connectivity.\n", + ctrl->cnum, portptr->dev_loss_tmo); + WARN_ON(nvme_delete_ctrl(&ctrl->ctrl)); + } +} + +static void +nvme_fc_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_fc_ctrl *ctrl = + container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); + int ret; + + nvme_stop_ctrl(&ctrl->ctrl); + + /* will block will waiting for io to terminate */ + nvme_fc_delete_association(ctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: error_recovery: Couldn't change state " + "to CONNECTING\n", ctrl->cnum); + return; + } + + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) + ret = 
nvme_fc_create_association(ctrl); + else + ret = -ENOTCONN; + + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller reset complete\n", + ctrl->cnum); +} + +static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { + .name = "fc", + .module = THIS_MODULE, + .flags = NVME_F_FABRICS, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .free_ctrl = nvme_fc_nvme_ctrl_freed, + .submit_async_event = nvme_fc_submit_async_event, + .delete_ctrl = nvme_fc_delete_ctrl, + .get_address = nvmf_get_address, + .reinit_request = nvme_fc_reinit_request, +}; + +static void +nvme_fc_connect_ctrl_work(struct work_struct *work) +{ + int ret; + + struct nvme_fc_ctrl *ctrl = + container_of(to_delayed_work(work), + struct nvme_fc_ctrl, connect_work); + + ret = nvme_fc_create_association(ctrl); + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller reconnect complete\n", + ctrl->cnum); +} + + +static const struct blk_mq_ops nvme_fc_admin_mq_ops = { + .queue_rq = nvme_fc_queue_rq, + .complete = nvme_fc_complete_rq, + .init_request = nvme_fc_init_request, + .exit_request = nvme_fc_exit_request, + .init_hctx = nvme_fc_init_admin_hctx, + .timeout = nvme_fc_timeout, +}; + + +/* + * Fails a controller request if it matches an existing controller + * (association) with the same tuple: + * + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. + */ +static bool +nvme_fc_existing_controller(struct nvme_fc_rport *rport, + struct nvmf_ctrl_options *opts) +{ + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + bool found = false; + + spin_lock_irqsave(&rport->lock, flags); + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts); + if (found) + break; + } + spin_unlock_irqrestore(&rport->lock, flags); + + return found; +} + +static struct nvme_ctrl * +nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, + struct nvme_fc_lport *lport, struct nvme_fc_rport *rport) +{ + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + int ret, idx, retry; + + if (!(rport->remoteport.port_role & + (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) { + ret = -EBADR; + goto out_fail; + } + + if (!opts->duplicate_connect && + nvme_fc_existing_controller(rport, opts)) { + ret = -EALREADY; + goto out_fail; + } + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) { + ret = -ENOMEM; + goto out_fail; + } + + idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL); + if (idx < 0) { + ret = -ENOSPC; + goto out_free_ctrl; + } + + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->ctrl_list); + ctrl->lport = lport; + ctrl->rport = rport; + ctrl->dev = lport->dev; + ctrl->cnum = idx; + ctrl->assoc_active = false; + init_waitqueue_head(&ctrl->ioabort_wait); + + get_device(ctrl->dev); + kref_init(&ctrl->ref); + + INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); + INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); + spin_lock_init(&ctrl->lock); + + /* io queue count */ + ctrl->ctrl.queue_count = min_t(unsigned int, + opts->nr_io_queues, + lport->ops->max_hw_queues); + ctrl->ctrl.queue_count++; /* +1 for admin queue */ + + ctrl->ctrl.sqsize = opts->queue_size - 1; + ctrl->ctrl.kato = opts->kato; + + ret = -ENOMEM; + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, + sizeof(struct nvme_fc_queue), GFP_KERNEL); + if 
(!ctrl->queues) + goto out_free_ida; + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) + + (SG_CHUNK_SIZE * + sizeof(struct scatterlist)) + + ctrl->lport->ops->fcprqst_priv_sz; + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; + + ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (ret) + goto out_free_queues; + ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + ret = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_admin_tag_set; + } + + /* + * Would have been nice to init io queues tag set as well. + * However, we require interaction from the controller + * for max io queue count before we can do so. + * Defer this to the connect path. + */ + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0); + if (ret) + goto out_cleanup_admin_q; + + /* at this point, teardown path changes to ref counting on nvme ctrl */ + + spin_lock_irqsave(&rport->lock, flags); + list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list); + spin_unlock_irqrestore(&rport->lock, flags); + + /* + * It's possible that transactions used to create the association + * may fail. Examples: CreateAssociation LS or CreateIOConnection + * LS gets dropped/corrupted/fails; or a frame gets dropped or a + * command times out for one of the actions to init the controller + * (Connect, Get/Set_Property, Set_Features, etc). Many of these + * transport errors (frame drop, LS failure) inherently must kill + * the association. The transport is coded so that any command used + * to create the association (prior to a LIVE state transition + * while NEW or CONNECTING) will fail if it completes in error or + * times out. + * + * As such: as the connect request was mostly likely due to a + * udev event that discovered the remote port, meaning there is + * not an admin or script there to restart if the connect + * request fails, retry the initial connection creation up to + * three times before giving up and declaring failure. + */ + for (retry = 0; retry < 3; retry++) { + ret = nvme_fc_create_association(ctrl); + if (!ret) + break; + } + + if (ret) { + nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); + cancel_work_sync(&ctrl->ctrl.reset_work); + cancel_delayed_work_sync(&ctrl->connect_work); + + /* couldn't schedule retry - fail out */ + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: Connect retry failed\n", ctrl->cnum); + + ctrl->ctrl.opts = NULL; + + /* initiate nvme ctrl ref counting teardown */ + nvme_uninit_ctrl(&ctrl->ctrl); + + /* Remove core ctrl ref. */ + nvme_put_ctrl(&ctrl->ctrl); + + /* as we're past the point where we transition to the ref + * counting teardown path, if we return a bad pointer here, + * the calling routine, thinking it's prior to the + * transition, will do an rport put. Since the teardown + * path also does a rport put, we do an extra get here to + * so proper order/teardown happens. 
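+ * (The extra reference balances the nvme_fc_rport_put() that nvme_fc_create_ctrl() does when it sees an ERR_PTR return.)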
+ */ + nvme_fc_rport_get(rport); + + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); + } + + nvme_get_ctrl(&ctrl->ctrl); + + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", + ctrl->cnum, ctrl->ctrl.opts->subsysnqn); + + return &ctrl->ctrl; + +out_cleanup_admin_q: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_admin_tag_set: + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_free_queues: + kfree(ctrl->queues); +out_free_ida: + put_device(ctrl->dev); + ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); +out_free_ctrl: + kfree(ctrl); +out_fail: + /* exit via here doesn't follow ctlr ref points */ + return ERR_PTR(ret); +} + + +struct nvmet_fc_traddr { + u64 nn; + u64 pn; +}; + +static int +__nvme_fc_parse_u64(substring_t *sstr, u64 *val) +{ + u64 token64; + + if (match_u64(sstr, &token64)) + return -EINVAL; + *val = token64; + + return 0; +} + +/* + * This routine validates and extracts the WWN's from the TRADDR string. + * As kernel parsers need the 0x to determine number base, universally + * build string to parse with 0x prefix before parsing name strings. + */ +static int +nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen) +{ + char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1]; + substring_t wwn = { name, &name[sizeof(name)-1] }; + int nnoffset, pnoffset; + + /* validate it string one of the 2 allowed formats */ + if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH && + !strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET], + "pn-0x", NVME_FC_TRADDR_OXNNLEN)) { + nnoffset = NVME_FC_TRADDR_OXNNLEN; + pnoffset = NVME_FC_TRADDR_MAX_PN_OFFSET + + NVME_FC_TRADDR_OXNNLEN; + } else if ((strnlen(buf, blen) == NVME_FC_TRADDR_MINLENGTH && + !strncmp(buf, "nn-", NVME_FC_TRADDR_NNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MIN_PN_OFFSET], + "pn-", NVME_FC_TRADDR_NNLEN))) { + nnoffset = NVME_FC_TRADDR_NNLEN; + pnoffset = NVME_FC_TRADDR_MIN_PN_OFFSET + NVME_FC_TRADDR_NNLEN; + } else + goto out_einval; + + name[0] = '0'; + name[1] = 'x'; + name[2 + NVME_FC_TRADDR_HEXNAMELEN] = 0; + + memcpy(&name[2], &buf[nnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->nn)) + goto out_einval; + + memcpy(&name[2], &buf[pnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->pn)) + goto out_einval; + + return 0; + +out_einval: + pr_warn("%s: bad traddr string\n", __func__); + return -EINVAL; +} + +static struct nvme_ctrl * +nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) +{ + struct nvme_fc_lport *lport; + struct nvme_fc_rport *rport; + struct nvme_ctrl *ctrl; + struct nvmet_fc_traddr laddr = { 0L, 0L }; + struct nvmet_fc_traddr raddr = { 0L, 0L }; + unsigned long flags; + int ret; + + ret = nvme_fc_parse_traddr(&raddr, opts->traddr, NVMF_TRADDR_SIZE); + if (ret || !raddr.nn || !raddr.pn) + return ERR_PTR(-EINVAL); + + ret = nvme_fc_parse_traddr(&laddr, opts->host_traddr, NVMF_TRADDR_SIZE); + if (ret || !laddr.nn || !laddr.pn) + return ERR_PTR(-EINVAL); + + /* find the host and remote ports to connect together */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { + if (lport->localport.node_name != laddr.nn || + lport->localport.port_name != laddr.pn) + continue; + + list_for_each_entry(rport, &lport->endp_list, endp_list) { + if (rport->remoteport.node_name != raddr.nn || + rport->remoteport.port_name != raddr.pn) + continue; + + /* if fail to get reference fall through. 
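A failed get normally means the remoteport is already being torn down; the search then falls out and returns -ENOENT.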
Will error */ + if (!nvme_fc_rport_get(rport)) + break; + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + ctrl = nvme_fc_init_ctrl(dev, opts, lport, rport); + if (IS_ERR(ctrl)) + nvme_fc_rport_put(rport); + return ctrl; + } + } + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return ERR_PTR(-ENOENT); +} + + +static struct nvmf_transport_ops nvme_fc_transport = { + .name = "fc", + .module = THIS_MODULE, + .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, + .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, + .create_ctrl = nvme_fc_create_ctrl, +}; + +static int __init nvme_fc_init_module(void) +{ + int ret; + + /* + * NOTE: + * It is expected that in the future the kernel will combine + * the FC-isms that are currently under scsi and now being + * added to by NVME into a new standalone FC class. The SCSI + * and NVME protocols and their devices would be under this + * new FC class. + * + * As we need something to post FC-specific udev events to, + * specifically for nvme probe events, start by creating the + * new device class. When the new standalone FC class is + * put in place, this code will move to a more generic + * location for the class. + */ + fc_class = class_create(THIS_MODULE, "fc"); + if (IS_ERR(fc_class)) { + pr_err("couldn't register class fc\n"); + return PTR_ERR(fc_class); + } + + /* + * Create a device for the FC-centric udev events + */ + fc_udev_device = device_create(fc_class, NULL, MKDEV(0, 0), NULL, + "fc_udev_device"); + if (IS_ERR(fc_udev_device)) { + pr_err("couldn't create fc_udev device!\n"); + ret = PTR_ERR(fc_udev_device); + goto out_destroy_class; + } + + ret = nvmf_register_transport(&nvme_fc_transport); + if (ret) + goto out_destroy_device; + + return 0; + +out_destroy_device: + device_destroy(fc_class, MKDEV(0, 0)); +out_destroy_class: + class_destroy(fc_class); + return ret; +} + +static void __exit nvme_fc_exit_module(void) +{ + /* sanity check - all lports should be removed */ + if (!list_empty(&nvme_fc_lport_list)) + pr_warn("%s: localport list not empty\n", __func__); + + nvmf_unregister_transport(&nvme_fc_transport); + + ida_destroy(&nvme_fc_local_port_cnt); + ida_destroy(&nvme_fc_ctrl_cnt); + + device_destroy(fc_class, MKDEV(0, 0)); + class_destroy(fc_class); +} + +module_init(nvme_fc_init_module); +module_exit(nvme_fc_exit_module); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c new file mode 100644 index 0000000..41279da --- /dev/null +++ b/drivers/nvme/host/lightnvm.c @@ -0,0 +1,1309 @@ +/* + * nvme-lightnvm.c - LightNVM NVMe device + * + * Copyright (C) 2014-2015 IT University of Copenhagen + * Initial release: Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. 
+ * + */ + +#include "nvme.h" + +#include +#include +#include +#include +#include +#include + +enum nvme_nvm_admin_opcode { + nvme_nvm_admin_identity = 0xe2, + nvme_nvm_admin_get_bb_tbl = 0xf2, + nvme_nvm_admin_set_bb_tbl = 0xf1, +}; + +enum nvme_nvm_log_page { + NVME_NVM_LOG_REPORT_CHUNK = 0xca, +}; + +struct nvme_nvm_ph_rw { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __le64 spba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le64 resv; +}; + +struct nvme_nvm_erase_blk { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __le64 spba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le64 resv; +}; + +struct nvme_nvm_identity { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __u32 rsvd11[6]; +}; + +struct nvme_nvm_getbbtbl { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __le64 spba; + __u32 rsvd4[4]; +}; + +struct nvme_nvm_setbbtbl { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __le64 spba; + __le16 nlb; + __u8 value; + __u8 rsvd3; + __u32 rsvd4[3]; +}; + +struct nvme_nvm_command { + union { + struct nvme_common_command common; + struct nvme_nvm_ph_rw ph_rw; + struct nvme_nvm_erase_blk erase; + struct nvme_nvm_identity identity; + struct nvme_nvm_getbbtbl get_bb; + struct nvme_nvm_setbbtbl set_bb; + }; +}; + +struct nvme_nvm_id12_grp { + __u8 mtype; + __u8 fmtype; + __le16 res16; + __u8 num_ch; + __u8 num_lun; + __u8 num_pln; + __u8 rsvd1; + __le16 num_chk; + __le16 num_pg; + __le16 fpg_sz; + __le16 csecs; + __le16 sos; + __le16 rsvd2; + __le32 trdt; + __le32 trdm; + __le32 tprt; + __le32 tprm; + __le32 tbet; + __le32 tbem; + __le32 mpos; + __le32 mccap; + __le16 cpar; + __u8 reserved[906]; +} __packed; + +struct nvme_nvm_id12_addrf { + __u8 ch_offset; + __u8 ch_len; + __u8 lun_offset; + __u8 lun_len; + __u8 pln_offset; + __u8 pln_len; + __u8 blk_offset; + __u8 blk_len; + __u8 pg_offset; + __u8 pg_len; + __u8 sec_offset; + __u8 sec_len; + __u8 res[4]; +} __packed; + +struct nvme_nvm_id12 { + __u8 ver_id; + __u8 vmnt; + __u8 cgrps; + __u8 res; + __le32 cap; + __le32 dom; + struct nvme_nvm_id12_addrf ppaf; + __u8 resv[228]; + struct nvme_nvm_id12_grp grp; + __u8 resv2[2880]; +} __packed; + +struct nvme_nvm_bb_tbl { + __u8 tblid[4]; + __le16 verid; + __le16 revid; + __le32 rvsd1; + __le32 tblks; + __le32 tfact; + __le32 tgrown; + __le32 tdresv; + __le32 thresv; + __le32 rsvd2[8]; + __u8 blk[0]; +}; + +struct nvme_nvm_id20_addrf { + __u8 grp_len; + __u8 pu_len; + __u8 chk_len; + __u8 lba_len; + __u8 resv[4]; +}; + +struct nvme_nvm_id20 { + __u8 mjr; + __u8 mnr; + __u8 resv[6]; + + struct nvme_nvm_id20_addrf lbaf; + + __le32 mccap; + __u8 resv2[12]; + + __u8 wit; + __u8 resv3[31]; + + /* Geometry */ + __le16 num_grp; + __le16 num_pu; + __le32 num_chk; + __le32 clba; + __u8 resv4[52]; + + /* Write data requirements */ + __le32 ws_min; + __le32 ws_opt; + __le32 mw_cunits; + __le32 maxoc; + __le32 maxocpu; + __u8 resv5[44]; + + /* Performance related metrics */ + __le32 trdt; + __le32 trdm; + __le32 twrt; + __le32 twrm; + __le32 tcrst; + __le32 tcrsm; + __u8 resv6[40]; + + /* Reserved area */ + __u8 resv7[2816]; + + /* Vendor specific */ + __u8 vs[1024]; +}; + +struct nvme_nvm_chk_meta { + __u8 state; + __u8 type; + __u8 wi; + __u8 
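/* reserved bytes padding the entry to 32 bytes */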
rsvd[5]; + __le64 slba; + __le64 cnlb; + __le64 wp; +}; + +/* + * Check we didn't inadvertently grow the command struct + */ +static inline void _nvme_nvm_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_grp) != 960); + BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_addrf) != 16); + BUILD_BUG_ON(sizeof(struct nvme_nvm_id12) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_id20_addrf) != 8); + BUILD_BUG_ON(sizeof(struct nvme_nvm_id20) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != 32); + BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != + sizeof(struct nvm_chk_meta)); +} + +static void nvme_nvm_set_addr_12(struct nvm_addrf_12 *dst, + struct nvme_nvm_id12_addrf *src) +{ + dst->ch_len = src->ch_len; + dst->lun_len = src->lun_len; + dst->blk_len = src->blk_len; + dst->pg_len = src->pg_len; + dst->pln_len = src->pln_len; + dst->sec_len = src->sec_len; + + dst->ch_offset = src->ch_offset; + dst->lun_offset = src->lun_offset; + dst->blk_offset = src->blk_offset; + dst->pg_offset = src->pg_offset; + dst->pln_offset = src->pln_offset; + dst->sec_offset = src->sec_offset; + + dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; + dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; + dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset; + dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset; + dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset; + dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; +} + +static int nvme_nvm_setup_12(struct nvme_nvm_id12 *id, + struct nvm_geo *geo) +{ + struct nvme_nvm_id12_grp *src; + int sec_per_pg, sec_per_pl, pg_per_blk; + + if (id->cgrps != 1) + return -EINVAL; + + src = &id->grp; + + if (src->mtype != 0) { + pr_err("nvm: memory type not supported\n"); + return -EINVAL; + } + + /* 1.2 spec. only reports a single version id - unfold */ + geo->major_ver_id = id->ver_id; + geo->minor_ver_id = 2; + + /* Set compacted version for upper layers */ + geo->version = NVM_OCSSD_SPEC_12; + + geo->num_ch = src->num_ch; + geo->num_lun = src->num_lun; + geo->all_luns = geo->num_ch * geo->num_lun; + + geo->num_chk = le16_to_cpu(src->num_chk); + + geo->csecs = le16_to_cpu(src->csecs); + geo->sos = le16_to_cpu(src->sos); + + pg_per_blk = le16_to_cpu(src->num_pg); + sec_per_pg = le16_to_cpu(src->fpg_sz) / geo->csecs; + sec_per_pl = sec_per_pg * src->num_pln; + geo->clba = sec_per_pl * pg_per_blk; + + geo->all_chunks = geo->all_luns * geo->num_chk; + geo->total_secs = geo->clba * geo->all_chunks; + + geo->ws_min = sec_per_pg; + geo->ws_opt = sec_per_pg; + geo->mw_cunits = geo->ws_opt << 3; /* default to MLC safe values */ + + /* Do not impose values for maximum number of open blocks as it is + * unspecified in 1.2. Users of 1.2 must be aware of this and eventually + * specify these values through a quirk if restrictions apply. 
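+ * The defaults below simply allow every chunk to be open at once.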
+ */ + geo->maxoc = geo->all_luns * geo->num_chk; + geo->maxocpu = geo->num_chk; + + geo->mccap = le32_to_cpu(src->mccap); + + geo->trdt = le32_to_cpu(src->trdt); + geo->trdm = le32_to_cpu(src->trdm); + geo->tprt = le32_to_cpu(src->tprt); + geo->tprm = le32_to_cpu(src->tprm); + geo->tbet = le32_to_cpu(src->tbet); + geo->tbem = le32_to_cpu(src->tbem); + + /* 1.2 compatibility */ + geo->vmnt = id->vmnt; + geo->cap = le32_to_cpu(id->cap); + geo->dom = le32_to_cpu(id->dom); + + geo->mtype = src->mtype; + geo->fmtype = src->fmtype; + + geo->cpar = le16_to_cpu(src->cpar); + geo->mpos = le32_to_cpu(src->mpos); + + geo->pln_mode = NVM_PLANE_SINGLE; + + if (geo->mpos & 0x020202) { + geo->pln_mode = NVM_PLANE_DOUBLE; + geo->ws_opt <<= 1; + } else if (geo->mpos & 0x040404) { + geo->pln_mode = NVM_PLANE_QUAD; + geo->ws_opt <<= 2; + } + + geo->num_pln = src->num_pln; + geo->num_pg = le16_to_cpu(src->num_pg); + geo->fpg_sz = le16_to_cpu(src->fpg_sz); + + nvme_nvm_set_addr_12((struct nvm_addrf_12 *)&geo->addrf, &id->ppaf); + + return 0; +} + +static void nvme_nvm_set_addr_20(struct nvm_addrf *dst, + struct nvme_nvm_id20_addrf *src) +{ + dst->ch_len = src->grp_len; + dst->lun_len = src->pu_len; + dst->chk_len = src->chk_len; + dst->sec_len = src->lba_len; + + dst->sec_offset = 0; + dst->chk_offset = dst->sec_len; + dst->lun_offset = dst->chk_offset + dst->chk_len; + dst->ch_offset = dst->lun_offset + dst->lun_len; + + dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; + dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; + dst->chk_mask = ((1ULL << dst->chk_len) - 1) << dst->chk_offset; + dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; +} + +static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id, + struct nvm_geo *geo) +{ + geo->major_ver_id = id->mjr; + geo->minor_ver_id = id->mnr; + + /* Set compacted version for upper layers */ + geo->version = NVM_OCSSD_SPEC_20; + + if (!(geo->major_ver_id == 2 && geo->minor_ver_id == 0)) { + pr_err("nvm: OCSSD version not supported (v%d.%d)\n", + geo->major_ver_id, geo->minor_ver_id); + return -EINVAL; + } + + geo->num_ch = le16_to_cpu(id->num_grp); + geo->num_lun = le16_to_cpu(id->num_pu); + geo->all_luns = geo->num_ch * geo->num_lun; + + geo->num_chk = le32_to_cpu(id->num_chk); + geo->clba = le32_to_cpu(id->clba); + + geo->all_chunks = geo->all_luns * geo->num_chk; + geo->total_secs = geo->clba * geo->all_chunks; + + geo->ws_min = le32_to_cpu(id->ws_min); + geo->ws_opt = le32_to_cpu(id->ws_opt); + geo->mw_cunits = le32_to_cpu(id->mw_cunits); + geo->maxoc = le32_to_cpu(id->maxoc); + geo->maxocpu = le32_to_cpu(id->maxocpu); + + geo->trdt = le32_to_cpu(id->trdt); + geo->trdm = le32_to_cpu(id->trdm); + geo->tprt = le32_to_cpu(id->twrt); + geo->tprm = le32_to_cpu(id->twrm); + geo->tbet = le32_to_cpu(id->tcrst); + geo->tbem = le32_to_cpu(id->tcrsm); + + nvme_nvm_set_addr_20(&geo->addrf, &id->lbaf); + + return 0; +} + +static int nvme_nvm_identity(struct nvm_dev *nvmdev) +{ + struct nvme_ns *ns = nvmdev->q->queuedata; + struct nvme_nvm_id12 *id; + struct nvme_nvm_command c = {}; + int ret; + + c.identity.opcode = nvme_nvm_admin_identity; + c.identity.nsid = cpu_to_le32(ns->head->ns_id); + + id = kmalloc(sizeof(struct nvme_nvm_id12), GFP_KERNEL); + if (!id) + return -ENOMEM; + + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, + id, sizeof(struct nvme_nvm_id12)); + if (ret) { + ret = -EIO; + goto out; + } + + /* + * The 1.2 and 2.0 specifications share the first byte in their geometry + * command to make it 
possible to know what version a device implements. + */ + switch (id->ver_id) { + case 1: + ret = nvme_nvm_setup_12(id, &nvmdev->geo); + break; + case 2: + ret = nvme_nvm_setup_20((struct nvme_nvm_id20 *)id, + &nvmdev->geo); + break; + default: + dev_err(ns->ctrl->device, "OCSSD revision not supported (%d)\n", + id->ver_id); + ret = -EINVAL; + } + +out: + kfree(id); + return ret; +} + +static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, + u8 *blks) +{ + struct request_queue *q = nvmdev->q; + struct nvm_geo *geo = &nvmdev->geo; + struct nvme_ns *ns = q->queuedata; + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_nvm_command c = {}; + struct nvme_nvm_bb_tbl *bb_tbl; + int nr_blks = geo->num_chk * geo->num_pln; + int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; + int ret = 0; + + c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; + c.get_bb.nsid = cpu_to_le32(ns->head->ns_id); + c.get_bb.spba = cpu_to_le64(ppa.ppa); + + bb_tbl = kzalloc(tblsz, GFP_KERNEL); + if (!bb_tbl) + return -ENOMEM; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, + bb_tbl, tblsz); + if (ret) { + dev_err(ctrl->device, "get bad block table failed (%d)\n", ret); + ret = -EIO; + goto out; + } + + if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || + bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { + dev_err(ctrl->device, "bbt format mismatch\n"); + ret = -EINVAL; + goto out; + } + + if (le16_to_cpu(bb_tbl->verid) != 1) { + ret = -EINVAL; + dev_err(ctrl->device, "bbt version not supported\n"); + goto out; + } + + if (le32_to_cpu(bb_tbl->tblks) != nr_blks) { + ret = -EINVAL; + dev_err(ctrl->device, + "bbt unsuspected blocks returned (%u!=%u)", + le32_to_cpu(bb_tbl->tblks), nr_blks); + goto out; + } + + memcpy(blks, bb_tbl->blk, geo->num_chk * geo->num_pln); +out: + kfree(bb_tbl); + return ret; +} + +static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, + int nr_ppas, int type) +{ + struct nvme_ns *ns = nvmdev->q->queuedata; + struct nvme_nvm_command c = {}; + int ret = 0; + + c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; + c.set_bb.nsid = cpu_to_le32(ns->head->ns_id); + c.set_bb.spba = cpu_to_le64(ppas->ppa); + c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); + c.set_bb.value = type; + + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, + NULL, 0); + if (ret) + dev_err(ns->ctrl->device, "set bad block table failed (%d)\n", + ret); + return ret; +} + +/* + * Expect the lba in device format + */ +static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, + struct nvm_chk_meta *meta, + sector_t slba, int nchks) +{ + struct nvm_geo *geo = &ndev->geo; + struct nvme_ns *ns = ndev->q->queuedata; + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_nvm_chk_meta *dev_meta = (struct nvme_nvm_chk_meta *)meta; + struct ppa_addr ppa; + size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); + size_t log_pos, offset, len; + int ret, i; + + /* Normalize lba address space to obtain log offset */ + ppa.ppa = slba; + ppa = dev_to_generic_addr(ndev, ppa); + + log_pos = ppa.m.chk; + log_pos += ppa.m.pu * geo->num_chk; + log_pos += ppa.m.grp * geo->num_lun * geo->num_chk; + + offset = log_pos * sizeof(struct nvme_nvm_chk_meta); + + while (left) { + len = min_t(unsigned int, left, ctrl->max_hw_sectors << 9); + + ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK, + dev_meta, len, offset); + if (ret) { + dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); + break; + } + + for (i = 0; i < len; i += sizeof(struct nvme_nvm_chk_meta)) { + meta->state 
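/* unpack each on-wire entry, converting the 64-bit fields from little endian */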
= dev_meta->state; + meta->type = dev_meta->type; + meta->wi = dev_meta->wi; + meta->slba = le64_to_cpu(dev_meta->slba); + meta->cnlb = le64_to_cpu(dev_meta->cnlb); + meta->wp = le64_to_cpu(dev_meta->wp); + + meta++; + dev_meta++; + } + + offset += len; + left -= len; + } + + return ret; +} + +static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, + struct nvme_nvm_command *c) +{ + c->ph_rw.opcode = rqd->opcode; + c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id); + c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); + c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); + c->ph_rw.control = cpu_to_le16(rqd->flags); + c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); +} + +static void nvme_nvm_end_io(struct request *rq, blk_status_t status) +{ + struct nvm_rq *rqd = rq->end_io_data; + + rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); + rqd->error = nvme_req(rq)->status; + nvm_end_io(rqd); + + kfree(nvme_req(rq)->cmd); + blk_mq_free_request(rq); +} + +static struct request *nvme_nvm_alloc_request(struct request_queue *q, + struct nvm_rq *rqd, + struct nvme_nvm_command *cmd) +{ + struct nvme_ns *ns = q->queuedata; + struct request *rq; + + nvme_nvm_rqtocmd(rqd, ns, cmd); + + rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); + if (IS_ERR(rq)) + return rq; + + rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; + + if (rqd->bio) { + blk_init_request_from_bio(rq, rqd->bio); + } else { + rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + rq->__data_len = 0; + } + + return rq; +} + +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct nvme_nvm_command *cmd; + struct request *rq; + + cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + rq = nvme_nvm_alloc_request(q, rqd, cmd); + if (IS_ERR(rq)) { + kfree(cmd); + return PTR_ERR(rq); + } + + rq->end_io_data = rqd; + + blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); + + return 0; +} + +static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct request *rq; + struct nvme_nvm_command cmd; + int ret = 0; + + memset(&cmd, 0, sizeof(struct nvme_nvm_command)); + + rq = nvme_nvm_alloc_request(q, rqd, &cmd); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + /* I/Os can fail and the error is signaled through rqd. Callers must + * handle the error accordingly. 
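+ * The NVMe status ends up in rqd->error and the command result in rqd->ppa_status.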
+ */ + blk_execute_rq(q, NULL, rq, 0); + if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + + rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); + rqd->error = nvme_req(rq)->status; + + blk_mq_free_request(rq); + + return ret; +} + +static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) +{ + struct nvme_ns *ns = nvmdev->q->queuedata; + + return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0); +} + +static void nvme_nvm_destroy_dma_pool(void *pool) +{ + struct dma_pool *dma_pool = pool; + + dma_pool_destroy(dma_pool); +} + +static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, + gfp_t mem_flags, dma_addr_t *dma_handler) +{ + return dma_pool_alloc(pool, mem_flags, dma_handler); +} + +static void nvme_nvm_dev_dma_free(void *pool, void *addr, + dma_addr_t dma_handler) +{ + dma_pool_free(pool, addr, dma_handler); +} + +static struct nvm_dev_ops nvme_nvm_dev_ops = { + .identity = nvme_nvm_identity, + + .get_bb_tbl = nvme_nvm_get_bb_tbl, + .set_bb_tbl = nvme_nvm_set_bb_tbl, + + .get_chk_meta = nvme_nvm_get_chk_meta, + + .submit_io = nvme_nvm_submit_io, + .submit_io_sync = nvme_nvm_submit_io_sync, + + .create_dma_pool = nvme_nvm_create_dma_pool, + .destroy_dma_pool = nvme_nvm_destroy_dma_pool, + .dev_dma_alloc = nvme_nvm_dev_dma_alloc, + .dev_dma_free = nvme_nvm_dev_dma_free, +}; + +static int nvme_nvm_submit_user_cmd(struct request_queue *q, + struct nvme_ns *ns, + struct nvme_nvm_command *vcmd, + void __user *ubuf, unsigned int bufflen, + void __user *meta_buf, unsigned int meta_len, + void __user *ppa_buf, unsigned int ppa_len, + u32 *result, u64 *status, unsigned int timeout) +{ + bool write = nvme_is_write((struct nvme_command *)vcmd); + struct nvm_dev *dev = ns->ndev; + struct gendisk *disk = ns->disk; + struct request *rq; + struct bio *bio = NULL; + __le64 *ppa_list = NULL; + dma_addr_t ppa_dma; + __le64 *metadata = NULL; + dma_addr_t metadata_dma; + DECLARE_COMPLETION_ONSTACK(wait); + int ret = 0; + + rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0, + NVME_QID_ANY); + if (IS_ERR(rq)) { + ret = -ENOMEM; + goto err_cmd; + } + + rq->timeout = timeout ? 
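/* caller-supplied timeout (already converted to jiffies) or the admin default */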
timeout : ADMIN_TIMEOUT; + + if (ppa_buf && ppa_len) { + ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); + if (!ppa_list) { + ret = -ENOMEM; + goto err_rq; + } + if (copy_from_user(ppa_list, (void __user *)ppa_buf, + sizeof(u64) * (ppa_len + 1))) { + ret = -EFAULT; + goto err_ppa; + } + vcmd->ph_rw.spba = cpu_to_le64(ppa_dma); + } else { + vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf); + } + + if (ubuf && bufflen) { + ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL); + if (ret) + goto err_ppa; + bio = rq->bio; + + if (meta_buf && meta_len) { + metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, + &metadata_dma); + if (!metadata) { + ret = -ENOMEM; + goto err_map; + } + + if (write) { + if (copy_from_user(metadata, + (void __user *)meta_buf, + meta_len)) { + ret = -EFAULT; + goto err_meta; + } + } + vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); + } + + bio->bi_disk = disk; + } + + blk_execute_rq(q, NULL, rq, 0); + + if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + else if (nvme_req(rq)->status & 0x7ff) + ret = -EIO; + if (result) + *result = nvme_req(rq)->status & 0x7ff; + if (status) + *status = le64_to_cpu(nvme_req(rq)->result.u64); + + if (metadata && !ret && !write) { + if (copy_to_user(meta_buf, (void *)metadata, meta_len)) + ret = -EFAULT; + } +err_meta: + if (meta_buf && meta_len) + dma_pool_free(dev->dma_pool, metadata, metadata_dma); +err_map: + if (bio) + blk_rq_unmap_user(bio); +err_ppa: + if (ppa_buf && ppa_len) + dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); +err_rq: + blk_mq_free_request(rq); +err_cmd: + return ret; +} + +static int nvme_nvm_submit_vio(struct nvme_ns *ns, + struct nvm_user_vio __user *uvio) +{ + struct nvm_user_vio vio; + struct nvme_nvm_command c; + unsigned int length; + int ret; + + if (copy_from_user(&vio, uvio, sizeof(vio))) + return -EFAULT; + if (vio.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.ph_rw.opcode = vio.opcode; + c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id); + c.ph_rw.control = cpu_to_le16(vio.control); + c.ph_rw.length = cpu_to_le16(vio.nppas); + + length = (vio.nppas + 1) << ns->lba_shift; + + ret = nvme_nvm_submit_user_cmd(ns->queue, ns, &c, + (void __user *)(uintptr_t)vio.addr, length, + (void __user *)(uintptr_t)vio.metadata, + vio.metadata_len, + (void __user *)(uintptr_t)vio.ppa_list, vio.nppas, + &vio.result, &vio.status, 0); + + if (ret && copy_to_user(uvio, &vio, sizeof(vio))) + return -EFAULT; + + return ret; +} + +static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, + struct nvm_passthru_vio __user *uvcmd) +{ + struct nvm_passthru_vio vcmd; + struct nvme_nvm_command c; + struct request_queue *q; + unsigned int timeout = 0; + int ret; + + if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd))) + return -EFAULT; + if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN))) + return -EACCES; + if (vcmd.flags) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = vcmd.opcode; + c.common.nsid = cpu_to_le32(ns->head->ns_id); + c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); + /* cdw11-12 */ + c.ph_rw.length = cpu_to_le16(vcmd.nppas); + c.ph_rw.control = cpu_to_le16(vcmd.control); + c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15); + + if (vcmd.timeout_ms) + timeout = msecs_to_jiffies(vcmd.timeout_ms); + + q = admin ? 
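/* admin passthrough goes to the admin queue, I/O passthrough to the namespace queue */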
ns->ctrl->admin_q : ns->queue; + + ret = nvme_nvm_submit_user_cmd(q, ns, + (struct nvme_nvm_command *)&c, + (void __user *)(uintptr_t)vcmd.addr, vcmd.data_len, + (void __user *)(uintptr_t)vcmd.metadata, + vcmd.metadata_len, + (void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas, + &vcmd.result, &vcmd.status, timeout); + + if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd))) + return -EFAULT; + + return ret; +} + +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case NVME_NVM_IOCTL_ADMIN_VIO: + return nvme_nvm_user_vcmd(ns, 1, (void __user *)arg); + case NVME_NVM_IOCTL_IO_VIO: + return nvme_nvm_user_vcmd(ns, 0, (void __user *)arg); + case NVME_NVM_IOCTL_SUBMIT_VIO: + return nvme_nvm_submit_vio(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + +void nvme_nvm_update_nvm_info(struct nvme_ns *ns) +{ + struct nvm_dev *ndev = ns->ndev; + struct nvm_geo *geo = &ndev->geo; + + geo->csecs = 1 << ns->lba_shift; + geo->sos = ns->ms; +} + +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) +{ + struct request_queue *q = ns->queue; + struct nvm_dev *dev; + + _nvme_nvm_check_size(); + + dev = nvm_alloc_dev(node); + if (!dev) + return -ENOMEM; + + dev->q = q; + memcpy(dev->name, disk_name, DISK_NAME_LEN); + dev->ops = &nvme_nvm_dev_ops; + dev->private_data = ns; + ns->ndev = dev; + + return nvm_register(dev); +} + +void nvme_nvm_unregister(struct nvme_ns *ns) +{ + nvm_unregister(ns->ndev); +} + +static ssize_t nvm_dev_attr_show(struct device *dev, + struct device_attribute *dattr, char *page) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvm_dev *ndev = ns->ndev; + struct nvm_geo *geo = &ndev->geo; + struct attribute *attr; + + if (!ndev) + return 0; + + attr = &dattr->attr; + + if (strcmp(attr->name, "version") == 0) { + if (geo->major_ver_id == 1) + return scnprintf(page, PAGE_SIZE, "%u\n", + geo->major_ver_id); + else + return scnprintf(page, PAGE_SIZE, "%u.%u\n", + geo->major_ver_id, + geo->minor_ver_id); + } else if (strcmp(attr->name, "capabilities") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->cap); + } else if (strcmp(attr->name, "read_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdt); + } else if (strcmp(attr->name, "read_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdm); + } else { + return scnprintf(page, + PAGE_SIZE, + "Unhandled attr(%s) in `%s`\n", + attr->name, __func__); + } +} + +static ssize_t nvm_dev_attr_show_ppaf(struct nvm_addrf_12 *ppaf, char *page) +{ + return scnprintf(page, PAGE_SIZE, + "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", + ppaf->ch_offset, ppaf->ch_len, + ppaf->lun_offset, ppaf->lun_len, + ppaf->pln_offset, ppaf->pln_len, + ppaf->blk_offset, ppaf->blk_len, + ppaf->pg_offset, ppaf->pg_len, + ppaf->sec_offset, ppaf->sec_len); +} + +static ssize_t nvm_dev_attr_show_12(struct device *dev, + struct device_attribute *dattr, char *page) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvm_dev *ndev = ns->ndev; + struct nvm_geo *geo = &ndev->geo; + struct attribute *attr; + + if (!ndev) + return 0; + + attr = &dattr->attr; + + if (strcmp(attr->name, "vendor_opcode") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->vmnt); + } else if (strcmp(attr->name, "device_mode") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->dom); + /* kept for compatibility */ + } else if (strcmp(attr->name, "media_manager") == 0) { + return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm"); + } else if (strcmp(attr->name, 
"ppa_format") == 0) { + return nvm_dev_attr_show_ppaf((void *)&geo->addrf, page); + } else if (strcmp(attr->name, "media_type") == 0) { /* u8 */ + return scnprintf(page, PAGE_SIZE, "%u\n", geo->mtype); + } else if (strcmp(attr->name, "flash_media_type") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->fmtype); + } else if (strcmp(attr->name, "num_channels") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch); + } else if (strcmp(attr->name, "num_luns") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun); + } else if (strcmp(attr->name, "num_planes") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pln); + } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ + return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk); + } else if (strcmp(attr->name, "num_pages") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pg); + } else if (strcmp(attr->name, "page_size") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->fpg_sz); + } else if (strcmp(attr->name, "hw_sector_size") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->csecs); + } else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */ + return scnprintf(page, PAGE_SIZE, "%u\n", geo->sos); + } else if (strcmp(attr->name, "prog_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt); + } else if (strcmp(attr->name, "prog_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm); + } else if (strcmp(attr->name, "erase_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet); + } else if (strcmp(attr->name, "erase_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem); + } else if (strcmp(attr->name, "multiplane_modes") == 0) { + return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mpos); + } else if (strcmp(attr->name, "media_capabilities") == 0) { + return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mccap); + } else if (strcmp(attr->name, "max_phys_secs") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", NVM_MAX_VLBA); + } else { + return scnprintf(page, PAGE_SIZE, + "Unhandled attr(%s) in `%s`\n", + attr->name, __func__); + } +} + +static ssize_t nvm_dev_attr_show_20(struct device *dev, + struct device_attribute *dattr, char *page) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvm_dev *ndev = ns->ndev; + struct nvm_geo *geo = &ndev->geo; + struct attribute *attr; + + if (!ndev) + return 0; + + attr = &dattr->attr; + + if (strcmp(attr->name, "groups") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch); + } else if (strcmp(attr->name, "punits") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun); + } else if (strcmp(attr->name, "chunks") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk); + } else if (strcmp(attr->name, "clba") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->clba); + } else if (strcmp(attr->name, "ws_min") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_min); + } else if (strcmp(attr->name, "ws_opt") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_opt); + } else if (strcmp(attr->name, "maxoc") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxoc); + } else if (strcmp(attr->name, "maxocpu") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxocpu); + } else if (strcmp(attr->name, "mw_cunits") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->mw_cunits); + } else if (strcmp(attr->name, "write_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt); + } else if 
(strcmp(attr->name, "write_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm); + } else if (strcmp(attr->name, "reset_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet); + } else if (strcmp(attr->name, "reset_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem); + } else { + return scnprintf(page, PAGE_SIZE, + "Unhandled attr(%s) in `%s`\n", + attr->name, __func__); + } +} + +#define NVM_DEV_ATTR_RO(_name) \ + DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL) +#define NVM_DEV_ATTR_12_RO(_name) \ + DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_12, NULL) +#define NVM_DEV_ATTR_20_RO(_name) \ + DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_20, NULL) + +/* general attributes */ +static NVM_DEV_ATTR_RO(version); +static NVM_DEV_ATTR_RO(capabilities); + +static NVM_DEV_ATTR_RO(read_typ); +static NVM_DEV_ATTR_RO(read_max); + +/* 1.2 values */ +static NVM_DEV_ATTR_12_RO(vendor_opcode); +static NVM_DEV_ATTR_12_RO(device_mode); +static NVM_DEV_ATTR_12_RO(ppa_format); +static NVM_DEV_ATTR_12_RO(media_manager); +static NVM_DEV_ATTR_12_RO(media_type); +static NVM_DEV_ATTR_12_RO(flash_media_type); +static NVM_DEV_ATTR_12_RO(num_channels); +static NVM_DEV_ATTR_12_RO(num_luns); +static NVM_DEV_ATTR_12_RO(num_planes); +static NVM_DEV_ATTR_12_RO(num_blocks); +static NVM_DEV_ATTR_12_RO(num_pages); +static NVM_DEV_ATTR_12_RO(page_size); +static NVM_DEV_ATTR_12_RO(hw_sector_size); +static NVM_DEV_ATTR_12_RO(oob_sector_size); +static NVM_DEV_ATTR_12_RO(prog_typ); +static NVM_DEV_ATTR_12_RO(prog_max); +static NVM_DEV_ATTR_12_RO(erase_typ); +static NVM_DEV_ATTR_12_RO(erase_max); +static NVM_DEV_ATTR_12_RO(multiplane_modes); +static NVM_DEV_ATTR_12_RO(media_capabilities); +static NVM_DEV_ATTR_12_RO(max_phys_secs); + +static struct attribute *nvm_dev_attrs_12[] = { + &dev_attr_version.attr, + &dev_attr_capabilities.attr, + + &dev_attr_vendor_opcode.attr, + &dev_attr_device_mode.attr, + &dev_attr_media_manager.attr, + &dev_attr_ppa_format.attr, + &dev_attr_media_type.attr, + &dev_attr_flash_media_type.attr, + &dev_attr_num_channels.attr, + &dev_attr_num_luns.attr, + &dev_attr_num_planes.attr, + &dev_attr_num_blocks.attr, + &dev_attr_num_pages.attr, + &dev_attr_page_size.attr, + &dev_attr_hw_sector_size.attr, + &dev_attr_oob_sector_size.attr, + &dev_attr_read_typ.attr, + &dev_attr_read_max.attr, + &dev_attr_prog_typ.attr, + &dev_attr_prog_max.attr, + &dev_attr_erase_typ.attr, + &dev_attr_erase_max.attr, + &dev_attr_multiplane_modes.attr, + &dev_attr_media_capabilities.attr, + &dev_attr_max_phys_secs.attr, + + NULL, +}; + +static const struct attribute_group nvm_dev_attr_group_12 = { + .name = "lightnvm", + .attrs = nvm_dev_attrs_12, +}; + +/* 2.0 values */ +static NVM_DEV_ATTR_20_RO(groups); +static NVM_DEV_ATTR_20_RO(punits); +static NVM_DEV_ATTR_20_RO(chunks); +static NVM_DEV_ATTR_20_RO(clba); +static NVM_DEV_ATTR_20_RO(ws_min); +static NVM_DEV_ATTR_20_RO(ws_opt); +static NVM_DEV_ATTR_20_RO(maxoc); +static NVM_DEV_ATTR_20_RO(maxocpu); +static NVM_DEV_ATTR_20_RO(mw_cunits); +static NVM_DEV_ATTR_20_RO(write_typ); +static NVM_DEV_ATTR_20_RO(write_max); +static NVM_DEV_ATTR_20_RO(reset_typ); +static NVM_DEV_ATTR_20_RO(reset_max); + +static struct attribute *nvm_dev_attrs_20[] = { + &dev_attr_version.attr, + &dev_attr_capabilities.attr, + + &dev_attr_groups.attr, + &dev_attr_punits.attr, + &dev_attr_chunks.attr, + &dev_attr_clba.attr, + &dev_attr_ws_min.attr, + &dev_attr_ws_opt.attr, + &dev_attr_maxoc.attr, + &dev_attr_maxocpu.attr, + &dev_attr_mw_cunits.attr, + + 
&dev_attr_read_typ.attr, + &dev_attr_read_max.attr, + &dev_attr_write_typ.attr, + &dev_attr_write_max.attr, + &dev_attr_reset_typ.attr, + &dev_attr_reset_max.attr, + + NULL, +}; + +static const struct attribute_group nvm_dev_attr_group_20 = { + .name = "lightnvm", + .attrs = nvm_dev_attrs_20, +}; + +int nvme_nvm_register_sysfs(struct nvme_ns *ns) +{ + struct nvm_dev *ndev = ns->ndev; + struct nvm_geo *geo = &ndev->geo; + + if (!ndev) + return -EINVAL; + + switch (geo->major_ver_id) { + case 1: + return sysfs_create_group(&disk_to_dev(ns->disk)->kobj, + &nvm_dev_attr_group_12); + case 2: + return sysfs_create_group(&disk_to_dev(ns->disk)->kobj, + &nvm_dev_attr_group_20); + } + + return -EINVAL; +} + +void nvme_nvm_unregister_sysfs(struct nvme_ns *ns) +{ + struct nvm_dev *ndev = ns->ndev; + struct nvm_geo *geo = &ndev->geo; + + switch (geo->major_ver_id) { + case 1: + sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, + &nvm_dev_attr_group_12); + break; + case 2: + sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, + &nvm_dev_attr_group_20); + break; + } +} diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c new file mode 100644 index 0000000..d7b664a --- /dev/null +++ b/drivers/nvme/host/multipath.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2017 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include "nvme.h" + +static bool multipath = true; +module_param(multipath, bool, 0444); +MODULE_PARM_DESC(multipath, + "turn on native support for multiple controllers per subsystem"); + +/* + * If multipathing is enabled we need to always use the subsystem instance + * number for numbering our devices to avoid conflicts between subsystems that + * have multiple controllers and thus use the multipath-aware subsystem node + * and those that have a single controller and use the controller node + * directly. 
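+ * For example, a shared namespace is typically exposed as nvme0n1 via the subsystem node, while each per-controller path shows up as a hidden node such as nvme0c1n1.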
+ */ +void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, + struct nvme_ctrl *ctrl, int *flags) +{ + if (!multipath) { + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); + } else if (ns->head->disk) { + sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, + ctrl->cntlid, ns->head->instance); + *flags = GENHD_FL_HIDDEN; + } else { + sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, + ns->head->instance); + } +} + +void nvme_failover_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + unsigned long flags; + + spin_lock_irqsave(&ns->head->requeue_lock, flags); + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + blk_mq_end_request(req, 0); + + nvme_reset_ctrl(ns->ctrl); + kblockd_schedule_work(&ns->head->requeue_work); +} + +bool nvme_req_needs_failover(struct request *req, blk_status_t error) +{ + if (!(req->cmd_flags & REQ_NVME_MPATH)) + return false; + return blk_path_error(error); +} + +void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->head->disk) + kblockd_schedule_work(&ns->head->requeue_work); + } + up_read(&ctrl->namespaces_rwsem); +} + +static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) +{ + struct nvme_ns *ns; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + if (ns->ctrl->state == NVME_CTRL_LIVE) { + rcu_assign_pointer(head->current_path, ns); + return ns; + } + } + + return NULL; +} + +inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) +{ + struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); + + if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) + ns = __nvme_find_path(head); + return ns; +} + +static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, + struct bio *bio) +{ + struct nvme_ns_head *head = q->queuedata; + struct device *dev = disk_to_dev(head->disk); + struct nvme_ns *ns; + blk_qc_t ret = BLK_QC_T_NONE; + int srcu_idx; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (likely(ns)) { + bio->bi_disk = ns->disk; + bio->bi_opf |= REQ_NVME_MPATH; + ret = direct_make_request(bio); + } else if (!list_empty_careful(&head->list)) { + dev_warn_ratelimited(dev, "no path available - requeuing I/O\n"); + + spin_lock_irq(&head->requeue_lock); + bio_list_add(&head->requeue_list, bio); + spin_unlock_irq(&head->requeue_lock); + } else { + dev_warn_ratelimited(dev, "no path - failing I/O\n"); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } + + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + +static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) +{ + struct nvme_ns_head *head = q->queuedata; + struct nvme_ns *ns; + bool found = false; + int srcu_idx; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = srcu_dereference(head->current_path, &head->srcu); + if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) + found = ns->queue->poll_fn(q, qc); + srcu_read_unlock(&head->srcu, srcu_idx); + return found; +} + +static void nvme_requeue_work(struct work_struct *work) +{ + struct nvme_ns_head *head = + container_of(work, struct nvme_ns_head, requeue_work); + struct bio *bio, *next; + + spin_lock_irq(&head->requeue_lock); + next = bio_list_get(&head->requeue_list); + spin_unlock_irq(&head->requeue_lock); + + while ((bio = next) != NULL) { + next = bio->bi_next; + bio->bi_next = NULL; + + /* + * Reset disk to the mpath 
node and resubmit to select a new + * path. + */ + bio->bi_disk = head->disk; + generic_make_request(bio); + } +} + +int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) +{ + struct request_queue *q; + bool vwc = false; + + bio_list_init(&head->requeue_list); + spin_lock_init(&head->requeue_lock); + INIT_WORK(&head->requeue_work, nvme_requeue_work); + + /* + * Add a multipath node if the subsystems supports multiple controllers. + * We also do this for private namespaces as the namespace sharing data could + * change after a rescan. + */ + if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) + return 0; + + q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); + if (!q) + goto out; + q->queuedata = head; + blk_queue_make_request(q, nvme_ns_head_make_request); + q->poll_fn = nvme_ns_head_poll; + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + /* set to a default value for 512 until disk is validated */ + blk_queue_logical_block_size(q, 512); + + /* we need to propagate up the VMC settings */ + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(q, vwc, vwc); + + head->disk = alloc_disk(0); + if (!head->disk) + goto out_cleanup_queue; + head->disk->fops = &nvme_ns_head_ops; + head->disk->private_data = head; + head->disk->queue = q; + head->disk->flags = GENHD_FL_EXT_DEVT; + sprintf(head->disk->disk_name, "nvme%dn%d", + ctrl->subsys->instance, head->instance); + return 0; + +out_cleanup_queue: + blk_cleanup_queue(q); +out: + return -ENOMEM; +} + +void nvme_mpath_add_disk(struct nvme_ns_head *head) +{ + if (!head->disk) + return; + + mutex_lock(&head->subsys->lock); + if (!(head->disk->flags & GENHD_FL_UP)) { + device_add_disk(&head->subsys->dev, head->disk); + if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group)) + pr_warn("%s: failed to create sysfs group for identification\n", + head->disk->disk_name); + } + mutex_unlock(&head->subsys->lock); +} + +void nvme_mpath_remove_disk(struct nvme_ns_head *head) +{ + if (!head->disk) + return; + sysfs_remove_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group); + del_gendisk(head->disk); + blk_set_queue_dying(head->disk->queue); + /* make sure all pending bios are cleaned up */ + kblockd_schedule_work(&head->requeue_work); + flush_work(&head->requeue_work); + blk_cleanup_queue(head->disk->queue); + put_disk(head->disk); +} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h new file mode 100644 index 0000000..17d2f7c --- /dev/null +++ b/drivers/nvme/host/nvme.h @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef _NVME_H +#define _NVME_H + +#include +#include +#include +#include +#include +#include +#include +#include + +extern unsigned int nvme_io_timeout; +#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) + +extern unsigned int admin_timeout; +#define ADMIN_TIMEOUT (admin_timeout * HZ) + +#define NVME_DEFAULT_KATO 5 +#define NVME_KATO_GRACE 10 + +extern struct workqueue_struct *nvme_wq; +extern struct workqueue_struct *nvme_reset_wq; +extern struct workqueue_struct *nvme_delete_wq; + +enum { + NVME_NS_LBA = 0, + NVME_NS_LIGHTNVM = 1, +}; + +/* + * List of workarounds for devices that required behavior not specified in + * the standard. + */ +enum nvme_quirks { + /* + * Prefers I/O aligned to a stripe size specified in a vendor + * specific Identify field. + */ + NVME_QUIRK_STRIPE_SIZE = (1 << 0), + + /* + * The controller doesn't handle Identify value others than 0 or 1 + * correctly. + */ + NVME_QUIRK_IDENTIFY_CNS = (1 << 1), + + /* + * The controller deterministically returns O's on reads to + * logical blocks that deallocate was called on. + */ + NVME_QUIRK_DEALLOCATE_ZEROES = (1 << 2), + + /* + * The controller needs a delay before starts checking the device + * readiness, which is done by reading the NVME_CSTS_RDY bit. + */ + NVME_QUIRK_DELAY_BEFORE_CHK_RDY = (1 << 3), + + /* + * APST should not be used. + */ + NVME_QUIRK_NO_APST = (1 << 4), + + /* + * The deepest sleep state should not be used. + */ + NVME_QUIRK_NO_DEEPEST_PS = (1 << 5), + + /* + * Supports the LighNVM command set if indicated in vs[1]. + */ + NVME_QUIRK_LIGHTNVM = (1 << 6), + + /* + * Set MEDIUM priority on SQ creation + */ + NVME_QUIRK_MEDIUM_PRIO_SQ = (1 << 7), +}; + +/* + * Common request structure for NVMe passthrough. All drivers must have + * this structure as the first member of their request-private data. + */ +struct nvme_request { + struct nvme_command *cmd; + union nvme_result result; + u8 retries; + u8 flags; + u16 status; +}; + +/* + * Mark a bio as coming in through the mpath node. + */ +#define REQ_NVME_MPATH REQ_DRV + +enum { + NVME_REQ_CANCELLED = (1 << 0), + NVME_REQ_USERCMD = (1 << 1), +}; + +static inline struct nvme_request *nvme_req(struct request *req) +{ + return blk_mq_rq_to_pdu(req); +} + +/* The below value is the specific amount of delay needed before checking + * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the + * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was + * found empirically. 
+ */ +#define NVME_QUIRK_DELAY_AMOUNT 2300 + +enum nvme_ctrl_state { + NVME_CTRL_NEW, + NVME_CTRL_LIVE, + NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */ + NVME_CTRL_RESETTING, + NVME_CTRL_CONNECTING, + NVME_CTRL_DELETING, + NVME_CTRL_DEAD, +}; + +struct nvme_ctrl { + enum nvme_ctrl_state state; + bool identified; + spinlock_t lock; + const struct nvme_ctrl_ops *ops; + struct request_queue *admin_q; + struct request_queue *connect_q; + struct device *dev; + int instance; + struct blk_mq_tag_set *tagset; + struct blk_mq_tag_set *admin_tagset; + struct list_head namespaces; + struct rw_semaphore namespaces_rwsem; + struct device ctrl_device; + struct device *device; /* char device */ + struct cdev cdev; + struct work_struct reset_work; + struct work_struct delete_work; + + struct nvme_subsystem *subsys; + struct list_head subsys_entry; + + struct opal_dev *opal_dev; + + char name[12]; + u16 cntlid; + + u32 ctrl_config; + u16 mtfa; + u32 queue_count; + + u64 cap; + u32 page_size; + u32 max_hw_sectors; + u16 oncs; + u16 oacs; + u16 nssa; + u16 nr_streams; + atomic_t abort_limit; + u8 vwc; + u32 vs; + u32 sgls; + u16 kas; + u8 npss; + u8 apsta; + u32 aen_result; + unsigned int shutdown_timeout; + unsigned int kato; + bool subsystem; + unsigned long quirks; + struct nvme_id_power_state psd[32]; + struct nvme_effects_log *effects; + struct work_struct scan_work; + struct work_struct async_event_work; + struct delayed_work ka_work; + struct nvme_command ka_cmd; + struct work_struct fw_act_work; + + /* Power saving configuration */ + u64 ps_max_latency_us; + bool apst_enabled; + + /* PCIe only: */ + u32 hmpre; + u32 hmmin; + u32 hmminds; + u16 hmmaxd; + + /* Fabrics only */ + u16 sqsize; + u32 ioccsz; + u32 iorcsz; + u16 icdoff; + u16 maxcmd; + int nr_reconnects; + struct nvmf_ctrl_options *opts; +}; + +struct nvme_subsystem { + int instance; + struct device dev; + /* + * Because we unregister the device on the last put we need + * a separate refcount. + */ + struct kref ref; + struct list_head entry; + struct mutex lock; + struct list_head ctrls; + struct list_head nsheads; + char subnqn[NVMF_NQN_SIZE]; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u8 cmic; + u16 vendor_id; + struct ida ns_ida; +}; + +/* + * Container structure for uniqueue namespace identifiers. + */ +struct nvme_ns_ids { + u8 eui64[8]; + u8 nguid[16]; + uuid_t uuid; +}; + +/* + * Anchor structure for namespaces. There is one for each namespace in a + * NVMe subsystem that any of our controllers can see, and the namespace + * structure for each controller is chained of it. For private namespaces + * there is a 1:1 relation to our namespace structures, that is ->list + * only ever has a single entry for private namespaces. 
+ */ +struct nvme_ns_head { +#ifdef CONFIG_NVME_MULTIPATH + struct gendisk *disk; + struct nvme_ns __rcu *current_path; + struct bio_list requeue_list; + spinlock_t requeue_lock; + struct work_struct requeue_work; +#endif + struct list_head list; + struct srcu_struct srcu; + struct nvme_subsystem *subsys; + unsigned ns_id; + struct nvme_ns_ids ids; + struct list_head entry; + struct kref ref; + int instance; +}; + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS +struct nvme_fault_inject { + struct fault_attr attr; + struct dentry *parent; + bool dont_retry; /* DNR, do not retry */ + u16 status; /* status code */ +}; +#endif + +struct nvme_ns { + struct list_head list; + + struct nvme_ctrl *ctrl; + struct request_queue *queue; + struct gendisk *disk; + struct list_head siblings; + struct nvm_dev *ndev; + struct kref kref; + struct nvme_ns_head *head; + + int lba_shift; + u16 ms; + u16 sgs; + u32 sws; + bool ext; + u8 pi_type; + unsigned long flags; +#define NVME_NS_REMOVING 0 +#define NVME_NS_DEAD 1 + u16 noiob; + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + struct nvme_fault_inject fault_inject; +#endif + +}; + +struct nvme_ctrl_ops { + const char *name; + struct module *module; + unsigned int flags; +#define NVME_F_FABRICS (1 << 0) +#define NVME_F_METADATA_SUPPORTED (1 << 1) + int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); + int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); + int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); + void (*free_ctrl)(struct nvme_ctrl *ctrl); + void (*submit_async_event)(struct nvme_ctrl *ctrl); + void (*delete_ctrl)(struct nvme_ctrl *ctrl); + int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); + int (*reinit_request)(void *data, struct request *rq); + void (*stop_ctrl)(struct nvme_ctrl *ctrl); +}; + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS +void nvme_fault_inject_init(struct nvme_ns *ns); +void nvme_fault_inject_fini(struct nvme_ns *ns); +void nvme_should_fail(struct request *req); +#else +static inline void nvme_fault_inject_init(struct nvme_ns *ns) {} +static inline void nvme_fault_inject_fini(struct nvme_ns *ns) {} +static inline void nvme_should_fail(struct request *req) {} +#endif + +static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) +{ + u32 val = 0; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) + return false; + return val & NVME_CSTS_RDY; +} + +static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) +{ + if (!ctrl->subsystem) + return -ENOTTY; + return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); +} + +static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) +{ + return (sector >> (ns->lba_shift - 9)); +} + +static inline void nvme_cleanup_cmd(struct request *req) +{ + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + kfree(page_address(req->special_vec.bv_page) + + req->special_vec.bv_offset); + } +} + +static inline void nvme_end_request(struct request *req, __le16 status, + union nvme_result result) +{ + struct nvme_request *rq = nvme_req(req); + + rq->status = le16_to_cpu(status) >> 1; + rq->result = result; + /* inject error when permitted by fault injection framework */ + nvme_should_fail(req); + blk_mq_complete_request(req); +} + +static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl) +{ + get_device(ctrl->device); +} + +static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) +{ + put_device(ctrl->device); +} + +void nvme_complete_rq(struct request *req); +void nvme_cancel_request(struct request *req, void *data, bool reserved); +bool 
nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state); +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks); +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); +void nvme_start_ctrl(struct nvme_ctrl *ctrl); +void nvme_stop_ctrl(struct nvme_ctrl *ctrl); +void nvme_put_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_identify(struct nvme_ctrl *ctrl); + +void nvme_queue_scan(struct nvme_ctrl *ctrl); +void nvme_remove_namespaces(struct nvme_ctrl *ctrl); + +int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send); + +void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, + union nvme_result *res); + +void nvme_stop_queues(struct nvme_ctrl *ctrl); +void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_kill_queues(struct nvme_ctrl *ctrl); +void nvme_unfreeze(struct nvme_ctrl *ctrl); +void nvme_wait_freeze(struct nvme_ctrl *ctrl); +void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); +void nvme_start_freeze(struct nvme_ctrl *ctrl); +int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); + +#define NVME_QID_ANY -1 +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmd); +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buf, unsigned bufflen); +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + union nvme_result *result, void *buffer, unsigned bufflen, + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags); +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); +void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); +int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); +int nvme_delete_ctrl(struct nvme_ctrl *ctrl); +int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); + +int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 log_page, void *log, size_t size, u64 offset); + +extern const struct attribute_group nvme_ns_id_attr_group; +extern const struct block_device_operations nvme_ns_head_ops; + +#ifdef CONFIG_NVME_MULTIPATH +void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, + struct nvme_ctrl *ctrl, int *flags); +void nvme_failover_req(struct request *req); +bool nvme_req_needs_failover(struct request *req, blk_status_t error); +void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); +int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); +void nvme_mpath_add_disk(struct nvme_ns_head *head); +void nvme_mpath_remove_disk(struct nvme_ns_head *head); + +static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + + if (head && ns == srcu_dereference(head->current_path, &head->srcu)) + rcu_assign_pointer(head->current_path, NULL); +} +struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); + +static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + + if (head->disk && list_empty(&head->list)) + kblockd_schedule_work(&head->requeue_work); +} + +#else +/* + * Without the multipath code enabled, multiple controller per subsystems are 
+ * visible as devices and thus we cannot use the subsystem instance. + */ +static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, + struct nvme_ctrl *ctrl, int *flags) +{ + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); +} + +static inline void nvme_failover_req(struct request *req) +{ +} +static inline bool nvme_req_needs_failover(struct request *req, + blk_status_t error) +{ + return false; +} +static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) +{ +} +static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, + struct nvme_ns_head *head) +{ + return 0; +} +static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) +{ +} +static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) +{ +} +static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ +} +static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) +{ +} +#endif /* CONFIG_NVME_MULTIPATH */ + +#ifdef CONFIG_NVM +void nvme_nvm_update_nvm_info(struct nvme_ns *ns); +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); +void nvme_nvm_unregister(struct nvme_ns *ns); +int nvme_nvm_register_sysfs(struct nvme_ns *ns); +void nvme_nvm_unregister_sysfs(struct nvme_ns *ns); +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); +#else +static inline void nvme_nvm_update_nvm_info(struct nvme_ns *ns) {}; +static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, + int node) +{ + return 0; +} + +static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; +static inline int nvme_nvm_register_sysfs(struct nvme_ns *ns) +{ + return 0; +} +static inline void nvme_nvm_unregister_sysfs(struct nvme_ns *ns) {}; +static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} +#endif /* CONFIG_NVM */ + +static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) +{ + return dev_to_disk(dev)->private_data; +} + +int __init nvme_core_init(void); +void nvme_core_exit(void); + +#endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c new file mode 100644 index 0000000..17a0190 --- /dev/null +++ b/drivers/nvme/host/pci.c @@ -0,0 +1,2767 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */
+
+#include <linux/aer.h>
+#include <linux/async.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/blk-mq-pci.h>
+#include <linux/dmi.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/once.h>
+#include <linux/pci.h>
+#include <linux/t10-pi.h>
+#include <linux/types.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/sed-opal.h>
+
+#include "nvme.h"
+
+#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
+#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
+
+#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+
+static int use_threaded_interrupts;
+module_param(use_threaded_interrupts, int, 0);
+
+static bool use_cmb_sqes = true;
+module_param(use_cmb_sqes, bool, 0644);
+MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
+
+static unsigned int max_host_mem_size_mb = 128;
+module_param(max_host_mem_size_mb, uint, 0444);
+MODULE_PARM_DESC(max_host_mem_size_mb,
+	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
+
+static unsigned int sgl_threshold = SZ_32K;
+module_param(sgl_threshold, uint, 0644);
+MODULE_PARM_DESC(sgl_threshold,
+		"Use SGLs when average request segment size is larger or equal to "
+		"this size. Use 0 to disable SGLs.");
+
+static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
+static const struct kernel_param_ops io_queue_depth_ops = {
+	.set = io_queue_depth_set,
+	.get = param_get_int,
+};
+
+static int io_queue_depth = 1024;
+module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
+MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
+
+struct nvme_dev;
+struct nvme_queue;
+
+static void nvme_process_cq(struct nvme_queue *nvmeq);
+static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
+
+/*
+ * Represents an NVM Express device.  Each nvme_dev is a PCI function.
+ */
+struct nvme_dev {
+	struct nvme_queue *queues;
+	struct blk_mq_tag_set tagset;
+	struct blk_mq_tag_set admin_tagset;
+	u32 __iomem *dbs;
+	struct device *dev;
+	struct dma_pool *prp_page_pool;
+	struct dma_pool *prp_small_pool;
+	unsigned online_queues;
+	unsigned max_qid;
+	unsigned int num_vecs;
+	int q_depth;
+	u32 db_stride;
+	void __iomem *bar;
+	unsigned long bar_mapped_size;
+	struct work_struct remove_work;
+	struct mutex shutdown_lock;
+	bool subsystem;
+	void __iomem *cmb;
+	pci_bus_addr_t cmb_bus_addr;
+	u64 cmb_size;
+	u32 cmbsz;
+	u32 cmbloc;
+	struct nvme_ctrl ctrl;
+	struct completion ioq_wait;
+
+	/* shadow doorbell buffer support: */
+	u32 *dbbuf_dbs;
+	dma_addr_t dbbuf_dbs_dma_addr;
+	u32 *dbbuf_eis;
+	dma_addr_t dbbuf_eis_dma_addr;
+
+	/* host memory buffer support: */
+	u64 host_mem_size;
+	u32 nr_host_mem_descs;
+	dma_addr_t host_mem_descs_dma;
+	struct nvme_host_mem_buf_desc *host_mem_descs;
+	void **host_mem_desc_bufs;
+};
+
+static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
+{
+	int n = 0, ret;
+
+	ret = kstrtoint(val, 10, &n);
+	if (ret != 0 || n < 2)
+		return -EINVAL;
+
+	return param_set_int(val, kp);
+}
+
+static inline unsigned int sq_idx(unsigned int qid, u32 stride)
+{
+	return qid * 2 * stride;
+}
+
+static inline unsigned int cq_idx(unsigned int qid, u32 stride)
+{
+	return (qid * 2 + 1) * stride;
+}
+
+static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
+{
+	return container_of(ctrl, struct nvme_dev, ctrl);
+}
+
+/*
+ * An NVM Express queue.  Each device has at least two (one for admin
+ * commands and one for I/O commands).
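+ *
+ * The SQ tail and CQ head doorbells for a queue share one layout, both in
+ * the BAR (dev->dbs) and in the shadow doorbell buffer (see sq_idx() and
+ * cq_idx() above): queue 'qid' uses index qid * 2 * db_stride for the SQ
+ * doorbell and the slot one stride after it for the CQ doorbell.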
+ */ +struct nvme_queue { + struct device *q_dmadev; + struct nvme_dev *dev; + spinlock_t q_lock; + struct nvme_command *sq_cmds; + struct nvme_command __iomem *sq_cmds_io; + volatile struct nvme_completion *cqes; + struct blk_mq_tags **tags; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + u32 __iomem *q_db; + u16 q_depth; + s16 cq_vector; + u16 sq_tail; + u16 cq_head; + u16 qid; + u8 cq_phase; + u8 cqe_seen; + u32 *dbbuf_sq_db; + u32 *dbbuf_cq_db; + u32 *dbbuf_sq_ei; + u32 *dbbuf_cq_ei; +}; + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_init_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + struct nvme_request req; + struct nvme_queue *nvmeq; + bool use_sgl; + int aborted; + int npages; /* In the PRP list. 0 means small pool in use */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist meta_sg; /* metadata requires single contiguous buffer */ + struct scatterlist *sg; + struct scatterlist inline_sg[0]; +}; + +/* + * Check we didin't inadvertently grow the command struct + */ +static inline void _nvme_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); + BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); + BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); + BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); +} + +static inline unsigned int nvme_dbbuf_size(u32 stride) +{ + return ((num_possible_cpus() + 1) * 8 * stride); +} + +static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) +{ + unsigned int mem_size = nvme_dbbuf_size(dev->db_stride); + + if (dev->dbbuf_dbs) + return 0; + + dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size, + &dev->dbbuf_dbs_dma_addr, + GFP_KERNEL); + if (!dev->dbbuf_dbs) + return -ENOMEM; + dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size, + &dev->dbbuf_eis_dma_addr, + GFP_KERNEL); + if (!dev->dbbuf_eis) { + dma_free_coherent(dev->dev, mem_size, + dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); + dev->dbbuf_dbs = NULL; + return -ENOMEM; + } + + return 0; +} + +static void nvme_dbbuf_dma_free(struct nvme_dev *dev) +{ + unsigned int mem_size = nvme_dbbuf_size(dev->db_stride); + + if (dev->dbbuf_dbs) { + dma_free_coherent(dev->dev, mem_size, + dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); + dev->dbbuf_dbs = NULL; + } + if (dev->dbbuf_eis) { + dma_free_coherent(dev->dev, mem_size, + dev->dbbuf_eis, dev->dbbuf_eis_dma_addr); + dev->dbbuf_eis = NULL; + } +} + +static void nvme_dbbuf_init(struct nvme_dev *dev, + struct nvme_queue *nvmeq, int qid) +{ + if (!dev->dbbuf_dbs || !qid) + return; + + nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)]; + nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)]; + nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)]; + nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)]; +} + +static void 
nvme_dbbuf_set(struct nvme_dev *dev) +{ + struct nvme_command c; + + if (!dev->dbbuf_dbs) + return; + + memset(&c, 0, sizeof(c)); + c.dbbuf.opcode = nvme_admin_dbbuf; + c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr); + c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr); + + if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) { + dev_warn(dev->ctrl.device, "unable to set dbbuf\n"); + /* Free memory and continue on */ + nvme_dbbuf_dma_free(dev); + } +} + +static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old) +{ + return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old); +} + +/* Update dbbuf and return true if an MMIO is required */ +static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, + volatile u32 *dbbuf_ei) +{ + if (dbbuf_db) { + u16 old_value; + + /* + * Ensure that the queue is written before updating + * the doorbell in memory + */ + wmb(); + + old_value = *dbbuf_db; + *dbbuf_db = value; + + if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value)) + return false; + } + + return true; +} + +/* + * Max size of iod being embedded in the request payload + */ +#define NVME_INT_PAGES 2 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size) + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size, struct nvme_dev *dev) +{ + unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, + dev->ctrl.page_size); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +/* + * Calculates the number of pages needed for the SGL segments. For example a 4k + * page can accommodate 256 SGL descriptors. + */ +static int nvme_pci_npages_sgl(unsigned int num_seg) +{ + return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); +} + +static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, + unsigned int size, unsigned int nseg, bool use_sgl) +{ + size_t alloc_size; + + if (use_sgl) + alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); + else + alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); + + return alloc_size + sizeof(struct scatterlist) * nseg; +} + +static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl) +{ + unsigned int alloc_size = nvme_pci_iod_alloc_size(dev, + NVME_INT_BYTES(dev), NVME_INT_PAGES, + use_sgl); + + return sizeof(struct nvme_iod) + alloc_size; +} + +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = &dev->queues[0]; + + WARN_ON(hctx_idx != 0); + WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); + WARN_ON(nvmeq->tags); + + hctx->driver_data = nvmeq; + nvmeq->tags = &dev->admin_tagset.tags[0]; + return 0; +} + +static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + nvmeq->tags = NULL; +} + +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; + + if (!nvmeq->tags) + nvmeq->tags = &dev->tagset.tags[hctx_idx]; + + WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); + hctx->driver_data = nvmeq; + return 0; +} + +static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct nvme_dev *dev = set->driver_data; + struct nvme_iod *iod = 
blk_mq_rq_to_pdu(req); + int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; + struct nvme_queue *nvmeq = &dev->queues[queue_idx]; + + BUG_ON(!nvmeq); + iod->nvmeq = nvmeq; + return 0; +} + +static int nvme_pci_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_dev *dev = set->driver_data; + + return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), + dev->num_vecs > 1 ? 1 /* admin queue */ : 0); +} + +/** + * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * @nvmeq: The queue to use + * @cmd: The command to send + * + * Safe to use from interrupt context + */ +static void __nvme_submit_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd) +{ + u16 tail = nvmeq->sq_tail; + + if (nvmeq->sq_cmds_io) + memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd)); + else + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + + if (++tail == nvmeq->q_depth) + tail = 0; + if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db, + nvmeq->dbbuf_sq_ei)) + writel(tail, nvmeq->q_db); + nvmeq->sq_tail = tail; +} + +static void **nvme_pci_iod_list(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); +} + +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + int nseg = blk_rq_nr_phys_segments(req); + unsigned int avg_seg_size; + + if (nseg == 0) + return false; + + avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); + + if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) + return false; + if (!iod->nvmeq->qid) + return false; + if (!sgl_threshold || avg_seg_size < sgl_threshold) + return false; + return true; +} + +static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); + int nseg = blk_rq_nr_phys_segments(rq); + unsigned int size = blk_rq_payload_bytes(rq); + + iod->use_sgl = nvme_pci_use_sgls(dev, rq); + + if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { + size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg, + iod->use_sgl); + + iod->sg = kmalloc(alloc_size, GFP_ATOMIC); + if (!iod->sg) + return BLK_STS_RESOURCE; + } else { + iod->sg = iod->inline_sg; + } + + iod->aborted = 0; + iod->npages = -1; + iod->nents = 0; + iod->length = size; + + return BLK_STS_OK; +} + +static void nvme_free_iod(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; + dma_addr_t dma_addr = iod->first_dma, next_dma_addr; + + int i; + + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], + dma_addr); + + for (i = 0; i < iod->npages; i++) { + void *addr = nvme_pci_iod_list(req)[i]; + + if (iod->use_sgl) { + struct nvme_sgl_desc *sg_list = addr; + + next_dma_addr = + le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr); + } else { + __le64 *prp_list = addr; + + next_dma_addr = le64_to_cpu(prp_list[last_prp]); + } + + dma_pool_free(dev->prp_page_pool, addr, dma_addr); + dma_addr = next_dma_addr; + } + + if (iod->sg != iod->inline_sg) + kfree(iod->sg); +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == v) + pi->ref_tag = cpu_to_be32(p); +} + +static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == p) + pi->ref_tag = cpu_to_be32(v); +} + +/** + * nvme_dif_remap - remaps ref 
tags to bip seed and physical lba + * + * The virtual start sector is the one that was originally submitted by the + * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical + * start sector may be different. Remap protection information to match the + * physical LBA on writes, and back to the original seed on reads. + * + * Type 0 and 3 do not have a ref tag, so no remapping required. + */ +static void nvme_dif_remap(struct request *req, + void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) +{ + struct nvme_ns *ns = req->rq_disk->private_data; + struct bio_integrity_payload *bip; + struct t10_pi_tuple *pi; + void *p, *pmap; + u32 i, nlb, ts, phys, virt; + + if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) + return; + + bip = bio_integrity(req->bio); + if (!bip) + return; + + pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; + + p = pmap; + virt = bip_get_seed(bip); + phys = nvme_block_nr(ns, blk_rq_pos(req)); + nlb = (blk_rq_bytes(req) >> ns->lba_shift); + ts = ns->disk->queue->integrity.tuple_size; + + for (i = 0; i < nlb; i++, virt++, phys++) { + pi = (struct t10_pi_tuple *)p; + dif_swap(phys, virt, pi); + p += ts; + } + kunmap_atomic(pmap); +} +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static void nvme_dif_remap(struct request *req, + void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) +{ +} +static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) +{ +} +static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) +{ +} +#endif + +static void nvme_print_sgl(struct scatterlist *sgl, int nents) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + dma_addr_t phys = sg_phys(sg); + pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " + "dma_address:%pad dma_length:%d\n", + i, &phys, sg->offset, sg->length, &sg_dma_address(sg), + sg_dma_len(sg)); + } +} + +static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct dma_pool *pool; + int length = blk_rq_payload_bytes(req); + struct scatterlist *sg = iod->sg; + int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg); + u32 page_size = dev->ctrl.page_size; + int offset = dma_addr & (page_size - 1); + __le64 *prp_list; + void **list = nvme_pci_iod_list(req); + dma_addr_t prp_dma; + int nprps, i; + + length -= (page_size - offset); + if (length <= 0) { + iod->first_dma = 0; + goto done; + } + + dma_len -= (page_size - offset); + if (dma_len) { + dma_addr += (page_size - offset); + } else { + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + if (length <= page_size) { + iod->first_dma = dma_addr; + goto done; + } + + nprps = DIV_ROUND_UP(length, page_size); + if (nprps <= (256 / 8)) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + if (!prp_list) { + iod->first_dma = dma_addr; + iod->npages = -1; + return BLK_STS_RESOURCE; + } + list[0] = prp_list; + iod->first_dma = prp_dma; + i = 0; + for (;;) { + if (i == page_size >> 3) { + __le64 *old_prp_list = prp_list; + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + if (!prp_list) + return BLK_STS_RESOURCE; + list[iod->npages++] = prp_list; + prp_list[0] = old_prp_list[i - 1]; + old_prp_list[i - 1] = cpu_to_le64(prp_dma); + i = 1; + } + prp_list[i++] = cpu_to_le64(dma_addr); + dma_len -= page_size; + dma_addr += page_size; 
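+		/*
+		 * One PRP entry filled: account for the consumed page below
+		 * and, once the current scatterlist element is exhausted,
+		 * advance to the next one.
+		 */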
+ length -= page_size; + if (length <= 0) + break; + if (dma_len > 0) + continue; + if (unlikely(dma_len < 0)) + goto bad_sgl; + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + +done: + cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); + + return BLK_STS_OK; + + bad_sgl: + WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents), + "Invalid SGL for payload:%d nents:%d\n", + blk_rq_payload_bytes(req), iod->nents); + return BLK_STS_IOERR; +} + +static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, + struct scatterlist *sg) +{ + sge->addr = cpu_to_le64(sg_dma_address(sg)); + sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->type = NVME_SGL_FMT_DATA_DESC << 4; +} + +static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, + dma_addr_t dma_addr, int entries) +{ + sge->addr = cpu_to_le64(dma_addr); + if (entries < SGES_PER_PAGE) { + sge->length = cpu_to_le32(entries * sizeof(*sge)); + sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; + } else { + sge->length = cpu_to_le32(PAGE_SIZE); + sge->type = NVME_SGL_FMT_SEG_DESC << 4; + } +} + +static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmd, int entries) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct dma_pool *pool; + struct nvme_sgl_desc *sg_list; + struct scatterlist *sg = iod->sg; + dma_addr_t sgl_dma; + int i = 0; + + /* setting the transfer type as SGL */ + cmd->flags = NVME_CMD_SGL_METABUF; + + if (entries == 1) { + nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); + return BLK_STS_OK; + } + + if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) { + iod->npages = -1; + return BLK_STS_RESOURCE; + } + + nvme_pci_iod_list(req)[0] = sg_list; + iod->first_dma = sgl_dma; + + nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); + + do { + if (i == SGES_PER_PAGE) { + struct nvme_sgl_desc *old_sg_desc = sg_list; + struct nvme_sgl_desc *link = &old_sg_desc[i - 1]; + + sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) + return BLK_STS_RESOURCE; + + i = 0; + nvme_pci_iod_list(req)[iod->npages++] = sg_list; + sg_list[i++] = *link; + nvme_pci_sgl_set_seg(link, sgl_dma, entries); + } + + nvme_pci_sgl_set_data(&sg_list[i++], sg); + sg = sg_next(sg); + } while (--entries > 0); + + return BLK_STS_OK; +} + +static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, + struct nvme_command *cmnd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct request_queue *q = req->q; + enum dma_data_direction dma_dir = rq_data_dir(req) ? 
+ DMA_TO_DEVICE : DMA_FROM_DEVICE; + blk_status_t ret = BLK_STS_IOERR; + int nr_mapped; + + sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); + iod->nents = blk_rq_map_sg(q, req, iod->sg); + if (!iod->nents) + goto out; + + ret = BLK_STS_RESOURCE; + nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, + DMA_ATTR_NO_WARN); + if (!nr_mapped) + goto out; + + if (iod->use_sgl) + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); + else + ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + + if (ret != BLK_STS_OK) + goto out_unmap; + + ret = BLK_STS_IOERR; + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(q, req->bio) != 1) + goto out_unmap; + + sg_init_table(&iod->meta_sg, 1); + if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) + goto out_unmap; + + if (req_op(req) == REQ_OP_WRITE) + nvme_dif_remap(req, nvme_dif_prep); + + if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) + goto out_unmap; + } + + if (blk_integrity_rq(req)) + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); + return BLK_STS_OK; + +out_unmap: + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); +out: + return ret; +} + +static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + enum dma_data_direction dma_dir = rq_data_dir(req) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + + if (iod->nents) { + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); + if (blk_integrity_rq(req)) { + if (req_op(req) == REQ_OP_READ) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); + } + } + + nvme_cleanup_cmd(req); + nvme_free_iod(dev, req); +} + +/* + * NOTE: ns is NULL when called on the admin queue. + */ +static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_queue *nvmeq = hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + struct request *req = bd->rq; + struct nvme_command cmnd; + blk_status_t ret; + + ret = nvme_setup_cmd(ns, req, &cmnd); + if (ret) + return ret; + + ret = nvme_init_iod(req, dev); + if (ret) + goto out_free_cmd; + + if (blk_rq_nr_phys_segments(req)) { + ret = nvme_map_data(dev, req, &cmnd); + if (ret) + goto out_cleanup_iod; + } + + blk_mq_start_request(req); + + spin_lock_irq(&nvmeq->q_lock); + if (unlikely(nvmeq->cq_vector < 0)) { + ret = BLK_STS_IOERR; + spin_unlock_irq(&nvmeq->q_lock); + goto out_cleanup_iod; + } + __nvme_submit_cmd(nvmeq, &cmnd); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_STS_OK; +out_cleanup_iod: + nvme_free_iod(dev, req); +out_free_cmd: + nvme_cleanup_cmd(req); + return ret; +} + +static void nvme_pci_complete_rq(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + nvme_unmap_data(iod->nvmeq->dev, req); + nvme_complete_rq(req); +} + +/* We read the CQE phase first to check if the rest of the entry is valid */ +static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head, + u16 phase) +{ + return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase; +} + +static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) +{ + u16 head = nvmeq->cq_head; + + if (likely(nvmeq->cq_vector >= 0)) { + if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, + nvmeq->dbbuf_cq_ei)) + writel(head, nvmeq->q_db + nvmeq->dev->db_stride); + } +} + +static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, + struct nvme_completion *cqe) +{ + struct request 
*req; + + if (unlikely(cqe->command_id >= nvmeq->q_depth)) { + dev_warn(nvmeq->dev->ctrl.device, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpu(cqe->sq_id)); + return; + } + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvmeq->qid == 0 && + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { + nvme_complete_async_event(&nvmeq->dev->ctrl, + cqe->status, &cqe->result); + return; + } + + nvmeq->cqe_seen = 1; + req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); + nvme_end_request(req, cqe->status, cqe->result); +} + +static inline bool nvme_read_cqe(struct nvme_queue *nvmeq, + struct nvme_completion *cqe) +{ + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { + *cqe = nvmeq->cqes[nvmeq->cq_head]; + + if (++nvmeq->cq_head == nvmeq->q_depth) { + nvmeq->cq_head = 0; + nvmeq->cq_phase = !nvmeq->cq_phase; + } + return true; + } + return false; +} + +static void nvme_process_cq(struct nvme_queue *nvmeq) +{ + struct nvme_completion cqe; + int consumed = 0; + + while (nvme_read_cqe(nvmeq, &cqe)) { + nvme_handle_cqe(nvmeq, &cqe); + consumed++; + } + + if (consumed) + nvme_ring_cq_doorbell(nvmeq); +} + +static irqreturn_t nvme_irq(int irq, void *data) +{ + irqreturn_t result; + struct nvme_queue *nvmeq = data; + spin_lock(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; + nvmeq->cqe_seen = 0; + spin_unlock(&nvmeq->q_lock); + return result; +} + +static irqreturn_t nvme_irq_check(int irq, void *data) +{ + struct nvme_queue *nvmeq = data; + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + return IRQ_WAKE_THREAD; + return IRQ_NONE; +} + +static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) +{ + struct nvme_completion cqe; + int found = 0, consumed = 0; + + if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + return 0; + + spin_lock_irq(&nvmeq->q_lock); + while (nvme_read_cqe(nvmeq, &cqe)) { + nvme_handle_cqe(nvmeq, &cqe); + consumed++; + + if (tag == cqe.command_id) { + found = 1; + break; + } + } + + if (consumed) + nvme_ring_cq_doorbell(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + + return found; +} + +static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + return __nvme_poll(nvmeq, tag); +} + +static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq = &dev->queues[0]; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_async_event; + c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; + + spin_lock_irq(&nvmeq->q_lock); + __nvme_submit_cmd(nvmeq, &c); + spin_unlock_irq(&nvmeq->q_lock); +} + +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(id); + + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); +} + +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + + /* + * Note: we (ab)use the fact that the prp fields survive if no data + * is attached to the request. 
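+ *
+ * The completion queue also has to exist before the submission queue that
+ * is bound to it (c.create_sq.cqid below), which is why nvme_create_queue()
+ * issues this command before adapter_alloc_sq().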
+ */ + memset(&c, 0, sizeof(c)); + c.create_cq.opcode = nvme_admin_create_cq; + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); + c.create_cq.cqid = cpu_to_le16(qid); + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_cq.cq_flags = cpu_to_le16(flags); + c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); + + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); +} + +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + struct nvme_ctrl *ctrl = &dev->ctrl; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG; + + /* + * Some drives have a bug that auto-enables WRRU if MEDIUM isn't + * set. Since URGENT priority is zeroes, it makes all queues + * URGENT. + */ + if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ) + flags |= NVME_SQ_PRIO_MEDIUM; + + /* + * Note: we (ab)use the fact that the prp fields survive if no data + * is attached to the request. + */ + memset(&c, 0, sizeof(c)); + c.create_sq.opcode = nvme_admin_create_sq; + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); + c.create_sq.sqid = cpu_to_le16(qid); + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_sq.sq_flags = cpu_to_le16(flags); + c.create_sq.cqid = cpu_to_le16(qid); + + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); +} + +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); +} + +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); +} + +static void abort_endio(struct request *req, blk_status_t error) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; + + dev_warn(nvmeq->dev->ctrl.device, + "Abort status: 0x%x", nvme_req(req)->status); + atomic_inc(&nvmeq->dev->ctrl.abort_limit); + blk_mq_free_request(req); +} + +static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) +{ + + /* If true, indicates loss of adapter communication, possibly by a + * NVMe Subsystem reset. + */ + bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); + + /* If there is a reset/reinit ongoing, we shouldn't reset again. */ + switch (dev->ctrl.state) { + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + return false; + default: + break; + } + + /* We shouldn't reset unless the controller is on fatal error state + * _or_ if we lost the communication with it. + */ + if (!(csts & NVME_CSTS_CFS) && !nssro) + return false; + + return true; +} + +static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) +{ + /* Read a config register to help see what died. */ + u16 pci_status; + int result; + + result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, + &pci_status); + if (result == PCIBIOS_SUCCESSFUL) + dev_warn(dev->ctrl.device, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", + csts, pci_status); + else + dev_warn(dev->ctrl.device, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", + csts, result); +} + +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; + struct nvme_dev *dev = nvmeq->dev; + struct request *abort_req; + struct nvme_command cmd; + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + /* If PCI error recovery process is happening, we cannot reset or + * the recovery mechanism will surely fail. 
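+ * Returning BLK_EH_RESET_TIMER below only re-arms the request timer, so
+ * the timeout is retried after the recovery handlers have had a chance
+ * to run.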
+ */ + mb(); + if (pci_channel_offline(to_pci_dev(dev->dev))) + return BLK_EH_RESET_TIMER; + + /* + * Reset immediately if the controller is failed + */ + if (nvme_should_reset(dev, csts)) { + nvme_warn_reset(dev, csts); + nvme_dev_disable(dev, false); + nvme_reset_ctrl(&dev->ctrl); + return BLK_EH_HANDLED; + } + + /* + * Did we miss an interrupt? + */ + if (__nvme_poll(nvmeq, req->tag)) { + dev_warn(dev->ctrl.device, + "I/O %d QID %d timeout, completion polled\n", + req->tag, nvmeq->qid); + return BLK_EH_HANDLED; + } + + /* + * Shutdown immediately if controller times out while starting. The + * reset work will see the pci device disabled when it gets the forced + * cancellation error. All outstanding requests are completed on + * shutdown, so we return BLK_EH_HANDLED. + */ + switch (dev->ctrl.state) { + case NVME_CTRL_CONNECTING: + case NVME_CTRL_RESETTING: + dev_warn(dev->ctrl.device, + "I/O %d QID %d timeout, disable controller\n", + req->tag, nvmeq->qid); + nvme_dev_disable(dev, false); + nvme_req(req)->flags |= NVME_REQ_CANCELLED; + return BLK_EH_HANDLED; + default: + break; + } + + /* + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. + */ + if (!nvmeq->qid || iod->aborted) { + dev_warn(dev->ctrl.device, + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); + nvme_dev_disable(dev, false); + nvme_reset_ctrl(&dev->ctrl); + + /* + * Mark the request as handled, since the inline shutdown + * forces all outstanding requests to complete. + */ + nvme_req(req)->flags |= NVME_REQ_CANCELLED; + return BLK_EH_HANDLED; + } + + if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + iod->aborted = 1; + + memset(&cmd, 0, sizeof(cmd)); + cmd.abort.opcode = nvme_admin_abort_cmd; + cmd.abort.cid = req->tag; + cmd.abort.sqid = cpu_to_le16(nvmeq->qid); + + dev_warn(nvmeq->dev->ctrl.device, + "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); + + abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, + BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + if (IS_ERR(abort_req)) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + + abort_req->timeout = ADMIN_TIMEOUT; + abort_req->end_io_data = NULL; + blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. 
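+ *
+ * In short, the escalation above is: the first timeout on a request sends
+ * an Abort and re-arms the timer; a second timeout on the same request
+ * (iod->aborted is already set) disables the device and schedules a
+ * controller reset instead.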
+ */ + return BLK_EH_RESET_TIMER; +} + +static void nvme_free_queue(struct nvme_queue *nvmeq) +{ + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + if (nvmeq->sq_cmds) + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); +} + +static void nvme_free_queues(struct nvme_dev *dev, int lowest) +{ + int i; + + for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { + dev->ctrl.queue_count--; + nvme_free_queue(&dev->queues[i]); + } +} + +/** + * nvme_suspend_queue - put queue into suspended state + * @nvmeq - queue to suspend + */ +static int nvme_suspend_queue(struct nvme_queue *nvmeq) +{ + int vector; + + spin_lock_irq(&nvmeq->q_lock); + if (nvmeq->cq_vector == -1) { + spin_unlock_irq(&nvmeq->q_lock); + return 1; + } + vector = nvmeq->cq_vector; + nvmeq->dev->online_queues--; + nvmeq->cq_vector = -1; + spin_unlock_irq(&nvmeq->q_lock); + + if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) + blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); + + pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); + + return 0; +} + +static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) +{ + struct nvme_queue *nvmeq = &dev->queues[0]; + + if (shutdown) + nvme_shutdown_ctrl(&dev->ctrl); + else + nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); + + spin_lock_irq(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); +} + +static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, + int entry_size) +{ + int q_depth = dev->q_depth; + unsigned q_size_aligned = roundup(q_depth * entry_size, + dev->ctrl.page_size); + + if (q_size_aligned * nr_io_queues > dev->cmb_size) { + u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); + mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); + q_depth = div_u64(mem_per_q, entry_size); + + /* + * Ensure the reduced q_depth is above some threshold where it + * would be better to map queues in system memory with the + * original depth + */ + if (q_depth < 64) + return -ENOMEM; + } + + return q_depth; +} + +static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, + int qid, int depth) +{ + /* CMB SQEs will be mapped before creation */ + if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) + return 0; + + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + return -ENOMEM; + return 0; +} + +static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) +{ + struct nvme_queue *nvmeq = &dev->queues[qid]; + + if (dev->ctrl.queue_count > qid) + return 0; + + nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); + if (!nvmeq->cqes) + goto free_nvmeq; + + if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) + goto free_cqdma; + + nvmeq->q_dmadev = dev->dev; + nvmeq->dev = dev; + spin_lock_init(&nvmeq->q_lock); + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; + nvmeq->q_depth = depth; + nvmeq->qid = qid; + nvmeq->cq_vector = -1; + dev->ctrl.queue_count++; + + return 0; + + free_cqdma: + dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); + free_nvmeq: + return -ENOMEM; +} + +static int queue_request_irq(struct nvme_queue *nvmeq) +{ + struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); + int nr = nvmeq->dev->ctrl.instance; + + if (use_threaded_interrupts) { + return pci_request_irq(pdev, nvmeq->cq_vector, 
nvme_irq_check, + nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid); + } else { + return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq, + NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid); + } +} + +static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) +{ + struct nvme_dev *dev = nvmeq->dev; + + spin_lock_irq(&nvmeq->q_lock); + nvmeq->sq_tail = 0; + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); + nvme_dbbuf_init(dev, nvmeq, qid); + dev->online_queues++; + spin_unlock_irq(&nvmeq->q_lock); +} + +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) +{ + struct nvme_dev *dev = nvmeq->dev; + int result; + + if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { + unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth), + dev->ctrl.page_size); + nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; + nvmeq->sq_cmds_io = dev->cmb + offset; + } + + /* + * A queue's vector matches the queue identifier unless the controller + * has only one vector available. + */ + nvmeq->cq_vector = dev->num_vecs == 1 ? 0 : qid; + result = adapter_alloc_cq(dev, qid, nvmeq); + if (result < 0) + goto release_vector; + + result = adapter_alloc_sq(dev, qid, nvmeq); + if (result < 0) + goto release_cq; + + nvme_init_queue(nvmeq, qid); + result = queue_request_irq(nvmeq); + if (result < 0) + goto release_sq; + + return result; + + release_sq: + dev->online_queues--; + adapter_delete_sq(dev, qid); + release_cq: + adapter_delete_cq(dev, qid); + release_vector: + nvmeq->cq_vector = -1; + return result; +} + +static const struct blk_mq_ops nvme_mq_admin_ops = { + .queue_rq = nvme_queue_rq, + .complete = nvme_pci_complete_rq, + .init_hctx = nvme_admin_init_hctx, + .exit_hctx = nvme_admin_exit_hctx, + .init_request = nvme_init_request, + .timeout = nvme_timeout, +}; + +static const struct blk_mq_ops nvme_mq_ops = { + .queue_rq = nvme_queue_rq, + .complete = nvme_pci_complete_rq, + .init_hctx = nvme_init_hctx, + .init_request = nvme_init_request, + .map_queues = nvme_pci_map_queues, + .timeout = nvme_timeout, + .poll = nvme_poll, +}; + +static void nvme_dev_remove_admin(struct nvme_dev *dev) +{ + if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + /* + * If the controller was reset during removal, it's possible + * user requests may be waiting on a stopped queue. Start the + * queue to flush these to completion. 
+ */ + blk_mq_unquiesce_queue(dev->ctrl.admin_q); + blk_cleanup_queue(dev->ctrl.admin_q); + blk_mq_free_tag_set(&dev->admin_tagset); + } +} + +static int nvme_alloc_admin_tags(struct nvme_dev *dev) +{ + if (!dev->ctrl.admin_q) { + dev->admin_tagset.ops = &nvme_mq_admin_ops; + dev->admin_tagset.nr_hw_queues = 1; + + dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; + dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.numa_node = dev_to_node(dev->dev); + dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); + dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; + dev->admin_tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->admin_tagset)) + return -ENOMEM; + dev->ctrl.admin_tagset = &dev->admin_tagset; + + dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (IS_ERR(dev->ctrl.admin_q)) { + blk_mq_free_tag_set(&dev->admin_tagset); + return -ENOMEM; + } + if (!blk_get_queue(dev->ctrl.admin_q)) { + nvme_dev_remove_admin(dev); + dev->ctrl.admin_q = NULL; + return -ENODEV; + } + } else + blk_mq_unquiesce_queue(dev->ctrl.admin_q); + + return 0; +} + +static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +{ + return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); +} + +static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (size <= dev->bar_mapped_size) + return 0; + if (size > pci_resource_len(pdev, 0)) + return -ENOMEM; + if (dev->bar) + iounmap(dev->bar); + dev->bar = ioremap(pci_resource_start(pdev, 0), size); + if (!dev->bar) { + dev->bar_mapped_size = 0; + return -ENOMEM; + } + dev->bar_mapped_size = size; + dev->dbs = dev->bar + NVME_REG_DBS; + + return 0; +} + +static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) +{ + int result; + u32 aqa; + struct nvme_queue *nvmeq; + + result = nvme_remap_bar(dev, db_bar_size(dev, 0)); + if (result < 0) + return result; + + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? + NVME_CAP_NSSRC(dev->ctrl.cap) : 0; + + if (dev->subsystem && + (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) + writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); + + result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); + if (result < 0) + return result; + + result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); + if (result) + return result; + + nvmeq = &dev->queues[0]; + aqa = nvmeq->q_depth - 1; + aqa |= aqa << 16; + + writel(aqa, dev->bar + NVME_REG_AQA); + lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); + lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); + + result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); + if (result) + return result; + + nvmeq->cq_vector = 0; + nvme_init_queue(nvmeq, 0); + result = queue_request_irq(nvmeq); + if (result) { + nvmeq->cq_vector = -1; + return result; + } + + return result; +} + +static int nvme_create_io_queues(struct nvme_dev *dev) +{ + unsigned i, max; + int ret = 0; + + for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { + if (nvme_alloc_queue(dev, i, dev->q_depth)) { + ret = -ENOMEM; + break; + } + } + + max = min(dev->max_qid, dev->ctrl.queue_count - 1); + for (i = dev->online_queues; i <= max; i++) { + ret = nvme_create_queue(&dev->queues[i], i); + if (ret) + break; + } + + /* + * Ignore failing Create SQ/CQ commands, we can continue with less + * than the desired amount of queues, and even a controller without + * I/O queues can still be used to issue admin commands. This might + * be useful to upgrade a buggy firmware for example. 
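+ * Hence the return below: only a negative errno (e.g. -ENOMEM from queue
+ * allocation) is propagated as a failure, while a positive NVMe status
+ * from the controller is reported as success and the device simply runs
+ * with the queues that were created.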
+ */ + return ret >= 0 ? 0 : ret; +} + +static ssize_t nvme_cmb_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", + ndev->cmbloc, ndev->cmbsz); +} +static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); + +static u64 nvme_cmb_size_unit(struct nvme_dev *dev) +{ + u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; + + return 1ULL << (12 + 4 * szu); +} + +static u32 nvme_cmb_size(struct nvme_dev *dev) +{ + return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; +} + +static void nvme_map_cmb(struct nvme_dev *dev) +{ + u64 size, offset; + resource_size_t bar_size; + struct pci_dev *pdev = to_pci_dev(dev->dev); + int bar; + + dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); + if (!dev->cmbsz) + return; + dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); + + if (!use_cmb_sqes) + return; + + size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); + offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); + bar = NVME_CMB_BIR(dev->cmbloc); + bar_size = pci_resource_len(pdev, bar); + + if (offset > bar_size) + return; + + /* + * Controllers may support a CMB size larger than their BAR, + * for example, due to being behind a bridge. Reduce the CMB to + * the reported size of the BAR + */ + if (size > bar_size - offset) + size = bar_size - offset; + + dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); + if (!dev->cmb) + return; + dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; + dev->cmb_size = size; + + if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, + &dev_attr_cmb.attr, NULL)) + dev_warn(dev->ctrl.device, + "failed to add sysfs attribute for CMB\n"); +} + +static inline void nvme_release_cmb(struct nvme_dev *dev) +{ + if (dev->cmb) { + iounmap(dev->cmb); + dev->cmb = NULL; + sysfs_remove_file_from_group(&dev->ctrl.device->kobj, + &dev_attr_cmb.attr, NULL); + dev->cmbsz = 0; + } +} + +static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) +{ + u64 dma_addr = dev->host_mem_descs_dma; + struct nvme_command c; + int ret; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); + c.features.dword11 = cpu_to_le32(bits); + c.features.dword12 = cpu_to_le32(dev->host_mem_size >> + ilog2(dev->ctrl.page_size)); + c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); + c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); + c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); + + ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); + if (ret) { + dev_warn(dev->ctrl.device, + "failed to set host mem (err %d, flags %#x).\n", + ret, bits); + } + return ret; +} + +static void nvme_free_host_mem(struct nvme_dev *dev) +{ + int i; + + for (i = 0; i < dev->nr_host_mem_descs; i++) { + struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; + size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; + + dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i], + le64_to_cpu(desc->addr)); + } + + kfree(dev->host_mem_desc_bufs); + dev->host_mem_desc_bufs = NULL; + dma_free_coherent(dev->dev, + dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs), + dev->host_mem_descs, dev->host_mem_descs_dma); + dev->host_mem_descs = NULL; + dev->nr_host_mem_descs = 0; +} + +static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, + u32 chunk_size) +{ + struct nvme_host_mem_buf_desc *descs; + 
u32 max_entries, len; + dma_addr_t descs_dma; + int i = 0; + void **bufs; + u64 size, tmp; + + tmp = (preferred + chunk_size - 1); + do_div(tmp, chunk_size); + max_entries = tmp; + + if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) + max_entries = dev->ctrl.hmmaxd; + + descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs), + &descs_dma, GFP_KERNEL); + if (!descs) + goto out; + + bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL); + if (!bufs) + goto out_free_descs; + + for (size = 0; size < preferred && i < max_entries; size += len) { + dma_addr_t dma_addr; + + len = min_t(u64, chunk_size, preferred - size); + bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL, + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); + if (!bufs[i]) + break; + + descs[i].addr = cpu_to_le64(dma_addr); + descs[i].size = cpu_to_le32(len / dev->ctrl.page_size); + i++; + } + + if (!size) + goto out_free_bufs; + + dev->nr_host_mem_descs = i; + dev->host_mem_size = size; + dev->host_mem_descs = descs; + dev->host_mem_descs_dma = descs_dma; + dev->host_mem_desc_bufs = bufs; + return 0; + +out_free_bufs: + while (--i >= 0) { + size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; + + dma_free_coherent(dev->dev, size, bufs[i], + le64_to_cpu(descs[i].addr)); + } + + kfree(bufs); +out_free_descs: + dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs, + descs_dma); +out: + dev->host_mem_descs = NULL; + return -ENOMEM; +} + +static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) +{ + u32 chunk_size; + + /* start big and work our way down */ + for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); + chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); + chunk_size /= 2) { + if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { + if (!min || dev->host_mem_size >= min) + return 0; + nvme_free_host_mem(dev); + } + } + + return -ENOMEM; +} + +static int nvme_setup_host_mem(struct nvme_dev *dev) +{ + u64 max = (u64)max_host_mem_size_mb * SZ_1M; + u64 preferred = (u64)dev->ctrl.hmpre * 4096; + u64 min = (u64)dev->ctrl.hmmin * 4096; + u32 enable_bits = NVME_HOST_MEM_ENABLE; + int ret; + + preferred = min(preferred, max); + if (min > max) { + dev_warn(dev->ctrl.device, + "min host memory (%lld MiB) above limit (%d MiB).\n", + min >> ilog2(SZ_1M), max_host_mem_size_mb); + nvme_free_host_mem(dev); + return 0; + } + + /* + * If we already have a buffer allocated check if we can reuse it. 
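+ * If the existing buffer still satisfies the controller's minimum we pass
+ * NVME_HOST_MEM_RETURN in the Set Features command below so the controller
+ * keeps the memory it already has; otherwise it is freed here and
+ * reallocated from scratch.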
+ */ + if (dev->host_mem_descs) { + if (dev->host_mem_size >= min) + enable_bits |= NVME_HOST_MEM_RETURN; + else + nvme_free_host_mem(dev); + } + + if (!dev->host_mem_descs) { + if (nvme_alloc_host_mem(dev, min, preferred)) { + dev_warn(dev->ctrl.device, + "failed to allocate host memory buffer.\n"); + return 0; /* controller must work without HMB */ + } + + dev_info(dev->ctrl.device, + "allocated %lld MiB host memory buffer.\n", + dev->host_mem_size >> ilog2(SZ_1M)); + } + + ret = nvme_set_host_mem(dev, enable_bits); + if (ret) + nvme_free_host_mem(dev); + return ret; +} + +static int nvme_setup_io_queues(struct nvme_dev *dev) +{ + struct nvme_queue *adminq = &dev->queues[0]; + struct pci_dev *pdev = to_pci_dev(dev->dev); + int result, nr_io_queues; + unsigned long size; + + struct irq_affinity affd = { + .pre_vectors = 1 + }; + + nr_io_queues = num_possible_cpus(); + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); + if (result < 0) + return result; + + if (nr_io_queues == 0) + return 0; + + if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { + result = nvme_cmb_qdepth(dev, nr_io_queues, + sizeof(struct nvme_command)); + if (result > 0) + dev->q_depth = result; + else + nvme_release_cmb(dev); + } + + do { + size = db_bar_size(dev, nr_io_queues); + result = nvme_remap_bar(dev, size); + if (!result) + break; + if (!--nr_io_queues) + return -ENOMEM; + } while (1); + adminq->q_db = dev->dbs; + + /* Deregister the admin queue's interrupt */ + pci_free_irq(pdev, 0, adminq); + + /* + * If we enable msix early due to not intx, disable it again before + * setting up the full range we need. + */ + pci_free_irq_vectors(pdev); + result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1, + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + if (result <= 0) + return -EIO; + dev->num_vecs = result; + dev->max_qid = max(result - 1, 1); + + /* + * Should investigate if there's a performance win from allocating + * more queues than interrupt vectors; it might allow the submission + * path to scale better, even if the receive path is limited by the + * number of interrupts. + */ + + result = queue_request_irq(adminq); + if (result) { + adminq->cq_vector = -1; + return result; + } + return nvme_create_io_queues(dev); +} + +static void nvme_del_queue_end(struct request *req, blk_status_t error) +{ + struct nvme_queue *nvmeq = req->end_io_data; + + blk_mq_free_request(req); + complete(&nvmeq->dev->ioq_wait); +} + +static void nvme_del_cq_end(struct request *req, blk_status_t error) +{ + struct nvme_queue *nvmeq = req->end_io_data; + + if (!error) { + unsigned long flags; + + /* + * We might be called with the AQ q_lock held + * and the I/O queue q_lock should always + * nest inside the AQ one. + */ + spin_lock_irqsave_nested(&nvmeq->q_lock, flags, + SINGLE_DEPTH_NESTING); + nvme_process_cq(nvmeq); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + } + + nvme_del_queue_end(req, error); +} + +static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) +{ + struct request_queue *q = nvmeq->dev->ctrl.admin_q; + struct request *req; + struct nvme_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.delete_queue.opcode = opcode; + cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); + + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = ADMIN_TIMEOUT; + req->end_io_data = nvmeq; + + blk_execute_rq_nowait(q, NULL, req, false, + opcode == nvme_admin_delete_cq ? 
+ nvme_del_cq_end : nvme_del_queue_end); + return 0; +} + +static void nvme_disable_io_queues(struct nvme_dev *dev) +{ + int pass, queues = dev->online_queues - 1; + unsigned long timeout; + u8 opcode = nvme_admin_delete_sq; + + for (pass = 0; pass < 2; pass++) { + int sent = 0, i = queues; + + reinit_completion(&dev->ioq_wait); + retry: + timeout = ADMIN_TIMEOUT; + for (; i > 0; i--, sent++) + if (nvme_delete_queue(&dev->queues[i], opcode)) + break; + + while (sent--) { + timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); + if (timeout == 0) + return; + if (i) + goto retry; + } + opcode = nvme_admin_delete_cq; + } +} + +/* + * return error value only when tagset allocation failed + */ +static int nvme_dev_add(struct nvme_dev *dev) +{ + int ret; + + if (!dev->ctrl.tagset) { + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev_to_node(dev->dev); + dev->tagset.queue_depth = + min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false); + if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) { + dev->tagset.cmd_size = max(dev->tagset.cmd_size, + nvme_pci_cmd_size(dev, true)); + } + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + + ret = blk_mq_alloc_tag_set(&dev->tagset); + if (ret) { + dev_warn(dev->ctrl.device, + "IO queues tagset allocation failed %d\n", ret); + return ret; + } + dev->ctrl.tagset = &dev->tagset; + + nvme_dbbuf_set(dev); + } else { + blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1); + + /* Free previously allocated queues that are no longer usable */ + nvme_free_queues(dev, dev->online_queues); + } + + return 0; +} + +static int nvme_pci_enable(struct nvme_dev *dev) +{ + int result = -ENOMEM; + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (pci_enable_device_mem(pdev)) + return result; + + pci_set_master(pdev); + + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && + dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) + goto disable; + + if (readl(dev->bar + NVME_REG_CSTS) == -1) { + result = -ENODEV; + goto disable; + } + + /* + * Some devices and/or platforms don't advertise or work with INTx + * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll + * adjust this later. + */ + result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (result < 0) + return result; + + dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + + dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, + io_queue_depth); + dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); + dev->dbs = dev->bar + 4096; + + /* + * Temporary fix for the Apple controller found in the MacBook8,1 and + * some MacBook7,1 to avoid controller resets and data loss. 
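+ * The fix simply clamps the queue depth to 2. The Samsung
+ * PM1725/PM1725a branch below does the same kind of thing, forcing a
+ * depth of 64 when the controller reports MQES == 0.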
+ */ + if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { + dev->q_depth = 2; + dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " + "set queue depth=%u to work around controller resets\n", + dev->q_depth); + } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && + (pdev->device == 0xa821 || pdev->device == 0xa822) && + NVME_CAP_MQES(dev->ctrl.cap) == 0) { + dev->q_depth = 64; + dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, " + "set queue depth=%u\n", dev->q_depth); + } + + nvme_map_cmb(dev); + + pci_enable_pcie_error_reporting(pdev); + pci_save_state(pdev); + return 0; + + disable: + pci_disable_device(pdev); + return result; +} + +static void nvme_dev_unmap(struct nvme_dev *dev) +{ + if (dev->bar) + iounmap(dev->bar); + pci_release_mem_regions(to_pci_dev(dev->dev)); +} + +static void nvme_pci_disable(struct nvme_dev *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + nvme_release_cmb(dev); + pci_free_irq_vectors(pdev); + + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); + } +} + +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) +{ + int i; + bool dead = true; + struct pci_dev *pdev = to_pci_dev(dev->dev); + + mutex_lock(&dev->shutdown_lock); + if (pci_is_enabled(pdev)) { + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + if (dev->ctrl.state == NVME_CTRL_LIVE || + dev->ctrl.state == NVME_CTRL_RESETTING) + nvme_start_freeze(&dev->ctrl); + dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || + pdev->error_state != pci_channel_io_normal); + } + + /* + * Give the controller a chance to complete all entered requests if + * doing a safe shutdown. + */ + if (!dead) { + if (shutdown) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + } + + nvme_stop_queues(&dev->ctrl); + + if (!dead && dev->ctrl.queue_count > 0) { + /* + * If the controller is still alive tell it to stop using the + * host memory buffer. In theory the shutdown / reset should + * make sure that it doesn't access the host memoery anymore, + * but I'd rather be safe than sorry.. + */ + if (dev->host_mem_descs) + nvme_set_host_mem(dev, 0); + nvme_disable_io_queues(dev); + nvme_disable_admin_queue(dev, shutdown); + } + for (i = dev->ctrl.queue_count - 1; i >= 0; i--) + nvme_suspend_queue(&dev->queues[i]); + + nvme_pci_disable(dev); + + blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); + blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); + + /* + * The driver will not be starting up queues again if shutting down so + * must flush all entered requests to their failed completion to avoid + * deadlocking blk-mq hot-cpu notifier. 
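+ * (When disabling for a reset rather than a shutdown, the queues are
+ * restarted later from nvme_reset_work() instead.)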
+ */ + if (shutdown) + nvme_start_queues(&dev->ctrl); + mutex_unlock(&dev->shutdown_lock); +} + +static int nvme_setup_prp_pools(struct nvme_dev *dev) +{ + dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, + PAGE_SIZE, PAGE_SIZE, 0); + if (!dev->prp_page_pool) + return -ENOMEM; + + /* Optimisation for I/Os between 4k and 128k */ + dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, + 256, 256, 0); + if (!dev->prp_small_pool) { + dma_pool_destroy(dev->prp_page_pool); + return -ENOMEM; + } + return 0; +} + +static void nvme_release_prp_pools(struct nvme_dev *dev) +{ + dma_pool_destroy(dev->prp_page_pool); + dma_pool_destroy(dev->prp_small_pool); +} + +static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + + nvme_dbbuf_dma_free(dev); + put_device(dev->dev); + if (dev->tagset.tags) + blk_mq_free_tag_set(&dev->tagset); + if (dev->ctrl.admin_q) + blk_put_queue(dev->ctrl.admin_q); + kfree(dev->queues); + free_opal_dev(dev->ctrl.opal_dev); + kfree(dev); +} + +static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) +{ + dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); + + nvme_get_ctrl(&dev->ctrl); + nvme_dev_disable(dev, false); + if (!queue_work(nvme_wq, &dev->remove_work)) + nvme_put_ctrl(&dev->ctrl); +} + +static void nvme_reset_work(struct work_struct *work) +{ + struct nvme_dev *dev = + container_of(work, struct nvme_dev, ctrl.reset_work); + bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); + int result = -ENODEV; + enum nvme_ctrl_state new_state = NVME_CTRL_LIVE; + + if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) + goto out; + + /* + * If we're called to reset a live controller first shut it down before + * moving on. + */ + if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) + nvme_dev_disable(dev, false); + + /* + * Introduce CONNECTING state from nvme-fc/rdma transports to mark the + * initializing procedure here. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { + dev_warn(dev->ctrl.device, + "failed to mark controller CONNECTING\n"); + goto out; + } + + result = nvme_pci_enable(dev); + if (result) + goto out; + + result = nvme_pci_configure_admin_queue(dev); + if (result) + goto out; + + result = nvme_alloc_admin_tags(dev); + if (result) + goto out; + + result = nvme_init_identify(&dev->ctrl); + if (result) + goto out; + + if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) { + if (!dev->ctrl.opal_dev) + dev->ctrl.opal_dev = + init_opal_dev(&dev->ctrl, &nvme_sec_submit); + else if (was_suspend) + opal_unlock_from_suspend(dev->ctrl.opal_dev); + } else { + free_opal_dev(dev->ctrl.opal_dev); + dev->ctrl.opal_dev = NULL; + } + + if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) { + result = nvme_dbbuf_dma_alloc(dev); + if (result) + dev_warn(dev->dev, + "unable to allocate dma for dbbuf\n"); + } + + if (dev->ctrl.hmpre) { + result = nvme_setup_host_mem(dev); + if (result < 0) + goto out; + } + + result = nvme_setup_io_queues(dev); + if (result) + goto out; + + /* + * Keep the controller around but remove all namespaces if we don't have + * any working I/O queue. 
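+ * In that case the controller ends up in NVME_CTRL_ADMIN_ONLY, which
+ * keeps the admin queue usable even though no I/O can be issued.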
+ */ + if (dev->online_queues < 2) { + dev_warn(dev->ctrl.device, "IO queues not created\n"); + nvme_kill_queues(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); + new_state = NVME_CTRL_ADMIN_ONLY; + } else { + nvme_start_queues(&dev->ctrl); + nvme_wait_freeze(&dev->ctrl); + /* hit this only when allocate tagset fails */ + if (nvme_dev_add(dev)) + new_state = NVME_CTRL_ADMIN_ONLY; + nvme_unfreeze(&dev->ctrl); + } + + /* + * If only admin queue live, keep it to do further investigation or + * recovery. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) { + dev_warn(dev->ctrl.device, + "failed to mark controller state %d\n", new_state); + goto out; + } + + nvme_start_ctrl(&dev->ctrl); + return; + + out: + nvme_remove_dead_ctrl(dev, result); +} + +static void nvme_remove_dead_ctrl_work(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); + struct pci_dev *pdev = to_pci_dev(dev->dev); + + nvme_kill_queues(&dev->ctrl); + if (pci_get_drvdata(pdev)) + device_release_driver(&pdev->dev); + nvme_put_ctrl(&dev->ctrl); +} + +static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) +{ + *val = readl(to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) +{ + writel(val, to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + *val = readq(to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size) +{ + struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev); + + return snprintf(buf, size, "%s", dev_name(&pdev->dev)); +} + +static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .name = "pcie", + .module = THIS_MODULE, + .flags = NVME_F_METADATA_SUPPORTED, + .reg_read32 = nvme_pci_reg_read32, + .reg_write32 = nvme_pci_reg_write32, + .reg_read64 = nvme_pci_reg_read64, + .free_ctrl = nvme_pci_free_ctrl, + .submit_async_event = nvme_pci_submit_async_event, + .get_address = nvme_pci_get_address, +}; + +static int nvme_dev_map(struct nvme_dev *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (pci_request_mem_regions(pdev, "nvme")) + return -ENODEV; + + if (nvme_remap_bar(dev, NVME_REG_DBS + 4096)) + goto release; + + return 0; + release: + pci_release_mem_regions(pdev); + return -ENODEV; +} + +static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) +{ + if (pdev->vendor == 0x144d && pdev->device == 0xa802) { + /* + * Several Samsung devices seem to drop off the PCIe bus + * randomly when APST is on and uses the deepest sleep state. + * This has been observed on a Samsung "SM951 NVMe SAMSUNG + * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD + * 950 PRO 256GB", but it seems to be restricted to two Dell + * laptops. 
+ */ + if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") && + (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") || + dmi_match(DMI_PRODUCT_NAME, "Precision 5510"))) + return NVME_QUIRK_NO_DEEPEST_PS; + } else if (pdev->vendor == 0x144d && pdev->device == 0xa804) { + /* + * Samsung SSD 960 EVO drops off the PCIe bus after system + * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as + * within few minutes after bootup on a Coffee Lake board - + * ASUS PRIME Z370-A + */ + if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") && + (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") || + dmi_match(DMI_BOARD_NAME, "PRIME Z370-A"))) + return NVME_QUIRK_NO_APST; + } + + return 0; +} + +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + int node, result = -ENOMEM; + struct nvme_dev *dev; + unsigned long quirks = id->driver_data; + + node = dev_to_node(&pdev->dev); + if (node == NUMA_NO_NODE) + set_dev_node(&pdev->dev, first_memory_node); + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); + if (!dev) + return -ENOMEM; + + dev->queues = kcalloc_node(num_possible_cpus() + 1, + sizeof(struct nvme_queue), GFP_KERNEL, node); + if (!dev->queues) + goto free; + + dev->dev = get_device(&pdev->dev); + pci_set_drvdata(pdev, dev); + + result = nvme_dev_map(dev); + if (result) + goto put_pci; + + INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); + INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); + mutex_init(&dev->shutdown_lock); + init_completion(&dev->ioq_wait); + + result = nvme_setup_prp_pools(dev); + if (result) + goto unmap; + + quirks |= check_vendor_combination_bug(pdev); + + result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + quirks); + if (result) + goto release_pools; + + dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + + nvme_reset_ctrl(&dev->ctrl); + + return 0; + + release_pools: + nvme_release_prp_pools(dev); + unmap: + nvme_dev_unmap(dev); + put_pci: + put_device(dev->dev); + free: + kfree(dev->queues); + kfree(dev); + return result; +} + +static void nvme_reset_prepare(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_dev_disable(dev, false); +} + +static void nvme_reset_done(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_reset_ctrl_sync(&dev->ctrl); +} + +static void nvme_shutdown(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_dev_disable(dev, true); +} + +/* + * The driver's remove may be called on a device in a partially initialized + * state. This function must not have any dependencies on the device state in + * order to proceed. 
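+ * This includes surprise removal: nvme_remove() checks
+ * pci_device_is_present() and marks the controller dead if the device
+ * has already gone away.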
+ */ +static void nvme_remove(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + + cancel_work_sync(&dev->ctrl.reset_work); + pci_set_drvdata(pdev, NULL); + + if (!pci_device_is_present(pdev)) { + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); + nvme_dev_disable(dev, false); + } + + flush_work(&dev->ctrl.reset_work); + nvme_stop_ctrl(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); + nvme_dev_disable(dev, true); + nvme_free_host_mem(dev); + nvme_dev_remove_admin(dev); + nvme_free_queues(dev, 0); + nvme_uninit_ctrl(&dev->ctrl); + nvme_release_prp_pools(dev); + nvme_dev_unmap(dev); + nvme_put_ctrl(&dev->ctrl); +} + +static int nvme_pci_sriov_configure(struct pci_dev *pdev, int numvfs) +{ + int ret = 0; + + if (numvfs == 0) { + if (pci_vfs_assigned(pdev)) { + dev_warn(&pdev->dev, + "Cannot disable SR-IOV VFs while assigned\n"); + return -EPERM; + } + pci_disable_sriov(pdev); + return 0; + } + + ret = pci_enable_sriov(pdev, numvfs); + return ret ? ret : numvfs; +} + +#ifdef CONFIG_PM_SLEEP +static int nvme_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + + nvme_dev_disable(ndev, true); + return 0; +} + +static int nvme_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + + nvme_reset_ctrl(&ndev->ctrl); + return 0; +} +#endif + +static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); + +static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + /* + * A frozen channel requires a reset. When detected, this method will + * shutdown the controller to quiesce. The controller will be restarted + * after the slot reset through driver's slot_reset callback. 
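+ * pci_channel_io_normal is treated as recoverable without a reset,
+ * while pci_channel_io_perm_failure asks the PCI core to disconnect
+ * the device instead.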
+ */ + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + dev_warn(dev->ctrl.device, + "frozen state error detected, reset controller\n"); + nvme_dev_disable(dev, false); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_warn(dev->ctrl.device, + "failure state error detected, request disconnect\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + dev_info(dev->ctrl.device, "restart after slot reset\n"); + pci_restore_state(pdev); + nvme_reset_ctrl(&dev->ctrl); + return PCI_ERS_RESULT_RECOVERED; +} + +static void nvme_error_resume(struct pci_dev *pdev) +{ + pci_cleanup_aer_uncorrect_error_status(pdev); +} + +static const struct pci_error_handlers nvme_err_handler = { + .error_detected = nvme_error_detected, + .slot_reset = nvme_slot_reset, + .resume = nvme_error_resume, + .reset_prepare = nvme_reset_prepare, + .reset_done = nvme_reset_done, +}; + +static const struct pci_device_id nvme_id_table[] = { + { PCI_VDEVICE(INTEL, 0x0953), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DEALLOCATE_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a53), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DEALLOCATE_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a54), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DEALLOCATE_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a55), + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DEALLOCATE_ZEROES, }, + { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_MEDIUM_PRIO_SQ }, + { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, + { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */ + .driver_data = NVME_QUIRK_LIGHTNVM, }, + { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ + .driver_data = NVME_QUIRK_LIGHTNVM, }, + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, nvme_id_table); + +static struct pci_driver nvme_driver = { + .name = "nvme", + .id_table = nvme_id_table, + .probe = nvme_probe, + .remove = nvme_remove, + .shutdown = nvme_shutdown, + .driver = { + .pm = &nvme_dev_pm_ops, + }, + .sriov_configure = nvme_pci_sriov_configure, + .err_handler = &nvme_err_handler, +}; + +static int __init nvme_init(void) +{ + return pci_register_driver(&nvme_driver); +} + +static void __exit nvme_exit(void) +{ + pci_unregister_driver(&nvme_driver); + flush_workqueue(nvme_wq); + _nvme_check_size(); +} + +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); +module_init(nvme_init); +module_exit(nvme_exit); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c new file mode 100644 index 0000000..1eb4438 --- /dev/null +++ 
b/drivers/nvme/host/rdma.c @@ -0,0 +1,2083 @@ +/* + * NVMe over Fabrics RDMA host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nvme.h" +#include "fabrics.h" + + +#define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 second */ + +#define NVME_RDMA_MAX_SEGMENTS 256 + +#define NVME_RDMA_MAX_INLINE_SEGMENTS 1 + +struct nvme_rdma_device { + struct ib_device *dev; + struct ib_pd *pd; + struct kref ref; + struct list_head entry; +}; + +struct nvme_rdma_qe { + struct ib_cqe cqe; + void *data; + u64 dma; +}; + +struct nvme_rdma_queue; +struct nvme_rdma_request { + struct nvme_request req; + struct ib_mr *mr; + struct nvme_rdma_qe sqe; + union nvme_result result; + __le16 status; + refcount_t ref; + struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; + u32 num_sge; + int nents; + struct ib_reg_wr reg_wr; + struct ib_cqe reg_cqe; + struct nvme_rdma_queue *queue; + struct sg_table sg_table; + struct scatterlist first_sgl[]; +}; + +enum nvme_rdma_queue_flags { + NVME_RDMA_Q_ALLOCATED = 0, + NVME_RDMA_Q_LIVE = 1, + NVME_RDMA_Q_TR_READY = 2, +}; + +struct nvme_rdma_queue { + struct nvme_rdma_qe *rsp_ring; + int queue_size; + size_t cmnd_capsule_len; + struct nvme_rdma_ctrl *ctrl; + struct nvme_rdma_device *device; + struct ib_cq *ib_cq; + struct ib_qp *qp; + + unsigned long flags; + struct rdma_cm_id *cm_id; + int cm_error; + struct completion cm_done; +}; + +struct nvme_rdma_ctrl { + /* read only in the hot path */ + struct nvme_rdma_queue *queues; + + /* other member variables */ + struct blk_mq_tag_set tag_set; + struct work_struct err_work; + + struct nvme_rdma_qe async_event_sqe; + + struct delayed_work reconnect_work; + + struct list_head list; + + struct blk_mq_tag_set admin_tag_set; + struct nvme_rdma_device *device; + + u32 max_fr_pages; + + struct sockaddr_storage addr; + struct sockaddr_storage src_addr; + + struct nvme_ctrl ctrl; +}; + +static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_rdma_ctrl, ctrl); +} + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static LIST_HEAD(nvme_rdma_ctrl_list); +static DEFINE_MUTEX(nvme_rdma_ctrl_mutex); + +/* + * Disabling this option makes small I/O goes faster, but is fundamentally + * unsafe. With it turned off we will have to register a global rkey that + * allows read and write access to all physical memory. 
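+ * (That rkey is the IB_PD_UNSAFE_GLOBAL_RKEY requested from
+ * ib_alloc_pd() in nvme_rdma_find_get_device() when register_always
+ * is turned off.)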
+ */ +static bool register_always = true; +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); + +static const struct blk_mq_ops nvme_rdma_mq_ops; +static const struct blk_mq_ops nvme_rdma_admin_mq_ops; + +/* XXX: really should move to a generic header sooner or later.. */ +static inline void put_unaligned_le24(u32 val, u8 *p) +{ + *p++ = val; + *p++ = val >> 8; + *p++ = val >> 16; +} + +static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) +{ + return queue->cmnd_capsule_len - sizeof(struct nvme_command); +} + +static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, + size_t capsule_size, enum dma_data_direction dir) +{ + ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir); + kfree(qe->data); +} + +static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, + size_t capsule_size, enum dma_data_direction dir) +{ + qe->data = kzalloc(capsule_size, GFP_KERNEL); + if (!qe->data) + return -ENOMEM; + + qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir); + if (ib_dma_mapping_error(ibdev, qe->dma)) { + kfree(qe->data); + return -ENOMEM; + } + + return 0; +} + +static void nvme_rdma_free_ring(struct ib_device *ibdev, + struct nvme_rdma_qe *ring, size_t ib_queue_size, + size_t capsule_size, enum dma_data_direction dir) +{ + int i; + + for (i = 0; i < ib_queue_size; i++) + nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir); + kfree(ring); +} + +static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, + size_t ib_queue_size, size_t capsule_size, + enum dma_data_direction dir) +{ + struct nvme_rdma_qe *ring; + int i; + + ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL); + if (!ring) + return NULL; + + for (i = 0; i < ib_queue_size; i++) { + if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir)) + goto out_free_ring; + } + + return ring; + +out_free_ring: + nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir); + return NULL; +} + +static void nvme_rdma_qp_event(struct ib_event *event, void *context) +{ + pr_debug("QP event %s (%d)\n", + ib_event_msg(event->event), event->event); + +} + +static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue) +{ + wait_for_completion_interruptible_timeout(&queue->cm_done, + msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1); + return queue->cm_error; +} + +static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) +{ + struct nvme_rdma_device *dev = queue->device; + struct ib_qp_init_attr init_attr; + int ret; + + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.event_handler = nvme_rdma_qp_event; + /* +1 for drain */ + init_attr.cap.max_send_wr = factor * queue->queue_size + 1; + /* +1 for drain */ + init_attr.cap.max_recv_wr = queue->queue_size + 1; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + init_attr.send_cq = queue->ib_cq; + init_attr.recv_cq = queue->ib_cq; + + ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); + + queue->qp = queue->cm_id->qp; + return ret; +} + +static void 
nvme_rdma_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = set->driver_data; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; + struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; + struct nvme_rdma_device *dev = queue->device; + + nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); +} + +static int nvme_rdma_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) +{ + struct nvme_rdma_ctrl *ctrl = set->driver_data; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; + struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int ret; + + ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + if (ret) + return ret; + + req->queue = queue; + + return 0; +} + +static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static void nvme_rdma_free_dev(struct kref *ref) +{ + struct nvme_rdma_device *ndev = + container_of(ref, struct nvme_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + ib_dealloc_pd(ndev->pd); + kfree(ndev); +} + +static void nvme_rdma_dev_put(struct nvme_rdma_device *dev) +{ + kref_put(&dev->ref, nvme_rdma_free_dev); +} + +static int nvme_rdma_dev_get(struct nvme_rdma_device *dev) +{ + return kref_get_unless_zero(&dev->ref); +} + +static struct nvme_rdma_device * +nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvme_rdma_device *ndev; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->dev->node_guid == cm_id->device->node_guid && + nvme_rdma_dev_get(ndev)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + ndev->dev = cm_id->device; + kref_init(&ndev->ref); + + ndev->pd = ib_alloc_pd(ndev->dev, + register_always ? 
0 : IB_PD_UNSAFE_GLOBAL_RKEY); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + if (!(ndev->dev->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS)) { + dev_err(&ndev->dev->dev, + "Memory registrations not supported.\n"); + goto out_free_pd; + } + + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + return ndev; + +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_device *dev; + struct ib_device *ibdev; + + if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags)) + return; + + dev = queue->device; + ibdev = dev->dev; + + ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs); + + /* + * The cm_id object might have been destroyed during RDMA connection + * establishment error flow to avoid getting other cma events, thus + * the destruction of the QP shouldn't use rdma_cm API. + */ + ib_destroy_qp(queue->qp); + ib_free_cq(queue->ib_cq); + + nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + + nvme_rdma_dev_put(dev); +} + +static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev) +{ + return min_t(u32, NVME_RDMA_MAX_SEGMENTS, + ibdev->attrs.max_fast_reg_page_list_len); +} + +static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) +{ + struct ib_device *ibdev; + const int send_wr_factor = 3; /* MR, SEND, INV */ + const int cq_factor = send_wr_factor + 1; /* + RECV */ + int comp_vector, idx = nvme_rdma_queue_idx(queue); + int ret; + + queue->device = nvme_rdma_find_get_device(queue->cm_id); + if (!queue->device) { + dev_err(queue->cm_id->device->dev.parent, + "no client data found!\n"); + return -ECONNREFUSED; + } + ibdev = queue->device->dev; + + /* + * Spread I/O queues completion vectors according their queue index. + * Admin queues can always go on completion vector 0. + */ + comp_vector = idx == 0 ? 
idx : idx - 1; + + /* +1 for ib_stop_cq */ + queue->ib_cq = ib_alloc_cq(ibdev, queue, + cq_factor * queue->queue_size + 1, + comp_vector, IB_POLL_SOFTIRQ); + if (IS_ERR(queue->ib_cq)) { + ret = PTR_ERR(queue->ib_cq); + goto out_put_dev; + } + + ret = nvme_rdma_create_qp(queue, send_wr_factor); + if (ret) + goto out_destroy_ib_cq; + + queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + if (!queue->rsp_ring) { + ret = -ENOMEM; + goto out_destroy_qp; + } + + ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, + queue->queue_size, + IB_MR_TYPE_MEM_REG, + nvme_rdma_get_max_fr_pages(ibdev)); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "failed to initialize MR pool sized %d for QID %d\n", + queue->queue_size, idx); + goto out_destroy_ring; + } + + set_bit(NVME_RDMA_Q_TR_READY, &queue->flags); + + return 0; + +out_destroy_ring: + nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); +out_destroy_qp: + rdma_destroy_qp(queue->cm_id); +out_destroy_ib_cq: + ib_free_cq(queue->ib_cq); +out_put_dev: + nvme_rdma_dev_put(queue->device); + return ret; +} + +static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, + int idx, size_t queue_size) +{ + struct nvme_rdma_queue *queue; + struct sockaddr *src_addr = NULL; + int ret; + + queue = &ctrl->queues[idx]; + queue->ctrl = ctrl; + init_completion(&queue->cm_done); + + if (idx > 0) + queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command); + + queue->queue_size = queue_size; + + queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(queue->cm_id)) { + dev_info(ctrl->ctrl.device, + "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id)); + return PTR_ERR(queue->cm_id); + } + + if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) + src_addr = (struct sockaddr *)&ctrl->src_addr; + + queue->cm_error = -ETIMEDOUT; + ret = rdma_resolve_addr(queue->cm_id, src_addr, + (struct sockaddr *)&ctrl->addr, + NVME_RDMA_CONNECT_TIMEOUT_MS); + if (ret) { + dev_info(ctrl->ctrl.device, + "rdma_resolve_addr failed (%d).\n", ret); + goto out_destroy_cm_id; + } + + ret = nvme_rdma_wait_for_cm(queue); + if (ret) { + dev_info(ctrl->ctrl.device, + "rdma connection establishment failed (%d)\n", ret); + goto out_destroy_cm_id; + } + + set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags); + + return 0; + +out_destroy_cm_id: + rdma_destroy_id(queue->cm_id); + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) +{ + if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) + return; + + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->qp); +} + +static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) +{ + if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) + return; + + if (nvme_rdma_queue_idx(queue) == 0) { + nvme_rdma_free_qe(queue->device->dev, + &queue->ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); + } + + nvme_rdma_destroy_queue_ib(queue); + rdma_destroy_id(queue->cm_id); +} + +static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) + nvme_rdma_free_queue(&ctrl->queues[i]); +} + +static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) + nvme_rdma_stop_queue(&ctrl->queues[i]); +} + +static int 
nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) +{ + int ret; + + if (idx) + ret = nvmf_connect_io_queue(&ctrl->ctrl, idx); + else + ret = nvmf_connect_admin_queue(&ctrl->ctrl); + + if (!ret) + set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags); + else + dev_info(ctrl->ctrl.device, + "failed to connect queue: %d ret=%d\n", idx, ret); + return ret; +} + +static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + ret = nvme_rdma_start_queue(ctrl, i); + if (ret) + goto out_stop_queues; + } + + return 0; + +out_stop_queues: + for (i--; i >= 1; i--) + nvme_rdma_stop_queue(&ctrl->queues[i]); + return ret; +} + +static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + struct ib_device *ibdev = ctrl->device->dev; + unsigned int nr_io_queues; + int i, ret; + + nr_io_queues = min(opts->nr_io_queues, num_online_cpus()); + + /* + * we map queues according to the device irq vectors for + * optimal locality so we don't need more queues than + * completion vectors. + */ + nr_io_queues = min_t(unsigned int, nr_io_queues, + ibdev->num_comp_vectors); + + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); + if (ret) + return ret; + + ctrl->ctrl.queue_count = nr_io_queues + 1; + if (ctrl->ctrl.queue_count < 2) + return 0; + + dev_info(ctrl->ctrl.device, + "creating %d I/O queues.\n", nr_io_queues); + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + ret = nvme_rdma_alloc_queue(ctrl, i, + ctrl->ctrl.sqsize + 1); + if (ret) + goto out_free_queues; + } + + return 0; + +out_free_queues: + for (i--; i >= 1; i--) + nvme_rdma_free_queue(&ctrl->queues[i]); + + return ret; +} + +static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, + struct blk_mq_tag_set *set) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + + blk_mq_free_tag_set(set); + nvme_rdma_dev_put(ctrl->device); +} + +static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, + bool admin) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + struct blk_mq_tag_set *set; + int ret; + + if (admin) { + set = &ctrl->admin_tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_rdma_admin_mq_ops; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; + set->reserved_tags = 2; /* connect + keep-alive */ + set->numa_node = NUMA_NO_NODE; + set->cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = ADMIN_TIMEOUT; + set->flags = BLK_MQ_F_NO_SCHED; + } else { + set = &ctrl->tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_rdma_mq_ops; + set->queue_depth = nctrl->opts->queue_size; + set->reserved_tags = 1; /* fabric connect */ + set->numa_node = NUMA_NO_NODE; + set->flags = BLK_MQ_F_SHOULD_MERGE; + set->cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; + } + + ret = blk_mq_alloc_tag_set(set); + if (ret) + goto out; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. 
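+ * The reference is dropped again in nvme_rdma_free_tagset().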
+ */ + ret = nvme_rdma_dev_get(ctrl->device); + if (!ret) { + ret = -EINVAL; + goto out_free_tagset; + } + + return set; + +out_free_tagset: + blk_mq_free_tag_set(set); +out: + return ERR_PTR(ret); +} + +static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + nvme_rdma_stop_queue(&ctrl->queues[0]); + if (remove) { + blk_cleanup_queue(ctrl->ctrl.admin_q); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); + } + nvme_rdma_free_queue(&ctrl->queues[0]); +} + +static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool new) +{ + int error; + + error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); + if (error) + return error; + + ctrl->device = ctrl->queues[0].device; + + ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); + + if (new) { + ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); + if (IS_ERR(ctrl->ctrl.admin_tagset)) { + error = PTR_ERR(ctrl->ctrl.admin_tagset); + goto out_free_queue; + } + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + } + + error = nvme_rdma_start_queue(ctrl, 0); + if (error) + goto out_cleanup_queue; + + error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP, + &ctrl->ctrl.cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, + &ctrl->async_event_sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + if (error) + goto out_cleanup_queue; + + return 0; + +out_cleanup_queue: + if (new) + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + if (new) + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); +out_free_queue: + nvme_rdma_free_queue(&ctrl->queues[0]); + return error; +} + +static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + nvme_rdma_stop_io_queues(ctrl); + if (remove) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); + } + nvme_rdma_free_io_queues(ctrl); +} + +static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) +{ + int ret; + + ret = nvme_rdma_alloc_io_queues(ctrl); + if (ret) + return ret; + + if (new) { + ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false); + if (IS_ERR(ctrl->ctrl.tagset)) { + ret = PTR_ERR(ctrl->ctrl.tagset); + goto out_free_io_queues; + } + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tag_set; + } + } else { + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); + } + + ret = nvme_rdma_start_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + if (new) + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tag_set: + if (new) + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); +out_free_io_queues: + nvme_rdma_free_io_queues(ctrl); + return ret; +} + +static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl 
*ctrl = to_rdma_ctrl(nctrl); + + cancel_work_sync(&ctrl->err_work); + cancel_delayed_work_sync(&ctrl->reconnect_work); +} + +static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + kfree(ctrl->queues); + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl); +} + +static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) +{ + /* If we are resetting/deleting then do nothing */ + if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) { + WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || + ctrl->ctrl.state == NVME_CTRL_LIVE); + return; + } + + if (nvmf_should_reconnect(&ctrl->ctrl)) { + dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n", + ctrl->ctrl.opts->reconnect_delay); + queue_delayed_work(nvme_wq, &ctrl->reconnect_work, + ctrl->ctrl.opts->reconnect_delay * HZ); + } else { + nvme_delete_ctrl(&ctrl->ctrl); + } +} + +static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_rdma_ctrl, reconnect_work); + bool changed; + int ret; + + ++ctrl->ctrl.nr_reconnects; + + ret = nvme_rdma_configure_admin_queue(ctrl, false); + if (ret) + goto requeue; + + if (ctrl->ctrl.queue_count > 1) { + ret = nvme_rdma_configure_io_queues(ctrl, false); + if (ret) + goto destroy_admin; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + if (!changed) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + return; + } + + nvme_start_ctrl(&ctrl->ctrl); + + dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", + ctrl->ctrl.nr_reconnects); + + ctrl->ctrl.nr_reconnects = 0; + + return; + +destroy_admin: + nvme_rdma_destroy_admin_queue(ctrl, false); +requeue: + dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", + ctrl->ctrl.nr_reconnects); + nvme_rdma_reconnect_or_remove(ctrl); +} + +static void nvme_rdma_error_recovery_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, err_work); + + nvme_stop_keep_alive(&ctrl->ctrl); + + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_io_queues(ctrl, false); + } + + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl, false); + + /* + * queues are not a live anymore, so restart the queues to fail fast + * new IO + */ + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_start_queues(&ctrl->ctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + return; + } + + nvme_rdma_reconnect_or_remove(ctrl); +} + +static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) + return; + + queue_work(nvme_wq, &ctrl->err_work); +} + +static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, + const char *op) +{ + struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + 
dev_info(ctrl->ctrl.device, + "%s for CQE 0x%p failed with status %s (%d)\n", + op, wc->wr_cqe, + ib_wc_status_msg(wc->status), wc->status); + nvme_rdma_error_recovery(ctrl); +} + +static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "MEMREG"); +} + +static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvme_rdma_request *req = + container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe); + struct request *rq = blk_mq_rq_from_pdu(req); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvme_rdma_wr_error(cq, wc, "LOCAL_INV"); + return; + } + + if (refcount_dec_and_test(&req->ref)) + nvme_end_request(rq, req->status, req->result); + +} + +static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req) +{ + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .next = NULL, + .num_sge = 0, + .send_flags = IB_SEND_SIGNALED, + .ex.invalidate_rkey = req->mr->rkey, + }; + + req->reg_cqe.done = nvme_rdma_inv_rkey_done; + wr.wr_cqe = &req->reg_cqe; + + return ib_post_send(queue->qp, &wr, &bad_wr); +} + +static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, + struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + + if (!blk_rq_payload_bytes(rq)) + return; + + if (req->mr) { + ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); + req->mr = NULL; + } + + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, + req->nents, rq_data_dir(rq) == + WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + + nvme_cleanup_cmd(rq); + sg_free_table_chained(&req->sg_table, true); +} + +static int nvme_rdma_set_sg_null(struct nvme_command *c) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + + sg->addr = 0; + put_unaligned_le24(0, sg->length); + put_unaligned_le32(0, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + return 0; +} + +static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + req->sge[1].addr = sg_dma_address(req->sg_table.sgl); + req->sge[1].length = sg_dma_len(req->sg_table.sgl); + req->sge[1].lkey = queue->device->pd->local_dma_lkey; + + sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); + sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); + sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; + + req->num_sge++; + return 0; +} + +static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + + sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl)); + put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length); + put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + return 0; +} + +static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c, + int count) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + int nr; + + req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs); + if (WARN_ON_ONCE(!req->mr)) + return -EAGAIN; + + /* + * Align the MR to a 4K page size to match the ctrl page size and + * the block virtual boundary. 
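+ * ib_map_mr_sg() must consume all 'count' entries; a partial mapping
+ * is treated as an error below and the MR is returned to the pool.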
+ */ + nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K); + if (unlikely(nr < count)) { + ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); + req->mr = NULL; + if (nr < 0) + return nr; + return -EINVAL; + } + + ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); + + req->reg_cqe.done = nvme_rdma_memreg_done; + memset(&req->reg_wr, 0, sizeof(req->reg_wr)); + req->reg_wr.wr.opcode = IB_WR_REG_MR; + req->reg_wr.wr.wr_cqe = &req->reg_cqe; + req->reg_wr.wr.num_sge = 0; + req->reg_wr.mr = req->mr; + req->reg_wr.key = req->mr->rkey; + req->reg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + sg->addr = cpu_to_le64(req->mr->iova); + put_unaligned_le24(req->mr->length, sg->length); + put_unaligned_le32(req->mr->rkey, sg->key); + sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) | + NVME_SGL_FMT_INVALIDATE; + + return 0; +} + +static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, + struct request *rq, struct nvme_command *c) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int count, ret; + + req->num_sge = 1; + refcount_set(&req->ref, 2); /* send and recv completions */ + + c->common.flags |= NVME_CMD_SGL_METABUF; + + if (!blk_rq_payload_bytes(rq)) + return nvme_rdma_set_sg_null(c); + + req->sg_table.sgl = req->first_sgl; + ret = sg_alloc_table_chained(&req->sg_table, + blk_rq_nr_phys_segments(rq), req->sg_table.sgl); + if (ret) + return -ENOMEM; + + req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); + + count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, + rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (unlikely(count <= 0)) { + sg_free_table_chained(&req->sg_table, true); + return -EIO; + } + + if (count == 1) { + if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && + blk_rq_payload_bytes(rq) <= + nvme_rdma_inline_data_size(queue)) + return nvme_rdma_map_sg_inline(queue, req, c); + + if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) + return nvme_rdma_map_sg_single(queue, req, c); + } + + return nvme_rdma_map_sg_fr(queue, req, c, count); +} + +static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvme_rdma_qe *qe = + container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); + struct nvme_rdma_request *req = + container_of(qe, struct nvme_rdma_request, sqe); + struct request *rq = blk_mq_rq_from_pdu(req); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvme_rdma_wr_error(cq, wc, "SEND"); + return; + } + + if (refcount_dec_and_test(&req->ref)) + nvme_end_request(rq, req->status, req->result); +} + +static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, + struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, + struct ib_send_wr *first) +{ + struct ib_send_wr wr, *bad_wr; + int ret; + + sge->addr = qe->dma; + sge->length = sizeof(struct nvme_command), + sge->lkey = queue->device->pd->local_dma_lkey; + + wr.next = NULL; + wr.wr_cqe = &qe->cqe; + wr.sg_list = sge; + wr.num_sge = num_sge; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + if (first) + first->next = ≀ + else + first = ≀ + + ret = ib_post_send(queue->qp, first, &bad_wr); + if (unlikely(ret)) { + dev_err(queue->ctrl->ctrl.device, + "%s failed with error code %d\n", __func__, ret); + } + return ret; +} + +static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue, + struct nvme_rdma_qe *qe) +{ + struct ib_recv_wr wr, *bad_wr; + struct ib_sge list; + int ret; + + list.addr = 
qe->dma; + list.length = sizeof(struct nvme_completion); + list.lkey = queue->device->pd->local_dma_lkey; + + qe->cqe.done = nvme_rdma_recv_done; + + wr.next = NULL; + wr.wr_cqe = &qe->cqe; + wr.sg_list = &list; + wr.num_sge = 1; + + ret = ib_post_recv(queue->qp, &wr, &bad_wr); + if (unlikely(ret)) { + dev_err(queue->ctrl->ctrl.device, + "%s failed with error code %d\n", __func__, ret); + } + return ret; +} + +static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) +{ + u32 queue_idx = nvme_rdma_queue_idx(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "ASYNC"); +} + +static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); + struct nvme_rdma_queue *queue = &ctrl->queues[0]; + struct ib_device *dev = queue->device->dev; + struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe; + struct nvme_command *cmd = sqe->data; + struct ib_sge sge; + int ret; + + ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); + + memset(cmd, 0, sizeof(*cmd)); + cmd->common.opcode = nvme_admin_async_event; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; + cmd->common.flags |= NVME_CMD_SGL_METABUF; + nvme_rdma_set_sg_null(cmd); + + sqe->cqe.done = nvme_rdma_async_done; + + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), + DMA_TO_DEVICE); + + ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL); + WARN_ON_ONCE(ret); +} + +static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, + struct nvme_completion *cqe, struct ib_wc *wc, int tag) +{ + struct request *rq; + struct nvme_rdma_request *req; + int ret = 0; + + rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "tag 0x%x on QP %#x not found\n", + cqe->command_id, queue->qp->qp_num); + nvme_rdma_error_recovery(queue->ctrl); + return ret; + } + req = blk_mq_rq_to_pdu(rq); + + req->status = cqe->status; + req->result = cqe->result; + + if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { + if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) { + dev_err(queue->ctrl->ctrl.device, + "Bogus remote invalidation for rkey %#x\n", + req->mr->rkey); + nvme_rdma_error_recovery(queue->ctrl); + } + } else if (req->mr) { + ret = nvme_rdma_inv_rkey(queue, req); + if (unlikely(ret < 0)) { + dev_err(queue->ctrl->ctrl.device, + "Queueing INV WR for rkey %#x failed (%d)\n", + req->mr->rkey, ret); + nvme_rdma_error_recovery(queue->ctrl); + } + /* the local invalidation completion will end the request */ + return 0; + } + + if (refcount_dec_and_test(&req->ref)) { + if (rq->tag == tag) + ret = 1; + nvme_end_request(rq, req->status, req->result); + } + + return ret; +} + +static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) +{ + struct nvme_rdma_qe *qe = + container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); + struct nvme_rdma_queue *queue = cq->cq_context; + struct ib_device *ibdev = queue->device->dev; + struct nvme_completion *cqe = qe->data; + const size_t len = sizeof(struct nvme_completion); + int ret = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvme_rdma_wr_error(cq, wc, "RECV"); + return 0; + } + + ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); + /* + * AEN requests are special as they don't time out and can + * survive 
any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvme_rdma_queue_idx(queue) == 0 && + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) + nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, + &cqe->result); + else + ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag); + ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); + + nvme_rdma_post_recv(queue, qe); + return ret; +} + +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + __nvme_rdma_recv_done(cq, wc, -1); +} + +static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) +{ + int ret, i; + + for (i = 0; i < queue->queue_size; i++) { + ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]); + if (ret) + goto out_destroy_queue_ib; + } + + return 0; + +out_destroy_queue_ib: + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, + struct rdma_cm_event *ev) +{ + struct rdma_cm_id *cm_id = queue->cm_id; + int status = ev->status; + const char *rej_msg; + const struct nvme_rdma_cm_rej *rej_data; + u8 rej_data_len; + + rej_msg = rdma_reject_msg(cm_id, status); + rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len); + + if (rej_data && rej_data_len >= sizeof(u16)) { + u16 sts = le16_to_cpu(rej_data->sts); + + dev_err(queue->ctrl->ctrl.device, + "Connect rejected: status %d (%s) nvme status %d (%s).\n", + status, rej_msg, sts, nvme_rdma_cm_msg(sts)); + } else { + dev_err(queue->ctrl->ctrl.device, + "Connect rejected: status %d (%s).\n", status, rej_msg); + } + + return -ECONNRESET; +} + +static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) +{ + int ret; + + ret = nvme_rdma_create_queue_ib(queue); + if (ret) + return ret; + + ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "rdma_resolve_route failed (%d).\n", + queue->cm_error); + goto out_destroy_queue; + } + + return 0; + +out_destroy_queue: + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_req priv = { }; + int ret; + + param.qp_num = queue->qp->qp_num; + param.flow_control = 1; + + param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom; + /* maximum retry count */ + param.retry_count = 7; + param.rnr_retry_count = 7; + param.private_data = &priv; + param.private_data_len = sizeof(priv); + + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue)); + /* + * set the admin queue depth to the minimum size + * specified by the Fabrics standard. + */ + if (priv.qid == 0) { + priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH); + priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); + } else { + /* + * current interpretation of the fabrics spec + * is at minimum you make hrqsize sqsize+1, or a + * 1's based representation of sqsize. 
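+ * (Below, hrqsize is therefore set to the full queue_size, while hsqsize is set to the controller's 0's based sqsize, i.e. queue_size - 1.)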
+ */ + priv.hrqsize = cpu_to_le16(queue->queue_size); + priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize); + } + + ret = rdma_connect(queue->cm_id, &param); + if (ret) { + dev_err(ctrl->ctrl.device, + "rdma_connect failed (%d).\n", ret); + goto out_destroy_queue_ib; + } + + return 0; + +out_destroy_queue_ib: + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *ev) +{ + struct nvme_rdma_queue *queue = cm_id->context; + int cm_error = 0; + + dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n", + rdma_event_msg(ev->event), ev->event, + ev->status, cm_id); + + switch (ev->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + cm_error = nvme_rdma_addr_resolved(queue); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + cm_error = nvme_rdma_route_resolved(queue); + break; + case RDMA_CM_EVENT_ESTABLISHED: + queue->cm_error = nvme_rdma_conn_established(queue); + /* complete cm_done regardless of success/failure */ + complete(&queue->cm_done); + return 0; + case RDMA_CM_EVENT_REJECTED: + nvme_rdma_destroy_queue_ib(queue); + cm_error = nvme_rdma_conn_rejected(queue, ev); + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + nvme_rdma_destroy_queue_ib(queue); + case RDMA_CM_EVENT_ADDR_ERROR: + dev_dbg(queue->ctrl->ctrl.device, + "CM error event %d\n", ev->event); + cm_error = -ECONNRESET; + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + dev_dbg(queue->ctrl->ctrl.device, + "disconnect received - connection closed\n"); + nvme_rdma_error_recovery(queue->ctrl); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* device removal is handled via the ib_client API */ + break; + default: + dev_err(queue->ctrl->ctrl.device, + "Unexpected RDMA CM event (%d)\n", ev->event); + nvme_rdma_error_recovery(queue->ctrl); + break; + } + + if (cm_error) { + queue->cm_error = cm_error; + complete(&queue->cm_done); + } + + return 0; +} + +static enum blk_eh_timer_return +nvme_rdma_timeout(struct request *rq, bool reserved) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + + dev_warn(req->queue->ctrl->ctrl.device, + "I/O %d QID %d timeout, reset controller\n", + rq->tag, nvme_rdma_queue_idx(req->queue)); + + /* queue error recovery */ + nvme_rdma_error_recovery(req->queue->ctrl); + + /* fail with DNR on cmd timeout */ + nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR; + + return BLK_EH_HANDLED; +} + +static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_rdma_queue *queue = hctx->driver_data; + struct request *rq = bd->rq; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_qe *sqe = &req->sqe; + struct nvme_command *c = sqe->data; + struct ib_device *dev; + blk_status_t ret; + int err; + + WARN_ON_ONCE(rq->tag < 0); + + ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, + test_bit(NVME_RDMA_Q_LIVE, &queue->flags), true); + if (unlikely(ret)) + return ret; + + dev = queue->device->dev; + ib_dma_sync_single_for_cpu(dev, sqe->dma, + sizeof(struct nvme_command), DMA_TO_DEVICE); + + ret = nvme_setup_cmd(ns, rq, c); + if (ret) + return ret; + + blk_mq_start_request(rq); + + err = nvme_rdma_map_data(queue, rq, c); + if (unlikely(err < 0)) { + dev_err(queue->ctrl->ctrl.device, + "Failed to map data (%d)\n", err); + nvme_cleanup_cmd(rq); + goto err; + } + + sqe->cqe.done =
nvme_rdma_send_done; + + ib_dma_sync_single_for_device(dev, sqe->dma, + sizeof(struct nvme_command), DMA_TO_DEVICE); + + err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, + req->mr ? &req->reg_wr.wr : NULL); + if (unlikely(err)) { + nvme_rdma_unmap_data(queue, rq); + goto err; + } + + return BLK_STS_OK; +err: + if (err == -ENOMEM || err == -EAGAIN) + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; +} + +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +{ + struct nvme_rdma_queue *queue = hctx->driver_data; + struct ib_cq *cq = queue->ib_cq; + struct ib_wc wc; + int found = 0; + + while (ib_poll_cq(cq, 1, &wc) > 0) { + struct ib_cqe *cqe = wc.wr_cqe; + + if (cqe) { + if (cqe->done == nvme_rdma_recv_done) + found |= __nvme_rdma_recv_done(cq, &wc, tag); + else + cqe->done(cq, &wc); + } + } + + return found; +} + +static void nvme_rdma_complete_rq(struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + + nvme_rdma_unmap_data(req->queue, rq); + nvme_complete_rq(rq); +} + +static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_rdma_ctrl *ctrl = set->driver_data; + + return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0); +} + +static const struct blk_mq_ops nvme_rdma_mq_ops = { + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, + .init_hctx = nvme_rdma_init_hctx, + .poll = nvme_rdma_poll, + .timeout = nvme_rdma_timeout, + .map_queues = nvme_rdma_map_queues, +}; + +static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, + .init_hctx = nvme_rdma_init_admin_hctx, + .timeout = nvme_rdma_timeout, +}; + +static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) +{ + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_io_queues(ctrl, shutdown); + } + + if (shutdown) + nvme_shutdown_ctrl(&ctrl->ctrl); + else + nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_destroy_admin_queue(ctrl, shutdown); +} + +static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) +{ + nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); +} + +static void nvme_rdma_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = + container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); + int ret; + bool changed; + + nvme_stop_ctrl(&ctrl->ctrl); + nvme_rdma_shutdown_ctrl(ctrl, false); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure should never happen */ + WARN_ON_ONCE(1); + return; + } + + ret = nvme_rdma_configure_admin_queue(ctrl, false); + if (ret) + goto out_fail; + + if (ctrl->ctrl.queue_count > 1) { + ret = nvme_rdma_configure_io_queues(ctrl, false); + if (ret) + goto out_fail; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + if (!changed) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + return; + } + + nvme_start_ctrl(&ctrl->ctrl); + + return; + +out_fail: + ++ctrl->ctrl.nr_reconnects; + 
nvme_rdma_reconnect_or_remove(ctrl); +} + +static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { + .name = "rdma", + .module = THIS_MODULE, + .flags = NVME_F_FABRICS, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .free_ctrl = nvme_rdma_free_ctrl, + .submit_async_event = nvme_rdma_submit_async_event, + .delete_ctrl = nvme_rdma_delete_ctrl, + .get_address = nvmf_get_address, + .stop_ctrl = nvme_rdma_stop_ctrl, +}; + +static inline bool +__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + char *stdport = __stringify(NVME_RDMA_IP_PORT); + + + if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) || + strcmp(opts->traddr, ctrl->ctrl.opts->traddr)) + return false; + + if (opts->mask & NVMF_OPT_TRSVCID && + ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid)) + return false; + } else if (opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(opts->trsvcid, stdport)) + return false; + } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(stdport, ctrl->ctrl.opts->trsvcid)) + return false; + } + /* else, it's a match as both have stdport. Fall to next checks */ + + /* + * checking the local address is rough. In most cases, one + * is not specified and the host port is selected by the stack. + * + * Assume no match if: + * local address is specified and address is not the same + * local address is not specified but remote is, or vice versa + * (admin using specific host_traddr when it matters). + */ + if (opts->mask & NVMF_OPT_HOST_TRADDR && + ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr)) + return false; + } else if (opts->mask & NVMF_OPT_HOST_TRADDR || + ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) + return false; + /* + * if neither controller had an host port specified, assume it's + * a match as everything else matched. + */ + + return true; +} + +/* + * Fails a connection request if it matches an existing controller + * (association) with the same tuple: + * + * + * if local address is not specified in the request, it will match an + * existing controller with all the other parameters the same and no + * local port address specified as well. + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. 
+ */ +static bool +nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + bool found = false; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { + found = __nvme_rdma_options_match(ctrl, opts); + if (found) + break; + } + mutex_unlock(&nvme_rdma_ctrl_mutex); + + return found; +} + +static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + int ret; + bool changed; + char *port; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + if (opts->mask & NVMF_OPT_TRSVCID) + port = opts->trsvcid; + else + port = __stringify(NVME_RDMA_IP_PORT); + + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->traddr, port, &ctrl->addr); + if (ret) { + pr_err("malformed address passed: %s:%s\n", opts->traddr, port); + goto out_free_ctrl; + } + + if (opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->host_traddr, NULL, &ctrl->src_addr); + if (ret) { + pr_err("malformed src address passed: %s\n", + opts->host_traddr); + goto out_free_ctrl; + } + } + + if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) { + ret = -EALREADY; + goto out_free_ctrl; + } + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, + 0 /* no quirks, we're perfect! */); + if (ret) + goto out_free_ctrl; + + INIT_DELAYED_WORK(&ctrl->reconnect_work, + nvme_rdma_reconnect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); + + ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.sqsize = opts->queue_size - 1; + ctrl->ctrl.kato = opts->kato; + + ret = -ENOMEM; + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_uninit_ctrl; + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); + WARN_ON_ONCE(!changed); + + ret = nvme_rdma_configure_admin_queue(ctrl, true); + if (ret) + goto out_kfree_queues; + + /* sanity check icdoff */ + if (ctrl->ctrl.icdoff) { + dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + ret = -EINVAL; + goto out_remove_admin_queue; + } + + /* sanity check keyed sgls */ + if (!(ctrl->ctrl.sgls & (1 << 20))) { + dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not support\n"); + ret = -EINVAL; + goto out_remove_admin_queue; + } + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, clamping down\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->queue_size > ctrl->ctrl.sqsize + 1) { + /* warn if sqsize is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl sqsize %u, clamping down\n", + opts->queue_size, ctrl->ctrl.sqsize + 1); + opts->queue_size = ctrl->ctrl.sqsize + 1; + } + + if (opts->nr_io_queues) { + ret = nvme_rdma_configure_io_queues(ctrl, true); + if (ret) + goto out_remove_admin_queue; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", + ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + + nvme_get_ctrl(&ctrl->ctrl); + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_add_tail(&ctrl->list, 
&nvme_rdma_ctrl_list); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + nvme_start_ctrl(&ctrl->ctrl); + + return &ctrl->ctrl; + +out_remove_admin_queue: + nvme_rdma_destroy_admin_queue(ctrl, true); +out_kfree_queues: + kfree(ctrl->queues); +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvmf_transport_ops nvme_rdma_transport = { + .name = "rdma", + .module = THIS_MODULE, + .required_opts = NVMF_OPT_TRADDR, + .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, + .create_ctrl = nvme_rdma_create_ctrl, +}; + +static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct nvme_rdma_ctrl *ctrl; + struct nvme_rdma_device *ndev; + bool found = false; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->dev == ib_device) { + found = true; + break; + } + } + mutex_unlock(&device_list_mutex); + + if (!found) + return; + + /* Delete all controllers using this device */ + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { + if (ctrl->device->dev != ib_device) + continue; + nvme_delete_ctrl(&ctrl->ctrl); + } + mutex_unlock(&nvme_rdma_ctrl_mutex); + + flush_workqueue(nvme_delete_wq); +} + +static struct ib_client nvme_rdma_ib_client = { + .name = "nvme_rdma", + .remove = nvme_rdma_remove_one +}; + +static int __init nvme_rdma_init_module(void) +{ + int ret; + + ret = ib_register_client(&nvme_rdma_ib_client); + if (ret) + return ret; + + ret = nvmf_register_transport(&nvme_rdma_transport); + if (ret) + goto err_unreg_client; + + return 0; + +err_unreg_client: + ib_unregister_client(&nvme_rdma_ib_client); + return ret; +} + +static void __exit nvme_rdma_cleanup_module(void) +{ + nvmf_unregister_transport(&nvme_rdma_transport); + ib_unregister_client(&nvme_rdma_ib_client); +} + +module_init(nvme_rdma_init_module); +module_exit(nvme_rdma_cleanup_module); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c new file mode 100644 index 0000000..41944bb --- /dev/null +++ b/drivers/nvme/host/trace.c @@ -0,0 +1,130 @@ +/* + * NVM Express device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include "trace.h" + +static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 sqid = get_unaligned_le16(cdw10); + u16 qsize = get_unaligned_le16(cdw10 + 2); + u16 sq_flags = get_unaligned_le16(cdw10 + 4); + u16 cqid = get_unaligned_le16(cdw10 + 6); + + + trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u", + sqid, qsize, sq_flags, cqid); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 cqid = get_unaligned_le16(cdw10); + u16 qsize = get_unaligned_le16(cdw10 + 2); + u16 cq_flags = get_unaligned_le16(cdw10 + 4); + u16 irq_vector = get_unaligned_le16(cdw10 + 6); + + trace_seq_printf(p, "cqid=%u, qsize=%u, cq_flags=0x%x, irq_vector=%u", + cqid, qsize, cq_flags, irq_vector); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 cns = cdw10[0]; + u16 ctrlid = get_unaligned_le16(cdw10 + 2); + + trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid); + trace_seq_putc(p, 0); + + return ret; +} + + + +static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u16 length = get_unaligned_le16(cdw10 + 8); + u16 control = get_unaligned_le16(cdw10 + 10); + u32 dsmgmt = get_unaligned_le32(cdw10 + 12); + u32 reftag = get_unaligned_le32(cdw10 + 16); + + trace_seq_printf(p, + "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u", + slba, length, control, dsmgmt, reftag); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "nr=%u, attributes=%u", + get_unaligned_le32(cdw10), + get_unaligned_le32(cdw10 + 4)); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "cdw10=%*ph", 24, cdw10); + trace_seq_putc(p, 0); + + return ret; +} + +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_admin_create_sq: + return nvme_trace_create_sq(p, cdw10); + case nvme_admin_create_cq: + return nvme_trace_create_cq(p, cdw10); + case nvme_admin_identify: + return nvme_trace_admin_identify(p, cdw10); + default: + return nvme_trace_common(p, cdw10); + } +} + +const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + case nvme_cmd_write_zeroes: + return nvme_trace_read_write(p, cdw10); + case nvme_cmd_dsm: + return nvme_trace_dsm(p, cdw10); + default: + return nvme_trace_common(p, cdw10); + } +} diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h new file mode 100644 index 0000000..ea91fcc --- /dev/null +++ b/drivers/nvme/host/trace.h @@ -0,0 +1,165 @@ +/* + * NVM Express device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nvme + +#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NVME_H + +#include +#include +#include + +#include "nvme.h" + +#define nvme_admin_opcode_name(opcode) { opcode, #opcode } +#define show_admin_opcode_name(val) \ + __print_symbolic(val, \ + nvme_admin_opcode_name(nvme_admin_delete_sq), \ + nvme_admin_opcode_name(nvme_admin_create_sq), \ + nvme_admin_opcode_name(nvme_admin_get_log_page), \ + nvme_admin_opcode_name(nvme_admin_delete_cq), \ + nvme_admin_opcode_name(nvme_admin_create_cq), \ + nvme_admin_opcode_name(nvme_admin_identify), \ + nvme_admin_opcode_name(nvme_admin_abort_cmd), \ + nvme_admin_opcode_name(nvme_admin_set_features), \ + nvme_admin_opcode_name(nvme_admin_get_features), \ + nvme_admin_opcode_name(nvme_admin_async_event), \ + nvme_admin_opcode_name(nvme_admin_ns_mgmt), \ + nvme_admin_opcode_name(nvme_admin_activate_fw), \ + nvme_admin_opcode_name(nvme_admin_download_fw), \ + nvme_admin_opcode_name(nvme_admin_ns_attach), \ + nvme_admin_opcode_name(nvme_admin_keep_alive), \ + nvme_admin_opcode_name(nvme_admin_directive_send), \ + nvme_admin_opcode_name(nvme_admin_directive_recv), \ + nvme_admin_opcode_name(nvme_admin_dbbuf), \ + nvme_admin_opcode_name(nvme_admin_format_nvm), \ + nvme_admin_opcode_name(nvme_admin_security_send), \ + nvme_admin_opcode_name(nvme_admin_security_recv), \ + nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) + +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +#define __parse_nvme_admin_cmd(opcode, cdw10) \ + nvme_trace_parse_admin_cmd(p, opcode, cdw10) + +#define nvme_opcode_name(opcode) { opcode, #opcode } +#define show_opcode_name(val) \ + __print_symbolic(val, \ + nvme_opcode_name(nvme_cmd_flush), \ + nvme_opcode_name(nvme_cmd_write), \ + nvme_opcode_name(nvme_cmd_read), \ + nvme_opcode_name(nvme_cmd_write_uncor), \ + nvme_opcode_name(nvme_cmd_compare), \ + nvme_opcode_name(nvme_cmd_write_zeroes), \ + nvme_opcode_name(nvme_cmd_dsm), \ + nvme_opcode_name(nvme_cmd_resv_register), \ + nvme_opcode_name(nvme_cmd_resv_report), \ + nvme_opcode_name(nvme_cmd_resv_acquire), \ + nvme_opcode_name(nvme_cmd_resv_release)) + +const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +#define __parse_nvme_cmd(opcode, cdw10) \ + nvme_trace_parse_nvm_cmd(p, opcode, cdw10) + +TRACE_EVENT(nvme_setup_admin_cmd, + TP_PROTO(struct nvme_command *cmd), + TP_ARGS(cmd), + TP_STRUCT__entry( + __field(u8, opcode) + __field(u8, flags) + __field(u16, cid) + __field(u64, metadata) + __array(u8, cdw10, 24) + ), + TP_fast_assign( + __entry->opcode = cmd->common.opcode; + __entry->flags = cmd->common.flags; + __entry->cid = cmd->common.command_id; + __entry->metadata = le64_to_cpu(cmd->common.metadata); + memcpy(__entry->cdw10, cmd->common.cdw10, + sizeof(__entry->cdw10)); + ), + TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->cid, __entry->flags, __entry->metadata, + show_admin_opcode_name(__entry->opcode), + __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10)) +); + + +TRACE_EVENT(nvme_setup_nvm_cmd, + TP_PROTO(int qid, struct nvme_command *cmd), + TP_ARGS(qid, cmd), + TP_STRUCT__entry( + __field(int, qid) + __field(u8, opcode) + __field(u8, flags) + __field(u16, 
cid) + __field(u32, nsid) + __field(u64, metadata) + __array(u8, cdw10, 24) + ), + TP_fast_assign( + __entry->qid = qid; + __entry->opcode = cmd->common.opcode; + __entry->flags = cmd->common.flags; + __entry->cid = cmd->common.command_id; + __entry->nsid = le32_to_cpu(cmd->common.nsid); + __entry->metadata = le64_to_cpu(cmd->common.metadata); + memcpy(__entry->cdw10, cmd->common.cdw10, + sizeof(__entry->cdw10)); + ), + TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->qid, __entry->nsid, __entry->cid, + __entry->flags, __entry->metadata, + show_opcode_name(__entry->opcode), + __parse_nvme_cmd(__entry->opcode, __entry->cdw10)) +); + +TRACE_EVENT(nvme_complete_rq, + TP_PROTO(struct request *req), + TP_ARGS(req), + TP_STRUCT__entry( + __field(int, qid) + __field(int, cid) + __field(u64, result) + __field(u8, retries) + __field(u8, flags) + __field(u16, status) + ), + TP_fast_assign( + __entry->qid = req->q->id; + __entry->cid = req->tag; + __entry->result = le64_to_cpu(nvme_req(req)->result.u64); + __entry->retries = nvme_req(req)->retries; + __entry->flags = nvme_req(req)->flags; + __entry->status = nvme_req(req)->status; + ), + TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u", + __entry->cid, __entry->qid, __entry->result, + __entry->retries, __entry->flags, __entry->status) + +); + +#endif /* _TRACE_NVME_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig new file mode 100644 index 0000000..3c7b61d --- /dev/null +++ b/drivers/nvme/target/Kconfig @@ -0,0 +1,62 @@ + +config NVME_TARGET + tristate "NVMe Target support" + depends on BLOCK + depends on CONFIGFS_FS + help + This enabled target side support for the NVMe protocol, that is + it allows the Linux kernel to implement NVMe subsystems and + controllers and export Linux block devices as NVMe namespaces. + You need to select at least one of the transports below to make this + functionality useful. + + To configure the NVMe target you probably want to use the nvmetcli + tool from http://git.infradead.org/users/hch/nvmetcli.git. + +config NVME_TARGET_LOOP + tristate "NVMe loopback device support" + depends on NVME_TARGET + select NVME_CORE + select NVME_FABRICS + select SG_POOL + help + This enables the NVMe loopback device support, which can be useful + to test NVMe host and target side features. + + If unsure, say N. + +config NVME_TARGET_RDMA + tristate "NVMe over Fabrics RDMA target support" + depends on INFINIBAND && INFINIBAND_ADDR_TRANS + depends on NVME_TARGET + select SGL_ALLOC + help + This enables the NVMe RDMA target support, which allows exporting NVMe + devices over RDMA. + + If unsure, say N. + +config NVME_TARGET_FC + tristate "NVMe over Fabrics FC target driver" + depends on NVME_TARGET + depends on HAS_DMA + select SGL_ALLOC + help + This enables the NVMe FC target support, which allows exporting NVMe + devices over FC. + + If unsure, say N. + +config NVME_TARGET_FCLOOP + tristate "NVMe over Fabrics FC Transport Loopback Test driver" + depends on NVME_TARGET + select NVME_CORE + select NVME_FABRICS + select SG_POOL + depends on NVME_FC + depends on NVME_TARGET_FC + help + This enables the NVMe FC loopback test support, which can be useful + to test NVMe-FC transport interfaces. + + If unsure, say N. 
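Editor's note on the Kconfig help above: the nvmetcli tool it references ultimately just drives the configfs tree implemented in drivers/nvme/target/configfs.c later in this patch. The following userspace program is a minimal sketch of that flow and is not part of the patch; it assumes configfs is mounted at the usual /sys/kernel/config and that the "subsystems"/"ports" layout matches the nvmet configfs code. The NQN "testnqn", the backing device /dev/nvme0n1 and port id 1 are made-up examples; the attribute file names (device_path, enable, attr_allow_any_host, addr_trtype) correspond to the CONFIGFS_ATTR definitions added below.

/* configure_loop_target.c - illustrative only, see note above */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#define NVMET "/sys/kernel/config/nvmet"

/* write a string value into a configfs attribute file */
static int write_attr(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* create a subsystem open to any host, with a single namespace */
	mkdir(NVMET "/subsystems/testnqn", 0755);
	write_attr(NVMET "/subsystems/testnqn/attr_allow_any_host", "1");
	mkdir(NVMET "/subsystems/testnqn/namespaces/1", 0755);
	write_attr(NVMET "/subsystems/testnqn/namespaces/1/device_path",
		   "/dev/nvme0n1");
	write_attr(NVMET "/subsystems/testnqn/namespaces/1/enable", "1");

	/* expose the subsystem on a loopback port; "loop" needs no address */
	mkdir(NVMET "/ports/1", 0755);
	write_attr(NVMET "/ports/1/addr_trtype", "loop");
	return symlink(NVMET "/subsystems/testnqn",
		       NVMET "/ports/1/subsystems/testnqn") ? 1 : 0;
}

With CONFIG_NVME_TARGET_LOOP built, the host side could then attach to the exported subsystem via nvme-cli's loop transport (e.g. nvme connect -t loop -n testnqn), which is the test setup the loopback Kconfig entry describes.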
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile new file mode 100644 index 0000000..4882501 --- /dev/null +++ b/drivers/nvme/target/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_NVME_TARGET) += nvmet.o +obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o +obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o +obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o +obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o + +nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \ + discovery.o +nvme-loop-y += loop.o +nvmet-rdma-y += rdma.o +nvmet-fc-y += fc.o +nvme-fcloop-y += fcloop.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c new file mode 100644 index 0000000..5e0e9fc --- /dev/null +++ b/drivers/nvme/target/admin-cmd.c @@ -0,0 +1,611 @@ +/* + * NVMe admin command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include + +#include +#include +#include "nvmet.h" + +u32 nvmet_get_log_page_len(struct nvme_command *cmd) +{ + u32 len = le16_to_cpu(cmd->get_log_page.numdu); + + len <<= 16; + len += le16_to_cpu(cmd->get_log_page.numdl); + /* NUMD is a 0's based value */ + len += 1; + len *= sizeof(u32); + + return len; +} + +static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + struct nvmet_ns *ns; + u64 host_reads, host_writes, data_units_read, data_units_written; + + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); + if (!ns) { + pr_err("nvmet : Could not find namespace id : %d\n", + le32_to_cpu(req->cmd->get_log_page.nsid)); + return NVME_SC_INVALID_NS; + } + + host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); + data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]); + host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); + data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); + put_unaligned_le64(host_writes, &slog->host_writes[0]); + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); + nvmet_put_namespace(ns); + + return NVME_SC_SUCCESS; +} + +static u16 nvmet_get_smart_log_all(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u64 host_reads = 0, host_writes = 0; + u64 data_units_read = 0, data_units_written = 0; + struct nvmet_ns *ns; + struct nvmet_ctrl *ctrl; + + ctrl = req->sq->ctrl; + + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); + data_units_read += + part_stat_read(ns->bdev->bd_part, sectors[READ]); + host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); + data_units_written += + part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + + } + rcu_read_unlock(); + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); + put_unaligned_le64(host_writes, 
&slog->host_writes[0]); + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); + + return NVME_SC_SUCCESS; +} + +static u16 nvmet_get_smart_log(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u16 status; + + WARN_ON(req == NULL || slog == NULL); + if (req->cmd->get_log_page.nsid == cpu_to_le32(NVME_NSID_ALL)) + status = nvmet_get_smart_log_all(req, slog); + else + status = nvmet_get_smart_log_nsid(req, slog); + return status; +} + +static void nvmet_execute_get_log_page(struct nvmet_req *req) +{ + struct nvme_smart_log *smart_log; + size_t data_len = nvmet_get_log_page_len(req->cmd); + void *buf; + u16 status = 0; + + buf = kzalloc(data_len, GFP_KERNEL); + if (!buf) { + status = NVME_SC_INTERNAL; + goto out; + } + + switch (req->cmd->get_log_page.lid) { + case NVME_LOG_ERROR: + /* + * We currently never set the More bit in the status field, + * so all error log entries are invalid and can be zeroed out. + * This is called a minum viable implementation (TM) of this + * mandatory log page. + */ + break; + case NVME_LOG_SMART: + /* + * XXX: fill out actual smart log + * + * We might have a hard time coming up with useful values for + * many of the fields, and even when we have useful data + * available (e.g. units or commands read/written) those aren't + * persistent over power loss. + */ + if (data_len != sizeof(*smart_log)) { + status = NVME_SC_INTERNAL; + goto err; + } + smart_log = buf; + status = nvmet_get_smart_log(req, smart_log); + if (status) + goto err; + break; + case NVME_LOG_FW_SLOT: + /* + * We only support a single firmware slot which always is + * active, so we can zero out the whole firmware slot log and + * still claim to fully implement this mandatory log page. + */ + break; + default: + BUG(); + } + + status = nvmet_copy_to_sgl(req, 0, buf, data_len); + +err: + kfree(buf); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u16 status = 0; + const char model[] = "Linux"; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + /* XXX: figure out how to assign real vendors IDs. */ + id->vid = 0; + id->ssvid = 0; + + memset(id->sn, ' ', sizeof(id->sn)); + bin2hex(id->sn, &ctrl->subsys->serial, + min(sizeof(ctrl->subsys->serial), sizeof(id->sn) / 2)); + memcpy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1, ' '); + memcpy_and_pad(id->fr, sizeof(id->fr), + UTS_RELEASE, strlen(UTS_RELEASE), ' '); + + id->rab = 6; + + /* + * XXX: figure out how we can assign a IEEE OUI, but until then + * the safest is to leave it as zeroes. + */ + + /* we support multiple ports and multiples hosts: */ + id->cmic = (1 << 0) | (1 << 1); + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + + /* XXX: figure out what to do about RTD3R/RTD3 */ + id->oaes = cpu_to_le32(1 << 8); + id->ctratt = cpu_to_le32(1 << 0); + + id->oacs = 0; + + /* + * We don't really have a practical limit on the number of abort + * comands. But we don't do anything useful for abort either, so + * no point in allowing more abort commands than the spec requires. 
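+ * (acl is a 0's based value, so the 3 below advertises support for four concurrent Abort commands.)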
+ */ + id->acl = 3; + + id->aerl = NVMET_ASYNC_EVENTS - 1; + + /* first slot is read-only, only one slot supported */ + id->frmw = (1 << 0) | (1 << 1); + id->lpa = (1 << 0) | (1 << 2); + id->elpe = NVMET_ERROR_LOG_SLOTS - 1; + id->npss = 0; + + /* We support keep-alive timeout in granularity of seconds */ + id->kas = cpu_to_le16(NVMET_KAS); + + id->sqes = (0x6 << 4) | 0x6; + id->cqes = (0x4 << 4) | 0x4; + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->nn = cpu_to_le32(ctrl->subsys->max_nsid); + id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | + NVME_CTRL_ONCS_WRITE_ZEROES); + + /* XXX: don't report vwc if the underlying device is write through */ + id->vwc = NVME_CTRL_VWC_PRESENT; + + /* + * We can't support atomic writes bigger than a LBA without support + * from the backend device. + */ + id->awun = 0; + id->awupf = 0; + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->has_keyed_sgls) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->ops->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strcpy(id->subnqn, ctrl->subsys->subsysnqn); + + /* Max command capsule size is sqe + single page of in-capsule data */ + id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + + ctrl->ops->sqe_inline_size) / 16); + /* Max response capsule size is cqe */ + id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); + + id->msdbd = ctrl->ops->msdbd; + + /* + * Meh, we don't really support any power state. Fake up the same + * values that qemu does. + */ + id->psd[0].max_power = cpu_to_le16(0x9c4); + id->psd[0].entry_lat = cpu_to_le32(0x10); + id->psd[0].exit_lat = cpu_to_le32(0x4); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ns(struct nvmet_req *req) +{ + struct nvmet_ns *ns; + struct nvme_id_ns *id; + u16 status = 0; + + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out_put_ns; + } + + /* + * nuse = ncap = nsze isn't always true, but we have no way to find + * that out from the underlying device. + */ + id->ncap = id->nuse = id->nsze = + cpu_to_le64(ns->size >> ns->blksize_shift); + + /* + * We just provide a single LBA format that matches what the + * underlying device reports. + */ + id->nlbaf = 0; + id->flbas = 0; + + /* + * Our namespace might always be shared. Not just with other + * controllers, but also with any other user of the block device. 
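+ * (Bit 0 of nmic below therefore reports the namespace as one that may be shared by multiple controllers.)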
+ */ + id->nmic = (1 << 0); + + memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le)); + + id->lbaf[0].ds = ns->blksize_shift; + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out_put_ns: + nvmet_put_namespace(ns); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_nslist(struct nvmet_req *req) +{ + static const int buf_size = NVME_IDENTIFY_DATA_SIZE; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); + __le32 *list; + u16 status = 0; + int i = 0; + + list = kzalloc(buf_size, GFP_KERNEL); + if (!list) { + status = NVME_SC_INTERNAL; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + if (ns->nsid <= min_nsid) + continue; + list[i++] = cpu_to_le32(ns->nsid); + if (i == buf_size / sizeof(__le32)) + break; + } + rcu_read_unlock(); + + status = nvmet_copy_to_sgl(req, 0, list, buf_size); + + kfree(list); +out: + nvmet_req_complete(req, status); +} + +static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len, + void *id, off_t *off) +{ + struct nvme_ns_id_desc desc = { + .nidt = type, + .nidl = len, + }; + u16 status; + + status = nvmet_copy_to_sgl(req, *off, &desc, sizeof(desc)); + if (status) + return status; + *off += sizeof(desc); + + status = nvmet_copy_to_sgl(req, *off, id, len); + if (status) + return status; + *off += len; + + return 0; +} + +static void nvmet_execute_identify_desclist(struct nvmet_req *req) +{ + struct nvmet_ns *ns; + u16 status = 0; + off_t off = 0; + + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + if (memchr_inv(&ns->uuid, 0, sizeof(ns->uuid))) { + status = nvmet_copy_ns_identifier(req, NVME_NIDT_UUID, + NVME_NIDT_UUID_LEN, + &ns->uuid, &off); + if (status) + goto out_put_ns; + } + if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) { + status = nvmet_copy_ns_identifier(req, NVME_NIDT_NGUID, + NVME_NIDT_NGUID_LEN, + &ns->nguid, &off); + if (status) + goto out_put_ns; + } + + if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off, + off) != NVME_IDENTIFY_DATA_SIZE - off) + status = NVME_SC_INTERNAL | NVME_SC_DNR; +out_put_ns: + nvmet_put_namespace(ns); +out: + nvmet_req_complete(req, status); +} + +/* + * A "minimum viable" abort implementation: the command is mandatory in the + * spec, but we are not required to do any useful work. We couldn't really + * do a useful abort, so don't bother even with waiting for the command + * to be exectuted and return immediately telling the command to abort + * wasn't found. 
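+ * (nvmet_set_result(req, 1) below sets bit 0 of completion dword 0, which the spec defines as "the specified command was not aborted".)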
+ */ +static void nvmet_execute_abort(struct nvmet_req *req) +{ + nvmet_set_result(req, 1); + nvmet_req_complete(req, 0); +} + +static void nvmet_execute_set_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u32 val32; + u16 status = 0; + + switch (cdw10 & 0xff) { + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); + break; + case NVME_FEAT_KATO: + val32 = le32_to_cpu(req->cmd->common.cdw10[1]); + req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + nvmet_set_result(req, req->sq->ctrl->kato); + break; + case NVME_FEAT_HOST_ID: + status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_get_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u16 status = 0; + + switch (cdw10 & 0xff) { + /* + * These features are mandatory in the spec, but we don't + * have a useful way to implement them. We'll eventually + * need to come up with some fake values for these. + */ +#if 0 + case NVME_FEAT_ARBITRATION: + break; + case NVME_FEAT_POWER_MGMT: + break; + case NVME_FEAT_TEMP_THRESH: + break; + case NVME_FEAT_ERR_RECOVERY: + break; + case NVME_FEAT_IRQ_COALESCE: + break; + case NVME_FEAT_IRQ_CONFIG: + break; + case NVME_FEAT_WRITE_ATOMIC: + break; + case NVME_FEAT_ASYNC_EVENT: + break; +#endif + case NVME_FEAT_VOLATILE_WC: + nvmet_set_result(req, 1); + break; + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid-1) | ((subsys->max_qid-1) << 16)); + break; + case NVME_FEAT_KATO: + nvmet_set_result(req, req->sq->ctrl->kato * 1000); + break; + case NVME_FEAT_HOST_ID: + /* need 128-bit host identifier flag */ + if (!(req->cmd->common.cdw10[1] & cpu_to_le32(1 << 0))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid, + sizeof(req->sq->ctrl->hostid)); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_async_event(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + mutex_lock(&ctrl->lock); + if (ctrl->nr_async_event_cmds >= NVMET_ASYNC_EVENTS) { + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_ASYNC_LIMIT | NVME_SC_DNR); + return; + } + ctrl->async_event_cmds[ctrl->nr_async_event_cmds++] = req; + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +static void nvmet_execute_keep_alive(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + pr_debug("ctrl %d update keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvmet_req_complete(req, 0); +} + +u16 nvmet_parse_admin_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + u16 ret; + + req->ns = NULL; + + ret = nvmet_check_ctrl_status(req, cmd); + if (unlikely(ret)) + return ret; + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + req->data_len = nvmet_get_log_page_len(cmd); + + switch (cmd->get_log_page.lid) { + case NVME_LOG_ERROR: + case NVME_LOG_SMART: + case NVME_LOG_FW_SLOT: + req->execute = nvmet_execute_get_log_page; + return 0; + } + break; + case nvme_admin_identify: + req->data_len = 
NVME_IDENTIFY_DATA_SIZE; + switch (cmd->identify.cns) { + case NVME_ID_CNS_NS: + req->execute = nvmet_execute_identify_ns; + return 0; + case NVME_ID_CNS_CTRL: + req->execute = nvmet_execute_identify_ctrl; + return 0; + case NVME_ID_CNS_NS_ACTIVE_LIST: + req->execute = nvmet_execute_identify_nslist; + return 0; + case NVME_ID_CNS_NS_DESC_LIST: + req->execute = nvmet_execute_identify_desclist; + return 0; + } + break; + case nvme_admin_abort_cmd: + req->execute = nvmet_execute_abort; + req->data_len = 0; + return 0; + case nvme_admin_set_features: + req->execute = nvmet_execute_set_features; + req->data_len = 0; + return 0; + case nvme_admin_get_features: + req->execute = nvmet_execute_get_features; + req->data_len = 0; + return 0; + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + req->data_len = 0; + return 0; + case nvme_admin_keep_alive: + req->execute = nvmet_execute_keep_alive; + req->data_len = 0; + return 0; + } + + pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, + req->sq->qid); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c new file mode 100644 index 0000000..ad9ff27 --- /dev/null +++ b/drivers/nvme/target/configfs.c @@ -0,0 +1,1017 @@ +/* + * Configfs interface for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include + +#include "nvmet.h" + +static const struct config_item_type nvmet_host_type; +static const struct config_item_type nvmet_subsys_type; + +static const struct nvmet_transport_name { + u8 type; + const char *name; +} nvmet_transport_names[] = { + { NVMF_TRTYPE_RDMA, "rdma" }, + { NVMF_TRTYPE_FC, "fc" }, + { NVMF_TRTYPE_LOOP, "loop" }, +}; + +/* + * nvmet_port Generic ConfigFS definitions. + * Used in any place in the ConfigFS tree that refers to an address. 
+ */ +static ssize_t nvmet_addr_adrfam_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + return sprintf(page, "ipv4\n"); + case NVMF_ADDR_FAMILY_IP6: + return sprintf(page, "ipv6\n"); + case NVMF_ADDR_FAMILY_IB: + return sprintf(page, "ib\n"); + case NVMF_ADDR_FAMILY_FC: + return sprintf(page, "fc\n"); + default: + return sprintf(page, "\n"); + } +} + +static ssize_t nvmet_addr_adrfam_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "ipv4")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP4; + } else if (sysfs_streq(page, "ipv6")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP6; + } else if (sysfs_streq(page, "ib")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IB; + } else if (sysfs_streq(page, "fc")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_FC; + } else { + pr_err("Invalid value '%s' for adrfam\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_adrfam); + +static ssize_t nvmet_addr_portid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", + le16_to_cpu(port->disc_addr.portid)); +} + +static ssize_t nvmet_addr_portid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u16 portid = 0; + + if (kstrtou16(page, 0, &portid)) { + pr_err("Invalid value '%s' for portid\n", page); + return -EINVAL; + } + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + port->disc_addr.portid = cpu_to_le16(portid); + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_portid); + +static ssize_t nvmet_addr_traddr_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.traddr); +} + +static ssize_t nvmet_addr_traddr_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRADDR_SIZE) { + pr_err("Invalid value '%s' for traddr\n", page); + return -EINVAL; + } + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + return snprintf(port->disc_addr.traddr, + sizeof(port->disc_addr.traddr), "%s", page); +} + +CONFIGFS_ATTR(nvmet_, addr_traddr); + +static ssize_t nvmet_addr_treq_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.treq) { + case NVMF_TREQ_NOT_SPECIFIED: + return sprintf(page, "not specified\n"); + case NVMF_TREQ_REQUIRED: + return sprintf(page, "required\n"); + case NVMF_TREQ_NOT_REQUIRED: + return sprintf(page, "not required\n"); + default: + return sprintf(page, "\n"); + } +} + +static ssize_t nvmet_addr_treq_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "not specified")) { + port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED; + } else if 
(sysfs_streq(page, "required")) { + port->disc_addr.treq = NVMF_TREQ_REQUIRED; + } else if (sysfs_streq(page, "not required")) { + port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED; + } else { + pr_err("Invalid value '%s' for treq\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_treq); + +static ssize_t nvmet_addr_trsvcid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.trsvcid); +} + +static ssize_t nvmet_addr_trsvcid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRSVCID_SIZE) { + pr_err("Invalid value '%s' for trsvcid\n", page); + return -EINVAL; + } + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + return snprintf(port->disc_addr.trsvcid, + sizeof(port->disc_addr.trsvcid), "%s", page); +} + +CONFIGFS_ATTR(nvmet_, addr_trsvcid); + +static ssize_t nvmet_addr_trtype_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_transport_names); i++) { + if (port->disc_addr.trtype != nvmet_transport_names[i].type) + continue; + return sprintf(page, "%s\n", nvmet_transport_names[i].name); + } + + return sprintf(page, "\n"); +} + +static void nvmet_port_init_tsas_rdma(struct nvmet_port *port) +{ + port->disc_addr.tsas.rdma.qptype = NVMF_RDMA_QPTYPE_CONNECTED; + port->disc_addr.tsas.rdma.prtype = NVMF_RDMA_PRTYPE_NOT_SPECIFIED; + port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM; +} + +static ssize_t nvmet_addr_trtype_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + int i; + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + for (i = 0; i < ARRAY_SIZE(nvmet_transport_names); i++) { + if (sysfs_streq(page, nvmet_transport_names[i].name)) + goto found; + } + + pr_err("Invalid value '%s' for trtype\n", page); + return -EINVAL; +found: + memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE); + port->disc_addr.trtype = nvmet_transport_names[i].type; + if (port->disc_addr.trtype == NVMF_TRTYPE_RDMA) + nvmet_port_init_tsas_rdma(port); + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_trtype); + +/* + * Namespace structures & file operation functions below + */ +static ssize_t nvmet_ns_device_path_show(struct config_item *item, char *page) +{ + return sprintf(page, "%s\n", to_nvmet_ns(item)->device_path); +} + +static ssize_t nvmet_ns_device_path_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + int ret; + + mutex_lock(&subsys->lock); + ret = -EBUSY; + if (ns->enabled) + goto out_unlock; + + kfree(ns->device_path); + + ret = -ENOMEM; + ns->device_path = kstrdup(page, GFP_KERNEL); + if (!ns->device_path) + goto out_unlock; + + mutex_unlock(&subsys->lock); + return count; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} + +CONFIGFS_ATTR(nvmet_ns_, device_path); + +static ssize_t nvmet_ns_device_uuid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->uuid); +} + +static ssize_t nvmet_ns_device_uuid_store(struct config_item *item, + const char *page, size_t count) 
+{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + int ret = 0; + + + mutex_lock(&subsys->lock); + if (ns->enabled) { + ret = -EBUSY; + goto out_unlock; + } + + + if (uuid_parse(page, &ns->uuid)) + ret = -EINVAL; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, device_uuid); + +static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid); +} + +static ssize_t nvmet_ns_device_nguid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + u8 nguid[16]; + const char *p = page; + int i; + int ret = 0; + + mutex_lock(&subsys->lock); + if (ns->enabled) { + ret = -EBUSY; + goto out_unlock; + } + + for (i = 0; i < 16; i++) { + if (p + 2 > page + count) { + ret = -EINVAL; + goto out_unlock; + } + if (!isxdigit(p[0]) || !isxdigit(p[1])) { + ret = -EINVAL; + goto out_unlock; + } + + nguid[i] = (hex_to_bin(p[0]) << 4) | hex_to_bin(p[1]); + p += 2; + + if (*p == '-' || *p == ':') + p++; + } + + memcpy(&ns->nguid, nguid, sizeof(nguid)); +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, device_nguid); + +static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled); +} + +static ssize_t nvmet_ns_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool enable; + int ret = 0; + + if (strtobool(page, &enable)) + return -EINVAL; + + if (enable) + ret = nvmet_ns_enable(ns); + else + nvmet_ns_disable(ns); + + return ret ? 
ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, enable); + +static struct configfs_attribute *nvmet_ns_attrs[] = { + &nvmet_ns_attr_device_path, + &nvmet_ns_attr_device_nguid, + &nvmet_ns_attr_device_uuid, + &nvmet_ns_attr_enable, + NULL, +}; + +static void nvmet_ns_release(struct config_item *item) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + nvmet_ns_free(ns); +} + +static struct configfs_item_operations nvmet_ns_item_ops = { + .release = nvmet_ns_release, +}; + +static const struct config_item_type nvmet_ns_type = { + .ct_item_ops = &nvmet_ns_item_ops, + .ct_attrs = nvmet_ns_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ns_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys = namespaces_to_subsys(&group->cg_item); + struct nvmet_ns *ns; + int ret; + u32 nsid; + + ret = kstrtou32(name, 0, &nsid); + if (ret) + goto out; + + ret = -EINVAL; + if (nsid == 0 || nsid == NVME_NSID_ALL) + goto out; + + ret = -ENOMEM; + ns = nvmet_ns_alloc(subsys, nsid); + if (!ns) + goto out; + config_group_init_type_name(&ns->group, name, &nvmet_ns_type); + + pr_info("adding nsid %d to subsystem %s\n", nsid, subsys->subsysnqn); + + return &ns->group; +out: + return ERR_PTR(ret); +} + +static struct configfs_group_operations nvmet_namespaces_group_ops = { + .make_group = nvmet_ns_make, +}; + +static const struct config_item_type nvmet_namespaces_type = { + .ct_group_ops = &nvmet_namespaces_group_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_port_subsys_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys; + struct nvmet_subsys_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_subsys_type) { + pr_err("can only link subsystems into the subsystems dir.!\n"); + return -EINVAL; + } + subsys = to_subsys(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->subsys = subsys; + + down_write(&nvmet_config_sem); + ret = -EEXIST; + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto out_free_link; + } + + if (list_empty(&port->subsystems)) { + ret = nvmet_enable_port(port); + if (ret) + goto out_free_link; + } + + list_add_tail(&link->entry, &port->subsystems); + nvmet_genctr++; + up_write(&nvmet_config_sem); + return 0; + +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static void nvmet_port_subsys_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys = to_subsys(target); + struct nvmet_subsys_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto found; + } + up_write(&nvmet_config_sem); + return; + +found: + list_del(&p->entry); + nvmet_genctr++; + if (list_empty(&port->subsystems)) + nvmet_disable_port(port); + up_write(&nvmet_config_sem); + kfree(p); +} + +static struct configfs_item_operations nvmet_port_subsys_item_ops = { + .allow_link = nvmet_port_subsys_allow_link, + .drop_link = nvmet_port_subsys_drop_link, +}; + +static const struct config_item_type nvmet_port_subsys_type = { + .ct_item_ops = &nvmet_port_subsys_item_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_allowed_hosts_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct 
nvmet_host *host; + struct nvmet_host_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_host_type) { + pr_err("can only link hosts into the allowed_hosts directory!\n"); + return -EINVAL; + } + + host = to_host(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->host = host; + + down_write(&nvmet_config_sem); + ret = -EINVAL; + if (subsys->allow_any_host) { + pr_err("can't add hosts when allow_any_host is set!\n"); + goto out_free_link; + } + + ret = -EEXIST; + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto out_free_link; + } + list_add_tail(&link->entry, &subsys->hosts); + nvmet_genctr++; + up_write(&nvmet_config_sem); + return 0; +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static void nvmet_allowed_hosts_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host = to_host(target); + struct nvmet_host_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto found; + } + up_write(&nvmet_config_sem); + return; + +found: + list_del(&p->entry); + nvmet_genctr++; + up_write(&nvmet_config_sem); + kfree(p); +} + +static struct configfs_item_operations nvmet_allowed_hosts_item_ops = { + .allow_link = nvmet_allowed_hosts_allow_link, + .drop_link = nvmet_allowed_hosts_drop_link, +}; + +static const struct config_item_type nvmet_allowed_hosts_type = { + .ct_item_ops = &nvmet_allowed_hosts_item_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_subsys_attr_allow_any_host_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + to_subsys(item)->allow_any_host); +} + +static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + bool allow_any_host; + int ret = 0; + + if (strtobool(page, &allow_any_host)) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (allow_any_host && !list_empty(&subsys->hosts)) { + pr_err("Can't set allow_any_host when explicit hosts are set!\n"); + ret = -EINVAL; + goto out_unlock; + } + + subsys->allow_any_host = allow_any_host; +out_unlock: + up_write(&nvmet_config_sem); + return ret ? 
ret : count; +} + +CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host); + +static ssize_t nvmet_subsys_attr_version_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + if (NVME_TERTIARY(subsys->ver)) + return snprintf(page, PAGE_SIZE, "%d.%d.%d\n", + (int)NVME_MAJOR(subsys->ver), + (int)NVME_MINOR(subsys->ver), + (int)NVME_TERTIARY(subsys->ver)); + else + return snprintf(page, PAGE_SIZE, "%d.%d\n", + (int)NVME_MAJOR(subsys->ver), + (int)NVME_MINOR(subsys->ver)); +} + +static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + int major, minor, tertiary = 0; + int ret; + + + ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); + if (ret != 2 && ret != 3) + return -EINVAL; + + down_write(&nvmet_config_sem); + subsys->ver = NVME_VS(major, minor, tertiary); + up_write(&nvmet_config_sem); + + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_version); + +static ssize_t nvmet_subsys_attr_serial_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + return snprintf(page, PAGE_SIZE, "%llx\n", subsys->serial); +} + +static ssize_t nvmet_subsys_attr_serial_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + down_write(&nvmet_config_sem); + sscanf(page, "%llx\n", &subsys->serial); + up_write(&nvmet_config_sem); + + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_serial); + +static struct configfs_attribute *nvmet_subsys_attrs[] = { + &nvmet_subsys_attr_attr_allow_any_host, + &nvmet_subsys_attr_attr_version, + &nvmet_subsys_attr_attr_serial, + NULL, +}; + +/* + * Subsystem structures & folder operation functions below + */ +static void nvmet_subsys_release(struct config_item *item) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + nvmet_subsys_del_ctrls(subsys); + nvmet_subsys_put(subsys); +} + +static struct configfs_item_operations nvmet_subsys_item_ops = { + .release = nvmet_subsys_release, +}; + +static const struct config_item_type nvmet_subsys_type = { + .ct_item_ops = &nvmet_subsys_item_ops, + .ct_attrs = nvmet_subsys_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_subsys_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys; + + if (sysfs_streq(name, NVME_DISC_SUBSYS_NAME)) { + pr_err("can't create discovery subsystem through configfs\n"); + return ERR_PTR(-EINVAL); + } + + subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME); + if (!subsys) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&subsys->group, name, &nvmet_subsys_type); + + config_group_init_type_name(&subsys->namespaces_group, + "namespaces", &nvmet_namespaces_type); + configfs_add_default_group(&subsys->namespaces_group, &subsys->group); + + config_group_init_type_name(&subsys->allowed_hosts_group, + "allowed_hosts", &nvmet_allowed_hosts_type); + configfs_add_default_group(&subsys->allowed_hosts_group, + &subsys->group); + + return &subsys->group; +} + +static struct configfs_group_operations nvmet_subsystems_group_ops = { + .make_group = nvmet_subsys_make, +}; + +static const struct config_item_type nvmet_subsystems_type = { + .ct_group_ops = &nvmet_subsystems_group_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_referral_enable_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", to_nvmet_port(item)->enabled); +} + +static ssize_t 
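/*
 * Illustrative sketch, not part of the patch: the attr_version store
 * above packs "MAJ.MIN" or "MAJ.MIN.TER" with NVME_VS(). Judging from
 * the NVME_MAJOR/NVME_MINOR/NVME_TERTIARY accessors used by the show
 * handler, the packing puts major in bits 31:16, minor in 15:8 and
 * tertiary in 7:0, so writing "1.3" stores 0x00010300 and reads back as
 * "1.3". The EX_* macros below mirror that assumption for a standalone
 * demo.
 */
#include <stdio.h>

#define EX_NVME_VS(maj, min, ter)	(((maj) << 16) | ((min) << 8) | (ter))
#define EX_NVME_MAJOR(ver)		((ver) >> 16)
#define EX_NVME_MINOR(ver)		(((ver) >> 8) & 0xff)
#define EX_NVME_TERTIARY(ver)		((ver) & 0xff)

int main(void)
{
	unsigned int ver = EX_NVME_VS(1, 3, 0);

	printf("packed: 0x%08x -> %u.%u.%u\n", ver, EX_NVME_MAJOR(ver),
	       EX_NVME_MINOR(ver), EX_NVME_TERTIARY(ver));
	return 0;
}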
nvmet_referral_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); + struct nvmet_port *port = to_nvmet_port(item); + bool enable; + + if (strtobool(page, &enable)) + goto inval; + + if (enable) + nvmet_referral_enable(parent, port); + else + nvmet_referral_disable(port); + + return count; +inval: + pr_err("Invalid value '%s' for enable\n", page); + return -EINVAL; +} + +CONFIGFS_ATTR(nvmet_referral_, enable); + +/* + * Discovery Service subsystem definitions + */ +static struct configfs_attribute *nvmet_referral_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_portid, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + &nvmet_referral_attr_enable, + NULL, +}; + +static void nvmet_referral_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + nvmet_referral_disable(port); + kfree(port); +} + +static struct configfs_item_operations nvmet_referral_item_ops = { + .release = nvmet_referral_release, +}; + +static const struct config_item_type nvmet_referral_type = { + .ct_owner = THIS_MODULE, + .ct_attrs = nvmet_referral_attrs, + .ct_item_ops = &nvmet_referral_item_ops, +}; + +static struct config_group *nvmet_referral_make( + struct config_group *group, const char *name) +{ + struct nvmet_port *port; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + config_group_init_type_name(&port->group, name, &nvmet_referral_type); + + return &port->group; +} + +static struct configfs_group_operations nvmet_referral_group_ops = { + .make_group = nvmet_referral_make, +}; + +static const struct config_item_type nvmet_referrals_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = &nvmet_referral_group_ops, +}; + +/* + * Ports definitions. 
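/*
 * Illustrative sketch, not part of the patch: the configfs item types
 * above are driven entirely from user space by creating directories,
 * writing attribute files and creating symlinks. Assuming configfs is
 * mounted at its usual /sys/kernel/config location, a minimal setup of
 * one subsystem, one namespace and one port could look like the
 * following (the NQN "testnqn", the backing device /dev/nvme0n1 and
 * port id 1 are placeholder values for this sketch):
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int wr(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = write(fd, val, strlen(val)) < 0 ? -1 : 0;
	close(fd);
	return ret;
}

int main(void)
{
	const char *root = "/sys/kernel/config/nvmet";
	char p[256], target[256];

	/* Create a subsystem and let any host connect to it. */
	snprintf(p, sizeof(p), "%s/subsystems/testnqn", root);
	mkdir(p, 0755);
	snprintf(p, sizeof(p), "%s/subsystems/testnqn/attr_allow_any_host", root);
	wr(p, "1");

	/* Create namespace 1, point it at a block device and enable it. */
	snprintf(p, sizeof(p), "%s/subsystems/testnqn/namespaces/1", root);
	mkdir(p, 0755);
	snprintf(p, sizeof(p), "%s/subsystems/testnqn/namespaces/1/device_path", root);
	wr(p, "/dev/nvme0n1");
	snprintf(p, sizeof(p), "%s/subsystems/testnqn/namespaces/1/enable", root);
	wr(p, "1");

	/*
	 * Create port 1 (addr_trtype/addr_traddr/addr_trsvcid/addr_adrfam
	 * would be written here too) and expose the subsystem through it;
	 * the symlink is what invokes nvmet_port_subsys_allow_link().
	 */
	snprintf(p, sizeof(p), "%s/ports/1", root);
	mkdir(p, 0755);
	snprintf(target, sizeof(target), "%s/subsystems/testnqn", root);
	snprintf(p, sizeof(p), "%s/ports/1/subsystems/testnqn", root);
	symlink(target, p);
	return 0;
}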
+ */ +static void nvmet_port_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + kfree(port); +} + +static struct configfs_attribute *nvmet_port_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + NULL, +}; + +static struct configfs_item_operations nvmet_port_item_ops = { + .release = nvmet_port_release, +}; + +static const struct config_item_type nvmet_port_type = { + .ct_attrs = nvmet_port_attrs, + .ct_item_ops = &nvmet_port_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ports_make(struct config_group *group, + const char *name) +{ + struct nvmet_port *port; + u16 portid; + + if (kstrtou16(name, 0, &portid)) + return ERR_PTR(-EINVAL); + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + INIT_LIST_HEAD(&port->subsystems); + INIT_LIST_HEAD(&port->referrals); + + port->disc_addr.portid = cpu_to_le16(portid); + config_group_init_type_name(&port->group, name, &nvmet_port_type); + + config_group_init_type_name(&port->subsys_group, + "subsystems", &nvmet_port_subsys_type); + configfs_add_default_group(&port->subsys_group, &port->group); + + config_group_init_type_name(&port->referrals_group, + "referrals", &nvmet_referrals_type); + configfs_add_default_group(&port->referrals_group, &port->group); + + return &port->group; +} + +static struct configfs_group_operations nvmet_ports_group_ops = { + .make_group = nvmet_ports_make, +}; + +static const struct config_item_type nvmet_ports_type = { + .ct_group_ops = &nvmet_ports_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_subsystems_group; +static struct config_group nvmet_ports_group; + +static void nvmet_host_release(struct config_item *item) +{ + struct nvmet_host *host = to_host(item); + + kfree(host); +} + +static struct configfs_item_operations nvmet_host_item_ops = { + .release = nvmet_host_release, +}; + +static const struct config_item_type nvmet_host_type = { + .ct_item_ops = &nvmet_host_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_hosts_make_group(struct config_group *group, + const char *name) +{ + struct nvmet_host *host; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&host->group, name, &nvmet_host_type); + + return &host->group; +} + +static struct configfs_group_operations nvmet_hosts_group_ops = { + .make_group = nvmet_hosts_make_group, +}; + +static const struct config_item_type nvmet_hosts_type = { + .ct_group_ops = &nvmet_hosts_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_hosts_group; + +static const struct config_item_type nvmet_root_type = { + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem nvmet_configfs_subsystem = { + .su_group = { + .cg_item = { + .ci_namebuf = "nvmet", + .ci_type = &nvmet_root_type, + }, + }, +}; + +int __init nvmet_init_configfs(void) +{ + int ret; + + config_group_init(&nvmet_configfs_subsystem.su_group); + mutex_init(&nvmet_configfs_subsystem.su_mutex); + + config_group_init_type_name(&nvmet_subsystems_group, + "subsystems", &nvmet_subsystems_type); + configfs_add_default_group(&nvmet_subsystems_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_ports_group, + "ports", &nvmet_ports_type); + configfs_add_default_group(&nvmet_ports_group, + 
&nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_hosts_group, + "hosts", &nvmet_hosts_type); + configfs_add_default_group(&nvmet_hosts_group, + &nvmet_configfs_subsystem.su_group); + + ret = configfs_register_subsystem(&nvmet_configfs_subsystem); + if (ret) { + pr_err("configfs_register_subsystem: %d\n", ret); + return ret; + } + + return 0; +} + +void __exit nvmet_exit_configfs(void) +{ + configfs_unregister_subsystem(&nvmet_configfs_subsystem); +} diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c new file mode 100644 index 0000000..e95424f --- /dev/null +++ b/drivers/nvme/target/core.c @@ -0,0 +1,1062 @@ +/* + * Common code for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include + +#include "nvmet.h" + +static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; +static DEFINE_IDA(cntlid_ida); + +/* + * This read/write semaphore is used to synchronize access to configuration + * information on a target system that will result in discovery log page + * information change for at least one host. + * The full list of resources to protected by this semaphore is: + * + * - subsystems list + * - per-subsystem allowed hosts list + * - allow_any_host subsystem attribute + * - nvmet_genctr + * - the nvmet_transports array + * + * When updating any of those lists/structures write lock should be obtained, + * while when reading (popolating discovery log page or checking host-subsystem + * link) read lock is obtained to allow concurrent reads. 
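/*
 * Illustrative sketch, not part of the patch: a toy user-space model of
 * the locking convention this comment describes. Anything that changes
 * discovery-relevant configuration takes the semaphore for writing and
 * bumps the generation counter; building a discovery log page only takes
 * it for reading, so concurrent readers are fine. All names below are
 * invented for the sketch.
 */
#include <pthread.h>

static pthread_rwlock_t ex_config_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long long ex_genctr;

static void ex_link_subsys_to_port(void)
{
	pthread_rwlock_wrlock(&ex_config_lock);
	/* ... add the subsystem to the port's list ... */
	ex_genctr++;
	pthread_rwlock_unlock(&ex_config_lock);
}

static unsigned long long ex_build_disc_log_page(void)
{
	unsigned long long genctr;

	pthread_rwlock_rdlock(&ex_config_lock);
	/* ... walk subsystems and referrals, format entries ... */
	genctr = ex_genctr;
	pthread_rwlock_unlock(&ex_config_lock);
	return genctr;
}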
+ */ +DECLARE_RWSEM(nvmet_config_sem); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len) +{ + if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) +{ + if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + +static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys) +{ + struct nvmet_ns *ns; + + if (list_empty(&subsys->namespaces)) + return 0; + + ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link); + return ns->nsid; +} + +static u32 nvmet_async_event_result(struct nvmet_async_event *aen) +{ + return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); +} + +static void nvmet_async_events_free(struct nvmet_ctrl *ctrl) +{ + struct nvmet_req *req; + + while (1) { + mutex_lock(&ctrl->lock); + if (!ctrl->nr_async_event_cmds) { + mutex_unlock(&ctrl->lock); + return; + } + + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR); + } +} + +static void nvmet_async_event_work(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, async_event_work); + struct nvmet_async_event *aen; + struct nvmet_req *req; + + while (1) { + mutex_lock(&ctrl->lock); + aen = list_first_entry_or_null(&ctrl->async_events, + struct nvmet_async_event, entry); + if (!aen || !ctrl->nr_async_event_cmds) { + mutex_unlock(&ctrl->lock); + return; + } + + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + nvmet_set_result(req, nvmet_async_event_result(aen)); + + list_del(&aen->entry); + kfree(aen); + + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, 0); + } +} + +static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, + u8 event_info, u8 log_page) +{ + struct nvmet_async_event *aen; + + aen = kmalloc(sizeof(*aen), GFP_KERNEL); + if (!aen) + return; + + aen->event_type = event_type; + aen->event_info = event_info; + aen->log_page = log_page; + + mutex_lock(&ctrl->lock); + list_add_tail(&aen->entry, &ctrl->async_events); + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +int nvmet_register_transport(const struct nvmet_fabrics_ops *ops) +{ + int ret = 0; + + down_write(&nvmet_config_sem); + if (nvmet_transports[ops->type]) + ret = -EINVAL; + else + nvmet_transports[ops->type] = ops; + up_write(&nvmet_config_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmet_register_transport); + +void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops) +{ + down_write(&nvmet_config_sem); + nvmet_transports[ops->type] = NULL; + up_write(&nvmet_config_sem); +} +EXPORT_SYMBOL_GPL(nvmet_unregister_transport); + +int nvmet_enable_port(struct nvmet_port *port) +{ + const struct nvmet_fabrics_ops *ops; + int ret; + + lockdep_assert_held(&nvmet_config_sem); + + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + up_write(&nvmet_config_sem); + request_module("nvmet-transport-%d", port->disc_addr.trtype); + down_write(&nvmet_config_sem); + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + pr_err("transport type %d not supported\n", + port->disc_addr.trtype); + return -EINVAL; + } + } + + if 
(!try_module_get(ops->owner)) + return -EINVAL; + + ret = ops->add_port(port); + if (ret) { + module_put(ops->owner); + return ret; + } + + port->enabled = true; + return 0; +} + +void nvmet_disable_port(struct nvmet_port *port) +{ + const struct nvmet_fabrics_ops *ops; + + lockdep_assert_held(&nvmet_config_sem); + + port->enabled = false; + + ops = nvmet_transports[port->disc_addr.trtype]; + ops->remove_port(port); + module_put(ops->owner); +} + +static void nvmet_keep_alive_timer(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvmet_ctrl, ka_work); + + pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n", + ctrl->cntlid, ctrl->kato); + + nvmet_ctrl_fatal_error(ctrl); +} + +static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + pr_debug("ctrl %d start keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); + + cancel_delayed_work_sync(&ctrl->ka_work); +} + +static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl, + __le32 nsid) +{ + struct nvmet_ns *ns; + + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + if (ns->nsid == le32_to_cpu(nsid)) + return ns; + } + + return NULL; +} + +struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid) +{ + struct nvmet_ns *ns; + + rcu_read_lock(); + ns = __nvmet_find_namespace(ctrl, nsid); + if (ns) + percpu_ref_get(&ns->ref); + rcu_read_unlock(); + + return ns; +} + +static void nvmet_destroy_namespace(struct percpu_ref *ref) +{ + struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref); + + complete(&ns->disable_done); +} + +void nvmet_put_namespace(struct nvmet_ns *ns) +{ + percpu_ref_put(&ns->ref); +} + +int nvmet_ns_enable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + int ret = 0; + + mutex_lock(&subsys->lock); + if (ns->enabled) + goto out_unlock; + + ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE, + NULL); + if (IS_ERR(ns->bdev)) { + pr_err("failed to open block device %s: (%ld)\n", + ns->device_path, PTR_ERR(ns->bdev)); + ret = PTR_ERR(ns->bdev); + ns->bdev = NULL; + goto out_unlock; + } + + ns->size = i_size_read(ns->bdev->bd_inode); + ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); + + ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, + 0, GFP_KERNEL); + if (ret) + goto out_blkdev_put; + + if (ns->nsid > subsys->max_nsid) + subsys->max_nsid = ns->nsid; + + /* + * The namespaces list needs to be sorted to simplify the implementation + * of the Identify Namepace List subcommand. 
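/*
 * Illustrative sketch, not part of the patch: in the insertion loop just
 * below, list_add_tail(&ns->dev_link, &old->dev_link) links the new
 * namespace immediately *before* the first existing entry with a larger
 * nsid; if the loop runs to completion, &old->dev_link ends up being the
 * list head itself, so the namespace is appended at the tail. Either way
 * the list stays sorted by ascending nsid. The same ordered insert on a
 * plain circular doubly linked list (invented types, not kernel list.h):
 */
struct ex_node {
	unsigned int nsid;
	struct ex_node *prev, *next;
};

/* head is a sentinel whose nsid is ignored; the list is kept sorted. */
static void ex_insert_sorted(struct ex_node *head, struct ex_node *new)
{
	struct ex_node *pos = head->next;

	/* find the first entry with a larger nsid (or wrap back to head) */
	while (pos != head && pos->nsid < new->nsid)
		pos = pos->next;

	/* link 'new' immediately before 'pos' -- what list_add_tail() does */
	new->prev = pos->prev;
	new->next = pos;
	pos->prev->next = new;
	pos->prev = new;
}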
+ */ + if (list_empty(&subsys->namespaces)) { + list_add_tail_rcu(&ns->dev_link, &subsys->namespaces); + } else { + struct nvmet_ns *old; + + list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) { + BUG_ON(ns->nsid == old->nsid); + if (ns->nsid < old->nsid) + break; + } + + list_add_tail_rcu(&ns->dev_link, &old->dev_link); + } + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); + + ns->enabled = true; + ret = 0; +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +out_blkdev_put: + blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); + ns->bdev = NULL; + goto out_unlock; +} + +void nvmet_ns_disable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + if (!ns->enabled) + goto out_unlock; + + ns->enabled = false; + list_del_rcu(&ns->dev_link); + if (ns->nsid == subsys->max_nsid) + subsys->max_nsid = nvmet_max_nsid(subsys); + mutex_unlock(&subsys->lock); + + /* + * Now that we removed the namespaces from the lookup list, we + * can kill the per_cpu ref and wait for any remaining references + * to be dropped, as well as a RCU grace period for anyone only + * using the namepace under rcu_read_lock(). Note that we can't + * use call_rcu here as we need to ensure the namespaces have + * been fully destroyed before unloading the module. + */ + percpu_ref_kill(&ns->ref); + synchronize_rcu(); + wait_for_completion(&ns->disable_done); + percpu_ref_exit(&ns->ref); + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); + + if (ns->bdev) + blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); +out_unlock: + mutex_unlock(&subsys->lock); +} + +void nvmet_ns_free(struct nvmet_ns *ns) +{ + nvmet_ns_disable(ns); + + kfree(ns->device_path); + kfree(ns); +} + +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) +{ + struct nvmet_ns *ns; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return NULL; + + INIT_LIST_HEAD(&ns->dev_link); + init_completion(&ns->disable_done); + + ns->nsid = nsid; + ns->subsys = subsys; + uuid_gen(&ns->uuid); + + return ns; +} + +static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + u32 old_sqhd, new_sqhd; + u16 sqhd; + + if (status) + nvmet_set_status(req, status); + + if (req->sq->size) { + do { + old_sqhd = req->sq->sqhd; + new_sqhd = (old_sqhd + 1) % req->sq->size; + } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) != + old_sqhd); + } + sqhd = req->sq->sqhd & 0x0000FFFF; + req->rsp->sq_head = cpu_to_le16(sqhd); + req->rsp->sq_id = cpu_to_le16(req->sq->qid); + req->rsp->command_id = req->cmd->common.command_id; + + if (req->ns) + nvmet_put_namespace(req->ns); + req->ops->queue_response(req); +} + +void nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + __nvmet_req_complete(req, status); + percpu_ref_put(&req->sq->ref); +} +EXPORT_SYMBOL_GPL(nvmet_req_complete); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, + u16 qid, u16 size) +{ + cq->qid = qid; + cq->size = size; + + ctrl->cqs[qid] = cq; +} + +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, + u16 qid, u16 size) +{ + sq->sqhd = 0; + sq->qid = qid; + sq->size = size; + + ctrl->sqs[qid] = sq; +} + +static void nvmet_confirm_sq(struct percpu_ref *ref) +{ + struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref); + + complete(&sq->confirm_done); +} + +void nvmet_sq_destroy(struct nvmet_sq *sq) +{ + /* + 
* If this is the admin queue, complete all AERs so that our + * queue doesn't have outstanding requests on it. + */ + if (sq->ctrl && sq->ctrl->sqs && sq->ctrl->sqs[0] == sq) + nvmet_async_events_free(sq->ctrl); + percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq); + wait_for_completion(&sq->confirm_done); + wait_for_completion(&sq->free_done); + percpu_ref_exit(&sq->ref); + + if (sq->ctrl) { + nvmet_ctrl_put(sq->ctrl); + sq->ctrl = NULL; /* allows reusing the queue later */ + } +} +EXPORT_SYMBOL_GPL(nvmet_sq_destroy); + +static void nvmet_sq_free(struct percpu_ref *ref) +{ + struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref); + + complete(&sq->free_done); +} + +int nvmet_sq_init(struct nvmet_sq *sq) +{ + int ret; + + ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL); + if (ret) { + pr_err("percpu_ref init failed!\n"); + return ret; + } + init_completion(&sq->free_done); + init_completion(&sq->confirm_done); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmet_sq_init); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops) +{ + u8 flags = req->cmd->common.flags; + u16 status; + + req->cq = cq; + req->sq = sq; + req->ops = ops; + req->sg = NULL; + req->sg_cnt = 0; + req->transfer_len = 0; + req->rsp->status = 0; + req->ns = NULL; + + /* no support for fused commands yet */ + if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + /* + * For fabrics, PSDT field shall describe metadata pointer (MPTR) that + * contains an address of a single contiguous physical buffer that is + * byte aligned. + */ + if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + if (unlikely(!req->sq->ctrl)) + /* will return an error for any Non-connect command: */ + status = nvmet_parse_connect_cmd(req); + else if (likely(req->sq->qid != 0)) + status = nvmet_parse_io_cmd(req); + else if (req->cmd->common.opcode == nvme_fabrics_command) + status = nvmet_parse_fabrics_cmd(req); + else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC) + status = nvmet_parse_discovery_cmd(req); + else + status = nvmet_parse_admin_cmd(req); + + if (status) + goto fail; + + if (unlikely(!percpu_ref_tryget_live(&sq->ref))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + return true; + +fail: + __nvmet_req_complete(req, status); + return false; +} +EXPORT_SYMBOL_GPL(nvmet_req_init); + +void nvmet_req_uninit(struct nvmet_req *req) +{ + percpu_ref_put(&req->sq->ref); + if (req->ns) + nvmet_put_namespace(req->ns); +} +EXPORT_SYMBOL_GPL(nvmet_req_uninit); + +void nvmet_req_execute(struct nvmet_req *req) +{ + if (unlikely(req->data_len != req->transfer_len)) + nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); + else + req->execute(req); +} +EXPORT_SYMBOL_GPL(nvmet_req_execute); + +static inline bool nvmet_cc_en(u32 cc) +{ + return (cc >> NVME_CC_EN_SHIFT) & 0x1; +} + +static inline u8 nvmet_cc_css(u32 cc) +{ + return (cc >> NVME_CC_CSS_SHIFT) & 0x7; +} + +static inline u8 nvmet_cc_mps(u32 cc) +{ + return (cc >> NVME_CC_MPS_SHIFT) & 0xf; +} + +static inline u8 nvmet_cc_ams(u32 cc) +{ + return (cc >> NVME_CC_AMS_SHIFT) & 0x7; +} + +static inline u8 nvmet_cc_shn(u32 cc) +{ + return (cc >> NVME_CC_SHN_SHIFT) & 0x3; +} + +static inline u8 nvmet_cc_iosqes(u32 cc) +{ + return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf; +} + +static inline u8 nvmet_cc_iocqes(u32 cc) +{ + 
return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf; +} + +static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES || + nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES || + nvmet_cc_mps(ctrl->cc) != 0 || + nvmet_cc_ams(ctrl->cc) != 0 || + nvmet_cc_css(ctrl->cc) != 0) { + ctrl->csts = NVME_CSTS_CFS; + return; + } + + ctrl->csts = NVME_CSTS_RDY; +} + +static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + /* XXX: tear down queues? */ + ctrl->csts &= ~NVME_CSTS_RDY; + ctrl->cc = 0; +} + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new) +{ + u32 old; + + mutex_lock(&ctrl->lock); + old = ctrl->cc; + ctrl->cc = new; + + if (nvmet_cc_en(new) && !nvmet_cc_en(old)) + nvmet_start_ctrl(ctrl); + if (!nvmet_cc_en(new) && nvmet_cc_en(old)) + nvmet_clear_ctrl(ctrl); + if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) { + nvmet_clear_ctrl(ctrl); + ctrl->csts |= NVME_CSTS_SHST_CMPLT; + } + if (!nvmet_cc_shn(new) && nvmet_cc_shn(old)) + ctrl->csts &= ~NVME_CSTS_SHST_CMPLT; + mutex_unlock(&ctrl->lock); +} + +static void nvmet_init_cap(struct nvmet_ctrl *ctrl) +{ + /* command sets supported: NVMe command set: */ + ctrl->cap = (1ULL << 37); + /* CC.EN timeout in 500msec units: */ + ctrl->cap |= (15ULL << 24); + /* maximum queue entries supported: */ + ctrl->cap |= NVMET_QUEUE_SIZE - 1; +} + +u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, + struct nvmet_req *req, struct nvmet_ctrl **ret) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + u16 status = 0; + + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->cntlid == cntlid) { + if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) { + pr_warn("hostnqn mismatch.\n"); + continue; + } + if (!kref_get_unless_zero(&ctrl->ref)) + continue; + + *ret = ctrl; + goto out; + } + } + + pr_warn("could not find controller %d for subsys %s / host %s\n", + cntlid, subsysnqn, hostnqn); + req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + +out: + mutex_unlock(&subsys->lock); + nvmet_subsys_put(subsys); + return status; +} + +u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd) +{ + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("got io cmd %d while CC.EN == 0 on qid = %d\n", + cmd->common.opcode, req->sq->qid); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("got io cmd %d while CSTS.RDY == 0 on qid = %d\n", + cmd->common.opcode, req->sq->qid); + req->ns = NULL; + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + return 0; +} + +static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, + const char *hostnqn) +{ + struct nvmet_host_link *p; + + if (subsys->allow_any_host) + return true; + + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), hostnqn)) + return true; + } + + return false; +} + +static bool nvmet_host_discovery_allowed(struct nvmet_req *req, + const char *hostnqn) +{ + struct nvmet_subsys_link *s; + + list_for_each_entry(s, &req->port->subsystems, entry) { + if 
(__nvmet_host_allowed(s->subsys, hostnqn)) + return true; + } + + return false; +} + +bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, + const char *hostnqn) +{ + lockdep_assert_held(&nvmet_config_sem); + + if (subsys->type == NVME_NQN_DISC) + return nvmet_host_discovery_allowed(req, hostnqn); + else + return __nvmet_host_allowed(subsys, hostnqn); +} + +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + int ret; + u16 status; + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn); + goto out; + } + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + down_read(&nvmet_config_sem); + if (!nvmet_host_allowed(req, subsys, hostnqn)) { + pr_info("connect by host %s for subsystem %s not allowed\n", + hostnqn, subsysnqn); + req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); + up_read(&nvmet_config_sem); + status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR; + goto out_put_subsystem; + } + up_read(&nvmet_config_sem); + + status = NVME_SC_INTERNAL; + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + goto out_put_subsystem; + mutex_init(&ctrl->lock); + + nvmet_init_cap(ctrl); + + INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); + INIT_LIST_HEAD(&ctrl->async_events); + + memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); + memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); + + kref_init(&ctrl->ref); + ctrl->subsys = subsys; + + ctrl->cqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_cq *), + GFP_KERNEL); + if (!ctrl->cqs) + goto out_free_ctrl; + + ctrl->sqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_sq *), + GFP_KERNEL); + if (!ctrl->sqs) + goto out_free_cqs; + + ret = ida_simple_get(&cntlid_ida, + NVME_CNTLID_MIN, NVME_CNTLID_MAX, + GFP_KERNEL); + if (ret < 0) { + status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + goto out_free_sqs; + } + ctrl->cntlid = ret; + + ctrl->ops = req->ops; + if (ctrl->subsys->type == NVME_NQN_DISC) { + /* Don't accept keep-alive timeout for discovery controllers */ + if (kato) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out_remove_ida; + } + + /* + * Discovery controllers use some arbitrary high value in order + * to cleanup stale discovery sessions + * + * From the latest base diff RC: + * "The Keep Alive command is not supported by + * Discovery controllers. A transport may specify a + * fixed Discovery controller activity timeout value + * (e.g., 2 minutes). If no commands are received + * by a Discovery controller within that time + * period, the controller may perform the + * actions for Keep Alive Timer expiration". 
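/*
 * Illustrative sketch, not part of the patch: the Connect command
 * carries KATO in milliseconds and the target stores it in seconds,
 * rounded up (120000 -> 120, 1500 -> 2). Discovery controllers must
 * send KATO == 0 and use a fixed activity timeout instead. A tiny model
 * of that policy (EX_DISC_KATO stands in for NVMET_DISC_KATO; all names
 * are invented for the sketch):
 */
#define EX_DISC_KATO	120	/* seconds, stand-in for NVMET_DISC_KATO */

static int ex_kato_secs(unsigned int kato_ms, int is_discovery,
			unsigned int *kato_secs)
{
	if (is_discovery) {
		if (kato_ms)
			return -1;	/* rejected as NVME_SC_INVALID_FIELD above */
		*kato_secs = EX_DISC_KATO;
		return 0;
	}
	*kato_secs = (kato_ms + 999) / 1000;	/* DIV_ROUND_UP(kato, 1000) */
	return 0;
}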
+ */ + ctrl->kato = NVMET_DISC_KATO; + } else { + /* keep-alive timeout in seconds */ + ctrl->kato = DIV_ROUND_UP(kato, 1000); + } + nvmet_start_keep_alive_timer(ctrl); + + mutex_lock(&subsys->lock); + list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + mutex_unlock(&subsys->lock); + + *ctrlp = ctrl; + return 0; + +out_remove_ida: + ida_simple_remove(&cntlid_ida, ctrl->cntlid); +out_free_sqs: + kfree(ctrl->sqs); +out_free_cqs: + kfree(ctrl->cqs); +out_free_ctrl: + kfree(ctrl); +out_put_subsystem: + nvmet_subsys_put(subsys); +out: + return status; +} + +static void nvmet_ctrl_free(struct kref *ref) +{ + struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); + struct nvmet_subsys *subsys = ctrl->subsys; + + mutex_lock(&subsys->lock); + list_del(&ctrl->subsys_entry); + mutex_unlock(&subsys->lock); + + nvmet_stop_keep_alive_timer(ctrl); + + flush_work(&ctrl->async_event_work); + cancel_work_sync(&ctrl->fatal_err_work); + + ida_simple_remove(&cntlid_ida, ctrl->cntlid); + + kfree(ctrl->sqs); + kfree(ctrl->cqs); + kfree(ctrl); + + nvmet_subsys_put(subsys); +} + +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) +{ + kref_put(&ctrl->ref, nvmet_ctrl_free); +} + +static void nvmet_fatal_error_handler(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, fatal_err_work); + + pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid); + ctrl->ops->delete_ctrl(ctrl); +} + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) +{ + mutex_lock(&ctrl->lock); + if (!(ctrl->csts & NVME_CSTS_CFS)) { + ctrl->csts |= NVME_CSTS_CFS; + INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); + schedule_work(&ctrl->fatal_err_work); + } + mutex_unlock(&ctrl->lock); +} +EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn) +{ + struct nvmet_subsys_link *p; + + if (!port) + return NULL; + + if (!strncmp(NVME_DISC_SUBSYS_NAME, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&nvmet_disc_subsys->ref)) + return NULL; + return nvmet_disc_subsys; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (!strncmp(p->subsys->subsysnqn, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&p->subsys->ref)) + break; + up_read(&nvmet_config_sem); + return p->subsys; + } + } + up_read(&nvmet_config_sem); + return NULL; +} + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type) +{ + struct nvmet_subsys *subsys; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return NULL; + + subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */ + /* generate a random serial number as our controllers are ephemeral: */ + get_random_bytes(&subsys->serial, sizeof(subsys->serial)); + + switch (type) { + case NVME_NQN_NVME: + subsys->max_qid = NVMET_NR_QUEUES; + break; + case NVME_NQN_DISC: + subsys->max_qid = 0; + break; + default: + pr_err("%s: Unknown Subsystem type - %d\n", __func__, type); + kfree(subsys); + return NULL; + } + subsys->type = type; + subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE, + GFP_KERNEL); + if (!subsys->subsysnqn) { + kfree(subsys); + return NULL; + } + + kref_init(&subsys->ref); + + mutex_init(&subsys->lock); + INIT_LIST_HEAD(&subsys->namespaces); + INIT_LIST_HEAD(&subsys->ctrls); + INIT_LIST_HEAD(&subsys->hosts); + + return subsys; +} + +static void nvmet_subsys_free(struct kref *ref) +{ + struct nvmet_subsys *subsys = + container_of(ref, struct nvmet_subsys, 
ref); + + WARN_ON_ONCE(!list_empty(&subsys->namespaces)); + + kfree(subsys->subsysnqn); + kfree(subsys); +} + +void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys) +{ + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + ctrl->ops->delete_ctrl(ctrl); + mutex_unlock(&subsys->lock); +} + +void nvmet_subsys_put(struct nvmet_subsys *subsys) +{ + kref_put(&subsys->ref, nvmet_subsys_free); +} + +static int __init nvmet_init(void) +{ + int error; + + error = nvmet_init_discovery(); + if (error) + goto out; + + error = nvmet_init_configfs(); + if (error) + goto out_exit_discovery; + return 0; + +out_exit_discovery: + nvmet_exit_discovery(); +out: + return error; +} + +static void __exit nvmet_exit(void) +{ + nvmet_exit_configfs(); + nvmet_exit_discovery(); + ida_destroy(&cntlid_ida); + + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); +} + +module_init(nvmet_init); +module_exit(nvmet_exit); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c new file mode 100644 index 0000000..231e04e --- /dev/null +++ b/drivers/nvme/target/discovery.c @@ -0,0 +1,244 @@ +/* + * Discovery service for the NVMe over Fabrics target. + * Copyright (C) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include "nvmet.h" + +struct nvmet_subsys *nvmet_disc_subsys; + +u64 nvmet_genctr; + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) +{ + down_write(&nvmet_config_sem); + if (list_empty(&port->entry)) { + list_add_tail(&port->entry, &parent->referrals); + port->enabled = true; + nvmet_genctr++; + } + up_write(&nvmet_config_sem); +} + +void nvmet_referral_disable(struct nvmet_port *port) +{ + down_write(&nvmet_config_sem); + if (!list_empty(&port->entry)) { + port->enabled = false; + list_del_init(&port->entry); + nvmet_genctr++; + } + up_write(&nvmet_config_sem); +} + +static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, + struct nvmet_port *port, char *subsys_nqn, char *traddr, + u8 type, u32 numrec) +{ + struct nvmf_disc_rsp_page_entry *e = &hdr->entries[numrec]; + + e->trtype = port->disc_addr.trtype; + e->adrfam = port->disc_addr.adrfam; + e->treq = port->disc_addr.treq; + e->portid = port->disc_addr.portid; + /* we support only dynamic controllers */ + e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); + e->asqsz = cpu_to_le16(NVME_AQ_DEPTH); + e->subtype = type; + memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); + memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE); + memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE); + strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); +} + +/* + * nvmet_set_disc_traddr - set a correct discovery log entry traddr + * + * IP based transports (e.g RDMA) can listen on "any" ipv4/ipv6 addresses + * (INADDR_ANY or IN6ADDR_ANY_INIT). The discovery log page traddr reply + * must not contain that "any" IP address. 
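/*
 * Illustrative sketch, not part of the patch: per the BUILD_BUG_ON()s
 * above, both the discovery log page header and each log entry are 1024
 * bytes, so entry i starts at byte offset 1024 * (i + 1) and a host that
 * wants numrec records issues a Get Log Page transfer of
 * 1024 * (numrec + 1) bytes -- typically after first reading just the
 * header to learn numrec and genctr. Helper names below are invented.
 */
#include <stddef.h>

enum { EX_DISC_HDR_LEN = 1024, EX_DISC_ENTRY_LEN = 1024 };

static size_t ex_disc_entry_offset(unsigned int i)
{
	return EX_DISC_HDR_LEN + (size_t)i * EX_DISC_ENTRY_LEN;
}

static size_t ex_disc_log_len(unsigned int numrec)
{
	return EX_DISC_HDR_LEN + (size_t)numrec * EX_DISC_ENTRY_LEN;
}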
If the transport implements + * .disc_traddr, use it. this callback will set the discovery traddr + * from the req->port address in case the port in question listens + * "any" IP address. + */ +static void nvmet_set_disc_traddr(struct nvmet_req *req, struct nvmet_port *port, + char *traddr) +{ + if (req->ops->disc_traddr) + req->ops->disc_traddr(req, port, traddr); + else + memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); +} + +static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) +{ + const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry); + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmf_disc_rsp_page_hdr *hdr; + size_t data_len = nvmet_get_log_page_len(req->cmd); + size_t alloc_len = max(data_len, sizeof(*hdr)); + int residual_len = data_len - sizeof(*hdr); + struct nvmet_subsys_link *p; + struct nvmet_port *r; + u32 numrec = 0; + u16 status = 0; + + /* + * Make sure we're passing at least a buffer of response header size. + * If host provided data len is less than the header size, only the + * number of bytes requested by host will be sent to host. + */ + hdr = kzalloc(alloc_len, GFP_KERNEL); + if (!hdr) { + status = NVME_SC_INTERNAL; + goto out; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &req->port->subsystems, entry) { + if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn)) + continue; + if (residual_len >= entry_size) { + char traddr[NVMF_TRADDR_SIZE]; + + nvmet_set_disc_traddr(req, req->port, traddr); + nvmet_format_discovery_entry(hdr, req->port, + p->subsys->subsysnqn, traddr, + NVME_NQN_NVME, numrec); + residual_len -= entry_size; + } + numrec++; + } + + list_for_each_entry(r, &req->port->referrals, entry) { + if (residual_len >= entry_size) { + nvmet_format_discovery_entry(hdr, r, + NVME_DISC_SUBSYS_NAME, + r->disc_addr.traddr, + NVME_NQN_DISC, numrec); + residual_len -= entry_size; + } + numrec++; + } + + hdr->genctr = cpu_to_le64(nvmet_genctr); + hdr->numrec = cpu_to_le64(numrec); + hdr->recfmt = cpu_to_le16(0); + + up_read(&nvmet_config_sem); + + status = nvmet_copy_to_sgl(req, 0, hdr, data_len); + kfree(hdr); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u16 status = 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + memset(id->fr, ' ', sizeof(id->fr)); + strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + id->lpa = (1 << 2); + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->has_keyed_sgls) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->ops->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strcpy(id->subnqn, ctrl->subsys->subsysnqn); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("got cmd %d while not ready\n", + cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + 
req->data_len = nvmet_get_log_page_len(cmd); + + switch (cmd->get_log_page.lid) { + case NVME_LOG_DISC: + req->execute = nvmet_execute_get_disc_log_page; + return 0; + default: + pr_err("unsupported get_log_page lid %d\n", + cmd->get_log_page.lid); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + case nvme_admin_identify: + req->data_len = NVME_IDENTIFY_DATA_SIZE; + switch (cmd->identify.cns) { + case NVME_ID_CNS_CTRL: + req->execute = + nvmet_execute_identify_disc_ctrl; + return 0; + default: + pr_err("unsupported identify cns %d\n", + cmd->identify.cns); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + default: + pr_err("unsupported cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + pr_err("unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} + +int __init nvmet_init_discovery(void) +{ + nvmet_disc_subsys = + nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC); + if (!nvmet_disc_subsys) + return -ENOMEM; + return 0; +} + +void nvmet_exit_discovery(void) +{ + nvmet_subsys_put(nvmet_disc_subsys); +} diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c new file mode 100644 index 0000000..19e9e42 --- /dev/null +++ b/drivers/nvme/target/fabrics-cmd.c @@ -0,0 +1,264 @@ +/* + * NVMe Fabrics command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
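/*
 * Illustrative sketch, not part of the patch: the Property Get/Set
 * handlers in the fabrics-cmd.c hunk below only expose a small window of
 * the controller register space. The offsets listed here are the
 * standard NVMe register offsets, and the attrib low bit selects a
 * 4-byte vs 8-byte access; treat this table as a hedged summary of the
 * code below rather than a spec quote.
 */
struct ex_prop {
	unsigned int off;	/* NVME_REG_* offset */
	unsigned int bytes;	/* size implied by the attrib low bit */
	const char *name;
	const char *supported;	/* as implemented below */
};

static const struct ex_prop ex_props[] = {
	{ 0x00, 8, "CAP",  "Property Get only" },
	{ 0x08, 4, "VS",   "Property Get only" },
	{ 0x14, 4, "CC",   "Property Get and Property Set" },
	{ 0x1c, 4, "CSTS", "Property Get only" },
};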
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include "nvmet.h" + +static void nvmet_execute_prop_set(struct nvmet_req *req) +{ + u16 status = 0; + + if (!(req->cmd->prop_set.attrib & 1)) { + u64 val = le64_to_cpu(req->cmd->prop_set.value); + + switch (le32_to_cpu(req->cmd->prop_set.offset)) { + case NVME_REG_CC: + nvmet_update_cc(req->sq->ctrl, val); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } else { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_prop_get(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = 0; + u64 val = 0; + + if (req->cmd->prop_get.attrib & 1) { + switch (le32_to_cpu(req->cmd->prop_get.offset)) { + case NVME_REG_CAP: + val = ctrl->cap; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } else { + switch (le32_to_cpu(req->cmd->prop_get.offset)) { + case NVME_REG_VS: + val = ctrl->subsys->ver; + break; + case NVME_REG_CC: + val = ctrl->cc; + break; + case NVME_REG_CSTS: + val = ctrl->csts; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } + + req->rsp->result.u64 = cpu_to_le64(val); + nvmet_req_complete(req, status); +} + +u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + switch (cmd->fabrics.fctype) { + case nvme_fabrics_type_property_set: + req->data_len = 0; + req->execute = nvmet_execute_prop_set; + break; + case nvme_fabrics_type_property_get: + req->data_len = 0; + req->execute = nvmet_execute_prop_get; + break; + default: + pr_err("received unknown capsule type 0x%x\n", + cmd->fabrics.fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + return 0; +} + +static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + u16 qid = le16_to_cpu(c->qid); + u16 sqsize = le16_to_cpu(c->sqsize); + struct nvmet_ctrl *old; + + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); + if (old) { + pr_warn("queue already connected!\n"); + return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + } + if (!sqsize) { + pr_warn("queue size zero!\n"); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } + + /* note: convert queue size from 0's-based value to 1's-based value */ + nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); + nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); + return 0; +} + +static void nvmet_execute_admin_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 status = 0; + + d = kmalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + status = NVME_SC_INTERNAL; + goto complete; + } + + status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d)); + if (status) + goto out; + + /* zero out initial completion result, assign values as needed */ + req->rsp->result.u32 = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + if (unlikely(d->cntlid != cpu_to_le16(0xffff))) { + pr_warn("connect attempt for invalid controller ID %#x\n", + d->cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid); + goto out; + } + + status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, + le32_to_cpu(c->kato), &ctrl); + if (status) + goto out; + 
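/*
 * Illustrative sketch, not part of the patch: a rough model of the
 * Connect data payload that nvmet_alloc_ctrl() was just handed. Only the
 * fields the handlers actually read (hostid, cntlid, subsysnqn, hostnqn)
 * are modelled; the real struct nvmf_connect_data is a fixed wire layout
 * with little-endian fields and reserved padding, so this is a
 * simplified stand-in, and the NQN strings are placeholders.
 */
#include <stdint.h>
#include <string.h>

struct ex_connect_data {
	uint8_t  hostid[16];	/* host UUID, copied into ctrl->hostid */
	uint16_t cntlid;	/* 0xffff = "allocate a dynamic controller" */
	char	 subsysnqn[256];	/* NQNs are at most 223 characters */
	char	 hostnqn[256];
};

static void ex_fill_admin_connect(struct ex_connect_data *d,
				  const uint8_t hostid[16])
{
	memset(d, 0, sizeof(*d));
	memcpy(d->hostid, hostid, sizeof(d->hostid));
	d->cntlid = 0xffff;	/* any other value is rejected above */
	strcpy(d->subsysnqn, "nqn.2014-08.org.nvmexpress:NVMf:uuid:example-subsys");
	strcpy(d->hostnqn, "nqn.2014-08.org.nvmexpress:uuid:example-host");
}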
uuid_copy(&ctrl->hostid, &d->hostid); + + status = nvmet_install_queue(ctrl, req); + if (status) { + nvmet_ctrl_put(ctrl); + goto out; + } + + pr_info("creating controller %d for subsystem %s for NQN %s.\n", + ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn); + req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid); + +out: + kfree(d); +complete: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_io_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 qid = le16_to_cpu(c->qid); + u16 status = 0; + + d = kmalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + status = NVME_SC_INTERNAL; + goto complete; + } + + status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d)); + if (status) + goto out; + + /* zero out initial completion result, assign values as needed */ + req->rsp->result.u32 = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, + le16_to_cpu(d->cntlid), + req, &ctrl); + if (status) + goto out; + + if (unlikely(qid > ctrl->subsys->max_qid)) { + pr_warn("invalid queue id (%d)\n", qid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->rsp->result.u32 = IPO_IATTR_CONNECT_SQE(qid); + goto out_ctrl_put; + } + + status = nvmet_install_queue(ctrl, req); + if (status) { + /* pass back cntlid that had the issue of installing queue */ + req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid); + goto out_ctrl_put; + } + + pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); + +out: + kfree(d); +complete: + nvmet_req_complete(req, status); + return; + +out_ctrl_put: + nvmet_ctrl_put(ctrl); + goto out; +} + +u16 nvmet_parse_connect_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (cmd->common.opcode != nvme_fabrics_command) { + pr_err("invalid command 0x%x on unconnected queue.\n", + cmd->fabrics.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { + pr_err("invalid capsule type 0x%x on unconnected queue.\n", + cmd->fabrics.fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + req->data_len = sizeof(struct nvmf_connect_data); + if (cmd->connect.qid == 0) + req->execute = nvmet_execute_admin_connect; + else + req->execute = nvmet_execute_io_connect; + return 0; +} diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c new file mode 100644 index 0000000..33ee8d3 --- /dev/null +++ b/drivers/nvme/target/fc.c @@ -0,0 +1,2550 @@ +/* + * Copyright (c) 2016 Avago Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful. + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, + * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO + * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID. 
+ * See the GNU General Public License for more details, a copy of which + * can be found in the file COPYING included with this package + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include + +#include "nvmet.h" +#include +#include + + +/* *************************** Data Structures/Defines ****************** */ + + +#define NVMET_LS_CTX_COUNT 4 + +/* for this implementation, assume small single frame rqst/rsp */ +#define NVME_FC_MAX_LS_BUFFER_SIZE 2048 + +struct nvmet_fc_tgtport; +struct nvmet_fc_tgt_assoc; + +struct nvmet_fc_ls_iod { + struct nvmefc_tgt_ls_req *lsreq; + struct nvmefc_tgt_fcp_req *fcpreq; /* only if RS */ + + struct list_head ls_list; /* tgtport->ls_list */ + + struct nvmet_fc_tgtport *tgtport; + struct nvmet_fc_tgt_assoc *assoc; + + u8 *rqstbuf; + u8 *rspbuf; + u16 rqstdatalen; + dma_addr_t rspdma; + + struct scatterlist sg[2]; + + struct work_struct work; +} __aligned(sizeof(unsigned long long)); + +#define NVMET_FC_MAX_SEQ_LENGTH (256 * 1024) +#define NVMET_FC_MAX_XFR_SGENTS (NVMET_FC_MAX_SEQ_LENGTH / PAGE_SIZE) + +enum nvmet_fcp_datadir { + NVMET_FCP_NODATA, + NVMET_FCP_WRITE, + NVMET_FCP_READ, + NVMET_FCP_ABORTED, +}; + +struct nvmet_fc_fcp_iod { + struct nvmefc_tgt_fcp_req *fcpreq; + + struct nvme_fc_cmd_iu cmdiubuf; + struct nvme_fc_ersp_iu rspiubuf; + dma_addr_t rspdma; + struct scatterlist *data_sg; + int data_sg_cnt; + u32 offset; + enum nvmet_fcp_datadir io_dir; + bool active; + bool abort; + bool aborted; + bool writedataactive; + spinlock_t flock; + + struct nvmet_req req; + struct work_struct work; + struct work_struct done_work; + struct work_struct defer_work; + + struct nvmet_fc_tgtport *tgtport; + struct nvmet_fc_tgt_queue *queue; + + struct list_head fcp_list; /* tgtport->fcp_list */ +}; + +struct nvmet_fc_tgtport { + + struct nvmet_fc_target_port fc_target_port; + + struct list_head tgt_list; /* nvmet_fc_target_list */ + struct device *dev; /* dev for dma mapping */ + struct nvmet_fc_target_template *ops; + + struct nvmet_fc_ls_iod *iod; + spinlock_t lock; + struct list_head ls_list; + struct list_head ls_busylist; + struct list_head assoc_list; + struct ida assoc_cnt; + struct nvmet_port *port; + struct kref ref; + u32 max_sg_cnt; +}; + +struct nvmet_fc_defer_fcp_req { + struct list_head req_list; + struct nvmefc_tgt_fcp_req *fcp_req; +}; + +struct nvmet_fc_tgt_queue { + bool ninetypercent; + u16 qid; + u16 sqsize; + u16 ersp_ratio; + __le16 sqhd; + int cpu; + atomic_t connected; + atomic_t sqtail; + atomic_t zrspcnt; + atomic_t rsn; + spinlock_t qlock; + struct nvmet_port *port; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + struct nvmet_fc_tgt_assoc *assoc; + struct nvmet_fc_fcp_iod *fod; /* array of fcp_iods */ + struct list_head fod_list; + struct list_head pending_cmd_list; + struct list_head avail_defer_list; + struct workqueue_struct *work_q; + struct kref ref; +} __aligned(sizeof(unsigned long long)); + +struct nvmet_fc_tgt_assoc { + u64 association_id; + u32 a_id; + struct nvmet_fc_tgtport *tgtport; + struct list_head a_list; + struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; + struct kref ref; + struct work_struct del_work; +}; + + +static inline int +nvmet_fc_iodnum(struct nvmet_fc_ls_iod *iodptr) +{ + return (iodptr - iodptr->tgtport->iod); +} + +static inline int +nvmet_fc_fodnum(struct nvmet_fc_fcp_iod *fodptr) +{ + return (fodptr - fodptr->queue->fod); +} + + +/* + * Association and Connection IDs: + * + * Association ID will have random number in 
upper 6 bytes and zero + * in lower 2 bytes + * + * Connection IDs will be Association ID with QID or'd in lower 2 bytes + * + * note: Association ID = Connection ID for queue 0 + */ +#define BYTES_FOR_QID sizeof(u16) +#define BYTES_FOR_QID_SHIFT (BYTES_FOR_QID * 8) +#define NVMET_FC_QUEUEID_MASK ((u64)((1 << BYTES_FOR_QID_SHIFT) - 1)) + +static inline u64 +nvmet_fc_makeconnid(struct nvmet_fc_tgt_assoc *assoc, u16 qid) +{ + return (assoc->association_id | qid); +} + +static inline u64 +nvmet_fc_getassociationid(u64 connectionid) +{ + return connectionid & ~NVMET_FC_QUEUEID_MASK; +} + +static inline u16 +nvmet_fc_getqueueid(u64 connectionid) +{ + return (u16)(connectionid & NVMET_FC_QUEUEID_MASK); +} + +static inline struct nvmet_fc_tgtport * +targetport_to_tgtport(struct nvmet_fc_target_port *targetport) +{ + return container_of(targetport, struct nvmet_fc_tgtport, + fc_target_port); +} + +static inline struct nvmet_fc_fcp_iod * +nvmet_req_to_fod(struct nvmet_req *nvme_req) +{ + return container_of(nvme_req, struct nvmet_fc_fcp_iod, req); +} + + +/* *************************** Globals **************************** */ + + +static DEFINE_SPINLOCK(nvmet_fc_tgtlock); + +static LIST_HEAD(nvmet_fc_target_list); +static DEFINE_IDA(nvmet_fc_tgtport_cnt); + + +static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work); +static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work); +static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work); +static void nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work); +static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc); +static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc); +static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue); +static int nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue); +static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport); +static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); +static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod); +static void nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc); + + +/* *********************** FC-NVME DMA Handling **************************** */ + +/* + * The fcloop device passes in a NULL device pointer. Real LLD's will + * pass in a valid device pointer. If NULL is passed to the dma mapping + * routines, depending on the platform, it may or may not succeed, and + * may crash. + * + * As such: + * Wrapper all the dma routines and check the dev pointer. + * + * If simple mappings (return just a dma address, we'll noop them, + * returning a dma address of 0. + * + * On more complex mappings (dma_map_sg), a pseudo routine fills + * in the scatter list, setting all dma addresses to 0. + */ + +static inline dma_addr_t +fc_dma_map_single(struct device *dev, void *ptr, size_t size, + enum dma_data_direction dir) +{ + return dev ? dma_map_single(dev, ptr, size, dir) : (dma_addr_t)0L; +} + +static inline int +fc_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dev ? 
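/*
 * Illustrative sketch, not part of the patch: a worked example for the
 * connection-ID helpers above. Because the low 16 bits of the
 * association ID are always zero, OR-ing in the queue ID is lossless and
 * both halves can be recovered with a 16-bit mask.
 */
#include <assert.h>
#include <stdint.h>

#define EX_QUEUEID_MASK	0xffffULL

int main(void)
{
	uint64_t assoc_id = 0x123456789abc0000ULL; /* random, low 16 bits zero */
	uint16_t qid = 3;
	uint64_t connid = assoc_id | qid;	   /* nvmet_fc_makeconnid() */

	assert((uint16_t)(connid & EX_QUEUEID_MASK) == qid); /* ..._getqueueid() */
	assert((connid & ~EX_QUEUEID_MASK) == assoc_id);     /* ..._getassociationid() */
	return 0;
}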
dma_mapping_error(dev, dma_addr) : 0; +} + +static inline void +fc_dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_unmap_single(dev, addr, size, dir); +} + +static inline void +fc_dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_sync_single_for_cpu(dev, addr, size, dir); +} + +static inline void +fc_dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_sync_single_for_device(dev, addr, size, dir); +} + +/* pseudo dma_map_sg call */ +static int +fc_map_sg(struct scatterlist *sg, int nents) +{ + struct scatterlist *s; + int i; + + WARN_ON(nents == 0 || sg[0].length == 0); + + for_each_sg(sg, s, nents, i) { + s->dma_address = 0L; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + s->dma_length = s->length; +#endif + } + return nents; +} + +static inline int +fc_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + return dev ? dma_map_sg(dev, sg, nents, dir) : fc_map_sg(sg, nents); +} + +static inline void +fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + if (dev) + dma_unmap_sg(dev, sg, nents, dir); +} + + +/* *********************** FC-NVME Port Management ************************ */ + + +static int +nvmet_fc_alloc_ls_iodlist(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_ls_iod *iod; + int i; + + iod = kcalloc(NVMET_LS_CTX_COUNT, sizeof(struct nvmet_fc_ls_iod), + GFP_KERNEL); + if (!iod) + return -ENOMEM; + + tgtport->iod = iod; + + for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++) { + INIT_WORK(&iod->work, nvmet_fc_handle_ls_rqst_work); + iod->tgtport = tgtport; + list_add_tail(&iod->ls_list, &tgtport->ls_list); + + iod->rqstbuf = kcalloc(2, NVME_FC_MAX_LS_BUFFER_SIZE, + GFP_KERNEL); + if (!iod->rqstbuf) + goto out_fail; + + iod->rspbuf = iod->rqstbuf + NVME_FC_MAX_LS_BUFFER_SIZE; + + iod->rspdma = fc_dma_map_single(tgtport->dev, iod->rspbuf, + NVME_FC_MAX_LS_BUFFER_SIZE, + DMA_TO_DEVICE); + if (fc_dma_mapping_error(tgtport->dev, iod->rspdma)) + goto out_fail; + } + + return 0; + +out_fail: + kfree(iod->rqstbuf); + list_del(&iod->ls_list); + for (iod--, i--; i >= 0; iod--, i--) { + fc_dma_unmap_single(tgtport->dev, iod->rspdma, + NVME_FC_MAX_LS_BUFFER_SIZE, DMA_TO_DEVICE); + kfree(iod->rqstbuf); + list_del(&iod->ls_list); + } + + kfree(iod); + + return -EFAULT; +} + +static void +nvmet_fc_free_ls_iodlist(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_ls_iod *iod = tgtport->iod; + int i; + + for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++) { + fc_dma_unmap_single(tgtport->dev, + iod->rspdma, NVME_FC_MAX_LS_BUFFER_SIZE, + DMA_TO_DEVICE); + kfree(iod->rqstbuf); + list_del(&iod->ls_list); + } + kfree(tgtport->iod); +} + +static struct nvmet_fc_ls_iod * +nvmet_fc_alloc_ls_iod(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_ls_iod *iod; + unsigned long flags; + + spin_lock_irqsave(&tgtport->lock, flags); + iod = list_first_entry_or_null(&tgtport->ls_list, + struct nvmet_fc_ls_iod, ls_list); + if (iod) + list_move_tail(&iod->ls_list, &tgtport->ls_busylist); + spin_unlock_irqrestore(&tgtport->lock, flags); + return iod; +} + + +static void +nvmet_fc_free_ls_iod(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + unsigned long flags; + + spin_lock_irqsave(&tgtport->lock, flags); + list_move(&iod->ls_list, &tgtport->ls_list); + 
spin_unlock_irqrestore(&tgtport->lock, flags); +} + +static void +nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_tgt_queue *queue) +{ + struct nvmet_fc_fcp_iod *fod = queue->fod; + int i; + + for (i = 0; i < queue->sqsize; fod++, i++) { + INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work); + INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work); + INIT_WORK(&fod->defer_work, nvmet_fc_fcp_rqst_op_defer_work); + fod->tgtport = tgtport; + fod->queue = queue; + fod->active = false; + fod->abort = false; + fod->aborted = false; + fod->fcpreq = NULL; + list_add_tail(&fod->fcp_list, &queue->fod_list); + spin_lock_init(&fod->flock); + + fod->rspdma = fc_dma_map_single(tgtport->dev, &fod->rspiubuf, + sizeof(fod->rspiubuf), DMA_TO_DEVICE); + if (fc_dma_mapping_error(tgtport->dev, fod->rspdma)) { + list_del(&fod->fcp_list); + for (fod--, i--; i >= 0; fod--, i--) { + fc_dma_unmap_single(tgtport->dev, fod->rspdma, + sizeof(fod->rspiubuf), + DMA_TO_DEVICE); + fod->rspdma = 0L; + list_del(&fod->fcp_list); + } + + return; + } + } +} + +static void +nvmet_fc_destroy_fcp_iodlist(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_tgt_queue *queue) +{ + struct nvmet_fc_fcp_iod *fod = queue->fod; + int i; + + for (i = 0; i < queue->sqsize; fod++, i++) { + if (fod->rspdma) + fc_dma_unmap_single(tgtport->dev, fod->rspdma, + sizeof(fod->rspiubuf), DMA_TO_DEVICE); + } +} + +static struct nvmet_fc_fcp_iod * +nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue) +{ + struct nvmet_fc_fcp_iod *fod; + + lockdep_assert_held(&queue->qlock); + + fod = list_first_entry_or_null(&queue->fod_list, + struct nvmet_fc_fcp_iod, fcp_list); + if (fod) { + list_del(&fod->fcp_list); + fod->active = true; + /* + * no queue reference is taken, as it was taken by the + * queue lookup just prior to the allocation. The iod + * will "inherit" that reference. + */ + } + return fod; +} + + +static void +nvmet_fc_queue_fcp_req(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_tgt_queue *queue, + struct nvmefc_tgt_fcp_req *fcpreq) +{ + struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; + + /* + * put all admin cmds on hw queue id 0. All io commands go to + * the respective hw queue based on a modulo basis + */ + fcpreq->hwqid = queue->qid ? 
+ ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0; + + if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR) + queue_work_on(queue->cpu, queue->work_q, &fod->work); + else + nvmet_fc_handle_fcp_rqst(tgtport, fod); +} + +static void +nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work) +{ + struct nvmet_fc_fcp_iod *fod = + container_of(work, struct nvmet_fc_fcp_iod, defer_work); + + /* Submit deferred IO for processing */ + nvmet_fc_queue_fcp_req(fod->tgtport, fod->queue, fod->fcpreq); + +} + +static void +nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue, + struct nvmet_fc_fcp_iod *fod) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + struct nvmet_fc_tgtport *tgtport = fod->tgtport; + struct nvmet_fc_defer_fcp_req *deferfcp; + unsigned long flags; + + fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma, + sizeof(fod->rspiubuf), DMA_TO_DEVICE); + + fcpreq->nvmet_fc_private = NULL; + + fod->active = false; + fod->abort = false; + fod->aborted = false; + fod->writedataactive = false; + fod->fcpreq = NULL; + + tgtport->ops->fcp_req_release(&tgtport->fc_target_port, fcpreq); + + /* release the queue lookup reference on the completed IO */ + nvmet_fc_tgt_q_put(queue); + + spin_lock_irqsave(&queue->qlock, flags); + deferfcp = list_first_entry_or_null(&queue->pending_cmd_list, + struct nvmet_fc_defer_fcp_req, req_list); + if (!deferfcp) { + list_add_tail(&fod->fcp_list, &fod->queue->fod_list); + spin_unlock_irqrestore(&queue->qlock, flags); + return; + } + + /* Re-use the fod for the next pending cmd that was deferred */ + list_del(&deferfcp->req_list); + + fcpreq = deferfcp->fcp_req; + + /* deferfcp can be reused for another IO at a later date */ + list_add_tail(&deferfcp->req_list, &queue->avail_defer_list); + + spin_unlock_irqrestore(&queue->qlock, flags); + + /* Save NVME CMD IO in fod */ + memcpy(&fod->cmdiubuf, fcpreq->rspaddr, fcpreq->rsplen); + + /* Setup new fcpreq to be processed */ + fcpreq->rspaddr = NULL; + fcpreq->rsplen = 0; + fcpreq->nvmet_fc_private = fod; + fod->fcpreq = fcpreq; + fod->active = true; + + /* inform LLDD IO is now being processed */ + tgtport->ops->defer_rcv(&tgtport->fc_target_port, fcpreq); + + /* + * Leave the queue lookup get reference taken when + * fod was originally allocated. + */ + + queue_work(queue->work_q, &fod->defer_work); +} + +static int +nvmet_fc_queue_to_cpu(struct nvmet_fc_tgtport *tgtport, int qid) +{ + int cpu, idx, cnt; + + if (tgtport->ops->max_hw_queues == 1) + return WORK_CPU_UNBOUND; + + /* Simple cpu selection based on qid modulo active cpu count */ + idx = !qid ? 
0 : (qid - 1) % num_active_cpus(); + + /* find the n'th active cpu */ + for (cpu = 0, cnt = 0; ; ) { + if (cpu_active(cpu)) { + if (cnt == idx) + break; + cnt++; + } + cpu = (cpu + 1) % num_possible_cpus(); + } + + return cpu; +} + +static struct nvmet_fc_tgt_queue * +nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, + u16 qid, u16 sqsize) +{ + struct nvmet_fc_tgt_queue *queue; + unsigned long flags; + int ret; + + if (qid > NVMET_NR_QUEUES) + return NULL; + + queue = kzalloc((sizeof(*queue) + + (sizeof(struct nvmet_fc_fcp_iod) * sqsize)), + GFP_KERNEL); + if (!queue) + return NULL; + + if (!nvmet_fc_tgt_a_get(assoc)) + goto out_free_queue; + + queue->work_q = alloc_workqueue("ntfc%d.%d.%d", 0, 0, + assoc->tgtport->fc_target_port.port_num, + assoc->a_id, qid); + if (!queue->work_q) + goto out_a_put; + + queue->fod = (struct nvmet_fc_fcp_iod *)&queue[1]; + queue->qid = qid; + queue->sqsize = sqsize; + queue->assoc = assoc; + queue->port = assoc->tgtport->port; + queue->cpu = nvmet_fc_queue_to_cpu(assoc->tgtport, qid); + INIT_LIST_HEAD(&queue->fod_list); + INIT_LIST_HEAD(&queue->avail_defer_list); + INIT_LIST_HEAD(&queue->pending_cmd_list); + atomic_set(&queue->connected, 0); + atomic_set(&queue->sqtail, 0); + atomic_set(&queue->rsn, 1); + atomic_set(&queue->zrspcnt, 0); + spin_lock_init(&queue->qlock); + kref_init(&queue->ref); + + nvmet_fc_prep_fcp_iodlist(assoc->tgtport, queue); + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) + goto out_fail_iodlist; + + WARN_ON(assoc->queues[qid]); + spin_lock_irqsave(&assoc->tgtport->lock, flags); + assoc->queues[qid] = queue; + spin_unlock_irqrestore(&assoc->tgtport->lock, flags); + + return queue; + +out_fail_iodlist: + nvmet_fc_destroy_fcp_iodlist(assoc->tgtport, queue); + destroy_workqueue(queue->work_q); +out_a_put: + nvmet_fc_tgt_a_put(assoc); +out_free_queue: + kfree(queue); + return NULL; +} + + +static void +nvmet_fc_tgt_queue_free(struct kref *ref) +{ + struct nvmet_fc_tgt_queue *queue = + container_of(ref, struct nvmet_fc_tgt_queue, ref); + unsigned long flags; + + spin_lock_irqsave(&queue->assoc->tgtport->lock, flags); + queue->assoc->queues[queue->qid] = NULL; + spin_unlock_irqrestore(&queue->assoc->tgtport->lock, flags); + + nvmet_fc_destroy_fcp_iodlist(queue->assoc->tgtport, queue); + + nvmet_fc_tgt_a_put(queue->assoc); + + destroy_workqueue(queue->work_q); + + kfree(queue); +} + +static void +nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue) +{ + kref_put(&queue->ref, nvmet_fc_tgt_queue_free); +} + +static int +nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue) +{ + return kref_get_unless_zero(&queue->ref); +} + + +static void +nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) +{ + struct nvmet_fc_tgtport *tgtport = queue->assoc->tgtport; + struct nvmet_fc_fcp_iod *fod = queue->fod; + struct nvmet_fc_defer_fcp_req *deferfcp, *tempptr; + unsigned long flags; + int i, writedataactive; + bool disconnect; + + disconnect = atomic_xchg(&queue->connected, 0); + + spin_lock_irqsave(&queue->qlock, flags); + /* about outstanding io's */ + for (i = 0; i < queue->sqsize; fod++, i++) { + if (fod->active) { + spin_lock(&fod->flock); + fod->abort = true; + writedataactive = fod->writedataactive; + spin_unlock(&fod->flock); + /* + * only call lldd abort routine if waiting for + * writedata. other outstanding ops should finish + * on their own. 
+ */ + if (writedataactive) { + spin_lock(&fod->flock); + fod->aborted = true; + spin_unlock(&fod->flock); + tgtport->ops->fcp_abort( + &tgtport->fc_target_port, fod->fcpreq); + } + } + } + + /* Cleanup defer'ed IOs in queue */ + list_for_each_entry_safe(deferfcp, tempptr, &queue->avail_defer_list, + req_list) { + list_del(&deferfcp->req_list); + kfree(deferfcp); + } + + for (;;) { + deferfcp = list_first_entry_or_null(&queue->pending_cmd_list, + struct nvmet_fc_defer_fcp_req, req_list); + if (!deferfcp) + break; + + list_del(&deferfcp->req_list); + spin_unlock_irqrestore(&queue->qlock, flags); + + tgtport->ops->defer_rcv(&tgtport->fc_target_port, + deferfcp->fcp_req); + + tgtport->ops->fcp_abort(&tgtport->fc_target_port, + deferfcp->fcp_req); + + tgtport->ops->fcp_req_release(&tgtport->fc_target_port, + deferfcp->fcp_req); + + /* release the queue lookup reference */ + nvmet_fc_tgt_q_put(queue); + + kfree(deferfcp); + + spin_lock_irqsave(&queue->qlock, flags); + } + spin_unlock_irqrestore(&queue->qlock, flags); + + flush_workqueue(queue->work_q); + + if (disconnect) + nvmet_sq_destroy(&queue->nvme_sq); + + nvmet_fc_tgt_q_put(queue); +} + +static struct nvmet_fc_tgt_queue * +nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport, + u64 connection_id) +{ + struct nvmet_fc_tgt_assoc *assoc; + struct nvmet_fc_tgt_queue *queue; + u64 association_id = nvmet_fc_getassociationid(connection_id); + u16 qid = nvmet_fc_getqueueid(connection_id); + unsigned long flags; + + if (qid > NVMET_NR_QUEUES) + return NULL; + + spin_lock_irqsave(&tgtport->lock, flags); + list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { + if (association_id == assoc->association_id) { + queue = assoc->queues[qid]; + if (queue && + (!atomic_read(&queue->connected) || + !nvmet_fc_tgt_q_get(queue))) + queue = NULL; + spin_unlock_irqrestore(&tgtport->lock, flags); + return queue; + } + } + spin_unlock_irqrestore(&tgtport->lock, flags); + return NULL; +} + +static void +nvmet_fc_delete_assoc(struct work_struct *work) +{ + struct nvmet_fc_tgt_assoc *assoc = + container_of(work, struct nvmet_fc_tgt_assoc, del_work); + + nvmet_fc_delete_target_assoc(assoc); + nvmet_fc_tgt_a_put(assoc); +} + +static struct nvmet_fc_tgt_assoc * +nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_tgt_assoc *assoc, *tmpassoc; + unsigned long flags; + u64 ran; + int idx; + bool needrandom = true; + + assoc = kzalloc(sizeof(*assoc), GFP_KERNEL); + if (!assoc) + return NULL; + + idx = ida_simple_get(&tgtport->assoc_cnt, 0, 0, GFP_KERNEL); + if (idx < 0) + goto out_free_assoc; + + if (!nvmet_fc_tgtport_get(tgtport)) + goto out_ida_put; + + assoc->tgtport = tgtport; + assoc->a_id = idx; + INIT_LIST_HEAD(&assoc->a_list); + kref_init(&assoc->ref); + INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc); + + while (needrandom) { + get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID); + ran = ran << BYTES_FOR_QID_SHIFT; + + spin_lock_irqsave(&tgtport->lock, flags); + needrandom = false; + list_for_each_entry(tmpassoc, &tgtport->assoc_list, a_list) + if (ran == tmpassoc->association_id) { + needrandom = true; + break; + } + if (!needrandom) { + assoc->association_id = ran; + list_add_tail(&assoc->a_list, &tgtport->assoc_list); + } + spin_unlock_irqrestore(&tgtport->lock, flags); + } + + return assoc; + +out_ida_put: + ida_simple_remove(&tgtport->assoc_cnt, idx); +out_free_assoc: + kfree(assoc); + return NULL; +} + +static void +nvmet_fc_target_assoc_free(struct kref *ref) +{ + struct nvmet_fc_tgt_assoc *assoc = + 
container_of(ref, struct nvmet_fc_tgt_assoc, ref); + struct nvmet_fc_tgtport *tgtport = assoc->tgtport; + unsigned long flags; + + spin_lock_irqsave(&tgtport->lock, flags); + list_del(&assoc->a_list); + spin_unlock_irqrestore(&tgtport->lock, flags); + ida_simple_remove(&tgtport->assoc_cnt, assoc->a_id); + kfree(assoc); + nvmet_fc_tgtport_put(tgtport); +} + +static void +nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc) +{ + kref_put(&assoc->ref, nvmet_fc_target_assoc_free); +} + +static int +nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc) +{ + return kref_get_unless_zero(&assoc->ref); +} + +static void +nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc) +{ + struct nvmet_fc_tgtport *tgtport = assoc->tgtport; + struct nvmet_fc_tgt_queue *queue; + unsigned long flags; + int i; + + spin_lock_irqsave(&tgtport->lock, flags); + for (i = NVMET_NR_QUEUES; i >= 0; i--) { + queue = assoc->queues[i]; + if (queue) { + if (!nvmet_fc_tgt_q_get(queue)) + continue; + spin_unlock_irqrestore(&tgtport->lock, flags); + nvmet_fc_delete_target_queue(queue); + nvmet_fc_tgt_q_put(queue); + spin_lock_irqsave(&tgtport->lock, flags); + } + } + spin_unlock_irqrestore(&tgtport->lock, flags); + + nvmet_fc_tgt_a_put(assoc); +} + +static struct nvmet_fc_tgt_assoc * +nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport, + u64 association_id) +{ + struct nvmet_fc_tgt_assoc *assoc; + struct nvmet_fc_tgt_assoc *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&tgtport->lock, flags); + list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { + if (association_id == assoc->association_id) { + ret = assoc; + nvmet_fc_tgt_a_get(assoc); + break; + } + } + spin_unlock_irqrestore(&tgtport->lock, flags); + + return ret; +} + + +/** + * nvme_fc_register_targetport - transport entry point called by an + * LLDD to register the existence of a local + * NVME subystem FC port. + * @pinfo: pointer to information about the port to be registered + * @template: LLDD entrypoints and operational parameters for the port + * @dev: physical hardware device node port corresponds to. Will be + * used for DMA mappings + * @portptr: pointer to a local port pointer. Upon success, the routine + * will allocate a nvme_fc_local_port structure and place its + * address in the local port pointer. Upon failure, local port + * pointer will be set to NULL. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. 
+ */ +int +nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, + struct nvmet_fc_target_template *template, + struct device *dev, + struct nvmet_fc_target_port **portptr) +{ + struct nvmet_fc_tgtport *newrec; + unsigned long flags; + int ret, idx; + + if (!template->xmt_ls_rsp || !template->fcp_op || + !template->fcp_abort || + !template->fcp_req_release || !template->targetport_delete || + !template->max_hw_queues || !template->max_sgl_segments || + !template->max_dif_sgl_segments || !template->dma_boundary) { + ret = -EINVAL; + goto out_regtgt_failed; + } + + newrec = kzalloc((sizeof(*newrec) + template->target_priv_sz), + GFP_KERNEL); + if (!newrec) { + ret = -ENOMEM; + goto out_regtgt_failed; + } + + idx = ida_simple_get(&nvmet_fc_tgtport_cnt, 0, 0, GFP_KERNEL); + if (idx < 0) { + ret = -ENOSPC; + goto out_fail_kfree; + } + + if (!get_device(dev) && dev) { + ret = -ENODEV; + goto out_ida_put; + } + + newrec->fc_target_port.node_name = pinfo->node_name; + newrec->fc_target_port.port_name = pinfo->port_name; + newrec->fc_target_port.private = &newrec[1]; + newrec->fc_target_port.port_id = pinfo->port_id; + newrec->fc_target_port.port_num = idx; + INIT_LIST_HEAD(&newrec->tgt_list); + newrec->dev = dev; + newrec->ops = template; + spin_lock_init(&newrec->lock); + INIT_LIST_HEAD(&newrec->ls_list); + INIT_LIST_HEAD(&newrec->ls_busylist); + INIT_LIST_HEAD(&newrec->assoc_list); + kref_init(&newrec->ref); + ida_init(&newrec->assoc_cnt); + newrec->max_sg_cnt = min_t(u32, NVMET_FC_MAX_XFR_SGENTS, + template->max_sgl_segments); + + ret = nvmet_fc_alloc_ls_iodlist(newrec); + if (ret) { + ret = -ENOMEM; + goto out_free_newrec; + } + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + list_add_tail(&newrec->tgt_list, &nvmet_fc_target_list); + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + *portptr = &newrec->fc_target_port; + return 0; + +out_free_newrec: + put_device(dev); +out_ida_put: + ida_simple_remove(&nvmet_fc_tgtport_cnt, idx); +out_fail_kfree: + kfree(newrec); +out_regtgt_failed: + *portptr = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(nvmet_fc_register_targetport); + + +static void +nvmet_fc_free_tgtport(struct kref *ref) +{ + struct nvmet_fc_tgtport *tgtport = + container_of(ref, struct nvmet_fc_tgtport, ref); + struct device *dev = tgtport->dev; + unsigned long flags; + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + list_del(&tgtport->tgt_list); + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + nvmet_fc_free_ls_iodlist(tgtport); + + /* let the LLDD know we've finished tearing it down */ + tgtport->ops->targetport_delete(&tgtport->fc_target_port); + + ida_simple_remove(&nvmet_fc_tgtport_cnt, + tgtport->fc_target_port.port_num); + + ida_destroy(&tgtport->assoc_cnt); + + kfree(tgtport); + + put_device(dev); +} + +static void +nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport) +{ + kref_put(&tgtport->ref, nvmet_fc_free_tgtport); +} + +static int +nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport) +{ + return kref_get_unless_zero(&tgtport->ref); +} + +static void +__nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_tgt_assoc *assoc, *next; + unsigned long flags; + + spin_lock_irqsave(&tgtport->lock, flags); + list_for_each_entry_safe(assoc, next, + &tgtport->assoc_list, a_list) { + if (!nvmet_fc_tgt_a_get(assoc)) + continue; + spin_unlock_irqrestore(&tgtport->lock, flags); + nvmet_fc_delete_target_assoc(assoc); + nvmet_fc_tgt_a_put(assoc); + spin_lock_irqsave(&tgtport->lock, flags); + } + spin_unlock_irqrestore(&tgtport->lock, flags); +} + 
+/* + * nvmet layer has called to terminate an association + */ +static void +nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_fc_tgtport *tgtport, *next; + struct nvmet_fc_tgt_assoc *assoc; + struct nvmet_fc_tgt_queue *queue; + unsigned long flags; + bool found_ctrl = false; + + /* this is a bit ugly, but don't want to make locks layered */ + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + list_for_each_entry_safe(tgtport, next, &nvmet_fc_target_list, + tgt_list) { + if (!nvmet_fc_tgtport_get(tgtport)) + continue; + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + spin_lock_irqsave(&tgtport->lock, flags); + list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { + queue = assoc->queues[0]; + if (queue && queue->nvme_sq.ctrl == ctrl) { + if (nvmet_fc_tgt_a_get(assoc)) + found_ctrl = true; + break; + } + } + spin_unlock_irqrestore(&tgtport->lock, flags); + + nvmet_fc_tgtport_put(tgtport); + + if (found_ctrl) { + schedule_work(&assoc->del_work); + return; + } + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + } + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); +} + +/** + * nvme_fc_unregister_targetport - transport entry point called by an + * LLDD to deregister/remove a previously + * registered a local NVME subsystem FC port. + * @tgtport: pointer to the (registered) target port that is to be + * deregistered. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. + */ +int +nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *target_port) +{ + struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port); + + /* terminate any outstanding associations */ + __nvmet_fc_free_assocs(tgtport); + + nvmet_fc_tgtport_put(tgtport); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmet_fc_unregister_targetport); + + +/* *********************** FC-NVME LS Handling **************************** */ + + +static void +nvmet_fc_format_rsp_hdr(void *buf, u8 ls_cmd, __be32 desc_len, u8 rqst_ls_cmd) +{ + struct fcnvme_ls_acc_hdr *acc = buf; + + acc->w0.ls_cmd = ls_cmd; + acc->desc_list_len = desc_len; + acc->rqst.desc_tag = cpu_to_be32(FCNVME_LSDESC_RQST); + acc->rqst.desc_len = + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)); + acc->rqst.w0.ls_cmd = rqst_ls_cmd; +} + +static int +nvmet_fc_format_rjt(void *buf, u16 buflen, u8 ls_cmd, + u8 reason, u8 explanation, u8 vendor) +{ + struct fcnvme_ls_rjt *rjt = buf; + + nvmet_fc_format_rsp_hdr(buf, FCNVME_LSDESC_RQST, + fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_rjt)), + ls_cmd); + rjt->rjt.desc_tag = cpu_to_be32(FCNVME_LSDESC_RJT); + rjt->rjt.desc_len = fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rjt)); + rjt->rjt.reason_code = reason; + rjt->rjt.reason_explanation = explanation; + rjt->rjt.vendor = vendor; + + return sizeof(struct fcnvme_ls_rjt); +} + +/* Validation Error indexes into the string table below */ +enum { + VERR_NO_ERROR = 0, + VERR_CR_ASSOC_LEN = 1, + VERR_CR_ASSOC_RQST_LEN = 2, + VERR_CR_ASSOC_CMD = 3, + VERR_CR_ASSOC_CMD_LEN = 4, + VERR_ERSP_RATIO = 5, + VERR_ASSOC_ALLOC_FAIL = 6, + VERR_QUEUE_ALLOC_FAIL = 7, + VERR_CR_CONN_LEN = 8, + VERR_CR_CONN_RQST_LEN = 9, + VERR_ASSOC_ID = 10, + VERR_ASSOC_ID_LEN = 11, + VERR_NO_ASSOC = 12, + VERR_CONN_ID = 13, + VERR_CONN_ID_LEN = 14, + VERR_NO_CONN = 15, + VERR_CR_CONN_CMD = 16, + VERR_CR_CONN_CMD_LEN = 17, + VERR_DISCONN_LEN = 18, + VERR_DISCONN_RQST_LEN = 19, + VERR_DISCONN_CMD = 20, + VERR_DISCONN_CMD_LEN = 21, + VERR_DISCONN_SCOPE = 22, + VERR_RS_LEN = 23, + VERR_RS_RQST_LEN = 24, + VERR_RS_CMD = 25, + 
VERR_RS_CMD_LEN = 26, + VERR_RS_RCTL = 27, + VERR_RS_RO = 28, +}; + +static char *validation_errors[] = { + "OK", + "Bad CR_ASSOC Length", + "Bad CR_ASSOC Rqst Length", + "Not CR_ASSOC Cmd", + "Bad CR_ASSOC Cmd Length", + "Bad Ersp Ratio", + "Association Allocation Failed", + "Queue Allocation Failed", + "Bad CR_CONN Length", + "Bad CR_CONN Rqst Length", + "Not Association ID", + "Bad Association ID Length", + "No Association", + "Not Connection ID", + "Bad Connection ID Length", + "No Connection", + "Not CR_CONN Cmd", + "Bad CR_CONN Cmd Length", + "Bad DISCONN Length", + "Bad DISCONN Rqst Length", + "Not DISCONN Cmd", + "Bad DISCONN Cmd Length", + "Bad Disconnect Scope", + "Bad RS Length", + "Bad RS Rqst Length", + "Not RS Cmd", + "Bad RS Cmd Length", + "Bad RS R_CTL", + "Bad RS Relative Offset", +}; + +static void +nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + struct fcnvme_ls_cr_assoc_rqst *rqst = + (struct fcnvme_ls_cr_assoc_rqst *)iod->rqstbuf; + struct fcnvme_ls_cr_assoc_acc *acc = + (struct fcnvme_ls_cr_assoc_acc *)iod->rspbuf; + struct nvmet_fc_tgt_queue *queue; + int ret = 0; + + memset(acc, 0, sizeof(*acc)); + + /* + * FC-NVME spec changes. There are initiators sending different + * lengths as padding sizes for Create Association Cmd descriptor + * was incorrect. + * Accept anything of "minimum" length. Assume format per 1.15 + * spec (with HOSTID reduced to 16 bytes), ignore how long the + * trailing pad length is. + */ + if (iod->rqstdatalen < FCNVME_LSDESC_CRA_RQST_MINLEN) + ret = VERR_CR_ASSOC_LEN; + else if (be32_to_cpu(rqst->desc_list_len) < + FCNVME_LSDESC_CRA_RQST_MIN_LISTLEN) + ret = VERR_CR_ASSOC_RQST_LEN; + else if (rqst->assoc_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD)) + ret = VERR_CR_ASSOC_CMD; + else if (be32_to_cpu(rqst->assoc_cmd.desc_len) < + FCNVME_LSDESC_CRA_CMD_DESC_MIN_DESCLEN) + ret = VERR_CR_ASSOC_CMD_LEN; + else if (!rqst->assoc_cmd.ersp_ratio || + (be16_to_cpu(rqst->assoc_cmd.ersp_ratio) >= + be16_to_cpu(rqst->assoc_cmd.sqsize))) + ret = VERR_ERSP_RATIO; + + else { + /* new association w/ admin queue */ + iod->assoc = nvmet_fc_alloc_target_assoc(tgtport); + if (!iod->assoc) + ret = VERR_ASSOC_ALLOC_FAIL; + else { + queue = nvmet_fc_alloc_target_queue(iod->assoc, 0, + be16_to_cpu(rqst->assoc_cmd.sqsize)); + if (!queue) + ret = VERR_QUEUE_ALLOC_FAIL; + } + } + + if (ret) { + dev_err(tgtport->dev, + "Create Association LS failed: %s\n", + validation_errors[ret]); + iod->lsreq->rsplen = nvmet_fc_format_rjt(acc, + NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); + return; + } + + queue->ersp_ratio = be16_to_cpu(rqst->assoc_cmd.ersp_ratio); + atomic_set(&queue->connected, 1); + queue->sqhd = 0; /* best place to init value */ + + /* format a response */ + + iod->lsreq->rsplen = sizeof(*acc); + + nvmet_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_cr_assoc_acc)), + FCNVME_LS_CREATE_ASSOCIATION); + acc->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID); + acc->associd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id)); + acc->associd.association_id = + cpu_to_be64(nvmet_fc_makeconnid(iod->assoc, 0)); + acc->connectid.desc_tag = cpu_to_be32(FCNVME_LSDESC_CONN_ID); + acc->connectid.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_conn_id)); + acc->connectid.connection_id = acc->associd.association_id; +} + +static void +nvmet_fc_ls_create_connection(struct 
nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + struct fcnvme_ls_cr_conn_rqst *rqst = + (struct fcnvme_ls_cr_conn_rqst *)iod->rqstbuf; + struct fcnvme_ls_cr_conn_acc *acc = + (struct fcnvme_ls_cr_conn_acc *)iod->rspbuf; + struct nvmet_fc_tgt_queue *queue; + int ret = 0; + + memset(acc, 0, sizeof(*acc)); + + if (iod->rqstdatalen < sizeof(struct fcnvme_ls_cr_conn_rqst)) + ret = VERR_CR_CONN_LEN; + else if (rqst->desc_list_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_cr_conn_rqst))) + ret = VERR_CR_CONN_RQST_LEN; + else if (rqst->associd.desc_tag != cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) + ret = VERR_ASSOC_ID; + else if (rqst->associd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id))) + ret = VERR_ASSOC_ID_LEN; + else if (rqst->connect_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CREATE_CONN_CMD)) + ret = VERR_CR_CONN_CMD; + else if (rqst->connect_cmd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_cr_conn_cmd))) + ret = VERR_CR_CONN_CMD_LEN; + else if (!rqst->connect_cmd.ersp_ratio || + (be16_to_cpu(rqst->connect_cmd.ersp_ratio) >= + be16_to_cpu(rqst->connect_cmd.sqsize))) + ret = VERR_ERSP_RATIO; + + else { + /* new io queue */ + iod->assoc = nvmet_fc_find_target_assoc(tgtport, + be64_to_cpu(rqst->associd.association_id)); + if (!iod->assoc) + ret = VERR_NO_ASSOC; + else { + queue = nvmet_fc_alloc_target_queue(iod->assoc, + be16_to_cpu(rqst->connect_cmd.qid), + be16_to_cpu(rqst->connect_cmd.sqsize)); + if (!queue) + ret = VERR_QUEUE_ALLOC_FAIL; + + /* release get taken in nvmet_fc_find_target_assoc */ + nvmet_fc_tgt_a_put(iod->assoc); + } + } + + if (ret) { + dev_err(tgtport->dev, + "Create Connection LS failed: %s\n", + validation_errors[ret]); + iod->lsreq->rsplen = nvmet_fc_format_rjt(acc, + NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, + (ret == VERR_NO_ASSOC) ? 
+ FCNVME_RJT_RC_INV_ASSOC : + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); + return; + } + + queue->ersp_ratio = be16_to_cpu(rqst->connect_cmd.ersp_ratio); + atomic_set(&queue->connected, 1); + queue->sqhd = 0; /* best place to init value */ + + /* format a response */ + + iod->lsreq->rsplen = sizeof(*acc); + + nvmet_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, + fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc)), + FCNVME_LS_CREATE_CONNECTION); + acc->connectid.desc_tag = cpu_to_be32(FCNVME_LSDESC_CONN_ID); + acc->connectid.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_conn_id)); + acc->connectid.connection_id = + cpu_to_be64(nvmet_fc_makeconnid(iod->assoc, + be16_to_cpu(rqst->connect_cmd.qid))); +} + +static void +nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + struct fcnvme_ls_disconnect_rqst *rqst = + (struct fcnvme_ls_disconnect_rqst *)iod->rqstbuf; + struct fcnvme_ls_disconnect_acc *acc = + (struct fcnvme_ls_disconnect_acc *)iod->rspbuf; + struct nvmet_fc_tgt_queue *queue = NULL; + struct nvmet_fc_tgt_assoc *assoc; + int ret = 0; + bool del_assoc = false; + + memset(acc, 0, sizeof(*acc)); + + if (iod->rqstdatalen < sizeof(struct fcnvme_ls_disconnect_rqst)) + ret = VERR_DISCONN_LEN; + else if (rqst->desc_list_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_disconnect_rqst))) + ret = VERR_DISCONN_RQST_LEN; + else if (rqst->associd.desc_tag != cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) + ret = VERR_ASSOC_ID; + else if (rqst->associd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id))) + ret = VERR_ASSOC_ID_LEN; + else if (rqst->discon_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_DISCONN_CMD)) + ret = VERR_DISCONN_CMD; + else if (rqst->discon_cmd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_disconn_cmd))) + ret = VERR_DISCONN_CMD_LEN; + else if ((rqst->discon_cmd.scope != FCNVME_DISCONN_ASSOCIATION) && + (rqst->discon_cmd.scope != FCNVME_DISCONN_CONNECTION)) + ret = VERR_DISCONN_SCOPE; + else { + /* match an active association */ + assoc = nvmet_fc_find_target_assoc(tgtport, + be64_to_cpu(rqst->associd.association_id)); + iod->assoc = assoc; + if (assoc) { + if (rqst->discon_cmd.scope == + FCNVME_DISCONN_CONNECTION) { + queue = nvmet_fc_find_target_queue(tgtport, + be64_to_cpu( + rqst->discon_cmd.id)); + if (!queue) { + nvmet_fc_tgt_a_put(assoc); + ret = VERR_NO_CONN; + } + } + } else + ret = VERR_NO_ASSOC; + } + + if (ret) { + dev_err(tgtport->dev, + "Disconnect LS failed: %s\n", + validation_errors[ret]); + iod->lsreq->rsplen = nvmet_fc_format_rjt(acc, + NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, + (ret == VERR_NO_ASSOC) ? + FCNVME_RJT_RC_INV_ASSOC : + (ret == VERR_NO_CONN) ? 
+ FCNVME_RJT_RC_INV_CONN : + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); + return; + } + + /* format a response */ + + iod->lsreq->rsplen = sizeof(*acc); + + nvmet_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_disconnect_acc)), + FCNVME_LS_DISCONNECT); + + + /* are we to delete a Connection ID (queue) */ + if (queue) { + int qid = queue->qid; + + nvmet_fc_delete_target_queue(queue); + + /* release the get taken by find_target_queue */ + nvmet_fc_tgt_q_put(queue); + + /* tear association down if io queue terminated */ + if (!qid) + del_assoc = true; + } + + /* release get taken in nvmet_fc_find_target_assoc */ + nvmet_fc_tgt_a_put(iod->assoc); + + if (del_assoc) + nvmet_fc_delete_target_assoc(iod->assoc); +} + + +/* *********************** NVME Ctrl Routines **************************** */ + + +static void nvmet_fc_fcp_nvme_cmd_done(struct nvmet_req *nvme_req); + +static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops; + +static void +nvmet_fc_xmt_ls_rsp_done(struct nvmefc_tgt_ls_req *lsreq) +{ + struct nvmet_fc_ls_iod *iod = lsreq->nvmet_fc_private; + struct nvmet_fc_tgtport *tgtport = iod->tgtport; + + fc_dma_sync_single_for_cpu(tgtport->dev, iod->rspdma, + NVME_FC_MAX_LS_BUFFER_SIZE, DMA_TO_DEVICE); + nvmet_fc_free_ls_iod(tgtport, iod); + nvmet_fc_tgtport_put(tgtport); +} + +static void +nvmet_fc_xmt_ls_rsp(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + int ret; + + fc_dma_sync_single_for_device(tgtport->dev, iod->rspdma, + NVME_FC_MAX_LS_BUFFER_SIZE, DMA_TO_DEVICE); + + ret = tgtport->ops->xmt_ls_rsp(&tgtport->fc_target_port, iod->lsreq); + if (ret) + nvmet_fc_xmt_ls_rsp_done(iod->lsreq); +} + +/* + * Actual processing routine for received FC-NVME LS Requests from the LLD + */ +static void +nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + struct fcnvme_ls_rqst_w0 *w0 = + (struct fcnvme_ls_rqst_w0 *)iod->rqstbuf; + + iod->lsreq->nvmet_fc_private = iod; + iod->lsreq->rspbuf = iod->rspbuf; + iod->lsreq->rspdma = iod->rspdma; + iod->lsreq->done = nvmet_fc_xmt_ls_rsp_done; + /* Be preventative. handlers will later set to valid length */ + iod->lsreq->rsplen = 0; + + iod->assoc = NULL; + + /* + * handlers: + * parse request input, execute the request, and format the + * LS response + */ + switch (w0->ls_cmd) { + case FCNVME_LS_CREATE_ASSOCIATION: + /* Creates Association and initial Admin Queue/Connection */ + nvmet_fc_ls_create_association(tgtport, iod); + break; + case FCNVME_LS_CREATE_CONNECTION: + /* Creates an IO Queue/Connection */ + nvmet_fc_ls_create_connection(tgtport, iod); + break; + case FCNVME_LS_DISCONNECT: + /* Terminate a Queue/Connection or the Association */ + nvmet_fc_ls_disconnect(tgtport, iod); + break; + default: + iod->lsreq->rsplen = nvmet_fc_format_rjt(iod->rspbuf, + NVME_FC_MAX_LS_BUFFER_SIZE, w0->ls_cmd, + FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0); + } + + nvmet_fc_xmt_ls_rsp(tgtport, iod); +} + +/* + * Actual processing routine for received FC-NVME LS Requests from the LLD + */ +static void +nvmet_fc_handle_ls_rqst_work(struct work_struct *work) +{ + struct nvmet_fc_ls_iod *iod = + container_of(work, struct nvmet_fc_ls_iod, work); + struct nvmet_fc_tgtport *tgtport = iod->tgtport; + + nvmet_fc_handle_ls_rqst(tgtport, iod); +} + + +/** + * nvmet_fc_rcv_ls_req - transport entry point called by an LLDD + * upon the reception of a NVME LS request. + * + * The nvmet-fc layer will copy payload to an internal structure for + * processing. 
As such, upon completion of the routine, the LLDD may + * immediately free/reuse the LS request buffer passed in the call. + * + * If this routine returns error, the LLDD should abort the exchange. + * + * @tgtport: pointer to the (registered) target port the LS was + * received on. + * @lsreq: pointer to a lsreq request structure to be used to reference + * the exchange corresponding to the LS. + * @lsreqbuf: pointer to the buffer containing the LS Request + * @lsreqbuf_len: length, in bytes, of the received LS request + */ +int +nvmet_fc_rcv_ls_req(struct nvmet_fc_target_port *target_port, + struct nvmefc_tgt_ls_req *lsreq, + void *lsreqbuf, u32 lsreqbuf_len) +{ + struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port); + struct nvmet_fc_ls_iod *iod; + + if (lsreqbuf_len > NVME_FC_MAX_LS_BUFFER_SIZE) + return -E2BIG; + + if (!nvmet_fc_tgtport_get(tgtport)) + return -ESHUTDOWN; + + iod = nvmet_fc_alloc_ls_iod(tgtport); + if (!iod) { + nvmet_fc_tgtport_put(tgtport); + return -ENOENT; + } + + iod->lsreq = lsreq; + iod->fcpreq = NULL; + memcpy(iod->rqstbuf, lsreqbuf, lsreqbuf_len); + iod->rqstdatalen = lsreqbuf_len; + + schedule_work(&iod->work); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmet_fc_rcv_ls_req); + + +/* + * ********************** + * Start of FCP handling + * ********************** + */ + +static int +nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) +{ + struct scatterlist *sg; + unsigned int nent; + + sg = sgl_alloc(fod->req.transfer_len, GFP_KERNEL, &nent); + if (!sg) + goto out; + + fod->data_sg = sg; + fod->data_sg_cnt = nent; + fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent, + ((fod->io_dir == NVMET_FCP_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE)); + /* note: write from initiator perspective */ + + return 0; + +out: + return NVME_SC_INTERNAL; +} + +static void +nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod) +{ + if (!fod->data_sg || !fod->data_sg_cnt) + return; + + fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt, + ((fod->io_dir == NVMET_FCP_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE)); + sgl_free(fod->data_sg); + fod->data_sg = NULL; + fod->data_sg_cnt = 0; +} + + +static bool +queue_90percent_full(struct nvmet_fc_tgt_queue *q, u32 sqhd) +{ + u32 sqtail, used; + + /* egad, this is ugly. And sqtail is just a best guess */ + sqtail = atomic_read(&q->sqtail) % q->sqsize; + + used = (sqtail < sqhd) ? (sqtail + q->sqsize - sqhd) : (sqtail - sqhd); + return ((used * 10) >= (((u32)(q->sqsize - 1) * 9))); +} + +/* + * Prep RSP payload. + * May be a NVMET_FCOP_RSP or NVMET_FCOP_READDATA_RSP op + */ +static void +nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod) +{ + struct nvme_fc_ersp_iu *ersp = &fod->rspiubuf; + struct nvme_common_command *sqe = &fod->cmdiubuf.sqe.common; + struct nvme_completion *cqe = &ersp->cqe; + u32 *cqewd = (u32 *)cqe; + bool send_ersp = false; + u32 rsn, rspcnt, xfr_length; + + if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP) + xfr_length = fod->req.transfer_len; + else + xfr_length = fod->offset; + + /* + * check to see if we can send a 0's rsp. + * Note: to send a 0's response, the NVME-FC host transport will + * recreate the CQE. The host transport knows: sq id, SQHD (last + * seen in an ersp), and command_id. Thus it will create a + * zero-filled CQE with those known fields filled in. Transport + * must send an ersp for any condition where the cqe won't match + * this. 
+ * + * Here are the FC-NVME mandated cases where we must send an ersp: + * every N responses, where N=ersp_ratio + * force fabric commands to send ersp's (not in FC-NVME but good + * practice) + * normal cmds: any time status is non-zero, or status is zero + * but words 0 or 1 are non-zero. + * the SQ is 90% or more full + * the cmd is a fused command + * transferred data length not equal to cmd iu length + */ + rspcnt = atomic_inc_return(&fod->queue->zrspcnt); + if (!(rspcnt % fod->queue->ersp_ratio) || + sqe->opcode == nvme_fabrics_command || + xfr_length != fod->req.transfer_len || + (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] || + (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || + queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head))) + send_ersp = true; + + /* re-set the fields */ + fod->fcpreq->rspaddr = ersp; + fod->fcpreq->rspdma = fod->rspdma; + + if (!send_ersp) { + memset(ersp, 0, NVME_FC_SIZEOF_ZEROS_RSP); + fod->fcpreq->rsplen = NVME_FC_SIZEOF_ZEROS_RSP; + } else { + ersp->iu_len = cpu_to_be16(sizeof(*ersp)/sizeof(u32)); + rsn = atomic_inc_return(&fod->queue->rsn); + ersp->rsn = cpu_to_be32(rsn); + ersp->xfrd_len = cpu_to_be32(xfr_length); + fod->fcpreq->rsplen = sizeof(*ersp); + } + + fc_dma_sync_single_for_device(tgtport->dev, fod->rspdma, + sizeof(fod->rspiubuf), DMA_TO_DEVICE); +} + +static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq); + +static void +nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + + /* data no longer needed */ + nvmet_fc_free_tgt_pgs(fod); + + /* + * if an ABTS was received or we issued the fcp_abort early + * don't call abort routine again. + */ + /* no need to take lock - lock was taken earlier to get here */ + if (!fod->aborted) + tgtport->ops->fcp_abort(&tgtport->fc_target_port, fcpreq); + + nvmet_fc_free_fcp_iod(fod->queue, fod); +} + +static void +nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod) +{ + int ret; + + fod->fcpreq->op = NVMET_FCOP_RSP; + fod->fcpreq->timeout = 0; + + nvmet_fc_prep_fcp_rsp(tgtport, fod); + + ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq); + if (ret) + nvmet_fc_abort_op(tgtport, fod); +} + +static void +nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod, u8 op) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + unsigned long flags; + u32 tlen; + int ret; + + fcpreq->op = op; + fcpreq->offset = fod->offset; + fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; + + tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, + (fod->req.transfer_len - fod->offset)); + fcpreq->transfer_length = tlen; + fcpreq->transferred_length = 0; + fcpreq->fcp_error = 0; + fcpreq->rsplen = 0; + + fcpreq->sg = &fod->data_sg[fod->offset / PAGE_SIZE]; + fcpreq->sg_cnt = DIV_ROUND_UP(tlen, PAGE_SIZE); + + /* + * If the last READDATA request: check if LLDD supports + * combined xfr with response. 
+ */ + if ((op == NVMET_FCOP_READDATA) && + ((fod->offset + fcpreq->transfer_length) == fod->req.transfer_len) && + (tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) { + fcpreq->op = NVMET_FCOP_READDATA_RSP; + nvmet_fc_prep_fcp_rsp(tgtport, fod); + } + + ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq); + if (ret) { + /* + * should be ok to set w/o lock as its in the thread of + * execution (not an async timer routine) and doesn't + * contend with any clearing action + */ + fod->abort = true; + + if (op == NVMET_FCOP_WRITEDATA) { + spin_lock_irqsave(&fod->flock, flags); + fod->writedataactive = false; + spin_unlock_irqrestore(&fod->flock, flags); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); + } else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ { + fcpreq->fcp_error = ret; + fcpreq->transferred_length = 0; + nvmet_fc_xmt_fcp_op_done(fod->fcpreq); + } + } +} + +static inline bool +__nvmet_fc_fod_op_abort(struct nvmet_fc_fcp_iod *fod, bool abort) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + struct nvmet_fc_tgtport *tgtport = fod->tgtport; + + /* if in the middle of an io and we need to tear down */ + if (abort) { + if (fcpreq->op == NVMET_FCOP_WRITEDATA) { + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); + return true; + } + + nvmet_fc_abort_op(tgtport, fod); + return true; + } + + return false; +} + +/* + * actual done handler for FCP operations when completed by the lldd + */ +static void +nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + struct nvmet_fc_tgtport *tgtport = fod->tgtport; + unsigned long flags; + bool abort; + + spin_lock_irqsave(&fod->flock, flags); + abort = fod->abort; + fod->writedataactive = false; + spin_unlock_irqrestore(&fod->flock, flags); + + switch (fcpreq->op) { + + case NVMET_FCOP_WRITEDATA: + if (__nvmet_fc_fod_op_abort(fod, abort)) + return; + if (fcpreq->fcp_error || + fcpreq->transferred_length != fcpreq->transfer_length) { + spin_lock(&fod->flock); + fod->abort = true; + spin_unlock(&fod->flock); + + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); + return; + } + + fod->offset += fcpreq->transferred_length; + if (fod->offset != fod->req.transfer_len) { + spin_lock_irqsave(&fod->flock, flags); + fod->writedataactive = true; + spin_unlock_irqrestore(&fod->flock, flags); + + /* transfer the next chunk */ + nvmet_fc_transfer_fcp_data(tgtport, fod, + NVMET_FCOP_WRITEDATA); + return; + } + + /* data transfer complete, resume with nvmet layer */ + nvmet_req_execute(&fod->req); + break; + + case NVMET_FCOP_READDATA: + case NVMET_FCOP_READDATA_RSP: + if (__nvmet_fc_fod_op_abort(fod, abort)) + return; + if (fcpreq->fcp_error || + fcpreq->transferred_length != fcpreq->transfer_length) { + nvmet_fc_abort_op(tgtport, fod); + return; + } + + /* success */ + + if (fcpreq->op == NVMET_FCOP_READDATA_RSP) { + /* data no longer needed */ + nvmet_fc_free_tgt_pgs(fod); + nvmet_fc_free_fcp_iod(fod->queue, fod); + return; + } + + fod->offset += fcpreq->transferred_length; + if (fod->offset != fod->req.transfer_len) { + /* transfer the next chunk */ + nvmet_fc_transfer_fcp_data(tgtport, fod, + NVMET_FCOP_READDATA); + return; + } + + /* data transfer complete, send response */ + + /* data no longer needed */ + nvmet_fc_free_tgt_pgs(fod); + + nvmet_fc_xmt_fcp_rsp(tgtport, fod); + + break; + + case NVMET_FCOP_RSP: + if (__nvmet_fc_fod_op_abort(fod, abort)) + return; + nvmet_fc_free_fcp_iod(fod->queue, fod); + break; + + default: + break; + } +} + +static void 
+nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work) +{ + struct nvmet_fc_fcp_iod *fod = + container_of(work, struct nvmet_fc_fcp_iod, done_work); + + nvmet_fc_fod_op_done(fod); +} + +static void +nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq) +{ + struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; + struct nvmet_fc_tgt_queue *queue = fod->queue; + + if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR) + /* context switch so completion is not in ISR context */ + queue_work_on(queue->cpu, queue->work_q, &fod->done_work); + else + nvmet_fc_fod_op_done(fod); +} + +/* + * actual completion handler after execution by the nvmet layer + */ +static void +__nvmet_fc_fcp_nvme_cmd_done(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod, int status) +{ + struct nvme_common_command *sqe = &fod->cmdiubuf.sqe.common; + struct nvme_completion *cqe = &fod->rspiubuf.cqe; + unsigned long flags; + bool abort; + + spin_lock_irqsave(&fod->flock, flags); + abort = fod->abort; + spin_unlock_irqrestore(&fod->flock, flags); + + /* if we have a CQE, snoop the last sq_head value */ + if (!status) + fod->queue->sqhd = cqe->sq_head; + + if (abort) { + nvmet_fc_abort_op(tgtport, fod); + return; + } + + /* if an error handling the cmd post initial parsing */ + if (status) { + /* fudge up a failed CQE status for our transport error */ + memset(cqe, 0, sizeof(*cqe)); + cqe->sq_head = fod->queue->sqhd; /* echo last cqe sqhd */ + cqe->sq_id = cpu_to_le16(fod->queue->qid); + cqe->command_id = sqe->command_id; + cqe->status = cpu_to_le16(status); + } else { + + /* + * try to push the data even if the SQE status is non-zero. + * There may be a status where data still was intended to + * be moved + */ + if ((fod->io_dir == NVMET_FCP_READ) && (fod->data_sg_cnt)) { + /* push the data over before sending rsp */ + nvmet_fc_transfer_fcp_data(tgtport, fod, + NVMET_FCOP_READDATA); + return; + } + + /* writes & no data - fall thru */ + } + + /* data no longer needed */ + nvmet_fc_free_tgt_pgs(fod); + + nvmet_fc_xmt_fcp_rsp(tgtport, fod); +} + + +static void +nvmet_fc_fcp_nvme_cmd_done(struct nvmet_req *nvme_req) +{ + struct nvmet_fc_fcp_iod *fod = nvmet_req_to_fod(nvme_req); + struct nvmet_fc_tgtport *tgtport = fod->tgtport; + + __nvmet_fc_fcp_nvme_cmd_done(tgtport, fod, 0); +} + + +/* + * Actual processing routine for received FC-NVME LS Requests from the LLD + */ +static void +nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod) +{ + struct nvme_fc_cmd_iu *cmdiu = &fod->cmdiubuf; + u32 xfrlen = be32_to_cpu(cmdiu->data_len); + int ret; + + /* + * Fused commands are currently not supported in the linux + * implementation. + * + * As such, the implementation of the FC transport does not + * look at the fused commands and order delivery to the upper + * layer until we have both based on csn. 
+ */ + + fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done; + + if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) { + fod->io_dir = NVMET_FCP_WRITE; + if (!nvme_is_write(&cmdiu->sqe)) + goto transport_error; + } else if (cmdiu->flags & FCNVME_CMD_FLAGS_READ) { + fod->io_dir = NVMET_FCP_READ; + if (nvme_is_write(&cmdiu->sqe)) + goto transport_error; + } else { + fod->io_dir = NVMET_FCP_NODATA; + if (xfrlen) + goto transport_error; + } + + fod->req.cmd = &fod->cmdiubuf.sqe; + fod->req.rsp = &fod->rspiubuf.cqe; + fod->req.port = fod->queue->port; + + /* clear any response payload */ + memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); + + fod->data_sg = NULL; + fod->data_sg_cnt = 0; + + ret = nvmet_req_init(&fod->req, + &fod->queue->nvme_cq, + &fod->queue->nvme_sq, + &nvmet_fc_tgt_fcp_ops); + if (!ret) { + /* bad SQE content or invalid ctrl state */ + /* nvmet layer has already called op done to send rsp. */ + return; + } + + fod->req.transfer_len = xfrlen; + + /* keep a running counter of tail position */ + atomic_inc(&fod->queue->sqtail); + + if (fod->req.transfer_len) { + ret = nvmet_fc_alloc_tgt_pgs(fod); + if (ret) { + nvmet_req_complete(&fod->req, ret); + return; + } + } + fod->req.sg = fod->data_sg; + fod->req.sg_cnt = fod->data_sg_cnt; + fod->offset = 0; + + if (fod->io_dir == NVMET_FCP_WRITE) { + /* pull the data over before invoking nvmet layer */ + nvmet_fc_transfer_fcp_data(tgtport, fod, NVMET_FCOP_WRITEDATA); + return; + } + + /* + * Reads or no data: + * + * can invoke the nvmet_layer now. If read data, cmd completion will + * push the data + */ + nvmet_req_execute(&fod->req); + return; + +transport_error: + nvmet_fc_abort_op(tgtport, fod); +} + +/* + * Actual processing routine for received FC-NVME LS Requests from the LLD + */ +static void +nvmet_fc_handle_fcp_rqst_work(struct work_struct *work) +{ + struct nvmet_fc_fcp_iod *fod = + container_of(work, struct nvmet_fc_fcp_iod, work); + struct nvmet_fc_tgtport *tgtport = fod->tgtport; + + nvmet_fc_handle_fcp_rqst(tgtport, fod); +} + +/** + * nvmet_fc_rcv_fcp_req - transport entry point called by an LLDD + * upon the reception of a NVME FCP CMD IU. + * + * Pass a FC-NVME FCP CMD IU received from the FC link to the nvmet-fc + * layer for processing. + * + * The nvmet_fc layer allocates a local job structure (struct + * nvmet_fc_fcp_iod) from the queue for the io and copies the + * CMD IU buffer to the job structure. As such, on a successful + * completion (returns 0), the LLDD may immediately free/reuse + * the CMD IU buffer passed in the call. + * + * However, in some circumstances, due to the packetized nature of FC + * and the api of the FC LLDD which may issue a hw command to send the + * response, but the LLDD may not get the hw completion for that command + * and upcall the nvmet_fc layer before a new command may be + * asynchronously received - its possible for a command to be received + * before the LLDD and nvmet_fc have recycled the job structure. It gives + * the appearance of more commands received than fits in the sq. + * To alleviate this scenario, a temporary queue is maintained in the + * transport for pending LLDD requests waiting for a queue job structure. + * In these "overrun" cases, a temporary queue element is allocated + * the LLDD request and CMD iu buffer information remembered, and the + * routine returns a -EOVERFLOW status. Subsequently, when a queue job + * structure is freed, it is immediately reallocated for anything on the + * pending request list. 
The LLDDs defer_rcv() callback is called, + * informing the LLDD that it may reuse the CMD IU buffer, and the io + * is then started normally with the transport. + * + * The LLDD, when receiving an -EOVERFLOW completion status, is to treat + * the completion as successful but must not reuse the CMD IU buffer + * until the LLDD's defer_rcv() callback has been called for the + * corresponding struct nvmefc_tgt_fcp_req pointer. + * + * If there is any other condition in which an error occurs, the + * transport will return a non-zero status indicating the error. + * In all cases other than -EOVERFLOW, the transport has not accepted the + * request and the LLDD should abort the exchange. + * + * @target_port: pointer to the (registered) target port the FCP CMD IU + * was received on. + * @fcpreq: pointer to a fcpreq request structure to be used to reference + * the exchange corresponding to the FCP Exchange. + * @cmdiubuf: pointer to the buffer containing the FCP CMD IU + * @cmdiubuf_len: length, in bytes, of the received FCP CMD IU + */ +int +nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port, + struct nvmefc_tgt_fcp_req *fcpreq, + void *cmdiubuf, u32 cmdiubuf_len) +{ + struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port); + struct nvme_fc_cmd_iu *cmdiu = cmdiubuf; + struct nvmet_fc_tgt_queue *queue; + struct nvmet_fc_fcp_iod *fod; + struct nvmet_fc_defer_fcp_req *deferfcp; + unsigned long flags; + + /* validate iu, so the connection id can be used to find the queue */ + if ((cmdiubuf_len != sizeof(*cmdiu)) || + (cmdiu->scsi_id != NVME_CMD_SCSI_ID) || + (cmdiu->fc_id != NVME_CMD_FC_ID) || + (be16_to_cpu(cmdiu->iu_len) != (sizeof(*cmdiu)/4))) + return -EIO; + + queue = nvmet_fc_find_target_queue(tgtport, + be64_to_cpu(cmdiu->connection_id)); + if (!queue) + return -ENOTCONN; + + /* + * note: reference taken by find_target_queue + * After successful fod allocation, the fod will inherit the + * ownership of that reference and will remove the reference + * when the fod is freed. 
+ */ + + spin_lock_irqsave(&queue->qlock, flags); + + fod = nvmet_fc_alloc_fcp_iod(queue); + if (fod) { + spin_unlock_irqrestore(&queue->qlock, flags); + + fcpreq->nvmet_fc_private = fod; + fod->fcpreq = fcpreq; + + memcpy(&fod->cmdiubuf, cmdiubuf, cmdiubuf_len); + + nvmet_fc_queue_fcp_req(tgtport, queue, fcpreq); + + return 0; + } + + if (!tgtport->ops->defer_rcv) { + spin_unlock_irqrestore(&queue->qlock, flags); + /* release the queue lookup reference */ + nvmet_fc_tgt_q_put(queue); + return -ENOENT; + } + + deferfcp = list_first_entry_or_null(&queue->avail_defer_list, + struct nvmet_fc_defer_fcp_req, req_list); + if (deferfcp) { + /* Just re-use one that was previously allocated */ + list_del(&deferfcp->req_list); + } else { + spin_unlock_irqrestore(&queue->qlock, flags); + + /* Now we need to dynamically allocate one */ + deferfcp = kmalloc(sizeof(*deferfcp), GFP_KERNEL); + if (!deferfcp) { + /* release the queue lookup reference */ + nvmet_fc_tgt_q_put(queue); + return -ENOMEM; + } + spin_lock_irqsave(&queue->qlock, flags); + } + + /* For now, use rspaddr / rsplen to save payload information */ + fcpreq->rspaddr = cmdiubuf; + fcpreq->rsplen = cmdiubuf_len; + deferfcp->fcp_req = fcpreq; + + /* defer processing till a fod becomes available */ + list_add_tail(&deferfcp->req_list, &queue->pending_cmd_list); + + /* NOTE: the queue lookup reference is still valid */ + + spin_unlock_irqrestore(&queue->qlock, flags); + + return -EOVERFLOW; +} +EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_req); + +/** + * nvmet_fc_rcv_fcp_abort - transport entry point called by an LLDD + * upon the reception of an ABTS for a FCP command + * + * Notify the transport that an ABTS has been received for a FCP command + * that had been given to the transport via nvmet_fc_rcv_fcp_req(). The + * LLDD believes the command is still being worked on + * (template_ops->fcp_req_release() has not been called). + * + * The transport will wait for any outstanding work (an op to the LLDD, + * which the lldd should complete with error due to the ABTS; or the + * completion from the nvmet layer of the nvme command), then will + * stop processing and call the nvmet_fc_rcv_fcp_req() callback to + * return the i/o context to the LLDD. The LLDD may send the BA_ACC + * to the ABTS either after return from this function (assuming any + * outstanding op work has been terminated) or upon the callback being + * called. + * + * @target_port: pointer to the (registered) target port the FCP CMD IU + * was received on. + * @fcpreq: pointer to the fcpreq request structure that corresponds + * to the exchange that received the ABTS. + */ +void +nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *target_port, + struct nvmefc_tgt_fcp_req *fcpreq) +{ + struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; + struct nvmet_fc_tgt_queue *queue; + unsigned long flags; + + if (!fod || fod->fcpreq != fcpreq) + /* job appears to have already completed, ignore abort */ + return; + + queue = fod->queue; + + spin_lock_irqsave(&queue->qlock, flags); + if (fod->active) { + /* + * mark as abort. The abort handler, invoked upon completion + * of any work, will detect the aborted status and do the + * callback. 
+ */ + spin_lock(&fod->flock); + fod->abort = true; + fod->aborted = true; + spin_unlock(&fod->flock); + } + spin_unlock_irqrestore(&queue->qlock, flags); +} +EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_abort); + + +struct nvmet_fc_traddr { + u64 nn; + u64 pn; +}; + +static int +__nvme_fc_parse_u64(substring_t *sstr, u64 *val) +{ + u64 token64; + + if (match_u64(sstr, &token64)) + return -EINVAL; + *val = token64; + + return 0; +} + +/* + * This routine validates and extracts the WWN's from the TRADDR string. + * As kernel parsers need the 0x to determine number base, universally + * build string to parse with 0x prefix before parsing name strings. + */ +static int +nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen) +{ + char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1]; + substring_t wwn = { name, &name[sizeof(name)-1] }; + int nnoffset, pnoffset; + + /* validate it string one of the 2 allowed formats */ + if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH && + !strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET], + "pn-0x", NVME_FC_TRADDR_OXNNLEN)) { + nnoffset = NVME_FC_TRADDR_OXNNLEN; + pnoffset = NVME_FC_TRADDR_MAX_PN_OFFSET + + NVME_FC_TRADDR_OXNNLEN; + } else if ((strnlen(buf, blen) == NVME_FC_TRADDR_MINLENGTH && + !strncmp(buf, "nn-", NVME_FC_TRADDR_NNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MIN_PN_OFFSET], + "pn-", NVME_FC_TRADDR_NNLEN))) { + nnoffset = NVME_FC_TRADDR_NNLEN; + pnoffset = NVME_FC_TRADDR_MIN_PN_OFFSET + NVME_FC_TRADDR_NNLEN; + } else + goto out_einval; + + name[0] = '0'; + name[1] = 'x'; + name[2 + NVME_FC_TRADDR_HEXNAMELEN] = 0; + + memcpy(&name[2], &buf[nnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->nn)) + goto out_einval; + + memcpy(&name[2], &buf[pnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->pn)) + goto out_einval; + + return 0; + +out_einval: + pr_warn("%s: bad traddr string\n", __func__); + return -EINVAL; +} + +static int +nvmet_fc_add_port(struct nvmet_port *port) +{ + struct nvmet_fc_tgtport *tgtport; + struct nvmet_fc_traddr traddr = { 0L, 0L }; + unsigned long flags; + int ret; + + /* validate the address info */ + if ((port->disc_addr.trtype != NVMF_TRTYPE_FC) || + (port->disc_addr.adrfam != NVMF_ADDR_FAMILY_FC)) + return -EINVAL; + + /* map the traddr address info to a target port */ + + ret = nvme_fc_parse_traddr(&traddr, port->disc_addr.traddr, + sizeof(port->disc_addr.traddr)); + if (ret) + return ret; + + ret = -ENXIO; + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) { + if ((tgtport->fc_target_port.node_name == traddr.nn) && + (tgtport->fc_target_port.port_name == traddr.pn)) { + tgtport->port = port; + ret = 0; + break; + } + } + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + return ret; +} + +static void +nvmet_fc_remove_port(struct nvmet_port *port) +{ + /* nothing to do */ +} + +static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_FC, + .msdbd = 1, + .add_port = nvmet_fc_add_port, + .remove_port = nvmet_fc_remove_port, + .queue_response = nvmet_fc_fcp_nvme_cmd_done, + .delete_ctrl = nvmet_fc_delete_ctrl, +}; + +static int __init nvmet_fc_init_module(void) +{ + return nvmet_register_transport(&nvmet_fc_tgt_fcp_ops); +} + +static void __exit nvmet_fc_exit_module(void) +{ + /* sanity check - all lports should be removed */ + if (!list_empty(&nvmet_fc_target_list)) + pr_warn("%s: targetport list not empty\n", 
__func__); + + nvmet_unregister_transport(&nvmet_fc_tgt_fcp_ops); + + ida_destroy(&nvmet_fc_tgtport_cnt); +} + +module_init(nvmet_fc_init_module); +module_exit(nvmet_fc_exit_module); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c new file mode 100644 index 0000000..34712de --- /dev/null +++ b/drivers/nvme/target/fcloop.c @@ -0,0 +1,1387 @@ +/* + * Copyright (c) 2016 Avago Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful. + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, + * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO + * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID. + * See the GNU General Public License for more details, a copy of which + * can be found in the file COPYING included with this package + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include + +#include "../host/nvme.h" +#include "../target/nvmet.h" +#include +#include + + +enum { + NVMF_OPT_ERR = 0, + NVMF_OPT_WWNN = 1 << 0, + NVMF_OPT_WWPN = 1 << 1, + NVMF_OPT_ROLES = 1 << 2, + NVMF_OPT_FCADDR = 1 << 3, + NVMF_OPT_LPWWNN = 1 << 4, + NVMF_OPT_LPWWPN = 1 << 5, +}; + +struct fcloop_ctrl_options { + int mask; + u64 wwnn; + u64 wwpn; + u32 roles; + u32 fcaddr; + u64 lpwwnn; + u64 lpwwpn; +}; + +static const match_table_t opt_tokens = { + { NVMF_OPT_WWNN, "wwnn=%s" }, + { NVMF_OPT_WWPN, "wwpn=%s" }, + { NVMF_OPT_ROLES, "roles=%d" }, + { NVMF_OPT_FCADDR, "fcaddr=%x" }, + { NVMF_OPT_LPWWNN, "lpwwnn=%s" }, + { NVMF_OPT_LPWWPN, "lpwwpn=%s" }, + { NVMF_OPT_ERR, NULL } +}; + +static int +fcloop_parse_options(struct fcloop_ctrl_options *opts, + const char *buf) +{ + substring_t args[MAX_OPT_ARGS]; + char *options, *o, *p; + int token, ret = 0; + u64 token64; + + options = o = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + while ((p = strsep(&o, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, opt_tokens, args); + opts->mask |= token; + switch (token) { + case NVMF_OPT_WWNN: + if (match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + opts->wwnn = token64; + break; + case NVMF_OPT_WWPN: + if (match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + opts->wwpn = token64; + break; + case NVMF_OPT_ROLES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out_free_options; + } + opts->roles = token; + break; + case NVMF_OPT_FCADDR: + if (match_hex(args, &token)) { + ret = -EINVAL; + goto out_free_options; + } + opts->fcaddr = token; + break; + case NVMF_OPT_LPWWNN: + if (match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + opts->lpwwnn = token64; + break; + case NVMF_OPT_LPWWPN: + if (match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + opts->lpwwpn = token64; + break; + default: + pr_warn("unknown parameter or missing value '%s'\n", p); + ret = -EINVAL; + goto out_free_options; + } + } + +out_free_options: + kfree(options); + return ret; +} + + +static int +fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname, + const char *buf) +{ + substring_t args[MAX_OPT_ARGS]; + char *options, *o, *p; + int token, ret = 0; + u64 token64; + + *nname = 
-1; + *pname = -1; + + options = o = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + while ((p = strsep(&o, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, opt_tokens, args); + switch (token) { + case NVMF_OPT_WWNN: + if (match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + *nname = token64; + break; + case NVMF_OPT_WWPN: + if (match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + *pname = token64; + break; + default: + pr_warn("unknown parameter or missing value '%s'\n", p); + ret = -EINVAL; + goto out_free_options; + } + } + +out_free_options: + kfree(options); + + if (!ret) { + if (*nname == -1) + return -EINVAL; + if (*pname == -1) + return -EINVAL; + } + + return ret; +} + + +#define LPORT_OPTS (NVMF_OPT_WWNN | NVMF_OPT_WWPN) + +#define RPORT_OPTS (NVMF_OPT_WWNN | NVMF_OPT_WWPN | \ + NVMF_OPT_LPWWNN | NVMF_OPT_LPWWPN) + +#define TGTPORT_OPTS (NVMF_OPT_WWNN | NVMF_OPT_WWPN) + + +static DEFINE_SPINLOCK(fcloop_lock); +static LIST_HEAD(fcloop_lports); +static LIST_HEAD(fcloop_nports); + +struct fcloop_lport { + struct nvme_fc_local_port *localport; + struct list_head lport_list; + struct completion unreg_done; +}; + +struct fcloop_lport_priv { + struct fcloop_lport *lport; +}; + +struct fcloop_rport { + struct nvme_fc_remote_port *remoteport; + struct nvmet_fc_target_port *targetport; + struct fcloop_nport *nport; + struct fcloop_lport *lport; +}; + +struct fcloop_tport { + struct nvmet_fc_target_port *targetport; + struct nvme_fc_remote_port *remoteport; + struct fcloop_nport *nport; + struct fcloop_lport *lport; +}; + +struct fcloop_nport { + struct fcloop_rport *rport; + struct fcloop_tport *tport; + struct fcloop_lport *lport; + struct list_head nport_list; + struct kref ref; + u64 node_name; + u64 port_name; + u32 port_role; + u32 port_id; +}; + +struct fcloop_lsreq { + struct fcloop_tport *tport; + struct nvmefc_ls_req *lsreq; + struct work_struct work; + struct nvmefc_tgt_ls_req tgt_ls_req; + int status; +}; + +enum { + INI_IO_START = 0, + INI_IO_ACTIVE = 1, + INI_IO_ABORTED = 2, + INI_IO_COMPLETED = 3, +}; + +struct fcloop_fcpreq { + struct fcloop_tport *tport; + struct nvmefc_fcp_req *fcpreq; + spinlock_t reqlock; + u16 status; + u32 inistate; + bool active; + bool aborted; + struct kref ref; + struct work_struct fcp_rcv_work; + struct work_struct abort_rcv_work; + struct work_struct tio_done_work; + struct nvmefc_tgt_fcp_req tgt_fcp_req; +}; + +struct fcloop_ini_fcpreq { + struct nvmefc_fcp_req *fcpreq; + struct fcloop_fcpreq *tfcp_req; + spinlock_t inilock; +}; + +static inline struct fcloop_lsreq * +tgt_ls_req_to_lsreq(struct nvmefc_tgt_ls_req *tgt_lsreq) +{ + return container_of(tgt_lsreq, struct fcloop_lsreq, tgt_ls_req); +} + +static inline struct fcloop_fcpreq * +tgt_fcp_req_to_fcpreq(struct nvmefc_tgt_fcp_req *tgt_fcpreq) +{ + return container_of(tgt_fcpreq, struct fcloop_fcpreq, tgt_fcp_req); +} + + +static int +fcloop_create_queue(struct nvme_fc_local_port *localport, + unsigned int qidx, u16 qsize, + void **handle) +{ + *handle = localport; + return 0; +} + +static void +fcloop_delete_queue(struct nvme_fc_local_port *localport, + unsigned int idx, void *handle) +{ +} + + +/* + * Transmit of LS RSP done (e.g. buffers all set). call back up + * initiator "done" flows. 
+ */ +static void +fcloop_tgt_lsrqst_done_work(struct work_struct *work) +{ + struct fcloop_lsreq *tls_req = + container_of(work, struct fcloop_lsreq, work); + struct fcloop_tport *tport = tls_req->tport; + struct nvmefc_ls_req *lsreq = tls_req->lsreq; + + if (tport->remoteport) + lsreq->done(lsreq, tls_req->status); +} + +static int +fcloop_ls_req(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *remoteport, + struct nvmefc_ls_req *lsreq) +{ + struct fcloop_lsreq *tls_req = lsreq->private; + struct fcloop_rport *rport = remoteport->private; + int ret = 0; + + tls_req->lsreq = lsreq; + INIT_WORK(&tls_req->work, fcloop_tgt_lsrqst_done_work); + + if (!rport->targetport) { + tls_req->status = -ECONNREFUSED; + schedule_work(&tls_req->work); + return ret; + } + + tls_req->status = 0; + tls_req->tport = rport->targetport->private; + ret = nvmet_fc_rcv_ls_req(rport->targetport, &tls_req->tgt_ls_req, + lsreq->rqstaddr, lsreq->rqstlen); + + return ret; +} + +static int +fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport, + struct nvmefc_tgt_ls_req *tgt_lsreq) +{ + struct fcloop_lsreq *tls_req = tgt_ls_req_to_lsreq(tgt_lsreq); + struct nvmefc_ls_req *lsreq = tls_req->lsreq; + + memcpy(lsreq->rspaddr, tgt_lsreq->rspbuf, + ((lsreq->rsplen < tgt_lsreq->rsplen) ? + lsreq->rsplen : tgt_lsreq->rsplen)); + tgt_lsreq->done(tgt_lsreq); + + schedule_work(&tls_req->work); + + return 0; +} + +static void +fcloop_tfcp_req_free(struct kref *ref) +{ + struct fcloop_fcpreq *tfcp_req = + container_of(ref, struct fcloop_fcpreq, ref); + + kfree(tfcp_req); +} + +static void +fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req) +{ + kref_put(&tfcp_req->ref, fcloop_tfcp_req_free); +} + +static int +fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req) +{ + return kref_get_unless_zero(&tfcp_req->ref); +} + +static void +fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq, + struct fcloop_fcpreq *tfcp_req, int status) +{ + struct fcloop_ini_fcpreq *inireq = NULL; + + if (fcpreq) { + inireq = fcpreq->private; + spin_lock(&inireq->inilock); + inireq->tfcp_req = NULL; + spin_unlock(&inireq->inilock); + + fcpreq->status = status; + fcpreq->done(fcpreq); + } + + /* release original io reference on tgt struct */ + fcloop_tfcp_req_put(tfcp_req); +} + +static void +fcloop_fcp_recv_work(struct work_struct *work) +{ + struct fcloop_fcpreq *tfcp_req = + container_of(work, struct fcloop_fcpreq, fcp_rcv_work); + struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + int ret = 0; + bool aborted = false; + + spin_lock(&tfcp_req->reqlock); + switch (tfcp_req->inistate) { + case INI_IO_START: + tfcp_req->inistate = INI_IO_ACTIVE; + break; + case INI_IO_ABORTED: + aborted = true; + break; + default: + spin_unlock(&tfcp_req->reqlock); + WARN_ON(1); + return; + } + spin_unlock(&tfcp_req->reqlock); + + if (unlikely(aborted)) + ret = -ECANCELED; + else + ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, + &tfcp_req->tgt_fcp_req, + fcpreq->cmdaddr, fcpreq->cmdlen); + if (ret) + fcloop_call_host_done(fcpreq, tfcp_req, ret); + + return; +} + +static void +fcloop_fcp_abort_recv_work(struct work_struct *work) +{ + struct fcloop_fcpreq *tfcp_req = + container_of(work, struct fcloop_fcpreq, abort_rcv_work); + struct nvmefc_fcp_req *fcpreq; + bool completed = false; + + spin_lock(&tfcp_req->reqlock); + fcpreq = tfcp_req->fcpreq; + switch (tfcp_req->inistate) { + case INI_IO_ABORTED: + break; + case INI_IO_COMPLETED: + completed = true; + break; + default: + spin_unlock(&tfcp_req->reqlock); + WARN_ON(1); + return; + } + 
spin_unlock(&tfcp_req->reqlock); + + if (unlikely(completed)) { + /* remove reference taken in original abort downcall */ + fcloop_tfcp_req_put(tfcp_req); + return; + } + + if (tfcp_req->tport->targetport) + nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, + &tfcp_req->tgt_fcp_req); + + spin_lock(&tfcp_req->reqlock); + tfcp_req->fcpreq = NULL; + spin_unlock(&tfcp_req->reqlock); + + fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); + /* call_host_done releases reference for abort downcall */ +} + +/* + * FCP IO operation done by target completion. + * call back up initiator "done" flows. + */ +static void +fcloop_tgt_fcprqst_done_work(struct work_struct *work) +{ + struct fcloop_fcpreq *tfcp_req = + container_of(work, struct fcloop_fcpreq, tio_done_work); + struct nvmefc_fcp_req *fcpreq; + + spin_lock(&tfcp_req->reqlock); + fcpreq = tfcp_req->fcpreq; + tfcp_req->inistate = INI_IO_COMPLETED; + spin_unlock(&tfcp_req->reqlock); + + fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status); +} + + +static int +fcloop_fcp_req(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *remoteport, + void *hw_queue_handle, + struct nvmefc_fcp_req *fcpreq) +{ + struct fcloop_rport *rport = remoteport->private; + struct fcloop_ini_fcpreq *inireq = fcpreq->private; + struct fcloop_fcpreq *tfcp_req; + + if (!rport->targetport) + return -ECONNREFUSED; + + tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_KERNEL); + if (!tfcp_req) + return -ENOMEM; + + inireq->fcpreq = fcpreq; + inireq->tfcp_req = tfcp_req; + spin_lock_init(&inireq->inilock); + + tfcp_req->fcpreq = fcpreq; + tfcp_req->tport = rport->targetport->private; + tfcp_req->inistate = INI_IO_START; + spin_lock_init(&tfcp_req->reqlock); + INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work); + INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work); + INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work); + kref_init(&tfcp_req->ref); + + schedule_work(&tfcp_req->fcp_rcv_work); + + return 0; +} + +static void +fcloop_fcp_copy_data(u8 op, struct scatterlist *data_sg, + struct scatterlist *io_sg, u32 offset, u32 length) +{ + void *data_p, *io_p; + u32 data_len, io_len, tlen; + + io_p = sg_virt(io_sg); + io_len = io_sg->length; + + for ( ; offset; ) { + tlen = min_t(u32, offset, io_len); + offset -= tlen; + io_len -= tlen; + if (!io_len) { + io_sg = sg_next(io_sg); + io_p = sg_virt(io_sg); + io_len = io_sg->length; + } else + io_p += tlen; + } + + data_p = sg_virt(data_sg); + data_len = data_sg->length; + + for ( ; length; ) { + tlen = min_t(u32, io_len, data_len); + tlen = min_t(u32, tlen, length); + + if (op == NVMET_FCOP_WRITEDATA) + memcpy(data_p, io_p, tlen); + else + memcpy(io_p, data_p, tlen); + + length -= tlen; + + io_len -= tlen; + if ((!io_len) && (length)) { + io_sg = sg_next(io_sg); + io_p = sg_virt(io_sg); + io_len = io_sg->length; + } else + io_p += tlen; + + data_len -= tlen; + if ((!data_len) && (length)) { + data_sg = sg_next(data_sg); + data_p = sg_virt(data_sg); + data_len = data_sg->length; + } else + data_p += tlen; + } +} + +static int +fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *tgt_fcpreq) +{ + struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); + struct nvmefc_fcp_req *fcpreq; + u32 rsplen = 0, xfrlen = 0; + int fcp_err = 0, active, aborted; + u8 op = tgt_fcpreq->op; + + spin_lock(&tfcp_req->reqlock); + fcpreq = tfcp_req->fcpreq; + active = tfcp_req->active; + aborted = tfcp_req->aborted; + tfcp_req->active = true; + 
spin_unlock(&tfcp_req->reqlock); + + if (unlikely(active)) + /* illegal - call while i/o active */ + return -EALREADY; + + if (unlikely(aborted)) { + /* target transport has aborted i/o prior */ + spin_lock(&tfcp_req->reqlock); + tfcp_req->active = false; + spin_unlock(&tfcp_req->reqlock); + tgt_fcpreq->transferred_length = 0; + tgt_fcpreq->fcp_error = -ECANCELED; + tgt_fcpreq->done(tgt_fcpreq); + return 0; + } + + /* + * if fcpreq is NULL, the I/O has been aborted (from + * initiator side). For the target side, act as if all is well + * but don't actually move data. + */ + + switch (op) { + case NVMET_FCOP_WRITEDATA: + xfrlen = tgt_fcpreq->transfer_length; + if (fcpreq) { + fcloop_fcp_copy_data(op, tgt_fcpreq->sg, + fcpreq->first_sgl, tgt_fcpreq->offset, + xfrlen); + fcpreq->transferred_length += xfrlen; + } + break; + + case NVMET_FCOP_READDATA: + case NVMET_FCOP_READDATA_RSP: + xfrlen = tgt_fcpreq->transfer_length; + if (fcpreq) { + fcloop_fcp_copy_data(op, tgt_fcpreq->sg, + fcpreq->first_sgl, tgt_fcpreq->offset, + xfrlen); + fcpreq->transferred_length += xfrlen; + } + if (op == NVMET_FCOP_READDATA) + break; + + /* Fall-Thru to RSP handling */ + + case NVMET_FCOP_RSP: + if (fcpreq) { + rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ? + fcpreq->rsplen : tgt_fcpreq->rsplen); + memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen); + if (rsplen < tgt_fcpreq->rsplen) + fcp_err = -E2BIG; + fcpreq->rcv_rsplen = rsplen; + fcpreq->status = 0; + } + tfcp_req->status = 0; + break; + + default: + fcp_err = -EINVAL; + break; + } + + spin_lock(&tfcp_req->reqlock); + tfcp_req->active = false; + spin_unlock(&tfcp_req->reqlock); + + tgt_fcpreq->transferred_length = xfrlen; + tgt_fcpreq->fcp_error = fcp_err; + tgt_fcpreq->done(tgt_fcpreq); + + return 0; +} + +static void +fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *tgt_fcpreq) +{ + struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); + + /* + * mark aborted only in case there were 2 threads in transport + * (one doing io, other doing abort) and only kills ops posted + * after the abort request + */ + spin_lock(&tfcp_req->reqlock); + tfcp_req->aborted = true; + spin_unlock(&tfcp_req->reqlock); + + tfcp_req->status = NVME_SC_INTERNAL; + + /* + * nothing more to do. If io wasn't active, the transport should + * immediately call the req_release. If it was active, the op + * will complete, and the lldd should call req_release. 
+ */ +} + +static void +fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *tgt_fcpreq) +{ + struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); + + schedule_work(&tfcp_req->tio_done_work); +} + +static void +fcloop_ls_abort(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *remoteport, + struct nvmefc_ls_req *lsreq) +{ +} + +static void +fcloop_fcp_abort(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *remoteport, + void *hw_queue_handle, + struct nvmefc_fcp_req *fcpreq) +{ + struct fcloop_ini_fcpreq *inireq = fcpreq->private; + struct fcloop_fcpreq *tfcp_req; + bool abortio = true; + + spin_lock(&inireq->inilock); + tfcp_req = inireq->tfcp_req; + if (tfcp_req) + fcloop_tfcp_req_get(tfcp_req); + spin_unlock(&inireq->inilock); + + if (!tfcp_req) + /* abort has already been called */ + return; + + /* break initiator/target relationship for io */ + spin_lock(&tfcp_req->reqlock); + switch (tfcp_req->inistate) { + case INI_IO_START: + case INI_IO_ACTIVE: + tfcp_req->inistate = INI_IO_ABORTED; + break; + case INI_IO_COMPLETED: + abortio = false; + break; + default: + spin_unlock(&tfcp_req->reqlock); + WARN_ON(1); + return; + } + spin_unlock(&tfcp_req->reqlock); + + if (abortio) + /* leave the reference while the work item is scheduled */ + WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work)); + else { + /* + * as the io has already had the done callback made, + * nothing more to do. So release the reference taken above + */ + fcloop_tfcp_req_put(tfcp_req); + } +} + +static void +fcloop_nport_free(struct kref *ref) +{ + struct fcloop_nport *nport = + container_of(ref, struct fcloop_nport, ref); + unsigned long flags; + + spin_lock_irqsave(&fcloop_lock, flags); + list_del(&nport->nport_list); + spin_unlock_irqrestore(&fcloop_lock, flags); + + kfree(nport); +} + +static void +fcloop_nport_put(struct fcloop_nport *nport) +{ + kref_put(&nport->ref, fcloop_nport_free); +} + +static int +fcloop_nport_get(struct fcloop_nport *nport) +{ + return kref_get_unless_zero(&nport->ref); +} + +static void +fcloop_localport_delete(struct nvme_fc_local_port *localport) +{ + struct fcloop_lport_priv *lport_priv = localport->private; + struct fcloop_lport *lport = lport_priv->lport; + + /* release any threads waiting for the unreg to complete */ + complete(&lport->unreg_done); +} + +static void +fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) +{ + struct fcloop_rport *rport = remoteport->private; + + fcloop_nport_put(rport->nport); +} + +static void +fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) +{ + struct fcloop_tport *tport = targetport->private; + + fcloop_nport_put(tport->nport); +} + +#define FCLOOP_HW_QUEUES 4 +#define FCLOOP_SGL_SEGS 256 +#define FCLOOP_DMABOUND_4G 0xFFFFFFFF + +static struct nvme_fc_port_template fctemplate = { + .localport_delete = fcloop_localport_delete, + .remoteport_delete = fcloop_remoteport_delete, + .create_queue = fcloop_create_queue, + .delete_queue = fcloop_delete_queue, + .ls_req = fcloop_ls_req, + .fcp_io = fcloop_fcp_req, + .ls_abort = fcloop_ls_abort, + .fcp_abort = fcloop_fcp_abort, + .max_hw_queues = FCLOOP_HW_QUEUES, + .max_sgl_segments = FCLOOP_SGL_SEGS, + .max_dif_sgl_segments = FCLOOP_SGL_SEGS, + .dma_boundary = FCLOOP_DMABOUND_4G, + /* sizes of additional private data for data structures */ + .local_priv_sz = sizeof(struct fcloop_lport_priv), + .remote_priv_sz = sizeof(struct fcloop_rport), + .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), + 
.fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq), +}; + +static struct nvmet_fc_target_template tgttemplate = { + .targetport_delete = fcloop_targetport_delete, + .xmt_ls_rsp = fcloop_xmt_ls_rsp, + .fcp_op = fcloop_fcp_op, + .fcp_abort = fcloop_tgt_fcp_abort, + .fcp_req_release = fcloop_fcp_req_release, + .max_hw_queues = FCLOOP_HW_QUEUES, + .max_sgl_segments = FCLOOP_SGL_SEGS, + .max_dif_sgl_segments = FCLOOP_SGL_SEGS, + .dma_boundary = FCLOOP_DMABOUND_4G, + /* optional features */ + .target_features = 0, + /* sizes of additional private data for data structures */ + .target_priv_sz = sizeof(struct fcloop_tport), +}; + +static ssize_t +fcloop_create_local_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_fc_port_info pinfo; + struct fcloop_ctrl_options *opts; + struct nvme_fc_local_port *localport; + struct fcloop_lport *lport; + struct fcloop_lport_priv *lport_priv; + unsigned long flags; + int ret = -ENOMEM; + + lport = kzalloc(sizeof(*lport), GFP_KERNEL); + if (!lport) + return -ENOMEM; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + goto out_free_lport; + + ret = fcloop_parse_options(opts, buf); + if (ret) + goto out_free_opts; + + /* everything there ? */ + if ((opts->mask & LPORT_OPTS) != LPORT_OPTS) { + ret = -EINVAL; + goto out_free_opts; + } + + memset(&pinfo, 0, sizeof(pinfo)); + pinfo.node_name = opts->wwnn; + pinfo.port_name = opts->wwpn; + pinfo.port_role = opts->roles; + pinfo.port_id = opts->fcaddr; + + ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport); + if (!ret) { + /* success */ + lport_priv = localport->private; + lport_priv->lport = lport; + + lport->localport = localport; + INIT_LIST_HEAD(&lport->lport_list); + + spin_lock_irqsave(&fcloop_lock, flags); + list_add_tail(&lport->lport_list, &fcloop_lports); + spin_unlock_irqrestore(&fcloop_lock, flags); + } + +out_free_opts: + kfree(opts); +out_free_lport: + /* free only if we're going to fail */ + if (ret) + kfree(lport); + + return ret ? ret : count; +} + + +static void +__unlink_local_port(struct fcloop_lport *lport) +{ + list_del(&lport->lport_list); +} + +static int +__wait_localport_unreg(struct fcloop_lport *lport) +{ + int ret; + + init_completion(&lport->unreg_done); + + ret = nvme_fc_unregister_localport(lport->localport); + + wait_for_completion(&lport->unreg_done); + + kfree(lport); + + return ret; +} + + +static ssize_t +fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct fcloop_lport *tlport, *lport = NULL; + u64 nodename, portname; + unsigned long flags; + int ret; + + ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf); + if (ret) + return ret; + + spin_lock_irqsave(&fcloop_lock, flags); + + list_for_each_entry(tlport, &fcloop_lports, lport_list) { + if (tlport->localport->node_name == nodename && + tlport->localport->port_name == portname) { + lport = tlport; + __unlink_local_port(lport); + break; + } + } + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (!lport) + return -ENOENT; + + ret = __wait_localport_unreg(lport); + + return ret ? ret : count; +} + +static struct fcloop_nport * +fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) +{ + struct fcloop_nport *newnport, *nport = NULL; + struct fcloop_lport *tmplport, *lport = NULL; + struct fcloop_ctrl_options *opts; + unsigned long flags; + u32 opts_mask = (remoteport) ? 
RPORT_OPTS : TGTPORT_OPTS; + int ret; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return NULL; + + ret = fcloop_parse_options(opts, buf); + if (ret) + goto out_free_opts; + + /* everything there ? */ + if ((opts->mask & opts_mask) != opts_mask) { + ret = -EINVAL; + goto out_free_opts; + } + + newnport = kzalloc(sizeof(*newnport), GFP_KERNEL); + if (!newnport) + goto out_free_opts; + + INIT_LIST_HEAD(&newnport->nport_list); + newnport->node_name = opts->wwnn; + newnport->port_name = opts->wwpn; + if (opts->mask & NVMF_OPT_ROLES) + newnport->port_role = opts->roles; + if (opts->mask & NVMF_OPT_FCADDR) + newnport->port_id = opts->fcaddr; + kref_init(&newnport->ref); + + spin_lock_irqsave(&fcloop_lock, flags); + + list_for_each_entry(tmplport, &fcloop_lports, lport_list) { + if (tmplport->localport->node_name == opts->wwnn && + tmplport->localport->port_name == opts->wwpn) + goto out_invalid_opts; + + if (tmplport->localport->node_name == opts->lpwwnn && + tmplport->localport->port_name == opts->lpwwpn) + lport = tmplport; + } + + if (remoteport) { + if (!lport) + goto out_invalid_opts; + newnport->lport = lport; + } + + list_for_each_entry(nport, &fcloop_nports, nport_list) { + if (nport->node_name == opts->wwnn && + nport->port_name == opts->wwpn) { + if ((remoteport && nport->rport) || + (!remoteport && nport->tport)) { + nport = NULL; + goto out_invalid_opts; + } + + fcloop_nport_get(nport); + + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (remoteport) + nport->lport = lport; + if (opts->mask & NVMF_OPT_ROLES) + nport->port_role = opts->roles; + if (opts->mask & NVMF_OPT_FCADDR) + nport->port_id = opts->fcaddr; + goto out_free_newnport; + } + } + + list_add_tail(&newnport->nport_list, &fcloop_nports); + + spin_unlock_irqrestore(&fcloop_lock, flags); + + kfree(opts); + return newnport; + +out_invalid_opts: + spin_unlock_irqrestore(&fcloop_lock, flags); +out_free_newnport: + kfree(newnport); +out_free_opts: + kfree(opts); + return nport; +} + +static ssize_t +fcloop_create_remote_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_fc_remote_port *remoteport; + struct fcloop_nport *nport; + struct fcloop_rport *rport; + struct nvme_fc_port_info pinfo; + int ret; + + nport = fcloop_alloc_nport(buf, count, true); + if (!nport) + return -EIO; + + memset(&pinfo, 0, sizeof(pinfo)); + pinfo.node_name = nport->node_name; + pinfo.port_name = nport->port_name; + pinfo.port_role = nport->port_role; + pinfo.port_id = nport->port_id; + + ret = nvme_fc_register_remoteport(nport->lport->localport, + &pinfo, &remoteport); + if (ret || !remoteport) { + fcloop_nport_put(nport); + return ret; + } + + /* success */ + rport = remoteport->private; + rport->remoteport = remoteport; + rport->targetport = (nport->tport) ? 
nport->tport->targetport : NULL; + if (nport->tport) { + nport->tport->remoteport = remoteport; + nport->tport->lport = nport->lport; + } + rport->nport = nport; + rport->lport = nport->lport; + nport->rport = rport; + + return count; +} + + +static struct fcloop_rport * +__unlink_remote_port(struct fcloop_nport *nport) +{ + struct fcloop_rport *rport = nport->rport; + + if (rport && nport->tport) + nport->tport->remoteport = NULL; + nport->rport = NULL; + + return rport; +} + +static int +__remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport) +{ + if (!rport) + return -EALREADY; + + return nvme_fc_unregister_remoteport(rport->remoteport); +} + +static ssize_t +fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct fcloop_nport *nport = NULL, *tmpport; + static struct fcloop_rport *rport; + u64 nodename, portname; + unsigned long flags; + int ret; + + ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf); + if (ret) + return ret; + + spin_lock_irqsave(&fcloop_lock, flags); + + list_for_each_entry(tmpport, &fcloop_nports, nport_list) { + if (tmpport->node_name == nodename && + tmpport->port_name == portname && tmpport->rport) { + nport = tmpport; + rport = __unlink_remote_port(nport); + break; + } + } + + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (!nport) + return -ENOENT; + + ret = __remoteport_unreg(nport, rport); + + return ret ? ret : count; +} + +static ssize_t +fcloop_create_target_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvmet_fc_target_port *targetport; + struct fcloop_nport *nport; + struct fcloop_tport *tport; + struct nvmet_fc_port_info tinfo; + int ret; + + nport = fcloop_alloc_nport(buf, count, false); + if (!nport) + return -EIO; + + tinfo.node_name = nport->node_name; + tinfo.port_name = nport->port_name; + tinfo.port_id = nport->port_id; + + ret = nvmet_fc_register_targetport(&tinfo, &tgttemplate, NULL, + &targetport); + if (ret) { + fcloop_nport_put(nport); + return ret; + } + + /* success */ + tport = targetport->private; + tport->targetport = targetport; + tport->remoteport = (nport->rport) ? 
nport->rport->remoteport : NULL; + if (nport->rport) + nport->rport->targetport = targetport; + tport->nport = nport; + tport->lport = nport->lport; + nport->tport = tport; + + return count; +} + + +static struct fcloop_tport * +__unlink_target_port(struct fcloop_nport *nport) +{ + struct fcloop_tport *tport = nport->tport; + + if (tport && nport->rport) + nport->rport->targetport = NULL; + nport->tport = NULL; + + return tport; +} + +static int +__targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport) +{ + if (!tport) + return -EALREADY; + + return nvmet_fc_unregister_targetport(tport->targetport); +} + +static ssize_t +fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct fcloop_nport *nport = NULL, *tmpport; + struct fcloop_tport *tport = NULL; + u64 nodename, portname; + unsigned long flags; + int ret; + + ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf); + if (ret) + return ret; + + spin_lock_irqsave(&fcloop_lock, flags); + + list_for_each_entry(tmpport, &fcloop_nports, nport_list) { + if (tmpport->node_name == nodename && + tmpport->port_name == portname && tmpport->tport) { + nport = tmpport; + tport = __unlink_target_port(nport); + break; + } + } + + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (!nport) + return -ENOENT; + + ret = __targetport_unreg(nport, tport); + + return ret ? ret : count; +} + + +static DEVICE_ATTR(add_local_port, 0200, NULL, fcloop_create_local_port); +static DEVICE_ATTR(del_local_port, 0200, NULL, fcloop_delete_local_port); +static DEVICE_ATTR(add_remote_port, 0200, NULL, fcloop_create_remote_port); +static DEVICE_ATTR(del_remote_port, 0200, NULL, fcloop_delete_remote_port); +static DEVICE_ATTR(add_target_port, 0200, NULL, fcloop_create_target_port); +static DEVICE_ATTR(del_target_port, 0200, NULL, fcloop_delete_target_port); + +static struct attribute *fcloop_dev_attrs[] = { + &dev_attr_add_local_port.attr, + &dev_attr_del_local_port.attr, + &dev_attr_add_remote_port.attr, + &dev_attr_del_remote_port.attr, + &dev_attr_add_target_port.attr, + &dev_attr_del_target_port.attr, + NULL +}; + +static struct attribute_group fclopp_dev_attrs_group = { + .attrs = fcloop_dev_attrs, +}; + +static const struct attribute_group *fcloop_dev_attr_groups[] = { + &fclopp_dev_attrs_group, + NULL, +}; + +static struct class *fcloop_class; +static struct device *fcloop_device; + + +static int __init fcloop_init(void) +{ + int ret; + + fcloop_class = class_create(THIS_MODULE, "fcloop"); + if (IS_ERR(fcloop_class)) { + pr_err("couldn't register class fcloop\n"); + ret = PTR_ERR(fcloop_class); + return ret; + } + + fcloop_device = device_create_with_groups( + fcloop_class, NULL, MKDEV(0, 0), NULL, + fcloop_dev_attr_groups, "ctl"); + if (IS_ERR(fcloop_device)) { + pr_err("couldn't create ctl device!\n"); + ret = PTR_ERR(fcloop_device); + goto out_destroy_class; + } + + get_device(fcloop_device); + + return 0; + +out_destroy_class: + class_destroy(fcloop_class); + return ret; +} + +static void __exit fcloop_exit(void) +{ + struct fcloop_lport *lport; + struct fcloop_nport *nport; + struct fcloop_tport *tport; + struct fcloop_rport *rport; + unsigned long flags; + int ret; + + spin_lock_irqsave(&fcloop_lock, flags); + + for (;;) { + nport = list_first_entry_or_null(&fcloop_nports, + typeof(*nport), nport_list); + if (!nport) + break; + + tport = __unlink_target_port(nport); + rport = __unlink_remote_port(nport); + + spin_unlock_irqrestore(&fcloop_lock, flags); + + ret = 
__targetport_unreg(nport, tport); + if (ret) + pr_warn("%s: Failed deleting target port\n", __func__); + + ret = __remoteport_unreg(nport, rport); + if (ret) + pr_warn("%s: Failed deleting remote port\n", __func__); + + spin_lock_irqsave(&fcloop_lock, flags); + } + + for (;;) { + lport = list_first_entry_or_null(&fcloop_lports, + typeof(*lport), lport_list); + if (!lport) + break; + + __unlink_local_port(lport); + + spin_unlock_irqrestore(&fcloop_lock, flags); + + ret = __wait_localport_unreg(lport); + if (ret) + pr_warn("%s: Failed deleting local port\n", __func__); + + spin_lock_irqsave(&fcloop_lock, flags); + } + + spin_unlock_irqrestore(&fcloop_lock, flags); + + put_device(fcloop_device); + + device_destroy(fcloop_class, MKDEV(0, 0)); + class_destroy(fcloop_class); +} + +module_init(fcloop_init); +module_exit(fcloop_exit); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c new file mode 100644 index 0000000..cd23441 --- /dev/null +++ b/drivers/nvme/target/io-cmd.c @@ -0,0 +1,230 @@ +/* + * NVMe I/O command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include "nvmet.h" + +static void nvmet_bio_done(struct bio *bio) +{ + struct nvmet_req *req = bio->bi_private; + + nvmet_req_complete(req, + bio->bi_status ? 
NVME_SC_INTERNAL | NVME_SC_DNR : 0); + + if (bio != &req->inline_bio) + bio_put(bio); +} + +static inline u32 nvmet_rw_len(struct nvmet_req *req) +{ + return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << + req->ns->blksize_shift; +} + +static void nvmet_execute_rw(struct nvmet_req *req) +{ + int sg_cnt = req->sg_cnt; + struct bio *bio = &req->inline_bio; + struct scatterlist *sg; + sector_t sector; + blk_qc_t cookie; + int op, op_flags = 0, i; + + if (!req->sg_cnt) { + nvmet_req_complete(req, 0); + return; + } + + if (req->cmd->rw.opcode == nvme_cmd_write) { + op = REQ_OP_WRITE; + op_flags = REQ_SYNC | REQ_IDLE; + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) + op_flags |= REQ_FUA; + } else { + op = REQ_OP_READ; + } + + sector = le64_to_cpu(req->cmd->rw.slba); + sector <<= (req->ns->blksize_shift - 9); + + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + bio_set_dev(bio, req->ns->bdev); + bio->bi_iter.bi_sector = sector; + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio_set_op_attrs(bio, op, op_flags); + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) + != sg->length) { + struct bio *prev = bio; + + bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio_set_dev(bio, req->ns->bdev); + bio->bi_iter.bi_sector = sector; + bio_set_op_attrs(bio, op, op_flags); + + bio_chain(bio, prev); + submit_bio(prev); + } + + sector += sg->length >> 9; + sg_cnt--; + } + + cookie = submit_bio(bio); + + blk_poll(bdev_get_queue(req->ns->bdev), cookie); +} + +static void nvmet_execute_flush(struct nvmet_req *req) +{ + struct bio *bio = &req->inline_bio; + + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + bio_set_dev(bio, req->ns->bdev); + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + submit_bio(bio); +} + +static u16 nvmet_discard_range(struct nvmet_ns *ns, + struct nvme_dsm_range *range, struct bio **bio) +{ + int ret; + + ret = __blkdev_issue_discard(ns->bdev, + le64_to_cpu(range->slba) << (ns->blksize_shift - 9), + le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), + GFP_KERNEL, 0, bio); + if (ret && ret != -EOPNOTSUPP) + return NVME_SC_INTERNAL | NVME_SC_DNR; + return 0; +} + +static void nvmet_execute_discard(struct nvmet_req *req) +{ + struct nvme_dsm_range range; + struct bio *bio = NULL; + int i; + u16 status; + + for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { + status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + sizeof(range)); + if (status) + break; + + status = nvmet_discard_range(req->ns, &range, &bio); + if (status) + break; + } + + if (bio) { + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + if (status) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } else { + submit_bio(bio); + } + } else { + nvmet_req_complete(req, status); + } +} + +static void nvmet_execute_dsm(struct nvmet_req *req) +{ + switch (le32_to_cpu(req->cmd->dsm.attributes)) { + case NVME_DSMGMT_AD: + nvmet_execute_discard(req); + return; + case NVME_DSMGMT_IDR: + case NVME_DSMGMT_IDW: + default: + /* Not supported yet */ + nvmet_req_complete(req, 0); + return; + } +} + +static void nvmet_execute_write_zeroes(struct nvmet_req *req) +{ + struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes; + struct bio *bio = NULL; + u16 status = NVME_SC_SUCCESS; + sector_t sector; + sector_t nr_sector; + + sector = le64_to_cpu(write_zeroes->slba) << + (req->ns->blksize_shift - 9); + nr_sector = 
(((sector_t)le16_to_cpu(write_zeroes->length) + 1) << + (req->ns->blksize_shift - 9)); + + if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, + GFP_KERNEL, &bio, 0)) + status = NVME_SC_INTERNAL | NVME_SC_DNR; + + if (bio) { + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + submit_bio(bio); + } else { + nvmet_req_complete(req, status); + } +} + +u16 nvmet_parse_io_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + u16 ret; + + ret = nvmet_check_ctrl_status(req, cmd); + if (unlikely(ret)) { + req->ns = NULL; + return ret; + } + + req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); + if (unlikely(!req->ns)) + return NVME_SC_INVALID_NS | NVME_SC_DNR; + + switch (cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->execute = nvmet_execute_rw; + req->data_len = nvmet_rw_len(req); + return 0; + case nvme_cmd_flush: + req->execute = nvmet_execute_flush; + req->data_len = 0; + return 0; + case nvme_cmd_dsm: + req->execute = nvmet_execute_dsm; + req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) * + sizeof(struct nvme_dsm_range); + return 0; + case nvme_cmd_write_zeroes: + req->execute = nvmet_execute_write_zeroes; + return 0; + default: + pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, + req->sq->qid); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c new file mode 100644 index 0000000..27a8561 --- /dev/null +++ b/drivers/nvme/target/loop.c @@ -0,0 +1,721 @@ +/* + * NVMe over Fabrics loopback device. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include "nvmet.h" +#include "../host/nvme.h" +#include "../host/fabrics.h" + +#define NVME_LOOP_MAX_SEGMENTS 256 + +struct nvme_loop_iod { + struct nvme_request nvme_req; + struct nvme_command cmd; + struct nvme_completion rsp; + struct nvmet_req req; + struct nvme_loop_queue *queue; + struct work_struct work; + struct sg_table sg_table; + struct scatterlist first_sgl[]; +}; + +struct nvme_loop_ctrl { + struct nvme_loop_queue *queues; + + struct blk_mq_tag_set admin_tag_set; + + struct list_head list; + struct blk_mq_tag_set tag_set; + struct nvme_loop_iod async_event_iod; + struct nvme_ctrl ctrl; + + struct nvmet_ctrl *target_ctrl; +}; + +static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_loop_ctrl, ctrl); +} + +enum nvme_loop_queue_flags { + NVME_LOOP_Q_LIVE = 0, +}; + +struct nvme_loop_queue { + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + struct nvme_loop_ctrl *ctrl; + unsigned long flags; +}; + +static struct nvmet_port *nvmet_loop_port; + +static LIST_HEAD(nvme_loop_ctrl_list); +static DEFINE_MUTEX(nvme_loop_ctrl_mutex); + +static void nvme_loop_queue_response(struct nvmet_req *nvme_req); +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *ctrl); + +static const struct nvmet_fabrics_ops nvme_loop_ops; + +static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static void nvme_loop_complete_rq(struct request *req) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + + nvme_cleanup_cmd(req); + sg_free_table_chained(&iod->sg_table, true); + nvme_complete_rq(req); +} + +static struct blk_mq_tags *nvme_loop_tagset(struct nvme_loop_queue *queue) +{ + u32 queue_idx = nvme_loop_queue_idx(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static void nvme_loop_queue_response(struct nvmet_req *req) +{ + struct nvme_loop_queue *queue = + container_of(req->sq, struct nvme_loop_queue, nvme_sq); + struct nvme_completion *cqe = req->rsp; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. 
+ */ + if (unlikely(nvme_loop_queue_idx(queue) == 0 && + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { + nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, + &cqe->result); + } else { + struct request *rq; + + rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "tag 0x%x on queue %d not found\n", + cqe->command_id, nvme_loop_queue_idx(queue)); + return; + } + + nvme_end_request(rq, cqe->status, cqe->result); + } +} + +static void nvme_loop_execute_work(struct work_struct *work) +{ + struct nvme_loop_iod *iod = + container_of(work, struct nvme_loop_iod, work); + + nvmet_req_execute(&iod->req); +} + +static enum blk_eh_timer_return +nvme_loop_timeout(struct request *rq, bool reserved) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq); + + /* queue error recovery */ + nvme_reset_ctrl(&iod->queue->ctrl->ctrl); + + /* fail with DNR on admin cmd timeout */ + nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR; + + return BLK_EH_HANDLED; +} + +static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_loop_queue *queue = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + blk_status_t ret; + + ret = nvmf_check_if_ready(&queue->ctrl->ctrl, req, + test_bit(NVME_LOOP_Q_LIVE, &queue->flags), true); + if (unlikely(ret)) + return ret; + + ret = nvme_setup_cmd(ns, req, &iod->cmd); + if (ret) + return ret; + + blk_mq_start_request(req); + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + iod->req.port = nvmet_loop_port; + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, + &queue->nvme_sq, &nvme_loop_ops)) + return BLK_STS_OK; + + if (blk_rq_payload_bytes(req)) { + iod->sg_table.sgl = iod->first_sgl; + if (sg_alloc_table_chained(&iod->sg_table, + blk_rq_nr_phys_segments(req), + iod->sg_table.sgl)) + return BLK_STS_RESOURCE; + + iod->req.sg = iod->sg_table.sgl; + iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + iod->req.transfer_len = blk_rq_payload_bytes(req); + } + + schedule_work(&iod->work); + return BLK_STS_OK; +} + +static void nvme_loop_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); + struct nvme_loop_queue *queue = &ctrl->queues[0]; + struct nvme_loop_iod *iod = &ctrl->async_event_iod; + + memset(&iod->cmd, 0, sizeof(iod->cmd)); + iod->cmd.common.opcode = nvme_admin_async_event; + iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH; + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, + &nvme_loop_ops)) { + dev_err(ctrl->ctrl.device, "failed async event work\n"); + return; + } + + schedule_work(&iod->work); +} + +static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl, + struct nvme_loop_iod *iod, unsigned int queue_idx) +{ + iod->req.cmd = &iod->cmd; + iod->req.rsp = &iod->rsp; + iod->queue = &ctrl->queues[queue_idx]; + INIT_WORK(&iod->work, nvme_loop_execute_work); + return 0; +} + +static int nvme_loop_init_request(struct blk_mq_tag_set *set, + struct request *req, unsigned int hctx_idx, + unsigned int numa_node) +{ + struct nvme_loop_ctrl *ctrl = set->driver_data; + + return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), + (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0); +} + +static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static const struct blk_mq_ops nvme_loop_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .init_request = nvme_loop_init_request, + .init_hctx = nvme_loop_init_hctx, + .timeout = nvme_loop_timeout, +}; + +static const struct blk_mq_ops nvme_loop_admin_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .init_request = nvme_loop_init_request, + .init_hctx = nvme_loop_init_admin_hctx, + .timeout = nvme_loop_timeout, +}; + +static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); +} + +static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (nctrl->tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + } + kfree(ctrl->queues); + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl); +} + +static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + } +} + +static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; + int ret, i; + + nr_io_queues = min(opts->nr_io_queues, num_online_cpus()); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); + if (ret || !nr_io_queues) + return ret; + + dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); + + for (i = 1; i <= nr_io_queues; i++) { + ctrl->queues[i].ctrl = ctrl; + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + if (ret) + goto out_destroy_queues; + + ctrl->ctrl.queue_count++; + } + + return 0; + +out_destroy_queues: + nvme_loop_destroy_io_queues(ctrl); + return ret; +} + +static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl) +{ + int i, ret; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + return ret; + set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); + } + + return 0; +} + +static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + int error; + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + 
ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; + + ctrl->queues[0].ctrl = ctrl; + error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); + if (error) + return error; + ctrl->ctrl.queue_count = 1; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_free_sq; + ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); + + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + return 0; + +out_cleanup_queue: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_free_sq: + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + return error; +} + +static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) +{ + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_loop_destroy_io_queues(ctrl); + } + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + nvme_shutdown_ctrl(&ctrl->ctrl); + + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_loop_destroy_admin_queue(ctrl); +} + +static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl) +{ + nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl)); +} + +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { + if (ctrl->ctrl.cntlid == nctrl->cntlid) + nvme_delete_ctrl(&ctrl->ctrl); + } + mutex_unlock(&nvme_loop_ctrl_mutex); +} + +static void nvme_loop_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_loop_ctrl *ctrl = + container_of(work, struct nvme_loop_ctrl, ctrl.reset_work); + bool changed; + int ret; + + nvme_stop_ctrl(&ctrl->ctrl); + nvme_loop_shutdown_ctrl(ctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure should never happen */ + WARN_ON_ONCE(1); + return; + } + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_disable; + + ret = nvme_loop_init_io_queues(ctrl); + if (ret) + goto out_destroy_admin; + + ret = nvme_loop_connect_io_queues(ctrl); + if (ret) + goto out_destroy_io; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + nvme_start_ctrl(&ctrl->ctrl); + + return; + +out_destroy_io: + nvme_loop_destroy_io_queues(ctrl); +out_destroy_admin: + nvme_loop_destroy_admin_queue(ctrl); +out_disable: + dev_warn(ctrl->ctrl.device, "Removing 
after reset failure\n"); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { + .name = "loop", + .module = THIS_MODULE, + .flags = NVME_F_FABRICS, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .free_ctrl = nvme_loop_free_ctrl, + .submit_async_event = nvme_loop_submit_async_event, + .delete_ctrl = nvme_loop_delete_ctrl_host, +}; + +static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) +{ + int ret; + + ret = nvme_loop_init_io_queues(ctrl); + if (ret) + return ret; + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_loop_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; + ctrl->tag_set.reserved_tags = 1; /* fabric connect */ + ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + ctrl->ctrl.tagset = &ctrl->tag_set; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + goto out_destroy_queues; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tagset; + } + + ret = nvme_loop_connect_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->tag_set); +out_destroy_queues: + nvme_loop_destroy_io_queues(ctrl); + return ret; +} + +static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_loop_ctrl *ctrl; + bool changed; + int ret; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, + 0 /* no quirks, we're perfect! 
*/); + if (ret) + goto out_put_ctrl; + + ret = -ENOMEM; + + ctrl->ctrl.sqsize = opts->queue_size - 1; + ctrl->ctrl.kato = opts->kato; + + ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_uninit_ctrl; + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_free_queues; + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, clamping down\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->nr_io_queues) { + ret = nvme_loop_create_io_queues(ctrl); + if (ret) + goto out_remove_admin_queue; + } + + nvme_loop_init_iod(ctrl, &ctrl->async_event_iod, 0); + + dev_info(ctrl->ctrl.device, + "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); + + nvme_get_ctrl(&ctrl->ctrl); + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + nvme_start_ctrl(&ctrl->ctrl); + + return &ctrl->ctrl; + +out_remove_admin_queue: + nvme_loop_destroy_admin_queue(ctrl); +out_free_queues: + kfree(ctrl->queues); +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +} + +static int nvme_loop_add_port(struct nvmet_port *port) +{ + /* + * XXX: disallow adding more than one port so + * there are no connection rejections when a + * subsystem is assigned to a port for which + * loop doesn't have a pointer. + * This scenario would be possible if we allowed + * more than one port to be added and a subsystem + * was assigned to a port other than nvmet_loop_port.
+ */ + + if (nvmet_loop_port) + return -EPERM; + + nvmet_loop_port = port; + return 0; +} + +static void nvme_loop_remove_port(struct nvmet_port *port) +{ + if (port == nvmet_loop_port) + nvmet_loop_port = NULL; +} + +static const struct nvmet_fabrics_ops nvme_loop_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_LOOP, + .add_port = nvme_loop_add_port, + .remove_port = nvme_loop_remove_port, + .queue_response = nvme_loop_queue_response, + .delete_ctrl = nvme_loop_delete_ctrl, +}; + +static struct nvmf_transport_ops nvme_loop_transport = { + .name = "loop", + .module = THIS_MODULE, + .create_ctrl = nvme_loop_create_ctrl, +}; + +static int __init nvme_loop_init_module(void) +{ + int ret; + + ret = nvmet_register_transport(&nvme_loop_ops); + if (ret) + return ret; + + ret = nvmf_register_transport(&nvme_loop_transport); + if (ret) + nvmet_unregister_transport(&nvme_loop_ops); + + return ret; +} + +static void __exit nvme_loop_cleanup_module(void) +{ + struct nvme_loop_ctrl *ctrl, *next; + + nvmf_unregister_transport(&nvme_loop_transport); + nvmet_unregister_transport(&nvme_loop_ops); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) + nvme_delete_ctrl(&ctrl->ctrl); + mutex_unlock(&nvme_loop_ctrl_mutex); + + flush_workqueue(nvme_delete_wq); +} + +module_init(nvme_loop_init_module); +module_exit(nvme_loop_cleanup_module); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-254"); /* 254 == NVMF_TRTYPE_LOOP */ diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h new file mode 100644 index 0000000..15fd84a --- /dev/null +++ b/drivers/nvme/target/nvmet.h @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef _NVMET_H +#define _NVMET_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NVMET_ASYNC_EVENTS 4 +#define NVMET_ERROR_LOG_SLOTS 128 + +/* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM + * The 16 bit shift is to set IATTR bit to 1, which means offending + * offset starts in the data section of connect() + */ +#define IPO_IATTR_CONNECT_DATA(x) \ + (cpu_to_le32((1 << 16) | (offsetof(struct nvmf_connect_data, x)))) +#define IPO_IATTR_CONNECT_SQE(x) \ + (cpu_to_le32(offsetof(struct nvmf_connect_command, x))) + +struct nvmet_ns { + struct list_head dev_link; + struct percpu_ref ref; + struct block_device *bdev; + u32 nsid; + u32 blksize_shift; + loff_t size; + u8 nguid[16]; + uuid_t uuid; + + bool enabled; + struct nvmet_subsys *subsys; + const char *device_path; + + struct config_group device_group; + struct config_group group; + + struct completion disable_done; +}; + +static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_ns, group); +} + +struct nvmet_cq { + u16 qid; + u16 size; +}; + +struct nvmet_sq { + struct nvmet_ctrl *ctrl; + struct percpu_ref ref; + u16 qid; + u16 size; + u32 sqhd; + struct completion free_done; + struct completion confirm_done; +}; + +/** + * struct nvmet_port - Common structure to keep port + * information for the target. + * @entry: List head for holding a list of these elements. + * @disc_addr: Address information is stored in a format defined + * for a discovery log page entry. + * @group: ConfigFS group for this element's folder. + * @priv: Private data for the transport. + */ +struct nvmet_port { + struct list_head entry; + struct nvmf_disc_rsp_page_entry disc_addr; + struct config_group group; + struct config_group subsys_group; + struct list_head subsystems; + struct config_group referrals_group; + struct list_head referrals; + void *priv; + bool enabled; +}; + +static inline struct nvmet_port *to_nvmet_port(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_port, + group); +} + +struct nvmet_ctrl { + struct nvmet_subsys *subsys; + struct nvmet_cq **cqs; + struct nvmet_sq **sqs; + + struct mutex lock; + u64 cap; + u32 cc; + u32 csts; + + uuid_t hostid; + u16 cntlid; + u32 kato; + + struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; + unsigned int nr_async_event_cmds; + struct list_head async_events; + struct work_struct async_event_work; + + struct list_head subsys_entry; + struct kref ref; + struct delayed_work ka_work; + struct work_struct fatal_err_work; + + const struct nvmet_fabrics_ops *ops; + + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; +}; + +struct nvmet_subsys { + enum nvme_subsys_type type; + + struct mutex lock; + struct kref ref; + + struct list_head namespaces; + unsigned int max_nsid; + + struct list_head ctrls; + + struct list_head hosts; + bool allow_any_host; + + u16 max_qid; + + u64 ver; + u64 serial; + char *subsysnqn; + + struct config_group group; + + struct config_group namespaces_group; + struct config_group allowed_hosts_group; +}; + +static inline struct nvmet_subsys *to_subsys(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, group); +} + +static inline struct nvmet_subsys *namespaces_to_subsys( + struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, + namespaces_group); +} + +struct 
nvmet_host { + struct config_group group; +}; + +static inline struct nvmet_host *to_host(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_host, group); +} + +static inline char *nvmet_host_name(struct nvmet_host *host) +{ + return config_item_name(&host->group.cg_item); +} + +struct nvmet_host_link { + struct list_head entry; + struct nvmet_host *host; +}; + +struct nvmet_subsys_link { + struct list_head entry; + struct nvmet_subsys *subsys; +}; + +struct nvmet_req; +struct nvmet_fabrics_ops { + struct module *owner; + unsigned int type; + unsigned int sqe_inline_size; + unsigned int msdbd; + bool has_keyed_sgls : 1; + void (*queue_response)(struct nvmet_req *req); + int (*add_port)(struct nvmet_port *port); + void (*remove_port)(struct nvmet_port *port); + void (*delete_ctrl)(struct nvmet_ctrl *ctrl); + void (*disc_traddr)(struct nvmet_req *req, + struct nvmet_port *port, char *traddr); +}; + +#define NVMET_MAX_INLINE_BIOVEC 8 + +struct nvmet_req { + struct nvme_command *cmd; + struct nvme_completion *rsp; + struct nvmet_sq *sq; + struct nvmet_cq *cq; + struct nvmet_ns *ns; + struct scatterlist *sg; + struct bio inline_bio; + struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; + int sg_cnt; + /* data length as parsed from the command: */ + size_t data_len; + /* data length as parsed from the SGL descriptor: */ + size_t transfer_len; + + struct nvmet_port *port; + + void (*execute)(struct nvmet_req *req); + const struct nvmet_fabrics_ops *ops; +}; + +static inline void nvmet_set_status(struct nvmet_req *req, u16 status) +{ + req->rsp->status = cpu_to_le16(status << 1); +} + +static inline void nvmet_set_result(struct nvmet_req *req, u32 result) +{ + req->rsp->result.u32 = cpu_to_le32(result); +} + +/* + * NVMe command writes actually are DMA reads for us on the target side. + */ +static inline enum dma_data_direction +nvmet_data_dir(struct nvmet_req *req) +{ + return nvme_is_write(req->cmd) ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +struct nvmet_async_event { + struct list_head entry; + u8 event_type; + u8 event_info; + u8 log_page; +}; + +u16 nvmet_parse_connect_cmd(struct nvmet_req *req); +u16 nvmet_parse_io_cmd(struct nvmet_req *req); +u16 nvmet_parse_admin_cmd(struct nvmet_req *req); +u16 nvmet_parse_discovery_cmd(struct nvmet_req *req); +u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops); +void nvmet_req_uninit(struct nvmet_req *req); +void nvmet_req_execute(struct nvmet_req *req); +void nvmet_req_complete(struct nvmet_req *req, u16 status); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, + u16 size); +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, + u16 size); +void nvmet_sq_destroy(struct nvmet_sq *sq); +int nvmet_sq_init(struct nvmet_sq *sq); + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp); +u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, + struct nvmet_req *req, struct nvmet_ctrl **ret); +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); +u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd); + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type); +void nvmet_subsys_put(struct nvmet_subsys *subsys); +void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys); + +struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid); +void nvmet_put_namespace(struct nvmet_ns *ns); +int nvmet_ns_enable(struct nvmet_ns *ns); +void nvmet_ns_disable(struct nvmet_ns *ns); +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid); +void nvmet_ns_free(struct nvmet_ns *ns); + +int nvmet_register_transport(const struct nvmet_fabrics_ops *ops); +void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops); + +int nvmet_enable_port(struct nvmet_port *port); +void nvmet_disable_port(struct nvmet_port *port); + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); +void nvmet_referral_disable(struct nvmet_port *port); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len); +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, + size_t len); + +u32 nvmet_get_log_page_len(struct nvme_command *cmd); + +#define NVMET_QUEUE_SIZE 1024 +#define NVMET_NR_QUEUES 128 +#define NVMET_MAX_CMD NVMET_QUEUE_SIZE +#define NVMET_KAS 10 +#define NVMET_DISC_KATO 120 + +int __init nvmet_init_configfs(void); +void __exit nvmet_exit_configfs(void); + +int __init nvmet_init_discovery(void); +void nvmet_exit_discovery(void); + +extern struct nvmet_subsys *nvmet_disc_subsys; +extern u64 nvmet_genctr; +extern struct rw_semaphore nvmet_config_sem; + +bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, + const char *hostnqn); + +#endif /* _NVMET_H */ diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c new file mode 100644 index 0000000..52e0c5d --- /dev/null +++ b/drivers/nvme/target/rdma.c @@ -0,0 +1,1542 @@ +/* + * NVMe over Fabrics RDMA target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "nvmet.h" + +/* + * We allow up to a page of inline data to go with the SQE + */ +#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE + +struct nvmet_rdma_cmd { + struct ib_sge sge[2]; + struct ib_cqe cqe; + struct ib_recv_wr wr; + struct scatterlist inline_sg; + struct page *inline_page; + struct nvme_command *nvme_cmd; + struct nvmet_rdma_queue *queue; +}; + +enum { + NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), + NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1), +}; + +struct nvmet_rdma_rsp { + struct ib_sge send_sge; + struct ib_cqe send_cqe; + struct ib_send_wr send_wr; + + struct nvmet_rdma_cmd *cmd; + struct nvmet_rdma_queue *queue; + + struct ib_cqe read_cqe; + struct rdma_rw_ctx rw; + + struct nvmet_req req; + + u8 n_rdma; + u32 flags; + u32 invalidate_rkey; + + struct list_head wait_list; + struct list_head free_list; +}; + +enum nvmet_rdma_queue_state { + NVMET_RDMA_Q_CONNECTING, + NVMET_RDMA_Q_LIVE, + NVMET_RDMA_Q_DISCONNECTING, +}; + +struct nvmet_rdma_queue { + struct rdma_cm_id *cm_id; + struct nvmet_port *port; + struct ib_cq *cq; + atomic_t sq_wr_avail; + struct nvmet_rdma_device *dev; + spinlock_t state_lock; + enum nvmet_rdma_queue_state state; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + + struct nvmet_rdma_rsp *rsps; + struct list_head free_rsps; + spinlock_t rsps_lock; + struct nvmet_rdma_cmd *cmds; + + struct work_struct release_work; + struct list_head rsp_wait_list; + struct list_head rsp_wr_wait_list; + spinlock_t rsp_wr_wait_lock; + + int idx; + int host_qid; + int recv_queue_size; + int send_queue_size; + + struct list_head queue_list; +}; + +struct nvmet_rdma_device { + struct ib_device *device; + struct ib_pd *pd; + struct ib_srq *srq; + struct nvmet_rdma_cmd *srq_cmds; + size_t srq_size; + struct kref ref; + struct list_head entry; +}; + +static bool nvmet_rdma_use_srq; +module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444); +MODULE_PARM_DESC(use_srq, "Use shared receive queue."); + +static DEFINE_IDA(nvmet_rdma_queue_ida); +static LIST_HEAD(nvmet_rdma_queue_list); +static DEFINE_MUTEX(nvmet_rdma_queue_mutex); + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp); +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv); +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); + +static const struct nvmet_fabrics_ops nvmet_rdma_ops; + +/* XXX: really should move to a generic header sooner or later.. 
*/ +static inline u32 get_unaligned_le24(const u8 *p) +{ + return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16; +} + +static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) +{ + return nvme_is_write(rsp->req.cmd) && + rsp->req.transfer_len && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) +{ + return !nvme_is_write(rsp->req.cmd) && + rsp->req.transfer_len && + !rsp->req.rsp->status && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline struct nvmet_rdma_rsp * +nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_rsp *rsp; + unsigned long flags; + + spin_lock_irqsave(&queue->rsps_lock, flags); + rsp = list_first_entry(&queue->free_rsps, + struct nvmet_rdma_rsp, free_list); + list_del(&rsp->free_list); + spin_unlock_irqrestore(&queue->rsps_lock, flags); + + return rsp; +} + +static inline void +nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) +{ + unsigned long flags; + + spin_lock_irqsave(&rsp->queue->rsps_lock, flags); + list_add_tail(&rsp->free_list, &rsp->queue->free_rsps); + spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); +} + +static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + /* NVMe command / RDMA RECV */ + c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL); + if (!c->nvme_cmd) + goto out; + + c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[0].addr)) + goto out_free_cmd; + + c->sge[0].length = sizeof(*c->nvme_cmd); + c->sge[0].lkey = ndev->pd->local_dma_lkey; + + if (!admin) { + c->inline_page = alloc_pages(GFP_KERNEL, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + if (!c->inline_page) + goto out_unmap_cmd; + c->sge[1].addr = ib_dma_map_page(ndev->device, + c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[1].addr)) + goto out_free_inline_page; + c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE; + c->sge[1].lkey = ndev->pd->local_dma_lkey; + } + + c->cqe.done = nvmet_rdma_recv_done; + + c->wr.wr_cqe = &c->cqe; + c->wr.sg_list = c->sge; + c->wr.num_sge = admin ? 
1 : 2; + + return 0; + +out_free_inline_page: + if (!admin) { + __free_pages(c->inline_page, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + } +out_unmap_cmd: + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); +out_free_cmd: + kfree(c->nvme_cmd); + +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + if (!admin) { + ib_dma_unmap_page(ndev->device, c->sge[1].addr, + NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE); + __free_pages(c->inline_page, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + } + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + kfree(c->nvme_cmd); +} + +static struct nvmet_rdma_cmd * +nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, + int nr_cmds, bool admin) +{ + struct nvmet_rdma_cmd *cmds; + int ret = -EINVAL, i; + + cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); + if (!cmds) + goto out; + + for (i = 0; i < nr_cmds; i++) { + ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin); + if (ret) + goto out_free; + } + + return cmds; + +out_free: + while (--i >= 0) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +out: + return ERR_PTR(ret); +} + +static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin) +{ + int i; + + for (i = 0; i < nr_cmds; i++) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +} + +static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + /* NVMe CQE / RDMA SEND */ + r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL); + if (!r->req.rsp) + goto out; + + r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp, + sizeof(*r->req.rsp), DMA_TO_DEVICE); + if (ib_dma_mapping_error(ndev->device, r->send_sge.addr)) + goto out_free_rsp; + + r->send_sge.length = sizeof(*r->req.rsp); + r->send_sge.lkey = ndev->pd->local_dma_lkey; + + r->send_cqe.done = nvmet_rdma_send_done; + + r->send_wr.wr_cqe = &r->send_cqe; + r->send_wr.sg_list = &r->send_sge; + r->send_wr.num_sge = 1; + r->send_wr.send_flags = IB_SEND_SIGNALED; + + /* Data In / RDMA READ */ + r->read_cqe.done = nvmet_rdma_read_data_done; + return 0; + +out_free_rsp: + kfree(r->req.rsp); +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + ib_dma_unmap_single(ndev->device, r->send_sge.addr, + sizeof(*r->req.rsp), DMA_TO_DEVICE); + kfree(r->req.rsp); +} + +static int +nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int nr_rsps = queue->recv_queue_size * 2; + int ret = -EINVAL, i; + + queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), + GFP_KERNEL); + if (!queue->rsps) + goto out; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + ret = nvmet_rdma_alloc_rsp(ndev, rsp); + if (ret) + goto out_free; + + list_add_tail(&rsp->free_list, &queue->free_rsps); + } + + return 0; + +out_free: + while (--i >= 0) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +out: + return ret; +} + +static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int i, nr_rsps = queue->recv_queue_size * 2; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + 
list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +} + +static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmd) +{ + struct ib_recv_wr *bad_wr; + + ib_dma_sync_single_for_device(ndev->device, + cmd->sge[0].addr, cmd->sge[0].length, + DMA_FROM_DEVICE); + + if (ndev->srq) + return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); + return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); +} + +static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) +{ + spin_lock(&queue->rsp_wr_wait_lock); + while (!list_empty(&queue->rsp_wr_wait_list)) { + struct nvmet_rdma_rsp *rsp; + bool ret; + + rsp = list_entry(queue->rsp_wr_wait_list.next, + struct nvmet_rdma_rsp, wait_list); + list_del(&rsp->wait_list); + + spin_unlock(&queue->rsp_wr_wait_lock); + ret = nvmet_rdma_execute_command(rsp); + spin_lock(&queue->rsp_wr_wait_lock); + + if (!ret) { + list_add(&rsp->wait_list, &queue->rsp_wr_wait_list); + break; + } + } + spin_unlock(&queue->rsp_wr_wait_lock); +} + + +static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + + if (rsp->n_rdma) { + rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, rsp->req.sg, + rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); + } + + if (rsp->req.sg != &rsp->cmd->inline_sg) + sgl_free(rsp->req.sg); + + if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) + nvmet_rdma_process_wr_wait_list(queue); + + nvmet_rdma_put_rsp(rsp); +} + +static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue) +{ + if (queue->nvme_sq.ctrl) { + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); + } else { + /* + * we didn't setup the controller yet in case + * of admin connect error, just disconnect and + * cleanup the queue + */ + nvmet_rdma_queue_disconnect(queue); + } +} + +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); + + nvmet_rdma_release_rsp(rsp); + + if (unlikely(wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR)) { + pr_err("SEND for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(rsp->queue); + } +} + +static void nvmet_rdma_queue_response(struct nvmet_req *req) +{ + struct nvmet_rdma_rsp *rsp = + container_of(req, struct nvmet_rdma_rsp, req); + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct ib_send_wr *first_wr, *bad_wr; + + if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) { + rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; + rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; + } else { + rsp->send_wr.opcode = IB_WR_SEND; + } + + if (nvmet_rdma_need_data_out(rsp)) + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, NULL, &rsp->send_wr); + else + first_wr = &rsp->send_wr; + + nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd); + + ib_dma_sync_single_for_device(rsp->queue->dev->device, + rsp->send_sge.addr, rsp->send_sge.length, + DMA_TO_DEVICE); + + if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) { + pr_err("sending cmd response failed\n"); + nvmet_rdma_release_rsp(rsp); + } +} + +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; + + WARN_ON(rsp->n_rdma <= 0); + 
atomic_add(rsp->n_rdma, &queue->sq_wr_avail); + rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, rsp->req.sg, + rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); + rsp->n_rdma = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_req_uninit(&rsp->req); + nvmet_rdma_release_rsp(rsp); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + nvmet_req_execute(&rsp->req); +} + +static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, + u64 off) +{ + sg_init_table(&rsp->cmd->inline_sg, 1); + sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off); + rsp->req.sg = &rsp->cmd->inline_sg; + rsp->req.sg_cnt = 1; +} + +static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl; + u64 off = le64_to_cpu(sgl->addr); + u32 len = le32_to_cpu(sgl->length); + + if (!nvme_is_write(rsp->req.cmd)) + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + + if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) { + pr_err("invalid inline data offset!\n"); + return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + } + + /* no data command? */ + if (!len) + return 0; + + nvmet_rdma_use_inline_sg(rsp, len, off); + rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; + rsp->req.transfer_len += len; + return 0; +} + +static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, + struct nvme_keyed_sgl_desc *sgl, bool invalidate) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + u64 addr = le64_to_cpu(sgl->addr); + u32 len = get_unaligned_le24(sgl->length); + u32 key = get_unaligned_le32(sgl->key); + int ret; + + /* no data command? 
*/ + if (!len) + return 0; + + rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt); + if (!rsp->req.sg) + return NVME_SC_INTERNAL; + + ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, + rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, + nvmet_data_dir(&rsp->req)); + if (ret < 0) + return NVME_SC_INTERNAL; + rsp->req.transfer_len += len; + rsp->n_rdma += ret; + + if (invalidate) { + rsp->invalidate_rkey = key; + rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY; + } + + return 0; +} + +static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl; + + switch (sgl->type >> 4) { + case NVME_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_OFFSET: + return nvmet_rdma_map_sgl_inline(rsp); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + case NVME_KEY_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, true); + case NVME_SGL_FMT_ADDRESS: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + default: + pr_err("invalid SGL type: %#x\n", sgl->type); + return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; + } +} + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + if (unlikely(atomic_sub_return(1 + rsp->n_rdma, + &queue->sq_wr_avail) < 0)) { + pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n", + 1 + rsp->n_rdma, queue->idx, + queue->nvme_sq.ctrl->cntlid); + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + return false; + } + + if (nvmet_rdma_need_data_in(rsp)) { + if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, &rsp->read_cqe, NULL)) + nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); + } else { + nvmet_req_execute(&rsp->req); + } + + return true; +} + +static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, + struct nvmet_rdma_rsp *cmd) +{ + u16 status; + + ib_dma_sync_single_for_cpu(queue->dev->device, + cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length, + DMA_FROM_DEVICE); + ib_dma_sync_single_for_cpu(queue->dev->device, + cmd->send_sge.addr, cmd->send_sge.length, + DMA_TO_DEVICE); + + if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, + &queue->nvme_sq, &nvmet_rdma_ops)) + return; + + status = nvmet_rdma_map_sgl(cmd); + if (status) + goto out_err; + + if (unlikely(!nvmet_rdma_execute_command(cmd))) { + spin_lock(&queue->rsp_wr_wait_lock); + list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list); + spin_unlock(&queue->rsp_wr_wait_lock); + } + + return; + +out_err: + nvmet_req_complete(&cmd->req, status); +} + +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_cmd *cmd = + container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_rsp *rsp; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_err("RECV for CQE 0x%p failed with status %s (%d)\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), + wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + if (unlikely(wc->byte_len < sizeof(struct nvme_command))) { + pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n"); + nvmet_rdma_error_comp(queue); + return; + } + + cmd->queue = queue; + rsp = 
nvmet_rdma_get_rsp(queue); + rsp->queue = queue; + rsp->cmd = cmd; + rsp->flags = 0; + rsp->req.cmd = cmd->nvme_cmd; + rsp->req.port = queue->port; + rsp->n_rdma = 0; + + if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state == NVMET_RDMA_Q_CONNECTING) + list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); + else + nvmet_rdma_put_rsp(rsp); + spin_unlock_irqrestore(&queue->state_lock, flags); + return; + } + + nvmet_rdma_handle_command(queue, rsp); +} + +static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev) +{ + if (!ndev->srq) + return; + + nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); + ib_destroy_srq(ndev->srq); +} + +static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) +{ + struct ib_srq_init_attr srq_attr = { NULL, }; + struct ib_srq *srq; + size_t srq_size; + int ret, i; + + srq_size = 4095; /* XXX: tune */ + + srq_attr.attr.max_wr = srq_size; + srq_attr.attr.max_sge = 2; + srq_attr.attr.srq_limit = 0; + srq_attr.srq_type = IB_SRQT_BASIC; + srq = ib_create_srq(ndev->pd, &srq_attr); + if (IS_ERR(srq)) { + /* + * If SRQs aren't supported we just go ahead and use normal + * non-shared receive queues. + */ + pr_info("SRQ requested but not supported.\n"); + return 0; + } + + ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); + if (IS_ERR(ndev->srq_cmds)) { + ret = PTR_ERR(ndev->srq_cmds); + goto out_destroy_srq; + } + + ndev->srq = srq; + ndev->srq_size = srq_size; + + for (i = 0; i < srq_size; i++) + nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + + return 0; + +out_destroy_srq: + ib_destroy_srq(srq); + return ret; +} + +static void nvmet_rdma_free_dev(struct kref *ref) +{ + struct nvmet_rdma_device *ndev = + container_of(ref, struct nvmet_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + nvmet_rdma_destroy_srq(ndev); + ib_dealloc_pd(ndev->pd); + + kfree(ndev); +} + +static struct nvmet_rdma_device * +nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvmet_rdma_device *ndev; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->device->node_guid == cm_id->device->node_guid && + kref_get_unless_zero(&ndev->ref)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + ndev->device = cm_id->device; + kref_init(&ndev->ref); + + ndev->pd = ib_alloc_pd(ndev->device, 0); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + if (nvmet_rdma_use_srq) { + ret = nvmet_rdma_init_srq(ndev); + if (ret) + goto out_free_pd; + } + + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + pr_debug("added %s.\n", ndev->device->name); + return ndev; + +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) +{ + struct ib_qp_init_attr qp_attr; + struct nvmet_rdma_device *ndev = queue->dev; + int comp_vector, nr_cqe, ret, i; + + /* + * Spread the io queues across completion vectors, + * but still keep all admin queues on vector 0. + */ + comp_vector = !queue->host_qid ? 0 : + queue->idx % ndev->device->num_comp_vectors; + + /* + * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND. 
+ */ + nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; + + queue->cq = ib_alloc_cq(ndev->device, queue, + nr_cqe + 1, comp_vector, + IB_POLL_WORKQUEUE); + if (IS_ERR(queue->cq)) { + ret = PTR_ERR(queue->cq); + pr_err("failed to create CQ cqe= %d ret= %d\n", + nr_cqe + 1, ret); + goto out; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_context = queue; + qp_attr.event_handler = nvmet_rdma_qp_event; + qp_attr.send_cq = queue->cq; + qp_attr.recv_cq = queue->cq; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + /* +1 for drain */ + qp_attr.cap.max_send_wr = queue->send_queue_size + 1; + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size; + qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, + ndev->device->attrs.max_sge); + + if (ndev->srq) { + qp_attr.srq = ndev->srq; + } else { + /* +1 for drain */ + qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; + qp_attr.cap.max_recv_sge = 2; + } + + ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); + if (ret) { + pr_err("failed to create_qp ret= %d\n", ret); + goto err_destroy_cq; + } + + atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); + + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", + __func__, queue->cq->cqe, qp_attr.cap.max_send_sge, + qp_attr.cap.max_send_wr, queue->cm_id); + + if (!ndev->srq) { + for (i = 0; i < queue->recv_queue_size; i++) { + queue->cmds[i].queue = queue; + nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + } + } + +out: + return ret; + +err_destroy_cq: + ib_free_cq(queue->cq); + goto out; +} + +static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) +{ + struct ib_qp *qp = queue->cm_id->qp; + + ib_drain_qp(qp); + rdma_destroy_id(queue->cm_id); + ib_destroy_qp(qp); + ib_free_cq(queue->cq); +} + +static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) +{ + pr_debug("freeing queue %d\n", queue->idx); + + nvmet_sq_destroy(&queue->nvme_sq); + + nvmet_rdma_destroy_queue_ib(queue); + if (!queue->dev->srq) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } + nvmet_rdma_free_rsps(queue); + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + kfree(queue); +} + +static void nvmet_rdma_release_queue_work(struct work_struct *w) +{ + struct nvmet_rdma_queue *queue = + container_of(w, struct nvmet_rdma_queue, release_work); + struct nvmet_rdma_device *dev = queue->dev; + + nvmet_rdma_free_queue(queue); + + kref_put(&dev->ref, nvmet_rdma_free_dev); +} + +static int +nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, + struct nvmet_rdma_queue *queue) +{ + struct nvme_rdma_cm_req *req; + + req = (struct nvme_rdma_cm_req *)conn->private_data; + if (!req || conn->private_data_len == 0) + return NVME_RDMA_CM_INVALID_LEN; + + if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0) + return NVME_RDMA_CM_INVALID_RECFMT; + + queue->host_qid = le16_to_cpu(req->qid); + + /* + * req->hsqsize corresponds to our recv queue size plus 1 + * req->hrqsize corresponds to our send queue size + */ + queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1; + queue->send_queue_size = le16_to_cpu(req->hrqsize); + + if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH) + return NVME_RDMA_CM_INVALID_HSQSIZE; + + /* XXX: Should we enforce some kind of max for IO queues? 
*/ + + return 0; +} + +static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, + enum nvme_rdma_cm_status status) +{ + struct nvme_rdma_cm_rej rej; + + pr_debug("rejecting connect request: status %d (%s)\n", + status, nvme_rdma_cm_msg(status)); + + rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + rej.sts = cpu_to_le16(status); + + return rdma_reject(cm_id, (void *)&rej, sizeof(rej)); +} + +static struct nvmet_rdma_queue * +nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, + struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue; + int ret; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_reject; + } + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_queue; + } + + ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue); + if (ret) + goto out_destroy_sq; + + /* + * Schedules the actual release because calling rdma_destroy_id from + * inside a CM callback would trigger a deadlock. (great API design..) + */ + INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work); + queue->dev = ndev; + queue->cm_id = cm_id; + + spin_lock_init(&queue->state_lock); + queue->state = NVMET_RDMA_Q_CONNECTING; + INIT_LIST_HEAD(&queue->rsp_wait_list); + INIT_LIST_HEAD(&queue->rsp_wr_wait_list); + spin_lock_init(&queue->rsp_wr_wait_lock); + INIT_LIST_HEAD(&queue->free_rsps); + spin_lock_init(&queue->rsps_lock); + INIT_LIST_HEAD(&queue->queue_list); + + queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); + if (queue->idx < 0) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_destroy_sq; + } + + ret = nvmet_rdma_alloc_rsps(queue); + if (ret) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_ida_remove; + } + + if (!ndev->srq) { + queue->cmds = nvmet_rdma_alloc_cmds(ndev, + queue->recv_queue_size, + !queue->host_qid); + if (IS_ERR(queue->cmds)) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_responses; + } + } + + ret = nvmet_rdma_create_queue_ib(queue); + if (ret) { + pr_err("%s: creating RDMA queue failed (%d).\n", + __func__, ret); + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_cmds; + } + + return queue; + +out_free_cmds: + if (!ndev->srq) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } +out_free_responses: + nvmet_rdma_free_rsps(queue); +out_ida_remove: + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); +out_destroy_sq: + nvmet_sq_destroy(&queue->nvme_sq); +out_free_queue: + kfree(queue); +out_reject: + nvmet_rdma_cm_reject(cm_id, ret); + return NULL; +} + +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv) +{ + struct nvmet_rdma_queue *queue = priv; + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(queue->cm_id, event->event); + break; + default: + pr_err("received IB QP event: %s (%d)\n", + ib_event_msg(event->event), event->event); + break; + } +} + +static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue, + struct rdma_conn_param *p) +{ + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_rep priv = { }; + int ret = -ENOMEM; + + param.rnr_retry_count = 7; + param.flow_control = 1; + param.initiator_depth = min_t(u8, p->initiator_depth, + queue->dev->device->attrs.max_qp_init_rd_atom); + param.private_data = &priv; + param.private_data_len = sizeof(priv); + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.crqsize = cpu_to_le16(queue->recv_queue_size); + + ret = rdma_accept(cm_id, ¶m); + if (ret) + 
pr_err("rdma_accept failed (error code = %d)\n", ret); + + return ret; +} + +static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_device *ndev; + struct nvmet_rdma_queue *queue; + int ret = -EINVAL; + + ndev = nvmet_rdma_find_get_device(cm_id); + if (!ndev) { + nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC); + return -ECONNREFUSED; + } + + queue = nvmet_rdma_alloc_queue(ndev, cm_id, event); + if (!queue) { + ret = -ENOMEM; + goto put_device; + } + queue->port = cm_id->context; + + if (queue->host_qid == 0) { + /* Let inflight controller teardown complete */ + flush_scheduled_work(); + } + + ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); + if (ret) { + schedule_work(&queue->release_work); + /* Destroying rdma_cm id is not needed here */ + return 0; + } + + mutex_lock(&nvmet_rdma_queue_mutex); + list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + return 0; + +put_device: + kref_put(&ndev->ref, nvmet_rdma_free_dev); + + return ret; +} + +static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue) +{ + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state != NVMET_RDMA_Q_CONNECTING) { + pr_warn("trying to establish a connected queue\n"); + goto out_unlock; + } + queue->state = NVMET_RDMA_Q_LIVE; + + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *cmd; + + cmd = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, wait_list); + list_del(&cmd->wait_list); + + spin_unlock_irqrestore(&queue->state_lock, flags); + nvmet_rdma_handle_command(queue, cmd); + spin_lock_irqsave(&queue->state_lock, flags); + } + +out_unlock: + spin_unlock_irqrestore(&queue->state_lock, flags); +} + +static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + unsigned long flags; + + pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state); + + spin_lock_irqsave(&queue->state_lock, flags); + switch (queue->state) { + case NVMET_RDMA_Q_CONNECTING: + case NVMET_RDMA_Q_LIVE: + queue->state = NVMET_RDMA_Q_DISCONNECTING; + disconnect = true; + break; + case NVMET_RDMA_Q_DISCONNECTING: + break; + } + spin_unlock_irqrestore(&queue->state_lock, flags); + + if (disconnect) { + rdma_disconnect(queue->cm_id); + schedule_work(&queue->release_work); + } +} + +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + + mutex_lock(&nvmet_rdma_queue_mutex); + if (!list_empty(&queue->queue_list)) { + list_del_init(&queue->queue_list); + disconnect = true; + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + if (disconnect) + __nvmet_rdma_queue_disconnect(queue); +} + +static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue) +{ + WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING); + + mutex_lock(&nvmet_rdma_queue_mutex); + if (!list_empty(&queue->queue_list)) + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + pr_err("failed to connect queue %d\n", queue->idx); + schedule_work(&queue->release_work); +} + +/** + * nvme_rdma_device_removal() - Handle RDMA device removal + * @cm_id: rdma_cm id, used for nvmet port + * @queue: nvmet rdma queue (cm id qp_context) + * + * DEVICE_REMOVAL event notifies us that the RDMA device is about + * to unplug. 
Note that this event can be generated on a normal + * queue cm_id and/or a device bound listener cm_id (where in this + * case queue will be null). + * + * We registered an ib_client to handle device removal for queues, + * so we only need to handle the listening port cm_ids. In this case + * we nullify the priv to prevent double cm_id destruction and destroying + * the cm_id implicitly by returning a non-zero rc to the callout. + */ +static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue) +{ + struct nvmet_port *port; + + if (queue) { + /* + * This is a queue cm_id. We have registered + * an ib_client to handle queue removal, + * so don't interfere and just return. + */ + return 0; + } + + port = cm_id->context; + + /* + * This is a listener cm_id. Make sure that + * future remove_port won't invoke a double + * cm_id destroy. Use atomic xchg to make sure + * we don't compete with remove_port. + */ + if (xchg(&port->priv, NULL) != cm_id) + return 0; + + /* + * We need to return 1 so that the core will destroy + * its own ID. What a great API design.. + */ + return 1; +} + +static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue = NULL; + int ret = 0; + + if (cm_id->qp) + queue = cm_id->qp->qp_context; + + pr_debug("%s (%d): status %d id %p\n", + rdma_event_msg(event->event), event->event, + event->status, cm_id); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = nvmet_rdma_queue_connect(cm_id, event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + nvmet_rdma_queue_established(queue); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + nvmet_rdma_queue_disconnect(queue); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + ret = nvmet_rdma_device_removal(cm_id, queue); + break; + case RDMA_CM_EVENT_REJECTED: + pr_debug("Connection rejected: %s\n", + rdma_reject_msg(cm_id, event->status)); + /* FALLTHROUGH */ + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_CONNECT_ERROR: + nvmet_rdma_queue_connect_fail(cm_id, queue); + break; + default: + pr_err("received unrecognized RDMA CM event %d\n", + event->event); + break; + } + + return ret; +} + +static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_rdma_queue *queue; + +restart: + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + if (queue->nvme_sq.ctrl == ctrl) { + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + __nvmet_rdma_queue_disconnect(queue); + goto restart; + } + } + mutex_unlock(&nvmet_rdma_queue_mutex); +} + +static int nvmet_rdma_add_port(struct nvmet_port *port) +{ + struct rdma_cm_id *cm_id; + struct sockaddr_storage addr = { }; + __kernel_sa_family_t af; + int ret; + + switch (port->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + port->disc_addr.adrfam); + return -EINVAL; + } + + ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, + port->disc_addr.trsvcid, &addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + port->disc_addr.traddr, port->disc_addr.trsvcid); + return ret; + } + + cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) { + pr_err("CM ID creation failed\n"); + return PTR_ERR(cm_id); + } + +
/* + * Allow both IPv4 and IPv6 sockets to bind a single port + * at the same time. + */ + ret = rdma_set_afonly(cm_id, 1); + if (ret) { + pr_err("rdma_set_afonly failed (%d)\n", ret); + goto out_destroy_id; + } + + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr); + if (ret) { + pr_err("binding CM ID to %pISpcs failed (%d)\n", + (struct sockaddr *)&addr, ret); + goto out_destroy_id; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + pr_err("listening to %pISpcs failed (%d)\n", + (struct sockaddr *)&addr, ret); + goto out_destroy_id; + } + + pr_info("enabling port %d (%pISpcs)\n", + le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr); + port->priv = cm_id; + return 0; + +out_destroy_id: + rdma_destroy_id(cm_id); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *port) +{ + struct rdma_cm_id *cm_id = xchg(&port->priv, NULL); + + if (cm_id) + rdma_destroy_id(cm_id); +} + +static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, + struct nvmet_port *port, char *traddr) +{ + struct rdma_cm_id *cm_id = port->priv; + + if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) { + struct nvmet_rdma_rsp *rsp = + container_of(req, struct nvmet_rdma_rsp, req); + struct rdma_cm_id *req_cm_id = rsp->queue->cm_id; + struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr; + + sprintf(traddr, "%pISc", addr); + } else { + memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); + } +} + +static const struct nvmet_fabrics_ops nvmet_rdma_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_RDMA, + .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE, + .msdbd = 1, + .has_keyed_sgls = 1, + .add_port = nvmet_rdma_add_port, + .remove_port = nvmet_rdma_remove_port, + .queue_response = nvmet_rdma_queue_response, + .delete_ctrl = nvmet_rdma_delete_ctrl, + .disc_traddr = nvmet_rdma_disc_port_addr, +}; + +static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct nvmet_rdma_queue *queue, *tmp; + struct nvmet_rdma_device *ndev; + bool found = false; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->device == ib_device) { + found = true; + break; + } + } + mutex_unlock(&device_list_mutex); + + if (!found) + return; + + /* + * IB Device that is used by nvmet controllers is being removed, + * delete all queues using this device. 
+ */ + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, + queue_list) { + if (queue->dev->device != ib_device) + continue; + + pr_info("Removing queue %d\n", queue->idx); + list_del_init(&queue->queue_list); + __nvmet_rdma_queue_disconnect(queue); + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + flush_scheduled_work(); +} + +static struct ib_client nvmet_rdma_ib_client = { + .name = "nvmet_rdma", + .remove = nvmet_rdma_remove_one +}; + +static int __init nvmet_rdma_init(void) +{ + int ret; + + ret = ib_register_client(&nvmet_rdma_ib_client); + if (ret) + return ret; + + ret = nvmet_register_transport(&nvmet_rdma_ops); + if (ret) + goto err_ib_client; + + return 0; + +err_ib_client: + ib_unregister_client(&nvmet_rdma_ib_client); + return ret; +} + +static void __exit nvmet_rdma_exit(void) +{ + nvmet_unregister_transport(&nvmet_rdma_ops); + ib_unregister_client(&nvmet_rdma_ib_client); + WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list)); + ida_destroy(&nvmet_rdma_queue_ida); +} + +module_init(nvmet_rdma_init); +module_exit(nvmet_rdma_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */ diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig new file mode 100644 index 0000000..11e89e5 --- /dev/null +++ b/drivers/scsi/Kconfig @@ -0,0 +1,1507 @@ +menu "SCSI device support" + +config SCSI_MOD + tristate + default y if SCSI=n || SCSI=y + default m if SCSI=m + +config RAID_ATTRS + tristate "RAID Transport Class" + default n + depends on BLOCK + depends on SCSI_MOD + ---help--- + Provides RAID + +config SCSI + tristate "SCSI device support" + depends on BLOCK + select SCSI_DMA if HAS_DMA + select SG_POOL + select BLK_SCSI_REQUEST + ---help--- + If you want to use a SCSI hard disk, SCSI tape drive, SCSI CD-ROM or + any other SCSI device under Linux, say Y and make sure that you know + the name of your SCSI host adapter (the card inside your computer + that "speaks" the SCSI protocol, also called SCSI controller), + because you will be asked for it. + + You also need to say Y here if you have a device which speaks + the SCSI protocol. Examples of this include the parallel port + version of the IOMEGA ZIP drive, USB storage devices, Fibre + Channel, and FireWire storage. + + To compile this driver as a module, choose M here and read + . + The module will be called scsi_mod. + + However, do not compile this as a module if your root file system + (the one containing the directory /) is located on a SCSI device. + +config SCSI_DMA + bool + default n + +config SCSI_NETLINK + bool + default n + depends on NET + +config SCSI_MQ_DEFAULT + bool "SCSI: use blk-mq I/O path by default" + depends on SCSI + ---help--- + This option enables the new blk-mq based I/O path for SCSI + devices by default. With the option the scsi_mod.use_blk_mq + module/boot option defaults to Y, without it to N, but it can + still be overridden either way. + + If unsure say N. + +config SCSI_PROC_FS + bool "legacy /proc/scsi/ support" + depends on SCSI && PROC_FS + default y + ---help--- + This option enables support for the various files in + /proc/scsi. In Linux 2.6 this has been superseded by + files in sysfs but many legacy applications rely on this. + + If unsure say Y. 
+ +comment "SCSI support type (disk, tape, CD-ROM)" + depends on SCSI + +config BLK_DEV_SD + tristate "SCSI disk support" + depends on SCSI + ---help--- + If you want to use SCSI hard disks, Fibre Channel disks, + Serial ATA (SATA) or Parallel ATA (PATA) hard disks, + USB storage or the SCSI or parallel port version of + the IOMEGA ZIP drive, say Y and read the SCSI-HOWTO, + the Disk-HOWTO and the Multi-Disk-HOWTO, available from + . This is NOT for SCSI + CD-ROMs. + + To compile this driver as a module, choose M here and read + . + The module will be called sd_mod. + + Do not compile this driver as a module if your root file system + (the one containing the directory /) is located on a SCSI disk. + In this case, do not compile the driver for your SCSI host adapter + (below) as a module either. + +config CHR_DEV_ST + tristate "SCSI tape support" + depends on SCSI + ---help--- + If you want to use a SCSI tape drive under Linux, say Y and read the + SCSI-HOWTO, available from + , and + in the kernel source. This is NOT + for SCSI CD-ROMs. + + To compile this driver as a module, choose M here and read + . The module will be called st. + +config CHR_DEV_OSST + tristate "SCSI OnStream SC-x0 tape support" + depends on SCSI + ---help--- + The OnStream SC-x0 SCSI tape drives cannot be driven by the + standard st driver, but instead need this special osst driver and + use the /dev/osstX char device nodes (major 206). Via usb-storage, + you may be able to drive the USB-x0 and DI-x0 drives as well. + Note that there is also a second generation of OnStream + tape drives (ADR-x0) that supports the standard SCSI-2 commands for + tapes (QIC-157) and can be driven by the standard driver st. + For more information, you may have a look at the SCSI-HOWTO + and + in the kernel source. + More info on the OnStream driver may be found on + + Please also have a look at the standard st docu, as most of it + applies to osst as well. + + To compile this driver as a module, choose M here and read + . The module will be called osst. + +config BLK_DEV_SR + tristate "SCSI CDROM support" + depends on SCSI && BLK_DEV + select CDROM + ---help--- + If you want to use a CD or DVD drive attached to your computer + by SCSI, FireWire, USB or ATAPI, say Y and read the SCSI-HOWTO + and the CDROM-HOWTO at . + + Make sure to say Y or M to "ISO 9660 CD-ROM file system support". + + To compile this driver as a module, choose M here and read + . + The module will be called sr_mod. + +config BLK_DEV_SR_VENDOR + bool "Enable vendor-specific extensions (for SCSI CDROM)" + depends on BLK_DEV_SR + help + This enables the usage of vendor specific SCSI commands. This is + required to support multisession CDs with old NEC/TOSHIBA cdrom + drives (and HP Writers). If you have such a drive and get the first + session only, try saying Y here; everybody else says N. + +config CHR_DEV_SG + tristate "SCSI generic support" + depends on SCSI + ---help--- + If you want to use SCSI scanners, synthesizers or CD-writers or just + about anything having "SCSI" in its name other than hard disks, + CD-ROMs or tapes, say Y here. These won't be supported by the kernel + directly, so you need some additional software which knows how to + talk to these devices using the SCSI protocol: + + For scanners, look at SANE (). For CD + writer software look at Cdrtools + () + and for burning a "disk at once": CDRDAO + (). Cdparanoia is a high + quality digital reader of audio CDs (). 
+ For other devices, it's possible that you'll have to write the + driver software yourself. Please read the file + for more information. + + To compile this driver as a module, choose M here and read + . The module will be called sg. + + If unsure, say N. + +config CHR_DEV_SCH + tristate "SCSI media changer support" + depends on SCSI + ---help--- + This is a driver for SCSI media changers. Most common devices are + tape libraries and MOD/CDROM jukeboxes. *Real* jukeboxes, you + don't need this for those tiny 6-slot cdrom changers. Media + changers are listed as "Type: Medium Changer" in /proc/scsi/scsi. + If you have such hardware and want to use it with linux, say Y + here. Check for details. + + If you want to compile this as a module ( = code which can be + inserted in and removed from the running kernel whenever you want), + say M here and read and + . The module will be called ch.o. + If unsure, say N. + +config SCSI_ENCLOSURE + tristate "SCSI Enclosure Support" + depends on SCSI && ENCLOSURE_SERVICES + depends on m || SCSI_SAS_ATTRS != m + help + Enclosures are devices sitting on or in SCSI backplanes that + manage devices. If you have a disk cage, the chances are that + it has an enclosure device. Selecting this option will just allow + certain enclosure conditions to be reported and is not required. + +config SCSI_CONSTANTS + bool "Verbose SCSI error reporting (kernel size += 36K)" + depends on SCSI + help + The error messages regarding your SCSI hardware will be easier to + understand if you say Y here; it will enlarge your kernel by about + 36 KB. If in doubt, say Y. + +config SCSI_LOGGING + bool "SCSI logging facility" + depends on SCSI + ---help--- + This turns on a logging facility that can be used to debug a number + of SCSI related problems. + + If you say Y here, no logging output will appear by default, but you + can enable logging by saying Y to "/proc file system support" and + "Sysctl support" below and executing the command + + echo > /proc/sys/dev/scsi/logging_level + + where is a four byte value representing the logging type + and logging level for each type of logging selected. + + There are a number of logging types and you can find them in the + source at . The logging levels + are also described in that file and they determine the verbosity of + the logging for each logging type. + + If you say N here, it may be harder to track down some types of SCSI + problems. If you say Y here your kernel will be somewhat larger, but + there should be no noticeable performance impact as long as you have + logging turned off. + +config SCSI_SCAN_ASYNC + bool "Asynchronous SCSI scanning" + depends on SCSI + help + The SCSI subsystem can probe for devices while the rest of the + system continues booting, and even probe devices on different + busses in parallel, leading to a significant speed-up. + + You can override this choice by specifying "scsi_mod.scan=sync" + or async on the kernel's command line. + + Note that this setting also affects whether resuming from + system suspend will be performed asynchronously. + +menu "SCSI Transports" + depends on SCSI + +config SCSI_SPI_ATTRS + tristate "Parallel SCSI (SPI) Transport Attributes" + depends on SCSI + help + If you wish to export transport-specific information about + each attached SCSI device to sysfs, say Y. Otherwise, say N. 
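[Editor's note] The SCSI_LOGGING help above asks for a single integer written to /proc/sys/dev/scsi/logging_level, but the placeholder for that value was lost in this copy of the text. The word packs a small per-type level at a per-type bit offset. A hedged userspace sketch of composing it follows; the shift values mirror drivers/scsi/scsi_logging.h as commonly defined (three bits per logging type) and should be checked against that header rather than taken as authoritative.

#include <stdio.h>

/* Assumed to mirror drivers/scsi/scsi_logging.h: each logging type gets
 * a 3-bit level at its own shift within the 32-bit logging word. */
#define SCSI_LOG_ERROR_SHIFT   0
#define SCSI_LOG_TIMEOUT_SHIFT 3
#define SCSI_LOG_SCAN_SHIFT    6

int main(void)
{
        unsigned int word = (3u << SCSI_LOG_ERROR_SHIFT) |  /* verbose error logging */
                            (1u << SCSI_LOG_SCAN_SHIFT);    /* basic scan logging */

        /* Feed the result to the sysctl described in the help text, e.g.
         *   echo <word> > /proc/sys/dev/scsi/logging_level            */
        printf("echo %u > /proc/sys/dev/scsi/logging_level\n", word);
        return 0;
}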
+ +config SCSI_FC_ATTRS + tristate "FiberChannel Transport Attributes" + depends on SCSI && NET + select BLK_DEV_BSGLIB + select SCSI_NETLINK + help + If you wish to export transport-specific information about + each attached FiberChannel device to sysfs, say Y. + Otherwise, say N. + +config SCSI_ISCSI_ATTRS + tristate "iSCSI Transport Attributes" + depends on SCSI && NET + select BLK_DEV_BSGLIB + help + If you wish to export transport-specific information about + each attached iSCSI device to sysfs, say Y. + Otherwise, say N. + +config SCSI_SAS_ATTRS + tristate "SAS Transport Attributes" + depends on SCSI + select BLK_DEV_BSGLIB + help + If you wish to export transport-specific information about + each attached SAS device to sysfs, say Y. + +source "drivers/scsi/libsas/Kconfig" + +config SCSI_SRP_ATTRS + tristate "SRP Transport Attributes" + depends on SCSI + help + If you wish to export transport-specific information about + each attached SRP device to sysfs, say Y. + +endmenu + +menuconfig SCSI_LOWLEVEL + bool "SCSI low-level drivers" + depends on SCSI!=n + default y + +if SCSI_LOWLEVEL && SCSI + +config ISCSI_TCP + tristate "iSCSI Initiator over TCP/IP" + depends on SCSI && INET + select CRYPTO + select CRYPTO_MD5 + select CRYPTO_CRC32C + select SCSI_ISCSI_ATTRS + help + The iSCSI Driver provides a host with the ability to access storage + through an IP network. The driver uses the iSCSI protocol to transport + SCSI requests and responses over a TCP/IP network between the host + (the "initiator") and "targets". Architecturally, the iSCSI driver + combines with the host's TCP/IP stack, network drivers, and Network + Interface Card (NIC) to provide the same functions as a SCSI or a + Fibre Channel (FC) adapter driver with a Host Bus Adapter (HBA). + + To compile this driver as a module, choose M here: the + module will be called iscsi_tcp. + + The userspace component needed to initialize the driver, documentation, + and sample configuration files can be found here: + + http://open-iscsi.org + +config ISCSI_BOOT_SYSFS + tristate "iSCSI Boot Sysfs Interface" + default n + help + This option enables support for exposing iSCSI boot information + via sysfs to userspace. If you wish to export this information, + say Y. Otherwise, say N. + +source "drivers/scsi/cxgbi/Kconfig" +source "drivers/scsi/bnx2i/Kconfig" +source "drivers/scsi/bnx2fc/Kconfig" +source "drivers/scsi/be2iscsi/Kconfig" +source "drivers/scsi/cxlflash/Kconfig" + +config SGIWD93_SCSI + tristate "SGI WD93C93 SCSI Driver" + depends on SGI_HAS_WD93 && SCSI + help + If you have a Western Digital WD93 SCSI controller on + an SGI MIPS system, say Y. Otherwise, say N. + +config BLK_DEV_3W_XXXX_RAID + tristate "3ware 5/6/7/8xxx ATA-RAID support" + depends on PCI && SCSI + help + 3ware is the only hardware ATA-Raid product in Linux to date. + This card is 2,4, or 8 channel master mode support only. + SCSI support required!!! + + + + Please read the comments at the top of + . + +config SCSI_HPSA + tristate "HP Smart Array SCSI driver" + depends on PCI && SCSI + select CHECK_SIGNATURE + select SCSI_SAS_ATTRS + help + This driver supports HP Smart Array Controllers (circa 2009). + It is a SCSI alternative to the cciss driver, which is a block + driver. Anyone wishing to use HP Smart Array controllers who + would prefer the devices be presented to linux as SCSI devices, + rather than as generic block devices should say Y here. 
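[Editor's note] Every entry under the "SCSI low-level drivers" menu above ultimately does the same thing: it fills in a scsi_host_template and registers a Scsi_Host with the midlayer, which then owns scanning, error handling and the upper-level sd/st/sr drivers. A hedged skeleton of that pattern (against the v4.17 API this tree is based on) is sketched below; all "example_*" names are illustrative and taken from no real driver.

#include <linux/device.h>
#include <linux/module.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_host.h>

static int example_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
{
        /* a real driver hands the command to its hardware here; this stub
         * just fails it so the sketch stays self-contained */
        cmd->result = DID_NO_CONNECT << 16;
        cmd->scsi_done(cmd);
        return 0;
}

static struct scsi_host_template example_sht = {
        .module         = THIS_MODULE,
        .name           = "example_hba",
        .queuecommand   = example_queuecommand,
        .can_queue      = 1,
        .this_id        = -1,
        .sg_tablesize   = SG_ALL,
        .cmd_per_lun    = 1,
};

static int example_probe(struct device *dev)
{
        struct Scsi_Host *shost = scsi_host_alloc(&example_sht, 0);

        if (!shost)
                return -ENOMEM;
        if (scsi_add_host(shost, dev)) {
                scsi_host_put(shost);
                return -ENODEV;
        }
        scsi_scan_host(shost);  /* midlayer probes targets/LUNs behind this host */
        return 0;
}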
+ +config SCSI_3W_9XXX + tristate "3ware 9xxx SATA-RAID support" + depends on PCI && SCSI + help + This driver supports the 9000 series 3ware SATA-RAID cards. + + + + Please read the comments at the top of + . + +config SCSI_3W_SAS + tristate "3ware 97xx SAS/SATA-RAID support" + depends on PCI && SCSI + help + This driver supports the LSI 3ware 9750 6Gb/s SAS/SATA-RAID cards. + + + + Please read the comments at the top of + . + +config SCSI_ACARD + tristate "ACARD SCSI support" + depends on PCI && SCSI + help + This driver supports the ACARD SCSI host adapter. + Support Chip + To compile this driver as a module, choose M here: the + module will be called atp870u. + +config SCSI_AHA152X + tristate "Adaptec AHA152X/2825 support" + depends on ISA && SCSI + select SCSI_SPI_ATTRS + select CHECK_SIGNATURE + ---help--- + This is a driver for the AHA-1510, AHA-1520, AHA-1522, and AHA-2825 + SCSI host adapters. It also works for the AVA-1505, but the IRQ etc. + must be manually specified in this case. + + It is explained in section 3.3 of the SCSI-HOWTO, available from + . You might also want to + read the file . + + To compile this driver as a module, choose M here: the + module will be called aha152x. + +config SCSI_AHA1542 + tristate "Adaptec AHA1542 support" + depends on ISA && SCSI && ISA_DMA_API + ---help--- + This is support for a SCSI host adapter. It is explained in section + 3.4 of the SCSI-HOWTO, available from + . Note that Trantor was + purchased by Adaptec, and some former Trantor products are being + sold under the Adaptec name. If it doesn't work out of the box, you + may have to change some settings in . + + To compile this driver as a module, choose M here: the + module will be called aha1542. + +config SCSI_AHA1740 + tristate "Adaptec AHA1740 support" + depends on EISA && SCSI + ---help--- + This is support for a SCSI host adapter. It is explained in section + 3.5 of the SCSI-HOWTO, available from + . If it doesn't work out + of the box, you may have to change some settings in + . + + To compile this driver as a module, choose M here: the + module will be called aha1740. + +config SCSI_AACRAID + tristate "Adaptec AACRAID support" + depends on SCSI && PCI + help + This driver supports a variety of Dell, HP, Adaptec, IBM and + ICP storage products. For a list of supported products, refer + to . + + To compile this driver as a module, choose M here: the module + will be called aacraid. + + +source "drivers/scsi/aic7xxx/Kconfig.aic7xxx" +source "drivers/scsi/aic7xxx/Kconfig.aic79xx" +source "drivers/scsi/aic94xx/Kconfig" +source "drivers/scsi/hisi_sas/Kconfig" +source "drivers/scsi/mvsas/Kconfig" + +config SCSI_MVUMI + tristate "Marvell UMI driver" + depends on SCSI && PCI + help + Module for Marvell Universal Message Interface(UMI) driver + + To compile this driver as a module, choose M here: the + module will be called mvumi. + +config SCSI_DPT_I2O + tristate "Adaptec I2O RAID support " + depends on SCSI && PCI && VIRT_TO_BUS + help + This driver supports all of Adaptec's I2O based RAID controllers as + well as the DPT SmartRaid V cards. This is an Adaptec maintained + driver by Deanna Bonds. See . + + To compile this driver as a module, choose M here: the + module will be called dpt_i2o. + +config SCSI_ADVANSYS + tristate "AdvanSys SCSI support" + depends on SCSI + depends on ISA || EISA || PCI + depends on ISA_DMA_API || !ISA + help + This is a driver for all SCSI host adapters manufactured by + AdvanSys. It is documented in the kernel source in + . 
+ + To compile this driver as a module, choose M here: the + module will be called advansys. + +config SCSI_ARCMSR + tristate "ARECA (ARC11xx/12xx/13xx/16xx) SATA/SAS RAID Host Adapter" + depends on PCI && SCSI + help + This driver supports all of ARECA's SATA/SAS RAID controller cards. + This is an ARECA-maintained driver by Erich Chen. + If you have any problems, please mail to: . + Areca supports Linux RAID config tools. + Please link + + To compile this driver as a module, choose M here: the + module will be called arcmsr (modprobe arcmsr). + +source "drivers/scsi/esas2r/Kconfig" +source "drivers/scsi/megaraid/Kconfig.megaraid" +source "drivers/scsi/mpt3sas/Kconfig" +source "drivers/scsi/smartpqi/Kconfig" +source "drivers/scsi/ufs/Kconfig" + +config SCSI_HPTIOP + tristate "HighPoint RocketRAID 3xxx/4xxx Controller support" + depends on SCSI && PCI + help + This option enables support for HighPoint RocketRAID 3xxx/4xxx + controllers. + + To compile this driver as a module, choose M here; the module + will be called hptiop. If unsure, say N. + +config SCSI_BUSLOGIC + tristate "BusLogic SCSI support" + depends on (PCI || ISA || MCA) && SCSI && ISA_DMA_API && VIRT_TO_BUS + ---help--- + This is support for BusLogic MultiMaster and FlashPoint SCSI Host + Adapters. Consult the SCSI-HOWTO, available from + , and the files + and + for more information. + Note that support for FlashPoint is only available for 32-bit + x86 configurations. + + To compile this driver as a module, choose M here: the + module will be called BusLogic. + +config SCSI_FLASHPOINT + bool "FlashPoint support" + depends on SCSI_BUSLOGIC && PCI + help + This option allows you to add FlashPoint support to the + BusLogic SCSI driver. The FlashPoint SCCB Manager code is + substantial, so users of MultiMaster Host Adapters may not + wish to include it. + +config VMWARE_PVSCSI + tristate "VMware PVSCSI driver support" + depends on PCI && SCSI && X86 + help + This driver supports VMware's para virtualized SCSI HBA. + To compile this driver as a module, choose M here: the + module will be called vmw_pvscsi. + +config XEN_SCSI_FRONTEND + tristate "XEN SCSI frontend driver" + depends on SCSI && XEN + select XEN_XENBUS_FRONTEND + help + The XEN SCSI frontend driver allows the kernel to access SCSI Devices + within another guest OS (usually Dom0). + Only needed if the kernel is running in a XEN guest and generic + SCSI access to a device is needed. + +config HYPERV_STORAGE + tristate "Microsoft Hyper-V virtual storage driver" + depends on SCSI && HYPERV + depends on m || SCSI_FC_ATTRS != m + default HYPERV + help + Select this option to enable the Hyper-V virtual storage driver. + +config LIBFC + tristate "LibFC module" + depends on SCSI_FC_ATTRS + select CRC32 + ---help--- + Fibre Channel library module + +config LIBFCOE + tristate "LibFCoE module" + depends on LIBFC + ---help--- + Library for Fibre Channel over Ethernet module + +config FCOE + tristate "FCoE module" + depends on PCI + depends on LIBFCOE + ---help--- + Fibre Channel over Ethernet module + +config FCOE_FNIC + tristate "Cisco FNIC Driver" + depends on PCI && X86 + depends on LIBFCOE + help + This is support for the Cisco PCI-Express FCoE HBA. + + To compile this driver as a module, choose M here and read + . + The module will be called fnic. + +config SCSI_SNIC + tristate "Cisco SNIC Driver" + depends on PCI && SCSI + help + This is support for the Cisco PCI-Express SCSI HBA. + + To compile this driver as a module, choose M here and read + . 
+ The module will be called snic. + +config SCSI_SNIC_DEBUG_FS + bool "Cisco SNIC Driver Debugfs Support" + depends on SCSI_SNIC && DEBUG_FS + help + This enables to list debugging information from SNIC Driver + available via debugfs file system + +config SCSI_DMX3191D + tristate "DMX3191D SCSI support" + depends on PCI && SCSI + select SCSI_SPI_ATTRS + help + This is support for Domex DMX3191D SCSI Host Adapters. + + To compile this driver as a module, choose M here: the + module will be called dmx3191d. + +config SCSI_GDTH + tristate "Intel/ICP (former GDT SCSI Disk Array) RAID Controller support" + depends on (ISA || EISA || PCI) && SCSI && ISA_DMA_API + ---help--- + Formerly called GDT SCSI Disk Array Controller Support. + + This is a driver for RAID/SCSI Disk Array Controllers (EISA/ISA/PCI) + manufactured by Intel Corporation/ICP vortex GmbH. It is documented + in the kernel source in and + . + + To compile this driver as a module, choose M here: the + module will be called gdth. + +config SCSI_ISCI + tristate "Intel(R) C600 Series Chipset SAS Controller" + depends on PCI && SCSI + depends on X86 + select SCSI_SAS_LIBSAS + ---help--- + This driver supports the 6Gb/s SAS capabilities of the storage + control unit found in the Intel(R) C600 series chipset. + +config SCSI_GENERIC_NCR5380 + tristate "Generic NCR5380/53c400 SCSI ISA card support" + depends on ISA && SCSI && HAS_IOPORT_MAP + select SCSI_SPI_ATTRS + ---help--- + This is a driver for old ISA card SCSI controllers based on a + NCR 5380, 53C80, 53C400, 53C400A, or DTC 436 device. + Most boards such as the Trantor T130 fit this category, as do + various 8-bit and 16-bit ISA cards bundled with SCSI scanners. + + To compile this driver as a module, choose M here: the + module will be called g_NCR5380. + +config SCSI_IPS + tristate "IBM ServeRAID support" + depends on PCI && SCSI + ---help--- + This is support for the IBM ServeRAID hardware RAID controllers. + See + and + for more information. If this driver does not work correctly + without modification please contact the author by email at + . + + To compile this driver as a module, choose M here: the + module will be called ips. + +config SCSI_IBMVSCSI + tristate "IBM Virtual SCSI support" + depends on PPC_PSERIES + select SCSI_SRP_ATTRS + help + This is the IBM POWER Virtual SCSI Client + + To compile this driver as a module, choose M here: the + module will be called ibmvscsi. + +config SCSI_IBMVSCSIS + tristate "IBM Virtual SCSI Server support" + depends on PPC_PSERIES && TARGET_CORE && SCSI && PCI + help + This is the IBM POWER Virtual SCSI Target Server + This driver uses the SRP protocol for communication between servers + guest and/or the host that run on the same server. + More information on VSCSI protocol can be found at www.power.org + + The userspace configuration needed to initialize the driver can be + be found here: + + https://github.com/powervm/ibmvscsis/wiki/Configuration + + To compile this driver as a module, choose M here: the + module will be called ibmvscsis. + +config SCSI_IBMVFC + tristate "IBM Virtual FC support" + depends on PPC_PSERIES && SCSI + depends on SCSI_FC_ATTRS + help + This is the IBM POWER Virtual FC Client + + To compile this driver as a module, choose M here: the + module will be called ibmvfc. + +config SCSI_IBMVFC_TRACE + bool "enable driver internal trace" + depends on SCSI_IBMVFC + default y + help + If you say Y here, the driver will trace all commands issued + to the adapter. Performance impact is minimal. 
Trace can be + dumped using /sys/class/scsi_host/hostXX/trace. + +config SCSI_INITIO + tristate "Initio 9100U(W) support" + depends on PCI && SCSI + help + This is support for the Initio 91XXU(W) SCSI host adapter. Please + read the SCSI-HOWTO, available from + . + + To compile this driver as a module, choose M here: the + module will be called initio. + +config SCSI_INIA100 + tristate "Initio INI-A100U2W support" + depends on PCI && SCSI + help + This is support for the Initio INI-A100U2W SCSI host adapter. + Please read the SCSI-HOWTO, available from + . + + To compile this driver as a module, choose M here: the + module will be called a100u2w. + +config SCSI_PPA + tristate "IOMEGA parallel port (ppa - older drives)" + depends on SCSI && PARPORT_PC + ---help--- + This driver supports older versions of IOMEGA's parallel port ZIP + drive (a 100 MB removable media device). + + Note that you can say N here if you have the SCSI version of the ZIP + drive: it will be supported automatically if you said Y to the + generic "SCSI disk support", above. + + If you have the ZIP Plus drive or a more recent parallel port ZIP + drive (if the supplied cable with the drive is labeled "AutoDetect") + then you should say N here and Y to "IOMEGA parallel port (imm - + newer drives)", below. + + For more information about this driver and how to use it you should + read the file . You should also read + the SCSI-HOWTO, which is available from + . If you use this driver, + you will still be able to use the parallel port for other tasks, + such as a printer; it is safe to compile both drivers into the + kernel. + + To compile this driver as a module, choose M here: the + module will be called ppa. + +config SCSI_IMM + tristate "IOMEGA parallel port (imm - newer drives)" + depends on SCSI && PARPORT_PC + ---help--- + This driver supports newer versions of IOMEGA's parallel port ZIP + drive (a 100 MB removable media device). + + Note that you can say N here if you have the SCSI version of the ZIP + drive: it will be supported automatically if you said Y to the + generic "SCSI disk support", above. + + If you have the ZIP Plus drive or a more recent parallel port ZIP + drive (if the supplied cable with the drive is labeled "AutoDetect") + then you should say Y here; if you have an older ZIP drive, say N + here and Y to "IOMEGA Parallel Port (ppa - older drives)", above. + + For more information about this driver and how to use it you should + read the file . You should also read + the SCSI-HOWTO, which is available from + . If you use this driver, + you will still be able to use the parallel port for other tasks, + such as a printer; it is safe to compile both drivers into the + kernel. + + To compile this driver as a module, choose M here: the + module will be called imm. + +config SCSI_IZIP_EPP16 + bool "ppa/imm option - Use slow (but safe) EPP-16" + depends on SCSI_PPA || SCSI_IMM + ---help--- + EPP (Enhanced Parallel Port) is a standard for parallel ports which + allows them to act as expansion buses that can handle up to 64 + peripheral devices. + + Some parallel port chipsets are slower than their motherboard, and + so we have to control the state of the chipset's FIFO queue every + now and then to avoid data loss. This will be done if you say Y + here. + + Generally, saying Y is the safe option and slows things down a bit. 
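[Editor's note] The ppa/imm entries above point out that the parallel port stays usable for a printer even with these drivers built in. That works because they sit on top of the parport layer and only hold the port around each SCSI transaction. A hedged sketch of that claim/release pattern is below; the helper name is illustrative and the error check on parport_claim_or_block() is an assumption to verify against the parport API.

#include <linux/parport.h>

/* Claim the shared port, talk to the drive, then give the port back so
 * other parport clients (e.g. lp) can use it between transactions. */
static int example_do_transaction(struct pardevice *pdev)
{
        if (parport_claim_or_block(pdev) < 0)
                return -EBUSY;

        /* ... drive the IEEE 1284 signals to move SCSI data here ... */

        parport_release(pdev);
        return 0;
}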
+ +config SCSI_IZIP_SLOW_CTR + bool "ppa/imm option - Assume slow parport control register" + depends on SCSI_PPA || SCSI_IMM + help + Some parallel ports are known to have excessive delays between + changing the parallel port control register and good data being + available on the parallel port data/status register. This option + forces a small delay (1.0 usec to be exact) after changing the + control register to let things settle out. Enabling this option may + result in a big drop in performance but some very old parallel ports + (found in 386 vintage machines) will not work properly. + + Generally, saying N is fine. + +config SCSI_NCR_D700 + tristate "NCR Dual 700 MCA SCSI support" + depends on MCA && SCSI + select SCSI_SPI_ATTRS + help + This is a driver for the MicroChannel Dual 700 card produced by + NCR and commonly used in 345x/35xx/4100 class machines. It always + tries to negotiate sync and uses tag command queueing. + + Unless you have an NCR manufactured machine, the chances are that + you do not have this SCSI card, so say N. + +config SCSI_LASI700 + tristate "HP Lasi SCSI support for 53c700/710" + depends on GSC && SCSI + select SCSI_SPI_ATTRS + help + This is a driver for the SCSI controller in the Lasi chip found in + many PA-RISC workstations & servers. If you do not know whether you + have a Lasi chip, it is safe to say "Y" here. + +config SCSI_SNI_53C710 + tristate "SNI RM SCSI support for 53c710" + depends on SNI_RM && SCSI + select SCSI_SPI_ATTRS + select 53C700_LE_ON_BE + help + This is a driver for the onboard SCSI controller found in older + SNI RM workstations & servers. + +config 53C700_LE_ON_BE + bool + depends on SCSI_LASI700 + default y + +config SCSI_STEX + tristate "Promise SuperTrak EX Series support" + depends on PCI && SCSI + ---help--- + This driver supports Promise SuperTrak EX series storage controllers. + + Promise provides Linux RAID configuration utility for these + controllers. Please visit to download. + + To compile this driver as a module, choose M here: the + module will be called stex. + +config 53C700_BE_BUS + bool + depends on SCSI_A4000T || SCSI_ZORRO7XX || MVME16x_SCSI || BVME6000_SCSI + default y + +config SCSI_SYM53C8XX_2 + tristate "SYM53C8XX Version 2 SCSI support" + depends on PCI && SCSI + select SCSI_SPI_ATTRS + ---help--- + This driver supports the whole NCR53C8XX/SYM53C8XX family of + PCI-SCSI controllers. It also supports the subset of LSI53C10XX + Ultra-160 controllers that are based on the SYM53C8XX SCRIPTS + language. It does not support LSI53C10XX Ultra-320 PCI-X SCSI + controllers; you need to use the Fusion MPT driver for that. + + Please read for more + information. + +config SCSI_SYM53C8XX_DMA_ADDRESSING_MODE + int "DMA addressing mode" + depends on SCSI_SYM53C8XX_2 + default "1" + ---help--- + This option only applies to PCI-SCSI chips that are PCI DAC + capable (875A, 895A, 896, 1010-33, 1010-66, 1000). + + When set to 0, the driver will program the chip to only perform + 32-bit DMA. When set to 1, the chip will be able to perform DMA + to addresses up to 1TB. When set to 2, the driver supports the + full 64-bit DMA address range, but can only address 16 segments + of 4 GB each. This limits the total addressable range to 64 GB. + + Most machines with less than 4GB of memory should use a setting + of 0 for best performance. If your machine has 4GB of memory + or more, you should set this option to 1 (the default). 
+ + The still experimental value 2 (64 bit DMA addressing with 16 + x 4GB segments limitation) can be used on systems that require + PCI address bits past bit 39 to be set for the addressing of + memory using PCI DAC cycles. + +config SCSI_SYM53C8XX_DEFAULT_TAGS + int "Default tagged command queue depth" + depends on SCSI_SYM53C8XX_2 + default "16" + help + This is the default value of the command queue depth the + driver will announce to the generic SCSI layer for devices + that support tagged command queueing. This value can be changed + from the boot command line. This is a soft limit that cannot + exceed CONFIG_SCSI_SYM53C8XX_MAX_TAGS. + +config SCSI_SYM53C8XX_MAX_TAGS + int "Maximum number of queued commands" + depends on SCSI_SYM53C8XX_2 + default "64" + help + This option allows you to specify the maximum number of commands + that can be queued to any device, when tagged command queuing is + possible. The driver supports up to 256 queued commands per device. + This value is used as a compiled-in hard limit. + +config SCSI_SYM53C8XX_MMIO + bool "Use memory mapped IO" + depends on SCSI_SYM53C8XX_2 + default y + help + Memory mapped IO is faster than Port IO. Most people should + answer Y here, but some machines may have problems. If you have + to answer N here, please report the problem to the maintainer. + +config SCSI_IPR + tristate "IBM Power Linux RAID adapter support" + depends on PCI && SCSI && ATA + select FW_LOADER + select IRQ_POLL + select SGL_ALLOC + ---help--- + This driver supports the IBM Power Linux family RAID adapters. + This includes IBM pSeries 5712, 5703, 5709, and 570A, as well + as IBM iSeries 5702, 5703, 5709, and 570A. + +config SCSI_IPR_TRACE + bool "enable driver internal trace" + depends on SCSI_IPR + default y + help + If you say Y here, the driver will trace all commands issued + to the adapter. Performance impact is minimal. Trace can be + dumped using /sys/bus/class/scsi_host/hostXX/trace. + +config SCSI_IPR_DUMP + bool "enable adapter dump support" + depends on SCSI_IPR + default y + help + If you say Y here, the driver will support adapter crash dump. + If you enable this support, the iprdump daemon can be used + to capture adapter failure analysis information. + +config SCSI_ZALON + tristate "Zalon SCSI support" + depends on GSC && SCSI + select SCSI_SPI_ATTRS + help + The Zalon is a GSC/HSC bus interface chip that sits between the + PA-RISC processor and the NCR 53c720 SCSI controller on C100, + C110, J200, J210 and some D, K & R-class machines. It's also + used on the add-in Bluefish, Barracuda & Shrike SCSI cards. + Say Y here if you have one of these machines or cards. + +config SCSI_NCR_Q720 + tristate "NCR Quad 720 MCA SCSI support" + depends on MCA && SCSI + select SCSI_SPI_ATTRS + help + This is a driver for the MicroChannel Quad 720 card produced by + NCR and commonly used in 345x/35xx/4100 class machines. It always + tries to negotiate sync and uses tag command queueing. + + Unless you have an NCR manufactured machine, the chances are that + you do not have this SCSI card, so say N. + +config SCSI_NCR53C8XX_DEFAULT_TAGS + int "default tagged command queue depth" + depends on SCSI_ZALON || SCSI_NCR_Q720 + default "8" + ---help--- + "Tagged command queuing" is a feature of SCSI-2 which improves + performance: the host adapter can send several SCSI commands to a + device's queue even if previous commands haven't finished yet. 
+ Because the device is intelligent, it can optimize its operations + (like head positioning) based on its own request queue. Some SCSI + devices don't implement this properly; if you want to disable this + feature, enter 0 or 1 here (it doesn't matter which). + + The default value is 8 and should be supported by most hard disks. + This value can be overridden from the boot command line using the + 'tags' option as follows (example): + 'ncr53c8xx=tags:4/t2t3q16/t0u2q10' will set default queue depth to + 4, set queue depth to 16 for target 2 and target 3 on controller 0 + and set queue depth to 10 for target 0 / lun 2 on controller 1. + + The normal answer therefore is to go with the default 8 and to use + a boot command line option for devices that need to use a different + command queue depth. + + There is no safe option other than using good SCSI devices. + +config SCSI_NCR53C8XX_MAX_TAGS + int "maximum number of queued commands" + depends on SCSI_ZALON || SCSI_NCR_Q720 + default "32" + ---help--- + This option allows you to specify the maximum number of commands + that can be queued to any device, when tagged command queuing is + possible. The default value is 32. Minimum is 2, maximum is 64. + Modern hard disks are able to support 64 tags and even more, but + do not seem to be faster when more than 32 tags are being used. + + So, the normal answer here is to go with the default value 32 unless + you are using very large hard disks with large cache (>= 1 MB) that + are able to take advantage of more than 32 tagged commands. + + There is no safe option and the default answer is recommended. + +config SCSI_NCR53C8XX_SYNC + int "synchronous transfers frequency in MHz" + depends on SCSI_ZALON || SCSI_NCR_Q720 + default "20" + ---help--- + The SCSI Parallel Interface-2 Standard defines 5 classes of transfer + rates: FAST-5, FAST-10, FAST-20, FAST-40 and FAST-80. The numbers + are respectively the maximum data transfer rates in mega-transfers + per second for each class. For example, a FAST-20 Wide 16 device is + able to transfer data at 20 million 16 bit packets per second for a + total rate of 40 MB/s. + + You may specify 0 if you want to only use asynchronous data + transfers. This is the safest and slowest option. Otherwise, specify + a value between 5 and 80, depending on the capability of your SCSI + controller. The higher the number, the faster the data transfer. + Note that 80 should normally be ok since the driver decreases the + value automatically according to the controller's capabilities. + + Your answer to this question is ignored for controllers with NVRAM, + since the driver will get this information from the user set-up. It + also can be overridden using a boot setup option, as follows + (example): 'ncr53c8xx=sync:12' will allow the driver to negotiate + for FAST-20 synchronous data transfer (20 mega-transfers per + second). + + The normal answer therefore is not to go with the default but to + select the maximum value 80 allowing the driver to use the maximum + value supported by each controller. If this causes problems with + your SCSI devices, you should come back and decrease the value. + + There is no safe option other than using good cabling, right + terminations and SCSI conformant devices. 
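[Editor's note] The tag-depth options above describe a soft default (the 'tags' boot option / SCSI_NCR53C8XX_DEFAULT_TAGS) that can never exceed the compile-time hard limit (SCSI_NCR53C8XX_MAX_TAGS). A minimal sketch of that clamp, with an illustrative helper name rather than the driver's real one and assuming the option is enabled so the CONFIG_ macro exists:

#include <linux/kernel.h>

/* Boot-line or default queue depth is honoured only up to the
 * compile-time maximum; a request of 0 or 1 leaves tagged queuing
 * effectively disabled, as described in the help text. */
static unsigned int example_effective_tags(unsigned int requested)
{
        return clamp_t(unsigned int, requested, 1,
                       CONFIG_SCSI_NCR53C8XX_MAX_TAGS);
}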
+ +config SCSI_NCR53C8XX_NO_DISCONNECT + bool "not allow targets to disconnect" + depends on (SCSI_ZALON || SCSI_NCR_Q720) && SCSI_NCR53C8XX_DEFAULT_TAGS=0 + help + This option is only provided for safety if you suspect some SCSI + device of yours to not support properly the target-disconnect + feature. In that case, you would say Y here. In general however, to + not allow targets to disconnect is not reasonable if there is more + than 1 device on a SCSI bus. The normal answer therefore is N. + +config SCSI_QLOGIC_FAS + tristate "Qlogic FAS SCSI support" + depends on ISA && SCSI + ---help--- + This is a driver for the ISA, VLB, and PCMCIA versions of the Qlogic + FastSCSI! cards as well as any other card based on the FASXX chip + (including the Control Concepts SCSI/IDE/SIO/PIO/FDC cards). + + This driver does NOT support the PCI versions of these cards. The + PCI versions are supported by the Qlogic ISP driver ("Qlogic ISP + SCSI support"), below. + + Information about this driver is contained in + . You should also read the + SCSI-HOWTO, available from + . + + To compile this driver as a module, choose M here: the + module will be called qlogicfas. + +config SCSI_QLOGIC_1280 + tristate "Qlogic QLA 1240/1x80/1x160 SCSI support" + depends on PCI && SCSI + help + Say Y if you have a QLogic ISP1240/1x80/1x160 SCSI host adapter. + + To compile this driver as a module, choose M here: the + module will be called qla1280. + +config SCSI_QLOGICPTI + tristate "PTI Qlogic, ISP Driver" + depends on SBUS && SCSI + help + This driver supports SBUS SCSI controllers from PTI or QLogic. These + controllers are known under Solaris as qpti and in the openprom as + PTI,ptisp or QLGC,isp. Note that PCI QLogic SCSI controllers are + driven by a different driver. + + To compile this driver as a module, choose M here: the + module will be called qlogicpti. + +source "drivers/scsi/qla2xxx/Kconfig" +source "drivers/scsi/qla4xxx/Kconfig" +source "drivers/scsi/qedi/Kconfig" +source "drivers/scsi/qedf/Kconfig" + +config SCSI_LPFC + tristate "Emulex LightPulse Fibre Channel Support" + depends on PCI && SCSI + depends on SCSI_FC_ATTRS + depends on NVME_TARGET_FC || NVME_TARGET_FC=n + depends on NVME_FC || NVME_FC=n + select CRC_T10DIF + ---help--- + This lpfc driver supports the Emulex LightPulse + Family of Fibre Channel PCI host adapters. + +config SCSI_LPFC_DEBUG_FS + bool "Emulex LightPulse Fibre Channel debugfs Support" + depends on SCSI_LPFC && DEBUG_FS + ---help--- + This makes debugging information from the lpfc driver + available via the debugfs filesystem. + +config SCSI_SIM710 + tristate "Simple 53c710 SCSI support (Compaq, NCR machines)" + depends on (EISA || MCA) && SCSI + select SCSI_SPI_ATTRS + ---help--- + This driver is for NCR53c710 based SCSI host adapters. + + It currently supports Compaq EISA cards and NCR MCA cards + +config SCSI_DC395x + tristate "Tekram DC395(U/UW/F) and DC315(U) SCSI support" + depends on PCI && SCSI + ---help--- + This driver supports PCI SCSI host adapters based on the ASIC + TRM-S1040 chip, e.g Tekram DC395(U/UW/F) and DC315(U) variants. + + This driver works, but is still in experimental status. So better + have a bootable disk and a backup in case of emergency. + + Documentation can be found in . + + To compile this driver as a module, choose M here: the + module will be called dc395x. 
+ +config SCSI_AM53C974 + tristate "Tekram DC390(T) and Am53/79C974 SCSI support (new driver)" + depends on PCI && SCSI + select SCSI_SPI_ATTRS + ---help--- + This driver supports PCI SCSI host adapters based on the Am53C974A + chip, e.g. Tekram DC390(T), DawiControl 2974 and some onboard + PCscsi/PCnet (Am53/79C974) solutions. + This is a new implementation base on the generic esp_scsi driver. + + Documentation can be found in . + + Note that this driver does NOT support Tekram DC390W/U/F, which are + based on NCR/Symbios chips. Use "NCR53C8XX SCSI support" for those. + + To compile this driver as a module, choose M here: the + module will be called am53c974. + +config SCSI_NSP32 + tristate "Workbit NinjaSCSI-32Bi/UDE support" + depends on PCI && SCSI && !64BIT + help + This is support for the Workbit NinjaSCSI-32Bi/UDE PCI/Cardbus + SCSI host adapter. Please read the SCSI-HOWTO, available from + . + + To compile this driver as a module, choose M here: the + module will be called nsp32. + +config SCSI_WD719X + tristate "Western Digital WD7193/7197/7296 support" + depends on PCI && SCSI + select EEPROM_93CX6 + ---help--- + This is a driver for Western Digital WD7193, WD7197 and WD7296 PCI + SCSI controllers (based on WD33C296A chip). + +config SCSI_DEBUG + tristate "SCSI debugging host and device simulator" + depends on SCSI + select CRC_T10DIF + help + This pseudo driver simulates one or more hosts (SCSI initiators), + each with one or more targets, each with one or more logical units. + Defaults to one of each, creating a small RAM disk device. Many + parameters found in the /sys/bus/pseudo/drivers/scsi_debug + directory can be tweaked at run time. + See for more information. + Mainly used for testing and best as a module. If unsure, say N. + +config SCSI_MESH + tristate "MESH (Power Mac internal SCSI) support" + depends on PPC32 && PPC_PMAC && SCSI + help + Many Power Macintoshes and clones have a MESH (Macintosh Enhanced + SCSI Hardware) SCSI bus adaptor (the 7200 doesn't, but all of the + other Power Macintoshes do). Say Y to include support for this SCSI + adaptor. + + To compile this driver as a module, choose M here: the + module will be called mesh. + +config SCSI_MESH_SYNC_RATE + int "maximum synchronous transfer rate (MB/s) (0 = async)" + depends on SCSI_MESH + default "5" + help + On Power Macintoshes (and clones) where the MESH SCSI bus adaptor + drives a bus which is entirely internal to the machine (such as the + 7500, 7600, 8500, etc.), the MESH is capable of synchronous + operation at up to 10 MB/s. On machines where the SCSI bus + controlled by the MESH can have external devices connected, it is + usually rated at 5 MB/s. 5 is a safe value here unless you know the + MESH SCSI bus is internal only; in that case you can say 10. Say 0 + to disable synchronous operation. + +config SCSI_MESH_RESET_DELAY_MS + int "initial bus reset delay (ms) (0 = no reset)" + depends on SCSI_MESH + default "4000" + +config SCSI_MAC53C94 + tristate "53C94 (Power Mac external SCSI) support" + depends on PPC32 && PPC_PMAC && SCSI + help + On Power Macintoshes (and clones) with two SCSI buses, the external + SCSI bus is usually controlled by a 53C94 SCSI bus adaptor. Older + machines which only have one SCSI bus, such as the 7200, also use + the 53C94. Say Y to include support for the 53C94. + + To compile this driver as a module, choose M here: the + module will be called mac53c94. 
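[Editor's note] The SCSI_DEBUG entry above notes that most of the simulator's knobs can be changed at run time through /sys/bus/pseudo/drivers/scsi_debug. A small hedged userspace sketch of poking one of them follows; max_luns is assumed to exist as on mainline scsi_debug, so substitute whatever attribute you actually need.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *attr = "/sys/bus/pseudo/drivers/scsi_debug/max_luns";
        const char *val = "4\n";
        int fd = open(attr, O_WRONLY);

        if (fd < 0) {
                perror(attr);
                return 1;
        }
        if (write(fd, val, strlen(val)) < 0)
                perror("write");
        close(fd);
        return 0;
}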
+ +source "drivers/scsi/arm/Kconfig" + +config JAZZ_ESP + bool "MIPS JAZZ FAS216 SCSI support" + depends on MACH_JAZZ && SCSI + select SCSI_SPI_ATTRS + help + This is the driver for the onboard SCSI host adapter of MIPS Magnum + 4000, Acer PICA, Olivetti M700-10 and a few other identical OEM + systems. + +config A3000_SCSI + tristate "A3000 WD33C93A support" + depends on AMIGA && SCSI + help + If you have an Amiga 3000 and have SCSI devices connected to the + built-in SCSI controller, say Y. Otherwise, say N. + + To compile this driver as a module, choose M here: the + module will be called a3000. + +config A2091_SCSI + tristate "A2091/A590 WD33C93A support" + depends on ZORRO && SCSI + help + If you have a Commodore A2091 SCSI controller, say Y. Otherwise, + say N. + + To compile this driver as a module, choose M here: the + module will be called a2091. + +config GVP11_SCSI + tristate "GVP Series II WD33C93A support" + depends on ZORRO && SCSI + ---help--- + If you have a Great Valley Products Series II SCSI controller, + answer Y. Also say Y if you have a later model of GVP SCSI + controller (such as the GVP A4008 or a Combo board). Otherwise, + answer N. This driver does NOT work for the T-Rex series of + accelerators from TekMagic and GVP-M. + + To compile this driver as a module, choose M here: the + module will be called gvp11. + +config SCSI_A4000T + tristate "A4000T NCR53c710 SCSI support" + depends on AMIGA && SCSI + select SCSI_SPI_ATTRS + help + If you have an Amiga 4000T and have SCSI devices connected to the + built-in SCSI controller, say Y. Otherwise, say N. + + To compile this driver as a module, choose M here: the + module will be called a4000t. + +config SCSI_ZORRO7XX + tristate "Zorro NCR53c710 SCSI support" + depends on ZORRO && SCSI + select SCSI_SPI_ATTRS + help + Support for various NCR53c710-based SCSI controllers on Zorro + expansion boards for the Amiga. + This includes: + - the Amiga 4091 Zorro III SCSI-2 controller, + - the MacroSystem Development's WarpEngine Amiga SCSI-2 controller + (info at + ), + - the SCSI controller on the Phase5 Blizzard PowerUP 603e+ + accelerator card for the Amiga 1200, + - the SCSI controller on the GVP Turbo 040/060 accelerator. + +config ATARI_SCSI + tristate "Atari native SCSI support" + depends on ATARI && SCSI + select SCSI_SPI_ATTRS + select NVRAM + ---help--- + If you have an Atari with built-in NCR5380 SCSI controller (TT, + Falcon, ...) say Y to get it supported. Of course also, if you have + a compatible SCSI controller (e.g. for Medusa). + + To compile this driver as a module, choose M here: the + module will be called atari_scsi. + + This driver supports both styles of NCR integration into the + system: the TT style (separate DMA), and the Falcon style (via + ST-DMA, replacing ACSI). It does NOT support other schemes, like + in the Hades (without DMA). + +config MAC_SCSI + tristate "Macintosh NCR5380 SCSI" + depends on MAC && SCSI + select SCSI_SPI_ATTRS + help + This is the NCR 5380 SCSI controller included on most of the 68030 + based Macintoshes. If you have one of these say Y and read the + SCSI-HOWTO, available from + . + +config SCSI_MAC_ESP + tristate "Macintosh NCR53c9[46] SCSI" + depends on MAC && SCSI + select SCSI_SPI_ATTRS + help + This is the NCR 53c9x SCSI controller found on most of the 68040 + based Macintoshes. + + To compile this driver as a module, choose M here: the module + will be called mac_esp. 
+ +config MVME147_SCSI + bool "WD33C93 SCSI driver for MVME147" + depends on MVME147 && SCSI=y + select SCSI_SPI_ATTRS + help + Support for the on-board SCSI controller on the Motorola MVME147 + single-board computer. + +config MVME16x_SCSI + tristate "NCR53C710 SCSI driver for MVME16x" + depends on MVME16x && SCSI + select SCSI_SPI_ATTRS + help + The Motorola MVME162, 166, 167, 172 and 177 boards use the NCR53C710 + SCSI controller chip. Almost everyone using one of these boards + will want to say Y to this question. + +config BVME6000_SCSI + tristate "NCR53C710 SCSI driver for BVME6000" + depends on BVME6000 && SCSI + select SCSI_SPI_ATTRS + help + The BVME4000 and BVME6000 boards from BVM Ltd use the NCR53C710 + SCSI controller chip. Almost everyone using one of these boards + will want to say Y to this question. + +config SUN3_SCSI + tristate "Sun3 NCR5380 SCSI" + depends on SUN3 && SCSI + select SCSI_SPI_ATTRS + help + This option will enable support for the OBIO (onboard io) NCR5380 + SCSI controller found in the Sun 3/50 and 3/60, as well as for + "Sun3" type VME scsi controllers also based on the NCR5380. + General Linux information on the Sun 3 series (now discontinued) + is at . + +config SUN3X_ESP + bool "Sun3x ESP SCSI" + depends on SUN3X && SCSI=y + select SCSI_SPI_ATTRS + help + The ESP was an on-board SCSI controller used on Sun 3/80 + machines. Say Y here to compile in support for it. + +config SCSI_SUNESP + tristate "Sparc ESP Scsi Driver" + depends on SBUS && SCSI + select SCSI_SPI_ATTRS + help + This is the driver for the Sun ESP SCSI host adapter. The ESP + chipset is present in most SPARC SBUS-based computers and + supports the Emulex family of ESP SCSI chips (esp100, esp100A, + esp236, fas101, fas236) as well as the Qlogic fas366 SCSI chip. + + To compile this driver as a module, choose M here: the + module will be called sun_esp. + +config ZFCP + tristate "FCP host bus adapter driver for IBM eServer zSeries" + depends on S390 && QDIO && SCSI + depends on SCSI_FC_ATTRS + help + If you want to access SCSI devices attached to your IBM eServer + zSeries by means of Fibre Channel interfaces say Y. + For details please refer to the documentation provided by IBM at + + + This driver is also available as a module. This module will be + called zfcp. If you want to compile it as a module, say M here + and read . + +config SCSI_PMCRAID + tristate "PMC SIERRA Linux MaxRAID adapter support" + depends on PCI && SCSI && NET + select SGL_ALLOC + ---help--- + This driver supports the PMC SIERRA MaxRAID adapters. + +config SCSI_PM8001 + tristate "PMC-Sierra SPC 8001 SAS/SATA Based Host Adapter driver" + depends on PCI && SCSI + select SCSI_SAS_LIBSAS + help + This driver supports PMC-Sierra PCIE SAS/SATA 8x6G SPC 8001 chip + based host adapters. + +config SCSI_BFA_FC + tristate "Brocade BFA Fibre Channel Support" + depends on PCI && SCSI + depends on SCSI_FC_ATTRS + help + This bfa driver supports all Brocade PCIe FC/FCOE host adapters. + + To compile this driver as a module, choose M here. The module will + be called bfa. + +config SCSI_VIRTIO + tristate "virtio-scsi support" + depends on VIRTIO + help + This is the virtual HBA driver for virtio. If the kernel will + be used in a virtual machine, say Y or M. 
+ +source "drivers/scsi/csiostor/Kconfig" + +endif # SCSI_LOWLEVEL + +source "drivers/scsi/pcmcia/Kconfig" + +source "drivers/scsi/device_handler/Kconfig" + +source "drivers/scsi/osd/Kconfig" + +endmenu diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile new file mode 100644 index 0000000..56c9403 --- /dev/null +++ b/drivers/scsi/Makefile @@ -0,0 +1,205 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for linux/drivers/scsi +# +# 30 May 2000, Christoph Hellwig +# Rewritten to use lists instead of if-statements. +# +# 20 Sep 2000, Torben Mathiasen +# Changed link order to reflect new scsi initialization. +# +# *!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*! +# The link order must be, SCSI Core, SCSI HBA drivers, and +# lastly SCSI peripheral drivers (disk/tape/cdrom/etc.) to +# satisfy certain initialization assumptions in the SCSI layer. +# *!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*! + + +CFLAGS_aha152x.o = -DAHA152X_STAT -DAUTOCONF +CFLAGS_gdth.o = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ -DGDTH_STATISTICS + +obj-$(CONFIG_PCMCIA) += pcmcia/ + +obj-$(CONFIG_SCSI) += scsi_mod.o + +obj-$(CONFIG_RAID_ATTRS) += raid_class.o + +# --- NOTE ORDERING HERE --- +# For kernel non-modular link, transport attributes need to +# be initialised before drivers +# -------------------------- +obj-$(CONFIG_SCSI_SPI_ATTRS) += scsi_transport_spi.o +obj-$(CONFIG_SCSI_FC_ATTRS) += scsi_transport_fc.o +obj-$(CONFIG_SCSI_ISCSI_ATTRS) += scsi_transport_iscsi.o +obj-$(CONFIG_SCSI_SAS_ATTRS) += scsi_transport_sas.o +obj-$(CONFIG_SCSI_SAS_LIBSAS) += libsas/ +obj-$(CONFIG_SCSI_SRP_ATTRS) += scsi_transport_srp.o +obj-$(CONFIG_SCSI_DH) += device_handler/ + +obj-$(CONFIG_LIBFC) += libfc/ +obj-$(CONFIG_LIBFCOE) += fcoe/ +obj-$(CONFIG_FCOE) += fcoe/ +obj-$(CONFIG_FCOE_FNIC) += fnic/ +obj-$(CONFIG_SCSI_SNIC) += snic/ +obj-$(CONFIG_SCSI_BNX2X_FCOE) += libfc/ fcoe/ bnx2fc/ +obj-$(CONFIG_QEDF) += qedf/ +obj-$(CONFIG_ISCSI_TCP) += libiscsi.o libiscsi_tcp.o iscsi_tcp.o +obj-$(CONFIG_INFINIBAND_ISER) += libiscsi.o +obj-$(CONFIG_ISCSI_BOOT_SYSFS) += iscsi_boot_sysfs.o +obj-$(CONFIG_SCSI_A4000T) += 53c700.o a4000t.o +obj-$(CONFIG_SCSI_ZORRO7XX) += 53c700.o zorro7xx.o +obj-$(CONFIG_A3000_SCSI) += a3000.o wd33c93.o +obj-$(CONFIG_A2091_SCSI) += a2091.o wd33c93.o +obj-$(CONFIG_GVP11_SCSI) += gvp11.o wd33c93.o +obj-$(CONFIG_MVME147_SCSI) += mvme147.o wd33c93.o +obj-$(CONFIG_SGIWD93_SCSI) += sgiwd93.o wd33c93.o +obj-$(CONFIG_ATARI_SCSI) += atari_scsi.o +obj-$(CONFIG_MAC_SCSI) += mac_scsi.o +obj-$(CONFIG_SCSI_MAC_ESP) += esp_scsi.o mac_esp.o +obj-$(CONFIG_SUN3_SCSI) += sun3_scsi.o sun3_scsi_vme.o +obj-$(CONFIG_MVME16x_SCSI) += 53c700.o mvme16x_scsi.o +obj-$(CONFIG_BVME6000_SCSI) += 53c700.o bvme6000_scsi.o +obj-$(CONFIG_SCSI_SIM710) += 53c700.o sim710.o +obj-$(CONFIG_SCSI_ADVANSYS) += advansys.o +obj-$(CONFIG_SCSI_BUSLOGIC) += BusLogic.o +obj-$(CONFIG_SCSI_DPT_I2O) += dpt_i2o.o +obj-$(CONFIG_SCSI_ARCMSR) += arcmsr/ +obj-$(CONFIG_SCSI_AHA152X) += aha152x.o +obj-$(CONFIG_SCSI_AHA1542) += aha1542.o +obj-$(CONFIG_SCSI_AHA1740) += aha1740.o +obj-$(CONFIG_SCSI_AIC7XXX) += aic7xxx/ +obj-$(CONFIG_SCSI_AIC79XX) += aic7xxx/ +obj-$(CONFIG_SCSI_AACRAID) += aacraid/ +obj-$(CONFIG_SCSI_AIC94XX) += aic94xx/ +obj-$(CONFIG_SCSI_PM8001) += pm8001/ +obj-$(CONFIG_SCSI_ISCI) += isci/ +obj-$(CONFIG_SCSI_IPS) += ips.o +obj-$(CONFIG_SCSI_GENERIC_NCR5380) += g_NCR5380.o +obj-$(CONFIG_SCSI_NCR_D700) += 53c700.o NCR_D700.o +obj-$(CONFIG_SCSI_NCR_Q720) += NCR_Q720_mod.o +obj-$(CONFIG_SCSI_QLOGIC_FAS) += qlogicfas408.o 
qlogicfas.o +obj-$(CONFIG_PCMCIA_QLOGIC) += qlogicfas408.o +obj-$(CONFIG_SCSI_QLOGIC_1280) += qla1280.o +obj-$(CONFIG_SCSI_QLA_FC) += qla2xxx/ +obj-$(CONFIG_SCSI_QLA_ISCSI) += libiscsi.o qla4xxx/ +obj-$(CONFIG_SCSI_LPFC) += lpfc/ +obj-$(CONFIG_SCSI_BFA_FC) += bfa/ +obj-$(CONFIG_SCSI_CHELSIO_FCOE) += csiostor/ +obj-$(CONFIG_SCSI_DMX3191D) += dmx3191d.o +obj-$(CONFIG_SCSI_HPSA) += hpsa.o +obj-$(CONFIG_SCSI_SMARTPQI) += smartpqi/ +obj-$(CONFIG_SCSI_SYM53C8XX_2) += sym53c8xx_2/ +obj-$(CONFIG_SCSI_ZALON) += zalon7xx.o +obj-$(CONFIG_SCSI_DC395x) += dc395x.o +obj-$(CONFIG_SCSI_AM53C974) += esp_scsi.o am53c974.o +obj-$(CONFIG_CXLFLASH) += cxlflash/ +obj-$(CONFIG_MEGARAID_LEGACY) += megaraid.o +obj-$(CONFIG_MEGARAID_NEWGEN) += megaraid/ +obj-$(CONFIG_MEGARAID_SAS) += megaraid/ +obj-$(CONFIG_SCSI_MPT3SAS) += mpt3sas/ +obj-$(CONFIG_SCSI_UFSHCD) += ufs/ +obj-$(CONFIG_SCSI_ACARD) += atp870u.o +obj-$(CONFIG_SCSI_SUNESP) += esp_scsi.o sun_esp.o +obj-$(CONFIG_SCSI_GDTH) += gdth.o +obj-$(CONFIG_SCSI_INITIO) += initio.o +obj-$(CONFIG_SCSI_INIA100) += a100u2w.o +obj-$(CONFIG_SCSI_QLOGICPTI) += qlogicpti.o +obj-$(CONFIG_SCSI_MESH) += mesh.o +obj-$(CONFIG_SCSI_MAC53C94) += mac53c94.o +obj-$(CONFIG_BLK_DEV_3W_XXXX_RAID) += 3w-xxxx.o +obj-$(CONFIG_SCSI_3W_9XXX) += 3w-9xxx.o +obj-$(CONFIG_SCSI_3W_SAS) += 3w-sas.o +obj-$(CONFIG_SCSI_PPA) += ppa.o +obj-$(CONFIG_SCSI_IMM) += imm.o +obj-$(CONFIG_JAZZ_ESP) += esp_scsi.o jazz_esp.o +obj-$(CONFIG_SUN3X_ESP) += esp_scsi.o sun3x_esp.o +obj-$(CONFIG_SCSI_LASI700) += 53c700.o lasi700.o +obj-$(CONFIG_SCSI_SNI_53C710) += 53c700.o sni_53c710.o +obj-$(CONFIG_SCSI_NSP32) += nsp32.o +obj-$(CONFIG_SCSI_IPR) += ipr.o +obj-$(CONFIG_SCSI_IBMVSCSI) += ibmvscsi/ +obj-$(CONFIG_SCSI_IBMVSCSIS) += ibmvscsi_tgt/ +obj-$(CONFIG_SCSI_IBMVFC) += ibmvscsi/ +obj-$(CONFIG_SCSI_HPTIOP) += hptiop.o +obj-$(CONFIG_SCSI_STEX) += stex.o +obj-$(CONFIG_SCSI_MVSAS) += mvsas/ +obj-$(CONFIG_SCSI_MVUMI) += mvumi.o +obj-$(CONFIG_PS3_ROM) += ps3rom.o +obj-$(CONFIG_SCSI_CXGB3_ISCSI) += libiscsi.o libiscsi_tcp.o cxgbi/ +obj-$(CONFIG_SCSI_CXGB4_ISCSI) += libiscsi.o libiscsi_tcp.o cxgbi/ +obj-$(CONFIG_SCSI_BNX2_ISCSI) += libiscsi.o bnx2i/ +obj-$(CONFIG_QEDI) += libiscsi.o qedi/ +obj-$(CONFIG_BE2ISCSI) += libiscsi.o be2iscsi/ +obj-$(CONFIG_SCSI_ESAS2R) += esas2r/ +obj-$(CONFIG_SCSI_PMCRAID) += pmcraid.o +obj-$(CONFIG_SCSI_VIRTIO) += virtio_scsi.o +obj-$(CONFIG_VMWARE_PVSCSI) += vmw_pvscsi.o +obj-$(CONFIG_XEN_SCSI_FRONTEND) += xen-scsifront.o +obj-$(CONFIG_HYPERV_STORAGE) += hv_storvsc.o +obj-$(CONFIG_SCSI_WD719X) += wd719x.o + +obj-$(CONFIG_ARM) += arm/ + +obj-$(CONFIG_CHR_DEV_ST) += st.o +obj-$(CONFIG_CHR_DEV_OSST) += osst.o +obj-$(CONFIG_BLK_DEV_SD) += sd_mod.o +obj-$(CONFIG_BLK_DEV_SR) += sr_mod.o +obj-$(CONFIG_CHR_DEV_SG) += sg.o +obj-$(CONFIG_CHR_DEV_SCH) += ch.o +obj-$(CONFIG_SCSI_ENCLOSURE) += ses.o + +obj-$(CONFIG_SCSI_OSD_INITIATOR) += osd/ +obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/ + +# This goes last, so that "real" scsi devices probe earlier +obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o +scsi_mod-y += scsi.o hosts.o scsi_ioctl.o \ + scsicam.o scsi_error.o scsi_lib.o +scsi_mod-y += scsi_common.o +scsi_mod-$(CONFIG_SCSI_CONSTANTS) += constants.o +scsi_mod-$(CONFIG_SCSI_DMA) += scsi_lib_dma.o +scsi_mod-y += scsi_scan.o scsi_sysfs.o scsi_devinfo.o +scsi_mod-$(CONFIG_SCSI_NETLINK) += scsi_netlink.o +scsi_mod-$(CONFIG_SYSCTL) += scsi_sysctl.o +scsi_mod-$(CONFIG_SCSI_PROC_FS) += scsi_proc.o +scsi_mod-$(CONFIG_BLK_DEBUG_FS) += scsi_debugfs.o +scsi_mod-y += scsi_trace.o scsi_logging.o +scsi_mod-$(CONFIG_PM) += 
scsi_pm.o +scsi_mod-$(CONFIG_SCSI_DH) += scsi_dh.o + +hv_storvsc-y := storvsc_drv.o + +sd_mod-objs := sd.o +sd_mod-$(CONFIG_BLK_DEV_INTEGRITY) += sd_dif.o +sd_mod-$(CONFIG_BLK_DEV_ZONED) += sd_zbc.o + +sr_mod-objs := sr.o sr_ioctl.o sr_vendor.o +ncr53c8xx-flags-$(CONFIG_SCSI_ZALON) \ + := -DCONFIG_NCR53C8XX_PREFETCH -DSCSI_NCR_BIG_ENDIAN \ + -DCONFIG_SCSI_NCR53C8XX_NO_WORD_TRANSFERS +CFLAGS_ncr53c8xx.o := $(ncr53c8xx-flags-y) $(ncr53c8xx-flags-m) +zalon7xx-objs := zalon.o ncr53c8xx.o +NCR_Q720_mod-objs := NCR_Q720.o ncr53c8xx.o + +# Files generated that shall be removed upon make clean +clean-files := 53c700_d.h 53c700_u.h scsi_devinfo_tbl.c + +$(obj)/53c700.o $(MODVERDIR)/$(obj)/53c700.ver: $(obj)/53c700_d.h + +$(obj)/scsi_sysfs.o: $(obj)/scsi_devinfo_tbl.c + +quiet_cmd_bflags = GEN $@ + cmd_bflags = sed -n 's/.*BLIST_\([A-Z0-9_]*\) *.*/BLIST_FLAG_NAME(\1),/p' $< > $@ + +$(obj)/scsi_devinfo_tbl.c: include/scsi/scsi_devinfo.h + $(call if_changed,bflags) + +# If you want to play with the firmware, uncomment +# GENERATE_FIRMWARE := 1 + +ifdef GENERATE_FIRMWARE + +$(obj)/53c700_d.h: $(src)/53c700.scr $(src)/script_asm.pl + $(PERL) -s $(src)/script_asm.pl -ncr7x0_family $@ $(@:_d.h=_u.h) < $< + +endif diff --git a/drivers/scsi/cxgbi/Kconfig b/drivers/scsi/cxgbi/Kconfig new file mode 100644 index 0000000..17eb5d5 --- /dev/null +++ b/drivers/scsi/cxgbi/Kconfig @@ -0,0 +1,2 @@ +source "drivers/scsi/cxgbi/cxgb3i/Kconfig" +source "drivers/scsi/cxgbi/cxgb4i/Kconfig" diff --git a/drivers/scsi/cxgbi/Makefile b/drivers/scsi/cxgbi/Makefile new file mode 100644 index 0000000..a73781a --- /dev/null +++ b/drivers/scsi/cxgbi/Makefile @@ -0,0 +1,4 @@ +ccflags-y += -Idrivers/net/ethernet/chelsio/libcxgb + +obj-$(CONFIG_SCSI_CXGB3_ISCSI) += libcxgbi.o cxgb3i/ +obj-$(CONFIG_SCSI_CXGB4_ISCSI) += libcxgbi.o cxgb4i/ diff --git a/drivers/scsi/cxgbi/cxgb3i/Kbuild b/drivers/scsi/cxgbi/cxgb3i/Kbuild new file mode 100644 index 0000000..663c52e --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb3i/Kbuild @@ -0,0 +1,4 @@ +ccflags-y += -I$(srctree)/drivers/net/ethernet/chelsio/cxgb3 +ccflags-y += -I$(srctree)/drivers/net/ethernet/chelsio/libcxgb + +obj-$(CONFIG_SCSI_CXGB3_ISCSI) += cxgb3i.o diff --git a/drivers/scsi/cxgbi/cxgb3i/Kconfig b/drivers/scsi/cxgbi/cxgb3i/Kconfig new file mode 100644 index 0000000..f68c871 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb3i/Kconfig @@ -0,0 +1,11 @@ +config SCSI_CXGB3_ISCSI + tristate "Chelsio T3 iSCSI support" + depends on PCI && INET && (IPV6 || IPV6=n) + select NETDEVICES + select ETHERNET + select NET_VENDOR_CHELSIO + select CHELSIO_T3 + select CHELSIO_LIB + select SCSI_ISCSI_ATTRS + ---help--- + This driver supports iSCSI offload for the Chelsio T3 devices. diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c new file mode 100644 index 0000000..bf07735 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c @@ -0,0 +1,1405 @@ +/* + * cxgb3i_offload.c: Chelsio S3xx iscsi offloaded tcp connection management + * + * Copyright (C) 2003-2015 Chelsio Communications. All rights reserved. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this + * release for licensing terms and conditions. 
+ * + * Written by: Dimitris Michailidis (dm@chelsio.com) + * Karen Xie (kxie@chelsio.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include + +#include "common.h" +#include "t3_cpl.h" +#include "t3cdev.h" +#include "cxgb3_defs.h" +#include "cxgb3_ctl_defs.h" +#include "cxgb3_offload.h" +#include "firmware_exports.h" +#include "cxgb3i.h" + +static unsigned int dbg_level; +#include "../libcxgbi.h" + +#define DRV_MODULE_NAME "cxgb3i" +#define DRV_MODULE_DESC "Chelsio T3 iSCSI Driver" +#define DRV_MODULE_VERSION "2.0.1-ko" +#define DRV_MODULE_RELDATE "Apr. 2015" + +static char version[] = + DRV_MODULE_DESC " " DRV_MODULE_NAME + " v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; + +MODULE_AUTHOR("Chelsio Communications, Inc."); +MODULE_DESCRIPTION(DRV_MODULE_DESC); +MODULE_VERSION(DRV_MODULE_VERSION); +MODULE_LICENSE("GPL"); + +module_param(dbg_level, uint, 0644); +MODULE_PARM_DESC(dbg_level, "debug flag (default=0)"); + +static int cxgb3i_rcv_win = 256 * 1024; +module_param(cxgb3i_rcv_win, int, 0644); +MODULE_PARM_DESC(cxgb3i_rcv_win, "TCP receive window in bytes (default=256KB)"); + +static int cxgb3i_snd_win = 128 * 1024; +module_param(cxgb3i_snd_win, int, 0644); +MODULE_PARM_DESC(cxgb3i_snd_win, "TCP send window in bytes (default=128KB)"); + +static int cxgb3i_rx_credit_thres = 10 * 1024; +module_param(cxgb3i_rx_credit_thres, int, 0644); +MODULE_PARM_DESC(cxgb3i_rx_credit_thres, + "RX credits return threshold in bytes (default=10KB)"); + +static unsigned int cxgb3i_max_connect = 8 * 1024; +module_param(cxgb3i_max_connect, uint, 0644); +MODULE_PARM_DESC(cxgb3i_max_connect, "Max. # of connections (default=8092)"); + +static unsigned int cxgb3i_sport_base = 20000; +module_param(cxgb3i_sport_base, uint, 0644); +MODULE_PARM_DESC(cxgb3i_sport_base, "starting port number (default=20000)"); + +static void cxgb3i_dev_open(struct t3cdev *); +static void cxgb3i_dev_close(struct t3cdev *); +static void cxgb3i_dev_event_handler(struct t3cdev *, u32, u32); + +static struct cxgb3_client t3_client = { + .name = DRV_MODULE_NAME, + .handlers = cxgb3i_cpl_handlers, + .add = cxgb3i_dev_open, + .remove = cxgb3i_dev_close, + .event_handler = cxgb3i_dev_event_handler, +}; + +static struct scsi_host_template cxgb3i_host_template = { + .module = THIS_MODULE, + .name = DRV_MODULE_NAME, + .proc_name = DRV_MODULE_NAME, + .can_queue = CXGB3I_SCSI_HOST_QDEPTH, + .queuecommand = iscsi_queuecommand, + .change_queue_depth = scsi_change_queue_depth, + .sg_tablesize = SG_ALL, + .max_sectors = 0xFFFF, + .cmd_per_lun = ISCSI_DEF_CMD_PER_LUN, + .eh_timed_out = iscsi_eh_cmd_timed_out, + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler = iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_recover_target, + .target_alloc = iscsi_target_alloc, + .use_clustering = DISABLE_CLUSTERING, + .this_id = -1, + .track_queue_depth = 1, +}; + +static struct iscsi_transport cxgb3i_iscsi_transport = { + .owner = THIS_MODULE, + .name = DRV_MODULE_NAME, + /* owner and name should be set already */ + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST + | CAP_DATADGST | CAP_DIGEST_OFFLOAD | + CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO, + .attr_is_visible = cxgbi_attr_is_visible, + .get_host_param = cxgbi_get_host_param, + .set_host_param = cxgbi_set_host_param, + /* session management */ + .create_session = cxgbi_create_session, + .destroy_session = cxgbi_destroy_session, + .get_session_param = iscsi_session_get_param, + /* connection management */ + .create_conn = 
cxgbi_create_conn, + .bind_conn = cxgbi_bind_conn, + .destroy_conn = iscsi_tcp_conn_teardown, + .start_conn = iscsi_conn_start, + .stop_conn = iscsi_conn_stop, + .get_conn_param = iscsi_conn_get_param, + .set_param = cxgbi_set_conn_param, + .get_stats = cxgbi_get_conn_stats, + /* pdu xmit req from user space */ + .send_pdu = iscsi_conn_send_pdu, + /* task */ + .init_task = iscsi_tcp_task_init, + .xmit_task = iscsi_tcp_task_xmit, + .cleanup_task = cxgbi_cleanup_task, + /* pdu */ + .alloc_pdu = cxgbi_conn_alloc_pdu, + .init_pdu = cxgbi_conn_init_pdu, + .xmit_pdu = cxgbi_conn_xmit_pdu, + .parse_pdu_itt = cxgbi_parse_pdu_itt, + /* TCP connect/disconnect */ + .get_ep_param = cxgbi_get_ep_param, + .ep_connect = cxgbi_ep_connect, + .ep_poll = cxgbi_ep_poll, + .ep_disconnect = cxgbi_ep_disconnect, + /* Error recovery timeout call */ + .session_recovery_timedout = iscsi_session_recovery_timedout, +}; + +static struct scsi_transport_template *cxgb3i_stt; + +/* + * CPL (Chelsio Protocol Language) defines a message passing interface between + * the host driver and Chelsio asic. + * The section below implments CPLs that related to iscsi tcp connection + * open/close/abort and data send/receive. + */ + +static int push_tx_frames(struct cxgbi_sock *csk, int req_completion); + +static void send_act_open_req(struct cxgbi_sock *csk, struct sk_buff *skb, + const struct l2t_entry *e) +{ + unsigned int wscale = cxgbi_sock_compute_wscale(csk->rcv_win); + struct cpl_act_open_req *req = (struct cpl_act_open_req *)skb->head; + + skb->priority = CPL_PRIORITY_SETUP; + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, csk->atid)); + req->local_port = csk->saddr.sin_port; + req->peer_port = csk->daddr.sin_port; + req->local_ip = csk->saddr.sin_addr.s_addr; + req->peer_ip = csk->daddr.sin_addr.s_addr; + + req->opt0h = htonl(V_KEEP_ALIVE(1) | F_TCAM_BYPASS | + V_WND_SCALE(wscale) | V_MSS_IDX(csk->mss_idx) | + V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); + req->opt0l = htonl(V_ULP_MODE(ULP2_MODE_ISCSI) | + V_RCV_BUFSIZ(csk->rcv_win >> 10)); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, %pI4:%u-%pI4:%u, %u,%u,%u.\n", + csk, csk->state, csk->flags, csk->atid, + &req->local_ip, ntohs(req->local_port), + &req->peer_ip, ntohs(req->peer_port), + csk->mss_idx, e->idx, e->smt_idx); + + l2t_send(csk->cdev->lldev, skb, csk->l2t); +} + +static inline void act_open_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + cxgbi_sock_act_open_req_arp_failure(NULL, skb); +} + +/* + * CPL connection close request: host -> + * + * Close a connection by sending a CPL_CLOSE_CON_REQ message and queue it to + * the write queue (i.e., after any unsent txt data). + */ +static void send_close_req(struct cxgbi_sock *csk) +{ + struct sk_buff *skb = csk->cpl_close; + struct cpl_close_con_req *req = (struct cpl_close_con_req *)skb->head; + unsigned int tid = csk->tid; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + csk->cpl_close = NULL; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + req->rsvd = htonl(csk->write_seq); + + cxgbi_sock_skb_entail(csk, skb); + if (csk->state >= CTP_ESTABLISHED) + push_tx_frames(csk, 1); +} + +/* + * CPL connection abort request: host -> + * + * Send an ABORT_REQ message. 
Makes sure we do not send multiple ABORT_REQs + * for the same connection and also that we do not try to send a message + * after the connection has closed. + */ +static void abort_arp_failure(struct t3cdev *tdev, struct sk_buff *skb) +{ + struct cpl_abort_req *req = cplhdr(skb); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "t3dev 0x%p, tid %u, skb 0x%p.\n", + tdev, GET_TID(req), skb); + req->cmd = CPL_ABORT_NO_RST; + cxgb3_ofld_send(tdev, skb); +} + +static void send_abort_req(struct cxgbi_sock *csk) +{ + struct sk_buff *skb = csk->cpl_abort_req; + struct cpl_abort_req *req; + + if (unlikely(csk->state == CTP_ABORTING || !skb)) + return; + cxgbi_sock_set_state(csk, CTP_ABORTING); + cxgbi_sock_set_flag(csk, CTPF_ABORT_RPL_PENDING); + /* Purge the send queue so we don't send anything after an abort. */ + cxgbi_sock_purge_write_queue(csk); + + csk->cpl_abort_req = NULL; + req = (struct cpl_abort_req *)skb->head; + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, abort_arp_failure); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(csk->tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, csk->tid)); + req->rsvd0 = htonl(csk->snd_nxt); + req->rsvd1 = !cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT); + req->cmd = CPL_ABORT_SEND_RST; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, snd_nxt %u, 0x%x.\n", + csk, csk->state, csk->flags, csk->tid, csk->snd_nxt, + req->rsvd1); + + l2t_send(csk->cdev->lldev, skb, csk->l2t); +} + +/* + * CPL connection abort reply: host -> + * + * Send an ABORT_RPL message in response of the ABORT_REQ received. + */ +static void send_abort_rpl(struct cxgbi_sock *csk, int rst_status) +{ + struct sk_buff *skb = csk->cpl_abort_rpl; + struct cpl_abort_rpl *rpl = (struct cpl_abort_rpl *)skb->head; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, status %d.\n", + csk, csk->state, csk->flags, csk->tid, rst_status); + + csk->cpl_abort_rpl = NULL; + skb->priority = CPL_PRIORITY_DATA; + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(csk->tid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, csk->tid)); + rpl->cmd = rst_status; + cxgb3_ofld_send(csk->cdev->lldev, skb); +} + +/* + * CPL connection rx data ack: host -> + * Send RX credits through an RX_DATA_ACK CPL message. Returns the number of + * credits sent. + */ +static u32 send_rx_credits(struct cxgbi_sock *csk, u32 credits) +{ + struct sk_buff *skb; + struct cpl_rx_data_ack *req; + u32 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx,%u, credit %u, dack %u.\n", + csk, csk->state, csk->flags, csk->tid, credits, dack); + + skb = alloc_wr(sizeof(*req), 0, GFP_ATOMIC); + if (!skb) { + pr_info("csk 0x%p, credit %u, OOM.\n", csk, credits); + return 0; + } + req = (struct cpl_rx_data_ack *)skb->head; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, csk->tid)); + req->credit_dack = htonl(F_RX_DACK_CHANGE | V_RX_DACK_MODE(1) | + V_RX_CREDITS(credits)); + skb->priority = CPL_PRIORITY_ACK; + cxgb3_ofld_send(csk->cdev->lldev, skb); + return credits; +} + +/* + * CPL connection tx data: host -> + * + * Send iscsi PDU via TX_DATA CPL message. Returns the number of + * credits sent. 
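send_rx_credits() above returns receive-window credits to the T3 with a CPL_RX_DATA_ACK. Purely as an illustration of the policy its callers apply, and not the exact libcxgbi code, here is a minimal standalone sketch of the thresholding idea behind cxgb3i_rx_credit_thres: credits accumulate as the ULP consumes data, and a CPL is only worth sending once the threshold is crossed.

#include <stdio.h>

/* Standalone sketch: 'copied' models csk->rcv_nxt - csk->rcv_wup, i.e. how
 * many bytes the ULP has consumed since credits were last returned.  The
 * exact trigger used by libcxgbi differs in detail; this only shows the
 * thresholding idea. */
static unsigned int credits_to_return(unsigned int copied,
				      unsigned int rcv_win,
				      unsigned int thres)
{
	if (copied < thres && copied < rcv_win / 2)
		return 0;		/* not worth a CPL_RX_DATA_ACK yet */
	return copied;			/* caller advances rcv_wup by this much */
}

int main(void)
{
	printf("%u\n", credits_to_return(4096, 256 * 1024, 10 * 1024));	/* 0 */
	printf("%u\n", credits_to_return(12288, 256 * 1024, 10 * 1024));	/* 12288 */
	return 0;
}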
+ * Each TX_DATA consumes work request credit (wrs), so we need to keep track of + * how many we've used so far and how many are pending (i.e., yet ack'ed by T3). + */ + +static unsigned int wrlen __read_mostly; +static unsigned int skb_wrs[SKB_WR_LIST_SIZE] __read_mostly; + +static void init_wr_tab(unsigned int wr_len) +{ + int i; + + if (skb_wrs[1]) /* already initialized */ + return; + for (i = 1; i < SKB_WR_LIST_SIZE; i++) { + int sgl_len = (3 * i) / 2 + (i & 1); + + sgl_len += 3; + skb_wrs[i] = (sgl_len <= wr_len + ? 1 : 1 + (sgl_len - 2) / (wr_len - 1)); + } + wrlen = wr_len * 8; +} + +static inline void make_tx_data_wr(struct cxgbi_sock *csk, struct sk_buff *skb, + int len, int req_completion) +{ + struct tx_data_wr *req; + struct l2t_entry *l2t = csk->l2t; + + skb_reset_transport_header(skb); + req = __skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA) | + (req_completion ? F_WR_COMPL : 0)); + req->wr_lo = htonl(V_WR_TID(csk->tid)); + /* len includes the length of any HW ULP additions */ + req->len = htonl(len); + /* V_TX_ULP_SUBMODE sets both the mode and submode */ + req->flags = htonl(V_TX_ULP_SUBMODE(cxgbi_skcb_ulp_mode(skb)) | + V_TX_SHOVE((skb_peek(&csk->write_queue) ? 0 : 1))); + req->sndseq = htonl(csk->snd_nxt); + req->param = htonl(V_TX_PORT(l2t->smt_idx)); + + if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) { + req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | + V_TX_CPU_IDX(csk->rss_qid)); + /* sendbuffer is in units of 32KB. */ + req->param |= htonl(V_TX_SNDBUF(csk->snd_win >> 15)); + cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT); + } +} + +/** + * push_tx_frames -- start transmit + * @c3cn: the offloaded connection + * @req_completion: request wr_ack or not + * + * Prepends TX_DATA_WR or CPL_CLOSE_CON_REQ headers to buffers waiting in a + * connection's send queue and sends them on to T3. Must be called with the + * connection's lock held. Returns the amount of send buffer space that was + * freed as a result of sending queued data to T3. 
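The skb_wrs[] table that init_wr_tab() above precomputes can be restated as a standalone function, which may be easier to read than the table-filling loop. It mirrors the same arithmetic (three header flits for the tx_data_wr plus the SGL flits, overflow spilling into continuation WRs of wr_len - 1 usable flits); the wr_len value in main() is only an example, the driver obtains the real one via the GET_WR_LEN ioctl.

#include <stdio.h>

/* Same arithmetic as init_wr_tab(): an skb whose data spans 'frags' SGL
 * entries needs (3*frags)/2 + (frags & 1) flits of SGL plus 3 flits of
 * tx_data_wr header.  If that fits in a single WR of 'wr_len' flits it
 * costs one WR, otherwise the overflow spills into continuation WRs that
 * carry wr_len - 1 flits each. */
static int wrs_needed(int frags, int wr_len)
{
	int sgl_len = (3 * frags) / 2 + (frags & 1) + 3;

	return sgl_len <= wr_len ? 1 : 1 + (sgl_len - 2) / (wr_len - 1);
}

int main(void)
{
	int wr_len = 9;		/* example value only */
	int frags;

	for (frags = 1; frags < 8; frags++)
		printf("frags %d -> %d WR(s)\n", frags, wrs_needed(frags, wr_len));
	return 0;
}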
+ */ + +static void arp_failure_skb_discard(struct t3cdev *dev, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +static int push_tx_frames(struct cxgbi_sock *csk, int req_completion) +{ + int total_size = 0; + struct sk_buff *skb; + + if (unlikely(csk->state < CTP_ESTABLISHED || + csk->state == CTP_CLOSE_WAIT_1 || csk->state >= CTP_ABORTING)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, in closing state.\n", + csk, csk->state, csk->flags, csk->tid); + return 0; + } + + while (csk->wr_cred && (skb = skb_peek(&csk->write_queue)) != NULL) { + int len = skb->len; /* length before skb_push */ + int frags = skb_shinfo(skb)->nr_frags + (len != skb->data_len); + int wrs_needed = skb_wrs[frags]; + + if (wrs_needed > 1 && len + sizeof(struct tx_data_wr) <= wrlen) + wrs_needed = 1; + + WARN_ON(frags >= SKB_WR_LIST_SIZE || wrs_needed < 1); + + if (csk->wr_cred < wrs_needed) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p, skb len %u/%u, frag %u, wr %d<%u.\n", + csk, skb->len, skb->data_len, frags, + wrs_needed, csk->wr_cred); + break; + } + + __skb_unlink(skb, &csk->write_queue); + skb->priority = CPL_PRIORITY_DATA; + skb->csum = wrs_needed; /* remember this until the WR_ACK */ + csk->wr_cred -= wrs_needed; + csk->wr_una_cred += wrs_needed; + cxgbi_sock_enqueue_wr(csk, skb); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX, + "csk 0x%p, enqueue, skb len %u/%u, frag %u, wr %d, " + "left %u, unack %u.\n", + csk, skb->len, skb->data_len, frags, skb->csum, + csk->wr_cred, csk->wr_una_cred); + + if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR))) { + if ((req_completion && + csk->wr_una_cred == wrs_needed) || + csk->wr_una_cred >= csk->wr_max_cred / 2) { + req_completion = 1; + csk->wr_una_cred = 0; + } + len += cxgbi_ulp_extra_len(cxgbi_skcb_ulp_mode(skb)); + make_tx_data_wr(csk, skb, len, req_completion); + csk->snd_nxt += len; + cxgbi_skcb_clear_flag(skb, SKCBF_TX_NEED_HDR); + } + total_size += skb->truesize; + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX, + "csk 0x%p, tid 0x%x, send skb 0x%p.\n", + csk, csk->tid, skb); + set_arp_failure_handler(skb, arp_failure_skb_discard); + l2t_send(csk->cdev->lldev, skb, csk->l2t); + } + return total_size; +} + +/* + * Process a CPL_ACT_ESTABLISH message: -> host + * Updates connection state from an active establish CPL message. Runs with + * the connection lock held. 
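The point in push_tx_frames() above at which a work-request completion is requested can be read as a small predicate of its own: ask for a completion when the caller explicitly wants one and this skb is the only unacknowledged WR, or once unacknowledged credits reach half of wr_max_cred. A standalone restatement follows; the field names simply follow the cxgbi_sock members used above.

#include <stdbool.h>
#include <stdio.h>

/* wr_una_cred here is the value *after* the current skb's credits were
 * added, exactly as in push_tx_frames() above, so the equality test means
 * "this skb is the only WR outstanding". */
static bool want_completion(bool req_completion, int wrs_needed,
			    int wr_una_cred, int wr_max_cred)
{
	return (req_completion && wr_una_cred == wrs_needed) ||
	       wr_una_cred >= wr_max_cred / 2;
}

int main(void)
{
	printf("%d\n", want_completion(true, 2, 2, 64));	/* 1: explicit, only WR */
	printf("%d\n", want_completion(false, 2, 30, 64));	/* 0: below half window */
	printf("%d\n", want_completion(false, 2, 32, 64));	/* 1: half window used  */
	return 0;
}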
+ */ + +static inline void free_atid(struct cxgbi_sock *csk) +{ + if (cxgbi_sock_flag(csk, CTPF_HAS_ATID)) { + cxgb3_free_atid(csk->cdev->lldev, csk->atid); + cxgbi_sock_clear_flag(csk, CTPF_HAS_ATID); + cxgbi_sock_put(csk); + } +} + +static int do_act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cxgbi_sock *csk = ctx; + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "atid 0x%x,tid 0x%x, csk 0x%p,%u,0x%lx, isn %u.\n", + atid, atid, csk, csk->state, csk->flags, rcv_isn); + + cxgbi_sock_get(csk); + cxgbi_sock_set_flag(csk, CTPF_HAS_TID); + csk->tid = tid; + cxgb3_insert_tid(csk->cdev->lldev, &t3_client, csk, tid); + + free_atid(csk); + + csk->rss_qid = G_QNUM(ntohs(skb->csum)); + + spin_lock_bh(&csk->lock); + if (csk->retry_timer.function) { + del_timer(&csk->retry_timer); + csk->retry_timer.function = NULL; + } + + if (unlikely(csk->state != CTP_ACTIVE_OPEN)) + pr_info("csk 0x%p,%u,0x%lx,%u, got EST.\n", + csk, csk->state, csk->flags, csk->tid); + + csk->copied_seq = csk->rcv_wup = csk->rcv_nxt = rcv_isn; + if (csk->rcv_win > (M_RCV_BUFSIZ << 10)) + csk->rcv_wup -= csk->rcv_win - (M_RCV_BUFSIZ << 10); + + cxgbi_sock_established(csk, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + + if (unlikely(cxgbi_sock_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED))) + /* upper layer has requested closing */ + send_abort_req(csk); + else { + if (skb_queue_len(&csk->write_queue)) + push_tx_frames(csk, 1); + cxgbi_conn_tx_open(csk); + } + + spin_unlock_bh(&csk->lock); + __kfree_skb(skb); + return 0; +} + +/* + * Process a CPL_ACT_OPEN_RPL message: -> host + * Handle active open failures. 
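For readers new to the atid/tid split: do_act_establish() and free_atid() above are where the connection trades its software atid for the hardware-assigned tid. Each handle through which the hardware can still reference the socket holds one cxgbi_sock reference, taken and dropped here and in init_act_open() and release_offload_resources() later in this file. The summary below is an illustrative aside, not code from the patch.

/*
 * Illustrative handle/reference lifecycle (cxgb3i active open):
 *
 *   init_act_open():              t3_l2t_get()        -> cxgbi_sock_get()
 *                                 cxgb3_alloc_atid()  -> cxgbi_sock_get()
 *   do_act_establish():           cxgb3_insert_tid()  -> cxgbi_sock_get()
 *                                 free_atid()         -> cxgbi_sock_put()
 *   release_offload_resources():  l2t_put()           -> cxgbi_sock_put()
 *                                 free_atid() or
 *                                 cxgb3_remove_tid()  -> cxgbi_sock_put()
 */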
+ */ +static int act_open_rpl_status_to_errno(int status) +{ + switch (status) { + case CPL_ERR_CONN_RESET: + return -ECONNREFUSED; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +static void act_open_retry_timer(struct timer_list *t) +{ + struct cxgbi_sock *csk = from_timer(csk, t, retry_timer); + struct sk_buff *skb; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + skb = alloc_wr(sizeof(struct cpl_act_open_req), 0, GFP_ATOMIC); + if (!skb) + cxgbi_sock_fail_act_open(csk, -ENOMEM); + else { + skb->sk = (struct sock *)csk; + set_arp_failure_handler(skb, act_open_arp_failure); + send_act_open_req(csk, skb, csk->l2t); + } + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +} + +static int do_act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cxgbi_sock *csk = ctx; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + + pr_info("csk 0x%p,%u,0x%lx,%u, status %u, %pI4:%u-%pI4:%u.\n", + csk, csk->state, csk->flags, csk->atid, rpl->status, + &csk->saddr.sin_addr.s_addr, ntohs(csk->saddr.sin_port), + &csk->daddr.sin_addr.s_addr, ntohs(csk->daddr.sin_port)); + + if (rpl->status != CPL_ERR_TCAM_FULL && + rpl->status != CPL_ERR_CONN_EXIST && + rpl->status != CPL_ERR_ARP_MISS) + cxgb3_queue_tid_release(tdev, GET_TID(rpl)); + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + if (rpl->status == CPL_ERR_CONN_EXIST && + csk->retry_timer.function != act_open_retry_timer) { + csk->retry_timer.function = act_open_retry_timer; + mod_timer(&csk->retry_timer, jiffies + HZ / 2); + } else + cxgbi_sock_fail_act_open(csk, + act_open_rpl_status_to_errno(rpl->status)); + + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); + __kfree_skb(skb); + return 0; +} + +/* + * Process PEER_CLOSE CPL messages: -> host + * Handle peer FIN. + */ +static int do_peer_close(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct cxgbi_sock *csk = ctx; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + cxgbi_sock_rcv_peer_close(csk); + __kfree_skb(skb); + return 0; +} + +/* + * Process CLOSE_CONN_RPL CPL message: -> host + * Process a peer ACK to our FIN. + */ +static int do_close_con_rpl(struct t3cdev *cdev, struct sk_buff *skb, + void *ctx) +{ + struct cxgbi_sock *csk = ctx; + struct cpl_close_con_rpl *rpl = cplhdr(skb); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, snxt %u.\n", + csk, csk->state, csk->flags, csk->tid, ntohl(rpl->snd_nxt)); + + cxgbi_sock_rcv_close_conn_rpl(csk, ntohl(rpl->snd_nxt)); + __kfree_skb(skb); + return 0; +} + +/* + * Process ABORT_REQ_RSS CPL message: -> host + * Process abort requests. If we are waiting for an ABORT_RPL we ignore this + * request except that we need to reply to it. + */ + +static int abort_status_to_errno(struct cxgbi_sock *csk, int abort_reason, + int *need_rst) +{ + switch (abort_reason) { + case CPL_ERR_BAD_SYN: /* fall through */ + case CPL_ERR_CONN_RESET: + return csk->state > CTP_ESTABLISHED ? 
-EPIPE : -ECONNRESET; + case CPL_ERR_XMIT_TIMEDOUT: + case CPL_ERR_PERSIST_TIMEDOUT: + case CPL_ERR_FINWAIT2_TIMEDOUT: + case CPL_ERR_KEEPALIVE_TIMEDOUT: + return -ETIMEDOUT; + default: + return -EIO; + } +} + +static int do_abort_req(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + const struct cpl_abort_req_rss *req = cplhdr(skb); + struct cxgbi_sock *csk = ctx; + int rst_status = CPL_ABORT_NO_RST; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + if (req->status == CPL_ERR_RTX_NEG_ADVICE || + req->status == CPL_ERR_PERSIST_NEG_ADVICE) { + goto done; + } + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + if (!cxgbi_sock_flag(csk, CTPF_ABORT_REQ_RCVD)) { + cxgbi_sock_set_flag(csk, CTPF_ABORT_REQ_RCVD); + cxgbi_sock_set_state(csk, CTP_ABORTING); + goto out; + } + + cxgbi_sock_clear_flag(csk, CTPF_ABORT_REQ_RCVD); + send_abort_rpl(csk, rst_status); + + if (!cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) { + csk->err = abort_status_to_errno(csk, req->status, &rst_status); + cxgbi_sock_closed(csk); + } + +out: + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +done: + __kfree_skb(skb); + return 0; +} + +/* + * Process ABORT_RPL_RSS CPL message: -> host + * Process abort replies. We only process these messages if we anticipate + * them as the coordination between SW and HW in this area is somewhat lacking + * and sometimes we get ABORT_RPLs after we are done with the connection that + * originated the ABORT_REQ. + */ +static int do_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_abort_rpl_rss *rpl = cplhdr(skb); + struct cxgbi_sock *csk = ctx; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "status 0x%x, csk 0x%p, s %u, 0x%lx.\n", + rpl->status, csk, csk ? csk->state : 0, + csk ? csk->flags : 0UL); + /* + * Ignore replies to post-close aborts indicating that the abort was + * requested too late. These connections are terminated when we get + * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss + * arrives the TID is either no longer used or it has been recycled. + */ + if (rpl->status == CPL_ERR_ABORT_FAILED) + goto rel_skb; + /* + * Sometimes we've already closed the connection, e.g., a post-close + * abort races with ABORT_REQ_RSS, the latter frees the connection + * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, + * but FW turns the ABORT_REQ into a regular one and so we get + * ABORT_RPL_RSS with status 0 and no connection. + */ + if (csk) + cxgbi_sock_rcv_abort_rpl(csk); +rel_skb: + __kfree_skb(skb); + return 0; +} + +/* + * Process RX_ISCSI_HDR CPL message: -> host + * Handle received PDUs, the payload could be DDP'ed. If not, the payload + * follow after the bhs. 
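In do_abort_req() above, CPL_ERR_RTX_NEG_ADVICE and CPL_ERR_PERSIST_NEG_ADVICE are "negative advice": the hardware is only reporting retransmission or persist-timer trouble, not a real peer abort, so the handler frees the skb without changing connection state. That check could be factored into a helper like the sketch below; the helper name is illustrative and it assumes the t3_cpl.h definitions already included by this file.

/* Illustrative helper, not part of the patch: abort statuses that carry
 * transient congestion advice and must not tear the connection down. */
static inline bool is_neg_advice(unsigned int status)
{
	return status == CPL_ERR_RTX_NEG_ADVICE ||
	       status == CPL_ERR_PERSIST_NEG_ADVICE;
}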
+ */ +static int do_iscsi_hdr(struct t3cdev *t3dev, struct sk_buff *skb, void *ctx) +{ + struct cxgbi_sock *csk = ctx; + struct cpl_iscsi_hdr *hdr_cpl = cplhdr(skb); + struct cpl_iscsi_hdr_norss data_cpl; + struct cpl_rx_data_ddp_norss ddp_cpl; + unsigned int hdr_len, data_len, status; + unsigned int len; + int err; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx,%u, skb 0x%p,%u.\n", + csk, csk->state, csk->flags, csk->tid, skb, skb->len); + + spin_lock_bh(&csk->lock); + + if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + if (csk->state != CTP_ABORTING) + goto abort_conn; + else + goto discard; + } + + cxgbi_skcb_tcp_seq(skb) = ntohl(hdr_cpl->seq); + cxgbi_skcb_flags(skb) = 0; + + skb_reset_transport_header(skb); + __skb_pull(skb, sizeof(struct cpl_iscsi_hdr)); + + len = hdr_len = ntohs(hdr_cpl->len); + /* msg coalesce is off or not enough data received */ + if (skb->len <= hdr_len) { + pr_err("%s: tid %u, CPL_ISCSI_HDR, skb len %u < %u.\n", + csk->cdev->ports[csk->port_id]->name, csk->tid, + skb->len, hdr_len); + goto abort_conn; + } + cxgbi_skcb_set_flag(skb, SKCBF_RX_COALESCED); + + err = skb_copy_bits(skb, skb->len - sizeof(ddp_cpl), &ddp_cpl, + sizeof(ddp_cpl)); + if (err < 0) { + pr_err("%s: tid %u, copy cpl_ddp %u-%zu failed %d.\n", + csk->cdev->ports[csk->port_id]->name, csk->tid, + skb->len, sizeof(ddp_cpl), err); + goto abort_conn; + } + + cxgbi_skcb_set_flag(skb, SKCBF_RX_STATUS); + cxgbi_skcb_rx_pdulen(skb) = ntohs(ddp_cpl.len); + cxgbi_skcb_rx_ddigest(skb) = ntohl(ddp_cpl.ulp_crc); + status = ntohl(ddp_cpl.ddp_status); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, skb 0x%p,%u, pdulen %u, status 0x%x.\n", + csk, skb, skb->len, cxgbi_skcb_rx_pdulen(skb), status); + + if (status & (1 << CPL_RX_DDP_STATUS_HCRC_SHIFT)) + cxgbi_skcb_set_flag(skb, SKCBF_RX_HCRC_ERR); + if (status & (1 << CPL_RX_DDP_STATUS_DCRC_SHIFT)) + cxgbi_skcb_set_flag(skb, SKCBF_RX_DCRC_ERR); + if (status & (1 << CPL_RX_DDP_STATUS_PAD_SHIFT)) + cxgbi_skcb_set_flag(skb, SKCBF_RX_PAD_ERR); + + if (skb->len > (hdr_len + sizeof(ddp_cpl))) { + err = skb_copy_bits(skb, hdr_len, &data_cpl, sizeof(data_cpl)); + if (err < 0) { + pr_err("%s: tid %u, cp %zu/%u failed %d.\n", + csk->cdev->ports[csk->port_id]->name, + csk->tid, sizeof(data_cpl), skb->len, err); + goto abort_conn; + } + data_len = ntohs(data_cpl.len); + log_debug(1 << CXGBI_DBG_DDP | 1 << CXGBI_DBG_PDU_RX, + "skb 0x%p, pdu not ddp'ed %u/%u, status 0x%x.\n", + skb, data_len, cxgbi_skcb_rx_pdulen(skb), status); + len += sizeof(data_cpl) + data_len; + } else if (status & (1 << CPL_RX_DDP_STATUS_DDP_SHIFT)) + cxgbi_skcb_set_flag(skb, SKCBF_RX_DATA_DDPD); + + csk->rcv_nxt = ntohl(ddp_cpl.seq) + cxgbi_skcb_rx_pdulen(skb); + __pskb_trim(skb, len); + __skb_queue_tail(&csk->receive_queue, skb); + cxgbi_conn_pdu_ready(csk); + + spin_unlock_bh(&csk->lock); + return 0; + +abort_conn: + send_abort_req(csk); +discard: + spin_unlock_bh(&csk->lock); + __kfree_skb(skb); + return 0; +} + +/* + * Process TX_DATA_ACK CPL messages: -> host + * Process an acknowledgment of WR completion. Advance snd_una and send the + * next batch of work requests from the write queue. 
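do_iscsi_hdr() above turns individual bits of cpl_rx_data_ddp_norss.ddp_status into skb flags (header CRC error, data CRC error, pad error, payload DDP'ed). Purely as a reading aid, here is a hypothetical debug formatter using the same CPL_RX_DDP_STATUS_*_SHIFT bit positions; it assumes kernel context (snprintf and the cxgb3 headers included above) and is not part of the driver.

/* Hypothetical debug-only helper: summarize a DDP status word using the
 * same bit positions do_iscsi_hdr() tests above. */
static void ddp_status_summary(unsigned int status, char *buf, size_t len)
{
	snprintf(buf, len, "%s%s%s%s",
		 (status & (1 << CPL_RX_DDP_STATUS_HCRC_SHIFT)) ? " hcrc-err" : "",
		 (status & (1 << CPL_RX_DDP_STATUS_DCRC_SHIFT)) ? " dcrc-err" : "",
		 (status & (1 << CPL_RX_DDP_STATUS_PAD_SHIFT)) ? " pad-err" : "",
		 (status & (1 << CPL_RX_DDP_STATUS_DDP_SHIFT)) ? " ddp" : "");
}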
+ */ +static int do_wr_ack(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) +{ + struct cxgbi_sock *csk = ctx; + struct cpl_wr_ack *hdr = cplhdr(skb); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx,%u, cr %u.\n", + csk, csk->state, csk->flags, csk->tid, ntohs(hdr->credits)); + + cxgbi_sock_rcv_wr_ack(csk, ntohs(hdr->credits), ntohl(hdr->snd_una), 1); + __kfree_skb(skb); + return 0; +} + +/* + * for each connection, pre-allocate skbs needed for close/abort requests. So + * that we can service the request right away. + */ +static int alloc_cpls(struct cxgbi_sock *csk) +{ + csk->cpl_close = alloc_wr(sizeof(struct cpl_close_con_req), 0, + GFP_KERNEL); + if (!csk->cpl_close) + return -ENOMEM; + csk->cpl_abort_req = alloc_wr(sizeof(struct cpl_abort_req), 0, + GFP_KERNEL); + if (!csk->cpl_abort_req) + goto free_cpl_skbs; + + csk->cpl_abort_rpl = alloc_wr(sizeof(struct cpl_abort_rpl), 0, + GFP_KERNEL); + if (!csk->cpl_abort_rpl) + goto free_cpl_skbs; + + return 0; + +free_cpl_skbs: + cxgbi_sock_free_cpl_skbs(csk); + return -ENOMEM; +} + +/** + * release_offload_resources - release offload resource + * @c3cn: the offloaded iscsi tcp connection. + * Release resources held by an offload connection (TID, L2T entry, etc.) + */ +static void l2t_put(struct cxgbi_sock *csk) +{ + struct t3cdev *t3dev = (struct t3cdev *)csk->cdev->lldev; + + if (csk->l2t) { + l2t_release(t3dev, csk->l2t); + csk->l2t = NULL; + cxgbi_sock_put(csk); + } +} + +static void release_offload_resources(struct cxgbi_sock *csk) +{ + struct t3cdev *t3dev = (struct t3cdev *)csk->cdev->lldev; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + csk->rss_qid = 0; + cxgbi_sock_free_cpl_skbs(csk); + + if (csk->wr_cred != csk->wr_max_cred) { + cxgbi_sock_purge_wr_queue(csk); + cxgbi_sock_reset_wr_list(csk); + } + l2t_put(csk); + if (cxgbi_sock_flag(csk, CTPF_HAS_ATID)) + free_atid(csk); + else if (cxgbi_sock_flag(csk, CTPF_HAS_TID)) { + cxgb3_remove_tid(t3dev, (void *)csk, csk->tid); + cxgbi_sock_clear_flag(csk, CTPF_HAS_TID); + cxgbi_sock_put(csk); + } + csk->dst = NULL; + csk->cdev = NULL; +} + +static void update_address(struct cxgbi_hba *chba) +{ + if (chba->ipv4addr) { + if (chba->vdev && + chba->ipv4addr != cxgb3i_get_private_ipv4addr(chba->vdev)) { + cxgb3i_set_private_ipv4addr(chba->vdev, chba->ipv4addr); + cxgb3i_set_private_ipv4addr(chba->ndev, 0); + pr_info("%s set %pI4.\n", + chba->vdev->name, &chba->ipv4addr); + } else if (chba->ipv4addr != + cxgb3i_get_private_ipv4addr(chba->ndev)) { + cxgb3i_set_private_ipv4addr(chba->ndev, chba->ipv4addr); + pr_info("%s set %pI4.\n", + chba->ndev->name, &chba->ipv4addr); + } + } else if (cxgb3i_get_private_ipv4addr(chba->ndev)) { + if (chba->vdev) + cxgb3i_set_private_ipv4addr(chba->vdev, 0); + cxgb3i_set_private_ipv4addr(chba->ndev, 0); + } +} + +static int init_act_open(struct cxgbi_sock *csk) +{ + struct dst_entry *dst = csk->dst; + struct cxgbi_device *cdev = csk->cdev; + struct t3cdev *t3dev = (struct t3cdev *)cdev->lldev; + struct net_device *ndev = cdev->ports[csk->port_id]; + struct cxgbi_hba *chba = cdev->hbas[csk->port_id]; + struct sk_buff *skb = NULL; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx.\n", csk, csk->state, csk->flags); + + update_address(chba); + if (chba->ipv4addr) + csk->saddr.sin_addr.s_addr = chba->ipv4addr; + + csk->rss_qid = 0; + csk->l2t = t3_l2t_get(t3dev, dst, ndev, + &csk->daddr.sin_addr.s_addr); + if (!csk->l2t) { + 
pr_err("NO l2t available.\n"); + return -EINVAL; + } + cxgbi_sock_get(csk); + + csk->atid = cxgb3_alloc_atid(t3dev, &t3_client, csk); + if (csk->atid < 0) { + pr_err("NO atid available.\n"); + goto rel_resource; + } + cxgbi_sock_set_flag(csk, CTPF_HAS_ATID); + cxgbi_sock_get(csk); + + skb = alloc_wr(sizeof(struct cpl_act_open_req), 0, GFP_KERNEL); + if (!skb) + goto rel_resource; + skb->sk = (struct sock *)csk; + set_arp_failure_handler(skb, act_open_arp_failure); + csk->snd_win = cxgb3i_snd_win; + csk->rcv_win = cxgb3i_rcv_win; + + csk->wr_max_cred = csk->wr_cred = T3C_DATA(t3dev)->max_wrs - 1; + csk->wr_una_cred = 0; + csk->mss_idx = cxgbi_sock_select_mss(csk, dst_mtu(dst)); + cxgbi_sock_reset_wr_list(csk); + csk->err = 0; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, %pI4:%u-%pI4:%u.\n", + csk, csk->state, csk->flags, + &csk->saddr.sin_addr.s_addr, ntohs(csk->saddr.sin_port), + &csk->daddr.sin_addr.s_addr, ntohs(csk->daddr.sin_port)); + + cxgbi_sock_set_state(csk, CTP_ACTIVE_OPEN); + send_act_open_req(csk, skb, csk->l2t); + return 0; + +rel_resource: + if (skb) + __kfree_skb(skb); + return -EINVAL; +} + +cxgb3_cpl_handler_func cxgb3i_cpl_handlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = do_act_establish, + [CPL_ACT_OPEN_RPL] = do_act_open_rpl, + [CPL_PEER_CLOSE] = do_peer_close, + [CPL_ABORT_REQ_RSS] = do_abort_req, + [CPL_ABORT_RPL_RSS] = do_abort_rpl, + [CPL_CLOSE_CON_RPL] = do_close_con_rpl, + [CPL_TX_DMA_ACK] = do_wr_ack, + [CPL_ISCSI_HDR] = do_iscsi_hdr, +}; + +/** + * cxgb3i_ofld_init - allocate and initialize resources for each adapter found + * @cdev: cxgbi adapter + */ +static int cxgb3i_ofld_init(struct cxgbi_device *cdev) +{ + struct t3cdev *t3dev = (struct t3cdev *)cdev->lldev; + struct adap_ports port; + struct ofld_page_info rx_page_info; + unsigned int wr_len; + int rc; + + if (t3dev->ctl(t3dev, GET_WR_LEN, &wr_len) < 0 || + t3dev->ctl(t3dev, GET_PORTS, &port) < 0 || + t3dev->ctl(t3dev, GET_RX_PAGE_INFO, &rx_page_info) < 0) { + pr_warn("t3 0x%p, offload up, ioctl failed.\n", t3dev); + return -EINVAL; + } + + if (cxgb3i_max_connect > CXGBI_MAX_CONN) + cxgb3i_max_connect = CXGBI_MAX_CONN; + + rc = cxgbi_device_portmap_create(cdev, cxgb3i_sport_base, + cxgb3i_max_connect); + if (rc < 0) + return rc; + + init_wr_tab(wr_len); + cdev->csk_release_offload_resources = release_offload_resources; + cdev->csk_push_tx_frames = push_tx_frames; + cdev->csk_send_abort_req = send_abort_req; + cdev->csk_send_close_req = send_close_req; + cdev->csk_send_rx_credits = send_rx_credits; + cdev->csk_alloc_cpls = alloc_cpls; + cdev->csk_init_act_open = init_act_open; + + pr_info("cdev 0x%p, offload up, added.\n", cdev); + return 0; +} + +/* + * functions to program the pagepod in h/w + */ +static inline void ulp_mem_io_set_hdr(struct sk_buff *skb, unsigned int addr) +{ + struct ulp_mem_io *req = (struct ulp_mem_io *)skb->head; + + memset(req, 0, sizeof(*req)); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(addr >> 5) | + V_ULPTX_CMD(ULP_MEM_WRITE)); + req->len = htonl(V_ULP_MEMIO_DATA_LEN(IPPOD_SIZE >> 5) | + V_ULPTX_NFLITS((IPPOD_SIZE >> 3) + 1)); +} + +static struct cxgbi_ppm *cdev2ppm(struct cxgbi_device *cdev) +{ + return ((struct t3cdev *)cdev->lldev)->ulp_iscsi; +} + +static int ddp_set_map(struct cxgbi_ppm *ppm, struct cxgbi_sock *csk, + struct cxgbi_task_tag_info *ttinfo) +{ + unsigned int idx = ttinfo->idx; + unsigned int npods = ttinfo->npods; + struct scatterlist *sg = ttinfo->sgl; + struct 
cxgbi_pagepod *ppod; + struct ulp_mem_io *req; + unsigned int sg_off; + unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ppm->llimit; + int i; + + for (i = 0; i < npods; i++, idx++, pm_addr += IPPOD_SIZE) { + struct sk_buff *skb = alloc_wr(sizeof(struct ulp_mem_io) + + IPPOD_SIZE, 0, GFP_ATOMIC); + + if (!skb) + return -ENOMEM; + ulp_mem_io_set_hdr(skb, pm_addr); + req = (struct ulp_mem_io *)skb->head; + ppod = (struct cxgbi_pagepod *)(req + 1); + sg_off = i * PPOD_PAGES_MAX; + cxgbi_ddp_set_one_ppod(ppod, ttinfo, &sg, + &sg_off); + skb->priority = CPL_PRIORITY_CONTROL; + cxgb3_ofld_send(ppm->lldev, skb); + } + return 0; +} + +static void ddp_clear_map(struct cxgbi_device *cdev, struct cxgbi_ppm *ppm, + struct cxgbi_task_tag_info *ttinfo) +{ + unsigned int idx = ttinfo->idx; + unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ppm->llimit; + unsigned int npods = ttinfo->npods; + int i; + + log_debug(1 << CXGBI_DBG_DDP, + "cdev 0x%p, clear idx %u, npods %u.\n", + cdev, idx, npods); + + for (i = 0; i < npods; i++, idx++, pm_addr += IPPOD_SIZE) { + struct sk_buff *skb = alloc_wr(sizeof(struct ulp_mem_io) + + IPPOD_SIZE, 0, GFP_ATOMIC); + + if (!skb) { + pr_err("cdev 0x%p, clear ddp, %u,%d/%u, skb OOM.\n", + cdev, idx, i, npods); + continue; + } + ulp_mem_io_set_hdr(skb, pm_addr); + skb->priority = CPL_PRIORITY_CONTROL; + cxgb3_ofld_send(ppm->lldev, skb); + } +} + +static int ddp_setup_conn_pgidx(struct cxgbi_sock *csk, + unsigned int tid, int pg_idx, bool reply) +{ + struct sk_buff *skb = alloc_wr(sizeof(struct cpl_set_tcb_field), 0, + GFP_KERNEL); + struct cpl_set_tcb_field *req; + u64 val = pg_idx < DDP_PGIDX_MAX ? pg_idx : 0; + + log_debug(1 << CXGBI_DBG_DDP, + "csk 0x%p, tid %u, pg_idx %d.\n", csk, tid, pg_idx); + if (!skb) + return -ENOMEM; + + /* set up ulp submode and page size */ + req = (struct cpl_set_tcb_field *)skb->head; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = V_NO_REPLY(reply ? 0 : 1); + req->cpu_idx = 0; + req->word = htons(31); + req->mask = cpu_to_be64(0xF0000000); + req->val = cpu_to_be64(val << 28); + skb->priority = CPL_PRIORITY_CONTROL; + + cxgb3_ofld_send(csk->cdev->lldev, skb); + return 0; +} + +/** + * cxgb3i_setup_conn_digest - setup conn. digest setting + * @csk: cxgb tcp socket + * @tid: connection id + * @hcrc: header digest enabled + * @dcrc: data digest enabled + * @reply: request reply from h/w + * set up the iscsi digest settings for a connection identified by tid + */ +static int ddp_setup_conn_digest(struct cxgbi_sock *csk, unsigned int tid, + int hcrc, int dcrc, int reply) +{ + struct sk_buff *skb = alloc_wr(sizeof(struct cpl_set_tcb_field), 0, + GFP_KERNEL); + struct cpl_set_tcb_field *req; + u64 val = (hcrc ? 1 : 0) | (dcrc ? 2 : 0); + + log_debug(1 << CXGBI_DBG_DDP, + "csk 0x%p, tid %u, crc %d,%d.\n", csk, tid, hcrc, dcrc); + if (!skb) + return -ENOMEM; + + /* set up ulp submode and page size */ + req = (struct cpl_set_tcb_field *)skb->head; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = V_NO_REPLY(reply ? 
0 : 1); + req->cpu_idx = 0; + req->word = htons(31); + req->mask = cpu_to_be64(0x0F000000); + req->val = cpu_to_be64(val << 24); + skb->priority = CPL_PRIORITY_CONTROL; + + cxgb3_ofld_send(csk->cdev->lldev, skb); + return 0; +} + +/** + * cxgb3i_ddp_init - initialize the cxgb3 adapter's ddp resource + * @cdev: cxgb3i adapter + * initialize the ddp pagepod manager for a given adapter + */ +static int cxgb3i_ddp_init(struct cxgbi_device *cdev) +{ + struct t3cdev *tdev = (struct t3cdev *)cdev->lldev; + struct net_device *ndev = cdev->ports[0]; + struct cxgbi_tag_format tformat; + unsigned int ppmax, tagmask = 0; + struct ulp_iscsi_info uinfo; + int i, err; + + err = tdev->ctl(tdev, ULP_ISCSI_GET_PARAMS, &uinfo); + if (err < 0) { + pr_err("%s, failed to get iscsi param %d.\n", + ndev->name, err); + return err; + } + if (uinfo.llimit >= uinfo.ulimit) { + pr_warn("T3 %s, iscsi NOT enabled %u ~ %u!\n", + ndev->name, uinfo.llimit, uinfo.ulimit); + return -EACCES; + } + + ppmax = (uinfo.ulimit - uinfo.llimit + 1) >> PPOD_SIZE_SHIFT; + tagmask = cxgbi_tagmask_set(ppmax); + + pr_info("T3 %s: 0x%x~0x%x, 0x%x, tagmask 0x%x -> 0x%x.\n", + ndev->name, uinfo.llimit, uinfo.ulimit, ppmax, uinfo.tagmask, + tagmask); + + memset(&tformat, 0, sizeof(struct cxgbi_tag_format)); + for (i = 0; i < 4; i++) + tformat.pgsz_order[i] = uinfo.pgsz_factor[i]; + cxgbi_tagmask_check(tagmask, &tformat); + + cxgbi_ddp_ppm_setup(&tdev->ulp_iscsi, cdev, &tformat, ppmax, + uinfo.llimit, uinfo.llimit, 0); + if (!(cdev->flags & CXGBI_FLAG_DDP_OFF)) { + uinfo.tagmask = tagmask; + uinfo.ulimit = uinfo.llimit + (ppmax << PPOD_SIZE_SHIFT); + + err = tdev->ctl(tdev, ULP_ISCSI_SET_PARAMS, &uinfo); + if (err < 0) { + pr_err("T3 %s fail to set iscsi param %d.\n", + ndev->name, err); + cdev->flags |= CXGBI_FLAG_DDP_OFF; + } + err = 0; + } + + cdev->csk_ddp_setup_digest = ddp_setup_conn_digest; + cdev->csk_ddp_setup_pgidx = ddp_setup_conn_pgidx; + cdev->csk_ddp_set_map = ddp_set_map; + cdev->csk_ddp_clear_map = ddp_clear_map; + cdev->cdev2ppm = cdev2ppm; + cdev->tx_max_size = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD, + uinfo.max_txsz - ISCSI_PDU_NONPAYLOAD_LEN); + cdev->rx_max_size = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD, + uinfo.max_rxsz - ISCSI_PDU_NONPAYLOAD_LEN); + + return 0; +} + +static void cxgb3i_dev_close(struct t3cdev *t3dev) +{ + struct cxgbi_device *cdev = cxgbi_device_find_by_lldev(t3dev); + + if (!cdev || cdev->flags & CXGBI_FLAG_ADAPTER_RESET) { + pr_info("0x%p close, f 0x%x.\n", cdev, cdev ? 
cdev->flags : 0); + return; + } + + cxgbi_device_unregister(cdev); +} + +/** + * cxgb3i_dev_open - init a t3 adapter structure and any h/w settings + * @t3dev: t3cdev adapter + */ +static void cxgb3i_dev_open(struct t3cdev *t3dev) +{ + struct cxgbi_device *cdev = cxgbi_device_find_by_lldev(t3dev); + struct adapter *adapter = tdev2adap(t3dev); + int i, err; + + if (cdev) { + pr_info("0x%p, updating.\n", cdev); + return; + } + + cdev = cxgbi_device_register(0, adapter->params.nports); + if (!cdev) { + pr_warn("device 0x%p register failed.\n", t3dev); + return; + } + + cdev->flags = CXGBI_FLAG_DEV_T3 | CXGBI_FLAG_IPV4_SET; + cdev->lldev = t3dev; + cdev->pdev = adapter->pdev; + cdev->ports = adapter->port; + cdev->nports = adapter->params.nports; + cdev->mtus = adapter->params.mtus; + cdev->nmtus = NMTUS; + cdev->rx_credit_thres = cxgb3i_rx_credit_thres; + cdev->skb_tx_rsvd = CXGB3I_TX_HEADER_LEN; + cdev->skb_rx_extra = sizeof(struct cpl_iscsi_hdr_norss); + cdev->itp = &cxgb3i_iscsi_transport; + + err = cxgb3i_ddp_init(cdev); + if (err) { + pr_info("0x%p ddp init failed\n", cdev); + goto err_out; + } + + err = cxgb3i_ofld_init(cdev); + if (err) { + pr_info("0x%p offload init failed\n", cdev); + goto err_out; + } + + err = cxgbi_hbas_add(cdev, CXGB3I_MAX_LUN, CXGBI_MAX_CONN, + &cxgb3i_host_template, cxgb3i_stt); + if (err) + goto err_out; + + for (i = 0; i < cdev->nports; i++) + cdev->hbas[i]->ipv4addr = + cxgb3i_get_private_ipv4addr(cdev->ports[i]); + + pr_info("cdev 0x%p, f 0x%x, t3dev 0x%p open, err %d.\n", + cdev, cdev ? cdev->flags : 0, t3dev, err); + return; + +err_out: + cxgbi_device_unregister(cdev); +} + +static void cxgb3i_dev_event_handler(struct t3cdev *t3dev, u32 event, u32 port) +{ + struct cxgbi_device *cdev = cxgbi_device_find_by_lldev(t3dev); + + log_debug(1 << CXGBI_DBG_TOE, + "0x%p, cdev 0x%p, event 0x%x, port 0x%x.\n", + t3dev, cdev, event, port); + if (!cdev) + return; + + switch (event) { + case OFFLOAD_STATUS_DOWN: + cdev->flags |= CXGBI_FLAG_ADAPTER_RESET; + break; + case OFFLOAD_STATUS_UP: + cdev->flags &= ~CXGBI_FLAG_ADAPTER_RESET; + break; + } +} + +/** + * cxgb3i_init_module - module init entry point + * + * initialize any driver wide global data structures and register itself + * with the cxgb3 module + */ +static int __init cxgb3i_init_module(void) +{ + int rc; + + printk(KERN_INFO "%s", version); + + rc = cxgbi_iscsi_init(&cxgb3i_iscsi_transport, &cxgb3i_stt); + if (rc < 0) + return rc; + + cxgb3_register_client(&t3_client); + return 0; +} + +/** + * cxgb3i_exit_module - module cleanup/exit entry point + * + * go through the driver hba list and for each hba, release any resource held. + * and unregisters iscsi transport and the cxgb3 module + */ +static void __exit cxgb3i_exit_module(void) +{ + cxgb3_unregister_client(&t3_client); + cxgbi_device_unregister_all(CXGBI_FLAG_DEV_T3); + cxgbi_iscsi_cleanup(&cxgb3i_iscsi_transport, &cxgb3i_stt); +} + +module_init(cxgb3i_init_module); +module_exit(cxgb3i_exit_module); diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.h b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.h new file mode 100644 index 0000000..b0430c9 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.h @@ -0,0 +1,62 @@ +/* + * cxgb3i.h: Chelsio S3xx iSCSI driver. + * + * Copyright (c) 2008-2015 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
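A worked example of the pagepod sizing in cxgb3i_ddp_init() above. It assumes the 64-byte pagepod implied by PPOD_SIZE_SHIFT == 6 (sizeof(struct cxgbi_pagepod)), and the llimit/ulimit window below is made up purely for the arithmetic; real values come from the ULP_ISCSI_GET_PARAMS ioctl.

#include <stdio.h>

#define PPOD_SIZE_SHIFT 6	/* assumption: 64-byte pagepods */

int main(void)
{
	unsigned int llimit = 0x0;		/* hypothetical ULP memory window */
	unsigned int ulimit = 0x00ffffff;	/* 16 MB of pagepod memory */
	unsigned int ppmax = (ulimit - llimit + 1) >> PPOD_SIZE_SHIFT;

	/* 16 MB / 64 B = 262144 pagepods; cxgbi_tagmask_set() then sizes the
	 * tag mask so a pagepod index of this range fits inside the ITT. */
	printf("ppmax = %u\n", ppmax);
	return 0;
}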
+ * + * Written by: Karen Xie (kxie@chelsio.com) + */ + +#ifndef __CXGB3I_H__ +#define __CXGB3I_H__ + +#define CXGB3I_SCSI_HOST_QDEPTH 1024 +#define CXGB3I_MAX_LUN 512 +#define ISCSI_PDU_NONPAYLOAD_MAX \ + (sizeof(struct iscsi_hdr) + ISCSI_MAX_AHS_SIZE + 2*ISCSI_DIGEST_SIZE) + +/*for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */ +#define CXGB3I_TX_HEADER_LEN \ + (sizeof(struct tx_data_wr) + sizeof(struct sge_opaque_hdr)) + +extern cxgb3_cpl_handler_func cxgb3i_cpl_handlers[NUM_CPL_CMDS]; + +static inline unsigned int cxgb3i_get_private_ipv4addr(struct net_device *ndev) +{ + return ((struct port_info *)(netdev_priv(ndev)))->iscsi_ipv4addr; +} + +static inline void cxgb3i_set_private_ipv4addr(struct net_device *ndev, + unsigned int addr) +{ + struct port_info *pi = (struct port_info *)netdev_priv(ndev); + + pi->iscsic.flags = addr ? 1 : 0; + pi->iscsi_ipv4addr = addr; + if (addr) + memcpy(pi->iscsic.mac_addr, ndev->dev_addr, ETH_ALEN); +} + +struct cpl_iscsi_hdr_norss { + union opcode_tid ot; + u16 pdu_len_ddp; + u16 len; + u32 seq; + u16 urg; + u8 rsvd; + u8 status; +}; + +struct cpl_rx_data_ddp_norss { + union opcode_tid ot; + u16 urg; + u16 len; + u32 seq; + u32 nxt_seq; + u32 ulp_crc; + u32 ddp_status; +}; +#endif diff --git a/drivers/scsi/cxgbi/cxgb4i/Kbuild b/drivers/scsi/cxgbi/cxgb4i/Kbuild new file mode 100644 index 0000000..38e03c2 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb4i/Kbuild @@ -0,0 +1,4 @@ +ccflags-y += -I$(srctree)/drivers/net/ethernet/chelsio/cxgb4 +ccflags-y += -I$(srctree)/drivers/net/ethernet/chelsio/libcxgb + +obj-$(CONFIG_SCSI_CXGB4_ISCSI) += cxgb4i.o diff --git a/drivers/scsi/cxgbi/cxgb4i/Kconfig b/drivers/scsi/cxgbi/cxgb4i/Kconfig new file mode 100644 index 0000000..594f593 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb4i/Kconfig @@ -0,0 +1,11 @@ +config SCSI_CXGB4_ISCSI + tristate "Chelsio T4 iSCSI support" + depends on PCI && INET && (IPV6 || IPV6=n) + select NETDEVICES + select ETHERNET + select NET_VENDOR_CHELSIO + select CHELSIO_T4 + select CHELSIO_LIB + select SCSI_ISCSI_ATTRS + ---help--- + This driver supports iSCSI offload for the Chelsio T4 devices. diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c new file mode 100644 index 0000000..211da1d --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c @@ -0,0 +1,2171 @@ +/* + * cxgb4i.c: Chelsio T4 iSCSI driver. + * + * Copyright (c) 2010-2015 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + * + * Written by: Karen Xie (kxie@chelsio.com) + * Rakesh Ranjan (rranjan@chelsio.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "t4_regs.h" +#include "t4_msg.h" +#include "cxgb4.h" +#include "cxgb4_uld.h" +#include "t4fw_api.h" +#include "l2t.h" +#include "cxgb4i.h" +#include "clip_tbl.h" + +static unsigned int dbg_level; + +#include "../libcxgbi.h" + +#define DRV_MODULE_NAME "cxgb4i" +#define DRV_MODULE_DESC "Chelsio T4-T6 iSCSI Driver" +#define DRV_MODULE_VERSION "0.9.5-ko" +#define DRV_MODULE_RELDATE "Apr. 
2015" + +static char version[] = + DRV_MODULE_DESC " " DRV_MODULE_NAME + " v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; + +MODULE_AUTHOR("Chelsio Communications, Inc."); +MODULE_DESCRIPTION(DRV_MODULE_DESC); +MODULE_VERSION(DRV_MODULE_VERSION); +MODULE_LICENSE("GPL"); + +module_param(dbg_level, uint, 0644); +MODULE_PARM_DESC(dbg_level, "Debug flag (default=0)"); + +#define CXGB4I_DEFAULT_10G_RCV_WIN (256 * 1024) +static int cxgb4i_rcv_win = -1; +module_param(cxgb4i_rcv_win, int, 0644); +MODULE_PARM_DESC(cxgb4i_rcv_win, "TCP reveive window in bytes"); + +#define CXGB4I_DEFAULT_10G_SND_WIN (128 * 1024) +static int cxgb4i_snd_win = -1; +module_param(cxgb4i_snd_win, int, 0644); +MODULE_PARM_DESC(cxgb4i_snd_win, "TCP send window in bytes"); + +static int cxgb4i_rx_credit_thres = 10 * 1024; +module_param(cxgb4i_rx_credit_thres, int, 0644); +MODULE_PARM_DESC(cxgb4i_rx_credit_thres, + "RX credits return threshold in bytes (default=10KB)"); + +static unsigned int cxgb4i_max_connect = (8 * 1024); +module_param(cxgb4i_max_connect, uint, 0644); +MODULE_PARM_DESC(cxgb4i_max_connect, "Maximum number of connections"); + +static unsigned short cxgb4i_sport_base = 20000; +module_param(cxgb4i_sport_base, ushort, 0644); +MODULE_PARM_DESC(cxgb4i_sport_base, "Starting port number (default 20000)"); + +typedef void (*cxgb4i_cplhandler_func)(struct cxgbi_device *, struct sk_buff *); + +static void *t4_uld_add(const struct cxgb4_lld_info *); +static int t4_uld_rx_handler(void *, const __be64 *, const struct pkt_gl *); +static int t4_uld_state_change(void *, enum cxgb4_state state); +static inline int send_tx_flowc_wr(struct cxgbi_sock *); + +static const struct cxgb4_uld_info cxgb4i_uld_info = { + .name = DRV_MODULE_NAME, + .nrxq = MAX_ULD_QSETS, + .ntxq = MAX_ULD_QSETS, + .rxq_size = 1024, + .lro = false, + .add = t4_uld_add, + .rx_handler = t4_uld_rx_handler, + .state_change = t4_uld_state_change, +}; + +static struct scsi_host_template cxgb4i_host_template = { + .module = THIS_MODULE, + .name = DRV_MODULE_NAME, + .proc_name = DRV_MODULE_NAME, + .can_queue = CXGB4I_SCSI_HOST_QDEPTH, + .queuecommand = iscsi_queuecommand, + .change_queue_depth = scsi_change_queue_depth, + .sg_tablesize = SG_ALL, + .max_sectors = 0xFFFF, + .cmd_per_lun = ISCSI_DEF_CMD_PER_LUN, + .eh_timed_out = iscsi_eh_cmd_timed_out, + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler = iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_recover_target, + .target_alloc = iscsi_target_alloc, + .use_clustering = DISABLE_CLUSTERING, + .this_id = -1, + .track_queue_depth = 1, +}; + +static struct iscsi_transport cxgb4i_iscsi_transport = { + .owner = THIS_MODULE, + .name = DRV_MODULE_NAME, + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST | + CAP_DATADGST | CAP_DIGEST_OFFLOAD | + CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO, + .attr_is_visible = cxgbi_attr_is_visible, + .get_host_param = cxgbi_get_host_param, + .set_host_param = cxgbi_set_host_param, + /* session management */ + .create_session = cxgbi_create_session, + .destroy_session = cxgbi_destroy_session, + .get_session_param = iscsi_session_get_param, + /* connection management */ + .create_conn = cxgbi_create_conn, + .bind_conn = cxgbi_bind_conn, + .destroy_conn = iscsi_tcp_conn_teardown, + .start_conn = iscsi_conn_start, + .stop_conn = iscsi_conn_stop, + .get_conn_param = iscsi_conn_get_param, + .set_param = cxgbi_set_conn_param, + .get_stats = cxgbi_get_conn_stats, + /* pdu xmit req from user space */ + .send_pdu = iscsi_conn_send_pdu, + /* task */ + .init_task = 
iscsi_tcp_task_init, + .xmit_task = iscsi_tcp_task_xmit, + .cleanup_task = cxgbi_cleanup_task, + /* pdu */ + .alloc_pdu = cxgbi_conn_alloc_pdu, + .init_pdu = cxgbi_conn_init_pdu, + .xmit_pdu = cxgbi_conn_xmit_pdu, + .parse_pdu_itt = cxgbi_parse_pdu_itt, + /* TCP connect/disconnect */ + .get_ep_param = cxgbi_get_ep_param, + .ep_connect = cxgbi_ep_connect, + .ep_poll = cxgbi_ep_poll, + .ep_disconnect = cxgbi_ep_disconnect, + /* Error recovery timeout call */ + .session_recovery_timedout = iscsi_session_recovery_timedout, +}; + +static struct scsi_transport_template *cxgb4i_stt; + +/* + * CPL (Chelsio Protocol Language) defines a message passing interface between + * the host driver and Chelsio asic. + * The section below implments CPLs that related to iscsi tcp connection + * open/close/abort and data send/receive. + */ + +#define RCV_BUFSIZ_MASK 0x3FFU +#define MAX_IMM_TX_PKT_LEN 256 + +static int push_tx_frames(struct cxgbi_sock *, int); + +/* + * is_ofld_imm - check whether a packet can be sent as immediate data + * @skb: the packet + * + * Returns true if a packet can be sent as an offload WR with immediate + * data. We currently use the same limit as for Ethernet packets. + */ +static inline bool is_ofld_imm(const struct sk_buff *skb) +{ + int len = skb->len; + + if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR))) + len += sizeof(struct fw_ofld_tx_data_wr); + + return len <= MAX_IMM_TX_PKT_LEN; +} + +static void send_act_open_req(struct cxgbi_sock *csk, struct sk_buff *skb, + struct l2t_entry *e) +{ + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev); + int wscale = cxgbi_sock_compute_wscale(csk->mss_idx); + unsigned long long opt0; + unsigned int opt2; + unsigned int qid_atid = ((unsigned int)csk->atid) | + (((unsigned int)csk->rss_qid) << 14); + + opt0 = KEEP_ALIVE_F | + WND_SCALE_V(wscale) | + MSS_IDX_V(csk->mss_idx) | + L2T_IDX_V(((struct l2t_entry *)csk->l2t)->idx) | + TX_CHAN_V(csk->tx_chan) | + SMAC_SEL_V(csk->smac_idx) | + ULP_MODE_V(ULP_MODE_ISCSI) | + RCV_BUFSIZ_V(csk->rcv_win >> 10); + + opt2 = RX_CHANNEL_V(0) | + RSS_QUEUE_VALID_F | + RSS_QUEUE_V(csk->rss_qid); + + if (is_t4(lldi->adapter_type)) { + struct cpl_act_open_req *req = + (struct cpl_act_open_req *)skb->head; + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + qid_atid)); + req->local_port = csk->saddr.sin_port; + req->peer_port = csk->daddr.sin_port; + req->local_ip = csk->saddr.sin_addr.s_addr; + req->peer_ip = csk->daddr.sin_addr.s_addr; + req->opt0 = cpu_to_be64(opt0); + req->params = cpu_to_be32(cxgb4_select_ntuple( + csk->cdev->ports[csk->port_id], + csk->l2t)); + opt2 |= RX_FC_VALID_F; + req->opt2 = cpu_to_be32(opt2); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk t4 0x%p, %pI4:%u-%pI4:%u, atid %d, qid %u.\n", + csk, &req->local_ip, ntohs(req->local_port), + &req->peer_ip, ntohs(req->peer_port), + csk->atid, csk->rss_qid); + } else if (is_t5(lldi->adapter_type)) { + struct cpl_t5_act_open_req *req = + (struct cpl_t5_act_open_req *)skb->head; + u32 isn = (prandom_u32() & ~7UL) - 1; + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + qid_atid)); + req->local_port = csk->saddr.sin_port; + req->peer_port = csk->daddr.sin_port; + req->local_ip = csk->saddr.sin_addr.s_addr; + req->peer_ip = csk->daddr.sin_addr.s_addr; + req->opt0 = cpu_to_be64(opt0); + req->params = cpu_to_be64(FILTER_TUPLE_V( + cxgb4_select_ntuple( + csk->cdev->ports[csk->port_id], + csk->l2t))); + req->rsvd = cpu_to_be32(isn); + opt2 |= 
T5_ISS_VALID; + opt2 |= T5_OPT_2_VALID_F; + + req->opt2 = cpu_to_be32(opt2); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk t5 0x%p, %pI4:%u-%pI4:%u, atid %d, qid %u.\n", + csk, &req->local_ip, ntohs(req->local_port), + &req->peer_ip, ntohs(req->peer_port), + csk->atid, csk->rss_qid); + } else { + struct cpl_t6_act_open_req *req = + (struct cpl_t6_act_open_req *)skb->head; + u32 isn = (prandom_u32() & ~7UL) - 1; + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, + qid_atid)); + req->local_port = csk->saddr.sin_port; + req->peer_port = csk->daddr.sin_port; + req->local_ip = csk->saddr.sin_addr.s_addr; + req->peer_ip = csk->daddr.sin_addr.s_addr; + req->opt0 = cpu_to_be64(opt0); + req->params = cpu_to_be64(FILTER_TUPLE_V( + cxgb4_select_ntuple( + csk->cdev->ports[csk->port_id], + csk->l2t))); + req->rsvd = cpu_to_be32(isn); + + opt2 |= T5_ISS_VALID; + opt2 |= RX_FC_DISABLE_F; + opt2 |= T5_OPT_2_VALID_F; + + req->opt2 = cpu_to_be32(opt2); + req->rsvd2 = cpu_to_be32(0); + req->opt3 = cpu_to_be32(0); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk t6 0x%p, %pI4:%u-%pI4:%u, atid %d, qid %u.\n", + csk, &req->local_ip, ntohs(req->local_port), + &req->peer_ip, ntohs(req->peer_port), + csk->atid, csk->rss_qid); + } + + set_wr_txq(skb, CPL_PRIORITY_SETUP, csk->port_id); + + pr_info_ipaddr("t%d csk 0x%p,%u,0x%lx,%u, rss_qid %u.\n", + (&csk->saddr), (&csk->daddr), + CHELSIO_CHIP_VERSION(lldi->adapter_type), csk, + csk->state, csk->flags, csk->atid, csk->rss_qid); + + cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void send_act_open_req6(struct cxgbi_sock *csk, struct sk_buff *skb, + struct l2t_entry *e) +{ + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev); + int wscale = cxgbi_sock_compute_wscale(csk->mss_idx); + unsigned long long opt0; + unsigned int opt2; + unsigned int qid_atid = ((unsigned int)csk->atid) | + (((unsigned int)csk->rss_qid) << 14); + + opt0 = KEEP_ALIVE_F | + WND_SCALE_V(wscale) | + MSS_IDX_V(csk->mss_idx) | + L2T_IDX_V(((struct l2t_entry *)csk->l2t)->idx) | + TX_CHAN_V(csk->tx_chan) | + SMAC_SEL_V(csk->smac_idx) | + ULP_MODE_V(ULP_MODE_ISCSI) | + RCV_BUFSIZ_V(csk->rcv_win >> 10); + + opt2 = RX_CHANNEL_V(0) | + RSS_QUEUE_VALID_F | + RSS_QUEUE_V(csk->rss_qid); + + if (is_t4(lldi->adapter_type)) { + struct cpl_act_open_req6 *req = + (struct cpl_act_open_req6 *)skb->head; + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + qid_atid)); + req->local_port = csk->saddr6.sin6_port; + req->peer_port = csk->daddr6.sin6_port; + + req->local_ip_hi = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr); + req->local_ip_lo = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr + + 8); + req->peer_ip_hi = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr); + req->peer_ip_lo = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr + + 8); + + req->opt0 = cpu_to_be64(opt0); + + opt2 |= RX_FC_VALID_F; + req->opt2 = cpu_to_be32(opt2); + + req->params = cpu_to_be32(cxgb4_select_ntuple( + csk->cdev->ports[csk->port_id], + csk->l2t)); + } else if (is_t5(lldi->adapter_type)) { + struct cpl_t5_act_open_req6 *req = + (struct cpl_t5_act_open_req6 *)skb->head; + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + qid_atid)); + req->local_port = csk->saddr6.sin6_port; + req->peer_port = csk->daddr6.sin6_port; + req->local_ip_hi = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr); + req->local_ip_lo = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr + + 8); + 
req->peer_ip_hi = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr); + req->peer_ip_lo = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr + + 8); + req->opt0 = cpu_to_be64(opt0); + + opt2 |= T5_OPT_2_VALID_F; + req->opt2 = cpu_to_be32(opt2); + + req->params = cpu_to_be64(FILTER_TUPLE_V(cxgb4_select_ntuple( + csk->cdev->ports[csk->port_id], + csk->l2t))); + } else { + struct cpl_t6_act_open_req6 *req = + (struct cpl_t6_act_open_req6 *)skb->head; + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, + qid_atid)); + req->local_port = csk->saddr6.sin6_port; + req->peer_port = csk->daddr6.sin6_port; + req->local_ip_hi = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr); + req->local_ip_lo = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr + + 8); + req->peer_ip_hi = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr); + req->peer_ip_lo = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr + + 8); + req->opt0 = cpu_to_be64(opt0); + + opt2 |= RX_FC_DISABLE_F; + opt2 |= T5_OPT_2_VALID_F; + + req->opt2 = cpu_to_be32(opt2); + + req->params = cpu_to_be64(FILTER_TUPLE_V(cxgb4_select_ntuple( + csk->cdev->ports[csk->port_id], + csk->l2t))); + + req->rsvd2 = cpu_to_be32(0); + req->opt3 = cpu_to_be32(0); + } + + set_wr_txq(skb, CPL_PRIORITY_SETUP, csk->port_id); + + pr_info("t%d csk 0x%p,%u,0x%lx,%u, [%pI6]:%u-[%pI6]:%u, rss_qid %u.\n", + CHELSIO_CHIP_VERSION(lldi->adapter_type), csk, csk->state, + csk->flags, csk->atid, + &csk->saddr6.sin6_addr, ntohs(csk->saddr.sin_port), + &csk->daddr6.sin6_addr, ntohs(csk->daddr.sin_port), + csk->rss_qid); + + cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t); +} +#endif + +static void send_close_req(struct cxgbi_sock *csk) +{ + struct sk_buff *skb = csk->cpl_close; + struct cpl_close_con_req *req = (struct cpl_close_con_req *)skb->head; + unsigned int tid = csk->tid; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, tid %u.\n", + csk, csk->state, csk->flags, csk->tid); + csk->cpl_close = NULL; + set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id); + INIT_TP_WR(req, tid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + req->rsvd = 0; + + cxgbi_sock_skb_entail(csk, skb); + if (csk->state >= CTP_ESTABLISHED) + push_tx_frames(csk, 1); +} + +static void abort_arp_failure(void *handle, struct sk_buff *skb) +{ + struct cxgbi_sock *csk = (struct cxgbi_sock *)handle; + struct cpl_abort_req *req; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, tid %u, abort.\n", + csk, csk->state, csk->flags, csk->tid); + req = (struct cpl_abort_req *)skb->data; + req->cmd = CPL_ABORT_NO_RST; + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); +} + +static void send_abort_req(struct cxgbi_sock *csk) +{ + struct cpl_abort_req *req; + struct sk_buff *skb = csk->cpl_abort_req; + + if (unlikely(csk->state == CTP_ABORTING) || !skb || !csk->cdev) + return; + + if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) { + send_tx_flowc_wr(csk); + cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT); + } + + cxgbi_sock_set_state(csk, CTP_ABORTING); + cxgbi_sock_set_flag(csk, CTPF_ABORT_RPL_PENDING); + cxgbi_sock_purge_write_queue(csk); + + csk->cpl_abort_req = NULL; + req = (struct cpl_abort_req *)skb->head; + set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id); + req->cmd = CPL_ABORT_SEND_RST; + t4_set_arp_err_handler(skb, csk, abort_arp_failure); + INIT_TP_WR(req, csk->tid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, csk->tid)); + req->rsvd0 = htonl(csk->snd_nxt); + req->rsvd1 = !cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT); + 
+ log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, snd_nxt %u, 0x%x.\n", + csk, csk->state, csk->flags, csk->tid, csk->snd_nxt, + req->rsvd1); + + cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t); +} + +static void send_abort_rpl(struct cxgbi_sock *csk, int rst_status) +{ + struct sk_buff *skb = csk->cpl_abort_rpl; + struct cpl_abort_rpl *rpl = (struct cpl_abort_rpl *)skb->head; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, status %d.\n", + csk, csk->state, csk->flags, csk->tid, rst_status); + + csk->cpl_abort_rpl = NULL; + set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id); + INIT_TP_WR(rpl, csk->tid); + OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, csk->tid)); + rpl->cmd = rst_status; + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); +} + +/* + * CPL connection rx data ack: host -> + * Send RX credits through an RX_DATA_ACK CPL message. Returns the number of + * credits sent. + */ +static u32 send_rx_credits(struct cxgbi_sock *csk, u32 credits) +{ + struct sk_buff *skb; + struct cpl_rx_data_ack *req; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx,%u, credit %u.\n", + csk, csk->state, csk->flags, csk->tid, credits); + + skb = alloc_wr(sizeof(*req), 0, GFP_ATOMIC); + if (!skb) { + pr_info("csk 0x%p, credit %u, OOM.\n", csk, credits); + return 0; + } + req = (struct cpl_rx_data_ack *)skb->head; + + set_wr_txq(skb, CPL_PRIORITY_ACK, csk->port_id); + INIT_TP_WR(req, csk->tid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK, + csk->tid)); + req->credit_dack = cpu_to_be32(RX_CREDITS_V(credits) + | RX_FORCE_ACK_F); + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); + return credits; +} + +/* + * sgl_len - calculates the size of an SGL of the given capacity + * @n: the number of SGL entries + * Calculates the number of flits needed for a scatter/gather list that + * can hold the given number of entries. + */ +static inline unsigned int sgl_len(unsigned int n) +{ + n--; + return (3 * n) / 2 + (n & 1) + 2; +} + +/* + * calc_tx_flits_ofld - calculate # of flits for an offload packet + * @skb: the packet + * + * Returns the number of flits needed for the given offload packet. + * These packets are already fully constructed and no additional headers + * will be added. + */ +static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb) +{ + unsigned int flits, cnt; + + if (is_ofld_imm(skb)) + return DIV_ROUND_UP(skb->len, 8); + flits = skb_transport_offset(skb) / 8; + cnt = skb_shinfo(skb)->nr_frags; + if (skb_tail_pointer(skb) != skb_transport_header(skb)) + cnt++; + return flits + sgl_len(cnt); +} + +#define FLOWC_WR_NPARAMS_MIN 9 +static inline int tx_flowc_wr_credits(int *nparamsp, int *flowclenp) +{ + int nparams, flowclen16, flowclen; + + nparams = FLOWC_WR_NPARAMS_MIN; + flowclen = offsetof(struct fw_flowc_wr, mnemval[nparams]); + flowclen16 = DIV_ROUND_UP(flowclen, 16); + flowclen = flowclen16 * 16; + /* + * Return the number of 16-byte credits used by the FlowC request. + * Pass back the nparams and actual FlowC length if requested. 
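A worked example of tx_flowc_wr_credits() above, under the assumption that struct fw_flowc_wr is an 8-byte header followed by 8-byte mnemonic/value pairs; with FLOWC_WR_NPARAMS_MIN == 9 that gives 80 bytes, i.e. five 16-byte credits with no rounding waste.

#include <stdio.h>

#define FLOWC_WR_NPARAMS_MIN 9

int main(void)
{
	int nparams = FLOWC_WR_NPARAMS_MIN;
	/* 8 + 8 * nparams stands in for offsetof(struct fw_flowc_wr,
	 * mnemval[nparams]); the sizes are assumptions for illustration. */
	int flowclen = 8 + 8 * nparams;
	int flowclen16 = (flowclen + 15) / 16;	/* DIV_ROUND_UP(flowclen, 16) */

	printf("flowclen %d bytes -> %d credits -> %d bytes on the ring\n",
	       flowclen, flowclen16, flowclen16 * 16);
	return 0;
}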
+ */ + if (nparamsp) + *nparamsp = nparams; + if (flowclenp) + *flowclenp = flowclen; + + return flowclen16; +} + +static inline int send_tx_flowc_wr(struct cxgbi_sock *csk) +{ + struct sk_buff *skb; + struct fw_flowc_wr *flowc; + int nparams, flowclen16, flowclen; + + flowclen16 = tx_flowc_wr_credits(&nparams, &flowclen); + skb = alloc_wr(flowclen, 0, GFP_ATOMIC); + flowc = (struct fw_flowc_wr *)skb->head; + flowc->op_to_nparams = + htonl(FW_WR_OP_V(FW_FLOWC_WR) | FW_FLOWC_WR_NPARAMS_V(nparams)); + flowc->flowid_len16 = + htonl(FW_WR_LEN16_V(flowclen16) | FW_WR_FLOWID_V(csk->tid)); + flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; + flowc->mnemval[0].val = htonl(csk->cdev->pfvf); + flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; + flowc->mnemval[1].val = htonl(csk->tx_chan); + flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; + flowc->mnemval[2].val = htonl(csk->tx_chan); + flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; + flowc->mnemval[3].val = htonl(csk->rss_qid); + flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT; + flowc->mnemval[4].val = htonl(csk->snd_nxt); + flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT; + flowc->mnemval[5].val = htonl(csk->rcv_nxt); + flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF; + flowc->mnemval[6].val = htonl(csk->snd_win); + flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; + flowc->mnemval[7].val = htonl(csk->advmss); + flowc->mnemval[8].mnemonic = 0; + flowc->mnemval[8].val = 0; + flowc->mnemval[8].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX; + flowc->mnemval[8].val = 16384; + + set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p, tid 0x%x, %u,%u,%u,%u,%u,%u,%u.\n", + csk, csk->tid, 0, csk->tx_chan, csk->rss_qid, + csk->snd_nxt, csk->rcv_nxt, csk->snd_win, + csk->advmss); + + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); + + return flowclen16; +} + +static inline void make_tx_data_wr(struct cxgbi_sock *csk, struct sk_buff *skb, + int dlen, int len, u32 credits, int compl) +{ + struct fw_ofld_tx_data_wr *req; + unsigned int submode = cxgbi_skcb_ulp_mode(skb) & 3; + unsigned int wr_ulp_mode = 0, val; + bool imm = is_ofld_imm(skb); + + req = __skb_push(skb, sizeof(*req)); + + if (imm) { + req->op_to_immdlen = htonl(FW_WR_OP_V(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL_F | + FW_WR_IMMDLEN_V(dlen)); + req->flowid_len16 = htonl(FW_WR_FLOWID_V(csk->tid) | + FW_WR_LEN16_V(credits)); + } else { + req->op_to_immdlen = + cpu_to_be32(FW_WR_OP_V(FW_OFLD_TX_DATA_WR) | + FW_WR_COMPL_F | + FW_WR_IMMDLEN_V(0)); + req->flowid_len16 = + cpu_to_be32(FW_WR_FLOWID_V(csk->tid) | + FW_WR_LEN16_V(credits)); + } + if (submode) + wr_ulp_mode = FW_OFLD_TX_DATA_WR_ULPMODE_V(ULP2_MODE_ISCSI) | + FW_OFLD_TX_DATA_WR_ULPSUBMODE_V(submode); + val = skb_peek(&csk->write_queue) ? 
0 : 1; + req->tunnel_to_proxy = htonl(wr_ulp_mode | + FW_OFLD_TX_DATA_WR_SHOVE_V(val)); + req->plen = htonl(len); + if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) + cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT); +} + +static void arp_failure_skb_discard(void *handle, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +static int push_tx_frames(struct cxgbi_sock *csk, int req_completion) +{ + int total_size = 0; + struct sk_buff *skb; + + if (unlikely(csk->state < CTP_ESTABLISHED || + csk->state == CTP_CLOSE_WAIT_1 || csk->state >= CTP_ABORTING)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK | + 1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, in closing state.\n", + csk, csk->state, csk->flags, csk->tid); + return 0; + } + + while (csk->wr_cred && (skb = skb_peek(&csk->write_queue)) != NULL) { + int dlen = skb->len; + int len = skb->len; + unsigned int credits_needed; + int flowclen16 = 0; + + skb_reset_transport_header(skb); + if (is_ofld_imm(skb)) + credits_needed = DIV_ROUND_UP(dlen, 16); + else + credits_needed = DIV_ROUND_UP( + 8 * calc_tx_flits_ofld(skb), + 16); + + if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR))) + credits_needed += DIV_ROUND_UP( + sizeof(struct fw_ofld_tx_data_wr), + 16); + + /* + * Assumes the initial credits is large enough to support + * fw_flowc_wr plus largest possible first payload + */ + if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) { + flowclen16 = send_tx_flowc_wr(csk); + csk->wr_cred -= flowclen16; + csk->wr_una_cred += flowclen16; + cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT); + } + + if (csk->wr_cred < credits_needed) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p, skb %u/%u, wr %d < %u.\n", + csk, skb->len, skb->data_len, + credits_needed, csk->wr_cred); + break; + } + __skb_unlink(skb, &csk->write_queue); + set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id); + skb->csum = credits_needed + flowclen16; + csk->wr_cred -= credits_needed; + csk->wr_una_cred += credits_needed; + cxgbi_sock_enqueue_wr(csk, skb); + + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p, skb %u/%u, wr %d, left %u, unack %u.\n", + csk, skb->len, skb->data_len, credits_needed, + csk->wr_cred, csk->wr_una_cred); + + if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR))) { + len += cxgbi_ulp_extra_len(cxgbi_skcb_ulp_mode(skb)); + make_tx_data_wr(csk, skb, dlen, len, credits_needed, + req_completion); + csk->snd_nxt += len; + cxgbi_skcb_clear_flag(skb, SKCBF_TX_NEED_HDR); + } else if (cxgbi_skcb_test_flag(skb, SKCBF_TX_FLAG_COMPL) && + (csk->wr_una_cred >= (csk->wr_max_cred / 2))) { + struct cpl_close_con_req *req = + (struct cpl_close_con_req *)skb->data; + req->wr.wr_hi |= htonl(FW_WR_COMPL_F); + } + total_size += skb->truesize; + t4_set_arp_err_handler(skb, csk, arp_failure_skb_discard); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, skb 0x%p, %u.\n", + csk, csk->state, csk->flags, csk->tid, skb, len); + + cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t); + } + return total_size; +} + +static inline void free_atid(struct cxgbi_sock *csk) +{ + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev); + + if (cxgbi_sock_flag(csk, CTPF_HAS_ATID)) { + cxgb4_free_atid(lldi->tids, csk->atid); + cxgbi_sock_clear_flag(csk, CTPF_HAS_ATID); + cxgbi_sock_put(csk); + } +} + +static void do_act_establish(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_act_establish *req = (struct cpl_act_establish *)skb->data; + unsigned short tcp_opt = ntohs(req->tcp_opt); + unsigned int tid = GET_TID(req); + 
unsigned int atid = TID_TID_G(ntohl(req->tos_atid)); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + u32 rcv_isn = be32_to_cpu(req->rcv_isn); + + csk = lookup_atid(t, atid); + if (unlikely(!csk)) { + pr_err("NO conn. for atid %u, cdev 0x%p.\n", atid, cdev); + goto rel_skb; + } + + if (csk->atid != atid) { + pr_err("bad conn atid %u, csk 0x%p,%u,0x%lx,tid %u, atid %u.\n", + atid, csk, csk->state, csk->flags, csk->tid, csk->atid); + goto rel_skb; + } + + pr_info_ipaddr("atid 0x%x, tid 0x%x, csk 0x%p,%u,0x%lx, isn %u.\n", + (&csk->saddr), (&csk->daddr), + atid, tid, csk, csk->state, csk->flags, rcv_isn); + + module_put(cdev->owner); + + cxgbi_sock_get(csk); + csk->tid = tid; + cxgb4_insert_tid(lldi->tids, csk, tid, csk->csk_family); + cxgbi_sock_set_flag(csk, CTPF_HAS_TID); + + free_atid(csk); + + spin_lock_bh(&csk->lock); + if (unlikely(csk->state != CTP_ACTIVE_OPEN)) + pr_info("csk 0x%p,%u,0x%lx,%u, got EST.\n", + csk, csk->state, csk->flags, csk->tid); + + if (csk->retry_timer.function) { + del_timer(&csk->retry_timer); + csk->retry_timer.function = NULL; + } + + csk->copied_seq = csk->rcv_wup = csk->rcv_nxt = rcv_isn; + /* + * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't + * pass through opt0. + */ + if (csk->rcv_win > (RCV_BUFSIZ_MASK << 10)) + csk->rcv_wup -= csk->rcv_win - (RCV_BUFSIZ_MASK << 10); + + csk->advmss = lldi->mtus[TCPOPT_MSS_G(tcp_opt)] - 40; + if (TCPOPT_TSTAMP_G(tcp_opt)) + csk->advmss -= 12; + if (csk->advmss < 128) + csk->advmss = 128; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p, mss_idx %u, advmss %u.\n", + csk, TCPOPT_MSS_G(tcp_opt), csk->advmss); + + cxgbi_sock_established(csk, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + + if (unlikely(cxgbi_sock_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED))) + send_abort_req(csk); + else { + if (skb_queue_len(&csk->write_queue)) + push_tx_frames(csk, 0); + cxgbi_conn_tx_open(csk); + } + spin_unlock_bh(&csk->lock); + +rel_skb: + __kfree_skb(skb); +} + +static int act_open_rpl_status_to_errno(int status) +{ + switch (status) { + case CPL_ERR_CONN_RESET: + return -ECONNREFUSED; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +static void csk_act_open_retry_timer(struct timer_list *t) +{ + struct sk_buff *skb = NULL; + struct cxgbi_sock *csk = from_timer(csk, t, retry_timer); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev); + void (*send_act_open_func)(struct cxgbi_sock *, struct sk_buff *, + struct l2t_entry *); + int t4 = is_t4(lldi->adapter_type), size, size6; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + if (t4) { + size = sizeof(struct cpl_act_open_req); + size6 = sizeof(struct cpl_act_open_req6); + } else { + size = sizeof(struct cpl_t5_act_open_req); + size6 = sizeof(struct cpl_t5_act_open_req6); + } + + if (csk->csk_family == AF_INET) { + send_act_open_func = send_act_open_req; + skb = alloc_wr(size, 0, GFP_ATOMIC); +#if IS_ENABLED(CONFIG_IPV6) + } else { + send_act_open_func = send_act_open_req6; + skb = alloc_wr(size6, 0, GFP_ATOMIC); +#endif + } + + if (!skb) + cxgbi_sock_fail_act_open(csk, -ENOMEM); + else { + skb->sk = (struct sock *)csk; + t4_set_arp_err_handler(skb, csk, + cxgbi_sock_act_open_req_arp_failure); + 
send_act_open_func(csk, skb, csk->l2t); + } + + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); + +} + +static inline bool is_neg_adv(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_KEEPALV_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE; +} + +static void do_act_open_rpl(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_act_open_rpl *rpl = (struct cpl_act_open_rpl *)skb->data; + unsigned int tid = GET_TID(rpl); + unsigned int atid = + TID_TID_G(AOPEN_ATID_G(be32_to_cpu(rpl->atid_status))); + unsigned int status = AOPEN_STATUS_G(be32_to_cpu(rpl->atid_status)); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_atid(t, atid); + if (unlikely(!csk)) { + pr_err("NO matching conn. atid %u, tid %u.\n", atid, tid); + goto rel_skb; + } + + pr_info_ipaddr("tid %u/%u, status %u.\n" + "csk 0x%p,%u,0x%lx. ", (&csk->saddr), (&csk->daddr), + atid, tid, status, csk, csk->state, csk->flags); + + if (is_neg_adv(status)) + goto rel_skb; + + module_put(cdev->owner); + + if (status && status != CPL_ERR_TCAM_FULL && + status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS) + cxgb4_remove_tid(lldi->tids, csk->port_id, GET_TID(rpl), + csk->csk_family); + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + if (status == CPL_ERR_CONN_EXIST && + csk->retry_timer.function != csk_act_open_retry_timer) { + csk->retry_timer.function = csk_act_open_retry_timer; + mod_timer(&csk->retry_timer, jiffies + HZ / 2); + } else + cxgbi_sock_fail_act_open(csk, + act_open_rpl_status_to_errno(status)); + + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +rel_skb: + __kfree_skb(skb); +} + +static void do_peer_close(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_peer_close *req = (struct cpl_peer_close *)skb->data; + unsigned int tid = GET_TID(req); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find connection for tid %u.\n", tid); + goto rel_skb; + } + pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u.\n", + (&csk->saddr), (&csk->daddr), + csk, csk->state, csk->flags, csk->tid); + cxgbi_sock_rcv_peer_close(csk); +rel_skb: + __kfree_skb(skb); +} + +static void do_close_con_rpl(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_close_con_rpl *rpl = (struct cpl_close_con_rpl *)skb->data; + unsigned int tid = GET_TID(rpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find connection for tid %u.\n", tid); + goto rel_skb; + } + pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u.\n", + (&csk->saddr), (&csk->daddr), + csk, csk->state, csk->flags, csk->tid); + cxgbi_sock_rcv_close_conn_rpl(csk, ntohl(rpl->snd_nxt)); +rel_skb: + __kfree_skb(skb); +} + +static int abort_status_to_errno(struct cxgbi_sock *csk, int abort_reason, + int *need_rst) +{ + switch (abort_reason) { + case CPL_ERR_BAD_SYN: /* fall through */ + case CPL_ERR_CONN_RESET: + return csk->state > CTP_ESTABLISHED ? 
+ -EPIPE : -ECONNRESET; + case CPL_ERR_XMIT_TIMEDOUT: + case CPL_ERR_PERSIST_TIMEDOUT: + case CPL_ERR_FINWAIT2_TIMEDOUT: + case CPL_ERR_KEEPALIVE_TIMEDOUT: + return -ETIMEDOUT; + default: + return -EIO; + } +} + +static void do_abort_req_rss(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_abort_req_rss *req = (struct cpl_abort_req_rss *)skb->data; + unsigned int tid = GET_TID(req); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + int rst_status = CPL_ABORT_NO_RST; + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find connection for tid %u.\n", tid); + goto rel_skb; + } + + pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u, status %u.\n", + (&csk->saddr), (&csk->daddr), + csk, csk->state, csk->flags, csk->tid, req->status); + + if (is_neg_adv(req->status)) + goto rel_skb; + + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + cxgbi_sock_clear_flag(csk, CTPF_ABORT_REQ_RCVD); + + if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) { + send_tx_flowc_wr(csk); + cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT); + } + + cxgbi_sock_set_flag(csk, CTPF_ABORT_REQ_RCVD); + cxgbi_sock_set_state(csk, CTP_ABORTING); + + send_abort_rpl(csk, rst_status); + + if (!cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) { + csk->err = abort_status_to_errno(csk, req->status, &rst_status); + cxgbi_sock_closed(csk); + } + + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +rel_skb: + __kfree_skb(skb); +} + +static void do_abort_rpl_rss(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_abort_rpl_rss *rpl = (struct cpl_abort_rpl_rss *)skb->data; + unsigned int tid = GET_TID(rpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (!csk) + goto rel_skb; + + if (csk) + pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u, status %u.\n", + (&csk->saddr), (&csk->daddr), csk, + csk->state, csk->flags, csk->tid, rpl->status); + + if (rpl->status == CPL_ERR_ABORT_FAILED) + goto rel_skb; + + cxgbi_sock_rcv_abort_rpl(csk); +rel_skb: + __kfree_skb(skb); +} + +static void do_rx_data(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_rx_data *cpl = (struct cpl_rx_data *)skb->data; + unsigned int tid = GET_TID(cpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (!csk) { + pr_err("can't find connection for tid %u.\n", tid); + } else { + /* not expecting this, reset the connection. */ + pr_err("csk 0x%p, tid %u, rcv cpl_rx_data.\n", csk, tid); + spin_lock_bh(&csk->lock); + send_abort_req(csk); + spin_unlock_bh(&csk->lock); + } + __kfree_skb(skb); +} + +static void do_rx_iscsi_hdr(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_iscsi_hdr *cpl = (struct cpl_iscsi_hdr *)skb->data; + unsigned short pdu_len_ddp = be16_to_cpu(cpl->pdu_len_ddp); + unsigned int tid = GET_TID(cpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find conn. 
for tid %u.\n", tid); + goto rel_skb; + } + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, tid %u, skb 0x%p,%u, 0x%x.\n", + csk, csk->state, csk->flags, csk->tid, skb, skb->len, + pdu_len_ddp); + + spin_lock_bh(&csk->lock); + + if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + if (csk->state != CTP_ABORTING) + goto abort_conn; + else + goto discard; + } + + cxgbi_skcb_tcp_seq(skb) = ntohl(cpl->seq); + cxgbi_skcb_flags(skb) = 0; + + skb_reset_transport_header(skb); + __skb_pull(skb, sizeof(*cpl)); + __pskb_trim(skb, ntohs(cpl->len)); + + if (!csk->skb_ulp_lhdr) { + unsigned char *bhs; + unsigned int hlen, dlen, plen; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, tid %u, skb 0x%p header.\n", + csk, csk->state, csk->flags, csk->tid, skb); + csk->skb_ulp_lhdr = skb; + cxgbi_skcb_set_flag(skb, SKCBF_RX_HDR); + + if (cxgbi_skcb_tcp_seq(skb) != csk->rcv_nxt) { + pr_info("tid %u, CPL_ISCSI_HDR, bad seq, 0x%x/0x%x.\n", + csk->tid, cxgbi_skcb_tcp_seq(skb), + csk->rcv_nxt); + goto abort_conn; + } + + bhs = skb->data; + hlen = ntohs(cpl->len); + dlen = ntohl(*(unsigned int *)(bhs + 4)) & 0xFFFFFF; + + plen = ISCSI_PDU_LEN_G(pdu_len_ddp); + if (is_t4(lldi->adapter_type)) + plen -= 40; + + if ((hlen + dlen) != plen) { + pr_info("tid 0x%x, CPL_ISCSI_HDR, pdu len " + "mismatch %u != %u + %u, seq 0x%x.\n", + csk->tid, plen, hlen, dlen, + cxgbi_skcb_tcp_seq(skb)); + goto abort_conn; + } + + cxgbi_skcb_rx_pdulen(skb) = (hlen + dlen + 3) & (~0x3); + if (dlen) + cxgbi_skcb_rx_pdulen(skb) += csk->dcrc_len; + csk->rcv_nxt += cxgbi_skcb_rx_pdulen(skb); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, skb 0x%p, 0x%x,%u+%u,0x%x,0x%x.\n", + csk, skb, *bhs, hlen, dlen, + ntohl(*((unsigned int *)(bhs + 16))), + ntohl(*((unsigned int *)(bhs + 24)))); + + } else { + struct sk_buff *lskb = csk->skb_ulp_lhdr; + + cxgbi_skcb_set_flag(lskb, SKCBF_RX_DATA); + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, skb 0x%p data, 0x%p.\n", + csk, csk->state, csk->flags, skb, lskb); + } + + __skb_queue_tail(&csk->receive_queue, skb); + spin_unlock_bh(&csk->lock); + return; + +abort_conn: + send_abort_req(csk); +discard: + spin_unlock_bh(&csk->lock); +rel_skb: + __kfree_skb(skb); +} + +static void do_rx_iscsi_data(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_iscsi_hdr *cpl = (struct cpl_iscsi_hdr *)skb->data; + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + struct sk_buff *lskb; + u32 tid = GET_TID(cpl); + u16 pdu_len_ddp = be16_to_cpu(cpl->pdu_len_ddp); + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find conn. 
for tid %u.\n", tid); + goto rel_skb; + } + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, tid %u, skb 0x%p,%u, 0x%x.\n", + csk, csk->state, csk->flags, csk->tid, skb, + skb->len, pdu_len_ddp); + + spin_lock_bh(&csk->lock); + + if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + + if (csk->state != CTP_ABORTING) + goto abort_conn; + else + goto discard; + } + + cxgbi_skcb_tcp_seq(skb) = be32_to_cpu(cpl->seq); + cxgbi_skcb_flags(skb) = 0; + + skb_reset_transport_header(skb); + __skb_pull(skb, sizeof(*cpl)); + __pskb_trim(skb, ntohs(cpl->len)); + + if (!csk->skb_ulp_lhdr) + csk->skb_ulp_lhdr = skb; + + lskb = csk->skb_ulp_lhdr; + cxgbi_skcb_set_flag(lskb, SKCBF_RX_DATA); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, skb 0x%p data, 0x%p.\n", + csk, csk->state, csk->flags, skb, lskb); + + __skb_queue_tail(&csk->receive_queue, skb); + spin_unlock_bh(&csk->lock); + return; + +abort_conn: + send_abort_req(csk); +discard: + spin_unlock_bh(&csk->lock); +rel_skb: + __kfree_skb(skb); +} + +static void +cxgb4i_process_ddpvld(struct cxgbi_sock *csk, + struct sk_buff *skb, u32 ddpvld) +{ + if (ddpvld & (1 << CPL_RX_DDP_STATUS_HCRC_SHIFT)) { + pr_info("csk 0x%p, lhdr 0x%p, status 0x%x, hcrc bad 0x%lx.\n", + csk, skb, ddpvld, cxgbi_skcb_flags(skb)); + cxgbi_skcb_set_flag(skb, SKCBF_RX_HCRC_ERR); + } + + if (ddpvld & (1 << CPL_RX_DDP_STATUS_DCRC_SHIFT)) { + pr_info("csk 0x%p, lhdr 0x%p, status 0x%x, dcrc bad 0x%lx.\n", + csk, skb, ddpvld, cxgbi_skcb_flags(skb)); + cxgbi_skcb_set_flag(skb, SKCBF_RX_DCRC_ERR); + } + + if (ddpvld & (1 << CPL_RX_DDP_STATUS_PAD_SHIFT)) { + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, lhdr 0x%p, status 0x%x, pad bad.\n", + csk, skb, ddpvld); + cxgbi_skcb_set_flag(skb, SKCBF_RX_PAD_ERR); + } + + if ((ddpvld & (1 << CPL_RX_DDP_STATUS_DDP_SHIFT)) && + !cxgbi_skcb_test_flag(skb, SKCBF_RX_DATA)) { + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, lhdr 0x%p, 0x%x, data ddp'ed.\n", + csk, skb, ddpvld); + cxgbi_skcb_set_flag(skb, SKCBF_RX_DATA_DDPD); + } +} + +static void do_rx_data_ddp(struct cxgbi_device *cdev, + struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct sk_buff *lskb; + struct cpl_rx_data_ddp *rpl = (struct cpl_rx_data_ddp *)skb->data; + unsigned int tid = GET_TID(rpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + u32 ddpvld = be32_to_cpu(rpl->ddpvld); + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find connection for tid %u.\n", tid); + goto rel_skb; + } + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, skb 0x%p,0x%x, lhdr 0x%p.\n", + csk, csk->state, csk->flags, skb, ddpvld, csk->skb_ulp_lhdr); + + spin_lock_bh(&csk->lock); + + if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + if (csk->state != CTP_ABORTING) + goto abort_conn; + else + goto discard; + } + + if (!csk->skb_ulp_lhdr) { + pr_err("tid 0x%x, rcv RX_DATA_DDP w/o pdu bhs.\n", csk->tid); + goto abort_conn; + } + + lskb = csk->skb_ulp_lhdr; + csk->skb_ulp_lhdr = NULL; + + cxgbi_skcb_rx_ddigest(lskb) = ntohl(rpl->ulp_crc); + + if (ntohs(rpl->len) != cxgbi_skcb_rx_pdulen(lskb)) + pr_info("tid 0x%x, RX_DATA_DDP pdulen %u != %u.\n", + csk->tid, ntohs(rpl->len), 
cxgbi_skcb_rx_pdulen(lskb)); + + cxgb4i_process_ddpvld(csk, lskb, ddpvld); + + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, lskb 0x%p, f 0x%lx.\n", + csk, lskb, cxgbi_skcb_flags(lskb)); + + cxgbi_skcb_set_flag(lskb, SKCBF_RX_STATUS); + cxgbi_conn_pdu_ready(csk); + spin_unlock_bh(&csk->lock); + goto rel_skb; + +abort_conn: + send_abort_req(csk); +discard: + spin_unlock_bh(&csk->lock); +rel_skb: + __kfree_skb(skb); +} + +static void +do_rx_iscsi_cmp(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_rx_iscsi_cmp *rpl = (struct cpl_rx_iscsi_cmp *)skb->data; + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + struct sk_buff *data_skb = NULL; + u32 tid = GET_TID(rpl); + u32 ddpvld = be32_to_cpu(rpl->ddpvld); + u32 seq = be32_to_cpu(rpl->seq); + u16 pdu_len_ddp = be16_to_cpu(rpl->pdu_len_ddp); + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) { + pr_err("can't find connection for tid %u.\n", tid); + goto rel_skb; + } + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx, skb 0x%p,0x%x, lhdr 0x%p, len %u, " + "pdu_len_ddp %u, status %u.\n", + csk, csk->state, csk->flags, skb, ddpvld, csk->skb_ulp_lhdr, + ntohs(rpl->len), pdu_len_ddp, rpl->status); + + spin_lock_bh(&csk->lock); + + if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + + if (csk->state != CTP_ABORTING) + goto abort_conn; + else + goto discard; + } + + cxgbi_skcb_tcp_seq(skb) = seq; + cxgbi_skcb_flags(skb) = 0; + cxgbi_skcb_rx_pdulen(skb) = 0; + + skb_reset_transport_header(skb); + __skb_pull(skb, sizeof(*rpl)); + __pskb_trim(skb, be16_to_cpu(rpl->len)); + + csk->rcv_nxt = seq + pdu_len_ddp; + + if (csk->skb_ulp_lhdr) { + data_skb = skb_peek(&csk->receive_queue); + if (!data_skb || + !cxgbi_skcb_test_flag(data_skb, SKCBF_RX_DATA)) { + pr_err("Error! 
freelist data not found 0x%p, tid %u\n", + data_skb, tid); + + goto abort_conn; + } + __skb_unlink(data_skb, &csk->receive_queue); + + cxgbi_skcb_set_flag(skb, SKCBF_RX_DATA); + + __skb_queue_tail(&csk->receive_queue, skb); + __skb_queue_tail(&csk->receive_queue, data_skb); + } else { + __skb_queue_tail(&csk->receive_queue, skb); + } + + csk->skb_ulp_lhdr = NULL; + + cxgbi_skcb_set_flag(skb, SKCBF_RX_HDR); + cxgbi_skcb_set_flag(skb, SKCBF_RX_STATUS); + cxgbi_skcb_set_flag(skb, SKCBF_RX_ISCSI_COMPL); + cxgbi_skcb_rx_ddigest(skb) = be32_to_cpu(rpl->ulp_crc); + + cxgb4i_process_ddpvld(csk, skb, ddpvld); + + log_debug(1 << CXGBI_DBG_PDU_RX, "csk 0x%p, skb 0x%p, f 0x%lx.\n", + csk, skb, cxgbi_skcb_flags(skb)); + + cxgbi_conn_pdu_ready(csk); + spin_unlock_bh(&csk->lock); + + return; + +abort_conn: + send_abort_req(csk); +discard: + spin_unlock_bh(&csk->lock); +rel_skb: + __kfree_skb(skb); +} + +static void do_fw4_ack(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cxgbi_sock *csk; + struct cpl_fw4_ack *rpl = (struct cpl_fw4_ack *)skb->data; + unsigned int tid = GET_TID(rpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + + csk = lookup_tid(t, tid); + if (unlikely(!csk)) + pr_err("can't find connection for tid %u.\n", tid); + else { + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + cxgbi_sock_rcv_wr_ack(csk, rpl->credits, ntohl(rpl->snd_una), + rpl->seq_vld); + } + __kfree_skb(skb); +} + +static void do_set_tcb_rpl(struct cxgbi_device *cdev, struct sk_buff *skb) +{ + struct cpl_set_tcb_rpl *rpl = (struct cpl_set_tcb_rpl *)skb->data; + unsigned int tid = GET_TID(rpl); + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct tid_info *t = lldi->tids; + struct cxgbi_sock *csk; + + csk = lookup_tid(t, tid); + if (!csk) + pr_err("can't find conn. 
for tid %u.\n", tid); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,%lx,%u, status 0x%x.\n", + csk, csk->state, csk->flags, csk->tid, rpl->status); + + if (rpl->status != CPL_ERR_NONE) + pr_err("csk 0x%p,%u, SET_TCB_RPL status %u.\n", + csk, tid, rpl->status); + + __kfree_skb(skb); +} + +static int alloc_cpls(struct cxgbi_sock *csk) +{ + csk->cpl_close = alloc_wr(sizeof(struct cpl_close_con_req), + 0, GFP_KERNEL); + if (!csk->cpl_close) + return -ENOMEM; + + csk->cpl_abort_req = alloc_wr(sizeof(struct cpl_abort_req), + 0, GFP_KERNEL); + if (!csk->cpl_abort_req) + goto free_cpls; + + csk->cpl_abort_rpl = alloc_wr(sizeof(struct cpl_abort_rpl), + 0, GFP_KERNEL); + if (!csk->cpl_abort_rpl) + goto free_cpls; + return 0; + +free_cpls: + cxgbi_sock_free_cpl_skbs(csk); + return -ENOMEM; +} + +static inline void l2t_put(struct cxgbi_sock *csk) +{ + if (csk->l2t) { + cxgb4_l2t_release(csk->l2t); + csk->l2t = NULL; + cxgbi_sock_put(csk); + } +} + +static void release_offload_resources(struct cxgbi_sock *csk) +{ + struct cxgb4_lld_info *lldi; +#if IS_ENABLED(CONFIG_IPV6) + struct net_device *ndev = csk->cdev->ports[csk->port_id]; +#endif + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + cxgbi_sock_free_cpl_skbs(csk); + cxgbi_sock_purge_write_queue(csk); + if (csk->wr_cred != csk->wr_max_cred) { + cxgbi_sock_purge_wr_queue(csk); + cxgbi_sock_reset_wr_list(csk); + } + + l2t_put(csk); +#if IS_ENABLED(CONFIG_IPV6) + if (csk->csk_family == AF_INET6) + cxgb4_clip_release(ndev, + (const u32 *)&csk->saddr6.sin6_addr, 1); +#endif + + if (cxgbi_sock_flag(csk, CTPF_HAS_ATID)) + free_atid(csk); + else if (cxgbi_sock_flag(csk, CTPF_HAS_TID)) { + lldi = cxgbi_cdev_priv(csk->cdev); + cxgb4_remove_tid(lldi->tids, 0, csk->tid, + csk->csk_family); + cxgbi_sock_clear_flag(csk, CTPF_HAS_TID); + cxgbi_sock_put(csk); + } + csk->dst = NULL; +} + +static int init_act_open(struct cxgbi_sock *csk) +{ + struct cxgbi_device *cdev = csk->cdev; + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct net_device *ndev = cdev->ports[csk->port_id]; + struct sk_buff *skb = NULL; + struct neighbour *n = NULL; + void *daddr; + unsigned int step; + unsigned int rxq_idx; + unsigned int size, size6; + unsigned int linkspeed; + unsigned int rcv_winf, snd_winf; + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u.\n", + csk, csk->state, csk->flags, csk->tid); + + if (csk->csk_family == AF_INET) + daddr = &csk->daddr.sin_addr.s_addr; +#if IS_ENABLED(CONFIG_IPV6) + else if (csk->csk_family == AF_INET6) + daddr = &csk->daddr6.sin6_addr; +#endif + else { + pr_err("address family 0x%x not supported\n", csk->csk_family); + goto rel_resource; + } + + n = dst_neigh_lookup(csk->dst, daddr); + + if (!n) { + pr_err("%s, can't get neighbour of csk->dst.\n", ndev->name); + goto rel_resource; + } + + if (!(n->nud_state & NUD_VALID)) + neigh_event_send(n, NULL); + + csk->atid = cxgb4_alloc_atid(lldi->tids, csk); + if (csk->atid < 0) { + pr_err("%s, NO atid available.\n", ndev->name); + goto rel_resource_without_clip; + } + cxgbi_sock_set_flag(csk, CTPF_HAS_ATID); + cxgbi_sock_get(csk); + + csk->l2t = cxgb4_l2t_get(lldi->l2t, n, ndev, 0); + if (!csk->l2t) { + pr_err("%s, cannot alloc l2t.\n", ndev->name); + goto rel_resource_without_clip; + } + cxgbi_sock_get(csk); + +#if IS_ENABLED(CONFIG_IPV6) + if (csk->csk_family == AF_INET6) + cxgb4_clip_get(ndev, (const u32 *)&csk->saddr6.sin6_addr, 1); +#endif + + if 
(is_t4(lldi->adapter_type)) { + size = sizeof(struct cpl_act_open_req); + size6 = sizeof(struct cpl_act_open_req6); + } else if (is_t5(lldi->adapter_type)) { + size = sizeof(struct cpl_t5_act_open_req); + size6 = sizeof(struct cpl_t5_act_open_req6); + } else { + size = sizeof(struct cpl_t6_act_open_req); + size6 = sizeof(struct cpl_t6_act_open_req6); + } + + if (csk->csk_family == AF_INET) + skb = alloc_wr(size, 0, GFP_NOIO); +#if IS_ENABLED(CONFIG_IPV6) + else + skb = alloc_wr(size6, 0, GFP_NOIO); +#endif + + if (!skb) + goto rel_resource; + skb->sk = (struct sock *)csk; + t4_set_arp_err_handler(skb, csk, cxgbi_sock_act_open_req_arp_failure); + + if (!csk->mtu) + csk->mtu = dst_mtu(csk->dst); + cxgb4_best_mtu(lldi->mtus, csk->mtu, &csk->mss_idx); + csk->tx_chan = cxgb4_port_chan(ndev); + csk->smac_idx = cxgb4_tp_smt_idx(lldi->adapter_type, + cxgb4_port_viid(ndev)); + step = lldi->ntxq / lldi->nchan; + csk->txq_idx = cxgb4_port_idx(ndev) * step; + step = lldi->nrxq / lldi->nchan; + rxq_idx = (cxgb4_port_idx(ndev) * step) + (cdev->rxq_idx_cntr % step); + cdev->rxq_idx_cntr++; + csk->rss_qid = lldi->rxq_ids[rxq_idx]; + linkspeed = ((struct port_info *)netdev_priv(ndev))->link_cfg.speed; + csk->snd_win = cxgb4i_snd_win; + csk->rcv_win = cxgb4i_rcv_win; + if (cxgb4i_rcv_win <= 0) { + csk->rcv_win = CXGB4I_DEFAULT_10G_RCV_WIN; + rcv_winf = linkspeed / SPEED_10000; + if (rcv_winf) + csk->rcv_win *= rcv_winf; + } + if (cxgb4i_snd_win <= 0) { + csk->snd_win = CXGB4I_DEFAULT_10G_SND_WIN; + snd_winf = linkspeed / SPEED_10000; + if (snd_winf) + csk->snd_win *= snd_winf; + } + csk->wr_cred = lldi->wr_cred - + DIV_ROUND_UP(sizeof(struct cpl_abort_req), 16); + csk->wr_max_cred = csk->wr_cred; + csk->wr_una_cred = 0; + cxgbi_sock_reset_wr_list(csk); + csk->err = 0; + + pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u,%u,%u, mtu %u,%u, smac %u.\n", + (&csk->saddr), (&csk->daddr), csk, csk->state, + csk->flags, csk->tx_chan, csk->txq_idx, csk->rss_qid, + csk->mtu, csk->mss_idx, csk->smac_idx); + + /* must wait for either a act_open_rpl or act_open_establish */ + if (!try_module_get(cdev->owner)) { + pr_err("%s, try_module_get failed.\n", ndev->name); + goto rel_resource; + } + + cxgbi_sock_set_state(csk, CTP_ACTIVE_OPEN); + if (csk->csk_family == AF_INET) + send_act_open_req(csk, skb, csk->l2t); +#if IS_ENABLED(CONFIG_IPV6) + else + send_act_open_req6(csk, skb, csk->l2t); +#endif + neigh_release(n); + + return 0; + +rel_resource: +#if IS_ENABLED(CONFIG_IPV6) + if (csk->csk_family == AF_INET6) + cxgb4_clip_release(ndev, + (const u32 *)&csk->saddr6.sin6_addr, 1); +#endif +rel_resource_without_clip: + if (n) + neigh_release(n); + if (skb) + __kfree_skb(skb); + return -EINVAL; +} + +static cxgb4i_cplhandler_func cxgb4i_cplhandlers[NUM_CPL_CMDS] = { + [CPL_ACT_ESTABLISH] = do_act_establish, + [CPL_ACT_OPEN_RPL] = do_act_open_rpl, + [CPL_PEER_CLOSE] = do_peer_close, + [CPL_ABORT_REQ_RSS] = do_abort_req_rss, + [CPL_ABORT_RPL_RSS] = do_abort_rpl_rss, + [CPL_CLOSE_CON_RPL] = do_close_con_rpl, + [CPL_FW4_ACK] = do_fw4_ack, + [CPL_ISCSI_HDR] = do_rx_iscsi_hdr, + [CPL_ISCSI_DATA] = do_rx_iscsi_data, + [CPL_SET_TCB_RPL] = do_set_tcb_rpl, + [CPL_RX_DATA_DDP] = do_rx_data_ddp, + [CPL_RX_ISCSI_DDP] = do_rx_data_ddp, + [CPL_RX_ISCSI_CMP] = do_rx_iscsi_cmp, + [CPL_RX_DATA] = do_rx_data, +}; + +static int cxgb4i_ofld_init(struct cxgbi_device *cdev) +{ + int rc; + + if (cxgb4i_max_connect > CXGB4I_MAX_CONN) + cxgb4i_max_connect = CXGB4I_MAX_CONN; + + rc = cxgbi_device_portmap_create(cdev, cxgb4i_sport_base, + cxgb4i_max_connect); + if 
(rc < 0) + return rc; + + cdev->csk_release_offload_resources = release_offload_resources; + cdev->csk_push_tx_frames = push_tx_frames; + cdev->csk_send_abort_req = send_abort_req; + cdev->csk_send_close_req = send_close_req; + cdev->csk_send_rx_credits = send_rx_credits; + cdev->csk_alloc_cpls = alloc_cpls; + cdev->csk_init_act_open = init_act_open; + + pr_info("cdev 0x%p, offload up, added.\n", cdev); + return 0; +} + +static inline void +ulp_mem_io_set_hdr(struct cxgbi_device *cdev, + struct ulp_mem_io *req, + unsigned int wr_len, unsigned int dlen, + unsigned int pm_addr, + int tid) +{ + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1); + + INIT_ULPTX_WR(req, wr_len, 0, tid); + req->wr.wr_hi = htonl(FW_WR_OP_V(FW_ULPTX_WR) | + FW_WR_ATOMIC_V(0)); + req->cmd = htonl(ULPTX_CMD_V(ULP_TX_MEM_WRITE) | + ULP_MEMIO_ORDER_V(is_t4(lldi->adapter_type)) | + T5_ULP_MEMIO_IMM_V(!is_t4(lldi->adapter_type))); + req->dlen = htonl(ULP_MEMIO_DATA_LEN_V(dlen >> 5)); + req->lock_addr = htonl(ULP_MEMIO_ADDR_V(pm_addr >> 5)); + req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16)); + + idata->cmd_more = htonl(ULPTX_CMD_V(ULP_TX_SC_IMM)); + idata->len = htonl(dlen); +} + +static struct sk_buff * +ddp_ppod_init_idata(struct cxgbi_device *cdev, + struct cxgbi_ppm *ppm, + unsigned int idx, unsigned int npods, + unsigned int tid) +{ + unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ppm->llimit; + unsigned int dlen = npods << PPOD_SIZE_SHIFT; + unsigned int wr_len = roundup(sizeof(struct ulp_mem_io) + + sizeof(struct ulptx_idata) + dlen, 16); + struct sk_buff *skb = alloc_wr(wr_len, 0, GFP_ATOMIC); + + if (!skb) { + pr_err("%s: %s idx %u, npods %u, OOM.\n", + __func__, ppm->ndev->name, idx, npods); + return NULL; + } + + ulp_mem_io_set_hdr(cdev, (struct ulp_mem_io *)skb->head, wr_len, dlen, + pm_addr, tid); + + return skb; +} + +static int ddp_ppod_write_idata(struct cxgbi_ppm *ppm, struct cxgbi_sock *csk, + struct cxgbi_task_tag_info *ttinfo, + unsigned int idx, unsigned int npods, + struct scatterlist **sg_pp, + unsigned int *sg_off) +{ + struct cxgbi_device *cdev = csk->cdev; + struct sk_buff *skb = ddp_ppod_init_idata(cdev, ppm, idx, npods, + csk->tid); + struct ulp_mem_io *req; + struct ulptx_idata *idata; + struct cxgbi_pagepod *ppod; + int i; + + if (!skb) + return -ENOMEM; + + req = (struct ulp_mem_io *)skb->head; + idata = (struct ulptx_idata *)(req + 1); + ppod = (struct cxgbi_pagepod *)(idata + 1); + + for (i = 0; i < npods; i++, ppod++) + cxgbi_ddp_set_one_ppod(ppod, ttinfo, sg_pp, sg_off); + + cxgbi_skcb_set_flag(skb, SKCBF_TX_MEM_WRITE); + cxgbi_skcb_set_flag(skb, SKCBF_TX_FLAG_COMPL); + set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id); + + spin_lock_bh(&csk->lock); + cxgbi_sock_skb_entail(csk, skb); + spin_unlock_bh(&csk->lock); + + return 0; +} + +static int ddp_set_map(struct cxgbi_ppm *ppm, struct cxgbi_sock *csk, + struct cxgbi_task_tag_info *ttinfo) +{ + unsigned int pidx = ttinfo->idx; + unsigned int npods = ttinfo->npods; + unsigned int i, cnt; + int err = 0; + struct scatterlist *sg = ttinfo->sgl; + unsigned int offset = 0; + + ttinfo->cid = csk->port_id; + + for (i = 0; i < npods; i += cnt, pidx += cnt) { + cnt = npods - i; + + if (cnt > ULPMEM_IDATA_MAX_NPPODS) + cnt = ULPMEM_IDATA_MAX_NPPODS; + err = ddp_ppod_write_idata(ppm, csk, ttinfo, pidx, cnt, + &sg, &offset); + if (err < 0) + break; + } + + return err; +} + +static int ddp_setup_conn_pgidx(struct cxgbi_sock *csk, unsigned int tid, + int pg_idx, bool 
reply) +{ + struct sk_buff *skb; + struct cpl_set_tcb_field *req; + + if (!pg_idx || pg_idx >= DDP_PGIDX_MAX) + return 0; + + skb = alloc_wr(sizeof(*req), 0, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + /* set up ulp page size */ + req = (struct cpl_set_tcb_field *)skb->head; + INIT_TP_WR(req, csk->tid); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, csk->tid)); + req->reply_ctrl = htons(NO_REPLY_V(reply) | QUEUENO_V(csk->rss_qid)); + req->word_cookie = htons(0); + req->mask = cpu_to_be64(0x3 << 8); + req->val = cpu_to_be64(pg_idx << 8); + set_wr_txq(skb, CPL_PRIORITY_CONTROL, csk->port_id); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p, tid 0x%x, pg_idx %u.\n", csk, csk->tid, pg_idx); + + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); + return 0; +} + +static int ddp_setup_conn_digest(struct cxgbi_sock *csk, unsigned int tid, + int hcrc, int dcrc, int reply) +{ + struct sk_buff *skb; + struct cpl_set_tcb_field *req; + + if (!hcrc && !dcrc) + return 0; + + skb = alloc_wr(sizeof(*req), 0, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + csk->hcrc_len = (hcrc ? 4 : 0); + csk->dcrc_len = (dcrc ? 4 : 0); + /* set up ulp submode */ + req = (struct cpl_set_tcb_field *)skb->head; + INIT_TP_WR(req, tid); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply_ctrl = htons(NO_REPLY_V(reply) | QUEUENO_V(csk->rss_qid)); + req->word_cookie = htons(0); + req->mask = cpu_to_be64(0x3 << 4); + req->val = cpu_to_be64(((hcrc ? ULP_CRC_HEADER : 0) | + (dcrc ? ULP_CRC_DATA : 0)) << 4); + set_wr_txq(skb, CPL_PRIORITY_CONTROL, csk->port_id); + + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p, tid 0x%x, crc %d,%d.\n", csk, csk->tid, hcrc, dcrc); + + cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb); + return 0; +} + +static struct cxgbi_ppm *cdev2ppm(struct cxgbi_device *cdev) +{ + return (struct cxgbi_ppm *)(*((struct cxgb4_lld_info *) + (cxgbi_cdev_priv(cdev)))->iscsi_ppm); +} + +static int cxgb4i_ddp_init(struct cxgbi_device *cdev) +{ + struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev); + struct net_device *ndev = cdev->ports[0]; + struct cxgbi_tag_format tformat; + unsigned int ppmax; + int i; + + if (!lldi->vr->iscsi.size) { + pr_warn("%s, iscsi NOT enabled, check config!\n", ndev->name); + return -EACCES; + } + + cdev->flags |= CXGBI_FLAG_USE_PPOD_OFLDQ; + ppmax = lldi->vr->iscsi.size >> PPOD_SIZE_SHIFT; + + memset(&tformat, 0, sizeof(struct cxgbi_tag_format)); + for (i = 0; i < 4; i++) + tformat.pgsz_order[i] = (lldi->iscsi_pgsz_order >> (i << 3)) + & 0xF; + cxgbi_tagmask_check(lldi->iscsi_tagmask, &tformat); + + cxgbi_ddp_ppm_setup(lldi->iscsi_ppm, cdev, &tformat, ppmax, + lldi->iscsi_llimit, lldi->vr->iscsi.start, 2); + + cdev->csk_ddp_setup_digest = ddp_setup_conn_digest; + cdev->csk_ddp_setup_pgidx = ddp_setup_conn_pgidx; + cdev->csk_ddp_set_map = ddp_set_map; + cdev->tx_max_size = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD, + lldi->iscsi_iolen - ISCSI_PDU_NONPAYLOAD_LEN); + cdev->rx_max_size = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD, + lldi->iscsi_iolen - ISCSI_PDU_NONPAYLOAD_LEN); + cdev->cdev2ppm = cdev2ppm; + + return 0; +} + +static void *t4_uld_add(const struct cxgb4_lld_info *lldi) +{ + struct cxgbi_device *cdev; + struct port_info *pi; + int i, rc; + + cdev = cxgbi_device_register(sizeof(*lldi), lldi->nports); + if (!cdev) { + pr_info("t4 device 0x%p, register failed.\n", lldi); + return NULL; + } + pr_info("0x%p,0x%x, ports %u,%s, chan %u, q %u,%u, wr %u.\n", + cdev, lldi->adapter_type, lldi->nports, + 
lldi->ports[0]->name, lldi->nchan, lldi->ntxq, + lldi->nrxq, lldi->wr_cred); + for (i = 0; i < lldi->nrxq; i++) + log_debug(1 << CXGBI_DBG_DEV, + "t4 0x%p, rxq id #%d: %u.\n", + cdev, i, lldi->rxq_ids[i]); + + memcpy(cxgbi_cdev_priv(cdev), lldi, sizeof(*lldi)); + cdev->flags = CXGBI_FLAG_DEV_T4; + cdev->pdev = lldi->pdev; + cdev->ports = lldi->ports; + cdev->nports = lldi->nports; + cdev->mtus = lldi->mtus; + cdev->nmtus = NMTUS; + cdev->rx_credit_thres = (CHELSIO_CHIP_VERSION(lldi->adapter_type) <= + CHELSIO_T5) ? cxgb4i_rx_credit_thres : 0; + cdev->skb_tx_rsvd = CXGB4I_TX_HEADER_LEN; + cdev->skb_rx_extra = sizeof(struct cpl_iscsi_hdr); + cdev->itp = &cxgb4i_iscsi_transport; + cdev->owner = THIS_MODULE; + + cdev->pfvf = FW_VIID_PFN_G(cxgb4_port_viid(lldi->ports[0])) + << FW_VIID_PFN_S; + pr_info("cdev 0x%p,%s, pfvf %u.\n", + cdev, lldi->ports[0]->name, cdev->pfvf); + + rc = cxgb4i_ddp_init(cdev); + if (rc) { + pr_info("t4 0x%p ddp init failed.\n", cdev); + goto err_out; + } + rc = cxgb4i_ofld_init(cdev); + if (rc) { + pr_info("t4 0x%p ofld init failed.\n", cdev); + goto err_out; + } + + rc = cxgbi_hbas_add(cdev, CXGB4I_MAX_LUN, CXGBI_MAX_CONN, + &cxgb4i_host_template, cxgb4i_stt); + if (rc) + goto err_out; + + for (i = 0; i < cdev->nports; i++) { + pi = netdev_priv(lldi->ports[i]); + cdev->hbas[i]->port_id = pi->port_id; + } + return cdev; + +err_out: + cxgbi_device_unregister(cdev); + return ERR_PTR(-ENOMEM); +} + +#define RX_PULL_LEN 128 +static int t4_uld_rx_handler(void *handle, const __be64 *rsp, + const struct pkt_gl *pgl) +{ + const struct cpl_act_establish *rpl; + struct sk_buff *skb; + unsigned int opc; + struct cxgbi_device *cdev = handle; + + if (pgl == NULL) { + unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8; + + skb = alloc_wr(len, 0, GFP_ATOMIC); + if (!skb) + goto nomem; + skb_copy_to_linear_data(skb, &rsp[1], len); + } else { + if (unlikely(*(u8 *)rsp != *(u8 *)pgl->va)) { + pr_info("? 
FL 0x%p,RSS%#llx,FL %#llx,len %u.\n", + pgl->va, be64_to_cpu(*rsp), + be64_to_cpu(*(u64 *)pgl->va), + pgl->tot_len); + return 0; + } + skb = cxgb4_pktgl_to_skb(pgl, RX_PULL_LEN, RX_PULL_LEN); + if (unlikely(!skb)) + goto nomem; + } + + rpl = (struct cpl_act_establish *)skb->data; + opc = rpl->ot.opcode; + log_debug(1 << CXGBI_DBG_TOE, + "cdev %p, opcode 0x%x(0x%x,0x%x), skb %p.\n", + cdev, opc, rpl->ot.opcode_tid, ntohl(rpl->ot.opcode_tid), skb); + if (opc >= ARRAY_SIZE(cxgb4i_cplhandlers) || !cxgb4i_cplhandlers[opc]) { + pr_err("No handler for opcode 0x%x.\n", opc); + __kfree_skb(skb); + } else + cxgb4i_cplhandlers[opc](cdev, skb); + + return 0; +nomem: + log_debug(1 << CXGBI_DBG_TOE, "OOM bailing out.\n"); + return 1; +} + +static int t4_uld_state_change(void *handle, enum cxgb4_state state) +{ + struct cxgbi_device *cdev = handle; + + switch (state) { + case CXGB4_STATE_UP: + pr_info("cdev 0x%p, UP.\n", cdev); + break; + case CXGB4_STATE_START_RECOVERY: + pr_info("cdev 0x%p, RECOVERY.\n", cdev); + /* close all connections */ + break; + case CXGB4_STATE_DOWN: + pr_info("cdev 0x%p, DOWN.\n", cdev); + break; + case CXGB4_STATE_DETACH: + pr_info("cdev 0x%p, DETACH.\n", cdev); + cxgbi_device_unregister(cdev); + break; + default: + pr_info("cdev 0x%p, unknown state %d.\n", cdev, state); + break; + } + return 0; +} + +static int __init cxgb4i_init_module(void) +{ + int rc; + + printk(KERN_INFO "%s", version); + + rc = cxgbi_iscsi_init(&cxgb4i_iscsi_transport, &cxgb4i_stt); + if (rc < 0) + return rc; + cxgb4_register_uld(CXGB4_ULD_ISCSI, &cxgb4i_uld_info); + + return 0; +} + +static void __exit cxgb4i_exit_module(void) +{ + cxgb4_unregister_uld(CXGB4_ULD_ISCSI); + cxgbi_device_unregister_all(CXGBI_FLAG_DEV_T4); + cxgbi_iscsi_cleanup(&cxgb4i_iscsi_transport, &cxgb4i_stt); +} + +module_init(cxgb4i_init_module); +module_exit(cxgb4i_exit_module); diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.h b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.h new file mode 100644 index 0000000..2fd9c76 --- /dev/null +++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.h @@ -0,0 +1,28 @@ +/* + * cxgb4i.h: Chelsio T4 iSCSI driver. + * + * Copyright (c) 2010-2015 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + * + * Written by: Karen Xie (kxie@chelsio.com) + * Written by: Rakesh Ranjan (rranjan@chelsio.com) + */ + +#ifndef __CXGB4I_H__ +#define __CXGB4I_H__ + +#define CXGB4I_SCSI_HOST_QDEPTH 1024 +#define CXGB4I_MAX_CONN 16384 +#define CXGB4I_MAX_TARGET CXGB4I_MAX_CONN +#define CXGB4I_MAX_LUN 0x1000 + +/* for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */ +#define CXGB4I_TX_HEADER_LEN \ + (sizeof(struct fw_ofld_tx_data_wr) + sizeof(struct sge_opaque_hdr)) + +#define T5_ISS_VALID (1 << 18) + +#endif /* __CXGB4I_H__ */ diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c new file mode 100644 index 0000000..3f3af5e --- /dev/null +++ b/drivers/scsi/cxgbi/libcxgbi.c @@ -0,0 +1,2783 @@ +/* + * libcxgbi.c: Chelsio common library for T3/T4 iSCSI driver. + * + * Copyright (c) 2010-2015 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. 
+ * + * Written by: Karen Xie (kxie@chelsio.com) + * Written by: Rakesh Ranjan (rranjan@chelsio.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* ip_dev_find */ +#include +#include + +static unsigned int dbg_level; + +#include "libcxgbi.h" + +#define DRV_MODULE_NAME "libcxgbi" +#define DRV_MODULE_DESC "Chelsio iSCSI driver library" +#define DRV_MODULE_VERSION "0.9.1-ko" +#define DRV_MODULE_RELDATE "Apr. 2015" + +static char version[] = + DRV_MODULE_DESC " " DRV_MODULE_NAME + " v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; + +MODULE_AUTHOR("Chelsio Communications, Inc."); +MODULE_DESCRIPTION(DRV_MODULE_DESC); +MODULE_VERSION(DRV_MODULE_VERSION); +MODULE_LICENSE("GPL"); + +module_param(dbg_level, uint, 0644); +MODULE_PARM_DESC(dbg_level, "libiscsi debug level (default=0)"); + + +/* + * cxgbi device management + * maintains a list of the cxgbi devices + */ +static LIST_HEAD(cdev_list); +static DEFINE_MUTEX(cdev_mutex); + +static LIST_HEAD(cdev_rcu_list); +static DEFINE_SPINLOCK(cdev_rcu_lock); + +static inline void cxgbi_decode_sw_tag(u32 sw_tag, int *idx, int *age) +{ + if (age) + *age = sw_tag & 0x7FFF; + if (idx) + *idx = (sw_tag >> 16) & 0x7FFF; +} + +int cxgbi_device_portmap_create(struct cxgbi_device *cdev, unsigned int base, + unsigned int max_conn) +{ + struct cxgbi_ports_map *pmap = &cdev->pmap; + + pmap->port_csk = cxgbi_alloc_big_mem(max_conn * + sizeof(struct cxgbi_sock *), + GFP_KERNEL); + if (!pmap->port_csk) { + pr_warn("cdev 0x%p, portmap OOM %u.\n", cdev, max_conn); + return -ENOMEM; + } + + pmap->max_connect = max_conn; + pmap->sport_base = base; + spin_lock_init(&pmap->lock); + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_device_portmap_create); + +void cxgbi_device_portmap_cleanup(struct cxgbi_device *cdev) +{ + struct cxgbi_ports_map *pmap = &cdev->pmap; + struct cxgbi_sock *csk; + int i; + + for (i = 0; i < pmap->max_connect; i++) { + if (pmap->port_csk[i]) { + csk = pmap->port_csk[i]; + pmap->port_csk[i] = NULL; + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p, cdev 0x%p, offload down.\n", + csk, cdev); + spin_lock_bh(&csk->lock); + cxgbi_sock_set_flag(csk, CTPF_OFFLOAD_DOWN); + cxgbi_sock_closed(csk); + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); + } + } +} +EXPORT_SYMBOL_GPL(cxgbi_device_portmap_cleanup); + +static inline void cxgbi_device_destroy(struct cxgbi_device *cdev) +{ + log_debug(1 << CXGBI_DBG_DEV, + "cdev 0x%p, p# %u.\n", cdev, cdev->nports); + cxgbi_hbas_remove(cdev); + cxgbi_device_portmap_cleanup(cdev); + cxgbi_ppm_release(cdev->cdev2ppm(cdev)); + if (cdev->pmap.max_connect) + cxgbi_free_big_mem(cdev->pmap.port_csk); + kfree(cdev); +} + +struct cxgbi_device *cxgbi_device_register(unsigned int extra, + unsigned int nports) +{ + struct cxgbi_device *cdev; + + cdev = kzalloc(sizeof(*cdev) + extra + nports * + (sizeof(struct cxgbi_hba *) + + sizeof(struct net_device *)), + GFP_KERNEL); + if (!cdev) { + pr_warn("nport %d, OOM.\n", nports); + return NULL; + } + cdev->ports = (struct net_device **)(cdev + 1); + cdev->hbas = (struct cxgbi_hba **)(((char*)cdev->ports) + nports * + sizeof(struct net_device *)); + if (extra) + cdev->dd_data = ((char *)cdev->hbas) + + nports * sizeof(struct cxgbi_hba *); + spin_lock_init(&cdev->pmap.lock); + + mutex_lock(&cdev_mutex); + list_add_tail(&cdev->list_head, &cdev_list); + mutex_unlock(&cdev_mutex); + + spin_lock(&cdev_rcu_lock); + 
list_add_tail_rcu(&cdev->rcu_node, &cdev_rcu_list); + spin_unlock(&cdev_rcu_lock); + + log_debug(1 << CXGBI_DBG_DEV, + "cdev 0x%p, p# %u.\n", cdev, nports); + return cdev; +} +EXPORT_SYMBOL_GPL(cxgbi_device_register); + +void cxgbi_device_unregister(struct cxgbi_device *cdev) +{ + log_debug(1 << CXGBI_DBG_DEV, + "cdev 0x%p, p# %u,%s.\n", + cdev, cdev->nports, cdev->nports ? cdev->ports[0]->name : ""); + + mutex_lock(&cdev_mutex); + list_del(&cdev->list_head); + mutex_unlock(&cdev_mutex); + + spin_lock(&cdev_rcu_lock); + list_del_rcu(&cdev->rcu_node); + spin_unlock(&cdev_rcu_lock); + synchronize_rcu(); + + cxgbi_device_destroy(cdev); +} +EXPORT_SYMBOL_GPL(cxgbi_device_unregister); + +void cxgbi_device_unregister_all(unsigned int flag) +{ + struct cxgbi_device *cdev, *tmp; + + mutex_lock(&cdev_mutex); + list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) { + if ((cdev->flags & flag) == flag) { + mutex_unlock(&cdev_mutex); + cxgbi_device_unregister(cdev); + mutex_lock(&cdev_mutex); + } + } + mutex_unlock(&cdev_mutex); +} +EXPORT_SYMBOL_GPL(cxgbi_device_unregister_all); + +struct cxgbi_device *cxgbi_device_find_by_lldev(void *lldev) +{ + struct cxgbi_device *cdev, *tmp; + + mutex_lock(&cdev_mutex); + list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) { + if (cdev->lldev == lldev) { + mutex_unlock(&cdev_mutex); + return cdev; + } + } + mutex_unlock(&cdev_mutex); + + log_debug(1 << CXGBI_DBG_DEV, + "lldev 0x%p, NO match found.\n", lldev); + return NULL; +} +EXPORT_SYMBOL_GPL(cxgbi_device_find_by_lldev); + +struct cxgbi_device *cxgbi_device_find_by_netdev(struct net_device *ndev, + int *port) +{ + struct net_device *vdev = NULL; + struct cxgbi_device *cdev, *tmp; + int i; + + if (is_vlan_dev(ndev)) { + vdev = ndev; + ndev = vlan_dev_real_dev(ndev); + log_debug(1 << CXGBI_DBG_DEV, + "vlan dev %s -> %s.\n", vdev->name, ndev->name); + } + + mutex_lock(&cdev_mutex); + list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) { + for (i = 0; i < cdev->nports; i++) { + if (ndev == cdev->ports[i]) { + cdev->hbas[i]->vdev = vdev; + mutex_unlock(&cdev_mutex); + if (port) + *port = i; + return cdev; + } + } + } + mutex_unlock(&cdev_mutex); + log_debug(1 << CXGBI_DBG_DEV, + "ndev 0x%p, %s, NO match found.\n", ndev, ndev->name); + return NULL; +} +EXPORT_SYMBOL_GPL(cxgbi_device_find_by_netdev); + +struct cxgbi_device *cxgbi_device_find_by_netdev_rcu(struct net_device *ndev, + int *port) +{ + struct net_device *vdev = NULL; + struct cxgbi_device *cdev; + int i; + + if (is_vlan_dev(ndev)) { + vdev = ndev; + ndev = vlan_dev_real_dev(ndev); + pr_info("vlan dev %s -> %s.\n", vdev->name, ndev->name); + } + + rcu_read_lock(); + list_for_each_entry_rcu(cdev, &cdev_rcu_list, rcu_node) { + for (i = 0; i < cdev->nports; i++) { + if (ndev == cdev->ports[i]) { + cdev->hbas[i]->vdev = vdev; + rcu_read_unlock(); + if (port) + *port = i; + return cdev; + } + } + } + rcu_read_unlock(); + + log_debug(1 << CXGBI_DBG_DEV, + "ndev 0x%p, %s, NO match found.\n", ndev, ndev->name); + return NULL; +} +EXPORT_SYMBOL_GPL(cxgbi_device_find_by_netdev_rcu); + +#if IS_ENABLED(CONFIG_IPV6) +static struct cxgbi_device *cxgbi_device_find_by_mac(struct net_device *ndev, + int *port) +{ + struct net_device *vdev = NULL; + struct cxgbi_device *cdev, *tmp; + int i; + + if (is_vlan_dev(ndev)) { + vdev = ndev; + ndev = vlan_dev_real_dev(ndev); + pr_info("vlan dev %s -> %s.\n", vdev->name, ndev->name); + } + + mutex_lock(&cdev_mutex); + list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) { + for (i = 0; i < cdev->nports; i++) 
{ + if (!memcmp(ndev->dev_addr, cdev->ports[i]->dev_addr, + MAX_ADDR_LEN)) { + cdev->hbas[i]->vdev = vdev; + mutex_unlock(&cdev_mutex); + if (port) + *port = i; + return cdev; + } + } + } + mutex_unlock(&cdev_mutex); + log_debug(1 << CXGBI_DBG_DEV, + "ndev 0x%p, %s, NO match mac found.\n", + ndev, ndev->name); + return NULL; +} +#endif + +void cxgbi_hbas_remove(struct cxgbi_device *cdev) +{ + int i; + struct cxgbi_hba *chba; + + log_debug(1 << CXGBI_DBG_DEV, + "cdev 0x%p, p#%u.\n", cdev, cdev->nports); + + for (i = 0; i < cdev->nports; i++) { + chba = cdev->hbas[i]; + if (chba) { + cdev->hbas[i] = NULL; + iscsi_host_remove(chba->shost); + pci_dev_put(cdev->pdev); + iscsi_host_free(chba->shost); + } + } +} +EXPORT_SYMBOL_GPL(cxgbi_hbas_remove); + +int cxgbi_hbas_add(struct cxgbi_device *cdev, u64 max_lun, + unsigned int max_id, struct scsi_host_template *sht, + struct scsi_transport_template *stt) +{ + struct cxgbi_hba *chba; + struct Scsi_Host *shost; + int i, err; + + log_debug(1 << CXGBI_DBG_DEV, "cdev 0x%p, p#%u.\n", cdev, cdev->nports); + + for (i = 0; i < cdev->nports; i++) { + shost = iscsi_host_alloc(sht, sizeof(*chba), 1); + if (!shost) { + pr_info("0x%p, p%d, %s, host alloc failed.\n", + cdev, i, cdev->ports[i]->name); + err = -ENOMEM; + goto err_out; + } + + shost->transportt = stt; + shost->max_lun = max_lun; + shost->max_id = max_id; + shost->max_channel = 0; + shost->max_cmd_len = 16; + + chba = iscsi_host_priv(shost); + chba->cdev = cdev; + chba->ndev = cdev->ports[i]; + chba->shost = shost; + + log_debug(1 << CXGBI_DBG_DEV, + "cdev 0x%p, p#%d %s: chba 0x%p.\n", + cdev, i, cdev->ports[i]->name, chba); + + pci_dev_get(cdev->pdev); + err = iscsi_host_add(shost, &cdev->pdev->dev); + if (err) { + pr_info("cdev 0x%p, p#%d %s, host add failed.\n", + cdev, i, cdev->ports[i]->name); + pci_dev_put(cdev->pdev); + scsi_host_put(shost); + goto err_out; + } + + cdev->hbas[i] = chba; + } + + return 0; + +err_out: + cxgbi_hbas_remove(cdev); + return err; +} +EXPORT_SYMBOL_GPL(cxgbi_hbas_add); + +/* + * iSCSI offload + * + * - source port management + * To find a free source port in the port allocation map we use a very simple + * rotor scheme to look for the next free port. + * + * If a source port has been specified make sure that it doesn't collide with + * our normal source port allocation map. If it's outside the range of our + * allocation/deallocation scheme just let them use it. + * + * If the source port is outside our allocation range, the caller is + * responsible for keeping track of their port usage. 
+ */ + +static struct cxgbi_sock *find_sock_on_port(struct cxgbi_device *cdev, + unsigned char port_id) +{ + struct cxgbi_ports_map *pmap = &cdev->pmap; + unsigned int i; + unsigned int used; + + if (!pmap->max_connect || !pmap->used) + return NULL; + + spin_lock_bh(&pmap->lock); + used = pmap->used; + for (i = 0; used && i < pmap->max_connect; i++) { + struct cxgbi_sock *csk = pmap->port_csk[i]; + + if (csk) { + if (csk->port_id == port_id) { + spin_unlock_bh(&pmap->lock); + return csk; + } + used--; + } + } + spin_unlock_bh(&pmap->lock); + + return NULL; +} + +static int sock_get_port(struct cxgbi_sock *csk) +{ + struct cxgbi_device *cdev = csk->cdev; + struct cxgbi_ports_map *pmap = &cdev->pmap; + unsigned int start; + int idx; + __be16 *port; + + if (!pmap->max_connect) { + pr_err("cdev 0x%p, p#%u %s, NO port map.\n", + cdev, csk->port_id, cdev->ports[csk->port_id]->name); + return -EADDRNOTAVAIL; + } + + if (csk->csk_family == AF_INET) + port = &csk->saddr.sin_port; + else /* ipv6 */ + port = &csk->saddr6.sin6_port; + + if (*port) { + pr_err("source port NON-ZERO %u.\n", + ntohs(*port)); + return -EADDRINUSE; + } + + spin_lock_bh(&pmap->lock); + if (pmap->used >= pmap->max_connect) { + spin_unlock_bh(&pmap->lock); + pr_info("cdev 0x%p, p#%u %s, ALL ports used.\n", + cdev, csk->port_id, cdev->ports[csk->port_id]->name); + return -EADDRNOTAVAIL; + } + + start = idx = pmap->next; + do { + if (++idx >= pmap->max_connect) + idx = 0; + if (!pmap->port_csk[idx]) { + pmap->used++; + *port = htons(pmap->sport_base + idx); + pmap->next = idx; + pmap->port_csk[idx] = csk; + spin_unlock_bh(&pmap->lock); + cxgbi_sock_get(csk); + log_debug(1 << CXGBI_DBG_SOCK, + "cdev 0x%p, p#%u %s, p %u, %u.\n", + cdev, csk->port_id, + cdev->ports[csk->port_id]->name, + pmap->sport_base + idx, pmap->next); + return 0; + } + } while (idx != start); + spin_unlock_bh(&pmap->lock); + + /* should not happen */ + pr_warn("cdev 0x%p, p#%u %s, next %u?\n", + cdev, csk->port_id, cdev->ports[csk->port_id]->name, + pmap->next); + return -EADDRNOTAVAIL; +} + +static void sock_put_port(struct cxgbi_sock *csk) +{ + struct cxgbi_device *cdev = csk->cdev; + struct cxgbi_ports_map *pmap = &cdev->pmap; + __be16 *port; + + if (csk->csk_family == AF_INET) + port = &csk->saddr.sin_port; + else /* ipv6 */ + port = &csk->saddr6.sin6_port; + + if (*port) { + int idx = ntohs(*port) - pmap->sport_base; + + *port = 0; + if (idx < 0 || idx >= pmap->max_connect) { + pr_err("cdev 0x%p, p#%u %s, port %u OOR.\n", + cdev, csk->port_id, + cdev->ports[csk->port_id]->name, + ntohs(*port)); + return; + } + + spin_lock_bh(&pmap->lock); + pmap->port_csk[idx] = NULL; + pmap->used--; + spin_unlock_bh(&pmap->lock); + + log_debug(1 << CXGBI_DBG_SOCK, + "cdev 0x%p, p#%u %s, release %u.\n", + cdev, csk->port_id, cdev->ports[csk->port_id]->name, + pmap->sport_base + idx); + + cxgbi_sock_put(csk); + } +} + +/* + * iscsi tcp connection + */ +void cxgbi_sock_free_cpl_skbs(struct cxgbi_sock *csk) +{ + if (csk->cpl_close) { + kfree_skb(csk->cpl_close); + csk->cpl_close = NULL; + } + if (csk->cpl_abort_req) { + kfree_skb(csk->cpl_abort_req); + csk->cpl_abort_req = NULL; + } + if (csk->cpl_abort_rpl) { + kfree_skb(csk->cpl_abort_rpl); + csk->cpl_abort_rpl = NULL; + } +} +EXPORT_SYMBOL_GPL(cxgbi_sock_free_cpl_skbs); + +static struct cxgbi_sock *cxgbi_sock_create(struct cxgbi_device *cdev) +{ + struct cxgbi_sock *csk = kzalloc(sizeof(*csk), GFP_NOIO); + + if (!csk) { + pr_info("alloc csk %zu failed.\n", sizeof(*csk)); + return NULL; + } + + if 
(cdev->csk_alloc_cpls(csk) < 0) { + pr_info("csk 0x%p, alloc cpls failed.\n", csk); + kfree(csk); + return NULL; + } + + spin_lock_init(&csk->lock); + kref_init(&csk->refcnt); + skb_queue_head_init(&csk->receive_queue); + skb_queue_head_init(&csk->write_queue); + timer_setup(&csk->retry_timer, NULL, 0); + rwlock_init(&csk->callback_lock); + csk->cdev = cdev; + csk->flags = 0; + cxgbi_sock_set_state(csk, CTP_CLOSED); + + log_debug(1 << CXGBI_DBG_SOCK, "cdev 0x%p, new csk 0x%p.\n", cdev, csk); + + return csk; +} + +static struct rtable *find_route_ipv4(struct flowi4 *fl4, + __be32 saddr, __be32 daddr, + __be16 sport, __be16 dport, u8 tos, + int ifindex) +{ + struct rtable *rt; + + rt = ip_route_output_ports(&init_net, fl4, NULL, daddr, saddr, + dport, sport, IPPROTO_TCP, tos, ifindex); + if (IS_ERR(rt)) + return NULL; + + return rt; +} + +static struct cxgbi_sock * +cxgbi_check_route(struct sockaddr *dst_addr, int ifindex) +{ + struct sockaddr_in *daddr = (struct sockaddr_in *)dst_addr; + struct dst_entry *dst; + struct net_device *ndev; + struct cxgbi_device *cdev; + struct rtable *rt = NULL; + struct neighbour *n; + struct flowi4 fl4; + struct cxgbi_sock *csk = NULL; + unsigned int mtu = 0; + int port = 0xFFFF; + int err = 0; + + rt = find_route_ipv4(&fl4, 0, daddr->sin_addr.s_addr, 0, + daddr->sin_port, 0, ifindex); + if (!rt) { + pr_info("no route to ipv4 0x%x, port %u.\n", + be32_to_cpu(daddr->sin_addr.s_addr), + be16_to_cpu(daddr->sin_port)); + err = -ENETUNREACH; + goto err_out; + } + dst = &rt->dst; + n = dst_neigh_lookup(dst, &daddr->sin_addr.s_addr); + if (!n) { + err = -ENODEV; + goto rel_rt; + } + ndev = n->dev; + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + pr_info("multi-cast route %pI4, port %u, dev %s.\n", + &daddr->sin_addr.s_addr, ntohs(daddr->sin_port), + ndev->name); + err = -ENETUNREACH; + goto rel_neigh; + } + + if (ndev->flags & IFF_LOOPBACK) { + ndev = ip_dev_find(&init_net, daddr->sin_addr.s_addr); + mtu = ndev->mtu; + pr_info("rt dev %s, loopback -> %s, mtu %u.\n", + n->dev->name, ndev->name, mtu); + } + + if (!(ndev->flags & IFF_UP) || !netif_carrier_ok(ndev)) { + pr_info("%s interface not up.\n", ndev->name); + err = -ENETDOWN; + goto rel_neigh; + } + + cdev = cxgbi_device_find_by_netdev(ndev, &port); + if (!cdev) { + pr_info("dst %pI4, %s, NOT cxgbi device.\n", + &daddr->sin_addr.s_addr, ndev->name); + err = -ENETUNREACH; + goto rel_neigh; + } + log_debug(1 << CXGBI_DBG_SOCK, + "route to %pI4 :%u, ndev p#%d,%s, cdev 0x%p.\n", + &daddr->sin_addr.s_addr, ntohs(daddr->sin_port), + port, ndev->name, cdev); + + csk = cxgbi_sock_create(cdev); + if (!csk) { + err = -ENOMEM; + goto rel_neigh; + } + csk->cdev = cdev; + csk->port_id = port; + csk->mtu = mtu; + csk->dst = dst; + + csk->csk_family = AF_INET; + csk->daddr.sin_addr.s_addr = daddr->sin_addr.s_addr; + csk->daddr.sin_port = daddr->sin_port; + csk->daddr.sin_family = daddr->sin_family; + csk->saddr.sin_family = daddr->sin_family; + csk->saddr.sin_addr.s_addr = fl4.saddr; + neigh_release(n); + + return csk; + +rel_neigh: + neigh_release(n); + +rel_rt: + ip_rt_put(rt); +err_out: + return ERR_PTR(err); +} + +#if IS_ENABLED(CONFIG_IPV6) +static struct rt6_info *find_route_ipv6(const struct in6_addr *saddr, + const struct in6_addr *daddr, + int ifindex) +{ + struct flowi6 fl; + + memset(&fl, 0, sizeof(fl)); + fl.flowi6_oif = ifindex; + if (saddr) + memcpy(&fl.saddr, saddr, sizeof(struct in6_addr)); + if (daddr) + memcpy(&fl.daddr, daddr, sizeof(struct in6_addr)); + return (struct rt6_info 
*)ip6_route_output(&init_net, NULL, &fl); +} + +static struct cxgbi_sock * +cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex) +{ + struct sockaddr_in6 *daddr6 = (struct sockaddr_in6 *)dst_addr; + struct dst_entry *dst; + struct net_device *ndev; + struct cxgbi_device *cdev; + struct rt6_info *rt = NULL; + struct neighbour *n; + struct in6_addr pref_saddr; + struct cxgbi_sock *csk = NULL; + unsigned int mtu = 0; + int port = 0xFFFF; + int err = 0; + + rt = find_route_ipv6(NULL, &daddr6->sin6_addr, ifindex); + + if (!rt) { + pr_info("no route to ipv6 %pI6 port %u\n", + daddr6->sin6_addr.s6_addr, + be16_to_cpu(daddr6->sin6_port)); + err = -ENETUNREACH; + goto err_out; + } + + dst = &rt->dst; + + n = dst_neigh_lookup(dst, &daddr6->sin6_addr); + + if (!n) { + pr_info("%pI6, port %u, dst no neighbour.\n", + daddr6->sin6_addr.s6_addr, + be16_to_cpu(daddr6->sin6_port)); + err = -ENETUNREACH; + goto rel_rt; + } + ndev = n->dev; + + if (!(ndev->flags & IFF_UP) || !netif_carrier_ok(ndev)) { + pr_info("%s interface not up.\n", ndev->name); + err = -ENETDOWN; + goto rel_rt; + } + + if (ipv6_addr_is_multicast(&daddr6->sin6_addr)) { + pr_info("multi-cast route %pI6 port %u, dev %s.\n", + daddr6->sin6_addr.s6_addr, + ntohs(daddr6->sin6_port), ndev->name); + err = -ENETUNREACH; + goto rel_rt; + } + + cdev = cxgbi_device_find_by_netdev(ndev, &port); + if (!cdev) + cdev = cxgbi_device_find_by_mac(ndev, &port); + if (!cdev) { + pr_info("dst %pI6 %s, NOT cxgbi device.\n", + daddr6->sin6_addr.s6_addr, ndev->name); + err = -ENETUNREACH; + goto rel_rt; + } + log_debug(1 << CXGBI_DBG_SOCK, + "route to %pI6 :%u, ndev p#%d,%s, cdev 0x%p.\n", + daddr6->sin6_addr.s6_addr, ntohs(daddr6->sin6_port), port, + ndev->name, cdev); + + csk = cxgbi_sock_create(cdev); + if (!csk) { + err = -ENOMEM; + goto rel_rt; + } + csk->cdev = cdev; + csk->port_id = port; + csk->mtu = mtu; + csk->dst = dst; + + if (ipv6_addr_any(&rt->rt6i_prefsrc.addr)) { + struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt); + + err = ipv6_dev_get_saddr(&init_net, idev ? 
idev->dev : NULL, + &daddr6->sin6_addr, 0, &pref_saddr); + if (err) { + pr_info("failed to get source address to reach %pI6\n", + &daddr6->sin6_addr); + goto rel_rt; + } + } else { + pref_saddr = rt->rt6i_prefsrc.addr; + } + + csk->csk_family = AF_INET6; + csk->daddr6.sin6_addr = daddr6->sin6_addr; + csk->daddr6.sin6_port = daddr6->sin6_port; + csk->daddr6.sin6_family = daddr6->sin6_family; + csk->saddr6.sin6_family = daddr6->sin6_family; + csk->saddr6.sin6_addr = pref_saddr; + + neigh_release(n); + return csk; + +rel_rt: + if (n) + neigh_release(n); + + ip6_rt_put(rt); + if (csk) + cxgbi_sock_closed(csk); +err_out: + return ERR_PTR(err); +} +#endif /* IS_ENABLED(CONFIG_IPV6) */ + +void cxgbi_sock_established(struct cxgbi_sock *csk, unsigned int snd_isn, + unsigned int opt) +{ + csk->write_seq = csk->snd_nxt = csk->snd_una = snd_isn; + dst_confirm(csk->dst); + smp_mb(); + cxgbi_sock_set_state(csk, CTP_ESTABLISHED); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_established); + +static void cxgbi_inform_iscsi_conn_closing(struct cxgbi_sock *csk) +{ + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p, state %u, flags 0x%lx, conn 0x%p.\n", + csk, csk->state, csk->flags, csk->user_data); + + if (csk->state != CTP_ESTABLISHED) { + read_lock_bh(&csk->callback_lock); + if (csk->user_data) + iscsi_conn_failure(csk->user_data, + ISCSI_ERR_TCP_CONN_CLOSE); + read_unlock_bh(&csk->callback_lock); + } +} + +void cxgbi_sock_closed(struct cxgbi_sock *csk) +{ + log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", + csk, (csk)->state, (csk)->flags, (csk)->tid); + cxgbi_sock_set_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED); + if (csk->state == CTP_ACTIVE_OPEN || csk->state == CTP_CLOSED) + return; + if (csk->saddr.sin_port) + sock_put_port(csk); + if (csk->dst) + dst_release(csk->dst); + csk->cdev->csk_release_offload_resources(csk); + cxgbi_sock_set_state(csk, CTP_CLOSED); + cxgbi_inform_iscsi_conn_closing(csk); + cxgbi_sock_put(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_closed); + +static void need_active_close(struct cxgbi_sock *csk) +{ + int data_lost; + int close_req = 0; + + log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", + csk, (csk)->state, (csk)->flags, (csk)->tid); + spin_lock_bh(&csk->lock); + if (csk->dst) + dst_confirm(csk->dst); + data_lost = skb_queue_len(&csk->receive_queue); + __skb_queue_purge(&csk->receive_queue); + + if (csk->state == CTP_ACTIVE_OPEN) + cxgbi_sock_set_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED); + else if (csk->state == CTP_ESTABLISHED) { + close_req = 1; + cxgbi_sock_set_state(csk, CTP_ACTIVE_CLOSE); + } else if (csk->state == CTP_PASSIVE_CLOSE) { + close_req = 1; + cxgbi_sock_set_state(csk, CTP_CLOSE_WAIT_2); + } + + if (close_req) { + if (!cxgbi_sock_flag(csk, CTPF_LOGOUT_RSP_RCVD) || + data_lost) + csk->cdev->csk_send_abort_req(csk); + else + csk->cdev->csk_send_close_req(csk); + } + + spin_unlock_bh(&csk->lock); +} + +void cxgbi_sock_fail_act_open(struct cxgbi_sock *csk, int errno) +{ + pr_info("csk 0x%p,%u,%lx, %pI4:%u-%pI4:%u, err %d.\n", + csk, csk->state, csk->flags, + &csk->saddr.sin_addr.s_addr, csk->saddr.sin_port, + &csk->daddr.sin_addr.s_addr, csk->daddr.sin_port, + errno); + + cxgbi_sock_set_state(csk, CTP_CONNECTING); + csk->err = errno; + cxgbi_sock_closed(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_fail_act_open); + +void cxgbi_sock_act_open_req_arp_failure(void *handle, struct sk_buff *skb) +{ + struct cxgbi_sock *csk = (struct cxgbi_sock *)skb->sk; + struct module *owner = csk->cdev->owner; + + log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", + csk, (csk)->state, 
(csk)->flags, (csk)->tid); + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + if (csk->state == CTP_ACTIVE_OPEN) + cxgbi_sock_fail_act_open(csk, -EHOSTUNREACH); + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); + __kfree_skb(skb); + + module_put(owner); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_act_open_req_arp_failure); + +void cxgbi_sock_rcv_abort_rpl(struct cxgbi_sock *csk) +{ + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + cxgbi_sock_set_flag(csk, CTPF_ABORT_RPL_RCVD); + if (cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) { + cxgbi_sock_clear_flag(csk, CTPF_ABORT_RPL_PENDING); + if (cxgbi_sock_flag(csk, CTPF_ABORT_REQ_RCVD)) + pr_err("csk 0x%p,%u,0x%lx,%u,ABT_RPL_RSS.\n", + csk, csk->state, csk->flags, csk->tid); + cxgbi_sock_closed(csk); + } + + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_abort_rpl); + +void cxgbi_sock_rcv_peer_close(struct cxgbi_sock *csk) +{ + log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", + csk, (csk)->state, (csk)->flags, (csk)->tid); + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + if (cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) + goto done; + + switch (csk->state) { + case CTP_ESTABLISHED: + cxgbi_sock_set_state(csk, CTP_PASSIVE_CLOSE); + break; + case CTP_ACTIVE_CLOSE: + cxgbi_sock_set_state(csk, CTP_CLOSE_WAIT_2); + break; + case CTP_CLOSE_WAIT_1: + cxgbi_sock_closed(csk); + break; + case CTP_ABORTING: + break; + default: + pr_err("csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + } + cxgbi_inform_iscsi_conn_closing(csk); +done: + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_peer_close); + +void cxgbi_sock_rcv_close_conn_rpl(struct cxgbi_sock *csk, u32 snd_nxt) +{ + log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n", + csk, (csk)->state, (csk)->flags, (csk)->tid); + cxgbi_sock_get(csk); + spin_lock_bh(&csk->lock); + + csk->snd_una = snd_nxt - 1; + if (cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) + goto done; + + switch (csk->state) { + case CTP_ACTIVE_CLOSE: + cxgbi_sock_set_state(csk, CTP_CLOSE_WAIT_1); + break; + case CTP_CLOSE_WAIT_1: + case CTP_CLOSE_WAIT_2: + cxgbi_sock_closed(csk); + break; + case CTP_ABORTING: + break; + default: + pr_err("csk 0x%p,%u,0x%lx,%u, bad state.\n", + csk, csk->state, csk->flags, csk->tid); + } +done: + spin_unlock_bh(&csk->lock); + cxgbi_sock_put(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_close_conn_rpl); + +void cxgbi_sock_rcv_wr_ack(struct cxgbi_sock *csk, unsigned int credits, + unsigned int snd_una, int seq_chk) +{ + log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx,%u, cr %u,%u+%u, snd_una %u,%d.\n", + csk, csk->state, csk->flags, csk->tid, credits, + csk->wr_cred, csk->wr_una_cred, snd_una, seq_chk); + + spin_lock_bh(&csk->lock); + + csk->wr_cred += credits; + if (csk->wr_una_cred > csk->wr_max_cred - csk->wr_cred) + csk->wr_una_cred = csk->wr_max_cred - csk->wr_cred; + + while (credits) { + struct sk_buff *p = cxgbi_sock_peek_wr(csk); + + if (unlikely(!p)) { + pr_err("csk 0x%p,%u,0x%lx,%u, cr %u,%u+%u, empty.\n", + csk, csk->state, csk->flags, csk->tid, credits, + csk->wr_cred, csk->wr_una_cred); + break; + } + + if (unlikely(credits < p->csum)) { + pr_warn("csk 0x%p,%u,0x%lx,%u, cr %u,%u+%u, < %u.\n", + csk, csk->state, csk->flags, csk->tid, + credits, csk->wr_cred, csk->wr_una_cred, + p->csum); + p->csum -= credits; + break; + } else { + cxgbi_sock_dequeue_wr(csk); + credits -= p->csum; + kfree_skb(p); + } + } + + cxgbi_sock_check_wr_invariants(csk); + 
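+ /*
+  * Credits for fully-acked WRs were reclaimed above.  Next, advance
+  * snd_una if the caller passed a valid sequence number (seq_chk),
+  * then kick the tx path so any queued pdus can use the freed credits.
+  */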
+ if (seq_chk) { + if (unlikely(before(snd_una, csk->snd_una))) { + pr_warn("csk 0x%p,%u,0x%lx,%u, snd_una %u/%u.", + csk, csk->state, csk->flags, csk->tid, snd_una, + csk->snd_una); + goto done; + } + + if (csk->snd_una != snd_una) { + csk->snd_una = snd_una; + dst_confirm(csk->dst); + } + } + + if (skb_queue_len(&csk->write_queue)) { + if (csk->cdev->csk_push_tx_frames(csk, 0)) + cxgbi_conn_tx_open(csk); + } else + cxgbi_conn_tx_open(csk); +done: + spin_unlock_bh(&csk->lock); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_wr_ack); + +static unsigned int cxgbi_sock_find_best_mtu(struct cxgbi_sock *csk, + unsigned short mtu) +{ + int i = 0; + + while (i < csk->cdev->nmtus - 1 && csk->cdev->mtus[i + 1] <= mtu) + ++i; + + return i; +} + +unsigned int cxgbi_sock_select_mss(struct cxgbi_sock *csk, unsigned int pmtu) +{ + unsigned int idx; + struct dst_entry *dst = csk->dst; + + csk->advmss = dst_metric_advmss(dst); + + if (csk->advmss > pmtu - 40) + csk->advmss = pmtu - 40; + if (csk->advmss < csk->cdev->mtus[0] - 40) + csk->advmss = csk->cdev->mtus[0] - 40; + idx = cxgbi_sock_find_best_mtu(csk, csk->advmss + 40); + + return idx; +} +EXPORT_SYMBOL_GPL(cxgbi_sock_select_mss); + +void cxgbi_sock_skb_entail(struct cxgbi_sock *csk, struct sk_buff *skb) +{ + cxgbi_skcb_tcp_seq(skb) = csk->write_seq; + __skb_queue_tail(&csk->write_queue, skb); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_skb_entail); + +void cxgbi_sock_purge_wr_queue(struct cxgbi_sock *csk) +{ + struct sk_buff *skb; + + while ((skb = cxgbi_sock_dequeue_wr(csk)) != NULL) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_purge_wr_queue); + +void cxgbi_sock_check_wr_invariants(const struct cxgbi_sock *csk) +{ + int pending = cxgbi_sock_count_pending_wrs(csk); + + if (unlikely(csk->wr_cred + pending != csk->wr_max_cred)) + pr_err("csk 0x%p, tid %u, credit %u + %u != %u.\n", + csk, csk->tid, csk->wr_cred, pending, csk->wr_max_cred); +} +EXPORT_SYMBOL_GPL(cxgbi_sock_check_wr_invariants); + +static int cxgbi_sock_send_pdus(struct cxgbi_sock *csk, struct sk_buff *skb) +{ + struct cxgbi_device *cdev = csk->cdev; + struct sk_buff *next; + int err, copied = 0; + + spin_lock_bh(&csk->lock); + + if (csk->state != CTP_ESTABLISHED) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, EAGAIN.\n", + csk, csk->state, csk->flags, csk->tid); + err = -EAGAIN; + goto out_err; + } + + if (csk->err) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, EPIPE %d.\n", + csk, csk->state, csk->flags, csk->tid, csk->err); + err = -EPIPE; + goto out_err; + } + + if (csk->write_seq - csk->snd_una >= csk->snd_win) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "csk 0x%p,%u,0x%lx,%u, FULL %u-%u >= %u.\n", + csk, csk->state, csk->flags, csk->tid, csk->write_seq, + csk->snd_una, csk->snd_win); + err = -ENOBUFS; + goto out_err; + } + + while (skb) { + int frags = skb_shinfo(skb)->nr_frags + + (skb->len != skb->data_len); + + if (unlikely(skb_headroom(skb) < cdev->skb_tx_rsvd)) { + pr_err("csk 0x%p, skb head %u < %u.\n", + csk, skb_headroom(skb), cdev->skb_tx_rsvd); + err = -EINVAL; + goto out_err; + } + + if (frags >= SKB_WR_LIST_SIZE) { + pr_err("csk 0x%p, frags %d, %u,%u >%u.\n", + csk, skb_shinfo(skb)->nr_frags, skb->len, + skb->data_len, (uint)(SKB_WR_LIST_SIZE)); + err = -EINVAL; + goto out_err; + } + + next = skb->next; + skb->next = NULL; + cxgbi_skcb_set_flag(skb, SKCBF_TX_NEED_HDR); + cxgbi_sock_skb_entail(csk, skb); + copied += skb->len; + csk->write_seq += skb->len + + cxgbi_ulp_extra_len(cxgbi_skcb_ulp_mode(skb)); + skb = next; + } + + if 
(likely(skb_queue_len(&csk->write_queue))) + cdev->csk_push_tx_frames(csk, 1); +done: + spin_unlock_bh(&csk->lock); + return copied; + +out_err: + if (copied == 0 && err == -EPIPE) + copied = csk->err ? csk->err : -EPIPE; + else + copied = err; + goto done; +} + +static inline void +scmd_get_params(struct scsi_cmnd *sc, struct scatterlist **sgl, + unsigned int *sgcnt, unsigned int *dlen, + unsigned int prot) +{ + struct scsi_data_buffer *sdb = prot ? scsi_prot(sc) : scsi_out(sc); + + *sgl = sdb->table.sgl; + *sgcnt = sdb->table.nents; + *dlen = sdb->length; + /* Caution: for protection sdb, sdb->length is invalid */ +} + +void cxgbi_ddp_set_one_ppod(struct cxgbi_pagepod *ppod, + struct cxgbi_task_tag_info *ttinfo, + struct scatterlist **sg_pp, unsigned int *sg_off) +{ + struct scatterlist *sg = sg_pp ? *sg_pp : NULL; + unsigned int offset = sg_off ? *sg_off : 0; + dma_addr_t addr = 0UL; + unsigned int len = 0; + int i; + + memcpy(ppod, &ttinfo->hdr, sizeof(struct cxgbi_pagepod_hdr)); + + if (sg) { + addr = sg_dma_address(sg); + len = sg_dma_len(sg); + } + + for (i = 0; i < PPOD_PAGES_MAX; i++) { + if (sg) { + ppod->addr[i] = cpu_to_be64(addr + offset); + offset += PAGE_SIZE; + if (offset == (len + sg->offset)) { + offset = 0; + sg = sg_next(sg); + if (sg) { + addr = sg_dma_address(sg); + len = sg_dma_len(sg); + } + } + } else { + ppod->addr[i] = 0ULL; + } + } + + /* + * the fifth address needs to be repeated in the next ppod, so do + * not move sg + */ + if (sg_pp) { + *sg_pp = sg; + *sg_off = offset; + } + + if (offset == len) { + offset = 0; + sg = sg_next(sg); + if (sg) { + addr = sg_dma_address(sg); + len = sg_dma_len(sg); + } + } + ppod->addr[i] = sg ? cpu_to_be64(addr + offset) : 0ULL; +} +EXPORT_SYMBOL_GPL(cxgbi_ddp_set_one_ppod); + +/* + * APIs interacting with open-iscsi libraries + */ + +static unsigned char padding[4]; + +void cxgbi_ddp_ppm_setup(void **ppm_pp, struct cxgbi_device *cdev, + struct cxgbi_tag_format *tformat, unsigned int ppmax, + unsigned int llimit, unsigned int start, + unsigned int rsvd_factor) +{ + int err = cxgbi_ppm_init(ppm_pp, cdev->ports[0], cdev->pdev, + cdev->lldev, tformat, ppmax, llimit, start, + rsvd_factor); + + if (err >= 0) { + struct cxgbi_ppm *ppm = (struct cxgbi_ppm *)(*ppm_pp); + + if (ppm->ppmax < 1024 || + ppm->tformat.pgsz_idx_dflt >= DDP_PGIDX_MAX) + cdev->flags |= CXGBI_FLAG_DDP_OFF; + err = 0; + } else { + cdev->flags |= CXGBI_FLAG_DDP_OFF; + } +} +EXPORT_SYMBOL_GPL(cxgbi_ddp_ppm_setup); + +static int cxgbi_ddp_sgl_check(struct scatterlist *sgl, int nents) +{ + int i; + int last_sgidx = nents - 1; + struct scatterlist *sg = sgl; + + for (i = 0; i < nents; i++, sg = sg_next(sg)) { + unsigned int len = sg->length + sg->offset; + + if ((sg->offset & 0x3) || (i && sg->offset) || + ((i != last_sgidx) && len != PAGE_SIZE)) { + log_debug(1 << CXGBI_DBG_DDP, + "sg %u/%u, %u,%u, not aligned.\n", + i, nents, sg->offset, sg->length); + goto err_out; + } + } + return 0; +err_out: + return -EINVAL; +} + +static int cxgbi_ddp_reserve(struct cxgbi_conn *cconn, + struct cxgbi_task_data *tdata, u32 sw_tag, + unsigned int xferlen) +{ + struct cxgbi_sock *csk = cconn->cep->csk; + struct cxgbi_device *cdev = csk->cdev; + struct cxgbi_ppm *ppm = cdev->cdev2ppm(cdev); + struct cxgbi_task_tag_info *ttinfo = &tdata->ttinfo; + struct scatterlist *sgl = ttinfo->sgl; + unsigned int sgcnt = ttinfo->nents; + unsigned int sg_offset = sgl->offset; + int err; + + if (cdev->flags & CXGBI_FLAG_DDP_OFF) { + log_debug(1 << CXGBI_DBG_DDP, + "cdev 0x%p DDP off.\n", cdev); + 
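+ /* task_reserve_itt() will fall back to a plain (non-ddp) sw tag */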
return -EINVAL; + } + + if (!ppm || xferlen < DDP_THRESHOLD || !sgcnt || + ppm->tformat.pgsz_idx_dflt >= DDP_PGIDX_MAX) { + log_debug(1 << CXGBI_DBG_DDP, + "ppm 0x%p, pgidx %u, xfer %u, sgcnt %u, NO ddp.\n", + ppm, ppm ? ppm->tformat.pgsz_idx_dflt : DDP_PGIDX_MAX, + xferlen, ttinfo->nents); + return -EINVAL; + } + + /* make sure the buffer is suitable for ddp */ + if (cxgbi_ddp_sgl_check(sgl, sgcnt) < 0) + return -EINVAL; + + ttinfo->nr_pages = (xferlen + sgl->offset + (1 << PAGE_SHIFT) - 1) >> + PAGE_SHIFT; + + /* + * the ddp tag will be used for the itt in the outgoing pdu, + * the itt genrated by libiscsi is saved in the ppm and can be + * retrieved via the ddp tag + */ + err = cxgbi_ppm_ppods_reserve(ppm, ttinfo->nr_pages, 0, &ttinfo->idx, + &ttinfo->tag, (unsigned long)sw_tag); + if (err < 0) { + cconn->ddp_full++; + return err; + } + ttinfo->npods = err; + + /* setup dma from scsi command sgl */ + sgl->offset = 0; + err = dma_map_sg(&ppm->pdev->dev, sgl, sgcnt, DMA_FROM_DEVICE); + sgl->offset = sg_offset; + if (err == 0) { + pr_info("%s: 0x%x, xfer %u, sgl %u dma mapping err.\n", + __func__, sw_tag, xferlen, sgcnt); + goto rel_ppods; + } + if (err != ttinfo->nr_pages) { + log_debug(1 << CXGBI_DBG_DDP, + "%s: sw tag 0x%x, xfer %u, sgl %u, dma count %d.\n", + __func__, sw_tag, xferlen, sgcnt, err); + } + + ttinfo->flags |= CXGBI_PPOD_INFO_FLAG_MAPPED; + ttinfo->cid = csk->port_id; + + cxgbi_ppm_make_ppod_hdr(ppm, ttinfo->tag, csk->tid, sgl->offset, + xferlen, &ttinfo->hdr); + + if (cdev->flags & CXGBI_FLAG_USE_PPOD_OFLDQ) { + /* write ppod from xmit_pdu (of iscsi_scsi_command pdu) */ + ttinfo->flags |= CXGBI_PPOD_INFO_FLAG_VALID; + } else { + /* write ppod from control queue now */ + err = cdev->csk_ddp_set_map(ppm, csk, ttinfo); + if (err < 0) + goto rel_ppods; + } + + return 0; + +rel_ppods: + cxgbi_ppm_ppod_release(ppm, ttinfo->idx); + + if (ttinfo->flags & CXGBI_PPOD_INFO_FLAG_MAPPED) { + ttinfo->flags &= ~CXGBI_PPOD_INFO_FLAG_MAPPED; + dma_unmap_sg(&ppm->pdev->dev, sgl, sgcnt, DMA_FROM_DEVICE); + } + return -EINVAL; +} + +static void task_release_itt(struct iscsi_task *task, itt_t hdr_itt) +{ + struct scsi_cmnd *sc = task->sc; + struct iscsi_tcp_conn *tcp_conn = task->conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_device *cdev = cconn->chba->cdev; + struct cxgbi_ppm *ppm = cdev->cdev2ppm(cdev); + u32 tag = ntohl((__force u32)hdr_itt); + + log_debug(1 << CXGBI_DBG_DDP, + "cdev 0x%p, task 0x%p, release tag 0x%x.\n", + cdev, task, tag); + if (sc && + (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_FROM_DEVICE) && + cxgbi_ppm_is_ddp_tag(ppm, tag)) { + struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task); + struct cxgbi_task_tag_info *ttinfo = &tdata->ttinfo; + + if (!(cdev->flags & CXGBI_FLAG_USE_PPOD_OFLDQ)) + cdev->csk_ddp_clear_map(cdev, ppm, ttinfo); + cxgbi_ppm_ppod_release(ppm, ttinfo->idx); + dma_unmap_sg(&ppm->pdev->dev, ttinfo->sgl, ttinfo->nents, + DMA_FROM_DEVICE); + } +} + +static inline u32 cxgbi_build_sw_tag(u32 idx, u32 age) +{ + /* assume idx and age both are < 0x7FFF (32767) */ + return (idx << 16) | age; +} + +static int task_reserve_itt(struct iscsi_task *task, itt_t *hdr_itt) +{ + struct scsi_cmnd *sc = task->sc; + struct iscsi_conn *conn = task->conn; + struct iscsi_session *sess = conn->session; + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_device *cdev = cconn->chba->cdev; + struct cxgbi_ppm *ppm = cdev->cdev2ppm(cdev); + u32 sw_tag = 
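+ /* sw_tag packs the libiscsi itt (upper 16 bits) with the session age */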
cxgbi_build_sw_tag(task->itt, sess->age); + u32 tag = 0; + int err = -EINVAL; + + if (sc && + (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_FROM_DEVICE) + ) { + struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task); + struct cxgbi_task_tag_info *ttinfo = &tdata->ttinfo; + + scmd_get_params(sc, &ttinfo->sgl, &ttinfo->nents, + &tdata->dlen, 0); + err = cxgbi_ddp_reserve(cconn, tdata, sw_tag, tdata->dlen); + if (!err) + tag = ttinfo->tag; + else + log_debug(1 << CXGBI_DBG_DDP, + "csk 0x%p, R task 0x%p, %u,%u, no ddp.\n", + cconn->cep->csk, task, tdata->dlen, + ttinfo->nents); + } + + if (err < 0) { + err = cxgbi_ppm_make_non_ddp_tag(ppm, sw_tag, &tag); + if (err < 0) + return err; + } + /* the itt need to sent in big-endian order */ + *hdr_itt = (__force itt_t)htonl(tag); + + log_debug(1 << CXGBI_DBG_DDP, + "cdev 0x%p, task 0x%p, 0x%x(0x%x,0x%x)->0x%x/0x%x.\n", + cdev, task, sw_tag, task->itt, sess->age, tag, *hdr_itt); + return 0; +} + +void cxgbi_parse_pdu_itt(struct iscsi_conn *conn, itt_t itt, int *idx, int *age) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_device *cdev = cconn->chba->cdev; + struct cxgbi_ppm *ppm = cdev->cdev2ppm(cdev); + u32 tag = ntohl((__force u32)itt); + u32 sw_bits; + + if (ppm) { + if (cxgbi_ppm_is_ddp_tag(ppm, tag)) + sw_bits = cxgbi_ppm_get_tag_caller_data(ppm, tag); + else + sw_bits = cxgbi_ppm_decode_non_ddp_tag(ppm, tag); + } else { + sw_bits = tag; + } + + cxgbi_decode_sw_tag(sw_bits, idx, age); + log_debug(1 << CXGBI_DBG_DDP, + "cdev 0x%p, tag 0x%x/0x%x, -> 0x%x(0x%x,0x%x).\n", + cdev, tag, itt, sw_bits, idx ? *idx : 0xFFFFF, + age ? *age : 0xFF); +} +EXPORT_SYMBOL_GPL(cxgbi_parse_pdu_itt); + +void cxgbi_conn_tx_open(struct cxgbi_sock *csk) +{ + struct iscsi_conn *conn = csk->user_data; + + if (conn) { + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p, cid %d.\n", csk, conn->id); + iscsi_conn_queue_work(conn); + } +} +EXPORT_SYMBOL_GPL(cxgbi_conn_tx_open); + +/* + * pdu receive, interact with libiscsi_tcp + */ +static inline int read_pdu_skb(struct iscsi_conn *conn, + struct sk_buff *skb, + unsigned int offset, + int offloaded) +{ + int status = 0; + int bytes_read; + + bytes_read = iscsi_tcp_recv_skb(conn, skb, offset, offloaded, &status); + switch (status) { + case ISCSI_TCP_CONN_ERR: + pr_info("skb 0x%p, off %u, %d, TCP_ERR.\n", + skb, offset, offloaded); + return -EIO; + case ISCSI_TCP_SUSPENDED: + log_debug(1 << CXGBI_DBG_PDU_RX, + "skb 0x%p, off %u, %d, TCP_SUSPEND, rc %d.\n", + skb, offset, offloaded, bytes_read); + /* no transfer - just have caller flush queue */ + return bytes_read; + case ISCSI_TCP_SKB_DONE: + pr_info("skb 0x%p, off %u, %d, TCP_SKB_DONE.\n", + skb, offset, offloaded); + /* + * pdus should always fit in the skb and we should get + * segment done notifcation. 
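+ * Getting SKB_DONE here means the skb ended before the current pdu
+ * segment was complete, so treat the pdu/skb as malformed.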
+ */ + iscsi_conn_printk(KERN_ERR, conn, "Invalid pdu or skb."); + return -EFAULT; + case ISCSI_TCP_SEGMENT_DONE: + log_debug(1 << CXGBI_DBG_PDU_RX, + "skb 0x%p, off %u, %d, TCP_SEG_DONE, rc %d.\n", + skb, offset, offloaded, bytes_read); + return bytes_read; + default: + pr_info("skb 0x%p, off %u, %d, invalid status %d.\n", + skb, offset, offloaded, status); + return -EINVAL; + } +} + +static int +skb_read_pdu_bhs(struct cxgbi_sock *csk, struct iscsi_conn *conn, + struct sk_buff *skb) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + int err; + + log_debug(1 << CXGBI_DBG_PDU_RX, + "conn 0x%p, skb 0x%p, len %u, flag 0x%lx.\n", + conn, skb, skb->len, cxgbi_skcb_flags(skb)); + + if (!iscsi_tcp_recv_segment_is_hdr(tcp_conn)) { + pr_info("conn 0x%p, skb 0x%p, not hdr.\n", conn, skb); + iscsi_conn_failure(conn, ISCSI_ERR_PROTO); + return -EIO; + } + + if (conn->hdrdgst_en && + cxgbi_skcb_test_flag(skb, SKCBF_RX_HCRC_ERR)) { + pr_info("conn 0x%p, skb 0x%p, hcrc.\n", conn, skb); + iscsi_conn_failure(conn, ISCSI_ERR_HDR_DGST); + return -EIO; + } + + if (cxgbi_skcb_test_flag(skb, SKCBF_RX_ISCSI_COMPL) && + cxgbi_skcb_test_flag(skb, SKCBF_RX_DATA_DDPD)) { + /* If completion flag is set and data is directly + * placed in to the host memory then update + * task->exp_datasn to the datasn in completion + * iSCSI hdr as T6 adapter generates completion only + * for the last pdu of a sequence. + */ + itt_t itt = ((struct iscsi_data *)skb->data)->itt; + struct iscsi_task *task = iscsi_itt_to_ctask(conn, itt); + u32 data_sn = be32_to_cpu(((struct iscsi_data *) + skb->data)->datasn); + if (task && task->sc) { + struct iscsi_tcp_task *tcp_task = task->dd_data; + + tcp_task->exp_datasn = data_sn; + } + } + + err = read_pdu_skb(conn, skb, 0, 0); + if (likely(err >= 0)) { + struct iscsi_hdr *hdr = (struct iscsi_hdr *)skb->data; + u8 opcode = hdr->opcode & ISCSI_OPCODE_MASK; + + if (unlikely(opcode == ISCSI_OP_LOGOUT_RSP)) + cxgbi_sock_set_flag(csk, CTPF_LOGOUT_RSP_RCVD); + } + + return err; +} + +static int skb_read_pdu_data(struct iscsi_conn *conn, struct sk_buff *lskb, + struct sk_buff *skb, unsigned int offset) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + bool offloaded = 0; + int opcode = tcp_conn->in.hdr->opcode & ISCSI_OPCODE_MASK; + + log_debug(1 << CXGBI_DBG_PDU_RX, + "conn 0x%p, skb 0x%p, len %u, flag 0x%lx.\n", + conn, skb, skb->len, cxgbi_skcb_flags(skb)); + + if (conn->datadgst_en && + cxgbi_skcb_test_flag(lskb, SKCBF_RX_DCRC_ERR)) { + pr_info("conn 0x%p, skb 0x%p, dcrc 0x%lx.\n", + conn, lskb, cxgbi_skcb_flags(lskb)); + iscsi_conn_failure(conn, ISCSI_ERR_DATA_DGST); + return -EIO; + } + + if (iscsi_tcp_recv_segment_is_hdr(tcp_conn)) + return 0; + + /* coalesced, add header digest length */ + if (lskb == skb && conn->hdrdgst_en) + offset += ISCSI_DIGEST_SIZE; + + if (cxgbi_skcb_test_flag(lskb, SKCBF_RX_DATA_DDPD)) + offloaded = 1; + + if (opcode == ISCSI_OP_SCSI_DATA_IN) + log_debug(1 << CXGBI_DBG_PDU_RX, + "skb 0x%p, op 0x%x, itt 0x%x, %u %s ddp'ed.\n", + skb, opcode, ntohl(tcp_conn->in.hdr->itt), + tcp_conn->in.datalen, offloaded ? 
"is" : "not"); + + return read_pdu_skb(conn, skb, offset, offloaded); +} + +static void csk_return_rx_credits(struct cxgbi_sock *csk, int copied) +{ + struct cxgbi_device *cdev = csk->cdev; + int must_send; + u32 credits; + + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p,%u,0x%lx,%u, seq %u, wup %u, thre %u, %u.\n", + csk, csk->state, csk->flags, csk->tid, csk->copied_seq, + csk->rcv_wup, cdev->rx_credit_thres, + csk->rcv_win); + + if (!cdev->rx_credit_thres) + return; + + if (csk->state != CTP_ESTABLISHED) + return; + + credits = csk->copied_seq - csk->rcv_wup; + if (unlikely(!credits)) + return; + must_send = credits + 16384 >= csk->rcv_win; + if (must_send || credits >= cdev->rx_credit_thres) + csk->rcv_wup += cdev->csk_send_rx_credits(csk, credits); +} + +void cxgbi_conn_pdu_ready(struct cxgbi_sock *csk) +{ + struct cxgbi_device *cdev = csk->cdev; + struct iscsi_conn *conn = csk->user_data; + struct sk_buff *skb; + unsigned int read = 0; + int err = 0; + + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, conn 0x%p.\n", csk, conn); + + if (unlikely(!conn || conn->suspend_rx)) { + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, conn 0x%p, id %d, suspend_rx %lu!\n", + csk, conn, conn ? conn->id : 0xFF, + conn ? conn->suspend_rx : 0xFF); + return; + } + + while (!err) { + skb = skb_peek(&csk->receive_queue); + if (!skb || + !(cxgbi_skcb_test_flag(skb, SKCBF_RX_STATUS))) { + if (skb) + log_debug(1 << CXGBI_DBG_PDU_RX, + "skb 0x%p, NOT ready 0x%lx.\n", + skb, cxgbi_skcb_flags(skb)); + break; + } + __skb_unlink(skb, &csk->receive_queue); + + read += cxgbi_skcb_rx_pdulen(skb); + log_debug(1 << CXGBI_DBG_PDU_RX, + "csk 0x%p, skb 0x%p,%u,f 0x%lx, pdu len %u.\n", + csk, skb, skb->len, cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb)); + + if (cxgbi_skcb_test_flag(skb, SKCBF_RX_COALESCED)) { + err = skb_read_pdu_bhs(csk, conn, skb); + if (err < 0) { + pr_err("coalesced bhs, csk 0x%p, skb 0x%p,%u, " + "f 0x%lx, plen %u.\n", + csk, skb, skb->len, + cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb)); + goto skb_done; + } + err = skb_read_pdu_data(conn, skb, skb, + err + cdev->skb_rx_extra); + if (err < 0) + pr_err("coalesced data, csk 0x%p, skb 0x%p,%u, " + "f 0x%lx, plen %u.\n", + csk, skb, skb->len, + cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb)); + } else { + err = skb_read_pdu_bhs(csk, conn, skb); + if (err < 0) { + pr_err("bhs, csk 0x%p, skb 0x%p,%u, " + "f 0x%lx, plen %u.\n", + csk, skb, skb->len, + cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb)); + goto skb_done; + } + + if (cxgbi_skcb_test_flag(skb, SKCBF_RX_DATA)) { + struct sk_buff *dskb; + + dskb = skb_peek(&csk->receive_queue); + if (!dskb) { + pr_err("csk 0x%p, skb 0x%p,%u, f 0x%lx," + " plen %u, NO data.\n", + csk, skb, skb->len, + cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb)); + err = -EIO; + goto skb_done; + } + __skb_unlink(dskb, &csk->receive_queue); + + err = skb_read_pdu_data(conn, skb, dskb, 0); + if (err < 0) + pr_err("data, csk 0x%p, skb 0x%p,%u, " + "f 0x%lx, plen %u, dskb 0x%p," + "%u.\n", + csk, skb, skb->len, + cxgbi_skcb_flags(skb), + cxgbi_skcb_rx_pdulen(skb), + dskb, dskb->len); + __kfree_skb(dskb); + } else + err = skb_read_pdu_data(conn, skb, skb, 0); + } +skb_done: + __kfree_skb(skb); + + if (err < 0) + break; + } + + log_debug(1 << CXGBI_DBG_PDU_RX, "csk 0x%p, read %u.\n", csk, read); + if (read) { + csk->copied_seq += read; + csk_return_rx_credits(csk, read); + conn->rxdata_octets += read; + } + + if (err < 0) { + pr_info("csk 0x%p, 0x%p, rx failed %d, read %u.\n", + csk, conn, err, read); + 
iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED); + } +} +EXPORT_SYMBOL_GPL(cxgbi_conn_pdu_ready); + +static int sgl_seek_offset(struct scatterlist *sgl, unsigned int sgcnt, + unsigned int offset, unsigned int *off, + struct scatterlist **sgp) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, sgcnt, i) { + if (offset < sg->length) { + *off = offset; + *sgp = sg; + return 0; + } + offset -= sg->length; + } + return -EFAULT; +} + +static int sgl_read_to_frags(struct scatterlist *sg, unsigned int sgoffset, + unsigned int dlen, struct page_frag *frags, + int frag_max) +{ + unsigned int datalen = dlen; + unsigned int sglen = sg->length - sgoffset; + struct page *page = sg_page(sg); + int i; + + i = 0; + do { + unsigned int copy; + + if (!sglen) { + sg = sg_next(sg); + if (!sg) { + pr_warn("sg %d NULL, len %u/%u.\n", + i, datalen, dlen); + return -EINVAL; + } + sgoffset = 0; + sglen = sg->length; + page = sg_page(sg); + + } + copy = min(datalen, sglen); + if (i && page == frags[i - 1].page && + sgoffset + sg->offset == + frags[i - 1].offset + frags[i - 1].size) { + frags[i - 1].size += copy; + } else { + if (i >= frag_max) { + pr_warn("too many pages %u, dlen %u.\n", + frag_max, dlen); + return -EINVAL; + } + + frags[i].page = page; + frags[i].offset = sg->offset + sgoffset; + frags[i].size = copy; + i++; + } + datalen -= copy; + sgoffset += copy; + sglen -= copy; + } while (datalen); + + return i; +} + +int cxgbi_conn_alloc_pdu(struct iscsi_task *task, u8 opcode) +{ + struct iscsi_tcp_conn *tcp_conn = task->conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_device *cdev = cconn->chba->cdev; + struct iscsi_conn *conn = task->conn; + struct iscsi_tcp_task *tcp_task = task->dd_data; + struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task); + struct scsi_cmnd *sc = task->sc; + struct cxgbi_sock *csk = cconn->cep->csk; + struct net_device *ndev = cdev->ports[csk->port_id]; + int headroom = SKB_TX_ISCSI_PDU_HEADER_MAX; + + tcp_task->dd_data = tdata; + task->hdr = NULL; + + if (SKB_MAX_HEAD(cdev->skb_tx_rsvd) > (512 * MAX_SKB_FRAGS) && + (opcode == ISCSI_OP_SCSI_DATA_OUT || + (opcode == ISCSI_OP_SCSI_CMD && + (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_TO_DEVICE)))) + /* data could goes into skb head */ + headroom += min_t(unsigned int, + SKB_MAX_HEAD(cdev->skb_tx_rsvd), + conn->max_xmit_dlength); + + tdata->skb = alloc_skb(cdev->skb_tx_rsvd + headroom, GFP_ATOMIC); + if (!tdata->skb) { + ndev->stats.tx_dropped++; + return -ENOMEM; + } + + skb_reserve(tdata->skb, cdev->skb_tx_rsvd); + + if (task->sc) { + task->hdr = (struct iscsi_hdr *)tdata->skb->data; + } else { + task->hdr = kzalloc(SKB_TX_ISCSI_PDU_HEADER_MAX, GFP_ATOMIC); + if (!task->hdr) { + __kfree_skb(tdata->skb); + tdata->skb = NULL; + ndev->stats.tx_dropped++; + return -ENOMEM; + } + } + task->hdr_max = SKB_TX_ISCSI_PDU_HEADER_MAX; /* BHS + AHS */ + + /* data_out uses scsi_cmd's itt */ + if (opcode != ISCSI_OP_SCSI_DATA_OUT) + task_reserve_itt(task, &task->hdr->itt); + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX, + "task 0x%p, op 0x%x, skb 0x%p,%u+%u/%u, itt 0x%x.\n", + task, opcode, tdata->skb, cdev->skb_tx_rsvd, headroom, + conn->max_xmit_dlength, ntohl(task->hdr->itt)); + + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_conn_alloc_pdu); + +static inline void tx_skb_setmode(struct sk_buff *skb, int hcrc, int dcrc) +{ + if (hcrc || dcrc) { + u8 submode = 0; + + if (hcrc) + submode |= 1; + if (dcrc) + submode |= 2; + cxgbi_skcb_ulp_mode(skb) = (ULP2_MODE_ISCSI << 4) | submode; + } else 
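+ /* neither digest enabled: no ULP submode, hw inserts no digest bytes */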
+ cxgbi_skcb_ulp_mode(skb) = 0; +} + +int cxgbi_conn_init_pdu(struct iscsi_task *task, unsigned int offset, + unsigned int count) +{ + struct iscsi_conn *conn = task->conn; + struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task); + struct sk_buff *skb = tdata->skb; + unsigned int datalen = count; + int i, padlen = iscsi_padding(count); + struct page *pg; + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX, + "task 0x%p,0x%p, skb 0x%p, 0x%x,0x%x,0x%x, %u+%u.\n", + task, task->sc, skb, (*skb->data) & ISCSI_OPCODE_MASK, + ntohl(task->cmdsn), ntohl(task->hdr->itt), offset, count); + + skb_put(skb, task->hdr_len); + tx_skb_setmode(skb, conn->hdrdgst_en, datalen ? conn->datadgst_en : 0); + if (!count) + return 0; + + if (task->sc) { + struct scsi_data_buffer *sdb = scsi_out(task->sc); + struct scatterlist *sg = NULL; + int err; + + tdata->offset = offset; + tdata->count = count; + err = sgl_seek_offset( + sdb->table.sgl, sdb->table.nents, + tdata->offset, &tdata->sgoffset, &sg); + if (err < 0) { + pr_warn("tpdu, sgl %u, bad offset %u/%u.\n", + sdb->table.nents, tdata->offset, sdb->length); + return err; + } + err = sgl_read_to_frags(sg, tdata->sgoffset, tdata->count, + tdata->frags, MAX_PDU_FRAGS); + if (err < 0) { + pr_warn("tpdu, sgl %u, bad offset %u + %u.\n", + sdb->table.nents, tdata->offset, tdata->count); + return err; + } + tdata->nr_frags = err; + + if (tdata->nr_frags > MAX_SKB_FRAGS || + (padlen && tdata->nr_frags == MAX_SKB_FRAGS)) { + char *dst = skb->data + task->hdr_len; + struct page_frag *frag = tdata->frags; + + /* data fits in the skb's headroom */ + for (i = 0; i < tdata->nr_frags; i++, frag++) { + char *src = kmap_atomic(frag->page); + + memcpy(dst, src+frag->offset, frag->size); + dst += frag->size; + kunmap_atomic(src); + } + if (padlen) { + memset(dst, 0, padlen); + padlen = 0; + } + skb_put(skb, count + padlen); + } else { + /* data fit into frag_list */ + for (i = 0; i < tdata->nr_frags; i++) { + __skb_fill_page_desc(skb, i, + tdata->frags[i].page, + tdata->frags[i].offset, + tdata->frags[i].size); + skb_frag_ref(skb, i); + } + skb_shinfo(skb)->nr_frags = tdata->nr_frags; + skb->len += count; + skb->data_len += count; + skb->truesize += count; + } + + } else { + pg = virt_to_page(task->data); + + get_page(pg); + skb_fill_page_desc(skb, 0, pg, offset_in_page(task->data), + count); + skb->len += count; + skb->data_len += count; + skb->truesize += count; + } + + if (padlen) { + i = skb_shinfo(skb)->nr_frags; + skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, + virt_to_page(padding), offset_in_page(padding), + padlen); + + skb->data_len += padlen; + skb->truesize += padlen; + skb->len += padlen; + } + + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_conn_init_pdu); + +int cxgbi_conn_xmit_pdu(struct iscsi_task *task) +{ + struct iscsi_tcp_conn *tcp_conn = task->conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task); + struct cxgbi_task_tag_info *ttinfo = &tdata->ttinfo; + struct sk_buff *skb = tdata->skb; + struct cxgbi_sock *csk = NULL; + unsigned int datalen; + int err; + + if (!skb) { + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX, + "task 0x%p\n", task); + return 0; + } + + if (cconn && cconn->cep) + csk = cconn->cep->csk; + if (!csk) { + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX, + "task 0x%p, csk gone.\n", task); + return -EPIPE; + } + + tdata->skb = NULL; + datalen = skb->data_len; + + /* write ppod first if using ofldq to write ppod */ + if (ttinfo->flags & 
CXGBI_PPOD_INFO_FLAG_VALID) { + struct cxgbi_ppm *ppm = csk->cdev->cdev2ppm(csk->cdev); + + ttinfo->flags &= ~CXGBI_PPOD_INFO_FLAG_VALID; + if (csk->cdev->csk_ddp_set_map(ppm, csk, ttinfo) < 0) + pr_err("task 0x%p, ppod writing using ofldq failed.\n", + task); + /* continue. Let fl get the data */ + } + + if (!task->sc) + memcpy(skb->data, task->hdr, SKB_TX_ISCSI_PDU_HEADER_MAX); + + err = cxgbi_sock_send_pdus(cconn->cep->csk, skb); + if (err > 0) { + int pdulen = err; + + log_debug(1 << CXGBI_DBG_PDU_TX, + "task 0x%p,0x%p, skb 0x%p, len %u/%u, rv %d.\n", + task, task->sc, skb, skb->len, skb->data_len, err); + + if (task->conn->hdrdgst_en) + pdulen += ISCSI_DIGEST_SIZE; + + if (datalen && task->conn->datadgst_en) + pdulen += ISCSI_DIGEST_SIZE; + + task->conn->txdata_octets += pdulen; + return 0; + } + + if (err == -EAGAIN || err == -ENOBUFS) { + log_debug(1 << CXGBI_DBG_PDU_TX, + "task 0x%p, skb 0x%p, len %u/%u, %d EAGAIN.\n", + task, skb, skb->len, skb->data_len, err); + /* reset skb to send when we are called again */ + tdata->skb = skb; + return err; + } + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX, + "itt 0x%x, skb 0x%p, len %u/%u, xmit err %d.\n", + task->itt, skb, skb->len, skb->data_len, err); + + __kfree_skb(skb); + + iscsi_conn_printk(KERN_ERR, task->conn, "xmit err %d.\n", err); + iscsi_conn_failure(task->conn, ISCSI_ERR_XMIT_FAILED); + return err; +} +EXPORT_SYMBOL_GPL(cxgbi_conn_xmit_pdu); + +void cxgbi_cleanup_task(struct iscsi_task *task) +{ + struct iscsi_tcp_task *tcp_task = task->dd_data; + struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task); + + if (!tcp_task || !tdata || (tcp_task->dd_data != tdata)) { + pr_info("task 0x%p,0x%p, tcp_task 0x%p, tdata 0x%p/0x%p.\n", + task, task->sc, tcp_task, + tcp_task ? tcp_task->dd_data : NULL, tdata); + return; + } + + log_debug(1 << CXGBI_DBG_ISCSI, + "task 0x%p, skb 0x%p, itt 0x%x.\n", + task, tdata->skb, task->hdr_itt); + + tcp_task->dd_data = NULL; + + if (!task->sc) + kfree(task->hdr); + task->hdr = NULL; + + /* never reached the xmit task callout */ + if (tdata->skb) { + __kfree_skb(tdata->skb); + tdata->skb = NULL; + } + + task_release_itt(task, task->hdr_itt); + memset(tdata, 0, sizeof(*tdata)); + + iscsi_tcp_cleanup_task(task); +} +EXPORT_SYMBOL_GPL(cxgbi_cleanup_task); + +void cxgbi_get_conn_stats(struct iscsi_cls_conn *cls_conn, + struct iscsi_stats *stats) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + + stats->txdata_octets = conn->txdata_octets; + stats->rxdata_octets = conn->rxdata_octets; + stats->scsicmd_pdus = conn->scsicmd_pdus_cnt; + stats->dataout_pdus = conn->dataout_pdus_cnt; + stats->scsirsp_pdus = conn->scsirsp_pdus_cnt; + stats->datain_pdus = conn->datain_pdus_cnt; + stats->r2t_pdus = conn->r2t_pdus_cnt; + stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; + stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; + stats->digest_err = 0; + stats->timeout_err = 0; + stats->custom_length = 1; + strcpy(stats->custom[0].desc, "eh_abort_cnt"); + stats->custom[0].value = conn->eh_abort_cnt; +} +EXPORT_SYMBOL_GPL(cxgbi_get_conn_stats); + +static int cxgbi_conn_max_xmit_dlength(struct iscsi_conn *conn) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_device *cdev = cconn->chba->cdev; + unsigned int headroom = SKB_MAX_HEAD(cdev->skb_tx_rsvd); + unsigned int max_def = 512 * MAX_SKB_FRAGS; + unsigned int max = max(max_def, headroom); + + max = min(cconn->chba->cdev->tx_max_size, max); + if (conn->max_xmit_dlength) + conn->max_xmit_dlength = 
min(conn->max_xmit_dlength, max); + else + conn->max_xmit_dlength = max; + cxgbi_align_pdu_size(conn->max_xmit_dlength); + + return 0; +} + +static int cxgbi_conn_max_recv_dlength(struct iscsi_conn *conn) +{ + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + unsigned int max = cconn->chba->cdev->rx_max_size; + + cxgbi_align_pdu_size(max); + + if (conn->max_recv_dlength) { + if (conn->max_recv_dlength > max) { + pr_err("MaxRecvDataSegmentLength %u > %u.\n", + conn->max_recv_dlength, max); + return -EINVAL; + } + conn->max_recv_dlength = min(conn->max_recv_dlength, max); + cxgbi_align_pdu_size(conn->max_recv_dlength); + } else + conn->max_recv_dlength = max; + + return 0; +} + +int cxgbi_set_conn_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, char *buf, int buflen) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_sock *csk = cconn->cep->csk; + int err; + + log_debug(1 << CXGBI_DBG_ISCSI, + "cls_conn 0x%p, param %d, buf(%d) %s.\n", + cls_conn, param, buflen, buf); + + switch (param) { + case ISCSI_PARAM_HDRDGST_EN: + err = iscsi_set_param(cls_conn, param, buf, buflen); + if (!err && conn->hdrdgst_en) + err = csk->cdev->csk_ddp_setup_digest(csk, csk->tid, + conn->hdrdgst_en, + conn->datadgst_en, 0); + break; + case ISCSI_PARAM_DATADGST_EN: + err = iscsi_set_param(cls_conn, param, buf, buflen); + if (!err && conn->datadgst_en) + err = csk->cdev->csk_ddp_setup_digest(csk, csk->tid, + conn->hdrdgst_en, + conn->datadgst_en, 0); + break; + case ISCSI_PARAM_MAX_R2T: + return iscsi_tcp_set_max_r2t(conn, buf); + case ISCSI_PARAM_MAX_RECV_DLENGTH: + err = iscsi_set_param(cls_conn, param, buf, buflen); + if (!err) + err = cxgbi_conn_max_recv_dlength(conn); + break; + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + err = iscsi_set_param(cls_conn, param, buf, buflen); + if (!err) + err = cxgbi_conn_max_xmit_dlength(conn); + break; + default: + return iscsi_set_param(cls_conn, param, buf, buflen); + } + return err; +} +EXPORT_SYMBOL_GPL(cxgbi_set_conn_param); + +static inline int csk_print_port(struct cxgbi_sock *csk, char *buf) +{ + int len; + + cxgbi_sock_get(csk); + len = sprintf(buf, "%hu\n", ntohs(csk->daddr.sin_port)); + cxgbi_sock_put(csk); + + return len; +} + +static inline int csk_print_ip(struct cxgbi_sock *csk, char *buf) +{ + int len; + + cxgbi_sock_get(csk); + if (csk->csk_family == AF_INET) + len = sprintf(buf, "%pI4", + &csk->daddr.sin_addr.s_addr); + else + len = sprintf(buf, "%pI6", + &csk->daddr6.sin6_addr); + + cxgbi_sock_put(csk); + + return len; +} + +int cxgbi_get_ep_param(struct iscsi_endpoint *ep, enum iscsi_param param, + char *buf) +{ + struct cxgbi_endpoint *cep = ep->dd_data; + struct cxgbi_sock *csk; + int len; + + log_debug(1 << CXGBI_DBG_ISCSI, + "cls_conn 0x%p, param %d.\n", ep, param); + + switch (param) { + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_CONN_ADDRESS: + if (!cep) + return -ENOTCONN; + + csk = cep->csk; + if (!csk) + return -ENOTCONN; + + return iscsi_conn_get_addr_param((struct sockaddr_storage *) + &csk->daddr, param, buf); + default: + return -ENOSYS; + } + return len; +} +EXPORT_SYMBOL_GPL(cxgbi_get_ep_param); + +struct iscsi_cls_conn * +cxgbi_create_conn(struct iscsi_cls_session *cls_session, u32 cid) +{ + struct iscsi_cls_conn *cls_conn; + struct iscsi_conn *conn; + struct iscsi_tcp_conn *tcp_conn; + struct cxgbi_conn *cconn; + + cls_conn = iscsi_tcp_conn_setup(cls_session, 
sizeof(*cconn), cid); + if (!cls_conn) + return NULL; + + conn = cls_conn->dd_data; + tcp_conn = conn->dd_data; + cconn = tcp_conn->dd_data; + cconn->iconn = conn; + + log_debug(1 << CXGBI_DBG_ISCSI, + "cid %u(0x%x), cls 0x%p,0x%p, conn 0x%p,0x%p,0x%p.\n", + cid, cid, cls_session, cls_conn, conn, tcp_conn, cconn); + + return cls_conn; +} +EXPORT_SYMBOL_GPL(cxgbi_create_conn); + +int cxgbi_bind_conn(struct iscsi_cls_session *cls_session, + struct iscsi_cls_conn *cls_conn, + u64 transport_eph, int is_leading) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iscsi_tcp_conn *tcp_conn = conn->dd_data; + struct cxgbi_conn *cconn = tcp_conn->dd_data; + struct cxgbi_ppm *ppm; + struct iscsi_endpoint *ep; + struct cxgbi_endpoint *cep; + struct cxgbi_sock *csk; + int err; + + ep = iscsi_lookup_endpoint(transport_eph); + if (!ep) + return -EINVAL; + + /* setup ddp pagesize */ + cep = ep->dd_data; + csk = cep->csk; + + ppm = csk->cdev->cdev2ppm(csk->cdev); + err = csk->cdev->csk_ddp_setup_pgidx(csk, csk->tid, + ppm->tformat.pgsz_idx_dflt, 0); + if (err < 0) + return err; + + err = iscsi_conn_bind(cls_session, cls_conn, is_leading); + if (err) + return -EINVAL; + + /* calculate the tag idx bits needed for this conn based on cmds_max */ + cconn->task_idx_bits = (__ilog2_u32(conn->session->cmds_max - 1)) + 1; + + write_lock_bh(&csk->callback_lock); + csk->user_data = conn; + cconn->chba = cep->chba; + cconn->cep = cep; + cep->cconn = cconn; + write_unlock_bh(&csk->callback_lock); + + cxgbi_conn_max_xmit_dlength(conn); + cxgbi_conn_max_recv_dlength(conn); + + log_debug(1 << CXGBI_DBG_ISCSI, + "cls 0x%p,0x%p, ep 0x%p, cconn 0x%p, csk 0x%p.\n", + cls_session, cls_conn, ep, cconn, csk); + /* init recv engine */ + iscsi_tcp_hdr_recv_prep(tcp_conn); + + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_bind_conn); + +struct iscsi_cls_session *cxgbi_create_session(struct iscsi_endpoint *ep, + u16 cmds_max, u16 qdepth, + u32 initial_cmdsn) +{ + struct cxgbi_endpoint *cep; + struct cxgbi_hba *chba; + struct Scsi_Host *shost; + struct iscsi_cls_session *cls_session; + struct iscsi_session *session; + + if (!ep) { + pr_err("missing endpoint.\n"); + return NULL; + } + + cep = ep->dd_data; + chba = cep->chba; + shost = chba->shost; + + BUG_ON(chba != iscsi_host_priv(shost)); + + cls_session = iscsi_session_setup(chba->cdev->itp, shost, + cmds_max, 0, + sizeof(struct iscsi_tcp_task) + + sizeof(struct cxgbi_task_data), + initial_cmdsn, ISCSI_MAX_TARGET); + if (!cls_session) + return NULL; + + session = cls_session->dd_data; + if (iscsi_tcp_r2tpool_alloc(session)) + goto remove_session; + + log_debug(1 << CXGBI_DBG_ISCSI, + "ep 0x%p, cls sess 0x%p.\n", ep, cls_session); + return cls_session; + +remove_session: + iscsi_session_teardown(cls_session); + return NULL; +} +EXPORT_SYMBOL_GPL(cxgbi_create_session); + +void cxgbi_destroy_session(struct iscsi_cls_session *cls_session) +{ + log_debug(1 << CXGBI_DBG_ISCSI, + "cls sess 0x%p.\n", cls_session); + + iscsi_tcp_r2tpool_free(cls_session->dd_data); + iscsi_session_teardown(cls_session); +} +EXPORT_SYMBOL_GPL(cxgbi_destroy_session); + +int cxgbi_set_host_param(struct Scsi_Host *shost, enum iscsi_host_param param, + char *buf, int buflen) +{ + struct cxgbi_hba *chba = iscsi_host_priv(shost); + + if (!chba->ndev) { + shost_printk(KERN_ERR, shost, "Could not get host param. 
" + "netdev for host not set.\n"); + return -ENODEV; + } + + log_debug(1 << CXGBI_DBG_ISCSI, + "shost 0x%p, hba 0x%p,%s, param %d, buf(%d) %s.\n", + shost, chba, chba->ndev->name, param, buflen, buf); + + switch (param) { + case ISCSI_HOST_PARAM_IPADDRESS: + { + __be32 addr = in_aton(buf); + log_debug(1 << CXGBI_DBG_ISCSI, + "hba %s, req. ipv4 %pI4.\n", chba->ndev->name, &addr); + cxgbi_set_iscsi_ipv4(chba, addr); + return 0; + } + case ISCSI_HOST_PARAM_HWADDRESS: + case ISCSI_HOST_PARAM_NETDEV_NAME: + return 0; + default: + return iscsi_host_set_param(shost, param, buf, buflen); + } +} +EXPORT_SYMBOL_GPL(cxgbi_set_host_param); + +int cxgbi_get_host_param(struct Scsi_Host *shost, enum iscsi_host_param param, + char *buf) +{ + struct cxgbi_hba *chba = iscsi_host_priv(shost); + int len = 0; + + if (!chba->ndev) { + shost_printk(KERN_ERR, shost, "Could not get host param. " + "netdev for host not set.\n"); + return -ENODEV; + } + + log_debug(1 << CXGBI_DBG_ISCSI, + "shost 0x%p, hba 0x%p,%s, param %d.\n", + shost, chba, chba->ndev->name, param); + + switch (param) { + case ISCSI_HOST_PARAM_HWADDRESS: + len = sysfs_format_mac(buf, chba->ndev->dev_addr, 6); + break; + case ISCSI_HOST_PARAM_NETDEV_NAME: + len = sprintf(buf, "%s\n", chba->ndev->name); + break; + case ISCSI_HOST_PARAM_IPADDRESS: + { + struct cxgbi_sock *csk = find_sock_on_port(chba->cdev, + chba->port_id); + if (csk) { + len = sprintf(buf, "%pIS", + (struct sockaddr *)&csk->saddr); + } + log_debug(1 << CXGBI_DBG_ISCSI, + "hba %s, addr %s.\n", chba->ndev->name, buf); + break; + } + default: + return iscsi_host_get_param(shost, param, buf); + } + + return len; +} +EXPORT_SYMBOL_GPL(cxgbi_get_host_param); + +struct iscsi_endpoint *cxgbi_ep_connect(struct Scsi_Host *shost, + struct sockaddr *dst_addr, + int non_blocking) +{ + struct iscsi_endpoint *ep; + struct cxgbi_endpoint *cep; + struct cxgbi_hba *hba = NULL; + struct cxgbi_sock *csk; + int ifindex = 0; + int err = -EINVAL; + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_SOCK, + "shost 0x%p, non_blocking %d, dst_addr 0x%p.\n", + shost, non_blocking, dst_addr); + + if (shost) { + hba = iscsi_host_priv(shost); + if (!hba) { + pr_info("shost 0x%p, priv NULL.\n", shost); + goto err_out; + } + + rtnl_lock(); + if (!vlan_uses_dev(hba->ndev)) + ifindex = hba->ndev->ifindex; + rtnl_unlock(); + } + + if (dst_addr->sa_family == AF_INET) { + csk = cxgbi_check_route(dst_addr, ifindex); +#if IS_ENABLED(CONFIG_IPV6) + } else if (dst_addr->sa_family == AF_INET6) { + csk = cxgbi_check_route6(dst_addr, ifindex); +#endif + } else { + pr_info("address family 0x%x NOT supported.\n", + dst_addr->sa_family); + err = -EAFNOSUPPORT; + return (struct iscsi_endpoint *)ERR_PTR(err); + } + + if (IS_ERR(csk)) + return (struct iscsi_endpoint *)csk; + cxgbi_sock_get(csk); + + if (!hba) + hba = csk->cdev->hbas[csk->port_id]; + else if (hba != csk->cdev->hbas[csk->port_id]) { + pr_info("Could not connect through requested host %u" + "hba 0x%p != 0x%p (%u).\n", + shost->host_no, hba, + csk->cdev->hbas[csk->port_id], csk->port_id); + err = -ENOSPC; + goto release_conn; + } + + err = sock_get_port(csk); + if (err) + goto release_conn; + + cxgbi_sock_set_state(csk, CTP_CONNECTING); + err = csk->cdev->csk_init_act_open(csk); + if (err) + goto release_conn; + + if (cxgbi_sock_is_closing(csk)) { + err = -ENOSPC; + pr_info("csk 0x%p is closing.\n", csk); + goto release_conn; + } + + ep = iscsi_create_endpoint(sizeof(*cep)); + if (!ep) { + err = -ENOMEM; + pr_info("iscsi alloc ep, OOM.\n"); + goto release_conn; + } + + 
cep = ep->dd_data; + cep->csk = csk; + cep->chba = hba; + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_SOCK, + "ep 0x%p, cep 0x%p, csk 0x%p, hba 0x%p,%s.\n", + ep, cep, csk, hba, hba->ndev->name); + return ep; + +release_conn: + cxgbi_sock_put(csk); + cxgbi_sock_closed(csk); +err_out: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(cxgbi_ep_connect); + +int cxgbi_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) +{ + struct cxgbi_endpoint *cep = ep->dd_data; + struct cxgbi_sock *csk = cep->csk; + + if (!cxgbi_sock_is_established(csk)) + return 0; + return 1; +} +EXPORT_SYMBOL_GPL(cxgbi_ep_poll); + +void cxgbi_ep_disconnect(struct iscsi_endpoint *ep) +{ + struct cxgbi_endpoint *cep = ep->dd_data; + struct cxgbi_conn *cconn = cep->cconn; + struct cxgbi_sock *csk = cep->csk; + + log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_SOCK, + "ep 0x%p, cep 0x%p, cconn 0x%p, csk 0x%p,%u,0x%lx.\n", + ep, cep, cconn, csk, csk->state, csk->flags); + + if (cconn && cconn->iconn) { + iscsi_suspend_tx(cconn->iconn); + write_lock_bh(&csk->callback_lock); + cep->csk->user_data = NULL; + cconn->cep = NULL; + write_unlock_bh(&csk->callback_lock); + } + iscsi_destroy_endpoint(ep); + + if (likely(csk->state >= CTP_ESTABLISHED)) + need_active_close(csk); + else + cxgbi_sock_closed(csk); + + cxgbi_sock_put(csk); +} +EXPORT_SYMBOL_GPL(cxgbi_ep_disconnect); + +int cxgbi_iscsi_init(struct iscsi_transport *itp, + struct scsi_transport_template **stt) +{ + *stt = iscsi_register_transport(itp); + if (*stt == NULL) { + pr_err("unable to register %s transport 0x%p.\n", + itp->name, itp); + return -ENODEV; + } + log_debug(1 << CXGBI_DBG_ISCSI, + "%s, registered iscsi transport 0x%p.\n", + itp->name, stt); + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_iscsi_init); + +void cxgbi_iscsi_cleanup(struct iscsi_transport *itp, + struct scsi_transport_template **stt) +{ + if (*stt) { + log_debug(1 << CXGBI_DBG_ISCSI, + "de-register transport 0x%p, %s, stt 0x%p.\n", + itp, itp->name, *stt); + *stt = NULL; + iscsi_unregister_transport(itp); + } +} +EXPORT_SYMBOL_GPL(cxgbi_iscsi_cleanup); + +umode_t cxgbi_attr_is_visible(int param_type, int param) +{ + switch (param_type) { + case ISCSI_HOST_PARAM: + switch (param) { + case ISCSI_HOST_PARAM_NETDEV_NAME: + case ISCSI_HOST_PARAM_HWADDRESS: + case ISCSI_HOST_PARAM_IPADDRESS: + case ISCSI_HOST_PARAM_INITIATOR_NAME: + return S_IRUGO; + default: + return 0; + } + case ISCSI_PARAM: + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + case ISCSI_PARAM_HDRDGST_EN: + case ISCSI_PARAM_DATADGST_EN: + case ISCSI_PARAM_CONN_ADDRESS: + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_EXP_STATSN: + case ISCSI_PARAM_PERSISTENT_ADDRESS: + case ISCSI_PARAM_PERSISTENT_PORT: + case ISCSI_PARAM_PING_TMO: + case ISCSI_PARAM_RECV_TMO: + case ISCSI_PARAM_INITIAL_R2T_EN: + case ISCSI_PARAM_MAX_R2T: + case ISCSI_PARAM_IMM_DATA_EN: + case ISCSI_PARAM_FIRST_BURST: + case ISCSI_PARAM_MAX_BURST: + case ISCSI_PARAM_PDU_INORDER_EN: + case ISCSI_PARAM_DATASEQ_INORDER_EN: + case ISCSI_PARAM_ERL: + case ISCSI_PARAM_TARGET_NAME: + case ISCSI_PARAM_TPGT: + case ISCSI_PARAM_USERNAME: + case ISCSI_PARAM_PASSWORD: + case ISCSI_PARAM_USERNAME_IN: + case ISCSI_PARAM_PASSWORD_IN: + case ISCSI_PARAM_FAST_ABORT: + case ISCSI_PARAM_ABORT_TMO: + case ISCSI_PARAM_LU_RESET_TMO: + case ISCSI_PARAM_TGT_RESET_TMO: + case ISCSI_PARAM_IFACE_NAME: + case ISCSI_PARAM_INITIATOR_NAME: + return S_IRUGO; + default: + return 0; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(cxgbi_attr_is_visible); + +static int 
__init libcxgbi_init_module(void) +{ + pr_info("%s", version); + + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, cb) < + sizeof(struct cxgbi_skb_cb)); + return 0; +} + +static void __exit libcxgbi_exit_module(void) +{ + cxgbi_device_unregister_all(0xFF); + return; +} + +module_init(libcxgbi_init_module); +module_exit(libcxgbi_exit_module); diff --git a/drivers/scsi/cxgbi/libcxgbi.h b/drivers/scsi/cxgbi/libcxgbi.h new file mode 100644 index 0000000..dcb190e --- /dev/null +++ b/drivers/scsi/cxgbi/libcxgbi.h @@ -0,0 +1,620 @@ +/* + * libcxgbi.h: Chelsio common library for T3/T4 iSCSI driver. + * + * Copyright (c) 2010-2015 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + * + * Written by: Karen Xie (kxie@chelsio.com) + * Written by: Rakesh Ranjan (rranjan@chelsio.com) + */ + +#ifndef __LIBCXGBI_H__ +#define __LIBCXGBI_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +enum cxgbi_dbg_flag { + CXGBI_DBG_ISCSI, + CXGBI_DBG_DDP, + CXGBI_DBG_TOE, + CXGBI_DBG_SOCK, + + CXGBI_DBG_PDU_TX, + CXGBI_DBG_PDU_RX, + CXGBI_DBG_DEV, +}; + +#define log_debug(level, fmt, ...) \ + do { \ + if (dbg_level & (level)) \ + pr_info(fmt, ##__VA_ARGS__); \ + } while (0) + +#define pr_info_ipaddr(fmt_trail, \ + addr1, addr2, args_trail...) \ +do { \ + if (!((1 << CXGBI_DBG_SOCK) & dbg_level)) \ + break; \ + pr_info("%pISpc - %pISpc, " fmt_trail, \ + addr1, addr2, args_trail); \ +} while (0) + +/* max. connections per adapter */ +#define CXGBI_MAX_CONN 16384 + +/* always allocate rooms for AHS */ +#define SKB_TX_ISCSI_PDU_HEADER_MAX \ + (sizeof(struct iscsi_hdr) + ISCSI_MAX_AHS_SIZE) + +#define ISCSI_PDU_NONPAYLOAD_LEN 312 /* bhs(48) + ahs(256) + digest(8)*/ + +/* + * align pdu size to multiple of 512 for better performance + */ +#define cxgbi_align_pdu_size(n) do { n = (n) & (~511); } while (0) + +#define ULP2_MODE_ISCSI 2 + +#define ULP2_MAX_PKT_SIZE 16224 +#define ULP2_MAX_PDU_PAYLOAD \ + (ULP2_MAX_PKT_SIZE - ISCSI_PDU_NONPAYLOAD_LEN) + +/* + * For iscsi connections HW may inserts digest bytes into the pdu. Those digest + * bytes are not sent by the host but are part of the TCP payload and therefore + * consume TCP sequence space. + */ +static const unsigned int ulp2_extra_len[] = { 0, 4, 4, 8 }; +static inline unsigned int cxgbi_ulp_extra_len(int submode) +{ + return ulp2_extra_len[submode & 3]; +} + +#define CPL_RX_DDP_STATUS_DDP_SHIFT 16 /* ddp'able */ +#define CPL_RX_DDP_STATUS_PAD_SHIFT 19 /* pad error */ +#define CPL_RX_DDP_STATUS_HCRC_SHIFT 20 /* hcrc error */ +#define CPL_RX_DDP_STATUS_DCRC_SHIFT 21 /* dcrc error */ + +/* + * sge_opaque_hdr - + * Opaque version of structure the SGE stores at skb->head of TX_DATA packets + * and for which we must reserve space. 
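+ * Nothing in this library reads the fields below; only sizeof() is used
+ * when accounting for the TX headroom the hardware consumes.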
+ */ +struct sge_opaque_hdr { + void *dev; + dma_addr_t addr[MAX_SKB_FRAGS + 1]; +}; + +struct cxgbi_sock { + struct cxgbi_device *cdev; + + int tid; + int atid; + unsigned long flags; + unsigned int mtu; + unsigned short rss_qid; + unsigned short txq_idx; + unsigned short advmss; + unsigned int tx_chan; + unsigned int rx_chan; + unsigned int mss_idx; + unsigned int smac_idx; + unsigned char port_id; + int wr_max_cred; + int wr_cred; + int wr_una_cred; + unsigned char hcrc_len; + unsigned char dcrc_len; + + void *l2t; + struct sk_buff *wr_pending_head; + struct sk_buff *wr_pending_tail; + struct sk_buff *cpl_close; + struct sk_buff *cpl_abort_req; + struct sk_buff *cpl_abort_rpl; + struct sk_buff *skb_ulp_lhdr; + spinlock_t lock; + struct kref refcnt; + unsigned int state; + unsigned int csk_family; + union { + struct sockaddr_in saddr; + struct sockaddr_in6 saddr6; + }; + union { + struct sockaddr_in daddr; + struct sockaddr_in6 daddr6; + }; + struct dst_entry *dst; + struct sk_buff_head receive_queue; + struct sk_buff_head write_queue; + struct timer_list retry_timer; + int err; + rwlock_t callback_lock; + void *user_data; + + u32 rcv_nxt; + u32 copied_seq; + u32 rcv_wup; + u32 snd_nxt; + u32 snd_una; + u32 write_seq; + u32 snd_win; + u32 rcv_win; +}; + +/* + * connection states + */ +enum cxgbi_sock_states{ + CTP_CLOSED, + CTP_CONNECTING, + CTP_ACTIVE_OPEN, + CTP_ESTABLISHED, + CTP_ACTIVE_CLOSE, + CTP_PASSIVE_CLOSE, + CTP_CLOSE_WAIT_1, + CTP_CLOSE_WAIT_2, + CTP_ABORTING, +}; + +/* + * Connection flags -- many to track some close related events. + */ +enum cxgbi_sock_flags { + CTPF_ABORT_RPL_RCVD, /*received one ABORT_RPL_RSS message */ + CTPF_ABORT_REQ_RCVD, /*received one ABORT_REQ_RSS message */ + CTPF_ABORT_RPL_PENDING, /* expecting an abort reply */ + CTPF_TX_DATA_SENT, /* already sent a TX_DATA WR */ + CTPF_ACTIVE_CLOSE_NEEDED,/* need to be closed */ + CTPF_HAS_ATID, /* reserved atid */ + CTPF_HAS_TID, /* reserved hw tid */ + CTPF_OFFLOAD_DOWN, /* offload function off */ + CTPF_LOGOUT_RSP_RCVD, /* received logout response */ +}; + +struct cxgbi_skb_rx_cb { + __u32 ddigest; + __u32 pdulen; +}; + +struct cxgbi_skb_tx_cb { + void *handle; + void *arp_err_handler; + struct sk_buff *wr_next; +}; + +enum cxgbi_skcb_flags { + SKCBF_TX_NEED_HDR, /* packet needs a header */ + SKCBF_TX_MEM_WRITE, /* memory write */ + SKCBF_TX_FLAG_COMPL, /* wr completion flag */ + SKCBF_RX_COALESCED, /* received whole pdu */ + SKCBF_RX_HDR, /* received pdu header */ + SKCBF_RX_DATA, /* received pdu payload */ + SKCBF_RX_STATUS, /* received ddp status */ + SKCBF_RX_ISCSI_COMPL, /* received iscsi completion */ + SKCBF_RX_DATA_DDPD, /* pdu payload ddp'd */ + SKCBF_RX_HCRC_ERR, /* header digest error */ + SKCBF_RX_DCRC_ERR, /* data digest error */ + SKCBF_RX_PAD_ERR, /* padding byte error */ +}; + +struct cxgbi_skb_cb { + union { + struct cxgbi_skb_rx_cb rx; + struct cxgbi_skb_tx_cb tx; + }; + unsigned char ulp_mode; + unsigned long flags; + unsigned int seq; +}; + +#define CXGBI_SKB_CB(skb) ((struct cxgbi_skb_cb *)&((skb)->cb[0])) +#define cxgbi_skcb_flags(skb) (CXGBI_SKB_CB(skb)->flags) +#define cxgbi_skcb_ulp_mode(skb) (CXGBI_SKB_CB(skb)->ulp_mode) +#define cxgbi_skcb_tcp_seq(skb) (CXGBI_SKB_CB(skb)->seq) +#define cxgbi_skcb_rx_ddigest(skb) (CXGBI_SKB_CB(skb)->rx.ddigest) +#define cxgbi_skcb_rx_pdulen(skb) (CXGBI_SKB_CB(skb)->rx.pdulen) +#define cxgbi_skcb_tx_wr_next(skb) (CXGBI_SKB_CB(skb)->tx.wr_next) + +static inline void cxgbi_skcb_set_flag(struct sk_buff *skb, + enum cxgbi_skcb_flags flag) +{ + 
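+	/* non-atomic bitop: the skb control block is only touched by its owner */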
__set_bit(flag, &(cxgbi_skcb_flags(skb))); +} + +static inline void cxgbi_skcb_clear_flag(struct sk_buff *skb, + enum cxgbi_skcb_flags flag) +{ + __clear_bit(flag, &(cxgbi_skcb_flags(skb))); +} + +static inline int cxgbi_skcb_test_flag(const struct sk_buff *skb, + enum cxgbi_skcb_flags flag) +{ + return test_bit(flag, &(cxgbi_skcb_flags(skb))); +} + +static inline void cxgbi_sock_set_flag(struct cxgbi_sock *csk, + enum cxgbi_sock_flags flag) +{ + __set_bit(flag, &csk->flags); + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, bit %d.\n", + csk, csk->state, csk->flags, flag); +} + +static inline void cxgbi_sock_clear_flag(struct cxgbi_sock *csk, + enum cxgbi_sock_flags flag) +{ + __clear_bit(flag, &csk->flags); + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, bit %d.\n", + csk, csk->state, csk->flags, flag); +} + +static inline int cxgbi_sock_flag(struct cxgbi_sock *csk, + enum cxgbi_sock_flags flag) +{ + if (csk == NULL) + return 0; + return test_bit(flag, &csk->flags); +} + +static inline void cxgbi_sock_set_state(struct cxgbi_sock *csk, int state) +{ + log_debug(1 << CXGBI_DBG_SOCK, + "csk 0x%p,%u,0x%lx, state -> %u.\n", + csk, csk->state, csk->flags, state); + csk->state = state; +} + +static inline void cxgbi_sock_free(struct kref *kref) +{ + struct cxgbi_sock *csk = container_of(kref, + struct cxgbi_sock, + refcnt); + if (csk) { + log_debug(1 << CXGBI_DBG_SOCK, + "free csk 0x%p, state %u, flags 0x%lx\n", + csk, csk->state, csk->flags); + kfree(csk); + } +} + +static inline void __cxgbi_sock_put(const char *fn, struct cxgbi_sock *csk) +{ + log_debug(1 << CXGBI_DBG_SOCK, + "%s, put csk 0x%p, ref %u-1.\n", + fn, csk, kref_read(&csk->refcnt)); + kref_put(&csk->refcnt, cxgbi_sock_free); +} +#define cxgbi_sock_put(csk) __cxgbi_sock_put(__func__, csk) + +static inline void __cxgbi_sock_get(const char *fn, struct cxgbi_sock *csk) +{ + log_debug(1 << CXGBI_DBG_SOCK, + "%s, get csk 0x%p, ref %u+1.\n", + fn, csk, kref_read(&csk->refcnt)); + kref_get(&csk->refcnt); +} +#define cxgbi_sock_get(csk) __cxgbi_sock_get(__func__, csk) + +static inline int cxgbi_sock_is_closing(struct cxgbi_sock *csk) +{ + return csk->state >= CTP_ACTIVE_CLOSE; +} + +static inline int cxgbi_sock_is_established(struct cxgbi_sock *csk) +{ + return csk->state == CTP_ESTABLISHED; +} + +static inline void cxgbi_sock_purge_write_queue(struct cxgbi_sock *csk) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&csk->write_queue))) + __kfree_skb(skb); +} + +static inline unsigned int cxgbi_sock_compute_wscale(unsigned int win) +{ + unsigned int wscale = 0; + + while (wscale < 14 && (65535 << wscale) < win) + wscale++; + return wscale; +} + +static inline struct sk_buff *alloc_wr(int wrlen, int dlen, gfp_t gfp) +{ + struct sk_buff *skb = alloc_skb(wrlen + dlen, gfp); + + if (skb) { + __skb_put(skb, wrlen); + memset(skb->head, 0, wrlen + dlen); + } else + pr_info("alloc cpl wr skb %u+%u, OOM.\n", wrlen, dlen); + return skb; +} + + +/* + * The number of WRs needed for an skb depends on the number of fragments + * in the skb and whether it has any payload in its main body. This maps the + * length of the gather list represented by an skb into the # of necessary WRs. + * The extra two fragments are for iscsi bhs and payload padding. 
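+ *
+ * Note: the lower-level driver records the number of WR credits each queued
+ * skb consumes in skb->csum; that is what cxgbi_sock_count_pending_wrs()
+ * sums up below.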
+ */ +#define SKB_WR_LIST_SIZE (MAX_SKB_FRAGS + 2) + +static inline void cxgbi_sock_reset_wr_list(struct cxgbi_sock *csk) +{ + csk->wr_pending_head = csk->wr_pending_tail = NULL; +} + +static inline void cxgbi_sock_enqueue_wr(struct cxgbi_sock *csk, + struct sk_buff *skb) +{ + cxgbi_skcb_tx_wr_next(skb) = NULL; + /* + * We want to take an extra reference since both us and the driver + * need to free the packet before it's really freed. + */ + skb_get(skb); + + if (!csk->wr_pending_head) + csk->wr_pending_head = skb; + else + cxgbi_skcb_tx_wr_next(csk->wr_pending_tail) = skb; + csk->wr_pending_tail = skb; +} + +static inline int cxgbi_sock_count_pending_wrs(const struct cxgbi_sock *csk) +{ + int n = 0; + const struct sk_buff *skb = csk->wr_pending_head; + + while (skb) { + n += skb->csum; + skb = cxgbi_skcb_tx_wr_next(skb); + } + return n; +} + +static inline struct sk_buff *cxgbi_sock_peek_wr(const struct cxgbi_sock *csk) +{ + return csk->wr_pending_head; +} + +static inline struct sk_buff *cxgbi_sock_dequeue_wr(struct cxgbi_sock *csk) +{ + struct sk_buff *skb = csk->wr_pending_head; + + if (likely(skb)) { + csk->wr_pending_head = cxgbi_skcb_tx_wr_next(skb); + cxgbi_skcb_tx_wr_next(skb) = NULL; + } + return skb; +} + +void cxgbi_sock_check_wr_invariants(const struct cxgbi_sock *); +void cxgbi_sock_purge_wr_queue(struct cxgbi_sock *); +void cxgbi_sock_skb_entail(struct cxgbi_sock *, struct sk_buff *); +void cxgbi_sock_fail_act_open(struct cxgbi_sock *, int); +void cxgbi_sock_act_open_req_arp_failure(void *, struct sk_buff *); +void cxgbi_sock_closed(struct cxgbi_sock *); +void cxgbi_sock_established(struct cxgbi_sock *, unsigned int, unsigned int); +void cxgbi_sock_rcv_abort_rpl(struct cxgbi_sock *); +void cxgbi_sock_rcv_peer_close(struct cxgbi_sock *); +void cxgbi_sock_rcv_close_conn_rpl(struct cxgbi_sock *, u32); +void cxgbi_sock_rcv_wr_ack(struct cxgbi_sock *, unsigned int, unsigned int, + int); +unsigned int cxgbi_sock_select_mss(struct cxgbi_sock *, unsigned int); +void cxgbi_sock_free_cpl_skbs(struct cxgbi_sock *); + +struct cxgbi_hba { + struct net_device *ndev; + struct net_device *vdev; /* vlan dev */ + struct Scsi_Host *shost; + struct cxgbi_device *cdev; + __be32 ipv4addr; + unsigned char port_id; +}; + +struct cxgbi_ports_map { + unsigned int max_connect; + unsigned int used; + unsigned short sport_base; + spinlock_t lock; + unsigned int next; + struct cxgbi_sock **port_csk; +}; + +#define CXGBI_FLAG_DEV_T3 0x1 +#define CXGBI_FLAG_DEV_T4 0x2 +#define CXGBI_FLAG_ADAPTER_RESET 0x4 +#define CXGBI_FLAG_IPV4_SET 0x10 +#define CXGBI_FLAG_USE_PPOD_OFLDQ 0x40 +#define CXGBI_FLAG_DDP_OFF 0x100 + +struct cxgbi_device { + struct list_head list_head; + struct list_head rcu_node; + unsigned int flags; + struct net_device **ports; + void *lldev; + struct cxgbi_hba **hbas; + const unsigned short *mtus; + unsigned char nmtus; + unsigned char nports; + struct pci_dev *pdev; + struct dentry *debugfs_root; + struct iscsi_transport *itp; + struct module *owner; + + unsigned int pfvf; + unsigned int rx_credit_thres; + unsigned int skb_tx_rsvd; + unsigned int skb_rx_extra; /* for msg coalesced mode */ + unsigned int tx_max_size; + unsigned int rx_max_size; + unsigned int rxq_idx_cntr; + struct cxgbi_ports_map pmap; + + void (*dev_ddp_cleanup)(struct cxgbi_device *); + struct cxgbi_ppm* (*cdev2ppm)(struct cxgbi_device *); + int (*csk_ddp_set_map)(struct cxgbi_ppm *, struct cxgbi_sock *, + struct cxgbi_task_tag_info *); + void (*csk_ddp_clear_map)(struct cxgbi_device *cdev, + struct cxgbi_ppm *, + 
struct cxgbi_task_tag_info *); + int (*csk_ddp_setup_digest)(struct cxgbi_sock *, + unsigned int, int, int, int); + int (*csk_ddp_setup_pgidx)(struct cxgbi_sock *, + unsigned int, int, bool); + + void (*csk_release_offload_resources)(struct cxgbi_sock *); + int (*csk_rx_pdu_ready)(struct cxgbi_sock *, struct sk_buff *); + u32 (*csk_send_rx_credits)(struct cxgbi_sock *, u32); + int (*csk_push_tx_frames)(struct cxgbi_sock *, int); + void (*csk_send_abort_req)(struct cxgbi_sock *); + void (*csk_send_close_req)(struct cxgbi_sock *); + int (*csk_alloc_cpls)(struct cxgbi_sock *); + int (*csk_init_act_open)(struct cxgbi_sock *); + + void *dd_data; +}; +#define cxgbi_cdev_priv(cdev) ((cdev)->dd_data) + +struct cxgbi_conn { + struct cxgbi_endpoint *cep; + struct iscsi_conn *iconn; + struct cxgbi_hba *chba; + u32 task_idx_bits; + unsigned int ddp_full; + unsigned int ddp_tag_full; +}; + +struct cxgbi_endpoint { + struct cxgbi_conn *cconn; + struct cxgbi_hba *chba; + struct cxgbi_sock *csk; +}; + +#define MAX_PDU_FRAGS ((ULP2_MAX_PDU_PAYLOAD + 512 - 1) / 512) +struct cxgbi_task_data { + unsigned short nr_frags; + struct page_frag frags[MAX_PDU_FRAGS]; + struct sk_buff *skb; + unsigned int dlen; + unsigned int offset; + unsigned int count; + unsigned int sgoffset; + struct cxgbi_task_tag_info ttinfo; +}; +#define iscsi_task_cxgbi_data(task) \ + ((task)->dd_data + sizeof(struct iscsi_tcp_task)) + +static inline void *cxgbi_alloc_big_mem(unsigned int size, + gfp_t gfp) +{ + void *p = kzalloc(size, gfp | __GFP_NOWARN); + + if (!p) + p = vzalloc(size); + + return p; +} + +static inline void cxgbi_free_big_mem(void *addr) +{ + kvfree(addr); +} + +static inline void cxgbi_set_iscsi_ipv4(struct cxgbi_hba *chba, __be32 ipaddr) +{ + if (chba->cdev->flags & CXGBI_FLAG_IPV4_SET) + chba->ipv4addr = ipaddr; + else + pr_info("set iscsi ipv4 NOT supported, using %s ipv4.\n", + chba->ndev->name); +} + +struct cxgbi_device *cxgbi_device_register(unsigned int, unsigned int); +void cxgbi_device_unregister(struct cxgbi_device *); +void cxgbi_device_unregister_all(unsigned int flag); +struct cxgbi_device *cxgbi_device_find_by_lldev(void *); +struct cxgbi_device *cxgbi_device_find_by_netdev(struct net_device *, int *); +struct cxgbi_device *cxgbi_device_find_by_netdev_rcu(struct net_device *, + int *); +int cxgbi_hbas_add(struct cxgbi_device *, u64, unsigned int, + struct scsi_host_template *, + struct scsi_transport_template *); +void cxgbi_hbas_remove(struct cxgbi_device *); + +int cxgbi_device_portmap_create(struct cxgbi_device *cdev, unsigned int base, + unsigned int max_conn); +void cxgbi_device_portmap_cleanup(struct cxgbi_device *cdev); + +void cxgbi_conn_tx_open(struct cxgbi_sock *); +void cxgbi_conn_pdu_ready(struct cxgbi_sock *); +int cxgbi_conn_alloc_pdu(struct iscsi_task *, u8); +int cxgbi_conn_init_pdu(struct iscsi_task *, unsigned int , unsigned int); +int cxgbi_conn_xmit_pdu(struct iscsi_task *); + +void cxgbi_cleanup_task(struct iscsi_task *task); + +umode_t cxgbi_attr_is_visible(int param_type, int param); +void cxgbi_get_conn_stats(struct iscsi_cls_conn *, struct iscsi_stats *); +int cxgbi_set_conn_param(struct iscsi_cls_conn *, + enum iscsi_param, char *, int); +int cxgbi_get_ep_param(struct iscsi_endpoint *ep, enum iscsi_param, char *); +struct iscsi_cls_conn *cxgbi_create_conn(struct iscsi_cls_session *, u32); +int cxgbi_bind_conn(struct iscsi_cls_session *, + struct iscsi_cls_conn *, u64, int); +void cxgbi_destroy_session(struct iscsi_cls_session *); +struct iscsi_cls_session 
*cxgbi_create_session(struct iscsi_endpoint *, + u16, u16, u32); +int cxgbi_set_host_param(struct Scsi_Host *, + enum iscsi_host_param, char *, int); +int cxgbi_get_host_param(struct Scsi_Host *, enum iscsi_host_param, char *); +struct iscsi_endpoint *cxgbi_ep_connect(struct Scsi_Host *, + struct sockaddr *, int); +int cxgbi_ep_poll(struct iscsi_endpoint *, int); +void cxgbi_ep_disconnect(struct iscsi_endpoint *); + +int cxgbi_iscsi_init(struct iscsi_transport *, + struct scsi_transport_template **); +void cxgbi_iscsi_cleanup(struct iscsi_transport *, + struct scsi_transport_template **); +void cxgbi_parse_pdu_itt(struct iscsi_conn *, itt_t, int *, int *); +int cxgbi_ddp_init(struct cxgbi_device *, unsigned int, unsigned int, + unsigned int, unsigned int); +int cxgbi_ddp_cleanup(struct cxgbi_device *); +void cxgbi_ddp_page_size_factor(int *); +void cxgbi_ddp_set_one_ppod(struct cxgbi_pagepod *, + struct cxgbi_task_tag_info *, + struct scatterlist **sg_pp, unsigned int *sg_off); +void cxgbi_ddp_ppm_setup(void **ppm_pp, struct cxgbi_device *, + struct cxgbi_tag_format *, unsigned int ppmax, + unsigned int llimit, unsigned int start, + unsigned int rsvd_factor); +#endif /*__LIBCXGBI_H__*/ diff --git a/drivers/scsi/qedf/Kconfig b/drivers/scsi/qedf/Kconfig new file mode 100644 index 0000000..943f5ee --- /dev/null +++ b/drivers/scsi/qedf/Kconfig @@ -0,0 +1,11 @@ +config QEDF + tristate "QLogic QEDF 25/40/100Gb FCoE Initiator Driver Support" + depends on PCI && SCSI + depends on QED + depends on LIBFC + depends on LIBFCOE + select QED_LL2 + select QED_FCOE + ---help--- + This driver supports FCoE offload for the QLogic FastLinQ + 41000 Series Converged Network Adapters. diff --git a/drivers/scsi/qedf/Makefile b/drivers/scsi/qedf/Makefile new file mode 100644 index 0000000..414f2a7 --- /dev/null +++ b/drivers/scsi/qedf/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_QEDF) := qedf.o +qedf-y = qedf_dbg.o qedf_main.o qedf_io.o qedf_fip.o \ + qedf_attr.o qedf_els.o drv_scsi_fw_funcs.o drv_fcoe_fw_funcs.o + +qedf-$(CONFIG_DEBUG_FS) += qedf_debugfs.o diff --git a/drivers/scsi/qedf/drv_fcoe_fw_funcs.c b/drivers/scsi/qedf/drv_fcoe_fw_funcs.c new file mode 100644 index 0000000..a980ef7 --- /dev/null +++ b/drivers/scsi/qedf/drv_fcoe_fw_funcs.c @@ -0,0 +1,199 @@ +/* QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ +#include "drv_fcoe_fw_funcs.h" +#include "drv_scsi_fw_funcs.h" + +#define FCOE_RX_ID (0xFFFFu) + +static inline void init_common_sqe(struct fcoe_task_params *task_params, + enum fcoe_sqe_request_type request_type) +{ + memset(task_params->sqe, 0, sizeof(*(task_params->sqe))); + SET_FIELD(task_params->sqe->flags, FCOE_WQE_REQ_TYPE, + request_type); + task_params->sqe->task_id = task_params->itid; +} + +int init_initiator_rw_fcoe_task(struct fcoe_task_params *task_params, + struct scsi_sgl_task_params *sgl_task_params, + struct regpair sense_data_buffer_phys_addr, + u32 task_retry_id, + u8 fcp_cmd_payload[32]) +{ + struct e4_fcoe_task_context *ctx = task_params->context; + const u8 val_byte = ctx->ystorm_ag_context.byte0; + struct e4_ustorm_fcoe_task_ag_ctx *u_ag_ctx; + struct ystorm_fcoe_task_st_ctx *y_st_ctx; + struct tstorm_fcoe_task_st_ctx *t_st_ctx; + struct mstorm_fcoe_task_st_ctx *m_st_ctx; + u32 io_size, val; + bool slow_sgl; + + memset(ctx, 0, sizeof(*(ctx))); + ctx->ystorm_ag_context.byte0 = val_byte; + slow_sgl = scsi_is_slow_sgl(sgl_task_params->num_sges, + sgl_task_params->small_mid_sge); + io_size = (task_params->task_type == FCOE_TASK_TYPE_WRITE_INITIATOR ? + task_params->tx_io_size : task_params->rx_io_size); + + /* Ystorm ctx */ + y_st_ctx = &ctx->ystorm_st_context; + y_st_ctx->data_2_trns_rem = cpu_to_le32(io_size); + y_st_ctx->task_rety_identifier = cpu_to_le32(task_retry_id); + y_st_ctx->task_type = (u8)task_params->task_type; + memcpy(&y_st_ctx->tx_info_union.fcp_cmd_payload, + fcp_cmd_payload, sizeof(struct fcoe_fcp_cmd_payload)); + + /* Tstorm ctx */ + t_st_ctx = &ctx->tstorm_st_context; + t_st_ctx->read_only.dev_type = (u8)(task_params->is_tape_device == 1 ? + FCOE_TASK_DEV_TYPE_TAPE : + FCOE_TASK_DEV_TYPE_DISK); + t_st_ctx->read_only.cid = cpu_to_le32(task_params->conn_cid); + val = cpu_to_le32(task_params->cq_rss_number); + t_st_ctx->read_only.glbl_q_num = val; + t_st_ctx->read_only.fcp_cmd_trns_size = cpu_to_le32(io_size); + t_st_ctx->read_only.task_type = (u8)task_params->task_type; + SET_FIELD(t_st_ctx->read_write.flags, + FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME, 1); + t_st_ctx->read_write.rx_id = cpu_to_le16(FCOE_RX_ID); + + /* Ustorm ctx */ + u_ag_ctx = &ctx->ustorm_ag_context; + u_ag_ctx->global_cq_num = cpu_to_le32(task_params->cq_rss_number); + + /* Mstorm buffer for sense/rsp data placement */ + m_st_ctx = &ctx->mstorm_st_context; + val = cpu_to_le32(sense_data_buffer_phys_addr.hi); + m_st_ctx->rsp_buf_addr.hi = val; + val = cpu_to_le32(sense_data_buffer_phys_addr.lo); + m_st_ctx->rsp_buf_addr.lo = val; + + if (task_params->task_type == FCOE_TASK_TYPE_WRITE_INITIATOR) { + /* Ystorm ctx */ + y_st_ctx->expect_first_xfer = 1; + + /* Set the amount of super SGEs. Can be up to 4. */ + SET_FIELD(y_st_ctx->sgl_mode, + YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE, + (slow_sgl ? SCSI_TX_SLOW_SGL : SCSI_FAST_SGL)); + init_scsi_sgl_context(&y_st_ctx->sgl_params, + &y_st_ctx->data_desc, + sgl_task_params); + + /* Mstorm ctx */ + SET_FIELD(m_st_ctx->flags, + MSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE, + (slow_sgl ? SCSI_TX_SLOW_SGL : SCSI_FAST_SGL)); + m_st_ctx->sgl_params.sgl_num_sges = + cpu_to_le16(sgl_task_params->num_sges); + } else { + /* Tstorm ctx */ + SET_FIELD(t_st_ctx->read_write.flags, + FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE, + (slow_sgl ? 
SCSI_TX_SLOW_SGL : SCSI_FAST_SGL)); + + /* Mstorm ctx */ + m_st_ctx->data_2_trns_rem = cpu_to_le32(io_size); + init_scsi_sgl_context(&m_st_ctx->sgl_params, + &m_st_ctx->data_desc, + sgl_task_params); + } + + /* Init Sqe */ + init_common_sqe(task_params, SEND_FCOE_CMD); + + return 0; +} + +int init_initiator_midpath_unsolicited_fcoe_task( + struct fcoe_task_params *task_params, + struct fcoe_tx_mid_path_params *mid_path_fc_header, + struct scsi_sgl_task_params *tx_sgl_task_params, + struct scsi_sgl_task_params *rx_sgl_task_params, + u8 fw_to_place_fc_header) +{ + struct e4_fcoe_task_context *ctx = task_params->context; + const u8 val_byte = ctx->ystorm_ag_context.byte0; + struct e4_ustorm_fcoe_task_ag_ctx *u_ag_ctx; + struct ystorm_fcoe_task_st_ctx *y_st_ctx; + struct tstorm_fcoe_task_st_ctx *t_st_ctx; + struct mstorm_fcoe_task_st_ctx *m_st_ctx; + u32 val; + + memset(ctx, 0, sizeof(*(ctx))); + ctx->ystorm_ag_context.byte0 = val_byte; + + /* Init Ystorm */ + y_st_ctx = &ctx->ystorm_st_context; + init_scsi_sgl_context(&y_st_ctx->sgl_params, + &y_st_ctx->data_desc, + tx_sgl_task_params); + SET_FIELD(y_st_ctx->sgl_mode, + YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE, SCSI_FAST_SGL); + y_st_ctx->data_2_trns_rem = cpu_to_le32(task_params->tx_io_size); + y_st_ctx->task_type = (u8)task_params->task_type; + memcpy(&y_st_ctx->tx_info_union.tx_params.mid_path, + mid_path_fc_header, sizeof(struct fcoe_tx_mid_path_params)); + + /* Init Mstorm */ + m_st_ctx = &ctx->mstorm_st_context; + init_scsi_sgl_context(&m_st_ctx->sgl_params, + &m_st_ctx->data_desc, + rx_sgl_task_params); + SET_FIELD(m_st_ctx->flags, + MSTORM_FCOE_TASK_ST_CTX_MP_INCLUDE_FC_HEADER, + fw_to_place_fc_header); + m_st_ctx->data_2_trns_rem = cpu_to_le32(task_params->rx_io_size); + + /* Init Tstorm */ + t_st_ctx = &ctx->tstorm_st_context; + t_st_ctx->read_only.cid = cpu_to_le32(task_params->conn_cid); + val = cpu_to_le32(task_params->cq_rss_number); + t_st_ctx->read_only.glbl_q_num = val; + t_st_ctx->read_only.task_type = (u8)task_params->task_type; + SET_FIELD(t_st_ctx->read_write.flags, + FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME, 1); + t_st_ctx->read_write.rx_id = cpu_to_le16(FCOE_RX_ID); + + /* Init Ustorm */ + u_ag_ctx = &ctx->ustorm_ag_context; + u_ag_ctx->global_cq_num = cpu_to_le32(task_params->cq_rss_number); + + /* Init SQE */ + init_common_sqe(task_params, SEND_FCOE_MIDPATH); + task_params->sqe->additional_info_union.burst_length = + tx_sgl_task_params->total_buffer_size; + SET_FIELD(task_params->sqe->flags, + FCOE_WQE_NUM_SGES, tx_sgl_task_params->num_sges); + SET_FIELD(task_params->sqe->flags, FCOE_WQE_SGL_MODE, + SCSI_FAST_SGL); + + return 0; +} + +int init_initiator_abort_fcoe_task(struct fcoe_task_params *task_params) +{ + init_common_sqe(task_params, SEND_FCOE_ABTS_REQUEST); + return 0; +} + +int init_initiator_cleanup_fcoe_task(struct fcoe_task_params *task_params) +{ + init_common_sqe(task_params, FCOE_EXCHANGE_CLEANUP); + return 0; +} + +int init_initiator_sequence_recovery_fcoe_task( + struct fcoe_task_params *task_params, u32 desired_offset) +{ + init_common_sqe(task_params, FCOE_SEQUENCE_RECOVERY); + task_params->sqe->additional_info_union.seq_rec_updated_offset = + desired_offset; + return 0; +} diff --git a/drivers/scsi/qedf/drv_fcoe_fw_funcs.h b/drivers/scsi/qedf/drv_fcoe_fw_funcs.h new file mode 100644 index 0000000..b5c236e --- /dev/null +++ b/drivers/scsi/qedf/drv_fcoe_fw_funcs.h @@ -0,0 +1,93 @@ +/* QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. 
+ * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ +#ifndef _FCOE_FW_FUNCS_H +#define _FCOE_FW_FUNCS_H +#include "drv_scsi_fw_funcs.h" +#include "qedf_hsi.h" +#include + +struct fcoe_task_params { + /* Output parameter [set/filled by the HSI function] */ + struct e4_fcoe_task_context *context; + + /* Output parameter [set/filled by the HSI function] */ + struct fcoe_wqe *sqe; + enum fcoe_task_type task_type; + u32 tx_io_size; /* in bytes */ + u32 rx_io_size; /* in bytes */ + u32 conn_cid; + u16 itid; + u8 cq_rss_number; + + /* Whether it's Tape device or not (0=Disk, 1=Tape) */ + u8 is_tape_device; +}; + +/** + * @brief init_initiator_rw_fcoe_task - Initializes FCoE task context for + * read/write task types and init fcoe_sqe + * + * @param task_params - Pointer to task parameters struct + * @param sgl_task_params - Pointer to SGL task params + * @param sense_data_buffer_phys_addr - Pointer to sense data buffer + * @param task_retry_id - retry identification - Used only for Tape device + * @param fcp_cmd_payload - FCP CMD Payload + */ +int init_initiator_rw_fcoe_task(struct fcoe_task_params *task_params, + struct scsi_sgl_task_params *sgl_task_params, + struct regpair sense_data_buffer_phys_addr, + u32 task_retry_id, + u8 fcp_cmd_payload[32]); + +/** + * @brief init_initiator_midpath_unsolicited_fcoe_task - Initializes FCoE task context for + * midpath/unsolicited task types and init fcoe_sqe + * + * @param task_params - Pointer to task parameters struct + * @param mid_path_fc_header - FC header + * @param tx_sgl_task_params - Pointer to Tx SGL task params + * @param rx_sgl_task_params - Pointer to Rx SGL task params + * @param fw_to_place_fc_header - Indication if the FW will place the FC header + * in addition to the data that arrives. + */ +int init_initiator_midpath_unsolicited_fcoe_task( + struct fcoe_task_params *task_params, + struct fcoe_tx_mid_path_params *mid_path_fc_header, + struct scsi_sgl_task_params *tx_sgl_task_params, + struct scsi_sgl_task_params *rx_sgl_task_params, + u8 fw_to_place_fc_header); + +/** + * @brief init_initiator_abort_fcoe_task - Initializes FCoE task context for + * abort task types and init fcoe_sqe + * + * @param task_params - Pointer to task parameters struct + */ +int init_initiator_abort_fcoe_task(struct fcoe_task_params *task_params); + +/** + * @brief init_initiator_cleanup_fcoe_task - Initializes FCoE task context for + * cleanup task types and init fcoe_sqe + * + * + * @param task_params - Pointer to task parameters struct + */ +int init_initiator_cleanup_fcoe_task(struct fcoe_task_params *task_params); + +/** + * @brief init_initiator_sequence_recovery_fcoe_task - Initializes FCoE task context for + * sequence recovery task types and init fcoe_sqe + * + * + * @param task_params - Pointer to task parameters struct + * @param desired_offset - The desired offset the task will be re-sent from + */ +int init_initiator_sequence_recovery_fcoe_task( + struct fcoe_task_params *task_params, + u32 desired_offset); +#endif diff --git a/drivers/scsi/qedf/drv_scsi_fw_funcs.c b/drivers/scsi/qedf/drv_scsi_fw_funcs.c new file mode 100644 index 0000000..5d5095e --- /dev/null +++ b/drivers/scsi/qedf/drv_scsi_fw_funcs.c @@ -0,0 +1,44 @@ +/* QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. 
+ * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ +#include "drv_scsi_fw_funcs.h" + +#define SCSI_NUM_SGES_IN_CACHE 0x4 + +bool scsi_is_slow_sgl(u16 num_sges, bool small_mid_sge) +{ + return (num_sges > SCSI_NUM_SGES_SLOW_SGL_THR && small_mid_sge); +} + +void init_scsi_sgl_context(struct scsi_sgl_params *ctx_sgl_params, + struct scsi_cached_sges *ctx_data_desc, + struct scsi_sgl_task_params *sgl_task_params) +{ + /* no need to check for sgl_task_params->sgl validity */ + u8 num_sges_to_init = sgl_task_params->num_sges > + SCSI_NUM_SGES_IN_CACHE ? SCSI_NUM_SGES_IN_CACHE : + sgl_task_params->num_sges; + u8 sge_index; + u32 val; + + val = cpu_to_le32(sgl_task_params->sgl_phys_addr.lo); + ctx_sgl_params->sgl_addr.lo = val; + val = cpu_to_le32(sgl_task_params->sgl_phys_addr.hi); + ctx_sgl_params->sgl_addr.hi = val; + val = cpu_to_le32(sgl_task_params->total_buffer_size); + ctx_sgl_params->sgl_total_length = val; + ctx_sgl_params->sgl_num_sges = cpu_to_le16(sgl_task_params->num_sges); + + for (sge_index = 0; sge_index < num_sges_to_init; sge_index++) { + val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_addr.lo); + ctx_data_desc->sge[sge_index].sge_addr.lo = val; + val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_addr.hi); + ctx_data_desc->sge[sge_index].sge_addr.hi = val; + val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_len); + ctx_data_desc->sge[sge_index].sge_len = val; + } +} diff --git a/drivers/scsi/qedf/drv_scsi_fw_funcs.h b/drivers/scsi/qedf/drv_scsi_fw_funcs.h new file mode 100644 index 0000000..8fbe6e4 --- /dev/null +++ b/drivers/scsi/qedf/drv_scsi_fw_funcs.h @@ -0,0 +1,85 @@ +/* QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ +#ifndef _SCSI_FW_FUNCS_H +#define _SCSI_FW_FUNCS_H +#include +#include +#include + +struct scsi_sgl_task_params { + struct scsi_sge *sgl; + struct regpair sgl_phys_addr; + u32 total_buffer_size; + u16 num_sges; + + /* true if SGL contains a small (< 4KB) SGE in middle(not 1st or last) + * -> relevant for tx only + */ + bool small_mid_sge; +}; + +struct scsi_dif_task_params { + u32 initial_ref_tag; + bool initial_ref_tag_is_valid; + u16 application_tag; + u16 application_tag_mask; + u16 dif_block_size_log; + bool dif_on_network; + bool dif_on_host; + u8 host_guard_type; + u8 protection_type; + u8 ref_tag_mask; + bool crc_seed; + + /* Enable Connection error upon DIF error (segments with DIF errors are + * dropped) + */ + bool tx_dif_conn_err_en; + bool ignore_app_tag; + bool keep_ref_tag_const; + bool validate_guard; + bool validate_app_tag; + bool validate_ref_tag; + bool forward_guard; + bool forward_app_tag; + bool forward_ref_tag; + bool forward_app_tag_with_mask; + bool forward_ref_tag_with_mask; +}; + +struct scsi_initiator_cmd_params { + /* for cdb_size > default CDB size (extended CDB > 16 bytes) -> + * pointer to the CDB buffer SGE + */ + struct scsi_sge extended_cdb_sge; + + /* Physical address of sense data buffer for sense data - 256B buffer */ + struct regpair sense_data_buffer_phys_addr; +}; + +/** + * @brief scsi_is_slow_sgl - checks for slow SGL + * + * @param num_sges - number of sges in SGL + * @param small_mid_sge - True is the SGL contains an SGE which is smaller than + * 4KB and its not the 1st or last SGE in the SGL + */ +bool scsi_is_slow_sgl(u16 num_sges, bool small_mid_sge); + +/** + * @brief init_scsi_sgl_context - initializes SGL task context + * + * @param sgl_params - SGL context parameters to initialize (output parameter) + * @param data_desc - context struct containing SGEs array to set (output + * parameter) + * @param sgl_task_params - SGL parameters (input) + */ +void init_scsi_sgl_context(struct scsi_sgl_params *sgl_params, + struct scsi_cached_sges *ctx_data_desc, + struct scsi_sgl_task_params *sgl_task_params); +#endif diff --git a/drivers/scsi/qedf/qedf.h b/drivers/scsi/qedf/qedf.h new file mode 100644 index 0000000..c105a2e --- /dev/null +++ b/drivers/scsi/qedf/qedf.h @@ -0,0 +1,545 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ +#ifndef _QEDFC_H_ +#define _QEDFC_H_ + +#include +#include +#include +#include +#include +#include +#include + + +/* qedf_hsi.h needs to before included any qed includes */ +#include "qedf_hsi.h" + +#include +#include +#include +#include "qedf_version.h" +#include "qedf_dbg.h" +#include "drv_fcoe_fw_funcs.h" + +/* Helpers to extract upper and lower 32-bits of pointer */ +#define U64_HI(val) ((u32)(((u64)(val)) >> 32)) +#define U64_LO(val) ((u32)(((u64)(val)) & 0xffffffff)) + +#define QEDF_DESCR "QLogic FCoE Offload Driver" +#define QEDF_MODULE_NAME "qedf" + +#define QEDF_MIN_XID 0 +#define QEDF_MAX_SCSI_XID (NUM_TASKS_PER_CONNECTION - 1) +#define QEDF_MAX_ELS_XID 4095 +#define QEDF_FLOGI_RETRY_CNT 3 +#define QEDF_RPORT_RETRY_CNT 255 +#define QEDF_MAX_SESSIONS 1024 +#define QEDF_MAX_PAYLOAD 2048 +#define QEDF_MAX_BDS_PER_CMD 256 +#define QEDF_MAX_BD_LEN 0xffff +#define QEDF_BD_SPLIT_SZ 0x1000 +#define QEDF_PAGE_SIZE 4096 +#define QED_HW_DMA_BOUNDARY 0xfff +#define QEDF_MAX_SGLEN_FOR_CACHESGL ((1U << 16) - 1) +#define QEDF_MFS (QEDF_MAX_PAYLOAD + \ + sizeof(struct fc_frame_header)) +#define QEDF_MAX_NPIV 64 +#define QEDF_TM_TIMEOUT 10 +#define QEDF_ABORT_TIMEOUT 10 +#define QEDF_CLEANUP_TIMEOUT 10 +#define QEDF_MAX_CDB_LEN 16 + +#define UPSTREAM_REMOVE 1 +#define UPSTREAM_KEEP 1 + +struct qedf_mp_req { + uint32_t req_len; + void *req_buf; + dma_addr_t req_buf_dma; + struct scsi_sge *mp_req_bd; + dma_addr_t mp_req_bd_dma; + struct fc_frame_header req_fc_hdr; + + uint32_t resp_len; + void *resp_buf; + dma_addr_t resp_buf_dma; + struct scsi_sge *mp_resp_bd; + dma_addr_t mp_resp_bd_dma; + struct fc_frame_header resp_fc_hdr; +}; + +struct qedf_els_cb_arg { + struct qedf_ioreq *aborted_io_req; + struct qedf_ioreq *io_req; + u8 op; /* Used to keep track of ELS op */ + uint16_t l2_oxid; + u32 offset; /* Used for sequence cleanup */ + u8 r_ctl; /* Used for sequence cleanup */ +}; + +enum qedf_ioreq_event { + QEDF_IOREQ_EV_ABORT_SUCCESS, + QEDF_IOREQ_EV_ABORT_FAILED, + QEDF_IOREQ_EV_SEND_RRQ, + QEDF_IOREQ_EV_ELS_TMO, + QEDF_IOREQ_EV_ELS_ERR_DETECT, + QEDF_IOREQ_EV_ELS_FLUSH, + QEDF_IOREQ_EV_CLEANUP_SUCCESS, + QEDF_IOREQ_EV_CLEANUP_FAILED, +}; + +#define FC_GOOD 0 +#define FCOE_FCP_RSP_FLAGS_FCP_RESID_OVER (0x1<<2) +#define FCOE_FCP_RSP_FLAGS_FCP_RESID_UNDER (0x1<<3) +#define CMD_SCSI_STATUS(Cmnd) ((Cmnd)->SCp.Status) +#define FCOE_FCP_RSP_FLAGS_FCP_RSP_LEN_VALID (0x1<<0) +#define FCOE_FCP_RSP_FLAGS_FCP_SNS_LEN_VALID (0x1<<1) +struct qedf_ioreq { + struct list_head link; + uint16_t xid; + struct scsi_cmnd *sc_cmd; + bool use_slowpath; /* Use slow SGL for this I/O */ +#define QEDF_SCSI_CMD 1 +#define QEDF_TASK_MGMT_CMD 2 +#define QEDF_ABTS 3 +#define QEDF_ELS 4 +#define QEDF_CLEANUP 5 +#define QEDF_SEQ_CLEANUP 6 + u8 cmd_type; +#define QEDF_CMD_OUTSTANDING 0x0 +#define QEDF_CMD_IN_ABORT 0x1 +#define QEDF_CMD_IN_CLEANUP 0x2 +#define QEDF_CMD_SRR_SENT 0x3 + u8 io_req_flags; + uint8_t tm_flags; + struct qedf_rport *fcport; + unsigned long flags; + enum qedf_ioreq_event event; + size_t data_xfer_len; + struct kref refcount; + struct qedf_cmd_mgr *cmd_mgr; + struct io_bdt *bd_tbl; + struct delayed_work timeout_work; + struct completion tm_done; + struct completion abts_done; + struct e4_fcoe_task_context *task; + struct fcoe_task_params *task_params; + struct scsi_sgl_task_params *sgl_task_params; + int idx; +/* + * Need to allocate enough room for both sense data and FCP response data + * which has a max length of 8 bytes according to spec. 
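+ * Hence the extra 8 bytes added on top of SCSI_SENSE_BUFFERSIZE below.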
+ */ +#define QEDF_SCSI_SENSE_BUFFERSIZE (SCSI_SENSE_BUFFERSIZE + 8) + uint8_t *sense_buffer; + dma_addr_t sense_buffer_dma; + u32 fcp_resid; + u32 fcp_rsp_len; + u32 fcp_sns_len; + u8 cdb_status; + u8 fcp_status; + u8 fcp_rsp_code; + u8 scsi_comp_flags; +#define QEDF_MAX_REUSE 0xfff + u16 reuse_count; + struct qedf_mp_req mp_req; + void (*cb_func)(struct qedf_els_cb_arg *cb_arg); + struct qedf_els_cb_arg *cb_arg; + int fp_idx; + unsigned int cpu; + unsigned int int_cpu; +#define QEDF_IOREQ_SLOW_SGE 0 +#define QEDF_IOREQ_SINGLE_SGE 1 +#define QEDF_IOREQ_FAST_SGE 2 + u8 sge_type; + struct delayed_work rrq_work; + + /* Used for sequence level recovery; i.e. REC/SRR */ + uint32_t rx_buf_off; + uint32_t tx_buf_off; + uint32_t rx_id; + uint32_t task_retry_identifier; + + /* + * Used to tell if we need to return a SCSI command + * during some form of error processing. + */ + bool return_scsi_cmd_on_abts; +}; + +extern struct workqueue_struct *qedf_io_wq; + +struct qedf_rport { + spinlock_t rport_lock; +#define QEDF_RPORT_SESSION_READY 1 +#define QEDF_RPORT_UPLOADING_CONNECTION 2 + unsigned long flags; + unsigned long retry_delay_timestamp; + struct fc_rport *rport; + struct fc_rport_priv *rdata; + struct qedf_ctx *qedf; + u32 handle; /* Handle from qed */ + u32 fw_cid; /* fw_cid from qed */ + void __iomem *p_doorbell; + /* Send queue management */ + atomic_t free_sqes; + atomic_t num_active_ios; + struct fcoe_wqe *sq; + dma_addr_t sq_dma; + u16 sq_prod_idx; + u16 fw_sq_prod_idx; + u16 sq_con_idx; + u32 sq_mem_size; + void *sq_pbl; + dma_addr_t sq_pbl_dma; + u32 sq_pbl_size; + u32 sid; +#define QEDF_RPORT_TYPE_DISK 0 +#define QEDF_RPORT_TYPE_TAPE 1 + uint dev_type; /* Disk or tape */ + struct list_head peers; +}; + +/* Used to contain LL2 skb's in ll2_skb_list */ +struct qedf_skb_work { + struct work_struct work; + struct sk_buff *skb; + struct qedf_ctx *qedf; +}; + +struct qedf_fastpath { +#define QEDF_SB_ID_NULL 0xffff + u16 sb_id; + struct qed_sb_info *sb_info; + struct qedf_ctx *qedf; + /* Keep track of number of completions on this fastpath */ + unsigned long completions; + uint32_t cq_num_entries; +}; + +/* Used to pass fastpath information needed to process CQEs */ +struct qedf_io_work { + struct work_struct work; + struct fcoe_cqe cqe; + struct qedf_ctx *qedf; + struct fc_frame *fp; +}; + +struct qedf_glbl_q_params { + u64 hw_p_cq; /* Completion queue PBL */ + u64 hw_p_rq; /* Request queue PBL */ + u64 hw_p_cmdq; /* Command queue PBL */ +}; + +struct global_queue { + struct fcoe_cqe *cq; + dma_addr_t cq_dma; + u32 cq_mem_size; + u32 cq_cons_idx; /* Completion queue consumer index */ + u32 cq_prod_idx; + + void *cq_pbl; + dma_addr_t cq_pbl_dma; + u32 cq_pbl_size; +}; + +/* I/O tracing entry */ +#define QEDF_IO_TRACE_SIZE 2048 +struct qedf_io_log { +#define QEDF_IO_TRACE_REQ 0 +#define QEDF_IO_TRACE_RSP 1 + uint8_t direction; + uint16_t task_id; + uint32_t port_id; /* Remote port fabric ID */ + int lun; + unsigned char op; /* SCSI CDB */ + uint8_t lba[4]; + unsigned int bufflen; /* SCSI buffer length */ + unsigned int sg_count; /* Number of SG elements */ + int result; /* Result passed back to mid-layer */ + unsigned long jiffies; /* Time stamp when I/O logged */ + int refcount; /* Reference count for task id */ + unsigned int req_cpu; /* CPU that the task is queued on */ + unsigned int int_cpu; /* Interrupt CPU that the task is received on */ + unsigned int rsp_cpu; /* CPU that task is returned on */ + u8 sge_type; /* Did we take the slow, single or fast SGE path */ +}; + +/* Number of 
entries in BDQ */ +#define QEDF_BDQ_SIZE 256 +#define QEDF_BDQ_BUF_SIZE 2072 + +/* DMA coherent buffers for BDQ */ +struct qedf_bdq_buf { + void *buf_addr; + dma_addr_t buf_dma; +}; + +/* Main adapter struct */ +struct qedf_ctx { + struct qedf_dbg_ctx dbg_ctx; + struct fcoe_ctlr ctlr; + struct fc_lport *lport; + u8 data_src_addr[ETH_ALEN]; +#define QEDF_LINK_DOWN 0 +#define QEDF_LINK_UP 1 + atomic_t link_state; +#define QEDF_DCBX_PENDING 0 +#define QEDF_DCBX_DONE 1 + atomic_t dcbx; + uint16_t max_scsi_xid; + uint16_t max_els_xid; +#define QEDF_NULL_VLAN_ID -1 +#define QEDF_FALLBACK_VLAN 1002 +#define QEDF_DEFAULT_PRIO 3 + int vlan_id; + struct qed_dev *cdev; + struct qed_dev_fcoe_info dev_info; + struct qed_int_info int_info; + uint16_t last_command; + spinlock_t hba_lock; + struct pci_dev *pdev; + u64 wwnn; + u64 wwpn; + u8 __aligned(16) mac[ETH_ALEN]; + struct list_head fcports; + atomic_t num_offloads; + unsigned int curr_conn_id; + struct workqueue_struct *ll2_recv_wq; + struct workqueue_struct *link_update_wq; + struct delayed_work link_update; + struct delayed_work link_recovery; + struct completion flogi_compl; + struct completion fipvlan_compl; + + /* + * Used to tell if we're in the window where we are waiting for + * the link to come back up before informting fcoe that the link is + * done. + */ + atomic_t link_down_tmo_valid; +#define QEDF_TIMER_INTERVAL (1 * HZ) + struct timer_list timer; /* One second book keeping timer */ +#define QEDF_DRAIN_ACTIVE 1 +#define QEDF_LL2_STARTED 2 +#define QEDF_UNLOADING 3 +#define QEDF_GRCDUMP_CAPTURE 4 +#define QEDF_IN_RECOVERY 5 +#define QEDF_DBG_STOP_IO 6 + unsigned long flags; /* Miscellaneous state flags */ + int fipvlan_retries; + u8 num_queues; + struct global_queue **global_queues; + /* Pointer to array of queue structures */ + struct qedf_glbl_q_params *p_cpuq; + /* Physical address of array of queue structures */ + dma_addr_t hw_p_cpuq; + + struct qedf_bdq_buf bdq[QEDF_BDQ_SIZE]; + void *bdq_pbl; + dma_addr_t bdq_pbl_dma; + size_t bdq_pbl_mem_size; + void *bdq_pbl_list; + dma_addr_t bdq_pbl_list_dma; + u8 bdq_pbl_list_num_entries; + void __iomem *bdq_primary_prod; + void __iomem *bdq_secondary_prod; + uint16_t bdq_prod_idx; + + /* Structure for holding all the fastpath for this qedf_ctx */ + struct qedf_fastpath *fp_array; + struct qed_fcoe_tid tasks; + struct qedf_cmd_mgr *cmd_mgr; + /* Holds the PF parameters we pass to qed to start he FCoE function */ + struct qed_pf_params pf_params; + /* Used to time middle path ELS and TM commands */ + struct workqueue_struct *timer_work_queue; + +#define QEDF_IO_WORK_MIN 64 + mempool_t *io_mempool; + struct workqueue_struct *dpc_wq; + + u32 slow_sge_ios; + u32 fast_sge_ios; + u32 single_sge_ios; + + uint8_t *grcdump; + uint32_t grcdump_size; + + struct qedf_io_log io_trace_buf[QEDF_IO_TRACE_SIZE]; + spinlock_t io_trace_lock; + uint16_t io_trace_idx; + + bool stop_io_on_error; + + u32 flogi_cnt; + u32 flogi_failed; + + /* Used for fc statistics */ + u64 input_requests; + u64 output_requests; + u64 control_requests; + u64 packet_aborts; + u64 alloc_failures; +}; + +struct io_bdt { + struct qedf_ioreq *io_req; + struct scsi_sge *bd_tbl; + dma_addr_t bd_tbl_dma; + u16 bd_valid; +}; + +struct qedf_cmd_mgr { + struct qedf_ctx *qedf; + u16 idx; + struct io_bdt **io_bdt_pool; +#define FCOE_PARAMS_NUM_TASKS 2048 + struct qedf_ioreq cmds[FCOE_PARAMS_NUM_TASKS]; + spinlock_t lock; + atomic_t free_list_cnt; +}; + +/* Stolen from qed_cxt_api.h and adapted for qed_fcoe_info + * Usage: + * + * void *ptr; + * 
ptr = qedf_get_task_mem(&qedf->tasks, 128); + */ +static inline void *qedf_get_task_mem(struct qed_fcoe_tid *info, u32 tid) +{ + return (void *)(info->blocks[tid / info->num_tids_per_block] + + (tid % info->num_tids_per_block) * info->size); +} + +static inline void qedf_stop_all_io(struct qedf_ctx *qedf) +{ + set_bit(QEDF_DBG_STOP_IO, &qedf->flags); +} + +/* + * Externs + */ +#define QEDF_DEFAULT_LOG_MASK 0x3CFB6 +extern const struct qed_fcoe_ops *qed_ops; +extern uint qedf_dump_frames; +extern uint qedf_io_tracing; +extern uint qedf_stop_io_on_error; +extern uint qedf_link_down_tmo; +#define QEDF_RETRY_DELAY_MAX 20 /* 2 seconds */ +extern bool qedf_retry_delay; +extern uint qedf_debug; + +extern struct qedf_cmd_mgr *qedf_cmd_mgr_alloc(struct qedf_ctx *qedf); +extern void qedf_cmd_mgr_free(struct qedf_cmd_mgr *cmgr); +extern int qedf_queuecommand(struct Scsi_Host *host, + struct scsi_cmnd *sc_cmd); +extern void qedf_fip_send(struct fcoe_ctlr *fip, struct sk_buff *skb); +extern u8 *qedf_get_src_mac(struct fc_lport *lport); +extern void qedf_fip_recv(struct qedf_ctx *qedf, struct sk_buff *skb); +extern void qedf_fcoe_send_vlan_req(struct qedf_ctx *qedf); +extern void qedf_scsi_completion(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *io_req); +extern void qedf_process_warning_compl(struct qedf_ctx *qedf, + struct fcoe_cqe *cqe, struct qedf_ioreq *io_req); +extern void qedf_process_error_detect(struct qedf_ctx *qedf, + struct fcoe_cqe *cqe, struct qedf_ioreq *io_req); +extern void qedf_flush_active_ios(struct qedf_rport *fcport, int lun); +extern void qedf_release_cmd(struct kref *ref); +extern int qedf_initiate_abts(struct qedf_ioreq *io_req, + bool return_scsi_cmd_on_abts); +extern void qedf_process_abts_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *io_req); +extern struct qedf_ioreq *qedf_alloc_cmd(struct qedf_rport *fcport, + u8 cmd_type); + +extern struct device_attribute *qedf_host_attrs[]; +extern void qedf_cmd_timer_set(struct qedf_ctx *qedf, struct qedf_ioreq *io_req, + unsigned int timer_msec); +extern int qedf_init_mp_req(struct qedf_ioreq *io_req); +extern void qedf_init_mp_task(struct qedf_ioreq *io_req, + struct e4_fcoe_task_context *task_ctx, struct fcoe_wqe *sqe); +extern u16 qedf_get_sqe_idx(struct qedf_rport *fcport); +extern void qedf_ring_doorbell(struct qedf_rport *fcport); +extern void qedf_process_els_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *els_req); +extern int qedf_send_rrq(struct qedf_ioreq *aborted_io_req); +extern int qedf_send_adisc(struct qedf_rport *fcport, struct fc_frame *fp); +extern int qedf_initiate_cleanup(struct qedf_ioreq *io_req, + bool return_scsi_cmd_on_abts); +extern void qedf_process_cleanup_compl(struct qedf_ctx *qedf, + struct fcoe_cqe *cqe, struct qedf_ioreq *io_req); +extern int qedf_initiate_tmf(struct scsi_cmnd *sc_cmd, u8 tm_flags); +extern void qedf_process_tmf_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *io_req); +extern void qedf_process_cqe(struct qedf_ctx *qedf, struct fcoe_cqe *cqe); +extern void qedf_scsi_done(struct qedf_ctx *qedf, struct qedf_ioreq *io_req, + int result); +extern void qedf_set_vlan_id(struct qedf_ctx *qedf, int vlan_id); +extern void qedf_create_sysfs_ctx_attr(struct qedf_ctx *qedf); +extern void qedf_remove_sysfs_ctx_attr(struct qedf_ctx *qedf); +extern void qedf_capture_grc_dump(struct qedf_ctx *qedf); +extern void qedf_wait_for_upload(struct qedf_ctx *qedf); +extern void qedf_process_unsol_compl(struct qedf_ctx 
*qedf, uint16_t que_idx, + struct fcoe_cqe *cqe); +extern void qedf_restart_rport(struct qedf_rport *fcport); +extern int qedf_send_rec(struct qedf_ioreq *orig_io_req); +extern int qedf_post_io_req(struct qedf_rport *fcport, + struct qedf_ioreq *io_req); +extern void qedf_process_seq_cleanup_compl(struct qedf_ctx *qedf, + struct fcoe_cqe *cqe, struct qedf_ioreq *io_req); +extern int qedf_send_flogi(struct qedf_ctx *qedf); +extern void qedf_fp_io_handler(struct work_struct *work); + +#define FCOE_WORD_TO_BYTE 4 +#define QEDF_MAX_TASK_NUM 0xFFFF + +struct fip_vlan { + struct ethhdr eth; + struct fip_header fip; + struct { + struct fip_mac_desc mac; + struct fip_wwn_desc wwnn; + } desc; +}; + +/* SQ/CQ Sizes */ +#define GBL_RSVD_TASKS 16 +#define NUM_TASKS_PER_CONNECTION 1024 +#define NUM_RW_TASKS_PER_CONNECTION 512 +#define FCOE_PARAMS_CQ_NUM_ENTRIES FCOE_PARAMS_NUM_TASKS + +#define FCOE_PARAMS_CMDQ_NUM_ENTRIES FCOE_PARAMS_NUM_TASKS +#define SQ_NUM_ENTRIES NUM_TASKS_PER_CONNECTION + +#define QEDF_FCOE_PARAMS_GL_RQ_PI 0 +#define QEDF_FCOE_PARAMS_GL_CMD_PI 1 + +#define QEDF_READ (1 << 1) +#define QEDF_WRITE (1 << 0) +#define MAX_FIBRE_LUNS 0xffffffff + +#define MIN_NUM_CPUS_MSIX(x) min_t(u32, x->dev_info.num_cqs, \ + num_online_cpus()) + +/* + * PCI function probe defines + */ +/* Probe/remove called during normal PCI probe */ +#define QEDF_MODE_NORMAL 0 +/* Probe/remove called from qed error recovery */ +#define QEDF_MODE_RECOVERY 1 + +#define SUPPORTED_25000baseKR_Full (1<<27) +#define SUPPORTED_50000baseKR2_Full (1<<28) +#define SUPPORTED_100000baseKR4_Full (1<<29) +#define SUPPORTED_100000baseCR4_Full (1<<30) + +#endif diff --git a/drivers/scsi/qedf/qedf_attr.c b/drivers/scsi/qedf/qedf_attr.c new file mode 100644 index 0000000..fa67276 --- /dev/null +++ b/drivers/scsi/qedf/qedf_attr.c @@ -0,0 +1,184 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ +#include "qedf.h" + +inline bool qedf_is_vport(struct qedf_ctx *qedf) +{ + return qedf->lport->vport != NULL; +} + +/* Get base qedf for physical port from vport */ +static struct qedf_ctx *qedf_get_base_qedf(struct qedf_ctx *qedf) +{ + struct fc_lport *lport; + struct fc_lport *base_lport; + + if (!(qedf_is_vport(qedf))) + return NULL; + + lport = qedf->lport; + base_lport = shost_priv(vport_to_shost(lport->vport)); + return lport_priv(base_lport); +} + +static ssize_t +qedf_fcoe_mac_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct fc_lport *lport = shost_priv(class_to_shost(dev)); + u32 port_id; + u8 lport_src_id[3]; + u8 fcoe_mac[6]; + + port_id = fc_host_port_id(lport->host); + lport_src_id[2] = (port_id & 0x000000FF); + lport_src_id[1] = (port_id & 0x0000FF00) >> 8; + lport_src_id[0] = (port_id & 0x00FF0000) >> 16; + fc_fcoe_set_mac(fcoe_mac, lport_src_id); + + return scnprintf(buf, PAGE_SIZE, "%pM\n", fcoe_mac); +} + +static ssize_t +qedf_fka_period_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct fc_lport *lport = shost_priv(class_to_shost(dev)); + struct qedf_ctx *qedf = lport_priv(lport); + int fka_period = -1; + + if (qedf_is_vport(qedf)) + qedf = qedf_get_base_qedf(qedf); + + if (qedf->ctlr.sel_fcf) + fka_period = qedf->ctlr.sel_fcf->fka_period; + + return scnprintf(buf, PAGE_SIZE, "%d\n", fka_period); +} + +static DEVICE_ATTR(fcoe_mac, S_IRUGO, qedf_fcoe_mac_show, NULL); +static DEVICE_ATTR(fka_period, S_IRUGO, qedf_fka_period_show, NULL); + +struct device_attribute *qedf_host_attrs[] = { + &dev_attr_fcoe_mac, + &dev_attr_fka_period, + NULL, +}; + +extern const struct qed_fcoe_ops *qed_ops; + +void qedf_capture_grc_dump(struct qedf_ctx *qedf) +{ + struct qedf_ctx *base_qedf; + + /* Make sure we use the base qedf to take the GRC dump */ + if (qedf_is_vport(qedf)) + base_qedf = qedf_get_base_qedf(qedf); + else + base_qedf = qedf; + + if (test_bit(QEDF_GRCDUMP_CAPTURE, &base_qedf->flags)) { + QEDF_INFO(&(base_qedf->dbg_ctx), QEDF_LOG_INFO, + "GRC Dump already captured.\n"); + return; + } + + + qedf_get_grc_dump(base_qedf->cdev, qed_ops->common, + &base_qedf->grcdump, &base_qedf->grcdump_size); + QEDF_ERR(&(base_qedf->dbg_ctx), "GRC Dump captured.\n"); + set_bit(QEDF_GRCDUMP_CAPTURE, &base_qedf->flags); + qedf_uevent_emit(base_qedf->lport->host, QEDF_UEVENT_CODE_GRCDUMP, + NULL); +} + +static ssize_t +qedf_sysfs_read_grcdump(struct file *filep, struct kobject *kobj, + struct bin_attribute *ba, char *buf, loff_t off, + size_t count) +{ + ssize_t ret = 0; + struct fc_lport *lport = shost_priv(dev_to_shost(container_of(kobj, + struct device, kobj))); + struct qedf_ctx *qedf = lport_priv(lport); + + if (test_bit(QEDF_GRCDUMP_CAPTURE, &qedf->flags)) { + ret = memory_read_from_buffer(buf, count, &off, + qedf->grcdump, qedf->grcdump_size); + } else { + QEDF_ERR(&(qedf->dbg_ctx), "GRC Dump not captured!\n"); + } + + return ret; +} + +static ssize_t +qedf_sysfs_write_grcdump(struct file *filep, struct kobject *kobj, + struct bin_attribute *ba, char *buf, loff_t off, + size_t count) +{ + struct fc_lport *lport = NULL; + struct qedf_ctx *qedf = NULL; + long reading; + int ret = 0; + char msg[40]; + + if (off != 0) + return ret; + + + lport = shost_priv(dev_to_shost(container_of(kobj, + struct device, kobj))); + qedf = lport_priv(lport); + + buf[1] = 0; + ret = kstrtol(buf, 10, &reading); + if (ret) { + QEDF_ERR(&(qedf->dbg_ctx), "Invalid input, err(%d)\n", ret); + return ret; + } + + memset(msg, 0, sizeof(msg)); + switch 
(reading) { + case 0: + memset(qedf->grcdump, 0, qedf->grcdump_size); + clear_bit(QEDF_GRCDUMP_CAPTURE, &qedf->flags); + break; + case 1: + qedf_capture_grc_dump(qedf); + break; + } + + return count; +} + +static struct bin_attribute sysfs_grcdump_attr = { + .attr = { + .name = "grcdump", + .mode = S_IRUSR | S_IWUSR, + }, + .size = 0, + .read = qedf_sysfs_read_grcdump, + .write = qedf_sysfs_write_grcdump, +}; + +static struct sysfs_bin_attrs bin_file_entries[] = { + {"grcdump", &sysfs_grcdump_attr}, + {NULL}, +}; + +void qedf_create_sysfs_ctx_attr(struct qedf_ctx *qedf) +{ + qedf_create_sysfs_attr(qedf->lport->host, bin_file_entries); +} + +void qedf_remove_sysfs_ctx_attr(struct qedf_ctx *qedf) +{ + qedf_remove_sysfs_attr(qedf->lport->host, bin_file_entries); +} diff --git a/drivers/scsi/qedf/qedf_dbg.c b/drivers/scsi/qedf/qedf_dbg.c new file mode 100644 index 0000000..bd1cef2 --- /dev/null +++ b/drivers/scsi/qedf/qedf_dbg.c @@ -0,0 +1,195 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ +#include "qedf_dbg.h" +#include + +void +qedf_dbg_err(struct qedf_dbg_ctx *qedf, const char *func, u32 line, + const char *fmt, ...) +{ + va_list va; + struct va_format vaf; + char nfunc[32]; + + memset(nfunc, 0, sizeof(nfunc)); + memcpy(nfunc, func, sizeof(nfunc) - 1); + + va_start(va, fmt); + + vaf.fmt = fmt; + vaf.va = &va; + + if (likely(qedf) && likely(qedf->pdev)) + pr_err("[%s]:[%s:%d]:%d: %pV", dev_name(&(qedf->pdev->dev)), + nfunc, line, qedf->host_no, &vaf); + else + pr_err("[0000:00:00.0]:[%s:%d]: %pV", nfunc, line, &vaf); + + va_end(va); +} + +void +qedf_dbg_warn(struct qedf_dbg_ctx *qedf, const char *func, u32 line, + const char *fmt, ...) +{ + va_list va; + struct va_format vaf; + char nfunc[32]; + + memset(nfunc, 0, sizeof(nfunc)); + memcpy(nfunc, func, sizeof(nfunc) - 1); + + va_start(va, fmt); + + vaf.fmt = fmt; + vaf.va = &va; + + if (!(qedf_debug & QEDF_LOG_WARN)) + goto ret; + + if (likely(qedf) && likely(qedf->pdev)) + pr_warn("[%s]:[%s:%d]:%d: %pV", dev_name(&(qedf->pdev->dev)), + nfunc, line, qedf->host_no, &vaf); + else + pr_warn("[0000:00:00.0]:[%s:%d]: %pV", nfunc, line, &vaf); + +ret: + va_end(va); +} + +void +qedf_dbg_notice(struct qedf_dbg_ctx *qedf, const char *func, u32 line, + const char *fmt, ...) +{ + va_list va; + struct va_format vaf; + char nfunc[32]; + + memset(nfunc, 0, sizeof(nfunc)); + memcpy(nfunc, func, sizeof(nfunc) - 1); + + va_start(va, fmt); + + vaf.fmt = fmt; + vaf.va = &va; + + if (!(qedf_debug & QEDF_LOG_NOTICE)) + goto ret; + + if (likely(qedf) && likely(qedf->pdev)) + pr_notice("[%s]:[%s:%d]:%d: %pV", + dev_name(&(qedf->pdev->dev)), nfunc, line, + qedf->host_no, &vaf); + else + pr_notice("[0000:00:00.0]:[%s:%d]: %pV", nfunc, line, &vaf); + +ret: + va_end(va); +} + +void +qedf_dbg_info(struct qedf_dbg_ctx *qedf, const char *func, u32 line, + u32 level, const char *fmt, ...) 
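+/* 'level' is a QEDF_LOG_* bit; the message is dropped unless it is set in qedf_debug */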
+{ + va_list va; + struct va_format vaf; + char nfunc[32]; + + memset(nfunc, 0, sizeof(nfunc)); + memcpy(nfunc, func, sizeof(nfunc) - 1); + + va_start(va, fmt); + + vaf.fmt = fmt; + vaf.va = &va; + + if (!(qedf_debug & level)) + goto ret; + + if (likely(qedf) && likely(qedf->pdev)) + pr_info("[%s]:[%s:%d]:%d: %pV", dev_name(&(qedf->pdev->dev)), + nfunc, line, qedf->host_no, &vaf); + else + pr_info("[0000:00:00.0]:[%s:%d]: %pV", nfunc, line, &vaf); + +ret: + va_end(va); +} + +int +qedf_alloc_grc_dump_buf(u8 **buf, uint32_t len) +{ + *buf = vmalloc(len); + if (!(*buf)) + return -ENOMEM; + + memset(*buf, 0, len); + return 0; +} + +void +qedf_free_grc_dump_buf(uint8_t **buf) +{ + vfree(*buf); + *buf = NULL; +} + +int +qedf_get_grc_dump(struct qed_dev *cdev, const struct qed_common_ops *common, + u8 **buf, uint32_t *grcsize) +{ + if (!*buf) + return -EINVAL; + + return common->dbg_grc(cdev, *buf, grcsize); +} + +void +qedf_uevent_emit(struct Scsi_Host *shost, u32 code, char *msg) +{ + char event_string[40]; + char *envp[] = {event_string, NULL}; + + memset(event_string, 0, sizeof(event_string)); + switch (code) { + case QEDF_UEVENT_CODE_GRCDUMP: + if (msg) + strscpy(event_string, msg, sizeof(event_string)); + else + sprintf(event_string, "GRCDUMP=%u", shost->host_no); + break; + default: + /* do nothing */ + break; + } + + kobject_uevent_env(&shost->shost_gendev.kobj, KOBJ_CHANGE, envp); +} + +int +qedf_create_sysfs_attr(struct Scsi_Host *shost, struct sysfs_bin_attrs *iter) +{ + int ret = 0; + + for (; iter->name; iter++) { + ret = sysfs_create_bin_file(&shost->shost_gendev.kobj, + iter->attr); + if (ret) + pr_err("Unable to create sysfs %s attr, err(%d).\n", + iter->name, ret); + } + return ret; +} + +void +qedf_remove_sysfs_attr(struct Scsi_Host *shost, struct sysfs_bin_attrs *iter) +{ + for (; iter->name; iter++) + sysfs_remove_bin_file(&shost->shost_gendev.kobj, iter->attr); +} diff --git a/drivers/scsi/qedf/qedf_dbg.h b/drivers/scsi/qedf/qedf_dbg.h new file mode 100644 index 0000000..77c27e8 --- /dev/null +++ b/drivers/scsi/qedf/qedf_dbg.h @@ -0,0 +1,160 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ +#ifndef _QEDF_DBG_H_ +#define _QEDF_DBG_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +extern uint qedf_debug; + +/* Debug print level definitions */ +#define QEDF_LOG_DEFAULT 0x1 /* Set default logging mask */ +#define QEDF_LOG_INFO 0x2 /* + * Informational logs, + * MAC address, WWPN, WWNN + */ +#define QEDF_LOG_DISC 0x4 /* Init, discovery, rport */ +#define QEDF_LOG_LL2 0x8 /* LL2, VLAN logs */ +#define QEDF_LOG_CONN 0x10 /* Connection setup, cleanup */ +#define QEDF_LOG_EVT 0x20 /* Events, link, mtu */ +#define QEDF_LOG_TIMER 0x40 /* Timer events */ +#define QEDF_LOG_MP_REQ 0x80 /* Middle Path (MP) logs */ +#define QEDF_LOG_SCSI_TM 0x100 /* SCSI Aborts, Task Mgmt */ +#define QEDF_LOG_UNSOL 0x200 /* unsolicited event logs */ +#define QEDF_LOG_IO 0x400 /* scsi cmd, completion */ +#define QEDF_LOG_MQ 0x800 /* Multi Queue logs */ +#define QEDF_LOG_BSG 0x1000 /* BSG logs */ +#define QEDF_LOG_DEBUGFS 0x2000 /* debugFS logs */ +#define QEDF_LOG_LPORT 0x4000 /* lport logs */ +#define QEDF_LOG_ELS 0x8000 /* ELS logs */ +#define QEDF_LOG_NPIV 0x10000 /* NPIV logs */ +#define QEDF_LOG_SESS 0x20000 /* Conection setup, cleanup */ +#define QEDF_LOG_TID 0x80000 /* + * FW TID context acquire + * free + */ +#define QEDF_TRACK_TID 0x100000 /* + * Track TID state. To be + * enabled only at module load + * and not run-time. + */ +#define QEDF_TRACK_CMD_LIST 0x300000 /* + * Track active cmd list nodes, + * done with reference to TID, + * hence TRACK_TID also enabled. + */ +#define QEDF_LOG_NOTICE 0x40000000 /* Notice logs */ +#define QEDF_LOG_WARN 0x80000000 /* Warning logs */ + +/* Debug context structure */ +struct qedf_dbg_ctx { + unsigned int host_no; + struct pci_dev *pdev; +#ifdef CONFIG_DEBUG_FS + struct dentry *bdf_dentry; +#endif +}; + +#define QEDF_ERR(pdev, fmt, ...) \ + qedf_dbg_err(pdev, __func__, __LINE__, fmt, ## __VA_ARGS__) +#define QEDF_WARN(pdev, fmt, ...) \ + qedf_dbg_warn(pdev, __func__, __LINE__, fmt, ## __VA_ARGS__) +#define QEDF_NOTICE(pdev, fmt, ...) \ + qedf_dbg_notice(pdev, __func__, __LINE__, fmt, ## __VA_ARGS__) +#define QEDF_INFO(pdev, level, fmt, ...) 
\ + qedf_dbg_info(pdev, __func__, __LINE__, level, fmt, \ + ## __VA_ARGS__) +__printf(4, 5) +void qedf_dbg_err(struct qedf_dbg_ctx *qedf, const char *func, u32 line, + const char *fmt, ...); +__printf(4, 5) +void qedf_dbg_warn(struct qedf_dbg_ctx *qedf, const char *func, u32 line, + const char *, ...); +__printf(4, 5) +void qedf_dbg_notice(struct qedf_dbg_ctx *qedf, const char *func, + u32 line, const char *, ...); +__printf(5, 6) +void qedf_dbg_info(struct qedf_dbg_ctx *qedf, const char *func, u32 line, + u32 info, const char *fmt, ...); + +/* GRC Dump related defines */ + +struct Scsi_Host; + +#define QEDF_UEVENT_CODE_GRCDUMP 0 + +struct sysfs_bin_attrs { + char *name; + struct bin_attribute *attr; +}; + +extern int qedf_alloc_grc_dump_buf(uint8_t **buf, uint32_t len); +extern void qedf_free_grc_dump_buf(uint8_t **buf); +extern int qedf_get_grc_dump(struct qed_dev *cdev, + const struct qed_common_ops *common, uint8_t **buf, + uint32_t *grcsize); +extern void qedf_uevent_emit(struct Scsi_Host *shost, u32 code, char *msg); +extern int qedf_create_sysfs_attr(struct Scsi_Host *shost, + struct sysfs_bin_attrs *iter); +extern void qedf_remove_sysfs_attr(struct Scsi_Host *shost, + struct sysfs_bin_attrs *iter); + +struct qedf_debugfs_ops { + char *name; + struct qedf_list_of_funcs *qedf_funcs; +}; + +extern const struct qedf_debugfs_ops qedf_debugfs_ops[]; +extern const struct file_operations qedf_dbg_fops[]; + +#ifdef CONFIG_DEBUG_FS +/* DebugFS related code */ +struct qedf_list_of_funcs { + char *oper_str; + ssize_t (*oper_func)(struct qedf_dbg_ctx *qedf); +}; + +#define qedf_dbg_fileops(drv, ops) \ +{ \ + .owner = THIS_MODULE, \ + .open = simple_open, \ + .read = drv##_dbg_##ops##_cmd_read, \ + .write = drv##_dbg_##ops##_cmd_write \ +} + +/* Used for debugfs sequential files */ +#define qedf_dbg_fileops_seq(drv, ops) \ +{ \ + .owner = THIS_MODULE, \ + .open = drv##_dbg_##ops##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +extern void qedf_dbg_host_init(struct qedf_dbg_ctx *qedf, + const struct qedf_debugfs_ops *dops, + const struct file_operations *fops); +extern void qedf_dbg_host_exit(struct qedf_dbg_ctx *qedf); +extern void qedf_dbg_init(char *drv_name); +extern void qedf_dbg_exit(void); +#endif /* CONFIG_DEBUG_FS */ + +#endif /* _QEDF_DBG_H_ */ diff --git a/drivers/scsi/qedf/qedf_debugfs.c b/drivers/scsi/qedf/qedf_debugfs.c new file mode 100644 index 0000000..c539a7a --- /dev/null +++ b/drivers/scsi/qedf/qedf_debugfs.c @@ -0,0 +1,460 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ +#ifdef CONFIG_DEBUG_FS + +#include +#include +#include + +#include "qedf.h" +#include "qedf_dbg.h" + +static struct dentry *qedf_dbg_root; + +/** + * qedf_dbg_host_init - setup the debugfs file for the pf + * @pf: the pf that is starting up + **/ +void +qedf_dbg_host_init(struct qedf_dbg_ctx *qedf, + const struct qedf_debugfs_ops *dops, + const struct file_operations *fops) +{ + char host_dirname[32]; + struct dentry *file_dentry = NULL; + + QEDF_INFO(qedf, QEDF_LOG_DEBUGFS, "Creating debugfs host node\n"); + /* create pf dir */ + sprintf(host_dirname, "host%u", qedf->host_no); + qedf->bdf_dentry = debugfs_create_dir(host_dirname, qedf_dbg_root); + if (!qedf->bdf_dentry) + return; + + /* create debugfs files */ + while (dops) { + if (!(dops->name)) + break; + + file_dentry = debugfs_create_file(dops->name, 0600, + qedf->bdf_dentry, qedf, + fops); + if (!file_dentry) { + QEDF_INFO(qedf, QEDF_LOG_DEBUGFS, + "Debugfs entry %s creation failed\n", + dops->name); + debugfs_remove_recursive(qedf->bdf_dentry); + return; + } + dops++; + fops++; + } +} + +/** + * qedf_dbg_host_exit - clear out the pf's debugfs entries + * @pf: the pf that is stopping + **/ +void +qedf_dbg_host_exit(struct qedf_dbg_ctx *qedf) +{ + QEDF_INFO(qedf, QEDF_LOG_DEBUGFS, "Destroying debugfs host " + "entry\n"); + /* remove debugfs entries of this PF */ + debugfs_remove_recursive(qedf->bdf_dentry); + qedf->bdf_dentry = NULL; +} + +/** + * qedf_dbg_init - start up debugfs for the driver + **/ +void +qedf_dbg_init(char *drv_name) +{ + QEDF_INFO(NULL, QEDF_LOG_DEBUGFS, "Creating debugfs root node\n"); + + /* create qed dir in root of debugfs. NULL means debugfs root */ + qedf_dbg_root = debugfs_create_dir(drv_name, NULL); + if (!qedf_dbg_root) + QEDF_INFO(NULL, QEDF_LOG_DEBUGFS, "Init of debugfs " + "failed\n"); +} + +/** + * qedf_dbg_exit - clean out the driver's debugfs entries + **/ +void +qedf_dbg_exit(void) +{ + QEDF_INFO(NULL, QEDF_LOG_DEBUGFS, "Destroying debugfs root " + "entry\n"); + + /* remove qed dir in root of debugfs */ + debugfs_remove_recursive(qedf_dbg_root); + qedf_dbg_root = NULL; +} + +const struct qedf_debugfs_ops qedf_debugfs_ops[] = { + { "fp_int", NULL }, + { "io_trace", NULL }, + { "debug", NULL }, + { "stop_io_on_error", NULL}, + { "driver_stats", NULL}, + { "clear_stats", NULL}, + { "offload_stats", NULL}, + /* This must be last */ + { NULL, NULL } +}; + +DECLARE_PER_CPU(struct qedf_percpu_iothread_s, qedf_percpu_iothreads); + +static ssize_t +qedf_dbg_fp_int_cmd_read(struct file *filp, char __user *buffer, size_t count, + loff_t *ppos) +{ + size_t cnt = 0; + int id; + struct qedf_fastpath *fp = NULL; + struct qedf_dbg_ctx *qedf_dbg = + (struct qedf_dbg_ctx *)filp->private_data; + struct qedf_ctx *qedf = container_of(qedf_dbg, + struct qedf_ctx, dbg_ctx); + + QEDF_INFO(qedf_dbg, QEDF_LOG_DEBUGFS, "entered\n"); + + cnt = sprintf(buffer, "\nFastpath I/O completions\n\n"); + + for (id = 0; id < qedf->num_queues; id++) { + fp = &(qedf->fp_array[id]); + if (fp->sb_id == QEDF_SB_ID_NULL) + continue; + cnt += sprintf((buffer + cnt), "#%d: %lu\n", id, + fp->completions); + } + + cnt = min_t(int, count, cnt - *ppos); + *ppos += cnt; + return cnt; +} + +static ssize_t +qedf_dbg_fp_int_cmd_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + if (!count || *ppos) + return 0; + + return count; +} + +static ssize_t +qedf_dbg_debug_cmd_read(struct file *filp, char __user *buffer, size_t count, + loff_t *ppos) +{ + int cnt; + struct qedf_dbg_ctx *qedf = + (struct qedf_dbg_ctx 
*)filp->private_data; + + QEDF_INFO(qedf, QEDF_LOG_DEBUGFS, "entered\n"); + cnt = sprintf(buffer, "debug mask = 0x%x\n", qedf_debug); + + cnt = min_t(int, count, cnt - *ppos); + *ppos += cnt; + return cnt; +} + +static ssize_t +qedf_dbg_debug_cmd_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + uint32_t val; + void *kern_buf; + int rval; + struct qedf_dbg_ctx *qedf = + (struct qedf_dbg_ctx *)filp->private_data; + + if (!count || *ppos) + return 0; + + kern_buf = memdup_user(buffer, count); + if (IS_ERR(kern_buf)) + return PTR_ERR(kern_buf); + + rval = kstrtouint(kern_buf, 10, &val); + kfree(kern_buf); + if (rval) + return rval; + + if (val == 1) + qedf_debug = QEDF_DEFAULT_LOG_MASK; + else + qedf_debug = val; + + QEDF_INFO(qedf, QEDF_LOG_DEBUGFS, "Setting debug=0x%x.\n", val); + return count; +} + +static ssize_t +qedf_dbg_stop_io_on_error_cmd_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + int cnt; + struct qedf_dbg_ctx *qedf_dbg = + (struct qedf_dbg_ctx *)filp->private_data; + struct qedf_ctx *qedf = container_of(qedf_dbg, + struct qedf_ctx, dbg_ctx); + + QEDF_INFO(qedf_dbg, QEDF_LOG_DEBUGFS, "entered\n"); + cnt = sprintf(buffer, "%s\n", + qedf->stop_io_on_error ? "true" : "false"); + + cnt = min_t(int, count, cnt - *ppos); + *ppos += cnt; + return cnt; +} + +static ssize_t +qedf_dbg_stop_io_on_error_cmd_write(struct file *filp, + const char __user *buffer, size_t count, + loff_t *ppos) +{ + void *kern_buf; + struct qedf_dbg_ctx *qedf_dbg = + (struct qedf_dbg_ctx *)filp->private_data; + struct qedf_ctx *qedf = container_of(qedf_dbg, struct qedf_ctx, + dbg_ctx); + + QEDF_INFO(qedf_dbg, QEDF_LOG_DEBUGFS, "entered\n"); + + if (!count || *ppos) + return 0; + + kern_buf = memdup_user(buffer, 6); + if (IS_ERR(kern_buf)) + return PTR_ERR(kern_buf); + + if (strncmp(kern_buf, "false", 5) == 0) + qedf->stop_io_on_error = false; + else if (strncmp(kern_buf, "true", 4) == 0) + qedf->stop_io_on_error = true; + else if (strncmp(kern_buf, "now", 3) == 0) + /* Trigger from user to stop all I/O on this host */ + set_bit(QEDF_DBG_STOP_IO, &qedf->flags); + + kfree(kern_buf); + return count; +} + +static int +qedf_io_trace_show(struct seq_file *s, void *unused) +{ + int i, idx = 0; + struct qedf_ctx *qedf = s->private; + struct qedf_dbg_ctx *qedf_dbg = &qedf->dbg_ctx; + struct qedf_io_log *io_log; + unsigned long flags; + + if (!qedf_io_tracing) { + seq_puts(s, "I/O tracing not enabled.\n"); + goto out; + } + + QEDF_INFO(qedf_dbg, QEDF_LOG_DEBUGFS, "entered\n"); + + spin_lock_irqsave(&qedf->io_trace_lock, flags); + idx = qedf->io_trace_idx; + for (i = 0; i < QEDF_IO_TRACE_SIZE; i++) { + io_log = &qedf->io_trace_buf[idx]; + seq_printf(s, "%d:", io_log->direction); + seq_printf(s, "0x%x:", io_log->task_id); + seq_printf(s, "0x%06x:", io_log->port_id); + seq_printf(s, "%d:", io_log->lun); + seq_printf(s, "0x%02x:", io_log->op); + seq_printf(s, "0x%02x%02x%02x%02x:", io_log->lba[0], + io_log->lba[1], io_log->lba[2], io_log->lba[3]); + seq_printf(s, "%d:", io_log->bufflen); + seq_printf(s, "%d:", io_log->sg_count); + seq_printf(s, "0x%08x:", io_log->result); + seq_printf(s, "%lu:", io_log->jiffies); + seq_printf(s, "%d:", io_log->refcount); + seq_printf(s, "%d:", io_log->req_cpu); + seq_printf(s, "%d:", io_log->int_cpu); + seq_printf(s, "%d:", io_log->rsp_cpu); + seq_printf(s, "%d\n", io_log->sge_type); + + idx++; + if (idx == QEDF_IO_TRACE_SIZE) + idx = 0; + } + spin_unlock_irqrestore(&qedf->io_trace_lock, flags); + +out: + return 0; +} + 
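+/*
+ * Note on the io_trace node above: each record is printed as one
+ * colon-separated line in the order direction, task_id, port_id, lun, op,
+ * lba[0-3], bufflen, sg_count, result, jiffies, refcount, req_cpu, int_cpu,
+ * rsp_cpu, sge_type.
+ *
+ * The open helper below uses single_open(), which stores the qedf_ctx
+ * pointer in the seq_file's private field so qedf_io_trace_show() can
+ * retrieve it via s->private on each read.
+ */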
+static int +qedf_dbg_io_trace_open(struct inode *inode, struct file *file) +{ + struct qedf_dbg_ctx *qedf_dbg = inode->i_private; + struct qedf_ctx *qedf = container_of(qedf_dbg, + struct qedf_ctx, dbg_ctx); + + return single_open(file, qedf_io_trace_show, qedf); +} + +static int +qedf_driver_stats_show(struct seq_file *s, void *unused) +{ + struct qedf_ctx *qedf = s->private; + struct qedf_rport *fcport; + struct fc_rport_priv *rdata; + + seq_printf(s, "cmg_mgr free io_reqs: %d\n", + atomic_read(&qedf->cmd_mgr->free_list_cnt)); + seq_printf(s, "slow SGEs: %d\n", qedf->slow_sge_ios); + seq_printf(s, "single SGEs: %d\n", qedf->single_sge_ios); + seq_printf(s, "fast SGEs: %d\n\n", qedf->fast_sge_ios); + + seq_puts(s, "Offloaded ports:\n\n"); + + rcu_read_lock(); + list_for_each_entry_rcu(fcport, &qedf->fcports, peers) { + rdata = fcport->rdata; + if (rdata == NULL) + continue; + seq_printf(s, "%06x: free_sqes: %d, num_active_ios: %d\n", + rdata->ids.port_id, atomic_read(&fcport->free_sqes), + atomic_read(&fcport->num_active_ios)); + } + rcu_read_unlock(); + + return 0; +} + +static int +qedf_dbg_driver_stats_open(struct inode *inode, struct file *file) +{ + struct qedf_dbg_ctx *qedf_dbg = inode->i_private; + struct qedf_ctx *qedf = container_of(qedf_dbg, + struct qedf_ctx, dbg_ctx); + + return single_open(file, qedf_driver_stats_show, qedf); +} + +static ssize_t +qedf_dbg_clear_stats_cmd_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + int cnt = 0; + + /* Essentially a read stub */ + cnt = min_t(int, count, cnt - *ppos); + *ppos += cnt; + return cnt; +} + +static ssize_t +qedf_dbg_clear_stats_cmd_write(struct file *filp, + const char __user *buffer, size_t count, + loff_t *ppos) +{ + struct qedf_dbg_ctx *qedf_dbg = + (struct qedf_dbg_ctx *)filp->private_data; + struct qedf_ctx *qedf = container_of(qedf_dbg, struct qedf_ctx, + dbg_ctx); + + QEDF_INFO(qedf_dbg, QEDF_LOG_DEBUGFS, "Clearing stat counters.\n"); + + if (!count || *ppos) + return 0; + + /* Clear stat counters exposed by 'stats' node */ + qedf->slow_sge_ios = 0; + qedf->single_sge_ios = 0; + qedf->fast_sge_ios = 0; + + return count; +} + +static int +qedf_offload_stats_show(struct seq_file *s, void *unused) +{ + struct qedf_ctx *qedf = s->private; + struct qed_fcoe_stats *fw_fcoe_stats; + + fw_fcoe_stats = kmalloc(sizeof(struct qed_fcoe_stats), GFP_KERNEL); + if (!fw_fcoe_stats) { + QEDF_ERR(&(qedf->dbg_ctx), "Could not allocate memory for " + "fw_fcoe_stats.\n"); + goto out; + } + + /* Query firmware for offload stats */ + qed_ops->get_stats(qedf->cdev, fw_fcoe_stats); + + seq_printf(s, "fcoe_rx_byte_cnt=%llu\n" + "fcoe_rx_data_pkt_cnt=%llu\n" + "fcoe_rx_xfer_pkt_cnt=%llu\n" + "fcoe_rx_other_pkt_cnt=%llu\n" + "fcoe_silent_drop_pkt_cmdq_full_cnt=%u\n" + "fcoe_silent_drop_pkt_crc_error_cnt=%u\n" + "fcoe_silent_drop_pkt_task_invalid_cnt=%u\n" + "fcoe_silent_drop_total_pkt_cnt=%u\n" + "fcoe_silent_drop_pkt_rq_full_cnt=%u\n" + "fcoe_tx_byte_cnt=%llu\n" + "fcoe_tx_data_pkt_cnt=%llu\n" + "fcoe_tx_xfer_pkt_cnt=%llu\n" + "fcoe_tx_other_pkt_cnt=%llu\n", + fw_fcoe_stats->fcoe_rx_byte_cnt, + fw_fcoe_stats->fcoe_rx_data_pkt_cnt, + fw_fcoe_stats->fcoe_rx_xfer_pkt_cnt, + fw_fcoe_stats->fcoe_rx_other_pkt_cnt, + fw_fcoe_stats->fcoe_silent_drop_pkt_cmdq_full_cnt, + fw_fcoe_stats->fcoe_silent_drop_pkt_crc_error_cnt, + fw_fcoe_stats->fcoe_silent_drop_pkt_task_invalid_cnt, + fw_fcoe_stats->fcoe_silent_drop_total_pkt_cnt, + fw_fcoe_stats->fcoe_silent_drop_pkt_rq_full_cnt, + fw_fcoe_stats->fcoe_tx_byte_cnt, + 
fw_fcoe_stats->fcoe_tx_data_pkt_cnt, + fw_fcoe_stats->fcoe_tx_xfer_pkt_cnt, + fw_fcoe_stats->fcoe_tx_other_pkt_cnt); + + kfree(fw_fcoe_stats); +out: + return 0; +} + +static int +qedf_dbg_offload_stats_open(struct inode *inode, struct file *file) +{ + struct qedf_dbg_ctx *qedf_dbg = inode->i_private; + struct qedf_ctx *qedf = container_of(qedf_dbg, + struct qedf_ctx, dbg_ctx); + + return single_open(file, qedf_offload_stats_show, qedf); +} + + +const struct file_operations qedf_dbg_fops[] = { + qedf_dbg_fileops(qedf, fp_int), + qedf_dbg_fileops_seq(qedf, io_trace), + qedf_dbg_fileops(qedf, debug), + qedf_dbg_fileops(qedf, stop_io_on_error), + qedf_dbg_fileops_seq(qedf, driver_stats), + qedf_dbg_fileops(qedf, clear_stats), + qedf_dbg_fileops_seq(qedf, offload_stats), + /* This must be last */ + { }, +}; + +#else /* CONFIG_DEBUG_FS */ +void qedf_dbg_host_init(struct qedf_dbg_ctx *); +void qedf_dbg_host_exit(struct qedf_dbg_ctx *); +void qedf_dbg_init(char *); +void qedf_dbg_exit(void); +#endif /* CONFIG_DEBUG_FS */ diff --git a/drivers/scsi/qedf/qedf_els.c b/drivers/scsi/qedf/qedf_els.c new file mode 100644 index 0000000..aa22b11 --- /dev/null +++ b/drivers/scsi/qedf/qedf_els.c @@ -0,0 +1,966 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ +#include "qedf.h" + +/* It's assumed that the lock is held when calling this function. */ +static int qedf_initiate_els(struct qedf_rport *fcport, unsigned int op, + void *data, uint32_t data_len, + void (*cb_func)(struct qedf_els_cb_arg *cb_arg), + struct qedf_els_cb_arg *cb_arg, uint32_t timer_msec) +{ + struct qedf_ctx *qedf = fcport->qedf; + struct fc_lport *lport = qedf->lport; + struct qedf_ioreq *els_req; + struct qedf_mp_req *mp_req; + struct fc_frame_header *fc_hdr; + struct e4_fcoe_task_context *task; + int rc = 0; + uint32_t did, sid; + uint16_t xid; + uint32_t start_time = jiffies / HZ; + uint32_t current_time; + struct fcoe_wqe *sqe; + unsigned long flags; + u16 sqe_idx; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Sending ELS\n"); + + rc = fc_remote_port_chkready(fcport->rport); + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "els 0x%x: rport not ready\n", op); + rc = -EAGAIN; + goto els_err; + } + if (lport->state != LPORT_ST_READY || !(lport->link_up)) { + QEDF_ERR(&(qedf->dbg_ctx), "els 0x%x: link is not ready\n", + op); + rc = -EAGAIN; + goto els_err; + } + + if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + QEDF_ERR(&(qedf->dbg_ctx), "els 0x%x: fcport not ready\n", op); + rc = -EINVAL; + goto els_err; + } + +retry_els: + els_req = qedf_alloc_cmd(fcport, QEDF_ELS); + if (!els_req) { + current_time = jiffies / HZ; + if ((current_time - start_time) > 10) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "els: Failed els 0x%x\n", op); + rc = -ENOMEM; + goto els_err; + } + mdelay(20 * USEC_PER_MSEC); + goto retry_els; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "initiate_els els_req = " + "0x%p cb_arg = %p xid = %x\n", els_req, cb_arg, + els_req->xid); + els_req->sc_cmd = NULL; + els_req->cmd_type = QEDF_ELS; + els_req->fcport = fcport; + els_req->cb_func = cb_func; + cb_arg->io_req = els_req; + cb_arg->op = op; + els_req->cb_arg = cb_arg; + els_req->data_xfer_len = data_len; + + /* Record which cpu this request is associated with */ + els_req->cpu = smp_processor_id(); + + mp_req = (struct qedf_mp_req 
*)&(els_req->mp_req); + rc = qedf_init_mp_req(els_req); + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "ELS MP request init failed\n"); + kref_put(&els_req->refcount, qedf_release_cmd); + goto els_err; + } else { + rc = 0; + } + + /* Fill ELS Payload */ + if ((op >= ELS_LS_RJT) && (op <= ELS_AUTH_ELS)) { + memcpy(mp_req->req_buf, data, data_len); + } else { + QEDF_ERR(&(qedf->dbg_ctx), "Invalid ELS op 0x%x\n", op); + els_req->cb_func = NULL; + els_req->cb_arg = NULL; + kref_put(&els_req->refcount, qedf_release_cmd); + rc = -EINVAL; + } + + if (rc) + goto els_err; + + /* Fill FC header */ + fc_hdr = &(mp_req->req_fc_hdr); + + did = fcport->rdata->ids.port_id; + sid = fcport->sid; + + __fc_fill_fc_hdr(fc_hdr, FC_RCTL_ELS_REQ, did, sid, + FC_TYPE_ELS, FC_FC_FIRST_SEQ | FC_FC_END_SEQ | + FC_FC_SEQ_INIT, 0); + + /* Obtain exchange id */ + xid = els_req->xid; + + spin_lock_irqsave(&fcport->rport_lock, flags); + + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + + /* Initialize task context for this IO request */ + task = qedf_get_task_mem(&qedf->tasks, xid); + qedf_init_mp_task(els_req, task, sqe); + + /* Put timer on original I/O request */ + if (timer_msec) + qedf_cmd_timer_set(qedf, els_req, timer_msec); + + /* Ring doorbell */ + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Ringing doorbell for ELS " + "req\n"); + qedf_ring_doorbell(fcport); + spin_unlock_irqrestore(&fcport->rport_lock, flags); +els_err: + return rc; +} + +void qedf_process_els_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *els_req) +{ + struct fcoe_task_context *task_ctx; + struct scsi_cmnd *sc_cmd; + uint16_t xid; + struct fcoe_cqe_midpath_info *mp_info; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Entered with xid = 0x%x" + " cmd_type = %d.\n", els_req->xid, els_req->cmd_type); + + /* Kill the ELS timer */ + cancel_delayed_work(&els_req->timeout_work); + + xid = els_req->xid; + task_ctx = qedf_get_task_mem(&qedf->tasks, xid); + sc_cmd = els_req->sc_cmd; + + /* Get ELS response length from CQE */ + mp_info = &cqe->cqe_info.midpath_info; + els_req->mp_req.resp_len = mp_info->data_placement_size; + + /* Parse ELS response */ + if ((els_req->cb_func) && (els_req->cb_arg)) { + els_req->cb_func(els_req->cb_arg); + els_req->cb_arg = NULL; + } + + kref_put(&els_req->refcount, qedf_release_cmd); +} + +static void qedf_rrq_compl(struct qedf_els_cb_arg *cb_arg) +{ + struct qedf_ioreq *orig_io_req; + struct qedf_ioreq *rrq_req; + struct qedf_ctx *qedf; + int refcount; + + rrq_req = cb_arg->io_req; + qedf = rrq_req->fcport->qedf; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Entered.\n"); + + orig_io_req = cb_arg->aborted_io_req; + + if (!orig_io_req) + goto out_free; + + if (rrq_req->event != QEDF_IOREQ_EV_ELS_TMO && + rrq_req->event != QEDF_IOREQ_EV_ELS_ERR_DETECT) + cancel_delayed_work_sync(&orig_io_req->timeout_work); + + refcount = kref_read(&orig_io_req->refcount); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "rrq_compl: orig io = %p," + " orig xid = 0x%x, rrq_xid = 0x%x, refcount=%d\n", + orig_io_req, orig_io_req->xid, rrq_req->xid, refcount); + + /* This should return the aborted io_req to the command pool */ + if (orig_io_req) + kref_put(&orig_io_req->refcount, qedf_release_cmd); + +out_free: + kfree(cb_arg); +} + +/* Assumes kref is already held by caller */ +int qedf_send_rrq(struct qedf_ioreq *aborted_io_req) +{ + + struct fc_els_rrq rrq; + struct qedf_rport *fcport; + struct fc_lport *lport; + struct qedf_els_cb_arg *cb_arg = NULL; + struct 
qedf_ctx *qedf; + uint32_t sid; + uint32_t r_a_tov; + int rc; + + if (!aborted_io_req) { + QEDF_ERR(NULL, "abort_io_req is NULL.\n"); + return -EINVAL; + } + + fcport = aborted_io_req->fcport; + + /* Check that fcport is still offloaded */ + if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + QEDF_ERR(NULL, "fcport is no longer offloaded.\n"); + return -EINVAL; + } + + if (!fcport->qedf) { + QEDF_ERR(NULL, "fcport->qedf is NULL.\n"); + return -EINVAL; + } + + qedf = fcport->qedf; + lport = qedf->lport; + sid = fcport->sid; + r_a_tov = lport->r_a_tov; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Sending RRQ orig " + "io = %p, orig_xid = 0x%x\n", aborted_io_req, + aborted_io_req->xid); + memset(&rrq, 0, sizeof(rrq)); + + cb_arg = kzalloc(sizeof(struct qedf_els_cb_arg), GFP_NOIO); + if (!cb_arg) { + QEDF_ERR(&(qedf->dbg_ctx), "Unable to allocate cb_arg for " + "RRQ\n"); + rc = -ENOMEM; + goto rrq_err; + } + + cb_arg->aborted_io_req = aborted_io_req; + + rrq.rrq_cmd = ELS_RRQ; + hton24(rrq.rrq_s_id, sid); + rrq.rrq_ox_id = htons(aborted_io_req->xid); + rrq.rrq_rx_id = + htons(aborted_io_req->task->tstorm_st_context.read_write.rx_id); + + rc = qedf_initiate_els(fcport, ELS_RRQ, &rrq, sizeof(rrq), + qedf_rrq_compl, cb_arg, r_a_tov); + +rrq_err: + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "RRQ failed - release orig io " + "req 0x%x\n", aborted_io_req->xid); + kfree(cb_arg); + kref_put(&aborted_io_req->refcount, qedf_release_cmd); + } + return rc; +} + +static void qedf_process_l2_frame_compl(struct qedf_rport *fcport, + struct fc_frame *fp, + u16 l2_oxid) +{ + struct fc_lport *lport = fcport->qedf->lport; + struct fc_frame_header *fh; + u32 crc; + + fh = (struct fc_frame_header *)fc_frame_header_get(fp); + + /* Set the OXID we return to what libfc used */ + if (l2_oxid != FC_XID_UNKNOWN) + fh->fh_ox_id = htons(l2_oxid); + + /* Setup header fields */ + fh->fh_r_ctl = FC_RCTL_ELS_REP; + fh->fh_type = FC_TYPE_ELS; + /* Last sequence, end sequence */ + fh->fh_f_ctl[0] = 0x98; + hton24(fh->fh_d_id, lport->port_id); + hton24(fh->fh_s_id, fcport->rdata->ids.port_id); + fh->fh_rx_id = 0xffff; + + /* Set frame attributes */ + crc = fcoe_fc_crc(fp); + fc_frame_init(fp); + fr_dev(fp) = lport; + fr_sof(fp) = FC_SOF_I3; + fr_eof(fp) = FC_EOF_T; + fr_crc(fp) = cpu_to_le32(~crc); + + /* Send completed request to libfc */ + fc_exch_recv(lport, fp); +} + +/* + * In instances where an ELS command times out we may need to restart the + * rport by logging out and then logging back in. + */ +void qedf_restart_rport(struct qedf_rport *fcport) +{ + struct fc_lport *lport; + struct fc_rport_priv *rdata; + u32 port_id; + + if (!fcport) + return; + + rdata = fcport->rdata; + if (rdata) { + lport = fcport->qedf->lport; + port_id = rdata->ids.port_id; + QEDF_ERR(&(fcport->qedf->dbg_ctx), + "LOGO port_id=%x.\n", port_id); + fc_rport_logoff(rdata); + /* Recreate the rport and log back in */ + rdata = fc_rport_create(lport, port_id); + if (rdata) + fc_rport_login(rdata); + } +} + +static void qedf_l2_els_compl(struct qedf_els_cb_arg *cb_arg) +{ + struct qedf_ioreq *els_req; + struct qedf_rport *fcport; + struct qedf_mp_req *mp_req; + struct fc_frame *fp; + struct fc_frame_header *fh, *mp_fc_hdr; + void *resp_buf, *fc_payload; + u32 resp_len; + u16 l2_oxid; + + l2_oxid = cb_arg->l2_oxid; + els_req = cb_arg->io_req; + + if (!els_req) { + QEDF_ERR(NULL, "els_req is NULL.\n"); + goto free_arg; + } + + /* + * If we are flushing the command just free the cb_arg as none of the + * response data will be valid. 
+ */ + if (els_req->event == QEDF_IOREQ_EV_ELS_FLUSH) + goto free_arg; + + fcport = els_req->fcport; + mp_req = &(els_req->mp_req); + mp_fc_hdr = &(mp_req->resp_fc_hdr); + resp_len = mp_req->resp_len; + resp_buf = mp_req->resp_buf; + + /* + * If a middle path ELS command times out, don't try to return + * the command but rather do any internal cleanup and then libfc + * timeout the command and clean up its internal resources. + */ + if (els_req->event == QEDF_IOREQ_EV_ELS_TMO) { + /* + * If ADISC times out, libfc will timeout the exchange and then + * try to send a PLOGI which will timeout since the session is + * still offloaded. Force libfc to logout the session which + * will offload the connection and allow the PLOGI response to + * flow over the LL2 path. + */ + if (cb_arg->op == ELS_ADISC) + qedf_restart_rport(fcport); + return; + } + + if (sizeof(struct fc_frame_header) + resp_len > QEDF_PAGE_SIZE) { + QEDF_ERR(&(fcport->qedf->dbg_ctx), "resp_len is " + "beyond page size.\n"); + goto free_arg; + } + + fp = fc_frame_alloc(fcport->qedf->lport, resp_len); + if (!fp) { + QEDF_ERR(&(fcport->qedf->dbg_ctx), + "fc_frame_alloc failure.\n"); + return; + } + + /* Copy frame header from firmware into fp */ + fh = (struct fc_frame_header *)fc_frame_header_get(fp); + memcpy(fh, mp_fc_hdr, sizeof(struct fc_frame_header)); + + /* Copy payload from firmware into fp */ + fc_payload = fc_frame_payload_get(fp, resp_len); + memcpy(fc_payload, resp_buf, resp_len); + + QEDF_INFO(&(fcport->qedf->dbg_ctx), QEDF_LOG_ELS, + "Completing OX_ID 0x%x back to libfc.\n", l2_oxid); + qedf_process_l2_frame_compl(fcport, fp, l2_oxid); + +free_arg: + kfree(cb_arg); +} + +int qedf_send_adisc(struct qedf_rport *fcport, struct fc_frame *fp) +{ + struct fc_els_adisc *adisc; + struct fc_frame_header *fh; + struct fc_lport *lport = fcport->qedf->lport; + struct qedf_els_cb_arg *cb_arg = NULL; + struct qedf_ctx *qedf; + uint32_t r_a_tov = lport->r_a_tov; + int rc; + + qedf = fcport->qedf; + fh = fc_frame_header_get(fp); + + cb_arg = kzalloc(sizeof(struct qedf_els_cb_arg), GFP_NOIO); + if (!cb_arg) { + QEDF_ERR(&(qedf->dbg_ctx), "Unable to allocate cb_arg for " + "ADISC\n"); + rc = -ENOMEM; + goto adisc_err; + } + cb_arg->l2_oxid = ntohs(fh->fh_ox_id); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "Sending ADISC ox_id=0x%x.\n", cb_arg->l2_oxid); + + adisc = fc_frame_payload_get(fp, sizeof(*adisc)); + + rc = qedf_initiate_els(fcport, ELS_ADISC, adisc, sizeof(*adisc), + qedf_l2_els_compl, cb_arg, r_a_tov); + +adisc_err: + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "ADISC failed.\n"); + kfree(cb_arg); + } + return rc; +} + +static void qedf_srr_compl(struct qedf_els_cb_arg *cb_arg) +{ + struct qedf_ioreq *orig_io_req; + struct qedf_ioreq *srr_req; + struct qedf_mp_req *mp_req; + struct fc_frame_header *mp_fc_hdr, *fh; + struct fc_frame *fp; + void *resp_buf, *fc_payload; + u32 resp_len; + struct fc_lport *lport; + struct qedf_ctx *qedf; + int refcount; + u8 opcode; + + srr_req = cb_arg->io_req; + qedf = srr_req->fcport->qedf; + lport = qedf->lport; + + orig_io_req = cb_arg->aborted_io_req; + + if (!orig_io_req) + goto out_free; + + clear_bit(QEDF_CMD_SRR_SENT, &orig_io_req->flags); + + if (srr_req->event != QEDF_IOREQ_EV_ELS_TMO && + srr_req->event != QEDF_IOREQ_EV_ELS_ERR_DETECT) + cancel_delayed_work_sync(&orig_io_req->timeout_work); + + refcount = kref_read(&orig_io_req->refcount); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Entered: orig_io=%p," + " orig_io_xid=0x%x, rec_xid=0x%x, refcount=%d\n", + orig_io_req, 
orig_io_req->xid, srr_req->xid, refcount); + + /* If a SRR times out, simply free resources */ + if (srr_req->event == QEDF_IOREQ_EV_ELS_TMO) + goto out_put; + + /* Normalize response data into struct fc_frame */ + mp_req = &(srr_req->mp_req); + mp_fc_hdr = &(mp_req->resp_fc_hdr); + resp_len = mp_req->resp_len; + resp_buf = mp_req->resp_buf; + + fp = fc_frame_alloc(lport, resp_len); + if (!fp) { + QEDF_ERR(&(qedf->dbg_ctx), + "fc_frame_alloc failure.\n"); + goto out_put; + } + + /* Copy frame header from firmware into fp */ + fh = (struct fc_frame_header *)fc_frame_header_get(fp); + memcpy(fh, mp_fc_hdr, sizeof(struct fc_frame_header)); + + /* Copy payload from firmware into fp */ + fc_payload = fc_frame_payload_get(fp, resp_len); + memcpy(fc_payload, resp_buf, resp_len); + + opcode = fc_frame_payload_op(fp); + switch (opcode) { + case ELS_LS_ACC: + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "SRR success.\n"); + break; + case ELS_LS_RJT: + QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_ELS, + "SRR rejected.\n"); + qedf_initiate_abts(orig_io_req, true); + break; + } + + fc_frame_free(fp); +out_put: + /* Put reference for original command since SRR completed */ + kref_put(&orig_io_req->refcount, qedf_release_cmd); +out_free: + kfree(cb_arg); +} + +static int qedf_send_srr(struct qedf_ioreq *orig_io_req, u32 offset, u8 r_ctl) +{ + struct fcp_srr srr; + struct qedf_ctx *qedf; + struct qedf_rport *fcport; + struct fc_lport *lport; + struct qedf_els_cb_arg *cb_arg = NULL; + u32 sid, r_a_tov; + int rc; + + if (!orig_io_req) { + QEDF_ERR(NULL, "orig_io_req is NULL.\n"); + return -EINVAL; + } + + fcport = orig_io_req->fcport; + + /* Check that fcport is still offloaded */ + if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + QEDF_ERR(NULL, "fcport is no longer offloaded.\n"); + return -EINVAL; + } + + if (!fcport->qedf) { + QEDF_ERR(NULL, "fcport->qedf is NULL.\n"); + return -EINVAL; + } + + /* Take reference until SRR command completion */ + kref_get(&orig_io_req->refcount); + + qedf = fcport->qedf; + lport = qedf->lport; + sid = fcport->sid; + r_a_tov = lport->r_a_tov; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Sending SRR orig_io=%p, " + "orig_xid=0x%x\n", orig_io_req, orig_io_req->xid); + memset(&srr, 0, sizeof(srr)); + + cb_arg = kzalloc(sizeof(struct qedf_els_cb_arg), GFP_NOIO); + if (!cb_arg) { + QEDF_ERR(&(qedf->dbg_ctx), "Unable to allocate cb_arg for " + "SRR\n"); + rc = -ENOMEM; + goto srr_err; + } + + cb_arg->aborted_io_req = orig_io_req; + + srr.srr_op = ELS_SRR; + srr.srr_ox_id = htons(orig_io_req->xid); + srr.srr_rx_id = htons(orig_io_req->rx_id); + srr.srr_rel_off = htonl(offset); + srr.srr_r_ctl = r_ctl; + + rc = qedf_initiate_els(fcport, ELS_SRR, &srr, sizeof(srr), + qedf_srr_compl, cb_arg, r_a_tov); + +srr_err: + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "SRR failed - release orig_io_req" + "=0x%x\n", orig_io_req->xid); + kfree(cb_arg); + /* If we fail to queue SRR, send ABTS to orig_io */ + qedf_initiate_abts(orig_io_req, true); + kref_put(&orig_io_req->refcount, qedf_release_cmd); + } else + /* Tell other threads that SRR is in progress */ + set_bit(QEDF_CMD_SRR_SENT, &orig_io_req->flags); + + return rc; +} + +static void qedf_initiate_seq_cleanup(struct qedf_ioreq *orig_io_req, + u32 offset, u8 r_ctl) +{ + struct qedf_rport *fcport; + unsigned long flags; + struct qedf_els_cb_arg *cb_arg; + struct fcoe_wqe *sqe; + u16 sqe_idx; + + fcport = orig_io_req->fcport; + + QEDF_INFO(&(fcport->qedf->dbg_ctx), QEDF_LOG_ELS, + "Doing sequence cleanup for xid=0x%x offset=%u.\n", + 
orig_io_req->xid, offset); + + cb_arg = kzalloc(sizeof(struct qedf_els_cb_arg), GFP_NOIO); + if (!cb_arg) { + QEDF_ERR(&(fcport->qedf->dbg_ctx), "Unable to allocate cb_arg " + "for sequence cleanup\n"); + return; + } + + /* Get reference for cleanup request */ + kref_get(&orig_io_req->refcount); + + orig_io_req->cmd_type = QEDF_SEQ_CLEANUP; + cb_arg->offset = offset; + cb_arg->r_ctl = r_ctl; + orig_io_req->cb_arg = cb_arg; + + qedf_cmd_timer_set(fcport->qedf, orig_io_req, + QEDF_CLEANUP_TIMEOUT * HZ); + + spin_lock_irqsave(&fcport->rport_lock, flags); + + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + orig_io_req->task_params->sqe = sqe; + + init_initiator_sequence_recovery_fcoe_task(orig_io_req->task_params, + offset); + qedf_ring_doorbell(fcport); + + spin_unlock_irqrestore(&fcport->rport_lock, flags); +} + +void qedf_process_seq_cleanup_compl(struct qedf_ctx *qedf, + struct fcoe_cqe *cqe, struct qedf_ioreq *io_req) +{ + int rc; + struct qedf_els_cb_arg *cb_arg; + + cb_arg = io_req->cb_arg; + + /* If we timed out just free resources */ + if (io_req->event == QEDF_IOREQ_EV_ELS_TMO || !cqe) + goto free; + + /* Kill the timer we put on the request */ + cancel_delayed_work_sync(&io_req->timeout_work); + + rc = qedf_send_srr(io_req, cb_arg->offset, cb_arg->r_ctl); + if (rc) + QEDF_ERR(&(qedf->dbg_ctx), "Unable to send SRR, I/O will " + "abort, xid=0x%x.\n", io_req->xid); +free: + kfree(cb_arg); + kref_put(&io_req->refcount, qedf_release_cmd); +} + +static bool qedf_requeue_io_req(struct qedf_ioreq *orig_io_req) +{ + struct qedf_rport *fcport; + struct qedf_ioreq *new_io_req; + unsigned long flags; + bool rc = false; + + fcport = orig_io_req->fcport; + if (!fcport) { + QEDF_ERR(NULL, "fcport is NULL.\n"); + goto out; + } + + if (!orig_io_req->sc_cmd) { + QEDF_ERR(&(fcport->qedf->dbg_ctx), "sc_cmd is NULL for " + "xid=0x%x.\n", orig_io_req->xid); + goto out; + } + + new_io_req = qedf_alloc_cmd(fcport, QEDF_SCSI_CMD); + if (!new_io_req) { + QEDF_ERR(&(fcport->qedf->dbg_ctx), "Could not allocate new " + "io_req.\n"); + goto out; + } + + new_io_req->sc_cmd = orig_io_req->sc_cmd; + + /* + * This keeps the sc_cmd struct from being returned to the tape + * driver and being requeued twice. We do need to put a reference + * for the original I/O request since we will not do a SCSI completion + * for it. + */ + orig_io_req->sc_cmd = NULL; + kref_put(&orig_io_req->refcount, qedf_release_cmd); + + spin_lock_irqsave(&fcport->rport_lock, flags); + + /* kref for new command released in qedf_post_io_req on error */ + if (qedf_post_io_req(fcport, new_io_req)) { + QEDF_ERR(&(fcport->qedf->dbg_ctx), "Unable to post io_req\n"); + /* Return SQE to pool */ + atomic_inc(&fcport->free_sqes); + } else { + QEDF_INFO(&(fcport->qedf->dbg_ctx), QEDF_LOG_ELS, + "Reissued SCSI command from orig_xid=0x%x on " + "new_xid=0x%x.\n", orig_io_req->xid, new_io_req->xid); + /* + * Abort the original I/O but do not return SCSI command as + * it has been reissued on another OX_ID. 
+ */ + spin_unlock_irqrestore(&fcport->rport_lock, flags); + qedf_initiate_abts(orig_io_req, false); + goto out; + } + + spin_unlock_irqrestore(&fcport->rport_lock, flags); +out: + return rc; +} + + +static void qedf_rec_compl(struct qedf_els_cb_arg *cb_arg) +{ + struct qedf_ioreq *orig_io_req; + struct qedf_ioreq *rec_req; + struct qedf_mp_req *mp_req; + struct fc_frame_header *mp_fc_hdr, *fh; + struct fc_frame *fp; + void *resp_buf, *fc_payload; + u32 resp_len; + struct fc_lport *lport; + struct qedf_ctx *qedf; + int refcount; + enum fc_rctl r_ctl; + struct fc_els_ls_rjt *rjt; + struct fc_els_rec_acc *acc; + u8 opcode; + u32 offset, e_stat; + struct scsi_cmnd *sc_cmd; + bool srr_needed = false; + + rec_req = cb_arg->io_req; + qedf = rec_req->fcport->qedf; + lport = qedf->lport; + + orig_io_req = cb_arg->aborted_io_req; + + if (!orig_io_req) + goto out_free; + + if (rec_req->event != QEDF_IOREQ_EV_ELS_TMO && + rec_req->event != QEDF_IOREQ_EV_ELS_ERR_DETECT) + cancel_delayed_work_sync(&orig_io_req->timeout_work); + + refcount = kref_read(&orig_io_req->refcount); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Entered: orig_io=%p," + " orig_io_xid=0x%x, rec_xid=0x%x, refcount=%d\n", + orig_io_req, orig_io_req->xid, rec_req->xid, refcount); + + /* If a REC times out, free resources */ + if (rec_req->event == QEDF_IOREQ_EV_ELS_TMO) + goto out_put; + + /* Normalize response data into struct fc_frame */ + mp_req = &(rec_req->mp_req); + mp_fc_hdr = &(mp_req->resp_fc_hdr); + resp_len = mp_req->resp_len; + acc = resp_buf = mp_req->resp_buf; + + fp = fc_frame_alloc(lport, resp_len); + if (!fp) { + QEDF_ERR(&(qedf->dbg_ctx), + "fc_frame_alloc failure.\n"); + goto out_put; + } + + /* Copy frame header from firmware into fp */ + fh = (struct fc_frame_header *)fc_frame_header_get(fp); + memcpy(fh, mp_fc_hdr, sizeof(struct fc_frame_header)); + + /* Copy payload from firmware into fp */ + fc_payload = fc_frame_payload_get(fp, resp_len); + memcpy(fc_payload, resp_buf, resp_len); + + opcode = fc_frame_payload_op(fp); + if (opcode == ELS_LS_RJT) { + rjt = fc_frame_payload_get(fp, sizeof(*rjt)); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "Received LS_RJT for REC: er_reason=0x%x, " + "er_explan=0x%x.\n", rjt->er_reason, rjt->er_explan); + /* + * The following response(s) mean that we need to reissue the + * request on another exchange. We need to do this without + * informing the upper layers lest it cause an application + * error. 
+ */ + if ((rjt->er_reason == ELS_RJT_LOGIC || + rjt->er_reason == ELS_RJT_UNAB) && + rjt->er_explan == ELS_EXPL_OXID_RXID) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "Handle CMD LOST case.\n"); + qedf_requeue_io_req(orig_io_req); + } + } else if (opcode == ELS_LS_ACC) { + offset = ntohl(acc->reca_fc4value); + e_stat = ntohl(acc->reca_e_stat); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "Received LS_ACC for REC: offset=0x%x, e_stat=0x%x.\n", + offset, e_stat); + if (e_stat & ESB_ST_SEQ_INIT) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "Target has the seq init\n"); + goto out_free_frame; + } + sc_cmd = orig_io_req->sc_cmd; + if (!sc_cmd) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "sc_cmd is NULL for xid=0x%x.\n", + orig_io_req->xid); + goto out_free_frame; + } + /* SCSI write case */ + if (sc_cmd->sc_data_direction == DMA_TO_DEVICE) { + if (offset == orig_io_req->data_xfer_len) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "WRITE - response lost.\n"); + r_ctl = FC_RCTL_DD_CMD_STATUS; + srr_needed = true; + offset = 0; + } else { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "WRITE - XFER_RDY/DATA lost.\n"); + r_ctl = FC_RCTL_DD_DATA_DESC; + /* Use data from warning CQE instead of REC */ + offset = orig_io_req->tx_buf_off; + } + /* SCSI read case */ + } else { + if (orig_io_req->rx_buf_off == + orig_io_req->data_xfer_len) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "READ - response lost.\n"); + srr_needed = true; + r_ctl = FC_RCTL_DD_CMD_STATUS; + offset = 0; + } else { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "READ - DATA lost.\n"); + /* + * For read case we always set the offset to 0 + * for sequence recovery task. + */ + offset = 0; + r_ctl = FC_RCTL_DD_SOL_DATA; + } + } + + if (srr_needed) + qedf_send_srr(orig_io_req, offset, r_ctl); + else + qedf_initiate_seq_cleanup(orig_io_req, offset, r_ctl); + } + +out_free_frame: + fc_frame_free(fp); +out_put: + /* Put reference for original command since REC completed */ + kref_put(&orig_io_req->refcount, qedf_release_cmd); +out_free: + kfree(cb_arg); +} + +/* Assumes kref is already held by caller */ +int qedf_send_rec(struct qedf_ioreq *orig_io_req) +{ + + struct fc_els_rec rec; + struct qedf_rport *fcport; + struct fc_lport *lport; + struct qedf_els_cb_arg *cb_arg = NULL; + struct qedf_ctx *qedf; + uint32_t sid; + uint32_t r_a_tov; + int rc; + + if (!orig_io_req) { + QEDF_ERR(NULL, "orig_io_req is NULL.\n"); + return -EINVAL; + } + + fcport = orig_io_req->fcport; + + /* Check that fcport is still offloaded */ + if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + QEDF_ERR(NULL, "fcport is no longer offloaded.\n"); + return -EINVAL; + } + + if (!fcport->qedf) { + QEDF_ERR(NULL, "fcport->qedf is NULL.\n"); + return -EINVAL; + } + + /* Take reference until REC command completion */ + kref_get(&orig_io_req->refcount); + + qedf = fcport->qedf; + lport = qedf->lport; + sid = fcport->sid; + r_a_tov = lport->r_a_tov; + + memset(&rec, 0, sizeof(rec)); + + cb_arg = kzalloc(sizeof(struct qedf_els_cb_arg), GFP_NOIO); + if (!cb_arg) { + QEDF_ERR(&(qedf->dbg_ctx), "Unable to allocate cb_arg for " + "REC\n"); + rc = -ENOMEM; + goto rec_err; + } + + cb_arg->aborted_io_req = orig_io_req; + + rec.rec_cmd = ELS_REC; + hton24(rec.rec_s_id, sid); + rec.rec_ox_id = htons(orig_io_req->xid); + rec.rec_rx_id = + htons(orig_io_req->task->tstorm_st_context.read_write.rx_id); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Sending REC orig_io=%p, " + "orig_xid=0x%x rx_id=0x%x\n", orig_io_req, + orig_io_req->xid, rec.rec_rx_id); + rc = 
qedf_initiate_els(fcport, ELS_REC, &rec, sizeof(rec), + qedf_rec_compl, cb_arg, r_a_tov); + +rec_err: + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "REC failed - release orig_io_req" + "=0x%x\n", orig_io_req->xid); + kfree(cb_arg); + kref_put(&orig_io_req->refcount, qedf_release_cmd); + } + return rc; +} diff --git a/drivers/scsi/qedf/qedf_fip.c b/drivers/scsi/qedf/qedf_fip.c new file mode 100644 index 0000000..773558f --- /dev/null +++ b/drivers/scsi/qedf/qedf_fip.c @@ -0,0 +1,246 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ +#include +#include +#include "qedf.h" + +extern const struct qed_fcoe_ops *qed_ops; +/* + * FIP VLAN functions that will eventually move to libfcoe. + */ + +void qedf_fcoe_send_vlan_req(struct qedf_ctx *qedf) +{ + struct sk_buff *skb; + char *eth_fr; + int fr_len; + struct fip_vlan *vlan; +#define MY_FIP_ALL_FCF_MACS ((__u8[6]) { 1, 0x10, 0x18, 1, 0, 2 }) + static u8 my_fcoe_all_fcfs[ETH_ALEN] = MY_FIP_ALL_FCF_MACS; + + skb = dev_alloc_skb(sizeof(struct fip_vlan)); + if (!skb) + return; + + fr_len = sizeof(*vlan); + eth_fr = (char *)skb->data; + vlan = (struct fip_vlan *)eth_fr; + + memset(vlan, 0, sizeof(*vlan)); + ether_addr_copy(vlan->eth.h_source, qedf->mac); + ether_addr_copy(vlan->eth.h_dest, my_fcoe_all_fcfs); + vlan->eth.h_proto = htons(ETH_P_FIP); + + vlan->fip.fip_ver = FIP_VER_ENCAPS(FIP_VER); + vlan->fip.fip_op = htons(FIP_OP_VLAN); + vlan->fip.fip_subcode = FIP_SC_VL_REQ; + vlan->fip.fip_dl_len = htons(sizeof(vlan->desc) / FIP_BPW); + + vlan->desc.mac.fd_desc.fip_dtype = FIP_DT_MAC; + vlan->desc.mac.fd_desc.fip_dlen = sizeof(vlan->desc.mac) / FIP_BPW; + ether_addr_copy(vlan->desc.mac.fd_mac, qedf->mac); + + vlan->desc.wwnn.fd_desc.fip_dtype = FIP_DT_NAME; + vlan->desc.wwnn.fd_desc.fip_dlen = sizeof(vlan->desc.wwnn) / FIP_BPW; + put_unaligned_be64(qedf->lport->wwnn, &vlan->desc.wwnn.fd_wwn); + + skb_put(skb, sizeof(*vlan)); + skb->protocol = htons(ETH_P_FIP); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "Sending FIP VLAN " + "request."); + + if (atomic_read(&qedf->link_state) != QEDF_LINK_UP) { + QEDF_WARN(&(qedf->dbg_ctx), "Cannot send vlan request " + "because link is not up.\n"); + + kfree_skb(skb); + return; + } + qed_ops->ll2->start_xmit(qedf->cdev, skb); +} + +static void qedf_fcoe_process_vlan_resp(struct qedf_ctx *qedf, + struct sk_buff *skb) +{ + struct fip_header *fiph; + struct fip_desc *desc; + u16 vid = 0; + ssize_t rlen; + size_t dlen; + + fiph = (struct fip_header *)(((void *)skb->data) + 2 * ETH_ALEN + 2); + + rlen = ntohs(fiph->fip_dl_len) * 4; + desc = (struct fip_desc *)(fiph + 1); + while (rlen > 0) { + dlen = desc->fip_dlen * FIP_BPW; + switch (desc->fip_dtype) { + case FIP_DT_VLAN: + vid = ntohs(((struct fip_vlan_desc *)desc)->fd_vlan); + break; + } + desc = (struct fip_desc *)((char *)desc + dlen); + rlen -= dlen; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "VLAN response, " + "vid=0x%x.\n", vid); + + if (vid > 0 && qedf->vlan_id != vid) { + qedf_set_vlan_id(qedf, vid); + + /* Inform waiter that it's ok to call fcoe_ctlr_link up() */ + if (!completion_done(&qedf->fipvlan_compl)) + complete(&qedf->fipvlan_compl); + } +} + +void qedf_fip_send(struct fcoe_ctlr *fip, struct sk_buff *skb) +{ + struct qedf_ctx *qedf = container_of(fip, struct qedf_ctx, 
ctlr); + struct ethhdr *eth_hdr; + struct fip_header *fiph; + u16 op, vlan_tci = 0; + u8 sub; + + if (!test_bit(QEDF_LL2_STARTED, &qedf->flags)) { + QEDF_WARN(&(qedf->dbg_ctx), "LL2 not started\n"); + kfree_skb(skb); + return; + } + + fiph = (struct fip_header *) ((void *)skb->data + 2 * ETH_ALEN + 2); + eth_hdr = (struct ethhdr *)skb_mac_header(skb); + op = ntohs(fiph->fip_op); + sub = fiph->fip_subcode; + + /* + * Add VLAN tag to non-offload FIP frame based on current stored VLAN + * for FIP/FCoE traffic. + */ + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), qedf->vlan_id); + + /* Get VLAN ID from skb for printing purposes */ + __vlan_hwaccel_get_tag(skb, &vlan_tci); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, "FIP frame send: " + "dest=%pM op=%x sub=%x vlan=%04x.", eth_hdr->h_dest, op, sub, + ntohs(vlan_tci)); + if (qedf_dump_frames) + print_hex_dump(KERN_WARNING, "fip ", DUMP_PREFIX_OFFSET, 16, 1, + skb->data, skb->len, false); + + qed_ops->ll2->start_xmit(qedf->cdev, skb); +} + +/* Process incoming FIP frames. */ +void qedf_fip_recv(struct qedf_ctx *qedf, struct sk_buff *skb) +{ + struct ethhdr *eth_hdr; + struct fip_header *fiph; + struct fip_desc *desc; + struct fip_mac_desc *mp; + struct fip_wwn_desc *wp; + struct fip_vn_desc *vp; + size_t rlen, dlen; + u16 op; + u8 sub; + bool do_reset = false; + + eth_hdr = (struct ethhdr *)skb_mac_header(skb); + fiph = (struct fip_header *) ((void *)skb->data + 2 * ETH_ALEN + 2); + op = ntohs(fiph->fip_op); + sub = fiph->fip_subcode; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, "FIP frame received: " + "skb=%p fiph=%p source=%pM op=%x sub=%x", skb, fiph, + eth_hdr->h_source, op, sub); + if (qedf_dump_frames) + print_hex_dump(KERN_WARNING, "fip ", DUMP_PREFIX_OFFSET, 16, 1, + skb->data, skb->len, false); + + /* Handle FIP VLAN resp in the driver */ + if (op == FIP_OP_VLAN && sub == FIP_SC_VL_NOTE) { + qedf_fcoe_process_vlan_resp(qedf, skb); + kfree_skb(skb); + } else if (op == FIP_OP_CTRL && sub == FIP_SC_CLR_VLINK) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "Clear virtual " + "link received.\n"); + + /* Check that an FCF has been selected by fcoe */ + if (qedf->ctlr.sel_fcf == NULL) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Dropping CVL since FCF has not been selected " + "yet."); + return; + } + + /* + * We need to loop through the CVL descriptors to determine + * if we want to reset the fcoe link + */ + rlen = ntohs(fiph->fip_dl_len) * FIP_BPW; + desc = (struct fip_desc *)(fiph + 1); + while (rlen >= sizeof(*desc)) { + dlen = desc->fip_dlen * FIP_BPW; + switch (desc->fip_dtype) { + case FIP_DT_MAC: + mp = (struct fip_mac_desc *)desc; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, + "fd_mac=%pM\n", mp->fd_mac); + if (ether_addr_equal(mp->fd_mac, + qedf->ctlr.sel_fcf->fcf_mac)) + do_reset = true; + break; + case FIP_DT_NAME: + wp = (struct fip_wwn_desc *)desc; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, + "fc_wwpn=%016llx.\n", + get_unaligned_be64(&wp->fd_wwn)); + break; + case FIP_DT_VN_ID: + vp = (struct fip_vn_desc *)desc; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, + "fd_fc_id=%x.\n", ntoh24(vp->fd_fc_id)); + if (ntoh24(vp->fd_fc_id) == + qedf->lport->port_id) + do_reset = true; + break; + default: + /* Ignore anything else */ + break; + } + desc = (struct fip_desc *)((char *)desc + dlen); + rlen -= dlen; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, + "do_reset=%d.\n", do_reset); + if (do_reset) { + fcoe_ctlr_link_down(&qedf->ctlr); + qedf_wait_for_upload(qedf); + fcoe_ctlr_link_up(&qedf->ctlr); + } + kfree_skb(skb); + } 
else { + /* Everything else is handled by libfcoe */ + __skb_pull(skb, ETH_HLEN); + fcoe_ctlr_recv(&qedf->ctlr, skb); + } +} + +u8 *qedf_get_src_mac(struct fc_lport *lport) +{ + struct qedf_ctx *qedf = lport_priv(lport); + + return qedf->data_src_addr; +} diff --git a/drivers/scsi/qedf/qedf_hsi.h b/drivers/scsi/qedf/qedf_hsi.h new file mode 100644 index 0000000..503c1ae --- /dev/null +++ b/drivers/scsi/qedf/qedf_hsi.h @@ -0,0 +1,354 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ +#ifndef __QEDF_HSI__ +#define __QEDF_HSI__ +/* + * Add include to common target + */ +#include + +/* + * Add include to common storage target + */ +#include + +/* + * Add include to common fcoe target for both eCore and protocol driver + */ +#include + + +/* + * FCoE CQ element ABTS information + */ +struct fcoe_abts_info { + u8 r_ctl /* R_CTL in the ABTS response frame */; + u8 reserved0; + __le16 rx_id; + __le32 reserved2[2]; + __le32 fc_payload[3] /* ABTS FC payload response frame */; +}; + + +/* + * FCoE class type + */ +enum fcoe_class_type { + FCOE_TASK_CLASS_TYPE_3, + FCOE_TASK_CLASS_TYPE_2, + MAX_FCOE_CLASS_TYPE +}; + + +/* + * FCoE CMDQ element control information + */ +struct fcoe_cmdqe_control { + __le16 conn_id; + u8 num_additional_cmdqes; + u8 cmdType; + /* true for ABTS request cmdqe. used in Target mode */ +#define FCOE_CMDQE_CONTROL_ABTSREQCMD_MASK 0x1 +#define FCOE_CMDQE_CONTROL_ABTSREQCMD_SHIFT 0 +#define FCOE_CMDQE_CONTROL_RESERVED1_MASK 0x7F +#define FCOE_CMDQE_CONTROL_RESERVED1_SHIFT 1 + u8 reserved2[4]; +}; + +/* + * FCoE control + payload CMDQ element + */ +struct fcoe_cmdqe { + struct fcoe_cmdqe_control hdr; + u8 fc_header[24]; + __le32 fcp_cmd_payload[8]; +}; + + + +/* + * FCP RSP flags + */ +struct fcoe_fcp_rsp_flags { + u8 flags; +#define FCOE_FCP_RSP_FLAGS_FCP_RSP_LEN_VALID_MASK 0x1 +#define FCOE_FCP_RSP_FLAGS_FCP_RSP_LEN_VALID_SHIFT 0 +#define FCOE_FCP_RSP_FLAGS_FCP_SNS_LEN_VALID_MASK 0x1 +#define FCOE_FCP_RSP_FLAGS_FCP_SNS_LEN_VALID_SHIFT 1 +#define FCOE_FCP_RSP_FLAGS_FCP_RESID_OVER_MASK 0x1 +#define FCOE_FCP_RSP_FLAGS_FCP_RESID_OVER_SHIFT 2 +#define FCOE_FCP_RSP_FLAGS_FCP_RESID_UNDER_MASK 0x1 +#define FCOE_FCP_RSP_FLAGS_FCP_RESID_UNDER_SHIFT 3 +#define FCOE_FCP_RSP_FLAGS_FCP_CONF_REQ_MASK 0x1 +#define FCOE_FCP_RSP_FLAGS_FCP_CONF_REQ_SHIFT 4 +#define FCOE_FCP_RSP_FLAGS_FCP_BIDI_FLAGS_MASK 0x7 +#define FCOE_FCP_RSP_FLAGS_FCP_BIDI_FLAGS_SHIFT 5 +}; + +/* + * FCoE CQ element response information + */ +struct fcoe_cqe_rsp_info { + struct fcoe_fcp_rsp_flags rsp_flags; + u8 scsi_status_code; + __le16 retry_delay_timer; + __le32 fcp_resid; + __le32 fcp_sns_len; + __le32 fcp_rsp_len; + __le16 rx_id; + u8 fw_error_flags; +#define FCOE_CQE_RSP_INFO_FW_UNDERRUN_MASK 0x1 /* FW detected underrun */ +#define FCOE_CQE_RSP_INFO_FW_UNDERRUN_SHIFT 0 +#define FCOE_CQE_RSP_INFO_RESREVED_MASK 0x7F +#define FCOE_CQE_RSP_INFO_RESREVED_SHIFT 1 + u8 reserved; + __le32 fw_residual /* Residual bytes calculated by FW */; +}; + +/* + * FCoE CQ element Target completion information + */ +struct fcoe_cqe_target_info { + __le16 rx_id; + __le16 reserved0; + __le32 reserved1[5]; +}; + +/* + * FCoE error/warning reporting entry + */ +struct fcoe_err_report_entry { + __le32 err_warn_bitmap_lo /* Error bitmap lower 32 bits */; + __le32 err_warn_bitmap_hi /* Error bitmap higher 32 bits */; + /* Buffer 
offset the beginning of the Sequence last transmitted */ + __le32 tx_buf_off; + /* Buffer offset from the beginning of the Sequence last received */ + __le32 rx_buf_off; + __le16 rx_id /* RX_ID of the associated task */; + __le16 reserved1; + __le32 reserved2; +}; + +/* + * FCoE CQ element middle path information + */ +struct fcoe_cqe_midpath_info { + __le32 data_placement_size; + __le16 rx_id; + __le16 reserved0; + __le32 reserved1[4]; +}; + +/* + * FCoE CQ element unsolicited information + */ +struct fcoe_unsolic_info { + /* BD information: Physical address and opaque data */ + struct scsi_bd bd_info; + __le16 conn_id /* Connection ID the frame is associated to */; + __le16 pkt_len /* Packet length */; + u8 reserved1[4]; +}; + +/* + * FCoE warning reporting entry + */ +struct fcoe_warning_report_entry { + /* BD information: Physical address and opaque data */ + struct scsi_bd bd_info; + /* Buffer offset the beginning of the Sequence last transmitted */ + __le32 buf_off; + __le16 rx_id /* RX_ID of the associated task */; + __le16 reserved1; +}; + +/* + * FCoE CQ element information + */ +union fcoe_cqe_info { + struct fcoe_cqe_rsp_info rsp_info /* Response completion information */; + /* Target completion information */ + struct fcoe_cqe_target_info target_info; + /* Error completion information */ + struct fcoe_err_report_entry err_info; + struct fcoe_abts_info abts_info /* ABTS completion information */; + /* Middle path completion information */ + struct fcoe_cqe_midpath_info midpath_info; + /* Unsolicited packet completion information */ + struct fcoe_unsolic_info unsolic_info; + /* Warning completion information (Rec Tov expiration) */ + struct fcoe_warning_report_entry warn_info; +}; + +/* + * FCoE CQ element + */ +struct fcoe_cqe { + __le32 cqe_data; + /* The task identifier (OX_ID) to be completed */ +#define FCOE_CQE_TASK_ID_MASK 0xFFFF +#define FCOE_CQE_TASK_ID_SHIFT 0 + /* + * The CQE type: 0x0 Indicating on a pending work request completion. + * 0x1 - Indicating on an unsolicited event notification. 
use enum + * fcoe_cqe_type (use enum fcoe_cqe_type) + */ +#define FCOE_CQE_CQE_TYPE_MASK 0xF +#define FCOE_CQE_CQE_TYPE_SHIFT 16 +#define FCOE_CQE_RESERVED0_MASK 0xFFF +#define FCOE_CQE_RESERVED0_SHIFT 20 + __le16 reserved1; + __le16 fw_cq_prod; + union fcoe_cqe_info cqe_info; +}; + +/* + * FCoE CQE type + */ +enum fcoe_cqe_type { + /* solicited response on a R/W or middle-path SQE */ + FCOE_GOOD_COMPLETION_CQE_TYPE, + FCOE_UNSOLIC_CQE_TYPE /* unsolicited packet, RQ consumed */, + FCOE_ERROR_DETECTION_CQE_TYPE /* timer expiration, validation error */, + FCOE_WARNING_CQE_TYPE /* rec_tov or rr_tov timer expiration */, + FCOE_EXCH_CLEANUP_CQE_TYPE /* task cleanup completed */, + FCOE_ABTS_CQE_TYPE /* ABTS received and task cleaned */, + FCOE_DUMMY_CQE_TYPE /* just increment SQ CONS */, + /* Task was completed wight after sending a pkt to the target */ + FCOE_LOCAL_COMP_CQE_TYPE, + MAX_FCOE_CQE_TYPE +}; + +/* + * FCoE fast path error codes + */ +enum fcoe_fp_error_warning_code { + FCOE_ERROR_CODE_XFER_OOO_RO /* XFER error codes */, + FCOE_ERROR_CODE_XFER_RO_NOT_ALIGNED, + FCOE_ERROR_CODE_XFER_NULL_BURST_LEN, + FCOE_ERROR_CODE_XFER_RO_GREATER_THAN_DATA2TRNS, + FCOE_ERROR_CODE_XFER_INVALID_PAYLOAD_SIZE, + FCOE_ERROR_CODE_XFER_TASK_TYPE_NOT_WRITE, + FCOE_ERROR_CODE_XFER_PEND_XFER_SET, + FCOE_ERROR_CODE_XFER_OPENED_SEQ, + FCOE_ERROR_CODE_XFER_FCTL, + FCOE_ERROR_CODE_FCP_RSP_BIDI_FLAGS_SET /* FCP RSP error codes */, + FCOE_ERROR_CODE_FCP_RSP_INVALID_LENGTH_FIELD, + FCOE_ERROR_CODE_FCP_RSP_INVALID_SNS_FIELD, + FCOE_ERROR_CODE_FCP_RSP_INVALID_PAYLOAD_SIZE, + FCOE_ERROR_CODE_FCP_RSP_PEND_XFER_SET, + FCOE_ERROR_CODE_FCP_RSP_OPENED_SEQ, + FCOE_ERROR_CODE_FCP_RSP_FCTL, + FCOE_ERROR_CODE_FCP_RSP_LAST_SEQ_RESET, + FCOE_ERROR_CODE_FCP_RSP_CONF_REQ_NOT_SUPPORTED_YET, + FCOE_ERROR_CODE_DATA_OOO_RO /* FCP DATA error codes */, + FCOE_ERROR_CODE_DATA_EXCEEDS_DEFINED_MAX_FRAME_SIZE, + FCOE_ERROR_CODE_DATA_EXCEEDS_DATA2TRNS, + FCOE_ERROR_CODE_DATA_SOFI3_SEQ_ACTIVE_SET, + FCOE_ERROR_CODE_DATA_SOFN_SEQ_ACTIVE_RESET, + FCOE_ERROR_CODE_DATA_EOFN_END_SEQ_SET, + FCOE_ERROR_CODE_DATA_EOFT_END_SEQ_RESET, + FCOE_ERROR_CODE_DATA_TASK_TYPE_NOT_READ, + FCOE_ERROR_CODE_DATA_FCTL_INITIATIR, + FCOE_ERROR_CODE_MIDPATH_INVALID_TYPE /* Middle path error codes */, + FCOE_ERROR_CODE_MIDPATH_SOFI3_SEQ_ACTIVE_SET, + FCOE_ERROR_CODE_MIDPATH_SOFN_SEQ_ACTIVE_RESET, + FCOE_ERROR_CODE_MIDPATH_EOFN_END_SEQ_SET, + FCOE_ERROR_CODE_MIDPATH_EOFT_END_SEQ_RESET, + FCOE_ERROR_CODE_MIDPATH_REPLY_FCTL, + FCOE_ERROR_CODE_MIDPATH_INVALID_REPLY, + FCOE_ERROR_CODE_MIDPATH_ELS_REPLY_RCTL, + FCOE_ERROR_CODE_COMMON_MIDDLE_FRAME_WITH_PAD /* Common error codes */, + FCOE_ERROR_CODE_COMMON_SEQ_INIT_IN_TCE, + FCOE_ERROR_CODE_COMMON_FC_HDR_RX_ID_MISMATCH, + FCOE_ERROR_CODE_COMMON_INCORRECT_SEQ_CNT, + FCOE_ERROR_CODE_COMMON_DATA_FC_HDR_FCP_TYPE_MISMATCH, + FCOE_ERROR_CODE_COMMON_DATA_NO_MORE_SGES, + FCOE_ERROR_CODE_COMMON_OPTIONAL_FC_HDR, + FCOE_ERROR_CODE_COMMON_READ_TCE_OX_ID_TOO_BIG, + FCOE_ERROR_CODE_COMMON_DATA_WAS_NOT_TRANSMITTED, + FCOE_ERROR_CODE_COMMON_TASK_DDF_RCTL_INFO_FIELD, + FCOE_ERROR_CODE_COMMON_TASK_INVALID_RCTL, + FCOE_ERROR_CODE_COMMON_TASK_RCTL_GENERAL_MISMATCH, + FCOE_ERROR_CODE_E_D_TOV_TIMER_EXPIRATION /* Timer error codes */, + FCOE_WARNING_CODE_REC_TOV_TIMER_EXPIRATION /* Timer error codes */, + FCOE_ERROR_CODE_RR_TOV_TIMER_EXPIRATION /* Timer error codes */, + /* ABTSrsp pckt arrived unexpected */ + FCOE_ERROR_CODE_ABTS_REPLY_UNEXPECTED, + FCOE_ERROR_CODE_TARGET_MODE_FCP_RSP, + FCOE_ERROR_CODE_TARGET_MODE_FCP_XFER, + 
FCOE_ERROR_CODE_TARGET_MODE_DATA_TASK_TYPE_NOT_WRITE, + FCOE_ERROR_CODE_DATA_FCTL_TARGET, + FCOE_ERROR_CODE_TARGET_DATA_SIZE_NO_MATCH_XFER, + FCOE_ERROR_CODE_TARGET_DIF_CRC_CHECKSUM_ERROR, + FCOE_ERROR_CODE_TARGET_DIF_REF_TAG_ERROR, + FCOE_ERROR_CODE_TARGET_DIF_APP_TAG_ERROR, + MAX_FCOE_FP_ERROR_WARNING_CODE +}; + + +/* + * FCoE RESPQ element + */ +struct fcoe_respqe { + __le16 ox_id /* OX_ID that is located in the FCP_RSP FC header */; + __le16 rx_id /* RX_ID that is located in the FCP_RSP FC header */; + __le32 additional_info; +/* PARAM that is located in the FCP_RSP FC header */ +#define FCOE_RESPQE_PARAM_MASK 0xFFFFFF +#define FCOE_RESPQE_PARAM_SHIFT 0 +/* Indication whther its Target-auto-rsp mode or not */ +#define FCOE_RESPQE_TARGET_AUTO_RSP_MASK 0xFF +#define FCOE_RESPQE_TARGET_AUTO_RSP_SHIFT 24 +}; + + +/* + * FCoE slow path error codes + */ +enum fcoe_sp_error_code { + /* Error codes for Error Reporting in slow path flows */ + FCOE_ERROR_CODE_SLOW_PATH_TOO_MANY_FUNCS, + FCOE_ERROR_SLOW_PATH_CODE_NO_LICENSE, + MAX_FCOE_SP_ERROR_CODE +}; + +/* + * FCoE task TX state + */ +enum fcoe_task_tx_state { + /* Initiate state after driver has initialized the task */ + FCOE_TASK_TX_STATE_NORMAL, + /* Updated by TX path after complete transmitting unsolicited packet */ + FCOE_TASK_TX_STATE_UNSOLICITED_COMPLETED, + /* + * Updated by TX path after start processing the task requesting the + * cleanup/abort operation + */ + FCOE_TASK_TX_STATE_CLEAN_REQ, + FCOE_TASK_TX_STATE_ABTS /* Updated by TX path during abort procedure */, + /* Updated by TX path during exchange cleanup procedure */ + FCOE_TASK_TX_STATE_EXCLEANUP, + /* + * Updated by TX path during exchange cleanup continuation task + * procedure + */ + FCOE_TASK_TX_STATE_EXCLEANUP_TARGET_WRITE_CONT, + /* Updated by TX path during exchange cleanup first xfer procedure */ + FCOE_TASK_TX_STATE_EXCLEANUP_TARGET_WRITE, + /* Updated by TX path during exchange cleanup read task in Target */ + FCOE_TASK_TX_STATE_EXCLEANUP_TARGET_READ_OR_RSP, + /* Updated by TX path during target exchange cleanup procedure */ + FCOE_TASK_TX_STATE_EXCLEANUP_TARGET_WRITE_LAST_CYCLE, + /* Updated by TX path during sequence recovery procedure */ + FCOE_TASK_TX_STATE_SEQRECOVERY, + MAX_FCOE_TASK_TX_STATE +}; + +#endif /* __QEDF_HSI__ */ diff --git a/drivers/scsi/qedf/qedf_io.c b/drivers/scsi/qedf/qedf_io.c new file mode 100644 index 0000000..50a50c4 --- /dev/null +++ b/drivers/scsi/qedf/qedf_io.c @@ -0,0 +1,2098 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ +#include +#include +#include "qedf.h" +#include + +void qedf_cmd_timer_set(struct qedf_ctx *qedf, struct qedf_ioreq *io_req, + unsigned int timer_msec) +{ + queue_delayed_work(qedf->timer_work_queue, &io_req->timeout_work, + msecs_to_jiffies(timer_msec)); +} + +static void qedf_cmd_timeout(struct work_struct *work) +{ + + struct qedf_ioreq *io_req = + container_of(work, struct qedf_ioreq, timeout_work.work); + struct qedf_ctx *qedf = io_req->fcport->qedf; + struct qedf_rport *fcport = io_req->fcport; + u8 op = 0; + + switch (io_req->cmd_type) { + case QEDF_ABTS: + QEDF_ERR((&qedf->dbg_ctx), "ABTS timeout, xid=0x%x.\n", + io_req->xid); + /* Cleanup timed out ABTS */ + qedf_initiate_cleanup(io_req, true); + complete(&io_req->abts_done); + + /* + * Need to call kref_put for reference taken when initiate_abts + * was called since abts_compl won't be called now that we've + * cleaned up the task. + */ + kref_put(&io_req->refcount, qedf_release_cmd); + + /* + * Now that the original I/O and the ABTS are complete see + * if we need to reconnect to the target. + */ + qedf_restart_rport(fcport); + break; + case QEDF_ELS: + kref_get(&io_req->refcount); + /* + * Don't attempt to clean an ELS timeout as any subseqeunt + * ABTS or cleanup requests just hang. For now just free + * the resources of the original I/O and the RRQ + */ + QEDF_ERR(&(qedf->dbg_ctx), "ELS timeout, xid=0x%x.\n", + io_req->xid); + io_req->event = QEDF_IOREQ_EV_ELS_TMO; + /* Call callback function to complete command */ + if (io_req->cb_func && io_req->cb_arg) { + op = io_req->cb_arg->op; + io_req->cb_func(io_req->cb_arg); + io_req->cb_arg = NULL; + } + qedf_initiate_cleanup(io_req, true); + kref_put(&io_req->refcount, qedf_release_cmd); + break; + case QEDF_SEQ_CLEANUP: + QEDF_ERR(&(qedf->dbg_ctx), "Sequence cleanup timeout, " + "xid=0x%x.\n", io_req->xid); + qedf_initiate_cleanup(io_req, true); + io_req->event = QEDF_IOREQ_EV_ELS_TMO; + qedf_process_seq_cleanup_compl(qedf, NULL, io_req); + break; + default: + break; + } +} + +void qedf_cmd_mgr_free(struct qedf_cmd_mgr *cmgr) +{ + struct io_bdt *bdt_info; + struct qedf_ctx *qedf = cmgr->qedf; + size_t bd_tbl_sz; + u16 min_xid = QEDF_MIN_XID; + u16 max_xid = (FCOE_PARAMS_NUM_TASKS - 1); + int num_ios; + int i; + struct qedf_ioreq *io_req; + + num_ios = max_xid - min_xid + 1; + + /* Free fcoe_bdt_ctx structures */ + if (!cmgr->io_bdt_pool) + goto free_cmd_pool; + + bd_tbl_sz = QEDF_MAX_BDS_PER_CMD * sizeof(struct scsi_sge); + for (i = 0; i < num_ios; i++) { + bdt_info = cmgr->io_bdt_pool[i]; + if (bdt_info->bd_tbl) { + dma_free_coherent(&qedf->pdev->dev, bd_tbl_sz, + bdt_info->bd_tbl, bdt_info->bd_tbl_dma); + bdt_info->bd_tbl = NULL; + } + } + + /* Destroy io_bdt pool */ + for (i = 0; i < num_ios; i++) { + kfree(cmgr->io_bdt_pool[i]); + cmgr->io_bdt_pool[i] = NULL; + } + + kfree(cmgr->io_bdt_pool); + cmgr->io_bdt_pool = NULL; + +free_cmd_pool: + + for (i = 0; i < num_ios; i++) { + io_req = &cmgr->cmds[i]; + kfree(io_req->sgl_task_params); + kfree(io_req->task_params); + /* Make sure we free per command sense buffer */ + if (io_req->sense_buffer) + dma_free_coherent(&qedf->pdev->dev, + QEDF_SCSI_SENSE_BUFFERSIZE, io_req->sense_buffer, + io_req->sense_buffer_dma); + cancel_delayed_work_sync(&io_req->rrq_work); + } + + /* Free command manager itself */ + vfree(cmgr); +} + +static void qedf_handle_rrq(struct work_struct *work) +{ + struct qedf_ioreq *io_req = + container_of(work, struct qedf_ioreq, rrq_work.work); + + qedf_send_rrq(io_req); + +} + +struct qedf_cmd_mgr 
*qedf_cmd_mgr_alloc(struct qedf_ctx *qedf) +{ + struct qedf_cmd_mgr *cmgr; + struct io_bdt *bdt_info; + struct qedf_ioreq *io_req; + u16 xid; + int i; + int num_ios; + u16 min_xid = QEDF_MIN_XID; + u16 max_xid = (FCOE_PARAMS_NUM_TASKS - 1); + + /* Make sure num_queues is already set before calling this function */ + if (!qedf->num_queues) { + QEDF_ERR(&(qedf->dbg_ctx), "num_queues is not set.\n"); + return NULL; + } + + if (max_xid <= min_xid || max_xid == FC_XID_UNKNOWN) { + QEDF_WARN(&(qedf->dbg_ctx), "Invalid min_xid 0x%x and " + "max_xid 0x%x.\n", min_xid, max_xid); + return NULL; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "min xid 0x%x, max xid " + "0x%x.\n", min_xid, max_xid); + + num_ios = max_xid - min_xid + 1; + + cmgr = vzalloc(sizeof(struct qedf_cmd_mgr)); + if (!cmgr) { + QEDF_WARN(&(qedf->dbg_ctx), "Failed to alloc cmd mgr.\n"); + return NULL; + } + + cmgr->qedf = qedf; + spin_lock_init(&cmgr->lock); + + /* + * Initialize I/O request fields. + */ + xid = QEDF_MIN_XID; + + for (i = 0; i < num_ios; i++) { + io_req = &cmgr->cmds[i]; + INIT_DELAYED_WORK(&io_req->timeout_work, qedf_cmd_timeout); + + io_req->xid = xid++; + + INIT_DELAYED_WORK(&io_req->rrq_work, qedf_handle_rrq); + + /* Allocate DMA memory to hold sense buffer */ + io_req->sense_buffer = dma_alloc_coherent(&qedf->pdev->dev, + QEDF_SCSI_SENSE_BUFFERSIZE, &io_req->sense_buffer_dma, + GFP_KERNEL); + if (!io_req->sense_buffer) + goto mem_err; + + /* Allocate task parameters to pass to f/w init funcions */ + io_req->task_params = kzalloc(sizeof(*io_req->task_params), + GFP_KERNEL); + if (!io_req->task_params) { + QEDF_ERR(&(qedf->dbg_ctx), + "Failed to allocate task_params for xid=0x%x\n", + i); + goto mem_err; + } + + /* + * Allocate scatter/gather list info to pass to f/w init + * functions. 
+ */ + io_req->sgl_task_params = kzalloc( + sizeof(struct scsi_sgl_task_params), GFP_KERNEL); + if (!io_req->sgl_task_params) { + QEDF_ERR(&(qedf->dbg_ctx), + "Failed to allocate sgl_task_params for xid=0x%x\n", + i); + goto mem_err; + } + } + + /* Allocate pool of io_bdts - one for each qedf_ioreq */ + cmgr->io_bdt_pool = kmalloc_array(num_ios, sizeof(struct io_bdt *), + GFP_KERNEL); + + if (!cmgr->io_bdt_pool) { + QEDF_WARN(&(qedf->dbg_ctx), "Failed to alloc io_bdt_pool.\n"); + goto mem_err; + } + + for (i = 0; i < num_ios; i++) { + cmgr->io_bdt_pool[i] = kmalloc(sizeof(struct io_bdt), + GFP_KERNEL); + if (!cmgr->io_bdt_pool[i]) { + QEDF_WARN(&(qedf->dbg_ctx), + "Failed to alloc io_bdt_pool[%d].\n", i); + goto mem_err; + } + } + + for (i = 0; i < num_ios; i++) { + bdt_info = cmgr->io_bdt_pool[i]; + bdt_info->bd_tbl = dma_alloc_coherent(&qedf->pdev->dev, + QEDF_MAX_BDS_PER_CMD * sizeof(struct scsi_sge), + &bdt_info->bd_tbl_dma, GFP_KERNEL); + if (!bdt_info->bd_tbl) { + QEDF_WARN(&(qedf->dbg_ctx), + "Failed to alloc bdt_tbl[%d].\n", i); + goto mem_err; + } + } + atomic_set(&cmgr->free_list_cnt, num_ios); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "cmgr->free_list_cnt=%d.\n", + atomic_read(&cmgr->free_list_cnt)); + + return cmgr; + +mem_err: + qedf_cmd_mgr_free(cmgr); + return NULL; +} + +struct qedf_ioreq *qedf_alloc_cmd(struct qedf_rport *fcport, u8 cmd_type) +{ + struct qedf_ctx *qedf = fcport->qedf; + struct qedf_cmd_mgr *cmd_mgr = qedf->cmd_mgr; + struct qedf_ioreq *io_req = NULL; + struct io_bdt *bd_tbl; + u16 xid; + uint32_t free_sqes; + int i; + unsigned long flags; + + free_sqes = atomic_read(&fcport->free_sqes); + + if (!free_sqes) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Returning NULL, free_sqes=%d.\n ", + free_sqes); + goto out_failed; + } + + /* Limit the number of outstanding R/W tasks */ + if ((atomic_read(&fcport->num_active_ios) >= + NUM_RW_TASKS_PER_CONNECTION)) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Returning NULL, num_active_ios=%d.\n", + atomic_read(&fcport->num_active_ios)); + goto out_failed; + } + + /* Limit global TIDs certain tasks */ + if (atomic_read(&cmd_mgr->free_list_cnt) <= GBL_RSVD_TASKS) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Returning NULL, free_list_cnt=%d.\n", + atomic_read(&cmd_mgr->free_list_cnt)); + goto out_failed; + } + + spin_lock_irqsave(&cmd_mgr->lock, flags); + for (i = 0; i < FCOE_PARAMS_NUM_TASKS; i++) { + io_req = &cmd_mgr->cmds[cmd_mgr->idx]; + cmd_mgr->idx++; + if (cmd_mgr->idx == FCOE_PARAMS_NUM_TASKS) + cmd_mgr->idx = 0; + + /* Check to make sure command was previously freed */ + if (!test_bit(QEDF_CMD_OUTSTANDING, &io_req->flags)) + break; + } + + if (i == FCOE_PARAMS_NUM_TASKS) { + spin_unlock_irqrestore(&cmd_mgr->lock, flags); + goto out_failed; + } + + set_bit(QEDF_CMD_OUTSTANDING, &io_req->flags); + spin_unlock_irqrestore(&cmd_mgr->lock, flags); + + atomic_inc(&fcport->num_active_ios); + atomic_dec(&fcport->free_sqes); + xid = io_req->xid; + atomic_dec(&cmd_mgr->free_list_cnt); + + io_req->cmd_mgr = cmd_mgr; + io_req->fcport = fcport; + + /* Hold the io_req against deletion */ + kref_init(&io_req->refcount); + + /* Bind io_bdt for this io_req */ + /* Have a static link between io_req and io_bdt_pool */ + bd_tbl = io_req->bd_tbl = cmd_mgr->io_bdt_pool[xid]; + if (bd_tbl == NULL) { + QEDF_ERR(&(qedf->dbg_ctx), "bd_tbl is NULL, xid=%x.\n", xid); + kref_put(&io_req->refcount, qedf_release_cmd); + goto out_failed; + } + bd_tbl->io_req = io_req; + io_req->cmd_type = cmd_type; + io_req->tm_flags = 0; + + /* Reset 
sequence offset data */ + io_req->rx_buf_off = 0; + io_req->tx_buf_off = 0; + io_req->rx_id = 0xffff; /* No OX_ID */ + + return io_req; + +out_failed: + /* Record failure for stats and return NULL to caller */ + qedf->alloc_failures++; + return NULL; +} + +static void qedf_free_mp_resc(struct qedf_ioreq *io_req) +{ + struct qedf_mp_req *mp_req = &(io_req->mp_req); + struct qedf_ctx *qedf = io_req->fcport->qedf; + uint64_t sz = sizeof(struct scsi_sge); + + /* clear tm flags */ + if (mp_req->mp_req_bd) { + dma_free_coherent(&qedf->pdev->dev, sz, + mp_req->mp_req_bd, mp_req->mp_req_bd_dma); + mp_req->mp_req_bd = NULL; + } + if (mp_req->mp_resp_bd) { + dma_free_coherent(&qedf->pdev->dev, sz, + mp_req->mp_resp_bd, mp_req->mp_resp_bd_dma); + mp_req->mp_resp_bd = NULL; + } + if (mp_req->req_buf) { + dma_free_coherent(&qedf->pdev->dev, QEDF_PAGE_SIZE, + mp_req->req_buf, mp_req->req_buf_dma); + mp_req->req_buf = NULL; + } + if (mp_req->resp_buf) { + dma_free_coherent(&qedf->pdev->dev, QEDF_PAGE_SIZE, + mp_req->resp_buf, mp_req->resp_buf_dma); + mp_req->resp_buf = NULL; + } +} + +void qedf_release_cmd(struct kref *ref) +{ + struct qedf_ioreq *io_req = + container_of(ref, struct qedf_ioreq, refcount); + struct qedf_cmd_mgr *cmd_mgr = io_req->cmd_mgr; + struct qedf_rport *fcport = io_req->fcport; + + if (io_req->cmd_type == QEDF_ELS || + io_req->cmd_type == QEDF_TASK_MGMT_CMD) + qedf_free_mp_resc(io_req); + + atomic_inc(&cmd_mgr->free_list_cnt); + atomic_dec(&fcport->num_active_ios); + if (atomic_read(&fcport->num_active_ios) < 0) + QEDF_WARN(&(fcport->qedf->dbg_ctx), "active_ios < 0.\n"); + + /* Increment task retry identifier now that the request is released */ + io_req->task_retry_identifier++; + + clear_bit(QEDF_CMD_OUTSTANDING, &io_req->flags); +} + +static int qedf_split_bd(struct qedf_ioreq *io_req, u64 addr, int sg_len, + int bd_index) +{ + struct scsi_sge *bd = io_req->bd_tbl->bd_tbl; + int frag_size, sg_frags; + + sg_frags = 0; + while (sg_len) { + if (sg_len > QEDF_BD_SPLIT_SZ) + frag_size = QEDF_BD_SPLIT_SZ; + else + frag_size = sg_len; + bd[bd_index + sg_frags].sge_addr.lo = U64_LO(addr); + bd[bd_index + sg_frags].sge_addr.hi = U64_HI(addr); + bd[bd_index + sg_frags].sge_len = (uint16_t)frag_size; + + addr += (u64)frag_size; + sg_frags++; + sg_len -= frag_size; + } + return sg_frags; +} + +static int qedf_map_sg(struct qedf_ioreq *io_req) +{ + struct scsi_cmnd *sc = io_req->sc_cmd; + struct Scsi_Host *host = sc->device->host; + struct fc_lport *lport = shost_priv(host); + struct qedf_ctx *qedf = lport_priv(lport); + struct scsi_sge *bd = io_req->bd_tbl->bd_tbl; + struct scatterlist *sg; + int byte_count = 0; + int sg_count = 0; + int bd_count = 0; + int sg_frags; + unsigned int sg_len; + u64 addr, end_addr; + int i; + + sg_count = dma_map_sg(&qedf->pdev->dev, scsi_sglist(sc), + scsi_sg_count(sc), sc->sc_data_direction); + + sg = scsi_sglist(sc); + + /* + * New condition to send single SGE as cached-SGL with length less + * than 64k. + */ + if ((sg_count == 1) && (sg_dma_len(sg) <= + QEDF_MAX_SGLEN_FOR_CACHESGL)) { + sg_len = sg_dma_len(sg); + addr = (u64)sg_dma_address(sg); + + bd[bd_count].sge_addr.lo = (addr & 0xffffffff); + bd[bd_count].sge_addr.hi = (addr >> 32); + bd[bd_count].sge_len = (u16)sg_len; + + return ++bd_count; + } + + scsi_for_each_sg(sc, sg, sg_count, i) { + sg_len = sg_dma_len(sg); + addr = (u64)sg_dma_address(sg); + end_addr = (u64)(addr + sg_len); + + /* + * First s/g element in the list so check if the end_addr + * is paged aligned. 
Also check to make sure the length is + * at least page size. + */ + if ((i == 0) && (sg_count > 1) && + ((end_addr % QEDF_PAGE_SIZE) || + sg_len < QEDF_PAGE_SIZE)) + io_req->use_slowpath = true; + /* + * Last s/g element so check if the start address is paged + * aligned. + */ + else if ((i == (sg_count - 1)) && (sg_count > 1) && + (addr % QEDF_PAGE_SIZE)) + io_req->use_slowpath = true; + /* + * Intermediate s/g element so check if start and end address + * is page aligned. + */ + else if ((i != 0) && (i != (sg_count - 1)) && + ((addr % QEDF_PAGE_SIZE) || (end_addr % QEDF_PAGE_SIZE))) + io_req->use_slowpath = true; + + if (sg_len > QEDF_MAX_BD_LEN) { + sg_frags = qedf_split_bd(io_req, addr, sg_len, + bd_count); + } else { + sg_frags = 1; + bd[bd_count].sge_addr.lo = U64_LO(addr); + bd[bd_count].sge_addr.hi = U64_HI(addr); + bd[bd_count].sge_len = (uint16_t)sg_len; + } + + bd_count += sg_frags; + byte_count += sg_len; + } + + if (byte_count != scsi_bufflen(sc)) + QEDF_ERR(&(qedf->dbg_ctx), "byte_count = %d != " + "scsi_bufflen = %d, task_id = 0x%x.\n", byte_count, + scsi_bufflen(sc), io_req->xid); + + return bd_count; +} + +static int qedf_build_bd_list_from_sg(struct qedf_ioreq *io_req) +{ + struct scsi_cmnd *sc = io_req->sc_cmd; + struct scsi_sge *bd = io_req->bd_tbl->bd_tbl; + int bd_count; + + if (scsi_sg_count(sc)) { + bd_count = qedf_map_sg(io_req); + if (bd_count == 0) + return -ENOMEM; + } else { + bd_count = 0; + bd[0].sge_addr.lo = bd[0].sge_addr.hi = 0; + bd[0].sge_len = 0; + } + io_req->bd_tbl->bd_valid = bd_count; + + return 0; +} + +static void qedf_build_fcp_cmnd(struct qedf_ioreq *io_req, + struct fcp_cmnd *fcp_cmnd) +{ + struct scsi_cmnd *sc_cmd = io_req->sc_cmd; + + /* fcp_cmnd is 32 bytes */ + memset(fcp_cmnd, 0, FCP_CMND_LEN); + + /* 8 bytes: SCSI LUN info */ + int_to_scsilun(sc_cmd->device->lun, + (struct scsi_lun *)&fcp_cmnd->fc_lun); + + /* 4 bytes: flag info */ + fcp_cmnd->fc_pri_ta = 0; + fcp_cmnd->fc_tm_flags = io_req->tm_flags; + fcp_cmnd->fc_flags = io_req->io_req_flags; + fcp_cmnd->fc_cmdref = 0; + + /* Populate data direction */ + if (io_req->cmd_type == QEDF_TASK_MGMT_CMD) { + fcp_cmnd->fc_flags |= FCP_CFL_RDDATA; + } else { + if (sc_cmd->sc_data_direction == DMA_TO_DEVICE) + fcp_cmnd->fc_flags |= FCP_CFL_WRDATA; + else if (sc_cmd->sc_data_direction == DMA_FROM_DEVICE) + fcp_cmnd->fc_flags |= FCP_CFL_RDDATA; + } + + fcp_cmnd->fc_pri_ta = FCP_PTA_SIMPLE; + + /* 16 bytes: CDB information */ + if (io_req->cmd_type != QEDF_TASK_MGMT_CMD) + memcpy(fcp_cmnd->fc_cdb, sc_cmd->cmnd, sc_cmd->cmd_len); + + /* 4 bytes: FCP data length */ + fcp_cmnd->fc_dl = htonl(io_req->data_xfer_len); +} + +static void qedf_init_task(struct qedf_rport *fcport, struct fc_lport *lport, + struct qedf_ioreq *io_req, struct e4_fcoe_task_context *task_ctx, + struct fcoe_wqe *sqe) +{ + enum fcoe_task_type task_type; + struct scsi_cmnd *sc_cmd = io_req->sc_cmd; + struct io_bdt *bd_tbl = io_req->bd_tbl; + u8 fcp_cmnd[32]; + u32 tmp_fcp_cmnd[8]; + int bd_count = 0; + struct qedf_ctx *qedf = fcport->qedf; + uint16_t cq_idx = smp_processor_id() % qedf->num_queues; + struct regpair sense_data_buffer_phys_addr; + u32 tx_io_size = 0; + u32 rx_io_size = 0; + int i, cnt; + + /* Note init_initiator_rw_fcoe_task memsets the task context */ + io_req->task = task_ctx; + memset(task_ctx, 0, sizeof(struct e4_fcoe_task_context)); + memset(io_req->task_params, 0, sizeof(struct fcoe_task_params)); + memset(io_req->sgl_task_params, 0, sizeof(struct scsi_sgl_task_params)); + + /* Set task type bassed on DMA 
directio of command */ + if (io_req->cmd_type == QEDF_TASK_MGMT_CMD) { + task_type = FCOE_TASK_TYPE_READ_INITIATOR; + } else { + if (sc_cmd->sc_data_direction == DMA_TO_DEVICE) { + task_type = FCOE_TASK_TYPE_WRITE_INITIATOR; + tx_io_size = io_req->data_xfer_len; + } else { + task_type = FCOE_TASK_TYPE_READ_INITIATOR; + rx_io_size = io_req->data_xfer_len; + } + } + + /* Setup the fields for fcoe_task_params */ + io_req->task_params->context = task_ctx; + io_req->task_params->sqe = sqe; + io_req->task_params->task_type = task_type; + io_req->task_params->tx_io_size = tx_io_size; + io_req->task_params->rx_io_size = rx_io_size; + io_req->task_params->conn_cid = fcport->fw_cid; + io_req->task_params->itid = io_req->xid; + io_req->task_params->cq_rss_number = cq_idx; + io_req->task_params->is_tape_device = fcport->dev_type; + + /* Fill in information for scatter/gather list */ + if (io_req->cmd_type != QEDF_TASK_MGMT_CMD) { + bd_count = bd_tbl->bd_valid; + io_req->sgl_task_params->sgl = bd_tbl->bd_tbl; + io_req->sgl_task_params->sgl_phys_addr.lo = + U64_LO(bd_tbl->bd_tbl_dma); + io_req->sgl_task_params->sgl_phys_addr.hi = + U64_HI(bd_tbl->bd_tbl_dma); + io_req->sgl_task_params->num_sges = bd_count; + io_req->sgl_task_params->total_buffer_size = + scsi_bufflen(io_req->sc_cmd); + io_req->sgl_task_params->small_mid_sge = + io_req->use_slowpath; + } + + /* Fill in physical address of sense buffer */ + sense_data_buffer_phys_addr.lo = U64_LO(io_req->sense_buffer_dma); + sense_data_buffer_phys_addr.hi = U64_HI(io_req->sense_buffer_dma); + + /* fill FCP_CMND IU */ + qedf_build_fcp_cmnd(io_req, (struct fcp_cmnd *)tmp_fcp_cmnd); + + /* Swap fcp_cmnd since FC is big endian */ + cnt = sizeof(struct fcp_cmnd) / sizeof(u32); + for (i = 0; i < cnt; i++) { + tmp_fcp_cmnd[i] = cpu_to_be32(tmp_fcp_cmnd[i]); + } + memcpy(fcp_cmnd, tmp_fcp_cmnd, sizeof(struct fcp_cmnd)); + + init_initiator_rw_fcoe_task(io_req->task_params, + io_req->sgl_task_params, + sense_data_buffer_phys_addr, + io_req->task_retry_identifier, fcp_cmnd); + + /* Increment SGL type counters */ + if (bd_count == 1) { + qedf->single_sge_ios++; + io_req->sge_type = QEDF_IOREQ_SINGLE_SGE; + } else if (io_req->use_slowpath) { + qedf->slow_sge_ios++; + io_req->sge_type = QEDF_IOREQ_SLOW_SGE; + } else { + qedf->fast_sge_ios++; + io_req->sge_type = QEDF_IOREQ_FAST_SGE; + } +} + +void qedf_init_mp_task(struct qedf_ioreq *io_req, + struct e4_fcoe_task_context *task_ctx, struct fcoe_wqe *sqe) +{ + struct qedf_mp_req *mp_req = &(io_req->mp_req); + struct qedf_rport *fcport = io_req->fcport; + struct qedf_ctx *qedf = io_req->fcport->qedf; + struct fc_frame_header *fc_hdr; + struct fcoe_tx_mid_path_params task_fc_hdr; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Initializing MP task for cmd_type=%d\n", + io_req->cmd_type); + + qedf->control_requests++; + + memset(&tx_sgl_task_params, 0, sizeof(struct scsi_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(struct scsi_sgl_task_params)); + memset(task_ctx, 0, sizeof(struct e4_fcoe_task_context)); + memset(&task_fc_hdr, 0, sizeof(struct fcoe_tx_mid_path_params)); + + /* Setup the task from io_req for easy reference */ + io_req->task = task_ctx; + + /* Setup the fields for fcoe_task_params */ + io_req->task_params->context = task_ctx; + io_req->task_params->sqe = sqe; + io_req->task_params->task_type = FCOE_TASK_TYPE_MIDPATH; + io_req->task_params->tx_io_size = io_req->data_xfer_len; + /* rx_io_size 
tells the f/w how large a response buffer we have */ + io_req->task_params->rx_io_size = PAGE_SIZE; + io_req->task_params->conn_cid = fcport->fw_cid; + io_req->task_params->itid = io_req->xid; + /* Return middle path commands on CQ 0 */ + io_req->task_params->cq_rss_number = 0; + io_req->task_params->is_tape_device = fcport->dev_type; + + fc_hdr = &(mp_req->req_fc_hdr); + /* Set OX_ID and RX_ID based on driver task id */ + fc_hdr->fh_ox_id = io_req->xid; + fc_hdr->fh_rx_id = htons(0xffff); + + /* Set up FC header information */ + task_fc_hdr.parameter = fc_hdr->fh_parm_offset; + task_fc_hdr.r_ctl = fc_hdr->fh_r_ctl; + task_fc_hdr.type = fc_hdr->fh_type; + task_fc_hdr.cs_ctl = fc_hdr->fh_cs_ctl; + task_fc_hdr.df_ctl = fc_hdr->fh_df_ctl; + task_fc_hdr.rx_id = fc_hdr->fh_rx_id; + task_fc_hdr.ox_id = fc_hdr->fh_ox_id; + + /* Set up s/g list parameters for request buffer */ + tx_sgl_task_params.sgl = mp_req->mp_req_bd; + tx_sgl_task_params.sgl_phys_addr.lo = U64_LO(mp_req->mp_req_bd_dma); + tx_sgl_task_params.sgl_phys_addr.hi = U64_HI(mp_req->mp_req_bd_dma); + tx_sgl_task_params.num_sges = 1; + /* Set PAGE_SIZE for now since sg element is that size ??? */ + tx_sgl_task_params.total_buffer_size = io_req->data_xfer_len; + tx_sgl_task_params.small_mid_sge = 0; + + /* Set up s/g list parameters for request buffer */ + rx_sgl_task_params.sgl = mp_req->mp_resp_bd; + rx_sgl_task_params.sgl_phys_addr.lo = U64_LO(mp_req->mp_resp_bd_dma); + rx_sgl_task_params.sgl_phys_addr.hi = U64_HI(mp_req->mp_resp_bd_dma); + rx_sgl_task_params.num_sges = 1; + /* Set PAGE_SIZE for now since sg element is that size ??? */ + rx_sgl_task_params.total_buffer_size = PAGE_SIZE; + rx_sgl_task_params.small_mid_sge = 0; + + + /* + * Last arg is 0 as previous code did not set that we wanted the + * fc header information. 
+ */ + init_initiator_midpath_unsolicited_fcoe_task(io_req->task_params, + &task_fc_hdr, + &tx_sgl_task_params, + &rx_sgl_task_params, 0); + + /* Midpath requests always consume 1 SGE */ + qedf->single_sge_ios++; +} + +/* Presumed that fcport->rport_lock is held */ +u16 qedf_get_sqe_idx(struct qedf_rport *fcport) +{ + uint16_t total_sqe = (fcport->sq_mem_size)/(sizeof(struct fcoe_wqe)); + u16 rval; + + rval = fcport->sq_prod_idx; + + /* Adjust ring index */ + fcport->sq_prod_idx++; + fcport->fw_sq_prod_idx++; + if (fcport->sq_prod_idx == total_sqe) + fcport->sq_prod_idx = 0; + + return rval; +} + +void qedf_ring_doorbell(struct qedf_rport *fcport) +{ + struct fcoe_db_data dbell = { 0 }; + + dbell.agg_flags = 0; + + dbell.params |= DB_DEST_XCM << FCOE_DB_DATA_DEST_SHIFT; + dbell.params |= DB_AGG_CMD_SET << FCOE_DB_DATA_AGG_CMD_SHIFT; + dbell.params |= DQ_XCM_FCOE_SQ_PROD_CMD << + FCOE_DB_DATA_AGG_VAL_SEL_SHIFT; + + dbell.sq_prod = fcport->fw_sq_prod_idx; + writel(*(u32 *)&dbell, fcport->p_doorbell); + /* Make sure SQ index is updated so f/w prcesses requests in order */ + wmb(); + mmiowb(); +} + +static void qedf_trace_io(struct qedf_rport *fcport, struct qedf_ioreq *io_req, + int8_t direction) +{ + struct qedf_ctx *qedf = fcport->qedf; + struct qedf_io_log *io_log; + struct scsi_cmnd *sc_cmd = io_req->sc_cmd; + unsigned long flags; + uint8_t op; + + spin_lock_irqsave(&qedf->io_trace_lock, flags); + + io_log = &qedf->io_trace_buf[qedf->io_trace_idx]; + io_log->direction = direction; + io_log->task_id = io_req->xid; + io_log->port_id = fcport->rdata->ids.port_id; + io_log->lun = sc_cmd->device->lun; + io_log->op = op = sc_cmd->cmnd[0]; + io_log->lba[0] = sc_cmd->cmnd[2]; + io_log->lba[1] = sc_cmd->cmnd[3]; + io_log->lba[2] = sc_cmd->cmnd[4]; + io_log->lba[3] = sc_cmd->cmnd[5]; + io_log->bufflen = scsi_bufflen(sc_cmd); + io_log->sg_count = scsi_sg_count(sc_cmd); + io_log->result = sc_cmd->result; + io_log->jiffies = jiffies; + io_log->refcount = kref_read(&io_req->refcount); + + if (direction == QEDF_IO_TRACE_REQ) { + /* For requests we only care abot the submission CPU */ + io_log->req_cpu = io_req->cpu; + io_log->int_cpu = 0; + io_log->rsp_cpu = 0; + } else if (direction == QEDF_IO_TRACE_RSP) { + io_log->req_cpu = io_req->cpu; + io_log->int_cpu = io_req->int_cpu; + io_log->rsp_cpu = smp_processor_id(); + } + + io_log->sge_type = io_req->sge_type; + + qedf->io_trace_idx++; + if (qedf->io_trace_idx == QEDF_IO_TRACE_SIZE) + qedf->io_trace_idx = 0; + + spin_unlock_irqrestore(&qedf->io_trace_lock, flags); +} + +int qedf_post_io_req(struct qedf_rport *fcport, struct qedf_ioreq *io_req) +{ + struct scsi_cmnd *sc_cmd = io_req->sc_cmd; + struct Scsi_Host *host = sc_cmd->device->host; + struct fc_lport *lport = shost_priv(host); + struct qedf_ctx *qedf = lport_priv(lport); + struct e4_fcoe_task_context *task_ctx; + u16 xid; + enum fcoe_task_type req_type = 0; + struct fcoe_wqe *sqe; + u16 sqe_idx; + + /* Initialize rest of io_req fileds */ + io_req->data_xfer_len = scsi_bufflen(sc_cmd); + sc_cmd->SCp.ptr = (char *)io_req; + io_req->use_slowpath = false; /* Assume fast SGL by default */ + + /* Record which cpu this request is associated with */ + io_req->cpu = smp_processor_id(); + + if (sc_cmd->sc_data_direction == DMA_FROM_DEVICE) { + req_type = FCOE_TASK_TYPE_READ_INITIATOR; + io_req->io_req_flags = QEDF_READ; + qedf->input_requests++; + } else if (sc_cmd->sc_data_direction == DMA_TO_DEVICE) { + req_type = FCOE_TASK_TYPE_WRITE_INITIATOR; + io_req->io_req_flags = QEDF_WRITE; + 
qedf->output_requests++; + } else { + io_req->io_req_flags = 0; + qedf->control_requests++; + } + + xid = io_req->xid; + + /* Build buffer descriptor list for firmware from sg list */ + if (qedf_build_bd_list_from_sg(io_req)) { + QEDF_ERR(&(qedf->dbg_ctx), "BD list creation failed.\n"); + kref_put(&io_req->refcount, qedf_release_cmd); + return -EAGAIN; + } + + if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + QEDF_ERR(&(qedf->dbg_ctx), "Session not offloaded yet.\n"); + kref_put(&io_req->refcount, qedf_release_cmd); + } + + /* Obtain free SQE */ + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + + /* Get the task context */ + task_ctx = qedf_get_task_mem(&qedf->tasks, xid); + if (!task_ctx) { + QEDF_WARN(&(qedf->dbg_ctx), "task_ctx is NULL, xid=%d.\n", + xid); + kref_put(&io_req->refcount, qedf_release_cmd); + return -EINVAL; + } + + qedf_init_task(fcport, lport, io_req, task_ctx, sqe); + + /* Ring doorbell */ + qedf_ring_doorbell(fcport); + + if (qedf_io_tracing && io_req->sc_cmd) + qedf_trace_io(fcport, io_req, QEDF_IO_TRACE_REQ); + + return false; +} + +int +qedf_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc_cmd) +{ + struct fc_lport *lport = shost_priv(host); + struct qedf_ctx *qedf = lport_priv(lport); + struct fc_rport *rport = starget_to_rport(scsi_target(sc_cmd->device)); + struct fc_rport_libfc_priv *rp = rport->dd_data; + struct qedf_rport *fcport; + struct qedf_ioreq *io_req; + int rc = 0; + int rval; + unsigned long flags = 0; + + + if (test_bit(QEDF_UNLOADING, &qedf->flags) || + test_bit(QEDF_DBG_STOP_IO, &qedf->flags)) { + sc_cmd->result = DID_NO_CONNECT << 16; + sc_cmd->scsi_done(sc_cmd); + return 0; + } + + rval = fc_remote_port_chkready(rport); + if (rval) { + sc_cmd->result = rval; + sc_cmd->scsi_done(sc_cmd); + return 0; + } + + /* Retry command if we are doing a qed drain operation */ + if (test_bit(QEDF_DRAIN_ACTIVE, &qedf->flags)) { + rc = SCSI_MLQUEUE_HOST_BUSY; + goto exit_qcmd; + } + + if (lport->state != LPORT_ST_READY || + atomic_read(&qedf->link_state) != QEDF_LINK_UP) { + rc = SCSI_MLQUEUE_HOST_BUSY; + goto exit_qcmd; + } + + /* rport and tgt are allocated together, so tgt should be non-NULL */ + fcport = (struct qedf_rport *)&rp[1]; + + if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + /* + * Session is not offloaded yet. Let SCSI-ml retry + * the command. 
+ */ + rc = SCSI_MLQUEUE_TARGET_BUSY; + goto exit_qcmd; + } + if (fcport->retry_delay_timestamp) { + if (time_after(jiffies, fcport->retry_delay_timestamp)) { + fcport->retry_delay_timestamp = 0; + } else { + /* If retry_delay timer is active, flow off the ML */ + rc = SCSI_MLQUEUE_TARGET_BUSY; + goto exit_qcmd; + } + } + + io_req = qedf_alloc_cmd(fcport, QEDF_SCSI_CMD); + if (!io_req) { + rc = SCSI_MLQUEUE_HOST_BUSY; + goto exit_qcmd; + } + + io_req->sc_cmd = sc_cmd; + + /* Take fcport->rport_lock for posting to fcport send queue */ + spin_lock_irqsave(&fcport->rport_lock, flags); + if (qedf_post_io_req(fcport, io_req)) { + QEDF_WARN(&(qedf->dbg_ctx), "Unable to post io_req\n"); + /* Return SQE to pool */ + atomic_inc(&fcport->free_sqes); + rc = SCSI_MLQUEUE_HOST_BUSY; + } + spin_unlock_irqrestore(&fcport->rport_lock, flags); + +exit_qcmd: + return rc; +} + +static void qedf_parse_fcp_rsp(struct qedf_ioreq *io_req, + struct fcoe_cqe_rsp_info *fcp_rsp) +{ + struct scsi_cmnd *sc_cmd = io_req->sc_cmd; + struct qedf_ctx *qedf = io_req->fcport->qedf; + u8 rsp_flags = fcp_rsp->rsp_flags.flags; + int fcp_sns_len = 0; + int fcp_rsp_len = 0; + uint8_t *rsp_info, *sense_data; + + io_req->fcp_status = FC_GOOD; + io_req->fcp_resid = 0; + if (rsp_flags & (FCOE_FCP_RSP_FLAGS_FCP_RESID_OVER | + FCOE_FCP_RSP_FLAGS_FCP_RESID_UNDER)) + io_req->fcp_resid = fcp_rsp->fcp_resid; + + io_req->scsi_comp_flags = rsp_flags; + CMD_SCSI_STATUS(sc_cmd) = io_req->cdb_status = + fcp_rsp->scsi_status_code; + + if (rsp_flags & + FCOE_FCP_RSP_FLAGS_FCP_RSP_LEN_VALID) + fcp_rsp_len = fcp_rsp->fcp_rsp_len; + + if (rsp_flags & + FCOE_FCP_RSP_FLAGS_FCP_SNS_LEN_VALID) + fcp_sns_len = fcp_rsp->fcp_sns_len; + + io_req->fcp_rsp_len = fcp_rsp_len; + io_req->fcp_sns_len = fcp_sns_len; + rsp_info = sense_data = io_req->sense_buffer; + + /* fetch fcp_rsp_code */ + if ((fcp_rsp_len == 4) || (fcp_rsp_len == 8)) { + /* Only for task management function */ + io_req->fcp_rsp_code = rsp_info[3]; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "fcp_rsp_code = %d\n", io_req->fcp_rsp_code); + /* Adjust sense-data location. 
*/ + sense_data += fcp_rsp_len; + } + + if (fcp_sns_len > SCSI_SENSE_BUFFERSIZE) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Truncating sense buffer\n"); + fcp_sns_len = SCSI_SENSE_BUFFERSIZE; + } + + /* The sense buffer can be NULL for TMF commands */ + if (sc_cmd->sense_buffer) { + memset(sc_cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); + if (fcp_sns_len) + memcpy(sc_cmd->sense_buffer, sense_data, + fcp_sns_len); + } +} + +static void qedf_unmap_sg_list(struct qedf_ctx *qedf, struct qedf_ioreq *io_req) +{ + struct scsi_cmnd *sc = io_req->sc_cmd; + + if (io_req->bd_tbl->bd_valid && sc && scsi_sg_count(sc)) { + dma_unmap_sg(&qedf->pdev->dev, scsi_sglist(sc), + scsi_sg_count(sc), sc->sc_data_direction); + io_req->bd_tbl->bd_valid = 0; + } +} + +void qedf_scsi_completion(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *io_req) +{ + u16 xid, rval; + struct e4_fcoe_task_context *task_ctx; + struct scsi_cmnd *sc_cmd; + struct fcoe_cqe_rsp_info *fcp_rsp; + struct qedf_rport *fcport; + int refcount; + u16 scope, qualifier = 0; + u8 fw_residual_flag = 0; + + if (!io_req) + return; + if (!cqe) + return; + + xid = io_req->xid; + task_ctx = qedf_get_task_mem(&qedf->tasks, xid); + sc_cmd = io_req->sc_cmd; + fcp_rsp = &cqe->cqe_info.rsp_info; + + if (!sc_cmd) { + QEDF_WARN(&(qedf->dbg_ctx), "sc_cmd is NULL!\n"); + return; + } + + if (!sc_cmd->SCp.ptr) { + QEDF_WARN(&(qedf->dbg_ctx), "SCp.ptr is NULL, returned in " + "another context.\n"); + return; + } + + if (!sc_cmd->request) { + QEDF_WARN(&(qedf->dbg_ctx), "sc_cmd->request is NULL, " + "sc_cmd=%p.\n", sc_cmd); + return; + } + + if (!sc_cmd->request->special) { + QEDF_WARN(&(qedf->dbg_ctx), "request->special is NULL so " + "request not valid, sc_cmd=%p.\n", sc_cmd); + return; + } + + if (!sc_cmd->request->q) { + QEDF_WARN(&(qedf->dbg_ctx), "request->q is NULL so request " + "is not valid, sc_cmd=%p.\n", sc_cmd); + return; + } + + fcport = io_req->fcport; + + qedf_parse_fcp_rsp(io_req, fcp_rsp); + + qedf_unmap_sg_list(qedf, io_req); + + /* Check for FCP transport error */ + if (io_req->fcp_rsp_len > 3 && io_req->fcp_rsp_code) { + QEDF_ERR(&(qedf->dbg_ctx), + "FCP I/O protocol failure xid=0x%x fcp_rsp_len=%d " + "fcp_rsp_code=%d.\n", io_req->xid, io_req->fcp_rsp_len, + io_req->fcp_rsp_code); + sc_cmd->result = DID_BUS_BUSY << 16; + goto out; + } + + fw_residual_flag = GET_FIELD(cqe->cqe_info.rsp_info.fw_error_flags, + FCOE_CQE_RSP_INFO_FW_UNDERRUN); + if (fw_residual_flag) { + QEDF_ERR(&(qedf->dbg_ctx), + "Firmware detected underrun: xid=0x%x fcp_rsp.flags=0x%02x " + "fcp_resid=%d fw_residual=0x%x.\n", io_req->xid, + fcp_rsp->rsp_flags.flags, io_req->fcp_resid, + cqe->cqe_info.rsp_info.fw_residual); + + if (io_req->cdb_status == 0) + sc_cmd->result = (DID_ERROR << 16) | io_req->cdb_status; + else + sc_cmd->result = (DID_OK << 16) | io_req->cdb_status; + + /* Abort the command since we did not get all the data */ + init_completion(&io_req->abts_done); + rval = qedf_initiate_abts(io_req, true); + if (rval) { + QEDF_ERR(&(qedf->dbg_ctx), "Failed to queue ABTS.\n"); + sc_cmd->result = (DID_ERROR << 16) | io_req->cdb_status; + } + + /* + * Set resid to the whole buffer length so we won't try to resue + * any previously data. 
+ */ + scsi_set_resid(sc_cmd, scsi_bufflen(sc_cmd)); + goto out; + } + + switch (io_req->fcp_status) { + case FC_GOOD: + if (io_req->cdb_status == 0) { + /* Good I/O completion */ + sc_cmd->result = DID_OK << 16; + } else { + refcount = kref_read(&io_req->refcount); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "%d:0:%d:%lld xid=0x%0x op=0x%02x " + "lba=%02x%02x%02x%02x cdb_status=%d " + "fcp_resid=0x%x refcount=%d.\n", + qedf->lport->host->host_no, sc_cmd->device->id, + sc_cmd->device->lun, io_req->xid, + sc_cmd->cmnd[0], sc_cmd->cmnd[2], sc_cmd->cmnd[3], + sc_cmd->cmnd[4], sc_cmd->cmnd[5], + io_req->cdb_status, io_req->fcp_resid, + refcount); + sc_cmd->result = (DID_OK << 16) | io_req->cdb_status; + + if (io_req->cdb_status == SAM_STAT_TASK_SET_FULL || + io_req->cdb_status == SAM_STAT_BUSY) { + /* + * Check whether we need to set retry_delay at + * all based on retry_delay module parameter + * and the status qualifier. + */ + + /* Upper 2 bits */ + scope = fcp_rsp->retry_delay_timer & 0xC000; + /* Lower 14 bits */ + qualifier = fcp_rsp->retry_delay_timer & 0x3FFF; + + if (qedf_retry_delay && + scope > 0 && qualifier > 0 && + qualifier <= 0x3FEF) { + /* Check we don't go over the max */ + if (qualifier > QEDF_RETRY_DELAY_MAX) + qualifier = + QEDF_RETRY_DELAY_MAX; + fcport->retry_delay_timestamp = + jiffies + (qualifier * HZ / 10); + } + } + } + if (io_req->fcp_resid) + scsi_set_resid(sc_cmd, io_req->fcp_resid); + break; + default: + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, "fcp_status=%d.\n", + io_req->fcp_status); + break; + } + +out: + if (qedf_io_tracing) + qedf_trace_io(fcport, io_req, QEDF_IO_TRACE_RSP); + + io_req->sc_cmd = NULL; + sc_cmd->SCp.ptr = NULL; + sc_cmd->scsi_done(sc_cmd); + kref_put(&io_req->refcount, qedf_release_cmd); +} + +/* Return a SCSI command in some other context besides a normal completion */ +void qedf_scsi_done(struct qedf_ctx *qedf, struct qedf_ioreq *io_req, + int result) +{ + u16 xid; + struct scsi_cmnd *sc_cmd; + int refcount; + + if (!io_req) + return; + + xid = io_req->xid; + sc_cmd = io_req->sc_cmd; + + if (!sc_cmd) { + QEDF_WARN(&(qedf->dbg_ctx), "sc_cmd is NULL!\n"); + return; + } + + if (!sc_cmd->SCp.ptr) { + QEDF_WARN(&(qedf->dbg_ctx), "SCp.ptr is NULL, returned in " + "another context.\n"); + return; + } + + qedf_unmap_sg_list(qedf, io_req); + + sc_cmd->result = result << 16; + refcount = kref_read(&io_req->refcount); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, "%d:0:%d:%lld: Completing " + "sc_cmd=%p result=0x%08x op=0x%02x lba=0x%02x%02x%02x%02x, " + "allowed=%d retries=%d refcount=%d.\n", + qedf->lport->host->host_no, sc_cmd->device->id, + sc_cmd->device->lun, sc_cmd, sc_cmd->result, sc_cmd->cmnd[0], + sc_cmd->cmnd[2], sc_cmd->cmnd[3], sc_cmd->cmnd[4], + sc_cmd->cmnd[5], sc_cmd->allowed, sc_cmd->retries, + refcount); + + /* + * Set resid to the whole buffer length so we won't try to resue any + * previously read data + */ + scsi_set_resid(sc_cmd, scsi_bufflen(sc_cmd)); + + if (qedf_io_tracing) + qedf_trace_io(io_req->fcport, io_req, QEDF_IO_TRACE_RSP); + + io_req->sc_cmd = NULL; + sc_cmd->SCp.ptr = NULL; + sc_cmd->scsi_done(sc_cmd); + kref_put(&io_req->refcount, qedf_release_cmd); +} + +/* + * Handle warning type CQE completions. This is mainly used for REC timer + * popping. 
+ */ +void qedf_process_warning_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *io_req) +{ + int rval, i; + struct qedf_rport *fcport = io_req->fcport; + u64 err_warn_bit_map; + u8 err_warn = 0xff; + + if (!cqe) + return; + + QEDF_ERR(&(io_req->fcport->qedf->dbg_ctx), "Warning CQE, " + "xid=0x%x\n", io_req->xid); + QEDF_ERR(&(io_req->fcport->qedf->dbg_ctx), + "err_warn_bitmap=%08x:%08x\n", + le32_to_cpu(cqe->cqe_info.err_info.err_warn_bitmap_hi), + le32_to_cpu(cqe->cqe_info.err_info.err_warn_bitmap_lo)); + QEDF_ERR(&(io_req->fcport->qedf->dbg_ctx), "tx_buff_off=%08x, " + "rx_buff_off=%08x, rx_id=%04x\n", + le32_to_cpu(cqe->cqe_info.err_info.tx_buf_off), + le32_to_cpu(cqe->cqe_info.err_info.rx_buf_off), + le32_to_cpu(cqe->cqe_info.err_info.rx_id)); + + /* Normalize the error bitmap value to an just an unsigned int */ + err_warn_bit_map = (u64) + ((u64)cqe->cqe_info.err_info.err_warn_bitmap_hi << 32) | + (u64)cqe->cqe_info.err_info.err_warn_bitmap_lo; + for (i = 0; i < 64; i++) { + if (err_warn_bit_map & (u64)((u64)1 << i)) { + err_warn = i; + break; + } + } + + /* Check if REC TOV expired if this is a tape device */ + if (fcport->dev_type == QEDF_RPORT_TYPE_TAPE) { + if (err_warn == + FCOE_WARNING_CODE_REC_TOV_TIMER_EXPIRATION) { + QEDF_ERR(&(qedf->dbg_ctx), "REC timer expired.\n"); + if (!test_bit(QEDF_CMD_SRR_SENT, &io_req->flags)) { + io_req->rx_buf_off = + cqe->cqe_info.err_info.rx_buf_off; + io_req->tx_buf_off = + cqe->cqe_info.err_info.tx_buf_off; + io_req->rx_id = cqe->cqe_info.err_info.rx_id; + rval = qedf_send_rec(io_req); + /* + * We only want to abort the io_req if we + * can't queue the REC command as we want to + * keep the exchange open for recovery. + */ + if (rval) + goto send_abort; + } + return; + } + } + +send_abort: + init_completion(&io_req->abts_done); + rval = qedf_initiate_abts(io_req, true); + if (rval) + QEDF_ERR(&(qedf->dbg_ctx), "Failed to queue ABTS.\n"); +} + +/* Cleanup a command when we receive an error detection completion */ +void qedf_process_error_detect(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *io_req) +{ + int rval; + + if (!cqe) + return; + + QEDF_ERR(&(io_req->fcport->qedf->dbg_ctx), "Error detection CQE, " + "xid=0x%x\n", io_req->xid); + QEDF_ERR(&(io_req->fcport->qedf->dbg_ctx), + "err_warn_bitmap=%08x:%08x\n", + le32_to_cpu(cqe->cqe_info.err_info.err_warn_bitmap_hi), + le32_to_cpu(cqe->cqe_info.err_info.err_warn_bitmap_lo)); + QEDF_ERR(&(io_req->fcport->qedf->dbg_ctx), "tx_buff_off=%08x, " + "rx_buff_off=%08x, rx_id=%04x\n", + le32_to_cpu(cqe->cqe_info.err_info.tx_buf_off), + le32_to_cpu(cqe->cqe_info.err_info.rx_buf_off), + le32_to_cpu(cqe->cqe_info.err_info.rx_id)); + + if (qedf->stop_io_on_error) { + qedf_stop_all_io(qedf); + return; + } + + init_completion(&io_req->abts_done); + rval = qedf_initiate_abts(io_req, true); + if (rval) + QEDF_ERR(&(qedf->dbg_ctx), "Failed to queue ABTS.\n"); +} + +static void qedf_flush_els_req(struct qedf_ctx *qedf, + struct qedf_ioreq *els_req) +{ + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Flushing ELS request xid=0x%x refcount=%d.\n", els_req->xid, + kref_read(&els_req->refcount)); + + /* + * Need to distinguish this from a timeout when calling the + * els_req->cb_func. 
+ */ + els_req->event = QEDF_IOREQ_EV_ELS_FLUSH; + + /* Cancel the timer */ + cancel_delayed_work_sync(&els_req->timeout_work); + + /* Call callback function to complete command */ + if (els_req->cb_func && els_req->cb_arg) { + els_req->cb_func(els_req->cb_arg); + els_req->cb_arg = NULL; + } + + /* Release kref for original initiate_els */ + kref_put(&els_req->refcount, qedf_release_cmd); +} + +/* A value of -1 for lun is a wild card that means flush all + * active SCSI I/Os for the target. + */ +void qedf_flush_active_ios(struct qedf_rport *fcport, int lun) +{ + struct qedf_ioreq *io_req; + struct qedf_ctx *qedf; + struct qedf_cmd_mgr *cmd_mgr; + int i, rc; + + if (!fcport) + return; + + qedf = fcport->qedf; + cmd_mgr = qedf->cmd_mgr; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, "Flush active i/o's.\n"); + + for (i = 0; i < FCOE_PARAMS_NUM_TASKS; i++) { + io_req = &cmd_mgr->cmds[i]; + + if (!io_req) + continue; + if (io_req->fcport != fcport) + continue; + if (io_req->cmd_type == QEDF_ELS) { + rc = kref_get_unless_zero(&io_req->refcount); + if (!rc) { + QEDF_ERR(&(qedf->dbg_ctx), + "Could not get kref for io_req=0x%p.\n", + io_req); + continue; + } + qedf_flush_els_req(qedf, io_req); + /* + * Release the kref and go back to the top of the + * loop. + */ + goto free_cmd; + } + + if (!io_req->sc_cmd) + continue; + if (lun > 0) { + if (io_req->sc_cmd->device->lun != + (u64)lun) + continue; + } + + /* + * Use kref_get_unless_zero in the unlikely case the command + * we're about to flush was completed in the normal SCSI path + */ + rc = kref_get_unless_zero(&io_req->refcount); + if (!rc) { + QEDF_ERR(&(qedf->dbg_ctx), "Could not get kref for " + "io_req=0x%p\n", io_req); + continue; + } + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Cleanup xid=0x%x.\n", io_req->xid); + + /* Cleanup task and return I/O mid-layer */ + qedf_initiate_cleanup(io_req, true); + +free_cmd: + kref_put(&io_req->refcount, qedf_release_cmd); + } +} + +/* + * Initiate a ABTS middle path command. Note that we don't have to initialize + * the task context for an ABTS task. 
+ */ +int qedf_initiate_abts(struct qedf_ioreq *io_req, bool return_scsi_cmd_on_abts) +{ + struct fc_lport *lport; + struct qedf_rport *fcport = io_req->fcport; + struct fc_rport_priv *rdata; + struct qedf_ctx *qedf; + u16 xid; + u32 r_a_tov = 0; + int rc = 0; + unsigned long flags; + struct fcoe_wqe *sqe; + u16 sqe_idx; + + /* Sanity check qedf_rport before dereferencing any pointers */ + if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + QEDF_ERR(NULL, "tgt not offloaded\n"); + rc = 1; + goto abts_err; + } + + rdata = fcport->rdata; + r_a_tov = rdata->r_a_tov; + qedf = fcport->qedf; + lport = qedf->lport; + + if (lport->state != LPORT_ST_READY || !(lport->link_up)) { + QEDF_ERR(&(qedf->dbg_ctx), "link is not ready\n"); + rc = 1; + goto abts_err; + } + + if (atomic_read(&qedf->link_down_tmo_valid) > 0) { + QEDF_ERR(&(qedf->dbg_ctx), "link_down_tmo active.\n"); + rc = 1; + goto abts_err; + } + + /* Ensure room on SQ */ + if (!atomic_read(&fcport->free_sqes)) { + QEDF_ERR(&(qedf->dbg_ctx), "No SQ entries available\n"); + rc = 1; + goto abts_err; + } + + + kref_get(&io_req->refcount); + + xid = io_req->xid; + qedf->control_requests++; + qedf->packet_aborts++; + + /* Set the return CPU to be the same as the request one */ + io_req->cpu = smp_processor_id(); + + /* Set the command type to abort */ + io_req->cmd_type = QEDF_ABTS; + io_req->return_scsi_cmd_on_abts = return_scsi_cmd_on_abts; + + set_bit(QEDF_CMD_IN_ABORT, &io_req->flags); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_SCSI_TM, "ABTS io_req xid = " + "0x%x\n", xid); + + qedf_cmd_timer_set(qedf, io_req, QEDF_ABORT_TIMEOUT * HZ); + + spin_lock_irqsave(&fcport->rport_lock, flags); + + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + io_req->task_params->sqe = sqe; + + init_initiator_abort_fcoe_task(io_req->task_params); + qedf_ring_doorbell(fcport); + + spin_unlock_irqrestore(&fcport->rport_lock, flags); + + return rc; +abts_err: + /* + * If the ABTS task fails to queue then we need to cleanup the + * task at the firmware. + */ + qedf_initiate_cleanup(io_req, return_scsi_cmd_on_abts); + return rc; +} + +void qedf_process_abts_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *io_req) +{ + uint32_t r_ctl; + uint16_t xid; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_SCSI_TM, "Entered with xid = " + "0x%x cmd_type = %d\n", io_req->xid, io_req->cmd_type); + + cancel_delayed_work(&io_req->timeout_work); + + xid = io_req->xid; + r_ctl = cqe->cqe_info.abts_info.r_ctl; + + switch (r_ctl) { + case FC_RCTL_BA_ACC: + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_SCSI_TM, + "ABTS response - ACC Send RRQ after R_A_TOV\n"); + io_req->event = QEDF_IOREQ_EV_ABORT_SUCCESS; + /* + * Dont release this cmd yet. 
It will be relesed + * after we get RRQ response + */ + kref_get(&io_req->refcount); + queue_delayed_work(qedf->dpc_wq, &io_req->rrq_work, + msecs_to_jiffies(qedf->lport->r_a_tov)); + break; + /* For error cases let the cleanup return the command */ + case FC_RCTL_BA_RJT: + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_SCSI_TM, + "ABTS response - RJT\n"); + io_req->event = QEDF_IOREQ_EV_ABORT_FAILED; + break; + default: + QEDF_ERR(&(qedf->dbg_ctx), "Unknown ABTS response\n"); + break; + } + + clear_bit(QEDF_CMD_IN_ABORT, &io_req->flags); + + if (io_req->sc_cmd) { + if (io_req->return_scsi_cmd_on_abts) + qedf_scsi_done(qedf, io_req, DID_ERROR); + } + + /* Notify eh_abort handler that ABTS is complete */ + complete(&io_req->abts_done); + + kref_put(&io_req->refcount, qedf_release_cmd); +} + +int qedf_init_mp_req(struct qedf_ioreq *io_req) +{ + struct qedf_mp_req *mp_req; + struct scsi_sge *mp_req_bd; + struct scsi_sge *mp_resp_bd; + struct qedf_ctx *qedf = io_req->fcport->qedf; + dma_addr_t addr; + uint64_t sz; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_MP_REQ, "Entered.\n"); + + mp_req = (struct qedf_mp_req *)&(io_req->mp_req); + memset(mp_req, 0, sizeof(struct qedf_mp_req)); + + if (io_req->cmd_type != QEDF_ELS) { + mp_req->req_len = sizeof(struct fcp_cmnd); + io_req->data_xfer_len = mp_req->req_len; + } else + mp_req->req_len = io_req->data_xfer_len; + + mp_req->req_buf = dma_alloc_coherent(&qedf->pdev->dev, QEDF_PAGE_SIZE, + &mp_req->req_buf_dma, GFP_KERNEL); + if (!mp_req->req_buf) { + QEDF_ERR(&(qedf->dbg_ctx), "Unable to alloc MP req buffer\n"); + qedf_free_mp_resc(io_req); + return -ENOMEM; + } + + mp_req->resp_buf = dma_alloc_coherent(&qedf->pdev->dev, + QEDF_PAGE_SIZE, &mp_req->resp_buf_dma, GFP_KERNEL); + if (!mp_req->resp_buf) { + QEDF_ERR(&(qedf->dbg_ctx), "Unable to alloc TM resp " + "buffer\n"); + qedf_free_mp_resc(io_req); + return -ENOMEM; + } + + /* Allocate and map mp_req_bd and mp_resp_bd */ + sz = sizeof(struct scsi_sge); + mp_req->mp_req_bd = dma_alloc_coherent(&qedf->pdev->dev, sz, + &mp_req->mp_req_bd_dma, GFP_KERNEL); + if (!mp_req->mp_req_bd) { + QEDF_ERR(&(qedf->dbg_ctx), "Unable to alloc MP req bd\n"); + qedf_free_mp_resc(io_req); + return -ENOMEM; + } + + mp_req->mp_resp_bd = dma_alloc_coherent(&qedf->pdev->dev, sz, + &mp_req->mp_resp_bd_dma, GFP_KERNEL); + if (!mp_req->mp_resp_bd) { + QEDF_ERR(&(qedf->dbg_ctx), "Unable to alloc MP resp bd\n"); + qedf_free_mp_resc(io_req); + return -ENOMEM; + } + + /* Fill bd table */ + addr = mp_req->req_buf_dma; + mp_req_bd = mp_req->mp_req_bd; + mp_req_bd->sge_addr.lo = U64_LO(addr); + mp_req_bd->sge_addr.hi = U64_HI(addr); + mp_req_bd->sge_len = QEDF_PAGE_SIZE; + + /* + * MP buffer is either a task mgmt command or an ELS. + * So the assumption is that it consumes a single bd + * entry in the bd table + */ + mp_resp_bd = mp_req->mp_resp_bd; + addr = mp_req->resp_buf_dma; + mp_resp_bd->sge_addr.lo = U64_LO(addr); + mp_resp_bd->sge_addr.hi = U64_HI(addr); + mp_resp_bd->sge_len = QEDF_PAGE_SIZE; + + return 0; +} + +/* + * Last ditch effort to clear the port if it's stuck. Used only after a + * cleanup task times out. + */ +static void qedf_drain_request(struct qedf_ctx *qedf) +{ + if (test_bit(QEDF_DRAIN_ACTIVE, &qedf->flags)) { + QEDF_ERR(&(qedf->dbg_ctx), "MCP drain already active.\n"); + return; + } + + /* Set bit to return all queuecommand requests as busy */ + set_bit(QEDF_DRAIN_ACTIVE, &qedf->flags); + + /* Call qed drain request for function. 
Should be synchronous */ + qed_ops->common->drain(qedf->cdev); + + /* Settle time for CQEs to be returned */ + msleep(100); + + /* Unplug and continue */ + clear_bit(QEDF_DRAIN_ACTIVE, &qedf->flags); +} + +/* + * Returns SUCCESS if the cleanup task does not timeout, otherwise return + * FAILURE. + */ +int qedf_initiate_cleanup(struct qedf_ioreq *io_req, + bool return_scsi_cmd_on_abts) +{ + struct qedf_rport *fcport; + struct qedf_ctx *qedf; + uint16_t xid; + struct e4_fcoe_task_context *task; + int tmo = 0; + int rc = SUCCESS; + unsigned long flags; + struct fcoe_wqe *sqe; + u16 sqe_idx; + + fcport = io_req->fcport; + if (!fcport) { + QEDF_ERR(NULL, "fcport is NULL.\n"); + return SUCCESS; + } + + /* Sanity check qedf_rport before dereferencing any pointers */ + if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + QEDF_ERR(NULL, "tgt not offloaded\n"); + rc = 1; + return SUCCESS; + } + + qedf = fcport->qedf; + if (!qedf) { + QEDF_ERR(NULL, "qedf is NULL.\n"); + return SUCCESS; + } + + if (!test_bit(QEDF_CMD_OUTSTANDING, &io_req->flags) || + test_bit(QEDF_CMD_IN_CLEANUP, &io_req->flags)) { + QEDF_ERR(&(qedf->dbg_ctx), "io_req xid=0x%x already in " + "cleanup processing or already completed.\n", + io_req->xid); + return SUCCESS; + } + + /* Ensure room on SQ */ + if (!atomic_read(&fcport->free_sqes)) { + QEDF_ERR(&(qedf->dbg_ctx), "No SQ entries available\n"); + return FAILED; + } + + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, "Entered xid=0x%x\n", + io_req->xid); + + /* Cleanup cmds re-use the same TID as the original I/O */ + xid = io_req->xid; + io_req->cmd_type = QEDF_CLEANUP; + io_req->return_scsi_cmd_on_abts = return_scsi_cmd_on_abts; + + /* Set the return CPU to be the same as the request one */ + io_req->cpu = smp_processor_id(); + + set_bit(QEDF_CMD_IN_CLEANUP, &io_req->flags); + + task = qedf_get_task_mem(&qedf->tasks, xid); + + init_completion(&io_req->tm_done); + + spin_lock_irqsave(&fcport->rport_lock, flags); + + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + io_req->task_params->sqe = sqe; + + init_initiator_cleanup_fcoe_task(io_req->task_params); + qedf_ring_doorbell(fcport); + + spin_unlock_irqrestore(&fcport->rport_lock, flags); + + tmo = wait_for_completion_timeout(&io_req->tm_done, + QEDF_CLEANUP_TIMEOUT * HZ); + + if (!tmo) { + rc = FAILED; + /* Timeout case */ + QEDF_ERR(&(qedf->dbg_ctx), "Cleanup command timeout, " + "xid=%x.\n", io_req->xid); + clear_bit(QEDF_CMD_IN_CLEANUP, &io_req->flags); + /* Issue a drain request if cleanup task times out */ + QEDF_ERR(&(qedf->dbg_ctx), "Issuing MCP drain request.\n"); + qedf_drain_request(qedf); + } + + if (io_req->sc_cmd) { + if (io_req->return_scsi_cmd_on_abts) + qedf_scsi_done(qedf, io_req, DID_ERROR); + } + + if (rc == SUCCESS) + io_req->event = QEDF_IOREQ_EV_CLEANUP_SUCCESS; + else + io_req->event = QEDF_IOREQ_EV_CLEANUP_FAILED; + + return rc; +} + +void qedf_process_cleanup_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *io_req) +{ + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, "Entered xid = 0x%x\n", + io_req->xid); + + clear_bit(QEDF_CMD_IN_CLEANUP, &io_req->flags); + + /* Complete so we can finish cleaning up the I/O */ + complete(&io_req->tm_done); +} + +static int qedf_execute_tmf(struct qedf_rport *fcport, struct scsi_cmnd *sc_cmd, + uint8_t tm_flags) +{ + struct qedf_ioreq *io_req; + struct e4_fcoe_task_context *task; + struct qedf_ctx *qedf = fcport->qedf; + struct fc_lport *lport = qedf->lport; + int rc = 0; + uint16_t xid; + 
int tmo = 0; + unsigned long flags; + struct fcoe_wqe *sqe; + u16 sqe_idx; + + if (!sc_cmd) { + QEDF_ERR(&(qedf->dbg_ctx), "invalid arg\n"); + return FAILED; + } + + if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + QEDF_ERR(&(qedf->dbg_ctx), "fcport not offloaded\n"); + rc = FAILED; + return FAILED; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_SCSI_TM, "portid = 0x%x " + "tm_flags = %d\n", fcport->rdata->ids.port_id, tm_flags); + + io_req = qedf_alloc_cmd(fcport, QEDF_TASK_MGMT_CMD); + if (!io_req) { + QEDF_ERR(&(qedf->dbg_ctx), "Failed TMF"); + rc = -EAGAIN; + goto reset_tmf_err; + } + + /* Initialize rest of io_req fields */ + io_req->sc_cmd = sc_cmd; + io_req->fcport = fcport; + io_req->cmd_type = QEDF_TASK_MGMT_CMD; + + /* Set the return CPU to be the same as the request one */ + io_req->cpu = smp_processor_id(); + + /* Set TM flags */ + io_req->io_req_flags = QEDF_READ; + io_req->data_xfer_len = 0; + io_req->tm_flags = tm_flags; + + /* Default is to return a SCSI command when an error occurs */ + io_req->return_scsi_cmd_on_abts = true; + + /* Obtain exchange id */ + xid = io_req->xid; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_SCSI_TM, "TMF io_req xid = " + "0x%x\n", xid); + + /* Initialize task context for this IO request */ + task = qedf_get_task_mem(&qedf->tasks, xid); + + init_completion(&io_req->tm_done); + + spin_lock_irqsave(&fcport->rport_lock, flags); + + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + + qedf_init_task(fcport, lport, io_req, task, sqe); + qedf_ring_doorbell(fcport); + + spin_unlock_irqrestore(&fcport->rport_lock, flags); + + tmo = wait_for_completion_timeout(&io_req->tm_done, + QEDF_TM_TIMEOUT * HZ); + + if (!tmo) { + rc = FAILED; + QEDF_ERR(&(qedf->dbg_ctx), "wait for tm_cmpl timeout!\n"); + } else { + /* Check TMF response code */ + if (io_req->fcp_rsp_code == 0) + rc = SUCCESS; + else + rc = FAILED; + } + + if (tm_flags == FCP_TMF_LUN_RESET) + qedf_flush_active_ios(fcport, (int)sc_cmd->device->lun); + else + qedf_flush_active_ios(fcport, -1); + + kref_put(&io_req->refcount, qedf_release_cmd); + + if (rc != SUCCESS) { + QEDF_ERR(&(qedf->dbg_ctx), "task mgmt command failed...\n"); + rc = FAILED; + } else { + QEDF_ERR(&(qedf->dbg_ctx), "task mgmt command success...\n"); + rc = SUCCESS; + } +reset_tmf_err: + return rc; +} + +int qedf_initiate_tmf(struct scsi_cmnd *sc_cmd, u8 tm_flags) +{ + struct fc_rport *rport = starget_to_rport(scsi_target(sc_cmd->device)); + struct fc_rport_libfc_priv *rp = rport->dd_data; + struct qedf_rport *fcport = (struct qedf_rport *)&rp[1]; + struct qedf_ctx *qedf; + struct fc_lport *lport; + int rc = SUCCESS; + int rval; + + rval = fc_remote_port_chkready(rport); + + if (rval) { + QEDF_ERR(NULL, "device_reset rport not ready\n"); + rc = FAILED; + goto tmf_err; + } + + if (fcport == NULL) { + QEDF_ERR(NULL, "device_reset: rport is NULL\n"); + rc = FAILED; + goto tmf_err; + } + + qedf = fcport->qedf; + lport = qedf->lport; + + if (test_bit(QEDF_UNLOADING, &qedf->flags) || + test_bit(QEDF_DBG_STOP_IO, &qedf->flags)) { + rc = SUCCESS; + goto tmf_err; + } + + if (lport->state != LPORT_ST_READY || !(lport->link_up)) { + QEDF_ERR(&(qedf->dbg_ctx), "link is not ready\n"); + rc = FAILED; + goto tmf_err; + } + + rc = qedf_execute_tmf(fcport, sc_cmd, tm_flags); + +tmf_err: + return rc; +} + +void qedf_process_tmf_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, + struct qedf_ioreq *io_req) +{ + struct fcoe_cqe_rsp_info *fcp_rsp; + + fcp_rsp = &cqe->cqe_info.rsp_info; + 
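+	/*
+	 * Parse the FCP response from the CQE; qedf_execute_tmf() inspects
+	 * io_req->fcp_rsp_code once tm_done is completed below.
+	 */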
qedf_parse_fcp_rsp(io_req, fcp_rsp); + + io_req->sc_cmd = NULL; + complete(&io_req->tm_done); +} + +void qedf_process_unsol_compl(struct qedf_ctx *qedf, uint16_t que_idx, + struct fcoe_cqe *cqe) +{ + unsigned long flags; + uint16_t tmp; + uint16_t pktlen = cqe->cqe_info.unsolic_info.pkt_len; + u32 payload_len, crc; + struct fc_frame_header *fh; + struct fc_frame *fp; + struct qedf_io_work *io_work; + u32 bdq_idx; + void *bdq_addr; + struct scsi_bd *p_bd_info; + + p_bd_info = &cqe->cqe_info.unsolic_info.bd_info; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_UNSOL, + "address.hi=%x, address.lo=%x, opaque_data.hi=%x, opaque_data.lo=%x, bdq_prod_idx=%u, len=%u\n", + le32_to_cpu(p_bd_info->address.hi), + le32_to_cpu(p_bd_info->address.lo), + le32_to_cpu(p_bd_info->opaque.fcoe_opaque.hi), + le32_to_cpu(p_bd_info->opaque.fcoe_opaque.lo), + qedf->bdq_prod_idx, pktlen); + + bdq_idx = le32_to_cpu(p_bd_info->opaque.fcoe_opaque.lo); + if (bdq_idx >= QEDF_BDQ_SIZE) { + QEDF_ERR(&(qedf->dbg_ctx), "bdq_idx is out of range %d.\n", + bdq_idx); + goto increment_prod; + } + + bdq_addr = qedf->bdq[bdq_idx].buf_addr; + if (!bdq_addr) { + QEDF_ERR(&(qedf->dbg_ctx), "bdq_addr is NULL, dropping " + "unsolicited packet.\n"); + goto increment_prod; + } + + if (qedf_dump_frames) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_UNSOL, + "BDQ frame is at addr=%p.\n", bdq_addr); + print_hex_dump(KERN_WARNING, "bdq ", DUMP_PREFIX_OFFSET, 16, 1, + (void *)bdq_addr, pktlen, false); + } + + /* Allocate frame */ + payload_len = pktlen - sizeof(struct fc_frame_header); + fp = fc_frame_alloc(qedf->lport, payload_len); + if (!fp) { + QEDF_ERR(&(qedf->dbg_ctx), "Could not allocate fp.\n"); + goto increment_prod; + } + + /* Copy data from BDQ buffer into fc_frame struct */ + fh = (struct fc_frame_header *)fc_frame_header_get(fp); + memcpy(fh, (void *)bdq_addr, pktlen); + + /* Initialize the frame so libfc sees it as a valid frame */ + crc = fcoe_fc_crc(fp); + fc_frame_init(fp); + fr_dev(fp) = qedf->lport; + fr_sof(fp) = FC_SOF_I3; + fr_eof(fp) = FC_EOF_T; + fr_crc(fp) = cpu_to_le32(~crc); + + /* + * We need to return the frame back up to libfc in a non-atomic + * context + */ + io_work = mempool_alloc(qedf->io_mempool, GFP_ATOMIC); + if (!io_work) { + QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate " + "work for I/O completion.\n"); + fc_frame_free(fp); + goto increment_prod; + } + memset(io_work, 0, sizeof(struct qedf_io_work)); + + INIT_WORK(&io_work->work, qedf_fp_io_handler); + + /* Copy contents of CQE for deferred processing */ + memcpy(&io_work->cqe, cqe, sizeof(struct fcoe_cqe)); + + io_work->qedf = qedf; + io_work->fp = fp; + + queue_work_on(smp_processor_id(), qedf_io_wq, &io_work->work); +increment_prod: + spin_lock_irqsave(&qedf->hba_lock, flags); + + /* Increment producer to let f/w know we've handled the frame */ + qedf->bdq_prod_idx++; + + /* Producer index wraps at uint16_t boundary */ + if (qedf->bdq_prod_idx == 0xffff) + qedf->bdq_prod_idx = 0; + + writew(qedf->bdq_prod_idx, qedf->bdq_primary_prod); + tmp = readw(qedf->bdq_primary_prod); + writew(qedf->bdq_prod_idx, qedf->bdq_secondary_prod); + tmp = readw(qedf->bdq_secondary_prod); + + spin_unlock_irqrestore(&qedf->hba_lock, flags); +} diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c new file mode 100644 index 0000000..284ccb5 --- /dev/null +++ b/drivers/scsi/qedf/qedf_main.c @@ -0,0 +1,3498 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. 
+ * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qedf.h" +#include "qedf_dbg.h" +#include + +const struct qed_fcoe_ops *qed_ops; + +static int qedf_probe(struct pci_dev *pdev, const struct pci_device_id *id); +static void qedf_remove(struct pci_dev *pdev); + +/* + * Driver module parameters. + */ +static unsigned int qedf_dev_loss_tmo = 60; +module_param_named(dev_loss_tmo, qedf_dev_loss_tmo, int, S_IRUGO); +MODULE_PARM_DESC(dev_loss_tmo, " dev_loss_tmo setting for attached " + "remote ports (default 60)"); + +uint qedf_debug = QEDF_LOG_INFO; +module_param_named(debug, qedf_debug, uint, S_IRUGO); +MODULE_PARM_DESC(debug, " Debug mask. Pass '1' to enable default debugging" + " mask"); + +static uint qedf_fipvlan_retries = 30; +module_param_named(fipvlan_retries, qedf_fipvlan_retries, int, S_IRUGO); +MODULE_PARM_DESC(fipvlan_retries, " Number of FIP VLAN requests to attempt " + "before giving up (default 30)"); + +static uint qedf_fallback_vlan = QEDF_FALLBACK_VLAN; +module_param_named(fallback_vlan, qedf_fallback_vlan, int, S_IRUGO); +MODULE_PARM_DESC(fallback_vlan, " VLAN ID to try if fip vlan request fails " + "(default 1002)."); + +static uint qedf_default_prio = QEDF_DEFAULT_PRIO; +module_param_named(default_prio, qedf_default_prio, int, S_IRUGO); +MODULE_PARM_DESC(default_prio, " Default 802.1q priority for FIP and FCoE" + " traffic (default 3)."); + +uint qedf_dump_frames; +module_param_named(dump_frames, qedf_dump_frames, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dump_frames, " Print the skb data of FIP and FCoE frames " + "(default off)"); + +static uint qedf_queue_depth; +module_param_named(queue_depth, qedf_queue_depth, int, S_IRUGO); +MODULE_PARM_DESC(queue_depth, " Sets the queue depth for all LUNs discovered " + "by the qedf driver. Default is 0 (use OS default)."); + +uint qedf_io_tracing; +module_param_named(io_tracing, qedf_io_tracing, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(io_tracing, " Enable logging of SCSI requests/completions " + "into trace buffer. (default off)."); + +static uint qedf_max_lun = MAX_FIBRE_LUNS; +module_param_named(max_lun, qedf_max_lun, int, S_IRUGO); +MODULE_PARM_DESC(max_lun, " Sets the maximum luns per target that the driver " + "supports. 
(default 0xffffffff)"); + +uint qedf_link_down_tmo; +module_param_named(link_down_tmo, qedf_link_down_tmo, int, S_IRUGO); +MODULE_PARM_DESC(link_down_tmo, " Delays informing the fcoe transport that the " + "link is down by N seconds."); + +bool qedf_retry_delay; +module_param_named(retry_delay, qedf_retry_delay, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(retry_delay, " Enable/disable handling of FCP_RSP IU retry " + "delay handling (default off)."); + +static uint qedf_dp_module; +module_param_named(dp_module, qedf_dp_module, uint, S_IRUGO); +MODULE_PARM_DESC(dp_module, " bit flags control for verbose printk passed " + "qed module during probe."); + +static uint qedf_dp_level = QED_LEVEL_NOTICE; +module_param_named(dp_level, qedf_dp_level, uint, S_IRUGO); +MODULE_PARM_DESC(dp_level, " printk verbosity control passed to qed module " + "during probe (0-3: 0 more verbose)."); + +struct workqueue_struct *qedf_io_wq; + +static struct fcoe_percpu_s qedf_global; +static DEFINE_SPINLOCK(qedf_global_lock); + +static struct kmem_cache *qedf_io_work_cache; + +void qedf_set_vlan_id(struct qedf_ctx *qedf, int vlan_id) +{ + qedf->vlan_id = vlan_id; + qedf->vlan_id |= qedf_default_prio << VLAN_PRIO_SHIFT; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "Setting vlan_id=%04x " + "prio=%d.\n", vlan_id, qedf_default_prio); +} + +/* Returns true if we have a valid vlan, false otherwise */ +static bool qedf_initiate_fipvlan_req(struct qedf_ctx *qedf) +{ + int rc; + + if (atomic_read(&qedf->link_state) != QEDF_LINK_UP) { + QEDF_ERR(&(qedf->dbg_ctx), "Link not up.\n"); + return false; + } + + while (qedf->fipvlan_retries--) { + if (qedf->vlan_id > 0) + return true; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Retry %d.\n", qedf->fipvlan_retries); + init_completion(&qedf->fipvlan_compl); + qedf_fcoe_send_vlan_req(qedf); + rc = wait_for_completion_timeout(&qedf->fipvlan_compl, + 1 * HZ); + if (rc > 0) { + fcoe_ctlr_link_up(&qedf->ctlr); + return true; + } + } + + return false; +} + +static void qedf_handle_link_update(struct work_struct *work) +{ + struct qedf_ctx *qedf = + container_of(work, struct qedf_ctx, link_update.work); + int rc; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "Entered.\n"); + + if (atomic_read(&qedf->link_state) == QEDF_LINK_UP) { + rc = qedf_initiate_fipvlan_req(qedf); + if (rc) + return; + /* + * If we get here then we never received a repsonse to our + * fip vlan request so set the vlan_id to the default and + * tell FCoE that the link is up + */ + QEDF_WARN(&(qedf->dbg_ctx), "Did not receive FIP VLAN " + "response, falling back to default VLAN %d.\n", + qedf_fallback_vlan); + qedf_set_vlan_id(qedf, qedf_fallback_vlan); + + /* + * Zero out data_src_addr so we'll update it with the new + * lport port_id + */ + eth_zero_addr(qedf->data_src_addr); + fcoe_ctlr_link_up(&qedf->ctlr); + } else if (atomic_read(&qedf->link_state) == QEDF_LINK_DOWN) { + /* + * If we hit here and link_down_tmo_valid is still 1 it means + * that link_down_tmo timed out so set it to 0 to make sure any + * other readers have accurate state. 
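+ * On the next link up event qedf_link_update() and qedf_dcbx_handler()
+ * will then queue the normal link_update work instead of link_recovery.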
+ */ + atomic_set(&qedf->link_down_tmo_valid, 0); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Calling fcoe_ctlr_link_down().\n"); + fcoe_ctlr_link_down(&qedf->ctlr); + qedf_wait_for_upload(qedf); + /* Reset the number of FIP VLAN retries */ + qedf->fipvlan_retries = qedf_fipvlan_retries; + } +} + +#define QEDF_FCOE_MAC_METHOD_GRANGED_MAC 1 +#define QEDF_FCOE_MAC_METHOD_FCF_MAP 2 +#define QEDF_FCOE_MAC_METHOD_FCOE_SET_MAC 3 +static void qedf_set_data_src_addr(struct qedf_ctx *qedf, struct fc_frame *fp) +{ + u8 *granted_mac; + struct fc_frame_header *fh = fc_frame_header_get(fp); + u8 fc_map[3]; + int method = 0; + + /* Get granted MAC address from FIP FLOGI payload */ + granted_mac = fr_cb(fp)->granted_mac; + + /* + * We set the source MAC for FCoE traffic based on the Granted MAC + * address from the switch. + * + * If granted_mac is non-zero, we used that. + * If the granted_mac is zeroed out, created the FCoE MAC based on + * the sel_fcf->fc_map and the d_id fo the FLOGI frame. + * If sel_fcf->fc_map is 0 then we use the default FCF-MAC plus the + * d_id of the FLOGI frame. + */ + if (!is_zero_ether_addr(granted_mac)) { + ether_addr_copy(qedf->data_src_addr, granted_mac); + method = QEDF_FCOE_MAC_METHOD_GRANGED_MAC; + } else if (qedf->ctlr.sel_fcf->fc_map != 0) { + hton24(fc_map, qedf->ctlr.sel_fcf->fc_map); + qedf->data_src_addr[0] = fc_map[0]; + qedf->data_src_addr[1] = fc_map[1]; + qedf->data_src_addr[2] = fc_map[2]; + qedf->data_src_addr[3] = fh->fh_d_id[0]; + qedf->data_src_addr[4] = fh->fh_d_id[1]; + qedf->data_src_addr[5] = fh->fh_d_id[2]; + method = QEDF_FCOE_MAC_METHOD_FCF_MAP; + } else { + fc_fcoe_set_mac(qedf->data_src_addr, fh->fh_d_id); + method = QEDF_FCOE_MAC_METHOD_FCOE_SET_MAC; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "QEDF data_src_mac=%pM method=%d.\n", qedf->data_src_addr, method); +} + +static void qedf_flogi_resp(struct fc_seq *seq, struct fc_frame *fp, + void *arg) +{ + struct fc_exch *exch = fc_seq_exch(seq); + struct fc_lport *lport = exch->lp; + struct qedf_ctx *qedf = lport_priv(lport); + + if (!qedf) { + QEDF_ERR(NULL, "qedf is NULL.\n"); + return; + } + + /* + * If ERR_PTR is set then don't try to stat anything as it will cause + * a crash when we access fp. + */ + if (IS_ERR(fp)) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "fp has IS_ERR() set.\n"); + goto skip_stat; + } + + /* Log stats for FLOGI reject */ + if (fc_frame_payload_op(fp) == ELS_LS_RJT) + qedf->flogi_failed++; + else if (fc_frame_payload_op(fp) == ELS_LS_ACC) { + /* Set the source MAC we will use for FCoE traffic */ + qedf_set_data_src_addr(qedf, fp); + } + + /* Complete flogi_compl so we can proceed to sending ADISCs */ + complete(&qedf->flogi_compl); + +skip_stat: + /* Report response to libfc */ + fc_lport_flogi_resp(seq, fp, lport); +} + +static struct fc_seq *qedf_elsct_send(struct fc_lport *lport, u32 did, + struct fc_frame *fp, unsigned int op, + void (*resp)(struct fc_seq *, + struct fc_frame *, + void *), + void *arg, u32 timeout) +{ + struct qedf_ctx *qedf = lport_priv(lport); + + /* + * Intercept FLOGI for statistic purposes. Note we use the resp + * callback to tell if this is really a flogi. 
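+ * qedf_flogi_resp() then counts an LS_RJT as a failed FLOGI and, on an
+ * LS_ACC, records the granted MAC before handing the frame back to libfc.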
+ */ + if (resp == fc_lport_flogi_resp) { + qedf->flogi_cnt++; + return fc_elsct_send(lport, did, fp, op, qedf_flogi_resp, + arg, timeout); + } + + return fc_elsct_send(lport, did, fp, op, resp, arg, timeout); +} + +int qedf_send_flogi(struct qedf_ctx *qedf) +{ + struct fc_lport *lport; + struct fc_frame *fp; + + lport = qedf->lport; + + if (!lport->tt.elsct_send) + return -EINVAL; + + fp = fc_frame_alloc(lport, sizeof(struct fc_els_flogi)); + if (!fp) { + QEDF_ERR(&(qedf->dbg_ctx), "fc_frame_alloc failed.\n"); + return -ENOMEM; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, + "Sending FLOGI to reestablish session with switch.\n"); + lport->tt.elsct_send(lport, FC_FID_FLOGI, fp, + ELS_FLOGI, qedf_flogi_resp, lport, lport->r_a_tov); + + init_completion(&qedf->flogi_compl); + + return 0; +} + +struct qedf_tmp_rdata_item { + struct fc_rport_priv *rdata; + struct list_head list; +}; + +/* + * This function is called if link_down_tmo is in use. If we get a link up and + * link_down_tmo has not expired then use just FLOGI/ADISC to recover our + * sessions with targets. Otherwise, just call fcoe_ctlr_link_up(). + */ +static void qedf_link_recovery(struct work_struct *work) +{ + struct qedf_ctx *qedf = + container_of(work, struct qedf_ctx, link_recovery.work); + struct qedf_rport *fcport; + struct fc_rport_priv *rdata; + struct qedf_tmp_rdata_item *rdata_item, *tmp_rdata_item; + bool rc; + int retries = 30; + int rval, i; + struct list_head rdata_login_list; + + INIT_LIST_HEAD(&rdata_login_list); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Link down tmo did not expire.\n"); + + /* + * Essentially reset the fcoe_ctlr here without affecting the state + * of the libfc structs. + */ + qedf->ctlr.state = FIP_ST_LINK_WAIT; + fcoe_ctlr_link_down(&qedf->ctlr); + + /* + * Bring the link up before we send the fipvlan request so libfcoe + * can select a new fcf in parallel + */ + fcoe_ctlr_link_up(&qedf->ctlr); + + /* Since the link when down and up to verify which vlan we're on */ + qedf->fipvlan_retries = qedf_fipvlan_retries; + rc = qedf_initiate_fipvlan_req(qedf); + /* If getting the VLAN fails, set the VLAN to the fallback one */ + if (!rc) + qedf_set_vlan_id(qedf, qedf_fallback_vlan); + + /* + * We need to wait for an FCF to be selected due to the + * fcoe_ctlr_link_up other the FLOGI will be rejected. + */ + while (retries > 0) { + if (qedf->ctlr.sel_fcf) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "FCF reselected, proceeding with FLOGI.\n"); + break; + } + msleep(500); + retries--; + } + + if (retries < 1) { + QEDF_ERR(&(qedf->dbg_ctx), "Exhausted retries waiting for " + "FCF selection.\n"); + return; + } + + rval = qedf_send_flogi(qedf); + if (rval) + return; + + /* Wait for FLOGI completion before proceeding with sending ADISCs */ + i = wait_for_completion_timeout(&qedf->flogi_compl, + qedf->lport->r_a_tov); + if (i == 0) { + QEDF_ERR(&(qedf->dbg_ctx), "FLOGI timed out.\n"); + return; + } + + /* + * Call lport->tt.rport_login which will cause libfc to send an + * ADISC since the rport is in state ready. 
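+ * fcport->rdata may already be NULL if the session was torn down in the
+ * meantime, hence the check in the loop below.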
+ */ + rcu_read_lock(); + list_for_each_entry_rcu(fcport, &qedf->fcports, peers) { + rdata = fcport->rdata; + if (rdata == NULL) + continue; + rdata_item = kzalloc(sizeof(struct qedf_tmp_rdata_item), + GFP_ATOMIC); + if (!rdata_item) + continue; + if (kref_get_unless_zero(&rdata->kref)) { + rdata_item->rdata = rdata; + list_add(&rdata_item->list, &rdata_login_list); + } else + kfree(rdata_item); + } + rcu_read_unlock(); + /* + * Do the fc_rport_login outside of the rcu lock so we don't take a + * mutex in an atomic context. + */ + list_for_each_entry_safe(rdata_item, tmp_rdata_item, &rdata_login_list, + list) { + list_del(&rdata_item->list); + fc_rport_login(rdata_item->rdata); + kref_put(&rdata_item->rdata->kref, fc_rport_destroy); + kfree(rdata_item); + } +} + +static void qedf_update_link_speed(struct qedf_ctx *qedf, + struct qed_link_output *link) +{ + struct fc_lport *lport = qedf->lport; + + lport->link_speed = FC_PORTSPEED_UNKNOWN; + lport->link_supported_speeds = FC_PORTSPEED_UNKNOWN; + + /* Set fc_host link speed */ + switch (link->speed) { + case 10000: + lport->link_speed = FC_PORTSPEED_10GBIT; + break; + case 25000: + lport->link_speed = FC_PORTSPEED_25GBIT; + break; + case 40000: + lport->link_speed = FC_PORTSPEED_40GBIT; + break; + case 50000: + lport->link_speed = FC_PORTSPEED_50GBIT; + break; + case 100000: + lport->link_speed = FC_PORTSPEED_100GBIT; + break; + default: + lport->link_speed = FC_PORTSPEED_UNKNOWN; + break; + } + + /* + * Set supported link speed by querying the supported + * capabilities of the link. + */ + if (link->supported_caps & SUPPORTED_10000baseKR_Full) + lport->link_supported_speeds |= FC_PORTSPEED_10GBIT; + if (link->supported_caps & SUPPORTED_25000baseKR_Full) + lport->link_supported_speeds |= FC_PORTSPEED_25GBIT; + if (link->supported_caps & SUPPORTED_40000baseLR4_Full) + lport->link_supported_speeds |= FC_PORTSPEED_40GBIT; + if (link->supported_caps & SUPPORTED_50000baseKR2_Full) + lport->link_supported_speeds |= FC_PORTSPEED_50GBIT; + if (link->supported_caps & SUPPORTED_100000baseKR4_Full) + lport->link_supported_speeds |= FC_PORTSPEED_100GBIT; + fc_host_supported_speeds(lport->host) = lport->link_supported_speeds; +} + +static void qedf_link_update(void *dev, struct qed_link_output *link) +{ + struct qedf_ctx *qedf = (struct qedf_ctx *)dev; + + if (link->link_up) { + QEDF_ERR(&(qedf->dbg_ctx), "LINK UP (%d GB/s).\n", + link->speed / 1000); + + /* Cancel any pending link down work */ + cancel_delayed_work(&qedf->link_update); + + atomic_set(&qedf->link_state, QEDF_LINK_UP); + qedf_update_link_speed(qedf, link); + + if (atomic_read(&qedf->dcbx) == QEDF_DCBX_DONE) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "DCBx done.\n"); + if (atomic_read(&qedf->link_down_tmo_valid) > 0) + queue_delayed_work(qedf->link_update_wq, + &qedf->link_recovery, 0); + else + queue_delayed_work(qedf->link_update_wq, + &qedf->link_update, 0); + atomic_set(&qedf->link_down_tmo_valid, 0); + } + + } else { + QEDF_ERR(&(qedf->dbg_ctx), "LINK DOWN.\n"); + + atomic_set(&qedf->link_state, QEDF_LINK_DOWN); + atomic_set(&qedf->dcbx, QEDF_DCBX_PENDING); + /* + * Flag that we're waiting for the link to come back up before + * informing the fcoe layer of the event. 
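+ * The delayed link_update work queued below is what eventually calls
+ * fcoe_ctlr_link_down() via qedf_handle_link_update().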
+ */ + if (qedf_link_down_tmo > 0) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Starting link down tmo.\n"); + atomic_set(&qedf->link_down_tmo_valid, 1); + } + qedf->vlan_id = 0; + qedf_update_link_speed(qedf, link); + queue_delayed_work(qedf->link_update_wq, &qedf->link_update, + qedf_link_down_tmo * HZ); + } +} + + +static void qedf_dcbx_handler(void *dev, struct qed_dcbx_get *get, u32 mib_type) +{ + struct qedf_ctx *qedf = (struct qedf_ctx *)dev; + + QEDF_ERR(&(qedf->dbg_ctx), "DCBx event valid=%d enabled=%d fcoe " + "prio=%d.\n", get->operational.valid, get->operational.enabled, + get->operational.app_prio.fcoe); + + if (get->operational.enabled && get->operational.valid) { + /* If DCBX was already negotiated on link up then just exit */ + if (atomic_read(&qedf->dcbx) == QEDF_DCBX_DONE) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "DCBX already set on link up.\n"); + return; + } + + atomic_set(&qedf->dcbx, QEDF_DCBX_DONE); + + if (atomic_read(&qedf->link_state) == QEDF_LINK_UP) { + if (atomic_read(&qedf->link_down_tmo_valid) > 0) + queue_delayed_work(qedf->link_update_wq, + &qedf->link_recovery, 0); + else + queue_delayed_work(qedf->link_update_wq, + &qedf->link_update, 0); + atomic_set(&qedf->link_down_tmo_valid, 0); + } + } + +} + +static u32 qedf_get_login_failures(void *cookie) +{ + struct qedf_ctx *qedf; + + qedf = (struct qedf_ctx *)cookie; + return qedf->flogi_failed; +} + +static struct qed_fcoe_cb_ops qedf_cb_ops = { + { + .link_update = qedf_link_update, + .dcbx_aen = qedf_dcbx_handler, + } +}; + +/* + * Various transport templates. + */ + +static struct scsi_transport_template *qedf_fc_transport_template; +static struct scsi_transport_template *qedf_fc_vport_transport_template; + +/* + * SCSI EH handlers + */ +static int qedf_eh_abort(struct scsi_cmnd *sc_cmd) +{ + struct fc_rport *rport = starget_to_rport(scsi_target(sc_cmd->device)); + struct fc_rport_libfc_priv *rp = rport->dd_data; + struct qedf_rport *fcport; + struct fc_lport *lport; + struct qedf_ctx *qedf; + struct qedf_ioreq *io_req; + int rc = FAILED; + int rval; + + if (fc_remote_port_chkready(rport)) { + QEDF_ERR(NULL, "rport not ready\n"); + goto out; + } + + lport = shost_priv(sc_cmd->device->host); + qedf = (struct qedf_ctx *)lport_priv(lport); + + if ((lport->state != LPORT_ST_READY) || !(lport->link_up)) { + QEDF_ERR(&(qedf->dbg_ctx), "link not ready.\n"); + goto out; + } + + fcport = (struct qedf_rport *)&rp[1]; + + io_req = (struct qedf_ioreq *)sc_cmd->SCp.ptr; + if (!io_req) { + QEDF_ERR(&(qedf->dbg_ctx), "io_req is NULL.\n"); + rc = SUCCESS; + goto out; + } + + if (!test_bit(QEDF_CMD_OUTSTANDING, &io_req->flags) || + test_bit(QEDF_CMD_IN_CLEANUP, &io_req->flags) || + test_bit(QEDF_CMD_IN_ABORT, &io_req->flags)) { + QEDF_ERR(&(qedf->dbg_ctx), "io_req xid=0x%x already in " + "cleanup or abort processing or already " + "completed.\n", io_req->xid); + rc = SUCCESS; + goto out; + } + + QEDF_ERR(&(qedf->dbg_ctx), "Aborting io_req sc_cmd=%p xid=0x%x " + "fp_idx=%d.\n", sc_cmd, io_req->xid, io_req->fp_idx); + + if (qedf->stop_io_on_error) { + qedf_stop_all_io(qedf); + rc = SUCCESS; + goto out; + } + + init_completion(&io_req->abts_done); + rval = qedf_initiate_abts(io_req, true); + if (rval) { + QEDF_ERR(&(qedf->dbg_ctx), "Failed to queue ABTS.\n"); + goto out; + } + + wait_for_completion(&io_req->abts_done); + + if (io_req->event == QEDF_IOREQ_EV_ABORT_SUCCESS || + io_req->event == QEDF_IOREQ_EV_ABORT_FAILED || + io_req->event == QEDF_IOREQ_EV_CLEANUP_SUCCESS) { + /* + * If we get a reponse to the 
abort this is success from + * the perspective that all references to the command have + * been removed from the driver and firmware + */ + rc = SUCCESS; + } else { + /* If the abort and cleanup failed then return a failure */ + rc = FAILED; + } + + if (rc == SUCCESS) + QEDF_ERR(&(qedf->dbg_ctx), "ABTS succeeded, xid=0x%x.\n", + io_req->xid); + else + QEDF_ERR(&(qedf->dbg_ctx), "ABTS failed, xid=0x%x.\n", + io_req->xid); + +out: + return rc; +} + +static int qedf_eh_target_reset(struct scsi_cmnd *sc_cmd) +{ + QEDF_ERR(NULL, "TARGET RESET Issued..."); + return qedf_initiate_tmf(sc_cmd, FCP_TMF_TGT_RESET); +} + +static int qedf_eh_device_reset(struct scsi_cmnd *sc_cmd) +{ + QEDF_ERR(NULL, "LUN RESET Issued...\n"); + return qedf_initiate_tmf(sc_cmd, FCP_TMF_LUN_RESET); +} + +void qedf_wait_for_upload(struct qedf_ctx *qedf) +{ + while (1) { + if (atomic_read(&qedf->num_offloads)) + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Waiting for all uploads to complete.\n"); + else + break; + msleep(500); + } +} + +/* Performs soft reset of qedf_ctx by simulating a link down/up */ +static void qedf_ctx_soft_reset(struct fc_lport *lport) +{ + struct qedf_ctx *qedf; + + if (lport->vport) { + QEDF_ERR(NULL, "Cannot issue host reset on NPIV port.\n"); + return; + } + + qedf = lport_priv(lport); + + /* For host reset, essentially do a soft link up/down */ + atomic_set(&qedf->link_state, QEDF_LINK_DOWN); + atomic_set(&qedf->dcbx, QEDF_DCBX_PENDING); + queue_delayed_work(qedf->link_update_wq, &qedf->link_update, + 0); + qedf_wait_for_upload(qedf); + atomic_set(&qedf->link_state, QEDF_LINK_UP); + qedf->vlan_id = 0; + queue_delayed_work(qedf->link_update_wq, &qedf->link_update, + 0); +} + +/* Reset the host by gracefully logging out and then logging back in */ +static int qedf_eh_host_reset(struct scsi_cmnd *sc_cmd) +{ + struct fc_lport *lport; + struct qedf_ctx *qedf; + + lport = shost_priv(sc_cmd->device->host); + qedf = lport_priv(lport); + + if (atomic_read(&qedf->link_state) == QEDF_LINK_DOWN || + test_bit(QEDF_UNLOADING, &qedf->flags)) + return FAILED; + + QEDF_ERR(&(qedf->dbg_ctx), "HOST RESET Issued..."); + + qedf_ctx_soft_reset(lport); + + return SUCCESS; +} + +static int qedf_slave_configure(struct scsi_device *sdev) +{ + if (qedf_queue_depth) { + scsi_change_queue_depth(sdev, qedf_queue_depth); + } + + return 0; +} + +static struct scsi_host_template qedf_host_template = { + .module = THIS_MODULE, + .name = QEDF_MODULE_NAME, + .this_id = -1, + .cmd_per_lun = 32, + .use_clustering = ENABLE_CLUSTERING, + .max_sectors = 0xffff, + .queuecommand = qedf_queuecommand, + .shost_attrs = qedf_host_attrs, + .eh_abort_handler = qedf_eh_abort, + .eh_device_reset_handler = qedf_eh_device_reset, /* lun reset */ + .eh_target_reset_handler = qedf_eh_target_reset, /* target reset */ + .eh_host_reset_handler = qedf_eh_host_reset, + .slave_configure = qedf_slave_configure, + .dma_boundary = QED_HW_DMA_BOUNDARY, + .sg_tablesize = QEDF_MAX_BDS_PER_CMD, + .can_queue = FCOE_PARAMS_NUM_TASKS, + .change_queue_depth = scsi_change_queue_depth, +}; + +static int qedf_get_paged_crc_eof(struct sk_buff *skb, int tlen) +{ + int rc; + + spin_lock(&qedf_global_lock); + rc = fcoe_get_paged_crc_eof(skb, tlen, &qedf_global); + spin_unlock(&qedf_global_lock); + + return rc; +} + +static struct qedf_rport *qedf_fcport_lookup(struct qedf_ctx *qedf, u32 port_id) +{ + struct qedf_rport *fcport; + struct fc_rport_priv *rdata; + + rcu_read_lock(); + list_for_each_entry_rcu(fcport, &qedf->fcports, peers) { + rdata = fcport->rdata; + if (rdata 
== NULL) + continue; + if (rdata->ids.port_id == port_id) { + rcu_read_unlock(); + return fcport; + } + } + rcu_read_unlock(); + + /* Return NULL to caller to let them know fcport was not found */ + return NULL; +} + +/* Transmits an ELS frame over an offloaded session */ +static int qedf_xmit_l2_frame(struct qedf_rport *fcport, struct fc_frame *fp) +{ + struct fc_frame_header *fh; + int rc = 0; + + fh = fc_frame_header_get(fp); + if ((fh->fh_type == FC_TYPE_ELS) && + (fh->fh_r_ctl == FC_RCTL_ELS_REQ)) { + switch (fc_frame_payload_op(fp)) { + case ELS_ADISC: + qedf_send_adisc(fcport, fp); + rc = 1; + break; + } + } + + return rc; +} + +/** + * qedf_xmit - qedf FCoE frame transmit function + * + */ +static int qedf_xmit(struct fc_lport *lport, struct fc_frame *fp) +{ + struct fc_lport *base_lport; + struct qedf_ctx *qedf; + struct ethhdr *eh; + struct fcoe_crc_eof *cp; + struct sk_buff *skb; + struct fc_frame_header *fh; + struct fcoe_hdr *hp; + u8 sof, eof; + u32 crc; + unsigned int hlen, tlen, elen; + int wlen; + struct fc_stats *stats; + struct fc_lport *tmp_lport; + struct fc_lport *vn_port = NULL; + struct qedf_rport *fcport; + int rc; + u16 vlan_tci = 0; + + qedf = (struct qedf_ctx *)lport_priv(lport); + + fh = fc_frame_header_get(fp); + skb = fp_skb(fp); + + /* Filter out traffic to other NPIV ports on the same host */ + if (lport->vport) + base_lport = shost_priv(vport_to_shost(lport->vport)); + else + base_lport = lport; + + /* Flag if the destination is the base port */ + if (base_lport->port_id == ntoh24(fh->fh_d_id)) { + vn_port = base_lport; + } else { + /* Got through the list of vports attached to the base_lport + * and see if we have a match with the destination address. + */ + list_for_each_entry(tmp_lport, &base_lport->vports, list) { + if (tmp_lport->port_id == ntoh24(fh->fh_d_id)) { + vn_port = tmp_lport; + break; + } + } + } + if (vn_port && ntoh24(fh->fh_d_id) != FC_FID_FLOGI) { + struct fc_rport_priv *rdata = NULL; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, + "Dropping FCoE frame to %06x.\n", ntoh24(fh->fh_d_id)); + kfree_skb(skb); + rdata = fc_rport_lookup(lport, ntoh24(fh->fh_d_id)); + if (rdata) + rdata->retries = lport->max_rport_retry_count; + return -EINVAL; + } + /* End NPIV filtering */ + + if (!qedf->ctlr.sel_fcf) { + kfree_skb(skb); + return 0; + } + + if (!test_bit(QEDF_LL2_STARTED, &qedf->flags)) { + QEDF_WARN(&(qedf->dbg_ctx), "LL2 not started\n"); + kfree_skb(skb); + return 0; + } + + if (atomic_read(&qedf->link_state) != QEDF_LINK_UP) { + QEDF_WARN(&(qedf->dbg_ctx), "qedf link down\n"); + kfree_skb(skb); + return 0; + } + + if (unlikely(fh->fh_r_ctl == FC_RCTL_ELS_REQ)) { + if (fcoe_ctlr_els_send(&qedf->ctlr, lport, skb)) + return 0; + } + + /* Check to see if this needs to be sent on an offloaded session */ + fcport = qedf_fcport_lookup(qedf, ntoh24(fh->fh_d_id)); + + if (fcport && test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + rc = qedf_xmit_l2_frame(fcport, fp); + /* + * If the frame was successfully sent over the middle path + * then do not try to also send it over the LL2 path + */ + if (rc) + return 0; + } + + sof = fr_sof(fp); + eof = fr_eof(fp); + + elen = sizeof(struct ethhdr); + hlen = sizeof(struct fcoe_hdr); + tlen = sizeof(struct fcoe_crc_eof); + wlen = (skb->len - tlen + sizeof(crc)) / FCOE_WORD_TO_BYTE; + + skb->ip_summed = CHECKSUM_NONE; + crc = fcoe_fc_crc(fp); + + /* copy port crc and eof to the skb buff */ + if (skb_is_nonlinear(skb)) { + skb_frag_t *frag; + + if (qedf_get_paged_crc_eof(skb, tlen)) { + kfree_skb(skb); + 
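+ /* No page available for the CRC/EOF trailer; drop the frame */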
return -ENOMEM; + } + frag = &skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags - 1]; + cp = kmap_atomic(skb_frag_page(frag)) + frag->page_offset; + } else { + cp = skb_put(skb, tlen); + } + + memset(cp, 0, sizeof(*cp)); + cp->fcoe_eof = eof; + cp->fcoe_crc32 = cpu_to_le32(~crc); + if (skb_is_nonlinear(skb)) { + kunmap_atomic(cp); + cp = NULL; + } + + + /* adjust skb network/transport offsets to match mac/fcoe/port */ + skb_push(skb, elen + hlen); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb->mac_len = elen; + skb->protocol = htons(ETH_P_FCOE); + + /* + * Add VLAN tag to non-offload FCoE frame based on current stored VLAN + * for FIP/FCoE traffic. + */ + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), qedf->vlan_id); + + /* fill up mac and fcoe headers */ + eh = eth_hdr(skb); + eh->h_proto = htons(ETH_P_FCOE); + if (qedf->ctlr.map_dest) + fc_fcoe_set_mac(eh->h_dest, fh->fh_d_id); + else + /* insert GW address */ + ether_addr_copy(eh->h_dest, qedf->ctlr.dest_addr); + + /* Set the source MAC address */ + ether_addr_copy(eh->h_source, qedf->data_src_addr); + + hp = (struct fcoe_hdr *)(eh + 1); + memset(hp, 0, sizeof(*hp)); + if (FC_FCOE_VER) + FC_FCOE_ENCAPS_VER(hp, FC_FCOE_VER); + hp->fcoe_sof = sof; + + /*update tx stats */ + stats = per_cpu_ptr(lport->stats, get_cpu()); + stats->TxFrames++; + stats->TxWords += wlen; + put_cpu(); + + /* Get VLAN ID from skb for printing purposes */ + __vlan_hwaccel_get_tag(skb, &vlan_tci); + + /* send down to lld */ + fr_dev(fp) = lport; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, "FCoE frame send: " + "src=%06x dest=%06x r_ctl=%x type=%x vlan=%04x.\n", + ntoh24(fh->fh_s_id), ntoh24(fh->fh_d_id), fh->fh_r_ctl, fh->fh_type, + vlan_tci); + if (qedf_dump_frames) + print_hex_dump(KERN_WARNING, "fcoe: ", DUMP_PREFIX_OFFSET, 16, + 1, skb->data, skb->len, false); + qed_ops->ll2->start_xmit(qedf->cdev, skb); + + return 0; +} + +static int qedf_alloc_sq(struct qedf_ctx *qedf, struct qedf_rport *fcport) +{ + int rval = 0; + u32 *pbl; + dma_addr_t page; + int num_pages; + + /* Calculate appropriate queue and PBL sizes */ + fcport->sq_mem_size = SQ_NUM_ENTRIES * sizeof(struct fcoe_wqe); + fcport->sq_mem_size = ALIGN(fcport->sq_mem_size, QEDF_PAGE_SIZE); + fcport->sq_pbl_size = (fcport->sq_mem_size / QEDF_PAGE_SIZE) * + sizeof(void *); + fcport->sq_pbl_size = fcport->sq_pbl_size + QEDF_PAGE_SIZE; + + fcport->sq = dma_zalloc_coherent(&qedf->pdev->dev, + fcport->sq_mem_size, &fcport->sq_dma, GFP_KERNEL); + if (!fcport->sq) { + QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate send queue.\n"); + rval = 1; + goto out; + } + + fcport->sq_pbl = dma_zalloc_coherent(&qedf->pdev->dev, + fcport->sq_pbl_size, &fcport->sq_pbl_dma, GFP_KERNEL); + if (!fcport->sq_pbl) { + QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate send queue PBL.\n"); + rval = 1; + goto out_free_sq; + } + + /* Create PBL */ + num_pages = fcport->sq_mem_size / QEDF_PAGE_SIZE; + page = fcport->sq_dma; + pbl = (u32 *)fcport->sq_pbl; + + while (num_pages--) { + *pbl = U64_LO(page); + pbl++; + *pbl = U64_HI(page); + pbl++; + page += QEDF_PAGE_SIZE; + } + + return rval; + +out_free_sq: + dma_free_coherent(&qedf->pdev->dev, fcport->sq_mem_size, fcport->sq, + fcport->sq_dma); +out: + return rval; +} + +static void qedf_free_sq(struct qedf_ctx *qedf, struct qedf_rport *fcport) +{ + if (fcport->sq_pbl) + dma_free_coherent(&qedf->pdev->dev, fcport->sq_pbl_size, + fcport->sq_pbl, fcport->sq_pbl_dma); + if (fcport->sq) + dma_free_coherent(&qedf->pdev->dev, fcport->sq_mem_size, + fcport->sq, 
fcport->sq_dma); +} + +static int qedf_offload_connection(struct qedf_ctx *qedf, + struct qedf_rport *fcport) +{ + struct qed_fcoe_params_offload conn_info; + u32 port_id; + int rval; + uint16_t total_sqe = (fcport->sq_mem_size / sizeof(struct fcoe_wqe)); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "Offloading connection " + "portid=%06x.\n", fcport->rdata->ids.port_id); + rval = qed_ops->acquire_conn(qedf->cdev, &fcport->handle, + &fcport->fw_cid, &fcport->p_doorbell); + if (rval) { + QEDF_WARN(&(qedf->dbg_ctx), "Could not acquire connection " + "for portid=%06x.\n", fcport->rdata->ids.port_id); + rval = 1; /* For some reason qed returns 0 on failure here */ + goto out; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "portid=%06x " + "fw_cid=%08x handle=%d.\n", fcport->rdata->ids.port_id, + fcport->fw_cid, fcport->handle); + + memset(&conn_info, 0, sizeof(struct qed_fcoe_params_offload)); + + /* Fill in the offload connection info */ + conn_info.sq_pbl_addr = fcport->sq_pbl_dma; + + conn_info.sq_curr_page_addr = (dma_addr_t)(*(u64 *)fcport->sq_pbl); + conn_info.sq_next_page_addr = + (dma_addr_t)(*(u64 *)(fcport->sq_pbl + 8)); + + /* Need to use our FCoE MAC for the offload session */ + ether_addr_copy(conn_info.src_mac, qedf->data_src_addr); + + ether_addr_copy(conn_info.dst_mac, qedf->ctlr.dest_addr); + + conn_info.tx_max_fc_pay_len = fcport->rdata->maxframe_size; + conn_info.e_d_tov_timer_val = qedf->lport->e_d_tov / 20; + conn_info.rec_tov_timer_val = 3; /* I think this is what E3 was */ + conn_info.rx_max_fc_pay_len = fcport->rdata->maxframe_size; + + /* Set VLAN data */ + conn_info.vlan_tag = qedf->vlan_id << + FCOE_CONN_OFFLOAD_RAMROD_DATA_VLAN_ID_SHIFT; + conn_info.vlan_tag |= + qedf_default_prio << FCOE_CONN_OFFLOAD_RAMROD_DATA_PRIORITY_SHIFT; + conn_info.flags |= (FCOE_CONN_OFFLOAD_RAMROD_DATA_B_VLAN_FLAG_MASK << + FCOE_CONN_OFFLOAD_RAMROD_DATA_B_VLAN_FLAG_SHIFT); + + /* Set host port source id */ + port_id = fc_host_port_id(qedf->lport->host); + fcport->sid = port_id; + conn_info.s_id.addr_hi = (port_id & 0x000000FF); + conn_info.s_id.addr_mid = (port_id & 0x0000FF00) >> 8; + conn_info.s_id.addr_lo = (port_id & 0x00FF0000) >> 16; + + conn_info.max_conc_seqs_c3 = fcport->rdata->max_seq; + + /* Set remote port destination id */ + port_id = fcport->rdata->rport->port_id; + conn_info.d_id.addr_hi = (port_id & 0x000000FF); + conn_info.d_id.addr_mid = (port_id & 0x0000FF00) >> 8; + conn_info.d_id.addr_lo = (port_id & 0x00FF0000) >> 16; + + conn_info.def_q_idx = 0; /* Default index for send queue? */ + + /* Set FC-TAPE specific flags if needed */ + if (fcport->dev_type == QEDF_RPORT_TYPE_TAPE) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, + "Enable CONF, REC for portid=%06x.\n", + fcport->rdata->ids.port_id); + conn_info.flags |= 1 << + FCOE_CONN_OFFLOAD_RAMROD_DATA_B_CONF_REQ_SHIFT; + conn_info.flags |= + ((fcport->rdata->sp_features & FC_SP_FT_SEQC) ? 
1 : 0) << + FCOE_CONN_OFFLOAD_RAMROD_DATA_B_REC_VALID_SHIFT; + } + + rval = qed_ops->offload_conn(qedf->cdev, fcport->handle, &conn_info); + if (rval) { + QEDF_WARN(&(qedf->dbg_ctx), "Could not offload connection " + "for portid=%06x.\n", fcport->rdata->ids.port_id); + goto out_free_conn; + } else + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "Offload " + "succeeded portid=%06x total_sqe=%d.\n", + fcport->rdata->ids.port_id, total_sqe); + + spin_lock_init(&fcport->rport_lock); + atomic_set(&fcport->free_sqes, total_sqe); + return 0; +out_free_conn: + qed_ops->release_conn(qedf->cdev, fcport->handle); +out: + return rval; +} + +#define QEDF_TERM_BUFF_SIZE 10 +static void qedf_upload_connection(struct qedf_ctx *qedf, + struct qedf_rport *fcport) +{ + void *term_params; + dma_addr_t term_params_dma; + + /* Term params needs to be a DMA coherent buffer as qed shared the + * physical DMA address with the firmware. The buffer may be used in + * the receive path so we may eventually have to move this. + */ + term_params = dma_alloc_coherent(&qedf->pdev->dev, QEDF_TERM_BUFF_SIZE, + &term_params_dma, GFP_KERNEL); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "Uploading connection " + "port_id=%06x.\n", fcport->rdata->ids.port_id); + + qed_ops->destroy_conn(qedf->cdev, fcport->handle, term_params_dma); + qed_ops->release_conn(qedf->cdev, fcport->handle); + + dma_free_coherent(&qedf->pdev->dev, QEDF_TERM_BUFF_SIZE, term_params, + term_params_dma); +} + +static void qedf_cleanup_fcport(struct qedf_ctx *qedf, + struct qedf_rport *fcport) +{ + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "Cleaning up portid=%06x.\n", + fcport->rdata->ids.port_id); + + /* Flush any remaining i/o's before we upload the connection */ + qedf_flush_active_ios(fcport, -1); + + if (test_and_clear_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) + qedf_upload_connection(qedf, fcport); + qedf_free_sq(qedf, fcport); + fcport->rdata = NULL; + fcport->qedf = NULL; +} + +/** + * This event_callback is called after successful completion of libfc + * initiated target login. qedf can proceed with initiating the session + * establishment. + */ +static void qedf_rport_event_handler(struct fc_lport *lport, + struct fc_rport_priv *rdata, + enum fc_rport_event event) +{ + struct qedf_ctx *qedf = lport_priv(lport); + struct fc_rport *rport = rdata->rport; + struct fc_rport_libfc_priv *rp; + struct qedf_rport *fcport; + u32 port_id; + int rval; + unsigned long flags; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "event = %d, " + "port_id = 0x%x\n", event, rdata->ids.port_id); + + switch (event) { + case RPORT_EV_READY: + if (!rport) { + QEDF_WARN(&(qedf->dbg_ctx), "rport is NULL.\n"); + break; + } + + rp = rport->dd_data; + fcport = (struct qedf_rport *)&rp[1]; + fcport->qedf = qedf; + + if (atomic_read(&qedf->num_offloads) >= QEDF_MAX_SESSIONS) { + QEDF_ERR(&(qedf->dbg_ctx), "Not offloading " + "portid=0x%x as max number of offloaded sessions " + "reached.\n", rdata->ids.port_id); + return; + } + + /* + * Don't try to offload the session again. Can happen when we + * get an ADISC + */ + if (test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + QEDF_WARN(&(qedf->dbg_ctx), "Session already " + "offloaded, portid=0x%x.\n", + rdata->ids.port_id); + return; + } + + if (rport->port_id == FC_FID_DIR_SERV) { + /* + * qedf_rport structure doesn't exist for + * directory server. 
+ * We should not come here, as lport will + * take care of fabric login + */ + QEDF_WARN(&(qedf->dbg_ctx), "rport struct does not " + "exist for dir server port_id=%x\n", + rdata->ids.port_id); + break; + } + + if (rdata->spp_type != FC_TYPE_FCP) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Not offloading since spp type isn't FCP\n"); + break; + } + if (!(rdata->ids.roles & FC_RPORT_ROLE_FCP_TARGET)) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Not FCP target so not offloading\n"); + break; + } + + fcport->rdata = rdata; + fcport->rport = rport; + + rval = qedf_alloc_sq(qedf, fcport); + if (rval) { + qedf_cleanup_fcport(qedf, fcport); + break; + } + + /* Set device type */ + if (rdata->flags & FC_RP_FLAGS_RETRY && + rdata->ids.roles & FC_RPORT_ROLE_FCP_TARGET && + !(rdata->ids.roles & FC_RPORT_ROLE_FCP_INITIATOR)) { + fcport->dev_type = QEDF_RPORT_TYPE_TAPE; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "portid=%06x is a TAPE device.\n", + rdata->ids.port_id); + } else { + fcport->dev_type = QEDF_RPORT_TYPE_DISK; + } + + rval = qedf_offload_connection(qedf, fcport); + if (rval) { + qedf_cleanup_fcport(qedf, fcport); + break; + } + + /* Add fcport to list of qedf_ctx list of offloaded ports */ + spin_lock_irqsave(&qedf->hba_lock, flags); + list_add_rcu(&fcport->peers, &qedf->fcports); + spin_unlock_irqrestore(&qedf->hba_lock, flags); + + /* + * Set the session ready bit to let everyone know that this + * connection is ready for I/O + */ + set_bit(QEDF_RPORT_SESSION_READY, &fcport->flags); + atomic_inc(&qedf->num_offloads); + + break; + case RPORT_EV_LOGO: + case RPORT_EV_FAILED: + case RPORT_EV_STOP: + port_id = rdata->ids.port_id; + if (port_id == FC_FID_DIR_SERV) + break; + + if (!rport) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "port_id=%x - rport notcreated Yet!!\n", port_id); + break; + } + rp = rport->dd_data; + /* + * Perform session upload. Note that rdata->peers is already + * removed from disc->rports list before we get this event. + */ + fcport = (struct qedf_rport *)&rp[1]; + + /* Only free this fcport if it is offloaded already */ + if (test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + set_bit(QEDF_RPORT_UPLOADING_CONNECTION, &fcport->flags); + qedf_cleanup_fcport(qedf, fcport); + + /* + * Remove fcport to list of qedf_ctx list of offloaded + * ports + */ + spin_lock_irqsave(&qedf->hba_lock, flags); + list_del_rcu(&fcport->peers); + spin_unlock_irqrestore(&qedf->hba_lock, flags); + + clear_bit(QEDF_RPORT_UPLOADING_CONNECTION, + &fcport->flags); + atomic_dec(&qedf->num_offloads); + } + + break; + + case RPORT_EV_NONE: + break; + } +} + +static void qedf_abort_io(struct fc_lport *lport) +{ + /* NO-OP but need to fill in the template */ +} + +static void qedf_fcp_cleanup(struct fc_lport *lport) +{ + /* + * NO-OP but need to fill in template to prevent a NULL + * function pointer dereference during link down. I/Os + * will be flushed when port is uploaded. 
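+ * (qedf_cleanup_fcport() calls qedf_flush_active_ios() before the
+ * connection is uploaded.)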
+ */ +} + +static struct libfc_function_template qedf_lport_template = { + .frame_send = qedf_xmit, + .fcp_abort_io = qedf_abort_io, + .fcp_cleanup = qedf_fcp_cleanup, + .rport_event_callback = qedf_rport_event_handler, + .elsct_send = qedf_elsct_send, +}; + +static void qedf_fcoe_ctlr_setup(struct qedf_ctx *qedf) +{ + fcoe_ctlr_init(&qedf->ctlr, FIP_ST_AUTO); + + qedf->ctlr.send = qedf_fip_send; + qedf->ctlr.get_src_addr = qedf_get_src_mac; + ether_addr_copy(qedf->ctlr.ctl_src_addr, qedf->mac); +} + +static void qedf_setup_fdmi(struct qedf_ctx *qedf) +{ + struct fc_lport *lport = qedf->lport; + struct fc_host_attrs *fc_host = shost_to_fc_host(lport->host); + u8 buf[8]; + int i, pos; + + /* + * fdmi_enabled needs to be set for libfc to execute FDMI registration. + */ + lport->fdmi_enabled = 1; + + /* + * Setup the necessary fc_host attributes to that will be used to fill + * in the FDMI information. + */ + + /* Get the PCI-e Device Serial Number Capability */ + pos = pci_find_ext_capability(qedf->pdev, PCI_EXT_CAP_ID_DSN); + if (pos) { + pos += 4; + for (i = 0; i < 8; i++) + pci_read_config_byte(qedf->pdev, pos + i, &buf[i]); + + snprintf(fc_host->serial_number, + sizeof(fc_host->serial_number), + "%02X%02X%02X%02X%02X%02X%02X%02X", + buf[7], buf[6], buf[5], buf[4], + buf[3], buf[2], buf[1], buf[0]); + } else + snprintf(fc_host->serial_number, + sizeof(fc_host->serial_number), "Unknown"); + + snprintf(fc_host->manufacturer, + sizeof(fc_host->manufacturer), "%s", "Cavium Inc."); + + snprintf(fc_host->model, sizeof(fc_host->model), "%s", "QL41000"); + + snprintf(fc_host->model_description, sizeof(fc_host->model_description), + "%s", "QLogic FastLinQ QL41000 Series 10/25/40/50GGbE Controller" + "(FCoE)"); + + snprintf(fc_host->hardware_version, sizeof(fc_host->hardware_version), + "Rev %d", qedf->pdev->revision); + + snprintf(fc_host->driver_version, sizeof(fc_host->driver_version), + "%s", QEDF_VERSION); + + snprintf(fc_host->firmware_version, sizeof(fc_host->firmware_version), + "%d.%d.%d.%d", FW_MAJOR_VERSION, FW_MINOR_VERSION, + FW_REVISION_VERSION, FW_ENGINEERING_VERSION); +} + +static int qedf_lport_setup(struct qedf_ctx *qedf) +{ + struct fc_lport *lport = qedf->lport; + + lport->link_up = 0; + lport->max_retry_count = QEDF_FLOGI_RETRY_CNT; + lport->max_rport_retry_count = QEDF_RPORT_RETRY_CNT; + lport->service_params = (FCP_SPPF_INIT_FCN | FCP_SPPF_RD_XRDY_DIS | + FCP_SPPF_RETRY | FCP_SPPF_CONF_COMPL); + lport->boot_time = jiffies; + lport->e_d_tov = 2 * 1000; + lport->r_a_tov = 10 * 1000; + + /* Set NPIV support */ + lport->does_npiv = 1; + fc_host_max_npiv_vports(lport->host) = QEDF_MAX_NPIV; + + fc_set_wwnn(lport, qedf->wwnn); + fc_set_wwpn(lport, qedf->wwpn); + + fcoe_libfc_config(lport, &qedf->ctlr, &qedf_lport_template, 0); + + /* Allocate the exchange manager */ + fc_exch_mgr_alloc(lport, FC_CLASS_3, qedf->max_scsi_xid + 1, + qedf->max_els_xid, NULL); + + if (fc_lport_init_stats(lport)) + return -ENOMEM; + + /* Finish lport config */ + fc_lport_config(lport); + + /* Set max frame size */ + fc_set_mfs(lport, QEDF_MFS); + fc_host_maxframe_size(lport->host) = lport->mfs; + + /* Set default dev_loss_tmo based on module parameter */ + fc_host_dev_loss_tmo(lport->host) = qedf_dev_loss_tmo; + + /* Set symbolic node name */ + snprintf(fc_host_symbolic_name(lport->host), 256, + "QLogic %s v%s", QEDF_MODULE_NAME, QEDF_VERSION); + + qedf_setup_fdmi(qedf); + + return 0; +} + +/* + * NPIV functions + */ + +static int qedf_vport_libfc_config(struct fc_vport *vport, + struct fc_lport *lport) 
+{ + lport->link_up = 0; + lport->qfull = 0; + lport->max_retry_count = QEDF_FLOGI_RETRY_CNT; + lport->max_rport_retry_count = QEDF_RPORT_RETRY_CNT; + lport->service_params = (FCP_SPPF_INIT_FCN | FCP_SPPF_RD_XRDY_DIS | + FCP_SPPF_RETRY | FCP_SPPF_CONF_COMPL); + lport->boot_time = jiffies; + lport->e_d_tov = 2 * 1000; + lport->r_a_tov = 10 * 1000; + lport->does_npiv = 1; /* Temporary until we add NPIV support */ + + /* Allocate stats for vport */ + if (fc_lport_init_stats(lport)) + return -ENOMEM; + + /* Finish lport config */ + fc_lport_config(lport); + + /* offload related configuration */ + lport->crc_offload = 0; + lport->seq_offload = 0; + lport->lro_enabled = 0; + lport->lro_xid = 0; + lport->lso_max = 0; + + return 0; +} + +static int qedf_vport_create(struct fc_vport *vport, bool disabled) +{ + struct Scsi_Host *shost = vport_to_shost(vport); + struct fc_lport *n_port = shost_priv(shost); + struct fc_lport *vn_port; + struct qedf_ctx *base_qedf = lport_priv(n_port); + struct qedf_ctx *vport_qedf; + + char buf[32]; + int rc = 0; + + rc = fcoe_validate_vport_create(vport); + if (rc) { + fcoe_wwn_to_str(vport->port_name, buf, sizeof(buf)); + QEDF_WARN(&(base_qedf->dbg_ctx), "Failed to create vport, " + "WWPN (0x%s) already exists.\n", buf); + goto err1; + } + + if (atomic_read(&base_qedf->link_state) != QEDF_LINK_UP) { + QEDF_WARN(&(base_qedf->dbg_ctx), "Cannot create vport " + "because link is not up.\n"); + rc = -EIO; + goto err1; + } + + vn_port = libfc_vport_create(vport, sizeof(struct qedf_ctx)); + if (!vn_port) { + QEDF_WARN(&(base_qedf->dbg_ctx), "Could not create lport " + "for vport.\n"); + rc = -ENOMEM; + goto err1; + } + + fcoe_wwn_to_str(vport->port_name, buf, sizeof(buf)); + QEDF_ERR(&(base_qedf->dbg_ctx), "Creating NPIV port, WWPN=%s.\n", + buf); + + /* Copy some fields from base_qedf */ + vport_qedf = lport_priv(vn_port); + memcpy(vport_qedf, base_qedf, sizeof(struct qedf_ctx)); + + /* Set qedf data specific to this vport */ + vport_qedf->lport = vn_port; + /* Use same hba_lock as base_qedf */ + vport_qedf->hba_lock = base_qedf->hba_lock; + vport_qedf->pdev = base_qedf->pdev; + vport_qedf->cmd_mgr = base_qedf->cmd_mgr; + init_completion(&vport_qedf->flogi_compl); + INIT_LIST_HEAD(&vport_qedf->fcports); + + rc = qedf_vport_libfc_config(vport, vn_port); + if (rc) { + QEDF_ERR(&(base_qedf->dbg_ctx), "Could not allocate memory " + "for lport stats.\n"); + goto err2; + } + + fc_set_wwnn(vn_port, vport->node_name); + fc_set_wwpn(vn_port, vport->port_name); + vport_qedf->wwnn = vn_port->wwnn; + vport_qedf->wwpn = vn_port->wwpn; + + vn_port->host->transportt = qedf_fc_vport_transport_template; + vn_port->host->can_queue = QEDF_MAX_ELS_XID; + vn_port->host->max_lun = qedf_max_lun; + vn_port->host->sg_tablesize = QEDF_MAX_BDS_PER_CMD; + vn_port->host->max_cmd_len = QEDF_MAX_CDB_LEN; + + rc = scsi_add_host(vn_port->host, &vport->dev); + if (rc) { + QEDF_WARN(&(base_qedf->dbg_ctx), "Error adding Scsi_Host.\n"); + goto err2; + } + + /* Set default dev_loss_tmo based on module parameter */ + fc_host_dev_loss_tmo(vn_port->host) = qedf_dev_loss_tmo; + + /* Init libfc stuffs */ + memcpy(&vn_port->tt, &qedf_lport_template, + sizeof(qedf_lport_template)); + fc_exch_init(vn_port); + fc_elsct_init(vn_port); + fc_lport_init(vn_port); + fc_disc_init(vn_port); + fc_disc_config(vn_port, vn_port); + + + /* Allocate the exchange manager */ + shost = vport_to_shost(vport); + n_port = shost_priv(shost); + fc_exch_mgr_list_clone(n_port, vn_port); + + /* Set max frame size */ + fc_set_mfs(vn_port, 
QEDF_MFS); + + fc_host_port_type(vn_port->host) = FC_PORTTYPE_UNKNOWN; + + if (disabled) { + fc_vport_set_state(vport, FC_VPORT_DISABLED); + } else { + vn_port->boot_time = jiffies; + fc_fabric_login(vn_port); + fc_vport_setlink(vn_port); + } + + QEDF_INFO(&(base_qedf->dbg_ctx), QEDF_LOG_NPIV, "vn_port=%p.\n", + vn_port); + + /* Set up debug context for vport */ + vport_qedf->dbg_ctx.host_no = vn_port->host->host_no; + vport_qedf->dbg_ctx.pdev = base_qedf->pdev; + +err2: + scsi_host_put(vn_port->host); +err1: + return rc; +} + +static int qedf_vport_destroy(struct fc_vport *vport) +{ + struct Scsi_Host *shost = vport_to_shost(vport); + struct fc_lport *n_port = shost_priv(shost); + struct fc_lport *vn_port = vport->dd_data; + + mutex_lock(&n_port->lp_mutex); + list_del(&vn_port->list); + mutex_unlock(&n_port->lp_mutex); + + fc_fabric_logoff(vn_port); + fc_lport_destroy(vn_port); + + /* Detach from scsi-ml */ + fc_remove_host(vn_port->host); + scsi_remove_host(vn_port->host); + + /* + * Only try to release the exchange manager if the vn_port + * configuration is complete. + */ + if (vn_port->state == LPORT_ST_READY) + fc_exch_mgr_free(vn_port); + + /* Free memory used by statistical counters */ + fc_lport_free_stats(vn_port); + + /* Release Scsi_Host */ + if (vn_port->host) + scsi_host_put(vn_port->host); + + return 0; +} + +static int qedf_vport_disable(struct fc_vport *vport, bool disable) +{ + struct fc_lport *lport = vport->dd_data; + + if (disable) { + fc_vport_set_state(vport, FC_VPORT_DISABLED); + fc_fabric_logoff(lport); + } else { + lport->boot_time = jiffies; + fc_fabric_login(lport); + fc_vport_setlink(lport); + } + return 0; +} + +/* + * During removal we need to wait for all the vports associated with a port + * to be destroyed so we avoid a race condition where libfc is still trying + * to reap vports while the driver remove function has already reaped the + * driver contexts associated with the physical port. + */ +static void qedf_wait_for_vport_destroy(struct qedf_ctx *qedf) +{ + struct fc_host_attrs *fc_host = shost_to_fc_host(qedf->lport->host); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_NPIV, + "Entered.\n"); + while (fc_host->npiv_vports_inuse > 0) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_NPIV, + "Waiting for all vports to be reaped.\n"); + msleep(1000); + } +} + +/** + * qedf_fcoe_reset - Resets the fcoe + * + * @shost: shost the reset is from + * + * Returns: always 0 + */ +static int qedf_fcoe_reset(struct Scsi_Host *shost) +{ + struct fc_lport *lport = shost_priv(shost); + + qedf_ctx_soft_reset(lport); + return 0; +} + +static struct fc_host_statistics *qedf_fc_get_host_stats(struct Scsi_Host + *shost) +{ + struct fc_host_statistics *qedf_stats; + struct fc_lport *lport = shost_priv(shost); + struct qedf_ctx *qedf = lport_priv(lport); + struct qed_fcoe_stats *fw_fcoe_stats; + + qedf_stats = fc_get_host_stats(shost); + + /* We don't collect offload stats for specific NPIV ports */ + if (lport->vport) + goto out; + + fw_fcoe_stats = kmalloc(sizeof(struct qed_fcoe_stats), GFP_KERNEL); + if (!fw_fcoe_stats) { + QEDF_ERR(&(qedf->dbg_ctx), "Could not allocate memory for " + "fw_fcoe_stats.\n"); + goto out; + } + + /* Query firmware for offload stats */ + qed_ops->get_stats(qedf->cdev, fw_fcoe_stats); + + /* + * The expectation is that we add our offload stats to the stats + * being maintained by libfc each time the fc_get_host_status callback + * is invoked. The additions are not carried over for each call to + * the fc_get_host_stats callback. 
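+ * In other words, the firmware values below have to be re-added on every
+ * invocation of this callback.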
+ */ + qedf_stats->tx_frames += fw_fcoe_stats->fcoe_tx_data_pkt_cnt + + fw_fcoe_stats->fcoe_tx_xfer_pkt_cnt + + fw_fcoe_stats->fcoe_tx_other_pkt_cnt; + qedf_stats->rx_frames += fw_fcoe_stats->fcoe_rx_data_pkt_cnt + + fw_fcoe_stats->fcoe_rx_xfer_pkt_cnt + + fw_fcoe_stats->fcoe_rx_other_pkt_cnt; + qedf_stats->fcp_input_megabytes += + do_div(fw_fcoe_stats->fcoe_rx_byte_cnt, 1000000); + qedf_stats->fcp_output_megabytes += + do_div(fw_fcoe_stats->fcoe_tx_byte_cnt, 1000000); + qedf_stats->rx_words += fw_fcoe_stats->fcoe_rx_byte_cnt / 4; + qedf_stats->tx_words += fw_fcoe_stats->fcoe_tx_byte_cnt / 4; + qedf_stats->invalid_crc_count += + fw_fcoe_stats->fcoe_silent_drop_pkt_crc_error_cnt; + qedf_stats->dumped_frames = + fw_fcoe_stats->fcoe_silent_drop_total_pkt_cnt; + qedf_stats->error_frames += + fw_fcoe_stats->fcoe_silent_drop_total_pkt_cnt; + qedf_stats->fcp_input_requests += qedf->input_requests; + qedf_stats->fcp_output_requests += qedf->output_requests; + qedf_stats->fcp_control_requests += qedf->control_requests; + qedf_stats->fcp_packet_aborts += qedf->packet_aborts; + qedf_stats->fcp_frame_alloc_failures += qedf->alloc_failures; + + kfree(fw_fcoe_stats); +out: + return qedf_stats; +} + +static struct fc_function_template qedf_fc_transport_fn = { + .show_host_node_name = 1, + .show_host_port_name = 1, + .show_host_supported_classes = 1, + .show_host_supported_fc4s = 1, + .show_host_active_fc4s = 1, + .show_host_maxframe_size = 1, + + .show_host_port_id = 1, + .show_host_supported_speeds = 1, + .get_host_speed = fc_get_host_speed, + .show_host_speed = 1, + .show_host_port_type = 1, + .get_host_port_state = fc_get_host_port_state, + .show_host_port_state = 1, + .show_host_symbolic_name = 1, + + /* + * Tell FC transport to allocate enough space to store the backpointer + * for the associate qedf_rport struct. 
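+ * The qedf_rport is laid out immediately after fc_rport_libfc_priv in
+ * rport->dd_data, which is why the driver recovers it with &rp[1].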
+ */ + .dd_fcrport_size = (sizeof(struct fc_rport_libfc_priv) + + sizeof(struct qedf_rport)), + .show_rport_maxframe_size = 1, + .show_rport_supported_classes = 1, + .show_host_fabric_name = 1, + .show_starget_node_name = 1, + .show_starget_port_name = 1, + .show_starget_port_id = 1, + .set_rport_dev_loss_tmo = fc_set_rport_loss_tmo, + .show_rport_dev_loss_tmo = 1, + .get_fc_host_stats = qedf_fc_get_host_stats, + .issue_fc_host_lip = qedf_fcoe_reset, + .vport_create = qedf_vport_create, + .vport_delete = qedf_vport_destroy, + .vport_disable = qedf_vport_disable, + .bsg_request = fc_lport_bsg_request, +}; + +static struct fc_function_template qedf_fc_vport_transport_fn = { + .show_host_node_name = 1, + .show_host_port_name = 1, + .show_host_supported_classes = 1, + .show_host_supported_fc4s = 1, + .show_host_active_fc4s = 1, + .show_host_maxframe_size = 1, + .show_host_port_id = 1, + .show_host_supported_speeds = 1, + .get_host_speed = fc_get_host_speed, + .show_host_speed = 1, + .show_host_port_type = 1, + .get_host_port_state = fc_get_host_port_state, + .show_host_port_state = 1, + .show_host_symbolic_name = 1, + .dd_fcrport_size = (sizeof(struct fc_rport_libfc_priv) + + sizeof(struct qedf_rport)), + .show_rport_maxframe_size = 1, + .show_rport_supported_classes = 1, + .show_host_fabric_name = 1, + .show_starget_node_name = 1, + .show_starget_port_name = 1, + .show_starget_port_id = 1, + .set_rport_dev_loss_tmo = fc_set_rport_loss_tmo, + .show_rport_dev_loss_tmo = 1, + .get_fc_host_stats = fc_get_host_stats, + .issue_fc_host_lip = qedf_fcoe_reset, + .bsg_request = fc_lport_bsg_request, +}; + +static bool qedf_fp_has_work(struct qedf_fastpath *fp) +{ + struct qedf_ctx *qedf = fp->qedf; + struct global_queue *que; + struct qed_sb_info *sb_info = fp->sb_info; + struct status_block_e4 *sb = sb_info->sb_virt; + u16 prod_idx; + + /* Get the pointer to the global CQ this completion is on */ + que = qedf->global_queues[fp->sb_id]; + + /* Be sure all responses have been written to PI */ + rmb(); + + /* Get the current firmware producer index */ + prod_idx = sb->pi_array[QEDF_FCOE_PARAMS_GL_RQ_PI]; + + return (que->cq_prod_idx != prod_idx); +} + +/* + * Interrupt handler code. + */ + +/* Process completion queue and copy CQE contents for deferred processesing + * + * Return true if we should wake the I/O thread, false if not. + */ +static bool qedf_process_completions(struct qedf_fastpath *fp) +{ + struct qedf_ctx *qedf = fp->qedf; + struct qed_sb_info *sb_info = fp->sb_info; + struct status_block_e4 *sb = sb_info->sb_virt; + struct global_queue *que; + u16 prod_idx; + struct fcoe_cqe *cqe; + struct qedf_io_work *io_work; + int num_handled = 0; + unsigned int cpu; + struct qedf_ioreq *io_req = NULL; + u16 xid; + u16 new_cqes; + u32 comp_type; + + /* Get the current firmware producer index */ + prod_idx = sb->pi_array[QEDF_FCOE_PARAMS_GL_RQ_PI]; + + /* Get the pointer to the global CQ this completion is on */ + que = qedf->global_queues[fp->sb_id]; + + /* Calculate the amount of new elements since last processing */ + new_cqes = (prod_idx >= que->cq_prod_idx) ? 
+ (prod_idx - que->cq_prod_idx) : + 0x10000 - que->cq_prod_idx + prod_idx; + + /* Save producer index */ + que->cq_prod_idx = prod_idx; + + while (new_cqes) { + fp->completions++; + num_handled++; + cqe = &que->cq[que->cq_cons_idx]; + + comp_type = (cqe->cqe_data >> FCOE_CQE_CQE_TYPE_SHIFT) & + FCOE_CQE_CQE_TYPE_MASK; + + /* + * Process unsolicited CQEs directly in the interrupt handler + * sine we need the fastpath ID + */ + if (comp_type == FCOE_UNSOLIC_CQE_TYPE) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_UNSOL, + "Unsolicated CQE.\n"); + qedf_process_unsol_compl(qedf, fp->sb_id, cqe); + /* + * Don't add a work list item. Increment consumer + * consumer index and move on. + */ + goto inc_idx; + } + + xid = cqe->cqe_data & FCOE_CQE_TASK_ID_MASK; + io_req = &qedf->cmd_mgr->cmds[xid]; + + /* + * Figure out which percpu thread we should queue this I/O + * on. + */ + if (!io_req) + /* If there is not io_req assocated with this CQE + * just queue it on CPU 0 + */ + cpu = 0; + else { + cpu = io_req->cpu; + io_req->int_cpu = smp_processor_id(); + } + + io_work = mempool_alloc(qedf->io_mempool, GFP_ATOMIC); + if (!io_work) { + QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate " + "work for I/O completion.\n"); + continue; + } + memset(io_work, 0, sizeof(struct qedf_io_work)); + + INIT_WORK(&io_work->work, qedf_fp_io_handler); + + /* Copy contents of CQE for deferred processing */ + memcpy(&io_work->cqe, cqe, sizeof(struct fcoe_cqe)); + + io_work->qedf = fp->qedf; + io_work->fp = NULL; /* Only used for unsolicited frames */ + + queue_work_on(cpu, qedf_io_wq, &io_work->work); + +inc_idx: + que->cq_cons_idx++; + if (que->cq_cons_idx == fp->cq_num_entries) + que->cq_cons_idx = 0; + new_cqes--; + } + + return true; +} + + +/* MSI-X fastpath handler code */ +static irqreturn_t qedf_msix_handler(int irq, void *dev_id) +{ + struct qedf_fastpath *fp = dev_id; + + if (!fp) { + QEDF_ERR(NULL, "fp is null.\n"); + return IRQ_HANDLED; + } + if (!fp->sb_info) { + QEDF_ERR(NULL, "fp->sb_info in null."); + return IRQ_HANDLED; + } + + /* + * Disable interrupts for this status block while we process new + * completions + */ + qed_sb_ack(fp->sb_info, IGU_INT_DISABLE, 0 /*do not update*/); + + while (1) { + qedf_process_completions(fp); + + if (qedf_fp_has_work(fp) == 0) { + /* Update the sb information */ + qed_sb_update_sb_idx(fp->sb_info); + + /* Check for more work */ + rmb(); + + if (qedf_fp_has_work(fp) == 0) { + /* Re-enable interrupts */ + qed_sb_ack(fp->sb_info, IGU_INT_ENABLE, 1); + return IRQ_HANDLED; + } + } + } + + /* Do we ever want to break out of above loop? 
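/*
 * Illustrative sketch, not part of this patch.  The new_cqes computation
 * above accounts for the 16-bit firmware producer index wrapping at 0x10000.
 * Assuming both indices are free-running 16-bit counters and the queue never
 * holds more than 65535 entries, plain unsigned subtraction gives the same
 * answer, which is why the explicit wrap case works out.
 */
#include <assert.h>
#include <stdint.h>

static uint16_t demo_new_entries(uint16_t prod, uint16_t cons)
{
        /* Unsigned 16-bit wrap-around does the 0x10000 adjustment for us. */
        return (uint16_t)(prod - cons);
}

int main(void)
{
        assert(demo_new_entries(10, 5) == 5);       /* no wrap */
        assert(demo_new_entries(3, 0xfffe) == 5);   /* producer wrapped */
        return 0;
}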
*/ + return IRQ_HANDLED; +} + +/* simd handler for MSI/INTa */ +static void qedf_simd_int_handler(void *cookie) +{ + /* Cookie is qedf_ctx struct */ + struct qedf_ctx *qedf = (struct qedf_ctx *)cookie; + + QEDF_WARN(&(qedf->dbg_ctx), "qedf=%p.\n", qedf); +} + +#define QEDF_SIMD_HANDLER_NUM 0 +static void qedf_sync_free_irqs(struct qedf_ctx *qedf) +{ + int i; + + if (qedf->int_info.msix_cnt) { + for (i = 0; i < qedf->int_info.used_cnt; i++) { + synchronize_irq(qedf->int_info.msix[i].vector); + irq_set_affinity_hint(qedf->int_info.msix[i].vector, + NULL); + irq_set_affinity_notifier(qedf->int_info.msix[i].vector, + NULL); + free_irq(qedf->int_info.msix[i].vector, + &qedf->fp_array[i]); + } + } else + qed_ops->common->simd_handler_clean(qedf->cdev, + QEDF_SIMD_HANDLER_NUM); + + qedf->int_info.used_cnt = 0; + qed_ops->common->set_fp_int(qedf->cdev, 0); +} + +static int qedf_request_msix_irq(struct qedf_ctx *qedf) +{ + int i, rc, cpu; + + cpu = cpumask_first(cpu_online_mask); + for (i = 0; i < qedf->num_queues; i++) { + rc = request_irq(qedf->int_info.msix[i].vector, + qedf_msix_handler, 0, "qedf", &qedf->fp_array[i]); + + if (rc) { + QEDF_WARN(&(qedf->dbg_ctx), "request_irq failed.\n"); + qedf_sync_free_irqs(qedf); + return rc; + } + + qedf->int_info.used_cnt++; + rc = irq_set_affinity_hint(qedf->int_info.msix[i].vector, + get_cpu_mask(cpu)); + cpu = cpumask_next(cpu, cpu_online_mask); + } + + return 0; +} + +static int qedf_setup_int(struct qedf_ctx *qedf) +{ + int rc = 0; + + /* + * Learn interrupt configuration + */ + rc = qed_ops->common->set_fp_int(qedf->cdev, num_online_cpus()); + if (rc <= 0) + return 0; + + rc = qed_ops->common->get_fp_int(qedf->cdev, &qedf->int_info); + if (rc) + return 0; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "Number of msix_cnt = " + "0x%x num of cpus = 0x%x\n", qedf->int_info.msix_cnt, + num_online_cpus()); + + if (qedf->int_info.msix_cnt) + return qedf_request_msix_irq(qedf); + + qed_ops->common->simd_handler_config(qedf->cdev, &qedf, + QEDF_SIMD_HANDLER_NUM, qedf_simd_int_handler); + qedf->int_info.used_cnt = 1; + + return 0; +} + +/* Main function for libfc frame reception */ +static void qedf_recv_frame(struct qedf_ctx *qedf, + struct sk_buff *skb) +{ + u32 fr_len; + struct fc_lport *lport; + struct fc_frame_header *fh; + struct fcoe_crc_eof crc_eof; + struct fc_frame *fp; + u8 *mac = NULL; + u8 *dest_mac = NULL; + struct fcoe_hdr *hp; + struct qedf_rport *fcport; + struct fc_lport *vn_port; + u32 f_ctl; + + lport = qedf->lport; + if (lport == NULL || lport->state == LPORT_ST_DISABLED) { + QEDF_WARN(NULL, "Invalid lport struct or lport disabled.\n"); + kfree_skb(skb); + return; + } + + if (skb_is_nonlinear(skb)) + skb_linearize(skb); + mac = eth_hdr(skb)->h_source; + dest_mac = eth_hdr(skb)->h_dest; + + /* Pull the header */ + hp = (struct fcoe_hdr *)skb->data; + fh = (struct fc_frame_header *) skb_transport_header(skb); + skb_pull(skb, sizeof(struct fcoe_hdr)); + fr_len = skb->len - sizeof(struct fcoe_crc_eof); + + fp = (struct fc_frame *)skb; + fc_frame_init(fp); + fr_dev(fp) = lport; + fr_sof(fp) = hp->fcoe_sof; + if (skb_copy_bits(skb, fr_len, &crc_eof, sizeof(crc_eof))) { + kfree_skb(skb); + return; + } + fr_eof(fp) = crc_eof.fcoe_eof; + fr_crc(fp) = crc_eof.fcoe_crc32; + if (pskb_trim(skb, fr_len)) { + kfree_skb(skb); + return; + } + + fh = fc_frame_header_get(fp); + + /* + * Invalid frame filters. + */ + + if (fh->fh_r_ctl == FC_RCTL_DD_SOL_DATA && + fh->fh_type == FC_TYPE_FCP) { + /* Drop FCP data. 
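/*
 * Illustrative sketch, not part of this patch.  The d_id filter a little
 * further below compares the FC_ID carried in the low three bytes of the
 * destination MAC (FCoE FPMA addressing: FC-MAP prefix plus FC_ID) against
 * the 24-bit destination ID in the FC header.  Both are big-endian on the
 * wire, which is all an ntoh24()-style helper has to undo.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t demo_ntoh24(const uint8_t *p)
{
        return ((uint32_t)p[0] << 16) | ((uint32_t)p[1] << 8) | p[2];
}

int main(void)
{
        /* FPMA MAC 0e:fc:00:01:02:03 -> FC_ID 0x010203 (default FC-MAP). */
        const uint8_t dest_mac[6] = { 0x0e, 0xfc, 0x00, 0x01, 0x02, 0x03 };
        const uint8_t fh_d_id[3]  = { 0x01, 0x02, 0x03 };

        assert(demo_ntoh24(&dest_mac[3]) == 0x010203);
        assert(demo_ntoh24(&dest_mac[3]) == demo_ntoh24(fh_d_id));
        return 0;
}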
We dont this in L2 path */ + kfree_skb(skb); + return; + } + if (fh->fh_r_ctl == FC_RCTL_ELS_REQ && + fh->fh_type == FC_TYPE_ELS) { + switch (fc_frame_payload_op(fp)) { + case ELS_LOGO: + if (ntoh24(fh->fh_s_id) == FC_FID_FLOGI) { + /* drop non-FIP LOGO */ + kfree_skb(skb); + return; + } + break; + } + } + + if (fh->fh_r_ctl == FC_RCTL_BA_ABTS) { + /* Drop incoming ABTS */ + kfree_skb(skb); + return; + } + + if (ntoh24(&dest_mac[3]) != ntoh24(fh->fh_d_id)) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, + "FC frame d_id mismatch with MAC %pM.\n", dest_mac); + return; + } + + if (qedf->ctlr.state) { + if (!ether_addr_equal(mac, qedf->ctlr.dest_addr)) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, + "Wrong source address: mac:%pM dest_addr:%pM.\n", + mac, qedf->ctlr.dest_addr); + kfree_skb(skb); + return; + } + } + + vn_port = fc_vport_id_lookup(lport, ntoh24(fh->fh_d_id)); + + /* + * If the destination ID from the frame header does not match what we + * have on record for lport and the search for a NPIV port came up + * empty then this is not addressed to our port so simply drop it. + */ + if (lport->port_id != ntoh24(fh->fh_d_id) && !vn_port) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, + "Dropping frame due to destination mismatch: lport->port_id=%x fh->d_id=%x.\n", + lport->port_id, ntoh24(fh->fh_d_id)); + kfree_skb(skb); + return; + } + + f_ctl = ntoh24(fh->fh_f_ctl); + if ((fh->fh_type == FC_TYPE_BLS) && (f_ctl & FC_FC_SEQ_CTX) && + (f_ctl & FC_FC_EX_CTX)) { + /* Drop incoming ABTS response that has both SEQ/EX CTX set */ + kfree_skb(skb); + return; + } + + /* + * If a connection is uploading, drop incoming FCoE frames as there + * is a small window where we could try to return a frame while libfc + * is trying to clean things up. + */ + + /* Get fcport associated with d_id if it exists */ + fcport = qedf_fcport_lookup(qedf, ntoh24(fh->fh_d_id)); + + if (fcport && test_bit(QEDF_RPORT_UPLOADING_CONNECTION, + &fcport->flags)) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, + "Connection uploading, dropping fp=%p.\n", fp); + kfree_skb(skb); + return; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, "FCoE frame receive: " + "skb=%p fp=%p src=%06x dest=%06x r_ctl=%x fh_type=%x.\n", skb, fp, + ntoh24(fh->fh_s_id), ntoh24(fh->fh_d_id), fh->fh_r_ctl, + fh->fh_type); + if (qedf_dump_frames) + print_hex_dump(KERN_WARNING, "fcoe: ", DUMP_PREFIX_OFFSET, 16, + 1, skb->data, skb->len, false); + fc_exch_recv(lport, fp); +} + +static void qedf_ll2_process_skb(struct work_struct *work) +{ + struct qedf_skb_work *skb_work = + container_of(work, struct qedf_skb_work, work); + struct qedf_ctx *qedf = skb_work->qedf; + struct sk_buff *skb = skb_work->skb; + struct ethhdr *eh; + + if (!qedf) { + QEDF_ERR(NULL, "qedf is NULL\n"); + goto err_out; + } + + eh = (struct ethhdr *)skb->data; + + /* Undo VLAN encapsulation */ + if (eh->h_proto == htons(ETH_P_8021Q)) { + memmove((u8 *)eh + VLAN_HLEN, eh, ETH_ALEN * 2); + eh = skb_pull(skb, VLAN_HLEN); + skb_reset_mac_header(skb); + } + + /* + * Process either a FIP frame or FCoE frame based on the + * protocol value. If it's not either just drop the + * frame. 
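/*
 * Illustrative sketch, not part of this patch.  The VLAN handling in
 * qedf_ll2_process_skb() above shifts the destination and source MACs
 * forward by the 4-byte 802.1Q tag, so that after pulling VLAN_HLEN bytes a
 * plain Ethernet header sits in front of the encapsulated ethertype.  A
 * standalone version of the same in-place untagging:
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define DEMO_ETH_ALEN   6
#define DEMO_VLAN_HLEN  4

/* Untag an 802.1Q frame in place; returns the new start of the frame. */
static uint8_t *demo_vlan_untag(uint8_t *frame)
{
        memmove(frame + DEMO_VLAN_HLEN, frame, DEMO_ETH_ALEN * 2);
        return frame + DEMO_VLAN_HLEN;
}

int main(void)
{
        /* dst, src, 0x8100 tag (VLAN 100), inner ethertype 0x8906 (FCoE). */
        uint8_t frame[] = {
                1, 2, 3, 4, 5, 6,    7, 8, 9, 10, 11, 12,
                0x81, 0x00, 0x00, 0x64,    0x89, 0x06,
        };
        uint8_t *eth = demo_vlan_untag(frame);

        assert(eth[0] == 1 && eth[5] == 6);             /* dst preserved */
        assert(eth[6] == 7 && eth[11] == 12);           /* src preserved */
        assert(eth[12] == 0x89 && eth[13] == 0x06);     /* FCoE ethertype */
        return 0;
}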
+ */ + if (eh->h_proto == htons(ETH_P_FIP)) { + qedf_fip_recv(qedf, skb); + goto out; + } else if (eh->h_proto == htons(ETH_P_FCOE)) { + __skb_pull(skb, ETH_HLEN); + qedf_recv_frame(qedf, skb); + goto out; + } else + goto err_out; + +err_out: + kfree_skb(skb); +out: + kfree(skb_work); + return; +} + +static int qedf_ll2_rx(void *cookie, struct sk_buff *skb, + u32 arg1, u32 arg2) +{ + struct qedf_ctx *qedf = (struct qedf_ctx *)cookie; + struct qedf_skb_work *skb_work; + + skb_work = kzalloc(sizeof(struct qedf_skb_work), GFP_ATOMIC); + if (!skb_work) { + QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate skb_work so " + "dropping frame.\n"); + kfree_skb(skb); + return 0; + } + + INIT_WORK(&skb_work->work, qedf_ll2_process_skb); + skb_work->skb = skb; + skb_work->qedf = qedf; + queue_work(qedf->ll2_recv_wq, &skb_work->work); + + return 0; +} + +static struct qed_ll2_cb_ops qedf_ll2_cb_ops = { + .rx_cb = qedf_ll2_rx, + .tx_cb = NULL, +}; + +/* Main thread to process I/O completions */ +void qedf_fp_io_handler(struct work_struct *work) +{ + struct qedf_io_work *io_work = + container_of(work, struct qedf_io_work, work); + u32 comp_type; + + /* + * Deferred part of unsolicited CQE sends + * frame to libfc. + */ + comp_type = (io_work->cqe.cqe_data >> + FCOE_CQE_CQE_TYPE_SHIFT) & + FCOE_CQE_CQE_TYPE_MASK; + if (comp_type == FCOE_UNSOLIC_CQE_TYPE && + io_work->fp) + fc_exch_recv(io_work->qedf->lport, io_work->fp); + else + qedf_process_cqe(io_work->qedf, &io_work->cqe); + + kfree(io_work); +} + +static int qedf_alloc_and_init_sb(struct qedf_ctx *qedf, + struct qed_sb_info *sb_info, u16 sb_id) +{ + struct status_block_e4 *sb_virt; + dma_addr_t sb_phys; + int ret; + + sb_virt = dma_alloc_coherent(&qedf->pdev->dev, + sizeof(struct status_block_e4), &sb_phys, GFP_KERNEL); + + if (!sb_virt) { + QEDF_ERR(&(qedf->dbg_ctx), "Status block allocation failed " + "for id = %d.\n", sb_id); + return -ENOMEM; + } + + ret = qed_ops->common->sb_init(qedf->cdev, sb_info, sb_virt, sb_phys, + sb_id, QED_SB_TYPE_STORAGE); + + if (ret) { + QEDF_ERR(&(qedf->dbg_ctx), "Status block initialization " + "failed for id = %d.\n", sb_id); + return ret; + } + + return 0; +} + +static void qedf_free_sb(struct qedf_ctx *qedf, struct qed_sb_info *sb_info) +{ + if (sb_info->sb_virt) + dma_free_coherent(&qedf->pdev->dev, sizeof(*sb_info->sb_virt), + (void *)sb_info->sb_virt, sb_info->sb_phys); +} + +static void qedf_destroy_sb(struct qedf_ctx *qedf) +{ + int id; + struct qedf_fastpath *fp = NULL; + + for (id = 0; id < qedf->num_queues; id++) { + fp = &(qedf->fp_array[id]); + if (fp->sb_id == QEDF_SB_ID_NULL) + break; + qedf_free_sb(qedf, fp->sb_info); + kfree(fp->sb_info); + } + kfree(qedf->fp_array); +} + +static int qedf_prepare_sb(struct qedf_ctx *qedf) +{ + int id; + struct qedf_fastpath *fp; + int ret; + + qedf->fp_array = + kcalloc(qedf->num_queues, sizeof(struct qedf_fastpath), + GFP_KERNEL); + + if (!qedf->fp_array) { + QEDF_ERR(&(qedf->dbg_ctx), "fastpath array allocation " + "failed.\n"); + return -ENOMEM; + } + + for (id = 0; id < qedf->num_queues; id++) { + fp = &(qedf->fp_array[id]); + fp->sb_id = QEDF_SB_ID_NULL; + fp->sb_info = kcalloc(1, sizeof(*fp->sb_info), GFP_KERNEL); + if (!fp->sb_info) { + QEDF_ERR(&(qedf->dbg_ctx), "SB info struct " + "allocation failed.\n"); + goto err; + } + ret = qedf_alloc_and_init_sb(qedf, fp->sb_info, id); + if (ret) { + QEDF_ERR(&(qedf->dbg_ctx), "SB allocation and " + "initialization failed.\n"); + goto err; + } + fp->sb_id = id; + fp->qedf = qedf; + fp->cq_num_entries = + 
qedf->global_queues[id]->cq_mem_size / + sizeof(struct fcoe_cqe); + } +err: + return 0; +} + +void qedf_process_cqe(struct qedf_ctx *qedf, struct fcoe_cqe *cqe) +{ + u16 xid; + struct qedf_ioreq *io_req; + struct qedf_rport *fcport; + u32 comp_type; + + comp_type = (cqe->cqe_data >> FCOE_CQE_CQE_TYPE_SHIFT) & + FCOE_CQE_CQE_TYPE_MASK; + + xid = cqe->cqe_data & FCOE_CQE_TASK_ID_MASK; + io_req = &qedf->cmd_mgr->cmds[xid]; + + /* Completion not for a valid I/O anymore so just return */ + if (!io_req) + return; + + fcport = io_req->fcport; + + if (fcport == NULL) { + QEDF_ERR(&(qedf->dbg_ctx), "fcport is NULL.\n"); + return; + } + + /* + * Check that fcport is offloaded. If it isn't then the spinlock + * isn't valid and shouldn't be taken. We should just return. + */ + if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) { + QEDF_ERR(&(qedf->dbg_ctx), "Session not offloaded yet.\n"); + return; + } + + + switch (comp_type) { + case FCOE_GOOD_COMPLETION_CQE_TYPE: + atomic_inc(&fcport->free_sqes); + switch (io_req->cmd_type) { + case QEDF_SCSI_CMD: + qedf_scsi_completion(qedf, cqe, io_req); + break; + case QEDF_ELS: + qedf_process_els_compl(qedf, cqe, io_req); + break; + case QEDF_TASK_MGMT_CMD: + qedf_process_tmf_compl(qedf, cqe, io_req); + break; + case QEDF_SEQ_CLEANUP: + qedf_process_seq_cleanup_compl(qedf, cqe, io_req); + break; + } + break; + case FCOE_ERROR_DETECTION_CQE_TYPE: + atomic_inc(&fcport->free_sqes); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Error detect CQE.\n"); + qedf_process_error_detect(qedf, cqe, io_req); + break; + case FCOE_EXCH_CLEANUP_CQE_TYPE: + atomic_inc(&fcport->free_sqes); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Cleanup CQE.\n"); + qedf_process_cleanup_compl(qedf, cqe, io_req); + break; + case FCOE_ABTS_CQE_TYPE: + atomic_inc(&fcport->free_sqes); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Abort CQE.\n"); + qedf_process_abts_compl(qedf, cqe, io_req); + break; + case FCOE_DUMMY_CQE_TYPE: + atomic_inc(&fcport->free_sqes); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Dummy CQE.\n"); + break; + case FCOE_LOCAL_COMP_CQE_TYPE: + atomic_inc(&fcport->free_sqes); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Local completion CQE.\n"); + break; + case FCOE_WARNING_CQE_TYPE: + atomic_inc(&fcport->free_sqes); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Warning CQE.\n"); + qedf_process_warning_compl(qedf, cqe, io_req); + break; + case MAX_FCOE_CQE_TYPE: + atomic_inc(&fcport->free_sqes); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Max FCoE CQE.\n"); + break; + default: + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, + "Default CQE.\n"); + break; + } +} + +static void qedf_free_bdq(struct qedf_ctx *qedf) +{ + int i; + + if (qedf->bdq_pbl_list) + dma_free_coherent(&qedf->pdev->dev, QEDF_PAGE_SIZE, + qedf->bdq_pbl_list, qedf->bdq_pbl_list_dma); + + if (qedf->bdq_pbl) + dma_free_coherent(&qedf->pdev->dev, qedf->bdq_pbl_mem_size, + qedf->bdq_pbl, qedf->bdq_pbl_dma); + + for (i = 0; i < QEDF_BDQ_SIZE; i++) { + if (qedf->bdq[i].buf_addr) { + dma_free_coherent(&qedf->pdev->dev, QEDF_BDQ_BUF_SIZE, + qedf->bdq[i].buf_addr, qedf->bdq[i].buf_dma); + } + } +} + +static void qedf_free_global_queues(struct qedf_ctx *qedf) +{ + int i; + struct global_queue **gl = qedf->global_queues; + + for (i = 0; i < qedf->num_queues; i++) { + if (!gl[i]) + continue; + + if (gl[i]->cq) + dma_free_coherent(&qedf->pdev->dev, + gl[i]->cq_mem_size, gl[i]->cq, gl[i]->cq_dma); + if (gl[i]->cq_pbl) + dma_free_coherent(&qedf->pdev->dev, gl[i]->cq_pbl_size, + gl[i]->cq_pbl, gl[i]->cq_pbl_dma); + + 
kfree(gl[i]); + } + + qedf_free_bdq(qedf); +} + +static int qedf_alloc_bdq(struct qedf_ctx *qedf) +{ + int i; + struct scsi_bd *pbl; + u64 *list; + dma_addr_t page; + + /* Alloc dma memory for BDQ buffers */ + for (i = 0; i < QEDF_BDQ_SIZE; i++) { + qedf->bdq[i].buf_addr = dma_alloc_coherent(&qedf->pdev->dev, + QEDF_BDQ_BUF_SIZE, &qedf->bdq[i].buf_dma, GFP_KERNEL); + if (!qedf->bdq[i].buf_addr) { + QEDF_ERR(&(qedf->dbg_ctx), "Could not allocate BDQ " + "buffer %d.\n", i); + return -ENOMEM; + } + } + + /* Alloc dma memory for BDQ page buffer list */ + qedf->bdq_pbl_mem_size = + QEDF_BDQ_SIZE * sizeof(struct scsi_bd); + qedf->bdq_pbl_mem_size = + ALIGN(qedf->bdq_pbl_mem_size, QEDF_PAGE_SIZE); + + qedf->bdq_pbl = dma_alloc_coherent(&qedf->pdev->dev, + qedf->bdq_pbl_mem_size, &qedf->bdq_pbl_dma, GFP_KERNEL); + if (!qedf->bdq_pbl) { + QEDF_ERR(&(qedf->dbg_ctx), "Could not allocate BDQ PBL.\n"); + return -ENOMEM; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "BDQ PBL addr=0x%p dma=%pad\n", + qedf->bdq_pbl, &qedf->bdq_pbl_dma); + + /* + * Populate BDQ PBL with physical and virtual address of individual + * BDQ buffers + */ + pbl = (struct scsi_bd *)qedf->bdq_pbl; + for (i = 0; i < QEDF_BDQ_SIZE; i++) { + pbl->address.hi = cpu_to_le32(U64_HI(qedf->bdq[i].buf_dma)); + pbl->address.lo = cpu_to_le32(U64_LO(qedf->bdq[i].buf_dma)); + pbl->opaque.fcoe_opaque.hi = 0; + /* Opaque lo data is an index into the BDQ array */ + pbl->opaque.fcoe_opaque.lo = cpu_to_le32(i); + pbl++; + } + + /* Allocate list of PBL pages */ + qedf->bdq_pbl_list = dma_zalloc_coherent(&qedf->pdev->dev, + QEDF_PAGE_SIZE, &qedf->bdq_pbl_list_dma, GFP_KERNEL); + if (!qedf->bdq_pbl_list) { + QEDF_ERR(&(qedf->dbg_ctx), "Could not allocate list of PBL pages.\n"); + return -ENOMEM; + } + + /* + * Now populate PBL list with pages that contain pointers to the + * individual buffers. + */ + qedf->bdq_pbl_list_num_entries = qedf->bdq_pbl_mem_size / + QEDF_PAGE_SIZE; + list = (u64 *)qedf->bdq_pbl_list; + page = qedf->bdq_pbl_list_dma; + for (i = 0; i < qedf->bdq_pbl_list_num_entries; i++) { + *list = qedf->bdq_pbl_dma; + list++; + page += QEDF_PAGE_SIZE; + } + + return 0; +} + +static int qedf_alloc_global_queues(struct qedf_ctx *qedf) +{ + u32 *list; + int i; + int status = 0, rc; + u32 *pbl; + dma_addr_t page; + int num_pages; + + /* Allocate and map CQs, RQs */ + /* + * Number of global queues (CQ / RQ). 
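/*
 * Illustrative sketch, not part of this patch.  The BDQ PBL population above
 * hands each 64-bit DMA address to the hardware as two 32-bit halves (then
 * byte-swapped with cpu_to_le32()), which is all the U64_HI()/U64_LO() split
 * amounts to:
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_U64_HI(v) ((uint32_t)(((uint64_t)(v)) >> 32))
#define DEMO_U64_LO(v) ((uint32_t)(((uint64_t)(v)) & 0xffffffff))

int main(void)
{
        uint64_t dma = 0x0000001234abcd00ULL;   /* pretend buffer address */

        assert(DEMO_U64_HI(dma) == 0x00000012);
        assert(DEMO_U64_LO(dma) == 0x34abcd00);

        /* Recombining the halves gives back the original address. */
        assert((((uint64_t)DEMO_U64_HI(dma) << 32) | DEMO_U64_LO(dma)) == dma);
        return 0;
}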
This should + * be <= number of available MSIX vectors for the PF + */ + if (!qedf->num_queues) { + QEDF_ERR(&(qedf->dbg_ctx), "No MSI-X vectors available!\n"); + return 1; + } + + /* + * Make sure we allocated the PBL that will contain the physical + * addresses of our queues + */ + if (!qedf->p_cpuq) { + status = 1; + goto mem_alloc_failure; + } + + qedf->global_queues = kzalloc((sizeof(struct global_queue *) + * qedf->num_queues), GFP_KERNEL); + if (!qedf->global_queues) { + QEDF_ERR(&(qedf->dbg_ctx), "Unable to allocate global " + "queues array ptr memory\n"); + return -ENOMEM; + } + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "qedf->global_queues=%p.\n", qedf->global_queues); + + /* Allocate DMA coherent buffers for BDQ */ + rc = qedf_alloc_bdq(qedf); + if (rc) + goto mem_alloc_failure; + + /* Allocate a CQ and an associated PBL for each MSI-X vector */ + for (i = 0; i < qedf->num_queues; i++) { + qedf->global_queues[i] = kzalloc(sizeof(struct global_queue), + GFP_KERNEL); + if (!qedf->global_queues[i]) { + QEDF_WARN(&(qedf->dbg_ctx), "Unable to allocate " + "global queue %d.\n", i); + status = -ENOMEM; + goto mem_alloc_failure; + } + + qedf->global_queues[i]->cq_mem_size = + FCOE_PARAMS_CQ_NUM_ENTRIES * sizeof(struct fcoe_cqe); + qedf->global_queues[i]->cq_mem_size = + ALIGN(qedf->global_queues[i]->cq_mem_size, QEDF_PAGE_SIZE); + + qedf->global_queues[i]->cq_pbl_size = + (qedf->global_queues[i]->cq_mem_size / + PAGE_SIZE) * sizeof(void *); + qedf->global_queues[i]->cq_pbl_size = + ALIGN(qedf->global_queues[i]->cq_pbl_size, QEDF_PAGE_SIZE); + + qedf->global_queues[i]->cq = + dma_zalloc_coherent(&qedf->pdev->dev, + qedf->global_queues[i]->cq_mem_size, + &qedf->global_queues[i]->cq_dma, GFP_KERNEL); + + if (!qedf->global_queues[i]->cq) { + QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate cq.\n"); + status = -ENOMEM; + goto mem_alloc_failure; + } + + qedf->global_queues[i]->cq_pbl = + dma_zalloc_coherent(&qedf->pdev->dev, + qedf->global_queues[i]->cq_pbl_size, + &qedf->global_queues[i]->cq_pbl_dma, GFP_KERNEL); + + if (!qedf->global_queues[i]->cq_pbl) { + QEDF_WARN(&(qedf->dbg_ctx), "Could not allocate cq PBL.\n"); + status = -ENOMEM; + goto mem_alloc_failure; + } + + /* Create PBL */ + num_pages = qedf->global_queues[i]->cq_mem_size / + QEDF_PAGE_SIZE; + page = qedf->global_queues[i]->cq_dma; + pbl = (u32 *)qedf->global_queues[i]->cq_pbl; + + while (num_pages--) { + *pbl = U64_LO(page); + pbl++; + *pbl = U64_HI(page); + pbl++; + page += QEDF_PAGE_SIZE; + } + /* Set the initial consumer index for cq */ + qedf->global_queues[i]->cq_cons_idx = 0; + } + + list = (u32 *)qedf->p_cpuq; + + /* + * The list is built as follows: CQ#0 PBL pointer, RQ#0 PBL pointer, + * CQ#1 PBL pointer, RQ#1 PBL pointer, etc. Each PBL pointer points + * to the physical address which contains an array of pointers to + * the physical addresses of the specific queue pages. 
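/*
 * Illustrative sketch, not part of this patch.  The loop just below lays
 * qedf->p_cpuq out exactly as the comment above describes: for each queue,
 * the CQ PBL address as (lo, hi) followed by the RQ PBL slot as (lo, hi),
 * with the RQ entries written as zero since no per-queue request queues are
 * populated here.
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_NUM_QUEUES 2

int main(void)
{
        uint64_t cq_pbl_dma[DEMO_NUM_QUEUES] = { 0x1000, 0x2000 };
        uint32_t list[DEMO_NUM_QUEUES * 4] = { 0 };
        uint32_t *p = list;
        int i;

        for (i = 0; i < DEMO_NUM_QUEUES; i++) {
                *p++ = (uint32_t)(cq_pbl_dma[i] & 0xffffffff);  /* CQ#i lo */
                *p++ = (uint32_t)(cq_pbl_dma[i] >> 32);         /* CQ#i hi */
                *p++ = 0;                                       /* RQ#i lo */
                *p++ = 0;                                       /* RQ#i hi */
        }

        assert(list[0] == 0x1000 && list[4] == 0x2000);
        return 0;
}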
+ */ + for (i = 0; i < qedf->num_queues; i++) { + *list = U64_LO(qedf->global_queues[i]->cq_pbl_dma); + list++; + *list = U64_HI(qedf->global_queues[i]->cq_pbl_dma); + list++; + *list = U64_LO(0); + list++; + *list = U64_HI(0); + list++; + } + + return 0; + +mem_alloc_failure: + qedf_free_global_queues(qedf); + return status; +} + +static int qedf_set_fcoe_pf_param(struct qedf_ctx *qedf) +{ + u8 sq_num_pbl_pages; + u32 sq_mem_size; + u32 cq_mem_size; + u32 cq_num_entries; + int rval; + + /* + * The number of completion queues/fastpath interrupts/status blocks + * we allocation is the minimum off: + * + * Number of CPUs + * Number allocated by qed for our PCI function + */ + qedf->num_queues = MIN_NUM_CPUS_MSIX(qedf); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "Number of CQs is %d.\n", + qedf->num_queues); + + qedf->p_cpuq = pci_alloc_consistent(qedf->pdev, + qedf->num_queues * sizeof(struct qedf_glbl_q_params), + &qedf->hw_p_cpuq); + + if (!qedf->p_cpuq) { + QEDF_ERR(&(qedf->dbg_ctx), "pci_alloc_consistent failed.\n"); + return 1; + } + + rval = qedf_alloc_global_queues(qedf); + if (rval) { + QEDF_ERR(&(qedf->dbg_ctx), "Global queue allocation " + "failed.\n"); + return 1; + } + + /* Calculate SQ PBL size in the same manner as in qedf_sq_alloc() */ + sq_mem_size = SQ_NUM_ENTRIES * sizeof(struct fcoe_wqe); + sq_mem_size = ALIGN(sq_mem_size, QEDF_PAGE_SIZE); + sq_num_pbl_pages = (sq_mem_size / QEDF_PAGE_SIZE); + + /* Calculate CQ num entries */ + cq_mem_size = FCOE_PARAMS_CQ_NUM_ENTRIES * sizeof(struct fcoe_cqe); + cq_mem_size = ALIGN(cq_mem_size, QEDF_PAGE_SIZE); + cq_num_entries = cq_mem_size / sizeof(struct fcoe_cqe); + + memset(&(qedf->pf_params), 0, sizeof(qedf->pf_params)); + + /* Setup the value for fcoe PF */ + qedf->pf_params.fcoe_pf_params.num_cons = QEDF_MAX_SESSIONS; + qedf->pf_params.fcoe_pf_params.num_tasks = FCOE_PARAMS_NUM_TASKS; + qedf->pf_params.fcoe_pf_params.glbl_q_params_addr = + (u64)qedf->hw_p_cpuq; + qedf->pf_params.fcoe_pf_params.sq_num_pbl_pages = sq_num_pbl_pages; + + qedf->pf_params.fcoe_pf_params.rq_buffer_log_size = 0; + + qedf->pf_params.fcoe_pf_params.cq_num_entries = cq_num_entries; + qedf->pf_params.fcoe_pf_params.num_cqs = qedf->num_queues; + + /* log_page_size: 12 for 4KB pages */ + qedf->pf_params.fcoe_pf_params.log_page_size = ilog2(QEDF_PAGE_SIZE); + + qedf->pf_params.fcoe_pf_params.mtu = 9000; + qedf->pf_params.fcoe_pf_params.gl_rq_pi = QEDF_FCOE_PARAMS_GL_RQ_PI; + qedf->pf_params.fcoe_pf_params.gl_cmd_pi = QEDF_FCOE_PARAMS_GL_CMD_PI; + + /* BDQ address and size */ + qedf->pf_params.fcoe_pf_params.bdq_pbl_base_addr[0] = + qedf->bdq_pbl_list_dma; + qedf->pf_params.fcoe_pf_params.bdq_pbl_num_entries[0] = + qedf->bdq_pbl_list_num_entries; + qedf->pf_params.fcoe_pf_params.rq_buffer_size = QEDF_BDQ_BUF_SIZE; + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "bdq_list=%p bdq_pbl_list_dma=%llx bdq_pbl_list_entries=%d.\n", + qedf->bdq_pbl_list, + qedf->pf_params.fcoe_pf_params.bdq_pbl_base_addr[0], + qedf->pf_params.fcoe_pf_params.bdq_pbl_num_entries[0]); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "cq_num_entries=%d.\n", + qedf->pf_params.fcoe_pf_params.cq_num_entries); + + return 0; +} + +/* Free DMA coherent memory for array of queue pointers we pass to qed */ +static void qedf_free_fcoe_pf_param(struct qedf_ctx *qedf) +{ + size_t size = 0; + + if (qedf->p_cpuq) { + size = qedf->num_queues * sizeof(struct qedf_glbl_q_params); + pci_free_consistent(qedf->pdev, size, qedf->p_cpuq, + qedf->hw_p_cpuq); + } + + qedf_free_global_queues(qedf); + + if 
(qedf->global_queues) + kfree(qedf->global_queues); +} + +/* + * PCI driver functions + */ + +static const struct pci_device_id qedf_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, 0x165c) }, + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, 0x8080) }, + {0} +}; +MODULE_DEVICE_TABLE(pci, qedf_pci_tbl); + +static struct pci_driver qedf_pci_driver = { + .name = QEDF_MODULE_NAME, + .id_table = qedf_pci_tbl, + .probe = qedf_probe, + .remove = qedf_remove, +}; + +static int __qedf_probe(struct pci_dev *pdev, int mode) +{ + int rc = -EINVAL; + struct fc_lport *lport; + struct qedf_ctx *qedf; + struct Scsi_Host *host; + bool is_vf = false; + struct qed_ll2_params params; + char host_buf[20]; + struct qed_link_params link_params; + int status; + void *task_start, *task_end; + struct qed_slowpath_params slowpath_params; + struct qed_probe_params qed_params; + u16 tmp; + + /* + * When doing error recovery we didn't reap the lport so don't try + * to reallocate it. + */ + if (mode != QEDF_MODE_RECOVERY) { + lport = libfc_host_alloc(&qedf_host_template, + sizeof(struct qedf_ctx)); + + if (!lport) { + QEDF_ERR(NULL, "Could not allocate lport.\n"); + rc = -ENOMEM; + goto err0; + } + + /* Initialize qedf_ctx */ + qedf = lport_priv(lport); + qedf->lport = lport; + qedf->ctlr.lp = lport; + qedf->pdev = pdev; + qedf->dbg_ctx.pdev = pdev; + qedf->dbg_ctx.host_no = lport->host->host_no; + spin_lock_init(&qedf->hba_lock); + INIT_LIST_HEAD(&qedf->fcports); + qedf->curr_conn_id = QEDF_MAX_SESSIONS - 1; + atomic_set(&qedf->num_offloads, 0); + qedf->stop_io_on_error = false; + pci_set_drvdata(pdev, qedf); + init_completion(&qedf->fipvlan_compl); + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_INFO, + "QLogic FastLinQ FCoE Module qedf %s, " + "FW %d.%d.%d.%d\n", QEDF_VERSION, + FW_MAJOR_VERSION, FW_MINOR_VERSION, FW_REVISION_VERSION, + FW_ENGINEERING_VERSION); + } else { + /* Init pointers during recovery */ + qedf = pci_get_drvdata(pdev); + lport = qedf->lport; + } + + host = lport->host; + + /* Allocate mempool for qedf_io_work structs */ + qedf->io_mempool = mempool_create_slab_pool(QEDF_IO_WORK_MIN, + qedf_io_work_cache); + if (qedf->io_mempool == NULL) { + QEDF_ERR(&(qedf->dbg_ctx), "qedf->io_mempool is NULL.\n"); + goto err1; + } + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_INFO, "qedf->io_mempool=%p.\n", + qedf->io_mempool); + + sprintf(host_buf, "qedf_%u_link", + qedf->lport->host->host_no); + qedf->link_update_wq = create_workqueue(host_buf); + INIT_DELAYED_WORK(&qedf->link_update, qedf_handle_link_update); + INIT_DELAYED_WORK(&qedf->link_recovery, qedf_link_recovery); + + qedf->fipvlan_retries = qedf_fipvlan_retries; + + /* + * Common probe. Takes care of basic hardware init and pci_* + * functions. 
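/*
 * Illustrative sketch, not part of this patch.  The io_mempool created above
 * keeps a reserve of qedf_io_work objects backed by the qedf_io_work_cache
 * slab, which matters because the fast path allocates them with GFP_ATOMIC.
 * A minimal kernel-context shape of the same pattern (all demo_* names are
 * placeholders, and 16 is an arbitrary reserve size):
 */
#include <linux/errno.h>
#include <linux/mempool.h>
#include <linux/slab.h>

struct demo_work {
        int payload;
};

static struct kmem_cache *demo_cache;
static mempool_t *demo_pool;

static int demo_pool_init(void)
{
        demo_cache = KMEM_CACHE(demo_work, 0);
        if (!demo_cache)
                return -ENOMEM;

        /* Guarantee at least 16 objects for atomic allocations. */
        demo_pool = mempool_create_slab_pool(16, demo_cache);
        if (!demo_pool) {
                kmem_cache_destroy(demo_cache);
                return -ENOMEM;
        }
        return 0;
}

static void demo_pool_exit(void)
{
        mempool_destroy(demo_pool);
        kmem_cache_destroy(demo_cache);
}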
+ */ + memset(&qed_params, 0, sizeof(qed_params)); + qed_params.protocol = QED_PROTOCOL_FCOE; + qed_params.dp_module = qedf_dp_module; + qed_params.dp_level = qedf_dp_level; + qed_params.is_vf = is_vf; + qedf->cdev = qed_ops->common->probe(pdev, &qed_params); + if (!qedf->cdev) { + rc = -ENODEV; + goto err1; + } + + /* Learn information crucial for qedf to progress */ + rc = qed_ops->fill_dev_info(qedf->cdev, &qedf->dev_info); + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "Failed to dev info.\n"); + goto err1; + } + + /* queue allocation code should come here + * order should be + * slowpath_start + * status block allocation + * interrupt registration (to get min number of queues) + * set_fcoe_pf_param + * qed_sp_fcoe_func_start + */ + rc = qedf_set_fcoe_pf_param(qedf); + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "Cannot set fcoe pf param.\n"); + goto err2; + } + qed_ops->common->update_pf_params(qedf->cdev, &qedf->pf_params); + + /* Record BDQ producer doorbell addresses */ + qedf->bdq_primary_prod = qedf->dev_info.primary_dbq_rq_addr; + qedf->bdq_secondary_prod = qedf->dev_info.secondary_bdq_rq_addr; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "BDQ primary_prod=%p secondary_prod=%p.\n", qedf->bdq_primary_prod, + qedf->bdq_secondary_prod); + + qed_ops->register_ops(qedf->cdev, &qedf_cb_ops, qedf); + + rc = qedf_prepare_sb(qedf); + if (rc) { + + QEDF_ERR(&(qedf->dbg_ctx), "Cannot start slowpath.\n"); + goto err2; + } + + /* Start the Slowpath-process */ + slowpath_params.int_mode = QED_INT_MODE_MSIX; + slowpath_params.drv_major = QEDF_DRIVER_MAJOR_VER; + slowpath_params.drv_minor = QEDF_DRIVER_MINOR_VER; + slowpath_params.drv_rev = QEDF_DRIVER_REV_VER; + slowpath_params.drv_eng = QEDF_DRIVER_ENG_VER; + strncpy(slowpath_params.name, "qedf", QED_DRV_VER_STR_SIZE); + rc = qed_ops->common->slowpath_start(qedf->cdev, &slowpath_params); + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "Cannot start slowpath.\n"); + goto err2; + } + + /* + * update_pf_params needs to be called before and after slowpath + * start + */ + qed_ops->common->update_pf_params(qedf->cdev, &qedf->pf_params); + + /* Setup interrupts */ + rc = qedf_setup_int(qedf); + if (rc) + goto err3; + + rc = qed_ops->start(qedf->cdev, &qedf->tasks); + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "Cannot start FCoE function.\n"); + goto err4; + } + task_start = qedf_get_task_mem(&qedf->tasks, 0); + task_end = qedf_get_task_mem(&qedf->tasks, MAX_TID_BLOCKS_FCOE - 1); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "Task context start=%p, " + "end=%p block_size=%u.\n", task_start, task_end, + qedf->tasks.size); + + /* + * We need to write the number of BDs in the BDQ we've preallocated so + * the f/w will do a prefetch and we'll get an unsolicited CQE when a + * packet arrives. 
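/*
 * Illustrative sketch, not part of this patch.  The doorbell writes just
 * below post the BDQ producer index and immediately read the register back;
 * reading back through the same BAR is a common way to flush a posted MMIO
 * write before continuing (that this is the intent here is an assumption).
 */
#include <linux/io.h>
#include <linux/types.h>

static void demo_ring_doorbell(void __iomem *prod_reg, u16 prod_idx)
{
        writew(prod_idx, prod_reg);     /* post the new producer index */
        (void)readw(prod_reg);          /* flush the posted write */
}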
+ */ + qedf->bdq_prod_idx = QEDF_BDQ_SIZE; + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Writing %d to primary and secondary BDQ doorbell registers.\n", + qedf->bdq_prod_idx); + writew(qedf->bdq_prod_idx, qedf->bdq_primary_prod); + tmp = readw(qedf->bdq_primary_prod); + writew(qedf->bdq_prod_idx, qedf->bdq_secondary_prod); + tmp = readw(qedf->bdq_secondary_prod); + + qed_ops->common->set_power_state(qedf->cdev, PCI_D0); + + /* Now that the dev_info struct has been filled in set the MAC + * address + */ + ether_addr_copy(qedf->mac, qedf->dev_info.common.hw_mac); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "MAC address is %pM.\n", + qedf->mac); + + /* + * Set the WWNN and WWPN in the following way: + * + * If the info we get from qed is non-zero then use that to set the + * WWPN and WWNN. Otherwise fall back to use fcoe_wwn_from_mac() based + * on the MAC address. + */ + if (qedf->dev_info.wwnn != 0 && qedf->dev_info.wwpn != 0) { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Setting WWPN and WWNN from qed dev_info.\n"); + qedf->wwnn = qedf->dev_info.wwnn; + qedf->wwpn = qedf->dev_info.wwpn; + } else { + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Setting WWPN and WWNN using fcoe_wwn_from_mac().\n"); + qedf->wwnn = fcoe_wwn_from_mac(qedf->mac, 1, 0); + qedf->wwpn = fcoe_wwn_from_mac(qedf->mac, 2, 0); + } + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "WWNN=%016llx " + "WWPN=%016llx.\n", qedf->wwnn, qedf->wwpn); + + sprintf(host_buf, "host_%d", host->host_no); + qed_ops->common->set_name(qedf->cdev, host_buf); + + + /* Set xid max values */ + qedf->max_scsi_xid = QEDF_MAX_SCSI_XID; + qedf->max_els_xid = QEDF_MAX_ELS_XID; + + /* Allocate cmd mgr */ + qedf->cmd_mgr = qedf_cmd_mgr_alloc(qedf); + if (!qedf->cmd_mgr) { + QEDF_ERR(&(qedf->dbg_ctx), "Failed to allocate cmd mgr.\n"); + rc = -ENOMEM; + goto err5; + } + + if (mode != QEDF_MODE_RECOVERY) { + host->transportt = qedf_fc_transport_template; + host->can_queue = QEDF_MAX_ELS_XID; + host->max_lun = qedf_max_lun; + host->max_cmd_len = QEDF_MAX_CDB_LEN; + rc = scsi_add_host(host, &pdev->dev); + if (rc) + goto err6; + } + + memset(¶ms, 0, sizeof(params)); + params.mtu = 9000; + ether_addr_copy(params.ll2_mac_address, qedf->mac); + + /* Start LL2 processing thread */ + snprintf(host_buf, 20, "qedf_%d_ll2", host->host_no); + qedf->ll2_recv_wq = + create_workqueue(host_buf); + if (!qedf->ll2_recv_wq) { + QEDF_ERR(&(qedf->dbg_ctx), "Failed to LL2 workqueue.\n"); + rc = -ENOMEM; + goto err7; + } + +#ifdef CONFIG_DEBUG_FS + qedf_dbg_host_init(&(qedf->dbg_ctx), qedf_debugfs_ops, + qedf_dbg_fops); +#endif + + /* Start LL2 */ + qed_ops->ll2->register_cb_ops(qedf->cdev, &qedf_ll2_cb_ops, qedf); + rc = qed_ops->ll2->start(qedf->cdev, ¶ms); + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "Could not start Light L2.\n"); + goto err7; + } + set_bit(QEDF_LL2_STARTED, &qedf->flags); + + /* Set initial FIP/FCoE VLAN to NULL */ + qedf->vlan_id = 0; + + /* + * No need to setup fcoe_ctlr or fc_lport objects during recovery since + * they were not reaped during the unload process. 
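/*
 * Illustrative sketch, not part of this patch.  When qed reports no WWNs,
 * the code above falls back to fcoe_wwn_from_mac(mac, 1, 0) for the node
 * name and fcoe_wwn_from_mac(mac, 2, 0) for the port name.  The helper below
 * only shows the general idea of building an NAA-style WWN around a 48-bit
 * MAC; the exact bit layout libfcoe uses should be taken from its own
 * implementation, and demo_wwn_from_mac() is purely hypothetical.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t demo_wwn_from_mac(const uint8_t mac[6], unsigned int naa,
                                  unsigned int port)
{
        uint64_t m = 0;
        int i;

        for (i = 0; i < 6; i++)
                m = (m << 8) | mac[i];

        /* NAA nibble on top, optional port bits, MAC in the low 48 bits. */
        return ((uint64_t)naa << 60) | ((uint64_t)port << 48) | m;
}

int main(void)
{
        const uint8_t mac[6] = { 0x00, 0x0e, 0x1e, 0x11, 0x22, 0x33 };

        /* Node and port names derived from one MAC must still differ. */
        assert(demo_wwn_from_mac(mac, 1, 0) != demo_wwn_from_mac(mac, 2, 0));
        return 0;
}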
+ */ + if (mode != QEDF_MODE_RECOVERY) { + /* Setup imbedded fcoe controller */ + qedf_fcoe_ctlr_setup(qedf); + + /* Setup lport */ + rc = qedf_lport_setup(qedf); + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), + "qedf_lport_setup failed.\n"); + goto err7; + } + } + + sprintf(host_buf, "qedf_%u_timer", qedf->lport->host->host_no); + qedf->timer_work_queue = + create_workqueue(host_buf); + if (!qedf->timer_work_queue) { + QEDF_ERR(&(qedf->dbg_ctx), "Failed to start timer " + "workqueue.\n"); + rc = -ENOMEM; + goto err7; + } + + /* DPC workqueue is not reaped during recovery unload */ + if (mode != QEDF_MODE_RECOVERY) { + sprintf(host_buf, "qedf_%u_dpc", + qedf->lport->host->host_no); + qedf->dpc_wq = create_workqueue(host_buf); + } + + /* + * GRC dump and sysfs parameters are not reaped during the recovery + * unload process. + */ + if (mode != QEDF_MODE_RECOVERY) { + qedf->grcdump_size = qed_ops->common->dbg_grc_size(qedf->cdev); + if (qedf->grcdump_size) { + rc = qedf_alloc_grc_dump_buf(&qedf->grcdump, + qedf->grcdump_size); + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), + "GRC Dump buffer alloc failed.\n"); + qedf->grcdump = NULL; + } + + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "grcdump: addr=%p, size=%u.\n", + qedf->grcdump, qedf->grcdump_size); + } + qedf_create_sysfs_ctx_attr(qedf); + + /* Initialize I/O tracing for this adapter */ + spin_lock_init(&qedf->io_trace_lock); + qedf->io_trace_idx = 0; + } + + init_completion(&qedf->flogi_compl); + + memset(&link_params, 0, sizeof(struct qed_link_params)); + link_params.link_up = true; + status = qed_ops->common->set_link(qedf->cdev, &link_params); + if (status) + QEDF_WARN(&(qedf->dbg_ctx), "set_link failed.\n"); + + /* Start/restart discovery */ + if (mode == QEDF_MODE_RECOVERY) + fcoe_ctlr_link_up(&qedf->ctlr); + else + fc_fabric_login(lport); + + /* All good */ + return 0; + +err7: + if (qedf->ll2_recv_wq) + destroy_workqueue(qedf->ll2_recv_wq); + fc_remove_host(qedf->lport->host); + scsi_remove_host(qedf->lport->host); +#ifdef CONFIG_DEBUG_FS + qedf_dbg_host_exit(&(qedf->dbg_ctx)); +#endif +err6: + qedf_cmd_mgr_free(qedf->cmd_mgr); +err5: + qed_ops->stop(qedf->cdev); +err4: + qedf_free_fcoe_pf_param(qedf); + qedf_sync_free_irqs(qedf); +err3: + qed_ops->common->slowpath_stop(qedf->cdev); +err2: + qed_ops->common->remove(qedf->cdev); +err1: + scsi_host_put(lport->host); +err0: + return rc; +} + +static int qedf_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + return __qedf_probe(pdev, QEDF_MODE_NORMAL); +} + +static void __qedf_remove(struct pci_dev *pdev, int mode) +{ + struct qedf_ctx *qedf; + + if (!pdev) { + QEDF_ERR(NULL, "pdev is NULL.\n"); + return; + } + + qedf = pci_get_drvdata(pdev); + + /* + * Prevent race where we're in board disable work and then try to + * rmmod the module. 
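/*
 * Illustrative sketch, not part of this patch.  __qedf_probe() above uses
 * the usual goto-unwind error handling: every acquired resource adds one
 * more label (err1 ... err7), and a failure jumps to the label that tears
 * down exactly the resources acquired so far, in reverse order.  The bare
 * shape of the idiom, with malloc() standing in for the driver resources:
 */
#include <stdlib.h>

static void *demo_res_a;
static void *demo_res_b;

static int demo_probe(void)
{
        demo_res_a = malloc(64);
        if (!demo_res_a)
                goto err0;

        demo_res_b = malloc(64);
        if (!demo_res_b)
                goto err1;

        return 0;       /* success: resources stay live until remove */

err1:
        free(demo_res_a);
err0:
        return -1;
}

static void demo_remove(void)
{
        free(demo_res_b);
        free(demo_res_a);
}

int main(void)
{
        if (demo_probe())
                return 1;
        demo_remove();
        return 0;
}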
+ */ + if (test_bit(QEDF_UNLOADING, &qedf->flags)) { + QEDF_ERR(&qedf->dbg_ctx, "Already removing PCI function.\n"); + return; + } + + if (mode != QEDF_MODE_RECOVERY) + set_bit(QEDF_UNLOADING, &qedf->flags); + + /* Logoff the fabric to upload all connections */ + if (mode == QEDF_MODE_RECOVERY) + fcoe_ctlr_link_down(&qedf->ctlr); + else + fc_fabric_logoff(qedf->lport); + qedf_wait_for_upload(qedf); + +#ifdef CONFIG_DEBUG_FS + qedf_dbg_host_exit(&(qedf->dbg_ctx)); +#endif + + /* Stop any link update handling */ + cancel_delayed_work_sync(&qedf->link_update); + destroy_workqueue(qedf->link_update_wq); + qedf->link_update_wq = NULL; + + if (qedf->timer_work_queue) + destroy_workqueue(qedf->timer_work_queue); + + /* Stop Light L2 */ + clear_bit(QEDF_LL2_STARTED, &qedf->flags); + qed_ops->ll2->stop(qedf->cdev); + if (qedf->ll2_recv_wq) + destroy_workqueue(qedf->ll2_recv_wq); + + /* Stop fastpath */ + qedf_sync_free_irqs(qedf); + qedf_destroy_sb(qedf); + + /* + * During recovery don't destroy OS constructs that represent the + * physical port. + */ + if (mode != QEDF_MODE_RECOVERY) { + qedf_free_grc_dump_buf(&qedf->grcdump); + qedf_remove_sysfs_ctx_attr(qedf); + + /* Remove all SCSI/libfc/libfcoe structures */ + fcoe_ctlr_destroy(&qedf->ctlr); + fc_lport_destroy(qedf->lport); + fc_remove_host(qedf->lport->host); + scsi_remove_host(qedf->lport->host); + } + + qedf_cmd_mgr_free(qedf->cmd_mgr); + + if (mode != QEDF_MODE_RECOVERY) { + fc_exch_mgr_free(qedf->lport); + fc_lport_free_stats(qedf->lport); + + /* Wait for all vports to be reaped */ + qedf_wait_for_vport_destroy(qedf); + } + + /* + * Now that all connections have been uploaded we can stop the + * rest of the qed operations + */ + qed_ops->stop(qedf->cdev); + + if (mode != QEDF_MODE_RECOVERY) { + if (qedf->dpc_wq) { + /* Stop general DPC handling */ + destroy_workqueue(qedf->dpc_wq); + qedf->dpc_wq = NULL; + } + } + + /* Final shutdown for the board */ + qedf_free_fcoe_pf_param(qedf); + if (mode != QEDF_MODE_RECOVERY) { + qed_ops->common->set_power_state(qedf->cdev, PCI_D0); + pci_set_drvdata(pdev, NULL); + } + qed_ops->common->slowpath_stop(qedf->cdev); + qed_ops->common->remove(qedf->cdev); + + mempool_destroy(qedf->io_mempool); + + /* Only reap the Scsi_host on a real removal */ + if (mode != QEDF_MODE_RECOVERY) + scsi_host_put(qedf->lport->host); +} + +static void qedf_remove(struct pci_dev *pdev) +{ + /* Check to make sure this function wasn't already disabled */ + if (!atomic_read(&pdev->enable_cnt)) + return; + + __qedf_remove(pdev, QEDF_MODE_NORMAL); +} + +/* + * Module Init/Remove + */ + +static int __init qedf_init(void) +{ + int ret; + + /* If debug=1 passed, set the default log mask */ + if (qedf_debug == QEDF_LOG_DEFAULT) + qedf_debug = QEDF_DEFAULT_LOG_MASK; + + /* Print driver banner */ + QEDF_INFO(NULL, QEDF_LOG_INFO, "%s v%s.\n", QEDF_DESCR, + QEDF_VERSION); + + /* Create kmem_cache for qedf_io_work structs */ + qedf_io_work_cache = kmem_cache_create("qedf_io_work_cache", + sizeof(struct qedf_io_work), 0, SLAB_HWCACHE_ALIGN, NULL); + if (qedf_io_work_cache == NULL) { + QEDF_ERR(NULL, "qedf_io_work_cache is NULL.\n"); + goto err1; + } + QEDF_INFO(NULL, QEDF_LOG_DISC, "qedf_io_work_cache=%p.\n", + qedf_io_work_cache); + + qed_ops = qed_get_fcoe_ops(); + if (!qed_ops) { + QEDF_ERR(NULL, "Failed to get qed fcoe operations\n"); + goto err1; + } + +#ifdef CONFIG_DEBUG_FS + qedf_dbg_init("qedf"); +#endif + + qedf_fc_transport_template = + fc_attach_transport(&qedf_fc_transport_fn); + if (!qedf_fc_transport_template) { + 
QEDF_ERR(NULL, "Could not register with FC transport\n"); + goto err2; + } + + qedf_fc_vport_transport_template = + fc_attach_transport(&qedf_fc_vport_transport_fn); + if (!qedf_fc_vport_transport_template) { + QEDF_ERR(NULL, "Could not register vport template with FC " + "transport\n"); + goto err3; + } + + qedf_io_wq = create_workqueue("qedf_io_wq"); + if (!qedf_io_wq) { + QEDF_ERR(NULL, "Could not create qedf_io_wq.\n"); + goto err4; + } + + qedf_cb_ops.get_login_failures = qedf_get_login_failures; + + ret = pci_register_driver(&qedf_pci_driver); + if (ret) { + QEDF_ERR(NULL, "Failed to register driver\n"); + goto err5; + } + + return 0; + +err5: + destroy_workqueue(qedf_io_wq); +err4: + fc_release_transport(qedf_fc_vport_transport_template); +err3: + fc_release_transport(qedf_fc_transport_template); +err2: +#ifdef CONFIG_DEBUG_FS + qedf_dbg_exit(); +#endif + qed_put_fcoe_ops(); +err1: + return -EINVAL; +} + +static void __exit qedf_cleanup(void) +{ + pci_unregister_driver(&qedf_pci_driver); + + destroy_workqueue(qedf_io_wq); + + fc_release_transport(qedf_fc_vport_transport_template); + fc_release_transport(qedf_fc_transport_template); +#ifdef CONFIG_DEBUG_FS + qedf_dbg_exit(); +#endif + qed_put_fcoe_ops(); + + kmem_cache_destroy(qedf_io_work_cache); +} + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("QLogic QEDF 25/40/50/100Gb FCoE Driver"); +MODULE_AUTHOR("QLogic Corporation"); +MODULE_VERSION(QEDF_VERSION); +module_init(qedf_init); +module_exit(qedf_cleanup); diff --git a/drivers/scsi/qedf/qedf_version.h b/drivers/scsi/qedf/qedf_version.h new file mode 100644 index 0000000..c247805 --- /dev/null +++ b/drivers/scsi/qedf/qedf_version.h @@ -0,0 +1,15 @@ +/* + * QLogic FCoE Offload Driver + * Copyright (c) 2016-2017 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#define QEDF_VERSION "8.33.0.20" +#define QEDF_DRIVER_MAJOR_VER 8 +#define QEDF_DRIVER_MINOR_VER 33 +#define QEDF_DRIVER_REV_VER 0 +#define QEDF_DRIVER_ENG_VER 20 + diff --git a/drivers/scsi/qedi/Kconfig b/drivers/scsi/qedi/Kconfig new file mode 100644 index 0000000..d1db92d --- /dev/null +++ b/drivers/scsi/qedi/Kconfig @@ -0,0 +1,12 @@ +config QEDI + tristate "QLogic QEDI 25/40/100Gb iSCSI Initiator Driver Support" + depends on PCI && SCSI && UIO + depends on QED + select SCSI_ISCSI_ATTRS + select QED_LL2 + select QED_OOO + select QED_ISCSI + select ISCSI_BOOT_SYSFS + ---help--- + This driver supports iSCSI offload for the QLogic FastLinQ + 41000 Series Converged Network Adapters. diff --git a/drivers/scsi/qedi/Makefile b/drivers/scsi/qedi/Makefile new file mode 100644 index 0000000..90a6925 --- /dev/null +++ b/drivers/scsi/qedi/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_QEDI) := qedi.o +qedi-y := qedi_main.o qedi_iscsi.o qedi_fw.o qedi_sysfs.o \ + qedi_dbg.o qedi_fw_api.o + +qedi-$(CONFIG_DEBUG_FS) += qedi_debugfs.o diff --git a/drivers/scsi/qedi/qedi.h b/drivers/scsi/qedi/qedi.h new file mode 100644 index 0000000..b8b22ce --- /dev/null +++ b/drivers/scsi/qedi/qedi.h @@ -0,0 +1,381 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ + +#ifndef _QEDI_H_ +#define _QEDI_H_ + +#define __PREVENT_QED_HSI__ + +#include +#include +#include +#include + +#include "qedi_hsi.h" +#include +#include "qedi_dbg.h" +#include +#include +#include "qedi_version.h" +#include "qedi_nvm_iscsi_cfg.h" + +#define QEDI_MODULE_NAME "qedi" + +struct qedi_endpoint; + +#ifndef GET_FIELD2 +#define GET_FIELD2(value, name) \ + (((value) & (name ## _MASK)) >> (name ## _OFFSET)) +#endif + +/* + * PCI function probe defines + */ +#define QEDI_MODE_NORMAL 0 +#define QEDI_MODE_RECOVERY 1 + +#define ISCSI_WQE_SET_PTU_INVALIDATE 1 +#define QEDI_MAX_ISCSI_TASK 4096 +#define QEDI_MAX_TASK_NUM 0x0FFF +#define QEDI_MAX_ISCSI_CONNS_PER_HBA 1024 +#define QEDI_ISCSI_MAX_BDS_PER_CMD 255 /* Firmware max BDs is 255 */ +#define MAX_OUSTANDING_TASKS_PER_CON 1024 + +#define QEDI_MAX_BD_LEN 0xffff +#define QEDI_BD_SPLIT_SZ 0x1000 +#define QEDI_PAGE_SIZE 4096 +#define QEDI_FAST_SGE_COUNT 4 +/* MAX Length for cached SGL */ +#define MAX_SGLEN_FOR_CACHESGL ((1U << 16) - 1) + +#define MIN_NUM_CPUS_MSIX(x) min_t(u32, x->dev_info.num_cqs, \ + num_online_cpus()) + +#define QEDI_LOCAL_PORT_MIN 60000 +#define QEDI_LOCAL_PORT_MAX 61024 +#define QEDI_LOCAL_PORT_RANGE (QEDI_LOCAL_PORT_MAX - QEDI_LOCAL_PORT_MIN) +#define QEDI_LOCAL_PORT_INVALID 0xffff +#define TX_RX_RING 16 +#define RX_RING (TX_RX_RING - 1) +#define LL2_SINGLE_BUF_SIZE 0x400 +#define QEDI_PAGE_SIZE 4096 +#define QEDI_PAGE_ALIGN(addr) ALIGN(addr, QEDI_PAGE_SIZE) +#define QEDI_PAGE_MASK (~((QEDI_PAGE_SIZE) - 1)) + +#define QEDI_PAGE_SIZE 4096 +#define QEDI_HW_DMA_BOUNDARY 0xfff +#define QEDI_PATH_HANDLE 0xFE0000000UL + +enum qedi_nvm_tgts { + QEDI_NVM_TGT_PRI, + QEDI_NVM_TGT_SEC, +}; + +struct qedi_uio_ctrl { + /* meta data */ + u32 uio_hsi_version; + + /* user writes */ + u32 host_tx_prod; + u32 host_rx_cons; + u32 host_rx_bd_cons; + u32 host_tx_pkt_len; + u32 host_rx_cons_cnt; + + /* driver writes */ + u32 hw_tx_cons; + u32 hw_rx_prod; + u32 hw_rx_bd_prod; + u32 hw_rx_prod_cnt; + + /* other */ + u8 mac_addr[6]; + u8 reserve[2]; +}; + +struct qedi_rx_bd { + u32 rx_pkt_index; + u32 rx_pkt_len; + u16 vlan_id; +}; + +#define QEDI_RX_DESC_CNT (QEDI_PAGE_SIZE / sizeof(struct qedi_rx_bd)) +#define QEDI_MAX_RX_DESC_CNT (QEDI_RX_DESC_CNT - 1) +#define QEDI_NUM_RX_BD (QEDI_RX_DESC_CNT * 1) +#define QEDI_MAX_RX_BD (QEDI_NUM_RX_BD - 1) + +#define QEDI_NEXT_RX_IDX(x) ((((x) & (QEDI_MAX_RX_DESC_CNT)) == \ + (QEDI_MAX_RX_DESC_CNT - 1)) ? 
\ + (x) + 2 : (x) + 1) + +struct qedi_uio_dev { + struct uio_info qedi_uinfo; + u32 uio_dev; + struct list_head list; + + u32 ll2_ring_size; + void *ll2_ring; + + u32 ll2_buf_size; + void *ll2_buf; + + void *rx_pkt; + void *tx_pkt; + + struct qedi_ctx *qedi; + struct pci_dev *pdev; + void *uctrl; +}; + +/* List to maintain the skb pointers */ +struct skb_work_list { + struct list_head list; + struct sk_buff *skb; + u16 vlan_id; +}; + +/* Queue sizes in number of elements */ +#define QEDI_SQ_SIZE MAX_OUSTANDING_TASKS_PER_CON +#define QEDI_CQ_SIZE 2048 +#define QEDI_CMDQ_SIZE QEDI_MAX_ISCSI_TASK +#define QEDI_PROTO_CQ_PROD_IDX 0 + +struct qedi_glbl_q_params { + u64 hw_p_cq; /* Completion queue PBL */ + u64 hw_p_rq; /* Request queue PBL */ + u64 hw_p_cmdq; /* Command queue PBL */ +}; + +struct global_queue { + union iscsi_cqe *cq; + dma_addr_t cq_dma; + u32 cq_mem_size; + u32 cq_cons_idx; /* Completion queue consumer index */ + + void *cq_pbl; + dma_addr_t cq_pbl_dma; + u32 cq_pbl_size; + +}; + +struct qedi_fastpath { + struct qed_sb_info *sb_info; + u16 sb_id; +#define QEDI_NAME_SIZE 16 + char name[QEDI_NAME_SIZE]; + struct qedi_ctx *qedi; +}; + +/* Used to pass fastpath information needed to process CQEs */ +struct qedi_io_work { + struct list_head list; + struct iscsi_cqe_solicited cqe; + u16 que_idx; +}; + +/** + * struct iscsi_cid_queue - Per adapter iscsi cid queue + * + * @cid_que_base: queue base memory + * @cid_que: queue memory pointer + * @cid_q_prod_idx: produce index + * @cid_q_cons_idx: consumer index + * @cid_q_max_idx: max index. used to detect wrap around condition + * @cid_free_cnt: queue size + * @conn_cid_tbl: iscsi cid to conn structure mapping table + * + * Per adapter iSCSI CID Queue + */ +struct iscsi_cid_queue { + void *cid_que_base; + u32 *cid_que; + u32 cid_q_prod_idx; + u32 cid_q_cons_idx; + u32 cid_q_max_idx; + u32 cid_free_cnt; + struct qedi_conn **conn_cid_tbl; +}; + +struct qedi_portid_tbl { + spinlock_t lock; /* Port id lock */ + u16 start; + u16 max; + u16 next; + unsigned long *table; +}; + +struct qedi_itt_map { + __le32 itt; + struct qedi_cmd *p_cmd; +}; + +/* I/O tracing entry */ +#define QEDI_IO_TRACE_SIZE 2048 +struct qedi_io_log { +#define QEDI_IO_TRACE_REQ 0 +#define QEDI_IO_TRACE_RSP 1 + u8 direction; + u16 task_id; + u32 cid; + u32 port_id; /* Remote port fabric ID */ + int lun; + u8 op; /* SCSI CDB */ + u8 lba[4]; + unsigned int bufflen; /* SCSI buffer length */ + unsigned int sg_count; /* Number of SG elements */ + u8 fast_sgs; /* number of fast sgls */ + u8 slow_sgs; /* number of slow sgls */ + u8 cached_sgs; /* number of cached sgls */ + int result; /* Result passed back to mid-layer */ + unsigned long jiffies; /* Time stamp when I/O logged */ + int refcount; /* Reference count for task id */ + unsigned int blk_req_cpu; /* CPU that the task is queued on by + * blk layer + */ + unsigned int req_cpu; /* CPU that the task is queued on */ + unsigned int intr_cpu; /* Interrupt CPU that the task is received on */ + unsigned int blk_rsp_cpu;/* CPU that task is actually processed and + * returned to blk layer + */ + bool cached_sge; + bool slow_sge; + bool fast_sge; +}; + +/* Number of entries in BDQ */ +#define QEDI_BDQ_NUM 256 +#define QEDI_BDQ_BUF_SIZE 256 + +/* DMA coherent buffers for BDQ */ +struct qedi_bdq_buf { + void *buf_addr; + dma_addr_t buf_dma; +}; + +/* Main port level struct */ +struct qedi_ctx { + struct qedi_dbg_ctx dbg_ctx; + struct Scsi_Host *shost; + struct pci_dev *pdev; + struct qed_dev *cdev; + struct qed_dev_iscsi_info dev_info; 
+ struct qed_int_info int_info; + struct qedi_glbl_q_params *p_cpuq; + struct global_queue **global_queues; + /* uio declaration */ + struct qedi_uio_dev *udev; + struct list_head ll2_skb_list; + spinlock_t ll2_lock; /* Light L2 lock */ + spinlock_t hba_lock; /* per port lock */ + struct task_struct *ll2_recv_thread; + unsigned long flags; +#define UIO_DEV_OPENED 1 +#define QEDI_IOTHREAD_WAKE 2 +#define QEDI_IN_RECOVERY 5 +#define QEDI_IN_OFFLINE 6 + + u8 mac[ETH_ALEN]; + u32 src_ip[4]; + u8 ip_type; + + /* Physical address of above array */ + dma_addr_t hw_p_cpuq; + + struct qedi_bdq_buf bdq[QEDI_BDQ_NUM]; + void *bdq_pbl; + dma_addr_t bdq_pbl_dma; + size_t bdq_pbl_mem_size; + void *bdq_pbl_list; + dma_addr_t bdq_pbl_list_dma; + u8 bdq_pbl_list_num_entries; + struct nvm_iscsi_cfg *iscsi_cfg; + dma_addr_t nvm_buf_dma; + void __iomem *bdq_primary_prod; + void __iomem *bdq_secondary_prod; + u16 bdq_prod_idx; + u16 rq_num_entries; + + u32 max_sqes; + u8 num_queues; + u32 max_active_conns; + + struct iscsi_cid_queue cid_que; + struct qedi_endpoint **ep_tbl; + struct qedi_portid_tbl lcl_port_tbl; + + /* Rx fast path intr context */ + struct qed_sb_info *sb_array; + struct qedi_fastpath *fp_array; + struct qed_iscsi_tid tasks; + +#define QEDI_LINK_DOWN 0 +#define QEDI_LINK_UP 1 + atomic_t link_state; + +#define QEDI_RESERVE_TASK_ID 0 +#define MAX_ISCSI_TASK_ENTRIES 4096 +#define QEDI_INVALID_TASK_ID (MAX_ISCSI_TASK_ENTRIES + 1) + unsigned long task_idx_map[MAX_ISCSI_TASK_ENTRIES / BITS_PER_LONG]; + struct qedi_itt_map *itt_map; + u16 tid_reuse_count[QEDI_MAX_ISCSI_TASK]; + struct qed_pf_params pf_params; + + struct workqueue_struct *tmf_thread; + struct workqueue_struct *offload_thread; + + u16 ll2_mtu; + + struct workqueue_struct *dpc_wq; + + spinlock_t task_idx_lock; /* To protect gbl context */ + s32 last_tidx_alloc; + s32 last_tidx_clear; + + struct qedi_io_log io_trace_buf[QEDI_IO_TRACE_SIZE]; + spinlock_t io_trace_lock; /* prtect trace Log buf */ + u16 io_trace_idx; + unsigned int intr_cpu; + u32 cached_sgls; + bool use_cached_sge; + u32 slow_sgls; + bool use_slow_sge; + u32 fast_sgls; + bool use_fast_sge; + + atomic_t num_offloads; +#define SYSFS_FLAG_FW_SEL_BOOT 2 +#define IPV6_LEN 41 +#define IPV4_LEN 17 + struct iscsi_boot_kset *boot_kset; +}; + +struct qedi_work { + struct list_head list; + struct qedi_ctx *qedi; + union iscsi_cqe cqe; + u16 que_idx; + bool is_solicited; +}; + +struct qedi_percpu_s { + struct task_struct *iothread; + struct list_head work_list; + spinlock_t p_work_lock; /* Per cpu worker lock */ +}; + +static inline void *qedi_get_task_mem(struct qed_iscsi_tid *info, u32 tid) +{ + return (info->blocks[tid / info->num_tids_per_block] + + (tid % info->num_tids_per_block) * info->size); +} + +#define QEDI_U64_HI(val) ((u32)(((u64)(val)) >> 32)) +#define QEDI_U64_LO(val) ((u32)(((u64)(val)) & 0xffffffff)) + +#endif /* _QEDI_H_ */ diff --git a/drivers/scsi/qedi/qedi_dbg.c b/drivers/scsi/qedi/qedi_dbg.c new file mode 100644 index 0000000..8fd28b0 --- /dev/null +++ b/drivers/scsi/qedi/qedi_dbg.c @@ -0,0 +1,146 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#include "qedi_dbg.h" +#include + +void +qedi_dbg_err(struct qedi_dbg_ctx *qedi, const char *func, u32 line, + const char *fmt, ...) 
+{ + va_list va; + struct va_format vaf; + char nfunc[32]; + + memset(nfunc, 0, sizeof(nfunc)); + memcpy(nfunc, func, sizeof(nfunc) - 1); + + va_start(va, fmt); + + vaf.fmt = fmt; + vaf.va = &va; + + if (likely(qedi) && likely(qedi->pdev)) + pr_err("[%s]:[%s:%d]:%d: %pV", dev_name(&qedi->pdev->dev), + nfunc, line, qedi->host_no, &vaf); + else + pr_err("[0000:00:00.0]:[%s:%d]: %pV", nfunc, line, &vaf); + + va_end(va); +} + +void +qedi_dbg_warn(struct qedi_dbg_ctx *qedi, const char *func, u32 line, + const char *fmt, ...) +{ + va_list va; + struct va_format vaf; + char nfunc[32]; + + memset(nfunc, 0, sizeof(nfunc)); + memcpy(nfunc, func, sizeof(nfunc) - 1); + + va_start(va, fmt); + + vaf.fmt = fmt; + vaf.va = &va; + + if (!(qedi_dbg_log & QEDI_LOG_WARN)) + goto ret; + + if (likely(qedi) && likely(qedi->pdev)) + pr_warn("[%s]:[%s:%d]:%d: %pV", dev_name(&qedi->pdev->dev), + nfunc, line, qedi->host_no, &vaf); + else + pr_warn("[0000:00:00.0]:[%s:%d]: %pV", nfunc, line, &vaf); + +ret: + va_end(va); +} + +void +qedi_dbg_notice(struct qedi_dbg_ctx *qedi, const char *func, u32 line, + const char *fmt, ...) +{ + va_list va; + struct va_format vaf; + char nfunc[32]; + + memset(nfunc, 0, sizeof(nfunc)); + memcpy(nfunc, func, sizeof(nfunc) - 1); + + va_start(va, fmt); + + vaf.fmt = fmt; + vaf.va = &va; + + if (!(qedi_dbg_log & QEDI_LOG_NOTICE)) + goto ret; + + if (likely(qedi) && likely(qedi->pdev)) + pr_notice("[%s]:[%s:%d]:%d: %pV", + dev_name(&qedi->pdev->dev), nfunc, line, + qedi->host_no, &vaf); + else + pr_notice("[0000:00:00.0]:[%s:%d]: %pV", nfunc, line, &vaf); + +ret: + va_end(va); +} + +void +qedi_dbg_info(struct qedi_dbg_ctx *qedi, const char *func, u32 line, + u32 level, const char *fmt, ...) +{ + va_list va; + struct va_format vaf; + char nfunc[32]; + + memset(nfunc, 0, sizeof(nfunc)); + memcpy(nfunc, func, sizeof(nfunc) - 1); + + va_start(va, fmt); + + vaf.fmt = fmt; + vaf.va = &va; + + if (!(qedi_dbg_log & level)) + goto ret; + + if (likely(qedi) && likely(qedi->pdev)) + pr_info("[%s]:[%s:%d]:%d: %pV", dev_name(&qedi->pdev->dev), + nfunc, line, qedi->host_no, &vaf); + else + pr_info("[0000:00:00.0]:[%s:%d]: %pV", nfunc, line, &vaf); + +ret: + va_end(va); +} + +int +qedi_create_sysfs_attr(struct Scsi_Host *shost, struct sysfs_bin_attrs *iter) +{ + int ret = 0; + + for (; iter->name; iter++) { + ret = sysfs_create_bin_file(&shost->shost_gendev.kobj, + iter->attr); + if (ret) + pr_err("Unable to create sysfs %s attr, err(%d).\n", + iter->name, ret); + } + return ret; +} + +void +qedi_remove_sysfs_attr(struct Scsi_Host *shost, struct sysfs_bin_attrs *iter) +{ + for (; iter->name; iter++) + sysfs_remove_bin_file(&shost->shost_gendev.kobj, iter->attr); +} diff --git a/drivers/scsi/qedi/qedi_dbg.h b/drivers/scsi/qedi/qedi_dbg.h new file mode 100644 index 0000000..0bc9c31 --- /dev/null +++ b/drivers/scsi/qedi/qedi_dbg.h @@ -0,0 +1,142 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
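/*
 * Illustrative sketch, not part of this patch.  qedi_get_task_mem(), defined
 * earlier in qedi.h, resolves a task id to its firmware context with a
 * two-level lookup: pick the block that holds the tid, then index within the
 * block.  The same arithmetic in standalone form:
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct demo_tid_info {
        uint8_t *blocks[4];             /* virtual address of each block */
        size_t num_tids_per_block;
        size_t size;                    /* bytes of context per task */
};

static void *demo_get_task_mem(const struct demo_tid_info *info, uint32_t tid)
{
        return info->blocks[tid / info->num_tids_per_block] +
               (tid % info->num_tids_per_block) * info->size;
}

int main(void)
{
        static uint8_t block0[64], block1[64];
        const struct demo_tid_info info = {
                .blocks = { block0, block1 },
                .num_tids_per_block = 4,
                .size = 16,
        };

        assert(demo_get_task_mem(&info, 0) == block0);
        assert(demo_get_task_mem(&info, 5) == block1 + 16);
        return 0;
}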
+ */ + +#ifndef _QEDI_DBG_H_ +#define _QEDI_DBG_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __PREVENT_QED_HSI__ +#include +#include + +extern uint qedi_dbg_log; + +/* Debug print level definitions */ +#define QEDI_LOG_DEFAULT 0x1 /* Set default logging mask */ +#define QEDI_LOG_INFO 0x2 /* Informational logs, + * MAC address, WWPN, WWNN + */ +#define QEDI_LOG_DISC 0x4 /* Init, discovery, rport */ +#define QEDI_LOG_LL2 0x8 /* LL2, VLAN logs */ +#define QEDI_LOG_CONN 0x10 /* Connection setup, cleanup */ +#define QEDI_LOG_EVT 0x20 /* Events, link, mtu */ +#define QEDI_LOG_TIMER 0x40 /* Timer events */ +#define QEDI_LOG_MP_REQ 0x80 /* Middle Path (MP) logs */ +#define QEDI_LOG_SCSI_TM 0x100 /* SCSI Aborts, Task Mgmt */ +#define QEDI_LOG_UNSOL 0x200 /* unsolicited event logs */ +#define QEDI_LOG_IO 0x400 /* scsi cmd, completion */ +#define QEDI_LOG_MQ 0x800 /* Multi Queue logs */ +#define QEDI_LOG_BSG 0x1000 /* BSG logs */ +#define QEDI_LOG_DEBUGFS 0x2000 /* debugFS logs */ +#define QEDI_LOG_LPORT 0x4000 /* lport logs */ +#define QEDI_LOG_ELS 0x8000 /* ELS logs */ +#define QEDI_LOG_NPIV 0x10000 /* NPIV logs */ +#define QEDI_LOG_SESS 0x20000 /* Conection setup, cleanup */ +#define QEDI_LOG_UIO 0x40000 /* iSCSI UIO logs */ +#define QEDI_LOG_TID 0x80000 /* FW TID context acquire, + * free + */ +#define QEDI_TRACK_TID 0x100000 /* Track TID state. To be + * enabled only at module load + * and not run-time. + */ +#define QEDI_TRACK_CMD_LIST 0x300000 /* Track active cmd list nodes, + * done with reference to TID, + * hence TRACK_TID also enabled. + */ +#define QEDI_LOG_NOTICE 0x40000000 /* Notice logs */ +#define QEDI_LOG_WARN 0x80000000 /* Warning logs */ + +/* Debug context structure */ +struct qedi_dbg_ctx { + unsigned int host_no; + struct pci_dev *pdev; +#ifdef CONFIG_DEBUG_FS + struct dentry *bdf_dentry; +#endif +}; + +#define QEDI_ERR(pdev, fmt, ...) \ + qedi_dbg_err(pdev, __func__, __LINE__, fmt, ## __VA_ARGS__) +#define QEDI_WARN(pdev, fmt, ...) \ + qedi_dbg_warn(pdev, __func__, __LINE__, fmt, ## __VA_ARGS__) +#define QEDI_NOTICE(pdev, fmt, ...) \ + qedi_dbg_notice(pdev, __func__, __LINE__, fmt, ## __VA_ARGS__) +#define QEDI_INFO(pdev, level, fmt, ...) 
\ + qedi_dbg_info(pdev, __func__, __LINE__, level, fmt, \ + ## __VA_ARGS__) + +void qedi_dbg_err(struct qedi_dbg_ctx *qedi, const char *func, u32 line, + const char *fmt, ...); +void qedi_dbg_warn(struct qedi_dbg_ctx *qedi, const char *func, u32 line, + const char *fmt, ...); +void qedi_dbg_notice(struct qedi_dbg_ctx *qedi, const char *func, u32 line, + const char *fmt, ...); +void qedi_dbg_info(struct qedi_dbg_ctx *qedi, const char *func, u32 line, + u32 info, const char *fmt, ...); + +struct Scsi_Host; + +struct sysfs_bin_attrs { + char *name; + struct bin_attribute *attr; +}; + +int qedi_create_sysfs_attr(struct Scsi_Host *shost, + struct sysfs_bin_attrs *iter); +void qedi_remove_sysfs_attr(struct Scsi_Host *shost, + struct sysfs_bin_attrs *iter); + +/* DebugFS related code */ +struct qedi_list_of_funcs { + char *oper_str; + ssize_t (*oper_func)(struct qedi_dbg_ctx *qedi); +}; + +struct qedi_debugfs_ops { + char *name; + struct qedi_list_of_funcs *qedi_funcs; +}; + +#define qedi_dbg_fileops(drv, ops) \ +{ \ + .owner = THIS_MODULE, \ + .open = simple_open, \ + .read = drv##_dbg_##ops##_cmd_read, \ + .write = drv##_dbg_##ops##_cmd_write \ +} + +/* Used for debugfs sequential files */ +#define qedi_dbg_fileops_seq(drv, ops) \ +{ \ + .owner = THIS_MODULE, \ + .open = drv##_dbg_##ops##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +void qedi_dbg_host_init(struct qedi_dbg_ctx *qedi, + const struct qedi_debugfs_ops *dops, + const struct file_operations *fops); +void qedi_dbg_host_exit(struct qedi_dbg_ctx *qedi); +void qedi_dbg_init(char *drv_name); +void qedi_dbg_exit(void); + +#endif /* _QEDI_DBG_H_ */ diff --git a/drivers/scsi/qedi/qedi_debugfs.c b/drivers/scsi/qedi/qedi_debugfs.c new file mode 100644 index 0000000..fd914ca --- /dev/null +++ b/drivers/scsi/qedi/qedi_debugfs.c @@ -0,0 +1,244 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
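+ *
+ * debugfs support: creates a per-host directory exposing the gbl_ctx,
+ * do_not_recover and io_trace entries.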
+ */ + +#include "qedi.h" +#include "qedi_dbg.h" + +#include +#include +#include + +int qedi_do_not_recover; +static struct dentry *qedi_dbg_root; + +void +qedi_dbg_host_init(struct qedi_dbg_ctx *qedi, + const struct qedi_debugfs_ops *dops, + const struct file_operations *fops) +{ + char host_dirname[32]; + struct dentry *file_dentry = NULL; + + sprintf(host_dirname, "host%u", qedi->host_no); + qedi->bdf_dentry = debugfs_create_dir(host_dirname, qedi_dbg_root); + if (!qedi->bdf_dentry) + return; + + while (dops) { + if (!(dops->name)) + break; + + file_dentry = debugfs_create_file(dops->name, 0600, + qedi->bdf_dentry, qedi, + fops); + if (!file_dentry) { + QEDI_INFO(qedi, QEDI_LOG_DEBUGFS, + "Debugfs entry %s creation failed\n", + dops->name); + debugfs_remove_recursive(qedi->bdf_dentry); + return; + } + dops++; + fops++; + } +} + +void +qedi_dbg_host_exit(struct qedi_dbg_ctx *qedi) +{ + debugfs_remove_recursive(qedi->bdf_dentry); + qedi->bdf_dentry = NULL; +} + +void +qedi_dbg_init(char *drv_name) +{ + qedi_dbg_root = debugfs_create_dir(drv_name, NULL); + if (!qedi_dbg_root) + QEDI_INFO(NULL, QEDI_LOG_DEBUGFS, "Init of debugfs failed\n"); +} + +void +qedi_dbg_exit(void) +{ + debugfs_remove_recursive(qedi_dbg_root); + qedi_dbg_root = NULL; +} + +static ssize_t +qedi_dbg_do_not_recover_enable(struct qedi_dbg_ctx *qedi_dbg) +{ + if (!qedi_do_not_recover) + qedi_do_not_recover = 1; + + QEDI_INFO(qedi_dbg, QEDI_LOG_DEBUGFS, "do_not_recover=%d\n", + qedi_do_not_recover); + return 0; +} + +static ssize_t +qedi_dbg_do_not_recover_disable(struct qedi_dbg_ctx *qedi_dbg) +{ + if (qedi_do_not_recover) + qedi_do_not_recover = 0; + + QEDI_INFO(qedi_dbg, QEDI_LOG_DEBUGFS, "do_not_recover=%d\n", + qedi_do_not_recover); + return 0; +} + +static struct qedi_list_of_funcs qedi_dbg_do_not_recover_ops[] = { + { "enable", qedi_dbg_do_not_recover_enable }, + { "disable", qedi_dbg_do_not_recover_disable }, + { NULL, NULL } +}; + +const struct qedi_debugfs_ops qedi_debugfs_ops[] = { + { "gbl_ctx", NULL }, + { "do_not_recover", qedi_dbg_do_not_recover_ops}, + { "io_trace", NULL }, + { NULL, NULL } +}; + +static ssize_t +qedi_dbg_do_not_recover_cmd_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + size_t cnt = 0; + struct qedi_dbg_ctx *qedi_dbg = + (struct qedi_dbg_ctx *)filp->private_data; + struct qedi_list_of_funcs *lof = qedi_dbg_do_not_recover_ops; + + if (*ppos) + return 0; + + while (lof) { + if (!(lof->oper_str)) + break; + + if (!strncmp(lof->oper_str, buffer, strlen(lof->oper_str))) { + cnt = lof->oper_func(qedi_dbg); + break; + } + + lof++; + } + return (count - cnt); +} + +static ssize_t +qedi_dbg_do_not_recover_cmd_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + size_t cnt = 0; + + if (*ppos) + return 0; + + cnt = sprintf(buffer, "do_not_recover=%d\n", qedi_do_not_recover); + cnt = min_t(int, count, cnt - *ppos); + *ppos += cnt; + return cnt; +} + +static int +qedi_gbl_ctx_show(struct seq_file *s, void *unused) +{ + struct qedi_fastpath *fp = NULL; + struct qed_sb_info *sb_info = NULL; + struct status_block_e4 *sb = NULL; + struct global_queue *que = NULL; + int id; + u16 prod_idx; + struct qedi_ctx *qedi = s->private; + unsigned long flags; + + seq_puts(s, " DUMP CQ CONTEXT:\n"); + + for (id = 0; id < MIN_NUM_CPUS_MSIX(qedi); id++) { + spin_lock_irqsave(&qedi->hba_lock, flags); + seq_printf(s, "=========FAST CQ PATH [%d] ==========\n", id); + fp = &qedi->fp_array[id]; + sb_info = fp->sb_info; + sb = sb_info->sb_virt; + prod_idx = 
(sb->pi_array[QEDI_PROTO_CQ_PROD_IDX] & + STATUS_BLOCK_E4_PROD_INDEX_MASK); + seq_printf(s, "SB PROD IDX: %d\n", prod_idx); + que = qedi->global_queues[fp->sb_id]; + seq_printf(s, "DRV CONS IDX: %d\n", que->cq_cons_idx); + seq_printf(s, "CQ complete host memory: %d\n", fp->sb_id); + seq_puts(s, "=========== END ==================\n\n\n"); + spin_unlock_irqrestore(&qedi->hba_lock, flags); + } + return 0; +} + +static int +qedi_dbg_gbl_ctx_open(struct inode *inode, struct file *file) +{ + struct qedi_dbg_ctx *qedi_dbg = inode->i_private; + struct qedi_ctx *qedi = container_of(qedi_dbg, struct qedi_ctx, + dbg_ctx); + + return single_open(file, qedi_gbl_ctx_show, qedi); +} + +static int +qedi_io_trace_show(struct seq_file *s, void *unused) +{ + int id, idx = 0; + struct qedi_ctx *qedi = s->private; + struct qedi_io_log *io_log; + unsigned long flags; + + seq_puts(s, " DUMP IO LOGS:\n"); + spin_lock_irqsave(&qedi->io_trace_lock, flags); + idx = qedi->io_trace_idx; + for (id = 0; id < QEDI_IO_TRACE_SIZE; id++) { + io_log = &qedi->io_trace_buf[idx]; + seq_printf(s, "iodir-%d:", io_log->direction); + seq_printf(s, "tid-0x%x:", io_log->task_id); + seq_printf(s, "cid-0x%x:", io_log->cid); + seq_printf(s, "lun-%d:", io_log->lun); + seq_printf(s, "op-0x%02x:", io_log->op); + seq_printf(s, "0x%02x%02x%02x%02x:", io_log->lba[0], + io_log->lba[1], io_log->lba[2], io_log->lba[3]); + seq_printf(s, "buflen-%d:", io_log->bufflen); + seq_printf(s, "sgcnt-%d:", io_log->sg_count); + seq_printf(s, "res-0x%08x:", io_log->result); + seq_printf(s, "jif-%lu:", io_log->jiffies); + seq_printf(s, "blk_req_cpu-%d:", io_log->blk_req_cpu); + seq_printf(s, "req_cpu-%d:", io_log->req_cpu); + seq_printf(s, "intr_cpu-%d:", io_log->intr_cpu); + seq_printf(s, "blk_rsp_cpu-%d\n", io_log->blk_rsp_cpu); + + idx++; + if (idx == QEDI_IO_TRACE_SIZE) + idx = 0; + } + spin_unlock_irqrestore(&qedi->io_trace_lock, flags); + return 0; +} + +static int +qedi_dbg_io_trace_open(struct inode *inode, struct file *file) +{ + struct qedi_dbg_ctx *qedi_dbg = inode->i_private; + struct qedi_ctx *qedi = container_of(qedi_dbg, struct qedi_ctx, + dbg_ctx); + + return single_open(file, qedi_io_trace_show, qedi); +} + +const struct file_operations qedi_dbg_fops[] = { + qedi_dbg_fileops_seq(qedi, gbl_ctx), + qedi_dbg_fileops(qedi, do_not_recover), + qedi_dbg_fileops_seq(qedi, io_trace), + { }, +}; diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c new file mode 100644 index 0000000..25d763a --- /dev/null +++ b/drivers/scsi/qedi/qedi_fw.c @@ -0,0 +1,2233 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
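+ *
+ * Fast-path handling: completion queue (CQE) processing and submission of
+ * iSCSI login, logout, TMF, text and NOP-Out tasks to the firmware.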
+ */ + +#include +#include +#include + +#include "qedi.h" +#include "qedi_iscsi.h" +#include "qedi_gbl.h" +#include "qedi_fw_iscsi.h" +#include "qedi_fw_scsi.h" + +static int qedi_send_iscsi_tmf(struct qedi_conn *qedi_conn, + struct iscsi_task *mtask); + +void qedi_iscsi_unmap_sg_list(struct qedi_cmd *cmd) +{ + struct scsi_cmnd *sc = cmd->scsi_cmd; + + if (cmd->io_tbl.sge_valid && sc) { + cmd->io_tbl.sge_valid = 0; + scsi_dma_unmap(sc); + } +} + +static void qedi_process_logout_resp(struct qedi_ctx *qedi, + union iscsi_cqe *cqe, + struct iscsi_task *task, + struct qedi_conn *qedi_conn) +{ + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_logout_rsp *resp_hdr; + struct iscsi_session *session = conn->session; + struct iscsi_logout_response_hdr *cqe_logout_response; + struct qedi_cmd *cmd; + + cmd = (struct qedi_cmd *)task->dd_data; + cqe_logout_response = &cqe->cqe_common.iscsi_hdr.logout_response; + spin_lock(&session->back_lock); + resp_hdr = (struct iscsi_logout_rsp *)&qedi_conn->gen_pdu.resp_hdr; + memset(resp_hdr, 0, sizeof(struct iscsi_hdr)); + resp_hdr->opcode = cqe_logout_response->opcode; + resp_hdr->flags = cqe_logout_response->flags; + resp_hdr->hlength = 0; + + resp_hdr->itt = build_itt(cqe->cqe_solicited.itid, conn->session->age); + resp_hdr->statsn = cpu_to_be32(cqe_logout_response->stat_sn); + resp_hdr->exp_cmdsn = cpu_to_be32(cqe_logout_response->exp_cmd_sn); + resp_hdr->max_cmdsn = cpu_to_be32(cqe_logout_response->max_cmd_sn); + + resp_hdr->t2wait = cpu_to_be32(cqe_logout_response->time_2_wait); + resp_hdr->t2retain = cpu_to_be32(cqe_logout_response->time_2_retain); + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_TID, + "Freeing tid=0x%x for cid=0x%x\n", + cmd->task_id, qedi_conn->iscsi_conn_id); + + if (likely(cmd->io_cmd_in_list)) { + cmd->io_cmd_in_list = false; + list_del_init(&cmd->io_cmd); + qedi_conn->active_cmd_count--; + } else { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Active cmd list node already deleted, tid=0x%x, cid=0x%x, io_cmd_node=%p\n", + cmd->task_id, qedi_conn->iscsi_conn_id, + &cmd->io_cmd); + } + + cmd->state = RESPONSE_RECEIVED; + qedi_clear_task_idx(qedi, cmd->task_id); + __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr, NULL, 0); + + spin_unlock(&session->back_lock); +} + +static void qedi_process_text_resp(struct qedi_ctx *qedi, + union iscsi_cqe *cqe, + struct iscsi_task *task, + struct qedi_conn *qedi_conn) +{ + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_session *session = conn->session; + struct e4_iscsi_task_context *task_ctx; + struct iscsi_text_rsp *resp_hdr_ptr; + struct iscsi_text_response_hdr *cqe_text_response; + struct qedi_cmd *cmd; + int pld_len; + + cmd = (struct qedi_cmd *)task->dd_data; + task_ctx = qedi_get_task_mem(&qedi->tasks, cmd->task_id); + + cqe_text_response = &cqe->cqe_common.iscsi_hdr.text_response; + spin_lock(&session->back_lock); + resp_hdr_ptr = (struct iscsi_text_rsp *)&qedi_conn->gen_pdu.resp_hdr; + memset(resp_hdr_ptr, 0, sizeof(struct iscsi_hdr)); + resp_hdr_ptr->opcode = cqe_text_response->opcode; + resp_hdr_ptr->flags = cqe_text_response->flags; + resp_hdr_ptr->hlength = 0; + + hton24(resp_hdr_ptr->dlength, + (cqe_text_response->hdr_second_dword & + ISCSI_TEXT_RESPONSE_HDR_DATA_SEG_LEN_MASK)); + + resp_hdr_ptr->itt = build_itt(cqe->cqe_solicited.itid, + conn->session->age); + resp_hdr_ptr->ttt = cqe_text_response->ttt; + resp_hdr_ptr->statsn = cpu_to_be32(cqe_text_response->stat_sn); + resp_hdr_ptr->exp_cmdsn = cpu_to_be32(cqe_text_response->exp_cmd_sn); + 
resp_hdr_ptr->max_cmdsn = cpu_to_be32(cqe_text_response->max_cmd_sn); + + pld_len = cqe_text_response->hdr_second_dword & + ISCSI_TEXT_RESPONSE_HDR_DATA_SEG_LEN_MASK; + qedi_conn->gen_pdu.resp_wr_ptr = qedi_conn->gen_pdu.resp_buf + pld_len; + + memset(task_ctx, '\0', sizeof(*task_ctx)); + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_TID, + "Freeing tid=0x%x for cid=0x%x\n", + cmd->task_id, qedi_conn->iscsi_conn_id); + + if (likely(cmd->io_cmd_in_list)) { + cmd->io_cmd_in_list = false; + list_del_init(&cmd->io_cmd); + qedi_conn->active_cmd_count--; + } else { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Active cmd list node already deleted, tid=0x%x, cid=0x%x, io_cmd_node=%p\n", + cmd->task_id, qedi_conn->iscsi_conn_id, + &cmd->io_cmd); + } + + cmd->state = RESPONSE_RECEIVED; + qedi_clear_task_idx(qedi, cmd->task_id); + + __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr_ptr, + qedi_conn->gen_pdu.resp_buf, + (qedi_conn->gen_pdu.resp_wr_ptr - + qedi_conn->gen_pdu.resp_buf)); + spin_unlock(&session->back_lock); +} + +static void qedi_tmf_resp_work(struct work_struct *work) +{ + struct qedi_cmd *qedi_cmd = + container_of(work, struct qedi_cmd, tmf_work); + struct qedi_conn *qedi_conn = qedi_cmd->conn; + struct qedi_ctx *qedi = qedi_conn->qedi; + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_session *session = conn->session; + struct iscsi_tm_rsp *resp_hdr_ptr; + struct iscsi_cls_session *cls_sess; + int rval = 0; + + set_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); + resp_hdr_ptr = (struct iscsi_tm_rsp *)qedi_cmd->tmf_resp_buf; + cls_sess = iscsi_conn_to_session(qedi_conn->cls_conn); + + iscsi_block_session(session->cls_session); + rval = qedi_cleanup_all_io(qedi, qedi_conn, qedi_cmd->task, true); + if (rval) { + qedi_clear_task_idx(qedi, qedi_cmd->task_id); + iscsi_unblock_session(session->cls_session); + goto exit_tmf_resp; + } + + iscsi_unblock_session(session->cls_session); + qedi_clear_task_idx(qedi, qedi_cmd->task_id); + + spin_lock(&session->back_lock); + __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr_ptr, NULL, 0); + spin_unlock(&session->back_lock); + +exit_tmf_resp: + kfree(resp_hdr_ptr); + clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); +} + +static void qedi_process_tmf_resp(struct qedi_ctx *qedi, + union iscsi_cqe *cqe, + struct iscsi_task *task, + struct qedi_conn *qedi_conn) + +{ + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_session *session = conn->session; + struct iscsi_tmf_response_hdr *cqe_tmp_response; + struct iscsi_tm_rsp *resp_hdr_ptr; + struct iscsi_tm *tmf_hdr; + struct qedi_cmd *qedi_cmd = NULL; + + cqe_tmp_response = &cqe->cqe_common.iscsi_hdr.tmf_response; + + qedi_cmd = task->dd_data; + qedi_cmd->tmf_resp_buf = kzalloc(sizeof(*resp_hdr_ptr), GFP_ATOMIC); + if (!qedi_cmd->tmf_resp_buf) { + QEDI_ERR(&qedi->dbg_ctx, + "Failed to allocate resp buf, cid=0x%x\n", + qedi_conn->iscsi_conn_id); + return; + } + + spin_lock(&session->back_lock); + resp_hdr_ptr = (struct iscsi_tm_rsp *)qedi_cmd->tmf_resp_buf; + memset(resp_hdr_ptr, 0, sizeof(struct iscsi_tm_rsp)); + + /* Fill up the header */ + resp_hdr_ptr->opcode = cqe_tmp_response->opcode; + resp_hdr_ptr->flags = cqe_tmp_response->hdr_flags; + resp_hdr_ptr->response = cqe_tmp_response->hdr_response; + resp_hdr_ptr->hlength = 0; + + hton24(resp_hdr_ptr->dlength, + (cqe_tmp_response->hdr_second_dword & + ISCSI_TMF_RESPONSE_HDR_DATA_SEG_LEN_MASK)); + resp_hdr_ptr->itt = build_itt(cqe->cqe_solicited.itid, + conn->session->age); + resp_hdr_ptr->statsn = 
cpu_to_be32(cqe_tmp_response->stat_sn); + resp_hdr_ptr->exp_cmdsn = cpu_to_be32(cqe_tmp_response->exp_cmd_sn); + resp_hdr_ptr->max_cmdsn = cpu_to_be32(cqe_tmp_response->max_cmd_sn); + + tmf_hdr = (struct iscsi_tm *)qedi_cmd->task->hdr; + + if (likely(qedi_cmd->io_cmd_in_list)) { + qedi_cmd->io_cmd_in_list = false; + list_del_init(&qedi_cmd->io_cmd); + qedi_conn->active_cmd_count--; + } + + if (((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == + ISCSI_TM_FUNC_LOGICAL_UNIT_RESET) || + ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == + ISCSI_TM_FUNC_TARGET_WARM_RESET) || + ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == + ISCSI_TM_FUNC_TARGET_COLD_RESET)) { + INIT_WORK(&qedi_cmd->tmf_work, qedi_tmf_resp_work); + queue_work(qedi->tmf_thread, &qedi_cmd->tmf_work); + goto unblock_sess; + } + + qedi_clear_task_idx(qedi, qedi_cmd->task_id); + + __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr_ptr, NULL, 0); + kfree(resp_hdr_ptr); + +unblock_sess: + spin_unlock(&session->back_lock); +} + +static void qedi_process_login_resp(struct qedi_ctx *qedi, + union iscsi_cqe *cqe, + struct iscsi_task *task, + struct qedi_conn *qedi_conn) +{ + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_session *session = conn->session; + struct e4_iscsi_task_context *task_ctx; + struct iscsi_login_rsp *resp_hdr_ptr; + struct iscsi_login_response_hdr *cqe_login_response; + struct qedi_cmd *cmd; + int pld_len; + + cmd = (struct qedi_cmd *)task->dd_data; + + cqe_login_response = &cqe->cqe_common.iscsi_hdr.login_response; + task_ctx = qedi_get_task_mem(&qedi->tasks, cmd->task_id); + + spin_lock(&session->back_lock); + resp_hdr_ptr = (struct iscsi_login_rsp *)&qedi_conn->gen_pdu.resp_hdr; + memset(resp_hdr_ptr, 0, sizeof(struct iscsi_login_rsp)); + resp_hdr_ptr->opcode = cqe_login_response->opcode; + resp_hdr_ptr->flags = cqe_login_response->flags_attr; + resp_hdr_ptr->hlength = 0; + + hton24(resp_hdr_ptr->dlength, + (cqe_login_response->hdr_second_dword & + ISCSI_LOGIN_RESPONSE_HDR_DATA_SEG_LEN_MASK)); + resp_hdr_ptr->itt = build_itt(cqe->cqe_solicited.itid, + conn->session->age); + resp_hdr_ptr->tsih = cqe_login_response->tsih; + resp_hdr_ptr->statsn = cpu_to_be32(cqe_login_response->stat_sn); + resp_hdr_ptr->exp_cmdsn = cpu_to_be32(cqe_login_response->exp_cmd_sn); + resp_hdr_ptr->max_cmdsn = cpu_to_be32(cqe_login_response->max_cmd_sn); + resp_hdr_ptr->status_class = cqe_login_response->status_class; + resp_hdr_ptr->status_detail = cqe_login_response->status_detail; + pld_len = cqe_login_response->hdr_second_dword & + ISCSI_LOGIN_RESPONSE_HDR_DATA_SEG_LEN_MASK; + qedi_conn->gen_pdu.resp_wr_ptr = qedi_conn->gen_pdu.resp_buf + pld_len; + + if (likely(cmd->io_cmd_in_list)) { + cmd->io_cmd_in_list = false; + list_del_init(&cmd->io_cmd); + qedi_conn->active_cmd_count--; + } + + memset(task_ctx, '\0', sizeof(*task_ctx)); + + __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr_ptr, + qedi_conn->gen_pdu.resp_buf, + (qedi_conn->gen_pdu.resp_wr_ptr - + qedi_conn->gen_pdu.resp_buf)); + + spin_unlock(&session->back_lock); + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_TID, + "Freeing tid=0x%x for cid=0x%x\n", + cmd->task_id, qedi_conn->iscsi_conn_id); + cmd->state = RESPONSE_RECEIVED; + qedi_clear_task_idx(qedi, cmd->task_id); +} + +static void qedi_get_rq_bdq_buf(struct qedi_ctx *qedi, + struct iscsi_cqe_unsolicited *cqe, + char *ptr, int len) +{ + u16 idx = 0; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "pld_len [%d], bdq_prod_idx [%d], idx [%d]\n", + len, qedi->bdq_prod_idx, + (qedi->bdq_prod_idx % 
qedi->rq_num_entries)); + + /* Obtain buffer address from rqe_opaque */ + idx = cqe->rqe_opaque; + if (idx > (QEDI_BDQ_NUM - 1)) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "wrong idx %d returned by FW, dropping the unsolicited pkt\n", + idx); + return; + } + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "rqe_opaque [0x%p], idx [%d]\n", cqe->rqe_opaque, idx); + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "unsol_cqe_type = %d\n", cqe->unsol_cqe_type); + switch (cqe->unsol_cqe_type) { + case ISCSI_CQE_UNSOLICITED_SINGLE: + case ISCSI_CQE_UNSOLICITED_FIRST: + if (len) + memcpy(ptr, (void *)qedi->bdq[idx].buf_addr, len); + break; + case ISCSI_CQE_UNSOLICITED_MIDDLE: + case ISCSI_CQE_UNSOLICITED_LAST: + break; + default: + break; + } +} + +static void qedi_put_rq_bdq_buf(struct qedi_ctx *qedi, + struct iscsi_cqe_unsolicited *cqe, + int count) +{ + u16 tmp; + u16 idx = 0; + struct scsi_bd *pbl; + + /* Obtain buffer address from rqe_opaque */ + idx = cqe->rqe_opaque; + if (idx > (QEDI_BDQ_NUM - 1)) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "wrong idx %d returned by FW, dropping the unsolicited pkt\n", + idx); + return; + } + + pbl = (struct scsi_bd *)qedi->bdq_pbl; + pbl += (qedi->bdq_prod_idx % qedi->rq_num_entries); + pbl->address.hi = cpu_to_le32(QEDI_U64_HI(qedi->bdq[idx].buf_dma)); + pbl->address.lo = cpu_to_le32(QEDI_U64_LO(qedi->bdq[idx].buf_dma)); + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "pbl [0x%p] pbl->address hi [0x%llx] lo [0x%llx] idx [%d]\n", + pbl, pbl->address.hi, pbl->address.lo, idx); + pbl->opaque.iscsi_opaque.reserved_zero[0] = 0; + pbl->opaque.iscsi_opaque.reserved_zero[1] = 0; + pbl->opaque.iscsi_opaque.reserved_zero[2] = 0; + pbl->opaque.iscsi_opaque.opaque = cpu_to_le32(idx); + + /* Increment producer to let f/w know we've handled the frame */ + qedi->bdq_prod_idx += count; + + writew(qedi->bdq_prod_idx, qedi->bdq_primary_prod); + tmp = readw(qedi->bdq_primary_prod); + + writew(qedi->bdq_prod_idx, qedi->bdq_secondary_prod); + tmp = readw(qedi->bdq_secondary_prod); +} + +static void qedi_unsol_pdu_adjust_bdq(struct qedi_ctx *qedi, + struct iscsi_cqe_unsolicited *cqe, + u32 pdu_len, u32 num_bdqs, + char *bdq_data) +{ + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "num_bdqs [%d]\n", num_bdqs); + + qedi_get_rq_bdq_buf(qedi, cqe, bdq_data, pdu_len); + qedi_put_rq_bdq_buf(qedi, cqe, (num_bdqs + 1)); +} + +static int qedi_process_nopin_mesg(struct qedi_ctx *qedi, + union iscsi_cqe *cqe, + struct iscsi_task *task, + struct qedi_conn *qedi_conn, u16 que_idx) +{ + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_session *session = conn->session; + struct iscsi_nop_in_hdr *cqe_nop_in; + struct iscsi_nopin *hdr; + struct qedi_cmd *cmd; + int tgt_async_nop = 0; + u32 lun[2]; + u32 pdu_len, num_bdqs; + char bdq_data[QEDI_BDQ_BUF_SIZE]; + unsigned long flags; + + spin_lock_bh(&session->back_lock); + cqe_nop_in = &cqe->cqe_common.iscsi_hdr.nop_in; + + pdu_len = cqe_nop_in->hdr_second_dword & + ISCSI_NOP_IN_HDR_DATA_SEG_LEN_MASK; + num_bdqs = pdu_len / QEDI_BDQ_BUF_SIZE; + + hdr = (struct iscsi_nopin *)&qedi_conn->gen_pdu.resp_hdr; + memset(hdr, 0, sizeof(struct iscsi_hdr)); + hdr->opcode = cqe_nop_in->opcode; + hdr->max_cmdsn = cpu_to_be32(cqe_nop_in->max_cmd_sn); + hdr->exp_cmdsn = cpu_to_be32(cqe_nop_in->exp_cmd_sn); + hdr->statsn = cpu_to_be32(cqe_nop_in->stat_sn); + hdr->ttt = cpu_to_be32(cqe_nop_in->ttt); + + if (cqe->cqe_common.cqe_type == ISCSI_CQE_TYPE_UNSOLICITED) { + spin_lock_irqsave(&qedi->hba_lock, flags); + qedi_unsol_pdu_adjust_bdq(qedi, 
&cqe->cqe_unsolicited, + pdu_len, num_bdqs, bdq_data); + hdr->itt = RESERVED_ITT; + tgt_async_nop = 1; + spin_unlock_irqrestore(&qedi->hba_lock, flags); + goto done; + } + + /* Response to one of our nop-outs */ + if (task) { + cmd = task->dd_data; + hdr->flags = ISCSI_FLAG_CMD_FINAL; + hdr->itt = build_itt(cqe->cqe_solicited.itid, + conn->session->age); + lun[0] = 0xffffffff; + lun[1] = 0xffffffff; + memcpy(&hdr->lun, lun, sizeof(struct scsi_lun)); + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_TID, + "Freeing tid=0x%x for cid=0x%x\n", + cmd->task_id, qedi_conn->iscsi_conn_id); + cmd->state = RESPONSE_RECEIVED; + spin_lock(&qedi_conn->list_lock); + if (likely(cmd->io_cmd_in_list)) { + cmd->io_cmd_in_list = false; + list_del_init(&cmd->io_cmd); + qedi_conn->active_cmd_count--; + } + + spin_unlock(&qedi_conn->list_lock); + qedi_clear_task_idx(qedi, cmd->task_id); + } + +done: + __iscsi_complete_pdu(conn, (struct iscsi_hdr *)hdr, bdq_data, pdu_len); + + spin_unlock_bh(&session->back_lock); + return tgt_async_nop; +} + +static void qedi_process_async_mesg(struct qedi_ctx *qedi, + union iscsi_cqe *cqe, + struct iscsi_task *task, + struct qedi_conn *qedi_conn, + u16 que_idx) +{ + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_session *session = conn->session; + struct iscsi_async_msg_hdr *cqe_async_msg; + struct iscsi_async *resp_hdr; + u32 lun[2]; + u32 pdu_len, num_bdqs; + char bdq_data[QEDI_BDQ_BUF_SIZE]; + unsigned long flags; + + spin_lock_bh(&session->back_lock); + + cqe_async_msg = &cqe->cqe_common.iscsi_hdr.async_msg; + pdu_len = cqe_async_msg->hdr_second_dword & + ISCSI_ASYNC_MSG_HDR_DATA_SEG_LEN_MASK; + num_bdqs = pdu_len / QEDI_BDQ_BUF_SIZE; + + if (cqe->cqe_common.cqe_type == ISCSI_CQE_TYPE_UNSOLICITED) { + spin_lock_irqsave(&qedi->hba_lock, flags); + qedi_unsol_pdu_adjust_bdq(qedi, &cqe->cqe_unsolicited, + pdu_len, num_bdqs, bdq_data); + spin_unlock_irqrestore(&qedi->hba_lock, flags); + } + + resp_hdr = (struct iscsi_async *)&qedi_conn->gen_pdu.resp_hdr; + memset(resp_hdr, 0, sizeof(struct iscsi_hdr)); + resp_hdr->opcode = cqe_async_msg->opcode; + resp_hdr->flags = 0x80; + + lun[0] = cpu_to_be32(cqe_async_msg->lun.lo); + lun[1] = cpu_to_be32(cqe_async_msg->lun.hi); + memcpy(&resp_hdr->lun, lun, sizeof(struct scsi_lun)); + resp_hdr->exp_cmdsn = cpu_to_be32(cqe_async_msg->exp_cmd_sn); + resp_hdr->max_cmdsn = cpu_to_be32(cqe_async_msg->max_cmd_sn); + resp_hdr->statsn = cpu_to_be32(cqe_async_msg->stat_sn); + + resp_hdr->async_event = cqe_async_msg->async_event; + resp_hdr->async_vcode = cqe_async_msg->async_vcode; + + resp_hdr->param1 = cpu_to_be16(cqe_async_msg->param1_rsrv); + resp_hdr->param2 = cpu_to_be16(cqe_async_msg->param2_rsrv); + resp_hdr->param3 = cpu_to_be16(cqe_async_msg->param3_rsrv); + + __iscsi_complete_pdu(conn, (struct iscsi_hdr *)resp_hdr, bdq_data, + pdu_len); + + spin_unlock_bh(&session->back_lock); +} + +static void qedi_process_reject_mesg(struct qedi_ctx *qedi, + union iscsi_cqe *cqe, + struct iscsi_task *task, + struct qedi_conn *qedi_conn, + uint16_t que_idx) +{ + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_session *session = conn->session; + struct iscsi_reject_hdr *cqe_reject; + struct iscsi_reject *hdr; + u32 pld_len, num_bdqs; + unsigned long flags; + + spin_lock_bh(&session->back_lock); + cqe_reject = &cqe->cqe_common.iscsi_hdr.reject; + pld_len = cqe_reject->hdr_second_dword & + ISCSI_REJECT_HDR_DATA_SEG_LEN_MASK; + num_bdqs = pld_len / QEDI_BDQ_BUF_SIZE; + + if (cqe->cqe_common.cqe_type == 
ISCSI_CQE_TYPE_UNSOLICITED) { + spin_lock_irqsave(&qedi->hba_lock, flags); + qedi_unsol_pdu_adjust_bdq(qedi, &cqe->cqe_unsolicited, + pld_len, num_bdqs, conn->data); + spin_unlock_irqrestore(&qedi->hba_lock, flags); + } + hdr = (struct iscsi_reject *)&qedi_conn->gen_pdu.resp_hdr; + memset(hdr, 0, sizeof(struct iscsi_hdr)); + hdr->opcode = cqe_reject->opcode; + hdr->reason = cqe_reject->hdr_reason; + hdr->flags = cqe_reject->hdr_flags; + hton24(hdr->dlength, (cqe_reject->hdr_second_dword & + ISCSI_REJECT_HDR_DATA_SEG_LEN_MASK)); + hdr->max_cmdsn = cpu_to_be32(cqe_reject->max_cmd_sn); + hdr->exp_cmdsn = cpu_to_be32(cqe_reject->exp_cmd_sn); + hdr->statsn = cpu_to_be32(cqe_reject->stat_sn); + hdr->ffffffff = cpu_to_be32(0xffffffff); + + __iscsi_complete_pdu(conn, (struct iscsi_hdr *)hdr, + conn->data, pld_len); + spin_unlock_bh(&session->back_lock); +} + +static void qedi_scsi_completion(struct qedi_ctx *qedi, + union iscsi_cqe *cqe, + struct iscsi_task *task, + struct iscsi_conn *conn) +{ + struct scsi_cmnd *sc_cmd; + struct qedi_cmd *cmd = task->dd_data; + struct iscsi_session *session = conn->session; + struct iscsi_scsi_rsp *hdr; + struct iscsi_data_in_hdr *cqe_data_in; + int datalen = 0; + struct qedi_conn *qedi_conn; + u32 iscsi_cid; + u8 cqe_err_bits = 0; + + iscsi_cid = cqe->cqe_common.conn_id; + qedi_conn = qedi->cid_que.conn_cid_tbl[iscsi_cid]; + + cqe_data_in = &cqe->cqe_common.iscsi_hdr.data_in; + cqe_err_bits = + cqe->cqe_common.error_bitmap.error_bits.cqe_error_status_bits; + + spin_lock_bh(&session->back_lock); + /* get the scsi command */ + sc_cmd = cmd->scsi_cmd; + + if (!sc_cmd) { + QEDI_WARN(&qedi->dbg_ctx, "sc_cmd is NULL!\n"); + goto error; + } + + if (!sc_cmd->SCp.ptr) { + QEDI_WARN(&qedi->dbg_ctx, + "SCp.ptr is NULL, returned in another context.\n"); + goto error; + } + + if (!sc_cmd->request) { + QEDI_WARN(&qedi->dbg_ctx, + "sc_cmd->request is NULL, sc_cmd=%p.\n", + sc_cmd); + goto error; + } + + if (!sc_cmd->request->special) { + QEDI_WARN(&qedi->dbg_ctx, + "request->special is NULL so request not valid, sc_cmd=%p.\n", + sc_cmd); + goto error; + } + + if (!sc_cmd->request->q) { + QEDI_WARN(&qedi->dbg_ctx, + "request->q is NULL so request is not valid, sc_cmd=%p.\n", + sc_cmd); + goto error; + } + + qedi_iscsi_unmap_sg_list(cmd); + + hdr = (struct iscsi_scsi_rsp *)task->hdr; + hdr->opcode = cqe_data_in->opcode; + hdr->max_cmdsn = cpu_to_be32(cqe_data_in->max_cmd_sn); + hdr->exp_cmdsn = cpu_to_be32(cqe_data_in->exp_cmd_sn); + hdr->itt = build_itt(cqe->cqe_solicited.itid, conn->session->age); + hdr->response = cqe_data_in->reserved1; + hdr->cmd_status = cqe_data_in->status_rsvd; + hdr->flags = cqe_data_in->flags; + hdr->residual_count = cpu_to_be32(cqe_data_in->residual_count); + + if (hdr->cmd_status == SAM_STAT_CHECK_CONDITION) { + datalen = cqe_data_in->reserved2 & + ISCSI_COMMON_HDR_DATA_SEG_LEN_MASK; + memcpy((char *)conn->data, (char *)cmd->sense_buffer, datalen); + } + + /* If f/w reports data underrun err then set residual to IO transfer + * length, set Underrun flag and clear Overrun flag explicitly + */ + if (unlikely(cqe_err_bits && + GET_FIELD(cqe_err_bits, CQE_ERROR_BITMAP_UNDER_RUN_ERR))) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Under flow itt=0x%x proto flags=0x%x tid=0x%x cid 0x%x fw resid 0x%x sc dlen 0x%x\n", + hdr->itt, cqe_data_in->flags, cmd->task_id, + qedi_conn->iscsi_conn_id, hdr->residual_count, + scsi_bufflen(sc_cmd)); + hdr->residual_count = cpu_to_be32(scsi_bufflen(sc_cmd)); + hdr->flags |= ISCSI_FLAG_CMD_UNDERFLOW; + hdr->flags &= 
(~ISCSI_FLAG_CMD_OVERFLOW); + } + + spin_lock(&qedi_conn->list_lock); + if (likely(cmd->io_cmd_in_list)) { + cmd->io_cmd_in_list = false; + list_del_init(&cmd->io_cmd); + qedi_conn->active_cmd_count--; + } + spin_unlock(&qedi_conn->list_lock); + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_TID, + "Freeing tid=0x%x for cid=0x%x\n", + cmd->task_id, qedi_conn->iscsi_conn_id); + cmd->state = RESPONSE_RECEIVED; + if (qedi_io_tracing) + qedi_trace_io(qedi, task, cmd->task_id, QEDI_IO_TRACE_RSP); + + qedi_clear_task_idx(qedi, cmd->task_id); + __iscsi_complete_pdu(conn, (struct iscsi_hdr *)hdr, + conn->data, datalen); +error: + spin_unlock_bh(&session->back_lock); +} + +static void qedi_mtask_completion(struct qedi_ctx *qedi, + union iscsi_cqe *cqe, + struct iscsi_task *task, + struct qedi_conn *conn, uint16_t que_idx) +{ + struct iscsi_conn *iscsi_conn; + u32 hdr_opcode; + + hdr_opcode = cqe->cqe_common.iscsi_hdr.common.hdr_first_byte; + iscsi_conn = conn->cls_conn->dd_data; + + switch (hdr_opcode) { + case ISCSI_OPCODE_SCSI_RESPONSE: + case ISCSI_OPCODE_DATA_IN: + qedi_scsi_completion(qedi, cqe, task, iscsi_conn); + break; + case ISCSI_OPCODE_LOGIN_RESPONSE: + qedi_process_login_resp(qedi, cqe, task, conn); + break; + case ISCSI_OPCODE_TMF_RESPONSE: + qedi_process_tmf_resp(qedi, cqe, task, conn); + break; + case ISCSI_OPCODE_TEXT_RESPONSE: + qedi_process_text_resp(qedi, cqe, task, conn); + break; + case ISCSI_OPCODE_LOGOUT_RESPONSE: + qedi_process_logout_resp(qedi, cqe, task, conn); + break; + case ISCSI_OPCODE_NOP_IN: + qedi_process_nopin_mesg(qedi, cqe, task, conn, que_idx); + break; + default: + QEDI_ERR(&qedi->dbg_ctx, "unknown opcode\n"); + } +} + +static void qedi_process_nopin_local_cmpl(struct qedi_ctx *qedi, + struct iscsi_cqe_solicited *cqe, + struct iscsi_task *task, + struct qedi_conn *qedi_conn) +{ + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_session *session = conn->session; + struct qedi_cmd *cmd = task->dd_data; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_UNSOL, + "itid=0x%x, cmd task id=0x%x\n", + cqe->itid, cmd->task_id); + + cmd->state = RESPONSE_RECEIVED; + qedi_clear_task_idx(qedi, cmd->task_id); + + spin_lock_bh(&session->back_lock); + __iscsi_put_task(task); + spin_unlock_bh(&session->back_lock); +} + +static void qedi_process_cmd_cleanup_resp(struct qedi_ctx *qedi, + struct iscsi_cqe_solicited *cqe, + struct iscsi_task *task, + struct iscsi_conn *conn) +{ + struct qedi_work_map *work, *work_tmp; + u32 proto_itt = cqe->itid; + u32 ptmp_itt = 0; + itt_t protoitt = 0; + int found = 0; + struct qedi_cmd *qedi_cmd = NULL; + u32 rtid = 0; + u32 iscsi_cid; + struct qedi_conn *qedi_conn; + struct qedi_cmd *dbg_cmd; + struct iscsi_task *mtask; + struct iscsi_tm *tmf_hdr = NULL; + + iscsi_cid = cqe->conn_id; + qedi_conn = qedi->cid_que.conn_cid_tbl[iscsi_cid]; + if (!qedi_conn) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "icid not found 0x%x\n", cqe->conn_id); + return; + } + + /* Based on this itt get the corresponding qedi_cmd */ + spin_lock_bh(&qedi_conn->tmf_work_lock); + list_for_each_entry_safe(work, work_tmp, &qedi_conn->tmf_work_list, + list) { + if (work->rtid == proto_itt) { + /* We found the command */ + qedi_cmd = work->qedi_cmd; + if (!qedi_cmd->list_tmf_work) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "TMF work not found, cqe->tid=0x%x, cid=0x%x\n", + proto_itt, qedi_conn->iscsi_conn_id); + WARN_ON(1); + } + found = 1; + mtask = qedi_cmd->task; + tmf_hdr = (struct iscsi_tm *)mtask->hdr; + rtid = work->rtid; + + list_del_init(&work->list); + 
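+			/* matched TMF work entry is unlinked above; release it */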
kfree(work); + qedi_cmd->list_tmf_work = NULL; + } + } + spin_unlock_bh(&qedi_conn->tmf_work_lock); + + if (found) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "TMF work, cqe->tid=0x%x, tmf flags=0x%x, cid=0x%x\n", + proto_itt, tmf_hdr->flags, qedi_conn->iscsi_conn_id); + + if ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == + ISCSI_TM_FUNC_ABORT_TASK) { + spin_lock_bh(&conn->session->back_lock); + + protoitt = build_itt(get_itt(tmf_hdr->rtt), + conn->session->age); + task = iscsi_itt_to_task(conn, protoitt); + + spin_unlock_bh(&conn->session->back_lock); + + if (!task) { + QEDI_NOTICE(&qedi->dbg_ctx, + "IO task completed, tmf rtt=0x%x, cid=0x%x\n", + get_itt(tmf_hdr->rtt), + qedi_conn->iscsi_conn_id); + return; + } + + dbg_cmd = task->dd_data; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "Abort tmf rtt=0x%x, i/o itt=0x%x, i/o tid=0x%x, cid=0x%x\n", + get_itt(tmf_hdr->rtt), get_itt(task->itt), + dbg_cmd->task_id, qedi_conn->iscsi_conn_id); + + if (qedi_cmd->state == CLEANUP_WAIT_FAILED) + qedi_cmd->state = CLEANUP_RECV; + + qedi_clear_task_idx(qedi_conn->qedi, rtid); + + spin_lock(&qedi_conn->list_lock); + list_del_init(&dbg_cmd->io_cmd); + qedi_conn->active_cmd_count--; + spin_unlock(&qedi_conn->list_lock); + qedi_cmd->state = CLEANUP_RECV; + wake_up_interruptible(&qedi_conn->wait_queue); + } + } else if (qedi_conn->cmd_cleanup_req > 0) { + spin_lock_bh(&conn->session->back_lock); + qedi_get_proto_itt(qedi, cqe->itid, &ptmp_itt); + protoitt = build_itt(ptmp_itt, conn->session->age); + task = iscsi_itt_to_task(conn, protoitt); + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "cleanup io itid=0x%x, protoitt=0x%x, cmd_cleanup_cmpl=%d, cid=0x%x\n", + cqe->itid, protoitt, qedi_conn->cmd_cleanup_cmpl, + qedi_conn->iscsi_conn_id); + + spin_unlock_bh(&conn->session->back_lock); + if (!task) { + QEDI_NOTICE(&qedi->dbg_ctx, + "task is null, itid=0x%x, cid=0x%x\n", + cqe->itid, qedi_conn->iscsi_conn_id); + return; + } + qedi_conn->cmd_cleanup_cmpl++; + wake_up(&qedi_conn->wait_queue); + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_TID, + "Freeing tid=0x%x for cid=0x%x\n", + cqe->itid, qedi_conn->iscsi_conn_id); + qedi_clear_task_idx(qedi_conn->qedi, cqe->itid); + + } else { + qedi_get_proto_itt(qedi, cqe->itid, &ptmp_itt); + protoitt = build_itt(ptmp_itt, conn->session->age); + task = iscsi_itt_to_task(conn, protoitt); + QEDI_ERR(&qedi->dbg_ctx, + "Delayed or untracked cleanup response, itt=0x%x, tid=0x%x, cid=0x%x, task=%p\n", + protoitt, cqe->itid, qedi_conn->iscsi_conn_id, task); + } +} + +void qedi_fp_process_cqes(struct qedi_work *work) +{ + struct qedi_ctx *qedi = work->qedi; + union iscsi_cqe *cqe = &work->cqe; + struct iscsi_task *task = NULL; + struct iscsi_nopout *nopout_hdr; + struct qedi_conn *q_conn; + struct iscsi_conn *conn; + struct qedi_cmd *qedi_cmd; + u32 comp_type; + u32 iscsi_cid; + u32 hdr_opcode; + u16 que_idx = work->que_idx; + u8 cqe_err_bits = 0; + + comp_type = cqe->cqe_common.cqe_type; + hdr_opcode = cqe->cqe_common.iscsi_hdr.common.hdr_first_byte; + cqe_err_bits = + cqe->cqe_common.error_bitmap.error_bits.cqe_error_status_bits; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "fw_cid=0x%x, cqe type=0x%x, opcode=0x%x\n", + cqe->cqe_common.conn_id, comp_type, hdr_opcode); + + if (comp_type >= MAX_ISCSI_CQES_TYPE) { + QEDI_WARN(&qedi->dbg_ctx, "Invalid CqE type\n"); + return; + } + + iscsi_cid = cqe->cqe_common.conn_id; + q_conn = qedi->cid_que.conn_cid_tbl[iscsi_cid]; + if (!q_conn) { + QEDI_WARN(&qedi->dbg_ctx, + "Session no longer exists for cid=0x%x!!\n", + iscsi_cid); + 
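+		/* no connection matches this cid; drop the completion */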
return; + } + + conn = q_conn->cls_conn->dd_data; + + if (unlikely(cqe_err_bits && + GET_FIELD(cqe_err_bits, + CQE_ERROR_BITMAP_DATA_DIGEST_ERR))) { + iscsi_conn_failure(conn, ISCSI_ERR_DATA_DGST); + return; + } + + switch (comp_type) { + case ISCSI_CQE_TYPE_SOLICITED: + case ISCSI_CQE_TYPE_SOLICITED_WITH_SENSE: + qedi_cmd = container_of(work, struct qedi_cmd, cqe_work); + task = qedi_cmd->task; + if (!task) { + QEDI_WARN(&qedi->dbg_ctx, "task is NULL\n"); + return; + } + + /* Process NOPIN local completion */ + nopout_hdr = (struct iscsi_nopout *)task->hdr; + if ((nopout_hdr->itt == RESERVED_ITT) && + (cqe->cqe_solicited.itid != (u16)RESERVED_ITT)) { + qedi_process_nopin_local_cmpl(qedi, &cqe->cqe_solicited, + task, q_conn); + } else { + cqe->cqe_solicited.itid = + qedi_get_itt(cqe->cqe_solicited); + /* Process other solicited responses */ + qedi_mtask_completion(qedi, cqe, task, q_conn, que_idx); + } + break; + case ISCSI_CQE_TYPE_UNSOLICITED: + switch (hdr_opcode) { + case ISCSI_OPCODE_NOP_IN: + qedi_process_nopin_mesg(qedi, cqe, task, q_conn, + que_idx); + break; + case ISCSI_OPCODE_ASYNC_MSG: + qedi_process_async_mesg(qedi, cqe, task, q_conn, + que_idx); + break; + case ISCSI_OPCODE_REJECT: + qedi_process_reject_mesg(qedi, cqe, task, q_conn, + que_idx); + break; + } + goto exit_fp_process; + case ISCSI_CQE_TYPE_DUMMY: + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, "Dummy CqE\n"); + goto exit_fp_process; + case ISCSI_CQE_TYPE_TASK_CLEANUP: + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, "CleanUp CqE\n"); + qedi_process_cmd_cleanup_resp(qedi, &cqe->cqe_solicited, task, + conn); + goto exit_fp_process; + default: + QEDI_ERR(&qedi->dbg_ctx, "Error cqe.\n"); + break; + } + +exit_fp_process: + return; +} + +static void qedi_ring_doorbell(struct qedi_conn *qedi_conn) +{ + struct iscsi_db_data dbell = { 0 }; + + dbell.agg_flags = 0; + + dbell.params |= DB_DEST_XCM << ISCSI_DB_DATA_DEST_SHIFT; + dbell.params |= DB_AGG_CMD_SET << ISCSI_DB_DATA_AGG_CMD_SHIFT; + dbell.params |= + DQ_XCM_ISCSI_SQ_PROD_CMD << ISCSI_DB_DATA_AGG_VAL_SEL_SHIFT; + + dbell.sq_prod = qedi_conn->ep->fw_sq_prod_idx; + writel(*(u32 *)&dbell, qedi_conn->ep->p_doorbell); + + /* Make sure fw write idx is coherent, and include both memory barriers + * as a failsafe as for some architectures the call is the same but on + * others they are two different assembly operations. 
+ */ + wmb(); + mmiowb(); + QEDI_INFO(&qedi_conn->qedi->dbg_ctx, QEDI_LOG_MP_REQ, + "prod_idx=0x%x, fw_prod_idx=0x%x, cid=0x%x\n", + qedi_conn->ep->sq_prod_idx, qedi_conn->ep->fw_sq_prod_idx, + qedi_conn->iscsi_conn_id); +} + +static u16 qedi_get_wqe_idx(struct qedi_conn *qedi_conn) +{ + struct qedi_endpoint *ep; + u16 rval; + + ep = qedi_conn->ep; + rval = ep->sq_prod_idx; + + /* Increament SQ index */ + ep->sq_prod_idx++; + ep->fw_sq_prod_idx++; + if (ep->sq_prod_idx == QEDI_SQ_SIZE) + ep->sq_prod_idx = 0; + + return rval; +} + +int qedi_send_iscsi_login(struct qedi_conn *qedi_conn, + struct iscsi_task *task) +{ + struct iscsi_login_req_hdr login_req_pdu_header; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; + struct iscsi_task_params task_params; + struct e4_iscsi_task_context *fw_task_ctx; + struct qedi_ctx *qedi = qedi_conn->qedi; + struct iscsi_login_req *login_hdr; + struct scsi_sge *resp_sge = NULL; + struct qedi_cmd *qedi_cmd; + struct qedi_endpoint *ep; + s16 tid = 0; + u16 sq_idx = 0; + int rval = 0; + + resp_sge = (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + qedi_cmd = (struct qedi_cmd *)task->dd_data; + ep = qedi_conn->ep; + login_hdr = (struct iscsi_login_req *)task->hdr; + + tid = qedi_get_task_idx(qedi); + if (tid == -1) + return -ENOMEM; + + fw_task_ctx = + (struct e4_iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, + tid); + memset(fw_task_ctx, 0, sizeof(struct e4_iscsi_task_context)); + + qedi_cmd->task_id = tid; + + memset(&task_params, 0, sizeof(task_params)); + memset(&login_req_pdu_header, 0, sizeof(login_req_pdu_header)); + memset(&tx_sgl_task_params, 0, sizeof(tx_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(rx_sgl_task_params)); + /* Update header info */ + login_req_pdu_header.opcode = login_hdr->opcode; + login_req_pdu_header.version_min = login_hdr->min_version; + login_req_pdu_header.version_max = login_hdr->max_version; + login_req_pdu_header.flags_attr = login_hdr->flags; + login_req_pdu_header.isid_tabc = swab32p((u32 *)login_hdr->isid); + login_req_pdu_header.isid_d = swab16p((u16 *)&login_hdr->isid[4]); + + login_req_pdu_header.tsih = login_hdr->tsih; + login_req_pdu_header.hdr_second_dword = ntoh24(login_hdr->dlength); + + qedi_update_itt_map(qedi, tid, task->itt, qedi_cmd); + login_req_pdu_header.itt = qedi_set_itt(tid, get_itt(task->itt)); + login_req_pdu_header.cid = qedi_conn->iscsi_conn_id; + login_req_pdu_header.cmd_sn = be32_to_cpu(login_hdr->cmdsn); + login_req_pdu_header.exp_stat_sn = be32_to_cpu(login_hdr->exp_statsn); + login_req_pdu_header.exp_stat_sn = 0; + + /* Fill tx AHS and rx buffer */ + tx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + tx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.req_dma_addr); + tx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.req_dma_addr >> 32); + tx_sgl_task_params.total_buffer_size = ntoh24(login_hdr->dlength); + tx_sgl_task_params.num_sges = 1; + + rx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + rx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.resp_dma_addr); + rx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.resp_dma_addr >> 32); + rx_sgl_task_params.total_buffer_size = resp_sge->sge_len; + rx_sgl_task_params.num_sges = 1; + + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + task_params.cq_rss_number 
= 0; + task_params.tx_io_size = ntoh24(login_hdr->dlength); + task_params.rx_io_size = resp_sge->sge_len; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + rval = init_initiator_login_request_task(&task_params, + &login_req_pdu_header, + &tx_sgl_task_params, + &rx_sgl_task_params); + if (rval) + return -1; + + spin_lock(&qedi_conn->list_lock); + list_add_tail(&qedi_cmd->io_cmd, &qedi_conn->active_cmd_list); + qedi_cmd->io_cmd_in_list = true; + qedi_conn->active_cmd_count++; + spin_unlock(&qedi_conn->list_lock); + + qedi_ring_doorbell(qedi_conn); + return 0; +} + +int qedi_send_iscsi_logout(struct qedi_conn *qedi_conn, + struct iscsi_task *task) +{ + struct iscsi_logout_req_hdr logout_pdu_header; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; + struct iscsi_task_params task_params; + struct e4_iscsi_task_context *fw_task_ctx; + struct iscsi_logout *logout_hdr = NULL; + struct qedi_ctx *qedi = qedi_conn->qedi; + struct qedi_cmd *qedi_cmd; + struct qedi_endpoint *ep; + s16 tid = 0; + u16 sq_idx = 0; + int rval = 0; + + qedi_cmd = (struct qedi_cmd *)task->dd_data; + logout_hdr = (struct iscsi_logout *)task->hdr; + ep = qedi_conn->ep; + + tid = qedi_get_task_idx(qedi); + if (tid == -1) + return -ENOMEM; + + fw_task_ctx = + (struct e4_iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, + tid); + memset(fw_task_ctx, 0, sizeof(struct e4_iscsi_task_context)); + + qedi_cmd->task_id = tid; + + memset(&task_params, 0, sizeof(task_params)); + memset(&logout_pdu_header, 0, sizeof(logout_pdu_header)); + memset(&tx_sgl_task_params, 0, sizeof(tx_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(rx_sgl_task_params)); + + /* Update header info */ + logout_pdu_header.opcode = logout_hdr->opcode; + logout_pdu_header.reason_code = 0x80 | logout_hdr->flags; + qedi_update_itt_map(qedi, tid, task->itt, qedi_cmd); + logout_pdu_header.itt = qedi_set_itt(tid, get_itt(task->itt)); + logout_pdu_header.exp_stat_sn = be32_to_cpu(logout_hdr->exp_statsn); + logout_pdu_header.cmd_sn = be32_to_cpu(logout_hdr->cmdsn); + logout_pdu_header.cid = qedi_conn->iscsi_conn_id; + + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + task_params.cq_rss_number = 0; + task_params.tx_io_size = 0; + task_params.rx_io_size = 0; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + + rval = init_initiator_logout_request_task(&task_params, + &logout_pdu_header, + NULL, NULL); + if (rval) + return -1; + + spin_lock(&qedi_conn->list_lock); + list_add_tail(&qedi_cmd->io_cmd, &qedi_conn->active_cmd_list); + qedi_cmd->io_cmd_in_list = true; + qedi_conn->active_cmd_count++; + spin_unlock(&qedi_conn->list_lock); + + qedi_ring_doorbell(qedi_conn); + return 0; +} + +int qedi_cleanup_all_io(struct qedi_ctx *qedi, struct qedi_conn *qedi_conn, + struct iscsi_task *task, bool in_recovery) +{ + int rval; + struct iscsi_task *ctask; + struct qedi_cmd *cmd, *cmd_tmp; + struct iscsi_tm *tmf_hdr; + unsigned int lun = 0; + bool lun_reset = false; + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_session *session = conn->session; + + /* From recovery, task is NULL or from tmf resp valid task */ + if (task) { + tmf_hdr = (struct iscsi_tm *)task->hdr; + + if ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == + 
ISCSI_TM_FUNC_LOGICAL_UNIT_RESET) { + lun_reset = true; + lun = scsilun_to_int(&tmf_hdr->lun); + } + } + + qedi_conn->cmd_cleanup_req = 0; + qedi_conn->cmd_cleanup_cmpl = 0; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "active_cmd_count=%d, cid=0x%x, in_recovery=%d, lun_reset=%d\n", + qedi_conn->active_cmd_count, qedi_conn->iscsi_conn_id, + in_recovery, lun_reset); + + if (lun_reset) + spin_lock_bh(&session->back_lock); + + spin_lock(&qedi_conn->list_lock); + + list_for_each_entry_safe(cmd, cmd_tmp, &qedi_conn->active_cmd_list, + io_cmd) { + ctask = cmd->task; + if (ctask == task) + continue; + + if (lun_reset) { + if (cmd->scsi_cmd && cmd->scsi_cmd->device) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "tid=0x%x itt=0x%x scsi_cmd_ptr=%p device=%p task_state=%d cmd_state=0%x cid=0x%x\n", + cmd->task_id, get_itt(ctask->itt), + cmd->scsi_cmd, cmd->scsi_cmd->device, + ctask->state, cmd->state, + qedi_conn->iscsi_conn_id); + if (cmd->scsi_cmd->device->lun != lun) + continue; + } + } + qedi_conn->cmd_cleanup_req++; + qedi_iscsi_cleanup_task(ctask, true); + + list_del_init(&cmd->io_cmd); + qedi_conn->active_cmd_count--; + QEDI_WARN(&qedi->dbg_ctx, + "Deleted active cmd list node io_cmd=%p, cid=0x%x\n", + &cmd->io_cmd, qedi_conn->iscsi_conn_id); + } + + spin_unlock(&qedi_conn->list_lock); + + if (lun_reset) + spin_unlock_bh(&session->back_lock); + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "cmd_cleanup_req=%d, cid=0x%x\n", + qedi_conn->cmd_cleanup_req, + qedi_conn->iscsi_conn_id); + + rval = wait_event_interruptible_timeout(qedi_conn->wait_queue, + ((qedi_conn->cmd_cleanup_req == + qedi_conn->cmd_cleanup_cmpl) || + qedi_conn->ep), + 5 * HZ); + if (rval) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "i/o cmd_cleanup_req=%d, equal to cmd_cleanup_cmpl=%d, cid=0x%x\n", + qedi_conn->cmd_cleanup_req, + qedi_conn->cmd_cleanup_cmpl, + qedi_conn->iscsi_conn_id); + + return 0; + } + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "i/o cmd_cleanup_req=%d, not equal to cmd_cleanup_cmpl=%d, cid=0x%x\n", + qedi_conn->cmd_cleanup_req, + qedi_conn->cmd_cleanup_cmpl, + qedi_conn->iscsi_conn_id); + + iscsi_host_for_each_session(qedi->shost, + qedi_mark_device_missing); + qedi_ops->common->drain(qedi->cdev); + + /* Enable IOs for all other sessions except current.*/ + if (!wait_event_interruptible_timeout(qedi_conn->wait_queue, + (qedi_conn->cmd_cleanup_req == + qedi_conn->cmd_cleanup_cmpl), + 5 * HZ)) { + iscsi_host_for_each_session(qedi->shost, + qedi_mark_device_available); + return -1; + } + + iscsi_host_for_each_session(qedi->shost, + qedi_mark_device_available); + + return 0; +} + +void qedi_clearsq(struct qedi_ctx *qedi, struct qedi_conn *qedi_conn, + struct iscsi_task *task) +{ + struct qedi_endpoint *qedi_ep; + int rval; + + qedi_ep = qedi_conn->ep; + qedi_conn->cmd_cleanup_req = 0; + qedi_conn->cmd_cleanup_cmpl = 0; + + if (!qedi_ep) { + QEDI_WARN(&qedi->dbg_ctx, + "Cannot proceed, ep already disconnected, cid=0x%x\n", + qedi_conn->iscsi_conn_id); + return; + } + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Clearing SQ for cid=0x%x, conn=%p, ep=%p\n", + qedi_conn->iscsi_conn_id, qedi_conn, qedi_ep); + + qedi_ops->clear_sq(qedi->cdev, qedi_ep->handle); + + rval = qedi_cleanup_all_io(qedi, qedi_conn, task, true); + if (rval) { + QEDI_ERR(&qedi->dbg_ctx, + "fatal error, need hard reset, cid=0x%x\n", + qedi_conn->iscsi_conn_id); + WARN_ON(1); + } +} + +static int qedi_wait_for_cleanup_request(struct qedi_ctx *qedi, + struct qedi_conn *qedi_conn, + struct iscsi_task *task, + struct qedi_cmd 
*qedi_cmd, + struct qedi_work_map *list_work) +{ + struct qedi_cmd *cmd = (struct qedi_cmd *)task->dd_data; + int wait; + + wait = wait_event_interruptible_timeout(qedi_conn->wait_queue, + ((qedi_cmd->state == + CLEANUP_RECV) || + ((qedi_cmd->type == TYPEIO) && + (cmd->state == + RESPONSE_RECEIVED))), + 5 * HZ); + if (!wait) { + qedi_cmd->state = CLEANUP_WAIT_FAILED; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "Cleanup timedout tid=0x%x, issue connection recovery, cid=0x%x\n", + cmd->task_id, qedi_conn->iscsi_conn_id); + + return -1; + } + return 0; +} + +static void qedi_tmf_work(struct work_struct *work) +{ + struct qedi_cmd *qedi_cmd = + container_of(work, struct qedi_cmd, tmf_work); + struct qedi_conn *qedi_conn = qedi_cmd->conn; + struct qedi_ctx *qedi = qedi_conn->qedi; + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_cls_session *cls_sess; + struct qedi_work_map *list_work = NULL; + struct iscsi_task *mtask; + struct qedi_cmd *cmd; + struct iscsi_task *ctask; + struct iscsi_tm *tmf_hdr; + s16 rval = 0; + s16 tid = 0; + + mtask = qedi_cmd->task; + tmf_hdr = (struct iscsi_tm *)mtask->hdr; + cls_sess = iscsi_conn_to_session(qedi_conn->cls_conn); + set_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); + + ctask = iscsi_itt_to_task(conn, tmf_hdr->rtt); + if (!ctask || !ctask->sc) { + QEDI_ERR(&qedi->dbg_ctx, "Task already completed\n"); + goto abort_ret; + } + + cmd = (struct qedi_cmd *)ctask->dd_data; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Abort tmf rtt=0x%x, cmd itt=0x%x, cmd tid=0x%x, cid=0x%x\n", + get_itt(tmf_hdr->rtt), get_itt(ctask->itt), cmd->task_id, + qedi_conn->iscsi_conn_id); + + if (qedi_do_not_recover) { + QEDI_ERR(&qedi->dbg_ctx, "DONT SEND CLEANUP/ABORT %d\n", + qedi_do_not_recover); + goto abort_ret; + } + + list_work = kzalloc(sizeof(*list_work), GFP_ATOMIC); + if (!list_work) { + QEDI_ERR(&qedi->dbg_ctx, "Memory allocation failed\n"); + goto abort_ret; + } + + qedi_cmd->type = TYPEIO; + list_work->qedi_cmd = qedi_cmd; + list_work->rtid = cmd->task_id; + list_work->state = QEDI_WORK_SCHEDULED; + qedi_cmd->list_tmf_work = list_work; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "Queue tmf work=%p, list node=%p, cid=0x%x, tmf flags=0x%x\n", + list_work->ptr_tmf_work, list_work, qedi_conn->iscsi_conn_id, + tmf_hdr->flags); + + spin_lock_bh(&qedi_conn->tmf_work_lock); + list_add_tail(&list_work->list, &qedi_conn->tmf_work_list); + spin_unlock_bh(&qedi_conn->tmf_work_lock); + + qedi_iscsi_cleanup_task(ctask, false); + + rval = qedi_wait_for_cleanup_request(qedi, qedi_conn, ctask, qedi_cmd, + list_work); + if (rval == -1) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "FW cleanup got escalated, cid=0x%x\n", + qedi_conn->iscsi_conn_id); + goto ldel_exit; + } + + tid = qedi_get_task_idx(qedi); + if (tid == -1) { + QEDI_ERR(&qedi->dbg_ctx, "Invalid tid, cid=0x%x\n", + qedi_conn->iscsi_conn_id); + goto ldel_exit; + } + + qedi_cmd->task_id = tid; + qedi_send_iscsi_tmf(qedi_conn, qedi_cmd->task); + +abort_ret: + clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); + return; + +ldel_exit: + spin_lock_bh(&qedi_conn->tmf_work_lock); + if (!qedi_cmd->list_tmf_work) { + list_del_init(&list_work->list); + qedi_cmd->list_tmf_work = NULL; + kfree(list_work); + } + spin_unlock_bh(&qedi_conn->tmf_work_lock); + + spin_lock(&qedi_conn->list_lock); + list_del_init(&cmd->io_cmd); + qedi_conn->active_cmd_count--; + spin_unlock(&qedi_conn->list_lock); + + clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); +} + +static int qedi_send_iscsi_tmf(struct qedi_conn 
*qedi_conn, + struct iscsi_task *mtask) +{ + struct iscsi_tmf_request_hdr tmf_pdu_header; + struct iscsi_task_params task_params; + struct qedi_ctx *qedi = qedi_conn->qedi; + struct e4_iscsi_task_context *fw_task_ctx; + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_task *ctask; + struct iscsi_tm *tmf_hdr; + struct qedi_cmd *qedi_cmd; + struct qedi_cmd *cmd; + struct qedi_endpoint *ep; + u32 scsi_lun[2]; + s16 tid = 0; + u16 sq_idx = 0; + int rval = 0; + + tmf_hdr = (struct iscsi_tm *)mtask->hdr; + qedi_cmd = (struct qedi_cmd *)mtask->dd_data; + ep = qedi_conn->ep; + if (!ep) + return -ENODEV; + + tid = qedi_get_task_idx(qedi); + if (tid == -1) + return -ENOMEM; + + fw_task_ctx = + (struct e4_iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, + tid); + memset(fw_task_ctx, 0, sizeof(struct e4_iscsi_task_context)); + + qedi_cmd->task_id = tid; + + memset(&task_params, 0, sizeof(task_params)); + memset(&tmf_pdu_header, 0, sizeof(tmf_pdu_header)); + + /* Update header info */ + qedi_update_itt_map(qedi, tid, mtask->itt, qedi_cmd); + tmf_pdu_header.itt = qedi_set_itt(tid, get_itt(mtask->itt)); + tmf_pdu_header.cmd_sn = be32_to_cpu(tmf_hdr->cmdsn); + + memcpy(scsi_lun, &tmf_hdr->lun, sizeof(struct scsi_lun)); + tmf_pdu_header.lun.lo = be32_to_cpu(scsi_lun[0]); + tmf_pdu_header.lun.hi = be32_to_cpu(scsi_lun[1]); + + if ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == + ISCSI_TM_FUNC_ABORT_TASK) { + ctask = iscsi_itt_to_task(conn, tmf_hdr->rtt); + if (!ctask || !ctask->sc) { + QEDI_ERR(&qedi->dbg_ctx, + "Could not get reference task\n"); + return 0; + } + cmd = (struct qedi_cmd *)ctask->dd_data; + tmf_pdu_header.rtt = + qedi_set_itt(cmd->task_id, + get_itt(tmf_hdr->rtt)); + } else { + tmf_pdu_header.rtt = ISCSI_RESERVED_TAG; + } + + tmf_pdu_header.opcode = tmf_hdr->opcode; + tmf_pdu_header.function = tmf_hdr->flags; + tmf_pdu_header.hdr_second_dword = ntoh24(tmf_hdr->dlength); + tmf_pdu_header.ref_cmd_sn = be32_to_cpu(tmf_hdr->refcmdsn); + + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + task_params.cq_rss_number = 0; + task_params.tx_io_size = 0; + task_params.rx_io_size = 0; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + rval = init_initiator_tmf_request_task(&task_params, + &tmf_pdu_header); + if (rval) + return -1; + + spin_lock(&qedi_conn->list_lock); + list_add_tail(&qedi_cmd->io_cmd, &qedi_conn->active_cmd_list); + qedi_cmd->io_cmd_in_list = true; + qedi_conn->active_cmd_count++; + spin_unlock(&qedi_conn->list_lock); + + qedi_ring_doorbell(qedi_conn); + return 0; +} + +int qedi_iscsi_abort_work(struct qedi_conn *qedi_conn, + struct iscsi_task *mtask) +{ + struct qedi_ctx *qedi = qedi_conn->qedi; + struct iscsi_tm *tmf_hdr; + struct qedi_cmd *qedi_cmd = (struct qedi_cmd *)mtask->dd_data; + s16 tid = 0; + + tmf_hdr = (struct iscsi_tm *)mtask->hdr; + qedi_cmd->task = mtask; + + /* If abort task then schedule the work and return */ + if ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == + ISCSI_TM_FUNC_ABORT_TASK) { + qedi_cmd->state = CLEANUP_WAIT; + INIT_WORK(&qedi_cmd->tmf_work, qedi_tmf_work); + queue_work(qedi->tmf_thread, &qedi_cmd->tmf_work); + + } else if (((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == + ISCSI_TM_FUNC_LOGICAL_UNIT_RESET) || + ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == + ISCSI_TM_FUNC_TARGET_WARM_RESET) || + ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == + 
ISCSI_TM_FUNC_TARGET_COLD_RESET)) { + tid = qedi_get_task_idx(qedi); + if (tid == -1) { + QEDI_ERR(&qedi->dbg_ctx, "Invalid tid, cid=0x%x\n", + qedi_conn->iscsi_conn_id); + return -1; + } + qedi_cmd->task_id = tid; + + qedi_send_iscsi_tmf(qedi_conn, qedi_cmd->task); + + } else { + QEDI_ERR(&qedi->dbg_ctx, "Invalid tmf, cid=0x%x\n", + qedi_conn->iscsi_conn_id); + return -1; + } + + return 0; +} + +int qedi_send_iscsi_text(struct qedi_conn *qedi_conn, + struct iscsi_task *task) +{ + struct iscsi_text_request_hdr text_request_pdu_header; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; + struct iscsi_task_params task_params; + struct e4_iscsi_task_context *fw_task_ctx; + struct qedi_ctx *qedi = qedi_conn->qedi; + struct iscsi_text *text_hdr; + struct scsi_sge *req_sge = NULL; + struct scsi_sge *resp_sge = NULL; + struct qedi_cmd *qedi_cmd; + struct qedi_endpoint *ep; + s16 tid = 0; + u16 sq_idx = 0; + int rval = 0; + + req_sge = (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + resp_sge = (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + qedi_cmd = (struct qedi_cmd *)task->dd_data; + text_hdr = (struct iscsi_text *)task->hdr; + ep = qedi_conn->ep; + + tid = qedi_get_task_idx(qedi); + if (tid == -1) + return -ENOMEM; + + fw_task_ctx = + (struct e4_iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, + tid); + memset(fw_task_ctx, 0, sizeof(struct e4_iscsi_task_context)); + + qedi_cmd->task_id = tid; + + memset(&task_params, 0, sizeof(task_params)); + memset(&text_request_pdu_header, 0, sizeof(text_request_pdu_header)); + memset(&tx_sgl_task_params, 0, sizeof(tx_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(rx_sgl_task_params)); + + /* Update header info */ + text_request_pdu_header.opcode = text_hdr->opcode; + text_request_pdu_header.flags_attr = text_hdr->flags; + + qedi_update_itt_map(qedi, tid, task->itt, qedi_cmd); + text_request_pdu_header.itt = qedi_set_itt(tid, get_itt(task->itt)); + text_request_pdu_header.ttt = text_hdr->ttt; + text_request_pdu_header.cmd_sn = be32_to_cpu(text_hdr->cmdsn); + text_request_pdu_header.exp_stat_sn = be32_to_cpu(text_hdr->exp_statsn); + text_request_pdu_header.hdr_second_dword = ntoh24(text_hdr->dlength); + + /* Fill tx AHS and rx buffer */ + tx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + tx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.req_dma_addr); + tx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.req_dma_addr >> 32); + tx_sgl_task_params.total_buffer_size = req_sge->sge_len; + tx_sgl_task_params.num_sges = 1; + + rx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + rx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.resp_dma_addr); + rx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.resp_dma_addr >> 32); + rx_sgl_task_params.total_buffer_size = resp_sge->sge_len; + rx_sgl_task_params.num_sges = 1; + + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + task_params.cq_rss_number = 0; + task_params.tx_io_size = ntoh24(text_hdr->dlength); + task_params.rx_io_size = resp_sge->sge_len; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + rval = init_initiator_text_request_task(&task_params, + &text_request_pdu_header, + &tx_sgl_task_params, + &rx_sgl_task_params); + if (rval) + 
return -1; + + spin_lock(&qedi_conn->list_lock); + list_add_tail(&qedi_cmd->io_cmd, &qedi_conn->active_cmd_list); + qedi_cmd->io_cmd_in_list = true; + qedi_conn->active_cmd_count++; + spin_unlock(&qedi_conn->list_lock); + + qedi_ring_doorbell(qedi_conn); + return 0; +} + +int qedi_send_iscsi_nopout(struct qedi_conn *qedi_conn, + struct iscsi_task *task, + char *datap, int data_len, int unsol) +{ + struct iscsi_nop_out_hdr nop_out_pdu_header; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; + struct iscsi_task_params task_params; + struct qedi_ctx *qedi = qedi_conn->qedi; + struct e4_iscsi_task_context *fw_task_ctx; + struct iscsi_nopout *nopout_hdr; + struct scsi_sge *resp_sge = NULL; + struct qedi_cmd *qedi_cmd; + struct qedi_endpoint *ep; + u32 scsi_lun[2]; + s16 tid = 0; + u16 sq_idx = 0; + int rval = 0; + + resp_sge = (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + qedi_cmd = (struct qedi_cmd *)task->dd_data; + nopout_hdr = (struct iscsi_nopout *)task->hdr; + ep = qedi_conn->ep; + + tid = qedi_get_task_idx(qedi); + if (tid == -1) + return -ENOMEM; + + fw_task_ctx = + (struct e4_iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, + tid); + memset(fw_task_ctx, 0, sizeof(struct e4_iscsi_task_context)); + + qedi_cmd->task_id = tid; + + memset(&task_params, 0, sizeof(task_params)); + memset(&nop_out_pdu_header, 0, sizeof(nop_out_pdu_header)); + memset(&tx_sgl_task_params, 0, sizeof(tx_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(rx_sgl_task_params)); + + /* Update header info */ + nop_out_pdu_header.opcode = nopout_hdr->opcode; + SET_FIELD(nop_out_pdu_header.flags_attr, ISCSI_NOP_OUT_HDR_CONST1, 1); + SET_FIELD(nop_out_pdu_header.flags_attr, ISCSI_NOP_OUT_HDR_RSRV, 0); + + memcpy(scsi_lun, &nopout_hdr->lun, sizeof(struct scsi_lun)); + nop_out_pdu_header.lun.lo = be32_to_cpu(scsi_lun[0]); + nop_out_pdu_header.lun.hi = be32_to_cpu(scsi_lun[1]); + nop_out_pdu_header.cmd_sn = be32_to_cpu(nopout_hdr->cmdsn); + nop_out_pdu_header.exp_stat_sn = be32_to_cpu(nopout_hdr->exp_statsn); + + qedi_update_itt_map(qedi, tid, task->itt, qedi_cmd); + + if (nopout_hdr->ttt != ISCSI_TTT_ALL_ONES) { + nop_out_pdu_header.itt = be32_to_cpu(nopout_hdr->itt); + nop_out_pdu_header.ttt = be32_to_cpu(nopout_hdr->ttt); + } else { + nop_out_pdu_header.itt = qedi_set_itt(tid, get_itt(task->itt)); + nop_out_pdu_header.ttt = ISCSI_TTT_ALL_ONES; + + spin_lock(&qedi_conn->list_lock); + list_add_tail(&qedi_cmd->io_cmd, &qedi_conn->active_cmd_list); + qedi_cmd->io_cmd_in_list = true; + qedi_conn->active_cmd_count++; + spin_unlock(&qedi_conn->list_lock); + } + + /* Fill tx AHS and rx buffer */ + if (data_len) { + tx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + tx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.req_dma_addr); + tx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.req_dma_addr >> 32); + tx_sgl_task_params.total_buffer_size = data_len; + tx_sgl_task_params.num_sges = 1; + + rx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + rx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.resp_dma_addr); + rx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.resp_dma_addr >> 32); + rx_sgl_task_params.total_buffer_size = resp_sge->sge_len; + rx_sgl_task_params.num_sges = 1; + } + + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + 
task_params.cq_rss_number = 0; + task_params.tx_io_size = data_len; + task_params.rx_io_size = resp_sge->sge_len; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + rval = init_initiator_nop_out_task(&task_params, + &nop_out_pdu_header, + &tx_sgl_task_params, + &rx_sgl_task_params); + if (rval) + return -1; + + qedi_ring_doorbell(qedi_conn); + return 0; +} + +static int qedi_split_bd(struct qedi_cmd *cmd, u64 addr, int sg_len, + int bd_index) +{ + struct scsi_sge *bd = cmd->io_tbl.sge_tbl; + int frag_size, sg_frags; + + sg_frags = 0; + + while (sg_len) { + if (addr % QEDI_PAGE_SIZE) + frag_size = + (QEDI_PAGE_SIZE - (addr % QEDI_PAGE_SIZE)); + else + frag_size = (sg_len > QEDI_BD_SPLIT_SZ) ? 0 : + (sg_len % QEDI_BD_SPLIT_SZ); + + if (frag_size == 0) + frag_size = QEDI_BD_SPLIT_SZ; + + bd[bd_index + sg_frags].sge_addr.lo = (addr & 0xffffffff); + bd[bd_index + sg_frags].sge_addr.hi = (addr >> 32); + bd[bd_index + sg_frags].sge_len = (u16)frag_size; + QEDI_INFO(&cmd->conn->qedi->dbg_ctx, QEDI_LOG_IO, + "split sge %d: addr=%llx, len=%x", + (bd_index + sg_frags), addr, frag_size); + + addr += (u64)frag_size; + sg_frags++; + sg_len -= frag_size; + } + return sg_frags; +} + +static int qedi_map_scsi_sg(struct qedi_ctx *qedi, struct qedi_cmd *cmd) +{ + struct scsi_cmnd *sc = cmd->scsi_cmd; + struct scsi_sge *bd = cmd->io_tbl.sge_tbl; + struct scatterlist *sg; + int byte_count = 0; + int bd_count = 0; + int sg_count; + int sg_len; + int sg_frags; + u64 addr, end_addr; + int i; + + WARN_ON(scsi_sg_count(sc) > QEDI_ISCSI_MAX_BDS_PER_CMD); + + sg_count = dma_map_sg(&qedi->pdev->dev, scsi_sglist(sc), + scsi_sg_count(sc), sc->sc_data_direction); + + /* + * New condition to send single SGE as cached-SGL. + * Single SGE with length less than 64K. + */ + sg = scsi_sglist(sc); + if ((sg_count == 1) && (sg_dma_len(sg) <= MAX_SGLEN_FOR_CACHESGL)) { + sg_len = sg_dma_len(sg); + addr = (u64)sg_dma_address(sg); + + bd[bd_count].sge_addr.lo = (addr & 0xffffffff); + bd[bd_count].sge_addr.hi = (addr >> 32); + bd[bd_count].sge_len = (u16)sg_len; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_IO, + "single-cached-sgl: bd_count:%d addr=%llx, len=%x", + sg_count, addr, sg_len); + + return ++bd_count; + } + + scsi_for_each_sg(sc, sg, sg_count, i) { + sg_len = sg_dma_len(sg); + addr = (u64)sg_dma_address(sg); + end_addr = (addr + sg_len); + + /* + * first sg elem in the 'list', + * check if end addr is page-aligned. + */ + if ((i == 0) && (sg_count > 1) && (end_addr % QEDI_PAGE_SIZE)) + cmd->use_slowpath = true; + + /* + * last sg elem in the 'list', + * check if start addr is page-aligned. 
+ */ + else if ((i == (sg_count - 1)) && + (sg_count > 1) && (addr % QEDI_PAGE_SIZE)) + cmd->use_slowpath = true; + + /* + * middle sg elements in list, + * check if start and end addr is page-aligned + */ + else if ((i != 0) && (i != (sg_count - 1)) && + ((addr % QEDI_PAGE_SIZE) || + (end_addr % QEDI_PAGE_SIZE))) + cmd->use_slowpath = true; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_IO, "sg[%d] size=0x%x", + i, sg_len); + + if (sg_len > QEDI_BD_SPLIT_SZ) { + sg_frags = qedi_split_bd(cmd, addr, sg_len, bd_count); + } else { + sg_frags = 1; + bd[bd_count].sge_addr.lo = addr & 0xffffffff; + bd[bd_count].sge_addr.hi = addr >> 32; + bd[bd_count].sge_len = sg_len; + } + byte_count += sg_len; + bd_count += sg_frags; + } + + if (byte_count != scsi_bufflen(sc)) + QEDI_ERR(&qedi->dbg_ctx, + "byte_count = %d != scsi_bufflen = %d\n", byte_count, + scsi_bufflen(sc)); + else + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_IO, "byte_count = %d\n", + byte_count); + + WARN_ON(byte_count != scsi_bufflen(sc)); + + return bd_count; +} + +static void qedi_iscsi_map_sg_list(struct qedi_cmd *cmd) +{ + int bd_count; + struct scsi_cmnd *sc = cmd->scsi_cmd; + + if (scsi_sg_count(sc)) { + bd_count = qedi_map_scsi_sg(cmd->conn->qedi, cmd); + if (bd_count == 0) + return; + } else { + struct scsi_sge *bd = cmd->io_tbl.sge_tbl; + + bd[0].sge_addr.lo = 0; + bd[0].sge_addr.hi = 0; + bd[0].sge_len = 0; + bd_count = 0; + } + cmd->io_tbl.sge_valid = bd_count; +} + +static void qedi_cpy_scsi_cdb(struct scsi_cmnd *sc, u32 *dstp) +{ + u32 dword; + int lpcnt; + u8 *srcp; + + lpcnt = sc->cmd_len / sizeof(dword); + srcp = (u8 *)sc->cmnd; + while (lpcnt--) { + memcpy(&dword, (const void *)srcp, 4); + *dstp = cpu_to_be32(dword); + srcp += 4; + dstp++; + } + if (sc->cmd_len & 0x3) { + dword = (u32)srcp[0] | ((u32)srcp[1] << 8); + *dstp = cpu_to_be32(dword); + } +} + +void qedi_trace_io(struct qedi_ctx *qedi, struct iscsi_task *task, + u16 tid, int8_t direction) +{ + struct qedi_io_log *io_log; + struct iscsi_conn *conn = task->conn; + struct qedi_conn *qedi_conn = conn->dd_data; + struct scsi_cmnd *sc_cmd = task->sc; + unsigned long flags; + + spin_lock_irqsave(&qedi->io_trace_lock, flags); + + io_log = &qedi->io_trace_buf[qedi->io_trace_idx]; + io_log->direction = direction; + io_log->task_id = tid; + io_log->cid = qedi_conn->iscsi_conn_id; + io_log->lun = sc_cmd->device->lun; + io_log->op = sc_cmd->cmnd[0]; + io_log->lba[0] = sc_cmd->cmnd[2]; + io_log->lba[1] = sc_cmd->cmnd[3]; + io_log->lba[2] = sc_cmd->cmnd[4]; + io_log->lba[3] = sc_cmd->cmnd[5]; + io_log->bufflen = scsi_bufflen(sc_cmd); + io_log->sg_count = scsi_sg_count(sc_cmd); + io_log->fast_sgs = qedi->fast_sgls; + io_log->cached_sgs = qedi->cached_sgls; + io_log->slow_sgs = qedi->slow_sgls; + io_log->cached_sge = qedi->use_cached_sge; + io_log->slow_sge = qedi->use_slow_sge; + io_log->fast_sge = qedi->use_fast_sge; + io_log->result = sc_cmd->result; + io_log->jiffies = jiffies; + io_log->blk_req_cpu = smp_processor_id(); + + if (direction == QEDI_IO_TRACE_REQ) { + /* For requests we only care about the submission CPU */ + io_log->req_cpu = smp_processor_id() % qedi->num_queues; + io_log->intr_cpu = 0; + io_log->blk_rsp_cpu = 0; + } else if (direction == QEDI_IO_TRACE_RSP) { + io_log->req_cpu = smp_processor_id() % qedi->num_queues; + io_log->intr_cpu = qedi->intr_cpu; + io_log->blk_rsp_cpu = smp_processor_id(); + } + + qedi->io_trace_idx++; + if (qedi->io_trace_idx == QEDI_IO_TRACE_SIZE) + qedi->io_trace_idx = 0; + + qedi->use_cached_sge = false; + qedi->use_slow_sge = false; + 
qedi->use_fast_sge = false; + + spin_unlock_irqrestore(&qedi->io_trace_lock, flags); +} + +int qedi_iscsi_send_ioreq(struct iscsi_task *task) +{ + struct iscsi_conn *conn = task->conn; + struct iscsi_session *session = conn->session; + struct Scsi_Host *shost = iscsi_session_to_shost(session->cls_session); + struct qedi_ctx *qedi = iscsi_host_priv(shost); + struct qedi_conn *qedi_conn = conn->dd_data; + struct qedi_cmd *cmd = task->dd_data; + struct scsi_cmnd *sc = task->sc; + struct iscsi_cmd_hdr cmd_pdu_header; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; + struct scsi_sgl_task_params *prx_sgl = NULL; + struct scsi_sgl_task_params *ptx_sgl = NULL; + struct iscsi_task_params task_params; + struct iscsi_conn_params conn_params; + struct scsi_initiator_cmd_params cmd_params; + struct e4_iscsi_task_context *fw_task_ctx; + struct iscsi_cls_conn *cls_conn; + struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; + enum iscsi_task_type task_type = MAX_ISCSI_TASK_TYPE; + struct qedi_endpoint *ep; + u32 scsi_lun[2]; + s16 tid = 0; + u16 sq_idx = 0; + u16 cq_idx; + int rval = 0; + + ep = qedi_conn->ep; + cls_conn = qedi_conn->cls_conn; + conn = cls_conn->dd_data; + + qedi_iscsi_map_sg_list(cmd); + int_to_scsilun(sc->device->lun, (struct scsi_lun *)scsi_lun); + + tid = qedi_get_task_idx(qedi); + if (tid == -1) + return -ENOMEM; + + fw_task_ctx = + (struct e4_iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, + tid); + memset(fw_task_ctx, 0, sizeof(struct e4_iscsi_task_context)); + + cmd->task_id = tid; + + memset(&task_params, 0, sizeof(task_params)); + memset(&cmd_pdu_header, 0, sizeof(cmd_pdu_header)); + memset(&tx_sgl_task_params, 0, sizeof(tx_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(rx_sgl_task_params)); + memset(&conn_params, 0, sizeof(conn_params)); + memset(&cmd_params, 0, sizeof(cmd_params)); + + cq_idx = smp_processor_id() % qedi->num_queues; + /* Update header info */ + SET_FIELD(cmd_pdu_header.flags_attr, ISCSI_CMD_HDR_ATTR, + ISCSI_ATTR_SIMPLE); + if (hdr->cdb[0] != TEST_UNIT_READY) { + if (sc->sc_data_direction == DMA_TO_DEVICE) { + SET_FIELD(cmd_pdu_header.flags_attr, + ISCSI_CMD_HDR_WRITE, 1); + task_type = ISCSI_TASK_TYPE_INITIATOR_WRITE; + } else { + SET_FIELD(cmd_pdu_header.flags_attr, + ISCSI_CMD_HDR_READ, 1); + task_type = ISCSI_TASK_TYPE_INITIATOR_READ; + } + } + + cmd_pdu_header.lun.lo = be32_to_cpu(scsi_lun[0]); + cmd_pdu_header.lun.hi = be32_to_cpu(scsi_lun[1]); + + qedi_update_itt_map(qedi, tid, task->itt, cmd); + cmd_pdu_header.itt = qedi_set_itt(tid, get_itt(task->itt)); + cmd_pdu_header.expected_transfer_length = cpu_to_be32(hdr->data_length); + cmd_pdu_header.hdr_second_dword = ntoh24(hdr->dlength); + cmd_pdu_header.cmd_sn = be32_to_cpu(hdr->cmdsn); + cmd_pdu_header.hdr_first_byte = hdr->opcode; + qedi_cpy_scsi_cdb(sc, (u32 *)cmd_pdu_header.cdb); + + /* Fill tx AHS and rx buffer */ + if (task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE) { + tx_sgl_task_params.sgl = cmd->io_tbl.sge_tbl; + tx_sgl_task_params.sgl_phys_addr.lo = + (u32)(cmd->io_tbl.sge_tbl_dma); + tx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)cmd->io_tbl.sge_tbl_dma >> 32); + tx_sgl_task_params.total_buffer_size = scsi_bufflen(sc); + tx_sgl_task_params.num_sges = cmd->io_tbl.sge_valid; + if (cmd->use_slowpath) + tx_sgl_task_params.small_mid_sge = true; + } else if (task_type == ISCSI_TASK_TYPE_INITIATOR_READ) { + rx_sgl_task_params.sgl = cmd->io_tbl.sge_tbl; + rx_sgl_task_params.sgl_phys_addr.lo = + 
(u32)(cmd->io_tbl.sge_tbl_dma); + rx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)cmd->io_tbl.sge_tbl_dma >> 32); + rx_sgl_task_params.total_buffer_size = scsi_bufflen(sc); + rx_sgl_task_params.num_sges = cmd->io_tbl.sge_valid; + } + + /* Add conn param */ + conn_params.first_burst_length = conn->session->first_burst; + conn_params.max_send_pdu_length = conn->max_xmit_dlength; + conn_params.max_burst_length = conn->session->max_burst; + if (conn->session->initial_r2t_en) + conn_params.initial_r2t = true; + if (conn->session->imm_data_en) + conn_params.immediate_data = true; + + /* Add cmd params */ + cmd_params.sense_data_buffer_phys_addr.lo = (u32)cmd->sense_buffer_dma; + cmd_params.sense_data_buffer_phys_addr.hi = + (u32)((u64)cmd->sense_buffer_dma >> 32); + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + task_params.cq_rss_number = cq_idx; + if (task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE) + task_params.tx_io_size = scsi_bufflen(sc); + else if (task_type == ISCSI_TASK_TYPE_INITIATOR_READ) + task_params.rx_io_size = scsi_bufflen(sc); + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_IO, + "%s: %s-SGL: sg_len=0x%x num_sges=0x%x first-sge-lo=0x%x first-sge-hi=0x%x\n", + (task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE) ? + "Write " : "Read ", (cmd->io_tbl.sge_valid == 1) ? + "Single" : (cmd->use_slowpath ? "SLOW" : "FAST"), + (u16)cmd->io_tbl.sge_valid, scsi_bufflen(sc), + (u32)(cmd->io_tbl.sge_tbl_dma), + (u32)((u64)cmd->io_tbl.sge_tbl_dma >> 32)); + + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + + if (task_params.tx_io_size != 0) + ptx_sgl = &tx_sgl_task_params; + if (task_params.rx_io_size != 0) + prx_sgl = &rx_sgl_task_params; + + rval = init_initiator_rw_iscsi_task(&task_params, &conn_params, + &cmd_params, &cmd_pdu_header, + ptx_sgl, prx_sgl, + NULL); + if (rval) + return -1; + + spin_lock(&qedi_conn->list_lock); + list_add_tail(&cmd->io_cmd, &qedi_conn->active_cmd_list); + cmd->io_cmd_in_list = true; + qedi_conn->active_cmd_count++; + spin_unlock(&qedi_conn->list_lock); + + qedi_ring_doorbell(qedi_conn); + return 0; +} + +int qedi_iscsi_cleanup_task(struct iscsi_task *task, bool mark_cmd_node_deleted) +{ + struct iscsi_task_params task_params; + struct qedi_endpoint *ep; + struct iscsi_conn *conn = task->conn; + struct qedi_conn *qedi_conn = conn->dd_data; + struct qedi_cmd *cmd = task->dd_data; + u16 sq_idx = 0; + int rval = 0; + + QEDI_INFO(&qedi_conn->qedi->dbg_ctx, QEDI_LOG_SCSI_TM, + "issue cleanup tid=0x%x itt=0x%x task_state=%d cmd_state=0%x cid=0x%x\n", + cmd->task_id, get_itt(task->itt), task->state, + cmd->state, qedi_conn->iscsi_conn_id); + + memset(&task_params, 0, sizeof(task_params)); + ep = qedi_conn->ep; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + + task_params.sqe = &ep->sq[sq_idx]; + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + task_params.itid = cmd->task_id; + + rval = init_cleanup_task(&task_params); + if (rval) + return rval; + + qedi_ring_doorbell(qedi_conn); + return 0; +} diff --git a/drivers/scsi/qedi/qedi_fw_api.c b/drivers/scsi/qedi/qedi_fw_api.c new file mode 100644 index 0000000..a269da1 --- /dev/null +++ b/drivers/scsi/qedi/qedi_fw_api.c @@ -0,0 +1,803 @@ +/* QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. 
+ * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#include +#include +#include "qedi_hsi.h" +#include + +#include "qedi_fw_iscsi.h" +#include "qedi_fw_scsi.h" + +#define SCSI_NUM_SGES_IN_CACHE 0x4 + +static bool scsi_is_slow_sgl(u16 num_sges, bool small_mid_sge) +{ + return (num_sges > SCSI_NUM_SGES_SLOW_SGL_THR && small_mid_sge); +} + +static +void init_scsi_sgl_context(struct scsi_sgl_params *ctx_sgl_params, + struct scsi_cached_sges *ctx_data_desc, + struct scsi_sgl_task_params *sgl_task_params) +{ + u8 sge_index; + u8 num_sges; + u32 val; + + num_sges = (sgl_task_params->num_sges > SCSI_NUM_SGES_IN_CACHE) ? + SCSI_NUM_SGES_IN_CACHE : sgl_task_params->num_sges; + + /* sgl params */ + val = cpu_to_le32(sgl_task_params->sgl_phys_addr.lo); + ctx_sgl_params->sgl_addr.lo = val; + val = cpu_to_le32(sgl_task_params->sgl_phys_addr.hi); + ctx_sgl_params->sgl_addr.hi = val; + val = cpu_to_le32(sgl_task_params->total_buffer_size); + ctx_sgl_params->sgl_total_length = val; + ctx_sgl_params->sgl_num_sges = cpu_to_le16(sgl_task_params->num_sges); + + for (sge_index = 0; sge_index < num_sges; sge_index++) { + val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_addr.lo); + ctx_data_desc->sge[sge_index].sge_addr.lo = val; + val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_addr.hi); + ctx_data_desc->sge[sge_index].sge_addr.hi = val; + val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_len); + ctx_data_desc->sge[sge_index].sge_len = val; + } +} + +static u32 calc_rw_task_size(struct iscsi_task_params *task_params, + enum iscsi_task_type task_type, + struct scsi_sgl_task_params *sgl_task_params, + struct scsi_dif_task_params *dif_task_params) +{ + u32 io_size; + + if (task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE || + task_type == ISCSI_TASK_TYPE_TARGET_READ) + io_size = task_params->tx_io_size; + else + io_size = task_params->rx_io_size; + + if (!io_size) + return 0; + + if (!dif_task_params) + return io_size; + + return !dif_task_params->dif_on_network ? + io_size : sgl_task_params->total_buffer_size; +} + +static void +init_dif_context_flags(struct iscsi_dif_flags *ctx_dif_flags, + struct scsi_dif_task_params *dif_task_params) +{ + if (!dif_task_params) + return; + + SET_FIELD(ctx_dif_flags->flags, ISCSI_DIF_FLAGS_PROT_INTERVAL_SIZE_LOG, + dif_task_params->dif_block_size_log); + SET_FIELD(ctx_dif_flags->flags, ISCSI_DIF_FLAGS_DIF_TO_PEER, + dif_task_params->dif_on_network ? 1 : 0); + SET_FIELD(ctx_dif_flags->flags, ISCSI_DIF_FLAGS_HOST_INTERFACE, + dif_task_params->dif_on_host ? 
1 : 0); +} + +static void init_sqe(struct iscsi_task_params *task_params, + struct scsi_sgl_task_params *sgl_task_params, + struct scsi_dif_task_params *dif_task_params, + struct iscsi_common_hdr *pdu_header, + struct scsi_initiator_cmd_params *cmd_params, + enum iscsi_task_type task_type, + bool is_cleanup) +{ + if (!task_params->sqe) + return; + + memset(task_params->sqe, 0, sizeof(*task_params->sqe)); + task_params->sqe->task_id = cpu_to_le16(task_params->itid); + if (is_cleanup) { + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_WQE_TYPE, + ISCSI_WQE_TYPE_TASK_CLEANUP); + return; + } + + switch (task_type) { + case ISCSI_TASK_TYPE_INITIATOR_WRITE: + { + u32 buf_size = 0; + u32 num_sges = 0; + + init_dif_context_flags(&task_params->sqe->prot_flags, + dif_task_params); + + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_WQE_TYPE, + ISCSI_WQE_TYPE_NORMAL); + + if (task_params->tx_io_size) { + buf_size = calc_rw_task_size(task_params, task_type, + sgl_task_params, + dif_task_params); + + if (scsi_is_slow_sgl(sgl_task_params->num_sges, + sgl_task_params->small_mid_sge)) + num_sges = ISCSI_WQE_NUM_SGES_SLOWIO; + else + num_sges = min(sgl_task_params->num_sges, + (u16)SCSI_NUM_SGES_SLOW_SGL_THR); + } + + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_NUM_SGES, num_sges); + SET_FIELD(task_params->sqe->contlen_cdbsize, ISCSI_WQE_CONT_LEN, + buf_size); + + if (GET_FIELD(pdu_header->hdr_second_dword, + ISCSI_CMD_HDR_TOTAL_AHS_LEN)) + SET_FIELD(task_params->sqe->contlen_cdbsize, ISCSI_WQE_CDB_SIZE, + cmd_params->extended_cdb_sge.sge_len); + } + break; + case ISCSI_TASK_TYPE_INITIATOR_READ: + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_WQE_TYPE, + ISCSI_WQE_TYPE_NORMAL); + + if (GET_FIELD(pdu_header->hdr_second_dword, + ISCSI_CMD_HDR_TOTAL_AHS_LEN)) + SET_FIELD(task_params->sqe->contlen_cdbsize, + ISCSI_WQE_CDB_SIZE, + cmd_params->extended_cdb_sge.sge_len); + break; + case ISCSI_TASK_TYPE_LOGIN_RESPONSE: + case ISCSI_TASK_TYPE_MIDPATH: + { + bool advance_statsn = true; + + if (task_type == ISCSI_TASK_TYPE_LOGIN_RESPONSE) + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_WQE_TYPE, + ISCSI_WQE_TYPE_LOGIN); + else + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_WQE_TYPE, + ISCSI_WQE_TYPE_MIDDLE_PATH); + + if (task_type == ISCSI_TASK_TYPE_MIDPATH) { + u8 opcode = GET_FIELD(pdu_header->hdr_first_byte, + ISCSI_COMMON_HDR_OPCODE); + + if (opcode != ISCSI_OPCODE_TEXT_RESPONSE && + (opcode != ISCSI_OPCODE_NOP_IN || + pdu_header->itt == ISCSI_TTT_ALL_ONES)) + advance_statsn = false; + } + + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_RESPONSE, + advance_statsn ? 
1 : 0); + + if (task_params->tx_io_size) { + SET_FIELD(task_params->sqe->contlen_cdbsize, + ISCSI_WQE_CONT_LEN, task_params->tx_io_size); + + if (scsi_is_slow_sgl(sgl_task_params->num_sges, + sgl_task_params->small_mid_sge)) + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_NUM_SGES, + ISCSI_WQE_NUM_SGES_SLOWIO); + else + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_NUM_SGES, + min(sgl_task_params->num_sges, + (u16)SCSI_NUM_SGES_SLOW_SGL_THR)); + } + } + break; + default: + break; + } +} + +static void init_default_iscsi_task(struct iscsi_task_params *task_params, + struct data_hdr *pdu_header, + enum iscsi_task_type task_type) +{ + struct e4_iscsi_task_context *context; + u32 val; + u16 index; + u8 val_byte; + + context = task_params->context; + val_byte = context->mstorm_ag_context.cdu_validation; + memset(context, 0, sizeof(*context)); + context->mstorm_ag_context.cdu_validation = val_byte; + + for (index = 0; index < + ARRAY_SIZE(context->ystorm_st_context.pdu_hdr.data.data); + index++) { + val = cpu_to_le32(pdu_header->data[index]); + context->ystorm_st_context.pdu_hdr.data.data[index] = val; + } + + context->mstorm_st_context.task_type = task_type; + context->mstorm_ag_context.task_cid = + cpu_to_le16(task_params->conn_icid); + + SET_FIELD(context->ustorm_ag_context.flags1, + E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV, 1); + + context->ustorm_st_context.task_type = task_type; + context->ustorm_st_context.cq_rss_number = task_params->cq_rss_number; + context->ustorm_ag_context.icid = cpu_to_le16(task_params->conn_icid); +} + +static +void init_initiator_rw_cdb_ystorm_context(struct ystorm_iscsi_task_st_ctx *ystc, + struct scsi_initiator_cmd_params *cmd) +{ + union iscsi_task_hdr *ctx_pdu_hdr = &ystc->pdu_hdr; + u32 val; + + if (!cmd->extended_cdb_sge.sge_len) + return; + + SET_FIELD(ctx_pdu_hdr->ext_cdb_cmd.hdr_second_dword, + ISCSI_EXT_CDB_CMD_HDR_CDB_SIZE, + cmd->extended_cdb_sge.sge_len); + val = cpu_to_le32(cmd->extended_cdb_sge.sge_addr.lo); + ctx_pdu_hdr->ext_cdb_cmd.cdb_sge.sge_addr.lo = val; + val = cpu_to_le32(cmd->extended_cdb_sge.sge_addr.hi); + ctx_pdu_hdr->ext_cdb_cmd.cdb_sge.sge_addr.hi = val; + val = cpu_to_le32(cmd->extended_cdb_sge.sge_len); + ctx_pdu_hdr->ext_cdb_cmd.cdb_sge.sge_len = val; +} + +static +void init_ustorm_task_contexts(struct ustorm_iscsi_task_st_ctx *ustorm_st_cxt, + struct e4_ustorm_iscsi_task_ag_ctx *ustorm_ag_cxt, + u32 remaining_recv_len, u32 expected_data_transfer_len, + u8 num_sges, bool tx_dif_conn_err_en) +{ + u32 val; + + ustorm_st_cxt->rem_rcv_len = cpu_to_le32(remaining_recv_len); + ustorm_ag_cxt->exp_data_acked = cpu_to_le32(expected_data_transfer_len); + val = cpu_to_le32(expected_data_transfer_len); + ustorm_st_cxt->exp_data_transfer_len = val; + SET_FIELD(ustorm_st_cxt->reg1.reg1_map, ISCSI_REG1_NUM_SGES, num_sges); + SET_FIELD(ustorm_ag_cxt->flags2, + E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN, + tx_dif_conn_err_en ? 
1 : 0); +} + +static +void set_rw_exp_data_acked_and_cont_len(struct e4_iscsi_task_context *context, + struct iscsi_conn_params *conn_params, + enum iscsi_task_type task_type, + u32 task_size, + u32 exp_data_transfer_len, + u8 total_ahs_length) +{ + u32 max_unsolicited_data = 0, val; + + if (total_ahs_length && + (task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE || + task_type == ISCSI_TASK_TYPE_INITIATOR_READ)) + SET_FIELD(context->ustorm_st_context.flags2, + USTORM_ISCSI_TASK_ST_CTX_AHS_EXIST, 1); + + switch (task_type) { + case ISCSI_TASK_TYPE_INITIATOR_WRITE: + if (!conn_params->initial_r2t) + max_unsolicited_data = conn_params->first_burst_length; + else if (conn_params->immediate_data) + max_unsolicited_data = + min(conn_params->first_burst_length, + conn_params->max_send_pdu_length); + + context->ustorm_ag_context.exp_data_acked = + cpu_to_le32(total_ahs_length == 0 ? + min(exp_data_transfer_len, + max_unsolicited_data) : + ((u32)(total_ahs_length + + ISCSI_AHS_CNTL_SIZE))); + break; + case ISCSI_TASK_TYPE_TARGET_READ: + val = cpu_to_le32(exp_data_transfer_len); + context->ustorm_ag_context.exp_data_acked = val; + break; + case ISCSI_TASK_TYPE_INITIATOR_READ: + context->ustorm_ag_context.exp_data_acked = + cpu_to_le32((total_ahs_length == 0 ? 0 : + total_ahs_length + + ISCSI_AHS_CNTL_SIZE)); + break; + case ISCSI_TASK_TYPE_TARGET_WRITE: + val = cpu_to_le32(task_size); + context->ustorm_ag_context.exp_cont_len = val; + break; + default: + break; + } +} + +static +void init_rtdif_task_context(struct rdif_task_context *rdif_context, + struct tdif_task_context *tdif_context, + struct scsi_dif_task_params *dif_task_params, + enum iscsi_task_type task_type) +{ + u32 val; + + if (!dif_task_params->dif_on_network || !dif_task_params->dif_on_host) + return; + + if (task_type == ISCSI_TASK_TYPE_TARGET_WRITE || + task_type == ISCSI_TASK_TYPE_INITIATOR_READ) { + rdif_context->app_tag_value = + cpu_to_le16(dif_task_params->application_tag); + rdif_context->partial_crc_value = cpu_to_le16(0xffff); + val = cpu_to_le32(dif_task_params->initial_ref_tag); + rdif_context->initial_ref_tag = val; + rdif_context->app_tag_mask = + cpu_to_le16(dif_task_params->application_tag_mask); + SET_FIELD(rdif_context->flags0, RDIF_TASK_CONTEXT_CRC_SEED, + dif_task_params->crc_seed ? 1 : 0); + SET_FIELD(rdif_context->flags0, + RDIF_TASK_CONTEXT_HOST_GUARD_TYPE, + dif_task_params->host_guard_type); + SET_FIELD(rdif_context->flags0, + RDIF_TASK_CONTEXT_PROTECTION_TYPE, + dif_task_params->protection_type); + SET_FIELD(rdif_context->flags0, + RDIF_TASK_CONTEXT_INITIAL_REF_TAG_VALID, 1); + SET_FIELD(rdif_context->flags0, + RDIF_TASK_CONTEXT_KEEP_REF_TAG_CONST, + dif_task_params->keep_ref_tag_const ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_VALIDATE_APP_TAG, + (dif_task_params->validate_app_tag && + dif_task_params->dif_on_network) ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_VALIDATE_GUARD, + (dif_task_params->validate_guard && + dif_task_params->dif_on_network) ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_VALIDATE_REF_TAG, + (dif_task_params->validate_ref_tag && + dif_task_params->dif_on_network) ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_HOST_INTERFACE, + dif_task_params->dif_on_host ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_NETWORK_INTERFACE, + dif_task_params->dif_on_network ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_FORWARD_GUARD, + dif_task_params->forward_guard ? 
1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_FORWARD_APP_TAG, + dif_task_params->forward_app_tag ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_FORWARD_REF_TAG, + dif_task_params->forward_ref_tag ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_FORWARD_APP_TAG_WITH_MASK, + dif_task_params->forward_app_tag_with_mask ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_FORWARD_REF_TAG_WITH_MASK, + dif_task_params->forward_ref_tag_with_mask ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_INTERVAL_SIZE, + dif_task_params->dif_block_size_log - 9); + SET_FIELD(rdif_context->state, + RDIF_TASK_CONTEXT_REF_TAG_MASK, + dif_task_params->ref_tag_mask); + SET_FIELD(rdif_context->state, RDIF_TASK_CONTEXT_IGNORE_APP_TAG, + dif_task_params->ignore_app_tag); + } + + if (task_type == ISCSI_TASK_TYPE_TARGET_READ || + task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE) { + tdif_context->app_tag_value = + cpu_to_le16(dif_task_params->application_tag); + tdif_context->partial_crc_value_b = + cpu_to_le16(dif_task_params->crc_seed ? 0xffff : 0x0000); + tdif_context->partial_crc_value_a = + cpu_to_le16(dif_task_params->crc_seed ? 0xffff : 0x0000); + SET_FIELD(tdif_context->flags0, TDIF_TASK_CONTEXT_CRC_SEED, + dif_task_params->crc_seed ? 1 : 0); + + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_SET_ERROR_WITH_EOP, + dif_task_params->tx_dif_conn_err_en ? 1 : 0); + SET_FIELD(tdif_context->flags1, TDIF_TASK_CONTEXT_FORWARD_GUARD, + dif_task_params->forward_guard ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_FORWARD_APP_TAG, + dif_task_params->forward_app_tag ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_FORWARD_REF_TAG, + dif_task_params->forward_ref_tag ? 1 : 0); + SET_FIELD(tdif_context->flags1, TDIF_TASK_CONTEXT_INTERVAL_SIZE, + dif_task_params->dif_block_size_log - 9); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_HOST_INTERFACE, + dif_task_params->dif_on_host ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_NETWORK_INTERFACE, + dif_task_params->dif_on_network ? 1 : 0); + val = cpu_to_le32(dif_task_params->initial_ref_tag); + tdif_context->initial_ref_tag = val; + tdif_context->app_tag_mask = + cpu_to_le16(dif_task_params->application_tag_mask); + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_HOST_GUARD_TYPE, + dif_task_params->host_guard_type); + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_PROTECTION_TYPE, + dif_task_params->protection_type); + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_INITIAL_REF_TAG_VALID, + dif_task_params->initial_ref_tag_is_valid ? 1 : 0); + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_KEEP_REF_TAG_CONST, + dif_task_params->keep_ref_tag_const ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_VALIDATE_GUARD, + (dif_task_params->validate_guard && + dif_task_params->dif_on_host) ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_VALIDATE_APP_TAG, + (dif_task_params->validate_app_tag && + dif_task_params->dif_on_host) ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_VALIDATE_REF_TAG, + (dif_task_params->validate_ref_tag && + dif_task_params->dif_on_host) ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_FORWARD_APP_TAG_WITH_MASK, + dif_task_params->forward_app_tag_with_mask ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_FORWARD_REF_TAG_WITH_MASK, + dif_task_params->forward_ref_tag_with_mask ? 
1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_REF_TAG_MASK, + dif_task_params->ref_tag_mask); + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_IGNORE_APP_TAG, + dif_task_params->ignore_app_tag ? 1 : 0); + } +} + +static void set_local_completion_context(struct e4_iscsi_task_context *context) +{ + SET_FIELD(context->ystorm_st_context.state.flags, + YSTORM_ISCSI_TASK_STATE_LOCAL_COMP, 1); + SET_FIELD(context->ustorm_st_context.flags, + USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP, 1); +} + +static int init_rw_iscsi_task(struct iscsi_task_params *task_params, + enum iscsi_task_type task_type, + struct iscsi_conn_params *conn_params, + struct iscsi_common_hdr *pdu_header, + struct scsi_sgl_task_params *sgl_task_params, + struct scsi_initiator_cmd_params *cmd_params, + struct scsi_dif_task_params *dif_task_params) +{ + u32 exp_data_transfer_len = conn_params->max_burst_length; + struct e4_iscsi_task_context *cxt; + bool slow_io = false; + u32 task_size, val; + u8 num_sges = 0; + + task_size = calc_rw_task_size(task_params, task_type, sgl_task_params, + dif_task_params); + + init_default_iscsi_task(task_params, (struct data_hdr *)pdu_header, + task_type); + + cxt = task_params->context; + + + if (task_type == ISCSI_TASK_TYPE_TARGET_READ) { + set_local_completion_context(cxt); + } else if (task_type == ISCSI_TASK_TYPE_TARGET_WRITE) { + val = cpu_to_le32(task_size + + ((struct iscsi_r2t_hdr *)pdu_header)->buffer_offset); + cxt->ystorm_st_context.pdu_hdr.r2t.desired_data_trns_len = val; + cxt->mstorm_st_context.expected_itt = + cpu_to_le32(pdu_header->itt); + } else { + val = cpu_to_le32(task_size); + cxt->ystorm_st_context.pdu_hdr.cmd.expected_transfer_length = + val; + init_initiator_rw_cdb_ystorm_context(&cxt->ystorm_st_context, + cmd_params); + val = cpu_to_le32(cmd_params->sense_data_buffer_phys_addr.lo); + cxt->mstorm_st_context.sense_db.lo = val; + + val = cpu_to_le32(cmd_params->sense_data_buffer_phys_addr.hi); + cxt->mstorm_st_context.sense_db.hi = val; + } + + if (task_params->tx_io_size) { + init_dif_context_flags(&cxt->ystorm_st_context.state.dif_flags, + dif_task_params); + init_dif_context_flags(&cxt->ustorm_st_context.dif_flags, + dif_task_params); + init_scsi_sgl_context(&cxt->ystorm_st_context.state.sgl_params, + &cxt->ystorm_st_context.state.data_desc, + sgl_task_params); + + slow_io = scsi_is_slow_sgl(sgl_task_params->num_sges, + sgl_task_params->small_mid_sge); + + num_sges = !slow_io ? min_t(u16, sgl_task_params->num_sges, + (u16)SCSI_NUM_SGES_SLOW_SGL_THR) : + ISCSI_WQE_NUM_SGES_SLOWIO; + + if (slow_io) { + SET_FIELD(cxt->ystorm_st_context.state.flags, + YSTORM_ISCSI_TASK_STATE_SLOW_IO, 1); + } + } else if (task_params->rx_io_size) { + init_dif_context_flags(&cxt->mstorm_st_context.dif_flags, + dif_task_params); + init_scsi_sgl_context(&cxt->mstorm_st_context.sgl_params, + &cxt->mstorm_st_context.data_desc, + sgl_task_params); + num_sges = !scsi_is_slow_sgl(sgl_task_params->num_sges, + sgl_task_params->small_mid_sge) ? + min_t(u16, sgl_task_params->num_sges, + (u16)SCSI_NUM_SGES_SLOW_SGL_THR) : + ISCSI_WQE_NUM_SGES_SLOWIO; + cxt->mstorm_st_context.rem_task_size = cpu_to_le32(task_size); + } + + if (exp_data_transfer_len > task_size || + task_type != ISCSI_TASK_TYPE_TARGET_WRITE) + exp_data_transfer_len = task_size; + + init_ustorm_task_contexts(&task_params->context->ustorm_st_context, + &task_params->context->ustorm_ag_context, + task_size, exp_data_transfer_len, num_sges, + dif_task_params ? 
+ dif_task_params->tx_dif_conn_err_en : false); + + set_rw_exp_data_acked_and_cont_len(task_params->context, conn_params, + task_type, task_size, + exp_data_transfer_len, + GET_FIELD(pdu_header->hdr_second_dword, + ISCSI_CMD_HDR_TOTAL_AHS_LEN)); + + if (dif_task_params) + init_rtdif_task_context(&task_params->context->rdif_context, + &task_params->context->tdif_context, + dif_task_params, task_type); + + init_sqe(task_params, sgl_task_params, dif_task_params, pdu_header, + cmd_params, task_type, false); + + return 0; +} + +int init_initiator_rw_iscsi_task(struct iscsi_task_params *task_params, + struct iscsi_conn_params *conn_params, + struct scsi_initiator_cmd_params *cmd_params, + struct iscsi_cmd_hdr *cmd_header, + struct scsi_sgl_task_params *tx_sgl_params, + struct scsi_sgl_task_params *rx_sgl_params, + struct scsi_dif_task_params *dif_task_params) +{ + if (GET_FIELD(cmd_header->flags_attr, ISCSI_CMD_HDR_WRITE)) + return init_rw_iscsi_task(task_params, + ISCSI_TASK_TYPE_INITIATOR_WRITE, + conn_params, + (struct iscsi_common_hdr *)cmd_header, + tx_sgl_params, cmd_params, + dif_task_params); + else if (GET_FIELD(cmd_header->flags_attr, ISCSI_CMD_HDR_READ) || + (task_params->rx_io_size == 0 && task_params->tx_io_size == 0)) + return init_rw_iscsi_task(task_params, + ISCSI_TASK_TYPE_INITIATOR_READ, + conn_params, + (struct iscsi_common_hdr *)cmd_header, + rx_sgl_params, cmd_params, + dif_task_params); + else + return -1; +} + +int init_initiator_login_request_task(struct iscsi_task_params *task_params, + struct iscsi_login_req_hdr *login_header, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params) +{ + struct e4_iscsi_task_context *cxt; + + cxt = task_params->context; + + init_default_iscsi_task(task_params, + (struct data_hdr *)login_header, + ISCSI_TASK_TYPE_MIDPATH); + + init_ustorm_task_contexts(&cxt->ustorm_st_context, + &cxt->ustorm_ag_context, + task_params->rx_io_size ? + rx_params->total_buffer_size : 0, + task_params->tx_io_size ? + tx_params->total_buffer_size : 0, 0, + 0); + + if (task_params->tx_io_size) + init_scsi_sgl_context(&cxt->ystorm_st_context.state.sgl_params, + &cxt->ystorm_st_context.state.data_desc, + tx_params); + + if (task_params->rx_io_size) + init_scsi_sgl_context(&cxt->mstorm_st_context.sgl_params, + &cxt->mstorm_st_context.data_desc, + rx_params); + + cxt->mstorm_st_context.rem_task_size = + cpu_to_le32(task_params->rx_io_size ? 
+ rx_params->total_buffer_size : 0); + + init_sqe(task_params, tx_params, NULL, + (struct iscsi_common_hdr *)login_header, NULL, + ISCSI_TASK_TYPE_MIDPATH, false); + + return 0; +} + +int init_initiator_nop_out_task(struct iscsi_task_params *task_params, + struct iscsi_nop_out_hdr *nop_out_pdu_header, + struct scsi_sgl_task_params *tx_sgl_task_params, + struct scsi_sgl_task_params *rx_sgl_task_params) +{ + struct e4_iscsi_task_context *cxt; + + cxt = task_params->context; + + init_default_iscsi_task(task_params, + (struct data_hdr *)nop_out_pdu_header, + ISCSI_TASK_TYPE_MIDPATH); + + if (nop_out_pdu_header->itt == ISCSI_ITT_ALL_ONES) + set_local_completion_context(task_params->context); + + if (task_params->tx_io_size) + init_scsi_sgl_context(&cxt->ystorm_st_context.state.sgl_params, + &cxt->ystorm_st_context.state.data_desc, + tx_sgl_task_params); + + if (task_params->rx_io_size) + init_scsi_sgl_context(&cxt->mstorm_st_context.sgl_params, + &cxt->mstorm_st_context.data_desc, + rx_sgl_task_params); + + init_ustorm_task_contexts(&cxt->ustorm_st_context, + &cxt->ustorm_ag_context, + task_params->rx_io_size ? + rx_sgl_task_params->total_buffer_size : 0, + task_params->tx_io_size ? + tx_sgl_task_params->total_buffer_size : 0, + 0, 0); + + cxt->mstorm_st_context.rem_task_size = + cpu_to_le32(task_params->rx_io_size ? + rx_sgl_task_params->total_buffer_size : + 0); + + init_sqe(task_params, tx_sgl_task_params, NULL, + (struct iscsi_common_hdr *)nop_out_pdu_header, NULL, + ISCSI_TASK_TYPE_MIDPATH, false); + + return 0; +} + +int init_initiator_logout_request_task(struct iscsi_task_params *task_params, + struct iscsi_logout_req_hdr *logout_hdr, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params) +{ + struct e4_iscsi_task_context *cxt; + + cxt = task_params->context; + + init_default_iscsi_task(task_params, + (struct data_hdr *)logout_hdr, + ISCSI_TASK_TYPE_MIDPATH); + + if (task_params->tx_io_size) + init_scsi_sgl_context(&cxt->ystorm_st_context.state.sgl_params, + &cxt->ystorm_st_context.state.data_desc, + tx_params); + + if (task_params->rx_io_size) + init_scsi_sgl_context(&cxt->mstorm_st_context.sgl_params, + &cxt->mstorm_st_context.data_desc, + rx_params); + + init_ustorm_task_contexts(&cxt->ustorm_st_context, + &cxt->ustorm_ag_context, + task_params->rx_io_size ? + rx_params->total_buffer_size : 0, + task_params->tx_io_size ? + tx_params->total_buffer_size : 0, + 0, 0); + + cxt->mstorm_st_context.rem_task_size = + cpu_to_le32(task_params->rx_io_size ? 
+ rx_params->total_buffer_size : 0); + + init_sqe(task_params, tx_params, NULL, + (struct iscsi_common_hdr *)logout_hdr, NULL, + ISCSI_TASK_TYPE_MIDPATH, false); + + return 0; +} + +int init_initiator_tmf_request_task(struct iscsi_task_params *task_params, + struct iscsi_tmf_request_hdr *tmf_header) +{ + init_default_iscsi_task(task_params, (struct data_hdr *)tmf_header, + ISCSI_TASK_TYPE_MIDPATH); + + init_sqe(task_params, NULL, NULL, + (struct iscsi_common_hdr *)tmf_header, NULL, + ISCSI_TASK_TYPE_MIDPATH, false); + + return 0; +} + +int init_initiator_text_request_task(struct iscsi_task_params *task_params, + struct iscsi_text_request_hdr *text_header, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params) +{ + struct e4_iscsi_task_context *cxt; + + cxt = task_params->context; + + init_default_iscsi_task(task_params, + (struct data_hdr *)text_header, + ISCSI_TASK_TYPE_MIDPATH); + + if (task_params->tx_io_size) + init_scsi_sgl_context(&cxt->ystorm_st_context.state.sgl_params, + &cxt->ystorm_st_context.state.data_desc, + tx_params); + + if (task_params->rx_io_size) + init_scsi_sgl_context(&cxt->mstorm_st_context.sgl_params, + &cxt->mstorm_st_context.data_desc, + rx_params); + + cxt->mstorm_st_context.rem_task_size = + cpu_to_le32(task_params->rx_io_size ? + rx_params->total_buffer_size : 0); + + init_ustorm_task_contexts(&cxt->ustorm_st_context, + &cxt->ustorm_ag_context, + task_params->rx_io_size ? + rx_params->total_buffer_size : 0, + task_params->tx_io_size ? + tx_params->total_buffer_size : 0, 0, 0); + + init_sqe(task_params, tx_params, NULL, + (struct iscsi_common_hdr *)text_header, NULL, + ISCSI_TASK_TYPE_MIDPATH, false); + + return 0; +} + +int init_cleanup_task(struct iscsi_task_params *task_params) +{ + init_sqe(task_params, NULL, NULL, NULL, NULL, ISCSI_TASK_TYPE_MIDPATH, + true); + return 0; +} diff --git a/drivers/scsi/qedi/qedi_fw_iscsi.h b/drivers/scsi/qedi/qedi_fw_iscsi.h new file mode 100644 index 0000000..c3deb77 --- /dev/null +++ b/drivers/scsi/qedi/qedi_fw_iscsi.h @@ -0,0 +1,117 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef _QEDI_FW_ISCSI_H_ +#define _QEDI_FW_ISCSI_H_ + +#include "qedi_fw_scsi.h" + +struct iscsi_task_params { + struct e4_iscsi_task_context *context; + struct iscsi_wqe *sqe; + u32 tx_io_size; + u32 rx_io_size; + u16 conn_icid; + u16 itid; + u8 cq_rss_number; +}; + +struct iscsi_conn_params { + u32 first_burst_length; + u32 max_send_pdu_length; + u32 max_burst_length; + bool initial_r2t; + bool immediate_data; +}; + +/* @brief init_initiator_read_iscsi_task - initializes iSCSI Initiator Read + * task context. 
+ * + * @param task_params - Pointer to task parameters struct + * @param conn_params - Connection Parameters + * @param cmd_params - command specific parameters + * @param cmd_pdu_header - PDU Header Parameters + * @param sgl_task_params - Pointer to SGL task params + * @param dif_task_params - Pointer to DIF parameters struct + */ +int init_initiator_rw_iscsi_task(struct iscsi_task_params *task_params, + struct iscsi_conn_params *conn_params, + struct scsi_initiator_cmd_params *cmd_params, + struct iscsi_cmd_hdr *cmd_pdu_header, + struct scsi_sgl_task_params *tx_sgl_params, + struct scsi_sgl_task_params *rx_sgl_params, + struct scsi_dif_task_params *dif_task_params); + +/* @brief init_initiator_login_request_task - initializes iSCSI Initiator Login + * Request task context. + * + * @param task_params - Pointer to task parameters struct + * @param login_req_pdu_header - PDU Header Parameters + * @param tx_sgl_task_params - Pointer to SGL task params + * @param rx_sgl_task_params - Pointer to SGL task params + */ +int init_initiator_login_request_task(struct iscsi_task_params *task_params, + struct iscsi_login_req_hdr *login_header, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params); + +/* @brief init_initiator_nop_out_task - initializes iSCSI Initiator NOP Out + * task context. + * + * @param task_params - Pointer to task parameters struct + * @param nop_out_pdu_header - PDU Header Parameters + * @param tx_sgl_task_params - Pointer to SGL task params + * @param rx_sgl_task_params - Pointer to SGL task params + */ +int init_initiator_nop_out_task(struct iscsi_task_params *task_params, + struct iscsi_nop_out_hdr *nop_out_pdu_header, + struct scsi_sgl_task_params *tx_sgl_params, + struct scsi_sgl_task_params *rx_sgl_params); + +/* @brief init_initiator_logout_request_task - initializes iSCSI Initiator + * Logout Request task context. + * + * @param task_params - Pointer to task parameters struct + * @param logout_pdu_header - PDU Header Parameters + * @param tx_sgl_task_params - Pointer to SGL task params + * @param rx_sgl_task_params - Pointer to SGL task params + */ +int init_initiator_logout_request_task(struct iscsi_task_params *task_params, + struct iscsi_logout_req_hdr *logout_hdr, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params); + +/* @brief init_initiator_tmf_request_task - initializes iSCSI Initiator TMF + * task context. + * + * @param task_params - Pointer to task parameters struct + * @param tmf_pdu_header - PDU Header Parameters + */ +int init_initiator_tmf_request_task(struct iscsi_task_params *task_params, + struct iscsi_tmf_request_hdr *tmf_header); + +/* @brief init_initiator_text_request_task - initializes iSCSI Initiator Text + * Request task context. 
+ * + * @param task_params - Pointer to task parameters struct + * @param text_request_pdu_header - PDU Header Parameters + * @param tx_sgl_task_params - Pointer to Tx SGL task params + * @param rx_sgl_task_params - Pointer to Rx SGL task params + */ +int init_initiator_text_request_task(struct iscsi_task_params *task_params, + struct iscsi_text_request_hdr *text_header, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params); + +/* @brief init_cleanup_task - initializes Clean task (SQE) + * + * @param task_params - Pointer to task parameters struct + */ +int init_cleanup_task(struct iscsi_task_params *task_params); +#endif diff --git a/drivers/scsi/qedi/qedi_fw_scsi.h b/drivers/scsi/qedi/qedi_fw_scsi.h new file mode 100644 index 0000000..cdaf918 --- /dev/null +++ b/drivers/scsi/qedi/qedi_fw_scsi.h @@ -0,0 +1,55 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef _QEDI_FW_SCSI_H_ +#define _QEDI_FW_SCSI_H_ + +#include +#include +#include "qedi_hsi.h" +#include + +struct scsi_sgl_task_params { + struct scsi_sge *sgl; + struct regpair sgl_phys_addr; + u32 total_buffer_size; + u16 num_sges; + bool small_mid_sge; +}; + +struct scsi_dif_task_params { + u32 initial_ref_tag; + bool initial_ref_tag_is_valid; + u16 application_tag; + u16 application_tag_mask; + u16 dif_block_size_log; + bool dif_on_network; + bool dif_on_host; + u8 host_guard_type; + u8 protection_type; + u8 ref_tag_mask; + bool crc_seed; + bool tx_dif_conn_err_en; + bool ignore_app_tag; + bool keep_ref_tag_const; + bool validate_guard; + bool validate_app_tag; + bool validate_ref_tag; + bool forward_guard; + bool forward_app_tag; + bool forward_ref_tag; + bool forward_app_tag_with_mask; + bool forward_ref_tag_with_mask; +}; + +struct scsi_initiator_cmd_params { + struct scsi_sge extended_cdb_sge; + struct regpair sense_data_buffer_phys_addr; +}; +#endif diff --git a/drivers/scsi/qedi/qedi_gbl.h b/drivers/scsi/qedi/qedi_gbl.h new file mode 100644 index 0000000..a2aa06e --- /dev/null +++ b/drivers/scsi/qedi/qedi_gbl.h @@ -0,0 +1,80 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ + +#ifndef _QEDI_GBL_H_ +#define _QEDI_GBL_H_ + +#include "qedi_iscsi.h" + +#ifdef CONFIG_DEBUG_FS +extern int qedi_do_not_recover; +#else +#define qedi_do_not_recover (0) +#endif + +extern uint qedi_io_tracing; + +extern struct scsi_host_template qedi_host_template; +extern struct iscsi_transport qedi_iscsi_transport; +extern const struct qed_iscsi_ops *qedi_ops; +extern const struct qedi_debugfs_ops qedi_debugfs_ops[]; +extern const struct file_operations qedi_dbg_fops[]; +extern struct device_attribute *qedi_shost_attrs[]; + +int qedi_alloc_sq(struct qedi_ctx *qedi, struct qedi_endpoint *ep); +void qedi_free_sq(struct qedi_ctx *qedi, struct qedi_endpoint *ep); + +int qedi_send_iscsi_login(struct qedi_conn *qedi_conn, + struct iscsi_task *task); +int qedi_send_iscsi_logout(struct qedi_conn *qedi_conn, + struct iscsi_task *task); +int qedi_iscsi_abort_work(struct qedi_conn *qedi_conn, + struct iscsi_task *mtask); +int qedi_send_iscsi_text(struct qedi_conn *qedi_conn, + struct iscsi_task *task); +int qedi_send_iscsi_nopout(struct qedi_conn *qedi_conn, + struct iscsi_task *task, + char *datap, int data_len, int unsol); +int qedi_iscsi_send_ioreq(struct iscsi_task *task); +int qedi_get_task_idx(struct qedi_ctx *qedi); +void qedi_clear_task_idx(struct qedi_ctx *qedi, int idx); +int qedi_iscsi_cleanup_task(struct iscsi_task *task, + bool mark_cmd_node_deleted); +void qedi_iscsi_unmap_sg_list(struct qedi_cmd *cmd); +void qedi_update_itt_map(struct qedi_ctx *qedi, u32 tid, u32 proto_itt, + struct qedi_cmd *qedi_cmd); +void qedi_get_proto_itt(struct qedi_ctx *qedi, u32 tid, u32 *proto_itt); +void qedi_get_task_tid(struct qedi_ctx *qedi, u32 itt, int16_t *tid); +void qedi_process_iscsi_error(struct qedi_endpoint *ep, + struct iscsi_eqe_data *data); +void qedi_start_conn_recovery(struct qedi_ctx *qedi, + struct qedi_conn *qedi_conn); +struct qedi_conn *qedi_get_conn_from_id(struct qedi_ctx *qedi, u32 iscsi_cid); +void qedi_process_tcp_error(struct qedi_endpoint *ep, + struct iscsi_eqe_data *data); +void qedi_mark_device_missing(struct iscsi_cls_session *cls_session); +void qedi_mark_device_available(struct iscsi_cls_session *cls_session); +void qedi_reset_host_mtu(struct qedi_ctx *qedi, u16 mtu); +int qedi_recover_all_conns(struct qedi_ctx *qedi); +void qedi_fp_process_cqes(struct qedi_work *work); +int qedi_cleanup_all_io(struct qedi_ctx *qedi, + struct qedi_conn *qedi_conn, + struct iscsi_task *task, bool in_recovery); +void qedi_trace_io(struct qedi_ctx *qedi, struct iscsi_task *task, + u16 tid, int8_t direction); +int qedi_alloc_id(struct qedi_portid_tbl *id_tbl, u16 id); +u16 qedi_alloc_new_id(struct qedi_portid_tbl *id_tbl); +void qedi_free_id(struct qedi_portid_tbl *id_tbl, u16 id); +int qedi_create_sysfs_ctx_attr(struct qedi_ctx *qedi); +void qedi_remove_sysfs_ctx_attr(struct qedi_ctx *qedi); +void qedi_clearsq(struct qedi_ctx *qedi, + struct qedi_conn *qedi_conn, + struct iscsi_task *task); + +#endif diff --git a/drivers/scsi/qedi/qedi_hsi.h b/drivers/scsi/qedi/qedi_hsi.h new file mode 100644 index 0000000..8ca44c7 --- /dev/null +++ b/drivers/scsi/qedi/qedi_hsi.h @@ -0,0 +1,52 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ +#ifndef __QEDI_HSI__ +#define __QEDI_HSI__ +/* + * Add include to common target + */ +#include + +/* + * Add include to common storage target + */ +#include + +/* + * Add include to common TCP target + */ +#include + +/* + * Add include to common iSCSI target for both eCore and protocol driver + */ +#include + +/* + * iSCSI CMDQ element + */ +struct iscsi_cmdqe { + __le16 conn_id; + u8 invalid_command; + u8 cmd_hdr_type; + __le32 reserved1[2]; + __le32 cmd_payload[13]; +}; + +/* + * iSCSI CMD header type + */ +enum iscsi_cmd_hdr_type { + ISCSI_CMD_HDR_TYPE_BHS_ONLY /* iSCSI BHS with no expected AHS */, + ISCSI_CMD_HDR_TYPE_BHS_W_AHS /* iSCSI BHS with expected AHS */, + ISCSI_CMD_HDR_TYPE_AHS /* iSCSI AHS */, + MAX_ISCSI_CMD_HDR_TYPE +}; + +#endif /* __QEDI_HSI__ */ diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c new file mode 100644 index 0000000..7ec7f6e --- /dev/null +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -0,0 +1,1622 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#include +#include +#include +#include +#include + +#include "qedi.h" +#include "qedi_iscsi.h" +#include "qedi_gbl.h" + +int qedi_recover_all_conns(struct qedi_ctx *qedi) +{ + struct qedi_conn *qedi_conn; + int i; + + for (i = 0; i < qedi->max_active_conns; i++) { + qedi_conn = qedi_get_conn_from_id(qedi, i); + if (!qedi_conn) + continue; + + qedi_start_conn_recovery(qedi, qedi_conn); + } + + return SUCCESS; +} + +static int qedi_eh_host_reset(struct scsi_cmnd *cmd) +{ + struct Scsi_Host *shost = cmd->device->host; + struct qedi_ctx *qedi; + + qedi = iscsi_host_priv(shost); + + return qedi_recover_all_conns(qedi); +} + +struct scsi_host_template qedi_host_template = { + .module = THIS_MODULE, + .name = "QLogic QEDI 25/40/100Gb iSCSI Initiator Driver", + .proc_name = QEDI_MODULE_NAME, + .queuecommand = iscsi_queuecommand, + .eh_timed_out = iscsi_eh_cmd_timed_out, + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler = iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_recover_target, + .eh_host_reset_handler = qedi_eh_host_reset, + .target_alloc = iscsi_target_alloc, + .change_queue_depth = scsi_change_queue_depth, + .can_queue = QEDI_MAX_ISCSI_TASK, + .this_id = -1, + .sg_tablesize = QEDI_ISCSI_MAX_BDS_PER_CMD, + .max_sectors = 0xffff, + .dma_boundary = QEDI_HW_DMA_BOUNDARY, + .cmd_per_lun = 128, + .use_clustering = ENABLE_CLUSTERING, + .shost_attrs = qedi_shost_attrs, +}; + +static void qedi_conn_free_login_resources(struct qedi_ctx *qedi, + struct qedi_conn *qedi_conn) +{ + if (qedi_conn->gen_pdu.resp_bd_tbl) { + dma_free_coherent(&qedi->pdev->dev, QEDI_PAGE_SIZE, + qedi_conn->gen_pdu.resp_bd_tbl, + qedi_conn->gen_pdu.resp_bd_dma); + qedi_conn->gen_pdu.resp_bd_tbl = NULL; + } + + if (qedi_conn->gen_pdu.req_bd_tbl) { + dma_free_coherent(&qedi->pdev->dev, QEDI_PAGE_SIZE, + qedi_conn->gen_pdu.req_bd_tbl, + qedi_conn->gen_pdu.req_bd_dma); + qedi_conn->gen_pdu.req_bd_tbl = NULL; + } + + if (qedi_conn->gen_pdu.resp_buf) { + dma_free_coherent(&qedi->pdev->dev, + ISCSI_DEF_MAX_RECV_SEG_LEN, + qedi_conn->gen_pdu.resp_buf, + qedi_conn->gen_pdu.resp_dma_addr); + qedi_conn->gen_pdu.resp_buf = NULL; + } + + if (qedi_conn->gen_pdu.req_buf) { + dma_free_coherent(&qedi->pdev->dev, + ISCSI_DEF_MAX_RECV_SEG_LEN, + qedi_conn->gen_pdu.req_buf, + 
qedi_conn->gen_pdu.req_dma_addr); + qedi_conn->gen_pdu.req_buf = NULL; + } +} + +static int qedi_conn_alloc_login_resources(struct qedi_ctx *qedi, + struct qedi_conn *qedi_conn) +{ + qedi_conn->gen_pdu.req_buf = + dma_alloc_coherent(&qedi->pdev->dev, + ISCSI_DEF_MAX_RECV_SEG_LEN, + &qedi_conn->gen_pdu.req_dma_addr, + GFP_KERNEL); + if (!qedi_conn->gen_pdu.req_buf) + goto login_req_buf_failure; + + qedi_conn->gen_pdu.req_buf_size = 0; + qedi_conn->gen_pdu.req_wr_ptr = qedi_conn->gen_pdu.req_buf; + + qedi_conn->gen_pdu.resp_buf = + dma_alloc_coherent(&qedi->pdev->dev, + ISCSI_DEF_MAX_RECV_SEG_LEN, + &qedi_conn->gen_pdu.resp_dma_addr, + GFP_KERNEL); + if (!qedi_conn->gen_pdu.resp_buf) + goto login_resp_buf_failure; + + qedi_conn->gen_pdu.resp_buf_size = ISCSI_DEF_MAX_RECV_SEG_LEN; + qedi_conn->gen_pdu.resp_wr_ptr = qedi_conn->gen_pdu.resp_buf; + + qedi_conn->gen_pdu.req_bd_tbl = + dma_alloc_coherent(&qedi->pdev->dev, QEDI_PAGE_SIZE, + &qedi_conn->gen_pdu.req_bd_dma, GFP_KERNEL); + if (!qedi_conn->gen_pdu.req_bd_tbl) + goto login_req_bd_tbl_failure; + + qedi_conn->gen_pdu.resp_bd_tbl = + dma_alloc_coherent(&qedi->pdev->dev, QEDI_PAGE_SIZE, + &qedi_conn->gen_pdu.resp_bd_dma, + GFP_KERNEL); + if (!qedi_conn->gen_pdu.resp_bd_tbl) + goto login_resp_bd_tbl_failure; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SESS, + "Allocation successful, cid=0x%x\n", + qedi_conn->iscsi_conn_id); + return 0; + +login_resp_bd_tbl_failure: + dma_free_coherent(&qedi->pdev->dev, QEDI_PAGE_SIZE, + qedi_conn->gen_pdu.req_bd_tbl, + qedi_conn->gen_pdu.req_bd_dma); + qedi_conn->gen_pdu.req_bd_tbl = NULL; + +login_req_bd_tbl_failure: + dma_free_coherent(&qedi->pdev->dev, ISCSI_DEF_MAX_RECV_SEG_LEN, + qedi_conn->gen_pdu.resp_buf, + qedi_conn->gen_pdu.resp_dma_addr); + qedi_conn->gen_pdu.resp_buf = NULL; +login_resp_buf_failure: + dma_free_coherent(&qedi->pdev->dev, ISCSI_DEF_MAX_RECV_SEG_LEN, + qedi_conn->gen_pdu.req_buf, + qedi_conn->gen_pdu.req_dma_addr); + qedi_conn->gen_pdu.req_buf = NULL; +login_req_buf_failure: + iscsi_conn_printk(KERN_ERR, qedi_conn->cls_conn->dd_data, + "login resource alloc failed!!\n"); + return -ENOMEM; +} + +static void qedi_destroy_cmd_pool(struct qedi_ctx *qedi, + struct iscsi_session *session) +{ + int i; + + for (i = 0; i < session->cmds_max; i++) { + struct iscsi_task *task = session->cmds[i]; + struct qedi_cmd *cmd = task->dd_data; + + if (cmd->io_tbl.sge_tbl) + dma_free_coherent(&qedi->pdev->dev, + QEDI_ISCSI_MAX_BDS_PER_CMD * + sizeof(struct scsi_sge), + cmd->io_tbl.sge_tbl, + cmd->io_tbl.sge_tbl_dma); + + if (cmd->sense_buffer) + dma_free_coherent(&qedi->pdev->dev, + SCSI_SENSE_BUFFERSIZE, + cmd->sense_buffer, + cmd->sense_buffer_dma); + } +} + +static int qedi_alloc_sget(struct qedi_ctx *qedi, struct iscsi_session *session, + struct qedi_cmd *cmd) +{ + struct qedi_io_bdt *io = &cmd->io_tbl; + struct scsi_sge *sge; + + io->sge_tbl = dma_alloc_coherent(&qedi->pdev->dev, + QEDI_ISCSI_MAX_BDS_PER_CMD * + sizeof(*sge), + &io->sge_tbl_dma, GFP_KERNEL); + if (!io->sge_tbl) { + iscsi_session_printk(KERN_ERR, session, + "Could not allocate BD table.\n"); + return -ENOMEM; + } + + io->sge_valid = 0; + return 0; +} + +static int qedi_setup_cmd_pool(struct qedi_ctx *qedi, + struct iscsi_session *session) +{ + int i; + + for (i = 0; i < session->cmds_max; i++) { + struct iscsi_task *task = session->cmds[i]; + struct qedi_cmd *cmd = task->dd_data; + + task->hdr = &cmd->hdr; + task->hdr_max = sizeof(struct iscsi_hdr); + + if (qedi_alloc_sget(qedi, session, cmd)) + goto free_sgets; + + cmd->sense_buffer = 
dma_alloc_coherent(&qedi->pdev->dev, + SCSI_SENSE_BUFFERSIZE, + &cmd->sense_buffer_dma, + GFP_KERNEL); + if (!cmd->sense_buffer) + goto free_sgets; + } + + return 0; + +free_sgets: + qedi_destroy_cmd_pool(qedi, session); + return -ENOMEM; +} + +static struct iscsi_cls_session * +qedi_session_create(struct iscsi_endpoint *ep, u16 cmds_max, + u16 qdepth, uint32_t initial_cmdsn) +{ + struct Scsi_Host *shost; + struct iscsi_cls_session *cls_session; + struct qedi_ctx *qedi; + struct qedi_endpoint *qedi_ep; + + if (!ep) + return NULL; + + qedi_ep = ep->dd_data; + shost = qedi_ep->qedi->shost; + qedi = iscsi_host_priv(shost); + + if (cmds_max > qedi->max_sqes) + cmds_max = qedi->max_sqes; + else if (cmds_max < QEDI_SQ_WQES_MIN) + cmds_max = QEDI_SQ_WQES_MIN; + + cls_session = iscsi_session_setup(&qedi_iscsi_transport, shost, + cmds_max, 0, sizeof(struct qedi_cmd), + initial_cmdsn, ISCSI_MAX_TARGET); + if (!cls_session) { + QEDI_ERR(&qedi->dbg_ctx, + "Failed to setup session for ep=%p\n", qedi_ep); + return NULL; + } + + if (qedi_setup_cmd_pool(qedi, cls_session->dd_data)) { + QEDI_ERR(&qedi->dbg_ctx, + "Failed to setup cmd pool for ep=%p\n", qedi_ep); + goto session_teardown; + } + + return cls_session; + +session_teardown: + iscsi_session_teardown(cls_session); + return NULL; +} + +static void qedi_session_destroy(struct iscsi_cls_session *cls_session) +{ + struct iscsi_session *session = cls_session->dd_data; + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); + struct qedi_ctx *qedi = iscsi_host_priv(shost); + + qedi_destroy_cmd_pool(qedi, session); + iscsi_session_teardown(cls_session); +} + +static struct iscsi_cls_conn * +qedi_conn_create(struct iscsi_cls_session *cls_session, uint32_t cid) +{ + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); + struct qedi_ctx *qedi = iscsi_host_priv(shost); + struct iscsi_cls_conn *cls_conn; + struct qedi_conn *qedi_conn; + struct iscsi_conn *conn; + + cls_conn = iscsi_conn_setup(cls_session, sizeof(*qedi_conn), + cid); + if (!cls_conn) { + QEDI_ERR(&qedi->dbg_ctx, + "conn_new: iscsi conn setup failed, cid=0x%x, cls_sess=%p!\n", + cid, cls_session); + return NULL; + } + + conn = cls_conn->dd_data; + qedi_conn = conn->dd_data; + qedi_conn->cls_conn = cls_conn; + qedi_conn->qedi = qedi; + qedi_conn->ep = NULL; + qedi_conn->active_cmd_count = 0; + INIT_LIST_HEAD(&qedi_conn->active_cmd_list); + spin_lock_init(&qedi_conn->list_lock); + + if (qedi_conn_alloc_login_resources(qedi, qedi_conn)) { + iscsi_conn_printk(KERN_ALERT, conn, + "conn_new: login resc alloc failed, cid=0x%x, cls_sess=%p!!\n", + cid, cls_session); + goto free_conn; + } + + return cls_conn; + +free_conn: + iscsi_conn_teardown(cls_conn); + return NULL; +} + +void qedi_mark_device_missing(struct iscsi_cls_session *cls_session) +{ + iscsi_block_session(cls_session); +} + +void qedi_mark_device_available(struct iscsi_cls_session *cls_session) +{ + iscsi_unblock_session(cls_session); +} + +static int qedi_bind_conn_to_iscsi_cid(struct qedi_ctx *qedi, + struct qedi_conn *qedi_conn) +{ + u32 iscsi_cid = qedi_conn->iscsi_conn_id; + + if (qedi->cid_que.conn_cid_tbl[iscsi_cid]) { + iscsi_conn_printk(KERN_ALERT, qedi_conn->cls_conn->dd_data, + "conn bind - entry #%d not free\n", + iscsi_cid); + return -EBUSY; + } + + qedi->cid_que.conn_cid_tbl[iscsi_cid] = qedi_conn; + return 0; +} + +struct qedi_conn *qedi_get_conn_from_id(struct qedi_ctx *qedi, u32 iscsi_cid) +{ + if (!qedi->cid_que.conn_cid_tbl) { + QEDI_ERR(&qedi->dbg_ctx, "missing conn<->cid table\n"); + return NULL; + + 
} else if (iscsi_cid >= qedi->max_active_conns) { + QEDI_ERR(&qedi->dbg_ctx, "wrong cid #%d\n", iscsi_cid); + return NULL; + } + return qedi->cid_que.conn_cid_tbl[iscsi_cid]; +} + +static int qedi_conn_bind(struct iscsi_cls_session *cls_session, + struct iscsi_cls_conn *cls_conn, + u64 transport_fd, int is_leading) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct qedi_conn *qedi_conn = conn->dd_data; + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); + struct qedi_ctx *qedi = iscsi_host_priv(shost); + struct qedi_endpoint *qedi_ep; + struct iscsi_endpoint *ep; + + ep = iscsi_lookup_endpoint(transport_fd); + if (!ep) + return -EINVAL; + + qedi_ep = ep->dd_data; + if ((qedi_ep->state == EP_STATE_TCP_FIN_RCVD) || + (qedi_ep->state == EP_STATE_TCP_RST_RCVD)) + return -EINVAL; + + if (iscsi_conn_bind(cls_session, cls_conn, is_leading)) + return -EINVAL; + + qedi_ep->conn = qedi_conn; + qedi_conn->ep = qedi_ep; + qedi_conn->iscsi_conn_id = qedi_ep->iscsi_cid; + qedi_conn->fw_cid = qedi_ep->fw_cid; + qedi_conn->cmd_cleanup_req = 0; + qedi_conn->cmd_cleanup_cmpl = 0; + + if (qedi_bind_conn_to_iscsi_cid(qedi, qedi_conn)) + return -EINVAL; + + spin_lock_init(&qedi_conn->tmf_work_lock); + INIT_LIST_HEAD(&qedi_conn->tmf_work_list); + init_waitqueue_head(&qedi_conn->wait_queue); + return 0; +} + +static int qedi_iscsi_update_conn(struct qedi_ctx *qedi, + struct qedi_conn *qedi_conn) +{ + struct qed_iscsi_params_update *conn_info; + struct iscsi_cls_conn *cls_conn = qedi_conn->cls_conn; + struct iscsi_conn *conn = cls_conn->dd_data; + struct qedi_endpoint *qedi_ep; + int rval; + + qedi_ep = qedi_conn->ep; + + conn_info = kzalloc(sizeof(*conn_info), GFP_KERNEL); + if (!conn_info) { + QEDI_ERR(&qedi->dbg_ctx, "memory alloc failed\n"); + return -ENOMEM; + } + + conn_info->update_flag = 0; + + if (conn->hdrdgst_en) + SET_FIELD(conn_info->update_flag, + ISCSI_CONN_UPDATE_RAMROD_PARAMS_HD_EN, true); + if (conn->datadgst_en) + SET_FIELD(conn_info->update_flag, + ISCSI_CONN_UPDATE_RAMROD_PARAMS_DD_EN, true); + if (conn->session->initial_r2t_en) + SET_FIELD(conn_info->update_flag, + ISCSI_CONN_UPDATE_RAMROD_PARAMS_INITIAL_R2T, + true); + if (conn->session->imm_data_en) + SET_FIELD(conn_info->update_flag, + ISCSI_CONN_UPDATE_RAMROD_PARAMS_IMMEDIATE_DATA, + true); + + conn_info->max_seq_size = conn->session->max_burst; + conn_info->max_recv_pdu_length = conn->max_recv_dlength; + conn_info->max_send_pdu_length = conn->max_xmit_dlength; + conn_info->first_seq_length = conn->session->first_burst; + conn_info->exp_stat_sn = conn->exp_statsn; + + rval = qedi_ops->update_conn(qedi->cdev, qedi_ep->handle, + conn_info); + if (rval) { + rval = -ENXIO; + QEDI_ERR(&qedi->dbg_ctx, "Could not update connection\n"); + } + + kfree(conn_info); + return rval; +} + +static u16 qedi_calc_mss(u16 pmtu, u8 is_ipv6, u8 tcp_ts_en, u8 vlan_en) +{ + u16 mss = 0; + u16 hdrs = TCP_HDR_LEN; + + if (is_ipv6) + hdrs += IPV6_HDR_LEN; + else + hdrs += IPV4_HDR_LEN; + + if (vlan_en) + hdrs += VLAN_LEN; + + mss = pmtu - hdrs; + + if (tcp_ts_en) + mss -= TCP_OPTION_LEN; + + if (!mss) + mss = DEF_MSS; + + return mss; +} + +static int qedi_iscsi_offload_conn(struct qedi_endpoint *qedi_ep) +{ + struct qedi_ctx *qedi = qedi_ep->qedi; + struct qed_iscsi_params_offload *conn_info; + int rval; + int i; + + conn_info = kzalloc(sizeof(*conn_info), GFP_KERNEL); + if (!conn_info) { + QEDI_ERR(&qedi->dbg_ctx, + "Failed to allocate memory ep=%p\n", qedi_ep); + return -ENOMEM; + } + + ether_addr_copy(conn_info->src.mac, qedi_ep->src_mac); 
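+ /* Fill in the offload request: L2 addresses here; IP/port, TCP options and timer values below. */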
+ ether_addr_copy(conn_info->dst.mac, qedi_ep->dst_mac); + + conn_info->src.ip[0] = ntohl(qedi_ep->src_addr[0]); + conn_info->dst.ip[0] = ntohl(qedi_ep->dst_addr[0]); + + if (qedi_ep->ip_type == TCP_IPV4) { + conn_info->ip_version = 0; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "After ntohl: src_addr=%pI4, dst_addr=%pI4\n", + qedi_ep->src_addr, qedi_ep->dst_addr); + } else { + for (i = 1; i < 4; i++) { + conn_info->src.ip[i] = ntohl(qedi_ep->src_addr[i]); + conn_info->dst.ip[i] = ntohl(qedi_ep->dst_addr[i]); + } + + conn_info->ip_version = 1; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "After ntohl: src_addr=%pI6, dst_addr=%pI6\n", + qedi_ep->src_addr, qedi_ep->dst_addr); + } + + conn_info->src.port = qedi_ep->src_port; + conn_info->dst.port = qedi_ep->dst_port; + + conn_info->layer_code = ISCSI_SLOW_PATH_LAYER_CODE; + conn_info->sq_pbl_addr = qedi_ep->sq_pbl_dma; + conn_info->vlan_id = qedi_ep->vlan_id; + + SET_FIELD(conn_info->tcp_flags, TCP_OFFLOAD_PARAMS_TS_EN, 1); + SET_FIELD(conn_info->tcp_flags, TCP_OFFLOAD_PARAMS_DA_EN, 1); + SET_FIELD(conn_info->tcp_flags, TCP_OFFLOAD_PARAMS_DA_CNT_EN, 1); + SET_FIELD(conn_info->tcp_flags, TCP_OFFLOAD_PARAMS_KA_EN, 1); + + conn_info->default_cq = (qedi_ep->fw_cid % qedi->num_queues); + + conn_info->ka_max_probe_cnt = DEF_KA_MAX_PROBE_COUNT; + conn_info->dup_ack_theshold = 3; + conn_info->rcv_wnd = 65535; + + conn_info->ss_thresh = 65535; + conn_info->srtt = 300; + conn_info->rtt_var = 150; + conn_info->flow_label = 0; + conn_info->ka_timeout = DEF_KA_TIMEOUT; + conn_info->ka_interval = DEF_KA_INTERVAL; + conn_info->max_rt_time = DEF_MAX_RT_TIME; + conn_info->ttl = DEF_TTL; + conn_info->tos_or_tc = DEF_TOS; + conn_info->remote_port = qedi_ep->dst_port; + conn_info->local_port = qedi_ep->src_port; + + conn_info->mss = qedi_calc_mss(qedi_ep->pmtu, + (qedi_ep->ip_type == TCP_IPV6), + 1, (qedi_ep->vlan_id != 0)); + + conn_info->cwnd = DEF_MAX_CWND * conn_info->mss; + conn_info->rcv_wnd_scale = 4; + conn_info->da_timeout_value = 200; + conn_info->ack_frequency = 2; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Default cq index [%d], mss [%d]\n", + conn_info->default_cq, conn_info->mss); + + rval = qedi_ops->offload_conn(qedi->cdev, qedi_ep->handle, conn_info); + if (rval) + QEDI_ERR(&qedi->dbg_ctx, "offload_conn returned %d, ep=%p\n", + rval, qedi_ep); + + kfree(conn_info); + return rval; +} + +static int qedi_conn_start(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct qedi_conn *qedi_conn = conn->dd_data; + struct qedi_ctx *qedi; + int rval; + + qedi = qedi_conn->qedi; + + rval = qedi_iscsi_update_conn(qedi, qedi_conn); + if (rval) { + iscsi_conn_printk(KERN_ALERT, conn, + "conn_start: FW oflload conn failed.\n"); + rval = -EINVAL; + goto start_err; + } + + clear_bit(QEDI_CONN_FW_CLEANUP, &qedi_conn->flags); + qedi_conn->abrt_conn = 0; + + rval = iscsi_conn_start(cls_conn); + if (rval) { + iscsi_conn_printk(KERN_ALERT, conn, + "iscsi_conn_start: FW oflload conn failed!!\n"); + } + +start_err: + return rval; +} + +static void qedi_conn_destroy(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct qedi_conn *qedi_conn = conn->dd_data; + struct Scsi_Host *shost; + struct qedi_ctx *qedi; + + shost = iscsi_session_to_shost(iscsi_conn_to_session(cls_conn)); + qedi = iscsi_host_priv(shost); + + qedi_conn_free_login_resources(qedi, qedi_conn); + iscsi_conn_teardown(cls_conn); +} + +static int qedi_ep_get_param(struct iscsi_endpoint *ep, + enum iscsi_param param, char *buf) +{ 
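+ /* Report this offloaded endpoint's remote port or IP address back to the iSCSI transport layer. */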
+ struct qedi_endpoint *qedi_ep = ep->dd_data; + int len; + + if (!qedi_ep) + return -ENOTCONN; + + switch (param) { + case ISCSI_PARAM_CONN_PORT: + len = sprintf(buf, "%hu\n", qedi_ep->dst_port); + break; + case ISCSI_PARAM_CONN_ADDRESS: + if (qedi_ep->ip_type == TCP_IPV4) + len = sprintf(buf, "%pI4\n", qedi_ep->dst_addr); + else + len = sprintf(buf, "%pI6\n", qedi_ep->dst_addr); + break; + default: + return -ENOTCONN; + } + + return len; +} + +static int qedi_host_get_param(struct Scsi_Host *shost, + enum iscsi_host_param param, char *buf) +{ + struct qedi_ctx *qedi; + int len; + + qedi = iscsi_host_priv(shost); + + switch (param) { + case ISCSI_HOST_PARAM_HWADDRESS: + len = sysfs_format_mac(buf, qedi->mac, 6); + break; + case ISCSI_HOST_PARAM_NETDEV_NAME: + len = sprintf(buf, "host%d\n", shost->host_no); + break; + case ISCSI_HOST_PARAM_IPADDRESS: + if (qedi->ip_type == TCP_IPV4) + len = sprintf(buf, "%pI4\n", qedi->src_ip); + else + len = sprintf(buf, "%pI6\n", qedi->src_ip); + break; + default: + return iscsi_host_get_param(shost, param, buf); + } + + return len; +} + +static void qedi_conn_get_stats(struct iscsi_cls_conn *cls_conn, + struct iscsi_stats *stats) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct qed_iscsi_stats iscsi_stats; + struct Scsi_Host *shost; + struct qedi_ctx *qedi; + + shost = iscsi_session_to_shost(iscsi_conn_to_session(cls_conn)); + qedi = iscsi_host_priv(shost); + qedi_ops->get_stats(qedi->cdev, &iscsi_stats); + + conn->txdata_octets = iscsi_stats.iscsi_tx_bytes_cnt; + conn->rxdata_octets = iscsi_stats.iscsi_rx_bytes_cnt; + conn->dataout_pdus_cnt = (uint32_t)iscsi_stats.iscsi_tx_data_pdu_cnt; + conn->datain_pdus_cnt = (uint32_t)iscsi_stats.iscsi_rx_data_pdu_cnt; + conn->r2t_pdus_cnt = (uint32_t)iscsi_stats.iscsi_rx_r2t_pdu_cnt; + + stats->txdata_octets = conn->txdata_octets; + stats->rxdata_octets = conn->rxdata_octets; + stats->scsicmd_pdus = conn->scsicmd_pdus_cnt; + stats->dataout_pdus = conn->dataout_pdus_cnt; + stats->scsirsp_pdus = conn->scsirsp_pdus_cnt; + stats->datain_pdus = conn->datain_pdus_cnt; + stats->r2t_pdus = conn->r2t_pdus_cnt; + stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; + stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; + stats->digest_err = 0; + stats->timeout_err = 0; + strcpy(stats->custom[0].desc, "eh_abort_cnt"); + stats->custom[0].value = conn->eh_abort_cnt; + stats->custom_length = 1; +} + +static void qedi_iscsi_prep_generic_pdu_bd(struct qedi_conn *qedi_conn) +{ + struct scsi_sge *bd_tbl; + + bd_tbl = (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + + bd_tbl->sge_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.req_dma_addr >> 32); + bd_tbl->sge_addr.lo = (u32)qedi_conn->gen_pdu.req_dma_addr; + bd_tbl->sge_len = qedi_conn->gen_pdu.req_wr_ptr - + qedi_conn->gen_pdu.req_buf; + bd_tbl = (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + bd_tbl->sge_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.resp_dma_addr >> 32); + bd_tbl->sge_addr.lo = (u32)qedi_conn->gen_pdu.resp_dma_addr; + bd_tbl->sge_len = ISCSI_DEF_MAX_RECV_SEG_LEN; +} + +static int qedi_iscsi_send_generic_request(struct iscsi_task *task) +{ + struct qedi_cmd *cmd = task->dd_data; + struct qedi_conn *qedi_conn = cmd->conn; + char *buf; + int data_len; + int rc = 0; + + qedi_iscsi_prep_generic_pdu_bd(qedi_conn); + switch (task->hdr->opcode & ISCSI_OPCODE_MASK) { + case ISCSI_OP_LOGIN: + qedi_send_iscsi_login(qedi_conn, task); + break; + case ISCSI_OP_NOOP_OUT: + data_len = qedi_conn->gen_pdu.req_buf_size; + buf = qedi_conn->gen_pdu.req_buf; + if (data_len) + rc = 
qedi_send_iscsi_nopout(qedi_conn, task, + buf, data_len, 1); + else + rc = qedi_send_iscsi_nopout(qedi_conn, task, + NULL, 0, 1); + break; + case ISCSI_OP_LOGOUT: + rc = qedi_send_iscsi_logout(qedi_conn, task); + break; + case ISCSI_OP_SCSI_TMFUNC: + rc = qedi_iscsi_abort_work(qedi_conn, task); + break; + case ISCSI_OP_TEXT: + rc = qedi_send_iscsi_text(qedi_conn, task); + break; + default: + iscsi_conn_printk(KERN_ALERT, qedi_conn->cls_conn->dd_data, + "unsupported op 0x%x\n", task->hdr->opcode); + } + + return rc; +} + +static int qedi_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task) +{ + struct qedi_conn *qedi_conn = conn->dd_data; + struct qedi_cmd *cmd = task->dd_data; + + memset(qedi_conn->gen_pdu.req_buf, 0, ISCSI_DEF_MAX_RECV_SEG_LEN); + + qedi_conn->gen_pdu.req_buf_size = task->data_count; + + if (task->data_count) { + memcpy(qedi_conn->gen_pdu.req_buf, task->data, + task->data_count); + qedi_conn->gen_pdu.req_wr_ptr = + qedi_conn->gen_pdu.req_buf + task->data_count; + } + + cmd->conn = conn->dd_data; + cmd->scsi_cmd = NULL; + return qedi_iscsi_send_generic_request(task); +} + +static int qedi_task_xmit(struct iscsi_task *task) +{ + struct iscsi_conn *conn = task->conn; + struct qedi_conn *qedi_conn = conn->dd_data; + struct qedi_cmd *cmd = task->dd_data; + struct scsi_cmnd *sc = task->sc; + + cmd->state = 0; + cmd->task = NULL; + cmd->use_slowpath = false; + cmd->conn = qedi_conn; + cmd->task = task; + cmd->io_cmd_in_list = false; + INIT_LIST_HEAD(&cmd->io_cmd); + + if (!sc) + return qedi_mtask_xmit(conn, task); + + cmd->scsi_cmd = sc; + return qedi_iscsi_send_ioreq(task); +} + +static struct iscsi_endpoint * +qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, + int non_blocking) +{ + struct qedi_ctx *qedi; + struct iscsi_endpoint *ep; + struct qedi_endpoint *qedi_ep; + struct sockaddr_in *addr; + struct sockaddr_in6 *addr6; + struct qed_dev *cdev = NULL; + struct qedi_uio_dev *udev = NULL; + struct iscsi_path path_req; + u32 msg_type = ISCSI_KEVENT_IF_DOWN; + u32 iscsi_cid = QEDI_CID_RESERVED; + u16 len = 0; + char *buf = NULL; + int ret, tmp; + + if (!shost) { + ret = -ENXIO; + QEDI_ERR(NULL, "shost is NULL\n"); + return ERR_PTR(ret); + } + + if (qedi_do_not_recover) { + ret = -ENOMEM; + return ERR_PTR(ret); + } + + qedi = iscsi_host_priv(shost); + cdev = qedi->cdev; + udev = qedi->udev; + + if (test_bit(QEDI_IN_OFFLINE, &qedi->flags) || + test_bit(QEDI_IN_RECOVERY, &qedi->flags)) { + ret = -ENOMEM; + return ERR_PTR(ret); + } + + ep = iscsi_create_endpoint(sizeof(struct qedi_endpoint)); + if (!ep) { + QEDI_ERR(&qedi->dbg_ctx, "endpoint create fail\n"); + ret = -ENOMEM; + return ERR_PTR(ret); + } + qedi_ep = ep->dd_data; + memset(qedi_ep, 0, sizeof(struct qedi_endpoint)); + qedi_ep->state = EP_STATE_IDLE; + qedi_ep->iscsi_cid = (u32)-1; + qedi_ep->qedi = qedi; + + if (dst_addr->sa_family == AF_INET) { + addr = (struct sockaddr_in *)dst_addr; + memcpy(qedi_ep->dst_addr, &addr->sin_addr.s_addr, + sizeof(struct in_addr)); + qedi_ep->dst_port = ntohs(addr->sin_port); + qedi_ep->ip_type = TCP_IPV4; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "dst_addr=%pI4, dst_port=%u\n", + qedi_ep->dst_addr, qedi_ep->dst_port); + } else if (dst_addr->sa_family == AF_INET6) { + addr6 = (struct sockaddr_in6 *)dst_addr; + memcpy(qedi_ep->dst_addr, &addr6->sin6_addr, + sizeof(struct in6_addr)); + qedi_ep->dst_port = ntohs(addr6->sin6_port); + qedi_ep->ip_type = TCP_IPV6; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "dst_addr=%pI6, dst_port=%u\n", + qedi_ep->dst_addr, 
qedi_ep->dst_port); + } else { + QEDI_ERR(&qedi->dbg_ctx, "Invalid endpoint\n"); + } + + if (atomic_read(&qedi->link_state) != QEDI_LINK_UP) { + QEDI_WARN(&qedi->dbg_ctx, "qedi link down\n"); + ret = -ENXIO; + goto ep_conn_exit; + } + + ret = qedi_alloc_sq(qedi, qedi_ep); + if (ret) + goto ep_conn_exit; + + ret = qedi_ops->acquire_conn(qedi->cdev, &qedi_ep->handle, + &qedi_ep->fw_cid, &qedi_ep->p_doorbell); + + if (ret) { + QEDI_ERR(&qedi->dbg_ctx, "Could not acquire connection\n"); + ret = -ENXIO; + goto ep_free_sq; + } + + iscsi_cid = qedi_ep->handle; + qedi_ep->iscsi_cid = iscsi_cid; + + init_waitqueue_head(&qedi_ep->ofld_wait); + init_waitqueue_head(&qedi_ep->tcp_ofld_wait); + qedi_ep->state = EP_STATE_OFLDCONN_START; + qedi->ep_tbl[iscsi_cid] = qedi_ep; + + buf = (char *)&path_req; + len = sizeof(path_req); + memset(&path_req, 0, len); + + msg_type = ISCSI_KEVENT_PATH_REQ; + path_req.handle = (u64)qedi_ep->iscsi_cid; + path_req.pmtu = qedi->ll2_mtu; + qedi_ep->pmtu = qedi->ll2_mtu; + if (qedi_ep->ip_type == TCP_IPV4) { + memcpy(&path_req.dst.v4_addr, &qedi_ep->dst_addr, + sizeof(struct in_addr)); + path_req.ip_addr_len = 4; + } else { + memcpy(&path_req.dst.v6_addr, &qedi_ep->dst_addr, + sizeof(struct in6_addr)); + path_req.ip_addr_len = 16; + } + + ret = iscsi_offload_mesg(shost, &qedi_iscsi_transport, msg_type, buf, + len); + if (ret) { + QEDI_ERR(&qedi->dbg_ctx, + "iscsi_offload_mesg() failed for cid=0x%x ret=%d\n", + iscsi_cid, ret); + goto ep_rel_conn; + } + + atomic_inc(&qedi->num_offloads); + return ep; + +ep_rel_conn: + qedi->ep_tbl[iscsi_cid] = NULL; + tmp = qedi_ops->release_conn(qedi->cdev, qedi_ep->handle); + if (tmp) + QEDI_WARN(&qedi->dbg_ctx, "release_conn returned %d\n", + tmp); +ep_free_sq: + qedi_free_sq(qedi, qedi_ep); +ep_conn_exit: + iscsi_destroy_endpoint(ep); + return ERR_PTR(ret); +} + +static int qedi_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) +{ + struct qedi_endpoint *qedi_ep; + int ret = 0; + + if (qedi_do_not_recover) + return 1; + + qedi_ep = ep->dd_data; + if (qedi_ep->state == EP_STATE_IDLE || + qedi_ep->state == EP_STATE_OFLDCONN_FAILED) + return -1; + + if (qedi_ep->state == EP_STATE_OFLDCONN_COMPL) + ret = 1; + + ret = wait_event_interruptible_timeout(qedi_ep->ofld_wait, + QEDI_OFLD_WAIT_STATE(qedi_ep), + msecs_to_jiffies(timeout_ms)); + + if (qedi_ep->state == EP_STATE_OFLDCONN_FAILED) + ret = -1; + + if (ret > 0) + return 1; + else if (!ret) + return 0; + else + return ret; +} + +static void qedi_cleanup_active_cmd_list(struct qedi_conn *qedi_conn) +{ + struct qedi_cmd *cmd, *cmd_tmp; + + list_for_each_entry_safe(cmd, cmd_tmp, &qedi_conn->active_cmd_list, + io_cmd) { + list_del_init(&cmd->io_cmd); + qedi_conn->active_cmd_count--; + } +} + +static void qedi_ep_disconnect(struct iscsi_endpoint *ep) +{ + struct qedi_endpoint *qedi_ep; + struct qedi_conn *qedi_conn = NULL; + struct iscsi_conn *conn = NULL; + struct qedi_ctx *qedi; + int ret = 0; + int wait_delay = 20 * HZ; + int abrt_conn = 0; + int count = 10; + + qedi_ep = ep->dd_data; + qedi = qedi_ep->qedi; + + flush_work(&qedi_ep->offload_work); + + if (qedi_ep->conn) { + qedi_conn = qedi_ep->conn; + conn = qedi_conn->cls_conn->dd_data; + iscsi_suspend_queue(conn); + abrt_conn = qedi_conn->abrt_conn; + + while (count--) { + if (!test_bit(QEDI_CONN_FW_CLEANUP, + &qedi_conn->flags)) { + break; + } + msleep(1000); + } + + if (test_bit(QEDI_IN_RECOVERY, &qedi->flags)) { + if (qedi_do_not_recover) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Do not recover cid=0x%x\n", + qedi_ep->iscsi_cid); 
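+ /* Recovery disabled via debugfs (qedi_do_not_recover): skip FW connection teardown and only release local state. */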
+ goto ep_exit_recover; + } + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Reset recovery cid=0x%x, qedi_ep=%p, state=0x%x\n", + qedi_ep->iscsi_cid, qedi_ep, qedi_ep->state); + qedi_cleanup_active_cmd_list(qedi_conn); + goto ep_release_conn; + } + } + + if (qedi_do_not_recover) + goto ep_exit_recover; + + switch (qedi_ep->state) { + case EP_STATE_OFLDCONN_START: + goto ep_release_conn; + case EP_STATE_OFLDCONN_FAILED: + break; + case EP_STATE_OFLDCONN_COMPL: + if (unlikely(!qedi_conn)) + break; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Active cmd count=%d, abrt_conn=%d, ep state=0x%x, cid=0x%x, qedi_conn=%p\n", + qedi_conn->active_cmd_count, abrt_conn, + qedi_ep->state, + qedi_ep->iscsi_cid, + qedi_ep->conn + ); + + if (!qedi_conn->active_cmd_count) + abrt_conn = 0; + else + abrt_conn = 1; + + if (abrt_conn) + qedi_clearsq(qedi, qedi_conn, NULL); + break; + default: + break; + } + + qedi_ep->state = EP_STATE_DISCONN_START; + ret = qedi_ops->destroy_conn(qedi->cdev, qedi_ep->handle, abrt_conn); + if (ret) { + QEDI_WARN(&qedi->dbg_ctx, + "destroy_conn failed returned %d\n", ret); + } else { + ret = wait_event_interruptible_timeout( + qedi_ep->tcp_ofld_wait, + (qedi_ep->state != + EP_STATE_DISCONN_START), + wait_delay); + if ((ret <= 0) || (qedi_ep->state == EP_STATE_DISCONN_START)) { + QEDI_WARN(&qedi->dbg_ctx, + "Destroy conn timedout or interrupted, ret=%d, delay=%d, cid=0x%x\n", + ret, wait_delay, qedi_ep->iscsi_cid); + } + } + +ep_release_conn: + ret = qedi_ops->release_conn(qedi->cdev, qedi_ep->handle); + if (ret) + QEDI_WARN(&qedi->dbg_ctx, + "release_conn returned %d, cid=0x%x\n", + ret, qedi_ep->iscsi_cid); +ep_exit_recover: + qedi_ep->state = EP_STATE_IDLE; + qedi->ep_tbl[qedi_ep->iscsi_cid] = NULL; + qedi->cid_que.conn_cid_tbl[qedi_ep->iscsi_cid] = NULL; + qedi_free_id(&qedi->lcl_port_tbl, qedi_ep->src_port); + qedi_free_sq(qedi, qedi_ep); + + if (qedi_conn) + qedi_conn->ep = NULL; + + qedi_ep->conn = NULL; + qedi_ep->qedi = NULL; + atomic_dec(&qedi->num_offloads); + + iscsi_destroy_endpoint(ep); +} + +static int qedi_data_avail(struct qedi_ctx *qedi, u16 vlanid) +{ + struct qed_dev *cdev = qedi->cdev; + struct qedi_uio_dev *udev; + struct qedi_uio_ctrl *uctrl; + struct sk_buff *skb; + u32 len; + int rc = 0; + + udev = qedi->udev; + if (!udev) { + QEDI_ERR(&qedi->dbg_ctx, "udev is NULL.\n"); + return -EINVAL; + } + + uctrl = (struct qedi_uio_ctrl *)udev->uctrl; + if (!uctrl) { + QEDI_ERR(&qedi->dbg_ctx, "uctlr is NULL.\n"); + return -EINVAL; + } + + len = uctrl->host_tx_pkt_len; + if (!len) { + QEDI_ERR(&qedi->dbg_ctx, "Invalid len %u\n", len); + return -EINVAL; + } + + skb = alloc_skb(len, GFP_ATOMIC); + if (!skb) { + QEDI_ERR(&qedi->dbg_ctx, "alloc_skb failed\n"); + return -EINVAL; + } + + skb_put(skb, len); + memcpy(skb->data, udev->tx_pkt, len); + skb->ip_summed = CHECKSUM_NONE; + + if (vlanid) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid); + + rc = qedi_ops->ll2->start_xmit(cdev, skb); + if (rc) { + QEDI_ERR(&qedi->dbg_ctx, "ll2 start_xmit returned %d\n", + rc); + kfree_skb(skb); + } + + uctrl->host_tx_pkt_len = 0; + uctrl->hw_tx_cons++; + + return rc; +} + +static void qedi_offload_work(struct work_struct *work) +{ + struct qedi_endpoint *qedi_ep = + container_of(work, struct qedi_endpoint, offload_work); + struct qedi_ctx *qedi; + int wait_delay = 20 * HZ; + int ret; + + qedi = qedi_ep->qedi; + + ret = qedi_iscsi_offload_conn(qedi_ep); + if (ret) { + QEDI_ERR(&qedi->dbg_ctx, + "offload error: iscsi_cid=%u, qedi_ep=%p, ret=%d\n", + qedi_ep->iscsi_cid, 
qedi_ep, ret); + qedi_ep->state = EP_STATE_OFLDCONN_FAILED; + return; + } + + ret = wait_event_interruptible_timeout(qedi_ep->tcp_ofld_wait, + (qedi_ep->state == + EP_STATE_OFLDCONN_COMPL), + wait_delay); + if ((ret <= 0) || (qedi_ep->state != EP_STATE_OFLDCONN_COMPL)) { + qedi_ep->state = EP_STATE_OFLDCONN_FAILED; + QEDI_ERR(&qedi->dbg_ctx, + "Offload conn TIMEOUT iscsi_cid=%u, qedi_ep=%p\n", + qedi_ep->iscsi_cid, qedi_ep); + } +} + +static int qedi_set_path(struct Scsi_Host *shost, struct iscsi_path *path_data) +{ + struct qedi_ctx *qedi; + struct qedi_endpoint *qedi_ep; + int ret = 0; + u32 iscsi_cid; + u16 port_id = 0; + + if (!shost) { + ret = -ENXIO; + QEDI_ERR(NULL, "shost is NULL\n"); + return ret; + } + + if (strcmp(shost->hostt->proc_name, "qedi")) { + ret = -ENXIO; + QEDI_ERR(NULL, "shost %s is invalid\n", + shost->hostt->proc_name); + return ret; + } + + qedi = iscsi_host_priv(shost); + if (path_data->handle == QEDI_PATH_HANDLE) { + ret = qedi_data_avail(qedi, path_data->vlan_id); + goto set_path_exit; + } + + iscsi_cid = (u32)path_data->handle; + qedi_ep = qedi->ep_tbl[iscsi_cid]; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "iscsi_cid=0x%x, qedi_ep=%p\n", iscsi_cid, qedi_ep); + if (!qedi_ep) { + ret = -EINVAL; + goto set_path_exit; + } + + if (!is_valid_ether_addr(&path_data->mac_addr[0])) { + QEDI_NOTICE(&qedi->dbg_ctx, "dst mac NOT VALID\n"); + ret = -EIO; + goto set_path_exit; + } + + ether_addr_copy(&qedi_ep->src_mac[0], &qedi->mac[0]); + ether_addr_copy(&qedi_ep->dst_mac[0], &path_data->mac_addr[0]); + + qedi_ep->vlan_id = path_data->vlan_id; + if (path_data->pmtu < DEF_PATH_MTU) { + qedi_ep->pmtu = qedi->ll2_mtu; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "MTU cannot be %u, using default MTU %u\n", + path_data->pmtu, qedi_ep->pmtu); + } + + if (path_data->pmtu != qedi->ll2_mtu) { + if (path_data->pmtu > JUMBO_MTU) { + ret = -EINVAL; + QEDI_ERR(NULL, "Invalid MTU %u\n", path_data->pmtu); + goto set_path_exit; + } + + qedi_reset_host_mtu(qedi, path_data->pmtu); + qedi_ep->pmtu = qedi->ll2_mtu; + } + + port_id = qedi_ep->src_port; + if (port_id >= QEDI_LOCAL_PORT_MIN && + port_id < QEDI_LOCAL_PORT_MAX) { + if (qedi_alloc_id(&qedi->lcl_port_tbl, port_id)) + port_id = 0; + } else { + port_id = 0; + } + + if (!port_id) { + port_id = qedi_alloc_new_id(&qedi->lcl_port_tbl); + if (port_id == QEDI_LOCAL_PORT_INVALID) { + QEDI_ERR(&qedi->dbg_ctx, + "Failed to allocate port id for iscsi_cid=0x%x\n", + iscsi_cid); + ret = -ENOMEM; + goto set_path_exit; + } + } + + qedi_ep->src_port = port_id; + + if (qedi_ep->ip_type == TCP_IPV4) { + memcpy(&qedi_ep->src_addr[0], &path_data->src.v4_addr, + sizeof(struct in_addr)); + memcpy(&qedi->src_ip[0], &path_data->src.v4_addr, + sizeof(struct in_addr)); + qedi->ip_type = TCP_IPV4; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "src addr:port=%pI4:%u, dst addr:port=%pI4:%u\n", + qedi_ep->src_addr, qedi_ep->src_port, + qedi_ep->dst_addr, qedi_ep->dst_port); + } else { + memcpy(&qedi_ep->src_addr[0], &path_data->src.v6_addr, + sizeof(struct in6_addr)); + memcpy(&qedi->src_ip[0], &path_data->src.v6_addr, + sizeof(struct in6_addr)); + qedi->ip_type = TCP_IPV6; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "src addr:port=%pI6:%u, dst addr:port=%pI6:%u\n", + qedi_ep->src_addr, qedi_ep->src_port, + qedi_ep->dst_addr, qedi_ep->dst_port); + } + + INIT_WORK(&qedi_ep->offload_work, qedi_offload_work); + queue_work(qedi->offload_thread, &qedi_ep->offload_work); + + ret = 0; + +set_path_exit: + return ret; +} + +static umode_t qedi_attr_is_visible(int 
param_type, int param) +{ + switch (param_type) { + case ISCSI_HOST_PARAM: + switch (param) { + case ISCSI_HOST_PARAM_NETDEV_NAME: + case ISCSI_HOST_PARAM_HWADDRESS: + case ISCSI_HOST_PARAM_IPADDRESS: + return 0444; + default: + return 0; + } + case ISCSI_PARAM: + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + case ISCSI_PARAM_HDRDGST_EN: + case ISCSI_PARAM_DATADGST_EN: + case ISCSI_PARAM_CONN_ADDRESS: + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_EXP_STATSN: + case ISCSI_PARAM_PERSISTENT_ADDRESS: + case ISCSI_PARAM_PERSISTENT_PORT: + case ISCSI_PARAM_PING_TMO: + case ISCSI_PARAM_RECV_TMO: + case ISCSI_PARAM_INITIAL_R2T_EN: + case ISCSI_PARAM_MAX_R2T: + case ISCSI_PARAM_IMM_DATA_EN: + case ISCSI_PARAM_FIRST_BURST: + case ISCSI_PARAM_MAX_BURST: + case ISCSI_PARAM_PDU_INORDER_EN: + case ISCSI_PARAM_DATASEQ_INORDER_EN: + case ISCSI_PARAM_ERL: + case ISCSI_PARAM_TARGET_NAME: + case ISCSI_PARAM_TPGT: + case ISCSI_PARAM_USERNAME: + case ISCSI_PARAM_PASSWORD: + case ISCSI_PARAM_USERNAME_IN: + case ISCSI_PARAM_PASSWORD_IN: + case ISCSI_PARAM_FAST_ABORT: + case ISCSI_PARAM_ABORT_TMO: + case ISCSI_PARAM_LU_RESET_TMO: + case ISCSI_PARAM_TGT_RESET_TMO: + case ISCSI_PARAM_IFACE_NAME: + case ISCSI_PARAM_INITIATOR_NAME: + case ISCSI_PARAM_BOOT_ROOT: + case ISCSI_PARAM_BOOT_NIC: + case ISCSI_PARAM_BOOT_TARGET: + return 0444; + default: + return 0; + } + } + + return 0; +} + +static void qedi_cleanup_task(struct iscsi_task *task) +{ + if (!task->sc || task->state == ISCSI_TASK_PENDING) { + QEDI_INFO(NULL, QEDI_LOG_IO, "Returning ref_cnt=%d\n", + refcount_read(&task->refcount)); + return; + } + + qedi_iscsi_unmap_sg_list(task->dd_data); +} + +struct iscsi_transport qedi_iscsi_transport = { + .owner = THIS_MODULE, + .name = QEDI_MODULE_NAME, + .caps = CAP_RECOVERY_L0 | CAP_HDRDGST | CAP_MULTI_R2T | CAP_DATADGST | + CAP_DATA_PATH_OFFLOAD | CAP_TEXT_NEGO, + .create_session = qedi_session_create, + .destroy_session = qedi_session_destroy, + .create_conn = qedi_conn_create, + .bind_conn = qedi_conn_bind, + .start_conn = qedi_conn_start, + .stop_conn = iscsi_conn_stop, + .destroy_conn = qedi_conn_destroy, + .set_param = iscsi_set_param, + .get_ep_param = qedi_ep_get_param, + .get_conn_param = iscsi_conn_get_param, + .get_session_param = iscsi_session_get_param, + .get_host_param = qedi_host_get_param, + .send_pdu = iscsi_conn_send_pdu, + .get_stats = qedi_conn_get_stats, + .xmit_task = qedi_task_xmit, + .cleanup_task = qedi_cleanup_task, + .session_recovery_timedout = iscsi_session_recovery_timedout, + .ep_connect = qedi_ep_connect, + .ep_poll = qedi_ep_poll, + .ep_disconnect = qedi_ep_disconnect, + .set_path = qedi_set_path, + .attr_is_visible = qedi_attr_is_visible, +}; + +void qedi_start_conn_recovery(struct qedi_ctx *qedi, + struct qedi_conn *qedi_conn) +{ + struct iscsi_cls_session *cls_sess; + struct iscsi_cls_conn *cls_conn; + struct iscsi_conn *conn; + + cls_conn = qedi_conn->cls_conn; + conn = cls_conn->dd_data; + cls_sess = iscsi_conn_to_session(cls_conn); + + if (iscsi_is_session_online(cls_sess)) { + qedi_conn->abrt_conn = 1; + QEDI_ERR(&qedi->dbg_ctx, + "Failing connection, state=0x%x, cid=0x%x\n", + conn->session->state, qedi_conn->iscsi_conn_id); + iscsi_conn_failure(qedi_conn->cls_conn->dd_data, + ISCSI_ERR_CONN_FAILED); + } +} + +static const struct { + enum iscsi_error_types error_code; + char *err_string; +} qedi_iscsi_error[] = { + { ISCSI_STATUS_NONE, + "tcp_error none" + }, + { ISCSI_CONN_ERROR_TASK_CID_MISMATCH, + "task cid mismatch" + }, + { 
ISCSI_CONN_ERROR_TASK_NOT_VALID, + "invalid task" + }, + { ISCSI_CONN_ERROR_RQ_RING_IS_FULL, + "rq ring full" + }, + { ISCSI_CONN_ERROR_CMDQ_RING_IS_FULL, + "cmdq ring full" + }, + { ISCSI_CONN_ERROR_HQE_CACHING_FAILED, + "sge caching failed" + }, + { ISCSI_CONN_ERROR_HEADER_DIGEST_ERROR, + "hdr digest error" + }, + { ISCSI_CONN_ERROR_LOCAL_COMPLETION_ERROR, + "local cmpl error" + }, + { ISCSI_CONN_ERROR_DATA_OVERRUN, + "invalid task" + }, + { ISCSI_CONN_ERROR_OUT_OF_SGES_ERROR, + "out of sge error" + }, + { ISCSI_CONN_ERROR_TCP_IP_FRAGMENT_ERROR, + "tcp ip fragment error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_AHS_LEN, + "AHS len protocol error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_ITT_OUT_OF_RANGE, + "itt out of range error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_DATA_SEG_LEN_EXCEEDS_PDU_SIZE, + "data seg more than pdu size" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_INVALID_OPCODE, + "invalid opcode" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_INVALID_OPCODE_BEFORE_UPDATE, + "invalid opcode before update" + }, + { ISCSI_CONN_ERROR_UNVALID_NOPIN_DSL, + "unexpected opcode" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_R2T_CARRIES_NO_DATA, + "r2t carries no data" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_DATA_SN, + "data sn error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_DATA_IN_TTT, + "data TTT error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_R2T_TTT, + "r2t TTT error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_R2T_BUFFER_OFFSET, + "buffer offset error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_BUFFER_OFFSET_OOO, + "buffer offset ooo" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_R2T_SN, + "data seg len 0" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_DESIRED_DATA_TRNS_LEN_0, + "data xer len error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_DESIRED_DATA_TRNS_LEN_1, + "data xer len1 error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_DESIRED_DATA_TRNS_LEN_2, + "data xer len2 error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_LUN, + "protocol lun error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_F_BIT_ZERO, + "f bit zero error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_EXP_STAT_SN, + "exp stat sn error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_DSL_NOT_ZERO, + "dsl not zero error" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_INVALID_DSL, + "invalid dsl" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_DATA_SEG_LEN_TOO_BIG, + "data seg len too big" + }, + { ISCSI_CONN_ERROR_PROTOCOL_ERR_OUTSTANDING_R2T_COUNT, + "outstanding r2t count error" + }, + { ISCSI_CONN_ERROR_SENSE_DATA_LENGTH, + "sense datalen error" + }, +}; + +char *qedi_get_iscsi_error(enum iscsi_error_types err_code) +{ + int i; + char *msg = NULL; + + for (i = 0; i < ARRAY_SIZE(qedi_iscsi_error); i++) { + if (qedi_iscsi_error[i].error_code == err_code) { + msg = qedi_iscsi_error[i].err_string; + break; + } + } + return msg; +} + +void qedi_process_iscsi_error(struct qedi_endpoint *ep, + struct iscsi_eqe_data *data) +{ + struct qedi_conn *qedi_conn; + struct qedi_ctx *qedi; + char warn_notice[] = "iscsi_warning"; + char error_notice[] = "iscsi_error"; + char unknown_msg[] = "Unknown error"; + char *message; + int need_recovery = 0; + u32 err_mask = 0; + char *msg; + + if (!ep) + return; + + qedi_conn = ep->conn; + if (!qedi_conn) + return; + + qedi = ep->qedi; + + QEDI_ERR(&qedi->dbg_ctx, "async event iscsi error:0x%x\n", + data->error_code); + + if (err_mask) { + need_recovery = 0; + message = warn_notice; + } else { + need_recovery = 1; + message = error_notice; + } + + msg = qedi_get_iscsi_error(data->error_code); + if (!msg) { + need_recovery = 0; + msg = unknown_msg; + } + + iscsi_conn_printk(KERN_ALERT, + 
qedi_conn->cls_conn->dd_data, + "qedi: %s - %s\n", message, msg); + + if (need_recovery) + qedi_start_conn_recovery(qedi_conn->qedi, qedi_conn); +} + +void qedi_process_tcp_error(struct qedi_endpoint *ep, + struct iscsi_eqe_data *data) +{ + struct qedi_conn *qedi_conn; + + if (!ep) + return; + + qedi_conn = ep->conn; + if (!qedi_conn) + return; + + QEDI_ERR(&ep->qedi->dbg_ctx, "async event TCP error:0x%x\n", + data->error_code); + + qedi_start_conn_recovery(qedi_conn->qedi, qedi_conn); +} diff --git a/drivers/scsi/qedi/qedi_iscsi.h b/drivers/scsi/qedi/qedi_iscsi.h new file mode 100644 index 0000000..ea13151 --- /dev/null +++ b/drivers/scsi/qedi/qedi_iscsi.h @@ -0,0 +1,232 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef _QEDI_ISCSI_H_ +#define _QEDI_ISCSI_H_ + +#include +#include +#include "qedi.h" + +#define ISCSI_MAX_SESS_PER_HBA 4096 + +#define DEF_KA_TIMEOUT 7200000 +#define DEF_KA_INTERVAL 10000 +#define DEF_KA_MAX_PROBE_COUNT 10 +#define DEF_TOS 0 +#define DEF_TTL 0xfe +#define DEF_SND_SEQ_SCALE 0 +#define DEF_RCV_BUF 0xffff +#define DEF_SND_BUF 0xffff +#define DEF_SEED 0 +#define DEF_MAX_RT_TIME 8000 +#define DEF_MAX_DA_COUNT 2 +#define DEF_SWS_TIMER 1000 +#define DEF_MAX_CWND 2 +#define DEF_PATH_MTU 1500 +#define DEF_MSS 1460 +#define DEF_LL2_MTU 1560 +#define JUMBO_MTU 9000 + +#define MIN_MTU 576 /* rfc 793 */ +#define IPV4_HDR_LEN 20 +#define IPV6_HDR_LEN 40 +#define TCP_HDR_LEN 20 +#define TCP_OPTION_LEN 12 +#define VLAN_LEN 4 + +enum { + EP_STATE_IDLE = 0x0, + EP_STATE_ACQRCONN_START = 0x1, + EP_STATE_ACQRCONN_COMPL = 0x2, + EP_STATE_OFLDCONN_START = 0x4, + EP_STATE_OFLDCONN_COMPL = 0x8, + EP_STATE_DISCONN_START = 0x10, + EP_STATE_DISCONN_COMPL = 0x20, + EP_STATE_CLEANUP_START = 0x40, + EP_STATE_CLEANUP_CMPL = 0x80, + EP_STATE_TCP_FIN_RCVD = 0x100, + EP_STATE_TCP_RST_RCVD = 0x200, + EP_STATE_LOGOUT_SENT = 0x400, + EP_STATE_LOGOUT_RESP_RCVD = 0x800, + EP_STATE_CLEANUP_FAILED = 0x1000, + EP_STATE_OFLDCONN_FAILED = 0x2000, + EP_STATE_CONNECT_FAILED = 0x4000, + EP_STATE_DISCONN_TIMEDOUT = 0x8000, +}; + +struct qedi_conn; + +struct qedi_endpoint { + struct qedi_ctx *qedi; + u32 dst_addr[4]; + u32 src_addr[4]; + u16 src_port; + u16 dst_port; + u16 vlan_id; + u16 pmtu; + u8 src_mac[ETH_ALEN]; + u8 dst_mac[ETH_ALEN]; + u8 ip_type; + int state; + wait_queue_head_t ofld_wait; + wait_queue_head_t tcp_ofld_wait; + u32 iscsi_cid; + /* identifier of the connection from qed */ + u32 handle; + u32 fw_cid; + void __iomem *p_doorbell; + + /* Send queue management */ + struct iscsi_wqe *sq; + dma_addr_t sq_dma; + + u16 sq_prod_idx; + u16 fw_sq_prod_idx; + u16 sq_con_idx; + u32 sq_mem_size; + + void *sq_pbl; + dma_addr_t sq_pbl_dma; + u32 sq_pbl_size; + struct qedi_conn *conn; + struct work_struct offload_work; +}; + +#define QEDI_SQ_WQES_MIN 16 + +struct qedi_io_bdt { + struct scsi_sge *sge_tbl; + dma_addr_t sge_tbl_dma; + u16 sge_valid; +}; + +/** + * struct generic_pdu_resc - login pdu resource structure + * + * @req_buf: driver buffer used to stage payload associated with + * the login request + * @req_dma_addr: dma address for iscsi login request payload buffer + * @req_buf_size: actual login request payload length + * @req_wr_ptr: pointer into login request buffer when next data is + * to be written + * @resp_hdr: iscsi header where iscsi login response header is to + * 
be recreated + * @resp_buf: buffer to stage login response payload + * @resp_dma_addr: login response payload buffer dma address + * @resp_buf_size: login response paylod length + * @resp_wr_ptr: pointer into login response buffer when next data is + * to be written + * @req_bd_tbl: iscsi login request payload BD table + * @req_bd_dma: login request BD table dma address + * @resp_bd_tbl: iscsi login response payload BD table + * @resp_bd_dma: login request BD table dma address + * + * following structure defines buffer info for generic pdus such as iSCSI Login, + * Logout and NOP + */ +struct generic_pdu_resc { + char *req_buf; + dma_addr_t req_dma_addr; + u32 req_buf_size; + char *req_wr_ptr; + struct iscsi_hdr resp_hdr; + char *resp_buf; + dma_addr_t resp_dma_addr; + u32 resp_buf_size; + char *resp_wr_ptr; + char *req_bd_tbl; + dma_addr_t req_bd_dma; + char *resp_bd_tbl; + dma_addr_t resp_bd_dma; +}; + +struct qedi_conn { + struct iscsi_cls_conn *cls_conn; + struct qedi_ctx *qedi; + struct qedi_endpoint *ep; + struct list_head active_cmd_list; + spinlock_t list_lock; /* internal conn lock */ + u32 active_cmd_count; + u32 cmd_cleanup_req; + u32 cmd_cleanup_cmpl; + + u32 iscsi_conn_id; + int itt; + int abrt_conn; +#define QEDI_CID_RESERVED 0x5AFF + u32 fw_cid; + /* + * Buffer for login negotiation process + */ + struct generic_pdu_resc gen_pdu; + + struct list_head tmf_work_list; + wait_queue_head_t wait_queue; + spinlock_t tmf_work_lock; /* tmf work lock */ + unsigned long flags; +#define QEDI_CONN_FW_CLEANUP 1 +}; + +struct qedi_cmd { + struct list_head io_cmd; + bool io_cmd_in_list; + struct iscsi_hdr hdr; + struct qedi_conn *conn; + struct scsi_cmnd *scsi_cmd; + struct scatterlist *sg; + struct qedi_io_bdt io_tbl; + struct e4_iscsi_task_context request; + unsigned char *sense_buffer; + dma_addr_t sense_buffer_dma; + u16 task_id; + + /* field populated for tmf work queue */ + struct iscsi_task *task; + struct work_struct tmf_work; + int state; +#define CLEANUP_WAIT 1 +#define CLEANUP_RECV 2 +#define CLEANUP_WAIT_FAILED 3 +#define CLEANUP_NOT_REQUIRED 4 +#define LUN_RESET_RESPONSE_RECEIVED 5 +#define RESPONSE_RECEIVED 6 + + int type; +#define TYPEIO 1 +#define TYPERESET 2 + + struct qedi_work_map *list_tmf_work; + /* slowpath management */ + bool use_slowpath; + + struct iscsi_tm_rsp *tmf_resp_buf; + struct qedi_work cqe_work; +}; + +struct qedi_work_map { + struct list_head list; + struct qedi_cmd *qedi_cmd; + int rtid; + + int state; +#define QEDI_WORK_QUEUED 1 +#define QEDI_WORK_SCHEDULED 2 +#define QEDI_WORK_EXIT 3 + + struct work_struct *ptr_tmf_work; +}; + +#define qedi_set_itt(task_id, itt) ((u32)(((task_id) & 0xffff) | ((itt) << 16))) +#define qedi_get_itt(cqe) (cqe.iscsi_hdr.cmd.itt >> 16) + +#define QEDI_OFLD_WAIT_STATE(q) ((q)->state == EP_STATE_OFLDCONN_FAILED || \ + (q)->state == EP_STATE_OFLDCONN_COMPL) + +#endif /* _QEDI_ISCSI_H_ */ diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c new file mode 100644 index 0000000..4da3592 --- /dev/null +++ b/drivers/scsi/qedi/qedi_main.c @@ -0,0 +1,2506 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "qedi.h" +#include "qedi_gbl.h" +#include "qedi_iscsi.h" + +static uint qedi_fw_debug; +module_param(qedi_fw_debug, uint, 0644); +MODULE_PARM_DESC(qedi_fw_debug, " Firmware debug level 0(default) to 3"); + +uint qedi_dbg_log = QEDI_LOG_WARN | QEDI_LOG_SCSI_TM; +module_param(qedi_dbg_log, uint, 0644); +MODULE_PARM_DESC(qedi_dbg_log, " Default debug level"); + +uint qedi_io_tracing; +module_param(qedi_io_tracing, uint, 0644); +MODULE_PARM_DESC(qedi_io_tracing, + " Enable logging of SCSI requests/completions into trace buffer. (default off)."); + +const struct qed_iscsi_ops *qedi_ops; +static struct scsi_transport_template *qedi_scsi_transport; +static struct pci_driver qedi_pci_driver; +static DEFINE_PER_CPU(struct qedi_percpu_s, qedi_percpu); +static LIST_HEAD(qedi_udev_list); +/* Static function declaration */ +static int qedi_alloc_global_queues(struct qedi_ctx *qedi); +static void qedi_free_global_queues(struct qedi_ctx *qedi); +static struct qedi_cmd *qedi_get_cmd_from_tid(struct qedi_ctx *qedi, u32 tid); +static void qedi_reset_uio_rings(struct qedi_uio_dev *udev); +static void qedi_ll2_free_skbs(struct qedi_ctx *qedi); + +static int qedi_iscsi_event_cb(void *context, u8 fw_event_code, void *fw_handle) +{ + struct qedi_ctx *qedi; + struct qedi_endpoint *qedi_ep; + struct iscsi_eqe_data *data; + int rval = 0; + + if (!context || !fw_handle) { + QEDI_ERR(NULL, "Recv event with ctx NULL\n"); + return -EINVAL; + } + + qedi = (struct qedi_ctx *)context; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Recv Event %d fw_handle %p\n", fw_event_code, fw_handle); + + data = (struct iscsi_eqe_data *)fw_handle; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "icid=0x%x conn_id=0x%x err-code=0x%x error-pdu-opcode-reserved=0x%x\n", + data->icid, data->conn_id, data->error_code, + data->error_pdu_opcode_reserved); + + qedi_ep = qedi->ep_tbl[data->icid]; + + if (!qedi_ep) { + QEDI_WARN(&qedi->dbg_ctx, + "Cannot process event, ep already disconnected, cid=0x%x\n", + data->icid); + WARN_ON(1); + return -ENODEV; + } + + switch (fw_event_code) { + case ISCSI_EVENT_TYPE_ASYN_CONNECT_COMPLETE: + if (qedi_ep->state == EP_STATE_OFLDCONN_START) + qedi_ep->state = EP_STATE_OFLDCONN_COMPL; + + wake_up_interruptible(&qedi_ep->tcp_ofld_wait); + break; + case ISCSI_EVENT_TYPE_ASYN_TERMINATE_DONE: + qedi_ep->state = EP_STATE_DISCONN_COMPL; + wake_up_interruptible(&qedi_ep->tcp_ofld_wait); + break; + case ISCSI_EVENT_TYPE_ISCSI_CONN_ERROR: + qedi_process_iscsi_error(qedi_ep, data); + break; + case ISCSI_EVENT_TYPE_ASYN_ABORT_RCVD: + case ISCSI_EVENT_TYPE_ASYN_SYN_RCVD: + case ISCSI_EVENT_TYPE_ASYN_MAX_RT_TIME: + case ISCSI_EVENT_TYPE_ASYN_MAX_RT_CNT: + case ISCSI_EVENT_TYPE_ASYN_MAX_KA_PROBES_CNT: + case ISCSI_EVENT_TYPE_ASYN_FIN_WAIT2: + case ISCSI_EVENT_TYPE_TCP_CONN_ERROR: + qedi_process_tcp_error(qedi_ep, data); + break; + default: + QEDI_ERR(&qedi->dbg_ctx, "Recv Unknown Event %u\n", + fw_event_code); + } + + return rval; +} + +static int qedi_uio_open(struct uio_info *uinfo, struct inode *inode) +{ + struct qedi_uio_dev *udev = uinfo->priv; + struct qedi_ctx *qedi = udev->qedi; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (udev->uio_dev != -1) + return -EBUSY; + + rtnl_lock(); + udev->uio_dev = iminor(inode); + qedi_reset_uio_rings(udev); + set_bit(UIO_DEV_OPENED, &qedi->flags); + rtnl_unlock(); + + return 
0; +} + +static int qedi_uio_close(struct uio_info *uinfo, struct inode *inode) +{ + struct qedi_uio_dev *udev = uinfo->priv; + struct qedi_ctx *qedi = udev->qedi; + + udev->uio_dev = -1; + clear_bit(UIO_DEV_OPENED, &qedi->flags); + qedi_ll2_free_skbs(qedi); + return 0; +} + +static void __qedi_free_uio_rings(struct qedi_uio_dev *udev) +{ + if (udev->uctrl) { + free_page((unsigned long)udev->uctrl); + udev->uctrl = NULL; + } + + if (udev->ll2_ring) { + free_page((unsigned long)udev->ll2_ring); + udev->ll2_ring = NULL; + } + + if (udev->ll2_buf) { + free_pages((unsigned long)udev->ll2_buf, 2); + udev->ll2_buf = NULL; + } +} + +static void __qedi_free_uio(struct qedi_uio_dev *udev) +{ + uio_unregister_device(&udev->qedi_uinfo); + + __qedi_free_uio_rings(udev); + + pci_dev_put(udev->pdev); + kfree(udev); +} + +static void qedi_free_uio(struct qedi_uio_dev *udev) +{ + if (!udev) + return; + + list_del_init(&udev->list); + __qedi_free_uio(udev); +} + +static void qedi_reset_uio_rings(struct qedi_uio_dev *udev) +{ + struct qedi_ctx *qedi = NULL; + struct qedi_uio_ctrl *uctrl = NULL; + + qedi = udev->qedi; + uctrl = udev->uctrl; + + spin_lock_bh(&qedi->ll2_lock); + uctrl->host_rx_cons = 0; + uctrl->hw_rx_prod = 0; + uctrl->hw_rx_bd_prod = 0; + uctrl->host_rx_bd_cons = 0; + + memset(udev->ll2_ring, 0, udev->ll2_ring_size); + memset(udev->ll2_buf, 0, udev->ll2_buf_size); + spin_unlock_bh(&qedi->ll2_lock); +} + +static int __qedi_alloc_uio_rings(struct qedi_uio_dev *udev) +{ + int rc = 0; + + if (udev->ll2_ring || udev->ll2_buf) + return rc; + + /* Memory for control area. */ + udev->uctrl = (void *)get_zeroed_page(GFP_KERNEL); + if (!udev->uctrl) + return -ENOMEM; + + /* Allocating memory for LL2 ring */ + udev->ll2_ring_size = QEDI_PAGE_SIZE; + udev->ll2_ring = (void *)get_zeroed_page(GFP_KERNEL | __GFP_COMP); + if (!udev->ll2_ring) { + rc = -ENOMEM; + goto exit_alloc_ring; + } + + /* Allocating memory for Tx/Rx pkt buffer */ + udev->ll2_buf_size = TX_RX_RING * LL2_SINGLE_BUF_SIZE; + udev->ll2_buf_size = QEDI_PAGE_ALIGN(udev->ll2_buf_size); + udev->ll2_buf = (void *)__get_free_pages(GFP_KERNEL | __GFP_COMP | + __GFP_ZERO, 2); + if (!udev->ll2_buf) { + rc = -ENOMEM; + goto exit_alloc_buf; + } + return rc; + +exit_alloc_buf: + free_page((unsigned long)udev->ll2_ring); + udev->ll2_ring = NULL; +exit_alloc_ring: + return rc; +} + +static int qedi_alloc_uio_rings(struct qedi_ctx *qedi) +{ + struct qedi_uio_dev *udev = NULL; + int rc = 0; + + list_for_each_entry(udev, &qedi_udev_list, list) { + if (udev->pdev == qedi->pdev) { + udev->qedi = qedi; + if (__qedi_alloc_uio_rings(udev)) { + udev->qedi = NULL; + return -ENOMEM; + } + qedi->udev = udev; + return 0; + } + } + + udev = kzalloc(sizeof(*udev), GFP_KERNEL); + if (!udev) { + rc = -ENOMEM; + goto err_udev; + } + + udev->uio_dev = -1; + + udev->qedi = qedi; + udev->pdev = qedi->pdev; + + rc = __qedi_alloc_uio_rings(udev); + if (rc) + goto err_uctrl; + + list_add(&udev->list, &qedi_udev_list); + + pci_dev_get(udev->pdev); + qedi->udev = udev; + + udev->tx_pkt = udev->ll2_buf; + udev->rx_pkt = udev->ll2_buf + LL2_SINGLE_BUF_SIZE; + return 0; + + err_uctrl: + kfree(udev); + err_udev: + return -ENOMEM; +} + +static int qedi_init_uio(struct qedi_ctx *qedi) +{ + struct qedi_uio_dev *udev = qedi->udev; + struct uio_info *uinfo; + int ret = 0; + + if (!udev) + return -ENOMEM; + + uinfo = &udev->qedi_uinfo; + + uinfo->mem[0].addr = (unsigned long)udev->uctrl; + uinfo->mem[0].size = sizeof(struct qedi_uio_ctrl); + uinfo->mem[0].memtype = UIO_MEM_LOGICAL; + + 
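+ /* mem[1] and mem[2] expose the LL2 ring and packet buffers for userspace iscsiuio to mmap. */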
uinfo->mem[1].addr = (unsigned long)udev->ll2_ring; + uinfo->mem[1].size = udev->ll2_ring_size; + uinfo->mem[1].memtype = UIO_MEM_LOGICAL; + + uinfo->mem[2].addr = (unsigned long)udev->ll2_buf; + uinfo->mem[2].size = udev->ll2_buf_size; + uinfo->mem[2].memtype = UIO_MEM_LOGICAL; + + uinfo->name = "qedi_uio"; + uinfo->version = QEDI_MODULE_VERSION; + uinfo->irq = UIO_IRQ_CUSTOM; + + uinfo->open = qedi_uio_open; + uinfo->release = qedi_uio_close; + + if (udev->uio_dev == -1) { + if (!uinfo->priv) { + uinfo->priv = udev; + + ret = uio_register_device(&udev->pdev->dev, uinfo); + if (ret) { + QEDI_ERR(&qedi->dbg_ctx, + "UIO registration failed\n"); + } + } + } + + return ret; +} + +static int qedi_alloc_and_init_sb(struct qedi_ctx *qedi, + struct qed_sb_info *sb_info, u16 sb_id) +{ + struct status_block_e4 *sb_virt; + dma_addr_t sb_phys; + int ret; + + sb_virt = dma_alloc_coherent(&qedi->pdev->dev, + sizeof(struct status_block_e4), &sb_phys, + GFP_KERNEL); + if (!sb_virt) { + QEDI_ERR(&qedi->dbg_ctx, + "Status block allocation failed for id = %d.\n", + sb_id); + return -ENOMEM; + } + + ret = qedi_ops->common->sb_init(qedi->cdev, sb_info, sb_virt, sb_phys, + sb_id, QED_SB_TYPE_STORAGE); + if (ret) { + QEDI_ERR(&qedi->dbg_ctx, + "Status block initialization failed for id = %d.\n", + sb_id); + return ret; + } + + return 0; +} + +static void qedi_free_sb(struct qedi_ctx *qedi) +{ + struct qed_sb_info *sb_info; + int id; + + for (id = 0; id < MIN_NUM_CPUS_MSIX(qedi); id++) { + sb_info = &qedi->sb_array[id]; + if (sb_info->sb_virt) + dma_free_coherent(&qedi->pdev->dev, + sizeof(*sb_info->sb_virt), + (void *)sb_info->sb_virt, + sb_info->sb_phys); + } +} + +static void qedi_free_fp(struct qedi_ctx *qedi) +{ + kfree(qedi->fp_array); + kfree(qedi->sb_array); +} + +static void qedi_destroy_fp(struct qedi_ctx *qedi) +{ + qedi_free_sb(qedi); + qedi_free_fp(qedi); +} + +static int qedi_alloc_fp(struct qedi_ctx *qedi) +{ + int ret = 0; + + qedi->fp_array = kcalloc(MIN_NUM_CPUS_MSIX(qedi), + sizeof(struct qedi_fastpath), GFP_KERNEL); + if (!qedi->fp_array) { + QEDI_ERR(&qedi->dbg_ctx, + "fastpath fp array allocation failed.\n"); + return -ENOMEM; + } + + qedi->sb_array = kcalloc(MIN_NUM_CPUS_MSIX(qedi), + sizeof(struct qed_sb_info), GFP_KERNEL); + if (!qedi->sb_array) { + QEDI_ERR(&qedi->dbg_ctx, + "fastpath sb array allocation failed.\n"); + ret = -ENOMEM; + goto free_fp; + } + + return ret; + +free_fp: + qedi_free_fp(qedi); + return ret; +} + +static void qedi_int_fp(struct qedi_ctx *qedi) +{ + struct qedi_fastpath *fp; + int id; + + memset(qedi->fp_array, 0, MIN_NUM_CPUS_MSIX(qedi) * + sizeof(*qedi->fp_array)); + memset(qedi->sb_array, 0, MIN_NUM_CPUS_MSIX(qedi) * + sizeof(*qedi->sb_array)); + + for (id = 0; id < MIN_NUM_CPUS_MSIX(qedi); id++) { + fp = &qedi->fp_array[id]; + fp->sb_info = &qedi->sb_array[id]; + fp->sb_id = id; + fp->qedi = qedi; + snprintf(fp->name, sizeof(fp->name), "%s-fp-%d", + "qedi", id); + + /* fp_array[i] ---- irq cookie + * So init data which is needed in int ctx + */ + } +} + +static int qedi_prepare_fp(struct qedi_ctx *qedi) +{ + struct qedi_fastpath *fp; + int id, ret = 0; + + ret = qedi_alloc_fp(qedi); + if (ret) + goto err; + + qedi_int_fp(qedi); + + for (id = 0; id < MIN_NUM_CPUS_MSIX(qedi); id++) { + fp = &qedi->fp_array[id]; + ret = qedi_alloc_and_init_sb(qedi, fp->sb_info, fp->sb_id); + if (ret) { + QEDI_ERR(&qedi->dbg_ctx, + "SB allocation and initialization failed.\n"); + ret = -EIO; + goto err_init; + } + } + + return 0; + +err_init: + qedi_free_sb(qedi); + 
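+ /* also release the fp/sb arrays allocated by qedi_alloc_fp() */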
qedi_free_fp(qedi); +err: + return ret; +} + +static int qedi_setup_cid_que(struct qedi_ctx *qedi) +{ + int i; + + qedi->cid_que.cid_que_base = kmalloc_array(qedi->max_active_conns, + sizeof(u32), GFP_KERNEL); + if (!qedi->cid_que.cid_que_base) + return -ENOMEM; + + qedi->cid_que.conn_cid_tbl = kmalloc_array(qedi->max_active_conns, + sizeof(struct qedi_conn *), + GFP_KERNEL); + if (!qedi->cid_que.conn_cid_tbl) { + kfree(qedi->cid_que.cid_que_base); + qedi->cid_que.cid_que_base = NULL; + return -ENOMEM; + } + + qedi->cid_que.cid_que = (u32 *)qedi->cid_que.cid_que_base; + qedi->cid_que.cid_q_prod_idx = 0; + qedi->cid_que.cid_q_cons_idx = 0; + qedi->cid_que.cid_q_max_idx = qedi->max_active_conns; + qedi->cid_que.cid_free_cnt = qedi->max_active_conns; + + for (i = 0; i < qedi->max_active_conns; i++) { + qedi->cid_que.cid_que[i] = i; + qedi->cid_que.conn_cid_tbl[i] = NULL; + } + + return 0; +} + +static void qedi_release_cid_que(struct qedi_ctx *qedi) +{ + kfree(qedi->cid_que.cid_que_base); + qedi->cid_que.cid_que_base = NULL; + + kfree(qedi->cid_que.conn_cid_tbl); + qedi->cid_que.conn_cid_tbl = NULL; +} + +static int qedi_init_id_tbl(struct qedi_portid_tbl *id_tbl, u16 size, + u16 start_id, u16 next) +{ + id_tbl->start = start_id; + id_tbl->max = size; + id_tbl->next = next; + spin_lock_init(&id_tbl->lock); + id_tbl->table = kzalloc(DIV_ROUND_UP(size, 32) * 4, GFP_KERNEL); + if (!id_tbl->table) + return -ENOMEM; + + return 0; +} + +static void qedi_free_id_tbl(struct qedi_portid_tbl *id_tbl) +{ + kfree(id_tbl->table); + id_tbl->table = NULL; +} + +int qedi_alloc_id(struct qedi_portid_tbl *id_tbl, u16 id) +{ + int ret = -1; + + id -= id_tbl->start; + if (id >= id_tbl->max) + return ret; + + spin_lock(&id_tbl->lock); + if (!test_bit(id, id_tbl->table)) { + set_bit(id, id_tbl->table); + ret = 0; + } + spin_unlock(&id_tbl->lock); + return ret; +} + +u16 qedi_alloc_new_id(struct qedi_portid_tbl *id_tbl) +{ + u16 id; + + spin_lock(&id_tbl->lock); + id = find_next_zero_bit(id_tbl->table, id_tbl->max, id_tbl->next); + if (id >= id_tbl->max) { + id = QEDI_LOCAL_PORT_INVALID; + if (id_tbl->next != 0) { + id = find_first_zero_bit(id_tbl->table, id_tbl->next); + if (id >= id_tbl->next) + id = QEDI_LOCAL_PORT_INVALID; + } + } + + if (id < id_tbl->max) { + set_bit(id, id_tbl->table); + id_tbl->next = (id + 1) & (id_tbl->max - 1); + id += id_tbl->start; + } + + spin_unlock(&id_tbl->lock); + + return id; +} + +void qedi_free_id(struct qedi_portid_tbl *id_tbl, u16 id) +{ + if (id == QEDI_LOCAL_PORT_INVALID) + return; + + id -= id_tbl->start; + if (id >= id_tbl->max) + return; + + clear_bit(id, id_tbl->table); +} + +static void qedi_cm_free_mem(struct qedi_ctx *qedi) +{ + kfree(qedi->ep_tbl); + qedi->ep_tbl = NULL; + qedi_free_id_tbl(&qedi->lcl_port_tbl); +} + +static int qedi_cm_alloc_mem(struct qedi_ctx *qedi) +{ + u16 port_id; + + qedi->ep_tbl = kzalloc((qedi->max_active_conns * + sizeof(struct qedi_endpoint *)), GFP_KERNEL); + if (!qedi->ep_tbl) + return -ENOMEM; + port_id = prandom_u32() % QEDI_LOCAL_PORT_RANGE; + if (qedi_init_id_tbl(&qedi->lcl_port_tbl, QEDI_LOCAL_PORT_RANGE, + QEDI_LOCAL_PORT_MIN, port_id)) { + qedi_cm_free_mem(qedi); + return -ENOMEM; + } + + return 0; +} + +static struct qedi_ctx *qedi_host_alloc(struct pci_dev *pdev) +{ + struct Scsi_Host *shost; + struct qedi_ctx *qedi = NULL; + + shost = iscsi_host_alloc(&qedi_host_template, + sizeof(struct qedi_ctx), 0); + if (!shost) { + QEDI_ERR(NULL, "Could not allocate shost\n"); + goto exit_setup_shost; + } + + shost->max_id = 
QEDI_MAX_ISCSI_CONNS_PER_HBA; + shost->max_channel = 0; + shost->max_lun = ~0; + shost->max_cmd_len = 16; + shost->transportt = qedi_scsi_transport; + + qedi = iscsi_host_priv(shost); + memset(qedi, 0, sizeof(*qedi)); + qedi->shost = shost; + qedi->dbg_ctx.host_no = shost->host_no; + qedi->pdev = pdev; + qedi->dbg_ctx.pdev = pdev; + qedi->max_active_conns = ISCSI_MAX_SESS_PER_HBA; + qedi->max_sqes = QEDI_SQ_SIZE; + + if (shost_use_blk_mq(shost)) + shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi); + + pci_set_drvdata(pdev, qedi); + +exit_setup_shost: + return qedi; +} + +static int qedi_ll2_rx(void *cookie, struct sk_buff *skb, u32 arg1, u32 arg2) +{ + struct qedi_ctx *qedi = (struct qedi_ctx *)cookie; + struct qedi_uio_dev *udev; + struct qedi_uio_ctrl *uctrl; + struct skb_work_list *work; + u32 prod; + + if (!qedi) { + QEDI_ERR(NULL, "qedi is NULL\n"); + return -1; + } + + if (!test_bit(UIO_DEV_OPENED, &qedi->flags)) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_UIO, + "UIO DEV is not opened\n"); + kfree_skb(skb); + return 0; + } + + udev = qedi->udev; + uctrl = udev->uctrl; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + QEDI_WARN(&qedi->dbg_ctx, + "Could not allocate work so dropping frame.\n"); + kfree_skb(skb); + return 0; + } + + INIT_LIST_HEAD(&work->list); + work->skb = skb; + + if (skb_vlan_tag_present(skb)) + work->vlan_id = skb_vlan_tag_get(skb); + + if (work->vlan_id) + __vlan_insert_tag(work->skb, htons(ETH_P_8021Q), work->vlan_id); + + spin_lock_bh(&qedi->ll2_lock); + list_add_tail(&work->list, &qedi->ll2_skb_list); + + ++uctrl->hw_rx_prod_cnt; + prod = (uctrl->hw_rx_prod + 1) % RX_RING; + if (prod != uctrl->host_rx_cons) { + uctrl->hw_rx_prod = prod; + spin_unlock_bh(&qedi->ll2_lock); + wake_up_process(qedi->ll2_recv_thread); + return 0; + } + + spin_unlock_bh(&qedi->ll2_lock); + return 0; +} + +/* map this skb to iscsiuio mmaped region */ +static int qedi_ll2_process_skb(struct qedi_ctx *qedi, struct sk_buff *skb, + u16 vlan_id) +{ + struct qedi_uio_dev *udev = NULL; + struct qedi_uio_ctrl *uctrl = NULL; + struct qedi_rx_bd rxbd; + struct qedi_rx_bd *p_rxbd; + u32 rx_bd_prod; + void *pkt; + int len = 0; + + if (!qedi) { + QEDI_ERR(NULL, "qedi is NULL\n"); + return -1; + } + + udev = qedi->udev; + uctrl = udev->uctrl; + pkt = udev->rx_pkt + (uctrl->hw_rx_prod * LL2_SINGLE_BUF_SIZE); + len = min_t(u32, skb->len, (u32)LL2_SINGLE_BUF_SIZE); + memcpy(pkt, skb->data, len); + + memset(&rxbd, 0, sizeof(rxbd)); + rxbd.rx_pkt_index = uctrl->hw_rx_prod; + rxbd.rx_pkt_len = len; + rxbd.vlan_id = vlan_id; + + uctrl->hw_rx_bd_prod = (uctrl->hw_rx_bd_prod + 1) % QEDI_NUM_RX_BD; + rx_bd_prod = uctrl->hw_rx_bd_prod; + p_rxbd = (struct qedi_rx_bd *)udev->ll2_ring; + p_rxbd += rx_bd_prod; + + memcpy(p_rxbd, &rxbd, sizeof(rxbd)); + + /* notify the iscsiuio about new packet */ + uio_event_notify(&udev->qedi_uinfo); + + return 0; +} + +static void qedi_ll2_free_skbs(struct qedi_ctx *qedi) +{ + struct skb_work_list *work, *work_tmp; + + spin_lock_bh(&qedi->ll2_lock); + list_for_each_entry_safe(work, work_tmp, &qedi->ll2_skb_list, list) { + list_del(&work->list); + if (work->skb) + kfree_skb(work->skb); + kfree(work); + } + spin_unlock_bh(&qedi->ll2_lock); +} + +static int qedi_ll2_recv_thread(void *arg) +{ + struct qedi_ctx *qedi = (struct qedi_ctx *)arg; + struct skb_work_list *work, *work_tmp; + + set_user_nice(current, -20); + + while (!kthread_should_stop()) { + spin_lock_bh(&qedi->ll2_lock); + list_for_each_entry_safe(work, work_tmp, &qedi->ll2_skb_list, + list) { + 
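/* Copy each queued skb into the iscsiuio mmapped ring, then free it. */ +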
list_del(&work->list); + qedi_ll2_process_skb(qedi, work->skb, work->vlan_id); + kfree_skb(work->skb); + kfree(work); + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_bh(&qedi->ll2_lock); + schedule(); + } + + __set_current_state(TASK_RUNNING); + return 0; +} + +static int qedi_set_iscsi_pf_param(struct qedi_ctx *qedi) +{ + u8 num_sq_pages; + u32 log_page_size; + int rval = 0; + + + num_sq_pages = (MAX_OUSTANDING_TASKS_PER_CON * 8) / PAGE_SIZE; + + qedi->num_queues = MIN_NUM_CPUS_MSIX(qedi); + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Number of CQ count is %d\n", qedi->num_queues); + + memset(&qedi->pf_params.iscsi_pf_params, 0, + sizeof(qedi->pf_params.iscsi_pf_params)); + + qedi->p_cpuq = pci_alloc_consistent(qedi->pdev, + qedi->num_queues * sizeof(struct qedi_glbl_q_params), + &qedi->hw_p_cpuq); + if (!qedi->p_cpuq) { + QEDI_ERR(&qedi->dbg_ctx, "pci_alloc_consistent fail\n"); + rval = -1; + goto err_alloc_mem; + } + + rval = qedi_alloc_global_queues(qedi); + if (rval) { + QEDI_ERR(&qedi->dbg_ctx, "Global queue allocation failed.\n"); + rval = -1; + goto err_alloc_mem; + } + + qedi->pf_params.iscsi_pf_params.num_cons = QEDI_MAX_ISCSI_CONNS_PER_HBA; + qedi->pf_params.iscsi_pf_params.num_tasks = QEDI_MAX_ISCSI_TASK; + qedi->pf_params.iscsi_pf_params.half_way_close_timeout = 10; + qedi->pf_params.iscsi_pf_params.num_sq_pages_in_ring = num_sq_pages; + qedi->pf_params.iscsi_pf_params.num_r2tq_pages_in_ring = num_sq_pages; + qedi->pf_params.iscsi_pf_params.num_uhq_pages_in_ring = num_sq_pages; + qedi->pf_params.iscsi_pf_params.num_queues = qedi->num_queues; + qedi->pf_params.iscsi_pf_params.debug_mode = qedi_fw_debug; + qedi->pf_params.iscsi_pf_params.two_msl_timer = 4000; + qedi->pf_params.iscsi_pf_params.max_fin_rt = 2; + + for (log_page_size = 0 ; log_page_size < 32 ; log_page_size++) { + if ((1 << log_page_size) == PAGE_SIZE) + break; + } + qedi->pf_params.iscsi_pf_params.log_page_size = log_page_size; + + qedi->pf_params.iscsi_pf_params.glbl_q_params_addr = + (u64)qedi->hw_p_cpuq; + + /* RQ BDQ initializations. 
+ * rq_num_entries: suggested value for Initiator is 16 (4KB RQ) + * rqe_log_size: 8 for 256B RQE + */ + qedi->pf_params.iscsi_pf_params.rqe_log_size = 8; + /* BDQ address and size */ + qedi->pf_params.iscsi_pf_params.bdq_pbl_base_addr[BDQ_ID_RQ] = + qedi->bdq_pbl_list_dma; + qedi->pf_params.iscsi_pf_params.bdq_pbl_num_entries[BDQ_ID_RQ] = + qedi->bdq_pbl_list_num_entries; + qedi->pf_params.iscsi_pf_params.rq_buffer_size = QEDI_BDQ_BUF_SIZE; + + /* cq_num_entries: num_tasks + rq_num_entries */ + qedi->pf_params.iscsi_pf_params.cq_num_entries = 2048; + + qedi->pf_params.iscsi_pf_params.gl_rq_pi = QEDI_PROTO_CQ_PROD_IDX; + qedi->pf_params.iscsi_pf_params.gl_cmd_pi = 1; + +err_alloc_mem: + return rval; +} + +/* Free DMA coherent memory for array of queue pointers we pass to qed */ +static void qedi_free_iscsi_pf_param(struct qedi_ctx *qedi) +{ + size_t size = 0; + + if (qedi->p_cpuq) { + size = qedi->num_queues * sizeof(struct qedi_glbl_q_params); + pci_free_consistent(qedi->pdev, size, qedi->p_cpuq, + qedi->hw_p_cpuq); + } + + qedi_free_global_queues(qedi); + + kfree(qedi->global_queues); +} + +static void qedi_link_update(void *dev, struct qed_link_output *link) +{ + struct qedi_ctx *qedi = (struct qedi_ctx *)dev; + + if (link->link_up) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, "Link Up event.\n"); + atomic_set(&qedi->link_state, QEDI_LINK_UP); + } else { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Link Down event.\n"); + atomic_set(&qedi->link_state, QEDI_LINK_DOWN); + } +} + +static struct qed_iscsi_cb_ops qedi_cb_ops = { + { + .link_update = qedi_link_update, + } +}; + +static int qedi_queue_cqe(struct qedi_ctx *qedi, union iscsi_cqe *cqe, + u16 que_idx, struct qedi_percpu_s *p) +{ + struct qedi_work *qedi_work; + struct qedi_conn *q_conn; + struct iscsi_conn *conn; + struct qedi_cmd *qedi_cmd; + u32 iscsi_cid; + int rc = 0; + + iscsi_cid = cqe->cqe_common.conn_id; + q_conn = qedi->cid_que.conn_cid_tbl[iscsi_cid]; + if (!q_conn) { + QEDI_WARN(&qedi->dbg_ctx, + "Session no longer exists for cid=0x%x!!\n", + iscsi_cid); + return -1; + } + conn = q_conn->cls_conn->dd_data; + + switch (cqe->cqe_common.cqe_type) { + case ISCSI_CQE_TYPE_SOLICITED: + case ISCSI_CQE_TYPE_SOLICITED_WITH_SENSE: + qedi_cmd = qedi_get_cmd_from_tid(qedi, cqe->cqe_solicited.itid); + if (!qedi_cmd) { + rc = -1; + break; + } + INIT_LIST_HEAD(&qedi_cmd->cqe_work.list); + qedi_cmd->cqe_work.qedi = qedi; + memcpy(&qedi_cmd->cqe_work.cqe, cqe, sizeof(union iscsi_cqe)); + qedi_cmd->cqe_work.que_idx = que_idx; + qedi_cmd->cqe_work.is_solicited = true; + list_add_tail(&qedi_cmd->cqe_work.list, &p->work_list); + break; + case ISCSI_CQE_TYPE_UNSOLICITED: + case ISCSI_CQE_TYPE_DUMMY: + case ISCSI_CQE_TYPE_TASK_CLEANUP: + qedi_work = kzalloc(sizeof(*qedi_work), GFP_ATOMIC); + if (!qedi_work) { + rc = -1; + break; + } + INIT_LIST_HEAD(&qedi_work->list); + qedi_work->qedi = qedi; + memcpy(&qedi_work->cqe, cqe, sizeof(union iscsi_cqe)); + qedi_work->que_idx = que_idx; + qedi_work->is_solicited = false; + list_add_tail(&qedi_work->list, &p->work_list); + break; + default: + rc = -1; + QEDI_ERR(&qedi->dbg_ctx, "FW Error cqe.\n"); + } + return rc; +} + +static bool qedi_process_completions(struct qedi_fastpath *fp) +{ + struct qedi_ctx *qedi = fp->qedi; + struct qed_sb_info *sb_info = fp->sb_info; + struct status_block_e4 *sb = sb_info->sb_virt; + struct qedi_percpu_s *p = NULL; + struct global_queue *que; + u16 prod_idx; + unsigned long flags; + union iscsi_cqe *cqe; + int cpu; + int ret; + + /* Get the current firmware producer 
index */ + prod_idx = sb->pi_array[QEDI_PROTO_CQ_PROD_IDX]; + + if (prod_idx >= QEDI_CQ_SIZE) + prod_idx = prod_idx % QEDI_CQ_SIZE; + + que = qedi->global_queues[fp->sb_id]; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_IO, + "Before: global queue=%p prod_idx=%d cons_idx=%d, sb_id=%d\n", + que, prod_idx, que->cq_cons_idx, fp->sb_id); + + qedi->intr_cpu = fp->sb_id; + cpu = smp_processor_id(); + p = &per_cpu(qedi_percpu, cpu); + + if (unlikely(!p->iothread)) + WARN_ON(1); + + spin_lock_irqsave(&p->p_work_lock, flags); + while (que->cq_cons_idx != prod_idx) { + cqe = &que->cq[que->cq_cons_idx]; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_IO, + "cqe=%p prod_idx=%d cons_idx=%d.\n", + cqe, prod_idx, que->cq_cons_idx); + + ret = qedi_queue_cqe(qedi, cqe, fp->sb_id, p); + if (ret) + QEDI_WARN(&qedi->dbg_ctx, + "Dropping CQE 0x%x for cid=0x%x.\n", + que->cq_cons_idx, cqe->cqe_common.conn_id); + + que->cq_cons_idx++; + if (que->cq_cons_idx == QEDI_CQ_SIZE) + que->cq_cons_idx = 0; + } + wake_up_process(p->iothread); + spin_unlock_irqrestore(&p->p_work_lock, flags); + + return true; +} + +static bool qedi_fp_has_work(struct qedi_fastpath *fp) +{ + struct qedi_ctx *qedi = fp->qedi; + struct global_queue *que; + struct qed_sb_info *sb_info = fp->sb_info; + struct status_block_e4 *sb = sb_info->sb_virt; + u16 prod_idx; + + barrier(); + + /* Get the current firmware producer index */ + prod_idx = sb->pi_array[QEDI_PROTO_CQ_PROD_IDX]; + + /* Get the pointer to the global CQ this completion is on */ + que = qedi->global_queues[fp->sb_id]; + + /* prod idx wrap around uint16 */ + if (prod_idx >= QEDI_CQ_SIZE) + prod_idx = prod_idx % QEDI_CQ_SIZE; + + return (que->cq_cons_idx != prod_idx); +} + +/* MSI-X fastpath handler code */ +static irqreturn_t qedi_msix_handler(int irq, void *dev_id) +{ + struct qedi_fastpath *fp = dev_id; + struct qedi_ctx *qedi = fp->qedi; + bool wake_io_thread = true; + + qed_sb_ack(fp->sb_info, IGU_INT_DISABLE, 0); + +process_again: + wake_io_thread = qedi_process_completions(fp); + if (wake_io_thread) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_DISC, + "process already running\n"); + } + + if (qedi_fp_has_work(fp) == 0) + qed_sb_update_sb_idx(fp->sb_info); + + /* Check for more work */ + rmb(); + + if (qedi_fp_has_work(fp) == 0) + qed_sb_ack(fp->sb_info, IGU_INT_ENABLE, 1); + else + goto process_again; + + return IRQ_HANDLED; +} + +/* simd handler for MSI/INTa */ +static void qedi_simd_int_handler(void *cookie) +{ + /* Cookie is qedi_ctx struct */ + struct qedi_ctx *qedi = (struct qedi_ctx *)cookie; + + QEDI_WARN(&qedi->dbg_ctx, "qedi=%p.\n", qedi); +} + +#define QEDI_SIMD_HANDLER_NUM 0 +static void qedi_sync_free_irqs(struct qedi_ctx *qedi) +{ + int i; + + if (qedi->int_info.msix_cnt) { + for (i = 0; i < qedi->int_info.used_cnt; i++) { + synchronize_irq(qedi->int_info.msix[i].vector); + irq_set_affinity_hint(qedi->int_info.msix[i].vector, + NULL); + free_irq(qedi->int_info.msix[i].vector, + &qedi->fp_array[i]); + } + } else { + qedi_ops->common->simd_handler_clean(qedi->cdev, + QEDI_SIMD_HANDLER_NUM); + } + + qedi->int_info.used_cnt = 0; + qedi_ops->common->set_fp_int(qedi->cdev, 0); +} + +static int qedi_request_msix_irq(struct qedi_ctx *qedi) +{ + int i, rc, cpu; + + cpu = cpumask_first(cpu_online_mask); + for (i = 0; i < MIN_NUM_CPUS_MSIX(qedi); i++) { + rc = request_irq(qedi->int_info.msix[i].vector, + qedi_msix_handler, 0, "qedi", + &qedi->fp_array[i]); + + if (rc) { + QEDI_WARN(&qedi->dbg_ctx, "request_irq failed.\n"); + qedi_sync_free_irqs(qedi); + return rc; + } + qedi->int_info.used_cnt++; + 
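/* Hint each MSI-X vector at a different online CPU. */ +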
rc = irq_set_affinity_hint(qedi->int_info.msix[i].vector, + get_cpu_mask(cpu)); + cpu = cpumask_next(cpu, cpu_online_mask); + } + + return 0; +} + +static int qedi_setup_int(struct qedi_ctx *qedi) +{ + int rc = 0; + + rc = qedi_ops->common->set_fp_int(qedi->cdev, num_online_cpus()); + rc = qedi_ops->common->get_fp_int(qedi->cdev, &qedi->int_info); + if (rc) + goto exit_setup_int; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_DISC, + "Number of msix_cnt = 0x%x num of cpus = 0x%x\n", + qedi->int_info.msix_cnt, num_online_cpus()); + + if (qedi->int_info.msix_cnt) { + rc = qedi_request_msix_irq(qedi); + goto exit_setup_int; + } else { + qedi_ops->common->simd_handler_config(qedi->cdev, &qedi, + QEDI_SIMD_HANDLER_NUM, + qedi_simd_int_handler); + qedi->int_info.used_cnt = 1; + } + +exit_setup_int: + return rc; +} + +static void qedi_free_nvm_iscsi_cfg(struct qedi_ctx *qedi) +{ + if (qedi->iscsi_cfg) + dma_free_coherent(&qedi->pdev->dev, + sizeof(struct nvm_iscsi_cfg), + qedi->iscsi_cfg, qedi->nvm_buf_dma); +} + +static int qedi_alloc_nvm_iscsi_cfg(struct qedi_ctx *qedi) +{ + qedi->iscsi_cfg = dma_zalloc_coherent(&qedi->pdev->dev, + sizeof(struct nvm_iscsi_cfg), + &qedi->nvm_buf_dma, GFP_KERNEL); + if (!qedi->iscsi_cfg) { + QEDI_ERR(&qedi->dbg_ctx, "Could not allocate NVM BUF.\n"); + return -ENOMEM; + } + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "NVM BUF addr=0x%p dma=0x%llx.\n", qedi->iscsi_cfg, + qedi->nvm_buf_dma); + + return 0; +} + +static void qedi_free_bdq(struct qedi_ctx *qedi) +{ + int i; + + if (qedi->bdq_pbl_list) + dma_free_coherent(&qedi->pdev->dev, PAGE_SIZE, + qedi->bdq_pbl_list, qedi->bdq_pbl_list_dma); + + if (qedi->bdq_pbl) + dma_free_coherent(&qedi->pdev->dev, qedi->bdq_pbl_mem_size, + qedi->bdq_pbl, qedi->bdq_pbl_dma); + + for (i = 0; i < QEDI_BDQ_NUM; i++) { + if (qedi->bdq[i].buf_addr) { + dma_free_coherent(&qedi->pdev->dev, QEDI_BDQ_BUF_SIZE, + qedi->bdq[i].buf_addr, + qedi->bdq[i].buf_dma); + } + } +} + +static void qedi_free_global_queues(struct qedi_ctx *qedi) +{ + int i; + struct global_queue **gl = qedi->global_queues; + + for (i = 0; i < qedi->num_queues; i++) { + if (!gl[i]) + continue; + + if (gl[i]->cq) + dma_free_coherent(&qedi->pdev->dev, gl[i]->cq_mem_size, + gl[i]->cq, gl[i]->cq_dma); + if (gl[i]->cq_pbl) + dma_free_coherent(&qedi->pdev->dev, gl[i]->cq_pbl_size, + gl[i]->cq_pbl, gl[i]->cq_pbl_dma); + + kfree(gl[i]); + } + qedi_free_bdq(qedi); + qedi_free_nvm_iscsi_cfg(qedi); +} + +static int qedi_alloc_bdq(struct qedi_ctx *qedi) +{ + int i; + struct scsi_bd *pbl; + u64 *list; + dma_addr_t page; + + /* Alloc dma memory for BDQ buffers */ + for (i = 0; i < QEDI_BDQ_NUM; i++) { + qedi->bdq[i].buf_addr = + dma_alloc_coherent(&qedi->pdev->dev, + QEDI_BDQ_BUF_SIZE, + &qedi->bdq[i].buf_dma, + GFP_KERNEL); + if (!qedi->bdq[i].buf_addr) { + QEDI_ERR(&qedi->dbg_ctx, + "Could not allocate BDQ buffer %d.\n", i); + return -ENOMEM; + } + } + + /* Alloc dma memory for BDQ page buffer list */ + qedi->bdq_pbl_mem_size = QEDI_BDQ_NUM * sizeof(struct scsi_bd); + qedi->bdq_pbl_mem_size = ALIGN(qedi->bdq_pbl_mem_size, PAGE_SIZE); + qedi->rq_num_entries = qedi->bdq_pbl_mem_size / sizeof(struct scsi_bd); + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, "rq_num_entries = %d.\n", + qedi->rq_num_entries); + + qedi->bdq_pbl = dma_alloc_coherent(&qedi->pdev->dev, + qedi->bdq_pbl_mem_size, + &qedi->bdq_pbl_dma, GFP_KERNEL); + if (!qedi->bdq_pbl) { + QEDI_ERR(&qedi->dbg_ctx, "Could not allocate BDQ PBL.\n"); + return -ENOMEM; + } + + /* + * Populate BDQ PBL with physical and virtual address of 
individual + * BDQ buffers + */ + pbl = (struct scsi_bd *)qedi->bdq_pbl; + for (i = 0; i < QEDI_BDQ_NUM; i++) { + pbl->address.hi = + cpu_to_le32(QEDI_U64_HI(qedi->bdq[i].buf_dma)); + pbl->address.lo = + cpu_to_le32(QEDI_U64_LO(qedi->bdq[i].buf_dma)); + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "pbl [0x%p] pbl->address hi [0x%llx] lo [0x%llx], idx [%d]\n", + pbl, pbl->address.hi, pbl->address.lo, i); + pbl->opaque.iscsi_opaque.reserved_zero[0] = 0; + pbl->opaque.iscsi_opaque.reserved_zero[1] = 0; + pbl->opaque.iscsi_opaque.reserved_zero[2] = 0; + pbl->opaque.iscsi_opaque.opaque = cpu_to_le16(i); + pbl++; + } + + /* Allocate list of PBL pages */ + qedi->bdq_pbl_list = dma_zalloc_coherent(&qedi->pdev->dev, PAGE_SIZE, + &qedi->bdq_pbl_list_dma, + GFP_KERNEL); + if (!qedi->bdq_pbl_list) { + QEDI_ERR(&qedi->dbg_ctx, + "Could not allocate list of PBL pages.\n"); + return -ENOMEM; + } + + /* + * Now populate PBL list with pages that contain pointers to the + * individual buffers. + */ + qedi->bdq_pbl_list_num_entries = qedi->bdq_pbl_mem_size / PAGE_SIZE; + list = (u64 *)qedi->bdq_pbl_list; + page = qedi->bdq_pbl_list_dma; + for (i = 0; i < qedi->bdq_pbl_list_num_entries; i++) { + *list = qedi->bdq_pbl_dma; + list++; + page += PAGE_SIZE; + } + + return 0; +} + +static int qedi_alloc_global_queues(struct qedi_ctx *qedi) +{ + u32 *list; + int i; + int status = 0, rc; + u32 *pbl; + dma_addr_t page; + int num_pages; + + /* + * Number of global queues (CQ / RQ). This should + * be <= number of available MSIX vectors for the PF + */ + if (!qedi->num_queues) { + QEDI_ERR(&qedi->dbg_ctx, "No MSI-X vectors available!\n"); + return 1; + } + + /* Make sure we allocated the PBL that will contain the physical + * addresses of our queues + */ + if (!qedi->p_cpuq) { + status = 1; + goto mem_alloc_failure; + } + + qedi->global_queues = kzalloc((sizeof(struct global_queue *) * + qedi->num_queues), GFP_KERNEL); + if (!qedi->global_queues) { + QEDI_ERR(&qedi->dbg_ctx, + "Unable to allocate global queues array ptr memory\n"); + return -ENOMEM; + } + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_DISC, + "qedi->global_queues=%p.\n", qedi->global_queues); + + /* Allocate DMA coherent buffers for BDQ */ + rc = qedi_alloc_bdq(qedi); + if (rc) + goto mem_alloc_failure; + + /* Allocate DMA coherent buffers for NVM_ISCSI_CFG */ + rc = qedi_alloc_nvm_iscsi_cfg(qedi); + if (rc) + goto mem_alloc_failure; + + /* Allocate a CQ and an associated PBL for each MSI-X + * vector. 
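+ * Each CQ PBL holds one 64-bit DMA address per CQ page, stored as + * low/high 32-bit words by the PBL fill loop below.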
+ */ + for (i = 0; i < qedi->num_queues; i++) { + qedi->global_queues[i] = + kzalloc(sizeof(*qedi->global_queues[0]), + GFP_KERNEL); + if (!qedi->global_queues[i]) { + QEDI_ERR(&qedi->dbg_ctx, + "Unable to allocation global queue %d.\n", i); + goto mem_alloc_failure; + } + + qedi->global_queues[i]->cq_mem_size = + (QEDI_CQ_SIZE + 8) * sizeof(union iscsi_cqe); + qedi->global_queues[i]->cq_mem_size = + (qedi->global_queues[i]->cq_mem_size + + (QEDI_PAGE_SIZE - 1)); + + qedi->global_queues[i]->cq_pbl_size = + (qedi->global_queues[i]->cq_mem_size / + QEDI_PAGE_SIZE) * sizeof(void *); + qedi->global_queues[i]->cq_pbl_size = + (qedi->global_queues[i]->cq_pbl_size + + (QEDI_PAGE_SIZE - 1)); + + qedi->global_queues[i]->cq = dma_zalloc_coherent(&qedi->pdev->dev, + qedi->global_queues[i]->cq_mem_size, + &qedi->global_queues[i]->cq_dma, + GFP_KERNEL); + + if (!qedi->global_queues[i]->cq) { + QEDI_WARN(&qedi->dbg_ctx, + "Could not allocate cq.\n"); + status = -ENOMEM; + goto mem_alloc_failure; + } + qedi->global_queues[i]->cq_pbl = dma_zalloc_coherent(&qedi->pdev->dev, + qedi->global_queues[i]->cq_pbl_size, + &qedi->global_queues[i]->cq_pbl_dma, + GFP_KERNEL); + + if (!qedi->global_queues[i]->cq_pbl) { + QEDI_WARN(&qedi->dbg_ctx, + "Could not allocate cq PBL.\n"); + status = -ENOMEM; + goto mem_alloc_failure; + } + + /* Create PBL */ + num_pages = qedi->global_queues[i]->cq_mem_size / + QEDI_PAGE_SIZE; + page = qedi->global_queues[i]->cq_dma; + pbl = (u32 *)qedi->global_queues[i]->cq_pbl; + + while (num_pages--) { + *pbl = (u32)page; + pbl++; + *pbl = (u32)((u64)page >> 32); + pbl++; + page += QEDI_PAGE_SIZE; + } + } + + list = (u32 *)qedi->p_cpuq; + + /* + * The list is built as follows: CQ#0 PBL pointer, RQ#0 PBL pointer, + * CQ#1 PBL pointer, RQ#1 PBL pointer, etc. Each PBL pointer points + * to the physical address which contains an array of pointers to the + * physical addresses of the specific queue pages. 
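+ * Only the CQ slots are used here, so each RQ PBL pointer is written + * as zero in the loop below.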
+ */ + for (i = 0; i < qedi->num_queues; i++) { + *list = (u32)qedi->global_queues[i]->cq_pbl_dma; + list++; + *list = (u32)((u64)qedi->global_queues[i]->cq_pbl_dma >> 32); + list++; + + *list = (u32)0; + list++; + *list = (u32)((u64)0 >> 32); + list++; + } + + return 0; + +mem_alloc_failure: + qedi_free_global_queues(qedi); + return status; +} + +int qedi_alloc_sq(struct qedi_ctx *qedi, struct qedi_endpoint *ep) +{ + int rval = 0; + u32 *pbl; + dma_addr_t page; + int num_pages; + + if (!ep) + return -EIO; + + /* Calculate appropriate queue and PBL sizes */ + ep->sq_mem_size = QEDI_SQ_SIZE * sizeof(struct iscsi_wqe); + ep->sq_mem_size += QEDI_PAGE_SIZE - 1; + + ep->sq_pbl_size = (ep->sq_mem_size / QEDI_PAGE_SIZE) * sizeof(void *); + ep->sq_pbl_size = ep->sq_pbl_size + QEDI_PAGE_SIZE; + + ep->sq = dma_zalloc_coherent(&qedi->pdev->dev, ep->sq_mem_size, + &ep->sq_dma, GFP_KERNEL); + if (!ep->sq) { + QEDI_WARN(&qedi->dbg_ctx, + "Could not allocate send queue.\n"); + rval = -ENOMEM; + goto out; + } + ep->sq_pbl = dma_zalloc_coherent(&qedi->pdev->dev, ep->sq_pbl_size, + &ep->sq_pbl_dma, GFP_KERNEL); + if (!ep->sq_pbl) { + QEDI_WARN(&qedi->dbg_ctx, + "Could not allocate send queue PBL.\n"); + rval = -ENOMEM; + goto out_free_sq; + } + + /* Create PBL */ + num_pages = ep->sq_mem_size / QEDI_PAGE_SIZE; + page = ep->sq_dma; + pbl = (u32 *)ep->sq_pbl; + + while (num_pages--) { + *pbl = (u32)page; + pbl++; + *pbl = (u32)((u64)page >> 32); + pbl++; + page += QEDI_PAGE_SIZE; + } + + return rval; + +out_free_sq: + dma_free_coherent(&qedi->pdev->dev, ep->sq_mem_size, ep->sq, + ep->sq_dma); +out: + return rval; +} + +void qedi_free_sq(struct qedi_ctx *qedi, struct qedi_endpoint *ep) +{ + if (ep->sq_pbl) + dma_free_coherent(&qedi->pdev->dev, ep->sq_pbl_size, ep->sq_pbl, + ep->sq_pbl_dma); + if (ep->sq) + dma_free_coherent(&qedi->pdev->dev, ep->sq_mem_size, ep->sq, + ep->sq_dma); +} + +int qedi_get_task_idx(struct qedi_ctx *qedi) +{ + s16 tmp_idx; + +again: + tmp_idx = find_first_zero_bit(qedi->task_idx_map, + MAX_ISCSI_TASK_ENTRIES); + + if (tmp_idx >= MAX_ISCSI_TASK_ENTRIES) { + QEDI_ERR(&qedi->dbg_ctx, "FW task context pool is full.\n"); + tmp_idx = -1; + goto err_idx; + } + + if (test_and_set_bit(tmp_idx, qedi->task_idx_map)) + goto again; + +err_idx: + return tmp_idx; +} + +void qedi_clear_task_idx(struct qedi_ctx *qedi, int idx) +{ + if (!test_and_clear_bit(idx, qedi->task_idx_map)) + QEDI_ERR(&qedi->dbg_ctx, + "FW task context, already cleared, tid=0x%x\n", idx); +} + +void qedi_update_itt_map(struct qedi_ctx *qedi, u32 tid, u32 proto_itt, + struct qedi_cmd *cmd) +{ + qedi->itt_map[tid].itt = proto_itt; + qedi->itt_map[tid].p_cmd = cmd; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "update itt map tid=0x%x, with proto itt=0x%x\n", tid, + qedi->itt_map[tid].itt); +} + +void qedi_get_task_tid(struct qedi_ctx *qedi, u32 itt, s16 *tid) +{ + u16 i; + + for (i = 0; i < MAX_ISCSI_TASK_ENTRIES; i++) { + if (qedi->itt_map[i].itt == itt) { + *tid = i; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "Ref itt=0x%x, found at tid=0x%x\n", + itt, *tid); + return; + } + } + + WARN_ON(1); +} + +void qedi_get_proto_itt(struct qedi_ctx *qedi, u32 tid, u32 *proto_itt) +{ + *proto_itt = qedi->itt_map[tid].itt; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_CONN, + "Get itt map tid [0x%x with proto itt[0x%x]", + tid, *proto_itt); +} + +struct qedi_cmd *qedi_get_cmd_from_tid(struct qedi_ctx *qedi, u32 tid) +{ + struct qedi_cmd *cmd = NULL; + + if (tid >= MAX_ISCSI_TASK_ENTRIES) + return NULL; + + cmd = qedi->itt_map[tid].p_cmd; + if 
(cmd->task_id != tid) + return NULL; + + qedi->itt_map[tid].p_cmd = NULL; + + return cmd; +} + +static int qedi_alloc_itt(struct qedi_ctx *qedi) +{ + qedi->itt_map = kcalloc(MAX_ISCSI_TASK_ENTRIES, + sizeof(struct qedi_itt_map), GFP_KERNEL); + if (!qedi->itt_map) { + QEDI_ERR(&qedi->dbg_ctx, + "Unable to allocate itt map array memory\n"); + return -ENOMEM; + } + return 0; +} + +static void qedi_free_itt(struct qedi_ctx *qedi) +{ + kfree(qedi->itt_map); +} + +static struct qed_ll2_cb_ops qedi_ll2_cb_ops = { + .rx_cb = qedi_ll2_rx, + .tx_cb = NULL, +}; + +static int qedi_percpu_io_thread(void *arg) +{ + struct qedi_percpu_s *p = arg; + struct qedi_work *work, *tmp; + unsigned long flags; + LIST_HEAD(work_list); + + set_user_nice(current, -20); + + while (!kthread_should_stop()) { + spin_lock_irqsave(&p->p_work_lock, flags); + while (!list_empty(&p->work_list)) { + list_splice_init(&p->work_list, &work_list); + spin_unlock_irqrestore(&p->p_work_lock, flags); + + list_for_each_entry_safe(work, tmp, &work_list, list) { + list_del_init(&work->list); + qedi_fp_process_cqes(work); + if (!work->is_solicited) + kfree(work); + } + cond_resched(); + spin_lock_irqsave(&p->p_work_lock, flags); + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&p->p_work_lock, flags); + schedule(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +static int qedi_cpu_online(unsigned int cpu) +{ + struct qedi_percpu_s *p = this_cpu_ptr(&qedi_percpu); + struct task_struct *thread; + + thread = kthread_create_on_node(qedi_percpu_io_thread, (void *)p, + cpu_to_node(cpu), + "qedi_thread/%d", cpu); + if (IS_ERR(thread)) + return PTR_ERR(thread); + + kthread_bind(thread, cpu); + p->iothread = thread; + wake_up_process(thread); + return 0; +} + +static int qedi_cpu_offline(unsigned int cpu) +{ + struct qedi_percpu_s *p = this_cpu_ptr(&qedi_percpu); + struct qedi_work *work, *tmp; + struct task_struct *thread; + + spin_lock_bh(&p->p_work_lock); + thread = p->iothread; + p->iothread = NULL; + + list_for_each_entry_safe(work, tmp, &p->work_list, list) { + list_del_init(&work->list); + qedi_fp_process_cqes(work); + if (!work->is_solicited) + kfree(work); + } + + spin_unlock_bh(&p->p_work_lock); + if (thread) + kthread_stop(thread); + return 0; +} + +void qedi_reset_host_mtu(struct qedi_ctx *qedi, u16 mtu) +{ + struct qed_ll2_params params; + + qedi_recover_all_conns(qedi); + + qedi_ops->ll2->stop(qedi->cdev); + qedi_ll2_free_skbs(qedi); + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, "old MTU %u, new MTU %u\n", + qedi->ll2_mtu, mtu); + memset(¶ms, 0, sizeof(params)); + qedi->ll2_mtu = mtu; + params.mtu = qedi->ll2_mtu + IPV6_HDR_LEN + TCP_HDR_LEN; + params.drop_ttl0_packets = 0; + params.rx_vlan_stripping = 1; + ether_addr_copy(params.ll2_mac_address, qedi->dev_info.common.hw_mac); + qedi_ops->ll2->start(qedi->cdev, ¶ms); +} + +/** + * qedi_get_nvram_block: - Scan through the iSCSI NVRAM block (while accounting + * for gaps) for the matching absolute-pf-id of the QEDI device. 
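+ * + * Return: the matching NVRAM block, or NULL if no enabled block is + * mapped to this PF.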
+ */ +static struct nvm_iscsi_block * +qedi_get_nvram_block(struct qedi_ctx *qedi) +{ + int i; + u8 pf; + u32 flags; + struct nvm_iscsi_block *block; + + pf = qedi->dev_info.common.abs_pf_id; + block = &qedi->iscsi_cfg->block[0]; + for (i = 0; i < NUM_OF_ISCSI_PF_SUPPORTED; i++, block++) { + flags = ((block->id) & NVM_ISCSI_CFG_BLK_CTRL_FLAG_MASK) >> + NVM_ISCSI_CFG_BLK_CTRL_FLAG_OFFSET; + if (flags & (NVM_ISCSI_CFG_BLK_CTRL_FLAG_IS_NOT_EMPTY | + NVM_ISCSI_CFG_BLK_CTRL_FLAG_PF_MAPPED) && + (pf == (block->id & NVM_ISCSI_CFG_BLK_MAPPED_PF_ID_MASK) + >> NVM_ISCSI_CFG_BLK_MAPPED_PF_ID_OFFSET)) + return block; + } + return NULL; +} + +static ssize_t qedi_show_boot_eth_info(void *data, int type, char *buf) +{ + struct qedi_ctx *qedi = data; + struct nvm_iscsi_initiator *initiator; + int rc = 1; + u32 ipv6_en, dhcp_en, ip_len; + struct nvm_iscsi_block *block; + char *fmt, *ip, *sub, *gw; + + block = qedi_get_nvram_block(qedi); + if (!block) + return 0; + + initiator = &block->initiator; + ipv6_en = block->generic.ctrl_flags & + NVM_ISCSI_CFG_GEN_IPV6_ENABLED; + dhcp_en = block->generic.ctrl_flags & + NVM_ISCSI_CFG_GEN_DHCP_TCPIP_CONFIG_ENABLED; + /* Static IP assignments. */ + fmt = ipv6_en ? "%pI6\n" : "%pI4\n"; + ip = ipv6_en ? initiator->ipv6.addr.byte : initiator->ipv4.addr.byte; + ip_len = ipv6_en ? IPV6_LEN : IPV4_LEN; + sub = ipv6_en ? initiator->ipv6.subnet_mask.byte : + initiator->ipv4.subnet_mask.byte; + gw = ipv6_en ? initiator->ipv6.gateway.byte : + initiator->ipv4.gateway.byte; + /* DHCP IP adjustments. */ + fmt = dhcp_en ? "%s\n" : fmt; + if (dhcp_en) { + ip = ipv6_en ? "0::0" : "0.0.0.0"; + sub = ip; + gw = ip; + ip_len = ipv6_en ? 5 : 8; + } + + switch (type) { + case ISCSI_BOOT_ETH_IP_ADDR: + rc = snprintf(buf, ip_len, fmt, ip); + break; + case ISCSI_BOOT_ETH_SUBNET_MASK: + rc = snprintf(buf, ip_len, fmt, sub); + break; + case ISCSI_BOOT_ETH_GATEWAY: + rc = snprintf(buf, ip_len, fmt, gw); + break; + case ISCSI_BOOT_ETH_FLAGS: + rc = snprintf(buf, 3, "%hhd\n", + SYSFS_FLAG_FW_SEL_BOOT); + break; + case ISCSI_BOOT_ETH_INDEX: + rc = snprintf(buf, 3, "0\n"); + break; + case ISCSI_BOOT_ETH_MAC: + rc = sysfs_format_mac(buf, qedi->mac, ETH_ALEN); + break; + case ISCSI_BOOT_ETH_VLAN: + rc = snprintf(buf, 12, "%d\n", + GET_FIELD2(initiator->generic_cont0, + NVM_ISCSI_CFG_INITIATOR_VLAN)); + break; + case ISCSI_BOOT_ETH_ORIGIN: + if (dhcp_en) + rc = snprintf(buf, 3, "3\n"); + break; + default: + rc = 0; + break; + } + + return rc; +} + +static umode_t qedi_eth_get_attr_visibility(void *data, int type) +{ + int rc = 1; + + switch (type) { + case ISCSI_BOOT_ETH_FLAGS: + case ISCSI_BOOT_ETH_MAC: + case ISCSI_BOOT_ETH_INDEX: + case ISCSI_BOOT_ETH_IP_ADDR: + case ISCSI_BOOT_ETH_SUBNET_MASK: + case ISCSI_BOOT_ETH_GATEWAY: + case ISCSI_BOOT_ETH_ORIGIN: + case ISCSI_BOOT_ETH_VLAN: + rc = 0444; + break; + default: + rc = 0; + break; + } + return rc; +} + +static ssize_t qedi_show_boot_ini_info(void *data, int type, char *buf) +{ + struct qedi_ctx *qedi = data; + struct nvm_iscsi_initiator *initiator; + int rc; + struct nvm_iscsi_block *block; + + block = qedi_get_nvram_block(qedi); + if (!block) + return 0; + + initiator = &block->initiator; + + switch (type) { + case ISCSI_BOOT_INI_INITIATOR_NAME: + rc = sprintf(buf, "%.*s\n", NVM_ISCSI_CFG_ISCSI_NAME_MAX_LEN, + initiator->initiator_name.byte); + break; + default: + rc = 0; + break; + } + return rc; +} + +static umode_t qedi_ini_get_attr_visibility(void *data, int type) +{ + int rc; + + switch (type) { + case ISCSI_BOOT_INI_INITIATOR_NAME: + rc = 
0444; + break; + default: + rc = 0; + break; + } + return rc; +} + +static ssize_t +qedi_show_boot_tgt_info(struct qedi_ctx *qedi, int type, + char *buf, enum qedi_nvm_tgts idx) +{ + int rc = 1; + u32 ctrl_flags, ipv6_en, chap_en, mchap_en, ip_len; + struct nvm_iscsi_block *block; + char *chap_name, *chap_secret; + char *mchap_name, *mchap_secret; + + block = qedi_get_nvram_block(qedi); + if (!block) + goto exit_show_tgt_info; + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_EVT, + "Port:%d, tgt_idx:%d\n", + GET_FIELD2(block->id, NVM_ISCSI_CFG_BLK_MAPPED_PF_ID), idx); + + ctrl_flags = block->target[idx].ctrl_flags & + NVM_ISCSI_CFG_TARGET_ENABLED; + + if (!ctrl_flags) { + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_EVT, + "Target disabled\n"); + goto exit_show_tgt_info; + } + + ipv6_en = block->generic.ctrl_flags & + NVM_ISCSI_CFG_GEN_IPV6_ENABLED; + ip_len = ipv6_en ? IPV6_LEN : IPV4_LEN; + chap_en = block->generic.ctrl_flags & + NVM_ISCSI_CFG_GEN_CHAP_ENABLED; + chap_name = chap_en ? block->initiator.chap_name.byte : NULL; + chap_secret = chap_en ? block->initiator.chap_password.byte : NULL; + + mchap_en = block->generic.ctrl_flags & + NVM_ISCSI_CFG_GEN_CHAP_MUTUAL_ENABLED; + mchap_name = mchap_en ? block->target[idx].chap_name.byte : NULL; + mchap_secret = mchap_en ? block->target[idx].chap_password.byte : NULL; + + switch (type) { + case ISCSI_BOOT_TGT_NAME: + rc = sprintf(buf, "%.*s\n", NVM_ISCSI_CFG_ISCSI_NAME_MAX_LEN, + block->target[idx].target_name.byte); + break; + case ISCSI_BOOT_TGT_IP_ADDR: + if (ipv6_en) + rc = snprintf(buf, ip_len, "%pI6\n", + block->target[idx].ipv6_addr.byte); + else + rc = snprintf(buf, ip_len, "%pI4\n", + block->target[idx].ipv4_addr.byte); + break; + case ISCSI_BOOT_TGT_PORT: + rc = snprintf(buf, 12, "%d\n", + GET_FIELD2(block->target[idx].generic_cont0, + NVM_ISCSI_CFG_TARGET_TCP_PORT)); + break; + case ISCSI_BOOT_TGT_LUN: + rc = snprintf(buf, 22, "%.*d\n", + block->target[idx].lun.value[1], + block->target[idx].lun.value[0]); + break; + case ISCSI_BOOT_TGT_CHAP_NAME: + rc = sprintf(buf, "%.*s\n", NVM_ISCSI_CFG_CHAP_NAME_MAX_LEN, + chap_name); + break; + case ISCSI_BOOT_TGT_CHAP_SECRET: + rc = sprintf(buf, "%.*s\n", NVM_ISCSI_CFG_CHAP_NAME_MAX_LEN, + chap_secret); + break; + case ISCSI_BOOT_TGT_REV_CHAP_NAME: + rc = sprintf(buf, "%.*s\n", NVM_ISCSI_CFG_CHAP_NAME_MAX_LEN, + mchap_name); + break; + case ISCSI_BOOT_TGT_REV_CHAP_SECRET: + rc = sprintf(buf, "%.*s\n", NVM_ISCSI_CFG_CHAP_NAME_MAX_LEN, + mchap_secret); + break; + case ISCSI_BOOT_TGT_FLAGS: + rc = snprintf(buf, 3, "%hhd\n", SYSFS_FLAG_FW_SEL_BOOT); + break; + case ISCSI_BOOT_TGT_NIC_ASSOC: + rc = snprintf(buf, 3, "0\n"); + break; + default: + rc = 0; + break; + } + +exit_show_tgt_info: + return rc; +} + +static ssize_t qedi_show_boot_tgt_pri_info(void *data, int type, char *buf) +{ + struct qedi_ctx *qedi = data; + + return qedi_show_boot_tgt_info(qedi, type, buf, QEDI_NVM_TGT_PRI); +} + +static ssize_t qedi_show_boot_tgt_sec_info(void *data, int type, char *buf) +{ + struct qedi_ctx *qedi = data; + + return qedi_show_boot_tgt_info(qedi, type, buf, QEDI_NVM_TGT_SEC); +} + +static umode_t qedi_tgt_get_attr_visibility(void *data, int type) +{ + int rc; + + switch (type) { + case ISCSI_BOOT_TGT_NAME: + case ISCSI_BOOT_TGT_IP_ADDR: + case ISCSI_BOOT_TGT_PORT: + case ISCSI_BOOT_TGT_LUN: + case ISCSI_BOOT_TGT_CHAP_NAME: + case ISCSI_BOOT_TGT_CHAP_SECRET: + case ISCSI_BOOT_TGT_REV_CHAP_NAME: + case ISCSI_BOOT_TGT_REV_CHAP_SECRET: + case ISCSI_BOOT_TGT_NIC_ASSOC: + case ISCSI_BOOT_TGT_FLAGS: + rc = 0444; + break; + 
default: + rc = 0; + break; + } + return rc; +} + +static void qedi_boot_release(void *data) +{ + struct qedi_ctx *qedi = data; + + scsi_host_put(qedi->shost); +} + +static int qedi_get_boot_info(struct qedi_ctx *qedi) +{ + int ret = 1; + u16 len; + + len = sizeof(struct nvm_iscsi_cfg); + + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "Get NVM iSCSI CFG image\n"); + ret = qedi_ops->common->nvm_get_image(qedi->cdev, + QED_NVM_IMAGE_ISCSI_CFG, + (char *)qedi->iscsi_cfg, len); + if (ret) + QEDI_ERR(&qedi->dbg_ctx, + "Could not get NVM image. ret = %d\n", ret); + + return ret; +} + +static int qedi_setup_boot_info(struct qedi_ctx *qedi) +{ + struct iscsi_boot_kobj *boot_kobj; + + if (qedi_get_boot_info(qedi)) + return -EPERM; + + qedi->boot_kset = iscsi_boot_create_host_kset(qedi->shost->host_no); + if (!qedi->boot_kset) + goto kset_free; + + if (!scsi_host_get(qedi->shost)) + goto kset_free; + + boot_kobj = iscsi_boot_create_target(qedi->boot_kset, 0, qedi, + qedi_show_boot_tgt_pri_info, + qedi_tgt_get_attr_visibility, + qedi_boot_release); + if (!boot_kobj) + goto put_host; + + if (!scsi_host_get(qedi->shost)) + goto kset_free; + + boot_kobj = iscsi_boot_create_target(qedi->boot_kset, 1, qedi, + qedi_show_boot_tgt_sec_info, + qedi_tgt_get_attr_visibility, + qedi_boot_release); + if (!boot_kobj) + goto put_host; + + if (!scsi_host_get(qedi->shost)) + goto kset_free; + + boot_kobj = iscsi_boot_create_initiator(qedi->boot_kset, 0, qedi, + qedi_show_boot_ini_info, + qedi_ini_get_attr_visibility, + qedi_boot_release); + if (!boot_kobj) + goto put_host; + + if (!scsi_host_get(qedi->shost)) + goto kset_free; + + boot_kobj = iscsi_boot_create_ethernet(qedi->boot_kset, 0, qedi, + qedi_show_boot_eth_info, + qedi_eth_get_attr_visibility, + qedi_boot_release); + if (!boot_kobj) + goto put_host; + + return 0; + +put_host: + scsi_host_put(qedi->shost); +kset_free: + iscsi_boot_destroy_kset(qedi->boot_kset); + return -ENOMEM; +} + +static void __qedi_remove(struct pci_dev *pdev, int mode) +{ + struct qedi_ctx *qedi = pci_get_drvdata(pdev); + + if (qedi->tmf_thread) { + flush_workqueue(qedi->tmf_thread); + destroy_workqueue(qedi->tmf_thread); + qedi->tmf_thread = NULL; + } + + if (qedi->offload_thread) { + flush_workqueue(qedi->offload_thread); + destroy_workqueue(qedi->offload_thread); + qedi->offload_thread = NULL; + } + +#ifdef CONFIG_DEBUG_FS + qedi_dbg_host_exit(&qedi->dbg_ctx); +#endif + if (!test_bit(QEDI_IN_OFFLINE, &qedi->flags)) + qedi_ops->common->set_power_state(qedi->cdev, PCI_D0); + + qedi_sync_free_irqs(qedi); + + if (!test_bit(QEDI_IN_OFFLINE, &qedi->flags)) { + qedi_ops->stop(qedi->cdev); + qedi_ops->ll2->stop(qedi->cdev); + } + + if (mode == QEDI_MODE_NORMAL) + qedi_free_iscsi_pf_param(qedi); + + if (!test_bit(QEDI_IN_OFFLINE, &qedi->flags)) { + qedi_ops->common->slowpath_stop(qedi->cdev); + qedi_ops->common->remove(qedi->cdev); + } + + qedi_destroy_fp(qedi); + + if (mode == QEDI_MODE_NORMAL) { + qedi_release_cid_que(qedi); + qedi_cm_free_mem(qedi); + qedi_free_uio(qedi->udev); + qedi_free_itt(qedi); + + iscsi_host_remove(qedi->shost); + iscsi_host_free(qedi->shost); + + if (qedi->ll2_recv_thread) { + kthread_stop(qedi->ll2_recv_thread); + qedi->ll2_recv_thread = NULL; + } + qedi_ll2_free_skbs(qedi); + + if (qedi->boot_kset) + iscsi_boot_destroy_kset(qedi->boot_kset); + } +} + +static int __qedi_probe(struct pci_dev *pdev, int mode) +{ + struct qedi_ctx *qedi; + struct qed_ll2_params params; + u32 dp_module = 0; + u8 dp_level = 0; + bool is_vf = false; + char host_buf[16]; + struct 
qed_link_params link_params; + struct qed_slowpath_params sp_params; + struct qed_probe_params qed_params; + void *task_start, *task_end; + int rc; + u16 tmp; + + if (mode != QEDI_MODE_RECOVERY) { + qedi = qedi_host_alloc(pdev); + if (!qedi) { + rc = -ENOMEM; + goto exit_probe; + } + } else { + qedi = pci_get_drvdata(pdev); + } + + memset(&qed_params, 0, sizeof(qed_params)); + qed_params.protocol = QED_PROTOCOL_ISCSI; + qed_params.dp_module = dp_module; + qed_params.dp_level = dp_level; + qed_params.is_vf = is_vf; + qedi->cdev = qedi_ops->common->probe(pdev, &qed_params); + if (!qedi->cdev) { + rc = -ENODEV; + QEDI_ERR(&qedi->dbg_ctx, "Cannot initialize hardware\n"); + goto free_host; + } + + atomic_set(&qedi->link_state, QEDI_LINK_DOWN); + + rc = qedi_ops->fill_dev_info(qedi->cdev, &qedi->dev_info); + if (rc) + goto free_host; + + if (mode != QEDI_MODE_RECOVERY) { + rc = qedi_set_iscsi_pf_param(qedi); + if (rc) { + rc = -ENOMEM; + QEDI_ERR(&qedi->dbg_ctx, + "Set iSCSI pf param fail\n"); + goto free_host; + } + } + + qedi_ops->common->update_pf_params(qedi->cdev, &qedi->pf_params); + + rc = qedi_prepare_fp(qedi); + if (rc) { + QEDI_ERR(&qedi->dbg_ctx, "Cannot start slowpath.\n"); + goto free_pf_params; + } + + /* Start the Slowpath-process */ + memset(&sp_params, 0, sizeof(struct qed_slowpath_params)); + sp_params.int_mode = QED_INT_MODE_MSIX; + sp_params.drv_major = QEDI_DRIVER_MAJOR_VER; + sp_params.drv_minor = QEDI_DRIVER_MINOR_VER; + sp_params.drv_rev = QEDI_DRIVER_REV_VER; + sp_params.drv_eng = QEDI_DRIVER_ENG_VER; + strlcpy(sp_params.name, "qedi iSCSI", QED_DRV_VER_STR_SIZE); + rc = qedi_ops->common->slowpath_start(qedi->cdev, &sp_params); + if (rc) { + QEDI_ERR(&qedi->dbg_ctx, "Cannot start slowpath\n"); + goto stop_hw; + } + + /* update_pf_params needs to be called before and after slowpath + * start + */ + qedi_ops->common->update_pf_params(qedi->cdev, &qedi->pf_params); + + rc = qedi_setup_int(qedi); + if (rc) + goto stop_iscsi_func; + + qedi_ops->common->set_power_state(qedi->cdev, PCI_D0); + + /* Learn information crucial for qedi to progress */ + rc = qedi_ops->fill_dev_info(qedi->cdev, &qedi->dev_info); + if (rc) + goto stop_iscsi_func; + + /* Record BDQ producer doorbell addresses */ + qedi->bdq_primary_prod = qedi->dev_info.primary_dbq_rq_addr; + qedi->bdq_secondary_prod = qedi->dev_info.secondary_bdq_rq_addr; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_DISC, + "BDQ primary_prod=%p secondary_prod=%p.\n", + qedi->bdq_primary_prod, + qedi->bdq_secondary_prod); + + /* + * We need to write the number of BDs in the BDQ we've preallocated so + * the f/w will do a prefetch and we'll get an unsolicited CQE when a + * packet arrives. 
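+ * Each doorbell write below is read back, which should flush the posted + * write to the adapter.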
+ */ + qedi->bdq_prod_idx = QEDI_BDQ_NUM; + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_DISC, + "Writing %d to primary and secondary BDQ doorbell registers.\n", + qedi->bdq_prod_idx); + writew(qedi->bdq_prod_idx, qedi->bdq_primary_prod); + tmp = readw(qedi->bdq_primary_prod); + writew(qedi->bdq_prod_idx, qedi->bdq_secondary_prod); + tmp = readw(qedi->bdq_secondary_prod); + + ether_addr_copy(qedi->mac, qedi->dev_info.common.hw_mac); + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_DISC, "MAC address is %pM.\n", + qedi->mac); + + sprintf(host_buf, "host_%d", qedi->shost->host_no); + qedi_ops->common->set_name(qedi->cdev, host_buf); + + qedi_ops->register_ops(qedi->cdev, &qedi_cb_ops, qedi); + + memset(¶ms, 0, sizeof(params)); + params.mtu = DEF_PATH_MTU + IPV6_HDR_LEN + TCP_HDR_LEN; + qedi->ll2_mtu = DEF_PATH_MTU; + params.drop_ttl0_packets = 0; + params.rx_vlan_stripping = 1; + ether_addr_copy(params.ll2_mac_address, qedi->dev_info.common.hw_mac); + + if (mode != QEDI_MODE_RECOVERY) { + /* set up rx path */ + INIT_LIST_HEAD(&qedi->ll2_skb_list); + spin_lock_init(&qedi->ll2_lock); + /* start qedi context */ + spin_lock_init(&qedi->hba_lock); + spin_lock_init(&qedi->task_idx_lock); + } + qedi_ops->ll2->register_cb_ops(qedi->cdev, &qedi_ll2_cb_ops, qedi); + qedi_ops->ll2->start(qedi->cdev, ¶ms); + + if (mode != QEDI_MODE_RECOVERY) { + qedi->ll2_recv_thread = kthread_run(qedi_ll2_recv_thread, + (void *)qedi, + "qedi_ll2_thread"); + } + + rc = qedi_ops->start(qedi->cdev, &qedi->tasks, + qedi, qedi_iscsi_event_cb); + if (rc) { + rc = -ENODEV; + QEDI_ERR(&qedi->dbg_ctx, "Cannot start iSCSI function\n"); + goto stop_slowpath; + } + + task_start = qedi_get_task_mem(&qedi->tasks, 0); + task_end = qedi_get_task_mem(&qedi->tasks, MAX_TID_BLOCKS_ISCSI - 1); + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_DISC, + "Task context start=%p, end=%p block_size=%u.\n", + task_start, task_end, qedi->tasks.size); + + memset(&link_params, 0, sizeof(link_params)); + link_params.link_up = true; + rc = qedi_ops->common->set_link(qedi->cdev, &link_params); + if (rc) { + QEDI_WARN(&qedi->dbg_ctx, "Link set up failed.\n"); + atomic_set(&qedi->link_state, QEDI_LINK_DOWN); + } + +#ifdef CONFIG_DEBUG_FS + qedi_dbg_host_init(&qedi->dbg_ctx, qedi_debugfs_ops, + qedi_dbg_fops); +#endif + QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, + "QLogic FastLinQ iSCSI Module qedi %s, FW %d.%d.%d.%d\n", + QEDI_MODULE_VERSION, FW_MAJOR_VERSION, FW_MINOR_VERSION, + FW_REVISION_VERSION, FW_ENGINEERING_VERSION); + + if (mode == QEDI_MODE_NORMAL) { + if (iscsi_host_add(qedi->shost, &pdev->dev)) { + QEDI_ERR(&qedi->dbg_ctx, + "Could not add iscsi host\n"); + rc = -ENOMEM; + goto remove_host; + } + + /* Allocate uio buffers */ + rc = qedi_alloc_uio_rings(qedi); + if (rc) { + QEDI_ERR(&qedi->dbg_ctx, + "UIO alloc ring failed err=%d\n", rc); + goto remove_host; + } + + rc = qedi_init_uio(qedi); + if (rc) { + QEDI_ERR(&qedi->dbg_ctx, + "UIO init failed, err=%d\n", rc); + goto free_uio; + } + + /* host the array on iscsi_conn */ + rc = qedi_setup_cid_que(qedi); + if (rc) { + QEDI_ERR(&qedi->dbg_ctx, + "Could not setup cid que\n"); + goto free_uio; + } + + rc = qedi_cm_alloc_mem(qedi); + if (rc) { + QEDI_ERR(&qedi->dbg_ctx, + "Could not alloc cm memory\n"); + goto free_cid_que; + } + + rc = qedi_alloc_itt(qedi); + if (rc) { + QEDI_ERR(&qedi->dbg_ctx, + "Could not alloc itt memory\n"); + goto free_cid_que; + } + + sprintf(host_buf, "host_%d", qedi->shost->host_no); + qedi->tmf_thread = create_singlethread_workqueue(host_buf); + if (!qedi->tmf_thread) { + QEDI_ERR(&qedi->dbg_ctx, + 
"Unable to start tmf thread!\n"); + rc = -ENODEV; + goto free_cid_que; + } + + sprintf(host_buf, "qedi_ofld%d", qedi->shost->host_no); + qedi->offload_thread = create_workqueue(host_buf); + if (!qedi->offload_thread) { + QEDI_ERR(&qedi->dbg_ctx, + "Unable to start offload thread!\n"); + rc = -ENODEV; + goto free_cid_que; + } + + /* F/w needs 1st task context memory entry for performance */ + set_bit(QEDI_RESERVE_TASK_ID, qedi->task_idx_map); + atomic_set(&qedi->num_offloads, 0); + + if (qedi_setup_boot_info(qedi)) + QEDI_ERR(&qedi->dbg_ctx, + "No iSCSI boot target configured\n"); + } + + return 0; + +free_cid_que: + qedi_release_cid_que(qedi); +free_uio: + qedi_free_uio(qedi->udev); +remove_host: +#ifdef CONFIG_DEBUG_FS + qedi_dbg_host_exit(&qedi->dbg_ctx); +#endif + iscsi_host_remove(qedi->shost); +stop_iscsi_func: + qedi_ops->stop(qedi->cdev); +stop_slowpath: + qedi_ops->common->slowpath_stop(qedi->cdev); +stop_hw: + qedi_ops->common->remove(qedi->cdev); +free_pf_params: + qedi_free_iscsi_pf_param(qedi); +free_host: + iscsi_host_free(qedi->shost); +exit_probe: + return rc; +} + +static int qedi_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + return __qedi_probe(pdev, QEDI_MODE_NORMAL); +} + +static void qedi_remove(struct pci_dev *pdev) +{ + __qedi_remove(pdev, QEDI_MODE_NORMAL); +} + +static struct pci_device_id qedi_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, 0x165E) }, + { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, 0x8084) }, + { 0 }, +}; +MODULE_DEVICE_TABLE(pci, qedi_pci_tbl); + +static enum cpuhp_state qedi_cpuhp_state; + +static struct pci_driver qedi_pci_driver = { + .name = QEDI_MODULE_NAME, + .id_table = qedi_pci_tbl, + .probe = qedi_probe, + .remove = qedi_remove, +}; + +static int __init qedi_init(void) +{ + struct qedi_percpu_s *p; + int cpu, rc = 0; + + qedi_ops = qed_get_iscsi_ops(); + if (!qedi_ops) { + QEDI_ERR(NULL, "Failed to get qed iSCSI operations\n"); + return -EINVAL; + } + +#ifdef CONFIG_DEBUG_FS + qedi_dbg_init("qedi"); +#endif + + qedi_scsi_transport = iscsi_register_transport(&qedi_iscsi_transport); + if (!qedi_scsi_transport) { + QEDI_ERR(NULL, "Could not register qedi transport"); + rc = -ENOMEM; + goto exit_qedi_init_1; + } + + for_each_possible_cpu(cpu) { + p = &per_cpu(qedi_percpu, cpu); + INIT_LIST_HEAD(&p->work_list); + spin_lock_init(&p->p_work_lock); + p->iothread = NULL; + } + + rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "scsi/qedi:online", + qedi_cpu_online, qedi_cpu_offline); + if (rc < 0) + goto exit_qedi_init_2; + qedi_cpuhp_state = rc; + + rc = pci_register_driver(&qedi_pci_driver); + if (rc) { + QEDI_ERR(NULL, "Failed to register driver\n"); + goto exit_qedi_hp; + } + + return 0; + +exit_qedi_hp: + cpuhp_remove_state(qedi_cpuhp_state); +exit_qedi_init_2: + iscsi_unregister_transport(&qedi_iscsi_transport); +exit_qedi_init_1: +#ifdef CONFIG_DEBUG_FS + qedi_dbg_exit(); +#endif + qed_put_iscsi_ops(); + return rc; +} + +static void __exit qedi_cleanup(void) +{ + pci_unregister_driver(&qedi_pci_driver); + cpuhp_remove_state(qedi_cpuhp_state); + iscsi_unregister_transport(&qedi_iscsi_transport); + +#ifdef CONFIG_DEBUG_FS + qedi_dbg_exit(); +#endif + qed_put_iscsi_ops(); +} + +MODULE_DESCRIPTION("QLogic FastLinQ 4xxxx iSCSI Module"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("QLogic Corporation"); +MODULE_VERSION(QEDI_MODULE_VERSION); +module_init(qedi_init); +module_exit(qedi_cleanup); diff --git a/drivers/scsi/qedi/qedi_nvm_iscsi_cfg.h b/drivers/scsi/qedi/qedi_nvm_iscsi_cfg.h new file mode 100644 index 0000000..df39b69 --- /dev/null +++ 
b/drivers/scsi/qedi/qedi_nvm_iscsi_cfg.h @@ -0,0 +1,210 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef NVM_ISCSI_CFG_H +#define NVM_ISCSI_CFG_H + +#define NUM_OF_ISCSI_TARGET_PER_PF 4 /* Defined as per the + * ISCSI IBFT constraint + */ +#define NUM_OF_ISCSI_PF_SUPPORTED 4 /* One PF per Port - + * assuming 4 port card + */ + +#define NVM_ISCSI_CFG_DHCP_NAME_MAX_LEN 256 + +union nvm_iscsi_dhcp_vendor_id { + u32 value[NVM_ISCSI_CFG_DHCP_NAME_MAX_LEN / 4]; + u8 byte[NVM_ISCSI_CFG_DHCP_NAME_MAX_LEN]; +}; + +#define NVM_ISCSI_IPV4_ADDR_BYTE_LEN 4 +union nvm_iscsi_ipv4_addr { + u32 addr; + u8 byte[NVM_ISCSI_IPV4_ADDR_BYTE_LEN]; +}; + +#define NVM_ISCSI_IPV6_ADDR_BYTE_LEN 16 +union nvm_iscsi_ipv6_addr { + u32 addr[4]; + u8 byte[NVM_ISCSI_IPV6_ADDR_BYTE_LEN]; +}; + +struct nvm_iscsi_initiator_ipv4 { + union nvm_iscsi_ipv4_addr addr; /* 0x0 */ + union nvm_iscsi_ipv4_addr subnet_mask; /* 0x4 */ + union nvm_iscsi_ipv4_addr gateway; /* 0x8 */ + union nvm_iscsi_ipv4_addr primary_dns; /* 0xC */ + union nvm_iscsi_ipv4_addr secondary_dns; /* 0x10 */ + union nvm_iscsi_ipv4_addr dhcp_addr; /* 0x14 */ + + union nvm_iscsi_ipv4_addr isns_server; /* 0x18 */ + union nvm_iscsi_ipv4_addr slp_server; /* 0x1C */ + union nvm_iscsi_ipv4_addr primay_radius_server; /* 0x20 */ + union nvm_iscsi_ipv4_addr secondary_radius_server; /* 0x24 */ + + union nvm_iscsi_ipv4_addr rsvd[4]; /* 0x28 */ +}; + +struct nvm_iscsi_initiator_ipv6 { + union nvm_iscsi_ipv6_addr addr; /* 0x0 */ + union nvm_iscsi_ipv6_addr subnet_mask; /* 0x10 */ + union nvm_iscsi_ipv6_addr gateway; /* 0x20 */ + union nvm_iscsi_ipv6_addr primary_dns; /* 0x30 */ + union nvm_iscsi_ipv6_addr secondary_dns; /* 0x40 */ + union nvm_iscsi_ipv6_addr dhcp_addr; /* 0x50 */ + + union nvm_iscsi_ipv6_addr isns_server; /* 0x60 */ + union nvm_iscsi_ipv6_addr slp_server; /* 0x70 */ + union nvm_iscsi_ipv6_addr primay_radius_server; /* 0x80 */ + union nvm_iscsi_ipv6_addr secondary_radius_server; /* 0x90 */ + + union nvm_iscsi_ipv6_addr rsvd[3]; /* 0xA0 */ + + u32 config; /* 0xD0 */ +#define NVM_ISCSI_CFG_INITIATOR_IPV6_SUBNET_MASK_PREFIX_MASK 0x000000FF +#define NVM_ISCSI_CFG_INITIATOR_IPV6_SUBNET_MASK_PREFIX_OFFSET 0 + + u32 rsvd_1[3]; +}; + +#define NVM_ISCSI_CFG_ISCSI_NAME_MAX_LEN 256 +union nvm_iscsi_name { + u32 value[NVM_ISCSI_CFG_ISCSI_NAME_MAX_LEN / 4]; + u8 byte[NVM_ISCSI_CFG_ISCSI_NAME_MAX_LEN]; +}; + +#define NVM_ISCSI_CFG_CHAP_NAME_MAX_LEN 256 +union nvm_iscsi_chap_name { + u32 value[NVM_ISCSI_CFG_CHAP_NAME_MAX_LEN / 4]; + u8 byte[NVM_ISCSI_CFG_CHAP_NAME_MAX_LEN]; +}; + +#define NVM_ISCSI_CFG_CHAP_PWD_MAX_LEN 16 /* md5 need per RFC1996 + * is 16 octets + */ +union nvm_iscsi_chap_password { + u32 value[NVM_ISCSI_CFG_CHAP_PWD_MAX_LEN / 4]; + u8 byte[NVM_ISCSI_CFG_CHAP_PWD_MAX_LEN]; +}; + +union nvm_iscsi_lun { + u8 byte[8]; + u32 value[2]; +}; + +struct nvm_iscsi_generic { + u32 ctrl_flags; /* 0x0 */ +#define NVM_ISCSI_CFG_GEN_CHAP_ENABLED BIT(0) +#define NVM_ISCSI_CFG_GEN_DHCP_TCPIP_CONFIG_ENABLED BIT(1) +#define NVM_ISCSI_CFG_GEN_DHCP_ISCSI_CONFIG_ENABLED BIT(2) +#define NVM_ISCSI_CFG_GEN_IPV6_ENABLED BIT(3) +#define NVM_ISCSI_CFG_GEN_IPV4_FALLBACK_ENABLED BIT(4) +#define NVM_ISCSI_CFG_GEN_ISNS_WORLD_LOGIN BIT(5) +#define NVM_ISCSI_CFG_GEN_ISNS_SELECTIVE_LOGIN BIT(6) +#define NVM_ISCSI_CFG_GEN_ADDR_REDIRECT_ENABLED BIT(7) +#define 
NVM_ISCSI_CFG_GEN_CHAP_MUTUAL_ENABLED BIT(8) + + u32 timeout; /* 0x4 */ +#define NVM_ISCSI_CFG_GEN_DHCP_REQUEST_TIMEOUT_MASK 0x0000FFFF +#define NVM_ISCSI_CFG_GEN_DHCP_REQUEST_TIMEOUT_OFFSET 0 +#define NVM_ISCSI_CFG_GEN_PORT_LOGIN_TIMEOUT_MASK 0xFFFF0000 +#define NVM_ISCSI_CFG_GEN_PORT_LOGIN_TIMEOUT_OFFSET 16 + + union nvm_iscsi_dhcp_vendor_id dhcp_vendor_id; /* 0x8 */ + u32 rsvd[62]; /* 0x108 */ +}; + +struct nvm_iscsi_initiator { + struct nvm_iscsi_initiator_ipv4 ipv4; /* 0x0 */ + struct nvm_iscsi_initiator_ipv6 ipv6; /* 0x38 */ + + union nvm_iscsi_name initiator_name; /* 0x118 */ + union nvm_iscsi_chap_name chap_name; /* 0x218 */ + union nvm_iscsi_chap_password chap_password; /* 0x318 */ + + u32 generic_cont0; /* 0x398 */ +#define NVM_ISCSI_CFG_INITIATOR_VLAN_MASK 0x0000FFFF +#define NVM_ISCSI_CFG_INITIATOR_VLAN_OFFSET 0 +#define NVM_ISCSI_CFG_INITIATOR_IP_VERSION_MASK 0x00030000 +#define NVM_ISCSI_CFG_INITIATOR_IP_VERSION_OFFSET 16 +#define NVM_ISCSI_CFG_INITIATOR_IP_VERSION_4 1 +#define NVM_ISCSI_CFG_INITIATOR_IP_VERSION_6 2 +#define NVM_ISCSI_CFG_INITIATOR_IP_VERSION_4_AND_6 3 + + u32 ctrl_flags; +#define NVM_ISCSI_CFG_INITIATOR_IP_VERSION_PRIORITY_V6 BIT(0) +#define NVM_ISCSI_CFG_INITIATOR_VLAN_ENABLED BIT(1) + + u32 rsvd[116]; /* 0x32C */ +}; + +struct nvm_iscsi_target { + u32 ctrl_flags; /* 0x0 */ +#define NVM_ISCSI_CFG_TARGET_ENABLED BIT(0) +#define NVM_ISCSI_CFG_BOOT_TIME_LOGIN_STATUS BIT(1) + + u32 generic_cont0; /* 0x4 */ +#define NVM_ISCSI_CFG_TARGET_TCP_PORT_MASK 0x0000FFFF +#define NVM_ISCSI_CFG_TARGET_TCP_PORT_OFFSET 0 + + u32 ip_ver; +#define NVM_ISCSI_CFG_IPv4 4 +#define NVM_ISCSI_CFG_IPv6 6 + + u32 rsvd_1[7]; /* 0x24 */ + union nvm_iscsi_ipv4_addr ipv4_addr; /* 0x28 */ + union nvm_iscsi_ipv6_addr ipv6_addr; /* 0x2C */ + union nvm_iscsi_lun lun; /* 0x3C */ + + union nvm_iscsi_name target_name; /* 0x44 */ + union nvm_iscsi_chap_name chap_name; /* 0x144 */ + union nvm_iscsi_chap_password chap_password; /* 0x244 */ + + u32 rsvd_2[107]; /* 0x2C4 */ +}; + +struct nvm_iscsi_block { + u32 id; /* 0x0 */ +#define NVM_ISCSI_CFG_BLK_MAPPED_PF_ID_MASK 0x0000000F +#define NVM_ISCSI_CFG_BLK_MAPPED_PF_ID_OFFSET 0 +#define NVM_ISCSI_CFG_BLK_CTRL_FLAG_MASK 0x00000FF0 +#define NVM_ISCSI_CFG_BLK_CTRL_FLAG_OFFSET 4 +#define NVM_ISCSI_CFG_BLK_CTRL_FLAG_IS_NOT_EMPTY BIT(0) +#define NVM_ISCSI_CFG_BLK_CTRL_FLAG_PF_MAPPED BIT(1) + + u32 rsvd_1[5]; /* 0x4 */ + + struct nvm_iscsi_generic generic; /* 0x18 */ + struct nvm_iscsi_initiator initiator; /* 0x218 */ + struct nvm_iscsi_target target[NUM_OF_ISCSI_TARGET_PER_PF]; + /* 0x718 */ + + u32 rsvd_2[58]; /* 0x1718 */ + /* total size - 0x1800 - 6K block */ +}; + +struct nvm_iscsi_cfg { + u32 id; /* 0x0 */ +#define NVM_ISCSI_CFG_BLK_VERSION_MINOR_MASK 0x000000FF +#define NVM_ISCSI_CFG_BLK_VERSION_MAJOR_MASK 0x0000FF00 +#define NVM_ISCSI_CFG_BLK_SIGNATURE_MASK 0xFFFF0000 +#define NVM_ISCSI_CFG_BLK_SIGNATURE 0x49430000 /* IC - Iscsi + * Config + */ + +#define NVM_ISCSI_CFG_BLK_VERSION_MAJOR 0 +#define NVM_ISCSI_CFG_BLK_VERSION_MINOR 10 +#define NVM_ISCSI_CFG_BLK_VERSION ((NVM_ISCSI_CFG_BLK_VERSION_MAJOR << 8) | \ + NVM_ISCSI_CFG_BLK_VERSION_MINOR) + + struct nvm_iscsi_block block[NUM_OF_ISCSI_PF_SUPPORTED]; /* 0x4 */ +}; + +#endif diff --git a/drivers/scsi/qedi/qedi_sysfs.c b/drivers/scsi/qedi/qedi_sysfs.c new file mode 100644 index 0000000..b10c48b --- /dev/null +++ b/drivers/scsi/qedi/qedi_sysfs.c @@ -0,0 +1,52 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. 
+ * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#include "qedi.h" +#include "qedi_gbl.h" +#include "qedi_iscsi.h" +#include "qedi_dbg.h" + +static inline struct qedi_ctx *qedi_dev_to_hba(struct device *dev) +{ + struct Scsi_Host *shost = class_to_shost(dev); + + return iscsi_host_priv(shost); +} + +static ssize_t qedi_show_port_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct qedi_ctx *qedi = qedi_dev_to_hba(dev); + + if (atomic_read(&qedi->link_state) == QEDI_LINK_UP) + return sprintf(buf, "Online\n"); + else + return sprintf(buf, "Linkdown\n"); +} + +static ssize_t qedi_show_speed(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct qedi_ctx *qedi = qedi_dev_to_hba(dev); + struct qed_link_output if_link; + + qedi_ops->common->get_link(qedi->cdev, &if_link); + + return sprintf(buf, "%d Gbit\n", if_link.speed / 1000); +} + +static DEVICE_ATTR(port_state, 0444, qedi_show_port_state, NULL); +static DEVICE_ATTR(speed, 0444, qedi_show_speed, NULL); + +struct device_attribute *qedi_shost_attrs[] = { + &dev_attr_port_state, + &dev_attr_speed, + NULL +}; diff --git a/drivers/scsi/qedi/qedi_version.h b/drivers/scsi/qedi/qedi_version.h new file mode 100644 index 0000000..8a0e523 --- /dev/null +++ b/drivers/scsi/qedi/qedi_version.h @@ -0,0 +1,14 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#define QEDI_MODULE_VERSION "8.33.0.20" +#define QEDI_DRIVER_MAJOR_VER 8 +#define QEDI_DRIVER_MINOR_VER 33 +#define QEDI_DRIVER_REV_VER 0 +#define QEDI_DRIVER_ENG_VER 20 diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h new file mode 100644 index 0000000..99f1db5 --- /dev/null +++ b/drivers/scsi/scsi_priv.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _SCSI_PRIV_H +#define _SCSI_PRIV_H + +#include +#include +#include + +struct request_queue; +struct request; +struct scsi_cmnd; +struct scsi_device; +struct scsi_target; +struct scsi_host_template; +struct Scsi_Host; +struct scsi_nl_hdr; + + +/* + * Scsi Error Handler Flags + */ +#define SCSI_EH_ABORT_SCHEDULED 0x0002 /* Abort has been scheduled */ + +#define SCSI_SENSE_VALID(scmd) \ + (((scmd)->sense_buffer[0] & 0x70) == 0x70) + +/* hosts.c */ +extern int scsi_init_hosts(void); +extern void scsi_exit_hosts(void); + +/* scsi.c */ +extern bool scsi_use_blk_mq; +int scsi_init_sense_cache(struct Scsi_Host *shost); +void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd); +#ifdef CONFIG_SCSI_LOGGING +void scsi_log_send(struct scsi_cmnd *cmd); +void scsi_log_completion(struct scsi_cmnd *cmd, int disposition); +#else +static inline void scsi_log_send(struct scsi_cmnd *cmd) + { }; +static inline void scsi_log_completion(struct scsi_cmnd *cmd, int disposition) + { }; +#endif + +/* scsi_devinfo.c */ + +/* list of keys for the lists */ +enum scsi_devinfo_key { + SCSI_DEVINFO_GLOBAL = 0, + SCSI_DEVINFO_SPI, +}; + +extern blist_flags_t scsi_get_device_flags(struct scsi_device *sdev, + const unsigned char *vendor, + const unsigned char *model); +extern blist_flags_t scsi_get_device_flags_keyed(struct scsi_device *sdev, + const unsigned char *vendor, + const unsigned char *model, + enum scsi_devinfo_key key); 
+extern int scsi_dev_info_list_add_keyed(int compatible, char *vendor, + char *model, char *strflags, + blist_flags_t flags, + enum scsi_devinfo_key key); +extern int scsi_dev_info_list_del_keyed(char *vendor, char *model, + enum scsi_devinfo_key key); +extern int scsi_dev_info_add_list(enum scsi_devinfo_key key, const char *name); +extern int scsi_dev_info_remove_list(enum scsi_devinfo_key key); + +extern int __init scsi_init_devinfo(void); +extern void scsi_exit_devinfo(void); + +/* scsi_error.c */ +extern void scmd_eh_abort_handler(struct work_struct *work); +extern enum blk_eh_timer_return scsi_times_out(struct request *req); +extern int scsi_error_handler(void *host); +extern int scsi_decide_disposition(struct scsi_cmnd *cmd); +extern void scsi_eh_wakeup(struct Scsi_Host *shost); +extern void scsi_eh_scmd_add(struct scsi_cmnd *); +void scsi_eh_ready_devs(struct Scsi_Host *shost, + struct list_head *work_q, + struct list_head *done_q); +int scsi_eh_get_sense(struct list_head *work_q, + struct list_head *done_q); +int scsi_noretry_cmd(struct scsi_cmnd *scmd); + +/* scsi_lib.c */ +extern void scsi_add_cmd_to_list(struct scsi_cmnd *cmd); +extern void scsi_del_cmd_from_list(struct scsi_cmnd *cmd); +extern int scsi_maybe_unblock_host(struct scsi_device *sdev); +extern void scsi_device_unbusy(struct scsi_device *sdev); +extern void scsi_queue_insert(struct scsi_cmnd *cmd, int reason); +extern void scsi_io_completion(struct scsi_cmnd *, unsigned int); +extern void scsi_run_host_queues(struct Scsi_Host *shost); +extern void scsi_requeue_run_queue(struct work_struct *work); +extern struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev); +extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev); +extern void scsi_start_queue(struct scsi_device *sdev); +extern int scsi_mq_setup_tags(struct Scsi_Host *shost); +extern void scsi_mq_destroy_tags(struct Scsi_Host *shost); +extern int scsi_init_queue(void); +extern void scsi_exit_queue(void); +extern void scsi_evt_thread(struct work_struct *work); +struct request_queue; +struct request; + +/* scsi_proc.c */ +#ifdef CONFIG_SCSI_PROC_FS +extern void scsi_proc_hostdir_add(struct scsi_host_template *); +extern void scsi_proc_hostdir_rm(struct scsi_host_template *); +extern void scsi_proc_host_add(struct Scsi_Host *); +extern void scsi_proc_host_rm(struct Scsi_Host *); +extern int scsi_init_procfs(void); +extern void scsi_exit_procfs(void); +#else +# define scsi_proc_hostdir_add(sht) do { } while (0) +# define scsi_proc_hostdir_rm(sht) do { } while (0) +# define scsi_proc_host_add(shost) do { } while (0) +# define scsi_proc_host_rm(shost) do { } while (0) +# define scsi_init_procfs() (0) +# define scsi_exit_procfs() do { } while (0) +#endif /* CONFIG_PROC_FS */ + +/* scsi_scan.c */ +extern char scsi_scan_type[]; +extern int scsi_complete_async_scans(void); +extern int scsi_scan_host_selected(struct Scsi_Host *, unsigned int, + unsigned int, u64, enum scsi_scan_mode); +extern void scsi_forget_host(struct Scsi_Host *); +extern void scsi_rescan_device(struct device *); + +/* scsi_sysctl.c */ +#ifdef CONFIG_SYSCTL +extern int scsi_init_sysctl(void); +extern void scsi_exit_sysctl(void); +#else +# define scsi_init_sysctl() (0) +# define scsi_exit_sysctl() do { } while (0) +#endif /* CONFIG_SYSCTL */ + +/* scsi_sysfs.c */ +extern int scsi_sysfs_add_sdev(struct scsi_device *); +extern int scsi_sysfs_add_host(struct Scsi_Host *); +extern int scsi_sysfs_register(void); +extern void scsi_sysfs_unregister(void); +extern void 
scsi_sysfs_device_initialize(struct scsi_device *); +extern int scsi_sysfs_target_initialize(struct scsi_device *); +extern struct scsi_transport_template blank_transport_template; +extern void __scsi_remove_device(struct scsi_device *); + +extern struct bus_type scsi_bus_type; +extern const struct attribute_group *scsi_sysfs_shost_attr_groups[]; + +/* scsi_netlink.c */ +#ifdef CONFIG_SCSI_NETLINK +extern void scsi_netlink_init(void); +extern void scsi_netlink_exit(void); +extern struct sock *scsi_nl_sock; +#else +static inline void scsi_netlink_init(void) {} +static inline void scsi_netlink_exit(void) {} +#endif + +/* scsi_pm.c */ +#ifdef CONFIG_PM +extern const struct dev_pm_ops scsi_bus_pm_ops; + +extern void scsi_autopm_get_target(struct scsi_target *); +extern void scsi_autopm_put_target(struct scsi_target *); +extern int scsi_autopm_get_host(struct Scsi_Host *); +extern void scsi_autopm_put_host(struct Scsi_Host *); +#else +static inline void scsi_autopm_get_target(struct scsi_target *t) {} +static inline void scsi_autopm_put_target(struct scsi_target *t) {} +static inline int scsi_autopm_get_host(struct Scsi_Host *h) { return 0; } +static inline void scsi_autopm_put_host(struct Scsi_Host *h) {} +#endif /* CONFIG_PM */ + +extern struct async_domain scsi_sd_pm_domain; +extern struct async_domain scsi_sd_probe_domain; + +/* scsi_dh.c */ +#ifdef CONFIG_SCSI_DH +void scsi_dh_add_device(struct scsi_device *sdev); +void scsi_dh_release_device(struct scsi_device *sdev); +#else +static inline void scsi_dh_add_device(struct scsi_device *sdev) { } +static inline void scsi_dh_release_device(struct scsi_device *sdev) { } +#endif + +/* + * internal scsi timeout functions: for use by mid-layer and transport + * classes. + */ + +#define SCSI_DEVICE_BLOCK_MAX_TIMEOUT 600 /* units in seconds */ + +#endif /* _SCSI_PRIV_H */ diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c new file mode 100644 index 0000000..456ce9f --- /dev/null +++ b/drivers/scsi/scsi_transport_srp.c @@ -0,0 +1,909 @@ +/* + * SCSI RDMA (SRP) transport class + * + * Copyright (C) 2007 FUJITA Tomonori + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "scsi_priv.h" + +struct srp_host_attrs { + atomic_t next_port_id; +}; +#define to_srp_host_attrs(host) ((struct srp_host_attrs *)(host)->shost_data) + +#define SRP_HOST_ATTRS 0 +#define SRP_RPORT_ATTRS 8 + +struct srp_internal { + struct scsi_transport_template t; + struct srp_function_template *f; + + struct device_attribute *host_attrs[SRP_HOST_ATTRS + 1]; + + struct device_attribute *rport_attrs[SRP_RPORT_ATTRS + 1]; + struct transport_container rport_attr_cont; +}; + +static int scsi_is_srp_rport(const struct device *dev); + +#define to_srp_internal(tmpl) container_of(tmpl, struct srp_internal, t) + +#define dev_to_rport(d) container_of(d, struct srp_rport, dev) +#define transport_class_to_srp_rport(dev) dev_to_rport((dev)->parent) +static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r) +{ + return dev_to_shost(r->dev.parent); +} + +static int find_child_rport(struct device *dev, void *data) +{ + struct device **child = data; + + if (scsi_is_srp_rport(dev)) { + WARN_ON_ONCE(*child); + *child = dev; + } + return 0; +} + +static inline struct srp_rport *shost_to_rport(struct Scsi_Host *shost) +{ + struct device *child = NULL; + + WARN_ON_ONCE(device_for_each_child(&shost->shost_gendev, &child, + find_child_rport) < 0); + return child ? dev_to_rport(child) : NULL; +} + +/** + * srp_tmo_valid() - check timeout combination validity + * @reconnect_delay: Reconnect delay in seconds. + * @fast_io_fail_tmo: Fast I/O fail timeout in seconds. + * @dev_loss_tmo: Device loss timeout in seconds. + * + * The combination of the timeout parameters must be such that SCSI commands + * are finished in a reasonable time. Hence do not allow the fast I/O fail + * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT nor allow dev_loss_tmo to + * exceed that limit if failing I/O fast has been disabled. Furthermore, these + * parameters must be such that multipath can detect failed paths timely. + * Hence do not allow all three parameters to be disabled simultaneously. 
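As an illustrative aside (not part of the patch): a minimal sketch of an LLD validating a timeout combination with srp_tmo_valid() before applying it. The values are examples only; the limits come from the checks in the function below and from SCSI_DEVICE_BLOCK_MAX_TIMEOUT (600 seconds) in scsi_priv.h.

static int example_set_rport_tmo(struct srp_rport *rport)
{
	int reconnect_delay = 10;	/* seconds; a value of 0 is rejected */
	int fast_io_fail_tmo = 15;	/* must not exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT */
	long dev_loss_tmo = 600;	/* must be larger than fast_io_fail_tmo when both are set */
	int ret;

	ret = srp_tmo_valid(reconnect_delay, fast_io_fail_tmo, dev_loss_tmo);
	if (ret)
		return ret;		/* -EINVAL for an inconsistent combination */

	rport->reconnect_delay = reconnect_delay;
	rport->fast_io_fail_tmo = fast_io_fail_tmo;
	rport->dev_loss_tmo = dev_loss_tmo;
	return 0;
}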
+ */ +int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, long dev_loss_tmo) +{ + if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0) + return -EINVAL; + if (reconnect_delay == 0) + return -EINVAL; + if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + return -EINVAL; + if (fast_io_fail_tmo < 0 && + dev_loss_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + return -EINVAL; + if (dev_loss_tmo >= LONG_MAX / HZ) + return -EINVAL; + if (fast_io_fail_tmo >= 0 && dev_loss_tmo >= 0 && + fast_io_fail_tmo >= dev_loss_tmo) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL_GPL(srp_tmo_valid); + +static int srp_host_setup(struct transport_container *tc, struct device *dev, + struct device *cdev) +{ + struct Scsi_Host *shost = dev_to_shost(dev); + struct srp_host_attrs *srp_host = to_srp_host_attrs(shost); + + atomic_set(&srp_host->next_port_id, 0); + return 0; +} + +static DECLARE_TRANSPORT_CLASS(srp_host_class, "srp_host", srp_host_setup, + NULL, NULL); + +static DECLARE_TRANSPORT_CLASS(srp_rport_class, "srp_remote_ports", + NULL, NULL, NULL); + +static ssize_t +show_srp_rport_id(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + return sprintf(buf, "%16phC\n", rport->port_id); +} + +static DEVICE_ATTR(port_id, S_IRUGO, show_srp_rport_id, NULL); + +static const struct { + u32 value; + char *name; +} srp_rport_role_names[] = { + {SRP_RPORT_ROLE_INITIATOR, "SRP Initiator"}, + {SRP_RPORT_ROLE_TARGET, "SRP Target"}, +}; + +static ssize_t +show_srp_rport_roles(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int i; + char *name = NULL; + + for (i = 0; i < ARRAY_SIZE(srp_rport_role_names); i++) + if (srp_rport_role_names[i].value == rport->roles) { + name = srp_rport_role_names[i].name; + break; + } + return sprintf(buf, "%s\n", name ? : "unknown"); +} + +static DEVICE_ATTR(roles, S_IRUGO, show_srp_rport_roles, NULL); + +static ssize_t store_srp_rport_delete(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + struct Scsi_Host *shost = dev_to_shost(dev); + struct srp_internal *i = to_srp_internal(shost->transportt); + + if (i->f->rport_delete) { + i->f->rport_delete(rport); + return count; + } else { + return -ENOSYS; + } +} + +static DEVICE_ATTR(delete, S_IWUSR, NULL, store_srp_rport_delete); + +static ssize_t show_srp_rport_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + static const char *const state_name[] = { + [SRP_RPORT_RUNNING] = "running", + [SRP_RPORT_BLOCKED] = "blocked", + [SRP_RPORT_FAIL_FAST] = "fail-fast", + [SRP_RPORT_LOST] = "lost", + }; + struct srp_rport *rport = transport_class_to_srp_rport(dev); + enum srp_rport_state state = rport->state; + + return sprintf(buf, "%s\n", + (unsigned)state < ARRAY_SIZE(state_name) ? + state_name[state] : "???"); +} + +static DEVICE_ATTR(state, S_IRUGO, show_srp_rport_state, NULL); + +static ssize_t srp_show_tmo(char *buf, int tmo) +{ + return tmo >= 0 ? 
sprintf(buf, "%d\n", tmo) : sprintf(buf, "off\n"); +} + +int srp_parse_tmo(int *tmo, const char *buf) +{ + int res = 0; + + if (strncmp(buf, "off", 3) != 0) + res = kstrtoint(buf, 0, tmo); + else + *tmo = -1; + + return res; +} +EXPORT_SYMBOL(srp_parse_tmo); + +static ssize_t show_reconnect_delay(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->reconnect_delay); +} + +static ssize_t store_reconnect_delay(struct device *dev, + struct device_attribute *attr, + const char *buf, const size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res, delay; + + res = srp_parse_tmo(&delay, buf); + if (res) + goto out; + res = srp_tmo_valid(delay, rport->fast_io_fail_tmo, + rport->dev_loss_tmo); + if (res) + goto out; + + if (rport->reconnect_delay <= 0 && delay > 0 && + rport->state != SRP_RPORT_RUNNING) { + queue_delayed_work(system_long_wq, &rport->reconnect_work, + delay * HZ); + } else if (delay <= 0) { + cancel_delayed_work(&rport->reconnect_work); + } + rport->reconnect_delay = delay; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, show_reconnect_delay, + store_reconnect_delay); + +static ssize_t show_failed_reconnects(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return sprintf(buf, "%d\n", rport->failed_reconnects); +} + +static DEVICE_ATTR(failed_reconnects, S_IRUGO, show_failed_reconnects, NULL); + +static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->fast_io_fail_tmo); +} + +static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res; + int fast_io_fail_tmo; + + res = srp_parse_tmo(&fast_io_fail_tmo, buf); + if (res) + goto out; + res = srp_tmo_valid(rport->reconnect_delay, fast_io_fail_tmo, + rport->dev_loss_tmo); + if (res) + goto out; + rport->fast_io_fail_tmo = fast_io_fail_tmo; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, + show_srp_rport_fast_io_fail_tmo, + store_srp_rport_fast_io_fail_tmo); + +static ssize_t show_srp_rport_dev_loss_tmo(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->dev_loss_tmo); +} + +static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res; + int dev_loss_tmo; + + res = srp_parse_tmo(&dev_loss_tmo, buf); + if (res) + goto out; + res = srp_tmo_valid(rport->reconnect_delay, rport->fast_io_fail_tmo, + dev_loss_tmo); + if (res) + goto out; + rport->dev_loss_tmo = dev_loss_tmo; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(dev_loss_tmo, S_IRUGO | S_IWUSR, + show_srp_rport_dev_loss_tmo, + store_srp_rport_dev_loss_tmo); + +static int srp_rport_set_state(struct srp_rport *rport, + enum srp_rport_state new_state) +{ + enum srp_rport_state old_state = rport->state; + + lockdep_assert_held(&rport->mutex); + + switch (new_state) { + case SRP_RPORT_RUNNING: 
+ switch (old_state) { + case SRP_RPORT_LOST: + goto invalid; + default: + break; + } + break; + case SRP_RPORT_BLOCKED: + switch (old_state) { + case SRP_RPORT_RUNNING: + break; + default: + goto invalid; + } + break; + case SRP_RPORT_FAIL_FAST: + switch (old_state) { + case SRP_RPORT_LOST: + goto invalid; + default: + break; + } + break; + case SRP_RPORT_LOST: + break; + } + rport->state = new_state; + return 0; + +invalid: + return -EINVAL; +} + +/** + * srp_reconnect_work() - reconnect and schedule a new attempt if necessary + * @work: Work structure used for scheduling this operation. + */ +static void srp_reconnect_work(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, reconnect_work); + struct Scsi_Host *shost = rport_to_shost(rport); + int delay, res; + + res = srp_reconnect_rport(rport); + if (res != 0) { + shost_printk(KERN_ERR, shost, + "reconnect attempt %d failed (%d)\n", + ++rport->failed_reconnects, res); + delay = rport->reconnect_delay * + min(100, max(1, rport->failed_reconnects - 10)); + if (delay > 0) + queue_delayed_work(system_long_wq, + &rport->reconnect_work, delay * HZ); + } +} + +static void __rport_fail_io_fast(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i; + + lockdep_assert_held(&rport->mutex); + + if (srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST)) + return; + /* + * Call scsi_target_block() to wait for ongoing shost->queuecommand() + * calls before invoking i->f->terminate_rport_io(). + */ + scsi_target_block(rport->dev.parent); + scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); + + /* Involve the LLD if possible to terminate all I/O on the rport. */ + i = to_srp_internal(shost->transportt); + if (i->f->terminate_rport_io) + i->f->terminate_rport_io(rport); +} + +/** + * rport_fast_io_fail_timedout() - fast I/O failure timeout handler + * @work: Work structure used for scheduling this operation. + */ +static void rport_fast_io_fail_timedout(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, fast_io_fail_work); + struct Scsi_Host *shost = rport_to_shost(rport); + + pr_info("fast_io_fail_tmo expired for SRP %s / %s.\n", + dev_name(&rport->dev), dev_name(&shost->shost_gendev)); + + mutex_lock(&rport->mutex); + if (rport->state == SRP_RPORT_BLOCKED) + __rport_fail_io_fast(rport); + mutex_unlock(&rport->mutex); +} + +/** + * rport_dev_loss_timedout() - device loss timeout handler + * @work: Work structure used for scheduling this operation. 
+ */ +static void rport_dev_loss_timedout(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, dev_loss_work); + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i = to_srp_internal(shost->transportt); + + pr_info("dev_loss_tmo expired for SRP %s / %s.\n", + dev_name(&rport->dev), dev_name(&shost->shost_gendev)); + + mutex_lock(&rport->mutex); + WARN_ON(srp_rport_set_state(rport, SRP_RPORT_LOST) != 0); + scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); + mutex_unlock(&rport->mutex); + + i->f->rport_delete(rport); +} + +static void __srp_start_tl_fail_timers(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + int delay, fast_io_fail_tmo, dev_loss_tmo; + + lockdep_assert_held(&rport->mutex); + + delay = rport->reconnect_delay; + fast_io_fail_tmo = rport->fast_io_fail_tmo; + dev_loss_tmo = rport->dev_loss_tmo; + pr_debug("%s current state: %d\n", dev_name(&shost->shost_gendev), + rport->state); + + if (rport->state == SRP_RPORT_LOST) + return; + if (delay > 0) + queue_delayed_work(system_long_wq, &rport->reconnect_work, + 1UL * delay * HZ); + if ((fast_io_fail_tmo >= 0 || dev_loss_tmo >= 0) && + srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { + pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev), + rport->state); + scsi_target_block(&shost->shost_gendev); + if (fast_io_fail_tmo >= 0) + queue_delayed_work(system_long_wq, + &rport->fast_io_fail_work, + 1UL * fast_io_fail_tmo * HZ); + if (dev_loss_tmo >= 0) + queue_delayed_work(system_long_wq, + &rport->dev_loss_work, + 1UL * dev_loss_tmo * HZ); + } +} + +/** + * srp_start_tl_fail_timers() - start the transport layer failure timers + * @rport: SRP target port. + * + * Start the transport layer fast I/O failure and device loss timers. Do not + * modify a timer that was already started. + */ +void srp_start_tl_fail_timers(struct srp_rport *rport) +{ + mutex_lock(&rport->mutex); + __srp_start_tl_fail_timers(rport); + mutex_unlock(&rport->mutex); +} +EXPORT_SYMBOL(srp_start_tl_fail_timers); + +/** + * srp_reconnect_rport() - reconnect to an SRP target port + * @rport: SRP target port. + * + * Blocks SCSI command queueing before invoking reconnect() such that + * queuecommand() won't be invoked concurrently with reconnect() from outside + * the SCSI EH. This is important since a reconnect() implementation may + * reallocate resources needed by queuecommand(). + * + * Notes: + * - This function neither waits until outstanding requests have finished nor + * tries to abort these. It is the responsibility of the reconnect() + * function to finish outstanding commands before reconnecting to the target + * port. + * - It is the responsibility of the caller to ensure that the resources + * reallocated by the reconnect() function won't be used while this function + * is in progress. One possible strategy is to invoke this function from + * the context of the SCSI EH thread only. Another possible strategy is to + * lock the rport mutex inside each SCSI LLD callback that can be invoked by + * the SCSI EH (the scsi_host_template.eh_*() functions and also the + * scsi_host_template.queuecommand() function). 
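As an illustrative aside (not part of the patch): a hedged sketch of the "SCSI EH context" strategy recommended in the srp_reconnect_rport() notes below, i.e. a host-reset handler that simply delegates to srp_reconnect_rport(). It reuses the file-local shost_to_rport() helper purely for illustration; a real LLD outside this file would keep its own rport pointer.

static int example_srp_host_reset(struct scsi_cmnd *scmd)
{
	struct srp_rport *rport = shost_to_rport(scmd->device->host);

	if (!rport)
		return FAILED;

	/* Reconnect from SCSI EH context, as the notes below recommend. */
	return srp_reconnect_rport(rport) == 0 ? SUCCESS : FAILED;
}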
+ */ +int srp_reconnect_rport(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i = to_srp_internal(shost->transportt); + struct scsi_device *sdev; + int res; + + pr_debug("SCSI host %s\n", dev_name(&shost->shost_gendev)); + + res = mutex_lock_interruptible(&rport->mutex); + if (res) + goto out; + scsi_target_block(&shost->shost_gendev); + res = rport->state != SRP_RPORT_LOST ? i->f->reconnect(rport) : -ENODEV; + pr_debug("%s (state %d): transport.reconnect() returned %d\n", + dev_name(&shost->shost_gendev), rport->state, res); + if (res == 0) { + cancel_delayed_work(&rport->fast_io_fail_work); + cancel_delayed_work(&rport->dev_loss_work); + + rport->failed_reconnects = 0; + srp_rport_set_state(rport, SRP_RPORT_RUNNING); + scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING); + /* + * If the SCSI error handler has offlined one or more devices, + * invoking scsi_target_unblock() won't change the state of + * these devices into running so do that explicitly. + */ + shost_for_each_device(sdev, shost) { + mutex_lock(&sdev->state_mutex); + if (sdev->sdev_state == SDEV_OFFLINE) + sdev->sdev_state = SDEV_RUNNING; + mutex_unlock(&sdev->state_mutex); + } + } else if (rport->state == SRP_RPORT_RUNNING) { + /* + * srp_reconnect_rport() has been invoked with fast_io_fail + * and dev_loss off. Mark the port as failed and start the TL + * failure timers if these had not yet been started. + */ + __rport_fail_io_fast(rport); + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + __srp_start_tl_fail_timers(rport); + } else if (rport->state != SRP_RPORT_BLOCKED) { + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + } + mutex_unlock(&rport->mutex); + +out: + return res; +} +EXPORT_SYMBOL(srp_reconnect_rport); + +/** + * srp_timed_out() - SRP transport intercept of the SCSI timeout EH + * @scmd: SCSI command. + * + * If a timeout occurs while an rport is in the blocked state, ask the SCSI + * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core + * handle the timeout (BLK_EH_NOT_HANDLED). + * + * Note: This function is called from soft-IRQ context and with the request + * queue lock held. + */ +enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd) +{ + struct scsi_device *sdev = scmd->device; + struct Scsi_Host *shost = sdev->host; + struct srp_internal *i = to_srp_internal(shost->transportt); + struct srp_rport *rport = shost_to_rport(shost); + + pr_debug("timeout for sdev %s\n", dev_name(&sdev->sdev_gendev)); + return rport && rport->fast_io_fail_tmo < 0 && + rport->dev_loss_tmo < 0 && + i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ? 
+ BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED; +} +EXPORT_SYMBOL(srp_timed_out); + +static void srp_rport_release(struct device *dev) +{ + struct srp_rport *rport = dev_to_rport(dev); + + put_device(dev->parent); + kfree(rport); +} + +static int scsi_is_srp_rport(const struct device *dev) +{ + return dev->release == srp_rport_release; +} + +static int srp_rport_match(struct attribute_container *cont, + struct device *dev) +{ + struct Scsi_Host *shost; + struct srp_internal *i; + + if (!scsi_is_srp_rport(dev)) + return 0; + + shost = dev_to_shost(dev->parent); + if (!shost->transportt) + return 0; + if (shost->transportt->host_attrs.ac.class != &srp_host_class.class) + return 0; + + i = to_srp_internal(shost->transportt); + return &i->rport_attr_cont.ac == cont; +} + +static int srp_host_match(struct attribute_container *cont, struct device *dev) +{ + struct Scsi_Host *shost; + struct srp_internal *i; + + if (!scsi_is_host_device(dev)) + return 0; + + shost = dev_to_shost(dev); + if (!shost->transportt) + return 0; + if (shost->transportt->host_attrs.ac.class != &srp_host_class.class) + return 0; + + i = to_srp_internal(shost->transportt); + return &i->t.host_attrs.ac == cont; +} + +/** + * srp_rport_get() - increment rport reference count + * @rport: SRP target port. + */ +void srp_rport_get(struct srp_rport *rport) +{ + get_device(&rport->dev); +} +EXPORT_SYMBOL(srp_rport_get); + +/** + * srp_rport_put() - decrement rport reference count + * @rport: SRP target port. + */ +void srp_rport_put(struct srp_rport *rport) +{ + put_device(&rport->dev); +} +EXPORT_SYMBOL(srp_rport_put); + +/** + * srp_rport_add - add a SRP remote port to the device hierarchy + * @shost: scsi host the remote port is connected to. + * @ids: The port id for the remote port. + * + * Publishes a port to the rest of the system. + */ +struct srp_rport *srp_rport_add(struct Scsi_Host *shost, + struct srp_rport_identifiers *ids) +{ + struct srp_rport *rport; + struct device *parent = &shost->shost_gendev; + struct srp_internal *i = to_srp_internal(shost->transportt); + int id, ret; + + rport = kzalloc(sizeof(*rport), GFP_KERNEL); + if (!rport) + return ERR_PTR(-ENOMEM); + + mutex_init(&rport->mutex); + + device_initialize(&rport->dev); + + rport->dev.parent = get_device(parent); + rport->dev.release = srp_rport_release; + + memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id)); + rport->roles = ids->roles; + + if (i->f->reconnect) + rport->reconnect_delay = i->f->reconnect_delay ? + *i->f->reconnect_delay : 10; + INIT_DELAYED_WORK(&rport->reconnect_work, srp_reconnect_work); + rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ? + *i->f->fast_io_fail_tmo : 15; + rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60; + INIT_DELAYED_WORK(&rport->fast_io_fail_work, + rport_fast_io_fail_timedout); + INIT_DELAYED_WORK(&rport->dev_loss_work, rport_dev_loss_timedout); + + id = atomic_inc_return(&to_srp_host_attrs(shost)->next_port_id); + dev_set_name(&rport->dev, "port-%d:%d", shost->host_no, id); + + transport_setup_device(&rport->dev); + + ret = device_add(&rport->dev); + if (ret) { + transport_destroy_device(&rport->dev); + put_device(&rport->dev); + return ERR_PTR(ret); + } + + transport_add_device(&rport->dev); + transport_configure_device(&rport->dev); + + return rport; +} +EXPORT_SYMBOL_GPL(srp_rport_add); + +/** + * srp_rport_del - remove a SRP remote port + * @rport: SRP remote port to remove + * + * Removes the specified SRP remote port. 
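As an illustrative aside (not part of the patch): a sketch of the teardown ordering these helpers expect, following the srp_remove_host() and srp_stop_rport_timers() comments below; the wrapper name is hypothetical.

static void example_srp_teardown(struct Scsi_Host *shost, struct srp_rport *rport)
{
	srp_remove_host(shost);		/* removes every rport child; call just before scsi_remove_host() */
	scsi_remove_host(shost);
	srp_stop_rport_timers(rport);	/* caller must still hold the rport and host references */
	srp_rport_put(rport);		/* drop the LLD's own reference, e.g. one taken with srp_rport_get() */
}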
+ */ +void srp_rport_del(struct srp_rport *rport) +{ + struct device *dev = &rport->dev; + + transport_remove_device(dev); + device_del(dev); + transport_destroy_device(dev); + + put_device(dev); +} +EXPORT_SYMBOL_GPL(srp_rport_del); + +static int do_srp_rport_del(struct device *dev, void *data) +{ + if (scsi_is_srp_rport(dev)) + srp_rport_del(dev_to_rport(dev)); + return 0; +} + +/** + * srp_remove_host - tear down a Scsi_Host's SRP data structures + * @shost: Scsi Host that is torn down + * + * Removes all SRP remote ports for a given Scsi_Host. + * Must be called just before scsi_remove_host for SRP HBAs. + */ +void srp_remove_host(struct Scsi_Host *shost) +{ + device_for_each_child(&shost->shost_gendev, NULL, do_srp_rport_del); +} +EXPORT_SYMBOL_GPL(srp_remove_host); + +/** + * srp_stop_rport_timers - stop the transport layer recovery timers + * @rport: SRP remote port for which to stop the timers. + * + * Must be called after srp_remove_host() and scsi_remove_host(). The caller + * must hold a reference on the rport (rport->dev) and on the SCSI host + * (rport->dev.parent). + */ +void srp_stop_rport_timers(struct srp_rport *rport) +{ + mutex_lock(&rport->mutex); + if (rport->state == SRP_RPORT_BLOCKED) + __rport_fail_io_fast(rport); + srp_rport_set_state(rport, SRP_RPORT_LOST); + mutex_unlock(&rport->mutex); + + cancel_delayed_work_sync(&rport->reconnect_work); + cancel_delayed_work_sync(&rport->fast_io_fail_work); + cancel_delayed_work_sync(&rport->dev_loss_work); +} +EXPORT_SYMBOL_GPL(srp_stop_rport_timers); + +/** + * srp_attach_transport - instantiate SRP transport template + * @ft: SRP transport class function template + */ +struct scsi_transport_template * +srp_attach_transport(struct srp_function_template *ft) +{ + int count; + struct srp_internal *i; + + i = kzalloc(sizeof(*i), GFP_KERNEL); + if (!i) + return NULL; + + i->t.host_size = sizeof(struct srp_host_attrs); + i->t.host_attrs.ac.attrs = &i->host_attrs[0]; + i->t.host_attrs.ac.class = &srp_host_class.class; + i->t.host_attrs.ac.match = srp_host_match; + i->host_attrs[0] = NULL; + transport_container_register(&i->t.host_attrs); + + i->rport_attr_cont.ac.attrs = &i->rport_attrs[0]; + i->rport_attr_cont.ac.class = &srp_rport_class.class; + i->rport_attr_cont.ac.match = srp_rport_match; + + count = 0; + i->rport_attrs[count++] = &dev_attr_port_id; + i->rport_attrs[count++] = &dev_attr_roles; + if (ft->has_rport_state) { + i->rport_attrs[count++] = &dev_attr_state; + i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo; + i->rport_attrs[count++] = &dev_attr_dev_loss_tmo; + } + if (ft->reconnect) { + i->rport_attrs[count++] = &dev_attr_reconnect_delay; + i->rport_attrs[count++] = &dev_attr_failed_reconnects; + } + if (ft->rport_delete) + i->rport_attrs[count++] = &dev_attr_delete; + i->rport_attrs[count++] = NULL; + BUG_ON(count > ARRAY_SIZE(i->rport_attrs)); + + transport_container_register(&i->rport_attr_cont); + + i->f = ft; + + return &i->t; +} +EXPORT_SYMBOL_GPL(srp_attach_transport); + +/** + * srp_release_transport - release SRP transport template instance + * @t: transport template instance + */ +void srp_release_transport(struct scsi_transport_template *t) +{ + struct srp_internal *i = to_srp_internal(t); + + transport_container_unregister(&i->t.host_attrs); + transport_container_unregister(&i->rport_attr_cont); + + kfree(i); +} +EXPORT_SYMBOL_GPL(srp_release_transport); + +static __init int srp_transport_init(void) +{ + int ret; + + ret = transport_class_register(&srp_host_class); + if (ret) + return ret; + ret = 
transport_class_register(&srp_rport_class); + if (ret) + goto unregister_host_class; + + return 0; +unregister_host_class: + transport_class_unregister(&srp_host_class); + return ret; +} + +static void __exit srp_transport_exit(void) +{ + transport_class_unregister(&srp_host_class); + transport_class_unregister(&srp_rport_class); +} + +MODULE_AUTHOR("FUJITA Tomonori"); +MODULE_DESCRIPTION("SRP Transport Attributes"); +MODULE_LICENSE("GPL"); + +module_init(srp_transport_init); +module_exit(srp_transport_exit); diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h new file mode 100644 index 0000000..b4ade19 --- /dev/null +++ b/include/linux/blk-mq-rdma.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_BLK_MQ_RDMA_H +#define _LINUX_BLK_MQ_RDMA_H + +struct blk_mq_tag_set; +struct ib_device; + +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec); + +#endif /* _LINUX_BLK_MQ_RDMA_H */ diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h new file mode 100644 index 0000000..e94290b --- /dev/null +++ b/include/linux/cgroup_rdma.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2016 Parav Pandit + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#ifndef _CGROUP_RDMA_H +#define _CGROUP_RDMA_H + +#include + +enum rdmacg_resource_type { + RDMACG_RESOURCE_HCA_HANDLE, + RDMACG_RESOURCE_HCA_OBJECT, + RDMACG_RESOURCE_MAX, +}; + +#ifdef CONFIG_CGROUP_RDMA + +struct rdma_cgroup { + struct cgroup_subsys_state css; + + /* + * head to keep track of all resource pools + * that belongs to this cgroup. + */ + struct list_head rpools; +}; + +struct rdmacg_device { + struct list_head dev_node; + struct list_head rpools; + char *name; +}; + +/* + * APIs for RDMA/IB stack to publish when a device wants to + * participate in resource accounting + */ +int rdmacg_register_device(struct rdmacg_device *device); +void rdmacg_unregister_device(struct rdmacg_device *device); + +/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */ +int rdmacg_try_charge(struct rdma_cgroup **rdmacg, + struct rdmacg_device *device, + enum rdmacg_resource_type index); +void rdmacg_uncharge(struct rdma_cgroup *cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index); +#endif /* CONFIG_CGROUP_RDMA */ +#endif /* _CGROUP_RDMA_H */ diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h new file mode 100644 index 0000000..7b74afc --- /dev/null +++ b/include/linux/mlx4/cmd.h @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_CMD_H +#define MLX4_CMD_H + +#include +#include +#include +#include + +enum { + /* initialization and general commands */ + MLX4_CMD_SYS_EN = 0x1, + MLX4_CMD_SYS_DIS = 0x2, + MLX4_CMD_MAP_FA = 0xfff, + MLX4_CMD_UNMAP_FA = 0xffe, + MLX4_CMD_RUN_FW = 0xff6, + MLX4_CMD_MOD_STAT_CFG = 0x34, + MLX4_CMD_QUERY_DEV_CAP = 0x3, + MLX4_CMD_QUERY_FW = 0x4, + MLX4_CMD_ENABLE_LAM = 0xff8, + MLX4_CMD_DISABLE_LAM = 0xff7, + MLX4_CMD_QUERY_DDR = 0x5, + MLX4_CMD_QUERY_ADAPTER = 0x6, + MLX4_CMD_INIT_HCA = 0x7, + MLX4_CMD_CLOSE_HCA = 0x8, + MLX4_CMD_INIT_PORT = 0x9, + MLX4_CMD_CLOSE_PORT = 0xa, + MLX4_CMD_QUERY_HCA = 0xb, + MLX4_CMD_QUERY_PORT = 0x43, + MLX4_CMD_SENSE_PORT = 0x4d, + MLX4_CMD_HW_HEALTH_CHECK = 0x50, + MLX4_CMD_SET_PORT = 0xc, + MLX4_CMD_SET_NODE = 0x5a, + MLX4_CMD_QUERY_FUNC = 0x56, + MLX4_CMD_ACCESS_DDR = 0x2e, + MLX4_CMD_MAP_ICM = 0xffa, + MLX4_CMD_UNMAP_ICM = 0xff9, + MLX4_CMD_MAP_ICM_AUX = 0xffc, + MLX4_CMD_UNMAP_ICM_AUX = 0xffb, + MLX4_CMD_SET_ICM_SIZE = 0xffd, + MLX4_CMD_ACCESS_REG = 0x3b, + MLX4_CMD_ALLOCATE_VPP = 0x80, + MLX4_CMD_SET_VPORT_QOS = 0x81, + + /*master notify fw on finish for slave's flr*/ + MLX4_CMD_INFORM_FLR_DONE = 0x5b, + MLX4_CMD_VIRT_PORT_MAP = 0x5c, + MLX4_CMD_GET_OP_REQ = 0x59, + + /* TPT commands */ + MLX4_CMD_SW2HW_MPT = 0xd, + MLX4_CMD_QUERY_MPT = 0xe, + MLX4_CMD_HW2SW_MPT = 0xf, + MLX4_CMD_READ_MTT = 0x10, + MLX4_CMD_WRITE_MTT = 0x11, + MLX4_CMD_SYNC_TPT = 0x2f, + + /* EQ commands */ + MLX4_CMD_MAP_EQ = 0x12, + MLX4_CMD_SW2HW_EQ = 0x13, + MLX4_CMD_HW2SW_EQ = 0x14, + MLX4_CMD_QUERY_EQ = 0x15, + + /* CQ commands */ + MLX4_CMD_SW2HW_CQ = 0x16, + MLX4_CMD_HW2SW_CQ = 0x17, + MLX4_CMD_QUERY_CQ = 0x18, + MLX4_CMD_MODIFY_CQ = 0x2c, + + /* SRQ commands */ + MLX4_CMD_SW2HW_SRQ = 0x35, + MLX4_CMD_HW2SW_SRQ = 0x36, + MLX4_CMD_QUERY_SRQ = 0x37, + MLX4_CMD_ARM_SRQ = 0x40, + + /* QP/EE commands */ + MLX4_CMD_RST2INIT_QP = 0x19, + MLX4_CMD_INIT2RTR_QP = 0x1a, + MLX4_CMD_RTR2RTS_QP = 0x1b, + MLX4_CMD_RTS2RTS_QP = 0x1c, + MLX4_CMD_SQERR2RTS_QP = 0x1d, + MLX4_CMD_2ERR_QP = 0x1e, + MLX4_CMD_RTS2SQD_QP = 0x1f, + MLX4_CMD_SQD2SQD_QP = 0x38, + MLX4_CMD_SQD2RTS_QP = 0x20, + MLX4_CMD_2RST_QP = 0x21, + MLX4_CMD_QUERY_QP = 0x22, + MLX4_CMD_INIT2INIT_QP = 0x2d, + MLX4_CMD_SUSPEND_QP = 0x32, + MLX4_CMD_UNSUSPEND_QP = 0x33, + MLX4_CMD_UPDATE_QP = 0x61, + /* special QP and management commands */ + MLX4_CMD_CONF_SPECIAL_QP = 0x23, + MLX4_CMD_MAD_IFC = 0x24, + MLX4_CMD_MAD_DEMUX = 0x203, + + /* multicast commands */ + MLX4_CMD_READ_MCG = 0x25, + MLX4_CMD_WRITE_MCG = 0x26, + MLX4_CMD_MGID_HASH = 0x27, + + /* miscellaneous commands */ + MLX4_CMD_DIAG_RPRT = 0x30, + MLX4_CMD_NOP = 0x31, + MLX4_CMD_CONFIG_DEV = 0x3a, + MLX4_CMD_ACCESS_MEM = 0x2e, + MLX4_CMD_SET_VEP = 0x52, + + /* Ethernet specific commands */ + MLX4_CMD_SET_VLAN_FLTR = 0x47, + MLX4_CMD_SET_MCAST_FLTR = 0x48, + MLX4_CMD_DUMP_ETH_STATS = 0x49, + + /* Communication channel commands */ + MLX4_CMD_ARM_COMM_CHANNEL = 0x57, + MLX4_CMD_GEN_EQE = 0x58, + + /* virtual commands */ + MLX4_CMD_ALLOC_RES = 0xf00, + MLX4_CMD_FREE_RES = 
0xf01, + MLX4_CMD_MCAST_ATTACH = 0xf05, + MLX4_CMD_UCAST_ATTACH = 0xf06, + MLX4_CMD_PROMISC = 0xf08, + MLX4_CMD_QUERY_FUNC_CAP = 0xf0a, + MLX4_CMD_QP_ATTACH = 0xf0b, + + /* debug commands */ + MLX4_CMD_QUERY_DEBUG_MSG = 0x2a, + MLX4_CMD_SET_DEBUG_MSG = 0x2b, + + /* statistics commands */ + MLX4_CMD_QUERY_IF_STAT = 0X54, + MLX4_CMD_SET_IF_STAT = 0X55, + + /* register/delete flow steering network rules */ + MLX4_QP_FLOW_STEERING_ATTACH = 0x65, + MLX4_QP_FLOW_STEERING_DETACH = 0x66, + MLX4_FLOW_STEERING_IB_UC_QP_RANGE = 0x64, + + /* Update and read QCN parameters */ + MLX4_CMD_CONGESTION_CTRL_OPCODE = 0x68, +}; + +enum { + MLX4_CMD_TIME_CLASS_A = 60000, + MLX4_CMD_TIME_CLASS_B = 60000, + MLX4_CMD_TIME_CLASS_C = 60000, +}; + +enum { + /* virtual to physical port mapping opcode modifiers */ + MLX4_GET_PORT_VIRT2PHY = 0x0, + MLX4_SET_PORT_VIRT2PHY = 0x1, +}; + +enum { + MLX4_MAILBOX_SIZE = 4096, + MLX4_ACCESS_MEM_ALIGN = 256, +}; + +enum { + /* Set port opcode modifiers */ + MLX4_SET_PORT_IB_OPCODE = 0x0, + MLX4_SET_PORT_ETH_OPCODE = 0x1, + MLX4_SET_PORT_BEACON_OPCODE = 0x4, +}; + +enum { + /* Set port Ethernet input modifiers */ + MLX4_SET_PORT_GENERAL = 0x0, + MLX4_SET_PORT_RQP_CALC = 0x1, + MLX4_SET_PORT_MAC_TABLE = 0x2, + MLX4_SET_PORT_VLAN_TABLE = 0x3, + MLX4_SET_PORT_PRIO_MAP = 0x4, + MLX4_SET_PORT_GID_TABLE = 0x5, + MLX4_SET_PORT_PRIO2TC = 0x8, + MLX4_SET_PORT_SCHEDULER = 0x9, + MLX4_SET_PORT_VXLAN = 0xB, + MLX4_SET_PORT_ROCE_ADDR = 0xD +}; + +enum { + MLX4_CMD_MAD_DEMUX_CONFIG = 0, + MLX4_CMD_MAD_DEMUX_QUERY_STATE = 1, + MLX4_CMD_MAD_DEMUX_QUERY_RESTR = 2, /* Query mad demux restrictions */ +}; + +enum { + MLX4_CMD_WRAPPED, + MLX4_CMD_NATIVE +}; + +/* + * MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP - + * Receive checksum value is reported in CQE also for non TCP/UDP packets. + * + * MLX4_RX_CSUM_MODE_L4 - + * L4_CSUM bit in CQE, which indicates whether or not L4 checksum + * was validated correctly, is supported. + * + * MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP - + * IP_OK CQE's field is supported also for non TCP/UDP IP packets. + * + * MLX4_RX_CSUM_MODE_MULTI_VLAN - + * Receive Checksum offload is supported for packets with more than 2 vlan headers. 
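As an illustrative aside (not part of the patch): a sketch of querying the per-port receive checksum modes described above via mlx4_config_dev_retrieval() and the mlx4_config_dev_params structure declared further below; the helper name is hypothetical.

static bool example_port1_reports_non_tcp_udp_csum(struct mlx4_dev *dev)
{
	struct mlx4_config_dev_params params;

	if (mlx4_config_dev_retrieval(dev, &params))
		return false;	/* query failed, e.g. CONFIG_DEV not supported */

	return params.rx_csum_flags_port_1 & MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP;
}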
+ */ +enum mlx4_rx_csum_mode { + MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP = 1UL << 0, + MLX4_RX_CSUM_MODE_L4 = 1UL << 1, + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP = 1UL << 2, + MLX4_RX_CSUM_MODE_MULTI_VLAN = 1UL << 3 +}; + +struct mlx4_config_dev_params { + u16 vxlan_udp_dport; + u8 rx_csum_flags_port_1; + u8 rx_csum_flags_port_2; +}; + +enum mlx4_en_congestion_control_algorithm { + MLX4_CTRL_ALGO_802_1_QAU_REACTION_POINT = 0, +}; + +enum mlx4_en_congestion_control_opmod { + MLX4_CONGESTION_CONTROL_GET_PARAMS, + MLX4_CONGESTION_CONTROL_GET_STATISTICS, + MLX4_CONGESTION_CONTROL_SET_PARAMS = 4, +}; + +struct mlx4_dev; + +struct mlx4_cmd_mailbox { + void *buf; + dma_addr_t dma; +}; + +int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout, int native); + +/* Invoke a command with no output parameter */ +static inline int mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u32 in_modifier, + u8 op_modifier, u16 op, unsigned long timeout, + int native) +{ + return __mlx4_cmd(dev, in_param, NULL, 0, in_modifier, + op_modifier, op, timeout, native); +} + +/* Invoke a command with an output mailbox */ +static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param, + u32 in_modifier, u8 op_modifier, u16 op, + unsigned long timeout, int native) +{ + return __mlx4_cmd(dev, in_param, &out_param, 0, in_modifier, + op_modifier, op, timeout, native); +} + +/* + * Invoke a command with an immediate output parameter (and copy the + * output into the caller's out_param pointer after the command + * executes). + */ +static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + u32 in_modifier, u8 op_modifier, u16 op, + unsigned long timeout, int native) +{ + return __mlx4_cmd(dev, in_param, out_param, 1, in_modifier, + op_modifier, op, timeout, native); +} + +struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev); +void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox); + +int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index, + struct mlx4_counter *counter_stats, int reset); +int mlx4_get_vf_stats(struct mlx4_dev *dev, int port, int vf_idx, + struct ifla_vf_stats *vf_stats); +u32 mlx4_comm_get_version(void); +int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac); +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, + u8 qos, __be16 proto); +int mlx4_set_vf_rate(struct mlx4_dev *dev, int port, int vf, int min_tx_rate, + int max_tx_rate); +int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting); +int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_info *ivf); +int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state); +int mlx4_config_dev_retrieval(struct mlx4_dev *dev, + struct mlx4_config_dev_params *params); +void mlx4_cmd_wake_completions(struct mlx4_dev *dev); +void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev); +/* + * mlx4_get_slave_default_vlan - + * return true if VST ( default vlan) + * if VST, will return vlan & qos (if not NULL) + */ +bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, + u16 *vlan, u8 *qos); + +#define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8) +#define COMM_CHAN_EVENT_INTERNAL_ERR (1 << 17) + +#endif /* MLX4_CMD_H */ diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h new file mode 100644 index 0000000..508e8cc --- /dev/null +++ 
b/include/linux/mlx4/cq.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_CQ_H +#define MLX4_CQ_H + +#include +#include + +#include +#include + +struct mlx4_cqe { + __be32 vlan_my_qpn; + __be32 immed_rss_invalid; + __be32 g_mlpath_rqpn; + __be16 sl_vid; + union { + struct { + __be16 rlid; + __be16 status; + u8 ipv6_ext_mask; + u8 badfcs_enc; + }; + u8 smac[ETH_ALEN]; + }; + __be32 byte_cnt; + __be16 wqe_index; + __be16 checksum; + u8 reserved[3]; + u8 owner_sr_opcode; +}; + +struct mlx4_err_cqe { + __be32 my_qpn; + u32 reserved1[5]; + __be16 wqe_index; + u8 vendor_err_syndrome; + u8 syndrome; + u8 reserved2[3]; + u8 owner_sr_opcode; +}; + +struct mlx4_ts_cqe { + __be32 vlan_my_qpn; + __be32 immed_rss_invalid; + __be32 g_mlpath_rqpn; + __be32 timestamp_hi; + __be16 status; + u8 ipv6_ext_mask; + u8 badfcs_enc; + __be32 byte_cnt; + __be16 wqe_index; + __be16 checksum; + u8 reserved; + __be16 timestamp_lo; + u8 owner_sr_opcode; +} __packed; + +enum { + MLX4_CQE_L2_TUNNEL_IPOK = 1 << 31, + MLX4_CQE_CVLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_SVLAN_PRESENT_MASK = 1 << 30, + MLX4_CQE_L2_TUNNEL = 1 << 27, + MLX4_CQE_L2_TUNNEL_CSUM = 1 << 26, + MLX4_CQE_L2_TUNNEL_IPV4 = 1 << 25, + + MLX4_CQE_QPN_MASK = 0xffffff, + MLX4_CQE_VID_MASK = 0xfff, +}; + +enum { + MLX4_CQE_OWNER_MASK = 0x80, + MLX4_CQE_IS_SEND_MASK = 0x40, + MLX4_CQE_OPCODE_MASK = 0x1f +}; + +enum { + MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01, + MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02, + MLX4_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04, + MLX4_CQE_SYNDROME_WR_FLUSH_ERR = 0x05, + MLX4_CQE_SYNDROME_MW_BIND_ERR = 0x06, + MLX4_CQE_SYNDROME_BAD_RESP_ERR = 0x10, + MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11, + MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13, + MLX4_CQE_SYNDROME_REMOTE_OP_ERR = 0x14, + MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15, + MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, +}; + +enum { + MLX4_CQE_STATUS_IPV4 = 1 << 6, + MLX4_CQE_STATUS_IPV4F = 1 << 7, + MLX4_CQE_STATUS_IPV6 = 1 << 8, + MLX4_CQE_STATUS_IPV4OPT = 1 << 9, + MLX4_CQE_STATUS_TCP = 1 << 10, + MLX4_CQE_STATUS_UDP = 1 << 11, 
+ MLX4_CQE_STATUS_IPOK = 1 << 12, +}; + +enum { + MLX4_CQE_LLC = 1, + MLX4_CQE_SNAP = 1 << 1, + MLX4_CQE_BAD_FCS = 1 << 4, +}; + +#define MLX4_MAX_CQ_PERIOD (BIT(16) - 1) +#define MLX4_MAX_CQ_COUNT (BIT(16) - 1) + +static inline void mlx4_cq_arm(struct mlx4_cq *cq, u32 cmd, + void __iomem *uar_page, + spinlock_t *doorbell_lock) +{ + __be32 doorbell[2]; + u32 sn; + u32 ci; + + sn = cq->arm_sn & 3; + ci = cq->cons_index & 0xffffff; + + *cq->arm_db = cpu_to_be32(sn << 28 | cmd | ci); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + wmb(); + + doorbell[0] = cpu_to_be32(sn << 28 | cmd | cq->cqn); + doorbell[1] = cpu_to_be32(ci); + + mlx4_write64(doorbell, uar_page + MLX4_CQ_DOORBELL, doorbell_lock); +} + +static inline void mlx4_cq_set_ci(struct mlx4_cq *cq) +{ + *cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff); +} + +enum { + MLX4_CQ_DB_REQ_NOT_SOL = 1 << 24, + MLX4_CQ_DB_REQ_NOT = 2 << 24 +}; + +int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq, + u16 count, u16 period); +int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq, + int entries, struct mlx4_mtt *mtt); + +#endif /* MLX4_CQ_H */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h new file mode 100644 index 0000000..81d0799 --- /dev/null +++ b/include/linux/mlx4/device.h @@ -0,0 +1,1595 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_DEVICE_H +#define MLX4_DEVICE_H + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#define DEFAULT_UAR_PAGE_SHIFT 12 + +#define MAX_MSIX_P_PORT 17 +#define MAX_MSIX 64 +#define MIN_MSIX_P_PORT 5 +#define MLX4_IS_LEGACY_EQ_MODE(dev_cap) ((dev_cap).num_comp_vectors < \ + (dev_cap).num_ports * MIN_MSIX_P_PORT) + +#define MLX4_MAX_100M_UNITS_VAL 255 /* + * work around: can't set values + * greater then this value when + * using 100 Mbps units. 
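As an illustrative aside (not part of the patch): one plausible way a caller might honor the MLX4_MAX_100M_UNITS_VAL restriction described above when choosing a rate-limit unit, using the MLX4_RATELIMIT_* defines just below. The helper name and selection logic are illustrative only and may differ from what the drivers actually do.

static void example_pick_rate_unit(u32 rate_mbps, u8 *unit, u16 *value)
{
	if (rate_mbps / 100 <= MLX4_MAX_100M_UNITS_VAL) {
		/* Small rates: count in 100 Mbps steps. */
		*unit = MLX4_RATELIMIT_100M_UNITS;
		*value = rate_mbps / 100;
	} else {
		/* Larger rates would overflow the 100 Mbps field; switch to 1 Gbps steps. */
		*unit = MLX4_RATELIMIT_1G_UNITS;
		*value = rate_mbps / 1000;
	}
}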
+ */ +#define MLX4_RATELIMIT_100M_UNITS 3 /* 100 Mbps */ +#define MLX4_RATELIMIT_1G_UNITS 4 /* 1 Gbps */ +#define MLX4_RATELIMIT_DEFAULT 0x00ff + +#define MLX4_ROCE_MAX_GIDS 128 +#define MLX4_ROCE_PF_GIDS 16 + +enum { + MLX4_FLAG_MSI_X = 1 << 0, + MLX4_FLAG_OLD_PORT_CMDS = 1 << 1, + MLX4_FLAG_MASTER = 1 << 2, + MLX4_FLAG_SLAVE = 1 << 3, + MLX4_FLAG_SRIOV = 1 << 4, + MLX4_FLAG_OLD_REG_MAC = 1 << 6, + MLX4_FLAG_BONDED = 1 << 7, + MLX4_FLAG_SECURE_HOST = 1 << 8, +}; + +enum { + MLX4_PORT_CAP_IS_SM = 1 << 1, + MLX4_PORT_CAP_DEV_MGMT_SUP = 1 << 19, +}; + +enum { + MLX4_MAX_PORTS = 2, + MLX4_MAX_PORT_PKEYS = 128, + MLX4_MAX_PORT_GIDS = 128 +}; + +/* base qkey for use in sriov tunnel-qp/proxy-qp communication. + * These qkeys must not be allowed for general use. This is a 64k range, + * and to test for violation, we use the mask (protect against future chg). + */ +#define MLX4_RESERVED_QKEY_BASE (0xFFFF0000) +#define MLX4_RESERVED_QKEY_MASK (0xFFFF0000) + +enum { + MLX4_BOARD_ID_LEN = 64 +}; + +enum { + MLX4_MAX_NUM_PF = 16, + MLX4_MAX_NUM_VF = 126, + MLX4_MAX_NUM_VF_P_PORT = 64, + MLX4_MFUNC_MAX = 128, + MLX4_MAX_EQ_NUM = 1024, + MLX4_MFUNC_EQ_NUM = 4, + MLX4_MFUNC_MAX_EQES = 8, + MLX4_MFUNC_EQE_MASK = (MLX4_MFUNC_MAX_EQES - 1) +}; + +/* Driver supports 3 different device methods to manage traffic steering: + * -device managed - High level API for ib and eth flow steering. FW is + * managing flow steering tables. + * - B0 steering mode - Common low level API for ib and (if supported) eth. + * - A0 steering mode - Limited low level API for eth. In case of IB, + * B0 mode is in use. + */ +enum { + MLX4_STEERING_MODE_A0, + MLX4_STEERING_MODE_B0, + MLX4_STEERING_MODE_DEVICE_MANAGED +}; + +enum { + MLX4_STEERING_DMFS_A0_DEFAULT, + MLX4_STEERING_DMFS_A0_DYNAMIC, + MLX4_STEERING_DMFS_A0_STATIC, + MLX4_STEERING_DMFS_A0_DISABLE, + MLX4_STEERING_DMFS_A0_NOT_SUPPORTED +}; + +static inline const char *mlx4_steering_mode_str(int steering_mode) +{ + switch (steering_mode) { + case MLX4_STEERING_MODE_A0: + return "A0 steering"; + + case MLX4_STEERING_MODE_B0: + return "B0 steering"; + + case MLX4_STEERING_MODE_DEVICE_MANAGED: + return "Device managed flow steering"; + + default: + return "Unrecognize steering mode"; + } +} + +enum { + MLX4_TUNNEL_OFFLOAD_MODE_NONE, + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN +}; + +enum { + MLX4_DEV_CAP_FLAG_RC = 1LL << 0, + MLX4_DEV_CAP_FLAG_UC = 1LL << 1, + MLX4_DEV_CAP_FLAG_UD = 1LL << 2, + MLX4_DEV_CAP_FLAG_XRC = 1LL << 3, + MLX4_DEV_CAP_FLAG_SRQ = 1LL << 6, + MLX4_DEV_CAP_FLAG_IPOIB_CSUM = 1LL << 7, + MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1LL << 8, + MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1LL << 9, + MLX4_DEV_CAP_FLAG_DPDP = 1LL << 12, + MLX4_DEV_CAP_FLAG_BLH = 1LL << 15, + MLX4_DEV_CAP_FLAG_MEM_WINDOW = 1LL << 16, + MLX4_DEV_CAP_FLAG_APM = 1LL << 17, + MLX4_DEV_CAP_FLAG_ATOMIC = 1LL << 18, + MLX4_DEV_CAP_FLAG_RAW_MCAST = 1LL << 19, + MLX4_DEV_CAP_FLAG_UD_AV_PORT = 1LL << 20, + MLX4_DEV_CAP_FLAG_UD_MCAST = 1LL << 21, + MLX4_DEV_CAP_FLAG_IBOE = 1LL << 30, + MLX4_DEV_CAP_FLAG_UC_LOOPBACK = 1LL << 32, + MLX4_DEV_CAP_FLAG_FCS_KEEP = 1LL << 34, + MLX4_DEV_CAP_FLAG_WOL_PORT1 = 1LL << 37, + MLX4_DEV_CAP_FLAG_WOL_PORT2 = 1LL << 38, + MLX4_DEV_CAP_FLAG_UDP_RSS = 1LL << 40, + MLX4_DEV_CAP_FLAG_VEP_UC_STEER = 1LL << 41, + MLX4_DEV_CAP_FLAG_VEP_MC_STEER = 1LL << 42, + MLX4_DEV_CAP_FLAG_COUNTERS = 1LL << 48, + MLX4_DEV_CAP_FLAG_RSS_IP_FRAG = 1LL << 52, + MLX4_DEV_CAP_FLAG_SET_ETH_SCHED = 1LL << 53, + MLX4_DEV_CAP_FLAG_SENSE_SUPPORT = 1LL << 55, + MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV = 1LL << 59, + 
MLX4_DEV_CAP_FLAG_64B_EQE = 1LL << 61, + MLX4_DEV_CAP_FLAG_64B_CQE = 1LL << 62 +}; + +enum { + MLX4_DEV_CAP_FLAG2_RSS = 1LL << 0, + MLX4_DEV_CAP_FLAG2_RSS_TOP = 1LL << 1, + MLX4_DEV_CAP_FLAG2_RSS_XOR = 1LL << 2, + MLX4_DEV_CAP_FLAG2_FS_EN = 1LL << 3, + MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN = 1LL << 4, + MLX4_DEV_CAP_FLAG2_TS = 1LL << 5, + MLX4_DEV_CAP_FLAG2_VLAN_CONTROL = 1LL << 6, + MLX4_DEV_CAP_FLAG2_FSM = 1LL << 7, + MLX4_DEV_CAP_FLAG2_UPDATE_QP = 1LL << 8, + MLX4_DEV_CAP_FLAG2_DMFS_IPOIB = 1LL << 9, + MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS = 1LL << 10, + MLX4_DEV_CAP_FLAG2_MAD_DEMUX = 1LL << 11, + MLX4_DEV_CAP_FLAG2_CQE_STRIDE = 1LL << 12, + MLX4_DEV_CAP_FLAG2_EQE_STRIDE = 1LL << 13, + MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL = 1LL << 14, + MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP = 1LL << 15, + MLX4_DEV_CAP_FLAG2_CONFIG_DEV = 1LL << 16, + MLX4_DEV_CAP_FLAG2_SYS_EQS = 1LL << 17, + MLX4_DEV_CAP_FLAG2_80_VFS = 1LL << 18, + MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19, + MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20, + MLX4_DEV_CAP_FLAG2_PORT_REMAP = 1LL << 21, + MLX4_DEV_CAP_FLAG2_QCN = 1LL << 22, + MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT = 1LL << 23, + MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN = 1LL << 24, + MLX4_DEV_CAP_FLAG2_QOS_VPP = 1LL << 25, + MLX4_DEV_CAP_FLAG2_ETS_CFG = 1LL << 26, + MLX4_DEV_CAP_FLAG2_PORT_BEACON = 1LL << 27, + MLX4_DEV_CAP_FLAG2_IGNORE_FCS = 1LL << 28, + MLX4_DEV_CAP_FLAG2_PHV_EN = 1LL << 29, + MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN = 1LL << 30, + MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1ULL << 31, + MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1ULL << 32, + MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, + MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER = 1ULL << 34, + MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT = 1ULL << 35, + MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP = 1ULL << 36, + MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT = 1ULL << 37, + MLX4_DEV_CAP_FLAG2_USER_MAC_EN = 1ULL << 38, +}; + +enum { + MLX4_QUERY_FUNC_FLAGS_BF_RES_QP = 1LL << 0, + MLX4_QUERY_FUNC_FLAGS_A0_RES_QP = 1LL << 1 +}; + +enum { + MLX4_VF_CAP_FLAG_RESET = 1 << 0 +}; + +/* bit enums for an 8-bit flags field indicating special use + * QPs which require special handling in qp_reserve_range. + * Currently, this only includes QPs used by the ETH interface, + * where we expect to use blueflame. These QPs must not have + * bits 6 and 7 set in their qp number. + * + * This enum may use only bits 0..7. 
+ */ +enum { + MLX4_RESERVE_A0_QP = 1 << 6, + MLX4_RESERVE_ETH_BF_QP = 1 << 7, +}; + +enum { + MLX4_DEV_CAP_64B_EQE_ENABLED = 1LL << 0, + MLX4_DEV_CAP_64B_CQE_ENABLED = 1LL << 1, + MLX4_DEV_CAP_CQE_STRIDE_ENABLED = 1LL << 2, + MLX4_DEV_CAP_EQE_STRIDE_ENABLED = 1LL << 3 +}; + +enum { + MLX4_FUNC_CAP_64B_EQE_CQE = 1L << 0, + MLX4_FUNC_CAP_EQE_CQE_STRIDE = 1L << 1, + MLX4_FUNC_CAP_DMFS_A0_STATIC = 1L << 2 +}; + + +#define MLX4_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) + +enum { + MLX4_BMME_FLAG_WIN_TYPE_2B = 1 << 1, + MLX4_BMME_FLAG_LOCAL_INV = 1 << 6, + MLX4_BMME_FLAG_REMOTE_INV = 1 << 7, + MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9, + MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10, + MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11, + MLX4_BMME_FLAG_ROCE_V1_V2 = 1 << 19, + MLX4_BMME_FLAG_PORT_REMAP = 1 << 24, + MLX4_BMME_FLAG_VSD_INIT2RTR = 1 << 28, +}; + +enum { + MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP, + MLX4_FLAG_ROCE_V1_V2 = MLX4_BMME_FLAG_ROCE_V1_V2 +}; + +enum mlx4_event { + MLX4_EVENT_TYPE_COMP = 0x00, + MLX4_EVENT_TYPE_PATH_MIG = 0x01, + MLX4_EVENT_TYPE_COMM_EST = 0x02, + MLX4_EVENT_TYPE_SQ_DRAINED = 0x03, + MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE = 0x13, + MLX4_EVENT_TYPE_SRQ_LIMIT = 0x14, + MLX4_EVENT_TYPE_CQ_ERROR = 0x04, + MLX4_EVENT_TYPE_WQ_CATAS_ERROR = 0x05, + MLX4_EVENT_TYPE_EEC_CATAS_ERROR = 0x06, + MLX4_EVENT_TYPE_PATH_MIG_FAILED = 0x07, + MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, + MLX4_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, + MLX4_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR = 0x08, + MLX4_EVENT_TYPE_PORT_CHANGE = 0x09, + MLX4_EVENT_TYPE_EQ_OVERFLOW = 0x0f, + MLX4_EVENT_TYPE_ECC_DETECT = 0x0e, + MLX4_EVENT_TYPE_CMD = 0x0a, + MLX4_EVENT_TYPE_VEP_UPDATE = 0x19, + MLX4_EVENT_TYPE_COMM_CHANNEL = 0x18, + MLX4_EVENT_TYPE_OP_REQUIRED = 0x1a, + MLX4_EVENT_TYPE_FATAL_WARNING = 0x1b, + MLX4_EVENT_TYPE_FLR_EVENT = 0x1c, + MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT = 0x1d, + MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT = 0x3e, + MLX4_EVENT_TYPE_NONE = 0xff, +}; + +enum { + MLX4_PORT_CHANGE_SUBTYPE_DOWN = 1, + MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4 +}; + +enum { + MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_BAD_CABLE = 1, + MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_UNSUPPORTED_CABLE = 2, +}; + +enum { + MLX4_FATAL_WARNING_SUBTYPE_WARMING = 0, +}; + +enum slave_port_state { + SLAVE_PORT_DOWN = 0, + SLAVE_PENDING_UP, + SLAVE_PORT_UP, +}; + +enum slave_port_gen_event { + SLAVE_PORT_GEN_EVENT_DOWN = 0, + SLAVE_PORT_GEN_EVENT_UP, + SLAVE_PORT_GEN_EVENT_NONE, +}; + +enum slave_port_state_event { + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + MLX4_PORT_STATE_DEV_EVENT_PORT_UP, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, +}; + +enum { + MLX4_PERM_LOCAL_READ = 1 << 10, + MLX4_PERM_LOCAL_WRITE = 1 << 11, + MLX4_PERM_REMOTE_READ = 1 << 12, + MLX4_PERM_REMOTE_WRITE = 1 << 13, + MLX4_PERM_ATOMIC = 1 << 14, + MLX4_PERM_BIND_MW = 1 << 15, + MLX4_PERM_MASK = 0xFC00 +}; + +enum { + MLX4_OPCODE_NOP = 0x00, + MLX4_OPCODE_SEND_INVAL = 0x01, + MLX4_OPCODE_RDMA_WRITE = 0x08, + MLX4_OPCODE_RDMA_WRITE_IMM = 0x09, + MLX4_OPCODE_SEND = 0x0a, + MLX4_OPCODE_SEND_IMM = 0x0b, + MLX4_OPCODE_LSO = 0x0e, + MLX4_OPCODE_RDMA_READ = 0x10, + MLX4_OPCODE_ATOMIC_CS = 0x11, + MLX4_OPCODE_ATOMIC_FA = 0x12, + MLX4_OPCODE_MASKED_ATOMIC_CS = 0x14, + MLX4_OPCODE_MASKED_ATOMIC_FA = 0x15, + MLX4_OPCODE_BIND_MW = 0x18, + MLX4_OPCODE_FMR = 0x19, + MLX4_OPCODE_LOCAL_INVAL = 0x1b, + MLX4_OPCODE_CONFIG_CMD = 0x1f, + + MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + MLX4_RECV_OPCODE_SEND = 0x01, + MLX4_RECV_OPCODE_SEND_IMM = 0x02, 
+ MLX4_RECV_OPCODE_SEND_INVAL = 0x03, + + MLX4_CQE_OPCODE_ERROR = 0x1e, + MLX4_CQE_OPCODE_RESIZE = 0x16, +}; + +enum { + MLX4_STAT_RATE_OFFSET = 5 +}; + +enum mlx4_protocol { + MLX4_PROT_IB_IPV6 = 0, + MLX4_PROT_ETH, + MLX4_PROT_IB_IPV4, + MLX4_PROT_FCOE +}; + +enum { + MLX4_MTT_FLAG_PRESENT = 1 +}; + +enum mlx4_qp_region { + MLX4_QP_REGION_FW = 0, + MLX4_QP_REGION_RSS_RAW_ETH, + MLX4_QP_REGION_BOTTOM = MLX4_QP_REGION_RSS_RAW_ETH, + MLX4_QP_REGION_ETH_ADDR, + MLX4_QP_REGION_FC_ADDR, + MLX4_QP_REGION_FC_EXCH, + MLX4_NUM_QP_REGION +}; + +enum mlx4_port_type { + MLX4_PORT_TYPE_NONE = 0, + MLX4_PORT_TYPE_IB = 1, + MLX4_PORT_TYPE_ETH = 2, + MLX4_PORT_TYPE_AUTO = 3 +}; + +enum mlx4_special_vlan_idx { + MLX4_NO_VLAN_IDX = 0, + MLX4_VLAN_MISS_IDX, + MLX4_VLAN_REGULAR +}; + +enum mlx4_steer_type { + MLX4_MC_STEER = 0, + MLX4_UC_STEER, + MLX4_NUM_STEERS +}; + +enum mlx4_resource_usage { + MLX4_RES_USAGE_NONE, + MLX4_RES_USAGE_DRIVER, + MLX4_RES_USAGE_USER_VERBS, +}; + +enum { + MLX4_NUM_FEXCH = 64 * 1024, +}; + +enum { + MLX4_MAX_FAST_REG_PAGES = 511, +}; + +enum { + /* + * Max wqe size for rdma read is 512 bytes, so this + * limits our max_sge_rd as the wqe needs to fit: + * - ctrl segment (16 bytes) + * - rdma segment (16 bytes) + * - scatter elements (16 bytes each) + */ + MLX4_MAX_SGE_RD = (512 - 16 - 16) / 16 +}; + +enum { + MLX4_DEV_PMC_SUBTYPE_GUID_INFO = 0x14, + MLX4_DEV_PMC_SUBTYPE_PORT_INFO = 0x15, + MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE = 0x16, + MLX4_DEV_PMC_SUBTYPE_SL_TO_VL_MAP = 0x17, +}; + +/* Port mgmt change event handling */ +enum { + MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK = 1 << 0, + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK = 1 << 1, + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK = 1 << 2, + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK = 1 << 3, + MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4, +}; + +union sl2vl_tbl_to_u64 { + u8 sl8[8]; + u64 sl64; +}; + +enum { + MLX4_DEVICE_STATE_UP = 1 << 0, + MLX4_DEVICE_STATE_INTERNAL_ERROR = 1 << 1, +}; + +enum { + MLX4_INTERFACE_STATE_UP = 1 << 0, + MLX4_INTERFACE_STATE_DELETION = 1 << 1, + MLX4_INTERFACE_STATE_NOWAIT = 1 << 2, +}; + +#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ + MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) + +enum mlx4_module_id { + MLX4_MODULE_ID_SFP = 0x3, + MLX4_MODULE_ID_QSFP = 0xC, + MLX4_MODULE_ID_QSFP_PLUS = 0xD, + MLX4_MODULE_ID_QSFP28 = 0x11, +}; + +enum { /* rl */ + MLX4_QP_RATE_LIMIT_NONE = 0, + MLX4_QP_RATE_LIMIT_KBS = 1, + MLX4_QP_RATE_LIMIT_MBS = 2, + MLX4_QP_RATE_LIMIT_GBS = 3 +}; + +struct mlx4_rate_limit_caps { + u16 num_rates; /* Number of different rates */ + u8 min_unit; + u16 min_val; + u8 max_unit; + u16 max_val; +}; + +static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor) +{ + return (major << 32) | (minor << 16) | subminor; +} + +struct mlx4_phys_caps { + u32 gid_phys_table_len[MLX4_MAX_PORTS + 1]; + u32 pkey_phys_table_len[MLX4_MAX_PORTS + 1]; + u32 num_phys_eqs; + u32 base_sqpn; + u32 base_proxy_sqpn; + u32 base_tunnel_sqpn; +}; + +struct mlx4_spec_qps { + u32 qp0_qkey; + u32 qp0_proxy; + u32 qp0_tunnel; + u32 qp1_proxy; + u32 qp1_tunnel; +}; + +struct mlx4_caps { + u64 fw_ver; + u32 function; + int num_ports; + int vl_cap[MLX4_MAX_PORTS + 1]; + int ib_mtu_cap[MLX4_MAX_PORTS + 1]; + __be32 ib_port_def_cap[MLX4_MAX_PORTS + 1]; + u64 def_mac[MLX4_MAX_PORTS + 1]; + int eth_mtu_cap[MLX4_MAX_PORTS + 1]; + int gid_table_len[MLX4_MAX_PORTS + 1]; + int pkey_table_len[MLX4_MAX_PORTS + 1]; + int trans_type[MLX4_MAX_PORTS + 1]; + int vendor_oui[MLX4_MAX_PORTS + 1]; + int 
wavelength[MLX4_MAX_PORTS + 1]; + u64 trans_code[MLX4_MAX_PORTS + 1]; + int local_ca_ack_delay; + int num_uars; + u32 uar_page_size; + int bf_reg_size; + int bf_regs_per_page; + int max_sq_sg; + int max_rq_sg; + int num_qps; + int max_wqes; + int max_sq_desc_sz; + int max_rq_desc_sz; + int max_qp_init_rdma; + int max_qp_dest_rdma; + int max_tc_eth; + struct mlx4_spec_qps *spec_qps; + int num_srqs; + int max_srq_wqes; + int max_srq_sge; + int reserved_srqs; + int num_cqs; + int max_cqes; + int reserved_cqs; + int num_sys_eqs; + int num_eqs; + int reserved_eqs; + int num_comp_vectors; + int num_mpts; + int max_fmr_maps; + int num_mtts; + int fmr_reserved_mtts; + int reserved_mtts; + int reserved_mrws; + int reserved_uars; + int num_mgms; + int num_amgms; + int reserved_mcgs; + int num_qp_per_mgm; + int steering_mode; + int dmfs_high_steer_mode; + int fs_log_max_ucast_qp_range_size; + int num_pds; + int reserved_pds; + int max_xrcds; + int reserved_xrcds; + int mtt_entry_sz; + u32 max_msg_sz; + u32 page_size_cap; + u64 flags; + u64 flags2; + u32 bmme_flags; + u32 reserved_lkey; + u16 stat_rate_support; + u8 port_width_cap[MLX4_MAX_PORTS + 1]; + int max_gso_sz; + int max_rss_tbl_sz; + int reserved_qps_cnt[MLX4_NUM_QP_REGION]; + int reserved_qps; + int reserved_qps_base[MLX4_NUM_QP_REGION]; + int log_num_macs; + int log_num_vlans; + enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; + u8 supported_type[MLX4_MAX_PORTS + 1]; + u8 suggested_type[MLX4_MAX_PORTS + 1]; + u8 default_sense[MLX4_MAX_PORTS + 1]; + u32 port_mask[MLX4_MAX_PORTS + 1]; + enum mlx4_port_type possible_type[MLX4_MAX_PORTS + 1]; + u32 max_counters; + u8 port_ib_mtu[MLX4_MAX_PORTS + 1]; + u16 sqp_demux; + u32 eqe_size; + u32 cqe_size; + u8 eqe_factor; + u32 userspace_caps; /* userspace must be aware of these */ + u32 function_caps; /* VFs must be aware of these */ + u16 hca_core_clock; + u64 phys_port_id[MLX4_MAX_PORTS + 1]; + int tunnel_offload_mode; + u8 rx_checksum_flags_port[MLX4_MAX_PORTS + 1]; + u8 phv_bit[MLX4_MAX_PORTS + 1]; + u8 alloc_res_qp_mask; + u32 dmfs_high_rate_qpn_base; + u32 dmfs_high_rate_qpn_range; + u32 vf_caps; + bool wol_port[MLX4_MAX_PORTS + 1]; + struct mlx4_rate_limit_caps rl_caps; +}; + +struct mlx4_buf_list { + void *buf; + dma_addr_t map; +}; + +struct mlx4_buf { + struct mlx4_buf_list direct; + struct mlx4_buf_list *page_list; + int nbufs; + int npages; + int page_shift; +}; + +struct mlx4_mtt { + u32 offset; + int order; + int page_shift; +}; + +enum { + MLX4_DB_PER_PAGE = PAGE_SIZE / 4 +}; + +struct mlx4_db_pgdir { + struct list_head list; + DECLARE_BITMAP(order0, MLX4_DB_PER_PAGE); + DECLARE_BITMAP(order1, MLX4_DB_PER_PAGE / 2); + unsigned long *bits[2]; + __be32 *db_page; + dma_addr_t db_dma; +}; + +struct mlx4_ib_user_db_page; + +struct mlx4_db { + __be32 *db; + union { + struct mlx4_db_pgdir *pgdir; + struct mlx4_ib_user_db_page *user_page; + } u; + dma_addr_t dma; + int index; + int order; +}; + +struct mlx4_hwq_resources { + struct mlx4_db db; + struct mlx4_mtt mtt; + struct mlx4_buf buf; +}; + +struct mlx4_mr { + struct mlx4_mtt mtt; + u64 iova; + u64 size; + u32 key; + u32 pd; + u32 access; + int enabled; +}; + +enum mlx4_mw_type { + MLX4_MW_TYPE_1 = 1, + MLX4_MW_TYPE_2 = 2, +}; + +struct mlx4_mw { + u32 key; + u32 pd; + enum mlx4_mw_type type; + int enabled; +}; + +struct mlx4_fmr { + struct mlx4_mr mr; + struct mlx4_mpt_entry *mpt; + __be64 *mtts; + dma_addr_t dma_handle; + int max_pages; + int max_maps; + int maps; + u8 page_shift; +}; + +struct mlx4_uar { + unsigned long pfn; + int 
index; + struct list_head bf_list; + unsigned free_bf_bmap; + void __iomem *map; + void __iomem *bf_map; +}; + +struct mlx4_bf { + unsigned int offset; + int buf_size; + struct mlx4_uar *uar; + void __iomem *reg; +}; + +struct mlx4_cq { + void (*comp) (struct mlx4_cq *); + void (*event) (struct mlx4_cq *, enum mlx4_event); + + struct mlx4_uar *uar; + + u32 cons_index; + + u16 irq; + __be32 *set_ci_db; + __be32 *arm_db; + int arm_sn; + + int cqn; + unsigned vector; + + refcount_t refcount; + struct completion free; + struct { + struct list_head list; + void (*comp)(struct mlx4_cq *); + void *priv; + } tasklet_ctx; + int reset_notify_added; + struct list_head reset_notify; + u8 usage; +}; + +struct mlx4_qp { + void (*event) (struct mlx4_qp *, enum mlx4_event); + + int qpn; + + refcount_t refcount; + struct completion free; + u8 usage; +}; + +struct mlx4_srq { + void (*event) (struct mlx4_srq *, enum mlx4_event); + + int srqn; + int max; + int max_gs; + int wqe_shift; + + refcount_t refcount; + struct completion free; +}; + +struct mlx4_av { + __be32 port_pd; + u8 reserved1; + u8 g_slid; + __be16 dlid; + u8 reserved2; + u8 gid_index; + u8 stat_rate; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + u8 dgid[16]; +}; + +struct mlx4_eth_av { + __be32 port_pd; + u8 reserved1; + u8 smac_idx; + u16 reserved2; + u8 reserved3; + u8 gid_index; + u8 stat_rate; + u8 hop_limit; + __be32 sl_tclass_flowlabel; + u8 dgid[16]; + u8 s_mac[6]; + u8 reserved4[2]; + __be16 vlan; + u8 mac[ETH_ALEN]; +}; + +union mlx4_ext_av { + struct mlx4_av ib; + struct mlx4_eth_av eth; +}; + +/* Counters should be saturate once they reach their maximum value */ +#define ASSIGN_32BIT_COUNTER(counter, value) do { \ + if ((value) > U32_MAX) \ + counter = cpu_to_be32(U32_MAX); \ + else \ + counter = cpu_to_be32(value); \ +} while (0) + +struct mlx4_counter { + u8 reserved1[3]; + u8 counter_mode; + __be32 num_ifc; + u32 reserved2[2]; + __be64 rx_frames; + __be64 rx_bytes; + __be64 tx_frames; + __be64 tx_bytes; +}; + +struct mlx4_quotas { + int qp; + int cq; + int srq; + int mpt; + int mtt; + int counter; + int xrcd; +}; + +struct mlx4_vf_dev { + u8 min_port; + u8 n_ports; +}; + +enum mlx4_pci_status { + MLX4_PCI_STATUS_DISABLED, + MLX4_PCI_STATUS_ENABLED, +}; + +struct mlx4_dev_persistent { + struct pci_dev *pdev; + struct mlx4_dev *dev; + int nvfs[MLX4_MAX_PORTS + 1]; + int num_vfs; + enum mlx4_port_type curr_port_type[MLX4_MAX_PORTS + 1]; + enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; + struct work_struct catas_work; + struct workqueue_struct *catas_wq; + struct mutex device_state_mutex; /* protect HW state */ + u8 state; + struct mutex interface_state_mutex; /* protect SW state */ + u8 interface_state; + struct mutex pci_status_mutex; /* sync pci state */ + enum mlx4_pci_status pci_status; +}; + +struct mlx4_dev { + struct mlx4_dev_persistent *persist; + unsigned long flags; + unsigned long num_slaves; + struct mlx4_caps caps; + struct mlx4_phys_caps phys_caps; + struct mlx4_quotas quotas; + struct radix_tree_root qp_table_tree; + u8 rev_id; + u8 port_random_macs; + char board_id[MLX4_BOARD_ID_LEN]; + int numa_node; + int oper_log_mgm_entry_size; + u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; + u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; + struct mlx4_vf_dev *dev_vfs; + u8 uar_page_shift; +}; + +struct mlx4_clock_params { + u64 offset; + u8 bar; + u8 size; +}; + +struct mlx4_eqe { + u8 reserved1; + u8 type; + u8 reserved2; + u8 subtype; + union { + u32 raw[6]; + struct { + __be32 cqn; + } __packed comp; + struct { 
+ u16 reserved1; + __be16 token; + u32 reserved2; + u8 reserved3[3]; + u8 status; + __be64 out_param; + } __packed cmd; + struct { + __be32 qpn; + } __packed qp; + struct { + __be32 srqn; + } __packed srq; + struct { + __be32 cqn; + u32 reserved1; + u8 reserved2[3]; + u8 syndrome; + } __packed cq_err; + struct { + u32 reserved1[2]; + __be32 port; + } __packed port_change; + struct { + #define COMM_CHANNEL_BIT_ARRAY_SIZE 4 + u32 reserved; + u32 bit_vec[COMM_CHANNEL_BIT_ARRAY_SIZE]; + } __packed comm_channel_arm; + struct { + u8 port; + u8 reserved[3]; + __be64 mac; + } __packed mac_update; + struct { + __be32 slave_id; + } __packed flr_event; + struct { + __be16 current_temperature; + __be16 warning_threshold; + } __packed warming; + struct { + u8 reserved[3]; + u8 port; + union { + struct { + __be16 mstr_sm_lid; + __be16 port_lid; + __be32 changed_attr; + u8 reserved[3]; + u8 mstr_sm_sl; + __be64 gid_prefix; + } __packed port_info; + struct { + __be32 block_ptr; + __be32 tbl_entries_mask; + } __packed tbl_change_info; + struct { + u8 sl2vl_table[8]; + } __packed sl2vl_tbl_change_info; + } params; + } __packed port_mgmt_change; + struct { + u8 reserved[3]; + u8 port; + u32 reserved1[5]; + } __packed bad_cable; + } event; + u8 slave_id; + u8 reserved3[2]; + u8 owner; +} __packed; + +struct mlx4_init_port_param { + int set_guid0; + int set_node_guid; + int set_si_guid; + u16 mtu; + int port_width_cap; + u16 vl_cap; + u16 max_gid; + u16 max_pkey; + u64 guid0; + u64 node_guid; + u64 si_guid; +}; + +#define MAD_IFC_DATA_SZ 192 +/* MAD IFC Mailbox */ +struct mlx4_mad_ifc { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + __be16 class_specific; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 mkey; + __be16 dr_slid; + __be16 dr_dlid; + u8 reserved[28]; + u8 data[MAD_IFC_DATA_SZ]; +} __packed; + +#define mlx4_foreach_port(port, dev, type) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + if ((type) == (dev)->caps.port_mask[(port)]) + +#define mlx4_foreach_ib_transport_port(port, dev) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ + ((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_ETH)) + +#define MLX4_INVALID_SLAVE_ID 0xFF +#define MLX4_SINK_COUNTER_INDEX(dev) (dev->caps.max_counters - 1) + +void handle_port_mgmt_change_event(struct work_struct *work); + +static inline int mlx4_master_func_num(struct mlx4_dev *dev) +{ + return dev->caps.function; +} + +static inline int mlx4_is_master(struct mlx4_dev *dev) +{ + return dev->flags & MLX4_FLAG_MASTER; +} + +static inline int mlx4_num_reserved_sqps(struct mlx4_dev *dev) +{ + return dev->phys_caps.base_sqpn + 8 + + 16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev); +} + +static inline int mlx4_is_qp_reserved(struct mlx4_dev *dev, u32 qpn) +{ + return (qpn < dev->phys_caps.base_sqpn + 8 + + 16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev) && + qpn >= dev->phys_caps.base_sqpn) || + (qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]); +} + +static inline int mlx4_is_guest_proxy(struct mlx4_dev *dev, int slave, u32 qpn) +{ + int guest_proxy_base = dev->phys_caps.base_proxy_sqpn + slave * 8; + + if (qpn >= guest_proxy_base && qpn < guest_proxy_base + 8) + return 1; + + return 0; +} + +static inline int mlx4_is_mfunc(struct mlx4_dev *dev) +{ + return dev->flags & (MLX4_FLAG_SLAVE | MLX4_FLAG_MASTER); +} + +static inline int mlx4_is_slave(struct mlx4_dev *dev) +{ + return dev->flags & MLX4_FLAG_SLAVE; +} + 
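/*
 * Illustrative sketch only, not part of the v4.17 import: it shows how the
 * multi-function helpers declared above are typically combined by a caller
 * that must avoid firmware-owned or proxy QP numbers.  The function name is
 * hypothetical; mlx4_is_qp_reserved(), mlx4_is_mfunc(), mlx4_is_master(),
 * mlx4_is_guest_proxy() and MLX4_MFUNC_MAX are the ones from this header.
 */
static inline bool example_qpn_is_off_limits(struct mlx4_dev *dev, u32 qpn)
{
	int slave;

	/* The firmware region and the special QP window are always reserved. */
	if (mlx4_is_qp_reserved(dev, qpn))
		return true;

	/* On a multi-function master, each slave also owns a proxy QP0/QP1
	 * range that general consumers must not touch.
	 */
	if (mlx4_is_mfunc(dev) && mlx4_is_master(dev))
		for (slave = 0; slave < MLX4_MFUNC_MAX; slave++)
			if (mlx4_is_guest_proxy(dev, slave, qpn))
				return true;

	return false;
}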
+static inline int mlx4_is_eth(struct mlx4_dev *dev, int port) +{ + return dev->caps.port_type[port] == MLX4_PORT_TYPE_IB ? 0 : 1; +} + +int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, + struct mlx4_buf *buf); +void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); +static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) +{ + if (buf->nbufs == 1) + return buf->direct.buf + offset; + else + return buf->page_list[offset >> PAGE_SHIFT].buf + + (offset & (PAGE_SIZE - 1)); +} + +int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn); +void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn); +int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn); +void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); + +int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar); +void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar); +int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node); +void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf); + +int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, + struct mlx4_mtt *mtt); +void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt); +u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt); + +int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, + int npages, int page_shift, struct mlx4_mr *mr); +int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr); +int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr); +int mlx4_mw_alloc(struct mlx4_dev *dev, u32 pd, enum mlx4_mw_type type, + struct mlx4_mw *mw); +void mlx4_mw_free(struct mlx4_dev *dev, struct mlx4_mw *mw); +int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw); +int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list); +int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + struct mlx4_buf *buf); + +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order); +void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db); + +int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, + int size); +void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres, + int size); + +int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, + struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq, + unsigned vector, int collapsed, int timestamp_en); +void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); +int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, + int *base, u8 flags, u8 usage); +void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); + +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); +void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp); + +int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcdn, + struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq); +void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq); +int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark); +int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark); + +int mlx4_INIT_PORT(struct mlx4_dev *dev, int port); +int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port); + +int mlx4_unicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot); +int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot); +int 
mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + u8 port, int block_mcast_loopback, + enum mlx4_protocol protocol, u64 *reg_id); +int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol protocol, u64 reg_id); + +enum { + MLX4_DOMAIN_UVERBS = 0x1000, + MLX4_DOMAIN_ETHTOOL = 0x2000, + MLX4_DOMAIN_RFS = 0x3000, + MLX4_DOMAIN_NIC = 0x5000, +}; + +enum mlx4_net_trans_rule_id { + MLX4_NET_TRANS_RULE_ID_ETH = 0, + MLX4_NET_TRANS_RULE_ID_IB, + MLX4_NET_TRANS_RULE_ID_IPV6, + MLX4_NET_TRANS_RULE_ID_IPV4, + MLX4_NET_TRANS_RULE_ID_TCP, + MLX4_NET_TRANS_RULE_ID_UDP, + MLX4_NET_TRANS_RULE_ID_VXLAN, + MLX4_NET_TRANS_RULE_NUM, /* should be last */ +}; + +extern const u16 __sw_id_hw[]; + +static inline int map_hw_to_sw_id(u16 header_id) +{ + + int i; + for (i = 0; i < MLX4_NET_TRANS_RULE_NUM; i++) { + if (header_id == __sw_id_hw[i]) + return i; + } + return -EINVAL; +} + +enum mlx4_net_trans_promisc_mode { + MLX4_FS_REGULAR = 1, + MLX4_FS_ALL_DEFAULT, + MLX4_FS_MC_DEFAULT, + MLX4_FS_MIRROR_RX_PORT, + MLX4_FS_MIRROR_SX_PORT, + MLX4_FS_UC_SNIFFER, + MLX4_FS_MC_SNIFFER, + MLX4_FS_MODE_NUM, /* should be last */ +}; + +struct mlx4_spec_eth { + u8 dst_mac[ETH_ALEN]; + u8 dst_mac_msk[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + u8 src_mac_msk[ETH_ALEN]; + u8 ether_type_enable; + __be16 ether_type; + __be16 vlan_id_msk; + __be16 vlan_id; +}; + +struct mlx4_spec_tcp_udp { + __be16 dst_port; + __be16 dst_port_msk; + __be16 src_port; + __be16 src_port_msk; +}; + +struct mlx4_spec_ipv4 { + __be32 dst_ip; + __be32 dst_ip_msk; + __be32 src_ip; + __be32 src_ip_msk; +}; + +struct mlx4_spec_ib { + __be32 l3_qpn; + __be32 qpn_msk; + u8 dst_gid[16]; + u8 dst_gid_msk[16]; +}; + +struct mlx4_spec_vxlan { + __be32 vni; + __be32 vni_mask; + +}; + +struct mlx4_spec_list { + struct list_head list; + enum mlx4_net_trans_rule_id id; + union { + struct mlx4_spec_eth eth; + struct mlx4_spec_ib ib; + struct mlx4_spec_ipv4 ipv4; + struct mlx4_spec_tcp_udp tcp_udp; + struct mlx4_spec_vxlan vxlan; + }; +}; + +enum mlx4_net_trans_hw_rule_queue { + MLX4_NET_TRANS_Q_FIFO, + MLX4_NET_TRANS_Q_LIFO, +}; + +struct mlx4_net_trans_rule { + struct list_head list; + enum mlx4_net_trans_hw_rule_queue queue_mode; + bool exclusive; + bool allow_loopback; + enum mlx4_net_trans_promisc_mode promisc_mode; + u8 port; + u16 priority; + u32 qpn; +}; + +struct mlx4_net_trans_rule_hw_ctrl { + __be16 prio; + u8 type; + u8 flags; + u8 rsvd1; + u8 funcid; + u8 vep; + u8 port; + __be32 qpn; + __be32 rsvd2; +}; + +struct mlx4_net_trans_rule_hw_ib { + u8 size; + u8 rsvd1; + __be16 id; + u32 rsvd2; + __be32 l3_qpn; + __be32 qpn_mask; + u8 dst_gid[16]; + u8 dst_gid_msk[16]; +} __packed; + +struct mlx4_net_trans_rule_hw_eth { + u8 size; + u8 rsvd; + __be16 id; + u8 rsvd1[6]; + u8 dst_mac[6]; + u16 rsvd2; + u8 dst_mac_msk[6]; + u16 rsvd3; + u8 src_mac[6]; + u16 rsvd4; + u8 src_mac_msk[6]; + u8 rsvd5; + u8 ether_type_enable; + __be16 ether_type; + __be16 vlan_tag_msk; + __be16 vlan_tag; +} __packed; + +struct mlx4_net_trans_rule_hw_tcp_udp { + u8 size; + u8 rsvd; + __be16 id; + __be16 rsvd1[3]; + __be16 dst_port; + __be16 rsvd2; + __be16 dst_port_msk; + __be16 rsvd3; + __be16 src_port; + __be16 rsvd4; + __be16 src_port_msk; +} __packed; + +struct mlx4_net_trans_rule_hw_ipv4 { + u8 size; + u8 rsvd; + __be16 id; + __be32 rsvd1; + __be32 dst_ip; + __be32 dst_ip_msk; + __be32 src_ip; + __be32 src_ip_msk; +} __packed; + +struct mlx4_net_trans_rule_hw_vxlan { + u8 size; + u8 rsvd; + __be16 id; + __be32 rsvd1; + 
__be32 vni; + __be32 vni_mask; +} __packed; + +struct _rule_hw { + union { + struct { + u8 size; + u8 rsvd; + __be16 id; + }; + struct mlx4_net_trans_rule_hw_eth eth; + struct mlx4_net_trans_rule_hw_ib ib; + struct mlx4_net_trans_rule_hw_ipv4 ipv4; + struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp; + struct mlx4_net_trans_rule_hw_vxlan vxlan; + }; +}; + +enum { + VXLAN_STEER_BY_OUTER_MAC = 1 << 0, + VXLAN_STEER_BY_OUTER_VLAN = 1 << 1, + VXLAN_STEER_BY_VSID_VNI = 1 << 2, + VXLAN_STEER_BY_INNER_MAC = 1 << 3, + VXLAN_STEER_BY_INNER_VLAN = 1 << 4, +}; + +enum { + MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS = 0x2, +}; + +int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn, + enum mlx4_net_trans_promisc_mode mode); +int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port, + enum mlx4_net_trans_promisc_mode mode); +int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); + +int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac); +void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac); +int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port); +int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac); +int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, + u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx); +int mlx4_SET_PORT_user_mac(struct mlx4_dev *dev, u8 port, u8 *user_mac); +int mlx4_SET_PORT_user_mtu(struct mlx4_dev *dev, u8 port, u16 user_mtu); +int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, + u8 promisc); +int mlx4_SET_PORT_BEACON(struct mlx4_dev *dev, u8 port, u16 time); +int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, + u8 ignore_fcs_value); +int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable); +int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val); +int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv); +int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, + bool *vlan_offload_disabled); +void mlx4_handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl *ctrl, + struct _rule_hw *eth_header); +int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx); +int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); +int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); +void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); + +int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova, u32 *lkey, u32 *rkey); +int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, + int max_maps, u8 page_shift, struct mlx4_fmr *fmr); +int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr); +void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, + u32 *lkey, u32 *rkey); +int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); +int mlx4_SYNC_TPT(struct mlx4_dev *dev); +int mlx4_test_interrupt(struct mlx4_dev *dev, int vector); +int mlx4_test_async(struct mlx4_dev *dev); +int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier, + const u32 offset[], u32 value[], + size_t array_len, u8 port); +u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port); +bool 
mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector); +struct cpu_rmap *mlx4_get_cpu_rmap(struct mlx4_dev *dev, int port); +int mlx4_assign_eq(struct mlx4_dev *dev, u8 port, int *vector); +void mlx4_release_eq(struct mlx4_dev *dev, int vec); + +int mlx4_is_eq_shared(struct mlx4_dev *dev, int vector); +int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec); + +int mlx4_get_phys_port_id(struct mlx4_dev *dev); +int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port); +int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); + +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx, u8 usage); +void mlx4_counter_free(struct mlx4_dev *dev, u32 idx); +int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port); + +void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry, + int port); +__be64 mlx4_get_admin_guid(struct mlx4_dev *dev, int entry, int port); +void mlx4_set_random_admin_guid(struct mlx4_dev *dev, int entry, int port); +int mlx4_flow_attach(struct mlx4_dev *dev, + struct mlx4_net_trans_rule *rule, u64 *reg_id); +int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id); +int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev, + enum mlx4_net_trans_promisc_mode flow_type); +int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev, + enum mlx4_net_trans_rule_id id); +int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id); + +int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr, + int port, int qpn, u16 prio, u64 *reg_id); + +void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, + int i, int val); + +int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey); + +int mlx4_is_slave_active(struct mlx4_dev *dev, int slave); +int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port); +int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port); +int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr); +int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, u8 port_subtype_change); +enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port); +int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, int event, enum slave_port_gen_event *gen_event); + +void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid); +__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave); + +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, + int *slave_id); +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, + u8 *gid); + +int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, + u32 max_range_qpn); + +u64 mlx4_read_clock(struct mlx4_dev *dev); + +struct mlx4_active_ports { + DECLARE_BITMAP(ports, MLX4_MAX_PORTS); +}; +/* Returns a bitmap of the physical ports which are assigned to slave */ +struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave); + +/* Returns the physical port that represents the virtual port of the slave, */ +/* or a value < 0 in case of an error. If a slave has 2 ports, the identity */ +/* mapping is returned. */ +int mlx4_slave_convert_port(struct mlx4_dev *dev, int slave, int port); + +struct mlx4_slaves_pport { + DECLARE_BITMAP(slaves, MLX4_MFUNC_MAX); +}; +/* Returns a bitmap of all slaves that are assigned to port. 
*/ +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev, + int port); + +/* Returns a bitmap of all slaves that are assigned exactly to all the */ +/* the ports that are set in crit_ports. */ +struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv( + struct mlx4_dev *dev, + const struct mlx4_active_ports *crit_ports); + +/* Returns the slave's virtual port that represents the physical port. */ +int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port); + +int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); + +int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); +int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis); +int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port); +int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2); +int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); +int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); +int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, + int enable); +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry ***mpt_entry); +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry **mpt_entry); +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry, + u32 pdn); +int mlx4_mr_hw_change_access(struct mlx4_dev *dev, + struct mlx4_mpt_entry *mpt_entry, + u32 access); +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev, + struct mlx4_mpt_entry **mpt_entry); +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr); +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, + u64 iova, u64 size, int npages, + int page_shift, struct mlx4_mpt_entry *mpt_entry); + +int mlx4_get_module_info(struct mlx4_dev *dev, u8 port, + u16 offset, u16 size, u8 *data); +int mlx4_max_tc(struct mlx4_dev *dev); + +/* Returns true if running in low memory profile (kdump kernel) */ +static inline bool mlx4_low_memory_profile(void) +{ + return is_kdump_kernel(); +} + +/* ACCESS REG commands */ +enum mlx4_access_reg_method { + MLX4_ACCESS_REG_QUERY = 0x1, + MLX4_ACCESS_REG_WRITE = 0x2, +}; + +/* ACCESS PTYS Reg command */ +enum mlx4_ptys_proto { + MLX4_PTYS_IB = 1<<0, + MLX4_PTYS_EN = 1<<2, +}; + +enum mlx4_ptys_flags { + MLX4_PTYS_AN_DISABLE_CAP = 1 << 5, + MLX4_PTYS_AN_DISABLE_ADMIN = 1 << 6, +}; + +struct mlx4_ptys_reg { + u8 flags; + u8 local_port; + u8 resrvd2; + u8 proto_mask; + __be32 resrvd3[2]; + __be32 eth_proto_cap; + __be16 ib_width_cap; + __be16 ib_speed_cap; + __be32 resrvd4; + __be32 eth_proto_admin; + __be16 ib_width_admin; + __be16 ib_speed_admin; + __be32 resrvd5; + __be32 eth_proto_oper; + __be16 ib_width_oper; + __be16 ib_speed_oper; + __be32 resrvd6; + __be32 eth_proto_lp_adv; +} __packed; + +int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev, + enum mlx4_access_reg_method method, + struct mlx4_ptys_reg *ptys_reg); + +int mlx4_get_internal_clock_params(struct mlx4_dev *dev, + struct mlx4_clock_params *params); + +static inline int mlx4_to_hw_uar_index(struct mlx4_dev *dev, int index) +{ + return (index << (PAGE_SHIFT - dev->uar_page_shift)); +} + +static inline int mlx4_get_num_reserved_uar(struct mlx4_dev *dev) +{ + /* The first 128 UARs are used for EQ doorbells */ + return (128 >> (PAGE_SHIFT - dev->uar_page_shift)); +} +#endif /* MLX4_DEVICE_H */ diff --git a/include/linux/mlx4/doorbell.h b/include/linux/mlx4/doorbell.h new file mode 100644 index 0000000..f31bba2 --- /dev/null 
+++ b/include/linux/mlx4/doorbell.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_DOORBELL_H +#define MLX4_DOORBELL_H + +#include +#include + +#define MLX4_SEND_DOORBELL 0x14 +#define MLX4_CQ_DOORBELL 0x20 + +#if BITS_PER_LONG == 64 +/* + * Assume that we can just write a 64-bit doorbell atomically. s390 + * actually doesn't have writeq() but S/390 systems don't even have + * PCI so we won't worry about it. + */ + +#define MLX4_DECLARE_DOORBELL_LOCK(name) +#define MLX4_INIT_DOORBELL_LOCK(ptr) do { } while (0) +#define MLX4_GET_DOORBELL_LOCK(ptr) (NULL) + +static inline void mlx4_write64(__be32 val[2], void __iomem *dest, + spinlock_t *doorbell_lock) +{ + __raw_writeq(*(u64 *) val, dest); +} + +#else + +/* + * Just fall back to a spinlock to protect the doorbell if + * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit + * MMIO writes. + */ + +#define MLX4_DECLARE_DOORBELL_LOCK(name) spinlock_t name; +#define MLX4_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr) +#define MLX4_GET_DOORBELL_LOCK(ptr) (ptr) + +static inline void mlx4_write64(__be32 val[2], void __iomem *dest, + spinlock_t *doorbell_lock) +{ + unsigned long flags; + + spin_lock_irqsave(doorbell_lock, flags); + __raw_writel((__force u32) val[0], dest); + __raw_writel((__force u32) val[1], dest + 4); + spin_unlock_irqrestore(doorbell_lock, flags); +} + +#endif + +#endif /* MLX4_DOORBELL_H */ diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h new file mode 100644 index 0000000..a858bcb --- /dev/null +++ b/include/linux/mlx4/driver.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_DRIVER_H +#define MLX4_DRIVER_H + +#include +#include + +struct mlx4_dev; + +#define MLX4_MAC_MASK 0xffffffffffffULL + +enum mlx4_dev_event { + MLX4_DEV_EVENT_CATASTROPHIC_ERROR, + MLX4_DEV_EVENT_PORT_UP, + MLX4_DEV_EVENT_PORT_DOWN, + MLX4_DEV_EVENT_PORT_REINIT, + MLX4_DEV_EVENT_PORT_MGMT_CHANGE, + MLX4_DEV_EVENT_SLAVE_INIT, + MLX4_DEV_EVENT_SLAVE_SHUTDOWN, +}; + +enum { + MLX4_INTFF_BONDING = 1 << 0 +}; + +struct mlx4_interface { + void * (*add) (struct mlx4_dev *dev); + void (*remove)(struct mlx4_dev *dev, void *context); + void (*event) (struct mlx4_dev *dev, void *context, + enum mlx4_dev_event event, unsigned long param); + void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port); + void (*activate)(struct mlx4_dev *dev, void *context); + struct list_head list; + enum mlx4_protocol protocol; + int flags; +}; + +int mlx4_register_interface(struct mlx4_interface *intf); +void mlx4_unregister_interface(struct mlx4_interface *intf); + +int mlx4_bond(struct mlx4_dev *dev); +int mlx4_unbond(struct mlx4_dev *dev); +static inline int mlx4_is_bonded(struct mlx4_dev *dev) +{ + return !!(dev->flags & MLX4_FLAG_BONDED); +} + +static inline int mlx4_is_mf_bonded(struct mlx4_dev *dev) +{ + return (mlx4_is_bonded(dev) && mlx4_is_mfunc(dev)); +} + +struct mlx4_port_map { + u8 port1; + u8 port2; +}; + +int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p); + +void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port); + +struct devlink_port *mlx4_get_devlink_port(struct mlx4_dev *dev, int port); + +static inline u64 mlx4_mac_to_u64(u8 *addr) +{ + u64 mac = 0; + int i; + + for (i = 0; i < ETH_ALEN; i++) { + mac <<= 8; + mac |= addr[i]; + } + return mac; +} + +static inline void mlx4_u64_to_mac(u8 *addr, u64 mac) +{ + int i; + + for (i = ETH_ALEN; i > 0; i--) { + addr[i - 1] = mac & 0xFF; + mac >>= 8; + } +} + +#endif /* MLX4_DRIVER_H */ diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h new file mode 100644 index 0000000..8e2828d --- /dev/null +++ b/include/linux/mlx4/qp.h @@ -0,0 +1,506 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_QP_H +#define MLX4_QP_H + +#include +#include + +#include + +#define MLX4_INVALID_LKEY 0x100 + +enum mlx4_qp_optpar { + MLX4_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, + MLX4_QP_OPTPAR_RRE = 1 << 1, + MLX4_QP_OPTPAR_RAE = 1 << 2, + MLX4_QP_OPTPAR_RWE = 1 << 3, + MLX4_QP_OPTPAR_PKEY_INDEX = 1 << 4, + MLX4_QP_OPTPAR_Q_KEY = 1 << 5, + MLX4_QP_OPTPAR_RNR_TIMEOUT = 1 << 6, + MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7, + MLX4_QP_OPTPAR_SRA_MAX = 1 << 8, + MLX4_QP_OPTPAR_RRA_MAX = 1 << 9, + MLX4_QP_OPTPAR_PM_STATE = 1 << 10, + MLX4_QP_OPTPAR_RETRY_COUNT = 1 << 12, + MLX4_QP_OPTPAR_RNR_RETRY = 1 << 13, + MLX4_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, + MLX4_QP_OPTPAR_SCHED_QUEUE = 1 << 16, + MLX4_QP_OPTPAR_COUNTER_INDEX = 1 << 20, + MLX4_QP_OPTPAR_VLAN_STRIPPING = 1 << 21, +}; + +enum mlx4_qp_state { + MLX4_QP_STATE_RST = 0, + MLX4_QP_STATE_INIT = 1, + MLX4_QP_STATE_RTR = 2, + MLX4_QP_STATE_RTS = 3, + MLX4_QP_STATE_SQER = 4, + MLX4_QP_STATE_SQD = 5, + MLX4_QP_STATE_ERR = 6, + MLX4_QP_STATE_SQ_DRAINING = 7, + MLX4_QP_NUM_STATE +}; + +enum { + MLX4_QP_ST_RC = 0x0, + MLX4_QP_ST_UC = 0x1, + MLX4_QP_ST_RD = 0x2, + MLX4_QP_ST_UD = 0x3, + MLX4_QP_ST_XRC = 0x6, + MLX4_QP_ST_MLX = 0x7 +}; + +enum { + MLX4_QP_PM_MIGRATED = 0x3, + MLX4_QP_PM_ARMED = 0x0, + MLX4_QP_PM_REARM = 0x1 +}; + +enum { + /* params1 */ + MLX4_QP_BIT_SRE = 1 << 15, + MLX4_QP_BIT_SWE = 1 << 14, + MLX4_QP_BIT_SAE = 1 << 13, + /* params2 */ + MLX4_QP_BIT_RRE = 1 << 15, + MLX4_QP_BIT_RWE = 1 << 14, + MLX4_QP_BIT_RAE = 1 << 13, + MLX4_QP_BIT_FPP = 1 << 3, + MLX4_QP_BIT_RIC = 1 << 4, +}; + +enum { + MLX4_RSS_HASH_XOR = 0, + MLX4_RSS_HASH_TOP = 1, + + MLX4_RSS_UDP_IPV6 = 1 << 0, + MLX4_RSS_UDP_IPV4 = 1 << 1, + MLX4_RSS_TCP_IPV6 = 1 << 2, + MLX4_RSS_IPV6 = 1 << 3, + MLX4_RSS_TCP_IPV4 = 1 << 4, + MLX4_RSS_IPV4 = 1 << 5, + + MLX4_RSS_BY_OUTER_HEADERS = 0 << 6, + MLX4_RSS_BY_INNER_HEADERS = 2 << 6, + MLX4_RSS_BY_INNER_HEADERS_IPONLY = 3 << 6, + + /* offset of mlx4_rss_context within mlx4_qp_context.pri_path */ + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH = 0x24, + /* offset of being RSS indirection QP within mlx4_qp_context.flags */ + MLX4_RSS_QPC_FLAG_OFFSET = 13, +}; + +#define MLX4_EN_RSS_KEY_SIZE 40 + +struct mlx4_rss_context { + __be32 base_qpn; + __be32 default_qpn; + u16 reserved; + u8 hash_fn; + u8 flags; + __be32 
rss_key[MLX4_EN_RSS_KEY_SIZE / sizeof(__be32)]; + __be32 base_qpn_udp; +}; + +struct mlx4_qp_path { + u8 fl; + union { + u8 vlan_control; + u8 control; + }; + u8 disable_pkey_check; + u8 pkey_index; + u8 counter_index; + u8 grh_mylmc; + __be16 rlid; + u8 ackto; + u8 mgid_index; + u8 static_rate; + u8 hop_limit; + __be32 tclass_flowlabel; + u8 rgid[16]; + u8 sched_queue; + u8 vlan_index; + u8 feup; + u8 fvl_rx; + u8 reserved4[2]; + u8 dmac[ETH_ALEN]; +}; + +enum { /* fl */ + MLX4_FL_CV = 1 << 6, + MLX4_FL_SV = 1 << 5, + MLX4_FL_ETH_HIDE_CQE_VLAN = 1 << 2, + MLX4_FL_ETH_SRC_CHECK_MC_LB = 1 << 1, + MLX4_FL_ETH_SRC_CHECK_UC_LB = 1 << 0, +}; + +enum { /* control */ + MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER = 1 << 7, +}; + +enum { /* vlan_control */ + MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED = 1 << 6, + MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED = 1 << 5, /* 802.1p priority tag */ + MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED = 1 << 4, + MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED = 1 << 2, + MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED = 1 << 1, /* 802.1p priority tag */ + MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED = 1 << 0 +}; + +enum { /* feup */ + MLX4_FEUP_FORCE_ETH_UP = 1 << 6, /* force Eth UP */ + MLX4_FSM_FORCE_ETH_SRC_MAC = 1 << 5, /* force Source MAC */ + MLX4_FVL_FORCE_ETH_VLAN = 1 << 3 /* force Eth vlan */ +}; + +enum { /* fvl_rx */ + MLX4_FVL_RX_FORCE_ETH_VLAN = 1 << 0 /* enforce Eth rx vlan */ +}; + +struct mlx4_qp_context { + __be32 flags; + __be32 pd; + u8 mtu_msgmax; + u8 rq_size_stride; + u8 sq_size_stride; + u8 rlkey_roce_mode; + __be32 usr_page; + __be32 local_qpn; + __be32 remote_qpn; + struct mlx4_qp_path pri_path; + struct mlx4_qp_path alt_path; + __be32 params1; + u32 reserved1; + __be32 next_send_psn; + __be32 cqn_send; + __be16 roce_entropy; + __be16 reserved2[3]; + __be32 last_acked_psn; + __be32 ssn; + __be32 params2; + __be32 rnr_nextrecvpsn; + __be32 xrcd; + __be32 cqn_recv; + __be64 db_rec_addr; + __be32 qkey; + __be32 srqn; + __be32 msn; + __be16 rq_wqe_counter; + __be16 sq_wqe_counter; + u32 reserved3; + __be16 rate_limit_params; + u8 reserved4; + u8 qos_vport; + __be32 param3; + __be32 nummmcpeers_basemkey; + u8 log_page_size; + u8 reserved5[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + u32 reserved6[10]; +}; + +struct mlx4_update_qp_context { + __be64 qp_mask; + __be64 primary_addr_path_mask; + __be64 secondary_addr_path_mask; + u64 reserved1; + struct mlx4_qp_context qp_context; + u64 reserved2[58]; +}; + +enum { + MLX4_UPD_QP_MASK_PM_STATE = 32, + MLX4_UPD_QP_MASK_VSD = 33, + MLX4_UPD_QP_MASK_QOS_VPP = 34, + MLX4_UPD_QP_MASK_RATE_LIMIT = 35, +}; + +enum { + MLX4_UPD_QP_PATH_MASK_PKEY_INDEX = 0 + 32, + MLX4_UPD_QP_PATH_MASK_FSM = 1 + 32, + MLX4_UPD_QP_PATH_MASK_MAC_INDEX = 2 + 32, + MLX4_UPD_QP_PATH_MASK_FVL = 3 + 32, + MLX4_UPD_QP_PATH_MASK_CV = 4 + 32, + MLX4_UPD_QP_PATH_MASK_VLAN_INDEX = 5 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN = 6 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_UNTAGGED = 7 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_1P = 8 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_TAGGED = 9 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_UNTAGGED = 10 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_1P = 11 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_TAGGED = 12 + 32, + MLX4_UPD_QP_PATH_MASK_FEUP = 13 + 32, + MLX4_UPD_QP_PATH_MASK_SCHED_QUEUE = 14 + 32, + MLX4_UPD_QP_PATH_MASK_IF_COUNTER_INDEX = 15 + 32, + MLX4_UPD_QP_PATH_MASK_FVL_RX = 16 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_UC_LB = 18 + 32, + MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB = 19 + 32, + MLX4_UPD_QP_PATH_MASK_SV = 22 + 32, +}; 
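/*
 * Illustrative sketch only, not part of the v4.17 import: the
 * MLX4_UPD_QP_PATH_MASK_* values above are bit positions rather than
 * ready-made masks, so a caller builds the 64-bit primary_addr_path_mask
 * of struct mlx4_update_qp_context by shifting.  The helper name is
 * hypothetical; the enum values and the context field are the ones
 * defined in this header.
 */
static inline void example_update_qp_path_mask(struct mlx4_update_qp_context *ctx)
{
	u64 mask = (1ULL << MLX4_UPD_QP_PATH_MASK_FSM) |
		   (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB);

	/* The context field is big-endian on the wire. */
	ctx->primary_addr_path_mask = cpu_to_be64(mask);
}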
+ +enum { /* param3 */ + MLX4_STRIP_VLAN = 1 << 30 +}; + +/* Which firmware version adds support for NEC (NoErrorCompletion) bit */ +#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232) + +enum { + MLX4_WQE_CTRL_NEC = 1 << 29, + MLX4_WQE_CTRL_IIP = 1 << 28, + MLX4_WQE_CTRL_ILP = 1 << 27, + MLX4_WQE_CTRL_FENCE = 1 << 6, + MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2, + MLX4_WQE_CTRL_SOLICITED = 1 << 1, + MLX4_WQE_CTRL_IP_CSUM = 1 << 4, + MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5, + MLX4_WQE_CTRL_INS_CVLAN = 1 << 6, + MLX4_WQE_CTRL_INS_SVLAN = 1 << 7, + MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7, + MLX4_WQE_CTRL_FORCE_LOOPBACK = 1 << 0, +}; + +union mlx4_wqe_qpn_vlan { + struct { + __be16 vlan_tag; + u8 ins_vlan; + u8 fence_size; + }; + __be32 bf_qpn; +}; + +struct mlx4_wqe_ctrl_seg { + __be32 owner_opcode; + union mlx4_wqe_qpn_vlan qpn_vlan; + /* + * High 24 bits are SRC remote buffer; low 8 bits are flags: + * [7] SO (strong ordering) + * [5] TCP/UDP checksum + * [4] IP checksum + * [3:2] C (generate completion queue entry) + * [1] SE (solicited event) + * [0] FL (force loopback) + */ + union { + __be32 srcrb_flags; + __be16 srcrb_flags16[2]; + }; + /* + * imm is immediate data for send/RDMA write w/ immediate; + * also invalidation key for send with invalidate; input + * modifier for WQEs on CCQs. + */ + __be32 imm; +}; + +enum { + MLX4_WQE_MLX_VL15 = 1 << 17, + MLX4_WQE_MLX_SLR = 1 << 16 +}; + +struct mlx4_wqe_mlx_seg { + u8 owner; + u8 reserved1[2]; + u8 opcode; + __be16 sched_prio; + u8 reserved2; + u8 size; + /* + * [17] VL15 + * [16] SLR + * [15:12] static rate + * [11:8] SL + * [4] ICRC + * [3:2] C + * [0] FL (force loopback) + */ + __be32 flags; + __be16 rlid; + u16 reserved3; +}; + +struct mlx4_wqe_datagram_seg { + __be32 av[8]; + __be32 dqpn; + __be32 qkey; + __be16 vlan; + u8 mac[ETH_ALEN]; +}; + +struct mlx4_wqe_lso_seg { + __be32 mss_hdr_size; + __be32 header[0]; +}; + +enum mlx4_wqe_bind_seg_flags2 { + MLX4_WQE_BIND_ZERO_BASED = (1 << 30), + MLX4_WQE_BIND_TYPE_2 = (1 << 31), +}; + +struct mlx4_wqe_bind_seg { + __be32 flags1; + __be32 flags2; + __be32 new_rkey; + __be32 lkey; + __be64 addr; + __be64 length; +}; + +enum { + MLX4_WQE_FMR_PERM_LOCAL_READ = 1 << 27, + MLX4_WQE_FMR_PERM_LOCAL_WRITE = 1 << 28, + MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ = 1 << 29, + MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE = 1 << 30, + MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC = 1 << 31 +}; + +struct mlx4_wqe_fmr_seg { + __be32 flags; + __be32 mem_key; + __be64 buf_list; + __be64 start_addr; + __be64 reg_len; + __be32 offset; + __be32 page_size; + u32 reserved[2]; +}; + +struct mlx4_wqe_fmr_ext_seg { + u8 flags; + u8 reserved; + __be16 app_mask; + __be16 wire_app_tag; + __be16 mem_app_tag; + __be32 wire_ref_tag_base; + __be32 mem_ref_tag_base; +}; + +struct mlx4_wqe_local_inval_seg { + u64 reserved1; + __be32 mem_key; + u32 reserved2; + u64 reserved3[2]; +}; + +struct mlx4_wqe_raddr_seg { + __be64 raddr; + __be32 rkey; + u32 reserved; +}; + +struct mlx4_wqe_atomic_seg { + __be64 swap_add; + __be64 compare; +}; + +struct mlx4_wqe_masked_atomic_seg { + __be64 swap_add; + __be64 compare; + __be64 swap_add_mask; + __be64 compare_mask; +}; + +struct mlx4_wqe_data_seg { + __be32 byte_count; + __be32 lkey; + __be64 addr; +}; + +enum { + MLX4_INLINE_ALIGN = 64, + MLX4_INLINE_SEG = 1 << 31, +}; + +struct mlx4_wqe_inline_seg { + __be32 byte_count; +}; + +enum mlx4_update_qp_attr { + MLX4_UPDATE_QP_SMAC = 1 << 0, + MLX4_UPDATE_QP_VSD = 1 << 1, + MLX4_UPDATE_QP_RATE_LIMIT = 1 << 2, + MLX4_UPDATE_QP_QOS_VPORT = 1 << 3, + 
MLX4_UPDATE_QP_ETH_SRC_CHECK_MC_LB = 1 << 4, + MLX4_UPDATE_QP_SUPPORTED_ATTRS = (1 << 5) - 1 +}; + +enum mlx4_update_qp_params_flags { + MLX4_UPDATE_QP_PARAMS_FLAGS_ETH_CHECK_MC_LB = 1 << 0, + MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE = 1 << 1, +}; + +struct mlx4_update_qp_params { + u8 smac_index; + u8 qos_vport; + u32 flags; + u16 rate_unit; + u16 rate_val; +}; + +struct mlx4_qp *mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn); +int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, + enum mlx4_update_qp_attr attr, + struct mlx4_update_qp_params *params); +int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp); + +int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp, + struct mlx4_qp_context *context); + +int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + struct mlx4_qp_context *context, + struct mlx4_qp *qp, enum mlx4_qp_state *qp_state); + +static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) +{ + return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1)); +} + +void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp); + +static inline u16 folded_qp(u32 q) +{ + u16 res; + + res = ((q & 0xff) ^ ((q & 0xff0000) >> 16)) | (q & 0xff00); + return res; +} + +u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn); + +#endif /* MLX4_QP_H */ diff --git a/include/linux/mlx4/srq.h b/include/linux/mlx4/srq.h new file mode 100644 index 0000000..192e0f7 --- /dev/null +++ b/include/linux/mlx4/srq.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_SRQ_H +#define MLX4_SRQ_H + +struct mlx4_wqe_srq_next_seg { + u16 reserved1; + __be16 next_wqe_index; + u32 reserved2[3]; +}; + +struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn); + +#endif /* MLX4_SRQ_H */ diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h new file mode 100644 index 0000000..70e7e56 --- /dev/null +++ b/include/linux/mlx5/accel.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __MLX5_ACCEL_H__ +#define __MLX5_ACCEL_H__ + +#include + +enum mlx5_accel_esp_aes_gcm_keymat_iv_algo { + MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ, +}; + +enum mlx5_accel_esp_flags { + MLX5_ACCEL_ESP_FLAGS_TUNNEL = 0, /* Default */ + MLX5_ACCEL_ESP_FLAGS_TRANSPORT = 1UL << 0, + MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED = 1UL << 1, + MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP = 1UL << 2, +}; + +enum mlx5_accel_esp_action { + MLX5_ACCEL_ESP_ACTION_DECRYPT, + MLX5_ACCEL_ESP_ACTION_ENCRYPT, +}; + +enum mlx5_accel_esp_keymats { + MLX5_ACCEL_ESP_KEYMAT_AES_NONE, + MLX5_ACCEL_ESP_KEYMAT_AES_GCM, +}; + +enum mlx5_accel_esp_replay { + MLX5_ACCEL_ESP_REPLAY_NONE, + MLX5_ACCEL_ESP_REPLAY_BMP, +}; + +struct aes_gcm_keymat { + u64 seq_iv; + enum mlx5_accel_esp_aes_gcm_keymat_iv_algo iv_algo; + + u32 salt; + u32 icv_len; + + u32 key_len; + u32 aes_key[256 / 32]; +}; + +struct mlx5_accel_esp_xfrm_attrs { + enum mlx5_accel_esp_action action; + u32 esn; + u32 spi; + u32 seq; + u32 tfc_pad; + u32 flags; + u32 sa_handle; + enum mlx5_accel_esp_replay replay_type; + union { + struct { + u32 size; + + } bmp; + } replay; + enum mlx5_accel_esp_keymats keymat_type; + union { + struct aes_gcm_keymat aes_gcm; + } keymat; +}; + +struct mlx5_accel_esp_xfrm { + struct mlx5_core_dev *mdev; + struct mlx5_accel_esp_xfrm_attrs attrs; +}; + +enum { + MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA = 1UL << 0, +}; + +enum mlx5_accel_ipsec_cap { + MLX5_ACCEL_IPSEC_CAP_DEVICE = 1 << 0, + MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA = 1 << 1, + MLX5_ACCEL_IPSEC_CAP_ESP = 1 << 2, + MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 3, + MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 4, + MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER = 1 << 5, + MLX5_ACCEL_IPSEC_CAP_ESN = 1 << 6, + MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN = 1 << 7, +}; + +#ifdef CONFIG_MLX5_ACCEL + +u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev); + +struct mlx5_accel_esp_xfrm * +mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags); +void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm); +int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs); + +#else + +static inline u32 
mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev) { return 0; } + +static inline struct mlx5_accel_esp_xfrm * +mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags) { return ERR_PTR(-EOPNOTSUPP); } +static inline void +mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) {} +static inline int +mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs) { return -EOPNOTSUPP; } + +#endif +#endif diff --git a/include/linux/mlx5/cmd.h b/include/linux/mlx5/cmd.h new file mode 100644 index 0000000..68cd08f --- /dev/null +++ b/include/linux/mlx5/cmd.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_CMD_H +#define MLX5_CMD_H + +#include + +struct manage_pages_layout { + u64 ptr; + u32 reserved; + u16 num_entries; + u16 func_id; +}; + + +struct mlx5_cmd_alloc_uar_imm_out { + u32 rsvd[3]; + u32 uarn; +}; + +#endif /* MLX5_CMD_H */ diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h new file mode 100644 index 0000000..0ef6138 --- /dev/null +++ b/include/linux/mlx5/cq.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_CORE_CQ_H +#define MLX5_CORE_CQ_H + +#include +#include +#include + +struct mlx5_core_cq { + u32 cqn; + int cqe_sz; + __be32 *set_ci_db; + __be32 *arm_db; + struct mlx5_uars_page *uar; + refcount_t refcount; + struct completion free; + unsigned vector; + unsigned int irqn; + void (*comp) (struct mlx5_core_cq *); + void (*event) (struct mlx5_core_cq *, enum mlx5_event); + u32 cons_index; + unsigned arm_sn; + struct mlx5_rsc_debug *dbg; + int pid; + struct { + struct list_head list; + void (*comp)(struct mlx5_core_cq *); + void *priv; + } tasklet_ctx; + int reset_notify_added; + struct list_head reset_notify; + struct mlx5_eq *eq; +}; + + +enum { + MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01, + MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02, + MLX5_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04, + MLX5_CQE_SYNDROME_WR_FLUSH_ERR = 0x05, + MLX5_CQE_SYNDROME_MW_BIND_ERR = 0x06, + MLX5_CQE_SYNDROME_BAD_RESP_ERR = 0x10, + MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11, + MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13, + MLX5_CQE_SYNDROME_REMOTE_OP_ERR = 0x14, + MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15, + MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, +}; + +enum { + MLX5_CQE_OWNER_MASK = 1, + MLX5_CQE_REQ = 0, + MLX5_CQE_RESP_WR_IMM = 1, + MLX5_CQE_RESP_SEND = 2, + MLX5_CQE_RESP_SEND_IMM = 3, + MLX5_CQE_RESP_SEND_INV = 4, + MLX5_CQE_RESIZE_CQ = 5, + MLX5_CQE_SIG_ERR = 12, + MLX5_CQE_REQ_ERR = 13, + MLX5_CQE_RESP_ERR = 14, + MLX5_CQE_INVALID = 15, +}; + +enum { + MLX5_CQ_MODIFY_PERIOD = 1 << 0, + MLX5_CQ_MODIFY_COUNT = 1 << 1, + MLX5_CQ_MODIFY_OVERRUN = 1 << 2, +}; + +enum { + MLX5_CQ_OPMOD_RESIZE = 1, + MLX5_MODIFY_CQ_MASK_LOG_SIZE = 1 << 0, + MLX5_MODIFY_CQ_MASK_PG_OFFSET = 1 << 1, + MLX5_MODIFY_CQ_MASK_PG_SIZE = 1 << 2, +}; + +struct mlx5_cq_modify_params { + int type; + union { + struct { + u32 page_offset; + u8 log_cq_size; + } resize; + + struct { + } moder; + + struct { + } mapping; + } params; +}; + +enum { + CQE_SIZE_64 = 0, + CQE_SIZE_128 = 1, + CQE_SIZE_128_PAD = 2, +}; + +#define MLX5_MAX_CQ_PERIOD (BIT(__mlx5_bit_sz(cqc, cq_period)) - 1) +#define MLX5_MAX_CQ_COUNT (BIT(__mlx5_bit_sz(cqc, cq_max_count)) - 1) + +static inline int cqe_sz_to_mlx_sz(u8 size, int padding_128_en) +{ + return padding_128_en ? CQE_SIZE_128_PAD : + size == 64 ? CQE_SIZE_64 : CQE_SIZE_128; +} + +static inline void mlx5_cq_set_ci(struct mlx5_core_cq *cq) +{ + *cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff); +} + +enum { + MLX5_CQ_DB_REQ_NOT_SOL = 1 << 24, + MLX5_CQ_DB_REQ_NOT = 0 << 24 +}; + +static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd, + void __iomem *uar_page, + u32 cons_index) +{ + __be32 doorbell[2]; + u32 sn; + u32 ci; + + sn = cq->arm_sn & 3; + ci = cons_index & 0xffffff; + + *cq->arm_db = cpu_to_be32(sn << 28 | cmd | ci); + + /* Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. 
+ */ + wmb(); + + doorbell[0] = cpu_to_be32(sn << 28 | cmd | ci); + doorbell[1] = cpu_to_be32(cq->cqn); + + mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, NULL); +} + +static inline void mlx5_cq_hold(struct mlx5_core_cq *cq) +{ + refcount_inc(&cq->refcount); +} + +static inline void mlx5_cq_put(struct mlx5_core_cq *cq) +{ + if (refcount_dec_and_test(&cq->refcount)) + complete(&cq->free); +} + +int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *in, int inlen); +int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); +int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *out, int outlen); +int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *in, int inlen); +int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, + struct mlx5_core_cq *cq, u16 cq_period, + u16 cq_max_count); +static inline void mlx5_dump_err_cqe(struct mlx5_core_dev *dev, + struct mlx5_err_cqe *err_cqe) +{ + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, + sizeof(*err_cqe), false); +} +int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); +void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); + +#endif /* MLX5_CORE_CQ_H */ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h new file mode 100644 index 0000000..2bc27f8 --- /dev/null +++ b/include/linux/mlx5/device.h @@ -0,0 +1,1229 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
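The CQ helpers declared above are normally used together in a driver's completion path: the poller advances cq->cons_index, publishes it with mlx5_cq_set_ci(), then re-arms through the UAR page while holding a reference. The sketch below is an illustration only, not part of the patch; the handler name is hypothetical, and it assumes the doorbell page is reachable via cq->uar->map as in the mlx5_core_cq definition above.

#include <linux/mlx5/cq.h>

static void example_cq_comp_handler(struct mlx5_core_cq *cq)
{
        mlx5_cq_hold(cq);       /* keep the CQ alive while it is serviced */

        /* ... poll CQEs here, incrementing cq->cons_index per consumed CQE ... */

        mlx5_cq_set_ci(cq);     /* publish the consumer index to the doorbell record */
        mlx5_cq_arm(cq, MLX5_CQ_DB_REQ_NOT, cq->uar->map, cq->cons_index);

        mlx5_cq_put(cq);        /* the last put completes cq->free for the destroyer */
}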
+ */ + +#ifndef MLX5_DEVICE_H +#define MLX5_DEVICE_H + +#include +#include +#include + +#if defined(__LITTLE_ENDIAN) +#define MLX5_SET_HOST_ENDIANNESS 0 +#elif defined(__BIG_ENDIAN) +#define MLX5_SET_HOST_ENDIANNESS 0x80 +#else +#error Host endianness not defined +#endif + +/* helper macros */ +#define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0) +#define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld) +#define __mlx5_bit_off(typ, fld) (offsetof(struct mlx5_ifc_##typ##_bits, fld)) +#define __mlx5_16_off(typ, fld) (__mlx5_bit_off(typ, fld) / 16) +#define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32) +#define __mlx5_64_off(typ, fld) (__mlx5_bit_off(typ, fld) / 64) +#define __mlx5_16_bit_off(typ, fld) (16 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0xf)) +#define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0x1f)) +#define __mlx5_mask(typ, fld) ((u32)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) +#define __mlx5_dw_mask(typ, fld) (__mlx5_mask(typ, fld) << __mlx5_dw_bit_off(typ, fld)) +#define __mlx5_mask16(typ, fld) ((u16)((1ull << __mlx5_bit_sz(typ, fld)) - 1)) +#define __mlx5_16_mask(typ, fld) (__mlx5_mask16(typ, fld) << __mlx5_16_bit_off(typ, fld)) +#define __mlx5_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits) + +#define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8) +#define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8) +#define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32) +#define MLX5_ST_SZ_QW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 64) +#define MLX5_UN_SZ_BYTES(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 8) +#define MLX5_UN_SZ_DW(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 32) +#define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8) +#define MLX5_ADDR_OF(typ, p, fld) ((char *)(p) + MLX5_BYTE_OFF(typ, fld)) + +/* insert a value to a struct */ +#define MLX5_SET(typ, p, fld, v) do { \ + u32 _v = v; \ + BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32); \ + *((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \ + cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \ + (~__mlx5_dw_mask(typ, fld))) | (((_v) & __mlx5_mask(typ, fld)) \ + << __mlx5_dw_bit_off(typ, fld))); \ +} while (0) + +#define MLX5_ARRAY_SET(typ, p, fld, idx, v) do { \ + BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 32); \ + MLX5_SET(typ, p, fld[idx], v); \ +} while (0) + +#define MLX5_SET_TO_ONES(typ, p, fld) do { \ + BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32); \ + *((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \ + cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \ + (~__mlx5_dw_mask(typ, fld))) | ((__mlx5_mask(typ, fld)) \ + << __mlx5_dw_bit_off(typ, fld))); \ +} while (0) + +#define MLX5_GET(typ, p, fld) ((be32_to_cpu(*((__be32 *)(p) +\ +__mlx5_dw_off(typ, fld))) >> __mlx5_dw_bit_off(typ, fld)) & \ +__mlx5_mask(typ, fld)) + +#define MLX5_GET_PR(typ, p, fld) ({ \ + u32 ___t = MLX5_GET(typ, p, fld); \ + pr_debug(#fld " = 0x%x\n", ___t); \ + ___t; \ +}) + +#define __MLX5_SET64(typ, p, fld, v) do { \ + BUILD_BUG_ON(__mlx5_bit_sz(typ, fld) != 64); \ + *((__be64 *)(p) + __mlx5_64_off(typ, fld)) = cpu_to_be64(v); \ +} while (0) + +#define MLX5_SET64(typ, p, fld, v) do { \ + BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 64); \ + __MLX5_SET64(typ, p, fld, v); \ +} while (0) + +#define MLX5_ARRAY_SET64(typ, p, fld, idx, v) do { \ + BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 64); \ + __MLX5_SET64(typ, p, fld[idx], v); \ +} while (0) + +#define MLX5_GET64(typ, p, fld) 
be64_to_cpu(*((__be64 *)(p) + __mlx5_64_off(typ, fld))) + +#define MLX5_GET64_PR(typ, p, fld) ({ \ + u64 ___t = MLX5_GET64(typ, p, fld); \ + pr_debug(#fld " = 0x%llx\n", ___t); \ + ___t; \ +}) + +#define MLX5_GET16(typ, p, fld) ((be16_to_cpu(*((__be16 *)(p) +\ +__mlx5_16_off(typ, fld))) >> __mlx5_16_bit_off(typ, fld)) & \ +__mlx5_mask16(typ, fld)) + +#define MLX5_SET16(typ, p, fld, v) do { \ + u16 _v = v; \ + BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 16); \ + *((__be16 *)(p) + __mlx5_16_off(typ, fld)) = \ + cpu_to_be16((be16_to_cpu(*((__be16 *)(p) + __mlx5_16_off(typ, fld))) & \ + (~__mlx5_16_mask(typ, fld))) | (((_v) & __mlx5_mask16(typ, fld)) \ + << __mlx5_16_bit_off(typ, fld))); \ +} while (0) + +/* Big endian getters */ +#define MLX5_GET64_BE(typ, p, fld) (*((__be64 *)(p) +\ + __mlx5_64_off(typ, fld))) + +#define MLX5_GET_BE(type_t, typ, p, fld) ({ \ + type_t tmp; \ + switch (sizeof(tmp)) { \ + case sizeof(u8): \ + tmp = (__force type_t)MLX5_GET(typ, p, fld); \ + break; \ + case sizeof(u16): \ + tmp = (__force type_t)cpu_to_be16(MLX5_GET(typ, p, fld)); \ + break; \ + case sizeof(u32): \ + tmp = (__force type_t)cpu_to_be32(MLX5_GET(typ, p, fld)); \ + break; \ + case sizeof(u64): \ + tmp = (__force type_t)MLX5_GET64_BE(typ, p, fld); \ + break; \ + } \ + tmp; \ + }) + +enum mlx5_inline_modes { + MLX5_INLINE_MODE_NONE, + MLX5_INLINE_MODE_L2, + MLX5_INLINE_MODE_IP, + MLX5_INLINE_MODE_TCP_UDP, +}; + +enum { + MLX5_MAX_COMMANDS = 32, + MLX5_CMD_DATA_BLOCK_SIZE = 512, + MLX5_PCI_CMD_XPORT = 7, + MLX5_MKEY_BSF_OCTO_SIZE = 4, + MLX5_MAX_PSVS = 4, +}; + +enum { + MLX5_EXTENDED_UD_AV = 0x80000000, +}; + +enum { + MLX5_CQ_STATE_ARMED = 9, + MLX5_CQ_STATE_ALWAYS_ARMED = 0xb, + MLX5_CQ_STATE_FIRED = 0xa, +}; + +enum { + MLX5_STAT_RATE_OFFSET = 5, +}; + +enum { + MLX5_INLINE_SEG = 0x80000000, +}; + +enum { + MLX5_HW_START_PADDING = MLX5_INLINE_SEG, +}; + +enum { + MLX5_MIN_PKEY_TABLE_SIZE = 128, + MLX5_MAX_LOG_PKEY_TABLE = 5, +}; + +enum { + MLX5_MKEY_INBOX_PG_ACCESS = 1 << 31 +}; + +enum { + MLX5_PFAULT_SUBTYPE_WQE = 0, + MLX5_PFAULT_SUBTYPE_RDMA = 1, +}; + +enum { + MLX5_PERM_LOCAL_READ = 1 << 2, + MLX5_PERM_LOCAL_WRITE = 1 << 3, + MLX5_PERM_REMOTE_READ = 1 << 4, + MLX5_PERM_REMOTE_WRITE = 1 << 5, + MLX5_PERM_ATOMIC = 1 << 6, + MLX5_PERM_UMR_EN = 1 << 7, +}; + +enum { + MLX5_PCIE_CTRL_SMALL_FENCE = 1 << 0, + MLX5_PCIE_CTRL_RELAXED_ORDERING = 1 << 2, + MLX5_PCIE_CTRL_NO_SNOOP = 1 << 3, + MLX5_PCIE_CTRL_TLP_PROCE_EN = 1 << 6, + MLX5_PCIE_CTRL_TPH_MASK = 3 << 4, +}; + +enum { + MLX5_EN_RD = (u64)1, + MLX5_EN_WR = (u64)2 +}; + +enum { + MLX5_ADAPTER_PAGE_SHIFT = 12, + MLX5_ADAPTER_PAGE_SIZE = 1 << MLX5_ADAPTER_PAGE_SHIFT, +}; + +enum { + MLX5_BFREGS_PER_UAR = 4, + MLX5_MAX_UARS = 1 << 8, + MLX5_NON_FP_BFREGS_PER_UAR = 2, + MLX5_FP_BFREGS_PER_UAR = MLX5_BFREGS_PER_UAR - + MLX5_NON_FP_BFREGS_PER_UAR, + MLX5_MAX_BFREGS = MLX5_MAX_UARS * + MLX5_NON_FP_BFREGS_PER_UAR, + MLX5_UARS_IN_PAGE = PAGE_SIZE / MLX5_ADAPTER_PAGE_SIZE, + MLX5_NON_FP_BFREGS_IN_PAGE = MLX5_NON_FP_BFREGS_PER_UAR * MLX5_UARS_IN_PAGE, + MLX5_MIN_DYN_BFREGS = 512, + MLX5_MAX_DYN_BFREGS = 1024, +}; + +enum { + MLX5_MKEY_MASK_LEN = 1ull << 0, + MLX5_MKEY_MASK_PAGE_SIZE = 1ull << 1, + MLX5_MKEY_MASK_START_ADDR = 1ull << 6, + MLX5_MKEY_MASK_PD = 1ull << 7, + MLX5_MKEY_MASK_EN_RINVAL = 1ull << 8, + MLX5_MKEY_MASK_EN_SIGERR = 1ull << 9, + MLX5_MKEY_MASK_BSF_EN = 1ull << 12, + MLX5_MKEY_MASK_KEY = 1ull << 13, + MLX5_MKEY_MASK_QPN = 1ull << 14, + MLX5_MKEY_MASK_LR = 1ull << 17, + MLX5_MKEY_MASK_LW = 1ull << 18, + MLX5_MKEY_MASK_RR = 1ull << 19, + 
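The MLX5_SET()/MLX5_GET() family above reads and writes fields of the big-endian firmware layouts described by the struct mlx5_ifc_*_bits definitions, masking and shifting per field. A minimal sketch of the usual pattern, shown against the cqc layout whose fields are also referenced elsewhere in this patch; illustrative only, and the function name is hypothetical.

#include <linux/mlx5/device.h>
#include <linux/mlx5/mlx5_ifc.h>

/* Fill a CQ context roughly the way create-CQ callers populate it. */
static u8 example_fill_cqc(void *cqc)   /* cqc points at MLX5_ST_SZ_DW(cqc) dwords */
{
        MLX5_SET(cqc, cqc, log_cq_size, 10);    /* 1024-entry CQ */
        MLX5_SET(cqc, cqc, cq_period, 16);      /* moderation period */
        MLX5_SET(cqc, cqc, cq_max_count, 32);   /* moderation frame count */

        /* reads go back through the same layout description */
        return MLX5_GET(cqc, cqc, log_cq_size); /* yields 10 */
}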
MLX5_MKEY_MASK_RW = 1ull << 20, + MLX5_MKEY_MASK_A = 1ull << 21, + MLX5_MKEY_MASK_SMALL_FENCE = 1ull << 23, + MLX5_MKEY_MASK_FREE = 1ull << 29, +}; + +enum { + MLX5_UMR_TRANSLATION_OFFSET_EN = (1 << 4), + + MLX5_UMR_CHECK_NOT_FREE = (1 << 5), + MLX5_UMR_CHECK_FREE = (2 << 5), + + MLX5_UMR_INLINE = (1 << 7), +}; + +#define MLX5_UMR_MTT_ALIGNMENT 0x40 +#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) +#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT + +#define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) + +enum { + MLX5_EVENT_QUEUE_TYPE_QP = 0, + MLX5_EVENT_QUEUE_TYPE_RQ = 1, + MLX5_EVENT_QUEUE_TYPE_SQ = 2, + MLX5_EVENT_QUEUE_TYPE_DCT = 6, +}; + +enum mlx5_event { + MLX5_EVENT_TYPE_COMP = 0x0, + + MLX5_EVENT_TYPE_PATH_MIG = 0x01, + MLX5_EVENT_TYPE_COMM_EST = 0x02, + MLX5_EVENT_TYPE_SQ_DRAINED = 0x03, + MLX5_EVENT_TYPE_SRQ_LAST_WQE = 0x13, + MLX5_EVENT_TYPE_SRQ_RQ_LIMIT = 0x14, + + MLX5_EVENT_TYPE_CQ_ERROR = 0x04, + MLX5_EVENT_TYPE_WQ_CATAS_ERROR = 0x05, + MLX5_EVENT_TYPE_PATH_MIG_FAILED = 0x07, + MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, + MLX5_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, + MLX5_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + + MLX5_EVENT_TYPE_INTERNAL_ERROR = 0x08, + MLX5_EVENT_TYPE_PORT_CHANGE = 0x09, + MLX5_EVENT_TYPE_GPIO_EVENT = 0x15, + MLX5_EVENT_TYPE_PORT_MODULE_EVENT = 0x16, + MLX5_EVENT_TYPE_REMOTE_CONFIG = 0x19, + MLX5_EVENT_TYPE_GENERAL_EVENT = 0x22, + MLX5_EVENT_TYPE_PPS_EVENT = 0x25, + + MLX5_EVENT_TYPE_DB_BF_CONGESTION = 0x1a, + MLX5_EVENT_TYPE_STALL_EVENT = 0x1b, + + MLX5_EVENT_TYPE_CMD = 0x0a, + MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, + + MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, + MLX5_EVENT_TYPE_NIC_VPORT_CHANGE = 0xd, + + MLX5_EVENT_TYPE_DCT_DRAINED = 0x1c, + + MLX5_EVENT_TYPE_FPGA_ERROR = 0x20, +}; + +enum { + MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT = 0x1, +}; + +enum { + MLX5_PORT_CHANGE_SUBTYPE_DOWN = 1, + MLX5_PORT_CHANGE_SUBTYPE_ACTIVE = 4, + MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED = 5, + MLX5_PORT_CHANGE_SUBTYPE_LID = 6, + MLX5_PORT_CHANGE_SUBTYPE_PKEY = 7, + MLX5_PORT_CHANGE_SUBTYPE_GUID = 8, + MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG = 9, +}; + +enum { + MLX5_DEV_CAP_FLAG_XRC = 1LL << 3, + MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1LL << 8, + MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1LL << 9, + MLX5_DEV_CAP_FLAG_APM = 1LL << 17, + MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, + MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, + MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24, + MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, + MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, + MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, + MLX5_DEV_CAP_FLAG_SIG_HAND_OVER = 1LL << 40, + MLX5_DEV_CAP_FLAG_CMDIF_CSUM = 3LL << 46, +}; + +enum { + MLX5_ROCE_VERSION_1 = 0, + MLX5_ROCE_VERSION_2 = 2, +}; + +enum { + MLX5_ROCE_VERSION_1_CAP = 1 << MLX5_ROCE_VERSION_1, + MLX5_ROCE_VERSION_2_CAP = 1 << MLX5_ROCE_VERSION_2, +}; + +enum { + MLX5_ROCE_L3_TYPE_IPV4 = 0, + MLX5_ROCE_L3_TYPE_IPV6 = 1, +}; + +enum { + MLX5_ROCE_L3_TYPE_IPV4_CAP = 1 << 1, + MLX5_ROCE_L3_TYPE_IPV6_CAP = 1 << 2, +}; + +enum { + MLX5_OPCODE_NOP = 0x00, + MLX5_OPCODE_SEND_INVAL = 0x01, + MLX5_OPCODE_RDMA_WRITE = 0x08, + MLX5_OPCODE_RDMA_WRITE_IMM = 0x09, + MLX5_OPCODE_SEND = 0x0a, + MLX5_OPCODE_SEND_IMM = 0x0b, + MLX5_OPCODE_LSO = 0x0e, + MLX5_OPCODE_RDMA_READ = 0x10, + MLX5_OPCODE_ATOMIC_CS = 0x11, + MLX5_OPCODE_ATOMIC_FA = 0x12, + MLX5_OPCODE_ATOMIC_MASKED_CS = 0x14, + MLX5_OPCODE_ATOMIC_MASKED_FA = 0x15, + MLX5_OPCODE_BIND_MW = 0x18, + MLX5_OPCODE_CONFIG_CMD = 0x1f, + + MLX5_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + MLX5_RECV_OPCODE_SEND = 0x01, + 
MLX5_RECV_OPCODE_SEND_IMM = 0x02, + MLX5_RECV_OPCODE_SEND_INVAL = 0x03, + + MLX5_CQE_OPCODE_ERROR = 0x1e, + MLX5_CQE_OPCODE_RESIZE = 0x16, + + MLX5_OPCODE_SET_PSV = 0x20, + MLX5_OPCODE_GET_PSV = 0x21, + MLX5_OPCODE_CHECK_PSV = 0x22, + MLX5_OPCODE_RGET_PSV = 0x26, + MLX5_OPCODE_RCHECK_PSV = 0x27, + + MLX5_OPCODE_UMR = 0x25, + +}; + +enum { + MLX5_SET_PORT_RESET_QKEY = 0, + MLX5_SET_PORT_GUID0 = 16, + MLX5_SET_PORT_NODE_GUID = 17, + MLX5_SET_PORT_SYS_GUID = 18, + MLX5_SET_PORT_GID_TABLE = 19, + MLX5_SET_PORT_PKEY_TABLE = 20, +}; + +enum { + MLX5_BW_NO_LIMIT = 0, + MLX5_100_MBPS_UNIT = 3, + MLX5_GBPS_UNIT = 4, +}; + +enum { + MLX5_MAX_PAGE_SHIFT = 31 +}; + +enum { + MLX5_CAP_OFF_CMDIF_CSUM = 46, +}; + +enum { + /* + * Max wqe size for rdma read is 512 bytes, so this + * limits our max_sge_rd as the wqe needs to fit: + * - ctrl segment (16 bytes) + * - rdma segment (16 bytes) + * - scatter elements (16 bytes each) + */ + MLX5_MAX_SGE_RD = (512 - 16 - 16) / 16 +}; + +enum mlx5_odp_transport_cap_bits { + MLX5_ODP_SUPPORT_SEND = 1 << 31, + MLX5_ODP_SUPPORT_RECV = 1 << 30, + MLX5_ODP_SUPPORT_WRITE = 1 << 29, + MLX5_ODP_SUPPORT_READ = 1 << 28, +}; + +struct mlx5_odp_caps { + char reserved[0x10]; + struct { + __be32 rc_odp_caps; + __be32 uc_odp_caps; + __be32 ud_odp_caps; + } per_transport_caps; + char reserved2[0xe4]; +}; + +struct mlx5_cmd_layout { + u8 type; + u8 rsvd0[3]; + __be32 inlen; + __be64 in_ptr; + __be32 in[4]; + __be32 out[4]; + __be64 out_ptr; + __be32 outlen; + u8 token; + u8 sig; + u8 rsvd1; + u8 status_own; +}; + +struct health_buffer { + __be32 assert_var[5]; + __be32 rsvd0[3]; + __be32 assert_exit_ptr; + __be32 assert_callra; + __be32 rsvd1[2]; + __be32 fw_ver; + __be32 hw_id; + __be32 rsvd2; + u8 irisc_index; + u8 synd; + __be16 ext_synd; +}; + +struct mlx5_init_seg { + __be32 fw_rev; + __be32 cmdif_rev_fw_sub; + __be32 rsvd0[2]; + __be32 cmdq_addr_h; + __be32 cmdq_addr_l_sz; + __be32 cmd_dbell; + __be32 rsvd1[120]; + __be32 initializing; + struct health_buffer health; + __be32 rsvd2[880]; + __be32 internal_timer_h; + __be32 internal_timer_l; + __be32 rsvd3[2]; + __be32 health_counter; + __be32 rsvd4[1019]; + __be64 ieee1588_clk; + __be32 ieee1588_clk_type; + __be32 clr_intx; +}; + +struct mlx5_eqe_comp { + __be32 reserved[6]; + __be32 cqn; +}; + +struct mlx5_eqe_qp_srq { + __be32 reserved1[5]; + u8 type; + u8 reserved2[3]; + __be32 qp_srq_n; +}; + +struct mlx5_eqe_cq_err { + __be32 cqn; + u8 reserved1[7]; + u8 syndrome; +}; + +struct mlx5_eqe_port_state { + u8 reserved0[8]; + u8 port; +}; + +struct mlx5_eqe_gpio { + __be32 reserved0[2]; + __be64 gpio_event; +}; + +struct mlx5_eqe_congestion { + u8 type; + u8 rsvd0; + u8 congestion_level; +}; + +struct mlx5_eqe_stall_vl { + u8 rsvd0[3]; + u8 port_vl; +}; + +struct mlx5_eqe_cmd { + __be32 vector; + __be32 rsvd[6]; +}; + +struct mlx5_eqe_page_req { + u8 rsvd0[2]; + __be16 func_id; + __be32 num_pages; + __be32 rsvd1[5]; +}; + +struct mlx5_eqe_page_fault { + __be32 bytes_committed; + union { + struct { + u16 reserved1; + __be16 wqe_index; + u16 reserved2; + __be16 packet_length; + __be32 token; + u8 reserved4[8]; + __be32 pftype_wq; + } __packed wqe; + struct { + __be32 r_key; + u16 reserved1; + __be16 packet_length; + __be32 rdma_op_len; + __be64 rdma_va; + __be32 pftype_token; + } __packed rdma; + } __packed; +} __packed; + +struct mlx5_eqe_vport_change { + u8 rsvd0[2]; + __be16 vport_num; + __be32 rsvd1[6]; +} __packed; + +struct mlx5_eqe_port_module { + u8 reserved_at_0[1]; + u8 module; + u8 reserved_at_2[1]; + u8 module_status; 
+ u8 reserved_at_4[2]; + u8 error_type; +} __packed; + +struct mlx5_eqe_pps { + u8 rsvd0[3]; + u8 pin; + u8 rsvd1[4]; + union { + struct { + __be32 time_sec; + __be32 time_nsec; + }; + struct { + __be64 time_stamp; + }; + }; + u8 rsvd2[12]; +} __packed; + +struct mlx5_eqe_dct { + __be32 reserved[6]; + __be32 dctn; +}; + +union ev_data { + __be32 raw[7]; + struct mlx5_eqe_cmd cmd; + struct mlx5_eqe_comp comp; + struct mlx5_eqe_qp_srq qp_srq; + struct mlx5_eqe_cq_err cq_err; + struct mlx5_eqe_port_state port; + struct mlx5_eqe_gpio gpio; + struct mlx5_eqe_congestion cong; + struct mlx5_eqe_stall_vl stall_vl; + struct mlx5_eqe_page_req req_pages; + struct mlx5_eqe_page_fault page_fault; + struct mlx5_eqe_vport_change vport_change; + struct mlx5_eqe_port_module port_module; + struct mlx5_eqe_pps pps; + struct mlx5_eqe_dct dct; +} __packed; + +struct mlx5_eqe { + u8 rsvd0; + u8 type; + u8 rsvd1; + u8 sub_type; + __be32 rsvd2[7]; + union ev_data data; + __be16 rsvd3; + u8 signature; + u8 owner; +} __packed; + +struct mlx5_cmd_prot_block { + u8 data[MLX5_CMD_DATA_BLOCK_SIZE]; + u8 rsvd0[48]; + __be64 next; + __be32 block_num; + u8 rsvd1; + u8 token; + u8 ctrl_sig; + u8 sig; +}; + +enum { + MLX5_CQE_SYND_FLUSHED_IN_ERROR = 5, +}; + +struct mlx5_err_cqe { + u8 rsvd0[32]; + __be32 srqn; + u8 rsvd1[18]; + u8 vendor_err_synd; + u8 syndrome; + __be32 s_wqe_opcode_qpn; + __be16 wqe_counter; + u8 signature; + u8 op_own; +}; + +struct mlx5_cqe64 { + u8 outer_l3_tunneled; + u8 rsvd0; + __be16 wqe_id; + u8 lro_tcppsh_abort_dupack; + u8 lro_min_ttl; + __be16 lro_tcp_win; + __be32 lro_ack_seq_num; + __be32 rss_hash_result; + u8 rss_hash_type; + u8 ml_path; + u8 rsvd20[2]; + __be16 check_sum; + __be16 slid; + __be32 flags_rqpn; + u8 hds_ip_ext; + u8 l4_l3_hdr_type; + __be16 vlan_info; + __be32 srqn; /* [31:24]: lro_num_seg, [23:0]: srqn */ + __be32 imm_inval_pkey; + u8 rsvd40[4]; + __be32 byte_cnt; + __be32 timestamp_h; + __be32 timestamp_l; + __be32 sop_drop_qpn; + __be16 wqe_counter; + u8 signature; + u8 op_own; +}; + +struct mlx5_mini_cqe8 { + union { + __be32 rx_hash_result; + struct { + __be16 checksum; + __be16 rsvd; + }; + struct { + __be16 wqe_counter; + u8 s_wqe_opcode; + u8 reserved; + } s_wqe_info; + }; + __be32 byte_cnt; +}; + +enum { + MLX5_NO_INLINE_DATA, + MLX5_INLINE_DATA32_SEG, + MLX5_INLINE_DATA64_SEG, + MLX5_COMPRESSED, +}; + +enum { + MLX5_CQE_FORMAT_CSUM = 0x1, +}; + +#define MLX5_MINI_CQE_ARRAY_SIZE 8 + +static inline int mlx5_get_cqe_format(struct mlx5_cqe64 *cqe) +{ + return (cqe->op_own >> 2) & 0x3; +} + +static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) +{ + return (cqe->lro_tcppsh_abort_dupack >> 6) & 1; +} + +static inline u8 get_cqe_l4_hdr_type(struct mlx5_cqe64 *cqe) +{ + return (cqe->l4_l3_hdr_type >> 4) & 0x7; +} + +static inline u8 get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe) +{ + return (cqe->l4_l3_hdr_type >> 2) & 0x3; +} + +static inline u8 cqe_is_tunneled(struct mlx5_cqe64 *cqe) +{ + return cqe->outer_l3_tunneled & 0x1; +} + +static inline int cqe_has_vlan(struct mlx5_cqe64 *cqe) +{ + return !!(cqe->l4_l3_hdr_type & 0x1); +} + +static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe) +{ + u32 hi, lo; + + hi = be32_to_cpu(cqe->timestamp_h); + lo = be32_to_cpu(cqe->timestamp_l); + + return (u64)lo | ((u64)hi << 32); +} + +#define MLX5_MPWQE_LOG_NUM_STRIDES_BASE (9) +#define MLX5_MPWQE_LOG_STRIDE_SZ_BASE (6) + +struct mpwrq_cqe_bc { + __be16 filler_consumed_strides; + __be16 byte_cnt; +}; + +static inline u16 mpwrq_get_cqe_byte_cnt(struct mlx5_cqe64 *cqe) +{ + struct 
mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt; + + return be16_to_cpu(bc->byte_cnt); +} + +static inline u16 mpwrq_get_cqe_bc_consumed_strides(struct mpwrq_cqe_bc *bc) +{ + return 0x7fff & be16_to_cpu(bc->filler_consumed_strides); +} + +static inline u16 mpwrq_get_cqe_consumed_strides(struct mlx5_cqe64 *cqe) +{ + struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt; + + return mpwrq_get_cqe_bc_consumed_strides(bc); +} + +static inline bool mpwrq_is_filler_cqe(struct mlx5_cqe64 *cqe) +{ + struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt; + + return 0x8000 & be16_to_cpu(bc->filler_consumed_strides); +} + +static inline u16 mpwrq_get_cqe_stride_index(struct mlx5_cqe64 *cqe) +{ + return be16_to_cpu(cqe->wqe_counter); +} + +enum { + CQE_L4_HDR_TYPE_NONE = 0x0, + CQE_L4_HDR_TYPE_TCP_NO_ACK = 0x1, + CQE_L4_HDR_TYPE_UDP = 0x2, + CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA = 0x3, + CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA = 0x4, +}; + +enum { + CQE_RSS_HTYPE_IP = 0x3 << 2, + /* cqe->rss_hash_type[3:2] - IP destination selected for hash + * (00 = none, 01 = IPv4, 10 = IPv6, 11 = Reserved) + */ + CQE_RSS_HTYPE_L4 = 0x3 << 6, + /* cqe->rss_hash_type[7:6] - L4 destination selected for hash + * (00 = none, 01 = TCP. 10 = UDP, 11 = IPSEC.SPI + */ +}; + +enum { + MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH = 0x0, + MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6 = 0x1, + MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4 = 0x2, +}; + +enum { + CQE_L2_OK = 1 << 0, + CQE_L3_OK = 1 << 1, + CQE_L4_OK = 1 << 2, +}; + +struct mlx5_sig_err_cqe { + u8 rsvd0[16]; + __be32 expected_trans_sig; + __be32 actual_trans_sig; + __be32 expected_reftag; + __be32 actual_reftag; + __be16 syndrome; + u8 rsvd22[2]; + __be32 mkey; + __be64 err_offset; + u8 rsvd30[8]; + __be32 qpn; + u8 rsvd38[2]; + u8 signature; + u8 op_own; +}; + +struct mlx5_wqe_srq_next_seg { + u8 rsvd0[2]; + __be16 next_wqe_index; + u8 signature; + u8 rsvd1[11]; +}; + +union mlx5_ext_cqe { + struct ib_grh grh; + u8 inl[64]; +}; + +struct mlx5_cqe128 { + union mlx5_ext_cqe inl_grh; + struct mlx5_cqe64 cqe64; +}; + +enum { + MLX5_MKEY_STATUS_FREE = 1 << 6, +}; + +enum { + MLX5_MKEY_REMOTE_INVAL = 1 << 24, + MLX5_MKEY_FLAG_SYNC_UMR = 1 << 29, + MLX5_MKEY_BSF_EN = 1 << 30, + MLX5_MKEY_LEN64 = 1 << 31, +}; + +struct mlx5_mkey_seg { + /* This is a two bit field occupying bits 31-30. 
+ * bit 31 is always 0, + * bit 30 is zero for regular MRs and 1 (e.g free) for UMRs that do not have tanslation + */ + u8 status; + u8 pcie_control; + u8 flags; + u8 version; + __be32 qpn_mkey7_0; + u8 rsvd1[4]; + __be32 flags_pd; + __be64 start_addr; + __be64 len; + __be32 bsfs_octo_size; + u8 rsvd2[16]; + __be32 xlt_oct_size; + u8 rsvd3[3]; + u8 log2_page_size; + u8 rsvd4[4]; +}; + +#define MLX5_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) + +enum { + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO = 1 << 0 +}; + +enum { + VPORT_STATE_DOWN = 0x0, + VPORT_STATE_UP = 0x1, +}; + +enum { + MLX5_ESW_VPORT_ADMIN_STATE_DOWN = 0x0, + MLX5_ESW_VPORT_ADMIN_STATE_UP = 0x1, + MLX5_ESW_VPORT_ADMIN_STATE_AUTO = 0x2, +}; + +enum { + MLX5_L3_PROT_TYPE_IPV4 = 0, + MLX5_L3_PROT_TYPE_IPV6 = 1, +}; + +enum { + MLX5_L4_PROT_TYPE_TCP = 0, + MLX5_L4_PROT_TYPE_UDP = 1, +}; + +enum { + MLX5_HASH_FIELD_SEL_SRC_IP = 1 << 0, + MLX5_HASH_FIELD_SEL_DST_IP = 1 << 1, + MLX5_HASH_FIELD_SEL_L4_SPORT = 1 << 2, + MLX5_HASH_FIELD_SEL_L4_DPORT = 1 << 3, + MLX5_HASH_FIELD_SEL_IPSEC_SPI = 1 << 4, +}; + +enum { + MLX5_MATCH_OUTER_HEADERS = 1 << 0, + MLX5_MATCH_MISC_PARAMETERS = 1 << 1, + MLX5_MATCH_INNER_HEADERS = 1 << 2, + +}; + +enum { + MLX5_FLOW_TABLE_TYPE_NIC_RCV = 0, + MLX5_FLOW_TABLE_TYPE_ESWITCH = 4, +}; + +enum { + MLX5_FLOW_CONTEXT_DEST_TYPE_VPORT = 0, + MLX5_FLOW_CONTEXT_DEST_TYPE_FLOW_TABLE = 1, + MLX5_FLOW_CONTEXT_DEST_TYPE_TIR = 2, +}; + +enum mlx5_list_type { + MLX5_NVPRT_LIST_TYPE_UC = 0x0, + MLX5_NVPRT_LIST_TYPE_MC = 0x1, + MLX5_NVPRT_LIST_TYPE_VLAN = 0x2, +}; + +enum { + MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE = 0x0, + MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM = 0x1, +}; + +enum mlx5_wol_mode { + MLX5_WOL_DISABLE = 0, + MLX5_WOL_SECURED_MAGIC = 1 << 1, + MLX5_WOL_MAGIC = 1 << 2, + MLX5_WOL_ARP = 1 << 3, + MLX5_WOL_BROADCAST = 1 << 4, + MLX5_WOL_MULTICAST = 1 << 5, + MLX5_WOL_UNICAST = 1 << 6, + MLX5_WOL_PHY_ACTIVITY = 1 << 7, +}; + +/* MLX5 DEV CAPs */ + +/* TODO: EAT.ME */ +enum mlx5_cap_mode { + HCA_CAP_OPMOD_GET_MAX = 0, + HCA_CAP_OPMOD_GET_CUR = 1, +}; + +enum mlx5_cap_type { + MLX5_CAP_GENERAL = 0, + MLX5_CAP_ETHERNET_OFFLOADS, + MLX5_CAP_ODP, + MLX5_CAP_ATOMIC, + MLX5_CAP_ROCE, + MLX5_CAP_IPOIB_OFFLOADS, + MLX5_CAP_IPOIB_ENHANCED_OFFLOADS, + MLX5_CAP_FLOW_TABLE, + MLX5_CAP_ESWITCH_FLOW_TABLE, + MLX5_CAP_ESWITCH, + MLX5_CAP_RESERVED, + MLX5_CAP_VECTOR_CALC, + MLX5_CAP_QOS, + MLX5_CAP_DEBUG, + MLX5_CAP_RESERVED_14, + MLX5_CAP_DEV_MEM, + /* NUM OF CAP Types */ + MLX5_CAP_NUM +}; + +enum mlx5_pcam_reg_groups { + MLX5_PCAM_REGS_5000_TO_507F = 0x0, +}; + +enum mlx5_pcam_feature_groups { + MLX5_PCAM_FEATURE_ENHANCED_FEATURES = 0x0, +}; + +enum mlx5_mcam_reg_groups { + MLX5_MCAM_REGS_FIRST_128 = 0x0, +}; + +enum mlx5_mcam_feature_groups { + MLX5_MCAM_FEATURE_ENHANCED_FEATURES = 0x0, +}; + +enum mlx5_qcam_reg_groups { + MLX5_QCAM_REGS_FIRST_128 = 0x0, +}; + +enum mlx5_qcam_feature_groups { + MLX5_QCAM_FEATURE_ENHANCED_FEATURES = 0x0, +}; + +/* GET Dev Caps macros */ +#define MLX5_CAP_GEN(mdev, cap) \ + MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap) + +#define MLX5_CAP_GEN_MAX(mdev, cap) \ + MLX5_GET(cmd_hca_cap, mdev->caps.hca_max[MLX5_CAP_GENERAL], cap) + +#define MLX5_CAP_ETH(mdev, cap) \ + MLX5_GET(per_protocol_networking_offload_caps,\ + mdev->caps.hca_cur[MLX5_CAP_ETHERNET_OFFLOADS], cap) + +#define MLX5_CAP_ETH_MAX(mdev, cap) \ + MLX5_GET(per_protocol_networking_offload_caps,\ + mdev->caps.hca_max[MLX5_CAP_ETHERNET_OFFLOADS], cap) + +#define MLX5_CAP_IPOIB_ENHANCED(mdev, cap) \ + 
MLX5_GET(per_protocol_networking_offload_caps,\ + mdev->caps.hca_cur[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS], cap) + +#define MLX5_CAP_ROCE(mdev, cap) \ + MLX5_GET(roce_cap, mdev->caps.hca_cur[MLX5_CAP_ROCE], cap) + +#define MLX5_CAP_ROCE_MAX(mdev, cap) \ + MLX5_GET(roce_cap, mdev->caps.hca_max[MLX5_CAP_ROCE], cap) + +#define MLX5_CAP_ATOMIC(mdev, cap) \ + MLX5_GET(atomic_caps, mdev->caps.hca_cur[MLX5_CAP_ATOMIC], cap) + +#define MLX5_CAP_ATOMIC_MAX(mdev, cap) \ + MLX5_GET(atomic_caps, mdev->caps.hca_max[MLX5_CAP_ATOMIC], cap) + +#define MLX5_CAP_FLOWTABLE(mdev, cap) \ + MLX5_GET(flow_table_nic_cap, mdev->caps.hca_cur[MLX5_CAP_FLOW_TABLE], cap) + +#define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \ + MLX5_GET(flow_table_nic_cap, mdev->caps.hca_max[MLX5_CAP_FLOW_TABLE], cap) + +#define MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) \ + MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.cap) + +#define MLX5_CAP_FLOWTABLE_NIC_RX_MAX(mdev, cap) \ + MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive.cap) + +#define MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) \ + MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive_sniffer.cap) + +#define MLX5_CAP_FLOWTABLE_SNIFFER_RX_MAX(mdev, cap) \ + MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive_sniffer.cap) + +#define MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) \ + MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_transmit_sniffer.cap) + +#define MLX5_CAP_FLOWTABLE_SNIFFER_TX_MAX(mdev, cap) \ + MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_transmit_sniffer.cap) + +#define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ + MLX5_GET(flow_table_eswitch_cap, \ + mdev->caps.hca_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + +#define MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, cap) \ + MLX5_GET(flow_table_eswitch_cap, \ + mdev->caps.hca_max[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + +#define MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_nic_esw_fdb.cap) + +#define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap) + +#define MLX5_CAP_ESW_EGRESS_ACL(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_esw_acl_egress.cap) + +#define MLX5_CAP_ESW_EGRESS_ACL_MAX(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_esw_acl_egress.cap) + +#define MLX5_CAP_ESW_INGRESS_ACL(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_esw_acl_ingress.cap) + +#define MLX5_CAP_ESW_INGRESS_ACL_MAX(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_esw_acl_ingress.cap) + +#define MLX5_CAP_ESW(mdev, cap) \ + MLX5_GET(e_switch_cap, \ + mdev->caps.hca_cur[MLX5_CAP_ESWITCH], cap) + +#define MLX5_CAP_ESW_MAX(mdev, cap) \ + MLX5_GET(e_switch_cap, \ + mdev->caps.hca_max[MLX5_CAP_ESWITCH], cap) + +#define MLX5_CAP_ODP(mdev, cap)\ + MLX5_GET(odp_cap, mdev->caps.hca_cur[MLX5_CAP_ODP], cap) + +#define MLX5_CAP_VECTOR_CALC(mdev, cap) \ + MLX5_GET(vector_calc_cap, \ + mdev->caps.hca_cur[MLX5_CAP_VECTOR_CALC], cap) + +#define MLX5_CAP_QOS(mdev, cap)\ + MLX5_GET(qos_cap, mdev->caps.hca_cur[MLX5_CAP_QOS], cap) + +#define MLX5_CAP_DEBUG(mdev, cap)\ + MLX5_GET(debug_cap, mdev->caps.hca_cur[MLX5_CAP_DEBUG], cap) + +#define MLX5_CAP_PCAM_FEATURE(mdev, fld) \ + MLX5_GET(pcam_reg, (mdev)->caps.pcam, feature_cap_mask.enhanced_features.fld) + +#define MLX5_CAP_MCAM_REG(mdev, reg) \ + MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_access_reg_cap_mask.access_regs.reg) + +#define MLX5_CAP_MCAM_FEATURE(mdev, fld) \ + MLX5_GET(mcam_reg, (mdev)->caps.mcam, 
mng_feature_cap_mask.enhanced_features.fld) + +#define MLX5_CAP_QCAM_REG(mdev, fld) \ + MLX5_GET(qcam_reg, (mdev)->caps.qcam, qos_access_reg_cap_mask.reg_cap.fld) + +#define MLX5_CAP_QCAM_FEATURE(mdev, fld) \ + MLX5_GET(qcam_reg, (mdev)->caps.qcam, qos_feature_cap_mask.feature_cap.fld) + +#define MLX5_CAP_FPGA(mdev, cap) \ + MLX5_GET(fpga_cap, (mdev)->caps.fpga, cap) + +#define MLX5_CAP64_FPGA(mdev, cap) \ + MLX5_GET64(fpga_cap, (mdev)->caps.fpga, cap) + +#define MLX5_CAP_DEV_MEM(mdev, cap)\ + MLX5_GET(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap) + +#define MLX5_CAP64_DEV_MEM(mdev, cap)\ + MLX5_GET64(device_mem_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_MEM], cap) + +enum { + MLX5_CMD_STAT_OK = 0x0, + MLX5_CMD_STAT_INT_ERR = 0x1, + MLX5_CMD_STAT_BAD_OP_ERR = 0x2, + MLX5_CMD_STAT_BAD_PARAM_ERR = 0x3, + MLX5_CMD_STAT_BAD_SYS_STATE_ERR = 0x4, + MLX5_CMD_STAT_BAD_RES_ERR = 0x5, + MLX5_CMD_STAT_RES_BUSY = 0x6, + MLX5_CMD_STAT_LIM_ERR = 0x8, + MLX5_CMD_STAT_BAD_RES_STATE_ERR = 0x9, + MLX5_CMD_STAT_IX_ERR = 0xa, + MLX5_CMD_STAT_NO_RES_ERR = 0xf, + MLX5_CMD_STAT_BAD_INP_LEN_ERR = 0x50, + MLX5_CMD_STAT_BAD_OUTP_LEN_ERR = 0x51, + MLX5_CMD_STAT_BAD_QP_STATE_ERR = 0x10, + MLX5_CMD_STAT_BAD_PKT_ERR = 0x30, + MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR = 0x40, +}; + +enum { + MLX5_IEEE_802_3_COUNTERS_GROUP = 0x0, + MLX5_RFC_2863_COUNTERS_GROUP = 0x1, + MLX5_RFC_2819_COUNTERS_GROUP = 0x2, + MLX5_RFC_3635_COUNTERS_GROUP = 0x3, + MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP = 0x5, + MLX5_PER_PRIORITY_COUNTERS_GROUP = 0x10, + MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11, + MLX5_PHYSICAL_LAYER_COUNTERS_GROUP = 0x12, + MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP = 0x16, + MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, +}; + +enum { + MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP = 0x0, +}; + +static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) +{ + if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE) + return 0; + return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; +} + +#define MLX5_BY_PASS_NUM_REGULAR_PRIOS 16 +#define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 16 +#define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1 +#define MLX5_BY_PASS_NUM_PRIOS (MLX5_BY_PASS_NUM_REGULAR_PRIOS +\ + MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS +\ + MLX5_BY_PASS_NUM_MULTICAST_PRIOS) + +#endif /* MLX5_DEVICE_H */ diff --git a/include/linux/mlx5/doorbell.h b/include/linux/mlx5/doorbell.h new file mode 100644 index 0000000..0787de2 --- /dev/null +++ b/include/linux/mlx5/doorbell.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
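The MLX5_CAP_* accessors above read the capability pages cached in mdev->caps, with hca_cur[] holding the currently enabled values and hca_max[] the device limits. A hedged usage sketch follows; the function name is hypothetical, and log_max_qp is assumed to be a field of the cmd_hca_cap layout in mlx5_ifc.

#include <linux/mlx5/driver.h>

static bool example_supports_many_qps(struct mlx5_core_dev *mdev)
{
        /* current (negotiated) value vs. the maximum the device could expose */
        u8 cur = MLX5_CAP_GEN(mdev, log_max_qp);
        u8 max = MLX5_CAP_GEN_MAX(mdev, log_max_qp);

        return cur >= 17 && cur <= max; /* at least 2^17 QPs currently enabled */
}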
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_DOORBELL_H +#define MLX5_DOORBELL_H + +#define MLX5_BF_OFFSET 0x800 +#define MLX5_CQ_DOORBELL 0x20 + +#if BITS_PER_LONG == 64 +/* Assume that we can just write a 64-bit doorbell atomically. s390 + * actually doesn't have writeq() but S/390 systems don't even have + * PCI so we won't worry about it. + */ + +#define MLX5_DECLARE_DOORBELL_LOCK(name) +#define MLX5_INIT_DOORBELL_LOCK(ptr) do { } while (0) +#define MLX5_GET_DOORBELL_LOCK(ptr) (NULL) + +static inline void mlx5_write64(__be32 val[2], void __iomem *dest, + spinlock_t *doorbell_lock) +{ + __raw_writeq(*(u64 *)val, dest); +} + +#else + +/* Just fall back to a spinlock to protect the doorbell if + * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit + * MMIO writes. + */ + +#define MLX5_DECLARE_DOORBELL_LOCK(name) spinlock_t name; +#define MLX5_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr) +#define MLX5_GET_DOORBELL_LOCK(ptr) (ptr) + +static inline void mlx5_write64(__be32 val[2], void __iomem *dest, + spinlock_t *doorbell_lock) +{ + unsigned long flags; + + if (doorbell_lock) + spin_lock_irqsave(doorbell_lock, flags); + __raw_writel((__force u32) val[0], dest); + __raw_writel((__force u32) val[1], dest + 4); + if (doorbell_lock) + spin_unlock_irqrestore(doorbell_lock, flags); +} + +#endif + +#endif /* MLX5_DOORBELL_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h new file mode 100644 index 0000000..d703774 --- /dev/null +++ b/include/linux/mlx5/driver.h @@ -0,0 +1,1292 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
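mlx5_write64() above is the primitive behind doorbell rings such as mlx5_cq_arm(): on 64-bit kernels it is a single atomic 8-byte MMIO write and the lock argument is never evaluated, while the 32-bit fallback takes the optional spinlock around two 4-byte writes. The MLX5_*_DOORBELL_LOCK macros let a caller declare that lock only when it actually exists. A hypothetical ring structure using them might look like the sketch below (illustrative only; all names are invented for the example).

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/mlx5/doorbell.h>

struct example_ring {
        void __iomem *db_reg;                   /* mapped doorbell register */
        MLX5_DECLARE_DOORBELL_LOCK(db_lock)     /* expands to nothing on 64-bit */
};

static void example_ring_init(struct example_ring *ring)
{
        MLX5_INIT_DOORBELL_LOCK(&ring->db_lock);        /* no-op on 64-bit */
}

static void example_ring_doorbell(struct example_ring *ring, __be32 db[2])
{
        /* MLX5_GET_DOORBELL_LOCK() is (NULL) on 64-bit, so no lock is taken there */
        mlx5_write64(db, ring->db_reg, MLX5_GET_DOORBELL_LOCK(&ring->db_lock));
}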
+ */ + +#ifndef MLX5_DRIVER_H +#define MLX5_DRIVER_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +enum { + MLX5_BOARD_ID_LEN = 64, + MLX5_MAX_NAME_LEN = 16, +}; + +enum { + /* one minute for the sake of bringup. Generally, commands must always + * complete and we may need to increase this timeout value + */ + MLX5_CMD_TIMEOUT_MSEC = 60 * 1000, + MLX5_CMD_WQ_MAX_NAME = 32, +}; + +enum { + CMD_OWNER_SW = 0x0, + CMD_OWNER_HW = 0x1, + CMD_STATUS_SUCCESS = 0, +}; + +enum mlx5_sqp_t { + MLX5_SQP_SMI = 0, + MLX5_SQP_GSI = 1, + MLX5_SQP_IEEE_1588 = 2, + MLX5_SQP_SNIFFER = 3, + MLX5_SQP_SYNC_UMR = 4, +}; + +enum { + MLX5_MAX_PORTS = 2, +}; + +enum { + MLX5_EQ_VEC_PAGES = 0, + MLX5_EQ_VEC_CMD = 1, + MLX5_EQ_VEC_ASYNC = 2, + MLX5_EQ_VEC_PFAULT = 3, + MLX5_EQ_VEC_COMP_BASE, +}; + +enum { + MLX5_MAX_IRQ_NAME = 32 +}; + +enum { + MLX5_ATOMIC_MODE_IB_COMP = 1 << 16, + MLX5_ATOMIC_MODE_CX = 2 << 16, + MLX5_ATOMIC_MODE_8B = 3 << 16, + MLX5_ATOMIC_MODE_16B = 4 << 16, + MLX5_ATOMIC_MODE_32B = 5 << 16, + MLX5_ATOMIC_MODE_64B = 6 << 16, + MLX5_ATOMIC_MODE_128B = 7 << 16, + MLX5_ATOMIC_MODE_256B = 8 << 16, +}; + +enum { + MLX5_REG_QPTS = 0x4002, + MLX5_REG_QETCR = 0x4005, + MLX5_REG_QTCT = 0x400a, + MLX5_REG_QPDPM = 0x4013, + MLX5_REG_QCAM = 0x4019, + MLX5_REG_DCBX_PARAM = 0x4020, + MLX5_REG_DCBX_APP = 0x4021, + MLX5_REG_FPGA_CAP = 0x4022, + MLX5_REG_FPGA_CTRL = 0x4023, + MLX5_REG_FPGA_ACCESS_REG = 0x4024, + MLX5_REG_PCAP = 0x5001, + MLX5_REG_PMTU = 0x5003, + MLX5_REG_PTYS = 0x5004, + MLX5_REG_PAOS = 0x5006, + MLX5_REG_PFCC = 0x5007, + MLX5_REG_PPCNT = 0x5008, + MLX5_REG_PMAOS = 0x5012, + MLX5_REG_PUDE = 0x5009, + MLX5_REG_PMPE = 0x5010, + MLX5_REG_PELC = 0x500e, + MLX5_REG_PVLC = 0x500f, + MLX5_REG_PCMR = 0x5041, + MLX5_REG_PMLP = 0x5002, + MLX5_REG_PCAM = 0x507f, + MLX5_REG_NODE_DESC = 0x6001, + MLX5_REG_HOST_ENDIANNESS = 0x7004, + MLX5_REG_MCIA = 0x9014, + MLX5_REG_MLCR = 0x902b, + MLX5_REG_MPCNT = 0x9051, + MLX5_REG_MTPPS = 0x9053, + MLX5_REG_MTPPSE = 0x9054, + MLX5_REG_MCQI = 0x9061, + MLX5_REG_MCC = 0x9062, + MLX5_REG_MCDA = 0x9063, + MLX5_REG_MCAM = 0x907f, +}; + +enum mlx5_qpts_trust_state { + MLX5_QPTS_TRUST_PCP = 1, + MLX5_QPTS_TRUST_DSCP = 2, +}; + +enum mlx5_dcbx_oper_mode { + MLX5E_DCBX_PARAM_VER_OPER_HOST = 0x0, + MLX5E_DCBX_PARAM_VER_OPER_AUTO = 0x3, +}; + +enum mlx5_dct_atomic_mode { + MLX5_ATOMIC_MODE_DCT_OFF = 20, + MLX5_ATOMIC_MODE_DCT_NONE = 0 << MLX5_ATOMIC_MODE_DCT_OFF, + MLX5_ATOMIC_MODE_DCT_IB_COMP = 1 << MLX5_ATOMIC_MODE_DCT_OFF, + MLX5_ATOMIC_MODE_DCT_CX = 2 << MLX5_ATOMIC_MODE_DCT_OFF, +}; + +enum { + MLX5_ATOMIC_OPS_CMP_SWAP = 1 << 0, + MLX5_ATOMIC_OPS_FETCH_ADD = 1 << 1, +}; + +enum mlx5_page_fault_resume_flags { + MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0, + MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1, + MLX5_PAGE_FAULT_RESUME_RDMA = 1 << 2, + MLX5_PAGE_FAULT_RESUME_ERROR = 1 << 7, +}; + +enum dbg_rsc_type { + MLX5_DBG_RSC_QP, + MLX5_DBG_RSC_EQ, + MLX5_DBG_RSC_CQ, +}; + +enum port_state_policy { + MLX5_POLICY_DOWN = 0, + MLX5_POLICY_UP = 1, + MLX5_POLICY_FOLLOW = 2, + MLX5_POLICY_INVALID = 0xffffffff +}; + +struct mlx5_field_desc { + struct dentry *dent; + int i; +}; + +struct mlx5_rsc_debug { + struct mlx5_core_dev *dev; + void *object; + enum dbg_rsc_type type; + struct dentry *root; + struct mlx5_field_desc fields[0]; +}; + +enum mlx5_dev_event { + MLX5_DEV_EVENT_SYS_ERROR, + MLX5_DEV_EVENT_PORT_UP, + MLX5_DEV_EVENT_PORT_DOWN, + 
MLX5_DEV_EVENT_PORT_INITIALIZED, + MLX5_DEV_EVENT_LID_CHANGE, + MLX5_DEV_EVENT_PKEY_CHANGE, + MLX5_DEV_EVENT_GUID_CHANGE, + MLX5_DEV_EVENT_CLIENT_REREG, + MLX5_DEV_EVENT_PPS, + MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT, +}; + +enum mlx5_port_status { + MLX5_PORT_UP = 1, + MLX5_PORT_DOWN = 2, +}; + +enum mlx5_eq_type { + MLX5_EQ_TYPE_COMP, + MLX5_EQ_TYPE_ASYNC, +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + MLX5_EQ_TYPE_PF, +#endif +}; + +struct mlx5_bfreg_info { + u32 *sys_pages; + int num_low_latency_bfregs; + unsigned int *count; + + /* + * protect bfreg allocation data structs + */ + struct mutex lock; + u32 ver; + bool lib_uar_4k; + u32 num_sys_pages; + u32 num_static_sys_pages; + u32 total_num_bfregs; + u32 num_dyn_bfregs; +}; + +struct mlx5_cmd_first { + __be32 data[4]; +}; + +struct mlx5_cmd_msg { + struct list_head list; + struct cmd_msg_cache *parent; + u32 len; + struct mlx5_cmd_first first; + struct mlx5_cmd_mailbox *next; +}; + +struct mlx5_cmd_debug { + struct dentry *dbg_root; + struct dentry *dbg_in; + struct dentry *dbg_out; + struct dentry *dbg_outlen; + struct dentry *dbg_status; + struct dentry *dbg_run; + void *in_msg; + void *out_msg; + u8 status; + u16 inlen; + u16 outlen; +}; + +struct cmd_msg_cache { + /* protect block chain allocations + */ + spinlock_t lock; + struct list_head head; + unsigned int max_inbox_size; + unsigned int num_ent; +}; + +enum { + MLX5_NUM_COMMAND_CACHES = 5, +}; + +struct mlx5_cmd_stats { + u64 sum; + u64 n; + struct dentry *root; + struct dentry *avg; + struct dentry *count; + /* protect command average calculations */ + spinlock_t lock; +}; + +struct mlx5_cmd { + void *cmd_alloc_buf; + dma_addr_t alloc_dma; + int alloc_size; + void *cmd_buf; + dma_addr_t dma; + u16 cmdif_rev; + u8 log_sz; + u8 log_stride; + int max_reg_cmds; + int events; + u32 __iomem *vector; + + /* protect command queue allocations + */ + spinlock_t alloc_lock; + + /* protect token allocations + */ + spinlock_t token_lock; + u8 token; + unsigned long bitmask; + char wq_name[MLX5_CMD_WQ_MAX_NAME]; + struct workqueue_struct *wq; + struct semaphore sem; + struct semaphore pages_sem; + int mode; + struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS]; + struct dma_pool *pool; + struct mlx5_cmd_debug dbg; + struct cmd_msg_cache cache[MLX5_NUM_COMMAND_CACHES]; + int checksum_disabled; + struct mlx5_cmd_stats stats[MLX5_CMD_OP_MAX]; +}; + +struct mlx5_port_caps { + int gid_table_len; + int pkey_table_len; + u8 ext_port_cap; + bool has_smi; +}; + +struct mlx5_cmd_mailbox { + void *buf; + dma_addr_t dma; + struct mlx5_cmd_mailbox *next; +}; + +struct mlx5_buf_list { + void *buf; + dma_addr_t map; +}; + +struct mlx5_frag_buf { + struct mlx5_buf_list *frags; + int npages; + int size; + u8 page_shift; +}; + +struct mlx5_frag_buf_ctrl { + struct mlx5_frag_buf frag_buf; + u32 sz_m1; + u32 frag_sz_m1; + u8 log_sz; + u8 log_stride; + u8 log_frag_strides; +}; + +struct mlx5_eq_tasklet { + struct list_head list; + struct list_head process_list; + struct tasklet_struct task; + /* lock on completion tasklet list */ + spinlock_t lock; +}; + +struct mlx5_eq_pagefault { + struct work_struct work; + /* Pagefaults lock */ + spinlock_t lock; + struct workqueue_struct *wq; + mempool_t *pool; +}; + +struct mlx5_cq_table { + /* protect radix tree */ + spinlock_t lock; + struct radix_tree_root tree; +}; + +struct mlx5_eq { + struct mlx5_core_dev *dev; + struct mlx5_cq_table cq_table; + __be32 __iomem *doorbell; + u32 cons_index; + struct mlx5_frag_buf buf; + int size; + unsigned int irqn; + u8 eqn; + int nent; 
+ u64 mask; + struct list_head list; + int index; + struct mlx5_rsc_debug *dbg; + enum mlx5_eq_type type; + union { + struct mlx5_eq_tasklet tasklet_ctx; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct mlx5_eq_pagefault pf_ctx; +#endif + }; +}; + +struct mlx5_core_psv { + u32 psv_idx; + struct psv_layout { + u32 pd; + u16 syndrome; + u16 reserved; + u16 bg; + u16 app_tag; + u32 ref_tag; + } psv; +}; + +struct mlx5_core_sig_ctx { + struct mlx5_core_psv psv_memory; + struct mlx5_core_psv psv_wire; + struct ib_sig_err err_item; + bool sig_status_checked; + bool sig_err_exists; + u32 sigerr_count; +}; + +enum { + MLX5_MKEY_MR = 1, + MLX5_MKEY_MW, +}; + +struct mlx5_core_mkey { + u64 iova; + u64 size; + u32 key; + u32 pd; + u32 type; +}; + +#define MLX5_24BIT_MASK ((1 << 24) - 1) + +enum mlx5_res_type { + MLX5_RES_QP = MLX5_EVENT_QUEUE_TYPE_QP, + MLX5_RES_RQ = MLX5_EVENT_QUEUE_TYPE_RQ, + MLX5_RES_SQ = MLX5_EVENT_QUEUE_TYPE_SQ, + MLX5_RES_SRQ = 3, + MLX5_RES_XSRQ = 4, + MLX5_RES_XRQ = 5, + MLX5_RES_DCT = MLX5_EVENT_QUEUE_TYPE_DCT, +}; + +struct mlx5_core_rsc_common { + enum mlx5_res_type res; + atomic_t refcount; + struct completion free; +}; + +struct mlx5_core_srq { + struct mlx5_core_rsc_common common; /* must be first */ + u32 srqn; + int max; + size_t max_gs; + size_t max_avail_gather; + int wqe_shift; + void (*event) (struct mlx5_core_srq *, enum mlx5_event); + + atomic_t refcount; + struct completion free; +}; + +struct mlx5_eq_table { + void __iomem *update_ci; + void __iomem *update_arm_ci; + struct list_head comp_eqs_list; + struct mlx5_eq pages_eq; + struct mlx5_eq async_eq; + struct mlx5_eq cmd_eq; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct mlx5_eq pfault_eq; +#endif + int num_comp_vectors; + /* protect EQs list + */ + spinlock_t lock; +}; + +struct mlx5_uars_page { + void __iomem *map; + bool wc; + u32 index; + struct list_head list; + unsigned int bfregs; + unsigned long *reg_bitmap; /* for non fast path bf regs */ + unsigned long *fp_bitmap; + unsigned int reg_avail; + unsigned int fp_avail; + struct kref ref_count; + struct mlx5_core_dev *mdev; +}; + +struct mlx5_bfreg_head { + /* protect blue flame registers allocations */ + struct mutex lock; + struct list_head list; +}; + +struct mlx5_bfreg_data { + struct mlx5_bfreg_head reg_head; + struct mlx5_bfreg_head wc_head; +}; + +struct mlx5_sq_bfreg { + void __iomem *map; + struct mlx5_uars_page *up; + bool wc; + u32 index; + unsigned int offset; +}; + +struct mlx5_core_health { + struct health_buffer __iomem *health; + __be32 __iomem *health_counter; + struct timer_list timer; + u32 prev; + int miss_counter; + bool sick; + /* wq spinlock to synchronize draining */ + spinlock_t wq_lock; + struct workqueue_struct *wq; + unsigned long flags; + struct work_struct work; + struct delayed_work recover_work; +}; + +struct mlx5_qp_table { + /* protect radix tree + */ + spinlock_t lock; + struct radix_tree_root tree; +}; + +struct mlx5_srq_table { + /* protect radix tree + */ + spinlock_t lock; + struct radix_tree_root tree; +}; + +struct mlx5_mkey_table { + /* protect radix tree + */ + rwlock_t lock; + struct radix_tree_root tree; +}; + +struct mlx5_vf_context { + int enabled; + u64 port_guid; + u64 node_guid; + enum port_state_policy policy; +}; + +struct mlx5_core_sriov { + struct mlx5_vf_context *vfs_ctx; + int num_vfs; + int enabled_vfs; +}; + +struct mlx5_irq_info { + cpumask_var_t mask; + char name[MLX5_MAX_IRQ_NAME]; +}; + +struct mlx5_fc_stats { + struct rb_root counters; + struct list_head addlist; + /* protect addlist 
add/splice operations */ + spinlock_t addlist_lock; + + struct workqueue_struct *wq; + struct delayed_work work; + unsigned long next_query; + unsigned long sampling_interval; /* jiffies */ +}; + +struct mlx5_mpfs; +struct mlx5_eswitch; +struct mlx5_lag; +struct mlx5_pagefault; + +struct mlx5_rate_limit { + u32 rate; + u32 max_burst_sz; + u16 typical_pkt_sz; +}; + +struct mlx5_rl_entry { + struct mlx5_rate_limit rl; + u16 index; + u16 refcount; +}; + +struct mlx5_rl_table { + /* protect rate limit table */ + struct mutex rl_lock; + u16 max_size; + u32 max_rate; + u32 min_rate; + struct mlx5_rl_entry *rl_entry; +}; + +enum port_module_event_status_type { + MLX5_MODULE_STATUS_PLUGGED = 0x1, + MLX5_MODULE_STATUS_UNPLUGGED = 0x2, + MLX5_MODULE_STATUS_ERROR = 0x3, + MLX5_MODULE_STATUS_NUM = 0x3, +}; + +enum port_module_event_error_type { + MLX5_MODULE_EVENT_ERROR_POWER_BUDGET_EXCEEDED, + MLX5_MODULE_EVENT_ERROR_LONG_RANGE_FOR_NON_MLNX_CABLE_MODULE, + MLX5_MODULE_EVENT_ERROR_BUS_STUCK, + MLX5_MODULE_EVENT_ERROR_NO_EEPROM_RETRY_TIMEOUT, + MLX5_MODULE_EVENT_ERROR_ENFORCE_PART_NUMBER_LIST, + MLX5_MODULE_EVENT_ERROR_UNKNOWN_IDENTIFIER, + MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE, + MLX5_MODULE_EVENT_ERROR_BAD_CABLE, + MLX5_MODULE_EVENT_ERROR_UNKNOWN, + MLX5_MODULE_EVENT_ERROR_NUM, +}; + +struct mlx5_port_module_event_stats { + u64 status_counters[MLX5_MODULE_STATUS_NUM]; + u64 error_counters[MLX5_MODULE_EVENT_ERROR_NUM]; +}; + +struct mlx5_priv { + char name[MLX5_MAX_NAME_LEN]; + struct mlx5_eq_table eq_table; + struct mlx5_irq_info *irq_info; + + /* pages stuff */ + struct workqueue_struct *pg_wq; + struct rb_root page_root; + int fw_pages; + atomic_t reg_pages; + struct list_head free_list; + int vfs_pages; + + struct mlx5_core_health health; + + struct mlx5_srq_table srq_table; + + /* start: qp staff */ + struct mlx5_qp_table qp_table; + struct dentry *qp_debugfs; + struct dentry *eq_debugfs; + struct dentry *cq_debugfs; + struct dentry *cmdif_debugfs; + /* end: qp staff */ + + /* start: mkey staff */ + struct mlx5_mkey_table mkey_table; + /* end: mkey staff */ + + /* start: alloc staff */ + /* protect buffer alocation according to numa node */ + struct mutex alloc_mutex; + int numa_node; + + struct mutex pgdir_mutex; + struct list_head pgdir_list; + /* end: alloc staff */ + struct dentry *dbg_root; + + /* protect mkey key part */ + spinlock_t mkey_lock; + u8 mkey_key; + + struct list_head dev_list; + struct list_head ctx_list; + spinlock_t ctx_lock; + + struct list_head waiting_events_list; + bool is_accum_events; + + struct mlx5_flow_steering *steering; + struct mlx5_mpfs *mpfs; + struct mlx5_eswitch *eswitch; + struct mlx5_core_sriov sriov; + struct mlx5_lag *lag; + unsigned long pci_dev_data; + struct mlx5_fc_stats fc_stats; + struct mlx5_rl_table rl_table; + + struct mlx5_port_module_event_stats pme_stats; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + void (*pfault)(struct mlx5_core_dev *dev, + void *context, + struct mlx5_pagefault *pfault); + void *pfault_ctx; + struct srcu_struct pfault_srcu; +#endif + struct mlx5_bfreg_data bfregs; + struct mlx5_uars_page *uar; +}; + +enum mlx5_device_state { + MLX5_DEVICE_STATE_UP, + MLX5_DEVICE_STATE_INTERNAL_ERROR, +}; + +enum mlx5_interface_state { + MLX5_INTERFACE_STATE_UP = BIT(0), +}; + +enum mlx5_pci_status { + MLX5_PCI_STATUS_DISABLED, + MLX5_PCI_STATUS_ENABLED, +}; + +enum mlx5_pagefault_type_flags { + MLX5_PFAULT_REQUESTOR = 1 << 0, + MLX5_PFAULT_WRITE = 1 << 1, + MLX5_PFAULT_RDMA = 1 << 2, +}; + +/* Contains the details of a pagefault. 
*/ +struct mlx5_pagefault { + u32 bytes_committed; + u32 token; + u8 event_subtype; + u8 type; + union { + /* Initiator or send message responder pagefault details. */ + struct { + /* Received packet size, only valid for responders. */ + u32 packet_size; + /* + * Number of resource holding WQE, depends on type. + */ + u32 wq_num; + /* + * WQE index. Refers to either the send queue or + * receive queue, according to event_subtype. + */ + u16 wqe_index; + } wqe; + /* RDMA responder pagefault details */ + struct { + u32 r_key; + /* + * Received packet size, minimal size page fault + * resolution required for forward progress. + */ + u32 packet_size; + u32 rdma_op_len; + u64 rdma_va; + } rdma; + }; + + struct mlx5_eq *eq; + struct work_struct work; +}; + +struct mlx5_td { + struct list_head tirs_list; + u32 tdn; +}; + +struct mlx5e_resources { + u32 pdn; + struct mlx5_td td; + struct mlx5_core_mkey mkey; + struct mlx5_sq_bfreg bfreg; +}; + +#define MLX5_MAX_RESERVED_GIDS 8 + +struct mlx5_rsvd_gids { + unsigned int start; + unsigned int count; + struct ida ida; +}; + +#define MAX_PIN_NUM 8 +struct mlx5_pps { + u8 pin_caps[MAX_PIN_NUM]; + struct work_struct out_work; + u64 start[MAX_PIN_NUM]; + u8 enabled; +}; + +struct mlx5_clock { + rwlock_t lock; + struct cyclecounter cycles; + struct timecounter tc; + struct hwtstamp_config hwtstamp_config; + u32 nominal_c_mult; + unsigned long overflow_period; + struct delayed_work overflow_work; + struct mlx5_core_dev *mdev; + struct ptp_clock *ptp; + struct ptp_clock_info ptp_info; + struct mlx5_pps pps_info; +}; + +struct mlx5_core_dev { + struct pci_dev *pdev; + /* sync pci state */ + struct mutex pci_status_mutex; + enum mlx5_pci_status pci_status; + u8 rev_id; + char board_id[MLX5_BOARD_ID_LEN]; + struct mlx5_cmd cmd; + struct mlx5_port_caps port_caps[MLX5_MAX_PORTS]; + struct { + u32 hca_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; + u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; + u32 pcam[MLX5_ST_SZ_DW(pcam_reg)]; + u32 mcam[MLX5_ST_SZ_DW(mcam_reg)]; + u32 fpga[MLX5_ST_SZ_DW(fpga_cap)]; + u32 qcam[MLX5_ST_SZ_DW(qcam_reg)]; + } caps; + phys_addr_t iseg_base; + struct mlx5_init_seg __iomem *iseg; + enum mlx5_device_state state; + /* sync interface state */ + struct mutex intf_state_mutex; + unsigned long intf_state; + void (*event) (struct mlx5_core_dev *dev, + enum mlx5_dev_event event, + unsigned long param); + struct mlx5_priv priv; + struct mlx5_profile *profile; + atomic_t num_qps; + u32 issi; + struct mlx5e_resources mlx5e_res; + struct { + struct mlx5_rsvd_gids reserved_gids; + u32 roce_en; + } roce; +#ifdef CONFIG_MLX5_FPGA + struct mlx5_fpga_device *fpga; +#endif +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap; +#endif + struct mlx5_clock clock; + struct mlx5_ib_clock_info *clock_info; + struct page *clock_info_page; +}; + +struct mlx5_db { + __be32 *db; + union { + struct mlx5_db_pgdir *pgdir; + struct mlx5_ib_user_db_page *user_page; + } u; + dma_addr_t dma; + int index; +}; + +enum { + MLX5_COMP_EQ_SIZE = 1024, +}; + +enum { + MLX5_PTYS_IB = 1 << 0, + MLX5_PTYS_EN = 1 << 2, +}; + +typedef void (*mlx5_cmd_cbk_t)(int status, void *context); + +enum { + MLX5_CMD_ENT_STATE_PENDING_COMP, +}; + +struct mlx5_cmd_work_ent { + unsigned long state; + struct mlx5_cmd_msg *in; + struct mlx5_cmd_msg *out; + void *uout; + int uout_size; + mlx5_cmd_cbk_t callback; + struct delayed_work cb_timeout_work; + void *context; + int idx; + struct completion done; + struct mlx5_cmd *cmd; + struct work_struct work; + struct mlx5_cmd_layout *lay; + 
int ret; + int page_queue; + u8 status; + u8 token; + u64 ts1; + u64 ts2; + u16 op; + bool polling; +}; + +struct mlx5_pas { + u64 pa; + u8 log_sz; +}; + +enum phy_port_state { + MLX5_AAA_111 +}; + +struct mlx5_hca_vport_context { + u32 field_select; + bool sm_virt_aware; + bool has_smi; + bool has_raw; + enum port_state_policy policy; + enum phy_port_state phys_state; + enum ib_port_state vport_state; + u8 port_physical_state; + u64 sys_image_guid; + u64 port_guid; + u64 node_guid; + u32 cap_mask1; + u32 cap_mask1_perm; + u32 cap_mask2; + u32 cap_mask2_perm; + u16 lid; + u8 init_type_reply; /* bitmask: see ib spec 14.2.5.6 InitTypeReply */ + u8 lmc; + u8 subnet_timeout; + u16 sm_lid; + u8 sm_sl; + u16 qkey_violation_counter; + u16 pkey_violation_counter; + bool grh_required; +}; + +static inline void *mlx5_buf_offset(struct mlx5_frag_buf *buf, int offset) +{ + return buf->frags->buf + offset; +} + +#define STRUCT_FIELD(header, field) \ + .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ + .struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field + +static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev) +{ + return pci_get_drvdata(pdev); +} + +extern struct dentry *mlx5_debugfs_root; + +static inline u16 fw_rev_maj(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->fw_rev) & 0xffff; +} + +static inline u16 fw_rev_min(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->fw_rev) >> 16; +} + +static inline u16 fw_rev_sub(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->cmdif_rev_fw_sub) & 0xffff; +} + +static inline u16 cmdif_rev(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16; +} + +static inline u32 mlx5_base_mkey(const u32 key) +{ + return key & 0xffffff00u; +} + +static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc, + void *cqc) +{ + fbc->log_stride = 6 + MLX5_GET(cqc, cqc, cqe_sz); + fbc->log_sz = MLX5_GET(cqc, cqc, log_cq_size); + fbc->sz_m1 = (1 << fbc->log_sz) - 1; + fbc->log_frag_strides = PAGE_SHIFT - fbc->log_stride; + fbc->frag_sz_m1 = (1 << fbc->log_frag_strides) - 1; +} + +static inline void *mlx5_frag_buf_get_wqe(struct mlx5_frag_buf_ctrl *fbc, + u32 ix) +{ + unsigned int frag = (ix >> fbc->log_frag_strides); + + return fbc->frag_buf.frags[frag].buf + + ((fbc->frag_sz_m1 & ix) << fbc->log_stride); +} + +int mlx5_cmd_init(struct mlx5_core_dev *dev); +void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); +void mlx5_cmd_use_events(struct mlx5_core_dev *dev); +void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); + +int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, + int out_size); +int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size, mlx5_cmd_cbk_t callback, + void *context); +int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size); +void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); + +int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); +int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); +int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); +void mlx5_health_cleanup(struct mlx5_core_dev *dev); +int mlx5_health_init(struct mlx5_core_dev *dev); +void mlx5_start_health_poll(struct mlx5_core_dev *dev); +void mlx5_stop_health_poll(struct mlx5_core_dev *dev); +void mlx5_drain_health_wq(struct mlx5_core_dev *dev); +void mlx5_trigger_health_work(struct 
mlx5_core_dev *dev); +void mlx5_drain_health_recovery(struct mlx5_core_dev *dev); +int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, + struct mlx5_frag_buf *buf, int node); +int mlx5_buf_alloc(struct mlx5_core_dev *dev, + int size, struct mlx5_frag_buf *buf); +void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf); +int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, + struct mlx5_frag_buf *buf, int node); +void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf); +struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev, + gfp_t flags, int npages); +void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev, + struct mlx5_cmd_mailbox *head); +int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in); +int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq); +int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out); +int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + u16 lwm, int is_srq); +void mlx5_init_mkey_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev); +int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, + u32 *in, int inlen, + u32 *out, int outlen, + mlx5_cmd_cbk_t callback, void *context); +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, + u32 *in, int inlen); +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey); +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, + u32 *out, int outlen); +int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, + u32 *mkey); +int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); +int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn); +int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, + u16 opmod, u8 port); +void mlx5_pagealloc_init(struct mlx5_core_dev *dev); +void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); +int mlx5_pagealloc_start(struct mlx5_core_dev *dev); +void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); +void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, + s32 npages); +int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); +int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev); +void mlx5_register_debugfs(void); +void mlx5_unregister_debugfs(void); + +void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas); +void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); +void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); +void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); +struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, + unsigned int *irqn); +int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); +int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); + +int mlx5_qp_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, + int size_in, void *data_out, int size_out, + u16 reg_num, int arg, int write); + +int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db); +int 
mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, + int node); +void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db); + +const char *mlx5_command_str(int command); +int mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, + int npsvs, u32 *sig_index); +int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); +int mlx5_query_odp_caps(struct mlx5_core_dev *dev, + struct mlx5_odp_caps *odp_caps); +int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, + u8 port_num, void *out, size_t sz); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token, + u32 wq_num, u8 type, int error); +#endif + +int mlx5_init_rl_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev); +int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index, + struct mlx5_rate_limit *rl); +void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl); +bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate); +bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0, + struct mlx5_rate_limit *rl_1); +int mlx5_alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg, + bool map_wc, bool fast_path); +void mlx5_free_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg); + +unsigned int mlx5_core_reserved_gids_count(struct mlx5_core_dev *dev); +int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, + u8 roce_version, u8 roce_l3_type, const u8 *gid, + const u8 *mac, bool vlan, u16 vlan_id, u8 port_num); + +static inline int fw_initializing(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->initializing) >> 31; +} + +static inline u32 mlx5_mkey_to_idx(u32 mkey) +{ + return mkey >> 8; +} + +static inline u32 mlx5_idx_to_mkey(u32 mkey_idx) +{ + return mkey_idx << 8; +} + +static inline u8 mlx5_mkey_variant(u32 mkey) +{ + return mkey & 0xff; +} + +enum { + MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, + MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, +}; + +enum { + MR_CACHE_LAST_STD_ENTRY = 20, + MLX5_IMR_MTT_CACHE_ENTRY, + MLX5_IMR_KSM_CACHE_ENTRY, + MAX_MR_CACHE_ENTRIES +}; + +enum { + MLX5_INTERFACE_PROTOCOL_IB = 0, + MLX5_INTERFACE_PROTOCOL_ETH = 1, +}; + +struct mlx5_interface { + void * (*add)(struct mlx5_core_dev *dev); + void (*remove)(struct mlx5_core_dev *dev, void *context); + int (*attach)(struct mlx5_core_dev *dev, void *context); + void (*detach)(struct mlx5_core_dev *dev, void *context); + void (*event)(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param); + void (*pfault)(struct mlx5_core_dev *dev, + void *context, + struct mlx5_pagefault *pfault); + void * (*get_dev)(void *context); + int protocol; + struct list_head list; +}; + +void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol); +int mlx5_register_interface(struct mlx5_interface *intf); +void mlx5_unregister_interface(struct mlx5_interface *intf); +int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); + +int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev); +int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); +bool mlx5_lag_is_active(struct mlx5_core_dev *dev); +struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); +int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, + u64 *values, + int 
num_counters, + size_t *offsets); +struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); +void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); + +#ifndef CONFIG_MLX5_CORE_IPOIB +static inline +struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, + struct ib_device *ibdev, + const char *name, + void (*setup)(struct net_device *)) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void mlx5_rdma_netdev_free(struct net_device *netdev) {} +#else +struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, + struct ib_device *ibdev, + const char *name, + void (*setup)(struct net_device *)); +void mlx5_rdma_netdev_free(struct net_device *netdev); +#endif /* CONFIG_MLX5_CORE_IPOIB */ + +struct mlx5_profile { + u64 mask; + u8 log_max_qp; + struct { + int size; + int limit; + } mr_cache[MAX_MR_CACHE_ENTRIES]; +}; + +enum { + MLX5_PCI_DEV_IS_VF = 1 << 0, +}; + +static inline int mlx5_core_is_pf(struct mlx5_core_dev *dev) +{ + return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF); +} + +#define MLX5_TOTAL_VPORTS(mdev) (1 + pci_sriov_get_totalvfs((mdev)->pdev)) +#define MLX5_VPORT_MANAGER(mdev) \ + (MLX5_CAP_GEN(mdev, vport_group_manager) && \ + (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \ + mlx5_core_is_pf(mdev)) + +static inline int mlx5_get_gid_table_len(u16 param) +{ + if (param > 4) { + pr_warn("gid table length is zero\n"); + return 0; + } + + return 8 * (1 << param); +} + +static inline bool mlx5_rl_is_supported(struct mlx5_core_dev *dev) +{ + return !!(dev->priv.rl_table.max_size); +} + +static inline int mlx5_core_is_mp_slave(struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, affiliate_nic_vport_criteria) && + MLX5_CAP_GEN(dev, num_vhca_ports) <= 1; +} + +static inline int mlx5_core_is_mp_master(struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, num_vhca_ports) > 1; +} + +static inline int mlx5_core_mp_enabled(struct mlx5_core_dev *dev) +{ + return mlx5_core_is_mp_slave(dev) || + mlx5_core_is_mp_master(dev); +} + +static inline int mlx5_core_native_port_num(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_mp_enabled(dev)) + return 1; + + return MLX5_CAP_GEN(dev, native_port_num); +} + +enum { + MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, +}; + +static inline const struct cpumask * +mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int vector) +{ + return dev->priv.irq_info[vector].mask; +} + +#endif /* MLX5_DRIVER_H */ diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h new file mode 100644 index 0000000..d3c9db4 --- /dev/null +++ b/include/linux/mlx5/eswitch.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. 
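A minimal sketch of how a protocol driver could attach to the core device through the mlx5_interface callbacks declared in driver.h above; the my_* names and the empty context are illustrative and not part of this patch.

/* Hypothetical mlx5_interface client; only mlx5_register_interface(),
 * mlx5_unregister_interface() and MLX5_INTERFACE_PROTOCOL_ETH come from
 * the header above, everything prefixed my_ is illustrative.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mlx5/driver.h>

struct my_ctx {
	struct mlx5_core_dev *mdev;
};

static void *my_add(struct mlx5_core_dev *mdev)
{
	struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return NULL;
	ctx->mdev = mdev;
	return ctx;			/* handed back as 'context' below */
}

static void my_remove(struct mlx5_core_dev *mdev, void *context)
{
	kfree(context);
}

static struct mlx5_interface my_intf = {
	.add		= my_add,
	.remove		= my_remove,
	.protocol	= MLX5_INTERFACE_PROTOCOL_ETH,
};

static int __init my_init(void)
{
	/* the core calls ->add() for every mlx5 device already probed */
	return mlx5_register_interface(&my_intf);
}

static void __exit my_exit(void)
{
	mlx5_unregister_interface(&my_intf);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");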
+ */ + +#ifndef _MLX5_ESWITCH_ +#define _MLX5_ESWITCH_ + +#include + +enum { + SRIOV_NONE, + SRIOV_LEGACY, + SRIOV_OFFLOADS +}; + +enum { + REP_ETH, + REP_IB, + NUM_REP_TYPES, +}; + +struct mlx5_eswitch_rep; +struct mlx5_eswitch_rep_if { + int (*load)(struct mlx5_core_dev *dev, + struct mlx5_eswitch_rep *rep); + void (*unload)(struct mlx5_eswitch_rep *rep); + void *(*get_proto_dev)(struct mlx5_eswitch_rep *rep); + void *priv; + bool valid; +}; + +struct mlx5_eswitch_rep { + struct mlx5_eswitch_rep_if rep_if[NUM_REP_TYPES]; + u16 vport; + u8 hw_id[ETH_ALEN]; + u16 vlan; + u32 vlan_refcount; +}; + +void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw, + int vport_index, + struct mlx5_eswitch_rep_if *rep_if, + u8 rep_type); +void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw, + int vport_index, + u8 rep_type); +void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw, + int vport, + u8 rep_type); +struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw, + int vport); +void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type); +u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw); +struct mlx5_flow_handle * +mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, + int vport, u32 sqn); +#endif diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h new file mode 100644 index 0000000..47aecc4 --- /dev/null +++ b/include/linux/mlx5/fs.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
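The eswitch.h representor hooks above are what a protocol driver implements for switchdev mode. A hedged sketch, assuming an eswitch pointer obtained elsewhere; the my_rep_* bodies and the vport index 0 are placeholders, only the API calls are from the header above.

/* Illustrative registration of a REP_ETH representor against the
 * mlx5_eswitch_* declarations above.
 */
#include <linux/mlx5/driver.h>
#include <linux/mlx5/eswitch.h>

static int my_rep_load(struct mlx5_core_dev *dev,
		       struct mlx5_eswitch_rep *rep)
{
	/* create the per-vport netdev/port representation here */
	return 0;
}

static void my_rep_unload(struct mlx5_eswitch_rep *rep)
{
	/* undo whatever my_rep_load() set up */
}

static void *my_rep_get_proto_dev(struct mlx5_eswitch_rep *rep)
{
	return rep->rep_if[REP_ETH].priv;
}

static void my_register_vport_rep(struct mlx5_eswitch *esw)
{
	struct mlx5_eswitch_rep_if rep_if = {
		.load		= my_rep_load,
		.unload		= my_rep_unload,
		.get_proto_dev	= my_rep_get_proto_dev,
	};

	/* representors only make sense once the eswitch runs in offloads mode */
	if (mlx5_eswitch_mode(esw) == SRIOV_OFFLOADS)
		mlx5_eswitch_register_vport_rep(esw, 0, &rep_if, REP_ETH);
}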
+ */ + +#ifndef _MLX5_FS_ +#define _MLX5_FS_ + +#include +#include + +#define MLX5_FS_DEFAULT_FLOW_TAG 0x0 + +enum { + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO = 1 << 16, + MLX5_FLOW_CONTEXT_ACTION_ENCRYPT = 1 << 17, + MLX5_FLOW_CONTEXT_ACTION_DECRYPT = 1 << 18, +}; + +enum { + MLX5_FLOW_TABLE_TUNNEL_EN = BIT(0), +}; + +#define LEFTOVERS_RULE_NUM 2 +static inline void build_leftovers_ft_param(int *priority, + int *n_ent, + int *n_grp) +{ + *priority = 0; /* Priority of leftovers_prio-0 */ + *n_ent = LEFTOVERS_RULE_NUM; + *n_grp = LEFTOVERS_RULE_NUM; +} + +enum mlx5_flow_namespace_type { + MLX5_FLOW_NAMESPACE_BYPASS, + MLX5_FLOW_NAMESPACE_LAG, + MLX5_FLOW_NAMESPACE_OFFLOADS, + MLX5_FLOW_NAMESPACE_ETHTOOL, + MLX5_FLOW_NAMESPACE_KERNEL, + MLX5_FLOW_NAMESPACE_LEFTOVERS, + MLX5_FLOW_NAMESPACE_ANCHOR, + MLX5_FLOW_NAMESPACE_FDB, + MLX5_FLOW_NAMESPACE_ESW_EGRESS, + MLX5_FLOW_NAMESPACE_ESW_INGRESS, + MLX5_FLOW_NAMESPACE_SNIFFER_RX, + MLX5_FLOW_NAMESPACE_SNIFFER_TX, + MLX5_FLOW_NAMESPACE_EGRESS, +}; + +struct mlx5_flow_table; +struct mlx5_flow_group; +struct mlx5_flow_namespace; +struct mlx5_flow_handle; + +struct mlx5_flow_spec { + u8 match_criteria_enable; + u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; + u32 match_value[MLX5_ST_SZ_DW(fte_match_param)]; +}; + +struct mlx5_flow_destination { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + struct mlx5_flow_table *ft; + u32 vport_num; + struct mlx5_fc *counter; + }; +}; + +struct mlx5_flow_namespace * +mlx5_get_flow_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type); +struct mlx5_flow_namespace * +mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type, + int vport); + +struct mlx5_flow_table * +mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int num_flow_table_entries, + int max_num_groups, + u32 level, + u32 flags); + +struct mlx5_flow_table_attr { + int prio; + int max_fte; + u32 level; + u32 flags; +}; + +struct mlx5_flow_table * +mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr); + +struct mlx5_flow_table * +mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int num_flow_table_entries, + u32 level, u16 vport); +struct mlx5_flow_table *mlx5_create_lag_demux_flow_table( + struct mlx5_flow_namespace *ns, + int prio, u32 level); +int mlx5_destroy_flow_table(struct mlx5_flow_table *ft); + +/* inbox should be set with the following values: + * start_flow_index + * end_flow_index + * match_criteria_enable + * match_criteria + */ +struct mlx5_flow_group * +mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *in); +void mlx5_destroy_flow_group(struct mlx5_flow_group *fg); + +struct mlx5_fs_vlan { + u16 ethtype; + u16 vid; + u8 prio; +}; + +struct mlx5_flow_act { + u32 action; + bool has_flow_tag; + u32 flow_tag; + u32 encap_id; + u32 modify_id; + uintptr_t esp_id; + struct mlx5_fs_vlan vlan; +}; + +#define MLX5_DECLARE_FLOW_ACT(name) \ + struct mlx5_flow_act name = {MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\ + MLX5_FS_DEFAULT_FLOW_TAG, 0, 0} + +/* Single destination per rule. + * Group ID is implied by the match criteria. 
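The inbox comment above spells out which fields the create_flow_group mailbox must carry. A hedged sketch of creating a small table plus one match-all group with these declarations, assuming the create_flow_group_in layout added later in this patch (mlx5_ifc.h); the my_* name is illustrative and error unwinding of the table is left to the caller.

/* Illustrative only: a four-entry table in the kernel RX namespace with
 * a single catch-all group covering indices 0..3.
 */
#include <linux/err.h>
#include <linux/mm.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/fs.h>

static struct mlx5_flow_group *
my_create_table_and_group(struct mlx5_core_dev *dev, struct mlx5_flow_table **ft)
{
	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
	struct mlx5_flow_table_attr ft_attr = {};
	struct mlx5_flow_namespace *ns;
	struct mlx5_flow_group *fg;
	u32 *in;

	ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_KERNEL);
	if (!ns)
		return ERR_PTR(-EOPNOTSUPP);

	ft_attr.max_fte = 4;
	*ft = mlx5_create_flow_table(ns, &ft_attr);
	if (IS_ERR(*ft))
		return ERR_CAST(*ft);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return ERR_PTR(-ENOMEM);

	/* per the inbox comment above: bound the group and pick its criteria */
	MLX5_SET(create_flow_group_in, in, start_flow_index, 0);
	MLX5_SET(create_flow_group_in, in, end_flow_index, 3);
	MLX5_SET(create_flow_group_in, in, match_criteria_enable, 0); /* match-all */

	fg = mlx5_create_flow_group(*ft, in);
	kvfree(in);
	return fg;
}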
+ */ +struct mlx5_flow_handle * +mlx5_add_flow_rules(struct mlx5_flow_table *ft, + struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int dest_num); +void mlx5_del_flow_rules(struct mlx5_flow_handle *fr); + +int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, + struct mlx5_flow_destination *new_dest, + struct mlx5_flow_destination *old_dest); + +struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handler); +struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); +void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); +void mlx5_fc_query_cached(struct mlx5_fc *counter, + u64 *bytes, u64 *packets, u64 *lastuse); +int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn); +int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn); + +#endif diff --git a/include/linux/mlx5/fs_helpers.h b/include/linux/mlx5/fs_helpers.h new file mode 100644 index 0000000..9db21cd --- /dev/null +++ b/include/linux/mlx5/fs_helpers.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
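For completeness, a hedged sketch of inserting a rule with mlx5_add_flow_rules() into a table like the one above: match outer UDP and forward it to a TIR. The my_* name and the tirn argument are placeholders; everything else comes from headers added by this patch.

/* Illustrative rule insertion: steer outer UDP packets to a TIR. */
#include <linux/in.h>
#include <linux/err.h>
#include <linux/mm.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/fs.h>

static struct mlx5_flow_handle *
my_add_udp_to_tir_rule(struct mlx5_flow_table *ft, u32 tirn)
{
	struct mlx5_flow_act flow_act = {
		.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
	};
	struct mlx5_flow_destination dest = {};
	struct mlx5_flow_handle *rule;
	struct mlx5_flow_spec *spec;

	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
	if (!spec)
		return ERR_PTR(-ENOMEM);

	/* match on the outer IP protocol field only */
	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
			 outer_headers.ip_protocol);
	MLX5_SET(fte_match_param, spec->match_value,
		 outer_headers.ip_protocol, IPPROTO_UDP);

	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
	dest.tir_num = tirn;

	rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
	kvfree(spec);
	return rule;
}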
+ */ + +#ifndef _MLX5_FS_HELPERS_ +#define _MLX5_FS_HELPERS_ + +#include + +#define MLX5_FS_IPV4_VERSION 4 +#define MLX5_FS_IPV6_VERSION 6 + +static inline bool mlx5_fs_is_ipsec_flow(const u32 *match_c) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters); + + return MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); +} + +static inline bool _mlx5_fs_is_outer_ipproto_flow(const u32 *match_c, + const u32 *match_v, u8 match) +{ + const void *headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + const void *headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + + return MLX5_GET(fte_match_set_lyr_2_4, headers_c, ip_protocol) == 0xff && + MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol) == match; +} + +static inline bool mlx5_fs_is_outer_tcp_flow(const u32 *match_c, + const u32 *match_v) +{ + return _mlx5_fs_is_outer_ipproto_flow(match_c, match_v, IPPROTO_TCP); +} + +static inline bool mlx5_fs_is_outer_udp_flow(const u32 *match_c, + const u32 *match_v) +{ + return _mlx5_fs_is_outer_ipproto_flow(match_c, match_v, IPPROTO_UDP); +} + +static inline bool mlx5_fs_is_vxlan_flow(const u32 *match_c) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters); + + return MLX5_GET(fte_match_set_misc, misc_params_c, vxlan_vni); +} + +static inline bool _mlx5_fs_is_outer_ipv_flow(struct mlx5_core_dev *mdev, + const u32 *match_c, + const u32 *match_v, int version) +{ + int match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); + const void *headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + const void *headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + + if (!match_ipv) { + u16 ethertype; + + switch (version) { + case MLX5_FS_IPV4_VERSION: + ethertype = ETH_P_IP; + break; + case MLX5_FS_IPV6_VERSION: + ethertype = ETH_P_IPV6; + break; + default: + return false; + } + + return MLX5_GET(fte_match_set_lyr_2_4, headers_c, + ethertype) == 0xffff && + MLX5_GET(fte_match_set_lyr_2_4, headers_v, + ethertype) == ethertype; + } + + return MLX5_GET(fte_match_set_lyr_2_4, headers_c, + ip_version) == 0xf && + MLX5_GET(fte_match_set_lyr_2_4, headers_v, + ip_version) == version; +} + +static inline bool +mlx5_fs_is_outer_ipv4_flow(struct mlx5_core_dev *mdev, const u32 *match_c, + const u32 *match_v) +{ + return _mlx5_fs_is_outer_ipv_flow(mdev, match_c, match_v, + MLX5_FS_IPV4_VERSION); +} + +static inline bool +mlx5_fs_is_outer_ipv6_flow(struct mlx5_core_dev *mdev, const u32 *match_c, + const u32 *match_v) +{ + return _mlx5_fs_is_outer_ipv_flow(mdev, match_c, match_v, + MLX5_FS_IPV6_VERSION); +} + +static inline bool mlx5_fs_is_outer_ipsec_flow(const u32 *match_c) +{ + void *misc_params_c = + MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); + + return MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); +} + +#endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h new file mode 100644 index 0000000..1aad455 --- /dev/null +++ b/include/linux/mlx5/mlx5_ifc.h @@ -0,0 +1,9044 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
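A brief hedged example of the fs_helpers predicates above, classifying an already-built mlx5_flow_spec before choosing an offload path; the function name is illustrative.

/* Illustrative classification using the inline helpers above. */
#include <linux/mlx5/driver.h>
#include <linux/mlx5/fs.h>
#include <linux/mlx5/fs_helpers.h>

static bool my_spec_is_plain_ipv4_udp(struct mlx5_core_dev *mdev,
				      const struct mlx5_flow_spec *spec)
{
	const u32 *match_c = spec->match_criteria;
	const u32 *match_v = spec->match_value;

	/* IPsec and VXLAN flows take different offload paths, skip them */
	if (mlx5_fs_is_ipsec_flow(match_c) || mlx5_fs_is_vxlan_flow(match_c))
		return false;

	return mlx5_fs_is_outer_ipv4_flow(mdev, match_c, match_v) &&
	       mlx5_fs_is_outer_udp_flow(match_c, match_v);
}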
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +*/ +#ifndef MLX5_IFC_H +#define MLX5_IFC_H + +#include "mlx5_ifc_fpga.h" + +enum { + MLX5_EVENT_TYPE_CODING_COMPLETION_EVENTS = 0x0, + MLX5_EVENT_TYPE_CODING_PATH_MIGRATED_SUCCEEDED = 0x1, + MLX5_EVENT_TYPE_CODING_COMMUNICATION_ESTABLISHED = 0x2, + MLX5_EVENT_TYPE_CODING_SEND_QUEUE_DRAINED = 0x3, + MLX5_EVENT_TYPE_CODING_LAST_WQE_REACHED = 0x13, + MLX5_EVENT_TYPE_CODING_SRQ_LIMIT = 0x14, + MLX5_EVENT_TYPE_CODING_DCT_ALL_CONNECTIONS_CLOSED = 0x1c, + MLX5_EVENT_TYPE_CODING_DCT_ACCESS_KEY_VIOLATION = 0x1d, + MLX5_EVENT_TYPE_CODING_CQ_ERROR = 0x4, + MLX5_EVENT_TYPE_CODING_LOCAL_WQ_CATASTROPHIC_ERROR = 0x5, + MLX5_EVENT_TYPE_CODING_PATH_MIGRATION_FAILED = 0x7, + MLX5_EVENT_TYPE_CODING_PAGE_FAULT_EVENT = 0xc, + MLX5_EVENT_TYPE_CODING_INVALID_REQUEST_LOCAL_WQ_ERROR = 0x10, + MLX5_EVENT_TYPE_CODING_LOCAL_ACCESS_VIOLATION_WQ_ERROR = 0x11, + MLX5_EVENT_TYPE_CODING_LOCAL_SRQ_CATASTROPHIC_ERROR = 0x12, + MLX5_EVENT_TYPE_CODING_INTERNAL_ERROR = 0x8, + MLX5_EVENT_TYPE_CODING_PORT_STATE_CHANGE = 0x9, + MLX5_EVENT_TYPE_CODING_GPIO_EVENT = 0x15, + MLX5_EVENT_TYPE_CODING_REMOTE_CONFIGURATION_PROTOCOL_EVENT = 0x19, + MLX5_EVENT_TYPE_CODING_DOORBELL_BLUEFLAME_CONGESTION_EVENT = 0x1a, + MLX5_EVENT_TYPE_CODING_STALL_VL_EVENT = 0x1b, + MLX5_EVENT_TYPE_CODING_DROPPED_PACKET_LOGGED_EVENT = 0x1f, + MLX5_EVENT_TYPE_CODING_COMMAND_INTERFACE_COMPLETION = 0xa, + MLX5_EVENT_TYPE_CODING_PAGE_REQUEST = 0xb, + MLX5_EVENT_TYPE_CODING_FPGA_ERROR = 0x20, +}; + +enum { + MLX5_MODIFY_TIR_BITMASK_LRO = 0x0, + MLX5_MODIFY_TIR_BITMASK_INDIRECT_TABLE = 0x1, + MLX5_MODIFY_TIR_BITMASK_HASH = 0x2, + MLX5_MODIFY_TIR_BITMASK_TUNNELED_OFFLOAD_EN = 0x3 +}; + +enum { + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0, + MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, +}; + +enum { + MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, + MLX5_CMD_OP_QUERY_ADAPTER = 0x101, + MLX5_CMD_OP_INIT_HCA = 0x102, + MLX5_CMD_OP_TEARDOWN_HCA = 0x103, + MLX5_CMD_OP_ENABLE_HCA = 0x104, + MLX5_CMD_OP_DISABLE_HCA = 0x105, + MLX5_CMD_OP_QUERY_PAGES = 0x107, + MLX5_CMD_OP_MANAGE_PAGES = 0x108, + MLX5_CMD_OP_SET_HCA_CAP = 0x109, + MLX5_CMD_OP_QUERY_ISSI = 0x10a, + MLX5_CMD_OP_SET_ISSI = 0x10b, + MLX5_CMD_OP_SET_DRIVER_VERSION = 0x10d, + MLX5_CMD_OP_CREATE_MKEY = 0x200, + MLX5_CMD_OP_QUERY_MKEY = 0x201, + MLX5_CMD_OP_DESTROY_MKEY = 0x202, + 
MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS = 0x203, + MLX5_CMD_OP_PAGE_FAULT_RESUME = 0x204, + MLX5_CMD_OP_ALLOC_MEMIC = 0x205, + MLX5_CMD_OP_DEALLOC_MEMIC = 0x206, + MLX5_CMD_OP_CREATE_EQ = 0x301, + MLX5_CMD_OP_DESTROY_EQ = 0x302, + MLX5_CMD_OP_QUERY_EQ = 0x303, + MLX5_CMD_OP_GEN_EQE = 0x304, + MLX5_CMD_OP_CREATE_CQ = 0x400, + MLX5_CMD_OP_DESTROY_CQ = 0x401, + MLX5_CMD_OP_QUERY_CQ = 0x402, + MLX5_CMD_OP_MODIFY_CQ = 0x403, + MLX5_CMD_OP_CREATE_QP = 0x500, + MLX5_CMD_OP_DESTROY_QP = 0x501, + MLX5_CMD_OP_RST2INIT_QP = 0x502, + MLX5_CMD_OP_INIT2RTR_QP = 0x503, + MLX5_CMD_OP_RTR2RTS_QP = 0x504, + MLX5_CMD_OP_RTS2RTS_QP = 0x505, + MLX5_CMD_OP_SQERR2RTS_QP = 0x506, + MLX5_CMD_OP_2ERR_QP = 0x507, + MLX5_CMD_OP_2RST_QP = 0x50a, + MLX5_CMD_OP_QUERY_QP = 0x50b, + MLX5_CMD_OP_SQD_RTS_QP = 0x50c, + MLX5_CMD_OP_INIT2INIT_QP = 0x50e, + MLX5_CMD_OP_CREATE_PSV = 0x600, + MLX5_CMD_OP_DESTROY_PSV = 0x601, + MLX5_CMD_OP_CREATE_SRQ = 0x700, + MLX5_CMD_OP_DESTROY_SRQ = 0x701, + MLX5_CMD_OP_QUERY_SRQ = 0x702, + MLX5_CMD_OP_ARM_RQ = 0x703, + MLX5_CMD_OP_CREATE_XRC_SRQ = 0x705, + MLX5_CMD_OP_DESTROY_XRC_SRQ = 0x706, + MLX5_CMD_OP_QUERY_XRC_SRQ = 0x707, + MLX5_CMD_OP_ARM_XRC_SRQ = 0x708, + MLX5_CMD_OP_CREATE_DCT = 0x710, + MLX5_CMD_OP_DESTROY_DCT = 0x711, + MLX5_CMD_OP_DRAIN_DCT = 0x712, + MLX5_CMD_OP_QUERY_DCT = 0x713, + MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION = 0x714, + MLX5_CMD_OP_CREATE_XRQ = 0x717, + MLX5_CMD_OP_DESTROY_XRQ = 0x718, + MLX5_CMD_OP_QUERY_XRQ = 0x719, + MLX5_CMD_OP_ARM_XRQ = 0x71a, + MLX5_CMD_OP_QUERY_VPORT_STATE = 0x750, + MLX5_CMD_OP_MODIFY_VPORT_STATE = 0x751, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT = 0x752, + MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT = 0x753, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT = 0x755, + MLX5_CMD_OP_QUERY_ROCE_ADDRESS = 0x760, + MLX5_CMD_OP_SET_ROCE_ADDRESS = 0x761, + MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT = 0x762, + MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT = 0x763, + MLX5_CMD_OP_QUERY_HCA_VPORT_GID = 0x764, + MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY = 0x765, + MLX5_CMD_OP_QUERY_VNIC_ENV = 0x76f, + MLX5_CMD_OP_QUERY_VPORT_COUNTER = 0x770, + MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, + MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, + MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773, + MLX5_CMD_OP_SET_PP_RATE_LIMIT = 0x780, + MLX5_CMD_OP_QUERY_RATE_LIMIT = 0x781, + MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT = 0x782, + MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT = 0x783, + MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT = 0x784, + MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT = 0x785, + MLX5_CMD_OP_CREATE_QOS_PARA_VPORT = 0x786, + MLX5_CMD_OP_DESTROY_QOS_PARA_VPORT = 0x787, + MLX5_CMD_OP_ALLOC_PD = 0x800, + MLX5_CMD_OP_DEALLOC_PD = 0x801, + MLX5_CMD_OP_ALLOC_UAR = 0x802, + MLX5_CMD_OP_DEALLOC_UAR = 0x803, + MLX5_CMD_OP_CONFIG_INT_MODERATION = 0x804, + MLX5_CMD_OP_ACCESS_REG = 0x805, + MLX5_CMD_OP_ATTACH_TO_MCG = 0x806, + MLX5_CMD_OP_DETACH_FROM_MCG = 0x807, + MLX5_CMD_OP_GET_DROPPED_PACKET_LOG = 0x80a, + MLX5_CMD_OP_MAD_IFC = 0x50d, + MLX5_CMD_OP_QUERY_MAD_DEMUX = 0x80b, + MLX5_CMD_OP_SET_MAD_DEMUX = 0x80c, + MLX5_CMD_OP_NOP = 0x80d, + MLX5_CMD_OP_ALLOC_XRCD = 0x80e, + MLX5_CMD_OP_DEALLOC_XRCD = 0x80f, + MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN = 0x816, + MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN = 0x817, + MLX5_CMD_OP_QUERY_CONG_STATUS = 0x822, + MLX5_CMD_OP_MODIFY_CONG_STATUS = 0x823, + MLX5_CMD_OP_QUERY_CONG_PARAMS = 0x824, + MLX5_CMD_OP_MODIFY_CONG_PARAMS = 0x825, + MLX5_CMD_OP_QUERY_CONG_STATISTICS = 0x826, + MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT = 0x827, + MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT = 0x828, + 
MLX5_CMD_OP_SET_L2_TABLE_ENTRY = 0x829, + MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY = 0x82a, + MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY = 0x82b, + MLX5_CMD_OP_SET_WOL_ROL = 0x830, + MLX5_CMD_OP_QUERY_WOL_ROL = 0x831, + MLX5_CMD_OP_CREATE_LAG = 0x840, + MLX5_CMD_OP_MODIFY_LAG = 0x841, + MLX5_CMD_OP_QUERY_LAG = 0x842, + MLX5_CMD_OP_DESTROY_LAG = 0x843, + MLX5_CMD_OP_CREATE_VPORT_LAG = 0x844, + MLX5_CMD_OP_DESTROY_VPORT_LAG = 0x845, + MLX5_CMD_OP_CREATE_TIR = 0x900, + MLX5_CMD_OP_MODIFY_TIR = 0x901, + MLX5_CMD_OP_DESTROY_TIR = 0x902, + MLX5_CMD_OP_QUERY_TIR = 0x903, + MLX5_CMD_OP_CREATE_SQ = 0x904, + MLX5_CMD_OP_MODIFY_SQ = 0x905, + MLX5_CMD_OP_DESTROY_SQ = 0x906, + MLX5_CMD_OP_QUERY_SQ = 0x907, + MLX5_CMD_OP_CREATE_RQ = 0x908, + MLX5_CMD_OP_MODIFY_RQ = 0x909, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS = 0x910, + MLX5_CMD_OP_DESTROY_RQ = 0x90a, + MLX5_CMD_OP_QUERY_RQ = 0x90b, + MLX5_CMD_OP_CREATE_RMP = 0x90c, + MLX5_CMD_OP_MODIFY_RMP = 0x90d, + MLX5_CMD_OP_DESTROY_RMP = 0x90e, + MLX5_CMD_OP_QUERY_RMP = 0x90f, + MLX5_CMD_OP_CREATE_TIS = 0x912, + MLX5_CMD_OP_MODIFY_TIS = 0x913, + MLX5_CMD_OP_DESTROY_TIS = 0x914, + MLX5_CMD_OP_QUERY_TIS = 0x915, + MLX5_CMD_OP_CREATE_RQT = 0x916, + MLX5_CMD_OP_MODIFY_RQT = 0x917, + MLX5_CMD_OP_DESTROY_RQT = 0x918, + MLX5_CMD_OP_QUERY_RQT = 0x919, + MLX5_CMD_OP_SET_FLOW_TABLE_ROOT = 0x92f, + MLX5_CMD_OP_CREATE_FLOW_TABLE = 0x930, + MLX5_CMD_OP_DESTROY_FLOW_TABLE = 0x931, + MLX5_CMD_OP_QUERY_FLOW_TABLE = 0x932, + MLX5_CMD_OP_CREATE_FLOW_GROUP = 0x933, + MLX5_CMD_OP_DESTROY_FLOW_GROUP = 0x934, + MLX5_CMD_OP_QUERY_FLOW_GROUP = 0x935, + MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x936, + MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY = 0x937, + MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY = 0x938, + MLX5_CMD_OP_ALLOC_FLOW_COUNTER = 0x939, + MLX5_CMD_OP_DEALLOC_FLOW_COUNTER = 0x93a, + MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b, + MLX5_CMD_OP_MODIFY_FLOW_TABLE = 0x93c, + MLX5_CMD_OP_ALLOC_ENCAP_HEADER = 0x93d, + MLX5_CMD_OP_DEALLOC_ENCAP_HEADER = 0x93e, + MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT = 0x940, + MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT = 0x941, + MLX5_CMD_OP_FPGA_CREATE_QP = 0x960, + MLX5_CMD_OP_FPGA_MODIFY_QP = 0x961, + MLX5_CMD_OP_FPGA_QUERY_QP = 0x962, + MLX5_CMD_OP_FPGA_DESTROY_QP = 0x963, + MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS = 0x964, + MLX5_CMD_OP_MAX +}; + +struct mlx5_ifc_flow_table_fields_supported_bits { + u8 outer_dmac[0x1]; + u8 outer_smac[0x1]; + u8 outer_ether_type[0x1]; + u8 outer_ip_version[0x1]; + u8 outer_first_prio[0x1]; + u8 outer_first_cfi[0x1]; + u8 outer_first_vid[0x1]; + u8 outer_ipv4_ttl[0x1]; + u8 outer_second_prio[0x1]; + u8 outer_second_cfi[0x1]; + u8 outer_second_vid[0x1]; + u8 reserved_at_b[0x1]; + u8 outer_sip[0x1]; + u8 outer_dip[0x1]; + u8 outer_frag[0x1]; + u8 outer_ip_protocol[0x1]; + u8 outer_ip_ecn[0x1]; + u8 outer_ip_dscp[0x1]; + u8 outer_udp_sport[0x1]; + u8 outer_udp_dport[0x1]; + u8 outer_tcp_sport[0x1]; + u8 outer_tcp_dport[0x1]; + u8 outer_tcp_flags[0x1]; + u8 outer_gre_protocol[0x1]; + u8 outer_gre_key[0x1]; + u8 outer_vxlan_vni[0x1]; + u8 reserved_at_1a[0x5]; + u8 source_eswitch_port[0x1]; + + u8 inner_dmac[0x1]; + u8 inner_smac[0x1]; + u8 inner_ether_type[0x1]; + u8 inner_ip_version[0x1]; + u8 inner_first_prio[0x1]; + u8 inner_first_cfi[0x1]; + u8 inner_first_vid[0x1]; + u8 reserved_at_27[0x1]; + u8 inner_second_prio[0x1]; + u8 inner_second_cfi[0x1]; + u8 inner_second_vid[0x1]; + u8 reserved_at_2b[0x1]; + u8 inner_sip[0x1]; + u8 inner_dip[0x1]; + u8 inner_frag[0x1]; + u8 inner_ip_protocol[0x1]; + u8 inner_ip_ecn[0x1]; + u8 inner_ip_dscp[0x1]; + u8 inner_udp_sport[0x1]; 
+ u8 inner_udp_dport[0x1]; + u8 inner_tcp_sport[0x1]; + u8 inner_tcp_dport[0x1]; + u8 inner_tcp_flags[0x1]; + u8 reserved_at_37[0x9]; + u8 reserved_at_40[0x17]; + u8 outer_esp_spi[0x1]; + u8 reserved_at_58[0x2]; + u8 bth_dst_qp[0x1]; + + u8 reserved_at_5b[0x25]; +}; + +struct mlx5_ifc_flow_table_prop_layout_bits { + u8 ft_support[0x1]; + u8 reserved_at_1[0x1]; + u8 flow_counter[0x1]; + u8 flow_modify_en[0x1]; + u8 modify_root[0x1]; + u8 identified_miss_table_mode[0x1]; + u8 flow_table_modify[0x1]; + u8 encap[0x1]; + u8 decap[0x1]; + u8 reserved_at_9[0x1]; + u8 pop_vlan[0x1]; + u8 push_vlan[0x1]; + u8 reserved_at_c[0x14]; + + u8 reserved_at_20[0x2]; + u8 log_max_ft_size[0x6]; + u8 log_max_modify_header_context[0x8]; + u8 max_modify_header_actions[0x8]; + u8 max_ft_level[0x8]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x18]; + u8 log_max_ft_num[0x8]; + + u8 reserved_at_80[0x18]; + u8 log_max_destination[0x8]; + + u8 log_max_flow_counter[0x8]; + u8 reserved_at_a8[0x10]; + u8 log_max_flow[0x8]; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support; + + struct mlx5_ifc_flow_table_fields_supported_bits ft_field_bitmask_support; +}; + +struct mlx5_ifc_odp_per_transport_service_cap_bits { + u8 send[0x1]; + u8 receive[0x1]; + u8 write[0x1]; + u8 read[0x1]; + u8 atomic[0x1]; + u8 srq_receive[0x1]; + u8 reserved_at_6[0x1a]; +}; + +struct mlx5_ifc_ipv4_layout_bits { + u8 reserved_at_0[0x60]; + + u8 ipv4[0x20]; +}; + +struct mlx5_ifc_ipv6_layout_bits { + u8 ipv6[16][0x8]; +}; + +union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { + struct mlx5_ifc_ipv6_layout_bits ipv6_layout; + struct mlx5_ifc_ipv4_layout_bits ipv4_layout; + u8 reserved_at_0[0x80]; +}; + +struct mlx5_ifc_fte_match_set_lyr_2_4_bits { + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 ethertype[0x10]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 first_prio[0x3]; + u8 first_cfi[0x1]; + u8 first_vid[0xc]; + + u8 ip_protocol[0x8]; + u8 ip_dscp[0x6]; + u8 ip_ecn[0x2]; + u8 cvlan_tag[0x1]; + u8 svlan_tag[0x1]; + u8 frag[0x1]; + u8 ip_version[0x4]; + u8 tcp_flags[0x9]; + + u8 tcp_sport[0x10]; + u8 tcp_dport[0x10]; + + u8 reserved_at_c0[0x18]; + u8 ttl_hoplimit[0x8]; + + u8 udp_sport[0x10]; + u8 udp_dport[0x10]; + + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6; + + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6; +}; + +struct mlx5_ifc_fte_match_set_misc_bits { + u8 reserved_at_0[0x8]; + u8 source_sqn[0x18]; + + u8 reserved_at_20[0x10]; + u8 source_port[0x10]; + + u8 outer_second_prio[0x3]; + u8 outer_second_cfi[0x1]; + u8 outer_second_vid[0xc]; + u8 inner_second_prio[0x3]; + u8 inner_second_cfi[0x1]; + u8 inner_second_vid[0xc]; + + u8 outer_second_cvlan_tag[0x1]; + u8 inner_second_cvlan_tag[0x1]; + u8 outer_second_svlan_tag[0x1]; + u8 inner_second_svlan_tag[0x1]; + u8 reserved_at_64[0xc]; + u8 gre_protocol[0x10]; + + u8 gre_key_h[0x18]; + u8 gre_key_l[0x8]; + + u8 vxlan_vni[0x18]; + u8 reserved_at_b8[0x8]; + + u8 reserved_at_c0[0x20]; + + u8 reserved_at_e0[0xc]; + u8 outer_ipv6_flow_label[0x14]; + + u8 reserved_at_100[0xc]; + u8 inner_ipv6_flow_label[0x14]; + + u8 reserved_at_120[0x28]; + u8 bth_dst_qp[0x18]; + u8 reserved_at_160[0x20]; + u8 outer_esp_spi[0x20]; + u8 reserved_at_1a0[0x60]; +}; + +struct mlx5_ifc_cmd_pas_bits { + u8 pa_h[0x20]; + + u8 pa_l[0x14]; + u8 reserved_at_34[0xc]; +}; + +struct mlx5_ifc_uint64_bits { + u8 hi[0x20]; + + u8 lo[0x20]; +}; + +enum { + MLX5_ADS_STAT_RATE_NO_LIMIT = 0x0, + MLX5_ADS_STAT_RATE_2_5GBPS = 0x7, 
+ MLX5_ADS_STAT_RATE_10GBPS = 0x8, + MLX5_ADS_STAT_RATE_30GBPS = 0x9, + MLX5_ADS_STAT_RATE_5GBPS = 0xa, + MLX5_ADS_STAT_RATE_20GBPS = 0xb, + MLX5_ADS_STAT_RATE_40GBPS = 0xc, + MLX5_ADS_STAT_RATE_60GBPS = 0xd, + MLX5_ADS_STAT_RATE_80GBPS = 0xe, + MLX5_ADS_STAT_RATE_120GBPS = 0xf, +}; + +struct mlx5_ifc_ads_bits { + u8 fl[0x1]; + u8 free_ar[0x1]; + u8 reserved_at_2[0xe]; + u8 pkey_index[0x10]; + + u8 reserved_at_20[0x8]; + u8 grh[0x1]; + u8 mlid[0x7]; + u8 rlid[0x10]; + + u8 ack_timeout[0x5]; + u8 reserved_at_45[0x3]; + u8 src_addr_index[0x8]; + u8 reserved_at_50[0x4]; + u8 stat_rate[0x4]; + u8 hop_limit[0x8]; + + u8 reserved_at_60[0x4]; + u8 tclass[0x8]; + u8 flow_label[0x14]; + + u8 rgid_rip[16][0x8]; + + u8 reserved_at_100[0x4]; + u8 f_dscp[0x1]; + u8 f_ecn[0x1]; + u8 reserved_at_106[0x1]; + u8 f_eth_prio[0x1]; + u8 ecn[0x2]; + u8 dscp[0x6]; + u8 udp_sport[0x10]; + + u8 dei_cfi[0x1]; + u8 eth_prio[0x3]; + u8 sl[0x4]; + u8 vhca_port_num[0x8]; + u8 rmac_47_32[0x10]; + + u8 rmac_31_0[0x20]; +}; + +struct mlx5_ifc_flow_table_nic_cap_bits { + u8 nic_rx_multi_path_tirs[0x1]; + u8 nic_rx_multi_path_tirs_fts[0x1]; + u8 allow_sniffer_and_nic_rx_shared_tir[0x1]; + u8 reserved_at_3[0x1fd]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive; + + u8 reserved_at_400[0x200]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_sniffer; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit; + + u8 reserved_at_a00[0x200]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer; + + u8 reserved_at_e00[0x7200]; +}; + +struct mlx5_ifc_flow_table_eswitch_cap_bits { + u8 reserved_at_0[0x200]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_ingress; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress; + + u8 reserved_at_800[0x7800]; +}; + +struct mlx5_ifc_e_switch_cap_bits { + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert_if_not_exist[0x1]; + u8 vport_cvlan_insert_overwrite[0x1]; + u8 reserved_at_5[0x19]; + u8 nic_vport_node_guid_modify[0x1]; + u8 nic_vport_port_guid_modify[0x1]; + + u8 vxlan_encap_decap[0x1]; + u8 nvgre_encap_decap[0x1]; + u8 reserved_at_22[0x9]; + u8 log_max_encap_headers[0x5]; + u8 reserved_2b[0x6]; + u8 max_encap_header_size[0xa]; + + u8 reserved_40[0x7c0]; + +}; + +struct mlx5_ifc_qos_cap_bits { + u8 packet_pacing[0x1]; + u8 esw_scheduling[0x1]; + u8 esw_bw_share[0x1]; + u8 esw_rate_limit[0x1]; + u8 reserved_at_4[0x1]; + u8 packet_pacing_burst_bound[0x1]; + u8 packet_pacing_typical_size[0x1]; + u8 reserved_at_7[0x19]; + + u8 reserved_at_20[0x20]; + + u8 packet_pacing_max_rate[0x20]; + + u8 packet_pacing_min_rate[0x20]; + + u8 reserved_at_80[0x10]; + u8 packet_pacing_rate_table_size[0x10]; + + u8 esw_element_type[0x10]; + u8 esw_tsar_type[0x10]; + + u8 reserved_at_c0[0x10]; + u8 max_qos_para_vport[0x10]; + + u8 max_tsar_bw_share[0x20]; + + u8 reserved_at_100[0x700]; +}; + +struct mlx5_ifc_debug_cap_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x2]; + u8 stall_detect[0x1]; + u8 reserved_at_23[0x1d]; + + u8 reserved_at_40[0x7c0]; +}; + +struct mlx5_ifc_per_protocol_networking_offload_caps_bits { + u8 csum_cap[0x1]; + u8 vlan_cap[0x1]; + u8 lro_cap[0x1]; + u8 lro_psh_flag[0x1]; + u8 lro_time_stamp[0x1]; + u8 reserved_at_5[0x2]; + u8 wqe_vlan_insert[0x1]; + 
u8 self_lb_en_modifiable[0x1]; + u8 reserved_at_9[0x2]; + u8 max_lso_cap[0x5]; + u8 multi_pkt_send_wqe[0x2]; + u8 wqe_inline_mode[0x2]; + u8 rss_ind_tbl_cap[0x4]; + u8 reg_umr_sq[0x1]; + u8 scatter_fcs[0x1]; + u8 enhanced_multi_pkt_send_wqe[0x1]; + u8 tunnel_lso_const_out_ip_id[0x1]; + u8 reserved_at_1c[0x2]; + u8 tunnel_stateless_gre[0x1]; + u8 tunnel_stateless_vxlan[0x1]; + + u8 swp[0x1]; + u8 swp_csum[0x1]; + u8 swp_lso[0x1]; + u8 reserved_at_23[0x1b]; + u8 max_geneve_opt_len[0x1]; + u8 tunnel_stateless_geneve_rx[0x1]; + + u8 reserved_at_40[0x10]; + u8 lro_min_mss_size[0x10]; + + u8 reserved_at_60[0x120]; + + u8 lro_timer_supported_periods[4][0x20]; + + u8 reserved_at_200[0x600]; +}; + +struct mlx5_ifc_roce_cap_bits { + u8 roce_apm[0x1]; + u8 reserved_at_1[0x1f]; + + u8 reserved_at_20[0x60]; + + u8 reserved_at_80[0xc]; + u8 l3_type[0x4]; + u8 reserved_at_90[0x8]; + u8 roce_version[0x8]; + + u8 reserved_at_a0[0x10]; + u8 r_roce_dest_udp_port[0x10]; + + u8 r_roce_max_src_udp_port[0x10]; + u8 r_roce_min_src_udp_port[0x10]; + + u8 reserved_at_e0[0x10]; + u8 roce_address_table_size[0x10]; + + u8 reserved_at_100[0x700]; +}; + +struct mlx5_ifc_device_mem_cap_bits { + u8 memic[0x1]; + u8 reserved_at_1[0x1f]; + + u8 reserved_at_20[0xb]; + u8 log_min_memic_alloc_size[0x5]; + u8 reserved_at_30[0x8]; + u8 log_max_memic_addr_alignment[0x8]; + + u8 memic_bar_start_addr[0x40]; + + u8 memic_bar_size[0x20]; + + u8 max_memic_size[0x20]; + + u8 reserved_at_c0[0x740]; +}; + +enum { + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_1_BYTE = 0x0, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_2_BYTES = 0x2, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_4_BYTES = 0x4, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_8_BYTES = 0x8, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_16_BYTES = 0x10, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_32_BYTES = 0x20, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_64_BYTES = 0x40, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_128_BYTES = 0x80, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_256_BYTES = 0x100, +}; + +enum { + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_1_BYTE = 0x1, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_2_BYTES = 0x2, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_4_BYTES = 0x4, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_8_BYTES = 0x8, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_16_BYTES = 0x10, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_32_BYTES = 0x20, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_64_BYTES = 0x40, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_128_BYTES = 0x80, + MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_256_BYTES = 0x100, +}; + +struct mlx5_ifc_atomic_caps_bits { + u8 reserved_at_0[0x40]; + + u8 atomic_req_8B_endianness_mode[0x2]; + u8 reserved_at_42[0x4]; + u8 supported_atomic_req_8B_endianness_mode_1[0x1]; + + u8 reserved_at_47[0x19]; + + u8 reserved_at_60[0x20]; + + u8 reserved_at_80[0x10]; + u8 atomic_operations[0x10]; + + u8 reserved_at_a0[0x10]; + u8 atomic_size_qp[0x10]; + + u8 reserved_at_c0[0x10]; + u8 atomic_size_dc[0x10]; + + u8 reserved_at_e0[0x720]; +}; + +struct mlx5_ifc_odp_cap_bits { + u8 reserved_at_0[0x40]; + + u8 sig[0x1]; + u8 reserved_at_41[0x1f]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_odp_per_transport_service_cap_bits rc_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits uc_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits ud_odp_caps; + + u8 reserved_at_e0[0x720]; +}; + +struct mlx5_ifc_calc_op { + u8 reserved_at_0[0x10]; + u8 reserved_at_10[0x9]; + u8 op_swap_endianness[0x1]; + u8 op_min[0x1]; + u8 op_xor[0x1]; + u8 op_or[0x1]; + u8 op_and[0x1]; + u8 op_max[0x1]; + u8 op_add[0x1]; +}; + +struct mlx5_ifc_vector_calc_cap_bits { + u8 calc_matrix[0x1]; + u8 reserved_at_1[0x1f]; + u8 
reserved_at_20[0x8]; + u8 max_vec_count[0x8]; + u8 reserved_at_30[0xd]; + u8 max_chunk_size[0x3]; + struct mlx5_ifc_calc_op calc0; + struct mlx5_ifc_calc_op calc1; + struct mlx5_ifc_calc_op calc2; + struct mlx5_ifc_calc_op calc3; + + u8 reserved_at_e0[0x720]; +}; + +enum { + MLX5_WQ_TYPE_LINKED_LIST = 0x0, + MLX5_WQ_TYPE_CYCLIC = 0x1, + MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ = 0x2, + MLX5_WQ_TYPE_CYCLIC_STRIDING_RQ = 0x3, +}; + +enum { + MLX5_WQ_END_PAD_MODE_NONE = 0x0, + MLX5_WQ_END_PAD_MODE_ALIGN = 0x1, +}; + +enum { + MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_8_GID_ENTRIES = 0x0, + MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_16_GID_ENTRIES = 0x1, + MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_32_GID_ENTRIES = 0x2, + MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_64_GID_ENTRIES = 0x3, + MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_128_GID_ENTRIES = 0x4, +}; + +enum { + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_128_ENTRIES = 0x0, + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_256_ENTRIES = 0x1, + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_512_ENTRIES = 0x2, + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_1K_ENTRIES = 0x3, + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_2K_ENTRIES = 0x4, + MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_4K_ENTRIES = 0x5, +}; + +enum { + MLX5_CMD_HCA_CAP_PORT_TYPE_IB = 0x0, + MLX5_CMD_HCA_CAP_PORT_TYPE_ETHERNET = 0x1, +}; + +enum { + MLX5_CMD_HCA_CAP_CMDIF_CHECKSUM_DISABLED = 0x0, + MLX5_CMD_HCA_CAP_CMDIF_CHECKSUM_INITIAL_STATE = 0x1, + MLX5_CMD_HCA_CAP_CMDIF_CHECKSUM_ENABLED = 0x3, +}; + +enum { + MLX5_CAP_PORT_TYPE_IB = 0x0, + MLX5_CAP_PORT_TYPE_ETH = 0x1, +}; + +enum { + MLX5_CAP_UMR_FENCE_STRONG = 0x0, + MLX5_CAP_UMR_FENCE_SMALL = 0x1, + MLX5_CAP_UMR_FENCE_NONE = 0x2, +}; + +struct mlx5_ifc_cmd_hca_cap_bits { + u8 reserved_at_0[0x30]; + u8 vhca_id[0x10]; + + u8 reserved_at_40[0x40]; + + u8 log_max_srq_sz[0x8]; + u8 log_max_qp_sz[0x8]; + u8 reserved_at_90[0xb]; + u8 log_max_qp[0x5]; + + u8 reserved_at_a0[0xb]; + u8 log_max_srq[0x5]; + u8 reserved_at_b0[0x10]; + + u8 reserved_at_c0[0x8]; + u8 log_max_cq_sz[0x8]; + u8 reserved_at_d0[0xb]; + u8 log_max_cq[0x5]; + + u8 log_max_eq_sz[0x8]; + u8 reserved_at_e8[0x2]; + u8 log_max_mkey[0x6]; + u8 reserved_at_f0[0xc]; + u8 log_max_eq[0x4]; + + u8 max_indirection[0x8]; + u8 fixed_buffer_size[0x1]; + u8 log_max_mrw_sz[0x7]; + u8 force_teardown[0x1]; + u8 reserved_at_111[0x1]; + u8 log_max_bsf_list_size[0x6]; + u8 umr_extended_translation_offset[0x1]; + u8 null_mkey[0x1]; + u8 log_max_klm_list_size[0x6]; + + u8 reserved_at_120[0xa]; + u8 log_max_ra_req_dc[0x6]; + u8 reserved_at_130[0xa]; + u8 log_max_ra_res_dc[0x6]; + + u8 reserved_at_140[0xa]; + u8 log_max_ra_req_qp[0x6]; + u8 reserved_at_150[0xa]; + u8 log_max_ra_res_qp[0x6]; + + u8 end_pad[0x1]; + u8 cc_query_allowed[0x1]; + u8 cc_modify_allowed[0x1]; + u8 start_pad[0x1]; + u8 cache_line_128byte[0x1]; + u8 reserved_at_165[0xa]; + u8 qcam_reg[0x1]; + u8 gid_table_size[0x10]; + + u8 out_of_seq_cnt[0x1]; + u8 vport_counters[0x1]; + u8 retransmission_q_counters[0x1]; + u8 debug[0x1]; + u8 modify_rq_counter_set_id[0x1]; + u8 rq_delay_drop[0x1]; + u8 max_qp_cnt[0xa]; + u8 pkey_table_size[0x10]; + + u8 vport_group_manager[0x1]; + u8 vhca_group_manager[0x1]; + u8 ib_virt[0x1]; + u8 eth_virt[0x1]; + u8 vnic_env_queue_counters[0x1]; + u8 ets[0x1]; + u8 nic_flow_table[0x1]; + u8 eswitch_flow_table[0x1]; + u8 device_memory[0x1]; + u8 mcam_reg[0x1]; + u8 pcam_reg[0x1]; + u8 local_ca_ack_delay[0x5]; + u8 port_module_event[0x1]; + u8 enhanced_error_q_counters[0x1]; + u8 ports_check[0x1]; + u8 reserved_at_1b3[0x1]; + u8 disable_link_up[0x1]; + u8 beacon_led[0x1]; + u8 port_type[0x2]; + u8 num_ports[0x8]; + + u8 
reserved_at_1c0[0x1]; + u8 pps[0x1]; + u8 pps_modify[0x1]; + u8 log_max_msg[0x5]; + u8 reserved_at_1c8[0x4]; + u8 max_tc[0x4]; + u8 reserved_at_1d0[0x1]; + u8 dcbx[0x1]; + u8 general_notification_event[0x1]; + u8 reserved_at_1d3[0x2]; + u8 fpga[0x1]; + u8 rol_s[0x1]; + u8 rol_g[0x1]; + u8 reserved_at_1d8[0x1]; + u8 wol_s[0x1]; + u8 wol_g[0x1]; + u8 wol_a[0x1]; + u8 wol_b[0x1]; + u8 wol_m[0x1]; + u8 wol_u[0x1]; + u8 wol_p[0x1]; + + u8 stat_rate_support[0x10]; + u8 reserved_at_1f0[0xc]; + u8 cqe_version[0x4]; + + u8 compact_address_vector[0x1]; + u8 striding_rq[0x1]; + u8 reserved_at_202[0x1]; + u8 ipoib_enhanced_offloads[0x1]; + u8 ipoib_basic_offloads[0x1]; + u8 reserved_at_205[0x1]; + u8 repeated_block_disabled[0x1]; + u8 umr_modify_entity_size_disabled[0x1]; + u8 umr_modify_atomic_disabled[0x1]; + u8 umr_indirect_mkey_disabled[0x1]; + u8 umr_fence[0x2]; + u8 reserved_at_20c[0x3]; + u8 drain_sigerr[0x1]; + u8 cmdif_checksum[0x2]; + u8 sigerr_cqe[0x1]; + u8 reserved_at_213[0x1]; + u8 wq_signature[0x1]; + u8 sctr_data_cqe[0x1]; + u8 reserved_at_216[0x1]; + u8 sho[0x1]; + u8 tph[0x1]; + u8 rf[0x1]; + u8 dct[0x1]; + u8 qos[0x1]; + u8 eth_net_offloads[0x1]; + u8 roce[0x1]; + u8 atomic[0x1]; + u8 reserved_at_21f[0x1]; + + u8 cq_oi[0x1]; + u8 cq_resize[0x1]; + u8 cq_moderation[0x1]; + u8 reserved_at_223[0x3]; + u8 cq_eq_remap[0x1]; + u8 pg[0x1]; + u8 block_lb_mc[0x1]; + u8 reserved_at_229[0x1]; + u8 scqe_break_moderation[0x1]; + u8 cq_period_start_from_cqe[0x1]; + u8 cd[0x1]; + u8 reserved_at_22d[0x1]; + u8 apm[0x1]; + u8 vector_calc[0x1]; + u8 umr_ptr_rlky[0x1]; + u8 imaicl[0x1]; + u8 reserved_at_232[0x4]; + u8 qkv[0x1]; + u8 pkv[0x1]; + u8 set_deth_sqpn[0x1]; + u8 reserved_at_239[0x3]; + u8 xrc[0x1]; + u8 ud[0x1]; + u8 uc[0x1]; + u8 rc[0x1]; + + u8 uar_4k[0x1]; + u8 reserved_at_241[0x9]; + u8 uar_sz[0x6]; + u8 reserved_at_250[0x8]; + u8 log_pg_sz[0x8]; + + u8 bf[0x1]; + u8 driver_version[0x1]; + u8 pad_tx_eth_packet[0x1]; + u8 reserved_at_263[0x8]; + u8 log_bf_reg_size[0x5]; + + u8 reserved_at_270[0xb]; + u8 lag_master[0x1]; + u8 num_lag_ports[0x4]; + + u8 reserved_at_280[0x10]; + u8 max_wqe_sz_sq[0x10]; + + u8 reserved_at_2a0[0x10]; + u8 max_wqe_sz_rq[0x10]; + + u8 max_flow_counter_31_16[0x10]; + u8 max_wqe_sz_sq_dc[0x10]; + + u8 reserved_at_2e0[0x7]; + u8 max_qp_mcg[0x19]; + + u8 reserved_at_300[0x18]; + u8 log_max_mcg[0x8]; + + u8 reserved_at_320[0x3]; + u8 log_max_transport_domain[0x5]; + u8 reserved_at_328[0x3]; + u8 log_max_pd[0x5]; + u8 reserved_at_330[0xb]; + u8 log_max_xrcd[0x5]; + + u8 nic_receive_steering_discard[0x1]; + u8 receive_discard_vport_down[0x1]; + u8 transmit_discard_vport_down[0x1]; + u8 reserved_at_343[0x5]; + u8 log_max_flow_counter_bulk[0x8]; + u8 max_flow_counter_15_0[0x10]; + + + u8 reserved_at_360[0x3]; + u8 log_max_rq[0x5]; + u8 reserved_at_368[0x3]; + u8 log_max_sq[0x5]; + u8 reserved_at_370[0x3]; + u8 log_max_tir[0x5]; + u8 reserved_at_378[0x3]; + u8 log_max_tis[0x5]; + + u8 basic_cyclic_rcv_wqe[0x1]; + u8 reserved_at_381[0x2]; + u8 log_max_rmp[0x5]; + u8 reserved_at_388[0x3]; + u8 log_max_rqt[0x5]; + u8 reserved_at_390[0x3]; + u8 log_max_rqt_size[0x5]; + u8 reserved_at_398[0x3]; + u8 log_max_tis_per_sq[0x5]; + + u8 ext_stride_num_range[0x1]; + u8 reserved_at_3a1[0x2]; + u8 log_max_stride_sz_rq[0x5]; + u8 reserved_at_3a8[0x3]; + u8 log_min_stride_sz_rq[0x5]; + u8 reserved_at_3b0[0x3]; + u8 log_max_stride_sz_sq[0x5]; + u8 reserved_at_3b8[0x3]; + u8 log_min_stride_sz_sq[0x5]; + + u8 hairpin[0x1]; + u8 reserved_at_3c1[0x2]; + u8 log_max_hairpin_queues[0x5]; + u8 
reserved_at_3c8[0x3]; + u8 log_max_hairpin_wq_data_sz[0x5]; + u8 reserved_at_3d0[0x3]; + u8 log_max_hairpin_num_packets[0x5]; + u8 reserved_at_3d8[0x3]; + u8 log_max_wq_sz[0x5]; + + u8 nic_vport_change_event[0x1]; + u8 disable_local_lb_uc[0x1]; + u8 disable_local_lb_mc[0x1]; + u8 log_min_hairpin_wq_data_sz[0x5]; + u8 reserved_at_3e8[0x3]; + u8 log_max_vlan_list[0x5]; + u8 reserved_at_3f0[0x3]; + u8 log_max_current_mc_list[0x5]; + u8 reserved_at_3f8[0x3]; + u8 log_max_current_uc_list[0x5]; + + u8 reserved_at_400[0x80]; + + u8 reserved_at_480[0x3]; + u8 log_max_l2_table[0x5]; + u8 reserved_at_488[0x8]; + u8 log_uar_page_sz[0x10]; + + u8 reserved_at_4a0[0x20]; + u8 device_frequency_mhz[0x20]; + u8 device_frequency_khz[0x20]; + + u8 reserved_at_500[0x20]; + u8 num_of_uars_per_page[0x20]; + u8 reserved_at_540[0x40]; + + u8 reserved_at_580[0x3d]; + u8 cqe_128_always[0x1]; + u8 cqe_compression_128[0x1]; + u8 cqe_compression[0x1]; + + u8 cqe_compression_timeout[0x10]; + u8 cqe_compression_max_num[0x10]; + + u8 reserved_at_5e0[0x10]; + u8 tag_matching[0x1]; + u8 rndv_offload_rc[0x1]; + u8 rndv_offload_dc[0x1]; + u8 log_tag_matching_list_sz[0x5]; + u8 reserved_at_5f8[0x3]; + u8 log_max_xrq[0x5]; + + u8 affiliate_nic_vport_criteria[0x8]; + u8 native_port_num[0x8]; + u8 num_vhca_ports[0x8]; + u8 reserved_at_618[0x6]; + u8 sw_owner_id[0x1]; + u8 reserved_at_61f[0x1e1]; +}; + +enum mlx5_flow_destination_type { + MLX5_FLOW_DESTINATION_TYPE_VPORT = 0x0, + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, + MLX5_FLOW_DESTINATION_TYPE_TIR = 0x2, + + MLX5_FLOW_DESTINATION_TYPE_PORT = 0x99, + MLX5_FLOW_DESTINATION_TYPE_COUNTER = 0x100, +}; + +struct mlx5_ifc_dest_format_struct_bits { + u8 destination_type[0x8]; + u8 destination_id[0x18]; + + u8 reserved_at_20[0x20]; +}; + +struct mlx5_ifc_flow_counter_list_bits { + u8 flow_counter_id[0x20]; + + u8 reserved_at_20[0x20]; +}; + +union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits { + struct mlx5_ifc_dest_format_struct_bits dest_format_struct; + struct mlx5_ifc_flow_counter_list_bits flow_counter_list; + u8 reserved_at_0[0x40]; +}; + +struct mlx5_ifc_fte_match_param_bits { + struct mlx5_ifc_fte_match_set_lyr_2_4_bits outer_headers; + + struct mlx5_ifc_fte_match_set_misc_bits misc_parameters; + + struct mlx5_ifc_fte_match_set_lyr_2_4_bits inner_headers; + + u8 reserved_at_600[0xa00]; +}; + +enum { + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_SRC_IP = 0x0, + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_DST_IP = 0x1, + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_SPORT = 0x2, + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_DPORT = 0x3, + MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_IPSEC_SPI = 0x4, +}; + +struct mlx5_ifc_rx_hash_field_select_bits { + u8 l3_prot_type[0x1]; + u8 l4_prot_type[0x1]; + u8 selected_fields[0x1e]; +}; + +enum { + MLX5_WQ_WQ_TYPE_WQ_LINKED_LIST = 0x0, + MLX5_WQ_WQ_TYPE_WQ_CYCLIC = 0x1, +}; + +enum { + MLX5_WQ_END_PADDING_MODE_END_PAD_NONE = 0x0, + MLX5_WQ_END_PADDING_MODE_END_PAD_ALIGN = 0x1, +}; + +struct mlx5_ifc_wq_bits { + u8 wq_type[0x4]; + u8 wq_signature[0x1]; + u8 end_padding_mode[0x2]; + u8 cd_slave[0x1]; + u8 reserved_at_8[0x18]; + + u8 hds_skip_first_sge[0x1]; + u8 log2_hds_buf_size[0x3]; + u8 reserved_at_24[0x7]; + u8 page_offset[0x5]; + u8 lwm[0x10]; + + u8 reserved_at_40[0x8]; + u8 pd[0x18]; + + u8 reserved_at_60[0x8]; + u8 uar_page[0x18]; + + u8 dbr_addr[0x40]; + + u8 hw_counter[0x20]; + + u8 sw_counter[0x20]; + + u8 reserved_at_100[0xc]; + u8 log_wq_stride[0x4]; + u8 reserved_at_110[0x3]; + u8 log_wq_pg_sz[0x5]; + u8 
reserved_at_118[0x3]; + u8 log_wq_sz[0x5]; + + u8 reserved_at_120[0x3]; + u8 log_hairpin_num_packets[0x5]; + u8 reserved_at_128[0x3]; + u8 log_hairpin_data_sz[0x5]; + + u8 reserved_at_130[0x4]; + u8 log_wqe_num_of_strides[0x4]; + u8 two_byte_shift_en[0x1]; + u8 reserved_at_139[0x4]; + u8 log_wqe_stride_size[0x3]; + + u8 reserved_at_140[0x4c0]; + + struct mlx5_ifc_cmd_pas_bits pas[0]; +}; + +struct mlx5_ifc_rq_num_bits { + u8 reserved_at_0[0x8]; + u8 rq_num[0x18]; +}; + +struct mlx5_ifc_mac_address_layout_bits { + u8 reserved_at_0[0x10]; + u8 mac_addr_47_32[0x10]; + + u8 mac_addr_31_0[0x20]; +}; + +struct mlx5_ifc_vlan_layout_bits { + u8 reserved_at_0[0x14]; + u8 vlan[0x0c]; + + u8 reserved_at_20[0x20]; +}; + +struct mlx5_ifc_cong_control_r_roce_ecn_np_bits { + u8 reserved_at_0[0xa0]; + + u8 min_time_between_cnps[0x20]; + + u8 reserved_at_c0[0x12]; + u8 cnp_dscp[0x6]; + u8 reserved_at_d8[0x4]; + u8 cnp_prio_mode[0x1]; + u8 cnp_802p_prio[0x3]; + + u8 reserved_at_e0[0x720]; +}; + +struct mlx5_ifc_cong_control_r_roce_ecn_rp_bits { + u8 reserved_at_0[0x60]; + + u8 reserved_at_60[0x4]; + u8 clamp_tgt_rate[0x1]; + u8 reserved_at_65[0x3]; + u8 clamp_tgt_rate_after_time_inc[0x1]; + u8 reserved_at_69[0x17]; + + u8 reserved_at_80[0x20]; + + u8 rpg_time_reset[0x20]; + + u8 rpg_byte_reset[0x20]; + + u8 rpg_threshold[0x20]; + + u8 rpg_max_rate[0x20]; + + u8 rpg_ai_rate[0x20]; + + u8 rpg_hai_rate[0x20]; + + u8 rpg_gd[0x20]; + + u8 rpg_min_dec_fac[0x20]; + + u8 rpg_min_rate[0x20]; + + u8 reserved_at_1c0[0xe0]; + + u8 rate_to_set_on_first_cnp[0x20]; + + u8 dce_tcp_g[0x20]; + + u8 dce_tcp_rtt[0x20]; + + u8 rate_reduce_monitor_period[0x20]; + + u8 reserved_at_320[0x20]; + + u8 initial_alpha_value[0x20]; + + u8 reserved_at_360[0x4a0]; +}; + +struct mlx5_ifc_cong_control_802_1qau_rp_bits { + u8 reserved_at_0[0x80]; + + u8 rppp_max_rps[0x20]; + + u8 rpg_time_reset[0x20]; + + u8 rpg_byte_reset[0x20]; + + u8 rpg_threshold[0x20]; + + u8 rpg_max_rate[0x20]; + + u8 rpg_ai_rate[0x20]; + + u8 rpg_hai_rate[0x20]; + + u8 rpg_gd[0x20]; + + u8 rpg_min_dec_fac[0x20]; + + u8 rpg_min_rate[0x20]; + + u8 reserved_at_1c0[0x640]; +}; + +enum { + MLX5_RESIZE_FIELD_SELECT_RESIZE_FIELD_SELECT_LOG_CQ_SIZE = 0x1, + MLX5_RESIZE_FIELD_SELECT_RESIZE_FIELD_SELECT_PAGE_OFFSET = 0x2, + MLX5_RESIZE_FIELD_SELECT_RESIZE_FIELD_SELECT_LOG_PAGE_SIZE = 0x4, +}; + +struct mlx5_ifc_resize_field_select_bits { + u8 resize_field_select[0x20]; +}; + +enum { + MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_CQ_PERIOD = 0x1, + MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_CQ_MAX_COUNT = 0x2, + MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_OI = 0x4, + MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_C_EQN = 0x8, +}; + +struct mlx5_ifc_modify_field_select_bits { + u8 modify_field_select[0x20]; +}; + +struct mlx5_ifc_field_select_r_roce_np_bits { + u8 field_select_r_roce_np[0x20]; +}; + +struct mlx5_ifc_field_select_r_roce_rp_bits { + u8 field_select_r_roce_rp[0x20]; +}; + +enum { + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPPP_MAX_RPS = 0x4, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_TIME_RESET = 0x8, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_BYTE_RESET = 0x10, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_THRESHOLD = 0x20, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_MAX_RATE = 0x40, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_AI_RATE = 0x80, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_HAI_RATE = 0x100, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_GD 
= 0x200, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_MIN_DEC_FAC = 0x400, + MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_MIN_RATE = 0x800, +}; + +struct mlx5_ifc_field_select_802_1qau_rp_bits { + u8 field_select_8021qaurp[0x20]; +}; + +struct mlx5_ifc_phys_layer_cntrs_bits { + u8 time_since_last_clear_high[0x20]; + + u8 time_since_last_clear_low[0x20]; + + u8 symbol_errors_high[0x20]; + + u8 symbol_errors_low[0x20]; + + u8 sync_headers_errors_high[0x20]; + + u8 sync_headers_errors_low[0x20]; + + u8 edpl_bip_errors_lane0_high[0x20]; + + u8 edpl_bip_errors_lane0_low[0x20]; + + u8 edpl_bip_errors_lane1_high[0x20]; + + u8 edpl_bip_errors_lane1_low[0x20]; + + u8 edpl_bip_errors_lane2_high[0x20]; + + u8 edpl_bip_errors_lane2_low[0x20]; + + u8 edpl_bip_errors_lane3_high[0x20]; + + u8 edpl_bip_errors_lane3_low[0x20]; + + u8 fc_fec_corrected_blocks_lane0_high[0x20]; + + u8 fc_fec_corrected_blocks_lane0_low[0x20]; + + u8 fc_fec_corrected_blocks_lane1_high[0x20]; + + u8 fc_fec_corrected_blocks_lane1_low[0x20]; + + u8 fc_fec_corrected_blocks_lane2_high[0x20]; + + u8 fc_fec_corrected_blocks_lane2_low[0x20]; + + u8 fc_fec_corrected_blocks_lane3_high[0x20]; + + u8 fc_fec_corrected_blocks_lane3_low[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane0_high[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane0_low[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane1_high[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane1_low[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane2_high[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane2_low[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane3_high[0x20]; + + u8 fc_fec_uncorrectable_blocks_lane3_low[0x20]; + + u8 rs_fec_corrected_blocks_high[0x20]; + + u8 rs_fec_corrected_blocks_low[0x20]; + + u8 rs_fec_uncorrectable_blocks_high[0x20]; + + u8 rs_fec_uncorrectable_blocks_low[0x20]; + + u8 rs_fec_no_errors_blocks_high[0x20]; + + u8 rs_fec_no_errors_blocks_low[0x20]; + + u8 rs_fec_single_error_blocks_high[0x20]; + + u8 rs_fec_single_error_blocks_low[0x20]; + + u8 rs_fec_corrected_symbols_total_high[0x20]; + + u8 rs_fec_corrected_symbols_total_low[0x20]; + + u8 rs_fec_corrected_symbols_lane0_high[0x20]; + + u8 rs_fec_corrected_symbols_lane0_low[0x20]; + + u8 rs_fec_corrected_symbols_lane1_high[0x20]; + + u8 rs_fec_corrected_symbols_lane1_low[0x20]; + + u8 rs_fec_corrected_symbols_lane2_high[0x20]; + + u8 rs_fec_corrected_symbols_lane2_low[0x20]; + + u8 rs_fec_corrected_symbols_lane3_high[0x20]; + + u8 rs_fec_corrected_symbols_lane3_low[0x20]; + + u8 link_down_events[0x20]; + + u8 successful_recovery_events[0x20]; + + u8 reserved_at_640[0x180]; +}; + +struct mlx5_ifc_phys_layer_statistical_cntrs_bits { + u8 time_since_last_clear_high[0x20]; + + u8 time_since_last_clear_low[0x20]; + + u8 phy_received_bits_high[0x20]; + + u8 phy_received_bits_low[0x20]; + + u8 phy_symbol_errors_high[0x20]; + + u8 phy_symbol_errors_low[0x20]; + + u8 phy_corrected_bits_high[0x20]; + + u8 phy_corrected_bits_low[0x20]; + + u8 phy_corrected_bits_lane0_high[0x20]; + + u8 phy_corrected_bits_lane0_low[0x20]; + + u8 phy_corrected_bits_lane1_high[0x20]; + + u8 phy_corrected_bits_lane1_low[0x20]; + + u8 phy_corrected_bits_lane2_high[0x20]; + + u8 phy_corrected_bits_lane2_low[0x20]; + + u8 phy_corrected_bits_lane3_high[0x20]; + + u8 phy_corrected_bits_lane3_low[0x20]; + + u8 reserved_at_200[0x5c0]; +}; + +struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits { + u8 symbol_error_counter[0x10]; + + u8 link_error_recovery_counter[0x8]; + + u8 link_downed_counter[0x8]; + + u8 port_rcv_errors[0x10]; + + u8 
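/*
 * A minimal usage sketch for the counter-group layouts above (not taken from
 * the driver sources): 64-bit counters such as symbol_errors are split into
 * _high/_low dword pairs, so a reader recombines them explicitly. Assumes the
 * MLX5_GET() accessor from include/linux/mlx5/device.h; the helper name and
 * the "cnt_set" buffer (a firmware-returned mlx5_ifc_phys_layer_cntrs_bits
 * image) are illustrative only.
 */
#include <linux/mlx5/device.h>
#include <linux/mlx5/mlx5_ifc.h>

static inline u64 example_phys_symbol_errors(void *cnt_set)
{
	/* hypothetical helper: join the high/low halves into one u64 */
	return ((u64)MLX5_GET(phys_layer_cntrs, cnt_set, symbol_errors_high) << 32) |
	       MLX5_GET(phys_layer_cntrs, cnt_set, symbol_errors_low);
}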
port_rcv_remote_physical_errors[0x10]; + + u8 port_rcv_switch_relay_errors[0x10]; + + u8 port_xmit_discards[0x10]; + + u8 port_xmit_constraint_errors[0x8]; + + u8 port_rcv_constraint_errors[0x8]; + + u8 reserved_at_70[0x8]; + + u8 link_overrun_errors[0x8]; + + u8 reserved_at_80[0x10]; + + u8 vl_15_dropped[0x10]; + + u8 reserved_at_a0[0x80]; + + u8 port_xmit_wait[0x20]; +}; + +struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits { + u8 transmit_queue_high[0x20]; + + u8 transmit_queue_low[0x20]; + + u8 reserved_at_40[0x780]; +}; + +struct mlx5_ifc_eth_per_prio_grp_data_layout_bits { + u8 rx_octets_high[0x20]; + + u8 rx_octets_low[0x20]; + + u8 reserved_at_40[0xc0]; + + u8 rx_frames_high[0x20]; + + u8 rx_frames_low[0x20]; + + u8 tx_octets_high[0x20]; + + u8 tx_octets_low[0x20]; + + u8 reserved_at_180[0xc0]; + + u8 tx_frames_high[0x20]; + + u8 tx_frames_low[0x20]; + + u8 rx_pause_high[0x20]; + + u8 rx_pause_low[0x20]; + + u8 rx_pause_duration_high[0x20]; + + u8 rx_pause_duration_low[0x20]; + + u8 tx_pause_high[0x20]; + + u8 tx_pause_low[0x20]; + + u8 tx_pause_duration_high[0x20]; + + u8 tx_pause_duration_low[0x20]; + + u8 rx_pause_transition_high[0x20]; + + u8 rx_pause_transition_low[0x20]; + + u8 reserved_at_3c0[0x40]; + + u8 device_stall_minor_watermark_cnt_high[0x20]; + + u8 device_stall_minor_watermark_cnt_low[0x20]; + + u8 device_stall_critical_watermark_cnt_high[0x20]; + + u8 device_stall_critical_watermark_cnt_low[0x20]; + + u8 reserved_at_480[0x340]; +}; + +struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits { + u8 port_transmit_wait_high[0x20]; + + u8 port_transmit_wait_low[0x20]; + + u8 reserved_at_40[0x100]; + + u8 rx_buffer_almost_full_high[0x20]; + + u8 rx_buffer_almost_full_low[0x20]; + + u8 rx_buffer_full_high[0x20]; + + u8 rx_buffer_full_low[0x20]; + + u8 reserved_at_1c0[0x600]; +}; + +struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits { + u8 dot3stats_alignment_errors_high[0x20]; + + u8 dot3stats_alignment_errors_low[0x20]; + + u8 dot3stats_fcs_errors_high[0x20]; + + u8 dot3stats_fcs_errors_low[0x20]; + + u8 dot3stats_single_collision_frames_high[0x20]; + + u8 dot3stats_single_collision_frames_low[0x20]; + + u8 dot3stats_multiple_collision_frames_high[0x20]; + + u8 dot3stats_multiple_collision_frames_low[0x20]; + + u8 dot3stats_sqe_test_errors_high[0x20]; + + u8 dot3stats_sqe_test_errors_low[0x20]; + + u8 dot3stats_deferred_transmissions_high[0x20]; + + u8 dot3stats_deferred_transmissions_low[0x20]; + + u8 dot3stats_late_collisions_high[0x20]; + + u8 dot3stats_late_collisions_low[0x20]; + + u8 dot3stats_excessive_collisions_high[0x20]; + + u8 dot3stats_excessive_collisions_low[0x20]; + + u8 dot3stats_internal_mac_transmit_errors_high[0x20]; + + u8 dot3stats_internal_mac_transmit_errors_low[0x20]; + + u8 dot3stats_carrier_sense_errors_high[0x20]; + + u8 dot3stats_carrier_sense_errors_low[0x20]; + + u8 dot3stats_frame_too_longs_high[0x20]; + + u8 dot3stats_frame_too_longs_low[0x20]; + + u8 dot3stats_internal_mac_receive_errors_high[0x20]; + + u8 dot3stats_internal_mac_receive_errors_low[0x20]; + + u8 dot3stats_symbol_errors_high[0x20]; + + u8 dot3stats_symbol_errors_low[0x20]; + + u8 dot3control_in_unknown_opcodes_high[0x20]; + + u8 dot3control_in_unknown_opcodes_low[0x20]; + + u8 dot3in_pause_frames_high[0x20]; + + u8 dot3in_pause_frames_low[0x20]; + + u8 dot3out_pause_frames_high[0x20]; + + u8 dot3out_pause_frames_low[0x20]; + + u8 reserved_at_400[0x3c0]; +}; + +struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits { + u8 ether_stats_drop_events_high[0x20]; + + u8 
ether_stats_drop_events_low[0x20]; + + u8 ether_stats_octets_high[0x20]; + + u8 ether_stats_octets_low[0x20]; + + u8 ether_stats_pkts_high[0x20]; + + u8 ether_stats_pkts_low[0x20]; + + u8 ether_stats_broadcast_pkts_high[0x20]; + + u8 ether_stats_broadcast_pkts_low[0x20]; + + u8 ether_stats_multicast_pkts_high[0x20]; + + u8 ether_stats_multicast_pkts_low[0x20]; + + u8 ether_stats_crc_align_errors_high[0x20]; + + u8 ether_stats_crc_align_errors_low[0x20]; + + u8 ether_stats_undersize_pkts_high[0x20]; + + u8 ether_stats_undersize_pkts_low[0x20]; + + u8 ether_stats_oversize_pkts_high[0x20]; + + u8 ether_stats_oversize_pkts_low[0x20]; + + u8 ether_stats_fragments_high[0x20]; + + u8 ether_stats_fragments_low[0x20]; + + u8 ether_stats_jabbers_high[0x20]; + + u8 ether_stats_jabbers_low[0x20]; + + u8 ether_stats_collisions_high[0x20]; + + u8 ether_stats_collisions_low[0x20]; + + u8 ether_stats_pkts64octets_high[0x20]; + + u8 ether_stats_pkts64octets_low[0x20]; + + u8 ether_stats_pkts65to127octets_high[0x20]; + + u8 ether_stats_pkts65to127octets_low[0x20]; + + u8 ether_stats_pkts128to255octets_high[0x20]; + + u8 ether_stats_pkts128to255octets_low[0x20]; + + u8 ether_stats_pkts256to511octets_high[0x20]; + + u8 ether_stats_pkts256to511octets_low[0x20]; + + u8 ether_stats_pkts512to1023octets_high[0x20]; + + u8 ether_stats_pkts512to1023octets_low[0x20]; + + u8 ether_stats_pkts1024to1518octets_high[0x20]; + + u8 ether_stats_pkts1024to1518octets_low[0x20]; + + u8 ether_stats_pkts1519to2047octets_high[0x20]; + + u8 ether_stats_pkts1519to2047octets_low[0x20]; + + u8 ether_stats_pkts2048to4095octets_high[0x20]; + + u8 ether_stats_pkts2048to4095octets_low[0x20]; + + u8 ether_stats_pkts4096to8191octets_high[0x20]; + + u8 ether_stats_pkts4096to8191octets_low[0x20]; + + u8 ether_stats_pkts8192to10239octets_high[0x20]; + + u8 ether_stats_pkts8192to10239octets_low[0x20]; + + u8 reserved_at_540[0x280]; +}; + +struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits { + u8 if_in_octets_high[0x20]; + + u8 if_in_octets_low[0x20]; + + u8 if_in_ucast_pkts_high[0x20]; + + u8 if_in_ucast_pkts_low[0x20]; + + u8 if_in_discards_high[0x20]; + + u8 if_in_discards_low[0x20]; + + u8 if_in_errors_high[0x20]; + + u8 if_in_errors_low[0x20]; + + u8 if_in_unknown_protos_high[0x20]; + + u8 if_in_unknown_protos_low[0x20]; + + u8 if_out_octets_high[0x20]; + + u8 if_out_octets_low[0x20]; + + u8 if_out_ucast_pkts_high[0x20]; + + u8 if_out_ucast_pkts_low[0x20]; + + u8 if_out_discards_high[0x20]; + + u8 if_out_discards_low[0x20]; + + u8 if_out_errors_high[0x20]; + + u8 if_out_errors_low[0x20]; + + u8 if_in_multicast_pkts_high[0x20]; + + u8 if_in_multicast_pkts_low[0x20]; + + u8 if_in_broadcast_pkts_high[0x20]; + + u8 if_in_broadcast_pkts_low[0x20]; + + u8 if_out_multicast_pkts_high[0x20]; + + u8 if_out_multicast_pkts_low[0x20]; + + u8 if_out_broadcast_pkts_high[0x20]; + + u8 if_out_broadcast_pkts_low[0x20]; + + u8 reserved_at_340[0x480]; +}; + +struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits { + u8 a_frames_transmitted_ok_high[0x20]; + + u8 a_frames_transmitted_ok_low[0x20]; + + u8 a_frames_received_ok_high[0x20]; + + u8 a_frames_received_ok_low[0x20]; + + u8 a_frame_check_sequence_errors_high[0x20]; + + u8 a_frame_check_sequence_errors_low[0x20]; + + u8 a_alignment_errors_high[0x20]; + + u8 a_alignment_errors_low[0x20]; + + u8 a_octets_transmitted_ok_high[0x20]; + + u8 a_octets_transmitted_ok_low[0x20]; + + u8 a_octets_received_ok_high[0x20]; + + u8 a_octets_received_ok_low[0x20]; + + u8 a_multicast_frames_xmitted_ok_high[0x20]; + + u8 
a_multicast_frames_xmitted_ok_low[0x20]; + + u8 a_broadcast_frames_xmitted_ok_high[0x20]; + + u8 a_broadcast_frames_xmitted_ok_low[0x20]; + + u8 a_multicast_frames_received_ok_high[0x20]; + + u8 a_multicast_frames_received_ok_low[0x20]; + + u8 a_broadcast_frames_received_ok_high[0x20]; + + u8 a_broadcast_frames_received_ok_low[0x20]; + + u8 a_in_range_length_errors_high[0x20]; + + u8 a_in_range_length_errors_low[0x20]; + + u8 a_out_of_range_length_field_high[0x20]; + + u8 a_out_of_range_length_field_low[0x20]; + + u8 a_frame_too_long_errors_high[0x20]; + + u8 a_frame_too_long_errors_low[0x20]; + + u8 a_symbol_error_during_carrier_high[0x20]; + + u8 a_symbol_error_during_carrier_low[0x20]; + + u8 a_mac_control_frames_transmitted_high[0x20]; + + u8 a_mac_control_frames_transmitted_low[0x20]; + + u8 a_mac_control_frames_received_high[0x20]; + + u8 a_mac_control_frames_received_low[0x20]; + + u8 a_unsupported_opcodes_received_high[0x20]; + + u8 a_unsupported_opcodes_received_low[0x20]; + + u8 a_pause_mac_ctrl_frames_received_high[0x20]; + + u8 a_pause_mac_ctrl_frames_received_low[0x20]; + + u8 a_pause_mac_ctrl_frames_transmitted_high[0x20]; + + u8 a_pause_mac_ctrl_frames_transmitted_low[0x20]; + + u8 reserved_at_4c0[0x300]; +}; + +struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits { + u8 life_time_counter_high[0x20]; + + u8 life_time_counter_low[0x20]; + + u8 rx_errors[0x20]; + + u8 tx_errors[0x20]; + + u8 l0_to_recovery_eieos[0x20]; + + u8 l0_to_recovery_ts[0x20]; + + u8 l0_to_recovery_framing[0x20]; + + u8 l0_to_recovery_retrain[0x20]; + + u8 crc_error_dllp[0x20]; + + u8 crc_error_tlp[0x20]; + + u8 tx_overflow_buffer_pkt_high[0x20]; + + u8 tx_overflow_buffer_pkt_low[0x20]; + + u8 outbound_stalled_reads[0x20]; + + u8 outbound_stalled_writes[0x20]; + + u8 outbound_stalled_reads_events[0x20]; + + u8 outbound_stalled_writes_events[0x20]; + + u8 reserved_at_200[0x5c0]; +}; + +struct mlx5_ifc_cmd_inter_comp_event_bits { + u8 command_completion_vector[0x20]; + + u8 reserved_at_20[0xc0]; +}; + +struct mlx5_ifc_stall_vl_event_bits { + u8 reserved_at_0[0x18]; + u8 port_num[0x1]; + u8 reserved_at_19[0x3]; + u8 vl[0x4]; + + u8 reserved_at_20[0xa0]; +}; + +struct mlx5_ifc_db_bf_congestion_event_bits { + u8 event_subtype[0x8]; + u8 reserved_at_8[0x8]; + u8 congestion_level[0x8]; + u8 reserved_at_18[0x8]; + + u8 reserved_at_20[0xa0]; +}; + +struct mlx5_ifc_gpio_event_bits { + u8 reserved_at_0[0x60]; + + u8 gpio_event_hi[0x20]; + + u8 gpio_event_lo[0x20]; + + u8 reserved_at_a0[0x40]; +}; + +struct mlx5_ifc_port_state_change_event_bits { + u8 reserved_at_0[0x40]; + + u8 port_num[0x4]; + u8 reserved_at_44[0x1c]; + + u8 reserved_at_60[0x80]; +}; + +struct mlx5_ifc_dropped_packet_logged_bits { + u8 reserved_at_0[0xe0]; +}; + +enum { + MLX5_CQ_ERROR_SYNDROME_CQ_OVERRUN = 0x1, + MLX5_CQ_ERROR_SYNDROME_CQ_ACCESS_VIOLATION_ERROR = 0x2, +}; + +struct mlx5_ifc_cq_error_bits { + u8 reserved_at_0[0x8]; + u8 cqn[0x18]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x18]; + u8 syndrome[0x8]; + + u8 reserved_at_60[0x80]; +}; + +struct mlx5_ifc_rdma_page_fault_event_bits { + u8 bytes_committed[0x20]; + + u8 r_key[0x20]; + + u8 reserved_at_40[0x10]; + u8 packet_len[0x10]; + + u8 rdma_op_len[0x20]; + + u8 rdma_va[0x40]; + + u8 reserved_at_c0[0x5]; + u8 rdma[0x1]; + u8 write[0x1]; + u8 requestor[0x1]; + u8 qp_number[0x18]; +}; + +struct mlx5_ifc_wqe_associated_page_fault_event_bits { + u8 bytes_committed[0x20]; + + u8 reserved_at_20[0x10]; + u8 wqe_index[0x10]; + + u8 reserved_at_40[0x10]; + u8 len[0x10]; + + u8 
reserved_at_60[0x60]; + + u8 reserved_at_c0[0x5]; + u8 rdma[0x1]; + u8 write_read[0x1]; + u8 requestor[0x1]; + u8 qpn[0x18]; +}; + +struct mlx5_ifc_qp_events_bits { + u8 reserved_at_0[0xa0]; + + u8 type[0x8]; + u8 reserved_at_a8[0x18]; + + u8 reserved_at_c0[0x8]; + u8 qpn_rqn_sqn[0x18]; +}; + +struct mlx5_ifc_dct_events_bits { + u8 reserved_at_0[0xc0]; + + u8 reserved_at_c0[0x8]; + u8 dct_number[0x18]; +}; + +struct mlx5_ifc_comp_event_bits { + u8 reserved_at_0[0xc0]; + + u8 reserved_at_c0[0x8]; + u8 cq_number[0x18]; +}; + +enum { + MLX5_QPC_STATE_RST = 0x0, + MLX5_QPC_STATE_INIT = 0x1, + MLX5_QPC_STATE_RTR = 0x2, + MLX5_QPC_STATE_RTS = 0x3, + MLX5_QPC_STATE_SQER = 0x4, + MLX5_QPC_STATE_ERR = 0x6, + MLX5_QPC_STATE_SQD = 0x7, + MLX5_QPC_STATE_SUSPENDED = 0x9, +}; + +enum { + MLX5_QPC_ST_RC = 0x0, + MLX5_QPC_ST_UC = 0x1, + MLX5_QPC_ST_UD = 0x2, + MLX5_QPC_ST_XRC = 0x3, + MLX5_QPC_ST_DCI = 0x5, + MLX5_QPC_ST_QP0 = 0x7, + MLX5_QPC_ST_QP1 = 0x8, + MLX5_QPC_ST_RAW_DATAGRAM = 0x9, + MLX5_QPC_ST_REG_UMR = 0xc, +}; + +enum { + MLX5_QPC_PM_STATE_ARMED = 0x0, + MLX5_QPC_PM_STATE_REARM = 0x1, + MLX5_QPC_PM_STATE_RESERVED = 0x2, + MLX5_QPC_PM_STATE_MIGRATED = 0x3, +}; + +enum { + MLX5_QPC_OFFLOAD_TYPE_RNDV = 0x1, +}; + +enum { + MLX5_QPC_END_PADDING_MODE_SCATTER_AS_IS = 0x0, + MLX5_QPC_END_PADDING_MODE_PAD_TO_CACHE_LINE_ALIGNMENT = 0x1, +}; + +enum { + MLX5_QPC_MTU_256_BYTES = 0x1, + MLX5_QPC_MTU_512_BYTES = 0x2, + MLX5_QPC_MTU_1K_BYTES = 0x3, + MLX5_QPC_MTU_2K_BYTES = 0x4, + MLX5_QPC_MTU_4K_BYTES = 0x5, + MLX5_QPC_MTU_RAW_ETHERNET_QP = 0x7, +}; + +enum { + MLX5_QPC_ATOMIC_MODE_IB_SPEC = 0x1, + MLX5_QPC_ATOMIC_MODE_ONLY_8B = 0x2, + MLX5_QPC_ATOMIC_MODE_UP_TO_8B = 0x3, + MLX5_QPC_ATOMIC_MODE_UP_TO_16B = 0x4, + MLX5_QPC_ATOMIC_MODE_UP_TO_32B = 0x5, + MLX5_QPC_ATOMIC_MODE_UP_TO_64B = 0x6, + MLX5_QPC_ATOMIC_MODE_UP_TO_128B = 0x7, + MLX5_QPC_ATOMIC_MODE_UP_TO_256B = 0x8, +}; + +enum { + MLX5_QPC_CS_REQ_DISABLE = 0x0, + MLX5_QPC_CS_REQ_UP_TO_32B = 0x11, + MLX5_QPC_CS_REQ_UP_TO_64B = 0x22, +}; + +enum { + MLX5_QPC_CS_RES_DISABLE = 0x0, + MLX5_QPC_CS_RES_UP_TO_32B = 0x1, + MLX5_QPC_CS_RES_UP_TO_64B = 0x2, +}; + +struct mlx5_ifc_qpc_bits { + u8 state[0x4]; + u8 lag_tx_port_affinity[0x4]; + u8 st[0x8]; + u8 reserved_at_10[0x3]; + u8 pm_state[0x2]; + u8 reserved_at_15[0x3]; + u8 offload_type[0x4]; + u8 end_padding_mode[0x2]; + u8 reserved_at_1e[0x2]; + + u8 wq_signature[0x1]; + u8 block_lb_mc[0x1]; + u8 atomic_like_write_en[0x1]; + u8 latency_sensitive[0x1]; + u8 reserved_at_24[0x1]; + u8 drain_sigerr[0x1]; + u8 reserved_at_26[0x2]; + u8 pd[0x18]; + + u8 mtu[0x3]; + u8 log_msg_max[0x5]; + u8 reserved_at_48[0x1]; + u8 log_rq_size[0x4]; + u8 log_rq_stride[0x3]; + u8 no_sq[0x1]; + u8 log_sq_size[0x4]; + u8 reserved_at_55[0x6]; + u8 rlky[0x1]; + u8 ulp_stateless_offload_mode[0x4]; + + u8 counter_set_id[0x8]; + u8 uar_page[0x18]; + + u8 reserved_at_80[0x8]; + u8 user_index[0x18]; + + u8 reserved_at_a0[0x3]; + u8 log_page_size[0x5]; + u8 remote_qpn[0x18]; + + struct mlx5_ifc_ads_bits primary_address_path; + + struct mlx5_ifc_ads_bits secondary_address_path; + + u8 log_ack_req_freq[0x4]; + u8 reserved_at_384[0x4]; + u8 log_sra_max[0x3]; + u8 reserved_at_38b[0x2]; + u8 retry_count[0x3]; + u8 rnr_retry[0x3]; + u8 reserved_at_393[0x1]; + u8 fre[0x1]; + u8 cur_rnr_retry[0x3]; + u8 cur_retry_count[0x3]; + u8 reserved_at_39b[0x5]; + + u8 reserved_at_3a0[0x20]; + + u8 reserved_at_3c0[0x8]; + u8 next_send_psn[0x18]; + + u8 reserved_at_3e0[0x8]; + u8 cqn_snd[0x18]; + + u8 reserved_at_400[0x8]; + u8 deth_sqpn[0x18]; + + u8 
reserved_at_420[0x20]; + + u8 reserved_at_440[0x8]; + u8 last_acked_psn[0x18]; + + u8 reserved_at_460[0x8]; + u8 ssn[0x18]; + + u8 reserved_at_480[0x8]; + u8 log_rra_max[0x3]; + u8 reserved_at_48b[0x1]; + u8 atomic_mode[0x4]; + u8 rre[0x1]; + u8 rwe[0x1]; + u8 rae[0x1]; + u8 reserved_at_493[0x1]; + u8 page_offset[0x6]; + u8 reserved_at_49a[0x3]; + u8 cd_slave_receive[0x1]; + u8 cd_slave_send[0x1]; + u8 cd_master[0x1]; + + u8 reserved_at_4a0[0x3]; + u8 min_rnr_nak[0x5]; + u8 next_rcv_psn[0x18]; + + u8 reserved_at_4c0[0x8]; + u8 xrcd[0x18]; + + u8 reserved_at_4e0[0x8]; + u8 cqn_rcv[0x18]; + + u8 dbr_addr[0x40]; + + u8 q_key[0x20]; + + u8 reserved_at_560[0x5]; + u8 rq_type[0x3]; + u8 srqn_rmpn_xrqn[0x18]; + + u8 reserved_at_580[0x8]; + u8 rmsn[0x18]; + + u8 hw_sq_wqebb_counter[0x10]; + u8 sw_sq_wqebb_counter[0x10]; + + u8 hw_rq_counter[0x20]; + + u8 sw_rq_counter[0x20]; + + u8 reserved_at_600[0x20]; + + u8 reserved_at_620[0xf]; + u8 cgs[0x1]; + u8 cs_req[0x8]; + u8 cs_res[0x8]; + + u8 dc_access_key[0x40]; + + u8 reserved_at_680[0xc0]; +}; + +struct mlx5_ifc_roce_addr_layout_bits { + u8 source_l3_address[16][0x8]; + + u8 reserved_at_80[0x3]; + u8 vlan_valid[0x1]; + u8 vlan_id[0xc]; + u8 source_mac_47_32[0x10]; + + u8 source_mac_31_0[0x20]; + + u8 reserved_at_c0[0x14]; + u8 roce_l3_type[0x4]; + u8 roce_version[0x8]; + + u8 reserved_at_e0[0x20]; +}; + +union mlx5_ifc_hca_cap_union_bits { + struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap; + struct mlx5_ifc_odp_cap_bits odp_cap; + struct mlx5_ifc_atomic_caps_bits atomic_caps; + struct mlx5_ifc_roce_cap_bits roce_cap; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; + struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; + struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; + struct mlx5_ifc_e_switch_cap_bits e_switch_cap; + struct mlx5_ifc_vector_calc_cap_bits vector_calc_cap; + struct mlx5_ifc_qos_cap_bits qos_cap; + struct mlx5_ifc_fpga_cap_bits fpga_cap; + u8 reserved_at_0[0x8000]; +}; + +enum { + MLX5_FLOW_CONTEXT_ACTION_ALLOW = 0x1, + MLX5_FLOW_CONTEXT_ACTION_DROP = 0x2, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST = 0x4, + MLX5_FLOW_CONTEXT_ACTION_COUNT = 0x8, + MLX5_FLOW_CONTEXT_ACTION_ENCAP = 0x10, + MLX5_FLOW_CONTEXT_ACTION_DECAP = 0x20, + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR = 0x40, + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP = 0x80, + MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH = 0x100, +}; + +struct mlx5_ifc_vlan_bits { + u8 ethtype[0x10]; + u8 prio[0x3]; + u8 cfi[0x1]; + u8 vid[0xc]; +}; + +struct mlx5_ifc_flow_context_bits { + struct mlx5_ifc_vlan_bits push_vlan; + + u8 group_id[0x20]; + + u8 reserved_at_40[0x8]; + u8 flow_tag[0x18]; + + u8 reserved_at_60[0x10]; + u8 action[0x10]; + + u8 reserved_at_80[0x8]; + u8 destination_list_size[0x18]; + + u8 reserved_at_a0[0x8]; + u8 flow_counter_list_size[0x18]; + + u8 encap_id[0x20]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_100[0x100]; + + struct mlx5_ifc_fte_match_param_bits match_value; + + u8 reserved_at_1200[0x600]; + + union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits destination[0]; +}; + +enum { + MLX5_XRC_SRQC_STATE_GOOD = 0x0, + MLX5_XRC_SRQC_STATE_ERROR = 0x1, +}; + +struct mlx5_ifc_xrc_srqc_bits { + u8 state[0x4]; + u8 log_xrc_srq_size[0x4]; + u8 reserved_at_8[0x18]; + + u8 wq_signature[0x1]; + u8 cont_srq[0x1]; + u8 reserved_at_22[0x1]; + u8 rlky[0x1]; + u8 basic_cyclic_rcv_wqe[0x1]; + u8 log_rq_stride[0x3]; + u8 xrcd[0x18]; + + u8 page_offset[0x6]; + u8 reserved_at_46[0x2]; + u8 cqn[0x18]; + + u8 reserved_at_60[0x20]; + 
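/*
 * A minimal usage sketch for the qpc layout above, assuming the MLX5_GET()
 * accessor from include/linux/mlx5/device.h. "qpc" would typically point at
 * the qpc field of a QUERY_QP output buffer (see mlx5_ifc_query_qp_out_bits
 * later in this file); the helper name is illustrative only.
 */
#include <linux/mlx5/device.h>
#include <linux/mlx5/mlx5_ifc.h>

static inline bool example_qp_is_connected_rc(void *qpc)
{
	u8 state = MLX5_GET(qpc, qpc, state);	/* MLX5_QPC_STATE_* */
	u8 st    = MLX5_GET(qpc, qpc, st);	/* MLX5_QPC_ST_* */
	u32 rqpn = MLX5_GET(qpc, qpc, remote_qpn);

	return st == MLX5_QPC_ST_RC && state == MLX5_QPC_STATE_RTS && rqpn != 0;
}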
+ u8 user_index_equal_xrc_srqn[0x1]; + u8 reserved_at_81[0x1]; + u8 log_page_size[0x6]; + u8 user_index[0x18]; + + u8 reserved_at_a0[0x20]; + + u8 reserved_at_c0[0x8]; + u8 pd[0x18]; + + u8 lwm[0x10]; + u8 wqe_cnt[0x10]; + + u8 reserved_at_100[0x40]; + + u8 db_record_addr_h[0x20]; + + u8 db_record_addr_l[0x1e]; + u8 reserved_at_17e[0x2]; + + u8 reserved_at_180[0x80]; +}; + +struct mlx5_ifc_vnic_diagnostic_statistics_bits { + u8 counter_error_queues[0x20]; + + u8 total_error_queues[0x20]; + + u8 send_queue_priority_update_flow[0x20]; + + u8 reserved_at_60[0x20]; + + u8 nic_receive_steering_discard[0x40]; + + u8 receive_discard_vport_down[0x40]; + + u8 transmit_discard_vport_down[0x40]; + + u8 reserved_at_140[0xec0]; +}; + +struct mlx5_ifc_traffic_counter_bits { + u8 packets[0x40]; + + u8 octets[0x40]; +}; + +struct mlx5_ifc_tisc_bits { + u8 strict_lag_tx_port_affinity[0x1]; + u8 reserved_at_1[0x3]; + u8 lag_tx_port_affinity[0x04]; + + u8 reserved_at_8[0x4]; + u8 prio[0x4]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x100]; + + u8 reserved_at_120[0x8]; + u8 transport_domain[0x18]; + + u8 reserved_at_140[0x8]; + u8 underlay_qpn[0x18]; + u8 reserved_at_160[0x3a0]; +}; + +enum { + MLX5_TIRC_DISP_TYPE_DIRECT = 0x0, + MLX5_TIRC_DISP_TYPE_INDIRECT = 0x1, +}; + +enum { + MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO = 0x1, + MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO = 0x2, +}; + +enum { + MLX5_RX_HASH_FN_NONE = 0x0, + MLX5_RX_HASH_FN_INVERTED_XOR8 = 0x1, + MLX5_RX_HASH_FN_TOEPLITZ = 0x2, +}; + +enum { + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_ = 0x1, + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST_ = 0x2, +}; + +struct mlx5_ifc_tirc_bits { + u8 reserved_at_0[0x20]; + + u8 disp_type[0x4]; + u8 reserved_at_24[0x1c]; + + u8 reserved_at_40[0x40]; + + u8 reserved_at_80[0x4]; + u8 lro_timeout_period_usecs[0x10]; + u8 lro_enable_mask[0x4]; + u8 lro_max_ip_payload_size[0x8]; + + u8 reserved_at_a0[0x40]; + + u8 reserved_at_e0[0x8]; + u8 inline_rqn[0x18]; + + u8 rx_hash_symmetric[0x1]; + u8 reserved_at_101[0x1]; + u8 tunneled_offload_en[0x1]; + u8 reserved_at_103[0x5]; + u8 indirect_table[0x18]; + + u8 rx_hash_fn[0x4]; + u8 reserved_at_124[0x2]; + u8 self_lb_block[0x2]; + u8 transport_domain[0x18]; + + u8 rx_hash_toeplitz_key[10][0x20]; + + struct mlx5_ifc_rx_hash_field_select_bits rx_hash_field_selector_outer; + + struct mlx5_ifc_rx_hash_field_select_bits rx_hash_field_selector_inner; + + u8 reserved_at_2c0[0x4c0]; +}; + +enum { + MLX5_SRQC_STATE_GOOD = 0x0, + MLX5_SRQC_STATE_ERROR = 0x1, +}; + +struct mlx5_ifc_srqc_bits { + u8 state[0x4]; + u8 log_srq_size[0x4]; + u8 reserved_at_8[0x18]; + + u8 wq_signature[0x1]; + u8 cont_srq[0x1]; + u8 reserved_at_22[0x1]; + u8 rlky[0x1]; + u8 reserved_at_24[0x1]; + u8 log_rq_stride[0x3]; + u8 xrcd[0x18]; + + u8 page_offset[0x6]; + u8 reserved_at_46[0x2]; + u8 cqn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 reserved_at_80[0x2]; + u8 log_page_size[0x6]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x20]; + + u8 reserved_at_c0[0x8]; + u8 pd[0x18]; + + u8 lwm[0x10]; + u8 wqe_cnt[0x10]; + + u8 reserved_at_100[0x40]; + + u8 dbr_addr[0x40]; + + u8 reserved_at_180[0x80]; +}; + +enum { + MLX5_SQC_STATE_RST = 0x0, + MLX5_SQC_STATE_RDY = 0x1, + MLX5_SQC_STATE_ERR = 0x3, +}; + +struct mlx5_ifc_sqc_bits { + u8 rlky[0x1]; + u8 cd_master[0x1]; + u8 fre[0x1]; + u8 flush_in_error_en[0x1]; + u8 allow_multi_pkt_send_wqe[0x1]; + u8 min_wqe_inline_mode[0x3]; + u8 state[0x4]; + u8 reg_umr[0x1]; + u8 allow_swp[0x1]; + u8 hairpin[0x1]; + u8 reserved_at_f[0x11]; + + u8 reserved_at_20[0x8]; + u8 
user_index[0x18]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + u8 reserved_at_60[0x8]; + u8 hairpin_peer_rq[0x18]; + + u8 reserved_at_80[0x10]; + u8 hairpin_peer_vhca[0x10]; + + u8 reserved_at_a0[0x50]; + + u8 packet_pacing_rate_limit_index[0x10]; + u8 tis_lst_sz[0x10]; + u8 reserved_at_110[0x10]; + + u8 reserved_at_120[0x40]; + + u8 reserved_at_160[0x8]; + u8 tis_num_0[0x18]; + + struct mlx5_ifc_wq_bits wq; +}; + +enum { + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR = 0x0, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT = 0x1, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC = 0x2, + SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC = 0x3, +}; + +struct mlx5_ifc_scheduling_context_bits { + u8 element_type[0x8]; + u8 reserved_at_8[0x18]; + + u8 element_attributes[0x20]; + + u8 parent_element_id[0x20]; + + u8 reserved_at_60[0x40]; + + u8 bw_share[0x20]; + + u8 max_average_bw[0x20]; + + u8 reserved_at_e0[0x120]; +}; + +struct mlx5_ifc_rqtc_bits { + u8 reserved_at_0[0xa0]; + + u8 reserved_at_a0[0x10]; + u8 rqt_max_size[0x10]; + + u8 reserved_at_c0[0x10]; + u8 rqt_actual_size[0x10]; + + u8 reserved_at_e0[0x6a0]; + + struct mlx5_ifc_rq_num_bits rq_num[0]; +}; + +enum { + MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE = 0x0, + MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_RMP = 0x1, +}; + +enum { + MLX5_RQC_STATE_RST = 0x0, + MLX5_RQC_STATE_RDY = 0x1, + MLX5_RQC_STATE_ERR = 0x3, +}; + +struct mlx5_ifc_rqc_bits { + u8 rlky[0x1]; + u8 delay_drop_en[0x1]; + u8 scatter_fcs[0x1]; + u8 vsd[0x1]; + u8 mem_rq_type[0x4]; + u8 state[0x4]; + u8 reserved_at_c[0x1]; + u8 flush_in_error_en[0x1]; + u8 hairpin[0x1]; + u8 reserved_at_f[0x11]; + + u8 reserved_at_20[0x8]; + u8 user_index[0x18]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + u8 counter_set_id[0x8]; + u8 reserved_at_68[0x18]; + + u8 reserved_at_80[0x8]; + u8 rmpn[0x18]; + + u8 reserved_at_a0[0x8]; + u8 hairpin_peer_sq[0x18]; + + u8 reserved_at_c0[0x10]; + u8 hairpin_peer_vhca[0x10]; + + u8 reserved_at_e0[0xa0]; + + struct mlx5_ifc_wq_bits wq; +}; + +enum { + MLX5_RMPC_STATE_RDY = 0x1, + MLX5_RMPC_STATE_ERR = 0x3, +}; + +struct mlx5_ifc_rmpc_bits { + u8 reserved_at_0[0x8]; + u8 state[0x4]; + u8 reserved_at_c[0x14]; + + u8 basic_cyclic_rcv_wqe[0x1]; + u8 reserved_at_21[0x1f]; + + u8 reserved_at_40[0x140]; + + struct mlx5_ifc_wq_bits wq; +}; + +struct mlx5_ifc_nic_vport_context_bits { + u8 reserved_at_0[0x5]; + u8 min_wqe_inline_mode[0x3]; + u8 reserved_at_8[0x15]; + u8 disable_mc_local_lb[0x1]; + u8 disable_uc_local_lb[0x1]; + u8 roce_en[0x1]; + + u8 arm_change_event[0x1]; + u8 reserved_at_21[0x1a]; + u8 event_on_mtu[0x1]; + u8 event_on_promisc_change[0x1]; + u8 event_on_vlan_change[0x1]; + u8 event_on_mc_address_change[0x1]; + u8 event_on_uc_address_change[0x1]; + + u8 reserved_at_40[0xc]; + + u8 affiliation_criteria[0x4]; + u8 affiliated_vhca_id[0x10]; + + u8 reserved_at_60[0xd0]; + + u8 mtu[0x10]; + + u8 system_image_guid[0x40]; + u8 port_guid[0x40]; + u8 node_guid[0x40]; + + u8 reserved_at_200[0x140]; + u8 qkey_violation_counter[0x10]; + u8 reserved_at_350[0x430]; + + u8 promisc_uc[0x1]; + u8 promisc_mc[0x1]; + u8 promisc_all[0x1]; + u8 reserved_at_783[0x2]; + u8 allowed_list_type[0x3]; + u8 reserved_at_788[0xc]; + u8 allowed_list_size[0xc]; + + struct mlx5_ifc_mac_address_layout_bits permanent_address; + + u8 reserved_at_7e0[0x20]; + + u8 current_uc_mac_address[0][0x40]; +}; + +enum { + MLX5_MKC_ACCESS_MODE_PA = 0x0, + MLX5_MKC_ACCESS_MODE_MTT = 0x1, + MLX5_MKC_ACCESS_MODE_KLMS = 0x2, + MLX5_MKC_ACCESS_MODE_KSM = 0x3, + MLX5_MKC_ACCESS_MODE_MEMIC = 0x5, +}; + +struct mlx5_ifc_mkc_bits { + 
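/*
 * A minimal sketch of filling the wq sub-structure embedded in the rqc (and
 * sqc) layouts above, assuming the MLX5_SET()/MLX5_SET64()/MLX5_ADDR_OF()
 * accessors from include/linux/mlx5/device.h. The helper and its parameter
 * names are illustrative only; a real caller would place this rqc inside a
 * CREATE_RQ command buffer defined elsewhere in this file.
 */
#include <linux/mlx5/device.h>
#include <linux/mlx5/mlx5_ifc.h>

static inline void example_init_rqc_wq(void *rqc, u32 pdn, u32 cqn,
					u8 log_wq_sz, u64 dbr_dma)
{
	/* reach the nested wq layout inside the rqc */
	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);

	MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE);
	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
	MLX5_SET(rqc, rqc, cqn, cqn);

	MLX5_SET(wq, wq, wq_type, MLX5_WQ_WQ_TYPE_WQ_CYCLIC);
	MLX5_SET(wq, wq, pd, pdn);
	MLX5_SET(wq, wq, log_wq_sz, log_wq_sz);
	MLX5_SET64(wq, wq, dbr_addr, dbr_dma);	/* 0x40-bit field */
}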
u8 reserved_at_0[0x1]; + u8 free[0x1]; + u8 reserved_at_2[0x1]; + u8 access_mode_4_2[0x3]; + u8 reserved_at_6[0x7]; + u8 relaxed_ordering_write[0x1]; + u8 reserved_at_e[0x1]; + u8 small_fence_on_rdma_read_response[0x1]; + u8 umr_en[0x1]; + u8 a[0x1]; + u8 rw[0x1]; + u8 rr[0x1]; + u8 lw[0x1]; + u8 lr[0x1]; + u8 access_mode_1_0[0x2]; + u8 reserved_at_18[0x8]; + + u8 qpn[0x18]; + u8 mkey_7_0[0x8]; + + u8 reserved_at_40[0x20]; + + u8 length64[0x1]; + u8 bsf_en[0x1]; + u8 sync_umr[0x1]; + u8 reserved_at_63[0x2]; + u8 expected_sigerr_count[0x1]; + u8 reserved_at_66[0x1]; + u8 en_rinval[0x1]; + u8 pd[0x18]; + + u8 start_addr[0x40]; + + u8 len[0x40]; + + u8 bsf_octword_size[0x20]; + + u8 reserved_at_120[0x80]; + + u8 translations_octword_size[0x20]; + + u8 reserved_at_1c0[0x1b]; + u8 log_page_size[0x5]; + + u8 reserved_at_1e0[0x20]; +}; + +struct mlx5_ifc_pkey_bits { + u8 reserved_at_0[0x10]; + u8 pkey[0x10]; +}; + +struct mlx5_ifc_array128_auto_bits { + u8 array128_auto[16][0x8]; +}; + +struct mlx5_ifc_hca_vport_context_bits { + u8 field_select[0x20]; + + u8 reserved_at_20[0xe0]; + + u8 sm_virt_aware[0x1]; + u8 has_smi[0x1]; + u8 has_raw[0x1]; + u8 grh_required[0x1]; + u8 reserved_at_104[0xc]; + u8 port_physical_state[0x4]; + u8 vport_state_policy[0x4]; + u8 port_state[0x4]; + u8 vport_state[0x4]; + + u8 reserved_at_120[0x20]; + + u8 system_image_guid[0x40]; + + u8 port_guid[0x40]; + + u8 node_guid[0x40]; + + u8 cap_mask1[0x20]; + + u8 cap_mask1_field_select[0x20]; + + u8 cap_mask2[0x20]; + + u8 cap_mask2_field_select[0x20]; + + u8 reserved_at_280[0x80]; + + u8 lid[0x10]; + u8 reserved_at_310[0x4]; + u8 init_type_reply[0x4]; + u8 lmc[0x3]; + u8 subnet_timeout[0x5]; + + u8 sm_lid[0x10]; + u8 sm_sl[0x4]; + u8 reserved_at_334[0xc]; + + u8 qkey_violation_counter[0x10]; + u8 pkey_violation_counter[0x10]; + + u8 reserved_at_360[0xca0]; +}; + +struct mlx5_ifc_esw_vport_context_bits { + u8 reserved_at_0[0x3]; + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert[0x2]; + u8 reserved_at_8[0x18]; + + u8 reserved_at_20[0x20]; + + u8 svlan_cfi[0x1]; + u8 svlan_pcp[0x3]; + u8 svlan_id[0xc]; + u8 cvlan_cfi[0x1]; + u8 cvlan_pcp[0x3]; + u8 cvlan_id[0xc]; + + u8 reserved_at_60[0x7a0]; +}; + +enum { + MLX5_EQC_STATUS_OK = 0x0, + MLX5_EQC_STATUS_EQ_WRITE_FAILURE = 0xa, +}; + +enum { + MLX5_EQC_ST_ARMED = 0x9, + MLX5_EQC_ST_FIRED = 0xa, +}; + +struct mlx5_ifc_eqc_bits { + u8 status[0x4]; + u8 reserved_at_4[0x9]; + u8 ec[0x1]; + u8 oi[0x1]; + u8 reserved_at_f[0x5]; + u8 st[0x4]; + u8 reserved_at_18[0x8]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x14]; + u8 page_offset[0x6]; + u8 reserved_at_5a[0x6]; + + u8 reserved_at_60[0x3]; + u8 log_eq_size[0x5]; + u8 uar_page[0x18]; + + u8 reserved_at_80[0x20]; + + u8 reserved_at_a0[0x18]; + u8 intr[0x8]; + + u8 reserved_at_c0[0x3]; + u8 log_page_size[0x5]; + u8 reserved_at_c8[0x18]; + + u8 reserved_at_e0[0x60]; + + u8 reserved_at_140[0x8]; + u8 consumer_counter[0x18]; + + u8 reserved_at_160[0x8]; + u8 producer_counter[0x18]; + + u8 reserved_at_180[0x80]; +}; + +enum { + MLX5_DCTC_STATE_ACTIVE = 0x0, + MLX5_DCTC_STATE_DRAINING = 0x1, + MLX5_DCTC_STATE_DRAINED = 0x2, +}; + +enum { + MLX5_DCTC_CS_RES_DISABLE = 0x0, + MLX5_DCTC_CS_RES_NA = 0x1, + MLX5_DCTC_CS_RES_UP_TO_64B = 0x2, +}; + +enum { + MLX5_DCTC_MTU_256_BYTES = 0x1, + MLX5_DCTC_MTU_512_BYTES = 0x2, + MLX5_DCTC_MTU_1K_BYTES = 0x3, + MLX5_DCTC_MTU_2K_BYTES = 0x4, + MLX5_DCTC_MTU_4K_BYTES = 0x5, +}; + +struct mlx5_ifc_dctc_bits { + u8 reserved_at_0[0x4]; + u8 
state[0x4]; + u8 reserved_at_8[0x18]; + + u8 reserved_at_20[0x8]; + u8 user_index[0x18]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + u8 counter_set_id[0x8]; + u8 atomic_mode[0x4]; + u8 rre[0x1]; + u8 rwe[0x1]; + u8 rae[0x1]; + u8 atomic_like_write_en[0x1]; + u8 latency_sensitive[0x1]; + u8 rlky[0x1]; + u8 free_ar[0x1]; + u8 reserved_at_73[0xd]; + + u8 reserved_at_80[0x8]; + u8 cs_res[0x8]; + u8 reserved_at_90[0x3]; + u8 min_rnr_nak[0x5]; + u8 reserved_at_98[0x8]; + + u8 reserved_at_a0[0x8]; + u8 srqn_xrqn[0x18]; + + u8 reserved_at_c0[0x8]; + u8 pd[0x18]; + + u8 tclass[0x8]; + u8 reserved_at_e8[0x4]; + u8 flow_label[0x14]; + + u8 dc_access_key[0x40]; + + u8 reserved_at_140[0x5]; + u8 mtu[0x3]; + u8 port[0x8]; + u8 pkey_index[0x10]; + + u8 reserved_at_160[0x8]; + u8 my_addr_index[0x8]; + u8 reserved_at_170[0x8]; + u8 hop_limit[0x8]; + + u8 dc_access_key_violation_count[0x20]; + + u8 reserved_at_1a0[0x14]; + u8 dei_cfi[0x1]; + u8 eth_prio[0x3]; + u8 ecn[0x2]; + u8 dscp[0x6]; + + u8 reserved_at_1c0[0x40]; +}; + +enum { + MLX5_CQC_STATUS_OK = 0x0, + MLX5_CQC_STATUS_CQ_OVERFLOW = 0x9, + MLX5_CQC_STATUS_CQ_WRITE_FAIL = 0xa, +}; + +enum { + MLX5_CQC_CQE_SZ_64_BYTES = 0x0, + MLX5_CQC_CQE_SZ_128_BYTES = 0x1, +}; + +enum { + MLX5_CQC_ST_SOLICITED_NOTIFICATION_REQUEST_ARMED = 0x6, + MLX5_CQC_ST_NOTIFICATION_REQUEST_ARMED = 0x9, + MLX5_CQC_ST_FIRED = 0xa, +}; + +enum { + MLX5_CQ_PERIOD_MODE_START_FROM_EQE = 0x0, + MLX5_CQ_PERIOD_MODE_START_FROM_CQE = 0x1, + MLX5_CQ_PERIOD_NUM_MODES +}; + +struct mlx5_ifc_cqc_bits { + u8 status[0x4]; + u8 reserved_at_4[0x4]; + u8 cqe_sz[0x3]; + u8 cc[0x1]; + u8 reserved_at_c[0x1]; + u8 scqe_break_moderation_en[0x1]; + u8 oi[0x1]; + u8 cq_period_mode[0x2]; + u8 cqe_comp_en[0x1]; + u8 mini_cqe_res_format[0x2]; + u8 st[0x4]; + u8 reserved_at_18[0x8]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x14]; + u8 page_offset[0x6]; + u8 reserved_at_5a[0x6]; + + u8 reserved_at_60[0x3]; + u8 log_cq_size[0x5]; + u8 uar_page[0x18]; + + u8 reserved_at_80[0x4]; + u8 cq_period[0xc]; + u8 cq_max_count[0x10]; + + u8 reserved_at_a0[0x18]; + u8 c_eqn[0x8]; + + u8 reserved_at_c0[0x3]; + u8 log_page_size[0x5]; + u8 reserved_at_c8[0x18]; + + u8 reserved_at_e0[0x20]; + + u8 reserved_at_100[0x8]; + u8 last_notified_index[0x18]; + + u8 reserved_at_120[0x8]; + u8 last_solicit_index[0x18]; + + u8 reserved_at_140[0x8]; + u8 consumer_counter[0x18]; + + u8 reserved_at_160[0x8]; + u8 producer_counter[0x18]; + + u8 reserved_at_180[0x40]; + + u8 dbr_addr[0x40]; +}; + +union mlx5_ifc_cong_control_roce_ecn_auto_bits { + struct mlx5_ifc_cong_control_802_1qau_rp_bits cong_control_802_1qau_rp; + struct mlx5_ifc_cong_control_r_roce_ecn_rp_bits cong_control_r_roce_ecn_rp; + struct mlx5_ifc_cong_control_r_roce_ecn_np_bits cong_control_r_roce_ecn_np; + u8 reserved_at_0[0x800]; +}; + +struct mlx5_ifc_query_adapter_param_block_bits { + u8 reserved_at_0[0xc0]; + + u8 reserved_at_c0[0x8]; + u8 ieee_vendor_id[0x18]; + + u8 reserved_at_e0[0x10]; + u8 vsd_vendor_id[0x10]; + + u8 vsd[208][0x8]; + + u8 vsd_contd_psid[16][0x8]; +}; + +enum { + MLX5_XRQC_STATE_GOOD = 0x0, + MLX5_XRQC_STATE_ERROR = 0x1, +}; + +enum { + MLX5_XRQC_TOPOLOGY_NO_SPECIAL_TOPOLOGY = 0x0, + MLX5_XRQC_TOPOLOGY_TAG_MATCHING = 0x1, +}; + +enum { + MLX5_XRQC_OFFLOAD_RNDV = 0x1, +}; + +struct mlx5_ifc_tag_matching_topology_context_bits { + u8 log_matching_list_sz[0x4]; + u8 reserved_at_4[0xc]; + u8 append_next_index[0x10]; + + u8 sw_phase_cnt[0x10]; + u8 hw_phase_cnt[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_xrqc_bits { + u8 
state[0x4]; + u8 rlkey[0x1]; + u8 reserved_at_5[0xf]; + u8 topology[0x4]; + u8 reserved_at_18[0x4]; + u8 offload[0x4]; + + u8 reserved_at_20[0x8]; + u8 user_index[0x18]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + u8 reserved_at_60[0xa0]; + + struct mlx5_ifc_tag_matching_topology_context_bits tag_matching_topology_context; + + u8 reserved_at_180[0x280]; + + struct mlx5_ifc_wq_bits wq; +}; + +union mlx5_ifc_modify_field_select_resize_field_select_auto_bits { + struct mlx5_ifc_modify_field_select_bits modify_field_select; + struct mlx5_ifc_resize_field_select_bits resize_field_select; + u8 reserved_at_0[0x20]; +}; + +union mlx5_ifc_field_select_802_1_r_roce_auto_bits { + struct mlx5_ifc_field_select_802_1qau_rp_bits field_select_802_1qau_rp; + struct mlx5_ifc_field_select_r_roce_rp_bits field_select_r_roce_rp; + struct mlx5_ifc_field_select_r_roce_np_bits field_select_r_roce_np; + u8 reserved_at_0[0x20]; +}; + +union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { + struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits eth_802_3_cntrs_grp_data_layout; + struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits eth_2863_cntrs_grp_data_layout; + struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; + struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits eth_3635_cntrs_grp_data_layout; + struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout; + struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout; + struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout; + struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout; + struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; + struct mlx5_ifc_phys_layer_statistical_cntrs_bits phys_layer_statistical_cntrs; + u8 reserved_at_0[0x7c0]; +}; + +union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits { + struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits pcie_perf_cntrs_grp_data_layout; + u8 reserved_at_0[0x7c0]; +}; + +union mlx5_ifc_event_auto_bits { + struct mlx5_ifc_comp_event_bits comp_event; + struct mlx5_ifc_dct_events_bits dct_events; + struct mlx5_ifc_qp_events_bits qp_events; + struct mlx5_ifc_wqe_associated_page_fault_event_bits wqe_associated_page_fault_event; + struct mlx5_ifc_rdma_page_fault_event_bits rdma_page_fault_event; + struct mlx5_ifc_cq_error_bits cq_error; + struct mlx5_ifc_dropped_packet_logged_bits dropped_packet_logged; + struct mlx5_ifc_port_state_change_event_bits port_state_change_event; + struct mlx5_ifc_gpio_event_bits gpio_event; + struct mlx5_ifc_db_bf_congestion_event_bits db_bf_congestion_event; + struct mlx5_ifc_stall_vl_event_bits stall_vl_event; + struct mlx5_ifc_cmd_inter_comp_event_bits cmd_inter_comp_event; + u8 reserved_at_0[0xe0]; +}; + +struct mlx5_ifc_health_buffer_bits { + u8 reserved_at_0[0x100]; + + u8 assert_existptr[0x20]; + + u8 assert_callra[0x20]; + + u8 reserved_at_140[0x40]; + + u8 fw_version[0x20]; + + u8 hw_id[0x20]; + + u8 reserved_at_1c0[0x20]; + + u8 irisc_index[0x8]; + u8 synd[0x8]; + u8 ext_synd[0x10]; +}; + +struct mlx5_ifc_register_loopback_control_bits { + u8 no_lb[0x1]; + u8 reserved_at_1[0x7]; + u8 port[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x60]; +}; + +struct mlx5_ifc_vport_tc_element_bits { + u8 traffic_class[0x4]; + u8 reserved_at_4[0xc]; + u8 vport_number[0x10]; +}; + +struct mlx5_ifc_vport_element_bits { + u8 reserved_at_0[0x10]; + u8 vport_number[0x10]; +}; + +enum { + TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0, + 
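/*
 * A minimal sketch of decoding a raw health-buffer image with the
 * health_buffer layout above, assuming MLX5_GET() from
 * include/linux/mlx5/device.h. The driver itself reads the live buffer
 * through its BAR mapping; this only shows how the bit layout maps to
 * fields. The helper name is illustrative only.
 */
#include <linux/printk.h>
#include <linux/mlx5/device.h>
#include <linux/mlx5/mlx5_ifc.h>

static inline void example_print_health(void *hbuf)
{
	pr_info("synd 0x%x ext_synd 0x%x irisc %u fw_ver 0x%x hw_id 0x%x\n",
		MLX5_GET(health_buffer, hbuf, synd),
		MLX5_GET(health_buffer, hbuf, ext_synd),
		MLX5_GET(health_buffer, hbuf, irisc_index),
		MLX5_GET(health_buffer, hbuf, fw_version),
		MLX5_GET(health_buffer, hbuf, hw_id));
}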
TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1, + TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2, +}; + +struct mlx5_ifc_tsar_element_bits { + u8 reserved_at_0[0x8]; + u8 tsar_type[0x8]; + u8 reserved_at_10[0x10]; +}; + +enum { + MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_SUCCESS = 0x0, + MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL = 0x1, +}; + +struct mlx5_ifc_teardown_hca_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x3f]; + + u8 force_state[0x1]; +}; + +enum { + MLX5_TEARDOWN_HCA_IN_PROFILE_GRACEFUL_CLOSE = 0x0, + MLX5_TEARDOWN_HCA_IN_PROFILE_FORCE_CLOSE = 0x1, +}; + +struct mlx5_ifc_teardown_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 profile[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_sqerr2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_sqerr2rts_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_sqd2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_sqd2rts_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_set_roce_address_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_roce_address_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 roce_address_index[0x10]; + u8 reserved_at_50[0xc]; + u8 vhca_port_num[0x4]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_roce_addr_layout_bits roce_address; +}; + +struct mlx5_ifc_set_mad_demux_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum { + MLX5_SET_MAD_DEMUX_IN_DEMUX_MODE_PASS_ALL = 0x0, + MLX5_SET_MAD_DEMUX_IN_DEMUX_MODE_SELECTIVE = 0x2, +}; + +struct mlx5_ifc_set_mad_demux_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x6]; + u8 demux_mode[0x2]; + u8 reserved_at_68[0x18]; +}; + +struct mlx5_ifc_set_l2_table_entry_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_l2_table_entry_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x60]; + + u8 reserved_at_a0[0x8]; + u8 table_index[0x18]; + + u8 reserved_at_c0[0x20]; + + u8 reserved_at_e0[0x13]; + u8 vlan_valid[0x1]; + u8 vlan[0xc]; + + struct mlx5_ifc_mac_address_layout_bits mac_address; + + u8 reserved_at_140[0xc0]; +}; + +struct mlx5_ifc_set_issi_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_issi_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 
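/*
 * A minimal sketch of issuing TEARDOWN_HCA with the in/out layouts above,
 * assuming mlx5_cmd_exec() from include/linux/mlx5/driver.h and the
 * MLX5_CMD_OP_TEARDOWN_HCA opcode from the command-opcode enum earlier in
 * this file. The helper name is illustrative only.
 */
#include <linux/mlx5/driver.h>

static inline int example_teardown_hca(struct mlx5_core_dev *dev)
{
	u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0};
	u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0};

	MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA);
	MLX5_SET(teardown_hca_in, in, profile,
		 MLX5_TEARDOWN_HCA_IN_PROFILE_GRACEFUL_CLOSE);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}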
op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 current_issi[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_set_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + union mlx5_ifc_hca_cap_union_bits capability; +}; + +enum { + MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION = 0x0, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_TAG = 0x1, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST = 0x2, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS = 0x3 +}; + +struct mlx5_ifc_set_fte_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_fte_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x18]; + u8 modify_enable_mask[0x8]; + + u8 reserved_at_e0[0x20]; + + u8 flow_index[0x20]; + + u8 reserved_at_120[0xe0]; + + struct mlx5_ifc_flow_context_bits flow_context; +}; + +struct mlx5_ifc_rts2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_rts2rts_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_rtr2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_rtr2rts_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_rst2init_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_rst2init_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_query_xrq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_xrqc_bits xrq_context; +}; + +struct mlx5_ifc_query_xrq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 xrqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_xrc_srq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_xrc_srqc_bits xrc_srq_context_entry; + + u8 reserved_at_280[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_xrc_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 
op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 xrc_srqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +enum { + MLX5_QUERY_VPORT_STATE_OUT_STATE_DOWN = 0x0, + MLX5_QUERY_VPORT_STATE_OUT_STATE_UP = 0x1, +}; + +struct mlx5_ifc_query_vport_state_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x18]; + u8 admin_state[0x4]; + u8 state[0x4]; +}; + +enum { + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT = 0x0, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT = 0x1, +}; + +struct mlx5_ifc_query_vport_state_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_vnic_env_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_vnic_diagnostic_statistics_bits vport_env; +}; + +enum { + MLX5_QUERY_VNIC_ENV_IN_OP_MOD_VPORT_DIAG_STATISTICS = 0x0, +}; + +struct mlx5_ifc_query_vnic_env_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_vport_counter_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_traffic_counter_bits received_errors; + + struct mlx5_ifc_traffic_counter_bits transmit_errors; + + struct mlx5_ifc_traffic_counter_bits received_ib_unicast; + + struct mlx5_ifc_traffic_counter_bits transmitted_ib_unicast; + + struct mlx5_ifc_traffic_counter_bits received_ib_multicast; + + struct mlx5_ifc_traffic_counter_bits transmitted_ib_multicast; + + struct mlx5_ifc_traffic_counter_bits received_eth_broadcast; + + struct mlx5_ifc_traffic_counter_bits transmitted_eth_broadcast; + + struct mlx5_ifc_traffic_counter_bits received_eth_unicast; + + struct mlx5_ifc_traffic_counter_bits transmitted_eth_unicast; + + struct mlx5_ifc_traffic_counter_bits received_eth_multicast; + + struct mlx5_ifc_traffic_counter_bits transmitted_eth_multicast; + + u8 reserved_at_680[0xa00]; +}; + +enum { + MLX5_QUERY_VPORT_COUNTER_IN_OP_MOD_VPORT_COUNTERS = 0x0, +}; + +struct mlx5_ifc_query_vport_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xb]; + u8 port_num[0x4]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x60]; + + u8 clear[0x1]; + u8 reserved_at_c1[0x1f]; + + u8 reserved_at_e0[0x20]; +}; + +struct mlx5_ifc_query_tis_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_tisc_bits tis_context; +}; + +struct mlx5_ifc_query_tis_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 tisn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_tir_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_tirc_bits tir_context; +}; + +struct mlx5_ifc_query_tir_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 tirn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_srq_out_bits { + u8 status[0x8]; + u8 
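/*
 * A minimal sketch of QUERY_VPORT_STATE built on the layouts above, assuming
 * mlx5_cmd_exec() from include/linux/mlx5/driver.h. The returned state is one
 * of MLX5_QUERY_VPORT_STATE_OUT_STATE_*; the helper name is illustrative only.
 */
#include <linux/mlx5/driver.h>

static inline int example_query_own_vport_state(struct mlx5_core_dev *dev,
						u8 *state)
{
	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {0};
	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {0};
	int err;

	MLX5_SET(query_vport_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VPORT_STATE);
	MLX5_SET(query_vport_state_in, in, op_mod,
		 MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT);

	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
	if (!err)
		*state = MLX5_GET(query_vport_state_out, out, state);
	return err;
}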
reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_srqc_bits srq_context_entry; + + u8 reserved_at_280[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 srqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_sq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_sqc_bits sq_context; +}; + +struct mlx5_ifc_query_sq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 sqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_special_contexts_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 dump_fill_mkey[0x20]; + + u8 resd_lkey[0x20]; + + u8 null_mkey[0x20]; + + u8 reserved_at_a0[0x60]; +}; + +struct mlx5_ifc_query_special_contexts_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_query_scheduling_element_out_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_scheduling_context_bits scheduling_context; + + u8 reserved_at_300[0x100]; +}; + +enum { + SCHEDULING_HIERARCHY_E_SWITCH = 0x2, +}; + +struct mlx5_ifc_query_scheduling_element_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 scheduling_hierarchy[0x8]; + u8 reserved_at_48[0x18]; + + u8 scheduling_element_id[0x20]; + + u8 reserved_at_80[0x180]; +}; + +struct mlx5_ifc_query_rqt_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_rqtc_bits rqt_context; +}; + +struct mlx5_ifc_query_rqt_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 rqtn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_rq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_rqc_bits rq_context; +}; + +struct mlx5_ifc_query_rq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 rqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_roce_address_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_roce_addr_layout_bits roce_address; +}; + +struct mlx5_ifc_query_roce_address_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 roce_address_index[0x10]; + u8 reserved_at_50[0xc]; + u8 vhca_port_num[0x4]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_rmp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_rmpc_bits rmp_context; +}; + +struct mlx5_ifc_query_rmp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 rmpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 
reserved_at_40[0x40]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_q_counter_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 rx_write_requests[0x20]; + + u8 reserved_at_a0[0x20]; + + u8 rx_read_requests[0x20]; + + u8 reserved_at_e0[0x20]; + + u8 rx_atomic_requests[0x20]; + + u8 reserved_at_120[0x20]; + + u8 rx_dct_connect[0x20]; + + u8 reserved_at_160[0x20]; + + u8 out_of_buffer[0x20]; + + u8 reserved_at_1a0[0x20]; + + u8 out_of_sequence[0x20]; + + u8 reserved_at_1e0[0x20]; + + u8 duplicate_request[0x20]; + + u8 reserved_at_220[0x20]; + + u8 rnr_nak_retry_err[0x20]; + + u8 reserved_at_260[0x20]; + + u8 packet_seq_err[0x20]; + + u8 reserved_at_2a0[0x20]; + + u8 implied_nak_seq_err[0x20]; + + u8 reserved_at_2e0[0x20]; + + u8 local_ack_timeout_err[0x20]; + + u8 reserved_at_320[0xa0]; + + u8 resp_local_length_error[0x20]; + + u8 req_local_length_error[0x20]; + + u8 resp_local_qp_error[0x20]; + + u8 local_operation_error[0x20]; + + u8 resp_local_protection[0x20]; + + u8 req_local_protection[0x20]; + + u8 resp_cqe_error[0x20]; + + u8 req_cqe_error[0x20]; + + u8 req_mw_binding[0x20]; + + u8 req_bad_response[0x20]; + + u8 req_remote_invalid_request[0x20]; + + u8 resp_remote_invalid_request[0x20]; + + u8 req_remote_access_errors[0x20]; + + u8 resp_remote_access_errors[0x20]; + + u8 req_remote_operation_errors[0x20]; + + u8 req_transport_retries_exceeded[0x20]; + + u8 cq_overflow[0x20]; + + u8 resp_cqe_flush_error[0x20]; + + u8 req_cqe_flush_error[0x20]; + + u8 reserved_at_620[0x1e0]; +}; + +struct mlx5_ifc_query_q_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x80]; + + u8 clear[0x1]; + u8 reserved_at_c1[0x1f]; + + u8 reserved_at_e0[0x18]; + u8 counter_set_id[0x8]; +}; + +struct mlx5_ifc_query_pages_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x10]; + u8 function_id[0x10]; + + u8 num_pages[0x20]; +}; + +enum { + MLX5_QUERY_PAGES_IN_OP_MOD_BOOT_PAGES = 0x1, + MLX5_QUERY_PAGES_IN_OP_MOD_INIT_PAGES = 0x2, + MLX5_QUERY_PAGES_IN_OP_MOD_REGULAR_PAGES = 0x3, +}; + +struct mlx5_ifc_query_pages_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_nic_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_nic_vport_context_bits nic_vport_context; +}; + +struct mlx5_ifc_query_nic_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x5]; + u8 allowed_list_type[0x3]; + u8 reserved_at_68[0x18]; +}; + +struct mlx5_ifc_query_mkey_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_mkc_bits memory_key_mkey_entry; + + u8 reserved_at_280[0x600]; + + u8 bsf0_klm0_pas_mtt0_1[16][0x8]; + + u8 
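/*
 * A minimal sketch of reading one counter from the QUERY_Q_COUNTER output
 * layout above, assuming mlx5_cmd_exec() from include/linux/mlx5/driver.h.
 * "counter_set_id" would be the id obtained from the ALLOC_Q_COUNTER command
 * defined elsewhere in this file; the helper name is illustrative only.
 */
#include <linux/mlx5/driver.h>

static inline int example_q_counter_out_of_buffer(struct mlx5_core_dev *dev,
						  u8 counter_set_id, u32 *val)
{
	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {0};
	u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {0};
	int err;

	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
	MLX5_SET(query_q_counter_in, in, counter_set_id, counter_set_id);

	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
	if (!err)
		*val = MLX5_GET(query_q_counter_out, out, out_of_buffer);
	return err;
}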
bsf1_klm1_pas_mtt2_3[16][0x8]; +}; + +struct mlx5_ifc_query_mkey_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 mkey_index[0x18]; + + u8 pg_access[0x1]; + u8 reserved_at_61[0x1f]; +}; + +struct mlx5_ifc_query_mad_demux_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 mad_dumux_parameters_block[0x20]; +}; + +struct mlx5_ifc_query_mad_demux_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_query_l2_table_entry_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0xa0]; + + u8 reserved_at_e0[0x13]; + u8 vlan_valid[0x1]; + u8 vlan[0xc]; + + struct mlx5_ifc_mac_address_layout_bits mac_address; + + u8 reserved_at_140[0xc0]; +}; + +struct mlx5_ifc_query_l2_table_entry_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x60]; + + u8 reserved_at_a0[0x8]; + u8 table_index[0x18]; + + u8 reserved_at_c0[0x140]; +}; + +struct mlx5_ifc_query_issi_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x10]; + u8 current_issi[0x10]; + + u8 reserved_at_60[0xa0]; + + u8 reserved_at_100[76][0x8]; + u8 supported_issi_dw0[0x20]; +}; + +struct mlx5_ifc_query_issi_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_driver_version_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_set_driver_version_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + u8 driver_version[64][0x8]; +}; + +struct mlx5_ifc_query_hca_vport_pkey_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_pkey_bits pkey[0]; +}; + +struct mlx5_ifc_query_hca_vport_pkey_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xb]; + u8 port_num[0x4]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x10]; + u8 pkey_index[0x10]; +}; + +enum { + MLX5_HCA_VPORT_SEL_PORT_GUID = 1 << 0, + MLX5_HCA_VPORT_SEL_NODE_GUID = 1 << 1, + MLX5_HCA_VPORT_SEL_STATE_POLICY = 1 << 2, +}; + +struct mlx5_ifc_query_hca_vport_gid_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; + + u8 gids_num[0x10]; + u8 reserved_at_70[0x10]; + + struct mlx5_ifc_array128_auto_bits gid[0]; +}; + +struct mlx5_ifc_query_hca_vport_gid_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xb]; + u8 port_num[0x4]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x10]; + u8 gid_index[0x10]; +}; + +struct mlx5_ifc_query_hca_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_hca_vport_context_bits hca_vport_context; +}; + +struct mlx5_ifc_query_hca_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xb]; + u8 
port_num[0x4]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + union mlx5_ifc_hca_cap_union_bits capability; +}; + +struct mlx5_ifc_query_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_query_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x80]; + + u8 reserved_at_c0[0x8]; + u8 level[0x8]; + u8 reserved_at_d0[0x8]; + u8 log_size[0x8]; + + u8 reserved_at_e0[0x120]; +}; + +struct mlx5_ifc_query_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x140]; +}; + +struct mlx5_ifc_query_fte_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x1c0]; + + struct mlx5_ifc_flow_context_bits flow_context; +}; + +struct mlx5_ifc_query_fte_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x40]; + + u8 flow_index[0x20]; + + u8 reserved_at_120[0xe0]; +}; + +enum { + MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, + MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, + MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, +}; + +struct mlx5_ifc_query_flow_group_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0xa0]; + + u8 start_flow_index[0x20]; + + u8 reserved_at_100[0x20]; + + u8 end_flow_index[0x20]; + + u8 reserved_at_140[0xa0]; + + u8 reserved_at_1e0[0x18]; + u8 match_criteria_enable[0x8]; + + struct mlx5_ifc_fte_match_param_bits match_criteria; + + u8 reserved_at_1200[0xe00]; +}; + +struct mlx5_ifc_query_flow_group_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 group_id[0x20]; + + u8 reserved_at_e0[0x120]; +}; + +struct mlx5_ifc_query_flow_counter_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_traffic_counter_bits flow_statistics[0]; +}; + +struct mlx5_ifc_query_flow_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x80]; + + u8 clear[0x1]; + u8 reserved_at_c1[0xf]; + u8 num_of_counters[0x10]; + + u8 flow_counter_id[0x20]; +}; + +struct mlx5_ifc_query_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + +struct mlx5_ifc_query_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_modify_esw_vport_context_out_bits { + u8 
status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_esw_vport_context_fields_select_bits { + u8 reserved_at_0[0x1c]; + u8 vport_cvlan_insert[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_strip[0x1]; +}; + +struct mlx5_ifc_modify_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + struct mlx5_ifc_esw_vport_context_fields_select_bits field_select; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + +struct mlx5_ifc_query_eq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_eqc_bits eq_context_entry; + + u8 reserved_at_280[0x40]; + + u8 event_bitmask[0x40]; + + u8 reserved_at_300[0x580]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_eq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x18]; + u8 eq_number[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_encap_header_in_bits { + u8 reserved_at_0[0x5]; + u8 header_type[0x3]; + u8 reserved_at_8[0xe]; + u8 encap_header_size[0xa]; + + u8 reserved_at_20[0x10]; + u8 encap_header[2][0x8]; + + u8 more_encap_header[0][0x8]; +}; + +struct mlx5_ifc_query_encap_header_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0xa0]; + + struct mlx5_ifc_encap_header_in_bits encap_header[0]; +}; + +struct mlx5_ifc_query_encap_header_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 encap_id[0x20]; + + u8 reserved_at_60[0xa0]; +}; + +struct mlx5_ifc_alloc_encap_header_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 encap_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_encap_header_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xa0]; + + struct mlx5_ifc_encap_header_in_bits encap_header; +}; + +struct mlx5_ifc_dealloc_encap_header_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_encap_header_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_20[0x10]; + u8 op_mod[0x10]; + + u8 encap_id[0x20]; + + u8 reserved_60[0x20]; +}; + +struct mlx5_ifc_set_action_in_bits { + u8 action_type[0x4]; + u8 field[0xc]; + u8 reserved_at_10[0x3]; + u8 offset[0x5]; + u8 reserved_at_18[0x3]; + u8 length[0x5]; + + u8 data[0x20]; +}; + +struct mlx5_ifc_add_action_in_bits { + u8 action_type[0x4]; + u8 field[0xc]; + u8 reserved_at_10[0x10]; + + u8 data[0x20]; +}; + +union mlx5_ifc_set_action_in_add_action_in_auto_bits { + struct mlx5_ifc_set_action_in_bits set_action_in; + struct mlx5_ifc_add_action_in_bits add_action_in; + u8 reserved_at_0[0x40]; +}; + +enum { + MLX5_ACTION_TYPE_SET = 0x1, + MLX5_ACTION_TYPE_ADD = 0x2, +}; + +enum { + MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16 = 0x1, + MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0 = 0x2, + MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE = 0x3, + MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16 = 0x4, + MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0 = 0x5, + MLX5_ACTION_IN_FIELD_OUT_IP_DSCP = 0x6, + MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS = 0x7, + MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT = 0x8, + MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT = 
0x9, + MLX5_ACTION_IN_FIELD_OUT_IP_TTL = 0xa, + MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT = 0xb, + MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT = 0xc, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96 = 0xd, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64 = 0xe, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32 = 0xf, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0 = 0x10, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96 = 0x11, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64 = 0x12, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32 = 0x13, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0 = 0x14, + MLX5_ACTION_IN_FIELD_OUT_SIPV4 = 0x15, + MLX5_ACTION_IN_FIELD_OUT_DIPV4 = 0x16, + MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT = 0x47, +}; + +struct mlx5_ifc_alloc_modify_header_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_modify_header_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_68[0x10]; + u8 num_of_actions[0x8]; + + union mlx5_ifc_set_action_in_add_action_in_auto_bits actions[0]; +}; + +struct mlx5_ifc_dealloc_modify_header_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_modify_header_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_dct_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_dctc_bits dct_context_entry; + + u8 reserved_at_280[0x180]; +}; + +struct mlx5_ifc_query_dct_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 dctn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_cq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_cqc_bits cq_context; + + u8 reserved_at_280[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_cq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_cong_status_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; + + u8 enable[0x1]; + u8 tag_enable[0x1]; + u8 reserved_at_62[0x1e]; +}; + +struct mlx5_ifc_query_cong_status_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x18]; + u8 priority[0x4]; + u8 cong_protocol[0x4]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_cong_statistics_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 rp_cur_flows[0x20]; + + u8 sum_flows[0x20]; + + u8 rp_cnp_ignored_high[0x20]; + + u8 rp_cnp_ignored_low[0x20]; + + u8 rp_cnp_handled_high[0x20]; + + u8 rp_cnp_handled_low[0x20]; + + u8 reserved_at_140[0x100]; + + u8 time_stamp_high[0x20]; + + u8 time_stamp_low[0x20]; + + u8 accumulators_period[0x20]; + + u8 np_ecn_marked_roce_packets_high[0x20]; + + u8 np_ecn_marked_roce_packets_low[0x20]; + + u8 np_cnp_sent_high[0x20]; + + u8 np_cnp_sent_low[0x20]; + + u8 reserved_at_320[0x560]; +}; + +struct 
mlx5_ifc_query_cong_statistics_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 clear[0x1]; + u8 reserved_at_41[0x1f]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_cong_params_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + union mlx5_ifc_cong_control_roce_ecn_auto_bits congestion_parameters; +}; + +struct mlx5_ifc_query_cong_params_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x1c]; + u8 cong_protocol[0x4]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_adapter_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_query_adapter_param_block_bits query_adapter_struct; +}; + +struct mlx5_ifc_query_adapter_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_qp_2rst_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_qp_2rst_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_qp_2err_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_qp_2err_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_page_fault_resume_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_page_fault_resume_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 error[0x1]; + u8 reserved_at_41[0x4]; + u8 page_fault_type[0x3]; + u8 wq_number[0x18]; + + u8 reserved_at_60[0x8]; + u8 token[0x18]; +}; + +struct mlx5_ifc_nop_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_nop_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_vport_state_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_vport_state_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x18]; + u8 admin_state[0x4]; + u8 reserved_at_7c[0x4]; +}; + +struct mlx5_ifc_modify_tis_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_tis_bitmask_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x1d]; + u8 lag_tx_port_affinity[0x1]; + u8 strict_lag_tx_port_affinity[0x1]; + u8 prio[0x1]; +}; + +struct mlx5_ifc_modify_tis_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 tisn[0x18]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_modify_tis_bitmask_bits bitmask; + + u8 reserved_at_c0[0x40]; + + struct 
mlx5_ifc_tisc_bits ctx; +}; + +struct mlx5_ifc_modify_tir_bitmask_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x1b]; + u8 self_lb_en[0x1]; + u8 reserved_at_3c[0x1]; + u8 hash[0x1]; + u8 reserved_at_3e[0x1]; + u8 lro[0x1]; +}; + +struct mlx5_ifc_modify_tir_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_tir_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 tirn[0x18]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_modify_tir_bitmask_bits bitmask; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_tirc_bits ctx; +}; + +struct mlx5_ifc_modify_sq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_sq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 sq_state[0x4]; + u8 reserved_at_44[0x4]; + u8 sqn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 modify_bitmask[0x40]; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_sqc_bits ctx; +}; + +struct mlx5_ifc_modify_scheduling_element_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x1c0]; +}; + +enum { + MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE = 0x1, + MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW = 0x2, +}; + +struct mlx5_ifc_modify_scheduling_element_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 scheduling_hierarchy[0x8]; + u8 reserved_at_48[0x18]; + + u8 scheduling_element_id[0x20]; + + u8 reserved_at_80[0x20]; + + u8 modify_bitmask[0x20]; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_scheduling_context_bits scheduling_context; + + u8 reserved_at_300[0x100]; +}; + +struct mlx5_ifc_modify_rqt_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_rqt_bitmask_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x1f]; + u8 rqn_list[0x1]; +}; + +struct mlx5_ifc_modify_rqt_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 rqtn[0x18]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_rqt_bitmask_bits bitmask; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_rqtc_bits ctx; +}; + +struct mlx5_ifc_modify_rq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum { + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD = 1ULL << 1, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_SCATTER_FCS = 1ULL << 2, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID = 1ULL << 3, +}; + +struct mlx5_ifc_modify_rq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 rq_state[0x4]; + u8 reserved_at_44[0x4]; + u8 rqn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 modify_bitmask[0x40]; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_rqc_bits ctx; +}; + +struct mlx5_ifc_modify_rmp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_rmp_bitmask_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x1f]; + u8 lwm[0x1]; +}; + +struct mlx5_ifc_modify_rmp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 rmp_state[0x4]; + u8 
reserved_at_44[0x4]; + u8 rmpn[0x18]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_rmp_bitmask_bits bitmask; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_rmpc_bits ctx; +}; + +struct mlx5_ifc_modify_nic_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_nic_vport_field_select_bits { + u8 reserved_at_0[0x12]; + u8 affiliation[0x1]; + u8 reserved_at_e[0x1]; + u8 disable_uc_local_lb[0x1]; + u8 disable_mc_local_lb[0x1]; + u8 node_guid[0x1]; + u8 port_guid[0x1]; + u8 min_inline[0x1]; + u8 mtu[0x1]; + u8 change_event[0x1]; + u8 promisc[0x1]; + u8 permanent_address[0x1]; + u8 addresses_list[0x1]; + u8 roce_en[0x1]; + u8 reserved_at_1f[0x1]; +}; + +struct mlx5_ifc_modify_nic_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + struct mlx5_ifc_modify_nic_vport_field_select_bits field_select; + + u8 reserved_at_80[0x780]; + + struct mlx5_ifc_nic_vport_context_bits nic_vport_context; +}; + +struct mlx5_ifc_modify_hca_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_hca_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xb]; + u8 port_num[0x4]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_hca_vport_context_bits hca_vport_context; +}; + +struct mlx5_ifc_modify_cq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum { + MLX5_MODIFY_CQ_IN_OP_MOD_MODIFY_CQ = 0x0, + MLX5_MODIFY_CQ_IN_OP_MOD_RESIZE_CQ = 0x1, +}; + +struct mlx5_ifc_modify_cq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + union mlx5_ifc_modify_field_select_resize_field_select_auto_bits modify_field_select_resize_field_select; + + struct mlx5_ifc_cqc_bits cq_context; + + u8 reserved_at_280[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_modify_cong_status_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_cong_status_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x18]; + u8 priority[0x4]; + u8 cong_protocol[0x4]; + + u8 enable[0x1]; + u8 tag_enable[0x1]; + u8 reserved_at_62[0x1e]; +}; + +struct mlx5_ifc_modify_cong_params_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_cong_params_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x1c]; + u8 cong_protocol[0x4]; + + union mlx5_ifc_field_select_802_1_r_roce_auto_bits field_select; + + u8 reserved_at_80[0x80]; + + union mlx5_ifc_cong_control_roce_ecn_auto_bits congestion_parameters; +}; + +struct mlx5_ifc_manage_pages_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 output_num_entries[0x20]; + + u8 reserved_at_60[0x20]; + + u8 pas[0][0x40]; +}; + +enum { + MLX5_MANAGE_PAGES_IN_OP_MOD_ALLOCATION_FAIL = 0x0, + MLX5_MANAGE_PAGES_IN_OP_MOD_ALLOCATION_SUCCESS = 0x1, + 
MLX5_MANAGE_PAGES_IN_OP_MOD_HCA_RETURN_PAGES = 0x2, +}; + +struct mlx5_ifc_manage_pages_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 function_id[0x10]; + + u8 input_num_entries[0x20]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_mad_ifc_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 response_mad_packet[256][0x8]; +}; + +struct mlx5_ifc_mad_ifc_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 remote_lid[0x10]; + u8 reserved_at_50[0x8]; + u8 port[0x8]; + + u8 reserved_at_60[0x20]; + + u8 mad[256][0x8]; +}; + +struct mlx5_ifc_init_hca_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_init_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + u8 sw_owner_id[4][0x20]; +}; + +struct mlx5_ifc_init2rtr_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_init2rtr_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_init2init_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_init2init_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_get_dropped_packet_log_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 packet_headers_log[128][0x8]; + + u8 packet_syndrome[64][0x8]; +}; + +struct mlx5_ifc_get_dropped_packet_log_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_gen_eqe_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x18]; + u8 eq_number[0x8]; + + u8 reserved_at_60[0x20]; + + u8 eqe[64][0x8]; +}; + +struct mlx5_ifc_gen_eq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_enable_hca_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; +}; + +struct mlx5_ifc_enable_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_drain_dct_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_drain_dct_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 dctn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_disable_hca_out_bits { + u8 
status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; +}; + +struct mlx5_ifc_disable_hca_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_detach_from_mcg_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_detach_from_mcg_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 multicast_gid[16][0x8]; +}; + +struct mlx5_ifc_destroy_xrq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_xrq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 xrqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_xrc_srq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_xrc_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 xrc_srqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_tis_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_tis_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 tisn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_tir_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_tir_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 tirn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_srq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 srqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_sq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_sq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 sqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_scheduling_element_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x1c0]; +}; + +struct mlx5_ifc_destroy_scheduling_element_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 scheduling_hierarchy[0x8]; + u8 reserved_at_48[0x18]; + + u8 scheduling_element_id[0x20]; + + u8 reserved_at_80[0x180]; +}; + +struct mlx5_ifc_destroy_rqt_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_rqt_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 
op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 rqtn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_rq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_rq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 rqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_set_delay_drop_params_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 delay_drop_timeout[0x10]; +}; + +struct mlx5_ifc_set_delay_drop_params_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_rmp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_rmp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 rmpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_psv_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_psv_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 psvn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_mkey_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_mkey_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 mkey_index[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x140]; +}; + +struct mlx5_ifc_destroy_flow_group_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_flow_group_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 group_id[0x20]; + + u8 reserved_at_e0[0x120]; +}; + +struct mlx5_ifc_destroy_eq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_eq_in_bits { + u8 opcode[0x10]; + u8 
reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x18]; + u8 eq_number[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_dct_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_dct_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 dctn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_destroy_cq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_cq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_delete_vxlan_udp_dport_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_delete_vxlan_udp_dport_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 vxlan_udp_port[0x10]; +}; + +struct mlx5_ifc_delete_l2_table_entry_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_delete_l2_table_entry_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x60]; + + u8 reserved_at_a0[0x8]; + u8 table_index[0x18]; + + u8 reserved_at_c0[0x140]; +}; + +struct mlx5_ifc_delete_fte_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_delete_fte_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x40]; + + u8 flow_index[0x20]; + + u8 reserved_at_120[0xe0]; +}; + +struct mlx5_ifc_dealloc_xrcd_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_xrcd_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 xrcd[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_uar_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_uar_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 uar[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_transport_domain_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_transport_domain_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 transport_domain[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_q_counter_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_q_counter_in_bits { + u8 opcode[0x10]; + u8 
reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x18]; + u8 counter_set_id[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_pd_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_pd_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 pd[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_flow_counter_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_flow_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 flow_counter_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_xrq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 xrqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_xrq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_xrqc_bits xrq_context; +}; + +struct mlx5_ifc_create_xrc_srq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 xrc_srqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_xrc_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_xrc_srqc_bits xrc_srq_context_entry; + + u8 reserved_at_280[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_create_tis_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 tisn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_tis_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_tisc_bits ctx; +}; + +struct mlx5_ifc_create_tir_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 tirn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_tir_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_tirc_bits ctx; +}; + +struct mlx5_ifc_create_srq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 srqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_srqc_bits srq_context_entry; + + u8 reserved_at_280[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_create_sq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 sqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_sq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_sqc_bits ctx; +}; + +struct mlx5_ifc_create_scheduling_element_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 scheduling_element_id[0x20]; + + u8 
reserved_at_a0[0x160]; +}; + +struct mlx5_ifc_create_scheduling_element_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 scheduling_hierarchy[0x8]; + u8 reserved_at_48[0x18]; + + u8 reserved_at_60[0xa0]; + + struct mlx5_ifc_scheduling_context_bits scheduling_context; + + u8 reserved_at_300[0x100]; +}; + +struct mlx5_ifc_create_rqt_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 rqtn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_rqt_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_rqtc_bits rqt_context; +}; + +struct mlx5_ifc_create_rq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 rqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_rq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_rqc_bits ctx; +}; + +struct mlx5_ifc_create_rmp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 rmpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_rmp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xc0]; + + struct mlx5_ifc_rmpc_bits ctx; +}; + +struct mlx5_ifc_create_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_create_psv_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 reserved_at_80[0x8]; + u8 psv0_index[0x18]; + + u8 reserved_at_a0[0x8]; + u8 psv1_index[0x18]; + + u8 reserved_at_c0[0x8]; + u8 psv2_index[0x18]; + + u8 reserved_at_e0[0x8]; + u8 psv3_index[0x18]; +}; + +struct mlx5_ifc_create_psv_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 num_psv[0x4]; + u8 reserved_at_44[0x4]; + u8 pd[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_mkey_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 mkey_index[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_mkey_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 pg_access[0x1]; + u8 reserved_at_61[0x1f]; + + struct mlx5_ifc_mkc_bits memory_key_mkey_entry; + + u8 reserved_at_280[0x80]; + + u8 translations_octword_actual_size[0x20]; + + u8 reserved_at_320[0x560]; + + u8 klm_pas_mtt[0][0x20]; +}; + +struct mlx5_ifc_create_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_flow_table_context_bits { + u8 encap_en[0x1]; + u8 decap_en[0x1]; + u8 reserved_at_2[0x2]; + u8 table_miss_action[0x4]; + u8 level[0x8]; + 
u8 reserved_at_10[0x8]; + u8 log_size[0x8]; + + u8 reserved_at_20[0x8]; + u8 table_miss_id[0x18]; + + u8 reserved_at_40[0x8]; + u8 lag_master_next_table_id[0x18]; + + u8 reserved_at_60[0xe0]; +}; + +struct mlx5_ifc_create_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_flow_table_context_bits flow_table_context; +}; + +struct mlx5_ifc_create_flow_group_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 group_id[0x18]; + + u8 reserved_at_60[0x20]; +}; + +enum { + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, +}; + +struct mlx5_ifc_create_flow_group_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x20]; + + u8 start_flow_index[0x20]; + + u8 reserved_at_100[0x20]; + + u8 end_flow_index[0x20]; + + u8 reserved_at_140[0xa0]; + + u8 reserved_at_1e0[0x18]; + u8 match_criteria_enable[0x8]; + + struct mlx5_ifc_fte_match_param_bits match_criteria; + + u8 reserved_at_1200[0xe00]; +}; + +struct mlx5_ifc_create_eq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x18]; + u8 eq_number[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_eq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_eqc_bits eq_context_entry; + + u8 reserved_at_280[0x40]; + + u8 event_bitmask[0x40]; + + u8 reserved_at_300[0x580]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_create_dct_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 dctn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_dct_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_dctc_bits dct_context_entry; + + u8 reserved_at_280[0x180]; +}; + +struct mlx5_ifc_create_cq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_cq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_cqc_bits cq_context; + + u8 reserved_at_280[0x600]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_config_int_moderation_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x4]; + u8 min_delay[0xc]; + u8 int_vector[0x10]; + + u8 reserved_at_60[0x20]; +}; + +enum { + MLX5_CONFIG_INT_MODERATION_IN_OP_MOD_WRITE = 0x0, + MLX5_CONFIG_INT_MODERATION_IN_OP_MOD_READ = 0x1, +}; + +struct mlx5_ifc_config_int_moderation_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 
reserved_at_40[0x4]; + u8 min_delay[0xc]; + u8 int_vector[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_attach_to_mcg_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_attach_to_mcg_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 multicast_gid[16][0x8]; +}; + +struct mlx5_ifc_arm_xrq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_arm_xrq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 xrqn[0x18]; + + u8 reserved_at_60[0x10]; + u8 lwm[0x10]; +}; + +struct mlx5_ifc_arm_xrc_srq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum { + MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ = 0x1, +}; + +struct mlx5_ifc_arm_xrc_srq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 xrc_srqn[0x18]; + + u8 reserved_at_60[0x10]; + u8 lwm[0x10]; +}; + +struct mlx5_ifc_arm_rq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum { + MLX5_ARM_RQ_IN_OP_MOD_SRQ = 0x1, + MLX5_ARM_RQ_IN_OP_MOD_XRQ = 0x2, +}; + +struct mlx5_ifc_arm_rq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 srq_number[0x18]; + + u8 reserved_at_60[0x10]; + u8 lwm[0x10]; +}; + +struct mlx5_ifc_arm_dct_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_arm_dct_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 dct_number[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_xrcd_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 xrcd[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_xrcd_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_alloc_uar_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 uar[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_uar_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_alloc_transport_domain_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 transport_domain[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_transport_domain_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_alloc_q_counter_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x18]; + u8 counter_set_id[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_q_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + 
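The *_in_bits/*_out_bits layouts above are not dereferenced directly; drivers build the command mailboxes with the MLX5_SET/MLX5_GET/MLX5_ST_SZ_DW accessors and hand them to mlx5_cmd_exec(). Below is a minimal sketch of driving the alloc_q_counter/query_q_counter layouts that way, assuming those existing mlx5_core interfaces; the helper name and its shape are illustrative only and are not part of this patch.

/*
 * Illustration only: allocate a Q counter set and read back one field,
 * using the alloc_q_counter_* and query_q_counter_* layouts defined above.
 * MLX5_SET/MLX5_GET/MLX5_ST_SZ_DW come from <linux/mlx5/device.h>;
 * mlx5_cmd_exec() is the standard mlx5_core command interface.
 */
#include <linux/mlx5/driver.h>
#include <linux/mlx5/mlx5_ifc.h>

static int example_q_counter_out_of_buffer(struct mlx5_core_dev *dev,
					   u32 *out_of_buffer)
{
	u32 alloc_in[MLX5_ST_SZ_DW(alloc_q_counter_in)]   = {0};
	u32 alloc_out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {0};
	u32 query_in[MLX5_ST_SZ_DW(query_q_counter_in)]   = {0};
	/* The query output is ~256 bytes; real callers usually allocate it. */
	u32 query_out[MLX5_ST_SZ_DW(query_q_counter_out)] = {0};
	u8 counter_set_id;
	int err;

	/* ALLOC_Q_COUNTER: only the opcode needs to be filled in. */
	MLX5_SET(alloc_q_counter_in, alloc_in, opcode,
		 MLX5_CMD_OP_ALLOC_Q_COUNTER);
	err = mlx5_cmd_exec(dev, alloc_in, sizeof(alloc_in),
			    alloc_out, sizeof(alloc_out));
	if (err)
		return err;
	counter_set_id = MLX5_GET(alloc_q_counter_out, alloc_out,
				  counter_set_id);

	/* QUERY_Q_COUNTER: select the counter set, then read a statistic. */
	MLX5_SET(query_q_counter_in, query_in, opcode,
		 MLX5_CMD_OP_QUERY_Q_COUNTER);
	MLX5_SET(query_q_counter_in, query_in, counter_set_id, counter_set_id);
	err = mlx5_cmd_exec(dev, query_in, sizeof(query_in),
			    query_out, sizeof(query_out));
	if (err)
		return err;

	*out_of_buffer = MLX5_GET(query_q_counter_out, query_out,
				  out_of_buffer);
	return 0;
}

Because the field names in MLX5_SET/MLX5_GET resolve against the struct mlx5_ifc_*_bits definitions at compile time, a misspelled field fails the build instead of silently writing to the wrong offset in the mailbox.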
+struct mlx5_ifc_alloc_pd_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 pd[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_pd_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_alloc_flow_counter_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 flow_counter_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_flow_counter_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_add_vxlan_udp_dport_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_add_vxlan_udp_dport_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 vxlan_udp_port[0x10]; +}; + +struct mlx5_ifc_set_pp_rate_limit_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_pp_rate_limit_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 rate_limit_index[0x10]; + + u8 reserved_at_60[0x20]; + + u8 rate_limit[0x20]; + + u8 burst_upper_bound[0x20]; + + u8 reserved_at_c0[0x10]; + u8 typical_packet_size[0x10]; + + u8 reserved_at_e0[0x120]; +}; + +struct mlx5_ifc_access_register_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 register_data[0][0x20]; +}; + +enum { + MLX5_ACCESS_REGISTER_IN_OP_MOD_WRITE = 0x0, + MLX5_ACCESS_REGISTER_IN_OP_MOD_READ = 0x1, +}; + +struct mlx5_ifc_access_register_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 register_id[0x10]; + + u8 argument[0x20]; + + u8 register_data[0][0x20]; +}; + +struct mlx5_ifc_sltp_reg_bits { + u8 status[0x4]; + u8 version[0x4]; + u8 local_port[0x8]; + u8 pnat[0x2]; + u8 reserved_at_12[0x2]; + u8 lane[0x4]; + u8 reserved_at_18[0x8]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x7]; + u8 polarity[0x1]; + u8 ob_tap0[0x8]; + u8 ob_tap1[0x8]; + u8 ob_tap2[0x8]; + + u8 reserved_at_60[0xc]; + u8 ob_preemp_mode[0x4]; + u8 ob_reg[0x8]; + u8 ob_bias[0x8]; + + u8 reserved_at_80[0x20]; +}; + +struct mlx5_ifc_slrg_reg_bits { + u8 status[0x4]; + u8 version[0x4]; + u8 local_port[0x8]; + u8 pnat[0x2]; + u8 reserved_at_12[0x2]; + u8 lane[0x4]; + u8 reserved_at_18[0x8]; + + u8 time_to_link_up[0x10]; + u8 reserved_at_30[0xc]; + u8 grade_lane_speed[0x4]; + + u8 grade_version[0x8]; + u8 grade[0x18]; + + u8 reserved_at_60[0x4]; + u8 height_grade_type[0x4]; + u8 height_grade[0x18]; + + u8 height_dz[0x10]; + u8 height_dv[0x10]; + + u8 reserved_at_a0[0x10]; + u8 height_sigma[0x10]; + + u8 reserved_at_c0[0x20]; + + u8 reserved_at_e0[0x4]; + u8 phase_grade_type[0x4]; + u8 phase_grade[0x18]; + + u8 reserved_at_100[0x8]; + u8 phase_eo_pos[0x8]; + u8 reserved_at_110[0x8]; + u8 phase_eo_neg[0x8]; + + u8 ffe_set_tested[0x10]; + u8 test_errors_per_lane[0x10]; +}; + +struct mlx5_ifc_pvlc_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x1c]; + u8 vl_hw_cap[0x4]; + + u8 reserved_at_40[0x1c]; + u8 vl_admin[0x4]; + 
+ u8 reserved_at_60[0x1c]; + u8 vl_operational[0x4]; +}; + +struct mlx5_ifc_pude_reg_bits { + u8 swid[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x4]; + u8 admin_status[0x4]; + u8 reserved_at_18[0x4]; + u8 oper_status[0x4]; + + u8 reserved_at_20[0x60]; +}; + +struct mlx5_ifc_ptys_reg_bits { + u8 reserved_at_0[0x1]; + u8 an_disable_admin[0x1]; + u8 an_disable_cap[0x1]; + u8 reserved_at_3[0x5]; + u8 local_port[0x8]; + u8 reserved_at_10[0xd]; + u8 proto_mask[0x3]; + + u8 an_status[0x4]; + u8 reserved_at_24[0x3c]; + + u8 eth_proto_capability[0x20]; + + u8 ib_link_width_capability[0x10]; + u8 ib_proto_capability[0x10]; + + u8 reserved_at_a0[0x20]; + + u8 eth_proto_admin[0x20]; + + u8 ib_link_width_admin[0x10]; + u8 ib_proto_admin[0x10]; + + u8 reserved_at_100[0x20]; + + u8 eth_proto_oper[0x20]; + + u8 ib_link_width_oper[0x10]; + u8 ib_proto_oper[0x10]; + + u8 reserved_at_160[0x1c]; + u8 connector_type[0x4]; + + u8 eth_proto_lp_advertise[0x20]; + + u8 reserved_at_1a0[0x60]; +}; + +struct mlx5_ifc_mlcr_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x20]; + + u8 beacon_duration[0x10]; + u8 reserved_at_40[0x10]; + + u8 beacon_remain[0x10]; +}; + +struct mlx5_ifc_ptas_reg_bits { + u8 reserved_at_0[0x20]; + + u8 algorithm_options[0x10]; + u8 reserved_at_30[0x4]; + u8 repetitions_mode[0x4]; + u8 num_of_repetitions[0x8]; + + u8 grade_version[0x8]; + u8 height_grade_type[0x4]; + u8 phase_grade_type[0x4]; + u8 height_grade_weight[0x8]; + u8 phase_grade_weight[0x8]; + + u8 gisim_measure_bits[0x10]; + u8 adaptive_tap_measure_bits[0x10]; + + u8 ber_bath_high_error_threshold[0x10]; + u8 ber_bath_mid_error_threshold[0x10]; + + u8 ber_bath_low_error_threshold[0x10]; + u8 one_ratio_high_threshold[0x10]; + + u8 one_ratio_high_mid_threshold[0x10]; + u8 one_ratio_low_mid_threshold[0x10]; + + u8 one_ratio_low_threshold[0x10]; + u8 ndeo_error_threshold[0x10]; + + u8 mixer_offset_step_size[0x10]; + u8 reserved_at_110[0x8]; + u8 mix90_phase_for_voltage_bath[0x8]; + + u8 mixer_offset_start[0x10]; + u8 mixer_offset_end[0x10]; + + u8 reserved_at_140[0x15]; + u8 ber_test_time[0xb]; +}; + +struct mlx5_ifc_pspa_reg_bits { + u8 swid[0x8]; + u8 local_port[0x8]; + u8 sub_port[0x8]; + u8 reserved_at_18[0x8]; + + u8 reserved_at_20[0x20]; +}; + +struct mlx5_ifc_pqdr_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x5]; + u8 prio[0x3]; + u8 reserved_at_18[0x6]; + u8 mode[0x2]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x10]; + u8 min_threshold[0x10]; + + u8 reserved_at_60[0x10]; + u8 max_threshold[0x10]; + + u8 reserved_at_80[0x10]; + u8 mark_probability_denominator[0x10]; + + u8 reserved_at_a0[0x60]; +}; + +struct mlx5_ifc_ppsc_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x60]; + + u8 reserved_at_80[0x1c]; + u8 wrps_admin[0x4]; + + u8 reserved_at_a0[0x1c]; + u8 wrps_status[0x4]; + + u8 reserved_at_c0[0x8]; + u8 up_threshold[0x8]; + u8 reserved_at_d0[0x8]; + u8 down_threshold[0x8]; + + u8 reserved_at_e0[0x20]; + + u8 reserved_at_100[0x1c]; + u8 srps_admin[0x4]; + + u8 reserved_at_120[0x1c]; + u8 srps_status[0x4]; + + u8 reserved_at_140[0x40]; +}; + +struct mlx5_ifc_pplr_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x8]; + u8 lb_cap[0x8]; + u8 reserved_at_30[0x8]; + u8 lb_en[0x8]; +}; + +struct mlx5_ifc_pplm_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x20]; + + u8 
port_profile_mode[0x8]; + u8 static_port_profile[0x8]; + u8 active_port_profile[0x8]; + u8 reserved_at_58[0x8]; + + u8 retransmission_active[0x8]; + u8 fec_mode_active[0x18]; + + u8 reserved_at_80[0x20]; +}; + +struct mlx5_ifc_ppcnt_reg_bits { + u8 swid[0x8]; + u8 local_port[0x8]; + u8 pnat[0x2]; + u8 reserved_at_12[0x8]; + u8 grp[0x6]; + + u8 clr[0x1]; + u8 reserved_at_21[0x1c]; + u8 prio_tc[0x3]; + + union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set; +}; + +struct mlx5_ifc_mpcnt_reg_bits { + u8 reserved_at_0[0x8]; + u8 pcie_index[0x8]; + u8 reserved_at_10[0xa]; + u8 grp[0x6]; + + u8 clr[0x1]; + u8 reserved_at_21[0x1f]; + + union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits counter_set; +}; + +struct mlx5_ifc_ppad_reg_bits { + u8 reserved_at_0[0x3]; + u8 single_mac[0x1]; + u8 reserved_at_4[0x4]; + u8 local_port[0x8]; + u8 mac_47_32[0x10]; + + u8 mac_31_0[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_pmtu_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 max_mtu[0x10]; + u8 reserved_at_30[0x10]; + + u8 admin_mtu[0x10]; + u8 reserved_at_50[0x10]; + + u8 oper_mtu[0x10]; + u8 reserved_at_70[0x10]; +}; + +struct mlx5_ifc_pmpr_reg_bits { + u8 reserved_at_0[0x8]; + u8 module[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x18]; + u8 attenuation_5g[0x8]; + + u8 reserved_at_40[0x18]; + u8 attenuation_7g[0x8]; + + u8 reserved_at_60[0x18]; + u8 attenuation_12g[0x8]; +}; + +struct mlx5_ifc_pmpe_reg_bits { + u8 reserved_at_0[0x8]; + u8 module[0x8]; + u8 reserved_at_10[0xc]; + u8 module_status[0x4]; + + u8 reserved_at_20[0x60]; +}; + +struct mlx5_ifc_pmpc_reg_bits { + u8 module_state_updated[32][0x8]; +}; + +struct mlx5_ifc_pmlpn_reg_bits { + u8 reserved_at_0[0x4]; + u8 mlpn_status[0x4]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 e[0x1]; + u8 reserved_at_21[0x1f]; +}; + +struct mlx5_ifc_pmlp_reg_bits { + u8 rxtx[0x1]; + u8 reserved_at_1[0x7]; + u8 local_port[0x8]; + u8 reserved_at_10[0x8]; + u8 width[0x8]; + + u8 lane0_module_mapping[0x20]; + + u8 lane1_module_mapping[0x20]; + + u8 lane2_module_mapping[0x20]; + + u8 lane3_module_mapping[0x20]; + + u8 reserved_at_a0[0x160]; +}; + +struct mlx5_ifc_pmaos_reg_bits { + u8 reserved_at_0[0x8]; + u8 module[0x8]; + u8 reserved_at_10[0x4]; + u8 admin_status[0x4]; + u8 reserved_at_18[0x4]; + u8 oper_status[0x4]; + + u8 ase[0x1]; + u8 ee[0x1]; + u8 reserved_at_22[0x1c]; + u8 e[0x2]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_plpc_reg_bits { + u8 reserved_at_0[0x4]; + u8 profile_id[0xc]; + u8 reserved_at_10[0x4]; + u8 proto_mask[0x4]; + u8 reserved_at_18[0x8]; + + u8 reserved_at_20[0x10]; + u8 lane_speed[0x10]; + + u8 reserved_at_40[0x17]; + u8 lpbf[0x1]; + u8 fec_mode_policy[0x8]; + + u8 retransmission_capability[0x8]; + u8 fec_mode_capability[0x18]; + + u8 retransmission_support_admin[0x8]; + u8 fec_mode_support_admin[0x18]; + + u8 retransmission_request_admin[0x8]; + u8 fec_mode_request_admin[0x18]; + + u8 reserved_at_c0[0x80]; +}; + +struct mlx5_ifc_plib_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x8]; + u8 ib_port[0x8]; + + u8 reserved_at_20[0x60]; +}; + +struct mlx5_ifc_plbf_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0xd]; + u8 lbf_mode[0x3]; + + u8 reserved_at_20[0x20]; +}; + +struct mlx5_ifc_pipg_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 dic[0x1]; + u8 reserved_at_21[0x19]; + u8 ipg[0x4]; + u8 reserved_at_3e[0x2]; +}; + +struct 
mlx5_ifc_pifr_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0xe0]; + + u8 port_filter[8][0x20]; + + u8 port_filter_update_en[8][0x20]; +}; + +struct mlx5_ifc_pfcc_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0xb]; + u8 ppan_mask_n[0x1]; + u8 minor_stall_mask[0x1]; + u8 critical_stall_mask[0x1]; + u8 reserved_at_1e[0x2]; + + u8 ppan[0x4]; + u8 reserved_at_24[0x4]; + u8 prio_mask_tx[0x8]; + u8 reserved_at_30[0x8]; + u8 prio_mask_rx[0x8]; + + u8 pptx[0x1]; + u8 aptx[0x1]; + u8 pptx_mask_n[0x1]; + u8 reserved_at_43[0x5]; + u8 pfctx[0x8]; + u8 reserved_at_50[0x10]; + + u8 pprx[0x1]; + u8 aprx[0x1]; + u8 pprx_mask_n[0x1]; + u8 reserved_at_63[0x5]; + u8 pfcrx[0x8]; + u8 reserved_at_70[0x10]; + + u8 device_stall_minor_watermark[0x10]; + u8 device_stall_critical_watermark[0x10]; + + u8 reserved_at_a0[0x60]; +}; + +struct mlx5_ifc_pelc_reg_bits { + u8 op[0x4]; + u8 reserved_at_4[0x4]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 op_admin[0x8]; + u8 op_capability[0x8]; + u8 op_request[0x8]; + u8 op_active[0x8]; + + u8 admin[0x40]; + + u8 capability[0x40]; + + u8 request[0x40]; + + u8 active[0x40]; + + u8 reserved_at_140[0x80]; +}; + +struct mlx5_ifc_peir_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0xc]; + u8 error_count[0x4]; + u8 reserved_at_30[0x10]; + + u8 reserved_at_40[0xc]; + u8 lane[0x4]; + u8 reserved_at_50[0x8]; + u8 error_type[0x8]; +}; + +struct mlx5_ifc_pcam_enhanced_features_bits { + u8 reserved_at_0[0x76]; + + u8 pfcc_mask[0x1]; + u8 reserved_at_77[0x4]; + u8 rx_buffer_fullness_counters[0x1]; + u8 ptys_connector_type[0x1]; + u8 reserved_at_7d[0x1]; + u8 ppcnt_discard_group[0x1]; + u8 ppcnt_statistical_group[0x1]; +}; + +struct mlx5_ifc_pcam_reg_bits { + u8 reserved_at_0[0x8]; + u8 feature_group[0x8]; + u8 reserved_at_10[0x8]; + u8 access_reg_group[0x8]; + + u8 reserved_at_20[0x20]; + + union { + u8 reserved_at_0[0x80]; + } port_access_reg_cap_mask; + + u8 reserved_at_c0[0x80]; + + union { + struct mlx5_ifc_pcam_enhanced_features_bits enhanced_features; + u8 reserved_at_0[0x80]; + } feature_cap_mask; + + u8 reserved_at_1c0[0xc0]; +}; + +struct mlx5_ifc_mcam_enhanced_features_bits { + u8 reserved_at_0[0x7b]; + u8 pcie_outbound_stalled[0x1]; + u8 tx_overflow_buffer_pkt[0x1]; + u8 mtpps_enh_out_per_adj[0x1]; + u8 mtpps_fs[0x1]; + u8 pcie_performance_group[0x1]; +}; + +struct mlx5_ifc_mcam_access_reg_bits { + u8 reserved_at_0[0x1c]; + u8 mcda[0x1]; + u8 mcc[0x1]; + u8 mcqi[0x1]; + u8 reserved_at_1f[0x1]; + + u8 regs_95_to_64[0x20]; + u8 regs_63_to_32[0x20]; + u8 regs_31_to_0[0x20]; +}; + +struct mlx5_ifc_mcam_reg_bits { + u8 reserved_at_0[0x8]; + u8 feature_group[0x8]; + u8 reserved_at_10[0x8]; + u8 access_reg_group[0x8]; + + u8 reserved_at_20[0x20]; + + union { + struct mlx5_ifc_mcam_access_reg_bits access_regs; + u8 reserved_at_0[0x80]; + } mng_access_reg_cap_mask; + + u8 reserved_at_c0[0x80]; + + union { + struct mlx5_ifc_mcam_enhanced_features_bits enhanced_features; + u8 reserved_at_0[0x80]; + } mng_feature_cap_mask; + + u8 reserved_at_1c0[0x80]; +}; + +struct mlx5_ifc_qcam_access_reg_cap_mask { + u8 qcam_access_reg_cap_mask_127_to_20[0x6C]; + u8 qpdpm[0x1]; + u8 qcam_access_reg_cap_mask_18_to_4[0x0F]; + u8 qdpm[0x1]; + u8 qpts[0x1]; + u8 qcap[0x1]; + u8 qcam_access_reg_cap_mask_0[0x1]; +}; + +struct mlx5_ifc_qcam_qos_feature_cap_mask { + u8 qcam_qos_feature_cap_mask_127_to_1[0x7F]; + u8 qpts_trust_both[0x1]; +}; + 
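The mlx5_ifc_*_bits definitions in this header describe firmware layouts bit for bit; they are not meant to be dereferenced as ordinary C structures. Fields are read and written through the MLX5_GET()/MLX5_SET() accessors that the mlx5 headers define elsewhere (include/linux/mlx5/device.h), which turn a nested field path into a bit offset and width at compile time. A minimal sketch of how a capability bit declared here could be consumed (the helper name and the qcam_out buffer are illustrative assumptions, not part of this patch):

/* Illustrative only: test the qpts_trust_both capability bit in a raw QCAM
 * register buffer returned by firmware.  MLX5_GET() resolves the nested
 * field path against struct mlx5_ifc_qcam_reg_bits, declared just below. */
static bool mlx5_qcam_trust_both_supported(void *qcam_out)
{
	return MLX5_GET(qcam_reg, qcam_out,
			qos_feature_cap_mask.feature_cap.qpts_trust_both);
}

The same accessor pattern applies to every command and register layout in this file.
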
+struct mlx5_ifc_qcam_reg_bits { + u8 reserved_at_0[0x8]; + u8 feature_group[0x8]; + u8 reserved_at_10[0x8]; + u8 access_reg_group[0x8]; + u8 reserved_at_20[0x20]; + + union { + struct mlx5_ifc_qcam_access_reg_cap_mask reg_cap; + u8 reserved_at_0[0x80]; + } qos_access_reg_cap_mask; + + u8 reserved_at_c0[0x80]; + + union { + struct mlx5_ifc_qcam_qos_feature_cap_mask feature_cap; + u8 reserved_at_0[0x80]; + } qos_feature_cap_mask; + + u8 reserved_at_1c0[0x80]; +}; + +struct mlx5_ifc_pcap_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 port_capability_mask[4][0x20]; +}; + +struct mlx5_ifc_paos_reg_bits { + u8 swid[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x4]; + u8 admin_status[0x4]; + u8 reserved_at_18[0x4]; + u8 oper_status[0x4]; + + u8 ase[0x1]; + u8 ee[0x1]; + u8 reserved_at_22[0x1c]; + u8 e[0x2]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_pamp_reg_bits { + u8 reserved_at_0[0x8]; + u8 opamp_group[0x8]; + u8 reserved_at_10[0xc]; + u8 opamp_group_type[0x4]; + + u8 start_index[0x10]; + u8 reserved_at_30[0x4]; + u8 num_of_indices[0xc]; + + u8 index_data[18][0x10]; +}; + +struct mlx5_ifc_pcmr_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x2e]; + u8 fcs_cap[0x1]; + u8 reserved_at_3f[0x1f]; + u8 fcs_chk[0x1]; + u8 reserved_at_5f[0x1]; +}; + +struct mlx5_ifc_lane_2_module_mapping_bits { + u8 reserved_at_0[0x6]; + u8 rx_lane[0x2]; + u8 reserved_at_8[0x6]; + u8 tx_lane[0x2]; + u8 reserved_at_10[0x8]; + u8 module[0x8]; +}; + +struct mlx5_ifc_bufferx_reg_bits { + u8 reserved_at_0[0x6]; + u8 lossy[0x1]; + u8 epsb[0x1]; + u8 reserved_at_8[0xc]; + u8 size[0xc]; + + u8 xoff_threshold[0x10]; + u8 xon_threshold[0x10]; +}; + +struct mlx5_ifc_set_node_in_bits { + u8 node_description[64][0x8]; +}; + +struct mlx5_ifc_register_power_settings_bits { + u8 reserved_at_0[0x18]; + u8 power_settings_level[0x8]; + + u8 reserved_at_20[0x60]; +}; + +struct mlx5_ifc_register_host_endianness_bits { + u8 he[0x1]; + u8 reserved_at_1[0x1f]; + + u8 reserved_at_20[0x60]; +}; + +struct mlx5_ifc_umr_pointer_desc_argument_bits { + u8 reserved_at_0[0x20]; + + u8 mkey[0x20]; + + u8 addressh_63_32[0x20]; + + u8 addressl_31_0[0x20]; +}; + +struct mlx5_ifc_ud_adrs_vector_bits { + u8 dc_key[0x40]; + + u8 ext[0x1]; + u8 reserved_at_41[0x7]; + u8 destination_qp_dct[0x18]; + + u8 static_rate[0x4]; + u8 sl_eth_prio[0x4]; + u8 fl[0x1]; + u8 mlid[0x7]; + u8 rlid_udp_sport[0x10]; + + u8 reserved_at_80[0x20]; + + u8 rmac_47_16[0x20]; + + u8 rmac_15_0[0x10]; + u8 tclass[0x8]; + u8 hop_limit[0x8]; + + u8 reserved_at_e0[0x1]; + u8 grh[0x1]; + u8 reserved_at_e2[0x2]; + u8 src_addr_index[0x8]; + u8 flow_label[0x14]; + + u8 rgid_rip[16][0x8]; +}; + +struct mlx5_ifc_pages_req_event_bits { + u8 reserved_at_0[0x10]; + u8 function_id[0x10]; + + u8 num_pages[0x20]; + + u8 reserved_at_40[0xa0]; +}; + +struct mlx5_ifc_eqe_bits { + u8 reserved_at_0[0x8]; + u8 event_type[0x8]; + u8 reserved_at_10[0x8]; + u8 event_sub_type[0x8]; + + u8 reserved_at_20[0xe0]; + + union mlx5_ifc_event_auto_bits event_data; + + u8 reserved_at_1e0[0x10]; + u8 signature[0x8]; + u8 reserved_at_1f8[0x7]; + u8 owner[0x1]; +}; + +enum { + MLX5_CMD_QUEUE_ENTRY_TYPE_PCIE_CMD_IF_TRANSPORT = 0x7, +}; + +struct mlx5_ifc_cmd_queue_entry_bits { + u8 type[0x8]; + u8 reserved_at_8[0x18]; + + u8 input_length[0x20]; + + u8 input_mailbox_pointer_63_32[0x20]; + + u8 input_mailbox_pointer_31_9[0x17]; + u8 reserved_at_77[0x9]; + + u8 command_input_inline_data[16][0x8]; + + u8 
command_output_inline_data[16][0x8]; + + u8 output_mailbox_pointer_63_32[0x20]; + + u8 output_mailbox_pointer_31_9[0x17]; + u8 reserved_at_1b7[0x9]; + + u8 output_length[0x20]; + + u8 token[0x8]; + u8 signature[0x8]; + u8 reserved_at_1f0[0x8]; + u8 status[0x7]; + u8 ownership[0x1]; +}; + +struct mlx5_ifc_cmd_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 command_output[0x20]; +}; + +struct mlx5_ifc_cmd_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 command[0][0x20]; +}; + +struct mlx5_ifc_cmd_if_box_bits { + u8 mailbox_data[512][0x8]; + + u8 reserved_at_1000[0x180]; + + u8 next_pointer_63_32[0x20]; + + u8 next_pointer_31_10[0x16]; + u8 reserved_at_11b6[0xa]; + + u8 block_number[0x20]; + + u8 reserved_at_11e0[0x8]; + u8 token[0x8]; + u8 ctrl_signature[0x8]; + u8 signature[0x8]; +}; + +struct mlx5_ifc_mtt_bits { + u8 ptag_63_32[0x20]; + + u8 ptag_31_8[0x18]; + u8 reserved_at_38[0x6]; + u8 wr_en[0x1]; + u8 rd_en[0x1]; +}; + +struct mlx5_ifc_query_wol_rol_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x10]; + u8 rol_mode[0x8]; + u8 wol_mode[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_wol_rol_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_wol_rol_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_wol_rol_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 rol_mode_valid[0x1]; + u8 wol_mode_valid[0x1]; + u8 reserved_at_42[0xe]; + u8 rol_mode[0x8]; + u8 wol_mode[0x8]; + + u8 reserved_at_60[0x20]; +}; + +enum { + MLX5_INITIAL_SEG_NIC_INTERFACE_FULL_DRIVER = 0x0, + MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED = 0x1, + MLX5_INITIAL_SEG_NIC_INTERFACE_NO_DRAM_NIC = 0x2, +}; + +enum { + MLX5_INITIAL_SEG_NIC_INTERFACE_SUPPORTED_FULL_DRIVER = 0x0, + MLX5_INITIAL_SEG_NIC_INTERFACE_SUPPORTED_DISABLED = 0x1, + MLX5_INITIAL_SEG_NIC_INTERFACE_SUPPORTED_NO_DRAM_NIC = 0x2, +}; + +enum { + MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_INTERNAL_ERR = 0x1, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_DEAD_IRISC = 0x7, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_HW_FATAL_ERR = 0x8, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_CRC_ERR = 0x9, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_FETCH_PCI_ERR = 0xa, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PAGE_ERR = 0xb, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_ASYNCHRONOUS_EQ_BUF_OVERRUN = 0xc, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_IN_ERR = 0xd, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_INV = 0xe, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_FFSER_ERR = 0xf, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR = 0x10, +}; + +struct mlx5_ifc_initial_seg_bits { + u8 fw_rev_minor[0x10]; + u8 fw_rev_major[0x10]; + + u8 cmd_interface_rev[0x10]; + u8 fw_rev_subminor[0x10]; + + u8 reserved_at_40[0x40]; + + u8 cmdq_phy_addr_63_32[0x20]; + + u8 cmdq_phy_addr_31_12[0x14]; + u8 reserved_at_b4[0x2]; + u8 nic_interface[0x2]; + u8 log_cmdq_size[0x4]; + u8 log_cmdq_stride[0x4]; + + u8 command_doorbell_vector[0x20]; + + u8 reserved_at_e0[0xf00]; + + u8 initializing[0x1]; + u8 reserved_at_fe1[0x4]; + u8 nic_interface_supported[0x3]; + u8 reserved_at_fe8[0x18]; + + struct mlx5_ifc_health_buffer_bits health_buffer; + + u8 no_dram_nic_offset[0x20]; + + u8 reserved_at_1220[0x6e40]; + + u8 reserved_at_8060[0x1f]; + u8 clear_int[0x1]; + + u8 
health_syndrome[0x8]; + u8 health_counter[0x18]; + + u8 reserved_at_80a0[0x17fc0]; +}; + +struct mlx5_ifc_mtpps_reg_bits { + u8 reserved_at_0[0xc]; + u8 cap_number_of_pps_pins[0x4]; + u8 reserved_at_10[0x4]; + u8 cap_max_num_of_pps_in_pins[0x4]; + u8 reserved_at_18[0x4]; + u8 cap_max_num_of_pps_out_pins[0x4]; + + u8 reserved_at_20[0x24]; + u8 cap_pin_3_mode[0x4]; + u8 reserved_at_48[0x4]; + u8 cap_pin_2_mode[0x4]; + u8 reserved_at_50[0x4]; + u8 cap_pin_1_mode[0x4]; + u8 reserved_at_58[0x4]; + u8 cap_pin_0_mode[0x4]; + + u8 reserved_at_60[0x4]; + u8 cap_pin_7_mode[0x4]; + u8 reserved_at_68[0x4]; + u8 cap_pin_6_mode[0x4]; + u8 reserved_at_70[0x4]; + u8 cap_pin_5_mode[0x4]; + u8 reserved_at_78[0x4]; + u8 cap_pin_4_mode[0x4]; + + u8 field_select[0x20]; + u8 reserved_at_a0[0x60]; + + u8 enable[0x1]; + u8 reserved_at_101[0xb]; + u8 pattern[0x4]; + u8 reserved_at_110[0x4]; + u8 pin_mode[0x4]; + u8 pin[0x8]; + + u8 reserved_at_120[0x20]; + + u8 time_stamp[0x40]; + + u8 out_pulse_duration[0x10]; + u8 out_periodic_adjustment[0x10]; + u8 enhanced_out_periodic_adjustment[0x20]; + + u8 reserved_at_1c0[0x20]; +}; + +struct mlx5_ifc_mtppse_reg_bits { + u8 reserved_at_0[0x18]; + u8 pin[0x8]; + u8 event_arm[0x1]; + u8 reserved_at_21[0x1b]; + u8 event_generation_mode[0x4]; + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_mcqi_cap_bits { + u8 supported_info_bitmask[0x20]; + + u8 component_size[0x20]; + + u8 max_component_size[0x20]; + + u8 log_mcda_word_size[0x4]; + u8 reserved_at_64[0xc]; + u8 mcda_max_write_size[0x10]; + + u8 rd_en[0x1]; + u8 reserved_at_81[0x1]; + u8 match_chip_id[0x1]; + u8 match_psid[0x1]; + u8 check_user_timestamp[0x1]; + u8 match_base_guid_mac[0x1]; + u8 reserved_at_86[0x1a]; +}; + +struct mlx5_ifc_mcqi_reg_bits { + u8 read_pending_component[0x1]; + u8 reserved_at_1[0xf]; + u8 component_index[0x10]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x1b]; + u8 info_type[0x5]; + + u8 info_size[0x20]; + + u8 offset[0x20]; + + u8 reserved_at_a0[0x10]; + u8 data_size[0x10]; + + u8 data[0][0x20]; +}; + +struct mlx5_ifc_mcc_reg_bits { + u8 reserved_at_0[0x4]; + u8 time_elapsed_since_last_cmd[0xc]; + u8 reserved_at_10[0x8]; + u8 instruction[0x8]; + + u8 reserved_at_20[0x10]; + u8 component_index[0x10]; + + u8 reserved_at_40[0x8]; + u8 update_handle[0x18]; + + u8 handle_owner_type[0x4]; + u8 handle_owner_host_id[0x4]; + u8 reserved_at_68[0x1]; + u8 control_progress[0x7]; + u8 error_code[0x8]; + u8 reserved_at_78[0x4]; + u8 control_state[0x4]; + + u8 component_size[0x20]; + + u8 reserved_at_a0[0x60]; +}; + +struct mlx5_ifc_mcda_reg_bits { + u8 reserved_at_0[0x8]; + u8 update_handle[0x18]; + + u8 offset[0x20]; + + u8 reserved_at_40[0x10]; + u8 size[0x10]; + + u8 reserved_at_60[0x20]; + + u8 data[0][0x20]; +}; + +union mlx5_ifc_ports_control_registers_document_bits { + struct mlx5_ifc_bufferx_reg_bits bufferx_reg; + struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; + struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits eth_2863_cntrs_grp_data_layout; + struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits eth_3635_cntrs_grp_data_layout; + struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits eth_802_3_cntrs_grp_data_layout; + struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout; + struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout; + struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout; + struct mlx5_ifc_lane_2_module_mapping_bits lane_2_module_mapping; + struct 
mlx5_ifc_pamp_reg_bits pamp_reg; + struct mlx5_ifc_paos_reg_bits paos_reg; + struct mlx5_ifc_pcap_reg_bits pcap_reg; + struct mlx5_ifc_peir_reg_bits peir_reg; + struct mlx5_ifc_pelc_reg_bits pelc_reg; + struct mlx5_ifc_pfcc_reg_bits pfcc_reg; + struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout; + struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; + struct mlx5_ifc_pifr_reg_bits pifr_reg; + struct mlx5_ifc_pipg_reg_bits pipg_reg; + struct mlx5_ifc_plbf_reg_bits plbf_reg; + struct mlx5_ifc_plib_reg_bits plib_reg; + struct mlx5_ifc_plpc_reg_bits plpc_reg; + struct mlx5_ifc_pmaos_reg_bits pmaos_reg; + struct mlx5_ifc_pmlp_reg_bits pmlp_reg; + struct mlx5_ifc_pmlpn_reg_bits pmlpn_reg; + struct mlx5_ifc_pmpc_reg_bits pmpc_reg; + struct mlx5_ifc_pmpe_reg_bits pmpe_reg; + struct mlx5_ifc_pmpr_reg_bits pmpr_reg; + struct mlx5_ifc_pmtu_reg_bits pmtu_reg; + struct mlx5_ifc_ppad_reg_bits ppad_reg; + struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg; + struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg; + struct mlx5_ifc_pplm_reg_bits pplm_reg; + struct mlx5_ifc_pplr_reg_bits pplr_reg; + struct mlx5_ifc_ppsc_reg_bits ppsc_reg; + struct mlx5_ifc_pqdr_reg_bits pqdr_reg; + struct mlx5_ifc_pspa_reg_bits pspa_reg; + struct mlx5_ifc_ptas_reg_bits ptas_reg; + struct mlx5_ifc_ptys_reg_bits ptys_reg; + struct mlx5_ifc_mlcr_reg_bits mlcr_reg; + struct mlx5_ifc_pude_reg_bits pude_reg; + struct mlx5_ifc_pvlc_reg_bits pvlc_reg; + struct mlx5_ifc_slrg_reg_bits slrg_reg; + struct mlx5_ifc_sltp_reg_bits sltp_reg; + struct mlx5_ifc_mtpps_reg_bits mtpps_reg; + struct mlx5_ifc_mtppse_reg_bits mtppse_reg; + struct mlx5_ifc_fpga_access_reg_bits fpga_access_reg; + struct mlx5_ifc_fpga_ctrl_bits fpga_ctrl_bits; + struct mlx5_ifc_fpga_cap_bits fpga_cap_bits; + struct mlx5_ifc_mcqi_reg_bits mcqi_reg; + struct mlx5_ifc_mcc_reg_bits mcc_reg; + struct mlx5_ifc_mcda_reg_bits mcda_reg; + u8 reserved_at_0[0x60e0]; +}; + +union mlx5_ifc_debug_enhancements_document_bits { + struct mlx5_ifc_health_buffer_bits health_buffer; + u8 reserved_at_0[0x200]; +}; + +union mlx5_ifc_uplink_pci_interface_document_bits { + struct mlx5_ifc_initial_seg_bits initial_seg; + u8 reserved_at_0[0x20060]; +}; + +struct mlx5_ifc_set_flow_table_root_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_flow_table_root_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x8]; + u8 underlay_qpn[0x18]; + u8 reserved_at_e0[0x120]; +}; + +enum { + MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID = (1UL << 0), + MLX5_MODIFY_FLOW_TABLE_LAG_NEXT_TABLE_ID = (1UL << 15), +}; + +struct mlx5_ifc_modify_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x10]; + u8 modify_field_select[0x10]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + struct mlx5_ifc_flow_table_context_bits flow_table_context; +}; + +struct mlx5_ifc_ets_tcn_config_reg_bits { + u8 
g[0x1]; + u8 b[0x1]; + u8 r[0x1]; + u8 reserved_at_3[0x9]; + u8 group[0x4]; + u8 reserved_at_10[0x9]; + u8 bw_allocation[0x7]; + + u8 reserved_at_20[0xc]; + u8 max_bw_units[0x4]; + u8 reserved_at_30[0x8]; + u8 max_bw_value[0x8]; +}; + +struct mlx5_ifc_ets_global_config_reg_bits { + u8 reserved_at_0[0x2]; + u8 r[0x1]; + u8 reserved_at_3[0x1d]; + + u8 reserved_at_20[0xc]; + u8 max_bw_units[0x4]; + u8 reserved_at_30[0x8]; + u8 max_bw_value[0x8]; +}; + +struct mlx5_ifc_qetc_reg_bits { + u8 reserved_at_0[0x8]; + u8 port_number[0x8]; + u8 reserved_at_10[0x30]; + + struct mlx5_ifc_ets_tcn_config_reg_bits tc_configuration[0x8]; + struct mlx5_ifc_ets_global_config_reg_bits global_configuration; +}; + +struct mlx5_ifc_qpdpm_dscp_reg_bits { + u8 e[0x1]; + u8 reserved_at_01[0x0b]; + u8 prio[0x04]; +}; + +struct mlx5_ifc_qpdpm_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + struct mlx5_ifc_qpdpm_dscp_reg_bits dscp[64]; +}; + +struct mlx5_ifc_qpts_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x2d]; + u8 trust_state[0x3]; +}; + +struct mlx5_ifc_qtct_reg_bits { + u8 reserved_at_0[0x8]; + u8 port_number[0x8]; + u8 reserved_at_10[0xd]; + u8 prio[0x3]; + + u8 reserved_at_20[0x1d]; + u8 tclass[0x3]; +}; + +struct mlx5_ifc_mcia_reg_bits { + u8 l[0x1]; + u8 reserved_at_1[0x7]; + u8 module[0x8]; + u8 reserved_at_10[0x8]; + u8 status[0x8]; + + u8 i2c_device_address[0x8]; + u8 page_number[0x8]; + u8 device_address[0x10]; + + u8 reserved_at_40[0x10]; + u8 size[0x10]; + + u8 reserved_at_60[0x20]; + + u8 dword_0[0x20]; + u8 dword_1[0x20]; + u8 dword_2[0x20]; + u8 dword_3[0x20]; + u8 dword_4[0x20]; + u8 dword_5[0x20]; + u8 dword_6[0x20]; + u8 dword_7[0x20]; + u8 dword_8[0x20]; + u8 dword_9[0x20]; + u8 dword_10[0x20]; + u8 dword_11[0x20]; +}; + +struct mlx5_ifc_dcbx_param_bits { + u8 dcbx_cee_cap[0x1]; + u8 dcbx_ieee_cap[0x1]; + u8 dcbx_standby_cap[0x1]; + u8 reserved_at_0[0x5]; + u8 port_number[0x8]; + u8 reserved_at_10[0xa]; + u8 max_application_table_size[6]; + u8 reserved_at_20[0x15]; + u8 version_oper[0x3]; + u8 reserved_at_38[5]; + u8 version_admin[0x3]; + u8 willing_admin[0x1]; + u8 reserved_at_41[0x3]; + u8 pfc_cap_oper[0x4]; + u8 reserved_at_48[0x4]; + u8 pfc_cap_admin[0x4]; + u8 reserved_at_50[0x4]; + u8 num_of_tc_oper[0x4]; + u8 reserved_at_58[0x4]; + u8 num_of_tc_admin[0x4]; + u8 remote_willing[0x1]; + u8 reserved_at_61[3]; + u8 remote_pfc_cap[4]; + u8 reserved_at_68[0x14]; + u8 remote_num_of_tc[0x4]; + u8 reserved_at_80[0x18]; + u8 error[0x8]; + u8 reserved_at_a0[0x160]; +}; + +struct mlx5_ifc_lagc_bits { + u8 reserved_at_0[0x1d]; + u8 lag_state[0x3]; + + u8 reserved_at_20[0x14]; + u8 tx_remap_affinity_2[0x4]; + u8 reserved_at_38[0x4]; + u8 tx_remap_affinity_1[0x4]; +}; + +struct mlx5_ifc_create_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_modify_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + u8 field_select[0x20]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_query_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 
syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_query_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_vport_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_vport_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_vport_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_vport_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_alloc_memic_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_30[0x20]; + + u8 reserved_at_40[0x18]; + u8 log_memic_addr_alignment[0x8]; + + u8 range_start_addr[0x40]; + + u8 range_size[0x20]; + + u8 memic_size[0x20]; +}; + +struct mlx5_ifc_alloc_memic_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 memic_start_addr[0x40]; +}; + +struct mlx5_ifc_dealloc_memic_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + u8 memic_start_addr[0x40]; + + u8 memic_size[0x20]; + + u8 reserved_at_e0[0x20]; +}; + +struct mlx5_ifc_dealloc_memic_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +#endif /* MLX5_IFC_H */ diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h new file mode 100644 index 0000000..ec05249 --- /dev/null +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -0,0 +1,522 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef MLX5_IFC_FPGA_H +#define MLX5_IFC_FPGA_H + +enum { + MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX = 0x2c9, +}; + +enum { + MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC = 0x2, +}; + +struct mlx5_ifc_fpga_shell_caps_bits { + u8 max_num_qps[0x10]; + u8 reserved_at_10[0x8]; + u8 total_rcv_credits[0x8]; + + u8 reserved_at_20[0xe]; + u8 qp_type[0x2]; + u8 reserved_at_30[0x5]; + u8 rae[0x1]; + u8 rwe[0x1]; + u8 rre[0x1]; + u8 reserved_at_38[0x4]; + u8 dc[0x1]; + u8 ud[0x1]; + u8 uc[0x1]; + u8 rc[0x1]; + + u8 reserved_at_40[0x1a]; + u8 log_ddr_size[0x6]; + + u8 max_fpga_qp_msg_size[0x20]; + + u8 reserved_at_80[0x180]; +}; + +struct mlx5_ifc_fpga_cap_bits { + u8 fpga_id[0x8]; + u8 fpga_device[0x18]; + + u8 register_file_ver[0x20]; + + u8 fpga_ctrl_modify[0x1]; + u8 reserved_at_41[0x5]; + u8 access_reg_query_mode[0x2]; + u8 reserved_at_48[0x6]; + u8 access_reg_modify_mode[0x2]; + u8 reserved_at_50[0x10]; + + u8 reserved_at_60[0x20]; + + u8 image_version[0x20]; + + u8 image_date[0x20]; + + u8 image_time[0x20]; + + u8 shell_version[0x20]; + + u8 reserved_at_100[0x80]; + + struct mlx5_ifc_fpga_shell_caps_bits shell_caps; + + u8 reserved_at_380[0x8]; + u8 ieee_vendor_id[0x18]; + + u8 sandbox_product_version[0x10]; + u8 sandbox_product_id[0x10]; + + u8 sandbox_basic_caps[0x20]; + + u8 reserved_at_3e0[0x10]; + u8 sandbox_extended_caps_len[0x10]; + + u8 sandbox_extended_caps_addr[0x40]; + + u8 fpga_ddr_start_addr[0x40]; + + u8 fpga_cr_space_start_addr[0x40]; + + u8 fpga_ddr_size[0x20]; + + u8 fpga_cr_space_size[0x20]; + + u8 reserved_at_500[0x300]; +}; + +enum { + MLX5_FPGA_CTRL_OPERATION_LOAD = 0x1, + MLX5_FPGA_CTRL_OPERATION_RESET = 0x2, + MLX5_FPGA_CTRL_OPERATION_FLASH_SELECT = 0x3, + MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON = 0x4, + MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_OFF = 0x5, + MLX5_FPGA_CTRL_OPERATION_RESET_SANDBOX = 0x6, +}; + +struct mlx5_ifc_fpga_ctrl_bits { + u8 reserved_at_0[0x8]; + u8 operation[0x8]; + u8 reserved_at_10[0x8]; + u8 status[0x8]; + + u8 reserved_at_20[0x8]; + u8 flash_select_admin[0x8]; + u8 reserved_at_30[0x8]; + u8 flash_select_oper[0x8]; + + u8 reserved_at_40[0x40]; +}; + +enum { + MLX5_FPGA_ERROR_EVENT_SYNDROME_CORRUPTED_DDR = 0x1, + MLX5_FPGA_ERROR_EVENT_SYNDROME_FLASH_TIMEOUT = 0x2, + MLX5_FPGA_ERROR_EVENT_SYNDROME_INTERNAL_LINK_ERROR = 0x3, + MLX5_FPGA_ERROR_EVENT_SYNDROME_WATCHDOG_FAILURE = 0x4, + MLX5_FPGA_ERROR_EVENT_SYNDROME_I2C_FAILURE = 0x5, + MLX5_FPGA_ERROR_EVENT_SYNDROME_IMAGE_CHANGED = 0x6, + MLX5_FPGA_ERROR_EVENT_SYNDROME_TEMPERATURE_CRITICAL = 0x7, +}; + +struct mlx5_ifc_fpga_error_event_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x18]; + u8 syndrome[0x8]; + + u8 reserved_at_60[0x80]; +}; + +#define MLX5_FPGA_ACCESS_REG_SIZE_MAX 64 + +struct mlx5_ifc_fpga_access_reg_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x10]; + u8 size[0x10]; + + u8 address[0x40]; + + u8 data[0][0x8]; +}; + +enum mlx5_ifc_fpga_qp_state { + MLX5_FPGA_QPC_STATE_INIT = 0x0, + MLX5_FPGA_QPC_STATE_ACTIVE = 0x1, + MLX5_FPGA_QPC_STATE_ERROR = 0x2, +}; + +enum mlx5_ifc_fpga_qp_type { + MLX5_FPGA_QPC_QP_TYPE_SHELL_QP = 0x0, + MLX5_FPGA_QPC_QP_TYPE_SANDBOX_QP = 0x1, +}; + +enum mlx5_ifc_fpga_qp_service_type { + MLX5_FPGA_QPC_ST_RC = 0x0, +}; + +struct mlx5_ifc_fpga_qpc_bits { + u8 state[0x4]; + u8 
reserved_at_4[0x1b]; + u8 qp_type[0x1]; + + u8 reserved_at_20[0x4]; + u8 st[0x4]; + u8 reserved_at_28[0x10]; + u8 traffic_class[0x8]; + + u8 ether_type[0x10]; + u8 prio[0x3]; + u8 dei[0x1]; + u8 vid[0xc]; + + u8 reserved_at_60[0x20]; + + u8 reserved_at_80[0x8]; + u8 next_rcv_psn[0x18]; + + u8 reserved_at_a0[0x8]; + u8 next_send_psn[0x18]; + + u8 reserved_at_c0[0x10]; + u8 pkey[0x10]; + + u8 reserved_at_e0[0x8]; + u8 remote_qpn[0x18]; + + u8 reserved_at_100[0x15]; + u8 rnr_retry[0x3]; + u8 reserved_at_118[0x5]; + u8 retry_count[0x3]; + + u8 reserved_at_120[0x20]; + + u8 reserved_at_140[0x10]; + u8 remote_mac_47_32[0x10]; + + u8 remote_mac_31_0[0x20]; + + u8 remote_ip[16][0x8]; + + u8 reserved_at_200[0x40]; + + u8 reserved_at_240[0x10]; + u8 fpga_mac_47_32[0x10]; + + u8 fpga_mac_31_0[0x20]; + + u8 fpga_ip[16][0x8]; +}; + +struct mlx5_ifc_fpga_create_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_fpga_qpc_bits fpga_qpc; +}; + +struct mlx5_ifc_fpga_create_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 fpga_qpn[0x18]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_fpga_qpc_bits fpga_qpc; +}; + +struct mlx5_ifc_fpga_modify_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 fpga_qpn[0x18]; + + u8 field_select[0x20]; + + struct mlx5_ifc_fpga_qpc_bits fpga_qpc; +}; + +struct mlx5_ifc_fpga_modify_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_fpga_query_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 fpga_qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_fpga_query_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_fpga_qpc_bits fpga_qpc; +}; + +struct mlx5_ifc_fpga_query_qp_counters_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 clear[0x1]; + u8 reserved_at_41[0x7]; + u8 fpga_qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_fpga_query_qp_counters_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 rx_ack_packets[0x40]; + + u8 rx_send_packets[0x40]; + + u8 tx_ack_packets[0x40]; + + u8 tx_send_packets[0x40]; + + u8 rx_total_drop[0x40]; + + u8 reserved_at_1c0[0x1c0]; +}; + +struct mlx5_ifc_fpga_destroy_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 fpga_qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_fpga_destroy_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ipsec_extended_cap_bits { + u8 encapsulation[0x20]; + + u8 reserved_0[0x12]; + u8 v2_command[0x1]; + u8 udp_encap[0x1]; + u8 rx_no_trailer[0x1]; + u8 ipv4_fragment[0x1]; + u8 ipv6[0x1]; + u8 esn[0x1]; + u8 lso[0x1]; + u8 transport_and_tunnel_mode[0x1]; + u8 tunnel_mode[0x1]; + u8 transport_mode[0x1]; + u8 ah_esp[0x1]; + u8 esp[0x1]; + u8 ah[0x1]; + u8 ipv4_options[0x1]; + + u8 auth_alg[0x20]; + + u8 enc_alg[0x20]; + + u8 sa_cap[0x20]; + + u8 reserved_1[0x10]; + u8 
number_of_ipsec_counters[0x10]; + + u8 ipsec_counters_addr_low[0x20]; + u8 ipsec_counters_addr_high[0x20]; +}; + +struct mlx5_ifc_ipsec_counters_bits { + u8 dec_in_packets[0x40]; + + u8 dec_out_packets[0x40]; + + u8 dec_bypass_packets[0x40]; + + u8 enc_in_packets[0x40]; + + u8 enc_out_packets[0x40]; + + u8 enc_bypass_packets[0x40]; + + u8 drop_dec_packets[0x40]; + + u8 failed_auth_dec_packets[0x40]; + + u8 drop_enc_packets[0x40]; + + u8 success_add_sa[0x40]; + + u8 fail_add_sa[0x40]; + + u8 success_delete_sa[0x40]; + + u8 fail_delete_sa[0x40]; + + u8 dropped_cmd[0x40]; +}; + +enum mlx5_ifc_fpga_ipsec_response_syndrome { + MLX5_FPGA_IPSEC_RESPONSE_SUCCESS = 0, + MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST = 1, + MLX5_FPGA_IPSEC_RESPONSE_SADB_ISSUE = 2, + MLX5_FPGA_IPSEC_RESPONSE_WRITE_RESPONSE_ISSUE = 3, +}; + +struct mlx5_ifc_fpga_ipsec_cmd_resp { + __be32 syndrome; + union { + __be32 sw_sa_handle; + __be32 flags; + }; + u8 reserved[24]; +} __packed; + +enum mlx5_ifc_fpga_ipsec_cmd_opcode { + MLX5_FPGA_IPSEC_CMD_OP_ADD_SA = 0, + MLX5_FPGA_IPSEC_CMD_OP_DEL_SA = 1, + MLX5_FPGA_IPSEC_CMD_OP_ADD_SA_V2 = 2, + MLX5_FPGA_IPSEC_CMD_OP_DEL_SA_V2 = 3, + MLX5_FPGA_IPSEC_CMD_OP_MOD_SA_V2 = 4, + MLX5_FPGA_IPSEC_CMD_OP_SET_CAP = 5, +}; + +enum mlx5_ifc_fpga_ipsec_cap { + MLX5_FPGA_IPSEC_CAP_NO_TRAILER = BIT(0), +}; + +struct mlx5_ifc_fpga_ipsec_cmd_cap { + __be32 cmd; + __be32 flags; + u8 reserved[24]; +} __packed; + +enum mlx5_ifc_fpga_ipsec_sa_flags { + MLX5_FPGA_IPSEC_SA_ESN_EN = BIT(0), + MLX5_FPGA_IPSEC_SA_ESN_OVERLAP = BIT(1), + MLX5_FPGA_IPSEC_SA_IPV6 = BIT(2), + MLX5_FPGA_IPSEC_SA_DIR_SX = BIT(3), + MLX5_FPGA_IPSEC_SA_SPI_EN = BIT(4), + MLX5_FPGA_IPSEC_SA_SA_VALID = BIT(5), + MLX5_FPGA_IPSEC_SA_IP_ESP = BIT(6), + MLX5_FPGA_IPSEC_SA_IP_AH = BIT(7), +}; + +enum mlx5_ifc_fpga_ipsec_sa_enc_mode { + MLX5_FPGA_IPSEC_SA_ENC_MODE_NONE = 0, + MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_128_AUTH_128 = 1, + MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_256_AUTH_128 = 3, +}; + +struct mlx5_ifc_fpga_ipsec_sa_v1 { + __be32 cmd; + u8 key_enc[32]; + u8 key_auth[32]; + __be32 sip[4]; + __be32 dip[4]; + union { + struct { + __be32 reserved; + u8 salt_iv[8]; + __be32 salt; + } __packed gcm; + struct { + u8 salt[16]; + } __packed cbc; + }; + __be32 spi; + __be32 sw_sa_handle; + __be16 tfclen; + u8 enc_mode; + u8 reserved1[2]; + u8 flags; + u8 reserved2[2]; +}; + +struct mlx5_ifc_fpga_ipsec_sa { + struct mlx5_ifc_fpga_ipsec_sa_v1 ipsec_sa_v1; + __be16 udp_sp; + __be16 udp_dp; + u8 reserved1[4]; + __be32 esn; + __be16 vid; /* only 12 bits, rest is reserved */ + __be16 reserved2; +} __packed; + +#endif /* MLX5_IFC_FPGA_H */ diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h new file mode 100644 index 0000000..34aed60 --- /dev/null +++ b/include/linux/mlx5/port.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_PORT_H__ +#define __MLX5_PORT_H__ + +#include + +enum mlx5_beacon_duration { + MLX5_BEACON_DURATION_OFF = 0x0, + MLX5_BEACON_DURATION_INF = 0xffff, +}; + +enum mlx5_module_id { + MLX5_MODULE_ID_SFP = 0x3, + MLX5_MODULE_ID_QSFP = 0xC, + MLX5_MODULE_ID_QSFP_PLUS = 0xD, + MLX5_MODULE_ID_QSFP28 = 0x11, +}; + +enum mlx5_an_status { + MLX5_AN_UNAVAILABLE = 0, + MLX5_AN_COMPLETE = 1, + MLX5_AN_FAILED = 2, + MLX5_AN_LINK_UP = 3, + MLX5_AN_LINK_DOWN = 4, +}; + +#define MLX5_EEPROM_MAX_BYTES 32 +#define MLX5_EEPROM_IDENTIFIER_BYTE_MASK 0x000000ff +#define MLX5_I2C_ADDR_LOW 0x50 +#define MLX5_I2C_ADDR_HIGH 0x51 +#define MLX5_EEPROM_PAGE_LENGTH 256 + +enum mlx5e_link_mode { + MLX5E_1000BASE_CX_SGMII = 0, + MLX5E_1000BASE_KX = 1, + MLX5E_10GBASE_CX4 = 2, + MLX5E_10GBASE_KX4 = 3, + MLX5E_10GBASE_KR = 4, + MLX5E_20GBASE_KR2 = 5, + MLX5E_40GBASE_CR4 = 6, + MLX5E_40GBASE_KR4 = 7, + MLX5E_56GBASE_R4 = 8, + MLX5E_10GBASE_CR = 12, + MLX5E_10GBASE_SR = 13, + MLX5E_10GBASE_ER = 14, + MLX5E_40GBASE_SR4 = 15, + MLX5E_40GBASE_LR4 = 16, + MLX5E_50GBASE_SR2 = 18, + MLX5E_100GBASE_CR4 = 20, + MLX5E_100GBASE_SR4 = 21, + MLX5E_100GBASE_KR4 = 22, + MLX5E_100GBASE_LR4 = 23, + MLX5E_100BASE_TX = 24, + MLX5E_1000BASE_T = 25, + MLX5E_10GBASE_T = 26, + MLX5E_25GBASE_CR = 27, + MLX5E_25GBASE_KR = 28, + MLX5E_25GBASE_SR = 29, + MLX5E_50GBASE_CR2 = 30, + MLX5E_50GBASE_KR2 = 31, + MLX5E_LINK_MODES_NUMBER, +}; + +enum mlx5e_connector_type { + MLX5E_PORT_UNKNOWN = 0, + MLX5E_PORT_NONE = 1, + MLX5E_PORT_TP = 2, + MLX5E_PORT_AUI = 3, + MLX5E_PORT_BNC = 4, + MLX5E_PORT_MII = 5, + MLX5E_PORT_FIBRE = 6, + MLX5E_PORT_DA = 7, + MLX5E_PORT_OTHER = 8, + MLX5E_CONNECTOR_TYPE_NUMBER, +}; + +#define MLX5E_PROT_MASK(link_mode) (1 << link_mode) + +#define PORT_MODULE_EVENT_MODULE_STATUS_MASK 0xF +#define PORT_MODULE_EVENT_ERROR_TYPE_MASK 0xF + +int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps); +int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, + int ptys_size, int proto_mask, u8 local_port); +int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev, + u32 *proto_cap, int proto_mask); +int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev, + u32 *proto_admin, int proto_mask); +int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev, + u8 *link_width_oper, u8 local_port); +int mlx5_query_port_ib_proto_oper(struct mlx5_core_dev *dev, + u8 *proto_oper, u8 local_port); +int mlx5_query_port_eth_proto_oper(struct mlx5_core_dev *dev, + u32 *proto_oper, u8 local_port); +int mlx5_set_port_ptys(struct mlx5_core_dev *dev, bool an_disable, + u32 proto_admin, int proto_mask); +void mlx5_toggle_port_link(struct mlx5_core_dev *dev); +int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status status); +int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, + 
enum mlx5_port_status *status); +int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration); +void mlx5_query_port_autoneg(struct mlx5_core_dev *dev, int proto_mask, + u8 *an_status, + u8 *an_disable_cap, u8 *an_disable_admin); + +int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port); +void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, u16 *max_mtu, u8 port); +void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, u16 *oper_mtu, + u8 port); + +int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, + u8 *vl_hw_cap, u8 local_port); + +int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause); +int mlx5_query_port_pause(struct mlx5_core_dev *dev, + u32 *rx_pause, u32 *tx_pause); + +int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx); +int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, + u8 *pfc_en_rx); + +int mlx5_set_port_stall_watermark(struct mlx5_core_dev *dev, + u16 stall_critical_watermark, + u16 stall_minor_watermark); +int mlx5_query_port_stall_watermark(struct mlx5_core_dev *dev, + u16 *stall_critical_watermark, u16 *stall_minor_watermark); + +int mlx5_max_tc(struct mlx5_core_dev *mdev); + +int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc); +int mlx5_query_port_prio_tc(struct mlx5_core_dev *mdev, + u8 prio, u8 *tc); +int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group); +int mlx5_query_port_tc_group(struct mlx5_core_dev *mdev, + u8 tc, u8 *tc_group); +int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw); +int mlx5_query_port_tc_bw_alloc(struct mlx5_core_dev *mdev, + u8 tc, u8 *bw_pct); +int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev, + u8 *max_bw_value, + u8 *max_bw_unit); +int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev, + u8 *max_bw_value, + u8 *max_bw_unit); +int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode); +int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode); + +int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable); +void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported, + bool *enabled); +int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, + u16 offset, u16 size, u8 *data); + +int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); +int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); + +int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state); +int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state); +int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio); +int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio); +#endif /* __MLX5_PORT_H__ */ diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h new file mode 100644 index 0000000..4778d41 --- /dev/null +++ b/include/linux/mlx5/qp.h @@ -0,0 +1,643 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_QP_H +#define MLX5_QP_H + +#include +#include + +#define MLX5_INVALID_LKEY 0x100 +#define MLX5_SIG_WQE_SIZE (MLX5_SEND_WQE_BB * 5) +#define MLX5_DIF_SIZE 8 +#define MLX5_STRIDE_BLOCK_OP 0x400 +#define MLX5_CPY_GRD_MASK 0xc0 +#define MLX5_CPY_APP_MASK 0x30 +#define MLX5_CPY_REF_MASK 0x0f +#define MLX5_BSF_INC_REFTAG (1 << 6) +#define MLX5_BSF_INL_VALID (1 << 15) +#define MLX5_BSF_REFRESH_DIF (1 << 14) +#define MLX5_BSF_REPEAT_BLOCK (1 << 7) +#define MLX5_BSF_APPTAG_ESCAPE 0x1 +#define MLX5_BSF_APPREF_ESCAPE 0x2 + +enum mlx5_qp_optpar { + MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, + MLX5_QP_OPTPAR_RRE = 1 << 1, + MLX5_QP_OPTPAR_RAE = 1 << 2, + MLX5_QP_OPTPAR_RWE = 1 << 3, + MLX5_QP_OPTPAR_PKEY_INDEX = 1 << 4, + MLX5_QP_OPTPAR_Q_KEY = 1 << 5, + MLX5_QP_OPTPAR_RNR_TIMEOUT = 1 << 6, + MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7, + MLX5_QP_OPTPAR_SRA_MAX = 1 << 8, + MLX5_QP_OPTPAR_RRA_MAX = 1 << 9, + MLX5_QP_OPTPAR_PM_STATE = 1 << 10, + MLX5_QP_OPTPAR_RETRY_COUNT = 1 << 12, + MLX5_QP_OPTPAR_RNR_RETRY = 1 << 13, + MLX5_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, + MLX5_QP_OPTPAR_PRI_PORT = 1 << 16, + MLX5_QP_OPTPAR_SRQN = 1 << 18, + MLX5_QP_OPTPAR_CQN_RCV = 1 << 19, + MLX5_QP_OPTPAR_DC_HS = 1 << 20, + MLX5_QP_OPTPAR_DC_KEY = 1 << 21, +}; + +enum mlx5_qp_state { + MLX5_QP_STATE_RST = 0, + MLX5_QP_STATE_INIT = 1, + MLX5_QP_STATE_RTR = 2, + MLX5_QP_STATE_RTS = 3, + MLX5_QP_STATE_SQER = 4, + MLX5_QP_STATE_SQD = 5, + MLX5_QP_STATE_ERR = 6, + MLX5_QP_STATE_SQ_DRAINING = 7, + MLX5_QP_STATE_SUSPENDED = 9, + MLX5_QP_NUM_STATE, + MLX5_QP_STATE, + MLX5_QP_STATE_BAD, +}; + +enum { + MLX5_SQ_STATE_NA = MLX5_SQC_STATE_ERR + 1, + MLX5_SQ_NUM_STATE = MLX5_SQ_STATE_NA + 1, + MLX5_RQ_STATE_NA = MLX5_RQC_STATE_ERR + 1, + MLX5_RQ_NUM_STATE = MLX5_RQ_STATE_NA + 1, +}; + +enum { + MLX5_QP_ST_RC = 0x0, + MLX5_QP_ST_UC = 0x1, + MLX5_QP_ST_UD = 0x2, + MLX5_QP_ST_XRC = 0x3, + MLX5_QP_ST_MLX = 0x4, + MLX5_QP_ST_DCI = 0x5, + MLX5_QP_ST_DCT = 0x6, + MLX5_QP_ST_QP0 = 0x7, + MLX5_QP_ST_QP1 = 0x8, + MLX5_QP_ST_RAW_ETHERTYPE = 0x9, + MLX5_QP_ST_RAW_IPV6 = 0xa, + MLX5_QP_ST_SNIFFER = 0xb, + MLX5_QP_ST_SYNC_UMR = 0xe, + MLX5_QP_ST_PTP_1588 = 0xd, + MLX5_QP_ST_REG_UMR = 0xc, + MLX5_QP_ST_MAX +}; + +enum { + MLX5_QP_PM_MIGRATED = 0x3, + MLX5_QP_PM_ARMED = 0x0, + MLX5_QP_PM_REARM = 0x1 +}; + +enum { + 
MLX5_NON_ZERO_RQ = 0x0, + MLX5_SRQ_RQ = 0x1, + MLX5_CRQ_RQ = 0x2, + MLX5_ZERO_LEN_RQ = 0x3 +}; + +/* TODO REM */ +enum { + /* params1 */ + MLX5_QP_BIT_SRE = 1 << 15, + MLX5_QP_BIT_SWE = 1 << 14, + MLX5_QP_BIT_SAE = 1 << 13, + /* params2 */ + MLX5_QP_BIT_RRE = 1 << 15, + MLX5_QP_BIT_RWE = 1 << 14, + MLX5_QP_BIT_RAE = 1 << 13, + MLX5_QP_BIT_RIC = 1 << 4, + MLX5_QP_BIT_CC_SLAVE_RECV = 1 << 2, + MLX5_QP_BIT_CC_SLAVE_SEND = 1 << 1, + MLX5_QP_BIT_CC_MASTER = 1 << 0 +}; + +enum { + MLX5_WQE_CTRL_CQ_UPDATE = 2 << 2, + MLX5_WQE_CTRL_CQ_UPDATE_AND_EQE = 3 << 2, + MLX5_WQE_CTRL_SOLICITED = 1 << 1, +}; + +enum { + MLX5_SEND_WQE_DS = 16, + MLX5_SEND_WQE_BB = 64, +}; + +#define MLX5_SEND_WQEBB_NUM_DS (MLX5_SEND_WQE_BB / MLX5_SEND_WQE_DS) + +enum { + MLX5_SEND_WQE_MAX_WQEBBS = 16, +}; + +enum { + MLX5_WQE_FMR_PERM_LOCAL_READ = 1 << 27, + MLX5_WQE_FMR_PERM_LOCAL_WRITE = 1 << 28, + MLX5_WQE_FMR_PERM_REMOTE_READ = 1 << 29, + MLX5_WQE_FMR_PERM_REMOTE_WRITE = 1 << 30, + MLX5_WQE_FMR_PERM_ATOMIC = 1 << 31 +}; + +enum { + MLX5_FENCE_MODE_NONE = 0 << 5, + MLX5_FENCE_MODE_INITIATOR_SMALL = 1 << 5, + MLX5_FENCE_MODE_FENCE = 2 << 5, + MLX5_FENCE_MODE_STRONG_ORDERING = 3 << 5, + MLX5_FENCE_MODE_SMALL_AND_FENCE = 4 << 5, +}; + +enum { + MLX5_RCV_DBR = 0, + MLX5_SND_DBR = 1, +}; + +enum { + MLX5_FLAGS_INLINE = 1<<7, + MLX5_FLAGS_CHECK_FREE = 1<<5, +}; + +struct mlx5_wqe_fmr_seg { + __be32 flags; + __be32 mem_key; + __be64 buf_list; + __be64 start_addr; + __be64 reg_len; + __be32 offset; + __be32 page_size; + u32 reserved[2]; +}; + +struct mlx5_wqe_ctrl_seg { + __be32 opmod_idx_opcode; + __be32 qpn_ds; + u8 signature; + u8 rsvd[2]; + u8 fm_ce_se; + __be32 imm; +}; + +#define MLX5_WQE_CTRL_DS_MASK 0x3f +#define MLX5_WQE_CTRL_QPN_MASK 0xffffff00 +#define MLX5_WQE_CTRL_QPN_SHIFT 8 +#define MLX5_WQE_DS_UNITS 16 +#define MLX5_WQE_CTRL_OPCODE_MASK 0xff +#define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 +#define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 + +enum { + MLX5_ETH_WQE_L3_INNER_CSUM = 1 << 4, + MLX5_ETH_WQE_L4_INNER_CSUM = 1 << 5, + MLX5_ETH_WQE_L3_CSUM = 1 << 6, + MLX5_ETH_WQE_L4_CSUM = 1 << 7, +}; + +enum { + MLX5_ETH_WQE_SVLAN = 1 << 0, + MLX5_ETH_WQE_INSERT_VLAN = 1 << 15, +}; + +enum { + MLX5_ETH_WQE_SWP_INNER_L3_IPV6 = 1 << 0, + MLX5_ETH_WQE_SWP_INNER_L4_UDP = 1 << 1, + MLX5_ETH_WQE_SWP_OUTER_L3_IPV6 = 1 << 4, + MLX5_ETH_WQE_SWP_OUTER_L4_UDP = 1 << 5, +}; + +struct mlx5_wqe_eth_seg { + u8 swp_outer_l4_offset; + u8 swp_outer_l3_offset; + u8 swp_inner_l4_offset; + u8 swp_inner_l3_offset; + u8 cs_flags; + u8 swp_flags; + __be16 mss; + __be32 rsvd2; + union { + struct { + __be16 sz; + u8 start[2]; + } inline_hdr; + struct { + __be16 type; + __be16 vlan_tci; + } insert; + }; +}; + +struct mlx5_wqe_xrc_seg { + __be32 xrc_srqn; + u8 rsvd[12]; +}; + +struct mlx5_wqe_masked_atomic_seg { + __be64 swap_add; + __be64 compare; + __be64 swap_add_mask; + __be64 compare_mask; +}; + +struct mlx5_base_av { + union { + struct { + __be32 qkey; + __be32 reserved; + } qkey; + __be64 dc_key; + } key; + __be32 dqp_dct; + u8 stat_rate_sl; + u8 fl_mlid; + union { + __be16 rlid; + __be16 udp_sport; + }; +}; + +struct mlx5_av { + union { + struct { + __be32 qkey; + __be32 reserved; + } qkey; + __be64 dc_key; + } key; + __be32 dqp_dct; + u8 stat_rate_sl; + u8 fl_mlid; + union { + __be16 rlid; + __be16 udp_sport; + }; + u8 reserved0[4]; + u8 rmac[6]; + u8 tclass; + u8 hop_limit; + __be32 grh_gid_fl; + u8 rgid[16]; +}; + +struct mlx5_ib_ah { + struct ib_ah ibah; + struct mlx5_av av; +}; + +static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) +{ + 
return container_of(ibah, struct mlx5_ib_ah, ibah); +} + +struct mlx5_wqe_datagram_seg { + struct mlx5_av av; +}; + +struct mlx5_wqe_raddr_seg { + __be64 raddr; + __be32 rkey; + u32 reserved; +}; + +struct mlx5_wqe_atomic_seg { + __be64 swap_add; + __be64 compare; +}; + +struct mlx5_wqe_data_seg { + __be32 byte_count; + __be32 lkey; + __be64 addr; +}; + +struct mlx5_wqe_umr_ctrl_seg { + u8 flags; + u8 rsvd0[3]; + __be16 xlt_octowords; + union { + __be16 xlt_offset; + __be16 bsf_octowords; + }; + __be64 mkey_mask; + __be32 xlt_offset_47_16; + u8 rsvd1[28]; +}; + +struct mlx5_seg_set_psv { + __be32 psv_num; + __be16 syndrome; + __be16 status; + __be32 transient_sig; + __be32 ref_tag; +}; + +struct mlx5_seg_get_psv { + u8 rsvd[19]; + u8 num_psv; + __be32 l_key; + __be64 va; + __be32 psv_index[4]; +}; + +struct mlx5_seg_check_psv { + u8 rsvd0[2]; + __be16 err_coalescing_op; + u8 rsvd1[2]; + __be16 xport_err_op; + u8 rsvd2[2]; + __be16 xport_err_mask; + u8 rsvd3[7]; + u8 num_psv; + __be32 l_key; + __be64 va; + __be32 psv_index[4]; +}; + +struct mlx5_rwqe_sig { + u8 rsvd0[4]; + u8 signature; + u8 rsvd1[11]; +}; + +struct mlx5_wqe_signature_seg { + u8 rsvd0[4]; + u8 signature; + u8 rsvd1[11]; +}; + +#define MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK 0x3ff + +struct mlx5_wqe_inline_seg { + __be32 byte_count; +}; + +enum mlx5_sig_type { + MLX5_DIF_CRC = 0x1, + MLX5_DIF_IPCS = 0x2, +}; + +struct mlx5_bsf_inl { + __be16 vld_refresh; + __be16 dif_apptag; + __be32 dif_reftag; + u8 sig_type; + u8 rp_inv_seed; + u8 rsvd[3]; + u8 dif_inc_ref_guard_check; + __be16 dif_app_bitmask_check; +}; + +struct mlx5_bsf { + struct mlx5_bsf_basic { + u8 bsf_size_sbs; + u8 check_byte_mask; + union { + u8 copy_byte_mask; + u8 bs_selector; + u8 rsvd_wflags; + } wire; + union { + u8 bs_selector; + u8 rsvd_mflags; + } mem; + __be32 raw_data_size; + __be32 w_bfs_psv; + __be32 m_bfs_psv; + } basic; + struct mlx5_bsf_ext { + __be32 t_init_gen_pro_size; + __be32 rsvd_epi_size; + __be32 w_tfs_psv; + __be32 m_tfs_psv; + } ext; + struct mlx5_bsf_inl w_inl; + struct mlx5_bsf_inl m_inl; +}; + +struct mlx5_mtt { + __be64 ptag; +}; + +struct mlx5_klm { + __be32 bcount; + __be32 key; + __be64 va; +}; + +struct mlx5_stride_block_entry { + __be16 stride; + __be16 bcount; + __be32 key; + __be64 va; +}; + +struct mlx5_stride_block_ctrl_seg { + __be32 bcount_per_cycle; + __be32 op; + __be32 repeat_count; + u16 rsvd; + __be16 num_entries; +}; + +struct mlx5_core_qp { + struct mlx5_core_rsc_common common; /* must be first */ + void (*event) (struct mlx5_core_qp *, int); + int qpn; + struct mlx5_rsc_debug *dbg; + int pid; +}; + +struct mlx5_core_dct { + struct mlx5_core_qp mqp; + struct completion drained; +}; + +struct mlx5_qp_path { + u8 fl_free_ar; + u8 rsvd3; + __be16 pkey_index; + u8 rsvd0; + u8 grh_mlid; + __be16 rlid; + u8 ackto_lt; + u8 mgid_index; + u8 static_rate; + u8 hop_limit; + __be32 tclass_flowlabel; + union { + u8 rgid[16]; + u8 rip[16]; + }; + u8 f_dscp_ecn_prio; + u8 ecn_dscp; + __be16 udp_sport; + u8 dci_cfi_prio_sl; + u8 port; + u8 rmac[6]; +}; + +/* FIXME: use mlx5_ifc.h qpc */ +struct mlx5_qp_context { + __be32 flags; + __be32 flags_pd; + u8 mtu_msgmax; + u8 rq_size_stride; + __be16 sq_crq_size; + __be32 qp_counter_set_usr_page; + __be32 wire_qpn; + __be32 log_pg_sz_remote_qpn; + struct mlx5_qp_path pri_path; + struct mlx5_qp_path alt_path; + __be32 params1; + u8 reserved2[4]; + __be32 next_send_psn; + __be32 cqn_send; + __be32 deth_sqpn; + u8 reserved3[4]; + __be32 last_acked_psn; + __be32 ssn; + __be32 params2; + __be32 
rnr_nextrecvpsn; + __be32 xrcd; + __be32 cqn_recv; + __be64 db_rec_addr; + __be32 qkey; + __be32 rq_type_srqn; + __be32 rmsn; + __be16 hw_sq_wqe_counter; + __be16 sw_sq_wqe_counter; + __be16 hw_rcyclic_byte_counter; + __be16 hw_rq_counter; + __be16 sw_rcyclic_byte_counter; + __be16 sw_rq_counter; + u8 rsvd0[5]; + u8 cgs; + u8 cs_req; + u8 cs_res; + __be64 dc_access_key; + u8 rsvd1[24]; +}; + +static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u32 qpn) +{ + return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); +} + +static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key) +{ + return radix_tree_lookup(&dev->priv.mkey_table.tree, key); +} + +int mlx5_core_create_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *qp, + u32 *in, int inlen); +int mlx5_core_create_qp(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp, + u32 *in, + int inlen); +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, + u32 opt_param_mask, void *qpc, + struct mlx5_core_qp *qp); +int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp); +int mlx5_core_destroy_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct); +int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, + u32 *out, int outlen); +int mlx5_core_dct_query(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen); + +int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, + u32 timeout_usec); + +int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn); +int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn); +void mlx5_init_qp_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); +int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); +void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); +int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq); +void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *rq); +int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq); +void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *sq); +int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id); +int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id); +int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id, + int reset, void *out, int out_size); + +static inline const char *mlx5_qp_type_str(int type) +{ + switch (type) { + case MLX5_QP_ST_RC: return "RC"; + case MLX5_QP_ST_UC: return "C"; + case MLX5_QP_ST_UD: return "UD"; + case MLX5_QP_ST_XRC: return "XRC"; + case MLX5_QP_ST_MLX: return "MLX"; + case MLX5_QP_ST_QP0: return "QP0"; + case MLX5_QP_ST_QP1: return "QP1"; + case MLX5_QP_ST_RAW_ETHERTYPE: return "RAW_ETHERTYPE"; + case MLX5_QP_ST_RAW_IPV6: return "RAW_IPV6"; + case MLX5_QP_ST_SNIFFER: return "SNIFFER"; + case MLX5_QP_ST_SYNC_UMR: return "SYNC_UMR"; + case MLX5_QP_ST_PTP_1588: return "PTP_1588"; + case MLX5_QP_ST_REG_UMR: return "REG_UMR"; + default: return "Invalid transport type"; + } +} + +static inline const char *mlx5_qp_state_str(int state) +{ + switch (state) { + case MLX5_QP_STATE_RST: + return "RST"; + case MLX5_QP_STATE_INIT: + return "INIT"; + case MLX5_QP_STATE_RTR: + return "RTR"; + case MLX5_QP_STATE_RTS: + return "RTS"; + case MLX5_QP_STATE_SQER: + return "SQER"; + case MLX5_QP_STATE_SQD: + return 
"SQD"; + case MLX5_QP_STATE_ERR: + return "ERR"; + case MLX5_QP_STATE_SQ_DRAINING: + return "SQ_DRAINING"; + case MLX5_QP_STATE_SUSPENDED: + return "SUSPENDED"; + default: return "Invalid QP state"; + } +} + +#endif /* MLX5_QP_H */ diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h new file mode 100644 index 0000000..24ff23e --- /dev/null +++ b/include/linux/mlx5/srq.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_SRQ_H +#define MLX5_SRQ_H + +#include + +enum { + MLX5_SRQ_FLAG_ERR = (1 << 0), + MLX5_SRQ_FLAG_WQ_SIG = (1 << 1), + MLX5_SRQ_FLAG_RNDV = (1 << 2), +}; + +struct mlx5_srq_attr { + u32 type; + u32 flags; + u32 log_size; + u32 wqe_shift; + u32 log_page_size; + u32 wqe_cnt; + u32 srqn; + u32 xrcd; + u32 page_offset; + u32 cqn; + u32 pd; + u32 lwm; + u32 user_index; + u64 db_record; + __be64 *pas; + u32 tm_log_list_size; + u32 tm_next_tag; + u32 tm_hw_phase_cnt; + u32 tm_sw_phase_cnt; +}; + +struct mlx5_core_dev; + +void mlx5_init_srq_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev); + +#endif /* MLX5_SRQ_H */ diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h new file mode 100644 index 0000000..83a33a1 --- /dev/null +++ b/include/linux/mlx5/transobj.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __TRANSOBJ_H__ +#define __TRANSOBJ_H__ + +#include + +int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn); +void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn); +int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rqn); +int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen); +void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn); +int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out); +int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *sqn); +int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen); +void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn); +int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out); +int mlx5_core_query_sq_state(struct mlx5_core_dev *dev, u32 sqn, u8 *state); +int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *tirn); +int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, + int inlen); +void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn); +int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *tisn); +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, + int inlen); +void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn); +int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rmpn); +int mlx5_core_modify_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen); +int mlx5_core_destroy_rmp(struct mlx5_core_dev *dev, u32 rmpn); +int mlx5_core_query_rmp(struct mlx5_core_dev *dev, u32 rmpn, u32 *out); +int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm); +int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rmpn); +int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 rmpn); +int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm); + +int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rqtn); +int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, + int inlen); +void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn); + +struct mlx5_hairpin_params { + u8 log_data_size; + u8 log_num_packets; + u16 q_counter; + int num_channels; +}; + +struct mlx5_hairpin { + struct mlx5_core_dev *func_mdev; + struct mlx5_core_dev *peer_mdev; + + int num_channels; + + u32 *rqn; + u32 *sqn; +}; + +struct mlx5_hairpin * +mlx5_core_hairpin_create(struct mlx5_core_dev *func_mdev, + struct mlx5_core_dev *peer_mdev, + struct mlx5_hairpin_params *params); + +void mlx5_core_hairpin_destroy(struct mlx5_hairpin *pair); +#endif /* __TRANSOBJ_H__ */ diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h new file mode 100644 index 
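Illustrative sketch (editor's addition, not part of the patch): driving the hairpin API declared in transobj.h above. The parameter values and the setup_hairpin() wrapper are made up for the example, and the ERR_PTR() error convention is an assumption about the core implementation.

#include <linux/err.h>
#include <linux/mlx5/transobj.h>

/* Pair two mlx5 devices with a single hairpin channel (illustrative sizes). */
static struct mlx5_hairpin *setup_hairpin(struct mlx5_core_dev *func_mdev,
					  struct mlx5_core_dev *peer_mdev)
{
	struct mlx5_hairpin_params params = {
		.log_data_size   = 16,	/* 64 KiB of hairpin buffer */
		.log_num_packets = 8,
		.q_counter       = 0,
		.num_channels    = 1,
	};
	struct mlx5_hairpin *hp;

	hp = mlx5_core_hairpin_create(func_mdev, peer_mdev, &params);
	if (IS_ERR(hp))			/* assumed ERR_PTR() convention */
		return NULL;
	return hp;
}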
0000000..9208cb8 --- /dev/null +++ b/include/linux/mlx5/vport.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_VPORT_H__ +#define __MLX5_VPORT_H__ + +#include +#include + +enum { + MLX5_CAP_INLINE_MODE_L2, + MLX5_CAP_INLINE_MODE_VPORT_CONTEXT, + MLX5_CAP_INLINE_MODE_NOT_REQUIRED, +}; + +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); +u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport); +int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 state); +int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, u8 *addr); +int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, + u16 vport, u8 *min_inline); +void mlx5_query_min_inline(struct mlx5_core_dev *mdev, u8 *min_inline); +int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, + u16 vport, u8 min_inline); +int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, + u16 vport, u8 *addr); +int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); +int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); +int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, + u64 *system_image_guid); +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); +int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, + u32 vport, u64 node_guid); +int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, + u16 *qkey_viol_cntr); +int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, u16 vf_num, u16 gid_index, + union ib_gid *gid); +int mlx5_query_hca_vport_pkey(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, u16 vf_num, u16 pkey_index, + u16 *pkey); +int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, + u8 other_vport, u8 port_num, + u16 vf_num, + struct mlx5_hca_vport_context *rep); +int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev, + u64 *sys_image_guid); +int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, + u64 *node_guid); +int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, + 
u32 vport, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int *list_size); +int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int list_size); +int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, + u32 vport, + int *promisc_uc, + int *promisc_mc, + int *promisc_all); +int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, + int promisc_uc, + int promisc_mc, + int promisc_all); +int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, + u32 vport, + u16 vlans[], + int *size); +int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, + u16 vlans[], + int list_size); + +int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev); +int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev); +int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport, + u64 *rx_discard_vport_down, + u64 *tx_discard_vport_down); +int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, + int vf, u8 port_num, void *out, + size_t out_sz); +int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, + u8 other_vport, u8 port_num, + int vf, + struct mlx5_hca_vport_context *req); +int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable); +int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status); + +int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, + struct mlx5_core_dev *port_mdev); +int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev); +#endif /* __MLX5_VPORT_H__ */ diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h new file mode 100644 index 0000000..a72fd04 --- /dev/null +++ b/include/linux/nvme-rdma.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
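Illustrative sketch (editor's addition, not part of the patch): querying the NIC vport promiscuous settings with the accessors declared in vport.h above. Vport 0 addresses the function's own vport, and log_own_vport_promisc() is a hypothetical caller.

#include <linux/printk.h>
#include <linux/mlx5/vport.h>

/* Log the current promiscuous configuration of the function's own NIC vport. */
static int log_own_vport_promisc(struct mlx5_core_dev *mdev)
{
	int uc, mc, all;
	int err;

	err = mlx5_query_nic_vport_promisc(mdev, 0, &uc, &mc, &all);
	if (err)
		return err;

	pr_debug("promisc: uc=%d mc=%d all=%d\n", uc, mc, all);
	return 0;
}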
+ */ + +#ifndef _LINUX_NVME_RDMA_H +#define _LINUX_NVME_RDMA_H + +enum nvme_rdma_cm_fmt { + NVME_RDMA_CM_FMT_1_0 = 0x0, +}; + +enum nvme_rdma_cm_status { + NVME_RDMA_CM_INVALID_LEN = 0x01, + NVME_RDMA_CM_INVALID_RECFMT = 0x02, + NVME_RDMA_CM_INVALID_QID = 0x03, + NVME_RDMA_CM_INVALID_HSQSIZE = 0x04, + NVME_RDMA_CM_INVALID_HRQSIZE = 0x05, + NVME_RDMA_CM_NO_RSC = 0x06, + NVME_RDMA_CM_INVALID_IRD = 0x07, + NVME_RDMA_CM_INVALID_ORD = 0x08, +}; + +static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) +{ + switch (status) { + case NVME_RDMA_CM_INVALID_LEN: + return "invalid length"; + case NVME_RDMA_CM_INVALID_RECFMT: + return "invalid record format"; + case NVME_RDMA_CM_INVALID_QID: + return "invalid queue ID"; + case NVME_RDMA_CM_INVALID_HSQSIZE: + return "invalid host SQ size"; + case NVME_RDMA_CM_INVALID_HRQSIZE: + return "invalid host RQ size"; + case NVME_RDMA_CM_NO_RSC: + return "resource not found"; + case NVME_RDMA_CM_INVALID_IRD: + return "invalid IRD"; + case NVME_RDMA_CM_INVALID_ORD: + return "Invalid ORD"; + default: + return "unrecognized reason"; + } +} + +/** + * struct nvme_rdma_cm_req - rdma connect request + * + * @recfmt: format of the RDMA Private Data + * @qid: queue Identifier for the Admin or I/O Queue + * @hrqsize: host receive queue size to be created + * @hsqsize: host send queue size to be created + */ +struct nvme_rdma_cm_req { + __le16 recfmt; + __le16 qid; + __le16 hrqsize; + __le16 hsqsize; + u8 rsvd[24]; +}; + +/** + * struct nvme_rdma_cm_rep - rdma connect reply + * + * @recfmt: format of the RDMA Private Data + * @crqsize: controller receive queue size + */ +struct nvme_rdma_cm_rep { + __le16 recfmt; + __le16 crqsize; + u8 rsvd[28]; +}; + +/** + * struct nvme_rdma_cm_rej - rdma connect reject + * + * @recfmt: format of the RDMA Private Data + * @fsts: error status for the associated connect request + */ +struct nvme_rdma_cm_rej { + __le16 recfmt; + __le16 sts; +}; + +#endif /* _LINUX_NVME_RDMA_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h new file mode 100644 index 0000000..4112e2b --- /dev/null +++ b/include/linux/nvme.h @@ -0,0 +1,1196 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
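Illustrative sketch (editor's addition, not part of the patch): filling the connect-request private data defined in nvme-rdma.h above. fill_nvme_rdma_cm_req() is hypothetical, and the 0's-based encoding of hsqsize is an assumption about how the host side populates that field.

#include <linux/types.h>
#include <asm/byteorder.h>
#include <linux/nvme-rdma.h>

/* Build the RDMA CM private data carried with an NVMe/RDMA connect request. */
static void fill_nvme_rdma_cm_req(struct nvme_rdma_cm_req *req,
				  u16 qid, u16 queue_size)
{
	*req = (struct nvme_rdma_cm_req) {
		.recfmt  = cpu_to_le16(NVME_RDMA_CM_FMT_1_0),
		.qid     = cpu_to_le16(qid),
		.hrqsize = cpu_to_le16(queue_size),
		.hsqsize = cpu_to_le16(queue_size - 1),	/* assumed 0's-based */
	};
}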
+ */ + +#ifndef _LINUX_NVME_H +#define _LINUX_NVME_H + +#include +#include + +/* NQN names in commands fields specified one size */ +#define NVMF_NQN_FIELD_LEN 256 + +/* However the max length of a qualified name is another size */ +#define NVMF_NQN_SIZE 223 + +#define NVMF_TRSVCID_SIZE 32 +#define NVMF_TRADDR_SIZE 256 +#define NVMF_TSAS_SIZE 256 + +#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" + +#define NVME_RDMA_IP_PORT 4420 + +#define NVME_NSID_ALL 0xffffffff + +enum nvme_subsys_type { + NVME_NQN_DISC = 1, /* Discovery type target subsystem */ + NVME_NQN_NVME = 2, /* NVME type target subsystem */ +}; + +/* Address Family codes for Discovery Log Page entry ADRFAM field */ +enum { + NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ + NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */ + NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */ + NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */ + NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */ +}; + +/* Transport Type codes for Discovery Log Page entry TRTYPE field */ +enum { + NVMF_TRTYPE_RDMA = 1, /* RDMA */ + NVMF_TRTYPE_FC = 2, /* Fibre Channel */ + NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */ + NVMF_TRTYPE_MAX, +}; + +/* Transport Requirements codes for Discovery Log Page entry TREQ field */ +enum { + NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ + NVMF_TREQ_REQUIRED = 1, /* Required */ + NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 2, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 3, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 4, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 5, /* IWARP */ +}; + +/* RDMA Connection Management Service Type codes for Discovery Log Page + * entry TSAS RDMA_CMS field + */ +enum { + NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ +}; + +#define NVME_AQ_DEPTH 32 +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) + +/* + * Subtract one to leave an empty queue entry for 'Full Queue' condition. See + * NVM-Express 1.2 specification, section 4.1.2. 
+ */ +#define NVME_AQ_MQ_TAG_DEPTH (NVME_AQ_BLK_MQ_DEPTH - 1) + +enum { + NVME_REG_CAP = 0x0000, /* Controller Capabilities */ + NVME_REG_VS = 0x0008, /* Version */ + NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Clear */ + NVME_REG_CC = 0x0014, /* Controller Configuration */ + NVME_REG_CSTS = 0x001c, /* Controller Status */ + NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ + NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ + NVME_REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */ +}; + +#define NVME_CAP_MQES(cap) ((cap) & 0xffff) +#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) +#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) +#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) +#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) +#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) + +#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) +#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) + +enum { + NVME_CMBSZ_SQS = 1 << 0, + NVME_CMBSZ_CQS = 1 << 1, + NVME_CMBSZ_LISTS = 1 << 2, + NVME_CMBSZ_RDS = 1 << 3, + NVME_CMBSZ_WDS = 1 << 4, + + NVME_CMBSZ_SZ_SHIFT = 12, + NVME_CMBSZ_SZ_MASK = 0xfffff, + + NVME_CMBSZ_SZU_SHIFT = 8, + NVME_CMBSZ_SZU_MASK = 0xf, +}; + +/* + * Submission and Completion Queue Entry Sizes for the NVM command set. + * (In bytes and specified as a power of two (2^n)). + */ +#define NVME_NVM_IOSQES 6 +#define NVME_NVM_IOCQES 4 + +enum { + NVME_CC_ENABLE = 1 << 0, + NVME_CC_CSS_NVM = 0 << 4, + NVME_CC_EN_SHIFT = 0, + NVME_CC_CSS_SHIFT = 4, + NVME_CC_MPS_SHIFT = 7, + NVME_CC_AMS_SHIFT = 11, + NVME_CC_SHN_SHIFT = 14, + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CSTS_RDY = 1 << 0, + NVME_CSTS_CFS = 1 << 1, + NVME_CSTS_NSSRO = 1 << 4, + NVME_CSTS_PP = 1 << 5, + NVME_CSTS_SHST_NORMAL = 0 << 2, + NVME_CSTS_SHST_OCCUR = 1 << 2, + NVME_CSTS_SHST_CMPLT = 2 << 2, + NVME_CSTS_SHST_MASK = 3 << 2, +}; + +struct nvme_id_power_state { + __le16 max_power; /* centiwatts */ + __u8 rsvd2; + __u8 flags; + __le32 entry_lat; /* microseconds */ + __le32 exit_lat; /* microseconds */ + __u8 read_tput; + __u8 read_lat; + __u8 write_tput; + __u8 write_lat; + __le16 idle_power; + __u8 idle_scale; + __u8 rsvd19; + __le16 active_power; + __u8 active_work_scale; + __u8 rsvd23[9]; +}; + +enum { + NVME_PS_FLAGS_MAX_POWER_SCALE = 1 << 0, + NVME_PS_FLAGS_NON_OP_STATE = 1 << 1, +}; + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __u8 rab; + __u8 ieee[3]; + __u8 cmic; + __u8 mdts; + __le16 cntlid; + __le32 ver; + __le32 rtd3r; + __le32 rtd3e; + __le32 oaes; + __le32 ctratt; + __u8 rsvd100[156]; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 avscc; + __u8 apsta; + __le16 wctemp; + __le16 cctemp; + __le16 mtfa; + __le32 hmpre; + __le32 hmmin; + __u8 tnvmcap[16]; + __u8 unvmcap[16]; 
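Worked example (editor's addition, not part of the patch): decoding a raw 64-bit CAP register value with the field macros above. MQES is 0's-based, TO is in units of 500 ms, and DSTRD selects a power-of-two doorbell stride; nvme_decode_cap() is a hypothetical helper.

#include <linux/types.h>
#include <linux/printk.h>
#include <linux/nvme.h>

/* Print the CAP fields a PCIe NVMe driver typically consumes. */
static void nvme_decode_cap(u64 cap)
{
	pr_debug("max queue entries %llu, timeout %llu ms, doorbell stride %u bytes\n",
		 NVME_CAP_MQES(cap) + 1,	/* MQES is 0's-based */
		 NVME_CAP_TIMEOUT(cap) * 500,	/* units of 500 ms */
		 4U << NVME_CAP_STRIDE(cap));	/* stride = 4 << DSTRD */
}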
+ __le32 rpmbs; + __le16 edstt; + __u8 dsto; + __u8 fwug; + __le16 kas; + __le16 hctma; + __le16 mntmt; + __le16 mxtmt; + __le32 sanicap; + __le32 hmminds; + __le16 hmmaxd; + __u8 rsvd338[174]; + __u8 sqes; + __u8 cqes; + __le16 maxcmd; + __le32 nn; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 nvscc; + __u8 rsvd531; + __le16 acwu; + __u8 rsvd534[2]; + __le32 sgls; + __u8 rsvd540[228]; + char subnqn[256]; + __u8 rsvd1024[768]; + __le32 ioccsz; + __le32 iorcsz; + __le16 icdoff; + __u8 ctrattr; + __u8 msdbd; + __u8 rsvd1804[244]; + struct nvme_id_power_state psd[32]; + __u8 vs[1024]; +}; + +enum { + NVME_CTRL_ONCS_COMPARE = 1 << 0, + NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, + NVME_CTRL_ONCS_DSM = 1 << 2, + NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, + NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, + NVME_CTRL_VWC_PRESENT = 1 << 0, + NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_DIRECTIVES = 1 << 5, + NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, + NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, +}; + +struct nvme_lbaf { + __le16 ms; + __u8 ds; + __u8 rp; +}; + +struct nvme_id_ns { + __le64 nsze; + __le64 ncap; + __le64 nuse; + __u8 nsfeat; + __u8 nlbaf; + __u8 flbas; + __u8 mc; + __u8 dpc; + __u8 dps; + __u8 nmic; + __u8 rescap; + __u8 fpi; + __u8 rsvd33; + __le16 nawun; + __le16 nawupf; + __le16 nacwu; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __le16 noiob; + __u8 nvmcap[16]; + __u8 rsvd64[40]; + __u8 nguid[16]; + __u8 eui64[8]; + struct nvme_lbaf lbaf[16]; + __u8 rsvd192[192]; + __u8 vs[3712]; +}; + +enum { + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESC_LIST = 0x03, + NVME_ID_CNS_NS_PRESENT_LIST = 0x10, + NVME_ID_CNS_NS_PRESENT = 0x11, + NVME_ID_CNS_CTRL_NS_LIST = 0x12, + NVME_ID_CNS_CTRL_LIST = 0x13, +}; + +enum { + NVME_DIR_IDENTIFY = 0x00, + NVME_DIR_STREAMS = 0x01, + NVME_DIR_SND_ID_OP_ENABLE = 0x01, + NVME_DIR_SND_ST_OP_REL_ID = 0x01, + NVME_DIR_SND_ST_OP_REL_RSC = 0x02, + NVME_DIR_RCV_ID_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_STATUS = 0x02, + NVME_DIR_RCV_ST_OP_RESOURCE = 0x03, + NVME_DIR_ENDIR = 0x01, +}; + +enum { + NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FLBAS_LBA_MASK = 0xf, + NVME_NS_FLBAS_META_EXT = 0x10, + NVME_LBAF_RP_BEST = 0, + NVME_LBAF_RP_BETTER = 1, + NVME_LBAF_RP_GOOD = 2, + NVME_LBAF_RP_DEGRADED = 3, + NVME_NS_DPC_PI_LAST = 1 << 4, + NVME_NS_DPC_PI_FIRST = 1 << 3, + NVME_NS_DPC_PI_TYPE3 = 1 << 2, + NVME_NS_DPC_PI_TYPE2 = 1 << 1, + NVME_NS_DPC_PI_TYPE1 = 1 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, + NVME_NS_DPS_PI_MASK = 0x7, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 = 3, +}; + +struct nvme_ns_id_desc { + __u8 nidt; + __u8 nidl; + __le16 reserved; +}; + +#define NVME_NIDT_EUI64_LEN 8 +#define NVME_NIDT_NGUID_LEN 16 +#define NVME_NIDT_UUID_LEN 16 + +enum { + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, +}; + +struct nvme_smart_log { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_reads[16]; + __u8 host_writes[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __le32 warning_temp_time; + __le32 critical_comp_time; + __le16 temp_sensor[8]; + __u8 rsvd216[296]; +}; + +struct nvme_fw_slot_info_log { + __u8 afi; + __u8 
rsvd1[7]; + __le64 frs[7]; + __u8 rsvd64[448]; +}; + +enum { + NVME_CMD_EFFECTS_CSUPP = 1 << 0, + NVME_CMD_EFFECTS_LBCC = 1 << 1, + NVME_CMD_EFFECTS_NCC = 1 << 2, + NVME_CMD_EFFECTS_NIC = 1 << 3, + NVME_CMD_EFFECTS_CCC = 1 << 4, + NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, +}; + +struct nvme_effects_log { + __le32 acs[256]; + __le32 iocs[256]; + __u8 resv[2048]; +}; + +enum { + NVME_SMART_CRIT_SPARE = 1 << 0, + NVME_SMART_CRIT_TEMPERATURE = 1 << 1, + NVME_SMART_CRIT_RELIABILITY = 1 << 2, + NVME_SMART_CRIT_MEDIA = 1 << 3, + NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, +}; + +enum { + NVME_AER_ERROR = 0, + NVME_AER_SMART = 1, + NVME_AER_CSS = 6, + NVME_AER_VS = 7, + NVME_AER_NOTICE_NS_CHANGED = 0x0002, + NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, +}; + +struct nvme_lba_range_type { + __u8 type; + __u8 attributes; + __u8 rsvd2[14]; + __u64 slba; + __u64 nlb; + __u8 guid[16]; + __u8 rsvd48[16]; +}; + +enum { + NVME_LBART_TYPE_FS = 0x01, + NVME_LBART_TYPE_RAID = 0x02, + NVME_LBART_TYPE_CACHE = 0x03, + NVME_LBART_TYPE_SWAP = 0x04, + + NVME_LBART_ATTRIB_TEMP = 1 << 0, + NVME_LBART_ATTRIB_HIDE = 1 << 1, +}; + +struct nvme_reservation_status { + __le32 gen; + __u8 rtype; + __u8 regctl[2]; + __u8 resv5[2]; + __u8 ptpls; + __u8 resv10[13]; + struct { + __le16 cntlid; + __u8 rcsts; + __u8 resv3[5]; + __le64 hostid; + __le64 rkey; + } regctl_ds[]; +}; + +enum nvme_async_event_type { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, +}; + +/* I/O commands */ + +enum nvme_opcode { + nvme_cmd_flush = 0x00, + nvme_cmd_write = 0x01, + nvme_cmd_read = 0x02, + nvme_cmd_write_uncor = 0x04, + nvme_cmd_compare = 0x05, + nvme_cmd_write_zeroes = 0x08, + nvme_cmd_dsm = 0x09, + nvme_cmd_resv_register = 0x0d, + nvme_cmd_resv_report = 0x0e, + nvme_cmd_resv_acquire = 0x11, + nvme_cmd_resv_release = 0x15, +}; + +/* + * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier + * + * @NVME_SGL_FMT_ADDRESS: absolute address of the data block + * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_TRANSPORT_A: transport defined format, value 0xA + * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation + * request subtype + */ +enum { + NVME_SGL_FMT_ADDRESS = 0x00, + NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_TRANSPORT_A = 0x0A, + NVME_SGL_FMT_INVALIDATE = 0x0f, +}; + +/* + * Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier + * + * For struct nvme_sgl_desc: + * @NVME_SGL_FMT_DATA_DESC: data block descriptor + * @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor + * @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor + * + * For struct nvme_keyed_sgl_desc: + * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + * + * Transport-specific SGL types: + * @NVME_TRANSPORT_SGL_DATA_DESC: Transport SGL data dlock descriptor + */ +enum { + NVME_SGL_FMT_DATA_DESC = 0x00, + NVME_SGL_FMT_SEG_DESC = 0x02, + NVME_SGL_FMT_LAST_SEG_DESC = 0x03, + NVME_KEY_SGL_FMT_DATA_DESC = 0x04, + NVME_TRANSPORT_SGL_DATA_DESC = 0x05, +}; + +struct nvme_sgl_desc { + __le64 addr; + __le32 length; + __u8 rsvd[3]; + __u8 type; +}; + +struct nvme_keyed_sgl_desc { + __le64 addr; + __u8 length[3]; + __u8 key[4]; + __u8 type; +}; + +union nvme_data_ptr { + struct { + __le64 prp1; + __le64 prp2; + }; + struct nvme_sgl_desc sgl; + struct nvme_keyed_sgl_desc ksgl; +}; + +/* + * Lowest two bits of our flags field (FUSE field in the spec): + * + * @NVME_CMD_FUSE_FIRST: Fused Operation, first command + * @NVME_CMD_FUSE_SECOND: Fused Operation, second command + 
* + * Highest two bits in our flags field (PSDT field in the spec): + * + * @NVME_CMD_PSDT_SGL_METABUF: Use SGLS for this transfer, + * If used, MPTR contains addr of single physical buffer (byte aligned). + * @NVME_CMD_PSDT_SGL_METASEG: Use SGLS for this transfer, + * If used, MPTR contains an address of an SGL segment containing + * exactly 1 SGL descriptor (qword aligned). + */ +enum { + NVME_CMD_FUSE_FIRST = (1 << 0), + NVME_CMD_FUSE_SECOND = (1 << 1), + + NVME_CMD_SGL_METABUF = (1 << 6), + NVME_CMD_SGL_METASEG = (1 << 7), + NVME_CMD_SGL_ALL = NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG, +}; + +struct nvme_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + union nvme_data_ptr dptr; + __le32 cdw10[6]; +}; + +struct nvme_rw_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRACT = 1 << 13, + NVME_RW_DTYPE_STREAMS = 1 << 4, +}; + +struct nvme_dsm_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 nr; + __le32 attributes; + __u32 rsvd12[4]; +}; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +#define NVME_DSM_MAX_RANGES 256 + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + +struct nvme_write_zeroes_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +/* Features */ + +struct nvme_feat_auto_pst { + __le64 entries[32]; +}; + +enum { + NVME_HOST_MEM_ENABLE = (1 << 0), + NVME_HOST_MEM_RETURN = (1 << 1), +}; + +/* Admin commands */ + +enum nvme_admin_opcode { + nvme_admin_delete_sq = 0x00, + nvme_admin_create_sq = 0x01, + nvme_admin_get_log_page = 0x02, + nvme_admin_delete_cq = 0x04, + nvme_admin_create_cq = 0x05, + nvme_admin_identify = 0x06, + nvme_admin_abort_cmd = 0x08, + nvme_admin_set_features = 0x09, + nvme_admin_get_features = 0x0a, + nvme_admin_async_event = 0x0c, + nvme_admin_ns_mgmt = 0x0d, + nvme_admin_activate_fw = 0x10, + nvme_admin_download_fw = 0x11, + nvme_admin_ns_attach = 0x15, + nvme_admin_keep_alive = 0x18, + nvme_admin_directive_send = 0x19, + nvme_admin_directive_recv = 0x1a, + nvme_admin_dbbuf = 0x7C, + nvme_admin_format_nvm = 0x80, + nvme_admin_security_send = 0x81, + nvme_admin_security_recv = 0x82, + nvme_admin_sanitize_nvm = 0x84, +}; + +enum { + NVME_QUEUE_PHYS_CONTIG = (1 << 0), + NVME_CQ_IRQ_ENABLED = (1 << 1), + NVME_SQ_PRIO_URGENT = (0 << 1), + NVME_SQ_PRIO_HIGH = (1 << 1), 
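Illustrative sketch (editor's addition, not part of the patch): filling an NVM read command with the nvme_rw_command layout defined above. nvme_init_read() is hypothetical; the length field is 0's-based per the NVMe specification.

#include <linux/types.h>
#include <asm/byteorder.h>
#include <linux/nvme.h>

/* Fill a read of (nlb + 1) logical blocks starting at slba. */
static void nvme_init_read(struct nvme_rw_command *rw, u32 nsid,
			   u64 slba, u16 nlb)
{
	*rw = (struct nvme_rw_command) {
		.opcode = nvme_cmd_read,
		.nsid   = cpu_to_le32(nsid),
		.slba   = cpu_to_le64(slba),
		.length = cpu_to_le16(nlb),	/* 0's-based block count */
	};
}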
+ NVME_SQ_PRIO_MEDIUM = (2 << 1), + NVME_SQ_PRIO_LOW = (3 << 1), + NVME_FEAT_ARBITRATION = 0x01, + NVME_FEAT_POWER_MGMT = 0x02, + NVME_FEAT_LBA_RANGE = 0x03, + NVME_FEAT_TEMP_THRESH = 0x04, + NVME_FEAT_ERR_RECOVERY = 0x05, + NVME_FEAT_VOLATILE_WC = 0x06, + NVME_FEAT_NUM_QUEUES = 0x07, + NVME_FEAT_IRQ_COALESCE = 0x08, + NVME_FEAT_IRQ_CONFIG = 0x09, + NVME_FEAT_WRITE_ATOMIC = 0x0a, + NVME_FEAT_ASYNC_EVENT = 0x0b, + NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_HOST_MEM_BUF = 0x0d, + NVME_FEAT_TIMESTAMP = 0x0e, + NVME_FEAT_KATO = 0x0f, + NVME_FEAT_SW_PROGRESS = 0x80, + NVME_FEAT_HOST_ID = 0x81, + NVME_FEAT_RESV_MASK = 0x82, + NVME_FEAT_RESV_PERSIST = 0x83, + NVME_LOG_ERROR = 0x01, + NVME_LOG_SMART = 0x02, + NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_CMD_EFFECTS = 0x05, + NVME_LOG_DISC = 0x70, + NVME_LOG_RESERVATION = 0x80, + NVME_FWACT_REPL = (0 << 3), + NVME_FWACT_REPL_ACTV = (1 << 3), + NVME_FWACT_ACTV = (2 << 3), +}; + +struct nvme_identify { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 cns; + __u8 rsvd3; + __le16 ctrlid; + __u32 rsvd11[5]; +}; + +#define NVME_IDENTIFY_DATA_SIZE 4096 + +struct nvme_features { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 fid; + __le32 dword11; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct nvme_host_mem_buf_desc { + __le64 addr; + __le32 size; + __u32 rsvd; +}; + +struct nvme_create_cq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 cqid; + __le16 qsize; + __le16 cq_flags; + __le16 irq_vector; + __u32 rsvd12[4]; +}; + +struct nvme_create_sq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 sqid; + __le16 qsize; + __le16 sq_flags; + __le16 cqid; + __u32 rsvd12[4]; +}; + +struct nvme_delete_queue { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 qid; + __u16 rsvd10; + __u32 rsvd11[5]; +}; + +struct nvme_abort_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 sqid; + __u16 cid; + __u32 rsvd11[5]; +}; + +struct nvme_download_firmware { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + union nvme_data_ptr dptr; + __le32 numd; + __le32 offset; + __u32 rsvd12[4]; +}; + +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + +struct nvme_get_log_page_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 lid; + __u8 rsvd10; + __le16 numdl; + __le16 numdu; + __u16 rsvd11; + __le32 lpol; + __le32 lpou; + __u32 rsvd14[2]; +}; + +struct nvme_directive_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 numd; + __u8 doper; + __u8 dtype; + __le16 dspec; + __u8 endir; + __u8 tdtype; + __u16 rsvd15; + + __u32 rsvd16[3]; +}; + +/* + * Fabrics subcommands. + */ +enum nvmf_fabrics_opcode { + nvme_fabrics_command = 0x7f, +}; + +enum nvmf_capsule_command { + nvme_fabrics_type_property_set = 0x00, + nvme_fabrics_type_connect = 0x01, + nvme_fabrics_type_property_get = 0x04, +}; + +struct nvmf_common_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 ts[24]; +}; + +/* + * The legal cntlid range a NVMe Target will provide. 
+ * Note that cntlid of value 0 is considered illegal in the fabrics world. + * Devices based on earlier specs did not have the subsystem concept; + * therefore, those devices had their cntlid value set to 0 as a result. + */ +#define NVME_CNTLID_MIN 1 +#define NVME_CNTLID_MAX 0xffef +#define NVME_CNTLID_DYNAMIC 0xffff + +#define MAX_DISC_LOGS 255 + +/* Discovery log page entry */ +struct nvmf_disc_rsp_page_entry { + __u8 trtype; + __u8 adrfam; + __u8 subtype; + __u8 treq; + __le16 portid; + __le16 cntlid; + __le16 asqsz; + __u8 resv8[22]; + char trsvcid[NVMF_TRSVCID_SIZE]; + __u8 resv64[192]; + char subnqn[NVMF_NQN_FIELD_LEN]; + char traddr[NVMF_TRADDR_SIZE]; + union tsas { + char common[NVMF_TSAS_SIZE]; + struct rdma { + __u8 qptype; + __u8 prtype; + __u8 cms; + __u8 resv3[5]; + __u16 pkey; + __u8 resv10[246]; + } rdma; + } tsas; +}; + +/* Discovery log page header */ +struct nvmf_disc_rsp_page_hdr { + __le64 genctr; + __le64 numrec; + __le16 recfmt; + __u8 resv14[1006]; + struct nvmf_disc_rsp_page_entry entries[0]; +}; + +struct nvmf_connect_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __le16 recfmt; + __le16 qid; + __le16 sqsize; + __u8 cattr; + __u8 resv3; + __le32 kato; + __u8 resv4[12]; +}; + +struct nvmf_connect_data { + uuid_t hostid; + __le16 cntlid; + char resv4[238]; + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; + char resv5[256]; +}; + +struct nvmf_property_set_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __le64 value; + __u8 resv4[8]; +}; + +struct nvmf_property_get_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __u8 resv4[16]; +}; + +struct nvme_dbbuf { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + __u32 rsvd12[6]; +}; + +struct streams_directive_params { + __le16 msl; + __le16 nssa; + __le16 nsso; + __u8 rsvd[10]; + __le32 sws; + __le16 sgs; + __le16 nsa; + __le16 nso; + __u8 rsvd2[6]; +}; + +struct nvme_command { + union { + struct nvme_common_command common; + struct nvme_rw_command rw; + struct nvme_identify identify; + struct nvme_features features; + struct nvme_create_cq create_cq; + struct nvme_create_sq create_sq; + struct nvme_delete_queue delete_queue; + struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; + struct nvme_dsm_cmd dsm; + struct nvme_write_zeroes_cmd write_zeroes; + struct nvme_abort_cmd abort; + struct nvme_get_log_page_command get_log_page; + struct nvmf_common_command fabrics; + struct nvmf_connect_command connect; + struct nvmf_property_set_command prop_set; + struct nvmf_property_get_command prop_get; + struct nvme_dbbuf dbbuf; + struct nvme_directive_cmd directive; + }; +}; + +static inline bool nvme_is_write(struct nvme_command *cmd) +{ + /* + * What a mess... + * + * Why can't we simply have a Fabrics In and Fabrics out command? 
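Minimal sketch (editor's addition, not part of the patch): building an Identify Controller admin command from the definitions above. nvme_init_identify_ctrl() is hypothetical; the caller is assumed to map an NVME_IDENTIFY_DATA_SIZE (4 KiB) buffer into the command's data pointer separately.

#include <linux/nvme.h>

/* Fill an Identify Controller (CNS 01h) admin command. */
static void nvme_init_identify_ctrl(struct nvme_command *cmd)
{
	*cmd = (struct nvme_command) {
		.identify = {
			.opcode = nvme_admin_identify,
			.cns    = NVME_ID_CNS_CTRL,
		},
	};
}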
+ */ + if (unlikely(cmd->common.opcode == nvme_fabrics_command)) + return cmd->fabrics.fctype & 1; + return cmd->common.opcode & 1; +} + +enum { + /* + * Generic Command Status: + */ + NVME_SC_SUCCESS = 0x0, + NVME_SC_INVALID_OPCODE = 0x1, + NVME_SC_INVALID_FIELD = 0x2, + NVME_SC_CMDID_CONFLICT = 0x3, + NVME_SC_DATA_XFER_ERROR = 0x4, + NVME_SC_POWER_LOSS = 0x5, + NVME_SC_INTERNAL = 0x6, + NVME_SC_ABORT_REQ = 0x7, + NVME_SC_ABORT_QUEUE = 0x8, + NVME_SC_FUSED_FAIL = 0x9, + NVME_SC_FUSED_MISSING = 0xa, + NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_SGL_INVALID_LAST = 0xd, + NVME_SC_SGL_INVALID_COUNT = 0xe, + NVME_SC_SGL_INVALID_DATA = 0xf, + NVME_SC_SGL_INVALID_METADATA = 0x10, + NVME_SC_SGL_INVALID_TYPE = 0x11, + + NVME_SC_SGL_INVALID_OFFSET = 0x16, + NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + + NVME_SC_LBA_RANGE = 0x80, + NVME_SC_CAP_EXCEEDED = 0x81, + NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_RESERVATION_CONFLICT = 0x83, + + /* + * Command Specific Status: + */ + NVME_SC_CQ_INVALID = 0x100, + NVME_SC_QID_INVALID = 0x101, + NVME_SC_QUEUE_SIZE = 0x102, + NVME_SC_ABORT_LIMIT = 0x103, + NVME_SC_ABORT_MISSING = 0x104, + NVME_SC_ASYNC_LIMIT = 0x105, + NVME_SC_FIRMWARE_SLOT = 0x106, + NVME_SC_FIRMWARE_IMAGE = 0x107, + NVME_SC_INVALID_VECTOR = 0x108, + NVME_SC_INVALID_LOG_PAGE = 0x109, + NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_FW_NEEDS_CONV_RESET = 0x10b, + NVME_SC_INVALID_QUEUE = 0x10c, + NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d, + NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, + NVME_SC_FEATURE_NOT_PER_NS = 0x10f, + NVME_SC_FW_NEEDS_SUBSYS_RESET = 0x110, + NVME_SC_FW_NEEDS_RESET = 0x111, + NVME_SC_FW_NEEDS_MAX_TIME = 0x112, + NVME_SC_FW_ACIVATE_PROHIBITED = 0x113, + NVME_SC_OVERLAPPING_RANGE = 0x114, + NVME_SC_NS_INSUFFICENT_CAP = 0x115, + NVME_SC_NS_ID_UNAVAILABLE = 0x116, + NVME_SC_NS_ALREADY_ATTACHED = 0x118, + NVME_SC_NS_IS_PRIVATE = 0x119, + NVME_SC_NS_NOT_ATTACHED = 0x11a, + NVME_SC_THIN_PROV_NOT_SUPP = 0x11b, + NVME_SC_CTRL_LIST_INVALID = 0x11c, + + /* + * I/O Command Set Specific - NVM commands: + */ + NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_INVALID_PI = 0x181, + NVME_SC_READ_ONLY = 0x182, + NVME_SC_ONCS_NOT_SUPPORTED = 0x183, + + /* + * I/O Command Set Specific - Fabrics commands: + */ + NVME_SC_CONNECT_FORMAT = 0x180, + NVME_SC_CONNECT_CTRL_BUSY = 0x181, + NVME_SC_CONNECT_INVALID_PARAM = 0x182, + NVME_SC_CONNECT_RESTART_DISC = 0x183, + NVME_SC_CONNECT_INVALID_HOST = 0x184, + + NVME_SC_DISCOVERY_RESTART = 0x190, + NVME_SC_AUTH_REQUIRED = 0x191, + + /* + * Media and Data Integrity Errors: + */ + NVME_SC_WRITE_FAULT = 0x280, + NVME_SC_READ_ERROR = 0x281, + NVME_SC_GUARD_CHECK = 0x282, + NVME_SC_APPTAG_CHECK = 0x283, + NVME_SC_REFTAG_CHECK = 0x284, + NVME_SC_COMPARE_FAILED = 0x285, + NVME_SC_ACCESS_DENIED = 0x286, + NVME_SC_UNWRITTEN_BLOCK = 0x287, + + NVME_SC_DNR = 0x4000, +}; + +struct nvme_completion { + /* + * Used by Admin and Fabrics commands to return data: + */ + union nvme_result { + __le16 u16; + __le32 u32; + __le64 u64; + } result; + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? 
*/ +}; + +#define NVME_VS(major, minor, tertiary) \ + (((major) << 16) | ((minor) << 8) | (tertiary)) + +#define NVME_MAJOR(ver) ((ver) >> 16) +#define NVME_MINOR(ver) (((ver) >> 8) & 0xff) +#define NVME_TERTIARY(ver) ((ver) & 0xff) + +#endif /* _LINUX_NVME_H */ diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h new file mode 100644 index 0000000..13c8ab1 --- /dev/null +++ b/include/linux/qed/common_hsi.h @@ -0,0 +1,1391 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
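Small sketch (editor's addition, not part of the patch): the version macros that close nvme.h above pack (major, minor, tertiary) into the layout of the VS register, so version checks reduce to integer comparisons; nvme_ctrl_is_1_3_or_newer() is a hypothetical helper.

#include <linux/types.h>
#include <linux/nvme.h>

/* NVME_VS(1, 3, 0) evaluates to 0x010300. */
static bool nvme_ctrl_is_1_3_or_newer(u32 vs)
{
	return vs >= NVME_VS(1, 3, 0);
}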
+ */ + +#ifndef _COMMON_HSI_H +#define _COMMON_HSI_H + +#include +#include +#include +#include + +/* dma_addr_t manip */ +#define PTR_LO(x) ((u32)(((uintptr_t)(x)) & 0xffffffff)) +#define PTR_HI(x) ((u32)((((uintptr_t)(x)) >> 16) >> 16)) +#define DMA_LO_LE(x) cpu_to_le32(lower_32_bits(x)) +#define DMA_HI_LE(x) cpu_to_le32(upper_32_bits(x)) +#define DMA_REGPAIR_LE(x, val) do { \ + (x).hi = DMA_HI_LE((val)); \ + (x).lo = DMA_LO_LE((val)); \ + } while (0) + +#define HILO_GEN(hi, lo, type) ((((type)(hi)) << 32) + (lo)) +#define HILO_64(hi, lo) \ + HILO_GEN(le32_to_cpu(hi), le32_to_cpu(lo), u64) +#define HILO_64_REGPAIR(regpair) ({ \ + typeof(regpair) __regpair = (regpair); \ + HILO_64(__regpair.hi, __regpair.lo); }) +#define HILO_DMA_REGPAIR(regpair) ((dma_addr_t)HILO_64_REGPAIR(regpair)) + +#ifndef __COMMON_HSI__ +#define __COMMON_HSI__ + +/********************************/ +/* PROTOCOL COMMON FW CONSTANTS */ +/********************************/ + +#define X_FINAL_CLEANUP_AGG_INT 1 + +#define EVENT_RING_PAGE_SIZE_BYTES 4096 + +#define NUM_OF_GLOBAL_QUEUES 128 +#define COMMON_QUEUE_ENTRY_MAX_BYTE_SIZE 64 + +#define ISCSI_CDU_TASK_SEG_TYPE 0 +#define FCOE_CDU_TASK_SEG_TYPE 0 +#define RDMA_CDU_TASK_SEG_TYPE 1 + +#define FW_ASSERT_GENERAL_ATTN_IDX 32 + +#define MAX_PINNED_CCFC 32 + +/* Queue Zone sizes in bytes */ +#define TSTORM_QZONE_SIZE 8 +#define MSTORM_QZONE_SIZE 16 +#define USTORM_QZONE_SIZE 8 +#define XSTORM_QZONE_SIZE 8 +#define YSTORM_QZONE_SIZE 0 +#define PSTORM_QZONE_SIZE 0 + +#define MSTORM_VF_ZONE_DEFAULT_SIZE_LOG 7 +#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DEFAULT 16 +#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DOUBLE 48 +#define ETH_MAX_NUM_RX_QUEUES_PER_VF_QUAD 112 + +/********************************/ +/* CORE (LIGHT L2) FW CONSTANTS */ +/********************************/ + +#define CORE_LL2_MAX_RAMROD_PER_CON 8 +#define CORE_LL2_TX_BD_PAGE_SIZE_BYTES 4096 +#define CORE_LL2_RX_BD_PAGE_SIZE_BYTES 4096 +#define CORE_LL2_RX_CQE_PAGE_SIZE_BYTES 4096 +#define CORE_LL2_RX_NUM_NEXT_PAGE_BDS 1 + +#define CORE_LL2_TX_MAX_BDS_PER_PACKET 12 + +#define CORE_SPQE_PAGE_SIZE_BYTES 4096 + +#define MAX_NUM_LL2_RX_QUEUES 48 +#define MAX_NUM_LL2_TX_STATS_COUNTERS 48 + +#define FW_MAJOR_VERSION 8 +#define FW_MINOR_VERSION 33 +#define FW_REVISION_VERSION 11 +#define FW_ENGINEERING_VERSION 0 + +/***********************/ +/* COMMON HW CONSTANTS */ +/***********************/ + +/* PCI functions */ +#define MAX_NUM_PORTS_K2 (4) +#define MAX_NUM_PORTS_BB (2) +#define MAX_NUM_PORTS (MAX_NUM_PORTS_K2) + +#define MAX_NUM_PFS_K2 (16) +#define MAX_NUM_PFS_BB (8) +#define MAX_NUM_PFS (MAX_NUM_PFS_K2) +#define MAX_NUM_OF_PFS_IN_CHIP (16) /* On both engines */ + +#define MAX_NUM_VFS_K2 (192) +#define MAX_NUM_VFS_BB (120) +#define MAX_NUM_VFS (MAX_NUM_VFS_K2) + +#define MAX_NUM_FUNCTIONS_BB (MAX_NUM_PFS_BB + MAX_NUM_VFS_BB) +#define MAX_NUM_FUNCTIONS (MAX_NUM_PFS + MAX_NUM_VFS) + +#define MAX_FUNCTION_NUMBER_BB (MAX_NUM_PFS + MAX_NUM_VFS_BB) +#define MAX_FUNCTION_NUMBER (MAX_NUM_PFS + MAX_NUM_VFS) + +#define MAX_NUM_VPORTS_K2 (208) +#define MAX_NUM_VPORTS_BB (160) +#define MAX_NUM_VPORTS (MAX_NUM_VPORTS_K2) + +#define MAX_NUM_L2_QUEUES_K2 (320) +#define MAX_NUM_L2_QUEUES_BB (256) +#define MAX_NUM_L2_QUEUES (MAX_NUM_L2_QUEUES_K2) + +/* Traffic classes in network-facing blocks (PBF, BTB, NIG, BRB, PRS and QM) */ +#define NUM_PHYS_TCS_4PORT_K2 (4) +#define NUM_OF_PHYS_TCS (8) +#define PURE_LB_TC NUM_OF_PHYS_TCS +#define NUM_TCS_4PORT_K2 (NUM_PHYS_TCS_4PORT_K2 + 1) +#define NUM_OF_TCS (NUM_OF_PHYS_TCS + 1) + +/* CIDs */ +#define 
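Hedged sketch (editor's addition, not part of the patch): round-tripping a DMA address through the regpair helpers defined at the top of common_hsi.h above. It assumes struct regpair (the lo/hi __le32 pair used throughout the qed HSI, defined elsewhere in these headers) and a hypothetical qed_store_dma_addr() wrapper.

#include <linux/types.h>
#include <linux/bug.h>
#include <linux/qed/common_hsi.h>

/* Split a dma_addr_t into little-endian halves, then read it back. */
static void qed_store_dma_addr(struct regpair *rp, dma_addr_t addr)
{
	DMA_REGPAIR_LE(*rp, addr);			/* store hi/lo */
	WARN_ON(HILO_DMA_REGPAIR(*rp) != addr);		/* verify round trip */
}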
NUM_OF_CONNECTION_TYPES_E4 (8) +#define NUM_OF_LCIDS (320) +#define NUM_OF_LTIDS (320) + +/* Global PXP windows (GTT) */ +#define NUM_OF_GTT 19 +#define GTT_DWORD_SIZE_BITS 10 +#define GTT_BYTE_SIZE_BITS (GTT_DWORD_SIZE_BITS + 2) +#define GTT_DWORD_SIZE BIT(GTT_DWORD_SIZE_BITS) + +/* Tools Version */ +#define TOOLS_VERSION 10 + +/*****************/ +/* CDU CONSTANTS */ +/*****************/ + +#define CDU_SEG_TYPE_OFFSET_REG_TYPE_SHIFT (17) +#define CDU_SEG_TYPE_OFFSET_REG_OFFSET_MASK (0x1ffff) + +#define CDU_VF_FL_SEG_TYPE_OFFSET_REG_TYPE_SHIFT (12) +#define CDU_VF_FL_SEG_TYPE_OFFSET_REG_OFFSET_MASK (0xfff) + +#define CDU_CONTEXT_VALIDATION_CFG_ENABLE_SHIFT (0) +#define CDU_CONTEXT_VALIDATION_CFG_VALIDATION_TYPE_SHIFT (1) +#define CDU_CONTEXT_VALIDATION_CFG_USE_TYPE (2) +#define CDU_CONTEXT_VALIDATION_CFG_USE_REGION (3) +#define CDU_CONTEXT_VALIDATION_CFG_USE_CID (4) +#define CDU_CONTEXT_VALIDATION_CFG_USE_ACTIVE (5) + +/*****************/ +/* DQ CONSTANTS */ +/*****************/ + +/* DEMS */ +#define DQ_DEMS_LEGACY 0 +#define DQ_DEMS_TOE_MORE_TO_SEND 3 +#define DQ_DEMS_TOE_LOCAL_ADV_WND 4 +#define DQ_DEMS_ROCE_CQ_CONS 7 + +/* XCM agg val selection (HW) */ +#define DQ_XCM_AGG_VAL_SEL_WORD2 0 +#define DQ_XCM_AGG_VAL_SEL_WORD3 1 +#define DQ_XCM_AGG_VAL_SEL_WORD4 2 +#define DQ_XCM_AGG_VAL_SEL_WORD5 3 +#define DQ_XCM_AGG_VAL_SEL_REG3 4 +#define DQ_XCM_AGG_VAL_SEL_REG4 5 +#define DQ_XCM_AGG_VAL_SEL_REG5 6 +#define DQ_XCM_AGG_VAL_SEL_REG6 7 + +/* XCM agg val selection (FW) */ +#define DQ_XCM_CORE_TX_BD_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 +#define DQ_XCM_CORE_TX_BD_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_CORE_SPQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_ETH_EDPM_NUM_BDS_CMD DQ_XCM_AGG_VAL_SEL_WORD2 +#define DQ_XCM_ETH_TX_BD_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 +#define DQ_XCM_ETH_TX_BD_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_ETH_GO_TO_BD_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD5 +#define DQ_XCM_FCOE_SQ_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 +#define DQ_XCM_FCOE_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_FCOE_X_FERQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD5 +#define DQ_XCM_ISCSI_SQ_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 +#define DQ_XCM_ISCSI_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_ISCSI_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3 +#define DQ_XCM_ISCSI_EXP_STAT_SN_CMD DQ_XCM_AGG_VAL_SEL_REG6 +#define DQ_XCM_ROCE_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_TOE_TX_BD_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_TOE_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3 +#define DQ_XCM_TOE_LOCAL_ADV_WND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG4 + +/* UCM agg val selection (HW) */ +#define DQ_UCM_AGG_VAL_SEL_WORD0 0 +#define DQ_UCM_AGG_VAL_SEL_WORD1 1 +#define DQ_UCM_AGG_VAL_SEL_WORD2 2 +#define DQ_UCM_AGG_VAL_SEL_WORD3 3 +#define DQ_UCM_AGG_VAL_SEL_REG0 4 +#define DQ_UCM_AGG_VAL_SEL_REG1 5 +#define DQ_UCM_AGG_VAL_SEL_REG2 6 +#define DQ_UCM_AGG_VAL_SEL_REG3 7 + +/* UCM agg val selection (FW) */ +#define DQ_UCM_ETH_PMD_TX_CONS_CMD DQ_UCM_AGG_VAL_SEL_WORD2 +#define DQ_UCM_ETH_PMD_RX_CONS_CMD DQ_UCM_AGG_VAL_SEL_WORD3 +#define DQ_UCM_ROCE_CQ_CONS_CMD DQ_UCM_AGG_VAL_SEL_REG0 +#define DQ_UCM_ROCE_CQ_PROD_CMD DQ_UCM_AGG_VAL_SEL_REG2 + +/* TCM agg val selection (HW) */ +#define DQ_TCM_AGG_VAL_SEL_WORD0 0 +#define DQ_TCM_AGG_VAL_SEL_WORD1 1 +#define DQ_TCM_AGG_VAL_SEL_WORD2 2 +#define DQ_TCM_AGG_VAL_SEL_WORD3 3 +#define DQ_TCM_AGG_VAL_SEL_REG1 4 +#define DQ_TCM_AGG_VAL_SEL_REG2 5 +#define DQ_TCM_AGG_VAL_SEL_REG6 6 +#define DQ_TCM_AGG_VAL_SEL_REG9 7 + +/* TCM agg val selection (FW) */ +#define 
DQ_TCM_L2B_BD_PROD_CMD \ + DQ_TCM_AGG_VAL_SEL_WORD1 +#define DQ_TCM_ROCE_RQ_PROD_CMD \ + DQ_TCM_AGG_VAL_SEL_WORD0 + +/* XCM agg counter flag selection (HW) */ +#define DQ_XCM_AGG_FLG_SHIFT_BIT14 0 +#define DQ_XCM_AGG_FLG_SHIFT_BIT15 1 +#define DQ_XCM_AGG_FLG_SHIFT_CF12 2 +#define DQ_XCM_AGG_FLG_SHIFT_CF13 3 +#define DQ_XCM_AGG_FLG_SHIFT_CF18 4 +#define DQ_XCM_AGG_FLG_SHIFT_CF19 5 +#define DQ_XCM_AGG_FLG_SHIFT_CF22 6 +#define DQ_XCM_AGG_FLG_SHIFT_CF23 7 + +/* XCM agg counter flag selection (FW) */ +#define DQ_XCM_CORE_DQ_CF_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF18) +#define DQ_XCM_CORE_TERMINATE_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_CORE_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ETH_DQ_CF_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF18) +#define DQ_XCM_ETH_TERMINATE_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_ETH_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ETH_TPH_EN_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF23) +#define DQ_XCM_FCOE_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ISCSI_DQ_FLUSH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_ISCSI_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ISCSI_PROC_ONLY_CLEANUP_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF23) +#define DQ_XCM_TOE_DQ_FLUSH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_TOE_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) + +/* UCM agg counter flag selection (HW) */ +#define DQ_UCM_AGG_FLG_SHIFT_CF0 0 +#define DQ_UCM_AGG_FLG_SHIFT_CF1 1 +#define DQ_UCM_AGG_FLG_SHIFT_CF3 2 +#define DQ_UCM_AGG_FLG_SHIFT_CF4 3 +#define DQ_UCM_AGG_FLG_SHIFT_CF5 4 +#define DQ_UCM_AGG_FLG_SHIFT_CF6 5 +#define DQ_UCM_AGG_FLG_SHIFT_RULE0EN 6 +#define DQ_UCM_AGG_FLG_SHIFT_RULE1EN 7 + +/* UCM agg counter flag selection (FW) */ +#define DQ_UCM_ETH_PMD_TX_ARM_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_ETH_PMD_RX_ARM_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) +#define DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_ROCE_CQ_ARM_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) +#define DQ_UCM_TOE_TIMER_STOP_ALL_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF3) +#define DQ_UCM_TOE_SLOW_PATH_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_TOE_DQ_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) + +/* TCM agg counter flag selection (HW) */ +#define DQ_TCM_AGG_FLG_SHIFT_CF0 0 +#define DQ_TCM_AGG_FLG_SHIFT_CF1 1 +#define DQ_TCM_AGG_FLG_SHIFT_CF2 2 +#define DQ_TCM_AGG_FLG_SHIFT_CF3 3 +#define DQ_TCM_AGG_FLG_SHIFT_CF4 4 +#define DQ_TCM_AGG_FLG_SHIFT_CF5 5 +#define DQ_TCM_AGG_FLG_SHIFT_CF6 6 +#define DQ_TCM_AGG_FLG_SHIFT_CF7 7 +/* TCM agg counter flag selection (FW) */ +#define DQ_TCM_FCOE_FLUSH_Q0_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) +#define DQ_TCM_FCOE_DUMMY_TIMER_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF2) +#define DQ_TCM_FCOE_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) +#define DQ_TCM_ISCSI_FLUSH_Q0_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) +#define DQ_TCM_ISCSI_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) +#define DQ_TCM_TOE_FLUSH_Q0_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) +#define DQ_TCM_TOE_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) +#define DQ_TCM_IWARP_POST_RQ_CF_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) + +/* PWM address mapping */ +#define DQ_PWM_OFFSET_DPM_BASE 0x0 +#define DQ_PWM_OFFSET_DPM_END 0x27 +#define DQ_PWM_OFFSET_XCM16_BASE 0x40 +#define DQ_PWM_OFFSET_XCM32_BASE 0x44 +#define DQ_PWM_OFFSET_UCM16_BASE 0x48 +#define DQ_PWM_OFFSET_UCM32_BASE 0x4C +#define DQ_PWM_OFFSET_UCM16_4 0x50 +#define DQ_PWM_OFFSET_TCM16_BASE 0x58 +#define DQ_PWM_OFFSET_TCM32_BASE 0x5C +#define DQ_PWM_OFFSET_XCM_FLAGS 0x68 +#define DQ_PWM_OFFSET_UCM_FLAGS 
0x69 +#define DQ_PWM_OFFSET_TCM_FLAGS 0x6B + +#define DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD (DQ_PWM_OFFSET_XCM16_BASE + 2) +#define DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT (DQ_PWM_OFFSET_UCM32_BASE) +#define DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_16BIT (DQ_PWM_OFFSET_UCM16_4) +#define DQ_PWM_OFFSET_UCM_RDMA_INT_TIMEOUT (DQ_PWM_OFFSET_UCM16_BASE + 2) +#define DQ_PWM_OFFSET_UCM_RDMA_ARM_FLAGS (DQ_PWM_OFFSET_UCM_FLAGS) +#define DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD (DQ_PWM_OFFSET_TCM16_BASE + 1) +#define DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD (DQ_PWM_OFFSET_TCM16_BASE + 3) + +#define DQ_REGION_SHIFT (12) + +/* DPM */ +#define DQ_DPM_WQE_BUFF_SIZE (320) + +/* Conn type ranges */ +#define DQ_CONN_TYPE_RANGE_SHIFT (4) + +/*****************/ +/* QM CONSTANTS */ +/*****************/ + +/* Number of TX queues in the QM */ +#define MAX_QM_TX_QUEUES_K2 512 +#define MAX_QM_TX_QUEUES_BB 448 +#define MAX_QM_TX_QUEUES MAX_QM_TX_QUEUES_K2 + +/* Number of Other queues in the QM */ +#define MAX_QM_OTHER_QUEUES_BB 64 +#define MAX_QM_OTHER_QUEUES_K2 128 +#define MAX_QM_OTHER_QUEUES MAX_QM_OTHER_QUEUES_K2 + +/* Number of queues in a PF queue group */ +#define QM_PF_QUEUE_GROUP_SIZE 8 + +/* The size of a single queue element in bytes */ +#define QM_PQ_ELEMENT_SIZE 4 + +/* Base number of Tx PQs in the CM PQ representation. + * Should be used when storing PQ IDs in CM PQ registers and context. + */ +#define CM_TX_PQ_BASE 0x200 + +/* Number of global Vport/QCN rate limiters */ +#define MAX_QM_GLOBAL_RLS 256 + +/* QM registers data */ +#define QM_LINE_CRD_REG_WIDTH 16 +#define QM_LINE_CRD_REG_SIGN_BIT BIT((QM_LINE_CRD_REG_WIDTH - 1)) +#define QM_BYTE_CRD_REG_WIDTH 24 +#define QM_BYTE_CRD_REG_SIGN_BIT BIT((QM_BYTE_CRD_REG_WIDTH - 1)) +#define QM_WFQ_CRD_REG_WIDTH 32 +#define QM_WFQ_CRD_REG_SIGN_BIT BIT((QM_WFQ_CRD_REG_WIDTH - 1)) +#define QM_RL_CRD_REG_WIDTH 32 +#define QM_RL_CRD_REG_SIGN_BIT BIT((QM_RL_CRD_REG_WIDTH - 1)) + +/*****************/ +/* CAU CONSTANTS */ +/*****************/ + +#define CAU_FSM_ETH_RX 0 +#define CAU_FSM_ETH_TX 1 + +/* Number of Protocol Indices per Status Block */ +#define PIS_PER_SB_E4 12 + +#define CAU_HC_STOPPED_STATE 3 +#define CAU_HC_DISABLE_STATE 4 +#define CAU_HC_ENABLE_STATE 0 + +/*****************/ +/* IGU CONSTANTS */ +/*****************/ + +#define MAX_SB_PER_PATH_K2 (368) +#define MAX_SB_PER_PATH_BB (288) +#define MAX_TOT_SB_PER_PATH \ + MAX_SB_PER_PATH_K2 + +#define MAX_SB_PER_PF_MIMD 129 +#define MAX_SB_PER_PF_SIMD 64 +#define MAX_SB_PER_VF 64 + +/* Memory addresses on the BAR for the IGU Sub Block */ +#define IGU_MEM_BASE 0x0000 + +#define IGU_MEM_MSIX_BASE 0x0000 +#define IGU_MEM_MSIX_UPPER 0x0101 +#define IGU_MEM_MSIX_RESERVED_UPPER 0x01ff + +#define IGU_MEM_PBA_MSIX_BASE 0x0200 +#define IGU_MEM_PBA_MSIX_UPPER 0x0202 +#define IGU_MEM_PBA_MSIX_RESERVED_UPPER 0x03ff + +#define IGU_CMD_INT_ACK_BASE 0x0400 +#define IGU_CMD_INT_ACK_UPPER (IGU_CMD_INT_ACK_BASE + \ + MAX_TOT_SB_PER_PATH - 1) +#define IGU_CMD_INT_ACK_RESERVED_UPPER 0x05ff + +#define IGU_CMD_ATTN_BIT_UPD_UPPER 0x05f0 +#define IGU_CMD_ATTN_BIT_SET_UPPER 0x05f1 +#define IGU_CMD_ATTN_BIT_CLR_UPPER 0x05f2 + +#define IGU_REG_SISR_MDPC_WMASK_UPPER 0x05f3 +#define IGU_REG_SISR_MDPC_WMASK_LSB_UPPER 0x05f4 +#define IGU_REG_SISR_MDPC_WMASK_MSB_UPPER 0x05f5 +#define IGU_REG_SISR_MDPC_WOMASK_UPPER 0x05f6 + +#define IGU_CMD_PROD_UPD_BASE 0x0600 +#define IGU_CMD_PROD_UPD_UPPER (IGU_CMD_PROD_UPD_BASE +\ + MAX_TOT_SB_PER_PATH - 1) +#define IGU_CMD_PROD_UPD_RESERVED_UPPER 0x07ff + +/*****************/ +/* PXP CONSTANTS */ +/*****************/ + +/* Bars for 
Blocks */ +#define PXP_BAR_GRC 0 +#define PXP_BAR_TSDM 0 +#define PXP_BAR_USDM 0 +#define PXP_BAR_XSDM 0 +#define PXP_BAR_MSDM 0 +#define PXP_BAR_YSDM 0 +#define PXP_BAR_PSDM 0 +#define PXP_BAR_IGU 0 +#define PXP_BAR_DQ 1 + +/* PTT and GTT */ +#define PXP_PER_PF_ENTRY_SIZE 8 +#define PXP_NUM_GLOBAL_WINDOWS 243 +#define PXP_GLOBAL_ENTRY_SIZE 4 +#define PXP_ADMIN_WINDOW_ALLOWED_LENGTH 4 +#define PXP_PF_WINDOW_ADMIN_START 0 +#define PXP_PF_WINDOW_ADMIN_LENGTH 0x1000 +#define PXP_PF_WINDOW_ADMIN_END (PXP_PF_WINDOW_ADMIN_START + \ + PXP_PF_WINDOW_ADMIN_LENGTH - 1) +#define PXP_PF_WINDOW_ADMIN_PER_PF_START 0 +#define PXP_PF_WINDOW_ADMIN_PER_PF_LENGTH (PXP_NUM_PF_WINDOWS * \ + PXP_PER_PF_ENTRY_SIZE) +#define PXP_PF_WINDOW_ADMIN_PER_PF_END (PXP_PF_WINDOW_ADMIN_PER_PF_START + \ + PXP_PF_WINDOW_ADMIN_PER_PF_LENGTH - 1) +#define PXP_PF_WINDOW_ADMIN_GLOBAL_START 0x200 +#define PXP_PF_WINDOW_ADMIN_GLOBAL_LENGTH (PXP_NUM_GLOBAL_WINDOWS * \ + PXP_GLOBAL_ENTRY_SIZE) +#define PXP_PF_WINDOW_ADMIN_GLOBAL_END \ + (PXP_PF_WINDOW_ADMIN_GLOBAL_START + \ + PXP_PF_WINDOW_ADMIN_GLOBAL_LENGTH - 1) +#define PXP_PF_GLOBAL_PRETEND_ADDR 0x1f0 +#define PXP_PF_ME_OPAQUE_MASK_ADDR 0xf4 +#define PXP_PF_ME_OPAQUE_ADDR 0x1f8 +#define PXP_PF_ME_CONCRETE_ADDR 0x1fc + +#define PXP_NUM_PF_WINDOWS 12 +#define PXP_EXTERNAL_BAR_PF_WINDOW_START 0x1000 +#define PXP_EXTERNAL_BAR_PF_WINDOW_NUM PXP_NUM_PF_WINDOWS +#define PXP_EXTERNAL_BAR_PF_WINDOW_SINGLE_SIZE 0x1000 +#define PXP_EXTERNAL_BAR_PF_WINDOW_LENGTH \ + (PXP_EXTERNAL_BAR_PF_WINDOW_NUM * \ + PXP_EXTERNAL_BAR_PF_WINDOW_SINGLE_SIZE) +#define PXP_EXTERNAL_BAR_PF_WINDOW_END \ + (PXP_EXTERNAL_BAR_PF_WINDOW_START + \ + PXP_EXTERNAL_BAR_PF_WINDOW_LENGTH - 1) + +#define PXP_EXTERNAL_BAR_GLOBAL_WINDOW_START \ + (PXP_EXTERNAL_BAR_PF_WINDOW_END + 1) +#define PXP_EXTERNAL_BAR_GLOBAL_WINDOW_NUM PXP_NUM_GLOBAL_WINDOWS +#define PXP_EXTERNAL_BAR_GLOBAL_WINDOW_SINGLE_SIZE 0x1000 +#define PXP_EXTERNAL_BAR_GLOBAL_WINDOW_LENGTH \ + (PXP_EXTERNAL_BAR_GLOBAL_WINDOW_NUM * \ + PXP_EXTERNAL_BAR_GLOBAL_WINDOW_SINGLE_SIZE) +#define PXP_EXTERNAL_BAR_GLOBAL_WINDOW_END \ + (PXP_EXTERNAL_BAR_GLOBAL_WINDOW_START + \ + PXP_EXTERNAL_BAR_GLOBAL_WINDOW_LENGTH - 1) + +/* PF BAR */ +#define PXP_BAR0_START_GRC 0x0000 +#define PXP_BAR0_GRC_LENGTH 0x1C00000 +#define PXP_BAR0_END_GRC (PXP_BAR0_START_GRC + \ + PXP_BAR0_GRC_LENGTH - 1) + +#define PXP_BAR0_START_IGU 0x1C00000 +#define PXP_BAR0_IGU_LENGTH 0x10000 +#define PXP_BAR0_END_IGU (PXP_BAR0_START_IGU + \ + PXP_BAR0_IGU_LENGTH - 1) + +#define PXP_BAR0_START_TSDM 0x1C80000 +#define PXP_BAR0_SDM_LENGTH 0x40000 +#define PXP_BAR0_SDM_RESERVED_LENGTH 0x40000 +#define PXP_BAR0_END_TSDM (PXP_BAR0_START_TSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_MSDM 0x1D00000 +#define PXP_BAR0_END_MSDM (PXP_BAR0_START_MSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_USDM 0x1D80000 +#define PXP_BAR0_END_USDM (PXP_BAR0_START_USDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_XSDM 0x1E00000 +#define PXP_BAR0_END_XSDM (PXP_BAR0_START_XSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_YSDM 0x1E80000 +#define PXP_BAR0_END_YSDM (PXP_BAR0_START_YSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_PSDM 0x1F00000 +#define PXP_BAR0_END_PSDM (PXP_BAR0_START_PSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_FIRST_INVALID_ADDRESS (PXP_BAR0_END_PSDM + 1) + +/* VF BAR */ +#define PXP_VF_BAR0 0 + +#define PXP_VF_BAR0_START_IGU 0 +#define PXP_VF_BAR0_IGU_LENGTH 0x3000 +#define PXP_VF_BAR0_END_IGU (PXP_VF_BAR0_START_IGU + \ + 
PXP_VF_BAR0_IGU_LENGTH - 1) + +#define PXP_VF_BAR0_START_DQ 0x3000 +#define PXP_VF_BAR0_DQ_LENGTH 0x200 +#define PXP_VF_BAR0_DQ_OPAQUE_OFFSET 0 +#define PXP_VF_BAR0_ME_OPAQUE_ADDRESS (PXP_VF_BAR0_START_DQ + \ + PXP_VF_BAR0_DQ_OPAQUE_OFFSET) +#define PXP_VF_BAR0_ME_CONCRETE_ADDRESS (PXP_VF_BAR0_ME_OPAQUE_ADDRESS \ + + 4) +#define PXP_VF_BAR0_END_DQ (PXP_VF_BAR0_START_DQ + \ + PXP_VF_BAR0_DQ_LENGTH - 1) + +#define PXP_VF_BAR0_START_TSDM_ZONE_B 0x3200 +#define PXP_VF_BAR0_SDM_LENGTH_ZONE_B 0x200 +#define PXP_VF_BAR0_END_TSDM_ZONE_B (PXP_VF_BAR0_START_TSDM_ZONE_B + \ + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1) + +#define PXP_VF_BAR0_START_MSDM_ZONE_B 0x3400 +#define PXP_VF_BAR0_END_MSDM_ZONE_B (PXP_VF_BAR0_START_MSDM_ZONE_B + \ + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1) + +#define PXP_VF_BAR0_START_USDM_ZONE_B 0x3600 +#define PXP_VF_BAR0_END_USDM_ZONE_B (PXP_VF_BAR0_START_USDM_ZONE_B + \ + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1) + +#define PXP_VF_BAR0_START_XSDM_ZONE_B 0x3800 +#define PXP_VF_BAR0_END_XSDM_ZONE_B (PXP_VF_BAR0_START_XSDM_ZONE_B + \ + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1) + +#define PXP_VF_BAR0_START_YSDM_ZONE_B 0x3a00 +#define PXP_VF_BAR0_END_YSDM_ZONE_B (PXP_VF_BAR0_START_YSDM_ZONE_B + \ + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1) + +#define PXP_VF_BAR0_START_PSDM_ZONE_B 0x3c00 +#define PXP_VF_BAR0_END_PSDM_ZONE_B (PXP_VF_BAR0_START_PSDM_ZONE_B + \ + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1) + +#define PXP_VF_BAR0_START_GRC 0x3E00 +#define PXP_VF_BAR0_GRC_LENGTH 0x200 +#define PXP_VF_BAR0_END_GRC (PXP_VF_BAR0_START_GRC + \ + PXP_VF_BAR0_GRC_LENGTH - 1) + +#define PXP_VF_BAR0_START_SDM_ZONE_A 0x4000 +#define PXP_VF_BAR0_END_SDM_ZONE_A 0x10000 + +#define PXP_VF_BAR0_START_IGU2 0x10000 +#define PXP_VF_BAR0_IGU2_LENGTH 0xD000 +#define PXP_VF_BAR0_END_IGU2 (PXP_VF_BAR0_START_IGU2 + \ + PXP_VF_BAR0_IGU2_LENGTH - 1) + +#define PXP_VF_BAR0_GRC_WINDOW_LENGTH 32 + +#define PXP_ILT_PAGE_SIZE_NUM_BITS_MIN 12 +#define PXP_ILT_BLOCK_FACTOR_MULTIPLIER 1024 + +/* ILT Records */ +#define PXP_NUM_ILT_RECORDS_BB 7600 +#define PXP_NUM_ILT_RECORDS_K2 11000 +#define MAX_NUM_ILT_RECORDS MAX(PXP_NUM_ILT_RECORDS_BB, PXP_NUM_ILT_RECORDS_K2) + +/* Host Interface */ +#define PXP_QUEUES_ZONE_MAX_NUM 320 + +/*****************/ +/* PRM CONSTANTS */ +/*****************/ +#define PRM_DMA_PAD_BYTES_NUM 2 + +/*****************/ +/* SDMs CONSTANTS */ +/*****************/ + +#define SDM_OP_GEN_TRIG_NONE 0 +#define SDM_OP_GEN_TRIG_WAKE_THREAD 1 +#define SDM_OP_GEN_TRIG_AGG_INT 2 +#define SDM_OP_GEN_TRIG_LOADER 4 +#define SDM_OP_GEN_TRIG_INDICATE_ERROR 6 +#define SDM_OP_GEN_TRIG_INC_ORDER_CNT 9 + +/********************/ +/* Completion types */ +/********************/ + +#define SDM_COMP_TYPE_NONE 0 +#define SDM_COMP_TYPE_WAKE_THREAD 1 +#define SDM_COMP_TYPE_AGG_INT 2 +#define SDM_COMP_TYPE_CM 3 +#define SDM_COMP_TYPE_LOADER 4 +#define SDM_COMP_TYPE_PXP 5 +#define SDM_COMP_TYPE_INDICATE_ERROR 6 +#define SDM_COMP_TYPE_RELEASE_THREAD 7 +#define SDM_COMP_TYPE_RAM 8 +#define SDM_COMP_TYPE_INC_ORDER_CNT 9 + +/*****************/ +/* PBF CONSTANTS */ +/*****************/ + +/* Number of PBF command queue lines. Each line is 32B. */ +#define PBF_MAX_CMD_LINES 3328 + +/* Number of BTB blocks. Each block is 256B. 
*/ +#define BTB_MAX_BLOCKS 1440 + +/*****************/ +/* PRS CONSTANTS */ +/*****************/ + +#define PRS_GFT_CAM_LINES_NO_MATCH 31 + +/* Interrupt coalescing TimeSet */ +struct coalescing_timeset { + u8 value; +#define COALESCING_TIMESET_TIMESET_MASK 0x7F +#define COALESCING_TIMESET_TIMESET_SHIFT 0 +#define COALESCING_TIMESET_VALID_MASK 0x1 +#define COALESCING_TIMESET_VALID_SHIFT 7 +}; + +struct common_queue_zone { + __le16 ring_drv_data_consumer; + __le16 reserved; +}; + +/* ETH Rx producers data */ +struct eth_rx_prod_data { + __le16 bd_prod; + __le16 cqe_prod; +}; + +struct tcp_ulp_connect_done_params { + __le16 mss; + u8 snd_wnd_scale; + u8 flags; +#define TCP_ULP_CONNECT_DONE_PARAMS_TS_EN_MASK 0x1 +#define TCP_ULP_CONNECT_DONE_PARAMS_TS_EN_SHIFT 0 +#define TCP_ULP_CONNECT_DONE_PARAMS_RESERVED_MASK 0x7F +#define TCP_ULP_CONNECT_DONE_PARAMS_RESERVED_SHIFT 1 +}; + +struct iscsi_connect_done_results { + __le16 icid; + __le16 conn_id; + struct tcp_ulp_connect_done_params params; +}; + +struct iscsi_eqe_data { + __le16 icid; + __le16 conn_id; + __le16 reserved; + u8 error_code; + u8 error_pdu_opcode_reserved; +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_MASK 0x3F +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_SHIFT 0 +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_VALID_MASK 0x1 +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_VALID_SHIFT 6 +#define ISCSI_EQE_DATA_RESERVED0_MASK 0x1 +#define ISCSI_EQE_DATA_RESERVED0_SHIFT 7 +}; + +/* Multi function mode */ +enum mf_mode { + ERROR_MODE /* Unsupported mode */, + MF_OVLAN, + MF_NPAR, + MAX_MF_MODE +}; + +/* Per-protocol connection types */ +enum protocol_type { + PROTOCOLID_ISCSI, + PROTOCOLID_FCOE, + PROTOCOLID_ROCE, + PROTOCOLID_CORE, + PROTOCOLID_ETH, + PROTOCOLID_IWARP, + PROTOCOLID_RESERVED0, + PROTOCOLID_PREROCE, + PROTOCOLID_COMMON, + PROTOCOLID_RESERVED1, + MAX_PROTOCOL_TYPE +}; + +struct regpair { + __le32 lo; + __le32 hi; +}; + +/* RoCE Destroy Event Data */ +struct rdma_eqe_destroy_qp { + __le32 cid; + u8 reserved[4]; +}; + +/* RDMA Event Data Union */ +union rdma_eqe_data { + struct regpair async_handle; + struct rdma_eqe_destroy_qp rdma_destroy_qp_data; +}; + +/* Ustorm Queue Zone */ +struct ustorm_eth_queue_zone { + struct coalescing_timeset int_coalescing_timeset; + u8 reserved[3]; +}; + +struct ustorm_queue_zone { + struct ustorm_eth_queue_zone eth; + struct common_queue_zone common; +}; + +/* Status block structure */ +struct cau_pi_entry { + __le32 prod; +#define CAU_PI_ENTRY_PROD_VAL_MASK 0xFFFF +#define CAU_PI_ENTRY_PROD_VAL_SHIFT 0 +#define CAU_PI_ENTRY_PI_TIMESET_MASK 0x7F +#define CAU_PI_ENTRY_PI_TIMESET_SHIFT 16 +#define CAU_PI_ENTRY_FSM_SEL_MASK 0x1 +#define CAU_PI_ENTRY_FSM_SEL_SHIFT 23 +#define CAU_PI_ENTRY_RESERVED_MASK 0xFF +#define CAU_PI_ENTRY_RESERVED_SHIFT 24 +}; + +/* Status block structure */ +struct cau_sb_entry { + __le32 data; +#define CAU_SB_ENTRY_SB_PROD_MASK 0xFFFFFF +#define CAU_SB_ENTRY_SB_PROD_SHIFT 0 +#define CAU_SB_ENTRY_STATE0_MASK 0xF +#define CAU_SB_ENTRY_STATE0_SHIFT 24 +#define CAU_SB_ENTRY_STATE1_MASK 0xF +#define CAU_SB_ENTRY_STATE1_SHIFT 28 + __le32 params; +#define CAU_SB_ENTRY_SB_TIMESET0_MASK 0x7F +#define CAU_SB_ENTRY_SB_TIMESET0_SHIFT 0 +#define CAU_SB_ENTRY_SB_TIMESET1_MASK 0x7F +#define CAU_SB_ENTRY_SB_TIMESET1_SHIFT 7 +#define CAU_SB_ENTRY_TIMER_RES0_MASK 0x3 +#define CAU_SB_ENTRY_TIMER_RES0_SHIFT 14 +#define CAU_SB_ENTRY_TIMER_RES1_MASK 0x3 +#define CAU_SB_ENTRY_TIMER_RES1_SHIFT 16 +#define CAU_SB_ENTRY_VF_NUMBER_MASK 0xFF +#define CAU_SB_ENTRY_VF_NUMBER_SHIFT 18 +#define CAU_SB_ENTRY_VF_VALID_MASK 
0x1 +#define CAU_SB_ENTRY_VF_VALID_SHIFT 26 +#define CAU_SB_ENTRY_PF_NUMBER_MASK 0xF +#define CAU_SB_ENTRY_PF_NUMBER_SHIFT 27 +#define CAU_SB_ENTRY_TPH_MASK 0x1 +#define CAU_SB_ENTRY_TPH_SHIFT 31 +}; + +/* Igu cleanup bit values to distinguish between clean or producer consumer + * update. + */ +enum command_type_bit { + IGU_COMMAND_TYPE_NOP = 0, + IGU_COMMAND_TYPE_SET = 1, + MAX_COMMAND_TYPE_BIT +}; + +/* Core doorbell data */ +struct core_db_data { + u8 params; +#define CORE_DB_DATA_DEST_MASK 0x3 +#define CORE_DB_DATA_DEST_SHIFT 0 +#define CORE_DB_DATA_AGG_CMD_MASK 0x3 +#define CORE_DB_DATA_AGG_CMD_SHIFT 2 +#define CORE_DB_DATA_BYPASS_EN_MASK 0x1 +#define CORE_DB_DATA_BYPASS_EN_SHIFT 4 +#define CORE_DB_DATA_RESERVED_MASK 0x1 +#define CORE_DB_DATA_RESERVED_SHIFT 5 +#define CORE_DB_DATA_AGG_VAL_SEL_MASK 0x3 +#define CORE_DB_DATA_AGG_VAL_SEL_SHIFT 6 + u8 agg_flags; + __le16 spq_prod; +}; + +/* Enum of doorbell aggregative command selection */ +enum db_agg_cmd_sel { + DB_AGG_CMD_NOP, + DB_AGG_CMD_SET, + DB_AGG_CMD_ADD, + DB_AGG_CMD_MAX, + MAX_DB_AGG_CMD_SEL +}; + +/* Enum of doorbell destination */ +enum db_dest { + DB_DEST_XCM, + DB_DEST_UCM, + DB_DEST_TCM, + DB_NUM_DESTINATIONS, + MAX_DB_DEST +}; + +/* Enum of doorbell DPM types */ +enum db_dpm_type { + DPM_LEGACY, + DPM_RDMA, + DPM_L2_INLINE, + DPM_L2_BD, + MAX_DB_DPM_TYPE +}; + +/* Structure for doorbell data, in L2 DPM mode, for 1st db in a DPM burst */ +struct db_l2_dpm_data { + __le16 icid; + __le16 bd_prod; + __le32 params; +#define DB_L2_DPM_DATA_SIZE_MASK 0x3F +#define DB_L2_DPM_DATA_SIZE_SHIFT 0 +#define DB_L2_DPM_DATA_DPM_TYPE_MASK 0x3 +#define DB_L2_DPM_DATA_DPM_TYPE_SHIFT 6 +#define DB_L2_DPM_DATA_NUM_BDS_MASK 0xFF +#define DB_L2_DPM_DATA_NUM_BDS_SHIFT 8 +#define DB_L2_DPM_DATA_PKT_SIZE_MASK 0x7FF +#define DB_L2_DPM_DATA_PKT_SIZE_SHIFT 16 +#define DB_L2_DPM_DATA_RESERVED0_MASK 0x1 +#define DB_L2_DPM_DATA_RESERVED0_SHIFT 27 +#define DB_L2_DPM_DATA_SGE_NUM_MASK 0x7 +#define DB_L2_DPM_DATA_SGE_NUM_SHIFT 28 +#define DB_L2_DPM_DATA_GFS_SRC_EN_MASK 0x1 +#define DB_L2_DPM_DATA_GFS_SRC_EN_SHIFT 31 +}; + +/* Structure for SGE in a DPM doorbell of type DPM_L2_BD */ +struct db_l2_dpm_sge { + struct regpair addr; + __le16 nbytes; + __le16 bitfields; +#define DB_L2_DPM_SGE_TPH_ST_INDEX_MASK 0x1FF +#define DB_L2_DPM_SGE_TPH_ST_INDEX_SHIFT 0 +#define DB_L2_DPM_SGE_RESERVED0_MASK 0x3 +#define DB_L2_DPM_SGE_RESERVED0_SHIFT 9 +#define DB_L2_DPM_SGE_ST_VALID_MASK 0x1 +#define DB_L2_DPM_SGE_ST_VALID_SHIFT 11 +#define DB_L2_DPM_SGE_RESERVED1_MASK 0xF +#define DB_L2_DPM_SGE_RESERVED1_SHIFT 12 + __le32 reserved2; +}; + +/* Structure for doorbell address, in legacy mode */ +struct db_legacy_addr { + __le32 addr; +#define DB_LEGACY_ADDR_RESERVED0_MASK 0x3 +#define DB_LEGACY_ADDR_RESERVED0_SHIFT 0 +#define DB_LEGACY_ADDR_DEMS_MASK 0x7 +#define DB_LEGACY_ADDR_DEMS_SHIFT 2 +#define DB_LEGACY_ADDR_ICID_MASK 0x7FFFFFF +#define DB_LEGACY_ADDR_ICID_SHIFT 5 +}; + +/* Structure for doorbell address, in PWM mode */ +struct db_pwm_addr { + __le32 addr; +#define DB_PWM_ADDR_RESERVED0_MASK 0x7 +#define DB_PWM_ADDR_RESERVED0_SHIFT 0 +#define DB_PWM_ADDR_OFFSET_MASK 0x7F +#define DB_PWM_ADDR_OFFSET_SHIFT 3 +#define DB_PWM_ADDR_WID_MASK 0x3 +#define DB_PWM_ADDR_WID_SHIFT 10 +#define DB_PWM_ADDR_DPI_MASK 0xFFFF +#define DB_PWM_ADDR_DPI_SHIFT 12 +#define DB_PWM_ADDR_RESERVED1_MASK 0xF +#define DB_PWM_ADDR_RESERVED1_SHIFT 28 +}; + +/* Parameters to RDMA firmware, passed in EDPM doorbell */ +struct db_rdma_dpm_params { + __le32 params; +#define 
DB_RDMA_DPM_PARAMS_SIZE_MASK 0x3F +#define DB_RDMA_DPM_PARAMS_SIZE_SHIFT 0 +#define DB_RDMA_DPM_PARAMS_DPM_TYPE_MASK 0x3 +#define DB_RDMA_DPM_PARAMS_DPM_TYPE_SHIFT 6 +#define DB_RDMA_DPM_PARAMS_OPCODE_MASK 0xFF +#define DB_RDMA_DPM_PARAMS_OPCODE_SHIFT 8 +#define DB_RDMA_DPM_PARAMS_WQE_SIZE_MASK 0x7FF +#define DB_RDMA_DPM_PARAMS_WQE_SIZE_SHIFT 16 +#define DB_RDMA_DPM_PARAMS_RESERVED0_MASK 0x1 +#define DB_RDMA_DPM_PARAMS_RESERVED0_SHIFT 27 +#define DB_RDMA_DPM_PARAMS_COMPLETION_FLG_MASK 0x1 +#define DB_RDMA_DPM_PARAMS_COMPLETION_FLG_SHIFT 28 +#define DB_RDMA_DPM_PARAMS_S_FLG_MASK 0x1 +#define DB_RDMA_DPM_PARAMS_S_FLG_SHIFT 29 +#define DB_RDMA_DPM_PARAMS_RESERVED1_MASK 0x1 +#define DB_RDMA_DPM_PARAMS_RESERVED1_SHIFT 30 +#define DB_RDMA_DPM_PARAMS_CONN_TYPE_IS_IWARP_MASK 0x1 +#define DB_RDMA_DPM_PARAMS_CONN_TYPE_IS_IWARP_SHIFT 31 +}; + +/* Structure for doorbell data, in RDMA DPM mode, for the first doorbell in a + * DPM burst. + */ +struct db_rdma_dpm_data { + __le16 icid; + __le16 prod_val; + struct db_rdma_dpm_params params; +}; + +/* Igu interrupt command */ +enum igu_int_cmd { + IGU_INT_ENABLE = 0, + IGU_INT_DISABLE = 1, + IGU_INT_NOP = 2, + IGU_INT_NOP2 = 3, + MAX_IGU_INT_CMD +}; + +/* IGU producer or consumer update command */ +struct igu_prod_cons_update { + __le32 sb_id_and_flags; +#define IGU_PROD_CONS_UPDATE_SB_INDEX_MASK 0xFFFFFF +#define IGU_PROD_CONS_UPDATE_SB_INDEX_SHIFT 0 +#define IGU_PROD_CONS_UPDATE_UPDATE_FLAG_MASK 0x1 +#define IGU_PROD_CONS_UPDATE_UPDATE_FLAG_SHIFT 24 +#define IGU_PROD_CONS_UPDATE_ENABLE_INT_MASK 0x3 +#define IGU_PROD_CONS_UPDATE_ENABLE_INT_SHIFT 25 +#define IGU_PROD_CONS_UPDATE_SEGMENT_ACCESS_MASK 0x1 +#define IGU_PROD_CONS_UPDATE_SEGMENT_ACCESS_SHIFT 27 +#define IGU_PROD_CONS_UPDATE_TIMER_MASK_MASK 0x1 +#define IGU_PROD_CONS_UPDATE_TIMER_MASK_SHIFT 28 +#define IGU_PROD_CONS_UPDATE_RESERVED0_MASK 0x3 +#define IGU_PROD_CONS_UPDATE_RESERVED0_SHIFT 29 +#define IGU_PROD_CONS_UPDATE_COMMAND_TYPE_MASK 0x1 +#define IGU_PROD_CONS_UPDATE_COMMAND_TYPE_SHIFT 31 + __le32 reserved1; +}; + +/* Igu segments access for default status block only */ +enum igu_seg_access { + IGU_SEG_ACCESS_REG = 0, + IGU_SEG_ACCESS_ATTN = 1, + MAX_IGU_SEG_ACCESS +}; + +/* Enumeration for L3 type field of parsing_and_err_flags. + * L3Type: 0 - unknown (not ip), 1 - Ipv4, 2 - Ipv6 + * (This field can be filled according to the last-ethertype) + */ +enum l3_type { + e_l3_type_unknown, + e_l3_type_ipv4, + e_l3_type_ipv6, + MAX_L3_TYPE +}; + +/* Enumeration for l4Protocol field of parsing_and_err_flags. + * L4-protocol: 0 - none, 1 - TCP, 2 - UDP. + * If the packet is IPv4 fragment, and its not the first fragment, the + * protocol-type should be set to none. 
+ */ +enum l4_protocol { + e_l4_protocol_none, + e_l4_protocol_tcp, + e_l4_protocol_udp, + MAX_L4_PROTOCOL +}; + +/* Parsing and error flags field */ +struct parsing_and_err_flags { + __le16 flags; +#define PARSING_AND_ERR_FLAGS_L3TYPE_MASK 0x3 +#define PARSING_AND_ERR_FLAGS_L3TYPE_SHIFT 0 +#define PARSING_AND_ERR_FLAGS_L4PROTOCOL_MASK 0x3 +#define PARSING_AND_ERR_FLAGS_L4PROTOCOL_SHIFT 2 +#define PARSING_AND_ERR_FLAGS_IPV4FRAG_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_IPV4FRAG_SHIFT 4 +#define PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_TAG8021QEXIST_SHIFT 5 +#define PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED_SHIFT 6 +#define PARSING_AND_ERR_FLAGS_TIMESYNCPKT_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_TIMESYNCPKT_SHIFT 7 +#define PARSING_AND_ERR_FLAGS_TIMESTAMPRECORDED_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_TIMESTAMPRECORDED_SHIFT 8 +#define PARSING_AND_ERR_FLAGS_IPHDRERROR_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_IPHDRERROR_SHIFT 9 +#define PARSING_AND_ERR_FLAGS_L4CHKSMERROR_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_L4CHKSMERROR_SHIFT 10 +#define PARSING_AND_ERR_FLAGS_TUNNELEXIST_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_TUNNELEXIST_SHIFT 11 +#define PARSING_AND_ERR_FLAGS_TUNNEL8021QTAGEXIST_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_TUNNEL8021QTAGEXIST_SHIFT 12 +#define PARSING_AND_ERR_FLAGS_TUNNELIPHDRERROR_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_TUNNELIPHDRERROR_SHIFT 13 +#define PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMWASCALCULATED_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMWASCALCULATED_SHIFT 14 +#define PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMERROR_MASK 0x1 +#define PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMERROR_SHIFT 15 +}; + +/* Parsing error flags bitmap */ +struct parsing_err_flags { + __le16 flags; +#define PARSING_ERR_FLAGS_MAC_ERROR_MASK 0x1 +#define PARSING_ERR_FLAGS_MAC_ERROR_SHIFT 0 +#define PARSING_ERR_FLAGS_TRUNC_ERROR_MASK 0x1 +#define PARSING_ERR_FLAGS_TRUNC_ERROR_SHIFT 1 +#define PARSING_ERR_FLAGS_PKT_TOO_SMALL_MASK 0x1 +#define PARSING_ERR_FLAGS_PKT_TOO_SMALL_SHIFT 2 +#define PARSING_ERR_FLAGS_ANY_HDR_MISSING_TAG_MASK 0x1 +#define PARSING_ERR_FLAGS_ANY_HDR_MISSING_TAG_SHIFT 3 +#define PARSING_ERR_FLAGS_ANY_HDR_IP_VER_MISMTCH_MASK 0x1 +#define PARSING_ERR_FLAGS_ANY_HDR_IP_VER_MISMTCH_SHIFT 4 +#define PARSING_ERR_FLAGS_ANY_HDR_IP_V4_HDR_LEN_TOO_SMALL_MASK 0x1 +#define PARSING_ERR_FLAGS_ANY_HDR_IP_V4_HDR_LEN_TOO_SMALL_SHIFT 5 +#define PARSING_ERR_FLAGS_ANY_HDR_IP_BAD_TOTAL_LEN_MASK 0x1 +#define PARSING_ERR_FLAGS_ANY_HDR_IP_BAD_TOTAL_LEN_SHIFT 6 +#define PARSING_ERR_FLAGS_IP_V4_CHKSM_ERROR_MASK 0x1 +#define PARSING_ERR_FLAGS_IP_V4_CHKSM_ERROR_SHIFT 7 +#define PARSING_ERR_FLAGS_ANY_HDR_L4_IP_LEN_MISMTCH_MASK 0x1 +#define PARSING_ERR_FLAGS_ANY_HDR_L4_IP_LEN_MISMTCH_SHIFT 8 +#define PARSING_ERR_FLAGS_ZERO_UDP_IP_V6_CHKSM_MASK 0x1 +#define PARSING_ERR_FLAGS_ZERO_UDP_IP_V6_CHKSM_SHIFT 9 +#define PARSING_ERR_FLAGS_INNER_L4_CHKSM_ERROR_MASK 0x1 +#define PARSING_ERR_FLAGS_INNER_L4_CHKSM_ERROR_SHIFT 10 +#define PARSING_ERR_FLAGS_ANY_HDR_ZERO_TTL_OR_HOP_LIM_MASK 0x1 +#define PARSING_ERR_FLAGS_ANY_HDR_ZERO_TTL_OR_HOP_LIM_SHIFT 11 +#define PARSING_ERR_FLAGS_NON_8021Q_TAG_EXISTS_IN_BOTH_HDRS_MASK 0x1 +#define PARSING_ERR_FLAGS_NON_8021Q_TAG_EXISTS_IN_BOTH_HDRS_SHIFT 12 +#define PARSING_ERR_FLAGS_GENEVE_OPTION_OVERSIZED_MASK 0x1 +#define PARSING_ERR_FLAGS_GENEVE_OPTION_OVERSIZED_SHIFT 13 +#define PARSING_ERR_FLAGS_TUNNEL_IP_V4_CHKSM_ERROR_MASK 0x1 +#define PARSING_ERR_FLAGS_TUNNEL_IP_V4_CHKSM_ERROR_SHIFT 14 
+#define PARSING_ERR_FLAGS_TUNNEL_L4_CHKSM_ERROR_MASK 0x1 +#define PARSING_ERR_FLAGS_TUNNEL_L4_CHKSM_ERROR_SHIFT 15 +}; + +/* Pb context */ +struct pb_context { + __le32 crc[4]; +}; + +/* Concrete Function ID */ +struct pxp_concrete_fid { + __le16 fid; +#define PXP_CONCRETE_FID_PFID_MASK 0xF +#define PXP_CONCRETE_FID_PFID_SHIFT 0 +#define PXP_CONCRETE_FID_PORT_MASK 0x3 +#define PXP_CONCRETE_FID_PORT_SHIFT 4 +#define PXP_CONCRETE_FID_PATH_MASK 0x1 +#define PXP_CONCRETE_FID_PATH_SHIFT 6 +#define PXP_CONCRETE_FID_VFVALID_MASK 0x1 +#define PXP_CONCRETE_FID_VFVALID_SHIFT 7 +#define PXP_CONCRETE_FID_VFID_MASK 0xFF +#define PXP_CONCRETE_FID_VFID_SHIFT 8 +}; + +/* Concrete Function ID */ +struct pxp_pretend_concrete_fid { + __le16 fid; +#define PXP_PRETEND_CONCRETE_FID_PFID_MASK 0xF +#define PXP_PRETEND_CONCRETE_FID_PFID_SHIFT 0 +#define PXP_PRETEND_CONCRETE_FID_RESERVED_MASK 0x7 +#define PXP_PRETEND_CONCRETE_FID_RESERVED_SHIFT 4 +#define PXP_PRETEND_CONCRETE_FID_VFVALID_MASK 0x1 +#define PXP_PRETEND_CONCRETE_FID_VFVALID_SHIFT 7 +#define PXP_PRETEND_CONCRETE_FID_VFID_MASK 0xFF +#define PXP_PRETEND_CONCRETE_FID_VFID_SHIFT 8 +}; + +/* Function ID */ +union pxp_pretend_fid { + struct pxp_pretend_concrete_fid concrete_fid; + __le16 opaque_fid; +}; + +/* Pxp Pretend Command Register */ +struct pxp_pretend_cmd { + union pxp_pretend_fid fid; + __le16 control; +#define PXP_PRETEND_CMD_PATH_MASK 0x1 +#define PXP_PRETEND_CMD_PATH_SHIFT 0 +#define PXP_PRETEND_CMD_USE_PORT_MASK 0x1 +#define PXP_PRETEND_CMD_USE_PORT_SHIFT 1 +#define PXP_PRETEND_CMD_PORT_MASK 0x3 +#define PXP_PRETEND_CMD_PORT_SHIFT 2 +#define PXP_PRETEND_CMD_RESERVED0_MASK 0xF +#define PXP_PRETEND_CMD_RESERVED0_SHIFT 4 +#define PXP_PRETEND_CMD_RESERVED1_MASK 0xF +#define PXP_PRETEND_CMD_RESERVED1_SHIFT 8 +#define PXP_PRETEND_CMD_PRETEND_PATH_MASK 0x1 +#define PXP_PRETEND_CMD_PRETEND_PATH_SHIFT 12 +#define PXP_PRETEND_CMD_PRETEND_PORT_MASK 0x1 +#define PXP_PRETEND_CMD_PRETEND_PORT_SHIFT 13 +#define PXP_PRETEND_CMD_PRETEND_FUNCTION_MASK 0x1 +#define PXP_PRETEND_CMD_PRETEND_FUNCTION_SHIFT 14 +#define PXP_PRETEND_CMD_IS_CONCRETE_MASK 0x1 +#define PXP_PRETEND_CMD_IS_CONCRETE_SHIFT 15 +}; + +/* PTT Record in PXP Admin Window */ +struct pxp_ptt_entry { + __le32 offset; +#define PXP_PTT_ENTRY_OFFSET_MASK 0x7FFFFF +#define PXP_PTT_ENTRY_OFFSET_SHIFT 0 +#define PXP_PTT_ENTRY_RESERVED0_MASK 0x1FF +#define PXP_PTT_ENTRY_RESERVED0_SHIFT 23 + struct pxp_pretend_cmd pretend; +}; + +/* VF Zone A Permission Register */ +struct pxp_vf_zone_a_permission { + __le32 control; +#define PXP_VF_ZONE_A_PERMISSION_VFID_MASK 0xFF +#define PXP_VF_ZONE_A_PERMISSION_VFID_SHIFT 0 +#define PXP_VF_ZONE_A_PERMISSION_VALID_MASK 0x1 +#define PXP_VF_ZONE_A_PERMISSION_VALID_SHIFT 8 +#define PXP_VF_ZONE_A_PERMISSION_RESERVED0_MASK 0x7F +#define PXP_VF_ZONE_A_PERMISSION_RESERVED0_SHIFT 9 +#define PXP_VF_ZONE_A_PERMISSION_RESERVED1_MASK 0xFFFF +#define PXP_VF_ZONE_A_PERMISSION_RESERVED1_SHIFT 16 +}; + +/* Rdif context */ +struct rdif_task_context { + __le32 initial_ref_tag; + __le16 app_tag_value; + __le16 app_tag_mask; + u8 flags0; +#define RDIF_TASK_CONTEXT_IGNORE_APP_TAG_MASK 0x1 +#define RDIF_TASK_CONTEXT_IGNORE_APP_TAG_SHIFT 0 +#define RDIF_TASK_CONTEXT_INITIAL_REF_TAG_VALID_MASK 0x1 +#define RDIF_TASK_CONTEXT_INITIAL_REF_TAG_VALID_SHIFT 1 +#define RDIF_TASK_CONTEXT_HOST_GUARD_TYPE_MASK 0x1 +#define RDIF_TASK_CONTEXT_HOST_GUARD_TYPE_SHIFT 2 +#define RDIF_TASK_CONTEXT_SET_ERROR_WITH_EOP_MASK 0x1 +#define RDIF_TASK_CONTEXT_SET_ERROR_WITH_EOP_SHIFT 3 +#define 
RDIF_TASK_CONTEXT_PROTECTION_TYPE_MASK 0x3 +#define RDIF_TASK_CONTEXT_PROTECTION_TYPE_SHIFT 4 +#define RDIF_TASK_CONTEXT_CRC_SEED_MASK 0x1 +#define RDIF_TASK_CONTEXT_CRC_SEED_SHIFT 6 +#define RDIF_TASK_CONTEXT_KEEP_REF_TAG_CONST_MASK 0x1 +#define RDIF_TASK_CONTEXT_KEEP_REF_TAG_CONST_SHIFT 7 + u8 partial_dif_data[7]; + __le16 partial_crc_value; + __le16 partial_checksum_value; + __le32 offset_in_io; + __le16 flags1; +#define RDIF_TASK_CONTEXT_VALIDATE_GUARD_MASK 0x1 +#define RDIF_TASK_CONTEXT_VALIDATE_GUARD_SHIFT 0 +#define RDIF_TASK_CONTEXT_VALIDATE_APP_TAG_MASK 0x1 +#define RDIF_TASK_CONTEXT_VALIDATE_APP_TAG_SHIFT 1 +#define RDIF_TASK_CONTEXT_VALIDATE_REF_TAG_MASK 0x1 +#define RDIF_TASK_CONTEXT_VALIDATE_REF_TAG_SHIFT 2 +#define RDIF_TASK_CONTEXT_FORWARD_GUARD_MASK 0x1 +#define RDIF_TASK_CONTEXT_FORWARD_GUARD_SHIFT 3 +#define RDIF_TASK_CONTEXT_FORWARD_APP_TAG_MASK 0x1 +#define RDIF_TASK_CONTEXT_FORWARD_APP_TAG_SHIFT 4 +#define RDIF_TASK_CONTEXT_FORWARD_REF_TAG_MASK 0x1 +#define RDIF_TASK_CONTEXT_FORWARD_REF_TAG_SHIFT 5 +#define RDIF_TASK_CONTEXT_INTERVAL_SIZE_MASK 0x7 +#define RDIF_TASK_CONTEXT_INTERVAL_SIZE_SHIFT 6 +#define RDIF_TASK_CONTEXT_HOST_INTERFACE_MASK 0x3 +#define RDIF_TASK_CONTEXT_HOST_INTERFACE_SHIFT 9 +#define RDIF_TASK_CONTEXT_DIF_BEFORE_DATA_MASK 0x1 +#define RDIF_TASK_CONTEXT_DIF_BEFORE_DATA_SHIFT 11 +#define RDIF_TASK_CONTEXT_RESERVED0_MASK 0x1 +#define RDIF_TASK_CONTEXT_RESERVED0_SHIFT 12 +#define RDIF_TASK_CONTEXT_NETWORK_INTERFACE_MASK 0x1 +#define RDIF_TASK_CONTEXT_NETWORK_INTERFACE_SHIFT 13 +#define RDIF_TASK_CONTEXT_FORWARD_APP_TAG_WITH_MASK_MASK 0x1 +#define RDIF_TASK_CONTEXT_FORWARD_APP_TAG_WITH_MASK_SHIFT 14 +#define RDIF_TASK_CONTEXT_FORWARD_REF_TAG_WITH_MASK_MASK 0x1 +#define RDIF_TASK_CONTEXT_FORWARD_REF_TAG_WITH_MASK_SHIFT 15 + __le16 state; +#define RDIF_TASK_CONTEXT_RECEIVED_DIF_BYTES_LEFT_MASK 0xF +#define RDIF_TASK_CONTEXT_RECEIVED_DIF_BYTES_LEFT_SHIFT 0 +#define RDIF_TASK_CONTEXT_TRANSMITED_DIF_BYTES_LEFT_MASK 0xF +#define RDIF_TASK_CONTEXT_TRANSMITED_DIF_BYTES_LEFT_SHIFT 4 +#define RDIF_TASK_CONTEXT_ERROR_IN_IO_MASK 0x1 +#define RDIF_TASK_CONTEXT_ERROR_IN_IO_SHIFT 8 +#define RDIF_TASK_CONTEXT_CHECKSUM_OVERFLOW_MASK 0x1 +#define RDIF_TASK_CONTEXT_CHECKSUM_OVERFLOW_SHIFT 9 +#define RDIF_TASK_CONTEXT_REF_TAG_MASK_MASK 0xF +#define RDIF_TASK_CONTEXT_REF_TAG_MASK_SHIFT 10 +#define RDIF_TASK_CONTEXT_RESERVED1_MASK 0x3 +#define RDIF_TASK_CONTEXT_RESERVED1_SHIFT 14 + __le32 reserved2; +}; + +/* Status block structure */ +struct status_block_e4 { + __le16 pi_array[PIS_PER_SB_E4]; + __le32 sb_num; +#define STATUS_BLOCK_E4_SB_NUM_MASK 0x1FF +#define STATUS_BLOCK_E4_SB_NUM_SHIFT 0 +#define STATUS_BLOCK_E4_ZERO_PAD_MASK 0x7F +#define STATUS_BLOCK_E4_ZERO_PAD_SHIFT 9 +#define STATUS_BLOCK_E4_ZERO_PAD2_MASK 0xFFFF +#define STATUS_BLOCK_E4_ZERO_PAD2_SHIFT 16 + __le32 prod_index; +#define STATUS_BLOCK_E4_PROD_INDEX_MASK 0xFFFFFF +#define STATUS_BLOCK_E4_PROD_INDEX_SHIFT 0 +#define STATUS_BLOCK_E4_ZERO_PAD3_MASK 0xFF +#define STATUS_BLOCK_E4_ZERO_PAD3_SHIFT 24 +}; + +/* Tdif context */ +struct tdif_task_context { + __le32 initial_ref_tag; + __le16 app_tag_value; + __le16 app_tag_mask; + __le16 partial_crc_value_b; + __le16 partial_checksum_value_b; + __le16 stateB; +#define TDIF_TASK_CONTEXT_RECEIVED_DIF_BYTES_LEFT_B_MASK 0xF +#define TDIF_TASK_CONTEXT_RECEIVED_DIF_BYTES_LEFT_B_SHIFT 0 +#define TDIF_TASK_CONTEXT_TRANSMITED_DIF_BYTES_LEFT_B_MASK 0xF +#define TDIF_TASK_CONTEXT_TRANSMITED_DIF_BYTES_LEFT_B_SHIFT 4 +#define TDIF_TASK_CONTEXT_ERROR_IN_IO_B_MASK 0x1 +#define 
TDIF_TASK_CONTEXT_ERROR_IN_IO_B_SHIFT 8 +#define TDIF_TASK_CONTEXT_CHECKSUM_VERFLOW_MASK 0x1 +#define TDIF_TASK_CONTEXT_CHECKSUM_VERFLOW_SHIFT 9 +#define TDIF_TASK_CONTEXT_RESERVED0_MASK 0x3F +#define TDIF_TASK_CONTEXT_RESERVED0_SHIFT 10 + u8 reserved1; + u8 flags0; +#define TDIF_TASK_CONTEXT_IGNORE_APP_TAG_MASK 0x1 +#define TDIF_TASK_CONTEXT_IGNORE_APP_TAG_SHIFT 0 +#define TDIF_TASK_CONTEXT_INITIAL_REF_TAG_VALID_MASK 0x1 +#define TDIF_TASK_CONTEXT_INITIAL_REF_TAG_VALID_SHIFT 1 +#define TDIF_TASK_CONTEXT_HOST_GUARD_TYPE_MASK 0x1 +#define TDIF_TASK_CONTEXT_HOST_GUARD_TYPE_SHIFT 2 +#define TDIF_TASK_CONTEXT_SET_ERROR_WITH_EOP_MASK 0x1 +#define TDIF_TASK_CONTEXT_SET_ERROR_WITH_EOP_SHIFT 3 +#define TDIF_TASK_CONTEXT_PROTECTION_TYPE_MASK 0x3 +#define TDIF_TASK_CONTEXT_PROTECTION_TYPE_SHIFT 4 +#define TDIF_TASK_CONTEXT_CRC_SEED_MASK 0x1 +#define TDIF_TASK_CONTEXT_CRC_SEED_SHIFT 6 +#define TDIF_TASK_CONTEXT_RESERVED2_MASK 0x1 +#define TDIF_TASK_CONTEXT_RESERVED2_SHIFT 7 + __le32 flags1; +#define TDIF_TASK_CONTEXT_VALIDATE_GUARD_MASK 0x1 +#define TDIF_TASK_CONTEXT_VALIDATE_GUARD_SHIFT 0 +#define TDIF_TASK_CONTEXT_VALIDATE_APP_TAG_MASK 0x1 +#define TDIF_TASK_CONTEXT_VALIDATE_APP_TAG_SHIFT 1 +#define TDIF_TASK_CONTEXT_VALIDATE_REF_TAG_MASK 0x1 +#define TDIF_TASK_CONTEXT_VALIDATE_REF_TAG_SHIFT 2 +#define TDIF_TASK_CONTEXT_FORWARD_GUARD_MASK 0x1 +#define TDIF_TASK_CONTEXT_FORWARD_GUARD_SHIFT 3 +#define TDIF_TASK_CONTEXT_FORWARD_APP_TAG_MASK 0x1 +#define TDIF_TASK_CONTEXT_FORWARD_APP_TAG_SHIFT 4 +#define TDIF_TASK_CONTEXT_FORWARD_REF_TAG_MASK 0x1 +#define TDIF_TASK_CONTEXT_FORWARD_REF_TAG_SHIFT 5 +#define TDIF_TASK_CONTEXT_INTERVAL_SIZE_MASK 0x7 +#define TDIF_TASK_CONTEXT_INTERVAL_SIZE_SHIFT 6 +#define TDIF_TASK_CONTEXT_HOST_INTERFACE_MASK 0x3 +#define TDIF_TASK_CONTEXT_HOST_INTERFACE_SHIFT 9 +#define TDIF_TASK_CONTEXT_DIF_BEFORE_DATA_MASK 0x1 +#define TDIF_TASK_CONTEXT_DIF_BEFORE_DATA_SHIFT 11 +#define TDIF_TASK_CONTEXT_RESERVED3_MASK 0x1 +#define TDIF_TASK_CONTEXT_RESERVED3_SHIFT 12 +#define TDIF_TASK_CONTEXT_NETWORK_INTERFACE_MASK 0x1 +#define TDIF_TASK_CONTEXT_NETWORK_INTERFACE_SHIFT 13 +#define TDIF_TASK_CONTEXT_RECEIVED_DIF_BYTES_LEFT_A_MASK 0xF +#define TDIF_TASK_CONTEXT_RECEIVED_DIF_BYTES_LEFT_A_SHIFT 14 +#define TDIF_TASK_CONTEXT_TRANSMITED_DIF_BYTES_LEFT_A_MASK 0xF +#define TDIF_TASK_CONTEXT_TRANSMITED_DIF_BYTES_LEFT_A_SHIFT 18 +#define TDIF_TASK_CONTEXT_ERROR_IN_IO_A_MASK 0x1 +#define TDIF_TASK_CONTEXT_ERROR_IN_IO_A_SHIFT 22 +#define TDIF_TASK_CONTEXT_CHECKSUM_OVERFLOW_A_MASK 0x1 +#define TDIF_TASK_CONTEXT_CHECKSUM_OVERFLOW_A_SHIFT 23 +#define TDIF_TASK_CONTEXT_REF_TAG_MASK_MASK 0xF +#define TDIF_TASK_CONTEXT_REF_TAG_MASK_SHIFT 24 +#define TDIF_TASK_CONTEXT_FORWARD_APP_TAG_WITH_MASK_MASK 0x1 +#define TDIF_TASK_CONTEXT_FORWARD_APP_TAG_WITH_MASK_SHIFT 28 +#define TDIF_TASK_CONTEXT_FORWARD_REF_TAG_WITH_MASK_MASK 0x1 +#define TDIF_TASK_CONTEXT_FORWARD_REF_TAG_WITH_MASK_SHIFT 29 +#define TDIF_TASK_CONTEXT_KEEP_REF_TAG_CONST_MASK 0x1 +#define TDIF_TASK_CONTEXT_KEEP_REF_TAG_CONST_SHIFT 30 +#define TDIF_TASK_CONTEXT_RESERVED4_MASK 0x1 +#define TDIF_TASK_CONTEXT_RESERVED4_SHIFT 31 + __le32 offset_in_io_b; + __le16 partial_crc_value_a; + __le16 partial_checksum_value_a; + __le32 offset_in_io_a; + u8 partial_dif_data_a[8]; + u8 partial_dif_data_b[8]; +}; + +/* Timers context */ +struct timers_context { + __le32 logical_client_0; +#define TIMERS_CONTEXT_EXPIRATIONTIMELC0_MASK 0x7FFFFFF +#define TIMERS_CONTEXT_EXPIRATIONTIMELC0_SHIFT 0 +#define TIMERS_CONTEXT_RESERVED0_MASK 0x1 +#define 
TIMERS_CONTEXT_RESERVED0_SHIFT 27 +#define TIMERS_CONTEXT_VALIDLC0_MASK 0x1 +#define TIMERS_CONTEXT_VALIDLC0_SHIFT 28 +#define TIMERS_CONTEXT_ACTIVELC0_MASK 0x1 +#define TIMERS_CONTEXT_ACTIVELC0_SHIFT 29 +#define TIMERS_CONTEXT_RESERVED1_MASK 0x3 +#define TIMERS_CONTEXT_RESERVED1_SHIFT 30 + __le32 logical_client_1; +#define TIMERS_CONTEXT_EXPIRATIONTIMELC1_MASK 0x7FFFFFF +#define TIMERS_CONTEXT_EXPIRATIONTIMELC1_SHIFT 0 +#define TIMERS_CONTEXT_RESERVED2_MASK 0x1 +#define TIMERS_CONTEXT_RESERVED2_SHIFT 27 +#define TIMERS_CONTEXT_VALIDLC1_MASK 0x1 +#define TIMERS_CONTEXT_VALIDLC1_SHIFT 28 +#define TIMERS_CONTEXT_ACTIVELC1_MASK 0x1 +#define TIMERS_CONTEXT_ACTIVELC1_SHIFT 29 +#define TIMERS_CONTEXT_RESERVED3_MASK 0x3 +#define TIMERS_CONTEXT_RESERVED3_SHIFT 30 + __le32 logical_client_2; +#define TIMERS_CONTEXT_EXPIRATIONTIMELC2_MASK 0x7FFFFFF +#define TIMERS_CONTEXT_EXPIRATIONTIMELC2_SHIFT 0 +#define TIMERS_CONTEXT_RESERVED4_MASK 0x1 +#define TIMERS_CONTEXT_RESERVED4_SHIFT 27 +#define TIMERS_CONTEXT_VALIDLC2_MASK 0x1 +#define TIMERS_CONTEXT_VALIDLC2_SHIFT 28 +#define TIMERS_CONTEXT_ACTIVELC2_MASK 0x1 +#define TIMERS_CONTEXT_ACTIVELC2_SHIFT 29 +#define TIMERS_CONTEXT_RESERVED5_MASK 0x3 +#define TIMERS_CONTEXT_RESERVED5_SHIFT 30 + __le32 host_expiration_fields; +#define TIMERS_CONTEXT_HOSTEXPRIRATIONVALUE_MASK 0x7FFFFFF +#define TIMERS_CONTEXT_HOSTEXPRIRATIONVALUE_SHIFT 0 +#define TIMERS_CONTEXT_RESERVED6_MASK 0x1 +#define TIMERS_CONTEXT_RESERVED6_SHIFT 27 +#define TIMERS_CONTEXT_HOSTEXPRIRATIONVALID_MASK 0x1 +#define TIMERS_CONTEXT_HOSTEXPRIRATIONVALID_SHIFT 28 +#define TIMERS_CONTEXT_RESERVED7_MASK 0x7 +#define TIMERS_CONTEXT_RESERVED7_SHIFT 29 +}; + +/* Enum for next_protocol field of tunnel_parsing_flags / tunnelTypeDesc */ +enum tunnel_next_protocol { + e_unknown = 0, + e_l2 = 1, + e_ipv4 = 2, + e_ipv6 = 3, + MAX_TUNNEL_NEXT_PROTOCOL +}; + +#endif /* __COMMON_HSI__ */ +#endif diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h new file mode 100644 index 0000000..d9416ad --- /dev/null +++ b/include/linux/qed/eth_common.h @@ -0,0 +1,481 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ETH_COMMON__ +#define __ETH_COMMON__ + +/********************/ +/* ETH FW CONSTANTS */ +/********************/ + +#define ETH_HSI_VER_MAJOR 3 +#define ETH_HSI_VER_MINOR 10 + +#define ETH_HSI_VER_NO_PKT_LEN_TUNN 5 + +#define ETH_CACHE_LINE_SIZE 64 +#define ETH_RX_CQE_GAP 32 +#define ETH_MAX_RAMROD_PER_CON 8 +#define ETH_TX_BD_PAGE_SIZE_BYTES 4096 +#define ETH_RX_BD_PAGE_SIZE_BYTES 4096 +#define ETH_RX_CQE_PAGE_SIZE_BYTES 4096 +#define ETH_RX_NUM_NEXT_PAGE_BDS 2 + +#define ETH_MAX_TUNN_LSO_INNER_IPV4_OFFSET 253 +#define ETH_MAX_TUNN_LSO_INNER_IPV6_OFFSET 251 + +#define ETH_TX_MIN_BDS_PER_NON_LSO_PKT 1 +#define ETH_TX_MAX_BDS_PER_NON_LSO_PACKET 18 +#define ETH_TX_MAX_BDS_PER_LSO_PACKET 255 +#define ETH_TX_MAX_LSO_HDR_NBD 4 +#define ETH_TX_MIN_BDS_PER_LSO_PKT 3 +#define ETH_TX_MIN_BDS_PER_TUNN_IPV6_WITH_EXT_PKT 3 +#define ETH_TX_MIN_BDS_PER_IPV6_WITH_EXT_PKT 2 +#define ETH_TX_MIN_BDS_PER_PKT_W_LOOPBACK_MODE 2 +#define ETH_TX_MAX_NON_LSO_PKT_LEN (9700 - (4 + 4 + 12 + 8)) +#define ETH_TX_MAX_LSO_HDR_BYTES 510 +#define ETH_TX_LSO_WINDOW_BDS_NUM (18 - 1) +#define ETH_TX_LSO_WINDOW_MIN_LEN 9700 +#define ETH_TX_MAX_LSO_PAYLOAD_LEN 0xFE000 +#define ETH_TX_NUM_SAME_AS_LAST_ENTRIES 320 +#define ETH_TX_INACTIVE_SAME_AS_LAST 0xFFFF + +#define ETH_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS +#define ETH_NUM_STATISTIC_COUNTERS_DOUBLE_VF_ZONE \ + (ETH_NUM_STATISTIC_COUNTERS - MAX_NUM_VFS / 2) +#define ETH_NUM_STATISTIC_COUNTERS_QUAD_VF_ZONE \ + (ETH_NUM_STATISTIC_COUNTERS - 3 * MAX_NUM_VFS / 4) + +/* Maximum number of buffers, used for RX packet placement */ +#define ETH_RX_MAX_BUFF_PER_PKT 5 +#define ETH_RX_BD_THRESHOLD 12 + +/* Num of MAC/VLAN filters */ +#define ETH_NUM_MAC_FILTERS 512 +#define ETH_NUM_VLAN_FILTERS 512 + +/* Approx. multicast constants */ +#define ETH_MULTICAST_BIN_FROM_MAC_SEED 0 +#define ETH_MULTICAST_MAC_BINS 256 +#define ETH_MULTICAST_MAC_BINS_IN_REGS (ETH_MULTICAST_MAC_BINS / 32) + +/* Ethernet vport update constants */ +#define ETH_FILTER_RULES_COUNT 10 +#define ETH_RSS_IND_TABLE_ENTRIES_NUM 128 +#define ETH_RSS_KEY_SIZE_REGS 10 +#define ETH_RSS_ENGINE_NUM_K2 207 +#define ETH_RSS_ENGINE_NUM_BB 127 + +/* TPA constants */ +#define ETH_TPA_MAX_AGGS_NUM 64 +#define ETH_TPA_CQE_START_LEN_LIST_SIZE ETH_RX_MAX_BUFF_PER_PKT +#define ETH_TPA_CQE_CONT_LEN_LIST_SIZE 6 +#define ETH_TPA_CQE_END_LEN_LIST_SIZE 4 + +/* Control frame check constants */ +#define ETH_CTL_FRAME_ETH_TYPE_NUM 4 + +/* GFS constants */ +#define ETH_GFT_TRASHCAN_VPORT 0x1FF /* GFT drop flow vport number */ + +/* Destination port mode */ +enum dest_port_mode { + DEST_PORT_PHY, + DEST_PORT_LOOPBACK, + DEST_PORT_PHY_LOOPBACK, + DEST_PORT_DROP, + MAX_DEST_PORT_MODE +}; + +/* Ethernet address type */ +enum eth_addr_type { + BROADCAST_ADDRESS, + MULTICAST_ADDRESS, + UNICAST_ADDRESS, + UNKNOWN_ADDRESS, + MAX_ETH_ADDR_TYPE +}; + +struct eth_tx_1st_bd_flags { + u8 bitfields; +#define ETH_TX_1ST_BD_FLAGS_START_BD_MASK 0x1 +#define ETH_TX_1ST_BD_FLAGS_START_BD_SHIFT 0 +#define ETH_TX_1ST_BD_FLAGS_FORCE_VLAN_MODE_MASK 0x1 +#define ETH_TX_1ST_BD_FLAGS_FORCE_VLAN_MODE_SHIFT 1 +#define ETH_TX_1ST_BD_FLAGS_IP_CSUM_MASK 0x1 +#define ETH_TX_1ST_BD_FLAGS_IP_CSUM_SHIFT 2 +#define ETH_TX_1ST_BD_FLAGS_L4_CSUM_MASK 0x1 +#define ETH_TX_1ST_BD_FLAGS_L4_CSUM_SHIFT 3 +#define ETH_TX_1ST_BD_FLAGS_VLAN_INSERTION_MASK 0x1 +#define ETH_TX_1ST_BD_FLAGS_VLAN_INSERTION_SHIFT 4 +#define ETH_TX_1ST_BD_FLAGS_LSO_MASK 0x1 +#define ETH_TX_1ST_BD_FLAGS_LSO_SHIFT 5 +#define ETH_TX_1ST_BD_FLAGS_TUNN_IP_CSUM_MASK 0x1 +#define 
ETH_TX_1ST_BD_FLAGS_TUNN_IP_CSUM_SHIFT 6 +#define ETH_TX_1ST_BD_FLAGS_TUNN_L4_CSUM_MASK 0x1 +#define ETH_TX_1ST_BD_FLAGS_TUNN_L4_CSUM_SHIFT 7 +}; + +/* The parsing information data for the first tx bd of a given packet */ +struct eth_tx_data_1st_bd { + __le16 vlan; + u8 nbds; + struct eth_tx_1st_bd_flags bd_flags; + __le16 bitfields; +#define ETH_TX_DATA_1ST_BD_TUNN_FLAG_MASK 0x1 +#define ETH_TX_DATA_1ST_BD_TUNN_FLAG_SHIFT 0 +#define ETH_TX_DATA_1ST_BD_RESERVED0_MASK 0x1 +#define ETH_TX_DATA_1ST_BD_RESERVED0_SHIFT 1 +#define ETH_TX_DATA_1ST_BD_PKT_LEN_MASK 0x3FFF +#define ETH_TX_DATA_1ST_BD_PKT_LEN_SHIFT 2 +}; + +/* The parsing information data for the second tx bd of a given packet */ +struct eth_tx_data_2nd_bd { + __le16 tunn_ip_size; + __le16 bitfields1; +#define ETH_TX_DATA_2ND_BD_TUNN_INNER_L2_HDR_SIZE_W_MASK 0xF +#define ETH_TX_DATA_2ND_BD_TUNN_INNER_L2_HDR_SIZE_W_SHIFT 0 +#define ETH_TX_DATA_2ND_BD_TUNN_INNER_ETH_TYPE_MASK 0x3 +#define ETH_TX_DATA_2ND_BD_TUNN_INNER_ETH_TYPE_SHIFT 4 +#define ETH_TX_DATA_2ND_BD_DEST_PORT_MODE_MASK 0x3 +#define ETH_TX_DATA_2ND_BD_DEST_PORT_MODE_SHIFT 6 +#define ETH_TX_DATA_2ND_BD_START_BD_MASK 0x1 +#define ETH_TX_DATA_2ND_BD_START_BD_SHIFT 8 +#define ETH_TX_DATA_2ND_BD_TUNN_TYPE_MASK 0x3 +#define ETH_TX_DATA_2ND_BD_TUNN_TYPE_SHIFT 9 +#define ETH_TX_DATA_2ND_BD_TUNN_INNER_IPV6_MASK 0x1 +#define ETH_TX_DATA_2ND_BD_TUNN_INNER_IPV6_SHIFT 11 +#define ETH_TX_DATA_2ND_BD_IPV6_EXT_MASK 0x1 +#define ETH_TX_DATA_2ND_BD_IPV6_EXT_SHIFT 12 +#define ETH_TX_DATA_2ND_BD_TUNN_IPV6_EXT_MASK 0x1 +#define ETH_TX_DATA_2ND_BD_TUNN_IPV6_EXT_SHIFT 13 +#define ETH_TX_DATA_2ND_BD_L4_UDP_MASK 0x1 +#define ETH_TX_DATA_2ND_BD_L4_UDP_SHIFT 14 +#define ETH_TX_DATA_2ND_BD_L4_PSEUDO_CSUM_MODE_MASK 0x1 +#define ETH_TX_DATA_2ND_BD_L4_PSEUDO_CSUM_MODE_SHIFT 15 + __le16 bitfields2; +#define ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_MASK 0x1FFF +#define ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_SHIFT 0 +#define ETH_TX_DATA_2ND_BD_RESERVED0_MASK 0x7 +#define ETH_TX_DATA_2ND_BD_RESERVED0_SHIFT 13 +}; + +/* Firmware data for L2-EDPM packet */ +struct eth_edpm_fw_data { + struct eth_tx_data_1st_bd data_1st_bd; + struct eth_tx_data_2nd_bd data_2nd_bd; + __le32 reserved; +}; + +/* Tunneling parsing flags */ +struct eth_tunnel_parsing_flags { + u8 flags; +#define ETH_TUNNEL_PARSING_FLAGS_TYPE_MASK 0x3 +#define ETH_TUNNEL_PARSING_FLAGS_TYPE_SHIFT 0 +#define ETH_TUNNEL_PARSING_FLAGS_TENNANT_ID_EXIST_MASK 0x1 +#define ETH_TUNNEL_PARSING_FLAGS_TENNANT_ID_EXIST_SHIFT 2 +#define ETH_TUNNEL_PARSING_FLAGS_NEXT_PROTOCOL_MASK 0x3 +#define ETH_TUNNEL_PARSING_FLAGS_NEXT_PROTOCOL_SHIFT 3 +#define ETH_TUNNEL_PARSING_FLAGS_FIRSTHDRIPMATCH_MASK 0x1 +#define ETH_TUNNEL_PARSING_FLAGS_FIRSTHDRIPMATCH_SHIFT 5 +#define ETH_TUNNEL_PARSING_FLAGS_IPV4_FRAGMENT_MASK 0x1 +#define ETH_TUNNEL_PARSING_FLAGS_IPV4_FRAGMENT_SHIFT 6 +#define ETH_TUNNEL_PARSING_FLAGS_IPV4_OPTIONS_MASK 0x1 +#define ETH_TUNNEL_PARSING_FLAGS_IPV4_OPTIONS_SHIFT 7 +}; + +/* PMD flow control bits */ +struct eth_pmd_flow_flags { + u8 flags; +#define ETH_PMD_FLOW_FLAGS_VALID_MASK 0x1 +#define ETH_PMD_FLOW_FLAGS_VALID_SHIFT 0 +#define ETH_PMD_FLOW_FLAGS_TOGGLE_MASK 0x1 +#define ETH_PMD_FLOW_FLAGS_TOGGLE_SHIFT 1 +#define ETH_PMD_FLOW_FLAGS_RESERVED_MASK 0x3F +#define ETH_PMD_FLOW_FLAGS_RESERVED_SHIFT 2 +}; + +/* Regular ETH Rx FP CQE */ +struct eth_fast_path_rx_reg_cqe { + u8 type; + u8 bitfields; +#define ETH_FAST_PATH_RX_REG_CQE_RSS_HASH_TYPE_MASK 0x7 +#define ETH_FAST_PATH_RX_REG_CQE_RSS_HASH_TYPE_SHIFT 0 +#define ETH_FAST_PATH_RX_REG_CQE_TC_MASK 0xF 
+#define ETH_FAST_PATH_RX_REG_CQE_TC_SHIFT 3 +#define ETH_FAST_PATH_RX_REG_CQE_RESERVED0_MASK 0x1 +#define ETH_FAST_PATH_RX_REG_CQE_RESERVED0_SHIFT 7 + __le16 pkt_len; + struct parsing_and_err_flags pars_flags; + __le16 vlan_tag; + __le32 rss_hash; + __le16 len_on_first_bd; + u8 placement_offset; + struct eth_tunnel_parsing_flags tunnel_pars_flags; + u8 bd_num; + u8 reserved; + __le16 flow_id; + u8 reserved1[11]; + struct eth_pmd_flow_flags pmd_flags; +}; + +/* TPA-continue ETH Rx FP CQE */ +struct eth_fast_path_rx_tpa_cont_cqe { + u8 type; + u8 tpa_agg_index; + __le16 len_list[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; + u8 reserved; + u8 reserved1; + __le16 reserved2[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; + u8 reserved3[3]; + struct eth_pmd_flow_flags pmd_flags; +}; + +/* TPA-end ETH Rx FP CQE */ +struct eth_fast_path_rx_tpa_end_cqe { + u8 type; + u8 tpa_agg_index; + __le16 total_packet_len; + u8 num_of_bds; + u8 end_reason; + __le16 num_of_coalesced_segs; + __le32 ts_delta; + __le16 len_list[ETH_TPA_CQE_END_LEN_LIST_SIZE]; + __le16 reserved3[ETH_TPA_CQE_END_LEN_LIST_SIZE]; + __le16 reserved1; + u8 reserved2; + struct eth_pmd_flow_flags pmd_flags; +}; + +/* TPA-start ETH Rx FP CQE */ +struct eth_fast_path_rx_tpa_start_cqe { + u8 type; + u8 bitfields; +#define ETH_FAST_PATH_RX_TPA_START_CQE_RSS_HASH_TYPE_MASK 0x7 +#define ETH_FAST_PATH_RX_TPA_START_CQE_RSS_HASH_TYPE_SHIFT 0 +#define ETH_FAST_PATH_RX_TPA_START_CQE_TC_MASK 0xF +#define ETH_FAST_PATH_RX_TPA_START_CQE_TC_SHIFT 3 +#define ETH_FAST_PATH_RX_TPA_START_CQE_RESERVED0_MASK 0x1 +#define ETH_FAST_PATH_RX_TPA_START_CQE_RESERVED0_SHIFT 7 + __le16 seg_len; + struct parsing_and_err_flags pars_flags; + __le16 vlan_tag; + __le32 rss_hash; + __le16 len_on_first_bd; + u8 placement_offset; + struct eth_tunnel_parsing_flags tunnel_pars_flags; + u8 tpa_agg_index; + u8 header_len; + __le16 ext_bd_len_list[ETH_TPA_CQE_START_LEN_LIST_SIZE]; + __le16 flow_id; + u8 reserved; + struct eth_pmd_flow_flags pmd_flags; +}; + +/* The L4 pseudo checksum mode for Ethernet */ +enum eth_l4_pseudo_checksum_mode { + ETH_L4_PSEUDO_CSUM_CORRECT_LENGTH, + ETH_L4_PSEUDO_CSUM_ZERO_LENGTH, + MAX_ETH_L4_PSEUDO_CHECKSUM_MODE +}; + +struct eth_rx_bd { + struct regpair addr; +}; + +/* Regular ETH Rx SP CQE */ +struct eth_slow_path_rx_cqe { + u8 type; + u8 ramrod_cmd_id; + u8 error_flag; + u8 reserved[25]; + __le16 echo; + u8 reserved1; + struct eth_pmd_flow_flags pmd_flags; +}; + +/* Union for all ETH Rx CQE types */ +union eth_rx_cqe { + struct eth_fast_path_rx_reg_cqe fast_path_regular; + struct eth_fast_path_rx_tpa_start_cqe fast_path_tpa_start; + struct eth_fast_path_rx_tpa_cont_cqe fast_path_tpa_cont; + struct eth_fast_path_rx_tpa_end_cqe fast_path_tpa_end; + struct eth_slow_path_rx_cqe slow_path; +}; + +/* ETH Rx CQE type */ +enum eth_rx_cqe_type { + ETH_RX_CQE_TYPE_UNUSED, + ETH_RX_CQE_TYPE_REGULAR, + ETH_RX_CQE_TYPE_SLOW_PATH, + ETH_RX_CQE_TYPE_TPA_START, + ETH_RX_CQE_TYPE_TPA_CONT, + ETH_RX_CQE_TYPE_TPA_END, + MAX_ETH_RX_CQE_TYPE +}; + +struct eth_rx_pmd_cqe { + union eth_rx_cqe cqe; + u8 reserved[ETH_RX_CQE_GAP]; +}; + +enum eth_rx_tunn_type { + ETH_RX_NO_TUNN, + ETH_RX_TUNN_GENEVE, + ETH_RX_TUNN_GRE, + ETH_RX_TUNN_VXLAN, + MAX_ETH_RX_TUNN_TYPE +}; + +/* Aggregation end reason. 
*/ +enum eth_tpa_end_reason { + ETH_AGG_END_UNUSED, + ETH_AGG_END_SP_UPDATE, + ETH_AGG_END_MAX_LEN, + ETH_AGG_END_LAST_SEG, + ETH_AGG_END_TIMEOUT, + ETH_AGG_END_NOT_CONSISTENT, + ETH_AGG_END_OUT_OF_ORDER, + ETH_AGG_END_NON_TPA_SEG, + MAX_ETH_TPA_END_REASON +}; + +/* The first tx bd of a given packet */ +struct eth_tx_1st_bd { + struct regpair addr; + __le16 nbytes; + struct eth_tx_data_1st_bd data; +}; + +/* The second tx bd of a given packet */ +struct eth_tx_2nd_bd { + struct regpair addr; + __le16 nbytes; + struct eth_tx_data_2nd_bd data; +}; + +/* The parsing information data for the third tx bd of a given packet */ +struct eth_tx_data_3rd_bd { + __le16 lso_mss; + __le16 bitfields; +#define ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_MASK 0xF +#define ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_SHIFT 0 +#define ETH_TX_DATA_3RD_BD_HDR_NBD_MASK 0xF +#define ETH_TX_DATA_3RD_BD_HDR_NBD_SHIFT 4 +#define ETH_TX_DATA_3RD_BD_START_BD_MASK 0x1 +#define ETH_TX_DATA_3RD_BD_START_BD_SHIFT 8 +#define ETH_TX_DATA_3RD_BD_RESERVED0_MASK 0x7F +#define ETH_TX_DATA_3RD_BD_RESERVED0_SHIFT 9 + u8 tunn_l4_hdr_start_offset_w; + u8 tunn_hdr_size_w; +}; + +/* The third tx bd of a given packet */ +struct eth_tx_3rd_bd { + struct regpair addr; + __le16 nbytes; + struct eth_tx_data_3rd_bd data; +}; + +/* Complementary information for the regular tx bd of a given packet */ +struct eth_tx_data_bd { + __le16 reserved0; + __le16 bitfields; +#define ETH_TX_DATA_BD_RESERVED1_MASK 0xFF +#define ETH_TX_DATA_BD_RESERVED1_SHIFT 0 +#define ETH_TX_DATA_BD_START_BD_MASK 0x1 +#define ETH_TX_DATA_BD_START_BD_SHIFT 8 +#define ETH_TX_DATA_BD_RESERVED2_MASK 0x7F +#define ETH_TX_DATA_BD_RESERVED2_SHIFT 9 + __le16 reserved3; +}; + +/* The common non-special TX BD ring element */ +struct eth_tx_bd { + struct regpair addr; + __le16 nbytes; + struct eth_tx_data_bd data; +}; + +union eth_tx_bd_types { + struct eth_tx_1st_bd first_bd; + struct eth_tx_2nd_bd second_bd; + struct eth_tx_3rd_bd third_bd; + struct eth_tx_bd reg_bd; +}; + +/* Mstorm Queue Zone */ +enum eth_tx_tunn_type { + ETH_TX_TUNN_GENEVE, + ETH_TX_TUNN_TTAG, + ETH_TX_TUNN_GRE, + ETH_TX_TUNN_VXLAN, + MAX_ETH_TX_TUNN_TYPE +}; + +/* Ystorm Queue Zone */ +struct xstorm_eth_queue_zone { + struct coalescing_timeset int_coalescing_timeset; + u8 reserved[7]; +}; + +/* ETH doorbell data */ +struct eth_db_data { + u8 params; +#define ETH_DB_DATA_DEST_MASK 0x3 +#define ETH_DB_DATA_DEST_SHIFT 0 +#define ETH_DB_DATA_AGG_CMD_MASK 0x3 +#define ETH_DB_DATA_AGG_CMD_SHIFT 2 +#define ETH_DB_DATA_BYPASS_EN_MASK 0x1 +#define ETH_DB_DATA_BYPASS_EN_SHIFT 4 +#define ETH_DB_DATA_RESERVED_MASK 0x1 +#define ETH_DB_DATA_RESERVED_SHIFT 5 +#define ETH_DB_DATA_AGG_VAL_SEL_MASK 0x3 +#define ETH_DB_DATA_AGG_VAL_SEL_SHIFT 6 + u8 agg_flags; + __le16 bd_prod; +}; + +/* RSS hash type */ +enum rss_hash_type { + RSS_HASH_TYPE_DEFAULT = 0, + RSS_HASH_TYPE_IPV4 = 1, + RSS_HASH_TYPE_TCP_IPV4 = 2, + RSS_HASH_TYPE_IPV6 = 3, + RSS_HASH_TYPE_TCP_IPV6 = 4, + RSS_HASH_TYPE_UDP_IPV4 = 5, + RSS_HASH_TYPE_UDP_IPV6 = 6, + MAX_RSS_HASH_TYPE +}; + +#endif /* __ETH_COMMON__ */ diff --git a/include/linux/qed/fcoe_common.h b/include/linux/qed/fcoe_common.h new file mode 100644 index 0000000..22077c5 --- /dev/null +++ b/include/linux/qed/fcoe_common.h @@ -0,0 +1,744 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ + +#ifndef __FCOE_COMMON__ +#define __FCOE_COMMON__ + +/*********************/ +/* FCOE FW CONSTANTS */ +/*********************/ + +#define FC_ABTS_REPLY_MAX_PAYLOAD_LEN 12 + +/* The fcoe storm task context protection-information of Ystorm */ +struct protection_info_ctx { + __le16 flags; +#define PROTECTION_INFO_CTX_HOST_INTERFACE_MASK 0x3 +#define PROTECTION_INFO_CTX_HOST_INTERFACE_SHIFT 0 +#define PROTECTION_INFO_CTX_DIF_TO_PEER_MASK 0x1 +#define PROTECTION_INFO_CTX_DIF_TO_PEER_SHIFT 2 +#define PROTECTION_INFO_CTX_VALIDATE_DIX_APP_TAG_MASK 0x1 +#define PROTECTION_INFO_CTX_VALIDATE_DIX_APP_TAG_SHIFT 3 +#define PROTECTION_INFO_CTX_INTERVAL_SIZE_LOG_MASK 0xF +#define PROTECTION_INFO_CTX_INTERVAL_SIZE_LOG_SHIFT 4 +#define PROTECTION_INFO_CTX_VALIDATE_DIX_REF_TAG_MASK 0x1 +#define PROTECTION_INFO_CTX_VALIDATE_DIX_REF_TAG_SHIFT 8 +#define PROTECTION_INFO_CTX_RESERVED0_MASK 0x7F +#define PROTECTION_INFO_CTX_RESERVED0_SHIFT 9 + u8 dix_block_size; + u8 dst_size; +}; + +/* The fcoe storm task context protection-information of Ystorm */ +union protection_info_union_ctx { + struct protection_info_ctx info; + __le32 value; +}; + +/* FCP CMD payload */ +struct fcoe_fcp_cmd_payload { + __le32 opaque[8]; +}; + +/* FCP RSP payload */ +struct fcoe_fcp_rsp_payload { + __le32 opaque[6]; +}; + +/* FCP RSP payload */ +struct fcp_rsp_payload_padded { + struct fcoe_fcp_rsp_payload rsp_payload; + __le32 reserved[2]; +}; + +/* FCP RSP payload */ +struct fcoe_fcp_xfer_payload { + __le32 opaque[3]; +}; + +/* FCP RSP payload */ +struct fcp_xfer_payload_padded { + struct fcoe_fcp_xfer_payload xfer_payload; + __le32 reserved[5]; +}; + +/* Task params */ +struct fcoe_tx_data_params { + __le32 data_offset; + __le32 offset_in_io; + u8 flags; +#define FCOE_TX_DATA_PARAMS_OFFSET_IN_IO_VALID_MASK 0x1 +#define FCOE_TX_DATA_PARAMS_OFFSET_IN_IO_VALID_SHIFT 0 +#define FCOE_TX_DATA_PARAMS_DROP_DATA_MASK 0x1 +#define FCOE_TX_DATA_PARAMS_DROP_DATA_SHIFT 1 +#define FCOE_TX_DATA_PARAMS_AFTER_SEQ_REC_MASK 0x1 +#define FCOE_TX_DATA_PARAMS_AFTER_SEQ_REC_SHIFT 2 +#define FCOE_TX_DATA_PARAMS_RESERVED0_MASK 0x1F +#define FCOE_TX_DATA_PARAMS_RESERVED0_SHIFT 3 + u8 dif_residual; + __le16 seq_cnt; + __le16 single_sge_saved_offset; + __le16 next_dif_offset; + __le16 seq_id; + __le16 reserved3; +}; + +/* Middle path parameters: FC header fields provided by the driver */ +struct fcoe_tx_mid_path_params { + __le32 parameter; + u8 r_ctl; + u8 type; + u8 cs_ctl; + u8 df_ctl; + __le16 rx_id; + __le16 ox_id; +}; + +/* Task params */ +struct fcoe_tx_params { + struct fcoe_tx_data_params data; + struct fcoe_tx_mid_path_params mid_path; +}; + +/* Union of FCP CMD payload \ TX params \ ABTS \ Cleanup */ +union fcoe_tx_info_union_ctx { + struct fcoe_fcp_cmd_payload fcp_cmd_payload; + struct fcp_rsp_payload_padded fcp_rsp_payload; + struct fcp_xfer_payload_padded fcp_xfer_payload; + struct fcoe_tx_params tx_params; +}; + +/* Data sgl */ +struct fcoe_slow_sgl_ctx { + struct regpair base_sgl_addr; + __le16 curr_sge_off; + __le16 remainder_num_sges; + __le16 curr_sgl_index; + __le16 reserved; +}; + +/* Union of DIX SGL \ cached DIX sges */ +union fcoe_dix_desc_ctx { + struct fcoe_slow_sgl_ctx dix_sgl; + struct scsi_sge cached_dix_sge; +}; + +/* The fcoe storm task context of Ystorm */ +struct ystorm_fcoe_task_st_ctx { + u8 task_type; + u8 sgl_mode; +#define YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_MASK 0x1 +#define YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_SHIFT 0 +#define YSTORM_FCOE_TASK_ST_CTX_RSRV_MASK 0x7F +#define YSTORM_FCOE_TASK_ST_CTX_RSRV_SHIFT 1 + u8 
cached_dix_sge; + u8 expect_first_xfer; + __le32 num_pbf_zero_write; + union protection_info_union_ctx protection_info_union; + __le32 data_2_trns_rem; + struct scsi_sgl_params sgl_params; + u8 reserved1[12]; + union fcoe_tx_info_union_ctx tx_info_union; + union fcoe_dix_desc_ctx dix_desc; + struct scsi_cached_sges data_desc; + __le16 ox_id; + __le16 rx_id; + __le32 task_rety_identifier; + u8 reserved2[8]; +}; + +struct e4_ystorm_fcoe_task_ag_ctx { + u8 byte0; + u8 byte1; + __le16 word0; + u8 flags0; +#define E4_YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_MASK 0xF +#define E4_YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_SHIFT 0 +#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT0_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT0_SHIFT 4 +#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT2_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT 6 +#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT3_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT 7 + u8 flags1; +#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0_MASK 0x3 +#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0_SHIFT 0 +#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1_MASK 0x3 +#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT 2 +#define E4_YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_MASK 0x3 +#define E4_YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_SHIFT 4 +#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0EN_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT 6 +#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT 7 + u8 flags2; +#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT4_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT4_SHIFT 0 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 2 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 3 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 4 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 5 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT 6 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT 7 + u8 byte2; + __le32 reg0; + u8 byte3; + u8 byte4; + __le16 rx_id; + __le16 word2; + __le16 word3; + __le16 word4; + __le16 word5; + __le32 reg1; + __le32 reg2; +}; + +struct e4_tstorm_fcoe_task_ag_ctx { + u8 reserved; + u8 byte1; + __le16 icid; + u8 flags0; +#define E4_TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define E4_TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define E4_TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT1_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_SHIFT 6 +#define E4_TSTORM_FCOE_TASK_AG_CTX_VALID_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_VALID_SHIFT 7 + u8 flags1; +#define E4_TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_SHIFT 0 +#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT5_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT5_SHIFT 1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_MASK 0x3 +#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_SHIFT 2 +#define 
E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_MASK 0x3 +#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_SHIFT 4 +#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2_MASK 0x3 +#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT 6 + u8 flags2; +#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_MASK 0x3 +#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_SHIFT 0 +#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK 0x3 +#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT 2 +#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_MASK 0x3 +#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_SHIFT 4 +#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_MASK 0x3 +#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_SHIFT 6 + u8 flags3; +#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_MASK 0x3 +#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_SHIFT 0 +#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_SHIFT 2 +#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_SHIFT 3 +#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT 4 +#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_SHIFT 5 +#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT 6 +#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_SHIFT 0 +#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_SHIFT 1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 2 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 3 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 4 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 5 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 6 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT 7 + u8 cleanup_state; + __le16 last_sent_tid; + __le32 rec_rr_tov_exp_timeout; + u8 byte3; + u8 byte4; + __le16 word2; + __le16 word3; + __le16 word4; + __le32 data_offset_end_of_seq; + __le32 data_offset_next; +}; + +/* Cached data sges */ +struct fcoe_exp_ro { + __le32 data_offset; + __le32 reserved; +}; + +/* Union of Cleanup address \ expected relative offsets */ +union fcoe_cleanup_addr_exp_ro_union { + struct regpair abts_rsp_fc_payload_hi; + struct fcoe_exp_ro exp_ro; +}; + +/* Fields copied from the ABTS response packet */ +struct fcoe_abts_pkt { + __le32 abts_rsp_fc_payload_lo; + __le16 abts_rsp_rx_id; + u8 abts_rsp_rctl; + u8 reserved2; +}; + +/* FW read-write (modifiable) part of the fcoe task storm context of Tstorm */ +struct fcoe_tstorm_fcoe_task_st_ctx_read_write { + union fcoe_cleanup_addr_exp_ro_union cleanup_addr_exp_ro_union; + __le16 flags; +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_MASK 0x1 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_SHIFT 0 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_MASK 0x1 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_SHIFT 1 
+#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_MASK 0x1 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_SHIFT 2 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_MASK 0x1 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_SHIFT 3 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_MASK 0x1 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_SHIFT 4 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_MASK 0x1 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_SHIFT 5 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_MASK 0x3 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_SHIFT 6 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_MASK 0xFF +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_SHIFT 8 + __le16 seq_cnt; + u8 seq_id; + u8 ooo_rx_seq_id; + __le16 rx_id; + struct fcoe_abts_pkt abts_data; + __le32 e_d_tov_exp_timeout_val; + __le16 ooo_rx_seq_cnt; + __le16 reserved1; +}; + +/* FW read only part The fcoe task storm context of Tstorm */ +struct fcoe_tstorm_fcoe_task_st_ctx_read_only { + u8 task_type; + u8 dev_type; + u8 conf_supported; + u8 glbl_q_num; + __le32 cid; + __le32 fcp_cmd_trns_size; + __le32 rsrv; +}; + +/** The fcoe task storm context of Tstorm */ +struct tstorm_fcoe_task_st_ctx { + struct fcoe_tstorm_fcoe_task_st_ctx_read_write read_write; + struct fcoe_tstorm_fcoe_task_st_ctx_read_only read_only; +}; + +struct e4_mstorm_fcoe_task_ag_ctx { + u8 byte0; + u8 byte1; + __le16 icid; + u8 flags0; +#define E4_MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define E4_MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define E4_MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define E4_MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_SHIFT 5 +#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT2_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT 6 +#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT3_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT 7 + u8 flags1; +#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK 0x3 +#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT 0 +#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT 2 +#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT 4 +#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT 6 +#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT 7 + u8 flags2; +#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT 0 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 2 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 3 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 4 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 5 +#define E4_MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_SHIFT 6 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT 7 + u8 cleanup_state; + __le32 
received_bytes; + u8 byte3; + u8 glbl_q_num; + __le16 word1; + __le16 tid_to_xfer; + __le16 word3; + __le16 word4; + __le16 word5; + __le32 expected_bytes; + __le32 reg2; +}; + +/* The fcoe task storm context of Mstorm */ +struct mstorm_fcoe_task_st_ctx { + struct regpair rsp_buf_addr; + __le32 rsrv[2]; + struct scsi_sgl_params sgl_params; + __le32 data_2_trns_rem; + __le32 data_buffer_offset; + __le16 parent_id; + __le16 flags; +#define MSTORM_FCOE_TASK_ST_CTX_INTERVAL_SIZE_LOG_MASK 0xF +#define MSTORM_FCOE_TASK_ST_CTX_INTERVAL_SIZE_LOG_SHIFT 0 +#define MSTORM_FCOE_TASK_ST_CTX_HOST_INTERFACE_MASK 0x3 +#define MSTORM_FCOE_TASK_ST_CTX_HOST_INTERFACE_SHIFT 4 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_TO_PEER_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_TO_PEER_SHIFT 6 +#define MSTORM_FCOE_TASK_ST_CTX_MP_INCLUDE_FC_HEADER_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_MP_INCLUDE_FC_HEADER_SHIFT 7 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_BLOCK_SIZE_MASK 0x3 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_BLOCK_SIZE_SHIFT 8 +#define MSTORM_FCOE_TASK_ST_CTX_VALIDATE_DIX_REF_TAG_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_VALIDATE_DIX_REF_TAG_SHIFT 10 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_CACHED_SGE_FLG_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_CACHED_SGE_FLG_SHIFT 11 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_SUPPORTED_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_SUPPORTED_SHIFT 12 +#define MSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_SHIFT 13 +#define MSTORM_FCOE_TASK_ST_CTX_RESERVED_MASK 0x3 +#define MSTORM_FCOE_TASK_ST_CTX_RESERVED_SHIFT 14 + struct scsi_cached_sges data_desc; +}; + +struct e4_ustorm_fcoe_task_ag_ctx { + u8 reserved; + u8 byte1; + __le16 icid; + u8 flags0; +#define E4_USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define E4_USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define E4_USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define E4_USTORM_FCOE_TASK_AG_CTX_BIT1_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF0_MASK 0x3 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF0_SHIFT 6 + u8 flags1; +#define E4_USTORM_FCOE_TASK_AG_CTX_CF1_MASK 0x3 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF1_SHIFT 0 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF2_MASK 0x3 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF2_SHIFT 2 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF3_MASK 0x3 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF3_SHIFT 4 +#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_MASK 0x3 +#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_SHIFT 6 + u8 flags2; +#define E4_USTORM_FCOE_TASK_AG_CTX_CF0EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT 0 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF1EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT 1 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF2EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT 2 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF3EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_CF3EN_SHIFT 3 +#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT 4 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 5 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 6 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 7 + u8 flags3; +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 0 +#define 
E4_USTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 1 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT 2 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT 3 +#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_MASK 0xF +#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT 4 + __le32 dif_err_intervals; + __le32 dif_error_1st_interval; + __le32 global_cq_num; + __le32 reg3; + __le32 reg4; + __le32 reg5; +}; + +/* FCoE task context */ +struct e4_fcoe_task_context { + struct ystorm_fcoe_task_st_ctx ystorm_st_context; + struct regpair ystorm_st_padding[2]; + struct tdif_task_context tdif_context; + struct e4_ystorm_fcoe_task_ag_ctx ystorm_ag_context; + struct e4_tstorm_fcoe_task_ag_ctx tstorm_ag_context; + struct timers_context timer_context; + struct tstorm_fcoe_task_st_ctx tstorm_st_context; + struct regpair tstorm_st_padding[2]; + struct e4_mstorm_fcoe_task_ag_ctx mstorm_ag_context; + struct mstorm_fcoe_task_st_ctx mstorm_st_context; + struct e4_ustorm_fcoe_task_ag_ctx ustorm_ag_context; + struct rdif_task_context rdif_context; +}; + +/* FCoE additional WQE (Sq/XferQ) information */ +union fcoe_additional_info_union { + __le32 previous_tid; + __le32 parent_tid; + __le32 burst_length; + __le32 seq_rec_updated_offset; +}; + +/* FCoE Ramrod Command IDs */ +enum fcoe_completion_status { + FCOE_COMPLETION_STATUS_SUCCESS, + FCOE_COMPLETION_STATUS_FCOE_VER_ERR, + FCOE_COMPLETION_STATUS_SRC_MAC_ADD_ARR_ERR, + MAX_FCOE_COMPLETION_STATUS +}; + +/* FC address (SID/DID) network presentation */ +struct fc_addr_nw { + u8 addr_lo; + u8 addr_mid; + u8 addr_hi; +}; + +/* FCoE connection offload */ +struct fcoe_conn_offload_ramrod_data { + struct regpair sq_pbl_addr; + struct regpair sq_curr_page_addr; + struct regpair sq_next_page_addr; + struct regpair xferq_pbl_addr; + struct regpair xferq_curr_page_addr; + struct regpair xferq_next_page_addr; + struct regpair respq_pbl_addr; + struct regpair respq_curr_page_addr; + struct regpair respq_next_page_addr; + __le16 dst_mac_addr_lo; + __le16 dst_mac_addr_mid; + __le16 dst_mac_addr_hi; + __le16 src_mac_addr_lo; + __le16 src_mac_addr_mid; + __le16 src_mac_addr_hi; + __le16 tx_max_fc_pay_len; + __le16 e_d_tov_timer_val; + __le16 rx_max_fc_pay_len; + __le16 vlan_tag; +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_VLAN_ID_MASK 0xFFF +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_VLAN_ID_SHIFT 0 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_CFI_MASK 0x1 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_CFI_SHIFT 12 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_PRIORITY_MASK 0x7 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_PRIORITY_SHIFT 13 + __le16 physical_q0; + __le16 rec_rr_tov_timer_val; + struct fc_addr_nw s_id; + u8 max_conc_seqs_c3; + struct fc_addr_nw d_id; + u8 flags; +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_CONT_INCR_SEQ_CNT_MASK 0x1 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_CONT_INCR_SEQ_CNT_SHIFT 0 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_CONF_REQ_MASK 0x1 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_CONF_REQ_SHIFT 1 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_REC_VALID_MASK 0x1 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_REC_VALID_SHIFT 2 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_VLAN_FLAG_MASK 0x1 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_VLAN_FLAG_SHIFT 3 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_SINGLE_VLAN_MASK 0x1 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_B_SINGLE_VLAN_SHIFT 4 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_MODE_MASK 0x3 +#define 
FCOE_CONN_OFFLOAD_RAMROD_DATA_MODE_SHIFT 5 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_RESERVED0_MASK 0x1 +#define FCOE_CONN_OFFLOAD_RAMROD_DATA_RESERVED0_SHIFT 7 + __le16 conn_id; + u8 def_q_idx; + u8 reserved[5]; +}; + +/* FCoE terminate connection request */ +struct fcoe_conn_terminate_ramrod_data { + struct regpair terminate_params_addr; +}; + +/* FCoE device type */ +enum fcoe_device_type { + FCOE_TASK_DEV_TYPE_DISK, + FCOE_TASK_DEV_TYPE_TAPE, + MAX_FCOE_DEVICE_TYPE +}; + +/* Data sgl */ +struct fcoe_fast_sgl_ctx { + struct regpair sgl_start_addr; + __le32 sgl_byte_offset; + __le16 task_reuse_cnt; + __le16 init_offset_in_first_sge; +}; + +/* FCoE firmware function init */ +struct fcoe_init_func_ramrod_data { + struct scsi_init_func_params func_params; + struct scsi_init_func_queues q_params; + __le16 mtu; + __le16 sq_num_pages_in_pbl; + __le32 reserved[3]; +}; + +/* FCoE: Mode of the connection: Target or Initiator or both */ +enum fcoe_mode_type { + FCOE_INITIATOR_MODE = 0x0, + FCOE_TARGET_MODE = 0x1, + FCOE_BOTH_OR_NOT_CHOSEN = 0x3, + MAX_FCOE_MODE_TYPE +}; + +/* Per PF FCoE receive path statistics - tStorm RAM structure */ +struct fcoe_rx_stat { + struct regpair fcoe_rx_byte_cnt; + struct regpair fcoe_rx_data_pkt_cnt; + struct regpair fcoe_rx_xfer_pkt_cnt; + struct regpair fcoe_rx_other_pkt_cnt; + __le32 fcoe_silent_drop_pkt_cmdq_full_cnt; + __le32 fcoe_silent_drop_pkt_rq_full_cnt; + __le32 fcoe_silent_drop_pkt_crc_error_cnt; + __le32 fcoe_silent_drop_pkt_task_invalid_cnt; + __le32 fcoe_silent_drop_total_pkt_cnt; + __le32 rsrv; +}; + +/* FCoE SQE request type */ +enum fcoe_sqe_request_type { + SEND_FCOE_CMD, + SEND_FCOE_MIDPATH, + SEND_FCOE_ABTS_REQUEST, + FCOE_EXCHANGE_CLEANUP, + FCOE_SEQUENCE_RECOVERY, + SEND_FCOE_XFER_RDY, + SEND_FCOE_RSP, + SEND_FCOE_RSP_WITH_SENSE_DATA, + SEND_FCOE_TARGET_DATA, + SEND_FCOE_INITIATOR_DATA, + SEND_FCOE_XFER_CONTINUATION_RDY, + SEND_FCOE_TARGET_ABTS_RSP, + MAX_FCOE_SQE_REQUEST_TYPE +}; + +/* FCoe statistics request */ +struct fcoe_stat_ramrod_data { + struct regpair stat_params_addr; +}; + +/* FCoE task type */ +enum fcoe_task_type { + FCOE_TASK_TYPE_WRITE_INITIATOR, + FCOE_TASK_TYPE_READ_INITIATOR, + FCOE_TASK_TYPE_MIDPATH, + FCOE_TASK_TYPE_UNSOLICITED, + FCOE_TASK_TYPE_ABTS, + FCOE_TASK_TYPE_EXCHANGE_CLEANUP, + FCOE_TASK_TYPE_SEQUENCE_CLEANUP, + FCOE_TASK_TYPE_WRITE_TARGET, + FCOE_TASK_TYPE_READ_TARGET, + FCOE_TASK_TYPE_RSP, + FCOE_TASK_TYPE_RSP_SENSE_DATA, + FCOE_TASK_TYPE_ABTS_TARGET, + FCOE_TASK_TYPE_ENUM_SIZE, + MAX_FCOE_TASK_TYPE +}; + +/* Per PF FCoE transmit path statistics - pStorm RAM structure */ +struct fcoe_tx_stat { + struct regpair fcoe_tx_byte_cnt; + struct regpair fcoe_tx_data_pkt_cnt; + struct regpair fcoe_tx_xfer_pkt_cnt; + struct regpair fcoe_tx_other_pkt_cnt; +}; + +/* FCoE SQ/XferQ element */ +struct fcoe_wqe { + __le16 task_id; + __le16 flags; +#define FCOE_WQE_REQ_TYPE_MASK 0xF +#define FCOE_WQE_REQ_TYPE_SHIFT 0 +#define FCOE_WQE_SGL_MODE_MASK 0x1 +#define FCOE_WQE_SGL_MODE_SHIFT 4 +#define FCOE_WQE_CONTINUATION_MASK 0x1 +#define FCOE_WQE_CONTINUATION_SHIFT 5 +#define FCOE_WQE_SEND_AUTO_RSP_MASK 0x1 +#define FCOE_WQE_SEND_AUTO_RSP_SHIFT 6 +#define FCOE_WQE_RESERVED_MASK 0x1 +#define FCOE_WQE_RESERVED_SHIFT 7 +#define FCOE_WQE_NUM_SGES_MASK 0xF +#define FCOE_WQE_NUM_SGES_SHIFT 8 +#define FCOE_WQE_RESERVED1_MASK 0xF +#define FCOE_WQE_RESERVED1_SHIFT 12 + union fcoe_additional_info_union additional_info_union; +}; + +/* FCoE XFRQ element */ +struct xfrqe_prot_flags { + u8 flags; +#define 
XFRQE_PROT_FLAGS_PROT_INTERVAL_SIZE_LOG_MASK 0xF +#define XFRQE_PROT_FLAGS_PROT_INTERVAL_SIZE_LOG_SHIFT 0 +#define XFRQE_PROT_FLAGS_DIF_TO_PEER_MASK 0x1 +#define XFRQE_PROT_FLAGS_DIF_TO_PEER_SHIFT 4 +#define XFRQE_PROT_FLAGS_HOST_INTERFACE_MASK 0x3 +#define XFRQE_PROT_FLAGS_HOST_INTERFACE_SHIFT 5 +#define XFRQE_PROT_FLAGS_RESERVED_MASK 0x1 +#define XFRQE_PROT_FLAGS_RESERVED_SHIFT 7 +}; + +/* FCoE doorbell data */ +struct fcoe_db_data { + u8 params; +#define FCOE_DB_DATA_DEST_MASK 0x3 +#define FCOE_DB_DATA_DEST_SHIFT 0 +#define FCOE_DB_DATA_AGG_CMD_MASK 0x3 +#define FCOE_DB_DATA_AGG_CMD_SHIFT 2 +#define FCOE_DB_DATA_BYPASS_EN_MASK 0x1 +#define FCOE_DB_DATA_BYPASS_EN_SHIFT 4 +#define FCOE_DB_DATA_RESERVED_MASK 0x1 +#define FCOE_DB_DATA_RESERVED_SHIFT 5 +#define FCOE_DB_DATA_AGG_VAL_SEL_MASK 0x3 +#define FCOE_DB_DATA_AGG_VAL_SEL_SHIFT 6 + u8 agg_flags; + __le16 sq_prod; +}; + +#endif /* __FCOE_COMMON__ */ diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h new file mode 100644 index 0000000..938df61 --- /dev/null +++ b/include/linux/qed/iscsi_common.h @@ -0,0 +1,1572 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ISCSI_COMMON__ +#define __ISCSI_COMMON__ + +/**********************/ +/* ISCSI FW CONSTANTS */ +/**********************/ + +/* iSCSI HSI constants */ +#define ISCSI_DEFAULT_MTU (1500) + +/* KWQ (kernel work queue) layer codes */ +#define ISCSI_SLOW_PATH_LAYER_CODE (6) + +/* iSCSI parameter defaults */ +#define ISCSI_DEFAULT_HEADER_DIGEST (0) +#define ISCSI_DEFAULT_DATA_DIGEST (0) +#define ISCSI_DEFAULT_INITIAL_R2T (1) +#define ISCSI_DEFAULT_IMMEDIATE_DATA (1) +#define ISCSI_DEFAULT_MAX_PDU_LENGTH (0x2000) +#define ISCSI_DEFAULT_FIRST_BURST_LENGTH (0x10000) +#define ISCSI_DEFAULT_MAX_BURST_LENGTH (0x40000) +#define ISCSI_DEFAULT_MAX_OUTSTANDING_R2T (1) + +/* iSCSI parameter limits */ +#define ISCSI_MIN_VAL_MAX_PDU_LENGTH (0x200) +#define ISCSI_MAX_VAL_MAX_PDU_LENGTH (0xffffff) +#define ISCSI_MIN_VAL_BURST_LENGTH (0x200) +#define ISCSI_MAX_VAL_BURST_LENGTH (0xffffff) +#define ISCSI_MIN_VAL_MAX_OUTSTANDING_R2T (1) +#define ISCSI_MAX_VAL_MAX_OUTSTANDING_R2T (0xff) + +#define ISCSI_AHS_CNTL_SIZE 4 + +#define ISCSI_WQE_NUM_SGES_SLOWIO (0xf) + +/* iSCSI reserved params */ +#define ISCSI_ITT_ALL_ONES (0xffffffff) +#define ISCSI_TTT_ALL_ONES (0xffffffff) + +#define ISCSI_OPTION_1_OFF_CHIP_TCP 1 +#define ISCSI_OPTION_2_ON_CHIP_TCP 2 + +#define ISCSI_INITIATOR_MODE 0 +#define ISCSI_TARGET_MODE 1 + +/* iSCSI request op codes */ +#define ISCSI_OPCODE_NOP_OUT (0) +#define ISCSI_OPCODE_SCSI_CMD (1) +#define ISCSI_OPCODE_TMF_REQUEST (2) +#define ISCSI_OPCODE_LOGIN_REQUEST (3) +#define ISCSI_OPCODE_TEXT_REQUEST (4) +#define ISCSI_OPCODE_DATA_OUT (5) +#define ISCSI_OPCODE_LOGOUT_REQUEST (6) + +/* iSCSI response/messages op codes */ +#define ISCSI_OPCODE_NOP_IN (0x20) +#define ISCSI_OPCODE_SCSI_RESPONSE (0x21) +#define ISCSI_OPCODE_TMF_RESPONSE (0x22) +#define ISCSI_OPCODE_LOGIN_RESPONSE (0x23) +#define ISCSI_OPCODE_TEXT_RESPONSE (0x24) +#define ISCSI_OPCODE_DATA_IN (0x25) +#define ISCSI_OPCODE_LOGOUT_RESPONSE (0x26) +#define ISCSI_OPCODE_R2T (0x31) +#define ISCSI_OPCODE_ASYNC_MSG (0x32) +#define ISCSI_OPCODE_REJECT (0x3f) + +/* iSCSI stages */ +#define ISCSI_STAGE_SECURITY_NEGOTIATION (0) +#define ISCSI_STAGE_LOGIN_OPERATIONAL_NEGOTIATION (1) +#define ISCSI_STAGE_FULL_FEATURE_PHASE (3) + +/* iSCSI CQE errors */ +#define CQE_ERROR_BITMAP_DATA_DIGEST (0x08) +#define CQE_ERROR_BITMAP_RCV_ON_INVALID_CONN (0x10) +#define CQE_ERROR_BITMAP_DATA_TRUNCATED (0x20) + +/* Union of data bd_opaque / tq_tid */ +union bd_opaque_tq_union { + __le16 bd_opaque; + __le16 tq_tid; +}; + +/* iSCSI CQE error bitmap */ +struct cqe_error_bitmap { + u8 cqe_error_status_bits; +#define CQE_ERROR_BITMAP_DIF_ERR_BITS_MASK 0x7 +#define CQE_ERROR_BITMAP_DIF_ERR_BITS_SHIFT 0 +#define CQE_ERROR_BITMAP_DATA_DIGEST_ERR_MASK 0x1 +#define CQE_ERROR_BITMAP_DATA_DIGEST_ERR_SHIFT 3 +#define CQE_ERROR_BITMAP_RCV_ON_INVALID_CONN_MASK 0x1 +#define CQE_ERROR_BITMAP_RCV_ON_INVALID_CONN_SHIFT 4 +#define CQE_ERROR_BITMAP_DATA_TRUNCATED_ERR_MASK 0x1 +#define CQE_ERROR_BITMAP_DATA_TRUNCATED_ERR_SHIFT 5 +#define CQE_ERROR_BITMAP_UNDER_RUN_ERR_MASK 0x1 +#define CQE_ERROR_BITMAP_UNDER_RUN_ERR_SHIFT 6 +#define CQE_ERROR_BITMAP_RESERVED2_MASK 0x1 +#define CQE_ERROR_BITMAP_RESERVED2_SHIFT 7 +}; + +union cqe_error_status { + u8 error_status; + struct cqe_error_bitmap error_bits; +}; + +/* iSCSI PDU header as raw data dwords */ +struct data_hdr { + __le32 data[12]; +}; + +struct lun_mapper_addr_reserved { + struct regpair lun_mapper_addr; + u8 reserved0[8]; +}; + +/* RDIF context for DIF on immediate data */ +struct dif_on_immediate_params { + __le32 
initial_ref_tag; + __le16 application_tag; + __le16 application_tag_mask; + __le16 flags1; +#define DIF_ON_IMMEDIATE_PARAMS_VALIDATE_GUARD_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_VALIDATE_GUARD_SHIFT 0 +#define DIF_ON_IMMEDIATE_PARAMS_VALIDATE_APP_TAG_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_VALIDATE_APP_TAG_SHIFT 1 +#define DIF_ON_IMMEDIATE_PARAMS_VALIDATE_REF_TAG_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_VALIDATE_REF_TAG_SHIFT 2 +#define DIF_ON_IMMEDIATE_PARAMS_FORWARD_GUARD_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_FORWARD_GUARD_SHIFT 3 +#define DIF_ON_IMMEDIATE_PARAMS_FORWARD_APP_TAG_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_FORWARD_APP_TAG_SHIFT 4 +#define DIF_ON_IMMEDIATE_PARAMS_FORWARD_REF_TAG_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_FORWARD_REF_TAG_SHIFT 5 +#define DIF_ON_IMMEDIATE_PARAMS_INTERVAL_SIZE_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_INTERVAL_SIZE_SHIFT 6 +#define DIF_ON_IMMEDIATE_PARAMS_NETWORK_INTERFACE_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_NETWORK_INTERFACE_SHIFT 7 +#define DIF_ON_IMMEDIATE_PARAMS_HOST_INTERFACE_MASK 0x3 +#define DIF_ON_IMMEDIATE_PARAMS_HOST_INTERFACE_SHIFT 8 +#define DIF_ON_IMMEDIATE_PARAMS_REF_TAG_MASK_MASK 0xF +#define DIF_ON_IMMEDIATE_PARAMS_REF_TAG_MASK_SHIFT 10 +#define DIF_ON_IMMEDIATE_PARAMS_FORWARD_APP_TAG_WITH_MASK_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_FORWARD_APP_TAG_WITH_MASK_SHIFT 14 +#define DIF_ON_IMMEDIATE_PARAMS_FORWARD_REF_TAG_WITH_MASK_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_FORWARD_REF_TAG_WITH_MASK_SHIFT 15 + u8 flags0; +#define DIF_ON_IMMEDIATE_PARAMS_RESERVED_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_RESERVED_SHIFT 0 +#define DIF_ON_IMMEDIATE_PARAMS_IGNORE_APP_TAG_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_IGNORE_APP_TAG_SHIFT 1 +#define DIF_ON_IMMEDIATE_PARAMS_INITIAL_REF_TAG_IS_VALID_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_INITIAL_REF_TAG_IS_VALID_SHIFT 2 +#define DIF_ON_IMMEDIATE_PARAMS_HOST_GUARD_TYPE_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_HOST_GUARD_TYPE_SHIFT 3 +#define DIF_ON_IMMEDIATE_PARAMS_PROTECTION_TYPE_MASK 0x3 +#define DIF_ON_IMMEDIATE_PARAMS_PROTECTION_TYPE_SHIFT 4 +#define DIF_ON_IMMEDIATE_PARAMS_CRC_SEED_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_CRC_SEED_SHIFT 6 +#define DIF_ON_IMMEDIATE_PARAMS_KEEP_REF_TAG_CONST_MASK 0x1 +#define DIF_ON_IMMEDIATE_PARAMS_KEEP_REF_TAG_CONST_SHIFT 7 + u8 reserved_zero[5]; +}; + +/* iSCSI dif on immediate mode attributes union */ +union dif_configuration_params { + struct lun_mapper_addr_reserved lun_mapper_address; + struct dif_on_immediate_params def_dif_conf; +}; + +/* Union of data/r2t sequence number */ +union iscsi_seq_num { + __le16 data_sn; + __le16 r2t_sn; +}; + +/* iSCSI DIF flags */ +struct iscsi_dif_flags { + u8 flags; +#define ISCSI_DIF_FLAGS_PROT_INTERVAL_SIZE_LOG_MASK 0xF +#define ISCSI_DIF_FLAGS_PROT_INTERVAL_SIZE_LOG_SHIFT 0 +#define ISCSI_DIF_FLAGS_DIF_TO_PEER_MASK 0x1 +#define ISCSI_DIF_FLAGS_DIF_TO_PEER_SHIFT 4 +#define ISCSI_DIF_FLAGS_HOST_INTERFACE_MASK 0x7 +#define ISCSI_DIF_FLAGS_HOST_INTERFACE_SHIFT 5 +}; + +/* The iscsi storm task context of Ystorm */ +struct ystorm_iscsi_task_state { + struct scsi_cached_sges data_desc; + struct scsi_sgl_params sgl_params; + __le32 exp_r2t_sn; + __le32 buffer_offset; + union iscsi_seq_num seq_num; + struct iscsi_dif_flags dif_flags; + u8 flags; +#define YSTORM_ISCSI_TASK_STATE_LOCAL_COMP_MASK 0x1 +#define YSTORM_ISCSI_TASK_STATE_LOCAL_COMP_SHIFT 0 +#define YSTORM_ISCSI_TASK_STATE_SLOW_IO_MASK 0x1 +#define YSTORM_ISCSI_TASK_STATE_SLOW_IO_SHIFT 1 +#define YSTORM_ISCSI_TASK_STATE_SET_DIF_OFFSET_MASK 0x1 
+#define YSTORM_ISCSI_TASK_STATE_SET_DIF_OFFSET_SHIFT 2 +#define YSTORM_ISCSI_TASK_STATE_RESERVED0_MASK 0x1F +#define YSTORM_ISCSI_TASK_STATE_RESERVED0_SHIFT 3 +}; + +/* The iscsi storm task context of Ystorm */ +struct ystorm_iscsi_task_rxmit_opt { + __le32 fast_rxmit_sge_offset; + __le32 scan_start_buffer_offset; + __le32 fast_rxmit_buffer_offset; + u8 scan_start_sgl_index; + u8 fast_rxmit_sgl_index; + __le16 reserved; +}; + +/* iSCSI Common PDU header */ +struct iscsi_common_hdr { + u8 hdr_status; + u8 hdr_response; + u8 hdr_flags; + u8 hdr_first_byte; +#define ISCSI_COMMON_HDR_OPCODE_MASK 0x3F +#define ISCSI_COMMON_HDR_OPCODE_SHIFT 0 +#define ISCSI_COMMON_HDR_IMM_MASK 0x1 +#define ISCSI_COMMON_HDR_IMM_SHIFT 6 +#define ISCSI_COMMON_HDR_RSRV_MASK 0x1 +#define ISCSI_COMMON_HDR_RSRV_SHIFT 7 + __le32 hdr_second_dword; +#define ISCSI_COMMON_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_COMMON_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_COMMON_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_COMMON_HDR_TOTAL_AHS_LEN_SHIFT 24 + struct regpair lun_reserved; + __le32 itt; + __le32 ttt; + __le32 cmdstat_sn; + __le32 exp_statcmd_sn; + __le32 max_cmd_sn; + __le32 data[3]; +}; + +/* iSCSI Command PDU header */ +struct iscsi_cmd_hdr { + __le16 reserved1; + u8 flags_attr; +#define ISCSI_CMD_HDR_ATTR_MASK 0x7 +#define ISCSI_CMD_HDR_ATTR_SHIFT 0 +#define ISCSI_CMD_HDR_RSRV_MASK 0x3 +#define ISCSI_CMD_HDR_RSRV_SHIFT 3 +#define ISCSI_CMD_HDR_WRITE_MASK 0x1 +#define ISCSI_CMD_HDR_WRITE_SHIFT 5 +#define ISCSI_CMD_HDR_READ_MASK 0x1 +#define ISCSI_CMD_HDR_READ_SHIFT 6 +#define ISCSI_CMD_HDR_FINAL_MASK 0x1 +#define ISCSI_CMD_HDR_FINAL_SHIFT 7 + u8 hdr_first_byte; +#define ISCSI_CMD_HDR_OPCODE_MASK 0x3F +#define ISCSI_CMD_HDR_OPCODE_SHIFT 0 +#define ISCSI_CMD_HDR_IMM_MASK 0x1 +#define ISCSI_CMD_HDR_IMM_SHIFT 6 +#define ISCSI_CMD_HDR_RSRV1_MASK 0x1 +#define ISCSI_CMD_HDR_RSRV1_SHIFT 7 + __le32 hdr_second_dword; +#define ISCSI_CMD_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_CMD_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_CMD_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_CMD_HDR_TOTAL_AHS_LEN_SHIFT 24 + struct regpair lun; + __le32 itt; + __le32 expected_transfer_length; + __le32 cmd_sn; + __le32 exp_stat_sn; + __le32 cdb[4]; +}; + +/* iSCSI Command PDU header with Extended CDB (Initiator Mode) */ +struct iscsi_ext_cdb_cmd_hdr { + __le16 reserved1; + u8 flags_attr; +#define ISCSI_EXT_CDB_CMD_HDR_ATTR_MASK 0x7 +#define ISCSI_EXT_CDB_CMD_HDR_ATTR_SHIFT 0 +#define ISCSI_EXT_CDB_CMD_HDR_RSRV_MASK 0x3 +#define ISCSI_EXT_CDB_CMD_HDR_RSRV_SHIFT 3 +#define ISCSI_EXT_CDB_CMD_HDR_WRITE_MASK 0x1 +#define ISCSI_EXT_CDB_CMD_HDR_WRITE_SHIFT 5 +#define ISCSI_EXT_CDB_CMD_HDR_READ_MASK 0x1 +#define ISCSI_EXT_CDB_CMD_HDR_READ_SHIFT 6 +#define ISCSI_EXT_CDB_CMD_HDR_FINAL_MASK 0x1 +#define ISCSI_EXT_CDB_CMD_HDR_FINAL_SHIFT 7 + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_EXT_CDB_CMD_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_EXT_CDB_CMD_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_EXT_CDB_CMD_HDR_CDB_SIZE_MASK 0xFF +#define ISCSI_EXT_CDB_CMD_HDR_CDB_SIZE_SHIFT 24 + struct regpair lun; + __le32 itt; + __le32 expected_transfer_length; + __le32 cmd_sn; + __le32 exp_stat_sn; + struct scsi_sge cdb_sge; +}; + +/* iSCSI login request PDU header */ +struct iscsi_login_req_hdr { + u8 version_min; + u8 version_max; + u8 flags_attr; +#define ISCSI_LOGIN_REQ_HDR_NSG_MASK 0x3 +#define ISCSI_LOGIN_REQ_HDR_NSG_SHIFT 0 +#define ISCSI_LOGIN_REQ_HDR_CSG_MASK 0x3 +#define ISCSI_LOGIN_REQ_HDR_CSG_SHIFT 2 +#define ISCSI_LOGIN_REQ_HDR_RSRV_MASK 0x3 +#define 
ISCSI_LOGIN_REQ_HDR_RSRV_SHIFT 4 +#define ISCSI_LOGIN_REQ_HDR_C_MASK 0x1 +#define ISCSI_LOGIN_REQ_HDR_C_SHIFT 6 +#define ISCSI_LOGIN_REQ_HDR_T_MASK 0x1 +#define ISCSI_LOGIN_REQ_HDR_T_SHIFT 7 + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_LOGIN_REQ_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_LOGIN_REQ_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_LOGIN_REQ_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_LOGIN_REQ_HDR_TOTAL_AHS_LEN_SHIFT 24 + __le32 isid_tabc; + __le16 tsih; + __le16 isid_d; + __le32 itt; + __le16 reserved1; + __le16 cid; + __le32 cmd_sn; + __le32 exp_stat_sn; + __le32 reserved2[4]; +}; + +/* iSCSI logout request PDU header */ +struct iscsi_logout_req_hdr { + __le16 reserved0; + u8 reason_code; + u8 opcode; + __le32 reserved1; + __le32 reserved2[2]; + __le32 itt; + __le16 reserved3; + __le16 cid; + __le32 cmd_sn; + __le32 exp_stat_sn; + __le32 reserved4[4]; +}; + +/* iSCSI Data-out PDU header */ +struct iscsi_data_out_hdr { + __le16 reserved1; + u8 flags_attr; +#define ISCSI_DATA_OUT_HDR_RSRV_MASK 0x7F +#define ISCSI_DATA_OUT_HDR_RSRV_SHIFT 0 +#define ISCSI_DATA_OUT_HDR_FINAL_MASK 0x1 +#define ISCSI_DATA_OUT_HDR_FINAL_SHIFT 7 + u8 opcode; + __le32 reserved2; + struct regpair lun; + __le32 itt; + __le32 ttt; + __le32 reserved3; + __le32 exp_stat_sn; + __le32 reserved4; + __le32 data_sn; + __le32 buffer_offset; + __le32 reserved5; +}; + +/* iSCSI Data-in PDU header */ +struct iscsi_data_in_hdr { + u8 status_rsvd; + u8 reserved1; + u8 flags; +#define ISCSI_DATA_IN_HDR_STATUS_MASK 0x1 +#define ISCSI_DATA_IN_HDR_STATUS_SHIFT 0 +#define ISCSI_DATA_IN_HDR_UNDERFLOW_MASK 0x1 +#define ISCSI_DATA_IN_HDR_UNDERFLOW_SHIFT 1 +#define ISCSI_DATA_IN_HDR_OVERFLOW_MASK 0x1 +#define ISCSI_DATA_IN_HDR_OVERFLOW_SHIFT 2 +#define ISCSI_DATA_IN_HDR_RSRV_MASK 0x7 +#define ISCSI_DATA_IN_HDR_RSRV_SHIFT 3 +#define ISCSI_DATA_IN_HDR_ACK_MASK 0x1 +#define ISCSI_DATA_IN_HDR_ACK_SHIFT 6 +#define ISCSI_DATA_IN_HDR_FINAL_MASK 0x1 +#define ISCSI_DATA_IN_HDR_FINAL_SHIFT 7 + u8 opcode; + __le32 reserved2; + struct regpair lun; + __le32 itt; + __le32 ttt; + __le32 stat_sn; + __le32 exp_cmd_sn; + __le32 max_cmd_sn; + __le32 data_sn; + __le32 buffer_offset; + __le32 residual_count; +}; + +/* iSCSI R2T PDU header */ +struct iscsi_r2t_hdr { + u8 reserved0[3]; + u8 opcode; + __le32 reserved2; + struct regpair lun; + __le32 itt; + __le32 ttt; + __le32 stat_sn; + __le32 exp_cmd_sn; + __le32 max_cmd_sn; + __le32 r2t_sn; + __le32 buffer_offset; + __le32 desired_data_trns_len; +}; + +/* iSCSI NOP-out PDU header */ +struct iscsi_nop_out_hdr { + __le16 reserved1; + u8 flags_attr; +#define ISCSI_NOP_OUT_HDR_RSRV_MASK 0x7F +#define ISCSI_NOP_OUT_HDR_RSRV_SHIFT 0 +#define ISCSI_NOP_OUT_HDR_CONST1_MASK 0x1 +#define ISCSI_NOP_OUT_HDR_CONST1_SHIFT 7 + u8 opcode; + __le32 reserved2; + struct regpair lun; + __le32 itt; + __le32 ttt; + __le32 cmd_sn; + __le32 exp_stat_sn; + __le32 reserved3; + __le32 reserved4; + __le32 reserved5; + __le32 reserved6; +}; + +/* iSCSI NOP-in PDU header */ +struct iscsi_nop_in_hdr { + __le16 reserved0; + u8 flags_attr; +#define ISCSI_NOP_IN_HDR_RSRV_MASK 0x7F +#define ISCSI_NOP_IN_HDR_RSRV_SHIFT 0 +#define ISCSI_NOP_IN_HDR_CONST1_MASK 0x1 +#define ISCSI_NOP_IN_HDR_CONST1_SHIFT 7 + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_NOP_IN_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_NOP_IN_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_NOP_IN_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_NOP_IN_HDR_TOTAL_AHS_LEN_SHIFT 24 + struct regpair lun; + __le32 itt; + __le32 ttt; + __le32 stat_sn; + __le32 exp_cmd_sn; + 
__le32 max_cmd_sn; + __le32 reserved5; + __le32 reserved6; + __le32 reserved7; +}; + +/* iSCSI Login Response PDU header */ +struct iscsi_login_response_hdr { + u8 version_active; + u8 version_max; + u8 flags_attr; +#define ISCSI_LOGIN_RESPONSE_HDR_NSG_MASK 0x3 +#define ISCSI_LOGIN_RESPONSE_HDR_NSG_SHIFT 0 +#define ISCSI_LOGIN_RESPONSE_HDR_CSG_MASK 0x3 +#define ISCSI_LOGIN_RESPONSE_HDR_CSG_SHIFT 2 +#define ISCSI_LOGIN_RESPONSE_HDR_RSRV_MASK 0x3 +#define ISCSI_LOGIN_RESPONSE_HDR_RSRV_SHIFT 4 +#define ISCSI_LOGIN_RESPONSE_HDR_C_MASK 0x1 +#define ISCSI_LOGIN_RESPONSE_HDR_C_SHIFT 6 +#define ISCSI_LOGIN_RESPONSE_HDR_T_MASK 0x1 +#define ISCSI_LOGIN_RESPONSE_HDR_T_SHIFT 7 + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_LOGIN_RESPONSE_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_LOGIN_RESPONSE_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_LOGIN_RESPONSE_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_LOGIN_RESPONSE_HDR_TOTAL_AHS_LEN_SHIFT 24 + __le32 isid_tabc; + __le16 tsih; + __le16 isid_d; + __le32 itt; + __le32 reserved1; + __le32 stat_sn; + __le32 exp_cmd_sn; + __le32 max_cmd_sn; + __le16 reserved2; + u8 status_detail; + u8 status_class; + __le32 reserved4[2]; +}; + +/* iSCSI Logout Response PDU header */ +struct iscsi_logout_response_hdr { + u8 reserved1; + u8 response; + u8 flags; + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_LOGOUT_RESPONSE_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_LOGOUT_RESPONSE_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_LOGOUT_RESPONSE_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_LOGOUT_RESPONSE_HDR_TOTAL_AHS_LEN_SHIFT 24 + __le32 reserved2[2]; + __le32 itt; + __le32 reserved3; + __le32 stat_sn; + __le32 exp_cmd_sn; + __le32 max_cmd_sn; + __le32 reserved4; + __le16 time_2_retain; + __le16 time_2_wait; + __le32 reserved5[1]; +}; + +/* iSCSI Text Request PDU header */ +struct iscsi_text_request_hdr { + __le16 reserved0; + u8 flags_attr; +#define ISCSI_TEXT_REQUEST_HDR_RSRV_MASK 0x3F +#define ISCSI_TEXT_REQUEST_HDR_RSRV_SHIFT 0 +#define ISCSI_TEXT_REQUEST_HDR_C_MASK 0x1 +#define ISCSI_TEXT_REQUEST_HDR_C_SHIFT 6 +#define ISCSI_TEXT_REQUEST_HDR_F_MASK 0x1 +#define ISCSI_TEXT_REQUEST_HDR_F_SHIFT 7 + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_TEXT_REQUEST_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_TEXT_REQUEST_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_TEXT_REQUEST_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_TEXT_REQUEST_HDR_TOTAL_AHS_LEN_SHIFT 24 + struct regpair lun; + __le32 itt; + __le32 ttt; + __le32 cmd_sn; + __le32 exp_stat_sn; + __le32 reserved4[4]; +}; + +/* iSCSI Text Response PDU header */ +struct iscsi_text_response_hdr { + __le16 reserved1; + u8 flags; +#define ISCSI_TEXT_RESPONSE_HDR_RSRV_MASK 0x3F +#define ISCSI_TEXT_RESPONSE_HDR_RSRV_SHIFT 0 +#define ISCSI_TEXT_RESPONSE_HDR_C_MASK 0x1 +#define ISCSI_TEXT_RESPONSE_HDR_C_SHIFT 6 +#define ISCSI_TEXT_RESPONSE_HDR_F_MASK 0x1 +#define ISCSI_TEXT_RESPONSE_HDR_F_SHIFT 7 + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_TEXT_RESPONSE_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_TEXT_RESPONSE_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_TEXT_RESPONSE_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_TEXT_RESPONSE_HDR_TOTAL_AHS_LEN_SHIFT 24 + struct regpair lun; + __le32 itt; + __le32 ttt; + __le32 stat_sn; + __le32 exp_cmd_sn; + __le32 max_cmd_sn; + __le32 reserved4[3]; +}; + +/* iSCSI TMF Request PDU header */ +struct iscsi_tmf_request_hdr { + __le16 reserved0; + u8 function; + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_TMF_REQUEST_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define 
ISCSI_TMF_REQUEST_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_TMF_REQUEST_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_TMF_REQUEST_HDR_TOTAL_AHS_LEN_SHIFT 24 + struct regpair lun; + __le32 itt; + __le32 rtt; + __le32 cmd_sn; + __le32 exp_stat_sn; + __le32 ref_cmd_sn; + __le32 exp_data_sn; + __le32 reserved4[2]; +}; + +struct iscsi_tmf_response_hdr { + u8 reserved2; + u8 hdr_response; + u8 hdr_flags; + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_TMF_RESPONSE_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_TMF_RESPONSE_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_TMF_RESPONSE_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_TMF_RESPONSE_HDR_TOTAL_AHS_LEN_SHIFT 24 + struct regpair reserved0; + __le32 itt; + __le32 reserved1; + __le32 stat_sn; + __le32 exp_cmd_sn; + __le32 max_cmd_sn; + __le32 reserved4[3]; +}; + +/* iSCSI Response PDU header */ +struct iscsi_response_hdr { + u8 hdr_status; + u8 hdr_response; + u8 hdr_flags; + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_RESPONSE_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_RESPONSE_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_RESPONSE_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_RESPONSE_HDR_TOTAL_AHS_LEN_SHIFT 24 + struct regpair lun; + __le32 itt; + __le32 snack_tag; + __le32 stat_sn; + __le32 exp_cmd_sn; + __le32 max_cmd_sn; + __le32 exp_data_sn; + __le32 bi_residual_count; + __le32 residual_count; +}; + +/* iSCSI Reject PDU header */ +struct iscsi_reject_hdr { + u8 reserved4; + u8 hdr_reason; + u8 hdr_flags; + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_REJECT_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_REJECT_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_REJECT_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_REJECT_HDR_TOTAL_AHS_LEN_SHIFT 24 + struct regpair reserved0; + __le32 all_ones; + __le32 reserved2; + __le32 stat_sn; + __le32 exp_cmd_sn; + __le32 max_cmd_sn; + __le32 data_sn; + __le32 reserved3[2]; +}; + +/* iSCSI Asynchronous Message PDU header */ +struct iscsi_async_msg_hdr { + __le16 reserved0; + u8 flags_attr; +#define ISCSI_ASYNC_MSG_HDR_RSRV_MASK 0x7F +#define ISCSI_ASYNC_MSG_HDR_RSRV_SHIFT 0 +#define ISCSI_ASYNC_MSG_HDR_CONST1_MASK 0x1 +#define ISCSI_ASYNC_MSG_HDR_CONST1_SHIFT 7 + u8 opcode; + __le32 hdr_second_dword; +#define ISCSI_ASYNC_MSG_HDR_DATA_SEG_LEN_MASK 0xFFFFFF +#define ISCSI_ASYNC_MSG_HDR_DATA_SEG_LEN_SHIFT 0 +#define ISCSI_ASYNC_MSG_HDR_TOTAL_AHS_LEN_MASK 0xFF +#define ISCSI_ASYNC_MSG_HDR_TOTAL_AHS_LEN_SHIFT 24 + struct regpair lun; + __le32 all_ones; + __le32 reserved1; + __le32 stat_sn; + __le32 exp_cmd_sn; + __le32 max_cmd_sn; + __le16 param1_rsrv; + u8 async_vcode; + u8 async_event; + __le16 param3_rsrv; + __le16 param2_rsrv; + __le32 reserved7; +}; + +/* PDU header part of Ystorm task context */ +union iscsi_task_hdr { + struct iscsi_common_hdr common; + struct data_hdr data; + struct iscsi_cmd_hdr cmd; + struct iscsi_ext_cdb_cmd_hdr ext_cdb_cmd; + struct iscsi_login_req_hdr login_req; + struct iscsi_logout_req_hdr logout_req; + struct iscsi_data_out_hdr data_out; + struct iscsi_data_in_hdr data_in; + struct iscsi_r2t_hdr r2t; + struct iscsi_nop_out_hdr nop_out; + struct iscsi_nop_in_hdr nop_in; + struct iscsi_login_response_hdr login_response; + struct iscsi_logout_response_hdr logout_response; + struct iscsi_text_request_hdr text_request; + struct iscsi_text_response_hdr text_response; + struct iscsi_tmf_request_hdr tmf_request; + struct iscsi_tmf_response_hdr tmf_response; + struct iscsi_response_hdr response; + struct iscsi_reject_hdr reject; + struct iscsi_async_msg_hdr async_msg; +}; + +/* The iscsi storm 
task context of Ystorm */ +struct ystorm_iscsi_task_st_ctx { + struct ystorm_iscsi_task_state state; + struct ystorm_iscsi_task_rxmit_opt rxmit_opt; + union iscsi_task_hdr pdu_hdr; +}; + +struct e4_ystorm_iscsi_task_ag_ctx { + u8 reserved; + u8 byte1; + __le16 word0; + u8 flags0; +#define E4_YSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_MASK 0xF +#define E4_YSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_SHIFT 0 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT0_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT0_SHIFT 4 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_VALID_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT 6 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_MASK 0x1 /* bit3 */ +#define E4_YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_SHIFT 7 + u8 flags1; +#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0_MASK 0x3 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT 0 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1_MASK 0x3 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT 2 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF2SPECIAL_MASK 0x3 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF2SPECIAL_SHIFT 4 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0EN_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0EN_SHIFT 6 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT 7 + u8 flags2; +#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT4_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT4_SHIFT 0 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT 1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 2 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT 3 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 4 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 5 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 6 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT 7 + u8 byte2; + __le32 TTT; + u8 byte3; + u8 byte4; + __le16 word1; +}; + +struct e4_mstorm_iscsi_task_ag_ctx { + u8 cdu_validation; + u8 byte1; + __le16 task_cid; + u8 flags0; +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_VALID_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT 6 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_SHIFT 7 + u8 flags1; +#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_MASK 0x3 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_SHIFT 0 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1_MASK 0x3 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT 2 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2_MASK 0x3 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2_SHIFT 4 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_EN_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_EN_SHIFT 6 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT 7 + u8 flags2; +#define 
E4_MSTORM_ISCSI_TASK_AG_CTX_CF2EN_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2EN_SHIFT 0 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT 1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 2 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT 3 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 4 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 5 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 6 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT 7 + u8 byte2; + __le32 reg0; + u8 byte3; + u8 byte4; + __le16 word1; +}; + +struct e4_ustorm_iscsi_task_ag_ctx { + u8 reserved; + u8 state; + __le16 icid; + u8 flags0; +#define E4_USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define E4_USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define E4_USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define E4_USTORM_ISCSI_TASK_AG_CTX_BIT1_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_MASK 0x3 +#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_SHIFT 6 + u8 flags1; +#define E4_USTORM_ISCSI_TASK_AG_CTX_RESERVED1_MASK 0x3 +#define E4_USTORM_ISCSI_TASK_AG_CTX_RESERVED1_SHIFT 0 +#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_MASK 0x3 +#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_SHIFT 2 +#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3_MASK 0x3 +#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3_SHIFT 4 +#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_MASK 0x3 +#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_SHIFT 6 + u8 flags2; +#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_EN_SHIFT 0 +#define E4_USTORM_ISCSI_TASK_AG_CTX_DISABLE_DATA_ACKED_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_DISABLE_DATA_ACKED_SHIFT 1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_EN_SHIFT 2 +#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3EN_SHIFT 3 +#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT 4 +#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_SHIFT 5 +#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 6 +#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_SHIFT 7 + u8 flags3; +#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 0 +#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 2 +#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT 3 +#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_TYPE_MASK 0xF +#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT 4 + __le32 dif_err_intervals; + __le32 dif_error_1st_interval; + __le32 rcv_cont_len; + __le32 
exp_cont_len; + __le32 total_data_acked; + __le32 exp_data_acked; + u8 next_tid_valid; + u8 byte3; + __le16 word1; + __le16 next_tid; + __le16 word3; + __le32 hdr_residual_count; + __le32 exp_r2t_sn; +}; + +/* The iscsi storm task context of Mstorm */ +struct mstorm_iscsi_task_st_ctx { + struct scsi_cached_sges data_desc; + struct scsi_sgl_params sgl_params; + __le32 rem_task_size; + __le32 data_buffer_offset; + u8 task_type; + struct iscsi_dif_flags dif_flags; + __le16 dif_task_icid; + struct regpair sense_db; + __le32 expected_itt; + __le32 reserved1; +}; + +struct iscsi_reg1 { + __le32 reg1_map; +#define ISCSI_REG1_NUM_SGES_MASK 0xF +#define ISCSI_REG1_NUM_SGES_SHIFT 0 +#define ISCSI_REG1_RESERVED1_MASK 0xFFFFFFF +#define ISCSI_REG1_RESERVED1_SHIFT 4 +}; + +struct tqe_opaque { + __le16 opaque[2]; +}; + +/* The iscsi storm task context of Ustorm */ +struct ustorm_iscsi_task_st_ctx { + __le32 rem_rcv_len; + __le32 exp_data_transfer_len; + __le32 exp_data_sn; + struct regpair lun; + struct iscsi_reg1 reg1; + u8 flags2; +#define USTORM_ISCSI_TASK_ST_CTX_AHS_EXIST_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_AHS_EXIST_SHIFT 0 +#define USTORM_ISCSI_TASK_ST_CTX_RESERVED1_MASK 0x7F +#define USTORM_ISCSI_TASK_ST_CTX_RESERVED1_SHIFT 1 + struct iscsi_dif_flags dif_flags; + __le16 reserved3; + struct tqe_opaque tqe_opaque_list; + __le32 reserved5; + __le32 reserved6; + __le32 reserved7; + u8 task_type; + u8 error_flags; +#define USTORM_ISCSI_TASK_ST_CTX_DATA_DIGEST_ERROR_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_DATA_DIGEST_ERROR_SHIFT 0 +#define USTORM_ISCSI_TASK_ST_CTX_DATA_TRUNCATED_ERROR_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_DATA_TRUNCATED_ERROR_SHIFT 1 +#define USTORM_ISCSI_TASK_ST_CTX_UNDER_RUN_ERROR_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_UNDER_RUN_ERROR_SHIFT 2 +#define USTORM_ISCSI_TASK_ST_CTX_RESERVED8_MASK 0x1F +#define USTORM_ISCSI_TASK_ST_CTX_RESERVED8_SHIFT 3 + u8 flags; +#define USTORM_ISCSI_TASK_ST_CTX_CQE_WRITE_MASK 0x3 +#define USTORM_ISCSI_TASK_ST_CTX_CQE_WRITE_SHIFT 0 +#define USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP_SHIFT 2 +#define USTORM_ISCSI_TASK_ST_CTX_Q0_R2TQE_WRITE_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_Q0_R2TQE_WRITE_SHIFT 3 +#define USTORM_ISCSI_TASK_ST_CTX_TOTAL_DATA_ACKED_DONE_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_TOTAL_DATA_ACKED_DONE_SHIFT 4 +#define USTORM_ISCSI_TASK_ST_CTX_HQ_SCANNED_DONE_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_HQ_SCANNED_DONE_SHIFT 5 +#define USTORM_ISCSI_TASK_ST_CTX_R2T2RECV_DONE_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_R2T2RECV_DONE_SHIFT 6 +#define USTORM_ISCSI_TASK_ST_CTX_RESERVED0_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_RESERVED0_SHIFT 7 + u8 cq_rss_number; +}; + +/* iscsi task context */ +struct e4_iscsi_task_context { + struct ystorm_iscsi_task_st_ctx ystorm_st_context; + struct e4_ystorm_iscsi_task_ag_ctx ystorm_ag_context; + struct regpair ystorm_ag_padding[2]; + struct tdif_task_context tdif_context; + struct e4_mstorm_iscsi_task_ag_ctx mstorm_ag_context; + struct regpair mstorm_ag_padding[2]; + struct e4_ustorm_iscsi_task_ag_ctx ustorm_ag_context; + struct mstorm_iscsi_task_st_ctx mstorm_st_context; + struct ustorm_iscsi_task_st_ctx ustorm_st_context; + struct rdif_task_context rdif_context; +}; + +/* iSCSI connection offload params passed by driver to FW in ISCSI offload + * ramrod. 
+ */ +struct iscsi_conn_offload_params { + struct regpair sq_pbl_addr; + struct regpair r2tq_pbl_addr; + struct regpair xhq_pbl_addr; + struct regpair uhq_pbl_addr; + __le32 initial_ack; + __le16 physical_q0; + __le16 physical_q1; + u8 flags; +#define ISCSI_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B_MASK 0x1 +#define ISCSI_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B_SHIFT 0 +#define ISCSI_CONN_OFFLOAD_PARAMS_TARGET_MODE_MASK 0x1 +#define ISCSI_CONN_OFFLOAD_PARAMS_TARGET_MODE_SHIFT 1 +#define ISCSI_CONN_OFFLOAD_PARAMS_RESTRICTED_MODE_MASK 0x1 +#define ISCSI_CONN_OFFLOAD_PARAMS_RESTRICTED_MODE_SHIFT 2 +#define ISCSI_CONN_OFFLOAD_PARAMS_RESERVED1_MASK 0x1F +#define ISCSI_CONN_OFFLOAD_PARAMS_RESERVED1_SHIFT 3 + u8 pbl_page_size_log; + u8 pbe_page_size_log; + u8 default_cq; + __le32 stat_sn; +}; + +/* iSCSI connection statistics */ +struct iscsi_conn_stats_params { + struct regpair iscsi_tcp_tx_packets_cnt; + struct regpair iscsi_tcp_tx_bytes_cnt; + struct regpair iscsi_tcp_tx_rxmit_cnt; + struct regpair iscsi_tcp_rx_packets_cnt; + struct regpair iscsi_tcp_rx_bytes_cnt; + struct regpair iscsi_tcp_rx_dup_ack_cnt; + __le32 iscsi_tcp_rx_chksum_err_cnt; + __le32 reserved; +}; + +/* spe message header */ +struct iscsi_slow_path_hdr { + u8 op_code; + u8 flags; +#define ISCSI_SLOW_PATH_HDR_RESERVED0_MASK 0xF +#define ISCSI_SLOW_PATH_HDR_RESERVED0_SHIFT 0 +#define ISCSI_SLOW_PATH_HDR_LAYER_CODE_MASK 0x7 +#define ISCSI_SLOW_PATH_HDR_LAYER_CODE_SHIFT 4 +#define ISCSI_SLOW_PATH_HDR_RESERVED1_MASK 0x1 +#define ISCSI_SLOW_PATH_HDR_RESERVED1_SHIFT 7 +}; + +/* iSCSI connection update params passed by driver to FW in ISCSI update + *ramrod. + */ +struct iscsi_conn_update_ramrod_params { + struct iscsi_slow_path_hdr hdr; + __le16 conn_id; + __le32 fw_cid; + u8 flags; +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_HD_EN_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_HD_EN_SHIFT 0 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DD_EN_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DD_EN_SHIFT 1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_INITIAL_R2T_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_INITIAL_R2T_SHIFT 2 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_IMMEDIATE_DATA_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_IMMEDIATE_DATA_SHIFT 3 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_BLOCK_SIZE_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_BLOCK_SIZE_SHIFT 4 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_ON_HOST_EN_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_ON_HOST_EN_SHIFT 5 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_ON_IMM_EN_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_ON_IMM_EN_SHIFT 6 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_LUN_MAPPER_EN_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_LUN_MAPPER_EN_SHIFT 7 + u8 reserved0[3]; + __le32 max_seq_size; + __le32 max_send_pdu_length; + __le32 max_recv_pdu_length; + __le32 first_seq_length; + __le32 exp_stat_sn; + union dif_configuration_params dif_on_imme_params; +}; + +/* iSCSI CQ element */ +struct iscsi_cqe_common { + __le16 conn_id; + u8 cqe_type; + union cqe_error_status error_bitmap; + __le32 reserved[3]; + union iscsi_task_hdr iscsi_hdr; +}; + +/* iSCSI CQ element */ +struct iscsi_cqe_solicited { + __le16 conn_id; + u8 cqe_type; + union cqe_error_status error_bitmap; + __le16 itid; + u8 task_type; + u8 fw_dbg_field; + u8 caused_conn_err; + u8 reserved0[3]; + __le32 data_truncated_bytes; + union iscsi_task_hdr iscsi_hdr; +}; + +/* iSCSI CQ element */ +struct iscsi_cqe_unsolicited { + __le16 conn_id; + u8 cqe_type; + union cqe_error_status 
error_bitmap; + __le16 reserved0; + u8 reserved1; + u8 unsol_cqe_type; + __le16 rqe_opaque; + __le16 reserved2[3]; + union iscsi_task_hdr iscsi_hdr; +}; + +/* iSCSI CQ element */ +union iscsi_cqe { + struct iscsi_cqe_common cqe_common; + struct iscsi_cqe_solicited cqe_solicited; + struct iscsi_cqe_unsolicited cqe_unsolicited; +}; + +/* iSCSI CQE type */ +enum iscsi_cqes_type { + ISCSI_CQE_TYPE_SOLICITED = 1, + ISCSI_CQE_TYPE_UNSOLICITED, + ISCSI_CQE_TYPE_SOLICITED_WITH_SENSE, + ISCSI_CQE_TYPE_TASK_CLEANUP, + ISCSI_CQE_TYPE_DUMMY, + MAX_ISCSI_CQES_TYPE +}; + +/* iSCSI CQE type */ +enum iscsi_cqe_unsolicited_type { + ISCSI_CQE_UNSOLICITED_NONE, + ISCSI_CQE_UNSOLICITED_SINGLE, + ISCSI_CQE_UNSOLICITED_FIRST, + ISCSI_CQE_UNSOLICITED_MIDDLE, + ISCSI_CQE_UNSOLICITED_LAST, + MAX_ISCSI_CQE_UNSOLICITED_TYPE +}; + +/* iscsi debug modes */ +struct iscsi_debug_modes { + u8 flags; +#define ISCSI_DEBUG_MODES_ASSERT_IF_RX_CONN_ERROR_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RX_CONN_ERROR_SHIFT 0 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_RESET_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_RESET_SHIFT 1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_FIN_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_FIN_SHIFT 2 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_CLEANUP_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_CLEANUP_SHIFT 3 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_REJECT_OR_ASYNC_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_REJECT_OR_ASYNC_SHIFT 4 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_NOP_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_NOP_SHIFT 5 +#define ISCSI_DEBUG_MODES_ASSERT_IF_DIF_OR_DATA_DIGEST_ERROR_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_DIF_OR_DATA_DIGEST_ERROR_SHIFT 6 +#define ISCSI_DEBUG_MODES_ASSERT_IF_HQ_CORRUPT_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_HQ_CORRUPT_SHIFT 7 +}; + +/* iSCSI kernel completion queue IDs */ +enum iscsi_eqe_opcode { + ISCSI_EVENT_TYPE_INIT_FUNC = 0, + ISCSI_EVENT_TYPE_DESTROY_FUNC, + ISCSI_EVENT_TYPE_OFFLOAD_CONN, + ISCSI_EVENT_TYPE_UPDATE_CONN, + ISCSI_EVENT_TYPE_CLEAR_SQ, + ISCSI_EVENT_TYPE_TERMINATE_CONN, + ISCSI_EVENT_TYPE_MAC_UPDATE_CONN, + ISCSI_EVENT_TYPE_COLLECT_STATS_CONN, + ISCSI_EVENT_TYPE_ASYN_CONNECT_COMPLETE, + ISCSI_EVENT_TYPE_ASYN_TERMINATE_DONE, + ISCSI_EVENT_TYPE_START_OF_ERROR_TYPES = 10, + ISCSI_EVENT_TYPE_ASYN_ABORT_RCVD, + ISCSI_EVENT_TYPE_ASYN_CLOSE_RCVD, + ISCSI_EVENT_TYPE_ASYN_SYN_RCVD, + ISCSI_EVENT_TYPE_ASYN_MAX_RT_TIME, + ISCSI_EVENT_TYPE_ASYN_MAX_RT_CNT, + ISCSI_EVENT_TYPE_ASYN_MAX_KA_PROBES_CNT, + ISCSI_EVENT_TYPE_ASYN_FIN_WAIT2, + ISCSI_EVENT_TYPE_ISCSI_CONN_ERROR, + ISCSI_EVENT_TYPE_TCP_CONN_ERROR, + MAX_ISCSI_EQE_OPCODE +}; + +/* iSCSI EQE and CQE completion status */ +enum iscsi_error_types { + ISCSI_STATUS_NONE = 0, + ISCSI_CQE_ERROR_UNSOLICITED_RCV_ON_INVALID_CONN = 1, + ISCSI_CONN_ERROR_TASK_CID_MISMATCH, + ISCSI_CONN_ERROR_TASK_NOT_VALID, + ISCSI_CONN_ERROR_RQ_RING_IS_FULL, + ISCSI_CONN_ERROR_CMDQ_RING_IS_FULL, + ISCSI_CONN_ERROR_HQE_CACHING_FAILED, + ISCSI_CONN_ERROR_HEADER_DIGEST_ERROR, + ISCSI_CONN_ERROR_LOCAL_COMPLETION_ERROR, + ISCSI_CONN_ERROR_DATA_OVERRUN, + ISCSI_CONN_ERROR_OUT_OF_SGES_ERROR, + ISCSI_CONN_ERROR_IP_OPTIONS_ERROR, + ISCSI_CONN_ERROR_PRS_ERRORS, + ISCSI_CONN_ERROR_CONNECT_INVALID_TCP_OPTION, + ISCSI_CONN_ERROR_TCP_IP_FRAGMENT_ERROR, + ISCSI_CONN_ERROR_PROTOCOL_ERR_AHS_LEN, + ISCSI_CONN_ERROR_PROTOCOL_ERR_AHS_TYPE, + ISCSI_CONN_ERROR_PROTOCOL_ERR_ITT_OUT_OF_RANGE, + ISCSI_CONN_ERROR_PROTOCOL_ERR_TTT_OUT_OF_RANGE, + ISCSI_CONN_ERROR_PROTOCOL_ERR_DATA_SEG_LEN_EXCEEDS_PDU_SIZE, 
+ ISCSI_CONN_ERROR_PROTOCOL_ERR_INVALID_OPCODE, + ISCSI_CONN_ERROR_PROTOCOL_ERR_INVALID_OPCODE_BEFORE_UPDATE, + ISCSI_CONN_ERROR_UNVALID_NOPIN_DSL, + ISCSI_CONN_ERROR_PROTOCOL_ERR_R2T_CARRIES_NO_DATA, + ISCSI_CONN_ERROR_PROTOCOL_ERR_DATA_SN, + ISCSI_CONN_ERROR_PROTOCOL_ERR_DATA_IN_TTT, + ISCSI_CONN_ERROR_PROTOCOL_ERR_DATA_OUT_ITT, + ISCSI_CONN_ERROR_PROTOCOL_ERR_R2T_TTT, + ISCSI_CONN_ERROR_PROTOCOL_ERR_R2T_BUFFER_OFFSET, + ISCSI_CONN_ERROR_PROTOCOL_ERR_BUFFER_OFFSET_OOO, + ISCSI_CONN_ERROR_PROTOCOL_ERR_R2T_SN, + ISCSI_CONN_ERROR_PROTOCOL_ERR_DESIRED_DATA_TRNS_LEN_0, + ISCSI_CONN_ERROR_PROTOCOL_ERR_DESIRED_DATA_TRNS_LEN_1, + ISCSI_CONN_ERROR_PROTOCOL_ERR_DESIRED_DATA_TRNS_LEN_2, + ISCSI_CONN_ERROR_PROTOCOL_ERR_LUN, + ISCSI_CONN_ERROR_PROTOCOL_ERR_F_BIT_ZERO, + ISCSI_CONN_ERROR_PROTOCOL_ERR_F_BIT_ZERO_S_BIT_ONE, + ISCSI_CONN_ERROR_PROTOCOL_ERR_EXP_STAT_SN, + ISCSI_CONN_ERROR_PROTOCOL_ERR_DSL_NOT_ZERO, + ISCSI_CONN_ERROR_PROTOCOL_ERR_INVALID_DSL, + ISCSI_CONN_ERROR_PROTOCOL_ERR_DATA_SEG_LEN_TOO_BIG, + ISCSI_CONN_ERROR_PROTOCOL_ERR_OUTSTANDING_R2T_COUNT, + ISCSI_CONN_ERROR_PROTOCOL_ERR_DIF_TX, + ISCSI_CONN_ERROR_SENSE_DATA_LENGTH, + ISCSI_CONN_ERROR_DATA_PLACEMENT_ERROR, + ISCSI_CONN_ERROR_INVALID_ITT, + ISCSI_ERROR_UNKNOWN, + MAX_ISCSI_ERROR_TYPES +}; + +/* iSCSI Ramrod Command IDs */ +enum iscsi_ramrod_cmd_id { + ISCSI_RAMROD_CMD_ID_UNUSED = 0, + ISCSI_RAMROD_CMD_ID_INIT_FUNC = 1, + ISCSI_RAMROD_CMD_ID_DESTROY_FUNC = 2, + ISCSI_RAMROD_CMD_ID_OFFLOAD_CONN = 3, + ISCSI_RAMROD_CMD_ID_UPDATE_CONN = 4, + ISCSI_RAMROD_CMD_ID_TERMINATION_CONN = 5, + ISCSI_RAMROD_CMD_ID_CLEAR_SQ = 6, + ISCSI_RAMROD_CMD_ID_MAC_UPDATE = 7, + ISCSI_RAMROD_CMD_ID_CONN_STATS = 8, + MAX_ISCSI_RAMROD_CMD_ID +}; + +/* iSCSI connection remote MAC address update request */ +struct iscsi_spe_conn_mac_update { + struct iscsi_slow_path_hdr hdr; + __le16 conn_id; + __le32 fw_cid; + __le16 remote_mac_addr_lo; + __le16 remote_mac_addr_mid; + __le16 remote_mac_addr_hi; + u8 reserved0[2]; +}; + +/* iSCSI and TCP connection (Option 1) offload params passed by driver to FW in + * iSCSI offload ramrod. + */ +struct iscsi_spe_conn_offload { + struct iscsi_slow_path_hdr hdr; + __le16 conn_id; + __le32 fw_cid; + struct iscsi_conn_offload_params iscsi; + struct tcp_offload_params tcp; +}; + +/* iSCSI and TCP connection (Option 2) offload params passed by driver to FW in + * iSCSI offload ramrod. 
+ */ +struct iscsi_spe_conn_offload_option2 { + struct iscsi_slow_path_hdr hdr; + __le16 conn_id; + __le32 fw_cid; + struct iscsi_conn_offload_params iscsi; + struct tcp_offload_params_opt2 tcp; +}; + +/* iSCSI collect connection statistics request */ +struct iscsi_spe_conn_statistics { + struct iscsi_slow_path_hdr hdr; + __le16 conn_id; + __le32 fw_cid; + u8 reset_stats; + u8 reserved0[7]; + struct regpair stats_cnts_addr; +}; + +/* iSCSI connection termination request */ +struct iscsi_spe_conn_termination { + struct iscsi_slow_path_hdr hdr; + __le16 conn_id; + __le32 fw_cid; + u8 abortive; + u8 reserved0[7]; + struct regpair queue_cnts_addr; + struct regpair query_params_addr; +}; + +/* iSCSI firmware function destroy parameters */ +struct iscsi_spe_func_dstry { + struct iscsi_slow_path_hdr hdr; + __le16 reserved0; + __le32 reserved1; +}; + +/* iSCSI firmware function init parameters */ +struct iscsi_spe_func_init { + struct iscsi_slow_path_hdr hdr; + __le16 half_way_close_timeout; + u8 num_sq_pages_in_ring; + u8 num_r2tq_pages_in_ring; + u8 num_uhq_pages_in_ring; + u8 ll2_rx_queue_id; + u8 flags; +#define ISCSI_SPE_FUNC_INIT_COUNTERS_EN_MASK 0x1 +#define ISCSI_SPE_FUNC_INIT_COUNTERS_EN_SHIFT 0 +#define ISCSI_SPE_FUNC_INIT_RESERVED0_MASK 0x7F +#define ISCSI_SPE_FUNC_INIT_RESERVED0_SHIFT 1 + struct iscsi_debug_modes debug_mode; + __le16 reserved1; + __le32 reserved2; + struct scsi_init_func_params func_params; + struct scsi_init_func_queues q_params; +}; + +/* iSCSI task type */ +enum iscsi_task_type { + ISCSI_TASK_TYPE_INITIATOR_WRITE, + ISCSI_TASK_TYPE_INITIATOR_READ, + ISCSI_TASK_TYPE_MIDPATH, + ISCSI_TASK_TYPE_UNSOLIC, + ISCSI_TASK_TYPE_EXCHCLEANUP, + ISCSI_TASK_TYPE_IRRELEVANT, + ISCSI_TASK_TYPE_TARGET_WRITE, + ISCSI_TASK_TYPE_TARGET_READ, + ISCSI_TASK_TYPE_TARGET_RESPONSE, + ISCSI_TASK_TYPE_LOGIN_RESPONSE, + ISCSI_TASK_TYPE_TARGET_IMM_W_DIF, + MAX_ISCSI_TASK_TYPE +}; + +/* iSCSI DesiredDataTransferLength/ttt union */ +union iscsi_ttt_txlen_union { + __le32 desired_tx_len; + __le32 ttt; +}; + +/* iSCSI uHQ element */ +struct iscsi_uhqe { + __le32 reg1; +#define ISCSI_UHQE_PDU_PAYLOAD_LEN_MASK 0xFFFFF +#define ISCSI_UHQE_PDU_PAYLOAD_LEN_SHIFT 0 +#define ISCSI_UHQE_LOCAL_COMP_MASK 0x1 +#define ISCSI_UHQE_LOCAL_COMP_SHIFT 20 +#define ISCSI_UHQE_TOGGLE_BIT_MASK 0x1 +#define ISCSI_UHQE_TOGGLE_BIT_SHIFT 21 +#define ISCSI_UHQE_PURE_PAYLOAD_MASK 0x1 +#define ISCSI_UHQE_PURE_PAYLOAD_SHIFT 22 +#define ISCSI_UHQE_LOGIN_RESPONSE_PDU_MASK 0x1 +#define ISCSI_UHQE_LOGIN_RESPONSE_PDU_SHIFT 23 +#define ISCSI_UHQE_TASK_ID_HI_MASK 0xFF +#define ISCSI_UHQE_TASK_ID_HI_SHIFT 24 + __le32 reg2; +#define ISCSI_UHQE_BUFFER_OFFSET_MASK 0xFFFFFF +#define ISCSI_UHQE_BUFFER_OFFSET_SHIFT 0 +#define ISCSI_UHQE_TASK_ID_LO_MASK 0xFF +#define ISCSI_UHQE_TASK_ID_LO_SHIFT 24 +}; + +/* iSCSI WQ element */ +struct iscsi_wqe { + __le16 task_id; + u8 flags; +#define ISCSI_WQE_WQE_TYPE_MASK 0x7 +#define ISCSI_WQE_WQE_TYPE_SHIFT 0 +#define ISCSI_WQE_NUM_SGES_MASK 0xF +#define ISCSI_WQE_NUM_SGES_SHIFT 3 +#define ISCSI_WQE_RESPONSE_MASK 0x1 +#define ISCSI_WQE_RESPONSE_SHIFT 7 + struct iscsi_dif_flags prot_flags; + __le32 contlen_cdbsize; +#define ISCSI_WQE_CONT_LEN_MASK 0xFFFFFF +#define ISCSI_WQE_CONT_LEN_SHIFT 0 +#define ISCSI_WQE_CDB_SIZE_MASK 0xFF +#define ISCSI_WQE_CDB_SIZE_SHIFT 24 +}; + +/* iSCSI wqe type */ +enum iscsi_wqe_type { + ISCSI_WQE_TYPE_NORMAL, + ISCSI_WQE_TYPE_TASK_CLEANUP, + ISCSI_WQE_TYPE_MIDDLE_PATH, + ISCSI_WQE_TYPE_LOGIN, + ISCSI_WQE_TYPE_FIRST_R2T_CONT, + ISCSI_WQE_TYPE_NONFIRST_R2T_CONT, + 
ISCSI_WQE_TYPE_RESPONSE, + MAX_ISCSI_WQE_TYPE +}; + +/* iSCSI xHQ element */ +struct iscsi_xhqe { + union iscsi_ttt_txlen_union ttt_or_txlen; + __le32 exp_stat_sn; + struct iscsi_dif_flags prot_flags; + u8 total_ahs_length; + u8 opcode; + u8 flags; +#define ISCSI_XHQE_FINAL_MASK 0x1 +#define ISCSI_XHQE_FINAL_SHIFT 0 +#define ISCSI_XHQE_STATUS_BIT_MASK 0x1 +#define ISCSI_XHQE_STATUS_BIT_SHIFT 1 +#define ISCSI_XHQE_NUM_SGES_MASK 0xF +#define ISCSI_XHQE_NUM_SGES_SHIFT 2 +#define ISCSI_XHQE_RESERVED0_MASK 0x3 +#define ISCSI_XHQE_RESERVED0_SHIFT 6 + union iscsi_seq_num seq_num; + __le16 reserved1; +}; + +/* Per PF iSCSI receive path statistics - mStorm RAM structure */ +struct mstorm_iscsi_stats_drv { + struct regpair iscsi_rx_dropped_pdus_task_not_valid; + struct regpair iscsi_rx_dup_ack_cnt; +}; + +/* Per PF iSCSI transmit path statistics - pStorm RAM structure */ +struct pstorm_iscsi_stats_drv { + struct regpair iscsi_tx_bytes_cnt; + struct regpair iscsi_tx_packet_cnt; +}; + +/* Per PF iSCSI receive path statistics - tStorm RAM structure */ +struct tstorm_iscsi_stats_drv { + struct regpair iscsi_rx_bytes_cnt; + struct regpair iscsi_rx_packet_cnt; + struct regpair iscsi_rx_new_ooo_isle_events_cnt; + struct regpair iscsi_rx_tcp_payload_bytes_cnt; + struct regpair iscsi_rx_tcp_pkt_cnt; + struct regpair iscsi_rx_pure_ack_cnt; + __le32 iscsi_cmdq_threshold_cnt; + __le32 iscsi_rq_threshold_cnt; + __le32 iscsi_immq_threshold_cnt; +}; + +/* Per PF iSCSI receive path statistics - uStorm RAM structure */ +struct ustorm_iscsi_stats_drv { + struct regpair iscsi_rx_data_pdu_cnt; + struct regpair iscsi_rx_r2t_pdu_cnt; + struct regpair iscsi_rx_total_pdu_cnt; +}; + +/* Per PF iSCSI transmit path statistics - xStorm RAM structure */ +struct xstorm_iscsi_stats_drv { + struct regpair iscsi_tx_go_to_slow_start_event_cnt; + struct regpair iscsi_tx_fast_retransmit_event_cnt; + struct regpair iscsi_tx_pure_ack_cnt; + struct regpair iscsi_tx_delayed_ack_cnt; +}; + +/* Per PF iSCSI transmit path statistics - yStorm RAM structure */ +struct ystorm_iscsi_stats_drv { + struct regpair iscsi_tx_data_pdu_cnt; + struct regpair iscsi_tx_r2t_pdu_cnt; + struct regpair iscsi_tx_total_pdu_cnt; + struct regpair iscsi_tx_tcp_payload_bytes_cnt; + struct regpair iscsi_tx_tcp_pkt_cnt; +}; + +struct e4_tstorm_iscsi_task_ag_ctx { + u8 byte0; + u8 byte1; + __le16 word0; + u8 flags0; +#define E4_TSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_MASK 0xF +#define E4_TSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_SHIFT 0 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT0_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT0_SHIFT 4 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT2_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT2_SHIFT 6 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT3_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT3_SHIFT 7 + u8 flags1; +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT4_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT4_SHIFT 0 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT5_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT5_SHIFT 1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0_MASK 0x3 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT 2 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1_MASK 0x3 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT 4 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2_MASK 0x3 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2_SHIFT 6 + u8 flags2; +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3_MASK 0x3 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3_SHIFT 0 +#define 
E4_TSTORM_ISCSI_TASK_AG_CTX_CF4_MASK 0x3 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4_SHIFT 2 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5_MASK 0x3 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5_SHIFT 4 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6_MASK 0x3 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6_SHIFT 6 + u8 flags3; +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7_MASK 0x3 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7_SHIFT 0 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0EN_SHIFT 2 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT 3 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2EN_SHIFT 4 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3EN_SHIFT 5 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4EN_SHIFT 6 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5EN_SHIFT 7 + u8 flags4; +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6EN_SHIFT 0 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7EN_SHIFT 1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT 2 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 3 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT 4 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 5 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 6 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 7 + u8 byte2; + __le16 word1; + __le32 reg0; + u8 byte3; + u8 byte4; + __le16 word2; + __le16 word3; + __le16 word4; + __le32 reg1; + __le32 reg2; +}; + +/* iSCSI doorbell data */ +struct iscsi_db_data { + u8 params; +#define ISCSI_DB_DATA_DEST_MASK 0x3 +#define ISCSI_DB_DATA_DEST_SHIFT 0 +#define ISCSI_DB_DATA_AGG_CMD_MASK 0x3 +#define ISCSI_DB_DATA_AGG_CMD_SHIFT 2 +#define ISCSI_DB_DATA_BYPASS_EN_MASK 0x1 +#define ISCSI_DB_DATA_BYPASS_EN_SHIFT 4 +#define ISCSI_DB_DATA_RESERVED_MASK 0x1 +#define ISCSI_DB_DATA_RESERVED_SHIFT 5 +#define ISCSI_DB_DATA_AGG_VAL_SEL_MASK 0x3 +#define ISCSI_DB_DATA_AGG_VAL_SEL_SHIFT 6 + u8 agg_flags; + __le16 sq_prod; +}; + +#endif /* __ISCSI_COMMON__ */ diff --git a/include/linux/qed/iwarp_common.h b/include/linux/qed/iwarp_common.h new file mode 100644 index 0000000..c6cfd39 --- /dev/null +++ b/include/linux/qed/iwarp_common.h @@ -0,0 +1,56 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __IWARP_COMMON__ +#define __IWARP_COMMON__ + +#include + +/************************/ +/* IWARP FW CONSTANTS */ +/************************/ + +#define IWARP_ACTIVE_MODE 0 +#define IWARP_PASSIVE_MODE 1 + +#define IWARP_SHARED_QUEUE_PAGE_SIZE (0x8000) +#define IWARP_SHARED_QUEUE_PAGE_RQ_PBL_OFFSET (0x4000) +#define IWARP_SHARED_QUEUE_PAGE_RQ_PBL_MAX_SIZE (0x1000) +#define IWARP_SHARED_QUEUE_PAGE_SQ_PBL_OFFSET (0x5000) +#define IWARP_SHARED_QUEUE_PAGE_SQ_PBL_MAX_SIZE (0x3000) + +#define IWARP_REQ_MAX_INLINE_DATA_SIZE (128) +#define IWARP_REQ_MAX_SINGLE_SQ_WQE_SIZE (176) + +#define IWARP_MAX_QPS (64 * 1024) + +#endif /* __IWARP_COMMON__ */ diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h new file mode 100644 index 0000000..59ddf9a --- /dev/null +++ b/include/linux/qed/qed_chain.h @@ -0,0 +1,692 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _QED_CHAIN_H +#define _QED_CHAIN_H + +#include +#include +#include +#include +#include +#include + +enum qed_chain_mode { + /* Each Page contains a next pointer at its end */ + QED_CHAIN_MODE_NEXT_PTR, + + /* Chain is a single page (next ptr) is unrequired */ + QED_CHAIN_MODE_SINGLE, + + /* Page pointers are located in a side list */ + QED_CHAIN_MODE_PBL, +}; + +enum qed_chain_use_mode { + QED_CHAIN_USE_TO_PRODUCE, /* Chain starts empty */ + QED_CHAIN_USE_TO_CONSUME, /* Chain starts full */ + QED_CHAIN_USE_TO_CONSUME_PRODUCE, /* Chain starts empty */ +}; + +enum qed_chain_cnt_type { + /* The chain's size/prod/cons are kept in 16-bit variables */ + QED_CHAIN_CNT_TYPE_U16, + + /* The chain's size/prod/cons are kept in 32-bit variables */ + QED_CHAIN_CNT_TYPE_U32, +}; + +struct qed_chain_next { + struct regpair next_phys; + void *next_virt; +}; + +struct qed_chain_pbl_u16 { + u16 prod_page_idx; + u16 cons_page_idx; +}; + +struct qed_chain_pbl_u32 { + u32 prod_page_idx; + u32 cons_page_idx; +}; + +struct qed_chain_ext_pbl { + dma_addr_t p_pbl_phys; + void *p_pbl_virt; +}; + +struct qed_chain_u16 { + /* Cyclic index of next element to produce/consme */ + u16 prod_idx; + u16 cons_idx; +}; + +struct qed_chain_u32 { + /* Cyclic index of next element to produce/consme */ + u32 prod_idx; + u32 cons_idx; +}; + +struct qed_chain { + /* fastpath portion of the chain - required for commands such + * as produce / consume. + */ + /* Point to next element to produce/consume */ + void *p_prod_elem; + void *p_cons_elem; + + /* Fastpath portions of the PBL [if exists] */ + struct { + /* Table for keeping the virtual addresses of the chain pages, + * respectively to the physical addresses in the pbl table. + */ + void **pp_virt_addr_tbl; + + union { + struct qed_chain_pbl_u16 u16; + struct qed_chain_pbl_u32 u32; + } c; + } pbl; + + union { + struct qed_chain_u16 chain16; + struct qed_chain_u32 chain32; + } u; + + /* Capacity counts only usable elements */ + u32 capacity; + u32 page_cnt; + + enum qed_chain_mode mode; + + /* Elements information for fast calculations */ + u16 elem_per_page; + u16 elem_per_page_mask; + u16 elem_size; + u16 next_page_mask; + u16 usable_per_page; + u8 elem_unusable; + + u8 cnt_type; + + /* Slowpath of the chain - required for initialization and destruction, + * but isn't involved in regular functionality. + */ + + /* Base address of a pre-allocated buffer for pbl */ + struct { + dma_addr_t p_phys_table; + void *p_virt_table; + } pbl_sp; + + /* Address of first page of the chain - the address is required + * for fastpath operation [consume/produce] but only for the the SINGLE + * flavour which isn't considered fastpath [== SPQ]. + */ + void *p_virt_addr; + dma_addr_t p_phys_addr; + + /* Total number of elements [for entire chain] */ + u32 size; + + u8 intended_use; + + bool b_external_pbl; +}; + +#define QED_CHAIN_PBL_ENTRY_SIZE (8) +#define QED_CHAIN_PAGE_SIZE (0x1000) +#define ELEMS_PER_PAGE(elem_size) (QED_CHAIN_PAGE_SIZE / (elem_size)) + +#define UNUSABLE_ELEMS_PER_PAGE(elem_size, mode) \ + (((mode) == QED_CHAIN_MODE_NEXT_PTR) ? 
\ + (u8)(1 + ((sizeof(struct qed_chain_next) - 1) / \ + (elem_size))) : 0) + +#define USABLE_ELEMS_PER_PAGE(elem_size, mode) \ + ((u32)(ELEMS_PER_PAGE(elem_size) - \ + UNUSABLE_ELEMS_PER_PAGE(elem_size, mode))) + +#define QED_CHAIN_PAGE_CNT(elem_cnt, elem_size, mode) \ + DIV_ROUND_UP(elem_cnt, USABLE_ELEMS_PER_PAGE(elem_size, mode)) + +#define is_chain_u16(p) ((p)->cnt_type == QED_CHAIN_CNT_TYPE_U16) +#define is_chain_u32(p) ((p)->cnt_type == QED_CHAIN_CNT_TYPE_U32) + +/* Accessors */ +static inline u16 qed_chain_get_prod_idx(struct qed_chain *p_chain) +{ + return p_chain->u.chain16.prod_idx; +} + +static inline u16 qed_chain_get_cons_idx(struct qed_chain *p_chain) +{ + return p_chain->u.chain16.cons_idx; +} + +static inline u32 qed_chain_get_cons_idx_u32(struct qed_chain *p_chain) +{ + return p_chain->u.chain32.cons_idx; +} + +static inline u16 qed_chain_get_elem_left(struct qed_chain *p_chain) +{ + u16 used; + + used = (u16) (((u32)0x10000 + + (u32)p_chain->u.chain16.prod_idx) - + (u32)p_chain->u.chain16.cons_idx); + if (p_chain->mode == QED_CHAIN_MODE_NEXT_PTR) + used -= p_chain->u.chain16.prod_idx / p_chain->elem_per_page - + p_chain->u.chain16.cons_idx / p_chain->elem_per_page; + + return (u16)(p_chain->capacity - used); +} + +static inline u32 qed_chain_get_elem_left_u32(struct qed_chain *p_chain) +{ + u32 used; + + used = (u32) (((u64)0x100000000ULL + + (u64)p_chain->u.chain32.prod_idx) - + (u64)p_chain->u.chain32.cons_idx); + if (p_chain->mode == QED_CHAIN_MODE_NEXT_PTR) + used -= p_chain->u.chain32.prod_idx / p_chain->elem_per_page - + p_chain->u.chain32.cons_idx / p_chain->elem_per_page; + + return p_chain->capacity - used; +} + +static inline u16 qed_chain_get_usable_per_page(struct qed_chain *p_chain) +{ + return p_chain->usable_per_page; +} + +static inline u8 qed_chain_get_unusable_per_page(struct qed_chain *p_chain) +{ + return p_chain->elem_unusable; +} + +static inline u32 qed_chain_get_page_cnt(struct qed_chain *p_chain) +{ + return p_chain->page_cnt; +} + +static inline dma_addr_t qed_chain_get_pbl_phys(struct qed_chain *p_chain) +{ + return p_chain->pbl_sp.p_phys_table; +} + +/** + * @brief qed_chain_advance_page - + * + * Advance the next element accros pages for a linked chain + * + * @param p_chain + * @param p_next_elem + * @param idx_to_inc + * @param page_to_inc + */ +static inline void +qed_chain_advance_page(struct qed_chain *p_chain, + void **p_next_elem, void *idx_to_inc, void *page_to_inc) +{ + struct qed_chain_next *p_next = NULL; + u32 page_index = 0; + + switch (p_chain->mode) { + case QED_CHAIN_MODE_NEXT_PTR: + p_next = *p_next_elem; + *p_next_elem = p_next->next_virt; + if (is_chain_u16(p_chain)) + *(u16 *)idx_to_inc += p_chain->elem_unusable; + else + *(u32 *)idx_to_inc += p_chain->elem_unusable; + break; + case QED_CHAIN_MODE_SINGLE: + *p_next_elem = p_chain->p_virt_addr; + break; + + case QED_CHAIN_MODE_PBL: + if (is_chain_u16(p_chain)) { + if (++(*(u16 *)page_to_inc) == p_chain->page_cnt) + *(u16 *)page_to_inc = 0; + page_index = *(u16 *)page_to_inc; + } else { + if (++(*(u32 *)page_to_inc) == p_chain->page_cnt) + *(u32 *)page_to_inc = 0; + page_index = *(u32 *)page_to_inc; + } + *p_next_elem = p_chain->pbl.pp_virt_addr_tbl[page_index]; + } +} + +#define is_unusable_idx(p, idx) \ + (((p)->u.chain16.idx & (p)->elem_per_page_mask) == (p)->usable_per_page) + +#define is_unusable_idx_u32(p, idx) \ + (((p)->u.chain32.idx & (p)->elem_per_page_mask) == (p)->usable_per_page) +#define is_unusable_next_idx(p, idx) \ + ((((p)->u.chain16.idx + 1) & 
(p)->elem_per_page_mask) == \ + (p)->usable_per_page) + +#define is_unusable_next_idx_u32(p, idx) \ + ((((p)->u.chain32.idx + 1) & (p)->elem_per_page_mask) == \ + (p)->usable_per_page) + +#define test_and_skip(p, idx) \ + do { \ + if (is_chain_u16(p)) { \ + if (is_unusable_idx(p, idx)) \ + (p)->u.chain16.idx += (p)->elem_unusable; \ + } else { \ + if (is_unusable_idx_u32(p, idx)) \ + (p)->u.chain32.idx += (p)->elem_unusable; \ + } \ + } while (0) + +/** + * @brief qed_chain_return_produced - + * + * A chain in which the driver "Produces" elements should use this API + * to indicate previous produced elements are now consumed. + * + * @param p_chain + */ +static inline void qed_chain_return_produced(struct qed_chain *p_chain) +{ + if (is_chain_u16(p_chain)) + p_chain->u.chain16.cons_idx++; + else + p_chain->u.chain32.cons_idx++; + test_and_skip(p_chain, cons_idx); +} + +/** + * @brief qed_chain_produce - + * + * A chain in which the driver "Produces" elements should use this to get + * a pointer to the next element which can be "Produced". It's driver + * responsibility to validate that the chain has room for new element. + * + * @param p_chain + * + * @return void*, a pointer to next element + */ +static inline void *qed_chain_produce(struct qed_chain *p_chain) +{ + void *p_ret = NULL, *p_prod_idx, *p_prod_page_idx; + + if (is_chain_u16(p_chain)) { + if ((p_chain->u.chain16.prod_idx & + p_chain->elem_per_page_mask) == p_chain->next_page_mask) { + p_prod_idx = &p_chain->u.chain16.prod_idx; + p_prod_page_idx = &p_chain->pbl.c.u16.prod_page_idx; + qed_chain_advance_page(p_chain, &p_chain->p_prod_elem, + p_prod_idx, p_prod_page_idx); + } + p_chain->u.chain16.prod_idx++; + } else { + if ((p_chain->u.chain32.prod_idx & + p_chain->elem_per_page_mask) == p_chain->next_page_mask) { + p_prod_idx = &p_chain->u.chain32.prod_idx; + p_prod_page_idx = &p_chain->pbl.c.u32.prod_page_idx; + qed_chain_advance_page(p_chain, &p_chain->p_prod_elem, + p_prod_idx, p_prod_page_idx); + } + p_chain->u.chain32.prod_idx++; + } + + p_ret = p_chain->p_prod_elem; + p_chain->p_prod_elem = (void *)(((u8 *)p_chain->p_prod_elem) + + p_chain->elem_size); + + return p_ret; +} + +/** + * @brief qed_chain_get_capacity - + * + * Get the maximum number of BDs in chain + * + * @param p_chain + * @param num + * + * @return number of unusable BDs + */ +static inline u32 qed_chain_get_capacity(struct qed_chain *p_chain) +{ + return p_chain->capacity; +} + +/** + * @brief qed_chain_recycle_consumed - + * + * Returns an element which was previously consumed; + * Increments producers so they could be written to FW. + * + * @param p_chain + */ +static inline void qed_chain_recycle_consumed(struct qed_chain *p_chain) +{ + test_and_skip(p_chain, prod_idx); + if (is_chain_u16(p_chain)) + p_chain->u.chain16.prod_idx++; + else + p_chain->u.chain32.prod_idx++; +} + +/** + * @brief qed_chain_consume - + * + * A Chain in which the driver utilizes data written by a different source + * (i.e., FW) should use this to access passed buffers. 
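+ *
+ * One possible consumer-side pattern (sketch only, not taken from the
+ * driver; "ring" is a hypothetical chain of FW-written elements, "hw_cons"
+ * a driver-specific hardware consumer index, e.g. read from a status
+ * block, and handle_elem() stands for the driver's own processing):
+ *
+ *	while (qed_chain_get_cons_idx(&ring) != hw_cons) {
+ *		void *elem = qed_chain_consume(&ring);
+ *
+ *		handle_elem(elem);
+ *		qed_chain_recycle_consumed(&ring);
+ *	}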
+ * + * @param p_chain + * + * @return void*, a pointer to the next buffer written + */ +static inline void *qed_chain_consume(struct qed_chain *p_chain) +{ + void *p_ret = NULL, *p_cons_idx, *p_cons_page_idx; + + if (is_chain_u16(p_chain)) { + if ((p_chain->u.chain16.cons_idx & + p_chain->elem_per_page_mask) == p_chain->next_page_mask) { + p_cons_idx = &p_chain->u.chain16.cons_idx; + p_cons_page_idx = &p_chain->pbl.c.u16.cons_page_idx; + qed_chain_advance_page(p_chain, &p_chain->p_cons_elem, + p_cons_idx, p_cons_page_idx); + } + p_chain->u.chain16.cons_idx++; + } else { + if ((p_chain->u.chain32.cons_idx & + p_chain->elem_per_page_mask) == p_chain->next_page_mask) { + p_cons_idx = &p_chain->u.chain32.cons_idx; + p_cons_page_idx = &p_chain->pbl.c.u32.cons_page_idx; + qed_chain_advance_page(p_chain, &p_chain->p_cons_elem, + p_cons_idx, p_cons_page_idx); + } + p_chain->u.chain32.cons_idx++; + } + + p_ret = p_chain->p_cons_elem; + p_chain->p_cons_elem = (void *)(((u8 *)p_chain->p_cons_elem) + + p_chain->elem_size); + + return p_ret; +} + +/** + * @brief qed_chain_reset - Resets the chain to its start state + * + * @param p_chain pointer to a previously allocted chain + */ +static inline void qed_chain_reset(struct qed_chain *p_chain) +{ + u32 i; + + if (is_chain_u16(p_chain)) { + p_chain->u.chain16.prod_idx = 0; + p_chain->u.chain16.cons_idx = 0; + } else { + p_chain->u.chain32.prod_idx = 0; + p_chain->u.chain32.cons_idx = 0; + } + p_chain->p_cons_elem = p_chain->p_virt_addr; + p_chain->p_prod_elem = p_chain->p_virt_addr; + + if (p_chain->mode == QED_CHAIN_MODE_PBL) { + /* Use (page_cnt - 1) as a reset value for the prod/cons page's + * indices, to avoid unnecessary page advancing on the first + * call to qed_chain_produce/consume. Instead, the indices + * will be advanced to page_cnt and then will be wrapped to 0. 
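+ * For example, with page_cnt == 4 the page indices are reset to 3; the
+ * first advance increments them to 4, which equals page_cnt and is
+ * immediately wrapped to 0, i.e. the first page.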
+ */ + u32 reset_val = p_chain->page_cnt - 1; + + if (is_chain_u16(p_chain)) { + p_chain->pbl.c.u16.prod_page_idx = (u16)reset_val; + p_chain->pbl.c.u16.cons_page_idx = (u16)reset_val; + } else { + p_chain->pbl.c.u32.prod_page_idx = reset_val; + p_chain->pbl.c.u32.cons_page_idx = reset_val; + } + } + + switch (p_chain->intended_use) { + case QED_CHAIN_USE_TO_CONSUME: + /* produce empty elements */ + for (i = 0; i < p_chain->capacity; i++) + qed_chain_recycle_consumed(p_chain); + break; + + case QED_CHAIN_USE_TO_CONSUME_PRODUCE: + case QED_CHAIN_USE_TO_PRODUCE: + default: + /* Do nothing */ + break; + } +} + +/** + * @brief qed_chain_init - Initalizes a basic chain struct + * + * @param p_chain + * @param p_virt_addr + * @param p_phys_addr physical address of allocated buffer's beginning + * @param page_cnt number of pages in the allocated buffer + * @param elem_size size of each element in the chain + * @param intended_use + * @param mode + */ +static inline void qed_chain_init_params(struct qed_chain *p_chain, + u32 page_cnt, + u8 elem_size, + enum qed_chain_use_mode intended_use, + enum qed_chain_mode mode, + enum qed_chain_cnt_type cnt_type) +{ + /* chain fixed parameters */ + p_chain->p_virt_addr = NULL; + p_chain->p_phys_addr = 0; + p_chain->elem_size = elem_size; + p_chain->intended_use = (u8)intended_use; + p_chain->mode = mode; + p_chain->cnt_type = (u8)cnt_type; + + p_chain->elem_per_page = ELEMS_PER_PAGE(elem_size); + p_chain->usable_per_page = USABLE_ELEMS_PER_PAGE(elem_size, mode); + p_chain->elem_per_page_mask = p_chain->elem_per_page - 1; + p_chain->elem_unusable = UNUSABLE_ELEMS_PER_PAGE(elem_size, mode); + p_chain->next_page_mask = (p_chain->usable_per_page & + p_chain->elem_per_page_mask); + + p_chain->page_cnt = page_cnt; + p_chain->capacity = p_chain->usable_per_page * page_cnt; + p_chain->size = p_chain->elem_per_page * page_cnt; + + p_chain->pbl_sp.p_phys_table = 0; + p_chain->pbl_sp.p_virt_table = NULL; + p_chain->pbl.pp_virt_addr_tbl = NULL; +} + +/** + * @brief qed_chain_init_mem - + * + * Initalizes a basic chain struct with its chain buffers + * + * @param p_chain + * @param p_virt_addr virtual address of allocated buffer's beginning + * @param p_phys_addr physical address of allocated buffer's beginning + * + */ +static inline void qed_chain_init_mem(struct qed_chain *p_chain, + void *p_virt_addr, dma_addr_t p_phys_addr) +{ + p_chain->p_virt_addr = p_virt_addr; + p_chain->p_phys_addr = p_phys_addr; +} + +/** + * @brief qed_chain_init_pbl_mem - + * + * Initalizes a basic chain struct with its pbl buffers + * + * @param p_chain + * @param p_virt_pbl pointer to a pre allocated side table which will hold + * virtual page addresses. + * @param p_phys_pbl pointer to a pre-allocated side table which will hold + * physical page addresses. + * @param pp_virt_addr_tbl + * pointer to a pre-allocated side table which will hold + * the virtual addresses of the chain pages. 
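+ *
+ * Illustrative initialization order for a PBL chain (sketch only; the DMA
+ * allocation of "virt"/"phys", "pbl_virt"/"pbl_phys" and the page address
+ * table "vtbl" is assumed to be done by the caller):
+ *
+ *	qed_chain_init_params(&chain, page_cnt, elem_size,
+ *			      QED_CHAIN_USE_TO_CONSUME_PRODUCE,
+ *			      QED_CHAIN_MODE_PBL, QED_CHAIN_CNT_TYPE_U16);
+ *	qed_chain_init_mem(&chain, virt, phys);
+ *	qed_chain_init_pbl_mem(&chain, pbl_virt, pbl_phys, vtbl);
+ *	qed_chain_reset(&chain);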
+ * + */ +static inline void qed_chain_init_pbl_mem(struct qed_chain *p_chain, + void *p_virt_pbl, + dma_addr_t p_phys_pbl, + void **pp_virt_addr_tbl) +{ + p_chain->pbl_sp.p_phys_table = p_phys_pbl; + p_chain->pbl_sp.p_virt_table = p_virt_pbl; + p_chain->pbl.pp_virt_addr_tbl = pp_virt_addr_tbl; +} + +/** + * @brief qed_chain_init_next_ptr_elem - + * + * Initalizes a next pointer element + * + * @param p_chain + * @param p_virt_curr virtual address of a chain page of which the next + * pointer element is initialized + * @param p_virt_next virtual address of the next chain page + * @param p_phys_next physical address of the next chain page + * + */ +static inline void +qed_chain_init_next_ptr_elem(struct qed_chain *p_chain, + void *p_virt_curr, + void *p_virt_next, dma_addr_t p_phys_next) +{ + struct qed_chain_next *p_next; + u32 size; + + size = p_chain->elem_size * p_chain->usable_per_page; + p_next = (struct qed_chain_next *)((u8 *)p_virt_curr + size); + + DMA_REGPAIR_LE(p_next->next_phys, p_phys_next); + + p_next->next_virt = p_virt_next; +} + +/** + * @brief qed_chain_get_last_elem - + * + * Returns a pointer to the last element of the chain + * + * @param p_chain + * + * @return void* + */ +static inline void *qed_chain_get_last_elem(struct qed_chain *p_chain) +{ + struct qed_chain_next *p_next = NULL; + void *p_virt_addr = NULL; + u32 size, last_page_idx; + + if (!p_chain->p_virt_addr) + goto out; + + switch (p_chain->mode) { + case QED_CHAIN_MODE_NEXT_PTR: + size = p_chain->elem_size * p_chain->usable_per_page; + p_virt_addr = p_chain->p_virt_addr; + p_next = (struct qed_chain_next *)((u8 *)p_virt_addr + size); + while (p_next->next_virt != p_chain->p_virt_addr) { + p_virt_addr = p_next->next_virt; + p_next = (struct qed_chain_next *)((u8 *)p_virt_addr + + size); + } + break; + case QED_CHAIN_MODE_SINGLE: + p_virt_addr = p_chain->p_virt_addr; + break; + case QED_CHAIN_MODE_PBL: + last_page_idx = p_chain->page_cnt - 1; + p_virt_addr = p_chain->pbl.pp_virt_addr_tbl[last_page_idx]; + break; + } + /* p_virt_addr points at this stage to the last page of the chain */ + size = p_chain->elem_size * (p_chain->usable_per_page - 1); + p_virt_addr = (u8 *)p_virt_addr + size; +out: + return p_virt_addr; +} + +/** + * @brief qed_chain_set_prod - sets the prod to the given value + * + * @param prod_idx + * @param p_prod_elem + */ +static inline void qed_chain_set_prod(struct qed_chain *p_chain, + u32 prod_idx, void *p_prod_elem) +{ + if (is_chain_u16(p_chain)) + p_chain->u.chain16.prod_idx = (u16) prod_idx; + else + p_chain->u.chain32.prod_idx = prod_idx; + p_chain->p_prod_elem = p_prod_elem; +} + +/** + * @brief qed_chain_pbl_zero_mem - set chain memory to 0 + * + * @param p_chain + */ +static inline void qed_chain_pbl_zero_mem(struct qed_chain *p_chain) +{ + u32 i, page_cnt; + + if (p_chain->mode != QED_CHAIN_MODE_PBL) + return; + + page_cnt = qed_chain_get_page_cnt(p_chain); + + for (i = 0; i < page_cnt; i++) + memset(p_chain->pbl.pp_virt_addr_tbl[i], 0, + QED_CHAIN_PAGE_SIZE); +} + +#endif diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h new file mode 100644 index 0000000..147d08c --- /dev/null +++ b/include/linux/qed/qed_eth_if.h @@ -0,0 +1,360 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_ETH_IF_H +#define _QED_ETH_IF_H + +#include +#include +#include +#include +#include + +struct qed_queue_start_common_params { + /* Should always be relative to entity sending this. */ + u8 vport_id; + u16 queue_id; + + /* Relative, but relevant only for PFs */ + u8 stats_id; + + struct qed_sb_info *p_sb; + u8 sb_idx; +}; + +struct qed_rxq_start_ret_params { + void __iomem *p_prod; + void *p_handle; +}; + +struct qed_txq_start_ret_params { + void __iomem *p_doorbell; + void *p_handle; +}; + +enum qed_filter_config_mode { + QED_FILTER_CONFIG_MODE_DISABLE, + QED_FILTER_CONFIG_MODE_5_TUPLE, + QED_FILTER_CONFIG_MODE_L4_PORT, + QED_FILTER_CONFIG_MODE_IP_DEST, +}; + +struct qed_ntuple_filter_params { + /* Physically mapped address containing header of buffer to be used + * as filter. + */ + dma_addr_t addr; + + /* Length of header in bytes */ + u16 length; + + /* Relative queue-id to receive classified packet */ +#define QED_RFS_NTUPLE_QID_RSS ((u16)-1) + u16 qid; + + /* Identifier can either be according to vport-id or vfid */ + bool b_is_vf; + u8 vport_id; + u8 vf_id; + + /* true iff this filter is to be added. 
Else to be removed */ + bool b_is_add; +}; + +struct qed_dev_eth_info { + struct qed_dev_info common; + + u8 num_queues; + u8 num_tc; + + u8 port_mac[ETH_ALEN]; + u16 num_vlan_filters; + u16 num_mac_filters; + + /* Legacy VF - this affects the datapath, so qede has to know */ + bool is_legacy; + + /* Might depend on available resources [in case of VF] */ + bool xdp_supported; +}; + +struct qed_update_vport_rss_params { + void *rss_ind_table[128]; + u32 rss_key[10]; + u8 rss_caps; +}; + +struct qed_update_vport_params { + u8 vport_id; + u8 update_vport_active_flg; + u8 vport_active_flg; + u8 update_tx_switching_flg; + u8 tx_switching_flg; + u8 update_accept_any_vlan_flg; + u8 accept_any_vlan; + u8 update_rss_flg; + struct qed_update_vport_rss_params rss_params; +}; + +struct qed_start_vport_params { + bool remove_inner_vlan; + bool handle_ptp_pkts; + bool gro_enable; + bool drop_ttl0; + u8 vport_id; + u16 mtu; + bool clear_stats; +}; + +enum qed_filter_rx_mode_type { + QED_FILTER_RX_MODE_TYPE_REGULAR, + QED_FILTER_RX_MODE_TYPE_MULTI_PROMISC, + QED_FILTER_RX_MODE_TYPE_PROMISC, +}; + +enum qed_filter_xcast_params_type { + QED_FILTER_XCAST_TYPE_ADD, + QED_FILTER_XCAST_TYPE_DEL, + QED_FILTER_XCAST_TYPE_REPLACE, +}; + +struct qed_filter_ucast_params { + enum qed_filter_xcast_params_type type; + u8 vlan_valid; + u16 vlan; + u8 mac_valid; + unsigned char mac[ETH_ALEN]; +}; + +struct qed_filter_mcast_params { + enum qed_filter_xcast_params_type type; + u8 num; + unsigned char mac[64][ETH_ALEN]; +}; + +union qed_filter_type_params { + enum qed_filter_rx_mode_type accept_flags; + struct qed_filter_ucast_params ucast; + struct qed_filter_mcast_params mcast; +}; + +enum qed_filter_type { + QED_FILTER_TYPE_UCAST, + QED_FILTER_TYPE_MCAST, + QED_FILTER_TYPE_RX_MODE, + QED_MAX_FILTER_TYPES, +}; + +struct qed_filter_params { + enum qed_filter_type type; + union qed_filter_type_params filter; +}; + +struct qed_tunn_params { + u16 vxlan_port; + u8 update_vxlan_port; + u16 geneve_port; + u8 update_geneve_port; +}; + +struct qed_eth_cb_ops { + struct qed_common_cb_ops common; + void (*force_mac) (void *dev, u8 *mac, bool forced); + void (*ports_update)(void *dev, u16 vxlan_port, u16 geneve_port); +}; + +#define QED_MAX_PHC_DRIFT_PPB 291666666 + +enum qed_ptp_filter_type { + QED_PTP_FILTER_NONE, + QED_PTP_FILTER_ALL, + QED_PTP_FILTER_V1_L4_EVENT, + QED_PTP_FILTER_V1_L4_GEN, + QED_PTP_FILTER_V2_L4_EVENT, + QED_PTP_FILTER_V2_L4_GEN, + QED_PTP_FILTER_V2_L2_EVENT, + QED_PTP_FILTER_V2_L2_GEN, + QED_PTP_FILTER_V2_EVENT, + QED_PTP_FILTER_V2_GEN +}; + +enum qed_ptp_hwtstamp_tx_type { + QED_PTP_HWTSTAMP_TX_OFF, + QED_PTP_HWTSTAMP_TX_ON, +}; + +#ifdef CONFIG_DCB +/* Prototype declaration of qed_eth_dcbnl_ops should match with the declaration + * of dcbnl_rtnl_ops structure. 
+ */ +struct qed_eth_dcbnl_ops { + /* IEEE 802.1Qaz std */ + int (*ieee_getpfc)(struct qed_dev *cdev, struct ieee_pfc *pfc); + int (*ieee_setpfc)(struct qed_dev *cdev, struct ieee_pfc *pfc); + int (*ieee_getets)(struct qed_dev *cdev, struct ieee_ets *ets); + int (*ieee_setets)(struct qed_dev *cdev, struct ieee_ets *ets); + int (*ieee_peer_getets)(struct qed_dev *cdev, struct ieee_ets *ets); + int (*ieee_peer_getpfc)(struct qed_dev *cdev, struct ieee_pfc *pfc); + int (*ieee_getapp)(struct qed_dev *cdev, struct dcb_app *app); + int (*ieee_setapp)(struct qed_dev *cdev, struct dcb_app *app); + + /* CEE std */ + u8 (*getstate)(struct qed_dev *cdev); + u8 (*setstate)(struct qed_dev *cdev, u8 state); + void (*getpgtccfgtx)(struct qed_dev *cdev, int prio, u8 *prio_type, + u8 *pgid, u8 *bw_pct, u8 *up_map); + void (*getpgbwgcfgtx)(struct qed_dev *cdev, int pgid, u8 *bw_pct); + void (*getpgtccfgrx)(struct qed_dev *cdev, int prio, u8 *prio_type, + u8 *pgid, u8 *bw_pct, u8 *up_map); + void (*getpgbwgcfgrx)(struct qed_dev *cdev, int pgid, u8 *bw_pct); + void (*getpfccfg)(struct qed_dev *cdev, int prio, u8 *setting); + void (*setpfccfg)(struct qed_dev *cdev, int prio, u8 setting); + u8 (*getcap)(struct qed_dev *cdev, int capid, u8 *cap); + int (*getnumtcs)(struct qed_dev *cdev, int tcid, u8 *num); + u8 (*getpfcstate)(struct qed_dev *cdev); + int (*getapp)(struct qed_dev *cdev, u8 idtype, u16 id); + u8 (*getfeatcfg)(struct qed_dev *cdev, int featid, u8 *flags); + + /* DCBX configuration */ + u8 (*getdcbx)(struct qed_dev *cdev); + void (*setpgtccfgtx)(struct qed_dev *cdev, int prio, + u8 pri_type, u8 pgid, u8 bw_pct, u8 up_map); + void (*setpgtccfgrx)(struct qed_dev *cdev, int prio, + u8 pri_type, u8 pgid, u8 bw_pct, u8 up_map); + void (*setpgbwgcfgtx)(struct qed_dev *cdev, int pgid, u8 bw_pct); + void (*setpgbwgcfgrx)(struct qed_dev *cdev, int pgid, u8 bw_pct); + u8 (*setall)(struct qed_dev *cdev); + int (*setnumtcs)(struct qed_dev *cdev, int tcid, u8 num); + void (*setpfcstate)(struct qed_dev *cdev, u8 state); + int (*setapp)(struct qed_dev *cdev, u8 idtype, u16 idval, u8 up); + u8 (*setdcbx)(struct qed_dev *cdev, u8 state); + u8 (*setfeatcfg)(struct qed_dev *cdev, int featid, u8 flags); + + /* Peer apps */ + int (*peer_getappinfo)(struct qed_dev *cdev, + struct dcb_peer_app_info *info, + u16 *app_count); + int (*peer_getapptable)(struct qed_dev *cdev, struct dcb_app *table); + + /* CEE peer */ + int (*cee_peer_getpfc)(struct qed_dev *cdev, struct cee_pfc *pfc); + int (*cee_peer_getpg)(struct qed_dev *cdev, struct cee_pg *pg); +}; +#endif + +struct qed_eth_ptp_ops { + int (*cfg_filters)(struct qed_dev *, enum qed_ptp_filter_type, + enum qed_ptp_hwtstamp_tx_type); + int (*read_rx_ts)(struct qed_dev *, u64 *); + int (*read_tx_ts)(struct qed_dev *, u64 *); + int (*read_cc)(struct qed_dev *, u64 *); + int (*disable)(struct qed_dev *); + int (*adjfreq)(struct qed_dev *, s32); + int (*enable)(struct qed_dev *); +}; + +struct qed_eth_ops { + const struct qed_common_ops *common; +#ifdef CONFIG_QED_SRIOV + const struct qed_iov_hv_ops *iov; +#endif +#ifdef CONFIG_DCB + const struct qed_eth_dcbnl_ops *dcb; +#endif + const struct qed_eth_ptp_ops *ptp; + + int (*fill_dev_info)(struct qed_dev *cdev, + struct qed_dev_eth_info *info); + + void (*register_ops)(struct qed_dev *cdev, + struct qed_eth_cb_ops *ops, + void *cookie); + + bool(*check_mac) (struct qed_dev *cdev, u8 *mac); + + int (*vport_start)(struct qed_dev *cdev, + struct qed_start_vport_params *params); + + int (*vport_stop)(struct qed_dev *cdev, + u8 
vport_id); + + int (*vport_update)(struct qed_dev *cdev, + struct qed_update_vport_params *params); + + int (*q_rx_start)(struct qed_dev *cdev, + u8 rss_num, + struct qed_queue_start_common_params *params, + u16 bd_max_bytes, + dma_addr_t bd_chain_phys_addr, + dma_addr_t cqe_pbl_addr, + u16 cqe_pbl_size, + struct qed_rxq_start_ret_params *ret_params); + + int (*q_rx_stop)(struct qed_dev *cdev, u8 rss_id, void *handle); + + int (*q_tx_start)(struct qed_dev *cdev, + u8 rss_num, + struct qed_queue_start_common_params *params, + dma_addr_t pbl_addr, + u16 pbl_size, + struct qed_txq_start_ret_params *ret_params); + + int (*q_tx_stop)(struct qed_dev *cdev, u8 rss_id, void *handle); + + int (*filter_config)(struct qed_dev *cdev, + struct qed_filter_params *params); + + int (*fastpath_stop)(struct qed_dev *cdev); + + int (*eth_cqe_completion)(struct qed_dev *cdev, + u8 rss_id, + struct eth_slow_path_rx_cqe *cqe); + + void (*get_vport_stats)(struct qed_dev *cdev, + struct qed_eth_stats *stats); + + int (*tunn_config)(struct qed_dev *cdev, + struct qed_tunn_params *params); + + int (*ntuple_filter_config)(struct qed_dev *cdev, + void *cookie, + struct qed_ntuple_filter_params *params); + + int (*configure_arfs_searcher)(struct qed_dev *cdev, + enum qed_filter_config_mode mode); + int (*get_coalesce)(struct qed_dev *cdev, u16 *coal, void *handle); +}; + +const struct qed_eth_ops *qed_get_eth_ops(void); +void qed_put_eth_ops(void); + +#endif diff --git a/include/linux/qed/qed_fcoe_if.h b/include/linux/qed/qed_fcoe_if.h new file mode 100644 index 0000000..4608248 --- /dev/null +++ b/include/linux/qed/qed_fcoe_if.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _QED_FCOE_IF_H +#define _QED_FCOE_IF_H +#include +#include +struct qed_fcoe_stats { + u64 fcoe_rx_byte_cnt; + u64 fcoe_rx_data_pkt_cnt; + u64 fcoe_rx_xfer_pkt_cnt; + u64 fcoe_rx_other_pkt_cnt; + u32 fcoe_silent_drop_pkt_cmdq_full_cnt; + u32 fcoe_silent_drop_pkt_rq_full_cnt; + u32 fcoe_silent_drop_pkt_crc_error_cnt; + u32 fcoe_silent_drop_pkt_task_invalid_cnt; + u32 fcoe_silent_drop_total_pkt_cnt; + + u64 fcoe_tx_byte_cnt; + u64 fcoe_tx_data_pkt_cnt; + u64 fcoe_tx_xfer_pkt_cnt; + u64 fcoe_tx_other_pkt_cnt; +}; + +struct qed_dev_fcoe_info { + struct qed_dev_info common; + + void __iomem *primary_dbq_rq_addr; + void __iomem *secondary_bdq_rq_addr; + + u64 wwpn; + u64 wwnn; + + u8 num_cqs; +}; + +struct qed_fcoe_params_offload { + dma_addr_t sq_pbl_addr; + dma_addr_t sq_curr_page_addr; + dma_addr_t sq_next_page_addr; + + u8 src_mac[ETH_ALEN]; + u8 dst_mac[ETH_ALEN]; + + u16 tx_max_fc_pay_len; + u16 e_d_tov_timer_val; + u16 rec_tov_timer_val; + u16 rx_max_fc_pay_len; + u16 vlan_tag; + + struct fc_addr_nw s_id; + u8 max_conc_seqs_c3; + struct fc_addr_nw d_id; + u8 flags; + u8 def_q_idx; +}; + +#define MAX_TID_BLOCKS_FCOE (512) +struct qed_fcoe_tid { + u32 size; /* In bytes per task */ + u32 num_tids_per_block; + u8 *blocks[MAX_TID_BLOCKS_FCOE]; +}; + +struct qed_fcoe_cb_ops { + struct qed_common_cb_ops common; + u32 (*get_login_failures)(void *cookie); +}; + +void qed_fcoe_set_pf_params(struct qed_dev *cdev, + struct qed_fcoe_pf_params *params); + +/** + * struct qed_fcoe_ops - qed FCoE operations. + * @common: common operations pointer + * @fill_dev_info: fills FCoE specific information + * @param cdev + * @param info + * @return 0 on sucesss, otherwise error value. 
+ * @register_ops: register FCoE operations + * @param cdev + * @param ops - specified using qed_iscsi_cb_ops + * @param cookie - driver private + * @ll2: light L2 operations pointer + * @start: fcoe in FW + * @param cdev + * @param tasks - qed will fill information about tasks + * return 0 on success, otherwise error value. + * @stop: stops fcoe in FW + * @param cdev + * return 0 on success, otherwise error value. + * @acquire_conn: acquire a new fcoe connection + * @param cdev + * @param handle - qed will fill handle that should be + * used henceforth as identifier of the + * connection. + * @param p_doorbell - qed will fill the address of the + * doorbell. + * return 0 on sucesss, otherwise error value. + * @release_conn: release a previously acquired fcoe connection + * @param cdev + * @param handle - the connection handle. + * return 0 on success, otherwise error value. + * @offload_conn: configures an offloaded connection + * @param cdev + * @param handle - the connection handle. + * @param conn_info - the configuration to use for the + * offload. + * return 0 on success, otherwise error value. + * @destroy_conn: stops an offloaded connection + * @param cdev + * @param handle - the connection handle. + * @param terminate_params + * return 0 on success, otherwise error value. + * @get_stats: gets FCoE related statistics + * @param cdev + * @param stats - pointer to struck that would be filled + * we stats + * return 0 on success, error otherwise. + */ +struct qed_fcoe_ops { + const struct qed_common_ops *common; + + int (*fill_dev_info)(struct qed_dev *cdev, + struct qed_dev_fcoe_info *info); + + void (*register_ops)(struct qed_dev *cdev, + struct qed_fcoe_cb_ops *ops, void *cookie); + + const struct qed_ll2_ops *ll2; + + int (*start)(struct qed_dev *cdev, struct qed_fcoe_tid *tasks); + + int (*stop)(struct qed_dev *cdev); + + int (*acquire_conn)(struct qed_dev *cdev, + u32 *handle, + u32 *fw_cid, void __iomem **p_doorbell); + + int (*release_conn)(struct qed_dev *cdev, u32 handle); + + int (*offload_conn)(struct qed_dev *cdev, + u32 handle, + struct qed_fcoe_params_offload *conn_info); + int (*destroy_conn)(struct qed_dev *cdev, + u32 handle, dma_addr_t terminate_params); + + int (*get_stats)(struct qed_dev *cdev, struct qed_fcoe_stats *stats); +}; + +const struct qed_fcoe_ops *qed_get_fcoe_ops(void); +void qed_put_fcoe_ops(void); +#endif diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h new file mode 100644 index 0000000..b5b2bc9 --- /dev/null +++ b/include/linux/qed/qed_if.h @@ -0,0 +1,1034 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_IF_H +#define _QED_IF_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum dcbx_protocol_type { + DCBX_PROTOCOL_ISCSI, + DCBX_PROTOCOL_FCOE, + DCBX_PROTOCOL_ROCE, + DCBX_PROTOCOL_ROCE_V2, + DCBX_PROTOCOL_ETH, + DCBX_MAX_PROTOCOL_TYPE +}; + +#define QED_ROCE_PROTOCOL_INDEX (3) + +#define QED_LLDP_CHASSIS_ID_STAT_LEN 4 +#define QED_LLDP_PORT_ID_STAT_LEN 4 +#define QED_DCBX_MAX_APP_PROTOCOL 32 +#define QED_MAX_PFC_PRIORITIES 8 +#define QED_DCBX_DSCP_SIZE 64 + +struct qed_dcbx_lldp_remote { + u32 peer_chassis_id[QED_LLDP_CHASSIS_ID_STAT_LEN]; + u32 peer_port_id[QED_LLDP_PORT_ID_STAT_LEN]; + bool enable_rx; + bool enable_tx; + u32 tx_interval; + u32 max_credit; +}; + +struct qed_dcbx_lldp_local { + u32 local_chassis_id[QED_LLDP_CHASSIS_ID_STAT_LEN]; + u32 local_port_id[QED_LLDP_PORT_ID_STAT_LEN]; +}; + +struct qed_dcbx_app_prio { + u8 roce; + u8 roce_v2; + u8 fcoe; + u8 iscsi; + u8 eth; +}; + +struct qed_dbcx_pfc_params { + bool willing; + bool enabled; + u8 prio[QED_MAX_PFC_PRIORITIES]; + u8 max_tc; +}; + +enum qed_dcbx_sf_ieee_type { + QED_DCBX_SF_IEEE_ETHTYPE, + QED_DCBX_SF_IEEE_TCP_PORT, + QED_DCBX_SF_IEEE_UDP_PORT, + QED_DCBX_SF_IEEE_TCP_UDP_PORT +}; + +struct qed_app_entry { + bool ethtype; + enum qed_dcbx_sf_ieee_type sf_ieee; + bool enabled; + u8 prio; + u16 proto_id; + enum dcbx_protocol_type proto_type; +}; + +struct qed_dcbx_params { + struct qed_app_entry app_entry[QED_DCBX_MAX_APP_PROTOCOL]; + u16 num_app_entries; + bool app_willing; + bool app_valid; + bool app_error; + bool ets_willing; + bool ets_enabled; + bool ets_cbs; + bool valid; + u8 ets_pri_tc_tbl[QED_MAX_PFC_PRIORITIES]; + u8 ets_tc_bw_tbl[QED_MAX_PFC_PRIORITIES]; + u8 ets_tc_tsa_tbl[QED_MAX_PFC_PRIORITIES]; + struct qed_dbcx_pfc_params pfc; + u8 max_ets_tc; +}; + +struct qed_dcbx_admin_params { + struct qed_dcbx_params params; + bool valid; +}; + +struct qed_dcbx_remote_params { + struct qed_dcbx_params params; + bool valid; +}; + +struct qed_dcbx_operational_params { + struct qed_dcbx_app_prio app_prio; + struct qed_dcbx_params params; + bool valid; + bool enabled; + bool ieee; + bool cee; + bool local; + u32 err; +}; + +struct qed_dcbx_get { + struct qed_dcbx_operational_params operational; + struct qed_dcbx_lldp_remote lldp_remote; + struct qed_dcbx_lldp_local lldp_local; + struct qed_dcbx_remote_params remote; + struct qed_dcbx_admin_params local; +}; + +enum qed_nvm_images { + QED_NVM_IMAGE_ISCSI_CFG, + QED_NVM_IMAGE_FCOE_CFG, +}; + +struct qed_link_eee_params { + u32 tx_lpi_timer; +#define QED_EEE_1G_ADV BIT(0) +#define QED_EEE_10G_ADV BIT(1) + + /* Capabilities are represented using QED_EEE_*_ADV values */ + u8 adv_caps; + u8 lp_adv_caps; + bool enable; + bool tx_lpi_enable; +}; + +enum qed_led_mode { + QED_LED_MODE_OFF, + QED_LED_MODE_ON, + QED_LED_MODE_RESTORE +}; + +#define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \ + (void __iomem *)(reg_addr)) + +#define DIRECT_REG_RD(reg_addr) readl((void __iomem 
*)(reg_addr)) + +#define QED_COALESCE_MAX 0x1FF +#define QED_DEFAULT_RX_USECS 12 +#define QED_DEFAULT_TX_USECS 48 + +/* forward */ +struct qed_dev; + +struct qed_eth_pf_params { + /* The following parameters are used during HW-init + * and these parameters need to be passed as arguments + * to update_pf_params routine invoked before slowpath start + */ + u16 num_cons; + + /* per-VF number of CIDs */ + u8 num_vf_cons; +#define ETH_PF_PARAMS_VF_CONS_DEFAULT (32) + + /* To enable arfs, previous to HW-init a positive number needs to be + * set [as filters require allocated searcher ILT memory]. + * This will set the maximal number of configured steering-filters. + */ + u32 num_arfs_filters; +}; + +struct qed_fcoe_pf_params { + /* The following parameters are used during protocol-init */ + u64 glbl_q_params_addr; + u64 bdq_pbl_base_addr[2]; + + /* The following parameters are used during HW-init + * and these parameters need to be passed as arguments + * to update_pf_params routine invoked before slowpath start + */ + u16 num_cons; + u16 num_tasks; + + /* The following parameters are used during protocol-init */ + u16 sq_num_pbl_pages; + + u16 cq_num_entries; + u16 cmdq_num_entries; + u16 rq_buffer_log_size; + u16 mtu; + u16 dummy_icid; + u16 bdq_xoff_threshold[2]; + u16 bdq_xon_threshold[2]; + u16 rq_buffer_size; + u8 num_cqs; /* num of global CQs */ + u8 log_page_size; + u8 gl_rq_pi; + u8 gl_cmd_pi; + u8 debug_mode; + u8 is_target; + u8 bdq_pbl_num_entries[2]; +}; + +/* Most of the the parameters below are described in the FW iSCSI / TCP HSI */ +struct qed_iscsi_pf_params { + u64 glbl_q_params_addr; + u64 bdq_pbl_base_addr[3]; + u16 cq_num_entries; + u16 cmdq_num_entries; + u32 two_msl_timer; + u16 tx_sws_timer; + + /* The following parameters are used during HW-init + * and these parameters need to be passed as arguments + * to update_pf_params routine invoked before slowpath start + */ + u16 num_cons; + u16 num_tasks; + + /* The following parameters are used during protocol-init */ + u16 half_way_close_timeout; + u16 bdq_xoff_threshold[3]; + u16 bdq_xon_threshold[3]; + u16 cmdq_xoff_threshold; + u16 cmdq_xon_threshold; + u16 rq_buffer_size; + + u8 num_sq_pages_in_ring; + u8 num_r2tq_pages_in_ring; + u8 num_uhq_pages_in_ring; + u8 num_queues; + u8 log_page_size; + u8 rqe_log_size; + u8 max_fin_rt; + u8 gl_rq_pi; + u8 gl_cmd_pi; + u8 debug_mode; + u8 ll2_ooo_queue_id; + + u8 is_target; + u8 is_soc_en; + u8 soc_num_of_blocks_log; + u8 bdq_pbl_num_entries[3]; +}; + +struct qed_rdma_pf_params { + /* Supplied to QED during resource allocation (may affect the ILT and + * the doorbell BAR). 
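+ *
+ * Illustrative fill of a local struct qed_pf_params (defined further below)
+ * before handing it to the update_pf_params() common op; the values are
+ * arbitrary placeholders, and using QED_ROCE_PROTOCOL_INDEX for gl_pi is
+ * only an assumption for the sketch:
+ *
+ *	pf_params.rdma_pf_params.num_qps = 1024;
+ *	pf_params.rdma_pf_params.num_srqs = 1024;
+ *	pf_params.rdma_pf_params.min_dpis = 8;
+ *	pf_params.rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX;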
+ */ + u32 min_dpis; /* number of requested DPIs */ + u32 num_qps; /* number of requested Queue Pairs */ + u32 num_srqs; /* number of requested SRQ */ + u8 roce_edpm_mode; /* see QED_ROCE_EDPM_MODE_ENABLE */ + u8 gl_pi; /* protocol index */ + + /* Will allocate rate limiters to be used with QPs */ + u8 enable_dcqcn; +}; + +struct qed_pf_params { + struct qed_eth_pf_params eth_pf_params; + struct qed_fcoe_pf_params fcoe_pf_params; + struct qed_iscsi_pf_params iscsi_pf_params; + struct qed_rdma_pf_params rdma_pf_params; +}; + +enum qed_int_mode { + QED_INT_MODE_INTA, + QED_INT_MODE_MSIX, + QED_INT_MODE_MSI, + QED_INT_MODE_POLL, +}; + +struct qed_sb_info { + struct status_block_e4 *sb_virt; + dma_addr_t sb_phys; + u32 sb_ack; /* Last given ack */ + u16 igu_sb_id; + void __iomem *igu_addr; + u8 flags; +#define QED_SB_INFO_INIT 0x1 +#define QED_SB_INFO_SETUP 0x2 + + struct qed_dev *cdev; +}; + +enum qed_dev_type { + QED_DEV_TYPE_BB, + QED_DEV_TYPE_AH, +}; + +struct qed_dev_info { + unsigned long pci_mem_start; + unsigned long pci_mem_end; + unsigned int pci_irq; + u8 num_hwfns; + + u8 hw_mac[ETH_ALEN]; + bool is_mf_default; + + /* FW version */ + u16 fw_major; + u16 fw_minor; + u16 fw_rev; + u16 fw_eng; + + /* MFW version */ + u32 mfw_rev; +#define QED_MFW_VERSION_0_MASK 0x000000FF +#define QED_MFW_VERSION_0_OFFSET 0 +#define QED_MFW_VERSION_1_MASK 0x0000FF00 +#define QED_MFW_VERSION_1_OFFSET 8 +#define QED_MFW_VERSION_2_MASK 0x00FF0000 +#define QED_MFW_VERSION_2_OFFSET 16 +#define QED_MFW_VERSION_3_MASK 0xFF000000 +#define QED_MFW_VERSION_3_OFFSET 24 + + u32 flash_size; + u8 mf_mode; + bool tx_switching; + bool rdma_supported; + u16 mtu; + + bool wol_support; + + /* MBI version */ + u32 mbi_version; +#define QED_MBI_VERSION_0_MASK 0x000000FF +#define QED_MBI_VERSION_0_OFFSET 0 +#define QED_MBI_VERSION_1_MASK 0x0000FF00 +#define QED_MBI_VERSION_1_OFFSET 8 +#define QED_MBI_VERSION_2_MASK 0x00FF0000 +#define QED_MBI_VERSION_2_OFFSET 16 + + enum qed_dev_type dev_type; + + /* Output parameters for qede */ + bool vxlan_enable; + bool gre_enable; + bool geneve_enable; + + u8 abs_pf_id; +}; + +enum qed_sb_type { + QED_SB_TYPE_L2_QUEUE, + QED_SB_TYPE_CNQ, + QED_SB_TYPE_STORAGE, +}; + +enum qed_protocol { + QED_PROTOCOL_ETH, + QED_PROTOCOL_ISCSI, + QED_PROTOCOL_FCOE, +}; + +enum qed_link_mode_bits { + QED_LM_FIBRE_BIT = BIT(0), + QED_LM_Autoneg_BIT = BIT(1), + QED_LM_Asym_Pause_BIT = BIT(2), + QED_LM_Pause_BIT = BIT(3), + QED_LM_1000baseT_Half_BIT = BIT(4), + QED_LM_1000baseT_Full_BIT = BIT(5), + QED_LM_10000baseKR_Full_BIT = BIT(6), + QED_LM_25000baseKR_Full_BIT = BIT(7), + QED_LM_40000baseLR4_Full_BIT = BIT(8), + QED_LM_50000baseKR2_Full_BIT = BIT(9), + QED_LM_100000baseKR4_Full_BIT = BIT(10), + QED_LM_COUNT = 11 +}; + +struct qed_link_params { + bool link_up; + +#define QED_LINK_OVERRIDE_SPEED_AUTONEG BIT(0) +#define QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS BIT(1) +#define QED_LINK_OVERRIDE_SPEED_FORCED_SPEED BIT(2) +#define QED_LINK_OVERRIDE_PAUSE_CONFIG BIT(3) +#define QED_LINK_OVERRIDE_LOOPBACK_MODE BIT(4) +#define QED_LINK_OVERRIDE_EEE_CONFIG BIT(5) + u32 override_flags; + bool autoneg; + u32 adv_speeds; + u32 forced_speed; +#define QED_LINK_PAUSE_AUTONEG_ENABLE BIT(0) +#define QED_LINK_PAUSE_RX_ENABLE BIT(1) +#define QED_LINK_PAUSE_TX_ENABLE BIT(2) + u32 pause_config; +#define QED_LINK_LOOPBACK_NONE BIT(0) +#define QED_LINK_LOOPBACK_INT_PHY BIT(1) +#define QED_LINK_LOOPBACK_EXT_PHY BIT(2) +#define QED_LINK_LOOPBACK_EXT BIT(3) +#define QED_LINK_LOOPBACK_MAC BIT(4) + u32 loopback_mode; + struct 
qed_link_eee_params eee; +}; + +struct qed_link_output { + bool link_up; + + /* In QED_LM_* defs */ + u32 supported_caps; + u32 advertised_caps; + u32 lp_caps; + + u32 speed; /* In Mb/s */ + u8 duplex; /* In DUPLEX defs */ + u8 port; /* In PORT defs */ + bool autoneg; + u32 pause_config; + + /* EEE - capability & param */ + bool eee_supported; + bool eee_active; + u8 sup_caps; + struct qed_link_eee_params eee; +}; + +struct qed_probe_params { + enum qed_protocol protocol; + u32 dp_module; + u8 dp_level; + bool is_vf; +}; + +#define QED_DRV_VER_STR_SIZE 12 +struct qed_slowpath_params { + u32 int_mode; + u8 drv_major; + u8 drv_minor; + u8 drv_rev; + u8 drv_eng; + u8 name[QED_DRV_VER_STR_SIZE]; +}; + +#define ILT_PAGE_SIZE_TCFC 0x8000 /* 32KB */ + +struct qed_int_info { + struct msix_entry *msix; + u8 msix_cnt; + + /* This should be updated by the protocol driver */ + u8 used_cnt; +}; + +#define QED_NVM_SIGNATURE 0x12435687 + +enum qed_nvm_flash_cmd { + QED_NVM_FLASH_CMD_FILE_DATA = 0x2, + QED_NVM_FLASH_CMD_FILE_START = 0x3, + QED_NVM_FLASH_CMD_NVM_CHANGE = 0x4, + QED_NVM_FLASH_CMD_NVM_MAX, +}; + +struct qed_common_cb_ops { + void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); + void (*link_update)(void *dev, + struct qed_link_output *link); + void (*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type); +}; + +struct qed_selftest_ops { +/** + * @brief selftest_interrupt - Perform interrupt test + * + * @param cdev + * + * @return 0 on success, error otherwise. + */ + int (*selftest_interrupt)(struct qed_dev *cdev); + +/** + * @brief selftest_memory - Perform memory test + * + * @param cdev + * + * @return 0 on success, error otherwise. + */ + int (*selftest_memory)(struct qed_dev *cdev); + +/** + * @brief selftest_register - Perform register test + * + * @param cdev + * + * @return 0 on success, error otherwise. + */ + int (*selftest_register)(struct qed_dev *cdev); + +/** + * @brief selftest_clock - Perform clock test + * + * @param cdev + * + * @return 0 on success, error otherwise. + */ + int (*selftest_clock)(struct qed_dev *cdev); + +/** + * @brief selftest_nvram - Perform nvram test + * + * @param cdev + * + * @return 0 on success, error otherwise. + */ + int (*selftest_nvram) (struct qed_dev *cdev); +}; + +struct qed_common_ops { + struct qed_selftest_ops *selftest; + + struct qed_dev* (*probe)(struct pci_dev *dev, + struct qed_probe_params *params); + + void (*remove)(struct qed_dev *cdev); + + int (*set_power_state)(struct qed_dev *cdev, + pci_power_t state); + + void (*set_name) (struct qed_dev *cdev, char name[]); + + /* Client drivers need to make this call before slowpath_start. + * PF params required for the call before slowpath_start is + * documented within the qed_pf_params structure definition. + */ + void (*update_pf_params)(struct qed_dev *cdev, + struct qed_pf_params *params); + int (*slowpath_start)(struct qed_dev *cdev, + struct qed_slowpath_params *params); + + int (*slowpath_stop)(struct qed_dev *cdev); + + /* Requests to use `cnt' interrupts for fastpath. + * upon success, returns number of interrupts allocated for fastpath. 
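+ *
+ * Illustrative bring-up order in a client driver (sketch only; error
+ * handling is omitted, "ops" is the protocol ops structure obtained from
+ * qed, and the parameter structures are assumed to be filled beforehand):
+ *
+ *	cdev = ops->common->probe(pdev, &probe_params);
+ *	ops->common->update_pf_params(cdev, &pf_params);
+ *	ops->common->slowpath_start(cdev, &sp_params);
+ *	rc = ops->common->set_fp_int(cdev, num_queues);
+ *	ops->common->get_fp_int(cdev, &int_info);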
+ */ + int (*set_fp_int)(struct qed_dev *cdev, + u16 cnt); + + /* Fills `info' with pointers required for utilizing interrupts */ + int (*get_fp_int)(struct qed_dev *cdev, + struct qed_int_info *info); + + u32 (*sb_init)(struct qed_dev *cdev, + struct qed_sb_info *sb_info, + void *sb_virt_addr, + dma_addr_t sb_phy_addr, + u16 sb_id, + enum qed_sb_type type); + + u32 (*sb_release)(struct qed_dev *cdev, + struct qed_sb_info *sb_info, + u16 sb_id); + + void (*simd_handler_config)(struct qed_dev *cdev, + void *token, + int index, + void (*handler)(void *)); + + void (*simd_handler_clean)(struct qed_dev *cdev, + int index); + int (*dbg_grc)(struct qed_dev *cdev, + void *buffer, u32 *num_dumped_bytes); + + int (*dbg_grc_size)(struct qed_dev *cdev); + + int (*dbg_all_data) (struct qed_dev *cdev, void *buffer); + + int (*dbg_all_data_size) (struct qed_dev *cdev); + +/** + * @brief can_link_change - can the instance change the link or not + * + * @param cdev + * + * @return true if link-change is allowed, false otherwise. + */ + bool (*can_link_change)(struct qed_dev *cdev); + +/** + * @brief set_link - set links according to params + * + * @param cdev + * @param params - values used to override the default link configuration + * + * @return 0 on success, error otherwise. + */ + int (*set_link)(struct qed_dev *cdev, + struct qed_link_params *params); + +/** + * @brief get_link - returns the current link state. + * + * @param cdev + * @param if_link - structure to be filled with current link configuration. + */ + void (*get_link)(struct qed_dev *cdev, + struct qed_link_output *if_link); + +/** + * @brief - drains chip in case Tx completions fail to arrive due to pause. + * + * @param cdev + */ + int (*drain)(struct qed_dev *cdev); + +/** + * @brief update_msglvl - update module debug level + * + * @param cdev + * @param dp_module + * @param dp_level + */ + void (*update_msglvl)(struct qed_dev *cdev, + u32 dp_module, + u8 dp_level); + + int (*chain_alloc)(struct qed_dev *cdev, + enum qed_chain_use_mode intended_use, + enum qed_chain_mode mode, + enum qed_chain_cnt_type cnt_type, + u32 num_elems, + size_t elem_size, + struct qed_chain *p_chain, + struct qed_chain_ext_pbl *ext_pbl); + + void (*chain_free)(struct qed_dev *cdev, + struct qed_chain *p_chain); + +/** + * @brief nvm_flash - Flash nvm data. + * + * @param cdev + * @param name - file containing the data + * + * @return 0 on success, error otherwise. + */ + int (*nvm_flash)(struct qed_dev *cdev, const char *name); + +/** + * @brief nvm_get_image - reads an entire image from nvram + * + * @param cdev + * @param type - type of the request nvram image + * @param buf - preallocated buffer to fill with the image + * @param len - length of the allocated buffer + * + * @return 0 on success, error otherwise + */ + int (*nvm_get_image)(struct qed_dev *cdev, + enum qed_nvm_images type, u8 *buf, u16 len); + +/** + * @brief set_coalesce - Configure Rx coalesce value in usec + * + * @param cdev + * @param rx_coal - Rx coalesce value in usec + * @param tx_coal - Tx coalesce value in usec + * @param qid - Queue index + * @param sb_id - Status Block Id + * + * @return 0 on success, error otherwise. + */ + int (*set_coalesce)(struct qed_dev *cdev, + u16 rx_coal, u16 tx_coal, void *handle); + +/** + * @brief set_led - Configure LED mode + * + * @param cdev + * @param mode - LED mode + * + * @return 0 on success, error otherwise. 
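 *
 * Illustrative call, assuming the qed_led_mode enumeration declared
 * elsewhere in this header provides a QED_LED_MODE_ON value:
 *
 *	ops->set_led(cdev, QED_LED_MODE_ON);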
+ */ + int (*set_led)(struct qed_dev *cdev, + enum qed_led_mode mode); + +/** + * @brief update_drv_state - API to inform the change in the driver state. + * + * @param cdev + * @param active + * + */ + int (*update_drv_state)(struct qed_dev *cdev, bool active); + +/** + * @brief update_mac - API to inform the change in the mac address + * + * @param cdev + * @param mac + * + */ + int (*update_mac)(struct qed_dev *cdev, u8 *mac); + +/** + * @brief update_mtu - API to inform the change in the mtu + * + * @param cdev + * @param mtu + * + */ + int (*update_mtu)(struct qed_dev *cdev, u16 mtu); + +/** + * @brief update_wol - update of changes in the WoL configuration + * + * @param cdev + * @param enabled - true iff WoL should be enabled. + */ + int (*update_wol) (struct qed_dev *cdev, bool enabled); +}; + +#define MASK_FIELD(_name, _value) \ + ((_value) &= (_name ## _MASK)) + +#define FIELD_VALUE(_name, _value) \ + ((_value & _name ## _MASK) << _name ## _SHIFT) + +#define SET_FIELD(value, name, flag) \ + do { \ + (value) &= ~(name ## _MASK << name ## _SHIFT); \ + (value) |= (((u64)flag) << (name ## _SHIFT)); \ + } while (0) + +#define GET_FIELD(value, name) \ + (((value) >> (name ## _SHIFT)) & name ## _MASK) + +/* Debug print definitions */ +#define DP_ERR(cdev, fmt, ...) \ + do { \ + pr_err("[%s:%d(%s)]" fmt, \ + __func__, __LINE__, \ + DP_NAME(cdev) ? DP_NAME(cdev) : "", \ + ## __VA_ARGS__); \ + } while (0) + +#define DP_NOTICE(cdev, fmt, ...) \ + do { \ + if (unlikely((cdev)->dp_level <= QED_LEVEL_NOTICE)) { \ + pr_notice("[%s:%d(%s)]" fmt, \ + __func__, __LINE__, \ + DP_NAME(cdev) ? DP_NAME(cdev) : "", \ + ## __VA_ARGS__); \ + \ + } \ + } while (0) + +#define DP_INFO(cdev, fmt, ...) \ + do { \ + if (unlikely((cdev)->dp_level <= QED_LEVEL_INFO)) { \ + pr_notice("[%s:%d(%s)]" fmt, \ + __func__, __LINE__, \ + DP_NAME(cdev) ? DP_NAME(cdev) : "", \ + ## __VA_ARGS__); \ + } \ + } while (0) + +#define DP_VERBOSE(cdev, module, fmt, ...) \ + do { \ + if (unlikely(((cdev)->dp_level <= QED_LEVEL_VERBOSE) && \ + ((cdev)->dp_module & module))) { \ + pr_notice("[%s:%d(%s)]" fmt, \ + __func__, __LINE__, \ + DP_NAME(cdev) ? 
DP_NAME(cdev) : "", \ + ## __VA_ARGS__); \ + } \ + } while (0) + +enum DP_LEVEL { + QED_LEVEL_VERBOSE = 0x0, + QED_LEVEL_INFO = 0x1, + QED_LEVEL_NOTICE = 0x2, + QED_LEVEL_ERR = 0x3, +}; + +#define QED_LOG_LEVEL_SHIFT (30) +#define QED_LOG_VERBOSE_MASK (0x3fffffff) +#define QED_LOG_INFO_MASK (0x40000000) +#define QED_LOG_NOTICE_MASK (0x80000000) + +enum DP_MODULE { + QED_MSG_SPQ = 0x10000, + QED_MSG_STATS = 0x20000, + QED_MSG_DCB = 0x40000, + QED_MSG_IOV = 0x80000, + QED_MSG_SP = 0x100000, + QED_MSG_STORAGE = 0x200000, + QED_MSG_CXT = 0x800000, + QED_MSG_LL2 = 0x1000000, + QED_MSG_ILT = 0x2000000, + QED_MSG_RDMA = 0x4000000, + QED_MSG_DEBUG = 0x8000000, + /* to be added...up to 0x8000000 */ +}; + +enum qed_mf_mode { + QED_MF_DEFAULT, + QED_MF_OVLAN, + QED_MF_NPAR, +}; + +struct qed_eth_stats_common { + u64 no_buff_discards; + u64 packet_too_big_discard; + u64 ttl0_discard; + u64 rx_ucast_bytes; + u64 rx_mcast_bytes; + u64 rx_bcast_bytes; + u64 rx_ucast_pkts; + u64 rx_mcast_pkts; + u64 rx_bcast_pkts; + u64 mftag_filter_discards; + u64 mac_filter_discards; + u64 tx_ucast_bytes; + u64 tx_mcast_bytes; + u64 tx_bcast_bytes; + u64 tx_ucast_pkts; + u64 tx_mcast_pkts; + u64 tx_bcast_pkts; + u64 tx_err_drop_pkts; + u64 tpa_coalesced_pkts; + u64 tpa_coalesced_events; + u64 tpa_aborts_num; + u64 tpa_not_coalesced_pkts; + u64 tpa_coalesced_bytes; + + /* port */ + u64 rx_64_byte_packets; + u64 rx_65_to_127_byte_packets; + u64 rx_128_to_255_byte_packets; + u64 rx_256_to_511_byte_packets; + u64 rx_512_to_1023_byte_packets; + u64 rx_1024_to_1518_byte_packets; + u64 rx_crc_errors; + u64 rx_mac_crtl_frames; + u64 rx_pause_frames; + u64 rx_pfc_frames; + u64 rx_align_errors; + u64 rx_carrier_errors; + u64 rx_oversize_packets; + u64 rx_jabbers; + u64 rx_undersize_packets; + u64 rx_fragments; + u64 tx_64_byte_packets; + u64 tx_65_to_127_byte_packets; + u64 tx_128_to_255_byte_packets; + u64 tx_256_to_511_byte_packets; + u64 tx_512_to_1023_byte_packets; + u64 tx_1024_to_1518_byte_packets; + u64 tx_pause_frames; + u64 tx_pfc_frames; + u64 brb_truncates; + u64 brb_discards; + u64 rx_mac_bytes; + u64 rx_mac_uc_packets; + u64 rx_mac_mc_packets; + u64 rx_mac_bc_packets; + u64 rx_mac_frames_ok; + u64 tx_mac_bytes; + u64 tx_mac_uc_packets; + u64 tx_mac_mc_packets; + u64 tx_mac_bc_packets; + u64 tx_mac_ctrl_frames; +}; + +struct qed_eth_stats_bb { + u64 rx_1519_to_1522_byte_packets; + u64 rx_1519_to_2047_byte_packets; + u64 rx_2048_to_4095_byte_packets; + u64 rx_4096_to_9216_byte_packets; + u64 rx_9217_to_16383_byte_packets; + u64 tx_1519_to_2047_byte_packets; + u64 tx_2048_to_4095_byte_packets; + u64 tx_4096_to_9216_byte_packets; + u64 tx_9217_to_16383_byte_packets; + u64 tx_lpi_entry_count; + u64 tx_total_collisions; +}; + +struct qed_eth_stats_ah { + u64 rx_1519_to_max_byte_packets; + u64 tx_1519_to_max_byte_packets; +}; + +struct qed_eth_stats { + struct qed_eth_stats_common common; + + union { + struct qed_eth_stats_bb bb; + struct qed_eth_stats_ah ah; + }; +}; + +#define QED_SB_IDX 0x0002 + +#define RX_PI 0 +#define TX_PI(tc) (RX_PI + 1 + tc) + +struct qed_sb_cnt_info { + /* Original, current, and free SBs for PF */ + int orig; + int cnt; + int free_cnt; + + /* Original, current and free SBS for child VFs */ + int iov_orig; + int iov_cnt; + int free_cnt_iov; +}; + +static inline u16 qed_sb_update_sb_idx(struct qed_sb_info *sb_info) +{ + u32 prod = 0; + u16 rc = 0; + + prod = le32_to_cpu(sb_info->sb_virt->prod_index) & + STATUS_BLOCK_E4_PROD_INDEX_MASK; + if (sb_info->sb_ack != prod) { + sb_info->sb_ack = prod; + rc 
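/*
 * Usage sketch for the field and debug helpers defined above. Only
 * SET_FIELD/GET_FIELD, DP_VERBOSE and QED_MSG_RDMA come from this header;
 * the FOO_BAR_* defines and the `flags' variable are hypothetical:
 *
 *	#define FOO_BAR_MASK	0x3
 *	#define FOO_BAR_SHIFT	4
 *
 *	u32 flags = 0;
 *
 *	SET_FIELD(flags, FOO_BAR, 2);
 *	if (GET_FIELD(flags, FOO_BAR) == 2)
 *		DP_VERBOSE(cdev, QED_MSG_RDMA, "bar=%u\n",
 *			   GET_FIELD(flags, FOO_BAR));
 */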
|= QED_SB_IDX; + } + + /* Let SB update */ + mmiowb(); + return rc; +} + +/** + * + * @brief This function creates an update command for interrupts that is + * written to the IGU. + * + * @param sb_info - This is the structure allocated and + * initialized per status block. Assumption is + * that it was initialized using qed_sb_init + * @param int_cmd - Enable/Disable/Nop + * @param upd_flg - whether igu consumer should be + * updated. + * + * @return inline void + */ +static inline void qed_sb_ack(struct qed_sb_info *sb_info, + enum igu_int_cmd int_cmd, + u8 upd_flg) +{ + struct igu_prod_cons_update igu_ack = { 0 }; + + igu_ack.sb_id_and_flags = + ((sb_info->sb_ack << IGU_PROD_CONS_UPDATE_SB_INDEX_SHIFT) | + (upd_flg << IGU_PROD_CONS_UPDATE_UPDATE_FLAG_SHIFT) | + (int_cmd << IGU_PROD_CONS_UPDATE_ENABLE_INT_SHIFT) | + (IGU_SEG_ACCESS_REG << + IGU_PROD_CONS_UPDATE_SEGMENT_ACCESS_SHIFT)); + + DIRECT_REG_WR(sb_info->igu_addr, igu_ack.sb_id_and_flags); + + /* Both segments (interrupts & acks) are written to same place address; + * Need to guarantee all commands will be received (in-order) by HW. + */ + mmiowb(); + barrier(); +} + +static inline void __internal_ram_wr(void *p_hwfn, + void __iomem *addr, + int size, + u32 *data) + +{ + unsigned int i; + + for (i = 0; i < size / sizeof(*data); i++) + DIRECT_REG_WR(&((u32 __iomem *)addr)[i], data[i]); +} + +static inline void internal_ram_wr(void __iomem *addr, + int size, + u32 *data) +{ + __internal_ram_wr(NULL, addr, size, data); +} + +enum qed_rss_caps { + QED_RSS_IPV4 = 0x1, + QED_RSS_IPV6 = 0x2, + QED_RSS_IPV4_TCP = 0x4, + QED_RSS_IPV6_TCP = 0x8, + QED_RSS_IPV4_UDP = 0x10, + QED_RSS_IPV6_UDP = 0x20, +}; + +#define QED_RSS_IND_TABLE_SIZE 128 +#define QED_RSS_KEY_SIZE 10 /* size in 32b chunks */ +#endif diff --git a/include/linux/qed/qed_iov_if.h b/include/linux/qed/qed_iov_if.h new file mode 100644 index 0000000..ac2e6a3 --- /dev/null +++ b/include/linux/qed/qed_iov_if.h @@ -0,0 +1,60 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
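 *
 * (Usage note for the status-block helpers in qed_if.h above: a fastpath
 * ISR would typically sample the producer and then re-arm the IGU roughly
 * as below. This is a sketch only; IGU_INT_ENABLE is assumed to come from
 * the IGU HSI definitions.)
 *
 *	if (qed_sb_update_sb_idx(sb_info) & QED_SB_IDX) {
 *		... poll the queues attached to this status block ...
 *	}
 *	qed_sb_ack(sb_info, IGU_INT_ENABLE, 1);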
+ */ + +#ifndef _QED_IOV_IF_H +#define _QED_IOV_IF_H + +#include + +/* Structs used by PF to control and manipulate child VFs */ +struct qed_iov_hv_ops { + int (*configure)(struct qed_dev *cdev, int num_vfs_param); + + int (*set_mac) (struct qed_dev *cdev, u8 *mac, int vfid); + + int (*set_vlan) (struct qed_dev *cdev, u16 vid, int vfid); + + int (*get_config) (struct qed_dev *cdev, int vf_id, + struct ifla_vf_info *ivi); + + int (*set_link_state) (struct qed_dev *cdev, int vf_id, + int link_state); + + int (*set_spoof) (struct qed_dev *cdev, int vfid, bool val); + + int (*set_rate) (struct qed_dev *cdev, int vfid, + u32 min_rate, u32 max_rate); + + int (*set_trust) (struct qed_dev *cdev, int vfid, bool trust); +}; + +#endif diff --git a/include/linux/qed/qed_iscsi_if.h b/include/linux/qed/qed_iscsi_if.h new file mode 100644 index 0000000..d0df1be --- /dev/null +++ b/include/linux/qed/qed_iscsi_if.h @@ -0,0 +1,260 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _QED_ISCSI_IF_H +#define _QED_ISCSI_IF_H +#include +#include + +typedef int (*iscsi_event_cb_t) (void *context, + u8 fw_event_code, void *fw_handle); +struct qed_iscsi_stats { + u64 iscsi_rx_bytes_cnt; + u64 iscsi_rx_packet_cnt; + u64 iscsi_rx_new_ooo_isle_events_cnt; + u32 iscsi_cmdq_threshold_cnt; + u32 iscsi_rq_threshold_cnt; + u32 iscsi_immq_threshold_cnt; + + u64 iscsi_rx_dropped_pdus_task_not_valid; + + u64 iscsi_rx_data_pdu_cnt; + u64 iscsi_rx_r2t_pdu_cnt; + u64 iscsi_rx_total_pdu_cnt; + + u64 iscsi_tx_go_to_slow_start_event_cnt; + u64 iscsi_tx_fast_retransmit_event_cnt; + + u64 iscsi_tx_data_pdu_cnt; + u64 iscsi_tx_r2t_pdu_cnt; + u64 iscsi_tx_total_pdu_cnt; + + u64 iscsi_tx_bytes_cnt; + u64 iscsi_tx_packet_cnt; +}; + +struct qed_dev_iscsi_info { + struct qed_dev_info common; + + void __iomem *primary_dbq_rq_addr; + void __iomem *secondary_bdq_rq_addr; + + u8 num_cqs; +}; + +struct qed_iscsi_id_params { + u8 mac[ETH_ALEN]; + u32 ip[4]; + u16 port; +}; + +struct qed_iscsi_params_offload { + u8 layer_code; + dma_addr_t sq_pbl_addr; + u32 initial_ack; + + struct qed_iscsi_id_params src; + struct qed_iscsi_id_params dst; + u16 vlan_id; + u8 tcp_flags; + u8 ip_version; + u8 default_cq; + + u8 ka_max_probe_cnt; + u8 dup_ack_theshold; + u32 rcv_next; + u32 snd_una; + u32 snd_next; + u32 snd_max; + u32 snd_wnd; + u32 rcv_wnd; + u32 snd_wl1; + u32 cwnd; + u32 ss_thresh; + u16 srtt; + u16 rtt_var; + u32 ts_recent; + u32 ts_recent_age; + u32 total_rt; + u32 ka_timeout_delta; + u32 rt_timeout_delta; + u8 dup_ack_cnt; + u8 snd_wnd_probe_cnt; + u8 ka_probe_cnt; + u8 rt_cnt; + u32 flow_label; + u32 ka_timeout; + u32 ka_interval; + u32 max_rt_time; + u32 initial_rcv_wnd; + u8 ttl; + u8 tos_or_tc; + u16 remote_port; + u16 local_port; + u16 mss; + u8 snd_wnd_scale; + u8 rcv_wnd_scale; + u16 da_timeout_value; + u8 ack_frequency; +}; + +struct qed_iscsi_params_update { + u8 update_flag; +#define QED_ISCSI_CONN_HD_EN BIT(0) +#define QED_ISCSI_CONN_DD_EN BIT(1) +#define QED_ISCSI_CONN_INITIAL_R2T BIT(2) +#define QED_ISCSI_CONN_IMMEDIATE_DATA BIT(3) + + u32 max_seq_size; + u32 max_recv_pdu_length; + u32 max_send_pdu_length; + u32 first_seq_length; + u32 exp_stat_sn; +}; + +#define MAX_TID_BLOCKS_ISCSI (512) +struct qed_iscsi_tid { + u32 size; /* In bytes per task */ + u32 num_tids_per_block; + u8 *blocks[MAX_TID_BLOCKS_ISCSI]; +}; + +struct qed_iscsi_cb_ops { + struct qed_common_cb_ops common; +}; + +/** + * struct qed_iscsi_ops - qed iSCSI operations. + * @common: common operations pointer + * @ll2: light L2 operations pointer + * @fill_dev_info: fills iSCSI specific information + * @param cdev + * @param info + * @return 0 on sucesss, otherwise error value. + * @register_ops: register iscsi operations + * @param cdev + * @param ops - specified using qed_iscsi_cb_ops + * @param cookie - driver private + * @start: iscsi in FW + * @param cdev + * @param tasks - qed will fill information about tasks + * return 0 on success, otherwise error value. + * @stop: iscsi in FW + * @param cdev + * return 0 on success, otherwise error value. + * @acquire_conn: acquire a new iscsi connection + * @param cdev + * @param handle - qed will fill handle that should be + * used henceforth as identifier of the + * connection. + * @param p_doorbell - qed will fill the address of the + * doorbell. + * @return 0 on sucesss, otherwise error value. + * @release_conn: release a previously acquired iscsi connection + * @param cdev + * @param handle - the connection handle. 
+ * @return 0 on success, otherwise error value. + * @offload_conn: configures an offloaded connection + * @param cdev + * @param handle - the connection handle. + * @param conn_info - the configuration to use for the + * offload. + * @return 0 on success, otherwise error value. + * @update_conn: updates an offloaded connection + * @param cdev + * @param handle - the connection handle. + * @param conn_info - the configuration to use for the + * offload. + * @return 0 on success, otherwise error value. + * @destroy_conn: stops an offloaded connection + * @param cdev + * @param handle - the connection handle. + * @return 0 on success, otherwise error value. + * @clear_sq: clear all task in sq + * @param cdev + * @param handle - the connection handle. + * @return 0 on success, otherwise error value. + * @get_stats: iSCSI related statistics + * @param cdev + * @param stats - pointer to struck that would be filled + * we stats + * @return 0 on success, error otherwise. + * @change_mac Change MAC of interface + * @param cdev + * @param handle - the connection handle. + * @param mac - new MAC to configure. + * @return 0 on success, otherwise error value. + */ +struct qed_iscsi_ops { + const struct qed_common_ops *common; + + const struct qed_ll2_ops *ll2; + + int (*fill_dev_info)(struct qed_dev *cdev, + struct qed_dev_iscsi_info *info); + + void (*register_ops)(struct qed_dev *cdev, + struct qed_iscsi_cb_ops *ops, void *cookie); + + int (*start)(struct qed_dev *cdev, + struct qed_iscsi_tid *tasks, + void *event_context, iscsi_event_cb_t async_event_cb); + + int (*stop)(struct qed_dev *cdev); + + int (*acquire_conn)(struct qed_dev *cdev, + u32 *handle, + u32 *fw_cid, void __iomem **p_doorbell); + + int (*release_conn)(struct qed_dev *cdev, u32 handle); + + int (*offload_conn)(struct qed_dev *cdev, + u32 handle, + struct qed_iscsi_params_offload *conn_info); + + int (*update_conn)(struct qed_dev *cdev, + u32 handle, + struct qed_iscsi_params_update *conn_info); + + int (*destroy_conn)(struct qed_dev *cdev, u32 handle, u8 abrt_conn); + + int (*clear_sq)(struct qed_dev *cdev, u32 handle); + + int (*get_stats)(struct qed_dev *cdev, + struct qed_iscsi_stats *stats); + + int (*change_mac)(struct qed_dev *cdev, u32 handle, const u8 *mac); +}; + +const struct qed_iscsi_ops *qed_get_iscsi_ops(void); +void qed_put_iscsi_ops(void); +#endif diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h new file mode 100644 index 0000000..266c1fb --- /dev/null +++ b/include/linux/qed/qed_ll2_if.h @@ -0,0 +1,299 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. 
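 *
 * (Usage note for the qed_iscsi_ops interface declared in qed_iscsi_if.h
 * above: a storage driver would typically bind to qed roughly as below;
 * the local variable names are illustrative.)
 *
 *	const struct qed_iscsi_ops *ops = qed_get_iscsi_ops();
 *
 *	if (!ops)
 *		return -EINVAL;
 *	rc = ops->fill_dev_info(cdev, &iscsi_info);
 *	if (!rc)
 *		ops->register_ops(cdev, &cb_ops, driver_ctx);
 *	...
 *	qed_put_iscsi_ops();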
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _QED_LL2_IF_H +#define _QED_LL2_IF_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum qed_ll2_conn_type { + QED_LL2_TYPE_FCOE, + QED_LL2_TYPE_ISCSI, + QED_LL2_TYPE_TEST, + QED_LL2_TYPE_OOO, + QED_LL2_TYPE_RESERVED2, + QED_LL2_TYPE_ROCE, + QED_LL2_TYPE_IWARP, + QED_LL2_TYPE_RESERVED3, + MAX_QED_LL2_RX_CONN_TYPE +}; + +enum qed_ll2_roce_flavor_type { + QED_LL2_ROCE, + QED_LL2_RROCE, + MAX_QED_LL2_ROCE_FLAVOR_TYPE +}; + +enum qed_ll2_tx_dest { + QED_LL2_TX_DEST_NW, /* Light L2 TX Destination to the Network */ + QED_LL2_TX_DEST_LB, /* Light L2 TX Destination to the Loopback */ + QED_LL2_TX_DEST_DROP, /* Light L2 Drop the TX packet */ + QED_LL2_TX_DEST_MAX +}; + +enum qed_ll2_error_handle { + QED_LL2_DROP_PACKET, + QED_LL2_DO_NOTHING, + QED_LL2_ASSERT, +}; + +struct qed_ll2_stats { + u64 gsi_invalid_hdr; + u64 gsi_invalid_pkt_length; + u64 gsi_unsupported_pkt_typ; + u64 gsi_crcchksm_error; + + u64 packet_too_big_discard; + u64 no_buff_discard; + + u64 rcv_ucast_bytes; + u64 rcv_mcast_bytes; + u64 rcv_bcast_bytes; + u64 rcv_ucast_pkts; + u64 rcv_mcast_pkts; + u64 rcv_bcast_pkts; + + u64 sent_ucast_bytes; + u64 sent_mcast_bytes; + u64 sent_bcast_bytes; + u64 sent_ucast_pkts; + u64 sent_mcast_pkts; + u64 sent_bcast_pkts; +}; + +struct qed_ll2_comp_rx_data { + void *cookie; + dma_addr_t rx_buf_addr; + u16 parse_flags; + u16 err_flags; + u16 vlan; + bool b_last_packet; + u8 connection_handle; + + union { + u16 packet_length; + u16 data_length; + } length; + + u32 opaque_data_0; + u32 opaque_data_1; + + /* GSI only */ + u32 src_qp; + u16 qp_id; + + union { + u8 placement_offset; + u8 data_length_error; + } u; +}; + +typedef +void (*qed_ll2_complete_rx_packet_cb)(void *cxt, + struct qed_ll2_comp_rx_data *data); + +typedef +void (*qed_ll2_release_rx_packet_cb)(void *cxt, + u8 connection_handle, + void *cookie, + dma_addr_t rx_buf_addr, + bool b_last_packet); + +typedef +void (*qed_ll2_complete_tx_packet_cb)(void *cxt, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, + bool b_last_packet); + +typedef +void (*qed_ll2_release_tx_packet_cb)(void *cxt, + u8 connection_handle, + void *cookie, + dma_addr_t first_frag_addr, + bool b_last_fragment, bool b_last_packet); + +typedef +void (*qed_ll2_slowpath_cb)(void *cxt, u8 connection_handle, + u32 opaque_data_0, u32 opaque_data_1); + +struct qed_ll2_cbs { + qed_ll2_complete_rx_packet_cb rx_comp_cb; + qed_ll2_release_rx_packet_cb rx_release_cb; + qed_ll2_complete_tx_packet_cb tx_comp_cb; + qed_ll2_release_tx_packet_cb tx_release_cb; + qed_ll2_slowpath_cb slowpath_cb; + void *cookie; +}; + +struct qed_ll2_acquire_data_inputs { + enum qed_ll2_conn_type conn_type; + u16 mtu; + u16 rx_num_desc; + u16 rx_num_ooo_buffers; + u8 rx_drop_ttl0_flg; + u8 rx_vlan_removal_en; + u16 tx_num_desc; + u8 tx_max_bds_per_packet; + u8 tx_tc; + enum qed_ll2_tx_dest tx_dest; + enum qed_ll2_error_handle ai_err_packet_too_big; + enum qed_ll2_error_handle ai_err_no_buf; + bool 
secondary_queue; + u8 gsi_enable; +}; + +struct qed_ll2_acquire_data { + struct qed_ll2_acquire_data_inputs input; + const struct qed_ll2_cbs *cbs; + + /* Output container for LL2 connection's handle */ + u8 *p_connection_handle; +}; + +struct qed_ll2_tx_pkt_info { + void *cookie; + dma_addr_t first_frag; + enum qed_ll2_tx_dest tx_dest; + enum qed_ll2_roce_flavor_type qed_roce_flavor; + u16 vlan; + u16 l4_hdr_offset_w; /* from start of packet */ + u16 first_frag_len; + u8 num_of_bds; + u8 bd_flags; + bool enable_ip_cksum; + bool enable_l4_cksum; + bool calc_ip_len; +}; + +#define QED_LL2_UNUSED_HANDLE (0xff) + +struct qed_ll2_cb_ops { + int (*rx_cb)(void *, struct sk_buff *, u32, u32); + int (*tx_cb)(void *, struct sk_buff *, bool); +}; + +struct qed_ll2_params { + u16 mtu; + bool drop_ttl0_packets; + bool rx_vlan_stripping; + u8 tx_tc; + bool frags_mapped; + u8 ll2_mac_address[ETH_ALEN]; +}; + +struct qed_ll2_ops { +/** + * @brief start - initializes ll2 + * + * @param cdev + * @param params - protocol driver configuration for the ll2. + * + * @return 0 on success, otherwise error value. + */ + int (*start)(struct qed_dev *cdev, struct qed_ll2_params *params); + +/** + * @brief stop - stops the ll2 + * + * @param cdev + * + * @return 0 on success, otherwise error value. + */ + int (*stop)(struct qed_dev *cdev); + +/** + * @brief start_xmit - transmits an skb over the ll2 interface + * + * @param cdev + * @param skb + * + * @return 0 on success, otherwise error value. + */ + int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb); + +/** + * @brief register_cb_ops - protocol driver register the callback for Rx/Tx + * packets. Should be called before `start'. + * + * @param cdev + * @param cookie - to be passed to the callback functions. + * @param ops - the callback functions to register for Rx / Tx. + * + * @return 0 on success, otherwise error value. + */ + void (*register_cb_ops)(struct qed_dev *cdev, + const struct qed_ll2_cb_ops *ops, + void *cookie); + +/** + * @brief get LL2 related statistics + * + * @param cdev + * @param stats - pointer to struct that would be filled with stats + * + * @return 0 on success, error otherwise. + */ + int (*get_stats)(struct qed_dev *cdev, struct qed_ll2_stats *stats); +}; + +#ifdef CONFIG_QED_LL2 +int qed_ll2_alloc_if(struct qed_dev *); +void qed_ll2_dealloc_if(struct qed_dev *); +#else +static const struct qed_ll2_ops qed_ll2_ops_pass = { + .start = NULL, + .stop = NULL, + .start_xmit = NULL, + .register_cb_ops = NULL, + .get_stats = NULL, +}; + +static inline int qed_ll2_alloc_if(struct qed_dev *cdev) +{ + return 0; +} + +static inline void qed_ll2_dealloc_if(struct qed_dev *cdev) +{ +} +#endif +#endif diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h new file mode 100644 index 0000000..4dd72ba --- /dev/null +++ b/include/linux/qed/qed_rdma_if.h @@ -0,0 +1,696 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
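 *
 * (Usage note for the LL2 acquire interface declared in qed_ll2_if.h above:
 * the caller fills qed_ll2_acquire_data with its input parameters and
 * callbacks and receives the connection handle back through
 * p_connection_handle. A rough sketch with illustrative variable names:)
 *
 *	struct qed_ll2_acquire_data data = {};
 *	u8 handle = QED_LL2_UNUSED_HANDLE;
 *
 *	data.input.conn_type = QED_LL2_TYPE_ROCE;
 *	data.input.mtu = mtu;
 *	data.input.rx_num_desc = rx_desc;
 *	data.input.tx_num_desc = tx_desc;
 *	data.cbs = &my_ll2_cbs;
 *	data.p_connection_handle = &handle;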
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _QED_RDMA_IF_H +#define _QED_RDMA_IF_H +#include +#include +#include +#include +#include +#include +#include + +enum qed_roce_ll2_tx_dest { + /* Light L2 TX Destination to the Network */ + QED_ROCE_LL2_TX_DEST_NW, + + /* Light L2 TX Destination to the Loopback */ + QED_ROCE_LL2_TX_DEST_LB, + QED_ROCE_LL2_TX_DEST_MAX +}; + +#define QED_RDMA_MAX_CNQ_SIZE (0xFFFF) + +/* rdma interface */ + +enum qed_roce_qp_state { + QED_ROCE_QP_STATE_RESET, + QED_ROCE_QP_STATE_INIT, + QED_ROCE_QP_STATE_RTR, + QED_ROCE_QP_STATE_RTS, + QED_ROCE_QP_STATE_SQD, + QED_ROCE_QP_STATE_ERR, + QED_ROCE_QP_STATE_SQE +}; + +enum qed_rdma_tid_type { + QED_RDMA_TID_REGISTERED_MR, + QED_RDMA_TID_FMR, + QED_RDMA_TID_MW_TYPE1, + QED_RDMA_TID_MW_TYPE2A +}; + +struct qed_rdma_events { + void *context; + void (*affiliated_event)(void *context, u8 fw_event_code, + void *fw_handle); + void (*unaffiliated_event)(void *context, u8 event_code); +}; + +struct qed_rdma_device { + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + u64 fw_ver; + + u64 node_guid; + u64 sys_image_guid; + + u8 max_cnq; + u8 max_sge; + u8 max_srq_sge; + u16 max_inline; + u32 max_wqe; + u32 max_srq_wqe; + u8 max_qp_resp_rd_atomic_resc; + u8 max_qp_req_rd_atomic_resc; + u64 max_dev_resp_rd_atomic_resc; + u32 max_cq; + u32 max_qp; + u32 max_srq; + u32 max_mr; + u64 max_mr_size; + u32 max_cqe; + u32 max_mw; + u32 max_fmr; + u32 max_mr_mw_fmr_pbl; + u64 max_mr_mw_fmr_size; + u32 max_pd; + u32 max_ah; + u8 max_pkey; + u16 max_srq_wr; + u8 max_stats_queues; + u32 dev_caps; + + /* Abilty to support RNR-NAK generation */ + +#define QED_RDMA_DEV_CAP_RNR_NAK_MASK 0x1 +#define QED_RDMA_DEV_CAP_RNR_NAK_SHIFT 0 + /* Abilty to support shutdown port */ +#define QED_RDMA_DEV_CAP_SHUTDOWN_PORT_MASK 0x1 +#define QED_RDMA_DEV_CAP_SHUTDOWN_PORT_SHIFT 1 + /* Abilty to support port active event */ +#define QED_RDMA_DEV_CAP_PORT_ACTIVE_EVENT_MASK 0x1 +#define QED_RDMA_DEV_CAP_PORT_ACTIVE_EVENT_SHIFT 2 + /* Abilty to support port change event */ +#define QED_RDMA_DEV_CAP_PORT_CHANGE_EVENT_MASK 0x1 +#define QED_RDMA_DEV_CAP_PORT_CHANGE_EVENT_SHIFT 3 + /* Abilty to support system image GUID */ +#define QED_RDMA_DEV_CAP_SYS_IMAGE_MASK 0x1 +#define QED_RDMA_DEV_CAP_SYS_IMAGE_SHIFT 4 + /* Abilty to support bad P_Key counter support */ +#define 
QED_RDMA_DEV_CAP_BAD_PKEY_CNT_MASK 0x1 +#define QED_RDMA_DEV_CAP_BAD_PKEY_CNT_SHIFT 5 + /* Abilty to support atomic operations */ +#define QED_RDMA_DEV_CAP_ATOMIC_OP_MASK 0x1 +#define QED_RDMA_DEV_CAP_ATOMIC_OP_SHIFT 6 +#define QED_RDMA_DEV_CAP_RESIZE_CQ_MASK 0x1 +#define QED_RDMA_DEV_CAP_RESIZE_CQ_SHIFT 7 + /* Abilty to support modifying the maximum number of + * outstanding work requests per QP + */ +#define QED_RDMA_DEV_CAP_RESIZE_MAX_WR_MASK 0x1 +#define QED_RDMA_DEV_CAP_RESIZE_MAX_WR_SHIFT 8 + /* Abilty to support automatic path migration */ +#define QED_RDMA_DEV_CAP_AUTO_PATH_MIG_MASK 0x1 +#define QED_RDMA_DEV_CAP_AUTO_PATH_MIG_SHIFT 9 + /* Abilty to support the base memory management extensions */ +#define QED_RDMA_DEV_CAP_BASE_MEMORY_EXT_MASK 0x1 +#define QED_RDMA_DEV_CAP_BASE_MEMORY_EXT_SHIFT 10 +#define QED_RDMA_DEV_CAP_BASE_QUEUE_EXT_MASK 0x1 +#define QED_RDMA_DEV_CAP_BASE_QUEUE_EXT_SHIFT 11 + /* Abilty to support multipile page sizes per memory region */ +#define QED_RDMA_DEV_CAP_MULTI_PAGE_PER_MR_EXT_MASK 0x1 +#define QED_RDMA_DEV_CAP_MULTI_PAGE_PER_MR_EXT_SHIFT 12 + /* Abilty to support block list physical buffer list */ +#define QED_RDMA_DEV_CAP_BLOCK_MODE_MASK 0x1 +#define QED_RDMA_DEV_CAP_BLOCK_MODE_SHIFT 13 + /* Abilty to support zero based virtual addresses */ +#define QED_RDMA_DEV_CAP_ZBVA_MASK 0x1 +#define QED_RDMA_DEV_CAP_ZBVA_SHIFT 14 + /* Abilty to support local invalidate fencing */ +#define QED_RDMA_DEV_CAP_LOCAL_INV_FENCE_MASK 0x1 +#define QED_RDMA_DEV_CAP_LOCAL_INV_FENCE_SHIFT 15 + /* Abilty to support Loopback on QP */ +#define QED_RDMA_DEV_CAP_LB_INDICATOR_MASK 0x1 +#define QED_RDMA_DEV_CAP_LB_INDICATOR_SHIFT 16 + u64 page_size_caps; + u8 dev_ack_delay; + u32 reserved_lkey; + u32 bad_pkey_counter; + struct qed_rdma_events events; +}; + +enum qed_port_state { + QED_RDMA_PORT_UP, + QED_RDMA_PORT_DOWN, +}; + +enum qed_roce_capability { + QED_ROCE_V1 = 1 << 0, + QED_ROCE_V2 = 1 << 1, +}; + +struct qed_rdma_port { + enum qed_port_state port_state; + int link_speed; + u64 max_msg_size; + u8 source_gid_table_len; + void *source_gid_table_ptr; + u8 pkey_table_len; + void *pkey_table_ptr; + u32 pkey_bad_counter; + enum qed_roce_capability capability; +}; + +struct qed_rdma_cnq_params { + u8 num_pbl_pages; + u64 pbl_ptr; +}; + +/* The CQ Mode affects the CQ doorbell transaction size. + * 64/32 bit machines should configure to 32/16 bits respectively. 
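 * For example, a 64-bit host would normally select QED_RDMA_CQ_MODE_32_BITS
 * and a 32-bit host QED_RDMA_CQ_MODE_16_BITS.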
+ */ +enum qed_rdma_cq_mode { + QED_RDMA_CQ_MODE_16_BITS, + QED_RDMA_CQ_MODE_32_BITS, +}; + +struct qed_roce_dcqcn_params { + u8 notification_point; + u8 reaction_point; + + /* fields for notification point */ + u32 cnp_send_timeout; + + /* fields for reaction point */ + u32 rl_bc_rate; + u16 rl_max_rate; + u16 rl_r_ai; + u16 rl_r_hai; + u16 dcqcn_g; + u32 dcqcn_k_us; + u32 dcqcn_timeout_us; +}; + +struct qed_rdma_start_in_params { + struct qed_rdma_events *events; + struct qed_rdma_cnq_params cnq_pbl_list[128]; + u8 desired_cnq; + enum qed_rdma_cq_mode cq_mode; + struct qed_roce_dcqcn_params dcqcn_params; + u16 max_mtu; + u8 mac_addr[ETH_ALEN]; + u8 iwarp_flags; +}; + +struct qed_rdma_add_user_out_params { + u16 dpi; + u64 dpi_addr; + u64 dpi_phys_addr; + u32 dpi_size; + u16 wid_count; +}; + +enum roce_mode { + ROCE_V1, + ROCE_V2_IPV4, + ROCE_V2_IPV6, + MAX_ROCE_MODE +}; + +union qed_gid { + u8 bytes[16]; + u16 words[8]; + u32 dwords[4]; + u64 qwords[2]; + u32 ipv4_addr; +}; + +struct qed_rdma_register_tid_in_params { + u32 itid; + enum qed_rdma_tid_type tid_type; + u8 key; + u16 pd; + bool local_read; + bool local_write; + bool remote_read; + bool remote_write; + bool remote_atomic; + bool mw_bind; + u64 pbl_ptr; + bool pbl_two_level; + u8 pbl_page_size_log; + u8 page_size_log; + u32 fbo; + u64 length; + u64 vaddr; + bool zbva; + bool phy_mr; + bool dma_mr; + + bool dif_enabled; + u64 dif_error_addr; + u64 dif_runt_addr; +}; + +struct qed_rdma_create_cq_in_params { + u32 cq_handle_lo; + u32 cq_handle_hi; + u32 cq_size; + u16 dpi; + bool pbl_two_level; + u64 pbl_ptr; + u16 pbl_num_pages; + u8 pbl_page_size_log; + u8 cnq_id; + u16 int_timeout; +}; + +struct qed_rdma_create_srq_in_params { + u64 pbl_base_addr; + u64 prod_pair_addr; + u16 num_pages; + u16 pd_id; + u16 page_size; +}; + +struct qed_rdma_destroy_cq_in_params { + u16 icid; +}; + +struct qed_rdma_destroy_cq_out_params { + u16 num_cq_notif; +}; + +struct qed_rdma_create_qp_in_params { + u32 qp_handle_lo; + u32 qp_handle_hi; + u32 qp_handle_async_lo; + u32 qp_handle_async_hi; + bool use_srq; + bool signal_all; + bool fmr_and_reserved_lkey; + u16 pd; + u16 dpi; + u16 sq_cq_id; + u16 sq_num_pages; + u64 sq_pbl_ptr; + u8 max_sq_sges; + u16 rq_cq_id; + u16 rq_num_pages; + u64 rq_pbl_ptr; + u16 srq_id; + u8 stats_queue; +}; + +struct qed_rdma_create_qp_out_params { + u32 qp_id; + u16 icid; + void *rq_pbl_virt; + dma_addr_t rq_pbl_phys; + void *sq_pbl_virt; + dma_addr_t sq_pbl_phys; +}; + +struct qed_rdma_modify_qp_in_params { + u32 modify_flags; +#define QED_RDMA_MODIFY_QP_VALID_NEW_STATE_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_NEW_STATE_SHIFT 0 +#define QED_ROCE_MODIFY_QP_VALID_PKEY_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_PKEY_SHIFT 1 +#define QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN_SHIFT 2 +#define QED_ROCE_MODIFY_QP_VALID_DEST_QP_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_DEST_QP_SHIFT 3 +#define QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR_SHIFT 4 +#define QED_ROCE_MODIFY_QP_VALID_RQ_PSN_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_RQ_PSN_SHIFT 5 +#define QED_ROCE_MODIFY_QP_VALID_SQ_PSN_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_SQ_PSN_SHIFT 6 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ_SHIFT 7 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP_SHIFT 8 +#define 
QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT_SHIFT 9 +#define QED_ROCE_MODIFY_QP_VALID_RETRY_CNT_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_RETRY_CNT_SHIFT 10 +#define QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT_SHIFT 11 +#define QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER_SHIFT 12 +#define QED_ROCE_MODIFY_QP_VALID_E2E_FLOW_CONTROL_EN_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_E2E_FLOW_CONTROL_EN_SHIFT 13 +#define QED_ROCE_MODIFY_QP_VALID_ROCE_MODE_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_ROCE_MODE_SHIFT 14 + + enum qed_roce_qp_state new_state; + u16 pkey; + bool incoming_rdma_read_en; + bool incoming_rdma_write_en; + bool incoming_atomic_en; + bool e2e_flow_control_en; + u32 dest_qp; + bool lb_indication; + u16 mtu; + u8 traffic_class_tos; + u8 hop_limit_ttl; + u32 flow_label; + union qed_gid sgid; + union qed_gid dgid; + u16 udp_src_port; + + u16 vlan_id; + + u32 rq_psn; + u32 sq_psn; + u8 max_rd_atomic_resp; + u8 max_rd_atomic_req; + u32 ack_timeout; + u8 retry_cnt; + u8 rnr_retry_cnt; + u8 min_rnr_nak_timer; + bool sqd_async; + u8 remote_mac_addr[6]; + u8 local_mac_addr[6]; + bool use_local_mac; + enum roce_mode roce_mode; +}; + +struct qed_rdma_query_qp_out_params { + enum qed_roce_qp_state state; + u32 rq_psn; + u32 sq_psn; + bool draining; + u16 mtu; + u32 dest_qp; + bool incoming_rdma_read_en; + bool incoming_rdma_write_en; + bool incoming_atomic_en; + bool e2e_flow_control_en; + union qed_gid sgid; + union qed_gid dgid; + u32 flow_label; + u8 hop_limit_ttl; + u8 traffic_class_tos; + u32 timeout; + u8 rnr_retry; + u8 retry_cnt; + u8 min_rnr_nak_timer; + u16 pkey_index; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + bool sqd_async; +}; + +struct qed_rdma_create_srq_out_params { + u16 srq_id; +}; + +struct qed_rdma_destroy_srq_in_params { + u16 srq_id; +}; + +struct qed_rdma_modify_srq_in_params { + u32 wqe_limit; + u16 srq_id; +}; + +struct qed_rdma_stats_out_params { + u64 sent_bytes; + u64 sent_pkts; + u64 rcv_bytes; + u64 rcv_pkts; +}; + +struct qed_rdma_counters_out_params { + u64 pd_count; + u64 max_pd; + u64 dpi_count; + u64 max_dpi; + u64 cq_count; + u64 max_cq; + u64 qp_count; + u64 max_qp; + u64 tid_count; + u64 max_tid; +}; + +#define QED_ROCE_TX_HEAD_FAILURE (1) +#define QED_ROCE_TX_FRAG_FAILURE (2) + +enum qed_iwarp_event_type { + QED_IWARP_EVENT_MPA_REQUEST, /* Passive side request received */ + QED_IWARP_EVENT_PASSIVE_COMPLETE, /* ack on mpa response */ + QED_IWARP_EVENT_ACTIVE_COMPLETE, /* Active side reply received */ + QED_IWARP_EVENT_DISCONNECT, + QED_IWARP_EVENT_CLOSE, + QED_IWARP_EVENT_IRQ_FULL, + QED_IWARP_EVENT_RQ_EMPTY, + QED_IWARP_EVENT_LLP_TIMEOUT, + QED_IWARP_EVENT_REMOTE_PROTECTION_ERROR, + QED_IWARP_EVENT_CQ_OVERFLOW, + QED_IWARP_EVENT_QP_CATASTROPHIC, + QED_IWARP_EVENT_ACTIVE_MPA_REPLY, + QED_IWARP_EVENT_LOCAL_ACCESS_ERROR, + QED_IWARP_EVENT_REMOTE_OPERATION_ERROR, + QED_IWARP_EVENT_TERMINATE_RECEIVED +}; + +enum qed_tcp_ip_version { + QED_TCP_IPV4, + QED_TCP_IPV6, +}; + +struct qed_iwarp_cm_info { + enum qed_tcp_ip_version ip_version; + u32 remote_ip[4]; + u32 local_ip[4]; + u16 remote_port; + u16 local_port; + u16 vlan; + u8 ord; + u8 ird; + u16 private_data_len; + const void *private_data; +}; + +struct qed_iwarp_cm_event_params { + enum qed_iwarp_event_type event; + const struct qed_iwarp_cm_info *cm_info; + void *ep_context; /* To be passed to accept call */ + int status; +}; + +typedef int 
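/*
 * Illustration for the modify_flags bits above (sketch only; `in_params'
 * is a caller-owned struct qed_rdma_modify_qp_in_params, and SET_FIELD
 * comes from qed_if.h):
 *
 *	memset(&in_params, 0, sizeof(in_params));
 *	SET_FIELD(in_params.modify_flags,
 *		  QED_RDMA_MODIFY_QP_VALID_NEW_STATE, 1);
 *	in_params.new_state = QED_ROCE_QP_STATE_RTR;
 *	SET_FIELD(in_params.modify_flags,
 *		  QED_ROCE_MODIFY_QP_VALID_RQ_PSN, 1);
 *	in_params.rq_psn = rq_psn;
 */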
(*iwarp_event_handler) (void *context, + struct qed_iwarp_cm_event_params *event); + +struct qed_iwarp_connect_in { + iwarp_event_handler event_cb; + void *cb_context; + struct qed_rdma_qp *qp; + struct qed_iwarp_cm_info cm_info; + u16 mss; + u8 remote_mac_addr[ETH_ALEN]; + u8 local_mac_addr[ETH_ALEN]; +}; + +struct qed_iwarp_connect_out { + void *ep_context; +}; + +struct qed_iwarp_listen_in { + iwarp_event_handler event_cb; + void *cb_context; /* passed to event_cb */ + u32 max_backlog; + enum qed_tcp_ip_version ip_version; + u32 ip_addr[4]; + u16 port; + u16 vlan; +}; + +struct qed_iwarp_listen_out { + void *handle; +}; + +struct qed_iwarp_accept_in { + void *ep_context; + void *cb_context; + struct qed_rdma_qp *qp; + const void *private_data; + u16 private_data_len; + u8 ord; + u8 ird; +}; + +struct qed_iwarp_reject_in { + void *ep_context; + void *cb_context; + const void *private_data; + u16 private_data_len; +}; + +struct qed_iwarp_send_rtr_in { + void *ep_context; +}; + +struct qed_roce_ll2_header { + void *vaddr; + dma_addr_t baddr; + size_t len; +}; + +struct qed_roce_ll2_buffer { + dma_addr_t baddr; + size_t len; +}; + +struct qed_roce_ll2_packet { + struct qed_roce_ll2_header header; + int n_seg; + struct qed_roce_ll2_buffer payload[RDMA_MAX_SGE_PER_SQ_WQE]; + int roce_mode; + enum qed_roce_ll2_tx_dest tx_dest; +}; + +enum qed_rdma_type { + QED_RDMA_TYPE_ROCE, + QED_RDMA_TYPE_IWARP +}; + +struct qed_dev_rdma_info { + struct qed_dev_info common; + enum qed_rdma_type rdma_type; + u8 user_dpm_enabled; +}; + +struct qed_rdma_ops { + const struct qed_common_ops *common; + + int (*fill_dev_info)(struct qed_dev *cdev, + struct qed_dev_rdma_info *info); + void *(*rdma_get_rdma_ctx)(struct qed_dev *cdev); + + int (*rdma_init)(struct qed_dev *dev, + struct qed_rdma_start_in_params *iparams); + + int (*rdma_add_user)(void *rdma_cxt, + struct qed_rdma_add_user_out_params *oparams); + + void (*rdma_remove_user)(void *rdma_cxt, u16 dpi); + int (*rdma_stop)(void *rdma_cxt); + struct qed_rdma_device* (*rdma_query_device)(void *rdma_cxt); + struct qed_rdma_port* (*rdma_query_port)(void *rdma_cxt); + int (*rdma_get_start_sb)(struct qed_dev *cdev); + int (*rdma_get_min_cnq_msix)(struct qed_dev *cdev); + void (*rdma_cnq_prod_update)(void *rdma_cxt, u8 cnq_index, u16 prod); + int (*rdma_get_rdma_int)(struct qed_dev *cdev, + struct qed_int_info *info); + int (*rdma_set_rdma_int)(struct qed_dev *cdev, u16 cnt); + int (*rdma_alloc_pd)(void *rdma_cxt, u16 *pd); + void (*rdma_dealloc_pd)(void *rdma_cxt, u16 pd); + int (*rdma_create_cq)(void *rdma_cxt, + struct qed_rdma_create_cq_in_params *params, + u16 *icid); + int (*rdma_destroy_cq)(void *rdma_cxt, + struct qed_rdma_destroy_cq_in_params *iparams, + struct qed_rdma_destroy_cq_out_params *oparams); + struct qed_rdma_qp * + (*rdma_create_qp)(void *rdma_cxt, + struct qed_rdma_create_qp_in_params *iparams, + struct qed_rdma_create_qp_out_params *oparams); + + int (*rdma_modify_qp)(void *roce_cxt, struct qed_rdma_qp *qp, + struct qed_rdma_modify_qp_in_params *iparams); + + int (*rdma_query_qp)(void *rdma_cxt, struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *oparams); + int (*rdma_destroy_qp)(void *rdma_cxt, struct qed_rdma_qp *qp); + + int + (*rdma_register_tid)(void *rdma_cxt, + struct qed_rdma_register_tid_in_params *iparams); + + int (*rdma_deregister_tid)(void *rdma_cxt, u32 itid); + int (*rdma_alloc_tid)(void *rdma_cxt, u32 *itid); + void (*rdma_free_tid)(void *rdma_cxt, u32 itid); + + int (*ll2_acquire_connection)(void *rdma_cxt, + 
struct qed_ll2_acquire_data *data); + + int (*ll2_establish_connection)(void *rdma_cxt, u8 connection_handle); + int (*ll2_terminate_connection)(void *rdma_cxt, u8 connection_handle); + void (*ll2_release_connection)(void *rdma_cxt, u8 connection_handle); + + int (*ll2_prepare_tx_packet)(void *rdma_cxt, + u8 connection_handle, + struct qed_ll2_tx_pkt_info *pkt, + bool notify_fw); + + int (*ll2_set_fragment_of_tx_packet)(void *rdma_cxt, + u8 connection_handle, + dma_addr_t addr, + u16 nbytes); + int (*ll2_post_rx_buffer)(void *rdma_cxt, u8 connection_handle, + dma_addr_t addr, u16 buf_len, void *cookie, + u8 notify_fw); + int (*ll2_get_stats)(void *rdma_cxt, + u8 connection_handle, + struct qed_ll2_stats *p_stats); + int (*ll2_set_mac_filter)(struct qed_dev *cdev, + u8 *old_mac_address, u8 *new_mac_address); + + int (*iwarp_connect)(void *rdma_cxt, + struct qed_iwarp_connect_in *iparams, + struct qed_iwarp_connect_out *oparams); + + int (*iwarp_create_listen)(void *rdma_cxt, + struct qed_iwarp_listen_in *iparams, + struct qed_iwarp_listen_out *oparams); + + int (*iwarp_accept)(void *rdma_cxt, + struct qed_iwarp_accept_in *iparams); + + int (*iwarp_reject)(void *rdma_cxt, + struct qed_iwarp_reject_in *iparams); + + int (*iwarp_destroy_listen)(void *rdma_cxt, void *handle); + + int (*iwarp_send_rtr)(void *rdma_cxt, + struct qed_iwarp_send_rtr_in *iparams); +}; + +const struct qed_rdma_ops *qed_get_rdma_ops(void); + +#endif diff --git a/include/linux/qed/qede_rdma.h b/include/linux/qed/qede_rdma.h new file mode 100644 index 0000000..9904617 --- /dev/null +++ b/include/linux/qed/qede_rdma.h @@ -0,0 +1,94 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
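 *
 * (Usage note for the qed_rdma_ops interface declared in qed_rdma_if.h
 * above: an RDMA driver first resolves the ops table and queries device
 * information. A rough sketch with illustrative variable names:)
 *
 *	const struct qed_rdma_ops *qed_ops = qed_get_rdma_ops();
 *
 *	if (!qed_ops)
 *		return -EINVAL;
 *	rc = qed_ops->fill_dev_info(cdev, &dev_info);
 *	if (rc)
 *		return rc;
 *	rdma_cxt = qed_ops->rdma_get_rdma_ctx(cdev);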
+ */ +#ifndef QEDE_ROCE_H +#define QEDE_ROCE_H + +#include +#include +#include +#include + +struct qedr_dev; +struct qed_dev; +struct qede_dev; + +enum qede_rdma_event { + QEDE_UP, + QEDE_DOWN, + QEDE_CHANGE_ADDR, + QEDE_CLOSE +}; + +struct qede_rdma_event_work { + struct list_head list; + struct work_struct work; + void *ptr; + enum qede_rdma_event event; +}; + +struct qedr_driver { + unsigned char name[32]; + + struct qedr_dev* (*add)(struct qed_dev *, struct pci_dev *, + struct net_device *); + + void (*remove)(struct qedr_dev *); + void (*notify)(struct qedr_dev *, enum qede_rdma_event); +}; + +/* APIs for RDMA driver to register callback handlers, + * which will be invoked when device is added, removed, ifup, ifdown + */ +int qede_rdma_register_driver(struct qedr_driver *drv); +void qede_rdma_unregister_driver(struct qedr_driver *drv); + +bool qede_rdma_supported(struct qede_dev *dev); + +#if IS_ENABLED(CONFIG_QED_RDMA) +int qede_rdma_dev_add(struct qede_dev *dev); +void qede_rdma_dev_event_open(struct qede_dev *dev); +void qede_rdma_dev_event_close(struct qede_dev *dev); +void qede_rdma_dev_remove(struct qede_dev *dev); +void qede_rdma_event_changeaddr(struct qede_dev *edr); + +#else +static inline int qede_rdma_dev_add(struct qede_dev *dev) +{ + return 0; +} + +static inline void qede_rdma_dev_event_open(struct qede_dev *dev) {} +static inline void qede_rdma_dev_event_close(struct qede_dev *dev) {} +static inline void qede_rdma_dev_remove(struct qede_dev *dev) {} +static inline void qede_rdma_event_changeaddr(struct qede_dev *edr) {} +#endif +#endif diff --git a/include/linux/qed/rdma_common.h b/include/linux/qed/rdma_common.h new file mode 100644 index 0000000..480a57e --- /dev/null +++ b/include/linux/qed/rdma_common.h @@ -0,0 +1,73 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
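 *
 * (Usage note for the qedr_driver hooks declared in qede_rdma.h above: the
 * RDMA driver registers its callbacks once at module load. Sketch only;
 * the callback names are illustrative.)
 *
 *	static struct qedr_driver qedr_drv = {
 *		.name	= "qedr",
 *		.add	= qedr_add,
 *		.remove	= qedr_remove,
 *		.notify	= qedr_notify,
 *	};
 *
 *	rc = qede_rdma_register_driver(&qedr_drv);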
+ */ + +#ifndef __RDMA_COMMON__ +#define __RDMA_COMMON__ + +/************************/ +/* RDMA FW CONSTANTS */ +/************************/ + +#define RDMA_RESERVED_LKEY (0) +#define RDMA_RING_PAGE_SIZE (0x1000) + +#define RDMA_MAX_SGE_PER_SQ_WQE (4) +#define RDMA_MAX_SGE_PER_RQ_WQE (4) + +#define RDMA_MAX_DATA_SIZE_IN_WQE (0x80000000) + +#define RDMA_REQ_RD_ATOMIC_ELM_SIZE (0x50) +#define RDMA_RESP_RD_ATOMIC_ELM_SIZE (0x20) + +#define RDMA_MAX_CQS (64 * 1024) +#define RDMA_MAX_TIDS (128 * 1024 - 1) +#define RDMA_MAX_PDS (64 * 1024) +#define RDMA_MAX_XRC_SRQS (1024) +#define RDMA_MAX_SRQS (32 * 1024) + +#define RDMA_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS +#define RDMA_NUM_STATISTIC_COUNTERS_K2 MAX_NUM_VPORTS_K2 +#define RDMA_NUM_STATISTIC_COUNTERS_BB MAX_NUM_VPORTS_BB + +#define RDMA_TASK_TYPE (PROTOCOLID_ROCE) + +struct rdma_srq_id { + __le16 srq_idx; + __le16 opaque_fid; +}; + +struct rdma_srq_producers { + __le32 sge_prod; + __le32 wqe_prod; +}; + +#endif /* __RDMA_COMMON__ */ diff --git a/include/linux/qed/roce_common.h b/include/linux/qed/roce_common.h new file mode 100644 index 0000000..193bcef --- /dev/null +++ b/include/linux/qed/roce_common.h @@ -0,0 +1,68 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ROCE_COMMON__ +#define __ROCE_COMMON__ + +/************************/ +/* ROCE FW CONSTANTS */ +/************************/ + +#define ROCE_REQ_MAX_INLINE_DATA_SIZE (256) +#define ROCE_REQ_MAX_SINGLE_SQ_WQE_SIZE (288) + +#define ROCE_MAX_QPS (32 * 1024) +#define ROCE_DCQCN_NP_MAX_QPS (64) +#define ROCE_DCQCN_RP_MAX_QPS (64) + +/* Affiliated asynchronous events / errors enumeration */ +enum roce_async_events_type { + ROCE_ASYNC_EVENT_NONE = 0, + ROCE_ASYNC_EVENT_COMM_EST = 1, + ROCE_ASYNC_EVENT_SQ_DRAINED, + ROCE_ASYNC_EVENT_SRQ_LIMIT, + ROCE_ASYNC_EVENT_LAST_WQE_REACHED, + ROCE_ASYNC_EVENT_CQ_ERR, + ROCE_ASYNC_EVENT_LOCAL_INVALID_REQUEST_ERR, + ROCE_ASYNC_EVENT_LOCAL_CATASTROPHIC_ERR, + ROCE_ASYNC_EVENT_LOCAL_ACCESS_ERR, + ROCE_ASYNC_EVENT_QP_CATASTROPHIC_ERR, + ROCE_ASYNC_EVENT_CQ_OVERFLOW_ERR, + ROCE_ASYNC_EVENT_SRQ_EMPTY, + ROCE_ASYNC_EVENT_DESTROY_QP_DONE, + ROCE_ASYNC_EVENT_XRC_DOMAIN_ERR, + ROCE_ASYNC_EVENT_INVALID_XRCETH_ERR, + ROCE_ASYNC_EVENT_XRC_SRQ_CATASTROPHIC_ERR, + MAX_ROCE_ASYNC_EVENTS_TYPE +}; + +#endif /* __ROCE_COMMON__ */ diff --git a/include/linux/qed/storage_common.h b/include/linux/qed/storage_common.h new file mode 100644 index 0000000..505c0b4 --- /dev/null +++ b/include/linux/qed/storage_common.h @@ -0,0 +1,182 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __STORAGE_COMMON__ +#define __STORAGE_COMMON__ + +/*********************/ +/* SCSI CONSTANTS */ +/*********************/ + +#define SCSI_MAX_NUM_OF_CMDQS (NUM_OF_GLOBAL_QUEUES / 2) +#define BDQ_NUM_RESOURCES (4) + +#define BDQ_ID_RQ (0) +#define BDQ_ID_IMM_DATA (1) +#define BDQ_ID_TQ (2) +#define BDQ_NUM_IDS (3) + +#define SCSI_NUM_SGES_SLOW_SGL_THR 8 + +#define BDQ_MAX_EXTERNAL_RING_SIZE BIT(15) + +/* SCSI op codes */ +#define SCSI_OPCODE_COMPARE_AND_WRITE (0x89) +#define SCSI_OPCODE_READ_10 (0x28) +#define SCSI_OPCODE_WRITE_6 (0x0A) +#define SCSI_OPCODE_WRITE_10 (0x2A) +#define SCSI_OPCODE_WRITE_12 (0xAA) +#define SCSI_OPCODE_WRITE_16 (0x8A) +#define SCSI_OPCODE_WRITE_AND_VERIFY_10 (0x2E) +#define SCSI_OPCODE_WRITE_AND_VERIFY_12 (0xAE) +#define SCSI_OPCODE_WRITE_AND_VERIFY_16 (0x8E) + +/* iSCSI Drv opaque */ +struct iscsi_drv_opaque { + __le16 reserved_zero[3]; + __le16 opaque; +}; + +/* Scsi 2B/8B opaque union */ +union scsi_opaque { + struct regpair fcoe_opaque; + struct iscsi_drv_opaque iscsi_opaque; +}; + +/* SCSI buffer descriptor */ +struct scsi_bd { + struct regpair address; + union scsi_opaque opaque; +}; + +/* Scsi Drv BDQ struct */ +struct scsi_bdq_ram_drv_data { + __le16 external_producer; + __le16 reserved0[3]; +}; + +/* SCSI SGE entry */ +struct scsi_sge { + struct regpair sge_addr; + __le32 sge_len; + __le32 reserved; +}; + +/* Cached SGEs section */ +struct scsi_cached_sges { + struct scsi_sge sge[4]; +}; + +/* Scsi Drv CMDQ struct */ +struct scsi_drv_cmdq { + __le16 cmdq_cons; + __le16 reserved0; + __le32 reserved1; +}; + +/* Common SCSI init params passed by driver to FW in function init ramrod */ +struct scsi_init_func_params { + __le16 num_tasks; + u8 log_page_size; + u8 debug_mode; + u8 reserved2[12]; +}; + +/* SCSI RQ/CQ/CMDQ firmware function init parameters */ +struct scsi_init_func_queues { + struct regpair glbl_q_params_addr; + __le16 rq_buffer_size; + __le16 cq_num_entries; + __le16 cmdq_num_entries; + u8 bdq_resource_id; + u8 q_validity; +#define SCSI_INIT_FUNC_QUEUES_RQ_VALID_MASK 0x1 +#define SCSI_INIT_FUNC_QUEUES_RQ_VALID_SHIFT 0 +#define SCSI_INIT_FUNC_QUEUES_IMM_DATA_VALID_MASK 0x1 +#define SCSI_INIT_FUNC_QUEUES_IMM_DATA_VALID_SHIFT 1 +#define SCSI_INIT_FUNC_QUEUES_CMD_VALID_MASK 0x1 +#define SCSI_INIT_FUNC_QUEUES_CMD_VALID_SHIFT 2 +#define SCSI_INIT_FUNC_QUEUES_TQ_VALID_MASK 0x1 +#define SCSI_INIT_FUNC_QUEUES_TQ_VALID_SHIFT 3 +#define SCSI_INIT_FUNC_QUEUES_SOC_EN_MASK 0x1 +#define SCSI_INIT_FUNC_QUEUES_SOC_EN_SHIFT 4 +#define SCSI_INIT_FUNC_QUEUES_SOC_NUM_OF_BLOCKS_LOG_MASK 0x7 +#define SCSI_INIT_FUNC_QUEUES_SOC_NUM_OF_BLOCKS_LOG_SHIFT 5 + __le16 cq_cmdq_sb_num_arr[SCSI_MAX_NUM_OF_CMDQS]; + u8 num_queues; + u8 queue_relative_offset; + u8 cq_sb_pi; + u8 cmdq_sb_pi; + u8 bdq_pbl_num_entries[BDQ_NUM_IDS]; + u8 reserved1; + struct regpair bdq_pbl_base_address[BDQ_NUM_IDS]; + __le16 bdq_xoff_threshold[BDQ_NUM_IDS]; + __le16 cmdq_xoff_threshold; + __le16 bdq_xon_threshold[BDQ_NUM_IDS]; + __le16 cmdq_xon_threshold; +}; + +/* Scsi Drv BDQ Data struct (2 BDQ IDs: 0 - RQ, 1 - Immediate Data) */ +struct scsi_ram_per_bdq_resource_drv_data { + struct scsi_bdq_ram_drv_data drv_data_per_bdq_id[BDQ_NUM_IDS]; +}; + +/* SCSI SGL types */ +enum scsi_sgl_mode { + SCSI_TX_SLOW_SGL, + SCSI_FAST_SGL, + MAX_SCSI_SGL_MODE +}; + +/* SCSI SGL parameters */ +struct scsi_sgl_params { + struct regpair sgl_addr; + __le32 sgl_total_length; + __le32 sge_offset; + __le16 sgl_num_sges; + u8 sgl_index; + u8 reserved; +}; + +/* SCSI terminate connection params */ +struct 
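/*
 * Illustration for the q_validity bits above (sketch only; `q' points to a
 * caller-owned struct scsi_init_func_queues, and SET_FIELD is assumed to be
 * available from qed_if.h):
 *
 *	SET_FIELD(q->q_validity, SCSI_INIT_FUNC_QUEUES_RQ_VALID, 1);
 *	SET_FIELD(q->q_validity, SCSI_INIT_FUNC_QUEUES_CMD_VALID, 1);
 */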
scsi_terminate_extra_params { + __le16 unsolicited_cq_count; + __le16 cmdq_count; + u8 reserved[4]; +}; + +/* SCSI Task Queue Element */ +struct scsi_tqe { + __le16 itid; +}; + +#endif /* __STORAGE_COMMON__ */ diff --git a/include/linux/qed/tcp_common.h b/include/linux/qed/tcp_common.h new file mode 100644 index 0000000..4a48451 --- /dev/null +++ b/include/linux/qed/tcp_common.h @@ -0,0 +1,281 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2017 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __TCP_COMMON__ +#define __TCP_COMMON__ + +/********************/ +/* TCP FW CONSTANTS */ +/********************/ + +#define TCP_INVALID_TIMEOUT_VAL -1 + +/* OOO opaque data received from LL2 */ +struct ooo_opaque { + __le32 cid; + u8 drop_isle; + u8 drop_size; + u8 ooo_opcode; + u8 ooo_isle; +}; + +/* tcp connect mode enum */ +enum tcp_connect_mode { + TCP_CONNECT_ACTIVE, + TCP_CONNECT_PASSIVE, + MAX_TCP_CONNECT_MODE +}; + +/* tcp function init parameters */ +struct tcp_init_params { + __le32 two_msl_timer; + __le16 tx_sws_timer; + u8 max_fin_rt; + u8 reserved[9]; +}; + +/* tcp IPv4/IPv6 enum */ +enum tcp_ip_version { + TCP_IPV4, + TCP_IPV6, + MAX_TCP_IP_VERSION +}; + +/* tcp offload parameters */ +struct tcp_offload_params { + __le16 local_mac_addr_lo; + __le16 local_mac_addr_mid; + __le16 local_mac_addr_hi; + __le16 remote_mac_addr_lo; + __le16 remote_mac_addr_mid; + __le16 remote_mac_addr_hi; + __le16 vlan_id; + __le16 flags; +#define TCP_OFFLOAD_PARAMS_TS_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_TS_EN_SHIFT 0 +#define TCP_OFFLOAD_PARAMS_DA_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_DA_EN_SHIFT 1 +#define TCP_OFFLOAD_PARAMS_KA_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_KA_EN_SHIFT 2 +#define TCP_OFFLOAD_PARAMS_ECN_SENDER_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_ECN_SENDER_EN_SHIFT 3 +#define TCP_OFFLOAD_PARAMS_ECN_RECEIVER_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_ECN_RECEIVER_EN_SHIFT 4 +#define TCP_OFFLOAD_PARAMS_NAGLE_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_NAGLE_EN_SHIFT 5 +#define TCP_OFFLOAD_PARAMS_DA_CNT_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_DA_CNT_EN_SHIFT 6 +#define TCP_OFFLOAD_PARAMS_FIN_SENT_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_FIN_SENT_SHIFT 7 +#define TCP_OFFLOAD_PARAMS_FIN_RECEIVED_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_FIN_RECEIVED_SHIFT 8 +#define TCP_OFFLOAD_PARAMS_RESERVED_MASK 0x7F +#define TCP_OFFLOAD_PARAMS_RESERVED_SHIFT 9 + u8 ip_version; + u8 reserved0[3]; + __le32 remote_ip[4]; + __le32 local_ip[4]; + __le32 flow_label; + u8 ttl; + u8 tos_or_tc; + __le16 remote_port; + __le16 local_port; + __le16 mss; + u8 rcv_wnd_scale; + u8 connect_mode; + __le16 srtt; + __le32 ss_thresh; + __le32 rcv_wnd; + __le32 cwnd; + u8 ka_max_probe_cnt; + u8 dup_ack_theshold; + __le16 reserved1; + __le32 ka_timeout; + __le32 ka_interval; + __le32 max_rt_time; + __le32 initial_rcv_wnd; + __le32 rcv_next; + __le32 snd_una; + __le32 snd_next; + __le32 snd_max; + __le32 snd_wnd; + __le32 snd_wl1; + __le32 ts_recent; + __le32 ts_recent_age; + __le32 total_rt; + __le32 ka_timeout_delta; + __le32 rt_timeout_delta; + u8 dup_ack_cnt; + u8 snd_wnd_probe_cnt; + u8 ka_probe_cnt; + u8 rt_cnt; + __le16 rtt_var; + __le16 fw_internal; + u8 snd_wnd_scale; + u8 ack_frequency; + __le16 da_timeout_value; + __le32 reserved3; +}; + +/* tcp offload parameters */ +struct tcp_offload_params_opt2 { + __le16 local_mac_addr_lo; + __le16 local_mac_addr_mid; + __le16 local_mac_addr_hi; + __le16 remote_mac_addr_lo; + __le16 remote_mac_addr_mid; + __le16 remote_mac_addr_hi; + __le16 vlan_id; + __le16 flags; +#define TCP_OFFLOAD_PARAMS_OPT2_TS_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_OPT2_TS_EN_SHIFT 0 +#define TCP_OFFLOAD_PARAMS_OPT2_DA_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_OPT2_DA_EN_SHIFT 1 +#define TCP_OFFLOAD_PARAMS_OPT2_KA_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_OPT2_KA_EN_SHIFT 2 +#define TCP_OFFLOAD_PARAMS_OPT2_ECN_EN_MASK 0x1 +#define TCP_OFFLOAD_PARAMS_OPT2_ECN_EN_SHIFT 3 +#define TCP_OFFLOAD_PARAMS_OPT2_RESERVED0_MASK 0xFFF +#define TCP_OFFLOAD_PARAMS_OPT2_RESERVED0_SHIFT 4 + u8 ip_version; + u8 
reserved1[3]; + __le32 remote_ip[4]; + __le32 local_ip[4]; + __le32 flow_label; + u8 ttl; + u8 tos_or_tc; + __le16 remote_port; + __le16 local_port; + __le16 mss; + u8 rcv_wnd_scale; + u8 connect_mode; + __le16 syn_ip_payload_length; + __le32 syn_phy_addr_lo; + __le32 syn_phy_addr_hi; + __le32 cwnd; + u8 ka_max_probe_cnt; + u8 reserved2[3]; + __le32 ka_timeout; + __le32 ka_interval; + __le32 max_rt_time; + __le32 reserved3[16]; +}; + +/* tcp IPv4/IPv6 enum */ +enum tcp_seg_placement_event { + TCP_EVENT_ADD_PEN, + TCP_EVENT_ADD_NEW_ISLE, + TCP_EVENT_ADD_ISLE_RIGHT, + TCP_EVENT_ADD_ISLE_LEFT, + TCP_EVENT_JOIN, + TCP_EVENT_DELETE_ISLES, + TCP_EVENT_NOP, + MAX_TCP_SEG_PLACEMENT_EVENT +}; + +/* tcp init parameters */ +struct tcp_update_params { + __le16 flags; +#define TCP_UPDATE_PARAMS_REMOTE_MAC_ADDR_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_REMOTE_MAC_ADDR_CHANGED_SHIFT 0 +#define TCP_UPDATE_PARAMS_MSS_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_MSS_CHANGED_SHIFT 1 +#define TCP_UPDATE_PARAMS_TTL_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_TTL_CHANGED_SHIFT 2 +#define TCP_UPDATE_PARAMS_TOS_OR_TC_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_TOS_OR_TC_CHANGED_SHIFT 3 +#define TCP_UPDATE_PARAMS_KA_TIMEOUT_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_KA_TIMEOUT_CHANGED_SHIFT 4 +#define TCP_UPDATE_PARAMS_KA_INTERVAL_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_KA_INTERVAL_CHANGED_SHIFT 5 +#define TCP_UPDATE_PARAMS_MAX_RT_TIME_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_MAX_RT_TIME_CHANGED_SHIFT 6 +#define TCP_UPDATE_PARAMS_FLOW_LABEL_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_FLOW_LABEL_CHANGED_SHIFT 7 +#define TCP_UPDATE_PARAMS_INITIAL_RCV_WND_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_INITIAL_RCV_WND_CHANGED_SHIFT 8 +#define TCP_UPDATE_PARAMS_KA_MAX_PROBE_CNT_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_KA_MAX_PROBE_CNT_CHANGED_SHIFT 9 +#define TCP_UPDATE_PARAMS_KA_EN_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_KA_EN_CHANGED_SHIFT 10 +#define TCP_UPDATE_PARAMS_NAGLE_EN_CHANGED_MASK 0x1 +#define TCP_UPDATE_PARAMS_NAGLE_EN_CHANGED_SHIFT 11 +#define TCP_UPDATE_PARAMS_KA_EN_MASK 0x1 +#define TCP_UPDATE_PARAMS_KA_EN_SHIFT 12 +#define TCP_UPDATE_PARAMS_NAGLE_EN_MASK 0x1 +#define TCP_UPDATE_PARAMS_NAGLE_EN_SHIFT 13 +#define TCP_UPDATE_PARAMS_KA_RESTART_MASK 0x1 +#define TCP_UPDATE_PARAMS_KA_RESTART_SHIFT 14 +#define TCP_UPDATE_PARAMS_RETRANSMIT_RESTART_MASK 0x1 +#define TCP_UPDATE_PARAMS_RETRANSMIT_RESTART_SHIFT 15 + __le16 remote_mac_addr_lo; + __le16 remote_mac_addr_mid; + __le16 remote_mac_addr_hi; + __le16 mss; + u8 ttl; + u8 tos_or_tc; + __le32 ka_timeout; + __le32 ka_interval; + __le32 max_rt_time; + __le32 flow_label; + __le32 initial_rcv_wnd; + u8 ka_max_probe_cnt; + u8 reserved1[7]; +}; + +/* toe upload parameters */ +struct tcp_upload_params { + __le32 rcv_next; + __le32 snd_una; + __le32 snd_next; + __le32 snd_max; + __le32 snd_wnd; + __le32 rcv_wnd; + __le32 snd_wl1; + __le32 cwnd; + __le32 ss_thresh; + __le16 srtt; + __le16 rtt_var; + __le32 ts_time; + __le32 ts_recent; + __le32 ts_recent_age; + __le32 total_rt; + __le32 ka_timeout_delta; + __le32 rt_timeout_delta; + u8 dup_ack_cnt; + u8 snd_wnd_probe_cnt; + u8 ka_probe_cnt; + u8 rt_cnt; + __le32 reserved; +}; + +#endif /* __TCP_COMMON__ */ diff --git a/include/linux/sunrpc/addr.h b/include/linux/sunrpc/addr.h new file mode 100644 index 0000000..07d4548 --- /dev/null +++ b/include/linux/sunrpc/addr.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/addr.h + * + * Various routines for copying and 
comparing sockaddrs and for + * converting them to and from presentation format. + */ +#ifndef _LINUX_SUNRPC_ADDR_H +#define _LINUX_SUNRPC_ADDR_H + +#include +#include +#include +#include + +size_t rpc_ntop(const struct sockaddr *, char *, const size_t); +size_t rpc_pton(struct net *, const char *, const size_t, + struct sockaddr *, const size_t); +char * rpc_sockaddr2uaddr(const struct sockaddr *, gfp_t); +size_t rpc_uaddr2sockaddr(struct net *, const char *, const size_t, + struct sockaddr *, const size_t); + +static inline unsigned short rpc_get_port(const struct sockaddr *sap) +{ + switch (sap->sa_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)sap)->sin_port); + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)sap)->sin6_port); + } + return 0; +} + +static inline void rpc_set_port(struct sockaddr *sap, + const unsigned short port) +{ + switch (sap->sa_family) { + case AF_INET: + ((struct sockaddr_in *)sap)->sin_port = htons(port); + break; + case AF_INET6: + ((struct sockaddr_in6 *)sap)->sin6_port = htons(port); + break; + } +} + +#define IPV6_SCOPE_DELIMITER '%' +#define IPV6_SCOPE_ID_LEN sizeof("%nnnnnnnnnn") + +static inline bool rpc_cmp_addr4(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; + + return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; +} + +static inline bool __rpc_copy_addr4(struct sockaddr *dst, + const struct sockaddr *src) +{ + const struct sockaddr_in *ssin = (struct sockaddr_in *) src; + struct sockaddr_in *dsin = (struct sockaddr_in *) dst; + + dsin->sin_family = ssin->sin_family; + dsin->sin_addr.s_addr = ssin->sin_addr.s_addr; + return true; +} + +#if IS_ENABLED(CONFIG_IPV6) +static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; + + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) + return false; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; + + return true; +} + +static inline bool __rpc_copy_addr6(struct sockaddr *dst, + const struct sockaddr *src) +{ + const struct sockaddr_in6 *ssin6 = (const struct sockaddr_in6 *) src; + struct sockaddr_in6 *dsin6 = (struct sockaddr_in6 *) dst; + + dsin6->sin6_family = ssin6->sin6_family; + dsin6->sin6_addr = ssin6->sin6_addr; + dsin6->sin6_scope_id = ssin6->sin6_scope_id; + return true; +} +#else /* !(IS_ENABLED(CONFIG_IPV6) */ +static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + return false; +} + +static inline bool __rpc_copy_addr6(struct sockaddr *dst, + const struct sockaddr *src) +{ + return false; +} +#endif /* !(IS_ENABLED(CONFIG_IPV6) */ + +/** + * rpc_cmp_addr - compare the address portion of two sockaddrs. + * @sap1: first sockaddr + * @sap2: second sockaddr + * + * Just compares the family and address portion. Ignores port, but + * compares the scope if it's a link-local address. + * + * Returns true if the addrs are equal, false if they aren't. 
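+ *
+ * Purely illustrative sketch (the caller and the "cached_sap"/"new_sap"
+ * variables are hypothetical): a user checking whether a new transport
+ * address duplicates a cached one might combine this with rpc_get_port():
+ *
+ *	if (rpc_cmp_addr(cached_sap, new_sap) &&
+ *	    rpc_get_port(cached_sap) == rpc_get_port(new_sap))
+ *		return -EEXIST;
+ *
+ * rpc_cmp_addr_port() below performs exactly this combined check.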
+ */ +static inline bool rpc_cmp_addr(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + if (sap1->sa_family == sap2->sa_family) { + switch (sap1->sa_family) { + case AF_INET: + return rpc_cmp_addr4(sap1, sap2); + case AF_INET6: + return rpc_cmp_addr6(sap1, sap2); + } + } + return false; +} + +/** + * rpc_cmp_addr_port - compare the address and port number of two sockaddrs. + * @sap1: first sockaddr + * @sap2: second sockaddr + */ +static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + if (!rpc_cmp_addr(sap1, sap2)) + return false; + return rpc_get_port(sap1) == rpc_get_port(sap2); +} + +/** + * rpc_copy_addr - copy the address portion of one sockaddr to another + * @dst: destination sockaddr + * @src: source sockaddr + * + * Just copies the address portion and family. Ignores port, scope, etc. + * Caller is responsible for making certain that dst is large enough to hold + * the address in src. Returns true if address family is supported. Returns + * false otherwise. + */ +static inline bool rpc_copy_addr(struct sockaddr *dst, + const struct sockaddr *src) +{ + switch (src->sa_family) { + case AF_INET: + return __rpc_copy_addr4(dst, src); + case AF_INET6: + return __rpc_copy_addr6(dst, src); + } + return false; +} + +/** + * rpc_get_scope_id - return scopeid for a given sockaddr + * @sa: sockaddr to get scopeid from + * + * Returns the value of the sin6_scope_id for AF_INET6 addrs, or 0 if + * not an AF_INET6 address. + */ +static inline u32 rpc_get_scope_id(const struct sockaddr *sa) +{ + if (sa->sa_family != AF_INET6) + return 0; + + return ((struct sockaddr_in6 *) sa)->sin6_scope_id; +} + +#endif /* _LINUX_SUNRPC_ADDR_H */ diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h new file mode 100644 index 0000000..d9af474 --- /dev/null +++ b/include/linux/sunrpc/auth.h @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/auth.h + * + * Declarations for the RPC client authentication machinery. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#ifndef _LINUX_SUNRPC_AUTH_H +#define _LINUX_SUNRPC_AUTH_H + +#ifdef __KERNEL__ + +#include +#include +#include + +#include +#include +#include +#include + +/* + * Maximum size of AUTH_NONE authentication information, in XDR words. + */ +#define NUL_CALLSLACK (4) +#define NUL_REPLYSLACK (2) + +/* + * Size of the nodename buffer. RFC1831 specifies a hard limit of 255 bytes, + * but Linux hostnames are actually limited to __NEW_UTS_LEN bytes. 
+ */ +#define UNX_MAXNODENAME __NEW_UTS_LEN +#define UNX_CALLSLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME)) +#define UNX_NGROUPS 16 + +struct rpcsec_gss_info; + +/* auth_cred ac_flags bits */ +enum { + RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */ + RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying + key will expire soon */ +}; + +/* Work around the lack of a VFS credential */ +struct auth_cred { + kuid_t uid; + kgid_t gid; + struct group_info *group_info; + const char *principal; + unsigned long ac_flags; + unsigned char machine_cred : 1; +}; + +/* + * Client user credentials + */ +struct rpc_auth; +struct rpc_credops; +struct rpc_cred { + struct hlist_node cr_hash; /* hash chain */ + struct list_head cr_lru; /* lru garbage collection */ + struct rcu_head cr_rcu; + struct rpc_auth * cr_auth; + const struct rpc_credops *cr_ops; + unsigned long cr_expire; /* when to gc */ + unsigned long cr_flags; /* various flags */ + atomic_t cr_count; /* ref count */ + + kuid_t cr_uid; + + /* per-flavor data */ +}; +#define RPCAUTH_CRED_NEW 0 +#define RPCAUTH_CRED_UPTODATE 1 +#define RPCAUTH_CRED_HASHED 2 +#define RPCAUTH_CRED_NEGATIVE 3 + +/* rpc_auth au_flags */ +#define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT 0x0001 /* underlying cred has no key timeout */ + +/* + * Client authentication handle + */ +struct rpc_cred_cache; +struct rpc_authops; +struct rpc_auth { + unsigned int au_cslack; /* call cred size estimate */ + /* guess at number of u32's auth adds before + * reply data; normally the verifier size: */ + unsigned int au_rslack; + /* for gss, used to calculate au_rslack: */ + unsigned int au_verfsize; + + unsigned int au_flags; /* various flags */ + const struct rpc_authops *au_ops; /* operations */ + rpc_authflavor_t au_flavor; /* pseudoflavor (note may + * differ from the flavor in + * au_ops->au_flavor in gss + * case) */ + atomic_t au_count; /* Reference counter */ + + struct rpc_cred_cache * au_credcache; + /* per-flavor data */ +}; + +/* rpc_auth au_flags */ +#define RPCAUTH_AUTH_DATATOUCH 0x00000002 + +struct rpc_auth_create_args { + rpc_authflavor_t pseudoflavor; + const char *target_name; +}; + +/* Flags for rpcauth_lookupcred() */ +#define RPCAUTH_LOOKUP_NEW 0x01 /* Accept an uninitialised cred */ +#define RPCAUTH_LOOKUP_RCU 0x02 /* lock-less lookup */ + +/* + * Client authentication ops + */ +struct rpc_authops { + struct module *owner; + rpc_authflavor_t au_flavor; /* flavor (RPC_AUTH_*) */ + char * au_name; + struct rpc_auth * (*create)(struct rpc_auth_create_args *, struct rpc_clnt *); + void (*destroy)(struct rpc_auth *); + + int (*hash_cred)(struct auth_cred *, unsigned int); + struct rpc_cred * (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int); + struct rpc_cred * (*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t); + int (*list_pseudoflavors)(rpc_authflavor_t *, int); + rpc_authflavor_t (*info2flavor)(struct rpcsec_gss_info *); + int (*flavor2info)(rpc_authflavor_t, + struct rpcsec_gss_info *); + int (*key_timeout)(struct rpc_auth *, + struct rpc_cred *); +}; + +struct rpc_credops { + const char * cr_name; /* Name of the auth flavour */ + int (*cr_init)(struct rpc_auth *, struct rpc_cred *); + void (*crdestroy)(struct rpc_cred *); + + int (*crmatch)(struct auth_cred *, struct rpc_cred *, int); + struct rpc_cred * (*crbind)(struct rpc_task *, struct rpc_cred *, int); + __be32 * (*crmarshal)(struct rpc_task *, __be32 *); + int (*crrefresh)(struct rpc_task *); + __be32 * (*crvalidate)(struct rpc_task *, __be32 *); + int 
(*crwrap_req)(struct rpc_task *, kxdreproc_t, + void *, __be32 *, void *); + int (*crunwrap_resp)(struct rpc_task *, kxdrdproc_t, + void *, __be32 *, void *); + int (*crkey_timeout)(struct rpc_cred *); + bool (*crkey_to_expire)(struct rpc_cred *); + char * (*crstringify_acceptor)(struct rpc_cred *); +}; + +extern const struct rpc_authops authunix_ops; +extern const struct rpc_authops authnull_ops; + +int __init rpc_init_authunix(void); +int __init rpc_init_generic_auth(void); +int __init rpcauth_init_module(void); +void rpcauth_remove_module(void); +void rpc_destroy_generic_auth(void); +void rpc_destroy_authunix(void); + +struct rpc_cred * rpc_lookup_cred(void); +struct rpc_cred * rpc_lookup_cred_nonblock(void); +struct rpc_cred * rpc_lookup_generic_cred(struct auth_cred *, int, gfp_t); +struct rpc_cred * rpc_lookup_machine_cred(const char *service_name); +int rpcauth_register(const struct rpc_authops *); +int rpcauth_unregister(const struct rpc_authops *); +struct rpc_auth * rpcauth_create(struct rpc_auth_create_args *, + struct rpc_clnt *); +void rpcauth_release(struct rpc_auth *); +rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t, + struct rpcsec_gss_info *); +int rpcauth_get_gssinfo(rpc_authflavor_t, + struct rpcsec_gss_info *); +int rpcauth_list_flavors(rpc_authflavor_t *, int); +struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int, gfp_t); +void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); +struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); +struct rpc_cred * rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int); +void put_rpccred(struct rpc_cred *); +__be32 * rpcauth_marshcred(struct rpc_task *, __be32 *); +__be32 * rpcauth_checkverf(struct rpc_task *, __be32 *); +int rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp, __be32 *data, void *obj); +int rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *data, void *obj); +int rpcauth_refreshcred(struct rpc_task *); +void rpcauth_invalcred(struct rpc_task *); +int rpcauth_uptodatecred(struct rpc_task *); +int rpcauth_init_credcache(struct rpc_auth *); +void rpcauth_destroy_credcache(struct rpc_auth *); +void rpcauth_clear_credcache(struct rpc_cred_cache *); +int rpcauth_key_timeout_notify(struct rpc_auth *, + struct rpc_cred *); +bool rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *); +char * rpcauth_stringify_acceptor(struct rpc_cred *); + +static inline +struct rpc_cred * get_rpccred(struct rpc_cred *cred) +{ + if (cred != NULL) + atomic_inc(&cred->cr_count); + return cred; +} + +/** + * get_rpccred_rcu - get a reference to a cred using rcu-protected pointer + * @cred: cred of which to take a reference + * + * In some cases, we may have a pointer to a credential to which we + * want to take a reference, but don't already have one. Because these + * objects are freed using RCU, we can access the cr_count while its + * on its way to destruction and only take a reference if it's not already + * zero. 
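+ *
+ * A minimal usage sketch (all names hypothetical); the reference must be
+ * taken inside an RCU read-side critical section and dropped with
+ * put_rpccred() when the caller is done:
+ *
+ *	rcu_read_lock();
+ *	cred = get_rpccred_rcu(rcu_dereference(obj->cred));
+ *	rcu_read_unlock();
+ *	if (cred) {
+ *		use(cred);
+ *		put_rpccred(cred);
+ *	}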
+ */ +static inline struct rpc_cred * +get_rpccred_rcu(struct rpc_cred *cred) +{ + if (atomic_inc_not_zero(&cred->cr_count)) + return cred; + return NULL; +} + +#endif /* __KERNEL__ */ +#endif /* _LINUX_SUNRPC_AUTH_H */ diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h new file mode 100644 index 0000000..0c9eac3 --- /dev/null +++ b/include/linux/sunrpc/auth_gss.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/auth_gss.h + * + * Declarations for RPCSEC_GSS + * + * Dug Song + * Andy Adamson + * Bruce Fields + * Copyright (c) 2000 The Regents of the University of Michigan + */ + +#ifndef _LINUX_SUNRPC_AUTH_GSS_H +#define _LINUX_SUNRPC_AUTH_GSS_H + +#ifdef __KERNEL__ +#include +#include +#include +#include + +#define RPC_GSS_VERSION 1 + +#define MAXSEQ 0x80000000 /* maximum legal sequence number, from rfc 2203 */ + +enum rpc_gss_proc { + RPC_GSS_PROC_DATA = 0, + RPC_GSS_PROC_INIT = 1, + RPC_GSS_PROC_CONTINUE_INIT = 2, + RPC_GSS_PROC_DESTROY = 3 +}; + +enum rpc_gss_svc { + RPC_GSS_SVC_NONE = 1, + RPC_GSS_SVC_INTEGRITY = 2, + RPC_GSS_SVC_PRIVACY = 3 +}; + +/* on-the-wire gss cred: */ +struct rpc_gss_wire_cred { + u32 gc_v; /* version */ + u32 gc_proc; /* control procedure */ + u32 gc_seq; /* sequence number */ + u32 gc_svc; /* service */ + struct xdr_netobj gc_ctx; /* context handle */ +}; + +/* on-the-wire gss verifier: */ +struct rpc_gss_wire_verf { + u32 gv_flavor; + struct xdr_netobj gv_verf; +}; + +/* return from gss NULL PROC init sec context */ +struct rpc_gss_init_res { + struct xdr_netobj gr_ctx; /* context handle */ + u32 gr_major; /* major status */ + u32 gr_minor; /* minor status */ + u32 gr_win; /* sequence window */ + struct xdr_netobj gr_token; /* token */ +}; + +/* The gss_cl_ctx struct holds all the information the rpcsec_gss client + * code needs to know about a single security context. In particular, + * gc_gss_ctx is the context handle that is used to do gss-api calls, while + * gc_wire_ctx is the context handle that is used to identify the context on + * the wire when communicating with a server. */ + +struct gss_cl_ctx { + refcount_t count; + enum rpc_gss_proc gc_proc; + u32 gc_seq; + spinlock_t gc_seq_lock; + struct gss_ctx *gc_gss_ctx; + struct xdr_netobj gc_wire_ctx; + struct xdr_netobj gc_acceptor; + u32 gc_win; + unsigned long gc_expiry; + struct rcu_head gc_rcu; +}; + +struct gss_upcall_msg; +struct gss_cred { + struct rpc_cred gc_base; + enum rpc_gss_svc gc_service; + struct gss_cl_ctx __rcu *gc_ctx; + struct gss_upcall_msg *gc_upcall; + const char *gc_principal; + unsigned long gc_upcall_timestamp; +}; + +#endif /* __KERNEL__ */ +#endif /* _LINUX_SUNRPC_AUTH_GSS_H */ + diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h new file mode 100644 index 0000000..4397a48 --- /dev/null +++ b/include/linux/sunrpc/bc_xprt.h @@ -0,0 +1,72 @@ +/****************************************************************************** + +(c) 2008 NetApp. All Rights Reserved. + +NetApp provides this source code under the GPL v2 License. +The GPL v2 license is available at +http://opensource.org/licenses/gpl-license.php. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ + +/* + * Functions to create and manage the backchannel + */ + +#ifndef _LINUX_SUNRPC_BC_XPRT_H +#define _LINUX_SUNRPC_BC_XPRT_H + +#include +#include +#include + +#ifdef CONFIG_SUNRPC_BACKCHANNEL +struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid); +void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied); +void xprt_free_bc_request(struct rpc_rqst *req); +int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs); +void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs); + +/* Socket backchannel transport methods */ +int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs); +void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs); +void xprt_free_bc_rqst(struct rpc_rqst *req); + +/* + * Determine if a shared backchannel is in use + */ +static inline int svc_is_backchannel(const struct svc_rqst *rqstp) +{ + if (rqstp->rq_server->sv_bc_xprt) + return 1; + return 0; +} +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +static inline int xprt_setup_backchannel(struct rpc_xprt *xprt, + unsigned int min_reqs) +{ + return 0; +} + +static inline int svc_is_backchannel(const struct svc_rqst *rqstp) +{ + return 0; +} + +static inline void xprt_free_bc_request(struct rpc_rqst *req) +{ +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ +#endif /* _LINUX_SUNRPC_BC_XPRT_H */ + diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h new file mode 100644 index 0000000..40d2822 --- /dev/null +++ b/include/linux/sunrpc/cache.h @@ -0,0 +1,302 @@ +/* + * include/linux/sunrpc/cache.h + * + * Generic code for various authentication-related caches + * used by sunrpc clients and servers. + * + * Copyright (C) 2002 Neil Brown + * + * Released under terms in GPL version 2. See COPYING. + * + */ + +#ifndef _LINUX_SUNRPC_CACHE_H_ +#define _LINUX_SUNRPC_CACHE_H_ + +#include +#include +#include +#include + +/* + * Each cache requires: + * - A 'struct cache_detail' which contains information specific to the cache + * for common code to use. + * - An item structure that must contain a "struct cache_head" + * - A lookup function defined using DefineCacheLookup + * - A 'put' function that can release a cache item. It will only + * be called after cache_put has succeed, so there are guarantee + * to be no references. + * - A function to calculate a hash of an item's key. + * + * as well as assorted code fragments (e.g. compare keys) and numbers + * (e.g. hash size, goal_age, etc). + * + * Each cache must be registered so that it can be cleaned regularly. + * When the cache is unregistered, it is flushed completely. + * + * Entries have a ref count and a 'hashed' flag which counts the existence + * in the hash table. + * We only expire entries when refcount is zero. + * Existence in the cache is counted the refcount. + */ + +/* Every cache item has a common header that is used + * for expiring and refreshing entries. 
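+ * A concrete item embeds this header and is recovered from it with
+ * container_of(); a minimal, purely illustrative item might look like:
+ *
+ *	struct demo_item {
+ *		struct cache_head h;
+ *		int key;
+ *		char *content;
+ *	};
+ *	#define to_demo(ch) container_of(ch, struct demo_item, h)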
+ * + */ +struct cache_head { + struct hlist_node cache_list; + time_t expiry_time; /* After time time, don't use the data */ + time_t last_refresh; /* If CACHE_PENDING, this is when upcall was + * sent, else this is when update was + * received, though it is alway set to + * be *after* ->flush_time. + */ + struct kref ref; + unsigned long flags; +}; +#define CACHE_VALID 0 /* Entry contains valid data */ +#define CACHE_NEGATIVE 1 /* Negative entry - there is no match for the key */ +#define CACHE_PENDING 2 /* An upcall has been sent but no reply received yet*/ +#define CACHE_CLEANED 3 /* Entry has been cleaned from cache */ + +#define CACHE_NEW_EXPIRY 120 /* keep new things pending confirmation for 120 seconds */ + +struct cache_detail { + struct module * owner; + int hash_size; + struct hlist_head * hash_table; + rwlock_t hash_lock; + + char *name; + void (*cache_put)(struct kref *); + + int (*cache_upcall)(struct cache_detail *, + struct cache_head *); + + void (*cache_request)(struct cache_detail *cd, + struct cache_head *ch, + char **bpp, int *blen); + + int (*cache_parse)(struct cache_detail *, + char *buf, int len); + + int (*cache_show)(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h); + void (*warn_no_listener)(struct cache_detail *cd, + int has_died); + + struct cache_head * (*alloc)(void); + int (*match)(struct cache_head *orig, struct cache_head *new); + void (*init)(struct cache_head *orig, struct cache_head *new); + void (*update)(struct cache_head *orig, struct cache_head *new); + + /* fields below this comment are for internal use + * and should not be touched by cache owners + */ + time_t flush_time; /* flush all cache items with + * last_refresh at or earlier + * than this. last_refresh + * is never set at or earlier + * than this. + */ + struct list_head others; + time_t nextcheck; + int entries; + + /* fields for communication over channel */ + struct list_head queue; + + atomic_t readers; /* how many time is /chennel open */ + time_t last_close; /* if no readers, when did last close */ + time_t last_warn; /* when we last warned about no readers */ + + union { + struct proc_dir_entry *procfs; + struct dentry *pipefs; + }; + struct net *net; +}; + + +/* this must be embedded in any request structure that + * identifies an object that will want a callback on + * a cache fill + */ +struct cache_req { + struct cache_deferred_req *(*defer)(struct cache_req *req); + int thread_wait; /* How long (jiffies) we can block the + * current thread to wait for updates. + */ +}; +/* this must be embedded in a deferred_request that is being + * delayed awaiting cache-fill + */ +struct cache_deferred_req { + struct hlist_node hash; /* on hash chain */ + struct list_head recent; /* on fifo */ + struct cache_head *item; /* cache item we wait on */ + void *owner; /* we might need to discard all defered requests + * owned by someone */ + void (*revisit)(struct cache_deferred_req *req, + int too_many); +}; + +/* + * timestamps kept in the cache are expressed in seconds + * since boot. This is the best for measuring differences in + * real time. 
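+ *
+ * For example (illustrative only), an entry that should stay pending for
+ * another CACHE_NEW_EXPIRY seconds stores
+ *
+ *	h->expiry_time = seconds_since_boot() + CACHE_NEW_EXPIRY;
+ *
+ * and convert_to_wallclock() below maps such a value back to wallclock
+ * time when it has to be reported to user space.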
+ */ +static inline time_t seconds_since_boot(void) +{ + struct timespec boot; + getboottime(&boot); + return get_seconds() - boot.tv_sec; +} + +static inline time_t convert_to_wallclock(time_t sinceboot) +{ + struct timespec boot; + getboottime(&boot); + return boot.tv_sec + sinceboot; +} + +extern const struct file_operations cache_file_operations_pipefs; +extern const struct file_operations content_file_operations_pipefs; +extern const struct file_operations cache_flush_operations_pipefs; + +extern struct cache_head * +sunrpc_cache_lookup(struct cache_detail *detail, + struct cache_head *key, int hash); +extern struct cache_head * +sunrpc_cache_update(struct cache_detail *detail, + struct cache_head *new, struct cache_head *old, int hash); + +extern int +sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h); + + +extern void cache_clean_deferred(void *owner); + +static inline struct cache_head *cache_get(struct cache_head *h) +{ + kref_get(&h->ref); + return h; +} + + +static inline void cache_put(struct cache_head *h, struct cache_detail *cd) +{ + if (kref_read(&h->ref) <= 2 && + h->expiry_time < cd->nextcheck) + cd->nextcheck = h->expiry_time; + kref_put(&h->ref, cd->cache_put); +} + +static inline bool cache_is_expired(struct cache_detail *detail, struct cache_head *h) +{ + if (!test_bit(CACHE_VALID, &h->flags)) + return false; + + return (h->expiry_time < seconds_since_boot()) || + (detail->flush_time >= h->last_refresh); +} + +extern int cache_check(struct cache_detail *detail, + struct cache_head *h, struct cache_req *rqstp); +extern void cache_flush(void); +extern void cache_purge(struct cache_detail *detail); +#define NEVER (0x7FFFFFFF) +extern void __init cache_initialize(void); +extern int cache_register_net(struct cache_detail *cd, struct net *net); +extern void cache_unregister_net(struct cache_detail *cd, struct net *net); + +extern struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct net *net); +extern void cache_destroy_net(struct cache_detail *cd, struct net *net); + +extern void sunrpc_init_cache_detail(struct cache_detail *cd); +extern void sunrpc_destroy_cache_detail(struct cache_detail *cd); +extern int sunrpc_cache_register_pipefs(struct dentry *parent, const char *, + umode_t, struct cache_detail *); +extern void sunrpc_cache_unregister_pipefs(struct cache_detail *); +extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *); + +/* Must store cache_detail in seq_file->private if using next three functions */ +extern void *cache_seq_start(struct seq_file *file, loff_t *pos); +extern void *cache_seq_next(struct seq_file *file, void *p, loff_t *pos); +extern void cache_seq_stop(struct seq_file *file, void *p); + +extern void qword_add(char **bpp, int *lp, char *str); +extern void qword_addhex(char **bpp, int *lp, char *buf, int blen); +extern int qword_get(char **bpp, char *dest, int bufsize); + +static inline int get_int(char **bpp, int *anint) +{ + char buf[50]; + char *ep; + int rv; + int len = qword_get(bpp, buf, sizeof(buf)); + + if (len < 0) + return -EINVAL; + if (len == 0) + return -ENOENT; + + rv = simple_strtol(buf, &ep, 0); + if (*ep) + return -EINVAL; + + *anint = rv; + return 0; +} + +static inline int get_uint(char **bpp, unsigned int *anint) +{ + char buf[50]; + int len = qword_get(bpp, buf, sizeof(buf)); + + if (len < 0) + return -EINVAL; + if (len == 0) + return -ENOENT; + + if (kstrtouint(buf, 0, anint)) + return -EINVAL; + + return 0; +} + +static inline int get_time(char **bpp, time_t 
*time) +{ + char buf[50]; + long long ll; + int len = qword_get(bpp, buf, sizeof(buf)); + + if (len < 0) + return -EINVAL; + if (len == 0) + return -ENOENT; + + if (kstrtoll(buf, 0, &ll)) + return -EINVAL; + + *time = (time_t)ll; + return 0; +} + +static inline time_t get_expiry(char **bpp) +{ + time_t rv; + struct timespec boot; + + if (get_time(bpp, &rv)) + return 0; + if (rv < 0) + return 0; + getboottime(&boot); + return rv - boot.tv_sec; +} + +#endif /* _LINUX_SUNRPC_CACHE_H_ */ diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h new file mode 100644 index 0000000..9b11b6a --- /dev/null +++ b/include/linux/sunrpc/clnt.h @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/clnt.h + * + * Declarations for the high-level RPC client interface + * + * Copyright (C) 1995, 1996, Olaf Kirch + */ + +#ifndef _LINUX_SUNRPC_CLNT_H +#define _LINUX_SUNRPC_CLNT_H + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct rpc_inode; + +/* + * The high-level client handle + */ +struct rpc_clnt { + atomic_t cl_count; /* Number of references */ + unsigned int cl_clid; /* client id */ + struct list_head cl_clients; /* Global list of clients */ + struct list_head cl_tasks; /* List of tasks */ + spinlock_t cl_lock; /* spinlock */ + struct rpc_xprt __rcu * cl_xprt; /* transport */ + const struct rpc_procinfo *cl_procinfo; /* procedure info */ + u32 cl_prog, /* RPC program number */ + cl_vers, /* RPC version number */ + cl_maxproc; /* max procedure number */ + + struct rpc_auth * cl_auth; /* authenticator */ + struct rpc_stat * cl_stats; /* per-program statistics */ + struct rpc_iostats * cl_metrics; /* per-client statistics */ + + unsigned int cl_softrtry : 1,/* soft timeouts */ + cl_discrtry : 1,/* disconnect before retry */ + cl_noretranstimeo: 1,/* No retransmit timeouts */ + cl_autobind : 1,/* use getport() */ + cl_chatty : 1;/* be verbose */ + + struct rpc_rtt * cl_rtt; /* RTO estimator data */ + const struct rpc_timeout *cl_timeout; /* Timeout strategy */ + + atomic_t cl_swapper; /* swapfile count */ + int cl_nodelen; /* nodename length */ + char cl_nodename[UNX_MAXNODENAME+1]; + struct rpc_pipe_dir_head cl_pipedir_objects; + struct rpc_clnt * cl_parent; /* Points to parent of clones */ + struct rpc_rtt cl_rtt_default; + struct rpc_timeout cl_timeout_default; + const struct rpc_program *cl_program; +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + struct dentry *cl_debugfs; /* debugfs directory */ +#endif + struct rpc_xprt_iter cl_xpi; +}; + +/* + * General RPC program info + */ +#define RPC_MAXVERSION 4 +struct rpc_program { + const char * name; /* protocol name */ + u32 number; /* program number */ + unsigned int nrvers; /* number of versions */ + const struct rpc_version ** version; /* version array */ + struct rpc_stat * stats; /* statistics */ + const char * pipe_dir_name; /* path to rpc_pipefs dir */ +}; + +struct rpc_version { + u32 number; /* version number */ + unsigned int nrprocs; /* number of procs */ + const struct rpc_procinfo *procs; /* procedure array */ + unsigned int *counts; /* call counts */ +}; + +/* + * Procedure information + */ +struct rpc_procinfo { + u32 p_proc; /* RPC procedure number */ + kxdreproc_t p_encode; /* XDR encode function */ + kxdrdproc_t p_decode; /* XDR decode function */ + unsigned int p_arglen; /* argument hdr length (u32) */ + unsigned int p_replen; /* reply hdr length (u32) */ + unsigned int 
p_timer; /* Which RTT timer to use */ + u32 p_statidx; /* Which procedure to account */ + const char * p_name; /* name of procedure */ +}; + +#ifdef __KERNEL__ + +struct rpc_create_args { + struct net *net; + int protocol; + struct sockaddr *address; + size_t addrsize; + struct sockaddr *saddress; + const struct rpc_timeout *timeout; + const char *servername; + const char *nodename; + const struct rpc_program *program; + u32 prognumber; /* overrides program->number */ + u32 version; + rpc_authflavor_t authflavor; + unsigned long flags; + char *client_name; + struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ +}; + +struct rpc_add_xprt_test { + int (*add_xprt_test)(struct rpc_clnt *, + struct rpc_xprt *, + void *calldata); + void *data; +}; + +/* Values for "flags" field */ +#define RPC_CLNT_CREATE_HARDRTRY (1UL << 0) +#define RPC_CLNT_CREATE_AUTOBIND (1UL << 2) +#define RPC_CLNT_CREATE_NONPRIVPORT (1UL << 3) +#define RPC_CLNT_CREATE_NOPING (1UL << 4) +#define RPC_CLNT_CREATE_DISCRTRY (1UL << 5) +#define RPC_CLNT_CREATE_QUIET (1UL << 6) +#define RPC_CLNT_CREATE_INFINITE_SLOTS (1UL << 7) +#define RPC_CLNT_CREATE_NO_IDLE_TIMEOUT (1UL << 8) +#define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) + +struct rpc_clnt *rpc_create(struct rpc_create_args *args); +struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, + const struct rpc_program *, u32); +struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); +struct rpc_clnt *rpc_clone_client_set_auth(struct rpc_clnt *, + rpc_authflavor_t); +int rpc_switch_client_transport(struct rpc_clnt *, + struct xprt_create *, + const struct rpc_timeout *); + +void rpc_shutdown_client(struct rpc_clnt *); +void rpc_release_client(struct rpc_clnt *); +void rpc_task_release_client(struct rpc_task *); + +int rpcb_create_local(struct net *); +void rpcb_put_local(struct net *); +int rpcb_register(struct net *, u32, u32, int, unsigned short); +int rpcb_v4_register(struct net *net, const u32 program, + const u32 version, + const struct sockaddr *address, + const char *netid); +void rpcb_getport_async(struct rpc_task *); + +void rpc_call_start(struct rpc_task *); +int rpc_call_async(struct rpc_clnt *clnt, + const struct rpc_message *msg, int flags, + const struct rpc_call_ops *tk_ops, + void *calldata); +int rpc_call_sync(struct rpc_clnt *clnt, + const struct rpc_message *msg, int flags); +struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, + int flags); +int rpc_restart_call_prepare(struct rpc_task *); +int rpc_restart_call(struct rpc_task *); +void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); +struct net * rpc_net_ns(struct rpc_clnt *); +size_t rpc_max_payload(struct rpc_clnt *); +size_t rpc_max_bc_payload(struct rpc_clnt *); +void rpc_force_rebind(struct rpc_clnt *); +size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t); +const char *rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t); +int rpc_localaddr(struct rpc_clnt *, struct sockaddr *, size_t); + +int rpc_clnt_iterate_for_each_xprt(struct rpc_clnt *clnt, + int (*fn)(struct rpc_clnt *, struct rpc_xprt *, void *), + void *data); + +int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt, + void *dummy); +int rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *, + int (*setup)(struct rpc_clnt *, + struct rpc_xprt_switch *, + struct rpc_xprt *, + void *), + void *data); +void rpc_set_connect_timeout(struct rpc_clnt *clnt, + unsigned long connect_timeout, + unsigned long reconnect_timeout); + 
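+
+/*
+ * Purely illustrative sketch of the usual client life cycle built from the
+ * declarations above; "my_program", "my_program_procs", "MY_PROC", "req",
+ * "res" and "server_addr" are hypothetical names, not part of this header:
+ *
+ *	struct rpc_create_args args = {
+ *		.net		= &init_net,
+ *		.protocol	= XPRT_TRANSPORT_TCP,
+ *		.address	= (struct sockaddr *)&server_addr,
+ *		.addrsize	= sizeof(server_addr),
+ *		.servername	= "server",
+ *		.program	= &my_program,
+ *		.version	= 1,
+ *		.authflavor	= RPC_AUTH_UNIX,
+ *	};
+ *	struct rpc_clnt *clnt = rpc_create(&args);
+ *
+ *	if (!IS_ERR(clnt)) {
+ *		struct rpc_message msg = {
+ *			.rpc_proc = &my_program_procs[MY_PROC],
+ *			.rpc_argp = &req,
+ *			.rpc_resp = &res,
+ *		};
+ *		int err = rpc_call_sync(clnt, &msg, 0);
+ *
+ *		rpc_shutdown_client(clnt);
+ *	}
+ */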
+int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *, + struct rpc_xprt_switch *, + struct rpc_xprt *, + void *); + +const char *rpc_proc_name(const struct rpc_task *task); + +void rpc_clnt_xprt_switch_put(struct rpc_clnt *); +void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); +bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, + const struct sockaddr *sap); +void rpc_cleanup_clids(void); + +static inline int rpc_reply_expected(struct rpc_task *task) +{ + return (task->tk_msg.rpc_proc != NULL) && + (task->tk_msg.rpc_proc->p_decode != NULL); +} + +#endif /* __KERNEL__ */ +#endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h new file mode 100644 index 0000000..f6aeed0 --- /dev/null +++ b/include/linux/sunrpc/debug.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/debug.h + * + * Debugging support for sunrpc module + * + * Copyright (C) 1996, Olaf Kirch + */ +#ifndef _LINUX_SUNRPC_DEBUG_H_ +#define _LINUX_SUNRPC_DEBUG_H_ + +#include + +/* + * Debugging macros etc + */ +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +extern unsigned int rpc_debug; +extern unsigned int nfs_debug; +extern unsigned int nfsd_debug; +extern unsigned int nlm_debug; +#endif + +#define dprintk(fmt, ...) \ + dfprintk(FACILITY, fmt, ##__VA_ARGS__) +#define dprintk_cont(fmt, ...) \ + dfprintk_cont(FACILITY, fmt, ##__VA_ARGS__) +#define dprintk_rcu(fmt, ...) \ + dfprintk_rcu(FACILITY, fmt, ##__VA_ARGS__) +#define dprintk_rcu_cont(fmt, ...) \ + dfprintk_rcu_cont(FACILITY, fmt, ##__VA_ARGS__) + +#undef ifdebug +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define ifdebug(fac) if (unlikely(rpc_debug & RPCDBG_##fac)) + +# define dfprintk(fac, fmt, ...) \ +do { \ + ifdebug(fac) \ + printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ +} while (0) + +# define dfprintk_cont(fac, fmt, ...) \ +do { \ + ifdebug(fac) \ + printk(KERN_CONT fmt, ##__VA_ARGS__); \ +} while (0) + +# define dfprintk_rcu(fac, fmt, ...) \ +do { \ + ifdebug(fac) { \ + rcu_read_lock(); \ + printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ + rcu_read_unlock(); \ + } \ +} while (0) + +# define dfprintk_rcu_cont(fac, fmt, ...) \ +do { \ + ifdebug(fac) { \ + rcu_read_lock(); \ + printk(KERN_CONT fmt, ##__VA_ARGS__); \ + rcu_read_unlock(); \ + } \ +} while (0) + +# define RPC_IFDEBUG(x) x +#else +# define ifdebug(fac) if (0) +# define dfprintk(fac, fmt, ...) do {} while (0) +# define dfprintk_cont(fac, fmt, ...) do {} while (0) +# define dfprintk_rcu(fac, fmt, ...) 
do {} while (0) +# define RPC_IFDEBUG(x) +#endif + +/* + * Sysctl interface for RPC debugging + */ + +struct rpc_clnt; +struct rpc_xprt; + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +void rpc_register_sysctl(void); +void rpc_unregister_sysctl(void); +void sunrpc_debugfs_init(void); +void sunrpc_debugfs_exit(void); +void rpc_clnt_debugfs_register(struct rpc_clnt *); +void rpc_clnt_debugfs_unregister(struct rpc_clnt *); +void rpc_xprt_debugfs_register(struct rpc_xprt *); +void rpc_xprt_debugfs_unregister(struct rpc_xprt *); +#else +static inline void +sunrpc_debugfs_init(void) +{ + return; +} + +static inline void +sunrpc_debugfs_exit(void) +{ + return; +} + +static inline void +rpc_clnt_debugfs_register(struct rpc_clnt *clnt) +{ + return; +} + +static inline void +rpc_clnt_debugfs_unregister(struct rpc_clnt *clnt) +{ + return; +} + +static inline void +rpc_xprt_debugfs_register(struct rpc_xprt *xprt) +{ + return; +} + +static inline void +rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt) +{ + return; +} +#endif + +#endif /* _LINUX_SUNRPC_DEBUG_H_ */ diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h new file mode 100644 index 0000000..5ac5db4 --- /dev/null +++ b/include/linux/sunrpc/gss_api.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/gss_api.h + * + * Somewhat simplified version of the gss api. + * + * Dug Song + * Andy Adamson + * Bruce Fields + * Copyright (c) 2000 The Regents of the University of Michigan + */ + +#ifndef _LINUX_SUNRPC_GSS_API_H +#define _LINUX_SUNRPC_GSS_API_H + +#ifdef __KERNEL__ +#include +#include +#include + +/* The mechanism-independent gss-api context: */ +struct gss_ctx { + struct gss_api_mech *mech_type; + void *internal_ctx_id; +}; + +#define GSS_C_NO_BUFFER ((struct xdr_netobj) 0) +#define GSS_C_NO_CONTEXT ((struct gss_ctx *) 0) +#define GSS_C_QOP_DEFAULT (0) + +/*XXX arbitrary length - is this set somewhere? */ +#define GSS_OID_MAX_LEN 32 +struct rpcsec_gss_oid { + unsigned int len; + u8 data[GSS_OID_MAX_LEN]; +}; + +/* From RFC 3530 */ +struct rpcsec_gss_info { + struct rpcsec_gss_oid oid; + u32 qop; + u32 service; +}; + +/* gss-api prototypes; note that these are somewhat simplified versions of + * the prototypes specified in RFC 2744. */ +int gss_import_sec_context( + const void* input_token, + size_t bufsize, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id, + time_t *endtime, + gfp_t gfp_mask); +u32 gss_get_mic( + struct gss_ctx *ctx_id, + struct xdr_buf *message, + struct xdr_netobj *mic_token); +u32 gss_verify_mic( + struct gss_ctx *ctx_id, + struct xdr_buf *message, + struct xdr_netobj *mic_token); +u32 gss_wrap( + struct gss_ctx *ctx_id, + int offset, + struct xdr_buf *outbuf, + struct page **inpages); +u32 gss_unwrap( + struct gss_ctx *ctx_id, + int offset, + struct xdr_buf *inbuf); +u32 gss_delete_sec_context( + struct gss_ctx **ctx_id); + +rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop, + u32 service); +u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor); +bool gss_pseudoflavor_to_datatouch(struct gss_api_mech *, u32 pseudoflavor); +char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service); + +struct pf_desc { + u32 pseudoflavor; + u32 qop; + u32 service; + char *name; + char *auth_domain_name; + bool datatouch; +}; + +/* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and + * mechanisms may be dynamically registered or unregistered by modules. 
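+ *
+ * A registration sketch (the "demo" mechanism and its ops table are
+ * hypothetical, and fields such as gm_oid are elided; the krb5 mechanism
+ * is the real in-tree user of this interface):
+ *
+ *	static struct gss_api_mech demo_mech = {
+ *		.gm_name	= "demo",
+ *		.gm_owner	= THIS_MODULE,
+ *		.gm_ops		= &demo_ops,
+ *		.gm_pf_num	= ARRAY_SIZE(demo_pfs),
+ *		.gm_pfs		= demo_pfs,
+ *	};
+ *
+ * The module init function then calls gss_mech_register(&demo_mech) and
+ * the exit function calls gss_mech_unregister(&demo_mech).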
*/ + +/* Each mechanism is described by the following struct: */ +struct gss_api_mech { + struct list_head gm_list; + struct module *gm_owner; + struct rpcsec_gss_oid gm_oid; + char *gm_name; + const struct gss_api_ops *gm_ops; + /* pseudoflavors supported by this mechanism: */ + int gm_pf_num; + struct pf_desc * gm_pfs; + /* Should the following be a callback operation instead? */ + const char *gm_upcall_enctypes; +}; + +/* and must provide the following operations: */ +struct gss_api_ops { + int (*gss_import_sec_context)( + const void *input_token, + size_t bufsize, + struct gss_ctx *ctx_id, + time_t *endtime, + gfp_t gfp_mask); + u32 (*gss_get_mic)( + struct gss_ctx *ctx_id, + struct xdr_buf *message, + struct xdr_netobj *mic_token); + u32 (*gss_verify_mic)( + struct gss_ctx *ctx_id, + struct xdr_buf *message, + struct xdr_netobj *mic_token); + u32 (*gss_wrap)( + struct gss_ctx *ctx_id, + int offset, + struct xdr_buf *outbuf, + struct page **inpages); + u32 (*gss_unwrap)( + struct gss_ctx *ctx_id, + int offset, + struct xdr_buf *buf); + void (*gss_delete_sec_context)( + void *internal_ctx_id); +}; + +int gss_mech_register(struct gss_api_mech *); +void gss_mech_unregister(struct gss_api_mech *); + +/* returns a mechanism descriptor given an OID, and increments the mechanism's + * reference count. */ +struct gss_api_mech * gss_mech_get_by_OID(struct rpcsec_gss_oid *); + +/* Given a GSS security tuple, look up a pseudoflavor */ +rpc_authflavor_t gss_mech_info2flavor(struct rpcsec_gss_info *); + +/* Given a pseudoflavor, look up a GSS security tuple */ +int gss_mech_flavor2info(rpc_authflavor_t, struct rpcsec_gss_info *); + +/* Returns a reference to a mechanism, given a name like "krb5" etc. */ +struct gss_api_mech *gss_mech_get_by_name(const char *); + +/* Similar, but get by pseudoflavor. */ +struct gss_api_mech *gss_mech_get_by_pseudoflavor(u32); + +/* Fill in an array with a list of supported pseudoflavors */ +int gss_mech_list_pseudoflavors(rpc_authflavor_t *, int); + +struct gss_api_mech * gss_mech_get(struct gss_api_mech *); + +/* For every successful gss_mech_get or gss_mech_get_by_* call there must be a + * corresponding call to gss_mech_put. */ +void gss_mech_put(struct gss_api_mech *); + +#endif /* __KERNEL__ */ +#endif /* _LINUX_SUNRPC_GSS_API_H */ + diff --git a/include/linux/sunrpc/gss_asn1.h b/include/linux/sunrpc/gss_asn1.h new file mode 100644 index 0000000..3ccecd0 --- /dev/null +++ b/include/linux/sunrpc/gss_asn1.h @@ -0,0 +1,81 @@ +/* + * linux/include/linux/sunrpc/gss_asn1.h + * + * minimal asn1 for generic encoding/decoding of gss tokens + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. 
+ * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + + +#include + +#define SIZEOF_INT 4 + +/* from gssapi_err_generic.h */ +#define G_BAD_SERVICE_NAME (-2045022976L) +#define G_BAD_STRING_UID (-2045022975L) +#define G_NOUSER (-2045022974L) +#define G_VALIDATE_FAILED (-2045022973L) +#define G_BUFFER_ALLOC (-2045022972L) +#define G_BAD_MSG_CTX (-2045022971L) +#define G_WRONG_SIZE (-2045022970L) +#define G_BAD_USAGE (-2045022969L) +#define G_UNKNOWN_QOP (-2045022968L) +#define G_NO_HOSTNAME (-2045022967L) +#define G_BAD_HOSTNAME (-2045022966L) +#define G_WRONG_MECH (-2045022965L) +#define G_BAD_TOK_HEADER (-2045022964L) +#define G_BAD_DIRECTION (-2045022963L) +#define G_TOK_TRUNC (-2045022962L) +#define G_REFLECT (-2045022961L) +#define G_WRONG_TOKID (-2045022960L) + +#define g_OID_equal(o1,o2) \ + (((o1)->len == (o2)->len) && \ + (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0)) + +u32 g_verify_token_header( + struct xdr_netobj *mech, + int *body_size, + unsigned char **buf_in, + int toksize); + +int g_token_size( + struct xdr_netobj *mech, + unsigned int body_size); + +void g_make_token_header( + struct xdr_netobj *mech, + int body_size, + unsigned char **buf); diff --git a/include/linux/sunrpc/gss_err.h b/include/linux/sunrpc/gss_err.h new file mode 100644 index 0000000..a680786 --- /dev/null +++ b/include/linux/sunrpc/gss_err.h @@ -0,0 +1,167 @@ +/* + * linux/include/sunrpc/gss_err.h + * + * Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. 
+ * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _LINUX_SUNRPC_GSS_ERR_H +#define _LINUX_SUNRPC_GSS_ERR_H + +#ifdef __KERNEL__ + +typedef unsigned int OM_uint32; + +/* + * Flag bits for context-level services. + */ +#define GSS_C_DELEG_FLAG 1 +#define GSS_C_MUTUAL_FLAG 2 +#define GSS_C_REPLAY_FLAG 4 +#define GSS_C_SEQUENCE_FLAG 8 +#define GSS_C_CONF_FLAG 16 +#define GSS_C_INTEG_FLAG 32 +#define GSS_C_ANON_FLAG 64 +#define GSS_C_PROT_READY_FLAG 128 +#define GSS_C_TRANS_FLAG 256 + +/* + * Credential usage options + */ +#define GSS_C_BOTH 0 +#define GSS_C_INITIATE 1 +#define GSS_C_ACCEPT 2 + +/* + * Status code types for gss_display_status + */ +#define GSS_C_GSS_CODE 1 +#define GSS_C_MECH_CODE 2 + + +/* + * Expiration time of 2^32-1 seconds means infinite lifetime for a + * credential or security context + */ +#define GSS_C_INDEFINITE ((OM_uint32) 0xfffffffful) + + +/* Major status codes */ + +#define GSS_S_COMPLETE 0 + +/* + * Some "helper" definitions to make the status code macros obvious. + */ +#define GSS_C_CALLING_ERROR_OFFSET 24 +#define GSS_C_ROUTINE_ERROR_OFFSET 16 +#define GSS_C_SUPPLEMENTARY_OFFSET 0 +#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul) + +/* + * The macros that test status codes for error conditions. Note that the + * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now + * evaluates its argument only once. 
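+ *
+ * For example (illustrative), a caller holding a major status "maj" can
+ * test it and pick out the routine-error field like this:
+ *
+ *	if (GSS_ERROR(maj)) {
+ *		if (GSS_ROUTINE_ERROR(maj) == GSS_S_FAILURE)
+ *			handle_generic_failure();
+ *	}
+ *
+ * The GSS_S_* routine-error constants below are already shifted into
+ * position, so they compare directly against GSS_ROUTINE_ERROR(maj).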
+ */ +#define GSS_CALLING_ERROR(x) \ + ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET)) +#define GSS_ROUTINE_ERROR(x) \ + ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)) +#define GSS_SUPPLEMENTARY_INFO(x) \ + ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET)) +#define GSS_ERROR(x) \ + ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \ + (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))) + +/* + * Now the actual status code definitions + */ + +/* + * Calling errors: + */ +#define GSS_S_CALL_INACCESSIBLE_READ \ + (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_INACCESSIBLE_WRITE \ + (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_BAD_STRUCTURE \ + (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET) + +/* + * Routine errors: + */ +#define GSS_S_BAD_MECH (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAME (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAMETYPE (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_BINDINGS (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_STATUS (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_SIG (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CRED (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CONTEXT (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_TOKEN (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_CREDENTIAL \ + (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CREDENTIALS_EXPIRED \ + (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CONTEXT_EXPIRED \ + (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_FAILURE (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_QOP (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAUTHORIZED (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAVAILABLE (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DUPLICATE_ELEMENT \ + (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NAME_NOT_MN \ + (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET) + +/* + * Supplementary info bits: + */ +#define GSS_S_CONTINUE_NEEDED (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0)) +#define GSS_S_DUPLICATE_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1)) +#define GSS_S_OLD_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2)) +#define GSS_S_UNSEQ_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3)) +#define GSS_S_GAP_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4)) + +/* XXXX these are not part of the GSSAPI C bindings! 
(but should be) */ + +#define GSS_CALLING_ERROR_FIELD(x) \ + (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK) +#define GSS_ROUTINE_ERROR_FIELD(x) \ + (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK) +#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \ + (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK) + +/* XXXX This is a necessary evil until the spec is fixed */ +#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE + +#endif /* __KERNEL__ */ +#endif /* __LINUX_SUNRPC_GSS_ERR_H */ diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h new file mode 100644 index 0000000..7df625d --- /dev/null +++ b/include/linux/sunrpc/gss_krb5.h @@ -0,0 +1,331 @@ +/* + * linux/include/linux/sunrpc/gss_krb5_types.h + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000-2008 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Bruce Fields + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#include +#include +#include +#include + +/* Length of constant used in key derivation */ +#define GSS_KRB5_K5CLENGTH (5) + +/* Maximum key length (in bytes) for the supported crypto algorithms*/ +#define GSS_KRB5_MAX_KEYLEN (32) + +/* Maximum checksum function output for the supported crypto algorithms */ +#define GSS_KRB5_MAX_CKSUM_LEN (20) + +/* Maximum blocksize for the supported crypto algorithms */ +#define GSS_KRB5_MAX_BLOCKSIZE (16) + +struct krb5_ctx; + +struct gss_krb5_enctype { + const u32 etype; /* encryption (key) type */ + const u32 ctype; /* checksum type */ + const char *name; /* "friendly" name */ + const char *encrypt_name; /* crypto encrypt name */ + const char *cksum_name; /* crypto checksum name */ + const u16 signalg; /* signing algorithm */ + const u16 sealalg; /* sealing algorithm */ + const u32 blocksize; /* encryption blocksize */ + const u32 conflen; /* confounder length + (normally the same as + the blocksize) */ + const u32 cksumlength; /* checksum length */ + const u32 keyed_cksum; /* is it a keyed cksum? 
*/ + const u32 keybytes; /* raw key len, in bytes */ + const u32 keylength; /* final key len, in bytes */ + u32 (*encrypt) (struct crypto_skcipher *tfm, + void *iv, void *in, void *out, + int length); /* encryption function */ + u32 (*decrypt) (struct crypto_skcipher *tfm, + void *iv, void *in, void *out, + int length); /* decryption function */ + u32 (*mk_key) (const struct gss_krb5_enctype *gk5e, + struct xdr_netobj *in, + struct xdr_netobj *out); /* complete key generation */ + u32 (*encrypt_v2) (struct krb5_ctx *kctx, u32 offset, + struct xdr_buf *buf, + struct page **pages); /* v2 encryption function */ + u32 (*decrypt_v2) (struct krb5_ctx *kctx, u32 offset, + struct xdr_buf *buf, u32 *headskip, + u32 *tailskip); /* v2 decryption function */ +}; + +/* krb5_ctx flags definitions */ +#define KRB5_CTX_FLAG_INITIATOR 0x00000001 +#define KRB5_CTX_FLAG_CFX 0x00000002 +#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY 0x00000004 + +struct krb5_ctx { + int initiate; /* 1 = initiating, 0 = accepting */ + u32 enctype; + u32 flags; + const struct gss_krb5_enctype *gk5e; /* enctype-specific info */ + struct crypto_skcipher *enc; + struct crypto_skcipher *seq; + struct crypto_skcipher *acceptor_enc; + struct crypto_skcipher *initiator_enc; + struct crypto_skcipher *acceptor_enc_aux; + struct crypto_skcipher *initiator_enc_aux; + u8 Ksess[GSS_KRB5_MAX_KEYLEN]; /* session key */ + u8 cksum[GSS_KRB5_MAX_KEYLEN]; + s32 endtime; + u32 seq_send; + u64 seq_send64; + struct xdr_netobj mech_used; + u8 initiator_sign[GSS_KRB5_MAX_KEYLEN]; + u8 acceptor_sign[GSS_KRB5_MAX_KEYLEN]; + u8 initiator_seal[GSS_KRB5_MAX_KEYLEN]; + u8 acceptor_seal[GSS_KRB5_MAX_KEYLEN]; + u8 initiator_integ[GSS_KRB5_MAX_KEYLEN]; + u8 acceptor_integ[GSS_KRB5_MAX_KEYLEN]; +}; + +extern spinlock_t krb5_seq_lock; + +/* The length of the Kerberos GSS token header */ +#define GSS_KRB5_TOK_HDR_LEN (16) + +#define KG_TOK_MIC_MSG 0x0101 +#define KG_TOK_WRAP_MSG 0x0201 + +#define KG2_TOK_INITIAL 0x0101 +#define KG2_TOK_RESPONSE 0x0202 +#define KG2_TOK_MIC 0x0404 +#define KG2_TOK_WRAP 0x0504 + +#define KG2_TOKEN_FLAG_SENTBYACCEPTOR 0x01 +#define KG2_TOKEN_FLAG_SEALED 0x02 +#define KG2_TOKEN_FLAG_ACCEPTORSUBKEY 0x04 + +#define KG2_RESP_FLAG_ERROR 0x0001 +#define KG2_RESP_FLAG_DELEG_OK 0x0002 + +enum sgn_alg { + SGN_ALG_DES_MAC_MD5 = 0x0000, + SGN_ALG_MD2_5 = 0x0001, + SGN_ALG_DES_MAC = 0x0002, + SGN_ALG_3 = 0x0003, /* not published */ + SGN_ALG_HMAC_MD5 = 0x0011, /* microsoft w2k; no support */ + SGN_ALG_HMAC_SHA1_DES3_KD = 0x0004 +}; +enum seal_alg { + SEAL_ALG_NONE = 0xffff, + SEAL_ALG_DES = 0x0000, + SEAL_ALG_1 = 0x0001, /* not published */ + SEAL_ALG_MICROSOFT_RC4 = 0x0010,/* microsoft w2k; no support */ + SEAL_ALG_DES3KD = 0x0002 +}; + +#define CKSUMTYPE_CRC32 0x0001 +#define CKSUMTYPE_RSA_MD4 0x0002 +#define CKSUMTYPE_RSA_MD4_DES 0x0003 +#define CKSUMTYPE_DESCBC 0x0004 +#define CKSUMTYPE_RSA_MD5 0x0007 +#define CKSUMTYPE_RSA_MD5_DES 0x0008 +#define CKSUMTYPE_NIST_SHA 0x0009 +#define CKSUMTYPE_HMAC_SHA1_DES3 0x000c +#define CKSUMTYPE_HMAC_SHA1_96_AES128 0x000f +#define CKSUMTYPE_HMAC_SHA1_96_AES256 0x0010 +#define CKSUMTYPE_HMAC_MD5_ARCFOUR -138 /* Microsoft md5 hmac cksumtype */ + +/* from gssapi_err_krb5.h */ +#define KG_CCACHE_NOMATCH (39756032L) +#define KG_KEYTAB_NOMATCH (39756033L) +#define KG_TGT_MISSING (39756034L) +#define KG_NO_SUBKEY (39756035L) +#define KG_CONTEXT_ESTABLISHED (39756036L) +#define KG_BAD_SIGN_TYPE (39756037L) +#define KG_BAD_LENGTH (39756038L) +#define KG_CTX_INCOMPLETE (39756039L) +#define KG_CONTEXT (39756040L) +#define 
KG_CRED (39756041L) +#define KG_ENC_DESC (39756042L) +#define KG_BAD_SEQ (39756043L) +#define KG_EMPTY_CCACHE (39756044L) +#define KG_NO_CTYPES (39756045L) + +/* per Kerberos v5 protocol spec crypto types from the wire. + * these get mapped to linux kernel crypto routines. + */ +#define ENCTYPE_NULL 0x0000 +#define ENCTYPE_DES_CBC_CRC 0x0001 /* DES cbc mode with CRC-32 */ +#define ENCTYPE_DES_CBC_MD4 0x0002 /* DES cbc mode with RSA-MD4 */ +#define ENCTYPE_DES_CBC_MD5 0x0003 /* DES cbc mode with RSA-MD5 */ +#define ENCTYPE_DES_CBC_RAW 0x0004 /* DES cbc mode raw */ +/* XXX deprecated? */ +#define ENCTYPE_DES3_CBC_SHA 0x0005 /* DES-3 cbc mode with NIST-SHA */ +#define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */ +#define ENCTYPE_DES_HMAC_SHA1 0x0008 +#define ENCTYPE_DES3_CBC_SHA1 0x0010 +#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011 +#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012 +#define ENCTYPE_ARCFOUR_HMAC 0x0017 +#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018 +#define ENCTYPE_UNKNOWN 0x01ff + +/* + * Constants used for key derivation + */ +/* for 3DES */ +#define KG_USAGE_SEAL (22) +#define KG_USAGE_SIGN (23) +#define KG_USAGE_SEQ (24) + +/* from rfc3961 */ +#define KEY_USAGE_SEED_CHECKSUM (0x99) +#define KEY_USAGE_SEED_ENCRYPTION (0xAA) +#define KEY_USAGE_SEED_INTEGRITY (0x55) + +/* from rfc4121 */ +#define KG_USAGE_ACCEPTOR_SEAL (22) +#define KG_USAGE_ACCEPTOR_SIGN (23) +#define KG_USAGE_INITIATOR_SEAL (24) +#define KG_USAGE_INITIATOR_SIGN (25) + +/* + * This compile-time check verifies that we will not exceed the + * slack space allotted by the client and server auth_gss code + * before they call gss_wrap(). + */ +#define GSS_KRB5_MAX_SLACK_NEEDED \ + (GSS_KRB5_TOK_HDR_LEN /* gss token header */ \ + + GSS_KRB5_MAX_CKSUM_LEN /* gss token checksum */ \ + + GSS_KRB5_MAX_BLOCKSIZE /* confounder */ \ + + GSS_KRB5_MAX_BLOCKSIZE /* possible padding */ \ + + GSS_KRB5_TOK_HDR_LEN /* encrypted hdr in v2 token */\ + + GSS_KRB5_MAX_CKSUM_LEN /* encryption hmac */ \ + + 4 + 4 /* RPC verifier */ \ + + GSS_KRB5_TOK_HDR_LEN \ + + GSS_KRB5_MAX_CKSUM_LEN) + +u32 +make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, + struct xdr_buf *body, int body_offset, u8 *cksumkey, + unsigned int usage, struct xdr_netobj *cksumout); + +u32 +make_checksum_v2(struct krb5_ctx *, char *header, int hdrlen, + struct xdr_buf *body, int body_offset, u8 *key, + unsigned int usage, struct xdr_netobj *cksum); + +u32 gss_get_mic_kerberos(struct gss_ctx *, struct xdr_buf *, + struct xdr_netobj *); + +u32 gss_verify_mic_kerberos(struct gss_ctx *, struct xdr_buf *, + struct xdr_netobj *); + +u32 +gss_wrap_kerberos(struct gss_ctx *ctx_id, int offset, + struct xdr_buf *outbuf, struct page **pages); + +u32 +gss_unwrap_kerberos(struct gss_ctx *ctx_id, int offset, + struct xdr_buf *buf); + + +u32 +krb5_encrypt(struct crypto_skcipher *key, + void *iv, void *in, void *out, int length); + +u32 +krb5_decrypt(struct crypto_skcipher *key, + void *iv, void *in, void *out, int length); + +int +gss_encrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *outbuf, + int offset, struct page **pages); + +int +gss_decrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *inbuf, + int offset); + +s32 +krb5_make_seq_num(struct krb5_ctx *kctx, + struct crypto_skcipher *key, + int direction, + u32 seqnum, unsigned char *cksum, unsigned char *buf); + +s32 +krb5_get_seq_num(struct krb5_ctx *kctx, + unsigned char *cksum, + unsigned char *buf, int *direction, u32 *seqnum); + +int +xdr_extend_head(struct xdr_buf *buf, unsigned int base, 
unsigned int shiftlen); + +u32 +krb5_derive_key(const struct gss_krb5_enctype *gk5e, + const struct xdr_netobj *inkey, + struct xdr_netobj *outkey, + const struct xdr_netobj *in_constant, + gfp_t gfp_mask); + +u32 +gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e, + struct xdr_netobj *randombits, + struct xdr_netobj *key); + +u32 +gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e, + struct xdr_netobj *randombits, + struct xdr_netobj *key); + +u32 +gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, + struct xdr_buf *buf, + struct page **pages); + +u32 +gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, + struct xdr_buf *buf, u32 *plainoffset, + u32 *plainlen); + +int +krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, + struct crypto_skcipher *cipher, + unsigned char *cksum); + +int +krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, + struct crypto_skcipher *cipher, + s32 seqnum); +void +gss_krb5_make_confounder(char *p, u32 conflen); diff --git a/include/linux/sunrpc/gss_krb5_enctypes.h b/include/linux/sunrpc/gss_krb5_enctypes.h new file mode 100644 index 0000000..ec6234e --- /dev/null +++ b/include/linux/sunrpc/gss_krb5_enctypes.h @@ -0,0 +1,4 @@ +/* + * Dumb way to share this static piece of information with nfsd + */ +#define KRB5_SUPPORTED_ENCTYPES "18,17,16,23,3,1,2" diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h new file mode 100644 index 0000000..9baed7b --- /dev/null +++ b/include/linux/sunrpc/metrics.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/metrics.h + * + * Declarations for RPC client per-operation metrics + * + * Copyright (C) 2005 Chuck Lever + * + * RPC client per-operation statistics provide latency and retry + * information about each type of RPC procedure in a given RPC program. + * These statistics are not for detailed problem diagnosis, but simply + * to indicate whether the problem is local or remote. + * + * These counters are not meant to be human-readable, but are meant to be + * integrated into system monitoring tools such as "sar" and "iostat". As + * such, the counters are sampled by the tools over time, and are never + * zeroed after a file system is mounted. Moving averages can be computed + * by the tools by taking the difference between two instantaneous samples + * and dividing that by the time between the samples. + * + * The counters are maintained in a single array per RPC client, indexed + * by procedure number. There is no need to maintain separate counter + * arrays per-CPU because these counters are always modified behind locks. + */ + +#ifndef _LINUX_SUNRPC_METRICS_H +#define _LINUX_SUNRPC_METRICS_H + +#include +#include +#include + +#define RPC_IOSTATS_VERS "1.0" + +struct rpc_iostats { + spinlock_t om_lock; + + /* + * These counters give an idea about how many request + * transmissions are required, on average, to complete that + * particular procedure. Some procedures may require more + * than one transmission because the server is unresponsive, + * the client is retransmitting too aggressively, or the + * requests are large and the network is congested. + */ + unsigned long om_ops, /* count of operations */ + om_ntrans, /* count of RPC transmissions */ + om_timeouts; /* count of major timeouts */ + + /* + * These count how many bytes are sent and received for a + * given RPC procedure type. This indicates how much load a + * particular procedure is putting on the network. 
These + * counts include the RPC and ULP headers, and the request + * payload. + */ + unsigned long long om_bytes_sent, /* count of bytes out */ + om_bytes_recv; /* count of bytes in */ + + /* + * The length of time an RPC request waits in queue before + * transmission, the network + server latency of the request, + * and the total time the request spent from init to release + * are measured. + */ + ktime_t om_queue, /* queued for xmit */ + om_rtt, /* RPC RTT */ + om_execute; /* RPC execution */ +} ____cacheline_aligned; + +struct rpc_task; +struct rpc_clnt; + +/* + * EXPORTed functions for managing rpc_iostats structures + */ + +#ifdef CONFIG_PROC_FS + +struct rpc_iostats * rpc_alloc_iostats(struct rpc_clnt *); +void rpc_count_iostats(const struct rpc_task *, + struct rpc_iostats *); +void rpc_count_iostats_metrics(const struct rpc_task *, + struct rpc_iostats *); +void rpc_print_iostats(struct seq_file *, struct rpc_clnt *); +void rpc_free_iostats(struct rpc_iostats *); + +#else /* CONFIG_PROC_FS */ + +static inline struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { return NULL; } +static inline void rpc_count_iostats(const struct rpc_task *task, + struct rpc_iostats *stats) {} +static inline void rpc_count_iostats_metrics(const struct rpc_task *task, + struct rpc_iostats *stats) +{ +} + +static inline void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) {} +static inline void rpc_free_iostats(struct rpc_iostats *stats) {} + +#endif /* CONFIG_PROC_FS */ + +#endif /* _LINUX_SUNRPC_METRICS_H */ diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h new file mode 100644 index 0000000..4722b28 --- /dev/null +++ b/include/linux/sunrpc/msg_prot.h @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/msg_prot.h + * + * Copyright (C) 1996, Olaf Kirch + */ + +#ifndef _LINUX_SUNRPC_MSGPROT_H_ +#define _LINUX_SUNRPC_MSGPROT_H_ + +#ifdef __KERNEL__ /* user programs should get these from the rpc header files */ + +#define RPC_VERSION 2 + +/* size of an XDR encoding unit in bytes, i.e. 
32bit */ +#define XDR_UNIT (4) + +/* spec defines authentication flavor as an unsigned 32 bit integer */ +typedef u32 rpc_authflavor_t; + +enum rpc_auth_flavors { + RPC_AUTH_NULL = 0, + RPC_AUTH_UNIX = 1, + RPC_AUTH_SHORT = 2, + RPC_AUTH_DES = 3, + RPC_AUTH_KRB = 4, + RPC_AUTH_GSS = 6, + RPC_AUTH_MAXFLAVOR = 8, + /* pseudoflavors: */ + RPC_AUTH_GSS_KRB5 = 390003, + RPC_AUTH_GSS_KRB5I = 390004, + RPC_AUTH_GSS_KRB5P = 390005, + RPC_AUTH_GSS_LKEY = 390006, + RPC_AUTH_GSS_LKEYI = 390007, + RPC_AUTH_GSS_LKEYP = 390008, + RPC_AUTH_GSS_SPKM = 390009, + RPC_AUTH_GSS_SPKMI = 390010, + RPC_AUTH_GSS_SPKMP = 390011, +}; + +/* Maximum size (in bytes) of an rpc credential or verifier */ +#define RPC_MAX_AUTH_SIZE (400) + +enum rpc_msg_type { + RPC_CALL = 0, + RPC_REPLY = 1 +}; + +enum rpc_reply_stat { + RPC_MSG_ACCEPTED = 0, + RPC_MSG_DENIED = 1 +}; + +enum rpc_accept_stat { + RPC_SUCCESS = 0, + RPC_PROG_UNAVAIL = 1, + RPC_PROG_MISMATCH = 2, + RPC_PROC_UNAVAIL = 3, + RPC_GARBAGE_ARGS = 4, + RPC_SYSTEM_ERR = 5, + /* internal use only */ + RPC_DROP_REPLY = 60000, +}; + +enum rpc_reject_stat { + RPC_MISMATCH = 0, + RPC_AUTH_ERROR = 1 +}; + +enum rpc_auth_stat { + RPC_AUTH_OK = 0, + RPC_AUTH_BADCRED = 1, + RPC_AUTH_REJECTEDCRED = 2, + RPC_AUTH_BADVERF = 3, + RPC_AUTH_REJECTEDVERF = 4, + RPC_AUTH_TOOWEAK = 5, + /* RPCSEC_GSS errors */ + RPCSEC_GSS_CREDPROBLEM = 13, + RPCSEC_GSS_CTXPROBLEM = 14 +}; + +#define RPC_MAXNETNAMELEN 256 + +/* + * From RFC 1831: + * + * "A record is composed of one or more record fragments. A record + * fragment is a four-byte header followed by 0 to (2**31) - 1 bytes of + * fragment data. The bytes encode an unsigned binary number; as with + * XDR integers, the byte order is from highest to lowest. The number + * encodes two values -- a boolean which indicates whether the fragment + * is the last fragment of the record (bit value 1 implies the fragment + * is the last fragment) and a 31-bit unsigned binary value which is the + * length in bytes of the fragment's data. The boolean value is the + * highest-order bit of the header; the length is the 31 low-order bits. + * (Note that this record specification is NOT in XDR standard form!)" + * + * The Linux RPC client always sends its requests in a single record + * fragment, limiting the maximum payload size for stream transports to + * 2GB. + */ + +typedef __be32 rpc_fraghdr; + +#define RPC_LAST_STREAM_FRAGMENT (1U << 31) +#define RPC_FRAGMENT_SIZE_MASK (~RPC_LAST_STREAM_FRAGMENT) +#define RPC_MAX_FRAGMENT_SIZE ((1U << 31) - 1) + +/* + * RPC call and reply header size as number of 32bit words (verifier + * size computed separately, see below) + */ +#define RPC_CALLHDRSIZE (6) +#define RPC_REPHDRSIZE (4) + + +/* + * Maximum RPC header size, including authentication, + * as number of 32bit words (see RFCs 1831, 1832). + * + * xid 1 xdr unit = 4 bytes + * mtype 1 + * rpc_version 1 + * program 1 + * prog_version 1 + * procedure 1 + * cred { + * flavor 1 + * length 1 + * body 100 xdr units = 400 bytes + * } + * verf { + * flavor 1 + * length 1 + * body 100 xdr units = 400 bytes + * } + * TOTAL 210 xdr units = 840 bytes + */ +#define RPC_MAX_HEADER_WITH_AUTH \ + (RPC_CALLHDRSIZE + 2*(2+RPC_MAX_AUTH_SIZE/4)) + +#define RPC_MAX_REPHEADER_WITH_AUTH \ + (RPC_REPHDRSIZE + (2 + RPC_MAX_AUTH_SIZE/4)) + +/* + * Well-known netids. 
See: + * + * http://www.iana.org/assignments/rpc-netids/rpc-netids.xhtml + */ +#define RPCBIND_NETID_UDP "udp" +#define RPCBIND_NETID_TCP "tcp" +#define RPCBIND_NETID_RDMA "rdma" +#define RPCBIND_NETID_SCTP "sctp" +#define RPCBIND_NETID_UDP6 "udp6" +#define RPCBIND_NETID_TCP6 "tcp6" +#define RPCBIND_NETID_RDMA6 "rdma6" +#define RPCBIND_NETID_SCTP6 "sctp6" +#define RPCBIND_NETID_LOCAL "local" + +/* + * Note that RFC 1833 does not put any size restrictions on the + * netid string, but all currently defined netid's fit in 5 bytes. + */ +#define RPCBIND_MAXNETIDLEN (5u) + +/* + * Universal addresses are introduced in RFC 1833 and further spelled + * out in RFC 3530. RPCBIND_MAXUADDRLEN defines a maximum byte length + * of a universal address for use in allocating buffers and character + * arrays. + * + * Quoting RFC 3530, section 2.2: + * + * For TCP over IPv4 and for UDP over IPv4, the format of r_addr is the + * US-ASCII string: + * + * h1.h2.h3.h4.p1.p2 + * + * The prefix, "h1.h2.h3.h4", is the standard textual form for + * representing an IPv4 address, which is always four octets long. + * Assuming big-endian ordering, h1, h2, h3, and h4, are respectively, + * the first through fourth octets each converted to ASCII-decimal. + * Assuming big-endian ordering, p1 and p2 are, respectively, the first + * and second octets each converted to ASCII-decimal. For example, if a + * host, in big-endian order, has an address of 0x0A010307 and there is + * a service listening on, in big endian order, port 0x020F (decimal + * 527), then the complete universal address is "10.1.3.7.2.15". + * + * ... + * + * For TCP over IPv6 and for UDP over IPv6, the format of r_addr is the + * US-ASCII string: + * + * x1:x2:x3:x4:x5:x6:x7:x8.p1.p2 + * + * The suffix "p1.p2" is the service port, and is computed the same way + * as with universal addresses for TCP and UDP over IPv4. The prefix, + * "x1:x2:x3:x4:x5:x6:x7:x8", is the standard textual form for + * representing an IPv6 address as defined in Section 2.2 of [RFC2373]. + * Additionally, the two alternative forms specified in Section 2.2 of + * [RFC2373] are also acceptable. + */ + +#include + +/* Maximum size of the port number part of a universal address */ +#define RPCBIND_MAXUADDRPLEN sizeof(".255.255") + +/* Maximum size of an IPv4 universal address */ +#define RPCBIND_MAXUADDR4LEN \ + (INET_ADDRSTRLEN + RPCBIND_MAXUADDRPLEN) + +/* Maximum size of an IPv6 universal address */ +#define RPCBIND_MAXUADDR6LEN \ + (INET6_ADDRSTRLEN + RPCBIND_MAXUADDRPLEN) + +/* Assume INET6_ADDRSTRLEN will always be larger than INET_ADDRSTRLEN... 
*/ +#define RPCBIND_MAXUADDRLEN RPCBIND_MAXUADDR6LEN + +#endif /* __KERNEL__ */ +#endif /* _LINUX_SUNRPC_MSGPROT_H_ */ diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h new file mode 100644 index 0000000..a5704da --- /dev/null +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SUNRPC_RPC_PIPE_FS_H +#define _LINUX_SUNRPC_RPC_PIPE_FS_H + +#ifdef __KERNEL__ + +#include + +struct rpc_pipe_dir_head { + struct list_head pdh_entries; + struct dentry *pdh_dentry; +}; + +struct rpc_pipe_dir_object_ops; +struct rpc_pipe_dir_object { + struct list_head pdo_head; + const struct rpc_pipe_dir_object_ops *pdo_ops; + + void *pdo_data; +}; + +struct rpc_pipe_dir_object_ops { + int (*create)(struct dentry *dir, + struct rpc_pipe_dir_object *pdo); + void (*destroy)(struct dentry *dir, + struct rpc_pipe_dir_object *pdo); +}; + +struct rpc_pipe_msg { + struct list_head list; + void *data; + size_t len; + size_t copied; + int errno; +}; + +struct rpc_pipe_ops { + ssize_t (*upcall)(struct file *, struct rpc_pipe_msg *, char __user *, size_t); + ssize_t (*downcall)(struct file *, const char __user *, size_t); + void (*release_pipe)(struct inode *); + int (*open_pipe)(struct inode *); + void (*destroy_msg)(struct rpc_pipe_msg *); +}; + +struct rpc_pipe { + struct list_head pipe; + struct list_head in_upcall; + struct list_head in_downcall; + int pipelen; + int nreaders; + int nwriters; +#define RPC_PIPE_WAIT_FOR_OPEN 1 + int flags; + struct delayed_work queue_timeout; + const struct rpc_pipe_ops *ops; + spinlock_t lock; + struct dentry *dentry; +}; + +struct rpc_inode { + struct inode vfs_inode; + void *private; + struct rpc_pipe *pipe; + wait_queue_head_t waitq; +}; + +static inline struct rpc_inode * +RPC_I(struct inode *inode) +{ + return container_of(inode, struct rpc_inode, vfs_inode); +} + +enum { + SUNRPC_PIPEFS_NFS_PRIO, + SUNRPC_PIPEFS_RPC_PRIO, +}; + +extern int rpc_pipefs_notifier_register(struct notifier_block *); +extern void rpc_pipefs_notifier_unregister(struct notifier_block *); + +enum { + RPC_PIPEFS_MOUNT, + RPC_PIPEFS_UMOUNT, +}; + +extern struct dentry *rpc_d_lookup_sb(const struct super_block *sb, + const unsigned char *dir_name); +extern int rpc_pipefs_init_net(struct net *net); +extern void rpc_pipefs_exit_net(struct net *net); +extern struct super_block *rpc_get_sb_net(const struct net *net); +extern void rpc_put_sb_net(const struct net *net); + +extern ssize_t rpc_pipe_generic_upcall(struct file *, struct rpc_pipe_msg *, + char __user *, size_t); +extern int rpc_queue_upcall(struct rpc_pipe *, struct rpc_pipe_msg *); + +struct rpc_clnt; +extern struct dentry *rpc_create_client_dir(struct dentry *, const char *, struct rpc_clnt *); +extern int rpc_remove_client_dir(struct rpc_clnt *); + +extern void rpc_init_pipe_dir_head(struct rpc_pipe_dir_head *pdh); +extern void rpc_init_pipe_dir_object(struct rpc_pipe_dir_object *pdo, + const struct rpc_pipe_dir_object_ops *pdo_ops, + void *pdo_data); +extern int rpc_add_pipe_dir_object(struct net *net, + struct rpc_pipe_dir_head *pdh, + struct rpc_pipe_dir_object *pdo); +extern void rpc_remove_pipe_dir_object(struct net *net, + struct rpc_pipe_dir_head *pdh, + struct rpc_pipe_dir_object *pdo); +extern struct rpc_pipe_dir_object *rpc_find_or_alloc_pipe_dir_object( + struct net *net, + struct rpc_pipe_dir_head *pdh, + int (*match)(struct rpc_pipe_dir_object *, void *), + struct rpc_pipe_dir_object *(*alloc)(void *), + void *data); + +struct cache_detail; 
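+
+/*
+ * A minimal sketch of how the declarations above fit together for a
+ * kernel-side pipe user: rpc_pipe_generic_upcall() serves reads, and the
+ * caller queues preformatted messages with rpc_queue_upcall().  The
+ * "demo_*" names are illustrative only, and the block is guarded by
+ * "#if 0" so it is never compiled.
+ */
+#if 0
+static void demo_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	/* The payload buffer belongs to the caller; nothing to free here. */
+	msg->copied = 0;
+}
+
+static const struct rpc_pipe_ops demo_pipe_ops = {
+	.upcall		= rpc_pipe_generic_upcall,
+	.destroy_msg	= demo_destroy_msg,
+};
+
+/* Queue a caller-owned buffer for a user-space daemon to read. */
+static int demo_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg,
+			     void *data, size_t len)
+{
+	memset(msg, 0, sizeof(*msg));
+	msg->data = data;
+	msg->len = len;
+	return rpc_queue_upcall(pipe, msg);
+}
+#endif
+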
+extern struct dentry *rpc_create_cache_dir(struct dentry *, + const char *, + umode_t umode, + struct cache_detail *); +extern void rpc_remove_cache_dir(struct dentry *); + +extern int rpc_rmdir(struct dentry *dentry); + +struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags); +void rpc_destroy_pipe_data(struct rpc_pipe *pipe); +extern struct dentry *rpc_mkpipe_dentry(struct dentry *, const char *, void *, + struct rpc_pipe *); +extern int rpc_unlink(struct dentry *); +extern int register_rpc_pipefs(void); +extern void unregister_rpc_pipefs(void); + +extern bool gssd_running(struct net *net); + +#endif +#endif diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h new file mode 100644 index 0000000..8f144db --- /dev/null +++ b/include/linux/sunrpc/rpc_rdma.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2015-2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _LINUX_SUNRPC_RPC_RDMA_H +#define _LINUX_SUNRPC_RPC_RDMA_H + +#include +#include + +#define RPCRDMA_VERSION 1 +#define rpcrdma_version cpu_to_be32(RPCRDMA_VERSION) + +enum { + RPCRDMA_V1_DEF_INLINE_SIZE = 1024, +}; + +/* + * XDR sizes, in quads + */ +enum { + rpcrdma_fixed_maxsz = 4, + rpcrdma_segment_maxsz = 4, + rpcrdma_readchunk_maxsz = 2 + rpcrdma_segment_maxsz, +}; + +/* + * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks + */ +#define RPCRDMA_HDRLEN_MIN (sizeof(__be32) * 7) +#define RPCRDMA_HDRLEN_ERR (sizeof(__be32) * 5) + +enum rpcrdma_errcode { + ERR_VERS = 1, + ERR_CHUNK = 2 +}; + +enum rpcrdma_proc { + RDMA_MSG = 0, /* An RPC call or reply msg */ + RDMA_NOMSG = 1, /* An RPC call or reply msg - separate body */ + RDMA_MSGP = 2, /* An RPC call or reply msg with padding */ + RDMA_DONE = 3, /* Client signals reply completion */ + RDMA_ERROR = 4 /* An RPC RDMA encoding error */ +}; + +#define rdma_msg cpu_to_be32(RDMA_MSG) +#define rdma_nomsg cpu_to_be32(RDMA_NOMSG) +#define rdma_msgp cpu_to_be32(RDMA_MSGP) +#define rdma_done cpu_to_be32(RDMA_DONE) +#define rdma_error cpu_to_be32(RDMA_ERROR) + +#define err_vers cpu_to_be32(ERR_VERS) +#define err_chunk cpu_to_be32(ERR_CHUNK) + +/* + * Private extension to RPC-over-RDMA Version One. + * Message passed during RDMA-CM connection set-up. + * + * Add new fields at the end, and don't permute existing + * fields. + */ +struct rpcrdma_connect_private { + __be32 cp_magic; + u8 cp_version; + u8 cp_flags; + u8 cp_send_size; + u8 cp_recv_size; +} __packed; + +#define rpcrdma_cmp_magic __cpu_to_be32(0xf6ab0e18) + +enum { + RPCRDMA_CMP_VERSION = 1, + RPCRDMA_CMP_F_SND_W_INV_OK = BIT(0), +}; + +static inline u8 +rpcrdma_encode_buffer_size(unsigned int size) +{ + return (size >> 10) - 1; +} + +static inline unsigned int +rpcrdma_decode_buffer_size(u8 val) +{ + return ((unsigned int)val + 1) << 10; +} + +#endif /* _LINUX_SUNRPC_RPC_RDMA_H */ diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h new file mode 100644 index 0000000..592653b --- /dev/null +++ b/include/linux/sunrpc/sched.h @@ -0,0 +1,302 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/sched.h + * + * Scheduling primitives for kernel Sun RPC. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#ifndef _LINUX_SUNRPC_SCHED_H_ +#define _LINUX_SUNRPC_SCHED_H_ + +#include +#include +#include +#include +#include +#include +#include + +/* + * This is the actual RPC procedure call info. 
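+ *
+ * An rpc_message names the procedure to run and carries pointers to the
+ * not-yet-encoded argument and result structures, plus the credential to
+ * use; it is typically handed, together with an rpc_clnt, to helpers
+ * such as rpc_call_sync() or rpc_call_async().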
+ */ +struct rpc_procinfo; +struct rpc_message { + const struct rpc_procinfo *rpc_proc; /* Procedure information */ + void * rpc_argp; /* Arguments */ + void * rpc_resp; /* Result */ + struct rpc_cred * rpc_cred; /* Credentials */ +}; + +struct rpc_call_ops; +struct rpc_wait_queue; +struct rpc_wait { + struct list_head list; /* wait queue links */ + struct list_head links; /* Links to related tasks */ + struct list_head timer_list; /* Timer list */ + unsigned long expires; +}; + +/* + * This is the RPC task struct + */ +struct rpc_task { + atomic_t tk_count; /* Reference count */ + int tk_status; /* result of last operation */ + struct list_head tk_task; /* global list of tasks */ + + /* + * callback to be executed after waking up + * action next procedure for async tasks + */ + void (*tk_callback)(struct rpc_task *); + void (*tk_action)(struct rpc_task *); + + unsigned long tk_timeout; /* timeout for rpc_sleep() */ + unsigned long tk_runstate; /* Task run status */ + + struct rpc_wait_queue *tk_waitqueue; /* RPC wait queue we're on */ + union { + struct work_struct tk_work; /* Async task work queue */ + struct rpc_wait tk_wait; /* RPC wait */ + } u; + + /* + * RPC call state + */ + struct rpc_message tk_msg; /* RPC call info */ + void * tk_calldata; /* Caller private data */ + const struct rpc_call_ops *tk_ops; /* Caller callbacks */ + + struct rpc_clnt * tk_client; /* RPC client */ + struct rpc_xprt * tk_xprt; /* Transport */ + + struct rpc_rqst * tk_rqstp; /* RPC request */ + + struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could + * be any workqueue + */ + ktime_t tk_start; /* RPC task init timestamp */ + + pid_t tk_owner; /* Process id for batching tasks */ + unsigned short tk_flags; /* misc flags */ + unsigned short tk_timeouts; /* maj timeouts */ + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) + unsigned short tk_pid; /* debugging aid */ +#endif + unsigned char tk_priority : 2,/* Task priority */ + tk_garb_retry : 2, + tk_cred_retry : 2, + tk_rebind_retry : 2; +}; + +typedef void (*rpc_action)(struct rpc_task *); + +struct rpc_call_ops { + void (*rpc_call_prepare)(struct rpc_task *, void *); + void (*rpc_call_done)(struct rpc_task *, void *); + void (*rpc_count_stats)(struct rpc_task *, void *); + void (*rpc_release)(void *); +}; + +struct rpc_task_setup { + struct rpc_task *task; + struct rpc_clnt *rpc_client; + struct rpc_xprt *rpc_xprt; + const struct rpc_message *rpc_message; + const struct rpc_call_ops *callback_ops; + void *callback_data; + struct workqueue_struct *workqueue; + unsigned short flags; + signed char priority; +}; + +/* + * RPC task flags + */ +#define RPC_TASK_ASYNC 0x0001 /* is an async task */ +#define RPC_TASK_SWAPPER 0x0002 /* is swapping in/out */ +#define RPC_CALL_MAJORSEEN 0x0020 /* major timeout seen */ +#define RPC_TASK_ROOTCREDS 0x0040 /* force root creds */ +#define RPC_TASK_DYNAMIC 0x0080 /* task was kmalloc'ed */ +#define RPC_TASK_KILLED 0x0100 /* task was killed */ +#define RPC_TASK_SOFT 0x0200 /* Use soft timeouts */ +#define RPC_TASK_SOFTCONN 0x0400 /* Fail if can't connect */ +#define RPC_TASK_SENT 0x0800 /* message was sent */ +#define RPC_TASK_TIMEOUT 0x1000 /* fail with ETIMEDOUT on timeout */ +#define RPC_TASK_NOCONNECT 0x2000 /* return ENOTCONN if not connected */ +#define RPC_TASK_NO_RETRANS_TIMEOUT 0x4000 /* wait forever for a reply */ + +#define RPC_IS_ASYNC(t) ((t)->tk_flags & RPC_TASK_ASYNC) +#define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER) +#define RPC_DO_ROOTOVERRIDE(t) 
((t)->tk_flags & RPC_TASK_ROOTCREDS) +#define RPC_ASSASSINATED(t) ((t)->tk_flags & RPC_TASK_KILLED) +#define RPC_IS_SOFT(t) ((t)->tk_flags & (RPC_TASK_SOFT|RPC_TASK_TIMEOUT)) +#define RPC_IS_SOFTCONN(t) ((t)->tk_flags & RPC_TASK_SOFTCONN) +#define RPC_WAS_SENT(t) ((t)->tk_flags & RPC_TASK_SENT) + +#define RPC_TASK_RUNNING 0 +#define RPC_TASK_QUEUED 1 +#define RPC_TASK_ACTIVE 2 +#define RPC_TASK_MSG_RECV 3 +#define RPC_TASK_MSG_RECV_WAIT 4 + +#define RPC_IS_RUNNING(t) test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) +#define rpc_set_running(t) set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) +#define rpc_test_and_set_running(t) \ + test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) +#define rpc_clear_running(t) \ + do { \ + smp_mb__before_atomic(); \ + clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \ + smp_mb__after_atomic(); \ + } while (0) + +#define RPC_IS_QUEUED(t) test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate) +#define rpc_set_queued(t) set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate) +#define rpc_clear_queued(t) \ + do { \ + smp_mb__before_atomic(); \ + clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \ + smp_mb__after_atomic(); \ + } while (0) + +#define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) + +/* + * Task priorities. + * Note: if you change these, you must also change + * the task initialization definitions below. + */ +#define RPC_PRIORITY_LOW (-1) +#define RPC_PRIORITY_NORMAL (0) +#define RPC_PRIORITY_HIGH (1) +#define RPC_PRIORITY_PRIVILEGED (2) +#define RPC_NR_PRIORITY (1 + RPC_PRIORITY_PRIVILEGED - RPC_PRIORITY_LOW) + +struct rpc_timer { + struct timer_list timer; + struct list_head list; + unsigned long expires; +}; + +/* + * RPC synchronization objects + */ +struct rpc_wait_queue { + spinlock_t lock; + struct list_head tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */ + pid_t owner; /* process id of last task serviced */ + unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */ + unsigned char priority; /* current priority */ + unsigned char nr; /* # tasks remaining for cookie */ + unsigned short qlen; /* total # tasks waiting in queue */ + struct rpc_timer timer_list; +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) + const char * name; +#endif +}; + +/* + * This is the # requests to send consecutively + * from a single cookie. The aim is to improve + * performance of NFS operations such as read/write. 
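+ * Concretely, a priority wait queue serves up to RPC_BATCH_COUNT tasks
+ * queued by the same owner (the "owner" and "nr" fields of the
+ * rpc_wait_queue above) before it rotates to a different owner.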
+ */ +#define RPC_BATCH_COUNT 16 +#define RPC_IS_PRIORITY(q) ((q)->maxpriority > 0) + +/* + * Function prototypes + */ +struct rpc_task *rpc_new_task(const struct rpc_task_setup *); +struct rpc_task *rpc_run_task(const struct rpc_task_setup *); +struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); +void rpc_put_task(struct rpc_task *); +void rpc_put_task_async(struct rpc_task *); +void rpc_exit_task(struct rpc_task *); +void rpc_exit(struct rpc_task *, int); +void rpc_release_calldata(const struct rpc_call_ops *, void *); +void rpc_killall_tasks(struct rpc_clnt *); +void rpc_execute(struct rpc_task *); +void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); +void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); +void rpc_destroy_wait_queue(struct rpc_wait_queue *); +void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, + rpc_action action); +void rpc_sleep_on_priority(struct rpc_wait_queue *, + struct rpc_task *, + rpc_action action, + int priority); +void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, + struct rpc_task *task); +void rpc_wake_up_queued_task(struct rpc_wait_queue *, + struct rpc_task *); +void rpc_wake_up(struct rpc_wait_queue *); +struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); +struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *, + bool (*)(struct rpc_task *, void *), + void *); +struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, + bool (*)(struct rpc_task *, void *), + void *); +void rpc_wake_up_status(struct rpc_wait_queue *, int); +void rpc_delay(struct rpc_task *, unsigned long); +int rpc_malloc(struct rpc_task *); +void rpc_free(struct rpc_task *); +int rpciod_up(void); +void rpciod_down(void); +int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *); +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +struct net; +void rpc_show_tasks(struct net *); +#endif +int rpc_init_mempool(void); +void rpc_destroy_mempool(void); +extern struct workqueue_struct *rpciod_workqueue; +extern struct workqueue_struct *xprtiod_workqueue; +void rpc_prepare_task(struct rpc_task *task); + +static inline int rpc_wait_for_completion_task(struct rpc_task *task) +{ + return __rpc_wait_for_completion_task(task, NULL); +} + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) +static inline const char * rpc_qname(const struct rpc_wait_queue *q) +{ + return ((q && q->name) ? 
q->name : "unknown"); +} + +static inline void rpc_assign_waitqueue_name(struct rpc_wait_queue *q, + const char *name) +{ + q->name = name; +} +#else +static inline void rpc_assign_waitqueue_name(struct rpc_wait_queue *q, + const char *name) +{ +} +#endif + +#if IS_ENABLED(CONFIG_SUNRPC_SWAP) +int rpc_clnt_swap_activate(struct rpc_clnt *clnt); +void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt); +#else +static inline int +rpc_clnt_swap_activate(struct rpc_clnt *clnt) +{ + return -EINVAL; +} + +static inline void +rpc_clnt_swap_deactivate(struct rpc_clnt *clnt) +{ +} +#endif /* CONFIG_SUNRPC_SWAP */ + +#endif /* _LINUX_SUNRPC_SCHED_H_ */ diff --git a/include/linux/sunrpc/stats.h b/include/linux/sunrpc/stats.h new file mode 100644 index 0000000..84b92b4 --- /dev/null +++ b/include/linux/sunrpc/stats.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/stats.h + * + * Client statistics collection for SUN RPC + * + * Copyright (C) 1996 Olaf Kirch + */ + +#ifndef _LINUX_SUNRPC_STATS_H +#define _LINUX_SUNRPC_STATS_H + +#include + +struct rpc_stat { + const struct rpc_program *program; + + unsigned int netcnt, + netudpcnt, + nettcpcnt, + nettcpconn, + netreconn; + unsigned int rpccnt, + rpcretrans, + rpcauthrefresh, + rpcgarbage; +}; + +struct svc_stat { + struct svc_program * program; + + unsigned int netcnt, + netudpcnt, + nettcpcnt, + nettcpconn; + unsigned int rpccnt, + rpcbadfmt, + rpcbadauth, + rpcbadclnt; +}; + +struct net; +#ifdef CONFIG_PROC_FS +int rpc_proc_init(struct net *); +void rpc_proc_exit(struct net *); +#else +static inline int rpc_proc_init(struct net *net) +{ + return 0; +} + +static inline void rpc_proc_exit(struct net *net) +{ +} +#endif + +#ifdef MODULE +void rpc_modcount(struct inode *, int); +#endif + +#ifdef CONFIG_PROC_FS +struct proc_dir_entry * rpc_proc_register(struct net *,struct rpc_stat *); +void rpc_proc_unregister(struct net *,const char *); +void rpc_proc_zero(const struct rpc_program *); +struct proc_dir_entry * svc_proc_register(struct net *, struct svc_stat *, + const struct file_operations *); +void svc_proc_unregister(struct net *, const char *); + +void svc_seq_show(struct seq_file *, + const struct svc_stat *); +#else + +static inline struct proc_dir_entry *rpc_proc_register(struct net *net, struct rpc_stat *s) { return NULL; } +static inline void rpc_proc_unregister(struct net *net, const char *p) {} +static inline void rpc_proc_zero(const struct rpc_program *p) {} + +static inline struct proc_dir_entry *svc_proc_register(struct net *net, struct svc_stat *s, + const struct file_operations *f) { return NULL; } +static inline void svc_proc_unregister(struct net *net, const char *p) {} + +static inline void svc_seq_show(struct seq_file *seq, + const struct svc_stat *st) {} +#endif + +#endif /* _LINUX_SUNRPC_STATS_H */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h new file mode 100644 index 0000000..574368e --- /dev/null +++ b/include/linux/sunrpc/svc.h @@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/svc.h + * + * RPC server declarations. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + + +#ifndef SUNRPC_SVC_H +#define SUNRPC_SVC_H + +#include +#include +#include +#include +#include +#include +#include +#include + +/* statistics for svc_pool structures */ +struct svc_pool_stats { + atomic_long_t packets; + unsigned long sockets_queued; + atomic_long_t threads_woken; + atomic_long_t threads_timedout; +}; + +/* + * + * RPC service thread pool. 
+ * + * Pool of threads and temporary sockets. Generally there is only + * a single one of these per RPC service, but on NUMA machines those + * services that can benefit from it (i.e. nfs but not lockd) will + * have one pool per NUMA node. This optimisation reduces cross- + * node traffic on multi-node NUMA NFS servers. + */ +struct svc_pool { + unsigned int sp_id; /* pool id; also node id on NUMA */ + spinlock_t sp_lock; /* protects all fields */ + struct list_head sp_sockets; /* pending sockets */ + unsigned int sp_nrthreads; /* # of threads in pool */ + struct list_head sp_all_threads; /* all server threads */ + struct svc_pool_stats sp_stats; /* statistics on pool operation */ +#define SP_TASK_PENDING (0) /* still work to do even if no + * xprt is queued. */ +#define SP_CONGESTED (1) + unsigned long sp_flags; +} ____cacheline_aligned_in_smp; + +struct svc_serv; + +struct svc_serv_ops { + /* Callback to use when last thread exits. */ + void (*svo_shutdown)(struct svc_serv *, struct net *); + + /* function for service threads to run */ + int (*svo_function)(void *); + + /* queue up a transport for servicing */ + void (*svo_enqueue_xprt)(struct svc_xprt *); + + /* set up thread (or whatever) execution context */ + int (*svo_setup)(struct svc_serv *, struct svc_pool *, int); + + /* optional module to count when adding threads (pooled svcs only) */ + struct module *svo_module; +}; + +/* + * RPC service. + * + * An RPC service is a ``daemon,'' possibly multithreaded, which + * receives and processes incoming RPC messages. + * It has one or more transport sockets associated with it, and maintains + * a list of idle threads waiting for input. + * + * We currently do not support more than one RPC program per daemon. + */ +struct svc_serv { + struct svc_program * sv_program; /* RPC program */ + struct svc_stat * sv_stats; /* RPC statistics */ + spinlock_t sv_lock; + unsigned int sv_nrthreads; /* # of server threads */ + unsigned int sv_maxconn; /* max connections allowed or + * '0' causing max to be based + * on number of threads. */ + + unsigned int sv_max_payload; /* datagram payload size */ + unsigned int sv_max_mesg; /* max_payload + 1 page for overheads */ + unsigned int sv_xdrsize; /* XDR buffer size */ + struct list_head sv_permsocks; /* all permanent sockets */ + struct list_head sv_tempsocks; /* all temporary sockets */ + int sv_tmpcnt; /* count of temporary sockets */ + struct timer_list sv_temptimer; /* timer for aging temporary sockets */ + + char * sv_name; /* service name */ + + unsigned int sv_nrpools; /* number of thread pools */ + struct svc_pool * sv_pools; /* array of thread pools */ + const struct svc_serv_ops *sv_ops; /* server operations */ +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + struct list_head sv_cb_list; /* queue for callback requests + * that arrive over the same + * connection */ + spinlock_t sv_cb_lock; /* protects the svc_cb_list */ + wait_queue_head_t sv_cb_waitq; /* sleep here if there are no + * entries in the svc_cb_list */ + struct svc_xprt *sv_bc_xprt; /* callback on fore channel */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ +}; + +/* + * We use sv_nrthreads as a reference count. svc_destroy() drops + * this refcount, so we need to bump it up around operations that + * change the number of threads. Horrible, but there it is. + * Should be called with the "service mutex" held. + */ +static inline void svc_get(struct svc_serv *serv) +{ + serv->sv_nrthreads++; +} + +/* + * Maximum payload size supported by a kernel RPC server. 
+ * This is use to determine the max number of pages nfsd is + * willing to return in a single READ operation. + * + * These happen to all be powers of 2, which is not strictly + * necessary but helps enforce the real limitation, which is + * that they should be multiples of PAGE_SIZE. + * + * For UDP transports, a block plus NFS,RPC, and UDP headers + * has to fit into the IP datagram limit of 64K. The largest + * feasible number for all known page sizes is probably 48K, + * but we choose 32K here. This is the same as the historical + * Linux limit; someone who cares more about NFS/UDP performance + * can test a larger number. + * + * For TCP transports we have more freedom. A size of 1MB is + * chosen to match the client limit. Other OSes are known to + * have larger limits, but those numbers are probably beyond + * the point of diminishing returns. + */ +#define RPCSVC_MAXPAYLOAD (1*1024*1024u) +#define RPCSVC_MAXPAYLOAD_TCP RPCSVC_MAXPAYLOAD +#define RPCSVC_MAXPAYLOAD_UDP (32*1024u) + +extern u32 svc_max_payload(const struct svc_rqst *rqstp); + +/* + * RPC Requsts and replies are stored in one or more pages. + * We maintain an array of pages for each server thread. + * Requests are copied into these pages as they arrive. Remaining + * pages are available to write the reply into. + * + * Pages are sent using ->sendpage so each server thread needs to + * allocate more to replace those used in sending. To help keep track + * of these pages we have a receive list where all pages initialy live, + * and a send list where pages are moved to when there are to be part + * of a reply. + * + * We use xdr_buf for holding responses as it fits well with NFS + * read responses (that have a header, and some data pages, and possibly + * a tail) and means we can share some client side routines. + * + * The xdr_buf.head kvec always points to the first page in the rq_*pages + * list. The xdr_buf.pages pointer points to the second page on that + * list. xdr_buf.tail points to the end of the first page. + * This assumes that the non-page part of an rpc reply will fit + * in a page - NFSd ensures this. lockd also has no trouble. + * + * Each request/reply pair can have at most one "payload", plus two pages, + * one for the request, and one for the reply. + * We using ->sendfile to return read data, we might need one extra page + * if the request is not page-aligned. So add another '1'. + */ +#define RPCSVC_MAXPAGES ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE \ + + 2 + 1) + +static inline u32 svc_getnl(struct kvec *iov) +{ + __be32 val, *vp; + vp = iov->iov_base; + val = *vp++; + iov->iov_base = (void*)vp; + iov->iov_len -= sizeof(__be32); + return ntohl(val); +} + +static inline void svc_putnl(struct kvec *iov, u32 val) +{ + __be32 *vp = iov->iov_base + iov->iov_len; + *vp = htonl(val); + iov->iov_len += sizeof(__be32); +} + +static inline __be32 svc_getu32(struct kvec *iov) +{ + __be32 val, *vp; + vp = iov->iov_base; + val = *vp++; + iov->iov_base = (void*)vp; + iov->iov_len -= sizeof(__be32); + return val; +} + +static inline void svc_ungetu32(struct kvec *iov) +{ + __be32 *vp = (__be32 *)iov->iov_base; + iov->iov_base = (void *)(vp - 1); + iov->iov_len += sizeof(*vp); +} + +static inline void svc_putu32(struct kvec *iov, __be32 val) +{ + __be32 *vp = iov->iov_base + iov->iov_len; + *vp = val; + iov->iov_len += sizeof(__be32); +} + +/* + * The context of a single thread, including the request currently being + * processed. 
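+ *
+ * One svc_rqst is set up per server thread (see svc_rqst_alloc() and
+ * svc_prepare_thread() below) and is reused for each request that the
+ * thread processes.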
+ */ +struct svc_rqst { + struct list_head rq_all; /* all threads list */ + struct rcu_head rq_rcu_head; /* for RCU deferred kfree */ + struct svc_xprt * rq_xprt; /* transport ptr */ + + struct sockaddr_storage rq_addr; /* peer address */ + size_t rq_addrlen; + struct sockaddr_storage rq_daddr; /* dest addr of request + * - reply from here */ + size_t rq_daddrlen; + + struct svc_serv * rq_server; /* RPC service definition */ + struct svc_pool * rq_pool; /* thread pool */ + const struct svc_procedure *rq_procinfo;/* procedure info */ + struct auth_ops * rq_authop; /* authentication flavour */ + struct svc_cred rq_cred; /* auth info */ + void * rq_xprt_ctxt; /* transport specific context ptr */ + struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ + + size_t rq_xprt_hlen; /* xprt header len */ + struct xdr_buf rq_arg; + struct xdr_buf rq_res; + struct page *rq_pages[RPCSVC_MAXPAGES + 1]; + struct page * *rq_respages; /* points into rq_pages */ + struct page * *rq_next_page; /* next reply page to use */ + struct page * *rq_page_end; /* one past the last page */ + + struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ + + __be32 rq_xid; /* transmission id */ + u32 rq_prog; /* program number */ + u32 rq_vers; /* program version */ + u32 rq_proc; /* procedure number */ + u32 rq_prot; /* IP protocol */ + int rq_cachetype; /* catering to nfsd */ +#define RQ_SECURE (0) /* secure port */ +#define RQ_LOCAL (1) /* local request */ +#define RQ_USEDEFERRAL (2) /* use deferral */ +#define RQ_DROPME (3) /* drop current reply */ +#define RQ_SPLICE_OK (4) /* turned off in gss privacy + * to prevent encrypting page + * cache pages */ +#define RQ_VICTIM (5) /* about to be shut down */ +#define RQ_BUSY (6) /* request is busy */ +#define RQ_DATA (7) /* request has data */ + unsigned long rq_flags; /* flags field */ + ktime_t rq_qtime; /* enqueue time */ + + void * rq_argp; /* decoded arguments */ + void * rq_resp; /* xdr'd results */ + void * rq_auth_data; /* flavor-specific data */ + int rq_auth_slack; /* extra space xdr code + * should leave in head + * for krb5i, krb5p. 
+ */ + int rq_reserved; /* space on socket outq + * reserved for this request + */ + ktime_t rq_stime; /* start time */ + + struct cache_req rq_chandle; /* handle passed to caches for + * request delaying + */ + /* Catering to nfsd */ + struct auth_domain * rq_client; /* RPC peer info */ + struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ + struct svc_cacherep * rq_cacherep; /* cache info */ + struct task_struct *rq_task; /* service thread */ + spinlock_t rq_lock; /* per-request lock */ +}; + +#define SVC_NET(svc_rqst) (svc_rqst->rq_xprt->xpt_net) + +/* + * Rigorous type checking on sockaddr type conversions + */ +static inline struct sockaddr_in *svc_addr_in(const struct svc_rqst *rqst) +{ + return (struct sockaddr_in *) &rqst->rq_addr; +} + +static inline struct sockaddr_in6 *svc_addr_in6(const struct svc_rqst *rqst) +{ + return (struct sockaddr_in6 *) &rqst->rq_addr; +} + +static inline struct sockaddr *svc_addr(const struct svc_rqst *rqst) +{ + return (struct sockaddr *) &rqst->rq_addr; +} + +static inline struct sockaddr_in *svc_daddr_in(const struct svc_rqst *rqst) +{ + return (struct sockaddr_in *) &rqst->rq_daddr; +} + +static inline struct sockaddr_in6 *svc_daddr_in6(const struct svc_rqst *rqst) +{ + return (struct sockaddr_in6 *) &rqst->rq_daddr; +} + +static inline struct sockaddr *svc_daddr(const struct svc_rqst *rqst) +{ + return (struct sockaddr *) &rqst->rq_daddr; +} + +/* + * Check buffer bounds after decoding arguments + */ +static inline int +xdr_argsize_check(struct svc_rqst *rqstp, __be32 *p) +{ + char *cp = (char *)p; + struct kvec *vec = &rqstp->rq_arg.head[0]; + return cp >= (char*)vec->iov_base + && cp <= (char*)vec->iov_base + vec->iov_len; +} + +static inline int +xdr_ressize_check(struct svc_rqst *rqstp, __be32 *p) +{ + struct kvec *vec = &rqstp->rq_res.head[0]; + char *cp = (char*)p; + + vec->iov_len = cp - (char*)vec->iov_base; + + return vec->iov_len <= PAGE_SIZE; +} + +static inline void svc_free_res_pages(struct svc_rqst *rqstp) +{ + while (rqstp->rq_next_page != rqstp->rq_respages) { + struct page **pp = --rqstp->rq_next_page; + if (*pp) { + put_page(*pp); + *pp = NULL; + } + } +} + +struct svc_deferred_req { + u32 prot; /* protocol (UDP or TCP) */ + struct svc_xprt *xprt; + struct sockaddr_storage addr; /* where reply must go */ + size_t addrlen; + struct sockaddr_storage daddr; /* where reply must come from */ + size_t daddrlen; + struct cache_deferred_req handle; + size_t xprt_hlen; + int argslen; + __be32 args[0]; +}; + +/* + * List of RPC programs on the same transport endpoint + */ +struct svc_program { + struct svc_program * pg_next; /* other programs (same xprt) */ + u32 pg_prog; /* program number */ + unsigned int pg_lovers; /* lowest version */ + unsigned int pg_hivers; /* highest version */ + unsigned int pg_nvers; /* number of versions */ + const struct svc_version **pg_vers; /* version array */ + char * pg_name; /* service name */ + char * pg_class; /* class name: services sharing authentication */ + struct svc_stat * pg_stats; /* rpc statistics */ + int (*pg_authenticate)(struct svc_rqst *); +}; + +/* + * RPC program version + */ +struct svc_version { + u32 vs_vers; /* version number */ + u32 vs_nproc; /* number of procedures */ + const struct svc_procedure *vs_proc; /* per-procedure info */ + unsigned int *vs_count; /* call counts */ + u32 vs_xdrsize; /* xdrsize needed for this version */ + + /* Don't register with rpcbind */ + bool vs_hidden; + + /* Don't care if the rpcbind registration fails */ + bool vs_rpcb_optnl; + + /* 
Need xprt with congestion control */ + bool vs_need_cong_ctrl; + + /* Override dispatch function (e.g. when caching replies). + * A return value of 0 means drop the request. + * vs_dispatch == NULL means use default dispatcher. + */ + int (*vs_dispatch)(struct svc_rqst *, __be32 *); +}; + +/* + * RPC procedure info + */ +struct svc_procedure { + /* process the request: */ + __be32 (*pc_func)(struct svc_rqst *); + /* XDR decode args: */ + int (*pc_decode)(struct svc_rqst *, __be32 *data); + /* XDR encode result: */ + int (*pc_encode)(struct svc_rqst *, __be32 *data); + /* XDR free result: */ + void (*pc_release)(struct svc_rqst *); + unsigned int pc_argsize; /* argument struct size */ + unsigned int pc_ressize; /* result struct size */ + unsigned int pc_cachetype; /* cache info (NFS) */ + unsigned int pc_xdrressize; /* maximum size of XDR reply */ +}; + +/* + * Mode for mapping cpus to pools. + */ +enum { + SVC_POOL_AUTO = -1, /* choose one of the others */ + SVC_POOL_GLOBAL, /* no mapping, just a single global pool + * (legacy & UP mode) */ + SVC_POOL_PERCPU, /* one pool per cpu */ + SVC_POOL_PERNODE /* one pool per numa node */ +}; + +struct svc_pool_map { + int count; /* How many svc_servs use us */ + int mode; /* Note: int not enum to avoid + * warnings about "enumeration value + * not handled in switch" */ + unsigned int npools; + unsigned int *pool_to; /* maps pool id to cpu or node */ + unsigned int *to_pool; /* maps cpu or node to pool id */ +}; + +extern struct svc_pool_map svc_pool_map; + +/* + * Function prototypes. + */ +int svc_rpcb_setup(struct svc_serv *serv, struct net *net); +void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net); +int svc_bind(struct svc_serv *serv, struct net *net); +struct svc_serv *svc_create(struct svc_program *, unsigned int, + const struct svc_serv_ops *); +struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, + struct svc_pool *pool, int node); +struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, + struct svc_pool *pool, int node); +void svc_rqst_free(struct svc_rqst *); +void svc_exit_thread(struct svc_rqst *); +unsigned int svc_pool_map_get(void); +void svc_pool_map_put(void); +struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, + const struct svc_serv_ops *); +int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); +int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); +int svc_pool_stats_open(struct svc_serv *serv, struct file *file); +void svc_destroy(struct svc_serv *); +void svc_shutdown_net(struct svc_serv *, struct net *); +int svc_process(struct svc_rqst *); +int bc_svc_process(struct svc_serv *, struct rpc_rqst *, + struct svc_rqst *); +int svc_register(const struct svc_serv *, struct net *, const int, + const unsigned short, const unsigned short); + +void svc_wake_up(struct svc_serv *); +void svc_reserve(struct svc_rqst *rqstp, int space); +struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); +char * svc_print_addr(struct svc_rqst *, char *, size_t); +unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, + struct kvec *first, size_t total); +char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, + struct kvec *first, size_t total); + +#define RPC_MAX_ADDRBUFLEN (63U) + +/* + * When we want to reduce the size of the reserved space in the response + * buffer, we need to take into account the size of any checksum data that + * may be at the end of the packet. 
This is difficult to determine exactly + * for all cases without actually generating the checksum, so we just use a + * static value. + */ +static inline void svc_reserve_auth(struct svc_rqst *rqstp, int space) +{ + svc_reserve(rqstp, space + rqstp->rq_auth_slack); +} + +#endif /* SUNRPC_SVC_H */ diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h new file mode 100644 index 0000000..7337e12 --- /dev/null +++ b/include/linux/sunrpc/svc_rdma.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#ifndef SVC_RDMA_H +#define SVC_RDMA_H +#include +#include +#include +#include +#include +#define SVCRDMA_DEBUG + +/* Default and maximum inline threshold sizes */ +enum { + RPCRDMA_DEF_INLINE_THRESH = 4096, + RPCRDMA_MAX_INLINE_THRESH = 65536 +}; + +/* RPC/RDMA parameters and stats */ +extern unsigned int svcrdma_ord; +extern unsigned int svcrdma_max_requests; +extern unsigned int svcrdma_max_bc_requests; +extern unsigned int svcrdma_max_req_size; + +extern atomic_t rdma_stat_recv; +extern atomic_t rdma_stat_read; +extern atomic_t rdma_stat_write; +extern atomic_t rdma_stat_sq_starve; +extern atomic_t rdma_stat_rq_starve; +extern atomic_t rdma_stat_rq_poll; +extern atomic_t rdma_stat_rq_prod; +extern atomic_t rdma_stat_sq_poll; +extern atomic_t rdma_stat_sq_prod; + +/* + * Contexts are built when an RDMA request is created and are a + * record of the resources that can be recovered when the request + * completes. 
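As a rough orientation before the structure definitions that follow, the context lifecycle described here can be sketched as below. This is illustrative only and not part of the patch: it leans on the svc_rdma_get_context(), svc_rdma_count_mappings(), svc_rdma_unmap_dma() and svc_rdma_put_context() helpers declared further down in this header, omits error handling and the real work-request and DMA-mapping code, and assumes the second argument of svc_rdma_put_context() asks for the context's pages to be released as well.

#include <linux/sunrpc/svc_rdma.h>

/* Hypothetical helper sketching how a context brackets one I/O. */
static void example_ctxt_lifecycle(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_op_ctxt *ctxt;

	/* Take a free context off the transport's sc_ctxts list. */
	ctxt = svc_rdma_get_context(rdma);

	/*
	 * Real code would now DMA-map pages into ctxt->sge[] and post a
	 * work request; each successful mapping is recorded so it can be
	 * undone later:
	 */
	svc_rdma_count_mappings(rdma, ctxt);

	/* When the completion fires, recover everything the context recorded. */
	svc_rdma_unmap_dma(ctxt);
	svc_rdma_put_context(ctxt, 1);	/* assumed: 1 also frees the pages */
}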
+ */ +struct svc_rdma_op_ctxt { + struct list_head list; + struct xdr_buf arg; + struct ib_cqe cqe; + u32 byte_len; + struct svcxprt_rdma *xprt; + enum dma_data_direction direction; + int count; + unsigned int mapped_sges; + int hdr_count; + struct ib_send_wr send_wr; + struct ib_sge sge[1 + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE]; + struct page *pages[RPCSVC_MAXPAGES]; +}; + +struct svcxprt_rdma { + struct svc_xprt sc_xprt; /* SVC transport structure */ + struct rdma_cm_id *sc_cm_id; /* RDMA connection id */ + struct list_head sc_accept_q; /* Conn. waiting accept */ + int sc_ord; /* RDMA read limit */ + int sc_max_sge; + bool sc_snd_w_inv; /* OK to use Send With Invalidate */ + + atomic_t sc_sq_avail; /* SQEs ready to be consumed */ + unsigned int sc_sq_depth; /* Depth of SQ */ + unsigned int sc_rq_depth; /* Depth of RQ */ + __be32 sc_fc_credits; /* Forward credits */ + u32 sc_max_requests; /* Max requests */ + u32 sc_max_bc_requests;/* Backward credits */ + int sc_max_req_size; /* Size of each RQ WR buf */ + u8 sc_port_num; + + struct ib_pd *sc_pd; + + spinlock_t sc_ctxt_lock; + struct list_head sc_ctxts; + int sc_ctxt_used; + spinlock_t sc_rw_ctxt_lock; + struct list_head sc_rw_ctxts; + + struct list_head sc_rq_dto_q; + spinlock_t sc_rq_dto_lock; + struct ib_qp *sc_qp; + struct ib_cq *sc_rq_cq; + struct ib_cq *sc_sq_cq; + + spinlock_t sc_lock; /* transport lock */ + + wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */ + unsigned long sc_flags; + struct list_head sc_read_complete_q; + struct work_struct sc_work; +}; +/* sc_flags */ +#define RDMAXPRT_CONN_PENDING 3 + +#define RPCRDMA_LISTEN_BACKLOG 10 +#define RPCRDMA_MAX_REQUESTS 32 + +/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our + * current NFSv4.1 implementation supports one backchannel slot. 
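Looking back at svc.h, the svc_procedure, svc_version and svc_program structures declared above are normally tied together as static tables and handed to svc_create(). The sketch below is illustrative only: the "frobd" service, its program number, version number and sizes are invented for this example and do not appear in the patch.

#include <linux/sunrpc/svc.h>

static __be32 frobd_proc_null(struct svc_rqst *rqstp)
{
	return rpc_success;			/* trivial NULL procedure */
}

static unsigned int frobd_count1[1];		/* per-procedure call counts */

static const struct svc_procedure frobd_procedures1[] = {
	[0] = {
		.pc_func	= frobd_proc_null,
		.pc_argsize	= 0,
		.pc_ressize	= 0,
		.pc_xdrressize	= 1,
	},
};

static const struct svc_version frobd_version1 = {
	.vs_vers	= 1,
	.vs_nproc	= ARRAY_SIZE(frobd_procedures1),
	.vs_proc	= frobd_procedures1,
	.vs_count	= frobd_count1,
	.vs_xdrsize	= 1024,			/* assumed XDR scratch size */
};

static const struct svc_version *frobd_versions[] = {
	[1] = &frobd_version1,			/* indexed by version number */
};

static struct svc_program frobd_program = {
	.pg_prog	= 0x20000099,		/* hypothetical program number */
	.pg_nvers	= ARRAY_SIZE(frobd_versions),
	.pg_vers	= frobd_versions,
	.pg_name	= "frobd",
	.pg_class	= "frobd",
};

/*
 * A server would then pass &frobd_program to svc_create() (or
 * svc_create_pooled()) together with a buffer size and a
 * struct svc_serv_ops instance.
 */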
+ */ +#define RPCRDMA_MAX_BC_REQUESTS 2 + +#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD + +/* Track DMA maps for this transport and context */ +static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt) +{ + ctxt->mapped_sges++; +} + +/* svc_rdma_backchannel.c */ +extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, + __be32 *rdma_resp, + struct xdr_buf *rcvbuf); + +/* svc_rdma_recvfrom.c */ +extern int svc_rdma_recvfrom(struct svc_rqst *); + +/* svc_rdma_rw.c */ +extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); +extern int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, __be32 *p); +extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + __be32 *wr_ch, struct xdr_buf *xdr); +extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, + __be32 *rp_ch, bool writelist, + struct xdr_buf *xdr); + +/* svc_rdma_sendto.c */ +extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + __be32 *rdma_resp, unsigned int len); +extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + int num_sge, u32 inv_rkey); +extern int svc_rdma_sendto(struct svc_rqst *); + +/* svc_rdma_transport.c */ +extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); +extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *); +extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *); +extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *); +extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); +extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); +extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); +extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); +extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); +extern void svc_sq_reap(struct svcxprt_rdma *); +extern void svc_rq_reap(struct svcxprt_rdma *); +extern void svc_rdma_prep_reply_hdr(struct svc_rqst *); + +extern struct svc_xprt_class svc_rdma_class; +#ifdef CONFIG_SUNRPC_BACKCHANNEL +extern struct svc_xprt_class svc_rdma_bc_class; +#endif + +/* svc_rdma.c */ +extern struct workqueue_struct *svc_rdma_wq; +extern int svc_rdma_init(void); +extern void svc_rdma_cleanup(void); + +#endif diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h new file mode 100644 index 0000000..c3d7206 --- /dev/null +++ b/include/linux/sunrpc/svc_xprt.h @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/svc_xprt.h + * + * RPC server transport I/O + */ + +#ifndef SUNRPC_SVC_XPRT_H +#define SUNRPC_SVC_XPRT_H + +#include + +struct module; + +struct svc_xprt_ops { + struct svc_xprt *(*xpo_create)(struct svc_serv *, + struct net *net, + struct sockaddr *, int, + int); + struct svc_xprt *(*xpo_accept)(struct svc_xprt *); + int (*xpo_has_wspace)(struct svc_xprt *); + int (*xpo_recvfrom)(struct svc_rqst *); + void (*xpo_prep_reply_hdr)(struct svc_rqst *); + int (*xpo_sendto)(struct svc_rqst *); + void (*xpo_release_rqst)(struct svc_rqst *); + void (*xpo_detach)(struct svc_xprt *); + void (*xpo_free)(struct svc_xprt *); + void (*xpo_secure_port)(struct svc_rqst *rqstp); + void (*xpo_kill_temp_xprt)(struct svc_xprt *); +}; + +struct svc_xprt_class { + const char *xcl_name; + struct module *xcl_owner; + const struct svc_xprt_ops *xcl_ops; + struct list_head xcl_list; + u32 xcl_max_payload; + int xcl_ident; +}; + +/* + * 
This is embedded in an object that wants a callback before deleting + * an xprt; intended for use by NFSv4.1, which needs to know when a + * client's tcp connection (and hence possibly a backchannel) goes away. + */ +struct svc_xpt_user { + struct list_head list; + void (*callback)(struct svc_xpt_user *); +}; + +struct svc_xprt { + struct svc_xprt_class *xpt_class; + const struct svc_xprt_ops *xpt_ops; + struct kref xpt_ref; + struct list_head xpt_list; + struct list_head xpt_ready; + unsigned long xpt_flags; +#define XPT_BUSY 0 /* enqueued/receiving */ +#define XPT_CONN 1 /* conn pending */ +#define XPT_CLOSE 2 /* dead or dying */ +#define XPT_DATA 3 /* data pending */ +#define XPT_TEMP 4 /* connected transport */ +#define XPT_DEAD 6 /* transport closed */ +#define XPT_CHNGBUF 7 /* need to change snd/rcv buf sizes */ +#define XPT_DEFERRED 8 /* deferred request pending */ +#define XPT_OLD 9 /* used for xprt aging mark+sweep */ +#define XPT_LISTENER 10 /* listening endpoint */ +#define XPT_CACHE_AUTH 11 /* cache auth info */ +#define XPT_LOCAL 12 /* connection from loopback interface */ +#define XPT_KILL_TEMP 13 /* call xpo_kill_temp_xprt before closing */ +#define XPT_CONG_CTRL 14 /* has congestion control */ + + struct svc_serv *xpt_server; /* service for transport */ + atomic_t xpt_reserved; /* space on outq that is rsvd */ + atomic_t xpt_nr_rqsts; /* Number of requests */ + struct mutex xpt_mutex; /* to serialize sending data */ + spinlock_t xpt_lock; /* protects sk_deferred + * and xpt_auth_cache */ + void *xpt_auth_cache;/* auth cache */ + struct list_head xpt_deferred; /* deferred requests that need + * to be revisted */ + struct sockaddr_storage xpt_local; /* local address */ + size_t xpt_locallen; /* length of address */ + struct sockaddr_storage xpt_remote; /* remote peer's address */ + size_t xpt_remotelen; /* length of address */ + char xpt_remotebuf[INET6_ADDRSTRLEN + 10]; + struct rpc_wait_queue xpt_bc_pending; /* backchannel wait queue */ + struct list_head xpt_users; /* callbacks on free */ + + struct net *xpt_net; + struct rpc_xprt *xpt_bc_xprt; /* NFSv4.1 backchannel */ + struct rpc_xprt_switch *xpt_bc_xps; /* NFSv4.1 backchannel */ +}; + +static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) +{ + spin_lock(&xpt->xpt_lock); + list_del_init(&u->list); + spin_unlock(&xpt->xpt_lock); +} + +static inline int register_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) +{ + spin_lock(&xpt->xpt_lock); + if (test_bit(XPT_CLOSE, &xpt->xpt_flags)) { + /* + * The connection is about to be deleted soon (or, + * worse, may already be deleted--in which case we've + * already notified the xpt_users). 
+ */ + spin_unlock(&xpt->xpt_lock); + return -ENOTCONN; + } + list_add(&u->list, &xpt->xpt_users); + spin_unlock(&xpt->xpt_lock); + return 0; +} + +int svc_reg_xprt_class(struct svc_xprt_class *); +void svc_unreg_xprt_class(struct svc_xprt_class *); +void svc_xprt_init(struct net *, struct svc_xprt_class *, struct svc_xprt *, + struct svc_serv *); +int svc_create_xprt(struct svc_serv *, const char *, struct net *, + const int, const unsigned short, int); +void svc_xprt_do_enqueue(struct svc_xprt *xprt); +void svc_xprt_enqueue(struct svc_xprt *xprt); +void svc_xprt_put(struct svc_xprt *xprt); +void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt); +void svc_close_xprt(struct svc_xprt *xprt); +int svc_port_is_privileged(struct sockaddr *sin); +int svc_print_xprts(char *buf, int maxlen); +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, + struct net *net, const sa_family_t af, + const unsigned short port); +int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen); +void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *xprt); +void svc_age_temp_xprts_now(struct svc_serv *, struct sockaddr *); + +static inline void svc_xprt_get(struct svc_xprt *xprt) +{ + kref_get(&xprt->xpt_ref); +} +static inline void svc_xprt_set_local(struct svc_xprt *xprt, + const struct sockaddr *sa, + const size_t salen) +{ + memcpy(&xprt->xpt_local, sa, salen); + xprt->xpt_locallen = salen; +} +static inline void svc_xprt_set_remote(struct svc_xprt *xprt, + const struct sockaddr *sa, + const size_t salen) +{ + memcpy(&xprt->xpt_remote, sa, salen); + xprt->xpt_remotelen = salen; + snprintf(xprt->xpt_remotebuf, sizeof(xprt->xpt_remotebuf) - 1, + "%pISpc", sa); +} + +static inline unsigned short svc_addr_port(const struct sockaddr *sa) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa; + + switch (sa->sa_family) { + case AF_INET: + return ntohs(sin->sin_port); + case AF_INET6: + return ntohs(sin6->sin6_port); + } + + return 0; +} + +static inline size_t svc_addr_len(const struct sockaddr *sa) +{ + switch (sa->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + case AF_INET6: + return sizeof(struct sockaddr_in6); + } + BUG(); +} + +static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) +{ + return svc_addr_port((const struct sockaddr *)&xprt->xpt_local); +} + +static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt) +{ + return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote); +} + +static inline char *__svc_print_addr(const struct sockaddr *addr, + char *buf, const size_t len) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)addr; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr; + + switch (addr->sa_family) { + case AF_INET: + snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr, + ntohs(sin->sin_port)); + break; + + case AF_INET6: + snprintf(buf, len, "%pI6, port=%u", + &sin6->sin6_addr, + ntohs(sin6->sin6_port)); + break; + + default: + snprintf(buf, len, "unknown address type: %d", addr->sa_family); + break; + } + + return buf; +} +#endif /* SUNRPC_SVC_XPRT_H */ diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h new file mode 100644 index 0000000..7c36565 --- /dev/null +++ b/include/linux/sunrpc/svcauth.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/svcauth.h + * + * RPC server-side 
authentication stuff. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef _LINUX_SUNRPC_SVCAUTH_H_ +#define _LINUX_SUNRPC_SVCAUTH_H_ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include + +struct svc_cred { + kuid_t cr_uid; + kgid_t cr_gid; + struct group_info *cr_group_info; + u32 cr_flavor; /* pseudoflavor */ + /* name of form servicetype/hostname@REALM, passed down by + * gss-proxy: */ + char *cr_raw_principal; + /* name of form servicetype@hostname, passed down by + * rpc.svcgssd, or computed from the above: */ + char *cr_principal; + struct gss_api_mech *cr_gss_mech; +}; + +static inline void init_svc_cred(struct svc_cred *cred) +{ + cred->cr_group_info = NULL; + cred->cr_raw_principal = NULL; + cred->cr_principal = NULL; + cred->cr_gss_mech = NULL; +} + +static inline void free_svc_cred(struct svc_cred *cred) +{ + if (cred->cr_group_info) + put_group_info(cred->cr_group_info); + kfree(cred->cr_raw_principal); + kfree(cred->cr_principal); + gss_mech_put(cred->cr_gss_mech); + init_svc_cred(cred); +} + +struct svc_rqst; /* forward decl */ +struct in6_addr; + +/* Authentication is done in the context of a domain. + * + * Currently, the nfs server uses the auth_domain to stand + * for the "client" listed in /etc/exports. + * + * More generally, a domain might represent a group of clients using + * a common mechanism for authentication and having a common mapping + * between local identity (uid) and network identity. All clients + * in a domain have similar general access rights. Each domain can + * contain multiple principals which will have different specific right + * based on normal Discretionary Access Control. + * + * A domain is created by an authentication flavour module based on name + * only. Userspace then fills in detail on demand. + * + * In the case of auth_unix and auth_null, the auth_domain is also + * associated with entries in another cache representing the mapping + * of ip addresses to the given client. + */ +struct auth_domain { + struct kref ref; + struct hlist_node hash; + char *name; + struct auth_ops *flavour; +}; + +/* + * Each authentication flavour registers an auth_ops + * structure. + * name is simply the name. + * flavour gives the auth flavour. It determines where the flavour is registered + * accept() is given a request and should verify it. + * It should inspect the authenticator and verifier, and possibly the data. + * If there is a problem with the authentication *authp should be set. + * The return value of accept() can indicate: + * OK - authorised. client and credential are set in rqstp. + * reqbuf points to arguments + * resbuf points to good place for results. verfier + * is (probably) already in place. Certainly space is + * reserved for it. + * DROP - simply drop the request. It may have been deferred + * GARBAGE - rpc garbage_args error + * SYSERR - rpc system_err error + * DENIED - authp holds reason for denial. + * COMPLETE - the reply is encoded already and ready to be sent; no + * further processing is necessary. (This is used for processing + * null procedure calls which are used to set up encryption + * contexts.) + * + * accept is passed the proc number so that it can accept NULL rpc requests + * even if it cannot authenticate the client (as is sometimes appropriate). + * + * release() is given a request after the procedure has been run. 
+ * It should sign/encrypt the results if needed + * It should return: + * OK - the resbuf is ready to be sent + * DROP - the reply should be quitely dropped + * DENIED - authp holds a reason for MSG_DENIED + * SYSERR - rpc system_err + * + * domain_release() + * This call releases a domain. + * set_client() + * Givens a pending request (struct svc_rqst), finds and assigns + * an appropriate 'auth_domain' as the client. + */ +struct auth_ops { + char * name; + struct module *owner; + int flavour; + int (*accept)(struct svc_rqst *rq, __be32 *authp); + int (*release)(struct svc_rqst *rq); + void (*domain_release)(struct auth_domain *); + int (*set_client)(struct svc_rqst *rq); +}; + +#define SVC_GARBAGE 1 +#define SVC_SYSERR 2 +#define SVC_VALID 3 +#define SVC_NEGATIVE 4 +#define SVC_OK 5 +#define SVC_DROP 6 +#define SVC_CLOSE 7 /* Like SVC_DROP, but request is definitely + * lost so if there is a tcp connection, it + * should be closed + */ +#define SVC_DENIED 8 +#define SVC_PENDING 9 +#define SVC_COMPLETE 10 + +struct svc_xprt; + +extern int svc_authenticate(struct svc_rqst *rqstp, __be32 *authp); +extern int svc_authorise(struct svc_rqst *rqstp); +extern int svc_set_client(struct svc_rqst *rqstp); +extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); +extern void svc_auth_unregister(rpc_authflavor_t flavor); + +extern struct auth_domain *unix_domain_find(char *name); +extern void auth_domain_put(struct auth_domain *item); +extern int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom); +extern struct auth_domain *auth_domain_lookup(char *name, struct auth_domain *new); +extern struct auth_domain *auth_domain_find(char *name); +extern struct auth_domain *auth_unix_lookup(struct net *net, struct in6_addr *addr); +extern int auth_unix_forget_old(struct auth_domain *dom); +extern void svcauth_unix_purge(struct net *net); +extern void svcauth_unix_info_release(struct svc_xprt *xpt); +extern int svcauth_unix_set_client(struct svc_rqst *rqstp); + +extern int unix_gid_cache_create(struct net *net); +extern void unix_gid_cache_destroy(struct net *net); + +/* + * The functions are good enough that we don't need to + * use hash_32() on them; just extracting the high bits is enough. 
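To make the accept()/release() contract above concrete, a skeleton flavour might look like the sketch below. This is illustrative only: the "demo" name and flavour number are invented, the accept routine performs no real credential or verifier checking, and client mapping is simply delegated to svcauth_unix_set_client().

#include <linux/module.h>
#include <linux/sunrpc/svcauth.h>

static int demo_accept(struct svc_rqst *rqstp, __be32 *authp)
{
	/*
	 * Inspect the credential and verifier here; on failure set *authp
	 * to the reason and return SVC_DENIED.  A real flavour would also
	 * emit the reply verifier.  This skeleton accepts everything.
	 */
	return SVC_OK;
}

static int demo_release(struct svc_rqst *rqstp)
{
	/* Sign/encrypt the results here if the flavour requires it. */
	return SVC_OK;
}

static struct auth_ops demo_auth_ops = {
	.name		= "demo",
	.owner		= THIS_MODULE,
	.flavour	= 7,			/* hypothetical flavour slot */
	.accept		= demo_accept,
	.release	= demo_release,
	.set_client	= svcauth_unix_set_client,
};

/*
 * Registration and removal:
 *	svc_auth_register(demo_auth_ops.flavour, &demo_auth_ops);
 *	svc_auth_unregister(demo_auth_ops.flavour);
 */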
+ */ +static inline unsigned long hash_str(char const *name, int bits) +{ + return hashlen_hash(hashlen_string(NULL, name)) >> (32 - bits); +} + +static inline unsigned long hash_mem(char const *buf, int length, int bits) +{ + return full_name_hash(NULL, buf, length) >> (32 - bits); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_SUNRPC_SVCAUTH_H_ */ diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h new file mode 100644 index 0000000..a4528b2 --- /dev/null +++ b/include/linux/sunrpc/svcauth_gss.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/svcauth_gss.h + * + * Bruce Fields + * Copyright (c) 2002 The Regents of the University of Michigan + */ + +#ifndef _LINUX_SUNRPC_SVCAUTH_GSS_H +#define _LINUX_SUNRPC_SVCAUTH_GSS_H + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include + +int gss_svc_init(void); +void gss_svc_shutdown(void); +int gss_svc_init_net(struct net *net); +void gss_svc_shutdown_net(struct net *net); +int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name); +u32 svcauth_gss_flavor(struct auth_domain *dom); + +#endif /* __KERNEL__ */ +#endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */ diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h new file mode 100644 index 0000000..119718a --- /dev/null +++ b/include/linux/sunrpc/svcsock.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/svcsock.h + * + * RPC server socket I/O. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef SUNRPC_SVCSOCK_H +#define SUNRPC_SVCSOCK_H + +#include +#include + +/* + * RPC server socket. + */ +struct svc_sock { + struct svc_xprt sk_xprt; + struct socket * sk_sock; /* berkeley socket layer */ + struct sock * sk_sk; /* INET layer */ + + /* We keep the old state_change and data_ready CB's here */ + void (*sk_ostate)(struct sock *); + void (*sk_odata)(struct sock *); + void (*sk_owspace)(struct sock *); + + /* private TCP part */ + /* On-the-wire fragment header: */ + __be32 sk_reclen; + /* As we receive a record, this includes the length received so + * far (including the fragment header): */ + u32 sk_tcplen; + /* Total length of the data (not including fragment headers) + * received so far in the fragments making up this rpc: */ + u32 sk_datalen; + + struct page * sk_pages[RPCSVC_MAXPAGES]; /* received data */ +}; + +static inline u32 svc_sock_reclen(struct svc_sock *svsk) +{ + return ntohl(svsk->sk_reclen) & RPC_FRAGMENT_SIZE_MASK; +} + +static inline u32 svc_sock_final_rec(struct svc_sock *svsk) +{ + return ntohl(svsk->sk_reclen) & RPC_LAST_STREAM_FRAGMENT; +} + +/* + * Function prototypes. 
+ */ +void svc_close_net(struct svc_serv *, struct net *); +int svc_recv(struct svc_rqst *, long); +int svc_send(struct svc_rqst *); +void svc_drop(struct svc_rqst *); +void svc_sock_update_bufs(struct svc_serv *serv); +bool svc_alien_sock(struct net *net, int fd); +int svc_addsock(struct svc_serv *serv, const int fd, + char *name_return, const size_t len); +void svc_init_xprt_sock(void); +void svc_cleanup_xprt_sock(void); +struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot); +void svc_sock_destroy(struct svc_xprt *); + +/* + * svc_makesock socket characteristics + */ +#define SVC_SOCK_DEFAULTS (0U) +#define SVC_SOCK_ANONYMOUS (1U << 0) /* don't register with pmap */ +#define SVC_SOCK_TEMPORARY (1U << 1) /* flag socket as temporary */ + +#endif /* SUNRPC_SVCSOCK_H */ diff --git a/include/linux/sunrpc/timer.h b/include/linux/sunrpc/timer.h new file mode 100644 index 0000000..242dbe0 --- /dev/null +++ b/include/linux/sunrpc/timer.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/timer.h + * + * Declarations for the RPC transport timer. + * + * Copyright (C) 2002 Trond Myklebust + */ + +#ifndef _LINUX_SUNRPC_TIMER_H +#define _LINUX_SUNRPC_TIMER_H + +#include + +struct rpc_rtt { + unsigned long timeo; /* default timeout value */ + unsigned long srtt[5]; /* smoothed round trip time << 3 */ + unsigned long sdrtt[5]; /* smoothed medium deviation of RTT */ + int ntimeouts[5]; /* Number of timeouts for the last request */ +}; + + +extern void rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo); +extern void rpc_update_rtt(struct rpc_rtt *rt, unsigned timer, long m); +extern unsigned long rpc_calc_rto(struct rpc_rtt *rt, unsigned timer); + +static inline void rpc_set_timeo(struct rpc_rtt *rt, int timer, int ntimeo) +{ + int *t; + if (!timer) + return; + t = &rt->ntimeouts[timer-1]; + if (ntimeo < *t) { + if (*t > 0) + (*t)--; + } else { + if (ntimeo > 8) + ntimeo = 8; + *t = ntimeo; + } +} + +static inline int rpc_ntimeo(struct rpc_rtt *rt, int timer) +{ + if (!timer) + return 0; + return rt->ntimeouts[timer-1]; +} + +#endif /* _LINUX_SUNRPC_TIMER_H */ diff --git a/include/linux/sunrpc/types.h b/include/linux/sunrpc/types.h new file mode 100644 index 0000000..bd3c8e0 --- /dev/null +++ b/include/linux/sunrpc/types.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/types.h + * + * Generic types and misc stuff for RPC. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#ifndef _LINUX_SUNRPC_TYPES_H_ +#define _LINUX_SUNRPC_TYPES_H_ + +#include +#include +#include +#include +#include + +/* + * Shorthands + */ +#define signalled() (signal_pending(current)) + +#endif /* _LINUX_SUNRPC_TYPES_H_ */ diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h new file mode 100644 index 0000000..2bd6817 --- /dev/null +++ b/include/linux/sunrpc/xdr.h @@ -0,0 +1,531 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * XDR standard data types and function declarations + * + * Copyright (C) 1995-1997 Olaf Kirch + * + * Based on: + * RFC 4506 "XDR: External Data Representation Standard", May 2006 + */ + +#ifndef _SUNRPC_XDR_H_ +#define _SUNRPC_XDR_H_ + +#ifdef __KERNEL__ + +#include +#include +#include +#include + +struct rpc_rqst; + +/* + * Buffer adjustment + */ +#define XDR_QUADLEN(l) (((l) + 3) >> 2) + +/* + * Generic opaque `network object.' At the kernel level, this type + * is used only by lockd. 
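A quick sanity check on the rounding arithmetic above; the example function is hypothetical and the BUILD_BUG_ON()s are only there to state the expected values.

#include <linux/build_bug.h>
#include <linux/sunrpc/xdr.h>

static void example_xdr_quadlen(void)
{
	BUILD_BUG_ON(XDR_QUADLEN(0) != 0);
	BUILD_BUG_ON(XDR_QUADLEN(1) != 1);	/* 1 byte  -> 1 word (4 bytes) */
	BUILD_BUG_ON(XDR_QUADLEN(5) != 2);	/* 5 bytes -> 2 words (8 bytes) */

	/*
	 * xdr_encode_opaque() emits a 4-byte length followed by the data
	 * padded out to XDR_QUADLEN(len) words, so a 5-byte netobj costs
	 * 4 + 8 = 12 bytes on the wire.
	 */
}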
+ */ +#define XDR_MAX_NETOBJ 1024 +struct xdr_netobj { + unsigned int len; + u8 * data; +}; + +/* + * Basic structure for transmission/reception of a client XDR message. + * Features a header (for a linear buffer containing RPC headers + * and the data payload for short messages), and then an array of + * pages. + * The tail iovec allows you to append data after the page array. Its + * main interest is for appending padding to the pages in order to + * satisfy the int_32-alignment requirements in RFC1832. + * + * For the future, we might want to string several of these together + * in a list if anybody wants to make use of NFSv4 COMPOUND + * operations and/or has a need for scatter/gather involving pages. + */ +struct xdr_buf { + struct kvec head[1], /* RPC header + non-page data */ + tail[1]; /* Appended after page data */ + + struct page ** pages; /* Array of pages */ + unsigned int page_base, /* Start of page data */ + page_len, /* Length of page data */ + flags; /* Flags for data disposition */ +#define XDRBUF_READ 0x01 /* target of file read */ +#define XDRBUF_WRITE 0x02 /* source of file write */ + + unsigned int buflen, /* Total length of storage buffer */ + len; /* Length of XDR encoded message */ +}; + +static inline void +xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) +{ + buf->head[0].iov_base = start; + buf->head[0].iov_len = len; + buf->tail[0].iov_len = 0; + buf->page_len = 0; + buf->flags = 0; + buf->len = 0; + buf->buflen = len; +} + +/* + * pre-xdr'ed macros. + */ + +#define xdr_zero cpu_to_be32(0) +#define xdr_one cpu_to_be32(1) +#define xdr_two cpu_to_be32(2) + +#define rpc_success cpu_to_be32(RPC_SUCCESS) +#define rpc_prog_unavail cpu_to_be32(RPC_PROG_UNAVAIL) +#define rpc_prog_mismatch cpu_to_be32(RPC_PROG_MISMATCH) +#define rpc_proc_unavail cpu_to_be32(RPC_PROC_UNAVAIL) +#define rpc_garbage_args cpu_to_be32(RPC_GARBAGE_ARGS) +#define rpc_system_err cpu_to_be32(RPC_SYSTEM_ERR) +#define rpc_drop_reply cpu_to_be32(RPC_DROP_REPLY) + +#define rpc_auth_ok cpu_to_be32(RPC_AUTH_OK) +#define rpc_autherr_badcred cpu_to_be32(RPC_AUTH_BADCRED) +#define rpc_autherr_rejectedcred cpu_to_be32(RPC_AUTH_REJECTEDCRED) +#define rpc_autherr_badverf cpu_to_be32(RPC_AUTH_BADVERF) +#define rpc_autherr_rejectedverf cpu_to_be32(RPC_AUTH_REJECTEDVERF) +#define rpc_autherr_tooweak cpu_to_be32(RPC_AUTH_TOOWEAK) +#define rpcsec_gsserr_credproblem cpu_to_be32(RPCSEC_GSS_CREDPROBLEM) +#define rpcsec_gsserr_ctxproblem cpu_to_be32(RPCSEC_GSS_CTXPROBLEM) +#define rpc_autherr_oldseqnum cpu_to_be32(101) + +/* + * Miscellaneous XDR helper functions + */ +__be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len); +__be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len); +__be32 *xdr_encode_string(__be32 *p, const char *s); +__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, unsigned int *lenp, + unsigned int maxlen); +__be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *); +__be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); + +void xdr_inline_pages(struct xdr_buf *, unsigned int, + struct page **, unsigned int, unsigned int); +void xdr_terminate_string(struct xdr_buf *, const u32); + +static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int len) +{ + return xdr_encode_opaque(p, s, len); +} + +/* + * Decode 64bit quantities (NFSv3 support) + */ +static inline __be32 * +xdr_encode_hyper(__be32 *p, __u64 val) +{ + put_unaligned_be64(val, p); + return p + 2; +} + +static inline __be32 * +xdr_decode_hyper(__be32 *p, __u64 
*valp) +{ + *valp = get_unaligned_be64(p); + return p + 2; +} + +static inline __be32 * +xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) +{ + memcpy(ptr, p, len); + return p + XDR_QUADLEN(len); +} + +/* + * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) + */ +static inline int +xdr_adjust_iovec(struct kvec *iov, __be32 *p) +{ + return iov->iov_len = ((u8 *) p - (u8 *) iov->iov_base); +} + +/* + * XDR buffer helper functions + */ +extern void xdr_shift_buf(struct xdr_buf *, size_t); +extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); +extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); +extern void xdr_buf_trim(struct xdr_buf *, unsigned int); +extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, unsigned int); +extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); +extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); + +/* + * Helper structure for copying from an sk_buff. + */ +struct xdr_skb_reader { + struct sk_buff *skb; + unsigned int offset; + size_t count; + __wsum csum; +}; + +typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to, size_t len); + +size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len); +extern int csum_partial_copy_to_xdr(struct xdr_buf *, struct sk_buff *); +extern ssize_t xdr_partial_copy_from_skb(struct xdr_buf *, unsigned int, + struct xdr_skb_reader *, xdr_skb_read_actor); + +extern int xdr_encode_word(struct xdr_buf *, unsigned int, u32); +extern int xdr_decode_word(struct xdr_buf *, unsigned int, u32 *); + +struct xdr_array2_desc; +typedef int (*xdr_xcode_elem_t)(struct xdr_array2_desc *desc, void *elem); +struct xdr_array2_desc { + unsigned int elem_size; + unsigned int array_len; + unsigned int array_maxlen; + xdr_xcode_elem_t xcode; +}; + +extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc); +extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc); +extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase, + size_t len); + +/* + * Provide some simple tools for XDR buffer overflow-checking etc. + */ +struct xdr_stream { + __be32 *p; /* start of available buffer */ + struct xdr_buf *buf; /* XDR buffer to read/write */ + + __be32 *end; /* end of available buffer space */ + struct kvec *iov; /* pointer to the current kvec */ + struct kvec scratch; /* Scratch buffer */ + struct page **page_ptr; /* pointer to the current page */ + unsigned int nwords; /* Remaining decode buffer length */ +}; + +/* + * These are the xdr_stream style generic XDR encode and decode functions. 
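The encode side of these helpers typically runs over the head kvec of an xdr_buf, as in the sketch below. It is illustrative only: the buffer, the value 42 and the "hello" string are invented, and it relies on xdr_init_encode(), xdr_stream_encode_u32() and xdr_stream_encode_opaque() declared and defined just below.

#include <linux/errno.h>
#include <linux/sunrpc/xdr.h>

static int example_xdr_encode(struct xdr_buf *buf, void *scratch, size_t len)
{
	struct xdr_stream xdr;

	xdr_buf_init(buf, scratch, len);
	/* xdr_buf_init() records the whole buffer as head data; reset the
	 * head length to what has actually been encoded so far (nothing). */
	buf->head[0].iov_len = 0;
	xdr_init_encode(&xdr, buf, buf->head[0].iov_base);

	if (xdr_stream_encode_u32(&xdr, 42) < 0)
		return -EMSGSIZE;
	if (xdr_stream_encode_opaque(&xdr, "hello", 5) < 0)
		return -EMSGSIZE;

	/*
	 * buf->len now reflects the encoded size:
	 * 4 (u32) + 4 (opaque length) + 8 (5 bytes padded) = 16 bytes.
	 */
	return 0;
}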
+ */ +typedef void (*kxdreproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + const void *obj); +typedef int (*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + void *obj); + +extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); +extern void xdr_commit_encode(struct xdr_stream *xdr); +extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); +extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen); +extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, + unsigned int base, unsigned int len); +extern unsigned int xdr_stream_pos(const struct xdr_stream *xdr); +extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, + struct page **pages, unsigned int len); +extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen); +extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); +extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); +extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); +extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); + +/** + * xdr_stream_remaining - Return the number of bytes remaining in the stream + * @xdr: pointer to struct xdr_stream + * + * Return value: + * Number of bytes remaining in @xdr before xdr->end + */ +static inline size_t +xdr_stream_remaining(const struct xdr_stream *xdr) +{ + return xdr->nwords << 2; +} + +ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, + size_t size); +ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, + size_t maxlen, gfp_t gfp_flags); +ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, + size_t size); +ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, + size_t maxlen, gfp_t gfp_flags); +/** + * xdr_align_size - Calculate padded size of an object + * @n: Size of an object being XDR encoded (in bytes) + * + * Return value: + * Size (in bytes) of the object including xdr padding + */ +static inline size_t +xdr_align_size(size_t n) +{ + const size_t mask = sizeof(__u32) - 1; + + return (n + mask) & ~mask; +} + +/** + * xdr_stream_encode_u32 - Encode a 32-bit integer + * @xdr: pointer to xdr_stream + * @n: integer to encode + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_u32(struct xdr_stream *xdr, __u32 n) +{ + const size_t len = sizeof(n); + __be32 *p = xdr_reserve_space(xdr, len); + + if (unlikely(!p)) + return -EMSGSIZE; + *p = cpu_to_be32(n); + return len; +} + +/** + * xdr_stream_encode_u64 - Encode a 64-bit integer + * @xdr: pointer to xdr_stream + * @n: 64-bit integer to encode + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n) +{ + const size_t len = sizeof(n); + __be32 *p = xdr_reserve_space(xdr, len); + + if (unlikely(!p)) + return -EMSGSIZE; + xdr_encode_hyper(p, n); + return len; +} + +/** + * xdr_stream_encode_opaque_inline - Encode opaque xdr data + * @xdr: pointer to xdr_stream + * @ptr: pointer to void pointer + * @len: size 
of object + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t len) +{ + size_t count = sizeof(__u32) + xdr_align_size(len); + __be32 *p = xdr_reserve_space(xdr, count); + + if (unlikely(!p)) { + *ptr = NULL; + return -EMSGSIZE; + } + xdr_encode_opaque(p, NULL, len); + *ptr = ++p; + return count; +} + +/** + * xdr_stream_encode_opaque_fixed - Encode fixed length opaque xdr data + * @xdr: pointer to xdr_stream + * @ptr: pointer to opaque data object + * @len: size of object pointed to by @ptr + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_opaque_fixed(struct xdr_stream *xdr, const void *ptr, size_t len) +{ + __be32 *p = xdr_reserve_space(xdr, len); + + if (unlikely(!p)) + return -EMSGSIZE; + xdr_encode_opaque_fixed(p, ptr, len); + return xdr_align_size(len); +} + +/** + * xdr_stream_encode_opaque - Encode variable length opaque xdr data + * @xdr: pointer to xdr_stream + * @ptr: pointer to opaque data object + * @len: size of object pointed to by @ptr + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len) +{ + size_t count = sizeof(__u32) + xdr_align_size(len); + __be32 *p = xdr_reserve_space(xdr, count); + + if (unlikely(!p)) + return -EMSGSIZE; + xdr_encode_opaque(p, ptr, len); + return count; +} + +/** + * xdr_stream_encode_uint32_array - Encode variable length array of integers + * @xdr: pointer to xdr_stream + * @array: array of integers + * @array_size: number of elements in @array + * + * Return values: + * On success, returns length in bytes of XDR buffer consumed + * %-EMSGSIZE on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_encode_uint32_array(struct xdr_stream *xdr, + const __u32 *array, size_t array_size) +{ + ssize_t ret = (array_size+1) * sizeof(__u32); + __be32 *p = xdr_reserve_space(xdr, ret); + + if (unlikely(!p)) + return -EMSGSIZE; + *p++ = cpu_to_be32(array_size); + for (; array_size > 0; p++, array++, array_size--) + *p = cpu_to_be32p(array); + return ret; +} + +/** + * xdr_stream_decode_u32 - Decode a 32-bit integer + * @xdr: pointer to xdr_stream + * @ptr: location to store integer + * + * Return values: + * %0 on success + * %-EBADMSG on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) +{ + const size_t count = sizeof(*ptr); + __be32 *p = xdr_inline_decode(xdr, count); + + if (unlikely(!p)) + return -EBADMSG; + *ptr = be32_to_cpup(p); + return 0; +} + +/** + * xdr_stream_decode_opaque_fixed - Decode fixed length opaque xdr data + * @xdr: pointer to xdr_stream + * @ptr: location to store data + * @len: size of buffer pointed to by @ptr + * + * Return values: + * On success, returns size of object stored in @ptr + * %-EBADMSG on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) +{ + __be32 *p = xdr_inline_decode(xdr, len); + + if (unlikely(!p)) + return -EBADMSG; + xdr_decode_opaque_fixed(p, ptr, len); + return len; +} + +/** + * xdr_stream_decode_opaque_inline - Decode variable length opaque xdr data + * @xdr: pointer to xdr_stream + * @ptr: location to 
store pointer to opaque data + * @maxlen: maximum acceptable object size + * + * Note: the pointer stored in @ptr cannot be assumed valid after the XDR + * buffer has been destroyed, or even after calling xdr_inline_decode() + * on @xdr. It is therefore expected that the object it points to should + * be processed immediately. + * + * Return values: + * On success, returns size of object stored in *@ptr + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE if the size of the object would exceed @maxlen + */ +static inline ssize_t +xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxlen) +{ + __be32 *p; + __u32 len; + + *ptr = NULL; + if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) + return -EBADMSG; + if (len != 0) { + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + return -EBADMSG; + if (unlikely(len > maxlen)) + return -EMSGSIZE; + *ptr = p; + } + return len; +} + +/** + * xdr_stream_decode_uint32_array - Decode variable length array of integers + * @xdr: pointer to xdr_stream + * @array: location to store the integer array or NULL + * @array_size: number of elements to store + * + * Return values: + * On success, returns number of elements stored in @array + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE if the size of the array exceeds @array_size + */ +static inline ssize_t +xdr_stream_decode_uint32_array(struct xdr_stream *xdr, + __u32 *array, size_t array_size) +{ + __be32 *p; + __u32 len; + ssize_t retval; + + if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) + return -EBADMSG; + p = xdr_inline_decode(xdr, len * sizeof(*p)); + if (unlikely(!p)) + return -EBADMSG; + if (array == NULL) + return len; + if (len <= array_size) { + if (len < array_size) + memset(array+len, 0, (array_size-len)*sizeof(*array)); + array_size = len; + retval = len; + } else + retval = -EMSGSIZE; + for (; array_size > 0; p++, array++, array_size--) + *array = be32_to_cpup(p); + return retval; +} +#endif /* __KERNEL__ */ + +#endif /* _SUNRPC_XDR_H_ */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h new file mode 100644 index 0000000..5fea0fb --- /dev/null +++ b/include/linux/sunrpc/xprt.h @@ -0,0 +1,488 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/xprt.h + * + * Declarations for the RPC transport interface. 
+ * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#ifndef _LINUX_SUNRPC_XPRT_H +#define _LINUX_SUNRPC_XPRT_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __KERNEL__ + +#define RPC_MIN_SLOT_TABLE (2U) +#define RPC_DEF_SLOT_TABLE (16U) +#define RPC_MAX_SLOT_TABLE_LIMIT (65536U) +#define RPC_MAX_SLOT_TABLE RPC_MAX_SLOT_TABLE_LIMIT + +#define RPC_CWNDSHIFT (8U) +#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) +#define RPC_INITCWND RPC_CWNDSCALE +#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) +#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) + +/* + * This describes a timeout strategy + */ +struct rpc_timeout { + unsigned long to_initval, /* initial timeout */ + to_maxval, /* max timeout */ + to_increment; /* if !exponential */ + unsigned int to_retries; /* max # of retries */ + unsigned char to_exponential; +}; + +enum rpc_display_format_t { + RPC_DISPLAY_ADDR = 0, + RPC_DISPLAY_PORT, + RPC_DISPLAY_PROTO, + RPC_DISPLAY_HEX_ADDR, + RPC_DISPLAY_HEX_PORT, + RPC_DISPLAY_NETID, + RPC_DISPLAY_MAX, +}; + +struct rpc_task; +struct rpc_xprt; +struct seq_file; +struct svc_serv; +struct net; + +/* + * This describes a complete RPC request + */ +struct rpc_rqst { + /* + * This is the user-visible part + */ + struct rpc_xprt * rq_xprt; /* RPC client */ + struct xdr_buf rq_snd_buf; /* send buffer */ + struct xdr_buf rq_rcv_buf; /* recv buffer */ + + /* + * This is the private part + */ + struct rpc_task * rq_task; /* RPC task data */ + struct rpc_cred * rq_cred; /* Bound cred */ + __be32 rq_xid; /* request XID */ + int rq_cong; /* has incremented xprt->cong */ + u32 rq_seqno; /* gss seq no. used on req. */ + int rq_enc_pages_num; + struct page **rq_enc_pages; /* scratch pages for use by + gss privacy code */ + void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ + struct list_head rq_list; + + void *rq_xprtdata; /* Per-xprt private data */ + void *rq_buffer; /* Call XDR encode buffer */ + size_t rq_callsize; + void *rq_rbuffer; /* Reply XDR decode buffer */ + size_t rq_rcvsize; + size_t rq_xmit_bytes_sent; /* total bytes sent */ + size_t rq_reply_bytes_recvd; /* total reply bytes */ + /* received */ + + struct xdr_buf rq_private_buf; /* The receive buffer + * used in the softirq. 
+ */ + unsigned long rq_majortimeo; /* major timeout alarm */ + unsigned long rq_timeout; /* Current timeout value */ + ktime_t rq_rtt; /* round-trip time */ + unsigned int rq_retries; /* # of retries */ + unsigned int rq_connect_cookie; + /* A cookie used to track the + state of the transport + connection */ + + /* + * Partial send handling + */ + u32 rq_bytes_sent; /* Bytes we have sent */ + + ktime_t rq_xtime; /* transmit time stamp */ + int rq_ntrans; + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + struct list_head rq_bc_list; /* Callback service list */ + unsigned long rq_bc_pa_state; /* Backchannel prealloc state */ + struct list_head rq_bc_pa_list; /* Backchannel prealloc list */ +#endif /* CONFIG_SUNRPC_BACKCHANEL */ +}; +#define rq_svec rq_snd_buf.head +#define rq_slen rq_snd_buf.len + +struct rpc_xprt_ops { + void (*set_buffer_size)(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize); + int (*reserve_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); + void (*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); + void (*alloc_slot)(struct rpc_xprt *xprt, struct rpc_task *task); + void (*rpcbind)(struct rpc_task *task); + void (*set_port)(struct rpc_xprt *xprt, unsigned short port); + void (*connect)(struct rpc_xprt *xprt, struct rpc_task *task); + int (*buf_alloc)(struct rpc_task *task); + void (*buf_free)(struct rpc_task *task); + int (*send_request)(struct rpc_task *task); + void (*set_retrans_timeout)(struct rpc_task *task); + void (*timer)(struct rpc_xprt *xprt, struct rpc_task *task); + void (*release_request)(struct rpc_task *task); + void (*close)(struct rpc_xprt *xprt); + void (*destroy)(struct rpc_xprt *xprt); + void (*set_connect_timeout)(struct rpc_xprt *xprt, + unsigned long connect_timeout, + unsigned long reconnect_timeout); + void (*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq); + int (*enable_swap)(struct rpc_xprt *xprt); + void (*disable_swap)(struct rpc_xprt *xprt); + void (*inject_disconnect)(struct rpc_xprt *xprt); + int (*bc_setup)(struct rpc_xprt *xprt, + unsigned int min_reqs); + int (*bc_up)(struct svc_serv *serv, struct net *net); + size_t (*bc_maxpayload)(struct rpc_xprt *xprt); + void (*bc_free_rqst)(struct rpc_rqst *rqst); + void (*bc_destroy)(struct rpc_xprt *xprt, + unsigned int max_reqs); +}; + +/* + * RPC transport identifiers + * + * To preserve compatibility with the historical use of raw IP protocol + * id's for transport selection, UDP and TCP identifiers are specified + * with the previous values. No such restriction exists for new transports, + * except that they may not collide with these values (17 and 6, + * respectively). 
+ */ +#define XPRT_TRANSPORT_BC (1 << 31) +enum xprt_transports { + XPRT_TRANSPORT_UDP = IPPROTO_UDP, + XPRT_TRANSPORT_TCP = IPPROTO_TCP, + XPRT_TRANSPORT_BC_TCP = IPPROTO_TCP | XPRT_TRANSPORT_BC, + XPRT_TRANSPORT_RDMA = 256, + XPRT_TRANSPORT_BC_RDMA = XPRT_TRANSPORT_RDMA | XPRT_TRANSPORT_BC, + XPRT_TRANSPORT_LOCAL = 257, +}; + +struct rpc_xprt { + struct kref kref; /* Reference count */ + const struct rpc_xprt_ops *ops; /* transport methods */ + + const struct rpc_timeout *timeout; /* timeout parms */ + struct sockaddr_storage addr; /* server address */ + size_t addrlen; /* size of server address */ + int prot; /* IP protocol */ + + unsigned long cong; /* current congestion */ + unsigned long cwnd; /* congestion window */ + + size_t max_payload; /* largest RPC payload size, + in bytes */ + unsigned int tsh_size; /* size of transport specific + header */ + + struct rpc_wait_queue binding; /* requests waiting on rpcbind */ + struct rpc_wait_queue sending; /* requests waiting to send */ + struct rpc_wait_queue pending; /* requests in flight */ + struct rpc_wait_queue backlog; /* waiting for slot */ + struct list_head free; /* free slots */ + unsigned int max_reqs; /* max number of slots */ + unsigned int min_reqs; /* min number of slots */ + unsigned int num_reqs; /* total slots */ + unsigned long state; /* transport state */ + unsigned char resvport : 1; /* use a reserved port */ + atomic_t swapper; /* we're swapping over this + transport */ + unsigned int bind_index; /* bind function index */ + + /* + * Multipath + */ + struct list_head xprt_switch; + + /* + * Connection of transports + */ + unsigned long bind_timeout, + reestablish_timeout; + unsigned int connect_cookie; /* A cookie that gets bumped + every time the transport + is reconnected */ + + /* + * Disconnection of idle transports + */ + struct work_struct task_cleanup; + struct timer_list timer; + unsigned long last_used, + idle_timeout, + connect_timeout, + max_reconnect_timeout; + + /* + * Send stuff + */ + spinlock_t transport_lock; /* lock transport info */ + spinlock_t reserve_lock; /* lock slot table */ + spinlock_t recv_lock; /* lock receive list */ + u32 xid; /* Next XID value to use */ + struct rpc_task * snd_task; /* Task blocked in send */ + struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + struct svc_serv *bc_serv; /* The RPC service which will */ + /* process the callback */ + int bc_alloc_count; /* Total number of preallocs */ + atomic_t bc_free_slots; + spinlock_t bc_pa_lock; /* Protects the preallocated + * items */ + struct list_head bc_pa_list; /* List of preallocated + * backchannel rpc_rqst's */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + struct list_head recv; + + struct { + unsigned long bind_count, /* total number of binds */ + connect_count, /* total number of connects */ + connect_start, /* connect start timestamp */ + connect_time, /* jiffies waiting for connect */ + sends, /* how many complete requests */ + recvs, /* how many complete requests */ + bad_xids, /* lookup_rqst didn't find XID */ + max_slots; /* max rpc_slots used */ + + unsigned long long req_u, /* average requests on the wire */ + bklog_u, /* backlog queue utilization */ + sending_u, /* send q utilization */ + pending_u; /* pend q utilization */ + } stat; + + struct net *xprt_net; + const char *servername; + const char *address_strings[RPC_DISPLAY_MAX]; +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + struct dentry *debugfs; /* debugfs directory */ + atomic_t inject_disconnect; +#endif + struct rcu_head rcu; 
+}; + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +/* + * Backchannel flags + */ +#define RPC_BC_PA_IN_USE 0x0001 /* Preallocated backchannel */ + /* buffer in use */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static inline int bc_prealloc(struct rpc_rqst *req) +{ + return test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); +} +#else +static inline int bc_prealloc(struct rpc_rqst *req) +{ + return 0; +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +#define XPRT_CREATE_INFINITE_SLOTS (1U) +#define XPRT_CREATE_NO_IDLE_TIMEOUT (1U << 1) + +struct xprt_create { + int ident; /* XPRT_TRANSPORT identifier */ + struct net * net; + struct sockaddr * srcaddr; /* optional local address */ + struct sockaddr * dstaddr; /* remote peer address */ + size_t addrlen; + const char *servername; + struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ + struct rpc_xprt_switch *bc_xps; + unsigned int flags; +}; + +struct xprt_class { + struct list_head list; + int ident; /* XPRT_TRANSPORT identifier */ + struct rpc_xprt * (*setup)(struct xprt_create *); + struct module *owner; + char name[32]; +}; + +/* + * Generic internal transport functions + */ +struct rpc_xprt *xprt_create_transport(struct xprt_create *args); +void xprt_connect(struct rpc_task *task); +void xprt_reserve(struct rpc_task *task); +void xprt_retry_reserve(struct rpc_task *task); +int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task); +int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); +bool xprt_prepare_transmit(struct rpc_task *task); +void xprt_transmit(struct rpc_task *task); +void xprt_end_transmit(struct rpc_task *task); +int xprt_adjust_timeout(struct rpc_rqst *req); +void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_release(struct rpc_task *task); +struct rpc_xprt * xprt_get(struct rpc_xprt *xprt); +void xprt_put(struct rpc_xprt *xprt); +struct rpc_xprt * xprt_alloc(struct net *net, size_t size, + unsigned int num_prealloc, + unsigned int max_req); +void xprt_free(struct rpc_xprt *); + +static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *p) +{ + return p + xprt->tsh_size; +} + +static inline int +xprt_enable_swap(struct rpc_xprt *xprt) +{ + return xprt->ops->enable_swap(xprt); +} + +static inline void +xprt_disable_swap(struct rpc_xprt *xprt) +{ + xprt->ops->disable_swap(xprt); +} + +/* + * Transport switch helper functions + */ +int xprt_register_transport(struct xprt_class *type); +int xprt_unregister_transport(struct xprt_class *type); +int xprt_load_transport(const char *); +void xprt_set_retrans_timeout_def(struct rpc_task *task); +void xprt_set_retrans_timeout_rtt(struct rpc_task *task); +void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); +void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action); +void xprt_write_space(struct rpc_xprt *xprt); +void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result); +struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid); +void xprt_update_rtt(struct rpc_task *task); +void xprt_complete_rqst(struct rpc_task *task, int copied); +void xprt_pin_rqst(struct rpc_rqst *req); +void xprt_unpin_rqst(struct rpc_rqst *req); +void xprt_release_rqst_cong(struct rpc_task *task); +void 
xprt_disconnect_done(struct rpc_xprt *xprt); +void xprt_force_disconnect(struct rpc_xprt *xprt); +void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); + +bool xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *); +void xprt_unlock_connect(struct rpc_xprt *, void *); + +/* + * Reserved bit positions in xprt->state + */ +#define XPRT_LOCKED (0) +#define XPRT_CONNECTED (1) +#define XPRT_CONNECTING (2) +#define XPRT_CLOSE_WAIT (3) +#define XPRT_BOUND (4) +#define XPRT_BINDING (5) +#define XPRT_CLOSING (6) +#define XPRT_CONGESTED (9) + +static inline void xprt_set_connected(struct rpc_xprt *xprt) +{ + set_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline void xprt_clear_connected(struct rpc_xprt *xprt) +{ + clear_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline int xprt_connected(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline int xprt_test_and_set_connected(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline int xprt_test_and_clear_connected(struct rpc_xprt *xprt) +{ + return test_and_clear_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline void xprt_clear_connecting(struct rpc_xprt *xprt) +{ + smp_mb__before_atomic(); + clear_bit(XPRT_CONNECTING, &xprt->state); + smp_mb__after_atomic(); +} + +static inline int xprt_connecting(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_CONNECTING, &xprt->state); +} + +static inline int xprt_test_and_set_connecting(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_CONNECTING, &xprt->state); +} + +static inline void xprt_set_bound(struct rpc_xprt *xprt) +{ + test_and_set_bit(XPRT_BOUND, &xprt->state); +} + +static inline int xprt_bound(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_BOUND, &xprt->state); +} + +static inline void xprt_clear_bound(struct rpc_xprt *xprt) +{ + clear_bit(XPRT_BOUND, &xprt->state); +} + +static inline void xprt_clear_binding(struct rpc_xprt *xprt) +{ + smp_mb__before_atomic(); + clear_bit(XPRT_BINDING, &xprt->state); + smp_mb__after_atomic(); +} + +static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_BINDING, &xprt->state); +} + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +extern unsigned int rpc_inject_disconnect; +static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ + if (!rpc_inject_disconnect) + return; + if (atomic_dec_return(&xprt->inject_disconnect)) + return; + atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); + xprt->ops->inject_disconnect(xprt); +} +#else +static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ +} +#endif + +#endif /* __KERNEL__*/ + +#endif /* _LINUX_SUNRPC_XPRT_H */ diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h new file mode 100644 index 0000000..af1257c --- /dev/null +++ b/include/linux/sunrpc/xprtmultipath.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * RPC client multipathing definitions + * + * Copyright (c) 2015, 2016, Primary Data, Inc. All rights reserved. 
+ * + * Trond Myklebust + */ +#ifndef _NET_SUNRPC_XPRTMULTIPATH_H +#define _NET_SUNRPC_XPRTMULTIPATH_H + +struct rpc_xprt_iter_ops; +struct rpc_xprt_switch { + spinlock_t xps_lock; + struct kref xps_kref; + + unsigned int xps_nxprts; + struct list_head xps_xprt_list; + + struct net * xps_net; + + const struct rpc_xprt_iter_ops *xps_iter_ops; + + struct rcu_head xps_rcu; +}; + +struct rpc_xprt_iter { + struct rpc_xprt_switch __rcu *xpi_xpswitch; + struct rpc_xprt * xpi_cursor; + + const struct rpc_xprt_iter_ops *xpi_ops; +}; + + +struct rpc_xprt_iter_ops { + void (*xpi_rewind)(struct rpc_xprt_iter *); + struct rpc_xprt *(*xpi_xprt)(struct rpc_xprt_iter *); + struct rpc_xprt *(*xpi_next)(struct rpc_xprt_iter *); +}; + +extern struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, + gfp_t gfp_flags); + +extern struct rpc_xprt_switch *xprt_switch_get(struct rpc_xprt_switch *xps); +extern void xprt_switch_put(struct rpc_xprt_switch *xps); + +extern void rpc_xprt_switch_set_roundrobin(struct rpc_xprt_switch *xps); + +extern void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt); +extern void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt); + +extern void xprt_iter_init(struct rpc_xprt_iter *xpi, + struct rpc_xprt_switch *xps); + +extern void xprt_iter_init_listall(struct rpc_xprt_iter *xpi, + struct rpc_xprt_switch *xps); + +extern void xprt_iter_destroy(struct rpc_xprt_iter *xpi); + +extern struct rpc_xprt_switch *xprt_iter_xchg_switch( + struct rpc_xprt_iter *xpi, + struct rpc_xprt_switch *newswitch); + +extern struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi); +extern struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi); +extern struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi); + +extern bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap); +#endif diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h new file mode 100644 index 0000000..5859563 --- /dev/null +++ b/include/linux/sunrpc/xprtrdma.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_SUNRPC_XPRTRDMA_H +#define _LINUX_SUNRPC_XPRTRDMA_H + +/* + * Constants. Max RPC/NFS header is big enough to account for + * additional marshaling buffers passed down by Linux client. + * + * RDMA header is currently fixed max size, and is big enough for a + * fully-chunked NFS message (read chunks are the largest). Note only + * a single chunk type per message is supported currently. + */ +#define RPCRDMA_MIN_SLOT_TABLE (2U) +#define RPCRDMA_DEF_SLOT_TABLE (128U) +#define RPCRDMA_MAX_SLOT_TABLE (256U) + +#define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */ +#define RPCRDMA_DEF_INLINE (4096) /* default inline thresh */ +#define RPCRDMA_MAX_INLINE (65536) /* max inline thresh */ + +/* Memory registration strategies, by number. + * This is part of a kernel / user space API. Do not remove. */ +enum rpcrdma_memreg { + RPCRDMA_BOUNCEBUFFERS = 0, + RPCRDMA_REGISTER, + RPCRDMA_MEMWINDOWS, + RPCRDMA_MEMWINDOWS_ASYNC, + RPCRDMA_MTHCAFMR, + RPCRDMA_FRWR, + RPCRDMA_ALLPHYSICAL, + RPCRDMA_LAST +}; + +#endif /* _LINUX_SUNRPC_XPRTRDMA_H */ diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h new file mode 100644 index 0000000..ae0f99b --- /dev/null +++ b/include/linux/sunrpc/xprtsock.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/include/linux/sunrpc/xprtsock.h + * + * Declarations for the RPC transport socket provider. 
+ */ + +#ifndef _LINUX_SUNRPC_XPRTSOCK_H +#define _LINUX_SUNRPC_XPRTSOCK_H + +#ifdef __KERNEL__ + +int init_socket_xprt(void); +void cleanup_socket_xprt(void); + +#define RPC_MIN_RESVPORT (1U) +#define RPC_MAX_RESVPORT (65535U) +#define RPC_DEF_MIN_RESVPORT (665U) +#define RPC_DEF_MAX_RESVPORT (1023U) + +struct sock_xprt { + struct rpc_xprt xprt; + + /* + * Network layer + */ + struct socket * sock; + struct sock * inet; + + /* + * State of TCP reply receive + */ + __be32 tcp_fraghdr, + tcp_xid, + tcp_calldir; + + u32 tcp_offset, + tcp_reclen; + + unsigned long tcp_copied, + tcp_flags; + + /* + * Connection of transports + */ + unsigned long sock_state; + struct delayed_work connect_worker; + struct work_struct recv_worker; + struct mutex recv_mutex; + struct sockaddr_storage srcaddr; + unsigned short srcport; + + /* + * UDP socket buffer size parameters + */ + size_t rcvsize, + sndsize; + + struct rpc_timeout tcp_timeout; + + /* + * Saved socket callback addresses + */ + void (*old_data_ready)(struct sock *); + void (*old_state_change)(struct sock *); + void (*old_write_space)(struct sock *); + void (*old_error_report)(struct sock *); +}; + +/* + * TCP receive state flags + */ +#define TCP_RCV_LAST_FRAG (1UL << 0) +#define TCP_RCV_COPY_FRAGHDR (1UL << 1) +#define TCP_RCV_COPY_XID (1UL << 2) +#define TCP_RCV_COPY_DATA (1UL << 3) +#define TCP_RCV_READ_CALLDIR (1UL << 4) +#define TCP_RCV_COPY_CALLDIR (1UL << 5) + +/* + * TCP RPC flags + */ +#define TCP_RPC_REPLY (1UL << 6) + +#define XPRT_SOCK_CONNECTING 1U +#define XPRT_SOCK_DATA_READY (2) +#define XPRT_SOCK_UPD_TIMEOUT (3) + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_SUNRPC_XPRTSOCK_H */ diff --git a/include/rdma/ib.h b/include/rdma/ib.h new file mode 100644 index 0000000..66dbed0 --- /dev/null +++ b/include/rdma/ib.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(_RDMA_IB_H) +#define _RDMA_IB_H + +#include +#include +#include + +struct ib_addr { + union { + __u8 uib_addr8[16]; + __be16 uib_addr16[8]; + __be32 uib_addr32[4]; + __be64 uib_addr64[2]; + } ib_u; +#define sib_addr8 ib_u.uib_addr8 +#define sib_addr16 ib_u.uib_addr16 +#define sib_addr32 ib_u.uib_addr32 +#define sib_addr64 ib_u.uib_addr64 +#define sib_raw ib_u.uib_addr8 +#define sib_subnet_prefix ib_u.uib_addr64[0] +#define sib_interface_id ib_u.uib_addr64[1] +}; + +static inline int ib_addr_any(const struct ib_addr *a) +{ + return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0); +} + +static inline int ib_addr_loopback(const struct ib_addr *a) +{ + return ((a->sib_addr32[0] | a->sib_addr32[1] | + a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0); +} + +static inline void ib_addr_set(struct ib_addr *addr, + __be32 w1, __be32 w2, __be32 w3, __be32 w4) +{ + addr->sib_addr32[0] = w1; + addr->sib_addr32[1] = w2; + addr->sib_addr32[2] = w3; + addr->sib_addr32[3] = w4; +} + +static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2) +{ + return memcmp(a1, a2, sizeof(struct ib_addr)); +} + +struct sockaddr_ib { + unsigned short int sib_family; /* AF_IB */ + __be16 sib_pkey; + __be32 sib_flowinfo; + struct ib_addr sib_addr; + __be64 sib_sid; + __be64 sib_sid_mask; + __u64 sib_scope_id; +}; + +/* + * The IB interfaces that use write() as bi-directional ioctl() are + * fundamentally unsafe, since there are lots of ways to trigger "write()" + * calls from various contexts with elevated privileges. That includes the + * traditional suid executable error message writes, but also various kernel + * interfaces that can write to file descriptors. + * + * This function provides protection for the legacy API by restricting the + * calling context. + */ +static inline bool ib_safe_file_access(struct file *filp) +{ + return filp->f_cred == current_cred() && !uaccess_kernel(); +} + +#endif /* _RDMA_IB_H */ diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h new file mode 100644 index 0000000..a08cc72 --- /dev/null +++ b/include/rdma/ib_addr.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_ADDR_H) +#define IB_ADDR_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct rdma_addr_client { + atomic_t refcount; + struct completion comp; +}; + +/** + * rdma_addr_register_client - Register an address client. + */ +void rdma_addr_register_client(struct rdma_addr_client *client); + +/** + * rdma_addr_unregister_client - Deregister an address client. + * @client: Client object to deregister. + */ +void rdma_addr_unregister_client(struct rdma_addr_client *client); + +/** + * struct rdma_dev_addr - Contains resolved RDMA hardware addresses + * @src_dev_addr: Source MAC address. + * @dst_dev_addr: Destination MAC address. + * @broadcast: Broadcast address of the device. + * @dev_type: The interface hardware type of the device. + * @bound_dev_if: An optional device interface index. + * @transport: The transport type used. + * @net: Network namespace containing the bound_dev_if net_dev. + */ +struct rdma_dev_addr { + unsigned char src_dev_addr[MAX_ADDR_LEN]; + unsigned char dst_dev_addr[MAX_ADDR_LEN]; + unsigned char broadcast[MAX_ADDR_LEN]; + unsigned short dev_type; + int bound_dev_if; + enum rdma_transport_type transport; + struct net *net; + enum rdma_network_type network; + int hoplimit; +}; + +/** + * rdma_translate_ip - Translate a local IP address to an RDMA hardware + * address. + * + * The dev_addr->net field must be initialized. + */ +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr); + +/** + * rdma_resolve_ip - Resolve source and destination IP addresses to + * RDMA hardware addresses. + * @client: Address client associated with request. + * @src_addr: An optional source address to use in the resolution. If a + * source address is not provided, a usable address will be returned via + * the callback. + * @dst_addr: The destination address to resolve. + * @addr: A reference to a data location that will receive the resolved + * addresses. The data location must remain valid until the callback has + * been invoked. The net field of the addr struct must be valid. + * @timeout_ms: Amount of time to wait for the address resolution to complete. + * @callback: Call invoked once address resolution has completed, timed out, + * or been canceled. A status of 0 indicates success. + * @context: User-specified context associated with the call. 
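+ *
+ * Example (an illustrative sketch only; the callback, the completion and
+ * the variable names below are hypothetical):
+ *
+ *	static void addr_done(int status, struct sockaddr *src_addr,
+ *			      struct rdma_dev_addr *addr, void *context)
+ *	{
+ *		if (!status)
+ *			complete((struct completion *)context);
+ *	}
+ *
+ *	dev_addr.net = &init_net;
+ *	ret = rdma_resolve_ip(&client, NULL, (struct sockaddr *)&dst_in,
+ *			      &dev_addr, 2000, addr_done, &done);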
+ */ +int rdma_resolve_ip(struct rdma_addr_client *client, + struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context); + +void rdma_addr_cancel(struct rdma_dev_addr *addr); + +void rdma_copy_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev, + const unsigned char *dst_dev_addr); + +int rdma_addr_size(struct sockaddr *addr); +int rdma_addr_size_in6(struct sockaddr_in6 *addr); +int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); + +static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) +{ + return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; +} + +static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) +{ + dev_addr->broadcast[8] = pkey >> 8; + dev_addr->broadcast[9] = (unsigned char) pkey; +} + +static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); +} + +static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) +{ + return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; +} + +static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) +{ + return is_vlan_dev(dev) ? vlan_dev_vlan_id(dev) : 0xffff; +} + +static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) +{ + switch (addr->sa_family) { + case AF_INET: + ipv6_addr_set_v4mapped(((struct sockaddr_in *) + addr)->sin_addr.s_addr, + (struct in6_addr *)gid); + break; + case AF_INET6: + *(struct in6_addr *)&gid->raw = + ((struct sockaddr_in6 *)addr)->sin6_addr; + break; + default: + return -EINVAL; + } + return 0; +} + +/* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ +static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid) +{ + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { + struct sockaddr_in *out_in = (struct sockaddr_in *)out; + memset(out_in, 0, sizeof(*out_in)); + out_in->sin_family = AF_INET; + memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4); + } else { + struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out; + memset(out_in, 0, sizeof(*out_in)); + out_in->sin6_family = AF_INET6; + memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); + } +} + +/* + * rdma_get/set_sgid/dgid() APIs are applicable to IB, and iWarp. + * They are not applicable to RoCE. + * RoCE GIDs are derived from the IP addresses. + */ +static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) +{ + memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), + sizeof(*gid)); +} + +static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) +{ + memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); +} + +static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) +{ + memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); +} + +static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) +{ + memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); +} + +static inline enum ib_mtu iboe_get_mtu(int mtu) +{ + /* + * Reduce IB headers from effective IBoE MTU. 
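+ *
+ * For example, assuming the usual header sizes (40-byte GRH, 12-byte
+ * BTH, 8-byte UDP, 4-byte XRCETH, 28-byte AtomicETH and 4-byte ICRC,
+ * i.e. 96 bytes in total), a standard 1500-byte Ethernet MTU leaves
+ * about 1404 bytes of payload, so IB_MTU_1024 is returned.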
+ */ + mtu = mtu - (IB_GRH_BYTES + IB_UDP_BYTES + IB_BTH_BYTES + + IB_EXT_XRC_BYTES + IB_EXT_ATOMICETH_BYTES + + IB_ICRC_BYTES); + + if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096)) + return IB_MTU_4096; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048)) + return IB_MTU_2048; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024)) + return IB_MTU_1024; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512)) + return IB_MTU_512; + else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256)) + return IB_MTU_256; + else + return 0; +} + +static inline int iboe_get_rate(struct net_device *dev) +{ + struct ethtool_link_ksettings cmd; + int err; + + rtnl_lock(); + err = __ethtool_get_link_ksettings(dev, &cmd); + rtnl_unlock(); + if (err) + return IB_RATE_PORT_CURRENT; + + if (cmd.base.speed >= 40000) + return IB_RATE_40_GBPS; + else if (cmd.base.speed >= 30000) + return IB_RATE_30_GBPS; + else if (cmd.base.speed >= 20000) + return IB_RATE_20_GBPS; + else if (cmd.base.speed >= 10000) + return IB_RATE_10_GBPS; + else + return IB_RATE_PORT_CURRENT; +} + +static inline int rdma_link_local_addr(struct in6_addr *addr) +{ + if (addr->s6_addr32[0] == htonl(0xfe800000) && + addr->s6_addr32[1] == 0) + return 1; + + return 0; +} + +static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) +{ + memcpy(mac, &addr->s6_addr[8], 3); + memcpy(mac + 3, &addr->s6_addr[13], 3); + mac[0] ^= 2; +} + +static inline int rdma_is_multicast_addr(struct in6_addr *addr) +{ + __be32 ipv4_addr; + + if (addr->s6_addr[0] == 0xff) + return 1; + + ipv4_addr = addr->s6_addr32[3]; + return (ipv6_addr_v4mapped(addr) && ipv4_is_multicast(ipv4_addr)); +} + +static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) +{ + int i; + + mac[0] = 0x33; + mac[1] = 0x33; + for (i = 2; i < 6; ++i) + mac[i] = addr->s6_addr[i + 10]; +} + +static inline u16 rdma_get_vlan_id(union ib_gid *dgid) +{ + u16 vid; + + vid = dgid->raw[11] << 8 | dgid->raw[12]; + return vid < 0x1000 ? vid : 0xffff; +} + +static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) +{ + return is_vlan_dev(dev) ? vlan_dev_real_dev(dev) : NULL; +} + +#endif /* IB_ADDR_H */ diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h new file mode 100644 index 0000000..eb49cc8 --- /dev/null +++ b/include/rdma/ib_cache.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IB_CACHE_H +#define _IB_CACHE_H + +#include + +/** + * ib_get_cached_gid - Returns a cached GID table entry + * @device: The device to query. + * @port_num: The port number of the device to query. + * @index: The index into the cached GID table to query. + * @gid: The GID value found at the specified index. + * @attr: The GID attribute found at the specified index (only in RoCE). + * NULL means ignore (output parameter). + * + * ib_get_cached_gid() fetches the specified GID table entry stored in + * the local software cache. + */ +int ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid, + struct ib_gid_attr *attr); + +int ib_find_cached_gid(struct ib_device *device, + const union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev, + u8 *port_num, + u16 *index); + +int ib_find_cached_gid_by_port(struct ib_device *device, + const union ib_gid *gid, + enum ib_gid_type gid_type, + u8 port_num, + struct net_device *ndev, + u16 *index); + +int ib_find_gid_by_filter(struct ib_device *device, + const union ib_gid *gid, + u8 port_num, + bool (*filter)(const union ib_gid *gid, + const struct ib_gid_attr *, + void *), + void *context, u16 *index); +/** + * ib_get_cached_pkey - Returns a cached PKey table entry + * @device: The device to query. + * @port_num: The port number of the device to query. + * @index: The index into the cached PKey table to query. + * @pkey: The PKey value found at the specified index. + * + * ib_get_cached_pkey() fetches the specified PKey table entry stored in + * the local software cache. + */ +int ib_get_cached_pkey(struct ib_device *device_handle, + u8 port_num, + int index, + u16 *pkey); + +/** + * ib_find_cached_pkey - Returns the PKey table index where a specified + * PKey value occurs. + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the cached PKey table where the PKey was found. + * + * ib_find_cached_pkey() searches the specified PKey table in + * the local software cache. + */ +int ib_find_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index); + +/** + * ib_find_exact_cached_pkey - Returns the PKey table index where a specified + * PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit) + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the cached PKey table where the PKey was found. + * + * ib_find_exact_cached_pkey() searches the specified PKey table in + * the local software cache. + */ +int ib_find_exact_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index); + +/** + * ib_get_cached_lmc - Returns a cached lmc table entry + * @device: The device to query. + * @port_num: The port number of the device to query. + * @lmc: The lmc value for the specified port for that device. 
+ * + * ib_get_cached_lmc() fetches the specified lmc table entry stored in + * the local software cache. + */ +int ib_get_cached_lmc(struct ib_device *device, + u8 port_num, + u8 *lmc); + +/** + * ib_get_cached_port_state - Returns a cached port state table entry + * @device: The device to query. + * @port_num: The port number of the device to query. + * @port_state: port_state for the specified port for that device. + * + * ib_get_cached_port_state() fetches the specified port_state table entry stored in + * the local software cache. + */ +int ib_get_cached_port_state(struct ib_device *device, + u8 port_num, + enum ib_port_state *port_active); + +#endif /* _IB_CACHE_H */ diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h new file mode 100644 index 0000000..7979cb0 --- /dev/null +++ b/include/rdma/ib_cm.h @@ -0,0 +1,612 @@ +/* + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if !defined(IB_CM_H) +#define IB_CM_H + +#include +#include + +/* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */ +extern struct class cm_class; + +enum ib_cm_state { + IB_CM_IDLE, + IB_CM_LISTEN, + IB_CM_REQ_SENT, + IB_CM_REQ_RCVD, + IB_CM_MRA_REQ_SENT, + IB_CM_MRA_REQ_RCVD, + IB_CM_REP_SENT, + IB_CM_REP_RCVD, + IB_CM_MRA_REP_SENT, + IB_CM_MRA_REP_RCVD, + IB_CM_ESTABLISHED, + IB_CM_DREQ_SENT, + IB_CM_DREQ_RCVD, + IB_CM_TIMEWAIT, + IB_CM_SIDR_REQ_SENT, + IB_CM_SIDR_REQ_RCVD +}; + +enum ib_cm_lap_state { + IB_CM_LAP_UNINIT, + IB_CM_LAP_IDLE, + IB_CM_LAP_SENT, + IB_CM_LAP_RCVD, + IB_CM_MRA_LAP_SENT, + IB_CM_MRA_LAP_RCVD, +}; + +enum ib_cm_event_type { + IB_CM_REQ_ERROR, + IB_CM_REQ_RECEIVED, + IB_CM_REP_ERROR, + IB_CM_REP_RECEIVED, + IB_CM_RTU_RECEIVED, + IB_CM_USER_ESTABLISHED, + IB_CM_DREQ_ERROR, + IB_CM_DREQ_RECEIVED, + IB_CM_DREP_RECEIVED, + IB_CM_TIMEWAIT_EXIT, + IB_CM_MRA_RECEIVED, + IB_CM_REJ_RECEIVED, + IB_CM_LAP_ERROR, + IB_CM_LAP_RECEIVED, + IB_CM_APR_RECEIVED, + IB_CM_SIDR_REQ_ERROR, + IB_CM_SIDR_REQ_RECEIVED, + IB_CM_SIDR_REP_RECEIVED +}; + +enum ib_cm_data_size { + IB_CM_REQ_PRIVATE_DATA_SIZE = 92, + IB_CM_MRA_PRIVATE_DATA_SIZE = 222, + IB_CM_REJ_PRIVATE_DATA_SIZE = 148, + IB_CM_REP_PRIVATE_DATA_SIZE = 196, + IB_CM_RTU_PRIVATE_DATA_SIZE = 224, + IB_CM_DREQ_PRIVATE_DATA_SIZE = 220, + IB_CM_DREP_PRIVATE_DATA_SIZE = 224, + IB_CM_REJ_ARI_LENGTH = 72, + IB_CM_LAP_PRIVATE_DATA_SIZE = 168, + IB_CM_APR_PRIVATE_DATA_SIZE = 148, + IB_CM_APR_INFO_LENGTH = 72, + IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216, + IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136, + IB_CM_SIDR_REP_INFO_LENGTH = 72, +}; + +struct ib_cm_id; + +struct ib_cm_req_event_param { + struct ib_cm_id *listen_id; + + /* P_Key that was used by the GMP's BTH header */ + u16 bth_pkey; + + u8 port; + + struct sa_path_rec *primary_path; + struct sa_path_rec *alternate_path; + + __be64 remote_ca_guid; + u32 remote_qkey; + u32 remote_qpn; + enum ib_qp_type qp_type; + + u32 starting_psn; + u8 responder_resources; + u8 initiator_depth; + unsigned int local_cm_response_timeout:5; + unsigned int flow_control:1; + unsigned int remote_cm_response_timeout:5; + unsigned int retry_count:3; + unsigned int rnr_retry_count:3; + unsigned int srq:1; +}; + +struct ib_cm_rep_event_param { + __be64 remote_ca_guid; + u32 remote_qkey; + u32 remote_qpn; + u32 starting_psn; + u8 responder_resources; + u8 initiator_depth; + unsigned int target_ack_delay:5; + unsigned int failover_accepted:2; + unsigned int flow_control:1; + unsigned int rnr_retry_count:3; + unsigned int srq:1; +}; + +enum ib_cm_rej_reason { + IB_CM_REJ_NO_QP = 1, + IB_CM_REJ_NO_EEC = 2, + IB_CM_REJ_NO_RESOURCES = 3, + IB_CM_REJ_TIMEOUT = 4, + IB_CM_REJ_UNSUPPORTED = 5, + IB_CM_REJ_INVALID_COMM_ID = 6, + IB_CM_REJ_INVALID_COMM_INSTANCE = 7, + IB_CM_REJ_INVALID_SERVICE_ID = 8, + IB_CM_REJ_INVALID_TRANSPORT_TYPE = 9, + IB_CM_REJ_STALE_CONN = 10, + IB_CM_REJ_RDC_NOT_EXIST = 11, + IB_CM_REJ_INVALID_GID = 12, + IB_CM_REJ_INVALID_LID = 13, + IB_CM_REJ_INVALID_SL = 14, + IB_CM_REJ_INVALID_TRAFFIC_CLASS = 15, + IB_CM_REJ_INVALID_HOP_LIMIT = 16, + IB_CM_REJ_INVALID_PACKET_RATE = 17, + IB_CM_REJ_INVALID_ALT_GID = 18, + IB_CM_REJ_INVALID_ALT_LID = 19, + IB_CM_REJ_INVALID_ALT_SL = 20, + IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS = 21, + IB_CM_REJ_INVALID_ALT_HOP_LIMIT = 22, + IB_CM_REJ_INVALID_ALT_PACKET_RATE = 23, + IB_CM_REJ_PORT_CM_REDIRECT = 24, + IB_CM_REJ_PORT_REDIRECT = 25, + IB_CM_REJ_INVALID_MTU = 26, + IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES = 27, + IB_CM_REJ_CONSUMER_DEFINED = 28, + 
IB_CM_REJ_INVALID_RNR_RETRY = 29, + IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID = 30, + IB_CM_REJ_INVALID_CLASS_VERSION = 31, + IB_CM_REJ_INVALID_FLOW_LABEL = 32, + IB_CM_REJ_INVALID_ALT_FLOW_LABEL = 33 +}; + +struct ib_cm_rej_event_param { + enum ib_cm_rej_reason reason; + void *ari; + u8 ari_length; +}; + +struct ib_cm_mra_event_param { + u8 service_timeout; +}; + +struct ib_cm_lap_event_param { + struct sa_path_rec *alternate_path; +}; + +enum ib_cm_apr_status { + IB_CM_APR_SUCCESS, + IB_CM_APR_INVALID_COMM_ID, + IB_CM_APR_UNSUPPORTED, + IB_CM_APR_REJECT, + IB_CM_APR_REDIRECT, + IB_CM_APR_IS_CURRENT, + IB_CM_APR_INVALID_QPN_EECN, + IB_CM_APR_INVALID_LID, + IB_CM_APR_INVALID_GID, + IB_CM_APR_INVALID_FLOW_LABEL, + IB_CM_APR_INVALID_TCLASS, + IB_CM_APR_INVALID_HOP_LIMIT, + IB_CM_APR_INVALID_PACKET_RATE, + IB_CM_APR_INVALID_SL +}; + +struct ib_cm_apr_event_param { + enum ib_cm_apr_status ap_status; + void *apr_info; + u8 info_len; +}; + +struct ib_cm_sidr_req_event_param { + struct ib_cm_id *listen_id; + __be64 service_id; + /* P_Key that was used by the GMP's BTH header */ + u16 bth_pkey; + u8 port; + u16 pkey; +}; + +enum ib_cm_sidr_status { + IB_SIDR_SUCCESS, + IB_SIDR_UNSUPPORTED, + IB_SIDR_REJECT, + IB_SIDR_NO_QP, + IB_SIDR_REDIRECT, + IB_SIDR_UNSUPPORTED_VERSION +}; + +struct ib_cm_sidr_rep_event_param { + enum ib_cm_sidr_status status; + u32 qkey; + u32 qpn; + void *info; + u8 info_len; +}; + +struct ib_cm_event { + enum ib_cm_event_type event; + union { + struct ib_cm_req_event_param req_rcvd; + struct ib_cm_rep_event_param rep_rcvd; + /* No data for RTU received events. */ + struct ib_cm_rej_event_param rej_rcvd; + struct ib_cm_mra_event_param mra_rcvd; + struct ib_cm_lap_event_param lap_rcvd; + struct ib_cm_apr_event_param apr_rcvd; + /* No data for DREQ/DREP received events. */ + struct ib_cm_sidr_req_event_param sidr_req_rcvd; + struct ib_cm_sidr_rep_event_param sidr_rep_rcvd; + enum ib_wc_status send_status; + } param; + + void *private_data; +}; + +#define CM_REQ_ATTR_ID cpu_to_be16(0x0010) +#define CM_MRA_ATTR_ID cpu_to_be16(0x0011) +#define CM_REJ_ATTR_ID cpu_to_be16(0x0012) +#define CM_REP_ATTR_ID cpu_to_be16(0x0013) +#define CM_RTU_ATTR_ID cpu_to_be16(0x0014) +#define CM_DREQ_ATTR_ID cpu_to_be16(0x0015) +#define CM_DREP_ATTR_ID cpu_to_be16(0x0016) +#define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017) +#define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018) +#define CM_LAP_ATTR_ID cpu_to_be16(0x0019) +#define CM_APR_ATTR_ID cpu_to_be16(0x001A) + +/** + * ib_cm_handler - User-defined callback to process communication events. + * @cm_id: Communication identifier associated with the reported event. + * @event: Information about the communication event. + * + * IB_CM_REQ_RECEIVED and IB_CM_SIDR_REQ_RECEIVED communication events + * generated as a result of listen requests result in the allocation of a + * new @cm_id. The new @cm_id is returned to the user through this callback. + * Clients are responsible for destroying the new @cm_id. For peer-to-peer + * IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds + * to a user's existing communication identifier. + * + * Users may not call ib_destroy_cm_id while in the context of this callback; + * however, returning a non-zero value instructs the communication manager to + * destroy the @cm_id after the callback completes. 
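+ *
+ * A minimal handler might look like the following (an illustrative
+ * sketch; handle_req() is a hypothetical helper):
+ *
+ *	static int my_cm_handler(struct ib_cm_id *cm_id,
+ *				 struct ib_cm_event *event)
+ *	{
+ *		if (event->event == IB_CM_REQ_RECEIVED)
+ *			return handle_req(cm_id, &event->param.req_rcvd);
+ *		return 0;
+ *	}
+ *
+ * For IB_CM_REQ_RECEIVED the cm_id passed to the handler is the newly
+ * allocated identifier described above; a non-zero return from
+ * handle_req() would ask the CM to destroy it once the callback has
+ * completed.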
+ */ +typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id, + struct ib_cm_event *event); + +struct ib_cm_id { + ib_cm_handler cm_handler; + void *context; + struct ib_device *device; + __be64 service_id; + __be64 service_mask; + enum ib_cm_state state; /* internal CM/debug use */ + enum ib_cm_lap_state lap_state; /* internal CM/debug use */ + __be32 local_id; + __be32 remote_id; + u32 remote_cm_qpn; /* 1 unless redirected */ +}; + +/** + * ib_create_cm_id - Allocate a communication identifier. + * @device: Device associated with the cm_id. All related communication will + * be associated with the specified device. + * @cm_handler: Callback invoked to notify the user of CM events. + * @context: User specified context associated with the communication + * identifier. + * + * Communication identifiers are used to track connection states, service + * ID resolution requests, and listen requests. + */ +struct ib_cm_id *ib_create_cm_id(struct ib_device *device, + ib_cm_handler cm_handler, + void *context); + +/** + * ib_destroy_cm_id - Destroy a connection identifier. + * @cm_id: Connection identifier to destroy. + * + * This call blocks until the connection identifier is destroyed. + */ +void ib_destroy_cm_id(struct ib_cm_id *cm_id); + +#define IB_SERVICE_ID_AGN_MASK cpu_to_be64(0xFF00000000000000ULL) +#define IB_CM_ASSIGN_SERVICE_ID cpu_to_be64(0x0200000000000000ULL) +#define IB_CMA_SERVICE_ID cpu_to_be64(0x0000000001000000ULL) +#define IB_CMA_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFF000000ULL) +#define IB_SDP_SERVICE_ID cpu_to_be64(0x0000000000010000ULL) +#define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL) + +/** + * ib_cm_listen - Initiates listening on the specified service ID for + * connection and service ID resolution requests. + * @cm_id: Connection identifier associated with the listen request. + * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * @service_mask: Mask applied to service ID used to listen across a + * range of service IDs. If set to 0, the service ID is matched + * exactly. This parameter is ignored if %service_id is set to + * IB_CM_ASSIGN_SERVICE_ID. + */ +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, + __be64 service_mask); + +struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, + ib_cm_handler cm_handler, + __be64 service_id); + +struct ib_cm_req_param { + struct sa_path_rec *primary_path; + struct sa_path_rec *alternate_path; + __be64 service_id; + u32 qp_num; + enum ib_qp_type qp_type; + u32 starting_psn; + const void *private_data; + u8 private_data_len; + u8 peer_to_peer; + u8 responder_resources; + u8 initiator_depth; + u8 remote_cm_response_timeout; + u8 flow_control; + u8 local_cm_response_timeout; + u8 retry_count; + u8 rnr_retry_count; + u8 max_cm_retries; + u8 srq; +}; + +/** + * ib_send_cm_req - Sends a connection request to the remote node. + * @cm_id: Connection identifier that will be associated with the + * connection request. + * @param: Connection request information needed to establish the + * connection. 
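+ *
+ * Typical usage is to zero an ib_cm_req_param, fill in the relevant
+ * fields and send it (an illustrative sketch; the retry and resource
+ * values are only examples):
+ *
+ *	struct ib_cm_req_param param = {};
+ *
+ *	param.primary_path        = &path_rec;
+ *	param.service_id          = service_id;
+ *	param.qp_num              = qp->qp_num;
+ *	param.qp_type             = qp->qp_type;
+ *	param.responder_resources = 4;
+ *	param.initiator_depth     = 4;
+ *	param.retry_count         = 7;
+ *	param.rnr_retry_count     = 7;
+ *	param.max_cm_retries      = 15;
+ *	ret = ib_send_cm_req(cm_id, &param);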
+ */ +int ib_send_cm_req(struct ib_cm_id *cm_id, + struct ib_cm_req_param *param); + +struct ib_cm_rep_param { + u32 qp_num; + u32 starting_psn; + const void *private_data; + u8 private_data_len; + u8 responder_resources; + u8 initiator_depth; + u8 failover_accepted; + u8 flow_control; + u8 rnr_retry_count; + u8 srq; +}; + +/** + * ib_send_cm_rep - Sends a connection reply in response to a connection + * request. + * @cm_id: Connection identifier that will be associated with the + * connection request. + * @param: Connection reply information needed to establish the + * connection. + */ +int ib_send_cm_rep(struct ib_cm_id *cm_id, + struct ib_cm_rep_param *param); + +/** + * ib_send_cm_rtu - Sends a connection ready to use message in response + * to a connection reply message. + * @cm_id: Connection identifier associated with the connection request. + * @private_data: Optional user-defined private data sent with the + * ready to use message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_rtu(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_dreq - Sends a disconnection request for an existing + * connection. + * @cm_id: Connection identifier associated with the connection being + * released. + * @private_data: Optional user-defined private data sent with the + * disconnection request message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_dreq(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_drep - Sends a disconnection reply to a disconnection request. + * @cm_id: Connection identifier associated with the connection being + * released. + * @private_data: Optional user-defined private data sent with the + * disconnection reply message. + * @private_data_len: Size of the private data buffer, in bytes. + * + * If the cm_id is in the correct state, the CM will transition the connection + * to the timewait state, even if an error occurs sending the DREP message. + */ +int ib_send_cm_drep(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len); + +/** + * ib_cm_notify - Notifies the CM of an event reported to the consumer. + * @cm_id: Connection identifier to transition to established. + * @event: Type of event. + * + * This routine should be invoked by users to notify the CM of relevant + * communication events. Events that should be reported to the CM and + * when to report them are: + * + * IB_EVENT_COMM_EST - Used when a message is received on a connected + * QP before an RTU has been received. + * IB_EVENT_PATH_MIG - Notifies the CM that the connection has failed over + * to the alternate path. + */ +int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event); + +/** + * ib_send_cm_rej - Sends a connection rejection message to the + * remote node. + * @cm_id: Connection identifier associated with the connection being + * rejected. + * @reason: Reason for the connection request rejection. + * @ari: Optional additional rejection information. + * @ari_length: Size of the additional rejection information, in bytes. + * @private_data: Optional user-defined private data sent with the + * rejection message. + * @private_data_len: Size of the private data buffer, in bytes. 
+ */ +int ib_send_cm_rej(struct ib_cm_id *cm_id, + enum ib_cm_rej_reason reason, + void *ari, + u8 ari_length, + const void *private_data, + u8 private_data_len); + +#define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */ + +/** + * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection + * message. + * @cm_id: Connection identifier associated with the connection message. + * @service_timeout: The lower 5-bits specify the maximum time required for + * the sender to reply to the connection message. The upper 3-bits + * specify additional control flags. + * @private_data: Optional user-defined private data sent with the + * message receipt acknowledgement. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_mra(struct ib_cm_id *cm_id, + u8 service_timeout, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_lap - Sends a load alternate path request. + * @cm_id: Connection identifier associated with the load alternate path + * message. + * @alternate_path: A path record that identifies the alternate path to + * load. + * @private_data: Optional user-defined private data sent with the + * load alternate path message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_lap(struct ib_cm_id *cm_id, + struct sa_path_rec *alternate_path, + const void *private_data, + u8 private_data_len); + +/** + * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning + * to a specified QP state. + * @cm_id: Communication identifier associated with the QP attributes to + * initialize. + * @qp_attr: On input, specifies the desired QP state. On output, the + * mandatory and desired optional attributes will be set in order to + * modify the QP to the specified state. + * @qp_attr_mask: The QP attribute mask that may be used to transition the + * QP to the specified state. + * + * Users must set the @qp_attr->qp_state to the desired QP state. This call + * will set all required attributes for the given transition, along with + * known optional attributes. Users may override the attributes returned from + * this call before calling ib_modify_qp. + */ +int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask); + +/** + * ib_send_cm_apr - Sends an alternate path response message in response to + * a load alternate path request. + * @cm_id: Connection identifier associated with the alternate path response. + * @status: Reply status sent with the alternate path response. + * @info: Optional additional information sent with the alternate path + * response. + * @info_length: Size of the additional information, in bytes. + * @private_data: Optional user-defined private data sent with the + * alternate path response message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_apr(struct ib_cm_id *cm_id, + enum ib_cm_apr_status status, + void *info, + u8 info_length, + const void *private_data, + u8 private_data_len); + +struct ib_cm_sidr_req_param { + struct sa_path_rec *path; + __be64 service_id; + int timeout_ms; + const void *private_data; + u8 private_data_len; + u8 max_cm_retries; +}; + +/** + * ib_send_cm_sidr_req - Sends a service ID resolution request to the + * remote node. + * @cm_id: Communication identifier that will be associated with the + * service ID resolution request. + * @param: Service ID resolution request information. 
+ */ +int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, + struct ib_cm_sidr_req_param *param); + +struct ib_cm_sidr_rep_param { + u32 qp_num; + u32 qkey; + enum ib_cm_sidr_status status; + const void *info; + u8 info_length; + const void *private_data; + u8 private_data_len; +}; + +/** + * ib_send_cm_sidr_rep - Sends a service ID resolution reply to the + * remote node. + * @cm_id: Communication identifier associated with the received service ID + * resolution request. + * @param: Service ID resolution reply information. + */ +int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, + struct ib_cm_sidr_rep_param *param); + +/** + * ibcm_reject_msg - return a pointer to a reject message string. + * @reason: Value returned in the REJECT event status field. + */ +const char *__attribute_const__ ibcm_reject_msg(int reason); + +#endif /* IB_CM_H */ diff --git a/include/rdma/ib_fmr_pool.h b/include/rdma/ib_fmr_pool.h new file mode 100644 index 0000000..f62b842 --- /dev/null +++ b/include/rdma/ib_fmr_pool.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_FMR_POOL_H) +#define IB_FMR_POOL_H + +#include + +struct ib_fmr_pool; + +/** + * struct ib_fmr_pool_param - Parameters for creating FMR pool + * @max_pages_per_fmr:Maximum number of pages per map request. + * @page_shift: Log2 of sizeof "pages" mapped by this fmr + * @access:Access flags for FMRs in pool. + * @pool_size:Number of FMRs to allocate for pool. + * @dirty_watermark:Flush is triggered when @dirty_watermark dirty + * FMRs are present. + * @flush_function:Callback called when unmapped FMRs are flushed and + * more FMRs are possibly available for mapping + * @flush_arg:Context passed to user's flush function. + * @cache:If set, FMRs may be reused after unmapping for identical map + * requests. 
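+ *
+ * For example (an illustrative sketch; the sizes and watermark are
+ * arbitrary):
+ *
+ *	struct ib_fmr_pool_param params = {
+ *		.max_pages_per_fmr = 64,
+ *		.page_shift        = PAGE_SHIFT,
+ *		.access            = IB_ACCESS_LOCAL_WRITE |
+ *				     IB_ACCESS_REMOTE_WRITE |
+ *				     IB_ACCESS_REMOTE_READ,
+ *		.pool_size         = 1024,
+ *		.dirty_watermark   = 32,
+ *		.cache             = 1,
+ *	};
+ *	struct ib_fmr_pool *pool = ib_create_fmr_pool(pd, &params);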
+ */ +struct ib_fmr_pool_param { + int max_pages_per_fmr; + int page_shift; + enum ib_access_flags access; + int pool_size; + int dirty_watermark; + void (*flush_function)(struct ib_fmr_pool *pool, + void *arg); + void *flush_arg; + unsigned cache:1; +}; + +struct ib_pool_fmr { + struct ib_fmr *fmr; + struct ib_fmr_pool *pool; + struct list_head list; + struct hlist_node cache_node; + int ref_count; + int remap_count; + u64 io_virtual_address; + int page_list_len; + u64 page_list[0]; +}; + +struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, + struct ib_fmr_pool_param *params); + +void ib_destroy_fmr_pool(struct ib_fmr_pool *pool); + +int ib_flush_fmr_pool(struct ib_fmr_pool *pool); + +struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, + u64 *page_list, + int list_len, + u64 io_virtual_address); + +int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr); + +#endif /* IB_FMR_POOL_H */ diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h new file mode 100644 index 0000000..6e35416 --- /dev/null +++ b/include/rdma/ib_hdrs.h @@ -0,0 +1,341 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef IB_HDRS_H +#define IB_HDRS_H + +#include +#include +#include + +#define IB_SEQ_NAK (3 << 29) + +/* AETH NAK opcode values */ +#define IB_RNR_NAK 0x20 +#define IB_NAK_PSN_ERROR 0x60 +#define IB_NAK_INVALID_REQUEST 0x61 +#define IB_NAK_REMOTE_ACCESS_ERROR 0x62 +#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 +#define IB_NAK_INVALID_RD_REQUEST 0x64 + +#define IB_BTH_REQ_ACK BIT(31) +#define IB_BTH_SOLICITED BIT(23) +#define IB_BTH_MIG_REQ BIT(22) + +#define IB_GRH_VERSION 6 +#define IB_GRH_VERSION_MASK 0xF +#define IB_GRH_VERSION_SHIFT 28 +#define IB_GRH_TCLASS_MASK 0xFF +#define IB_GRH_TCLASS_SHIFT 20 +#define IB_GRH_FLOW_MASK 0xFFFFF +#define IB_GRH_FLOW_SHIFT 0 +#define IB_GRH_NEXT_HDR 0x1B +#define IB_FECN_SHIFT 31 +#define IB_FECN_MASK 1 +#define IB_FECN_SMASK BIT(IB_FECN_SHIFT) +#define IB_BECN_SHIFT 30 +#define IB_BECN_MASK 1 +#define IB_BECN_SMASK BIT(IB_BECN_SHIFT) + +#define IB_AETH_CREDIT_SHIFT 24 +#define IB_AETH_CREDIT_MASK 0x1F +#define IB_AETH_CREDIT_INVAL 0x1F +#define IB_AETH_NAK_SHIFT 29 +#define IB_MSN_MASK 0xFFFFFF + +struct ib_reth { + __be64 vaddr; /* potentially unaligned */ + __be32 rkey; + __be32 length; +} __packed; + +struct ib_atomic_eth { + __be64 vaddr; /* potentially unaligned */ + __be32 rkey; + __be64 swap_data; /* potentially unaligned */ + __be64 compare_data; /* potentially unaligned */ +} __packed; + +union ib_ehdrs { + struct { + __be32 deth[2]; + __be32 imm_data; + } ud; + struct { + struct ib_reth reth; + __be32 imm_data; + } rc; + struct { + __be32 aeth; + __be64 atomic_ack_eth; /* potentially unaligned */ + } __packed at; + __be32 imm_data; + __be32 aeth; + __be32 ieth; + struct ib_atomic_eth atomic_eth; +} __packed; + +struct ib_other_headers { + __be32 bth[3]; + union ib_ehdrs u; +} __packed; + +struct ib_header { + __be16 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ib_other_headers oth; + } l; + struct ib_other_headers oth; + } u; +} __packed; + +/* accessors for unaligned __be64 items */ + +static inline u64 ib_u64_get(__be64 *p) +{ + return get_unaligned_be64(p); +} + +static inline void ib_u64_put(u64 val, __be64 *p) +{ + put_unaligned_be64(val, p); +} + +static inline u64 get_ib_reth_vaddr(struct ib_reth *reth) +{ + return ib_u64_get(&reth->vaddr); +} + +static inline void put_ib_reth_vaddr(u64 val, struct ib_reth *reth) +{ + ib_u64_put(val, &reth->vaddr); +} + +static inline u64 get_ib_ateth_vaddr(struct ib_atomic_eth *ateth) +{ + return ib_u64_get(&ateth->vaddr); +} + +static inline void put_ib_ateth_vaddr(u64 val, struct ib_atomic_eth *ateth) +{ + ib_u64_put(val, &ateth->vaddr); +} + +static inline u64 get_ib_ateth_swap(struct ib_atomic_eth *ateth) +{ + return ib_u64_get(&ateth->swap_data); +} + +static inline void put_ib_ateth_swap(u64 val, struct ib_atomic_eth *ateth) +{ + ib_u64_put(val, &ateth->swap_data); +} + +static inline u64 get_ib_ateth_compare(struct ib_atomic_eth *ateth) +{ + return ib_u64_get(&ateth->compare_data); +} + +static inline void put_ib_ateth_compare(u64 val, struct ib_atomic_eth *ateth) +{ + ib_u64_put(val, &ateth->compare_data); +} + +/* + * 9B/IB Packet Format + */ +#define IB_LNH_MASK 3 +#define IB_SC_MASK 0xf +#define IB_SC_SHIFT 12 +#define IB_SC5_MASK 0x10 +#define IB_SL_MASK 0xf +#define IB_SL_SHIFT 4 +#define IB_SL_SHIFT 4 +#define IB_LVER_MASK 0xf +#define IB_LVER_SHIFT 8 + +static inline u8 ib_get_lnh(struct ib_header *hdr) +{ + return (be16_to_cpu(hdr->lrh[0]) & IB_LNH_MASK); +} + +static inline u8 ib_get_sc(struct ib_header *hdr) +{ + return ((be16_to_cpu(hdr->lrh[0]) >> 
IB_SC_SHIFT) & IB_SC_MASK); +} + +static inline bool ib_is_sc5(u16 sc5) +{ + return !!(sc5 & IB_SC5_MASK); +} + +static inline u8 ib_get_sl(struct ib_header *hdr) +{ + return ((be16_to_cpu(hdr->lrh[0]) >> IB_SL_SHIFT) & IB_SL_MASK); +} + +static inline u16 ib_get_dlid(struct ib_header *hdr) +{ + return (be16_to_cpu(hdr->lrh[1])); +} + +static inline u16 ib_get_slid(struct ib_header *hdr) +{ + return (be16_to_cpu(hdr->lrh[3])); +} + +static inline u8 ib_get_lver(struct ib_header *hdr) +{ + return (u8)((be16_to_cpu(hdr->lrh[0]) >> IB_LVER_SHIFT) & + IB_LVER_MASK); +} + +static inline u16 ib_get_len(struct ib_header *hdr) +{ + return (u16)(be16_to_cpu(hdr->lrh[2])); +} + +static inline u32 ib_get_qkey(struct ib_other_headers *ohdr) +{ + return be32_to_cpu(ohdr->u.ud.deth[0]); +} + +static inline u32 ib_get_sqpn(struct ib_other_headers *ohdr) +{ + return ((be32_to_cpu(ohdr->u.ud.deth[1])) & IB_QPN_MASK); +} + +/* + * BTH + */ +#define IB_BTH_OPCODE_MASK 0xff +#define IB_BTH_OPCODE_SHIFT 24 +#define IB_BTH_PAD_MASK 3 +#define IB_BTH_PKEY_MASK 0xffff +#define IB_BTH_PAD_SHIFT 20 +#define IB_BTH_A_MASK 1 +#define IB_BTH_A_SHIFT 31 +#define IB_BTH_M_MASK 1 +#define IB_BTH_M_SHIFT 22 +#define IB_BTH_SE_MASK 1 +#define IB_BTH_SE_SHIFT 23 +#define IB_BTH_TVER_MASK 0xf +#define IB_BTH_TVER_SHIFT 16 + +static inline u8 ib_bth_get_pad(struct ib_other_headers *ohdr) +{ + return ((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_PAD_SHIFT) & + IB_BTH_PAD_MASK); +} + +static inline u16 ib_bth_get_pkey(struct ib_other_headers *ohdr) +{ + return (be32_to_cpu(ohdr->bth[0]) & IB_BTH_PKEY_MASK); +} + +static inline u8 ib_bth_get_opcode(struct ib_other_headers *ohdr) +{ + return ((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_OPCODE_SHIFT) & + IB_BTH_OPCODE_MASK); +} + +static inline u8 ib_bth_get_ackreq(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[2]) >> IB_BTH_A_SHIFT) & + IB_BTH_A_MASK); +} + +static inline u8 ib_bth_get_migreq(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_M_SHIFT) & + IB_BTH_M_MASK); +} + +static inline u8 ib_bth_get_se(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_SE_SHIFT) & + IB_BTH_SE_MASK); +} + +static inline u32 ib_bth_get_psn(struct ib_other_headers *ohdr) +{ + return (u32)(be32_to_cpu(ohdr->bth[2])); +} + +static inline u32 ib_bth_get_qpn(struct ib_other_headers *ohdr) +{ + return (u32)((be32_to_cpu(ohdr->bth[1])) & IB_QPN_MASK); +} + +static inline bool ib_bth_get_becn(struct ib_other_headers *ohdr) +{ + return (ohdr->bth[1]) & cpu_to_be32(IB_BECN_SMASK); +} + +static inline bool ib_bth_get_fecn(struct ib_other_headers *ohdr) +{ + return (ohdr->bth[1]) & cpu_to_be32(IB_FECN_SMASK); +} + +static inline u8 ib_bth_get_tver(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_TVER_SHIFT) & + IB_BTH_TVER_MASK); +} + +static inline bool ib_bth_is_solicited(struct ib_other_headers *ohdr) +{ + return ohdr->bth[0] & cpu_to_be32(IB_BTH_SOLICITED); +} + +static inline bool ib_bth_is_migration(struct ib_other_headers *ohdr) +{ + return ohdr->bth[0] & cpu_to_be32(IB_BTH_MIG_REQ); +} +#endif /* IB_HDRS_H */ diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h new file mode 100644 index 0000000..2f4f176 --- /dev/null +++ b/include/rdma/ib_mad.h @@ -0,0 +1,891 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. 
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2006 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_MAD_H) +#define IB_MAD_H + +#include + +#include +#include + +/* Management base versions */ +#define IB_MGMT_BASE_VERSION 1 +#define OPA_MGMT_BASE_VERSION 0x80 + +#define OPA_SM_CLASS_VERSION 0x80 + +/* Management classes */ +#define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01 +#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81 +#define IB_MGMT_CLASS_SUBN_ADM 0x03 +#define IB_MGMT_CLASS_PERF_MGMT 0x04 +#define IB_MGMT_CLASS_BM 0x05 +#define IB_MGMT_CLASS_DEVICE_MGMT 0x06 +#define IB_MGMT_CLASS_CM 0x07 +#define IB_MGMT_CLASS_SNMP 0x08 +#define IB_MGMT_CLASS_DEVICE_ADM 0x10 +#define IB_MGMT_CLASS_BOOT_MGMT 0x11 +#define IB_MGMT_CLASS_BIS 0x12 +#define IB_MGMT_CLASS_CONG_MGMT 0x21 +#define IB_MGMT_CLASS_VENDOR_RANGE2_START 0x30 +#define IB_MGMT_CLASS_VENDOR_RANGE2_END 0x4F + +#define IB_OPENIB_OUI (0x001405) + +/* Management methods */ +#define IB_MGMT_METHOD_GET 0x01 +#define IB_MGMT_METHOD_SET 0x02 +#define IB_MGMT_METHOD_GET_RESP 0x81 +#define IB_MGMT_METHOD_SEND 0x03 +#define IB_MGMT_METHOD_TRAP 0x05 +#define IB_MGMT_METHOD_REPORT 0x06 +#define IB_MGMT_METHOD_REPORT_RESP 0x86 +#define IB_MGMT_METHOD_TRAP_REPRESS 0x07 + +#define IB_MGMT_METHOD_RESP 0x80 +#define IB_BM_ATTR_MOD_RESP cpu_to_be32(1) + +#define IB_MGMT_MAX_METHODS 128 + +/* MAD Status field bit masks */ +#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000 +#define IB_MGMT_MAD_STATUS_BUSY 0x0001 +#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002 +#define IB_MGMT_MAD_STATUS_BAD_VERSION 0x0004 +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008 +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c +#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c + +/* RMPP information */ +#define IB_MGMT_RMPP_VERSION 1 + +#define IB_MGMT_RMPP_TYPE_DATA 1 +#define IB_MGMT_RMPP_TYPE_ACK 2 +#define IB_MGMT_RMPP_TYPE_STOP 3 +#define IB_MGMT_RMPP_TYPE_ABORT 4 + +#define IB_MGMT_RMPP_FLAG_ACTIVE 1 +#define IB_MGMT_RMPP_FLAG_FIRST (1<<1) +#define IB_MGMT_RMPP_FLAG_LAST (1<<2) + +#define IB_MGMT_RMPP_NO_RESPTIME 0x1F + +#define IB_MGMT_RMPP_STATUS_SUCCESS 0 +#define IB_MGMT_RMPP_STATUS_RESX 1 +#define 
IB_MGMT_RMPP_STATUS_ABORT_MIN 118 +#define IB_MGMT_RMPP_STATUS_T2L 118 +#define IB_MGMT_RMPP_STATUS_BAD_LEN 119 +#define IB_MGMT_RMPP_STATUS_BAD_SEG 120 +#define IB_MGMT_RMPP_STATUS_BADT 121 +#define IB_MGMT_RMPP_STATUS_W2S 122 +#define IB_MGMT_RMPP_STATUS_S2B 123 +#define IB_MGMT_RMPP_STATUS_BAD_STATUS 124 +#define IB_MGMT_RMPP_STATUS_UNV 125 +#define IB_MGMT_RMPP_STATUS_TMR 126 +#define IB_MGMT_RMPP_STATUS_UNSPEC 127 +#define IB_MGMT_RMPP_STATUS_ABORT_MAX 127 + +#define IB_QP0 0 +#define IB_QP1 cpu_to_be32(1) +#define IB_QP1_QKEY 0x80010000 +#define IB_QP_SET_QKEY 0x80000000 + +#define IB_DEFAULT_PKEY_PARTIAL 0x7FFF +#define IB_DEFAULT_PKEY_FULL 0xFFFF + +/* + * Generic trap/notice types + */ +#define IB_NOTICE_TYPE_FATAL 0x80 +#define IB_NOTICE_TYPE_URGENT 0x81 +#define IB_NOTICE_TYPE_SECURITY 0x82 +#define IB_NOTICE_TYPE_SM 0x83 +#define IB_NOTICE_TYPE_INFO 0x84 + +/* + * Generic trap/notice producers + */ +#define IB_NOTICE_PROD_CA cpu_to_be16(1) +#define IB_NOTICE_PROD_SWITCH cpu_to_be16(2) +#define IB_NOTICE_PROD_ROUTER cpu_to_be16(3) +#define IB_NOTICE_PROD_CLASS_MGR cpu_to_be16(4) + +enum { + IB_MGMT_MAD_HDR = 24, + IB_MGMT_MAD_DATA = 232, + IB_MGMT_RMPP_HDR = 36, + IB_MGMT_RMPP_DATA = 220, + IB_MGMT_VENDOR_HDR = 40, + IB_MGMT_VENDOR_DATA = 216, + IB_MGMT_SA_HDR = 56, + IB_MGMT_SA_DATA = 200, + IB_MGMT_DEVICE_HDR = 64, + IB_MGMT_DEVICE_DATA = 192, + IB_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + IB_MGMT_MAD_DATA, + OPA_MGMT_MAD_DATA = 2024, + OPA_MGMT_RMPP_DATA = 2012, + OPA_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + OPA_MGMT_MAD_DATA, +}; + +struct ib_mad_hdr { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + __be16 class_specific; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; +}; + +struct ib_rmpp_hdr { + u8 rmpp_version; + u8 rmpp_type; + u8 rmpp_rtime_flags; + u8 rmpp_status; + __be32 seg_num; + __be32 paylen_newwin; +}; + +typedef u64 __bitwise ib_sa_comp_mask; + +#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << (n))) + +/* + * ib_sa_hdr and ib_sa_mad structures must be packed because they have + * 64-bit fields that are only 32-bit aligned. 64-bit architectures will + * lay them out wrong otherwise. 
(And unfortunately they are sent on + * the wire so we can't change the layout) + */ +struct ib_sa_hdr { + __be64 sm_key; + __be16 attr_offset; + __be16 reserved; + ib_sa_comp_mask comp_mask; +} __attribute__ ((packed)); + +struct ib_mad { + struct ib_mad_hdr mad_hdr; + u8 data[IB_MGMT_MAD_DATA]; +}; + +struct opa_mad { + struct ib_mad_hdr mad_hdr; + u8 data[OPA_MGMT_MAD_DATA]; +}; + +struct ib_rmpp_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 data[IB_MGMT_RMPP_DATA]; +}; + +struct opa_rmpp_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 data[OPA_MGMT_RMPP_DATA]; +}; + +struct ib_sa_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + struct ib_sa_hdr sa_hdr; + u8 data[IB_MGMT_SA_DATA]; +} __attribute__ ((packed)); + +struct ib_vendor_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 reserved; + u8 oui[3]; + u8 data[IB_MGMT_VENDOR_DATA]; +}; + +#define IB_MGMT_CLASSPORTINFO_ATTR_ID cpu_to_be16(0x0001) + +#define IB_CLASS_PORT_INFO_RESP_TIME_MASK 0x1F +#define IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE 5 + +struct ib_class_port_info { + u8 base_version; + u8 class_version; + __be16 capability_mask; + /* 27 bits for cap_mask2, 5 bits for resp_time */ + __be32 cap_mask2_resp_time; + u8 redirect_gid[16]; + __be32 redirect_tcslfl; + __be16 redirect_lid; + __be16 redirect_pkey; + __be32 redirect_qp; + __be32 redirect_qkey; + u8 trap_gid[16]; + __be32 trap_tcslfl; + __be16 trap_lid; + __be16 trap_pkey; + __be32 trap_hlqp; + __be32 trap_qkey; +}; + +#define OPA_CLASS_PORT_INFO_PR_SUPPORT BIT(26) + +struct opa_class_port_info { + u8 base_version; + u8 class_version; + __be16 cap_mask; + __be32 cap_mask2_resp_time; + + u8 redirect_gid[16]; + __be32 redirect_tc_fl; + __be32 redirect_lid; + __be32 redirect_sl_qp; + __be32 redirect_qkey; + + u8 trap_gid[16]; + __be32 trap_tc_fl; + __be32 trap_lid; + __be32 trap_hl_qp; + __be32 trap_qkey; + + __be16 trap_pkey; + __be16 redirect_pkey; + + u8 trap_sl_rsvd; + u8 reserved[3]; +} __packed; + +/** + * ib_get_cpi_resp_time - Returns the resp_time value from + * cap_mask2_resp_time in ib_class_port_info. + * @cpi: A struct ib_class_port_info mad. + */ +static inline u8 ib_get_cpi_resp_time(struct ib_class_port_info *cpi) +{ + return (u8)(be32_to_cpu(cpi->cap_mask2_resp_time) & + IB_CLASS_PORT_INFO_RESP_TIME_MASK); +} + +/** + * ib_set_cpi_resptime - Sets the response time in an + * ib_class_port_info mad. + * @cpi: A struct ib_class_port_info. + * @rtime: The response time to set. + */ +static inline void ib_set_cpi_resp_time(struct ib_class_port_info *cpi, + u8 rtime) +{ + cpi->cap_mask2_resp_time = + (cpi->cap_mask2_resp_time & + cpu_to_be32(~IB_CLASS_PORT_INFO_RESP_TIME_MASK)) | + cpu_to_be32(rtime & IB_CLASS_PORT_INFO_RESP_TIME_MASK); +} + +/** + * ib_get_cpi_capmask2 - Returns the capmask2 value from + * cap_mask2_resp_time in ib_class_port_info. + * @cpi: A struct ib_class_port_info mad. + */ +static inline u32 ib_get_cpi_capmask2(struct ib_class_port_info *cpi) +{ + return (be32_to_cpu(cpi->cap_mask2_resp_time) >> + IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); +} + +/** + * ib_set_cpi_capmask2 - Sets the capmask2 in an + * ib_class_port_info mad. + * @cpi: A struct ib_class_port_info. + * @capmask2: The capmask2 to set. 
+ */ +static inline void ib_set_cpi_capmask2(struct ib_class_port_info *cpi, + u32 capmask2) +{ + cpi->cap_mask2_resp_time = + (cpi->cap_mask2_resp_time & + cpu_to_be32(IB_CLASS_PORT_INFO_RESP_TIME_MASK)) | + cpu_to_be32(capmask2 << + IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); +} + +/** + * opa_get_cpi_capmask2 - Returns the capmask2 value from + * cap_mask2_resp_time in ib_class_port_info. + * @cpi: A struct opa_class_port_info mad. + */ +static inline u32 opa_get_cpi_capmask2(struct opa_class_port_info *cpi) +{ + return (be32_to_cpu(cpi->cap_mask2_resp_time) >> + IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); +} + +struct ib_mad_notice_attr { + u8 generic_type; + u8 prod_type_msb; + __be16 prod_type_lsb; + __be16 trap_num; + __be16 issuer_lid; + __be16 toggle_count; + + union { + struct { + u8 details[54]; + } raw_data; + + struct { + __be16 reserved; + __be16 lid; /* where violation happened */ + u8 port_num; /* where violation happened */ + } __packed ntc_129_131; + + struct { + __be16 reserved; + __be16 lid; /* LID where change occurred */ + u8 reserved2; + u8 local_changes; /* low bit - local changes */ + __be32 new_cap_mask; /* new capability mask */ + u8 reserved3; + u8 change_flags; /* low 3 bits only */ + } __packed ntc_144; + + struct { + __be16 reserved; + __be16 lid; /* lid where sys guid changed */ + __be16 reserved2; + __be64 new_sys_guid; + } __packed ntc_145; + + struct { + __be16 reserved; + __be16 lid; + __be16 dr_slid; + u8 method; + u8 reserved2; + __be16 attr_id; + __be32 attr_mod; + __be64 mkey; + u8 reserved3; + u8 dr_trunc_hop; + u8 dr_rtn_path[30]; + } __packed ntc_256; + + struct { + __be16 reserved; + __be16 lid1; + __be16 lid2; + __be32 key; + __be32 sl_qp1; /* SL: high 4 bits */ + __be32 qp2; /* high 8 bits reserved */ + union ib_gid gid1; + union ib_gid gid2; + } __packed ntc_257_258; + + } details; +}; + +/** + * ib_mad_send_buf - MAD data buffer and work request for sends. + * @next: A pointer used to chain together MADs for posting. + * @mad: References an allocated MAD data buffer for MADs that do not have + * RMPP active. For MADs using RMPP, references the common and management + * class specific headers. + * @mad_agent: MAD agent that allocated the buffer. + * @ah: The address handle to use when sending the MAD. + * @context: User-controlled context fields. + * @hdr_len: Indicates the size of the data header of the MAD. This length + * includes the common MAD, RMPP, and class specific headers. + * @data_len: Indicates the total size of user-transferred data. + * @seg_count: The number of RMPP segments allocated for this send. + * @seg_size: Size of the data in each RMPP segment. This does not include + * class specific headers. + * @seg_rmpp_size: Size of each RMPP segment including the class specific + * headers. + * @timeout_ms: Time to wait for a response. + * @retries: Number of times to retry a request for a response. For MADs + * using RMPP, this applies per window. On completion, returns the number + * of retries needed to complete the transfer. + * + * Users are responsible for initializing the MAD buffer itself, with the + * exception of any RMPP header. Additional segment buffer space allocated + * beyond data_len is padding. 
+ */ +struct ib_mad_send_buf { + struct ib_mad_send_buf *next; + void *mad; + struct ib_mad_agent *mad_agent; + struct ib_ah *ah; + void *context[2]; + int hdr_len; + int data_len; + int seg_count; + int seg_size; + int seg_rmpp_size; + int timeout_ms; + int retries; +}; + +/** + * ib_response_mad - Returns if the specified MAD has been generated in + * response to a sent request or trap. + */ +int ib_response_mad(const struct ib_mad_hdr *hdr); + +/** + * ib_get_rmpp_resptime - Returns the RMPP response time. + * @rmpp_hdr: An RMPP header. + */ +static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr) +{ + return rmpp_hdr->rmpp_rtime_flags >> 3; +} + +/** + * ib_get_rmpp_flags - Returns the RMPP flags. + * @rmpp_hdr: An RMPP header. + */ +static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr) +{ + return rmpp_hdr->rmpp_rtime_flags & 0x7; +} + +/** + * ib_set_rmpp_resptime - Sets the response time in an RMPP header. + * @rmpp_hdr: An RMPP header. + * @rtime: The response time to set. + */ +static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime) +{ + rmpp_hdr->rmpp_rtime_flags = ib_get_rmpp_flags(rmpp_hdr) | (rtime << 3); +} + +/** + * ib_set_rmpp_flags - Sets the flags in an RMPP header. + * @rmpp_hdr: An RMPP header. + * @flags: The flags to set. + */ +static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags) +{ + rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF8) | + (flags & 0x7); +} + +struct ib_mad_agent; +struct ib_mad_send_wc; +struct ib_mad_recv_wc; + +/** + * ib_mad_send_handler - callback handler for a sent MAD. + * @mad_agent: MAD agent that sent the MAD. + * @mad_send_wc: Send work completion information on the sent MAD. + */ +typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_send_wc); + +/** + * ib_mad_snoop_handler - Callback handler for snooping sent MADs. + * @mad_agent: MAD agent that snooped the MAD. + * @send_buf: send MAD data buffer. + * @mad_send_wc: Work completion information on the sent MAD. Valid + * only for snooping that occurs on a send completion. + * + * Clients snooping MADs should not modify data referenced by the @send_buf + * or @mad_send_wc. + */ +typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_send_wc *mad_send_wc); + +/** + * ib_mad_recv_handler - callback handler for a received MAD. + * @mad_agent: MAD agent requesting the received MAD. + * @send_buf: Send buffer if found, else NULL + * @mad_recv_wc: Received work completion information on the received MAD. + * + * MADs received in response to a send request operation will be handed to + * the user before the send operation completes. All data buffers given + * to registered agents through this routine are owned by the receiving + * client, except for snooping agents. Clients snooping MADs should not + * modify the data referenced by @mad_recv_wc. + */ +typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_recv_wc *mad_recv_wc); + +/** + * ib_mad_agent - Used to track MAD registration with the access layer. + * @device: Reference to device registration is on. + * @qp: Reference to QP used for sending and receiving MADs. + * @mr: Memory region for system memory usable for DMA. + * @recv_handler: Callback handler for a received MAD. + * @send_handler: Callback handler for a sent MAD. + * @snoop_handler: Callback handler for snooped sent MADs. 
+ * @context: User-specified context associated with this registration. + * @hi_tid: Access layer assigned transaction ID for this client. + * Unsolicited MADs sent by this client will have the upper 32-bits + * of their TID set to this value. + * @flags: registration flags + * @port_num: Port number on which QP is registered + * @rmpp_version: If set, indicates the RMPP version used by this agent. + */ +enum { + IB_MAD_USER_RMPP = IB_USER_MAD_USER_RMPP, +}; +struct ib_mad_agent { + struct ib_device *device; + struct ib_qp *qp; + ib_mad_recv_handler recv_handler; + ib_mad_send_handler send_handler; + ib_mad_snoop_handler snoop_handler; + void *context; + u32 hi_tid; + u32 flags; + u8 port_num; + u8 rmpp_version; + void *security; + bool smp_allowed; + bool lsm_nb_reg; + struct notifier_block lsm_nb; +}; + +/** + * ib_mad_send_wc - MAD send completion information. + * @send_buf: Send MAD data buffer associated with the send MAD request. + * @status: Completion status. + * @vendor_err: Optional vendor error information returned with a failed + * request. + */ +struct ib_mad_send_wc { + struct ib_mad_send_buf *send_buf; + enum ib_wc_status status; + u32 vendor_err; +}; + +/** + * ib_mad_recv_buf - received MAD buffer information. + * @list: Reference to next data buffer for a received RMPP MAD. + * @grh: References a data buffer containing the global route header. + * The data refereced by this buffer is only valid if the GRH is + * valid. + * @mad: References the start of the received MAD. + */ +struct ib_mad_recv_buf { + struct list_head list; + struct ib_grh *grh; + union { + struct ib_mad *mad; + struct opa_mad *opa_mad; + }; +}; + +/** + * ib_mad_recv_wc - received MAD information. + * @wc: Completion information for the received data. + * @recv_buf: Specifies the location of the received data buffer(s). + * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers. + * @mad_len: The length of the received MAD, without duplicated headers. + * @mad_seg_size: The size of individual MAD segments + * + * For received response, the wr_id contains a pointer to the ib_mad_send_buf + * for the corresponding send request. + */ +struct ib_mad_recv_wc { + struct ib_wc *wc; + struct ib_mad_recv_buf recv_buf; + struct list_head rmpp_list; + int mad_len; + size_t mad_seg_size; +}; + +/** + * ib_mad_reg_req - MAD registration request + * @mgmt_class: Indicates which management class of MADs should be receive + * by the caller. This field is only required if the user wishes to + * receive unsolicited MADs, otherwise it should be 0. + * @mgmt_class_version: Indicates which version of MADs for the given + * management class to receive. + * @oui: Indicates IEEE OUI when mgmt_class is a vendor class + * in the range from 0x30 to 0x4f. Otherwise not used. + * @method_mask: The caller will receive unsolicited MADs for any method + * where @method_mask = 1. + * + */ +struct ib_mad_reg_req { + u8 mgmt_class; + u8 mgmt_class_version; + u8 oui[3]; + DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS); +}; + +/** + * ib_register_mad_agent - Register to send/receive MADs. + * @device: The device to register with. + * @port_num: The port on the specified device to use. + * @qp_type: Specifies which QP to access. Must be either + * IB_QPT_SMI or IB_QPT_GSI. + * @mad_reg_req: Specifies which unsolicited MADs should be received + * by the caller. This parameter may be NULL if the caller only + * wishes to receive solicited responses. 
+ * @rmpp_version: If set, indicates that the client will send + * and receive MADs that contain the RMPP header for the given version. + * If set to 0, indicates that RMPP is not used by this client. + * @send_handler: The completion callback routine invoked after a send + * request has completed. + * @recv_handler: The completion callback routine invoked for a received + * MAD. + * @context: User specified context associated with the registration. + * @registration_flags: Registration flags to set for this agent + */ +struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, + u8 port_num, + enum ib_qp_type qp_type, + struct ib_mad_reg_req *mad_reg_req, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context, + u32 registration_flags); + +enum ib_mad_snoop_flags { + /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/ + /*IB_MAD_SNOOP_RMPP_SENDS = (1<<1),*/ + IB_MAD_SNOOP_SEND_COMPLETIONS = (1<<2), + /*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/ + IB_MAD_SNOOP_RECVS = (1<<4) + /*IB_MAD_SNOOP_RMPP_RECVS = (1<<5),*/ + /*IB_MAD_SNOOP_REDIRECTED_QPS = (1<<6)*/ +}; + +/** + * ib_register_mad_snoop - Register to snoop sent and received MADs. + * @device: The device to register with. + * @port_num: The port on the specified device to use. + * @qp_type: Specifies which QP traffic to snoop. Must be either + * IB_QPT_SMI or IB_QPT_GSI. + * @mad_snoop_flags: Specifies information where snooping occurs. + * @send_handler: The callback routine invoked for a snooped send. + * @recv_handler: The callback routine invoked for a snooped receive. + * @context: User specified context associated with the registration. + */ +struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, + u8 port_num, + enum ib_qp_type qp_type, + int mad_snoop_flags, + ib_mad_snoop_handler snoop_handler, + ib_mad_recv_handler recv_handler, + void *context); + +/** + * ib_unregister_mad_agent - Unregisters a client from using MAD services. + * @mad_agent: Corresponding MAD registration request to deregister. + * + * After invoking this routine, MAD services are no longer usable by the + * client on the associated QP. + */ +void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent); + +/** + * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated + * with the registered client. + * @send_buf: Specifies the information needed to send the MAD(s). + * @bad_send_buf: Specifies the MAD on which an error was encountered. This + * parameter is optional if only a single MAD is posted. + * + * Sent MADs are not guaranteed to complete in the order that they were posted. + * + * If the MAD requires RMPP, the data buffer should contain a single copy + * of the common MAD, RMPP, and class specific headers, followed by the class + * defined data. If the class defined data would not divide evenly into + * RMPP segments, then space must be allocated at the end of the referenced + * buffer for any required padding. To indicate the amount of class defined + * data being transferred, the paylen_newwin field in the RMPP header should + * be set to the size of the class specific header plus the amount of class + * defined data being transferred. The paylen_newwin field should be + * specified in network-byte order. + */ +int ib_post_send_mad(struct ib_mad_send_buf *send_buf, + struct ib_mad_send_buf **bad_send_buf); + + +/** + * ib_free_recv_mad - Returns data buffers used to receive a MAD. + * @mad_recv_wc: Work completion information for a received MAD. 
+ * + * Clients receiving MADs through their ib_mad_recv_handler must call this + * routine to return the work completion buffers to the access layer. + */ +void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc); + +/** + * ib_cancel_mad - Cancels an outstanding send MAD operation. + * @mad_agent: Specifies the registration associated with sent MAD. + * @send_buf: Indicates the MAD to cancel. + * + * MADs will be returned to the user through the corresponding + * ib_mad_send_handler. + */ +void ib_cancel_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf); + +/** + * ib_modify_mad - Modifies an outstanding send MAD operation. + * @mad_agent: Specifies the registration associated with sent MAD. + * @send_buf: Indicates the MAD to modify. + * @timeout_ms: New timeout value for sent MAD. + * + * This call will reset the timeout value for a sent MAD to the specified + * value. + */ +int ib_modify_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, u32 timeout_ms); + +/** + * ib_redirect_mad_qp - Registers a QP for MAD services. + * @qp: Reference to a QP that requires MAD services. + * @rmpp_version: If set, indicates that the client will send + * and receive MADs that contain the RMPP header for the given version. + * If set to 0, indicates that RMPP is not used by this client. + * @send_handler: The completion callback routine invoked after a send + * request has completed. + * @recv_handler: The completion callback routine invoked for a received + * MAD. + * @context: User specified context associated with the registration. + * + * Use of this call allows clients to use MAD services, such as RMPP, + * on user-owned QPs. After calling this routine, users may send + * MADs on the specified QP by calling ib_mad_post_send. + */ +struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context); + +/** + * ib_process_mad_wc - Processes a work completion associated with a + * MAD sent or received on a redirected QP. + * @mad_agent: Specifies the registered MAD service using the redirected QP. + * @wc: References a work completion associated with a sent or received + * MAD segment. + * + * This routine is used to complete or continue processing on a MAD request. + * If the work completion is associated with a send operation, calling + * this routine is required to continue an RMPP transfer or to wait for a + * corresponding response, if it is a request. If the work completion is + * associated with a receive operation, calling this routine is required to + * process an inbound or outbound RMPP transfer, or to match a response MAD + * with its corresponding request. + */ +int ib_process_mad_wc(struct ib_mad_agent *mad_agent, + struct ib_wc *wc); + +/** + * ib_create_send_mad - Allocate and initialize a data buffer and work request + * for sending a MAD. + * @mad_agent: Specifies the registered MAD service to associate with the MAD. + * @remote_qpn: Specifies the QPN of the receiving node. + * @pkey_index: Specifies which PKey the MAD will be sent using. This field + * is valid only if the remote_qpn is QP 1. + * @rmpp_active: Indicates if the send will enable RMPP. + * @hdr_len: Indicates the size of the data header of the MAD. This length + * should include the common MAD header, RMPP header, plus any class + * specific header. + * @data_len: Indicates the size of any user-transferred data. 
The call will + * automatically adjust the allocated buffer size to account for any + * additional padding that may be necessary. + * @gfp_mask: GFP mask used for the memory allocation. + * @base_version: Base Version of this MAD + * + * This routine allocates a MAD for sending. The returned MAD send buffer + * will reference a data buffer usable for sending a MAD, along + * with an initialized work request structure. Users may modify the returned + * MAD data buffer before posting the send. + * + * The returned MAD header, class specific headers, and any padding will be + * cleared. Users are responsible for initializing the common MAD header, + * any class specific header, and MAD data area. + * If @rmpp_active is set, the RMPP header will be initialized for sending. + */ +struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent, + u32 remote_qpn, u16 pkey_index, + int rmpp_active, + int hdr_len, int data_len, + gfp_t gfp_mask, + u8 base_version); + +/** + * ib_is_mad_class_rmpp - returns whether given management class + * supports RMPP. + * @mgmt_class: management class + * + * This routine returns whether the management class supports RMPP. + */ +int ib_is_mad_class_rmpp(u8 mgmt_class); + +/** + * ib_get_mad_data_offset - returns the data offset for a given + * management class. + * @mgmt_class: management class + * + * This routine returns the data offset in the MAD for the management + * class requested. + */ +int ib_get_mad_data_offset(u8 mgmt_class); + +/** + * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment. + * @send_buf: Previously allocated send data buffer. + * @seg_num: number of segment to return + * + * This routine returns a pointer to the data buffer of an RMPP MAD. + * Users must provide synchronization to @send_buf around this call. + */ +void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num); + +/** + * ib_free_send_mad - Returns data buffers used to send a MAD. + * @send_buf: Previously allocated send data buffer. + */ +void ib_free_send_mad(struct ib_mad_send_buf *send_buf); + +/** + * ib_mad_kernel_rmpp_agent - Returns if the agent is performing RMPP. + * @agent: the agent in question + * @return: true if agent is performing rmpp, false otherwise. + */ +int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent); + +#endif /* IB_MAD_H */ diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h new file mode 100644 index 0000000..8ebf84a --- /dev/null +++ b/include/rdma/ib_marshall.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
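/*
 * Illustrative sketch (not part of this patch): the basic ib_mad.h send
 * flow documented above -- register a GSI agent, allocate a send buffer
 * with ib_create_send_mad(), post it, and free it from the send handler.
 * The example_* names, the one second timeout and the retry count are
 * assumptions made for the example; a real client also builds the MAD
 * header and payload in msg->mad and manages the agent's lifetime
 * (ib_unregister_mad_agent()) elsewhere.
 */
#include <linux/err.h>
#include <linux/gfp.h>
#include <rdma/ib_mad.h>

static void example_send_handler(struct ib_mad_agent *agent,
				 struct ib_mad_send_wc *mad_send_wc)
{
	/* the send buffer belongs to us again once the send completes */
	ib_free_send_mad(mad_send_wc->send_buf);
}

static void example_recv_handler(struct ib_mad_agent *agent,
				 struct ib_mad_send_buf *send_buf,
				 struct ib_mad_recv_wc *mad_recv_wc)
{
	/* consume the response, then hand the buffers back to the core */
	ib_free_recv_mad(mad_recv_wc);
}

static struct ib_mad_agent *example_register(struct ib_device *dev, u8 port)
{
	/* NULL reg_req: solicited responses only; rmpp_version 0: no RMPP */
	return ib_register_mad_agent(dev, port, IB_QPT_GSI, NULL, 0,
				     example_send_handler,
				     example_recv_handler, NULL, 0);
}

static int example_post_one(struct ib_mad_agent *agent, struct ib_ah *ah,
			    u32 remote_qpn, u16 pkey_index)
{
	struct ib_mad_send_buf *msg;
	int ret;

	msg = ib_create_send_mad(agent, remote_qpn, pkey_index, 0,
				 IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
				 GFP_KERNEL, IB_MGMT_BASE_VERSION);
	if (IS_ERR(msg))
		return PTR_ERR(msg);

	/* caller fills in msg->mad (struct ib_mad_hdr + class payload) here */
	msg->ah = ah;
	msg->timeout_ms = 1000;
	msg->retries = 3;

	ret = ib_post_send_mad(msg, NULL);
	if (ret)
		ib_free_send_mad(msg);
	return ret;
}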
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_USER_MARSHALL_H) +#define IB_USER_MARSHALL_H + +#include +#include +#include +#include + +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, + struct ib_qp_attr *src); + +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, + struct rdma_ah_attr *src); + +void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct sa_path_rec *src); + +void ib_copy_path_rec_from_user(struct sa_path_rec *dst, + struct ib_user_path_rec *src); + +#endif /* IB_USER_MARSHALL_H */ diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h new file mode 100644 index 0000000..7ea1382 --- /dev/null +++ b/include/rdma/ib_pack.h @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_PACK_H +#define IB_PACK_H + +#include +#include + +enum { + IB_LRH_BYTES = 8, + IB_ETH_BYTES = 14, + IB_VLAN_BYTES = 4, + IB_GRH_BYTES = 40, + IB_IP4_BYTES = 20, + IB_UDP_BYTES = 8, + IB_BTH_BYTES = 12, + IB_DETH_BYTES = 8, + IB_EXT_ATOMICETH_BYTES = 28, + IB_EXT_XRC_BYTES = 4, + IB_ICRC_BYTES = 4 +}; + +struct ib_field { + size_t struct_offset_bytes; + size_t struct_size_bytes; + int offset_words; + int offset_bits; + int size_bits; + char *field_name; +}; + +#define RESERVED \ + .field_name = "reserved" + +/* + * This macro cleans up the definitions of constants for BTH opcodes. + * It is used to define constants such as IB_OPCODE_UD_SEND_ONLY, + * which becomes IB_OPCODE_UD + IB_OPCODE_SEND_ONLY, and this gives + * the correct value. + * + * In short, user code should use the constants defined using the + * macro rather than worrying about adding together other constants. 
+*/ +#define IB_OPCODE(transport, op) \ + IB_OPCODE_ ## transport ## _ ## op = \ + IB_OPCODE_ ## transport + IB_OPCODE_ ## op + +enum { + /* transport types -- just used to define real constants */ + IB_OPCODE_RC = 0x00, + IB_OPCODE_UC = 0x20, + IB_OPCODE_RD = 0x40, + IB_OPCODE_UD = 0x60, + /* per IBTA 1.3 vol 1 Table 38, A10.3.2 */ + IB_OPCODE_CNP = 0x80, + /* Manufacturer specific */ + IB_OPCODE_MSP = 0xe0, + + /* operations -- just used to define real constants */ + IB_OPCODE_SEND_FIRST = 0x00, + IB_OPCODE_SEND_MIDDLE = 0x01, + IB_OPCODE_SEND_LAST = 0x02, + IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03, + IB_OPCODE_SEND_ONLY = 0x04, + IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05, + IB_OPCODE_RDMA_WRITE_FIRST = 0x06, + IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07, + IB_OPCODE_RDMA_WRITE_LAST = 0x08, + IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09, + IB_OPCODE_RDMA_WRITE_ONLY = 0x0a, + IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b, + IB_OPCODE_RDMA_READ_REQUEST = 0x0c, + IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d, + IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e, + IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f, + IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10, + IB_OPCODE_ACKNOWLEDGE = 0x11, + IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, + IB_OPCODE_COMPARE_SWAP = 0x13, + IB_OPCODE_FETCH_ADD = 0x14, + /* opcode 0x15 is reserved */ + IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, + IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, + + /* real constants follow -- see comment about above IB_OPCODE() + macro for more details */ + + /* RC */ + IB_OPCODE(RC, SEND_FIRST), + IB_OPCODE(RC, SEND_MIDDLE), + IB_OPCODE(RC, SEND_LAST), + IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(RC, SEND_ONLY), + IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_WRITE_FIRST), + IB_OPCODE(RC, RDMA_WRITE_MIDDLE), + IB_OPCODE(RC, RDMA_WRITE_LAST), + IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_WRITE_ONLY), + IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_READ_REQUEST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE), + IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY), + IB_OPCODE(RC, ACKNOWLEDGE), + IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE), + IB_OPCODE(RC, COMPARE_SWAP), + IB_OPCODE(RC, FETCH_ADD), + IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), + IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), + + /* UC */ + IB_OPCODE(UC, SEND_FIRST), + IB_OPCODE(UC, SEND_MIDDLE), + IB_OPCODE(UC, SEND_LAST), + IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(UC, SEND_ONLY), + IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(UC, RDMA_WRITE_FIRST), + IB_OPCODE(UC, RDMA_WRITE_MIDDLE), + IB_OPCODE(UC, RDMA_WRITE_LAST), + IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(UC, RDMA_WRITE_ONLY), + IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + + /* RD */ + IB_OPCODE(RD, SEND_FIRST), + IB_OPCODE(RD, SEND_MIDDLE), + IB_OPCODE(RD, SEND_LAST), + IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(RD, SEND_ONLY), + IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_WRITE_FIRST), + IB_OPCODE(RD, RDMA_WRITE_MIDDLE), + IB_OPCODE(RD, RDMA_WRITE_LAST), + IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_WRITE_ONLY), + IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_READ_REQUEST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE), + IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY), + IB_OPCODE(RD, ACKNOWLEDGE), + IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE), + 
IB_OPCODE(RD, COMPARE_SWAP), + IB_OPCODE(RD, FETCH_ADD), + + /* UD */ + IB_OPCODE(UD, SEND_ONLY), + IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE) +}; + +enum { + IB_LNH_RAW = 0, + IB_LNH_IP = 1, + IB_LNH_IBA_LOCAL = 2, + IB_LNH_IBA_GLOBAL = 3 +}; + +struct ib_unpacked_lrh { + u8 virtual_lane; + u8 link_version; + u8 service_level; + u8 link_next_header; + __be16 destination_lid; + __be16 packet_length; + __be16 source_lid; +}; + +struct ib_unpacked_grh { + u8 ip_version; + u8 traffic_class; + __be32 flow_label; + __be16 payload_length; + u8 next_header; + u8 hop_limit; + union ib_gid source_gid; + union ib_gid destination_gid; +}; + +struct ib_unpacked_bth { + u8 opcode; + u8 solicited_event; + u8 mig_req; + u8 pad_count; + u8 transport_header_version; + __be16 pkey; + __be32 destination_qpn; + u8 ack_req; + __be32 psn; +}; + +struct ib_unpacked_deth { + __be32 qkey; + __be32 source_qpn; +}; + +struct ib_unpacked_eth { + u8 dmac_h[4]; + u8 dmac_l[2]; + u8 smac_h[2]; + u8 smac_l[4]; + __be16 type; +}; + +struct ib_unpacked_ip4 { + u8 ver; + u8 hdr_len; + u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + u8 ttl; + u8 protocol; + __sum16 check; + __be32 saddr; + __be32 daddr; +}; + +struct ib_unpacked_udp { + __be16 sport; + __be16 dport; + __be16 length; + __be16 csum; +}; + +struct ib_unpacked_vlan { + __be16 tag; + __be16 type; +}; + +struct ib_ud_header { + int lrh_present; + struct ib_unpacked_lrh lrh; + int eth_present; + struct ib_unpacked_eth eth; + int vlan_present; + struct ib_unpacked_vlan vlan; + int grh_present; + struct ib_unpacked_grh grh; + int ipv4_present; + struct ib_unpacked_ip4 ip4; + int udp_present; + struct ib_unpacked_udp udp; + struct ib_unpacked_bth bth; + struct ib_unpacked_deth deth; + int immediate_present; + __be32 immediate_data; +}; + +void ib_pack(const struct ib_field *desc, + int desc_len, + void *structure, + void *buf); + +void ib_unpack(const struct ib_field *desc, + int desc_len, + void *buf, + void *structure); + +__sum16 ib_ud_ip4_csum(struct ib_ud_header *header); + +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header); + +int ib_ud_header_pack(struct ib_ud_header *header, + void *buf); + +int ib_ud_header_unpack(void *buf, + struct ib_ud_header *header); + +#endif /* IB_PACK_H */ diff --git a/include/rdma/ib_pma.h b/include/rdma/ib_pma.h new file mode 100644 index 0000000..2f8a65c --- /dev/null +++ b/include/rdma/ib_pma.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
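/*
 * Illustrative sketch (not part of this patch): building a plain 9B UD
 * header with the ib_ud_header helpers declared above.  The example_*
 * name and the LID/QPN/qkey values are assumptions; a real caller fills
 * in whatever additional LRH/BTH/DETH fields its transport requires
 * before packing.
 */
#include <rdma/ib_pack.h>

static int example_pack_ud_header(void *buf, int payload_bytes,
				  u16 dlid, u32 dqpn, u32 qkey)
{
	struct ib_ud_header hdr;
	int ret;

	/* LRH only: no Ethernet/VLAN/GRH/IP/UDP headers, no immediate data */
	ret = ib_ud_header_init(payload_bytes, 1, 0, 0, 0, 0, 0, 0, &hdr);
	if (ret)
		return ret;

	hdr.lrh.destination_lid = cpu_to_be16(dlid);
	hdr.lrh.source_lid = cpu_to_be16(0xffff);	/* permissive LID */
	hdr.bth.destination_qpn = cpu_to_be32(dqpn);
	hdr.deth.qkey = cpu_to_be32(qkey);
	hdr.deth.source_qpn = cpu_to_be32(1);		/* e.g. QP1/GSI */

	/* returns the number of header bytes written to buf */
	return ib_ud_header_pack(&hdr, buf);
}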
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_PMA_H) +#define IB_PMA_H + +#include + +/* + * PMA class portinfo capability mask bits + */ +#define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8) +#define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9) +#define IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF cpu_to_be16(1 << 10) +#define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12) + +#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) +#define IB_PMA_PORT_SAMPLES_CONTROL cpu_to_be16(0x0010) +#define IB_PMA_PORT_SAMPLES_RESULT cpu_to_be16(0x0011) +#define IB_PMA_PORT_COUNTERS cpu_to_be16(0x0012) +#define IB_PMA_PORT_COUNTERS_EXT cpu_to_be16(0x001D) +#define IB_PMA_PORT_SAMPLES_RESULT_EXT cpu_to_be16(0x001E) + +struct ib_pma_mad { + struct ib_mad_hdr mad_hdr; + u8 reserved[40]; + u8 data[192]; +} __packed; + +struct ib_pma_portsamplescontrol { + u8 opcode; + u8 port_select; + u8 tick; + u8 counter_width; /* resv: 7:3, counter width: 2:0 */ + __be32 counter_mask0_9; /* 2, 10 3-bit fields */ + __be16 counter_mask10_14; /* 1, 5 3-bit fields */ + u8 sample_mechanisms; + u8 sample_status; /* only lower 2 bits */ + __be64 option_mask; + __be64 vendor_mask; + __be32 sample_start; + __be32 sample_interval; + __be16 tag; + __be16 counter_select[15]; + __be32 reserved1; + __be64 samples_only_option_mask; + __be32 reserved2[28]; +}; + +struct ib_pma_portsamplesresult { + __be16 tag; + __be16 sample_status; /* only lower 2 bits */ + __be32 counter[15]; +}; + +struct ib_pma_portsamplesresult_ext { + __be16 tag; + __be16 sample_status; /* only lower 2 bits */ + __be32 extended_width; /* only upper 2 bits */ + __be64 counter[15]; +}; + +struct ib_pma_portcounters { + u8 reserved; + u8 port_select; + __be16 counter_select; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved1; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved2; + __be16 vl15_dropped; + __be32 port_xmit_data; + __be32 port_rcv_data; + __be32 port_xmit_packets; + __be32 port_rcv_packets; + __be32 port_xmit_wait; +} __packed; + + +#define IB_PMA_SEL_SYMBOL_ERROR cpu_to_be16(0x0001) +#define IB_PMA_SEL_LINK_ERROR_RECOVERY cpu_to_be16(0x0002) +#define IB_PMA_SEL_LINK_DOWNED cpu_to_be16(0x0004) +#define IB_PMA_SEL_PORT_RCV_ERRORS cpu_to_be16(0x0008) +#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS cpu_to_be16(0x0010) +#define IB_PMA_SEL_PORT_XMIT_DISCARDS cpu_to_be16(0x0040) +#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS cpu_to_be16(0x0200) +#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS cpu_to_be16(0x0400) +#define IB_PMA_SEL_PORT_VL15_DROPPED cpu_to_be16(0x0800) +#define IB_PMA_SEL_PORT_XMIT_DATA 
cpu_to_be16(0x1000) +#define IB_PMA_SEL_PORT_RCV_DATA cpu_to_be16(0x2000) +#define IB_PMA_SEL_PORT_XMIT_PACKETS cpu_to_be16(0x4000) +#define IB_PMA_SEL_PORT_RCV_PACKETS cpu_to_be16(0x8000) + +struct ib_pma_portcounters_ext { + u8 reserved; + u8 port_select; + __be16 counter_select; + __be32 reserved1; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_unicast_xmit_packets; + __be64 port_unicast_rcv_packets; + __be64 port_multicast_xmit_packets; + __be64 port_multicast_rcv_packets; +} __packed; + +#define IB_PMA_SELX_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_SELX_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_SELX_PORT_XMIT_PACKETS cpu_to_be16(0x0004) +#define IB_PMA_SELX_PORT_RCV_PACKETS cpu_to_be16(0x0008) +#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS cpu_to_be16(0x0010) +#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS cpu_to_be16(0x0020) +#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS cpu_to_be16(0x0040) +#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS cpu_to_be16(0x0080) + +#endif /* IB_PMA_H */ diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h new file mode 100644 index 0000000..bacb144 --- /dev/null +++ b/include/rdma/ib_sa.h @@ -0,0 +1,711 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
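/*
 * Illustrative sketch (not part of this patch): interpreting a PerfMgt
 * PortCounters response laid out as struct ib_pma_portcounters above.
 * Treating the MAD payload (pmp->data) as the attribute structure follows
 * the usual driver convention; the example_* name and the pr_info()
 * output are assumptions made for the example.
 */
#include <linux/printk.h>
#include <rdma/ib_pma.h>

static void example_dump_portcounters(const struct ib_pma_mad *pmp)
{
	const struct ib_pma_portcounters *pc =
		(const struct ib_pma_portcounters *)pmp->data;

	pr_info("port %u: symbol errors %u, rcv errors %u, xmit discards %u, xmit data %u\n",
		pc->port_select,
		be16_to_cpu(pc->symbol_error_counter),
		be16_to_cpu(pc->port_rcv_errors),
		be16_to_cpu(pc->port_xmit_discards),
		be32_to_cpu(pc->port_xmit_data));
}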
+ */ + +#ifndef IB_SA_H +#define IB_SA_H + +#include +#include + +#include +#include + +#include +#include +#include +#include + +enum { + IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */ + + IB_SA_METHOD_GET_TABLE = 0x12, + IB_SA_METHOD_GET_TABLE_RESP = 0x92, + IB_SA_METHOD_DELETE = 0x15, + IB_SA_METHOD_DELETE_RESP = 0x95, + IB_SA_METHOD_GET_MULTI = 0x14, + IB_SA_METHOD_GET_MULTI_RESP = 0x94, + IB_SA_METHOD_GET_TRACE_TBL = 0x13 +}; + +#define OPA_SA_CLASS_VERSION 0x80 +enum { + IB_SA_ATTR_CLASS_PORTINFO = 0x01, + IB_SA_ATTR_NOTICE = 0x02, + IB_SA_ATTR_INFORM_INFO = 0x03, + IB_SA_ATTR_NODE_REC = 0x11, + IB_SA_ATTR_PORT_INFO_REC = 0x12, + IB_SA_ATTR_SL2VL_REC = 0x13, + IB_SA_ATTR_SWITCH_REC = 0x14, + IB_SA_ATTR_LINEAR_FDB_REC = 0x15, + IB_SA_ATTR_RANDOM_FDB_REC = 0x16, + IB_SA_ATTR_MCAST_FDB_REC = 0x17, + IB_SA_ATTR_SM_INFO_REC = 0x18, + IB_SA_ATTR_LINK_REC = 0x20, + IB_SA_ATTR_GUID_INFO_REC = 0x30, + IB_SA_ATTR_SERVICE_REC = 0x31, + IB_SA_ATTR_PARTITION_REC = 0x33, + IB_SA_ATTR_PATH_REC = 0x35, + IB_SA_ATTR_VL_ARB_REC = 0x36, + IB_SA_ATTR_MC_MEMBER_REC = 0x38, + IB_SA_ATTR_TRACE_REC = 0x39, + IB_SA_ATTR_MULTI_PATH_REC = 0x3a, + IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b, + IB_SA_ATTR_INFORM_INFO_REC = 0xf3 +}; + +enum ib_sa_selector { + IB_SA_GT = 0, + IB_SA_LT = 1, + IB_SA_EQ = 2, + /* + * The meaning of "best" depends on the attribute: for + * example, for MTU best will return the largest available + * MTU, while for packet life time, best will return the + * smallest available life time. + */ + IB_SA_BEST = 3 +}; + +/* + * There are 4 types of join states: + * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. + * The order corresponds to JoinState bits in MCMemberRecord. + */ +enum ib_sa_mc_join_states { + FULLMEMBER_JOIN, + NONMEMBER_JOIN, + SENDONLY_NONMEBER_JOIN, + SENDONLY_FULLMEMBER_JOIN, + NUM_JOIN_MEMBERSHIP_TYPES, +}; + +#define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT BIT(12) + +/* + * Structures for SA records are named "struct ib_sa_xxx_rec." No + * attempt is made to pack structures to match the physical layout of + * SA records in SA MADs; all packing and unpacking is handled by the + * SA query code. + * + * For a record with structure ib_sa_xxx_rec, the naming convention + * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we + * never use different abbreviations or otherwise change the spelling + * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY). + * + * Reserved rows are indicated with comments to help maintainability. 
+ */ + +#define IB_SA_PATH_REC_SERVICE_ID (IB_SA_COMP_MASK( 0) |\ + IB_SA_COMP_MASK( 1)) +#define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2) +#define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3) +#define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4) +#define IB_SA_PATH_REC_SLID IB_SA_COMP_MASK( 5) +#define IB_SA_PATH_REC_RAW_TRAFFIC IB_SA_COMP_MASK( 6) +/* reserved: 7 */ +#define IB_SA_PATH_REC_FLOW_LABEL IB_SA_COMP_MASK( 8) +#define IB_SA_PATH_REC_HOP_LIMIT IB_SA_COMP_MASK( 9) +#define IB_SA_PATH_REC_TRAFFIC_CLASS IB_SA_COMP_MASK(10) +#define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11) +#define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12) +#define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13) +#define IB_SA_PATH_REC_QOS_CLASS IB_SA_COMP_MASK(14) +#define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15) +#define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16) +#define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17) +#define IB_SA_PATH_REC_RATE_SELECTOR IB_SA_COMP_MASK(18) +#define IB_SA_PATH_REC_RATE IB_SA_COMP_MASK(19) +#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(20) +#define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21) +#define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22) + +enum sa_path_rec_type { + SA_PATH_REC_TYPE_IB, + SA_PATH_REC_TYPE_ROCE_V1, + SA_PATH_REC_TYPE_ROCE_V2, + SA_PATH_REC_TYPE_OPA +}; + +struct sa_path_rec_ib { + __be16 dlid; + __be16 slid; + u8 raw_traffic; +}; + +/** + * struct sa_path_rec_roce - RoCE specific portion of the path record entry + * @route_resolved: When set, it indicates that this route is already + * resolved for this path record entry. + * @dmac: Destination mac address for the given DGID entry + * of the path record entry. + */ +struct sa_path_rec_roce { + bool route_resolved; + u8 dmac[ETH_ALEN]; + /* ignored in IB */ + int ifindex; + /* ignored in IB */ + struct net *net; + +}; + +struct sa_path_rec_opa { + __be32 dlid; + __be32 slid; + u8 raw_traffic; + u8 l2_8B; + u8 l2_10B; + u8 l2_9B; + u8 l2_16B; + u8 qos_type; + u8 qos_priority; +}; + +struct sa_path_rec { + union ib_gid dgid; + union ib_gid sgid; + __be64 service_id; + /* reserved */ + __be32 flow_label; + u8 hop_limit; + u8 traffic_class; + u8 reversible; + u8 numb_path; + __be16 pkey; + __be16 qos_class; + u8 sl; + u8 mtu_selector; + u8 mtu; + u8 rate_selector; + u8 rate; + u8 packet_life_time_selector; + u8 packet_life_time; + u8 preference; + union { + struct sa_path_rec_ib ib; + struct sa_path_rec_roce roce; + struct sa_path_rec_opa opa; + }; + enum sa_path_rec_type rec_type; +}; + +static inline enum ib_gid_type + sa_conv_pathrec_to_gid_type(struct sa_path_rec *rec) +{ + switch (rec->rec_type) { + case SA_PATH_REC_TYPE_ROCE_V1: + return IB_GID_TYPE_ROCE; + case SA_PATH_REC_TYPE_ROCE_V2: + return IB_GID_TYPE_ROCE_UDP_ENCAP; + default: + return IB_GID_TYPE_IB; + } +} + +static inline enum sa_path_rec_type + sa_conv_gid_to_pathrec_type(enum ib_gid_type type) +{ + switch (type) { + case IB_GID_TYPE_ROCE: + return SA_PATH_REC_TYPE_ROCE_V1; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + return SA_PATH_REC_TYPE_ROCE_V2; + default: + return SA_PATH_REC_TYPE_IB; + } +} + +static inline void path_conv_opa_to_ib(struct sa_path_rec *ib, + struct sa_path_rec *opa) +{ + if ((be32_to_cpu(opa->opa.dlid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (be32_to_cpu(opa->opa.slid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE))) { + /* Create OPA GID and zero out the LID */ + ib->dgid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(opa->opa.dlid)); + ib->dgid.global.subnet_prefix + = opa->dgid.global.subnet_prefix; + 
ib->sgid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(opa->opa.slid)); + ib->dgid.global.subnet_prefix + = opa->dgid.global.subnet_prefix; + ib->ib.dlid = 0; + + ib->ib.slid = 0; + } else { + ib->ib.dlid = htons(ntohl(opa->opa.dlid)); + ib->ib.slid = htons(ntohl(opa->opa.slid)); + } + ib->service_id = opa->service_id; + ib->ib.raw_traffic = opa->opa.raw_traffic; +} + +static inline void path_conv_ib_to_opa(struct sa_path_rec *opa, + struct sa_path_rec *ib) +{ + __be32 slid, dlid; + + if ((ib_is_opa_gid(&ib->sgid)) || + (ib_is_opa_gid(&ib->dgid))) { + slid = htonl(opa_get_lid_from_gid(&ib->sgid)); + dlid = htonl(opa_get_lid_from_gid(&ib->dgid)); + } else { + slid = htonl(ntohs(ib->ib.slid)); + dlid = htonl(ntohs(ib->ib.dlid)); + } + opa->opa.slid = slid; + opa->opa.dlid = dlid; + opa->service_id = ib->service_id; + opa->opa.raw_traffic = ib->ib.raw_traffic; +} + +/* Convert from OPA to IB path record */ +static inline void sa_convert_path_opa_to_ib(struct sa_path_rec *dest, + struct sa_path_rec *src) +{ + if (src->rec_type != SA_PATH_REC_TYPE_OPA) + return; + + *dest = *src; + dest->rec_type = SA_PATH_REC_TYPE_IB; + path_conv_opa_to_ib(dest, src); +} + +/* Convert from IB to OPA path record */ +static inline void sa_convert_path_ib_to_opa(struct sa_path_rec *dest, + struct sa_path_rec *src) +{ + if (src->rec_type != SA_PATH_REC_TYPE_IB) + return; + + /* Do a structure copy and overwrite the relevant fields */ + *dest = *src; + dest->rec_type = SA_PATH_REC_TYPE_OPA; + path_conv_ib_to_opa(dest, src); +} + +#define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0) +#define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1) +#define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2) +#define IB_SA_MCMEMBER_REC_MLID IB_SA_COMP_MASK( 3) +#define IB_SA_MCMEMBER_REC_MTU_SELECTOR IB_SA_COMP_MASK( 4) +#define IB_SA_MCMEMBER_REC_MTU IB_SA_COMP_MASK( 5) +#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS IB_SA_COMP_MASK( 6) +#define IB_SA_MCMEMBER_REC_PKEY IB_SA_COMP_MASK( 7) +#define IB_SA_MCMEMBER_REC_RATE_SELECTOR IB_SA_COMP_MASK( 8) +#define IB_SA_MCMEMBER_REC_RATE IB_SA_COMP_MASK( 9) +#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(10) +#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(11) +#define IB_SA_MCMEMBER_REC_SL IB_SA_COMP_MASK(12) +#define IB_SA_MCMEMBER_REC_FLOW_LABEL IB_SA_COMP_MASK(13) +#define IB_SA_MCMEMBER_REC_HOP_LIMIT IB_SA_COMP_MASK(14) +#define IB_SA_MCMEMBER_REC_SCOPE IB_SA_COMP_MASK(15) +#define IB_SA_MCMEMBER_REC_JOIN_STATE IB_SA_COMP_MASK(16) +#define IB_SA_MCMEMBER_REC_PROXY_JOIN IB_SA_COMP_MASK(17) + +struct ib_sa_mcmember_rec { + union ib_gid mgid; + union ib_gid port_gid; + __be32 qkey; + __be16 mlid; + u8 mtu_selector; + u8 mtu; + u8 traffic_class; + __be16 pkey; + u8 rate_selector; + u8 rate; + u8 packet_life_time_selector; + u8 packet_life_time; + u8 sl; + __be32 flow_label; + u8 hop_limit; + u8 scope; + u8 join_state; + u8 proxy_join; +}; + +/* Service Record Component Mask Sec 15.2.5.14 Ver 1.1 */ +#define IB_SA_SERVICE_REC_SERVICE_ID IB_SA_COMP_MASK( 0) +#define IB_SA_SERVICE_REC_SERVICE_GID IB_SA_COMP_MASK( 1) +#define IB_SA_SERVICE_REC_SERVICE_PKEY IB_SA_COMP_MASK( 2) +/* reserved: 3 */ +#define IB_SA_SERVICE_REC_SERVICE_LEASE IB_SA_COMP_MASK( 4) +#define IB_SA_SERVICE_REC_SERVICE_KEY IB_SA_COMP_MASK( 5) +#define IB_SA_SERVICE_REC_SERVICE_NAME IB_SA_COMP_MASK( 6) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_0 IB_SA_COMP_MASK( 7) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_1 IB_SA_COMP_MASK( 8) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_2 IB_SA_COMP_MASK( 9) 
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_3 IB_SA_COMP_MASK(10) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_4 IB_SA_COMP_MASK(11) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_5 IB_SA_COMP_MASK(12) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_6 IB_SA_COMP_MASK(13) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_7 IB_SA_COMP_MASK(14) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_8 IB_SA_COMP_MASK(15) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_9 IB_SA_COMP_MASK(16) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_10 IB_SA_COMP_MASK(17) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_11 IB_SA_COMP_MASK(18) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_12 IB_SA_COMP_MASK(19) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_13 IB_SA_COMP_MASK(20) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_14 IB_SA_COMP_MASK(21) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_15 IB_SA_COMP_MASK(22) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_0 IB_SA_COMP_MASK(23) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_1 IB_SA_COMP_MASK(24) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_2 IB_SA_COMP_MASK(25) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_3 IB_SA_COMP_MASK(26) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_4 IB_SA_COMP_MASK(27) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_5 IB_SA_COMP_MASK(28) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_6 IB_SA_COMP_MASK(29) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_7 IB_SA_COMP_MASK(30) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_0 IB_SA_COMP_MASK(31) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_1 IB_SA_COMP_MASK(32) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_2 IB_SA_COMP_MASK(33) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_3 IB_SA_COMP_MASK(34) +#define IB_SA_SERVICE_REC_SERVICE_DATA64_0 IB_SA_COMP_MASK(35) +#define IB_SA_SERVICE_REC_SERVICE_DATA64_1 IB_SA_COMP_MASK(36) + +#define IB_DEFAULT_SERVICE_LEASE 0xFFFFFFFF + +struct ib_sa_service_rec { + u64 id; + union ib_gid gid; + __be16 pkey; + /* reserved */ + u32 lease; + u8 key[16]; + u8 name[64]; + u8 data8[16]; + u16 data16[8]; + u32 data32[4]; + u64 data64[2]; +}; + +#define IB_SA_GUIDINFO_REC_LID IB_SA_COMP_MASK(0) +#define IB_SA_GUIDINFO_REC_BLOCK_NUM IB_SA_COMP_MASK(1) +#define IB_SA_GUIDINFO_REC_RES1 IB_SA_COMP_MASK(2) +#define IB_SA_GUIDINFO_REC_RES2 IB_SA_COMP_MASK(3) +#define IB_SA_GUIDINFO_REC_GID0 IB_SA_COMP_MASK(4) +#define IB_SA_GUIDINFO_REC_GID1 IB_SA_COMP_MASK(5) +#define IB_SA_GUIDINFO_REC_GID2 IB_SA_COMP_MASK(6) +#define IB_SA_GUIDINFO_REC_GID3 IB_SA_COMP_MASK(7) +#define IB_SA_GUIDINFO_REC_GID4 IB_SA_COMP_MASK(8) +#define IB_SA_GUIDINFO_REC_GID5 IB_SA_COMP_MASK(9) +#define IB_SA_GUIDINFO_REC_GID6 IB_SA_COMP_MASK(10) +#define IB_SA_GUIDINFO_REC_GID7 IB_SA_COMP_MASK(11) + +struct ib_sa_guidinfo_rec { + __be16 lid; + u8 block_num; + /* reserved */ + u8 res1; + __be32 res2; + u8 guid_info_list[64]; +}; + +struct ib_sa_client { + atomic_t users; + struct completion comp; +}; + +/** + * ib_sa_register_client - Register an SA client. + */ +void ib_sa_register_client(struct ib_sa_client *client); + +/** + * ib_sa_unregister_client - Deregister an SA client. + * @client: Client object to deregister. 
+ */ +void ib_sa_unregister_client(struct ib_sa_client *client); + +struct ib_sa_query; + +void ib_sa_cancel_query(int id, struct ib_sa_query *query); + +int ib_sa_path_rec_get(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **query); + +int ib_sa_service_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + u8 method, + struct ib_sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_service_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +struct ib_sa_multicast { + struct ib_sa_mcmember_rec rec; + ib_sa_comp_mask comp_mask; + int (*callback)(int status, + struct ib_sa_multicast *multicast); + void *context; +}; + +/** + * ib_sa_join_multicast - Initiates a join request to the specified multicast + * group. + * @client: SA client + * @device: Device associated with the multicast group. + * @port_num: Port on the specified device to associate with the multicast + * group. + * @rec: SA multicast member record specifying group attributes. + * @comp_mask: Component mask indicating which group attributes of %rec are + * valid. + * @gfp_mask: GFP mask for memory allocations. + * @callback: User callback invoked once the join operation completes. + * @context: User specified context stored with the ib_sa_multicast structure. + * + * This call initiates a multicast join request with the SA for the specified + * multicast group. If the join operation is started successfully, it returns + * an ib_sa_multicast structure that is used to track the multicast operation. + * Users must free this structure by calling ib_free_multicast, even if the + * join operation later fails. (The callback status is non-zero.) + * + * If the join operation fails; status will be non-zero, with the following + * failures possible: + * -ETIMEDOUT: The request timed out. + * -EIO: An error occurred sending the query. + * -EINVAL: The MCMemberRecord values differed from the existing group's. + * -ENETRESET: Indicates that an fatal error has occurred on the multicast + * group, and the user must rejoin the group to continue using it. + */ +struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast + *multicast), + void *context); + +/** + * ib_free_multicast - Frees the multicast tracking structure, and releases + * any reference on the multicast group. + * @multicast: Multicast tracking structure allocated by ib_join_multicast. + * + * This call blocks until the multicast identifier is destroyed. It may + * not be called from within the multicast callback; however, returning a non- + * zero value from the callback will result in destroying the multicast + * tracking structure. + */ +void ib_sa_free_multicast(struct ib_sa_multicast *multicast); + +/** + * ib_get_mcmember_rec - Looks up a multicast member record by its MGID and + * returns it if found. + * @device: Device associated with the multicast group. + * @port_num: Port on the specified device to associate with the multicast + * group. + * @mgid: MGID of multicast group. + * @rec: Location to copy SA multicast member record. 
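The kernel-doc above spells out the join/free contract but not how a consumer typically drives it. The following is a minimal, hypothetical sketch of a full-member join built only on the declarations shown here; the foo_* names, the chosen component mask, and the omission of teardown and locking are illustrative assumptions, and sa_client is assumed to have been set up earlier with ib_sa_register_client().

struct foo_mcast {
	struct ib_sa_multicast *mcast;
	u16 mlid;
};

static int foo_mcast_done(int status, struct ib_sa_multicast *multicast)
{
	struct foo_mcast *fm = multicast->context;

	if (status)
		pr_warn("SA multicast join failed: %d\n", status);
	else
		fm->mlid = be16_to_cpu(multicast->rec.mlid);

	/* Returning non-zero here would tear down the tracking structure. */
	return 0;
}

static int foo_join_group(struct ib_sa_client *sa_client,
			  struct ib_device *device, u8 port_num,
			  union ib_gid *mgid, union ib_gid *port_gid,
			  struct foo_mcast *fm)
{
	struct ib_sa_mcmember_rec rec = {};
	struct ib_sa_multicast *mcast;

	rec.mgid = *mgid;
	rec.port_gid = *port_gid;
	rec.join_state = 1;		/* full member */

	mcast = ib_sa_join_multicast(sa_client, device, port_num, &rec,
				     IB_SA_MCMEMBER_REC_MGID |
				     IB_SA_MCMEMBER_REC_PORT_GID |
				     IB_SA_MCMEMBER_REC_JOIN_STATE,
				     GFP_KERNEL, foo_mcast_done, fm);
	if (IS_ERR(mcast))
		return PTR_ERR(mcast);

	fm->mcast = mcast;	/* released later with ib_sa_free_multicast() */
	return 0;
}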
+ */ +int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, + union ib_gid *mgid, struct ib_sa_mcmember_rec *rec); + +/** + * ib_init_ah_from_mcmember - Initialize address handle attributes based on + * an SA multicast member record. + */ +int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, + struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, + struct rdma_ah_attr *ah_attr); + +/** + * ib_init_ah_attr_from_path - Initialize address handle attributes based on + * an SA path record. + */ +int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr); + +/** + * ib_sa_pack_path - Conert a path record from struct ib_sa_path_rec + * to IB MAD wire format. + */ +void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute); + +/** + * ib_sa_unpack_path - Convert a path record from MAD format to struct + * ib_sa_path_rec. + */ +void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec); + +/* Support GuidInfoRecord */ +int ib_sa_guid_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, + struct ib_device *device, + u8 port_num); + +static inline bool sa_path_is_roce(struct sa_path_rec *rec) +{ + return ((rec->rec_type == SA_PATH_REC_TYPE_ROCE_V1) || + (rec->rec_type == SA_PATH_REC_TYPE_ROCE_V2)); +} + +static inline bool sa_path_is_opa(struct sa_path_rec *rec) +{ + return (rec->rec_type == SA_PATH_REC_TYPE_OPA); +} + +static inline void sa_path_set_slid(struct sa_path_rec *rec, u32 slid) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + rec->ib.slid = cpu_to_be16(slid); + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + rec->opa.slid = cpu_to_be32(slid); +} + +static inline void sa_path_set_dlid(struct sa_path_rec *rec, u32 dlid) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + rec->ib.dlid = cpu_to_be16(dlid); + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + rec->opa.dlid = cpu_to_be32(dlid); +} + +static inline void sa_path_set_raw_traffic(struct sa_path_rec *rec, + u8 raw_traffic) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + rec->ib.raw_traffic = raw_traffic; + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + rec->opa.raw_traffic = raw_traffic; +} + +static inline __be32 sa_path_get_slid(struct sa_path_rec *rec) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + return htonl(ntohs(rec->ib.slid)); + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + return rec->opa.slid; + return 0; +} + +static inline __be32 sa_path_get_dlid(struct sa_path_rec *rec) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + return htonl(ntohs(rec->ib.dlid)); + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + return rec->opa.dlid; + return 0; +} + +static inline u8 sa_path_get_raw_traffic(struct sa_path_rec *rec) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + return rec->ib.raw_traffic; + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + return rec->opa.raw_traffic; + return 0; +} + +static inline void sa_path_set_dmac(struct sa_path_rec *rec, u8 *dmac) +{ + if (sa_path_is_roce(rec)) + memcpy(rec->roce.dmac, dmac, ETH_ALEN); +} + +static inline void sa_path_set_dmac_zero(struct sa_path_rec *rec) +{ + if (sa_path_is_roce(rec)) + 
eth_zero_addr(rec->roce.dmac); +} + +static inline void sa_path_set_ifindex(struct sa_path_rec *rec, int ifindex) +{ + if (sa_path_is_roce(rec)) + rec->roce.ifindex = ifindex; +} + +static inline void sa_path_set_ndev(struct sa_path_rec *rec, struct net *net) +{ + if (sa_path_is_roce(rec)) + rec->roce.net = net; +} + +static inline u8 *sa_path_get_dmac(struct sa_path_rec *rec) +{ + if (sa_path_is_roce(rec)) + return rec->roce.dmac; + return NULL; +} + +static inline int sa_path_get_ifindex(struct sa_path_rec *rec) +{ + if (sa_path_is_roce(rec)) + return rec->roce.ifindex; + return 0; +} + +static inline struct net *sa_path_get_ndev(struct sa_path_rec *rec) +{ + if (sa_path_is_roce(rec)) + return rec->roce.net; + return NULL; +} + +static inline struct net_device *ib_get_ndev_from_path(struct sa_path_rec *rec) +{ + return sa_path_get_ndev(rec) ? + dev_get_by_index(sa_path_get_ndev(rec), + sa_path_get_ifindex(rec)) + : NULL; +} + +#endif /* IB_SA_H */ diff --git a/include/rdma/ib_smi.h b/include/rdma/ib_smi.h new file mode 100644 index 0000000..b439e98 --- /dev/null +++ b/include/rdma/ib_smi.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(IB_SMI_H) +#define IB_SMI_H + +#include + +#define IB_SMP_DATA_SIZE 64 +#define IB_SMP_MAX_PATH_HOPS 64 + +struct ib_smp { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + u8 hop_ptr; + u8 hop_cnt; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 mkey; + __be16 dr_slid; + __be16 dr_dlid; + u8 reserved[28]; + u8 data[IB_SMP_DATA_SIZE]; + u8 initial_path[IB_SMP_MAX_PATH_HOPS]; + u8 return_path[IB_SMP_MAX_PATH_HOPS]; +} __attribute__ ((packed)); + +#define IB_SMP_DIRECTION cpu_to_be16(0x8000) + +/* Subnet management attributes */ +#define IB_SMP_ATTR_NOTICE cpu_to_be16(0x0002) +#define IB_SMP_ATTR_NODE_DESC cpu_to_be16(0x0010) +#define IB_SMP_ATTR_NODE_INFO cpu_to_be16(0x0011) +#define IB_SMP_ATTR_SWITCH_INFO cpu_to_be16(0x0012) +#define IB_SMP_ATTR_GUID_INFO cpu_to_be16(0x0014) +#define IB_SMP_ATTR_PORT_INFO cpu_to_be16(0x0015) +#define IB_SMP_ATTR_PKEY_TABLE cpu_to_be16(0x0016) +#define IB_SMP_ATTR_SL_TO_VL_TABLE cpu_to_be16(0x0017) +#define IB_SMP_ATTR_VL_ARB_TABLE cpu_to_be16(0x0018) +#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE cpu_to_be16(0x0019) +#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE cpu_to_be16(0x001A) +#define IB_SMP_ATTR_MCAST_FORWARD_TABLE cpu_to_be16(0x001B) +#define IB_SMP_ATTR_SM_INFO cpu_to_be16(0x0020) +#define IB_SMP_ATTR_VENDOR_DIAG cpu_to_be16(0x0030) +#define IB_SMP_ATTR_LED_INFO cpu_to_be16(0x0031) +#define IB_SMP_ATTR_VENDOR_MASK cpu_to_be16(0xFF00) + +struct ib_port_info { + __be64 mkey; + __be64 gid_prefix; + __be16 lid; + __be16 sm_lid; + __be32 cap_mask; + __be16 diag_code; + __be16 mkey_lease_period; + u8 local_port_num; + u8 link_width_enabled; + u8 link_width_supported; + u8 link_width_active; + u8 linkspeed_portstate; /* 4 bits, 4 bits */ + u8 portphysstate_linkdown; /* 4 bits, 4 bits */ + u8 mkeyprot_resv_lmc; /* 2 bits, 3, 3 */ + u8 linkspeedactive_enabled; /* 4 bits, 4 bits */ + u8 neighbormtu_mastersmsl; /* 4 bits, 4 bits */ + u8 vlcap_inittype; /* 4 bits, 4 bits */ + u8 vl_high_limit; + u8 vl_arb_high_cap; + u8 vl_arb_low_cap; + u8 inittypereply_mtucap; /* 4 bits, 4 bits */ + u8 vlstallcnt_hoqlife; /* 3 bits, 5 bits */ + u8 operationalvl_pei_peo_fpi_fpo; /* 4 bits, 1, 1, 1, 1 */ + __be16 mkey_violations; + __be16 pkey_violations; + __be16 qkey_violations; + u8 guid_cap; + u8 clientrereg_resv_subnetto; /* 1 bit, 2 bits, 5 */ + u8 resv_resptimevalue; /* 3 bits, 5 bits */ + u8 localphyerrors_overrunerrors; /* 4 bits, 4 bits */ + __be16 max_credit_hint; + u8 resv; + u8 link_roundtrip_latency[3]; +}; + +struct ib_node_info { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be64 sys_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; +} __packed; + +struct ib_vl_weight_elem { + u8 vl; /* IB: VL is low 4 bits, upper 4 bits reserved */ + /* OPA: VL is low 5 bits, upper 3 bits reserved */ + u8 weight; +}; + +static inline u8 +ib_get_smp_direction(struct ib_smp *smp) +{ + return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION); +} + +/* + * SM Trap/Notice numbers + */ +#define IB_NOTICE_TRAP_LLI_THRESH cpu_to_be16(129) +#define IB_NOTICE_TRAP_EBO_THRESH cpu_to_be16(130) +#define IB_NOTICE_TRAP_FLOW_UPDATE cpu_to_be16(131) +#define IB_NOTICE_TRAP_CAP_MASK_CHG cpu_to_be16(144) +#define IB_NOTICE_TRAP_SYS_GUID_CHG cpu_to_be16(145) +#define IB_NOTICE_TRAP_BAD_MKEY cpu_to_be16(256) +#define IB_NOTICE_TRAP_BAD_PKEY cpu_to_be16(257) +#define IB_NOTICE_TRAP_BAD_QKEY 
cpu_to_be16(258) + +/* + * Other local changes flags (trap 144). + */ +#define IB_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */ +#define IB_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */ +#define IB_NOTICE_TRAP_NODE_DESC_CHG 0x01 + +/* + * M_Key volation flags in dr_trunc_hop (trap 256). + */ +#define IB_NOTICE_TRAP_DR_NOTICE 0x80 +#define IB_NOTICE_TRAP_DR_TRUNC 0x40 + + +#endif /* IB_SMI_H */ diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h new file mode 100644 index 0000000..a1fd638 --- /dev/null +++ b/include/rdma/ib_umem.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_UMEM_H +#define IB_UMEM_H + +#include +#include +#include + +struct ib_ucontext; +struct ib_umem_odp; + +struct ib_umem { + struct ib_ucontext *context; + size_t length; + unsigned long address; + int page_shift; + int writable; + int hugetlb; + struct work_struct work; + struct mm_struct *mm; + unsigned long diff; + struct ib_umem_odp *odp_data; + struct sg_table sg_head; + int nmap; + int npages; +}; + +/* Returns the offset of the umem start relative to the first page. */ +static inline int ib_umem_offset(struct ib_umem *umem) +{ + return umem->address & (BIT(umem->page_shift) - 1); +} + +/* Returns the first page of an ODP umem. */ +static inline unsigned long ib_umem_start(struct ib_umem *umem) +{ + return umem->address - ib_umem_offset(umem); +} + +/* Returns the address of the page after the last one of an ODP umem. 
*/ +static inline unsigned long ib_umem_end(struct ib_umem *umem) +{ + return ALIGN(umem->address + umem->length, BIT(umem->page_shift)); +} + +static inline size_t ib_umem_num_pages(struct ib_umem *umem) +{ + return (ib_umem_end(umem) - ib_umem_start(umem)) >> umem->page_shift; +} + +#ifdef CONFIG_INFINIBAND_USER_MEM + +struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, + size_t size, int access, int dmasync); +void ib_umem_release(struct ib_umem *umem); +int ib_umem_page_count(struct ib_umem *umem); +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length); + +#else /* CONFIG_INFINIBAND_USER_MEM */ + +#include + +static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, + unsigned long addr, size_t size, + int access, int dmasync) { + return ERR_PTR(-EINVAL); +} +static inline void ib_umem_release(struct ib_umem *umem) { } +static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } +static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) { + return -EINVAL; +} +#endif /* CONFIG_INFINIBAND_USER_MEM */ + +#endif /* IB_UMEM_H */ diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h new file mode 100644 index 0000000..6a17f85 --- /dev/null +++ b/include/rdma/ib_umem_odp.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_UMEM_ODP_H +#define IB_UMEM_ODP_H + +#include +#include +#include + +struct umem_odp_node { + u64 __subtree_last; + struct rb_node rb; +}; + +struct ib_umem_odp { + /* + * An array of the pages included in the on-demand paging umem. + * Indices of pages that are currently not mapped into the device will + * contain NULL. + */ + struct page **page_list; + /* + * An array of the same size as page_list, with DMA addresses mapped + * for pages the pages in page_list. The lower two bits designate + * access permissions. See ODP_READ_ALLOWED_BIT and + * ODP_WRITE_ALLOWED_BIT. + */ + dma_addr_t *dma_list; + /* + * The umem_mutex protects the page_list and dma_list fields of an ODP + * umem, allowing only a single thread to map/unmap pages. 
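As a usage note for the pinning interface above: a driver's user-memory registration path usually reduces to one ib_umem_get()/ib_umem_release() pair plus a walk of the resulting scatterlist. The sketch below is hypothetical (the foo_ name and the diagnostics are illustrative) and assumes it runs in process context with a valid ib_ucontext.

static struct ib_umem *foo_pin_user_buf(struct ib_ucontext *context,
					u64 start, u64 length, int access)
{
	struct ib_umem *umem;

	/* Pin and DMA-map the user range; no coherent (dmasync) mapping. */
	umem = ib_umem_get(context, start, length, access, 0);
	if (IS_ERR(umem))
		return umem;

	pr_debug("pinned %zu pages, offset %d into the first page\n",
		 ib_umem_num_pages(umem), ib_umem_offset(umem));

	/* The caller walks umem->sg_head and eventually calls
	 * ib_umem_release(umem) when the MR is destroyed. */
	return umem;
}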
The mutex + * also protects access to the mmu notifier counters. + */ + struct mutex umem_mutex; + void *private; /* for the HW driver to use. */ + + /* When false, use the notifier counter in the ucontext struct. */ + bool mn_counters_active; + int notifiers_seq; + int notifiers_count; + + /* A linked list of umems that don't have private mmu notifier + * counters yet. */ + struct list_head no_private_counters; + struct ib_umem *umem; + + /* Tree tracking */ + struct umem_odp_node interval_tree; + + struct completion notifier_completion; + int dying; + struct work_struct work; +}; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, + int access); +struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, + unsigned long addr, + size_t size); + +void ib_umem_odp_release(struct ib_umem *umem); + +/* + * The lower 2 bits of the DMA address signal the R/W permissions for + * the entry. To upgrade the permissions, provide the appropriate + * bitmask to the map_dma_pages function. + * + * Be aware that upgrading a mapped address might result in change of + * the DMA address for the page. + */ +#define ODP_READ_ALLOWED_BIT (1<<0ULL) +#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) + +#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) + +int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, + u64 access_mask, unsigned long current_seq); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, + u64 bound); + +typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, + void *cookie); +/* + * Call the callback on each ib_umem in the range. Returns the logical or of + * the return values of the functions called. + */ +int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, + u64 start, u64 end, + umem_call_back cb, void *cookie); + +/* + * Find first region intersecting with address range. + * Return NULL if not found + */ +struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, + u64 addr, u64 length); + +static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, + unsigned long mmu_seq) +{ + /* + * This code is strongly based on the KVM code from + * mmu_notifier_retry. Should be called with + * the relevant locks taken (item->odp_data->umem_mutex + * and the ucontext umem_mutex semaphore locked for read). + */ + + /* Do not allow page faults while the new ib_umem hasn't seen a state + * with zero notifiers yet, and doesn't have its own valid set of + * private counters. */ + if (!item->odp_data->mn_counters_active) + return 1; + + if (unlikely(item->odp_data->notifiers_count)) + return 1; + if (item->odp_data->notifiers_seq != mmu_seq) + return 1; + return 0; +} + +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +static inline int ib_umem_odp_get(struct ib_ucontext *context, + struct ib_umem *umem, + int access) +{ + return -EINVAL; +} + +static inline struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, + unsigned long addr, + size_t size) +{ + return ERR_PTR(-EINVAL); +} + +static inline void ib_umem_odp_release(struct ib_umem *umem) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +#endif /* IB_UMEM_ODP_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h new file mode 100644 index 0000000..9fc8a82 --- /dev/null +++ b/include/rdma/ib_verbs.h @@ -0,0 +1,4015 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. 
All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(IB_VERBS_H) +#define IB_VERBS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN + +extern struct workqueue_struct *ib_wq; +extern struct workqueue_struct *ib_comp_wq; + +union ib_gid { + u8 raw[16]; + struct { + __be64 subnet_prefix; + __be64 interface_id; + } global; +}; + +extern union ib_gid zgid; + +enum ib_gid_type { + /* If link layer is Ethernet, this is RoCE V1 */ + IB_GID_TYPE_IB = 0, + IB_GID_TYPE_ROCE = 0, + IB_GID_TYPE_ROCE_UDP_ENCAP = 1, + IB_GID_TYPE_SIZE +}; + +#define ROCE_V2_UDP_DPORT 4791 +struct ib_gid_attr { + struct net_device *ndev; + struct ib_device *device; + enum ib_gid_type gid_type; + u16 index; + u8 port_num; +}; + +enum rdma_node_type { + /* IB values map to NodeInfo:NodeType. 
*/ + RDMA_NODE_IB_CA = 1, + RDMA_NODE_IB_SWITCH, + RDMA_NODE_IB_ROUTER, + RDMA_NODE_RNIC, + RDMA_NODE_USNIC, + RDMA_NODE_USNIC_UDP, +}; + +enum { + /* set the local administered indication */ + IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, +}; + +enum rdma_transport_type { + RDMA_TRANSPORT_IB, + RDMA_TRANSPORT_IWARP, + RDMA_TRANSPORT_USNIC, + RDMA_TRANSPORT_USNIC_UDP +}; + +enum rdma_protocol_type { + RDMA_PROTOCOL_IB, + RDMA_PROTOCOL_IBOE, + RDMA_PROTOCOL_IWARP, + RDMA_PROTOCOL_USNIC_UDP +}; + +__attribute_const__ enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type); + +enum rdma_network_type { + RDMA_NETWORK_IB, + RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB, + RDMA_NETWORK_IPV4, + RDMA_NETWORK_IPV6 +}; + +static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) +{ + if (network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */ + return IB_GID_TYPE_IB; +} + +static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type, + union ib_gid *gid) +{ + if (gid_type == IB_GID_TYPE_IB) + return RDMA_NETWORK_IB; + + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) + return RDMA_NETWORK_IPV4; + else + return RDMA_NETWORK_IPV6; +} + +enum rdma_link_layer { + IB_LINK_LAYER_UNSPECIFIED, + IB_LINK_LAYER_INFINIBAND, + IB_LINK_LAYER_ETHERNET, +}; + +enum ib_device_cap_flags { + IB_DEVICE_RESIZE_MAX_WR = (1 << 0), + IB_DEVICE_BAD_PKEY_CNTR = (1 << 1), + IB_DEVICE_BAD_QKEY_CNTR = (1 << 2), + IB_DEVICE_RAW_MULTI = (1 << 3), + IB_DEVICE_AUTO_PATH_MIG = (1 << 4), + IB_DEVICE_CHANGE_PHY_PORT = (1 << 5), + IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), + IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), + IB_DEVICE_SHUTDOWN_PORT = (1 << 8), + /* Not in use, former INIT_TYPE = (1 << 9),*/ + IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), + IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), + IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), + IB_DEVICE_SRQ_RESIZE = (1 << 13), + IB_DEVICE_N_NOTIFY_CQ = (1 << 14), + + /* + * This device supports a per-device lkey or stag that can be + * used without performing a memory registration for the local + * memory. Note that ULPs should never check this flag, but + * instead of use the local_dma_lkey flag in the ib_pd structure, + * which will always contain a usable lkey. + */ + IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), + /* Reserved, old SEND_W_INV = (1 << 16),*/ + IB_DEVICE_MEM_WINDOW = (1 << 17), + /* + * Devices should set IB_DEVICE_UD_IP_SUM if they support + * insertion of UDP and TCP checksum on outgoing UD IPoIB + * messages and can verify the validity of checksum for + * incoming messages. Setting this flag implies that the + * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. + */ + IB_DEVICE_UD_IP_CSUM = (1 << 18), + IB_DEVICE_UD_TSO = (1 << 19), + IB_DEVICE_XRC = (1 << 20), + + /* + * This device supports the IB "base memory management extension", + * which includes support for fast registrations (IB_WR_REG_MR, + * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should + * also be set by any iWarp device which must support FRs to comply + * to the iWarp verbs spec. iWarp devices also support the + * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the + * stag. + */ + IB_DEVICE_MEM_MGT_EXTENSIONS = (1 << 21), + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1 << 22), + IB_DEVICE_MEM_WINDOW_TYPE_2A = (1 << 23), + IB_DEVICE_MEM_WINDOW_TYPE_2B = (1 << 24), + IB_DEVICE_RC_IP_CSUM = (1 << 25), + /* Deprecated. 
Please use IB_RAW_PACKET_CAP_IP_CSUM. */ + IB_DEVICE_RAW_IP_CSUM = (1 << 26), + /* + * Devices should set IB_DEVICE_CROSS_CHANNEL if they + * support execution of WQEs that involve synchronization + * of I/O operations with single completion queue managed + * by hardware. + */ + IB_DEVICE_CROSS_CHANNEL = (1 << 27), + IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), + IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), + IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31), + IB_DEVICE_SG_GAPS_REG = (1ULL << 32), + IB_DEVICE_VIRTUAL_FUNCTION = (1ULL << 33), + /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */ + IB_DEVICE_RAW_SCATTER_FCS = (1ULL << 34), + IB_DEVICE_RDMA_NETDEV_OPA_VNIC = (1ULL << 35), + /* The device supports padding incoming writes to cacheline. */ + IB_DEVICE_PCI_WRITE_END_PADDING = (1ULL << 36), +}; + +enum ib_signature_prot_cap { + IB_PROT_T10DIF_TYPE_1 = 1, + IB_PROT_T10DIF_TYPE_2 = 1 << 1, + IB_PROT_T10DIF_TYPE_3 = 1 << 2, +}; + +enum ib_signature_guard_cap { + IB_GUARD_T10DIF_CRC = 1, + IB_GUARD_T10DIF_CSUM = 1 << 1, +}; + +enum ib_atomic_cap { + IB_ATOMIC_NONE, + IB_ATOMIC_HCA, + IB_ATOMIC_GLOB +}; + +enum ib_odp_general_cap_bits { + IB_ODP_SUPPORT = 1 << 0, + IB_ODP_SUPPORT_IMPLICIT = 1 << 1, +}; + +enum ib_odp_transport_cap_bits { + IB_ODP_SUPPORT_SEND = 1 << 0, + IB_ODP_SUPPORT_RECV = 1 << 1, + IB_ODP_SUPPORT_WRITE = 1 << 2, + IB_ODP_SUPPORT_READ = 1 << 3, + IB_ODP_SUPPORT_ATOMIC = 1 << 4, +}; + +struct ib_odp_caps { + uint64_t general_caps; + struct { + uint32_t rc_odp_caps; + uint32_t uc_odp_caps; + uint32_t ud_odp_caps; + } per_transport_caps; +}; + +struct ib_rss_caps { + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_UD + */ + u32 supported_qpts; + u32 max_rwq_indirection_tables; + u32 max_rwq_indirection_table_size; +}; + +enum ib_tm_cap_flags { + /* Support tag matching on RC transport */ + IB_TM_CAP_RC = 1 << 0, +}; + +struct ib_tm_caps { + /* Max size of RNDV header */ + u32 max_rndv_hdr_size; + /* Max number of entries in tag matching list */ + u32 max_num_tags; + /* From enum ib_tm_cap_flags */ + u32 flags; + /* Max number of outstanding list operations */ + u32 max_ops; + /* Max number of SGE in tag matching entry */ + u32 max_sge; +}; + +struct ib_cq_init_attr { + unsigned int cqe; + int comp_vector; + u32 flags; +}; + +enum ib_cq_attr_mask { + IB_CQ_MODERATE = 1 << 0, +}; + +struct ib_cq_caps { + u16 max_cq_moderation_count; + u16 max_cq_moderation_period; +}; + +struct ib_dm_mr_attr { + u64 length; + u64 offset; + u32 access_flags; +}; + +struct ib_dm_alloc_attr { + u64 length; + u32 alignment; + u32 flags; +}; + +struct ib_device_attr { + u64 fw_ver; + __be64 sys_image_guid; + u64 max_mr_size; + u64 page_size_cap; + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + int max_qp; + int max_qp_wr; + u64 device_cap_flags; + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ib_atomic_cap atomic_cap; + enum ib_atomic_cap masked_atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + unsigned int max_fast_reg_page_list_len; + u16 max_pkeys; + u8 local_ca_ack_delay; + int sig_prot_cap; + int 
sig_guard_cap; + struct ib_odp_caps odp_caps; + uint64_t timestamp_mask; + uint64_t hca_core_clock; /* in KHZ */ + struct ib_rss_caps rss_caps; + u32 max_wq_type_rq; + u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ + struct ib_tm_caps tm_caps; + struct ib_cq_caps cq_caps; + u64 max_dm_size; +}; + +enum ib_mtu { + IB_MTU_256 = 1, + IB_MTU_512 = 2, + IB_MTU_1024 = 3, + IB_MTU_2048 = 4, + IB_MTU_4096 = 5 +}; + +static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) +{ + switch (mtu) { + case IB_MTU_256: return 256; + case IB_MTU_512: return 512; + case IB_MTU_1024: return 1024; + case IB_MTU_2048: return 2048; + case IB_MTU_4096: return 4096; + default: return -1; + } +} + +static inline enum ib_mtu ib_mtu_int_to_enum(int mtu) +{ + if (mtu >= 4096) + return IB_MTU_4096; + else if (mtu >= 2048) + return IB_MTU_2048; + else if (mtu >= 1024) + return IB_MTU_1024; + else if (mtu >= 512) + return IB_MTU_512; + else + return IB_MTU_256; +} + +enum ib_port_state { + IB_PORT_NOP = 0, + IB_PORT_DOWN = 1, + IB_PORT_INIT = 2, + IB_PORT_ARMED = 3, + IB_PORT_ACTIVE = 4, + IB_PORT_ACTIVE_DEFER = 5 +}; + +enum ib_port_cap_flags { + IB_PORT_SM = 1 << 1, + IB_PORT_NOTICE_SUP = 1 << 2, + IB_PORT_TRAP_SUP = 1 << 3, + IB_PORT_OPT_IPD_SUP = 1 << 4, + IB_PORT_AUTO_MIGR_SUP = 1 << 5, + IB_PORT_SL_MAP_SUP = 1 << 6, + IB_PORT_MKEY_NVRAM = 1 << 7, + IB_PORT_PKEY_NVRAM = 1 << 8, + IB_PORT_LED_INFO_SUP = 1 << 9, + IB_PORT_SM_DISABLED = 1 << 10, + IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, + IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IB_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, + IB_PORT_CM_SUP = 1 << 16, + IB_PORT_SNMP_TUNNEL_SUP = 1 << 17, + IB_PORT_REINIT_SUP = 1 << 18, + IB_PORT_DEVICE_MGMT_SUP = 1 << 19, + IB_PORT_VENDOR_CLASS_SUP = 1 << 20, + IB_PORT_DR_NOTICE_SUP = 1 << 21, + IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, + IB_PORT_BOOT_MGMT_SUP = 1 << 23, + IB_PORT_LINK_LATENCY_SUP = 1 << 24, + IB_PORT_CLIENT_REG_SUP = 1 << 25, + IB_PORT_IP_BASED_GIDS = 1 << 26, +}; + +enum ib_port_width { + IB_WIDTH_1X = 1, + IB_WIDTH_4X = 2, + IB_WIDTH_8X = 4, + IB_WIDTH_12X = 8 +}; + +static inline int ib_width_enum_to_int(enum ib_port_width width) +{ + switch (width) { + case IB_WIDTH_1X: return 1; + case IB_WIDTH_4X: return 4; + case IB_WIDTH_8X: return 8; + case IB_WIDTH_12X: return 12; + default: return -1; + } +} + +enum ib_port_speed { + IB_SPEED_SDR = 1, + IB_SPEED_DDR = 2, + IB_SPEED_QDR = 4, + IB_SPEED_FDR10 = 8, + IB_SPEED_FDR = 16, + IB_SPEED_EDR = 32, + IB_SPEED_HDR = 64 +}; + +/** + * struct rdma_hw_stats + * @lock - Mutex to protect parallel write access to lifespan and values + * of counters, which are 64bits and not guaranteeed to be written + * atomicaly on 32bits systems. + * @timestamp - Used by the core code to track when the last update was + * @lifespan - Used by the core code to determine how old the counters + * should be before being updated again. Stored in jiffies, defaults + * to 10 milliseconds, drivers can override the default be specifying + * their own value during their allocation routine. + * @name - Array of pointers to static names used for the counters in + * directory. + * @num_counters - How many hardware counters there are. If name is + * shorter than this number, a kernel oops will result. Driver authors + * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) + * in their code to prevent this. 
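ib_width_enum_to_int() and enum ib_port_speed are meant to be combined when reporting link bandwidth from the active_width/active_speed values a driver exposes. The helper below is a rough, hypothetical illustration; the per-lane figures are rounded to whole Gb/s and are my own summary rather than anything defined in this header.

static unsigned int foo_link_gbps(u8 active_width, u8 active_speed)
{
	int lanes = ib_width_enum_to_int(active_width);
	unsigned int per_lane;

	if (lanes < 0)
		return 0;

	switch (active_speed) {
	case IB_SPEED_SDR:	per_lane = 2;	break;	/* ~2.5 Gb/s */
	case IB_SPEED_DDR:	per_lane = 5;	break;
	case IB_SPEED_QDR:
	case IB_SPEED_FDR10:	per_lane = 10;	break;
	case IB_SPEED_FDR:	per_lane = 14;	break;
	case IB_SPEED_EDR:	per_lane = 25;	break;
	case IB_SPEED_HDR:	per_lane = 50;	break;
	default:		per_lane = 2;	break;
	}

	return lanes * per_lane;
}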
+ * @value - Array of u64 counters that are accessed by the sysfs code and + * filled in by the drivers get_stats routine + */ +struct rdma_hw_stats { + struct mutex lock; /* Protect lifespan and values[] */ + unsigned long timestamp; + unsigned long lifespan; + const char * const *names; + int num_counters; + u64 value[]; +}; + +#define RDMA_HW_STATS_DEFAULT_LIFESPAN 10 +/** + * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct + * for drivers. + * @names - Array of static const char * + * @num_counters - How many elements in array + * @lifespan - How many milliseconds between updates + */ +static inline struct rdma_hw_stats *rdma_alloc_hw_stats_struct( + const char * const *names, int num_counters, + unsigned long lifespan) +{ + struct rdma_hw_stats *stats; + + stats = kzalloc(sizeof(*stats) + num_counters * sizeof(u64), + GFP_KERNEL); + if (!stats) + return NULL; + stats->names = names; + stats->num_counters = num_counters; + stats->lifespan = msecs_to_jiffies(lifespan); + + return stats; +} + + +/* Define bits for the various functionality this port needs to be supported by + * the core. + */ +/* Management 0x00000FFF */ +#define RDMA_CORE_CAP_IB_MAD 0x00000001 +#define RDMA_CORE_CAP_IB_SMI 0x00000002 +#define RDMA_CORE_CAP_IB_CM 0x00000004 +#define RDMA_CORE_CAP_IW_CM 0x00000008 +#define RDMA_CORE_CAP_IB_SA 0x00000010 +#define RDMA_CORE_CAP_OPA_MAD 0x00000020 + +/* Address format 0x000FF000 */ +#define RDMA_CORE_CAP_AF_IB 0x00001000 +#define RDMA_CORE_CAP_ETH_AH 0x00002000 +#define RDMA_CORE_CAP_OPA_AH 0x00004000 + +/* Protocol 0xFFF00000 */ +#define RDMA_CORE_CAP_PROT_IB 0x00100000 +#define RDMA_CORE_CAP_PROT_ROCE 0x00200000 +#define RDMA_CORE_CAP_PROT_IWARP 0x00400000 +#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 +#define RDMA_CORE_CAP_PROT_RAW_PACKET 0x01000000 +#define RDMA_CORE_CAP_PROT_USNIC 0x02000000 + +#define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_SMI \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_IB_SA \ + | RDMA_CORE_CAP_AF_IB) +#define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ + (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ + | RDMA_CORE_CAP_IW_CM) +#define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ + | RDMA_CORE_CAP_OPA_MAD) + +#define RDMA_CORE_PORT_RAW_PACKET (RDMA_CORE_CAP_PROT_RAW_PACKET) + +#define RDMA_CORE_PORT_USNIC (RDMA_CORE_CAP_PROT_USNIC) + +struct ib_port_attr { + u64 subnet_prefix; + enum ib_port_state state; + enum ib_mtu max_mtu; + enum ib_mtu active_mtu; + int gid_tbl_len; + u32 port_cap_flags; + u32 max_msg_sz; + u32 bad_pkey_cntr; + u32 qkey_viol_cntr; + u16 pkey_tbl_len; + u32 sm_lid; + u32 lid; + u8 lmc; + u8 max_vl_num; + u8 sm_sl; + u8 subnet_timeout; + u8 init_type_reply; + u8 active_width; + u8 active_speed; + u8 phys_state; + bool grh_required; +}; + +enum ib_device_modify_flags { + IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, + IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 +}; + +#define IB_DEVICE_NODE_DESC_MAX 64 + +struct ib_device_modify { + u64 sys_image_guid; + char node_desc[IB_DEVICE_NODE_DESC_MAX]; +}; + +enum ib_port_modify_flags { + IB_PORT_SHUTDOWN = 1, + IB_PORT_INIT_TYPE = (1<<2), + IB_PORT_RESET_QKEY_CNTR = (1<<3), + IB_PORT_OPA_MASK_CHG = (1<<4) 
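To make the allocation helper concrete: a driver typically keeps a static name table and hands it to rdma_alloc_hw_stats_struct() together with the default lifespan, usually from its alloc_hw_stats callback. The counter names and the foo_ prefix below are purely illustrative.

static const char * const foo_hw_stat_names[] = {
	"rx_packets",
	"tx_packets",
	"rx_drops",
};

static struct rdma_hw_stats *foo_alloc_hw_stats(struct ib_device *ibdev,
						u8 port_num)
{
	/* Keep the name table and counter count in sync, as advised above. */
	BUILD_BUG_ON(ARRAY_SIZE(foo_hw_stat_names) != 3);

	return rdma_alloc_hw_stats_struct(foo_hw_stat_names,
					  ARRAY_SIZE(foo_hw_stat_names),
					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
}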
+}; + +struct ib_port_modify { + u32 set_port_cap_mask; + u32 clr_port_cap_mask; + u8 init_type; +}; + +enum ib_event_type { + IB_EVENT_CQ_ERR, + IB_EVENT_QP_FATAL, + IB_EVENT_QP_REQ_ERR, + IB_EVENT_QP_ACCESS_ERR, + IB_EVENT_COMM_EST, + IB_EVENT_SQ_DRAINED, + IB_EVENT_PATH_MIG, + IB_EVENT_PATH_MIG_ERR, + IB_EVENT_DEVICE_FATAL, + IB_EVENT_PORT_ACTIVE, + IB_EVENT_PORT_ERR, + IB_EVENT_LID_CHANGE, + IB_EVENT_PKEY_CHANGE, + IB_EVENT_SM_CHANGE, + IB_EVENT_SRQ_ERR, + IB_EVENT_SRQ_LIMIT_REACHED, + IB_EVENT_QP_LAST_WQE_REACHED, + IB_EVENT_CLIENT_REREGISTER, + IB_EVENT_GID_CHANGE, + IB_EVENT_WQ_FATAL, +}; + +const char *__attribute_const__ ib_event_msg(enum ib_event_type event); + +struct ib_event { + struct ib_device *device; + union { + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_srq *srq; + struct ib_wq *wq; + u8 port_num; + } element; + enum ib_event_type event; +}; + +struct ib_event_handler { + struct ib_device *device; + void (*handler)(struct ib_event_handler *, struct ib_event *); + struct list_head list; +}; + +#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ + do { \ + (_ptr)->device = _device; \ + (_ptr)->handler = _handler; \ + INIT_LIST_HEAD(&(_ptr)->list); \ + } while (0) + +struct ib_global_route { + union ib_gid dgid; + u32 flow_label; + u8 sgid_index; + u8 hop_limit; + u8 traffic_class; +}; + +struct ib_grh { + __be32 version_tclass_flow; + __be16 paylen; + u8 next_hdr; + u8 hop_limit; + union ib_gid sgid; + union ib_gid dgid; +}; + +union rdma_network_hdr { + struct ib_grh ibgrh; + struct { + /* The IB spec states that if it's IPv4, the header + * is located in the last 20 bytes of the header. + */ + u8 reserved[20]; + struct iphdr roce4grh; + }; +}; + +#define IB_QPN_MASK 0xFFFFFF + +enum { + IB_MULTICAST_QPN = 0xffffff +}; + +#define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) +#define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000) + +enum ib_ah_flags { + IB_AH_GRH = 1 +}; + +enum ib_rate { + IB_RATE_PORT_CURRENT = 0, + IB_RATE_2_5_GBPS = 2, + IB_RATE_5_GBPS = 5, + IB_RATE_10_GBPS = 3, + IB_RATE_20_GBPS = 6, + IB_RATE_30_GBPS = 4, + IB_RATE_40_GBPS = 7, + IB_RATE_60_GBPS = 8, + IB_RATE_80_GBPS = 9, + IB_RATE_120_GBPS = 10, + IB_RATE_14_GBPS = 11, + IB_RATE_56_GBPS = 12, + IB_RATE_112_GBPS = 13, + IB_RATE_168_GBPS = 14, + IB_RATE_25_GBPS = 15, + IB_RATE_100_GBPS = 16, + IB_RATE_200_GBPS = 17, + IB_RATE_300_GBPS = 18 +}; + +/** + * ib_rate_to_mult - Convert the IB rate enum to a multiple of the + * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be + * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. + * @rate: rate to convert. + */ +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate); + +/** + * ib_rate_to_mbps - Convert the IB rate enum to Mbps. + * For example, IB_RATE_2_5_GBPS will be converted to 2500. + * @rate: rate to convert. + */ +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); + + +/** + * enum ib_mr_type - memory region type + * @IB_MR_TYPE_MEM_REG: memory region that is used for + * normal registration + * @IB_MR_TYPE_SIGNATURE: memory region that is used for + * signature operations (data-integrity + * capable regions) + * @IB_MR_TYPE_SG_GAPS: memory region that is capable to + * register any arbitrary sg lists (without + * the normal mr constraints - see + * ib_map_mr_sg) + */ +enum ib_mr_type { + IB_MR_TYPE_MEM_REG, + IB_MR_TYPE_SIGNATURE, + IB_MR_TYPE_SG_GAPS, +}; + +/** + * Signature types + * IB_SIG_TYPE_NONE: Unprotected. 
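A short, hypothetical example of how the event-handler plumbing is used: embed an ib_event_handler, initialize it with INIT_IB_EVENT_HANDLER(), and register it (ib_register_event_handler() is declared further down in this header). The foo_dev structure is illustrative.

struct foo_dev {
	struct ib_device *ibdev;
	struct ib_event_handler event_handler;
};

static void foo_async_event(struct ib_event_handler *handler,
			    struct ib_event *event)
{
	struct foo_dev *fdev = container_of(handler, struct foo_dev,
					    event_handler);

	if (event->event == IB_EVENT_PORT_ACTIVE)
		dev_info(&fdev->ibdev->dev, "port %u became active\n",
			 event->element.port_num);
}

static void foo_watch_events(struct foo_dev *fdev)
{
	INIT_IB_EVENT_HANDLER(&fdev->event_handler, fdev->ibdev,
			      foo_async_event);
	ib_register_event_handler(&fdev->event_handler);
}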
+ * IB_SIG_TYPE_T10_DIF: Type T10-DIF + */ +enum ib_signature_type { + IB_SIG_TYPE_NONE, + IB_SIG_TYPE_T10_DIF, +}; + +/** + * Signature T10-DIF block-guard types + * IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules. + * IB_T10DIF_CSUM: Corresponds to IP checksum rules. + */ +enum ib_t10_dif_bg_type { + IB_T10DIF_CRC, + IB_T10DIF_CSUM +}; + +/** + * struct ib_t10_dif_domain - Parameters specific for T10-DIF + * domain. + * @bg_type: T10-DIF block guard type (CRC|CSUM) + * @pi_interval: protection information interval. + * @bg: seed of guard computation. + * @app_tag: application tag of guard block + * @ref_tag: initial guard block reference tag. + * @ref_remap: Indicate wethear the reftag increments each block + * @app_escape: Indicate to skip block check if apptag=0xffff + * @ref_escape: Indicate to skip block check if reftag=0xffffffff + * @apptag_check_mask: check bitmask of application tag. + */ +struct ib_t10_dif_domain { + enum ib_t10_dif_bg_type bg_type; + u16 pi_interval; + u16 bg; + u16 app_tag; + u32 ref_tag; + bool ref_remap; + bool app_escape; + bool ref_escape; + u16 apptag_check_mask; +}; + +/** + * struct ib_sig_domain - Parameters for signature domain + * @sig_type: specific signauture type + * @sig: union of all signature domain attributes that may + * be used to set domain layout. + */ +struct ib_sig_domain { + enum ib_signature_type sig_type; + union { + struct ib_t10_dif_domain dif; + } sig; +}; + +/** + * struct ib_sig_attrs - Parameters for signature handover operation + * @check_mask: bitmask for signature byte check (8 bytes) + * @mem: memory domain layout desciptor. + * @wire: wire domain layout desciptor. + */ +struct ib_sig_attrs { + u8 check_mask; + struct ib_sig_domain mem; + struct ib_sig_domain wire; +}; + +enum ib_sig_err_type { + IB_SIG_BAD_GUARD, + IB_SIG_BAD_REFTAG, + IB_SIG_BAD_APPTAG, +}; + +/** + * struct ib_sig_err - signature error descriptor + */ +struct ib_sig_err { + enum ib_sig_err_type err_type; + u32 expected; + u32 actual; + u64 sig_err_offset; + u32 key; +}; + +enum ib_mr_status_check { + IB_MR_CHECK_SIG_STATUS = 1, +}; + +/** + * struct ib_mr_status - Memory region status container + * + * @fail_status: Bitmask of MR checks status. For each + * failed check a corresponding status bit is set. + * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS + * failure. + */ +struct ib_mr_status { + u32 fail_status; + struct ib_sig_err sig_err; +}; + +/** + * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate + * enum. + * @mult: multiple to convert. 
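The signature structures are easiest to read with a filled-in example. The sketch below configures a hypothetical MR for T10-DIF Type 1-style protection on the wire (CRC block guard over 512-byte intervals) with no protection domain in local memory; every chosen value, including check_mask, is illustrative and would need to match the actual storage format and HCA documentation.

static void foo_set_sig_attrs(struct ib_sig_attrs *sig_attrs, u32 first_lba)
{
	memset(sig_attrs, 0, sizeof(*sig_attrs));

	/* No protection information kept in local memory. */
	sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;

	/* T10-DIF with CRC guard over 512-byte blocks on the wire. */
	sig_attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF;
	sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
	sig_attrs->wire.sig.dif.pi_interval = 512;
	sig_attrs->wire.sig.dif.bg = 0;
	sig_attrs->wire.sig.dif.app_tag = 0;
	sig_attrs->wire.sig.dif.ref_tag = first_lba;	/* initial reference tag */
	sig_attrs->wire.sig.dif.ref_remap = true;	/* reftag increments per block */
	sig_attrs->wire.sig.dif.app_escape = true;	/* skip check on apptag 0xffff */
	sig_attrs->wire.sig.dif.ref_escape = true;
	sig_attrs->wire.sig.dif.apptag_check_mask = 0xffff;

	/* Check all guard bytes; the meaning of individual check_mask bits
	 * is device-specific. */
	sig_attrs->check_mask = 0xff;
}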
+ */ +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult); + +enum rdma_ah_attr_type { + RDMA_AH_ATTR_TYPE_UNDEFINED, + RDMA_AH_ATTR_TYPE_IB, + RDMA_AH_ATTR_TYPE_ROCE, + RDMA_AH_ATTR_TYPE_OPA, +}; + +struct ib_ah_attr { + u16 dlid; + u8 src_path_bits; +}; + +struct roce_ah_attr { + u8 dmac[ETH_ALEN]; +}; + +struct opa_ah_attr { + u32 dlid; + u8 src_path_bits; + bool make_grd; +}; + +struct rdma_ah_attr { + struct ib_global_route grh; + u8 sl; + u8 static_rate; + u8 port_num; + u8 ah_flags; + enum rdma_ah_attr_type type; + union { + struct ib_ah_attr ib; + struct roce_ah_attr roce; + struct opa_ah_attr opa; + }; +}; + +enum ib_wc_status { + IB_WC_SUCCESS, + IB_WC_LOC_LEN_ERR, + IB_WC_LOC_QP_OP_ERR, + IB_WC_LOC_EEC_OP_ERR, + IB_WC_LOC_PROT_ERR, + IB_WC_WR_FLUSH_ERR, + IB_WC_MW_BIND_ERR, + IB_WC_BAD_RESP_ERR, + IB_WC_LOC_ACCESS_ERR, + IB_WC_REM_INV_REQ_ERR, + IB_WC_REM_ACCESS_ERR, + IB_WC_REM_OP_ERR, + IB_WC_RETRY_EXC_ERR, + IB_WC_RNR_RETRY_EXC_ERR, + IB_WC_LOC_RDD_VIOL_ERR, + IB_WC_REM_INV_RD_REQ_ERR, + IB_WC_REM_ABORT_ERR, + IB_WC_INV_EECN_ERR, + IB_WC_INV_EEC_STATE_ERR, + IB_WC_FATAL_ERR, + IB_WC_RESP_TIMEOUT_ERR, + IB_WC_GENERAL_ERR +}; + +const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status); + +enum ib_wc_opcode { + IB_WC_SEND, + IB_WC_RDMA_WRITE, + IB_WC_RDMA_READ, + IB_WC_COMP_SWAP, + IB_WC_FETCH_ADD, + IB_WC_LSO, + IB_WC_LOCAL_INV, + IB_WC_REG_MR, + IB_WC_MASKED_COMP_SWAP, + IB_WC_MASKED_FETCH_ADD, +/* + * Set value of IB_WC_RECV so consumers can test if a completion is a + * receive by testing (opcode & IB_WC_RECV). + */ + IB_WC_RECV = 1 << 7, + IB_WC_RECV_RDMA_WITH_IMM +}; + +enum ib_wc_flags { + IB_WC_GRH = 1, + IB_WC_WITH_IMM = (1<<1), + IB_WC_WITH_INVALIDATE = (1<<2), + IB_WC_IP_CSUM_OK = (1<<3), + IB_WC_WITH_SMAC = (1<<4), + IB_WC_WITH_VLAN = (1<<5), + IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), +}; + +struct ib_wc { + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; + enum ib_wc_status status; + enum ib_wc_opcode opcode; + u32 vendor_err; + u32 byte_len; + struct ib_qp *qp; + union { + __be32 imm_data; + u32 invalidate_rkey; + } ex; + u32 src_qp; + u32 slid; + int wc_flags; + u16 pkey_index; + u8 sl; + u8 dlid_path_bits; + u8 port_num; /* valid only for DR SMPs on switches */ + u8 smac[ETH_ALEN]; + u16 vlan_id; + u8 network_hdr_type; +}; + +enum ib_cq_notify_flags { + IB_CQ_SOLICITED = 1 << 0, + IB_CQ_NEXT_COMP = 1 << 1, + IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, + IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, +}; + +enum ib_srq_type { + IB_SRQT_BASIC, + IB_SRQT_XRC, + IB_SRQT_TM, +}; + +static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) +{ + return srq_type == IB_SRQT_XRC || + srq_type == IB_SRQT_TM; +} + +enum ib_srq_attr_mask { + IB_SRQ_MAX_WR = 1 << 0, + IB_SRQ_LIMIT = 1 << 1, +}; + +struct ib_srq_attr { + u32 max_wr; + u32 max_sge; + u32 srq_limit; +}; + +struct ib_srq_init_attr { + void (*event_handler)(struct ib_event *, void *); + void *srq_context; + struct ib_srq_attr attr; + enum ib_srq_type srq_type; + + struct { + struct ib_cq *cq; + union { + struct { + struct ib_xrcd *xrcd; + } xrc; + + struct { + u32 max_num_tags; + } tag_matching; + }; + } ext; +}; + +struct ib_qp_cap { + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 max_inline_data; + + /* + * Maximum number of rdma_rw_ctx structures in flight at a time. + * ib_create_qp() will calculate the right amount of neededed WRs + * and MRs based on this. 
+ */ + u32 max_rdma_ctxs; +}; + +enum ib_sig_type { + IB_SIGNAL_ALL_WR, + IB_SIGNAL_REQ_WR +}; + +enum ib_qp_type { + /* + * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries + * here (and in that order) since the MAD layer uses them as + * indices into a 2-entry table. + */ + IB_QPT_SMI, + IB_QPT_GSI, + + IB_QPT_RC, + IB_QPT_UC, + IB_QPT_UD, + IB_QPT_RAW_IPV6, + IB_QPT_RAW_ETHERTYPE, + IB_QPT_RAW_PACKET = 8, + IB_QPT_XRC_INI = 9, + IB_QPT_XRC_TGT, + IB_QPT_MAX, + IB_QPT_DRIVER = 0xFF, + /* Reserve a range for qp types internal to the low level driver. + * These qp types will not be visible at the IB core layer, so the + * IB_QPT_MAX usages should not be affected in the core layer + */ + IB_QPT_RESERVED1 = 0x1000, + IB_QPT_RESERVED2, + IB_QPT_RESERVED3, + IB_QPT_RESERVED4, + IB_QPT_RESERVED5, + IB_QPT_RESERVED6, + IB_QPT_RESERVED7, + IB_QPT_RESERVED8, + IB_QPT_RESERVED9, + IB_QPT_RESERVED10, +}; + +enum ib_qp_create_flags { + IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, + IB_QP_CREATE_MANAGED_SEND = 1 << 3, + IB_QP_CREATE_MANAGED_RECV = 1 << 4, + IB_QP_CREATE_NETIF_QP = 1 << 5, + IB_QP_CREATE_SIGNATURE_EN = 1 << 6, + /* FREE = 1 << 7, */ + IB_QP_CREATE_SCATTER_FCS = 1 << 8, + IB_QP_CREATE_CVLAN_STRIPPING = 1 << 9, + IB_QP_CREATE_SOURCE_QPN = 1 << 10, + IB_QP_CREATE_PCI_WRITE_END_PADDING = 1 << 11, + /* reserve bits 26-31 for low level drivers' internal use */ + IB_QP_CREATE_RESERVED_START = 1 << 26, + IB_QP_CREATE_RESERVED_END = 1 << 31, +}; + +/* + * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler + * callback to destroy the passed in QP. + */ + +struct ib_qp_init_attr { + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + struct ib_srq *srq; + struct ib_xrcd *xrcd; /* XRC TGT QPs only */ + struct ib_qp_cap cap; + enum ib_sig_type sq_sig_type; + enum ib_qp_type qp_type; + enum ib_qp_create_flags create_flags; + + /* + * Only needed for special QP types, or when using the RW API. 
+ */ + u8 port_num; + struct ib_rwq_ind_table *rwq_ind_tbl; + u32 source_qpn; +}; + +struct ib_qp_open_attr { + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + u32 qp_num; + enum ib_qp_type qp_type; +}; + +enum ib_rnr_timeout { + IB_RNR_TIMER_655_36 = 0, + IB_RNR_TIMER_000_01 = 1, + IB_RNR_TIMER_000_02 = 2, + IB_RNR_TIMER_000_03 = 3, + IB_RNR_TIMER_000_04 = 4, + IB_RNR_TIMER_000_06 = 5, + IB_RNR_TIMER_000_08 = 6, + IB_RNR_TIMER_000_12 = 7, + IB_RNR_TIMER_000_16 = 8, + IB_RNR_TIMER_000_24 = 9, + IB_RNR_TIMER_000_32 = 10, + IB_RNR_TIMER_000_48 = 11, + IB_RNR_TIMER_000_64 = 12, + IB_RNR_TIMER_000_96 = 13, + IB_RNR_TIMER_001_28 = 14, + IB_RNR_TIMER_001_92 = 15, + IB_RNR_TIMER_002_56 = 16, + IB_RNR_TIMER_003_84 = 17, + IB_RNR_TIMER_005_12 = 18, + IB_RNR_TIMER_007_68 = 19, + IB_RNR_TIMER_010_24 = 20, + IB_RNR_TIMER_015_36 = 21, + IB_RNR_TIMER_020_48 = 22, + IB_RNR_TIMER_030_72 = 23, + IB_RNR_TIMER_040_96 = 24, + IB_RNR_TIMER_061_44 = 25, + IB_RNR_TIMER_081_92 = 26, + IB_RNR_TIMER_122_88 = 27, + IB_RNR_TIMER_163_84 = 28, + IB_RNR_TIMER_245_76 = 29, + IB_RNR_TIMER_327_68 = 30, + IB_RNR_TIMER_491_52 = 31 +}; + +enum ib_qp_attr_mask { + IB_QP_STATE = 1, + IB_QP_CUR_STATE = (1<<1), + IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), + IB_QP_ACCESS_FLAGS = (1<<3), + IB_QP_PKEY_INDEX = (1<<4), + IB_QP_PORT = (1<<5), + IB_QP_QKEY = (1<<6), + IB_QP_AV = (1<<7), + IB_QP_PATH_MTU = (1<<8), + IB_QP_TIMEOUT = (1<<9), + IB_QP_RETRY_CNT = (1<<10), + IB_QP_RNR_RETRY = (1<<11), + IB_QP_RQ_PSN = (1<<12), + IB_QP_MAX_QP_RD_ATOMIC = (1<<13), + IB_QP_ALT_PATH = (1<<14), + IB_QP_MIN_RNR_TIMER = (1<<15), + IB_QP_SQ_PSN = (1<<16), + IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), + IB_QP_PATH_MIG_STATE = (1<<18), + IB_QP_CAP = (1<<19), + IB_QP_DEST_QPN = (1<<20), + IB_QP_RESERVED1 = (1<<21), + IB_QP_RESERVED2 = (1<<22), + IB_QP_RESERVED3 = (1<<23), + IB_QP_RESERVED4 = (1<<24), + IB_QP_RATE_LIMIT = (1<<25), +}; + +enum ib_qp_state { + IB_QPS_RESET, + IB_QPS_INIT, + IB_QPS_RTR, + IB_QPS_RTS, + IB_QPS_SQD, + IB_QPS_SQE, + IB_QPS_ERR +}; + +enum ib_mig_state { + IB_MIG_MIGRATED, + IB_MIG_REARM, + IB_MIG_ARMED +}; + +enum ib_mw_type { + IB_MW_TYPE_1 = 1, + IB_MW_TYPE_2 = 2 +}; + +struct ib_qp_attr { + enum ib_qp_state qp_state; + enum ib_qp_state cur_qp_state; + enum ib_mtu path_mtu; + enum ib_mig_state path_mig_state; + u32 qkey; + u32 rq_psn; + u32 sq_psn; + u32 dest_qp_num; + int qp_access_flags; + struct ib_qp_cap cap; + struct rdma_ah_attr ah_attr; + struct rdma_ah_attr alt_ah_attr; + u16 pkey_index; + u16 alt_pkey_index; + u8 en_sqd_async_notify; + u8 sq_draining; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + u8 min_rnr_timer; + u8 port_num; + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u8 alt_port_num; + u8 alt_timeout; + u32 rate_limit; +}; + +enum ib_wr_opcode { + IB_WR_RDMA_WRITE, + IB_WR_RDMA_WRITE_WITH_IMM, + IB_WR_SEND, + IB_WR_SEND_WITH_IMM, + IB_WR_RDMA_READ, + IB_WR_ATOMIC_CMP_AND_SWP, + IB_WR_ATOMIC_FETCH_AND_ADD, + IB_WR_LSO, + IB_WR_SEND_WITH_INV, + IB_WR_RDMA_READ_WITH_INV, + IB_WR_LOCAL_INV, + IB_WR_REG_MR, + IB_WR_MASKED_ATOMIC_CMP_AND_SWP, + IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, + IB_WR_REG_SIG_MR, + /* reserve values for low level drivers' internal use. + * These values will not be used at all in the ib core layer. 
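Tying the QP attribute structures and masks together: a hypothetical RC QP is created from an ib_qp_init_attr and then moved RESET to INIT with ib_modify_qp() (ib_create_qp(), ib_modify_qp() and ib_destroy_qp() are declared later in this header). The capacities, access flags, and pkey index below are illustrative only.

static struct ib_qp *foo_create_rc_qp(struct ib_pd *pd, struct ib_cq *cq,
				      u8 port_num)
{
	struct ib_qp_init_attr init_attr = {};
	struct ib_qp_attr attr = {};
	struct ib_qp *qp;
	int ret;

	init_attr.qp_type	   = IB_QPT_RC;
	init_attr.send_cq	   = cq;
	init_attr.recv_cq	   = cq;
	init_attr.sq_sig_type	   = IB_SIGNAL_REQ_WR;
	init_attr.cap.max_send_wr  = 64;
	init_attr.cap.max_recv_wr  = 64;
	init_attr.cap.max_send_sge = 1;
	init_attr.cap.max_recv_sge = 1;

	qp = ib_create_qp(pd, &init_attr);
	if (IS_ERR(qp))
		return qp;

	attr.qp_state	     = IB_QPS_INIT;
	attr.pkey_index	     = 0;
	attr.port_num	     = port_num;
	attr.qp_access_flags = IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;

	ret = ib_modify_qp(qp, &attr,
			   IB_QP_STATE | IB_QP_PKEY_INDEX |
			   IB_QP_PORT | IB_QP_ACCESS_FLAGS);
	if (ret) {
		ib_destroy_qp(qp);
		return ERR_PTR(ret);
	}

	return qp;
}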
+ */ + IB_WR_RESERVED1 = 0xf0, + IB_WR_RESERVED2, + IB_WR_RESERVED3, + IB_WR_RESERVED4, + IB_WR_RESERVED5, + IB_WR_RESERVED6, + IB_WR_RESERVED7, + IB_WR_RESERVED8, + IB_WR_RESERVED9, + IB_WR_RESERVED10, +}; + +enum ib_send_flags { + IB_SEND_FENCE = 1, + IB_SEND_SIGNALED = (1<<1), + IB_SEND_SOLICITED = (1<<2), + IB_SEND_INLINE = (1<<3), + IB_SEND_IP_CSUM = (1<<4), + + /* reserve bits 26-31 for low level drivers' internal use */ + IB_SEND_RESERVED_START = (1 << 26), + IB_SEND_RESERVED_END = (1 << 31), +}; + +struct ib_sge { + u64 addr; + u32 length; + u32 lkey; +}; + +struct ib_cqe { + void (*done)(struct ib_cq *cq, struct ib_wc *wc); +}; + +struct ib_send_wr { + struct ib_send_wr *next; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; + struct ib_sge *sg_list; + int num_sge; + enum ib_wr_opcode opcode; + int send_flags; + union { + __be32 imm_data; + u32 invalidate_rkey; + } ex; +}; + +struct ib_rdma_wr { + struct ib_send_wr wr; + u64 remote_addr; + u32 rkey; +}; + +static inline struct ib_rdma_wr *rdma_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_rdma_wr, wr); +} + +struct ib_atomic_wr { + struct ib_send_wr wr; + u64 remote_addr; + u64 compare_add; + u64 swap; + u64 compare_add_mask; + u64 swap_mask; + u32 rkey; +}; + +static inline struct ib_atomic_wr *atomic_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_atomic_wr, wr); +} + +struct ib_ud_wr { + struct ib_send_wr wr; + struct ib_ah *ah; + void *header; + int hlen; + int mss; + u32 remote_qpn; + u32 remote_qkey; + u16 pkey_index; /* valid for GSI only */ + u8 port_num; /* valid for DR SMPs on switch only */ +}; + +static inline struct ib_ud_wr *ud_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_ud_wr, wr); +} + +struct ib_reg_wr { + struct ib_send_wr wr; + struct ib_mr *mr; + u32 key; + int access; +}; + +static inline struct ib_reg_wr *reg_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_reg_wr, wr); +} + +struct ib_sig_handover_wr { + struct ib_send_wr wr; + struct ib_sig_attrs *sig_attrs; + struct ib_mr *sig_mr; + int access_flags; + struct ib_sge *prot; +}; + +static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_sig_handover_wr, wr); +} + +struct ib_recv_wr { + struct ib_recv_wr *next; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; + struct ib_sge *sg_list; + int num_sge; +}; + +enum ib_access_flags { + IB_ACCESS_LOCAL_WRITE = 1, + IB_ACCESS_REMOTE_WRITE = (1<<1), + IB_ACCESS_REMOTE_READ = (1<<2), + IB_ACCESS_REMOTE_ATOMIC = (1<<3), + IB_ACCESS_MW_BIND = (1<<4), + IB_ZERO_BASED = (1<<5), + IB_ACCESS_ON_DEMAND = (1<<6), + IB_ACCESS_HUGETLB = (1<<7), +}; + +/* + * XXX: these are apparently used for ->rereg_user_mr, no idea why they + * are hidden here instead of a uapi header! + */ +enum ib_mr_rereg_flags { + IB_MR_REREG_TRANS = 1, + IB_MR_REREG_PD = (1<<1), + IB_MR_REREG_ACCESS = (1<<2), + IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) +}; + +struct ib_fmr_attr { + int max_pages; + int max_maps; + u8 page_shift; +}; + +struct ib_umem; + +enum rdma_remove_reason { + /* Userspace requested uobject deletion. Call could fail */ + RDMA_REMOVE_DESTROY, + /* Context deletion. This call should delete the actual object itself */ + RDMA_REMOVE_CLOSE, + /* Driver is being hot-unplugged. 
This call should delete the actual object itself */ + RDMA_REMOVE_DRIVER_REMOVE, + /* Context is being cleaned-up, but commit was just completed */ + RDMA_REMOVE_DURING_CLEANUP, +}; + +struct ib_rdmacg_object { +#ifdef CONFIG_CGROUP_RDMA + struct rdma_cgroup *cg; /* owner rdma cgroup */ +#endif +}; + +struct ib_ucontext { + struct ib_device *device; + struct ib_uverbs_file *ufile; + int closing; + + /* locking the uobjects_list */ + struct mutex uobjects_lock; + struct list_head uobjects; + /* protects cleanup process from other actions */ + struct rw_semaphore cleanup_rwsem; + enum rdma_remove_reason cleanup_reason; + + struct pid *tgid; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct rb_root_cached umem_tree; + /* + * Protects .umem_rbroot and tree, as well as odp_mrs_count and + * mmu notifiers registration. + */ + struct rw_semaphore umem_rwsem; + void (*invalidate_range)(struct ib_umem *umem, + unsigned long start, unsigned long end); + + struct mmu_notifier mn; + atomic_t notifier_count; + /* A list of umems that don't have private mmu notifier counters yet. */ + struct list_head no_private_counters; + int odp_mrs_count; +#endif + + struct ib_rdmacg_object cg_obj; +}; + +struct ib_uobject { + u64 user_handle; /* handle given to us by userspace */ + struct ib_ucontext *context; /* associated user context */ + void *object; /* containing object */ + struct list_head list; /* link to context's list */ + struct ib_rdmacg_object cg_obj; /* rdmacg object */ + int id; /* index into kernel idr */ + struct kref ref; + atomic_t usecnt; /* protects exclusive access */ + struct rcu_head rcu; /* kfree_rcu() overhead */ + + const struct uverbs_obj_type *type; +}; + +struct ib_uobject_file { + struct ib_uobject uobj; + /* ufile contains the lock between context release and file close */ + struct ib_uverbs_file *ufile; +}; + +struct ib_udata { + const void __user *inbuf; + void __user *outbuf; + size_t inlen; + size_t outlen; +}; + +struct ib_pd { + u32 local_dma_lkey; + u32 flags; + struct ib_device *device; + struct ib_uobject *uobject; + atomic_t usecnt; /* count all resources */ + + u32 unsafe_global_rkey; + + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct ib_mr *__internal_mr; + struct rdma_restrack_entry res; +}; + +struct ib_xrcd { + struct ib_device *device; + atomic_t usecnt; /* count all exposed resources */ + struct inode *inode; + + struct mutex tgt_qp_mutex; + struct list_head tgt_qp_list; +}; + +struct ib_ah { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; + enum rdma_ah_attr_type type; +}; + +typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); + +enum ib_poll_context { + IB_POLL_DIRECT, /* caller context, no hw completions */ + IB_POLL_SOFTIRQ, /* poll from softirq context */ + IB_POLL_WORKQUEUE, /* poll from workqueue */ +}; + +struct ib_cq { + struct ib_device *device; + struct ib_uobject *uobject; + ib_comp_handler comp_handler; + void (*event_handler)(struct ib_event *, void *); + void *cq_context; + int cqe; + atomic_t usecnt; /* count number of work queues */ + enum ib_poll_context poll_ctx; + struct ib_wc *wc; + union { + struct irq_poll iop; + struct work_struct work; + }; + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct rdma_restrack_entry res; +}; + +struct ib_srq { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; + void (*event_handler)(struct ib_event *, void *); + void *srq_context; + enum ib_srq_type srq_type; + 
atomic_t usecnt; + + struct { + struct ib_cq *cq; + union { + struct { + struct ib_xrcd *xrcd; + u32 srq_num; + } xrc; + }; + } ext; +}; + +enum ib_raw_packet_caps { + /* Strip cvlan from incoming packet and report it in the matching work + * completion is supported. + */ + IB_RAW_PACKET_CAP_CVLAN_STRIPPING = (1 << 0), + /* Scatter FCS field of an incoming packet to host memory is supported. + */ + IB_RAW_PACKET_CAP_SCATTER_FCS = (1 << 1), + /* Checksum offloads are supported (for both send and receive). */ + IB_RAW_PACKET_CAP_IP_CSUM = (1 << 2), + /* When a packet is received for an RQ with no receive WQEs, the + * packet processing is delayed. + */ + IB_RAW_PACKET_CAP_DELAY_DROP = (1 << 3), +}; + +enum ib_wq_type { + IB_WQT_RQ +}; + +enum ib_wq_state { + IB_WQS_RESET, + IB_WQS_RDY, + IB_WQS_ERR +}; + +struct ib_wq { + struct ib_device *device; + struct ib_uobject *uobject; + void *wq_context; + void (*event_handler)(struct ib_event *, void *); + struct ib_pd *pd; + struct ib_cq *cq; + u32 wq_num; + enum ib_wq_state state; + enum ib_wq_type wq_type; + atomic_t usecnt; +}; + +enum ib_wq_flags { + IB_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, + IB_WQ_FLAGS_SCATTER_FCS = 1 << 1, + IB_WQ_FLAGS_DELAY_DROP = 1 << 2, + IB_WQ_FLAGS_PCI_WRITE_END_PADDING = 1 << 3, +}; + +struct ib_wq_init_attr { + void *wq_context; + enum ib_wq_type wq_type; + u32 max_wr; + u32 max_sge; + struct ib_cq *cq; + void (*event_handler)(struct ib_event *, void *); + u32 create_flags; /* Use enum ib_wq_flags */ +}; + +enum ib_wq_attr_mask { + IB_WQ_STATE = 1 << 0, + IB_WQ_CUR_STATE = 1 << 1, + IB_WQ_FLAGS = 1 << 2, +}; + +struct ib_wq_attr { + enum ib_wq_state wq_state; + enum ib_wq_state curr_wq_state; + u32 flags; /* Use enum ib_wq_flags */ + u32 flags_mask; /* Use enum ib_wq_flags */ +}; + +struct ib_rwq_ind_table { + struct ib_device *device; + struct ib_uobject *uobject; + atomic_t usecnt; + u32 ind_tbl_num; + u32 log_ind_tbl_size; + struct ib_wq **ind_tbl; +}; + +struct ib_rwq_ind_table_init_attr { + u32 log_ind_tbl_size; + /* Each entry is a pointer to Receive Work Queue */ + struct ib_wq **ind_tbl; +}; + +enum port_pkey_state { + IB_PORT_PKEY_NOT_VALID = 0, + IB_PORT_PKEY_VALID = 1, + IB_PORT_PKEY_LISTED = 2, +}; + +struct ib_qp_security; + +struct ib_port_pkey { + enum port_pkey_state state; + u16 pkey_index; + u8 port_num; + struct list_head qp_list; + struct list_head to_error_list; + struct ib_qp_security *sec; +}; + +struct ib_ports_pkeys { + struct ib_port_pkey main; + struct ib_port_pkey alt; +}; + +struct ib_qp_security { + struct ib_qp *qp; + struct ib_device *dev; + /* Hold this mutex when changing port and pkey settings. */ + struct mutex mutex; + struct ib_ports_pkeys *ports_pkeys; + /* A list of all open shared QP handles. Required to enforce security + * properly for all users of a shared QP. + */ + struct list_head shared_qp_list; + void *security; + bool destroying; + atomic_t error_list_count; + struct completion error_complete; + int error_comps_pending; +}; + +/* + * @max_write_sge: Maximum SGE elements per RDMA WRITE request. + * @max_read_sge: Maximum SGE elements per RDMA READ request. 
+ */ +struct ib_qp { + struct ib_device *device; + struct ib_pd *pd; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + spinlock_t mr_lock; + int mrs_used; + struct list_head rdma_mrs; + struct list_head sig_mrs; + struct ib_srq *srq; + struct ib_xrcd *xrcd; /* XRC TGT QPs only */ + struct list_head xrcd_list; + + /* count times opened, mcast attaches, flow attaches */ + atomic_t usecnt; + struct list_head open_list; + struct ib_qp *real_qp; + struct ib_uobject *uobject; + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + u32 qp_num; + u32 max_write_sge; + u32 max_read_sge; + enum ib_qp_type qp_type; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_qp_security *qp_sec; + u8 port; + + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct rdma_restrack_entry res; +}; + +struct ib_dm { + struct ib_device *device; + u32 length; + u32 flags; + struct ib_uobject *uobject; + atomic_t usecnt; +}; + +struct ib_mr { + struct ib_device *device; + struct ib_pd *pd; + u32 lkey; + u32 rkey; + u64 iova; + u64 length; + unsigned int page_size; + bool need_inval; + union { + struct ib_uobject *uobject; /* user */ + struct list_head qp_entry; /* FR */ + }; + + struct ib_dm *dm; + + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct rdma_restrack_entry res; +}; + +struct ib_mw { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; + u32 rkey; + enum ib_mw_type type; +}; + +struct ib_fmr { + struct ib_device *device; + struct ib_pd *pd; + struct list_head list; + u32 lkey; + u32 rkey; +}; + +/* Supported steering options */ +enum ib_flow_attr_type { + /* steering according to rule specifications */ + IB_FLOW_ATTR_NORMAL = 0x0, + /* default unicast and multicast rule - + * receive all Eth traffic which isn't steered to any QP + */ + IB_FLOW_ATTR_ALL_DEFAULT = 0x1, + /* default multicast rule - + * receive all Eth multicast traffic which isn't steered to any QP + */ + IB_FLOW_ATTR_MC_DEFAULT = 0x2, + /* sniffer rule - receive all port traffic */ + IB_FLOW_ATTR_SNIFFER = 0x3 +}; + +/* Supported steering header types */ +enum ib_flow_spec_type { + /* L2 headers*/ + IB_FLOW_SPEC_ETH = 0x20, + IB_FLOW_SPEC_IB = 0x22, + /* L3 header*/ + IB_FLOW_SPEC_IPV4 = 0x30, + IB_FLOW_SPEC_IPV6 = 0x31, + IB_FLOW_SPEC_ESP = 0x34, + /* L4 headers*/ + IB_FLOW_SPEC_TCP = 0x40, + IB_FLOW_SPEC_UDP = 0x41, + IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50, + IB_FLOW_SPEC_INNER = 0x100, + /* Actions */ + IB_FLOW_SPEC_ACTION_TAG = 0x1000, + IB_FLOW_SPEC_ACTION_DROP = 0x1001, + IB_FLOW_SPEC_ACTION_HANDLE = 0x1002, +}; +#define IB_FLOW_SPEC_LAYER_MASK 0xF0 +#define IB_FLOW_SPEC_SUPPORT_LAYERS 8 + +/* Flow steering rule priority is set according to it's domain. + * Lower domain value means higher priority. 
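/*
 * A sketch of what the trailing comment in struct ib_flow_attr means: the
 * spec structures are laid out in memory directly after the attribute header.
 * This assumes the usual ib_create_flow() consumer entry point and ETH_ALEN
 * from <linux/if_ether.h>; the exact-match destination MAC rule is only an
 * example.
 */
struct example_eth_rule {
	struct ib_flow_attr     attr;
	struct ib_flow_spec_eth eth;
};

static struct ib_flow *example_steer_dmac(struct ib_qp *qp, const u8 *dmac)
{
	struct example_eth_rule rule = {
		.attr = {
			.type         = IB_FLOW_ATTR_NORMAL,
			.size         = sizeof(rule),
			.num_of_specs = 1,
			.port         = 1,
		},
		.eth = {
			.type = IB_FLOW_SPEC_ETH,
			.size = sizeof(rule.eth),
		},
	};

	memcpy(rule.eth.val.dst_mac, dmac, ETH_ALEN);
	memset(rule.eth.mask.dst_mac, 0xff, ETH_ALEN);	/* exact match */

	return ib_create_flow(qp, &rule.attr, IB_FLOW_DOMAIN_USER);
}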
+ */ +enum ib_flow_domain { + IB_FLOW_DOMAIN_USER, + IB_FLOW_DOMAIN_ETHTOOL, + IB_FLOW_DOMAIN_RFS, + IB_FLOW_DOMAIN_NIC, + IB_FLOW_DOMAIN_NUM /* Must be last */ +}; + +enum ib_flow_flags { + IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ + IB_FLOW_ATTR_FLAGS_EGRESS = 1UL << 2, /* Egress flow */ + IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 3 /* Must be last */ +}; + +struct ib_flow_eth_filter { + u8 dst_mac[6]; + u8 src_mac[6]; + __be16 ether_type; + __be16 vlan_tag; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_eth { + u32 type; + u16 size; + struct ib_flow_eth_filter val; + struct ib_flow_eth_filter mask; +}; + +struct ib_flow_ib_filter { + __be16 dlid; + __u8 sl; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_ib { + u32 type; + u16 size; + struct ib_flow_ib_filter val; + struct ib_flow_ib_filter mask; +}; + +/* IPv4 header flags */ +enum ib_ipv4_flags { + IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */ + IB_IPV4_MORE_FRAG = 0X4 /* For All fragmented packets except the + last have this flag set */ +}; + +struct ib_flow_ipv4_filter { + __be32 src_ip; + __be32 dst_ip; + u8 proto; + u8 tos; + u8 ttl; + u8 flags; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_ipv4 { + u32 type; + u16 size; + struct ib_flow_ipv4_filter val; + struct ib_flow_ipv4_filter mask; +}; + +struct ib_flow_ipv6_filter { + u8 src_ip[16]; + u8 dst_ip[16]; + __be32 flow_label; + u8 next_hdr; + u8 traffic_class; + u8 hop_limit; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_ipv6 { + u32 type; + u16 size; + struct ib_flow_ipv6_filter val; + struct ib_flow_ipv6_filter mask; +}; + +struct ib_flow_tcp_udp_filter { + __be16 dst_port; + __be16 src_port; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_tcp_udp { + u32 type; + u16 size; + struct ib_flow_tcp_udp_filter val; + struct ib_flow_tcp_udp_filter mask; +}; + +struct ib_flow_tunnel_filter { + __be32 tunnel_id; + u8 real_sz[0]; +}; + +/* ib_flow_spec_tunnel describes the Vxlan tunnel + * the tunnel_id from val has the vni value + */ +struct ib_flow_spec_tunnel { + u32 type; + u16 size; + struct ib_flow_tunnel_filter val; + struct ib_flow_tunnel_filter mask; +}; + +struct ib_flow_esp_filter { + __be32 spi; + __be32 seq; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_esp { + u32 type; + u16 size; + struct ib_flow_esp_filter val; + struct ib_flow_esp_filter mask; +}; + +struct ib_flow_spec_action_tag { + enum ib_flow_spec_type type; + u16 size; + u32 tag_id; +}; + +struct ib_flow_spec_action_drop { + enum ib_flow_spec_type type; + u16 size; +}; + +struct ib_flow_spec_action_handle { + enum ib_flow_spec_type type; + u16 size; + struct ib_flow_action *act; +}; + +union ib_flow_spec { + struct { + u32 type; + u16 size; + }; + struct ib_flow_spec_eth eth; + struct ib_flow_spec_ib ib; + struct ib_flow_spec_ipv4 ipv4; + struct ib_flow_spec_tcp_udp tcp_udp; + struct ib_flow_spec_ipv6 ipv6; + struct ib_flow_spec_tunnel tunnel; + struct ib_flow_spec_esp esp; + struct ib_flow_spec_action_tag flow_tag; + struct ib_flow_spec_action_drop drop; + struct ib_flow_spec_action_handle action; +}; + +struct ib_flow_attr { + enum ib_flow_attr_type type; + u16 size; + u16 priority; + u32 flags; + u8 num_of_specs; + u8 port; + /* Following are the optional layers according to user request + * struct ib_flow_spec_xxx + * struct ib_flow_spec_yyy + */ +}; + +struct ib_flow { + struct ib_qp *qp; + struct ib_uobject *uobject; +}; + +enum ib_flow_action_type { + 
IB_FLOW_ACTION_UNSPECIFIED, + IB_FLOW_ACTION_ESP = 1, +}; + +struct ib_flow_action_attrs_esp_keymats { + enum ib_uverbs_flow_action_esp_keymat protocol; + union { + struct ib_uverbs_flow_action_esp_keymat_aes_gcm aes_gcm; + } keymat; +}; + +struct ib_flow_action_attrs_esp_replays { + enum ib_uverbs_flow_action_esp_replay protocol; + union { + struct ib_uverbs_flow_action_esp_replay_bmp bmp; + } replay; +}; + +enum ib_flow_action_attrs_esp_flags { + /* All user-space flags at the top: Use enum ib_uverbs_flow_action_esp_flags + * This is done in order to share the same flags between user-space and + * kernel and spare an unnecessary translation. + */ + + /* Kernel flags */ + IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED = 1ULL << 32, + IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS = 1ULL << 33, +}; + +struct ib_flow_spec_list { + struct ib_flow_spec_list *next; + union ib_flow_spec spec; +}; + +struct ib_flow_action_attrs_esp { + struct ib_flow_action_attrs_esp_keymats *keymat; + struct ib_flow_action_attrs_esp_replays *replay; + struct ib_flow_spec_list *encap; + /* Used only if IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED is enabled. + * Value of 0 is a valid value. + */ + u32 esn; + u32 spi; + u32 seq; + u32 tfc_pad; + /* Use enum ib_flow_action_attrs_esp_flags */ + u64 flags; + u64 hard_limit_pkts; +}; + +struct ib_flow_action { + struct ib_device *device; + struct ib_uobject *uobject; + enum ib_flow_action_type type; + atomic_t usecnt; +}; + +struct ib_mad_hdr; +struct ib_grh; + +enum ib_process_mad_flags { + IB_MAD_IGNORE_MKEY = 1, + IB_MAD_IGNORE_BKEY = 2, + IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY +}; + +enum ib_mad_result { + IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ + IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ + IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ + IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ +}; + +struct ib_port_cache { + u64 subnet_prefix; + struct ib_pkey_cache *pkey; + struct ib_gid_table *gid; + u8 lmc; + enum ib_port_state port_state; +}; + +struct ib_cache { + rwlock_t lock; + struct ib_event_handler event_handler; + struct ib_port_cache *ports; +}; + +struct iw_cm_verbs; + +struct ib_port_immutable { + int pkey_tbl_len; + int gid_tbl_len; + u32 core_cap_flags; + u32 max_mad_size; +}; + +/* rdma netdev type - specifies protocol type */ +enum rdma_netdev_t { + RDMA_NETDEV_OPA_VNIC, + RDMA_NETDEV_IPOIB, +}; + +/** + * struct rdma_netdev - rdma netdev + * For cases where netstack interfacing is required. + */ +struct rdma_netdev { + void *clnt_priv; + struct ib_device *hca; + u8 port_num; + + /* cleanup function must be specified */ + void (*free_rdma_netdev)(struct net_device *netdev); + + /* control functions */ + void (*set_id)(struct net_device *netdev, int id); + /* send packet */ + int (*send)(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn); + /* multicast */ + int (*attach_mcast)(struct net_device *dev, struct ib_device *hca, + union ib_gid *gid, u16 mlid, + int set_qkey, u32 qkey); + int (*detach_mcast)(struct net_device *dev, struct ib_device *hca, + union ib_gid *gid, u16 mlid); +}; + +struct ib_port_pkey_list { + /* Lock to hold while modifying the list. */ + spinlock_t list_lock; + struct list_head pkey_list; +}; + +struct uverbs_attr_bundle; + +struct ib_device { + /* Do not access @dma_device directly from ULP nor from HW drivers. 
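/*
 * A minimal sketch of filling struct ib_port_immutable from a driver's
 * get_port_immutable() hook (see the ib_device callback table below) for an
 * IB-only port.  RDMA_CORE_PORT_IBA_IB and IB_MGMT_MAD_SIZE come from
 * elsewhere in the RDMA headers, and the "exdrv" prefix is a hypothetical
 * driver name.
 */
static int exdrv_port_immutable(struct ib_device *ibdev, u8 port_num,
				struct ib_port_immutable *immutable)
{
	struct ib_port_attr attr;
	int err;

	err = ib_query_port(ibdev, port_num, &attr);
	if (err)
		return err;

	immutable->pkey_tbl_len   = attr.pkey_tbl_len;
	immutable->gid_tbl_len    = attr.gid_tbl_len;
	immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
	immutable->max_mad_size   = IB_MGMT_MAD_SIZE;
	return 0;
}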
*/ + struct device *dma_device; + + char name[IB_DEVICE_NAME_MAX]; + + struct list_head event_handler_list; + spinlock_t event_handler_lock; + + spinlock_t client_data_lock; + struct list_head core_list; + /* Access to the client_data_list is protected by the client_data_lock + * spinlock and the lists_rwsem read-write semaphore */ + struct list_head client_data_list; + + struct ib_cache cache; + /** + * port_immutable is indexed by port number + */ + struct ib_port_immutable *port_immutable; + + int num_comp_vectors; + + struct ib_port_pkey_list *port_pkey_list; + + struct iw_cm_verbs *iwcm; + + /** + * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the + * driver initialized data. The struct is kfree()'ed by the sysfs + * core when the device is removed. A lifespan of -1 in the return + * struct tells the core to set a default lifespan. + */ + struct rdma_hw_stats *(*alloc_hw_stats)(struct ib_device *device, + u8 port_num); + /** + * get_hw_stats - Fill in the counter value(s) in the stats struct. + * @index - The index in the value array we wish to have updated, or + * num_counters if we want all stats updated + * Return codes - + * < 0 - Error, no counters updated + * index - Updated the single counter pointed to by index + * num_counters - Updated all counters (will reset the timestamp + * and prevent further calls for lifespan milliseconds) + * Drivers are allowed to update all counters in leiu of just the + * one given in index at their option + */ + int (*get_hw_stats)(struct ib_device *device, + struct rdma_hw_stats *stats, + u8 port, int index); + int (*query_device)(struct ib_device *device, + struct ib_device_attr *device_attr, + struct ib_udata *udata); + int (*query_port)(struct ib_device *device, + u8 port_num, + struct ib_port_attr *port_attr); + enum rdma_link_layer (*get_link_layer)(struct ib_device *device, + u8 port_num); + /* When calling get_netdev, the HW vendor's driver should return the + * net device of device @device at port @port_num or NULL if such + * a net device doesn't exist. The vendor driver should call dev_hold + * on this net device. The HW vendor's device driver must guarantee + * that this function returns NULL before the net device has finished + * NETDEV_UNREGISTER state. + */ + struct net_device *(*get_netdev)(struct ib_device *device, + u8 port_num); + /* query_gid should be return GID value for @device, when @port_num + * link layer is either IB or iWarp. It is no-op if @port_num port + * is RoCE link layer. + */ + int (*query_gid)(struct ib_device *device, + u8 port_num, int index, + union ib_gid *gid); + /* When calling add_gid, the HW vendor's driver should add the gid + * of device of port at gid index available at @attr. Meta-info of + * that gid (for example, the network device related to this gid) is + * available at @attr. @context allows the HW vendor driver to store + * extra information together with a GID entry. The HW vendor driver may + * allocate memory to contain this information and store it in @context + * when a new GID entry is written to. Params are consistent until the + * next call of add_gid or delete_gid. The function should return 0 on + * success or error otherwise. The function could be called + * concurrently for different ports. This function is only called when + * roce_gid_table is used. 
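/*
 * A sketch of the hw_stats hook pair documented above, assuming the
 * rdma_hw_stats helpers defined earlier in this header
 * (rdma_alloc_hw_stats_struct() and RDMA_HW_STATS_DEFAULT_LIFESPAN).  The
 * counter names and the "exdrv" prefix are hypothetical.
 */
static const char * const exdrv_counter_names[] = {
	"rx_packets",
	"tx_packets",
};

static struct rdma_hw_stats *exdrv_alloc_hw_stats(struct ib_device *ibdev,
						  u8 port_num)
{
	return rdma_alloc_hw_stats_struct(exdrv_counter_names,
					  ARRAY_SIZE(exdrv_counter_names),
					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
}

static int exdrv_get_hw_stats(struct ib_device *ibdev,
			      struct rdma_hw_stats *stats,
			      u8 port_num, int index)
{
	int i;

	/* Refresh everything; returning num_counters resets the lifespan. */
	for (i = 0; i < stats->num_counters; i++)
		stats->value[i] = 0;	/* replace with a real hardware read */

	return stats->num_counters;
}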
+ */ + int (*add_gid)(const union ib_gid *gid, + const struct ib_gid_attr *attr, + void **context); + /* When calling del_gid, the HW vendor's driver should delete the + * gid of device @device at gid index gid_index of port port_num + * available in @attr. + * Upon the deletion of a GID entry, the HW vendor must free any + * allocated memory. The caller will clear @context afterwards. + * This function is only called when roce_gid_table is used. + */ + int (*del_gid)(const struct ib_gid_attr *attr, + void **context); + int (*query_pkey)(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey); + int (*modify_device)(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify); + int (*modify_port)(struct ib_device *device, + u8 port_num, int port_modify_mask, + struct ib_port_modify *port_modify); + struct ib_ucontext * (*alloc_ucontext)(struct ib_device *device, + struct ib_udata *udata); + int (*dealloc_ucontext)(struct ib_ucontext *context); + int (*mmap)(struct ib_ucontext *context, + struct vm_area_struct *vma); + struct ib_pd * (*alloc_pd)(struct ib_device *device, + struct ib_ucontext *context, + struct ib_udata *udata); + int (*dealloc_pd)(struct ib_pd *pd); + struct ib_ah * (*create_ah)(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); + int (*modify_ah)(struct ib_ah *ah, + struct rdma_ah_attr *ah_attr); + int (*query_ah)(struct ib_ah *ah, + struct rdma_ah_attr *ah_attr); + int (*destroy_ah)(struct ib_ah *ah); + struct ib_srq * (*create_srq)(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + int (*modify_srq)(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata); + int (*query_srq)(struct ib_srq *srq, + struct ib_srq_attr *srq_attr); + int (*destroy_srq)(struct ib_srq *srq); + int (*post_srq_recv)(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + struct ib_qp * (*create_qp)(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); + int (*modify_qp)(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata); + int (*query_qp)(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); + int (*destroy_qp)(struct ib_qp *qp); + int (*post_send)(struct ib_qp *qp, + struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr); + int (*post_recv)(struct ib_qp *qp, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + struct ib_cq * (*create_cq)(struct ib_device *device, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); + int (*modify_cq)(struct ib_cq *cq, u16 cq_count, + u16 cq_period); + int (*destroy_cq)(struct ib_cq *cq); + int (*resize_cq)(struct ib_cq *cq, int cqe, + struct ib_udata *udata); + int (*poll_cq)(struct ib_cq *cq, int num_entries, + struct ib_wc *wc); + int (*peek_cq)(struct ib_cq *cq, int wc_cnt); + int (*req_notify_cq)(struct ib_cq *cq, + enum ib_cq_notify_flags flags); + int (*req_ncomp_notif)(struct ib_cq *cq, + int wc_cnt); + struct ib_mr * (*get_dma_mr)(struct ib_pd *pd, + int mr_access_flags); + struct ib_mr * (*reg_user_mr)(struct ib_pd *pd, + u64 start, u64 length, + u64 virt_addr, + int mr_access_flags, + struct ib_udata *udata); + int (*rereg_user_mr)(struct ib_mr *mr, + int flags, + u64 start, u64 length, + u64 virt_addr, + int mr_access_flags, + struct ib_pd *pd, + struct ib_udata 
*udata); + int (*dereg_mr)(struct ib_mr *mr); + struct ib_mr * (*alloc_mr)(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); + int (*map_mr_sg)(struct ib_mr *mr, + struct scatterlist *sg, + int sg_nents, + unsigned int *sg_offset); + struct ib_mw * (*alloc_mw)(struct ib_pd *pd, + enum ib_mw_type type, + struct ib_udata *udata); + int (*dealloc_mw)(struct ib_mw *mw); + struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + int (*map_phys_fmr)(struct ib_fmr *fmr, + u64 *page_list, int list_len, + u64 iova); + int (*unmap_fmr)(struct list_head *fmr_list); + int (*dealloc_fmr)(struct ib_fmr *fmr); + int (*attach_mcast)(struct ib_qp *qp, + union ib_gid *gid, + u16 lid); + int (*detach_mcast)(struct ib_qp *qp, + union ib_gid *gid, + u16 lid); + int (*process_mad)(struct ib_device *device, + int process_mad_flags, + u8 port_num, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, + size_t in_mad_size, + struct ib_mad_hdr *out_mad, + size_t *out_mad_size, + u16 *out_mad_pkey_index); + struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, + struct ib_ucontext *ucontext, + struct ib_udata *udata); + int (*dealloc_xrcd)(struct ib_xrcd *xrcd); + struct ib_flow * (*create_flow)(struct ib_qp *qp, + struct ib_flow_attr + *flow_attr, + int domain); + int (*destroy_flow)(struct ib_flow *flow_id); + int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status); + void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); + void (*drain_rq)(struct ib_qp *qp); + void (*drain_sq)(struct ib_qp *qp); + int (*set_vf_link_state)(struct ib_device *device, int vf, u8 port, + int state); + int (*get_vf_config)(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *ivf); + int (*get_vf_stats)(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats); + int (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid, + int type); + struct ib_wq * (*create_wq)(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); + int (*destroy_wq)(struct ib_wq *wq); + int (*modify_wq)(struct ib_wq *wq, + struct ib_wq_attr *attr, + u32 wq_attr_mask, + struct ib_udata *udata); + struct ib_rwq_ind_table * (*create_rwq_ind_table)(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); + int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); + struct ib_flow_action * (*create_flow_action_esp)(struct ib_device *device, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs); + int (*destroy_flow_action)(struct ib_flow_action *action); + int (*modify_flow_action_esp)(struct ib_flow_action *action, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs); + struct ib_dm * (*alloc_dm)(struct ib_device *device, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs); + int (*dealloc_dm)(struct ib_dm *dm); + struct ib_mr * (*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm, + struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs); + /** + * rdma netdev operation + * + * Driver implementing alloc_rdma_netdev must return -EOPNOTSUPP if it + * doesn't support the specified rdma netdev type. 
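/*
 * In this version of the API a hardware driver fills the callback pointers in
 * struct ib_device directly and then calls ib_register_device(), declared a
 * little further down.  A heavily trimmed sketch; the exdrv_* callbacks are
 * hypothetical and a real driver must supply every verb it advertises through
 * uverbs_cmd_mask.
 */
static struct ib_device *exdrv_register(void)
{
	struct ib_device *ibdev;

	ibdev = ib_alloc_device(sizeof(*ibdev));
	if (!ibdev)
		return NULL;

	strlcpy(ibdev->name, "exdrv%d", IB_DEVICE_NAME_MAX);
	ibdev->owner              = THIS_MODULE;
	ibdev->phys_port_cnt      = 1;
	ibdev->num_comp_vectors   = 1;
	ibdev->query_device       = exdrv_query_device;
	ibdev->query_port         = exdrv_query_port;
	ibdev->get_port_immutable = exdrv_port_immutable;
	ibdev->alloc_pd           = exdrv_alloc_pd;
	ibdev->dealloc_pd         = exdrv_dealloc_pd;

	if (ib_register_device(ibdev, NULL)) {
		ib_dealloc_device(ibdev);
		return NULL;
	}
	return ibdev;
}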
+ */ + struct net_device *(*alloc_rdma_netdev)( + struct ib_device *device, + u8 port_num, + enum rdma_netdev_t type, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)); + + struct module *owner; + struct device dev; + struct kobject *ports_parent; + struct list_head port_list; + + enum { + IB_DEV_UNINITIALIZED, + IB_DEV_REGISTERED, + IB_DEV_UNREGISTERED + } reg_state; + + int uverbs_abi_ver; + u64 uverbs_cmd_mask; + u64 uverbs_ex_cmd_mask; + + char node_desc[IB_DEVICE_NODE_DESC_MAX]; + __be64 node_guid; + u32 local_dma_lkey; + u16 is_switch:1; + u8 node_type; + u8 phys_port_cnt; + struct ib_device_attr attrs; + struct attribute_group *hw_stats_ag; + struct rdma_hw_stats *hw_stats; + +#ifdef CONFIG_CGROUP_RDMA + struct rdmacg_device cg_device; +#endif + + u32 index; + /* + * Implementation details of the RDMA core, don't use in drivers + */ + struct rdma_restrack_root res; + + /** + * The following mandatory functions are used only at device + * registration. Keep functions such as these at the end of this + * structure to avoid cache line misses when accessing struct ib_device + * in fast paths. + */ + int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); + void (*get_dev_fw_str)(struct ib_device *, char *str); + const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, + int comp_vector); + + struct uverbs_root_spec *specs_root; + enum rdma_driver_id driver_id; +}; + +struct ib_client { + char *name; + void (*add) (struct ib_device *); + void (*remove)(struct ib_device *, void *client_data); + + /* Returns the net_dev belonging to this ib_client and matching the + * given parameters. + * @dev: An RDMA device that the net_dev use for communication. + * @port: A physical port number on the RDMA device. + * @pkey: P_Key that the net_dev uses if applicable. + * @gid: A GID that the net_dev uses to communicate. + * @addr: An IP address the net_dev is configured with. + * @client_data: The device's client data set by ib_set_client_data(). + * + * An ib_client that implements a net_dev on top of RDMA devices + * (such as IP over IB) should implement this callback, allowing the + * rdma_cm module to find the right net_dev for a given request. + * + * The caller is responsible for calling dev_put on the returned + * netdev. */ + struct net_device *(*get_net_dev_by_params)( + struct ib_device *dev, + u8 port, + u16 pkey, + const union ib_gid *gid, + const struct sockaddr *addr, + void *client_data); + struct list_head list; +}; + +struct ib_device *ib_alloc_device(size_t size); +void ib_dealloc_device(struct ib_device *device); + +void ib_get_device_fw_str(struct ib_device *device, char *str); + +int ib_register_device(struct ib_device *device, + int (*port_callback)(struct ib_device *, + u8, struct kobject *)); +void ib_unregister_device(struct ib_device *device); + +int ib_register_client (struct ib_client *client); +void ib_unregister_client(struct ib_client *client); + +void *ib_get_client_data(struct ib_device *device, struct ib_client *client); +void ib_set_client_data(struct ib_device *device, struct ib_client *client, + void *data); + +static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) +{ + return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; +} + +static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) +{ + return copy_to_user(udata->outbuf, src, len) ? 
-EFAULT : 0; +} + +static inline bool ib_is_buffer_cleared(const void __user *p, + size_t len) +{ + bool ret; + u8 *buf; + + if (len > USHRT_MAX) + return false; + + buf = memdup_user(p, len); + if (IS_ERR(buf)) + return false; + + ret = !memchr_inv(buf, 0, len); + kfree(buf); + return ret; +} + +static inline bool ib_is_udata_cleared(struct ib_udata *udata, + size_t offset, + size_t len) +{ + return ib_is_buffer_cleared(udata->inbuf + offset, len); +} + +/** + * ib_modify_qp_is_ok - Check that the supplied attribute mask + * contains all required attributes and no attributes not allowed for + * the given QP state transition. + * @cur_state: Current QP state + * @next_state: Next QP state + * @type: QP type + * @mask: Mask of supplied QP attributes + * @ll : link layer of port + * + * This function is a helper function that a low-level driver's + * modify_qp method can use to validate the consumer's input. It + * checks that cur_state and next_state are valid QP states, that a + * transition from cur_state to next_state is allowed by the IB spec, + * and that the attribute mask supplied is allowed for the transition. + */ +bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask, + enum rdma_link_layer ll); + +void ib_register_event_handler(struct ib_event_handler *event_handler); +void ib_unregister_event_handler(struct ib_event_handler *event_handler); +void ib_dispatch_event(struct ib_event *event); + +int ib_query_port(struct ib_device *device, + u8 port_num, struct ib_port_attr *port_attr); + +enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, + u8 port_num); + +/** + * rdma_cap_ib_switch - Check if the device is IB switch + * @device: Device to check + * + * Device driver is responsible for setting is_switch bit on + * in ib_device structure at init time. + * + * Return: true if the device is IB switch. + */ +static inline bool rdma_cap_ib_switch(const struct ib_device *device) +{ + return device->is_switch; +} + +/** + * rdma_start_port - Return the first valid port number for the device + * specified + * + * @device: Device to be checked + * + * Return start port number + */ +static inline u8 rdma_start_port(const struct ib_device *device) +{ + return rdma_cap_ib_switch(device) ? 0 : 1; +} + +/** + * rdma_end_port - Return the last valid port number for the device + * specified + * + * @device: Device to be checked + * + * Return last port number + */ +static inline u8 rdma_end_port(const struct ib_device *device) +{ + return rdma_cap_ib_switch(device) ? 
0 : device->phys_port_cnt; +} + +static inline int rdma_is_port_valid(const struct ib_device *device, + unsigned int port) +{ + return (port >= rdma_start_port(device) && + port <= rdma_end_port(device)); +} + +static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB; +} + +static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & + (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); +} + +static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; +} + +static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; +} + +static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; +} + +static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) +{ + return rdma_protocol_ib(device, port_num) || + rdma_protocol_roce(device, port_num); +} + +static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_RAW_PACKET; +} + +static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_USNIC; +} + +/** + * rdma_cap_ib_mad - Check if the port of a device supports Infiniband + * Management Datagrams. + * @device: Device to check + * @port_num: Port number to check + * + * Management Datagrams (MAD) are a required part of the InfiniBand + * specification and are supported on all InfiniBand devices. A slightly + * extended version are also supported on OPA interfaces. + * + * Return: true if the port supports sending/receiving of MAD packets. + */ +static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD; +} + +/** + * rdma_cap_opa_mad - Check if the port of device provides support for OPA + * Management Datagrams. + * @device: Device to check + * @port_num: Port number to check + * + * Intel OmniPath devices extend and/or replace the InfiniBand Management + * datagrams with their own versions. These OPA MADs share many but not all of + * the characteristics of InfiniBand MADs. + * + * OPA MADs differ in the following ways: + * + * 1) MADs are variable size up to 2K + * IBTA defined MADs remain fixed at 256 bytes + * 2) OPA SMPs must carry valid PKeys + * 3) OPA SMP packets are a different format + * + * Return: true if the port supports OPA MAD packet formats. + */ +static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) +{ + return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD) + == RDMA_CORE_CAP_OPA_MAD; +} + +/** + * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband + * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI). + * @device: Device to check + * @port_num: Port number to check + * + * Each InfiniBand node is required to provide a Subnet Management Agent + * that the subnet manager can access. 
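/*
 * A small usage sketch: ports are numbered from rdma_start_port() to
 * rdma_end_port() inclusive (0-based on switches, 1-based on HCAs), which is
 * why consumers iterate as below rather than assuming "1..phys_port_cnt".
 */
static void example_scan_ports(struct ib_device *device)
{
	u8 port;

	for (port = rdma_start_port(device);
	     port <= rdma_end_port(device); port++) {
		if (!rdma_protocol_ib(device, port))
			continue;
		if (rdma_cap_ib_mad(device, port))
			pr_info("%s: port %u speaks IB and handles MADs\n",
				device->name, port);
	}
}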
Prior to the fabric being fully + * configured by the subnet manager, the SMA is accessed via a well known + * interface called the Subnet Management Interface (SMI). This interface + * uses directed route packets to communicate with the SM to get around the + * chicken and egg problem of the SM needing to know what's on the fabric + * in order to configure the fabric, and needing to configure the fabric in + * order to send packets to the devices on the fabric. These directed + * route packets do not need the fabric fully configured in order to reach + * their destination. The SMI is the only method allowed to send + * directed route packets on an InfiniBand fabric. + * + * Return: true if the port provides an SMI. + */ +static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI; +} + +/** + * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband + * Communication Manager. + * @device: Device to check + * @port_num: Port number to check + * + * The InfiniBand Communication Manager is one of many pre-defined General + * Service Agents (GSA) that are accessed via the General Service + * Interface (GSI). It's role is to facilitate establishment of connections + * between nodes as well as other management related tasks for established + * connections. + * + * Return: true if the port supports an IB CM (this does not guarantee that + * a CM is actually running however). + */ +static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM; +} + +/** + * rdma_cap_iw_cm - Check if the port of device has the capability IWARP + * Communication Manager. + * @device: Device to check + * @port_num: Port number to check + * + * Similar to above, but specific to iWARP connections which have a different + * managment protocol than InfiniBand. + * + * Return: true if the port supports an iWARP CM (this does not guarantee that + * a CM is actually running however). + */ +static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM; +} + +/** + * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband + * Subnet Administration. + * @device: Device to check + * @port_num: Port number to check + * + * An InfiniBand Subnet Administration (SA) service is a pre-defined General + * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand + * fabrics, devices should resolve routes to other hosts by contacting the + * SA to query the proper route. + * + * Return: true if the port should act as a client to the fabric Subnet + * Administration interface. This does not imply that the SA service is + * running locally. + */ +static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA; +} + +/** + * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband + * Multicast. + * @device: Device to check + * @port_num: Port number to check + * + * InfiniBand multicast registration is more complex than normal IPv4 or + * IPv6 multicast registration. Each Host Channel Adapter must register + * with the Subnet Manager when it wishes to join a multicast group. It + * should do so only once regardless of how many queue pairs it subscribes + * to this group. 
And it should leave the group only after all queue pairs + * attached to the group have been detached. + * + * Return: true if the port must undertake the additional adminstrative + * overhead of registering/unregistering with the SM and tracking of the + * total number of queue pairs attached to the multicast group. + */ +static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num) +{ + return rdma_cap_ib_sa(device, port_num); +} + +/** + * rdma_cap_af_ib - Check if the port of device has the capability + * Native Infiniband Address. + * @device: Device to check + * @port_num: Port number to check + * + * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default + * GID. RoCE uses a different mechanism, but still generates a GID via + * a prescribed mechanism and port specific data. + * + * Return: true if the port uses a GID address to identify devices on the + * network. + */ +static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB; +} + +/** + * rdma_cap_eth_ah - Check if the port of device has the capability + * Ethernet Address Handle. + * @device: Device to check + * @port_num: Port number to check + * + * RoCE is InfiniBand over Ethernet, and it uses a well defined technique + * to fabricate GIDs over Ethernet/IP specific addresses native to the + * port. Normally, packet headers are generated by the sending host + * adapter, but when sending connectionless datagrams, we must manually + * inject the proper headers for the fabric we are communicating over. + * + * Return: true if we are running as a RoCE port and must force the + * addition of a Global Route Header built from our Ethernet Address + * Handle into our header list for connectionless packets. + */ +static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH; +} + +/** + * rdma_cap_opa_ah - Check if the port of device supports + * OPA Address handles + * @device: Device to check + * @port_num: Port number to check + * + * Return: true if we are running on an OPA device which supports + * the extended OPA addressing. + */ +static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num) +{ + return (device->port_immutable[port_num].core_cap_flags & + RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH; +} + +/** + * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. + * + * @device: Device + * @port_num: Port number + * + * This MAD size includes the MAD headers and MAD payload. No other headers + * are included. + * + * Return the max MAD size required by the Port. Will return 0 if the port + * does not support MADs + */ +static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].max_mad_size; +} + +/** + * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table + * @device: Device to check + * @port_num: Port number to check + * + * RoCE GID table mechanism manages the various GIDs for a device. + * + * NOTE: if allocating the port's GID table has failed, this call will still + * return true, but any RoCE GID table API will fail. + * + * Return: true if the port uses RoCE GID table mechanism in order to manage + * its GIDs. 
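/*
 * A minimal sketch of sizing a MAD receive buffer with rdma_max_mad_size()
 * instead of hard-coding 256 bytes, which matters on OPA ports where MADs may
 * be up to 2K.
 */
static void *example_alloc_mad_buf(struct ib_device *device, u8 port_num)
{
	size_t mad_size = rdma_max_mad_size(device, port_num);

	if (!mad_size)		/* port does not support MADs */
		return NULL;

	return kzalloc(mad_size, GFP_KERNEL);
}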
+ */ +static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, + u8 port_num) +{ + return rdma_protocol_roce(device, port_num) && + device->add_gid && device->del_gid; +} + +/* + * Check if the device supports READ W/ INVALIDATE. + */ +static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num) +{ + /* + * iWarp drivers must support READ W/ INVALIDATE. No other protocol + * has support for it yet. + */ + return rdma_protocol_iwarp(dev, port_num); +} + +int ib_query_gid(struct ib_device *device, + u8 port_num, int index, union ib_gid *gid, + struct ib_gid_attr *attr); + +int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, + int state); +int ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info); +int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats); +int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, + int type); + +int ib_query_pkey(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey); + +int ib_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify); + +int ib_modify_port(struct ib_device *device, + u8 port_num, int port_modify_mask, + struct ib_port_modify *port_modify); + +int ib_find_gid(struct ib_device *device, union ib_gid *gid, + u8 *port_num, u16 *index); + +int ib_find_pkey(struct ib_device *device, + u8 port_num, u16 pkey, u16 *index); + +enum ib_pd_flags { + /* + * Create a memory registration for all memory in the system and place + * the rkey for it into pd->unsafe_global_rkey. This can be used by + * ULPs to avoid the overhead of dynamic MRs. + * + * This flag is generally considered unsafe and must only be used in + * extremly trusted environments. Every use of it will log a warning + * in the kernel log. + */ + IB_PD_UNSAFE_GLOBAL_RKEY = 0x01, +}; + +struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, + const char *caller); +#define ib_alloc_pd(device, flags) \ + __ib_alloc_pd((device), (flags), KBUILD_MODNAME) +void ib_dealloc_pd(struct ib_pd *pd); + +/** + * rdma_create_ah - Creates an address handle for the given address vector. + * @pd: The protection domain associated with the address handle. + * @ah_attr: The attributes of the address vector. + * + * The address handle is used to reference a local or global destination + * in all UD QP post sends. + */ +struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr); + +/** + * rdma_create_user_ah - Creates an address handle for the given address vector. + * It resolves destination mac address for ah attribute of RoCE type. + * @pd: The protection domain associated with the address handle. + * @ah_attr: The attributes of the address vector. + * @udata: pointer to user's input output buffer information need by + * provider driver. + * + * It returns 0 on success and returns appropriate error code on error. + * The address handle is used to reference a local or global destination + * in all UD QP post sends. + */ +struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata); +/** + * ib_get_gids_from_rdma_hdr - Get sgid and dgid from GRH or IPv4 header + * work completion. 
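/*
 * A sketch of the usual PD + address-handle lifecycle for a UD consumer.  The
 * rdma_ah_attr contents (DLID/GID, port, etc.) are defined earlier in this
 * header and are assumed to have been resolved already, e.g. from an SA path
 * record.
 */
static struct ib_ah *example_make_ah(struct ib_device *device,
				     struct rdma_ah_attr *ah_attr,
				     struct ib_pd **pd_out)
{
	struct ib_pd *pd;
	struct ib_ah *ah;

	pd = ib_alloc_pd(device, 0);
	if (IS_ERR(pd))
		return ERR_CAST(pd);

	ah = rdma_create_ah(pd, ah_attr);
	if (IS_ERR(ah)) {
		ib_dealloc_pd(pd);
		return ah;
	}

	*pd_out = pd;
	return ah;	/* released later with rdma_destroy_ah() + ib_dealloc_pd() */
}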
+ * @hdr: the L3 header to parse + * @net_type: type of header to parse + * @sgid: place to store source gid + * @dgid: place to store destination gid + */ +int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, + enum rdma_network_type net_type, + union ib_gid *sgid, union ib_gid *dgid); + +/** + * ib_get_rdma_header_version - Get the header version + * @hdr: the L3 header to parse + */ +int ib_get_rdma_header_version(const union rdma_network_hdr *hdr); + +/** + * ib_init_ah_attr_from_wc - Initializes address handle attributes from a + * work completion. + * @device: Device on which the received message arrived. + * @port_num: Port on which the received message arrived. + * @wc: Work completion associated with the received message. + * @grh: References the received global route header. This parameter is + * ignored unless the work completion indicates that the GRH is valid. + * @ah_attr: Returned attributes that can be used when creating an address + * handle for replying to the message. + */ +int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct rdma_ah_attr *ah_attr); + +/** + * ib_create_ah_from_wc - Creates an address handle associated with the + * sender of the specified work completion. + * @pd: The protection domain associated with the address handle. + * @wc: Work completion information associated with a received message. + * @grh: References the received global route header. This parameter is + * ignored unless the work completion indicates that the GRH is valid. + * @port_num: The outbound port number to associate with the address. + * + * The address handle is used to reference a local or global destination + * in all UD QP post sends. + */ +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, + const struct ib_grh *grh, u8 port_num); + +/** + * rdma_modify_ah - Modifies the address vector associated with an address + * handle. + * @ah: The address handle to modify. + * @ah_attr: The new address vector attributes to associate with the + * address handle. + */ +int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); + +/** + * rdma_query_ah - Queries the address vector associated with an address + * handle. + * @ah: The address handle to query. + * @ah_attr: The address vector attributes associated with the address + * handle. + */ +int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); + +/** + * rdma_destroy_ah - Destroys an address handle. + * @ah: The address handle to destroy. + */ +int rdma_destroy_ah(struct ib_ah *ah); + +/** + * ib_create_srq - Creates a SRQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the SRQ. + * @srq_init_attr: A list of initial attributes required to create the + * SRQ. If SRQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created SRQ. + * + * srq_attr->max_wr and srq_attr->max_sge are read the determine the + * requested size of the SRQ, and set to the actual values allocated + * on return. If ib_create_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values. + */ +struct ib_srq *ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr); + +/** + * ib_modify_srq - Modifies the attributes for the specified SRQ. + * @srq: The SRQ to modify. + * @srq_attr: On input, specifies the SRQ attributes to modify. 
On output, + * the current values of selected SRQ attributes are returned. + * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ + * are being modified. + * + * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or + * IB_SRQ_LIMIT to set the SRQ's limit and request notification when + * the number of receives queued drops below the limit. + */ +int ib_modify_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask); + +/** + * ib_query_srq - Returns the attribute list and current values for the + * specified SRQ. + * @srq: The SRQ to query. + * @srq_attr: The attributes of the specified SRQ. + */ +int ib_query_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr); + +/** + * ib_destroy_srq - Destroys the specified SRQ. + * @srq: The SRQ to destroy. + */ +int ib_destroy_srq(struct ib_srq *srq); + +/** + * ib_post_srq_recv - Posts a list of work requests to the specified SRQ. + * @srq: The SRQ to post the work request on. + * @recv_wr: A list of work requests to post on the receive queue. + * @bad_recv_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the QP. + */ +static inline int ib_post_srq_recv(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr); +} + +/** + * ib_create_qp - Creates a QP associated with the specified protection + * domain. + * @pd: The protection domain associated with the QP. + * @qp_init_attr: A list of initial attributes required to create the + * QP. If QP creation succeeds, then the attributes are updated to + * the actual capabilities of the created QP. + */ +struct ib_qp *ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr); + +/** + * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. + * @qp: The QP to modify. + * @attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + * @udata: pointer to user's input output buffer information + * are being modified. + * It returns 0 on success and returns appropriate error code on error. + */ +int ib_modify_qp_with_udata(struct ib_qp *qp, + struct ib_qp_attr *attr, + int attr_mask, + struct ib_udata *udata); + +/** + * ib_modify_qp - Modifies the attributes for the specified QP and then + * transitions the QP to the given state. + * @qp: The QP to modify. + * @qp_attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @qp_attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + */ +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask); + +/** + * ib_query_qp - Returns the attribute list and current values for the + * specified QP. + * @qp: The QP to query. + * @qp_attr: The attributes of the specified QP. + * @qp_attr_mask: A bit-mask used to select specific attributes to query. + * @qp_init_attr: Additional attributes of the selected QP. + * + * The qp_attr_mask may be used to limit the query to gathering only the + * selected attributes. + */ +int ib_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); + +/** + * ib_destroy_qp - Destroys the specified QP. + * @qp: The QP to destroy. 
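/*
 * A sketch of creating an RC QP with ib_create_qp().  struct ib_qp_init_attr,
 * IB_QPT_RC and the signalling types are defined earlier in this header; on
 * success the cap fields are updated to the sizes actually allocated, so
 * callers re-read them rather than trusting the requested values.
 */
static struct ib_qp *example_create_rc_qp(struct ib_pd *pd, struct ib_cq *cq)
{
	struct ib_qp_init_attr init_attr = {
		.send_cq     = cq,
		.recv_cq     = cq,
		.qp_type     = IB_QPT_RC,
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.cap = {
			.max_send_wr  = 64,
			.max_recv_wr  = 64,
			.max_send_sge = 1,
			.max_recv_sge = 1,
		},
	};

	return ib_create_qp(pd, &init_attr);
}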
+ */ +int ib_destroy_qp(struct ib_qp *qp); + +/** + * ib_open_qp - Obtain a reference to an existing sharable QP. + * @xrcd - XRC domain + * @qp_open_attr: Attributes identifying the QP to open. + * + * Returns a reference to a sharable QP. + */ +struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, + struct ib_qp_open_attr *qp_open_attr); + +/** + * ib_close_qp - Release an external reference to a QP. + * @qp: The QP handle to release + * + * The opened QP handle is released by the caller. The underlying + * shared QP is not destroyed until all internal references are released. + */ +int ib_close_qp(struct ib_qp *qp); + +/** + * ib_post_send - Posts a list of work requests to the send queue of + * the specified QP. + * @qp: The QP to post the work request on. + * @send_wr: A list of work requests to post on the send queue. + * @bad_send_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the QP. + * + * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate + * error is returned, the QP state shall not be affected, + * ib_post_send() will return an immediate error after queueing any + * earlier work requests in the list. + */ +static inline int ib_post_send(struct ib_qp *qp, + struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr) +{ + return qp->device->post_send(qp, send_wr, bad_send_wr); +} + +/** + * ib_post_recv - Posts a list of work requests to the receive queue of + * the specified QP. + * @qp: The QP to post the work request on. + * @recv_wr: A list of work requests to post on the receive queue. + * @bad_recv_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the QP. + */ +static inline int ib_post_recv(struct ib_qp *qp, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + return qp->device->post_recv(qp, recv_wr, bad_recv_wr); +} + +struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, + enum ib_poll_context poll_ctx, const char *caller); +#define ib_alloc_cq(device, priv, nr_cqe, comp_vect, poll_ctx) \ + __ib_alloc_cq((device), (priv), (nr_cqe), (comp_vect), (poll_ctx), KBUILD_MODNAME) + +void ib_free_cq(struct ib_cq *cq); +int ib_process_cq_direct(struct ib_cq *cq, int budget); + +/** + * ib_create_cq - Creates a CQ on the specified device. + * @device: The device on which to create the CQ. + * @comp_handler: A user-specified callback that is invoked when a + * completion event occurs on the CQ. + * @event_handler: A user-specified callback that is invoked when an + * asynchronous event not associated with a completion occurs on the CQ. + * @cq_context: Context associated with the CQ returned to the user via + * the associated completion and event handlers. + * @cq_attr: The attributes the CQ should be created upon. + * + * Users can examine the cq structure to determine the actual CQ size. + */ +struct ib_cq *ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, + const struct ib_cq_init_attr *cq_attr); + +/** + * ib_resize_cq - Modifies the capacity of the CQ. + * @cq: The CQ to resize. + * @cqe: The minimum size of the CQ. + * + * Users can examine the cq structure to determine the actual CQ size. + */ +int ib_resize_cq(struct ib_cq *cq, int cqe); + +/** + * rdma_set_cq_moderation - Modifies moderation params of the CQ + * @cq: The CQ to modify. 
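/*
 * A sketch of the ib_alloc_cq() completion pattern: the consumer embeds a
 * struct ib_cqe in its own request structure and sets wr_cqe in the work
 * request; the core then invokes the ->done callback for each completion.
 * struct ib_wc and IB_WC_SUCCESS are defined earlier in this header, and
 * "example_req" is a hypothetical consumer structure.
 */
struct example_req {
	struct ib_cqe cqe;
	/* consumer-specific state ... */
};

static void example_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct example_req *req = container_of(wc->wr_cqe,
					       struct example_req, cqe);

	if (wc->status != IB_WC_SUCCESS)
		pr_err("request %p failed with status %d\n", req, wc->status);
	/* complete the request here */
}

static struct ib_cq *example_setup_cq(struct ib_device *device)
{
	/* 128 CQEs on completion vector 0, polled from softirq context */
	return ib_alloc_cq(device, NULL, 128, 0, IB_POLL_SOFTIRQ);
}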
+ * @cq_count: number of CQEs that will trigger an event + * @cq_period: max period of time in usec before triggering an event + * + */ +int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period); + +/** + * ib_destroy_cq - Destroys the specified CQ. + * @cq: The CQ to destroy. + */ +int ib_destroy_cq(struct ib_cq *cq); + +/** + * ib_poll_cq - poll a CQ for completion(s) + * @cq:the CQ being polled + * @num_entries:maximum number of completions to return + * @wc:array of at least @num_entries &struct ib_wc where completions + * will be returned + * + * Poll a CQ for (possibly multiple) completions. If the return value + * is < 0, an error occurred. If the return value is >= 0, it is the + * number of completions returned. If the return value is + * non-negative and < num_entries, then the CQ was emptied. + */ +static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, + struct ib_wc *wc) +{ + return cq->device->poll_cq(cq, num_entries, wc); +} + +/** + * ib_req_notify_cq - Request completion notification on a CQ. + * @cq: The CQ to generate an event for. + * @flags: + * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP + * to request an event on the next solicited event or next work + * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS + * may also be |ed in to request a hint about missed events, as + * described below. + * + * Return Value: + * < 0 means an error occurred while requesting notification + * == 0 means notification was requested successfully, and if + * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events + * were missed and it is safe to wait for another event. In + * this case is it guaranteed that any work completions added + * to the CQ since the last CQ poll will trigger a completion + * notification event. + * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed + * in. It means that the consumer must poll the CQ again to + * make sure it is empty to avoid missing an event because of a + * race between requesting notification and an entry being + * added to the CQ. This return value means it is possible + * (but not guaranteed) that a work completion has been added + * to the CQ since the last poll without triggering a + * completion notification event. + */ +static inline int ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags) +{ + return cq->device->req_notify_cq(cq, flags); +} + +/** + * ib_req_ncomp_notif - Request completion notification when there are + * at least the specified number of unreaped completions on the CQ. + * @cq: The CQ to generate an event for. + * @wc_cnt: The number of unreaped completions that should be on the + * CQ before an event is generated. + */ +static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt) +{ + return cq->device->req_ncomp_notif ? 
+ cq->device->req_ncomp_notif(cq, wc_cnt) : + -ENOSYS; +} + +/** + * ib_dma_mapping_error - check a DMA addr for error + * @dev: The device for which the dma_addr was created + * @dma_addr: The DMA address to check + */ +static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_mapping_error(dev->dma_device, dma_addr); +} + +/** + * ib_dma_map_single - Map a kernel virtual address to DMA address + * @dev: The device for which the dma_addr is to be created + * @cpu_addr: The kernel virtual address + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline u64 ib_dma_map_single(struct ib_device *dev, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + return dma_map_single(dev->dma_device, cpu_addr, size, direction); +} + +/** + * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single() + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline void ib_dma_unmap_single(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + dma_unmap_single(dev->dma_device, addr, size, direction); +} + +/** + * ib_dma_map_page - Map a physical page to DMA address + * @dev: The device for which the dma_addr is to be created + * @page: The page to be mapped + * @offset: The offset within the page + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline u64 ib_dma_map_page(struct ib_device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction direction) +{ + return dma_map_page(dev->dma_device, page, offset, size, direction); +} + +/** + * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline void ib_dma_unmap_page(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + dma_unmap_page(dev->dma_device, addr, size, direction); +} + +/** + * ib_dma_map_sg - Map a scatter/gather list to DMA addresses + * @dev: The device for which the DMA addresses are to be created + * @sg: The array of scatter/gather entries + * @nents: The number of scatter/gather entries + * @direction: The direction of the DMA + */ +static inline int ib_dma_map_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + return dma_map_sg(dev->dma_device, sg, nents, direction); +} + +/** + * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses + * @dev: The device for which the DMA addresses were created + * @sg: The array of scatter/gather entries + * @nents: The number of scatter/gather entries + * @direction: The direction of the DMA + */ +static inline void ib_dma_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + dma_unmap_sg(dev->dma_device, sg, nents, direction); +} + +static inline int ib_dma_map_sg_attrs(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction, + unsigned long dma_attrs) +{ + return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, + dma_attrs); +} + +static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum 
dma_data_direction direction, + unsigned long dma_attrs) +{ + dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); +} +/** + * ib_sg_dma_address - Return the DMA address from a scatter/gather entry + * @dev: The device for which the DMA addresses were created + * @sg: The scatter/gather entry + * + * Note: this function is obsolete. To do: change all occurrences of + * ib_sg_dma_address() into sg_dma_address(). + */ +static inline u64 ib_sg_dma_address(struct ib_device *dev, + struct scatterlist *sg) +{ + return sg_dma_address(sg); +} + +/** + * ib_sg_dma_len - Return the DMA length from a scatter/gather entry + * @dev: The device for which the DMA addresses were created + * @sg: The scatter/gather entry + * + * Note: this function is obsolete. To do: change all occurrences of + * ib_sg_dma_len() into sg_dma_len(). + */ +static inline unsigned int ib_sg_dma_len(struct ib_device *dev, + struct scatterlist *sg) +{ + return sg_dma_len(sg); +} + +/** + * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @dir: The direction of the DMA + */ +static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ + dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); +} + +/** + * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @dir: The direction of the DMA + */ +static inline void ib_dma_sync_single_for_device(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ + dma_sync_single_for_device(dev->dma_device, addr, size, dir); +} + +/** + * ib_dma_alloc_coherent - Allocate memory and map it for DMA + * @dev: The device for which the DMA address is requested + * @size: The size of the region to allocate in bytes + * @dma_handle: A pointer for returning the DMA address of the region + * @flag: memory allocator flags + */ +static inline void *ib_dma_alloc_coherent(struct ib_device *dev, + size_t size, + dma_addr_t *dma_handle, + gfp_t flag) +{ + return dma_alloc_coherent(dev->dma_device, size, dma_handle, flag); +} + +/** + * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent() + * @dev: The device for which the DMA addresses were allocated + * @size: The size of the region + * @cpu_addr: the address returned by ib_dma_alloc_coherent() + * @dma_handle: the DMA address returned by ib_dma_alloc_coherent() + */ +static inline void ib_dma_free_coherent(struct ib_device *dev, + size_t size, void *cpu_addr, + dma_addr_t dma_handle) +{ + dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle); +} + +/** + * ib_dereg_mr - Deregisters a memory region and removes it from the + * HCA translation table. + * @mr: The memory region to deregister. + * + * This function can fail, if the memory region has memory windows bound to it. + */ +int ib_dereg_mr(struct ib_mr *mr); + +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); + +/** + * ib_update_fast_reg_key - updates the key portion of the fast_reg MR + * R_Key and L_Key. + * @mr - struct ib_mr pointer to be updated. + * @newkey - new key to be used. 
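The ib_dma_* wrappers above simply forward to the generic DMA API against the device's dma_device. A small sketch of the usual map/sync/unmap cycle for a receive buffer; "ibdev", "buf" and "len" are assumed to come from the caller.

/* Minimal sketch: map a kernel buffer for device writes, hand it to the
 * HCA (elided), then sync and unmap once the completion arrives.
 */
u64 dma_addr;

dma_addr = ib_dma_map_single(ibdev, buf, len, DMA_FROM_DEVICE);
if (ib_dma_mapping_error(ibdev, dma_addr))
        return -ENOMEM;

/* ... build an ib_sge from dma_addr and post a receive WR ... */

ib_dma_sync_single_for_cpu(ibdev, dma_addr, len, DMA_FROM_DEVICE);
/* ... the CPU may now read the received payload ... */
ib_dma_unmap_single(ibdev, dma_addr, len, DMA_FROM_DEVICE);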
+ */ +static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) +{ + mr->lkey = (mr->lkey & 0xffffff00) | newkey; + mr->rkey = (mr->rkey & 0xffffff00) | newkey; +} + +/** + * ib_inc_rkey - increments the key portion of the given rkey. Can be used + * for calculating a new rkey for type 2 memory windows. + * @rkey - the rkey to increment. + */ +static inline u32 ib_inc_rkey(u32 rkey) +{ + const u32 mask = 0x000000ff; + return ((rkey + 1) & mask) | (rkey & ~mask); +} + +/** + * ib_alloc_fmr - Allocates a unmapped fast memory region. + * @pd: The protection domain associated with the unmapped region. + * @mr_access_flags: Specifies the memory access rights. + * @fmr_attr: Attributes of the unmapped region. + * + * A fast memory region must be mapped before it can be used as part of + * a work request. + */ +struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +/** + * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region. + * @fmr: The fast memory region to associate with the pages. + * @page_list: An array of physical pages to map to the fast memory region. + * @list_len: The number of pages in page_list. + * @iova: The I/O virtual address to use with the mapped region. + */ +static inline int ib_map_phys_fmr(struct ib_fmr *fmr, + u64 *page_list, int list_len, + u64 iova) +{ + return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova); +} + +/** + * ib_unmap_fmr - Removes the mapping from a list of fast memory regions. + * @fmr_list: A linked list of fast memory regions to unmap. + */ +int ib_unmap_fmr(struct list_head *fmr_list); + +/** + * ib_dealloc_fmr - Deallocates a fast memory region. + * @fmr: The fast memory region to deallocate. + */ +int ib_dealloc_fmr(struct ib_fmr *fmr); + +/** + * ib_attach_mcast - Attaches the specified QP to a multicast group. + * @qp: QP to attach to the multicast group. The QP must be type + * IB_QPT_UD. + * @gid: Multicast group GID. + * @lid: Multicast group LID in host byte order. + * + * In order to send and receive multicast packets, subnet + * administration must have created the multicast group and configured + * the fabric appropriately. The port associated with the specified + * QP must also be a member of the multicast group. + */ +int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +/** + * ib_detach_mcast - Detaches the specified QP from a multicast group. + * @qp: QP to detach from the multicast group. + * @gid: Multicast group GID. + * @lid: Multicast group LID in host byte order. + */ +int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +/** + * ib_alloc_xrcd - Allocates an XRC domain. + * @device: The device on which to allocate the XRC domain. + * @caller: Module name for kernel consumers + */ +struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller); +#define ib_alloc_xrcd(device) \ + __ib_alloc_xrcd((device), KBUILD_MODNAME) + +/** + * ib_dealloc_xrcd - Deallocates an XRC domain. + * @xrcd: The XRC domain to deallocate. + */ +int ib_dealloc_xrcd(struct ib_xrcd *xrcd); + +struct ib_flow *ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, int domain); +int ib_destroy_flow(struct ib_flow *flow_id); + +static inline int ib_check_mr_access(int flags) +{ + /* + * Local write permission is required if remote write or + * remote atomic permission is also requested. 
+ */ + if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && + !(flags & IB_ACCESS_LOCAL_WRITE)) + return -EINVAL; + + return 0; +} + +/** + * ib_check_mr_status: lightweight check of MR status. + * This routine may provide status checks on a selected + * ib_mr. first use is for signature status check. + * + * @mr: A memory region. + * @check_mask: Bitmask of which checks to perform from + * ib_mr_status_check enumeration. + * @mr_status: The container of relevant status checks. + * failed checks will be indicated in the status bitmask + * and the relevant info shall be in the error item. + */ +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status); + +struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, + u16 pkey, const union ib_gid *gid, + const struct sockaddr *addr); +struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr); +int ib_destroy_wq(struct ib_wq *wq); +int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr, + u32 wq_attr_mask); +struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr* + wq_ind_table_init_attr); +int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); + +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size); + +static inline int +ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) +{ + int n; + + n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size); + mr->iova = 0; + + return n; +} + +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64)); + +void ib_drain_rq(struct ib_qp *qp); +void ib_drain_sq(struct ib_qp *qp); +void ib_drain_qp(struct ib_qp *qp); + +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width); + +static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_ROCE) + return attr->roce.dmac; + return NULL; +} + +static inline void rdma_ah_set_dlid(struct rdma_ah_attr *attr, u32 dlid) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_IB) + attr->ib.dlid = (u16)dlid; + else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + attr->opa.dlid = dlid; +} + +static inline u32 rdma_ah_get_dlid(const struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_IB) + return attr->ib.dlid; + else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + return attr->opa.dlid; + return 0; +} + +static inline void rdma_ah_set_sl(struct rdma_ah_attr *attr, u8 sl) +{ + attr->sl = sl; +} + +static inline u8 rdma_ah_get_sl(const struct rdma_ah_attr *attr) +{ + return attr->sl; +} + +static inline void rdma_ah_set_path_bits(struct rdma_ah_attr *attr, + u8 src_path_bits) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_IB) + attr->ib.src_path_bits = src_path_bits; + else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + attr->opa.src_path_bits = src_path_bits; +} + +static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_IB) + return attr->ib.src_path_bits; + else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + return attr->opa.src_path_bits; + return 0; +} + +static inline void rdma_ah_set_make_grd(struct rdma_ah_attr *attr, + bool make_grd) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + attr->opa.make_grd = make_grd; +} + +static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr 
*attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + return attr->opa.make_grd; + return false; +} + +static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u8 port_num) +{ + attr->port_num = port_num; +} + +static inline u8 rdma_ah_get_port_num(const struct rdma_ah_attr *attr) +{ + return attr->port_num; +} + +static inline void rdma_ah_set_static_rate(struct rdma_ah_attr *attr, + u8 static_rate) +{ + attr->static_rate = static_rate; +} + +static inline u8 rdma_ah_get_static_rate(const struct rdma_ah_attr *attr) +{ + return attr->static_rate; +} + +static inline void rdma_ah_set_ah_flags(struct rdma_ah_attr *attr, + enum ib_ah_flags flag) +{ + attr->ah_flags = flag; +} + +static inline enum ib_ah_flags + rdma_ah_get_ah_flags(const struct rdma_ah_attr *attr) +{ + return attr->ah_flags; +} + +static inline const struct ib_global_route + *rdma_ah_read_grh(const struct rdma_ah_attr *attr) +{ + return &attr->grh; +} + +/*To retrieve and modify the grh */ +static inline struct ib_global_route + *rdma_ah_retrieve_grh(struct rdma_ah_attr *attr) +{ + return &attr->grh; +} + +static inline void rdma_ah_set_dgid_raw(struct rdma_ah_attr *attr, void *dgid) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + memcpy(grh->dgid.raw, dgid, sizeof(grh->dgid)); +} + +static inline void rdma_ah_set_subnet_prefix(struct rdma_ah_attr *attr, + __be64 prefix) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + grh->dgid.global.subnet_prefix = prefix; +} + +static inline void rdma_ah_set_interface_id(struct rdma_ah_attr *attr, + __be64 if_id) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + grh->dgid.global.interface_id = if_id; +} + +static inline void rdma_ah_set_grh(struct rdma_ah_attr *attr, + union ib_gid *dgid, u32 flow_label, + u8 sgid_index, u8 hop_limit, + u8 traffic_class) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + attr->ah_flags = IB_AH_GRH; + if (dgid) + grh->dgid = *dgid; + grh->flow_label = flow_label; + grh->sgid_index = sgid_index; + grh->hop_limit = hop_limit; + grh->traffic_class = traffic_class; +} + +/** + * rdma_ah_find_type - Return address handle type. + * + * @dev: Device to be checked + * @port_num: Port number + */ +static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, + u8 port_num) +{ + if (rdma_protocol_roce(dev, port_num)) + return RDMA_AH_ATTR_TYPE_ROCE; + if (rdma_protocol_ib(dev, port_num)) { + if (rdma_cap_opa_ah(dev, port_num)) + return RDMA_AH_ATTR_TYPE_OPA; + return RDMA_AH_ATTR_TYPE_IB; + } + + return RDMA_AH_ATTR_TYPE_UNDEFINED; +} + +/** + * ib_lid_cpu16 - Return lid in 16bit CPU encoding. + * In the current implementation the only way to get + * get the 32bit lid is from other sources for OPA. + * For IB, lids will always be 16bits so cast the + * value accordingly. + * + * @lid: A 32bit LID + */ +static inline u16 ib_lid_cpu16(u32 lid) +{ + WARN_ON_ONCE(lid & 0xFFFF0000); + return (u16)lid; +} + +/** + * ib_lid_be16 - Return lid in 16bit BE encoding. + * + * @lid: A 32bit LID + */ +static inline __be16 ib_lid_be16(u32 lid) +{ + WARN_ON_ONCE(lid & 0xFFFF0000); + return cpu_to_be16((u16)lid); +} + +/** + * ib_get_vector_affinity - Get the affinity mappings of a given completion + * vector + * @device: the rdma device + * @comp_vector: index of completion vector + * + * Returns NULL on failure, otherwise a corresponding cpu map of the + * completion vector (returns all-cpus map if the device driver doesn't + * implement get_vector_affinity). 
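The rdma_ah_* accessors above hide the IB/RoCE/OPA layout differences inside struct rdma_ah_attr. A sketch of filling one in before creating an address handle; "ibdev", "pd", "port", "dlid" and "dgid" are assumed to come from a resolved path, and rdma_create_ah() is declared earlier in this header.

/* Minimal sketch: build an rdma_ah_attr with the accessors and turn it
 * into an address handle.  All path parameters are placeholders.
 */
struct rdma_ah_attr ah_attr;
struct ib_ah *ah;

memset(&ah_attr, 0, sizeof(ah_attr));
ah_attr.type = rdma_ah_find_type(ibdev, port);
rdma_ah_set_port_num(&ah_attr, port);
rdma_ah_set_sl(&ah_attr, 0);
rdma_ah_set_dlid(&ah_attr, dlid);
rdma_ah_set_static_rate(&ah_attr, IB_RATE_PORT_CURRENT);
rdma_ah_set_grh(&ah_attr, &dgid, /* flow_label */ 0,
                /* sgid_index */ 0, /* hop_limit */ 64,
                /* traffic_class */ 0);

ah = rdma_create_ah(pd, &ah_attr);
if (IS_ERR(ah))
        return PTR_ERR(ah);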
+ */ +static inline const struct cpumask * +ib_get_vector_affinity(struct ib_device *device, int comp_vector) +{ + if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || + !device->get_vector_affinity) + return NULL; + + return device->get_vector_affinity(device, comp_vector); + +} + +/** + * rdma_roce_rescan_device - Rescan all of the network devices in the system + * and add their gids, as needed, to the relevant RoCE devices. + * + * @device: the rdma device + */ +void rdma_roce_rescan_device(struct ib_device *ibdev); + +#endif /* IB_VERBS_H */ diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h new file mode 100644 index 0000000..5cd7701 --- /dev/null +++ b/include/rdma/iw_cm.h @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef IW_CM_H +#define IW_CM_H + +#include +#include + +struct iw_cm_id; + +enum iw_cm_event_type { + IW_CM_EVENT_CONNECT_REQUEST = 1, /* connect request received */ + IW_CM_EVENT_CONNECT_REPLY, /* reply from active connect request */ + IW_CM_EVENT_ESTABLISHED, /* passive side accept successful */ + IW_CM_EVENT_DISCONNECT, /* orderly shutdown */ + IW_CM_EVENT_CLOSE /* close complete */ +}; + +struct iw_cm_event { + enum iw_cm_event_type event; + int status; + struct sockaddr_storage local_addr; + struct sockaddr_storage remote_addr; + void *private_data; + void *provider_data; + u8 private_data_len; + u8 ord; + u8 ird; +}; + +/** + * iw_cm_handler - Function to be called by the IW CM when delivering events + * to the client. + * + * @cm_id: The IW CM identifier associated with the event. + * @event: Pointer to the event structure. + */ +typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id, + struct iw_cm_event *event); + +/** + * iw_event_handler - Function called by the provider when delivering provider + * events to the IW CM. Returns either 0 indicating the event was processed + * or -errno if the event could not be processed. + * + * @cm_id: The IW CM identifier associated with the event. + * @event: Pointer to the event structure. 
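The iw_cm_handler typedef above is the client-facing callback, while the iw_event_handler just below is the provider-facing one. A hedged skeleton of a client handler; handle_established() and handle_teardown() are hypothetical ULP helpers, not part of this patch.

/* Minimal sketch of a client-side iw_cm_handler.  A listening endpoint
 * would additionally handle IW_CM_EVENT_CONNECT_REQUEST.
 */
static int my_cm_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event)
{
        switch (event->event) {
        case IW_CM_EVENT_CONNECT_REPLY:
                if (event->status)
                        return handle_teardown(cm_id, event->status);
                return handle_established(cm_id, event);
        case IW_CM_EVENT_ESTABLISHED:
                return handle_established(cm_id, event);
        case IW_CM_EVENT_DISCONNECT:
        case IW_CM_EVENT_CLOSE:
                return handle_teardown(cm_id, 0);
        default:
                return -EINVAL;
        }
}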
+ */ +typedef int (*iw_event_handler)(struct iw_cm_id *cm_id, + struct iw_cm_event *event); + +struct iw_cm_id { + iw_cm_handler cm_handler; /* client callback function */ + void *context; /* client cb context */ + struct ib_device *device; + struct sockaddr_storage local_addr; /* local addr */ + struct sockaddr_storage remote_addr; + struct sockaddr_storage m_local_addr; /* nmapped local addr */ + struct sockaddr_storage m_remote_addr; /* nmapped rem addr */ + void *provider_data; /* provider private data */ + iw_event_handler event_handler; /* cb for provider + events */ + /* Used by provider to add and remove refs on IW cm_id */ + void (*add_ref)(struct iw_cm_id *); + void (*rem_ref)(struct iw_cm_id *); + u8 tos; + bool mapped; +}; + +struct iw_cm_conn_param { + const void *private_data; + u16 private_data_len; + u32 ord; + u32 ird; + u32 qpn; +}; + +struct iw_cm_verbs { + void (*add_ref)(struct ib_qp *qp); + + void (*rem_ref)(struct ib_qp *qp); + + struct ib_qp * (*get_qp)(struct ib_device *device, + int qpn); + + int (*connect)(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *conn_param); + + int (*accept)(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *conn_param); + + int (*reject)(struct iw_cm_id *cm_id, + const void *pdata, u8 pdata_len); + + int (*create_listen)(struct iw_cm_id *cm_id, + int backlog); + + int (*destroy_listen)(struct iw_cm_id *cm_id); + char ifname[IFNAMSIZ]; +}; + +/** + * iw_create_cm_id - Create an IW CM identifier. + * + * @device: The IB device on which to create the IW CM identier. + * @event_handler: User callback invoked to report events associated with the + * returned IW CM identifier. + * @context: User specified context associated with the id. + */ +struct iw_cm_id *iw_create_cm_id(struct ib_device *device, + iw_cm_handler cm_handler, void *context); + +/** + * iw_destroy_cm_id - Destroy an IW CM identifier. + * + * @cm_id: The previously created IW CM identifier to destroy. + * + * The client can assume that no events will be delivered for the CM ID after + * this function returns. + */ +void iw_destroy_cm_id(struct iw_cm_id *cm_id); + +/** + * iw_cm_bind_qp - Unbind the specified IW CM identifier and QP + * + * @cm_id: The IW CM idenfier to unbind from the QP. + * @qp: The QP + * + * This is called by the provider when destroying the QP to ensure + * that any references held by the IWCM are released. It may also + * be called by the IWCM when destroying a CM_ID to that any + * references held by the provider are released. + */ +void iw_cm_unbind_qp(struct iw_cm_id *cm_id, struct ib_qp *qp); + +/** + * iw_cm_get_qp - Return the ib_qp associated with a QPN + * + * @ib_device: The IB device + * @qpn: The queue pair number + */ +struct ib_qp *iw_cm_get_qp(struct ib_device *device, int qpn); + +/** + * iw_cm_listen - Listen for incoming connection requests on the + * specified IW CM id. + * + * @cm_id: The IW CM identifier. + * @backlog: The maximum number of outstanding un-accepted inbound listen + * requests to queue. + * + * The source address and port number are specified in the IW CM identifier + * structure. + */ +int iw_cm_listen(struct iw_cm_id *cm_id, int backlog); + +/** + * iw_cm_accept - Called to accept an incoming connect request. + * + * @cm_id: The IW CM identifier associated with the connection request. + * @iw_param: Pointer to a structure containing connection establishment + * parameters. + * + * The specified cm_id will have been provided in the event data for a + * CONNECT_REQUEST event. 
Subsequent events related to this connection will be + * delivered to the specified IW CM identifier prior and may occur prior to + * the return of this function. If this function returns a non-zero value, the + * client can assume that no events will be delivered to the specified IW CM + * identifier. + */ +int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); + +/** + * iw_cm_reject - Reject an incoming connection request. + * + * @cm_id: Connection identifier associated with the request. + * @private_daa: Pointer to data to deliver to the remote peer as part of the + * reject message. + * @private_data_len: The number of bytes in the private_data parameter. + * + * The client can assume that no events will be delivered to the specified IW + * CM identifier following the return of this function. The private_data + * buffer is available for reuse when this function returns. + */ +int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data, + u8 private_data_len); + +/** + * iw_cm_connect - Called to request a connection to a remote peer. + * + * @cm_id: The IW CM identifier for the connection. + * @iw_param: Pointer to a structure containing connection establishment + * parameters. + * + * Events may be delivered to the specified IW CM identifier prior to the + * return of this function. If this function returns a non-zero value, the + * client can assume that no events will be delivered to the specified IW CM + * identifier. + */ +int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); + +/** + * iw_cm_disconnect - Close the specified connection. + * + * @cm_id: The IW CM identifier to close. + * @abrupt: If 0, the connection will be closed gracefully, otherwise, the + * connection will be reset. + * + * The IW CM identifier is still active until the IW_CM_EVENT_CLOSE event is + * delivered. + */ +int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt); + +/** + * iw_cm_init_qp_attr - Called to initialize the attributes of the QP + * associated with a IW CM identifier. + * + * @cm_id: The IW CM identifier associated with the QP + * @qp_attr: Pointer to the QP attributes structure. + * @qp_attr_mask: Pointer to a bit vector specifying which QP attributes are + * valid. + */ +int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask); + +/** + * iwcm_reject_msg - return a pointer to a reject message string. + * @reason: Value returned in the REJECT event status field. + */ +const char *__attribute_const__ iwcm_reject_msg(int reason); + +#endif /* IW_CM_H */ diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h new file mode 100644 index 0000000..fda3167 --- /dev/null +++ b/include/rdma/iw_portmap.h @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
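iw_cm_accept() and iw_cm_reject() above are normally invoked from the listen endpoint's handler once a CONNECT_REQUEST event arrives. A sketch of that path; "my_qp", connection_allowed() and the ORD/IRD limit of 16 are placeholders assumed from the caller's context.

/* Minimal sketch: accept or reject an incoming iWARP connect request
 * from within the listening cm_id's event handler.
 */
static int my_listen_handler(struct iw_cm_id *cm_id,
                             struct iw_cm_event *event)
{
        struct iw_cm_conn_param param;

        if (event->event != IW_CM_EVENT_CONNECT_REQUEST)
                return 0;

        if (!connection_allowed(event))         /* hypothetical policy check */
                return iw_cm_reject(cm_id, NULL, 0);

        memset(&param, 0, sizeof(param));
        param.ord = min_t(u32, event->ord, 16);
        param.ird = min_t(u32, event->ird, 16);
        param.qpn = my_qp->qp_num;              /* QP set up beforehand */

        return iw_cm_accept(cm_id, &param);
}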
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IW_PORTMAP_H +#define _IW_PORTMAP_H + +#define IWPM_ULIBNAME_SIZE 32 +#define IWPM_DEVNAME_SIZE 32 +#define IWPM_IFNAME_SIZE 16 +#define IWPM_IPADDR_SIZE 16 + +enum { + IWPM_INVALID_NLMSG_ERR = 10, + IWPM_CREATE_MAPPING_ERR, + IWPM_DUPLICATE_MAPPING_ERR, + IWPM_UNKNOWN_MAPPING_ERR, + IWPM_CLIENT_DEV_INFO_ERR, + IWPM_USER_LIB_INFO_ERR, + IWPM_REMOTE_QUERY_REJECT +}; + +struct iwpm_dev_data { + char dev_name[IWPM_DEVNAME_SIZE]; + char if_name[IWPM_IFNAME_SIZE]; +}; + +struct iwpm_sa_data { + struct sockaddr_storage loc_addr; + struct sockaddr_storage mapped_loc_addr; + struct sockaddr_storage rem_addr; + struct sockaddr_storage mapped_rem_addr; +}; + +/** + * iwpm_init - Allocate resources for the iwarp port mapper + * + * Should be called when network interface goes up. + */ +int iwpm_init(u8); + +/** + * iwpm_exit - Deallocate resources for the iwarp port mapper + * + * Should be called when network interface goes down. + */ +int iwpm_exit(u8); + +/** + * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid + * + * Returns true if the pid is greater than zero, otherwise returns false + */ +int iwpm_valid_pid(void); + +/** + * iwpm_register_pid - Send a netlink query to userspace + * to get the iwarp port mapper pid + * @pm_msg: Contains driver info to send to the userspace port mapper + * @nl_client: The index of the netlink client + */ +int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client); + +/** + * iwpm_add_mapping - Send a netlink add mapping request to + * the userspace port mapper + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * + * If the request is successful, the pm_msg stores + * the port mapper response (mapped address info) + */ +int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client); + +/** + * iwpm_add_and_query_mapping - Send a netlink add and query mapping request + * to the userspace port mapper + * @pm_msg: Contains the local and remote ip/tcp address info to send + * @nl_client: The index of the netlink client + * + * If the request is successful, the pm_msg stores the + * port mapper response (mapped local and remote address info) + */ +int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client); + +/** + * iwpm_remove_mapping - Send a netlink remove mapping request + * to the userspace port mapper + * + * @local_addr: Local ip/tcp address to remove + * @nl_client: The index of the netlink client + */ +int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client); + +/** + * iwpm_register_pid_cb - Process the port mapper response to + * iwpm_register_pid query + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * If successful, the function receives the userspace port mapper pid + * which 
is used in future communication with the port mapper + */ +int iwpm_register_pid_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_add_mapping_cb - Process the port mapper response to + * iwpm_add_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_add_mapping_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_add_and_query_mapping_cb - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_add_and_query_mapping_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_remote_info_cb - Process remote connecting peer address info, which + * the port mapper has received from the connecting peer + * + * @cb: Contains the received message (payload and netlink header) + * + * Stores the IPv4/IPv6 address info in a hash table + */ +int iwpm_remote_info_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_mapping_error_cb - Process port mapper notification for error + * + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_mapping_error_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_mapping_info_cb - Process a notification that the userspace + * port mapper daemon is started + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send all the local mapping + * info records to the userspace port mapper + */ +int iwpm_mapping_info_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_ack_mapping_info_cb - Process the port mapper ack for + * the provided local mapping info records + * @skb: + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_ack_mapping_info_cb(struct sk_buff *, struct netlink_callback *); + +/** + * iwpm_get_remote_info - Get the remote connecting peer address info + * + * @mapped_loc_addr: Mapped local address of the listening peer + * @mapped_rem_addr: Mapped remote address of the connecting peer + * @remote_addr: To store the remote address of the connecting peer + * @nl_client: The index of the netlink client + * + * The remote address info is retrieved and provided to the client in + * the remote_addr. After that it is removed from the hash table + */ +int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, + struct sockaddr_storage *mapped_rem_addr, + struct sockaddr_storage *remote_addr, u8 nl_client); + +/** + * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address + * info in a hash table + * @local_addr: Local ip/tcp address + * @mapped_addr: Mapped local ip/tcp address + * @nl_client: The index of the netlink client + */ +int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, + struct sockaddr_storage *mapped_addr, u8 nl_client); + +/** + * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address + * info from the hash table + * @local_addr: Local ip/tcp address + * @mapped_addr: Mapped local ip/tcp address + * + * Returns err code if mapping info is not found in the hash table, + * otherwise returns 0 + */ +int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr, + struct sockaddr_storage *mapped_addr); + +#endif /* _IW_PORTMAP_H */ diff --git a/include/rdma/mr_pool.h b/include/rdma/mr_pool.h new file mode 100644 index 0000000..986010b --- /dev/null +++ b/include/rdma/mr_pool.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. 
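The registration half of this API is typically driven from a provider's init path: iwpm_init() once for the netlink client, then iwpm_register_pid() to locate the userspace daemon. A loose sketch under those assumptions; MY_NL_CLIENT stands in for the driver's real netlink client index, and "ibdev"/"netdev" come from the caller.

/* Loose sketch: register a provider with the userspace port mapper.
 * MY_NL_CLIENT is a placeholder for the driver's netlink client index.
 */
struct iwpm_dev_data pm_msg = {};
int ret;

ret = iwpm_init(MY_NL_CLIENT);
if (ret)
        return ret;

strscpy(pm_msg.dev_name, ibdev->name, IWPM_DEVNAME_SIZE);
strscpy(pm_msg.if_name, netdev->name, IWPM_IFNAME_SIZE);

ret = iwpm_register_pid(&pm_msg, MY_NL_CLIENT);
if (ret)
        pr_debug("port mapper not available yet (%d)\n", ret);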
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#ifndef _RDMA_MR_POOL_H +#define _RDMA_MR_POOL_H 1 + +#include + +struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list); +void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr); + +int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, + enum ib_mr_type type, u32 max_num_sg); +void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list); + +#endif /* _RDMA_MR_POOL_H */ diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h new file mode 100644 index 0000000..2bbb7a6 --- /dev/null +++ b/include/rdma/opa_addr.h @@ -0,0 +1,133 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef OPA_ADDR_H +#define OPA_ADDR_H + +#include + +#define OPA_SPECIAL_OUI (0x00066AULL) +#define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x))) +#define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \ + ? 
0 : x) +#define OPA_GID_INDEX 0x1 +/** + * 0xF8 - 4 bits of multicast range and 1 bit for collective range + * Example: For 24 bit LID space, + * Multicast range: 0xF00000 to 0xF7FFFF + * Collective range: 0xF80000 to 0xFFFFFE + */ +#define OPA_MCAST_NR 0x4 /* Number of top bits set */ +#define OPA_COLLECTIVE_NR 0x1 /* Number of bits after MCAST_NR */ + +/** + * ib_is_opa_gid: Returns true if the top 24 bits of the gid + * contains the OPA_STL_OUI identifier. This identifies that + * the provided gid is a special purpose GID meant to carry + * extended LID information. + * + * @gid: The Global identifier + */ +static inline bool ib_is_opa_gid(const union ib_gid *gid) +{ + return ((be64_to_cpu(gid->global.interface_id) >> 40) == + OPA_SPECIAL_OUI); +} + +/** + * opa_get_lid_from_gid: Returns the last 32 bits of the gid. + * OPA devices use one of the gids in the gid table to also + * store the lid. + * + * @gid: The Global identifier + */ +static inline u32 opa_get_lid_from_gid(const union ib_gid *gid) +{ + return be64_to_cpu(gid->global.interface_id) & 0xFFFFFFFF; +} + +/** + * opa_is_extended_lid: Returns true if dlid or slid are + * extended. + * + * @dlid: The DLID + * @slid: The SLID + */ +static inline bool opa_is_extended_lid(__be32 dlid, __be32 slid) +{ + if ((be32_to_cpu(dlid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (be32_to_cpu(slid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE))) + return true; + + return false; +} + +/* Get multicast lid base */ +static inline u32 opa_get_mcast_base(u32 nr_top_bits) +{ + return (be32_to_cpu(OPA_LID_PERMISSIVE) << (32 - nr_top_bits)); +} + +/* Check for a valid unicast LID for non-SM traffic types */ +static inline bool rdma_is_valid_unicast_lid(struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_IB) { + if (!rdma_ah_get_dlid(attr) || + rdma_ah_get_dlid(attr) >= + be32_to_cpu(IB_MULTICAST_LID_BASE)) + return false; + } else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) { + if (!rdma_ah_get_dlid(attr) || + rdma_ah_get_dlid(attr) >= + opa_get_mcast_base(OPA_MCAST_NR)) + return false; + } + return true; +} +#endif /* OPA_ADDR_H */ diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h new file mode 100644 index 0000000..b4f0ac0 --- /dev/null +++ b/include/rdma/opa_port_info.h @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2014-2017 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
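ib_is_opa_gid() and opa_get_lid_from_gid() above let common code recover the 32-bit extended LID that OPA encodes into a GID. A short sketch of that check; "gid" and "dlid" are assumed to be supplied by the caller, and ib_lid_cpu16() is declared in ib_verbs.h earlier in this patch.

/* Minimal sketch: derive the destination LID for an address whose GID
 * may carry an OPA extended LID.
 */
u32 lid;

if (ib_is_opa_gid(&gid))
        lid = opa_get_lid_from_gid(&gid);       /* full 32-bit OPA LID */
else
        lid = ib_lid_cpu16(dlid);               /* plain 16-bit IB LID */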
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(OPA_PORT_INFO_H) +#define OPA_PORT_INFO_H + +#define OPA_PORT_LINK_MODE_NOP 0 /* No change */ +#define OPA_PORT_LINK_MODE_OPA 4 /* Port mode is OPA */ + +#define OPA_PORT_PACKET_FORMAT_NOP 0 /* No change */ +#define OPA_PORT_PACKET_FORMAT_8B 1 /* Format 8B */ +#define OPA_PORT_PACKET_FORMAT_9B 2 /* Format 9B */ +#define OPA_PORT_PACKET_FORMAT_10B 4 /* Format 10B */ +#define OPA_PORT_PACKET_FORMAT_16B 8 /* Format 16B */ + +#define OPA_PORT_LTP_CRC_MODE_NONE 0 /* No change */ +#define OPA_PORT_LTP_CRC_MODE_14 1 /* 14-bit LTP CRC mode (optional) */ +#define OPA_PORT_LTP_CRC_MODE_16 2 /* 16-bit LTP CRC mode */ +#define OPA_PORT_LTP_CRC_MODE_48 4 /* 48-bit LTP CRC mode (optional) */ +#define OPA_PORT_LTP_CRC_MODE_PER_LANE 8 /* 12/16-bit per lane LTP CRC mode */ + +/* Link Down / Neighbor Link Down Reason; indicated as follows: */ +#define OPA_LINKDOWN_REASON_NONE 0 /* No specified reason */ +#define OPA_LINKDOWN_REASON_RCV_ERROR_0 1 +#define OPA_LINKDOWN_REASON_BAD_PKT_LEN 2 +#define OPA_LINKDOWN_REASON_PKT_TOO_LONG 3 +#define OPA_LINKDOWN_REASON_PKT_TOO_SHORT 4 +#define OPA_LINKDOWN_REASON_BAD_SLID 5 +#define OPA_LINKDOWN_REASON_BAD_DLID 6 +#define OPA_LINKDOWN_REASON_BAD_L2 7 +#define OPA_LINKDOWN_REASON_BAD_SC 8 +#define OPA_LINKDOWN_REASON_RCV_ERROR_8 9 +#define OPA_LINKDOWN_REASON_BAD_MID_TAIL 10 +#define OPA_LINKDOWN_REASON_RCV_ERROR_10 11 +#define OPA_LINKDOWN_REASON_PREEMPT_ERROR 12 +#define OPA_LINKDOWN_REASON_PREEMPT_VL15 13 +#define OPA_LINKDOWN_REASON_BAD_VL_MARKER 14 +#define OPA_LINKDOWN_REASON_RCV_ERROR_14 15 +#define OPA_LINKDOWN_REASON_RCV_ERROR_15 16 +#define OPA_LINKDOWN_REASON_BAD_HEAD_DIST 17 +#define OPA_LINKDOWN_REASON_BAD_TAIL_DIST 18 +#define OPA_LINKDOWN_REASON_BAD_CTRL_DIST 19 +#define OPA_LINKDOWN_REASON_BAD_CREDIT_ACK 20 +#define OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER 21 +#define OPA_LINKDOWN_REASON_BAD_PREEMPT 22 +#define OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT 23 +#define OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT 24 +#define OPA_LINKDOWN_REASON_RCV_ERROR_24 25 +#define OPA_LINKDOWN_REASON_RCV_ERROR_25 26 +#define OPA_LINKDOWN_REASON_RCV_ERROR_26 27 +#define OPA_LINKDOWN_REASON_RCV_ERROR_27 28 +#define OPA_LINKDOWN_REASON_RCV_ERROR_28 29 +#define OPA_LINKDOWN_REASON_RCV_ERROR_29 30 +#define OPA_LINKDOWN_REASON_RCV_ERROR_30 31 +#define OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN 32 +#define OPA_LINKDOWN_REASON_UNKNOWN 33 +/* 34 -reserved */ +#define OPA_LINKDOWN_REASON_REBOOT 35 +#define OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN 36 +/* 37-38 reserved */ +#define OPA_LINKDOWN_REASON_FM_BOUNCE 39 +#define OPA_LINKDOWN_REASON_SPEED_POLICY 40 +#define OPA_LINKDOWN_REASON_WIDTH_POLICY 41 +/* 42-48 reserved */ +#define OPA_LINKDOWN_REASON_DISCONNECTED 49 +#define OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED 50 +#define OPA_LINKDOWN_REASON_NOT_INSTALLED 51 +#define OPA_LINKDOWN_REASON_CHASSIS_CONFIG 52 +/* 53 reserved */ +#define OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED 54 +/* 55 reserved */ +#define OPA_LINKDOWN_REASON_POWER_POLICY 56 +#define OPA_LINKDOWN_REASON_LINKSPEED_POLICY 57 +#define OPA_LINKDOWN_REASON_LINKWIDTH_POLICY 58 +/* 59 reserved */ +#define OPA_LINKDOWN_REASON_SWITCH_MGMT 60 +#define OPA_LINKDOWN_REASON_SMA_DISABLED 61 +/* 62 reserved */ +#define 
OPA_LINKDOWN_REASON_TRANSIENT 63 +/* 64-255 reserved */ + +/* OPA Link Init reason; indicated as follows: */ +/* 3-7; 11-15 reserved; 8-15 cleared on Polling->LinkUp */ +#define OPA_LINKINIT_REASON_NOP 0 +#define OPA_LINKINIT_REASON_LINKUP (1 << 4) +#define OPA_LINKINIT_REASON_FLAPPING (2 << 4) +#define OPA_LINKINIT_REASON_CLEAR (8 << 4) +#define OPA_LINKINIT_OUTSIDE_POLICY (8 << 4) +#define OPA_LINKINIT_QUARANTINED (9 << 4) +#define OPA_LINKINIT_INSUFIC_CAPABILITY (10 << 4) + +#define OPA_LINK_SPEED_NOP 0x0000 /* Reserved (1-5 Gbps) */ +#define OPA_LINK_SPEED_12_5G 0x0001 /* 12.5 Gbps */ +#define OPA_LINK_SPEED_25G 0x0002 /* 25.78125? Gbps (EDR) */ + +#define OPA_LINK_WIDTH_1X 0x0001 +#define OPA_LINK_WIDTH_2X 0x0002 +#define OPA_LINK_WIDTH_3X 0x0004 +#define OPA_LINK_WIDTH_4X 0x0008 + +#define OPA_CAP_MASK3_IsEthOnFabricSupported (1 << 13) +#define OPA_CAP_MASK3_IsSnoopSupported (1 << 7) +#define OPA_CAP_MASK3_IsAsyncSC2VLSupported (1 << 6) +#define OPA_CAP_MASK3_IsAddrRangeConfigSupported (1 << 5) +#define OPA_CAP_MASK3_IsPassThroughSupported (1 << 4) +#define OPA_CAP_MASK3_IsSharedSpaceSupported (1 << 3) +/* reserved (1 << 2) */ +#define OPA_CAP_MASK3_IsVLMarkerSupported (1 << 1) +#define OPA_CAP_MASK3_IsVLrSupported (1 << 0) + +/** + * new MTU values + */ +enum { + OPA_MTU_8192 = 6, + OPA_MTU_10240 = 7, +}; + +enum { + OPA_PORT_PHYS_CONF_DISCONNECTED = 0, + OPA_PORT_PHYS_CONF_STANDARD = 1, + OPA_PORT_PHYS_CONF_FIXED = 2, + OPA_PORT_PHYS_CONF_VARIABLE = 3, + OPA_PORT_PHYS_CONF_SI_PHOTO = 4 +}; + +enum port_info_field_masks { + /* vl.cap */ + OPA_PI_MASK_VL_CAP = 0x1F, + /* port_states.ledenable_offlinereason */ + OPA_PI_MASK_OFFLINE_REASON = 0x0F, + OPA_PI_MASK_LED_ENABLE = 0x40, + /* port_states.unsleepstate_downdefstate */ + OPA_PI_MASK_UNSLEEP_STATE = 0xF0, + OPA_PI_MASK_DOWNDEF_STATE = 0x0F, + /* port_states.portphysstate_portstate */ + OPA_PI_MASK_PORT_PHYSICAL_STATE = 0xF0, + OPA_PI_MASK_PORT_STATE = 0x0F, + /* port_phys_conf */ + OPA_PI_MASK_PORT_PHYSICAL_CONF = 0x0F, + /* collectivemask_multicastmask */ + OPA_PI_MASK_COLLECT_MASK = 0x38, + OPA_PI_MASK_MULTICAST_MASK = 0x07, + /* mkeyprotect_lmc */ + OPA_PI_MASK_MKEY_PROT_BIT = 0xC0, + OPA_PI_MASK_LMC = 0x0F, + /* smsl */ + OPA_PI_MASK_SMSL = 0x1F, + /* partenforce_filterraw */ + /* Filter Raw In/Out bits 1 and 2 were removed */ + OPA_PI_MASK_LINKINIT_REASON = 0xF0, + OPA_PI_MASK_PARTITION_ENFORCE_IN = 0x08, + OPA_PI_MASK_PARTITION_ENFORCE_OUT = 0x04, + /* operational_vls */ + OPA_PI_MASK_OPERATIONAL_VL = 0x1F, + /* sa_qp */ + OPA_PI_MASK_SA_QP = 0x00FFFFFF, + /* sm_trap_qp */ + OPA_PI_MASK_SM_TRAP_QP = 0x00FFFFFF, + /* localphy_overrun_errors */ + OPA_PI_MASK_LOCAL_PHY_ERRORS = 0xF0, + OPA_PI_MASK_OVERRUN_ERRORS = 0x0F, + /* clientrereg_subnettimeout */ + OPA_PI_MASK_CLIENT_REREGISTER = 0x80, + OPA_PI_MASK_SUBNET_TIMEOUT = 0x1F, + /* port_link_mode */ + OPA_PI_MASK_PORT_LINK_SUPPORTED = (0x001F << 10), + OPA_PI_MASK_PORT_LINK_ENABLED = (0x001F << 5), + OPA_PI_MASK_PORT_LINK_ACTIVE = (0x001F << 0), + /* port_link_crc_mode */ + OPA_PI_MASK_PORT_LINK_CRC_SUPPORTED = 0x0F00, + OPA_PI_MASK_PORT_LINK_CRC_ENABLED = 0x00F0, + OPA_PI_MASK_PORT_LINK_CRC_ACTIVE = 0x000F, + /* port_mode */ + OPA_PI_MASK_PORT_MODE_SECURITY_CHECK = 0x0001, + OPA_PI_MASK_PORT_MODE_16B_TRAP_QUERY = 0x0002, + OPA_PI_MASK_PORT_MODE_PKEY_CONVERT = 0x0004, + OPA_PI_MASK_PORT_MODE_SC2SC_MAPPING = 0x0008, + OPA_PI_MASK_PORT_MODE_VL_MARKER = 0x0010, + OPA_PI_MASK_PORT_PASS_THROUGH = 0x0020, + OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE = 0x0040, + /* flit_control.interleave */ 
+ OPA_PI_MASK_INTERLEAVE_DIST_SUP = (0x0003 << 12), + OPA_PI_MASK_INTERLEAVE_DIST_ENABLE = (0x0003 << 10), + OPA_PI_MASK_INTERLEAVE_MAX_NEST_TX = (0x001F << 5), + OPA_PI_MASK_INTERLEAVE_MAX_NEST_RX = (0x001F << 0), + + /* port_error_action */ + OPA_PI_MASK_EX_BUFFER_OVERRUN = 0x80000000, + /* 7 bits reserved */ + OPA_PI_MASK_FM_CFG_ERR_EXCEED_MULTICAST_LIMIT = 0x00800000, + OPA_PI_MASK_FM_CFG_BAD_CONTROL_FLIT = 0x00400000, + OPA_PI_MASK_FM_CFG_BAD_PREEMPT = 0x00200000, + OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER = 0x00100000, + OPA_PI_MASK_FM_CFG_BAD_CRDT_ACK = 0x00080000, + OPA_PI_MASK_FM_CFG_BAD_CTRL_DIST = 0x00040000, + OPA_PI_MASK_FM_CFG_BAD_TAIL_DIST = 0x00020000, + OPA_PI_MASK_FM_CFG_BAD_HEAD_DIST = 0x00010000, + /* 2 bits reserved */ + OPA_PI_MASK_PORT_RCV_BAD_VL_MARKER = 0x00002000, + OPA_PI_MASK_PORT_RCV_PREEMPT_VL15 = 0x00001000, + OPA_PI_MASK_PORT_RCV_PREEMPT_ERROR = 0x00000800, + /* 1 bit reserved */ + OPA_PI_MASK_PORT_RCV_BAD_MidTail = 0x00000200, + /* 1 bit reserved */ + OPA_PI_MASK_PORT_RCV_BAD_SC = 0x00000080, + OPA_PI_MASK_PORT_RCV_BAD_L2 = 0x00000040, + OPA_PI_MASK_PORT_RCV_BAD_DLID = 0x00000020, + OPA_PI_MASK_PORT_RCV_BAD_SLID = 0x00000010, + OPA_PI_MASK_PORT_RCV_PKTLEN_TOOSHORT = 0x00000008, + OPA_PI_MASK_PORT_RCV_PKTLEN_TOOLONG = 0x00000004, + OPA_PI_MASK_PORT_RCV_BAD_PKTLEN = 0x00000002, + OPA_PI_MASK_PORT_RCV_BAD_LT = 0x00000001, + + /* pass_through.res_drctl */ + OPA_PI_MASK_PASS_THROUGH_DR_CONTROL = 0x01, + + /* buffer_units */ + OPA_PI_MASK_BUF_UNIT_VL15_INIT = (0x00000FFF << 11), + OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE = (0x0000001F << 6), + OPA_PI_MASK_BUF_UNIT_CREDIT_ACK = (0x00000003 << 3), + OPA_PI_MASK_BUF_UNIT_BUF_ALLOC = (0x00000003 << 0), + + /* neigh_mtu.pvlx_to_mtu */ + OPA_PI_MASK_NEIGH_MTU_PVL0 = 0xF0, + OPA_PI_MASK_NEIGH_MTU_PVL1 = 0x0F, + + /* neigh_mtu.vlstall_hoq_life */ + OPA_PI_MASK_VL_STALL = (0x03 << 5), + OPA_PI_MASK_HOQ_LIFE = (0x1F << 0), + + /* port_neigh_mode */ + OPA_PI_MASK_NEIGH_MGMT_ALLOWED = (0x01 << 3), + OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS = (0x01 << 2), + OPA_PI_MASK_NEIGH_NODE_TYPE = (0x03 << 0), + + /* resptime_value */ + OPA_PI_MASK_RESPONSE_TIME_VALUE = 0x1F, + + /* mtucap */ + OPA_PI_MASK_MTU_CAP = 0x0F, +}; + +struct opa_port_states { + u8 reserved; + u8 ledenable_offlinereason; /* 1 res, 1 bit, 6 bits */ + u8 reserved2; + u8 portphysstate_portstate; /* 4 bits, 4 bits */ +}; + +struct opa_port_state_info { + struct opa_port_states port_states; + __be16 link_width_downgrade_tx_active; + __be16 link_width_downgrade_rx_active; +}; + +struct opa_port_info { + __be32 lid; + __be32 flow_control_mask; + + struct { + u8 res; /* was inittype */ + u8 cap; /* 3 res, 5 bits */ + __be16 high_limit; + __be16 preempt_limit; + u8 arb_high_cap; + u8 arb_low_cap; + } vl; + + struct opa_port_states port_states; + u8 port_phys_conf; /* 4 res, 4 bits */ + u8 collectivemask_multicastmask; /* 2 res, 3, 3 */ + u8 mkeyprotect_lmc; /* 2 bits, 2 res, 4 bits */ + u8 smsl; /* 3 res, 5 bits */ + + u8 partenforce_filterraw; /* bit fields */ + u8 operational_vls; /* 3 res, 5 bits */ + __be16 pkey_8b; + __be16 pkey_10b; + __be16 mkey_violations; + + __be16 pkey_violations; + __be16 qkey_violations; + __be32 sm_trap_qp; /* 8 bits, 24 bits */ + + __be32 sa_qp; /* 8 bits, 24 bits */ + u8 neigh_port_num; + u8 link_down_reason; + u8 neigh_link_down_reason; + u8 clientrereg_subnettimeout; /* 1 bit, 2 bits, 5 */ + + struct { + __be16 supported; + __be16 enabled; + __be16 active; + } link_speed; + struct { + __be16 supported; + __be16 enabled; + __be16 active; + } 
link_width; + struct { + __be16 supported; + __be16 enabled; + __be16 tx_active; + __be16 rx_active; + } link_width_downgrade; + __be16 port_link_mode; /* 1 res, 5 bits, 5 bits, 5 bits */ + __be16 port_ltp_crc_mode; /* 4 res, 4 bits, 4 bits, 4 bits */ + + __be16 port_mode; /* 9 res, bit fields */ + struct { + __be16 supported; + __be16 enabled; + } port_packet_format; + struct { + __be16 interleave; /* 2 res, 2,2,5,5 */ + struct { + __be16 min_initial; + __be16 min_tail; + u8 large_pkt_limit; + u8 small_pkt_limit; + u8 max_small_pkt_limit; + u8 preemption_limit; + } preemption; + } flit_control; + + __be32 reserved4; + __be32 port_error_action; /* bit field */ + + struct { + u8 egress_port; + u8 res_drctl; /* 7 res, 1 */ + } pass_through; + __be16 mkey_lease_period; + __be32 buffer_units; /* 9 res, 12, 5, 3, 3 */ + + __be32 reserved5; + __be32 sm_lid; + + __be64 mkey; + + __be64 subnet_prefix; + + struct { + u8 pvlx_to_mtu[OPA_MAX_VLS/2]; /* 4 bits, 4 bits */ + } neigh_mtu; + + struct { + u8 vlstall_hoqlife; /* 3 bits, 5 bits */ + } xmit_q[OPA_MAX_VLS]; + + struct { + u8 addr[16]; + } ipaddr_ipv6; + + struct { + u8 addr[4]; + } ipaddr_ipv4; + + u32 reserved6; + u32 reserved7; + u32 reserved8; + + __be64 neigh_node_guid; + + __be32 ib_cap_mask; + __be16 reserved9; /* was ib_cap_mask2 */ + __be16 opa_cap_mask; + + __be32 reserved10; /* was link_roundtrip_latency */ + __be16 overall_buffer_space; + __be16 reserved11; /* was max_credit_hint */ + + __be16 diag_code; + struct { + u8 buffer; + u8 wire; + } replay_depth; + u8 port_neigh_mode; + u8 mtucap; /* 4 res, 4 bits */ + + u8 resptimevalue; /* 3 res, 5 bits */ + u8 local_port_num; + u8 reserved12; + u8 reserved13; /* was guid_cap */ +} __attribute__ ((packed)); + +#endif /* OPA_PORT_INFO_H */ diff --git a/include/rdma/opa_smi.h b/include/rdma/opa_smi.h new file mode 100644 index 0000000..f789611 --- /dev/null +++ b/include/rdma/opa_smi.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
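struct opa_port_info above packs several sub-byte fields; the port_info_field_masks values are what consumers use to pull them apart. A short sketch of decoding the LID, the port states and the LMC from a PortInfo reply, where "pi" is assumed to point at a response buffer.

/* Minimal sketch: decode a few packed opa_port_info fields. */
u32 lid        = be32_to_cpu(pi->lid);
u8  port_state = pi->port_states.portphysstate_portstate &
                 OPA_PI_MASK_PORT_STATE;
u8  phys_state = (pi->port_states.portphysstate_portstate &
                  OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
u8  lmc        = pi->mkeyprotect_lmc & OPA_PI_MASK_LMC;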
+ */ + +#if !defined(OPA_SMI_H) +#define OPA_SMI_H + +#include +#include + +#define OPA_SMP_LID_DATA_SIZE 2016 +#define OPA_SMP_DR_DATA_SIZE 1872 +#define OPA_SMP_MAX_PATH_HOPS 64 + +#define OPA_MAX_VLS 32 +#define OPA_MAX_SLS 32 +#define OPA_MAX_SCS 32 + +#define OPA_LID_PERMISSIVE cpu_to_be32(0xFFFFFFFF) + +struct opa_smp { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + u8 hop_ptr; + u8 hop_cnt; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 mkey; + union { + struct { + uint8_t data[OPA_SMP_LID_DATA_SIZE]; + } lid; + struct { + __be32 dr_slid; + __be32 dr_dlid; + u8 initial_path[OPA_SMP_MAX_PATH_HOPS]; + u8 return_path[OPA_SMP_MAX_PATH_HOPS]; + u8 reserved[8]; + u8 data[OPA_SMP_DR_DATA_SIZE]; + } dr; + } route; +} __packed; + + +/* Subnet management attributes */ +/* ... */ +#define OPA_ATTRIB_ID_NODE_DESCRIPTION cpu_to_be16(0x0010) +#define OPA_ATTRIB_ID_NODE_INFO cpu_to_be16(0x0011) +#define OPA_ATTRIB_ID_PORT_INFO cpu_to_be16(0x0015) +#define OPA_ATTRIB_ID_PARTITION_TABLE cpu_to_be16(0x0016) +#define OPA_ATTRIB_ID_SL_TO_SC_MAP cpu_to_be16(0x0017) +#define OPA_ATTRIB_ID_VL_ARBITRATION cpu_to_be16(0x0018) +#define OPA_ATTRIB_ID_SM_INFO cpu_to_be16(0x0020) +#define OPA_ATTRIB_ID_CABLE_INFO cpu_to_be16(0x0032) +#define OPA_ATTRIB_ID_AGGREGATE cpu_to_be16(0x0080) +#define OPA_ATTRIB_ID_SC_TO_SL_MAP cpu_to_be16(0x0082) +#define OPA_ATTRIB_ID_SC_TO_VLR_MAP cpu_to_be16(0x0083) +#define OPA_ATTRIB_ID_SC_TO_VLT_MAP cpu_to_be16(0x0084) +#define OPA_ATTRIB_ID_SC_TO_VLNT_MAP cpu_to_be16(0x0085) +/* ... */ +#define OPA_ATTRIB_ID_PORT_STATE_INFO cpu_to_be16(0x0087) +/* ... */ +#define OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE cpu_to_be16(0x008A) +/* ... */ + +struct opa_node_description { + u8 data[64]; +} __attribute__ ((packed)); + +struct opa_node_info { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be32 reserved; + __be64 system_image_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; /* network byte order */ +} __attribute__ ((packed)); + +#define OPA_PARTITION_TABLE_BLK_SIZE 32 + +static inline u8 +opa_get_smp_direction(struct opa_smp *smp) +{ + return ib_get_smp_direction((struct ib_smp *)smp); +} + +static inline u8 *opa_get_smp_data(struct opa_smp *smp) +{ + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return smp->route.dr.data; + + return smp->route.lid.data; +} + +static inline size_t opa_get_smp_data_size(struct opa_smp *smp) +{ + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return sizeof(smp->route.dr.data); + + return sizeof(smp->route.lid.data); +} + +static inline size_t opa_get_smp_header_size(struct opa_smp *smp) +{ + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return sizeof(*smp) - sizeof(smp->route.dr.data); + + return sizeof(*smp) - sizeof(smp->route.lid.data); +} + +#endif /* OPA_SMI_H */ diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h new file mode 100644 index 0000000..0c07a70 --- /dev/null +++ b/include/rdma/opa_vnic.h @@ -0,0 +1,138 @@ +#ifndef _OPA_VNIC_H +#define _OPA_VNIC_H +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. 
+ * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains Intel Omni-Path (OPA) Virtual Network Interface + * Controller (VNIC) specific declarations. 
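+ *
+ * As a minimal sketch, given a struct net_device created as an OPA VNIC
+ * rdma netdev ("netdev" and "struct my_vnic_priv" below are illustrative
+ * placeholders), the two private areas are reached with the helpers
+ * declared in this header:
+ *
+ *        void *clnt_priv = opa_vnic_priv(netdev);        // rn.clnt_priv
+ *        struct my_vnic_priv *priv = opa_vnic_dev_priv(netdev);
+ *        // dev_priv is the storage that follows struct opa_vnic_rdma_netdev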
+ */ + +#include + +/* 16 header bytes + 2 reserved bytes */ +#define OPA_VNIC_L2_HDR_LEN (16 + 2) + +#define OPA_VNIC_L4_HDR_LEN 2 + +#define OPA_VNIC_HDR_LEN (OPA_VNIC_L2_HDR_LEN + \ + OPA_VNIC_L4_HDR_LEN) + +#define OPA_VNIC_L4_ETHR 0x78 + +#define OPA_VNIC_ICRC_LEN 4 +#define OPA_VNIC_TAIL_LEN 1 +#define OPA_VNIC_ICRC_TAIL_LEN (OPA_VNIC_ICRC_LEN + OPA_VNIC_TAIL_LEN) + +#define OPA_VNIC_SKB_MDATA_LEN 4 +#define OPA_VNIC_SKB_MDATA_ENCAP_ERR 0x1 + +/* opa vnic rdma netdev's private data structure */ +struct opa_vnic_rdma_netdev { + struct rdma_netdev rn; /* keep this first */ + /* followed by device private data */ + char *dev_priv[0]; +}; + +static inline void *opa_vnic_priv(const struct net_device *dev) +{ + struct rdma_netdev *rn = netdev_priv(dev); + + return rn->clnt_priv; +} + +static inline void *opa_vnic_dev_priv(const struct net_device *dev) +{ + struct opa_vnic_rdma_netdev *oparn = netdev_priv(dev); + + return oparn->dev_priv; +} + +/* opa_vnic skb meta data structrue */ +struct opa_vnic_skb_mdata { + u8 vl; + u8 entropy; + u8 flags; + u8 rsvd; +} __packed; + +/* OPA VNIC group statistics */ +struct opa_vnic_grp_stats { + u64 unicast; + u64 mcastbcast; + u64 untagged; + u64 vlan; + u64 s_64; + u64 s_65_127; + u64 s_128_255; + u64 s_256_511; + u64 s_512_1023; + u64 s_1024_1518; + u64 s_1519_max; +}; + +struct opa_vnic_stats { + /* standard netdev statistics */ + struct rtnl_link_stats64 netstats; + + /* OPA VNIC statistics */ + struct opa_vnic_grp_stats tx_grp; + struct opa_vnic_grp_stats rx_grp; + u64 tx_dlid_zero; + u64 tx_drop_state; + u64 rx_drop_state; + u64 rx_runt; + u64 rx_oversize; +}; + +static inline bool rdma_cap_opa_vnic(struct ib_device *device) +{ + return !!(device->attrs.device_cap_flags & + IB_DEVICE_RDMA_NETDEV_OPA_VNIC); +} + +#endif /* _OPA_VNIC_H */ diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h new file mode 100644 index 0000000..6909347 --- /dev/null +++ b/include/rdma/rdma_cm.h @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(RDMA_CM_H) +#define RDMA_CM_H + +#include +#include +#include +#include +#include + +/* + * Upon receiving a device removal event, users must destroy the associated + * RDMA identifier and release all resources allocated with the device. + */ +enum rdma_cm_event_type { + RDMA_CM_EVENT_ADDR_RESOLVED, + RDMA_CM_EVENT_ADDR_ERROR, + RDMA_CM_EVENT_ROUTE_RESOLVED, + RDMA_CM_EVENT_ROUTE_ERROR, + RDMA_CM_EVENT_CONNECT_REQUEST, + RDMA_CM_EVENT_CONNECT_RESPONSE, + RDMA_CM_EVENT_CONNECT_ERROR, + RDMA_CM_EVENT_UNREACHABLE, + RDMA_CM_EVENT_REJECTED, + RDMA_CM_EVENT_ESTABLISHED, + RDMA_CM_EVENT_DISCONNECTED, + RDMA_CM_EVENT_DEVICE_REMOVAL, + RDMA_CM_EVENT_MULTICAST_JOIN, + RDMA_CM_EVENT_MULTICAST_ERROR, + RDMA_CM_EVENT_ADDR_CHANGE, + RDMA_CM_EVENT_TIMEWAIT_EXIT +}; + +const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event); + +#define RDMA_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL +#define RDMA_IB_IP_PS_TCP 0x0000000001060000ULL +#define RDMA_IB_IP_PS_UDP 0x0000000001110000ULL +#define RDMA_IB_IP_PS_IB 0x00000000013F0000ULL + +struct rdma_addr { + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; + struct rdma_dev_addr dev_addr; +}; + +struct rdma_route { + struct rdma_addr addr; + struct sa_path_rec *path_rec; + int num_paths; +}; + +struct rdma_conn_param { + const void *private_data; + u8 private_data_len; + u8 responder_resources; + u8 initiator_depth; + u8 flow_control; + u8 retry_count; /* ignored when accepting */ + u8 rnr_retry_count; + /* Fields below ignored if a QP is created on the rdma_cm_id. */ + u8 srq; + u32 qp_num; + u32 qkey; +}; + +struct rdma_ud_param { + const void *private_data; + u8 private_data_len; + struct rdma_ah_attr ah_attr; + u32 qp_num; + u32 qkey; +}; + +struct rdma_cm_event { + enum rdma_cm_event_type event; + int status; + union { + struct rdma_conn_param conn; + struct rdma_ud_param ud; + } param; +}; + +struct rdma_cm_id; + +/** + * rdma_cm_event_handler - Callback used to report user events. + * + * Notes: Users may not call rdma_destroy_id from this callback to destroy + * the passed in id, or a corresponding listen id. Returning a + * non-zero value from the callback will destroy the passed in id. + */ +typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id, + struct rdma_cm_event *event); + +struct rdma_cm_id { + struct ib_device *device; + void *context; + struct ib_qp *qp; + rdma_cm_event_handler event_handler; + struct rdma_route route; + enum rdma_ucm_port_space ps; + enum ib_qp_type qp_type; + u8 port_num; +}; + +struct rdma_cm_id *__rdma_create_id(struct net *net, + rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, + const char *caller); + +/** + * rdma_create_id - Create an RDMA identifier. + * + * @net: The network namespace in which to create the new id. + * @event_handler: User callback invoked to report events associated with the + * returned rdma_id. + * @context: User specified context associated with the id. + * @ps: RDMA port space. + * @qp_type: type of queue pair associated with the id. + * + * The id holds a reference on the network namespace until it is destroyed. + */ +#define rdma_create_id(net, event_handler, context, ps, qp_type) \ + __rdma_create_id((net), (event_handler), (context), (ps), (qp_type), \ + KBUILD_MODNAME) + +/** + * rdma_destroy_id - Destroys an RDMA identifier. + * + * @id: RDMA identifier. 
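+ *
+ * A minimal sketch of the id life cycle (the handler "my_cm_handler" and
+ * the context pointer "ctx" are caller supplied):
+ *
+ *        id = rdma_create_id(&init_net, my_cm_handler, ctx,
+ *                            RDMA_PS_TCP, IB_QPT_RC);
+ *        if (IS_ERR(id))
+ *                return PTR_ERR(id);
+ *        ...
+ *        rdma_destroy_id(id);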
+ * + * Note: calling this function has the effect of canceling in-flight + * asynchronous operations associated with the id. + */ +void rdma_destroy_id(struct rdma_cm_id *id); + +/** + * rdma_bind_addr - Bind an RDMA identifier to a source address and + * associated RDMA device, if needed. + * + * @id: RDMA identifier. + * @addr: Local address information. Wildcard values are permitted. + * + * This associates a source address with the RDMA identifier before calling + * rdma_listen. If a specific local address is given, the RDMA identifier will + * be bound to a local RDMA device. + */ +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_resolve_addr - Resolve destination and optional source addresses + * from IP addresses to an RDMA address. If successful, the specified + * rdma_cm_id will be bound to a local device. + * + * @id: RDMA identifier. + * @src_addr: Source address information. This parameter may be NULL. + * @dst_addr: Destination address information. + * @timeout_ms: Time to wait for resolution to complete. + */ +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms); + +/** + * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier + * into route information needed to establish a connection. + * + * This is called on the client side of a connection. + * Users must have first called rdma_resolve_addr to resolve a dst_addr + * into an RDMA address before calling this routine. + */ +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); + +/** + * rdma_create_qp - Allocate a QP and associate it with the specified RDMA + * identifier. + * + * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA + * through their states. + */ +int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr); + +/** + * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA + * identifier. + * + * Users must destroy any QP associated with an RDMA identifier before + * destroying the RDMA ID. + */ +void rdma_destroy_qp(struct rdma_cm_id *id); + +/** + * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning + * to a specified QP state. + * @id: Communication identifier associated with the QP attributes to + * initialize. + * @qp_attr: On input, specifies the desired QP state. On output, the + * mandatory and desired optional attributes will be set in order to + * modify the QP to the specified state. + * @qp_attr_mask: The QP attribute mask that may be used to transition the + * QP to the specified state. + * + * Users must set the @qp_attr->qp_state to the desired QP state. This call + * will set all required attributes for the given transition, along with + * known optional attributes. Users may override the attributes returned from + * this call before calling ib_modify_qp. + * + * Users that wish to have their QP automatically transitioned through its + * states can associate a QP with the rdma_cm_id by calling rdma_create_qp(). + */ +int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask); + +/** + * rdma_connect - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Users must have resolved a route for the rdma_cm_id to connect with + * by having called rdma_resolve_route before calling this routine. 
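+ *
+ * A rough sketch of the active-side sequence ("pd", "qp_init_attr",
+ * "conn_param" and the destination address are caller supplied; error
+ * handling and the event handler are elided):
+ *
+ *        rdma_resolve_addr(id, NULL, dst_addr, 2000);
+ *        // wait for RDMA_CM_EVENT_ADDR_RESOLVED
+ *        rdma_resolve_route(id, 2000);
+ *        // wait for RDMA_CM_EVENT_ROUTE_RESOLVED
+ *        rdma_create_qp(id, pd, &qp_init_attr);
+ *        rdma_connect(id, &conn_param);
+ *        // RDMA_CM_EVENT_ESTABLISHED (or REJECTED/UNREACHABLE) follows
+ *
+ * The passive side instead binds with rdma_bind_addr(), calls rdma_listen(),
+ * and answers RDMA_CM_EVENT_CONNECT_REQUEST events with rdma_accept().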
+ * + * This call will either connect to a remote QP or obtain remote QP + * information for unconnected rdma_cm_id's. The actual operation is + * based on the rdma_cm_id's port space. + */ +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_listen - This function is called by the passive side to + * listen for incoming connection requests. + * + * Users must have bound the rdma_cm_id to a local address by calling + * rdma_bind_addr before calling this routine. + */ +int rdma_listen(struct rdma_cm_id *id, int backlog); + +int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + const char *caller); + +/** + * rdma_accept - Called to accept a connection request or response. + * @id: Connection identifier associated with the request. + * @conn_param: Information needed to establish the connection. This must be + * provided if accepting a connection request. If accepting a connection + * response, this parameter must be NULL. + * + * Typically, this routine is only called by the listener to accept a connection + * request. It must also be called on the active side of a connection if the + * user is performing their own QP transitions. + * + * In the case of error, a reject message is sent to the remote side and the + * state of the qp associated with the id is modified to error, such that any + * previously posted receive buffers would be flushed. + */ +#define rdma_accept(id, conn_param) \ + __rdma_accept((id), (conn_param), KBUILD_MODNAME) + +/** + * rdma_notify - Notifies the RDMA CM of an asynchronous event that has + * occurred on the connection. + * @id: Connection identifier to transition to established. + * @event: Asynchronous event. + * + * This routine should be invoked by users to notify the CM of relevant + * communication events. Events that should be reported to the CM and + * when to report them are: + * + * IB_EVENT_COMM_EST - Used when a message is received on a connected + * QP before an RTU has been received. + */ +int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event); + +/** + * rdma_reject - Called to reject a connection request or response. + */ +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + u8 private_data_len); + +/** + * rdma_disconnect - This function disconnects the associated QP and + * transitions it into the error state. + */ +int rdma_disconnect(struct rdma_cm_id *id); + +/** + * rdma_join_multicast - Join the multicast group specified by the given + * address. + * @id: Communication identifier associated with the request. + * @addr: Multicast address identifying the group to join. + * @join_state: Multicast JoinState bitmap requested by port. + * Bitmap is based on IB_SA_MCMEMBER_REC_JOIN_STATE bits. + * @context: User-defined context associated with the join request, returned + * to the user through the private_data pointer in multicast events. + */ +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + u8 join_state, void *context); + +/** + * rdma_leave_multicast - Leave the multicast group specified by the given + * address. + */ +void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_set_service_type - Set the type of service associated with a + * connection identifier. + * @id: Communication identifier to associated with service type. + * @tos: Type of service. + * + * The type of service is interpretted as a differentiated service + * field (RFC 2474). 
The service type should be specified before + * performing route resolution, as existing communication on the + * connection identifier may be unaffected. The type of service + * requested may not be supported by the network to all destinations. + */ +void rdma_set_service_type(struct rdma_cm_id *id, int tos); + +/** + * rdma_set_reuseaddr - Allow the reuse of local addresses when binding + * the rdma_cm_id. + * @id: Communication identifier to configure. + * @reuse: Value indicating if the bound address is reusable. + * + * Reuse must be set before an address is bound to the id. + */ +int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); + +/** + * rdma_set_afonly - Specify that listens are restricted to the + * bound address family only. + * @id: Communication identifer to configure. + * @afonly: Value indicating if listens are restricted. + * + * Must be set before identifier is in the listening state. + */ +int rdma_set_afonly(struct rdma_cm_id *id, int afonly); + + /** + * rdma_get_service_id - Return the IB service ID for a specified address. + * @id: Communication identifier associated with the address. + * @addr: Address for the service ID. + */ +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_reject_msg - return a pointer to a reject message string. + * @id: Communication identifier that received the REJECT event. + * @reason: Value returned in the REJECT event status field. + */ +const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, + int reason); +/** + * rdma_is_consumer_reject - return true if the consumer rejected the connect + * request. + * @id: Communication identifier that received the REJECT event. + * @reason: Value returned in the REJECT event status field. + */ +bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason); + +/** + * rdma_consumer_reject_data - return the consumer reject private data and + * length, if any. + * @id: Communication identifier that received the REJECT event. + * @ev: RDMA CM reject event. + * @data_len: Pointer to the resulting length of the consumer data. + */ +const void *rdma_consumer_reject_data(struct rdma_cm_id *id, + struct rdma_cm_event *ev, u8 *data_len); + +/** + * rdma_read_gids - Return the SGID and DGID used for establishing + * connection. This can be used after rdma_resolve_addr() + * on client side. This can be use on new connection + * on server side. This is applicable to IB, RoCE, iWarp. + * If cm_id is not bound yet to the RDMA device, it doesn't + * copy and SGID or DGID to the given pointers. + * @id: Communication identifier whose GIDs are queried. + * @sgid: Pointer to SGID where SGID will be returned. It is optional. + * @dgid: Pointer to DGID where DGID will be returned. It is optional. + * Note: This API should not be used by any new ULPs or new code. + * Instead, users interested in querying GIDs should refer to path record + * of the rdma_cm_id to query the GIDs. + * This API is provided for compatibility for existing users. + */ + +void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, + union ib_gid *dgid); + +#endif /* RDMA_CM_H */ diff --git a/include/rdma/rdma_cm_ib.h b/include/rdma/rdma_cm_ib.h new file mode 100644 index 0000000..6a69d71 --- /dev/null +++ b/include/rdma/rdma_cm_ib.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RDMA_CM_IB_H) +#define RDMA_CM_IB_H + +#include + +/** + * rdma_set_ib_path - Manually sets the path record used to establish a + * connection. + * @id: Connection identifier associated with the request. + * @path_rec: Reference to the path record + * + * This call permits a user to specify routing information for rdma_cm_id's + * bound to InfiniBand devices. It is called on the client side of a + * connection and replaces the call to rdma_resolve_route. + */ +int rdma_set_ib_path(struct rdma_cm_id *id, + struct sa_path_rec *path_rec); + +/* Global qkey for UDP QPs and multicast groups. */ +#define RDMA_UDP_QKEY 0x01234567 + +#endif /* RDMA_CM_IB_H */ diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h new file mode 100644 index 0000000..c369703 --- /dev/null +++ b/include/rdma/rdma_netlink.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RDMA_NETLINK_H +#define _RDMA_NETLINK_H + + +#include +#include + +struct rdma_nl_cbs { + int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); + int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); + u8 flags; +}; + +enum rdma_nl_flags { + /* Require CAP_NET_ADMIN */ + RDMA_NL_ADMIN_PERM = 1 << 0, +}; + +/* Define this module as providing netlink services for NETLINK_RDMA, with + * index _index. Since the client indexes were setup in a uapi header as an + * enum and we do no want to change that, the user must supply the expanded + * constant as well and the compiler checks they are the same. + */ +#define MODULE_ALIAS_RDMA_NETLINK(_index, _val) \ + static inline void __chk_##_index(void) \ + { \ + BUILD_BUG_ON(_index != _val); \ + } \ + MODULE_ALIAS("rdma-netlink-subsys-" __stringify(_val)) + +/** + * Register client in RDMA netlink. + * @index: Index of the added client + * @cb_table: A table for op->callback + */ +void rdma_nl_register(unsigned int index, + const struct rdma_nl_cbs cb_table[]); + +/** + * Remove a client from IB netlink. + * @index: Index of the removed IB client. + */ +void rdma_nl_unregister(unsigned int index); + +/** + * Put a new message in a supplied skb. + * @skb: The netlink skb. + * @nlh: Pointer to put the header of the new netlink message. + * @seq: The message sequence number. 
+ * @len: The requested message length to allocate. + * @client: Calling IB netlink client. + * @op: message content op. + * Returns the allocated buffer on success and NULL on failure. + */ +void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, + int len, int client, int op, int flags); +/** + * Put a new attribute in a supplied skb. + * @skb: The netlink skb. + * @nlh: Header of the netlink message to append the attribute to. + * @len: The length of the attribute data. + * @data: The attribute data to put. + * @type: The attribute type. + * Returns the 0 and a negative error code on failure. + */ +int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, + int len, void *data, int type); + +/** + * Send the supplied skb to a specific userspace PID. + * @skb: The netlink skb + * @pid: Userspace netlink process ID + * Returns 0 on success or a negative error code. + */ +int rdma_nl_unicast(struct sk_buff *skb, u32 pid); + +/** + * Send, with wait/1 retry, the supplied skb to a specific userspace PID. + * @skb: The netlink skb + * @pid: Userspace netlink process ID + * Returns 0 on success or a negative error code. + */ +int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid); + +/** + * Send the supplied skb to a netlink group. + * @skb: The netlink skb + * @group: Netlink group ID + * @flags: allocation flags + * Returns 0 on success or a negative error code. + */ +int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); + +/** + * Check if there are any listeners to the netlink group + * @group: the netlink group ID + * Returns 0 on success or a negative for no listeners. + */ +int rdma_nl_chk_listeners(unsigned int group); +#endif /* _RDMA_NETLINK_H */ diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h new file mode 100644 index 0000000..3f4c187 --- /dev/null +++ b/include/rdma/rdma_vt.h @@ -0,0 +1,557 @@ +#ifndef DEF_RDMA_VT_H +#define DEF_RDMA_VT_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * Structure that low level drivers will populate in order to register with the + * rdmavt layer. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define RVT_MAX_PKEY_VALUES 16 + +#define RVT_MAX_TRAP_LEN 100 /* Limit pending trap list */ +#define RVT_MAX_TRAP_LISTS 5 /*((IB_NOTICE_TYPE_INFO & 0x0F) + 1)*/ +#define RVT_TRAP_TIMEOUT 4096 /* 4.096 usec */ + +struct trap_list { + u32 list_len; + struct list_head list; +}; + +struct rvt_ibport { + struct rvt_qp __rcu *qp[2]; + struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ + struct rb_root mcast_tree; + spinlock_t lock; /* protect changes in this struct */ + + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + unsigned long trap_timeout; + __be64 gid_prefix; /* in network order */ + __be64 mkey; + u64 tid; + u32 port_cap_flags; + u16 port_cap3_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 mkey_lease_period; + u32 sm_lid; + u8 sm_sl; + u8 mkeyprot; + u8 subnet_timeout; + u8 vl_high_limit; + + /* + * Driver is expected to keep these up to date. These + * counters are informational only and not required to be + * completely accurate. + */ + u64 n_rc_resends; + u64 n_seq_naks; + u64 n_rdma_seq; + u64 n_rnr_naks; + u64 n_other_naks; + u64 n_loop_pkts; + u64 n_pkt_drops; + u64 n_vl15_dropped; + u64 n_rc_timeouts; + u64 n_dmawait; + u64 n_unaligned; + u64 n_rc_dupreq; + u64 n_rc_seqnak; + u16 pkey_violations; + u16 qkey_violations; + u16 mkey_violations; + + /* Hot-path per CPU counters to avoid cacheline trading to update */ + u64 z_rc_acks; + u64 z_rc_qacks; + u64 z_rc_delayed_comp; + u64 __percpu *rc_acks; + u64 __percpu *rc_qacks; + u64 __percpu *rc_delayed_comp; + + void *priv; /* driver private data */ + + /* + * The pkey table is allocated and maintained by the driver. Drivers + * need to have access to this before registering with rdmav. However + * rdmavt will need access to it so drivers need to proviee this during + * the attach port API call. + */ + u16 *pkey_table; + + struct rvt_ah *sm_ah; + + /* + * Keep a list of traps that have not been repressed. They will be + * resent based on trap_timer. + */ + struct trap_list trap_lists[RVT_MAX_TRAP_LISTS]; + struct timer_list trap_timer; +}; + +#define RVT_CQN_MAX 16 /* maximum length of cq name */ + +/* + * Things that are driver specific, module parameters in hfi1 and qib + */ +struct rvt_driver_params { + struct ib_device_attr props; + + /* + * Anything driver specific that is not covered by props + * For instance special module parameters. Goes here. 
+ */ + unsigned int lkey_table_size; + unsigned int qp_table_size; + int qpn_start; + int qpn_inc; + int qpn_res_start; + int qpn_res_end; + int nports; + int npkeys; + char cq_name[RVT_CQN_MAX]; + int node; + int psn_mask; + int psn_shift; + int psn_modify_mask; + u32 core_cap_flags; + u32 max_mad_size; + u8 qos_shift; + u8 max_rdma_atomic; + u8 reserved_operations; +}; + +/* Protection domain */ +struct rvt_pd { + struct ib_pd ibpd; + bool user; +}; + +/* Address handle */ +struct rvt_ah { + struct ib_ah ibah; + struct rdma_ah_attr attr; + atomic_t refcount; + u8 vl; + u8 log_pmtu; +}; + +struct rvt_dev_info; +struct rvt_swqe; +struct rvt_driver_provided { + /* + * Which functions are required depends on which verbs rdmavt is + * providing and which verbs the driver is overriding. See + * check_support() for details. + */ + + /* hot path calldowns in a single cacheline */ + + /* + * Give the driver a notice that there is send work to do. It is up to + * the driver to generally push the packets out, this just queues the + * work with the driver. There are two variants here. The no_lock + * version requires the s_lock not to be held. The other assumes the + * s_lock is held. + */ + void (*schedule_send)(struct rvt_qp *qp); + void (*schedule_send_no_lock)(struct rvt_qp *qp); + + /* Driver specific work request checking */ + int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); + + /* + * Sometimes rdmavt needs to kick the driver's send progress. That is + * done by this call back. + */ + void (*do_send)(struct rvt_qp *qp); + + /* Passed to ib core registration. Callback to create syfs files */ + int (*port_callback)(struct ib_device *, u8, struct kobject *); + + /* + * Returns a pointer to the undelying hardware's PCI device. This is + * used to display information as to what hardware is being referenced + * in an output message + */ + struct pci_dev * (*get_pci_dev)(struct rvt_dev_info *rdi); + + /* + * Allocate a private queue pair data structure for driver specific + * information which is opaque to rdmavt. Errors are returned via + * ERR_PTR(err). The driver is free to return NULL or a valid + * pointer. + */ + void * (*qp_priv_alloc)(struct rvt_dev_info *rdi, struct rvt_qp *qp); + + /* + * Free the driver's private qp structure. + */ + void (*qp_priv_free)(struct rvt_dev_info *rdi, struct rvt_qp *qp); + + /* + * Inform the driver the particular qp in quesiton has been reset so + * that it can clean up anything it needs to. + */ + void (*notify_qp_reset)(struct rvt_qp *qp); + + /* + * Get a path mtu from the driver based on qp attributes. + */ + int (*get_pmtu_from_attr)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr); + + /* + * Notify driver that it needs to flush any outstanding IO requests that + * are waiting on a qp. + */ + void (*flush_qp_waiters)(struct rvt_qp *qp); + + /* + * Notify driver to stop its queue of sending packets. Nothing else + * should be posted to the queue pair after this has been called. + */ + void (*stop_send_queue)(struct rvt_qp *qp); + + /* + * Have the drivr drain any in progress operations + */ + void (*quiesce_qp)(struct rvt_qp *qp); + + /* + * Inform the driver a qp has went to error state. + */ + void (*notify_error_qp)(struct rvt_qp *qp); + + /* + * Get an MTU for a qp. 
+ */ + u32 (*mtu_from_qp)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + u32 pmtu); + /* + * Convert an mtu to a path mtu + */ + int (*mtu_to_path_mtu)(u32 mtu); + + /* + * Get the guid of a port in big endian byte order + */ + int (*get_guid_be)(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, + int guid_index, __be64 *guid); + + /* + * Query driver for the state of the port. + */ + int (*query_port_state)(struct rvt_dev_info *rdi, u8 port_num, + struct ib_port_attr *props); + + /* + * Tell driver to shutdown a port + */ + int (*shut_down_port)(struct rvt_dev_info *rdi, u8 port_num); + + /* Tell driver to send a trap for changed port capabilities */ + void (*cap_mask_chg)(struct rvt_dev_info *rdi, u8 port_num); + + /* + * The following functions can be safely ignored completely. Any use of + * these is checked for NULL before blindly calling. Rdmavt should also + * be functional if drivers omit these. + */ + + /* Called to inform the driver that all qps should now be freed. */ + unsigned (*free_all_qps)(struct rvt_dev_info *rdi); + + /* Driver specific AH validation */ + int (*check_ah)(struct ib_device *, struct rdma_ah_attr *); + + /* Inform the driver a new AH has been created */ + void (*notify_new_ah)(struct ib_device *, struct rdma_ah_attr *, + struct rvt_ah *); + + /* Let the driver pick the next queue pair number*/ + int (*alloc_qpn)(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, + enum ib_qp_type type, u8 port_num); + + /* Determine if its safe or allowed to modify the qp */ + int (*check_modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + + /* Driver specific QP modification/notification-of */ + void (*modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + + /* Notify driver a mad agent has been created */ + void (*notify_create_mad_agent)(struct rvt_dev_info *rdi, int port_idx); + + /* Notify driver a mad agent has been removed */ + void (*notify_free_mad_agent)(struct rvt_dev_info *rdi, int port_idx); + + /* Notify driver to restart rc */ + void (*notify_restart_rc)(struct rvt_qp *qp, u32 psn, int wait); +}; + +struct rvt_dev_info { + struct ib_device ibdev; /* Keep this first. Nothing above here */ + + /* + * Prior to calling for registration the driver will be responsible for + * allocating space for this structure. + * + * The driver will also be responsible for filling in certain members of + * dparms.props. The driver needs to fill in dparms exactly as it would + * want values reported to a ULP. This will be returned to the caller + * in rdmavt's device. The driver should also therefore refrain from + * modifying this directly after registration with rdmavt. 
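+ *
+ * A condensed sketch of the expected flow (the driver structure, the
+ * callback names and the driver_id value are illustrative only):
+ *
+ *        rdi = rvt_alloc_device(sizeof(struct my_ibdev), nports);
+ *        rdi->dparms.props.max_qp = ...;   // properties reported to ULPs
+ *        rdi->dparms.nports = nports;
+ *        rdi->dparms.npkeys = RVT_MAX_PKEY_VALUES;
+ *        rdi->driver_f.qp_priv_alloc = my_qp_priv_alloc;
+ *        rdi->driver_f.schedule_send = my_schedule_send;
+ *        rvt_init_port(rdi, &my_port, 0, my_pkey_table);
+ *        ret = rvt_register_device(rdi, my_driver_id);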
+ */ + + /* Driver specific properties */ + struct rvt_driver_params dparms; + + /* post send table */ + const struct rvt_operation_params *post_parms; + + /* Driver specific helper functions */ + struct rvt_driver_provided driver_f; + + struct rvt_mregion __rcu *dma_mr; + struct rvt_lkey_table lkey_table; + + /* Internal use */ + int n_pds_allocated; + spinlock_t n_pds_lock; /* Protect pd allocated count */ + + int n_ahs_allocated; + spinlock_t n_ahs_lock; /* Protect ah allocated count */ + + u32 n_srqs_allocated; + spinlock_t n_srqs_lock; /* Protect srqs allocated count */ + + int flags; + struct rvt_ibport **ports; + + /* QP */ + struct rvt_qp_ibdev *qp_dev; + u32 n_qps_allocated; /* number of QPs allocated for device */ + u32 n_rc_qps; /* number of RC QPs allocated for device */ + u32 busy_jiffies; /* timeout scaling based on RC QP count */ + spinlock_t n_qps_lock; /* protect qps, rc qps and busy jiffy counts */ + + /* memory maps */ + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; /* protect mmap_offset */ + u32 mmap_offset; + spinlock_t pending_lock; /* protect pending mmap list */ + + /* CQ */ + struct kthread_worker *worker; /* per device cq worker */ + u32 n_cqs_allocated; /* number of CQs allocated for device */ + spinlock_t n_cqs_lock; /* protect count of in use cqs */ + + /* Multicast */ + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ + spinlock_t n_mcast_grps_lock; + +}; + +/** + * rvt_set_ibdev_name - Craft an IB device name from client info + * @rdi: pointer to the client rvt_dev_info structure + * @name: client specific name + * @unit: client specific unit number. + */ +static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, + const char *fmt, const char *name, + const int unit) +{ + snprintf(rdi->ibdev.name, sizeof(rdi->ibdev.name), fmt, name, unit); +} + +/** + * rvt_get_ibdev_name - return the IB name + * @rdi: rdmavt device + * + * Return the registered name of the device. + */ +static inline const char *rvt_get_ibdev_name(const struct rvt_dev_info *rdi) +{ + return rdi->ibdev.name; +} + +static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct rvt_pd, ibpd); +} + +static inline struct rvt_ah *ibah_to_rvtah(struct ib_ah *ibah) +{ + return container_of(ibah, struct rvt_ah, ibah); +} + +static inline struct rvt_dev_info *ib_to_rvt(struct ib_device *ibdev) +{ + return container_of(ibdev, struct rvt_dev_info, ibdev); +} + +static inline struct rvt_srq *ibsrq_to_rvtsrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct rvt_srq, ibsrq); +} + +static inline struct rvt_qp *ibqp_to_rvtqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct rvt_qp, ibqp); +} + +static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi) +{ + /* + * All ports have same number of pkeys. + */ + return rdi->dparms.npkeys; +} + +/* + * Return the max atomic suitable for determining + * the size of the ack ring buffer in a QP. + */ +static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi) +{ + return rdi->dparms.max_rdma_atomic + 1; +} + +/* + * Return the indexed PKEY from the port PKEY table. 
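+ * An out-of-range index reads back as 0, e.g.:
+ *
+ *        u16 pkey = rvt_get_pkey(rdi, port_index, qp->s_pkey_index);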
+ */ +static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi, + int port_index, + unsigned index) +{ + if (index >= rvt_get_npkeys(rdi)) + return 0; + else + return rdi->ports[port_index]->pkey_table[index]; +} + +/** + * rvt_lookup_qpn - return the QP with the given QPN + * @ibp: the ibport + * @qpn: the QP number to look up + * + * The caller must hold the rcu_read_lock(), and keep the lock until + * the returned qp is no longer in use. + */ +/* TODO: Remove this and put in rdmavt/qp.h when no longer needed by drivers */ +static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, + struct rvt_ibport *rvp, + u32 qpn) __must_hold(RCU) +{ + struct rvt_qp *qp = NULL; + + if (unlikely(qpn <= 1)) { + qp = rcu_dereference(rvp->qp[qpn]); + } else { + u32 n = hash_32(qpn, rdi->qp_dev->qp_table_bits); + + for (qp = rcu_dereference(rdi->qp_dev->qp_table[n]); qp; + qp = rcu_dereference(qp->next)) + if (qp->ibqp.qp_num == qpn) + break; + } + return qp; +} + +/** + * rvt_mod_retry_timer - mod a retry timer + * @qp - the QP + * Modify a potentially already running retry timer + */ +static inline void rvt_mod_retry_timer(struct rvt_qp *qp) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + lockdep_assert_held(&qp->s_lock); + qp->s_flags |= RVT_S_TIMER; + /* 4.096 usec. * (1 << qp->timeout) */ + mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies + + rdi->busy_jiffies); +} + +struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); +void rvt_dealloc_device(struct rvt_dev_info *rdi); +int rvt_register_device(struct rvt_dev_info *rvd, u32 driver_id); +void rvt_unregister_device(struct rvt_dev_info *rvd); +int rvt_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); +int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, + int port_index, u16 *pkey_table); +int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, + int access); +int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey); +int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc); +int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, + struct rvt_sge *isge, struct rvt_sge *last_sge, + struct ib_sge *sge, int acc); +struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid, + u16 lid); + +#endif /* DEF_RDMA_VT_H */ diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h new file mode 100644 index 0000000..51fd00b --- /dev/null +++ b/include/rdma/rdmavt_cq.h @@ -0,0 +1,99 @@ +#ifndef DEF_RDMAVT_INCCQ_H +#define DEF_RDMAVT_INCCQ_H + +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +/* + * Define an ib_cq_notify value that is not valid so we know when CQ + * notifications are armed. + */ +#define RVT_CQ_NONE (IB_CQ_NEXT_COMP + 1) + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct rvt_cq_wc { + u32 head; /* index of next entry to fill */ + u32 tail; /* index of next ib_poll_cq() entry */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; +}; + +/* + * The completion queue structure. + */ +struct rvt_cq { + struct ib_cq ibcq; + struct kthread_work comptask; + spinlock_t lock; /* protect changes in this struct */ + u8 notify; + u8 triggered; + struct rvt_dev_info *rdi; + struct rvt_cq_wc *queue; + struct rvt_mmap_info *ip; +}; + +static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct rvt_cq, ibcq); +} + +void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited); + +#endif /* DEF_RDMAVT_INCCQH */ diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h new file mode 100644 index 0000000..72a3856 --- /dev/null +++ b/include/rdma/rdmavt_mr.h @@ -0,0 +1,197 @@ +#ifndef DEF_RDMAVT_INCMR_H +#define DEF_RDMAVT_INCMR_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * For Memory Regions. This stuff should probably be moved into rdmavt/mr.h once + * drivers no longer need access to the MR directly. + */ +#include + +/* + * A segment is a linear region of low physical memory. + * Used by the verbs layer. + */ +struct rvt_seg { + void *vaddr; + size_t length; +}; + +/* The number of rvt_segs that fit in a page. */ +#define RVT_SEGSZ (PAGE_SIZE / sizeof(struct rvt_seg)) + +struct rvt_segarray { + struct rvt_seg segs[RVT_SEGSZ]; +}; + +struct rvt_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of rvt_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + atomic_t lkey_invalid; /* true if current lkey is invalid */ + u8 page_shift; /* 0 - non unform/non powerof2 sizes */ + u8 lkey_published; /* in global table */ + struct percpu_ref refcount; + struct completion comp; /* complete when refcount goes to zero */ + struct rvt_segarray *map[0]; /* the segments */ +}; + +#define RVT_MAX_LKEY_TABLE_BITS 23 + +struct rvt_lkey_table { + /* read mostly fields */ + u32 max; /* size of the table */ + u32 shift; /* lkey/rkey shift */ + struct rvt_mregion __rcu **table; + /* writeable fields */ + /* protect changes in this struct */ + spinlock_t lock ____cacheline_aligned_in_smp; + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. 
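+ *
+ * A sketch of the usual copy loop over such state (destination handling
+ * and bounds checks elided), using the helpers declared later in this
+ * header:
+ *
+ *        while (length) {
+ *                u32 len = rvt_get_sge_length(&ss->sge, length);
+ *
+ *                memcpy(data, ss->sge.vaddr, len);
+ *                rvt_update_sge(ss, len, false);
+ *                data += len;
+ *                length -= len;
+ *        }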
+ */ +struct rvt_sge { + struct rvt_mregion *mr; + void *vaddr; /* kernel virtual address of segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +struct rvt_sge_state { + struct rvt_sge *sg_list; /* next SGE to be used if any */ + struct rvt_sge sge; /* progress state for the current SGE */ + u32 total_len; + u8 num_sge; +}; + +static inline void rvt_put_mr(struct rvt_mregion *mr) +{ + percpu_ref_put(&mr->refcount); +} + +static inline void rvt_get_mr(struct rvt_mregion *mr) +{ + percpu_ref_get(&mr->refcount); +} + +static inline void rvt_put_ss(struct rvt_sge_state *ss) +{ + while (ss->num_sge) { + rvt_put_mr(ss->sge.mr); + if (--ss->num_sge) + ss->sge = *ss->sg_list++; + } +} + +static inline u32 rvt_get_sge_length(struct rvt_sge *sge, u32 length) +{ + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + + return len; +} + +static inline void rvt_update_sge(struct rvt_sge_state *ss, u32 length, + bool release) +{ + struct rvt_sge *sge = &ss->sge; + + sge->vaddr += length; + sge->length -= length; + sge->sge_length -= length; + if (sge->sge_length == 0) { + if (release) + rvt_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + return; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } +} + +static inline void rvt_skip_sge(struct rvt_sge_state *ss, u32 length, + bool release) +{ + struct rvt_sge *sge = &ss->sge; + + while (length) { + u32 len = rvt_get_sge_length(sge, length); + + WARN_ON_ONCE(len == 0); + rvt_update_sge(ss, len, release); + length -= len; + } +} + +bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey); +bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey); + +#endif /* DEF_RDMAVT_INCMRH */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h new file mode 100644 index 0000000..89ab88c --- /dev/null +++ b/include/rdma/rdmavt_qp.h @@ -0,0 +1,706 @@ +#ifndef DEF_RDMAVT_INCQP_H +#define DEF_RDMAVT_INCQP_H + +/* + * Copyright(c) 2016, 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +/* + * Atomic bit definitions for r_aflags. + */ +#define RVT_R_WRID_VALID 0 +#define RVT_R_REWIND_SGE 1 + +/* + * Bit definitions for r_flags. + */ +#define RVT_R_REUSE_SGE 0x01 +#define RVT_R_RDMAR_SEQ 0x02 +#define RVT_R_RSP_NAK 0x04 +#define RVT_R_RSP_SEND 0x08 +#define RVT_R_COMM_EST 0x10 + +/* + * Bit definitions for s_flags. + * + * RVT_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled + * RVT_S_BUSY - send tasklet is processing the QP + * RVT_S_TIMER - the RC retry timer is active + * RVT_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics + * RVT_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * RVT_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete + * before processing the next SWQE + * RVT_S_WAIT_RNR - waiting for RNR timeout + * RVT_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * RVT_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA + * RVT_S_WAIT_PIO - waiting for a send buffer to be available + * RVT_S_WAIT_PIO_DRAIN - waiting for a qp to drain pio packets + * RVT_S_WAIT_TX - waiting for a struct verbs_txreq to be available + * RVT_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available + * RVT_S_WAIT_KMEM - waiting for kernel memory to be available + * RVT_S_WAIT_PSN - waiting for a packet to exit the send DMA queue + * RVT_S_WAIT_ACK - waiting for an ACK packet before sending more requests + * RVT_S_SEND_ONE - send one packet, request ACK, then wait for ACK + * RVT_S_ECN - a BECN was queued to the send engine + */ +#define RVT_S_SIGNAL_REQ_WR 0x0001 +#define RVT_S_BUSY 0x0002 +#define RVT_S_TIMER 0x0004 +#define RVT_S_RESP_PENDING 0x0008 +#define RVT_S_ACK_PENDING 0x0010 +#define RVT_S_WAIT_FENCE 0x0020 +#define RVT_S_WAIT_RDMAR 0x0040 +#define RVT_S_WAIT_RNR 0x0080 +#define RVT_S_WAIT_SSN_CREDIT 0x0100 +#define RVT_S_WAIT_DMA 0x0200 +#define RVT_S_WAIT_PIO 0x0400 +#define RVT_S_WAIT_PIO_DRAIN 0x0800 +#define RVT_S_WAIT_TX 0x1000 +#define RVT_S_WAIT_DMA_DESC 0x2000 +#define RVT_S_WAIT_KMEM 0x4000 +#define RVT_S_WAIT_PSN 0x8000 +#define RVT_S_WAIT_ACK 0x10000 +#define RVT_S_SEND_ONE 0x20000 +#define RVT_S_UNLIMITED_CREDIT 0x40000 +#define RVT_S_AHG_VALID 0x80000 +#define RVT_S_AHG_CLEAR 0x100000 +#define RVT_S_ECN 0x200000 + +/* + * Wait flags that would prevent any packet type from being sent. 
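+ * A send engine would typically check these before doing any work, e.g.:
+ *
+ *        if (qp->s_flags & RVT_S_ANY_WAIT_IO)
+ *                return;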
+ */ +#define RVT_S_ANY_WAIT_IO \ + (RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN | RVT_S_WAIT_TX | \ + RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM) + +/* + * Wait flags that would prevent send work requests from making progress. + */ +#define RVT_S_ANY_WAIT_SEND (RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | \ + RVT_S_WAIT_RNR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA | \ + RVT_S_WAIT_PSN | RVT_S_WAIT_ACK) + +#define RVT_S_ANY_WAIT (RVT_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) + +/* Number of bits to pay attention to in the opcode for checking qp type */ +#define RVT_OPCODE_QP_MASK 0xE0 + +/* Flags for checking QP state (see ib_rvt_state_ops[]) */ +#define RVT_POST_SEND_OK 0x01 +#define RVT_POST_RECV_OK 0x02 +#define RVT_PROCESS_RECV_OK 0x04 +#define RVT_PROCESS_SEND_OK 0x08 +#define RVT_PROCESS_NEXT_SEND_OK 0x10 +#define RVT_FLUSH_SEND 0x20 +#define RVT_FLUSH_RECV 0x40 +#define RVT_PROCESS_OR_FLUSH_SEND \ + (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND) +#define RVT_SEND_OR_FLUSH_OR_RECV_OK \ + (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND | RVT_PROCESS_RECV_OK) + +/* + * Internal send flags + */ +#define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START +#define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. + */ +struct rvt_swqe { + union { + struct ib_send_wr wr; /* don't use wr.sg_list */ + struct ib_ud_wr ud_wr; + struct ib_reg_wr reg_wr; + struct ib_rdma_wr rdma_wr; + struct ib_atomic_wr atomic_wr; + }; + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct rvt_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct rvt_rwqe { + u64 wr_id; + u8 num_sge; + struct ib_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct rvt_rwq { + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + struct rvt_rwqe wq[0]; +}; + +struct rvt_rq { + struct rvt_rwq *wq; + u32 size; /* size of RWQE array */ + u8 max_sge; + /* protect changes in this struct */ + spinlock_t lock ____cacheline_aligned_in_smp; +}; + +/* + * This structure is used by rvt_mmap() to validate an offset + * when an mmap() request is made. The vm_area_struct then uses + * this as its vm_private_data. + */ +struct rvt_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + void *obj; + __u64 offset; + struct kref ref; + unsigned size; +}; + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. 
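/*
 * Editor's note: illustrative sketch, not part of this patch.  The
 * rvt_get_sge_length()/rvt_update_sge() helpers defined just above in the
 * rdmavt_mr.h hunk are normally combined into a copy loop like the one
 * below when a driver lands payload bytes into the current SGE state.
 * The helper name "example_copy_to_sge" and its arguments are hypothetical.
 */
static void example_copy_to_sge(struct rvt_sge_state *ss, u8 *data,
                                u32 length, bool release)
{
        while (length) {
                u32 len = rvt_get_sge_length(&ss->sge, length);

                memcpy(ss->sge.vaddr, data, len);  /* land the bytes */
                rvt_update_sge(ss, len, release);  /* advance vaddr/m/n */
                data += len;
                length -= len;
        }
}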
+ */ +struct rvt_ack_entry { + struct rvt_sge rdma_sge; + u64 atomic_data; + u32 psn; + u32 lpsn; + u8 opcode; + u8 sent; +}; + +#define RC_QP_SCALING_INTERVAL 5 + +#define RVT_OPERATION_PRIV 0x00000001 +#define RVT_OPERATION_ATOMIC 0x00000002 +#define RVT_OPERATION_ATOMIC_SGE 0x00000004 +#define RVT_OPERATION_LOCAL 0x00000008 +#define RVT_OPERATION_USE_RESERVE 0x00000010 + +#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) + +/** + * rvt_operation_params - op table entry + * @length - the length to copy into the swqe entry + * @qpt_support - a bit mask indicating QP type support + * @flags - RVT_OPERATION flags (see above) + * + * This supports table driven post send so that + * the driver can have differing an potentially + * different sets of operations. + * + **/ + +struct rvt_operation_params { + size_t length; + u32 qpt_support; + u32 flags; +}; + +/* + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. + */ +struct rvt_qp { + struct ib_qp ibqp; + void *priv; /* Driver private data */ + /* read mostly fields above and below */ + struct rdma_ah_attr remote_ah_attr; + struct rdma_ah_attr alt_ah_attr; + struct rvt_qp __rcu *next; /* link list for QPN hash table */ + struct rvt_swqe *s_wq; /* send work queue */ + struct rvt_mmap_info *ip; + + unsigned long timeout_jiffies; /* computed from timeout */ + + int srate_mbps; /* s_srate (below) converted to Mbit/s */ + pid_t pid; /* pid for user mode QPs */ + u32 remote_qpn; + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + + u16 pmtu; /* decoded from path_mtu */ + u8 log_pmtu; /* shift for pmtu */ + u8 state; /* QP state */ + u8 allowed_ops; /* high order bits of allowed opcodes */ + u8 qp_access_flags; + u8 alt_timeout; /* Alternate path timeout for this QP */ + u8 timeout; /* Timeout for this QP */ + u8 s_srate; + u8 s_mig_state; + u8 port_num; + u8 s_pkey_index; /* PKEY index to use */ + u8 s_alt_pkey_index; /* Alternate path PKEY index to use */ + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 s_draining; + + /* start of read/write fields */ + atomic_t refcount ____cacheline_aligned_in_smp; + wait_queue_head_t wait; + + struct rvt_ack_entry *s_ack_queue; + struct rvt_sge_state s_rdma_read_sge; + + spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */ + u32 r_psn; /* expected rcv packet sequence number */ + unsigned long r_aflags; + u64 r_wr_id; /* ID for current receive WQE */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ + u32 r_msn; /* message sequence number */ + + u8 r_state; /* opcode of last packet received */ + u8 r_flags; + u8 r_head_ack_queue; /* index into s_ack_queue[] */ + u8 r_adefered; /* defered ack count */ + + struct list_head rspwait; /* link for waiting to respond */ + + struct rvt_sge_state r_sge; /* current receive data */ + struct rvt_rq r_rq; /* receive work queue */ + + /* post send line */ + spinlock_t s_hlock ____cacheline_aligned_in_smp; + u32 s_head; /* new entries added here */ + u32 s_next_psn; /* PSN for next request */ + u32 s_avail; /* number of entries avail */ + u32 s_ssn; /* SSN of tail entry */ + 
atomic_t s_reserved_used; /* reserved entries in use */ + + spinlock_t s_lock ____cacheline_aligned_in_smp; + u32 s_flags; + struct rvt_sge_state *s_cur_sge; + struct rvt_swqe *s_wqe; + struct rvt_sge_state s_sge; /* current send request data */ + struct rvt_mregion *s_rdma_mr; + u32 s_len; /* total length of s_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_sending_psn; /* lowest PSN that is being sent */ + u32 s_sending_hpsn; /* highest PSN that is being sent */ + u32 s_psn; /* current packet sequence number */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_acked; /* last un-ACK'ed entry */ + u32 s_last; /* last completed entry */ + u32 s_lsn; /* limit sequence number (credit) */ + u32 s_ahgpsn; /* set to the psn in the copy of the header */ + u16 s_cur_size; /* size of send packet in bytes */ + u16 s_rdma_ack_cnt; + u8 s_hdrwords; /* size of s_hdr in 32 bit words */ + s8 s_ahgidx; + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is pending */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + + struct rvt_sge_state s_ack_rdma_sge; + struct timer_list s_timer; + struct hrtimer s_rnr_timer; + + atomic_t local_ops_pending; /* number of fast_reg/local_inv reqs */ + + /* + * This sge list MUST be last. Do not add anything below here. + */ + struct rvt_sge r_sg_list[0] /* verified SGEs */ + ____cacheline_aligned_in_smp; +}; + +struct rvt_srq { + struct ib_srq ibsrq; + struct rvt_rq rq; + struct rvt_mmap_info *ip; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +#define RVT_QPN_MAX BIT(24) +#define RVT_QPNMAP_ENTRIES (RVT_QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) +#define RVT_BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) +#define RVT_BITS_PER_PAGE_MASK (RVT_BITS_PER_PAGE - 1) +#define RVT_QPN_MASK IB_QPN_MASK + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct rvt_qpn_map { + void *page; +}; + +struct rvt_qpn_table { + spinlock_t lock; /* protect changes to the qp table */ + unsigned flags; /* flags for QP0/1 allocated for each port */ + u32 last; /* last QP number allocated */ + u32 nmaps; /* size of the map table */ + u16 limit; + u8 incr; + /* bit map of free QP numbers other than 0/1 */ + struct rvt_qpn_map map[RVT_QPNMAP_ENTRIES]; +}; + +struct rvt_qp_ibdev { + u32 qp_table_size; + u32 qp_table_bits; + struct rvt_qp __rcu **qp_table; + spinlock_t qpt_lock; /* qptable lock */ + struct rvt_qpn_table qpn_table; +}; + +/* + * There is one struct rvt_mcast for each multicast GID. + * All attached QPs are then stored as a list of + * struct rvt_mcast_qp. 
+ */ +struct rvt_mcast_qp { + struct list_head list; + struct rvt_qp *qp; +}; + +struct rvt_mcast_addr { + union ib_gid mgid; + u16 lid; +}; + +struct rvt_mcast { + struct rb_node rb_node; + struct rvt_mcast_addr mcast_addr; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; + int n_attached; +}; + +/* + * Since struct rvt_swqe is not a fixed size, we can't simply index into + * struct rvt_qp.s_wq. This function does the array index computation. + */ +static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp, + unsigned n) +{ + return (struct rvt_swqe *)((char *)qp->s_wq + + (sizeof(struct rvt_swqe) + + qp->s_max_sge * + sizeof(struct rvt_sge)) * n); +} + +/* + * Since struct rvt_rwqe is not a fixed size, we can't simply index into + * struct rvt_rwq.wq. This function does the array index computation. + */ +static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) +{ + return (struct rvt_rwqe *) + ((char *)rq->wq->wq + + (sizeof(struct rvt_rwqe) + + rq->max_sge * sizeof(struct ib_sge)) * n); +} + +/** + * rvt_is_user_qp - return if this is user mode QP + * @qp - the target QP + */ +static inline bool rvt_is_user_qp(struct rvt_qp *qp) +{ + return !!qp->pid; +} + +/** + * rvt_get_qp - get a QP reference + * @qp - the QP to hold + */ +static inline void rvt_get_qp(struct rvt_qp *qp) +{ + atomic_inc(&qp->refcount); +} + +/** + * rvt_put_qp - release a QP reference + * @qp - the QP to release + */ +static inline void rvt_put_qp(struct rvt_qp *qp) +{ + if (qp && atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +/** + * rvt_put_swqe - drop mr refs held by swqe + * @wqe - the send wqe + * + * This drops any mr references held by the swqe + */ +static inline void rvt_put_swqe(struct rvt_swqe *wqe) +{ + int i; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct rvt_sge *sge = &wqe->sg_list[i]; + + rvt_put_mr(sge->mr); + } +} + +/** + * rvt_qp_wqe_reserve - reserve operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This routine used in post send to record + * a wqe relative reserved operation use. + */ +static inline void rvt_qp_wqe_reserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + atomic_inc(&qp->s_reserved_used); +} + +/** + * rvt_qp_wqe_unreserve - clean reserved operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This decrements the reserve use count. + * + * This call MUST precede the change to + * s_last to insure that post send sees a stable + * s_avail. + * + * An smp_mp__after_atomic() is used to insure + * the compiler does not juggle the order of the s_last + * ring index and the decrementing of s_reserved_used. + */ +static inline void rvt_qp_wqe_unreserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) { + atomic_dec(&qp->s_reserved_used); + /* insure no compiler re-order up to s_last change */ + smp_mb__after_atomic(); + } +} + +extern const enum ib_wc_opcode ib_rvt_wc_opcode[]; + +/** + * rvt_qp_swqe_complete() - insert send completion + * @qp - the qp + * @wqe - the send wqe + * @status - completion status + * + * Insert a send completion into the completion + * queue if the qp indicates it should be done. + * + * See IBTA 10.7.3.1 for info on completion + * control. 
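/*
 * Editor's note: illustrative sketch, not part of this patch.  Because
 * struct rvt_swqe is variable sized, ring entries are always fetched with
 * rvt_get_swqe_ptr().  A driver retiring the entry at s_last (typically
 * with qp->s_lock held) drops the MR references and clears any reserved
 * operation accounting before advancing s_last, per the ordering rule in
 * the rvt_qp_wqe_unreserve() comment above.  "example_retire_swqe" is a
 * hypothetical helper.
 */
static void example_retire_swqe(struct rvt_qp *qp)
{
        struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);

        rvt_put_swqe(wqe);              /* drop MR refs held by the SGEs */
        rvt_qp_wqe_unreserve(qp, wqe);  /* no-op unless RVT_SEND_RESERVE_USED */
        if (++qp->s_last >= qp->s_size)
                qp->s_last = 0;
}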
+ */ +static inline void rvt_qp_swqe_complete( + struct rvt_qp *qp, + struct rvt_swqe *wqe, + enum ib_wc_opcode opcode, + enum ib_wc_status status) +{ + if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) + return; + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; + + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = opcode; + wc.qp = &qp->ibqp; + wc.byte_len = wqe->length; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); + } +} + +/* + * Compare the lower 24 bits of the msn values. + * Returns an integer <, ==, or > than zero. + */ +static inline int rvt_cmp_msn(u32 a, u32 b) +{ + return (((int)a) - ((int)b)) << 8; +} + +/** + * rvt_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 rvt_compute_aeth(struct rvt_qp *qp); + +/** + * rvt_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void rvt_get_credit(struct rvt_qp *qp, u32 aeth); + +/** + * @qp - the qp pair + * @len - the length + * + * Perform a shift based mtu round up divide + */ +static inline u32 rvt_div_round_up_mtu(struct rvt_qp *qp, u32 len) +{ + return (len + qp->pmtu - 1) >> qp->log_pmtu; +} + +/** + * @qp - the qp pair + * @len - the length + * + * Perform a shift based mtu divide + */ +static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len) +{ + return len >> qp->log_pmtu; +} + +/** + * rvt_timeout_to_jiffies - Convert a ULP timeout input into jiffies + * @timeout - timeout input(0 - 31). + * + * Return a timeout value in jiffies. + */ +static inline unsigned long rvt_timeout_to_jiffies(u8 timeout) +{ + if (timeout > 31) + timeout = 31; + + return usecs_to_jiffies(1U << timeout) * 4096UL / 1000UL; +} + +extern const int ib_rvt_state_ops[]; + +struct rvt_dev_info; +void rvt_comm_est(struct rvt_qp *qp); +int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); +void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err); +unsigned long rvt_rnr_tbl_to_usec(u32 index); +enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t); +void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth); +void rvt_del_timers_sync(struct rvt_qp *qp); +void rvt_stop_rc_timers(struct rvt_qp *qp); +void rvt_add_retry_timer(struct rvt_qp *qp); + +/** + * struct rvt_qp_iter - the iterator for QPs + * @qp - the current QP + * + * This structure defines the current iterator + * state for sequenced access to all QPs relative + * to an rvt_dev_info. 
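/*
 * Editor's note: illustrative sketch, not part of this patch.  The MTU and
 * timeout helpers above are shift based, so a requester can size and time a
 * request cheaply.  "example_req_params" is a hypothetical helper.
 */
static unsigned long example_req_params(struct rvt_qp *qp,
                                        struct rvt_swqe *wqe, u32 *npkts)
{
        *npkts = rvt_div_round_up_mtu(qp, wqe->length); /* packets for this WQE */
        /* retry timeout, roughly 4.096us * 2^timeout expressed in jiffies */
        return rvt_timeout_to_jiffies(qp->timeout);
}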
+ */ +struct rvt_qp_iter { + struct rvt_qp *qp; + /* private: backpointer */ + struct rvt_dev_info *rdi; + /* private: callback routine */ + void (*cb)(struct rvt_qp *qp, u64 v); + /* private: for arg to callback routine */ + u64 v; + /* private: number of SMI,GSI QPs for device */ + int specials; + /* private: current iterator index */ + int n; +}; + +struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)); +int rvt_qp_iter_next(struct rvt_qp_iter *iter); +void rvt_qp_iter(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)); +void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey); +#endif /* DEF_RDMAVT_INCQP_H */ diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h new file mode 100644 index 0000000..f3b3e35 --- /dev/null +++ b/include/rdma/restrack.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. + */ + +#ifndef _RDMA_RESTRACK_H_ +#define _RDMA_RESTRACK_H_ + +#include +#include +#include +#include +#include +#include + +/** + * enum rdma_restrack_type - HW objects to track + */ +enum rdma_restrack_type { + /** + * @RDMA_RESTRACK_PD: Protection domain (PD) + */ + RDMA_RESTRACK_PD, + /** + * @RDMA_RESTRACK_CQ: Completion queue (CQ) + */ + RDMA_RESTRACK_CQ, + /** + * @RDMA_RESTRACK_QP: Queue pair (QP) + */ + RDMA_RESTRACK_QP, + /** + * @RDMA_RESTRACK_CM_ID: Connection Manager ID (CM_ID) + */ + RDMA_RESTRACK_CM_ID, + /** + * @RDMA_RESTRACK_MR: Memory Region (MR) + */ + RDMA_RESTRACK_MR, + /** + * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations + */ + RDMA_RESTRACK_MAX +}; + +#define RDMA_RESTRACK_HASH_BITS 8 +/** + * struct rdma_restrack_root - main resource tracking management + * entity, per-device + */ +struct rdma_restrack_root { + /* + * @rwsem: Read/write lock to protect lists + */ + struct rw_semaphore rwsem; + /** + * @hash: global database for all resources per-device + */ + DECLARE_HASHTABLE(hash, RDMA_RESTRACK_HASH_BITS); +}; + +/** + * struct rdma_restrack_entry - metadata per-entry + */ +struct rdma_restrack_entry { + /** + * @valid: validity indicator + * + * The entries are filled during rdma_restrack_add, + * can be attempted to be free during rdma_restrack_del. + * + * As an example for that, see mlx5 QPs with type MLX5_IB_QPT_HW_GSI + */ + bool valid; + /* + * @kref: Protect destroy of the resource + */ + struct kref kref; + /* + * @comp: Signal that all consumers of resource are completed their work + */ + struct completion comp; + /** + * @task: owner of resource tracking entity + * + * There are two types of entities: created by user and created + * by kernel. + * + * This is relevant for the entities created by users. + * For the entities created by kernel, this pointer will be NULL. + */ + struct task_struct *task; + /** + * @kern_name: name of owner for the kernel created entities. 
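/*
 * Editor's note: illustrative sketch, not part of this patch.  It refers to
 * the QP iterator declared at the end of the rdmavt_qp.h hunk above, whose
 * simplest form is the callback-driven rvt_qp_iter().  The callback and
 * "rdi" are placeholders.
 */
static void example_dump_qp(struct rvt_qp *qp, u64 v)
{
        pr_info("qpn 0x%x state %d\n", qp->ibqp.qp_num, qp->state);
}

static void example_dump_all_qps(struct rvt_dev_info *rdi)
{
        rvt_qp_iter(rdi, 0, example_dump_qp);   /* walks every QP of the device */
}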
+ */ + const char *kern_name; + /** + * @node: hash table entry + */ + struct hlist_node node; + /** + * @type: various objects in restrack database + */ + enum rdma_restrack_type type; +}; + +/** + * rdma_restrack_init() - initialize resource tracking + * @res: resource tracking root + */ +void rdma_restrack_init(struct rdma_restrack_root *res); + +/** + * rdma_restrack_clean() - clean resource tracking + * @res: resource tracking root + */ +void rdma_restrack_clean(struct rdma_restrack_root *res); + +/** + * rdma_restrack_count() - the current usage of specific object + * @res: resource entry + * @type: actual type of object to operate + * @ns: PID namespace + */ +int rdma_restrack_count(struct rdma_restrack_root *res, + enum rdma_restrack_type type, + struct pid_namespace *ns); + +/** + * rdma_restrack_add() - add object to the reource tracking database + * @res: resource entry + */ +void rdma_restrack_add(struct rdma_restrack_entry *res); + +/** + * rdma_restrack_del() - delete object from the reource tracking database + * @res: resource entry + * @type: actual type of object to operate + */ +void rdma_restrack_del(struct rdma_restrack_entry *res); + +/** + * rdma_is_kernel_res() - check the owner of resource + * @res: resource entry + */ +static inline bool rdma_is_kernel_res(struct rdma_restrack_entry *res) +{ + return !res->task; +} + +/** + * rdma_restrack_get() - grab to protect resource from release + * @res: resource entry + */ +int __must_check rdma_restrack_get(struct rdma_restrack_entry *res); + +/** + * rdma_restrack_put() - release resource + * @res: resource entry + */ +int rdma_restrack_put(struct rdma_restrack_entry *res); + +/** + * rdma_restrack_set_task() - set the task for this resource + * @res: resource entry + * @task: task struct + */ +static inline void rdma_restrack_set_task(struct rdma_restrack_entry *res, + struct task_struct *task) +{ + if (res->task) + put_task_struct(res->task); + get_task_struct(task); + res->task = task; +} + +#endif /* _RDMA_RESTRACK_H_ */ diff --git a/include/rdma/rw.h b/include/rdma/rw.h new file mode 100644 index 0000000..a3cbbc7 --- /dev/null +++ b/include/rdma/rw.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
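/*
 * Editor's note: illustrative sketch, not part of this patch.  A typical
 * life cycle for a kernel-owned resource under the restrack API declared
 * above, roughly what the core PD allocation path does.  Leaving res->task
 * NULL and filling kern_name is what makes rdma_is_kernel_res() true.
 */
static void example_track_kernel_pd(struct ib_pd *pd)
{
        pd->res.type = RDMA_RESTRACK_PD;
        pd->res.kern_name = KBUILD_MODNAME;     /* owner name; task stays NULL */
        rdma_restrack_add(&pd->res);
}

/* on teardown: rdma_restrack_del(&pd->res) waits for rdma_restrack_get() users */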
+ */ +#ifndef _RDMA_RW_H +#define _RDMA_RW_H + +#include +#include +#include +#include +#include + +struct rdma_rw_ctx { + /* number of RDMA READ/WRITE WRs (not counting MR WRs) */ + u32 nr_ops; + + /* tag for the union below: */ + u8 type; + + union { + /* for mapping a single SGE: */ + struct { + struct ib_sge sge; + struct ib_rdma_wr wr; + } single; + + /* for mapping of multiple SGEs: */ + struct { + struct ib_sge *sges; + struct ib_rdma_wr *wrs; + } map; + + /* for registering multiple WRs: */ + struct rdma_rw_reg_ctx { + struct ib_sge sge; + struct ib_rdma_wr wr; + struct ib_reg_wr reg_wr; + struct ib_send_wr inv_wr; + struct ib_mr *mr; + } *reg; + + struct { + struct rdma_rw_reg_ctx data; + struct rdma_rw_reg_ctx prot; + struct ib_send_wr sig_inv_wr; + struct ib_mr *sig_mr; + struct ib_sge sig_sge; + struct ib_sig_handover_wr sig_wr; + } *sig; + }; +}; + +int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir); +void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, + enum dma_data_direction dir); + +int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + struct ib_sig_attrs *sig_attrs, u64 remote_addr, u32 rkey, + enum dma_data_direction dir); +void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + enum dma_data_direction dir); + +struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr); +int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr); + +unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num, + unsigned int maxpages); +void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr); +int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr); +void rdma_rw_cleanup_mrs(struct ib_qp *qp); + +#endif /* _RDMA_RW_H */ diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h new file mode 100644 index 0000000..095383a --- /dev/null +++ b/include/rdma/uverbs_ioctl.h @@ -0,0 +1,558 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
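/*
 * Editor's note: illustrative sketch, not part of this patch.  Typical use
 * of the rdma_rw API declared in the rw.h hunk above, as seen from a
 * consumer that reads a remote buffer into a local scatterlist.  Names such
 * as "ctx", "raddr", "rkey" and "done_cqe" are placeholders.
 */
static int example_rdma_read(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                             u8 port_num, struct scatterlist *sg, u32 sg_cnt,
                             u64 raddr, u32 rkey, struct ib_cqe *done_cqe)
{
        int ret;

        /* returns the number of WRs needed on success, negative errno on error */
        ret = rdma_rw_ctx_init(ctx, qp, port_num, sg, sg_cnt, 0,
                               raddr, rkey, DMA_FROM_DEVICE);
        if (ret < 0)
                return ret;

        /* posts the READ WRs (plus MR WRs if needed); done_cqe fires on completion */
        ret = rdma_rw_ctx_post(ctx, qp, port_num, done_cqe, NULL);
        if (ret)
                rdma_rw_ctx_destroy(ctx, qp, port_num, sg, sg_cnt,
                                    DMA_FROM_DEVICE);
        return ret;
}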
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UVERBS_IOCTL_ +#define _UVERBS_IOCTL_ + +#include +#include +#include +#include +#include + +/* + * ======================================= + * Verbs action specifications + * ======================================= + */ + +enum uverbs_attr_type { + UVERBS_ATTR_TYPE_NA, + UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_TYPE_PTR_OUT, + UVERBS_ATTR_TYPE_IDR, + UVERBS_ATTR_TYPE_FD, + UVERBS_ATTR_TYPE_ENUM_IN, +}; + +enum uverbs_obj_access { + UVERBS_ACCESS_READ, + UVERBS_ACCESS_WRITE, + UVERBS_ACCESS_NEW, + UVERBS_ACCESS_DESTROY +}; + +enum { + UVERBS_ATTR_SPEC_F_MANDATORY = 1U << 0, + /* Support extending attributes by length, validate all unknown size == zero */ + UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO = 1U << 1, +}; + +/* Specification of a single attribute inside the ioctl message */ +struct uverbs_attr_spec { + union { + /* Header shared by all following union members - to reduce space. */ + struct { + enum uverbs_attr_type type; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; + }; + struct { + enum uverbs_attr_type type; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; + /* Current known size to kernel */ + u16 len; + /* User isn't allowed to provide something < min_len */ + u16 min_len; + } ptr; + struct { + enum uverbs_attr_type type; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. + */ + u16 obj_type; + u8 access; + } obj; + struct { + enum uverbs_attr_type type; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; + u8 num_elems; + /* + * The enum attribute can select one of the attributes + * contained in the ids array. Currently only PTR_IN + * attributes are supported in the ids array. + */ + const struct uverbs_attr_spec *ids; + } enum_def; + }; +}; + +struct uverbs_attr_spec_hash { + size_t num_attrs; + unsigned long *mandatory_attrs_bitmask; + struct uverbs_attr_spec attrs[0]; +}; + +struct uverbs_attr_bundle; +struct ib_uverbs_file; + +enum { + /* + * Action marked with this flag creates a context (or root for all + * objects). 
+ */ + UVERBS_ACTION_FLAG_CREATE_ROOT = 1U << 0, +}; + +struct uverbs_method_spec { + /* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */ + u32 flags; + size_t num_buckets; + size_t num_child_attrs; + int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *ctx); + struct uverbs_attr_spec_hash *attr_buckets[0]; +}; + +struct uverbs_method_spec_hash { + size_t num_methods; + struct uverbs_method_spec *methods[0]; +}; + +struct uverbs_object_spec { + const struct uverbs_obj_type *type_attrs; + size_t num_buckets; + struct uverbs_method_spec_hash *method_buckets[0]; +}; + +struct uverbs_object_spec_hash { + size_t num_objects; + struct uverbs_object_spec *objects[0]; +}; + +struct uverbs_root_spec { + size_t num_buckets; + struct uverbs_object_spec_hash *object_buckets[0]; +}; + +/* + * ======================================= + * Verbs definitions + * ======================================= + */ + +struct uverbs_attr_def { + u16 id; + struct uverbs_attr_spec attr; +}; + +struct uverbs_method_def { + u16 id; + /* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */ + u32 flags; + size_t num_attrs; + const struct uverbs_attr_def * const (*attrs)[]; + int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *ctx); +}; + +struct uverbs_object_def { + u16 id; + const struct uverbs_obj_type *type_attrs; + size_t num_methods; + const struct uverbs_method_def * const (*methods)[]; +}; + +struct uverbs_object_tree_def { + size_t num_objects; + const struct uverbs_object_def * const (*objects)[]; +}; + +#define UA_FLAGS(_flags) .flags = _flags +#define __UVERBS_ATTR0(_id, _type, _fld, _attr, ...) \ + ((const struct uverbs_attr_def) \ + {.id = _id, .attr = {{._fld = {.type = _type, _attr, .flags = 0, } }, } }) +#define __UVERBS_ATTR1(_id, _type, _fld, _attr, _extra1, ...) \ + ((const struct uverbs_attr_def) \ + {.id = _id, .attr = {{._fld = {.type = _type, _attr, _extra1 } },} }) +#define __UVERBS_ATTR2(_id, _type, _fld, _attr, _extra1, _extra2) \ + ((const struct uverbs_attr_def) \ + {.id = _id, .attr = {{._fld = {.type = _type, _attr, _extra1, _extra2 } },} }) +#define __UVERBS_ATTR(_id, _type, _fld, _attr, _extra1, _extra2, _n, ...) \ + __UVERBS_ATTR##_n(_id, _type, _fld, _attr, _extra1, _extra2) + +#define UVERBS_ATTR_TYPE(_type) \ + .min_len = sizeof(_type), .len = sizeof(_type) +#define UVERBS_ATTR_STRUCT(_type, _last) \ + .min_len = ((uintptr_t)(&((_type *)0)->_last + 1)), .len = sizeof(_type) +#define UVERBS_ATTR_SIZE(_min_len, _len) \ + .min_len = _min_len, .len = _len + +/* + * In new compiler, UVERBS_ATTR could be simplified by declaring it as + * [_id] = {.type = _type, .len = _len, ##__VA_ARGS__} + * But since we support older compilers too, we need the more complex code. + */ +#define UVERBS_ATTR(_id, _type, _fld, _attr, ...) \ + __UVERBS_ATTR(_id, _type, _fld, _attr, ##__VA_ARGS__, 2, 1, 0) +#define UVERBS_ATTR_PTR_IN_SZ(_id, _len, ...) \ + UVERBS_ATTR(_id, UVERBS_ATTR_TYPE_PTR_IN, ptr, _len, ##__VA_ARGS__) +/* If sizeof(_type) <= sizeof(u64), this will be inlined rather than a pointer */ +#define UVERBS_ATTR_PTR_IN(_id, _type, ...) \ + UVERBS_ATTR_PTR_IN_SZ(_id, _type, ##__VA_ARGS__) +#define UVERBS_ATTR_PTR_OUT_SZ(_id, _len, ...) \ + UVERBS_ATTR(_id, UVERBS_ATTR_TYPE_PTR_OUT, ptr, _len, ##__VA_ARGS__) +#define UVERBS_ATTR_PTR_OUT(_id, _type, ...) \ + UVERBS_ATTR_PTR_OUT_SZ(_id, _type, ##__VA_ARGS__) +#define UVERBS_ATTR_ENUM_IN(_id, _enum_arr, ...) 
\ + UVERBS_ATTR(_id, UVERBS_ATTR_TYPE_ENUM_IN, enum_def, \ + .ids = (_enum_arr), \ + .num_elems = ARRAY_SIZE(_enum_arr), ##__VA_ARGS__) + +/* + * In new compiler, UVERBS_ATTR_IDR (and FD) could be simplified by declaring + * it as + * {.id = _id, \ + * .attr {.type = __obj_class, \ + * .obj = {.obj_type = _idr_type, \ + * .access = _access \ + * }, ##__VA_ARGS__ } } + * But since we support older compilers too, we need the more complex code. + */ +#define ___UVERBS_ATTR_OBJ0(_id, _obj_class, _obj_type, _access, ...)\ + ((const struct uverbs_attr_def) \ + {.id = _id, \ + .attr = { {.obj = {.type = _obj_class, .obj_type = _obj_type, \ + .access = _access, .flags = 0 } }, } }) +#define ___UVERBS_ATTR_OBJ1(_id, _obj_class, _obj_type, _access, _flags)\ + ((const struct uverbs_attr_def) \ + {.id = _id, \ + .attr = { {.obj = {.type = _obj_class, .obj_type = _obj_type, \ + .access = _access, _flags} }, } }) +#define ___UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, _flags, \ + _n, ...) \ + ___UVERBS_ATTR_OBJ##_n(_id, _obj_class, _obj_type, _access, _flags) +#define __UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, ...) \ + ___UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, \ + ##__VA_ARGS__, 1, 0) +#define UVERBS_ATTR_IDR(_id, _idr_type, _access, ...) \ + __UVERBS_ATTR_OBJ(_id, UVERBS_ATTR_TYPE_IDR, _idr_type, _access,\ + ##__VA_ARGS__) +#define UVERBS_ATTR_FD(_id, _fd_type, _access, ...) \ + __UVERBS_ATTR_OBJ(_id, UVERBS_ATTR_TYPE_FD, _fd_type, \ + (_access) + BUILD_BUG_ON_ZERO( \ + (_access) != UVERBS_ACCESS_NEW && \ + (_access) != UVERBS_ACCESS_READ), \ + ##__VA_ARGS__) +#define DECLARE_UVERBS_ATTR_SPEC(_name, ...) \ + const struct uverbs_attr_def _name = __VA_ARGS__ + +#define DECLARE_UVERBS_ENUM(_name, ...) \ + const struct uverbs_enum_spec _name = { \ + .len = ARRAY_SIZE(((struct uverbs_attr_spec[]){__VA_ARGS__})),\ + .ids = {__VA_ARGS__}, \ + } +#define _UVERBS_METHOD_ATTRS_SZ(...) \ + (sizeof((const struct uverbs_attr_def * const []){__VA_ARGS__}) /\ + sizeof(const struct uverbs_attr_def *)) +#define _UVERBS_METHOD(_id, _handler, _flags, ...) \ + ((const struct uverbs_method_def) { \ + .id = _id, \ + .flags = _flags, \ + .handler = _handler, \ + .num_attrs = _UVERBS_METHOD_ATTRS_SZ(__VA_ARGS__), \ + .attrs = &(const struct uverbs_attr_def * const []){__VA_ARGS__} }) +#define DECLARE_UVERBS_METHOD(_name, _id, _handler, ...) \ + const struct uverbs_method_def _name = \ + _UVERBS_METHOD(_id, _handler, 0, ##__VA_ARGS__) +#define DECLARE_UVERBS_CTX_METHOD(_name, _id, _handler, _flags, ...) \ + const struct uverbs_method_def _name = \ + _UVERBS_METHOD(_id, _handler, \ + UVERBS_ACTION_FLAG_CREATE_ROOT, \ + ##__VA_ARGS__) +#define _UVERBS_OBJECT_METHODS_SZ(...) \ + (sizeof((const struct uverbs_method_def * const []){__VA_ARGS__}) / \ + sizeof(const struct uverbs_method_def *)) +#define _UVERBS_OBJECT(_id, _type_attrs, ...) \ + ((const struct uverbs_object_def) { \ + .id = _id, \ + .type_attrs = _type_attrs, \ + .num_methods = _UVERBS_OBJECT_METHODS_SZ(__VA_ARGS__), \ + .methods = &(const struct uverbs_method_def * const []){__VA_ARGS__} }) +#define DECLARE_UVERBS_OBJECT(_name, _id, _type_attrs, ...) \ + const struct uverbs_object_def _name = \ + _UVERBS_OBJECT(_id, _type_attrs, ##__VA_ARGS__) +#define _UVERBS_TREE_OBJECTS_SZ(...) \ + (sizeof((const struct uverbs_object_def * const []){__VA_ARGS__}) / \ + sizeof(const struct uverbs_object_def *)) +#define _UVERBS_OBJECT_TREE(...) 
\ + ((const struct uverbs_object_tree_def) { \ + .num_objects = _UVERBS_TREE_OBJECTS_SZ(__VA_ARGS__), \ + .objects = &(const struct uverbs_object_def * const []){__VA_ARGS__} }) +#define DECLARE_UVERBS_OBJECT_TREE(_name, ...) \ + const struct uverbs_object_tree_def _name = \ + _UVERBS_OBJECT_TREE(__VA_ARGS__) + +/* ================================================= + * Parsing infrastructure + * ================================================= + */ + +struct uverbs_ptr_attr { + u64 data; + u16 len; + /* Combination of bits from enum UVERBS_ATTR_F_XXXX */ + u16 flags; + u8 enum_id; +}; + +struct uverbs_obj_attr { + /* pointer to the kernel descriptor -> type, access, etc */ + const struct uverbs_obj_type *type; + struct ib_uobject *uobject; + /* fd or id in idr of this object */ + int id; +}; + +struct uverbs_attr { + /* + * pointer to the user-space given attribute, in order to write the + * new uobject's id or update flags. + */ + struct ib_uverbs_attr __user *uattr; + union { + struct uverbs_ptr_attr ptr_attr; + struct uverbs_obj_attr obj_attr; + }; +}; + +struct uverbs_attr_bundle_hash { + /* if bit i is set, it means attrs[i] contains valid information */ + unsigned long *valid_bitmap; + size_t num_attrs; + /* + * arrays of attributes, each element corresponds to the specification + * of the attribute in the same index. + */ + struct uverbs_attr *attrs; +}; + +struct uverbs_attr_bundle { + size_t num_buckets; + struct uverbs_attr_bundle_hash hash[]; +}; + +static inline bool uverbs_attr_is_valid_in_hash(const struct uverbs_attr_bundle_hash *attrs_hash, + unsigned int idx) +{ + return test_bit(idx, attrs_hash->valid_bitmap); +} + +static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_bundle, + unsigned int idx) +{ + u16 idx_bucket = idx >> UVERBS_ID_NS_SHIFT; + + if (attrs_bundle->num_buckets <= idx_bucket) + return false; + + return uverbs_attr_is_valid_in_hash(&attrs_bundle->hash[idx_bucket], + idx & ~UVERBS_ID_NS_MASK); +} + +#define IS_UVERBS_COPY_ERR(_ret) ((_ret) && (_ret) != -ENOENT) + +static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle, + u16 idx) +{ + u16 idx_bucket = idx >> UVERBS_ID_NS_SHIFT; + + if (!uverbs_attr_is_valid(attrs_bundle, idx)) + return ERR_PTR(-ENOENT); + + return &attrs_bundle->hash[idx_bucket].attrs[idx & ~UVERBS_ID_NS_MASK]; +} + +static inline int uverbs_attr_get_enum_id(const struct uverbs_attr_bundle *attrs_bundle, + u16 idx) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + return attr->ptr_attr.enum_id; +} + +static inline void *uverbs_attr_get_obj(const struct uverbs_attr_bundle *attrs_bundle, + u16 idx) +{ + const struct uverbs_attr *attr; + + attr = uverbs_attr_get(attrs_bundle, idx); + if (IS_ERR(attr)) + return ERR_CAST(attr); + + return attr->obj_attr.uobject->object; +} + +static inline int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, const void *from, size_t size) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + u16 flags; + size_t min_size; + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + min_size = min_t(size_t, attr->ptr_attr.len, size); + if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), from, min_size)) + return -EFAULT; + + flags = attr->ptr_attr.flags | UVERBS_ATTR_F_VALID_OUTPUT; + if (put_user(flags, &attr->uattr->flags)) + return -EFAULT; + + return 0; +} + +static inline bool uverbs_attr_ptr_is_inline(const struct 
uverbs_attr *attr) +{ + return attr->ptr_attr.len <= sizeof(attr->ptr_attr.data); +} + +static inline int _uverbs_copy_from(void *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, + size_t size) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + /* + * Validation ensures attr->ptr_attr.len >= size. If the caller is + * using UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO then it must call + * uverbs_copy_from_or_zero. + */ + if (unlikely(size < attr->ptr_attr.len)) + return -EINVAL; + + if (uverbs_attr_ptr_is_inline(attr)) + memcpy(to, &attr->ptr_attr.data, attr->ptr_attr.len); + else if (copy_from_user(to, u64_to_user_ptr(attr->ptr_attr.data), + attr->ptr_attr.len)) + return -EFAULT; + + return 0; +} + +static inline int _uverbs_copy_from_or_zero(void *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, + size_t size) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + size_t min_size; + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + min_size = min_t(size_t, size, attr->ptr_attr.len); + + if (uverbs_attr_ptr_is_inline(attr)) + memcpy(to, &attr->ptr_attr.data, min_size); + else if (copy_from_user(to, u64_to_user_ptr(attr->ptr_attr.data), + min_size)) + return -EFAULT; + + if (size > min_size) + memset(to + min_size, 0, size - min_size); + + return 0; +} + +#define uverbs_copy_from(to, attrs_bundle, idx) \ + _uverbs_copy_from(to, attrs_bundle, idx, sizeof(*to)) + +#define uverbs_copy_from_or_zero(to, attrs_bundle, idx) \ + _uverbs_copy_from_or_zero(to, attrs_bundle, idx, sizeof(*to)) + +/* ================================================= + * Definitions -> Specs infrastructure + * ================================================= + */ + +/* + * uverbs_alloc_spec_tree - Merges different common and driver specific feature + * into one parsing tree that every uverbs command will be parsed upon. + * + * @num_trees: Number of trees in the array @trees. + * @trees: Array of pointers to tree root definitions to merge. Each such tree + * possibly contains objects, methods and attributes definitions. + * + * Returns: + * uverbs_root_spec *: The root of the merged parsing tree. + * On error, we return an error code. Error is checked via IS_ERR. + * + * The following merges could take place: + * a. Two trees representing the same method with different handler + * -> We take the handler of the tree that its handler != NULL + * and its index in the trees array is greater. The incentive for that + * is that developers are expected to first merge common trees and then + * merge trees that gives specialized the behaviour. + * b. Two trees representing the same object with different + * type_attrs (struct uverbs_obj_type): + * -> We take the type_attrs of the tree that its type_attr != NULL + * and its index in the trees array is greater. This could be used + * in order to override the free function, allocation size, etc. + * c. Two trees representing the same method attribute (same id but possibly + * different attributes): + * -> ERROR (-ENOENT), we believe that's not the programmer's intent. + * + * An object without any methods is considered invalid and will abort the + * function with -ENOENT error. 
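/*
 * Editor's note: illustrative sketch, not part of this patch.  A method
 * handler pulling attributes out of the bundle with the accessors above.
 * The EXAMPLE_ATTR_* ids are hypothetical; real ids come from the uapi
 * headers, and the destination sizes must match the attribute specs.
 */
static int example_handler(struct ib_device *ib_dev,
                           struct ib_uverbs_file *file,
                           struct uverbs_attr_bundle *attrs)
{
        struct ib_cq *cq;
        u64 reply;
        u32 user_val;
        int ret;

        /* object attribute: kernel object pointer or ERR_PTR() */
        cq = uverbs_attr_get_obj(attrs, EXAMPLE_ATTR_CQ_HANDLE);
        if (IS_ERR(cq))
                return PTR_ERR(cq);

        /* PTR_IN attribute: sizeof(user_val) must cover the spec's length */
        ret = uverbs_copy_from(&user_val, attrs, EXAMPLE_ATTR_IN_VAL);
        if (ret)
                return ret;

        reply = user_val + cq->cqe;     /* arbitrary computation for the sketch */

        /* PTR_OUT attribute: -ENOENT only means the caller did not ask for it */
        ret = uverbs_copy_to(attrs, EXAMPLE_ATTR_OUT_VAL, &reply, sizeof(reply));
        return IS_UVERBS_COPY_ERR(ret) ? ret : 0;
}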
+ */ +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees); +void uverbs_free_spec_tree(struct uverbs_root_spec *root); +#else +static inline struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees) +{ + return NULL; +} + +static inline void uverbs_free_spec_tree(struct uverbs_root_spec *root) +{ +} +#endif + +#endif diff --git a/include/rdma/uverbs_named_ioctl.h b/include/rdma/uverbs_named_ioctl.h new file mode 100644 index 0000000..c5bb4eb --- /dev/null +++ b/include/rdma/uverbs_named_ioctl.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UVERBS_NAMED_IOCTL_ +#define _UVERBS_NAMED_IOCTL_ + +#include + +#ifndef UVERBS_MODULE_NAME +#error "Please #define UVERBS_MODULE_NAME before including rdma/uverbs_named_ioctl.h" +#endif + +#define _UVERBS_PASTE(x, y) x ## y +#define _UVERBS_NAME(x, y) _UVERBS_PASTE(x, y) +#define UVERBS_METHOD(id) _UVERBS_NAME(UVERBS_MODULE_NAME, _method_##id) +#define UVERBS_HANDLER(id) _UVERBS_NAME(UVERBS_MODULE_NAME, _handler_##id) + +#define DECLARE_UVERBS_NAMED_METHOD(id, ...) \ + DECLARE_UVERBS_METHOD(UVERBS_METHOD(id), id, UVERBS_HANDLER(id), ##__VA_ARGS__) + +#define DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(id, handler, ...) \ + DECLARE_UVERBS_METHOD(UVERBS_METHOD(id), id, handler, ##__VA_ARGS__) + +#define DECLARE_UVERBS_NAMED_METHOD_NO_OVERRIDE(id, handler, ...) \ + DECLARE_UVERBS_METHOD(UVERBS_METHOD(id), id, NULL, ##__VA_ARGS__) + +#define DECLARE_UVERBS_NAMED_OBJECT(id, ...) \ + DECLARE_UVERBS_OBJECT(UVERBS_OBJECT(id), id, ##__VA_ARGS__) + +#define _UVERBS_COMP_NAME(x, y, z) _UVERBS_NAME(_UVERBS_NAME(x, y), z) + +#define UVERBS_NO_OVERRIDE NULL + +/* This declares a parsing tree with one object and one method. This is usually + * used for merging driver attributes to the common attributes. The driver has + * a chance to override the handler and type attrs of the original object. + * The __VA_ARGS__ just contains a list of attributes. 
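/*
 * Editor's note: illustrative sketch, not part of this patch.  A minimal
 * driver-side method declared with the named-ioctl helpers above.  The
 * module name "example_ib" and the EXAMPLE_* ids are hypothetical; real
 * ids come from the driver's uapi header.
 */
#define UVERBS_MODULE_NAME example_ib
#include <rdma/uverbs_named_ioctl.h>

static int UVERBS_HANDLER(EXAMPLE_METHOD_QUERY)(struct ib_device *ib_dev,
                                                struct ib_uverbs_file *file,
                                                struct uverbs_attr_bundle *attrs)
{
        u32 val = 42;   /* whatever the method reports */

        return uverbs_copy_to(attrs, EXAMPLE_ATTR_RESP, &val, sizeof(val));
}

static DECLARE_UVERBS_NAMED_METHOD(EXAMPLE_METHOD_QUERY,
        &UVERBS_ATTR_PTR_OUT(EXAMPLE_ATTR_RESP, UVERBS_ATTR_TYPE(u32),
                             UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)));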
+ */ +#define ADD_UVERBS_ATTRIBUTES(_name, _object, _method, _type_attrs, _handler, ...) \ +static DECLARE_UVERBS_METHOD(_UVERBS_COMP_NAME(UVERBS_MODULE_NAME, \ + _method_, _name), \ + _method, _handler, ##__VA_ARGS__); \ + \ +static DECLARE_UVERBS_OBJECT(_UVERBS_COMP_NAME(UVERBS_MODULE_NAME, \ + _object_, _name), \ + _object, _type_attrs, \ + &_UVERBS_COMP_NAME(UVERBS_MODULE_NAME, \ + _method_, _name)); \ + \ +static DECLARE_UVERBS_OBJECT_TREE(_name, \ + &_UVERBS_COMP_NAME(UVERBS_MODULE_NAME, \ + _object_, _name)) + +/* A very common use case is that the driver doesn't override the handler and + * type_attrs. Therefore, we provide a simplified macro for this common case. + */ +#define ADD_UVERBS_ATTRIBUTES_SIMPLE(_name, _object, _method, ...) \ + ADD_UVERBS_ATTRIBUTES(_name, _object, _method, UVERBS_NO_OVERRIDE, \ + UVERBS_NO_OVERRIDE, ##__VA_ARGS__) + +#endif diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h new file mode 100644 index 0000000..9d56cdb --- /dev/null +++ b/include/rdma/uverbs_std_types.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UVERBS_STD_TYPES__ +#define _UVERBS_STD_TYPES__ + +#include +#include +#include + +#define UVERBS_OBJECT(id) uverbs_object_##id + +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +const struct uverbs_object_tree_def *uverbs_default_get_objects(void); +#else +static inline const struct uverbs_object_tree_def *uverbs_default_get_objects(void) +{ + return NULL; +} +#endif + +static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, + bool write, + struct ib_ucontext *ucontext, + int id) +{ + return rdma_lookup_get_uobject(type, ucontext, id, write); +} + +#define uobj_get_type(_object) UVERBS_OBJECT(_object).type_attrs + +#define uobj_get_read(_type, _id, _ucontext) \ + __uobj_get(uobj_get_type(_type), false, _ucontext, _id) + +#define uobj_get_obj_read(_object, _type, _id, _ucontext) \ +({ \ + struct ib_uobject *__uobj = \ + __uobj_get(uobj_get_type(_type), \ + false, _ucontext, _id); \ + \ + (struct ib_##_object *)(IS_ERR(__uobj) ? 
NULL : __uobj->object);\ +}) + +#define uobj_get_write(_type, _id, _ucontext) \ + __uobj_get(uobj_get_type(_type), true, _ucontext, _id) + +static inline void uobj_put_read(struct ib_uobject *uobj) +{ + rdma_lookup_put_uobject(uobj, false); +} + +#define uobj_put_obj_read(_obj) \ + uobj_put_read((_obj)->uobject) + +static inline void uobj_put_write(struct ib_uobject *uobj) +{ + rdma_lookup_put_uobject(uobj, true); +} + +static inline int __must_check uobj_remove_commit(struct ib_uobject *uobj) +{ + return rdma_remove_commit_uobject(uobj); +} + +static inline void uobj_alloc_commit(struct ib_uobject *uobj) +{ + rdma_alloc_commit_uobject(uobj); +} + +static inline void uobj_alloc_abort(struct ib_uobject *uobj) +{ + rdma_alloc_abort_uobject(uobj); +} + +static inline struct ib_uobject *__uobj_alloc(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext) +{ + return rdma_alloc_begin_uobject(type, ucontext); +} + +#define uobj_alloc(_type, ucontext) \ + __uobj_alloc(uobj_get_type(_type), ucontext) + +#endif + diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h new file mode 100644 index 0000000..cc04ec6 --- /dev/null +++ b/include/rdma/uverbs_types.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UVERBS_TYPES_ +#define _UVERBS_TYPES_ + +#include +#include + +struct uverbs_obj_type; + +struct uverbs_obj_type_class { + /* + * Get an ib_uobject that corresponds to the given id from ucontext, + * These functions could create or destroy objects if required. + * The action will be finalized only when commit, abort or put fops are + * called. + * The flow of the different actions is: + * [alloc]: Starts with alloc_begin. The handlers logic is than + * executed. If the handler is successful, alloc_commit + * is called and the object is inserted to the repository. + * Once alloc_commit completes the object is visible to + * other threads and userspace. + e Otherwise, alloc_abort is called and the object is + * destroyed. + * [lookup]: Starts with lookup_get which fetches and locks the + * object. After the handler finished using the object, it + * needs to call lookup_put to unlock it. 
The exclusive + * flag indicates if the object is locked for exclusive + * access. + * [remove]: Starts with lookup_get with exclusive flag set. This + * locks the object for exclusive access. If the handler + * code completed successfully, remove_commit is called + * and the ib_uobject is removed from the context's + * uobjects repository and put. The object itself is + * destroyed as well. Once remove succeeds new krefs to + * the object cannot be acquired by other threads or + * userspace and the hardware driver is removed from the + * object. Other krefs on the object may still exist. + * If the handler code failed, lookup_put should be + * called. This callback is used when the context + * is destroyed as well (process termination, + * reset flow). + */ + struct ib_uobject *(*alloc_begin)(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext); + void (*alloc_commit)(struct ib_uobject *uobj); + void (*alloc_abort)(struct ib_uobject *uobj); + + struct ib_uobject *(*lookup_get)(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext, int id, + bool exclusive); + void (*lookup_put)(struct ib_uobject *uobj, bool exclusive); + /* + * Must be called with the exclusive lock held. If successful uobj is + * invalid on return. On failure uobject is left completely + * unchanged + */ + int __must_check (*remove_commit)(struct ib_uobject *uobj, + enum rdma_remove_reason why); + u8 needs_kfree_rcu; +}; + +struct uverbs_obj_type { + const struct uverbs_obj_type_class * const type_class; + size_t obj_size; + unsigned int destroy_order; +}; + +/* + * Objects type classes which support a detach state (object is still alive but + * it's not attached to any context need to make sure: + * (a) no call through to a driver after a detach is called + * (b) detach isn't called concurrently with context_cleanup + */ + +struct uverbs_obj_idr_type { + /* + * In idr based objects, uverbs_obj_type_class points to a generic + * idr operations. In order to specialize the underlying types (e.g. CQ, + * QPs, etc.), we add destroy_object specific callbacks. + */ + struct uverbs_obj_type type; + + /* Free driver resources from the uobject, make the driver uncallable, + * and move the uobject to the detached state. If the object was + * destroyed by the user's request, a failure should leave the uobject + * completely unchanged. + */ + int __must_check (*destroy_object)(struct ib_uobject *uobj, + enum rdma_remove_reason why); +}; + +struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext, + int id, bool exclusive); +void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool exclusive); +struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext); +void rdma_alloc_abort_uobject(struct ib_uobject *uobj); +int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj); +int rdma_alloc_commit_uobject(struct ib_uobject *uobj); +int rdma_explicit_destroy(struct ib_uobject *uobject); + +struct uverbs_obj_fd_type { + /* + * In fd based objects, uverbs_obj_type_ops points to generic + * fd operations. In order to specialize the underlying types (e.g. + * completion_channel), we use fops, name and flags for fd creation. + * context_closed is called when the context is closed either when + * the driver is removed or the process terminated. 
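/*
 * Editor's note: illustrative sketch, not part of this patch.  The usual
 * lookup pattern in a uverbs command handler, using the uobj_* helpers from
 * the uverbs_std_types.h hunk above.  "qp_handle" and "ucontext" are
 * placeholders; UVERBS_OBJECT_QP is the standard QP object id defined
 * elsewhere in the tree.
 */
static int example_use_qp(struct ib_ucontext *ucontext, u32 qp_handle)
{
        struct ib_qp *qp;

        /* shared (read) lock; the macro maps lookup errors to NULL */
        qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, qp_handle, ucontext);
        if (!qp)
                return -EINVAL;

        /* ... use the QP while holding the read reference ... */

        uobj_put_obj_read(qp);
        return 0;
}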
+ */ + struct uverbs_obj_type type; + int (*context_closed)(struct ib_uobject_file *uobj_file, + enum rdma_remove_reason why); + const struct file_operations *fops; + const char *name; + int flags; +}; + +extern const struct uverbs_obj_type_class uverbs_idr_class; +extern const struct uverbs_obj_type_class uverbs_fd_class; + +#define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ + sizeof(char)) +#define UVERBS_TYPE_ALLOC_FD(_order, _obj_size, _context_closed, _fops, _name, _flags)\ + ((&((const struct uverbs_obj_fd_type) \ + {.type = { \ + .destroy_order = _order, \ + .type_class = &uverbs_fd_class, \ + .obj_size = (_obj_size) + \ + UVERBS_BUILD_BUG_ON((_obj_size) < sizeof(struct ib_uobject_file)), \ + }, \ + .context_closed = _context_closed, \ + .fops = _fops, \ + .name = _name, \ + .flags = _flags}))->type) +#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _order, _destroy_object) \ + ((&((const struct uverbs_obj_idr_type) \ + {.type = { \ + .destroy_order = _order, \ + .type_class = &uverbs_idr_class, \ + .obj_size = (_size) + \ + UVERBS_BUILD_BUG_ON((_size) < \ + sizeof(struct ib_uobject)) \ + }, \ + .destroy_object = _destroy_object,}))->type) +#define UVERBS_TYPE_ALLOC_IDR(_order, _destroy_object) \ + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject), _order, \ + _destroy_object) + +#endif diff --git a/include/scsi/iser.h b/include/scsi/iser.h new file mode 100644 index 0000000..2e678fa --- /dev/null +++ b/include/scsi/iser.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ISCSI_ISER_H +#define ISCSI_ISER_H + +#define ISER_ZBVA_NOT_SUP 0x80 +#define ISER_SEND_W_INV_NOT_SUP 0x40 +#define ISERT_ZBVA_NOT_USED 0x80 +#define ISERT_SEND_W_INV_NOT_USED 0x40 + +#define ISCSI_CTRL 0x10 +#define ISER_HELLO 0x20 +#define ISER_HELLORPLY 0x30 + +#define ISER_VER 0x10 +#define ISER_WSV 0x08 +#define ISER_RSV 0x04 + +/** + * struct iser_cm_hdr - iSER CM header (from iSER Annex A12) + * + * @flags: flags support (zbva, send_w_inv) + * @rsvd: reserved + */ +struct iser_cm_hdr { + u8 flags; + u8 rsvd[3]; +} __packed; + +/** + * struct iser_ctrl - iSER header of iSCSI control PDU + * + * @flags: opcode and read/write valid bits + * @rsvd: reserved + * @write_stag: write rkey + * @write_va: write virtual address + * @reaf_stag: read rkey + * @read_va: read virtual address + */ +struct iser_ctrl { + u8 flags; + u8 rsvd[3]; + __be32 write_stag; + __be64 write_va; + __be32 read_stag; + __be64 read_va; +} __packed; + +#endif /* ISCSI_ISER_H */ diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h new file mode 100644 index 0000000..d22df12 --- /dev/null +++ b/include/scsi/scsi_transport_srp.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef SCSI_TRANSPORT_SRP_H +#define SCSI_TRANSPORT_SRP_H + +#include +#include +#include + +#define SRP_RPORT_ROLE_INITIATOR 0 +#define SRP_RPORT_ROLE_TARGET 1 + +struct srp_rport_identifiers { + u8 port_id[16]; + u8 roles; +}; + +/** + * enum srp_rport_state - SRP transport layer state + * @SRP_RPORT_RUNNING: Transport layer operational. + * @SRP_RPORT_BLOCKED: Transport layer not operational; fast I/O fail timer + * is running and I/O has been blocked. + * @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast. + * @SRP_RPORT_LOST: Port is being removed. + */ +enum srp_rport_state { + SRP_RPORT_RUNNING, + SRP_RPORT_BLOCKED, + SRP_RPORT_FAIL_FAST, + SRP_RPORT_LOST, +}; + +/** + * struct srp_rport - SRP initiator or target port + * + * Fields that are relevant for SRP initiator and SRP target drivers: + * @dev: Device associated with this rport. + * @port_id: 16-byte port identifier. + * @roles: Role of this port - initiator or target. + * + * Fields that are only relevant for SRP initiator drivers: + * @lld_data: LLD private data. + * @mutex: Protects against concurrent rport reconnect / + * fast_io_fail / dev_loss_tmo activity. + * @state: rport state. + * @reconnect_delay: Reconnect delay in seconds. + * @failed_reconnects: Number of failed reconnect attempts. + * @reconnect_work: Work structure used for scheduling reconnect attempts. + * @fast_io_fail_tmo: Fast I/O fail timeout in seconds. + * @dev_loss_tmo: Device loss timeout in seconds. + * @fast_io_fail_work: Work structure used for scheduling fast I/O fail work. + * @dev_loss_work: Work structure used for scheduling device loss work. + */ +struct srp_rport { + /* for initiator and target drivers */ + + struct device dev; + + u8 port_id[16]; + u8 roles; + + /* for initiator drivers */ + + void *lld_data; + + struct mutex mutex; + enum srp_rport_state state; + int reconnect_delay; + int failed_reconnects; + struct delayed_work reconnect_work; + int fast_io_fail_tmo; + int dev_loss_tmo; + struct delayed_work fast_io_fail_work; + struct delayed_work dev_loss_work; +}; + +/** + * struct srp_function_template + * + * Fields that are only relevant for SRP initiator drivers: + * @has_rport_state: Whether or not to create the state, fast_io_fail_tmo and + * dev_loss_tmo sysfs attribute for an rport. 
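The iser_ctrl structure defined above prefixes every iSCSI control PDU on the wire. Below is a hedged sketch of filling it for a data-out (SCSI WRITE) command, advertising the initiator's registered buffer through the write STag/VA, loosely modeled on what an in-tree iSER initiator does; the rkey and address are placeholders normally taken from a registered MR.

#include <linux/kernel.h>
#include <linux/string.h>
#include <scsi/iser.h>

static void iser_fill_write_hdr(struct iser_ctrl *hdr, u32 rkey, u64 addr)
{
	memset(hdr, 0, sizeof(*hdr));
	hdr->flags  = ISER_VER;			/* iSER header version */
	hdr->flags |= ISER_WSV;			/* write STag/VA fields are valid */
	hdr->write_stag = cpu_to_be32(rkey);	/* rkey of the data-out buffer */
	hdr->write_va   = cpu_to_be64(addr);	/* its virtual address */
}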
+ * @reset_timer_if_blocked: Whether or srp_timed_out() should reset the command + * timer if the device on which it has been queued is blocked. + * @reconnect_delay: If not NULL, points to the default reconnect_delay value. + * @fast_io_fail_tmo: If not NULL, points to the default fast_io_fail_tmo value. + * @dev_loss_tmo: If not NULL, points to the default dev_loss_tmo value. + * @reconnect: Callback function for reconnecting to the target. See also + * srp_reconnect_rport(). + * @terminate_rport_io: Callback function for terminating all outstanding I/O + * requests for an rport. + * @rport_delete: Callback function that deletes an rport. + */ +struct srp_function_template { + /* for initiator drivers */ + bool has_rport_state; + bool reset_timer_if_blocked; + int *reconnect_delay; + int *fast_io_fail_tmo; + int *dev_loss_tmo; + int (*reconnect)(struct srp_rport *rport); + void (*terminate_rport_io)(struct srp_rport *rport); + void (*rport_delete)(struct srp_rport *rport); +}; + +extern struct scsi_transport_template * +srp_attach_transport(struct srp_function_template *); +extern void srp_release_transport(struct scsi_transport_template *); + +extern void srp_rport_get(struct srp_rport *rport); +extern void srp_rport_put(struct srp_rport *rport); +extern struct srp_rport *srp_rport_add(struct Scsi_Host *, + struct srp_rport_identifiers *); +extern void srp_rport_del(struct srp_rport *); +extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, + long dev_loss_tmo); +int srp_parse_tmo(int *tmo, const char *buf); +extern int srp_reconnect_rport(struct srp_rport *rport); +extern void srp_start_tl_fail_timers(struct srp_rport *rport); +extern void srp_remove_host(struct Scsi_Host *); +extern void srp_stop_rport_timers(struct srp_rport *rport); +enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd); + +/** + * srp_chkready() - evaluate the transport layer state before I/O + * @rport: SRP target port pointer. + * + * Returns a SCSI result code that can be returned by the LLD queuecommand() + * implementation. The role of this function is similar to that of + * fc_remote_port_chkready(). + */ +static inline int srp_chkready(struct srp_rport *rport) +{ + switch (rport->state) { + case SRP_RPORT_RUNNING: + case SRP_RPORT_BLOCKED: + default: + return 0; + case SRP_RPORT_FAIL_FAST: + return DID_TRANSPORT_FAILFAST << 16; + case SRP_RPORT_LOST: + return DID_NO_CONNECT << 16; + } +} + +#endif diff --git a/include/scsi/srp.h b/include/scsi/srp.h new file mode 100644 index 0000000..c16a3c9 --- /dev/null +++ b/include/scsi/srp.h @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
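An SRP initiator LLD uses the transport class declared above by registering a struct srp_function_template at load time and gating I/O on the rport state. Below is a hedged sketch under the assumption that the driver's reconnect/terminate logic and SCSI host template exist elsewhere; all my_* names are illustrative.

#include <linux/module.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_transport_srp.h>

static int my_rport_reconnect(struct srp_rport *rport)
{
	return 0;	/* re-establish the RDMA connection here */
}

static void my_rport_terminate_io(struct srp_rport *rport)
{
	/* fail all requests still outstanding on this rport */
}

static struct srp_function_template my_srp_ft = {
	.has_rport_state	= true,
	.reconnect		= my_rport_reconnect,
	.terminate_rport_io	= my_rport_terminate_io,
};

static struct scsi_transport_template *my_srp_tt;

static int __init my_srp_init(void)
{
	my_srp_tt = srp_attach_transport(&my_srp_ft);
	return my_srp_tt ? 0 : -ENOMEM;
}

/* in queuecommand(), srp_chkready() maps the rport state to a SCSI result */
static int my_queuecommand_gate(struct srp_rport *rport,
				struct scsi_cmnd *scmnd)
{
	int result = srp_chkready(rport);

	if (result) {
		scmnd->result = result;
		scmnd->scsi_done(scmnd);
		return 0;	/* completed with a transport error */
	}
	return 1;	/* caller may build and post the SRP_CMD IU */
}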
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id$ + */ + +#ifndef SCSI_SRP_H +#define SCSI_SRP_H + +/* + * Structures and constants for the SCSI RDMA Protocol (SRP) as + * defined by the INCITS T10 committee. This file was written using + * draft Revision 16a of the SRP standard. + */ + +#include +#include + +enum { + SRP_LOGIN_REQ = 0x00, + SRP_TSK_MGMT = 0x01, + SRP_CMD = 0x02, + SRP_I_LOGOUT = 0x03, + SRP_LOGIN_RSP = 0xc0, + SRP_RSP = 0xc1, + SRP_LOGIN_REJ = 0xc2, + SRP_T_LOGOUT = 0x80, + SRP_CRED_REQ = 0x81, + SRP_AER_REQ = 0x82, + SRP_CRED_RSP = 0x41, + SRP_AER_RSP = 0x42 +}; + +enum { + SRP_BUF_FORMAT_DIRECT = 1 << 1, + SRP_BUF_FORMAT_INDIRECT = 1 << 2 +}; + +enum { + SRP_NO_DATA_DESC = 0, + SRP_DATA_DESC_DIRECT = 1, + SRP_DATA_DESC_INDIRECT = 2 +}; + +enum { + SRP_TSK_ABORT_TASK = 0x01, + SRP_TSK_ABORT_TASK_SET = 0x02, + SRP_TSK_CLEAR_TASK_SET = 0x04, + SRP_TSK_LUN_RESET = 0x08, + SRP_TSK_CLEAR_ACA = 0x40 +}; + +enum srp_login_rej_reason { + SRP_LOGIN_REJ_UNABLE_ESTABLISH_CHANNEL = 0x00010000, + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES = 0x00010001, + SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE = 0x00010002, + SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL = 0x00010003, + SRP_LOGIN_REJ_UNSUPPORTED_DESCRIPTOR_FMT = 0x00010004, + SRP_LOGIN_REJ_MULTI_CHANNEL_UNSUPPORTED = 0x00010005, + SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED = 0x00010006 +}; + +enum { + SRP_REV10_IB_IO_CLASS = 0xff00, + SRP_REV16A_IB_IO_CLASS = 0x0100 +}; + +struct srp_direct_buf { + __be64 va; + __be32 key; + __be32 len; +}; + +/* + * We need the packed attribute because the SRP spec puts the list of + * descriptors at an offset of 20, which is not aligned to the size of + * struct srp_direct_buf. The whole structure must be packed to avoid + * having the 20-byte structure padded to 24 bytes on 64-bit architectures. + */ +struct srp_indirect_buf { + struct srp_direct_buf table_desc; + __be32 len; + struct srp_direct_buf desc_list[0]; +} __attribute__((packed)); + +enum { + SRP_MULTICHAN_SINGLE = 0, + SRP_MULTICHAN_MULTI = 1 +}; + +struct srp_login_req { + u8 opcode; + u8 reserved1[7]; + u64 tag; + __be32 req_it_iu_len; + u8 reserved2[4]; + __be16 req_buf_fmt; + u8 req_flags; + u8 reserved3[5]; + u8 initiator_port_id[16]; + u8 target_port_id[16]; +}; + +/** + * struct srp_login_req_rdma - RDMA/CM login parameters. + * + * RDMA/CM over InfiniBand can only carry 92 - 36 = 56 bytes of private + * data. The %srp_login_req_rdma structure contains the same information as + * %srp_login_req but with the reserved data removed. + */ +struct srp_login_req_rdma { + u64 tag; + __be16 req_buf_fmt; + u8 req_flags; + u8 opcode; + __be32 req_it_iu_len; + u8 initiator_port_id[16]; + u8 target_port_id[16]; +}; + +/* + * The SRP spec defines the size of the LOGIN_RSP structure to be 52 + * bytes, so it needs to be packed to avoid having it padded to 56 + * bytes on 64-bit architectures. 
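The packing comments above pin down exact wire sizes, which is easy to assert at build time. A hedged sketch of compile-time checks matching the layout described so far (direct descriptor size, indirect descriptor list offset, LOGIN_REQ size, and the 56-byte RDMA/CM private-data budget); the helper exists only to host the assertions.

#include <linux/build_bug.h>
#include <linux/stddef.h>
#include <scsi/srp.h>

static inline void srp_check_wire_layout(void)
{
	/* a direct descriptor is 16 bytes; the indirect list starts at 20 */
	BUILD_BUG_ON(sizeof(struct srp_direct_buf) != 16);
	BUILD_BUG_ON(offsetof(struct srp_indirect_buf, desc_list) != 20);

	/* the fixed part of LOGIN_REQ is 64 bytes with no implicit padding */
	BUILD_BUG_ON(sizeof(struct srp_login_req) != 64);

	/* the RDMA/CM variant must fit in 56 bytes of private data */
	BUILD_BUG_ON(sizeof(struct srp_login_req_rdma) > 56);
}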
+ */ +struct srp_login_rsp { + u8 opcode; + u8 reserved1[3]; + __be32 req_lim_delta; + u64 tag; + __be32 max_it_iu_len; + __be32 max_ti_iu_len; + __be16 buf_fmt; + u8 rsp_flags; + u8 reserved2[25]; +} __attribute__((packed)); + +struct srp_login_rej { + u8 opcode; + u8 reserved1[3]; + __be32 reason; + u64 tag; + u8 reserved2[8]; + __be16 buf_fmt; + u8 reserved3[6]; +}; + +struct srp_i_logout { + u8 opcode; + u8 reserved[7]; + u64 tag; +}; + +struct srp_t_logout { + u8 opcode; + u8 sol_not; + u8 reserved[2]; + __be32 reason; + u64 tag; +}; + +/* + * We need the packed attribute because the SRP spec only aligns the + * 8-byte LUN field to 4 bytes. + */ +struct srp_tsk_mgmt { + u8 opcode; + u8 sol_not; + u8 reserved1[6]; + u64 tag; + u8 reserved2[4]; + struct scsi_lun lun; + u8 reserved3[2]; + u8 tsk_mgmt_func; + u8 reserved4; + u64 task_tag; + u8 reserved5[8]; +}; + +/* + * We need the packed attribute because the SRP spec only aligns the + * 8-byte LUN field to 4 bytes. + */ +struct srp_cmd { + u8 opcode; + u8 sol_not; + u8 reserved1[3]; + u8 buf_fmt; + u8 data_out_desc_cnt; + u8 data_in_desc_cnt; + u64 tag; + u8 reserved2[4]; + struct scsi_lun lun; + u8 reserved3; + u8 task_attr; + u8 reserved4; + u8 add_cdb_len; + u8 cdb[16]; + u8 add_data[0]; +}; + +enum { + SRP_RSP_FLAG_RSPVALID = 1 << 0, + SRP_RSP_FLAG_SNSVALID = 1 << 1, + SRP_RSP_FLAG_DOOVER = 1 << 2, + SRP_RSP_FLAG_DOUNDER = 1 << 3, + SRP_RSP_FLAG_DIOVER = 1 << 4, + SRP_RSP_FLAG_DIUNDER = 1 << 5 +}; + +/* + * The SRP spec defines the size of the RSP structure to be 36 bytes, + * so it needs to be packed to avoid having it padded to 40 bytes on + * 64-bit architectures. + */ +struct srp_rsp { + u8 opcode; + u8 sol_not; + u8 reserved1[2]; + __be32 req_lim_delta; + u64 tag; + u8 reserved2[2]; + u8 flags; + u8 status; + __be32 data_out_res_cnt; + __be32 data_in_res_cnt; + __be32 sense_data_len; + __be32 resp_data_len; + u8 data[0]; +} __attribute__((packed)); + +struct srp_cred_req { + u8 opcode; + u8 sol_not; + u8 reserved[2]; + __be32 req_lim_delta; + u64 tag; +}; + +struct srp_cred_rsp { + u8 opcode; + u8 reserved[7]; + u64 tag; +}; + +/* + * The SRP spec defines the fixed portion of the AER_REQ structure to be + * 36 bytes, so it needs to be packed to avoid having it padded to 40 bytes + * on 64-bit architectures. + */ +struct srp_aer_req { + u8 opcode; + u8 sol_not; + u8 reserved[2]; + __be32 req_lim_delta; + u64 tag; + u32 reserved2; + struct scsi_lun lun; + __be32 sense_data_len; + u32 reserved3; + u8 sense_data[0]; +} __attribute__((packed)); + +struct srp_aer_rsp { + u8 opcode; + u8 reserved[7]; + u64 tag; +}; + +#endif /* SCSI_SRP_H */ diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h new file mode 100644 index 0000000..6e74b1e --- /dev/null +++ b/include/uapi/linux/nvme_ioctl.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Definitions for the NVM Express ioctl interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
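Putting the CMD IU pieces together: below is a hedged sketch of filling a fixed-size SRP_CMD with a 16-byte CDB and one DIRECT data-out descriptor appended in add_data[], roughly following what an SRP initiator does. The tag, rkey and DMA address are placeholders, and the caller is assumed to supply an IU buffer large enough for the trailing descriptor.

#include <linux/kernel.h>
#include <linux/string.h>
#include <scsi/srp.h>

static void srp_build_write_cmd(struct srp_cmd *cmd, u64 tag, const u8 *cdb,
				u64 dma_addr, u32 rkey, u32 len)
{
	struct srp_direct_buf *desc = (struct srp_direct_buf *)cmd->add_data;

	memset(cmd, 0, sizeof(*cmd) + sizeof(*desc));
	cmd->opcode = SRP_CMD;
	cmd->tag = tag;
	/* cmd->lun would carry the SAM LUN encoding (e.g. int_to_scsilun()) */
	memcpy(cmd->cdb, cdb, 16);	/* add_cdb_len stays 0 for <= 16 bytes */

	cmd->buf_fmt = SRP_DATA_DESC_DIRECT << 4;	/* upper nibble: data-out */
	cmd->data_out_desc_cnt = 1;
	desc->va  = cpu_to_be64(dma_addr);
	desc->key = cpu_to_be32(rkey);
	desc->len = cpu_to_be32(len);
}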
+ */ + +#ifndef _UAPI_LINUX_NVME_IOCTL_H +#define _UAPI_LINUX_NVME_IOCTL_H + +#include + +struct nvme_user_io { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +}; + +struct nvme_passthru_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 result; +}; + +#define nvme_admin_cmd nvme_passthru_cmd + +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) +#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) +#define NVME_IOCTL_RESET _IO('N', 0x44) +#define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) +#define NVME_IOCTL_RESCAN _IO('N', 0x46) + +#endif /* _UAPI_LINUX_NVME_IOCTL_H */ diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h new file mode 100644 index 0000000..20c6bd0 --- /dev/null +++ b/include/uapi/linux/rds.h @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2008 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _LINUX_RDS_H +#define _LINUX_RDS_H + +#include +#include /* For __kernel_sockaddr_storage. 
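The passthrough structures above map almost one to one onto NVMe submission-queue entries. A hedged userspace sketch issuing an Identify Controller admin command through NVME_IOCTL_ADMIN_CMD; the device path is a placeholder, and the opcode/CNS values come from the NVMe specification rather than from this header.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	struct nvme_admin_cmd cmd;
	void *data = calloc(1, 4096);
	int fd, ret;

	fd = open("/dev/nvme0", O_RDWR);
	if (fd < 0 || !data)
		return 1;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode   = 0x06;			/* Identify */
	cmd.addr     = (uintptr_t)data;		/* user buffer passed as a u64 */
	cmd.data_len = 4096;
	cmd.cdw10    = 1;			/* CNS=1: Identify Controller */

	ret = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
	printf("identify: ret=%d result=%#x\n", ret, cmd.result);

	free(data);
	close(fd);
	return ret ? 1 : 0;
}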
*/ + +#define RDS_IB_ABI_VERSION 0x301 + +#define SOL_RDS 276 + +/* + * setsockopt/getsockopt for SOL_RDS + */ +#define RDS_CANCEL_SENT_TO 1 +#define RDS_GET_MR 2 +#define RDS_FREE_MR 3 +/* deprecated: RDS_BARRIER 4 */ +#define RDS_RECVERR 5 +#define RDS_CONG_MONITOR 6 +#define RDS_GET_MR_FOR_DEST 7 +#define SO_RDS_TRANSPORT 8 + +/* Socket option to tap receive path latency + * SO_RDS: SO_RDS_MSG_RXPATH_LATENCY + * Format used struct rds_rx_trace_so + */ +#define SO_RDS_MSG_RXPATH_LATENCY 10 + + +/* supported values for SO_RDS_TRANSPORT */ +#define RDS_TRANS_IB 0 +#define RDS_TRANS_IWARP 1 +#define RDS_TRANS_TCP 2 +#define RDS_TRANS_COUNT 3 +#define RDS_TRANS_NONE (~0) + +/* + * Control message types for SOL_RDS. + * + * CMSG_RDMA_ARGS (sendmsg) + * Request a RDMA transfer to/from the specified + * memory ranges. + * The cmsg_data is a struct rds_rdma_args. + * RDS_CMSG_RDMA_DEST (recvmsg, sendmsg) + * Kernel informs application about intended + * source/destination of a RDMA transfer + * RDS_CMSG_RDMA_MAP (sendmsg) + * Application asks kernel to map the given + * memory range into a IB MR, and send the + * R_Key along in an RDS extension header. + * The cmsg_data is a struct rds_get_mr_args, + * the same as for the GET_MR setsockopt. + * RDS_CMSG_RDMA_STATUS (recvmsg) + * Returns the status of a completed RDMA operation. + * RDS_CMSG_RXPATH_LATENCY(recvmsg) + * Returns rds message latencies in various stages of receive + * path in nS. Its set per socket using SO_RDS_MSG_RXPATH_LATENCY + * socket option. Legitimate points are defined in + * enum rds_message_rxpath_latency. More points can be added in + * future. CSMG format is struct rds_cmsg_rx_trace. + */ +#define RDS_CMSG_RDMA_ARGS 1 +#define RDS_CMSG_RDMA_DEST 2 +#define RDS_CMSG_RDMA_MAP 3 +#define RDS_CMSG_RDMA_STATUS 4 +#define RDS_CMSG_CONG_UPDATE 5 +#define RDS_CMSG_ATOMIC_FADD 6 +#define RDS_CMSG_ATOMIC_CSWP 7 +#define RDS_CMSG_MASKED_ATOMIC_FADD 8 +#define RDS_CMSG_MASKED_ATOMIC_CSWP 9 +#define RDS_CMSG_RXPATH_LATENCY 11 +#define RDS_CMSG_ZCOPY_COOKIE 12 +#define RDS_CMSG_ZCOPY_COMPLETION 13 + +#define RDS_INFO_FIRST 10000 +#define RDS_INFO_COUNTERS 10000 +#define RDS_INFO_CONNECTIONS 10001 +/* 10002 aka RDS_INFO_FLOWS is deprecated */ +#define RDS_INFO_SEND_MESSAGES 10003 +#define RDS_INFO_RETRANS_MESSAGES 10004 +#define RDS_INFO_RECV_MESSAGES 10005 +#define RDS_INFO_SOCKETS 10006 +#define RDS_INFO_TCP_SOCKETS 10007 +#define RDS_INFO_IB_CONNECTIONS 10008 +#define RDS_INFO_CONNECTION_STATS 10009 +#define RDS_INFO_IWARP_CONNECTIONS 10010 +#define RDS_INFO_LAST 10010 + +struct rds_info_counter { + __u8 name[32]; + __u64 value; +} __attribute__((packed)); + +#define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 +#define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02 +#define RDS_INFO_CONNECTION_FLAG_CONNECTED 0x04 + +#define TRANSNAMSIZ 16 + +struct rds_info_connection { + __u64 next_tx_seq; + __u64 next_rx_seq; + __be32 laddr; + __be32 faddr; + __u8 transport[TRANSNAMSIZ]; /* null term ascii */ + __u8 flags; +} __attribute__((packed)); + +#define RDS_INFO_MESSAGE_FLAG_ACK 0x01 +#define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 + +struct rds_info_message { + __u64 seq; + __u32 len; + __be32 laddr; + __be32 faddr; + __be16 lport; + __be16 fport; + __u8 flags; +} __attribute__((packed)); + +struct rds_info_socket { + __u32 sndbuf; + __be32 bound_addr; + __be32 connected_addr; + __be16 bound_port; + __be16 connected_port; + __u32 rcvbuf; + __u64 inum; +} __attribute__((packed)); + +struct rds_info_tcp_socket { + __be32 local_addr; + __be16 local_port; + 
__be32 peer_addr; + __be16 peer_port; + __u64 hdr_rem; + __u64 data_rem; + __u32 last_sent_nxt; + __u32 last_expected_una; + __u32 last_seen_una; +} __attribute__((packed)); + +#define RDS_IB_GID_LEN 16 +struct rds_info_rdma_connection { + __be32 src_addr; + __be32 dst_addr; + __u8 src_gid[RDS_IB_GID_LEN]; + __u8 dst_gid[RDS_IB_GID_LEN]; + + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 rdma_mr_max; + __u32 rdma_mr_size; +}; + +/* RDS message Receive Path Latency points */ +enum rds_message_rxpath_latency { + RDS_MSG_RX_HDR_TO_DGRAM_START = 0, + RDS_MSG_RX_DGRAM_REASSEMBLE, + RDS_MSG_RX_DGRAM_DELIVERED, + RDS_MSG_RX_DGRAM_TRACE_MAX +}; + +struct rds_rx_trace_so { + __u8 rx_traces; + __u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; +}; + +struct rds_cmsg_rx_trace { + __u8 rx_traces; + __u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; + __u64 rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; +}; + +/* + * Congestion monitoring. + * Congestion control in RDS happens at the host connection + * level by exchanging a bitmap marking congested ports. + * By default, a process sleeping in poll() is always woken + * up when the congestion map is updated. + * With explicit monitoring, an application can have more + * fine-grained control. + * The application installs a 64bit mask value in the socket, + * where each bit corresponds to a group of ports. + * When a congestion update arrives, RDS checks the set of + * ports that are now uncongested against the list bit mask + * installed in the socket, and if they overlap, we queue a + * cong_notification on the socket. + * + * To install the congestion monitor bitmask, use RDS_CONG_MONITOR + * with the 64bit mask. + * Congestion updates are received via RDS_CMSG_CONG_UPDATE + * control messages. + * + * The correspondence between bits and ports is + * 1 << (portnum % 64) + */ +#define RDS_CONG_MONITOR_SIZE 64 +#define RDS_CONG_MONITOR_BIT(port) (((unsigned int) port) % RDS_CONG_MONITOR_SIZE) +#define RDS_CONG_MONITOR_MASK(port) (1ULL << RDS_CONG_MONITOR_BIT(port)) + +/* + * RDMA related types + */ + +/* + * This encapsulates a remote memory location. + * In the current implementation, it contains the R_Key + * of the remote memory region, and the offset into it + * (so that the application does not have to worry about + * alignment). 
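The congestion-monitor scheme described above boils down to one setsockopt() and one control message. A hedged userspace sketch, assuming an RDS socket has already been created and bound elsewhere; fd and port are placeholders.

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/rds.h>

/* watch the port group that contains "port" */
static int rds_watch_port(int fd, uint16_t port)
{
	uint64_t mask = RDS_CONG_MONITOR_MASK(port);

	return setsockopt(fd, SOL_RDS, RDS_CONG_MONITOR, &mask, sizeof(mask));
}

/* after recvmsg(), check whether a congestion update was attached */
static int rds_got_cong_update(struct msghdr *msg, uint64_t *uncongested)
{
	struct cmsghdr *cmsg;

	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_RDS &&
		    cmsg->cmsg_type == RDS_CMSG_CONG_UPDATE) {
			memcpy(uncongested, CMSG_DATA(cmsg), sizeof(*uncongested));
			return 1;	/* set bits = port groups now uncongested */
		}
	}
	return 0;
}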
+ */ +typedef __u64 rds_rdma_cookie_t; + +struct rds_iovec { + __u64 addr; + __u64 bytes; +}; + +struct rds_get_mr_args { + struct rds_iovec vec; + __u64 cookie_addr; + __u64 flags; +}; + +struct rds_get_mr_for_dest_args { + struct __kernel_sockaddr_storage dest_addr; + struct rds_iovec vec; + __u64 cookie_addr; + __u64 flags; +}; + +struct rds_free_mr_args { + rds_rdma_cookie_t cookie; + __u64 flags; +}; + +struct rds_rdma_args { + rds_rdma_cookie_t cookie; + struct rds_iovec remote_vec; + __u64 local_vec_addr; + __u64 nr_local; + __u64 flags; + __u64 user_token; +}; + +struct rds_atomic_args { + rds_rdma_cookie_t cookie; + __u64 local_addr; + __u64 remote_addr; + union { + struct { + __u64 compare; + __u64 swap; + } cswp; + struct { + __u64 add; + } fadd; + struct { + __u64 compare; + __u64 swap; + __u64 compare_mask; + __u64 swap_mask; + } m_cswp; + struct { + __u64 add; + __u64 nocarry_mask; + } m_fadd; + }; + __u64 flags; + __u64 user_token; +}; + +struct rds_rdma_notify { + __u64 user_token; + __s32 status; +}; + +#define RDS_RDMA_SUCCESS 0 +#define RDS_RDMA_REMOTE_ERROR 1 +#define RDS_RDMA_CANCELED 2 +#define RDS_RDMA_DROPPED 3 +#define RDS_RDMA_OTHER_ERROR 4 + +#define RDS_MAX_ZCOOKIES 8 +struct rds_zcopy_cookies { + __u32 num; + __u32 cookies[RDS_MAX_ZCOOKIES]; +}; + +/* + * Common set of flags for all RDMA related structs + */ +#define RDS_RDMA_READWRITE 0x0001 +#define RDS_RDMA_FENCE 0x0002 /* use FENCE for immediate send */ +#define RDS_RDMA_INVALIDATE 0x0004 /* invalidate R_Key after freeing MR */ +#define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */ +#define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */ +#define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */ +#define RDS_RDMA_SILENT 0x0040 /* Do not interrupt remote */ + +#endif /* IB_RDS_H */ diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h new file mode 100644 index 0000000..a7a6111 --- /dev/null +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
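The RDMA argument structures above travel as a control message on an ordinary sendmsg(). Below is a hedged userspace sketch requesting an RDMA write of one local iovec to a remote region identified by a cookie obtained earlier (for example via RDS_CMSG_RDMA_DEST); the destination address, cookie and user token are placeholders, and the remote offset is left at 0 relative to the cookie.

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/rds.h>

static ssize_t rds_rdma_write(int fd, const struct sockaddr *dst,
			      socklen_t dstlen, rds_rdma_cookie_t cookie,
			      void *buf, size_t len, uint64_t token)
{
	struct rds_iovec iov = {
		.addr  = (uint64_t)(uintptr_t)buf,
		.bytes = len,
	};
	struct rds_rdma_args args = {
		.cookie         = cookie,
		.remote_vec     = { .addr = 0, .bytes = len },
		.local_vec_addr = (uint64_t)(uintptr_t)&iov,
		.nr_local       = 1,
		.flags          = RDS_RDMA_READWRITE | RDS_RDMA_NOTIFY_ME,
		.user_token     = token,
	};
	union {
		char buf[CMSG_SPACE(sizeof(args))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_name       = (void *)dst,
		.msg_namelen    = dstlen,
		.msg_control    = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_RDS;
	cmsg->cmsg_type  = RDS_CMSG_RDMA_ARGS;
	cmsg->cmsg_len   = CMSG_LEN(sizeof(args));
	memcpy(CMSG_DATA(cmsg), &args, sizeof(args));

	/* completion is reported later as an RDS_CMSG_RDMA_STATUS control
	 * message carrying struct rds_rdma_notify with this user_token */
	return sendmsg(fd, &msg, 0);
}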
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Uverbs ABI header file + */ + +#ifndef __BNXT_RE_UVERBS_ABI_H__ +#define __BNXT_RE_UVERBS_ABI_H__ + +#include + +#define BNXT_RE_ABI_VERSION 1 + +struct bnxt_re_uctx_resp { + __u32 dev_id; + __u32 max_qp; + __u32 pg_size; + __u32 cqe_sz; + __u32 max_cqd; + __u32 rsvd; +}; + +/* + * This struct is placed after the ib_uverbs_alloc_pd_resp struct, which is + * not 8 byted aligned. To avoid undesired padding in various cases we have to + * set this struct to packed. + */ +struct bnxt_re_pd_resp { + __u32 pdid; + __u32 dpi; + __u64 dbr; +} __attribute__((packed, aligned(4))); + +struct bnxt_re_cq_req { + __aligned_u64 cq_va; + __aligned_u64 cq_handle; +}; + +struct bnxt_re_cq_resp { + __u32 cqid; + __u32 tail; + __u32 phase; + __u32 rsvd; +}; + +struct bnxt_re_qp_req { + __aligned_u64 qpsva; + __aligned_u64 qprva; + __aligned_u64 qp_handle; +}; + +struct bnxt_re_qp_resp { + __u32 qpid; + __u32 rsvd; +}; + +struct bnxt_re_srq_req { + __aligned_u64 srqva; + __aligned_u64 srq_handle; +}; + +struct bnxt_re_srq_resp { + __u32 srqid; +}; + +enum bnxt_re_shpg_offt { + BNXT_RE_BEG_RESV_OFFT = 0x00, + BNXT_RE_AVID_OFFT = 0x10, + BNXT_RE_AVID_SIZE = 0x04, + BNXT_RE_END_RESV_OFFT = 0xFF0 +}; + +#endif /* __BNXT_RE_UVERBS_ABI_H__*/ diff --git a/include/uapi/rdma/cxgb3-abi.h b/include/uapi/rdma/cxgb3-abi.h new file mode 100644 index 0000000..85aed67 --- /dev/null +++ b/include/uapi/rdma/cxgb3-abi.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef CXGB3_ABI_USER_H +#define CXGB3_ABI_USER_H + +#include + +#define IWCH_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __aligned_u64 + * instead. + */ +struct iwch_create_cq_req { + __aligned_u64 user_rptr_addr; +}; + +struct iwch_create_cq_resp_v0 { + __aligned_u64 key; + __u32 cqid; + __u32 size_log2; +}; + +struct iwch_create_cq_resp { + __aligned_u64 key; + __u32 cqid; + __u32 size_log2; + __u32 memsize; + __u32 reserved; +}; + +struct iwch_create_qp_resp { + __aligned_u64 key; + __aligned_u64 db_key; + __u32 qpid; + __u32 size_log2; + __u32 sq_size_log2; + __u32 rq_size_log2; +}; + +struct iwch_reg_user_mr_resp { + __u32 pbl_addr; +}; + +struct iwch_alloc_pd_resp { + __u32 pdid; +}; + +#endif /* CXGB3_ABI_USER_H */ diff --git a/include/uapi/rdma/cxgb4-abi.h b/include/uapi/rdma/cxgb4-abi.h new file mode 100644 index 0000000..a159ba8 --- /dev/null +++ b/include/uapi/rdma/cxgb4-abi.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef CXGB4_ABI_USER_H +#define CXGB4_ABI_USER_H + +#include + +#define C4IW_UVERBS_ABI_VERSION 3 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __aligned_u64 + * instead. 
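The layout rule spelled out above is what the __aligned_u64 fields in these structs are for: pointers are passed by value, so a command has the same size and alignment for 32-bit and 64-bit processes. A hedged userspace illustration using iwch_create_cq_req; the read-pointer argument is a placeholder for whatever the provider library maps.

#include <stdint.h>
#include <rdma/cxgb3-abi.h>

static void iwch_fill_create_cq_cmd(struct iwch_create_cq_req *req,
				    volatile uint32_t *rptr)
{
	/* never embed a raw pointer in an ABI struct; go through uintptr_t */
	req->user_rptr_addr = (uintptr_t)rptr;
}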
+ */ +struct c4iw_create_cq_resp { + __aligned_u64 key; + __aligned_u64 gts_key; + __aligned_u64 memsize; + __u32 cqid; + __u32 size; + __u32 qid_mask; + __u32 reserved; /* explicit padding (optional for i386) */ +}; + +enum { + C4IW_QPF_ONCHIP = (1 << 0) +}; + +struct c4iw_create_qp_resp { + __aligned_u64 ma_sync_key; + __aligned_u64 sq_key; + __aligned_u64 rq_key; + __aligned_u64 sq_db_gts_key; + __aligned_u64 rq_db_gts_key; + __aligned_u64 sq_memsize; + __aligned_u64 rq_memsize; + __u32 sqid; + __u32 rqid; + __u32 sq_size; + __u32 rq_size; + __u32 qid_mask; + __u32 flags; +}; + +struct c4iw_alloc_ucontext_resp { + __aligned_u64 status_page_key; + __u32 status_page_size; + __u32 reserved; /* explicit padding (optional for i386) */ +}; + +struct c4iw_alloc_pd_resp { + __u32 pdid; +}; + +#endif /* CXGB4_ABI_USER_H */ diff --git a/include/uapi/rdma/hfi/hfi1_ioctl.h b/include/uapi/rdma/hfi/hfi1_ioctl.h new file mode 100644 index 0000000..8f3d9fe --- /dev/null +++ b/include/uapi/rdma/hfi/hfi1_ioctl.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef _LINUX__HFI1_IOCTL_H +#define _LINUX__HFI1_IOCTL_H +#include + +/* + * This structure is passed to the driver to tell it where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct hfi1_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to HFI1_USER_SWVERSION. + */ + __u32 userversion; + __u32 pad; + /* + * If two or more processes wish to share a context, each process + * must set the subcontext_cnt and subcontext_id to the same + * values. The only restriction on the subcontext_id is that + * it be unique for a given node. + */ + __u16 subctxt_cnt; + __u16 subctxt_id; + /* 128bit UUID passed in by PSM. */ + __u8 uuid[16]; +}; + +struct hfi1_ctxt_info { + __aligned_u64 runtime_flags; /* chip/drv runtime flags (HFI1_CAP_*) */ + __u32 rcvegr_size; /* size of each eager buffer */ + __u16 num_active; /* number of active units */ + __u16 unit; /* unit (chip) assigned to caller */ + __u16 ctxt; /* ctxt on unit assigned to caller */ + __u16 subctxt; /* subctxt on unit assigned to caller */ + __u16 rcvtids; /* number of Rcv TIDs for this context */ + __u16 credits; /* number of PIO credits for this context */ + __u16 numa_node; /* NUMA node of the assigned device */ + __u16 rec_cpu; /* cpu # for affinity (0xffff if none) */ + __u16 send_ctxt; /* send context in use by this user context */ + __u16 egrtids; /* number of RcvArray entries for Eager Rcvs */ + __u16 rcvhdrq_cnt; /* number of RcvHdrQ entries */ + __u16 rcvhdrq_entsize; /* size (in bytes) for each RcvHdrQ entry */ + __u16 sdma_ring_size; /* number of entries in SDMA request ring */ +}; + +struct hfi1_tid_info { + /* virtual address of first page in transfer */ + __aligned_u64 vaddr; + /* pointer to tid array. this array is big enough */ + __aligned_u64 tidlist; + /* number of tids programmed by this request */ + __u32 tidcnt; + /* length of transfer buffer programmed by this request */ + __u32 length; +}; + +/* + * This structure is returned by the driver immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explicit pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct hfi1_base_info { + /* version of hardware, for feature checking. */ + __u32 hw_version; + /* version of software, for feature checking. */ + __u32 sw_version; + /* Job key */ + __u16 jkey; + __u16 padding1; + /* + * The special QP (queue pair) value that identifies PSM + * protocol packet from standard IB packets. + */ + __u32 bthqp; + /* PIO credit return address, */ + __aligned_u64 sc_credits_addr; + /* + * Base address of write-only pio buffers for this process. + * Each buffer has sendpio_credits*64 bytes. + */ + __aligned_u64 pio_bufbase_sop; + /* + * Base address of write-only pio buffers for this process. + * Each buffer has sendpio_credits*64 bytes. + */ + __aligned_u64 pio_bufbase; + /* address where receive buffer queue is mapped into */ + __aligned_u64 rcvhdr_bufbase; + /* base address of Eager receive buffers. 
*/ + __aligned_u64 rcvegr_bufbase; + /* base address of SDMA completion ring */ + __aligned_u64 sdma_comp_bufbase; + /* + * User register base for init code, not to be used directly by + * protocol or applications. Always maps real chip register space. + * the register addresses are: + * ur_rcvhdrhead, ur_rcvhdrtail, ur_rcvegrhead, ur_rcvegrtail, + * ur_rcvtidflow + */ + __aligned_u64 user_regbase; + /* notification events */ + __aligned_u64 events_bufbase; + /* status page */ + __aligned_u64 status_bufbase; + /* rcvhdrtail update */ + __aligned_u64 rcvhdrtail_base; + /* + * shared memory pages for subctxts if ctxt is shared; these cover + * all the processes in the group sharing a single context. + * all have enough space for the num_subcontexts value on this job. + */ + __aligned_u64 subctxt_uregbase; + __aligned_u64 subctxt_rcvegrbuf; + __aligned_u64 subctxt_rcvhdrbuf; +}; +#endif /* _LINIUX__HFI1_IOCTL_H */ diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h new file mode 100644 index 0000000..c6a984c --- /dev/null +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -0,0 +1,267 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains defines, structures, etc. 
that are used + * to communicate between kernel and user code. + */ + +#ifndef _LINUX__HFI1_USER_H +#define _LINUX__HFI1_USER_H + +#include +#include + +/* + * This version number is given to the driver by the user code during + * initialization in the spu_userversion field of hfi1_user_info, so + * the driver can check for compatibility with user code. + * + * The major version changes when data structures change in an incompatible + * way. The driver must be the same for initialization to succeed. + */ +#define HFI1_USER_SWMAJOR 6 + +/* + * Minor version differences are always compatible + * a within a major version, however if user software is larger + * than driver software, some new features and/or structure fields + * may not be implemented; the user code must deal with this if it + * cares, or it must abort after initialization reports the difference. + */ +#define HFI1_USER_SWMINOR 3 + +/* + * We will encode the major/minor inside a single 32bit version number. + */ +#define HFI1_SWMAJOR_SHIFT 16 + +/* + * Set of HW and driver capability/feature bits. + * These bit values are used to configure enabled/disabled HW and + * driver features. The same set of bits are communicated to user + * space. + */ +#define HFI1_CAP_DMA_RTAIL (1UL << 0) /* Use DMA'ed RTail value */ +#define HFI1_CAP_SDMA (1UL << 1) /* Enable SDMA support */ +#define HFI1_CAP_SDMA_AHG (1UL << 2) /* Enable SDMA AHG support */ +#define HFI1_CAP_EXTENDED_PSN (1UL << 3) /* Enable Extended PSN support */ +#define HFI1_CAP_HDRSUPP (1UL << 4) /* Enable Header Suppression */ +/* 1UL << 5 unused */ +#define HFI1_CAP_USE_SDMA_HEAD (1UL << 6) /* DMA Hdr Q tail vs. use CSR */ +#define HFI1_CAP_MULTI_PKT_EGR (1UL << 7) /* Enable multi-packet Egr buffs*/ +#define HFI1_CAP_NODROP_RHQ_FULL (1UL << 8) /* Don't drop on Hdr Q full */ +#define HFI1_CAP_NODROP_EGR_FULL (1UL << 9) /* Don't drop on EGR buffs full */ +#define HFI1_CAP_TID_UNMAP (1UL << 10) /* Disable Expected TID caching */ +#define HFI1_CAP_PRINT_UNIMPL (1UL << 11) /* Show for unimplemented feats */ +#define HFI1_CAP_ALLOW_PERM_JKEY (1UL << 12) /* Allow use of permissive JKEY */ +#define HFI1_CAP_NO_INTEGRITY (1UL << 13) /* Enable ctxt integrity checks */ +#define HFI1_CAP_PKEY_CHECK (1UL << 14) /* Enable ctxt PKey checking */ +#define HFI1_CAP_STATIC_RATE_CTRL (1UL << 15) /* Allow PBC.StaticRateControl */ +/* 1UL << 16 unused */ +#define HFI1_CAP_SDMA_HEAD_CHECK (1UL << 17) /* SDMA head checking */ +#define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */ + +#define HFI1_RCVHDR_ENTSIZE_2 (1UL << 0) +#define HFI1_RCVHDR_ENTSIZE_16 (1UL << 1) +#define HFI1_RCVDHR_ENTSIZE_32 (1UL << 2) + +#define _HFI1_EVENT_FROZEN_BIT 0 +#define _HFI1_EVENT_LINKDOWN_BIT 1 +#define _HFI1_EVENT_LID_CHANGE_BIT 2 +#define _HFI1_EVENT_LMC_CHANGE_BIT 3 +#define _HFI1_EVENT_SL2VL_CHANGE_BIT 4 +#define _HFI1_EVENT_TID_MMU_NOTIFY_BIT 5 +#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_TID_MMU_NOTIFY_BIT + +#define HFI1_EVENT_FROZEN (1UL << _HFI1_EVENT_FROZEN_BIT) +#define HFI1_EVENT_LINKDOWN (1UL << _HFI1_EVENT_LINKDOWN_BIT) +#define HFI1_EVENT_LID_CHANGE (1UL << _HFI1_EVENT_LID_CHANGE_BIT) +#define HFI1_EVENT_LMC_CHANGE (1UL << _HFI1_EVENT_LMC_CHANGE_BIT) +#define HFI1_EVENT_SL2VL_CHANGE (1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT) +#define HFI1_EVENT_TID_MMU_NOTIFY (1UL << _HFI1_EVENT_TID_MMU_NOTIFY_BIT) + +/* + * These are the status bits readable (in ASCII form, 64bit value) + * from the "status" sysfs file. 
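The version scheme above is what userspace reports in hfi1_user_info.userversion. Below is a hedged sketch encoding the version from these macros and filling the user-info structure for two cooperating processes that want to share one context; the UUID and subcontext id are placeholders, and the driver's own name for the combined version macro may differ.

#include <stdint.h>
#include <string.h>
#include <rdma/hfi/hfi1_ioctl.h>
#include <rdma/hfi/hfi1_user.h>

#define MY_HFI1_SWVERSION \
	((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | HFI1_USER_SWMINOR)

static void hfi1_fill_user_info(struct hfi1_user_info *uinfo,
				const uint8_t uuid[16], uint16_t subctxt_id)
{
	memset(uinfo, 0, sizeof(*uinfo));
	uinfo->userversion = MY_HFI1_SWVERSION;
	/* every sharer must pass identical subctxt_cnt/subctxt_id values */
	uinfo->subctxt_cnt = 2;
	uinfo->subctxt_id  = subctxt_id;
	memcpy(uinfo->uuid, uuid, 16);
}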
For binary compatibility, values + * must remain as is; removed states can be reused for different + * purposes. + */ +#define HFI1_STATUS_INITTED 0x1 /* basic initialization done */ +/* Chip has been found and initialized */ +#define HFI1_STATUS_CHIP_PRESENT 0x20 +/* IB link is at ACTIVE, usable for data traffic */ +#define HFI1_STATUS_IB_READY 0x40 +/* link is configured, LID, MTU, etc. have been set */ +#define HFI1_STATUS_IB_CONF 0x80 +/* A Fatal hardware error has occurred. */ +#define HFI1_STATUS_HWERROR 0x200 + +/* + * Number of supported shared contexts. + * This is the maximum number of software contexts that can share + * a hardware send/receive context. + */ +#define HFI1_MAX_SHARED_CTXTS 8 + +/* + * Poll types + */ +#define HFI1_POLL_TYPE_ANYRCV 0x0 +#define HFI1_POLL_TYPE_URGENT 0x1 + +enum hfi1_sdma_comp_state { + FREE = 0, + QUEUED, + COMPLETE, + ERROR +}; + +/* + * SDMA completion ring entry + */ +struct hfi1_sdma_comp_entry { + __u32 status; + __u32 errcode; +}; + +/* + * Device status and notifications from driver to user-space. + */ +struct hfi1_status { + __aligned_u64 dev; /* device/hw status bits */ + __aligned_u64 port; /* port state and status bits */ + char freezemsg[0]; +}; + +enum sdma_req_opcode { + EXPECTED = 0, + EAGER +}; + +#define HFI1_SDMA_REQ_VERSION_MASK 0xF +#define HFI1_SDMA_REQ_VERSION_SHIFT 0x0 +#define HFI1_SDMA_REQ_OPCODE_MASK 0xF +#define HFI1_SDMA_REQ_OPCODE_SHIFT 0x4 +#define HFI1_SDMA_REQ_IOVCNT_MASK 0xFF +#define HFI1_SDMA_REQ_IOVCNT_SHIFT 0x8 + +struct sdma_req_info { + /* + * bits 0-3 - version (currently unused) + * bits 4-7 - opcode (enum sdma_req_opcode) + * bits 8-15 - io vector count + */ + __u16 ctrl; + /* + * Number of fragments contained in this request. + * User-space has already computed how many + * fragment-sized packet the user buffer will be + * split into. + */ + __u16 npkts; + /* + * Size of each fragment the user buffer will be + * split into. + */ + __u16 fragsize; + /* + * Index of the slot in the SDMA completion ring + * this request should be using. User-space is + * in charge of managing its own ring. + */ + __u16 comp_idx; +} __attribute__((__packed__)); + +/* + * SW KDETH header. + * swdata is SW defined portion. + */ +struct hfi1_kdeth_header { + __le32 ver_tid_offset; + __le16 jkey; + __le16 hcrc; + __le32 swdata[7]; +} __attribute__((__packed__)); + +/* + * Structure describing the headers that User space uses. The + * structure above is a subset of this one. + */ +struct hfi1_pkt_header { + __le16 pbc[4]; + __be16 lrh[4]; + __be32 bth[3]; + struct hfi1_kdeth_header kdeth; +} __attribute__((__packed__)); + + +/* + * The list of usermode accessible registers. + */ +enum hfi1_ureg { + /* (RO) DMA RcvHdr to be used next. */ + ur_rcvhdrtail = 0, + /* (RW) RcvHdr entry to be processed next by host. */ + ur_rcvhdrhead = 1, + /* (RO) Index of next Eager index to use. */ + ur_rcvegrindextail = 2, + /* (RW) Eager TID to be processed next */ + ur_rcvegrindexhead = 3, + /* (RO) Receive Eager Offset Tail */ + ur_rcvegroffsettail = 4, + /* For internal use only; max register number. */ + ur_maxreg, + /* (RW) Receive TID flow table */ + ur_rcvtidflowtable = 256 +}; + +#endif /* _LINIUX__HFI1_USER_H */ diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h new file mode 100644 index 0000000..78613b6 --- /dev/null +++ b/include/uapi/rdma/hns-abi.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2016 Hisilicon Limited. 
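The ctrl layout documented in sdma_req_info packs three fields into one 16-bit word. A hedged sketch using the mask/shift macros above to build a request header for an EXPECTED transfer; the packet count, fragment size and completion slot are placeholders chosen by the caller.

#include <rdma/hfi/hfi1_user.h>

static __u16 hfi1_sdma_ctrl(enum sdma_req_opcode op, unsigned int iovcnt)
{
	return ((0 & HFI1_SDMA_REQ_VERSION_MASK) << HFI1_SDMA_REQ_VERSION_SHIFT) |
	       ((op & HFI1_SDMA_REQ_OPCODE_MASK) << HFI1_SDMA_REQ_OPCODE_SHIFT) |
	       ((iovcnt & HFI1_SDMA_REQ_IOVCNT_MASK) << HFI1_SDMA_REQ_IOVCNT_SHIFT);
}

static void hfi1_fill_sdma_req(struct sdma_req_info *info, __u16 npkts,
			       __u16 fragsize, __u16 comp_idx,
			       unsigned int iovcnt)
{
	info->ctrl     = hfi1_sdma_ctrl(EXPECTED, iovcnt);
	info->npkts    = npkts;		/* fragments the buffer splits into */
	info->fragsize = fragsize;	/* bytes per fragment */
	info->comp_idx = comp_idx;	/* caller-managed completion-ring slot */
}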
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef HNS_ABI_USER_H +#define HNS_ABI_USER_H + +#include + +struct hns_roce_ib_create_cq { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; +}; + +struct hns_roce_ib_create_cq_resp { + __aligned_u64 cqn; /* Only 32 bits used, 64 for compat */ + __aligned_u64 cap_flags; +}; + +struct hns_roce_ib_create_qp { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; + __u8 reserved[5]; +}; + +struct hns_roce_ib_create_qp_resp { + __aligned_u64 cap_flags; +}; + +struct hns_roce_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 reserved; +}; + +struct hns_roce_ib_alloc_pd_resp { + __u32 pdn; +}; + +#endif /* HNS_ABI_USER_H */ diff --git a/include/uapi/rdma/i40iw-abi.h b/include/uapi/rdma/i40iw-abi.h new file mode 100644 index 0000000..79890ba --- /dev/null +++ b/include/uapi/rdma/i40iw-abi.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2006 - 2016 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef I40IW_ABI_H +#define I40IW_ABI_H + +#include + +#define I40IW_ABI_VER 5 + +struct i40iw_alloc_ucontext_req { + __u32 reserved32; + __u8 userspace_ver; + __u8 reserved8[3]; +}; + +struct i40iw_alloc_ucontext_resp { + __u32 max_pds; /* maximum pds allowed for this user process */ + __u32 max_qps; /* maximum qps allowed for this user process */ + __u32 wq_size; /* size of the WQs (sq+rq) allocated to the mmaped area */ + __u8 kernel_ver; + __u8 reserved[3]; +}; + +struct i40iw_alloc_pd_resp { + __u32 pd_id; + __u8 reserved[4]; +}; + +struct i40iw_create_cq_req { + __aligned_u64 user_cq_buffer; + __aligned_u64 user_shadow_area; +}; + +struct i40iw_create_qp_req { + __aligned_u64 user_wqe_buffers; + __aligned_u64 user_compl_ctx; + + /* UDA QP PHB */ + __aligned_u64 user_sq_phb; /* place for VA of the sq phb buff */ + __aligned_u64 user_rq_phb; /* place for VA of the rq phb buff */ +}; + +enum i40iw_memreg_type { + IW_MEMREG_TYPE_MEM = 0x0000, + IW_MEMREG_TYPE_QP = 0x0001, + IW_MEMREG_TYPE_CQ = 0x0002, +}; + +struct i40iw_mem_reg_req { + __u16 reg_type; /* Memory, QP or CQ */ + __u16 cq_pages; + __u16 rq_pages; + __u16 sq_pages; +}; + +struct i40iw_create_cq_resp { + __u32 cq_id; + __u32 cq_size; + __u32 mmap_db_index; + __u32 reserved; +}; + +struct i40iw_create_qp_resp { + __u32 qp_id; + __u32 actual_sq_size; + __u32 actual_rq_size; + __u32 i40iw_drv_opt; + __u16 push_idx; + __u8 lsmm; + __u8 rsvd2; +}; + +#endif diff --git a/include/uapi/rdma/ib_user_cm.h b/include/uapi/rdma/ib_user_cm.h new file mode 100644 index 0000000..e2709bb --- /dev/null +++ b/include/uapi/rdma/ib_user_cm.h @@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef IB_USER_CM_H +#define IB_USER_CM_H + +#include +#include + +#define IB_USER_CM_ABI_VERSION 5 + +enum { + IB_USER_CM_CMD_CREATE_ID, + IB_USER_CM_CMD_DESTROY_ID, + IB_USER_CM_CMD_ATTR_ID, + + IB_USER_CM_CMD_LISTEN, + IB_USER_CM_CMD_NOTIFY, + + IB_USER_CM_CMD_SEND_REQ, + IB_USER_CM_CMD_SEND_REP, + IB_USER_CM_CMD_SEND_RTU, + IB_USER_CM_CMD_SEND_DREQ, + IB_USER_CM_CMD_SEND_DREP, + IB_USER_CM_CMD_SEND_REJ, + IB_USER_CM_CMD_SEND_MRA, + IB_USER_CM_CMD_SEND_LAP, + IB_USER_CM_CMD_SEND_APR, + IB_USER_CM_CMD_SEND_SIDR_REQ, + IB_USER_CM_CMD_SEND_SIDR_REP, + + IB_USER_CM_CMD_EVENT, + IB_USER_CM_CMD_INIT_QP_ATTR, +}; +/* + * command ABI structures. + */ +struct ib_ucm_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct ib_ucm_create_id { + __aligned_u64 uid; + __aligned_u64 response; +}; + +struct ib_ucm_create_id_resp { + __u32 id; +}; + +struct ib_ucm_destroy_id { + __aligned_u64 response; + __u32 id; + __u32 reserved; +}; + +struct ib_ucm_destroy_id_resp { + __u32 events_reported; +}; + +struct ib_ucm_attr_id { + __aligned_u64 response; + __u32 id; + __u32 reserved; +}; + +struct ib_ucm_attr_id_resp { + __be64 service_id; + __be64 service_mask; + __be32 local_id; + __be32 remote_id; +}; + +struct ib_ucm_init_qp_attr { + __aligned_u64 response; + __u32 id; + __u32 qp_state; +}; + +struct ib_ucm_listen { + __be64 service_id; + __be64 service_mask; + __u32 id; + __u32 reserved; +}; + +struct ib_ucm_notify { + __u32 id; + __u32 event; +}; + +struct ib_ucm_private_data { + __aligned_u64 data; + __u32 id; + __u8 len; + __u8 reserved[3]; +}; + +struct ib_ucm_req { + __u32 id; + __u32 qpn; + __u32 qp_type; + __u32 psn; + __be64 sid; + __aligned_u64 data; + __aligned_u64 primary_path; + __aligned_u64 alternate_path; + __u8 len; + __u8 peer_to_peer; + __u8 responder_resources; + __u8 initiator_depth; + __u8 remote_cm_response_timeout; + __u8 flow_control; + __u8 local_cm_response_timeout; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 max_cm_retries; + __u8 srq; + __u8 reserved[5]; +}; + +struct ib_ucm_rep { + __aligned_u64 uid; + __aligned_u64 data; + __u32 id; + __u32 qpn; + __u32 psn; + __u8 len; + __u8 responder_resources; + __u8 initiator_depth; + __u8 target_ack_delay; + __u8 failover_accepted; + __u8 flow_control; + __u8 rnr_retry_count; + __u8 srq; + __u8 reserved[4]; +}; + +struct ib_ucm_info { + __u32 id; + __u32 status; + __aligned_u64 info; + __aligned_u64 data; + __u8 info_len; + __u8 data_len; + __u8 reserved[6]; +}; + +struct ib_ucm_mra { + __aligned_u64 data; + __u32 id; + __u8 len; + __u8 timeout; + __u8 reserved[2]; +}; + +struct ib_ucm_lap { + __aligned_u64 path; + __aligned_u64 data; + __u32 id; + __u8 len; + __u8 reserved[3]; +}; + +struct ib_ucm_sidr_req { + __u32 id; + __u32 timeout; + __be64 sid; + __aligned_u64 data; + __aligned_u64 path; + __u16 reserved_pkey; + __u8 len; + __u8 max_cm_retries; + __u8 reserved[4]; +}; + +struct ib_ucm_sidr_rep { + __u32 id; + __u32 qpn; + __u32 qkey; + __u32 status; + __aligned_u64 info; + __aligned_u64 data; + __u8 info_len; + __u8 data_len; + __u8 reserved[6]; +}; +/* + * event notification ABI structures. 
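The command structures above are consumed through a write()-based ABI: each request is an ib_ucm_cmd_hdr followed by the command payload, and the reply lands in the user buffer whose address rides in the command's response field. A hedged userspace sketch of CREATE_ID, assuming the ucm device has been opened elsewhere.

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <rdma/ib_user_cm.h>

static int ucm_create_id(int fd, uint64_t uid, uint32_t *id_out)
{
	struct ib_ucm_create_id_resp resp;
	struct {
		struct ib_ucm_cmd_hdr hdr;
		struct ib_ucm_create_id cmd;
	} msg;

	memset(&msg, 0, sizeof(msg));
	msg.hdr.cmd = IB_USER_CM_CMD_CREATE_ID;
	msg.hdr.in  = sizeof(msg.cmd);		/* payload bytes after the header */
	msg.hdr.out = sizeof(resp);		/* expected response size */
	msg.cmd.uid = uid;
	msg.cmd.response = (uintptr_t)&resp;	/* kernel fills this buffer */

	if (write(fd, &msg, sizeof(msg)) != sizeof(msg))
		return -1;
	*id_out = resp.id;
	return 0;
}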
+ */ +struct ib_ucm_event_get { + __aligned_u64 response; + __aligned_u64 data; + __aligned_u64 info; + __u8 data_len; + __u8 info_len; + __u8 reserved[6]; +}; + +struct ib_ucm_req_event_resp { + struct ib_user_path_rec primary_path; + struct ib_user_path_rec alternate_path; + __be64 remote_ca_guid; + __u32 remote_qkey; + __u32 remote_qpn; + __u32 qp_type; + __u32 starting_psn; + __u8 responder_resources; + __u8 initiator_depth; + __u8 local_cm_response_timeout; + __u8 flow_control; + __u8 remote_cm_response_timeout; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 srq; + __u8 port; + __u8 reserved[7]; +}; + +struct ib_ucm_rep_event_resp { + __be64 remote_ca_guid; + __u32 remote_qkey; + __u32 remote_qpn; + __u32 starting_psn; + __u8 responder_resources; + __u8 initiator_depth; + __u8 target_ack_delay; + __u8 failover_accepted; + __u8 flow_control; + __u8 rnr_retry_count; + __u8 srq; + __u8 reserved[5]; +}; + +struct ib_ucm_rej_event_resp { + __u32 reason; + /* ari in ib_ucm_event_get info field. */ +}; + +struct ib_ucm_mra_event_resp { + __u8 timeout; + __u8 reserved[3]; +}; + +struct ib_ucm_lap_event_resp { + struct ib_user_path_rec path; +}; + +struct ib_ucm_apr_event_resp { + __u32 status; + /* apr info in ib_ucm_event_get info field. */ +}; + +struct ib_ucm_sidr_req_event_resp { + __u16 pkey; + __u8 port; + __u8 reserved; +}; + +struct ib_ucm_sidr_rep_event_resp { + __u32 status; + __u32 qkey; + __u32 qpn; + /* info in ib_ucm_event_get info field. */ +}; + +#define IB_UCM_PRES_DATA 0x01 +#define IB_UCM_PRES_INFO 0x02 +#define IB_UCM_PRES_PRIMARY 0x04 +#define IB_UCM_PRES_ALTERNATE 0x08 + +struct ib_ucm_event_resp { + __aligned_u64 uid; + __u32 id; + __u32 event; + __u32 present; + __u32 reserved; + union { + struct ib_ucm_req_event_resp req_resp; + struct ib_ucm_rep_event_resp rep_resp; + struct ib_ucm_rej_event_resp rej_resp; + struct ib_ucm_mra_event_resp mra_resp; + struct ib_ucm_lap_event_resp lap_resp; + struct ib_ucm_apr_event_resp apr_resp; + + struct ib_ucm_sidr_req_event_resp sidr_req_resp; + struct ib_ucm_sidr_rep_event_resp sidr_rep_resp; + + __u32 send_status; + } u; +}; + +#endif /* IB_USER_CM_H */ diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h new file mode 100644 index 0000000..83e3890 --- /dev/null +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_IOCTL_CMDS_H +#define IB_USER_IOCTL_CMDS_H + +#define UVERBS_ID_NS_MASK 0xF000 +#define UVERBS_ID_NS_SHIFT 12 + +#define UVERBS_UDATA_DRIVER_DATA_NS 1 +#define UVERBS_UDATA_DRIVER_DATA_FLAG (1UL << UVERBS_ID_NS_SHIFT) + +enum uverbs_default_objects { + UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ + UVERBS_OBJECT_PD, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_OBJECT_CQ, + UVERBS_OBJECT_QP, + UVERBS_OBJECT_SRQ, + UVERBS_OBJECT_AH, + UVERBS_OBJECT_MR, + UVERBS_OBJECT_MW, + UVERBS_OBJECT_FLOW, + UVERBS_OBJECT_XRCD, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_OBJECT_WQ, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_OBJECT_DM, +}; + +enum { + UVERBS_ATTR_UHW_IN = UVERBS_UDATA_DRIVER_DATA_FLAG, + UVERBS_ATTR_UHW_OUT, +}; + +enum uverbs_attrs_create_cq_cmd_attr_ids { + UVERBS_ATTR_CREATE_CQ_HANDLE, + UVERBS_ATTR_CREATE_CQ_CQE, + UVERBS_ATTR_CREATE_CQ_USER_HANDLE, + UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, + UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, + UVERBS_ATTR_CREATE_CQ_FLAGS, + UVERBS_ATTR_CREATE_CQ_RESP_CQE, +}; + +enum uverbs_attrs_destroy_cq_cmd_attr_ids { + UVERBS_ATTR_DESTROY_CQ_HANDLE, + UVERBS_ATTR_DESTROY_CQ_RESP, +}; + +enum uverbs_attrs_create_flow_action_esp { + UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE, + UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_FLOW_ACTION_ESP_ESN, + UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, +}; + +enum uverbs_attrs_destroy_flow_action_esp { + UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, +}; + +enum uverbs_methods_cq { + UVERBS_METHOD_CQ_CREATE, + UVERBS_METHOD_CQ_DESTROY, +}; + +enum uverbs_methods_actions_flow_action_ops { + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + UVERBS_METHOD_FLOW_ACTION_DESTROY, + UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, +}; + +enum uverbs_attrs_alloc_dm_cmd_attr_ids { + UVERBS_ATTR_ALLOC_DM_HANDLE, + UVERBS_ATTR_ALLOC_DM_LENGTH, + UVERBS_ATTR_ALLOC_DM_ALIGNMENT, +}; + +enum uverbs_attrs_free_dm_cmd_attr_ids { + UVERBS_ATTR_FREE_DM_HANDLE, +}; + +enum uverbs_methods_dm { + UVERBS_METHOD_DM_ALLOC, + UVERBS_METHOD_DM_FREE, +}; + +enum uverbs_attrs_reg_dm_mr_cmd_attr_ids { + UVERBS_ATTR_REG_DM_MR_HANDLE, + UVERBS_ATTR_REG_DM_MR_OFFSET, + UVERBS_ATTR_REG_DM_MR_LENGTH, + UVERBS_ATTR_REG_DM_MR_PD_HANDLE, + UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, + UVERBS_ATTR_REG_DM_MR_DM_HANDLE, + UVERBS_ATTR_REG_DM_MR_RESP_LKEY, + UVERBS_ATTR_REG_DM_MR_RESP_RKEY, +}; + +enum uverbs_methods_mr { + UVERBS_METHOD_DM_MR_REG, +}; + +#endif diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h new file mode 100644 index 0000000..625545d --- /dev/null +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2017-2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_IOCTL_VERBS_H +#define IB_USER_IOCTL_VERBS_H + +#include + +#ifndef RDMA_UAPI_PTR +#define RDMA_UAPI_PTR(_type, _name) __aligned_u64 _name +#endif + +enum ib_uverbs_flow_action_esp_keymat { + IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM, +}; + +enum ib_uverbs_flow_action_esp_keymat_aes_gcm_iv_algo { + IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ, +}; + +struct ib_uverbs_flow_action_esp_keymat_aes_gcm { + __aligned_u64 iv; + __u32 iv_algo; /* Use enum ib_uverbs_flow_action_esp_keymat_aes_gcm_iv_algo */ + + __u32 salt; + __u32 icv_len; + + __u32 key_len; + __u32 aes_key[256 / 32]; +}; + +enum ib_uverbs_flow_action_esp_replay { + IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE, + IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP, +}; + +struct ib_uverbs_flow_action_esp_replay_bmp { + __u32 size; +}; + +enum ib_uverbs_flow_action_esp_flags { + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO = 0UL << 0, /* Default */ + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD = 1UL << 0, + + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TUNNEL = 0UL << 1, /* Default */ + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TRANSPORT = 1UL << 1, + + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_DECRYPT = 0UL << 2, /* Default */ + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT = 1UL << 2, + + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW = 1UL << 3, +}; + +struct ib_uverbs_flow_action_esp_encap { + /* This struct represents a list of pointers to flow_xxxx_filter that + * encapsulates the payload in ESP tunnel mode. + */ + RDMA_UAPI_PTR(void *, val_ptr); /* pointer to a flow_xxxx_filter */ + RDMA_UAPI_PTR(struct ib_uverbs_flow_action_esp_encap *, next_ptr); + __u16 len; /* Len of the filter struct val_ptr points to */ + __u16 type; /* Use flow_spec_type enum */ +}; + +struct ib_uverbs_flow_action_esp { + __u32 spi; + __u32 seq; + __u32 tfc_pad; + __u32 flags; + __aligned_u64 hard_limit_pkts; +}; + +#endif diff --git a/include/uapi/rdma/ib_user_mad.h b/include/uapi/rdma/ib_user_mad.h new file mode 100644 index 0000000..90c0cf2 --- /dev/null +++ b/include/uapi/rdma/ib_user_mad.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_MAD_H +#define IB_USER_MAD_H + +#include +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define IB_USER_MAD_ABI_VERSION 5 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + */ + +/** + * ib_user_mad_hdr_old - Old version of MAD packet header without pkey_index + * @id - ID of agent MAD received with/to be sent with + * @status - 0 on successful receive, ETIMEDOUT if no response + * received (transaction ID in data[] will be set to TID of original + * request) (ignored on send) + * @timeout_ms - Milliseconds to wait for response (unset on receive) + * @retries - Number of automatic retries to attempt + * @qpn - Remote QP number received from/to be sent to + * @qkey - Remote Q_Key to be sent with (unset on receive) + * @lid - Remote lid received from/to be sent to + * @sl - Service level received with/to be sent with + * @path_bits - Local path bits received with/to be sent with + * @grh_present - If set, GRH was received/should be sent + * @gid_index - Local GID index to send with (unset on receive) + * @hop_limit - Hop limit in GRH + * @traffic_class - Traffic class in GRH + * @gid - Remote GID in GRH + * @flow_label - Flow label in GRH + */ +struct ib_user_mad_hdr_old { + __u32 id; + __u32 status; + __u32 timeout_ms; + __u32 retries; + __u32 length; + __be32 qpn; + __be32 qkey; + __be16 lid; + __u8 sl; + __u8 path_bits; + __u8 grh_present; + __u8 gid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 gid[16]; + __be32 flow_label; +}; + +/** + * ib_user_mad_hdr - MAD packet header + * This layout allows specifying/receiving the P_Key index. To use + * this capability, an application must call the + * IB_USER_MAD_ENABLE_PKEY ioctl on the user MAD file handle before + * any other actions with the file handle. 
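 *
 * Illustrative sketch, not part of the patch itself; the umad device
 * path is only an example and error handling is elided:
 *
 *	int fd = open("/dev/infiniband/umad0", O_RDWR);
 *
 *	if (ioctl(fd, IB_USER_MAD_ENABLE_PKEY) < 0)
 *		use_old_header = 1;	(an older kernel: keep using
 *					 struct ib_user_mad_hdr_old)
 *
 * The ioctl takes no argument; on success every subsequent read/write on
 * this handle uses the pkey_index-aware header defined below.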
+ * @id - ID of agent MAD received with/to be sent with + * @status - 0 on successful receive, ETIMEDOUT if no response + * received (transaction ID in data[] will be set to TID of original + * request) (ignored on send) + * @timeout_ms - Milliseconds to wait for response (unset on receive) + * @retries - Number of automatic retries to attempt + * @qpn - Remote QP number received from/to be sent to + * @qkey - Remote Q_Key to be sent with (unset on receive) + * @lid - Remote lid received from/to be sent to + * @sl - Service level received with/to be sent with + * @path_bits - Local path bits received with/to be sent with + * @grh_present - If set, GRH was received/should be sent + * @gid_index - Local GID index to send with (unset on receive) + * @hop_limit - Hop limit in GRH + * @traffic_class - Traffic class in GRH + * @gid - Remote GID in GRH + * @flow_label - Flow label in GRH + * @pkey_index - P_Key index + */ +struct ib_user_mad_hdr { + __u32 id; + __u32 status; + __u32 timeout_ms; + __u32 retries; + __u32 length; + __be32 qpn; + __be32 qkey; + __be16 lid; + __u8 sl; + __u8 path_bits; + __u8 grh_present; + __u8 gid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 gid[16]; + __be32 flow_label; + __u16 pkey_index; + __u8 reserved[6]; +}; + +/** + * ib_user_mad - MAD packet + * @hdr - MAD packet header + * @data - Contents of MAD + * + */ +struct ib_user_mad { + struct ib_user_mad_hdr hdr; + __aligned_u64 data[0]; +}; + +/* + * Earlier versions of this interface definition declared the + * method_mask[] member as an array of __u32 but treated it as a + * bitmap made up of longs in the kernel. This ambiguity meant that + * 32-bit big-endian applications that can run on both 32-bit and + * 64-bit kernels had no consistent ABI to rely on, and 64-bit + * big-endian applications that treated method_mask as being made up + * of 32-bit words would have their bitmap misinterpreted. + * + * To clear up this confusion, we change the declaration of + * method_mask[] to use unsigned long and handle the conversion from + * 32-bit userspace to 64-bit kernel for big-endian systems in the + * compat_ioctl method. Unfortunately, to keep the structure layout + * the same, we need the method_mask[] array to be aligned only to 4 + * bytes even when long is 64 bits, which forces us into this ugly + * typedef. + */ +typedef unsigned long __attribute__((aligned(4))) packed_ulong; +#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof (long))) + +/** + * ib_user_mad_reg_req - MAD registration request + * @id - Set by the kernel; used to identify agent in future requests. + * @qpn - Queue pair number; must be 0 or 1. + * @method_mask - The caller will receive unsolicited MADs for any method + * where @method_mask = 1. + * @mgmt_class - Indicates which management class of MADs should be receive + * by the caller. This field is only required if the user wishes to + * receive unsolicited MADs, otherwise it should be 0. + * @mgmt_class_version - Indicates which version of MADs for the given + * management class to receive. + * @oui: Indicates IEEE OUI when mgmt_class is a vendor class + * in the range from 0x30 to 0x4f. Otherwise not used. + * @rmpp_version: If set, indicates the RMPP version used. 
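 *
 * Illustrative sketch, not part of the patch itself: to receive
 * unsolicited Performance Management MADs (class 0x04) on QP1, a caller
 * could zero a struct ib_user_mad_reg_req and set
 *
 *	req.qpn = 1;
 *	req.mgmt_class = 0x04;
 *	req.mgmt_class_version = 1;
 *	req.method_mask[m / (8 * sizeof(long))] |= 1UL << (m % (8 * sizeof(long)));
 *
 * for each method m of interest, then register it with the
 * IB_USER_MAD_REGISTER_AGENT ioctl on the umad file handle.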
+ * + */ +struct ib_user_mad_reg_req { + __u32 id; + packed_ulong method_mask[IB_USER_MAD_LONGS_PER_METHOD_MASK]; + __u8 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u8 oui[3]; + __u8 rmpp_version; +}; + +/** + * ib_user_mad_reg_req2 - MAD registration request + * + * @id - Set by the _kernel_; used by userspace to identify the + * registered agent in future requests. + * @qpn - Queue pair number; must be 0 or 1. + * @mgmt_class - Indicates which management class of MADs should be + * receive by the caller. This field is only required if + * the user wishes to receive unsolicited MADs, otherwise + * it should be 0. + * @mgmt_class_version - Indicates which version of MADs for the given + * management class to receive. + * @res - Ignored. + * @flags - additional registration flags; Must be in the set of + * flags defined in IB_USER_MAD_REG_FLAGS_CAP + * @method_mask - The caller wishes to receive unsolicited MADs for the + * methods whose bit(s) is(are) set. + * @oui - Indicates IEEE OUI to use when mgmt_class is a vendor + * class in the range from 0x30 to 0x4f. Otherwise not + * used. + * @rmpp_version - If set, indicates the RMPP version to use. + */ +enum { + IB_USER_MAD_USER_RMPP = (1 << 0), +}; +#define IB_USER_MAD_REG_FLAGS_CAP (IB_USER_MAD_USER_RMPP) +struct ib_user_mad_reg_req2 { + __u32 id; + __u32 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u16 res; + __u32 flags; + __aligned_u64 method_mask[2]; + __u32 oui; + __u8 rmpp_version; + __u8 reserved[3]; +}; + +#endif /* IB_USER_MAD_H */ diff --git a/include/uapi/rdma/ib_user_sa.h b/include/uapi/rdma/ib_user_sa.h new file mode 100644 index 0000000..435155d --- /dev/null +++ b/include/uapi/rdma/ib_user_sa.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef IB_USER_SA_H +#define IB_USER_SA_H + +#include + +enum { + IB_PATH_GMP = 1, + IB_PATH_PRIMARY = (1<<1), + IB_PATH_ALTERNATE = (1<<2), + IB_PATH_OUTBOUND = (1<<3), + IB_PATH_INBOUND = (1<<4), + IB_PATH_INBOUND_REVERSE = (1<<5), + IB_PATH_BIDIRECTIONAL = IB_PATH_OUTBOUND | IB_PATH_INBOUND_REVERSE +}; + +struct ib_path_rec_data { + __u32 flags; + __u32 reserved; + __u32 path_rec[16]; +}; + +struct ib_user_path_rec { + __u8 dgid[16]; + __u8 sgid[16]; + __be16 dlid; + __be16 slid; + __u32 raw_traffic; + __be32 flow_label; + __u32 reversible; + __u32 mtu; + __be16 pkey; + __u8 hop_limit; + __u8 traffic_class; + __u8 numb_path; + __u8 sl; + __u8 mtu_selector; + __u8 rate_selector; + __u8 rate; + __u8 packet_life_time_selector; + __u8 packet_life_time; + __u8 preference; +}; + +#endif /* IB_USER_SA_H */ diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h new file mode 100644 index 0000000..6aeb033 --- /dev/null +++ b/include/uapi/rdma/ib_user_verbs.h @@ -0,0 +1,1210 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_VERBS_H +#define IB_USER_VERBS_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. 
+ */ +#define IB_USER_VERBS_ABI_VERSION 6 +#define IB_USER_VERBS_CMD_THRESHOLD 50 + +enum { + IB_USER_VERBS_CMD_GET_CONTEXT, + IB_USER_VERBS_CMD_QUERY_DEVICE, + IB_USER_VERBS_CMD_QUERY_PORT, + IB_USER_VERBS_CMD_ALLOC_PD, + IB_USER_VERBS_CMD_DEALLOC_PD, + IB_USER_VERBS_CMD_CREATE_AH, + IB_USER_VERBS_CMD_MODIFY_AH, + IB_USER_VERBS_CMD_QUERY_AH, + IB_USER_VERBS_CMD_DESTROY_AH, + IB_USER_VERBS_CMD_REG_MR, + IB_USER_VERBS_CMD_REG_SMR, + IB_USER_VERBS_CMD_REREG_MR, + IB_USER_VERBS_CMD_QUERY_MR, + IB_USER_VERBS_CMD_DEREG_MR, + IB_USER_VERBS_CMD_ALLOC_MW, + IB_USER_VERBS_CMD_BIND_MW, + IB_USER_VERBS_CMD_DEALLOC_MW, + IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL, + IB_USER_VERBS_CMD_CREATE_CQ, + IB_USER_VERBS_CMD_RESIZE_CQ, + IB_USER_VERBS_CMD_DESTROY_CQ, + IB_USER_VERBS_CMD_POLL_CQ, + IB_USER_VERBS_CMD_PEEK_CQ, + IB_USER_VERBS_CMD_REQ_NOTIFY_CQ, + IB_USER_VERBS_CMD_CREATE_QP, + IB_USER_VERBS_CMD_QUERY_QP, + IB_USER_VERBS_CMD_MODIFY_QP, + IB_USER_VERBS_CMD_DESTROY_QP, + IB_USER_VERBS_CMD_POST_SEND, + IB_USER_VERBS_CMD_POST_RECV, + IB_USER_VERBS_CMD_ATTACH_MCAST, + IB_USER_VERBS_CMD_DETACH_MCAST, + IB_USER_VERBS_CMD_CREATE_SRQ, + IB_USER_VERBS_CMD_MODIFY_SRQ, + IB_USER_VERBS_CMD_QUERY_SRQ, + IB_USER_VERBS_CMD_DESTROY_SRQ, + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_OPEN_XRCD, + IB_USER_VERBS_CMD_CLOSE_XRCD, + IB_USER_VERBS_CMD_CREATE_XSRQ, + IB_USER_VERBS_CMD_OPEN_QP, +}; + +enum { + IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, + IB_USER_VERBS_EX_CMD_CREATE_CQ = IB_USER_VERBS_CMD_CREATE_CQ, + IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP, + IB_USER_VERBS_EX_CMD_MODIFY_QP = IB_USER_VERBS_CMD_MODIFY_QP, + IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, + IB_USER_VERBS_EX_CMD_DESTROY_FLOW, + IB_USER_VERBS_EX_CMD_CREATE_WQ, + IB_USER_VERBS_EX_CMD_MODIFY_WQ, + IB_USER_VERBS_EX_CMD_DESTROY_WQ, + IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, + IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL, + IB_USER_VERBS_EX_CMD_MODIFY_CQ +}; + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * Specifically: + * - Do not use pointer types -- pass pointers in __u64 instead. + * - Make sure that any structure larger than 4 bytes is padded to a + * multiple of 8 bytes. Otherwise the structure size will be + * different between 32-bit and 64-bit architectures. + */ + +struct ib_uverbs_async_event_desc { + __aligned_u64 element; + __u32 event_type; /* enum ib_event_type */ + __u32 reserved; +}; + +struct ib_uverbs_comp_event_desc { + __aligned_u64 cq_handle; +}; + +struct ib_uverbs_cq_moderation_caps { + __u16 max_cq_moderation_count; + __u16 max_cq_moderation_period; + __u32 reserved; +}; + +/* + * All commands from userspace should start with a __u32 command field + * followed by __u16 in_words and out_words fields (which give the + * length of the command block and response buffer if any in 32-bit + * words). The kernel driver will read these fields first and read + * the rest of the command struct based on these value. 
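 *
 * Illustrative sketch, not part of the patch itself: a plain
 * (non-extended) GET_CONTEXT request written to an open uverbs device
 * handle would be laid out as
 *
 *	struct ib_uverbs_get_context_resp resp;
 *	struct {
 *		struct ib_uverbs_cmd_hdr	hdr;
 *		struct ib_uverbs_get_context	cmd;
 *	} req = {
 *		.hdr.command	= IB_USER_VERBS_CMD_GET_CONTEXT,
 *		.hdr.in_words	= sizeof(req) / 4,
 *		.hdr.out_words	= sizeof(resp) / 4,
 *		.cmd.response	= (__u64)(uintptr_t)&resp,
 *	};
 *
 *	write(uverbs_fd, &req, sizeof(req));
 *
 * where uverbs_fd is the descriptor obtained by opening the uverbs char
 * device; in_words covers the whole command block, header included.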
+ */ + +#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff +#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80000000u + +struct ib_uverbs_cmd_hdr { + __u32 command; + __u16 in_words; + __u16 out_words; +}; + +struct ib_uverbs_ex_cmd_hdr { + __aligned_u64 response; + __u16 provider_in_words; + __u16 provider_out_words; + __u32 cmd_hdr_reserved; +}; + +struct ib_uverbs_get_context { + __aligned_u64 response; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_get_context_resp { + __u32 async_fd; + __u32 num_comp_vectors; +}; + +struct ib_uverbs_query_device { + __aligned_u64 response; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_device_resp { + __aligned_u64 fw_ver; + __be64 node_guid; + __be64 sys_image_guid; + __aligned_u64 max_mr_size; + __aligned_u64 page_size_cap; + __u32 vendor_id; + __u32 vendor_part_id; + __u32 hw_ver; + __u32 max_qp; + __u32 max_qp_wr; + __u32 device_cap_flags; + __u32 max_sge; + __u32 max_sge_rd; + __u32 max_cq; + __u32 max_cqe; + __u32 max_mr; + __u32 max_pd; + __u32 max_qp_rd_atom; + __u32 max_ee_rd_atom; + __u32 max_res_rd_atom; + __u32 max_qp_init_rd_atom; + __u32 max_ee_init_rd_atom; + __u32 atomic_cap; + __u32 max_ee; + __u32 max_rdd; + __u32 max_mw; + __u32 max_raw_ipv6_qp; + __u32 max_raw_ethy_qp; + __u32 max_mcast_grp; + __u32 max_mcast_qp_attach; + __u32 max_total_mcast_qp_attach; + __u32 max_ah; + __u32 max_fmr; + __u32 max_map_per_fmr; + __u32 max_srq; + __u32 max_srq_wr; + __u32 max_srq_sge; + __u16 max_pkeys; + __u8 local_ca_ack_delay; + __u8 phys_port_cnt; + __u8 reserved[4]; +}; + +struct ib_uverbs_ex_query_device { + __u32 comp_mask; + __u32 reserved; +}; + +struct ib_uverbs_odp_caps { + __aligned_u64 general_caps; + struct { + __u32 rc_odp_caps; + __u32 uc_odp_caps; + __u32 ud_odp_caps; + } per_transport_caps; + __u32 reserved; +}; + +struct ib_uverbs_rss_caps { + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. 
+ * supported_qpts |= 1 << IB_QPT_UD + */ + __u32 supported_qpts; + __u32 max_rwq_indirection_tables; + __u32 max_rwq_indirection_table_size; + __u32 reserved; +}; + +struct ib_uverbs_tm_caps { + /* Max size of rendezvous request message */ + __u32 max_rndv_hdr_size; + /* Max number of entries in tag matching list */ + __u32 max_num_tags; + /* TM flags */ + __u32 flags; + /* Max number of outstanding list operations */ + __u32 max_ops; + /* Max number of SGE in tag matching entry */ + __u32 max_sge; + __u32 reserved; +}; + +struct ib_uverbs_ex_query_device_resp { + struct ib_uverbs_query_device_resp base; + __u32 comp_mask; + __u32 response_length; + struct ib_uverbs_odp_caps odp_caps; + __aligned_u64 timestamp_mask; + __aligned_u64 hca_core_clock; /* in KHZ */ + __aligned_u64 device_cap_flags_ex; + struct ib_uverbs_rss_caps rss_caps; + __u32 max_wq_type_rq; + __u32 raw_packet_caps; + struct ib_uverbs_tm_caps tm_caps; + struct ib_uverbs_cq_moderation_caps cq_moderation_caps; + __aligned_u64 max_dm_size; +}; + +struct ib_uverbs_query_port { + __aligned_u64 response; + __u8 port_num; + __u8 reserved[7]; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_port_resp { + __u32 port_cap_flags; + __u32 max_msg_sz; + __u32 bad_pkey_cntr; + __u32 qkey_viol_cntr; + __u32 gid_tbl_len; + __u16 pkey_tbl_len; + __u16 lid; + __u16 sm_lid; + __u8 state; + __u8 max_mtu; + __u8 active_mtu; + __u8 lmc; + __u8 max_vl_num; + __u8 sm_sl; + __u8 subnet_timeout; + __u8 init_type_reply; + __u8 active_width; + __u8 active_speed; + __u8 phys_state; + __u8 link_layer; + __u8 reserved[2]; +}; + +struct ib_uverbs_alloc_pd { + __aligned_u64 response; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_alloc_pd_resp { + __u32 pd_handle; +}; + +struct ib_uverbs_dealloc_pd { + __u32 pd_handle; +}; + +struct ib_uverbs_open_xrcd { + __aligned_u64 response; + __u32 fd; + __u32 oflags; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_open_xrcd_resp { + __u32 xrcd_handle; +}; + +struct ib_uverbs_close_xrcd { + __u32 xrcd_handle; +}; + +struct ib_uverbs_reg_mr { + __aligned_u64 response; + __aligned_u64 start; + __aligned_u64 length; + __aligned_u64 hca_va; + __u32 pd_handle; + __u32 access_flags; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_reg_mr_resp { + __u32 mr_handle; + __u32 lkey; + __u32 rkey; +}; + +struct ib_uverbs_rereg_mr { + __aligned_u64 response; + __u32 mr_handle; + __u32 flags; + __aligned_u64 start; + __aligned_u64 length; + __aligned_u64 hca_va; + __u32 pd_handle; + __u32 access_flags; +}; + +struct ib_uverbs_rereg_mr_resp { + __u32 lkey; + __u32 rkey; +}; + +struct ib_uverbs_dereg_mr { + __u32 mr_handle; +}; + +struct ib_uverbs_alloc_mw { + __aligned_u64 response; + __u32 pd_handle; + __u8 mw_type; + __u8 reserved[3]; +}; + +struct ib_uverbs_alloc_mw_resp { + __u32 mw_handle; + __u32 rkey; +}; + +struct ib_uverbs_dealloc_mw { + __u32 mw_handle; +}; + +struct ib_uverbs_create_comp_channel { + __aligned_u64 response; +}; + +struct ib_uverbs_create_comp_channel_resp { + __u32 fd; +}; + +struct ib_uverbs_create_cq { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 cqe; + __u32 comp_vector; + __s32 comp_channel; + __u32 reserved; + __aligned_u64 driver_data[0]; +}; + +enum ib_uverbs_ex_create_cq_flags { + IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, + IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, +}; + +struct ib_uverbs_ex_create_cq { + __aligned_u64 user_handle; + __u32 cqe; + __u32 comp_vector; + __s32 comp_channel; + __u32 comp_mask; + __u32 flags; /* bitmask of 
ib_uverbs_ex_create_cq_flags */ + __u32 reserved; +}; + +struct ib_uverbs_create_cq_resp { + __u32 cq_handle; + __u32 cqe; +}; + +struct ib_uverbs_ex_create_cq_resp { + struct ib_uverbs_create_cq_resp base; + __u32 comp_mask; + __u32 response_length; +}; + +struct ib_uverbs_resize_cq { + __aligned_u64 response; + __u32 cq_handle; + __u32 cqe; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_resize_cq_resp { + __u32 cqe; + __u32 reserved; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_poll_cq { + __aligned_u64 response; + __u32 cq_handle; + __u32 ne; +}; + +struct ib_uverbs_wc { + __aligned_u64 wr_id; + __u32 status; + __u32 opcode; + __u32 vendor_err; + __u32 byte_len; + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + __u32 qp_num; + __u32 src_qp; + __u32 wc_flags; + __u16 pkey_index; + __u16 slid; + __u8 sl; + __u8 dlid_path_bits; + __u8 port_num; + __u8 reserved; +}; + +struct ib_uverbs_poll_cq_resp { + __u32 count; + __u32 reserved; + struct ib_uverbs_wc wc[0]; +}; + +struct ib_uverbs_req_notify_cq { + __u32 cq_handle; + __u32 solicited_only; +}; + +struct ib_uverbs_destroy_cq { + __aligned_u64 response; + __u32 cq_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_cq_resp { + __u32 comp_events_reported; + __u32 async_events_reported; +}; + +struct ib_uverbs_global_route { + __u8 dgid[16]; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 reserved; +}; + +struct ib_uverbs_ah_attr { + struct ib_uverbs_global_route grh; + __u16 dlid; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; + __u8 reserved; +}; + +struct ib_uverbs_qp_attr { + __u32 qp_attr_mask; + __u32 qp_state; + __u32 cur_qp_state; + __u32 path_mtu; + __u32 path_mig_state; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + + struct ib_uverbs_ah_attr ah_attr; + struct ib_uverbs_ah_attr alt_ah_attr; + + /* ib_qp_cap */ + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 en_sqd_async_notify; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[5]; +}; + +struct ib_uverbs_create_qp { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __aligned_u64 driver_data[0]; +}; + +enum ib_uverbs_create_qp_mask { + IB_UVERBS_CREATE_QP_MASK_IND_TABLE = 1UL << 0, +}; + +enum { + IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE, +}; + +enum { + /* + * This value is equal to IB_QP_DEST_QPN. + */ + IB_USER_LEGACY_LAST_QP_ATTR_MASK = 1ULL << 20, +}; + +enum { + /* + * This value is equal to IB_QP_RATE_LIMIT. 
+ */ + IB_USER_LAST_QP_ATTR_MASK = 1ULL << 25, +}; + +struct ib_uverbs_ex_create_qp { + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __u32 comp_mask; + __u32 create_flags; + __u32 rwq_ind_tbl_handle; + __u32 source_qpn; +}; + +struct ib_uverbs_open_qp { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 qpn; + __u8 qp_type; + __u8 reserved[7]; + __aligned_u64 driver_data[0]; +}; + +/* also used for open response */ +struct ib_uverbs_create_qp_resp { + __u32 qp_handle; + __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 reserved; +}; + +struct ib_uverbs_ex_create_qp_resp { + struct ib_uverbs_create_qp_resp base; + __u32 comp_mask; + __u32 response_length; +}; + +/* + * This struct needs to remain a multiple of 8 bytes to keep the + * alignment of the modify QP parameters. + */ +struct ib_uverbs_qp_dest { + __u8 dgid[16]; + __u32 flow_label; + __u16 dlid; + __u16 reserved; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; +}; + +struct ib_uverbs_query_qp { + __aligned_u64 response; + __u32 qp_handle; + __u32 attr_mask; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_qp_resp { + struct ib_uverbs_qp_dest dest; + struct ib_uverbs_qp_dest alt_dest; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 sq_sig_all; + __u8 reserved[5]; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_modify_qp { + struct ib_uverbs_qp_dest dest; + struct ib_uverbs_qp_dest alt_dest; + __u32 qp_handle; + __u32 attr_mask; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 en_sqd_async_notify; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[2]; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_ex_modify_qp { + struct ib_uverbs_modify_qp base; + __u32 rate_limit; + __u32 reserved; +}; + +struct ib_uverbs_modify_qp_resp { +}; + +struct ib_uverbs_ex_modify_qp_resp { + __u32 comp_mask; + __u32 response_length; +}; + +struct ib_uverbs_destroy_qp { + __aligned_u64 response; + __u32 qp_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_qp_resp { + __u32 events_reported; +}; + +/* + * The ib_uverbs_sge structure isn't used anywhere, since we assume + * the ib_sge structure is packed the same way on 32-bit and 64-bit + * architectures in both kernel and user space. It's just here to + * document the ABI. 
+ */ +struct ib_uverbs_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + +struct ib_uverbs_send_wr { + __aligned_u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + union { + struct { + __aligned_u64 remote_addr; + __u32 rkey; + __u32 reserved; + } rdma; + struct { + __aligned_u64 remote_addr; + __aligned_u64 compare_add; + __aligned_u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u32 ah; + __u32 remote_qpn; + __u32 remote_qkey; + __u32 reserved; + } ud; + } wr; +}; + +struct ib_uverbs_post_send { + __aligned_u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_send_wr send_wr[0]; +}; + +struct ib_uverbs_post_send_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_recv_wr { + __aligned_u64 wr_id; + __u32 num_sge; + __u32 reserved; +}; + +struct ib_uverbs_post_recv { + __aligned_u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_recv_wr recv_wr[0]; +}; + +struct ib_uverbs_post_recv_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_post_srq_recv { + __aligned_u64 response; + __u32 srq_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_recv_wr recv[0]; +}; + +struct ib_uverbs_post_srq_recv_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_create_ah { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 reserved; + struct ib_uverbs_ah_attr attr; +}; + +struct ib_uverbs_create_ah_resp { + __u32 ah_handle; +}; + +struct ib_uverbs_destroy_ah { + __u32 ah_handle; +}; + +struct ib_uverbs_attach_mcast { + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_detach_mcast { + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_flow_spec_hdr { + __u32 type; + __u16 size; + __u16 reserved; + /* followed by flow_spec */ + __aligned_u64 flow_spec_data[0]; +}; + +struct ib_uverbs_flow_eth_filter { + __u8 dst_mac[6]; + __u8 src_mac[6]; + __be16 ether_type; + __be16 vlan_tag; +}; + +struct ib_uverbs_flow_spec_eth { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_eth_filter val; + struct ib_uverbs_flow_eth_filter mask; +}; + +struct ib_uverbs_flow_ipv4_filter { + __be32 src_ip; + __be32 dst_ip; + __u8 proto; + __u8 tos; + __u8 ttl; + __u8 flags; +}; + +struct ib_uverbs_flow_spec_ipv4 { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_ipv4_filter val; + struct ib_uverbs_flow_ipv4_filter mask; +}; + +struct ib_uverbs_flow_tcp_udp_filter { + __be16 dst_port; + __be16 src_port; +}; + +struct ib_uverbs_flow_spec_tcp_udp { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_tcp_udp_filter val; + struct ib_uverbs_flow_tcp_udp_filter mask; +}; + +struct ib_uverbs_flow_ipv6_filter { + __u8 src_ip[16]; + __u8 dst_ip[16]; + __be32 flow_label; + __u8 next_hdr; + __u8 traffic_class; + __u8 hop_limit; + __u8 reserved; +}; + +struct ib_uverbs_flow_spec_ipv6 { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_ipv6_filter val; + struct ib_uverbs_flow_ipv6_filter mask; +}; + 
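/*
 * Illustrative sketch, not part of the patch itself: one way a userspace
 * consumer could fill a single ETH flow spec before appending it to the
 * flow_specs[] area of an ib_uverbs_flow_attr (defined further below).
 * Each spec carries the common type/size header, a value to match and a
 * mask selecting which bits of that value are significant.  The numeric
 * type 0x20 mirrors the ETH spec id conventionally used by the verbs
 * stack and, like the helper name, is an assumption here; memset() and
 * memcpy() come from <string.h>.
 */
static inline void example_fill_eth_spec(struct ib_uverbs_flow_spec_eth *spec,
					 const __u8 dst_mac[6])
{
	memset(spec, 0, sizeof(*spec));
	spec->type = 0x20;		/* assumed ETH spec type id */
	spec->size = sizeof(*spec);	/* consumed by the command parser */

	/* Match on the destination MAC only: value plus an all-ones mask. */
	memcpy(spec->val.dst_mac, dst_mac, 6);
	memset(spec->mask.dst_mac, 0xff, 6);
}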
+struct ib_uverbs_flow_spec_action_tag { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + __u32 tag_id; + __u32 reserved1; +}; + +struct ib_uverbs_flow_spec_action_drop { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; +}; + +struct ib_uverbs_flow_spec_action_handle { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + __u32 handle; + __u32 reserved1; +}; + +struct ib_uverbs_flow_tunnel_filter { + __be32 tunnel_id; +}; + +struct ib_uverbs_flow_spec_tunnel { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_tunnel_filter val; + struct ib_uverbs_flow_tunnel_filter mask; +}; + +struct ib_uverbs_flow_spec_esp_filter { + __u32 spi; + __u32 seq; +}; + +struct ib_uverbs_flow_spec_esp { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_esp_filter val; + struct ib_uverbs_flow_spec_esp_filter mask; +}; + +struct ib_uverbs_flow_attr { + __u32 type; + __u16 size; + __u16 priority; + __u8 num_of_specs; + __u8 reserved[2]; + __u8 port; + __u32 flags; + /* Following are the optional layers according to user request + * struct ib_flow_spec_xxx + * struct ib_flow_spec_yyy + */ + struct ib_uverbs_flow_spec_hdr flow_specs[0]; +}; + +struct ib_uverbs_create_flow { + __u32 comp_mask; + __u32 qp_handle; + struct ib_uverbs_flow_attr flow_attr; +}; + +struct ib_uverbs_create_flow_resp { + __u32 comp_mask; + __u32 flow_handle; +}; + +struct ib_uverbs_destroy_flow { + __u32 comp_mask; + __u32 flow_handle; +}; + +struct ib_uverbs_create_srq { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_create_xsrq { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 srq_type; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 max_num_tags; + __u32 xrcd_handle; + __u32 cq_handle; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_create_srq_resp { + __u32 srq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srqn; +}; + +struct ib_uverbs_modify_srq { + __u32 srq_handle; + __u32 attr_mask; + __u32 max_wr; + __u32 srq_limit; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_srq { + __aligned_u64 response; + __u32 srq_handle; + __u32 reserved; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_srq_resp { + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; +}; + +struct ib_uverbs_destroy_srq { + __aligned_u64 response; + __u32 srq_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_srq_resp { + __u32 events_reported; +}; + +struct ib_uverbs_ex_create_wq { + __u32 comp_mask; + __u32 wq_type; + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 cq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 create_flags; /* Use enum ib_wq_flags */ + __u32 reserved; +}; + +struct ib_uverbs_ex_create_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 wq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 wqn; +}; + +struct ib_uverbs_ex_destroy_wq { + __u32 comp_mask; + __u32 wq_handle; +}; + +struct ib_uverbs_ex_destroy_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 events_reported; + __u32 reserved; +}; + +struct 
ib_uverbs_ex_modify_wq { + __u32 attr_mask; + __u32 wq_handle; + __u32 wq_state; + __u32 curr_wq_state; + __u32 flags; /* Use enum ib_wq_flags */ + __u32 flags_mask; /* Use enum ib_wq_flags */ +}; + +/* Prevent memory allocation rather than max expected size */ +#define IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE 0x0d +struct ib_uverbs_ex_create_rwq_ind_table { + __u32 comp_mask; + __u32 log_ind_tbl_size; + /* Following are the wq handles according to log_ind_tbl_size + * wq_handle1 + * wq_handle2 + */ + __u32 wq_handles[0]; +}; + +struct ib_uverbs_ex_create_rwq_ind_table_resp { + __u32 comp_mask; + __u32 response_length; + __u32 ind_tbl_handle; + __u32 ind_tbl_num; +}; + +struct ib_uverbs_ex_destroy_rwq_ind_table { + __u32 comp_mask; + __u32 ind_tbl_handle; +}; + +struct ib_uverbs_cq_moderation { + __u16 cq_count; + __u16 cq_period; +}; + +struct ib_uverbs_ex_modify_cq { + __u32 cq_handle; + __u32 attr_mask; + struct ib_uverbs_cq_moderation attr; + __u32 reserved; +}; + +#define IB_DEVICE_NAME_MAX 64 + +#endif /* IB_USER_VERBS_H */ diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h new file mode 100644 index 0000000..f745575 --- /dev/null +++ b/include/uapi/rdma/mlx4-abi.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_ABI_USER_H +#define MLX4_ABI_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ + +#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 +#define MLX4_IB_UVERBS_ABI_VERSION 4 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. 
+ */ + +struct mlx4_ib_alloc_ucontext_resp_v3 { + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + +enum { + MLX4_USER_DEV_CAP_LARGE_CQE = 1L << 0, +}; + +struct mlx4_ib_alloc_ucontext_resp { + __u32 dev_caps; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; + __u32 cqe_size; +}; + +struct mlx4_ib_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct mlx4_ib_create_cq { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; +}; + +struct mlx4_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx4_ib_resize_cq { + __aligned_u64 buf_addr; +}; + +struct mlx4_ib_create_srq { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; +}; + +struct mlx4_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx4_ib_create_qp_rss { + __aligned_u64 rx_hash_fields_mask; /* Use enum mlx4_ib_rx_hash_fields */ + __u8 rx_hash_function; /* Use enum mlx4_ib_rx_hash_function_flags */ + __u8 reserved[7]; + __u8 rx_hash_key[40]; + __u32 comp_mask; + __u32 reserved1; +}; + +struct mlx4_ib_create_qp { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; + __u8 reserved; + __u32 inl_recv_sz; +}; + +struct mlx4_ib_create_wq { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; + __u8 log_range_size; + __u8 reserved[3]; + __u32 comp_mask; +}; + +struct mlx4_ib_modify_wq { + __u32 comp_mask; + __u32 reserved; +}; + +struct mlx4_ib_create_rwq_ind_tbl_resp { + __u32 response_length; + __u32 reserved; +}; + +/* RX Hash function flags */ +enum mlx4_ib_rx_hash_function_flags { + MLX4_IB_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +/* + * RX Hash flags, these flags allows to set which incoming packet's field should + * participates in RX Hash. Each flag represent certain packet's field, + * when the flag is set the field that is represented by the flag will + * participate in RX Hash calculation. + */ +enum mlx4_ib_rx_hash_fields { + MLX4_IB_RX_HASH_SRC_IPV4 = 1 << 0, + MLX4_IB_RX_HASH_DST_IPV4 = 1 << 1, + MLX4_IB_RX_HASH_SRC_IPV6 = 1 << 2, + MLX4_IB_RX_HASH_DST_IPV6 = 1 << 3, + MLX4_IB_RX_HASH_SRC_PORT_TCP = 1 << 4, + MLX4_IB_RX_HASH_DST_PORT_TCP = 1 << 5, + MLX4_IB_RX_HASH_SRC_PORT_UDP = 1 << 6, + MLX4_IB_RX_HASH_DST_PORT_UDP = 1 << 7, + MLX4_IB_RX_HASH_INNER = 1ULL << 31, +}; + +struct mlx4_ib_rss_caps { + __aligned_u64 rx_hash_fields_mask; /* enum mlx4_ib_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx4_ib_rx_hash_function_flags */ + __u8 reserved[7]; +}; + +enum query_device_resp_mask { + MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, +}; + +struct mlx4_ib_tso_caps { + __u32 max_tso; /* Maximum tso payload size in bytes */ + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported. + */ + __u32 supported_qpts; +}; + +struct mlx4_uverbs_ex_query_device_resp { + __u32 comp_mask; + __u32 response_length; + __aligned_u64 hca_core_clock_offset; + __u32 max_inl_recv_sz; + __u32 reserved; + struct mlx4_ib_rss_caps rss_caps; + struct mlx4_ib_tso_caps tso_caps; +}; + +#endif /* MLX4_ABI_USER_H */ diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h new file mode 100644 index 0000000..fdaf00e --- /dev/null +++ b/include/uapi/rdma/mlx5-abi.h @@ -0,0 +1,444 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_ABI_USER_H +#define MLX5_ABI_USER_H + +#include +#include /* For ETH_ALEN. */ + +enum { + MLX5_QP_FLAG_SIGNATURE = 1 << 0, + MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, + MLX5_QP_FLAG_TUNNEL_OFFLOADS = 1 << 2, + MLX5_QP_FLAG_BFREG_INDEX = 1 << 3, + MLX5_QP_FLAG_TYPE_DCT = 1 << 4, + MLX5_QP_FLAG_TYPE_DCI = 1 << 5, +}; + +enum { + MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, +}; + +enum { + MLX5_WQ_FLAG_SIGNATURE = 1 << 0, +}; + +/* Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MLX5_IB_UVERBS_ABI_VERSION 1 + +/* Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx5_ib_alloc_ucontext_req { + __u32 total_num_bfregs; + __u32 num_low_latency_bfregs; +}; + +enum mlx5_lib_caps { + MLX5_LIB_CAP_4K_UAR = (__u64)1 << 0, +}; + +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_bfregs; + __u32 num_low_latency_bfregs; + __u32 flags; + __u32 comp_mask; + __u8 max_cqe_version; + __u8 reserved0; + __u16 reserved1; + __u32 reserved2; + __aligned_u64 lib_caps; +}; + +enum mlx5_ib_alloc_ucontext_resp_mask { + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, +}; + +enum mlx5_user_cmds_supp_uhw { + MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE = 1 << 0, + MLX5_USER_CMDS_SUPP_UHW_CREATE_AH = 1 << 1, +}; + +/* The eth_min_inline response value is set to off-by-one vs the FW + * returned value to allow user-space to deal with older kernels. 
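 *
 * Illustrative sketch, not part of the patch itself: because of that
 * offset, a zero value (MLX5_USER_INLINE_MODE_NA below) means the kernel
 * did not report anything, and a consumer can recover the reported mode
 * with
 *
 *	if (resp.eth_min_inline != MLX5_USER_INLINE_MODE_NA)
 *		min_inline = resp.eth_min_inline - 1;
 *
 * where min_inline is the consumer's own variable for the FW value.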
+ */ +enum mlx5_user_inline_mode { + MLX5_USER_INLINE_MODE_NA, + MLX5_USER_INLINE_MODE_NONE, + MLX5_USER_INLINE_MODE_L2, + MLX5_USER_INLINE_MODE_IP, + MLX5_USER_INLINE_MODE_TCP_UDP, +}; + +enum { + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM = 1 << 0, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA = 1 << 1, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING = 1 << 2, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD = 1 << 3, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN = 1 << 4, +}; + +struct mlx5_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_bfregs; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 flow_action_flags; + __u32 comp_mask; + __u32 response_length; + __u8 cqe_version; + __u8 cmds_supp_uhw; + __u8 eth_min_inline; + __u8 clock_info_versions; + __aligned_u64 hca_core_clock_offset; + __u32 log_uar_size; + __u32 num_uars_per_page; + __u32 num_dyn_bfregs; + __u32 reserved3; +}; + +struct mlx5_ib_alloc_pd_resp { + __u32 pdn; +}; + +struct mlx5_ib_tso_caps { + __u32 max_tso; /* Maximum tso payload size in bytes */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_UD + */ + __u32 supported_qpts; +}; + +struct mlx5_ib_rss_caps { + __aligned_u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ + __u8 reserved[7]; +}; + +enum mlx5_ib_cqe_comp_res_format { + MLX5_IB_CQE_RES_FORMAT_HASH = 1 << 0, + MLX5_IB_CQE_RES_FORMAT_CSUM = 1 << 1, + MLX5_IB_CQE_RES_RESERVED = 1 << 2, +}; + +struct mlx5_ib_cqe_comp_caps { + __u32 max_num; + __u32 supported_format; /* enum mlx5_ib_cqe_comp_res_format */ +}; + +enum mlx5_ib_packet_pacing_cap_flags { + MLX5_IB_PP_SUPPORT_BURST = 1 << 0, +}; + +struct mlx5_packet_pacing_caps { + __u32 qp_rate_limit_min; + __u32 qp_rate_limit_max; /* In kpbs */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_RAW_PACKET + */ + __u32 supported_qpts; + __u8 cap_flags; /* enum mlx5_ib_packet_pacing_cap_flags */ + __u8 reserved[3]; +}; + +enum mlx5_ib_mpw_caps { + MPW_RESERVED = 1 << 0, + MLX5_IB_ALLOW_MPW = 1 << 1, + MLX5_IB_SUPPORT_EMPW = 1 << 2, +}; + +enum mlx5_ib_sw_parsing_offloads { + MLX5_IB_SW_PARSING = 1 << 0, + MLX5_IB_SW_PARSING_CSUM = 1 << 1, + MLX5_IB_SW_PARSING_LSO = 1 << 2, +}; + +struct mlx5_ib_sw_parsing_caps { + __u32 sw_parsing_offloads; /* enum mlx5_ib_sw_parsing_offloads */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_RAW_PACKET + */ + __u32 supported_qpts; +}; + +struct mlx5_ib_striding_rq_caps { + __u32 min_single_stride_log_num_of_bytes; + __u32 max_single_stride_log_num_of_bytes; + __u32 min_single_wqe_log_num_of_strides; + __u32 max_single_wqe_log_num_of_strides; + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. 
+ * supported_qpts |= 1 << IB_QPT_RAW_PACKET + */ + __u32 supported_qpts; + __u32 reserved; +}; + +enum mlx5_ib_query_dev_resp_flags { + /* Support 128B CQE compression */ + MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP = 1 << 0, + MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD = 1 << 1, +}; + +enum mlx5_ib_tunnel_offloads { + MLX5_IB_TUNNELED_OFFLOADS_VXLAN = 1 << 0, + MLX5_IB_TUNNELED_OFFLOADS_GRE = 1 << 1, + MLX5_IB_TUNNELED_OFFLOADS_GENEVE = 1 << 2 +}; + +struct mlx5_ib_query_device_resp { + __u32 comp_mask; + __u32 response_length; + struct mlx5_ib_tso_caps tso_caps; + struct mlx5_ib_rss_caps rss_caps; + struct mlx5_ib_cqe_comp_caps cqe_comp_caps; + struct mlx5_packet_pacing_caps packet_pacing_caps; + __u32 mlx5_ib_support_multi_pkt_send_wqes; + __u32 flags; /* Use enum mlx5_ib_query_dev_resp_flags */ + struct mlx5_ib_sw_parsing_caps sw_parsing_caps; + struct mlx5_ib_striding_rq_caps striding_rq_caps; + __u32 tunnel_offloads_caps; /* enum mlx5_ib_tunnel_offloads */ + __u32 reserved; +}; + +enum mlx5_ib_create_cq_flags { + MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD = 1 << 0, +}; + +struct mlx5_ib_create_cq { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; + __u32 cqe_size; + __u8 cqe_comp_en; + __u8 cqe_comp_res_format; + __u16 flags; +}; + +struct mlx5_ib_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mlx5_ib_resize_cq { + __aligned_u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; + __u32 flags; + __u32 reserved0; /* explicit padding (optional on i386) */ + __u32 uidx; + __u32 reserved1; +}; + +struct mlx5_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mlx5_ib_create_qp { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; + __u32 sq_wqe_count; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 flags; + __u32 uidx; + __u32 bfreg_index; + union { + __aligned_u64 sq_buf_addr; + __aligned_u64 access_key; + }; +}; + +/* RX Hash function flags */ +enum mlx5_rx_hash_function_flags { + MLX5_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +/* + * RX Hash flags, these flags allows to set which incoming packet's field should + * participates in RX Hash. Each flag represent certain packet's field, + * when the flag is set the field that is represented by the flag will + * participate in RX Hash calculation. + * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP + * and *TCP and *UDP flags can't be enabled together on the same QP. 
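+ *
+ * Illustrative example (not part of the ABI itself): a QP hashing TCP/IPv4
+ * flows would set
+ *
+ *   rx_hash_fields_mask = MLX5_RX_HASH_SRC_IPV4 | MLX5_RX_HASH_DST_IPV4 |
+ *                         MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_DST_PORT_TCP;
+ *
+ * while mixing *IPV4 with *IPV6, or *TCP with *UDP, is not allowed.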
+*/ +enum mlx5_rx_hash_fields { + MLX5_RX_HASH_SRC_IPV4 = 1 << 0, + MLX5_RX_HASH_DST_IPV4 = 1 << 1, + MLX5_RX_HASH_SRC_IPV6 = 1 << 2, + MLX5_RX_HASH_DST_IPV6 = 1 << 3, + MLX5_RX_HASH_SRC_PORT_TCP = 1 << 4, + MLX5_RX_HASH_DST_PORT_TCP = 1 << 5, + MLX5_RX_HASH_SRC_PORT_UDP = 1 << 6, + MLX5_RX_HASH_DST_PORT_UDP = 1 << 7, + MLX5_RX_HASH_IPSEC_SPI = 1 << 8, + /* Save bits for future fields */ + MLX5_RX_HASH_INNER = (1UL << 31), +}; + +struct mlx5_ib_create_qp_rss { + __aligned_u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ + __u8 rx_key_len; /* valid only for Toeplitz */ + __u8 reserved[6]; + __u8 rx_hash_key[128]; /* valid only for Toeplitz */ + __u32 comp_mask; + __u32 flags; +}; + +struct mlx5_ib_create_qp_resp { + __u32 bfreg_index; + __u32 reserved; +}; + +struct mlx5_ib_alloc_mw { + __u32 comp_mask; + __u8 num_klms; + __u8 reserved1; + __u16 reserved2; +}; + +enum mlx5_ib_create_wq_mask { + MLX5_IB_CREATE_WQ_STRIDING_RQ = (1 << 0), +}; + +struct mlx5_ib_create_wq { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 user_index; + __u32 flags; + __u32 comp_mask; + __u32 single_stride_log_num_of_bytes; + __u32 single_wqe_log_num_of_strides; + __u32 two_byte_shift_en; +}; + +struct mlx5_ib_create_ah_resp { + __u32 response_length; + __u8 dmac[ETH_ALEN]; + __u8 reserved[6]; +}; + +struct mlx5_ib_burst_info { + __u32 max_burst_sz; + __u16 typical_pkt_sz; + __u16 reserved; +}; + +struct mlx5_ib_modify_qp { + __u32 comp_mask; + struct mlx5_ib_burst_info burst_info; + __u32 reserved; +}; + +struct mlx5_ib_modify_qp_resp { + __u32 response_length; + __u32 dctn; +}; + +struct mlx5_ib_create_wq_resp { + __u32 response_length; + __u32 reserved; +}; + +struct mlx5_ib_create_rwq_ind_tbl_resp { + __u32 response_length; + __u32 reserved; +}; + +struct mlx5_ib_modify_wq { + __u32 comp_mask; + __u32 reserved; +}; + +struct mlx5_ib_clock_info { + __u32 sign; + __u32 resv; + __aligned_u64 nsec; + __aligned_u64 cycles; + __aligned_u64 frac; + __u32 mult; + __u32 shift; + __aligned_u64 mask; + __aligned_u64 overflow_period; +}; + +enum mlx5_ib_mmap_cmd { + MLX5_IB_MMAP_REGULAR_PAGE = 0, + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + MLX5_IB_MMAP_WC_PAGE = 2, + MLX5_IB_MMAP_NC_PAGE = 3, + /* 5 is chosen in order to be compatible with old versions of libmlx5 */ + MLX5_IB_MMAP_CORE_CLOCK = 5, + MLX5_IB_MMAP_ALLOC_WC = 6, + MLX5_IB_MMAP_CLOCK_INFO = 7, + MLX5_IB_MMAP_DEVICE_MEM = 8, +}; + +enum { + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING = 1, +}; + +/* Bit indexes for the mlx5_alloc_ucontext_resp.clock_info_versions bitmap */ +enum { + MLX5_IB_CLOCK_INFO_V1 = 0, +}; +#endif /* MLX5_ABI_USER_H */ diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h new file mode 100644 index 0000000..f7d685e --- /dev/null +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_USER_IOCTL_CMDS_H +#define MLX5_USER_IOCTL_CMDS_H + +#include + +enum mlx5_ib_create_flow_action_attrs { + /* This attribute belong to the driver namespace */ + MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_alloc_dm_attrs { + MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, +}; + +#endif diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h new file mode 100644 index 0000000..8a2fb33 --- /dev/null +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX5_USER_IOCTL_VERBS_H +#define MLX5_USER_IOCTL_VERBS_H + +#include + +enum mlx5_ib_uapi_flow_action_flags { + MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA = 1 << 0, +}; + +#endif + diff --git a/include/uapi/rdma/mthca-abi.h b/include/uapi/rdma/mthca-abi.h new file mode 100644 index 0000000..91b12e1 --- /dev/null +++ b/include/uapi/rdma/mthca-abi.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_ABI_USER_H +#define MTHCA_ABI_USER_H + +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MTHCA_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. 
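+ *
+ * Illustrative example of the convention (not part of this header): a
+ * user buffer address is passed through a __u64 field, e.g.
+ *
+ *   cmd.db_page = (__u64) (uintptr_t) db_page;
+ *
+ * where db_page is a hypothetical user-space pointer.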
+ */ +struct mthca_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 uarc_size; +}; + +struct mthca_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +/* + * Mark the memory region with a DMA attribute that causes + * in-flight DMA to be flushed when the region is written to: + */ +#define MTHCA_MR_DMASYNC 0x1 + +struct mthca_reg_mr { + __u32 mr_attrs; + __u32 reserved; +}; + +struct mthca_create_cq { + __u32 lkey; + __u32 pdn; + __aligned_u64 arm_db_page; + __aligned_u64 set_db_page; + __u32 arm_db_index; + __u32 set_db_index; +}; + +struct mthca_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mthca_resize_cq { + __u32 lkey; + __u32 reserved; +}; + +struct mthca_create_srq { + __u32 lkey; + __u32 db_index; + __aligned_u64 db_page; +}; + +struct mthca_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mthca_create_qp { + __u32 lkey; + __u32 reserved; + __aligned_u64 sq_db_page; + __aligned_u64 rq_db_page; + __u32 sq_db_index; + __u32 rq_db_index; +}; +#endif /* MTHCA_ABI_USER_H */ diff --git a/include/uapi/rdma/nes-abi.h b/include/uapi/rdma/nes-abi.h new file mode 100644 index 0000000..f80495b --- /dev/null +++ b/include/uapi/rdma/nes-abi.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef NES_ABI_USER_H +#define NES_ABI_USER_H + +#include + +#define NES_ABI_USERSPACE_VER 2 +#define NES_ABI_KERNEL_VER 2 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. 
+ */ + +struct nes_alloc_ucontext_req { + __u32 reserved32; + __u8 userspace_ver; + __u8 reserved8[3]; +}; + +struct nes_alloc_ucontext_resp { + __u32 max_pds; /* maximum pds allowed for this user process */ + __u32 max_qps; /* maximum qps allowed for this user process */ + __u32 wq_size; /* size of the WQs (sq+rq) allocated to the mmaped area */ + __u8 virtwq; /* flag to indicate if virtual WQ are to be used or not */ + __u8 kernel_ver; + __u8 reserved[2]; +}; + +struct nes_alloc_pd_resp { + __u32 pd_id; + __u32 mmap_db_index; +}; + +struct nes_create_cq_req { + __aligned_u64 user_cq_buffer; + __u32 mcrqf; + __u8 reserved[4]; +}; + +struct nes_create_qp_req { + __aligned_u64 user_wqe_buffers; + __aligned_u64 user_qp_buffer; +}; + +enum iwnes_memreg_type { + IWNES_MEMREG_TYPE_MEM = 0x0000, + IWNES_MEMREG_TYPE_QP = 0x0001, + IWNES_MEMREG_TYPE_CQ = 0x0002, + IWNES_MEMREG_TYPE_MW = 0x0003, + IWNES_MEMREG_TYPE_FMR = 0x0004, + IWNES_MEMREG_TYPE_FMEM = 0x0005, +}; + +struct nes_mem_reg_req { + __u32 reg_type; /* indicates if id is memory, QP or CQ */ + __u32 reserved; +}; + +struct nes_create_cq_resp { + __u32 cq_id; + __u32 cq_size; + __u32 mmap_db_index; + __u32 reserved; +}; + +struct nes_create_qp_resp { + __u32 qp_id; + __u32 actual_sq_size; + __u32 actual_rq_size; + __u32 mmap_sq_db_index; + __u32 mmap_rq_db_index; + __u32 nes_drv_opt; +}; + +#endif /* NES_ABI_USER_H */ diff --git a/include/uapi/rdma/ocrdma-abi.h b/include/uapi/rdma/ocrdma-abi.h new file mode 100644 index 0000000..284d47b --- /dev/null +++ b/include/uapi/rdma/ocrdma-abi.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef OCRDMA_ABI_USER_H +#define OCRDMA_ABI_USER_H + +#include + +#define OCRDMA_ABI_VERSION 2 +#define OCRDMA_BE_ROCE_ABI_VERSION 1 +/* user kernel communication data structures. */ + +struct ocrdma_alloc_ucontext_resp { + __u32 dev_id; + __u32 wqe_size; + __u32 max_inline_data; + __u32 dpp_wqe_size; + __aligned_u64 ah_tbl_page; + __u32 ah_tbl_len; + __u32 rqe_size; + __u8 fw_ver[32]; + /* for future use/new features in progress */ + __aligned_u64 rsvd1; + __aligned_u64 rsvd2; +}; + +struct ocrdma_alloc_pd_ureq { + __u32 rsvd[2]; +}; + +struct ocrdma_alloc_pd_uresp { + __u32 id; + __u32 dpp_enabled; + __u32 dpp_page_addr_hi; + __u32 dpp_page_addr_lo; + __u32 rsvd[2]; +}; + +struct ocrdma_create_cq_ureq { + __u32 dpp_cq; + __u32 rsvd; /* pad */ +}; + +#define MAX_CQ_PAGES 8 +struct ocrdma_create_cq_uresp { + __u32 cq_id; + __u32 page_size; + __u32 num_pages; + __u32 max_hw_cqe; + __aligned_u64 page_addr[MAX_CQ_PAGES]; + __aligned_u64 db_page_addr; + __u32 db_page_size; + __u32 phase_change; + /* for future use/new features in progress */ + __aligned_u64 rsvd1; + __aligned_u64 rsvd2; +}; + +#define MAX_QP_PAGES 8 +#define MAX_UD_AV_PAGES 8 + +struct ocrdma_create_qp_ureq { + __u8 enable_dpp_cq; + __u8 rsvd; + __u16 dpp_cq_id; + __u32 rsvd1; /* pad */ +}; + +struct ocrdma_create_qp_uresp { + __u16 qp_id; + __u16 sq_dbid; + __u16 rq_dbid; + __u16 resv0; /* pad */ + __u32 sq_page_size; + __u32 rq_page_size; + __u32 num_sq_pages; + __u32 num_rq_pages; + __aligned_u64 sq_page_addr[MAX_QP_PAGES]; + __aligned_u64 rq_page_addr[MAX_QP_PAGES]; + __aligned_u64 db_page_addr; + __u32 db_page_size; + __u32 dpp_credit; + __u32 dpp_offset; + __u32 num_wqe_allocated; + __u32 num_rqe_allocated; + __u32 db_sq_offset; + __u32 db_rq_offset; + __u32 db_shift; + __aligned_u64 rsvd[11]; +}; + +struct ocrdma_create_srq_uresp { + __u16 rq_dbid; + __u16 resv0; /* pad */ + __u32 resv1; + + __u32 rq_page_size; + __u32 num_rq_pages; + + __aligned_u64 rq_page_addr[MAX_QP_PAGES]; + __aligned_u64 db_page_addr; + + __u32 db_page_size; + __u32 num_rqe_allocated; + __u32 db_rq_offset; + __u32 db_shift; + + __aligned_u64 rsvd2; + __aligned_u64 rsvd3; +}; + +#endif /* OCRDMA_ABI_USER_H */ diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h new file mode 100644 index 0000000..24c658b --- /dev/null +++ b/include/uapi/rdma/qedr-abi.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __QEDR_USER_H__ +#define __QEDR_USER_H__ + +#include + +#define QEDR_ABI_VERSION (8) + +/* user kernel communication data structures. */ + +struct qedr_alloc_ucontext_resp { + __aligned_u64 db_pa; + __u32 db_size; + + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_srq_wr; + __u32 sges_per_send_wr; + __u32 sges_per_recv_wr; + __u32 sges_per_srq_wr; + __u32 max_cqes; + __u8 dpm_enabled; + __u8 wids_enabled; + __u16 wid_count; + __u32 reserved; +}; + +struct qedr_alloc_pd_ureq { + __aligned_u64 rsvd1; +}; + +struct qedr_alloc_pd_uresp { + __u32 pd_id; + __u32 reserved; +}; + +struct qedr_create_cq_ureq { + __aligned_u64 addr; + __aligned_u64 len; +}; + +struct qedr_create_cq_uresp { + __u32 db_offset; + __u16 icid; + __u16 reserved; +}; + +struct qedr_create_qp_ureq { + __u32 qp_handle_hi; + __u32 qp_handle_lo; + + /* SQ */ + /* user space virtual address of SQ buffer */ + __aligned_u64 sq_addr; + + /* length of SQ buffer */ + __aligned_u64 sq_len; + + /* RQ */ + /* user space virtual address of RQ buffer */ + __aligned_u64 rq_addr; + + /* length of RQ buffer */ + __aligned_u64 rq_len; +}; + +struct qedr_create_qp_uresp { + __u32 qp_id; + __u32 atomic_supported; + + /* SQ */ + __u32 sq_db_offset; + __u16 sq_icid; + + /* RQ */ + __u32 rq_db_offset; + __u16 rq_icid; + + __u32 rq_db2_offset; + __u32 reserved; +}; + +#endif /* __QEDR_USER_H__ */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h new file mode 100644 index 0000000..0ce0943 --- /dev/null +++ b/include/uapi/rdma/rdma_netlink.h @@ -0,0 +1,406 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_RDMA_NETLINK_H +#define _UAPI_RDMA_NETLINK_H + +#include + +enum { + RDMA_NL_RDMA_CM = 1, + RDMA_NL_IWCM, + RDMA_NL_RSVD, + RDMA_NL_LS, /* RDMA Local Services */ + RDMA_NL_NLDEV, /* RDMA device interface */ + RDMA_NL_NUM_CLIENTS +}; + +enum { + RDMA_NL_GROUP_CM = 1, + RDMA_NL_GROUP_IWPM, + RDMA_NL_GROUP_LS, + RDMA_NL_NUM_GROUPS +}; + +#define RDMA_NL_GET_CLIENT(type) ((type & (((1 << 6) - 1) << 10)) >> 10) +#define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1)) +#define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op) + +enum { + RDMA_NL_RDMA_CM_ID_STATS = 0, + RDMA_NL_RDMA_CM_NUM_OPS +}; + +enum { + RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1, + RDMA_NL_RDMA_CM_ATTR_DST_ADDR, + RDMA_NL_RDMA_CM_NUM_ATTR, +}; + +/* iwarp port mapper op-codes */ +enum { + RDMA_NL_IWPM_REG_PID = 0, + RDMA_NL_IWPM_ADD_MAPPING, + RDMA_NL_IWPM_QUERY_MAPPING, + RDMA_NL_IWPM_REMOVE_MAPPING, + RDMA_NL_IWPM_REMOTE_INFO, + RDMA_NL_IWPM_HANDLE_ERR, + RDMA_NL_IWPM_MAPINFO, + RDMA_NL_IWPM_MAPINFO_NUM, + RDMA_NL_IWPM_NUM_OPS +}; + +struct rdma_cm_id_stats { + __u32 qp_num; + __u32 bound_dev_if; + __u32 port_space; + __s32 pid; + __u8 cm_state; + __u8 node_type; + __u8 port_num; + __u8 qp_type; +}; + +enum { + IWPM_NLA_REG_PID_UNSPEC = 0, + IWPM_NLA_REG_PID_SEQ, + IWPM_NLA_REG_IF_NAME, + IWPM_NLA_REG_IBDEV_NAME, + IWPM_NLA_REG_ULIB_NAME, + IWPM_NLA_REG_PID_MAX +}; + +enum { + IWPM_NLA_RREG_PID_UNSPEC = 0, + 
IWPM_NLA_RREG_PID_SEQ, + IWPM_NLA_RREG_IBDEV_NAME, + IWPM_NLA_RREG_ULIB_NAME, + IWPM_NLA_RREG_ULIB_VER, + IWPM_NLA_RREG_PID_ERR, + IWPM_NLA_RREG_PID_MAX + +}; + +enum { + IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, + IWPM_NLA_MANAGE_MAPPING_SEQ, + IWPM_NLA_MANAGE_ADDR, + IWPM_NLA_MANAGE_MAPPED_LOC_ADDR, + IWPM_NLA_RMANAGE_MAPPING_ERR, + IWPM_NLA_RMANAGE_MAPPING_MAX +}; + +#define IWPM_NLA_MANAGE_MAPPING_MAX 3 +#define IWPM_NLA_QUERY_MAPPING_MAX 4 +#define IWPM_NLA_MAPINFO_SEND_MAX 3 + +enum { + IWPM_NLA_QUERY_MAPPING_UNSPEC = 0, + IWPM_NLA_QUERY_MAPPING_SEQ, + IWPM_NLA_QUERY_LOCAL_ADDR, + IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_RQUERY_MAPPED_LOC_ADDR, + IWPM_NLA_RQUERY_MAPPED_REM_ADDR, + IWPM_NLA_RQUERY_MAPPING_ERR, + IWPM_NLA_RQUERY_MAPPING_MAX +}; + +enum { + IWPM_NLA_MAPINFO_REQ_UNSPEC = 0, + IWPM_NLA_MAPINFO_ULIB_NAME, + IWPM_NLA_MAPINFO_ULIB_VER, + IWPM_NLA_MAPINFO_REQ_MAX +}; + +enum { + IWPM_NLA_MAPINFO_UNSPEC = 0, + IWPM_NLA_MAPINFO_LOCAL_ADDR, + IWPM_NLA_MAPINFO_MAPPED_ADDR, + IWPM_NLA_MAPINFO_MAX +}; + +enum { + IWPM_NLA_MAPINFO_NUM_UNSPEC = 0, + IWPM_NLA_MAPINFO_SEQ, + IWPM_NLA_MAPINFO_SEND_NUM, + IWPM_NLA_MAPINFO_ACK_NUM, + IWPM_NLA_MAPINFO_NUM_MAX +}; + +enum { + IWPM_NLA_ERR_UNSPEC = 0, + IWPM_NLA_ERR_SEQ, + IWPM_NLA_ERR_CODE, + IWPM_NLA_ERR_MAX +}; + +/* + * Local service operations: + * RESOLVE - The client requests the local service to resolve a path. + * SET_TIMEOUT - The local service requests the client to set the timeout. + * IP_RESOLVE - The client requests the local service to resolve an IP to GID. + */ +enum { + RDMA_NL_LS_OP_RESOLVE = 0, + RDMA_NL_LS_OP_SET_TIMEOUT, + RDMA_NL_LS_OP_IP_RESOLVE, + RDMA_NL_LS_NUM_OPS +}; + +/* Local service netlink message flags */ +#define RDMA_NL_LS_F_ERR 0x0100 /* Failed response */ + +/* + * Local service resolve operation family header. + * The layout for the resolve operation: + * nlmsg header + * family header + * attributes + */ + +/* + * Local service path use: + * Specify how the path(s) will be used. 
+ * ALL - For connected CM operation (6 pathrecords) + * UNIDIRECTIONAL - For unidirectional UD (1 pathrecord) + * GMP - For miscellaneous GMP like operation (at least 1 reversible + * pathrecord) + */ +enum { + LS_RESOLVE_PATH_USE_ALL = 0, + LS_RESOLVE_PATH_USE_UNIDIRECTIONAL, + LS_RESOLVE_PATH_USE_GMP, + LS_RESOLVE_PATH_USE_MAX +}; + +#define LS_DEVICE_NAME_MAX 64 + +struct rdma_ls_resolve_header { + __u8 device_name[LS_DEVICE_NAME_MAX]; + __u8 port_num; + __u8 path_use; +}; + +struct rdma_ls_ip_resolve_header { + __u32 ifindex; +}; + +/* Local service attribute type */ +#define RDMA_NLA_F_MANDATORY (1 << 13) +#define RDMA_NLA_TYPE_MASK (~(NLA_F_NESTED | NLA_F_NET_BYTEORDER | \ + RDMA_NLA_F_MANDATORY)) + +/* + * Local service attributes: + * Attr Name Size Byte order + * ----------------------------------------------------- + * PATH_RECORD struct ib_path_rec_data + * TIMEOUT u32 cpu + * SERVICE_ID u64 cpu + * DGID u8[16] BE + * SGID u8[16] BE + * TCLASS u8 + * PKEY u16 cpu + * QOS_CLASS u16 cpu + * IPV4 u32 BE + * IPV6 u8[16] BE + */ +enum { + LS_NLA_TYPE_UNSPEC = 0, + LS_NLA_TYPE_PATH_RECORD, + LS_NLA_TYPE_TIMEOUT, + LS_NLA_TYPE_SERVICE_ID, + LS_NLA_TYPE_DGID, + LS_NLA_TYPE_SGID, + LS_NLA_TYPE_TCLASS, + LS_NLA_TYPE_PKEY, + LS_NLA_TYPE_QOS_CLASS, + LS_NLA_TYPE_IPV4, + LS_NLA_TYPE_IPV6, + LS_NLA_TYPE_MAX +}; + +/* Local service DGID/SGID attribute: big endian */ +struct rdma_nla_ls_gid { + __u8 gid[16]; +}; + +enum rdma_nldev_command { + RDMA_NLDEV_CMD_UNSPEC, + + RDMA_NLDEV_CMD_GET, /* can dump */ + + /* 2 - 4 are free to use */ + + RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ + + /* 6 - 8 are free to use */ + + RDMA_NLDEV_CMD_RES_GET = 9, /* can dump */ + + RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */ + + RDMA_NLDEV_CMD_RES_CM_ID_GET, /* can dump */ + + RDMA_NLDEV_CMD_RES_CQ_GET, /* can dump */ + + RDMA_NLDEV_CMD_RES_MR_GET, /* can dump */ + + RDMA_NLDEV_CMD_RES_PD_GET, /* can dump */ + + RDMA_NLDEV_NUM_OPS +}; + +enum rdma_nldev_attr { + /* don't change the order or add anything between, this is ABI! */ + RDMA_NLDEV_ATTR_UNSPEC, + + /* Identifier for ib_device */ + RDMA_NLDEV_ATTR_DEV_INDEX, /* u32 */ + + RDMA_NLDEV_ATTR_DEV_NAME, /* string */ + /* + * Device index together with port index are identifiers + * for port/link properties. + * + * For RDMA_NLDEV_CMD_GET commamnd, port index will return number + * of available ports in ib_device, while for port specific operations, + * it will be real port index as it appears in sysfs. Port index follows + * sysfs notation and starts from 1 for the first port. + */ + RDMA_NLDEV_ATTR_PORT_INDEX, /* u32 */ + + /* + * Device and port capabilities + */ + RDMA_NLDEV_ATTR_CAP_FLAGS, /* u64 */ + + /* + * FW version + */ + RDMA_NLDEV_ATTR_FW_VERSION, /* string */ + + /* + * Node GUID (in host byte order) associated with the RDMA device. + */ + RDMA_NLDEV_ATTR_NODE_GUID, /* u64 */ + + /* + * System image GUID (in host byte order) associated with + * this RDMA device and other devices which are part of a + * single system. + */ + RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, /* u64 */ + + /* + * Subnet prefix (in host byte order) + */ + RDMA_NLDEV_ATTR_SUBNET_PREFIX, /* u64 */ + + /* + * Local Identifier (LID), + * According to IB specification, It is 16-bit address assigned + * by the Subnet Manager. Extended to be 32-bit for OmniPath users. 
+ */ + RDMA_NLDEV_ATTR_LID, /* u32 */ + RDMA_NLDEV_ATTR_SM_LID, /* u32 */ + + /* + * LID mask control (LMC) + */ + RDMA_NLDEV_ATTR_LMC, /* u8 */ + + RDMA_NLDEV_ATTR_PORT_STATE, /* u8 */ + RDMA_NLDEV_ATTR_PORT_PHYS_STATE, /* u8 */ + + RDMA_NLDEV_ATTR_DEV_NODE_TYPE, /* u8 */ + + RDMA_NLDEV_ATTR_RES_SUMMARY, /* nested table */ + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, /* string */ + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, /* u64 */ + + RDMA_NLDEV_ATTR_RES_QP, /* nested table */ + RDMA_NLDEV_ATTR_RES_QP_ENTRY, /* nested table */ + /* + * Local QPN + */ + RDMA_NLDEV_ATTR_RES_LQPN, /* u32 */ + /* + * Remote QPN, + * Applicable for RC and UC only IBTA 11.2.5.3 QUERY QUEUE PAIR + */ + RDMA_NLDEV_ATTR_RES_RQPN, /* u32 */ + /* + * Receive Queue PSN, + * Applicable for RC and UC only 11.2.5.3 QUERY QUEUE PAIR + */ + RDMA_NLDEV_ATTR_RES_RQ_PSN, /* u32 */ + /* + * Send Queue PSN + */ + RDMA_NLDEV_ATTR_RES_SQ_PSN, /* u32 */ + RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, /* u8 */ + /* + * QP types as visible to RDMA/core, the reserved QPT + * are not exported through this interface. + */ + RDMA_NLDEV_ATTR_RES_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_RES_STATE, /* u8 */ + /* + * Process ID which created object, + * in case of kernel origin, PID won't exist. + */ + RDMA_NLDEV_ATTR_RES_PID, /* u32 */ + /* + * The name of process created following resource. + * It will exist only for kernel objects. + * For user created objects, the user is supposed + * to read /proc/PID/comm file. + */ + RDMA_NLDEV_ATTR_RES_KERN_NAME, /* string */ + + RDMA_NLDEV_ATTR_RES_CM_ID, /* nested table */ + RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, /* nested table */ + /* + * rdma_cm_id port space. + */ + RDMA_NLDEV_ATTR_RES_PS, /* u32 */ + /* + * Source and destination socket addresses + */ + RDMA_NLDEV_ATTR_RES_SRC_ADDR, /* __kernel_sockaddr_storage */ + RDMA_NLDEV_ATTR_RES_DST_ADDR, /* __kernel_sockaddr_storage */ + + RDMA_NLDEV_ATTR_RES_CQ, /* nested table */ + RDMA_NLDEV_ATTR_RES_CQ_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_CQE, /* u32 */ + RDMA_NLDEV_ATTR_RES_USECNT, /* u64 */ + RDMA_NLDEV_ATTR_RES_POLL_CTX, /* u8 */ + + RDMA_NLDEV_ATTR_RES_MR, /* nested table */ + RDMA_NLDEV_ATTR_RES_MR_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_RKEY, /* u32 */ + RDMA_NLDEV_ATTR_RES_LKEY, /* u32 */ + RDMA_NLDEV_ATTR_RES_IOVA, /* u64 */ + RDMA_NLDEV_ATTR_RES_MRLEN, /* u64 */ + + RDMA_NLDEV_ATTR_RES_PD, /* nested table */ + RDMA_NLDEV_ATTR_RES_PD_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */ + RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */ + + /* + * Provides logical name and index of netdevice which is + * connected to physical port. This information is relevant + * for RoCE and iWARP. + * + * The netdevices which are associated with containers are + * supposed to be exported together with GID table once it + * will be exposed through the netlink. Because the + * associated netdevices are properties of GIDs. + */ + RDMA_NLDEV_ATTR_NDEV_INDEX, /* u32 */ + RDMA_NLDEV_ATTR_NDEV_NAME, /* string */ + + RDMA_NLDEV_ATTR_MAX +}; +#endif /* _UAPI_RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h new file mode 100644 index 0000000..0d1e78e --- /dev/null +++ b/include/uapi/rdma/rdma_user_cm.h @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_USER_CM_H +#define RDMA_USER_CM_H + +#include +#include +#include +#include +#include + +#define RDMA_USER_CM_ABI_VERSION 4 + +#define RDMA_MAX_PRIVATE_DATA 256 + +enum { + RDMA_USER_CM_CMD_CREATE_ID, + RDMA_USER_CM_CMD_DESTROY_ID, + RDMA_USER_CM_CMD_BIND_IP, + RDMA_USER_CM_CMD_RESOLVE_IP, + RDMA_USER_CM_CMD_RESOLVE_ROUTE, + RDMA_USER_CM_CMD_QUERY_ROUTE, + RDMA_USER_CM_CMD_CONNECT, + RDMA_USER_CM_CMD_LISTEN, + RDMA_USER_CM_CMD_ACCEPT, + RDMA_USER_CM_CMD_REJECT, + RDMA_USER_CM_CMD_DISCONNECT, + RDMA_USER_CM_CMD_INIT_QP_ATTR, + RDMA_USER_CM_CMD_GET_EVENT, + RDMA_USER_CM_CMD_GET_OPTION, + RDMA_USER_CM_CMD_SET_OPTION, + RDMA_USER_CM_CMD_NOTIFY, + RDMA_USER_CM_CMD_JOIN_IP_MCAST, + RDMA_USER_CM_CMD_LEAVE_MCAST, + RDMA_USER_CM_CMD_MIGRATE_ID, + RDMA_USER_CM_CMD_QUERY, + RDMA_USER_CM_CMD_BIND, + RDMA_USER_CM_CMD_RESOLVE_ADDR, + RDMA_USER_CM_CMD_JOIN_MCAST +}; + +/* See IBTA Annex A11, servies ID bytes 4 & 5 */ +enum rdma_ucm_port_space { + RDMA_PS_IPOIB = 0x0002, + RDMA_PS_IB = 0x013F, + RDMA_PS_TCP = 0x0106, + RDMA_PS_UDP = 0x0111, +}; + +/* + * command ABI structures. 
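+ *
+ * (Illustrative note, assuming the standard rdma_cm character-device usage:
+ * each command is issued as a single write() to /dev/infiniband/rdma_cm of
+ * a struct rdma_ucm_cmd_hdr followed by the command payload; 'in' and 'out'
+ * give the payload and expected response sizes, and commands returning data
+ * carry a user-space pointer to the response buffer in their 'response'
+ * field.)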
+ */ +struct rdma_ucm_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct rdma_ucm_create_id { + __aligned_u64 uid; + __aligned_u64 response; + __u16 ps; /* use enum rdma_ucm_port_space */ + __u8 qp_type; + __u8 reserved[5]; +}; + +struct rdma_ucm_create_id_resp { + __u32 id; +}; + +struct rdma_ucm_destroy_id { + __aligned_u64 response; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_destroy_id_resp { + __u32 events_reported; +}; + +struct rdma_ucm_bind_ip { + __aligned_u64 response; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct rdma_ucm_bind { + __u32 id; + __u16 addr_size; + __u16 reserved; + struct __kernel_sockaddr_storage addr; +}; + +struct rdma_ucm_resolve_ip { + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 id; + __u32 timeout_ms; +}; + +struct rdma_ucm_resolve_addr { + __u32 id; + __u32 timeout_ms; + __u16 src_size; + __u16 dst_size; + __u32 reserved; + struct __kernel_sockaddr_storage src_addr; + struct __kernel_sockaddr_storage dst_addr; +}; + +struct rdma_ucm_resolve_route { + __u32 id; + __u32 timeout_ms; +}; + +enum { + RDMA_USER_CM_QUERY_ADDR, + RDMA_USER_CM_QUERY_PATH, + RDMA_USER_CM_QUERY_GID +}; + +struct rdma_ucm_query { + __aligned_u64 response; + __u32 id; + __u32 option; +}; + +struct rdma_ucm_query_route_resp { + __aligned_u64 node_guid; + struct ib_user_path_rec ib_route[2]; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 num_paths; + __u8 port_num; + __u8 reserved[3]; +}; + +struct rdma_ucm_query_addr_resp { + __aligned_u64 node_guid; + __u8 port_num; + __u8 reserved; + __u16 pkey; + __u16 src_size; + __u16 dst_size; + struct __kernel_sockaddr_storage src_addr; + struct __kernel_sockaddr_storage dst_addr; +}; + +struct rdma_ucm_query_path_resp { + __u32 num_paths; + __u32 reserved; + struct ib_path_rec_data path_data[0]; +}; + +struct rdma_ucm_conn_param { + __u32 qp_num; + __u32 qkey; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 srq; + __u8 responder_resources; + __u8 initiator_depth; + __u8 flow_control; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 valid; +}; + +struct rdma_ucm_ud_param { + __u32 qp_num; + __u32 qkey; + struct ib_uverbs_ah_attr ah_attr; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 reserved[7]; +}; + +struct rdma_ucm_connect { + struct rdma_ucm_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_listen { + __u32 id; + __u32 backlog; +}; + +struct rdma_ucm_accept { + __aligned_u64 uid; + struct rdma_ucm_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_reject { + __u32 id; + __u8 private_data_len; + __u8 reserved[3]; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; +}; + +struct rdma_ucm_disconnect { + __u32 id; +}; + +struct rdma_ucm_init_qp_attr { + __aligned_u64 response; + __u32 id; + __u32 qp_state; +}; + +struct rdma_ucm_notify { + __u32 id; + __u32 event; +}; + +struct rdma_ucm_join_ip_mcast { + __aligned_u64 response; /* rdma_ucm_create_id_resp */ + __aligned_u64 uid; + struct sockaddr_in6 addr; + __u32 id; +}; + +/* Multicast join flags */ +enum { + RDMA_MC_JOIN_FLAG_FULLMEMBER, + RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER, + RDMA_MC_JOIN_FLAG_RESERVED, +}; + +struct rdma_ucm_join_mcast { + __aligned_u64 response; /* rdma_ucma_create_id_resp */ + __aligned_u64 uid; + __u32 id; + __u16 addr_size; + __u16 join_flags; + struct __kernel_sockaddr_storage addr; +}; + +struct rdma_ucm_get_event { + __aligned_u64 response; +}; + +struct rdma_ucm_event_resp { + 
__aligned_u64 uid; + __u32 id; + __u32 event; + __u32 status; + /* + * NOTE: This union is not aligned to 8 bytes so none of the union + * members may contain a u64 or anything with higher alignment than 4. + */ + union { + struct rdma_ucm_conn_param conn; + struct rdma_ucm_ud_param ud; + } param; + __u32 reserved; +}; + +/* Option levels */ +enum { + RDMA_OPTION_ID = 0, + RDMA_OPTION_IB = 1 +}; + +/* Option details */ +enum { + RDMA_OPTION_ID_TOS = 0, + RDMA_OPTION_ID_REUSEADDR = 1, + RDMA_OPTION_ID_AFONLY = 2, + RDMA_OPTION_IB_PATH = 1 +}; + +struct rdma_ucm_set_option { + __aligned_u64 optval; + __u32 id; + __u32 level; + __u32 optname; + __u32 optlen; +}; + +struct rdma_ucm_migrate_id { + __aligned_u64 response; + __u32 id; + __u32 fd; +}; + +struct rdma_ucm_migrate_resp { + __u32 events_reported; +}; + +#endif /* RDMA_USER_CM_H */ diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h new file mode 100644 index 0000000..d92d272 --- /dev/null +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2016 Mellanox Technologies, LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef RDMA_USER_IOCTL_H +#define RDMA_USER_IOCTL_H + +#include +#include +#include + +/* Legacy name, for user space application which already use it */ +#define IB_IOCTL_MAGIC RDMA_IOCTL_MAGIC + +/* + * General blocks assignments + * It is closed on purpose do not expose it it user space + * #define MAD_CMD_BASE 0x00 + * #define HFI1_CMD_BAS 0xE0 + */ + +/* MAD specific section */ +#define IB_USER_MAD_REGISTER_AGENT _IOWR(RDMA_IOCTL_MAGIC, 0x01, struct ib_user_mad_reg_req) +#define IB_USER_MAD_UNREGISTER_AGENT _IOW(RDMA_IOCTL_MAGIC, 0x02, __u32) +#define IB_USER_MAD_ENABLE_PKEY _IO(RDMA_IOCTL_MAGIC, 0x03) +#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(RDMA_IOCTL_MAGIC, 0x04, struct ib_user_mad_reg_req2) + +/* HFI specific section */ +/* allocate HFI and context */ +#define HFI1_IOCTL_ASSIGN_CTXT _IOWR(RDMA_IOCTL_MAGIC, 0xE1, struct hfi1_user_info) +/* find out what resources we got */ +#define HFI1_IOCTL_CTXT_INFO _IOW(RDMA_IOCTL_MAGIC, 0xE2, struct hfi1_ctxt_info) +/* set up userspace */ +#define HFI1_IOCTL_USER_INFO _IOW(RDMA_IOCTL_MAGIC, 0xE3, struct hfi1_base_info) +/* update expected TID entries */ +#define HFI1_IOCTL_TID_UPDATE _IOWR(RDMA_IOCTL_MAGIC, 0xE4, struct hfi1_tid_info) +/* free expected TID entries */ +#define HFI1_IOCTL_TID_FREE _IOWR(RDMA_IOCTL_MAGIC, 0xE5, struct hfi1_tid_info) +/* force an update of PIO credit */ +#define HFI1_IOCTL_CREDIT_UPD _IO(RDMA_IOCTL_MAGIC, 0xE6) +/* control receipt of packets */ +#define HFI1_IOCTL_RECV_CTRL _IOW(RDMA_IOCTL_MAGIC, 0xE8, int) +/* set the kind of polling we want */ +#define HFI1_IOCTL_POLL_TYPE _IOW(RDMA_IOCTL_MAGIC, 0xE9, int) +/* ack & clear user status bits */ +#define HFI1_IOCTL_ACK_EVENT _IOW(RDMA_IOCTL_MAGIC, 0xEA, unsigned long) +/* set context's pkey */ +#define HFI1_IOCTL_SET_PKEY _IOW(RDMA_IOCTL_MAGIC, 0xEB, __u16) +/* reset context's HW send context */ +#define HFI1_IOCTL_CTXT_RESET _IO(RDMA_IOCTL_MAGIC, 0xEC) +/* read TID cache invalidations */ +#define HFI1_IOCTL_TID_INVAL_READ _IOWR(RDMA_IOCTL_MAGIC, 0xED, struct hfi1_tid_info) +/* get the version of the user cdev */ +#define HFI1_IOCTL_GET_VERS _IOR(RDMA_IOCTL_MAGIC, 0xEE, int) + +#endif /* RDMA_USER_IOCTL_H */ diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h new file mode 100644 index 0000000..1da5a1e --- /dev/null +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_USER_IOCTL_CMDS_H +#define RDMA_USER_IOCTL_CMDS_H + +#include +#include + +/* Documentation/ioctl/ioctl-number.txt */ +#define RDMA_IOCTL_MAGIC 0x1b +#define RDMA_VERBS_IOCTL \ + _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) + +enum { + /* User input */ + UVERBS_ATTR_F_MANDATORY = 1U << 0, + /* + * Valid output bit should be ignored and considered set in + * mandatory fields. This bit is kernel output. + */ + UVERBS_ATTR_F_VALID_OUTPUT = 1U << 1, +}; + +struct ib_uverbs_attr { + __u16 attr_id; /* command specific type attribute */ + __u16 len; /* only for pointers */ + __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ + union { + struct { + __u8 elem_id; + __u8 reserved; + } enum_data; + __u16 reserved; + } attr_data; + __aligned_u64 data; /* ptr to command, inline data or idr/fd */ +}; + +struct ib_uverbs_ioctl_hdr { + __u16 length; + __u16 object_id; + __u16 method_id; + __u16 num_attrs; + __aligned_u64 reserved1; + __u32 driver_id; + __u32 reserved2; + struct ib_uverbs_attr attrs[0]; +}; + +enum rdma_driver_id { + RDMA_DRIVER_UNKNOWN, + RDMA_DRIVER_MLX5, + RDMA_DRIVER_MLX4, + RDMA_DRIVER_CXGB3, + RDMA_DRIVER_CXGB4, + RDMA_DRIVER_MTHCA, + RDMA_DRIVER_BNXT_RE, + RDMA_DRIVER_OCRDMA, + RDMA_DRIVER_NES, + RDMA_DRIVER_I40IW, + RDMA_DRIVER_VMW_PVRDMA, + RDMA_DRIVER_QEDR, + RDMA_DRIVER_HNS, + RDMA_DRIVER_USNIC, + RDMA_DRIVER_RXE, + RDMA_DRIVER_HFI1, + RDMA_DRIVER_QIB, +}; + +#endif diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h new file mode 100644 index 0000000..44ef6a3 --- /dev/null +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef RDMA_USER_RXE_H +#define RDMA_USER_RXE_H + +#include +#include +#include +#include + +union rxe_gid { + __u8 raw[16]; + struct { + __be64 subnet_prefix; + __be64 interface_id; + } global; +}; + +struct rxe_global_route { + union rxe_gid dgid; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; +}; + +struct rxe_av { + __u8 port_num; + __u8 network_type; + __u16 reserved1; + __u32 reserved2; + struct rxe_global_route grh; + union { + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; +}; + +struct rxe_send_wr { + __aligned_u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + union { + struct { + __aligned_u64 remote_addr; + __u32 rkey; + __u32 reserved; + } rdma; + struct { + __aligned_u64 remote_addr; + __aligned_u64 compare_add; + __aligned_u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u32 remote_qpn; + __u32 remote_qkey; + __u16 pkey_index; + } ud; + /* reg is only used by the kernel and is not part of the uapi */ + struct { + union { + struct ib_mr *mr; + __aligned_u64 reserved; + }; + __u32 key; + __u32 access; + } reg; + } wr; +}; + +struct rxe_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + +struct mminfo { + __aligned_u64 offset; + __u32 size; + __u32 pad; +}; + +struct rxe_dma_info { + __u32 length; + __u32 resid; + __u32 cur_sge; + __u32 num_sge; + __u32 sge_offset; + __u32 reserved; + union { + __u8 inline_data[0]; + struct rxe_sge sge[0]; + }; +}; + +struct rxe_send_wqe { + struct rxe_send_wr wr; + struct rxe_av av; + __u32 status; + __u32 state; + __aligned_u64 iova; + __u32 mask; + __u32 first_psn; + __u32 last_psn; + __u32 ack_length; + __u32 ssn; + __u32 has_rd_atomic; + struct rxe_dma_info dma; +}; + +struct rxe_recv_wqe { + __aligned_u64 wr_id; + __u32 num_sge; + __u32 padding; + struct rxe_dma_info dma; +}; + +struct rxe_create_cq_resp { + struct mminfo mi; +}; + +struct rxe_resize_cq_resp { + struct mminfo mi; +}; + +struct rxe_create_qp_resp { + struct mminfo rq_mi; + struct mminfo sq_mi; +}; + +struct rxe_create_srq_resp { + struct mminfo mi; + __u32 srq_num; + __u32 reserved; +}; + +struct rxe_modify_srq_cmd { + __aligned_u64 mmap_info_addr; +}; + +#endif /* RDMA_USER_RXE_H */ diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h new file mode 100644 index 0000000..d13fd49 --- /dev/null +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. 
+ * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __VMW_PVRDMA_ABI_H__ +#define __VMW_PVRDMA_ABI_H__ + +#include + +#define PVRDMA_UVERBS_ABI_VERSION 3 /* ABI Version. */ +#define PVRDMA_UAR_HANDLE_MASK 0x00FFFFFF /* Bottom 24 bits. */ +#define PVRDMA_UAR_QP_OFFSET 0 /* QP doorbell. */ +#define PVRDMA_UAR_QP_SEND (1 << 30) /* Send bit. */ +#define PVRDMA_UAR_QP_RECV (1 << 31) /* Recv bit. */ +#define PVRDMA_UAR_CQ_OFFSET 4 /* CQ doorbell. */ +#define PVRDMA_UAR_CQ_ARM_SOL (1 << 29) /* Arm solicited bit. */ +#define PVRDMA_UAR_CQ_ARM (1 << 30) /* Arm bit. */ +#define PVRDMA_UAR_CQ_POLL (1 << 31) /* Poll bit. */ +#define PVRDMA_UAR_SRQ_OFFSET 8 /* SRQ doorbell. */ +#define PVRDMA_UAR_SRQ_RECV (1 << 30) /* Recv bit. 
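+ *
+ * (Illustrative doorbell sketch, assuming the usual UAR usage implied by the
+ * defines above: to kick the send queue of a QP the driver writes
+ *
+ *   writel(PVRDMA_UAR_QP_SEND | qp_handle, uar_map + PVRDMA_UAR_QP_OFFSET);
+ *
+ * where the handle occupies the low 24 bits (PVRDMA_UAR_HANDLE_MASK) and
+ * qp_handle/uar_map are hypothetical names.)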
*/ + +enum pvrdma_wr_opcode { + PVRDMA_WR_RDMA_WRITE, + PVRDMA_WR_RDMA_WRITE_WITH_IMM, + PVRDMA_WR_SEND, + PVRDMA_WR_SEND_WITH_IMM, + PVRDMA_WR_RDMA_READ, + PVRDMA_WR_ATOMIC_CMP_AND_SWP, + PVRDMA_WR_ATOMIC_FETCH_AND_ADD, + PVRDMA_WR_LSO, + PVRDMA_WR_SEND_WITH_INV, + PVRDMA_WR_RDMA_READ_WITH_INV, + PVRDMA_WR_LOCAL_INV, + PVRDMA_WR_FAST_REG_MR, + PVRDMA_WR_MASKED_ATOMIC_CMP_AND_SWP, + PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD, + PVRDMA_WR_BIND_MW, + PVRDMA_WR_REG_SIG_MR, +}; + +enum pvrdma_wc_status { + PVRDMA_WC_SUCCESS, + PVRDMA_WC_LOC_LEN_ERR, + PVRDMA_WC_LOC_QP_OP_ERR, + PVRDMA_WC_LOC_EEC_OP_ERR, + PVRDMA_WC_LOC_PROT_ERR, + PVRDMA_WC_WR_FLUSH_ERR, + PVRDMA_WC_MW_BIND_ERR, + PVRDMA_WC_BAD_RESP_ERR, + PVRDMA_WC_LOC_ACCESS_ERR, + PVRDMA_WC_REM_INV_REQ_ERR, + PVRDMA_WC_REM_ACCESS_ERR, + PVRDMA_WC_REM_OP_ERR, + PVRDMA_WC_RETRY_EXC_ERR, + PVRDMA_WC_RNR_RETRY_EXC_ERR, + PVRDMA_WC_LOC_RDD_VIOL_ERR, + PVRDMA_WC_REM_INV_RD_REQ_ERR, + PVRDMA_WC_REM_ABORT_ERR, + PVRDMA_WC_INV_EECN_ERR, + PVRDMA_WC_INV_EEC_STATE_ERR, + PVRDMA_WC_FATAL_ERR, + PVRDMA_WC_RESP_TIMEOUT_ERR, + PVRDMA_WC_GENERAL_ERR, +}; + +enum pvrdma_wc_opcode { + PVRDMA_WC_SEND, + PVRDMA_WC_RDMA_WRITE, + PVRDMA_WC_RDMA_READ, + PVRDMA_WC_COMP_SWAP, + PVRDMA_WC_FETCH_ADD, + PVRDMA_WC_BIND_MW, + PVRDMA_WC_LSO, + PVRDMA_WC_LOCAL_INV, + PVRDMA_WC_FAST_REG_MR, + PVRDMA_WC_MASKED_COMP_SWAP, + PVRDMA_WC_MASKED_FETCH_ADD, + PVRDMA_WC_RECV = 1 << 7, + PVRDMA_WC_RECV_RDMA_WITH_IMM, +}; + +enum pvrdma_wc_flags { + PVRDMA_WC_GRH = 1 << 0, + PVRDMA_WC_WITH_IMM = 1 << 1, + PVRDMA_WC_WITH_INVALIDATE = 1 << 2, + PVRDMA_WC_IP_CSUM_OK = 1 << 3, + PVRDMA_WC_WITH_SMAC = 1 << 4, + PVRDMA_WC_WITH_VLAN = 1 << 5, + PVRDMA_WC_WITH_NETWORK_HDR_TYPE = 1 << 6, + PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_NETWORK_HDR_TYPE, +}; + +struct pvrdma_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 reserved; +}; + +struct pvrdma_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct pvrdma_create_cq { + __aligned_u64 buf_addr; + __u32 buf_size; + __u32 reserved; +}; + +struct pvrdma_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct pvrdma_resize_cq { + __aligned_u64 buf_addr; + __u32 buf_size; + __u32 reserved; +}; + +struct pvrdma_create_srq { + __aligned_u64 buf_addr; + __u32 buf_size; + __u32 reserved; +}; + +struct pvrdma_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct pvrdma_create_qp { + __aligned_u64 rbuf_addr; + __aligned_u64 sbuf_addr; + __u32 rbuf_size; + __u32 sbuf_size; + __aligned_u64 qp_addr; +}; + +/* PVRDMA masked atomic compare and swap */ +struct pvrdma_ex_cmp_swap { + __aligned_u64 swap_val; + __aligned_u64 compare_val; + __aligned_u64 swap_mask; + __aligned_u64 compare_mask; +}; + +/* PVRDMA masked atomic fetch and add */ +struct pvrdma_ex_fetch_add { + __aligned_u64 add_val; + __aligned_u64 field_boundary; +}; + +/* PVRDMA address vector. */ +struct pvrdma_av { + __u32 port_pd; + __u32 sl_tclass_flowlabel; + __u8 dgid[16]; + __u8 src_path_bits; + __u8 gid_index; + __u8 stat_rate; + __u8 hop_limit; + __u8 dmac[6]; + __u8 reserved[6]; +}; + +/* PVRDMA scatter/gather entry */ +struct pvrdma_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + +/* PVRDMA receive queue work request */ +struct pvrdma_rq_wqe_hdr { + __aligned_u64 wr_id; /* wr id */ + __u32 num_sge; /* size of s/g array */ + __u32 total_len; /* reserved */ +}; +/* Use pvrdma_sge (ib_sge) for receive queue s/g array elements. 
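+ *
+ * Illustrative layout sketch (an assumption about the ring layout, not part
+ * of this header): a posted receive WQE is the header followed directly by
+ * hdr.num_sge s/g entries, e.g. for a single buffer:
+ *
+ *   struct pvrdma_sge *sge = (struct pvrdma_sge *)(wqe_hdr + 1);
+ *   sge[0].addr   = (__u64) (uintptr_t) buf;
+ *   sge[0].length = len;
+ *   sge[0].lkey   = lkey;
+ *
+ * where wqe_hdr, buf, len and lkey are hypothetical names.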
*/ + +/* PVRDMA send queue work request */ +struct pvrdma_sq_wqe_hdr { + __aligned_u64 wr_id; /* wr id */ + __u32 num_sge; /* size of s/g array */ + __u32 total_len; /* reserved */ + __u32 opcode; /* operation type */ + __u32 send_flags; /* wr flags */ + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + __u32 reserved; + union { + struct { + __aligned_u64 remote_addr; + __u32 rkey; + __u8 reserved[4]; + } rdma; + struct { + __aligned_u64 remote_addr; + __aligned_u64 compare_add; + __aligned_u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __aligned_u64 remote_addr; + __u32 log_arg_sz; + __u32 rkey; + union { + struct pvrdma_ex_cmp_swap cmp_swap; + struct pvrdma_ex_fetch_add fetch_add; + } wr_data; + } masked_atomics; + struct { + __aligned_u64 iova_start; + __aligned_u64 pl_pdir_dma; + __u32 page_shift; + __u32 page_list_len; + __u32 length; + __u32 access_flags; + __u32 rkey; + __u32 reserved; + } fast_reg; + struct { + __u32 remote_qpn; + __u32 remote_qkey; + struct pvrdma_av av; + } ud; + } wr; +}; +/* Use pvrdma_sge (ib_sge) for send queue s/g array elements. */ + +/* Completion queue element. */ +struct pvrdma_cqe { + __aligned_u64 wr_id; + __aligned_u64 qp; + __u32 opcode; + __u32 status; + __u32 byte_len; + __be32 imm_data; + __u32 src_qp; + __u32 wc_flags; + __u32 vendor_err; + __u16 pkey_index; + __u16 slid; + __u8 sl; + __u8 dlid_path_bits; + __u8 port_num; + __u8 smac[6]; + __u8 network_hdr_type; + __u8 reserved2[6]; /* Pad to next power of 2 (64). */ +}; + +#endif /* __VMW_PVRDMA_ABI_H__ */ diff --git a/net/rds/Kconfig b/net/rds/Kconfig new file mode 100644 index 0000000..bffde4b --- /dev/null +++ b/net/rds/Kconfig @@ -0,0 +1,27 @@ + +config RDS + tristate "The RDS Protocol" + depends on INET + ---help--- + The RDS (Reliable Datagram Sockets) protocol provides reliable, + sequenced delivery of datagrams over Infiniband or TCP. + +config RDS_RDMA + tristate "RDS over Infiniband" + depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS + ---help--- + Allow RDS to use Infiniband as a transport. + This transport supports RDMA operations. + +config RDS_TCP + tristate "RDS over TCP" + depends on RDS + ---help--- + Allow RDS to use TCP as a transport. + This transport does not support RDMA operations. + +config RDS_DEBUG + bool "RDS debugging messages" + depends on RDS + default n + diff --git a/net/rds/Makefile b/net/rds/Makefile new file mode 100644 index 0000000..b5d568b --- /dev/null +++ b/net/rds/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_RDS) += rds.o +rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ + recv.o send.o stats.o sysctl.o threads.o transport.o \ + loop.o page.o rdma.o + +obj-$(CONFIG_RDS_RDMA) += rds_rdma.o +rds_rdma-y := rdma_transport.o \ + ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ + ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o + + +obj-$(CONFIG_RDS_TCP) += rds_tcp.o +rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \ + tcp_send.o tcp_stats.o + +ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG + diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c new file mode 100644 index 0000000..ab751a1 --- /dev/null +++ b/net/rds/af_rds.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" + +/* this is just used for stats gathering :/ */ +static DEFINE_SPINLOCK(rds_sock_lock); +static unsigned long rds_sock_count; +static LIST_HEAD(rds_sock_list); +DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); + +/* + * This is called as the final descriptor referencing this socket is closed. + * We have to unbind the socket so that another socket can be bound to the + * address it was using. + * + * We have to be careful about racing with the incoming path. sock_orphan() + * sets SOCK_DEAD and we use that as an indicator to the rx path that new + * messages shouldn't be queued. + */ +static int rds_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs; + + if (!sk) + goto out; + + rs = rds_sk_to_rs(sk); + + sock_orphan(sk); + /* Note - rds_clear_recv_queue grabs rs_recv_lock, so + * that ensures the recv path has completed messing + * with the socket. */ + rds_clear_recv_queue(rs); + rds_cong_remove_socket(rs); + + rds_remove_bound(rs); + + rds_send_drop_to(rs, NULL); + rds_rdma_drop_keys(rs); + rds_notify_queue_get(rs, NULL); + rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue); + + spin_lock_bh(&rds_sock_lock); + list_del_init(&rs->rs_item); + rds_sock_count--; + spin_unlock_bh(&rds_sock_lock); + + rds_trans_put(rs->rs_transport); + + sock->sk = NULL; + sock_put(sk); +out: + return 0; +} + +/* + * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. + * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK + * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but + * this seems more conservative. + * NB - normally, one would use sk_callback_lock for this, but we can + * get here from interrupts, whereas the network code grabs sk_callback_lock + * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 
+ */ +void rds_wake_sk_sleep(struct rds_sock *rs) +{ + unsigned long flags; + + read_lock_irqsave(&rs->rs_recv_lock, flags); + __rds_wake_sk_sleep(rds_rs_to_sk(rs)); + read_unlock_irqrestore(&rs->rs_recv_lock, flags); +} + +static int rds_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + + /* racey, don't care */ + if (peer) { + if (!rs->rs_conn_addr) + return -ENOTCONN; + + sin->sin_port = rs->rs_conn_port; + sin->sin_addr.s_addr = rs->rs_conn_addr; + } else { + sin->sin_port = rs->rs_bound_port; + sin->sin_addr.s_addr = rs->rs_bound_addr; + } + + sin->sin_family = AF_INET; + + return sizeof(*sin); +} + +/* + * RDS' poll is without a doubt the least intuitive part of the interface, + * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from + * a network protocol. + * + * EPOLLIN is asserted if + * - there is data on the receive queue. + * - to signal that a previously congested destination may have become + * uncongested + * - A notification has been queued to the socket (this can be a congestion + * update, or a RDMA completion, or a MSG_ZEROCOPY completion). + * + * EPOLLOUT is asserted if there is room on the send queue. This does not mean + * however, that the next sendmsg() call will succeed. If the application tries + * to send to a congested destination, the system call may still fail (and + * return ENOBUFS). + */ +static __poll_t rds_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + __poll_t mask = 0; + unsigned long flags; + + poll_wait(file, sk_sleep(sk), wait); + + if (rs->rs_seen_congestion) + poll_wait(file, &rds_poll_waitq, wait); + + read_lock_irqsave(&rs->rs_recv_lock, flags); + if (!rs->rs_cong_monitor) { + /* When a congestion map was updated, we signal EPOLLIN for + * "historical" reasons. Applications can also poll for + * WRBAND instead. 
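+ *
+ * e.g. (illustrative user-space snippet, not kernel code):
+ *
+ *   struct pollfd pfd = { .fd = rds_fd, .events = POLLIN | POLLWRBAND };
+ *   poll(&pfd, 1, -1);
+ *   if (pfd.revents & POLLWRBAND)
+ *           handle a possible congestion-map update: a previously
+ *           congested destination may be sendable again.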
*/ + if (rds_cong_updated_since(&rs->rs_cong_track)) + mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND); + } else { + spin_lock(&rs->rs_lock); + if (rs->rs_cong_notify) + mask |= (EPOLLIN | EPOLLRDNORM); + spin_unlock(&rs->rs_lock); + } + if (!list_empty(&rs->rs_recv_queue) || + !list_empty(&rs->rs_notify_queue) || + !list_empty(&rs->rs_zcookie_queue.zcookie_head)) + mask |= (EPOLLIN | EPOLLRDNORM); + if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) + mask |= (EPOLLOUT | EPOLLWRNORM); + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + read_unlock_irqrestore(&rs->rs_recv_lock, flags); + + /* clear state any time we wake a seen-congested socket */ + if (mask) + rs->rs_seen_congestion = 0; + + return mask; +} + +static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -ENOIOCTLCMD; +} + +static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, + int len) +{ + struct sockaddr_in sin; + int ret = 0; + + /* racing with another thread binding seems ok here */ + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (len < sizeof(struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (copy_from_user(&sin, optval, sizeof(sin))) { + ret = -EFAULT; + goto out; + } + + rds_send_drop_to(rs, &sin); +out: + return ret; +} + +static int rds_set_bool_option(unsigned char *optvar, char __user *optval, + int optlen) +{ + int value; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(value, (int __user *) optval)) + return -EFAULT; + *optvar = !!value; + return 0; +} + +static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, + int optlen) +{ + int ret; + + ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); + if (ret == 0) { + if (rs->rs_cong_monitor) { + rds_cong_add_socket(rs); + } else { + rds_cong_remove_socket(rs); + rs->rs_cong_mask = 0; + rs->rs_cong_notify = 0; + } + } + return ret; +} + +static int rds_set_transport(struct rds_sock *rs, char __user *optval, + int optlen) +{ + int t_type; + + if (rs->rs_transport) + return -EOPNOTSUPP; /* previously attached to transport */ + + if (optlen != sizeof(int)) + return -EINVAL; + + if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type))) + return -EFAULT; + + if (t_type < 0 || t_type >= RDS_TRANS_COUNT) + return -EINVAL; + + rs->rs_transport = rds_trans_get(t_type); + + return rs->rs_transport ? 0 : -ENOPROTOOPT; +} + +static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, + int optlen) +{ + int val, valbool; + + if (optlen != sizeof(int)) + return -EFAULT; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + valbool = val ? 
1 : 0; + + if (valbool) + sock_set_flag(sk, SOCK_RCVTSTAMP); + else + sock_reset_flag(sk, SOCK_RCVTSTAMP); + + return 0; +} + +static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, + int optlen) +{ + struct rds_rx_trace_so trace; + int i; + + if (optlen != sizeof(struct rds_rx_trace_so)) + return -EFAULT; + + if (copy_from_user(&trace, optval, sizeof(trace))) + return -EFAULT; + + if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) + return -EFAULT; + + rs->rs_rx_traces = trace.rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { + if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) { + rs->rs_rx_traces = 0; + return -EFAULT; + } + rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; + } + + return 0; +} + +static int rds_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + int ret; + + if (level != SOL_RDS) { + ret = -ENOPROTOOPT; + goto out; + } + + switch (optname) { + case RDS_CANCEL_SENT_TO: + ret = rds_cancel_sent_to(rs, optval, optlen); + break; + case RDS_GET_MR: + ret = rds_get_mr(rs, optval, optlen); + break; + case RDS_GET_MR_FOR_DEST: + ret = rds_get_mr_for_dest(rs, optval, optlen); + break; + case RDS_FREE_MR: + ret = rds_free_mr(rs, optval, optlen); + break; + case RDS_RECVERR: + ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); + break; + case RDS_CONG_MONITOR: + ret = rds_cong_monitor(rs, optval, optlen); + break; + case SO_RDS_TRANSPORT: + lock_sock(sock->sk); + ret = rds_set_transport(rs, optval, optlen); + release_sock(sock->sk); + break; + case SO_TIMESTAMP: + lock_sock(sock->sk); + ret = rds_enable_recvtstamp(sock->sk, optval, optlen); + release_sock(sock->sk); + break; + case SO_RDS_MSG_RXPATH_LATENCY: + ret = rds_recv_track_latency(rs, optval, optlen); + break; + default: + ret = -ENOPROTOOPT; + } +out: + return ret; +} + +static int rds_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + int ret = -ENOPROTOOPT, len; + int trans; + + if (level != SOL_RDS) + goto out; + + if (get_user(len, optlen)) { + ret = -EFAULT; + goto out; + } + + switch (optname) { + case RDS_INFO_FIRST ... RDS_INFO_LAST: + ret = rds_info_getsockopt(sock, optname, optval, + optlen); + break; + + case RDS_RECVERR: + if (len < sizeof(int)) + ret = -EINVAL; + else + if (put_user(rs->rs_recverr, (int __user *) optval) || + put_user(sizeof(int), optlen)) + ret = -EFAULT; + else + ret = 0; + break; + case SO_RDS_TRANSPORT: + if (len < sizeof(int)) { + ret = -EINVAL; + break; + } + trans = (rs->rs_transport ? 
rs->rs_transport->t_type : + RDS_TRANS_NONE); /* unbound */ + if (put_user(trans, (int __user *)optval) || + put_user(sizeof(int), optlen)) + ret = -EFAULT; + else + ret = 0; + break; + default: + break; + } + +out: + return ret; + +} + +static int rds_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sk); + int ret = 0; + + lock_sock(sk); + + if (addr_len != sizeof(struct sockaddr_in)) { + ret = -EINVAL; + goto out; + } + + if (sin->sin_family != AF_INET) { + ret = -EAFNOSUPPORT; + goto out; + } + + if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EDESTADDRREQ; + goto out; + } + + rs->rs_conn_addr = sin->sin_addr.s_addr; + rs->rs_conn_port = sin->sin_port; + +out: + release_sock(sk); + return ret; +} + +static struct proto rds_proto = { + .name = "RDS", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rds_sock), +}; + +static const struct proto_ops rds_proto_ops = { + .family = AF_RDS, + .owner = THIS_MODULE, + .release = rds_release, + .bind = rds_bind, + .connect = rds_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = rds_getname, + .poll = rds_poll, + .ioctl = rds_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = rds_setsockopt, + .getsockopt = rds_getsockopt, + .sendmsg = rds_sendmsg, + .recvmsg = rds_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static void rds_sock_destruct(struct sock *sk) +{ + struct rds_sock *rs = rds_sk_to_rs(sk); + + WARN_ON((&rs->rs_item != rs->rs_item.next || + &rs->rs_item != rs->rs_item.prev)); +} + +static int __rds_create(struct socket *sock, struct sock *sk, int protocol) +{ + struct rds_sock *rs; + + sock_init_data(sock, sk); + sock->ops = &rds_proto_ops; + sk->sk_protocol = protocol; + sk->sk_destruct = rds_sock_destruct; + + rs = rds_sk_to_rs(sk); + spin_lock_init(&rs->rs_lock); + rwlock_init(&rs->rs_recv_lock); + INIT_LIST_HEAD(&rs->rs_send_queue); + INIT_LIST_HEAD(&rs->rs_recv_queue); + INIT_LIST_HEAD(&rs->rs_notify_queue); + INIT_LIST_HEAD(&rs->rs_cong_list); + rds_message_zcopy_queue_init(&rs->rs_zcookie_queue); + spin_lock_init(&rs->rs_rdma_lock); + rs->rs_rdma_keys = RB_ROOT; + rs->rs_rx_traces = 0; + + spin_lock_bh(&rds_sock_lock); + list_add_tail(&rs->rs_item, &rds_sock_list); + rds_sock_count++; + spin_unlock_bh(&rds_sock_lock); + + return 0; +} + +static int rds_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + if (sock->type != SOCK_SEQPACKET || protocol) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern); + if (!sk) + return -ENOMEM; + + return __rds_create(sock, sk, protocol); +} + +void rds_sock_addref(struct rds_sock *rs) +{ + sock_hold(rds_rs_to_sk(rs)); +} + +void rds_sock_put(struct rds_sock *rs) +{ + sock_put(rds_rs_to_sk(rs)); +} + +static const struct net_proto_family rds_family_ops = { + .family = AF_RDS, + .create = rds_create, + .owner = THIS_MODULE, +}; + +static void rds_sock_inc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_sock *rs; + struct rds_incoming *inc; + unsigned int total = 0; + + len /= sizeof(struct rds_info_message); + + spin_lock_bh(&rds_sock_lock); + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + read_lock(&rs->rs_recv_lock); + + /* XXX too lazy to maintain counts.. 
*/ + list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { + total++; + if (total <= len) + rds_inc_info_copy(inc, iter, inc->i_saddr, + rs->rs_bound_addr, 1); + } + + read_unlock(&rs->rs_recv_lock); + } + + spin_unlock_bh(&rds_sock_lock); + + lens->nr = total; + lens->each = sizeof(struct rds_info_message); +} + +static void rds_sock_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_info_socket sinfo; + struct rds_sock *rs; + + len /= sizeof(struct rds_info_socket); + + spin_lock_bh(&rds_sock_lock); + + if (len < rds_sock_count) + goto out; + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + sinfo.sndbuf = rds_sk_sndbuf(rs); + sinfo.rcvbuf = rds_sk_rcvbuf(rs); + sinfo.bound_addr = rs->rs_bound_addr; + sinfo.connected_addr = rs->rs_conn_addr; + sinfo.bound_port = rs->rs_bound_port; + sinfo.connected_port = rs->rs_conn_port; + sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); + + rds_info_copy(iter, &sinfo, sizeof(sinfo)); + } + +out: + lens->nr = rds_sock_count; + lens->each = sizeof(struct rds_info_socket); + + spin_unlock_bh(&rds_sock_lock); +} + +static void rds_exit(void) +{ + sock_unregister(rds_family_ops.family); + proto_unregister(&rds_proto); + rds_conn_exit(); + rds_cong_exit(); + rds_sysctl_exit(); + rds_threads_exit(); + rds_stats_exit(); + rds_page_exit(); + rds_bind_lock_destroy(); + rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); + rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); +} +module_exit(rds_exit); + +u32 rds_gen_num; + +static int rds_init(void) +{ + int ret; + + net_get_random_once(&rds_gen_num, sizeof(rds_gen_num)); + + ret = rds_bind_lock_init(); + if (ret) + goto out; + + ret = rds_conn_init(); + if (ret) + goto out_bind; + + ret = rds_threads_init(); + if (ret) + goto out_conn; + ret = rds_sysctl_init(); + if (ret) + goto out_threads; + ret = rds_stats_init(); + if (ret) + goto out_sysctl; + ret = proto_register(&rds_proto, 1); + if (ret) + goto out_stats; + ret = sock_register(&rds_family_ops); + if (ret) + goto out_proto; + + rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); + rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); + + goto out; + +out_proto: + proto_unregister(&rds_proto); +out_stats: + rds_stats_exit(); +out_sysctl: + rds_sysctl_exit(); +out_threads: + rds_threads_exit(); +out_conn: + rds_conn_exit(); + rds_cong_exit(); + rds_page_exit(); +out_bind: + rds_bind_lock_destroy(); +out: + return ret; +} +module_init(rds_init); + +#define DRV_VERSION "4.0" +#define DRV_RELDATE "Feb 12, 2009" + +MODULE_AUTHOR("Oracle Corporation "); +MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" + " v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NETPROTO(PF_RDS); diff --git a/net/rds/bind.c b/net/rds/bind.c new file mode 100644 index 0000000..5aa3a64 --- /dev/null +++ b/net/rds/bind.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include "rds.h" + +static struct rhashtable bind_hash_table; + +static const struct rhashtable_params ht_parms = { + .nelem_hint = 768, + .key_len = sizeof(u64), + .key_offset = offsetof(struct rds_sock, rs_bound_key), + .head_offset = offsetof(struct rds_sock, rs_bound_node), + .max_size = 16384, + .min_size = 1024, +}; + +/* + * Return the rds_sock bound at the given local address. + * + * The rx path can race with rds_release. We notice if rds_release() has + * marked this socket and don't return a rs ref to the rx path. 
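+ *
+ * Lookups key the bind_hash_table on the local address and port packed
+ * into a single u64 (both kept in network byte order), matching
+ * rs_bound_key as set up in rds_add_bound():
+ *
+ *   u64 key = ((u64)addr << 32) | port;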
+ */ +struct rds_sock *rds_find_bound(__be32 addr, __be16 port) +{ + u64 key = ((u64)addr << 32) | port; + struct rds_sock *rs; + + rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms); + if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) + rds_sock_addref(rs); + else + rs = NULL; + + rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, + ntohs(port)); + + return rs; +} + +/* returns -ve errno or +ve port */ +static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) +{ + int ret = -EADDRINUSE; + u16 rover, last; + u64 key; + + if (*port != 0) { + rover = be16_to_cpu(*port); + if (rover == RDS_FLAG_PROBE_PORT) + return -EINVAL; + last = rover; + } else { + rover = max_t(u16, prandom_u32(), 2); + last = rover - 1; + } + + do { + if (rover == 0) + rover++; + + if (rover == RDS_FLAG_PROBE_PORT) + continue; + key = ((u64)addr << 32) | cpu_to_be16(rover); + if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms)) + continue; + + rs->rs_bound_key = key; + rs->rs_bound_addr = addr; + net_get_random_once(&rs->rs_hash_initval, + sizeof(rs->rs_hash_initval)); + rs->rs_bound_port = cpu_to_be16(rover); + rs->rs_bound_node.next = NULL; + rds_sock_addref(rs); + if (!rhashtable_insert_fast(&bind_hash_table, + &rs->rs_bound_node, ht_parms)) { + *port = rs->rs_bound_port; + ret = 0; + rdsdebug("rs %p binding to %pI4:%d\n", + rs, &addr, (int)ntohs(*port)); + break; + } else { + rs->rs_bound_addr = 0; + rds_sock_put(rs); + ret = -ENOMEM; + break; + } + } while (rover++ != last); + + return ret; +} + +void rds_remove_bound(struct rds_sock *rs) +{ + + if (!rs->rs_bound_addr) + return; + + rdsdebug("rs %p unbinding from %pI4:%d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port)); + + rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); + rds_sock_put(rs); + rs->rs_bound_addr = 0; +} + +int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct rds_sock *rs = rds_sk_to_rs(sk); + struct rds_transport *trans; + int ret = 0; + + lock_sock(sk); + + if (addr_len != sizeof(struct sockaddr_in) || + sin->sin_family != AF_INET || + rs->rs_bound_addr || + sin->sin_addr.s_addr == htonl(INADDR_ANY)) { + ret = -EINVAL; + goto out; + } + + ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); + if (ret) + goto out; + + if (rs->rs_transport) { /* previously bound */ + trans = rs->rs_transport; + if (trans->laddr_check(sock_net(sock->sk), + sin->sin_addr.s_addr) != 0) { + ret = -ENOPROTOOPT; + rds_remove_bound(rs); + } else { + ret = 0; + } + goto out; + } + trans = rds_trans_get_preferred(sock_net(sock->sk), + sin->sin_addr.s_addr); + if (!trans) { + ret = -EADDRNOTAVAIL; + rds_remove_bound(rs); + pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n", + __func__, &sin->sin_addr.s_addr); + goto out; + } + + rs->rs_transport = trans; + ret = 0; + +out: + release_sock(sk); + return ret; +} + +void rds_bind_lock_destroy(void) +{ + rhashtable_destroy(&bind_hash_table); +} + +int rds_bind_lock_init(void) +{ + return rhashtable_init(&bind_hash_table, &ht_parms); +} diff --git a/net/rds/cong.c b/net/rds/cong.c new file mode 100644 index 0000000..63da9d2 --- /dev/null +++ b/net/rds/cong.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2007 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" + +/* + * This file implements the receive side of the unconventional congestion + * management in RDS. + * + * Messages waiting in the receive queue on the receiving socket are accounted + * against the sockets SO_RCVBUF option value. Only the payload bytes in the + * message are accounted for. If the number of bytes queued equals or exceeds + * rcvbuf then the socket is congested. All sends attempted to this socket's + * address should return block or return -EWOULDBLOCK. + * + * Applications are expected to be reasonably tuned such that this situation + * very rarely occurs. An application encountering this "back-pressure" is + * considered a bug. + * + * This is implemented by having each node maintain bitmaps which indicate + * which ports on bound addresses are congested. As the bitmap changes it is + * sent through all the connections which terminate in the local address of the + * bitmap which changed. + * + * The bitmaps are allocated as connections are brought up. This avoids + * allocation in the interrupt handling path which queues messages on sockets. + * The dense bitmaps let transports send the entire bitmap on any bitmap change + * reasonably efficiently. This is much easier to implement than some + * finer-grained communication of per-port congestion. The sender does a very + * inexpensive bit test to test if the port it's about to send to is congested + * or not. + */ + +/* + * Interaction with poll is a tad tricky. We want all processes stuck in + * poll to wake up and check whether a congested destination became uncongested. + * The really sad thing is we have no idea which destinations the application + * wants to send to - we don't even know which rds_connections are involved. + * So until we implement a more flexible rds poll interface, we have to make + * do with this: + * We maintain a global counter that is incremented each time a congestion map + * update is received. Each rds socket tracks this value, and if rds_poll + * finds that the saved generation number is smaller than the global generation + * number, it wakes up the process. 
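+ *
+ * Roughly, tying the pieces in this file and af_rds.c together:
+ *
+ *   rds_cong_map_updated():  atomic_inc(&rds_cong_generation);
+ *   rds_poll():              if (rds_cong_updated_since(&rs->rs_cong_track))
+ *                                    mask |= EPOLLIN | EPOLLWRBAND;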
+ */ +static atomic_t rds_cong_generation = ATOMIC_INIT(0); + +/* + * Congestion monitoring + */ +static LIST_HEAD(rds_cong_monitor); +static DEFINE_RWLOCK(rds_cong_monitor_lock); + +/* + * Yes, a global lock. It's used so infrequently that it's worth keeping it + * global to simplify the locking. It's only used in the following + * circumstances: + * + * - on connection buildup to associate a conn with its maps + * - on map changes to inform conns of a new map to send + * + * It's sadly ordered under the socket callback lock and the connection lock. + * Receive paths can mark ports congested from interrupt context so the + * lock masks interrupts. + */ +static DEFINE_SPINLOCK(rds_cong_lock); +static struct rb_root rds_cong_tree = RB_ROOT; + +static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, + struct rds_cong_map *insert) +{ + struct rb_node **p = &rds_cong_tree.rb_node; + struct rb_node *parent = NULL; + struct rds_cong_map *map; + + while (*p) { + parent = *p; + map = rb_entry(parent, struct rds_cong_map, m_rb_node); + + if (addr < map->m_addr) + p = &(*p)->rb_left; + else if (addr > map->m_addr) + p = &(*p)->rb_right; + else + return map; + } + + if (insert) { + rb_link_node(&insert->m_rb_node, parent, p); + rb_insert_color(&insert->m_rb_node, &rds_cong_tree); + } + return NULL; +} + +/* + * There is only ever one bitmap for any address. Connections try and allocate + * these bitmaps in the process getting pointers to them. The bitmaps are only + * ever freed as the module is removed after all connections have been freed. + */ +static struct rds_cong_map *rds_cong_from_addr(__be32 addr) +{ + struct rds_cong_map *map; + struct rds_cong_map *ret = NULL; + unsigned long zp; + unsigned long i; + unsigned long flags; + + map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); + if (!map) + return NULL; + + map->m_addr = addr; + init_waitqueue_head(&map->m_waitq); + INIT_LIST_HEAD(&map->m_conn_list); + + for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { + zp = get_zeroed_page(GFP_KERNEL); + if (zp == 0) + goto out; + map->m_page_addrs[i] = zp; + } + + spin_lock_irqsave(&rds_cong_lock, flags); + ret = rds_cong_tree_walk(addr, map); + spin_unlock_irqrestore(&rds_cong_lock, flags); + + if (!ret) { + ret = map; + map = NULL; + } + +out: + if (map) { + for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) + free_page(map->m_page_addrs[i]); + kfree(map); + } + + rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); + + return ret; +} + +/* + * Put the conn on its local map's list. This is called when the conn is + * really added to the hash. It's nested under the rds_conn_lock, sadly. 
+ */ +void rds_cong_add_conn(struct rds_connection *conn) +{ + unsigned long flags; + + rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong); + spin_lock_irqsave(&rds_cong_lock, flags); + list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list); + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +void rds_cong_remove_conn(struct rds_connection *conn) +{ + unsigned long flags; + + rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong); + spin_lock_irqsave(&rds_cong_lock, flags); + list_del_init(&conn->c_map_item); + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +int rds_cong_get_maps(struct rds_connection *conn) +{ + conn->c_lcong = rds_cong_from_addr(conn->c_laddr); + conn->c_fcong = rds_cong_from_addr(conn->c_faddr); + + if (!(conn->c_lcong && conn->c_fcong)) + return -ENOMEM; + + return 0; +} + +void rds_cong_queue_updates(struct rds_cong_map *map) +{ + struct rds_connection *conn; + unsigned long flags; + + spin_lock_irqsave(&rds_cong_lock, flags); + + list_for_each_entry(conn, &map->m_conn_list, c_map_item) { + struct rds_conn_path *cp = &conn->c_path[0]; + + rcu_read_lock(); + if (!test_and_set_bit(0, &conn->c_map_queued) && + !rds_destroy_pending(cp->cp_conn)) { + rds_stats_inc(s_cong_update_queued); + /* We cannot inline the call to rds_send_xmit() here + * for two reasons (both pertaining to a TCP transport): + * 1. When we get here from the receive path, we + * are already holding the sock_lock (held by + * tcp_v4_rcv()). So inlining calls to + * tcp_setsockopt and/or tcp_sendmsg will deadlock + * when it tries to get the sock_lock()) + * 2. Interrupts are masked so that we can mark the + * the port congested from both send and recv paths. + * (See comment around declaration of rdc_cong_lock). + * An attempt to get the sock_lock() here will + * therefore trigger warnings. + * Defer the xmit to rds_send_worker() instead. + */ + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + } + rcu_read_unlock(); + } + + spin_unlock_irqrestore(&rds_cong_lock, flags); +} + +void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) +{ + rdsdebug("waking map %p for %pI4\n", + map, &map->m_addr); + rds_stats_inc(s_cong_update_received); + atomic_inc(&rds_cong_generation); + if (waitqueue_active(&map->m_waitq)) + wake_up(&map->m_waitq); + if (waitqueue_active(&rds_poll_waitq)) + wake_up_all(&rds_poll_waitq); + + if (portmask && !list_empty(&rds_cong_monitor)) { + unsigned long flags; + struct rds_sock *rs; + + read_lock_irqsave(&rds_cong_monitor_lock, flags); + list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) { + spin_lock(&rs->rs_lock); + rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); + rs->rs_cong_mask &= ~portmask; + spin_unlock(&rs->rs_lock); + if (rs->rs_cong_notify) + rds_wake_sk_sleep(rs); + } + read_unlock_irqrestore(&rds_cong_monitor_lock, flags); + } +} +EXPORT_SYMBOL_GPL(rds_cong_map_updated); + +int rds_cong_updated_since(unsigned long *recent) +{ + unsigned long gen = atomic_read(&rds_cong_generation); + + if (likely(*recent == gen)) + return 0; + *recent = gen; + return 1; +} + +/* + * We're called under the locking that protects the sockets receive buffer + * consumption. This makes it a lot easier for the caller to only call us + * when it knows that an existing set bit needs to be cleared, and vice versa. + * We can't block and we need to deal with concurrent sockets working against + * the same per-address map. 
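+ *
+ * The bit for a port lives at a fixed position in the per-address map:
+ *
+ *   i   = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;    (page index)
+ *   off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;    (bit in page)
+ *
+ * which is what rds_cong_set_bit(), rds_cong_clear_bit() and
+ * rds_cong_test_bit() below compute.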
+ */ +void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + rdsdebug("setting congestion for %pI4:%u in map %p\n", + &map->m_addr, ntohs(port), map); + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + set_bit_le(off, (void *)map->m_page_addrs[i]); +} + +void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + rdsdebug("clearing congestion for %pI4:%u in map %p\n", + &map->m_addr, ntohs(port), map); + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + clear_bit_le(off, (void *)map->m_page_addrs[i]); +} + +static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) +{ + unsigned long i; + unsigned long off; + + i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; + off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; + + return test_bit_le(off, (void *)map->m_page_addrs[i]); +} + +void rds_cong_add_socket(struct rds_sock *rs) +{ + unsigned long flags; + + write_lock_irqsave(&rds_cong_monitor_lock, flags); + if (list_empty(&rs->rs_cong_list)) + list_add(&rs->rs_cong_list, &rds_cong_monitor); + write_unlock_irqrestore(&rds_cong_monitor_lock, flags); +} + +void rds_cong_remove_socket(struct rds_sock *rs) +{ + unsigned long flags; + struct rds_cong_map *map; + + write_lock_irqsave(&rds_cong_monitor_lock, flags); + list_del_init(&rs->rs_cong_list); + write_unlock_irqrestore(&rds_cong_monitor_lock, flags); + + /* update congestion map for now-closed port */ + spin_lock_irqsave(&rds_cong_lock, flags); + map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); + spin_unlock_irqrestore(&rds_cong_lock, flags); + + if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { + rds_cong_clear_bit(map, rs->rs_bound_port); + rds_cong_queue_updates(map); + } +} + +int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, + struct rds_sock *rs) +{ + if (!rds_cong_test_bit(map, port)) + return 0; + if (nonblock) { + if (rs && rs->rs_cong_monitor) { + unsigned long flags; + + /* It would have been nice to have an atomic set_bit on + * a uint64_t. */ + spin_lock_irqsave(&rs->rs_lock, flags); + rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port)); + spin_unlock_irqrestore(&rs->rs_lock, flags); + + /* Test again - a congestion update may have arrived in + * the meantime. */ + if (!rds_cong_test_bit(map, port)) + return 0; + } + rds_stats_inc(s_cong_send_error); + return -ENOBUFS; + } + + rds_stats_inc(s_cong_send_blocked); + rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port)); + + return wait_event_interruptible(map->m_waitq, + !rds_cong_test_bit(map, port)); +} + +void rds_cong_exit(void) +{ + struct rb_node *node; + struct rds_cong_map *map; + unsigned long i; + + while ((node = rb_first(&rds_cong_tree))) { + map = rb_entry(node, struct rds_cong_map, m_rb_node); + rdsdebug("freeing map %p\n", map); + rb_erase(&map->m_rb_node, &rds_cong_tree); + for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) + free_page(map->m_page_addrs[i]); + kfree(map); + } +} + +/* + * Allocate a RDS message containing a congestion update. 
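+ *
+ * The update message simply maps the map's own pages (no copy) via
+ * rds_message_map_pages() and is flagged RDS_FLAG_CONG_BITMAP, so the
+ * peer's receive path treats it as a congestion bitmap rather than
+ * application payload.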
+ */ +struct rds_message *rds_cong_update_alloc(struct rds_connection *conn) +{ + struct rds_cong_map *map = conn->c_lcong; + struct rds_message *rm; + + rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES); + if (!IS_ERR(rm)) + rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP; + + return rm; +} diff --git a/net/rds/connection.c b/net/rds/connection.c new file mode 100644 index 0000000..abef75d --- /dev/null +++ b/net/rds/connection.c @@ -0,0 +1,751 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" +#include "loop.h" + +#define RDS_CONNECTION_HASH_BITS 12 +#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) +#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) + +/* converting this to RCU is a chore for another day.. 
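+ *
+ * Connections live in a hash of RDS_CONNECTION_HASH_ENTRIES (4096)
+ * buckets, with the bucket chosen from the local/remote address pair
+ * (see rds_conn_bucket() below):
+ *
+ *   hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
+ *                         be32_to_cpu(faddr), 0, rds_hash_secret);
+ *   head = &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
+ *
+ * In this tree the read side already walks the chains under
+ * rcu_read_lock() (see rds_conn_lookup()), while insert and remove are
+ * serialized by rds_conn_lock.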
*/ +static DEFINE_SPINLOCK(rds_conn_lock); +static unsigned long rds_conn_count; +static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; +static struct kmem_cache *rds_conn_slab; + +static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) +{ + static u32 rds_hash_secret __read_mostly; + + unsigned long hash; + + net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); + + /* Pass NULL, don't need struct net for hash */ + hash = __inet_ehashfn(be32_to_cpu(laddr), 0, + be32_to_cpu(faddr), 0, + rds_hash_secret); + return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; +} + +#define rds_conn_info_set(var, test, suffix) do { \ + if (test) \ + var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ +} while (0) + +/* rcu read lock must be held or the connection spinlock */ +static struct rds_connection *rds_conn_lookup(struct net *net, + struct hlist_head *head, + __be32 laddr, __be32 faddr, + struct rds_transport *trans) +{ + struct rds_connection *conn, *ret = NULL; + + hlist_for_each_entry_rcu(conn, head, c_hash_node) { + if (conn->c_faddr == faddr && conn->c_laddr == laddr && + conn->c_trans == trans && net == rds_conn_net(conn)) { + ret = conn; + break; + } + } + rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, + &laddr, &faddr); + return ret; +} + +/* + * This is called by transports as they're bringing down a connection. + * It clears partial message state so that the transport can start sending + * and receiving over this connection again in the future. It is up to + * the transport to have serialized this call with its send and recv. + */ +static void rds_conn_path_reset(struct rds_conn_path *cp) +{ + struct rds_connection *conn = cp->cp_conn; + + rdsdebug("connection %pI4 to %pI4 reset\n", + &conn->c_laddr, &conn->c_faddr); + + rds_stats_inc(s_conn_reset); + rds_send_path_reset(cp); + cp->cp_flags = 0; + + /* Do not clear next_rx_seq here, else we cannot distinguish + * retransmitted packets from new packets, and will hand all + * of them to the application. That is not consistent with the + * reliability guarantees of RDS. */ +} + +static void __rds_conn_path_init(struct rds_connection *conn, + struct rds_conn_path *cp, bool is_outgoing) +{ + spin_lock_init(&cp->cp_lock); + cp->cp_next_tx_seq = 1; + init_waitqueue_head(&cp->cp_waitq); + INIT_LIST_HEAD(&cp->cp_send_queue); + INIT_LIST_HEAD(&cp->cp_retrans); + + cp->cp_conn = conn; + atomic_set(&cp->cp_state, RDS_CONN_DOWN); + cp->cp_send_gen = 0; + cp->cp_reconnect_jiffies = 0; + INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); + INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); + INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); + INIT_WORK(&cp->cp_down_w, rds_shutdown_worker); + mutex_init(&cp->cp_cm_lock); + cp->cp_flags = 0; +} + +/* + * There is only every one 'conn' for a given pair of addresses in the + * system at a time. They contain messages to be retransmitted and so + * span the lifetime of the actual underlying transport connections. + * + * For now they are not garbage collected once they're created. They + * are torn down as the module is removed, if ever. + */ +static struct rds_connection *__rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp, + int is_outgoing) +{ + struct rds_connection *conn, *parent = NULL; + struct hlist_head *head = rds_conn_bucket(laddr, faddr); + struct rds_transport *loop_trans; + unsigned long flags; + int ret, i; + int npaths = (trans->t_mp_capable ? 
RDS_MPATH_WORKERS : 1); + + rcu_read_lock(); + conn = rds_conn_lookup(net, head, laddr, faddr, trans); + if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && + laddr == faddr && !is_outgoing) { + /* This is a looped back IB connection, and we're + * called by the code handling the incoming connect. + * We need a second connection object into which we + * can stick the other QP. */ + parent = conn; + conn = parent->c_passive; + } + rcu_read_unlock(); + if (conn) + goto out; + + conn = kmem_cache_zalloc(rds_conn_slab, gfp); + if (!conn) { + conn = ERR_PTR(-ENOMEM); + goto out; + } + conn->c_path = kcalloc(npaths, sizeof(struct rds_conn_path), gfp); + if (!conn->c_path) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(-ENOMEM); + goto out; + } + + INIT_HLIST_NODE(&conn->c_hash_node); + conn->c_laddr = laddr; + conn->c_faddr = faddr; + + rds_conn_net_set(conn, net); + + ret = rds_cong_get_maps(conn); + if (ret) { + kfree(conn->c_path); + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(ret); + goto out; + } + + /* + * This is where a connection becomes loopback. If *any* RDS sockets + * can bind to the destination address then we'd rather the messages + * flow through loopback rather than either transport. + */ + loop_trans = rds_trans_get_preferred(net, faddr); + if (loop_trans) { + rds_trans_put(loop_trans); + conn->c_loopback = 1; + if (is_outgoing && trans->t_prefer_loopback) { + /* "outgoing" connection - and the transport + * says it wants the connection handled by the + * loopback transport. This is what TCP does. + */ + trans = &rds_loop_transport; + } + } + + conn->c_trans = trans; + + init_waitqueue_head(&conn->c_hs_waitq); + for (i = 0; i < npaths; i++) { + __rds_conn_path_init(conn, &conn->c_path[i], + is_outgoing); + conn->c_path[i].cp_index = i; + } + rcu_read_lock(); + if (rds_destroy_pending(conn)) + ret = -ENETDOWN; + else + ret = trans->conn_alloc(conn, GFP_ATOMIC); + if (ret) { + rcu_read_unlock(); + kfree(conn->c_path); + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(ret); + goto out; + } + + rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", + conn, &laddr, &faddr, + strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : + "[unknown]", is_outgoing ? "(outgoing)" : ""); + + /* + * Since we ran without holding the conn lock, someone could + * have created the same conn (either normal or passive) in the + * interim. We check while holding the lock. If we won, we complete + * init and return our conn. If we lost, we rollback and return the + * other one. + */ + spin_lock_irqsave(&rds_conn_lock, flags); + if (parent) { + /* Creating passive conn */ + if (parent->c_passive) { + trans->conn_free(conn->c_path[0].cp_transport_data); + kfree(conn->c_path); + kmem_cache_free(rds_conn_slab, conn); + conn = parent->c_passive; + } else { + parent->c_passive = conn; + rds_cong_add_conn(conn); + rds_conn_count++; + } + } else { + /* Creating normal conn */ + struct rds_connection *found; + + found = rds_conn_lookup(net, head, laddr, faddr, trans); + if (found) { + struct rds_conn_path *cp; + int i; + + for (i = 0; i < npaths; i++) { + cp = &conn->c_path[i]; + /* The ->conn_alloc invocation may have + * allocated resource for all paths, so all + * of them may have to be freed here. 
+ */ + if (cp->cp_transport_data) + trans->conn_free(cp->cp_transport_data); + } + kfree(conn->c_path); + kmem_cache_free(rds_conn_slab, conn); + conn = found; + } else { + conn->c_my_gen_num = rds_gen_num; + conn->c_peer_gen_num = 0; + hlist_add_head_rcu(&conn->c_hash_node, head); + rds_cong_add_conn(conn); + rds_conn_count++; + } + } + spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); + +out: + return conn; +} + +struct rds_connection *rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp) +{ + return __rds_conn_create(net, laddr, faddr, trans, gfp, 0); +} +EXPORT_SYMBOL_GPL(rds_conn_create); + +struct rds_connection *rds_conn_create_outgoing(struct net *net, + __be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp) +{ + return __rds_conn_create(net, laddr, faddr, trans, gfp, 1); +} +EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); + +void rds_conn_shutdown(struct rds_conn_path *cp) +{ + struct rds_connection *conn = cp->cp_conn; + + /* shut it down unless it's down already */ + if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_lock(&cp->cp_cm_lock); + if (!rds_conn_path_transition(cp, RDS_CONN_UP, + RDS_CONN_DISCONNECTING) && + !rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_DISCONNECTING)) { + rds_conn_path_error(cp, + "shutdown called in state %d\n", + atomic_read(&cp->cp_state)); + mutex_unlock(&cp->cp_cm_lock); + return; + } + mutex_unlock(&cp->cp_cm_lock); + + wait_event(cp->cp_waitq, + !test_bit(RDS_IN_XMIT, &cp->cp_flags)); + wait_event(cp->cp_waitq, + !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); + + conn->c_trans->conn_path_shutdown(cp); + rds_conn_path_reset(cp); + + if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING, + RDS_CONN_DOWN) && + !rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_DOWN)) { + /* This can happen - eg when we're in the middle of tearing + * down the connection, and someone unloads the rds module. + * Quite reproducible with loopback connections. + * Mostly harmless. + * + * Note that this also happens with rds-tcp because + * we could have triggered rds_conn_path_drop in irq + * mode from rds_tcp_state change on the receipt of + * a FIN, thus we need to recheck for RDS_CONN_ERROR + * here. + */ + rds_conn_path_error(cp, "%s: failed to transition " + "to state DOWN, current state " + "is %d\n", __func__, + atomic_read(&cp->cp_state)); + return; + } + } + + /* Then reconnect if it's still live. + * The passive side of an IB loopback connection is never added + * to the conn hash, so we never trigger a reconnect on this + * conn - the reconnect is always triggered by the active peer. */ + cancel_delayed_work_sync(&cp->cp_conn_w); + rcu_read_lock(); + if (!hlist_unhashed(&conn->c_hash_node)) { + rcu_read_unlock(); + rds_queue_reconnect(cp); + } else { + rcu_read_unlock(); + } +} + +/* destroy a single rds_conn_path. 
rds_conn_destroy() iterates over + * all paths using rds_conn_path_destroy() + */ +static void rds_conn_path_destroy(struct rds_conn_path *cp) +{ + struct rds_message *rm, *rtmp; + + if (!cp->cp_transport_data) + return; + + /* make sure lingering queued work won't try to ref the conn */ + cancel_delayed_work_sync(&cp->cp_send_w); + cancel_delayed_work_sync(&cp->cp_recv_w); + + rds_conn_path_drop(cp, true); + flush_work(&cp->cp_down_w); + + /* tear down queued messages */ + list_for_each_entry_safe(rm, rtmp, + &cp->cp_send_queue, + m_conn_item) { + list_del_init(&rm->m_conn_item); + BUG_ON(!list_empty(&rm->m_sock_item)); + rds_message_put(rm); + } + if (cp->cp_xmit_rm) + rds_message_put(cp->cp_xmit_rm); + + WARN_ON(delayed_work_pending(&cp->cp_send_w)); + WARN_ON(delayed_work_pending(&cp->cp_recv_w)); + WARN_ON(delayed_work_pending(&cp->cp_conn_w)); + WARN_ON(work_pending(&cp->cp_down_w)); + + cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); +} + +/* + * Stop and free a connection. + * + * This can only be used in very limited circumstances. It assumes that once + * the conn has been shutdown that no one else is referencing the connection. + * We can only ensure this in the rmmod path in the current code. + */ +void rds_conn_destroy(struct rds_connection *conn) +{ + unsigned long flags; + int i; + struct rds_conn_path *cp; + int npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); + + rdsdebug("freeing conn %p for %pI4 -> " + "%pI4\n", conn, &conn->c_laddr, + &conn->c_faddr); + + /* Ensure conn will not be scheduled for reconnect */ + spin_lock_irq(&rds_conn_lock); + hlist_del_init_rcu(&conn->c_hash_node); + spin_unlock_irq(&rds_conn_lock); + synchronize_rcu(); + + /* shut the connection down */ + for (i = 0; i < npaths; i++) { + cp = &conn->c_path[i]; + rds_conn_path_destroy(cp); + BUG_ON(!list_empty(&cp->cp_retrans)); + } + + /* + * The congestion maps aren't freed up here. They're + * freed by rds_cong_exit() after all the connections + * have been freed. + */ + rds_cong_remove_conn(conn); + + kfree(conn->c_path); + kmem_cache_free(rds_conn_slab, conn); + + spin_lock_irqsave(&rds_conn_lock, flags); + rds_conn_count--; + spin_unlock_irqrestore(&rds_conn_lock, flags); +} +EXPORT_SYMBOL_GPL(rds_conn_destroy); + +static void rds_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + struct hlist_head *head; + struct list_head *list; + struct rds_connection *conn; + struct rds_message *rm; + unsigned int total = 0; + unsigned long flags; + size_t i; + int j; + + len /= sizeof(struct rds_info_message); + + rcu_read_lock(); + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { + struct rds_conn_path *cp; + int npaths; + + npaths = (conn->c_trans->t_mp_capable ? + RDS_MPATH_WORKERS : 1); + + for (j = 0; j < npaths; j++) { + cp = &conn->c_path[j]; + if (want_send) + list = &cp->cp_send_queue; + else + list = &cp->cp_retrans; + + spin_lock_irqsave(&cp->cp_lock, flags); + + /* XXX too lazy to maintain counts.. 
*/ + list_for_each_entry(rm, list, m_conn_item) { + total++; + if (total <= len) + rds_inc_info_copy(&rm->m_inc, + iter, + conn->c_laddr, + conn->c_faddr, + 0); + } + + spin_unlock_irqrestore(&cp->cp_lock, flags); + } + } + } + rcu_read_unlock(); + + lens->nr = total; + lens->each = sizeof(struct rds_info_message); +} + +static void rds_conn_message_info_send(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_conn_message_info(sock, len, iter, lens, 1); +} + +static void rds_conn_message_info_retrans(struct socket *sock, + unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_conn_message_info(sock, len, iter, lens, 0); +} + +void rds_for_each_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_connection *, void *), + u64 *buffer, + size_t item_len) +{ + struct hlist_head *head; + struct rds_connection *conn; + size_t i; + + rcu_read_lock(); + + lens->nr = 0; + lens->each = item_len; + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { + + /* XXX no c_lock usage.. */ + if (!visitor(conn, buffer)) + continue; + + /* We copy as much as we can fit in the buffer, + * but we count all items so that the caller + * can resize the buffer. */ + if (len >= item_len) { + rds_info_copy(iter, buffer, item_len); + len -= item_len; + } + lens->nr++; + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rds_for_each_conn_info); + +static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_conn_path *, void *), + u64 *buffer, + size_t item_len) +{ + struct hlist_head *head; + struct rds_connection *conn; + size_t i; + int j; + + rcu_read_lock(); + + lens->nr = 0; + lens->each = item_len; + + for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); + i++, head++) { + hlist_for_each_entry_rcu(conn, head, c_hash_node) { + struct rds_conn_path *cp; + int npaths; + + npaths = (conn->c_trans->t_mp_capable ? + RDS_MPATH_WORKERS : 1); + for (j = 0; j < npaths; j++) { + cp = &conn->c_path[j]; + + /* XXX no cp_lock usage.. */ + if (!visitor(cp, buffer)) + continue; + } + + /* We copy as much as we can fit in the buffer, + * but we count all items so that the caller + * can resize the buffer. 
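+ *
+ * i.e. lens->nr always reflects the full item count even when the
+ * supplied buffer was too small, so an rds-info getsockopt caller can
+ * size a retry buffer as lens->nr * lens->each.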
+ */ + if (len >= item_len) { + rds_info_copy(iter, buffer, item_len); + len -= item_len; + } + lens->nr++; + } + } + rcu_read_unlock(); +} + +static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) +{ + struct rds_info_connection *cinfo = buffer; + + cinfo->next_tx_seq = cp->cp_next_tx_seq; + cinfo->next_rx_seq = cp->cp_next_rx_seq; + cinfo->laddr = cp->cp_conn->c_laddr; + cinfo->faddr = cp->cp_conn->c_faddr; + strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name, + sizeof(cinfo->transport)); + cinfo->flags = 0; + + rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), + SENDING); + /* XXX Future: return the state rather than these funky bits */ + rds_conn_info_set(cinfo->flags, + atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, + CONNECTING); + rds_conn_info_set(cinfo->flags, + atomic_read(&cp->cp_state) == RDS_CONN_UP, + CONNECTED); + return 1; +} + +static void rds_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8]; + + rds_walk_conn_path_info(sock, len, iter, lens, + rds_conn_info_visitor, + buffer, + sizeof(struct rds_info_connection)); +} + +int rds_conn_init(void) +{ + rds_conn_slab = kmem_cache_create("rds_connection", + sizeof(struct rds_connection), + 0, 0, NULL); + if (!rds_conn_slab) + return -ENOMEM; + + rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); + rds_info_register_func(RDS_INFO_SEND_MESSAGES, + rds_conn_message_info_send); + rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, + rds_conn_message_info_retrans); + + return 0; +} + +void rds_conn_exit(void) +{ + rds_loop_exit(); + + WARN_ON(!hlist_empty(rds_conn_hash)); + + kmem_cache_destroy(rds_conn_slab); + + rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); + rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, + rds_conn_message_info_send); + rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, + rds_conn_message_info_retrans); +} + +/* + * Force a disconnect + */ +void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) +{ + atomic_set(&cp->cp_state, RDS_CONN_ERROR); + + rcu_read_lock(); + if (!destroy && rds_destroy_pending(cp->cp_conn)) { + rcu_read_unlock(); + return; + } + queue_work(rds_wq, &cp->cp_down_w); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rds_conn_path_drop); + +void rds_conn_drop(struct rds_connection *conn) +{ + WARN_ON(conn->c_trans->t_mp_capable); + rds_conn_path_drop(&conn->c_path[0], false); +} +EXPORT_SYMBOL_GPL(rds_conn_drop); + +/* + * If the connection is down, trigger a connect. We may have scheduled a + * delayed reconnect however - in this case we should not interfere. + */ +void rds_conn_path_connect_if_down(struct rds_conn_path *cp) +{ + rcu_read_lock(); + if (rds_destroy_pending(cp->cp_conn)) { + rcu_read_unlock(); + return; + } + if (rds_conn_path_state(cp) == RDS_CONN_DOWN && + !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) + queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); + +void rds_conn_connect_if_down(struct rds_connection *conn) +{ + WARN_ON(conn->c_trans->t_mp_capable); + rds_conn_path_connect_if_down(&conn->c_path[0]); +} +EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); + +void +__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); + + rds_conn_path_drop(cp, false); +} diff --git a/net/rds/ib.c b/net/rds/ib.c new file mode 100644 index 0000000..02deee2 --- /dev/null +++ b/net/rds/ib.c @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rds_single_path.h" +#include "rds.h" +#include "ib.h" +#include "ib_mr.h" + +static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; +static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; +unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; +static atomic_t rds_ib_unloading; + +module_param(rds_ib_mr_1m_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA"); +module_param(rds_ib_mr_8k_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA"); +module_param(rds_ib_retry_count, int, 0444); +MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); + +/* + * we have a clumsy combination of RCU and a rwsem protecting this list + * because it is used both in the get_mr fast path and while blocking in + * the FMR flushing path. + */ +DECLARE_RWSEM(rds_ib_devices_lock); +struct list_head rds_ib_devices; + +/* NOTE: if also grabbing ibdev lock, grab this first */ +DEFINE_SPINLOCK(ib_nodev_conns_lock); +LIST_HEAD(ib_nodev_conns); + +static void rds_ib_nodev_connect(void) +{ + struct rds_ib_connection *ic; + + spin_lock(&ib_nodev_conns_lock); + list_for_each_entry(ic, &ib_nodev_conns, ib_node) + rds_conn_connect_if_down(ic->conn); + spin_unlock(&ib_nodev_conns_lock); +} + +static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_connection *ic; + unsigned long flags; + + spin_lock_irqsave(&rds_ibdev->spinlock, flags); + list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) + rds_conn_drop(ic->conn); + spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); +} + +/* + * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references + * from interrupt context so we push freing off into a work struct in krdsd. 
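+ * rds_ib_dev_put() queues free_work once the last reference is dropped, and
+ * rds_ib_dev_free() below then releases the MR pools, the PD and the device
+ * structure itself in process context.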
+ */ +static void rds_ib_dev_free(struct work_struct *work) +{ + struct rds_ib_ipaddr *i_ipaddr, *i_next; + struct rds_ib_device *rds_ibdev = container_of(work, + struct rds_ib_device, free_work); + + if (rds_ibdev->mr_8k_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool); + if (rds_ibdev->mr_1m_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); + if (rds_ibdev->pd) + ib_dealloc_pd(rds_ibdev->pd); + + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + } + + kfree(rds_ibdev->vector_load); + + kfree(rds_ibdev); +} + +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) +{ + BUG_ON(refcount_read(&rds_ibdev->refcount) == 0); + if (refcount_dec_and_test(&rds_ibdev->refcount)) + queue_work(rds_wq, &rds_ibdev->free_work); +} + +static void rds_ib_add_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + bool has_fr, has_fmr; + + /* Only handle IB (no iWARP) devices */ + if (device->node_type != RDMA_NODE_IB_CA) + return; + + rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, + ibdev_to_node(device)); + if (!rds_ibdev) + return; + + spin_lock_init(&rds_ibdev->spinlock); + refcount_set(&rds_ibdev->refcount, 1); + INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); + + rds_ibdev->max_wrs = device->attrs.max_qp_wr; + rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); + + has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + rds_ibdev->use_fastreg = (has_fr && !has_fmr); + + rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; + rds_ibdev->max_1m_mrs = device->attrs.max_mr ? + min_t(unsigned int, (device->attrs.max_mr / 2), + rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size; + + rds_ibdev->max_8k_mrs = device->attrs.max_mr ? + min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), + rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size; + + rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; + + rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors, + GFP_KERNEL); + if (!rds_ibdev->vector_load) { + pr_err("RDS/IB: %s failed to allocate vector memory\n", + __func__); + goto put_dev; + } + + rds_ibdev->dev = device; + rds_ibdev->pd = ib_alloc_pd(device, 0); + if (IS_ERR(rds_ibdev->pd)) { + rds_ibdev->pd = NULL; + goto put_dev; + } + + rds_ibdev->mr_1m_pool = + rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); + if (IS_ERR(rds_ibdev->mr_1m_pool)) { + rds_ibdev->mr_1m_pool = NULL; + goto put_dev; + } + + rds_ibdev->mr_8k_pool = + rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL); + if (IS_ERR(rds_ibdev->mr_8k_pool)) { + rds_ibdev->mr_8k_pool = NULL; + goto put_dev; + } + + rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", + device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs, + rds_ibdev->max_8k_mrs); + + pr_info("RDS/IB: %s: %s supported and preferred\n", + device->name, + rds_ibdev->use_fastreg ? 
"FRMR" : "FMR"); + + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + + down_write(&rds_ib_devices_lock); + list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); + up_write(&rds_ib_devices_lock); + refcount_inc(&rds_ibdev->refcount); + + ib_set_client_data(device, &rds_ib_client, rds_ibdev); + refcount_inc(&rds_ibdev->refcount); + + rds_ib_nodev_connect(); + +put_dev: + rds_ib_dev_put(rds_ibdev); +} + +/* + * New connections use this to find the device to associate with the + * connection. It's not in the fast path so we're not concerned about the + * performance of the IB call. (As of this writing, it uses an interrupt + * blocking spinlock to serialize walking a per-device list of all registered + * clients.) + * + * RCU is used to handle incoming connections racing with device teardown. + * Rather than use a lock to serialize removal from the client_data and + * getting a new reference, we use an RCU grace period. The destruction + * path removes the device from client_data and then waits for all RCU + * readers to finish. + * + * A new connection can get NULL from this if its arriving on a + * device that is in the process of being removed. + */ +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + + rcu_read_lock(); + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (rds_ibdev) + refcount_inc(&rds_ibdev->refcount); + rcu_read_unlock(); + return rds_ibdev; +} + +/* + * The IB stack is letting us know that a device is going away. This can + * happen if the underlying HCA driver is removed or if PCI hotplug is removing + * the pci function, for example. + * + * This can be called at any time and can be racing with any other RDS path. + */ +static void rds_ib_remove_one(struct ib_device *device, void *client_data) +{ + struct rds_ib_device *rds_ibdev = client_data; + + if (!rds_ibdev) + return; + + rds_ib_dev_shutdown(rds_ibdev); + + /* stop connection attempts from getting a reference to this device. */ + ib_set_client_data(device, &rds_ib_client, NULL); + + down_write(&rds_ib_devices_lock); + list_del_rcu(&rds_ibdev->list); + up_write(&rds_ib_devices_lock); + + /* + * This synchronize rcu is waiting for readers of both the ib + * client data and the devices list to finish before we drop + * both of those references. 
+ */ + synchronize_rcu(); + rds_ib_dev_put(rds_ibdev); + rds_ib_dev_put(rds_ibdev); +} + +struct ib_client rds_ib_client = { + .name = "rds_ib", + .add = rds_ib_add_one, + .remove = rds_ib_remove_one +}; + +static int rds_ib_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_rdma_connection *iinfo = buffer; + struct rds_ib_connection *ic; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_ib_transport) + return 0; + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = conn->c_faddr; + + memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); + memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_ib_device *rds_ibdev; + + ic = conn->c_transport_data; + + rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid, + (union ib_gid *)&iinfo->dst_gid); + + rds_ibdev = ic->rds_ibdev; + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_ibdev->max_sge; + rds_ib_get_mr_info(rds_ibdev, iinfo); + } + return 1; +} + +static void rds_ib_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + u64 buffer[(sizeof(struct rds_info_rdma_connection) + 7) / 8]; + + rds_for_each_conn_info(sock, len, iter, lens, + rds_ib_conn_info_visitor, + buffer, + sizeof(struct rds_info_rdma_connection)); +} + + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + */ +static int rds_ib_laddr_check(struct net *net, __be32 addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + /* Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. + */ + cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, + NULL, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* due to this, we will claim to support iWARP devices unless we + check node_type. */ + if (ret || !cm_id->device || + cm_id->device->node_type != RDMA_NODE_IB_CA) + ret = -EADDRNOTAVAIL; + + rdsdebug("addr %pI4 ret %d node type %d\n", + &addr, ret, + cm_id->device ? 
cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return ret; +} + +static void rds_ib_unregister_client(void) +{ + ib_unregister_client(&rds_ib_client); + /* wait for rds_ib_dev_free() to complete */ + flush_workqueue(rds_wq); +} + +static void rds_ib_set_unloading(void) +{ + atomic_set(&rds_ib_unloading, 1); +} + +static bool rds_ib_is_unloading(struct rds_connection *conn) +{ + struct rds_conn_path *cp = &conn->c_path[0]; + + return (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags) || + atomic_read(&rds_ib_unloading) != 0); +} + +void rds_ib_exit(void) +{ + rds_ib_set_unloading(); + synchronize_rcu(); + rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_ib_unregister_client(); + rds_ib_destroy_nodev_conns(); + rds_ib_sysctl_exit(); + rds_ib_recv_exit(); + rds_trans_unregister(&rds_ib_transport); + rds_ib_mr_exit(); +} + +struct rds_transport rds_ib_transport = { + .laddr_check = rds_ib_laddr_check, + .xmit_path_complete = rds_ib_xmit_path_complete, + .xmit = rds_ib_xmit, + .xmit_rdma = rds_ib_xmit_rdma, + .xmit_atomic = rds_ib_xmit_atomic, + .recv_path = rds_ib_recv_path, + .conn_alloc = rds_ib_conn_alloc, + .conn_free = rds_ib_conn_free, + .conn_path_connect = rds_ib_conn_path_connect, + .conn_path_shutdown = rds_ib_conn_path_shutdown, + .inc_copy_to_user = rds_ib_inc_copy_to_user, + .inc_free = rds_ib_inc_free, + .cm_initiate_connect = rds_ib_cm_initiate_connect, + .cm_handle_connect = rds_ib_cm_handle_connect, + .cm_connect_complete = rds_ib_cm_connect_complete, + .stats_info_copy = rds_ib_stats_info_copy, + .exit = rds_ib_exit, + .get_mr = rds_ib_get_mr, + .sync_mr = rds_ib_sync_mr, + .free_mr = rds_ib_free_mr, + .flush_mrs = rds_ib_flush_mrs, + .t_owner = THIS_MODULE, + .t_name = "infiniband", + .t_unloading = rds_ib_is_unloading, + .t_type = RDS_TRANS_IB +}; + +int rds_ib_init(void) +{ + int ret; + + INIT_LIST_HEAD(&rds_ib_devices); + + ret = rds_ib_mr_init(); + if (ret) + goto out; + + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out_mr_exit; + + ret = rds_ib_sysctl_init(); + if (ret) + goto out_ibreg; + + ret = rds_ib_recv_init(); + if (ret) + goto out_sysctl; + + rds_trans_register(&rds_ib_transport); + + rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + + goto out; + +out_sysctl: + rds_ib_sysctl_exit(); +out_ibreg: + rds_ib_unregister_client(); +out_mr_exit: + rds_ib_mr_exit(); +out: + return ret; +} + +MODULE_LICENSE("GPL"); + diff --git a/net/rds/ib.h b/net/rds/ib.h new file mode 100644 index 0000000..a6f4d7d --- /dev/null +++ b/net/rds/ib.h @@ -0,0 +1,434 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RDS_IB_H +#define _RDS_IB_H + +#include +#include +#include +#include +#include +#include "rds.h" +#include "rdma_transport.h" + +#define RDS_IB_MAX_SGE 8 +#define RDS_IB_RECV_SGE 2 + +#define RDS_IB_DEFAULT_RECV_WR 1024 +#define RDS_IB_DEFAULT_SEND_WR 256 +#define RDS_IB_DEFAULT_FR_WR 256 +#define RDS_IB_DEFAULT_FR_INV_WR 256 + +#define RDS_IB_DEFAULT_RETRY_COUNT 1 + +#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ + +#define RDS_IB_RECYCLE_BATCH_COUNT 32 + +#define RDS_IB_WC_MAX 32 + +extern struct rw_semaphore rds_ib_devices_lock; +extern struct list_head rds_ib_devices; + +/* + * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to + * try and minimize the amount of memory tied up both the device and + * socket receive queues. 
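+ * Each fragment is tracked by a struct rds_page_frag with a single-entry
+ * scatterlist; the fragments that make up one incoming message hang off the
+ * ii_frags list of a struct rds_ib_incoming.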
+ */ +struct rds_page_frag { + struct list_head f_item; + struct list_head f_cache_entry; + struct scatterlist f_sg; +}; + +struct rds_ib_incoming { + struct list_head ii_frags; + struct list_head ii_cache_entry; + struct rds_incoming ii_inc; +}; + +struct rds_ib_cache_head { + struct list_head *first; + unsigned long count; +}; + +struct rds_ib_refill_cache { + struct rds_ib_cache_head __percpu *percpu; + struct list_head *xfer; + struct list_head *ready; +}; + +struct rds_ib_connect_private { + /* Add new fields at the end, and don't permute existing fields. */ + __be32 dp_saddr; + __be32 dp_daddr; + u8 dp_protocol_major; + u8 dp_protocol_minor; + __be16 dp_protocol_minor_mask; /* bitmask */ + __be32 dp_reserved1; + __be64 dp_ack_seq; + __be32 dp_credit; /* non-zero enables flow ctl */ +}; + +struct rds_ib_send_work { + void *s_op; + union { + struct ib_send_wr s_wr; + struct ib_rdma_wr s_rdma_wr; + struct ib_atomic_wr s_atomic_wr; + }; + struct ib_sge s_sge[RDS_IB_MAX_SGE]; + unsigned long s_queued; +}; + +struct rds_ib_recv_work { + struct rds_ib_incoming *r_ibinc; + struct rds_page_frag *r_frag; + struct ib_recv_wr r_wr; + struct ib_sge r_sge[2]; +}; + +struct rds_ib_work_ring { + u32 w_nr; + u32 w_alloc_ptr; + u32 w_alloc_ctr; + u32 w_free_ptr; + atomic_t w_free_ctr; +}; + +/* Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. + */ +struct rds_ib_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + + +struct rds_ib_device; + +struct rds_ib_connection { + + struct list_head ib_node; + struct rds_ib_device *rds_ibdev; + struct rds_connection *conn; + + /* alphabet soup, IBTA style */ + struct rdma_cm_id *i_cm_id; + struct ib_pd *i_pd; + struct ib_cq *i_send_cq; + struct ib_cq *i_recv_cq; + struct ib_wc i_send_wc[RDS_IB_WC_MAX]; + struct ib_wc i_recv_wc[RDS_IB_WC_MAX]; + + /* To control the number of wrs from fastreg */ + atomic_t i_fastreg_wrs; + atomic_t i_fastunreg_wrs; + + /* interrupt handling */ + struct tasklet_struct i_send_tasklet; + struct tasklet_struct i_recv_tasklet; + + /* tx */ + struct rds_ib_work_ring i_send_ring; + struct rm_data_op *i_data_op; + struct rds_header *i_send_hdrs; + dma_addr_t i_send_hdrs_dma; + struct rds_ib_send_work *i_sends; + atomic_t i_signaled_sends; + + /* rx */ + struct mutex i_recv_mutex; + struct rds_ib_work_ring i_recv_ring; + struct rds_ib_incoming *i_ibinc; + u32 i_recv_data_rem; + struct rds_header *i_recv_hdrs; + dma_addr_t i_recv_hdrs_dma; + struct rds_ib_recv_work *i_recvs; + u64 i_ack_recv; /* last ACK received */ + struct rds_ib_refill_cache i_cache_incs; + struct rds_ib_refill_cache i_cache_frags; + atomic_t i_cache_allocs; + + /* sending acks */ + unsigned long i_ack_flags; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t i_ack_next; /* next ACK to send */ +#else + spinlock_t i_ack_lock; /* protect i_ack_next */ + u64 i_ack_next; /* next ACK to send */ +#endif + struct rds_header *i_ack; + struct ib_send_wr i_ack_wr; + struct ib_sge i_ack_sge; + dma_addr_t i_ack_dma; + unsigned long i_ack_queued; + + /* Flow control related information + * + * Our algorithm uses a pair variables that we need to access + * atomically - one for the send credits, and one posted + * recv credits we need to transfer to remote. 
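+	 * (Send credits live in the low 16 bits and posted receive credits in
+	 * the high 16 bits; see the IB_GET_*() / IB_SET_*() helpers below.)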
+ * Rather than protect them using a slow spinlock, we put both into + * a single atomic_t and update it using cmpxchg + */ + atomic_t i_credits; + + /* Protocol version specific information */ + unsigned int i_flowctl:1; /* enable/disable flow ctl */ + + /* Batched completions */ + unsigned int i_unsignaled_wrs; + + /* Endpoint role in connection */ + bool i_active_side; + atomic_t i_cq_quiesce; + + /* Send/Recv vectors */ + int i_scq_vector; + int i_rcq_vector; +}; + +/* This assumes that atomic_t is at least 32 bits */ +#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_GET_POST_CREDITS(v) ((v) >> 16) +#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) +#define IB_SET_POST_CREDITS(v) ((v) << 16) + +struct rds_ib_ipaddr { + struct list_head list; + __be32 ipaddr; + struct rcu_head rcu; +}; + +enum { + RDS_IB_MR_8K_POOL, + RDS_IB_MR_1M_POOL, +}; + +struct rds_ib_device { + struct list_head list; + struct list_head ipaddr_list; + struct list_head conn_list; + struct ib_device *dev; + struct ib_pd *pd; + bool use_fastreg; + + unsigned int max_mrs; + struct rds_ib_mr_pool *mr_1m_pool; + struct rds_ib_mr_pool *mr_8k_pool; + unsigned int fmr_max_remaps; + unsigned int max_8k_mrs; + unsigned int max_1m_mrs; + int max_sge; + unsigned int max_wrs; + unsigned int max_initiator_depth; + unsigned int max_responder_resources; + spinlock_t spinlock; /* protect the above */ + refcount_t refcount; + struct work_struct free_work; + int *vector_load; +}; + +#define ibdev_to_node(ibdev) dev_to_node((ibdev)->dev.parent) +#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) + +/* bits for i_ack_flags */ +#define IB_ACK_IN_FLIGHT 0 +#define IB_ACK_REQUESTED 1 + +/* Magic WR_ID for ACKs */ +#define RDS_IB_ACK_WR_ID (~(u64) 0) + +struct rds_ib_statistics { + uint64_t s_ib_connect_raced; + uint64_t s_ib_listen_closed_stale; + uint64_t s_ib_evt_handler_call; + uint64_t s_ib_tasklet_call; + uint64_t s_ib_tx_cq_event; + uint64_t s_ib_tx_ring_full; + uint64_t s_ib_tx_throttle; + uint64_t s_ib_tx_sg_mapping_failure; + uint64_t s_ib_tx_stalled; + uint64_t s_ib_tx_credit_updates; + uint64_t s_ib_rx_cq_event; + uint64_t s_ib_rx_ring_empty; + uint64_t s_ib_rx_refill_from_cq; + uint64_t s_ib_rx_refill_from_thread; + uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_total_frags; + uint64_t s_ib_rx_total_incs; + uint64_t s_ib_rx_credit_updates; + uint64_t s_ib_ack_sent; + uint64_t s_ib_ack_send_failure; + uint64_t s_ib_ack_send_delayed; + uint64_t s_ib_ack_send_piggybacked; + uint64_t s_ib_ack_received; + uint64_t s_ib_rdma_mr_8k_alloc; + uint64_t s_ib_rdma_mr_8k_free; + uint64_t s_ib_rdma_mr_8k_used; + uint64_t s_ib_rdma_mr_8k_pool_flush; + uint64_t s_ib_rdma_mr_8k_pool_wait; + uint64_t s_ib_rdma_mr_8k_pool_depleted; + uint64_t s_ib_rdma_mr_1m_alloc; + uint64_t s_ib_rdma_mr_1m_free; + uint64_t s_ib_rdma_mr_1m_used; + uint64_t s_ib_rdma_mr_1m_pool_flush; + uint64_t s_ib_rdma_mr_1m_pool_wait; + uint64_t s_ib_rdma_mr_1m_pool_depleted; + uint64_t s_ib_rdma_mr_8k_reused; + uint64_t s_ib_rdma_mr_1m_reused; + uint64_t s_ib_atomic_cswp; + uint64_t s_ib_atomic_fadd; + uint64_t s_ib_recv_added_to_cache; + uint64_t s_ib_recv_removed_from_cache; +}; + +extern struct workqueue_struct *rds_ib_wq; + +/* + * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h + * doesn't define it. 
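+ * The wrappers below simply walk the scatterlist and sync each mapped
+ * segment individually via ib_dma_sync_single_for_{cpu,device}().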
+ */ +static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, + struct scatterlist *sglist, + unsigned int sg_dma_len, + int direction) +{ + struct scatterlist *sg; + unsigned int i; + + for_each_sg(sglist, sg, sg_dma_len, i) { + ib_dma_sync_single_for_cpu(dev, + ib_sg_dma_address(dev, sg), + ib_sg_dma_len(dev, sg), + direction); + } +} +#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu + +static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, + struct scatterlist *sglist, + unsigned int sg_dma_len, + int direction) +{ + struct scatterlist *sg; + unsigned int i; + + for_each_sg(sglist, sg, sg_dma_len, i) { + ib_dma_sync_single_for_device(dev, + ib_sg_dma_address(dev, sg), + ib_sg_dma_len(dev, sg), + direction); + } +} +#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device + + +/* ib.c */ +extern struct rds_transport rds_ib_transport; +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); +extern struct ib_client rds_ib_client; + +extern unsigned int rds_ib_retry_count; + +extern spinlock_t ib_nodev_conns_lock; +extern struct list_head ib_nodev_conns; + +/* ib_cm.c */ +int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); +void rds_ib_conn_free(void *arg); +int rds_ib_conn_path_connect(struct rds_conn_path *cp); +void rds_ib_conn_path_shutdown(struct rds_conn_path *cp); +void rds_ib_state_change(struct sock *sk); +int rds_ib_listen_init(void); +void rds_ib_listen_stop(void); +__printf(2, 3) +void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); +int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); +void rds_ib_cm_connect_complete(struct rds_connection *conn, + struct rdma_cm_event *event); + + +#define rds_ib_conn_error(conn, fmt...) 
\ + __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) + +/* ib_rdma.c */ +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); +void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); +void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); +void rds_ib_destroy_nodev_conns(void); +void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); + +/* ib_recv.c */ +int rds_ib_recv_init(void); +void rds_ib_recv_exit(void); +int rds_ib_recv_path(struct rds_conn_path *conn); +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); +void rds_ib_recv_free_caches(struct rds_ib_connection *ic); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); +void rds_ib_inc_free(struct rds_incoming *inc); +int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc, + struct rds_ib_ack_state *state); +void rds_ib_recv_tasklet_fn(unsigned long data); +void rds_ib_recv_init_ring(struct rds_ib_connection *ic); +void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); +void rds_ib_recv_init_ack(struct rds_ib_connection *ic); +void rds_ib_attempt_ack(struct rds_ib_connection *ic); +void rds_ib_ack_send_complete(struct rds_ib_connection *ic); +u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required); + +/* ib_ring.c */ +void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); +void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr); +u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos); +void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val); +void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val); +int rds_ib_ring_empty(struct rds_ib_work_ring *ring); +int rds_ib_ring_low(struct rds_ib_work_ring *ring); +u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring); +u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); +extern wait_queue_head_t rds_ib_ring_empty_wait; + +/* ib_send.c */ +void rds_ib_xmit_path_complete(struct rds_conn_path *cp); +int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); +void rds_ib_send_init_ring(struct rds_ib_connection *ic); +void rds_ib_send_clear_ring(struct rds_ib_connection *ic); +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); +void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); +void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); +int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, + u32 *adv_credits, int need_posted, int max_posted); +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); + +/* ib_stats.c */ +DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); +#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) +#define rds_ib_stats_add(member, count) \ + rds_stats_add_which(rds_ib_stats, member, count) +unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +/* ib_sysctl.c */ +int rds_ib_sysctl_init(void); +void rds_ib_sysctl_exit(void); +extern unsigned long rds_ib_sysctl_max_send_wr; +extern unsigned long rds_ib_sysctl_max_recv_wr; +extern unsigned long 
rds_ib_sysctl_max_unsig_wrs; +extern unsigned long rds_ib_sysctl_max_unsig_bytes; +extern unsigned long rds_ib_sysctl_max_recv_allocation; +extern unsigned int rds_ib_sysctl_flow_control; + +#endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c new file mode 100644 index 0000000..13b38ad --- /dev/null +++ b/net/rds/ib_cm.c @@ -0,0 +1,1026 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds_single_path.h" +#include "rds.h" +#include "ib.h" + +/* + * Set the selected protocol version + */ +static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version) +{ + conn->c_version = version; +} + +/* + * Set up flow control + */ +static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (rds_ib_sysctl_flow_control && credits != 0) { + /* We're doing flow control */ + ic->i_flowctl = 1; + rds_ib_send_add_credits(conn, credits); + } else { + ic->i_flowctl = 0; + } +} + +/* + * Tune RNR behavior. Without flow control, we use a rather + * low timeout, but not the absolute minimum - this should + * be tunable. + * + * We already set the RNR retry count to 7 (which is the + * smallest infinite number :-) above. + * If flow control is off, we want to change this back to 0 + * so that we learn quickly when our credit accounting is + * buggy. + * + * Caller passes in a qp_attr pointer - don't waste stack spacv + * by allocation this twice. + */ +static void +rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) +{ + int ret; + + attr->min_rnr_timer = IB_RNR_TIMER_000_32; + ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); + if (ret) + printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret); +} + +/* + * Connection established. + * We get here for both outgoing and incoming connection. 
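+ * The private data carried in the cm event, if present, gives us the peer's
+ * protocol version, its flow control credits and the last packet it saw,
+ * which we treat as an implicit ACK.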
+ */ +void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) +{ + const struct rds_ib_connect_private *dp = NULL; + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_qp_attr qp_attr; + int err; + + if (event->param.conn.private_data_len >= sizeof(*dp)) { + dp = event->param.conn.private_data; + + /* make sure it isn't empty data */ + if (dp->dp_protocol_major) { + rds_ib_set_protocol(conn, + RDS_PROTOCOL(dp->dp_protocol_major, + dp->dp_protocol_minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + } + } + + if (conn->c_version < RDS_PROTOCOL(3, 1)) { + pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); + set_bit(RDS_DESTROY_PENDING, &conn->c_path[0].cp_flags); + rds_conn_destroy(conn); + return; + } else { + pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n", + ic->i_active_side ? "Active" : "Passive", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + } + + atomic_set(&ic->i_cq_quiesce, 0); + + /* Init rings and fill recv. this needs to wait until protocol + * negotiation is complete, since ring layout is different + * from 3.1 to 4.1. + */ + rds_ib_send_init_ring(ic); + rds_ib_recv_init_ring(ic); + /* Post receive buffers - as a side effect, this will update + * the posted credit count. */ + rds_ib_recv_refill(conn, 1, GFP_KERNEL); + + /* Tune RNR behavior */ + rds_ib_tune_rnr(ic, &qp_attr); + + qp_attr.qp_state = IB_QPS_RTS; + err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); + if (err) + printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); + + /* update ib_device with this local ipaddr */ + err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); + if (err) + printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", + err); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp) { + /* dp structure start is not guaranteed to be 8 bytes aligned. + * Since dp_ack_seq is 64-bit extended load operations can be + * used so go through get_unaligned to avoid unaligned errors. 
+ */ + __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq); + + if (dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq), + NULL); + } + + rds_connect_complete(conn); +} + +static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, + struct rdma_conn_param *conn_param, + struct rds_ib_connect_private *dp, + u32 protocol_version, + u32 max_responder_resources, + u32 max_initiator_depth) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + + memset(conn_param, 0, sizeof(struct rdma_conn_param)); + + conn_param->responder_resources = + min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); + conn_param->initiator_depth = + min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth); + conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); + conn_param->rnr_retry_count = 7; + + if (dp) { + memset(dp, 0, sizeof(*dp)); + dp->dp_saddr = conn->c_laddr; + dp->dp_daddr = conn->c_faddr; + dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); + dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); + dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic)); + + /* Advertise flow control */ + if (ic->i_flowctl) { + unsigned int credits; + + credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); + dp->dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + } + + conn_param->private_data = dp; + conn_param->private_data_len = sizeof(*dp); + } +} + +static void rds_ib_cq_event_handler(struct ib_event *event, void *data) +{ + rdsdebug("event %u (%s) data %p\n", + event->event, ib_event_msg(event->event), data); +} + +/* Plucking the oldest entry from the ring can be done concurrently with + * the thread refilling the ring. Each ring operation is protected by + * spinlocks and the transient state of refilling doesn't change the + * recording of which entry is oldest. + * + * This relies on IB only calling one cq comp_handler for each cq so that + * there will only be one caller of rds_recv_incoming() per RDS connection. 
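+ *
+ * The comp_handler below only schedules i_recv_tasklet; the actual CQ
+ * polling and receive processing happen in rds_ib_tasklet_fn_recv().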
+ */ +static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_evt_handler_call); + + tasklet_schedule(&ic->i_recv_tasklet); +} + +static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs) +{ + int nr, i; + struct ib_wc *wc; + + while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { + for (i = 0; i < nr; i++) { + wc = wcs + i; + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + wc->byte_len, be32_to_cpu(wc->ex.imm_data)); + + if (wc->wr_id <= ic->i_send_ring.w_nr || + wc->wr_id == RDS_IB_ACK_WR_ID) + rds_ib_send_cqe_handler(ic, wc); + else + rds_ib_mr_cqe_handler(ic, wc); + + } + } +} + +static void rds_ib_tasklet_fn_send(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *)data; + struct rds_connection *conn = ic->conn; + + rds_ib_stats_inc(s_ib_tasklet_call); + + /* if cq has been already reaped, ignore incoming cq event */ + if (atomic_read(&ic->i_cq_quiesce)) + return; + + poll_scq(ic, ic->i_send_cq, ic->i_send_wc); + ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + poll_scq(ic, ic->i_send_cq, ic->i_send_wc); + + if (rds_conn_up(conn) && + (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued))) + rds_send_xmit(&ic->conn->c_path[0]); +} + +static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs, + struct rds_ib_ack_state *ack_state) +{ + int nr, i; + struct ib_wc *wc; + + while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { + for (i = 0; i < nr; i++) { + wc = wcs + i; + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + wc->byte_len, be32_to_cpu(wc->ex.imm_data)); + + rds_ib_recv_cqe_handler(ic, wc, ack_state); + } + } +} + +static void rds_ib_tasklet_fn_recv(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *)data; + struct rds_connection *conn = ic->conn; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + struct rds_ib_ack_state state; + + if (!rds_ibdev) + rds_conn_drop(conn); + + rds_ib_stats_inc(s_ib_tasklet_call); + + /* if cq has been already reaped, ignore incoming cq event */ + if (atomic_read(&ic->i_cq_quiesce)) + return; + + memset(&state, 0, sizeof(state)); + poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + + if (state.ack_next_valid) + rds_ib_set_ack(ic, state.ack_next, state.ack_required); + if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, state.ack_recv, NULL); + ic->i_ack_recv = state.ack_recv; + } + + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); +} + +static void rds_ib_qp_event_handler(struct ib_event *event, void *data) +{ + struct rds_connection *conn = data; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, + ib_event_msg(event->event)); + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); + break; + default: + rdsdebug("Fatal QP Event %u (%s) " + "- connection %pI4->%pI4, reconnecting\n", + event->event, ib_event_msg(event->event), + &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); + break; + } +} + +static void 
rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_evt_handler_call); + + tasklet_schedule(&ic->i_send_tasklet); +} + +static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev) +{ + int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1]; + int index = rds_ibdev->dev->num_comp_vectors - 1; + int i; + + for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) { + if (rds_ibdev->vector_load[i] < min) { + index = i; + min = rds_ibdev->vector_load[i]; + } + } + + rds_ibdev->vector_load[index]++; + return index; +} + +static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index) +{ + rds_ibdev->vector_load[index]--; +} + +/* + * This needs to be very careful to not leave IS_ERR pointers around for + * cleanup to trip over. + */ +static int rds_ib_setup_qp(struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct ib_qp_init_attr attr; + struct ib_cq_init_attr cq_attr = {}; + struct rds_ib_device *rds_ibdev; + int ret, fr_queue_space; + + /* + * It's normal to see a null device if an incoming connection races + * with device removal, so we don't print a warning. + */ + rds_ibdev = rds_ib_get_client_data(dev); + if (!rds_ibdev) + return -EOPNOTSUPP; + + /* The fr_queue_space is currently set to 512, to add extra space on + * completion queue and send queue. This extra space is used for FRMR + * registration and invalidation work requests + */ + fr_queue_space = rds_ibdev->use_fastreg ? + (RDS_IB_DEFAULT_FR_WR + 1) + + (RDS_IB_DEFAULT_FR_INV_WR + 1) + : 0; + + /* add the conn now so that connection establishment has the dev */ + rds_ib_add_conn(rds_ibdev, conn); + + if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) + rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); + if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) + rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); + + /* Protection domain and memory range */ + ic->i_pd = rds_ibdev->pd; + + ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev); + cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1; + cq_attr.comp_vector = ic->i_scq_vector; + ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, + rds_ib_cq_event_handler, conn, + &cq_attr); + if (IS_ERR(ic->i_send_cq)) { + ret = PTR_ERR(ic->i_send_cq); + ic->i_send_cq = NULL; + ibdev_put_vector(rds_ibdev, ic->i_scq_vector); + rdsdebug("ib_create_cq send failed: %d\n", ret); + goto rds_ibdev_out; + } + + ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev); + cq_attr.cqe = ic->i_recv_ring.w_nr; + cq_attr.comp_vector = ic->i_rcq_vector; + ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, + rds_ib_cq_event_handler, conn, + &cq_attr); + if (IS_ERR(ic->i_recv_cq)) { + ret = PTR_ERR(ic->i_recv_cq); + ic->i_recv_cq = NULL; + ibdev_put_vector(rds_ibdev, ic->i_rcq_vector); + rdsdebug("ib_create_cq recv failed: %d\n", ret); + goto send_cq_out; + } + + ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + if (ret) { + rdsdebug("ib_req_notify_cq send failed: %d\n", ret); + goto recv_cq_out; + } + + ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + if (ret) { + rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); + goto recv_cq_out; + } + + /* XXX negotiate max send/recv with remote? 
*/ + memset(&attr, 0, sizeof(attr)); + attr.event_handler = rds_ib_qp_event_handler; + attr.qp_context = conn; + /* + 1 to allow for the single ack message */ + attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1; + attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; + attr.cap.max_send_sge = rds_ibdev->max_sge; + attr.cap.max_recv_sge = RDS_IB_RECV_SGE; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + attr.send_cq = ic->i_send_cq; + attr.recv_cq = ic->i_recv_cq; + atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); + atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR); + + /* + * XXX this can fail if max_*_wr is too large? Are we supposed + * to back off until we get a value that the hardware can support? + */ + ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); + if (ret) { + rdsdebug("rdma_create_qp failed: %d\n", ret); + goto recv_cq_out; + } + + ic->i_send_hdrs = ib_dma_alloc_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + &ic->i_send_hdrs_dma, GFP_KERNEL); + if (!ic->i_send_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent send failed\n"); + goto qp_out; + } + + ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + &ic->i_recv_hdrs_dma, GFP_KERNEL); + if (!ic->i_recv_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent recv failed\n"); + goto send_hdrs_dma_out; + } + + ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), + &ic->i_ack_dma, GFP_KERNEL); + if (!ic->i_ack) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent ack failed\n"); + goto recv_hdrs_dma_out; + } + + ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), + ibdev_to_node(dev)); + if (!ic->i_sends) { + ret = -ENOMEM; + rdsdebug("send allocation failed\n"); + goto ack_dma_out; + } + + ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), + ibdev_to_node(dev)); + if (!ic->i_recvs) { + ret = -ENOMEM; + rdsdebug("recv allocation failed\n"); + goto sends_out; + } + + rds_ib_recv_init_ack(ic); + + rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd, + ic->i_send_cq, ic->i_recv_cq); + + goto out; + +sends_out: + vfree(ic->i_sends); +ack_dma_out: + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); +recv_hdrs_dma_out: + ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, ic->i_recv_hdrs_dma); +send_hdrs_dma_out: + ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, ic->i_send_hdrs_dma); +qp_out: + rdma_destroy_qp(ic->i_cm_id); +recv_cq_out: + if (!ib_destroy_cq(ic->i_recv_cq)) + ic->i_recv_cq = NULL; +send_cq_out: + if (!ib_destroy_cq(ic->i_send_cq)) + ic->i_send_cq = NULL; +rds_ibdev_out: + rds_ib_remove_conn(rds_ibdev, conn); +out: + rds_ib_dev_put(rds_ibdev); + + return ret; +} + +static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) +{ + const struct rds_ib_connect_private *dp = event->param.conn.private_data; + u16 common; + u32 version = 0; + + /* + * rdma_cm private data is odd - when there is any private data in the + * request, we will be given a pretty large buffer without telling us the + * original size. The only way to tell the difference is by looking at + * the contents, which are initialized to zero. + * If the protocol version fields aren't set, this is a connection attempt + * from an older version. This could could be 3.0 or 2.0 - we can't tell. 
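+	 * (When the version fields are set, the code below ANDs the peer's
+	 * minor-version bitmask with RDS_IB_SUPPORTED_PROTOCOLS and picks the
+	 * highest minor version both sides support, on top of 3.0.)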
+ * We really should have changed this for OFED 1.3 :-( + */ + + /* Be paranoid. RDS always has privdata */ + if (!event->param.conn.private_data_len) { + printk(KERN_NOTICE "RDS incoming connection has no private data, " + "rejecting\n"); + return 0; + } + + /* Even if len is crap *now* I still want to check it. -ASG */ + if (event->param.conn.private_data_len < sizeof (*dp) || + dp->dp_protocol_major == 0) + return RDS_PROTOCOL_3_0; + + common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; + if (dp->dp_protocol_major == 3 && common) { + version = RDS_PROTOCOL_3_0; + while ((common >>= 1) != 0) + version++; + } else + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", + &dp->dp_saddr, + dp->dp_protocol_major, + dp->dp_protocol_minor); + return version; +} + +int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; + __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; + const struct rds_ib_connect_private *dp = event->param.conn.private_data; + struct rds_ib_connect_private dp_rep; + struct rds_connection *conn = NULL; + struct rds_ib_connection *ic = NULL; + struct rdma_conn_param conn_param; + u32 version; + int err = 1, destroy = 1; + + /* Check whether the remote protocol version matches ours. */ + version = rds_ib_protocol_compatible(event); + if (!version) + goto out; + + rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " + "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, + RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), + (unsigned long long)be64_to_cpu(lguid), + (unsigned long long)be64_to_cpu(fguid)); + + /* RDS/IB is not currently netns aware, thus init_net */ + conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, + &rds_ib_transport, GFP_KERNEL); + if (IS_ERR(conn)) { + rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); + conn = NULL; + goto out; + } + + /* + * The connection request may occur while the + * previous connection exist, e.g. in case of failover. + * But as connections may be initiated simultaneously + * by both hosts, we have a random backoff mechanism - + * see the comment above rds_queue_reconnect() + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + if (rds_conn_state(conn) == RDS_CONN_UP) { + rdsdebug("incoming connect while connecting\n"); + rds_conn_drop(conn); + rds_ib_stats_inc(s_ib_listen_closed_stale); + } else + if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + /* Wait and see - our connect may still be succeeding */ + rds_ib_stats_inc(s_ib_connect_raced); + } + goto out; + } + + ic = conn->c_transport_data; + + rds_ib_set_protocol(conn, version); + rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + + /* If the peer gave us the last packet it saw, process this as if + * we had received a regular ACK. */ + if (dp->dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + + BUG_ON(cm_id->context); + BUG_ON(ic->i_cm_id); + + ic->i_cm_id = cm_id; + cm_id->context = conn; + + /* We got halfway through setting up the ib_connection, if we + * fail now, we have to take the long route out of this mess. 
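+	 * Returning destroy == 0 keeps the rdma_cm from tearing down a cm_id
+	 * that the connection now owns; rds_ib_conn_path_shutdown() destroys
+	 * it instead.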
*/ + destroy = 0; + + err = rds_ib_setup_qp(conn); + if (err) { + rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); + goto out; + } + + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, + event->param.conn.responder_resources, + event->param.conn.initiator_depth); + + /* rdma_accept() calls rdma_reject() internally if it fails */ + if (rdma_accept(cm_id, &conn_param)) + rds_ib_conn_error(conn, "rdma_accept failed\n"); + +out: + if (conn) + mutex_unlock(&conn->c_cm_lock); + if (err) + rdma_reject(cm_id, NULL, 0); + return destroy; +} + + +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) +{ + struct rds_connection *conn = cm_id->context; + struct rds_ib_connection *ic = conn->c_transport_data; + struct rdma_conn_param conn_param; + struct rds_ib_connect_private dp; + int ret; + + /* If the peer doesn't do protocol negotiation, we must + * default to RDSv3.0 */ + rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); + ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ + + ret = rds_ib_setup_qp(conn); + if (ret) { + rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret); + goto out; + } + + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, + UINT_MAX, UINT_MAX); + ret = rdma_connect(cm_id, &conn_param); + if (ret) + rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); + +out: + /* Beware - returning non-zero tells the rdma_cm to destroy + * the cm_id. We should certainly not do it as long as we still + * "own" the cm_id. */ + if (ret) { + if (ic->i_cm_id == cm_id) + ret = 0; + } + ic->i_active_side = true; + return ret; +} + +int rds_ib_conn_path_connect(struct rds_conn_path *cp) +{ + struct rds_connection *conn = cp->cp_conn; + struct rds_ib_connection *ic = conn->c_transport_data; + struct sockaddr_in src, dest; + int ret; + + /* XXX I wonder what affect the port space has */ + /* delegate cm event handler to rdma_transport */ + ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ic->i_cm_id)) { + ret = PTR_ERR(ic->i_cm_id); + ic->i_cm_id = NULL; + rdsdebug("rdma_create_id() failed: %d\n", ret); + goto out; + } + + rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_PORT); + + ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, + (struct sockaddr *)&dest, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (ret) { + rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, + ret); + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; + } + +out: + return ret; +} + +/* + * This is so careful about only cleaning up resources that were built up + * so that it can be called at any point during startup. In fact it + * can be called multiple times for a given connection. + */ +void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) +{ + struct rds_connection *conn = cp->cp_conn; + struct rds_ib_connection *ic = conn->c_transport_data; + int err = 0; + + rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, + ic->i_pd, ic->i_send_cq, ic->i_recv_cq, + ic->i_cm_id ? 
ic->i_cm_id->qp : NULL); + + if (ic->i_cm_id) { + struct ib_device *dev = ic->i_cm_id->device; + + rdsdebug("disconnecting cm %p\n", ic->i_cm_id); + err = rdma_disconnect(ic->i_cm_id); + if (err) { + /* Actually this may happen quite frequently, when + * an outgoing connect raced with an incoming connect. + */ + rdsdebug("failed to disconnect, cm: %p err %d\n", + ic->i_cm_id, err); + } + + /* + * We want to wait for tx and rx completion to finish + * before we tear down the connection, but we have to be + * careful not to get stuck waiting on a send ring that + * only has unsignaled sends in it. We've shutdown new + * sends before getting here so by waiting for signaled + * sends to complete we're ensured that there will be no + * more tx processing. + */ + wait_event(rds_ib_ring_empty_wait, + rds_ib_ring_empty(&ic->i_recv_ring) && + (atomic_read(&ic->i_signaled_sends) == 0) && + (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) && + (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR)); + tasklet_kill(&ic->i_send_tasklet); + tasklet_kill(&ic->i_recv_tasklet); + + atomic_set(&ic->i_cq_quiesce, 1); + + /* first destroy the ib state that generates callbacks */ + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) { + if (ic->rds_ibdev) + ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector); + ib_destroy_cq(ic->i_send_cq); + } + + if (ic->i_recv_cq) { + if (ic->rds_ibdev) + ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector); + ib_destroy_cq(ic->i_recv_cq); + } + + /* then free the resources that ib callbacks use */ + if (ic->i_send_hdrs) + ib_dma_free_coherent(dev, + ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, + ic->i_send_hdrs_dma); + + if (ic->i_recv_hdrs) + ib_dma_free_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, + ic->i_recv_hdrs_dma); + + if (ic->i_ack) + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); + + if (ic->i_sends) + rds_ib_send_clear_ring(ic); + if (ic->i_recvs) + rds_ib_recv_clear_ring(ic); + + rdma_destroy_id(ic->i_cm_id); + + /* + * Move connection back to the nodev list. + */ + if (ic->rds_ibdev) + rds_ib_remove_conn(ic->rds_ibdev, conn); + + ic->i_cm_id = NULL; + ic->i_pd = NULL; + ic->i_send_cq = NULL; + ic->i_recv_cq = NULL; + ic->i_send_hdrs = NULL; + ic->i_recv_hdrs = NULL; + ic->i_ack = NULL; + } + BUG_ON(ic->rds_ibdev); + + /* Clear pending transmit */ + if (ic->i_data_op) { + struct rds_message *rm; + + rm = container_of(ic->i_data_op, struct rds_message, data); + rds_message_put(rm); + ic->i_data_op = NULL; + } + + /* Clear the ACK state */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_set(&ic->i_ack_next, 0); +#else + ic->i_ack_next = 0; +#endif + ic->i_ack_recv = 0; + + /* Clear flow control state */ + ic->i_flowctl = 0; + atomic_set(&ic->i_credits, 0); + + rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); + rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + + if (ic->i_ibinc) { + rds_inc_put(&ic->i_ibinc->ii_inc); + ic->i_ibinc = NULL; + } + + vfree(ic->i_sends); + ic->i_sends = NULL; + vfree(ic->i_recvs); + ic->i_recvs = NULL; + ic->i_active_side = false; +} + +int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_ib_connection *ic; + unsigned long flags; + int ret; + + /* XXX too lazy? 
*/ + ic = kzalloc(sizeof(struct rds_ib_connection), gfp); + if (!ic) + return -ENOMEM; + + ret = rds_ib_recv_alloc_caches(ic); + if (ret) { + kfree(ic); + return ret; + } + + INIT_LIST_HEAD(&ic->ib_node); + tasklet_init(&ic->i_send_tasklet, rds_ib_tasklet_fn_send, + (unsigned long)ic); + tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv, + (unsigned long)ic); + mutex_init(&ic->i_recv_mutex); +#ifndef KERNEL_HAS_ATOMIC64 + spin_lock_init(&ic->i_ack_lock); +#endif + atomic_set(&ic->i_signaled_sends, 0); + + /* + * rds_ib_conn_shutdown() waits for these to be emptied so they + * must be initialized before it can be called. + */ + rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); + rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + + ic->conn = conn; + conn->c_transport_data = ic; + + spin_lock_irqsave(&ib_nodev_conns_lock, flags); + list_add_tail(&ic->ib_node, &ib_nodev_conns); + spin_unlock_irqrestore(&ib_nodev_conns_lock, flags); + + + rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); + return 0; +} + +/* + * Free a connection. Connection must be shut down and not set for reconnect. + */ +void rds_ib_conn_free(void *arg) +{ + struct rds_ib_connection *ic = arg; + spinlock_t *lock_ptr; + + rdsdebug("ic %p\n", ic); + + /* + * Conn is either on a dev's list or on the nodev list. + * A race with shutdown() or connect() would cause problems + * (since rds_ibdev would change) but that should never happen. + */ + lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock; + + spin_lock_irq(lock_ptr); + list_del(&ic->ib_node); + spin_unlock_irq(lock_ptr); + + rds_ib_recv_free_caches(ic); + + kfree(ic); +} + + +/* + * An error occurred on the connection + */ +void +__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...) +{ + va_list ap; + + rds_conn_drop(conn); + + va_start(ap, fmt); + vprintk(fmt, ap); + va_end(ap); +} diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c new file mode 100644 index 0000000..e0f70c4 --- /dev/null +++ b/net/rds/ib_fmr.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ib_mr.h" + +struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages) +{ + struct rds_ib_mr_pool *pool; + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_fmr *fmr; + int err = 0; + + if (npages <= RDS_MR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + + ibmr = rds_ib_try_reuse_ibmr(pool); + if (ibmr) + return ibmr; + + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, + rdsibdev_to_node(rds_ibdev)); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + fmr = &ibmr->u.fmr; + fmr->fmr = ib_alloc_fmr(rds_ibdev->pd, + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC), + &pool->fmr_attr); + if (IS_ERR(fmr->fmr)) { + err = PTR_ERR(fmr->fmr); + fmr->fmr = NULL; + pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err); + goto out_no_cigar; + } + + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + + return ibmr; + +out_no_cigar: + kfree(ibmr); + atomic_dec(&pool->item_count); + + return ERR_PTR(err); +} + +static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_mr *ibmr, struct scatterlist *sg, + unsigned int nents) +{ + struct ib_device *dev = rds_ibdev->dev; + struct rds_ib_fmr *fmr = &ibmr->u.fmr; + struct scatterlist *scat = sg; + u64 io_addr = 0; + u64 *dma_pages; + u32 len; + int page_cnt, sg_dma_len; + int i, j; + int ret; + + sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL); + if (unlikely(!sg_dma_len)) { + pr_warn("RDS/IB: %s failed!\n", __func__); + return -EBUSY; + } + + len = 0; + page_cnt = 0; + + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + if (dma_addr & ~PAGE_MASK) { + if (i > 0) { + ib_dma_unmap_sg(dev, sg, nents, + DMA_BIDIRECTIONAL); + return -EINVAL; + } else { + ++page_cnt; + } + } + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < sg_dma_len - 1) { + ib_dma_unmap_sg(dev, sg, nents, + DMA_BIDIRECTIONAL); + return -EINVAL; + } else { + ++page_cnt; + } + } + + len += dma_len; + } + + page_cnt += len >> PAGE_SHIFT; + if (page_cnt > ibmr->pool->fmr_attr.max_pages) { + ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); + return -EINVAL; + } + + dma_pages = kmalloc_array_node(sizeof(u64), page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); + if (!dma_pages) { + ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); + return -ENOMEM; + } + + page_cnt = 0; + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + for (j = 0; j < dma_len; j += PAGE_SIZE) + dma_pages[page_cnt++] = + (dma_addr & PAGE_MASK) + j; + } + + ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr); + if (ret) { + ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); + goto out; + } + + /* Success - we successfully remapped the MR, so we can + * safely tear down the old mapping. 
+ */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = scat; + ibmr->sg_len = nents; + ibmr->sg_dma_len = sg_dma_len; + ibmr->remap_count++; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); + ret = 0; + +out: + kfree(dma_pages); + + return ret; +} + +struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev, + struct scatterlist *sg, + unsigned long nents, + u32 *key) +{ + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_fmr *fmr; + int ret; + + ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); + if (IS_ERR(ibmr)) + return ibmr; + + ibmr->device = rds_ibdev; + fmr = &ibmr->u.fmr; + ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); + if (ret == 0) + *key = fmr->fmr->rkey; + else + rds_ib_free_mr(ibmr, 0); + + return ibmr; +} + +void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal) +{ + struct rds_ib_mr *ibmr, *next; + struct rds_ib_fmr *fmr; + LIST_HEAD(fmr_list); + int ret = 0; + unsigned int freed = *nfreed; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, list, unmap_list) { + fmr = &ibmr->u.fmr; + list_add(&fmr->fmr->list, &fmr_list); + } + + ret = ib_unmap_fmr(&fmr_list); + if (ret) + pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, list, unmap_list) { + fmr = &ibmr->u.fmr; + *unpinned += ibmr->sg_len; + __rds_ib_teardown_mr(ibmr); + if (freed < goal || + ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) { + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); + list_del(&ibmr->unmap_list); + ib_dealloc_fmr(fmr->fmr); + kfree(ibmr); + freed++; + } + } + *nfreed = freed; +} + +void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + + if (ibmr->remap_count >= pool->fmr_attr.max_maps) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); +} diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c new file mode 100644 index 0000000..48332a6 --- /dev/null +++ b/net/rds/ib_frmr.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ib_mr.h" + +static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, + int npages) +{ + struct rds_ib_mr_pool *pool; + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_frmr *frmr; + int err = 0; + + if (npages <= RDS_MR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + + ibmr = rds_ib_try_reuse_ibmr(pool); + if (ibmr) + return ibmr; + + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, + rdsibdev_to_node(rds_ibdev)); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + frmr = &ibmr->u.frmr; + frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG, + pool->fmr_attr.max_pages); + if (IS_ERR(frmr->mr)) { + err = PTR_ERR(frmr->mr); + frmr->mr = NULL; + pr_warn("RDS/IB: %s failed to allocate MR (err=%d)\n", __func__, err); + goto out_no_cigar; + } + + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + + if (atomic_read(&pool->item_count) > pool->max_items_soft) + pool->max_items_soft = pool->max_items; + + frmr->fr_state = FRMR_IS_FREE; + return ibmr; + +out_no_cigar: + kfree(ibmr); + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + + if (drop) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); + atomic_add(ibmr->sg_len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 5) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); +} + +static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) +{ + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + struct ib_send_wr *failed_wr; + struct ib_reg_wr reg_wr; + int ret, off = 0; + + while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastreg_wrs); + cpu_relax(); + } + + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, + &off, PAGE_SIZE); + if (unlikely(ret != ibmr->sg_len)) + return ret < 0 ? ret : -EINVAL; + + /* Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. The key used is a rolling 8bit + * counter, which should guarantee uniqueness.
+ */ + ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++); + frmr->fr_state = FRMR_IS_INUSE; + + memset(®_wr, 0, sizeof(reg_wr)); + reg_wr.wr.wr_id = (unsigned long)(void *)ibmr; + reg_wr.wr.opcode = IB_WR_REG_MR; + reg_wr.wr.num_sge = 0; + reg_wr.mr = frmr->mr; + reg_wr.key = frmr->mr->rkey; + reg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + reg_wr.wr.send_flags = IB_SEND_SIGNALED; + + failed_wr = ®_wr.wr; + ret = ib_post_send(ibmr->ic->i_cm_id->qp, ®_wr.wr, &failed_wr); + WARN_ON(failed_wr != ®_wr.wr); + if (unlikely(ret)) { + /* Failure here can be because of -ENOMEM as well */ + frmr->fr_state = FRMR_IS_STALE; + atomic_inc(&ibmr->ic->i_fastreg_wrs); + if (printk_ratelimit()) + pr_warn("RDS/IB: %s returned error(%d)\n", + __func__, ret); + } + return ret; +} + +static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_mr_pool *pool, + struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int sg_len) +{ + struct ib_device *dev = rds_ibdev->dev; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + int i; + u32 len; + int ret = 0; + + /* We want to teardown old ibmr values here and fill it up with + * new sg values + */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = sg; + ibmr->sg_len = sg_len; + ibmr->sg_dma_len = 0; + frmr->sg_byte_len = 0; + WARN_ON(ibmr->sg_dma_len); + ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + if (unlikely(!ibmr->sg_dma_len)) { + pr_warn("RDS/IB: %s failed!\n", __func__); + return -EBUSY; + } + + frmr->sg_byte_len = 0; + frmr->dma_npages = 0; + len = 0; + + ret = -EINVAL; + for (i = 0; i < ibmr->sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]); + u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]); + + frmr->sg_byte_len += dma_len; + if (dma_addr & ~PAGE_MASK) { + if (i > 0) + goto out_unmap; + else + ++frmr->dma_npages; + } + + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < ibmr->sg_dma_len - 1) + goto out_unmap; + else + ++frmr->dma_npages; + } + + len += dma_len; + } + frmr->dma_npages += len >> PAGE_SHIFT; + + if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) { + ret = -EMSGSIZE; + goto out_unmap; + } + + ret = rds_ib_post_reg_frmr(ibmr); + if (ret) + goto out_unmap; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); + + return ret; + +out_unmap: + ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + ibmr->sg_dma_len = 0; + return ret; +} + +static int rds_ib_post_inv(struct rds_ib_mr *ibmr) +{ + struct ib_send_wr *s_wr, *failed_wr; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id; + int ret = -EINVAL; + + if (!i_cm_id || !i_cm_id->qp || !frmr->mr) + goto out; + + if (frmr->fr_state != FRMR_IS_INUSE) + goto out; + + while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastunreg_wrs); + cpu_relax(); + } + + frmr->fr_inv = true; + s_wr = &frmr->fr_wr; + + memset(s_wr, 0, sizeof(*s_wr)); + s_wr->wr_id = (unsigned long)(void *)ibmr; + s_wr->opcode = IB_WR_LOCAL_INV; + s_wr->ex.invalidate_rkey = frmr->mr->rkey; + s_wr->send_flags = IB_SEND_SIGNALED; + + failed_wr = s_wr; + ret = ib_post_send(i_cm_id->qp, s_wr, &failed_wr); + WARN_ON(failed_wr != s_wr); + if (unlikely(ret)) { + frmr->fr_state = FRMR_IS_STALE; + frmr->fr_inv = false; + atomic_inc(&ibmr->ic->i_fastunreg_wrs); + pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); + goto 
out; + } +out: + return ret; +} + +void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) +{ + struct rds_ib_mr *ibmr = (void *)(unsigned long)wc->wr_id; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + + if (wc->status != IB_WC_SUCCESS) { + frmr->fr_state = FRMR_IS_STALE; + if (rds_conn_up(ic->conn)) + rds_ib_conn_error(ic->conn, + "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n", + &ic->conn->c_laddr, + &ic->conn->c_faddr, + wc->status, + ib_wc_status_msg(wc->status), + wc->vendor_err); + } + + if (frmr->fr_inv) { + frmr->fr_state = FRMR_IS_FREE; + frmr->fr_inv = false; + atomic_inc(&ic->i_fastreg_wrs); + } else { + atomic_inc(&ic->i_fastunreg_wrs); + } +} + +void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal) +{ + struct rds_ib_mr *ibmr, *next; + struct rds_ib_frmr *frmr; + int ret = 0; + unsigned int freed = *nfreed; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, list, unmap_list) { + if (ibmr->sg_dma_len) + ret |= rds_ib_post_inv(ibmr); + } + if (ret) + pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, list, unmap_list) { + *unpinned += ibmr->sg_len; + frmr = &ibmr->u.frmr; + __rds_ib_teardown_mr(ibmr); + if (freed < goal || frmr->fr_state == FRMR_IS_STALE) { + /* Don't de-allocate if the MR is not free yet */ + if (frmr->fr_state == FRMR_IS_INUSE) + continue; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); + list_del(&ibmr->unmap_list); + if (frmr->mr) + ib_dereg_mr(frmr->mr); + kfree(ibmr); + freed++; + } + } + *nfreed = freed; +} + +struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_connection *ic, + struct scatterlist *sg, + unsigned long nents, u32 *key) +{ + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_frmr *frmr; + int ret; + + do { + if (ibmr) + rds_ib_free_frmr(ibmr, true); + ibmr = rds_ib_alloc_frmr(rds_ibdev, nents); + if (IS_ERR(ibmr)) + return ibmr; + frmr = &ibmr->u.frmr; + } while (frmr->fr_state != FRMR_IS_FREE); + + ibmr->ic = ic; + ibmr->device = rds_ibdev; + ret = rds_ib_map_frmr(rds_ibdev, ibmr->pool, ibmr, sg, nents); + if (ret == 0) { + *key = frmr->mr->rkey; + } else { + rds_ib_free_frmr(ibmr, false); + ibmr = ERR_PTR(ret); + } + + return ibmr; +} + +void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + + if (frmr->fr_state == FRMR_IS_STALE) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); +} diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h new file mode 100644 index 0000000..0ea4ab0 --- /dev/null +++ b/net/rds/ib_mr.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _RDS_IB_MR_H +#define _RDS_IB_MR_H + +#include + +#include "rds.h" +#include "ib.h" + +#define RDS_MR_1M_POOL_SIZE (8192 / 2) +#define RDS_MR_1M_MSG_SIZE 256 +#define RDS_MR_8K_MSG_SIZE 2 +#define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1)) +#define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) + +struct rds_ib_fmr { + struct ib_fmr *fmr; +}; + +enum rds_ib_fr_state { + FRMR_IS_FREE, /* mr invalidated & ready for use */ + FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */ + FRMR_IS_STALE, /* Stale MR and needs to be dropped */ +}; + +struct rds_ib_frmr { + struct ib_mr *mr; + enum rds_ib_fr_state fr_state; + bool fr_inv; + struct ib_send_wr fr_wr; + unsigned int dma_npages; + unsigned int sg_byte_len; +}; + +/* This is stored as mr->r_trans_private. 
*/ +struct rds_ib_mr { + struct rds_ib_device *device; + struct rds_ib_mr_pool *pool; + struct rds_ib_connection *ic; + + struct llist_node llnode; + + /* unmap_list is for freeing */ + struct list_head unmap_list; + unsigned int remap_count; + + struct scatterlist *sg; + unsigned int sg_len; + int sg_dma_len; + + union { + struct rds_ib_fmr fmr; + struct rds_ib_frmr frmr; + } u; +}; + +/* Our own little MR pool */ +struct rds_ib_mr_pool { + unsigned int pool_type; + struct mutex flush_lock; /* serialize fmr invalidate */ + struct delayed_work flush_worker; /* flush worker */ + + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # dirty of MRs */ + + struct llist_head drop_list; /* MRs not reached max_maps */ + struct llist_head free_list; /* unused MRs */ + struct llist_head clean_list; /* unused & unmapped MRs */ + wait_queue_head_t flush_wait; + + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_items; + unsigned long max_items_soft; + unsigned long max_free_pinned; + struct ib_fmr_attr fmr_attr; + bool use_fastreg; +}; + +extern struct workqueue_struct *rds_ib_mr_wq; +extern bool prefer_frmr; + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, + int npages); +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, + struct rds_info_rdma_connection *iinfo); +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_ib_sync_mr(void *trans_private, int dir); +void rds_ib_free_mr(void *trans_private, int invalidate); +void rds_ib_flush_mrs(void); +int rds_ib_mr_init(void); +void rds_ib_mr_exit(void); + +void __rds_ib_teardown_mr(struct rds_ib_mr *); +void rds_ib_teardown_mr(struct rds_ib_mr *); +struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int); +struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *); +int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **); +struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *, + unsigned long, u32 *); +struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *); +void rds_ib_unreg_fmr(struct list_head *, unsigned int *, + unsigned long *, unsigned int); +void rds_ib_free_fmr_list(struct rds_ib_mr *); +struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_connection *ic, + struct scatterlist *sg, + unsigned long nents, u32 *key); +void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal); +void rds_ib_free_frmr_list(struct rds_ib_mr *); +#endif diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c new file mode 100644 index 0000000..e678699 --- /dev/null +++ b/net/rds/ib_rdma.c @@ -0,0 +1,635 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds_single_path.h" +#include "ib_mr.h" + +struct workqueue_struct *rds_ib_mr_wq; + +static DEFINE_PER_CPU(unsigned long, clean_list_grace); +#define CLEAN_LIST_BUSY_BIT 0 + +static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_ipaddr *i_ipaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + if (i_ipaddr->ipaddr == ipaddr) { + refcount_inc(&rds_ibdev->refcount); + rcu_read_unlock(); + return rds_ibdev; + } + } + } + rcu_read_unlock(); + + return NULL; +} + +static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_ipaddr *i_ipaddr; + + i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); + if (!i_ipaddr) + return -ENOMEM; + + i_ipaddr->ipaddr = ipaddr; + + spin_lock_irq(&rds_ibdev->spinlock); + list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); + spin_unlock_irq(&rds_ibdev->spinlock); + + return 0; +} + +static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_ipaddr *i_ipaddr; + struct rds_ib_ipaddr *to_free = NULL; + + + spin_lock_irq(&rds_ibdev->spinlock); + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + if (i_ipaddr->ipaddr == ipaddr) { + list_del_rcu(&i_ipaddr->list); + to_free = i_ipaddr; + break; + } + } + spin_unlock_irq(&rds_ibdev->spinlock); + + if (to_free) + kfree_rcu(to_free, rcu); +} + +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +{ + struct rds_ib_device *rds_ibdev_old; + + rds_ibdev_old = rds_ib_get_device(ipaddr); + if (!rds_ibdev_old) + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + + if (rds_ibdev_old != rds_ibdev) { + rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + rds_ib_dev_put(rds_ibdev_old); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + } + rds_ib_dev_put(rds_ibdev_old); + + return 0; +} + +void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* conn was previously on the nodev_conns_list */ + spin_lock_irq(&ib_nodev_conns_lock); + BUG_ON(list_empty(&ib_nodev_conns)); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + + spin_lock(&rds_ibdev->spinlock); + list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); + spin_unlock(&rds_ibdev->spinlock); + spin_unlock_irq(&ib_nodev_conns_lock); + + ic->rds_ibdev = rds_ibdev; + refcount_inc(&rds_ibdev->refcount); +} + +void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + /* place conn on nodev_conns_list */ + spin_lock(&ib_nodev_conns_lock); + + 
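/* Lock ordering note: ib_nodev_conns_lock is taken before the per-device + * spinlock, the same nesting used in rds_ib_add_conn() above, so the add + * and remove paths cannot deadlock against each other. + */ +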
spin_lock_irq(&rds_ibdev->spinlock); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + spin_unlock_irq(&rds_ibdev->spinlock); + + list_add_tail(&ic->ib_node, &ib_nodev_conns); + + spin_unlock(&ib_nodev_conns_lock); + + ic->rds_ibdev = NULL; + rds_ib_dev_put(rds_ibdev); +} + +void rds_ib_destroy_nodev_conns(void) +{ + struct rds_ib_connection *ic, *_ic; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&ib_nodev_conns_lock); + list_splice(&ib_nodev_conns, &tmp_list); + spin_unlock_irq(&ib_nodev_conns_lock); + + list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) + rds_conn_destroy(ic->conn); +} + +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) +{ + struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; + + iinfo->rdma_mr_max = pool_1m->max_items; + iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; +} + +struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) +{ + struct rds_ib_mr *ibmr = NULL; + struct llist_node *ret; + unsigned long *flag; + + preempt_disable(); + flag = this_cpu_ptr(&clean_list_grace); + set_bit(CLEAN_LIST_BUSY_BIT, flag); + ret = llist_del_first(&pool->clean_list); + if (ret) { + ibmr = llist_entry(ret, struct rds_ib_mr, llnode); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_reused); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); + } + + clear_bit(CLEAN_LIST_BUSY_BIT, flag); + preempt_enable(); + return ibmr; +} + +static inline void wait_clean_list_grace(void) +{ + int cpu; + unsigned long *flag; + + for_each_online_cpu(cpu) { + flag = &per_cpu(clean_list_grace, cpu); + while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) + cpu_relax(); + } +} + +void rds_ib_sync_mr(void *trans_private, int direction) +{ + struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_device *rds_ibdev = ibmr->device; + + switch (direction) { + case DMA_FROM_DEVICE: + ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, + ibmr->sg_dma_len, DMA_BIDIRECTIONAL); + break; + case DMA_TO_DEVICE: + ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, + ibmr->sg_dma_len, DMA_BIDIRECTIONAL); + break; + } +} + +void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +{ + struct rds_ib_device *rds_ibdev = ibmr->device; + + if (ibmr->sg_dma_len) { + ib_dma_unmap_sg(rds_ibdev->dev, + ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + ibmr->sg_dma_len = 0; + } + + /* Release the s/g list */ + if (ibmr->sg_len) { + unsigned int i; + + for (i = 0; i < ibmr->sg_len; ++i) { + struct page *page = sg_page(&ibmr->sg[i]); + + /* FIXME we need a way to tell a r/w MR + * from a r/o MR */ + WARN_ON(!page->mapping && irqs_disabled()); + set_page_dirty(page); + put_page(page); + } + kfree(ibmr->sg); + + ibmr->sg = NULL; + ibmr->sg_len = 0; + } +} + +void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +{ + unsigned int pinned = ibmr->sg_len; + + __rds_ib_teardown_mr(ibmr); + if (pinned) { + struct rds_ib_mr_pool *pool = ibmr->pool; + + atomic_sub(pinned, &pool->free_pinned); + } +} + +static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) +{ + unsigned int item_count; + + item_count = atomic_read(&pool->item_count); + if (free_all) + return item_count; + + return 0; +} + +/* + * given an llist of mrs, put them all into the list_head for more processing + */ +static unsigned int llist_append_to_list(struct llist_head *llist, + struct list_head *list) +{ + struct rds_ib_mr *ibmr; + struct llist_node *node; + struct llist_node *next; + unsigned int count = 0; + + node 
= llist_del_all(llist); + while (node) { + next = node->next; + ibmr = llist_entry(node, struct rds_ib_mr, llnode); + list_add_tail(&ibmr->unmap_list, list); + node = next; + count++; + } + return count; +} + +/* + * this takes a list head of mrs and turns it into linked llist nodes + * of clusters. Each cluster has linked llist nodes of + * MR_CLUSTER_SIZE mrs that are ready for reuse. + */ +static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, + struct list_head *list, + struct llist_node **nodes_head, + struct llist_node **nodes_tail) +{ + struct rds_ib_mr *ibmr; + struct llist_node *cur = NULL; + struct llist_node **next = nodes_head; + + list_for_each_entry(ibmr, list, unmap_list) { + cur = &ibmr->llnode; + *next = cur; + next = &cur->next; + } + *next = NULL; + *nodes_tail = cur; +} + +/* + * Flush our pool of MRs. + * At a minimum, all currently unused MRs are unmapped. + * If the number of MRs allocated exceeds the limit, we also try + * to free as many MRs as needed to get back to this limit. + */ +int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, + int free_all, struct rds_ib_mr **ibmr_ret) +{ + struct rds_ib_mr *ibmr; + struct llist_node *clean_nodes; + struct llist_node *clean_tail; + LIST_HEAD(unmap_list); + unsigned long unpinned = 0; + unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; + + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush); + + if (ibmr_ret) { + DEFINE_WAIT(wait); + while (!mutex_trylock(&pool->flush_lock)) { + ibmr = rds_ib_reuse_mr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + + prepare_to_wait(&pool->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (llist_empty(&pool->clean_list)) + schedule(); + + ibmr = rds_ib_reuse_mr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + } + finish_wait(&pool->flush_wait, &wait); + } else + mutex_lock(&pool->flush_lock); + + if (ibmr_ret) { + ibmr = rds_ib_reuse_mr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + goto out; + } + } + + /* Get the list of all MRs to be dropped. Ordering matters - + * we want to put drop_list ahead of free_list. + */ + dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); + dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); + if (free_all) + llist_append_to_list(&pool->clean_list, &unmap_list); + + free_goal = rds_ib_flush_goal(pool, free_all); + + if (list_empty(&unmap_list)) + goto out; + + if (pool->use_fastreg) + rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); + else + rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal); + + if (!list_empty(&unmap_list)) { + /* we have to make sure that none of the things we're about + * to put on the clean list would race with other cpus trying + * to pull items off. The llist would explode if we managed to + * remove something from the clean list and then add it back again + * while another CPU was spinning on that same item in llist_del_first. + * + * This is pretty unlikely, but just in case wait for an llist grace period + * here before adding anything back into the clean list. 
+ */ + wait_clean_list_grace(); + + list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail); + if (ibmr_ret) + *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); + + /* more than one entry in llist nodes */ + if (clean_nodes->next) + llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list); + + } + + atomic_sub(unpinned, &pool->free_pinned); + atomic_sub(dirty_to_clean, &pool->dirty_count); + atomic_sub(nfreed, &pool->item_count); + +out: + mutex_unlock(&pool->flush_lock); + if (waitqueue_active(&pool->flush_wait)) + wake_up(&pool->flush_wait); +out_nolock: + return 0; +} + +struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) +{ + struct rds_ib_mr *ibmr = NULL; + int iter = 0; + + if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); + + while (1) { + ibmr = rds_ib_reuse_mr(pool); + if (ibmr) + return ibmr; + + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. Flush them out. */ + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); + + rds_ib_flush_mr_pool(pool, 0, &ibmr); + if (ibmr) + return ibmr; + } + + return ibmr; +} + +static void rds_ib_mr_pool_flush_worker(struct work_struct *work) +{ + struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); + + rds_ib_flush_mr_pool(pool, 0, NULL); +} + +void rds_ib_free_mr(void *trans_private, int invalidate) +{ + struct rds_ib_mr *ibmr = trans_private; + struct rds_ib_mr_pool *pool = ibmr->pool; + struct rds_ib_device *rds_ibdev = ibmr->device; + + rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); + + /* Return it to the pool's free list */ + if (rds_ibdev->use_fastreg) + rds_ib_free_frmr_list(ibmr); + else + rds_ib_free_fmr_list(ibmr); + + atomic_add(ibmr->sg_len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 5) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); + + if (invalidate) { + if (likely(!in_interrupt())) { + rds_ib_flush_mr_pool(pool, 0, NULL); + } else { + /* We get here if the user created a MR marked + * as use_once and invalidate at the same time. 
+ */ + queue_delayed_work(rds_ib_mr_wq, + &pool->flush_worker, 10); + } + } + + rds_ib_dev_put(rds_ibdev); +} + +void rds_ib_flush_mrs(void) +{ + struct rds_ib_device *rds_ibdev; + + down_read(&rds_ib_devices_lock); + list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { + if (rds_ibdev->mr_8k_pool) + rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL); + + if (rds_ibdev->mr_1m_pool) + rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL); + } + up_read(&rds_ib_devices_lock); +} + +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_connection *ic = rs->rs_conn->c_transport_data; + int ret; + + rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); + if (!rds_ibdev) { + ret = -ENODEV; + goto out; + } + + if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { + ret = -ENODEV; + goto out; + } + + if (rds_ibdev->use_fastreg) + ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); + else + ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret); + if (ibmr) + rds_ibdev = NULL; + + out: + if (!ibmr) + pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); + + if (rds_ibdev) + rds_ib_dev_put(rds_ibdev); + + return ibmr; +} + +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) +{ + cancel_delayed_work_sync(&pool->flush_worker); + rds_ib_flush_mr_pool(pool, 1, NULL); + WARN_ON(atomic_read(&pool->item_count)); + WARN_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, + int pool_type) +{ + struct rds_ib_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->pool_type = pool_type; + init_llist_head(&pool->free_list); + init_llist_head(&pool->drop_list); + init_llist_head(&pool->clean_list); + mutex_init(&pool->flush_lock); + init_waitqueue_head(&pool->flush_wait); + INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + + if (pool_type == RDS_IB_MR_1M_POOL) { + /* +1 allows for unaligned MRs */ + pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; + pool->max_items = rds_ibdev->max_1m_mrs; + } else { + /* pool_type == RDS_IB_MR_8K_POOL */ + pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; + pool->max_items = rds_ibdev->max_8k_mrs; + } + + pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; + pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; + pool->fmr_attr.page_shift = PAGE_SHIFT; + pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; + pool->use_fastreg = rds_ibdev->use_fastreg; + + return pool; +} + +int rds_ib_mr_init(void) +{ + rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0); + if (!rds_ib_mr_wq) + return -ENOMEM; + return 0; +} + +/* By the time this is called all the IB devices should have been torn down and + * had their pools freed. As each pool is freed its work struct is waited on, + * so the pool flushing work queue should be idle by the time we get here. + */ +void rds_ib_mr_exit(void) +{ + destroy_workqueue(rds_ib_mr_wq); +} diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c new file mode 100644 index 0000000..b4e421a --- /dev/null +++ b/net/rds/ib_recv.c @@ -0,0 +1,1071 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds_single_path.h" +#include "rds.h" +#include "ib.h" + +static struct kmem_cache *rds_ib_incoming_slab; +static struct kmem_cache *rds_ib_frag_slab; +static atomic_t rds_ib_allocation = ATOMIC_INIT(0); + +void rds_ib_recv_init_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_recv_work *recv; + u32 i; + + for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { + struct ib_sge *sge; + + recv->r_ibinc = NULL; + recv->r_frag = NULL; + + recv->r_wr.next = NULL; + recv->r_wr.wr_id = i; + recv->r_wr.sg_list = recv->r_sge; + recv->r_wr.num_sge = RDS_IB_RECV_SGE; + + sge = &recv->r_sge[0]; + sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_pd->local_dma_lkey; + + sge = &recv->r_sge[1]; + sge->addr = 0; + sge->length = RDS_FRAG_SIZE; + sge->lkey = ic->i_pd->local_dma_lkey; + } +} + +/* + * The entire 'from' list, including the from element itself, is put on + * to the tail of the 'to' list. 
+ */ +static void list_splice_entire_tail(struct list_head *from, + struct list_head *to) +{ + struct list_head *from_last = from->prev; + + list_splice_tail(from_last, to); + list_add_tail(from_last, to); +} + +static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache) +{ + struct list_head *tmp; + + tmp = xchg(&cache->xfer, NULL); + if (tmp) { + if (cache->ready) + list_splice_entire_tail(tmp, cache->ready); + else + cache->ready = tmp; + } +} + +static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) +{ + struct rds_ib_cache_head *head; + int cpu; + + cache->percpu = alloc_percpu(struct rds_ib_cache_head); + if (!cache->percpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + head->first = NULL; + head->count = 0; + } + cache->xfer = NULL; + cache->ready = NULL; + + return 0; +} + +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) +{ + int ret; + + ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); + if (!ret) { + ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); + if (ret) + free_percpu(ic->i_cache_incs.percpu); + } + + return ret; +} + +static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache, + struct list_head *caller_list) +{ + struct rds_ib_cache_head *head; + int cpu; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + if (head->first) { + list_splice_entire_tail(head->first, caller_list); + head->first = NULL; + } + } + + if (cache->ready) { + list_splice_entire_tail(cache->ready, caller_list); + cache->ready = NULL; + } +} + +void rds_ib_recv_free_caches(struct rds_ib_connection *ic) +{ + struct rds_ib_incoming *inc; + struct rds_ib_incoming *inc_tmp; + struct rds_page_frag *frag; + struct rds_page_frag *frag_tmp; + LIST_HEAD(list); + + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list); + free_percpu(ic->i_cache_incs.percpu); + + list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) { + list_del(&inc->ii_cache_entry); + WARN_ON(!list_empty(&inc->ii_frags)); + kmem_cache_free(rds_ib_incoming_slab, inc); + } + + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list); + free_percpu(ic->i_cache_frags.percpu); + + list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) { + list_del(&frag->f_cache_entry); + WARN_ON(!list_empty(&frag->f_item)); + kmem_cache_free(rds_ib_frag_slab, frag); + } +} + +/* fwd decl */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache); +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache); + + +/* Recycle frag and attached recv buffer f_sg */ +static void rds_ib_frag_free(struct rds_ib_connection *ic, + struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); + + rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); + atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs); + rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE); +} + +/* Recycle inc after freeing attached frags */ +void rds_ib_inc_free(struct rds_incoming *inc) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + struct rds_ib_connection *ic = inc->i_conn->c_transport_data; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + + /* Free attached frags */ + list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + 
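/* rds_ib_frag_free() recycles the frag (and its attached page) into the + * per-connection i_cache_frags cache rather than freeing it, so later + * receive refills can reuse it. + */ +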
rds_ib_frag_free(ic, frag); + } + BUG_ON(!list_empty(&ibinc->ii_frags)); + + rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); + rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs); +} + +static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, + struct rds_ib_recv_work *recv) +{ + if (recv->r_ibinc) { + rds_inc_put(&recv->r_ibinc->ii_inc); + recv->r_ibinc = NULL; + } + if (recv->r_frag) { + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + } +} + +void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) +{ + u32 i; + + for (i = 0; i < ic->i_recv_ring.w_nr; i++) + rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); +} + +static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic, + gfp_t slab_mask) +{ + struct rds_ib_incoming *ibinc; + struct list_head *cache_item; + int avail_allocs; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs); + if (cache_item) { + ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry); + } else { + avail_allocs = atomic_add_unless(&rds_ib_allocation, + 1, rds_ib_sysctl_max_recv_allocation); + if (!avail_allocs) { + rds_ib_stats_inc(s_ib_rx_alloc_limit); + return NULL; + } + ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask); + if (!ibinc) { + atomic_dec(&rds_ib_allocation); + return NULL; + } + rds_ib_stats_inc(s_ib_rx_total_incs); + } + INIT_LIST_HEAD(&ibinc->ii_frags); + rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); + + return ibinc; +} + +static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic, + gfp_t slab_mask, gfp_t page_mask) +{ + struct rds_page_frag *frag; + struct list_head *cache_item; + int ret; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); + if (cache_item) { + frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); + atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs); + rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE); + } else { + frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); + if (!frag) + return NULL; + + sg_init_table(&frag->f_sg, 1); + ret = rds_page_remainder_alloc(&frag->f_sg, + RDS_FRAG_SIZE, page_mask); + if (ret) { + kmem_cache_free(rds_ib_frag_slab, frag); + return NULL; + } + rds_ib_stats_inc(s_ib_rx_total_frags); + } + + INIT_LIST_HEAD(&frag->f_item); + + return frag; +} + +static int rds_ib_recv_refill_one(struct rds_connection *conn, + struct rds_ib_recv_work *recv, gfp_t gfp) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_sge *sge; + int ret = -ENOMEM; + gfp_t slab_mask = GFP_NOWAIT; + gfp_t page_mask = GFP_NOWAIT; + + if (gfp & __GFP_DIRECT_RECLAIM) { + slab_mask = GFP_KERNEL; + page_mask = GFP_HIGHUSER; + } + + if (!ic->i_cache_incs.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + if (!ic->i_cache_frags.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + + /* + * ibinc was taken from recv if recv contained the start of a message. + * recvs that were continuations will still have this allocated. + */ + if (!recv->r_ibinc) { + recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask); + if (!recv->r_ibinc) + goto out; + } + + WARN_ON(recv->r_frag); /* leak! 
*/ + recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask); + if (!recv->r_frag) + goto out; + + ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, + 1, DMA_FROM_DEVICE); + WARN_ON(ret != 1); + + sge = &recv->r_sge[0]; + sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); + sge->length = sizeof(struct rds_header); + + sge = &recv->r_sge[1]; + sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg); + sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg); + + ret = 0; +out: + return ret; +} + +static int acquire_refill(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0; +} + +static void release_refill(struct rds_connection *conn) +{ + clear_bit(RDS_RECV_REFILL, &conn->c_flags); + + /* We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + +/* + * This tries to allocate and post unused work requests after making sure that + * they have all the allocations they need to queue received fragments into + * sockets. + * + * -1 is returned if posting fails due to temporary resource exhaustion. + */ +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_recv_work *recv; + struct ib_recv_wr *failed_wr; + unsigned int posted = 0; + int ret = 0; + bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM); + u32 pos; + + /* the goal here is to just make sure that someone, somewhere + * is posting buffers. If we can't get the refill lock, + * let them do their thing + */ + if (!acquire_refill(conn)) + return; + + while ((prefill || rds_conn_up(conn)) && + rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + if (pos >= ic->i_recv_ring.w_nr) { + printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", + pos); + break; + } + + recv = &ic->i_recvs[pos]; + ret = rds_ib_recv_refill_one(conn, recv, gfp); + if (ret) { + break; + } + + rdsdebug("recv %p ibinc %p page %p addr %lu\n", recv, + recv->r_ibinc, sg_page(&recv->r_frag->f_sg), + (long) ib_sg_dma_address( + ic->i_cm_id->device, + &recv->r_frag->f_sg)); + + /* XXX when can this fail? */ + ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); + if (ret) { + rds_ib_conn_error(conn, "recv post on " + "%pI4 returned %d, disconnecting and " + "reconnecting\n", &conn->c_faddr, + ret); + break; + } + + posted++; + } + + /* We're doing flow control - update the window. */ + if (ic->i_flowctl && posted) + rds_ib_advertise_credits(conn, posted); + + if (ret) + rds_ib_ring_unalloc(&ic->i_recv_ring, 1); + + release_refill(conn); + + /* if we're called from the softirq handler, we'll be GFP_NOWAIT. + * in this case the ring being low is going to lead to more interrupts + * and we can safely let the softirq code take care of it unless the + * ring is completely empty. + * + * if we're called from krdsd, we'll be GFP_KERNEL. In this case + * we might have raced with the softirq code while we had the refill + * lock held. Use rds_ib_ring_low() instead of ring_empty to decide + * if we should requeue. 
+ */ + if (rds_conn_up(conn) && + ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) || + rds_ib_ring_empty(&ic->i_recv_ring))) { + queue_delayed_work(rds_wq, &conn->c_recv_w, 1); + } +} + +/* + * We want to recycle several types of recv allocations, like incs and frags. + * To use this, the *_free() function passes in the ptr to a list_head within + * the recyclee, as well as the cache to put it on. + * + * First, we put the memory on a percpu list. When this reaches a certain size, + * we move it to an intermediate non-percpu list in a lockless manner, with some + * xchg/cmpxchg wizardry. + * + * N.B. Instead of a list_head as the anchor, we use a single pointer, which can + * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and + * list_empty() will return true even when one element is actually present. + */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache) +{ + unsigned long flags; + struct list_head *old, *chpfirst; + + local_irq_save(flags); + + chpfirst = __this_cpu_read(cache->percpu->first); + if (!chpfirst) + INIT_LIST_HEAD(new_item); + else /* put on front */ + list_add_tail(new_item, chpfirst); + + __this_cpu_write(cache->percpu->first, new_item); + __this_cpu_inc(cache->percpu->count); + + if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT) + goto end; + + /* + * Return our per-cpu first list to the cache's xfer by atomically + * grabbing the current xfer list, appending it to our per-cpu list, + * and then atomically returning that entire list back to the + * cache's xfer list as long as it's still empty. + */ + do { + old = xchg(&cache->xfer, NULL); + if (old) + list_splice_entire_tail(old, chpfirst); + old = cmpxchg(&cache->xfer, NULL, chpfirst); + } while (old); + + + __this_cpu_write(cache->percpu->first, NULL); + __this_cpu_write(cache->percpu->count, 0); +end: + local_irq_restore(flags); +} + +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache) +{ + struct list_head *head = cache->ready; + + if (head) { + if (!list_empty(head)) { + cache->ready = head->next; + list_del_init(head); + } else + cache->ready = NULL; + } + + return head; +} + +int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + unsigned long to_copy; + unsigned long frag_off = 0; + int copied = 0; + int ret; + u32 len; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + len = be32_to_cpu(inc->i_hdr.h_len); + + while (iov_iter_count(to) && copied < len) { + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + to_copy = min_t(unsigned long, iov_iter_count(to), + RDS_FRAG_SIZE - frag_off); + to_copy = min_t(unsigned long, to_copy, len - copied); + + /* XXX needs + offset for multiple recvs per page */ + rds_stats_add(s_copy_to_user, to_copy); + ret = copy_page_to_iter(sg_page(&frag->f_sg), + frag->f_sg.offset + frag_off, + to_copy, + to); + if (ret != to_copy) + return -EFAULT; + + frag_off += to_copy; + copied += to_copy; + } + + return copied; +} + +/* ic starts out kzalloc()ed */ +void rds_ib_recv_init_ack(struct rds_ib_connection *ic) +{ + struct ib_send_wr *wr = &ic->i_ack_wr; + struct ib_sge *sge = &ic->i_ack_sge; + + sge->addr = ic->i_ack_dma; + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_pd->local_dma_lkey; + + wr->sg_list =
sge; + wr->num_sge = 1; + wr->opcode = IB_WR_SEND; + wr->wr_id = RDS_IB_ACK_WR_ID; + wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; +} + +/* + * You'd think that with reliable IB connections you wouldn't need to ack + * messages that have been received. The problem is that IB hardware generates + * an ack message before it has DMAed the message into memory. This creates a + * potential message loss if the HCA is disabled for any reason between when it + * sends the ack and before the message is DMAed and processed. This is only a + * potential issue if another HCA is available for fail-over. + * + * When the remote host receives our ack they'll free the sent message from + * their send queue. To decrease the latency of this we always send an ack + * immediately after we've received messages. + * + * For simplicity, we only have one ack in flight at a time. This puts + * pressure on senders to have deep enough send queues to absorb the latency of + * a single ack frame being in flight. This might not be good enough. + * + * This is implemented by have a long-lived send_wr and sge which point to a + * statically allocated ack frame. This ack wr does not fall under the ring + * accounting that the tx and rx wrs do. The QP attribute specifically makes + * room for it beyond the ring size. Send completion notices its special + * wr_id and avoids working with the ring in that case. + */ +#ifndef KERNEL_HAS_ATOMIC64 +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) +{ + unsigned long flags; + + spin_lock_irqsave(&ic->i_ack_lock, flags); + ic->i_ack_next = seq; + if (ack_required) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + spin_unlock_irqrestore(&ic->i_ack_lock, flags); +} + +static u64 rds_ib_get_ack(struct rds_ib_connection *ic) +{ + unsigned long flags; + u64 seq; + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + spin_lock_irqsave(&ic->i_ack_lock, flags); + seq = ic->i_ack_next; + spin_unlock_irqrestore(&ic->i_ack_lock, flags); + + return seq; +} +#else +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) +{ + atomic64_set(&ic->i_ack_next, seq); + if (ack_required) { + smp_mb__before_atomic(); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + } +} + +static u64 rds_ib_get_ack(struct rds_ib_connection *ic) +{ + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + smp_mb__after_atomic(); + + return atomic64_read(&ic->i_ack_next); +} +#endif + + +static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits) +{ + struct rds_header *hdr = ic->i_ack; + struct ib_send_wr *failed_wr; + u64 seq; + int ret; + + seq = rds_ib_get_ack(ic); + + rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); + rds_message_populate_header(hdr, 0, 0, 0); + hdr->h_ack = cpu_to_be64(seq); + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + ic->i_ack_queued = jiffies; + + ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); + if (unlikely(ret)) { + /* Failed to send. Release the WR, and + * force another ACK. + */ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + rds_ib_stats_inc(s_ib_ack_send_failure); + + rds_ib_conn_error(ic->conn, "sending ack failed\n"); + } else + rds_ib_stats_inc(s_ib_ack_sent); +} + +/* + * There are 3 ways of getting acknowledgements to the peer: + * 1. We call rds_ib_attempt_ack from the recv completion handler + * to send an ACK-only frame. 
+ * However, there can be only one such frame in the send queue + * at any time, so we may have to postpone it. + * 2. When another (data) packet is transmitted while there's + * an ACK in the queue, we piggyback the ACK sequence number + * on the data packet. + * 3. If the ACK WR is done sending, we get called from the + * send queue completion handler, and check whether there's + * another ACK pending (postponed because the WR was on the + * queue). If so, we transmit it. + * + * We maintain 2 variables: + * - i_ack_flags, which keeps track of whether the ACK WR + * is currently in the send queue or not (IB_ACK_IN_FLIGHT) + * - i_ack_next, which is the last sequence number we received + * + * Potentially, send queue and receive queue handlers can run concurrently. + * It would be nice to not have to use a spinlock to synchronize things, + * but the one problem that rules this out is that 64bit updates are + * not atomic on all platforms. Things would be a lot simpler if + * we had atomic64 or maybe cmpxchg64 everywhere. + * + * Reconnecting complicates this picture just slightly. When we + * reconnect, we may be seeing duplicate packets. The peer + * is retransmitting them, because it hasn't seen an ACK for + * them. It is important that we ACK these. + * + * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with + * this flag set *MUST* be acknowledged immediately. + */ + +/* + * When we get here, we're called from the recv queue handler. + * Check whether we ought to transmit an ACK. + */ +void rds_ib_attempt_ack(struct rds_ib_connection *ic) +{ + unsigned int adv_credits; + + if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + return; + + if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { + rds_ib_stats_inc(s_ib_ack_send_delayed); + return; + } + + /* Can we get a send credit? */ + if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { + rds_ib_stats_inc(s_ib_tx_throttle); + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + return; + } + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + rds_ib_send_ack(ic, adv_credits); +} + +/* + * We get here from the send completion handler, when the + * adapter tells us the ACK frame was sent. + */ +void rds_ib_ack_send_complete(struct rds_ib_connection *ic) +{ + clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); + rds_ib_attempt_ack(ic); +} + +/* + * This is called by the regular xmit code when it wants to piggyback + * an ACK on an outgoing frame. + */ +u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) +{ + if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) + rds_ib_stats_inc(s_ib_ack_send_piggybacked); + return rds_ib_get_ack(ic); +} + +/* + * It's kind of lame that we're copying from the posted receive pages into + * long-lived bitmaps. We could have posted the bitmaps and rdma written into + * them. But receiving new congestion bitmaps should be a *rare* event, so + * hopefully we won't need to invest that complexity in making it more + * efficient. By copying we can share a simpler core with TCP which has to + * copy. 
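[Illustrative aside, before the congestion-map receive path below: a minimal stand-alone sketch of the one-ACK-in-flight scheme described above. This is user-space code only, using C11 atomics in place of the kernel's set_bit()/test_and_set_bit() helpers; the flag names merely mirror IB_ACK_REQUESTED and IB_ACK_IN_FLIGHT, and the send-credit throttling done by rds_ib_attempt_ack() is omitted.]

#include <stdatomic.h>
#include <stdio.h>

#define ACK_REQUESTED  (1u << 0)        /* mirrors IB_ACK_REQUESTED */
#define ACK_IN_FLIGHT  (1u << 1)        /* mirrors IB_ACK_IN_FLIGHT */

static _Atomic unsigned int ack_flags;

static void send_ack_frame(void)
{
        printf("ACK frame posted\n");
}

/* Receive path: send an ACK unless one is already on the wire. */
static void attempt_ack(void)
{
        if (!(atomic_load(&ack_flags) & ACK_REQUESTED))
                return;
        /* test_and_set_bit(IB_ACK_IN_FLIGHT, ...) equivalent */
        if (atomic_fetch_or(&ack_flags, ACK_IN_FLIGHT) & ACK_IN_FLIGHT) {
                printf("ACK postponed, one already in flight\n");
                return;
        }
        atomic_fetch_and(&ack_flags, ~ACK_REQUESTED);
        send_ack_frame();
}

/* Send-completion path: the ACK WR finished, retry any postponed ACK. */
static void ack_send_complete(void)
{
        atomic_fetch_and(&ack_flags, ~ACK_IN_FLIGHT);
        attempt_ack();
}

int main(void)
{
        atomic_fetch_or(&ack_flags, ACK_REQUESTED);
        attempt_ack();                  /* first ACK goes out */
        atomic_fetch_or(&ack_flags, ACK_REQUESTED);
        attempt_ack();                  /* postponed: one still in flight */
        ack_send_complete();            /* completion fires the postponed ACK */
        return 0;
}

[The output shows the point of the scheme: the second request is not lost, it is simply deferred until the completion handler retries it.]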
+ */ +static void rds_ib_cong_recv(struct rds_connection *conn, + struct rds_ib_incoming *ibinc) +{ + struct rds_cong_map *map; + unsigned int map_off; + unsigned int map_page; + struct rds_page_frag *frag; + unsigned long frag_off; + unsigned long to_copy; + unsigned long copied; + uint64_t uncongested = 0; + void *addr; + + /* catch completely corrupt packets */ + if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map = conn->c_fcong; + map_page = 0; + map_off = 0; + + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + frag_off = 0; + + copied = 0; + + while (copied < RDS_CONG_MAP_BYTES) { + uint64_t *src, *dst; + unsigned int k; + + to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); + BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ + + addr = kmap_atomic(sg_page(&frag->f_sg)); + + src = addr + frag->f_sg.offset + frag_off; + dst = (void *)map->m_page_addrs[map_page] + map_off; + for (k = 0; k < to_copy; k += 8) { + /* Record ports that became uncongested, ie + * bits that changed from 0 to 1. */ + uncongested |= ~(*src) & *dst; + *dst++ = *src++; + } + kunmap_atomic(addr); + + copied += to_copy; + + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + + frag_off += to_copy; + if (frag_off == RDS_FRAG_SIZE) { + frag = list_entry(frag->f_item.next, + struct rds_page_frag, f_item); + frag_off = 0; + } + } + + /* the congestion map is in little endian order */ + uncongested = le64_to_cpu(uncongested); + + rds_cong_map_updated(map, uncongested); +} + +static void rds_ib_process_recv(struct rds_connection *conn, + struct rds_ib_recv_work *recv, u32 data_len, + struct rds_ib_ack_state *state) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_incoming *ibinc = ic->i_ibinc; + struct rds_header *ihdr, *hdr; + + /* XXX shut down the connection if port 0,0 are seen? */ + + rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv, + data_len); + + if (data_len < sizeof(struct rds_header)) { + rds_ib_conn_error(conn, "incoming message " + "from %pI4 didn't include a " + "header, disconnecting and " + "reconnecting\n", + &conn->c_faddr); + return; + } + data_len -= sizeof(struct rds_header); + + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; + + /* Validate the checksum. */ + if (!rds_message_verify_checksum(ihdr)) { + rds_ib_conn_error(conn, "incoming message " + "from %pI4 has corrupted header - " + "forcing a reconnect\n", + &conn->c_faddr); + rds_stats_inc(s_recv_drop_bad_checksum); + return; + } + + /* Process the ACK sequence which comes with every packet */ + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv_valid = 1; + + /* Process the credits update if there was one */ + if (ihdr->h_credit) + rds_ib_send_add_credits(conn, ihdr->h_credit); + + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) { + /* This is an ACK-only packet. The fact that it gets + * special treatment here is that historically, ACKs + * were rather special beasts. + */ + rds_ib_stats_inc(s_ib_ack_received); + + /* + * Usually the frags make their way on to incs and are then freed as + * the inc is freed. We don't go that route, so we have to drop the + * page ref ourselves. We can't just leave the page on the recv + * because that confuses the dma mapping of pages and each recv's use + * of a partial page. + * + * FIXME: Fold this into the code path below. 
+ */ + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + return; + } + + /* + * If we don't already have an inc on the connection then this + * fragment has a header and starts a message.. copy its header + * into the inc and save the inc so we can hang upcoming fragments + * off its list. + */ + if (!ibinc) { + ibinc = recv->r_ibinc; + recv->r_ibinc = NULL; + ic->i_ibinc = ibinc; + + hdr = &ibinc->ii_inc.i_hdr; + ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = + local_clock(); + memcpy(hdr, ihdr, sizeof(*hdr)); + ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] = + local_clock(); + + rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, + ic->i_recv_data_rem, hdr->h_flags); + } else { + hdr = &ibinc->ii_inc.i_hdr; + /* We can't just use memcmp here; fragments of a + * single message may carry different ACKs */ + if (hdr->h_sequence != ihdr->h_sequence || + hdr->h_len != ihdr->h_len || + hdr->h_sport != ihdr->h_sport || + hdr->h_dport != ihdr->h_dport) { + rds_ib_conn_error(conn, + "fragment header mismatch; forcing reconnect\n"); + return; + } + } + + list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags); + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDS_FRAG_SIZE) + ic->i_recv_data_rem -= RDS_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_ibinc = NULL; + + if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_ib_cong_recv(conn, ibinc); + else { + rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &ibinc->ii_inc, GFP_ATOMIC); + state->ack_next = be64_to_cpu(hdr->h_sequence); + state->ack_next_valid = 1; + } + + /* Evaluate the ACK_REQUIRED flag *after* we received + * the complete frame, and after bumping the next_rx + * sequence. */ + if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { + rds_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + + rds_inc_put(&ibinc->ii_inc); + } +} + +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, + struct ib_wc *wc, + struct rds_ib_ack_state *state) +{ + struct rds_connection *conn = ic->conn; + struct rds_ib_recv_work *recv; + + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + ib_wc_status_msg(wc->status), wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); + + rds_ib_stats_inc(s_ib_rx_cq_event); + recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, + DMA_FROM_DEVICE); + + /* Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (wc->status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc->byte_len, state); + } else { + /* We expect errors as the qp is drained during shutdown */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) + rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + &conn->c_laddr, &conn->c_faddr, + wc->status, + ib_wc_status_msg(wc->status)); + } + + /* rds_ib_process_recv() doesn't always consume the frag, and + * we might not have called it at all if the wc didn't indicate + * success. We already unmapped the frag's pages, though, and + * the following rds_ib_ring_free() call tells the refill path + * that it will not find an allocated frag here. Make sure we + * keep that promise by freeing a frag that's still on the ring. 
+ */ + if (recv->r_frag) { + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + } + rds_ib_ring_free(&ic->i_recv_ring, 1); + + /* If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. */ + if (rds_ib_ring_empty(&ic->i_recv_ring)) + rds_ib_stats_inc(s_ib_rx_ring_empty); + + if (rds_ib_ring_low(&ic->i_recv_ring)) { + rds_ib_recv_refill(conn, 0, GFP_NOWAIT); + rds_ib_stats_inc(s_ib_rx_refill_from_cq); + } +} + +int rds_ib_recv_path(struct rds_conn_path *cp) +{ + struct rds_connection *conn = cp->cp_conn; + struct rds_ib_connection *ic = conn->c_transport_data; + int ret = 0; + + rdsdebug("conn %p\n", conn); + if (rds_conn_up(conn)) { + rds_ib_attempt_ack(ic); + rds_ib_recv_refill(conn, 0, GFP_KERNEL); + rds_ib_stats_inc(s_ib_rx_refill_from_thread); + } + + return ret; +} + +int rds_ib_recv_init(void) +{ + struct sysinfo si; + int ret = -ENOMEM; + + /* Default to 30% of all available RAM for recv memory */ + si_meminfo(&si); + rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; + + rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", + sizeof(struct rds_ib_incoming), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_incoming_slab) + goto out; + + rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", + sizeof(struct rds_page_frag), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_frag_slab) { + kmem_cache_destroy(rds_ib_incoming_slab); + rds_ib_incoming_slab = NULL; + } else + ret = 0; +out: + return ret; +} + +void rds_ib_recv_exit(void) +{ + kmem_cache_destroy(rds_ib_incoming_slab); + kmem_cache_destroy(rds_ib_frag_slab); +} diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c new file mode 100644 index 0000000..ff97e8e --- /dev/null +++ b/net/rds/ib_ring.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include + +#include "rds.h" +#include "ib.h" + +/* + * Locking for IB rings. + * We assume that allocation is always protected by a mutex + * in the caller (this is a valid assumption for the current + * implementation). + * + * Freeing always happens in an interrupt, and hence only + * races with allocations, but not with other free()s. 
+ * + * The interaction between allocation and freeing is that + * the alloc code has to determine the number of free entries. + * To this end, we maintain two counters; an allocation counter + * and a free counter. Both are allowed to run freely, and wrap + * around. + * The number of used entries is always (alloc_ctr - free_ctr) % NR. + * + * The current implementation makes free_ctr atomic. When the + * caller finds an allocation fails, it should set an "alloc fail" + * bit and retry the allocation. The "alloc fail" bit essentially tells + * the CQ completion handlers to wake it up after freeing some + * more entries. + */ + +/* + * This only happens on shutdown. + */ +DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait); + +void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr) +{ + memset(ring, 0, sizeof(*ring)); + ring->w_nr = nr; + rdsdebug("ring %p nr %u\n", ring, ring->w_nr); +} + +static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring) +{ + u32 diff; + + /* This assumes that atomic_t has at least as many bits as u32 */ + diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); + BUG_ON(diff > ring->w_nr); + + return diff; +} + +void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr) +{ + /* We only ever get called from the connection setup code, + * prior to creating the QP. */ + BUG_ON(__rds_ib_ring_used(ring)); + ring->w_nr = nr; +} + +static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_used(ring) == 0; +} + +u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos) +{ + u32 ret = 0, avail; + + avail = ring->w_nr - __rds_ib_ring_used(ring); + + rdsdebug("ring %p val %u next %u free %u\n", ring, val, + ring->w_alloc_ptr, avail); + + if (val && avail) { + ret = min(val, avail); + *pos = ring->w_alloc_ptr; + + ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; + ring->w_alloc_ctr += ret; + } + + return ret; +} + +void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val) +{ + ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; + atomic_add(val, &ring->w_free_ctr); + + if (__rds_ib_ring_empty(ring) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); +} + +void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val) +{ + ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; + ring->w_alloc_ctr -= val; +} + +int rds_ib_ring_empty(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_empty(ring); +} + +int rds_ib_ring_low(struct rds_ib_work_ring *ring) +{ + return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1); +} + +/* + * returns the oldest alloced ring entry. This will be the next one + * freed. This can't be called if there are none allocated. + */ +u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring) +{ + return ring->w_free_ptr; +} + +/* + * returns the number of completed work requests. + */ + +u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest) +{ + u32 ret; + + if (oldest <= (unsigned long long)wr_id) + ret = (unsigned long long)wr_id - oldest + 1; + else + ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; + + rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, + wr_id, oldest); + return ret; +} diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c new file mode 100644 index 0000000..8557a1c --- /dev/null +++ b/net/rds/ib_send.c @@ -0,0 +1,995 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds_single_path.h" +#include "rds.h" +#include "ib.h" + +/* + * Convert IB-specific error message to RDS error message and call core + * completion handler. + */ +static void rds_ib_send_complete(struct rds_message *rm, + int wc_status, + void (*complete)(struct rds_message *rm, int status)) +{ + int notify_status; + + switch (wc_status) { + case IB_WC_WR_FLUSH_ERR: + return; + + case IB_WC_SUCCESS: + notify_status = RDS_RDMA_SUCCESS; + break; + + case IB_WC_REM_ACCESS_ERR: + notify_status = RDS_RDMA_REMOTE_ERROR; + break; + + default: + notify_status = RDS_RDMA_OTHER_ERROR; + break; + } + complete(rm, notify_status); +} + +static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, + struct rm_rdma_op *op, + int wc_status) +{ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; + } + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * we would need to take an event for the rdma WR. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. 
+ */ + rds_ib_send_complete(container_of(op, struct rds_message, rdma), + wc_status, rds_rdma_send_complete); + + if (op->op_write) + rds_stats_add(s_send_rdma_bytes, op->op_bytes); + else + rds_stats_add(s_recv_rdma_bytes, op->op_bytes); +} + +static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, + struct rm_atomic_op *op, + int wc_status) +{ + /* unmap atomic recvbuf */ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, + DMA_FROM_DEVICE); + op->op_mapped = 0; + } + + rds_ib_send_complete(container_of(op, struct rds_message, atomic), + wc_status, rds_atomic_send_complete); + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) + rds_ib_stats_inc(s_ib_atomic_cswp); + else + rds_ib_stats_inc(s_ib_atomic_fadd); +} + +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + struct rds_message *rm = container_of(op, struct rds_message, data); + + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); + + if (rm->rdma.op_active && rm->data.op_notify) + rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status); +} + +/* + * Unmap the resources associated with a struct send_work. + * + * Returns the rm for no good reason other than it is unobtainable + * other than by switching on wr.opcode, currently, and the caller, + * the event handler, needs it. + */ +static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + int wc_status) +{ + struct rds_message *rm = NULL; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, data); + rds_ib_send_unmap_data(ic, send->s_op, wc_status); + } + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, rdma); + rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); + } + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_ATOMIC_CMP_AND_SWP: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, atomic); + rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); + } + break; + default: + printk_ratelimited(KERN_NOTICE + "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; + } + + send->s_wr.opcode = 0xdead; + + return rm; +} + +void rds_ib_send_init_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + struct ib_sge *sge; + + send->s_op = NULL; + + send->s_wr.wr_id = i; + send->s_wr.sg_list = send->s_sge; + send->s_wr.ex.imm_data = 0; + + sge = &send->s_sge[0]; + sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_pd->local_dma_lkey; + + send->s_sge[1].lkey = ic->i_pd->local_dma_lkey; + } +} + +void rds_ib_send_clear_ring(struct rds_ib_connection *ic) +{ + struct rds_ib_send_work *send; + u32 i; + + for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { + if (send->s_op && send->s_wr.opcode != 0xdead) + rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); + } +} + +/* + * The only fast path caller always has a non-zero nr, so we don't + * bother testing nr before performing the atomic sub. 
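[Illustrative aside: the unmap helpers above recover the owning rds_message from a pointer to one of its embedded ops via container_of(). A small stand-alone sketch of that idiom; the structure names here are invented for the example and are not the RDS types.]

#include <stddef.h>
#include <stdio.h>

/* Invented types, standing in for rds_message and its embedded ops. */
struct data_op {
        int nents;
};

struct message {
        int id;
        struct data_op data;
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static void complete(struct data_op *op)
{
        /* Recover the enclosing message from a pointer to its member. */
        struct message *msg = container_of(op, struct message, data);

        printf("completed message %d (%d entries)\n", msg->id, op->nents);
}

int main(void)
{
        struct message m = { .id = 42, .data = { .nents = 3 } };

        complete(&m.data);
        return 0;
}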
+ */ +static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) +{ + if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); + BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); +} + +/* + * The _oldest/_free ring operations here race cleanly with the alloc/unalloc + * operations performed in the send path. As the sender allocs and potentially + * unallocs the next free entry in the ring it doesn't alter which is + * the next to be freed, which is what this is concerned with. + */ +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) +{ + struct rds_message *rm = NULL; + struct rds_connection *conn = ic->conn; + struct rds_ib_send_work *send; + u32 completed; + u32 oldest; + u32 i = 0; + int nr_sig = 0; + + + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + ib_wc_status_msg(wc->status), wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); + rds_ib_stats_inc(s_ib_tx_cq_event); + + if (wc->wr_id == RDS_IB_ACK_WR_ID) { + if (time_after(jiffies, ic->i_ack_queued + HZ / 2)) + rds_ib_stats_inc(s_ib_tx_stalled); + rds_ib_ack_send_complete(ic); + return; + } + + oldest = rds_ib_ring_oldest(&ic->i_send_ring); + + completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest); + + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + + rm = rds_ib_send_unmap_op(ic, send, wc->status); + + if (time_after(jiffies, send->s_queued + HZ / 2)) + rds_ib_stats_inc(s_ib_tx_stalled); + + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get + * flushed out, wake them up now + */ + rds_message_unmapped(rm); + } + rds_message_put(rm); + send->s_op = NULL; + } + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; + + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + /* We expect errors as the qp is drained during shutdown */ + if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + &conn->c_laddr, &conn->c_faddr, wc->status, + ib_wc_status_msg(wc->status)); + } +} + +/* + * This is the main function for allocating credits when sending + * messages. + * + * Conceptually, we have two counters: + * - send credits: this tells us how many WRs we're allowed + * to submit without overruning the receiver's queue. For + * each SEND WR we post, we decrement this by one. + * + * - posted credits: this tells us how many WRs we recently + * posted to the receive queue. This value is transferred + * to the peer as a "credit update" in a RDS header field. + * Every time we transmit credits to the peer, we subtract + * the amount of transferred credits from this counter. + * + * It is essential that we avoid situations where both sides have + * exhausted their send credits, and are unable to send new credits + * to the peer. We achieve this by requiring that we send at least + * one credit update to the peer before exhausting our credits. 
+ * When new credits arrive, we subtract one credit that is withheld + * until we've posted new buffers and are ready to transmit these + * credits (see rds_ib_send_add_credits below). + * + * The RDS send code is essentially single-threaded; rds_send_xmit + * sets RDS_IN_XMIT to ensure exclusive access to the send ring. + * However, the ACK sending code is independent and can race with + * message SENDs. + * + * In the send path, we need to update the counters for send credits + * and the counter of posted buffers atomically - when we use the + * last available credit, we cannot allow another thread to race us + * and grab the posted credits counter. Hence, we have to use a + * spinlock to protect the credit counter, or use atomics. + * + * Spinlocks shared between the send and the receive path are bad, + * because they create unnecessary delays. An early implementation + * using a spinlock showed a 5% degradation in throughput at some + * loads. + * + * This implementation avoids spinlocks completely, putting both + * counters into a single atomic, and updating that atomic using + * atomic_add (in the receive path, when receiving fresh credits), + * and using atomic_cmpxchg when updating the two counters. + */ +int rds_ib_send_grab_credits(struct rds_ib_connection *ic, + u32 wanted, u32 *adv_credits, int need_posted, int max_posted) +{ + unsigned int avail, posted, got = 0, advertise; + long oldval, newval; + + *adv_credits = 0; + if (!ic->i_flowctl) + return wanted; + +try_again: + advertise = 0; + oldval = newval = atomic_read(&ic->i_credits); + posted = IB_GET_POST_CREDITS(oldval); + avail = IB_GET_SEND_CREDITS(oldval); + + rdsdebug("wanted=%u credits=%u posted=%u\n", + wanted, avail, posted); + + /* The last credit must be used to send a credit update. */ + if (avail && !posted) + avail--; + + if (avail < wanted) { + struct rds_connection *conn = ic->i_cm_id->context; + + /* Oops, there aren't that many credits left! */ + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + got = avail; + } else { + /* Sometimes you get what you want, lalala. */ + got = wanted; + } + newval -= IB_SET_SEND_CREDITS(got); + + /* + * If need_posted is non-zero, then the caller wants + * the posted regardless of whether any send credits are + * available. + */ + if (posted && (got || need_posted)) { + advertise = min_t(unsigned int, posted, max_posted); + newval -= IB_SET_POST_CREDITS(advertise); + } + + /* Finally bill everything */ + if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) + goto try_again; + + *adv_credits = advertise; + return got; +} + +void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (credits == 0) + return; + + rdsdebug("credits=%u current=%u%s\n", + credits, + IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), + test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : ""); + + atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); + + rds_ib_stats_inc(s_ib_rx_credit_updates); +} + +void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + + if (posted == 0) + return; + + atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); + + /* Decide whether to send an update to the peer now. 
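[Illustrative aside: a stand-alone user-space sketch of the "both counters in a single atomic" scheme that rds_ib_send_grab_credits() above implements. The 16-bit split and the GET/SET helpers below only mirror what the IB_GET_*/IB_SET_* macros are assumed to do in ib.h; the withheld last credit and the LL_SEND_FULL handling are left out so the compare-and-swap loop stays visible.]

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed layout: low 16 bits = send credits, high 16 bits = posted credits. */
#define GET_SEND(v)   ((v) & 0xffff)
#define GET_POST(v)   ((v) >> 16)
#define SET_SEND(v)   ((uint32_t)(v) & 0xffff)
#define SET_POST(v)   ((uint32_t)(v) << 16)

static _Atomic uint32_t credits;

/* Receive path: fresh credits arrive from the peer; a plain atomic add is enough. */
static void add_send_credits(uint32_t n)
{
        atomic_fetch_add(&credits, SET_SEND(n));
}

/* Send path: consume up to @wanted send credits and withdraw the posted
 * credits we should advertise, updating both halves with one CAS.  The
 * subtraction cannot borrow across halves because we never take more than
 * each half currently holds. */
static uint32_t grab_credits(uint32_t wanted, uint32_t *advertise)
{
        uint32_t oldval, newval, got;

        do {
                oldval = atomic_load(&credits);
                got = GET_SEND(oldval) < wanted ? GET_SEND(oldval) : wanted;
                *advertise = GET_POST(oldval);
                newval = oldval - SET_SEND(got) - SET_POST(*advertise);
        } while (!atomic_compare_exchange_weak(&credits, &oldval, newval));

        return got;
}

int main(void)
{
        uint32_t adv, got;

        atomic_store(&credits, SET_SEND(4) | SET_POST(2));
        add_send_credits(3);
        got = grab_credits(5, &adv);
        printf("got %u send credits, advertising %u posted buffers\n", got, adv);
        return 0;
}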
+ * If we would send a credit update for every single buffer we + * post, we would end up with an ACK storm (ACK arrives, + * consumes buffer, we refill the ring, send ACK to remote + * advertising the newly posted buffer... ad inf) + * + * Performance pretty much depends on how often we send + * credit updates - too frequent updates mean lots of ACKs. + * Too infrequent updates, and the peer will run out of + * credits and has to throttle. + * For the time being, 16 seems to be a good compromise. + */ + if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); +} + +static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + bool notify) +{ + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0 || notify) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED; + return 1; + } + return 0; +} + +/* + * This can be called multiple times for a given message. The first time + * we see a message we map its scatterlist into the IB device so that + * we can provide that mapped address to the IB scatter gather entries + * in the IB work requests. We translate the scatterlist into a series + * of work requests that fragment the message. These work requests complete + * in order so we pass ownership of the message to the completion handler + * once we send the final fragment. + * + * The RDS core uses the c_send_lock to only enter this function once + * per connection. This makes sure that the tx ring alloc/unalloc pairs + * don't get out of sync and confuse the ring. + */ +int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct ib_device *dev = ic->i_cm_id->device; + struct rds_ib_send_work *send = NULL; + struct rds_ib_send_work *first; + struct rds_ib_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + u32 pos; + u32 i; + u32 work_alloc; + u32 credit_alloc = 0; + u32 posted; + u32 adv_credits = 0; + int send_flags = 0; + int bytes_sent = 0; + int ret; + int flow_controlled = 0; + int nr_sig = 0; + + BUG_ON(off % RDS_FRAG_SIZE); + BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); + + /* Do not send cong updates to IB loopback */ + if (conn->c_loopback + && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + scat = &rm->data.op_sg[sg]; + ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length); + return sizeof(struct rds_header) + ret; + } + + /* FIXME we may overallocate here */ + if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) + i = 1; + else + i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + if (ic->i_flowctl) { + credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); + adv_credits += posted; + if (credit_alloc < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); + work_alloc = credit_alloc; + flow_controlled = 1; + } + if (work_alloc == 0) { + set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_ib_stats_inc(s_ib_tx_throttle); + 
ret = -ENOMEM; + goto out; + } + } + + /* map the message the first time we see it */ + if (!ic->i_data_op) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + } else { + rm->data.op_count = 0; + } + + rds_message_addref(rm); + rm->data.op_dmasg = 0; + rm->data.op_dmaoff = 0; + ic->i_data_op = &rm->data; + + /* Finalize the header */ + if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + + /* If it has a RDMA op, tell the peer we did it. This is + * used by the peer to release use-once RDMA MRs. */ + if (rm->rdma.op_active) { + struct rds_ext_header_rdma ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + } + if (rm->m_rdma_cookie) { + rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie)); + } + + /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so + * we should not do this unless we have a chance of at least + * sticking the header into the send ring. Which is why we + * should call rds_ib_ring_alloc first. */ + rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic)); + rds_message_make_checksum(&rm->m_inc.i_hdr); + + /* + * Update adv_credits since we reset the ACK_REQUIRED bit. + */ + if (ic->i_flowctl) { + rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } + } + + /* Sometimes you want to put a fence between an RDMA + * READ and the following SEND. + * We could either do this all the time + * or when requested by the user. Right now, we let + * the application choose. + */ + if (rm->rdma.op_active && rm->rdma.op_fence) + send_flags = IB_SEND_FENCE; + + /* Each frag gets a header. Msgs may be 0 bytes */ + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &ic->i_data_op->op_sg[rm->data.op_dmasg]; + i = 0; + do { + unsigned int len = 0; + + /* Set up the header */ + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; + + send->s_sge[0].addr = ic->i_send_hdrs_dma + + (pos * sizeof(struct rds_header)); + send->s_sge[0].length = sizeof(struct rds_header); + + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + + /* Set up the data, if present */ + if (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]) { + len = min(RDS_FRAG_SIZE, + ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff); + send->s_wr.num_sge = 2; + + send->s_sge[1].addr = ib_sg_dma_address(dev, scat); + send->s_sge[1].addr += rm->data.op_dmaoff; + send->s_sge[1].length = len; + + bytes_sent += len; + rm->data.op_dmaoff += len; + if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) { + scat++; + rm->data.op_dmasg++; + rm->data.op_dmaoff = 0; + } + } + + rds_ib_set_wr_signal_state(ic, send, false); + + /* + * Always signal the last one if we're stopping due to flow control. 
+ */ + if (ic->i_flowctl && flow_controlled && i == (work_alloc - 1)) { + rds_ib_set_wr_signal_state(ic, send, true); + send->s_wr.send_flags |= IB_SEND_SOLICITED; + } + + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_wr, send->s_wr.num_sge, send->s_wr.next); + + if (ic->i_flowctl && adv_credits) { + struct rds_header *hdr = &ic->i_send_hdrs[pos]; + + /* add credit and redo the header checksum */ + hdr->h_credit = adv_credits; + rds_message_make_checksum(hdr); + adv_credits = 0; + rds_ib_stats_inc(s_ib_tx_credit_updates); + } + + if (prev) + prev->s_wr.next = &send->s_wr; + prev = send; + + pos = (pos + 1) % ic->i_send_ring.w_nr; + send = &ic->i_sends[pos]; + i++; + + } while (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]); + + /* Account the RDS header in the number of bytes we sent, but just once. + * The caller has no concept of fragmentation. */ + if (hdr_off == 0) + bytes_sent += sizeof(struct rds_header); + + /* if we finished the message then send completion owns it */ + if (scat == &rm->data.op_sg[rm->data.op_count]) { + prev->s_op = ic->i_data_op; + prev->s_wr.send_flags |= IB_SEND_SOLICITED; + if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) + nr_sig += rds_ib_set_wr_signal_state(ic, prev, true); + ic->i_data_op = NULL; + } + + /* Put back wrs & credits we didn't use */ + if (i < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + if (ic->i_flowctl && i < credit_alloc) + rds_ib_send_add_credits(conn, credit_alloc - i); + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + /* XXX need to worry about failed_wr and partial sends. */ + failed_wr = &first->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + if (prev->s_op) { + ic->i_data_op = prev->s_op; + prev->s_op = NULL; + } + + rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); + goto out; + } + + ret = bytes_sent; +out: + BUG_ON(adv_credits); + return ret; +} + +/* + * Issue atomic operation. + * A simplified version of the rdma case, we always map 1 SG, and + * only 8 bytes, for the return value from the atomic operation. 
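[Illustrative aside: the completion-signaling batching applied by rds_ib_set_wr_signal_state() above, reduced to a stand-alone user-space sketch. MAX_UNSIGNALED_WRS is a made-up constant standing in for rds_ib_sysctl_max_unsig_wrs; with its default of 16, only 5 of 100 unsolicited work requests end up asking for a completion.]

#include <stdbool.h>
#include <stdio.h>

#define MAX_UNSIGNALED_WRS 16   /* stand-in for rds_ib_sysctl_max_unsig_wrs */

static unsigned int unsignaled_wrs = MAX_UNSIGNALED_WRS;

/* Returns true when this work request should request a completion: either
 * the caller insists (notify) or the unsignaled budget has run out. */
static bool wr_needs_signal(bool notify)
{
        if (unsignaled_wrs-- == 0 || notify) {
                unsignaled_wrs = MAX_UNSIGNALED_WRS;
                return true;
        }
        return false;
}

int main(void)
{
        unsigned int i, signaled = 0;

        for (i = 0; i < 100; i++)
                if (wr_needs_signal(false))
                        signaled++;

        printf("signaled %u of 100 work requests\n", signaled);
        return 0;
}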
+ */ +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct ib_send_wr *failed_wr; + struct rds_ib_device *rds_ibdev; + u32 pos; + u32 work_alloc; + int ret; + int nr_sig = 0; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); + if (work_alloc != 1) { + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + /* address of send request in ring */ + send = &ic->i_sends[pos]; + send->s_queued = jiffies; + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { + send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + send->s_atomic_wr.compare_add = op->op_m_cswp.compare; + send->s_atomic_wr.swap = op->op_m_cswp.swap; + send->s_atomic_wr.compare_add_mask = op->op_m_cswp.compare_mask; + send->s_atomic_wr.swap_mask = op->op_m_cswp.swap_mask; + } else { /* FADD */ + send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + send->s_atomic_wr.compare_add = op->op_m_fadd.add; + send->s_atomic_wr.swap = 0; + send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask; + send->s_atomic_wr.swap_mask = 0; + } + send->s_wr.send_flags = 0; + nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); + send->s_atomic_wr.wr.num_sge = 1; + send->s_atomic_wr.wr.next = NULL; + send->s_atomic_wr.remote_addr = op->op_remote_addr; + send->s_atomic_wr.rkey = op->op_rkey; + send->s_op = op; + rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); + + /* map 8 byte retval buffer to the device */ + ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); + rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); + if (ret != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? 
*/ + goto out; + } + + /* Convert our struct scatterlist to struct ib_sge */ + send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; + + rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, + send->s_sge[0].addr, send->s_sge[0].length); + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + failed_wr = &send->s_atomic_wr.wr; + ret = ib_post_send(ic->i_cm_id->qp, &send->s_atomic_wr.wr, &failed_wr); + rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, + send, &send->s_atomic_wr, ret, failed_wr); + BUG_ON(failed_wr != &send->s_atomic_wr.wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + goto out; + } + + if (unlikely(failed_wr != &send->s_atomic_wr.wr)) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &send->s_atomic_wr.wr); + } + +out: + return ret; +} + +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct rds_ib_send_work *first; + struct rds_ib_send_work *prev; + struct ib_send_wr *failed_wr; + struct scatterlist *scat; + unsigned long len; + u64 remote_addr = op->op_remote_addr; + u32 max_sge = ic->rds_ibdev->max_sge; + u32 pos; + u32 work_alloc; + u32 i; + u32 j; + int sent; + int ret; + int num_sge; + int nr_sig = 0; + + /* map the op the first time we see it */ + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + op->op_mapped = 1; + } + + /* + * Instead of knowing how to return a partial rdma read/write we insist that there + * be enough work requests to send the entire message. + */ + i = ceil(op->op_count, max_sge); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc != i) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &op->op_sg[0]; + sent = 0; + num_sge = op->op_count; + + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { + send->s_wr.send_flags = 0; + send->s_queued = jiffies; + send->s_op = NULL; + + nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); + + send->s_wr.opcode = op->op_write ? 
IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; + send->s_rdma_wr.remote_addr = remote_addr; + send->s_rdma_wr.rkey = op->op_rkey; + + if (num_sge > max_sge) { + send->s_rdma_wr.wr.num_sge = max_sge; + num_sge -= max_sge; + } else { + send->s_rdma_wr.wr.num_sge = num_sge; + } + + send->s_rdma_wr.wr.next = NULL; + + if (prev) + prev->s_rdma_wr.wr.next = &send->s_rdma_wr.wr; + + for (j = 0; j < send->s_rdma_wr.wr.num_sge && + scat != &op->op_sg[op->op_count]; j++) { + len = ib_sg_dma_len(ic->i_cm_id->device, scat); + send->s_sge[j].addr = + ib_sg_dma_address(ic->i_cm_id->device, scat); + send->s_sge[j].length = len; + send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; + + sent += len; + rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); + + remote_addr += len; + scat++; + } + + rdsdebug("send %p wr %p num_sge %u next %p\n", send, + &send->s_rdma_wr.wr, + send->s_rdma_wr.wr.num_sge, + send->s_rdma_wr.wr.next); + + prev = send; + if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) + send = ic->i_sends; + } + + /* give a reference to the last op */ + if (scat == &op->op_sg[op->op_count]) { + prev->s_op = op; + rds_message_addref(container_of(op, struct rds_message, rdma)); + } + + if (i < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + failed_wr = &first->s_rdma_wr.wr; + ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr); + rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, + first, &first->s_rdma_wr.wr, ret, failed_wr); + BUG_ON(failed_wr != &first->s_rdma_wr.wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + goto out; + } + + if (unlikely(failed_wr != &first->s_rdma_wr.wr)) { + printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &first->s_rdma_wr.wr); + } + + +out: + return ret; +} + +void rds_ib_xmit_path_complete(struct rds_conn_path *cp) +{ + struct rds_connection *conn = cp->cp_conn; + struct rds_ib_connection *ic = conn->c_transport_data; + + /* We may have a pending ACK or window update we were unable + * to send previously (due to flow control). Try again. */ + rds_ib_attempt_ack(ic); +} diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c new file mode 100644 index 0000000..9252ad1 --- /dev/null +++ b/net/rds/ib_stats.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
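[Illustrative aside: rds_ib_xmit_rdma() above refuses partial RDMA transfers and instead allocates enough work requests up front, ceil(op_count, max_sge) of them, each carrying at most max_sge scatter-gather entries. A small stand-alone sketch of that split; the numbers are arbitrary and CEIL() stands in for the patch's ceil() helper.]

#include <stdio.h>

/* ceil(x / y) for positive integers. */
#define CEIL(x, y) (((x) + (y) - 1) / (y))

int main(void)
{
        unsigned int op_count = 22;     /* mapped scatterlist entries (arbitrary) */
        unsigned int max_sge = 8;       /* device limit on SGEs per work request */
        unsigned int wrs = CEIL(op_count, max_sge);
        unsigned int i, num_sge = op_count;

        printf("%u entries over %u work requests:\n", op_count, wrs);
        for (i = 0; i < wrs; i++) {
                unsigned int this_wr = num_sge > max_sge ? max_sge : num_sge;

                printf("  WR %u carries %u SGEs\n", i, this_wr);
                num_sge -= this_wr;
        }
        return 0;
}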
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "ib.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); + +static const char *const rds_ib_stat_names[] = { + "ib_connect_raced", + "ib_listen_closed_stale", + "s_ib_evt_handler_call", + "ib_tasklet_call", + "ib_tx_cq_event", + "ib_tx_ring_full", + "ib_tx_throttle", + "ib_tx_sg_mapping_failure", + "ib_tx_stalled", + "ib_tx_credit_updates", + "ib_rx_cq_event", + "ib_rx_ring_empty", + "ib_rx_refill_from_cq", + "ib_rx_refill_from_thread", + "ib_rx_alloc_limit", + "ib_rx_total_frags", + "ib_rx_total_incs", + "ib_rx_credit_updates", + "ib_ack_sent", + "ib_ack_send_failure", + "ib_ack_send_delayed", + "ib_ack_send_piggybacked", + "ib_ack_received", + "ib_rdma_mr_8k_alloc", + "ib_rdma_mr_8k_free", + "ib_rdma_mr_8k_used", + "ib_rdma_mr_8k_pool_flush", + "ib_rdma_mr_8k_pool_wait", + "ib_rdma_mr_8k_pool_depleted", + "ib_rdma_mr_1m_alloc", + "ib_rdma_mr_1m_free", + "ib_rdma_mr_1m_used", + "ib_rdma_mr_1m_pool_flush", + "ib_rdma_mr_1m_pool_wait", + "ib_rdma_mr_1m_pool_depleted", + "ib_rdma_mr_8k_reused", + "ib_rdma_mr_1m_reused", + "ib_atomic_cswp", + "ib_atomic_fadd", +}; + +unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_ib_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_ib_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names, + ARRAY_SIZE(rds_ib_stat_names)); +out: + return ARRAY_SIZE(rds_ib_stat_names); +} diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c new file mode 100644 index 0000000..e4e41b3 --- /dev/null +++ b/net/rds/ib_sysctl.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
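[Illustrative aside: rds_ib_stats_info_copy() above folds the per-CPU counter structures together by treating each one as a flat array of u64 and summing field by field. A user-space sketch of the same trick, with an invented three-counter structure and invented values.]

#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for struct rds_ib_statistics: every field must be u64. */
struct stats {
        uint64_t tx_cq_event;
        uint64_t rx_cq_event;
        uint64_t ack_sent;
};

#define NR_CPUS_DEMO 4

int main(void)
{
        struct stats per_cpu[NR_CPUS_DEMO] = {
                { 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 }, { 10, 11, 12 }
        };
        struct stats sum = { 0, 0, 0 };
        size_t i;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++) {
                const uint64_t *src = (const uint64_t *)&per_cpu[cpu];
                uint64_t *dst = (uint64_t *)&sum;

                /* Treat the struct as an array of u64 and add field by field. */
                for (i = 0; i < sizeof(sum) / sizeof(uint64_t); i++)
                        dst[i] += src[i];
        }

        printf("tx_cq_event=%llu rx_cq_event=%llu ack_sent=%llu\n",
               (unsigned long long)sum.tx_cq_event,
               (unsigned long long)sum.rx_cq_event,
               (unsigned long long)sum.ack_sent);
        return 0;
}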
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "ib.h" + +static struct ctl_table_header *rds_ib_sysctl_hdr; + +unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR; +unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR; +unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; +static unsigned long rds_ib_sysctl_max_wr_min = 1; +/* hardware will fail CQ creation long before this */ +static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0; + +unsigned long rds_ib_sysctl_max_unsig_wrs = 16; +static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; +static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; + +/* + * This sysctl does nothing. + * + * Backwards compatibility with RDS 3.0 wire protocol + * disables initial FC credit exchange. + * If it's ever possible to drop 3.0 support, + * setting this to 1 and moving init/refill of send/recv + * rings from ib_cm_connect_complete() back into ib_setup_qp() + * will cause credits to be added before protocol negotiation. + */ +unsigned int rds_ib_sysctl_flow_control = 0; + +static struct ctl_table rds_ib_sysctl_table[] = { + { + .procname = "max_send_wr", + .data = &rds_ib_sysctl_max_send_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_wr_min, + .extra2 = &rds_ib_sysctl_max_wr_max, + }, + { + .procname = "max_recv_wr", + .data = &rds_ib_sysctl_max_recv_wr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_wr_min, + .extra2 = &rds_ib_sysctl_max_wr_max, + }, + { + .procname = "max_unsignaled_wr", + .data = &rds_ib_sysctl_max_unsig_wrs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &rds_ib_sysctl_max_unsig_wr_min, + .extra2 = &rds_ib_sysctl_max_unsig_wr_max, + }, + { + .procname = "max_recv_allocation", + .data = &rds_ib_sysctl_max_recv_allocation, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "flow_control", + .data = &rds_ib_sysctl_flow_control, + .maxlen = sizeof(rds_ib_sysctl_flow_control), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +void rds_ib_sysctl_exit(void) +{ + if (rds_ib_sysctl_hdr) + unregister_net_sysctl_table(rds_ib_sysctl_hdr); +} + +int rds_ib_sysctl_init(void) +{ + rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib", rds_ib_sysctl_table); + if (!rds_ib_sysctl_hdr) + return -ENOMEM; + return 0; +} diff --git a/net/rds/info.c b/net/rds/info.c new file mode 100644 index 0000000..140a44a --- /dev/null +++ b/net/rds/info.c @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include + +#include "rds.h" + +/* + * This file implements a getsockopt() call which copies a set of fixed + * sized structs into a user-specified buffer as a means of providing + * read-only information about RDS. + * + * For a given information source there are a given number of fixed sized + * structs at a given time. The structs are only copied if the user-specified + * buffer is big enough. The destination pages that make up the buffer + * are pinned for the duration of the copy. + * + * This gives us the following benefits: + * + * - simple implementation, no copy "position" across multiple calls + * - consistent snapshot of an info source + * - atomic copy works well with whatever locking info source has + * - one portable tool to get rds info across implementations + * - long-lived tool can get info without allocating + * + * at the following costs: + * + * - info source copy must be pinned, may be "large" + */ + +struct rds_info_iterator { + struct page **pages; + void *addr; + unsigned long offset; +}; + +static DEFINE_SPINLOCK(rds_info_lock); +static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1]; + +void rds_info_register_func(int optname, rds_info_func func) +{ + int offset = optname - RDS_INFO_FIRST; + + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); + + spin_lock(&rds_info_lock); + BUG_ON(rds_info_funcs[offset]); + rds_info_funcs[offset] = func; + spin_unlock(&rds_info_lock); +} +EXPORT_SYMBOL_GPL(rds_info_register_func); + +void rds_info_deregister_func(int optname, rds_info_func func) +{ + int offset = optname - RDS_INFO_FIRST; + + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); + + spin_lock(&rds_info_lock); + BUG_ON(rds_info_funcs[offset] != func); + rds_info_funcs[offset] = NULL; + spin_unlock(&rds_info_lock); +} +EXPORT_SYMBOL_GPL(rds_info_deregister_func); + +/* + * Typically we hold an atomic kmap across multiple rds_info_copy() calls + * because the kmap is so expensive. This must be called before using blocking + * operations while holding the mapping and as the iterator is torn down. 
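[Illustrative aside: rds_info_copy() below walks the pinned destination pages one kmap at a time, copying min(bytes, PAGE_SIZE - offset) per step and moving to the next page when the current one fills up. A stand-alone sketch of that chunking, with the page size shrunk to 8 bytes so the steps are visible.]

#include <stdio.h>
#include <string.h>

#define DEMO_PAGE_SIZE 8                /* tiny "page" so the chunking is visible */

int main(void)
{
        char pages[3][DEMO_PAGE_SIZE];  /* stands in for the pinned user pages */
        const char data[] = "rds info snapshot!";
        size_t bytes = sizeof(data) - 1;
        size_t offset = 3;              /* start part-way into the first page */
        size_t page = 0, copied = 0;

        while (bytes) {
                /* Copy at most up to the end of the current page. */
                size_t chunk = bytes < DEMO_PAGE_SIZE - offset ?
                               bytes : DEMO_PAGE_SIZE - offset;

                memcpy(&pages[page][offset], data + copied, chunk);
                printf("page %zu offset %zu copied %zu bytes\n",
                       page, offset, chunk);

                copied += chunk;
                bytes -= chunk;
                offset += chunk;
                if (offset == DEMO_PAGE_SIZE) {  /* advance to the next page */
                        offset = 0;
                        page++;
                }
        }
        return 0;
}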
+ */ +void rds_info_iter_unmap(struct rds_info_iterator *iter) +{ + if (iter->addr) { + kunmap_atomic(iter->addr); + iter->addr = NULL; + } +} + +/* + * get_user_pages() called flush_dcache_page() on the pages for us. + */ +void rds_info_copy(struct rds_info_iterator *iter, void *data, + unsigned long bytes) +{ + unsigned long this; + + while (bytes) { + if (!iter->addr) + iter->addr = kmap_atomic(*iter->pages); + + this = min(bytes, PAGE_SIZE - iter->offset); + + rdsdebug("page %p addr %p offset %lu this %lu data %p " + "bytes %lu\n", *iter->pages, iter->addr, + iter->offset, this, data, bytes); + + memcpy(iter->addr + iter->offset, data, this); + + data += this; + bytes -= this; + iter->offset += this; + + if (iter->offset == PAGE_SIZE) { + kunmap_atomic(iter->addr); + iter->addr = NULL; + iter->offset = 0; + iter->pages++; + } + } +} +EXPORT_SYMBOL_GPL(rds_info_copy); + +/* + * @optval points to the userspace buffer that the information snapshot + * will be copied into. + * + * @optlen on input is the size of the buffer in userspace. @optlen + * on output is the size of the requested snapshot in bytes. + * + * This function returns -errno if there is a failure, particularly -ENOSPC + * if the given userspace buffer was not large enough to fit the snapshot. + * On success it returns the positive number of bytes of each array element + * in the snapshot. + */ +int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, + int __user *optlen) +{ + struct rds_info_iterator iter; + struct rds_info_lengths lens; + unsigned long nr_pages = 0; + unsigned long start; + unsigned long i; + rds_info_func func; + struct page **pages = NULL; + int ret; + int len; + int total; + + if (get_user(len, optlen)) { + ret = -EFAULT; + goto out; + } + + /* check for all kinds of wrapping and the like */ + start = (unsigned long)optval; + if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) { + ret = -EINVAL; + goto out; + } + + /* a 0 len call is just trying to probe its length */ + if (len == 0) + goto call_func; + + nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) + >> PAGE_SHIFT; + + pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + ret = get_user_pages_fast(start, nr_pages, 1, pages); + if (ret != nr_pages) { + if (ret > 0) + nr_pages = ret; + else + nr_pages = 0; + ret = -EAGAIN; /* XXX ? */ + goto out; + } + + rdsdebug("len %d nr_pages %lu\n", len, nr_pages); + +call_func: + func = rds_info_funcs[optname - RDS_INFO_FIRST]; + if (!func) { + ret = -ENOPROTOOPT; + goto out; + } + + iter.pages = pages; + iter.addr = NULL; + iter.offset = start & (PAGE_SIZE - 1); + + func(sock, len, &iter, &lens); + BUG_ON(lens.each == 0); + + total = lens.nr * lens.each; + + rds_info_iter_unmap(&iter); + + if (total > len) { + len = total; + ret = -ENOSPC; + } else { + len = total; + ret = lens.each; + } + + if (put_user(len, optlen)) + ret = -EFAULT; + +out: + for (i = 0; pages && i < nr_pages; i++) + put_page(pages[i]); + kfree(pages); + + return ret; +} diff --git a/net/rds/info.h b/net/rds/info.h new file mode 100644 index 0000000..a069b51 --- /dev/null +++ b/net/rds/info.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RDS_INFO_H +#define _RDS_INFO_H + +struct rds_info_lengths { + unsigned int nr; + unsigned int each; +}; + +struct rds_info_iterator; + +/* + * These functions must fill in the fields of @lens to reflect the size + * of the available info source. 
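+ *
+ * Seen from userspace, a snapshot is fetched with getsockopt() on a PF_RDS
+ * socket.  A minimal grow-and-retry sketch (assuming the SOL_RDS and
+ * RDS_INFO_* definitions from <linux/rds.h>); per the comments around
+ * rds_info_getsockopt() above, a zero-length first call simply probes the
+ * required size:
+ *
+ *     socklen_t len = 0;
+ *     void *buf = NULL;
+ *     int each;
+ *
+ *     do {
+ *             each = getsockopt(fd, SOL_RDS, RDS_INFO_COUNTERS, buf, &len);
+ *             if (each < 0 && errno == ENOSPC)
+ *                     buf = realloc(buf, len);    (len holds the size needed)
+ *     } while (each < 0 && errno == ENOSPC);
+ *
+ * On success "each" is the per-entry size and "len" the number of bytes
+ * copied, so the snapshot contains len / each entries.
+ *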
If the snapshot fits in @len then it + * should be copied using @iter. The caller will deduce if it was copied + * or not by comparing the lengths. + */ +typedef void (*rds_info_func)(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens); + +void rds_info_register_func(int optname, rds_info_func func); +void rds_info_deregister_func(int optname, rds_info_func func); +int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, + int __user *optlen); +void rds_info_copy(struct rds_info_iterator *iter, void *data, + unsigned long bytes); +void rds_info_iter_unmap(struct rds_info_iterator *iter); + + +#endif diff --git a/net/rds/loop.c b/net/rds/loop.c new file mode 100644 index 0000000..f2bf78d --- /dev/null +++ b/net/rds/loop.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds_single_path.h" +#include "rds.h" +#include "loop.h" + +static DEFINE_SPINLOCK(loop_conns_lock); +static LIST_HEAD(loop_conns); + +/* + * This 'loopback' transport is a special case for flows that originate + * and terminate on the same machine. + * + * Connection build-up notices if the destination address is thought of + * as a local address by a transport. At that time it decides to use the + * loopback transport instead of the bound transport of the sending socket. + * + * The loopback transport's sending path just hands the sent rds_message + * straight to the receiving path via an embedded rds_incoming. + */ + +/* + * Usually a message transits both the sender and receiver's conns as it + * flows to the receiver. In the loopback case, though, the receive path + * is handed the sending conn so the sense of the addresses is reversed. 
+ */ +static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, + unsigned int off) +{ + struct scatterlist *sgp = &rm->data.op_sg[sg]; + int ret = sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len); + + /* Do not send cong updates to loopback */ + if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off); + goto out; + } + + BUG_ON(hdr_off || sg || off); + + rds_inc_init(&rm->m_inc, conn, conn->c_laddr); + /* For the embedded inc. Matching put is in loop_inc_free() */ + rds_message_addref(rm); + + rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, + GFP_KERNEL); + + rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), + NULL); + + rds_inc_put(&rm->m_inc); +out: + return ret; +} + +/* + * See rds_loop_xmit(). Since our inc is embedded in the rm, we + * make sure the rm lives at least until the inc is done. + */ +static void rds_loop_inc_free(struct rds_incoming *inc) +{ + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + + rds_message_put(rm); +} + +/* we need to at least give the thread something to succeed */ +static int rds_loop_recv_path(struct rds_conn_path *cp) +{ + return 0; +} + +struct rds_loop_connection { + struct list_head loop_node; + struct rds_connection *conn; +}; + +/* + * Even the loopback transport needs to keep track of its connections, + * so it can call rds_conn_destroy() on them on exit. N.B. there are + * 1+ loopback addresses (127.*.*.*) so it's not a bug to have + * multiple loopback conns allocated, although rather useless. + */ +static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_loop_connection *lc; + unsigned long flags; + + lc = kzalloc(sizeof(struct rds_loop_connection), gfp); + if (!lc) + return -ENOMEM; + + INIT_LIST_HEAD(&lc->loop_node); + lc->conn = conn; + conn->c_transport_data = lc; + + spin_lock_irqsave(&loop_conns_lock, flags); + list_add_tail(&lc->loop_node, &loop_conns); + spin_unlock_irqrestore(&loop_conns_lock, flags); + + return 0; +} + +static void rds_loop_conn_free(void *arg) +{ + struct rds_loop_connection *lc = arg; + unsigned long flags; + + rdsdebug("lc %p\n", lc); + spin_lock_irqsave(&loop_conns_lock, flags); + list_del(&lc->loop_node); + spin_unlock_irqrestore(&loop_conns_lock, flags); + kfree(lc); +} + +static int rds_loop_conn_path_connect(struct rds_conn_path *cp) +{ + rds_connect_complete(cp->cp_conn); + return 0; +} + +static void rds_loop_conn_path_shutdown(struct rds_conn_path *cp) +{ +} + +void rds_loop_exit(void) +{ + struct rds_loop_connection *lc, *_lc; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&loop_conns_lock); + list_splice(&loop_conns, &tmp_list); + INIT_LIST_HEAD(&loop_conns); + spin_unlock_irq(&loop_conns_lock); + + list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) { + WARN_ON(lc->conn->c_passive); + rds_conn_destroy(lc->conn); + } +} + +/* + * This is missing .xmit_* because loop doesn't go through generic + * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and + * .laddr_check are missing because transport.c doesn't iterate over + * rds_loop_transport. 
+ */ +struct rds_transport rds_loop_transport = { + .xmit = rds_loop_xmit, + .recv_path = rds_loop_recv_path, + .conn_alloc = rds_loop_conn_alloc, + .conn_free = rds_loop_conn_free, + .conn_path_connect = rds_loop_conn_path_connect, + .conn_path_shutdown = rds_loop_conn_path_shutdown, + .inc_copy_to_user = rds_message_inc_copy_to_user, + .inc_free = rds_loop_inc_free, + .t_name = "loopback", +}; diff --git a/net/rds/loop.h b/net/rds/loop.h new file mode 100644 index 0000000..469fa4b --- /dev/null +++ b/net/rds/loop.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RDS_LOOP_H +#define _RDS_LOOP_H + +/* loop.c */ +extern struct rds_transport rds_loop_transport; + +void rds_loop_exit(void); + +#endif diff --git a/net/rds/message.c b/net/rds/message.c new file mode 100644 index 0000000..a35f769 --- /dev/null +++ b/net/rds/message.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include +#include +#include +#include + +#include "rds.h" + +static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { +[RDS_EXTHDR_NONE] = 0, +[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), +[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), +[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), +[RDS_EXTHDR_NPATHS] = sizeof(u16), +[RDS_EXTHDR_GEN_NUM] = sizeof(u32), +}; + +void rds_message_addref(struct rds_message *rm) +{ + rdsdebug("addref rm %p ref %d\n", rm, refcount_read(&rm->m_refcount)); + refcount_inc(&rm->m_refcount); +} +EXPORT_SYMBOL_GPL(rds_message_addref); + +static inline bool rds_zcookie_add(struct rds_msg_zcopy_info *info, u32 cookie) +{ + struct rds_zcopy_cookies *ck = &info->zcookies; + int ncookies = ck->num; + + if (ncookies == RDS_MAX_ZCOOKIES) + return false; + ck->cookies[ncookies] = cookie; + ck->num = ++ncookies; + return true; +} + +static struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif) +{ + return container_of(znotif, struct rds_msg_zcopy_info, znotif); +} + +void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *q) +{ + unsigned long flags; + LIST_HEAD(copy); + struct rds_msg_zcopy_info *info, *tmp; + + spin_lock_irqsave(&q->lock, flags); + list_splice(&q->zcookie_head, ©); + INIT_LIST_HEAD(&q->zcookie_head); + spin_unlock_irqrestore(&q->lock, flags); + + list_for_each_entry_safe(info, tmp, ©, rs_zcookie_next) { + list_del(&info->rs_zcookie_next); + kfree(info); + } +} + +static void rds_rm_zerocopy_callback(struct rds_sock *rs, + struct rds_znotifier *znotif) +{ + struct rds_msg_zcopy_info *info; + struct rds_msg_zcopy_queue *q; + u32 cookie = znotif->z_cookie; + struct rds_zcopy_cookies *ck; + struct list_head *head; + unsigned long flags; + + mm_unaccount_pinned_pages(&znotif->z_mmp); + q = &rs->rs_zcookie_queue; + spin_lock_irqsave(&q->lock, flags); + head = &q->zcookie_head; + if (!list_empty(head)) { + info = list_entry(head, struct rds_msg_zcopy_info, + rs_zcookie_next); + if (info && rds_zcookie_add(info, cookie)) { + spin_unlock_irqrestore(&q->lock, flags); + kfree(rds_info_from_znotifier(znotif)); + /* caller invokes rds_wake_sk_sleep() */ + return; + } + } + + info = rds_info_from_znotifier(znotif); + ck = &info->zcookies; + memset(ck, 0, sizeof(*ck)); + WARN_ON(!rds_zcookie_add(info, cookie)); + list_add_tail(&q->zcookie_head, &info->rs_zcookie_next); + + spin_unlock_irqrestore(&q->lock, flags); + /* caller invokes rds_wake_sk_sleep() */ +} + +/* + * This relies on dma_map_sg() not touching sg[].page during merging. 
+ */ +static void rds_message_purge(struct rds_message *rm) +{ + unsigned long i, flags; + bool zcopy = false; + + if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) + return; + + spin_lock_irqsave(&rm->m_rs_lock, flags); + if (rm->m_rs) { + struct rds_sock *rs = rm->m_rs; + + if (rm->data.op_mmp_znotifier) { + zcopy = true; + rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + rds_wake_sk_sleep(rs); + rm->data.op_mmp_znotifier = NULL; + } + sock_put(rds_rs_to_sk(rs)); + rm->m_rs = NULL; + } + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + for (i = 0; i < rm->data.op_nents; i++) { + /* XXX will have to put_page for page refs */ + if (!zcopy) + __free_page(sg_page(&rm->data.op_sg[i])); + else + put_page(sg_page(&rm->data.op_sg[i])); + } + rm->data.op_nents = 0; + + if (rm->rdma.op_active) + rds_rdma_free_op(&rm->rdma); + if (rm->rdma.op_rdma_mr) + rds_mr_put(rm->rdma.op_rdma_mr); + + if (rm->atomic.op_active) + rds_atomic_free_op(&rm->atomic); + if (rm->atomic.op_rdma_mr) + rds_mr_put(rm->atomic.op_rdma_mr); +} + +void rds_message_put(struct rds_message *rm) +{ + rdsdebug("put rm %p ref %d\n", rm, refcount_read(&rm->m_refcount)); + WARN(!refcount_read(&rm->m_refcount), "danger refcount zero on %p\n", rm); + if (refcount_dec_and_test(&rm->m_refcount)) { + BUG_ON(!list_empty(&rm->m_sock_item)); + BUG_ON(!list_empty(&rm->m_conn_item)); + rds_message_purge(rm); + + kfree(rm); + } +} +EXPORT_SYMBOL_GPL(rds_message_put); + +void rds_message_populate_header(struct rds_header *hdr, __be16 sport, + __be16 dport, u64 seq) +{ + hdr->h_flags = 0; + hdr->h_sport = sport; + hdr->h_dport = dport; + hdr->h_sequence = cpu_to_be64(seq); + hdr->h_exthdr[0] = RDS_EXTHDR_NONE; +} +EXPORT_SYMBOL_GPL(rds_message_populate_header); + +int rds_message_add_extension(struct rds_header *hdr, unsigned int type, + const void *data, unsigned int len) +{ + unsigned int ext_len = sizeof(u8) + len; + unsigned char *dst; + + /* For now, refuse to add more than one extension header */ + if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) + return 0; + + if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type]) + return 0; + + if (ext_len >= RDS_HEADER_EXT_SPACE) + return 0; + dst = hdr->h_exthdr; + + *dst++ = type; + memcpy(dst, data, len); + + dst[len] = RDS_EXTHDR_NONE; + return 1; +} +EXPORT_SYMBOL_GPL(rds_message_add_extension); + +/* + * If a message has extension headers, retrieve them here. + * Call like this: + * + * unsigned int pos = 0; + * + * while (1) { + * buflen = sizeof(buffer); + * type = rds_message_next_extension(hdr, &pos, buffer, &buflen); + * if (type == RDS_EXTHDR_NONE) + * break; + * ... + * } + */ +int rds_message_next_extension(struct rds_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen) +{ + unsigned int offset, ext_type, ext_len; + u8 *src = hdr->h_exthdr; + + offset = *pos; + if (offset >= RDS_HEADER_EXT_SPACE) + goto none; + + /* Get the extension type and length. For now, the + * length is implied by the extension type. 
*/ + ext_type = src[offset++]; + + if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX) + goto none; + ext_len = rds_exthdr_size[ext_type]; + if (offset + ext_len > RDS_HEADER_EXT_SPACE) + goto none; + + *pos = offset + ext_len; + if (ext_len < *buflen) + *buflen = ext_len; + memcpy(buf, src + offset, *buflen); + return ext_type; + +none: + *pos = RDS_HEADER_EXT_SPACE; + *buflen = 0; + return RDS_EXTHDR_NONE; +} + +int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) +{ + struct rds_ext_header_rdma_dest ext_hdr; + + ext_hdr.h_rdma_rkey = cpu_to_be32(r_key); + ext_hdr.h_rdma_offset = cpu_to_be32(offset); + return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); +} +EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); + +/* + * Each rds_message is allocated with extra space for the scatterlist entries + * rds ops will need. This is to minimize memory allocation count. Then, each rds op + * can grab SGs when initializing its part of the rds_message. + */ +struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) +{ + struct rds_message *rm; + + if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message)) + return NULL; + + rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); + if (!rm) + goto out; + + rm->m_used_sgs = 0; + rm->m_total_sgs = extra_len / sizeof(struct scatterlist); + + refcount_set(&rm->m_refcount, 1); + INIT_LIST_HEAD(&rm->m_sock_item); + INIT_LIST_HEAD(&rm->m_conn_item); + spin_lock_init(&rm->m_rs_lock); + init_waitqueue_head(&rm->m_flush_wait); + +out: + return rm; +} + +/* + * RDS ops use this to grab SG entries from the rm's sg pool. + */ +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) +{ + struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; + struct scatterlist *sg_ret; + + WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); + WARN_ON(!nents); + + if (rm->m_used_sgs + nents > rm->m_total_sgs) + return NULL; + + sg_ret = &sg_first[rm->m_used_sgs]; + sg_init_table(sg_ret, nents); + rm->m_used_sgs += nents; + + return sg_ret; +} + +struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) +{ + struct rds_message *rm; + unsigned int i; + int num_sgs = ceil(total_len, PAGE_SIZE); + int extra_bytes = num_sgs * sizeof(struct scatterlist); + + rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); + if (!rm) + return ERR_PTR(-ENOMEM); + + set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); + rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); + rm->data.op_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); + if (!rm->data.op_sg) { + rds_message_put(rm); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < rm->data.op_nents; ++i) { + sg_set_page(&rm->data.op_sg[i], + virt_to_page(page_addrs[i]), + PAGE_SIZE, 0); + } + + return rm; +} + +static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) +{ + struct scatterlist *sg; + int ret = 0; + int length = iov_iter_count(from); + int total_copied = 0; + struct rds_msg_zcopy_info *info; + + rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); + + /* + * now allocate and copy in the data payload. 
+ */ + sg = rm->data.op_sg; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + INIT_LIST_HEAD(&info->rs_zcookie_next); + rm->data.op_mmp_znotifier = &info->znotif; + if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, + length)) { + ret = -ENOMEM; + goto err; + } + while (iov_iter_count(from)) { + struct page *pages; + size_t start; + ssize_t copied; + + copied = iov_iter_get_pages(from, &pages, PAGE_SIZE, + 1, &start); + if (copied < 0) { + struct mmpin *mmp; + int i; + + for (i = 0; i < rm->data.op_nents; i++) + put_page(sg_page(&rm->data.op_sg[i])); + mmp = &rm->data.op_mmp_znotifier->z_mmp; + mm_unaccount_pinned_pages(mmp); + ret = -EFAULT; + goto err; + } + total_copied += copied; + iov_iter_advance(from, copied); + length -= copied; + sg_set_page(sg, pages, copied, start); + rm->data.op_nents++; + sg++; + } + WARN_ON_ONCE(length != 0); + return ret; +err: + kfree(info); + rm->data.op_mmp_znotifier = NULL; + return ret; +} + +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy) +{ + unsigned long to_copy, nbytes; + unsigned long sg_off; + struct scatterlist *sg; + int ret = 0; + + rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); + + /* now allocate and copy in the data payload. */ + sg = rm->data.op_sg; + sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ + + if (zcopy) + return rds_message_zcopy_from_user(rm, from); + + while (iov_iter_count(from)) { + if (!sg_page(sg)) { + ret = rds_page_remainder_alloc(sg, iov_iter_count(from), + GFP_HIGHUSER); + if (ret) + return ret; + rm->data.op_nents++; + sg_off = 0; + } + + to_copy = min_t(unsigned long, iov_iter_count(from), + sg->length - sg_off); + + rds_stats_add(s_copy_from_user, to_copy); + nbytes = copy_page_from_iter(sg_page(sg), sg->offset + sg_off, + to_copy, from); + if (nbytes != to_copy) + return -EFAULT; + + sg_off += to_copy; + + if (sg_off == sg->length) + sg++; + } + + return ret; +} + +int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) +{ + struct rds_message *rm; + struct scatterlist *sg; + unsigned long to_copy; + unsigned long vec_off; + int copied; + int ret; + u32 len; + + rm = container_of(inc, struct rds_message, m_inc); + len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + sg = rm->data.op_sg; + vec_off = 0; + copied = 0; + + while (iov_iter_count(to) && copied < len) { + to_copy = min_t(unsigned long, iov_iter_count(to), + sg->length - vec_off); + to_copy = min_t(unsigned long, to_copy, len - copied); + + rds_stats_add(s_copy_to_user, to_copy); + ret = copy_page_to_iter(sg_page(sg), sg->offset + vec_off, + to_copy, to); + if (ret != to_copy) + return -EFAULT; + + vec_off += to_copy; + copied += to_copy; + + if (vec_off == sg->length) { + vec_off = 0; + sg++; + } + } + + return copied; +} + +/* + * If the message is still on the send queue, wait until the transport + * is done with it. This is particularly important for RDMA operations. + */ +void rds_message_wait(struct rds_message *rm) +{ + wait_event_interruptible(rm->m_flush_wait, + !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); +} + +void rds_message_unmapped(struct rds_message *rm) +{ + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); +} +EXPORT_SYMBOL_GPL(rds_message_unmapped); + diff --git a/net/rds/page.c b/net/rds/page.c new file mode 100644 index 0000000..7cc57e0 --- /dev/null +++ b/net/rds/page.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. 
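+ *
+ * (Taken together, the helpers above are used by the send path roughly as
+ * follows; this is a simplified sketch of what rds_sendmsg() does, not a
+ * verbatim excerpt:
+ *
+ *     num_sgs = ceil(payload_len, PAGE_SIZE);
+ *     rm = rds_message_alloc(num_sgs * sizeof(struct scatterlist), GFP_KERNEL);
+ *     if (!rm)
+ *             return -ENOMEM;
+ *     rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
+ *     ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
+ *
+ * rds_message_copy_from_user() in turn pulls its pages from
+ * rds_page_remainder_alloc() below as it fills rm->data.op_sg.)
+ *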
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" + +struct rds_page_remainder { + struct page *r_page; + unsigned long r_offset; +}; + +static +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); + +/** + * rds_page_remainder_alloc - build up regions of a message. + * + * @scat: Scatter list for message + * @bytes: the number of bytes needed. + * @gfp: the waiting behaviour of the allocation + * + * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to + * kmap the pages, etc. + * + * If @bytes is at least a full page then this just returns a page from + * alloc_page(). + * + * If @bytes is a partial page then this stores the unused region of the + * page in a per-cpu structure. Future partial-page allocations may be + * satisfied from that cached region. This lets us waste less memory on + * small allocations with minimal complexity. It works because the transmit + * path passes read-only page regions down to devices. They hold a page + * reference until they are done with the region. 
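+ *
+ * A worked example with 4K pages: a first 1024-byte request allocates a
+ * fresh page and hands out bytes 0..1023, leaving the per-cpu remainder at
+ * offset 1024 (ALIGN(1024, 8)).  A second sub-page request on the same CPU
+ * is served from the cached page starting at offset 1024 and is counted as
+ * s_page_remainder_hit; a request larger than the bytes still free tosses
+ * the remainder (s_page_remainder_miss) and starts over with a new page.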
+ */ +int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, + gfp_t gfp) +{ + struct rds_page_remainder *rem; + unsigned long flags; + struct page *page; + int ret; + + gfp |= __GFP_HIGHMEM; + + /* jump straight to allocation if we're trying for a huge page */ + if (bytes >= PAGE_SIZE) { + page = alloc_page(gfp); + if (!page) { + ret = -ENOMEM; + } else { + sg_set_page(scat, page, PAGE_SIZE, 0); + ret = 0; + } + goto out; + } + + rem = &per_cpu(rds_page_remainders, get_cpu()); + local_irq_save(flags); + + while (1) { + /* avoid a tiny region getting stuck by tossing it */ + if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) { + rds_stats_inc(s_page_remainder_miss); + __free_page(rem->r_page); + rem->r_page = NULL; + } + + /* hand out a fragment from the cached page */ + if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) { + sg_set_page(scat, rem->r_page, bytes, rem->r_offset); + get_page(sg_page(scat)); + + if (rem->r_offset != 0) + rds_stats_inc(s_page_remainder_hit); + + rem->r_offset += ALIGN(bytes, 8); + if (rem->r_offset >= PAGE_SIZE) { + __free_page(rem->r_page); + rem->r_page = NULL; + } + ret = 0; + break; + } + + /* alloc if there is nothing for us to use */ + local_irq_restore(flags); + put_cpu(); + + page = alloc_page(gfp); + + rem = &per_cpu(rds_page_remainders, get_cpu()); + local_irq_save(flags); + + if (!page) { + ret = -ENOMEM; + break; + } + + /* did someone race to fill the remainder before us? */ + if (rem->r_page) { + __free_page(page); + continue; + } + + /* otherwise install our page and loop around to alloc */ + rem->r_page = page; + rem->r_offset = 0; + } + + local_irq_restore(flags); + put_cpu(); +out: + rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, + ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, + ret ? 0 : scat->length); + return ret; +} +EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); + +void rds_page_exit(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + struct rds_page_remainder *rem; + + rem = &per_cpu(rds_page_remainders, cpu); + rdsdebug("cpu %u\n", cpu); + + if (rem->r_page) + __free_page(rem->r_page); + rem->r_page = NULL; + } +} diff --git a/net/rds/rdma.c b/net/rds/rdma.c new file mode 100644 index 0000000..634cfcb --- /dev/null +++ b/net/rds/rdma.c @@ -0,0 +1,884 @@ +/* + * Copyright (c) 2007 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include /* for DMA_*_DEVICE */ + +#include "rds.h" + +/* + * XXX + * - build with sparse + * - should we detect duplicate keys on a socket? hmm. + * - an rdma is an mlock, apply rlimit? + */ + +/* + * get the number of pages by looking at the page indices that the start and + * end addresses fall in. + * + * Returns 0 if the vec is invalid. It is invalid if the number of bytes + * causes the address to wrap or overflows an unsigned int. This comes + * from being stored in the 'length' member of 'struct scatterlist'. + */ +static unsigned int rds_pages_in_vec(struct rds_iovec *vec) +{ + if ((vec->addr + vec->bytes <= vec->addr) || + (vec->bytes > (u64)UINT_MAX)) + return 0; + + return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (vec->addr >> PAGE_SHIFT); +} + +static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, + struct rds_mr *insert) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct rds_mr *mr; + + while (*p) { + parent = *p; + mr = rb_entry(parent, struct rds_mr, r_rb_node); + + if (key < mr->r_key) + p = &(*p)->rb_left; + else if (key > mr->r_key) + p = &(*p)->rb_right; + else + return mr; + } + + if (insert) { + rb_link_node(&insert->r_rb_node, parent, p); + rb_insert_color(&insert->r_rb_node, root); + refcount_inc(&insert->r_refcount); + } + return NULL; +} + +/* + * Destroy the transport-specific part of a MR. + */ +static void rds_destroy_mr(struct rds_mr *mr) +{ + struct rds_sock *rs = mr->r_sock; + void *trans_private = NULL; + unsigned long flags; + + rdsdebug("RDS: destroy mr key is %x refcnt %u\n", + mr->r_key, refcount_read(&mr->r_refcount)); + + if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state)) + return; + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + if (!RB_EMPTY_NODE(&mr->r_rb_node)) + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + trans_private = mr->r_trans_private; + mr->r_trans_private = NULL; + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (trans_private) + mr->r_trans->free_mr(trans_private, mr->r_invalidate); +} + +void __rds_put_mr_final(struct rds_mr *mr) +{ + rds_destroy_mr(mr); + kfree(mr); +} + +/* + * By the time this is called we can't have any more ioctls called on + * the socket so we don't need to worry about racing with others. + */ +void rds_rdma_drop_keys(struct rds_sock *rs) +{ + struct rds_mr *mr; + struct rb_node *node; + unsigned long flags; + + /* Release any MRs associated with this socket */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + while ((node = rb_first(&rs->rs_rdma_keys))) { + mr = rb_entry(node, struct rds_mr, r_rb_node); + if (mr->r_trans == rs->rs_transport) + mr->r_invalidate = 0; + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + rds_destroy_mr(mr); + rds_mr_put(mr); + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (rs->rs_transport && rs->rs_transport->flush_mrs) + rs->rs_transport->flush_mrs(); +} + +/* + * Helper function to pin user pages. 
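+ *
+ * get_user_pages_fast() may pin fewer pages than requested; the helper below
+ * treats any short pin as a failure, releasing whatever pages it did get and
+ * returning -EFAULT, so callers never have to unwind a partially pinned
+ * region themselves (e.g. a request for 4 pages that pins only 3 drops those
+ * 3 and fails).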
+ */ +static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, + struct page **pages, int write) +{ + int ret; + + ret = get_user_pages_fast(user_addr, nr_pages, write, pages); + + if (ret >= 0 && ret < nr_pages) { + while (ret--) + put_page(pages[ret]); + ret = -EFAULT; + } + + return ret; +} + +static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, + u64 *cookie_ret, struct rds_mr **mr_ret) +{ + struct rds_mr *mr = NULL, *found; + unsigned int nr_pages; + struct page **pages = NULL; + struct scatterlist *sg; + void *trans_private; + unsigned long flags; + rds_rdma_cookie_t cookie; + unsigned int nents; + long i; + int ret; + + if (rs->rs_bound_addr == 0 || !rs->rs_transport) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + + if (!rs->rs_transport->get_mr) { + ret = -EOPNOTSUPP; + goto out; + } + + nr_pages = rds_pages_in_vec(&args->vec); + if (nr_pages == 0) { + ret = -EINVAL; + goto out; + } + + /* Restrict the size of mr irrespective of underlying transport + * To account for unaligned mr regions, subtract one from nr_pages + */ + if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) { + ret = -EMSGSIZE; + goto out; + } + + rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", + args->vec.addr, args->vec.bytes, nr_pages); + + /* XXX clamp nr_pages to limit the size of this alloc? */ + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + + mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); + if (!mr) { + ret = -ENOMEM; + goto out; + } + + refcount_set(&mr->r_refcount, 1); + RB_CLEAR_NODE(&mr->r_rb_node); + mr->r_trans = rs->rs_transport; + mr->r_sock = rs; + + if (args->flags & RDS_RDMA_USE_ONCE) + mr->r_use_once = 1; + if (args->flags & RDS_RDMA_INVALIDATE) + mr->r_invalidate = 1; + if (args->flags & RDS_RDMA_READWRITE) + mr->r_write = 1; + + /* + * Pin the pages that make up the user buffer and transfer the page + * pointers to the mr's sg array. We check to see if we've mapped + * the whole region after transferring the partial page references + * to the sg array so that we can have one page ref cleanup path. + * + * For now we have no flag that tells us whether the mapping is + * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to + * the zero page. + */ + ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); + if (ret < 0) + goto out; + + nents = ret; + sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); + if (!sg) { + ret = -ENOMEM; + goto out; + } + WARN_ON(!nents); + sg_init_table(sg, nents); + + /* Stick all pages into the scatterlist */ + for (i = 0 ; i < nents; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + + rdsdebug("RDS: trans_private nents is %u\n", nents); + + /* Obtain a transport specific MR. If this succeeds, the + * s/g list is now owned by the MR. + * Note that dma_map() implies that pending writes are + * flushed to RAM, so no dma_sync is needed here. */ + trans_private = rs->rs_transport->get_mr(sg, nents, rs, + &mr->r_key); + + if (IS_ERR(trans_private)) { + for (i = 0 ; i < nents; i++) + put_page(sg_page(&sg[i])); + kfree(sg); + ret = PTR_ERR(trans_private); + goto out; + } + + mr->r_trans_private = trans_private; + + rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", + mr->r_key, (void *)(unsigned long) args->cookie_addr); + + /* The user may pass us an unaligned address, but we can only + * map page aligned regions. So we keep the offset, and build + * a 64bit cookie containing and pass that + * around. 
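+ *
+ * From the application's side the flow is roughly the following sketch,
+ * using the RDS_GET_MR socket option and struct rds_get_mr_args from
+ * <linux/rds.h>; buf, buf_len and cookie are the caller's own variables:
+ *
+ *     uint64_t cookie = 0;
+ *     struct rds_get_mr_args mra = {
+ *             .vec         = { .addr = (uint64_t) buf, .bytes = buf_len },
+ *             .cookie_addr = (uint64_t) &cookie,
+ *             .flags       = RDS_RDMA_USE_ONCE,
+ *     };
+ *
+ *     setsockopt(fd, SOL_RDS, RDS_GET_MR, &mra, sizeof(mra));
+ *
+ * The cookie is then handed to the peer (for example via the
+ * RDS_CMSG_RDMA_DEST control message handled further down) so that the peer
+ * can name this region in its own rds_rdma_args.
+ *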
*/ + cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); + if (cookie_ret) + *cookie_ret = cookie; + + if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) { + ret = -EFAULT; + goto out; + } + + /* Inserting the new MR into the rbtree bumps its + * reference count. */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + BUG_ON(found && found != mr); + + rdsdebug("RDS: get_mr key is %x\n", mr->r_key); + if (mr_ret) { + refcount_inc(&mr->r_refcount); + *mr_ret = mr; + } + + ret = 0; +out: + kfree(pages); + if (mr) + rds_mr_put(mr); + return ret; +} + +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_get_mr_args args; + + if (optlen != sizeof(struct rds_get_mr_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval, + sizeof(struct rds_get_mr_args))) + return -EFAULT; + + return __rds_rdma_map(rs, &args, NULL, NULL); +} + +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_get_mr_for_dest_args args; + struct rds_get_mr_args new_args; + + if (optlen != sizeof(struct rds_get_mr_for_dest_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval, + sizeof(struct rds_get_mr_for_dest_args))) + return -EFAULT; + + /* + * Initially, just behave like get_mr(). + * TODO: Implement get_mr as wrapper around this + * and deprecate it. + */ + new_args.vec = args.vec; + new_args.cookie_addr = args.cookie_addr; + new_args.flags = args.flags; + + return __rds_rdma_map(rs, &new_args, NULL, NULL); +} + +/* + * Free the MR indicated by the given R_Key + */ +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_free_mr_args args; + struct rds_mr *mr; + unsigned long flags; + + if (optlen != sizeof(struct rds_free_mr_args)) + return -EINVAL; + + if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval, + sizeof(struct rds_free_mr_args))) + return -EFAULT; + + /* Special case - a null cookie means flush all unused MRs */ + if (args.cookie == 0) { + if (!rs->rs_transport || !rs->rs_transport->flush_mrs) + return -EINVAL; + rs->rs_transport->flush_mrs(); + return 0; + } + + /* Look up the MR given its R_key and remove it from the rbtree + * so nobody else finds it. + * This should also prevent races with rds_rdma_unuse. + */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); + if (mr) { + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + if (args.flags & RDS_RDMA_INVALIDATE) + mr->r_invalidate = 1; + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (!mr) + return -EINVAL; + + /* + * call rds_destroy_mr() ourselves so that we're sure it's done by the time + * we return. If we let rds_mr_put() do it it might not happen until + * someone else drops their ref. + */ + rds_destroy_mr(mr); + rds_mr_put(mr); + return 0; +} + +/* + * This is called when we receive an extension header that + * tells us this MR was used. 
It allows us to implement + * use_once semantics + */ +void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) +{ + struct rds_mr *mr; + unsigned long flags; + int zot_me = 0; + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (!mr) { + pr_debug("rds: trying to unuse MR with unknown r_key %u!\n", + r_key); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + return; + } + + if (mr->r_use_once || force) { + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + zot_me = 1; + } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + /* May have to issue a dma_sync on this memory region. + * Note we could avoid this if the operation was a RDMA READ, + * but at this point we can't tell. */ + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); + + /* If the MR was marked as invalidate, this will + * trigger an async flush. */ + if (zot_me) { + rds_destroy_mr(mr); + rds_mr_put(mr); + } +} + +void rds_rdma_free_op(struct rm_rdma_op *ro) +{ + unsigned int i; + + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + if (!ro->op_write) { + WARN_ON(!page->mapping && irqs_disabled()); + set_page_dirty(page); + } + put_page(page); + } + + kfree(ro->op_notifier); + ro->op_notifier = NULL; + ro->op_active = 0; +} + +void rds_atomic_free_op(struct rm_atomic_op *ao) +{ + struct page *page = sg_page(ao->op_sg); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + set_page_dirty(page); + put_page(page); + + kfree(ao->op_notifier); + ao->op_notifier = NULL; + ao->op_active = 0; +} + + +/* + * Count the number of pages needed to describe an incoming iovec array. + */ +static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) +{ + int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + /* figure out the number of pages in the vector */ + for (i = 0; i < nr_iovecs; i++) { + nr_pages = rds_pages_in_vec(&iov[i]); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) +{ + struct rds_iovec vec; + struct rds_iovec __user *local_vec; + int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + + if (args->nr_local == 0) + return -EINVAL; + + /* figure out the number of pages in the vector */ + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) + return -EFAULT; + + nr_pages = rds_pages_in_vec(&vec); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages * sizeof(struct scatterlist); +} + +/* + * The application asks for a RDMA transfer. 
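+ *
+ * The request arrives as a control message on sendmsg().  A condensed
+ * userspace sketch (assuming the RDS_CMSG_RDMA_ARGS definitions from
+ * <linux/rds.h>; remote_cookie is the cookie the peer obtained with
+ * RDS_GET_MR and my_token is an opaque value chosen by the caller):
+ *
+ *     struct rds_iovec liov = { .addr = (uint64_t) lbuf, .bytes = lbuf_len };
+ *     struct rds_rdma_args rargs = {
+ *             .cookie         = remote_cookie,
+ *             .remote_vec     = { .addr = 0, .bytes = lbuf_len },
+ *             .local_vec_addr = (uint64_t) &liov,
+ *             .nr_local       = 1,
+ *             .flags          = RDS_RDMA_READWRITE | RDS_RDMA_NOTIFY_ME,
+ *             .user_token     = my_token,
+ *     };
+ *
+ * rargs is then placed in a cmsg of level SOL_RDS and type
+ * RDS_CMSG_RDMA_ARGS and passed to sendmsg() together with the normal
+ * payload; with RDS_RDMA_NOTIFY_ME set, completion is later reported back
+ * with the same user_token.
+ *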
+ * Extract all arguments and set up the rdma_op + */ +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct rds_rdma_args *args; + struct rm_rdma_op *op = &rm->rdma; + int nr_pages; + unsigned int nr_bytes; + struct page **pages = NULL; + struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack; + int iov_size; + unsigned int i, j; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) + || rm->rdma.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); + + if (rs->rs_bound_addr == 0) { + ret = -ENOTCONN; /* XXX not a great errno */ + goto out_ret; + } + + if (args->nr_local > UIO_MAXIOV) { + ret = -EMSGSIZE; + goto out_ret; + } + + /* Check whether to allocate the iovec area */ + iov_size = args->nr_local * sizeof(struct rds_iovec); + if (args->nr_local > UIO_FASTIOV) { + iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); + if (!iovs) { + ret = -ENOMEM; + goto out_ret; + } + } + + if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) { + ret = -EFAULT; + goto out; + } + + nr_pages = rds_rdma_pages(iovs, args->nr_local); + if (nr_pages < 0) { + ret = -EINVAL; + goto out; + } + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } + + op->op_write = !!(args->flags & RDS_RDMA_READWRITE); + op->op_fence = !!(args->flags & RDS_RDMA_FENCE); + op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->op_silent = !!(args->flags & RDS_RDMA_SILENT); + op->op_active = 1; + op->op_recverr = rs->rs_recverr; + WARN_ON(!nr_pages); + op->op_sg = rds_message_alloc_sgs(rm, nr_pages); + if (!op->op_sg) { + ret = -ENOMEM; + goto out; + } + + if (op->op_notify || op->op_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + if (!op->op_notifier) { + ret = -ENOMEM; + goto out; + } + op->op_notifier->n_user_token = args->user_token; + op->op_notifier->n_status = RDS_RDMA_SUCCESS; + + /* Enable rmda notification on data operation for composite + * rds messages and make sure notification is enabled only + * for the data operation which follows it so that application + * gets notified only after full message gets delivered. + */ + if (rm->data.op_sg) { + rm->rdma.op_notify = 0; + rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + } + } + + /* The cookie contains the R_Key of the remote memory region, and + * optionally an offset into it. This is how we implement RDMA into + * unaligned memory. 
+ * When setting up the RDMA, we need to add that offset to the + * destination address (which is really an offset into the MR) + * FIXME: We may want to move this into ib_rdma.c + */ + op->op_rkey = rds_rdma_cookie_key(args->cookie); + op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); + + nr_bytes = 0; + + rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", + (unsigned long long)args->nr_local, + (unsigned long long)args->remote_vec.addr, + op->op_rkey); + + for (i = 0; i < args->nr_local; i++) { + struct rds_iovec *iov = &iovs[i]; + /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ + unsigned int nr = rds_pages_in_vec(iov); + + rs->rs_user_addr = iov->addr; + rs->rs_user_bytes = iov->bytes; + + /* If it's a WRITE operation, we want to pin the pages for reading. + * If it's a READ operation, we need to pin the pages for writing. + */ + ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); + if (ret < 0) + goto out; + else + ret = 0; + + rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", + nr_bytes, nr, iov->bytes, iov->addr); + + nr_bytes += iov->bytes; + + for (j = 0; j < nr; j++) { + unsigned int offset = iov->addr & ~PAGE_MASK; + struct scatterlist *sg; + + sg = &op->op_sg[op->op_nents + j]; + sg_set_page(sg, pages[j], + min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), + offset); + + rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", + sg->offset, sg->length, iov->addr, iov->bytes); + + iov->addr += sg->length; + iov->bytes -= sg->length; + } + + op->op_nents += nr; + } + + if (nr_bytes > args->remote_vec.bytes) { + rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", + nr_bytes, + (unsigned int) args->remote_vec.bytes); + ret = -EINVAL; + goto out; + } + op->op_bytes = nr_bytes; + +out: + if (iovs != iovstack) + sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); + kfree(pages); +out_ret: + if (ret) + rds_rdma_free_op(op); + else + rds_stats_inc(s_send_rdma); + + return ret; +} + +/* + * The application wants us to pass an RDMA destination (aka MR) + * to the remote + */ +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + unsigned long flags; + struct rds_mr *mr; + u32 r_key; + int err = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || + rm->m_rdma_cookie != 0) + return -EINVAL; + + memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); + + /* We are reusing a previously mapped MR here. Most likely, the + * application has written to the buffer, so we need to explicitly + * flush those writes to RAM. Otherwise the HCA may not see them + * when doing a DMA from that buffer. + */ + r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); + + spin_lock_irqsave(&rs->rs_rdma_lock, flags); + mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); + if (!mr) + err = -EINVAL; /* invalid r_key */ + else + refcount_inc(&mr->r_refcount); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + + if (mr) { + mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); + rm->rdma.op_rdma_mr = mr; + } + return err; +} + +/* + * The application passes us an address range it wants to enable RDMA + * to/from. We map the area, and save the pair + * in rm->m_rdma_cookie. This causes it to be sent along to the peer + * in an extension header. 
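+ *
+ * In other words RDS_CMSG_RDMA_MAP carries a struct rds_get_mr_args and has
+ * the kernel both create the MR and advertise its cookie to the peer inside
+ * this message, whereas RDS_CMSG_RDMA_DEST (handled above) only attaches the
+ * cookie of an MR that was already created via the RDS_GET_MR socket option.
+ *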
+ */ +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || + rm->m_rdma_cookie != 0) + return -EINVAL; + + return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); +} + +/* + * Fill in rds_message for an atomic request. + */ +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct page *page = NULL; + struct rds_atomic_args *args; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) + || rm->atomic.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); + + /* Nonmasked & masked cmsg ops converted to masked hw ops */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = 0; + break; + case RDS_CMSG_MASKED_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->m_fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; + break; + case RDS_CMSG_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->cswp.compare; + rm->atomic.op_m_cswp.swap = args->cswp.swap; + rm->atomic.op_m_cswp.compare_mask = ~0; + rm->atomic.op_m_cswp.swap_mask = ~0; + break; + case RDS_CMSG_MASKED_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->m_cswp.compare; + rm->atomic.op_m_cswp.swap = args->m_cswp.swap; + rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; + rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; + break; + default: + BUG(); /* should never happen */ + } + + rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); + rm->atomic.op_active = 1; + rm->atomic.op_recverr = rs->rs_recverr; + rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); + if (!rm->atomic.op_sg) { + ret = -ENOMEM; + goto err; + } + + /* verify 8 byte-aligned */ + if (args->local_addr & 0x7) { + ret = -EFAULT; + goto err; + } + + ret = rds_pin_pages(args->local_addr, 1, &page, 1); + if (ret != 1) + goto err; + ret = 0; + + sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); + + if (rm->atomic.op_notify || rm->atomic.op_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); + if (!rm->atomic.op_notifier) { + ret = -ENOMEM; + goto err; + } + + rm->atomic.op_notifier->n_user_token = args->user_token; + rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; + } + + rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); + rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); + + return ret; +err: + if (page) + put_page(page); + rm->atomic.op_active = 0; + kfree(rm->atomic.op_notifier); + + return ret; +} diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c new file mode 100644 index 0000000..fc59821 --- /dev/null +++ b/net/rds/rdma_transport.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2009 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
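+ *
+ * (A usage sketch for rds_cmsg_atomic() above, with the field names that
+ * function reads and an 8-byte aligned local result buffer; illustrative
+ * only, not a verbatim excerpt from any application:
+ *
+ *     struct rds_atomic_args aargs = {
+ *             .cookie      = remote_cookie,
+ *             .remote_addr = 0,
+ *             .local_addr  = (uint64_t) &result_u64,
+ *             .cswp        = { .compare = old_val, .swap = new_val },
+ *             .flags       = RDS_RDMA_NOTIFY_ME,
+ *             .user_token  = my_token,
+ *     };
+ *
+ * sent as a cmsg of level SOL_RDS and type RDS_CMSG_ATOMIC_CSWP; the
+ * previous value of the remote 64-bit word is written back into result_u64
+ * when the swap completes.)
+ *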
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include + +#include "rds_single_path.h" +#include "rdma_transport.h" +#include "ib.h" + +static struct rdma_cm_id *rds_rdma_listen_id; + +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + /* this can be null in the listening path */ + struct rds_connection *conn = cm_id->context; + struct rds_transport *trans; + int ret = 0; + + rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, + event->event, rdma_event_msg(event->event)); + + if (cm_id->device->node_type == RDMA_NODE_IB_CA) + trans = &rds_ib_transport; + + /* Prevent shutdown from tearing down the connection + * while we're executing. */ + if (conn) { + mutex_lock(&conn->c_cm_lock); + + /* If the connection is being shut down, bail out + * right away. We return 0 so cm_id doesn't get + * destroyed prematurely */ + if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) { + /* Reject incoming connections while we're tearing + * down an existing one. */ + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + ret = 1; + goto out; + } + } + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = trans->cm_handle_connect(cm_id, event); + break; + + case RDMA_CM_EVENT_ADDR_RESOLVED: + /* XXX do we need to clean up if this fails? 
*/ + ret = rdma_resolve_route(cm_id, + RDS_RDMA_RESOLVE_TIMEOUT_MS); + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + /* Connection could have been dropped so make sure the + * cm_id is valid before proceeding + */ + if (conn) { + struct rds_ib_connection *ibic; + + ibic = conn->c_transport_data; + if (ibic && ibic->i_cm_id == cm_id) + ret = trans->cm_initiate_connect(cm_id); + else + rds_conn_drop(conn); + } + break; + + case RDMA_CM_EVENT_ESTABLISHED: + trans->cm_connect_complete(conn, event); + break; + + case RDMA_CM_EVENT_REJECTED: + rdsdebug("Connection rejected: %s\n", + rdma_reject_msg(cm_id, event->status)); + /* FALLTHROUGH */ + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: + if (conn) + rds_conn_drop(conn); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + rdsdebug("DISCONNECT event - dropping connection " + "%pI4->%pI4\n", &conn->c_laddr, + &conn->c_faddr); + rds_conn_drop(conn); + break; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + if (conn) { + pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n", + &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); + } + break; + + default: + /* things like device disconnect? */ + printk(KERN_ERR "RDS: unknown event %u (%s)!\n", + event->event, rdma_event_msg(event->event)); + break; + } + +out: + if (conn) + mutex_unlock(&conn->c_cm_lock); + + rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, + rdma_event_msg(event->event), ret); + + return ret; +} + +static int rds_rdma_listen_init(void) +{ + struct sockaddr_in sin; + struct rdma_cm_id *cm_id; + int ret; + + cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) { + ret = PTR_ERR(cm_id); + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " + "rdma_create_id() returned %d\n", ret); + return ret; + } + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); + sin.sin_port = (__force u16)htons(RDS_PORT); + + /* + * XXX I bet this binds the cm_id to a device. If we want to support + * fail-over we'll have to take this into consideration. 
+ */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + if (ret) { + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " + "rdma_bind_addr() returned %d\n", ret); + goto out; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + printk(KERN_ERR "RDS/RDMA: failed to setup listener, " + "rdma_listen() returned %d\n", ret); + goto out; + } + + rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); + + rds_rdma_listen_id = cm_id; + cm_id = NULL; +out: + if (cm_id) + rdma_destroy_id(cm_id); + return ret; +} + +static void rds_rdma_listen_stop(void) +{ + if (rds_rdma_listen_id) { + rdsdebug("cm %p\n", rds_rdma_listen_id); + rdma_destroy_id(rds_rdma_listen_id); + rds_rdma_listen_id = NULL; + } +} + +static int rds_rdma_init(void) +{ + int ret; + + ret = rds_ib_init(); + if (ret) + goto out; + + ret = rds_rdma_listen_init(); + if (ret) + rds_ib_exit(); +out: + return ret; +} +module_init(rds_rdma_init); + +static void rds_rdma_exit(void) +{ + /* stop listening first to ensure no new connections are attempted */ + rds_rdma_listen_stop(); + rds_ib_exit(); +} +module_exit(rds_rdma_exit); + +MODULE_AUTHOR("Oracle Corporation "); +MODULE_DESCRIPTION("RDS: IB transport"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h new file mode 100644 index 0000000..d309c44 --- /dev/null +++ b/net/rds/rdma_transport.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RDMA_TRANSPORT_H +#define _RDMA_TRANSPORT_H + +#include +#include +#include "rds.h" + +#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 + +int rds_rdma_conn_connect(struct rds_connection *conn); +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + +/* from ib.c */ +extern struct rds_transport rds_ib_transport; +int rds_ib_init(void); +void rds_ib_exit(void); + +#endif diff --git a/net/rds/rds.h b/net/rds/rds.h new file mode 100644 index 0000000..b04c333 --- /dev/null +++ b/net/rds/rds.h @@ -0,0 +1,956 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RDS_RDS_H +#define _RDS_RDS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "info.h" + +/* + * RDS Network protocol version + */ +#define RDS_PROTOCOL_3_0 0x0300 +#define RDS_PROTOCOL_3_1 0x0301 +#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 +#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) +#define RDS_PROTOCOL_MINOR(v) ((v) & 255) +#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) + +/* + * XXX randomly chosen, but at least seems to be unused: + * # 18464-18768 Unassigned + * We should do better. We want a reserved port to discourage unpriv'ed + * userspace from listening. + */ +#define RDS_PORT 18634 + +#ifdef ATOMIC64_INIT +#define KERNEL_HAS_ATOMIC64 +#endif + +#ifdef RDS_DEBUG +#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) +#else +/* sigh, pr_debug() causes unused variable warnings */ +static inline __printf(1, 2) +void rdsdebug(char *fmt, ...) +{ +} +#endif + +/* XXX is there one of these somewhere? 
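Editorial note, not part of the patch: as a quick sanity check on the protocol version macros above, the version packs major and minor into one 16-bit value, so the round trip looks like this (illustrative only):

static inline void rds_protocol_macros_example(void)
{
	BUILD_BUG_ON(RDS_PROTOCOL(3, 1) != 0x0301);
	BUILD_BUG_ON(RDS_PROTOCOL_VERSION != RDS_PROTOCOL(3, 1));
	BUILD_BUG_ON(RDS_PROTOCOL_MAJOR(RDS_PROTOCOL_VERSION) != 3);
	BUILD_BUG_ON(RDS_PROTOCOL_MINOR(RDS_PROTOCOL_VERSION) != 1);
}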
*/ +#define ceil(x, y) \ + ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; }) + +#define RDS_FRAG_SHIFT 12 +#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) + +/* Used to limit both RDMA and non-RDMA RDS message to 1MB */ +#define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20)) + +#define RDS_CONG_MAP_BYTES (65536 / 8) +#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) +#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) + +struct rds_cong_map { + struct rb_node m_rb_node; + __be32 m_addr; + wait_queue_head_t m_waitq; + struct list_head m_conn_list; + unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; +}; + + +/* + * This is how we will track the connection state: + * A connection is always in one of the following + * states. Updates to the state are atomic and imply + * a memory barrier. + */ +enum { + RDS_CONN_DOWN = 0, + RDS_CONN_CONNECTING, + RDS_CONN_DISCONNECTING, + RDS_CONN_UP, + RDS_CONN_RESETTING, + RDS_CONN_ERROR, +}; + +/* Bits for c_flags */ +#define RDS_LL_SEND_FULL 0 +#define RDS_RECONNECT_PENDING 1 +#define RDS_IN_XMIT 2 +#define RDS_RECV_REFILL 3 +#define RDS_DESTROY_PENDING 4 + +/* Max number of multipaths per RDS connection. Must be a power of 2 */ +#define RDS_MPATH_WORKERS 8 +#define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \ + (rs)->rs_hash_initval) & ((n) - 1)) + +#define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr)) + +/* Per mpath connection state */ +struct rds_conn_path { + struct rds_connection *cp_conn; + struct rds_message *cp_xmit_rm; + unsigned long cp_xmit_sg; + unsigned int cp_xmit_hdr_off; + unsigned int cp_xmit_data_off; + unsigned int cp_xmit_atomic_sent; + unsigned int cp_xmit_rdma_sent; + unsigned int cp_xmit_data_sent; + + spinlock_t cp_lock; /* protect msg queues */ + u64 cp_next_tx_seq; + struct list_head cp_send_queue; + struct list_head cp_retrans; + + u64 cp_next_rx_seq; + + void *cp_transport_data; + + atomic_t cp_state; + unsigned long cp_send_gen; + unsigned long cp_flags; + unsigned long cp_reconnect_jiffies; + struct delayed_work cp_send_w; + struct delayed_work cp_recv_w; + struct delayed_work cp_conn_w; + struct work_struct cp_down_w; + struct mutex cp_cm_lock; /* protect cp_state & cm */ + wait_queue_head_t cp_waitq; + + unsigned int cp_unacked_packets; + unsigned int cp_unacked_bytes; + unsigned int cp_index; +}; + +/* One rds_connection per RDS address pair */ +struct rds_connection { + struct hlist_node c_hash_node; + __be32 c_laddr; + __be32 c_faddr; + unsigned int c_loopback:1, + c_ping_triggered:1, + c_pad_to_32:30; + int c_npaths; + struct rds_connection *c_passive; + struct rds_transport *c_trans; + + struct rds_cong_map *c_lcong; + struct rds_cong_map *c_fcong; + + /* Protocol version */ + unsigned int c_version; + possible_net_t c_net; + + struct list_head c_map_item; + unsigned long c_map_queued; + + struct rds_conn_path *c_path; + wait_queue_head_t c_hs_waitq; /* handshake waitq */ + + u32 c_my_gen_num; + u32 c_peer_gen_num; +}; + +static inline +struct net *rds_conn_net(struct rds_connection *conn) +{ + return read_pnet(&conn->c_net); +} + +static inline +void rds_conn_net_set(struct rds_connection *conn, struct net *net) +{ + write_pnet(&conn->c_net, net); +} + +#define RDS_FLAG_CONG_BITMAP 0x01 +#define RDS_FLAG_ACK_REQUIRED 0x02 +#define RDS_FLAG_RETRANSMITTED 0x04 +#define RDS_MAX_ADV_CREDIT 255 + +/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping + * probe to exchange control information before establishing a connection. 
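Editorial note, not part of the patch: the congestion-map constants above are easier to follow with concrete numbers. On a 4 KiB-page system, 65536 ports / 8 = 8192 bytes of bitmap, which PAGE_ALIGNs to two pages of 32768 bits each, so a port number maps to a (page, bit) pair. A sketch of how a setter is expected to index the map (this mirrors what cong.c is expected to do; illustrative only):

static inline void example_cong_set(struct rds_cong_map *map, __be16 port)
{
	unsigned long i   = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; /* 0 or 1 */
	unsigned long off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;

	set_bit_le(off, (void *)map->m_page_addrs[i]);	/* mark port congested */
}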
+ * Currently the control information that is exchanged is the number of + * supported paths. If the peer is a legacy (older kernel revision) peer, + * it would return a pong message without additional control information + * that would then alert the sender that the peer was an older rev. + */ +#define RDS_FLAG_PROBE_PORT 1 +#define RDS_HS_PROBE(sport, dport) \ + ((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \ + (sport == 0 && dport == RDS_FLAG_PROBE_PORT)) +/* + * Maximum space available for extension headers. + */ +#define RDS_HEADER_EXT_SPACE 16 + +struct rds_header { + __be64 h_sequence; + __be64 h_ack; + __be32 h_len; + __be16 h_sport; + __be16 h_dport; + u8 h_flags; + u8 h_credit; + u8 h_padding[4]; + __sum16 h_csum; + + u8 h_exthdr[RDS_HEADER_EXT_SPACE]; +}; + +/* + * Reserved - indicates end of extensions + */ +#define RDS_EXTHDR_NONE 0 + +/* + * This extension header is included in the very + * first message that is sent on a new connection, + * and identifies the protocol level. This will help + * rolling updates if a future change requires breaking + * the protocol. + * NB: This is no longer true for IB, where we do a version + * negotiation during the connection setup phase (protocol + * version information is included in the RDMA CM private data). + */ +#define RDS_EXTHDR_VERSION 1 +struct rds_ext_header_version { + __be32 h_version; +}; + +/* + * This extension header is included in the RDS message + * chasing an RDMA operation. + */ +#define RDS_EXTHDR_RDMA 2 +struct rds_ext_header_rdma { + __be32 h_rdma_rkey; +}; + +/* + * This extension header tells the peer about the + * destination of the requested RDMA + * operation. + */ +#define RDS_EXTHDR_RDMA_DEST 3 +struct rds_ext_header_rdma_dest { + __be32 h_rdma_rkey; + __be32 h_rdma_offset; +}; + +/* Extension header announcing number of paths. + * Implicit length = 2 bytes. + */ +#define RDS_EXTHDR_NPATHS 5 +#define RDS_EXTHDR_GEN_NUM 6 + +#define __RDS_EXTHDR_MAX 16 /* for now */ +#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) +#define RDS_MSG_RX_HDR 0 +#define RDS_MSG_RX_START 1 +#define RDS_MSG_RX_END 2 +#define RDS_MSG_RX_CMSG 3 + +struct rds_incoming { + refcount_t i_refcount; + struct list_head i_item; + struct rds_connection *i_conn; + struct rds_conn_path *i_conn_path; + struct rds_header i_hdr; + unsigned long i_rx_jiffies; + __be32 i_saddr; + + rds_rdma_cookie_t i_rdma_cookie; + struct timeval i_rx_tstamp; + u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; +}; + +struct rds_mr { + struct rb_node r_rb_node; + refcount_t r_refcount; + u32 r_key; + + /* A copy of the creation flags */ + unsigned int r_use_once:1; + unsigned int r_invalidate:1; + unsigned int r_write:1; + + /* This is for RDS_MR_DEAD. + * It would be nice & consistent to make this part of the above + * bit field here, but we need to use test_and_set_bit. 
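Editorial note, not part of the patch: spelling out the RDS_HS_PROBE() macro defined above (ports in host byte order), only two (sport, dport) combinations count as handshake probes; everything else is ordinary RDS traffic.

static inline bool example_is_hs_probe(u16 sport, u16 dport)
{
	/* (1, 0): probe ping sent by the initiator
	 * (0, 1): probe pong sent back by the peer
	 * anything else: not a handshake probe
	 */
	return RDS_HS_PROBE(sport, dport);
}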
+ */ + unsigned long r_state; + struct rds_sock *r_sock; /* back pointer to the socket that owns us */ + struct rds_transport *r_trans; + void *r_trans_private; +}; + +/* Flags for mr->r_state */ +#define RDS_MR_DEAD 0 + +static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) +{ + return r_key | (((u64) offset) << 32); +} + +static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) +{ + return cookie; +} + +static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) +{ + return cookie >> 32; +} + +/* atomic operation types */ +#define RDS_ATOMIC_TYPE_CSWP 0 +#define RDS_ATOMIC_TYPE_FADD 1 + +/* + * m_sock_item and m_conn_item are on lists that are serialized under + * conn->c_lock. m_sock_item has additional meaning in that once it is empty + * the message will not be put back on the retransmit list after being sent. + * messages that are canceled while being sent rely on this. + * + * m_inc is used by loopback so that it can pass an incoming message straight + * back up into the rx path. It embeds a wire header which is also used by + * the send path, which is kind of awkward. + * + * m_sock_item indicates the message's presence on a socket's send or receive + * queue. m_rs will point to that socket. + * + * m_daddr is used by cancellation to prune messages to a given destination. + * + * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock + * nesting. As paths iterate over messages on a sock, or conn, they must + * also lock the conn, or sock, to remove the message from those lists too. + * Testing the flag to determine if the message is still on the lists lets + * us avoid testing the list_head directly. That means each path can use + * the message's list_head to keep it on a local list while juggling locks + * without confusing the other path. + * + * m_ack_seq is an optional field set by transports who need a different + * sequence number range to invalidate. They can use this in a callback + * that they pass to rds_send_drop_acked() to see if each message has been + * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't + * had ack_seq set yet. + */ +#define RDS_MSG_ON_SOCK 1 +#define RDS_MSG_ON_CONN 2 +#define RDS_MSG_HAS_ACK_SEQ 3 +#define RDS_MSG_ACK_REQUIRED 4 +#define RDS_MSG_RETRANSMITTED 5 +#define RDS_MSG_MAPPED 6 +#define RDS_MSG_PAGEVEC 7 +#define RDS_MSG_FLUSH 8 + +struct rds_znotifier { + struct mmpin z_mmp; + u32 z_cookie; +}; + +struct rds_msg_zcopy_info { + struct list_head rs_zcookie_next; + union { + struct rds_znotifier znotif; + struct rds_zcopy_cookies zcookies; + }; +}; + +struct rds_msg_zcopy_queue { + struct list_head zcookie_head; + spinlock_t lock; /* protects zcookie_head queue */ +}; + +static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q) +{ + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->zcookie_head); +} + +struct rds_message { + refcount_t m_refcount; + struct list_head m_sock_item; + struct list_head m_conn_item; + struct rds_incoming m_inc; + u64 m_ack_seq; + __be32 m_daddr; + unsigned long m_flags; + + /* Never access m_rs without holding m_rs_lock. 
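Editorial note, not part of the patch: a round trip through the rds_rdma cookie helpers defined earlier in this header. The 64-bit cookie carries the R_Key in its low 32 bits and the destination offset in its high 32 bits, which is exactly what rds_rdma_cookie_key() and rds_rdma_cookie_offset() peel back off (illustrative only):

static inline void example_cookie_roundtrip(void)
{
	rds_rdma_cookie_t c = rds_rdma_make_cookie(0x1234, 4096);

	WARN_ON(rds_rdma_cookie_key(c) != 0x1234);	/* low 32 bits  */
	WARN_ON(rds_rdma_cookie_offset(c) != 4096);	/* high 32 bits */
}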
+ * Lock nesting is + * rm->m_rs_lock + * -> rs->rs_lock + */ + spinlock_t m_rs_lock; + wait_queue_head_t m_flush_wait; + + struct rds_sock *m_rs; + + /* cookie to send to remote, in rds header */ + rds_rdma_cookie_t m_rdma_cookie; + + unsigned int m_used_sgs; + unsigned int m_total_sgs; + + void *m_final_op; + + struct { + struct rm_atomic_op { + int op_type; + union { + struct { + uint64_t compare; + uint64_t swap; + uint64_t compare_mask; + uint64_t swap_mask; + } op_m_cswp; + struct { + uint64_t add; + uint64_t nocarry_mask; + } op_m_fadd; + }; + + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } atomic; + struct rm_rdma_op { + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_write:1; + unsigned int op_fence:1; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + unsigned int op_bytes; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } rdma; + struct rm_data_op { + unsigned int op_active:1; + unsigned int op_notify:1; + unsigned int op_nents; + unsigned int op_count; + unsigned int op_dmasg; + unsigned int op_dmaoff; + struct rds_znotifier *op_mmp_znotifier; + struct scatterlist *op_sg; + } data; + }; +}; + +/* + * The RDS notifier is used (optionally) to tell the application about + * completed RDMA operations. Rather than keeping the whole rds message + * around on the queue, we allocate a small notifier that is put on the + * socket's notifier_list. Notifications are delivered to the application + * through control messages. + */ +struct rds_notifier { + struct list_head n_list; + uint64_t n_user_token; + int n_status; +}; + +/** + * struct rds_transport - transport specific behavioural hooks + * + * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send + * part of a message. The caller serializes on the send_sem so this + * doesn't need to be reentrant for a given conn. The header must be + * sent before the data payload. .xmit must be prepared to send a + * message with no data payload. .xmit should return the number of + * bytes that were sent down the connection, including header bytes. + * Returning 0 tells the caller that it doesn't need to perform any + * additional work now. This is usually the case when the transport has + * filled the sending queue for its connection and will handle + * triggering the rds thread to continue the send when space becomes + * available. Returning -EAGAIN tells the caller to retry the send + * immediately. Returning -ENOMEM tells the caller to retry the send at + * some point in the future. + * + * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once + * it returns the connection can not call rds_recv_incoming(). + * This will only be called once after conn_connect returns + * non-zero success and will The caller serializes this with + * the send and connecting paths (xmit_* and conn_*). The + * transport is responsible for other serialization, including + * rds_recv_incoming(). This is called in process context but + * should try hard not to block. 
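Editorial note, not part of the patch: a hedged sketch of how a caller is expected to act on the .xmit() return values described in the kernel-doc above (the real consumer is rds_send_xmit() in send.c; the example function is hypothetical):

static int example_drive_xmit(struct rds_connection *conn, struct rds_message *rm)
{
	int ret = conn->c_trans->xmit(conn, rm, 0, 0, 0);

	if (ret > 0)		/* that many header+data bytes were sent */
		return ret;
	if (ret == 0)		/* transport queue full; it re-arms the send itself */
		return 0;
	if (ret == -EAGAIN)	/* retry the send immediately */
		return ret;
	return ret;		/* e.g. -ENOMEM: retry at some later point */
}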
+ */ + +struct rds_transport { + char t_name[TRANSNAMSIZ]; + struct list_head t_item; + struct module *t_owner; + unsigned int t_prefer_loopback:1, + t_mp_capable:1; + unsigned int t_type; + + int (*laddr_check)(struct net *net, __be32 addr); + int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); + void (*conn_free)(void *data); + int (*conn_path_connect)(struct rds_conn_path *cp); + void (*conn_path_shutdown)(struct rds_conn_path *conn); + void (*xmit_path_prepare)(struct rds_conn_path *cp); + void (*xmit_path_complete)(struct rds_conn_path *cp); + int (*xmit)(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); + int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); + int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); + int (*recv_path)(struct rds_conn_path *cp); + int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to); + void (*inc_free)(struct rds_incoming *inc); + + int (*cm_handle_connect)(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); + void (*cm_connect_complete)(struct rds_connection *conn, + struct rdma_cm_event *event); + + unsigned int (*stats_info_copy)(struct rds_info_iterator *iter, + unsigned int avail); + void (*exit)(void); + void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, + struct rds_sock *rs, u32 *key_ret); + void (*sync_mr)(void *trans_private, int direction); + void (*free_mr)(void *trans_private, int invalidate); + void (*flush_mrs)(void); + bool (*t_unloading)(struct rds_connection *conn); +}; + +struct rds_sock { + struct sock rs_sk; + + u64 rs_user_addr; + u64 rs_user_bytes; + + /* + * bound_addr used for both incoming and outgoing, no INADDR_ANY + * support. + */ + struct rhash_head rs_bound_node; + u64 rs_bound_key; + __be32 rs_bound_addr; + __be32 rs_conn_addr; + __be16 rs_bound_port; + __be16 rs_conn_port; + struct rds_transport *rs_transport; + + /* + * rds_sendmsg caches the conn it used the last time around. + * This helps avoid costly lookups. + */ + struct rds_connection *rs_conn; + + /* flag indicating we were congested or not */ + int rs_congested; + /* seen congestion (ENOBUFS) when sending? */ + int rs_seen_congestion; + + /* rs_lock protects all these adjacent members before the newline */ + spinlock_t rs_lock; + struct list_head rs_send_queue; + u32 rs_snd_bytes; + int rs_rcv_bytes; + struct list_head rs_notify_queue; /* currently used for failed RDMAs */ + + /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask + * to decide whether the application should be woken up. + * If not set, we use rs_cong_track to find out whether a cong map + * update arrived. + */ + uint64_t rs_cong_mask; + uint64_t rs_cong_notify; + struct list_head rs_cong_list; + unsigned long rs_cong_track; + + /* + * rs_recv_lock protects the receive queue, and is + * used to serialize with rds_release. 
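Editorial note, not part of the patch: for orientation, this is roughly the shape of a transport hooking itself into the ops table above. It is a partial sketch only; the example names are hypothetical and most of the required ops are omitted.

static int example_laddr_check(struct net *net, __be32 addr)
{
	return -EADDRNOTAVAIL;		/* never claims a local address */
}

static struct rds_transport example_transport = {
	.t_name		= "example",
	.t_owner	= THIS_MODULE,
	.laddr_check	= example_laddr_check,
	/* conn_alloc, conn_path_connect, xmit, recv_path, ... go here */
};
/* made visible to RDS with rds_trans_register(&example_transport) */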
+ */ + rwlock_t rs_recv_lock; + struct list_head rs_recv_queue; + + /* just for stats reporting */ + struct list_head rs_item; + + /* these have their own lock */ + spinlock_t rs_rdma_lock; + struct rb_root rs_rdma_keys; + + /* Socket options - in case there will be more */ + unsigned char rs_recverr, + rs_cong_monitor; + u32 rs_hash_initval; + + /* Socket receive path trace points*/ + u8 rs_rx_traces; + u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; + struct rds_msg_zcopy_queue rs_zcookie_queue; +}; + +static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) +{ + return container_of(sk, struct rds_sock, rs_sk); +} +static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) +{ + return &rs->rs_sk; +} + +/* + * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value + * to account for overhead. We don't account for overhead, we just apply + * the number of payload bytes to the specified value. + */ +static inline int rds_sk_sndbuf(struct rds_sock *rs) +{ + return rds_rs_to_sk(rs)->sk_sndbuf / 2; +} +static inline int rds_sk_rcvbuf(struct rds_sock *rs) +{ + return rds_rs_to_sk(rs)->sk_rcvbuf / 2; +} + +struct rds_statistics { + uint64_t s_conn_reset; + uint64_t s_recv_drop_bad_checksum; + uint64_t s_recv_drop_old_seq; + uint64_t s_recv_drop_no_sock; + uint64_t s_recv_drop_dead_sock; + uint64_t s_recv_deliver_raced; + uint64_t s_recv_delivered; + uint64_t s_recv_queued; + uint64_t s_recv_immediate_retry; + uint64_t s_recv_delayed_retry; + uint64_t s_recv_ack_required; + uint64_t s_recv_rdma_bytes; + uint64_t s_recv_ping; + uint64_t s_send_queue_empty; + uint64_t s_send_queue_full; + uint64_t s_send_lock_contention; + uint64_t s_send_lock_queue_raced; + uint64_t s_send_immediate_retry; + uint64_t s_send_delayed_retry; + uint64_t s_send_drop_acked; + uint64_t s_send_ack_required; + uint64_t s_send_queued; + uint64_t s_send_rdma; + uint64_t s_send_rdma_bytes; + uint64_t s_send_pong; + uint64_t s_page_remainder_hit; + uint64_t s_page_remainder_miss; + uint64_t s_copy_to_user; + uint64_t s_copy_from_user; + uint64_t s_cong_update_queued; + uint64_t s_cong_update_received; + uint64_t s_cong_send_error; + uint64_t s_cong_send_blocked; + uint64_t s_recv_bytes_added_to_socket; + uint64_t s_recv_bytes_removed_from_socket; + +}; + +/* af_rds.c */ +void rds_sock_addref(struct rds_sock *rs); +void rds_sock_put(struct rds_sock *rs); +void rds_wake_sk_sleep(struct rds_sock *rs); +static inline void __rds_wake_sk_sleep(struct sock *sk) +{ + wait_queue_head_t *waitq = sk_sleep(sk); + + if (!sock_flag(sk, SOCK_DEAD) && waitq) + wake_up(waitq); +} +extern wait_queue_head_t rds_poll_waitq; + + +/* bind.c */ +int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +void rds_remove_bound(struct rds_sock *rs); +struct rds_sock *rds_find_bound(__be32 addr, __be16 port); +int rds_bind_lock_init(void); +void rds_bind_lock_destroy(void); + +/* cong.c */ +int rds_cong_get_maps(struct rds_connection *conn); +void rds_cong_add_conn(struct rds_connection *conn); +void rds_cong_remove_conn(struct rds_connection *conn); +void rds_cong_set_bit(struct rds_cong_map *map, __be16 port); +void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port); +int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs); +void rds_cong_queue_updates(struct rds_cong_map *map); +void rds_cong_map_updated(struct rds_cong_map *map, uint64_t); +int rds_cong_updated_since(unsigned long *recent); +void rds_cong_add_socket(struct rds_sock *); +void 
rds_cong_remove_socket(struct rds_sock *); +void rds_cong_exit(void); +struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); + +/* conn.c */ +extern u32 rds_gen_num; +int rds_conn_init(void); +void rds_conn_exit(void); +struct rds_connection *rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp); +struct rds_connection *rds_conn_create_outgoing(struct net *net, + __be32 laddr, __be32 faddr, + struct rds_transport *trans, gfp_t gfp); +void rds_conn_shutdown(struct rds_conn_path *cpath); +void rds_conn_destroy(struct rds_connection *conn); +void rds_conn_drop(struct rds_connection *conn); +void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy); +void rds_conn_connect_if_down(struct rds_connection *conn); +void rds_conn_path_connect_if_down(struct rds_conn_path *cp); +void rds_for_each_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_connection *, void *), + u64 *buffer, + size_t item_len); + +__printf(2, 3) +void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); +#define rds_conn_path_error(cp, fmt...) \ + __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt) + +static inline int +rds_conn_path_transition(struct rds_conn_path *cp, int old, int new) +{ + return atomic_cmpxchg(&cp->cp_state, old, new) == old; +} + +static inline int +rds_conn_transition(struct rds_connection *conn, int old, int new) +{ + WARN_ON(conn->c_trans->t_mp_capable); + return rds_conn_path_transition(&conn->c_path[0], old, new); +} + +static inline int +rds_conn_path_state(struct rds_conn_path *cp) +{ + return atomic_read(&cp->cp_state); +} + +static inline int +rds_conn_state(struct rds_connection *conn) +{ + WARN_ON(conn->c_trans->t_mp_capable); + return rds_conn_path_state(&conn->c_path[0]); +} + +static inline int +rds_conn_path_up(struct rds_conn_path *cp) +{ + return atomic_read(&cp->cp_state) == RDS_CONN_UP; +} + +static inline int +rds_conn_up(struct rds_connection *conn) +{ + WARN_ON(conn->c_trans->t_mp_capable); + return rds_conn_path_up(&conn->c_path[0]); +} + +static inline int +rds_conn_path_connecting(struct rds_conn_path *cp) +{ + return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING; +} + +static inline int +rds_conn_connecting(struct rds_connection *conn) +{ + WARN_ON(conn->c_trans->t_mp_capable); + return rds_conn_path_connecting(&conn->c_path[0]); +} + +/* message.c */ +struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy); +struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); +void rds_message_populate_header(struct rds_header *hdr, __be16 sport, + __be16 dport, u64 seq); +int rds_message_add_extension(struct rds_header *hdr, + unsigned int type, const void *data, unsigned int len); +int rds_message_next_extension(struct rds_header *hdr, + unsigned int *pos, void *buf, unsigned int *buflen); +int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); +int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); +void rds_message_inc_free(struct rds_incoming *inc); +void rds_message_addref(struct rds_message *rm); +void rds_message_put(struct rds_message *rm); +void rds_message_wait(struct rds_message *rm); +void 
rds_message_unmapped(struct rds_message *rm); +void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info); + +static inline void rds_message_make_checksum(struct rds_header *hdr) +{ + hdr->h_csum = 0; + hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2); +} + +static inline int rds_message_verify_checksum(const struct rds_header *hdr) +{ + return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; +} + + +/* page.c */ +int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, + gfp_t gfp); +void rds_page_exit(void); + +/* recv.c */ +void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, + __be32 saddr); +void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, + __be32 saddr); +void rds_inc_put(struct rds_incoming *inc); +void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, + struct rds_incoming *inc, gfp_t gfp); +int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int msg_flags); +void rds_clear_recv_queue(struct rds_sock *rs); +int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); +void rds_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + __be32 saddr, __be32 daddr, int flip); + +/* send.c */ +int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); +void rds_send_path_reset(struct rds_conn_path *conn); +int rds_send_xmit(struct rds_conn_path *cp); +struct sockaddr_in; +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); +typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked); +void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, + is_acked_func is_acked); +void rds_send_ping(struct rds_connection *conn, int cp_index); +int rds_send_pong(struct rds_conn_path *cp, __be16 dport); + +/* rdma.c */ +void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +void rds_rdma_drop_keys(struct rds_sock *rs); +int rds_rdma_extra_size(struct rds_rdma_args *args); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +void rds_rdma_free_op(struct rm_rdma_op *ro); +void rds_atomic_free_op(struct rm_atomic_op *ao); +void rds_rdma_send_complete(struct rds_message *rm, int wc_status); +void rds_atomic_send_complete(struct rds_message *rm, int wc_status); +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); + +void __rds_put_mr_final(struct rds_mr *mr); +static inline void rds_mr_put(struct rds_mr *mr) +{ + if (refcount_dec_and_test(&mr->r_refcount)) + __rds_put_mr_final(mr); +} + +static inline bool rds_destroy_pending(struct rds_connection *conn) +{ + return !check_net(rds_conn_net(conn)) || + (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); +} + +/* stats.c */ +DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); +#define rds_stats_inc_which(which, member) do { \ + 
per_cpu(which, get_cpu()).member++; \ + put_cpu(); \ +} while (0) +#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member) +#define rds_stats_add_which(which, member, count) do { \ + per_cpu(which, get_cpu()).member += count; \ + put_cpu(); \ +} while (0) +#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) +int rds_stats_init(void); +void rds_stats_exit(void); +void rds_stats_info_copy(struct rds_info_iterator *iter, + uint64_t *values, const char *const *names, + size_t nr); + +/* sysctl.c */ +int rds_sysctl_init(void); +void rds_sysctl_exit(void); +extern unsigned long rds_sysctl_sndbuf_min; +extern unsigned long rds_sysctl_sndbuf_default; +extern unsigned long rds_sysctl_sndbuf_max; +extern unsigned long rds_sysctl_reconnect_min_jiffies; +extern unsigned long rds_sysctl_reconnect_max_jiffies; +extern unsigned int rds_sysctl_max_unacked_packets; +extern unsigned int rds_sysctl_max_unacked_bytes; +extern unsigned int rds_sysctl_ping_enable; +extern unsigned long rds_sysctl_trace_flags; +extern unsigned int rds_sysctl_trace_level; + +/* threads.c */ +int rds_threads_init(void); +void rds_threads_exit(void); +extern struct workqueue_struct *rds_wq; +void rds_queue_reconnect(struct rds_conn_path *cp); +void rds_connect_worker(struct work_struct *); +void rds_shutdown_worker(struct work_struct *); +void rds_send_worker(struct work_struct *); +void rds_recv_worker(struct work_struct *); +void rds_connect_path_complete(struct rds_conn_path *conn, int curr); +void rds_connect_complete(struct rds_connection *conn); + +/* transport.c */ +void rds_trans_register(struct rds_transport *trans); +void rds_trans_unregister(struct rds_transport *trans); +struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); +void rds_trans_put(struct rds_transport *trans); +unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); +struct rds_transport *rds_trans_get(int t_type); +int rds_trans_init(void); +void rds_trans_exit(void); + +#endif diff --git a/net/rds/rds_single_path.h b/net/rds/rds_single_path.h new file mode 100644 index 0000000..9521f6e --- /dev/null +++ b/net/rds/rds_single_path.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RDS_RDS_SINGLE_H +#define _RDS_RDS_SINGLE_H + +#define c_xmit_rm c_path[0].cp_xmit_rm +#define c_xmit_sg c_path[0].cp_xmit_sg +#define c_xmit_hdr_off c_path[0].cp_xmit_hdr_off +#define c_xmit_data_off c_path[0].cp_xmit_data_off +#define c_xmit_atomic_sent c_path[0].cp_xmit_atomic_sent +#define c_xmit_rdma_sent c_path[0].cp_xmit_rdma_sent +#define c_xmit_data_sent c_path[0].cp_xmit_data_sent +#define c_lock c_path[0].cp_lock +#define c_next_tx_seq c_path[0].cp_next_tx_seq +#define c_send_queue c_path[0].cp_send_queue +#define c_retrans c_path[0].cp_retrans +#define c_next_rx_seq c_path[0].cp_next_rx_seq +#define c_transport_data c_path[0].cp_transport_data +#define c_state c_path[0].cp_state +#define c_send_gen c_path[0].cp_send_gen +#define c_flags c_path[0].cp_flags +#define c_reconnect_jiffies c_path[0].cp_reconnect_jiffies +#define c_send_w c_path[0].cp_send_w +#define c_recv_w c_path[0].cp_recv_w +#define c_conn_w c_path[0].cp_conn_w +#define c_down_w c_path[0].cp_down_w +#define c_cm_lock c_path[0].cp_cm_lock +#define c_waitq c_path[0].cp_waitq +#define c_unacked_packets c_path[0].cp_unacked_packets +#define c_unacked_bytes c_path[0].cp_unacked_bytes + +#endif /* _RDS_RDS_SINGLE_H */ diff --git a/net/rds/recv.c b/net/rds/recv.c new file mode 100644 index 
0000000..dc67458 --- /dev/null +++ b/net/rds/recv.c @@ -0,0 +1,772 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" + +void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, + __be32 saddr) +{ + int i; + + refcount_set(&inc->i_refcount, 1); + INIT_LIST_HEAD(&inc->i_item); + inc->i_conn = conn; + inc->i_saddr = saddr; + inc->i_rdma_cookie = 0; + inc->i_rx_tstamp.tv_sec = 0; + inc->i_rx_tstamp.tv_usec = 0; + + for (i = 0; i < RDS_RX_MAX_TRACES; i++) + inc->i_rx_lat_trace[i] = 0; +} +EXPORT_SYMBOL_GPL(rds_inc_init); + +void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, + __be32 saddr) +{ + refcount_set(&inc->i_refcount, 1); + INIT_LIST_HEAD(&inc->i_item); + inc->i_conn = cp->cp_conn; + inc->i_conn_path = cp; + inc->i_saddr = saddr; + inc->i_rdma_cookie = 0; + inc->i_rx_tstamp.tv_sec = 0; + inc->i_rx_tstamp.tv_usec = 0; +} +EXPORT_SYMBOL_GPL(rds_inc_path_init); + +static void rds_inc_addref(struct rds_incoming *inc) +{ + rdsdebug("addref inc %p ref %d\n", inc, refcount_read(&inc->i_refcount)); + refcount_inc(&inc->i_refcount); +} + +void rds_inc_put(struct rds_incoming *inc) +{ + rdsdebug("put inc %p ref %d\n", inc, refcount_read(&inc->i_refcount)); + if (refcount_dec_and_test(&inc->i_refcount)) { + BUG_ON(!list_empty(&inc->i_item)); + + inc->i_conn->c_trans->inc_free(inc); + } +} +EXPORT_SYMBOL_GPL(rds_inc_put); + +static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, + struct rds_cong_map *map, + int delta, __be16 port) +{ + int now_congested; + + if (delta == 0) + return; + + rs->rs_rcv_bytes += delta; + if (delta > 0) + rds_stats_add(s_recv_bytes_added_to_socket, delta); + else + rds_stats_add(s_recv_bytes_removed_from_socket, -delta); + now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); + + rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " + "now_cong %d delta %d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, + rds_sk_rcvbuf(rs), now_congested, delta); + + /* wasn't -> am congested */ + if (!rs->rs_congested && now_congested) { + rs->rs_congested = 1; + rds_cong_set_bit(map, port); + 
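	/* Editorial worked example (not part of the patch): with a payload
	 * budget of rds_sk_rcvbuf(rs) == 256 KiB, this function marks the
	 * port congested once queued bytes exceed 256 KiB, and the branch
	 * below only clears the bit again once they fall under 128 KiB.
	 * Between the two thresholds nothing changes, which keeps the
	 * congestion map from flapping on every small send/receive.
	 */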
rds_cong_queue_updates(map); + } + /* was -> aren't congested */ + /* Require more free space before reporting uncongested to prevent + bouncing cong/uncong state too often */ + else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) { + rs->rs_congested = 0; + rds_cong_clear_bit(map, port); + rds_cong_queue_updates(map); + } + + /* do nothing if no change in cong state */ +} + +static void rds_conn_peer_gen_update(struct rds_connection *conn, + u32 peer_gen_num) +{ + int i; + struct rds_message *rm, *tmp; + unsigned long flags; + + WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP); + if (peer_gen_num != 0) { + if (conn->c_peer_gen_num != 0 && + peer_gen_num != conn->c_peer_gen_num) { + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + struct rds_conn_path *cp; + + cp = &conn->c_path[i]; + spin_lock_irqsave(&cp->cp_lock, flags); + cp->cp_next_tx_seq = 1; + cp->cp_next_rx_seq = 0; + list_for_each_entry_safe(rm, tmp, + &cp->cp_retrans, + m_conn_item) { + set_bit(RDS_MSG_FLUSH, &rm->m_flags); + } + spin_unlock_irqrestore(&cp->cp_lock, flags); + } + } + conn->c_peer_gen_num = peer_gen_num; + } +} + +/* + * Process all extension headers that come with this message. + */ +static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs) +{ + struct rds_header *hdr = &inc->i_hdr; + unsigned int pos = 0, type, len; + union { + struct rds_ext_header_version version; + struct rds_ext_header_rdma rdma; + struct rds_ext_header_rdma_dest rdma_dest; + } buffer; + + while (1) { + len = sizeof(buffer); + type = rds_message_next_extension(hdr, &pos, &buffer, &len); + if (type == RDS_EXTHDR_NONE) + break; + /* Process extension header here */ + switch (type) { + case RDS_EXTHDR_RDMA: + rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0); + break; + + case RDS_EXTHDR_RDMA_DEST: + /* We ignore the size for now. We could stash it + * somewhere and use it for error checking. */ + inc->i_rdma_cookie = rds_rdma_make_cookie( + be32_to_cpu(buffer.rdma_dest.h_rdma_rkey), + be32_to_cpu(buffer.rdma_dest.h_rdma_offset)); + + break; + } + } +} + +static void rds_recv_hs_exthdrs(struct rds_header *hdr, + struct rds_connection *conn) +{ + unsigned int pos = 0, type, len; + union { + struct rds_ext_header_version version; + u16 rds_npaths; + u32 rds_gen_num; + } buffer; + u32 new_peer_gen_num = 0; + + while (1) { + len = sizeof(buffer); + type = rds_message_next_extension(hdr, &pos, &buffer, &len); + if (type == RDS_EXTHDR_NONE) + break; + /* Process extension header here */ + switch (type) { + case RDS_EXTHDR_NPATHS: + conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, + be16_to_cpu(buffer.rds_npaths)); + break; + case RDS_EXTHDR_GEN_NUM: + new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num); + break; + default: + pr_warn_ratelimited("ignoring unknown exthdr type " + "0x%x\n", type); + } + } + /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ + conn->c_npaths = max_t(int, conn->c_npaths, 1); + conn->c_ping_triggered = 0; + rds_conn_peer_gen_update(conn, new_peer_gen_num); +} + +/* rds_start_mprds() will synchronously start multiple paths when appropriate. + * The scheme is based on the following rules: + * + * 1. rds_sendmsg on first connect attempt sends the probe ping, with the + * sender's npaths (s_npaths) + * 2. rcvr of probe-ping knows the mprds_paths = min(s_npaths, r_npaths). It + * sends back a probe-pong with r_npaths. After that, if rcvr is the + * smaller ip addr, it starts rds_conn_path_connect_if_down on all + * mprds_paths. + * 3. 
sender gets woken up, and can move to rds_conn_path_connect_if_down. + * If it is the smaller ipaddr, rds_conn_path_connect_if_down can be + * called after reception of the probe-pong on all mprds_paths. + * Otherwise (sender of probe-ping is not the smaller ip addr): just call + * rds_conn_path_connect_if_down on the hashed path. (see rule 4) + * 4. rds_connect_worker must only trigger a connection if laddr < faddr. + * 5. sender may end up queuing the packet on the cp. will get sent out later. + * when connection is completed. + */ +static void rds_start_mprds(struct rds_connection *conn) +{ + int i; + struct rds_conn_path *cp; + + if (conn->c_npaths > 1 && + IS_CANONICAL(conn->c_laddr, conn->c_faddr)) { + for (i = 0; i < conn->c_npaths; i++) { + cp = &conn->c_path[i]; + rds_conn_path_connect_if_down(cp); + } + } +} + +/* + * The transport must make sure that this is serialized against other + * rx and conn reset on this specific conn. + * + * We currently assert that only one fragmented message will be sent + * down a connection at a time. This lets us reassemble in the conn + * instead of per-flow which means that we don't have to go digging through + * flows to tear down partial reassembly progress on conn failure and + * we save flow lookup and locking for each frag arrival. It does mean + * that small messages will wait behind large ones. Fragmenting at all + * is only to reduce the memory consumption of pre-posted buffers. + * + * The caller passes in saddr and daddr instead of us getting it from the + * conn. This lets loopback, who only has one conn for both directions, + * tell us which roles the addrs in the conn are playing for this message. + */ +void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, + struct rds_incoming *inc, gfp_t gfp) +{ + struct rds_sock *rs = NULL; + struct sock *sk; + unsigned long flags; + struct rds_conn_path *cp; + + inc->i_conn = conn; + inc->i_rx_jiffies = jiffies; + if (conn->c_trans->t_mp_capable) + cp = inc->i_conn_path; + else + cp = &conn->c_path[0]; + + rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " + "flags 0x%x rx_jiffies %lu\n", conn, + (unsigned long long)cp->cp_next_rx_seq, + inc, + (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), + be32_to_cpu(inc->i_hdr.h_len), + be16_to_cpu(inc->i_hdr.h_sport), + be16_to_cpu(inc->i_hdr.h_dport), + inc->i_hdr.h_flags, + inc->i_rx_jiffies); + + /* + * Sequence numbers should only increase. Messages get their + * sequence number as they're queued in a sending conn. They + * can be dropped, though, if the sending socket is closed before + * they hit the wire. So sequence numbers can skip forward + * under normal operation. They can also drop back in the conn + * failover case as previously sent messages are resent down the + * new instance of a conn. We drop those, otherwise we have + * to assume that the next valid seq does not come after a + * hole in the fragment stream. + * + * The headers don't give us a way to realize if fragments of + * a message have been dropped. We assume that frags that arrive + * to a flow are part of the current message on the flow that is + * being reassembled. This means that senders can't drop messages + * from the sending conn until all their frags are sent. + * + * XXX we could spend more on the wire to get more robust failure + * detection, arguably worth it to avoid data corruption. 
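 * Editorial worked example of the check that follows (not part of the
 * patch), with cp_next_rx_seq == 10:
 *
 *   h_sequence == 12, any flags          -> accept, next_rx_seq becomes 13
 *   h_sequence ==  7, RETRANSMITTED set  -> drop as a stale retransmit
 *   h_sequence ==  7, flag clear         -> accept, next_rx_seq becomes 8
 *                                           (fresh conn instance after failover)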
+ */ + if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq && + (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { + rds_stats_inc(s_recv_drop_old_seq); + goto out; + } + cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; + + if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { + if (inc->i_hdr.h_sport == 0) { + rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr); + goto out; + } + rds_stats_inc(s_recv_ping); + rds_send_pong(cp, inc->i_hdr.h_sport); + /* if this is a handshake ping, start multipath if necessary */ + if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport), + be16_to_cpu(inc->i_hdr.h_dport))) { + rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); + rds_start_mprds(cp->cp_conn); + } + goto out; + } + + if (be16_to_cpu(inc->i_hdr.h_dport) == RDS_FLAG_PROBE_PORT && + inc->i_hdr.h_sport == 0) { + rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); + /* if this is a handshake pong, start multipath if necessary */ + rds_start_mprds(cp->cp_conn); + wake_up(&cp->cp_conn->c_hs_waitq); + goto out; + } + + rs = rds_find_bound(daddr, inc->i_hdr.h_dport); + if (!rs) { + rds_stats_inc(s_recv_drop_no_sock); + goto out; + } + + /* Process extension headers */ + rds_recv_incoming_exthdrs(inc, rs); + + /* We can be racing with rds_release() which marks the socket dead. */ + sk = rds_rs_to_sk(rs); + + /* serialize with rds_release -> sock_orphan */ + write_lock_irqsave(&rs->rs_recv_lock, flags); + if (!sock_flag(sk, SOCK_DEAD)) { + rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs); + rds_stats_inc(s_recv_queued); + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + if (sock_flag(sk, SOCK_RCVTSTAMP)) + do_gettimeofday(&inc->i_rx_tstamp); + rds_inc_addref(inc); + inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); + list_add_tail(&inc->i_item, &rs->rs_recv_queue); + __rds_wake_sk_sleep(sk); + } else { + rds_stats_inc(s_recv_drop_dead_sock); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + +out: + if (rs) + rds_sock_put(rs); +} +EXPORT_SYMBOL_GPL(rds_recv_incoming); + +/* + * be very careful here. This is being called as the condition in + * wait_event_*() needs to cope with being called many times. + */ +static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) +{ + unsigned long flags; + + if (!*inc) { + read_lock_irqsave(&rs->rs_recv_lock, flags); + if (!list_empty(&rs->rs_recv_queue)) { + *inc = list_entry(rs->rs_recv_queue.next, + struct rds_incoming, + i_item); + rds_inc_addref(*inc); + } + read_unlock_irqrestore(&rs->rs_recv_lock, flags); + } + + return *inc != NULL; +} + +static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, + int drop) +{ + struct sock *sk = rds_rs_to_sk(rs); + int ret = 0; + unsigned long flags; + + write_lock_irqsave(&rs->rs_recv_lock, flags); + if (!list_empty(&inc->i_item)) { + ret = 1; + if (drop) { + /* XXX make sure this i_conn is reliable */ + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + -be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + list_del_init(&inc->i_item); + rds_inc_put(inc); + } + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + + rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); + return ret; +} + +/* + * Pull errors off the error queue. + * If msghdr is NULL, we will just purge the error queue. 
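 * Editorial note, not part of the patch: a compact summary of the
 * port-based dispatch in rds_recv_incoming() above, with ports in host
 * byte order and rds_sysctl_ping_enable set:
 *
 *   dport == 0, sport == 0 -> ignored
 *   dport == 0, sport != 0 -> ping: answer with a pong; if sport == 1 it
 *                             is a handshake probe, so parse the exthdrs
 *                             and start the extra mprds paths
 *   dport == 1, sport == 0 -> handshake pong: parse exthdrs, start paths,
 *                             wake the handshake waitqueue
 *   anything else          -> deliver to the socket bound to dport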
+ */ +int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) +{ + struct rds_notifier *notifier; + struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */ + unsigned int count = 0, max_messages = ~0U; + unsigned long flags; + LIST_HEAD(copy); + int err = 0; + + + /* put_cmsg copies to user space and thus may sleep. We can't do this + * with rs_lock held, so first grab as many notifications as we can stuff + * in the user provided cmsg buffer. We don't try to copy more, to avoid + * losing notifications - except when the buffer is so small that it wouldn't + * even hold a single notification. Then we give him as much of this single + * msg as we can squeeze in, and set MSG_CTRUNC. + */ + if (msghdr) { + max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg)); + if (!max_messages) + max_messages = 1; + } + + spin_lock_irqsave(&rs->rs_lock, flags); + while (!list_empty(&rs->rs_notify_queue) && count < max_messages) { + notifier = list_entry(rs->rs_notify_queue.next, + struct rds_notifier, n_list); + list_move(¬ifier->n_list, ©); + count++; + } + spin_unlock_irqrestore(&rs->rs_lock, flags); + + if (!count) + return 0; + + while (!list_empty(©)) { + notifier = list_entry(copy.next, struct rds_notifier, n_list); + + if (msghdr) { + cmsg.user_token = notifier->n_user_token; + cmsg.status = notifier->n_status; + + err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, + sizeof(cmsg), &cmsg); + if (err) + break; + } + + list_del_init(¬ifier->n_list); + kfree(notifier); + } + + /* If we bailed out because of an error in put_cmsg, + * we may be left with one or more notifications that we + * didn't process. Return them to the head of the list. */ + if (!list_empty(©)) { + spin_lock_irqsave(&rs->rs_lock, flags); + list_splice(©, &rs->rs_notify_queue); + spin_unlock_irqrestore(&rs->rs_lock, flags); + } + + return err; +} + +/* + * Queue a congestion notification + */ +static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) +{ + uint64_t notify = rs->rs_cong_notify; + unsigned long flags; + int err; + + err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE, + sizeof(notify), ¬ify); + if (err) + return err; + + spin_lock_irqsave(&rs->rs_lock, flags); + rs->rs_cong_notify &= ~notify; + spin_unlock_irqrestore(&rs->rs_lock, flags); + + return 0; +} + +/* + * Receive any control messages. 
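Editorial note, not part of the patch: from the application side, the notification drain in rds_notify_queue_get() above is bounded by msg_controllen, so a receiver that wants several RDMA completion notifications per recvmsg() call sizes its control buffer accordingly. A hedged userspace sketch, assuming the usual SOL_RDS / RDS_CMSG_RDMA_STATUS / struct rds_rdma_notify definitions from the RDS uapi header:

#include <sys/socket.h>
#include <string.h>
#include <linux/rds.h>

static void example_drain_rdma_notifications(int fd)
{
	char cbuf[4 * CMSG_SPACE(sizeof(struct rds_rdma_notify))];
	struct msghdr msg;
	struct cmsghdr *c;

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	if (recvmsg(fd, &msg, 0) < 0)
		return;

	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
		if (c->cmsg_level == SOL_RDS &&
		    c->cmsg_type == RDS_CMSG_RDMA_STATUS) {
			struct rds_rdma_notify note;

			memcpy(&note, CMSG_DATA(c), sizeof(note));
			/* note.user_token / note.status describe one RDMA op */
		}
	}
}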
+ */ +static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, + struct rds_sock *rs) +{ + int ret = 0; + + if (inc->i_rdma_cookie) { + ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, + sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); + if (ret) + goto out; + } + + if ((inc->i_rx_tstamp.tv_sec != 0) && + sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { + ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, + sizeof(struct timeval), + &inc->i_rx_tstamp); + if (ret) + goto out; + } + + if (rs->rs_rx_traces) { + struct rds_cmsg_rx_trace t; + int i, j; + + memset(&t, 0, sizeof(t)); + inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock(); + t.rx_traces = rs->rs_rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { + j = rs->rs_rx_trace[i]; + t.rx_trace_pos[i] = j; + t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] - + inc->i_rx_lat_trace[j]; + } + + ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY, + sizeof(t), &t); + if (ret) + goto out; + } + +out: + return ret; +} + +static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) +{ + struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue; + struct rds_msg_zcopy_info *info = NULL; + struct rds_zcopy_cookies *done; + unsigned long flags; + + if (!msg->msg_control) + return false; + + if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || + msg->msg_controllen < CMSG_SPACE(sizeof(*done))) + return false; + + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&q->zcookie_head)) { + info = list_entry(q->zcookie_head.next, + struct rds_msg_zcopy_info, rs_zcookie_next); + list_del(&info->rs_zcookie_next); + } + spin_unlock_irqrestore(&q->lock, flags); + if (!info) + return false; + done = &info->zcookies; + if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), + done)) { + spin_lock_irqsave(&q->lock, flags); + list_add(&info->rs_zcookie_next, &q->zcookie_head); + spin_unlock_irqrestore(&q->lock, flags); + return false; + } + kfree(info); + return true; +} + +int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int msg_flags) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + long timeo; + int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); + struct rds_incoming *inc = NULL; + + /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */ + timeo = sock_rcvtimeo(sk, nonblock); + + rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); + + if (msg_flags & MSG_OOB) + goto out; + if (msg_flags & MSG_ERRQUEUE) + return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR); + + while (1) { + /* If there are pending notifications, do those - and nothing else */ + if (!list_empty(&rs->rs_notify_queue)) { + ret = rds_notify_queue_get(rs, msg); + break; + } + + if (rs->rs_cong_notify) { + ret = rds_notify_cong(rs, msg); + break; + } + + if (!rds_next_incoming(rs, &inc)) { + if (nonblock) { + bool reaped = rds_recvmsg_zcookie(rs, msg); + + ret = reaped ? 
0 : -EAGAIN; + break; + } + + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), + (!list_empty(&rs->rs_notify_queue) || + rs->rs_cong_notify || + rds_next_incoming(rs, &inc)), timeo); + rdsdebug("recvmsg woke inc %p timeo %ld\n", inc, + timeo); + if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) + continue; + + ret = timeo; + if (ret == 0) + ret = -ETIMEDOUT; + break; + } + + rdsdebug("copying inc %p from %pI4:%u to user\n", inc, + &inc->i_conn->c_faddr, + ntohs(inc->i_hdr.h_sport)); + ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); + if (ret < 0) + break; + + /* + * if the message we just copied isn't at the head of the + * recv queue then someone else raced us to return it, try + * to get the next message. + */ + if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) { + rds_inc_put(inc); + inc = NULL; + rds_stats_inc(s_recv_deliver_raced); + iov_iter_revert(&msg->msg_iter, ret); + continue; + } + + if (ret < be32_to_cpu(inc->i_hdr.h_len)) { + if (msg_flags & MSG_TRUNC) + ret = be32_to_cpu(inc->i_hdr.h_len); + msg->msg_flags |= MSG_TRUNC; + } + + if (rds_cmsg_recv(inc, msg, rs)) { + ret = -EFAULT; + goto out; + } + rds_recvmsg_zcookie(rs, msg); + + rds_stats_inc(s_recv_delivered); + + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = inc->i_saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + msg->msg_namelen = sizeof(*sin); + } + break; + } + + if (inc) + rds_inc_put(inc); + +out: + return ret; +} + +/* + * The socket is being shut down and we're asked to drop messages that were + * queued for recvmsg. The caller has unbound the socket so the receive path + * won't queue any more incoming fragments or messages on the socket. + */ +void rds_clear_recv_queue(struct rds_sock *rs) +{ + struct sock *sk = rds_rs_to_sk(rs); + struct rds_incoming *inc, *tmp; + unsigned long flags; + + write_lock_irqsave(&rs->rs_recv_lock, flags); + list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { + rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, + -be32_to_cpu(inc->i_hdr.h_len), + inc->i_hdr.h_dport); + list_del_init(&inc->i_item); + rds_inc_put(inc); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); +} + +/* + * inc->i_saddr isn't used here because it is only set in the receive + * path. + */ +void rds_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + __be32 saddr, __be32 daddr, int flip) +{ + struct rds_info_message minfo; + + minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); + minfo.len = be32_to_cpu(inc->i_hdr.h_len); + + if (flip) { + minfo.laddr = daddr; + minfo.faddr = saddr; + minfo.lport = inc->i_hdr.h_dport; + minfo.fport = inc->i_hdr.h_sport; + } else { + minfo.laddr = saddr; + minfo.faddr = daddr; + minfo.lport = inc->i_hdr.h_sport; + minfo.fport = inc->i_hdr.h_dport; + } + + minfo.flags = 0; + + rds_info_copy(iter, &minfo, sizeof(minfo)); +} diff --git a/net/rds/send.c b/net/rds/send.c new file mode 100644 index 0000000..94c7f74 --- /dev/null +++ b/net/rds/send.c @@ -0,0 +1,1367 @@ +/* + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" + +/* When transmitting messages in rds_send_xmit, we need to emerge from + * time to time and briefly release the CPU. Otherwise the softlock watchdog + * will kick our shin. + * Also, it seems fairer to not let one busy connection stall all the + * others. + * + * send_batch_count is the number of times we'll loop in send_xmit. Setting + * it to 0 will restore the old behavior (where we looped until we had + * drained the queue). + */ +static int send_batch_count = SZ_1K; +module_param(send_batch_count, int, 0444); +MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); + +static void rds_send_remove_from_sock(struct list_head *messages, int status); + +/* + * Reset the send state. Callers must ensure that this doesn't race with + * rds_send_xmit(). + */ +void rds_send_path_reset(struct rds_conn_path *cp) +{ + struct rds_message *rm, *tmp; + unsigned long flags; + + if (cp->cp_xmit_rm) { + rm = cp->cp_xmit_rm; + cp->cp_xmit_rm = NULL; + /* Tell the user the RDMA op is no longer mapped by the + * transport. 
This isn't entirely true (it's flushed out + * independently) but as the connection is down, there's + * no ongoing RDMA to/from that memory */ + rds_message_unmapped(rm); + rds_message_put(rm); + } + + cp->cp_xmit_sg = 0; + cp->cp_xmit_hdr_off = 0; + cp->cp_xmit_data_off = 0; + cp->cp_xmit_atomic_sent = 0; + cp->cp_xmit_rdma_sent = 0; + cp->cp_xmit_data_sent = 0; + + cp->cp_conn->c_map_queued = 0; + + cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; + cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; + + /* Mark messages as retransmissions, and move them to the send q */ + spin_lock_irqsave(&cp->cp_lock, flags); + list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { + set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); + } + list_splice_init(&cp->cp_retrans, &cp->cp_send_queue); + spin_unlock_irqrestore(&cp->cp_lock, flags); +} +EXPORT_SYMBOL_GPL(rds_send_path_reset); + +static int acquire_in_xmit(struct rds_conn_path *cp) +{ + return test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags) == 0; +} + +static void release_in_xmit(struct rds_conn_path *cp) +{ + clear_bit(RDS_IN_XMIT, &cp->cp_flags); + smp_mb__after_atomic(); + /* + * We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&cp->cp_waitq)) + wake_up_all(&cp->cp_waitq); +} + +/* + * We're making the conscious trade-off here to only send one message + * down the connection at a time. + * Pro: + * - tx queueing is a simple fifo list + * - reassembly is optional and easily done by transports per conn + * - no per flow rx lookup at all, straight to the socket + * - less per-frag memory and wire overhead + * Con: + * - queued acks can be delayed behind large messages + * Depends: + * - small message latency is higher behind queued large messages + * - large message latency isn't starved by intervening small sends + */ +int rds_send_xmit(struct rds_conn_path *cp) +{ + struct rds_connection *conn = cp->cp_conn; + struct rds_message *rm; + unsigned long flags; + unsigned int tmp; + struct scatterlist *sg; + int ret = 0; + LIST_HEAD(to_be_dropped); + int batch_count; + unsigned long send_gen = 0; + +restart: + batch_count = 0; + + /* + * sendmsg calls here after having queued its message on the send + * queue. We only have one task feeding the connection at a time. If + * another thread is already feeding the queue then we back off. This + * avoids blocking the caller and trading per-connection data between + * caches per message. + */ + if (!acquire_in_xmit(cp)) { + rds_stats_inc(s_send_lock_contention); + ret = -ENOMEM; + goto out; + } + + if (rds_destroy_pending(cp->cp_conn)) { + release_in_xmit(cp); + ret = -ENETUNREACH; /* dont requeue send work */ + goto out; + } + + /* + * we record the send generation after doing the xmit acquire. + * if someone else manages to jump in and do some work, we'll use + * this to avoid a goto restart farther down. + * + * The acquire_in_xmit() check above ensures that only one + * caller can increment c_send_gen at any time. + */ + send_gen = READ_ONCE(cp->cp_send_gen) + 1; + WRITE_ONCE(cp->cp_send_gen, send_gen); + + /* + * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, + * we do the opposite to avoid races. 
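+ *
+ * The shutdown side is expected to park on cp_waitq until the bit clears,
+ * which is why release_in_xmit() bothers with the waitqueue_active() check;
+ * rds_tcp_reset_callbacks() in tcp.c, for instance, waits with:
+ *
+ *	wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags));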
+ */ + if (!rds_conn_path_up(cp)) { + release_in_xmit(cp); + ret = 0; + goto out; + } + + if (conn->c_trans->xmit_path_prepare) + conn->c_trans->xmit_path_prepare(cp); + + /* + * spin trying to push headers and data down the connection until + * the connection doesn't make forward progress. + */ + while (1) { + + rm = cp->cp_xmit_rm; + + /* + * If between sending messages, we can send a pending congestion + * map update. + */ + if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { + rm = rds_cong_update_alloc(conn); + if (IS_ERR(rm)) { + ret = PTR_ERR(rm); + break; + } + rm->data.op_active = 1; + rm->m_inc.i_conn_path = cp; + rm->m_inc.i_conn = cp->cp_conn; + + cp->cp_xmit_rm = rm; + } + + /* + * If not already working on one, grab the next message. + * + * cp_xmit_rm holds a ref while we're sending this message down + * the connction. We can use this ref while holding the + * send_sem.. rds_send_reset() is serialized with it. + */ + if (!rm) { + unsigned int len; + + batch_count++; + + /* we want to process as big a batch as we can, but + * we also want to avoid softlockups. If we've been + * through a lot of messages, lets back off and see + * if anyone else jumps in + */ + if (batch_count >= send_batch_count) + goto over_batch; + + spin_lock_irqsave(&cp->cp_lock, flags); + + if (!list_empty(&cp->cp_send_queue)) { + rm = list_entry(cp->cp_send_queue.next, + struct rds_message, + m_conn_item); + rds_message_addref(rm); + + /* + * Move the message from the send queue to the retransmit + * list right away. + */ + list_move_tail(&rm->m_conn_item, + &cp->cp_retrans); + } + + spin_unlock_irqrestore(&cp->cp_lock, flags); + + if (!rm) + break; + + /* Unfortunately, the way Infiniband deals with + * RDMA to a bad MR key is by moving the entire + * queue pair to error state. We cold possibly + * recover from that, but right now we drop the + * connection. + * Therefore, we never retransmit messages with RDMA ops. + */ + if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) || + (rm->rdma.op_active && + test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) { + spin_lock_irqsave(&cp->cp_lock, flags); + if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) + list_move(&rm->m_conn_item, &to_be_dropped); + spin_unlock_irqrestore(&cp->cp_lock, flags); + continue; + } + + /* Require an ACK every once in a while */ + len = ntohl(rm->m_inc.i_hdr.h_len); + if (cp->cp_unacked_packets == 0 || + cp->cp_unacked_bytes < len) { + set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + + cp->cp_unacked_packets = + rds_sysctl_max_unacked_packets; + cp->cp_unacked_bytes = + rds_sysctl_max_unacked_bytes; + rds_stats_inc(s_send_ack_required); + } else { + cp->cp_unacked_bytes -= len; + cp->cp_unacked_packets--; + } + + cp->cp_xmit_rm = rm; + } + + /* The transport either sends the whole rdma or none of it */ + if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) { + rm->m_final_op = &rm->rdma; + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue + */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); + break; + } + cp->cp_xmit_rdma_sent = 1; + + } + + if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) { + rm->m_final_op = &rm->atomic; + /* The transport owns the mapped memory for now. 
+ * You can't unmap it while it's on the send queue + */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); + break; + } + cp->cp_xmit_atomic_sent = 1; + + } + + /* + * A number of cases require an RDS header to be sent + * even if there is no data. + * We permit 0-byte sends; rds-ping depends on this. + * However, if there are exclusively attached silent ops, + * we skip the hdr/data send, to enable silent operation. + */ + if (rm->data.op_nents == 0) { + int ops_present; + int all_ops_are_silent = 1; + + ops_present = (rm->atomic.op_active || rm->rdma.op_active); + if (rm->atomic.op_active && !rm->atomic.op_silent) + all_ops_are_silent = 0; + if (rm->rdma.op_active && !rm->rdma.op_silent) + all_ops_are_silent = 0; + + if (ops_present && all_ops_are_silent + && !rm->m_rdma_cookie) + rm->data.op_active = 0; + } + + if (rm->data.op_active && !cp->cp_xmit_data_sent) { + rm->m_final_op = &rm->data; + + ret = conn->c_trans->xmit(conn, rm, + cp->cp_xmit_hdr_off, + cp->cp_xmit_sg, + cp->cp_xmit_data_off); + if (ret <= 0) + break; + + if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) { + tmp = min_t(int, ret, + sizeof(struct rds_header) - + cp->cp_xmit_hdr_off); + cp->cp_xmit_hdr_off += tmp; + ret -= tmp; + } + + sg = &rm->data.op_sg[cp->cp_xmit_sg]; + while (ret) { + tmp = min_t(int, ret, sg->length - + cp->cp_xmit_data_off); + cp->cp_xmit_data_off += tmp; + ret -= tmp; + if (cp->cp_xmit_data_off == sg->length) { + cp->cp_xmit_data_off = 0; + sg++; + cp->cp_xmit_sg++; + BUG_ON(ret != 0 && cp->cp_xmit_sg == + rm->data.op_nents); + } + } + + if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) && + (cp->cp_xmit_sg == rm->data.op_nents)) + cp->cp_xmit_data_sent = 1; + } + + /* + * A rm will only take multiple times through this loop + * if there is a data op. Thus, if the data is sent (or there was + * none), then we're done with the rm. + */ + if (!rm->data.op_active || cp->cp_xmit_data_sent) { + cp->cp_xmit_rm = NULL; + cp->cp_xmit_sg = 0; + cp->cp_xmit_hdr_off = 0; + cp->cp_xmit_data_off = 0; + cp->cp_xmit_rdma_sent = 0; + cp->cp_xmit_atomic_sent = 0; + cp->cp_xmit_data_sent = 0; + + rds_message_put(rm); + } + } + +over_batch: + if (conn->c_trans->xmit_path_complete) + conn->c_trans->xmit_path_complete(cp); + release_in_xmit(cp); + + /* Nuke any messages we decided not to retransmit. */ + if (!list_empty(&to_be_dropped)) { + /* irqs on here, so we can put(), unlike above */ + list_for_each_entry(rm, &to_be_dropped, m_conn_item) + rds_message_put(rm); + rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); + } + + /* + * Other senders can queue a message after we last test the send queue + * but before we clear RDS_IN_XMIT. In that case they'd back off and + * not try and send their newly queued message. We need to check the + * send queue after having cleared RDS_IN_XMIT so that their message + * doesn't get stuck on the send queue. + * + * If the transport cannot continue (i.e ret != 0), then it must + * call us when more room is available, such as from the tx + * completion handler. 
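+ *
+ * (For the TCP transport that callback is assumed to be its write-space
+ * handler, which in effect just requeues the send worker, e.g.
+ * queue_delayed_work(rds_wq, &cp->cp_send_w, 0).)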
+ * + * We have an extra generation check here so that if someone manages + * to jump in after our release_in_xmit, we'll see that they have done + * some work and we will skip our goto + */ + if (ret == 0) { + bool raced; + + smp_mb(); + raced = send_gen != READ_ONCE(cp->cp_send_gen); + + if ((test_bit(0, &conn->c_map_queued) || + !list_empty(&cp->cp_send_queue)) && !raced) { + if (batch_count < send_batch_count) + goto restart; + rcu_read_lock(); + if (rds_destroy_pending(cp->cp_conn)) + ret = -ENETUNREACH; + else + queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + rcu_read_unlock(); + } else if (raced) { + rds_stats_inc(s_send_lock_queue_raced); + } + } +out: + return ret; +} +EXPORT_SYMBOL_GPL(rds_send_xmit); + +static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) +{ + u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + assert_spin_locked(&rs->rs_lock); + + BUG_ON(rs->rs_snd_bytes < len); + rs->rs_snd_bytes -= len; + + if (rs->rs_snd_bytes == 0) + rds_stats_inc(s_send_queue_empty); +} + +static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, + is_acked_func is_acked) +{ + if (is_acked) + return is_acked(rm, ack); + return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack; +} + +/* + * This is pretty similar to what happens below in the ACK + * handling code - except that we call here as soon as we get + * the IB send completion on the RDMA op and the accompanying + * message. + */ +void rds_rdma_send_complete(struct rds_message *rm, int status) +{ + struct rds_sock *rs = NULL; + struct rm_rdma_op *ro; + struct rds_notifier *notifier; + unsigned long flags; + unsigned int notify = 0; + + spin_lock_irqsave(&rm->m_rs_lock, flags); + + notify = rm->rdma.op_notify | rm->data.op_notify; + ro = &rm->rdma; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && + ro->op_active && notify && ro->op_notifier) { + notifier = ro->op_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); + + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ro->op_notifier = NULL; + } + + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} +EXPORT_SYMBOL_GPL(rds_rdma_send_complete); + +/* + * Just like above, except looks at atomic op + */ +void rds_atomic_send_complete(struct rds_message *rm, int status) +{ + struct rds_sock *rs = NULL; + struct rm_atomic_op *ao; + struct rds_notifier *notifier; + unsigned long flags; + + spin_lock_irqsave(&rm->m_rs_lock, flags); + + ao = &rm->atomic; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) + && ao->op_active && ao->op_notify && ao->op_notifier) { + notifier = ao->op_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); + + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ao->op_notifier = NULL; + } + + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} +EXPORT_SYMBOL_GPL(rds_atomic_send_complete); + +/* + * This is the same as rds_rdma_send_complete except we + * don't do any locking - we have all the ingredients (message, + * socket, socket lock) and can just move the notifier. 
+ */ +static inline void +__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) +{ + struct rm_rdma_op *ro; + struct rm_atomic_op *ao; + + ro = &rm->rdma; + if (ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_notifier->n_status = status; + list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); + ro->op_notifier = NULL; + } + + ao = &rm->atomic; + if (ao->op_active && ao->op_notify && ao->op_notifier) { + ao->op_notifier->n_status = status; + list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); + ao->op_notifier = NULL; + } + + /* No need to wake the app - caller does this */ +} + +/* + * This removes messages from the socket's list if they're on it. The list + * argument must be private to the caller, we must be able to modify it + * without locks. The messages must have a reference held for their + * position on the list. This function will drop that reference after + * removing the messages from the 'messages' list regardless of if it found + * the messages on the socket list or not. + */ +static void rds_send_remove_from_sock(struct list_head *messages, int status) +{ + unsigned long flags; + struct rds_sock *rs = NULL; + struct rds_message *rm; + + while (!list_empty(messages)) { + int was_on_sock = 0; + + rm = list_entry(messages->next, struct rds_message, + m_conn_item); + list_del_init(&rm->m_conn_item); + + /* + * If we see this flag cleared then we're *sure* that someone + * else beat us to removing it from the sock. If we race + * with their flag update we'll get the lock and then really + * see that the flag has been cleared. + * + * The message spinlock makes sure nobody clears rm->m_rs + * while we're messing with it. It does not prevent the + * message from being removed from the socket, though. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) + goto unlock_and_drop; + + if (rs != rm->m_rs) { + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } + rs = rm->m_rs; + if (rs) + sock_hold(rds_rs_to_sk(rs)); + } + if (!rs) + goto unlock_and_drop; + spin_lock(&rs->rs_lock); + + if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { + struct rm_rdma_op *ro = &rm->rdma; + struct rds_notifier *notifier; + + list_del_init(&rm->m_sock_item); + rds_send_sndbuf_remove(rs, rm); + + if (ro->op_active && ro->op_notifier && + (ro->op_notify || (ro->op_recverr && status))) { + notifier = ro->op_notifier; + list_add_tail(¬ifier->n_list, + &rs->rs_notify_queue); + if (!notifier->n_status) + notifier->n_status = status; + rm->rdma.op_notifier = NULL; + } + was_on_sock = 1; + } + spin_unlock(&rs->rs_lock); + +unlock_and_drop: + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + rds_message_put(rm); + if (was_on_sock) + rds_message_put(rm); + } + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} + +/* + * Transports call here when they've determined that the receiver queued + * messages up to, and including, the given sequence number. Messages are + * moved to the retrans queue when rds_send_xmit picks them off the send + * queue. This means that in the TCP case, the message may not have been + * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked + * checks the RDS_MSG_HAS_ACK_SEQ bit. 
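+ *
+ * A transport's ACK processing is expected to call in with something like
+ * (sketch from the TCP transport's write-space path, where tc is its
+ * per-path transport data):
+ *
+ *	rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked);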
+ */ +void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, + is_acked_func is_acked) +{ + struct rds_message *rm, *tmp; + unsigned long flags; + LIST_HEAD(list); + + spin_lock_irqsave(&cp->cp_lock, flags); + + list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { + if (!rds_send_is_acked(rm, ack, is_acked)) + break; + + list_move(&rm->m_conn_item, &list); + clear_bit(RDS_MSG_ON_CONN, &rm->m_flags); + } + + /* order flag updates with spin locks */ + if (!list_empty(&list)) + smp_mb__after_atomic(); + + spin_unlock_irqrestore(&cp->cp_lock, flags); + + /* now remove the messages from the sock list as needed */ + rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); +} +EXPORT_SYMBOL_GPL(rds_send_path_drop_acked); + +void rds_send_drop_acked(struct rds_connection *conn, u64 ack, + is_acked_func is_acked) +{ + WARN_ON(conn->c_trans->t_mp_capable); + rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked); +} +EXPORT_SYMBOL_GPL(rds_send_drop_acked); + +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) +{ + struct rds_message *rm, *tmp; + struct rds_connection *conn; + struct rds_conn_path *cp; + unsigned long flags; + LIST_HEAD(list); + + /* get all the messages we're dropping under the rs lock */ + spin_lock_irqsave(&rs->rs_lock, flags); + + list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { + if (dest && (dest->sin_addr.s_addr != rm->m_daddr || + dest->sin_port != rm->m_inc.i_hdr.h_dport)) + continue; + + list_move(&rm->m_sock_item, &list); + rds_send_sndbuf_remove(rs, rm); + clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); + } + + /* order flag updates with the rs lock */ + smp_mb__after_atomic(); + + spin_unlock_irqrestore(&rs->rs_lock, flags); + + if (list_empty(&list)) + return; + + /* Remove the messages from the conn */ + list_for_each_entry(rm, &list, m_sock_item) { + + conn = rm->m_inc.i_conn; + if (conn->c_trans->t_mp_capable) + cp = rm->m_inc.i_conn_path; + else + cp = &conn->c_path[0]; + + spin_lock_irqsave(&cp->cp_lock, flags); + /* + * Maybe someone else beat us to removing rm from the conn. + * If we race with their flag update we'll get the lock and + * then really see that the flag has been cleared. + */ + if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { + spin_unlock_irqrestore(&cp->cp_lock, flags); + continue; + } + list_del_init(&rm->m_conn_item); + spin_unlock_irqrestore(&cp->cp_lock, flags); + + /* + * Couldn't grab m_rs_lock in top loop (lock ordering), + * but we can now. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + rds_message_put(rm); + } + + rds_wake_sk_sleep(rs); + + while (!list_empty(&list)) { + rm = list_entry(list.next, struct rds_message, m_sock_item); + list_del_init(&rm->m_sock_item); + rds_message_wait(rm); + + /* just in case the code above skipped this message + * because RDS_MSG_ON_CONN wasn't set, run it again here + * taking m_rs_lock is the only thing that keeps us + * from racing with ack processing. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + rds_message_put(rm); + } +} + +/* + * we only want this to fire once so we use the callers 'queued'. It's + * possible that another thread can race with us and remove the + * message from the flow with RDS_CANCEL_SENT_TO. 
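+ *
+ * The caller (rds_sendmsg() below) loops on this until *queued is set,
+ * sleeping on sk_sleep(sk) between attempts while the sndbuf is full:
+ *
+ *	while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port,
+ *				  dport, &queued))
+ *		wait_event_interruptible_timeout(*sk_sleep(sk), ...);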
+ */ +static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, + struct rds_conn_path *cp, + struct rds_message *rm, __be16 sport, + __be16 dport, int *queued) +{ + unsigned long flags; + u32 len; + + if (*queued) + goto out; + + len = be32_to_cpu(rm->m_inc.i_hdr.h_len); + + /* this is the only place which holds both the socket's rs_lock + * and the connection's c_lock */ + spin_lock_irqsave(&rs->rs_lock, flags); + + /* + * If there is a little space in sndbuf, we don't queue anything, + * and userspace gets -EAGAIN. But poll() indicates there's send + * room. This can lead to bad behavior (spinning) if snd_bytes isn't + * freed up by incoming acks. So we check the *old* value of + * rs_snd_bytes here to allow the last msg to exceed the buffer, + * and poll() now knows no more data can be sent. + */ + if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) { + rs->rs_snd_bytes += len; + + /* let recv side know we are close to send space exhaustion. + * This is probably not the optimal way to do it, as this + * means we set the flag on *all* messages as soon as our + * throughput hits a certain threshold. + */ + if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2) + set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); + + list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); + set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); + rds_message_addref(rm); + sock_hold(rds_rs_to_sk(rs)); + rm->m_rs = rs; + + /* The code ordering is a little weird, but we're + trying to minimize the time we hold c_lock */ + rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0); + rm->m_inc.i_conn = conn; + rm->m_inc.i_conn_path = cp; + rds_message_addref(rm); + + spin_lock(&cp->cp_lock); + rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++); + list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); + set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + spin_unlock(&cp->cp_lock); + + rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", + rm, len, rs, rs->rs_snd_bytes, + (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence)); + + *queued = 1; + } + + spin_unlock_irqrestore(&rs->rs_lock, flags); +out: + return *queued; +} + +/* + * rds_message is getting to be quite complicated, and we'd like to allocate + * it all in one go. This figures out how big it needs to be up front. 
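+ *
+ * For example, a plain (non-zerocopy) data send with no control messages
+ * only needs scatterlist space for the payload pages, so the size works
+ * out to
+ *
+ *	ceil(payload_len, PAGE_SIZE) * sizeof(struct scatterlist)
+ *
+ * while an RDS_CMSG_RDMA_ARGS cmsg adds whatever rds_rdma_extra_size()
+ * reports and each atomic cmsg adds one more scatterlist entry.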
+ */ +static int rds_rm_size(struct msghdr *msg, int num_sgs) +{ + struct cmsghdr *cmsg; + int size = 0; + int cmsg_groups = 0; + int retval; + bool zcopy_cookie = false; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + cmsg_groups |= 1; + retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); + if (retval < 0) + return retval; + size += retval; + + break; + + case RDS_CMSG_ZCOPY_COOKIE: + zcopy_cookie = true; + /* fall through */ + + case RDS_CMSG_RDMA_DEST: + case RDS_CMSG_RDMA_MAP: + cmsg_groups |= 2; + /* these are valid but do no add any size */ + break; + + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + cmsg_groups |= 1; + size += sizeof(struct scatterlist); + break; + + default: + return -EINVAL; + } + + } + + if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie) + return -EINVAL; + + size += num_sgs * sizeof(struct scatterlist); + + /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ + if (cmsg_groups == 3) + return -EINVAL; + + return size; +} + +static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + u32 *cookie; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) || + !rm->data.op_mmp_znotifier) + return -EINVAL; + cookie = CMSG_DATA(cmsg); + rm->data.op_mmp_znotifier->z_cookie = *cookie; + return 0; +} + +static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, + struct msghdr *msg, int *allocated_mr) +{ + struct cmsghdr *cmsg; + int ret = 0; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + /* As a side effect, RDMA_DEST and RDMA_MAP will set + * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. + */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + ret = rds_cmsg_rdma_args(rs, rm, cmsg); + break; + + case RDS_CMSG_RDMA_DEST: + ret = rds_cmsg_rdma_dest(rs, rm, cmsg); + break; + + case RDS_CMSG_RDMA_MAP: + ret = rds_cmsg_rdma_map(rs, rm, cmsg); + if (!ret) + *allocated_mr = 1; + else if (ret == -ENODEV) + /* Accommodate the get_mr() case which can fail + * if connection isn't established yet. + */ + ret = -EAGAIN; + break; + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + ret = rds_cmsg_atomic(rs, rm, cmsg); + break; + + case RDS_CMSG_ZCOPY_COOKIE: + ret = rds_cmsg_zcopy(rs, rm, cmsg); + break; + + default: + return -EINVAL; + } + + if (ret) + break; + } + + return ret; +} + +static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) +{ + int hash; + + if (conn->c_npaths == 0) + hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS); + else + hash = RDS_MPATH_HASH(rs, conn->c_npaths); + if (conn->c_npaths == 0 && hash != 0) { + rds_send_ping(conn, 0); + + /* The underlying connection is not up yet. Need to wait + * until it is up to be sure that the non-zero c_path can be + * used. But if we are interrupted, we have to use the zero + * c_path in case the connection ends up being non-MP capable. 
+ */ + if (conn->c_npaths == 0) + if (wait_event_interruptible(conn->c_hs_waitq, + conn->c_npaths != 0)) + hash = 0; + if (conn->c_npaths == 1) + hash = 0; + } + return hash; +} + +static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) +{ + struct rds_rdma_args *args; + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { + if (cmsg->cmsg_len < + CMSG_LEN(sizeof(struct rds_rdma_args))) + return -EINVAL; + args = CMSG_DATA(cmsg); + *rdma_bytes += args->remote_vec.bytes; + } + } + return 0; +} + +int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) +{ + struct sock *sk = sock->sk; + struct rds_sock *rs = rds_sk_to_rs(sk); + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); + __be32 daddr; + __be16 dport; + struct rds_message *rm = NULL; + struct rds_connection *conn; + int ret = 0; + int queued = 0, allocated_mr = 0; + int nonblock = msg->msg_flags & MSG_DONTWAIT; + long timeo = sock_sndtimeo(sk, nonblock); + struct rds_conn_path *cpath; + size_t total_payload_len = payload_len, rdma_payload_len = 0; + bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && + sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); + int num_sgs = ceil(payload_len, PAGE_SIZE); + + /* Mirror Linux UDP mirror of BSD error message compatibility */ + /* XXX: Perhaps MSG_MORE someday */ + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (msg->msg_namelen) { + /* XXX fail non-unicast destination IPs? */ + if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { + ret = -EINVAL; + goto out; + } + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + } else { + /* We only care about consistency with ->connect() */ + lock_sock(sk); + daddr = rs->rs_conn_addr; + dport = rs->rs_conn_port; + release_sock(sk); + } + + lock_sock(sk); + if (daddr == 0 || rs->rs_bound_addr == 0) { + release_sock(sk); + ret = -ENOTCONN; /* XXX not a great errno */ + goto out; + } + release_sock(sk); + + ret = rds_rdma_bytes(msg, &rdma_payload_len); + if (ret) + goto out; + + total_payload_len += rdma_payload_len; + if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) { + ret = -EMSGSIZE; + goto out; + } + + if (payload_len > rds_sk_sndbuf(rs)) { + ret = -EMSGSIZE; + goto out; + } + + if (zcopy) { + if (rs->rs_transport->t_type != RDS_TRANS_TCP) { + ret = -EOPNOTSUPP; + goto out; + } + num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); + } + /* size of rm including all sgs */ + ret = rds_rm_size(msg, num_sgs); + if (ret < 0) + goto out; + + rm = rds_message_alloc(ret, GFP_KERNEL); + if (!rm) { + ret = -ENOMEM; + goto out; + } + + /* Attach data to the rm */ + if (payload_len) { + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); + if (!rm->data.op_sg) { + ret = -ENOMEM; + goto out; + } + ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); + if (ret) + goto out; + } + rm->data.op_active = 1; + + rm->m_daddr = daddr; + + /* rds_conn_create has a spinlock that runs with IRQ off. + * Caching the conn in the socket helps a lot. 
*/ + if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) + conn = rs->rs_conn; + else { + conn = rds_conn_create_outgoing(sock_net(sock->sk), + rs->rs_bound_addr, daddr, + rs->rs_transport, + sock->sk->sk_allocation); + if (IS_ERR(conn)) { + ret = PTR_ERR(conn); + goto out; + } + rs->rs_conn = conn; + } + + /* Parse any control messages the user may have included. */ + ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); + if (ret) { + /* Trigger connection so that its ready for the next retry */ + if (ret == -EAGAIN) + rds_conn_connect_if_down(conn); + goto out; + } + + if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { + printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", + &rm->rdma, conn->c_trans->xmit_rdma); + ret = -EOPNOTSUPP; + goto out; + } + + if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { + printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", + &rm->atomic, conn->c_trans->xmit_atomic); + ret = -EOPNOTSUPP; + goto out; + } + + if (conn->c_trans->t_mp_capable) + cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)]; + else + cpath = &conn->c_path[0]; + + if (rds_destroy_pending(conn)) { + ret = -EAGAIN; + goto out; + } + + rds_conn_path_connect_if_down(cpath); + + ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); + if (ret) { + rs->rs_seen_congestion = 1; + goto out; + } + while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port, + dport, &queued)) { + rds_stats_inc(s_send_queue_full); + + if (nonblock) { + ret = -EAGAIN; + goto out; + } + + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), + rds_send_queue_rm(rs, conn, cpath, rm, + rs->rs_bound_port, + dport, + &queued), + timeo); + rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo); + if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) + continue; + + ret = timeo; + if (ret == 0) + ret = -ETIMEDOUT; + goto out; + } + + /* + * By now we've committed to the send. We reuse rds_send_worker() + * to retry sends in the rds thread if the transport asks us to. + */ + rds_stats_inc(s_send_queued); + + ret = rds_send_xmit(cpath); + if (ret == -ENOMEM || ret == -EAGAIN) { + ret = 0; + rcu_read_lock(); + if (rds_destroy_pending(cpath->cp_conn)) + ret = -ENETUNREACH; + else + queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); + rcu_read_unlock(); + } + if (ret) + goto out; + rds_message_put(rm); + return payload_len; + +out: + /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly. + * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN + * or in any other way, we need to destroy the MR again */ + if (allocated_mr) + rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1); + + if (rm) + rds_message_put(rm); + return ret; +} + +/* + * send out a probe. Can be shared by rds_send_ping, + * rds_send_pong, rds_send_hb. 
+ * rds_send_hb should use h_flags + * RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED + * or + * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED + */ +static int +rds_send_probe(struct rds_conn_path *cp, __be16 sport, + __be16 dport, u8 h_flags) +{ + struct rds_message *rm; + unsigned long flags; + int ret = 0; + + rm = rds_message_alloc(0, GFP_ATOMIC); + if (!rm) { + ret = -ENOMEM; + goto out; + } + + rm->m_daddr = cp->cp_conn->c_faddr; + rm->data.op_active = 1; + + rds_conn_path_connect_if_down(cp); + + ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL); + if (ret) + goto out; + + spin_lock_irqsave(&cp->cp_lock, flags); + list_add_tail(&rm->m_conn_item, &cp->cp_send_queue); + set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + rds_message_addref(rm); + rm->m_inc.i_conn = cp->cp_conn; + rm->m_inc.i_conn_path = cp; + + rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, + cp->cp_next_tx_seq); + rm->m_inc.i_hdr.h_flags |= h_flags; + cp->cp_next_tx_seq++; + + if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) && + cp->cp_conn->c_trans->t_mp_capable) { + u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); + u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); + + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_NPATHS, &npaths, + sizeof(npaths)); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_GEN_NUM, + &my_gen_num, + sizeof(u32)); + } + spin_unlock_irqrestore(&cp->cp_lock, flags); + + rds_stats_inc(s_send_queued); + rds_stats_inc(s_send_pong); + + /* schedule the send work on rds_wq */ + rcu_read_lock(); + if (!rds_destroy_pending(cp->cp_conn)) + queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + rcu_read_unlock(); + + rds_message_put(rm); + return 0; + +out: + if (rm) + rds_message_put(rm); + return ret; +} + +int +rds_send_pong(struct rds_conn_path *cp, __be16 dport) +{ + return rds_send_probe(cp, 0, dport, 0); +} + +void +rds_send_ping(struct rds_connection *conn, int cp_index) +{ + unsigned long flags; + struct rds_conn_path *cp = &conn->c_path[cp_index]; + + spin_lock_irqsave(&cp->cp_lock, flags); + if (conn->c_ping_triggered) { + spin_unlock_irqrestore(&cp->cp_lock, flags); + return; + } + conn->c_ping_triggered = 1; + spin_unlock_irqrestore(&cp->cp_lock, flags); + rds_send_probe(cp, cpu_to_be16(RDS_FLAG_PROBE_PORT), 0, 0); +} +EXPORT_SYMBOL_GPL(rds_send_ping); diff --git a/net/rds/stats.c b/net/rds/stats.c new file mode 100644 index 0000000..73be187 --- /dev/null +++ b/net/rds/stats.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" + +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); +EXPORT_PER_CPU_SYMBOL_GPL(rds_stats); + +/* :.,$s/unsigned long\>.*\= sizeof(ctr.name)); + strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); + ctr.name[sizeof(ctr.name) - 1] = '\0'; + ctr.value = values[i]; + + rds_info_copy(iter, &ctr, sizeof(ctr)); + } +} +EXPORT_SYMBOL_GPL(rds_stats_info_copy); + +/* + * This gives global counters across all the transports. The strings + * are copied in so that the tool doesn't need knowledge of the specific + * stats that we're exporting. Some are pretty implementation dependent + * and may change over time. That doesn't stop them from being useful. + * + * This is the only function in the chain that knows about the byte granular + * length in userspace. It converts it to number of stat entries that the + * rest of the functions operate in. + */ +static void rds_stats_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + unsigned int avail; + + avail = len / sizeof(struct rds_info_counter); + + if (avail < ARRAY_SIZE(rds_stat_names)) { + avail = 0; + goto trans; + } + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names, + ARRAY_SIZE(rds_stat_names)); + avail -= ARRAY_SIZE(rds_stat_names); + +trans: + lens->each = sizeof(struct rds_info_counter); + lens->nr = rds_trans_stats_info_copy(iter, avail) + + ARRAY_SIZE(rds_stat_names); +} + +void rds_stats_exit(void) +{ + rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); +} + +int rds_stats_init(void) +{ + rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); + return 0; +} diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c new file mode 100644 index 0000000..e381bbc --- /dev/null +++ b/net/rds/sysctl.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" + +static struct ctl_table_header *rds_sysctl_reg_table; + +static unsigned long rds_sysctl_reconnect_min = 1; +static unsigned long rds_sysctl_reconnect_max = ~0UL; + +unsigned long rds_sysctl_reconnect_min_jiffies; +unsigned long rds_sysctl_reconnect_max_jiffies = HZ; + +unsigned int rds_sysctl_max_unacked_packets = 8; +unsigned int rds_sysctl_max_unacked_bytes = (16 << 20); + +unsigned int rds_sysctl_ping_enable = 1; + +static struct ctl_table rds_sysctl_rds_table[] = { + { + .procname = "reconnect_min_delay_ms", + .data = &rds_sysctl_reconnect_min_jiffies, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = &rds_sysctl_reconnect_min, + .extra2 = &rds_sysctl_reconnect_max_jiffies, + }, + { + .procname = "reconnect_max_delay_ms", + .data = &rds_sysctl_reconnect_max_jiffies, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = &rds_sysctl_reconnect_min_jiffies, + .extra2 = &rds_sysctl_reconnect_max, + }, + { + .procname = "max_unacked_packets", + .data = &rds_sysctl_max_unacked_packets, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_unacked_bytes", + .data = &rds_sysctl_max_unacked_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ping_enable", + .data = &rds_sysctl_ping_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +void rds_sysctl_exit(void) +{ + unregister_net_sysctl_table(rds_sysctl_reg_table); +} + +int rds_sysctl_init(void) +{ + rds_sysctl_reconnect_min = msecs_to_jiffies(1); + rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; + + rds_sysctl_reg_table = + register_net_sysctl(&init_net, "net/rds", rds_sysctl_rds_table); + if (!rds_sysctl_reg_table) + return -ENOMEM; + return 0; +} diff --git a/net/rds/tcp.c b/net/rds/tcp.c new file mode 100644 index 0000000..351a284 --- /dev/null +++ b/net/rds/tcp.c @@ -0,0 +1,636 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +/* only for info exporting */ +static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); +static LIST_HEAD(rds_tcp_tc_list); +static unsigned int rds_tcp_tc_count; + +/* Track rds_tcp_connection structs so they can be cleaned up */ +static DEFINE_SPINLOCK(rds_tcp_conn_lock); +static LIST_HEAD(rds_tcp_conn_list); +static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); + +static struct kmem_cache *rds_tcp_conn_slab; + +static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *fpos); + +static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; +static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; + +static struct ctl_table rds_tcp_sysctl_table[] = { +#define RDS_TCP_SNDBUF 0 + { + .procname = "rds_tcp_sndbuf", + /* data is per-net pointer */ + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rds_tcp_skbuf_handler, + .extra1 = &rds_tcp_min_sndbuf, + }, +#define RDS_TCP_RCVBUF 1 + { + .procname = "rds_tcp_rcvbuf", + /* data is per-net pointer */ + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rds_tcp_skbuf_handler, + .extra1 = &rds_tcp_min_rcvbuf, + }, + { } +}; + +/* doing it this way avoids calling tcp_sk() */ +void rds_tcp_nonagle(struct socket *sock) +{ + int val = 1; + + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val, + sizeof(val)); +} + +u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) +{ + /* seq# of the last byte of data in tcp send buffer */ + return tcp_sk(tc->t_sock->sk)->write_seq; +} + +u32 rds_tcp_snd_una(struct rds_tcp_connection *tc) +{ + return tcp_sk(tc->t_sock->sk)->snd_una; +} + +void rds_tcp_restore_callbacks(struct socket *sock, + struct rds_tcp_connection *tc) +{ + rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc); + write_lock_bh(&sock->sk->sk_callback_lock); + + /* done under the callback_lock to serialize with write_space */ + spin_lock(&rds_tcp_tc_list_lock); + list_del_init(&tc->t_list_item); + rds_tcp_tc_count--; + spin_unlock(&rds_tcp_tc_list_lock); + + tc->t_sock = NULL; + + sock->sk->sk_write_space = tc->t_orig_write_space; + sock->sk->sk_data_ready = tc->t_orig_data_ready; + sock->sk->sk_state_change = tc->t_orig_state_change; + sock->sk->sk_user_data = NULL; + + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +/* + * rds_tcp_reset_callbacks() switches the to the new sock and + * returns the existing tc->t_sock. + * + * The only functions that set tc->t_sock are rds_tcp_set_callbacks + * and rds_tcp_reset_callbacks. Send and receive trust that + * it is set. The absence of RDS_CONN_UP bit protects those paths + * from being called while it isn't set. + */ +void rds_tcp_reset_callbacks(struct socket *sock, + struct rds_conn_path *cp) +{ + struct rds_tcp_connection *tc = cp->cp_transport_data; + struct socket *osock = tc->t_sock; + + if (!osock) + goto newsock; + + /* Need to resolve a duelling SYN between peers. 
+ * We have an outstanding SYN to this peer, which may + * potentially have transitioned to the RDS_CONN_UP state, + * so we must quiesce any send threads before resetting + * cp_transport_data. We quiesce these threads by setting + * cp_state to something other than RDS_CONN_UP, and then + * waiting for any existing threads in rds_send_xmit to + * complete release_in_xmit(). (Subsequent threads entering + * rds_send_xmit() will bail on !rds_conn_up(). + * + * However an incoming syn-ack at this point would end up + * marking the conn as RDS_CONN_UP, and would again permit + * rds_send_xmi() threads through, so ideally we would + * synchronize on RDS_CONN_UP after lock_sock(), but cannot + * do that: waiting on !RDS_IN_XMIT after lock_sock() may + * end up deadlocking with tcp_sendmsg(), and the RDS_IN_XMIT + * would not get set. As a result, we set c_state to + * RDS_CONN_RESETTTING, to ensure that rds_tcp_state_change + * cannot mark rds_conn_path_up() in the window before lock_sock() + */ + atomic_set(&cp->cp_state, RDS_CONN_RESETTING); + wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); + lock_sock(osock->sk); + /* reset receive side state for rds_tcp_data_recv() for osock */ + cancel_delayed_work_sync(&cp->cp_send_w); + cancel_delayed_work_sync(&cp->cp_recv_w); + if (tc->t_tinc) { + rds_inc_put(&tc->t_tinc->ti_inc); + tc->t_tinc = NULL; + } + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; + rds_tcp_restore_callbacks(osock, tc); + release_sock(osock->sk); + sock_release(osock); +newsock: + rds_send_path_reset(cp); + lock_sock(sock->sk); + rds_tcp_set_callbacks(sock, cp); + release_sock(sock->sk); +} + +/* Add tc to rds_tcp_tc_list and set tc->t_sock. See comments + * above rds_tcp_reset_callbacks for notes about synchronization + * with data path + */ +void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) +{ + struct rds_tcp_connection *tc = cp->cp_transport_data; + + rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); + write_lock_bh(&sock->sk->sk_callback_lock); + + /* done under the callback_lock to serialize with write_space */ + spin_lock(&rds_tcp_tc_list_lock); + list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); + rds_tcp_tc_count++; + spin_unlock(&rds_tcp_tc_list_lock); + + /* accepted sockets need our listen data ready undone */ + if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready) + sock->sk->sk_data_ready = sock->sk->sk_user_data; + + tc->t_sock = sock; + tc->t_cpath = cp; + tc->t_orig_data_ready = sock->sk->sk_data_ready; + tc->t_orig_write_space = sock->sk->sk_write_space; + tc->t_orig_state_change = sock->sk->sk_state_change; + + sock->sk->sk_user_data = cp; + sock->sk->sk_data_ready = rds_tcp_data_ready; + sock->sk->sk_write_space = rds_tcp_write_space; + sock->sk->sk_state_change = rds_tcp_state_change; + + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_info_tcp_socket tsinfo; + struct rds_tcp_connection *tc; + unsigned long flags; + struct sockaddr_in sin; + struct socket *sock; + + spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); + + if (len / sizeof(tsinfo) < rds_tcp_tc_count) + goto out; + + list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + + sock = tc->t_sock; + if (sock) { + sock->ops->getname(sock, (struct sockaddr *)&sin, 0); + tsinfo.local_addr = sin.sin_addr.s_addr; + tsinfo.local_port = sin.sin_port; + 
sock->ops->getname(sock, (struct sockaddr *)&sin, 1); + tsinfo.peer_addr = sin.sin_addr.s_addr; + tsinfo.peer_port = sin.sin_port; + } + + tsinfo.hdr_rem = tc->t_tinc_hdr_rem; + tsinfo.data_rem = tc->t_tinc_data_rem; + tsinfo.last_sent_nxt = tc->t_last_sent_nxt; + tsinfo.last_expected_una = tc->t_last_expected_una; + tsinfo.last_seen_una = tc->t_last_seen_una; + + rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); + } + +out: + lens->nr = rds_tcp_tc_count; + lens->each = sizeof(tsinfo); + + spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); +} + +static int rds_tcp_laddr_check(struct net *net, __be32 addr) +{ + if (inet_addr_type(net, addr) == RTN_LOCAL) + return 0; + return -EADDRNOTAVAIL; +} + +static void rds_tcp_conn_free(void *arg) +{ + struct rds_tcp_connection *tc = arg; + unsigned long flags; + + rdsdebug("freeing tc %p\n", tc); + + spin_lock_irqsave(&rds_tcp_conn_lock, flags); + if (!tc->t_tcp_node_detached) + list_del(&tc->t_tcp_node); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); + + kmem_cache_free(rds_tcp_conn_slab, tc); +} + +static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) +{ + struct rds_tcp_connection *tc; + int i, j; + int ret = 0; + + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); + if (!tc) { + ret = -ENOMEM; + goto fail; + } + mutex_init(&tc->t_conn_path_lock); + tc->t_sock = NULL; + tc->t_tinc = NULL; + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; + + conn->c_path[i].cp_transport_data = tc; + tc->t_cpath = &conn->c_path[i]; + tc->t_tcp_node_detached = true; + + rdsdebug("rds_conn_path [%d] tc %p\n", i, + conn->c_path[i].cp_transport_data); + } + spin_lock_irq(&rds_tcp_conn_lock); + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + tc = conn->c_path[i].cp_transport_data; + tc->t_tcp_node_detached = false; + list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); + } + spin_unlock_irq(&rds_tcp_conn_lock); +fail: + if (ret) { + for (j = 0; j < i; j++) + rds_tcp_conn_free(conn->c_path[j].cp_transport_data); + } + return ret; +} + +static bool list_has_conn(struct list_head *list, struct rds_connection *conn) +{ + struct rds_tcp_connection *tc, *_tc; + + list_for_each_entry_safe(tc, _tc, list, t_tcp_node) { + if (tc->t_cpath->cp_conn == conn) + return true; + } + return false; +} + +static void rds_tcp_set_unloading(void) +{ + atomic_set(&rds_tcp_unloading, 1); +} + +static bool rds_tcp_is_unloading(struct rds_connection *conn) +{ + return atomic_read(&rds_tcp_unloading) != 0; +} + +static void rds_tcp_destroy_conns(void) +{ + struct rds_tcp_connection *tc, *_tc; + LIST_HEAD(tmp_list); + + /* avoid calling conn_destroy with irqs off */ + spin_lock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { + if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) + list_move_tail(&tc->t_tcp_node, &tmp_list); + } + spin_unlock_irq(&rds_tcp_conn_lock); + + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) + rds_conn_destroy(tc->t_cpath->cp_conn); +} + +static void rds_tcp_exit(void); + +struct rds_transport rds_tcp_transport = { + .laddr_check = rds_tcp_laddr_check, + .xmit_path_prepare = rds_tcp_xmit_path_prepare, + .xmit_path_complete = rds_tcp_xmit_path_complete, + .xmit = rds_tcp_xmit, + .recv_path = rds_tcp_recv_path, + .conn_alloc = rds_tcp_conn_alloc, + .conn_free = rds_tcp_conn_free, + .conn_path_connect = rds_tcp_conn_path_connect, + .conn_path_shutdown = rds_tcp_conn_path_shutdown, + .inc_copy_to_user = rds_tcp_inc_copy_to_user, + 
.inc_free = rds_tcp_inc_free, + .stats_info_copy = rds_tcp_stats_info_copy, + .exit = rds_tcp_exit, + .t_owner = THIS_MODULE, + .t_name = "tcp", + .t_type = RDS_TRANS_TCP, + .t_prefer_loopback = 1, + .t_mp_capable = 1, + .t_unloading = rds_tcp_is_unloading, +}; + +static unsigned int rds_tcp_netid; + +/* per-network namespace private data for this module */ +struct rds_tcp_net { + struct socket *rds_tcp_listen_sock; + struct work_struct rds_tcp_accept_w; + struct ctl_table_header *rds_tcp_sysctl; + struct ctl_table *ctl_table; + int sndbuf_size; + int rcvbuf_size; +}; + +/* All module specific customizations to the RDS-TCP socket should be done in + * rds_tcp_tune() and applied after socket creation. + */ +void rds_tcp_tune(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + rds_tcp_nonagle(sock); + lock_sock(sk); + if (rtn->sndbuf_size > 0) { + sk->sk_sndbuf = rtn->sndbuf_size; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } + if (rtn->rcvbuf_size > 0) { + sk->sk_sndbuf = rtn->rcvbuf_size; + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } + release_sock(sk); +} + +static void rds_tcp_accept_worker(struct work_struct *work) +{ + struct rds_tcp_net *rtn = container_of(work, + struct rds_tcp_net, + rds_tcp_accept_w); + + while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0) + cond_resched(); +} + +void rds_tcp_accept_work(struct sock *sk) +{ + struct net *net = sock_net(sk); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + queue_work(rds_wq, &rtn->rds_tcp_accept_w); +} + +static __net_init int rds_tcp_init_net(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct ctl_table *tbl; + int err = 0; + + memset(rtn, 0, sizeof(*rtn)); + + /* {snd, rcv}buf_size default to 0, which implies we let the + * stack pick the value, and permit auto-tuning of buffer size. 
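+ *
+ * Once the table below is registered the knobs are assumed to appear as
+ * /proc/sys/net/rds/tcp/rds_tcp_sndbuf and rds_tcp_rcvbuf; writing a value
+ * there makes rds_tcp_skbuf_handler() call rds_tcp_sysctl_reset(), so
+ * existing connections are dropped and reconnect with the new sizes.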
+ */ + if (net == &init_net) { + tbl = rds_tcp_sysctl_table; + } else { + tbl = kmemdup(rds_tcp_sysctl_table, + sizeof(rds_tcp_sysctl_table), GFP_KERNEL); + if (!tbl) { + pr_warn("could not set allocate syctl table\n"); + return -ENOMEM; + } + rtn->ctl_table = tbl; + } + tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size; + tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size; + rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl); + if (!rtn->rds_tcp_sysctl) { + pr_warn("could not register sysctl\n"); + err = -ENOMEM; + goto fail; + } + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); + if (!rtn->rds_tcp_listen_sock) { + pr_warn("could not set up listen sock\n"); + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + rtn->rds_tcp_sysctl = NULL; + err = -EAFNOSUPPORT; + goto fail; + } + INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); + return 0; + +fail: + if (net != &init_net) + kfree(tbl); + return err; +} + +static void rds_tcp_kill_sock(struct net *net) +{ + struct rds_tcp_connection *tc, *_tc; + LIST_HEAD(tmp_list); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct socket *lsock = rtn->rds_tcp_listen_sock; + + rtn->rds_tcp_listen_sock = NULL; + rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); + spin_lock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); + + if (net != c_net || !tc->t_sock) + continue; + if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) { + list_move_tail(&tc->t_tcp_node, &tmp_list); + } else { + list_del(&tc->t_tcp_node); + tc->t_tcp_node_detached = true; + } + } + spin_unlock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) + rds_conn_destroy(tc->t_cpath->cp_conn); +} + +static void __net_exit rds_tcp_exit_net(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + rds_tcp_kill_sock(net); + + if (rtn->rds_tcp_sysctl) + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + + if (net != &init_net && rtn->ctl_table) + kfree(rtn->ctl_table); +} + +static struct pernet_operations rds_tcp_net_ops = { + .init = rds_tcp_init_net, + .exit = rds_tcp_exit_net, + .id = &rds_tcp_netid, + .size = sizeof(struct rds_tcp_net), +}; + +void *rds_tcp_listen_sock_def_readable(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct socket *lsock = rtn->rds_tcp_listen_sock; + + if (!lsock) + return NULL; + + return lsock->sk->sk_user_data; +} + +/* when sysctl is used to modify some kernel socket parameters,this + * function resets the RDS connections in that netns so that we can + * restart with new parameters. The assumption is that such reset + * events are few and far-between. + */ +static void rds_tcp_sysctl_reset(struct net *net) +{ + struct rds_tcp_connection *tc, *_tc; + + spin_lock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { + struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); + + if (net != c_net || !tc->t_sock) + continue; + + /* reconnect with new parameters */ + rds_conn_path_drop(tc->t_cpath, false); + } + spin_unlock_irq(&rds_tcp_conn_lock); +} + +static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *fpos) +{ + struct net *net = current->nsproxy->net_ns; + int err; + + err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos); + if (err < 0) { + pr_warn("Invalid input. 
Must be >= %d\n", + *(int *)(ctl->extra1)); + return err; + } + if (write) + rds_tcp_sysctl_reset(net); + return 0; +} + +static void rds_tcp_exit(void) +{ + rds_tcp_set_unloading(); + synchronize_rcu(); + rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + unregister_pernet_device(&rds_tcp_net_ops); + rds_tcp_destroy_conns(); + rds_trans_unregister(&rds_tcp_transport); + rds_tcp_recv_exit(); + kmem_cache_destroy(rds_tcp_conn_slab); +} +module_exit(rds_tcp_exit); + +static int rds_tcp_init(void) +{ + int ret; + + rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", + sizeof(struct rds_tcp_connection), + 0, 0, NULL); + if (!rds_tcp_conn_slab) { + ret = -ENOMEM; + goto out; + } + + ret = rds_tcp_recv_init(); + if (ret) + goto out_slab; + + ret = register_pernet_device(&rds_tcp_net_ops); + if (ret) + goto out_recv; + + rds_trans_register(&rds_tcp_transport); + + rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + + goto out; +out_recv: + rds_tcp_recv_exit(); +out_slab: + kmem_cache_destroy(rds_tcp_conn_slab); +out: + return ret; +} +module_init(rds_tcp_init); + +MODULE_AUTHOR("Oracle Corporation "); +MODULE_DESCRIPTION("RDS: TCP transport"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/net/rds/tcp.h b/net/rds/tcp.h new file mode 100644 index 0000000..c6fa080 --- /dev/null +++ b/net/rds/tcp.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _RDS_TCP_H +#define _RDS_TCP_H + +#define RDS_TCP_PORT 16385 + +struct rds_tcp_incoming { + struct rds_incoming ti_inc; + struct sk_buff_head ti_skb_list; +}; + +struct rds_tcp_connection { + + struct list_head t_tcp_node; + bool t_tcp_node_detached; + struct rds_conn_path *t_cpath; + /* t_conn_path_lock synchronizes the connection establishment between + * rds_tcp_accept_one and rds_tcp_conn_path_connect + */ + struct mutex t_conn_path_lock; + struct socket *t_sock; + void *t_orig_write_space; + void *t_orig_data_ready; + void *t_orig_state_change; + + struct rds_tcp_incoming *t_tinc; + size_t t_tinc_hdr_rem; + size_t t_tinc_data_rem; + + /* XXX error report? 
*/ + struct work_struct t_conn_w; + struct work_struct t_send_w; + struct work_struct t_down_w; + struct work_struct t_recv_w; + + /* for info exporting only */ + struct list_head t_list_item; + u32 t_last_sent_nxt; + u32 t_last_expected_una; + u32 t_last_seen_una; +}; + +struct rds_tcp_statistics { + uint64_t s_tcp_data_ready_calls; + uint64_t s_tcp_write_space_calls; + uint64_t s_tcp_sndbuf_full; + uint64_t s_tcp_connect_raced; + uint64_t s_tcp_listen_closed_stale; +}; + +/* tcp.c */ +void rds_tcp_tune(struct socket *sock); +void rds_tcp_nonagle(struct socket *sock); +void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); +void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); +void rds_tcp_restore_callbacks(struct socket *sock, + struct rds_tcp_connection *tc); +u32 rds_tcp_write_seq(struct rds_tcp_connection *tc); +u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); +u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); +extern struct rds_transport rds_tcp_transport; +void rds_tcp_accept_work(struct sock *sk); + +/* tcp_connect.c */ +int rds_tcp_conn_path_connect(struct rds_conn_path *cp); +void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn); +void rds_tcp_state_change(struct sock *sk); + +/* tcp_listen.c */ +struct socket *rds_tcp_listen_init(struct net *); +void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); +void rds_tcp_listen_data_ready(struct sock *sk); +int rds_tcp_accept_one(struct socket *sock); +int rds_tcp_keepalive(struct socket *sock); +void *rds_tcp_listen_sock_def_readable(struct net *net); +void rds_tcp_set_linger(struct socket *sock); + +/* tcp_recv.c */ +int rds_tcp_recv_init(void); +void rds_tcp_recv_exit(void); +void rds_tcp_data_ready(struct sock *sk); +int rds_tcp_recv_path(struct rds_conn_path *cp); +void rds_tcp_inc_free(struct rds_incoming *inc); +int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); + +/* tcp_send.c */ +void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp); +void rds_tcp_xmit_path_complete(struct rds_conn_path *cp); +int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off); +void rds_tcp_write_space(struct sock *sk); + +/* tcp_stats.c */ +DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); +#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member) +unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail); + +#endif diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c new file mode 100644 index 0000000..d999e70 --- /dev/null +++ b/net/rds/tcp_connect.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +void rds_tcp_state_change(struct sock *sk) +{ + void (*state_change)(struct sock *sk); + struct rds_conn_path *cp; + struct rds_tcp_connection *tc; + + read_lock_bh(&sk->sk_callback_lock); + cp = sk->sk_user_data; + if (!cp) { + state_change = sk->sk_state_change; + goto out; + } + tc = cp->cp_transport_data; + state_change = tc->t_orig_state_change; + + rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); + + switch (sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + /* Force the peer to reconnect so that we have the + * TCP ports going from . to + * .. We avoid marking the + * RDS connection as RDS_CONN_UP until the reconnect, + * to avoid RDS datagram loss. + */ + if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) && + rds_conn_path_transition(cp, RDS_CONN_CONNECTING, + RDS_CONN_ERROR)) { + rds_conn_path_drop(cp, false); + } else { + rds_connect_path_complete(cp, RDS_CONN_CONNECTING); + } + break; + case TCP_CLOSE_WAIT: + case TCP_CLOSE: + rds_conn_path_drop(cp, false); + default: + break; + } +out: + read_unlock_bh(&sk->sk_callback_lock); + state_change(sk); +} + +int rds_tcp_conn_path_connect(struct rds_conn_path *cp) +{ + struct socket *sock = NULL; + struct sockaddr_in src, dest; + int ret; + struct rds_connection *conn = cp->cp_conn; + struct rds_tcp_connection *tc = cp->cp_transport_data; + + /* for multipath rds,we only trigger the connection after + * the handshake probe has determined the number of paths. 
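 *
 * Worked example (addresses hypothetical): for a connection between
 * 10.0.0.1 and 10.0.0.4, only the node with the numerically smaller
 * address keeps its outgoing connect; rds_tcp_state_change() above drops
 * a racing connect on the larger-address side, so the surviving TCP
 * 4-tuple always runs from the smaller address's ephemeral port to the
 * larger address's RDS_TCP_PORT. Path 0 may connect immediately and
 * carries the handshake probe; once the probe has established
 * c_npaths >= 2, the check just below lets paths 1..c_npaths-1 connect
 * as well.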
+ */ + if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2) + return -EAGAIN; + + mutex_lock(&tc->t_conn_path_lock); + + if (rds_conn_path_up(cp)) { + mutex_unlock(&tc->t_conn_path_lock); + return 0; + } + ret = sock_create_kern(rds_conn_net(conn), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) + goto out; + + rds_tcp_tune(sock); + + src.sin_family = AF_INET; + src.sin_addr.s_addr = (__force u32)conn->c_laddr; + src.sin_port = (__force u16)htons(0); + + ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); + if (ret) { + rdsdebug("bind failed with %d at address %pI4\n", + ret, &conn->c_laddr); + goto out; + } + + dest.sin_family = AF_INET; + dest.sin_addr.s_addr = (__force u32)conn->c_faddr; + dest.sin_port = (__force u16)htons(RDS_TCP_PORT); + + /* + * once we call connect() we can start getting callbacks and they + * own the socket + */ + rds_tcp_set_callbacks(sock, cp); + ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), + O_NONBLOCK); + + rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); + if (ret == -EINPROGRESS) + ret = 0; + if (ret == 0) { + rds_tcp_keepalive(sock); + sock = NULL; + } else { + rds_tcp_restore_callbacks(sock, cp->cp_transport_data); + } + +out: + mutex_unlock(&tc->t_conn_path_lock); + if (sock) + sock_release(sock); + return ret; +} + +/* + * Before killing the tcp socket this needs to serialize with callbacks. The + * caller has already grabbed the sending sem so we're serialized with other + * senders. + * + * TCP calls the callbacks with the sock lock so we hold it while we reset the + * callbacks to those set by TCP. Our callbacks won't execute again once we + * hold the sock lock. + */ +void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) +{ + struct rds_tcp_connection *tc = cp->cp_transport_data; + struct socket *sock = tc->t_sock; + + rdsdebug("shutting down conn %p tc %p sock %p\n", + cp->cp_conn, tc, sock); + + if (sock) { + if (rds_destroy_pending(cp->cp_conn)) + rds_tcp_set_linger(sock); + sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); + lock_sock(sock->sk); + rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ + + release_sock(sock->sk); + sock_release(sock); + } + + if (tc->t_tinc) { + rds_inc_put(&tc->t_tinc->ti_inc); + tc->t_tinc = NULL; + } + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; +} diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c new file mode 100644 index 0000000..2257118 --- /dev/null +++ b/net/rds/tcp_listen.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2006, 2018 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +int rds_tcp_keepalive(struct socket *sock) +{ + /* values below based on xs_udp_default_timeout */ + int keepidle = 5; /* send a probe 'keepidle' secs after last data */ + int keepcnt = 5; /* number of unack'ed probes before declaring dead */ + int keepalive = 1; + int ret = 0; + + ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&keepalive, sizeof(keepalive)); + if (ret < 0) + goto bail; + + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *)&keepcnt, sizeof(keepcnt)); + if (ret < 0) + goto bail; + + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, + (char *)&keepidle, sizeof(keepidle)); + if (ret < 0) + goto bail; + + /* KEEPINTVL is the interval between successive probes. We follow + * the model in xs_tcp_finish_connecting() and re-use keepidle. + */ + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *)&keepidle, sizeof(keepidle)); +bail: + return ret; +} + +/* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the + * client's ipaddr < server's ipaddr. Otherwise, close the accepted + * socket and force a reconneect from smaller -> larger ip addr. The reason + * we special case cp_index 0 is to allow the rds probe ping itself to itself + * get through efficiently. + * Since reconnects are only initiated from the node with the numerically + * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side + * by moving them to CONNECTING in this function. + */ +static +struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) +{ + int i; + bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr); + int npaths = max_t(int, 1, conn->c_npaths); + + /* for mprds, all paths MUST be initiated by the peer + * with the smaller address. + */ + if (!peer_is_smaller) { + /* Make sure we initiate at least one path if this + * has not already been done; rds_start_mprds() will + * take care of additional paths, if necessary. 
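 *
 * (The keepalive tuning in rds_tcp_keepalive() above is the standard
 * socket-option pattern; an equivalent userspace sketch, with a
 * hypothetical connected fd, would be:
 *
 *   int on = 1, cnt = 5, idle = 5;
 *   setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *   setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 *   setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *   setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &idle, sizeof(idle));
 *
 * i.e. start probing after 5 idle seconds, probe every 5 seconds, and
 * declare the peer dead after five unanswered probes.)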
+ */ + if (npaths == 1) + rds_conn_path_connect_if_down(&conn->c_path[0]); + return NULL; + } + + for (i = 0; i < npaths; i++) { + struct rds_conn_path *cp = &conn->c_path[i]; + + if (rds_conn_path_transition(cp, RDS_CONN_DOWN, + RDS_CONN_CONNECTING) || + rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_CONNECTING)) { + return cp->cp_transport_data; + } + } + return NULL; +} + +void rds_tcp_set_linger(struct socket *sock) +{ + struct linger no_linger = { + .l_onoff = 1, + .l_linger = 0, + }; + + kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&no_linger, sizeof(no_linger)); +} + +int rds_tcp_accept_one(struct socket *sock) +{ + struct socket *new_sock = NULL; + struct rds_connection *conn; + int ret; + struct inet_sock *inet; + struct rds_tcp_connection *rs_tcp = NULL; + int conn_state; + struct rds_conn_path *cp; + + if (!sock) /* module unload or netns delete in progress */ + return -ENETUNREACH; + + ret = sock_create_lite(sock->sk->sk_family, + sock->sk->sk_type, sock->sk->sk_protocol, + &new_sock); + if (ret) + goto out; + + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true); + if (ret < 0) + goto out; + + /* sock_create_lite() does not get a hold on the owner module so we + * need to do it here. Note that sock_release() uses sock->ops to + * determine if it needs to decrement the reference count. So set + * sock->ops after calling accept() in case that fails. And there's + * no need to do try_module_get() as the listener should have a hold + * already. + */ + new_sock->ops = sock->ops; + __module_get(new_sock->ops->owner); + + ret = rds_tcp_keepalive(new_sock); + if (ret < 0) + goto out; + + rds_tcp_tune(new_sock); + + inet = inet_sk(new_sock->sk); + + rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", + &inet->inet_saddr, ntohs(inet->inet_sport), + &inet->inet_daddr, ntohs(inet->inet_dport)); + + conn = rds_conn_create(sock_net(sock->sk), + inet->inet_saddr, inet->inet_daddr, + &rds_tcp_transport, GFP_KERNEL); + if (IS_ERR(conn)) { + ret = PTR_ERR(conn); + goto out; + } + /* An incoming SYN request came in, and TCP just accepted it. + * + * If the client reboots, this conn will need to be cleaned up. + * rds_tcp_state_change() will do that cleanup + */ + rs_tcp = rds_tcp_accept_one_path(conn); + if (!rs_tcp) + goto rst_nsk; + mutex_lock(&rs_tcp->t_conn_path_lock); + cp = rs_tcp->t_cpath; + conn_state = rds_conn_path_state(cp); + WARN_ON(conn_state == RDS_CONN_UP); + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) + goto rst_nsk; + if (rs_tcp->t_sock) { + /* Duelling SYN has been handled in rds_tcp_accept_one() */ + rds_tcp_reset_callbacks(new_sock, cp); + /* rds_connect_path_complete() marks RDS_CONN_UP */ + rds_connect_path_complete(cp, RDS_CONN_RESETTING); + } else { + rds_tcp_set_callbacks(new_sock, cp); + rds_connect_path_complete(cp, RDS_CONN_CONNECTING); + } + new_sock = NULL; + ret = 0; + if (conn->c_npaths == 0) + rds_send_ping(cp->cp_conn, cp->cp_index); + goto out; +rst_nsk: + /* reset the newly returned accept sock and bail. + * It is safe to set linger on new_sock because the RDS connection + * has not been brought up on new_sock, so no RDS-level data could + * be pending on it. By setting linger, we achieve the side-effect + * of avoiding TIME_WAIT state on new_sock. 
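 *
 * For illustration, the same trick in userspace (hypothetical fd) is:
 *
 *   struct linger lng = { .l_onoff = 1, .l_linger = 0 };
 *   setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng));
 *   close(fd);  // aborts with RST, so no TIME_WAIT is left behind
 *
 * which is what rds_tcp_set_linger() plus the shutdown/release below
 * achieve for the rejected socket.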
+ */ + rds_tcp_set_linger(new_sock); + kernel_sock_shutdown(new_sock, SHUT_RDWR); + ret = 0; +out: + if (rs_tcp) + mutex_unlock(&rs_tcp->t_conn_path_lock); + if (new_sock) + sock_release(new_sock); + return ret; +} + +void rds_tcp_listen_data_ready(struct sock *sk) +{ + void (*ready)(struct sock *sk); + + rdsdebug("listen data ready sk %p\n", sk); + + read_lock_bh(&sk->sk_callback_lock); + ready = sk->sk_user_data; + if (!ready) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + /* + * ->sk_data_ready is also called for a newly established child socket + * before it has been accepted and the accepter has set up their + * data_ready.. we only want to queue listen work for our listening + * socket + * + * (*ready)() may be null if we are racing with netns delete, and + * the listen socket is being torn down. + */ + if (sk->sk_state == TCP_LISTEN) + rds_tcp_accept_work(sk); + else + ready = rds_tcp_listen_sock_def_readable(sock_net(sk)); + +out: + read_unlock_bh(&sk->sk_callback_lock); + if (ready) + ready(sk); +} + +struct socket *rds_tcp_listen_init(struct net *net) +{ + struct sockaddr_in sin; + struct socket *sock = NULL; + int ret; + + ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) + goto out; + + sock->sk->sk_reuse = SK_CAN_REUSE; + rds_tcp_nonagle(sock); + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = rds_tcp_listen_data_ready; + write_unlock_bh(&sock->sk->sk_callback_lock); + + sin.sin_family = PF_INET; + sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); + sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + + ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + if (ret < 0) + goto out; + + ret = sock->ops->listen(sock, 64); + if (ret < 0) + goto out; + + return sock; +out: + if (sock) + sock_release(sock); + return NULL; +} + +void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor) +{ + struct sock *sk; + + if (!sock) + return; + + sk = sock->sk; + + /* serialize with and prevent further callbacks */ + lock_sock(sk); + write_lock_bh(&sk->sk_callback_lock); + if (sk->sk_user_data) { + sk->sk_data_ready = sk->sk_user_data; + sk->sk_user_data = NULL; + } + write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); + + /* wait for accepts to stop and close the socket */ + flush_workqueue(rds_wq); + flush_work(acceptor); + sock_release(sock); +} diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c new file mode 100644 index 0000000..b9fbd2e --- /dev/null +++ b/net/rds/tcp_recv.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +static struct kmem_cache *rds_tcp_incoming_slab; + +static void rds_tcp_inc_purge(struct rds_incoming *inc) +{ + struct rds_tcp_incoming *tinc; + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + rdsdebug("purging tinc %p inc %p\n", tinc, inc); + skb_queue_purge(&tinc->ti_skb_list); +} + +void rds_tcp_inc_free(struct rds_incoming *inc) +{ + struct rds_tcp_incoming *tinc; + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + rds_tcp_inc_purge(inc); + rdsdebug("freeing tinc %p inc %p\n", tinc, inc); + kmem_cache_free(rds_tcp_incoming_slab, tinc); +} + +/* + * this is pretty lame, but, whatever. + */ +int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) +{ + struct rds_tcp_incoming *tinc; + struct sk_buff *skb; + int ret = 0; + + if (!iov_iter_count(to)) + goto out; + + tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + + skb_queue_walk(&tinc->ti_skb_list, skb) { + unsigned long to_copy, skb_off; + for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) { + to_copy = iov_iter_count(to); + to_copy = min(to_copy, skb->len - skb_off); + + if (skb_copy_datagram_iter(skb, skb_off, to, to_copy)) + return -EFAULT; + + rds_stats_add(s_copy_to_user, to_copy); + ret += to_copy; + + if (!iov_iter_count(to)) + goto out; + } + } +out: + return ret; +} + +/* + * We have a series of skbs that have fragmented pieces of the congestion + * bitmap. They must add up to the exact size of the congestion bitmap. We + * use the skb helpers to copy those into the pages that make up the in-memory + * congestion bitmap for the remote address of this connection. We then tell + * the congestion core that the bitmap has been changed so that it can wake up + * sleepers. + * + * This is racing with sending paths which are using test_bit to see if the + * bitmap indicates that their recipient is congested. 
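 *
 * Concretely (sizes assumed from the generic RDS congestion map, one bit
 * per 16-bit port number): an update is RDS_CONG_MAP_BYTES = 65536/8 =
 * 8192 bytes, i.e. RDS_CONG_MAP_PAGES = 2 pages with 4 KB pages. The
 * loop below walks the skb list, advancing (map_page, map_off) and
 * wrapping map_off at PAGE_SIZE, then calls rds_cong_map_updated() with
 * an all-ones portmask so that every waiting sender re-checks its
 * destination port.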
+ */ + +static void rds_tcp_cong_recv(struct rds_connection *conn, + struct rds_tcp_incoming *tinc) +{ + struct sk_buff *skb; + unsigned int to_copy, skb_off; + unsigned int map_off; + unsigned int map_page; + struct rds_cong_map *map; + int ret; + + /* catch completely corrupt packets */ + if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) + return; + + map_page = 0; + map_off = 0; + map = conn->c_fcong; + + skb_queue_walk(&tinc->ti_skb_list, skb) { + skb_off = 0; + while (skb_off < skb->len) { + to_copy = min_t(unsigned int, PAGE_SIZE - map_off, + skb->len - skb_off); + + BUG_ON(map_page >= RDS_CONG_MAP_PAGES); + + /* only returns 0 or -error */ + ret = skb_copy_bits(skb, skb_off, + (void *)map->m_page_addrs[map_page] + map_off, + to_copy); + BUG_ON(ret != 0); + + skb_off += to_copy; + map_off += to_copy; + if (map_off == PAGE_SIZE) { + map_off = 0; + map_page++; + } + } + } + + rds_cong_map_updated(map, ~(u64) 0); +} + +struct rds_tcp_desc_arg { + struct rds_conn_path *conn_path; + gfp_t gfp; +}; + +static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct rds_tcp_desc_arg *arg = desc->arg.data; + struct rds_conn_path *cp = arg->conn_path; + struct rds_tcp_connection *tc = cp->cp_transport_data; + struct rds_tcp_incoming *tinc = tc->t_tinc; + struct sk_buff *clone; + size_t left = len, to_copy; + + rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, + len); + + /* + * tcp_read_sock() interprets partial progress as an indication to stop + * processing. + */ + while (left) { + if (!tinc) { + tinc = kmem_cache_alloc(rds_tcp_incoming_slab, + arg->gfp); + if (!tinc) { + desc->error = -ENOMEM; + goto out; + } + tc->t_tinc = tinc; + rdsdebug("alloced tinc %p\n", tinc); + rds_inc_path_init(&tinc->ti_inc, cp, + cp->cp_conn->c_faddr); + tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = + local_clock(); + + /* + * XXX * we might be able to use the __ variants when + * we've already serialized at a higher level. 
+ */ + skb_queue_head_init(&tinc->ti_skb_list); + } + + if (left && tc->t_tinc_hdr_rem) { + to_copy = min(tc->t_tinc_hdr_rem, left); + rdsdebug("copying %zu header from skb %p\n", to_copy, + skb); + skb_copy_bits(skb, offset, + (char *)&tinc->ti_inc.i_hdr + + sizeof(struct rds_header) - + tc->t_tinc_hdr_rem, + to_copy); + tc->t_tinc_hdr_rem -= to_copy; + left -= to_copy; + offset += to_copy; + + if (tc->t_tinc_hdr_rem == 0) { + /* could be 0 for a 0 len message */ + tc->t_tinc_data_rem = + be32_to_cpu(tinc->ti_inc.i_hdr.h_len); + tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] = + local_clock(); + } + } + + if (left && tc->t_tinc_data_rem) { + to_copy = min(tc->t_tinc_data_rem, left); + + clone = pskb_extract(skb, offset, to_copy, arg->gfp); + if (!clone) { + desc->error = -ENOMEM; + goto out; + } + + skb_queue_tail(&tinc->ti_skb_list, clone); + + rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " + "clone %p data %p len %d\n", + skb, skb->data, skb->len, offset, to_copy, + clone, clone->data, clone->len); + + tc->t_tinc_data_rem -= to_copy; + left -= to_copy; + offset += to_copy; + } + + if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { + struct rds_connection *conn = cp->cp_conn; + + if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_tcp_cong_recv(conn, tinc); + else + rds_recv_incoming(conn, conn->c_faddr, + conn->c_laddr, &tinc->ti_inc, + arg->gfp); + + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; + tc->t_tinc = NULL; + rds_inc_put(&tinc->ti_inc); + tinc = NULL; + } + } +out: + rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", + len, left, skb->len, + skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); + return len - left; +} + +/* the caller has to hold the sock lock */ +static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp) +{ + struct rds_tcp_connection *tc = cp->cp_transport_data; + struct socket *sock = tc->t_sock; + read_descriptor_t desc; + struct rds_tcp_desc_arg arg; + + /* It's like glib in the kernel! */ + arg.conn_path = cp; + arg.gfp = gfp; + desc.arg.data = &arg; + desc.error = 0; + desc.count = 1; /* give more than one skb per call */ + + tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); + rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, + desc.error); + + return desc.error; +} + +/* + * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from + * data_ready. + * + * if we fail to allocate we're in trouble.. blindly wait some time before + * trying again to see if the VM can free up something for us. 
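 *
 * (Worked example of the reassembly done in rds_tcp_data_recv() above,
 * assuming the usual 48-byte struct rds_header: a 4096-byte RDS message
 * arriving in 1000-byte TCP segments needs 48 + 4096 = 4144 bytes of
 * stream. Segment 1 fills the header (t_tinc_hdr_rem 48 -> 0) and 952
 * bytes of payload (t_tinc_data_rem 4096 -> 3144); segments 2-4 are
 * cloned in full; segment 5 supplies the last 144 bytes, both counters
 * reach zero, and the assembled rds_incoming is handed to
 * rds_recv_incoming().)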
+ */ +int rds_tcp_recv_path(struct rds_conn_path *cp) +{ + struct rds_tcp_connection *tc = cp->cp_transport_data; + struct socket *sock = tc->t_sock; + int ret = 0; + + rdsdebug("recv worker path [%d] tc %p sock %p\n", + cp->cp_index, tc, sock); + + lock_sock(sock->sk); + ret = rds_tcp_read_sock(cp, GFP_KERNEL); + release_sock(sock->sk); + + return ret; +} + +void rds_tcp_data_ready(struct sock *sk) +{ + void (*ready)(struct sock *sk); + struct rds_conn_path *cp; + struct rds_tcp_connection *tc; + + rdsdebug("data ready sk %p\n", sk); + + read_lock_bh(&sk->sk_callback_lock); + cp = sk->sk_user_data; + if (!cp) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + tc = cp->cp_transport_data; + ready = tc->t_orig_data_ready; + rds_tcp_stats_inc(s_tcp_data_ready_calls); + + if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) { + rcu_read_lock(); + if (!rds_destroy_pending(cp->cp_conn)) + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + rcu_read_unlock(); + } +out: + read_unlock_bh(&sk->sk_callback_lock); + ready(sk); +} + +int rds_tcp_recv_init(void) +{ + rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", + sizeof(struct rds_tcp_incoming), + 0, 0, NULL); + if (!rds_tcp_incoming_slab) + return -ENOMEM; + return 0; +} + +void rds_tcp_recv_exit(void) +{ + kmem_cache_destroy(rds_tcp_incoming_slab); +} diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c new file mode 100644 index 0000000..7df869d --- /dev/null +++ b/net/rds/tcp_send.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include +#include +#include + +#include "rds_single_path.h" +#include "rds.h" +#include "tcp.h" + +static void rds_tcp_cork(struct socket *sock, int val) +{ + kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val)); +} + +void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) +{ + struct rds_tcp_connection *tc = cp->cp_transport_data; + + rds_tcp_cork(tc->t_sock, 1); +} + +void rds_tcp_xmit_path_complete(struct rds_conn_path *cp) +{ + struct rds_tcp_connection *tc = cp->cp_transport_data; + + rds_tcp_cork(tc->t_sock, 0); +} + +/* the core send_sem serializes this with other xmit and shutdown */ +static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) +{ + struct kvec vec = { + .iov_base = data, + .iov_len = len, + }; + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL, + }; + + return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len); +} + +/* the core send_sem serializes this with other xmit and shutdown */ +int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, + unsigned int hdr_off, unsigned int sg, unsigned int off) +{ + struct rds_conn_path *cp = rm->m_inc.i_conn_path; + struct rds_tcp_connection *tc = cp->cp_transport_data; + int done = 0; + int ret = 0; + int more; + + if (hdr_off == 0) { + /* + * m_ack_seq is set to the sequence number of the last byte of + * header and data. see rds_tcp_is_acked(). + */ + tc->t_last_sent_nxt = rds_tcp_write_seq(tc); + rm->m_ack_seq = tc->t_last_sent_nxt + + sizeof(struct rds_header) + + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; + smp_mb__before_atomic(); + set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); + tc->t_last_expected_una = rm->m_ack_seq + 1; + + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + + rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", + rm, rds_tcp_write_seq(tc), + (unsigned long long)rm->m_ack_seq); + } + + if (hdr_off < sizeof(struct rds_header)) { + /* see rds_tcp_write_space() */ + set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags); + + ret = rds_tcp_sendmsg(tc->t_sock, + (void *)&rm->m_inc.i_hdr + hdr_off, + sizeof(rm->m_inc.i_hdr) - hdr_off); + if (ret < 0) + goto out; + done += ret; + if (hdr_off + done != sizeof(struct rds_header)) + goto out; + } + + more = rm->data.op_nents > 1 ? (MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0; + while (sg < rm->data.op_nents) { + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; + + ret = tc->t_sock->ops->sendpage(tc->t_sock, + sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, + rm->data.op_sg[sg].length - off, + flags); + rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, + ret); + if (ret <= 0) + break; + + off += ret; + done += ret; + if (off == rm->data.op_sg[sg].length) { + off = 0; + sg++; + } + if (sg == rm->data.op_nents - 1) + more = 0; + } + +out: + if (ret <= 0) { + /* write_space will hit after EAGAIN, all else fatal */ + if (ret == -EAGAIN) { + rds_tcp_stats_inc(s_tcp_sndbuf_full); + ret = 0; + } else { + /* No need to disconnect/reconnect if path_drop + * has already been triggered, because, e.g., of + * an incoming RST. 
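 *
 * (Worked example of the ack bookkeeping at the top of rds_tcp_xmit(),
 * taking sizeof(struct rds_header) as 48: with write_seq 1000 and
 * h_len 100, m_ack_seq = 1000 + 48 + 100 - 1 = 1147, the sequence number
 * of the last byte of the message, and t_last_expected_una = 1148.
 * rds_tcp_is_acked() below then reports the message acked once
 * (__s32)((u32)1147 - (u32)snd_una) < 0, i.e. snd_una >= 1148; the
 * signed-difference form stays correct across 32-bit sequence wrap,
 * e.g. m_ack_seq 0xfffffff0 is still "before" snd_una 0x00000010.)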
+ */ + if (rds_conn_path_up(cp)) { + pr_warn("RDS/tcp: send to %pI4 on cp [%d]" + "returned %d, " + "disconnecting and reconnecting\n", + &conn->c_faddr, cp->cp_index, ret); + rds_conn_path_drop(cp, false); + } + } + } + if (done == 0) + done = ret; + return done; +} + +/* + * rm->m_ack_seq is set to the tcp sequence number that corresponds to the + * last byte of the message, including the header. This means that the + * entire message has been received if rm->m_ack_seq is "before" the next + * unacked byte of the TCP sequence space. We have to do very careful + * wrapping 32bit comparisons here. + */ +static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) +{ + if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags)) + return 0; + return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0; +} + +void rds_tcp_write_space(struct sock *sk) +{ + void (*write_space)(struct sock *sk); + struct rds_conn_path *cp; + struct rds_tcp_connection *tc; + + read_lock_bh(&sk->sk_callback_lock); + cp = sk->sk_user_data; + if (!cp) { + write_space = sk->sk_write_space; + goto out; + } + + tc = cp->cp_transport_data; + rdsdebug("write_space for tc %p\n", tc); + write_space = tc->t_orig_write_space; + rds_tcp_stats_inc(s_tcp_write_space_calls); + + rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc)); + tc->t_last_seen_una = rds_tcp_snd_una(tc); + rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), rds_tcp_is_acked); + + rcu_read_lock(); + if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf && + !rds_destroy_pending(cp->cp_conn)) + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + rcu_read_unlock(); + +out: + read_unlock_bh(&sk->sk_callback_lock); + + /* + * write_space is only called when data leaves tcp's send queue if + * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put + * data in tcp's send queue because we use write_space to parse the + * sequence numbers and notice that rds messages have been fully + * received. + * + * tcp's write_space clears SOCK_NOSPACE if the send queue has more + * than a certain amount of space. So we need to set it again *after* + * we call tcp's write_space or else we might only get called on the + * first of a series of incoming tcp acks. + */ + write_space(sk); + + if (sk->sk_socket) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +} diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c new file mode 100644 index 0000000..f8a7954 --- /dev/null +++ b/net/rds/tcp_stats.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "tcp.h" + +DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats) + ____cacheline_aligned; + +static const char * const rds_tcp_stat_names[] = { + "tcp_data_ready_calls", + "tcp_write_space_calls", + "tcp_sndbuf_full", + "tcp_connect_raced", + "tcp_listen_closed_stale", +}; + +unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) +{ + struct rds_tcp_statistics stats = {0, }; + uint64_t *src; + uint64_t *sum; + size_t i; + int cpu; + + if (avail < ARRAY_SIZE(rds_tcp_stat_names)) + goto out; + + for_each_online_cpu(cpu) { + src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu)); + sum = (uint64_t *)&stats; + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) + *(sum++) += *(src++); + } + + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names, + ARRAY_SIZE(rds_tcp_stat_names)); +out: + return ARRAY_SIZE(rds_tcp_stat_names); +} diff --git a/net/rds/threads.c b/net/rds/threads.c new file mode 100644 index 0000000..c52861d --- /dev/null +++ b/net/rds/threads.c @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" + +/* + * All of connection management is simplified by serializing it through + * work queues that execute in a connection managing thread. + * + * TCP wants to send acks through sendpage() in response to data_ready(), + * but it needs a process context to do so. + * + * The receive paths need to allocate but can't drop packets (!) so we have + * a thread around to block allocating if the receive fast path sees an + * allocation failure. 
+ */ + +/* Grand Unified Theory of connection life cycle: + * At any point in time, the connection can be in one of these states: + * DOWN, CONNECTING, UP, DISCONNECTING, ERROR + * + * The following transitions are possible: + * ANY -> ERROR + * UP -> DISCONNECTING + * ERROR -> DISCONNECTING + * DISCONNECTING -> DOWN + * DOWN -> CONNECTING + * CONNECTING -> UP + * + * Transition to state DISCONNECTING/DOWN: + * - Inside the shutdown worker; synchronizes with xmit path + * through RDS_IN_XMIT, and with connection management callbacks + * via c_cm_lock. + * + * For receive callbacks, we rely on the underlying transport + * (TCP, IB/RDMA) to provide the necessary synchronisation. + */ +struct workqueue_struct *rds_wq; +EXPORT_SYMBOL_GPL(rds_wq); + +void rds_connect_path_complete(struct rds_conn_path *cp, int curr) +{ + if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) { + printk(KERN_WARNING "%s: Cannot transition to state UP, " + "current state is %d\n", + __func__, + atomic_read(&cp->cp_state)); + rds_conn_path_drop(cp, false); + return; + } + + rdsdebug("conn %p for %pI4 to %pI4 complete\n", + cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); + + cp->cp_reconnect_jiffies = 0; + set_bit(0, &cp->cp_conn->c_map_queued); + rcu_read_lock(); + if (!rds_destroy_pending(cp->cp_conn)) { + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rds_connect_path_complete); + +void rds_connect_complete(struct rds_connection *conn) +{ + rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING); +} +EXPORT_SYMBOL_GPL(rds_connect_complete); + +/* + * This random exponential backoff is relied on to eventually resolve racing + * connects. + * + * If connect attempts race then both parties drop both connections and come + * here to wait for a random amount of time before trying again. Eventually + * the backoff range will be so much greater than the time it takes to + * establish a connection that one of the pair will establish the connection + * before the other's random delay fires. + * + * Connection attempts that arrive while a connection is already established + * are also considered to be racing connects. This lets a connection from + * a rebooted machine replace an existing stale connection before the transport + * notices that the connection has failed. + * + * We should *always* start with a random backoff; otherwise a broken connection + * will always take several iterations to be re-established. 
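 *
 * Numerically (min/max values hypothetical, say 1 second and 300 seconds
 * worth of jiffies): the first reconnect after a drop is queued
 * immediately and cp_reconnect_jiffies is set to the minimum; if that
 * attempt fails too, later attempts are delayed by a uniformly random
 * amount in [0, 1s), then [0, 2s), [0, 4s), and so on, the window
 * doubling each time until it is capped at
 * rds_sysctl_reconnect_max_jiffies, so two racing peers quickly fall out
 * of lock-step.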
+ */ +void rds_queue_reconnect(struct rds_conn_path *cp) +{ + unsigned long rand; + struct rds_connection *conn = cp->cp_conn; + + rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", + conn, &conn->c_laddr, &conn->c_faddr, + cp->cp_reconnect_jiffies); + + /* let peer with smaller addr initiate reconnect, to avoid duels */ + if (conn->c_trans->t_type == RDS_TRANS_TCP && + !IS_CANONICAL(conn->c_laddr, conn->c_faddr)) + return; + + set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); + if (cp->cp_reconnect_jiffies == 0) { + cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; + rcu_read_lock(); + if (!rds_destroy_pending(cp->cp_conn)) + queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + rcu_read_unlock(); + return; + } + + get_random_bytes(&rand, sizeof(rand)); + rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", + rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, + conn, &conn->c_laddr, &conn->c_faddr); + rcu_read_lock(); + if (!rds_destroy_pending(cp->cp_conn)) + queue_delayed_work(rds_wq, &cp->cp_conn_w, + rand % cp->cp_reconnect_jiffies); + rcu_read_unlock(); + + cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, + rds_sysctl_reconnect_max_jiffies); +} + +void rds_connect_worker(struct work_struct *work) +{ + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_conn_w.work); + struct rds_connection *conn = cp->cp_conn; + int ret; + + if (cp->cp_index > 0 && + !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr)) + return; + clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); + ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); + if (ret) { + ret = conn->c_trans->conn_path_connect(cp); + rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", + conn, &conn->c_laddr, &conn->c_faddr, ret); + + if (ret) { + if (rds_conn_path_transition(cp, + RDS_CONN_CONNECTING, + RDS_CONN_DOWN)) + rds_queue_reconnect(cp); + else + rds_conn_path_error(cp, "connect failed\n"); + } + } +} + +void rds_send_worker(struct work_struct *work) +{ + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_send_w.work); + int ret; + + if (rds_conn_path_state(cp) == RDS_CONN_UP) { + clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags); + ret = rds_send_xmit(cp); + cond_resched(); + rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); + switch (ret) { + case -EAGAIN: + rds_stats_inc(s_send_immediate_retry); + queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + break; + case -ENOMEM: + rds_stats_inc(s_send_delayed_retry); + queue_delayed_work(rds_wq, &cp->cp_send_w, 2); + default: + break; + } + } +} + +void rds_recv_worker(struct work_struct *work) +{ + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_recv_w.work); + int ret; + + if (rds_conn_path_state(cp) == RDS_CONN_UP) { + ret = cp->cp_conn->c_trans->recv_path(cp); + rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); + switch (ret) { + case -EAGAIN: + rds_stats_inc(s_recv_immediate_retry); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + break; + case -ENOMEM: + rds_stats_inc(s_recv_delayed_retry); + queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); + default: + break; + } + } +} + +void rds_shutdown_worker(struct work_struct *work) +{ + struct rds_conn_path *cp = container_of(work, + struct rds_conn_path, + cp_down_w); + + rds_conn_shutdown(cp); +} + +void rds_threads_exit(void) +{ + destroy_workqueue(rds_wq); +} + +int rds_threads_init(void) +{ + rds_wq = create_singlethread_workqueue("krdsd"); + if (!rds_wq) + return -ENOMEM; + + return 0; +} diff --git 
a/net/rds/transport.c b/net/rds/transport.c new file mode 100644 index 0000000..0b188dd --- /dev/null +++ b/net/rds/transport.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include + +#include "rds.h" +#include "loop.h" + +static struct rds_transport *transports[RDS_TRANS_COUNT]; +static DECLARE_RWSEM(rds_trans_sem); + +void rds_trans_register(struct rds_transport *trans) +{ + BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); + + down_write(&rds_trans_sem); + + if (transports[trans->t_type]) + printk(KERN_ERR "RDS Transport type %d already registered\n", + trans->t_type); + else { + transports[trans->t_type] = trans; + printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); + } + + up_write(&rds_trans_sem); +} +EXPORT_SYMBOL_GPL(rds_trans_register); + +void rds_trans_unregister(struct rds_transport *trans) +{ + down_write(&rds_trans_sem); + + transports[trans->t_type] = NULL; + printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); + + up_write(&rds_trans_sem); +} +EXPORT_SYMBOL_GPL(rds_trans_unregister); + +void rds_trans_put(struct rds_transport *trans) +{ + if (trans) + module_put(trans->t_owner); +} + +struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr) +{ + struct rds_transport *ret = NULL; + struct rds_transport *trans; + unsigned int i; + + if (IN_LOOPBACK(ntohl(addr))) + return &rds_loop_transport; + + down_read(&rds_trans_sem); + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && (trans->laddr_check(net, addr) == 0) && + (!trans->t_owner || try_module_get(trans->t_owner))) { + ret = trans; + break; + } + } + up_read(&rds_trans_sem); + + return ret; +} + +struct rds_transport *rds_trans_get(int t_type) +{ + struct rds_transport *ret = NULL; + struct rds_transport *trans; + unsigned int i; + + down_read(&rds_trans_sem); + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && trans->t_type == t_type && + (!trans->t_owner || try_module_get(trans->t_owner))) { + ret = trans; + break; + } + } + up_read(&rds_trans_sem); + + return ret; +} + +/* + * This returns the number of stats entries in the snapshot and only + * 
copies them using the iter if there is enough space for them. The + * caller passes in the global stats so that we can size and copy while + * holding the lock. + */ +unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, + unsigned int avail) + +{ + struct rds_transport *trans; + unsigned int total = 0; + unsigned int part; + int i; + + rds_info_iter_unmap(iter); + down_read(&rds_trans_sem); + + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + if (!trans || !trans->stats_info_copy) + continue; + + part = trans->stats_info_copy(iter, avail); + avail -= min(avail, part); + total += part; + } + + up_read(&rds_trans_sem); + + return total; +} + diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig new file mode 100644 index 0000000..ac09ca8 --- /dev/null +++ b/net/sunrpc/Kconfig @@ -0,0 +1,64 @@ +config SUNRPC + tristate + depends on MULTIUSER + +config SUNRPC_GSS + tristate + select OID_REGISTRY + depends on MULTIUSER + +config SUNRPC_BACKCHANNEL + bool + depends on SUNRPC + +config SUNRPC_SWAP + bool + depends on SUNRPC + +config RPCSEC_GSS_KRB5 + tristate "Secure RPC: Kerberos V mechanism" + depends on SUNRPC && CRYPTO + depends on CRYPTO_MD5 && CRYPTO_DES && CRYPTO_CBC && CRYPTO_CTS + depends on CRYPTO_ECB && CRYPTO_HMAC && CRYPTO_SHA1 && CRYPTO_AES + depends on CRYPTO_ARC4 + default y + select SUNRPC_GSS + help + Choose Y here to enable Secure RPC using the Kerberos version 5 + GSS-API mechanism (RFC 1964). + + Secure RPC calls with Kerberos require an auxiliary user-space + daemon which may be found in the Linux nfs-utils package + available from http://linux-nfs.org/. In addition, user-space + Kerberos support should be installed. + + If unsure, say Y. + +config SUNRPC_DEBUG + bool "RPC: Enable dprintk debugging" + depends on SUNRPC && SYSCTL + select DEBUG_FS + help + This option enables a sysctl-based debugging interface + that is be used by the 'rpcdebug' utility to turn on or off + logging of different aspects of the kernel RPC activity. + + Disabling this option will make your kernel slightly smaller, + but makes troubleshooting NFS issues significantly harder. + + If unsure, say Y. + +config SUNRPC_XPRT_RDMA + tristate "RPC-over-RDMA transport" + depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS + default SUNRPC && INFINIBAND + select SG_POOL + help + This option allows the NFS client and server to use RDMA + transports (InfiniBand, iWARP, or RoCE). + + To compile this support as a module, choose M. The module + will be called rpcrdma.ko. + + If unsure, or you know there is no RDMA capability on your + hardware platform, say N. diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile new file mode 100644 index 0000000..8bf19e1 --- /dev/null +++ b/net/sunrpc/xprtrdma/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o + +rpcrdma-y := transport.o rpc_rdma.o verbs.o \ + fmr_ops.o frwr_ops.o \ + svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ + svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ + module.o +rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c new file mode 100644 index 0000000..47ebac9 --- /dev/null +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA. 
+ */ + +#include +#include +#include +#include + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +#undef RPCRDMA_BACKCHANNEL_DEBUG + +static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + spin_lock(&buf->rb_reqslock); + list_del(&req->rl_all); + spin_unlock(&buf->rb_reqslock); + + rpcrdma_destroy_req(req); + + kfree(rqst); +} + +static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct rpcrdma_regbuf *rb; + struct rpcrdma_req *req; + size_t size; + + req = rpcrdma_create_req(r_xprt); + if (IS_ERR(req)) + return PTR_ERR(req); + + size = r_xprt->rx_data.inline_rsize; + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); + if (IS_ERR(rb)) + goto out_fail; + req->rl_sendbuf = rb; + xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, + min_t(size_t, size, PAGE_SIZE)); + rpcrdma_set_xprtdata(rqst, req); + return 0; + +out_fail: + rpcrdma_bc_free_rqst(r_xprt, rqst); + return -ENOMEM; +} + +/* Allocate and add receive buffers to the rpcrdma_buffer's + * existing list of rep's. These are released when the + * transport is destroyed. + */ +static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, + unsigned int count) +{ + int rc = 0; + + while (count--) { + rc = rpcrdma_create_rep(r_xprt); + if (rc) + break; + } + return rc; +} + +/** + * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests + * @xprt: transport associated with these backchannel resources + * @reqs: number of concurrent incoming requests to expect + * + * Returns 0 on success; otherwise a negative errno + */ +int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; + struct rpc_rqst *rqst; + unsigned int i; + int rc; + + /* The backchannel reply path returns each rpc_rqst to the + * bc_pa_list _after_ the reply is sent. If the server is + * faster than the client, it can send another backward + * direction request before the rpc_rqst is returned to the + * list. The client rejects the request in this case. + * + * Twice as many rpc_rqsts are prepared to ensure there is + * always an rpc_rqst available as soon as a reply is sent. 
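 *
 * For example, with reqs = 8 the loop below allocates 16 rpc_rqsts with
 * send buffers, sets up 8 additional receive reps and posts 8 extra
 * receives, leaving headroom for the server to issue a new backchannel
 * call before the rpc_rqst of the previous one has been returned to
 * bc_pa_list.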
+ */ + if (reqs > RPCRDMA_BACKWARD_WRS >> 1) + goto out_err; + + for (i = 0; i < (reqs << 1); i++) { + rqst = kzalloc(sizeof(*rqst), GFP_KERNEL); + if (!rqst) + goto out_free; + + dprintk("RPC: %s: new rqst %p\n", __func__, rqst); + + rqst->rq_xprt = &r_xprt->rx_xprt; + INIT_LIST_HEAD(&rqst->rq_list); + INIT_LIST_HEAD(&rqst->rq_bc_list); + __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); + + if (rpcrdma_bc_setup_rqst(r_xprt, rqst)) + goto out_free; + + spin_lock_bh(&xprt->bc_pa_lock); + list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); + } + + rc = rpcrdma_bc_setup_reps(r_xprt, reqs); + if (rc) + goto out_free; + + rc = rpcrdma_ep_post_extra_recv(r_xprt, reqs); + if (rc) + goto out_free; + + buffer->rb_bc_srv_max_requests = reqs; + request_module("svcrdma"); + trace_xprtrdma_cb_setup(r_xprt, reqs); + return 0; + +out_free: + xprt_rdma_bc_destroy(xprt, reqs); + +out_err: + pr_err("RPC: %s: setup backchannel transport failed\n", __func__); + return -ENOMEM; +} + +/** + * xprt_rdma_bc_up - Create transport endpoint for backchannel service + * @serv: server endpoint + * @net: network namespace + * + * The "xprt" is an implied argument: it supplies the name of the + * backchannel transport class. + * + * Returns zero on success, negative errno on failure + */ +int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net) +{ + int ret; + + ret = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0); + if (ret < 0) + return ret; + return 0; +} + +/** + * xprt_rdma_bc_maxpayload - Return maximum backchannel message size + * @xprt: transport + * + * Returns maximum size, in bytes, of a backchannel message + */ +size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + size_t maxmsg; + + maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize); + maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE); + return maxmsg - RPCRDMA_HDRLEN_MIN; +} + +static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + __be32 *p; + + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); + xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, + req->rl_rdmabuf->rg_base); + + p = xdr_reserve_space(&req->rl_stream, 28); + if (unlikely(!p)) + return -EIO; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; + + if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, + &rqst->rq_snd_buf, rpcrdma_noch)) + return -EIO; + + trace_xprtrdma_cb_reply(rqst); + return 0; +} + +/** + * xprt_rdma_bc_send_reply - marshal and send a backchannel reply + * @rqst: RPC rqst with a backchannel RPC reply in rq_snd_buf + * + * Caller holds the transport's write lock. + * + * Returns: + * %0 if the RPC message has been sent + * %-ENOTCONN if the caller should reconnect and call again + * %-EIO if a permanent error occurred and the request was not + * sent. Do not try to send this message again. 
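 *
 * (For reference, the 28 bytes reserved in rpcrdma_bc_marshal_reply()
 * above are seven XDR words forming a chunk-less RPC-over-RDMA header:
 * the XID, rpcrdma_version, the advertised credit count, the RDMA_MSG
 * message type, and three empty (xdr_zero) chunk lists, read, write and
 * reply, so the reply body in rq_snd_buf is sent entirely inline.)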
+ */ +int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + int rc; + + if (!xprt_connected(rqst->rq_xprt)) + goto drop_connection; + + rc = rpcrdma_bc_marshal_reply(rqst); + if (rc < 0) + goto failed_marshal; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + return 0; + +failed_marshal: + if (rc != -ENOTCONN) + return rc; +drop_connection: + xprt_disconnect_done(rqst->rq_xprt); + return -ENOTCONN; +} + +/** + * xprt_rdma_bc_destroy - Release resources for handling backchannel requests + * @xprt: transport associated with these backchannel resources + * @reqs: number of incoming requests to destroy; ignored + */ +void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpc_rqst *rqst, *tmp; + + spin_lock_bh(&xprt->bc_pa_lock); + list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { + list_del(&rqst->rq_bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); + + rpcrdma_bc_free_rqst(r_xprt, rqst); + + spin_lock_bh(&xprt->bc_pa_lock); + } + spin_unlock_bh(&xprt->bc_pa_lock); +} + +/** + * xprt_rdma_bc_free_rqst - Release a backchannel rqst + * @rqst: request to release + */ +void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + + dprintk("RPC: %s: freeing rqst %p (req %p)\n", + __func__, rqst, rpcr_to_rdmar(rqst)); + + spin_lock_bh(&xprt->bc_pa_lock); + list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); +} + +/** + * rpcrdma_bc_receive_call - Handle a backward direction call + * @r_xprt: transport receiving the call + * @rep: receive buffer containing the call + * + * Operational assumptions: + * o Backchannel credits are ignored, just as the NFS server + * forechannel currently does + * o The ULP manages a replay cache (eg, NFSv4.1 sessions). + * No replay detection is done at the transport level + */ +void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_rep *rep) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct svc_serv *bc_serv; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + struct xdr_buf *buf; + size_t size; + __be32 *p; + + p = xdr_inline_decode(&rep->rr_stream, 0); + size = xdr_stream_remaining(&rep->rr_stream); + +#ifdef RPCRDMA_BACKCHANNEL_DEBUG + pr_info("RPC: %s: callback XID %08x, length=%u\n", + __func__, be32_to_cpup(p), size); + pr_info("RPC: %s: %*ph\n", __func__, size, p); +#endif + + /* Grab a free bc rqst */ + spin_lock(&xprt->bc_pa_lock); + if (list_empty(&xprt->bc_pa_list)) { + spin_unlock(&xprt->bc_pa_lock); + goto out_overflow; + } + rqst = list_first_entry(&xprt->bc_pa_list, + struct rpc_rqst, rq_bc_pa_list); + list_del(&rqst->rq_bc_pa_list); + spin_unlock(&xprt->bc_pa_lock); + + /* Prepare rqst */ + rqst->rq_reply_bytes_recvd = 0; + rqst->rq_bytes_sent = 0; + rqst->rq_xid = *p; + + rqst->rq_private_buf.len = size; + + buf = &rqst->rq_rcv_buf; + memset(buf, 0, sizeof(*buf)); + buf->head[0].iov_base = p; + buf->head[0].iov_len = size; + buf->len = size; + + /* The receive buffer has to be hooked to the rpcrdma_req + * so that it is not released while the req is pointing + * to its buffer, and so that it can be reposted after + * the Upper Layer is done decoding it. 
+ */ + req = rpcr_to_rdmar(rqst); + req->rl_reply = rep; + trace_xprtrdma_cb_call(rqst); + + /* Queue rqst for ULP's callback service */ + bc_serv = xprt->bc_serv; + spin_lock(&bc_serv->sv_cb_lock); + list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list); + spin_unlock(&bc_serv->sv_cb_lock); + + wake_up(&bc_serv->sv_cb_waitq); + + r_xprt->rx_stats.bcall_count++; + return; + +out_overflow: + pr_warn("RPC/RDMA backchannel overflow\n"); + xprt_disconnect_done(xprt); + /* This receive buffer gets reposted automatically + * when the connection is re-established. + */ + return; +} diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c new file mode 100644 index 0000000..f2f6395 --- /dev/null +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015, 2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Memory Regions (FMR). + * Referred to sometimes as MTHCAFMR mode. + * + * FMR uses synchronous memory registration and deregistration. + * FMR registration is known to be fast, but FMR deregistration + * can take tens of usecs to complete. + */ + +/* Normal operation + * + * A Memory Region is prepared for RDMA READ or WRITE using the + * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is + * finished, the Memory Region is unmapped using the ib_unmap_fmr + * verb (fmr_op_unmap). + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* Maximum scatter/gather per FMR */ +#define RPCRDMA_MAX_FMR_SGES (64) + +/* Access mode of externally registered pages */ +enum { + RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ, +}; + +bool +fmr_is_supported(struct rpcrdma_ia *ia) +{ + if (!ia->ri_device->alloc_fmr) { + pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", + ia->ri_device->name); + return false; + } + return true; +} + +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mr->fmr.fm_physaddrs) + goto out_free; + + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) + goto out_free; + + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); + + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mr->fmr.fm_mr)) + goto out_fmr_err; + + INIT_LIST_HEAD(&mr->mr_list); + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mr->fmr.fm_mr)); + +out_free: + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); + return -ENOMEM; +} + +static int +__fmr_unmap(struct rpcrdma_mr *mr) +{ + LIST_HEAD(l); + int rc; + + list_add(&mr->fmr.fm_mr->list, &l); + rc = ib_unmap_fmr(&l); + list_del(&mr->fmr.fm_mr->list); + return rc; +} + +static void +fmr_op_release_mr(struct rpcrdma_mr *mr) +{ + LIST_HEAD(unmap_list); + int rc; + + kfree(mr->fmr.fm_physaddrs); + kfree(mr->mr_sg); + + /* In case this one was left mapped, try to unmap it + * to prevent dealloc_fmr from failing with EBUSY + */ + rc = __fmr_unmap(mr); + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + mr, rc); + + rc = ib_dealloc_fmr(mr->fmr.fm_mr); + if (rc) + pr_err("rpcrdma: final 
ib_dealloc_fmr for %p returned %i\n", + mr, rc); + + kfree(mr); +} + +/* Reset of a single FMR. + */ +static void +fmr_op_recover_mr(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + int rc; + + /* ORDER: invalidate first */ + rc = __fmr_unmap(mr); + if (rc) + goto out_release; + + /* ORDER: then DMA unmap */ + rpcrdma_mr_unmap_and_put(mr); + + r_xprt->rx_stats.mrs_recovered++; + return; + +out_release: + pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); + r_xprt->rx_stats.mrs_orphaned++; + + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + spin_unlock(&r_xprt->rx_buf.rb_mrlock); + + fmr_op_release_mr(mr); +} + +static int +fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + RPCRDMA_MAX_FMR_SGES); + return 0; +} + +/* FMR mode conveys up to 64 pages of payload per chunk segment. + */ +static size_t +fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); +} + +/* Use the ib_map_phys_fmr() verb to register a memory region + * for remote access via RDMA READ or RDMA WRITE. + */ +static struct rpcrdma_mr_seg * +fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, struct rpcrdma_mr **out) +{ + struct rpcrdma_mr_seg *seg1 = seg; + int len, pageoff, i, rc; + struct rpcrdma_mr *mr; + u64 *dma_pages; + + mr = rpcrdma_mr_get(r_xprt); + if (!mr) + return ERR_PTR(-EAGAIN); + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > RPCRDMA_MAX_FMR_SGES) + nsegs = RPCRDMA_MAX_FMR_SGES; + for (i = 0; i < nsegs;) { + if (seg->mr_page) + sg_set_page(&mr->mr_sg[i], + seg->mr_page, + seg->mr_len, + offset_in_page(seg->mr_offset)); + else + sg_set_buf(&mr->mr_sg[i], seg->mr_offset, + seg->mr_len); + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + mr->mr_dir = rpcrdma_data_dir(writing); + + mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, i, mr->mr_dir); + if (!mr->mr_nents) + goto out_dmamap_err; + + for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) + dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); + rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents, + dma_pages[0]); + if (rc) + goto out_maperr; + + mr->mr_handle = mr->fmr.fm_mr->rkey; + mr->mr_length = len; + mr->mr_offset = dma_pages[0] + pageoff; + + *out = mr; + return seg; + +out_dmamap_err: + pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", + mr->mr_sg, i); + rpcrdma_mr_put(mr); + return ERR_PTR(-EIO); + +out_maperr: + pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + len, (unsigned long long)dma_pages[0], + pageoff, mr->mr_nents, rc); + rpcrdma_mr_unmap_and_put(mr); + return ERR_PTR(-EIO); +} + +/* Post Send WR containing the RPC Call message. + */ +static int +fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_send_wr *bad_wr; + + return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, &bad_wr); +} + +/* Invalidate all memory regions that were registered for "req". 
+ * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + * + * Caller ensures that @mrs is not empty before the call. This + * function empties the list. + */ +static void +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) +{ + struct rpcrdma_mr *mr; + LIST_HEAD(unmap_list); + int rc; + + /* ORDER: Invalidate all of the req's MRs first + * + * ib_unmap_fmr() is slow, so use a single call instead + * of one call per mapped FMR. + */ + list_for_each_entry(mr, mrs, mr_list) { + dprintk("RPC: %s: unmapping fmr %p\n", + __func__, &mr->fmr); + trace_xprtrdma_localinv(mr); + list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); + } + r_xprt->rx_stats.local_inv_needed++; + rc = ib_unmap_fmr(&unmap_list); + if (rc) + goto out_reset; + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + list_del(&mr->fmr.fm_mr->list); + rpcrdma_mr_unmap_and_put(mr); + } + + return; + +out_reset: + pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); + + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + list_del(&mr->fmr.fm_mr->list); + fmr_op_recover_mr(mr); + } +} + +const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { + .ro_map = fmr_op_map, + .ro_send = fmr_op_send, + .ro_unmap_sync = fmr_op_unmap_sync, + .ro_recover_mr = fmr_op_recover_mr, + .ro_open = fmr_op_open, + .ro_maxpages = fmr_op_maxpages, + .ro_init_mr = fmr_op_init_mr, + .ro_release_mr = fmr_op_release_mr, + .ro_displayname = "fmr", + .ro_send_w_inv_ok = 0, +}; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c new file mode 100644 index 0000000..c59c5c7 --- /dev/null +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -0,0 +1,589 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015, 2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Registration Work + * Requests (FRWR). + * + * FRWR features ordered asynchronous registration and deregistration + * of arbitrarily sized memory regions. This is the fastest and safest + * but most complex memory registration mode. + */ + +/* Normal operation + * + * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG + * Work Request (frwr_op_map). When the RDMA operation is finished, this + * Memory Region is invalidated using a LOCAL_INV Work Request + * (frwr_op_unmap_sync). + * + * Typically these Work Requests are not signaled, and neither are RDMA + * SEND Work Requests (with the exception of signaling occasionally to + * prevent provider work queue overflows). This greatly reduces HCA + * interrupt workload. + * + * As an optimization, frwr_op_unmap marks MRs INVALID before the + * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on + * rb_mrs immediately so that no work (like managing a linked list + * under a spinlock) is needed in the completion upcall. + * + * But this means that frwr_op_map() can occasionally encounter an MR + * that is INVALID but the LOCAL_INV WR has not completed. Work Queue + * ordering prevents a subsequent FAST_REG WR from executing against + * that MR while it is still being invalidated. + */ + +/* Transport recovery + * + * ->op_map and the transport connect worker cannot run at the same + * time, but ->op_unmap can fire while the transport connect worker + * is running. 
Thus MR recovery is handled in ->op_map, to guarantee + * that recovered MRs are owned by a sending RPC, and not one where + * ->op_unmap could fire at the same time transport reconnect is + * being done. + * + * When the underlying transport disconnects, MRs are left in one of + * four states: + * + * INVALID: The MR was not in use before the QP entered ERROR state. + * + * VALID: The MR was registered before the QP entered ERROR state. + * + * FLUSHED_FR: The MR was being registered when the QP entered ERROR + * state, and the pending WR was flushed. + * + * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR + * state, and the pending WR was flushed. + * + * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered + * with ib_dereg_mr and then are re-initialized. Because MR recovery + * allocates fresh resources, it is deferred to a workqueue, and the + * recovered MRs are placed back on the rb_mrs list when recovery is + * complete. frwr_op_map allocates another MR for the current RPC while + * the broken MR is reset. + * + * To ensure that frwr_op_map doesn't encounter an MR that is marked + * INVALID but that is about to be flushed due to a previous transport + * disconnect, the transport connect worker attempts to drain all + * pending send queue WRs before the transport is reconnected. + */ + +#include + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +bool +frwr_is_supported(struct rpcrdma_ia *ia) +{ + struct ib_device_attr *attrs = &ia->ri_device->attrs; + + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + goto out_not_supported; + if (attrs->max_fast_reg_page_list_len == 0) + goto out_not_supported; + return true; + +out_not_supported: + pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", + ia->ri_device->name); + return false; +} + +static int +frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + unsigned int depth = ia->ri_max_frwr_depth; + struct rpcrdma_frwr *frwr = &mr->frwr; + int rc; + + frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + if (IS_ERR(frwr->fr_mr)) + goto out_mr_err; + + mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) + goto out_list_err; + + INIT_LIST_HEAD(&mr->mr_list); + sg_init_table(mr->mr_sg, depth); + init_completion(&frwr->fr_linv_done); + return 0; + +out_mr_err: + rc = PTR_ERR(frwr->fr_mr); + dprintk("RPC: %s: ib_alloc_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = -ENOMEM; + dprintk("RPC: %s: sg allocation failure\n", + __func__); + ib_dereg_mr(frwr->fr_mr); + return rc; +} + +static void +frwr_op_release_mr(struct rpcrdma_mr *mr) +{ + int rc; + + rc = ib_dereg_mr(mr->frwr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + mr, rc); + kfree(mr->mr_sg); + kfree(mr); +} + +static int +__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + struct rpcrdma_frwr *frwr = &mr->frwr; + int rc; + + rc = ib_dereg_mr(frwr->fr_mr); + if (rc) { + pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", + rc, mr); + return rc; + } + + frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, + ia->ri_max_frwr_depth); + if (IS_ERR(frwr->fr_mr)) { + pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", + PTR_ERR(frwr->fr_mr), mr); + return PTR_ERR(frwr->fr_mr); + } + + dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); + frwr->fr_state = FRWR_IS_INVALID; + return 0; +} + +/* Reset of a single FRWR. 
Generate a fresh rkey by replacing the MR. + */ +static void +frwr_op_recover_mr(struct rpcrdma_mr *mr) +{ + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + int rc; + + rc = __frwr_mr_reset(ia, mr); + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(ia->ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + } + if (rc) + goto out_release; + + rpcrdma_mr_put(mr); + r_xprt->rx_stats.mrs_recovered++; + return; + +out_release: + pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr); + r_xprt->rx_stats.mrs_orphaned++; + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + spin_unlock(&r_xprt->rx_buf.rb_mrlock); + + frwr_op_release_mr(mr); +} + +static int +frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr *attrs = &ia->ri_device->attrs; + int depth, delta; + + ia->ri_mrtype = IB_MR_TYPE_MEM_REG; + if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; + + ia->ri_max_frwr_depth = + min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + attrs->max_fast_reg_page_list_len); + dprintk("RPC: %s: device's max FR page list len = %u\n", + __func__, ia->ri_max_frwr_depth); + + /* Add room for frwr register and invalidate WRs. + * 1. FRWR reg WR for head + * 2. FRWR invalidate WR for head + * 3. N FRWR reg WRs for pagelist + * 4. N FRWR invalidate WRs for pagelist + * 5. FRWR reg WR for tail + * 6. FRWR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + depth = 7; + + /* Calculate N if the device max FRWR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. + */ + if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; + do { + depth += 2; /* FRWR reg + invalidate */ + delta -= ia->ri_max_frwr_depth; + } while (delta > 0); + } + + ep->rep_attr.cap.max_send_wr *= depth; + if (ep->rep_attr.cap.max_send_wr > attrs->max_qp_wr) { + cdata->max_requests = attrs->max_qp_wr / depth; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; + } + + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + ia->ri_max_frwr_depth); + return 0; +} + +/* FRWR mode conveys a list of pages per chunk segment. The + * maximum length of that list is the FRWR page list depth. 
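+ *
+ * The product RPCRDMA_MAX_HDR_SEGS * ri_max_frwr_depth is an upper
+ * bound on the pages the chunk lists of a single RPC could address;
+ * the min_t() below keeps the advertised maximum within
+ * RPCRDMA_MAX_DATA_SEGS.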
+ */ +static size_t +frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth); +} + +static void +__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) +{ + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("rpcrdma: %s: %s (%u/0x%x)\n", + wr, ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); +} + +/** + * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + */ +static void +frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = + container_of(cqe, struct rpcrdma_frwr, fr_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ + if (wc->status != IB_WC_SUCCESS) { + frwr->fr_state = FRWR_FLUSHED_FR; + __frwr_sendcompletion_flush(wc, "fastreg"); + } + trace_xprtrdma_wc_fastreg(wc, frwr); +} + +/** + * frwr_wc_localinv - Invoked by RDMA provider for a flushed LocalInv WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + */ +static void +frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, + fr_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ + if (wc->status != IB_WC_SUCCESS) { + frwr->fr_state = FRWR_FLUSHED_LI; + __frwr_sendcompletion_flush(wc, "localinv"); + } + trace_xprtrdma_wc_li(wc, frwr); +} + +/** + * frwr_wc_localinv_wake - Invoked by RDMA provider for a signaled LocalInv WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + * Awaken anyone waiting for an MR to finish being fenced. + */ +static void +frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, + fr_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ + if (wc->status != IB_WC_SUCCESS) { + frwr->fr_state = FRWR_FLUSHED_LI; + __frwr_sendcompletion_flush(wc, "localinv"); + } + complete(&frwr->fr_linv_done); + trace_xprtrdma_wc_li_wake(wc, frwr); +} + +/* Post a REG_MR Work Request to register a memory region + * for remote access via RDMA READ or RDMA WRITE. 
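+ *
+ * Before each registration the low-order byte of the rkey is bumped
+ * (ib_update_fast_reg_key) so that the MR's new registration is not
+ * reachable through the rkey used for its previous registration.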
+ */ +static struct rpcrdma_mr_seg * +frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, struct rpcrdma_mr **out) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; + struct rpcrdma_frwr *frwr; + struct rpcrdma_mr *mr; + struct ib_mr *ibmr; + struct ib_reg_wr *reg_wr; + int i, n; + u8 key; + + mr = NULL; + do { + if (mr) + rpcrdma_mr_defer_recovery(mr); + mr = rpcrdma_mr_get(r_xprt); + if (!mr) + return ERR_PTR(-EAGAIN); + } while (mr->frwr.fr_state != FRWR_IS_INVALID); + frwr = &mr->frwr; + frwr->fr_state = FRWR_IS_VALID; + + if (nsegs > ia->ri_max_frwr_depth) + nsegs = ia->ri_max_frwr_depth; + for (i = 0; i < nsegs;) { + if (seg->mr_page) + sg_set_page(&mr->mr_sg[i], + seg->mr_page, + seg->mr_len, + offset_in_page(seg->mr_offset)); + else + sg_set_buf(&mr->mr_sg[i], seg->mr_offset, + seg->mr_len); + + ++seg; + ++i; + if (holes_ok) + continue; + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + mr->mr_dir = rpcrdma_data_dir(writing); + + mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); + if (!mr->mr_nents) + goto out_dmamap_err; + + ibmr = frwr->fr_mr; + n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); + if (unlikely(n != mr->mr_nents)) + goto out_mapmr_err; + + key = (u8)(ibmr->rkey & 0x000000FF); + ib_update_fast_reg_key(ibmr, ++key); + + reg_wr = &frwr->fr_regwr; + reg_wr->mr = ibmr; + reg_wr->key = ibmr->rkey; + reg_wr->access = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; + + mr->mr_handle = ibmr->rkey; + mr->mr_length = ibmr->length; + mr->mr_offset = ibmr->iova; + + *out = mr; + return seg; + +out_dmamap_err: + pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", + mr->mr_sg, i); + frwr->fr_state = FRWR_IS_INVALID; + rpcrdma_mr_put(mr); + return ERR_PTR(-EIO); + +out_mapmr_err: + pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", + frwr->fr_mr, n, mr->mr_nents); + rpcrdma_mr_defer_recovery(mr); + return ERR_PTR(-EIO); +} + +/* Post Send WR containing the RPC Call message. + * + * For FRMR, chain any FastReg WRs to the Send WR. Only a + * single ib_post_send call is needed to register memory + * and then post the Send WR. + */ +static int +frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_send_wr *post_wr, *bad_wr; + struct rpcrdma_mr *mr; + + post_wr = &req->rl_sendctx->sc_wr; + list_for_each_entry(mr, &req->rl_registered, mr_list) { + struct rpcrdma_frwr *frwr; + + frwr = &mr->frwr; + + frwr->fr_cqe.done = frwr_wc_fastreg; + frwr->fr_regwr.wr.next = post_wr; + frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; + frwr->fr_regwr.wr.num_sge = 0; + frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; + frwr->fr_regwr.wr.send_flags = 0; + + post_wr = &frwr->fr_regwr.wr; + } + + /* If ib_post_send fails, the next ->send_request for + * @req will queue these MWs for recovery. + */ + return ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); +} + +/* Handle a remotely invalidated mr on the @mrs list + */ +static void +frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) +{ + struct rpcrdma_mr *mr; + + list_for_each_entry(mr, mrs, mr_list) + if (mr->mr_handle == rep->rr_inv_rkey) { + list_del_init(&mr->mr_list); + trace_xprtrdma_remoteinv(mr); + mr->frwr.fr_state = FRWR_IS_INVALID; + rpcrdma_mr_unmap_and_put(mr); + break; /* only one invalidated MR per RPC */ + } +} + +/* Invalidate all memory regions that were registered for "req". 
+ * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + * + * Caller ensures that @mrs is not empty before the call. This + * function empties the list. + */ +static void +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) +{ + struct ib_send_wr *first, **prev, *last, *bad_wr; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_frwr *frwr; + struct rpcrdma_mr *mr; + int count, rc; + + /* ORDER: Invalidate all of the MRs first + * + * Chain the LOCAL_INV Work Requests and post them with + * a single ib_post_send() call. + */ + frwr = NULL; + count = 0; + prev = &first; + list_for_each_entry(mr, mrs, mr_list) { + mr->frwr.fr_state = FRWR_IS_INVALID; + + frwr = &mr->frwr; + trace_xprtrdma_localinv(mr); + + frwr->fr_cqe.done = frwr_wc_localinv; + last = &frwr->fr_invwr; + memset(last, 0, sizeof(*last)); + last->wr_cqe = &frwr->fr_cqe; + last->opcode = IB_WR_LOCAL_INV; + last->ex.invalidate_rkey = mr->mr_handle; + count++; + + *prev = last; + prev = &last->next; + } + if (!frwr) + goto unmap; + + /* Strong send queue ordering guarantees that when the + * last WR in the chain completes, all WRs in the chain + * are complete. + */ + last->send_flags = IB_SEND_SIGNALED; + frwr->fr_cqe.done = frwr_wc_localinv_wake; + reinit_completion(&frwr->fr_linv_done); + + /* Transport disconnect drains the receive CQ before it + * replaces the QP. The RPC reply handler won't call us + * unless ri_id->qp is a valid pointer. + */ + r_xprt->rx_stats.local_inv_needed++; + bad_wr = NULL; + rc = ib_post_send(ia->ri_id->qp, first, &bad_wr); + if (bad_wr != first) + wait_for_completion(&frwr->fr_linv_done); + if (rc) + goto reset_mrs; + + /* ORDER: Now DMA unmap all of the MRs, and return + * them to the free MR list. + */ +unmap: + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + rpcrdma_mr_unmap_and_put(mr); + } + return; + +reset_mrs: + pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); + + /* Find and reset the MRs in the LOCAL_INV WRs that did not + * get posted. + */ + while (bad_wr) { + frwr = container_of(bad_wr, struct rpcrdma_frwr, + fr_invwr); + mr = container_of(frwr, struct rpcrdma_mr, frwr); + + __frwr_mr_reset(ia, mr); + + bad_wr = bad_wr->next; + } + goto unmap; +} + +const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { + .ro_map = frwr_op_map, + .ro_send = frwr_op_send, + .ro_reminv = frwr_op_reminv, + .ro_unmap_sync = frwr_op_unmap_sync, + .ro_recover_mr = frwr_op_recover_mr, + .ro_open = frwr_op_open, + .ro_maxpages = frwr_op_maxpages, + .ro_init_mr = frwr_op_init_mr, + .ro_release_mr = frwr_op_release_mr, + .ro_displayname = "frwr", + .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK, +}; diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c new file mode 100644 index 0000000..a762d19 --- /dev/null +++ b/net/sunrpc/xprtrdma/module.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2015, 2017 Oracle. All rights reserved. 
+ */ + +/* rpcrdma.ko module initialization + */ + +#include +#include +#include +#include +#include + +#include + +#define CREATE_TRACE_POINTS +#include "xprt_rdma.h" + +MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc."); +MODULE_DESCRIPTION("RPC/RDMA Transport"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("svcrdma"); +MODULE_ALIAS("xprtrdma"); + +static void __exit rpc_rdma_cleanup(void) +{ + xprt_rdma_cleanup(); + svc_rdma_cleanup(); +} + +static int __init rpc_rdma_init(void) +{ + int rc; + + rc = svc_rdma_init(); + if (rc) + goto out; + + rc = xprt_rdma_init(); + if (rc) + svc_rdma_cleanup(); + +out: + return rc; +} + +module_init(rpc_rdma_init); +module_exit(rpc_rdma_cleanup); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c new file mode 100644 index 0000000..e8adad3 --- /dev/null +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -0,0 +1,1414 @@ +/* + * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * rpc_rdma.c + * + * This file contains the guts of the RPC RDMA protocol, and + * does marshaling/unmarshaling, etc. It is also where interfacing + * to the Linux RPC framework lives. 
+ */ + +#include "xprt_rdma.h" + +#include + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static const char transfertypes[][12] = { + "inline", /* no chunks */ + "read list", /* some argument via rdma read */ + "*read list", /* entire request via rdma read */ + "write list", /* some result via rdma write */ + "reply chunk" /* entire reply via rdma write */ +}; + +/* Returns size of largest RPC-over-RDMA header in a Call message + * + * The largest Call header contains a full-size Read list and a + * minimal Reply chunk. + */ +static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) +{ + unsigned int size; + + /* Fixed header fields and list discriminators */ + size = RPCRDMA_HDRLEN_MIN; + + /* Maximum Read list size */ + maxsegs += 2; /* segment for head and tail buffers */ + size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); + + /* Minimal Read chunk size */ + size += sizeof(__be32); /* segment count */ + size += rpcrdma_segment_maxsz * sizeof(__be32); + size += sizeof(__be32); /* list discriminator */ + + dprintk("RPC: %s: max call header size = %u\n", + __func__, size); + return size; +} + +/* Returns size of largest RPC-over-RDMA header in a Reply message + * + * There is only one Write list or one Reply chunk per Reply + * message. The larger list is the Write list. + */ +static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) +{ + unsigned int size; + + /* Fixed header fields and list discriminators */ + size = RPCRDMA_HDRLEN_MIN; + + /* Maximum Write list size */ + maxsegs += 2; /* segment for head and tail buffers */ + size = sizeof(__be32); /* segment count */ + size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); + size += sizeof(__be32); /* list discriminator */ + + dprintk("RPC: %s: max reply header size = %u\n", + __func__, size); + return size; +} + +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int maxsegs = ia->ri_max_segs; + + ia->ri_max_inline_write = cdata->inline_wsize - + rpcrdma_max_call_header_size(maxsegs); + ia->ri_max_inline_read = cdata->inline_rsize - + rpcrdma_max_reply_header_size(maxsegs); +} + +/* The client can send a request inline as long as the RPCRDMA header + * plus the RPC call fit under the transport's inline limit. If the + * combined call message size exceeds that limit, the client must use + * a Read chunk for this operation. + * + * A Read chunk is also required if sending the RPC call inline would + * exceed this device's max_sge limit. + */ +static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct xdr_buf *xdr = &rqst->rq_snd_buf; + unsigned int count, remaining, offset; + + if (xdr->len > r_xprt->rx_ia.ri_max_inline_write) + return false; + + if (xdr->page_len) { + remaining = xdr->page_len; + offset = offset_in_page(xdr->page_base); + count = RPCRDMA_MIN_SEND_SGES; + while (remaining) { + remaining -= min_t(unsigned int, + PAGE_SIZE - offset, remaining); + offset = 0; + if (++count > r_xprt->rx_ia.ri_max_send_sges) + return false; + } + } + + return true; +} + +/* The client can't know how large the actual reply will be. Thus it + * plans for the largest possible reply for that particular ULP + * operation. If the maximum combined reply message size exceeds that + * limit, the client must provide a write list or a reply chunk for + * this request. 
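+ *
+ * A bulk NFS READ, for example, plans for a reply well beyond
+ * ri_max_inline_read, so the marshaling code provides a Write list
+ * for the payload; a large reply with no DDP-eligible data item is
+ * returned through a Reply chunk instead (see rpcrdma_marshal_req).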
+ */ +static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; +} + +/* Split @vec on page boundaries into SGEs. FMR registers pages, not + * a byte range. Other modes coalesce these SGEs into a single MR + * when they can. + * + * Returns pointer to next available SGE, and bumps the total number + * of SGEs consumed. + */ +static struct rpcrdma_mr_seg * +rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, + unsigned int *n) +{ + u32 remaining, page_offset; + char *base; + + base = vec->iov_base; + page_offset = offset_in_page(base); + remaining = vec->iov_len; + while (remaining) { + seg->mr_page = NULL; + seg->mr_offset = base; + seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); + remaining -= seg->mr_len; + base += seg->mr_len; + ++seg; + ++(*n); + page_offset = 0; + } + return seg; +} + +/* Convert @xdrbuf into SGEs no larger than a page each. As they + * are registered, these SGEs are then coalesced into RDMA segments + * when the selected memreg mode supports it. + * + * Returns positive number of SGEs consumed, or a negative errno. + */ + +static int +rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, + unsigned int pos, enum rpcrdma_chunktype type, + struct rpcrdma_mr_seg *seg) +{ + unsigned long page_base; + unsigned int len, n; + struct page **ppages; + + n = 0; + if (pos == 0) + seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); + + len = xdrbuf->page_len; + ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdrbuf->page_base); + while (len) { + if (unlikely(!*ppages)) { + /* XXX: Certain upper layer operations do + * not provide receive buffer pages. + */ + *ppages = alloc_page(GFP_ATOMIC); + if (!*ppages) + return -EAGAIN; + } + seg->mr_page = *ppages; + seg->mr_offset = (char *)page_base; + seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); + len -= seg->mr_len; + ++ppages; + ++seg; + ++n; + page_base = 0; + } + + /* When encoding a Read chunk, the tail iovec contains an + * XDR pad and may be omitted. + */ + if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) + goto out; + + /* When encoding a Write chunk, some servers need to see an + * extra segment for non-XDR-aligned Write chunks. The upper + * layer provides space in the tail iovec that may be used + * for this purpose. 
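+ *
+ * (XDR rounds data items up to a four-byte boundary: a Write chunk
+ * carrying, say, 1234 bytes of data is padded to 1236 bytes on the
+ * wire, and the server needs somewhere to place that pad.)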
+ */ + if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) + goto out; + + if (xdrbuf->tail[0].iov_len) + seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); + +out: + if (unlikely(n > RPCRDMA_MAX_SEGS)) + return -EIO; + return n; +} + +static inline int +encode_item_present(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p = xdr_one; + return 0; +} + +static inline int +encode_item_not_present(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p = xdr_zero; + return 0; +} + +static void +xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) +{ + *iptr++ = cpu_to_be32(mr->mr_handle); + *iptr++ = cpu_to_be32(mr->mr_length); + xdr_encode_hyper(iptr, mr->mr_offset); +} + +static int +encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + xdr_encode_rdma_segment(p, mr); + return 0; +} + +static int +encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, + u32 position) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 6 * sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p++ = xdr_one; /* Item present */ + *p++ = cpu_to_be32(position); + xdr_encode_rdma_segment(p, mr); + return 0; +} + +/* Register and XDR encode the Read list. Supports encoding a list of read + * segments that belong to a single read chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Read chunklist (a linked list): + * N elements, position P (same P for all chunks of same arg!): + * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 + * + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + * + * Only a single @pos value is currently supported. + */ +static noinline int +rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) +{ + struct xdr_stream *xdr = &req->rl_stream; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mr *mr; + unsigned int pos; + int nsegs; + + pos = rqst->rq_snd_buf.head[0].iov_len; + if (rtype == rpcrdma_areadch) + pos = 0; + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, + rtype, seg); + if (nsegs < 0) + return nsegs; + + do { + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + false, &mr); + if (IS_ERR(seg)) + goto out_maperr; + rpcrdma_mr_push(mr, &req->rl_registered); + + if (encode_read_segment(xdr, mr, pos) < 0) + return -EMSGSIZE; + + trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs); + r_xprt->rx_stats.read_chunk_count++; + nsegs -= mr->mr_nents; + } while (nsegs); + + return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); +} + +/* Register and XDR encode the Write list. Supports encoding a list + * containing one array of plain segments that belong to a single + * write chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Write chunklist (a list of (one) counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO - 0 + * + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + * + * Only a single Write chunk is currently supported. 
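+ *
+ * Reading the key above for the common case: a Write chunk registered
+ * with a single MR is encoded as the discriminator (1), a segment
+ * count of 1, and one HLOO -- six 32-bit XDR words for the chunk.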
+ */ +static noinline int +rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +{ + struct xdr_stream *xdr = &req->rl_stream; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mr *mr; + int nsegs, nchunks; + __be32 *segcount; + + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, + rqst->rq_rcv_buf.head[0].iov_len, + wtype, seg); + if (nsegs < 0) + return nsegs; + + if (encode_item_present(xdr) < 0) + return -EMSGSIZE; + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); + if (unlikely(!segcount)) + return -EMSGSIZE; + /* Actual value encoded below */ + + nchunks = 0; + do { + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mr); + if (IS_ERR(seg)) + goto out_maperr; + rpcrdma_mr_push(mr, &req->rl_registered); + + if (encode_rdma_segment(xdr, mr) < 0) + return -EMSGSIZE; + + trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs); + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; + nsegs -= mr->mr_nents; + } while (nsegs); + + /* Update count of segments in this Write chunk */ + *segcount = cpu_to_be32(nchunks); + + return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); +} + +/* Register and XDR encode the Reply chunk. Supports encoding an array + * of plain segments that belong to a single write (reply) chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Reply chunk (a counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO + * + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + */ +static noinline int +rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +{ + struct xdr_stream *xdr = &req->rl_stream; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mr *mr; + int nsegs, nchunks; + __be32 *segcount; + + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); + if (nsegs < 0) + return nsegs; + + if (encode_item_present(xdr) < 0) + return -EMSGSIZE; + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); + if (unlikely(!segcount)) + return -EMSGSIZE; + /* Actual value encoded below */ + + nchunks = 0; + do { + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mr); + if (IS_ERR(seg)) + goto out_maperr; + rpcrdma_mr_push(mr, &req->rl_registered); + + if (encode_rdma_segment(xdr, mr) < 0) + return -EMSGSIZE; + + trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs); + r_xprt->rx_stats.reply_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; + nsegs -= mr->mr_nents; + } while (nsegs); + + /* Update count of segments in the Reply chunk */ + *segcount = cpu_to_be32(nchunks); + + return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); +} + +/** + * rpcrdma_unmap_sendctx - DMA-unmap Send buffers + * @sc: sendctx containing SGEs to unmap + * + */ +void +rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia; + struct ib_sge *sge; + unsigned int count; + + /* The first two SGEs contain the transport header and + * the inline buffer. These are always left mapped so + * they can be cheaply re-used. 
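+ *
+ * That is why the loop below starts at sc_sges[2] and unmaps only the
+ * sc_unmap_count SGEs that were mapped for this particular Send.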
+ */ + sge = &sc->sc_sges[2]; + for (count = sc->sc_unmap_count; count; ++sge, --count) + ib_dma_unmap_page(ia->ri_device, + sge->addr, sge->length, DMA_TO_DEVICE); + + if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) { + smp_mb__after_atomic(); + wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES); + } +} + +/* Prepare an SGE for the RPC-over-RDMA transport header. + */ +static bool +rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, + u32 len) +{ + struct rpcrdma_sendctx *sc = req->rl_sendctx; + struct rpcrdma_regbuf *rb = req->rl_rdmabuf; + struct ib_sge *sge = sc->sc_sges; + + if (!rpcrdma_dma_map_regbuf(ia, rb)) + goto out_regbuf; + sge->addr = rdmab_addr(rb); + sge->length = len; + sge->lkey = rdmab_lkey(rb); + + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, + sge->length, DMA_TO_DEVICE); + sc->sc_wr.num_sge++; + return true; + +out_regbuf: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; +} + +/* Prepare the Send SGEs. The head and tail iovec, and each entry + * in the page list, gets its own SGE. + */ +static bool +rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, + struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +{ + struct rpcrdma_sendctx *sc = req->rl_sendctx; + unsigned int sge_no, page_base, len, remaining; + struct rpcrdma_regbuf *rb = req->rl_sendbuf; + struct ib_device *device = ia->ri_device; + struct ib_sge *sge = sc->sc_sges; + u32 lkey = ia->ri_pd->local_dma_lkey; + struct page *page, **ppages; + + /* The head iovec is straightforward, as it is already + * DMA-mapped. Sync the content that has changed. + */ + if (!rpcrdma_dma_map_regbuf(ia, rb)) + goto out_regbuf; + sge_no = 1; + sge[sge_no].addr = rdmab_addr(rb); + sge[sge_no].length = xdr->head[0].iov_len; + sge[sge_no].lkey = rdmab_lkey(rb); + ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr, + sge[sge_no].length, DMA_TO_DEVICE); + + /* If there is a Read chunk, the page list is being handled + * via explicit RDMA, and thus is skipped here. However, the + * tail iovec may include an XDR pad for the page list, as + * well as additional content, and may not reside in the + * same page as the head iovec. + */ + if (rtype == rpcrdma_readch) { + len = xdr->tail[0].iov_len; + + /* Do not include the tail if it is only an XDR pad */ + if (len < 4) + goto out; + + page = virt_to_page(xdr->tail[0].iov_base); + page_base = offset_in_page(xdr->tail[0].iov_base); + + /* If the content in the page list is an odd length, + * xdr_write_pages() has added a pad at the beginning + * of the tail iovec. Force the tail's non-pad content + * to land at the next XDR position in the Send message. + */ + page_base += len & 3; + len -= len & 3; + goto map_tail; + } + + /* If there is a page list present, temporarily DMA map + * and prepare an SGE for each page to be sent. 
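+ *
+ * Each mapped page consumes one SGE and bumps sc_unmap_count so that
+ * rpcrdma_unmap_sendctx() can release the mapping once the Send has
+ * completed.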
+ */ + if (xdr->page_len) { + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + sge_no++; + if (sge_no > RPCRDMA_MAX_SEND_SGES - 2) + goto out_mapping_overflow; + + len = min_t(u32, PAGE_SIZE - page_base, remaining); + sge[sge_no].addr = ib_dma_map_page(device, *ppages, + page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, sge[sge_no].addr)) + goto out_mapping_err; + sge[sge_no].length = len; + sge[sge_no].lkey = lkey; + + sc->sc_unmap_count++; + ppages++; + remaining -= len; + page_base = 0; + } + } + + /* The tail iovec is not always constructed in the same + * page where the head iovec resides (see, for example, + * gss_wrap_req_priv). To neatly accommodate that case, + * DMA map it separately. + */ + if (xdr->tail[0].iov_len) { + page = virt_to_page(xdr->tail[0].iov_base); + page_base = offset_in_page(xdr->tail[0].iov_base); + len = xdr->tail[0].iov_len; + +map_tail: + sge_no++; + sge[sge_no].addr = ib_dma_map_page(device, page, + page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, sge[sge_no].addr)) + goto out_mapping_err; + sge[sge_no].length = len; + sge[sge_no].lkey = lkey; + sc->sc_unmap_count++; + } + +out: + sc->sc_wr.num_sge += sge_no; + if (sc->sc_unmap_count) + __set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); + return true; + +out_regbuf: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; + +out_mapping_overflow: + rpcrdma_unmap_sendctx(sc); + pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); + return false; + +out_mapping_err: + rpcrdma_unmap_sendctx(sc); + pr_err("rpcrdma: Send mapping error\n"); + return false; +} + +/** + * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR + * @r_xprt: controlling transport + * @req: context of RPC Call being marshalled + * @hdrlen: size of transport header, in bytes + * @xdr: xdr_buf containing RPC Call + * @rtype: chunk type being encoded + * + * Returns 0 on success; otherwise a negative errno is returned. + */ +int +rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +{ + req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf); + if (!req->rl_sendctx) + return -ENOBUFS; + req->rl_sendctx->sc_wr.num_sge = 0; + req->rl_sendctx->sc_unmap_count = 0; + req->rl_sendctx->sc_req = req; + __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); + + if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) + return -EIO; + + if (rtype != rpcrdma_areadch) + if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype)) + return -EIO; + + return 0; +} + +/** + * rpcrdma_marshal_req - Marshal and send one RPC request + * @r_xprt: controlling transport + * @rqst: RPC request to be marshaled + * + * For the RPC in "rqst", this function: + * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) + * - Registers Read, Write, and Reply chunks + * - Constructs the transport header + * - Posts a Send WR to send the transport header and request + * + * Returns: + * %0 if the RPC was sent successfully, + * %-ENOTCONN if the connection was lost, + * %-EAGAIN if the caller should call again with the same arguments, + * %-ENOBUFS if the caller should call again after a delay, + * %-EMSGSIZE if the transport header is too small, + * %-EIO if a permanent problem occurred while marshaling. 
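+ *
+ * %-EAGAIN is returned, for instance, when no MR is currently
+ * available; in that case the chunk encoders have already called
+ * xprt_wait_for_buffer_space(), and the RPC is retried later with
+ * the same arguments.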
+ */ +int +rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct xdr_stream *xdr = &req->rl_stream; + enum rpcrdma_chunktype rtype, wtype; + bool ddp_allowed; + __be32 *p; + int ret; + + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); + xdr_init_encode(xdr, &req->rl_hdrbuf, + req->rl_rdmabuf->rg_base); + + /* Fixed header fields */ + ret = -EMSGSIZE; + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); + if (!p) + goto out_err; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); + + /* When the ULP employs a GSS flavor that guarantees integrity + * or privacy, direct data placement of individual data items + * is not allowed. + */ + ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & + RPCAUTH_AUTH_DATATOUCH); + + /* + * Chunks needed for results? + * + * o If the expected result is under the inline threshold, all ops + * return as inline. + * o Large read ops return data as write chunk(s), header as + * inline. + * o Large non-read ops return as a single reply chunk. + */ + if (rpcrdma_results_inline(r_xprt, rqst)) + wtype = rpcrdma_noch; + else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) + wtype = rpcrdma_writech; + else + wtype = rpcrdma_replych; + + /* + * Chunks needed for arguments? + * + * o If the total request is under the inline threshold, all ops + * are sent as inline. + * o Large write ops transmit data as read chunk(s), header as + * inline. + * o Large non-write ops are sent with the entire message as a + * single read chunk (protocol 0-position special case). + * + * This assumes that the upper layer does not present a request + * that both has a data payload, and whose non-data arguments + * by themselves are larger than the inline threshold. + */ + if (rpcrdma_args_inline(r_xprt, rqst)) { + *p++ = rdma_msg; + rtype = rpcrdma_noch; + } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + *p++ = rdma_msg; + rtype = rpcrdma_readch; + } else { + r_xprt->rx_stats.nomsg_call_count++; + *p++ = rdma_nomsg; + rtype = rpcrdma_areadch; + } + + /* If this is a retransmit, discard previously registered + * chunks. Very likely the connection has been replaced, + * so these registrations are invalid and unusable. + */ + while (unlikely(!list_empty(&req->rl_registered))) { + struct rpcrdma_mr *mr; + + mr = rpcrdma_mr_pop(&req->rl_registered); + rpcrdma_mr_defer_recovery(mr); + } + + /* This implementation supports the following combinations + * of chunk lists in one RPC-over-RDMA Call message: + * + * - Read list + * - Write list + * - Reply chunk + * - Read list + Reply chunk + * + * It might not yet support the following combinations: + * + * - Read list + Write list + * + * It does not support the following combinations: + * + * - Write list + Reply chunk + * - Read list + Write list + Reply chunk + * + * This implementation supports only a single chunk in each + * Read or Write list. Thus for example the client cannot + * send a Call message with a Position Zero Read chunk and a + * regular Read chunk at the same time. 
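+ *
+ * The encoding below emits the three lists in protocol order -- Read
+ * list, Write list, Reply chunk. The interleaved
+ * encode_item_not_present() calls terminate the Read and Write lists
+ * (or stand in for an empty list), and the Reply chunk position holds
+ * either the chunk or a bare "not present" discriminator.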
+ */ + if (rtype != rpcrdma_noch) { + ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); + if (ret) + goto out_err; + } + ret = encode_item_not_present(xdr); + if (ret) + goto out_err; + + if (wtype == rpcrdma_writech) { + ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); + if (ret) + goto out_err; + } + ret = encode_item_not_present(xdr); + if (ret) + goto out_err; + + if (wtype != rpcrdma_replych) + ret = encode_item_not_present(xdr); + else + ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); + if (ret) + goto out_err; + + trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype); + + ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), + &rqst->rq_snd_buf, rtype); + if (ret) + goto out_err; + return 0; + +out_err: + r_xprt->rx_stats.failed_marshal_count++; + return ret; +} + +/** + * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs + * @rqst: controlling RPC request + * @srcp: points to RPC message payload in receive buffer + * @copy_len: remaining length of receive buffer content + * @pad: Write chunk pad bytes needed (zero for pure inline) + * + * The upper layer has set the maximum number of bytes it can + * receive in each component of rq_rcv_buf. These values are set in + * the head.iov_len, page_len, tail.iov_len, and buflen fields. + * + * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in + * many cases this function simply updates iov_base pointers in + * rq_rcv_buf to point directly to the received reply data, to + * avoid copying reply data. + * + * Returns the count of bytes which had to be memcopied. + */ +static unsigned long +rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) +{ + unsigned long fixup_copy_count; + int i, npages, curlen; + char *destp; + struct page **ppages; + int page_base; + + /* The head iovec is redirected to the RPC reply message + * in the receive buffer, to avoid a memcopy. + */ + rqst->rq_rcv_buf.head[0].iov_base = srcp; + rqst->rq_private_buf.head[0].iov_base = srcp; + + /* The contents of the receive buffer that follow + * head.iov_len bytes are copied into the page list. + */ + curlen = rqst->rq_rcv_buf.head[0].iov_len; + if (curlen > copy_len) + curlen = copy_len; + trace_xprtrdma_fixup(rqst, copy_len, curlen); + srcp += curlen; + copy_len -= curlen; + + ppages = rqst->rq_rcv_buf.pages + + (rqst->rq_rcv_buf.page_base >> PAGE_SHIFT); + page_base = offset_in_page(rqst->rq_rcv_buf.page_base); + fixup_copy_count = 0; + if (copy_len && rqst->rq_rcv_buf.page_len) { + int pagelist_len; + + pagelist_len = rqst->rq_rcv_buf.page_len; + if (pagelist_len > copy_len) + pagelist_len = copy_len; + npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + curlen = PAGE_SIZE - page_base; + if (curlen > pagelist_len) + curlen = pagelist_len; + + trace_xprtrdma_fixup_pg(rqst, i, srcp, + copy_len, curlen); + destp = kmap_atomic(ppages[i]); + memcpy(destp + page_base, srcp, curlen); + flush_dcache_page(ppages[i]); + kunmap_atomic(destp); + srcp += curlen; + copy_len -= curlen; + fixup_copy_count += curlen; + pagelist_len -= curlen; + if (!pagelist_len) + break; + page_base = 0; + } + + /* Implicit padding for the last segment in a Write + * chunk is inserted inline at the front of the tail + * iovec. The upper layer ignores the content of + * the pad. Simply ensure inline content in the tail + * that follows the Write chunk is properly aligned. 
+ */ + if (pad) + srcp -= pad; + } + + /* The tail iovec is redirected to the remaining data + * in the receive buffer, to avoid a memcopy. + */ + if (copy_len || pad) { + rqst->rq_rcv_buf.tail[0].iov_base = srcp; + rqst->rq_private_buf.tail[0].iov_base = srcp; + } + + return fixup_copy_count; +} + +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool +rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +{ + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; + + if (rep->rr_proc != rdma_msg) + return false; + + /* Peek at stream contents without advancing. */ + p = xdr_inline_decode(xdr, 0); + + /* Chunk lists */ + if (*p++ != xdr_zero) + return false; + if (*p++ != xdr_zero) + return false; + if (*p++ != xdr_zero) + return false; + + /* RPC header */ + if (*p++ != rep->rr_xid) + return false; + if (*p != cpu_to_be32(RPC_CALL)) + return false; + + /* Now that we are sure this is a backchannel call, + * advance to the RPC header. + */ + p = xdr_inline_decode(xdr, 3 * sizeof(*p)); + if (unlikely(!p)) + goto out_short; + + rpcrdma_bc_receive_call(r_xprt, rep); + return true; + +out_short: + pr_warn("RPC/RDMA short backward direction call\n"); + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) + xprt_disconnect_done(&r_xprt->rx_xprt); + return true; +} +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +{ + return false; +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) +{ + u32 handle; + u64 offset; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + handle = be32_to_cpup(p++); + *length = be32_to_cpup(p++); + xdr_decode_hyper(p, &offset); + + trace_xprtrdma_decode_seg(handle, *length, offset); + return 0; +} + +static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) +{ + u32 segcount, seglength; + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + *length = 0; + segcount = be32_to_cpup(p); + while (segcount--) { + if (decode_rdma_segment(xdr, &seglength)) + return -EIO; + *length += seglength; + } + + return 0; +} + +/* In RPC-over-RDMA Version One replies, a Read list is never + * expected. This decoder is a stub that returns an error if + * a Read list is present. 
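 + * + * On the wire an empty Read list is a single xdr_zero word; any other + * value in that position means the Reply carries a Read list, which + * this decoder treats as a protocol error.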
+ */ +static int decode_read_list(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != xdr_zero)) + return -EIO; + return 0; +} + +/* Supports only one Write chunk in the Write list + */ +static int decode_write_list(struct xdr_stream *xdr, u32 *length) +{ + u32 chunklen; + bool first; + __be32 *p; + + *length = 0; + first = true; + do { + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + if (*p == xdr_zero) + break; + if (!first) + return -EIO; + + if (decode_write_chunk(xdr, &chunklen)) + return -EIO; + *length += chunklen; + first = false; + } while (true); + return 0; +} + +static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + *length = 0; + if (*p != xdr_zero) + if (decode_write_chunk(xdr, length)) + return -EIO; + return 0; +} + +static int +rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + struct rpc_rqst *rqst) +{ + struct xdr_stream *xdr = &rep->rr_stream; + u32 writelist, replychunk, rpclen; + char *base; + + /* Decode the chunk lists */ + if (decode_read_list(xdr)) + return -EIO; + if (decode_write_list(xdr, &writelist)) + return -EIO; + if (decode_reply_chunk(xdr, &replychunk)) + return -EIO; + + /* RDMA_MSG sanity checks */ + if (unlikely(replychunk)) + return -EIO; + + /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ + base = (char *)xdr_inline_decode(xdr, 0); + rpclen = xdr_stream_remaining(xdr); + r_xprt->rx_stats.fixup_copy_count += + rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); + + r_xprt->rx_stats.total_rdma_reply += writelist; + return rpclen + xdr_align_size(writelist); +} + +static noinline int +rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) +{ + struct xdr_stream *xdr = &rep->rr_stream; + u32 writelist, replychunk; + + /* Decode the chunk lists */ + if (decode_read_list(xdr)) + return -EIO; + if (decode_write_list(xdr, &writelist)) + return -EIO; + if (decode_reply_chunk(xdr, &replychunk)) + return -EIO; + + /* RDMA_NOMSG sanity checks */ + if (unlikely(writelist)) + return -EIO; + if (unlikely(!replychunk)) + return -EIO; + + /* Reply chunk buffer already is the reply vector */ + r_xprt->rx_stats.total_rdma_reply += replychunk; + return replychunk; +} + +static noinline int +rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + struct rpc_rqst *rqst) +{ + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + switch (*p) { + case err_vers: + p = xdr_inline_decode(xdr, 2 * sizeof(*p)); + if (!p) + break; + dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n", + rqst->rq_task->tk_pid, __func__, + be32_to_cpup(p), be32_to_cpu(*(p + 1))); + break; + case err_chunk: + dprintk("RPC: %5u: %s: server reports header decoding error\n", + rqst->rq_task->tk_pid, __func__); + break; + default: + dprintk("RPC: %5u: %s: server reports unrecognized error %d\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpup(p)); + } + + r_xprt->rx_stats.bad_reply_count++; + return -EREMOTEIO; +} + +/* Perform XID lookup, reconstruction of the RPC reply, and + * RPC completion while holding the transport lock to ensure + * the rep, rqst, and rq_task pointers remain stable. 
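 + * + * The rqst was pinned by rpcrdma_reply_handler; it is unpinned here + * once xprt_complete_rqst has run.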
+ */ +void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) +{ + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpc_rqst *rqst = rep->rr_rqst; + unsigned long cwnd; + int status; + + xprt->reestablish_timeout = 0; + + switch (rep->rr_proc) { + case rdma_msg: + status = rpcrdma_decode_msg(r_xprt, rep, rqst); + break; + case rdma_nomsg: + status = rpcrdma_decode_nomsg(r_xprt, rep); + break; + case rdma_error: + status = rpcrdma_decode_error(r_xprt, rep, rqst); + break; + default: + status = -EIO; + } + if (status < 0) + goto out_badheader; + +out: + spin_lock(&xprt->recv_lock); + cwnd = xprt->cwnd; + xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(rqst->rq_task); + + xprt_complete_rqst(rqst->rq_task, status); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->recv_lock); + return; + +/* If the incoming reply terminated a pending RPC, the next + * RPC call will post a replacement receive buffer as it is + * being marshaled. + */ +out_badheader: + trace_xprtrdma_reply_hdr(rep); + r_xprt->rx_stats.bad_reply_count++; + status = -EIO; + goto out; +} + +void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + /* Invalidate and unmap the data payloads before waking + * the waiting application. This guarantees the memory + * regions are properly fenced from the server before the + * application accesses the data. It also ensures proper + * send flow control: waking the next RPC waits until this + * RPC has relinquished all its Send Queue entries. + */ + if (!list_empty(&req->rl_registered)) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); + + /* Ensure that any DMA mapped pages associated with + * the Send of the RPC Call have been unmapped before + * allowing the RPC to complete. This protects argument + * memory not controlled by the RPC client from being + * re-used before we're done with it. + */ + if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { + r_xprt->rx_stats.reply_waits_for_send++; + out_of_line_wait_on_bit(&req->rl_flags, + RPCRDMA_REQ_F_TX_RESOURCES, + bit_wait, + TASK_UNINTERRUPTIBLE); + } +} + +/* Reply handling runs in the poll worker thread. Anything that + * might wait is deferred to a separate workqueue. + */ +void rpcrdma_deferred_completion(struct work_struct *work) +{ + struct rpcrdma_rep *rep = + container_of(work, struct rpcrdma_rep, rr_work); + struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + + trace_xprtrdma_defer_cmp(rep); + if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) + r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered); + rpcrdma_release_rqst(r_xprt, req); + rpcrdma_complete_rqst(rep); +} + +/* Process received RPC/RDMA messages. + * + * Errors must result in the RPC task either being awakened, or + * allowed to timeout, to discover the errors at that time. 
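 + * + * Every RPC/RDMA message begins with four fixed XDR words (XID, + * version, credit value, procedure); these are decoded below before + * the reply is matched to a pending rqst.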
+ */ +void rpcrdma_reply_handler(struct rpcrdma_rep *rep) +{ + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + u32 credits; + __be32 *p; + + if (rep->rr_hdrbuf.head[0].iov_len == 0) + goto out_badstatus; + + xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, + rep->rr_hdrbuf.head[0].iov_base); + + /* Fixed transport header fields */ + p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); + if (unlikely(!p)) + goto out_shortreply; + rep->rr_xid = *p++; + rep->rr_vers = *p++; + credits = be32_to_cpu(*p++); + rep->rr_proc = *p++; + + if (rep->rr_vers != rpcrdma_version) + goto out_badversion; + + if (rpcrdma_is_bcall(r_xprt, rep)) + return; + + /* Match incoming rpcrdma_rep to an rpcrdma_req to + * get context for handling any incoming chunks. + */ + spin_lock(&xprt->recv_lock); + rqst = xprt_lookup_rqst(xprt, rep->rr_xid); + if (!rqst) + goto out_norqst; + xprt_pin_rqst(rqst); + + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > buf->rb_max_requests) + credits = buf->rb_max_requests; + buf->rb_credits = credits; + + spin_unlock(&xprt->recv_lock); + + req = rpcr_to_rdmar(rqst); + req->rl_reply = rep; + rep->rr_rqst = rqst; + clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); + + trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); + + queue_work(rpcrdma_receive_wq, &rep->rr_work); + return; + +out_badstatus: + rpcrdma_recv_buffer_put(rep); + if (r_xprt->rx_ep.rep_connected == 1) { + r_xprt->rx_ep.rep_connected = -EIO; + rpcrdma_conn_func(&r_xprt->rx_ep); + } + return; + +out_badversion: + trace_xprtrdma_reply_vers(rep); + goto repost; + +/* The RPC transaction has already been terminated, or the header + * is corrupt. + */ +out_norqst: + spin_unlock(&xprt->recv_lock); + trace_xprtrdma_reply_rqst(rep); + goto repost; + +out_shortreply: + trace_xprtrdma_reply_short(rep); + +/* If no pending RPC transaction was matched, post a replacement + * receive buffer before returning. + */ +repost: + r_xprt->rx_stats.bad_reply_count++; + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) + rpcrdma_recv_buffer_put(rep); +} diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c new file mode 100644 index 0000000..dd8a431 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* RPC/RDMA parameters */ +unsigned int svcrdma_ord = 16; /* historical default */ +static unsigned int min_ord = 1; +static unsigned int max_ord = 255; +unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; +static unsigned int min_max_requests = 4; +static unsigned int max_max_requests = 16384; +unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; + +atomic_t rdma_stat_recv; +atomic_t rdma_stat_read; +atomic_t rdma_stat_write; +atomic_t rdma_stat_sq_starve; +atomic_t rdma_stat_rq_starve; +atomic_t rdma_stat_rq_poll; +atomic_t rdma_stat_rq_prod; +atomic_t rdma_stat_sq_poll; +atomic_t rdma_stat_sq_prod; + +struct workqueue_struct *svc_rdma_wq; + +/* + * This function implements reading and resetting an atomic_t stat + * variable through read/write to a proc file. Any write to the file + * resets the associated statistic to zero. Any read returns its + * current value.
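 + * + * For example, reading /proc/sys/sunrpc/svc_rdma/rdma_stat_recv + * reports the current receive count, and writing any value to that + * file resets the count to zero.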
+ */ +static int read_reset_stat(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + atomic_t *stat = (atomic_t *)table->data; + + if (!stat) + return -EINVAL; + + if (write) + atomic_set(stat, 0); + else { + char str_buf[32]; + char *data; + int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); + if (len >= 32) + return -EFAULT; + len = strlen(str_buf); + if (*ppos > len) { + *lenp = 0; + return 0; + } + data = &str_buf[*ppos]; + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len && copy_to_user(buffer, str_buf, len)) + return -EFAULT; + *lenp = len; + *ppos += len; + } + return 0; +} + +static struct ctl_table_header *svcrdma_table_header; +static struct ctl_table svcrdma_parm_table[] = { + { + .procname = "max_requests", + .data = &svcrdma_max_requests, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_requests, + .extra2 = &max_max_requests + }, + { + .procname = "max_req_size", + .data = &svcrdma_max_req_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_inline, + .extra2 = &max_max_inline + }, + { + .procname = "max_outbound_read_requests", + .data = &svcrdma_ord, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_ord, + .extra2 = &max_ord, + }, + + { + .procname = "rdma_stat_read", + .data = &rdma_stat_read, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_recv", + .data = &rdma_stat_recv, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_write", + .data = &rdma_stat_write, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_starve", + .data = &rdma_stat_sq_starve, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_starve", + .data = &rdma_stat_rq_starve, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_poll", + .data = &rdma_stat_rq_poll, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_prod", + .data = &rdma_stat_rq_prod, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_poll", + .data = &rdma_stat_sq_poll, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_prod", + .data = &rdma_stat_sq_prod, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { }, +}; + +static struct ctl_table svcrdma_table[] = { + { + .procname = "svc_rdma", + .mode = 0555, + .child = svcrdma_parm_table + }, + { }, +}; + +static struct ctl_table svcrdma_root_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = svcrdma_table + }, + { }, +}; + +void svc_rdma_cleanup(void) +{ + dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); + destroy_workqueue(svc_rdma_wq); + if (svcrdma_table_header) { + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; + } +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + svc_unreg_xprt_class(&svc_rdma_bc_class); +#endif + svc_unreg_xprt_class(&svc_rdma_class); +} + +int svc_rdma_init(void) +{ + dprintk("SVCRDMA Module Init, register RPC RDMA 
transport\n"); + dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); + dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); + + svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); + if (!svc_rdma_wq) + return -ENOMEM; + + if (!svcrdma_table_header) + svcrdma_table_header = + register_sysctl_table(svcrdma_root_table); + + /* Register RDMA with the SVC transport switch */ + svc_reg_xprt_class(&svc_rdma_class); +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + svc_reg_xprt_class(&svc_rdma_bc_class); +#endif + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c new file mode 100644 index 0000000..a73632c --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA (server-side). + */ + +#include +#include +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +#undef SVCRDMA_BACKCHANNEL_DEBUG + +/** + * svc_rdma_handle_bc_reply - Process incoming backchannel reply + * @xprt: controlling backchannel transport + * @rdma_resp: pointer to incoming transport header + * @rcvbuf: XDR buffer into which to decode the reply + * + * Returns: + * %0 if @rcvbuf is filled in, xprt_complete_rqst called, + * %-EAGAIN if server should call ->recvfrom again. + */ +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, + struct xdr_buf *rcvbuf) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct kvec *dst, *src = &rcvbuf->head[0]; + struct rpc_rqst *req; + unsigned long cwnd; + u32 credits; + size_t len; + __be32 xid; + __be32 *p; + int ret; + + p = (__be32 *)src->iov_base; + len = src->iov_len; + xid = *rdma_resp; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: xid=%08x, length=%zu\n", + __func__, be32_to_cpu(xid), len); + pr_info("%s: RPC/RDMA: %*ph\n", + __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp); + pr_info("%s: RPC: %*ph\n", + __func__, (int)len, p); +#endif + + ret = -EAGAIN; + if (src->iov_len < 24) + goto out_shortreply; + + spin_lock(&xprt->recv_lock); + req = xprt_lookup_rqst(xprt, xid); + if (!req) + goto out_notfound; + + dst = &req->rq_private_buf.head[0]; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + if (dst->iov_len < len) + goto out_unlock; + memcpy(dst->iov_base, p, len); + + credits = be32_to_cpup(rdma_resp + 2); + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > r_xprt->rx_buf.rb_bc_max_requests) + credits = r_xprt->rx_buf.rb_bc_max_requests; + + spin_lock_bh(&xprt->transport_lock); + cwnd = xprt->cwnd; + xprt->cwnd = credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(req->rq_task); + spin_unlock_bh(&xprt->transport_lock); + + + ret = 0; + xprt_complete_rqst(req->rq_task, rcvbuf->len); + rcvbuf->len = 0; + +out_unlock: + spin_unlock(&xprt->recv_lock); +out: + return ret; + +out_shortreply: + dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", + xprt, src->iov_len); + goto out; + +out_notfound: + dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", + xprt, be32_to_cpu(xid)); + goto out_unlock; +} + +/* Send a backwards direction RPC call. + * + * Caller holds the connection's mutex and has already marshaled + * the RPC/RDMA request. 
+ * + * This is similar to svc_rdma_send_reply_msg, but takes a struct + * rpc_rqst instead, does not support chunks, and avoids blocking + * memory allocation. + * + * XXX: There is still an opportunity to block in svc_rdma_send() + * if there are no SQ entries to post the Send. This may occur if + * the adapter has a small maximum SQ depth. + */ +static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, + struct rpc_rqst *rqst) +{ + struct svc_rdma_op_ctxt *ctxt; + int ret; + + ctxt = svc_rdma_get_context(rdma); + + /* rpcrdma_bc_send_request builds the transport header and + * the backchannel RPC message in the same buffer. Thus only + * one SGE is needed to send both. + */ + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer, + rqst->rq_snd_buf.len); + if (ret < 0) + goto out_err; + + /* Bump page refcnt so Send completion doesn't release + * the rq_buffer before all retransmits are complete. + */ + get_page(virt_to_page(rqst->rq_buffer)); + ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); + if (ret) + goto out_unmap; + +out_err: + dprintk("svcrdma: %s returns %d\n", __func__, ret); + return ret; + +out_unmap: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + ret = -EIO; + goto out_err; +} + +/* Server-side transport endpoint wants a whole page for its send + * buffer. The client RPC code constructs the RPC header in this + * buffer before it invokes ->send_request. + */ +static int +xprt_rdma_bc_allocate(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize; + struct page *page; + + if (size > PAGE_SIZE) { + WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", + size); + return -EINVAL; + } + + page = alloc_page(RPCRDMA_DEF_GFP); + if (!page) + return -ENOMEM; + rqst->rq_buffer = page_address(page); + + rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP); + if (!rqst->rq_rbuffer) { + put_page(page); + return -ENOMEM; + } + return 0; +} + +static void +xprt_rdma_bc_free(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + + put_page(virt_to_page(rqst->rq_buffer)); + kfree(rqst->rq_rbuffer); +} + +static int +rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + __be32 *p; + int rc; + + /* Space in the send buffer for an RPC/RDMA header is reserved + * via xprt->tsh_size. + */ + p = rqst->rq_buffer; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); +#endif + + rc = svc_rdma_bc_sendto(rdma, rqst); + if (rc) + goto drop_connection; + return rc; + +drop_connection: + dprintk("svcrdma: failed to send bc call\n"); + xprt_disconnect_done(xprt); + return -ENOTCONN; +} + +/* Send an RPC call on the passive end of a transport + * connection. 
+ */ +static int +xprt_rdma_bc_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + int ret; + + dprintk("svcrdma: sending bc call with xid: %08x\n", + be32_to_cpu(rqst->rq_xid)); + + if (!mutex_trylock(&sxprt->xpt_mutex)) { + rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); + if (!mutex_trylock(&sxprt->xpt_mutex)) + return -EAGAIN; + rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); + } + + ret = -ENOTCONN; + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) + ret = rpcrdma_bc_send_request(rdma, rqst); + + mutex_unlock(&sxprt->xpt_mutex); + + if (ret < 0) + return ret; + return 0; +} + +static void +xprt_rdma_bc_close(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); +} + +static void +xprt_rdma_bc_put(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + + xprt_free(xprt); + module_put(THIS_MODULE); +} + +static const struct rpc_xprt_ops xprt_rdma_bc_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, + .buf_alloc = xprt_rdma_bc_allocate, + .buf_free = xprt_rdma_bc_free, + .send_request = xprt_rdma_bc_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xprt_rdma_bc_close, + .destroy = xprt_rdma_bc_put, + .print_stats = xprt_rdma_print_stats +}; + +static const struct rpc_timeout xprt_rdma_bc_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/* It shouldn't matter if the number of backchannel session slots + * doesn't match the number of RPC/RDMA credits. That just means + * one or the other will have extra slots that aren't used. 
+ */ +static struct rpc_xprt * +xprt_setup_rdma_bc(struct xprt_create *args) +{ + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(*new_xprt), + RPCRDMA_MAX_BC_REQUESTS, + RPCRDMA_MAX_BC_REQUESTS); + if (!xprt) { + dprintk("RPC: %s: couldn't allocate rpc_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + xprt->timeout = &xprt_rdma_bc_timeout; + xprt_set_bound(xprt); + xprt_set_connected(xprt); + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->prot = XPRT_TRANSPORT_BC_RDMA; + xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32); + xprt->ops = &xprt_rdma_bc_procs; + + memcpy(&xprt->addr, args->dstaddr, args->addrlen); + xprt->addrlen = args->addrlen; + xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr); + xprt->resvport = 0; + + xprt->max_payload = xprt_rdma_max_inline_read; + + new_xprt = rpcx_to_rdmax(xprt); + new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs; + + xprt_get(xprt); + args->bc_xprt->xpt_bc_xprt = xprt; + xprt->bc_xprt = args->bc_xprt; + + if (!try_module_get(THIS_MODULE)) + goto out_fail; + + /* Final put for backchannel xprt is in __svc_rdma_free */ + xprt_get(xprt); + return xprt; + +out_fail: + xprt_rdma_free_addresses(xprt); + args->bc_xprt->xpt_bc_xprt = NULL; + args->bc_xprt->xpt_bc_xps = NULL; + xprt_put(xprt); + xprt_free(xprt); + return ERR_PTR(-EINVAL); +} + +struct xprt_class xprt_rdma_bc = { + .list = LIST_HEAD_INIT(xprt_rdma_bc.list), + .name = "rdma backchannel", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_BC_RDMA, + .setup = xprt_setup_rdma_bc, +}; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c new file mode 100644 index 0000000..3d45015 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -0,0 +1,587 @@ +/* + * Copyright (c) 2016, 2017 Oracle. All rights reserved. + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +/* Operation + * + * The main entry point is svc_rdma_recvfrom. This is called from + * svc_recv when the transport indicates there is incoming data to + * be read. "Data Ready" is signaled when an RDMA Receive completes, + * or when a set of RDMA Reads complete. + * + * An svc_rqst is passed in. This structure contains an array of + * free pages (rq_pages) that will contain the incoming RPC message. + * + * Short messages are moved directly into svc_rqst::rq_arg, and + * the RPC Call is ready to be processed by the Upper Layer. + * svc_rdma_recvfrom returns the length of the RPC Call message, + * completing the reception of the RPC Call. + * + * However, when an incoming message has Read chunks, + * svc_rdma_recvfrom must post RDMA Reads to pull the RPC Call's + * data payload from the client. svc_rdma_recvfrom sets up the + * RDMA Reads using pages in svc_rqst::rq_pages, which are + * transferred to an svc_rdma_op_ctxt for the duration of the + * I/O. svc_rdma_recvfrom then returns zero, since the RPC message + * is still not yet ready. + * + * When the Read chunk payloads have become available on the + * server, "Data Ready" is raised again, and svc_recv calls + * svc_rdma_recvfrom again. This second call may use a different + * svc_rqst than the first one, thus any information that needs + * to be preserved across these two calls is kept in an + * svc_rdma_op_ctxt. + * + * The second call to svc_rdma_recvfrom performs final assembly + * of the RPC Call message, using the RDMA Read sink pages kept in + * the svc_rdma_op_ctxt. The xdr_buf is copied from the + * svc_rdma_op_ctxt to the second svc_rqst. The second call returns + * the length of the completed RPC Call message. + * + * Page Management + * + * Pages under I/O must be transferred from the first svc_rqst to an + * svc_rdma_op_ctxt before the first svc_rdma_recvfrom call returns. + * + * The first svc_rqst supplies pages for RDMA Reads. These are moved + * from rqstp::rq_pages into ctxt::pages. The consumed elements of + * the rq_pages array are set to NULL and refilled with the first + * svc_rdma_recvfrom call returns. + * + * During the second svc_rdma_recvfrom call, RDMA Read sink pages + * are transferred from the svc_rdma_op_ctxt to the second svc_rqst + * (see rdma_read_complete() below). + */ + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* + * Replace the pages in the rq_argpages array with the pages from the SGE in + * the RDMA_RECV completion. The SGL should contain full pages up until the + * last one. + */ +static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *ctxt) +{ + struct page *page; + int sge_no; + u32 len; + + /* The reply path assumes the Call's transport header resides + * in rqstp->rq_pages[0]. 
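 + * The page holding the received header is therefore swapped into + * slot 0 below and the page it displaces is released.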
+ */ + page = ctxt->pages[0]; + put_page(rqstp->rq_pages[0]); + rqstp->rq_pages[0] = page; + + /* Set up the XDR head */ + rqstp->rq_arg.head[0].iov_base = page_address(page); + rqstp->rq_arg.head[0].iov_len = + min_t(size_t, ctxt->byte_len, ctxt->sge[0].length); + rqstp->rq_arg.len = ctxt->byte_len; + rqstp->rq_arg.buflen = ctxt->byte_len; + + /* Compute bytes past head in the SGL */ + len = ctxt->byte_len - rqstp->rq_arg.head[0].iov_len; + + /* If data remains, store it in the pagelist */ + rqstp->rq_arg.page_len = len; + rqstp->rq_arg.page_base = 0; + + sge_no = 1; + while (len && sge_no < ctxt->count) { + page = ctxt->pages[sge_no]; + put_page(rqstp->rq_pages[sge_no]); + rqstp->rq_pages[sge_no] = page; + len -= min_t(u32, len, ctxt->sge[sge_no].length); + sge_no++; + } + rqstp->rq_respages = &rqstp->rq_pages[sge_no]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* If not all pages were used from the SGL, free the remaining ones */ + len = sge_no; + while (sge_no < ctxt->count) { + page = ctxt->pages[sge_no++]; + put_page(page); + } + ctxt->count = len; + + /* Set up tail */ + rqstp->rq_arg.tail[0].iov_base = NULL; + rqstp->rq_arg.tail[0].iov_len = 0; +} + +/* This accommodates the largest possible Write chunk, + * in one segment. + */ +#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) + +/* This accommodates the largest possible Position-Zero + * Read chunk or Reply chunk, in one segment. + */ +#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) + +/* Sanity check the Read list. + * + * Implementation limits: + * - This implementation supports only one Read chunk. + * + * Sanity checks: + * - Read list does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * The segment count is limited to how many segments can + * fit in the transport header without overflowing the + * buffer. That's about 40 Read segments for a 1KB inline + * threshold. + * + * Returns pointer to the following Write list. + */ +static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) +{ + u32 position; + bool first; + + first = true; + while (*p++ != xdr_zero) { + if (first) { + position = be32_to_cpup(p++); + first = false; + } else if (be32_to_cpup(p++) != position) { + return NULL; + } + p++; /* handle */ + if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG) + return NULL; + p += 2; /* offset */ + + if (p > end) + return NULL; + } + return p; +} + +/* The segment count is limited to how many segments can + * fit in the transport header without overflowing the + * buffer. That's about 60 Write segments for a 1KB inline + * threshold. + */ +static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end, + u32 maxlen) +{ + u32 i, segcount; + + segcount = be32_to_cpup(p++); + for (i = 0; i < segcount; i++) { + p++; /* handle */ + if (be32_to_cpup(p++) > maxlen) + return NULL; + p += 2; /* offset */ + + if (p > end) + return NULL; + } + + return p; +} + +/* Sanity check the Write list. + * + * Implementation limits: + * - This implementation supports only one Write chunk. + * + * Sanity checks: + * - Write list does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * Returns pointer to the following Reply chunk. 
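 + * + * Each Write chunk is a counted array of segments, and each segment + * is four XDR words (a handle, a length, and a 64-bit offset), as + * walked by xdr_check_write_chunk above.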
+ */ +static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end) +{ + u32 chcount; + + chcount = 0; + while (*p++ != xdr_zero) { + p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG); + if (!p) + return NULL; + if (chcount++ > 1) + return NULL; + } + return p; +} + +/* Sanity check the Reply chunk. + * + * Sanity checks: + * - Reply chunk does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * Returns pointer to the following RPC header. + */ +static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) +{ + if (*p++ != xdr_zero) { + p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG); + if (!p) + return NULL; + } + return p; +} + +/* On entry, xdr->head[0].iov_base points to first byte in the + * RPC-over-RDMA header. + * + * On successful exit, head[0] points to first byte past the + * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. + * The length of the RPC-over-RDMA header is returned. + * + * Assumptions: + * - The transport header is entirely contained in the head iovec. + */ +static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) +{ + __be32 *p, *end, *rdma_argp; + unsigned int hdr_len; + char *proc; + + /* Verify that there's enough bytes for header + something */ + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) + goto out_short; + + rdma_argp = rq_arg->head[0].iov_base; + if (*(rdma_argp + 1) != rpcrdma_version) + goto out_version; + + switch (*(rdma_argp + 3)) { + case rdma_msg: + proc = "RDMA_MSG"; + break; + case rdma_nomsg: + proc = "RDMA_NOMSG"; + break; + + case rdma_done: + goto out_drop; + + case rdma_error: + goto out_drop; + + default: + goto out_proc; + } + + end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); + p = xdr_check_read_list(rdma_argp + 4, end); + if (!p) + goto out_inval; + p = xdr_check_write_list(p, end); + if (!p) + goto out_inval; + p = xdr_check_reply_chunk(p, end); + if (!p) + goto out_inval; + if (p > end) + goto out_inval; + + rq_arg->head[0].iov_base = p; + hdr_len = (unsigned long)p - (unsigned long)rdma_argp; + rq_arg->head[0].iov_len -= hdr_len; + rq_arg->len -= hdr_len; + dprintk("svcrdma: received %s request for XID 0x%08x, hdr_len=%u\n", + proc, be32_to_cpup(rdma_argp), hdr_len); + return hdr_len; + +out_short: + dprintk("svcrdma: header too short = %d\n", rq_arg->len); + return -EINVAL; + +out_version: + dprintk("svcrdma: bad xprt version: %u\n", + be32_to_cpup(rdma_argp + 1)); + return -EPROTONOSUPPORT; + +out_drop: + dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); + return 0; + +out_proc: + dprintk("svcrdma: bad rdma procedure (%u)\n", + be32_to_cpup(rdma_argp + 3)); + return -EINVAL; + +out_inval: + dprintk("svcrdma: failed to parse transport header\n"); + return -EINVAL; +} + +static void rdma_read_complete(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head) +{ + int page_no; + + /* Copy RPC pages */ + for (page_no = 0; page_no < head->count; page_no++) { + put_page(rqstp->rq_pages[page_no]); + rqstp->rq_pages[page_no] = head->pages[page_no]; + } + + /* Point rq_arg.pages past header */ + rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; + rqstp->rq_arg.page_len = head->arg.page_len; + + /* rq_respages starts after the last arg page */ + rqstp->rq_respages = &rqstp->rq_pages[page_no]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* Rebuild rq_arg head and tail. 
*/ + rqstp->rq_arg.head[0] = head->arg.head[0]; + rqstp->rq_arg.tail[0] = head->arg.tail[0]; + rqstp->rq_arg.len = head->arg.len; + rqstp->rq_arg.buflen = head->arg.buflen; +} + +static void svc_rdma_send_error(struct svcxprt_rdma *xprt, + __be32 *rdma_argp, int status) +{ + struct svc_rdma_op_ctxt *ctxt; + __be32 *p, *err_msgp; + unsigned int length; + struct page *page; + int ret; + + page = alloc_page(GFP_KERNEL); + if (!page) + return; + err_msgp = page_address(page); + + p = err_msgp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = xprt->sc_fc_credits; + *p++ = rdma_error; + if (status == -EPROTONOSUPPORT) { + *p++ = err_vers; + *p++ = rpcrdma_version; + *p++ = rpcrdma_version; + } else { + *p++ = err_chunk; + } + length = (unsigned long)p - (unsigned long)err_msgp; + + /* Map transport header; no RPC message payload */ + ctxt = svc_rdma_get_context(xprt); + ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length); + if (ret) { + dprintk("svcrdma: Error %d mapping send for protocol error\n", + ret); + return; + } + + ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); + if (ret) { + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + } +} + +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, + __be32 *rdma_resp) +{ + __be32 *p; + + if (!xprt->xpt_bc_xprt) + return false; + + p = rdma_resp + 3; + if (*p++ != rdma_msg) + return false; + + if (*p++ != xdr_zero) + return false; + if (*p++ != xdr_zero) + return false; + if (*p++ != xdr_zero) + return false; + + /* XID sanity */ + if (*p++ != *rdma_resp) + return false; + /* call direction */ + if (*p == cpu_to_be32(RPC_CALL)) + return false; + + return true; +} + +/** + * svc_rdma_recvfrom - Receive an RPC call + * @rqstp: request structure into which to receive an RPC Call + * + * Returns: + * The positive number of bytes in the RPC Call message, + * %0 if there were no Calls ready to return, + * %-EINVAL if the Read chunk data is too large, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * + * Called in a loop when XPT_DATA is set. XPT_DATA is cleared only + * when there are no remaining ctxt's to process. + * + * The next ctxt is removed from the "receive" lists. + * + * - If the ctxt completes a Read, then finish assembling the Call + * message and return the number of bytes in the message. + * + * - If the ctxt completes a Receive, then construct the Call + * message from the contents of the Receive buffer. + * + * - If there are no Read chunks in this message, then finish + * assembling the Call message and return the number of bytes + * in the message. + * + * - If there are Read chunks in this message, post Read WRs to + * pull that payload and return 0. 
+ */ +int svc_rdma_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma_xprt = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct svc_rdma_op_ctxt *ctxt; + __be32 *p; + int ret; + + spin_lock(&rdma_xprt->sc_rq_dto_lock); + if (!list_empty(&rdma_xprt->sc_read_complete_q)) { + ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + rdma_read_complete(rqstp, ctxt); + goto complete; + } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { + ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + } else { + /* No new incoming requests, terminate the loop */ + clear_bit(XPT_DATA, &xprt->xpt_flags); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + return 0; + } + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + + dprintk("svcrdma: recvfrom: ctxt=%p on xprt=%p, rqstp=%p\n", + ctxt, rdma_xprt, rqstp); + atomic_inc(&rdma_stat_recv); + + svc_rdma_build_arg_xdr(rqstp, ctxt); + + p = (__be32 *)rqstp->rq_arg.head[0].iov_base; + ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); + if (ret < 0) + goto out_err; + if (ret == 0) + goto out_drop; + rqstp->rq_xprt_hlen = ret; + + if (svc_rdma_is_backchannel_reply(xprt, p)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, + &rqstp->rq_arg); + svc_rdma_put_context(ctxt, 0); + return ret; + } + + p += rpcrdma_fixed_maxsz; + if (*p != xdr_zero) + goto out_readchunk; + +complete: + svc_rdma_put_context(ctxt, 0); + dprintk("svcrdma: recvfrom: xprt=%p, rqstp=%p, rq_arg.len=%u\n", + rdma_xprt, rqstp, rqstp->rq_arg.len); + rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, xprt); + return rqstp->rq_arg.len; + +out_readchunk: + ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p); + if (ret < 0) + goto out_postfail; + return 0; + +out_err: + svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_put_context(ctxt, 0); + return 0; + +out_postfail: + if (ret == -EINVAL) + svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_put_context(ctxt, 1); + return ret; + +out_drop: + svc_rdma_put_context(ctxt, 1); + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c new file mode 100644 index 0000000..12b9a7e --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -0,0 +1,868 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. + */ + +#include +#include +#include + +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc); +static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); + +/* Each R/W context contains state for one chain of RDMA Read or + * Write Work Requests. + * + * Each WR chain handles a single contiguous server-side buffer, + * because scatterlist entries after the first have to start on + * page alignment. xdr_buf iovecs cannot guarantee alignment. + * + * Each WR chain handles only one R_key. Each RPC-over-RDMA segment + * from a client may contain a unique R_key, so each WR chain moves + * up to one segment at a time. + * + * The scatterlist makes this data structure over 4KB in size. 
To + * make it less likely to fail, and to handle the allocation for + * smaller I/O requests without disabling bottom-halves, these + * contexts are created on demand, but cached and reused until the + * controlling svcxprt_rdma is destroyed. + */ +struct svc_rdma_rw_ctxt { + struct list_head rw_list; + struct rdma_rw_ctx rw_ctx; + int rw_nents; + struct sg_table rw_sg_table; + struct scatterlist rw_first_sgl[0]; +}; + +static inline struct svc_rdma_rw_ctxt * +svc_rdma_next_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt, + rw_list); +} + +static struct svc_rdma_rw_ctxt * +svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) +{ + struct svc_rdma_rw_ctxt *ctxt; + + spin_lock(&rdma->sc_rw_ctxt_lock); + + ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); + if (ctxt) { + list_del(&ctxt->rw_list); + spin_unlock(&rdma->sc_rw_ctxt_lock); + } else { + spin_unlock(&rdma->sc_rw_ctxt_lock); + ctxt = kmalloc(sizeof(*ctxt) + + SG_CHUNK_SIZE * sizeof(struct scatterlist), + GFP_KERNEL); + if (!ctxt) + goto out; + INIT_LIST_HEAD(&ctxt->rw_list); + } + + ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; + if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, + ctxt->rw_sg_table.sgl)) { + kfree(ctxt); + ctxt = NULL; + } +out: + return ctxt; +} + +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt) +{ + sg_free_table_chained(&ctxt->rw_sg_table, true); + + spin_lock(&rdma->sc_rw_ctxt_lock); + list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); +} + +/** + * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts + * @rdma: transport about to be destroyed + * + */ +void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { + list_del(&ctxt->rw_list); + kfree(ctxt); + } +} + +/* A chunk context tracks all I/O for moving one Read or Write + * chunk. This is a set of rdma_rw's that handle data movement + * for all segments of one chunk. + * + * These are small, acquired with a single allocator call, and + * no more than one is needed per chunk. They are allocated on + * demand, and not cached. + */ +struct svc_rdma_chunk_ctxt { + struct ib_cqe cc_cqe; + struct svcxprt_rdma *cc_rdma; + struct list_head cc_rwctxts; + int cc_sqecount; +}; + +static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc) +{ + cc->cc_rdma = rdma; + svc_xprt_get(&rdma->sc_xprt); + + INIT_LIST_HEAD(&cc->cc_rwctxts); + cc->cc_sqecount = 0; +} + +static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { + list_del(&ctxt->rw_list); + + rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, dir); + svc_rdma_put_rw_ctxt(rdma, ctxt); + } + svc_xprt_put(&rdma->sc_xprt); +} + +/* State for sending a Write or Reply chunk.
+ * - Tracks progress of writing one chunk over all its segments + * - Stores arguments for the SGL constructor functions + */ +struct svc_rdma_write_info { + /* write state of this chunk */ + unsigned int wi_seg_off; + unsigned int wi_seg_no; + unsigned int wi_nsegs; + __be32 *wi_segs; + + /* SGL constructor arguments */ + struct xdr_buf *wi_xdr; + unsigned char *wi_base; + unsigned int wi_next_off; + + struct svc_rdma_chunk_ctxt wi_cc; +}; + +static struct svc_rdma_write_info * +svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) +{ + struct svc_rdma_write_info *info; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return info; + + info->wi_seg_off = 0; + info->wi_seg_no = 0; + info->wi_nsegs = be32_to_cpup(++chunk); + info->wi_segs = ++chunk; + svc_rdma_cc_init(rdma, &info->wi_cc); + info->wi_cc.cc_cqe.done = svc_rdma_write_done; + return info; +} + +static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +{ + svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE); + kfree(info); +} + +/** + * svc_rdma_write_done - Write chunk completion + * @cq: controlling Completion Queue + * @wc: Work Completion + * + * Pages under I/O are freed by a subsequent Send completion. + */ +static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_write_info *info = + container_of(cc, struct svc_rdma_write_info, wi_cc); + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: write ctx: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + } + + svc_rdma_write_info_free(info); +} + +/* State for pulling a Read chunk. 
+ */ +struct svc_rdma_read_info { + struct svc_rdma_op_ctxt *ri_readctxt; + unsigned int ri_position; + unsigned int ri_pageno; + unsigned int ri_pageoff; + unsigned int ri_chunklen; + + struct svc_rdma_chunk_ctxt ri_cc; +}; + +static struct svc_rdma_read_info * +svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_read_info *info; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return info; + + svc_rdma_cc_init(rdma, &info->ri_cc); + info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done; + return info; +} + +static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) +{ + svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE); + kfree(info); +} + +/** + * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx + * @cq: controlling Completion Queue + * @wc: Work Completion + * + */ +static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_read_info *info = + container_of(cc, struct svc_rdma_read_info, ri_cc); + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: read ctx: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + svc_rdma_put_context(info->ri_readctxt, 1); + } else { + spin_lock(&rdma->sc_rq_dto_lock); + list_add_tail(&info->ri_readctxt->list, + &rdma->sc_read_complete_q); + spin_unlock(&rdma->sc_rq_dto_lock); + + set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); + svc_xprt_enqueue(&rdma->sc_xprt); + } + + svc_rdma_read_info_free(info); +} + +/* This function sleeps when the transport's Send Queue is congested. + * + * Assumptions: + * - If ib_post_send() succeeds, only one completion is expected, + * even if one or more WRs are flushed. This is true when posting + * an rdma_rw_ctx or when posting a single signaled WR. + */ +static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_xprt *xprt = &rdma->sc_xprt; + struct ib_send_wr *first_wr, *bad_wr; + struct list_head *tmp; + struct ib_cqe *cqe; + int ret; + + if (cc->cc_sqecount > rdma->sc_sq_depth) + return -EINVAL; + + first_wr = NULL; + cqe = &cc->cc_cqe; + list_for_each(tmp, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *ctxt; + + ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + + do { + if (atomic_sub_return(cc->cc_sqecount, + &rdma->sc_sq_avail) > 0) { + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + break; + return 0; + } + + atomic_inc(&rdma_stat_sq_starve); + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); + } while (1); + + pr_err("svcrdma: ib_post_send failed (%d)\n", ret); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + + /* If even one was posted, there will be a completion. 
*/ + if (bad_wr != first_wr) + return 0; + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + return -ENOTCONN; +} + +/* Build and DMA-map an SGL that covers one kvec in an xdr_buf + */ +static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt) +{ + struct scatterlist *sg = ctxt->rw_sg_table.sgl; + + sg_set_buf(&sg[0], info->wi_base, len); + info->wi_base += len; + + ctxt->rw_nents = 1; +} + +/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. + */ +static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, + unsigned int remaining, + struct svc_rdma_rw_ctxt *ctxt) +{ + unsigned int sge_no, sge_bytes, page_off, page_no; + struct xdr_buf *xdr = info->wi_xdr; + struct scatterlist *sg; + struct page **page; + + page_off = info->wi_next_off + xdr->page_base; + page_no = page_off >> PAGE_SHIFT; + page_off = offset_in_page(page_off); + page = xdr->pages + page_no; + info->wi_next_off += remaining; + sg = ctxt->rw_sg_table.sgl; + sge_no = 0; + do { + sge_bytes = min_t(unsigned int, remaining, + PAGE_SIZE - page_off); + sg_set_page(sg, *page, sge_bytes, page_off); + + remaining -= sge_bytes; + sg = sg_next(sg); + page_off = 0; + sge_no++; + page++; + } while (remaining); + + ctxt->rw_nents = sge_no; +} + +/* Construct RDMA Write WRs to send a portion of an xdr_buf containing + * an RPC Reply. + */ +static int +svc_rdma_build_writes(struct svc_rdma_write_info *info, + void (*constructor)(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt), + unsigned int remaining) +{ + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + __be32 *seg; + int ret; + + seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; + do { + unsigned int write_len; + u32 seg_length, seg_handle; + u64 seg_offset; + + if (info->wi_seg_no >= info->wi_nsegs) + goto out_overflow; + + seg_handle = be32_to_cpup(seg); + seg_length = be32_to_cpup(seg + 1); + xdr_decode_hyper(seg + 2, &seg_offset); + seg_offset += info->wi_seg_off; + + write_len = min(remaining, seg_length - info->wi_seg_off); + ctxt = svc_rdma_get_rw_ctxt(rdma, + (write_len >> PAGE_SHIFT) + 2); + if (!ctxt) + goto out_noctx; + + constructor(info, write_len, ctxt); + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, 0, seg_offset, + seg_handle, DMA_TO_DEVICE); + if (ret < 0) + goto out_initerr; + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + if (write_len == seg_length - info->wi_seg_off) { + seg += 4; + info->wi_seg_no++; + info->wi_seg_off = 0; + } else { + info->wi_seg_off += write_len; + } + remaining -= write_len; + } while (remaining); + + return 0; + +out_overflow: + dprintk("svcrdma: inadequate space in Write chunk (%u)\n", + info->wi_nsegs); + return -E2BIG; + +out_noctx: + dprintk("svcrdma: no R/W ctxs available\n"); + return -ENOMEM; + +out_initerr: + svc_rdma_put_rw_ctxt(rdma, ctxt); + pr_err("svcrdma: failed to map pagelist (%d)\n", ret); + return -EIO; +} + +/* Send one of an xdr_buf's kvecs by itself. To send a Reply + * chunk, the whole RPC Reply is written back to the client. + * This function writes either the head or tail of the xdr_buf + * containing the Reply. 
+ */ +static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, + struct kvec *vec) +{ + info->wi_base = vec->iov_base; + return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, + vec->iov_len); +} + +/* Send an xdr_buf's page list by itself. A Write chunk is + * just the page list. a Reply chunk is the head, page list, + * and tail. This function is shared between the two types + * of chunk. + */ +static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, + struct xdr_buf *xdr) +{ + info->wi_xdr = xdr; + info->wi_next_off = 0; + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, + xdr->page_len); +} + +/** + * svc_rdma_send_write_chunk - Write all segments in a Write chunk + * @rdma: controlling RDMA transport + * @wr_ch: Write chunk provided by client + * @xdr: xdr_buf containing the data payload + * + * Returns a non-negative number of bytes the chunk consumed, or + * %-E2BIG if the payload was larger than the Write chunk, + * %-EINVAL if client provided too many segments, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + */ +int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, + struct xdr_buf *xdr) +{ + struct svc_rdma_write_info *info; + int ret; + + if (!xdr->page_len) + return 0; + + info = svc_rdma_write_info_alloc(rdma, wr_ch); + if (!info) + return -ENOMEM; + + ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (ret < 0) + goto out_err; + + ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + if (ret < 0) + goto out_err; + return xdr->page_len; + +out_err: + svc_rdma_write_info_free(info); + return ret; +} + +/** + * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk + * @rdma: controlling RDMA transport + * @rp_ch: Reply chunk provided by client + * @writelist: true if client provided a Write list + * @xdr: xdr_buf containing an RPC Reply + * + * Returns a non-negative number of bytes the chunk consumed, or + * %-E2BIG if the payload was larger than the Reply chunk, + * %-EINVAL if client provided too many segments, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + */ +int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, + bool writelist, struct xdr_buf *xdr) +{ + struct svc_rdma_write_info *info; + int consumed, ret; + + info = svc_rdma_write_info_alloc(rdma, rp_ch); + if (!info) + return -ENOMEM; + + ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]); + if (ret < 0) + goto out_err; + consumed = xdr->head[0].iov_len; + + /* Send the page list in the Reply chunk only if the + * client did not provide Write chunks. 
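+ * (Illustrative summary: when a Write chunk is present, the payload pages
+ * were already pushed by svc_rdma_send_write_chunk(), so the Reply chunk
+ * carries only head and tail; otherwise it carries head, page list, and
+ * tail, i.e. the whole rq_res.)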
+ */ + if (!writelist && xdr->page_len) { + ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (ret < 0) + goto out_err; + consumed += xdr->page_len; + } + + if (xdr->tail[0].iov_len) { + ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]); + if (ret < 0) + goto out_err; + consumed += xdr->tail[0].iov_len; + } + + ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + if (ret < 0) + goto out_err; + return consumed; + +out_err: + svc_rdma_write_info_free(info); + return ret; +} + +static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, + struct svc_rqst *rqstp, + u32 rkey, u32 len, u64 offset) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; + struct svc_rdma_rw_ctxt *ctxt; + unsigned int sge_no, seg_len; + struct scatterlist *sg; + int ret; + + sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; + ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); + if (!ctxt) + goto out_noctx; + ctxt->rw_nents = sge_no; + + dprintk("svcrdma: reading segment %u@0x%016llx:0x%08x (%u sges)\n", + len, offset, rkey, sge_no); + + sg = ctxt->rw_sg_table.sgl; + for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { + seg_len = min_t(unsigned int, len, + PAGE_SIZE - info->ri_pageoff); + + head->arg.pages[info->ri_pageno] = + rqstp->rq_pages[info->ri_pageno]; + if (!info->ri_pageoff) + head->count++; + + sg_set_page(sg, rqstp->rq_pages[info->ri_pageno], + seg_len, info->ri_pageoff); + sg = sg_next(sg); + + info->ri_pageoff += seg_len; + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + len -= seg_len; + + /* Safety check */ + if (len && + &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end) + goto out_overrun; + } + + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp, + cc->cc_rdma->sc_port_num, + ctxt->rw_sg_table.sgl, ctxt->rw_nents, + 0, offset, rkey, DMA_FROM_DEVICE); + if (ret < 0) + goto out_initerr; + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + return 0; + +out_noctx: + dprintk("svcrdma: no R/W ctxs available\n"); + return -ENOMEM; + +out_overrun: + dprintk("svcrdma: request overruns rq_pages\n"); + return -EINVAL; + +out_initerr: + svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt); + pr_err("svcrdma: failed to map pagelist (%d)\n", ret); + return -EIO; +} + +/* Walk the segments in the Read chunk starting at @p and construct + * RDMA Read operations to pull the chunk to the server. + */ +static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + int ret; + + ret = -EINVAL; + info->ri_chunklen = 0; + while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { + u32 rs_handle, rs_length; + u64 rs_offset; + + rs_handle = be32_to_cpup(p++); + rs_length = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &rs_offset); + + ret = svc_rdma_build_read_segment(info, rqstp, + rs_handle, rs_length, + rs_offset); + if (ret < 0) + break; + + info->ri_chunklen += rs_length; + } + + return ret; +} + +/* Construct RDMA Reads to pull over a normal Read chunk. The chunk + * data lands in the page list of head->arg.pages. + * + * Currently NFSD does not look at the head->arg.tail[0] iovec. + * Therefore, XDR round-up of the Read chunk and trailing + * inline content must both be added at the end of the pagelist. 
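+ *
+ * Worked example (illustrative only): a client that sends a 5-byte Read
+ * chunk yields ri_chunklen = 5 after the pull, and the round-up below
+ * computes XDR_QUADLEN(5) << 2 == 8, so the three pad bytes are accounted
+ * for at the end of the page list rather than in arg.tail[0].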
+ */ +static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + int ret; + + dprintk("svcrdma: Reading Read chunk at position %u\n", + info->ri_position); + + info->ri_pageno = head->hdr_count; + info->ri_pageoff = 0; + + ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (ret < 0) + goto out; + + /* Split the Receive buffer between the head and tail + * buffers at Read chunk's position. XDR roundup of the + * chunk is not included in either the pagelist or in + * the tail. + */ + head->arg.tail[0].iov_base = + head->arg.head[0].iov_base + info->ri_position; + head->arg.tail[0].iov_len = + head->arg.head[0].iov_len - info->ri_position; + head->arg.head[0].iov_len = info->ri_position; + + /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). + * + * If the client already rounded up the chunk length, the + * length does not change. Otherwise, the length of the page + * list is increased to include XDR round-up. + * + * Currently these chunks always start at page offset 0, + * thus the rounded-up length never crosses a page boundary. + */ + info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2; + + head->arg.page_len = info->ri_chunklen; + head->arg.len += info->ri_chunklen; + head->arg.buflen += info->ri_chunklen; + +out: + return ret; +} + +/* Construct RDMA Reads to pull over a Position Zero Read chunk. + * The start of the data lands in the first page just after + * the Transport header, and the rest lands in the page list of + * head->arg.pages. + * + * Assumptions: + * - A PZRC has an XDR-aligned length (no implicit round-up). + * - There can be no trailing inline content (IOW, we assume + * a PZRC is never sent in an RDMA_MSG message, though it's + * allowed by spec). + */ +static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + int ret; + + dprintk("svcrdma: Reading Position Zero Read chunk\n"); + + info->ri_pageno = head->hdr_count - 1; + info->ri_pageoff = offset_in_page(head->byte_len); + + ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (ret < 0) + goto out; + + head->arg.len += info->ri_chunklen; + head->arg.buflen += info->ri_chunklen; + + if (head->arg.buflen <= head->sge[0].length) { + /* Transport header and RPC message fit entirely + * in page where head iovec resides. + */ + head->arg.head[0].iov_len = info->ri_chunklen; + } else { + /* Transport header and part of RPC message reside + * in the head iovec's page. + */ + head->arg.head[0].iov_len = + head->sge[0].length - head->byte_len; + head->arg.page_len = + info->ri_chunklen - head->arg.head[0].iov_len; + } + +out: + return ret; +} + +/** + * svc_rdma_recv_read_chunk - Pull a Read chunk from the client + * @rdma: controlling RDMA transport + * @rqstp: set of pages to use as Read sink buffers + * @head: pages under I/O collect here + * @p: pointer to start of Read chunk + * + * Returns: + * %0 if all needed RDMA Reads were posted successfully, + * %-EINVAL if client provided too many segments, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * + * Assumptions: + * - All Read segments in @p have the same Position value. 
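+ *
+ * For reference, each Read list entry decoded by
+ * svc_rdma_build_read_chunk() has this XDR form (illustrative, per
+ * RFC 8166):
+ *
+ *	1		list discriminator
+ *	Position	byte offset into the RPC Call message
+ *	Handle		R_key for the client's memory region
+ *	Length		segment length in bytes
+ *	Offset		8-octet offset into that region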
+ */ +int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, __be32 *p) +{ + struct svc_rdma_read_info *info; + struct page **page; + int ret; + + /* The request (with page list) is constructed in + * head->arg. Pages involved with RDMA Read I/O are + * transferred there. + */ + head->hdr_count = head->count; + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = head->pages; + head->arg.page_base = 0; + head->arg.page_len = 0; + head->arg.len = rqstp->rq_arg.len; + head->arg.buflen = rqstp->rq_arg.buflen; + + info = svc_rdma_read_info_alloc(rdma); + if (!info) + return -ENOMEM; + info->ri_readctxt = head; + + info->ri_position = be32_to_cpup(p + 1); + if (info->ri_position) + ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); + else + ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); + + /* Mark the start of the pages that can be used for the reply */ + if (info->ri_pageoff > 0) + info->ri_pageno++; + rqstp->rq_respages = &rqstp->rq_pages[info->ri_pageno]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + if (ret < 0) + goto out; + + ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); + +out: + /* Read sink pages have been moved from rqstp->rq_pages to + * head->arg.pages. Force svc_recv to refill those slots + * in rq_pages. + */ + for (page = rqstp->rq_pages; page < rqstp->rq_respages; page++) + *page = NULL; + + if (ret < 0) + svc_rdma_read_info_free(info); + return ret; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c new file mode 100644 index 0000000..649441d --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +/* Operation + * + * The main entry point is svc_rdma_sendto. This is called by the + * RPC server when an RPC Reply is ready to be transmitted to a client. + * + * The passed-in svc_rqst contains a struct xdr_buf which holds an + * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA + * transport header, post all Write WRs needed for this Reply, then post + * a Send WR conveying the transport header and the RPC message itself to + * the client. + * + * svc_rdma_sendto must fully transmit the Reply before returning, as + * the svc_rqst will be recycled as soon as sendto returns. Remaining + * resources referred to by the svc_rqst are also recycled at that time. + * Therefore any resources that must remain longer must be detached + * from the svc_rqst and released later. + * + * Page Management + * + * The I/O that performs Reply transmission is asynchronous, and may + * complete well after sendto returns. Thus pages under I/O must be + * removed from the svc_rqst before sendto returns. + * + * The logic here depends on Send Queue and completion ordering. Since + * the Send WR is always posted last, it will always complete last. Thus + * when it completes, it is guaranteed that all previous Write WRs have + * also completed. + * + * Write WRs are constructed and posted. Each Write segment gets its own + * svc_rdma_rw_ctxt, allowing the Write completion handler to find and + * DMA-unmap the pages under I/O for that Write segment. The Write + * completion handler does not release any pages. + * + * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt. + * The ownership of all of the Reply's pages are transferred into that + * ctxt, the Send WR is posted, and sendto returns. + * + * The svc_rdma_op_ctxt is presented when the Send WR completes. The + * Send completion handler finally releases the Reply's pages. + * + * This mechanism also assumes that completions on the transport's Send + * Completion Queue do not run in parallel. Otherwise a Write completion + * and Send completion running at the same time could release pages that + * are still DMA-mapped. + * + * Error Handling + * + * - If the Send WR is posted successfully, it will either complete + * successfully, or get flushed. Either way, the Send completion + * handler releases the Reply's pages. + * - If the Send WR cannot be not posted, the forward path releases + * the Reply's pages. + * + * This handles the case, without the use of page reference counting, + * where two different Write segments send portions of the same page. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static u32 xdr_padsize(u32 len) +{ + return (len & 3) ? (4 - (len & 3)) : 0; +} + +/* Returns length of transport header, in bytes. 
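+ *
+ * Illustrative layout of the RPC-over-RDMA V1 header being measured:
+ *
+ *	XID, vers, credits, proc	rpcrdma_fixed_maxsz words
+ *	Read list			always empty in a Reply
+ *	Write list			zero or more chunks
+ *	Reply chunk			zero or one chunk
+ *
+ * The length is just the pointer distance after skipping each part.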
+ */ +static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp) +{ + unsigned int nsegs; + __be32 *p; + + p = rdma_resp; + + /* RPC-over-RDMA V1 replies never have a Read list. */ + p += rpcrdma_fixed_maxsz + 1; + + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + return (unsigned long)p - (unsigned long)rdma_resp; +} + +/* One Write chunk is copied from Call transport header to Reply + * transport header. Each segment's length field is updated to + * reflect number of bytes consumed in the segment. + * + * Returns number of segments in this chunk. + */ +static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src, + unsigned int remaining) +{ + unsigned int i, nsegs; + u32 seg_len; + + /* Write list discriminator */ + *dst++ = *src++; + + /* number of segments in this chunk */ + nsegs = be32_to_cpup(src); + *dst++ = *src++; + + for (i = nsegs; i; i--) { + /* segment's RDMA handle */ + *dst++ = *src++; + + /* bytes returned in this segment */ + seg_len = be32_to_cpu(*src); + if (remaining >= seg_len) { + /* entire segment was consumed */ + *dst = *src; + remaining -= seg_len; + } else { + /* segment only partly filled */ + *dst = cpu_to_be32(remaining); + remaining = 0; + } + dst++; src++; + + /* segment's RDMA offset */ + *dst++ = *src++; + *dst++ = *src++; + } + + return nsegs; +} + +/* The client provided a Write list in the Call message. Fill in + * the segments in the first Write chunk in the Reply's transport + * header with the number of bytes consumed in each segment. + * Remaining chunks are returned unused. + * + * Assumptions: + * - Client has provided only one Write chunk + */ +static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch, + unsigned int consumed) +{ + unsigned int nsegs; + __be32 *p, *q; + + /* RPC-over-RDMA V1 replies never have a Read list. */ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + q = wr_ch; + while (*q != xdr_zero) { + nsegs = xdr_encode_write_chunk(p, q, consumed); + q += 2 + nsegs * rpcrdma_segment_maxsz; + p += 2 + nsegs * rpcrdma_segment_maxsz; + consumed = 0; + } + + /* Terminate Write list */ + *p++ = xdr_zero; + + /* Reply chunk discriminator; may be replaced later */ + *p = xdr_zero; +} + +/* The client provided a Reply chunk in the Call message. Fill in + * the segments in the Reply chunk in the Reply message with the + * number of bytes consumed in each segment. + * + * Assumptions: + * - Reply can always fit in the provided Reply chunk + */ +static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, + unsigned int consumed) +{ + __be32 *p; + + /* Find the Reply chunk in the Reply's xprt header. + * RPC-over-RDMA V1 replies never have a Read list. + */ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + /* Skip past Write list */ + while (*p++ != xdr_zero) + p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + + xdr_encode_write_chunk(p, rp_ch, consumed); +} + +/* Parse the RPC Call's transport header. 
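+ *
+ * (Illustrative: each Read list entry is 1 discriminator plus 5 words,
+ * i.e. position, handle, length, and a 2-word offset; each Write chunk is
+ * 1 discriminator, 1 segment count, and nsegs * 4 segment words. That is
+ * what the pointer arithmetic below steps over.)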
+ */ +static void svc_rdma_get_write_arrays(__be32 *rdma_argp, + __be32 **write, __be32 **reply) +{ + __be32 *p; + + p = rdma_argp + rpcrdma_fixed_maxsz; + + /* Read list */ + while (*p++ != xdr_zero) + p += 5; + + /* Write list */ + if (*p != xdr_zero) { + *write = p; + while (*p++ != xdr_zero) + p += 1 + be32_to_cpu(*p) * 4; + } else { + *write = NULL; + p++; + } + + /* Reply chunk */ + if (*p != xdr_zero) + *reply = p; + else + *reply = NULL; +} + +/* RPC-over-RDMA Version One private extension: Remote Invalidation. + * Responder's choice: requester signals it can handle Send With + * Invalidate, and responder chooses one rkey to invalidate. + * + * Find a candidate rkey to invalidate when sending a reply. Picks the + * first R_key it finds in the chunk lists. + * + * Returns zero if RPC's chunk lists are empty. + */ +static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, + __be32 *wr_lst, __be32 *rp_ch) +{ + __be32 *p; + + p = rdma_argp + rpcrdma_fixed_maxsz; + if (*p != xdr_zero) + p += 2; + else if (wr_lst && be32_to_cpup(wr_lst + 1)) + p = wr_lst + 2; + else if (rp_ch && be32_to_cpup(rp_ch + 1)) + p = rp_ch + 2; + else + return 0; + return be32_to_cpup(p); +} + +/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() + * is used during completion to DMA-unmap this memory, and + * it uses ib_dma_unmap_page() exclusively. + */ +static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + unsigned char *base, + unsigned int len) +{ + unsigned long offset = (unsigned long)base & ~PAGE_MASK; + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; + + dma_addr = ib_dma_map_page(dev, virt_to_page(base), + offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) + goto out_maperr; + + ctxt->sge[sge_no].addr = dma_addr; + ctxt->sge[sge_no].length = len; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + svc_rdma_count_mappings(rdma, ctxt); + return 0; + +out_maperr: + pr_err("svcrdma: failed to map buffer\n"); + return -EIO; +} + +static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + struct page *page, + unsigned int offset, + unsigned int len) +{ + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; + + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) + goto out_maperr; + + ctxt->sge[sge_no].addr = dma_addr; + ctxt->sge[sge_no].length = len; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + svc_rdma_count_mappings(rdma, ctxt); + return 0; + +out_maperr: + pr_err("svcrdma: failed to map page\n"); + return -EIO; +} + +/** + * svc_rdma_map_reply_hdr - DMA map the transport header buffer + * @rdma: controlling transport + * @ctxt: op_ctxt for the Send WR + * @rdma_resp: buffer containing transport header + * @len: length of transport header + * + * Returns: + * %0 if the header is DMA mapped, + * %-EIO if DMA mapping failed. + */ +int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + __be32 *rdma_resp, + unsigned int len) +{ + ctxt->direction = DMA_TO_DEVICE; + ctxt->pages[0] = virt_to_page(rdma_resp); + ctxt->count = 1; + return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len); +} + +/* Load the xdr_buf into the ctxt's sge array, and DMA map each + * element as it is added. + * + * Returns the number of sge elements loaded on success, or + * a negative errno on failure. 
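+ *
+ * Note that sge[0] already carries the transport header mapped by
+ * svc_rdma_map_reply_hdr(), which is why mapping starts at sge_no = 1
+ * and why the caller posts one extra SGE, roughly:
+ *
+ *	ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqstp->rq_res, wr_lst);
+ *	...
+ *	ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey);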
+ */ +static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + struct xdr_buf *xdr, __be32 *wr_lst) +{ + unsigned int len, sge_no, remaining, page_off; + struct page **ppages; + unsigned char *base; + u32 xdr_pad; + int ret; + + sge_no = 1; + + ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, + xdr->head[0].iov_base, + xdr->head[0].iov_len); + if (ret < 0) + return ret; + + /* If a Write chunk is present, the xdr_buf's page list + * is not included inline. However the Upper Layer may + * have added XDR padding in the tail buffer, and that + * should not be included inline. + */ + if (wr_lst) { + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; + xdr_pad = xdr_padsize(xdr->page_len); + + if (len && xdr_pad) { + base += xdr_pad; + len -= xdr_pad; + } + + goto tail; + } + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_off = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - page_off, remaining); + + ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++, + *ppages++, page_off, len); + if (ret < 0) + return ret; + + remaining -= len; + page_off = 0; + } + + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; +tail: + if (len) { + ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len); + if (ret < 0) + return ret; + } + + return sge_no - 1; +} + +/* The svc_rqst and all resources it owns are released as soon as + * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt + * so they are released by the Send completion handler. + */ +static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *ctxt) +{ + int i, pages = rqstp->rq_next_page - rqstp->rq_respages; + + ctxt->count += pages; + for (i = 0; i < pages; i++) { + ctxt->pages[i + 1] = rqstp->rq_respages[i]; + rqstp->rq_respages[i] = NULL; + } + rqstp->rq_next_page = rqstp->rq_respages + 1; +} + +/** + * svc_rdma_post_send_wr - Set up and post one Send Work Request + * @rdma: controlling transport + * @ctxt: op_ctxt for transmitting the Send WR + * @num_sge: number of SGEs to send + * @inv_rkey: R_key argument to Send With Invalidate, or zero + * + * Returns: + * %0 if the Send* was posted successfully, + * %-ENOTCONN if the connection was lost or dropped, + * %-EINVAL if there was a problem with the Send we built, + * %-ENOMEM if ib_post_send failed. + */ +int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, int num_sge, + u32 inv_rkey) +{ + struct ib_send_wr *send_wr = &ctxt->send_wr; + + dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge); + + send_wr->next = NULL; + ctxt->cqe.done = svc_rdma_wc_send; + send_wr->wr_cqe = &ctxt->cqe; + send_wr->sg_list = ctxt->sge; + send_wr->num_sge = num_sge; + send_wr->send_flags = IB_SEND_SIGNALED; + if (inv_rkey) { + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = inv_rkey; + } else { + send_wr->opcode = IB_WR_SEND; + } + + return svc_rdma_send(rdma, send_wr); +} + +/* Prepare the portion of the RPC Reply that will be transmitted + * via RDMA Send. The RPC-over-RDMA transport header is prepared + * in sge[0], and the RPC xdr_buf is prepared in following sges. + * + * Depending on whether a Write list or Reply chunk is present, + * the server may send all, a portion of, or none of the xdr_buf. + * In the latter case, only the transport header (sge[0]) is + * transmitted. + * + * RDMA Send is the last step of transmitting an RPC reply. 
Pages + * involved in the earlier RDMA Writes are here transferred out + * of the rqstp and into the ctxt's page array. These pages are + * DMA unmapped by each Write completion, but the subsequent Send + * completion finally releases these pages. + * + * Assumptions: + * - The Reply's transport header will never be larger than a page. + */ +static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, + __be32 *rdma_argp, __be32 *rdma_resp, + struct svc_rqst *rqstp, + __be32 *wr_lst, __be32 *rp_ch) +{ + struct svc_rdma_op_ctxt *ctxt; + u32 inv_rkey; + int ret; + + dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n", + (rp_ch ? "RDMA_NOMSG" : "RDMA_MSG"), + rqstp->rq_res.head[0].iov_len, + rqstp->rq_res.page_len, + rqstp->rq_res.tail[0].iov_len); + + ctxt = svc_rdma_get_context(rdma); + + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, + svc_rdma_reply_hdr_len(rdma_resp)); + if (ret < 0) + goto err; + + if (!rp_ch) { + ret = svc_rdma_map_reply_msg(rdma, ctxt, + &rqstp->rq_res, wr_lst); + if (ret < 0) + goto err; + } + + svc_rdma_save_io_pages(rqstp, ctxt); + + inv_rkey = 0; + if (rdma->sc_snd_w_inv) + inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); + ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey); + if (ret) + goto err; + + return 0; + +err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + return ret; +} + +/* Given the client-provided Write and Reply chunks, the server was not + * able to form a complete reply. Return an RDMA_ERROR message so the + * client can retire this RPC transaction. As above, the Send completion + * routine releases payload pages that were part of a previous RDMA Write. + * + * Remote Invalidation is skipped for simplicity. + */ +static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + __be32 *rdma_resp, struct svc_rqst *rqstp) +{ + struct svc_rdma_op_ctxt *ctxt; + __be32 *p; + int ret; + + ctxt = svc_rdma_get_context(rdma); + + /* Replace the original transport header with an + * RDMA_ERROR response. XID etc are preserved. + */ + p = rdma_resp + 3; + *p++ = rdma_error; + *p = err_chunk; + + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20); + if (ret < 0) + goto err; + + svc_rdma_save_io_pages(rqstp, ctxt); + + ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0); + if (ret) + goto err; + + return 0; + +err: + pr_err("svcrdma: failed to post Send WR (%d)\n", ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + return ret; +} + +void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) +{ +} + +/** + * svc_rdma_sendto - Transmit an RPC reply + * @rqstp: processed RPC request, reply XDR already in ::rq_res + * + * Any resources still associated with @rqstp are released upon return. + * If no reply message was possible, the connection is closed. + * + * Returns: + * %0 if an RPC reply has been successfully posted, + * %-ENOMEM if a resource shortage occurred (connection is lost), + * %-ENOTCONN if posting failed (connection is lost). + */ +int svc_rdma_sendto(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; + struct xdr_buf *xdr = &rqstp->rq_res; + struct page *res_page; + int ret; + + /* Find the call's chunk lists to decide how to send the reply. + * Receive places the Call's xprt header at the start of page 0. 
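+ *
+ * (Illustrative: the reply header built below echoes the Call's XID and
+ * version, inserts the server's credit grant, and selects RDMA_MSG or
+ * RDMA_NOMSG depending on whether the reply body travels in the Reply
+ * chunk.)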
+ */ + rdma_argp = page_address(rqstp->rq_pages[0]); + svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); + + dprintk("svcrdma: preparing response for XID 0x%08x\n", + be32_to_cpup(rdma_argp)); + + /* Create the RDMA response header. xprt->xpt_mutex, + * acquired in svc_send(), serializes RPC replies. The + * code path below that inserts the credit grant value + * into each transport header runs only inside this + * critical section. + */ + ret = -ENOMEM; + res_page = alloc_page(GFP_KERNEL); + if (!res_page) + goto err0; + rdma_resp = page_address(res_page); + + p = rdma_resp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = rdma->sc_fc_credits; + *p++ = rp_ch ? rdma_nomsg : rdma_msg; + + /* Start with empty chunks */ + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; + + if (wr_lst) { + /* XXX: Presume the client sent only one Write chunk */ + ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr); + if (ret < 0) + goto err2; + svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret); + } + if (rp_ch) { + ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr); + if (ret < 0) + goto err2; + svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); + } + + ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp, + wr_lst, rp_ch); + if (ret < 0) + goto err0; + return 0; + + err2: + if (ret != -E2BIG && ret != -EINVAL) + goto err1; + + ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp); + if (ret < 0) + goto err0; + return 0; + + err1: + put_page(res_page); + err0: + pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n", + ret); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + return -ENOTCONN; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c new file mode 100644 index 0000000..96cc8f6 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -0,0 +1,1048 @@ +/* + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static int svc_rdma_post_recv(struct svcxprt_rdma *xprt); +static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int); +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags); +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); +static void svc_rdma_release_rqst(struct svc_rqst *); +static void svc_rdma_detach(struct svc_xprt *xprt); +static void svc_rdma_free(struct svc_xprt *xprt); +static int svc_rdma_has_wspace(struct svc_xprt *xprt); +static void svc_rdma_secure_port(struct svc_rqst *); +static void svc_rdma_kill_temp_xprt(struct svc_xprt *); + +static const struct svc_xprt_ops svc_rdma_ops = { + .xpo_create = svc_rdma_create, + .xpo_recvfrom = svc_rdma_recvfrom, + .xpo_sendto = svc_rdma_sendto, + .xpo_release_rqst = svc_rdma_release_rqst, + .xpo_detach = svc_rdma_detach, + .xpo_free = svc_rdma_free, + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, + .xpo_has_wspace = svc_rdma_has_wspace, + .xpo_accept = svc_rdma_accept, + .xpo_secure_port = svc_rdma_secure_port, + .xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt, +}; + +struct svc_xprt_class svc_rdma_class = { + .xcl_name = "rdma", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_rdma_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA, + .xcl_ident = XPRT_TRANSPORT_RDMA, +}; + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *, + struct sockaddr *, int, int); +static void svc_rdma_bc_detach(struct svc_xprt *); +static void svc_rdma_bc_free(struct svc_xprt *); + +static const struct svc_xprt_ops svc_rdma_bc_ops = { + .xpo_create = svc_rdma_bc_create, + .xpo_detach = svc_rdma_bc_detach, + .xpo_free = svc_rdma_bc_free, + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, + .xpo_secure_port = svc_rdma_secure_port, +}; + +struct svc_xprt_class svc_rdma_bc_class = { + .xcl_name = "rdma-bc", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_rdma_bc_ops, + .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN) +}; + +static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + struct svcxprt_rdma *cma_xprt; + struct svc_xprt *xprt; + + cma_xprt = rdma_create_xprt(serv, 0); + if (!cma_xprt) + return ERR_PTR(-ENOMEM); + xprt = &cma_xprt->sc_xprt; + + svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv); + set_bit(XPT_CONG_CTRL, &xprt->xpt_flags); + serv->sv_bc_xprt = xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, xprt); + return xprt; +} + +static void svc_rdma_bc_detach(struct svc_xprt *xprt) +{ + dprintk("svcrdma: %s(%p)\n", __func__, xprt); +} + +static void svc_rdma_bc_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct 
svcxprt_rdma, sc_xprt); + + dprintk("svcrdma: %s(%p)\n", __func__, xprt); + if (xprt) + kfree(rdma); +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, + gfp_t flags) +{ + struct svc_rdma_op_ctxt *ctxt; + + ctxt = kmalloc(sizeof(*ctxt), flags); + if (ctxt) { + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->list); + } + return ctxt; +} + +static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* Each RPC/RDMA credit can consume one Receive and + * one Send WQE at the same time. + */ + i = xprt->sc_sq_depth + xprt->sc_rq_depth; + + while (i--) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = alloc_ctxt(xprt, GFP_KERNEL); + if (!ctxt) { + dprintk("svcrdma: No memory for RDMA ctxt\n"); + return false; + } + list_add(&ctxt->list, &xprt->sc_ctxts); + } + return true; +} + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt = NULL; + + spin_lock(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used++; + if (list_empty(&xprt->sc_ctxts)) + goto out_empty; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&xprt->sc_ctxt_lock); + +out: + ctxt->count = 0; + ctxt->mapped_sges = 0; + return ctxt; + +out_empty: + /* Either pre-allocation missed the mark, or send + * queue accounting is broken. + */ + spin_unlock(&xprt->sc_ctxt_lock); + + ctxt = alloc_ctxt(xprt, GFP_NOIO); + if (ctxt) + goto out; + + spin_lock(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + spin_unlock(&xprt->sc_ctxt_lock); + WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); + return NULL; +} + +void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) +{ + struct svcxprt_rdma *xprt = ctxt->xprt; + struct ib_device *device = xprt->sc_cm_id->device; + unsigned int i; + + for (i = 0; i < ctxt->mapped_sges; i++) + ib_dma_unmap_page(device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); + ctxt->mapped_sges = 0; +} + +void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) +{ + struct svcxprt_rdma *xprt = ctxt->xprt; + int i; + + if (free_pages) + for (i = 0; i < ctxt->count; i++) + put_page(ctxt->pages[i]); + + spin_lock(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + list_add(&ctxt->list, &xprt->sc_ctxts); + spin_unlock(&xprt->sc_ctxt_lock); +} + +static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) +{ + while (!list_empty(&xprt->sc_ctxts)) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + kfree(ctxt); + } +} + +/* QP event handler */ +static void qp_event_handler(struct ib_event *event, void *context) +{ + struct svc_xprt *xprt = context; + + switch (event->event) { + /* These are considered benign events */ + case IB_EVENT_PATH_MIG: + case IB_EVENT_COMM_EST: + case IB_EVENT_SQ_DRAINED: + case IB_EVENT_QP_LAST_WQE_REACHED: + dprintk("svcrdma: QP event %s (%d) received for QP=%p\n", + ib_event_msg(event->event), event->event, + event->element.qp); + break; + /* These are considered fatal events */ + case IB_EVENT_PATH_MIG_ERR: + case IB_EVENT_QP_FATAL: + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_ACCESS_ERR: + case IB_EVENT_DEVICE_FATAL: + default: + dprintk("svcrdma: QP ERROR event %s (%d) received for QP=%p, " + "closing transport\n", + ib_event_msg(event->event), event->event, + event->element.qp); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + break; + } +} + +/** + * 
svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC + * @cq: completion queue + * @wc: completed WR + * + */ +static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +{ + struct svcxprt_rdma *xprt = cq->cq_context; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_op_ctxt *ctxt; + + /* WARNING: Only wc->wr_cqe and wc->status are reliable */ + ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); + svc_rdma_unmap_dma(ctxt); + + if (wc->status != IB_WC_SUCCESS) + goto flushed; + + /* All wc fields are now known to be valid */ + ctxt->byte_len = wc->byte_len; + spin_lock(&xprt->sc_rq_dto_lock); + list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q); + spin_unlock(&xprt->sc_rq_dto_lock); + + svc_rdma_post_recv(xprt); + + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + if (test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) + goto out; + goto out_enqueue; + +flushed: + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: Recv: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_rdma_put_context(ctxt, 1); + +out_enqueue: + svc_xprt_enqueue(&xprt->sc_xprt); +out: + svc_xprt_put(&xprt->sc_xprt); +} + +/** + * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC + * @cq: completion queue + * @wc: completed WR + * + */ +void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +{ + struct svcxprt_rdma *xprt = cq->cq_context; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_op_ctxt *ctxt; + + atomic_inc(&xprt->sc_sq_avail); + wake_up(&xprt->sc_send_wait); + + ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&xprt->sc_xprt); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: Send: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + } + + svc_xprt_put(&xprt->sc_xprt); +} + +static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, + int listener) +{ + struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); + + if (!cma_xprt) + return NULL; + svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); + INIT_LIST_HEAD(&cma_xprt->sc_accept_q); + INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); + INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); + init_waitqueue_head(&cma_xprt->sc_send_wait); + + spin_lock_init(&cma_xprt->sc_lock); + spin_lock_init(&cma_xprt->sc_rq_dto_lock); + spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); + + /* + * Note that this implies that the underlying transport support + * has some form of congestion control (see RFC 7530 section 3.1 + * paragraph 2). For now, we assume that all supported RDMA + * transports are suitable here. 
+ */ + set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags); + + if (listener) { + strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); + set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); + } + + return cma_xprt; +} + +static int +svc_rdma_post_recv(struct svcxprt_rdma *xprt) +{ + struct ib_recv_wr recv_wr, *bad_recv_wr; + struct svc_rdma_op_ctxt *ctxt; + struct page *page; + dma_addr_t pa; + int sge_no; + int buflen; + int ret; + + ctxt = svc_rdma_get_context(xprt); + buflen = 0; + ctxt->direction = DMA_FROM_DEVICE; + ctxt->cqe.done = svc_rdma_wc_receive; + for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { + if (sge_no >= xprt->sc_max_sge) { + pr_err("svcrdma: Too many sges (%d)\n", sge_no); + goto err_put_ctxt; + } + page = alloc_page(GFP_KERNEL); + if (!page) + goto err_put_ctxt; + ctxt->pages[sge_no] = page; + pa = ib_dma_map_page(xprt->sc_cm_id->device, + page, 0, PAGE_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) + goto err_put_ctxt; + svc_rdma_count_mappings(xprt, ctxt); + ctxt->sge[sge_no].addr = pa; + ctxt->sge[sge_no].length = PAGE_SIZE; + ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; + ctxt->count = sge_no + 1; + buflen += PAGE_SIZE; + } + recv_wr.next = NULL; + recv_wr.sg_list = &ctxt->sge[0]; + recv_wr.num_sge = ctxt->count; + recv_wr.wr_cqe = &ctxt->cqe; + + svc_xprt_get(&xprt->sc_xprt); + ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); + if (ret) { + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + svc_xprt_put(&xprt->sc_xprt); + } + return ret; + + err_put_ctxt: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + return -ENOMEM; +} + +static void +svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt, + struct rdma_conn_param *param) +{ + const struct rpcrdma_connect_private *pmsg = param->private_data; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + newxprt->sc_snd_w_inv = pmsg->cp_flags & + RPCRDMA_CMP_F_SND_W_INV_OK; + + dprintk("svcrdma: client send_size %u, recv_size %u " + "remote inv %ssupported\n", + rpcrdma_decode_buffer_size(pmsg->cp_send_size), + rpcrdma_decode_buffer_size(pmsg->cp_recv_size), + newxprt->sc_snd_w_inv ? "" : "un"); + } +} + +/* + * This function handles the CONNECT_REQUEST event on a listening + * endpoint. It is passed the cma_id for the _new_ connection. The context in + * this cma_id is inherited from the listening cma_id and is the svc_xprt + * structure for the listening endpoint. + * + * This function creates a new xprt for the new connection and enqueues it on + * the accept queue for the listent xprt. When the listen thread is kicked, it + * will call the recvfrom method on the listen xprt which will accept the new + * connection. + */ +static void handle_connect_req(struct rdma_cm_id *new_cma_id, + struct rdma_conn_param *param) +{ + struct svcxprt_rdma *listen_xprt = new_cma_id->context; + struct svcxprt_rdma *newxprt; + struct sockaddr *sa; + + /* Create a new transport */ + newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); + if (!newxprt) { + dprintk("svcrdma: failed to create new transport\n"); + return; + } + newxprt->sc_cm_id = new_cma_id; + new_cma_id->context = newxprt; + dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", + newxprt, newxprt->sc_cm_id, listen_xprt); + svc_rdma_parse_connect_private(newxprt, param); + + /* Save client advertised inbound read limit for use later in accept. 
*/ + newxprt->sc_ord = param->initiator_depth; + + /* Set the local and remote addresses in the transport */ + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + + /* + * Enqueue the new transport on the accept queue of the listening + * transport + */ + spin_lock_bh(&listen_xprt->sc_lock); + list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q); + spin_unlock_bh(&listen_xprt->sc_lock); + + set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&listen_xprt->sc_xprt); +} + +/* + * Handles events generated on the listening endpoint. These events will be + * either be incoming connect requests or adapter removal events. + */ +static int rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct svcxprt_rdma *xprt = cma_id->context; + int ret = 0; + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " + "event = %s (%d)\n", cma_id, cma_id->context, + rdma_event_msg(event->event), event->event); + handle_connect_req(cma_id, &event->param.conn); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + dprintk("svcrdma: Connection completed on LISTEN xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n", + xprt, cma_id); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&xprt->sc_xprt); + } + break; + + default: + dprintk("svcrdma: Unexpected event on listening endpoint %p, " + "event = %s (%d)\n", cma_id, + rdma_event_msg(event->event), event->event); + break; + } + + return ret; +} + +static int rdma_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct svc_xprt *xprt = cma_id->context; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + switch (event->event) { + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + svc_xprt_get(xprt); + dprintk("svcrdma: Connection completed on DTO xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); + svc_xprt_enqueue(xprt); + break; + case RDMA_CM_EVENT_DISCONNECTED: + dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", + xprt, cma_id); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " + "event = %s (%d)\n", cma_id, xprt, + rdma_event_msg(event->event), event->event); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + break; + default: + dprintk("svcrdma: Unexpected event on DTO endpoint %p, " + "event = %s (%d)\n", cma_id, + rdma_event_msg(event->event), event->event); + break; + } + return 0; +} + +/* + * Create a listening RDMA service endpoint. 
+ */ +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + struct rdma_cm_id *listen_id; + struct svcxprt_rdma *cma_xprt; + int ret; + + dprintk("svcrdma: Creating RDMA socket\n"); + if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) { + dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family); + return ERR_PTR(-EAFNOSUPPORT); + } + cma_xprt = rdma_create_xprt(serv, 1); + if (!cma_xprt) + return ERR_PTR(-ENOMEM); + + listen_id = rdma_create_id(&init_net, rdma_listen_handler, cma_xprt, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(listen_id)) { + ret = PTR_ERR(listen_id); + dprintk("svcrdma: rdma_create_id failed = %d\n", ret); + goto err0; + } + + /* Allow both IPv4 and IPv6 sockets to bind a single port + * at the same time. + */ +#if IS_ENABLED(CONFIG_IPV6) + ret = rdma_set_afonly(listen_id, 1); + if (ret) { + dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret); + goto err1; + } +#endif + ret = rdma_bind_addr(listen_id, sa); + if (ret) { + dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); + goto err1; + } + cma_xprt->sc_cm_id = listen_id; + + ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); + if (ret) { + dprintk("svcrdma: rdma_listen failed = %d\n", ret); + goto err1; + } + + /* + * We need to use the address from the cm_id in case the + * caller specified 0 for the port number. + */ + sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); + + return &cma_xprt->sc_xprt; + + err1: + rdma_destroy_id(listen_id); + err0: + kfree(cma_xprt); + return ERR_PTR(ret); +} + +/* + * This is the xpo_recvfrom function for listening endpoints. Its + * purpose is to accept incoming connections. The CMA callback handler + * has already created a new transport and attached it to the new CMA + * ID. + * + * There is a queue of pending connections hung on the listening + * transport. This queue contains the new svc_xprt structure. This + * function takes svc_xprt structures off the accept_q and completes + * the connection. 
+ */ +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *listen_rdma; + struct svcxprt_rdma *newxprt = NULL; + struct rdma_conn_param conn_param; + struct rpcrdma_connect_private pmsg; + struct ib_qp_init_attr qp_attr; + struct ib_device *dev; + struct sockaddr *sap; + unsigned int i, ctxts; + int ret = 0; + + listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); + clear_bit(XPT_CONN, &xprt->xpt_flags); + /* Get the next entry off the accept list */ + spin_lock_bh(&listen_rdma->sc_lock); + if (!list_empty(&listen_rdma->sc_accept_q)) { + newxprt = list_entry(listen_rdma->sc_accept_q.next, + struct svcxprt_rdma, sc_accept_q); + list_del_init(&newxprt->sc_accept_q); + } + if (!list_empty(&listen_rdma->sc_accept_q)) + set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags); + spin_unlock_bh(&listen_rdma->sc_lock); + if (!newxprt) + return NULL; + + dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", + newxprt, newxprt->sc_cm_id); + + dev = newxprt->sc_cm_id->device; + newxprt->sc_port_num = newxprt->sc_cm_id->port_num; + + /* Qualify the transport resource defaults with the + * capabilities of this particular device */ + newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, + (size_t)RPCSVC_MAXPAGES); + newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = svcrdma_max_requests; + newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; + newxprt->sc_rq_depth = newxprt->sc_max_requests + + newxprt->sc_max_bc_requests; + if (newxprt->sc_rq_depth > dev->attrs.max_qp_wr) { + pr_warn("svcrdma: reducing receive depth to %d\n", + dev->attrs.max_qp_wr); + newxprt->sc_rq_depth = dev->attrs.max_qp_wr; + newxprt->sc_max_requests = newxprt->sc_rq_depth - 2; + newxprt->sc_max_bc_requests = 2; + } + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); + ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); + ctxts *= newxprt->sc_max_requests; + newxprt->sc_sq_depth = newxprt->sc_rq_depth + ctxts; + if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) { + pr_warn("svcrdma: reducing send depth to %d\n", + dev->attrs.max_qp_wr); + newxprt->sc_sq_depth = dev->attrs.max_qp_wr; + } + atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); + + if (!svc_rdma_prealloc_ctxts(newxprt)) + goto errout; + + newxprt->sc_pd = ib_alloc_pd(dev, 0); + if (IS_ERR(newxprt->sc_pd)) { + dprintk("svcrdma: error creating PD for connect request\n"); + goto errout; + } + newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth, + 0, IB_POLL_WORKQUEUE); + if (IS_ERR(newxprt->sc_sq_cq)) { + dprintk("svcrdma: error creating SQ CQ for connect request\n"); + goto errout; + } + newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth, + 0, IB_POLL_WORKQUEUE); + if (IS_ERR(newxprt->sc_rq_cq)) { + dprintk("svcrdma: error creating RQ CQ for connect request\n"); + goto errout; + } + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.event_handler = qp_event_handler; + qp_attr.qp_context = &newxprt->sc_xprt; + qp_attr.port_num = newxprt->sc_port_num; + qp_attr.cap.max_rdma_ctxs = ctxts; + qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts; + qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; + qp_attr.cap.max_send_sge = newxprt->sc_max_sge; + qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = newxprt->sc_sq_cq; + qp_attr.recv_cq = newxprt->sc_rq_cq; + dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n", + newxprt->sc_cm_id, 
newxprt->sc_pd); + dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n", + qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); + dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", + qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge); + + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); + if (ret) { + dprintk("svcrdma: failed to create QP, ret=%d\n", ret); + goto errout; + } + newxprt->sc_qp = newxprt->sc_cm_id->qp; + + if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + newxprt->sc_snd_w_inv = false; + if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) && + !rdma_ib_or_roce(dev, newxprt->sc_port_num)) + goto errout; + + /* Post receive buffers */ + for (i = 0; i < newxprt->sc_max_requests; i++) { + ret = svc_rdma_post_recv(newxprt); + if (ret) { + dprintk("svcrdma: failure posting receive buffers\n"); + goto errout; + } + } + + /* Swap out the handler */ + newxprt->sc_cm_id->event_handler = rdma_cma_handler; + + /* Construct RDMA-CM private message */ + pmsg.cp_magic = rpcrdma_cmp_magic; + pmsg.cp_version = RPCRDMA_CMP_VERSION; + pmsg.cp_flags = 0; + pmsg.cp_send_size = pmsg.cp_recv_size = + rpcrdma_encode_buffer_size(newxprt->sc_max_req_size); + + /* Accept Connection */ + set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 0; + conn_param.initiator_depth = min_t(int, newxprt->sc_ord, + dev->attrs.max_qp_init_rd_atom); + if (!conn_param.initiator_depth) { + dprintk("svcrdma: invalid ORD setting\n"); + ret = -EINVAL; + goto errout; + } + conn_param.private_data = &pmsg; + conn_param.private_data_len = sizeof(pmsg); + ret = rdma_accept(newxprt->sc_cm_id, &conn_param); + if (ret) + goto errout; + + dprintk("svcrdma: new connection %p accepted:\n", newxprt); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); + dprintk(" max_sge : %d\n", newxprt->sc_max_sge); + dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); + dprintk(" rdma_rw_ctxs : %d\n", ctxts); + dprintk(" max_requests : %d\n", newxprt->sc_max_requests); + dprintk(" ord : %d\n", conn_param.initiator_depth); + + return &newxprt->sc_xprt; + + errout: + dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); + /* Take a reference in case the DTO handler runs */ + svc_xprt_get(&newxprt->sc_xprt); + if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) + ib_destroy_qp(newxprt->sc_qp); + rdma_destroy_id(newxprt->sc_cm_id); + /* This call to put will destroy the transport */ + svc_xprt_put(&newxprt->sc_xprt); + return NULL; +} + +static void svc_rdma_release_rqst(struct svc_rqst *rqstp) +{ +} + +/* + * When connected, an svc_xprt has at least two references: + * + * - A reference held by the cm_id between the ESTABLISHED and + * DISCONNECTED events. If the remote peer disconnected first, this + * reference could be gone. + * + * - A reference held by the svc_recv code that called this function + * as part of close processing. + * + * At a minimum one references should still be held. 
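+ *
+ * (Illustrative note: svc_rdma_detach() below only initiates the
+ * disconnect; the QP drain and resource teardown happen later in
+ * __svc_rdma_free(), queued on svc_rdma_wq once the last reference
+ * to the svc_xprt is put.)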
+ */ +static void svc_rdma_detach(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + dprintk("svc: svc_rdma_detach(%p)\n", xprt); + + /* Disconnect and flush posted WQE */ + rdma_disconnect(rdma->sc_cm_id); +} + +static void __svc_rdma_free(struct work_struct *work) +{ + struct svcxprt_rdma *rdma = + container_of(work, struct svcxprt_rdma, sc_work); + struct svc_xprt *xprt = &rdma->sc_xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, rdma); + + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) + ib_drain_qp(rdma->sc_qp); + + /* We should only be called from kref_put */ + if (kref_read(&xprt->xpt_ref) != 0) + pr_err("svcrdma: sc_xprt still in use? (%d)\n", + kref_read(&xprt->xpt_ref)); + + while (!list_empty(&rdma->sc_read_complete_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_first_entry(&rdma->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + svc_rdma_put_context(ctxt, 1); + } + while (!list_empty(&rdma->sc_rq_dto_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_first_entry(&rdma->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + svc_rdma_put_context(ctxt, 1); + } + + /* Warn if we leaked a resource or under-referenced */ + if (rdma->sc_ctxt_used != 0) + pr_err("svcrdma: ctxt still in use? (%d)\n", + rdma->sc_ctxt_used); + + /* Final put of backchannel client transport */ + if (xprt->xpt_bc_xprt) { + xprt_put(xprt->xpt_bc_xprt); + xprt->xpt_bc_xprt = NULL; + } + + svc_rdma_destroy_rw_ctxts(rdma); + svc_rdma_destroy_ctxts(rdma); + + /* Destroy the QP if present (not a listener) */ + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) + ib_destroy_qp(rdma->sc_qp); + + if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) + ib_free_cq(rdma->sc_sq_cq); + + if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) + ib_free_cq(rdma->sc_rq_cq); + + if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) + ib_dealloc_pd(rdma->sc_pd); + + /* Destroy the CM ID */ + rdma_destroy_id(rdma->sc_cm_id); + + kfree(rdma); +} + +static void svc_rdma_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + INIT_WORK(&rdma->sc_work, __svc_rdma_free); + queue_work(svc_rdma_wq, &rdma->sc_work); +} + +static int svc_rdma_has_wspace(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + /* + * If there are already waiters on the SQ, + * return false. + */ + if (waitqueue_active(&rdma->sc_send_wait)) + return 0; + + /* Otherwise return true. 
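+ * The actual Send Queue accounting is done in svc_rdma_send(),
+ * which sleeps on sc_send_wait when sc_sq_avail is exhausted.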
*/ + return 1; +} + +static void svc_rdma_secure_port(struct svc_rqst *rqstp) +{ + set_bit(RQ_SECURE, &rqstp->rq_flags); +} + +static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) +{ +} + +int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) +{ + struct ib_send_wr *bad_wr, *n_wr; + int wr_count; + int i; + int ret; + + if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) + return -ENOTCONN; + + wr_count = 1; + for (n_wr = wr->next; n_wr; n_wr = n_wr->next) + wr_count++; + + /* If the SQ is full, wait until an SQ entry is available */ + while (1) { + if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) { + atomic_inc(&rdma_stat_sq_starve); + + /* Wait until SQ WR available if SQ still full */ + atomic_add(wr_count, &xprt->sc_sq_avail); + wait_event(xprt->sc_send_wait, + atomic_read(&xprt->sc_sq_avail) > wr_count); + if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) + return -ENOTCONN; + continue; + } + /* Take a transport ref for each WR posted */ + for (i = 0; i < wr_count; i++) + svc_xprt_get(&xprt->sc_xprt); + + /* Bump used SQ WR count and post */ + ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); + if (ret) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + for (i = 0; i < wr_count; i ++) + svc_xprt_put(&xprt->sc_xprt); + dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret); + dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n", + atomic_read(&xprt->sc_sq_avail), + xprt->sc_sq_depth); + wake_up(&xprt->sc_send_wait); + } + break; + } + return ret; +} diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c new file mode 100644 index 0000000..cc1aad3 --- /dev/null +++ b/net/sunrpc/xprtrdma/transport.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * transport.c + * + * This file contains the top-level implementation of an RPC RDMA + * transport. + * + * Naming convention: functions beginning with xprt_ are part of the + * transport switch. All others are RPC RDMA internal. + */ + +#include +#include +#include +#include + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* + * tunables + */ + +static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; +unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; +unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; +int xprt_rdma_pad_optimize; + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + +static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; +static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; +static unsigned int min_inline_size = RPCRDMA_MIN_INLINE; +static unsigned int max_inline_size = RPCRDMA_MAX_INLINE; +static unsigned int zero; +static unsigned int max_padding = PAGE_SIZE; +static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; +static unsigned int max_memreg = RPCRDMA_LAST - 1; +static unsigned int dummy; + +static struct ctl_table_header *sunrpc_table_header; + +static struct ctl_table xr_tunables_table[] = { + { + .procname = "rdma_slot_table_entries", + .data = &xprt_rdma_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { + .procname = "rdma_max_inline_read", + .data = &xprt_rdma_max_inline_read, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_inline_size, + .extra2 = &max_inline_size, + }, + { + .procname = "rdma_max_inline_write", + .data = &xprt_rdma_max_inline_write, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_inline_size, + .extra2 = &max_inline_size, + }, + { + .procname = "rdma_inline_write_padding", + .data = &dummy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &max_padding, + }, + { + .procname = "rdma_memreg_strategy", + .data = &xprt_rdma_memreg_strategy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_memreg, + .extra2 = &max_memreg, + }, + { + .procname = "rdma_pad_optimize", + .data = &xprt_rdma_pad_optimize, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { }, +}; + +static struct ctl_table sunrpc_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = xr_tunables_table + }, + { }, +}; + +#endif + +static const struct rpc_xprt_ops xprt_rdma_procs; + +static void +xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + char 
buf[20]; + + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA; +} + +static void +xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + char buf[40]; + + snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; +} + +void +xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + char buf[128]; + + switch (sap->sa_family) { + case AF_INET: + xprt_rdma_format_addresses4(xprt, sap); + break; + case AF_INET6: + xprt_rdma_format_addresses6(xprt, sap); + break; + default: + pr_err("rpcrdma: Unrecognized address family\n"); + return; + } + + (void)rpc_ntop(sap, buf, sizeof(buf)); + xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); + + snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); + xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); + + snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; +} + +void +xprt_rdma_free_addresses(struct rpc_xprt *xprt) +{ + unsigned int i; + + for (i = 0; i < RPC_DISPLAY_MAX; i++) + switch (i) { + case RPC_DISPLAY_PROTO: + case RPC_DISPLAY_NETID: + continue; + default: + kfree(xprt->address_strings[i]); + } +} + +void +rpcrdma_conn_func(struct rpcrdma_ep *ep) +{ + schedule_delayed_work(&ep->rep_connect_worker, 0); +} + +void +rpcrdma_connect_worker(struct work_struct *work) +{ + struct rpcrdma_ep *ep = + container_of(work, struct rpcrdma_ep, rep_connect_worker.work); + struct rpcrdma_xprt *r_xprt = + container_of(ep, struct rpcrdma_xprt, rx_ep); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + spin_lock_bh(&xprt->transport_lock); + if (ep->rep_connected > 0) { + if (!xprt_test_and_set_connected(xprt)) + xprt_wake_pending_tasks(xprt, 0); + } else { + if (xprt_test_and_clear_connected(xprt)) + xprt_wake_pending_tasks(xprt, -ENOTCONN); + } + spin_unlock_bh(&xprt->transport_lock); +} + +static void +xprt_rdma_connect_worker(struct work_struct *work) +{ + struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, + rx_connect_worker.work); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + int rc = 0; + + xprt_clear_connected(xprt); + + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (rc) + xprt_wake_pending_tasks(xprt, rc); + + xprt_clear_connecting(xprt); +} + +static void +xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, + rx_xprt); + + trace_xprtrdma_inject_dsc(r_xprt); + rdma_disconnect(r_xprt->rx_ia.ri_id); +} + +/* + * xprt_rdma_destroy + * + * Destroy the xprt. + * Free all memory associated with the object, including its own. + * NOTE: none of the *destroy methods free memory for their top-level + * objects, even though they may have allocated it (they do free + * private memory). It's up to the caller to handle it. In this + * case (RDMA transport), all structure memory is inlined with the + * struct rpcrdma_xprt. 
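+ *
+ * rpcx_to_rdmax() recovers the containing rpcrdma_xprt from the
+ * generic rpc_xprt; it is the same container_of() pattern used in
+ * xprt_rdma_inject_disconnect() above, conceptually:
+ *
+ *	r_xprt = container_of(xprt, struct rpcrdma_xprt, rx_xprt);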
+ */ +static void +xprt_rdma_destroy(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + trace_xprtrdma_destroy(r_xprt); + + cancel_delayed_work_sync(&r_xprt->rx_connect_worker); + + xprt_clear_connected(xprt); + + rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_buffer_destroy(&r_xprt->rx_buf); + rpcrdma_ia_close(&r_xprt->rx_ia); + + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + + module_put(THIS_MODULE); +} + +static const struct rpc_timeout xprt_rdma_default_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/** + * xprt_setup_rdma - Set up transport to use RDMA + * + * @args: rpc transport arguments + */ +static struct rpc_xprt * +xprt_setup_rdma(struct xprt_create *args) +{ + struct rpcrdma_create_data_internal cdata; + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + struct rpcrdma_ep *new_ep; + struct sockaddr *sap; + int rc; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), + xprt_rdma_slot_table_entries, + xprt_rdma_slot_table_entries); + if (xprt == NULL) { + dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + /* 60 second timeout, no retries */ + xprt->timeout = &xprt_rdma_default_timeout; + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->resvport = 0; /* privileged port not needed */ + xprt->tsh_size = 0; /* RPC-RDMA handles framing */ + xprt->ops = &xprt_rdma_procs; + + /* + * Set up RDMA-specific connect data. + */ + sap = args->dstaddr; + + /* Ensure xprt->addr holds valid server TCP (not RDMA) + * address, for any side protocols which peek at it */ + xprt->prot = IPPROTO_TCP; + xprt->addrlen = args->addrlen; + memcpy(&xprt->addr, sap, xprt->addrlen); + + if (rpc_get_port(sap)) + xprt_set_bound(xprt); + xprt_rdma_format_addresses(xprt, sap); + + cdata.max_requests = xprt->max_reqs; + + cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ + cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ + + cdata.inline_wsize = xprt_rdma_max_inline_write; + if (cdata.inline_wsize > cdata.wsize) + cdata.inline_wsize = cdata.wsize; + + cdata.inline_rsize = xprt_rdma_max_inline_read; + if (cdata.inline_rsize > cdata.rsize) + cdata.inline_rsize = cdata.rsize; + + /* + * Create new transport instance, which includes initialized + * o ia + * o endpoint + * o buffers + */ + + new_xprt = rpcx_to_rdmax(xprt); + + rc = rpcrdma_ia_open(new_xprt); + if (rc) + goto out1; + + /* + * initialize and create ep + */ + new_xprt->rx_data = cdata; + new_ep = &new_xprt->rx_ep; + + rc = rpcrdma_ep_create(&new_xprt->rx_ep, + &new_xprt->rx_ia, &new_xprt->rx_data); + if (rc) + goto out2; + + rc = rpcrdma_buffer_create(new_xprt); + if (rc) + goto out3; + + INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, + xprt_rdma_connect_worker); + + xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); + if (xprt->max_payload == 0) + goto out4; + xprt->max_payload <<= PAGE_SHIFT; + dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", + __func__, xprt->max_payload); + + if (!try_module_get(THIS_MODULE)) + goto out4; + + dprintk("RPC: %s: %s:%s\n", __func__, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT]); + trace_xprtrdma_create(new_xprt); + return xprt; + +out4: + 
rpcrdma_buffer_destroy(&new_xprt->rx_buf); + rc = -ENODEV; +out3: + rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); +out2: + rpcrdma_ia_close(&new_xprt->rx_ia); +out1: + trace_xprtrdma_destroy(new_xprt); + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + return ERR_PTR(rc); +} + +/** + * xprt_rdma_close - Close down RDMA connection + * @xprt: generic transport to be closed + * + * Called during transport shutdown reconnect, or device + * removal. Caller holds the transport's write lock. + */ +static void +xprt_rdma_close(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + dprintk("RPC: %s: closing xprt %p\n", __func__, xprt); + + if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { + xprt_clear_connected(xprt); + rpcrdma_ia_remove(ia); + return; + } + if (ep->rep_connected == -ENODEV) + return; + if (ep->rep_connected > 0) + xprt->reestablish_timeout = 0; + xprt_disconnect_done(xprt); + rpcrdma_ep_disconnect(ep, ia); +} + +/** + * xprt_rdma_set_port - update server port with rpcbind result + * @xprt: controlling RPC transport + * @port: new port value + * + * Transport connect status is unchanged. + */ +static void +xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) +{ + struct sockaddr *sap = (struct sockaddr *)&xprt->addr; + char buf[8]; + + dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n", + __func__, xprt, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + port); + + rpc_set_port(sap, port); + + kfree(xprt->address_strings[RPC_DISPLAY_PORT]); + snprintf(buf, sizeof(buf), "%u", port); + xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); + + kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); + snprintf(buf, sizeof(buf), "%4hx", port); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); +} + +/** + * xprt_rdma_timer - invoked when an RPC times out + * @xprt: controlling RPC transport + * @task: RPC task that timed out + * + * Invoked when the transport is still connected, but an RPC + * retransmit timeout occurs. + * + * Since RDMA connections don't have a keep-alive, forcibly + * disconnect and retry to connect. This drives full + * detection of the network path, and retransmissions of + * all pending RPCs. 
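+ *
+ * The subsequent reconnect is driven by xprt_rdma_connect(), which
+ * doubles xprt->reestablish_timeout on each attempt and clamps it
+ * to the [RPCRDMA_INIT_REEST_TO, RPCRDMA_MAX_REEST_TO] range.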
+ */ +static void +xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) +{ + xprt_force_disconnect(xprt); +} + +static void +xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + if (r_xprt->rx_ep.rep_connected != 0) { + /* Reconnect */ + schedule_delayed_work(&r_xprt->rx_connect_worker, + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; + else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + } else { + schedule_delayed_work(&r_xprt->rx_connect_worker, 0); + if (!RPC_IS_ASYNC(task)) + flush_delayed_work(&r_xprt->rx_connect_worker); + } +} + +static bool +rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + size_t size, gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + + if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size) + return true; + + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); + if (IS_ERR(rb)) + return false; + + rpcrdma_free_regbuf(req->rl_sendbuf); + r_xprt->rx_stats.hardway_register_count += size; + req->rl_sendbuf = rb; + return true; +} + +/* The rq_rcv_buf is used only if a Reply chunk is necessary. + * The decision to use a Reply chunk is made later in + * rpcrdma_marshal_req. This buffer is registered at that time. + * + * Otherwise, the associated RPC Reply arrives in a separate + * Receive buffer, arbitrarily chosen by the HCA. The buffer + * allocated here for the RPC Reply is not utilized in that + * case. See rpcrdma_inline_fixup. + * + * A regbuf is used here to remember the buffer size. + */ +static bool +rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + size_t size, gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + + if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size) + return true; + + rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags); + if (IS_ERR(rb)) + return false; + + rpcrdma_free_regbuf(req->rl_recvbuf); + r_xprt->rx_stats.hardway_register_count += size; + req->rl_recvbuf = rb; + return true; +} + +/** + * xprt_rdma_allocate - allocate transport resources for an RPC + * @task: RPC task + * + * Return values: + * 0: Success; rq_buffer points to RPC buffer to use + * ENOMEM: Out of memory, call again later + * EIO: A permanent error occurred, do not retry + * + * The RDMA allocate/free functions need the task structure as a place + * to hide the struct rpcrdma_req, which is necessary for the actual + * send/recv sequence. + * + * xprt_rdma_allocate provides buffers that are already mapped for + * DMA, and a local DMA lkey is provided for each. 
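+ *
+ * For a task doing memory reclaim (RPC_IS_SWAPPER), the send and
+ * receive buffers are allocated with GFP_NOWAIT | __GFP_MEMALLOC |
+ * __GFP_NOWARN so the allocation never waits on reclaim and may
+ * dip into the memory reserves instead.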
+ */ +static int +xprt_rdma_allocate(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req; + gfp_t flags; + + req = rpcrdma_buffer_get(&r_xprt->rx_buf); + if (req == NULL) + goto out_get; + + flags = RPCRDMA_DEF_GFP; + if (RPC_IS_SWAPPER(task)) + flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; + + if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags)) + goto out_fail; + if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) + goto out_fail; + + rpcrdma_set_xprtdata(rqst, req); + rqst->rq_buffer = req->rl_sendbuf->rg_base; + rqst->rq_rbuffer = req->rl_recvbuf->rg_base; + trace_xprtrdma_allocate(task, req); + return 0; + +out_fail: + rpcrdma_buffer_put(req); +out_get: + trace_xprtrdma_allocate(task, NULL); + return -ENOMEM; +} + +/** + * xprt_rdma_free - release resources allocated by xprt_rdma_allocate + * @task: RPC task + * + * Caller guarantees rqst->rq_buffer is non-NULL. + */ +static void +xprt_rdma_free(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) + rpcrdma_release_rqst(r_xprt, req); + trace_xprtrdma_rpc_done(task, req); + rpcrdma_buffer_put(req); +} + +/** + * xprt_rdma_send_request - marshal and send an RPC request + * @task: RPC task with an RPC message in rq_snd_buf + * + * Caller holds the transport's write lock. + * + * Returns: + * %0 if the RPC message has been sent + * %-ENOTCONN if the caller should reconnect and call again + * %-EAGAIN if the caller should call again + * %-ENOBUFS if the caller should call again after a delay + * %-EIO if a permanent error occurred and the request was not + * sent. Do not try to send this message again. + */ +static int +xprt_rdma_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int rc = 0; + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (unlikely(!rqst->rq_buffer)) + return xprt_rdma_bc_send_reply(rqst); +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + + if (!xprt_connected(xprt)) + goto drop_connection; + + rc = rpcrdma_marshal_req(r_xprt, rqst); + if (rc < 0) + goto failed_marshal; + + if (req->rl_reply == NULL) /* e.g. reconnection */ + rpcrdma_recv_buffer_get(req); + + /* Must suppress retransmit to maintain credits */ + if (rqst->rq_connect_cookie == xprt->connect_cookie) + goto drop_connection; + rqst->rq_xtime = ktime_get(); + + __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + + rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; + rqst->rq_bytes_sent = 0; + + /* An RPC with no reply will throw off credit accounting, + * so drop the connection to reset the credit grant. 
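+ * Credits are replenished only as replies arrive, so a request
+ * that never produces a reply would consume a credit permanently.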
+ */ + if (!rpc_reply_expected(task)) + goto drop_connection; + return 0; + +failed_marshal: + if (rc != -ENOTCONN) + return rc; +drop_connection: + xprt_disconnect_done(xprt); + return -ENOTCONN; /* implies disconnect */ +} + +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + long idle_time = 0; + + if (xprt_connected(xprt)) + idle_time = (long)(jiffies - xprt->last_used) / HZ; + + seq_puts(seq, "\txprt:\trdma "); + seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ", + 0, /* need a local port? */ + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ", + r_xprt->rx_stats.read_chunk_count, + r_xprt->rx_stats.write_chunk_count, + r_xprt->rx_stats.reply_chunk_count, + r_xprt->rx_stats.total_rdma_request, + r_xprt->rx_stats.total_rdma_reply, + r_xprt->rx_stats.pullup_copy_count, + r_xprt->rx_stats.fixup_copy_count, + r_xprt->rx_stats.hardway_register_count, + r_xprt->rx_stats.failed_marshal_count, + r_xprt->rx_stats.bad_reply_count, + r_xprt->rx_stats.nomsg_call_count); + seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", + r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_orphaned, + r_xprt->rx_stats.mrs_allocated, + r_xprt->rx_stats.local_inv_needed, + r_xprt->rx_stats.empty_sendctx_q, + r_xprt->rx_stats.reply_waits_for_send); +} + +static int +xprt_rdma_enable_swap(struct rpc_xprt *xprt) +{ + return 0; +} + +static void +xprt_rdma_disable_swap(struct rpc_xprt *xprt) +{ +} + +/* + * Plumbing for rpc transport switch and kernel module + */ + +static const struct rpc_xprt_ops xprt_rdma_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ + .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, /* ditto */ + .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .timer = xprt_rdma_timer, + .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ + .set_port = xprt_rdma_set_port, + .connect = xprt_rdma_connect, + .buf_alloc = xprt_rdma_allocate, + .buf_free = xprt_rdma_free, + .send_request = xprt_rdma_send_request, + .close = xprt_rdma_close, + .destroy = xprt_rdma_destroy, + .print_stats = xprt_rdma_print_stats, + .enable_swap = xprt_rdma_enable_swap, + .disable_swap = xprt_rdma_disable_swap, + .inject_disconnect = xprt_rdma_inject_disconnect, +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + .bc_setup = xprt_rdma_bc_setup, + .bc_up = xprt_rdma_bc_up, + .bc_maxpayload = xprt_rdma_bc_maxpayload, + .bc_free_rqst = xprt_rdma_bc_free_rqst, + .bc_destroy = xprt_rdma_bc_destroy, +#endif +}; + +static struct xprt_class xprt_rdma = { + .list = LIST_HEAD_INIT(xprt_rdma.list), + .name = "rdma", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_RDMA, + .setup = xprt_setup_rdma, +}; + +void xprt_rdma_cleanup(void) +{ + int rc; + + dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } +#endif + rc = xprt_unregister_transport(&xprt_rdma); + if (rc) + dprintk("RPC: %s: xprt_unregister returned %i\n", + __func__, rc); + + rpcrdma_destroy_wq(); + + rc = xprt_unregister_transport(&xprt_rdma_bc); + if (rc) + dprintk("RPC: %s: xprt_unregister(bc) returned %i\n", + __func__, 
rc); +} + +int xprt_rdma_init(void) +{ + int rc; + + rc = rpcrdma_alloc_wq(); + if (rc) + return rc; + + rc = xprt_register_transport(&xprt_rdma); + if (rc) { + rpcrdma_destroy_wq(); + return rc; + } + + rc = xprt_register_transport(&xprt_rdma_bc); + if (rc) { + xprt_unregister_transport(&xprt_rdma); + rpcrdma_destroy_wq(); + return rc; + } + + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); + + dprintk("Defaults:\n"); + dprintk("\tSlots %d\n" + "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", + xprt_rdma_slot_table_entries, + xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); + dprintk("\tPadding 0\n\tMemreg %d\n", xprt_rdma_memreg_strategy); + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + if (!sunrpc_table_header) + sunrpc_table_header = register_sysctl_table(sunrpc_table); +#endif + return 0; +} diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c new file mode 100644 index 0000000..c345d36 --- /dev/null +++ b/net/sunrpc/xprtrdma/verbs.c @@ -0,0 +1,1640 @@ +/* + * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * verbs.c + * + * Encapsulates the major functions managing: + * o adapters + * o endpoints + * o connections + * o buffer memory + */ + +#include +#include +#include +#include + +#include +#include + +#include + +#include "xprt_rdma.h" + +/* + * Globals/Macros + */ + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* + * internal functions + */ +static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); +static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); + +struct workqueue_struct *rpcrdma_receive_wq __read_mostly; + +int +rpcrdma_alloc_wq(void) +{ + struct workqueue_struct *recv_wq; + + recv_wq = alloc_workqueue("xprtrdma_receive", + WQ_MEM_RECLAIM | WQ_HIGHPRI, + 0); + if (!recv_wq) + return -ENOMEM; + + rpcrdma_receive_wq = recv_wq; + return 0; +} + +void +rpcrdma_destroy_wq(void) +{ + struct workqueue_struct *wq; + + if (rpcrdma_receive_wq) { + wq = rpcrdma_receive_wq; + rpcrdma_receive_wq = NULL; + destroy_workqueue(wq); + } +} + +static void +rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) +{ + struct rpcrdma_ep *ep = context; + struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, + rx_ep); + + trace_xprtrdma_qp_error(r_xprt, event); + pr_err("rpcrdma: %s on device %s ep %p\n", + ib_event_msg(event->event), event->device->name, context); + + if (ep->rep_connected == 1) { + ep->rep_connected = -EIO; + rpcrdma_conn_func(ep); + wake_up_all(&ep->rep_connect_wait); + } +} + +/** + * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + */ +static void +rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_sendctx *sc = + container_of(cqe, struct rpcrdma_sendctx, sc_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ + trace_xprtrdma_wc_send(sc, wc); + if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("rpcrdma: Send: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + + rpcrdma_sendctx_put_locked(sc); +} + +/** + * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + */ +static void +rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, + rr_cqe); + + /* WARNING: Only wr_id and status are reliable at this point */ + trace_xprtrdma_wc_receive(rep, wc); + if (wc->status != IB_WC_SUCCESS) + goto out_fail; + + /* status == SUCCESS means all fields in wc are trustworthy */ + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); + rep->rr_wc_flags = wc->wc_flags; + rep->rr_inv_rkey = wc->ex.invalidate_rkey; + + ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), + rdmab_addr(rep->rr_rdmabuf), + wc->byte_len, DMA_FROM_DEVICE); + +out_schedule: + rpcrdma_reply_handler(rep); + return; + +out_fail: + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0); + goto out_schedule; +} + +static void +rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, + struct rdma_conn_param *param) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + const struct rpcrdma_connect_private *pmsg = 
param->private_data; + unsigned int rsize, wsize; + + /* Default settings for RPC-over-RDMA Version One */ + r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; + rsize = RPCRDMA_V1_DEF_INLINE_SIZE; + wsize = RPCRDMA_V1_DEF_INLINE_SIZE; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + r_xprt->rx_ia.ri_implicit_roundup = true; + rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); + wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); + } + + if (rsize < cdata->inline_rsize) + cdata->inline_rsize = rsize; + if (wsize < cdata->inline_wsize) + cdata->inline_wsize = wsize; + dprintk("RPC: %s: max send %u, max recv %u\n", + __func__, cdata->inline_wsize, cdata->inline_rsize); + rpcrdma_set_max_header_sizes(r_xprt); +} + +static int +rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct rpcrdma_xprt *xprt = id->context; + struct rpcrdma_ia *ia = &xprt->rx_ia; + struct rpcrdma_ep *ep = &xprt->rx_ep; + int connstate = 0; + + trace_xprtrdma_conn_upcall(xprt, event); + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ia->ri_async_rc = 0; + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + ia->ri_async_rc = -EHOSTUNREACH; + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + ia->ri_async_rc = -ENETUNREACH; + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + pr_info("rpcrdma: removing device %s for %s:%s\n", + ia->ri_device->name, + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); +#endif + set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); + ep->rep_connected = -ENODEV; + xprt_force_disconnect(&xprt->rx_xprt); + wait_for_completion(&ia->ri_remove_done); + + ia->ri_id = NULL; + ia->ri_device = NULL; + /* Return 1 to ensure the core destroys the id. 
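+ * By this point rpcrdma_ia_remove() has drained the QP and released
+ * the CQs, PD, and MRs, and ri_id has been cleared above so a later
+ * rpcrdma_ia_close() will not touch the stale id.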
*/ + return 1; + case RDMA_CM_EVENT_ESTABLISHED: + ++xprt->rx_xprt.connect_cookie; + connstate = 1; + rpcrdma_update_connect_private(xprt, &event->param.conn); + goto connected; + case RDMA_CM_EVENT_CONNECT_ERROR: + connstate = -ENOTCONN; + goto connected; + case RDMA_CM_EVENT_UNREACHABLE: + connstate = -ENETDOWN; + goto connected; + case RDMA_CM_EVENT_REJECTED: + dprintk("rpcrdma: connection to %s:%s rejected: %s\n", + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), + rdma_reject_msg(id, event->status)); + connstate = -ECONNREFUSED; + if (event->status == IB_CM_REJ_STALE_CONN) + connstate = -EAGAIN; + goto connected; + case RDMA_CM_EVENT_DISCONNECTED: + ++xprt->rx_xprt.connect_cookie; + connstate = -ECONNABORTED; +connected: + xprt->rx_buf.rb_credits = 1; + ep->rep_connected = connstate; + rpcrdma_conn_func(ep); + wake_up_all(&ep->rep_connect_wait); + /*FALLTHROUGH*/ + default: + dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", + __func__, + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), + ia->ri_device->name, ia->ri_ops->ro_displayname, + ep, rdma_event_msg(event->event)); + break; + } + + return 0; +} + +static struct rdma_cm_id * +rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) +{ + unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; + struct rdma_cm_id *id; + int rc; + + trace_xprtrdma_conn_start(xprt); + + init_completion(&ia->ri_done); + init_completion(&ia->ri_remove_done); + + id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + dprintk("RPC: %s: rdma_create_id() failed %i\n", + __func__, rc); + return id; + } + + ia->ri_async_rc = -ETIMEDOUT; + rc = rdma_resolve_addr(id, NULL, + (struct sockaddr *)&xprt->rx_xprt.addr, + RDMA_RESOLVE_TIMEOUT); + if (rc) { + dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", + __func__, rc); + goto out; + } + rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + if (rc < 0) { + trace_xprtrdma_conn_tout(xprt); + goto out; + } + + rc = ia->ri_async_rc; + if (rc) + goto out; + + ia->ri_async_rc = -ETIMEDOUT; + rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); + if (rc) { + dprintk("RPC: %s: rdma_resolve_route() failed %i\n", + __func__, rc); + goto out; + } + rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + if (rc < 0) { + trace_xprtrdma_conn_tout(xprt); + goto out; + } + rc = ia->ri_async_rc; + if (rc) + goto out; + + return id; + +out: + rdma_destroy_id(id); + return ERR_PTR(rc); +} + +/* + * Exported functions. + */ + +/** + * rpcrdma_ia_open - Open and initialize an Interface Adapter. + * @xprt: transport with IA to (re)initialize + * + * Returns 0 on success, negative errno if an appropriate + * Interface Adapter could not be found and opened. 
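+ *
+ * Opening an IA creates the RDMA CM id (resolving the server's
+ * address and route), allocates a protection domain, and selects
+ * the memory registration ops based on xprt_rdma_memreg_strategy,
+ * falling back from FRWR to FMR when the device lacks FRWR support;
+ * if neither is usable the open fails with -EINVAL.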
+ */ +int +rpcrdma_ia_open(struct rpcrdma_xprt *xprt) +{ + struct rpcrdma_ia *ia = &xprt->rx_ia; + int rc; + + ia->ri_id = rpcrdma_create_id(xprt, ia); + if (IS_ERR(ia->ri_id)) { + rc = PTR_ERR(ia->ri_id); + goto out_err; + } + ia->ri_device = ia->ri_id->device; + + ia->ri_pd = ib_alloc_pd(ia->ri_device, 0); + if (IS_ERR(ia->ri_pd)) { + rc = PTR_ERR(ia->ri_pd); + pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); + goto out_err; + } + + switch (xprt_rdma_memreg_strategy) { + case RPCRDMA_FRWR: + if (frwr_is_supported(ia)) { + ia->ri_ops = &rpcrdma_frwr_memreg_ops; + break; + } + /*FALLTHROUGH*/ + case RPCRDMA_MTHCAFMR: + if (fmr_is_supported(ia)) { + ia->ri_ops = &rpcrdma_fmr_memreg_ops; + break; + } + /*FALLTHROUGH*/ + default: + pr_err("rpcrdma: Device %s does not support memreg mode %d\n", + ia->ri_device->name, xprt_rdma_memreg_strategy); + rc = -EINVAL; + goto out_err; + } + + return 0; + +out_err: + rpcrdma_ia_close(ia); + return rc; +} + +/** + * rpcrdma_ia_remove - Handle device driver unload + * @ia: interface adapter being removed + * + * Divest transport H/W resources associated with this adapter, + * but allow it to be restored later. + */ +void +rpcrdma_ia_remove(struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, + rx_ia); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req; + struct rpcrdma_rep *rep; + + cancel_delayed_work_sync(&buf->rb_refresh_worker); + + /* This is similar to rpcrdma_ep_destroy, but: + * - Don't cancel the connect worker. + * - Don't call rpcrdma_ep_disconnect, which waits + * for another conn upcall, which will deadlock. + * - rdma_disconnect is unneeded, the underlying + * connection is already gone. + */ + if (ia->ri_id->qp) { + ib_drain_qp(ia->ri_id->qp); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; + } + ib_free_cq(ep->rep_attr.recv_cq); + ep->rep_attr.recv_cq = NULL; + ib_free_cq(ep->rep_attr.send_cq); + ep->rep_attr.send_cq = NULL; + + /* The ULP is responsible for ensuring all DMA + * mappings and MRs are gone. + */ + list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) + rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf); + list_for_each_entry(req, &buf->rb_allreqs, rl_all) { + rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf); + rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); + rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); + } + rpcrdma_mrs_destroy(buf); + ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; + + /* Allow waiters to continue */ + complete(&ia->ri_remove_done); + + trace_xprtrdma_remove(r_xprt); +} + +/** + * rpcrdma_ia_close - Clean up/close an IA. + * @ia: interface adapter to close + * + */ +void +rpcrdma_ia_close(struct rpcrdma_ia *ia) +{ + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { + if (ia->ri_id->qp) + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + } + ia->ri_id = NULL; + ia->ri_device = NULL; + + /* If the pd is still busy, xprtrdma missed freeing a resource */ + if (ia->ri_pd && !IS_ERR(ia->ri_pd)) + ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; +} + +/* + * Create unconnected endpoint. 
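+ *
+ * This sizes the send and receive queues from cdata->max_requests
+ * (plus RPCRDMA_BACKWARD_WRS and one drain cqe each), allocates the
+ * send and receive completion queues, and fills in the RDMA CM
+ * private message that advertises the client's inline thresholds.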
+ */ +int +rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + struct rpcrdma_create_data_internal *cdata) +{ + struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; + unsigned int max_qp_wr, max_sge; + struct ib_cq *sendcq, *recvcq; + int rc; + + max_sge = min_t(unsigned int, ia->ri_device->attrs.max_sge, + RPCRDMA_MAX_SEND_SGES); + if (max_sge < RPCRDMA_MIN_SEND_SGES) { + pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge); + return -ENOMEM; + } + ia->ri_max_send_sges = max_sge; + + if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { + dprintk("RPC: %s: insufficient wqe's available\n", + __func__); + return -ENOMEM; + } + max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1; + + /* check provider's send/recv wr limits */ + if (cdata->max_requests > max_qp_wr) + cdata->max_requests = max_qp_wr; + + ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; + ep->rep_attr.qp_context = ep; + ep->rep_attr.srq = NULL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests; + ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_send_wr += 1; /* drain cqe */ + rc = ia->ri_ops->ro_open(ia, ep, cdata); + if (rc) + return rc; + ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ + ep->rep_attr.cap.max_send_sge = max_sge; + ep->rep_attr.cap.max_recv_sge = 1; + ep->rep_attr.cap.max_inline_data = 0; + ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + ep->rep_attr.qp_type = IB_QPT_RC; + ep->rep_attr.port_num = ~0; + + dprintk("RPC: %s: requested max: dtos: send %d recv %d; " + "iovs: send %d recv %d\n", + __func__, + ep->rep_attr.cap.max_send_wr, + ep->rep_attr.cap.max_recv_wr, + ep->rep_attr.cap.max_send_sge, + ep->rep_attr.cap.max_recv_sge); + + /* set trigger for requesting send completion */ + ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH, + cdata->max_requests >> 2); + ep->rep_send_count = ep->rep_send_batch; + init_waitqueue_head(&ep->rep_connect_wait); + INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + + sendcq = ib_alloc_cq(ia->ri_device, NULL, + ep->rep_attr.cap.max_send_wr + 1, + 1, IB_POLL_WORKQUEUE); + if (IS_ERR(sendcq)) { + rc = PTR_ERR(sendcq); + dprintk("RPC: %s: failed to create send CQ: %i\n", + __func__, rc); + goto out1; + } + + recvcq = ib_alloc_cq(ia->ri_device, NULL, + ep->rep_attr.cap.max_recv_wr + 1, + 0, IB_POLL_WORKQUEUE); + if (IS_ERR(recvcq)) { + rc = PTR_ERR(recvcq); + dprintk("RPC: %s: failed to create recv CQ: %i\n", + __func__, rc); + goto out2; + } + + ep->rep_attr.send_cq = sendcq; + ep->rep_attr.recv_cq = recvcq; + + /* Initialize cma parameters */ + memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); + + /* Prepare RDMA-CM private message */ + pmsg->cp_magic = rpcrdma_cmp_magic; + pmsg->cp_version = RPCRDMA_CMP_VERSION; + pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok; + pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize); + ep->rep_remote_cma.private_data = pmsg; + ep->rep_remote_cma.private_data_len = sizeof(*pmsg); + + /* Client offers RDMA Read but does not initiate */ + ep->rep_remote_cma.initiator_depth = 0; + ep->rep_remote_cma.responder_resources = + min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom); + + /* Limit transport retries so client can detect server + * GID changes quickly. 
RPC layer handles re-establishing + * transport connection and retransmission. + */ + ep->rep_remote_cma.retry_count = 6; + + /* RPC-over-RDMA handles its own flow control. In addition, + * make all RNR NAKs visible so we know that RPC-over-RDMA + * flow control is working correctly (no NAKs should be seen). + */ + ep->rep_remote_cma.flow_control = 0; + ep->rep_remote_cma.rnr_retry_count = 0; + + return 0; + +out2: + ib_free_cq(sendcq); +out1: + return rc; +} + +/* + * rpcrdma_ep_destroy + * + * Disconnect and destroy endpoint. After this, the only + * valid operations on the ep are to free it (if dynamically + * allocated) or re-create it. + */ +void +rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + cancel_delayed_work_sync(&ep->rep_connect_worker); + + if (ia->ri_id && ia->ri_id->qp) { + rpcrdma_ep_disconnect(ep, ia); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; + } + + if (ep->rep_attr.recv_cq) + ib_free_cq(ep->rep_attr.recv_cq); + if (ep->rep_attr.send_cq) + ib_free_cq(ep->rep_attr.send_cq); +} + +/* Re-establish a connection after a device removal event. + * Unlike a normal reconnection, a fresh PD and a new set + * of MRs and buffers is needed. + */ +static int +rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + int rc, err; + + trace_xprtrdma_reinsert(r_xprt); + + rc = -EHOSTUNREACH; + if (rpcrdma_ia_open(r_xprt)) + goto out1; + + rc = -ENOMEM; + err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data); + if (err) { + pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); + goto out2; + } + + rc = -ENETUNREACH; + err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (err) { + pr_err("rpcrdma: rdma_create_qp returned %d\n", err); + goto out3; + } + + rpcrdma_mrs_create(r_xprt); + return 0; + +out3: + rpcrdma_ep_destroy(ep, ia); +out2: + rpcrdma_ia_close(ia); +out1: + return rc; +} + +static int +rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, + struct rpcrdma_ia *ia) +{ + struct rdma_cm_id *id, *old; + int err, rc; + + trace_xprtrdma_reconnect(r_xprt); + + rpcrdma_ep_disconnect(ep, ia); + + rc = -EHOSTUNREACH; + id = rpcrdma_create_id(r_xprt, ia); + if (IS_ERR(id)) + goto out; + + /* As long as the new ID points to the same device as the + * old ID, we can reuse the transport's existing PD and all + * previously allocated MRs. Also, the same device means + * the transport's previous DMA mappings are still valid. + * + * This is a sanity check only. There should be no way these + * point to two different devices here. + */ + old = id; + rc = -ENETUNREACH; + if (ia->ri_device != id->device) { + pr_err("rpcrdma: can't reconnect on different device!\n"); + goto out_destroy; + } + + err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + if (err) { + dprintk("RPC: %s: rdma_create_qp returned %d\n", + __func__, err); + goto out_destroy; + } + + /* Atomically replace the transport's ID and QP. */ + rc = 0; + old = ia->ri_id; + ia->ri_id = id; + rdma_destroy_qp(old); + +out_destroy: + rdma_destroy_id(old); +out: + return rc; +} + +/* + * Connect unconnected endpoint. 
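+ *
+ * Three cases are handled below: an initial connect (rep_connected
+ * is 0) just creates the QP; -ENODEV re-creates the whole transport
+ * after a device removal; any other state reconnects on a fresh
+ * cm_id via rpcrdma_ep_reconnect().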
+ */ +int +rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, + rx_ia); + unsigned int extras; + int rc; + +retry: + switch (ep->rep_connected) { + case 0: + dprintk("RPC: %s: connecting...\n", __func__); + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + rc = -ENETUNREACH; + goto out_noupdate; + } + break; + case -ENODEV: + rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia); + if (rc) + goto out_noupdate; + break; + default: + rc = rpcrdma_ep_reconnect(r_xprt, ep, ia); + if (rc) + goto out; + } + + ep->rep_connected = 0; + + rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); + if (rc) { + dprintk("RPC: %s: rdma_connect() failed with %i\n", + __func__, rc); + goto out; + } + + wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); + if (ep->rep_connected <= 0) { + if (ep->rep_connected == -EAGAIN) + goto retry; + rc = ep->rep_connected; + goto out; + } + + dprintk("RPC: %s: connected\n", __func__); + extras = r_xprt->rx_buf.rb_bc_srv_max_requests; + if (extras) + rpcrdma_ep_post_extra_recv(r_xprt, extras); + +out: + if (rc) + ep->rep_connected = rc; + +out_noupdate: + return rc; +} + +/* + * rpcrdma_ep_disconnect + * + * This is separate from destroy to facilitate the ability + * to reconnect without recreating the endpoint. + * + * This call is not reentrant, and must not be made in parallel + * on the same endpoint. + */ +void +rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + int rc; + + rc = rdma_disconnect(ia->ri_id); + if (!rc) + /* returns without wait if not connected */ + wait_event_interruptible(ep->rep_connect_wait, + ep->rep_connected != 1); + else + ep->rep_connected = rc; + trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt, + rx_ep), rc); + + ib_drain_qp(ia->ri_id->qp); +} + +/* Fixed-size circular FIFO queue. This implementation is wait-free and + * lock-free. + * + * Consumer is the code path that posts Sends. This path dequeues a + * sendctx for use by a Send operation. Multiple consumer threads + * are serialized by the RPC transport lock, which allows only one + * ->send_request call at a time. + * + * Producer is the code path that handles Send completions. This path + * enqueues a sendctx that has been completed. Multiple producer + * threads are serialized by the ib_poll_cq() function. + */ + +/* rpcrdma_sendctxs_destroy() assumes caller has already quiesced + * queue activity, and ib_drain_qp has flushed all remaining Send + * requests. + */ +static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf) +{ + unsigned long i; + + for (i = 0; i <= buf->rb_sc_last; i++) + kfree(buf->rb_sc_ctxs[i]); + kfree(buf->rb_sc_ctxs); +} + +static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia) +{ + struct rpcrdma_sendctx *sc; + + sc = kzalloc(sizeof(*sc) + + ia->ri_max_send_sges * sizeof(struct ib_sge), + GFP_KERNEL); + if (!sc) + return NULL; + + sc->sc_wr.wr_cqe = &sc->sc_cqe; + sc->sc_wr.sg_list = sc->sc_sges; + sc->sc_wr.opcode = IB_WR_SEND; + sc->sc_cqe.done = rpcrdma_wc_send; + return sc; +} + +static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_sendctx *sc; + unsigned long i; + + /* Maximum number of concurrent outstanding Send WRs. 
Capping + * the circular queue size stops Send Queue overflow by causing + * the ->send_request call to fail temporarily before too many + * Sends are posted. + */ + i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; + dprintk("RPC: %s: allocating %lu send_ctxs\n", __func__, i); + buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); + if (!buf->rb_sc_ctxs) + return -ENOMEM; + + buf->rb_sc_last = i - 1; + for (i = 0; i <= buf->rb_sc_last; i++) { + sc = rpcrdma_sendctx_create(&r_xprt->rx_ia); + if (!sc) + goto out_destroy; + + sc->sc_xprt = r_xprt; + buf->rb_sc_ctxs[i] = sc; + } + + return 0; + +out_destroy: + rpcrdma_sendctxs_destroy(buf); + return -ENOMEM; +} + +/* The sendctx queue is not guaranteed to have a size that is a + * power of two, thus the helpers in circ_buf.h cannot be used. + * The other option is to use modulus (%), which can be expensive. + */ +static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, + unsigned long item) +{ + return likely(item < buf->rb_sc_last) ? item + 1 : 0; +} + +/** + * rpcrdma_sendctx_get_locked - Acquire a send context + * @buf: transport buffers from which to acquire an unused context + * + * Returns pointer to a free send completion context; or NULL if + * the queue is empty. + * + * Usage: Called to acquire an SGE array before preparing a Send WR. + * + * The caller serializes calls to this function (per rpcrdma_buffer), + * and provides an effective memory barrier that flushes the new value + * of rb_sc_head. + */ +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_xprt *r_xprt; + struct rpcrdma_sendctx *sc; + unsigned long next_head; + + next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head); + + if (next_head == READ_ONCE(buf->rb_sc_tail)) + goto out_emptyq; + + /* ORDER: item must be accessed _before_ head is updated */ + sc = buf->rb_sc_ctxs[next_head]; + + /* Releasing the lock in the caller acts as a memory + * barrier that flushes rb_sc_head. + */ + buf->rb_sc_head = next_head; + + return sc; + +out_emptyq: + /* The queue is "empty" if there have not been enough Send + * completions recently. This is a sign the Send Queue is + * backing up. Cause the caller to pause and try again. + */ + dprintk("RPC: %s: empty sendctx queue\n", __func__); + r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); + r_xprt->rx_stats.empty_sendctx_q++; + return NULL; +} + +/** + * rpcrdma_sendctx_put_locked - Release a send context + * @sc: send context to release + * + * Usage: Called from Send completion to return a sendctxt + * to the queue. + * + * The caller serializes calls to this function (per rpcrdma_buffer). + */ +void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf; + unsigned long next_tail; + + /* Unmap SGEs of previously completed by unsignaled + * Sends by walking up the queue until @sc is found. 
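+ *
+ * Advancing rb_sc_tail with smp_store_release() pairs with the
+ * READ_ONCE() in rpcrdma_sendctx_get_locked(), so a consumer that
+ * observes the new tail also observes that every sendctx up to it
+ * has already been unmapped.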
+ */ + next_tail = buf->rb_sc_tail; + do { + next_tail = rpcrdma_sendctx_next(buf, next_tail); + + /* ORDER: item must be accessed _before_ tail is updated */ + rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]); + + } while (buf->rb_sc_ctxs[next_tail] != sc); + + /* Paired with READ_ONCE */ + smp_store_release(&buf->rb_sc_tail, next_tail); +} + +static void +rpcrdma_mr_recovery_worker(struct work_struct *work) +{ + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, + rb_recovery_worker.work); + struct rpcrdma_mr *mr; + + spin_lock(&buf->rb_recovery_lock); + while (!list_empty(&buf->rb_stale_mrs)) { + mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); + spin_unlock(&buf->rb_recovery_lock); + + trace_xprtrdma_recover_mr(mr); + mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); + + spin_lock(&buf->rb_recovery_lock); + } + spin_unlock(&buf->rb_recovery_lock); +} + +void +rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + + spin_lock(&buf->rb_recovery_lock); + rpcrdma_mr_push(mr, &buf->rb_stale_mrs); + spin_unlock(&buf->rb_recovery_lock); + + schedule_delayed_work(&buf->rb_recovery_worker, 0); +} + +static void +rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int count; + LIST_HEAD(free); + LIST_HEAD(all); + + for (count = 0; count < 3; count++) { + struct rpcrdma_mr *mr; + int rc; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + break; + + rc = ia->ri_ops->ro_init_mr(ia, mr); + if (rc) { + kfree(mr); + break; + } + + mr->mr_xprt = r_xprt; + + list_add(&mr->mr_list, &free); + list_add(&mr->mr_all, &all); + } + + spin_lock(&buf->rb_mrlock); + list_splice(&free, &buf->rb_mrs); + list_splice(&all, &buf->rb_all); + r_xprt->rx_stats.mrs_allocated += count; + spin_unlock(&buf->rb_mrlock); + trace_xprtrdma_createmrs(r_xprt, count); + + xprt_write_space(&r_xprt->rx_xprt); +} + +static void +rpcrdma_mr_refresh_worker(struct work_struct *work) +{ + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, + rb_refresh_worker.work); + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, + rx_buf); + + rpcrdma_mrs_create(r_xprt); +} + +struct rpcrdma_req * +rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; + struct rpcrdma_regbuf *rb; + struct rpcrdma_req *req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, + DMA_TO_DEVICE, GFP_KERNEL); + if (IS_ERR(rb)) { + kfree(req); + return ERR_PTR(-ENOMEM); + } + req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); + req->rl_buffer = buffer; + INIT_LIST_HEAD(&req->rl_registered); + + spin_lock(&buffer->rb_reqslock); + list_add(&req->rl_all, &buffer->rb_allreqs); + spin_unlock(&buffer->rb_reqslock); + return req; +} + +/** + * rpcrdma_create_rep - Allocate an rpcrdma_rep object + * @r_xprt: controlling transport + * + * Returns 0 on success or a negative errno on failure. 
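+ *
+ * The new rep's receive buffer is sized to cdata->inline_rsize, and
+ * the rep is added to buf->rb_recv_bufs under rb_lock.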
+ */ +int +rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_rep *rep; + int rc; + + rc = -ENOMEM; + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (rep == NULL) + goto out; + + rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize, + DMA_FROM_DEVICE, GFP_KERNEL); + if (IS_ERR(rep->rr_rdmabuf)) { + rc = PTR_ERR(rep->rr_rdmabuf); + goto out_free; + } + xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base, + rdmab_length(rep->rr_rdmabuf)); + + rep->rr_cqe.done = rpcrdma_wc_receive; + rep->rr_rxprt = r_xprt; + INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion); + rep->rr_recv_wr.next = NULL; + rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; + rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; + rep->rr_recv_wr.num_sge = 1; + + spin_lock(&buf->rb_lock); + list_add(&rep->rr_list, &buf->rb_recv_bufs); + spin_unlock(&buf->rb_lock); + return 0; + +out_free: + kfree(rep); +out: + dprintk("RPC: %s: reply buffer %d alloc failed\n", + __func__, rc); + return rc; +} + +int +rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + int i, rc; + + buf->rb_max_requests = r_xprt->rx_data.max_requests; + buf->rb_bc_srv_max_requests = 0; + spin_lock_init(&buf->rb_mrlock); + spin_lock_init(&buf->rb_lock); + spin_lock_init(&buf->rb_recovery_lock); + INIT_LIST_HEAD(&buf->rb_mrs); + INIT_LIST_HEAD(&buf->rb_all); + INIT_LIST_HEAD(&buf->rb_stale_mrs); + INIT_DELAYED_WORK(&buf->rb_refresh_worker, + rpcrdma_mr_refresh_worker); + INIT_DELAYED_WORK(&buf->rb_recovery_worker, + rpcrdma_mr_recovery_worker); + + rpcrdma_mrs_create(r_xprt); + + INIT_LIST_HEAD(&buf->rb_send_bufs); + INIT_LIST_HEAD(&buf->rb_allreqs); + spin_lock_init(&buf->rb_reqslock); + for (i = 0; i < buf->rb_max_requests; i++) { + struct rpcrdma_req *req; + + req = rpcrdma_create_req(r_xprt); + if (IS_ERR(req)) { + dprintk("RPC: %s: request buffer %d alloc" + " failed\n", __func__, i); + rc = PTR_ERR(req); + goto out; + } + list_add(&req->rl_list, &buf->rb_send_bufs); + } + + INIT_LIST_HEAD(&buf->rb_recv_bufs); + for (i = 0; i <= buf->rb_max_requests; i++) { + rc = rpcrdma_create_rep(r_xprt); + if (rc) + goto out; + } + + rc = rpcrdma_sendctxs_create(r_xprt); + if (rc) + goto out; + + return 0; +out: + rpcrdma_buffer_destroy(buf); + return rc; +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_req *req; + + req = list_first_entry(&buf->rb_send_bufs, + struct rpcrdma_req, rl_list); + list_del_init(&req->rl_list); + return req; +} + +static struct rpcrdma_rep * +rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_rep *rep; + + rep = list_first_entry(&buf->rb_recv_bufs, + struct rpcrdma_rep, rr_list); + list_del(&rep->rr_list); + return rep; +} + +static void +rpcrdma_destroy_rep(struct rpcrdma_rep *rep) +{ + rpcrdma_free_regbuf(rep->rr_rdmabuf); + kfree(rep); +} + +void +rpcrdma_destroy_req(struct rpcrdma_req *req) +{ + rpcrdma_free_regbuf(req->rl_recvbuf); + rpcrdma_free_regbuf(req->rl_sendbuf); + rpcrdma_free_regbuf(req->rl_rdmabuf); + kfree(req); +} + +static void +rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, + rx_buf); + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mr *mr; + unsigned int count; + + count = 0; + spin_lock(&buf->rb_mrlock); + while (!list_empty(&buf->rb_all)) { + mr = 
list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all); + list_del(&mr->mr_all); + + spin_unlock(&buf->rb_mrlock); + + /* Ensure MW is not on any rl_registered list */ + if (!list_empty(&mr->mr_list)) + list_del(&mr->mr_list); + + ia->ri_ops->ro_release_mr(mr); + count++; + spin_lock(&buf->rb_mrlock); + } + spin_unlock(&buf->rb_mrlock); + r_xprt->rx_stats.mrs_allocated = 0; + + dprintk("RPC: %s: released %u MRs\n", __func__, count); +} + +void +rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) +{ + cancel_delayed_work_sync(&buf->rb_recovery_worker); + cancel_delayed_work_sync(&buf->rb_refresh_worker); + + rpcrdma_sendctxs_destroy(buf); + + while (!list_empty(&buf->rb_recv_bufs)) { + struct rpcrdma_rep *rep; + + rep = rpcrdma_buffer_get_rep_locked(buf); + rpcrdma_destroy_rep(rep); + } + buf->rb_send_count = 0; + + spin_lock(&buf->rb_reqslock); + while (!list_empty(&buf->rb_allreqs)) { + struct rpcrdma_req *req; + + req = list_first_entry(&buf->rb_allreqs, + struct rpcrdma_req, rl_all); + list_del(&req->rl_all); + + spin_unlock(&buf->rb_reqslock); + rpcrdma_destroy_req(req); + spin_lock(&buf->rb_reqslock); + } + spin_unlock(&buf->rb_reqslock); + buf->rb_recv_count = 0; + + rpcrdma_mrs_destroy(buf); +} + +/** + * rpcrdma_mr_get - Allocate an rpcrdma_mr object + * @r_xprt: controlling transport + * + * Returns an initialized rpcrdma_mr or NULL if no free + * rpcrdma_mr objects are available. + */ +struct rpcrdma_mr * +rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_mr *mr = NULL; + + spin_lock(&buf->rb_mrlock); + if (!list_empty(&buf->rb_mrs)) + mr = rpcrdma_mr_pop(&buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); + + if (!mr) + goto out_nomrs; + return mr; + +out_nomrs: + trace_xprtrdma_nomrs(r_xprt); + if (r_xprt->rx_ep.rep_connected != -ENODEV) + schedule_delayed_work(&buf->rb_refresh_worker, 0); + + /* Allow the reply handler and refresh worker to run */ + cond_resched(); + + return NULL; +} + +static void +__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr) +{ + spin_lock(&buf->rb_mrlock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); +} + +/** + * rpcrdma_mr_put - Release an rpcrdma_mr object + * @mr: object to release + * + */ +void +rpcrdma_mr_put(struct rpcrdma_mr *mr) +{ + __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr); +} + +/** + * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it + * @mr: object to release + * + */ +void +rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + __rpcrdma_mr_put(&r_xprt->rx_buf, mr); +} + +static struct rpcrdma_rep * +rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers) +{ + /* If an RPC previously completed without a reply (say, a + * credential problem or a soft timeout occurs) then hold off + * on supplying more Receive buffers until the number of new + * pending RPCs catches up to the number of posted Receives. + */ + if (unlikely(buffers->rb_send_count < buffers->rb_recv_count)) + return NULL; + + if (unlikely(list_empty(&buffers->rb_recv_bufs))) + return NULL; + buffers->rb_recv_count++; + return rpcrdma_buffer_get_rep_locked(buffers); +} + +/* + * Get a set of request/reply buffers. + * + * Reply buffer (if available) is attached to send buffer upon return. 
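/* Illustrative sketch (not part of the patch): the test at the top of
 * rpcrdma_buffer_get_rep() above throttles Receive buffer consumption --
 * when more Receives are checked out than there are active requests (for
 * example after an RPC timed out without consuming its reply), no further
 * Receive buffer is handed out until the counts catch up. The decision in
 * isolation, with invented names:
 */
#include <stdbool.h>

struct buf_counters {
	int send_count;		/* request buffers currently checked out */
	int recv_count;		/* receive buffers currently checked out */
};

/* Returns true when it is safe to hand out another Receive buffer */
static bool may_supply_receive(const struct buf_counters *c)
{
	return c->send_count >= c->recv_count;
}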
+ */ +struct rpcrdma_req * +rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) +{ + struct rpcrdma_req *req; + + spin_lock(&buffers->rb_lock); + if (list_empty(&buffers->rb_send_bufs)) + goto out_reqbuf; + buffers->rb_send_count++; + req = rpcrdma_buffer_get_req_locked(buffers); + req->rl_reply = rpcrdma_buffer_get_rep(buffers); + spin_unlock(&buffers->rb_lock); + + return req; + +out_reqbuf: + spin_unlock(&buffers->rb_lock); + return NULL; +} + +/* + * Put request/reply buffers back into pool. + * Pre-decrement counter/array index. + */ +void +rpcrdma_buffer_put(struct rpcrdma_req *req) +{ + struct rpcrdma_buffer *buffers = req->rl_buffer; + struct rpcrdma_rep *rep = req->rl_reply; + + req->rl_reply = NULL; + + spin_lock(&buffers->rb_lock); + buffers->rb_send_count--; + list_add_tail(&req->rl_list, &buffers->rb_send_bufs); + if (rep) { + buffers->rb_recv_count--; + list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + } + spin_unlock(&buffers->rb_lock); +} + +/* + * Recover reply buffers from pool. + * This happens when recovering from disconnect. + */ +void +rpcrdma_recv_buffer_get(struct rpcrdma_req *req) +{ + struct rpcrdma_buffer *buffers = req->rl_buffer; + + spin_lock(&buffers->rb_lock); + req->rl_reply = rpcrdma_buffer_get_rep(buffers); + spin_unlock(&buffers->rb_lock); +} + +/* + * Put reply buffers back into pool when not attached to + * request. This happens in error conditions. + */ +void +rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) +{ + struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; + + spin_lock(&buffers->rb_lock); + buffers->rb_recv_count--; + list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + spin_unlock(&buffers->rb_lock); +} + +/** + * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers + * @size: size of buffer to be allocated, in bytes + * @direction: direction of data movement + * @flags: GFP flags + * + * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that + * can be persistently DMA-mapped for I/O. + * + * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for + * receiving the payload of RDMA RECV operations. During Long Calls + * or Replies they may be registered externally via ro_map. 
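/* Illustrative sketch (not part of the patch): rpcrdma_alloc_regbuf(),
 * documented above and defined just below, carves the metadata header and
 * its I/O payload out of a single allocation, with the payload starting at
 * the flexible array member at the end of the struct. A minimal user-space
 * model of the same layout trick; the identifiers are invented.
 */
#include <stdlib.h>
#include <string.h>

struct my_regbuf {
	size_t length;			/* payload length in bytes */
	int mapped;			/* set once the buffer is DMA-mapped */
	unsigned char base[];		/* payload begins here */
};

static struct my_regbuf *my_regbuf_alloc(size_t size)
{
	struct my_regbuf *rb = malloc(sizeof(*rb) + size);

	if (!rb)
		return NULL;
	memset(rb, 0, sizeof(*rb));
	rb->length = size;
	return rb;			/* rb->base is the I/O payload */
}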
+ */ +struct rpcrdma_regbuf * +rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, + gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + + rb = kmalloc(sizeof(*rb) + size, flags); + if (rb == NULL) + return ERR_PTR(-ENOMEM); + + rb->rg_device = NULL; + rb->rg_direction = direction; + rb->rg_iov.length = size; + + return rb; +} + +/** + * __rpcrdma_map_regbuf - DMA-map a regbuf + * @ia: controlling rpcrdma_ia + * @rb: regbuf to be mapped + */ +bool +__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +{ + struct ib_device *device = ia->ri_device; + + if (rb->rg_direction == DMA_NONE) + return false; + + rb->rg_iov.addr = ib_dma_map_single(device, + (void *)rb->rg_base, + rdmab_length(rb), + rb->rg_direction); + if (ib_dma_mapping_error(device, rdmab_addr(rb))) + return false; + + rb->rg_device = device; + rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; + return true; +} + +static void +rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb) +{ + if (!rb) + return; + + if (!rpcrdma_regbuf_is_mapped(rb)) + return; + + ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), + rdmab_length(rb), rb->rg_direction); + rb->rg_device = NULL; +} + +/** + * rpcrdma_free_regbuf - deregister and free registered buffer + * @rb: regbuf to be deregistered and freed + */ +void +rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb) +{ + rpcrdma_dma_unmap_regbuf(rb); + kfree(rb); +} + +/* + * Prepost any receive buffer, then post send. + * + * Receive buffer is donated to hardware, reclaimed upon recv completion. + */ +int +rpcrdma_ep_post(struct rpcrdma_ia *ia, + struct rpcrdma_ep *ep, + struct rpcrdma_req *req) +{ + struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; + int rc; + + if (req->rl_reply) { + rc = rpcrdma_ep_post_recv(ia, req->rl_reply); + if (rc) + return rc; + req->rl_reply = NULL; + } + + if (!ep->rep_send_count || + test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { + send_wr->send_flags |= IB_SEND_SIGNALED; + ep->rep_send_count = ep->rep_send_batch; + } else { + send_wr->send_flags &= ~IB_SEND_SIGNALED; + --ep->rep_send_count; + } + + rc = ia->ri_ops->ro_send(ia, req); + trace_xprtrdma_post_send(req, rc); + if (rc) + return -ENOTCONN; + return 0; +} + +int +rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, + struct rpcrdma_rep *rep) +{ + struct ib_recv_wr *recv_wr_fail; + int rc; + + if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf)) + goto out_map; + rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail); + trace_xprtrdma_post_recv(rep, rc); + if (rc) + return -ENOTCONN; + return 0; + +out_map: + pr_err("rpcrdma: failed to DMA map the Receive buffer\n"); + return -EIO; +} + +/** + * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests + * @r_xprt: transport associated with these backchannel resources + * @count: minimum number of incoming requests expected + * + * Returns zero if all requested buffers were posted, or a negative errno. 
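/* Illustrative sketch (not part of the patch): rpcrdma_ep_post() above asks
 * for a signaled Send completion only every rep_send_batch-th Send, or when
 * TX resources must be released promptly, which keeps completions rare while
 * bounding how many unsignaled Sends can be outstanding. The cadence in
 * isolation, with invented names:
 */
#include <stdbool.h>

struct send_batcher {
	unsigned int count;	/* Sends remaining until the next signal */
	unsigned int batch;	/* request a completion every 'batch' Sends */
};

/* Returns true when this Send should request a completion */
static bool send_needs_signal(struct send_batcher *b, bool force)
{
	if (!b->count || force) {
		b->count = b->batch;
		return true;
	}
	--b->count;
	return false;
}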
+ */ +int +rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) +{ + struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_rep *rep; + int rc; + + while (count--) { + spin_lock(&buffers->rb_lock); + if (list_empty(&buffers->rb_recv_bufs)) + goto out_reqbuf; + rep = rpcrdma_buffer_get_rep_locked(buffers); + spin_unlock(&buffers->rb_lock); + + rc = rpcrdma_ep_post_recv(ia, rep); + if (rc) + goto out_rc; + } + + return 0; + +out_reqbuf: + spin_unlock(&buffers->rb_lock); + trace_xprtrdma_noreps(r_xprt); + return -ENOMEM; + +out_rc: + rpcrdma_recv_buffer_put(rep); + return rc; +} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h new file mode 100644 index 0000000..cb41b12 --- /dev/null +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
+#define _LINUX_SUNRPC_XPRT_RDMA_H
+
+#include <linux/wait.h>			/* wait_queue_head_t, etc */
+#include <linux/spinlock.h>		/* spinlock_t, etc */
+#include <linux/atomic.h>		/* atomic_t, etc */
+#include <linux/workqueue.h>		/* struct work_struct */
+
+#include <rdma/rdma_cm.h>		/* RDMA connection api */
+#include <rdma/ib_verbs.h>		/* RDMA verbs api */
+
+#include <linux/sunrpc/xprt.h>		/* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma.h>	/* RPC/RDMA protocol */
+#include <linux/sunrpc/xprtrdma.h>	/* xprt parameters */
+
+#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */
+
+#define RPCRDMA_BIND_TO		(60U * HZ)
+#define RPCRDMA_INIT_REEST_TO	(5U * HZ)
+#define RPCRDMA_MAX_REEST_TO	(30U * HZ)
+#define RPCRDMA_IDLE_DISC_TO	(5U * 60 * HZ)
+
+/*
+ * Interface Adapter -- one per transport instance
+ */
+struct rpcrdma_ia {
+	const struct rpcrdma_memreg_ops	*ri_ops;
+	struct ib_device	*ri_device;
+	struct rdma_cm_id	*ri_id;
+	struct ib_pd		*ri_pd;
+	struct completion	ri_done;
+	struct completion	ri_remove_done;
+	int			ri_async_rc;
+	unsigned int		ri_max_segs;
+	unsigned int		ri_max_frwr_depth;
+	unsigned int		ri_max_inline_write;
+	unsigned int		ri_max_inline_read;
+	unsigned int		ri_max_send_sges;
+	bool			ri_implicit_roundup;
+	enum ib_mr_type		ri_mrtype;
+	unsigned long		ri_flags;
+	struct ib_qp_attr	ri_qp_attr;
+	struct ib_qp_init_attr	ri_qp_init_attr;
+};
+
+enum {
+	RPCRDMA_IAF_REMOVING = 0,
+};
+
+/*
+ * RDMA Endpoint -- one per transport instance
+ */
+
+struct rpcrdma_ep {
+	unsigned int		rep_send_count;
+	unsigned int		rep_send_batch;
+	int			rep_connected;
+	struct ib_qp_init_attr	rep_attr;
+	wait_queue_head_t	rep_connect_wait;
+	struct rpcrdma_connect_private	rep_cm_private;
+	struct rdma_conn_param	rep_remote_cma;
+	struct delayed_work	rep_connect_worker;
+};
+
+/* Pre-allocate extra Work Requests for handling backward receives
+ * and sends. This is a fixed value because the Work Queues are
+ * allocated when the forward channel is set up.
+ */
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+#define RPCRDMA_BACKWARD_WRS		(8)
+#else
+#define RPCRDMA_BACKWARD_WRS		(0)
+#endif
+
+/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
+ *
+ * The below structure appears at the front of a large region of kmalloc'd
+ * memory, which always starts on a good alignment boundary.
+ */
+
+struct rpcrdma_regbuf {
+	struct ib_sge		rg_iov;
+	struct ib_device	*rg_device;
+	enum dma_data_direction	rg_direction;
+	__be32			rg_base[0] __attribute__ ((aligned(256)));
+};
+
+static inline u64
+rdmab_addr(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_iov.addr;
+}
+
+static inline u32
+rdmab_length(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_iov.length;
+}
+
+static inline u32
+rdmab_lkey(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_iov.lkey;
+}
+
+static inline struct ib_device *
+rdmab_device(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_device;
+}
+
+#define RPCRDMA_DEF_GFP		(GFP_NOIO | __GFP_NOWARN)
+
+/* To ensure a transport can always make forward progress,
+ * the number of RDMA segments allowed in header chunk lists
+ * is capped at 8. This prevents less-capable devices and
+ * memory registrations from overrunning the Send buffer
+ * while building chunk lists.
+ *
+ * Elements of the Read list take up more room than the
+ * Write list or Reply chunk. 8 read segments means the Read
+ * list (or Write list or Reply chunk) cannot consume more
+ * than
+ *
+ * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ *
+ * And the fixed part of the header is another 24 bytes.
+ * + * The smallest inline threshold is 1024 bytes, ensuring that + * at least 750 bytes are available for RPC messages. + */ +enum { + RPCRDMA_MAX_HDR_SEGS = 8, + RPCRDMA_HDRBUF_SIZE = 256, +}; + +/* + * struct rpcrdma_rep -- this structure encapsulates state required + * to receive and complete an RPC Reply, asychronously. It needs + * several pieces of state: + * + * o receive buffer and ib_sge (donated to provider) + * o status of receive (success or not, length, inv rkey) + * o bookkeeping state to get run by reply handler (XDR stream) + * + * These structures are allocated during transport initialization. + * N of these are associated with a transport instance, managed by + * struct rpcrdma_buffer. N is the max number of outstanding RPCs. + */ + +struct rpcrdma_rep { + struct ib_cqe rr_cqe; + __be32 rr_xid; + __be32 rr_vers; + __be32 rr_proc; + int rr_wc_flags; + u32 rr_inv_rkey; + struct rpcrdma_regbuf *rr_rdmabuf; + struct rpcrdma_xprt *rr_rxprt; + struct work_struct rr_work; + struct xdr_buf rr_hdrbuf; + struct xdr_stream rr_stream; + struct rpc_rqst *rr_rqst; + struct list_head rr_list; + struct ib_recv_wr rr_recv_wr; +}; + +/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes + */ +struct rpcrdma_req; +struct rpcrdma_xprt; +struct rpcrdma_sendctx { + struct ib_send_wr sc_wr; + struct ib_cqe sc_cqe; + struct rpcrdma_xprt *sc_xprt; + struct rpcrdma_req *sc_req; + unsigned int sc_unmap_count; + struct ib_sge sc_sges[]; +}; + +/* Limit the number of SGEs that can be unmapped during one + * Send completion. This caps the amount of work a single + * completion can do before returning to the provider. + * + * Setting this to zero disables Send completion batching. + */ +enum { + RPCRDMA_MAX_SEND_BATCH = 7, +}; + +/* + * struct rpcrdma_mr - external memory region metadata + * + * An external memory region is any buffer or page that is registered + * on the fly (ie, not pre-registered). + * + * Each rpcrdma_buffer has a list of free MWs anchored in rb_mrs. During + * call_allocate, rpcrdma_buffer_get() assigns one to each segment in + * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep + * track of registration metadata while each RPC is pending. + * rpcrdma_deregister_external() uses this metadata to unmap and + * release these resources when an RPC is complete. + */ +enum rpcrdma_frwr_state { + FRWR_IS_INVALID, /* ready to be used */ + FRWR_IS_VALID, /* in use */ + FRWR_FLUSHED_FR, /* flushed FASTREG WR */ + FRWR_FLUSHED_LI, /* flushed LOCALINV WR */ +}; + +struct rpcrdma_frwr { + struct ib_mr *fr_mr; + struct ib_cqe fr_cqe; + enum rpcrdma_frwr_state fr_state; + struct completion fr_linv_done; + union { + struct ib_reg_wr fr_regwr; + struct ib_send_wr fr_invwr; + }; +}; + +struct rpcrdma_fmr { + struct ib_fmr *fm_mr; + u64 *fm_physaddrs; +}; + +struct rpcrdma_mr { + struct list_head mr_list; + struct scatterlist *mr_sg; + int mr_nents; + enum dma_data_direction mr_dir; + union { + struct rpcrdma_fmr fmr; + struct rpcrdma_frwr frwr; + }; + struct rpcrdma_xprt *mr_xprt; + u32 mr_handle; + u32 mr_length; + u64 mr_offset; + struct list_head mr_all; +}; + +/* + * struct rpcrdma_req -- structure central to the request/reply sequence. + * + * N of these are associated with a transport instance, and stored in + * struct rpcrdma_buffer. N is the max number of outstanding requests. + * + * It includes pre-registered buffer memory for send AND recv. 
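/* Illustrative sketch (not part of the patch): one way to reproduce the
 * forward-progress arithmetic in the RPCRDMA_MAX_HDR_SEGS comment above,
 * assuming a 24-byte (6 XDR word) on-the-wire Read-list entry. All of the
 * identifiers below are invented for illustration.
 */
enum {
	XDR_WORD		= 4,				/* bytes */
	READ_SEG_BYTES		= 6 * XDR_WORD,			/* 24 bytes per Read-list entry */
	MAX_CHUNK_BYTES		= (8 + 2) * READ_SEG_BYTES + 1 * XDR_WORD,
								/* = 244 bytes */
	FIXED_HDR_BYTES		= 24,
	MIN_INLINE_BYTES	= 1024,
	MIN_RPC_PAYLOAD		= MIN_INLINE_BYTES - MAX_CHUNK_BYTES - FIXED_HDR_BYTES,
								/* = 756, i.e. "at least 750 bytes" */
};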
+ * The recv buffer, however, is not owned by this structure, and + * is "donated" to the hardware when a recv is posted. When a + * reply is handled, the recv buffer used is given back to the + * struct rpcrdma_req associated with the request. + * + * In addition to the basic memory, this structure includes an array + * of iovs for send operations. The reason is that the iovs passed to + * ib_post_{send,recv} must not be modified until the work request + * completes. + */ + +/* Maximum number of page-sized "segments" per chunk list to be + * registered or invalidated. Must handle a Reply chunk: + */ +enum { + RPCRDMA_MAX_IOV_SEGS = 3, + RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, + RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + + RPCRDMA_MAX_IOV_SEGS, +}; + +struct rpcrdma_mr_seg { /* chunk descriptors */ + u32 mr_len; /* length of chunk or segment */ + struct page *mr_page; /* owning page, if any */ + char *mr_offset; /* kva if no page, else offset */ +}; + +/* The Send SGE array is provisioned to send a maximum size + * inline request: + * - RPC-over-RDMA header + * - xdr_buf head iovec + * - RPCRDMA_MAX_INLINE bytes, in pages + * - xdr_buf tail iovec + * + * The actual number of array elements consumed by each RPC + * depends on the device's max_sge limit. + */ +enum { + RPCRDMA_MIN_SEND_SGES = 3, + RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT, + RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1, +}; + +struct rpcrdma_buffer; +struct rpcrdma_req { + struct list_head rl_list; + struct rpcrdma_buffer *rl_buffer; + struct rpcrdma_rep *rl_reply; + struct xdr_stream rl_stream; + struct xdr_buf rl_hdrbuf; + struct rpcrdma_sendctx *rl_sendctx; + struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ + struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ + struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */ + + struct list_head rl_all; + unsigned long rl_flags; + + struct list_head rl_registered; /* registered segments */ + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; +}; + +/* rl_flags */ +enum { + RPCRDMA_REQ_F_PENDING = 0, + RPCRDMA_REQ_F_TX_RESOURCES, +}; + +static inline void +rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) +{ + rqst->rq_xprtdata = req; +} + +static inline struct rpcrdma_req * +rpcr_to_rdmar(const struct rpc_rqst *rqst) +{ + return rqst->rq_xprtdata; +} + +static inline void +rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list) +{ + list_add_tail(&mr->mr_list, list); +} + +static inline struct rpcrdma_mr * +rpcrdma_mr_pop(struct list_head *list) +{ + struct rpcrdma_mr *mr; + + mr = list_first_entry(list, struct rpcrdma_mr, mr_list); + list_del_init(&mr->mr_list); + return mr; +} + +/* + * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for + * inline requests/replies, and client/server credits. 
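/* Illustrative sketch (not part of the patch): the RPCRDMA_MAX_SEND_SGES
 * formula above budgets one SGE for the transport header, one for the
 * xdr_buf head, one per page of inline payload, and one for the tail.
 * Purely as a worked example -- the values below are hypothetical, not the
 * patch's actual RPCRDMA_MAX_INLINE or PAGE_SHIFT:
 */
enum {
	EXAMPLE_MAX_INLINE	= 4096,
	EXAMPLE_PAGE_SHIFT	= 12,
	EXAMPLE_PAGE_SGES	= EXAMPLE_MAX_INLINE >> EXAMPLE_PAGE_SHIFT,	/* 1 */
	EXAMPLE_SEND_SGES	= 1 + 1 + EXAMPLE_PAGE_SGES + 1,		/* 4 */
};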
+ * + * One of these is associated with a transport instance + */ +struct rpcrdma_buffer { + spinlock_t rb_mrlock; /* protect rb_mrs list */ + struct list_head rb_mrs; + struct list_head rb_all; + + unsigned long rb_sc_head; + unsigned long rb_sc_tail; + unsigned long rb_sc_last; + struct rpcrdma_sendctx **rb_sc_ctxs; + + spinlock_t rb_lock; /* protect buf lists */ + int rb_send_count, rb_recv_count; + struct list_head rb_send_bufs; + struct list_head rb_recv_bufs; + u32 rb_max_requests; + u32 rb_credits; /* most recent credit grant */ + + u32 rb_bc_srv_max_requests; + spinlock_t rb_reqslock; /* protect rb_allreqs */ + struct list_head rb_allreqs; + + u32 rb_bc_max_requests; + + spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ + struct list_head rb_stale_mrs; + struct delayed_work rb_recovery_worker; + struct delayed_work rb_refresh_worker; +}; +#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) + +/* + * Internal structure for transport instance creation. This + * exists primarily for modularity. + * + * This data should be set with mount options + */ +struct rpcrdma_create_data_internal { + unsigned int max_requests; /* max requests (slots) in flight */ + unsigned int rsize; /* mount rsize - max read hdr+data */ + unsigned int wsize; /* mount wsize - max write hdr+data */ + unsigned int inline_rsize; /* max non-rdma read data payload */ + unsigned int inline_wsize; /* max non-rdma write data payload */ +}; + +/* + * Statistics for RPCRDMA + */ +struct rpcrdma_stats { + /* accessed when sending a call */ + unsigned long read_chunk_count; + unsigned long write_chunk_count; + unsigned long reply_chunk_count; + unsigned long long total_rdma_request; + + /* rarely accessed error counters */ + unsigned long long pullup_copy_count; + unsigned long hardway_register_count; + unsigned long failed_marshal_count; + unsigned long bad_reply_count; + unsigned long mrs_recovered; + unsigned long mrs_orphaned; + unsigned long mrs_allocated; + unsigned long empty_sendctx_q; + + /* accessed when receiving a reply */ + unsigned long long total_rdma_reply; + unsigned long long fixup_copy_count; + unsigned long reply_waits_for_send; + unsigned long local_inv_needed; + unsigned long nomsg_call_count; + unsigned long bcall_count; +}; + +/* + * Per-registration mode operations + */ +struct rpcrdma_xprt; +struct rpcrdma_memreg_ops { + struct rpcrdma_mr_seg * + (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *, int, bool, + struct rpcrdma_mr **); + int (*ro_send)(struct rpcrdma_ia *ia, + struct rpcrdma_req *req); + void (*ro_reminv)(struct rpcrdma_rep *rep, + struct list_head *mrs); + void (*ro_unmap_sync)(struct rpcrdma_xprt *, + struct list_head *); + void (*ro_recover_mr)(struct rpcrdma_mr *mr); + int (*ro_open)(struct rpcrdma_ia *, + struct rpcrdma_ep *, + struct rpcrdma_create_data_internal *); + size_t (*ro_maxpages)(struct rpcrdma_xprt *); + int (*ro_init_mr)(struct rpcrdma_ia *, + struct rpcrdma_mr *); + void (*ro_release_mr)(struct rpcrdma_mr *mr); + const char *ro_displayname; + const int ro_send_w_inv_ok; +}; + +extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; + +/* + * RPCRDMA transport -- encapsulates the structures above for + * integration with RPC. + * + * The contained structures are embedded, not pointers, + * for convenience. This structure need not be visible externally. + * + * It is allocated and initialized during mount, and released + * during unmount. 
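/* Illustrative sketch (not part of the patch): the rpcrdma_memreg_ops table
 * above lets the transport select a memory registration strategy once and
 * then dispatch through function pointers. A rough, hypothetical picture of
 * how such a selection could look -- it relies only on declarations found in
 * this header; the transport's real selection also honors the
 * xprt_rdma_memreg_strategy setting declared below and may differ.
 */
static const struct rpcrdma_memreg_ops *
example_pick_memreg(struct rpcrdma_ia *ia)
{
	if (frwr_is_supported(ia))
		return &rpcrdma_frwr_memreg_ops;	/* preferred when the device supports it */
	if (fmr_is_supported(ia))
		return &rpcrdma_fmr_memreg_ops;
	return NULL;					/* no usable registration mode */
}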
+ */ +struct rpcrdma_xprt { + struct rpc_xprt rx_xprt; + struct rpcrdma_ia rx_ia; + struct rpcrdma_ep rx_ep; + struct rpcrdma_buffer rx_buf; + struct rpcrdma_create_data_internal rx_data; + struct delayed_work rx_connect_worker; + struct rpcrdma_stats rx_stats; +}; + +#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) +#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) + +static inline const char * +rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt) +{ + return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]; +} + +static inline const char * +rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt) +{ + return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_PORT]; +} + +/* Setting this to 0 ensures interoperability with early servers. + * Setting this to 1 enhances certain unaligned read/write performance. + * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ +extern int xprt_rdma_pad_optimize; + +/* This setting controls the hunt for a supported memory + * registration strategy. + */ +extern unsigned int xprt_rdma_memreg_strategy; + +/* + * Interface Adapter calls - xprtrdma/verbs.c + */ +int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); +void rpcrdma_ia_remove(struct rpcrdma_ia *ia); +void rpcrdma_ia_close(struct rpcrdma_ia *); +bool frwr_is_supported(struct rpcrdma_ia *); +bool fmr_is_supported(struct rpcrdma_ia *); + +extern struct workqueue_struct *rpcrdma_receive_wq; + +/* + * Endpoint calls - xprtrdma/verbs.c + */ +int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *); +void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); +int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_conn_func(struct rpcrdma_ep *ep); +void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); + +int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, + struct rpcrdma_req *); +int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *); + +/* + * Buffer calls - xprtrdma/verbs.c + */ +struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); +void rpcrdma_destroy_req(struct rpcrdma_req *); +int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt); +int rpcrdma_buffer_create(struct rpcrdma_xprt *); +void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); +void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); + +struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); +void rpcrdma_mr_put(struct rpcrdma_mr *mr); +void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); +void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + +struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); +void rpcrdma_buffer_put(struct rpcrdma_req *); +void rpcrdma_recv_buffer_get(struct rpcrdma_req *); +void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); + +struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, + gfp_t); +bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); +void rpcrdma_free_regbuf(struct rpcrdma_regbuf *); + +static inline bool +rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb) +{ + return rb->rg_device != NULL; +} + +static inline bool +rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +{ + if (likely(rpcrdma_regbuf_is_mapped(rb))) + return true; + return __rpcrdma_dma_map_regbuf(ia, rb); +} + +int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); + +int rpcrdma_alloc_wq(void); +void 
rpcrdma_destroy_wq(void); + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +static inline enum dma_data_direction +rpcrdma_data_dir(bool writing) +{ + return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +/* + * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c + */ + +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + +int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype); +void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc); +int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); +void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); +void rpcrdma_reply_handler(struct rpcrdma_rep *rep); +void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req); +void rpcrdma_deferred_completion(struct work_struct *work); + +static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) +{ + xdr->head[0].iov_len = len; + xdr->len = len; +} + +/* RPC/RDMA module init - xprtrdma/transport.c + */ +extern unsigned int xprt_rdma_max_inline_read; +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); +void xprt_rdma_free_addresses(struct rpc_xprt *xprt); +void rpcrdma_connect_worker(struct work_struct *work); +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); +int xprt_rdma_init(void); +void xprt_rdma_cleanup(void); + +/* Backchannel calls - xprtrdma/backchannel.c + */ +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); +int xprt_rdma_bc_up(struct svc_serv *, struct net *); +size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); +int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); +void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); +int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst); +void xprt_rdma_bc_free_rqst(struct rpc_rqst *); +void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +extern struct xprt_class xprt_rdma_bc; + +#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ + +#include -- 2.41.0